From 8390263e2b8343ef89add2a4dc7a1dcf2e9581c7 Mon Sep 17 00:00:00 2001 From: Weiwei Chen Date: Sat, 22 Mar 2025 23:12:52 -0400 Subject: [PATCH 0001/1029] Add MCLinker skeleton. --- llvm/include/llvm/MCLinker/MCLinker.h | 136 ++++++++++++++++++ llvm/lib/CMakeLists.txt | 1 + llvm/lib/MCLinker/CMakeLists.txt | 15 ++ llvm/lib/MCLinker/MCLinker.cpp | 15 ++ .../llvm-project-overlay/llvm/BUILD.bazel | 20 +++ 5 files changed, 187 insertions(+) create mode 100644 llvm/include/llvm/MCLinker/MCLinker.h create mode 100644 llvm/lib/MCLinker/CMakeLists.txt create mode 100644 llvm/lib/MCLinker/MCLinker.cpp diff --git a/llvm/include/llvm/MCLinker/MCLinker.h b/llvm/include/llvm/MCLinker/MCLinker.h new file mode 100644 index 0000000000000..9c8721700975e --- /dev/null +++ b/llvm/include/llvm/MCLinker/MCLinker.h @@ -0,0 +1,136 @@ +//===- MCLinker.h - Linker at MC level------------- -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MCLINKER_H +#define LLVM_MCLINKER_H + +#include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/ModuleSplitter/ModuleSplitter.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +/// This file defines data structures to help linking LLVM modules +/// at MC level (right after codegen) and AsmPrint into one .o or .s file. +/// This linking is needed because we parallelize the llvm opt and +/// llc pipelines by splitting LLVMModule into multiple splits +/// with symbol linkage changes. +/// Linking at MC level helps to fix the temporary symbol linkage change, +/// deduplicate multiple symbols among the splits. +/// This allows mojo compilation to produce 1 .o file for each program +/// (instead of one .a file with multiple .o files in .a) with reduced +/// object file size (due to symbol dedup and linkage restoration). + +struct MCInfo { + MCInfo(std::unique_ptr &&MachineModuleInfo, + LLVMModuleAndContext &&ModuleAndContext, + llvm::StringMap &FnNameToFnPtr, + std::unique_ptr &&TgtMachine, + std::unique_ptr &&McContext, + std::optional SplitIdx) + : ModuleAndContext(std::move(ModuleAndContext)), + McContext(std::move(McContext)), + MachineModuleInfo(std::move(MachineModuleInfo)), + FnNameToFnPtr(std::move(FnNameToFnPtr)), + TgtMachine(std::move(TgtMachine)), SplitIdx(SplitIdx){}; + + MCInfo(MCInfo &&Other) + : ModuleBuf(std::move(Other.ModuleBuf)), + ModuleAndContext(std::move(Other.ModuleAndContext)), + McContext(std::move(Other.McContext)), + MachineModuleInfo(std::move(Other.MachineModuleInfo)), + FnNameToFnPtr(std::move(Other.FnNameToFnPtr)), + TgtMachine(std::move(Other.TgtMachine)), SplitIdx(Other.SplitIdx) {} + + /// Serialize the llvm::Module into bytecode. + // We will deserialize it back to put into + /// a different LLVMContext that is required for linking using llvm::Linker. + std::unique_ptr ModuleBuf; + + /// Keep original module split alive because llvm::Function is kept as + /// reference in llvm::MachineFunctions and will be used during codegen. 
+ LLVMModuleAndContext ModuleAndContext; + + /// ExternContext to MachineModuleInfo to work around the upstream bug + /// with the move constructor of MachineModuleInfo. + std::unique_ptr McContext; + + /// This is where all the MachineFunction live that we need for AsmPrint. + std::unique_ptr MachineModuleInfo; + + /// llvm::Function name to llvm::Function* map for concatenating the + /// MachineFunctions map. + llvm::StringMap FnNameToFnPtr; + + /// Keep targetMachine alive. + std::unique_ptr TgtMachine; + + /// parallel llvm module split id, mostly used for debugging. + std::optional SplitIdx; +}; + +struct SymbolAndMCInfo { + SymbolAndMCInfo() = default; + + SymbolAndMCInfo(SymbolAndMCInfo &&Other) + : SymbolLinkageTypes(std::move(Other.SymbolLinkageTypes)), + McInfos(std::move(Other.McInfos)) {} + + /// Clear member variables explicitly. + void clear(); + + /// Book-keeping original symbol linkage type if they are changed due to + /// splitting for parallel compilation. + llvm::StringMap SymbolLinkageTypes; + + /// Vector of codegen results for each parallel split before AsmPrint. + SmallVector> McInfos; +}; + +class MCLinker { +public: + MCLinker(SmallVectorImpl &SymbolAndMCInfos, + llvm::TargetMachine &TgtMachine, + llvm::StringMap SymbolLinkageTypes, + llvm::StringMap OriginalFnOrdering); + + /// Link multiple MC results and AsmPrint into one .o file. + ErrorOr> + linkAndPrint(StringRef ModuleName); + +private: + SmallVectorImpl &SymbolAndMCInfos; + llvm::TargetMachine &TgtMachine; + SmallVector McInfos; + LLVMModuleAndContext LinkedModule; + + llvm::StringMap SymbolLinkageTypes; + llvm::StringMap OriginalFnOrdering; + llvm::MachineModuleInfoWrapperPass *MachineModInfoPass = nullptr; + + /// Link llvm::Modules from each split. + Expected linkLLVMModules(StringRef ModuleName); + + // /// Get llvm::Module and prepare MachineModuleInfoWrapperPass to print if + // /// there is only one split. + // llvm::Module * + // getModuleToPrintOneSplit(llvm::TargetMachine &LlvmTgtMachine); + + /// Prepare MachineModuleInfo before AsmPrinting. + void prepareMachineModuleInfo(llvm::TargetMachine &LlvmTgtMachine); +}; + +} // namespace llvm + +#endif diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt index 2201fcda0a7fd..5879491ec2c5f 100644 --- a/llvm/lib/CMakeLists.txt +++ b/llvm/lib/CMakeLists.txt @@ -24,6 +24,7 @@ add_subdirectory(Analysis) add_subdirectory(LTO) add_subdirectory(MC) add_subdirectory(MCA) +add_subdirectory(MCLinker) add_subdirectory(ModuleSplitter) add_subdirectory(ObjCopy) add_subdirectory(Object) diff --git a/llvm/lib/MCLinker/CMakeLists.txt b/llvm/lib/MCLinker/CMakeLists.txt new file mode 100644 index 0000000000000..fe7103b105576 --- /dev/null +++ b/llvm/lib/MCLinker/CMakeLists.txt @@ -0,0 +1,15 @@ +add_llvm_component_library(LLVMMCLinker + MCLinker.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/ModuleSplitter + ${LLVM_MAIN_INCLUDE_DIR}/llvm/MCLinker + + LINK_COMPONENTS + Core + IRReader + BitReader + BitWriter + Support + TransformUtils +) diff --git a/llvm/lib/MCLinker/MCLinker.cpp b/llvm/lib/MCLinker/MCLinker.cpp new file mode 100644 index 0000000000000..34e7cfaa82273 --- /dev/null +++ b/llvm/lib/MCLinker/MCLinker.cpp @@ -0,0 +1,15 @@ +//===--- MCLinker.cpp - MCLinker --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + + +#include "llvm/MCLinker/MCLinker.h" + +using namespace llvm; +#define DEBUG_TYPE "mclinker" diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 1f04672489169..8ecd17287c272 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2085,6 +2085,26 @@ cc_library( ], ) +cc_library( + name = "MCLinker", + srcs = glob([ + "lib/MCLinker/*.cpp", + ]), + hdrs = glob([ + "include/llvm/ModuleSplitter/*.h", + "include/llvm/MCLinker/*.h", + ]), + copts = llvm_copts, + deps = [ + ":BitReader", + ":BitWriter", + ":Core", + ":IRReader", + ":Support", + ":TransformUtils", + ], +) + cc_library( name = "ModuleSplitter", srcs = glob([ From f1fb86d5fe0c6c97a2e0d632b45f66591d4a6e31 Mon Sep 17 00:00:00 2001 From: Weiwei Chen Date: Tue, 25 Mar 2025 12:12:49 -0400 Subject: [PATCH 0002/1029] checkpoint. --- llvm/include/llvm/MCLinker/MCLinker.h | 23 +- llvm/lib/MCLinker/MCLinker.cpp | 320 ++++++++++++++++++++++++++ 2 files changed, 333 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/MCLinker/MCLinker.h b/llvm/include/llvm/MCLinker/MCLinker.h index 9c8721700975e..41acbb2a451b7 100644 --- a/llvm/include/llvm/MCLinker/MCLinker.h +++ b/llvm/include/llvm/MCLinker/MCLinker.h @@ -32,18 +32,17 @@ namespace llvm { /// (instead of one .a file with multiple .o files in .a) with reduced /// object file size (due to symbol dedup and linkage restoration). +//============================================================================== +// MCInfo +//============================================================================== + struct MCInfo { MCInfo(std::unique_ptr &&MachineModuleInfo, LLVMModuleAndContext &&ModuleAndContext, llvm::StringMap &FnNameToFnPtr, std::unique_ptr &&TgtMachine, std::unique_ptr &&McContext, - std::optional SplitIdx) - : ModuleAndContext(std::move(ModuleAndContext)), - McContext(std::move(McContext)), - MachineModuleInfo(std::move(MachineModuleInfo)), - FnNameToFnPtr(std::move(FnNameToFnPtr)), - TgtMachine(std::move(TgtMachine)), SplitIdx(SplitIdx){}; + std::optional SplitIdx); MCInfo(MCInfo &&Other) : ModuleBuf(std::move(Other.ModuleBuf)), @@ -56,7 +55,7 @@ struct MCInfo { /// Serialize the llvm::Module into bytecode. // We will deserialize it back to put into /// a different LLVMContext that is required for linking using llvm::Linker. - std::unique_ptr ModuleBuf; + std::unique_ptr ModuleBuf = nullptr; /// Keep original module split alive because llvm::Function is kept as /// reference in llvm::MachineFunctions and will be used during codegen. @@ -80,6 +79,11 @@ struct MCInfo { std::optional SplitIdx; }; + +//============================================================================== +// SymbolAndMCInfo +//============================================================================== + struct SymbolAndMCInfo { SymbolAndMCInfo() = default; @@ -102,8 +106,7 @@ class MCLinker { public: MCLinker(SmallVectorImpl &SymbolAndMCInfos, llvm::TargetMachine &TgtMachine, - llvm::StringMap SymbolLinkageTypes, - llvm::StringMap OriginalFnOrdering); + llvm::StringMap SymbolLinkageTypes); /// Link multiple MC results and AsmPrint into one .o file. 
ErrorOr> @@ -116,7 +119,7 @@ class MCLinker { LLVMModuleAndContext LinkedModule; llvm::StringMap SymbolLinkageTypes; - llvm::StringMap OriginalFnOrdering; + // llvm::StringMap OriginalFnOrdering; llvm::MachineModuleInfoWrapperPass *MachineModInfoPass = nullptr; /// Link llvm::Modules from each split. diff --git a/llvm/lib/MCLinker/MCLinker.cpp b/llvm/lib/MCLinker/MCLinker.cpp index 34e7cfaa82273..5411533097729 100644 --- a/llvm/lib/MCLinker/MCLinker.cpp +++ b/llvm/lib/MCLinker/MCLinker.cpp @@ -11,5 +11,325 @@ #include "llvm/MCLinker/MCLinker.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Bitcode/BitcodeWriter.h" + using namespace llvm; #define DEBUG_TYPE "mclinker" + +//============================================================================== +// MCInfo +//============================================================================== + +MCInfo::MCInfo(std::unique_ptr &&MachineModuleInfo, + LLVMModuleAndContext &&ModuleAndContext, + llvm::StringMap &FnNameToFnPtr, + std::unique_ptr &&TgtMachine, + std::unique_ptr &&McContext, + std::optional SplitIdx) + : ModuleAndContext(std::move(ModuleAndContext)), + McContext(std::move(McContext)), + MachineModuleInfo(std::move(MachineModuleInfo)), + FnNameToFnPtr(std::move(FnNameToFnPtr)), + TgtMachine(std::move(TgtMachine)), SplitIdx(SplitIdx){ + std::string BufStr; + llvm::raw_string_ostream BufOS(BufStr); + llvm::WriteBitcodeToFile(*ModuleAndContext, BufOS); + ModuleBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BufStr.size()); + memcpy(ModuleBuf->getBufferStart(), BufStr.c_str(), BufStr.size()); +} + +//============================================================================== +// SymbolAndMCInfo +//============================================================================== + +void SymbolAndMCInfo::clear() { + SymbolLinkageTypes.clear(); + McInfos.clear(); +} + +//============================================================================== +// MCLinker +//============================================================================== + +MCLinker::MCLinker( + SmallVectorImpl &SymbolAndMCInfos, + llvm::TargetMachine &TgtMachine, + llvm::StringMap SymbolLinkageTypes) + : SymbolAndMCInfos(SymbolAndMCInfos), TgtMachine(TgtMachine), + SymbolLinkageTypes(std::move(SymbolLinkageTypes)) { + + llvm::TargetMachine &LLVMTgtMachine = + static_cast(TgtMachine); + + MachineModInfoPass = + new llvm::MachineModuleInfoWrapperPass(&LLVMTgtMachine); +} + + +Expected MCLinker::linkLLVMModules(StringRef moduleName) { + Expected createModuleResult = + LinkedModule.create([&](llvm::LLVMContext &ctx) { + return std::make_unique(moduleName, ctx); + }); + + if (createModuleResult.isError()) + return Error("failed to create an empty LLVMModule for MCLinker"); + + llvm::Linker linker(*linkedModule); + + for (auto [i, smcInfos] : llvm::enumerate(symbolAndMCInfos)) { + for (auto &[key, value] : smcInfos->symbolLinkageTypes) + symbolLinkageTypes.insert({key, value}); + + for (auto [j, mcInfo] : llvm::enumerate(smcInfos->mcInfos)) { + mcInfos.push_back(mcInfo.get()); + + // Modules have to be in the same LLVMContext to be linked. 
+ llvm::Expected> moduleOr = + llvm::parseBitcodeFile( + llvm::MemoryBufferRef( + StringRef(mcInfo->moduleBuf->getBufferStart(), + mcInfo->moduleBuf->getBufferSize()), + ""), + linkedModule->getContext()); + if (!moduleOr) + return Error("failed to serialize post-llc modules"); + + std::unique_ptr module = std::move(moduleOr.get()); + if (linker.linkInModule(std::move(module))) + return Error("failed to link post-llc modules"); + mcInfo->mcContext->setUseNamesOnTempLabels(true); + } + } + + // Restore linkage type. + for (llvm::GlobalValue &global : linkedModule->globals()) { + if (!global.hasWeakLinkage()) + continue; + auto iter = symbolLinkageTypes.find(global.getName().str()); + if (iter == symbolLinkageTypes.end()) + continue; + + global.setLinkage(iter->second); + global.setDSOLocal(true); + } + + for (llvm::Function &fn : linkedModule->functions()) { + if (!fn.hasWeakLinkage()) + continue; + + auto iter = symbolLinkageTypes.find(fn.getName().str()); + if (iter == symbolLinkageTypes.end()) + continue; + + fn.setLinkage(iter->second); + fn.setDSOLocal(true); + } + + return {}; +} + +void MCLinker::prepareMachineModuleInfo( + llvm::TargetMachine &llvmTargetMachine) { + for (auto [i, smcInfos] : llvm::enumerate(symbolAndMCInfos)) { + for (auto [j, mcInfo] : llvm::enumerate(smcInfos->mcInfos)) { + // Move MachineFunctions from each split's codegen result + // into machineModInfoPass to print out together in one .o + llvm::DenseMap> &machineFunctions = + getMachineFunctionsFromMachineModuleInfo(*mcInfo->machineModuleInfo); + + llvm::StringMap &fnNameToFnPtr = + mcInfo->fnNameToFnPtr; + + mcInfo->machineModuleInfo->getContext().setObjectFileInfo( + llvmTargetMachine.getObjFileLowering()); + + for (auto &fn : linkedModule->functions()) { + if (fn.isDeclaration()) + continue; + if (machineModInfoPass->getMMI().getMachineFunction(fn)) + continue; + + auto fnPtrIter = fnNameToFnPtr.find(fn.getName().str()); + if (fnPtrIter == fnNameToFnPtr.end()) + continue; + auto mfPtrIter = machineFunctions.find(fnPtrIter->second); + if (mfPtrIter == machineFunctions.end()) + continue; + + llvm::Function &origFn = mfPtrIter->second->getFunction(); + + machineModInfoPass->getMMI().insertFunction( + fn, std::move(mfPtrIter->second)); + + // Restore function linkage types. + if (!origFn.hasWeakLinkage()) + continue; + + auto iter = symbolLinkageTypes.find(fn.getName().str()); + if (iter == symbolLinkageTypes.end()) + continue; + + origFn.setLinkage(iter->second); + origFn.setDSOLocal(true); + } + + // Restore global variable linkage types. + for (auto &global : mcInfo->moduleAndContext->globals()) { + if (!global.hasWeakLinkage()) + continue; + auto iter = symbolLinkageTypes.find(global.getName().str()); + if (iter == symbolLinkageTypes.end()) + continue; + + global.setLinkage(iter->second); + global.setDSOLocal(true); + } + + // Release memory as soon as possible to reduce peak memory footprint. 
+ mcInfo->machineModuleInfo.reset(); + mcInfo->fnNameToFnPtr.clear(); + mcInfo->moduleBuf.reset(); + } + } +} + +llvm::Module * +MCLinker::getModuleToPrintOneSplit(llvm::TargetMachine &llvmTargetMachine) { + auto &mcInfo = symbolAndMCInfos[0]->mcInfos[0]; + + llvm::DenseMap> + &machineFunctions = + getMachineFunctionsFromMachineModuleInfo(*mcInfo->machineModuleInfo); + + mcInfo->machineModuleInfo->getContext().setObjectFileInfo( + llvmTargetMachine.getObjFileLowering()); + + for (auto &fn : mcInfo->moduleAndContext->functions()) { + if (fn.isDeclaration()) + continue; + + auto mfPtrIter = machineFunctions.find(&fn); + if (mfPtrIter == machineFunctions.end()) + continue; + + machineModInfoPass->getMMI().insertFunction(fn, + std::move(mfPtrIter->second)); + } + + mcInfo->mcContext->setUseNamesOnTempLabels(true); + // Release memory as soon as possible to reduce peak memory footprint. + mcInfo->machineModuleInfo.reset(); + mcInfo->fnNameToFnPtr.clear(); + mcInfo->moduleBuf.reset(); + return &(*mcInfo->moduleAndContext); +} + +ErrorOr MCLinker::linkAndPrint(StringRef moduleName, + bool emitAssembly) { + + llvm::TargetMachine &llvmTargetMachine = + static_cast(targetMachine); + + llvmTargetMachine.Options.MCOptions.AsmVerbose = options.verboseOutput; + llvmTargetMachine.Options.MCOptions.PreserveAsmComments = + options.verboseOutput; + + bool hasOneSplit = + symbolAndMCInfos.size() == 1 && symbolAndMCInfos[0]->mcInfos.size() == 1; + + llvm::Module *oneSplitModule = nullptr; + + if (!hasOneSplit) { + if (isNVPTXBackend(options)) { + // For NVPTX backend to avoid false hit + // with its stale AnnotationCache which is populated during both + // llvm-opt and llc pipeline passes but is only cleared at the end of + // codegen in AsmPrint. We need to make sure that llvm-opt and llc + // are using the sname llvm::Module to that the cache can be properly + // cleaned. We currently achieve this by keeping only one split for NVPTX + // compilation. + return Error("NVPTX compilation should have multiple splits."); + } + + // link at llvm::Module level. + ErrorOrSuccess lmResult = linkLLVMModules(moduleName); + if (lmResult.isError()) + return Error(lmResult.getError()); + + prepareMachineModuleInfo(llvmTargetMachine); + + // Function ordering may be changed in the linkedModule due to Linker, + // but the original order matters for NVPTX backend to generate function + // declaration properly to avoid use before def/decl illegal instructions. + // Sort the linkedModule's functions back to to its original order + // (only definition matter, declaration doesn't). + if (isNVPTXBackend(options)) { + linkedModule->getFunctionList().sort( + [&](const auto &lhs, const auto &rhs) { + if (lhs.isDeclaration() && rhs.isDeclaration()) + return true; + + if (lhs.isDeclaration()) + return false; + + if (rhs.isDeclaration()) + return true; + + auto iter1 = originalFnOrdering.find(lhs.getName()); + if (iter1 == originalFnOrdering.end()) + return true; + auto iter2 = originalFnOrdering.find(rhs.getName()); + if (iter2 == originalFnOrdering.end()) + return true; + + return iter1->second < iter2->second; + }); + } + } else { + oneSplitModule = getModuleToPrintOneSplit(llvmTargetMachine); + oneSplitModule->setModuleIdentifier(moduleName); + } + + // Prepare AsmPrint pipeline. + WriteableBufferRef linkedObj = WriteableBuffer::get(); + + llvm::legacy::PassManager passMgr; + // Add an appropriate TargetLibraryInfo pass for the module's triple. 
+ llvm::TargetLibraryInfoImpl targetLibInfo(llvm::Triple(options.targetTriple)); + + // Add AsmPrint pass and run the pass manager. + passMgr.add(new llvm::TargetLibraryInfoWrapperPass(targetLibInfo)); + if (KGEN::addPassesToAsmPrint(options, llvmTargetMachine, passMgr, *linkedObj, + emitAssembly + ? llvm::CodeGenFileType::AssemblyFile + : llvm::CodeGenFileType::ObjectFile, + true, machineModInfoPass, mcInfos)) { + // Release some of the AsyncValue memory to avoid + // wrong version of LLVMContext destructor being called due to + // multiple LLVM being statically linked in dylibs that have + // access to this code path. + for (SymbolAndMCInfo *smcInfo : symbolAndMCInfos) + smcInfo->clear(); + + return Error("failed to add to ObjectFile Print pass"); + } + + const_cast( + llvmTargetMachine.getObjFileLowering()) + ->Initialize(machineModInfoPass->getMMI().getContext(), targetMachine); + + llvm::Module &moduleToRun = hasOneSplit ? *oneSplitModule : *linkedModule; + passMgr.run(moduleToRun); + + // Release some of the AsyncValue memory to avoid + // wrong version of LLVMContext destructor being called due to + // multiple LLVM being statically linked in dylibs that have + // access to this code path. + for (SymbolAndMCInfo *smcInfo : symbolAndMCInfos) + smcInfo->clear(); + + return linkedObj; +} From e8879e1e50af9885636885d55ae6aceb660999ed Mon Sep 17 00:00:00 2001 From: Weiwei Chen Date: Tue, 25 Mar 2025 16:19:57 -0400 Subject: [PATCH 0003/1029] Add MCLinker and friends. --- llvm/include/llvm/MCLinker/MCLinker.h | 10 +- llvm/include/llvm/MCLinker/MCPipeline.h | 37 +++ llvm/lib/MCLinker/CMakeLists.txt | 7 +- llvm/lib/MCLinker/MCLinker.cpp | 320 ++++++++++-------------- llvm/lib/MCLinker/MCLinkerUtils.cpp | 156 ++++++++++++ llvm/lib/MCLinker/MCLinkerUtils.h | 63 +++++ llvm/lib/MCLinker/MCPipeline.cpp | 159 ++++++++++++ 7 files changed, 554 insertions(+), 198 deletions(-) create mode 100644 llvm/include/llvm/MCLinker/MCPipeline.h create mode 100644 llvm/lib/MCLinker/MCLinkerUtils.cpp create mode 100644 llvm/lib/MCLinker/MCLinkerUtils.h create mode 100644 llvm/lib/MCLinker/MCPipeline.cpp diff --git a/llvm/include/llvm/MCLinker/MCLinker.h b/llvm/include/llvm/MCLinker/MCLinker.h index 41acbb2a451b7..7050f181a97ad 100644 --- a/llvm/include/llvm/MCLinker/MCLinker.h +++ b/llvm/include/llvm/MCLinker/MCLinker.h @@ -10,8 +10,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_MCLINKER_H -#define LLVM_MCLINKER_H +#ifndef LLVM_MCLINKER_MCLINKER_H +#define LLVM_MCLINKER_MCLINKER_H #include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineFunction.h" @@ -79,7 +79,6 @@ struct MCInfo { std::optional SplitIdx; }; - //============================================================================== // SymbolAndMCInfo //============================================================================== @@ -109,8 +108,9 @@ class MCLinker { llvm::StringMap SymbolLinkageTypes); /// Link multiple MC results and AsmPrint into one .o file. 
- ErrorOr> - linkAndPrint(StringRef ModuleName); + Expected> + linkAndPrint(StringRef ModuleName, llvm::CodeGenFileType CodegenType, + bool VerboseOutput); private: SmallVectorImpl &SymbolAndMCInfos; diff --git a/llvm/include/llvm/MCLinker/MCPipeline.h b/llvm/include/llvm/MCLinker/MCPipeline.h new file mode 100644 index 0000000000000..1d78d996ace49 --- /dev/null +++ b/llvm/include/llvm/MCLinker/MCPipeline.h @@ -0,0 +1,37 @@ +//===- MCPipeline.h - Passes to run with MCLinker --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MCLINKER_MCPIPELINE_H +#define LLVM_MCLINKER_MCPIPELINE_H + +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/MCLinker/MCLinker.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +namespace mclinker { +/// Build a pipeline that does machine specific codgen but stops before +/// AsmPrint. +bool addPassesToEmitMC(llvm::TargetMachine &, llvm::legacy::PassManagerBase &, + llvm::raw_pwrite_stream &, bool, + llvm::MachineModuleInfoWrapperPass *, unsigned); + +/// Build a pipeline that does AsmPrint only. +bool addPassesToAsmPrint(llvm::TargetMachine &, llvm::legacy::PassManagerBase &, + llvm::raw_pwrite_stream &, llvm::CodeGenFileType, bool, + llvm::MachineModuleInfoWrapperPass *, + llvm::SmallVectorImpl &); +} // namespace mclinker + +} // namespace llvm + +#endif diff --git a/llvm/lib/MCLinker/CMakeLists.txt b/llvm/lib/MCLinker/CMakeLists.txt index fe7103b105576..67728c5fb0c34 100644 --- a/llvm/lib/MCLinker/CMakeLists.txt +++ b/llvm/lib/MCLinker/CMakeLists.txt @@ -1,15 +1,18 @@ add_llvm_component_library(LLVMMCLinker + MCLinkerUtils.cpp MCLinker.cpp + MCPipeline.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/ModuleSplitter ${LLVM_MAIN_INCLUDE_DIR}/llvm/MCLinker LINK_COMPONENTS - Core - IRReader BitReader BitWriter + Core + IRReader + Linker Support TransformUtils ) diff --git a/llvm/lib/MCLinker/MCLinker.cpp b/llvm/lib/MCLinker/MCLinker.cpp index 5411533097729..178933e803c0e 100644 --- a/llvm/lib/MCLinker/MCLinker.cpp +++ b/llvm/lib/MCLinker/MCLinker.cpp @@ -8,11 +8,19 @@ // //===----------------------------------------------------------------------===// - #include "llvm/MCLinker/MCLinker.h" +#include "MCLinkerUtils.h" +#include "llvm/MCLinker/MCPipeline.h" +#include "MCLinkerUtils.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" using namespace llvm; #define DEBUG_TYPE "mclinker" @@ -22,16 +30,16 @@ using namespace llvm; //============================================================================== MCInfo::MCInfo(std::unique_ptr &&MachineModuleInfo, - LLVMModuleAndContext &&ModuleAndContext, - llvm::StringMap &FnNameToFnPtr, - std::unique_ptr &&TgtMachine, - std::unique_ptr &&McContext, - std::optional SplitIdx) - : ModuleAndContext(std::move(ModuleAndContext)), - McContext(std::move(McContext)), - MachineModuleInfo(std::move(MachineModuleInfo)), - 
FnNameToFnPtr(std::move(FnNameToFnPtr)), - TgtMachine(std::move(TgtMachine)), SplitIdx(SplitIdx){ + LLVMModuleAndContext &&ModuleAndContext, + llvm::StringMap &FnNameToFnPtr, + std::unique_ptr &&TgtMachine, + std::unique_ptr &&McContext, + std::optional SplitIdx) + : ModuleAndContext(std::move(ModuleAndContext)), + McContext(std::move(McContext)), + MachineModuleInfo(std::move(MachineModuleInfo)), + FnNameToFnPtr(std::move(FnNameToFnPtr)), + TgtMachine(std::move(TgtMachine)), SplitIdx(SplitIdx) { std::string BufStr; llvm::raw_string_ostream BufOS(BufStr); llvm::WriteBitcodeToFile(*ModuleAndContext, BufOS); @@ -62,274 +70,204 @@ MCLinker::MCLinker( llvm::TargetMachine &LLVMTgtMachine = static_cast(TgtMachine); - MachineModInfoPass = - new llvm::MachineModuleInfoWrapperPass(&LLVMTgtMachine); + MachineModInfoPass = new llvm::MachineModuleInfoWrapperPass(&LLVMTgtMachine); } - Expected MCLinker::linkLLVMModules(StringRef moduleName) { - Expected createModuleResult = + Expected CreateModuleOr = LinkedModule.create([&](llvm::LLVMContext &ctx) { return std::make_unique(moduleName, ctx); }); - if (createModuleResult.isError()) - return Error("failed to create an empty LLVMModule for MCLinker"); + if (!CreateModuleOr) { + return make_error( + "failed to create an empty LLVMModule for MCLinker", + inconvertibleErrorCode()); + } - llvm::Linker linker(*linkedModule); + llvm::Linker ModuleLinker(*LinkedModule); - for (auto [i, smcInfos] : llvm::enumerate(symbolAndMCInfos)) { - for (auto &[key, value] : smcInfos->symbolLinkageTypes) - symbolLinkageTypes.insert({key, value}); + for (auto [i, SmcInfos] : llvm::enumerate(SymbolAndMCInfos)) { + for (auto &[key, value] : SmcInfos->SymbolLinkageTypes) + SymbolLinkageTypes.insert({key, value}); - for (auto [j, mcInfo] : llvm::enumerate(smcInfos->mcInfos)) { - mcInfos.push_back(mcInfo.get()); + for (auto [j, McInfo] : llvm::enumerate(SmcInfos->McInfos)) { + McInfos.push_back(McInfo.get()); // Modules have to be in the same LLVMContext to be linked. - llvm::Expected> moduleOr = + llvm::Expected> ModuleOr = llvm::parseBitcodeFile( llvm::MemoryBufferRef( - StringRef(mcInfo->moduleBuf->getBufferStart(), - mcInfo->moduleBuf->getBufferSize()), + StringRef(McInfo->ModuleBuf->getBufferStart(), + McInfo->ModuleBuf->getBufferSize()), ""), - linkedModule->getContext()); - if (!moduleOr) - return Error("failed to serialize post-llc modules"); - - std::unique_ptr module = std::move(moduleOr.get()); - if (linker.linkInModule(std::move(module))) - return Error("failed to link post-llc modules"); - mcInfo->mcContext->setUseNamesOnTempLabels(true); + LinkedModule->getContext()); + if (!ModuleOr) { + return make_error("failed to serialize post-llc modules", + inconvertibleErrorCode()); + } + + std::unique_ptr M = std::move(ModuleOr.get()); + + if (ModuleLinker.linkInModule(std::move(M))) { + return make_error("failed to link post-llc modules", + inconvertibleErrorCode()); + } + + McInfo->McContext->setUseNamesOnTempLabels(true); } } - // Restore linkage type. - for (llvm::GlobalValue &global : linkedModule->globals()) { - if (!global.hasWeakLinkage()) + // Restore linkage type! 
+ for (llvm::GlobalValue &G : LinkedModule->globals()) { + if (!G.hasWeakLinkage()) continue; - auto iter = symbolLinkageTypes.find(global.getName().str()); - if (iter == symbolLinkageTypes.end()) + auto Iter = SymbolLinkageTypes.find(G.getName().str()); + if (Iter == SymbolLinkageTypes.end()) continue; - global.setLinkage(iter->second); - global.setDSOLocal(true); + G.setLinkage(Iter->second); + G.setDSOLocal(true); } - for (llvm::Function &fn : linkedModule->functions()) { - if (!fn.hasWeakLinkage()) + for (llvm::Function &F : LinkedModule->functions()) { + if (!F.hasWeakLinkage()) continue; - auto iter = symbolLinkageTypes.find(fn.getName().str()); - if (iter == symbolLinkageTypes.end()) + auto Iter = SymbolLinkageTypes.find(F.getName().str()); + if (Iter == SymbolLinkageTypes.end()) continue; - fn.setLinkage(iter->second); - fn.setDSOLocal(true); + F.setLinkage(Iter->second); + F.setDSOLocal(true); } - return {}; + return true; } void MCLinker::prepareMachineModuleInfo( llvm::TargetMachine &llvmTargetMachine) { - for (auto [i, smcInfos] : llvm::enumerate(symbolAndMCInfos)) { - for (auto [j, mcInfo] : llvm::enumerate(smcInfos->mcInfos)) { + for (auto [i, SmcInfos] : llvm::enumerate(SymbolAndMCInfos)) { + for (auto [j, McInfo] : llvm::enumerate(SmcInfos->McInfos)) { // Move MachineFunctions from each split's codegen result // into machineModInfoPass to print out together in one .o llvm::DenseMap> &machineFunctions = - getMachineFunctionsFromMachineModuleInfo(*mcInfo->machineModuleInfo); + llvm::mclinker::getMachineFunctionsFromMachineModuleInfo( + *McInfo->MachineModuleInfo); - llvm::StringMap &fnNameToFnPtr = - mcInfo->fnNameToFnPtr; + llvm::StringMap &FnNameToFnPtr = + McInfo->FnNameToFnPtr; - mcInfo->machineModuleInfo->getContext().setObjectFileInfo( - llvmTargetMachine.getObjFileLowering()); + McInfo->MachineModuleInfo->getContext().setObjectFileInfo( + TgtMachine.getObjFileLowering()); - for (auto &fn : linkedModule->functions()) { - if (fn.isDeclaration()) + for (auto &Fn : LinkedModule->functions()) { + if (Fn.isDeclaration()) continue; - if (machineModInfoPass->getMMI().getMachineFunction(fn)) + if (MachineModInfoPass->getMMI().getMachineFunction(Fn)) continue; - auto fnPtrIter = fnNameToFnPtr.find(fn.getName().str()); - if (fnPtrIter == fnNameToFnPtr.end()) + auto FnPtrIter = FnNameToFnPtr.find(Fn.getName().str()); + if (FnPtrIter == FnNameToFnPtr.end()) continue; - auto mfPtrIter = machineFunctions.find(fnPtrIter->second); - if (mfPtrIter == machineFunctions.end()) + auto MfPtrIter = machineFunctions.find(FnPtrIter->second); + if (MfPtrIter == machineFunctions.end()) continue; - llvm::Function &origFn = mfPtrIter->second->getFunction(); + llvm::Function &OrigFn = MfPtrIter->second->getFunction(); - machineModInfoPass->getMMI().insertFunction( - fn, std::move(mfPtrIter->second)); + MachineModInfoPass->getMMI().insertFunction( + Fn, std::move(MfPtrIter->second)); // Restore function linkage types. - if (!origFn.hasWeakLinkage()) + if (!OrigFn.hasWeakLinkage()) continue; - auto iter = symbolLinkageTypes.find(fn.getName().str()); - if (iter == symbolLinkageTypes.end()) + auto Iter = SymbolLinkageTypes.find(Fn.getName().str()); + if (Iter == SymbolLinkageTypes.end()) continue; - origFn.setLinkage(iter->second); - origFn.setDSOLocal(true); + OrigFn.setLinkage(Iter->second); + OrigFn.setDSOLocal(true); } // Restore global variable linkage types. 
- for (auto &global : mcInfo->moduleAndContext->globals()) { - if (!global.hasWeakLinkage()) + for (auto &G : McInfo->ModuleAndContext->globals()) { + if (!G.hasWeakLinkage()) continue; - auto iter = symbolLinkageTypes.find(global.getName().str()); - if (iter == symbolLinkageTypes.end()) + auto Iter = SymbolLinkageTypes.find(G.getName().str()); + if (Iter == SymbolLinkageTypes.end()) continue; - global.setLinkage(iter->second); - global.setDSOLocal(true); + G.setLinkage(Iter->second); + G.setDSOLocal(true); } // Release memory as soon as possible to reduce peak memory footprint. - mcInfo->machineModuleInfo.reset(); - mcInfo->fnNameToFnPtr.clear(); - mcInfo->moduleBuf.reset(); + McInfo->MachineModuleInfo.reset(); + McInfo->FnNameToFnPtr.clear(); + McInfo->ModuleBuf.reset(); } } } -llvm::Module * -MCLinker::getModuleToPrintOneSplit(llvm::TargetMachine &llvmTargetMachine) { - auto &mcInfo = symbolAndMCInfos[0]->mcInfos[0]; - - llvm::DenseMap> - &machineFunctions = - getMachineFunctionsFromMachineModuleInfo(*mcInfo->machineModuleInfo); - - mcInfo->machineModuleInfo->getContext().setObjectFileInfo( - llvmTargetMachine.getObjFileLowering()); - - for (auto &fn : mcInfo->moduleAndContext->functions()) { - if (fn.isDeclaration()) - continue; - - auto mfPtrIter = machineFunctions.find(&fn); - if (mfPtrIter == machineFunctions.end()) - continue; - - machineModInfoPass->getMMI().insertFunction(fn, - std::move(mfPtrIter->second)); - } - - mcInfo->mcContext->setUseNamesOnTempLabels(true); - // Release memory as soon as possible to reduce peak memory footprint. - mcInfo->machineModuleInfo.reset(); - mcInfo->fnNameToFnPtr.clear(); - mcInfo->moduleBuf.reset(); - return &(*mcInfo->moduleAndContext); -} - -ErrorOr MCLinker::linkAndPrint(StringRef moduleName, - bool emitAssembly) { - - llvm::TargetMachine &llvmTargetMachine = - static_cast(targetMachine); - - llvmTargetMachine.Options.MCOptions.AsmVerbose = options.verboseOutput; - llvmTargetMachine.Options.MCOptions.PreserveAsmComments = - options.verboseOutput; +Expected> +MCLinker::linkAndPrint(StringRef ModuleName, llvm::CodeGenFileType CodegenType, + bool VerboseOutput) { - bool hasOneSplit = - symbolAndMCInfos.size() == 1 && symbolAndMCInfos[0]->mcInfos.size() == 1; + llvm::TargetMachine &LLVMTgtMachine = + static_cast(TgtMachine); - llvm::Module *oneSplitModule = nullptr; + LLVMTgtMachine.Options.MCOptions.AsmVerbose = VerboseOutput; + LLVMTgtMachine.Options.MCOptions.PreserveAsmComments = VerboseOutput; - if (!hasOneSplit) { - if (isNVPTXBackend(options)) { - // For NVPTX backend to avoid false hit - // with its stale AnnotationCache which is populated during both - // llvm-opt and llc pipeline passes but is only cleared at the end of - // codegen in AsmPrint. We need to make sure that llvm-opt and llc - // are using the sname llvm::Module to that the cache can be properly - // cleaned. We currently achieve this by keeping only one split for NVPTX - // compilation. - return Error("NVPTX compilation should have multiple splits."); - } + // link at llvm::Module level. + Expected LMResultOr = linkLLVMModules(ModuleName); + if (!LMResultOr) + return LMResultOr.takeError(); - // link at llvm::Module level. 
- ErrorOrSuccess lmResult = linkLLVMModules(moduleName); - if (lmResult.isError()) - return Error(lmResult.getError()); - - prepareMachineModuleInfo(llvmTargetMachine); - - // Function ordering may be changed in the linkedModule due to Linker, - // but the original order matters for NVPTX backend to generate function - // declaration properly to avoid use before def/decl illegal instructions. - // Sort the linkedModule's functions back to to its original order - // (only definition matter, declaration doesn't). - if (isNVPTXBackend(options)) { - linkedModule->getFunctionList().sort( - [&](const auto &lhs, const auto &rhs) { - if (lhs.isDeclaration() && rhs.isDeclaration()) - return true; - - if (lhs.isDeclaration()) - return false; - - if (rhs.isDeclaration()) - return true; - - auto iter1 = originalFnOrdering.find(lhs.getName()); - if (iter1 == originalFnOrdering.end()) - return true; - auto iter2 = originalFnOrdering.find(rhs.getName()); - if (iter2 == originalFnOrdering.end()) - return true; - - return iter1->second < iter2->second; - }); - } - } else { - oneSplitModule = getModuleToPrintOneSplit(llvmTargetMachine); - oneSplitModule->setModuleIdentifier(moduleName); - } + prepareMachineModuleInfo(LLVMTgtMachine); // Prepare AsmPrint pipeline. - WriteableBufferRef linkedObj = WriteableBuffer::get(); - - llvm::legacy::PassManager passMgr; + llvm::legacy::PassManager PassMgr; + SmallString<1024> Buf; + raw_svector_ostream BufOS(Buf); // Add an appropriate TargetLibraryInfo pass for the module's triple. - llvm::TargetLibraryInfoImpl targetLibInfo(llvm::Triple(options.targetTriple)); + llvm::TargetLibraryInfoImpl TargetLibInfo(TgtMachine.getTargetTriple()); // Add AsmPrint pass and run the pass manager. - passMgr.add(new llvm::TargetLibraryInfoWrapperPass(targetLibInfo)); - if (KGEN::addPassesToAsmPrint(options, llvmTargetMachine, passMgr, *linkedObj, - emitAssembly - ? llvm::CodeGenFileType::AssemblyFile - : llvm::CodeGenFileType::ObjectFile, - true, machineModInfoPass, mcInfos)) { + PassMgr.add(new llvm::TargetLibraryInfoWrapperPass(TargetLibInfo)); + if (llvm::mclinker::addPassesToAsmPrint(LLVMTgtMachine, PassMgr, BufOS, + CodegenType, true, MachineModInfoPass, + McInfos)) { // Release some of the AsyncValue memory to avoid // wrong version of LLVMContext destructor being called due to // multiple LLVM being statically linked in dylibs that have // access to this code path. - for (SymbolAndMCInfo *smcInfo : symbolAndMCInfos) - smcInfo->clear(); + for (SymbolAndMCInfo *SmcInfo : SymbolAndMCInfos) + SmcInfo->clear(); - return Error("failed to add to ObjectFile Print pass"); + return make_error("failed to add to ObjectFile Print pass", + inconvertibleErrorCode()); } + std::unique_ptr LinkedObj = + WritableMemoryBuffer::getNewUninitMemBuffer(Buf.size()); + memcpy(LinkedObj->getBufferStart(), Buf.c_str(), Buf.size()); + const_cast( - llvmTargetMachine.getObjFileLowering()) - ->Initialize(machineModInfoPass->getMMI().getContext(), targetMachine); + LLVMTgtMachine.getObjFileLowering()) + ->Initialize(MachineModInfoPass->getMMI().getContext(), TgtMachine); - llvm::Module &moduleToRun = hasOneSplit ? *oneSplitModule : *linkedModule; - passMgr.run(moduleToRun); + PassMgr.run(*LinkedModule); // Release some of the AsyncValue memory to avoid // wrong version of LLVMContext destructor being called due to // multiple LLVM being statically linked in dylibs that have // access to this code path. 
- for (SymbolAndMCInfo *smcInfo : symbolAndMCInfos) - smcInfo->clear(); + for (SymbolAndMCInfo *SmcInfo : SymbolAndMCInfos) + SmcInfo->clear(); - return linkedObj; + return LinkedObj; } diff --git a/llvm/lib/MCLinker/MCLinkerUtils.cpp b/llvm/lib/MCLinker/MCLinkerUtils.cpp new file mode 100644 index 0000000000000..c01487e7a9e11 --- /dev/null +++ b/llvm/lib/MCLinker/MCLinkerUtils.cpp @@ -0,0 +1,156 @@ +//===--- MCLinkerUtils.cpp - MCLinkerUtils-----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "MCLinkerUtils.h" +#include "llvm/CodeGen/CodeGenTargetMachineImpl.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace { + +// Helpers to access private field of llvm::MachineModuleInfo::MachineFunctions. +using MFAccessor = llvm::DenseMap> + llvm::MachineModuleInfo::*; +MFAccessor getMFAccessor(); +template struct RobberMFFromMachineModuleInfo { + friend MFAccessor getMFAccessor() { return Instance; } +}; +template struct RobberMFFromMachineModuleInfo< + &llvm::MachineModuleInfo::MachineFunctions>; + +// Helpers to access private field of llvm::MachineFunction::FunctionNumber. +using MFNumberAccessor = unsigned llvm::MachineFunction::*; +MFNumberAccessor getMFNumberAccessor(); +template struct RobberMFNumberFromMachineFunction { + friend MFNumberAccessor getMFNumberAccessor() { return Instance; } +}; +template struct RobberMFNumberFromMachineFunction< + &llvm::MachineFunction::FunctionNumber>; + +// Helpers to access private field of llvm::MachineFunction::STI. +using STIAccessor = const llvm::TargetSubtargetInfo *llvm::MachineFunction::*; +STIAccessor getSTIAccessor(); +template struct RobberSTIFromMachineFunction { + friend STIAccessor getSTIAccessor() { return Instance; } +}; +template struct RobberSTIFromMachineFunction<&llvm::MachineFunction::STI>; + +// Helpers to access private field of llvm::MachineModuleInfo::NextFnNum. +using NextFnNumAccessor = unsigned llvm::MachineModuleInfo::*; +NextFnNumAccessor getNextFnNumAccessor(); +template +struct RobberNextFnNumFromMachineModuleInfo { + friend NextFnNumAccessor getNextFnNumAccessor() { return Instance; } +}; +template struct RobberNextFnNumFromMachineModuleInfo< + &llvm::MachineModuleInfo::NextFnNum>; + +// Helpers to access private field of llvm::TargetMachine::STI. +using MCSubtargetInfoAccessor = + std::unique_ptr llvm::TargetMachine::*; +MCSubtargetInfoAccessor getMCSubtargetInfo(); +template +struct RobberMCSubtargetInfoFromTargetMachine { + friend MCSubtargetInfoAccessor getMCSubtargetInfo() { return Instance; } +}; +template struct RobberMCSubtargetInfoFromTargetMachine< + &llvm::TargetMachine::STI>; + +// Helpers to access private functions +template struct LLVMPrivateFnAccessor { + /* export it ... */ + using type = typename Tag::type; + static type Ptr; +}; + +template +typename LLVMPrivateFnAccessor::type LLVMPrivateFnAccessor::Ptr; + +template +struct LLVMPrivateFnAccessorRob : LLVMPrivateFnAccessor { + /* fill it ... 
*/ + struct Filler { + Filler() { LLVMPrivateFnAccessor::Ptr = p; } + }; + static Filler FillerObj; +}; + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wglobal-constructors" +template +typename LLVMPrivateFnAccessorRob::Filler + LLVMPrivateFnAccessorRob::FillerObj; +#pragma GCC diagnostic pop + +// Helpers to access private functions of llvm::MachineModuleInfo::NextFnNum. +struct MCContextGetSymbolEntryAccessor { + using type = llvm::MCSymbolTableEntry &(llvm::MCContext::*)(llvm::StringRef); +}; +template struct LLVMPrivateFnAccessorRob; + +// Helpers to access private field of llvm::LLVMTargetMachine::reset. +struct TargetMachineClearSubtargetMapAccessor { + using type = void (llvm::CodeGenTargetMachineImpl::*)(); +}; +template struct LLVMPrivateFnAccessorRob< + TargetMachineClearSubtargetMapAccessor, + &llvm::CodeGenTargetMachineImpl::reset>; + +} // namespace + +llvm::DenseMap> & +llvm::mclinker::getMachineFunctionsFromMachineModuleInfo( + llvm::MachineModuleInfo &MachineModuleInfo) { + return std::invoke(getMFAccessor(), MachineModuleInfo); +} + +void llvm::mclinker::setMachineFunctionNumber(llvm::MachineFunction &Mf, + unsigned Number) { + unsigned &OrigNumber = std::invoke(getMFNumberAccessor(), Mf); + OrigNumber = Number; +} + +void llvm::mclinker::setNextFnNum(llvm::MachineModuleInfo &MMI, + unsigned Value) { + unsigned &NextFnNum = std::invoke(getNextFnNumAccessor(), MMI); + NextFnNum = Value; +} + +llvm::MCSymbolTableEntry & +llvm::mclinker::getMCContextSymbolTableEntry(llvm::StringRef Name, + llvm::MCContext &McContext) { + return (McContext.* + LLVMPrivateFnAccessor::Ptr)(Name); +} + +void llvm::mclinker::releaseTargetMachineConstants(llvm::TargetMachine &TM) { + std::unique_ptr &McSubtargetInfo = + std::invoke(getMCSubtargetInfo(), TM); + McSubtargetInfo.reset(); + + llvm::CodeGenTargetMachineImpl &TgtMachine = + static_cast(TM); + (TgtMachine.* + LLVMPrivateFnAccessor::Ptr)(); +} + +void llvm::mclinker::resetSubtargetInfo(llvm::TargetMachine &Dst, + llvm::MachineModuleInfo &MMI) { + + llvm::DenseMap> + &MFs = getMachineFunctionsFromMachineModuleInfo(MMI); + + for (auto &[Fn, MF] : MFs) { + const llvm::TargetSubtargetInfo *NewSTI = Dst.getSubtargetImpl(*Fn); + const llvm::TargetSubtargetInfo *&STI = std::invoke(getSTIAccessor(), MF); + STI = NewSTI; + } +} diff --git a/llvm/lib/MCLinker/MCLinkerUtils.h b/llvm/lib/MCLinker/MCLinkerUtils.h new file mode 100644 index 0000000000000..497f786334885 --- /dev/null +++ b/llvm/lib/MCLinker/MCLinkerUtils.h @@ -0,0 +1,63 @@ +//===- MCLinkerUtils.h - MCLinker utility Functions -------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MCLINKER_MCLINKERUTILS_H +#define LLVM_MCLINKER_MCLINKERUTILS_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSymbolTableEntry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +namespace mclinker { +// A few helper functions to access LLVM private class/struct members: +// http://bloglitb.blogspot.com/2010/07/access-to-private-members-thats-easy.html + +/// Wrapping accessing LLVM data structure's private filed accessor for +/// linking at MC-level where a few things need to be globalized such as: +/// - llvm::MachineFunction's numbering, +/// - all unique_ptrs of llvm::MachineFunctions in each split to be put +/// together for the final AsmPrint +/// - MCSymbol propagation for external global symbols to each split's +/// MCContext to avoid duplicates for X86's OrcJIT execution engine. + +/// Get private field +/// DenseMap> MachineFunctions +/// from llvm::MachineModuleInfo. +llvm::DenseMap> & +getMachineFunctionsFromMachineModuleInfo(llvm::MachineModuleInfo &); + +/// Set private field FunctionNumber in llvm::MachineFunction. +void setMachineFunctionNumber(llvm::MachineFunction &, unsigned); + +/// Set private field NextFnNum in llvm::MachineModuleInfo. +void setNextFnNum(llvm::MachineModuleInfo &, unsigned); + +/// Call private member function +/// MCSymbolTableEntry &getSymbolTableEntry(StringRef Name) +/// from llvm::MCContext. +llvm::MCSymbolTableEntry &getMCContextSymbolTableEntry(llvm::StringRef, + llvm::MCContext &); + +/// Release MCSubTargetInfo. +void releaseTargetMachineConstants(llvm::TargetMachine &); + +/// Clear SubtargetMap in SubtargetInfo. +void resetSubtargetInfo(llvm::TargetMachine &, llvm::MachineModuleInfo &); + +} // namespace mclinker +} // namespace llvm + +#endif diff --git a/llvm/lib/MCLinker/MCPipeline.cpp b/llvm/lib/MCLinker/MCPipeline.cpp new file mode 100644 index 0000000000000..9ea69d1ab226d --- /dev/null +++ b/llvm/lib/MCLinker/MCPipeline.cpp @@ -0,0 +1,159 @@ +//===--- MCPipeline.cpp - MCPipeline ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "llvm/MCLinker/MCPipeline.h" + +#include "MCLinkerUtils.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Passes/PassBuilder.h" + +using namespace llvm; +using namespace llvm::mclinker; + +namespace { +class SetMachineFunctionBasePass : public llvm::ImmutablePass { +public: + static char ID; // Pass identification, replacement for typeid + SetMachineFunctionBasePass(llvm::MachineModuleInfo &MMI, unsigned Base); + + // Initialization and Finalization + bool doInitialization(llvm::Module &) override; + bool doFinalization(llvm::Module &) override; + +private: + llvm::MachineModuleInfo &MMI; + unsigned Base; +}; +} // namespace + +char SetMachineFunctionBasePass::ID; + +SetMachineFunctionBasePass::SetMachineFunctionBasePass( + llvm::MachineModuleInfo &MMI, unsigned Base) + : llvm::ImmutablePass(ID), MMI(MMI), Base(Base) {} + +// Initialization and Finalization +bool SetMachineFunctionBasePass::doInitialization(llvm::Module &) { + setNextFnNum(MMI, Base); + return false; +} + +bool SetMachineFunctionBasePass::doFinalization(llvm::Module &) { + return false; +} + +/// Build a pipeline that does machine specific codgen but stops before +/// AsmPrint. Returns true if failed. +bool llvm::mclinker::addPassesToEmitMC( + llvm::TargetMachine &TgtMachine, llvm::legacy::PassManagerBase &PM, + llvm::raw_pwrite_stream &Out, bool DisableVerify, + llvm::MachineModuleInfoWrapperPass *MMIWP, unsigned NumFnBase) { + // Targets may override createPassConfig to provide a target-specific + // subclass. + TargetPassConfig *PassConfig = TgtMachine.createPassConfig(PM); + + // Set PassConfig options provided by TargetMachine. + PassConfig->setDisableVerify(DisableVerify); + PM.add(PassConfig); + PM.add(MMIWP); + + auto *SetFnBaseP = new SetMachineFunctionBasePass(MMIWP->getMMI(), NumFnBase); + PM.add(SetFnBaseP); + + if (PassConfig->addISelPasses()) + return true; + + PassConfig->addMachinePasses(); + PassConfig->setInitialized(); + + return false; +} + +/// Function pass to populate external MCSymbols to other llvm module split's +/// MCContext so that they can be unique across all splits. This uniqueing +/// is required for ORCJIT (not for generating binary .o). +namespace { +class SyncX86SymbolTables : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + explicit SyncX86SymbolTables(SmallVectorImpl &); + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + SmallVectorImpl &McInfos; + DenseSet ExternSymbols; + + // Populate MCSymbol to all the MCContexts. 
+ void populateSymbol(StringRef, const MCSymbolTableValue &, MCContext *); +}; +} // namespace + +char SyncX86SymbolTables::ID; + +SyncX86SymbolTables::SyncX86SymbolTables(SmallVectorImpl &McInfos) + : MachineFunctionPass(ID), McInfos(McInfos) {} + +void SyncX86SymbolTables::populateSymbol(StringRef Name, + const llvm::MCSymbolTableValue &Value, + MCContext *SrcCtx) { + for (MCInfo *McInfo : McInfos) { + MCContext &CurrCtx = *McInfo->McContext; + if (&CurrCtx == SrcCtx) + continue; + MCSymbolTableEntry &Entry = + llvm::mclinker::getMCContextSymbolTableEntry(Name, CurrCtx); + if (!Entry.second.Symbol) { + Entry.second.Symbol = Value.Symbol; + Entry.second.NextUniqueID = Value.NextUniqueID; + Entry.second.Used = Value.Used; + } + } +} + +bool SyncX86SymbolTables::runOnMachineFunction(MachineFunction &MF) { + MCContext &Ctx = MF.getContext(); + for (auto &[Name, SymbolEntry] : Ctx.getSymbols()) { + if (!SymbolEntry.Symbol || !SymbolEntry.Symbol->isExternal() || + ExternSymbols.contains(Name)) + continue; + ExternSymbols.insert(Name); + populateSymbol(Name, SymbolEntry, &Ctx); + } + return false; +} + +/// Build a pipeline that does AsmPrint only. +/// Returns true if failed. +bool llvm::mclinker::addPassesToAsmPrint( + llvm::TargetMachine &TgtMachine, llvm::legacy::PassManagerBase &PM, + llvm::raw_pwrite_stream &Out, llvm::CodeGenFileType FileType, + bool DisableVerify, llvm::MachineModuleInfoWrapperPass *MMIWP, + llvm::SmallVectorImpl &McInfos) { + TargetPassConfig *PassConfig = TgtMachine.createPassConfig(PM); + if (!PassConfig) + return true; + // Set PassConfig options provided by TargetMachine. + PassConfig->setDisableVerify(DisableVerify); + PM.add(PassConfig); + PM.add(MMIWP); + PassConfig->setInitialized(); + + bool Result = TgtMachine.addAsmPrinter(PM, Out, nullptr, FileType, + MMIWP->getMMI().getContext()); + + if (TgtMachine.getTargetTriple().isX86()) + PM.add(new SyncX86SymbolTables(McInfos)); + return Result; +} From 3ef33066bb32a9d9b76d72c6de6a7ae9ff72ddcc Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 29 Mar 2025 19:21:34 +0000 Subject: [PATCH 0004/1029] [UnrollAndJam] Do not preserve loop nests if a loop was fully unrolled. (#133510) If UnJ completely unrolls a loop and removes it entirely, the loop remains in the current loop nest. If the loop nest gets reused the loops will no longer be valid. As there is no way to remove a loop from a LoopNest, this patch removes the preserve of the LoopNestAnalysis so that it will be regenerated. 
Fixes #124518 --- .../Scalar/LoopUnrollAndJamPass.cpp | 15 +++++--- .../LoopUnrollAndJam/delete_middle_loop.ll | 36 +++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/LoopUnrollAndJam/delete_middle_loop.ll diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index f1d1f3bc1e307..4fe74c7c3bbcd 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -425,7 +425,7 @@ static bool tryToUnrollAndJamLoop(LoopNest &LN, DominatorTree &DT, LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC, DependenceInfo &DI, OptimizationRemarkEmitter &ORE, int OptLevel, - LPMUpdater &U) { + LPMUpdater &U, bool &AnyLoopRemoved) { bool DidSomething = false; ArrayRef Loops = LN.getLoops(); Loop *OutmostLoop = &LN.getOutermostLoop(); @@ -441,8 +441,11 @@ static bool tryToUnrollAndJamLoop(LoopNest &LN, DominatorTree &DT, LoopInfo &LI, tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); if (Result != LoopUnrollResult::Unmodified) DidSomething = true; - if (L == OutmostLoop && Result == LoopUnrollResult::FullyUnrolled) - U.markLoopAsDeleted(*L, LoopName); + if (Result == LoopUnrollResult::FullyUnrolled) { + if (L == OutmostLoop) + U.markLoopAsDeleted(*L, LoopName); + AnyLoopRemoved = true; + } } return DidSomething; @@ -457,11 +460,13 @@ PreservedAnalyses LoopUnrollAndJamPass::run(LoopNest &LN, DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); OptimizationRemarkEmitter ORE(&F); + bool AnyLoopRemoved = false; if (!tryToUnrollAndJamLoop(LN, AR.DT, AR.LI, AR.SE, AR.TTI, AR.AC, DI, ORE, - OptLevel, U)) + OptLevel, U, AnyLoopRemoved)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); - PA.preserve(); + if (!AnyLoopRemoved) + PA.preserve(); return PA; } diff --git a/llvm/test/Transforms/LoopUnrollAndJam/delete_middle_loop.ll b/llvm/test/Transforms/LoopUnrollAndJam/delete_middle_loop.ll new file mode 100644 index 0000000000000..f8affdb821903 --- /dev/null +++ b/llvm/test/Transforms/LoopUnrollAndJam/delete_middle_loop.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="loop(invalidate,loop-unroll-and-jam,loop-unroll-and-jam)" -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s + +; This test completely unrolls the middle loop out of a 3-deep loop nest. 
+ +define i16 @test_it() { +; CHECK-LABEL: define i16 @test_it() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND_LOOPEXIT:.*]]: +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: br label %[[DO_BODY2:.*]] +; CHECK: [[DO_BODY2]]: +; CHECK-NEXT: br label %[[WHILE_COND3:.*]] +; CHECK: [[WHILE_COND3]]: +; CHECK-NEXT: br i1 true, label %[[DO_COND:.*]], label %[[WHILE_COND3]] +; CHECK: [[DO_COND]]: +; CHECK-NEXT: br label %[[FOR_COND_LOOPEXIT]] +; +entry: + br label %for.cond + +for.cond: ; preds = %do.cond, %entry + br label %do.body2 + +do.body2: ; preds = %do.cond, %for.cond + br label %while.cond3 + +while.cond3: ; preds = %while.cond3, %do.body2 + br i1 true, label %do.cond, label %while.cond3 + +do.cond: ; preds = %while.cond3 + br i1 true, label %for.cond, label %do.body2 +} + From 884b19ab40c8b6e5d1fb54894c0418fd27bb93f9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 12:22:13 -0700 Subject: [PATCH 0005/1029] [clang-tools-extra] Use *Set::insert_range (NFC) (#133589) --- .../clang-tidy/bugprone/SignalHandlerCheck.cpp | 11 ++++------- .../InconsistentDeclarationParameterNameCheck.cpp | 4 +--- clang-tools-extra/clangd/ConfigCompile.cpp | 3 +-- clang-tools-extra/clangd/index/dex/Dex.cpp | 3 +-- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp index 902490f4d33c1..27045816a80d3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.cpp @@ -333,13 +333,10 @@ SignalHandlerCheck::SignalHandlerCheck(StringRef Name, : ClangTidyCheck(Name, Context), AsyncSafeFunctionSet(Options.get("AsyncSafeFunctionSet", AsyncSafeFunctionSetKind::POSIX)) { - if (AsyncSafeFunctionSet == AsyncSafeFunctionSetKind::Minimal) { - for (StringRef v : MinimalConformingFunctions) - ConformingFunctions.insert(v); - } else { - for (StringRef v : POSIXConformingFunctions) - ConformingFunctions.insert(v); - } + if (AsyncSafeFunctionSet == AsyncSafeFunctionSetKind::Minimal) + ConformingFunctions.insert_range(MinimalConformingFunctions); + else + ConformingFunctions.insert_range(POSIXConformingFunctions); } void SignalHandlerCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp index c41f81b0f0b58..a60a4faa32a16 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.cpp @@ -343,9 +343,7 @@ void InconsistentDeclarationParameterNameCheck::check( void InconsistentDeclarationParameterNameCheck::markRedeclarationsAsVisited( const FunctionDecl *OriginalDeclaration) { - for (const FunctionDecl *Redecl : OriginalDeclaration->redecls()) { - VisitedDeclarations.insert(Redecl); - } + VisitedDeclarations.insert_range(OriginalDeclaration->redecls()); } } // namespace clang::tidy::readability diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp index 21304a8c0fac7..3d7f792aa136b 100644 --- a/clang-tools-extra/clangd/ConfigCompile.cpp +++ b/clang-tools-extra/clangd/ConfigCompile.cpp @@ -439,8 +439,7 @@ struct FragmentCompiler { [Normalized(std::move(Normalized))](const 
Params &, Config &C) { if (C.Diagnostics.SuppressAll) return; - for (llvm::StringRef N : Normalized) - C.Diagnostics.Suppress.insert(N); + C.Diagnostics.Suppress.insert_range(Normalized); }); if (F.UnusedIncludes) { diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp index 5643ba0c5e4ce..575a96a112979 100644 --- a/clang-tools-extra/clangd/index/dex/Dex.cpp +++ b/clang-tools-extra/clangd/index/dex/Dex.cpp @@ -181,8 +181,7 @@ std::unique_ptr Dex::createFileProximityIterator( Sources[Path] = SourceParams(); auto PathURI = URI::create(Path).toString(); const auto PathProximityURIs = generateProximityURIs(PathURI.c_str()); - for (const auto &ProximityURI : PathProximityURIs) - ParentURIs.insert(ProximityURI); + ParentURIs.insert_range(PathProximityURIs); } // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults // for all parameters except for Proximity Path distance signal. From 9c6eca28cbd4689173bc8b0733ffe39363481103 Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 29 Mar 2025 19:25:17 +0000 Subject: [PATCH 0006/1029] [AArch64] Return an invalid cost for vscale x 2 x i128 srem. This protects against invalid size requests on scalable vectors by checking the original VT, not the legalized type when checking for scalars. The cost returned is now invalid, which lines up with the codegen not being able to produce a result. --- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/test/Analysis/CostModel/AArch64/sve-rem.ll | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d072ad63ea3e3..e320b0e653ad4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3726,7 +3726,7 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // add/cmp/csel/csneg should have similar cost while asr/negs/and should // have similar cost. auto VT = TLI->getValueType(DL, Ty); - if (LT.second.isScalarInteger() && VT.getSizeInBits() <= 64) { + if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) { if (Op2Info.isPowerOf2()) { return ISD == ISD::SDIV ? 
(3 * AddCost + AsrCost) : (3 * AsrCost + AddCost); diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll index ed88f1b390b68..9d8f43cd99368 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll @@ -23,6 +23,7 @@ define void @srem() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = srem <16 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32i8 = srem <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64i8 = srem <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i64 = srem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV4i64 = srem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV8i64 = srem undef, undef @@ -61,6 +62,7 @@ define void @srem() { %V16i8 = srem <16 x i8> undef, undef %V32i8 = srem <32 x i8> undef, undef %V64i8 = srem <64 x i8> undef, undef + %NV2i128 = srem undef, undef %NV2i64 = srem undef, undef %NV4i64 = srem undef, undef %NV8i64 = srem undef, undef @@ -102,6 +104,7 @@ define void @urem() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = urem <16 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32i8 = urem <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64i8 = urem <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i64 = urem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV4i64 = urem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV8i64 = urem undef, undef @@ -140,6 +143,7 @@ define void @urem() { %V16i8 = urem <16 x i8> undef, undef %V32i8 = urem <32 x i8> undef, undef %V64i8 = urem <64 x i8> undef, undef + %NV2i128 = urem undef, undef %NV2i64 = urem undef, undef %NV4i64 = urem undef, undef %NV8i64 = urem undef, undef @@ -181,6 +185,7 @@ define void @srem_uniformconst() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, splat (i128 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i64 = srem undef, splat (i64 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV4i64 = srem undef, splat (i64 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV8i64 = srem undef, splat (i64 7) @@ -219,6 +224,7 @@ define void @srem_uniformconst() { %V16i8 = srem <16 x i8> undef, splat (i8 7) %V32i8 = srem <32 x i8> undef, splat (i8 7) %V64i8 = srem <64 x i8> undef, splat (i8 7) + %NV2i128 = srem undef, splat (i128 7) %NV2i64 = srem undef, splat (i64 7) %NV4i64 = srem undef, splat (i64 7) %NV8i64 = srem undef, splat (i64 7) @@ -260,6 +266,7 @@ define void 
@urem_uniformconst() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, splat (i128 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i64 = urem undef, splat (i64 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV4i64 = urem undef, splat (i64 7) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV8i64 = urem undef, splat (i64 7) @@ -298,6 +305,7 @@ define void @urem_uniformconst() { %V16i8 = urem <16 x i8> undef, splat (i8 7) %V32i8 = urem <32 x i8> undef, splat (i8 7) %V64i8 = urem <64 x i8> undef, splat (i8 7) + %NV2i128 = urem undef, splat (i128 7) %NV2i64 = urem undef, splat (i64 7) %NV4i64 = urem undef, splat (i64 7) %NV8i64 = urem undef, splat (i64 7) @@ -339,6 +347,7 @@ define void @srem_uniformconstpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, splat (i128 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i64 = srem undef, splat (i64 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i64 = srem undef, splat (i64 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i64 = srem undef, splat (i64 16) @@ -377,6 +386,7 @@ define void @srem_uniformconstpow2() { %V16i8 = srem <16 x i8> undef, splat (i8 16) %V32i8 = srem <32 x i8> undef, splat (i8 16) %V64i8 = srem <64 x i8> undef, splat (i8 16) + %NV2i128 = srem undef, splat (i128 16) %NV2i64 = srem undef, splat (i64 16) %NV4i64 = srem undef, splat (i64 16) %NV8i64 = srem undef, splat (i64 16) @@ -418,6 +428,7 @@ define void @urem_uniformconstpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, splat (i128 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i64 = urem undef, splat (i64 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i64 = urem undef, splat (i64 16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i64 = urem undef, splat (i64 16) @@ -456,6 +467,7 @@ define void @urem_uniformconstpow2() { %V16i8 = urem <16 x i8> undef, splat (i8 16) %V32i8 = urem <32 x i8> undef, splat (i8 16) %V64i8 = urem <64 x i8> undef, splat (i8 16) + %NV2i128 = urem undef, splat (i128 16) %NV2i64 = urem undef, splat (i64 16) %NV4i64 = urem undef, splat (i64 16) %NV8i64 = urem undef, splat (i64 16) @@ -497,6 +509,7 @@ define void @srem_uniformconstnegpow2() { ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, splat (i128 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i64 = srem undef, splat (i64 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV4i64 = srem undef, splat (i64 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV8i64 = srem undef, splat (i64 -16) @@ -535,6 +548,7 @@ define void @srem_uniformconstnegpow2() { %V16i8 = srem <16 x i8> undef, splat (i8 -16) %V32i8 = srem <32 x i8> undef, splat (i8 -16) %V64i8 = srem <64 x i8> undef, splat (i8 -16) + %NV2i128 = srem undef, splat (i128 -16) %NV2i64 = srem undef, splat (i64 -16) %NV4i64 = srem undef, splat (i64 -16) %NV8i64 = srem undef, splat (i64 -16) @@ -576,6 +590,7 @@ define void @urem_uniformconstnegpow2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, splat (i128 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i64 = urem undef, splat (i64 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV4i64 = urem undef, splat (i64 -16) ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV8i64 = urem undef, splat (i64 -16) @@ -614,6 +629,7 @@ define void @urem_uniformconstnegpow2() { %V16i8 = urem <16 x i8> undef, splat (i8 -16) %V32i8 = urem <32 x i8> undef, splat (i8 -16) %V64i8 = urem <64 x i8> undef, splat (i8 -16) + %NV2i128 = urem undef, splat (i128 -16) %NV2i64 = urem undef, splat (i64 -16) %NV4i64 = urem undef, splat (i64 -16) %NV8i64 = urem undef, splat (i64 -16) From 0cd82327ff71282b2bfc51090074a3fd63e4842d Mon Sep 17 00:00:00 2001 From: Qinkun Bao Date: Sat, 29 Mar 2025 12:54:15 -0700 Subject: [PATCH 0007/1029] Fix some typos (NFC) (#133558) --- clang/docs/BoundsSafety.rst | 4 ++-- clang/docs/ConstantInterpreter.rst | 2 +- clang/docs/HIPSupport.rst | 2 +- clang/docs/HLSL/AvailabilityDiagnostics.rst | 2 +- clang/docs/MSVCCompatibility.rst | 2 +- clang/docs/SanitizerCoverage.rst | 2 +- .../FlowSensitive/MultiVarConstantPropagationTest.cpp | 4 ++-- clang/unittests/Analysis/FlowSensitive/TransferTest.cpp | 2 +- clang/unittests/StaticAnalyzer/CallEventTest.cpp | 2 +- clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp | 4 ++-- clang/unittests/Tooling/ExecutionTest.cpp | 2 +- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/clang/docs/BoundsSafety.rst b/clang/docs/BoundsSafety.rst index cf5b0c75c0387..4b70b34eb4100 100644 --- a/clang/docs/BoundsSafety.rst +++ b/clang/docs/BoundsSafety.rst @@ -227,7 +227,7 @@ meaning they do not have ABI implications. annotated with ``__ended_by(Q)``. In this case, the end of the range extends to the pointer ``Q``. 
This is used for "iterator" support in C where you're iterating from one pointer value to another until a final pointer value is - reached (and the final pointer value is not dereferencable). + reached (and the final pointer value is not dereferenceable). Accessing a pointer outside the specified bounds causes a run-time trap or a compile-time error. Also, the model maintains correctness of bounds annotations @@ -507,7 +507,7 @@ Default pointer types in ``typeof()`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When ``typeof()`` takes an expression, it respects the bounds annotation on -the expression type, including the bounds annotation is implcit. For example, +the expression type, including the bounds annotation is implicit. For example, the global variable ``g`` in the following code is implicitly ``__single`` so ``typeof(g)`` gets ``char *__single``. The similar is true for the parameter ``p``, so ``typeof(p)`` returns ``void *__single``. The local variable ``l`` is diff --git a/clang/docs/ConstantInterpreter.rst b/clang/docs/ConstantInterpreter.rst index b08cb1ce353be..a71ee4b430a6e 100644 --- a/clang/docs/ConstantInterpreter.rst +++ b/clang/docs/ConstantInterpreter.rst @@ -293,4 +293,4 @@ TypeInfoPointer ``TypeInfoPointer`` tracks two types: the type assigned to ``std::type_info`` and the type which was passed to ``typeinfo``. -It is part of the taged union in ``Pointer``. +It is part of the tagged union in ``Pointer``. diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst index 8f473c21e1918..b2ac53843aeed 100644 --- a/clang/docs/HIPSupport.rst +++ b/clang/docs/HIPSupport.rst @@ -704,7 +704,7 @@ Open Questions / Future Developments SPIR-V Support on HIPAMD ToolChain ================================== -The HIPAMD ToolChain supports targetting +The HIPAMD ToolChain supports targeting `AMDGCN Flavoured SPIR-V `_. The support for SPIR-V in the ROCm and HIPAMD ToolChain is under active development. diff --git a/clang/docs/HLSL/AvailabilityDiagnostics.rst b/clang/docs/HLSL/AvailabilityDiagnostics.rst index c2f260f268e7b..0db94f4b15a86 100644 --- a/clang/docs/HLSL/AvailabilityDiagnostics.rst +++ b/clang/docs/HLSL/AvailabilityDiagnostics.rst @@ -48,7 +48,7 @@ Strict Diagnostic Mode When strict HLSL availability diagnostic mode is enabled the compiler must report all HLSL API availability issues regardless of code reachability. The implementation of this mode takes advantage of an existing diagnostic scan in ``DiagnoseUnguardedAvailability`` class which is already traversing AST of each function as soon as the function body has been parsed. For HLSL, this pass was only slightly modified, such as making sure diagnostic messages are in the ``hlsl-availability`` group and that availability checks based on shader stage are not included if the shader stage context is unknown. -If the compilation target is a shader library, only availability based on shader model version can be diagnosed during this scan. To diagnose availability based on shader stage, the compiler needs to run the AST traversals implementated in ``DiagnoseHLSLAvailability`` at the end of the translation unit as described above. +If the compilation target is a shader library, only availability based on shader model version can be diagnosed during this scan. To diagnose availability based on shader stage, the compiler needs to run the AST traversals implemented in ``DiagnoseHLSLAvailability`` at the end of the translation unit as described above. 
As a result, availability based on specific shader stage will only be diagnosed in code that is reachable from a shader entry point or library export function. It also means that function bodies might be scanned multiple time. When that happens, care should be taken not to produce duplicated diagnostics. diff --git a/clang/docs/MSVCCompatibility.rst b/clang/docs/MSVCCompatibility.rst index 0b6fea597f8d3..b4a7d23e1b2c6 100644 --- a/clang/docs/MSVCCompatibility.rst +++ b/clang/docs/MSVCCompatibility.rst @@ -240,7 +240,7 @@ In the above example ``hwPopCnt`` will not be inlined into ``PopCnt`` since ``Po With a larger function that does real work the function call overhead is negligible. However in our popcnt example there is the function call overhead. There is no analog for this specific MSVC behavior in clang. -For clang we effectively have to create the dispatch function ourselves to each specfic implementation. +For clang we effectively have to create the dispatch function ourselves to each specific implementation. SIMD vector types ================= diff --git a/clang/docs/SanitizerCoverage.rst b/clang/docs/SanitizerCoverage.rst index 6ea1d14829005..f952198295ebc 100644 --- a/clang/docs/SanitizerCoverage.rst +++ b/clang/docs/SanitizerCoverage.rst @@ -314,7 +314,7 @@ will not be instrumented. void __sanitizer_cov_trace_div4(uint32_t Val); void __sanitizer_cov_trace_div8(uint64_t Val); - // Called before a GetElemementPtr (GEP) instruction + // Called before a GetElementPtr (GEP) instruction // for every non-constant array index. void __sanitizer_cov_trace_gep(uintptr_t Idx); diff --git a/clang/unittests/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp b/clang/unittests/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp index ed95887a45f1a..290139f5a85ce 100644 --- a/clang/unittests/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/MultiVarConstantPropagationTest.cpp @@ -167,8 +167,8 @@ class ConstantPropagationAnalysis ? ValueLattice(R.Val.getInt().getExtValue()) : ValueLattice::top(); } else { - // An unitialized variable holds *some* value, but we don't know what it - // is (it is implementation defined), so we set it to top. + // An uninitialized variable holds *some* value, but we don't know what + // it is (it is implementation defined), so we set it to top. Vars[Var] = ValueLattice::top(); } } else if (Nodes.getNodeAs(kJustAssignment)) { diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index f52b73dbbdc57..214aaee9f97f6 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -4888,7 +4888,7 @@ TEST(TransferTest, PointerEquality) { // We won't duplicate all of the tests above with `!=`, as we know that // the implementation simply negates the result of the `==` comparison. - // Instaed, just spot-check one case. + // Instead, just spot-check one case. 
bool p1_ne_p1 = (p1 != p1); (void)0; // [[p]] diff --git a/clang/unittests/StaticAnalyzer/CallEventTest.cpp b/clang/unittests/StaticAnalyzer/CallEventTest.cpp index d5ca72acaca29..2843572e5f800 100644 --- a/clang/unittests/StaticAnalyzer/CallEventTest.cpp +++ b/clang/unittests/StaticAnalyzer/CallEventTest.cpp @@ -61,7 +61,7 @@ void addCXXDeallocatorChecker(AnalysisASTConsumer &AnalysisConsumer, } // TODO: What we should really be testing here is all the different varieties -// of delete operators, and wether the retrieval of their arguments works as +// of delete operators, and whether the retrieval of their arguments works as // intended. At the time of writing this file, CXXDeallocatorCall doesn't pick // up on much of those due to the AST not containing CXXDeleteExpr for most of // the standard/custom deletes. diff --git a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp index cd46efc8ad762..454eee9cf7e0a 100644 --- a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp +++ b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp @@ -418,8 +418,8 @@ TEST(RegisterDeps, DependencyInteraction) { // Weak dependencies are registered before strong dependencies. This is most // important for purely diagnostic checkers that are implemented as a part of - // purely modeling checkers, becuse the checker callback order will have to be - // established in between the modeling portion and the weak dependency. + // purely modeling checkers, because the checker callback order will have to + // be established in between the modeling portion and the weak dependency. EXPECT_TRUE( runCheckerOnCode("void f() {int i;}", Diags)); EXPECT_EQ(Diags, "test.RegistrationOrder: test.WeakDep\ntest." diff --git a/clang/unittests/Tooling/ExecutionTest.cpp b/clang/unittests/Tooling/ExecutionTest.cpp index b0fd7ccb950ff..c81049308f706 100644 --- a/clang/unittests/Tooling/ExecutionTest.cpp +++ b/clang/unittests/Tooling/ExecutionTest.cpp @@ -214,7 +214,7 @@ TEST(StandaloneToolTest, SimpleActionWithResult) { auto KVs = Executor.getToolResults()->AllKVResults(); ASSERT_EQ(KVs.size(), 1u); EXPECT_EQ("f", KVs[0].first); - // Currently the standlone executor returns empty corpus, revision, and + // Currently the standalone executor returns empty corpus, revision, and // compilation unit. EXPECT_EQ("::/1", KVs[0].second); From 6b9813446625408393c5a2b4d0d54470f7932944 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 29 Mar 2025 20:14:10 +0000 Subject: [PATCH 0008/1029] [VPlan] Re-enable narrowing interleave groups with interleaving. Remove the UF = 1 restriction introduced by 577631f0a5 building on top of 783a846507683, which allows updating all relevant users of the VF, VPScalarIVSteps in particular. This restores the full functionality of https://github.com/llvm/llvm-project/pull/106441. 
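For reference, the scalar source this narrowing targets looks roughly like the following C++ sketch (modeled on the `test_complex_add_double` case in the updated tests; the struct name is illustrative). Consecutive {re, im} pairs are loaded and stored in the same interleaved order, so the de-interleave and re-interleave shuffles cancel and each group becomes plain contiguous vector loads and stores, now also when the plan is interleaved (UF > 1):

#include <cstddef>

struct Cplx { double re, im; };

// Each iteration touches one {re, im} pair in all three arrays; the
// vectorizer forms an interleave group per array and can narrow it.
void complex_add(Cplx *res, const Cplx *a, const Cplx *b, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    res[i].re = a[i].re + b[i].re;
    res[i].im = a[i].im + b[i].im;
  }
}

int main() {
  Cplx a[2] = {{1, 2}, {3, 4}}, b[2] = {{5, 6}, {7, 8}}, r[2];
  complex_add(r, a, b, 2);
  return r[0].re == 6.0 ? 0 : 1;
}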
--- .../Transforms/Vectorize/VPlanTransforms.cpp | 4 ++- ...-narrow-interleave-to-widen-memory-cost.ll | 30 +++++-------------- ...arrow-interleave-to-widen-memory-unroll.ll | 20 ++++--------- 3 files changed, 17 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ed89a6eefe311..8852540aec931 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2472,7 +2472,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, unsigned VectorRegWidth) { using namespace llvm::VPlanPatternMatch; VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); - if (VF.isScalable() || !VectorLoop || Plan.getUF() != 1) + if (VF.isScalable() || !VectorLoop) return; VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); @@ -2599,5 +2599,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, auto *Inc = cast(CanIV->getBackedgeValue()); Inc->setOperand(1, Plan.getOrAddLiveIn(ConstantInt::get( CanIV->getScalarType(), 1 * Plan.getUF()))); + Plan.getVF().replaceAllUsesWith( + Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1))); removeDeadRecipes(Plan); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index d730dee9416b2..2171277650541 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -92,36 +92,22 @@ define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x 
double> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]] -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[WIDE_LOAD1]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = load <2 x double>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <2 x double>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = load <2 x double>, ptr [[TMP4]], align 4 +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = load <2 x double>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]] ; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[STRIDED_VEC5]], [[STRIDED_VEC11]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP15]], <4 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[TMP10]], align 4 +; CHECK-NEXT: store <2 x double> [[TMP15]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index 4e81be2d20209..e57a5758265f0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -13,24 +13,16 @@ define void @load_store_interleave_group(ptr noalias %data) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP3]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x 
i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x i64> [[WIDE_VEC2]], <4 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC]], <2 x i64> [[STRIDED_VEC1]], <4 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[STRIDED_VEC3]], <2 x i64> [[STRIDED_VEC4]], <4 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC5:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC5]], ptr [[TMP5]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 +; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD]], ptr [[TMP4]], align 8 +; CHECK-NEXT: store <2 x i64> [[WIDE_LOAD1]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: From be7f185ca16e990614113519d003bfe3685f489b Mon Sep 17 00:00:00 2001 From: Alcaro Date: Sat, 29 Mar 2025 21:34:15 +0100 Subject: [PATCH 0009/1029] [NFC][analyzer] Fix typo in VirtualCall checker docs (#133593) --- clang/docs/analyzer/checkers.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 482a7ca3340c5..f91b2af1fd105 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -610,7 +610,7 @@ pure virtual – but may be still surprising for the programmer.) void releaseResources() { // warn: This can call the pure virtual method A::getKind() when this is // called from the destructor. - callSomeFunction(getKind()) + callSomeFunction(getKind()); } }; @@ -936,7 +936,7 @@ checker does not report them**. void releaseResources() { // warn: This can be called within ~A() and calls A::getKind() even if // we are destructing a class that is derived from A. 
- callSomeFunction(getKind()) + callSomeFunction(getKind()); } }; From 3d43739753f3b50824a09343efe40a2a476488df Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 14:26:36 -0700 Subject: [PATCH 0010/1029] [Basic] Use SmallSet::insert_range (NFC) (#133594) --- clang/include/clang/Basic/TargetInfo.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 497d68779b92b..93cffe84e2f42 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1175,8 +1175,7 @@ class TargetInfo : public TransferrableTargetInfo, } void setRequiresImmediate(llvm::ArrayRef<int> Exacts) { Flags |= CI_ImmediateConstant; - for (int Exact : Exacts) - ImmSet.insert(Exact); + ImmSet.insert_range(Exacts); } void setRequiresImmediate(int Exact) { Flags |= CI_ImmediateConstant; From d66af9c69b6960ad5f903cc6c0db99395dace6af Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 14:27:10 -0700 Subject: [PATCH 0011/1029] [mlir] Use SetVector::insert_range (NFC) (#133595) --- mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp | 3 +-- mlir/lib/Tools/lsp-server-support/CompilationDatabase.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 8e407cc1b348f..91862d2e17d71 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -1542,8 +1542,7 @@ mlir::scf::tileConsumerAndFuseProducersUsingSCF( if (failed(tilingResult)) return rewriter.notifyMatchFailure(consumer, "failed to tile consumer"); - for (auto *tiledOp : tilingResult->tiledOps) - tiledAndFusedOps.insert(tiledOp); + tiledAndFusedOps.insert_range(tilingResult->tiledOps); DenseMap<Value, Value> replacements; for (auto [origVal, replacement] : llvm::zip_equal( diff --git a/mlir/lib/Tools/lsp-server-support/CompilationDatabase.cpp b/mlir/lib/Tools/lsp-server-support/CompilationDatabase.cpp index 3688b6975a630..9ae0674383a1d 100644 --- a/mlir/lib/Tools/lsp-server-support/CompilationDatabase.cpp +++ b/mlir/lib/Tools/lsp-server-support/CompilationDatabase.cpp @@ -104,8 +104,7 @@ void CompilationDatabase::loadDatabase(StringRef filename) { } // Track the includes for the file. - for (StringRef include : it.first->second.includeDirs) - knownIncludes.insert(include); + knownIncludes.insert_range(it.first->second.includeDirs); } // Add all of the known includes to the default file info. We don't know any From abf9c1a18d2875b6a8f98eba3761b8389818e1bf Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sat, 29 Mar 2025 22:54:57 +0100 Subject: [PATCH 0012/1029] [libc++] Switch a few attributes to use the C++11 syntax (#133293) The C++11 attribute syntax is stricter about where attributes can be applied, reducing the number of possible positions an attribute can appear in.
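A minimal standalone illustration of the practical difference (not part of the patch; compiles with Clang or GCC):

#include <cstdlib>

// GNU spelling: accepted in several positions around the declaration.
__attribute__((__noreturn__)) void f() { std::abort(); }
void __attribute__((__noreturn__)) g() { std::abort(); }

// C++11 spelling: only valid in fixed grammar positions, so a misplaced
// attribute is a hard error instead of something that quietly parses.
[[noreturn]] void h() { std::abort(); }
// void [[noreturn]] k(); // error: an attribute list cannot appear here

int main() { return 0; }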
--- libcxx/include/__config | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index 070298301b0d3..30fe0ef6a3b53 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1157,8 +1157,8 @@ typedef __char32_t char32_t; # define _LIBCPP_USING_IF_EXISTS # endif -# if __has_attribute(__no_destroy__) -# define _LIBCPP_NO_DESTROY __attribute__((__no_destroy__)) +# if __has_cpp_attribute(_Clang::__no_destroy__) +# define _LIBCPP_NO_DESTROY [[_Clang::__no_destroy__]] # else # define _LIBCPP_NO_DESTROY # endif @@ -1188,14 +1188,14 @@ typedef __char32_t char32_t; # define _LIBCPP_NO_SPECIALIZATIONS # endif -# if __has_attribute(__standalone_debug__) -# define _LIBCPP_STANDALONE_DEBUG __attribute__((__standalone_debug__)) +# if __has_cpp_attribute(_Clang::__standalone_debug__) +# define _LIBCPP_STANDALONE_DEBUG [[_Clang::__standalone_debug__]] # else # define _LIBCPP_STANDALONE_DEBUG # endif -# if __has_attribute(__preferred_name__) -# define _LIBCPP_PREFERRED_NAME(x) __attribute__((__preferred_name__(x))) +# if __has_cpp_attribute(_Clang::__preferred_name__) +# define _LIBCPP_PREFERRED_NAME(x) [[_Clang::__preferred_name__(x)]] # else # define _LIBCPP_PREFERRED_NAME(x) # endif From 31c37a4a5ebc0d13d86a91c300e15177e9d77ddf Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sat, 29 Mar 2025 15:18:44 -0700 Subject: [PATCH 0013/1029] [RISCV][TTI] Adjust VLS shuffle costing to account for sub-mask reuse (#129793) If we have a shuffle which can be split via VLA where two or more of the destinations have exactly the same elements, then we only need to account for them once in costing. The duplicate copies are (at worst) whole register moves. Note that this change only handles the single source case. Doing the multiple source case seemed a bit more complicated, and I didn't have a motivating test case.
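The bookkeeping amounts to deduplicating (sub-mask, source register) pairs so that only the first occurrence is charged; a self-contained sketch with plain STL containers standing in for the set used in the patch:

#include <cstdio>
#include <set>
#include <utility>
#include <vector>

int main() {
  // One shuffle split into four destination registers whose sub-masks all
  // read the same elements of the same source register (cf. the cost drops
  // for %v10b and %v32a4 in the tests below).
  std::vector<std::vector<int>> subMasks = {
      {0, 1, 2, 3}, {0, 1, 2, 3}, {0, 1, 2, 3}, {0, 1, 2, 3}};
  unsigned srcReg = 0;
  std::set<std::pair<std::vector<int>, unsigned>> seen;
  unsigned charged = 0;
  for (const auto &mask : subMasks)
    if (seen.insert({mask, srcReg}).second)
      ++charged; // duplicates are at worst whole-register moves
  std::printf("charged %u of %zu sub-shuffles\n", charged, subMasks.size());
  return 0;
}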
--- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 4 ++++ llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll | 8 ++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index d5a2e1988696f..9b91de36a688a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -436,11 +436,15 @@ costShuffleViaVRegSplitting(RISCVTTIImpl &TTI, MVT LegalVT, copy(Mask, NormalizedMask.begin()); InstructionCost Cost = 0; int NumShuffles = 0; + SmallDenseSet, unsigned>> ReusedSingleSrcShuffles; processShuffleMasks( NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {}, [&](ArrayRef RegMask, unsigned SrcReg, unsigned DestReg) { if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) return; + if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg)) + .second) + return; ++NumShuffles; Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy, RegMask, CostKind, 0, nullptr); diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll index 06c709e4cc879..23d5999237e30 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll @@ -42,7 +42,7 @@ define void @shuffle() vscale_range(2,2) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v11 = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v12 = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v13 = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v10b = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v10b = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v14 = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15 = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16 = shufflevector <2 x float> poison, <2 x float> poison, <2 x i32> @@ -58,7 +58,7 @@ define void @shuffle() vscale_range(2,2) { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v11 = shufflevector <2 x i16> poison, <2 x i16> poison, <2 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v12 = shufflevector <4 x i16> poison, <4 x i16> poison, <4 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v13 = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v10b = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v10b = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v14 = shufflevector <2 x i32> poison, <2 x i32> poison, <2 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %v15 = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v16 = shufflevector <2 x float> poison, <2 x float> poison, <2 x i32> @@ -738,7 +738,7 @@ define void @multipart() vscale_range(2,2) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32idrev = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> @@ -758,7 +758,7 @@ define void @multipart() vscale_range(2,2) { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32idrev = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> From 2f5c836e08164ce8835d520001042efe93caf950 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Sat, 29 Mar 2025 15:21:51 -0700 Subject: [PATCH 0014/1029] [SBProgress] Add swig support for `with` statement in Python (#133527) We recently added an explicit finalize to SBProgress, #128966. I realized while adding some additional implementations of SBProgress that we should add `with` support for ease of use. This patch adds an `__enter__()` method (which is a no-op) and an `__exit__()` method to swig. I also refactor the emitter for the test to leverage `with` instead of explicitly calling finalize, and I've updated the docstrings.
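Conceptually the `__enter__`/`__exit__` pair maps onto RAII; a rough C++ analog of the behavior (a hypothetical `Progress` type standing in for `lldb.SBProgress`, not the actual API):

#include <cstdio>
#include <string>
#include <utility>

class Progress {
  std::string Title;
  bool Finalized = false;

public:
  explicit Progress(std::string T) : Title(std::move(T)) {}
  void Increment(int N, const char *Detail) {
    std::printf("%s: +%d (%s)\n", Title.c_str(), N, Detail);
  }
  void Finalize() { // guard so the end event only fires once in this sketch
    if (!Finalized) {
      std::printf("%s: progressEnd\n", Title.c_str());
      Finalized = true;
    }
  }
  ~Progress() { Finalize(); } // scope exit plays the role of __exit__
};

int main() {
  Progress P("Progress tester"); // entering the scope ~ __enter__ (a no-op)
  for (int I = 1; I < 10; ++I)
    P.Increment(1, "Step");
  return 0;
} // destructor finalizes, like __exit__ calling Finalize()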
--- .../bindings/interface/SBProgressDocstrings.i | 9 +++++++- .../bindings/interface/SBProgressExtensions.i | 13 +++++++++++ lldb/bindings/interfaces.swig | 1 + .../lldb-dap/progress/Progress_emitter.py | 23 ++++++++----------- 4 files changed, 32 insertions(+), 14 deletions(-) create mode 100644 lldb/bindings/interface/SBProgressExtensions.i diff --git a/lldb/bindings/interface/SBProgressDocstrings.i b/lldb/bindings/interface/SBProgressDocstrings.i index 8d252ef1f370c..4c001d7d5ebcb 100644 --- a/lldb/bindings/interface/SBProgressDocstrings.i +++ b/lldb/bindings/interface/SBProgressDocstrings.i @@ -46,12 +46,19 @@ rely on the garbage collection when using lldb.SBProgress. Non-deterministic progresses behave the same, but omit the total in the constructor. :: - non_deterministic_progress = lldb.SBProgress('Non deterministic progress, 'Detail', lldb.SBDebugger) + non_deterministic_progress = lldb.SBProgress('Non deterministic progress', 'Detail', lldb.SBDebugger) for i in range(10): non_deterministic_progress.Increment(1) # Explicitly send a progressEnd, otherwise this will be sent # when the python runtime cleans up this object. non_deterministic_progress.Finalize() + +Additionally for Python, progress is supported in a with statement. :: + with lldb.SBProgress('Non deterministic progress', 'Detail', lldb.SBDebugger) as progress: + for i in range(10): + progress.Increment(1) + # The progress object is automatically finalized when the with statement + ") lldb::SBProgress; %feature("docstring", diff --git a/lldb/bindings/interface/SBProgressExtensions.i b/lldb/bindings/interface/SBProgressExtensions.i new file mode 100644 index 0000000000000..6ecf3a1af93b7 --- /dev/null +++ b/lldb/bindings/interface/SBProgressExtensions.i @@ -0,0 +1,13 @@ +%extend lldb::SBProgress { +#ifdef SWIGPYTHON + %pythoncode %{ + def __enter__(self): + '''No-op for with statement''' + pass + + def __exit__(self, exc_type, exc_value, traceback): + '''Finalize the progress object''' + self.Finalize() + %} +#endif +} \ No newline at end of file diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index 08df9a1a8d539..6da56e4e0fa52 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -200,6 +200,7 @@ %include "./interface/SBModuleSpecExtensions.i" %include "./interface/SBModuleSpecListExtensions.i" %include "./interface/SBProcessExtensions.i" +%include "./interface/SBProgressExtensions.i" %include "./interface/SBProcessInfoListExtensions.i" %include "./interface/SBQueueItemExtensions.i" %include "./interface/SBScriptObjectExtensions.i" diff --git a/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py b/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py index e94a09676e067..445d1bdf4e496 100644 --- a/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py +++ b/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py @@ -88,21 +88,18 @@ def __call__(self, debugger, command, exe_ctx, result): progress = lldb.SBProgress( "Progress tester", "Initial Detail", total, debugger ) - # Check to see if total is set to None to indicate an indeterminate progress # then default to 10 steps. - if total is None: - total = 10 - - for i in range(1, total): - if cmd_options.no_details: - progress.Increment(1) - else: - progress.Increment(1, f"Step {i}") - time.sleep(cmd_options.seconds) - - # Not required for deterministic progress, but required for indeterminate progress. 
- progress.Finalize() + with progress: + if total is None: + total = 10 + + for i in range(1, total): + if cmd_options.no_details: + progress.Increment(1) + else: + progress.Increment(1, f"Step {i}") + time.sleep(cmd_options.seconds) def __lldb_init_module(debugger, dict): From f8ee58a3cbbe4de2ac6d006fad8363ae6d24aa57 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sat, 29 Mar 2025 15:25:56 -0700 Subject: [PATCH 0015/1029] [RISCV] Initial codegen support for the XRivosVizip extension (#131933) This implements initial code generation support for a subset of the xrivosvizip extension. Specifically, this adds support for vzipeven, vzipodd, and vzip2a, but not vzip2b, vunzip2a, or vunzip2b. The others will follow in separate patches. One review note: The zipeven/zipodd matchers were recently rewritten to better match upstream style, so careful review there would be appreciated. The matchers don't yet support type coercion to wider types. This will be done in a future patch. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 107 +++++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 7 +- llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td | 32 ++ .../fixed-vectors-shuffle-int-interleave.ll | 310 ++++++++++++++++++ .../fixed-vectors-shuffle-zipeven-zipodd.ll | 125 +++++++ 5 files changed, 575 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index beca99c10ec72..a8c83113854c9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4563,8 +4563,10 @@ static SDValue getSingleShuffleSrc(MVT VT, SDValue V1, SDValue V2) { /// way through the source. static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, int &EvenSrc, int &OddSrc, const RISCVSubtarget &Subtarget) { - // We need to be able to widen elements to the next larger integer type. - if (VT.getScalarSizeInBits() >= Subtarget.getELen()) + // We need to be able to widen elements to the next larger integer type or + // use the zip2a instruction at e64. + if (VT.getScalarSizeInBits() >= Subtarget.getELen() && + !Subtarget.hasVendorXRivosVizip()) return false; int Size = Mask.size(); @@ -4621,6 +4623,48 @@ static bool isElementRotate(const std::array, 2> &SrcInfo, SrcInfo[1].second - SrcInfo[0].second == (int)NumElts; } +static bool isAlternating(const std::array, 2> &SrcInfo, + ArrayRef Mask, bool RequiredPolarity) { + int NumElts = Mask.size(); + for (unsigned i = 0; i != NumElts; ++i) { + int M = Mask[i]; + if (M < 0) + continue; + int Src = M >= NumElts; + int Diff = (int)i - (M % NumElts); + bool C = Src == SrcInfo[1].first && Diff == SrcInfo[1].second; + assert(C != (Src == SrcInfo[0].first && Diff == SrcInfo[0].second) && + "Must match exactly one of the two slides"); + if (RequiredPolarity != (C == i % 2)) + return false; + } + return true; +} + +/// Given a shuffle which can be represented as a pair of two slides, +/// see if it is a zipeven idiom. Zipeven is: +/// vs2: a0 a1 a2 a3 +/// vs1: b0 b1 b2 b3 +/// vd: a0 b0 a2 b2 +static bool isZipEven(const std::array, 2> &SrcInfo, + ArrayRef Mask) { + return SrcInfo[0].second == 0 && SrcInfo[1].second == 1 && + isAlternating(SrcInfo, Mask, true); +} + +/// Given a shuffle which can be represented as a pair of two slides, +/// see if it is a zipodd idiom. Zipodd is: +/// vs2: a0 a1 a2 a3 +/// vs1: b0 b1 b2 b3 +/// vd: a1 b1 a3 b3 +/// Note that the operand order is swapped due to the way we canonicalize +/// the slides, so SrCInfo[0] is vs1, and SrcInfo[1] is vs2. 
+static bool isZipOdd(const std::array, 2> &SrcInfo, + ArrayRef Mask) { + return SrcInfo[0].second == 0 && SrcInfo[1].second == -1 && + isAlternating(SrcInfo, Mask, false); +} + // Lower a deinterleave shuffle to SRL and TRUNC. Factor must be // 2, 4, 8 and the integer type Factor-times larger than VT's // element type must be a legal element type. @@ -4880,6 +4924,34 @@ static bool isSpreadMask(ArrayRef Mask, unsigned Factor, unsigned &Index) { return true; } +static SDValue lowerVZIP(unsigned Opc, SDValue Op0, SDValue Op1, + const SDLoc &DL, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + assert(RISCVISD::RI_VZIPEVEN_VL == Opc || RISCVISD::RI_VZIPODD_VL == Opc || + RISCVISD::RI_VZIP2A_VL == Opc); + assert(Op0.getSimpleValueType() == Op1.getSimpleValueType()); + + MVT VT = Op0.getSimpleValueType(); + MVT IntVT = VT.changeVectorElementTypeToInteger(); + Op0 = DAG.getBitcast(IntVT, Op0); + Op1 = DAG.getBitcast(IntVT, Op1); + + MVT ContainerVT = IntVT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(DAG, IntVT, Subtarget); + Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget); + Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget); + } + + auto [Mask, VL] = getDefaultVLOps(IntVT, ContainerVT, DL, DAG, Subtarget); + SDValue Passthru = DAG.getUNDEF(ContainerVT); + SDValue Res = DAG.getNode(Opc, DL, ContainerVT, Op0, Op1, Passthru, Mask, VL); + if (IntVT.isFixedLengthVector()) + Res = convertFromScalableVector(IntVT, Res, DAG, Subtarget); + Res = DAG.getBitcast(VT, Res); + return Res; +} + // Given a vector a, b, c, d return a vector Factor times longer // with Factor-1 undef's between elements. Ex: // a, undef, b, undef, c, undef, d, undef (Factor=2, Index=0) @@ -5619,6 +5691,15 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, DAG.getVectorIdxConstant(OddSrc % Size, DL)); } + // Prefer vzip2a if available. + // TODO: Extend to matching zip2b if EvenSrc and OddSrc allow. + if (Subtarget.hasVendorXRivosVizip()) { + EvenV = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), + EvenV, DAG.getVectorIdxConstant(0, DL)); + OddV = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), OddV, + DAG.getVectorIdxConstant(0, DL)); + return lowerVZIP(RISCVISD::RI_VZIP2A_VL, EvenV, OddV, DL, DAG, Subtarget); + } return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget); } @@ -5670,6 +5751,18 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VT, Res, DAG, Subtarget); } + if (Subtarget.hasVendorXRivosVizip() && isZipEven(SrcInfo, Mask)) { + SDValue Src1 = SrcInfo[0].first == 0 ? V1 : V2; + SDValue Src2 = SrcInfo[1].first == 0 ? V1 : V2; + return lowerVZIP(RISCVISD::RI_VZIPEVEN_VL, Src1, Src2, DL, DAG, + Subtarget); + } + if (Subtarget.hasVendorXRivosVizip() && isZipOdd(SrcInfo, Mask)) { + SDValue Src1 = SrcInfo[1].first == 0 ? V1 : V2; + SDValue Src2 = SrcInfo[0].first == 0 ? V1 : V2; + return lowerVZIP(RISCVISD::RI_VZIPODD_VL, Src1, Src2, DL, DAG, Subtarget); + } + // Build the mask. Note that vslideup unconditionally preserves elements // below the slide amount in the destination, and thus those elements are // undefined in the mask. 
If the mask ends up all true (or undef), it @@ -6733,7 +6826,7 @@ static bool hasPassthruOp(unsigned Opcode) { Opcode <= RISCVISD::LAST_STRICTFP_OPCODE && "not a RISC-V target specific op"); static_assert( - RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 127 && + RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 130 && RISCVISD::LAST_STRICTFP_OPCODE - RISCVISD::FIRST_STRICTFP_OPCODE == 21 && "adding target specific op should update this function"); if (Opcode >= RISCVISD::ADD_VL && Opcode <= RISCVISD::VFMAX_VL) @@ -6757,12 +6850,13 @@ static bool hasMaskOp(unsigned Opcode) { Opcode <= RISCVISD::LAST_STRICTFP_OPCODE && "not a RISC-V target specific op"); static_assert( - RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 127 && + RISCVISD::LAST_VL_VECTOR_OP - RISCVISD::FIRST_VL_VECTOR_OP == 130 && RISCVISD::LAST_STRICTFP_OPCODE - RISCVISD::FIRST_STRICTFP_OPCODE == 21 && "adding target specific op should update this function"); if (Opcode >= RISCVISD::TRUNCATE_VECTOR_VL && Opcode <= RISCVISD::SETCC_VL) return true; - if (Opcode >= RISCVISD::VRGATHER_VX_VL && Opcode <= RISCVISD::VFIRST_VL) + if (Opcode >= RISCVISD::VRGATHER_VX_VL && + Opcode <= RISCVISD::LAST_VL_VECTOR_OP) return true; if (Opcode >= RISCVISD::STRICT_FADD_VL && Opcode <= RISCVISD::STRICT_VFROUND_NOEXCEPT_VL) @@ -21807,6 +21901,9 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VZEXT_VL) NODE_NAME_CASE(VCPOP_VL) NODE_NAME_CASE(VFIRST_VL) + NODE_NAME_CASE(RI_VZIPEVEN_VL) + NODE_NAME_CASE(RI_VZIPODD_VL) + NODE_NAME_CASE(RI_VZIP2A_VL) NODE_NAME_CASE(READ_CSR) NODE_NAME_CASE(WRITE_CSR) NODE_NAME_CASE(SWAP_CSR) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index f4d6cd86397a4..5ebdbbd51f2b1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -403,7 +403,12 @@ enum NodeType : unsigned { // vfirst.m with additional mask and VL operands. 
VFIRST_VL, - LAST_VL_VECTOR_OP = VFIRST_VL, + // XRivosVizip + RI_VZIPEVEN_VL, + RI_VZIPODD_VL, + RI_VZIP2A_VL, + + LAST_VL_VECTOR_OP = RI_VZIP2A_VL, // Read VLENB CSR READ_VLENB, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td index 78c4ed6f00412..3fe50503f937b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXRivos.td @@ -67,6 +67,38 @@ defm RI_VUNZIP2A_V : VALU_IV_V<"ri.vunzip2a", 0b001000>; defm RI_VUNZIP2B_V : VALU_IV_V<"ri.vunzip2b", 0b011000>; } +// These are modeled after the int binop VL nodes +def ri_vzipeven_vl : SDNode<"RISCVISD::RI_VZIPEVEN_VL", SDT_RISCVIntBinOp_VL>; +def ri_vzipodd_vl : SDNode<"RISCVISD::RI_VZIPODD_VL", SDT_RISCVIntBinOp_VL>; +def ri_vzip2a_vl : SDNode<"RISCVISD::RI_VZIP2A_VL", SDT_RISCVIntBinOp_VL>; + +multiclass RIVPseudoVALU_VV { + foreach m = MxList in + defm "" : VPseudoBinaryV_VV; +} + +let Predicates = [HasVendorXRivosVizip], + Constraints = "@earlyclobber $rd, $rd = $passthru" in { +defm PseudoRI_VZIPEVEN : RIVPseudoVALU_VV; +defm PseudoRI_VZIPODD : RIVPseudoVALU_VV; +defm PseudoRI_VZIP2A : RIVPseudoVALU_VV; +} + +multiclass RIVPatBinaryVL_VV vtilist = AllIntegerVectors, + bit isSEWAware = false> { + foreach vti = vtilist in + let Predicates = GetVTypePredicates.Predicates in + def : VPatBinaryVL_V; +} + +defm : RIVPatBinaryVL_VV; +defm : RIVPatBinaryVL_VV; +defm : RIVPatBinaryVL_VV; + //===----------------------------------------------------------------------===// // XRivosVisni //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll index 6ed288ff011e7..917613d5c786f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128 ; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512 ; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512 +; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+experimental-xrivosvizip -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZIP,RV32-ZIP +; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+experimental-xrivosvizip -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZIP,RV64-ZIP ; Test optimizing interleaves to widening arithmetic. 
@@ -15,6 +17,13 @@ define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: vwmaccu.vx v10, a0, v9 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret +; +; ZIP-LABEL: interleave_v2i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9 +; ZIP-NEXT: vmv1r.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> ret <4 x i8> %a } @@ -28,6 +37,13 @@ define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) { ; CHECK-NEXT: vwmaccu.vx v10, a0, v9 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret +; +; ZIP-LABEL: interleave_v2i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9 +; ZIP-NEXT: vmv1r.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> ret <4 x i16> %a } @@ -42,6 +58,13 @@ define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) { ; CHECK-NEXT: vwmaccu.vx v10, a0, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret +; +; ZIP-LABEL: interleave_v2i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v9, v8 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> ret <4 x i32> %a } @@ -72,6 +95,14 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) { ; V512-NEXT: vslideup.vi v11, v8, 1 ; V512-NEXT: vmerge.vvm v8, v11, v10, v0 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v2i64: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZIP-NEXT: vmv1r.v v12, v9 +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> ret <4 x i64> %a } @@ -95,6 +126,13 @@ define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v8 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v4i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v9, v8 +; ZIP-NEXT: vmv1r.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> ret <8 x i8> %a } @@ -118,6 +156,13 @@ define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v4i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> ret <8 x i16> %a } @@ -141,6 +186,14 @@ define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v4i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: vmv1r.v v12, v9 +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> ret <8 x i32> %a } @@ -167,6 +220,15 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) { ; V512-NEXT: vwmaccu.vx v9, a0, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v4i32_offset_2: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; ZIP-NEXT: vslidedown.vi v10, v9, 2 +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v9, v8, v10 +; ZIP-NEXT: vmv.v.v v8, v9 +; ZIP-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> 
ret <4 x i32> %a } @@ -198,6 +260,17 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, ma ; V512-NEXT: vmerge.vvm v8, v9, v10, v0 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v4i32_offset_1: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; ZIP-NEXT: vmv.v.i v0, 8 +; ZIP-NEXT: vmv1r.v v10, v9 +; ZIP-NEXT: vslideup.vi v10, v9, 1, v0.t +; ZIP-NEXT: vmv.v.i v0, 10 +; ZIP-NEXT: ri.vzip2a.vv v11, v8, v9 +; ZIP-NEXT: vmerge.vvm v8, v11, v10, v0 +; ZIP-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> ret <4 x i32> %a } @@ -220,6 +293,13 @@ define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v8i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> ret <16 x i8> %a } @@ -244,6 +324,14 @@ define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v8 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v8i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZIP-NEXT: vmv1r.v v12, v9 +; ZIP-NEXT: ri.vzip2a.vv v10, v12, v8 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> ret <16 x i16> %a } @@ -267,6 +355,14 @@ define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v8i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; ZIP-NEXT: vmv2r.v v16, v10 +; ZIP-NEXT: ri.vzip2a.vv v12, v8, v16 +; ZIP-NEXT: vmv.v.v v8, v12 +; ZIP-NEXT: ret %a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> ret <16 x i32> %a } @@ -290,6 +386,16 @@ define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v16i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; ZIP-NEXT: vmv1r.v v12, v9 +; ZIP-NEXT: li a0, 32 +; ZIP-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> ret <32 x i8> %a } @@ -313,6 +419,16 @@ define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v16i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; ZIP-NEXT: vmv2r.v v16, v10 +; ZIP-NEXT: li a0, 32 +; ZIP-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v12, v8, v16 +; ZIP-NEXT: vmv.v.v v8, v12 +; ZIP-NEXT: ret %a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> ret <32 x i16> %a } @@ -337,6 +453,16 @@ define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) { ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v8, a0, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v16i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; ZIP-NEXT: vmv4r.v v24, v12 +; ZIP-NEXT: li a0, 32 +; ZIP-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v16, v8, v24 +; ZIP-NEXT: vmv.v.v v8, v16 +; ZIP-NEXT: ret %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> ret <32 x 
i32> %a } @@ -363,6 +489,16 @@ define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v32i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; ZIP-NEXT: vmv2r.v v16, v10 +; ZIP-NEXT: li a0, 64 +; ZIP-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v12, v8, v16 +; ZIP-NEXT: vmv.v.v v8, v12 +; ZIP-NEXT: ret %a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> ret <64 x i8> %a } @@ -391,6 +527,16 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v8, a0, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v32i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; ZIP-NEXT: vmv4r.v v24, v12 +; ZIP-NEXT: li a0, 64 +; ZIP-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v16, v8, v24 +; ZIP-NEXT: vmv.v.v v8, v16 +; ZIP-NEXT: ret %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> ret <64 x i16> %a } @@ -446,6 +592,78 @@ define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V512-NEXT: li a0, -1 ; V512-NEXT: vwmaccu.vx v8, a0, v12 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_v32i32: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: .cfi_def_cfa_offset 16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a1, 24 +; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 16 +; ZIP-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; ZIP-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; ZIP-NEXT: vslidedown.vi v24, v8, 16 +; ZIP-NEXT: li a0, 32 +; ZIP-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v16, v24, v0 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: li a2, 24 +; ZIP-NEXT: mul a1, a1, a2 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 16 +; ZIP-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZIP-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; ZIP-NEXT: vslidedown.vi v24, v24, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a1, a1, 4 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 16 +; ZIP-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; ZIP-NEXT: lui a1, 699051 +; ZIP-NEXT: addi a1, a1, -1366 +; ZIP-NEXT: vmv.s.x v0, a1 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a1, a1, 3 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 16 +; ZIP-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a1, a1, 4 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 16 +; ZIP-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: slli a1, a1, 3 +; ZIP-NEXT: add a1, sp, a1 +; ZIP-NEXT: addi a1, a1, 16 +; ZIP-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZIP-NEXT: vsetvli zero, a0, e32, m8, ta, mu +; ZIP-NEXT: ri.vzip2a.vv v16, v8, v24, v0.t +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: li a1, 24 +; ZIP-NEXT: mul a0, a0, a1 +; ZIP-NEXT: add a0, sp, a0 +; ZIP-NEXT: addi a0, a0, 16 +; ZIP-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZIP-NEXT: ri.vzip2a.vv v0, v8, v24 +; 
ZIP-NEXT: vmv.v.v v8, v0 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: slli a0, a0, 5 +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: .cfi_def_cfa sp, 16 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: .cfi_def_cfa_offset 0 +; ZIP-NEXT: ret %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> ret <64 x i32> %a } @@ -471,6 +689,15 @@ define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) { ; V512-NEXT: vwmaccu.vx v9, a0, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v4i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; ZIP-NEXT: vslidedown.vi v10, v8, 2 +; ZIP-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v9, v8, v10 +; ZIP-NEXT: vmv1r.v v8, v9 +; ZIP-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> ret <4 x i8> %a } @@ -498,6 +725,17 @@ define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) { ; V512-NEXT: vrgather.vv v9, v8, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v4i8_invalid: +; ZIP: # %bb.0: +; ZIP-NEXT: lui a0, 16 +; ZIP-NEXT: addi a0, a0, 768 +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: vmv.s.x v10, a0 +; ZIP-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; ZIP-NEXT: vrgather.vv v9, v8, v10 +; ZIP-NEXT: vmv1r.v v8, v9 +; ZIP-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> ret <4 x i8> %a } @@ -523,6 +761,15 @@ define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) { ; V512-NEXT: vwmaccu.vx v9, a0, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v4i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 2, e16, mf2, ta, ma +; ZIP-NEXT: vslidedown.vi v10, v8, 2 +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v9, v8, v10 +; ZIP-NEXT: vmv1r.v v8, v9 +; ZIP-NEXT: ret %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> ret <4 x i16> %a } @@ -548,6 +795,15 @@ define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) { ; V512-NEXT: vwmaccu.vx v9, a0, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v4i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; ZIP-NEXT: vslidedown.vi v10, v8, 2 +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v9, v8, v10 +; ZIP-NEXT: vmv.v.v v8, v9 +; ZIP-NEXT: ret %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> ret <4 x i32> %a } @@ -590,6 +846,15 @@ define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) { ; RV64-V512-NEXT: vrgather.vv v9, v8, v10 ; RV64-V512-NEXT: vmv.v.v v8, v9 ; RV64-V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v4i64: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; ZIP-NEXT: vslidedown.vi v12, v8, 2 +; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32> ret <4 x i64> %a } @@ -615,6 +880,15 @@ define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) { ; V512-NEXT: vwmaccu.vx v9, a0, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v8i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e8, mf2, ta, ma +; ZIP-NEXT: vslidedown.vi v10, v8, 4 +; ZIP-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v9, v8, v10 +; ZIP-NEXT: vmv1r.v v8, v9 +; ZIP-NEXT: ret %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> ret <8 x i8> %a } @@ -640,6 +914,15 @@ define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) { ; V512-NEXT: vwmaccu.vx v9, a0, v8 ; 
V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v8i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e16, m1, ta, ma +; ZIP-NEXT: vslidedown.vi v10, v8, 4 +; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v9, v10, v8 +; ZIP-NEXT: vmv.v.v v8, v9 +; ZIP-NEXT: ret %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> ret <8 x i16> %a } @@ -665,6 +948,15 @@ define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) { ; V512-NEXT: vwmaccu.vx v9, a0, v10 ; V512-NEXT: vmv1r.v v8, v9 ; V512-NEXT: ret +; +; ZIP-LABEL: unary_interleave_v8i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; ZIP-NEXT: vslidedown.vi v12, v8, 4 +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> ret <8 x i32> %a } @@ -679,6 +971,14 @@ define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) { ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret +; +; ZIP-LABEL: unary_interleave_10uu_v4i8: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZIP-NEXT: vsrl.vi v9, v8, 8 +; ZIP-NEXT: vsll.vi v8, v8, 8 +; ZIP-NEXT: vor.vv v8, v8, v9 +; ZIP-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> ret <4 x i8> %a } @@ -702,6 +1002,14 @@ define <16 x i16> @interleave_slp(<8 x i16> %v0, <8 x i16> %v1) { ; V512-NEXT: vwmaccu.vx v10, a0, v9 ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret +; +; ZIP-LABEL: interleave_slp: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZIP-NEXT: vmv1r.v v12, v9 +; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret entry: %v2 = shufflevector <8 x i16> %v0, <8 x i16> poison, <16 x i32> %v3 = shufflevector <8 x i16> %v1, <8 x i16> poison, <16 x i32> @@ -711,4 +1019,6 @@ entry: ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; RV32-V128: {{.*}} +; RV32-ZIP: {{.*}} ; RV64-V128: {{.*}} +; RV64-ZIP: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-zipeven-zipodd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-zipeven-zipodd.ll index c97f11301a05a..0a442940366e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-zipeven-zipodd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-zipeven-zipodd.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+experimental-xrivosvizip -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZIP,ZIP-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+experimental-xrivosvizip -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZIP,ZIP-RV64 define <4 x i32> @zipeven_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: zipeven_v4i32: @@ -9,6 +11,13 @@ define <4 x i32> @zipeven_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vmv.v.i v0, 10 ; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4i32: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v10, v8, v9 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %c @@ -22,6 +31,13 @@ define <4 x i32> @zipeven_v4i32_swapped(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vslideup.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4i32_swapped: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v10, v9, v8 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %c @@ -35,6 +51,13 @@ define <4 x i64> @zipeven_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; CHECK-NEXT: vslideup.vi v8, v10, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4i64: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v12, v8, v10 +; ZIP-NEXT: vmv.v.v v8, v12 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %c @@ -47,6 +70,13 @@ define <4 x half> @zipeven_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-NEXT: vmv.v.i v0, 10 ; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4f16: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v10, v8, v9 +; ZIP-NEXT: vmv1r.v v8, v10 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> ret <4 x half> %c @@ -59,6 +89,13 @@ define <4 x float> @zipeven_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-NEXT: vmv.v.i v0, 10 ; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4f32: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v10, v8, v9 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %c @@ -72,6 +109,13 @@ define <4 x double> @zipeven_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-NEXT: vsetivli zero, 4, 
e64, m2, ta, mu ; CHECK-NEXT: vslideup.vi v8, v10, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4f64: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v12, v8, v10 +; ZIP-NEXT: vmv.v.v v8, v12 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %c @@ -86,6 +130,13 @@ define <4 x i32> @zipodd_v4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipodd_v4i32: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipodd.vv v10, v8, v9 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %c @@ -98,6 +149,13 @@ define <4 x i32> @zipodd_v4i32_swapped(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vslidedown.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipodd_v4i32_swapped: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipodd.vv v10, v9, v8 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %c @@ -110,6 +168,10 @@ define <4 x i32> @zipeven_v4i32_single(<4 x i32> %a) { ; CHECK-LABEL: zipeven_v4i32_single: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4i32_single: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> ret <4 x i32> %c @@ -124,6 +186,12 @@ define <4 x i32> @zipodd_v4i32_single(<4 x i32> %a) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipodd_v4i32_single: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: vslidedown.vi v8, v8, 1 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> ret <4 x i32> %c @@ -136,6 +204,13 @@ define <4 x i32> @zipodd_v4i32_both(<4 x i32> %a) { ; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vslidedown.vi v8, v8, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipodd_v4i32_both: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipodd.vv v9, v8, v8 +; ZIP-NEXT: vmv.v.v v8, v9 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> ret <4 x i32> %c @@ -150,6 +225,13 @@ define <4 x i32> @zipeven_v4i32_both(<4 x i32> %a) { ; CHECK-NEXT: vslideup.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4i32_both: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v9, v8, v8 +; ZIP-NEXT: vmv.v.v v8, v9 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> ret <4 x i32> %c @@ -161,6 +243,12 @@ define <4 x i32> @zipeven_v4i32_partial(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma ; CHECK-NEXT: vslideup.vi v8, v9, 1 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v4i32_partial: +; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; ZIP-NEXT: vslideup.vi v8, v9, 1 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %c @@ -174,6 +262,13 @@ define <4 x i32> @zipodd_v4i32_partial(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: vslidedown.vi v9, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipodd_v4i32_partial: 
+; ZIP: # %bb.0: # %entry +; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZIP-NEXT: ri.vzipodd.vv v10, v8, v9 +; ZIP-NEXT: vmv.v.v v8, v10 +; ZIP-NEXT: ret entry: %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %c @@ -187,6 +282,13 @@ define <8 x i32> @zipeven_v8i32(<8 x i32> %v1, <8 x i32> %v2) { ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vslideup.vi v8, v10, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v8i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v12, v8, v10 +; ZIP-NEXT: vmv.v.v v8, v12 +; ZIP-NEXT: ret %out = shufflevector <8 x i32> %v1, <8 x i32> %v2, <8 x i32> ret <8 x i32> %out } @@ -200,6 +302,13 @@ define <8 x i32> @zipodd_v8i32(<8 x i32> %v1, <8 x i32> %v2) { ; CHECK-NEXT: vslidedown.vi v10, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipodd_v8i32: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; ZIP-NEXT: ri.vzipodd.vv v12, v8, v10 +; ZIP-NEXT: vmv.v.v v8, v12 +; ZIP-NEXT: ret %out = shufflevector <8 x i32> %v1, <8 x i32> %v2, <8 x i32> ret <8 x i32> %out } @@ -213,6 +322,13 @@ define <16 x i64> @zipeven_v16i64(<16 x i64> %v1, <16 x i64> %v2) { ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vslideup.vi v8, v16, 1, v0.t ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipeven_v16i64: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; ZIP-NEXT: ri.vzipeven.vv v24, v8, v16 +; ZIP-NEXT: vmv.v.v v8, v24 +; ZIP-NEXT: ret %out = shufflevector <16 x i64> %v1, <16 x i64> %v2, <16 x i32> ret <16 x i64> %out } @@ -227,9 +343,18 @@ define <16 x i64> @zipodd_v16i64(<16 x i64> %v1, <16 x i64> %v2) { ; CHECK-NEXT: vslidedown.vi v16, v8, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret +; +; ZIP-LABEL: zipodd_v16i64: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; ZIP-NEXT: ri.vzipodd.vv v24, v8, v16 +; ZIP-NEXT: vmv.v.v v8, v24 +; ZIP-NEXT: ret %out = shufflevector <16 x i64> %v1, <16 x i64> %v2, <16 x i32> ret <16 x i64> %out } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV32: {{.*}} ; RV64: {{.*}} +; ZIP-RV32: {{.*}} +; ZIP-RV64: {{.*}} From e8059467ef0041f6735b23ef680eb9070e308e33 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Sat, 29 Mar 2025 15:46:56 -0700 Subject: [PATCH 0016/1029] [RISCV] Fix -Wsign-compare warning from f8ee58a lib/Target/RISCV/RISCVISelLowering.cpp:4629:26: error: comparison of integers of different signs: 'unsigned int' and 'int' [-Werror,-Wsign-compare] 4629 | for (unsigned i = 0; i != NumElts; ++i) { | ~ ^ ~~~~~~~ 1 error generated. 
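For context, a minimal standalone reproduction of this warning class (the
variable names mirror the loop being fixed; the snippet is illustrative and
not taken from the tree):

  // Built with -Wsign-compare (promoted to an error by -Werror above), the
  // first loop warns because `i` is unsigned while `NumElts` is a signed
  // int, so the comparison silently converts `NumElts` to unsigned. Keeping
  // both operands signed, as this patch does, avoids the conversion.
  void demo(int NumElts) {
    for (unsigned i = 0; i != NumElts; ++i) { /* warns */ }
    for (int i = 0; i != NumElts; ++i) { /* fixed */ }
  }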
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a8c83113854c9..70ec57798db71 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4626,7 +4626,7 @@ static bool isElementRotate(const std::array, 2> &SrcInfo, static bool isAlternating(const std::array, 2> &SrcInfo, ArrayRef Mask, bool RequiredPolarity) { int NumElts = Mask.size(); - for (unsigned i = 0; i != NumElts; ++i) { + for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; if (M < 0) continue; From 0c7be9392f1b8ab81be28d0ff1ebb374553ad70f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 16:52:16 -0700 Subject: [PATCH 0017/1029] [BOLT] Use *Set::insert_range (NFC) (#133601) --- bolt/include/bolt/Passes/DominatorAnalysis.h | 2 +- bolt/lib/Core/BinaryFunction.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/include/bolt/Passes/DominatorAnalysis.h b/bolt/include/bolt/Passes/DominatorAnalysis.h index 3f3afa943c06c..8eb470693bf10 100644 --- a/bolt/include/bolt/Passes/DominatorAnalysis.h +++ b/bolt/include/bolt/Passes/DominatorAnalysis.h @@ -54,7 +54,7 @@ class DominatorAnalysis HasNonDominatedPred = true; }); if (HasDominatedPred && HasNonDominatedPred) - Result.insert(Candidates.begin(), Candidates.end()); + Result.insert_range(Candidates); if ((*this->getStateAt(ProgramPoint::getLastPointAt(BB)))[DomIdx] && BB.succ_begin() == BB.succ_end()) Result.insert(ProgramPoint::getLastPointAt(BB)); diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 5ee33f52ec88a..09006249887f6 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1997,7 +1997,7 @@ void BinaryFunction::postProcessJumpTables() { bool BinaryFunction::validateExternallyReferencedOffsets() { SmallPtrSet JTTargets; for (const JumpTable *JT : llvm::make_second_range(JumpTables)) - JTTargets.insert(JT->Entries.begin(), JT->Entries.end()); + JTTargets.insert_range(JT->Entries); bool HasUnclaimedReference = false; for (uint64_t Destination : ExternallyReferencedOffsets) { From 8f5c3deadd79a19c8585e014581288e92462a97c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 16:52:36 -0700 Subject: [PATCH 0018/1029] [Analysis] Use llvm::append_range (NFC) (#133602) --- llvm/lib/Analysis/IRSimilarityIdentifier.cpp | 3 +-- llvm/lib/Analysis/IVDescriptors.cpp | 7 ++----- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 3 +-- llvm/lib/Analysis/ScalarEvolution.cpp | 9 +++------ 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index ca011362702ac..a6af7304b1c7e 100644 --- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -78,8 +78,7 @@ void IRInstructionData::initializeInstruction() { // We capture the incoming BasicBlocks as values as well as the incoming // Values in order to check for structural similarity. 
if (PHINode *PN = dyn_cast(Inst)) - for (BasicBlock *BB : PN->blocks()) - OperVals.push_back(BB); + llvm::append_range(OperVals, PN->blocks()); } IRInstructionData::IRInstructionData(IRInstructionDataList &IDList) diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 45b5b2979a562..94c347b01bbfb 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -1296,11 +1296,8 @@ InductionDescriptor::InductionDescriptor(Value *Start, InductionKind K, InductionBinOp->getOpcode() == Instruction::FSub))) && "Binary opcode should be specified for FP induction"); - if (Casts) { - for (auto &Inst : *Casts) { - RedundantCasts.push_back(Inst); - } - } + if (Casts) + llvm::append_range(RedundantCasts, *Casts); } ConstantInt *InductionDescriptor::getConstIntStepValue() const { diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 57a76bc7a81e5..7f1b5dc3890a9 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -913,8 +913,7 @@ static void visitPointers(Value *StartPtr, const Loop &InnermostLoop, // value. if (PN && InnermostLoop.contains(PN->getParent()) && PN->getParent() != InnermostLoop.getHeader()) { - for (const Use &Inc : PN->incoming_values()) - WorkList.push_back(Inc); + llvm::append_range(WorkList, PN->incoming_values()); } else AddPointer(Ptr); } diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 600a061d4435e..361206719287a 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -4221,8 +4221,7 @@ bool ScalarEvolution::canReuseInstruction( if (I->hasPoisonGeneratingAnnotations()) DropPoisonGeneratingInsts.push_back(I); - for (Value *Op : I->operands()) - Worklist.push_back(Op); + llvm::append_range(Worklist, I->operands()); } return true; } @@ -7622,8 +7621,7 @@ ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl &Ops) { case Instruction::GetElementPtr: assert(cast(U)->getSourceElementType()->isSized() && "GEP source element type must be sized"); - for (Value *Index : U->operands()) - Ops.push_back(Index); + llvm::append_range(Ops, U->operands()); return nullptr; case Instruction::IntToPtr: @@ -7656,8 +7654,7 @@ ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl &Ops) { if (CanSimplifyToUnknown()) return getUnknown(U); - for (Value *Inc : U->operands()) - Ops.push_back(Inc); + llvm::append_range(Ops, U->operands()); return nullptr; break; } From e3a3f78f35e091f005197bdd8f0464684546b5a0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 16:53:02 -0700 Subject: [PATCH 0019/1029] [CodeGen] Use llvm::append_range (NFC) (#133603) --- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 6 ++---- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 3 +-- llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 4 +--- llvm/lib/CodeGen/MachineSink.cpp | 3 +-- llvm/lib/CodeGen/RegAllocGreedy.cpp | 3 +-- llvm/lib/CodeGen/SelectOptimize.cpp | 3 +-- llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 3 +-- llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h | 3 +-- llvm/lib/CodeGen/WindowScheduler.cpp | 3 +-- 9 files changed, 10 insertions(+), 21 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index f8afb42bf5535..85a6d67609798 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1477,8 +1477,7 @@ 
static uint64_t getOffsetFromIndices(const User &U, const DataLayout &DL) { for (auto Idx : IVI->indices()) Indices.push_back(ConstantInt::get(Int32Ty, Idx)); } else { - for (Value *Op : drop_begin(U.operands())) - Indices.push_back(Op); + llvm::append_range(Indices, drop_begin(U.operands())); } return 8 * static_cast( @@ -2212,8 +2211,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::fake_use: { SmallVector VRegs; for (const auto &Arg : CI.args()) - for (auto VReg : getOrCreateVRegs(*Arg)) - VRegs.push_back(VReg); + llvm::append_range(VRegs, getOrCreateVRegs(*Arg)); MIRBuilder.buildInstr(TargetOpcode::FAKE_USE, {}, VRegs); MF->setHasFakeUses(true); return true; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index c28f3c5518301..ac68eb55a6fd5 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -4906,8 +4906,7 @@ LegalizerHelper::fewerElementsVectorMultiEltType( SmallVector SplitPieces; extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder, MRI); - for (auto Reg : SplitPieces) - InputOpsPieces[UseNo].push_back(Reg); + llvm::append_range(InputOpsPieces[UseNo], SplitPieces); } } diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index 9f11ccf21bd1f..c70c638dc016c 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -4218,9 +4218,7 @@ std::optional InstrRefBasedLDV::resolveDbgPHIsImpl( } // Sort PHIs to validate into RPO-order. - SmallVector SortedPHIs; - for (auto &PHI : CreatedPHIs) - SortedPHIs.push_back(PHI); + SmallVector SortedPHIs(CreatedPHIs); llvm::sort(SortedPHIs, [&](LDVSSAPhi *A, LDVSSAPhi *B) { return BBToOrder[&A->getParent()->BB] < BBToOrder[&B->getParent()->BB]; diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 173193bb6266c..aa2987b6710a3 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -2326,8 +2326,7 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB, for (MCRegUnit Unit : TRI->regunits(MO.getReg())) { for (const auto &MIRegs : SeenDbgInstrs.lookup(Unit)) { auto &Regs = DbgValsToSinkMap[MIRegs.first]; - for (Register Reg : MIRegs.second) - Regs.push_back(Reg); + llvm::append_range(Regs, MIRegs.second); } } } diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index ac1e9fe1ca589..a5cd9fc7a5360 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -2195,8 +2195,7 @@ MCRegister RAGreedy::tryLastChanceRecoloring( if (tryRecoloringCandidates(RecoloringQueue, CurrentNewVRegs, FixedRegisters, RecolorStack, Depth)) { // Push the queued vregs into the main queue. - for (Register NewVReg : CurrentNewVRegs) - NewVRegs.push_back(NewVReg); + llvm::append_range(NewVRegs, CurrentNewVRegs); // Do not mess up with the global assignment process. // I.e., VirtReg must be unassigned. 
if (VRM->hasPhys(ThisVirtReg)) { diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index b35f765c76489..00148b075134a 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -451,8 +451,7 @@ void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F, SmallVector Loops(LI->begin(), LI->end()); // Need to check size on each iteration as we accumulate child loops. for (unsigned long i = 0; i < Loops.size(); ++i) - for (Loop *ChildL : Loops[i]->getSubLoops()) - Loops.push_back(ChildL); + llvm::append_range(Loops, Loops[i]->getSubLoops()); for (Loop *L : Loops) { if (!L->isInnermost()) diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 5182e4124f548..4b7a9127b3fc3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -1195,8 +1195,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, // Add rounding control registers as implicit def for function call. if (II.isCall() && MF->getFunction().hasFnAttribute(Attribute::StrictFP)) { ArrayRef RCRegs = TLI->getRoundingControlRegisters(); - for (MCPhysReg Reg : RCRegs) - UsedRegs.push_back(Reg); + llvm::append_range(UsedRegs, RCRegs); } // Finally mark unused registers as dead. diff --git a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h index 4c6b3a5be416d..17086876ad537 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h +++ b/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h @@ -197,8 +197,7 @@ class SDDbgValue { for (const SDDbgOperand &DbgOp : getLocationOps()) if (DbgOp.getKind() == SDDbgOperand::SDNODE) Dependencies.push_back(DbgOp.getSDNode()); - for (SDNode *Node : getAdditionalDependencies()) - Dependencies.push_back(Node); + llvm::append_range(Dependencies, getAdditionalDependencies()); return Dependencies; } diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp index 78af6314e7b2d..95c86a9ac2668 100644 --- a/llvm/lib/CodeGen/WindowScheduler.cpp +++ b/llvm/lib/CodeGen/WindowScheduler.cpp @@ -283,8 +283,7 @@ void WindowScheduler::restoreMBB() { MI.eraseFromParent(); } // Restore MBB to the state before window scheduling. - for (auto *MI : OriMIs) - MBB->push_back(MI); + llvm::append_range(*MBB, OriMIs); updateLiveIntervals(); } From f7228f38b72d6bf043aa4a412ba6ebb1d5971c53 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 29 Mar 2025 18:19:18 -0700 Subject: [PATCH 0020/1029] MCValue: Simplify code with getSubSym The MCValue::SymB MCSymbolRefExpr member might be replaced with a MCSymbol in the future. Reduce direct access. 
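The per-call-site shape of this cleanup, condensed from the diffs below (a
sketch only; `use` is a placeholder, not an API):

  // Before: reach through the MCSymbolRefExpr to name the subtrahend symbol.
  if (const MCSymbolRefExpr *B = Target.getSymB()) {
    const MCSymbol *SB = &B->getSymbol();
    use(*SB);
  }
  // After: ask the MCValue for the symbol directly, so SymB's representation
  // can later switch from MCSymbolRefExpr to MCSymbol without more churn.
  if (const MCSymbol *SB = Target.getSubSym())
    use(*SB);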
--- .../Target/AArch64/AsmParser/AArch64AsmParser.cpp | 2 +- .../Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 13 ++++++------- .../LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 2 +- llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp | 5 ++--- llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 6 +++--- .../lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp | 2 +- 6 files changed, 14 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 5be4bd9ec6b26..28b4cbb5efed8 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -8214,7 +8214,7 @@ bool AArch64AsmParser::classifySymbolRef( // Check that it looks like a symbol + an addend MCValue Res; bool Relocatable = Expr->evaluateAsRelocatable(Res, nullptr); - if (!Relocatable || Res.getSymB()) + if (!Relocatable || Res.getSubSym()) return false; // Treat expressions with an ELFSpec (like ":abs_g1:3", or diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 5b9fd9a29156a..e7348326a69cf 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -163,19 +163,18 @@ void ARMMachObjectWriter::recordARMScatteredHalfRelocation( uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); FixedValue += SecAddr; - if (const MCSymbolRefExpr *B = Target.getSymB()) { - const MCSymbol *SB = &B->getSymbol(); - + if (const MCSymbol *SB = Target.getSubSym()) { if (!SB->getFragment()) { - Asm.getContext().reportError(Fixup.getLoc(), - "symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression"); + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + SB->getName() + + "' can not be undefined in a subtraction expression"); return; } // Select the appropriate difference relocation type. Type = MachO::ARM_RELOC_HALF_SECTDIFF; - Value2 = Writer->getSymbolAddress(B->getSymbol(), Asm); + Value2 = Writer->getSymbolAddress(*SB, Asm); FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent()); } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 6c27064614fed..260b0d0e31761 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -455,7 +455,7 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, std::pair FK; uint64_t FixedValueA, FixedValueB; const MCSymbol &SA = Target.getSymA()->getSymbol(); - const MCSymbol &SB = Target.getSymB()->getSymbol(); + const MCSymbol &SB = *Target.getSubSym(); bool force = !SA.isInSection() || !SB.isInSection(); if (!force) { diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp index d8019f5eb785f..03a174311a6ec 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCExpr.cpp @@ -25,9 +25,8 @@ bool M68kMCExpr::evaluateAsRelocatableImpl(MCValue &Res, if (!getSubExpr()->evaluateAsRelocatable(Res, Asm)) return false; - Res = - MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), specifier); - return Res.getSymB() ? 
specifier == VK_None : true; + Res.setSpecifier(specifier); + return !Res.getSubSym(); } void M68kMCExpr::visitUsedExpr(MCStreamer &S) const { S.visitUsedExpr(*Expr); } diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 640ae52d05dd1..8c6fe0b77d234 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -585,7 +585,7 @@ class MipsAsmParser : public MCTargetAsmParser { MCValue Res; if (!JalExpr->evaluateAsRelocatable(Res, nullptr)) return false; - if (Res.getSymB() != nullptr) + if (Res.getSubSym()) return false; if (Res.getConstant() != 0) return ABI.IsN32() || ABI.IsN64(); @@ -2938,7 +2938,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, Error(IDLoc, "expected relocatable expression"); return true; } - if (Res.getSymB() != nullptr) { + if (Res.getSubSym()) { Error(IDLoc, "expected relocatable expression with only one symbol"); return true; } @@ -3768,7 +3768,7 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, Error(IDLoc, "expected relocatable expression"); return; } - if (Res.getSymB() != nullptr) { + if (Res.getSubSym()) { Error(IDLoc, "expected relocatable expression with only one symbol"); return; } diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp index 107e0714b026e..4c4035b32af3e 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCExpr.cpp @@ -45,7 +45,7 @@ bool XtensaMCExpr::evaluateAsRelocatableImpl(MCValue &Res, if (!getSubExpr()->evaluateAsRelocatable(Res, Asm)) return false; Res.setSpecifier(specifier); - return !Res.getSymB(); + return !Res.getSubSym(); } void XtensaMCExpr::visitUsedExpr(MCStreamer &Streamer) const { From 02af13aacb477d1f0a89375646f09b638055e3b9 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 29 Mar 2025 18:20:58 -0700 Subject: [PATCH 0021/1029] [RISCV] Simplify evaluateAsRelocatableImpl. NFC RISCVMCExpr is not created with VK_None. --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index 41408c156b8a5..73c24aa8a60bb 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -94,7 +94,7 @@ bool RISCVMCExpr::evaluateAsRelocatableImpl(MCValue &Res, Res.setSpecifier(specifier); // Custom fixup types are not valid with symbol difference expressions. - return Res.getSymB() ? 
getSpecifier() == VK_None : true; + return !Res.getSubSym(); } void RISCVMCExpr::visitUsedExpr(MCStreamer &Streamer) const { From ad1ba15ea894ac47b0f2447db191a14ebe1b301d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 18:47:47 -0700 Subject: [PATCH 0022/1029] [Target] Use llvm::append_range (NFC) (#133606) --- .../AArch64/GISel/AArch64PreLegalizerCombiner.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 4 +--- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 3 +-- .../Target/ARM/MVETPAndVPTOptimisationsPass.cpp | 3 +-- llvm/lib/Target/DirectX/DXILDataScalarization.cpp | 4 +--- llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 3 +-- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 14 ++++---------- llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 6 ++---- llvm/lib/Target/X86/X86CmovConversion.cpp | 3 +-- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 4 +--- llvm/lib/Target/X86/X86WinEHState.cpp | 3 +-- 11 files changed, 15 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 6496d56d74b2c..2c559d4beb5d1 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -482,9 +482,7 @@ void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI, // the values inside a small vec extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters, LeftoverRegs, B, MRI); - for (unsigned I = 0; I < LeftoverRegs.size(); I++) { - WorkingRegisters.push_back(LeftoverRegs[I]); - } + llvm::append_range(WorkingRegisters, LeftoverRegs); } else { WorkingRegisters.push_back(SrcReg); MainTy = SrcTy; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index f9facfa461748..70274a8101f89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -1265,9 +1265,7 @@ bool AMDGPUSwLowerLDS::run() { for (Instruction *Inst : AsanInfo.Instructions) { SmallVector InterestingOperands; getInterestingMemoryOperands(M, Inst, InterestingOperands); - for (auto &Operand : InterestingOperands) { - OperandsToInstrument.push_back(Operand); - } + llvm::append_range(OperandsToInstrument, InterestingOperands); } for (auto &Operand : OperandsToInstrument) { Value *Addr = Operand.getPtr(); diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 71fe990e5ab7c..6843ec895e69c 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6907,8 +6907,7 @@ bool ARMPipelinerLoopInfo::tooMuchRegisterPressure(SwingSchedulerDAG &SSD, SMS.getInstructions(Cycle + Stage * SMS.getInitiationInterval()); std::sort(Instrs.begin(), Instrs.end(), [](SUnit *A, SUnit *B) { return A->NodeNum > B->NodeNum; }); - for (SUnit *SU : Instrs) - ProposedSchedule.push_back(SU); + llvm::append_range(ProposedSchedule, Instrs); } // Learn whether the last use/def of each cross-iteration register is a use or diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 12fd8c7924565..18d5c232378a7 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -303,8 +303,7 @@ MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors( } Visited.insert(MBB); - for (auto *Pred : MBB->predecessors()) - 
Worklist.push_back(Pred); + llvm::append_range(Worklist, MBB->predecessors()); } return LoopStart; } diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index a0dd17904f6fa..1f2700ac55647 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -144,9 +144,7 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { return false; IRBuilder<> Builder(&GEPI); - SmallVector Indices; - for (auto &Index : GEPI.indices()) - Indices.push_back(Index); + SmallVector Indices(GEPI.indices()); Value *NewGEP = Builder.CreateGEP(NewGlobal->getValueType(), NewGlobal, Indices, diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index dd054846f03c8..02b0282bbddad 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -1062,8 +1062,7 @@ static SmallVector getInputSegmentList(ShuffleMask SM, Segs.set(M >> Shift); } - for (unsigned B : Segs.set_bits()) - SegList.push_back(B); + llvm::append_range(SegList, Segs.set_bits()); return SegList; } diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 939d9e920d05b..900bb1a8a46d2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1241,8 +1241,7 @@ void SPIRVEmitIntrinsics::preprocessCompositeConstants(IRBuilder<> &B) { for (unsigned i = 0; i < COp->getNumElements(); ++i) Args.push_back(COp->getElementAsConstant(i)); else - for (auto &COp : AggrConst->operands()) - Args.push_back(COp); + llvm::append_range(Args, AggrConst->operands()); if (!BPrepared) { IsPhi ? 
B.SetInsertPointPastAllocas(I->getParent()->getParent()) : B.SetInsertPoint(I); @@ -1387,8 +1386,7 @@ Instruction *SPIRVEmitIntrinsics::visitGetElementPtrInst(GetElementPtrInst &I) { SmallVector Types = {I.getType(), I.getOperand(0)->getType()}; SmallVector Args; Args.push_back(B.getInt1(I.isInBounds())); - for (auto &Op : I.operands()) - Args.push_back(Op); + llvm::append_range(Args, I.operands()); auto *NewI = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args}); replaceAllUsesWithAndErase(B, &I, NewI); return NewI; @@ -1716,9 +1714,7 @@ Instruction *SPIRVEmitIntrinsics::visitExtractValueInst(ExtractValueInst &I) { return &I; IRBuilder<> B(I.getParent()); B.SetInsertPoint(&I); - SmallVector Args; - for (auto &Op : I.operands()) - Args.push_back(Op); + SmallVector Args(I.operands()); for (auto &Op : I.indices()) Args.push_back(B.getInt32(Op)); auto *NewI = @@ -1794,9 +1790,7 @@ Instruction *SPIRVEmitIntrinsics::visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { assert(I.getType()->isAggregateType() && "Aggregate result is expected"); IRBuilder<> B(I.getParent()); B.SetInsertPoint(&I); - SmallVector Args; - for (auto &Op : I.operands()) - Args.push_back(Op); + SmallVector Args(I.operands()); Args.push_back(B.getInt32( static_cast(getMemScope(I.getContext(), I.getSyncScopeID())))); Args.push_back(B.getInt32( diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 6bef6b7e9b16e..60b67a4f5ec5e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -689,8 +689,7 @@ bool sortBlocks(Function &F) { Order.reserve(F.size()); ReversePostOrderTraversal RPOT(&F); - for (BasicBlock *BB : RPOT) - Order.push_back(BB); + llvm::append_range(Order, RPOT); assert(&*F.begin() == Order[0]); BasicBlock *LastBlock = &*F.begin(); @@ -785,8 +784,7 @@ CallInst *buildIntrWithMD(Intrinsic::ID IntrID, ArrayRef Types, SmallVector Args; Args.push_back(Arg2); Args.push_back(buildMD(Arg)); - for (auto *Imm : Imms) - Args.push_back(Imm); + llvm::append_range(Args, Imms); return B.CreateIntrinsic(IntrID, {Types}, Args); } diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp index d639ca56b77d6..488b3126b8609 100644 --- a/llvm/lib/Target/X86/X86CmovConversion.cpp +++ b/llvm/lib/Target/X86/X86CmovConversion.cpp @@ -240,8 +240,7 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { // Note that we need to check size on each iteration as we accumulate child // loops. for (int i = 0; i < (int)Loops.size(); ++i) - for (MachineLoop *Child : Loops[i]->getSubLoops()) - Loops.push_back(Child); + llvm::append_range(Loops, Loops[i]->getSubLoops()); for (MachineLoop *CurrLoop : Loops) { // Optimize only innermost loops. diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index efab93d61c7c5..1eb47e3b2cd18 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -829,10 +829,8 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. 
- SmallVector Indices; auto Mask = SVI->getShuffleMask(); - for (unsigned i = 0; i < Factor; i++) - Indices.push_back(Mask[i]); + SmallVector Indices(Mask.take_front(Factor)); ArrayRef Shuffles = ArrayRef(SVI); diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 7d6d3f8d21f25..1bcbc7d6e6703 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -721,8 +721,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // enqueue it's successors to see if we can infer their states. InitialStates.insert({BB, PredState}); FinalStates.insert({BB, PredState}); - for (BasicBlock *SuccBB : successors(BB)) - Worklist.push_back(SuccBB); + llvm::append_range(Worklist, successors(BB)); } // Try to hoist stores from successors. From d8b078d5509d84da987c18cb357aac867051881c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 18:57:50 -0700 Subject: [PATCH 0023/1029] [Transforms] Use llvm::append_range (NFC) (#133607) --- .../Transforms/AggressiveInstCombine/TruncInstCombine.cpp | 3 +-- llvm/lib/Transforms/IPO/SampleProfile.cpp | 3 +-- llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 3 +-- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 8 +++----- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++---- 5 files changed, 9 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp index 4d9050be5c553..1cef43b8ee5a4 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp @@ -77,8 +77,7 @@ static void getRelevantOperands(Instruction *I, SmallVectorImpl &Ops) { Ops.push_back(I->getOperand(2)); break; case Instruction::PHI: - for (Value *V : cast(I)->incoming_values()) - Ops.push_back(V); + llvm::append_range(Ops, cast(I)->incoming_values()); break; default: llvm_unreachable("Unreachable!"); diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 8b1140e8eabcd..d89da7621990a 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1267,8 +1267,7 @@ bool SampleProfileLoader::tryInlineCandidate( // Now populate the list of newly exposed call sites. if (InlinedCallSites) { InlinedCallSites->clear(); - for (auto &I : IFI.InlinedCallSites) - InlinedCallSites->push_back(I); + llvm::append_range(*InlinedCallSites, IFI.InlinedCallSites); } if (FunctionSamples::ProfileIsCS) diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index c99238a2d5ff4..82434680b8f23 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -164,8 +164,7 @@ class DFAJumpThreading { unfold(&DTU, LI, SIToUnfold, &NewSIsToUnfold, &NewBBs); // Put newly discovered select instructions into the work list. 
- for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold) - Stack.push_back(NewSIToUnfold); + llvm::append_range(Stack, NewSIsToUnfold); } } diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index caba873408335..00c4fcc76e791 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -737,12 +737,10 @@ void StructurizeCFG::findUndefBlocks( if (!VisitedBlock.insert(Current).second) continue; - if (FlowSet.contains(Current)) { - for (auto P : predecessors(Current)) - Stack.push_back(P); - } else if (!Incomings.contains(Current)) { + if (FlowSet.contains(Current)) + llvm::append_range(Stack, predecessors(Current)); + else if (!Incomings.contains(Current)) UndefBlks.push_back(Current); - } } } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2be3748799db..4b4a56be19fe5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6929,10 +6929,9 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { DeadInterleavePointerOps.append(Op->op_begin(), Op->op_end()); } - for (const auto &[_, Ops] : DeadInvariantStoreOps) { - for (Value *Op : ArrayRef(Ops).drop_back()) - DeadOps.push_back(Op); - } + for (const auto &[_, Ops] : DeadInvariantStoreOps) + llvm::append_range(DeadOps, ArrayRef(Ops).drop_back()); + // Mark ops that would be trivially dead and are only used by ignored // instructions as free. BasicBlock *Header = TheLoop->getHeader(); From db603a09dabefc6847423c8968578ce6d54a7a2d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 29 Mar 2025 19:08:07 -0700 Subject: [PATCH 0024/1029] [MC] Move ELF-specific handleAddSubRelocations to ELFObjectWriter::recordRelocation --- llvm/lib/MC/ELFObjectWriter.cpp | 6 ++++++ llvm/lib/MC/MCAssembler.cpp | 7 ------- .../Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 2 ++ llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 2 ++ 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 22af6d9e9ad0a..f3445daf73ac1 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -1377,6 +1377,12 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, const MCTargetOptions *TO = Ctx.getTargetOptions(); if (auto *RefB = Target.getSubSym()) { + // When there is no relocation specifier, a linker relaxation target may + // emit ADD/SUB relocations for A-B+C. + if (Target.getSymA() && Backend.handleAddSubRelocations( + Asm, *Fragment, Fixup, Target, FixedValue)) + return; + const auto &SymB = cast(*RefB); if (SymB.isUndefined()) { Ctx.reportError(Fixup.getLoc(), diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 1c79af412a4d7..835fa8af4cf8f 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -234,13 +234,6 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, } } - // A linker relaxation target may emit ADD/SUB relocations for A-B+C. Let - // recordRelocation handle non-VK_None cases like A@plt-B+C. 
- if (!IsResolved && Target.getSymA() && Target.getSubSym() && - Target.getRefKind() == 0 && - getBackend().handleAddSubRelocations(*this, *DF, Fixup, Target, Value)) - return true; - return IsResolved; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 260b0d0e31761..c83a18746e060 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -452,6 +452,8 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, uint64_t &FixedValue) const { + assert(Target.getRefKind() == 0 && + "relocatable SymA-SymB cannot have relocation specifier"); std::pair FK; uint64_t FixedValueA, FixedValueB; const MCSymbol &SA = Target.getSymA()->getSymbol(); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 0cedbd9b8eb8d..b5c66cc1e83f5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -590,6 +590,8 @@ bool RISCVAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, uint64_t &FixedValue) const { + assert(Target.getRefKind() == 0 && + "relocatable SymA-SymB cannot have relocation specifier"); uint64_t FixedValueA, FixedValueB; unsigned TA = 0, TB = 0; switch (Fixup.getKind()) { From d9b3209e8640e69d328b7ead169c5304a80dd1be Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 29 Mar 2025 19:28:59 -0700 Subject: [PATCH 0025/1029] [X86] applyFixup: Remove unneeded Target.isAbsolute() and isResolved Target and isResolved are only used by ARM. --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 3653f5ae03b14..d698c917d4382 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -690,8 +690,7 @@ static unsigned getFixupKindSize(unsigned Kind) { } void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, - MutableArrayRef Data, + const MCValue &, MutableArrayRef Data, uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const { unsigned Kind = Fixup.getKind(); @@ -702,9 +701,8 @@ void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); int64_t SignedValue = static_cast(Value); - if ((Target.isAbsolute() || IsResolved) && - getFixupKindInfo(Fixup.getKind()).Flags & - MCFixupKindInfo::FKF_IsPCRel) { + if (IsResolved && + getFixupKindInfo(Fixup.getKind()).Flags & MCFixupKindInfo::FKF_IsPCRel) { // check that PC relative fixup fits into the fixup size. if (Size > 0 && !isIntN(Size * 8, SignedValue)) Asm.getContext().reportError( From c6d0e0435dba0a02ad1970e89083ad61922f7245 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 29 Mar 2025 19:33:09 -0700 Subject: [PATCH 0026/1029] [M68k] applyFixup: don't reference Target and IsResolved They are workarounds only needed by ARM and might be removed. 
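For readers skimming the hunks below: the change collapses each guarded
LLVM_DEBUG dump into the assertion it was duplicating, and drops the now
unused parameter names. A condensed sketch of the pattern (not the verbatim
hunk):

```
// Before: dump state under LLVM_DEBUG, then assert the same condition.
if (Fixup.getOffset() + Size > Data.size()) {
  LLVM_DEBUG(dbgs() << "Fixup.getOffset(): " << Fixup.getOffset() << '\n');
  assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
}

// After: the assertion message alone carries the context on failure.
assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
```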
--- .../M68k/MCTargetDesc/M68kAsmBackend.cpp | 28 ++++--------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp index 7f766056ab5b7..8eabc05008bd1 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp @@ -52,36 +52,18 @@ class M68kAsmBackend : public MCAsmBackend { .CasesLower("m68020", "m68030", "m68040", true) .Default(false)) {} - - void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, MutableArrayRef Data, - uint64_t Value, bool IsResolved, + void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &, + MutableArrayRef Data, uint64_t Value, bool, const MCSubtargetInfo *STI) const override { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); - if (Fixup.getOffset() + Size > Data.size()) { - LLVM_DEBUG(dbgs() << "Fixup.getOffset(): " << Fixup.getOffset() << '\n'); - LLVM_DEBUG(dbgs() << "Size: " << Size << '\n'); - LLVM_DEBUG(dbgs() << "Data.size(): " << Data.size() << '\n'); - assert(Fixup.getOffset() + Size <= Data.size() && - "Invalid fixup offset!"); - } - + assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); // Check that uppper bits are either all zeros or all ones. // Specifically ignore overflow/underflow as long as the leakage is // limited to the lower bits. This is to remain compatible with // other assemblers. - if (!(isIntN(Size * 8 + 1, static_cast(Value)) || IsResolved)) { - LLVM_DEBUG(dbgs() << "Fixup.getOffset(): " << Fixup.getOffset() << '\n'); - LLVM_DEBUG(dbgs() << "Size: " << Size << '\n'); - LLVM_DEBUG(dbgs() << "Data.size(): " << Data.size() << '\n'); - LLVM_DEBUG(dbgs() << "Value: " << Value << '\n'); - LLVM_DEBUG(dbgs() << "Target: "); - LLVM_DEBUG(Target.print(dbgs())); - LLVM_DEBUG(dbgs() << '\n'); - assert(isIntN(Size * 8 + 1, static_cast(Value)) && - "Value does not fit in the Fixup field"); - } + assert(isIntN(Size * 8 + 1, static_cast(Value)) && + "Value does not fit in the Fixup field"); // Write in Big Endian for (unsigned i = 0; i != Size; ++i) From 721fabcdcdf5063bb0c1dbde3be882417a0e34a6 Mon Sep 17 00:00:00 2001 From: Weiwei Chen Date: Sat, 29 Mar 2025 23:24:58 -0400 Subject: [PATCH 0027/1029] Add unittest. 
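The test drives the split-then-link flow end to end: it parses a module with
four functions, codegens each per-function split down to MC (stopping before
AsmPrint), then hands the collected MachineModuleInfo results to MCLinker and
prints a single assembly buffer. In outline (a sketch of the calls the test
makes, not a complete program):

```
splitPerFunction(std::move(M), OutputLambda, SMCInfo.SymbolLinkageTypes, 0);
// OutputLambda runs mclinker::addPassesToEmitMC on each split and collects
// the per-split MCInfo into SMCInfo.McInfos.
MCLinker Linker(SMCInfos, *TMMCLink, SymbolLinkageTypes);
auto LinkResult = Linker.linkAndPrint("SplitModuleCompilerMCLink",
                                      CodeGenFileType::AssemblyFile, true);
```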
--- llvm/include/llvm/MCLinker/MCLinker.h | 1 - llvm/include/llvm/MCLinker/MCPipeline.h | 2 +- llvm/lib/MCLinker/MCLinker.cpp | 9 +- llvm/lib/MCLinker/MCPipeline.cpp | 2 +- llvm/unittests/CMakeLists.txt | 1 + llvm/unittests/MCLinker/CMakeLists.txt | 24 ++++ llvm/unittests/MCLinker/MCLinkerTest.cpp | 176 +++++++++++++++++++++++ 7 files changed, 208 insertions(+), 7 deletions(-) create mode 100644 llvm/unittests/MCLinker/CMakeLists.txt create mode 100644 llvm/unittests/MCLinker/MCLinkerTest.cpp diff --git a/llvm/include/llvm/MCLinker/MCLinker.h b/llvm/include/llvm/MCLinker/MCLinker.h index 7050f181a97ad..d80137ecd8071 100644 --- a/llvm/include/llvm/MCLinker/MCLinker.h +++ b/llvm/include/llvm/MCLinker/MCLinker.h @@ -39,7 +39,6 @@ namespace llvm { struct MCInfo { MCInfo(std::unique_ptr &&MachineModuleInfo, LLVMModuleAndContext &&ModuleAndContext, - llvm::StringMap &FnNameToFnPtr, std::unique_ptr &&TgtMachine, std::unique_ptr &&McContext, std::optional SplitIdx); diff --git a/llvm/include/llvm/MCLinker/MCPipeline.h b/llvm/include/llvm/MCLinker/MCPipeline.h index 1d78d996ace49..6397a236cba80 100644 --- a/llvm/include/llvm/MCLinker/MCPipeline.h +++ b/llvm/include/llvm/MCLinker/MCPipeline.h @@ -22,7 +22,7 @@ namespace mclinker { /// Build a pipeline that does machine specific codgen but stops before /// AsmPrint. bool addPassesToEmitMC(llvm::TargetMachine &, llvm::legacy::PassManagerBase &, - llvm::raw_pwrite_stream &, bool, + bool, llvm::MachineModuleInfoWrapperPass *, unsigned); /// Build a pipeline that does AsmPrint only. diff --git a/llvm/lib/MCLinker/MCLinker.cpp b/llvm/lib/MCLinker/MCLinker.cpp index 178933e803c0e..df6f7dadb3a62 100644 --- a/llvm/lib/MCLinker/MCLinker.cpp +++ b/llvm/lib/MCLinker/MCLinker.cpp @@ -30,21 +30,22 @@ using namespace llvm; //============================================================================== MCInfo::MCInfo(std::unique_ptr &&MachineModuleInfo, - LLVMModuleAndContext &&ModuleAndContext, - llvm::StringMap &FnNameToFnPtr, + LLVMModuleAndContext &&MAndContext, std::unique_ptr &&TgtMachine, std::unique_ptr &&McContext, std::optional SplitIdx) - : ModuleAndContext(std::move(ModuleAndContext)), + : ModuleAndContext(std::move(MAndContext)), McContext(std::move(McContext)), MachineModuleInfo(std::move(MachineModuleInfo)), - FnNameToFnPtr(std::move(FnNameToFnPtr)), TgtMachine(std::move(TgtMachine)), SplitIdx(SplitIdx) { std::string BufStr; llvm::raw_string_ostream BufOS(BufStr); llvm::WriteBitcodeToFile(*ModuleAndContext, BufOS); ModuleBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BufStr.size()); memcpy(ModuleBuf->getBufferStart(), BufStr.c_str(), BufStr.size()); + for(Function& F: ModuleAndContext->functions()) { + FnNameToFnPtr.insert( {F.getName(), &F}); + } } //============================================================================== diff --git a/llvm/lib/MCLinker/MCPipeline.cpp b/llvm/lib/MCLinker/MCPipeline.cpp index 9ea69d1ab226d..db5aa32eecda3 100644 --- a/llvm/lib/MCLinker/MCPipeline.cpp +++ b/llvm/lib/MCLinker/MCPipeline.cpp @@ -57,7 +57,7 @@ bool SetMachineFunctionBasePass::doFinalization(llvm::Module &) { /// AsmPrint. Returns true if failed. bool llvm::mclinker::addPassesToEmitMC( llvm::TargetMachine &TgtMachine, llvm::legacy::PassManagerBase &PM, - llvm::raw_pwrite_stream &Out, bool DisableVerify, + bool DisableVerify, llvm::MachineModuleInfoWrapperPass *MMIWP, unsigned NumFnBase) { // Targets may override createPassConfig to provide a target-specific // subclass. 
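The MCPipeline change above removes the raw_pwrite_stream parameter from
addPassesToEmitMC: this pipeline stops before AsmPrint, so it never writes to
the stream, and output is produced later by MCLinker::linkAndPrint. A call
site now looks like this (mirroring the new unittest below):

```
llvm::legacy::PassManager PassMgr;
mclinker::addPassesToEmitMC(*TM, PassMgr, /*DisableVerify=*/true, MMIWP,
                            NumFunctionsBase);
```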
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt index 81abce51b8939..9735f440a67ef 100644 --- a/llvm/unittests/CMakeLists.txt +++ b/llvm/unittests/CMakeLists.txt @@ -49,6 +49,7 @@ add_subdirectory(IR) add_subdirectory(LineEditor) add_subdirectory(Linker) add_subdirectory(MC) +add_subdirectory(MCLinker) add_subdirectory(MI) add_subdirectory(MIR) add_subdirectory(ObjCopy) diff --git a/llvm/unittests/MCLinker/CMakeLists.txt b/llvm/unittests/MCLinker/CMakeLists.txt new file mode 100644 index 0000000000000..0eed16782fad1 --- /dev/null +++ b/llvm/unittests/MCLinker/CMakeLists.txt @@ -0,0 +1,24 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Analysis + AsmParser + AsmPrinter + CodeGen + CodeGenTypes + Core + FileCheck + IRPrinter + MC + MCLinker + MIRParser + ModuleSplitter + Passes + Support + Target + TargetParser + TransformUtils + ) + +add_llvm_unittest(MCLinkerTests + MCLinkerTest.cpp + ) diff --git a/llvm/unittests/MCLinker/MCLinkerTest.cpp b/llvm/unittests/MCLinker/MCLinkerTest.cpp new file mode 100644 index 0000000000000..1563ef6211d33 --- /dev/null +++ b/llvm/unittests/MCLinker/MCLinkerTest.cpp @@ -0,0 +1,176 @@ +//===- llvm/unittest/Linker/LinkModulesTest.cpp - IRBuilder tests ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "llvm/MCLinker/MCLinker.h" +#include "llvm/MCLinker/MCPipeline.h" +#include "llvm/ModuleSplitter/ModuleSplitter.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Testing/Support/Error.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +class MCLinkerTest : public testing::Test { +protected: + static void SetUpTestCase() { + LLVMInitializeX86TargetInfo(); + LLVMInitializeX86TargetMC(); + LLVMInitializeX86Target(); + LLVMInitializeX86AsmPrinter(); + } + + // Get TargetMachine. 
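+  // May return null when the X86 target is not linked into this build;
+  // SetUp() turns that into a GTEST_SKIP rather than a test failure.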
+ std::unique_ptr getTargetMachine() { + // Get target triple for X86_64 + Triple TargetTriple("x86_64--"); + std::string Error; + const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); + if (!T) + return nullptr; + + TargetOptions Options; + return std::unique_ptr(T->createTargetMachine( + TargetTriple, "", "", Options, Reloc::Model::PIC_, {}, + CodeGenOptLevel::Default)); + } + + std::unique_ptr getMCContext(TargetMachine &TM) { + Triple TargetTriple("x86_64--"); + std::unique_ptr Ctx( + new MCContext(TargetTriple, TM.getMCAsmInfo(), TM.getMCRegisterInfo(), + TM.getMCSubtargetInfo())); + + Ctx->setObjectFileInfo(TM.getObjFileLowering()); + TM.getObjFileLowering()->Initialize(*Ctx, TM); + Ctx->setObjectFileInfo(TM.getObjFileLowering()); + return Ctx; + } + + MachineModuleInfoWrapperPass *getMMIWP(TargetMachine &TM, + MCContext &ExternMC) { + return new MachineModuleInfoWrapperPass(&TM, &ExternMC); + } + + void SetUp() override { + // Module to compile. + const char *FooStr = R""""( + define void @foo() { + call void @baz() + ret void + } + + define void @baz() { + ret void + } + + define void @bar() { + call void @baz() + ret void + } + + define void @boo() { + ret void + } + )""""; + StringRef AssemblyF(FooStr); + + TM = getTargetMachine(); + + if (!TM) + GTEST_SKIP(); + + // Parse the module. + Expected MResult = M.create( + [&](llvm::LLVMContext &Context) -> Expected> { + SMDiagnostic SMError; + std::unique_ptr M = + parseAssemblyString(AssemblyF, SMError, Context); + if (!M) { + return make_error("could not load LLVM file", + inconvertibleErrorCode()); + } + return M; + }); + + ASSERT_FALSE((!MResult)); + + M->setDataLayout(TM->createDataLayout()); + } + + LLVMModuleAndContext M; + std::unique_ptr TM; +}; + +TEST_F(MCLinkerTest, SplitModuleCompilerMCLink) { + + SymbolAndMCInfo SMCInfo; + bool Failed = false; + + auto OutputLambda = + [&](llvm::unique_function ProduceModule, + std::optional Idx, unsigned NumFunctionsBase) mutable { + LLVMModuleAndContext SubModule = ProduceModule(); + std::unique_ptr TM = getTargetMachine(); + std::unique_ptr MCCtx = getMCContext(*TM); + MachineModuleInfoWrapperPass *MMIWP = getMMIWP(*TM, *MCCtx); + + legacy::PassManager PassMgr; + mclinker::addPassesToEmitMC(*TM, PassMgr, true, MMIWP, + NumFunctionsBase); + if (!PassMgr.run(*SubModule)) + Failed = true; + + SMCInfo.McInfos.emplace_back(std::make_unique( + std::make_unique(std::move(MMIWP->getMMI())), + std::move(SubModule), std::move(TM), std::move(MCCtx), Idx)); + }; + + splitPerFunction(std::move(M), OutputLambda, SMCInfo.SymbolLinkageTypes, 0); + + std::unique_ptr TMMCLink = getTargetMachine(); + SmallVector SMCInfos{&SMCInfo}; + llvm::StringMap SymbolLinkageTypes; + + MCLinker Linker(SMCInfos, *TMMCLink, SymbolLinkageTypes); + + Expected> LinkResult = + Linker.linkAndPrint("SplitModuleCompilerMCLink", + llvm::CodeGenFileType::AssemblyFile, true); + + ASSERT_FALSE((!LinkResult)); + llvm::dbgs() << "Size: " << (*LinkResult)->getBufferSize() << "\n"; + + llvm::dbgs() << StringRef((*LinkResult)->getBufferStart()) << "\n"; +} + +} // end anonymous namespace From eba3734f04be9a830a7fe1c8e97a4b3c50f9c869 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 21:07:24 -0700 Subject: [PATCH 0028/1029] [polly] Use *Set::insert_range (NFC) (#133609) --- polly/lib/Analysis/ScopBuilder.cpp | 9 +++------ polly/lib/Analysis/ScopDetection.cpp | 2 +- polly/lib/CodeGen/IslNodeBuilder.cpp | 3 +-- polly/lib/Support/SCEVValidator.cpp | 4 ++-- polly/lib/Transform/MaximalStaticExpansion.cpp 
| 8 +++----- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 76c9b4775784e..351eab7f93710 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -2633,8 +2633,7 @@ void ScopBuilder::checkForReductions(ScopStmt &Stmt) { if (auto *Ptr = dyn_cast(Load->getPointerOperand())) { const auto &It = State.find(Ptr); if (It != State.end()) - for (const auto &FlowInSetElem : It->second) - InvalidLoads.insert(FlowInSetElem.first); + InvalidLoads.insert_range(llvm::make_first_range(It->second)); } // If this load is used outside this stmt, invalidate it. @@ -2654,8 +2653,7 @@ void ScopBuilder::checkForReductions(ScopStmt &Stmt) { dyn_cast(Store->getPointerOperand())) { const auto &It = State.find(Ptr); if (It != State.end()) - for (const auto &FlowInSetElem : It->second) - InvalidLoads.insert(FlowInSetElem.first); + InvalidLoads.insert_range(llvm::make_first_range(It->second)); } // Propagate the uses of the value operand to the store @@ -2710,8 +2708,7 @@ void ScopBuilder::checkForReductions(ScopStmt &Stmt) { // If this operation is used outside the stmt, invalidate all the loads // which feed into it. if (UsedOutsideStmt) - for (const auto &FlowInSetElem : InstInFlowSet) - InvalidLoads.insert(FlowInSetElem.first); + InvalidLoads.insert_range(llvm::make_first_range(InstInFlowSet)); } } diff --git a/polly/lib/Analysis/ScopDetection.cpp b/polly/lib/Analysis/ScopDetection.cpp index 7ad2e53b589ae..260211bdce31f 100644 --- a/polly/lib/Analysis/ScopDetection.cpp +++ b/polly/lib/Analysis/ScopDetection.cpp @@ -500,7 +500,7 @@ bool ScopDetection::onlyValidRequiredInvariantLoads( } } - Context.RequiredILS.insert(RequiredILS.begin(), RequiredILS.end()); + Context.RequiredILS.insert_range(RequiredILS); return true; } diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp index 6affc202d0a4c..ca497927e2976 100644 --- a/polly/lib/CodeGen/IslNodeBuilder.cpp +++ b/polly/lib/CodeGen/IslNodeBuilder.cpp @@ -325,8 +325,7 @@ void IslNodeBuilder::getReferencesInSubtree(const isl::ast_node &For, SubtreeReferences References = { LI, SE, S, ValueMap, Values, SCEVs, getBlockGenerator(), nullptr}; - for (const auto &I : IDToValue) - Values.insert(I.second); + Values.insert_range(llvm::make_second_range(IDToValue)); // NOTE: this is populated in IslNodeBuilder::addParameters for (const auto &I : OutsideLoopIterations) diff --git a/polly/lib/Support/SCEVValidator.cpp b/polly/lib/Support/SCEVValidator.cpp index 599d7f9d60802..ad3d0c22295b5 100644 --- a/polly/lib/Support/SCEVValidator.cpp +++ b/polly/lib/Support/SCEVValidator.cpp @@ -83,7 +83,7 @@ class ValidatorResult final { /// Add the parameters of Source to this result. void addParamsFrom(const ValidatorResult &Source) { - Parameters.insert(Source.Parameters.begin(), Source.Parameters.end()); + Parameters.insert_range(Source.Parameters); } /// Merge a result. 
@@ -633,7 +633,7 @@ static bool isAffineExpr(Value *V, const Region *R, Loop *Scope, return false; auto ResultParams = Result.getParameters(); - Params.insert(ResultParams.begin(), ResultParams.end()); + Params.insert_range(ResultParams); return true; } diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index c9227ac0bfd10..0719840f74a79 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -139,8 +139,7 @@ class MaximalStaticExpansionImpl { SmallPtrSetImpl &Reads, Scop &S) { if (SAI->isValueKind()) { Writes.insert(S.getValueDef(SAI)); - for (auto MA : S.getValueUses(SAI)) - Reads.insert(MA); + Reads.insert_range(S.getValueUses(SAI)); return true; } else if (SAI->isPHIKind()) { auto Read = S.getPHIRead(SAI); @@ -399,9 +398,8 @@ class MaximalStaticExpansionImpl { /// @param Dependences The RAW dependences of the SCop. void expandPhi(Scop &S, const ScopArrayInfo *SAI, const isl::union_map &Dependences) { - SmallPtrSet Writes; - for (auto MA : S.getPHIIncomings(SAI)) - Writes.insert(MA); + SmallPtrSet Writes(llvm::from_range, + S.getPHIIncomings(SAI)); auto Read = S.getPHIRead(SAI); auto ExpandedSAI = expandAccess(Read); From 976a384ba67adf059ab9fe5550e7e67b6fc53396 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Sun, 30 Mar 2025 01:45:00 -0300 Subject: [PATCH 0029/1029] [clang] implement common-sugar for adjusted member-pointers (#133613) --- clang/lib/AST/ASTContext.cpp | 10 +++++++++- clang/test/SemaCXX/sugar-common-types.cpp | 16 ++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index c9d1bea4c623a..2d9480ebcf00c 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -14135,7 +14135,6 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, CANONICAL_TYPE(IncompleteArray) CANONICAL_TYPE(HLSLAttributedResource) CANONICAL_TYPE(LValueReference) - CANONICAL_TYPE(MemberPointer) CANONICAL_TYPE(ObjCInterface) CANONICAL_TYPE(ObjCObject) CANONICAL_TYPE(ObjCObjectPointer) @@ -14313,6 +14312,15 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, return QualType(); return Ctx.getUsingType(CD, Ctx.getQualifiedType(Underlying)); } + case Type::MemberPointer: { + const auto *PX = cast(X), + *PY = cast(Y); + CXXRecordDecl *Cls = PX->getMostRecentCXXRecordDecl(); + assert(Cls == PY->getMostRecentCXXRecordDecl()); + return Ctx.getMemberPointerType( + ::getCommonPointeeType(Ctx, PX, PY), + ::getCommonQualifier(Ctx, PX, PY, /*IsSame=*/false), Cls); + } case Type::CountAttributed: { const auto *DX = cast(X), *DY = cast(Y); diff --git a/clang/test/SemaCXX/sugar-common-types.cpp b/clang/test/SemaCXX/sugar-common-types.cpp index a21032517b2ba..d58f6cdd900fc 100644 --- a/clang/test/SemaCXX/sugar-common-types.cpp +++ b/clang/test/SemaCXX/sugar-common-types.cpp @@ -186,3 +186,19 @@ namespace arrays { // expected-error@-1 {{lvalue of type 'const volatile volatile B1[1]' (aka 'const volatile volatile int[1]')}} } // namespace balanced_qualifiers } // namespace arrays + +namespace member_pointers { + template struct W { + X1 a; + Y1 b; + }; + struct W1 : W {}; + struct W2 : W {}; + + N t1 = 0 ? &W::a : &W::b; + // expected-error@-1 {{rvalue of type 'B1 W::*'}} + + // FIXME: adjusted MemberPointer does not preserve qualifier + N t3 = 0 ? 
&W1::a : &W2::b; + // expected-error@-1 {{rvalue of type 'B1 W::*'}} +} // namespace member_pointers From 9747bb182f430bb1bd3525b7f42e88df626e28e5 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Sat, 29 Mar 2025 22:07:56 -0700 Subject: [PATCH 0030/1029] [CodeGen][StaticDataSplitter]Support constant pool partitioning (#129781) This is a follow-up patch of https://github.com/llvm/llvm-project/pull/125756 In this PR, static-data-splitter pass produces the aggregated profile counts of constants for constant pools in a global state (`StateDataProfileInfo`), and asm printer consumes the profile counts to produce `.hot` or `.unlikely` prefixes. This implementation covers both x86 and aarch64 asm printer. --- llvm/include/llvm/CodeGen/AsmPrinter.h | 12 ++ .../CodeGen/TargetLoweringObjectFileImpl.h | 6 + .../llvm/Target/TargetLoweringObjectFile.h | 7 + llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 9 +- llvm/lib/CodeGen/StaticDataSplitter.cpp | 82 ++++++--- .../CodeGen/TargetLoweringObjectFileImpl.cpp | 35 ++++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 6 + llvm/lib/Target/TargetLoweringObjectFile.cpp | 12 ++ llvm/lib/Target/X86/X86AsmPrinter.cpp | 6 + .../AArch64/constant-pool-partition.ll | 172 ++++++++++++++++++ .../CodeGen/X86/constant-pool-partition.ll | 141 ++++++++++++++ 11 files changed, 460 insertions(+), 28 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/constant-pool-partition.ll create mode 100644 llvm/test/CodeGen/X86/constant-pool-partition.ll diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 4dd45a1a7774d..16363fbaa4f9a 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -18,6 +18,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" +#include "llvm/Analysis/StaticDataProfileInfo.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/DwarfStringPoolEntry.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -132,6 +134,12 @@ class AsmPrinter : public MachineFunctionPass { /// default, this is equal to CurrentFnSym. MCSymbol *CurrentFnSymForSize = nullptr; + /// Provides the profile information for constants. + const StaticDataProfileInfo *SDPI = nullptr; + + /// The profile summary information. + const ProfileSummaryInfo *PSI = nullptr; + /// Map a basic block section ID to the begin and end symbols of that section /// which determine the section's range. struct MBBSectionRange { @@ -330,6 +338,10 @@ class AsmPrinter : public MachineFunctionPass { DwarfUsesRelocationsAcrossSections = Enable; } + /// Returns a section suffix (hot or unlikely) for the constant if profiles + /// are available. Returns empty string otherwise. + StringRef getConstantSectionSuffix(const Constant *C) const; + //===------------------------------------------------------------------===// // XRay instrumentation implementation. 
//===------------------------------------------------------------------===// diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index 7c929262f6823..8b0e5798d1b61 100644 --- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -66,6 +66,12 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile { const Constant *C, Align &Alignment) const override; + /// Similar to the function above, but append \p SectionSuffix to the section + /// name. + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, + const Constant *C, Align &Alignment, + StringRef SectionSuffix) const override; + MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override; diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h index 9fc09bb7db6c2..47617424a9688 100644 --- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h +++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h @@ -104,6 +104,13 @@ class TargetLoweringObjectFile : public MCObjectFileInfo { SectionKind Kind, const Constant *C, Align &Alignment) const; + /// Similar to the function above, but append \p SectionSuffix to the section + /// name. + virtual MCSection *getSectionForConstant(const DataLayout &DL, + SectionKind Kind, const Constant *C, + Align &Alignment, + StringRef SectionSuffix) const; + virtual MCSection * getSectionForMachineBasicBlock(const Function &F, const MachineBasicBlock &MBB, diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index c626202753824..2d76aa5488333 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2769,6 +2769,13 @@ namespace { } // end anonymous namespace +StringRef AsmPrinter::getConstantSectionSuffix(const Constant *C) const { + if (TM.Options.EnableStaticDataPartitioning && C && SDPI && PSI) + return SDPI->getConstantSectionPrefix(C, PSI); + + return ""; +} + /// EmitConstantPool - Print to the current output stream assembly /// representations of the constants in the constant pool MCP. This is /// used to print out constants which have been "spilled to memory" by @@ -2792,7 +2799,7 @@ void AsmPrinter::emitConstantPool() { C = CPE.Val.ConstVal; MCSection *S = getObjFileLowering().getSectionForConstant( - getDataLayout(), Kind, C, Alignment); + getDataLayout(), Kind, C, Alignment, getConstantSectionSuffix(C)); // The number of sections are small, just do a linear search from the // last section to the first. diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp index 60501b4495082..8e12c5e5439ba 100644 --- a/llvm/lib/CodeGen/StaticDataSplitter.cpp +++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp @@ -10,7 +10,7 @@ // for the following types of static data: // - Jump tables // - Module-internal global variables -// - Constant pools (TODO) +// - Constant pools // // For the original RFC of this pass please see // https://discourse.llvm.org/t/rfc-profile-guided-static-data-partitioning/83744 @@ -60,8 +60,8 @@ class StaticDataSplitter : public MachineFunctionPass { // Returns the constant if the operand refers to a global variable or constant // that gets lowered to static data sections. Otherwise, return nullptr. 
- const Constant *getConstant(const MachineOperand &Op, - const TargetMachine &TM); + const Constant *getConstant(const MachineOperand &Op, const TargetMachine &TM, + const MachineConstantPool *MCP); // Use profiles to partition static data. bool partitionStaticDataWithProfiles(MachineFunction &MF); @@ -89,8 +89,11 @@ class StaticDataSplitter : public MachineFunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); - // This pass does not modify the CFG. - AU.setPreservesCFG(); + // This pass does not modify any required analysis results except + // StaticDataProfileInfoWrapperPass, but StaticDataProfileInfoWrapperPass + // is made an immutable pass that it won't be re-scheduled by pass manager + // anyway. So mark setPreservesAll() here for faster compile time. + AU.setPreservesAll(); } bool runOnMachineFunction(MachineFunction &MF) override; @@ -119,40 +122,63 @@ bool StaticDataSplitter::runOnMachineFunction(MachineFunction &MF) { return Changed; } -const Constant *StaticDataSplitter::getConstant(const MachineOperand &Op, - const TargetMachine &TM) { - if (!Op.isGlobal()) +const Constant * +StaticDataSplitter::getConstant(const MachineOperand &Op, + const TargetMachine &TM, + const MachineConstantPool *MCP) { + if (!Op.isGlobal() && !Op.isCPI()) return nullptr; - // Find global variables with local linkage. - const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal()); - // Skip 'llvm.'-prefixed global variables conservatively because they are - // often handled specially, and skip those not in static data sections. - if (!GV || GV->getName().starts_with("llvm.") || - !inStaticDataSection(*GV, TM)) + if (Op.isGlobal()) { + // Find global variables with local linkage. + const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal()); + // Skip 'llvm.'-prefixed global variables conservatively because they are + // often handled specially, and skip those not in static data + // sections. + if (!GV || GV->getName().starts_with("llvm.") || + !inStaticDataSection(*GV, TM)) + return nullptr; + return GV; + } + assert(Op.isCPI() && "Op must be constant pool index in this branch"); + int CPI = Op.getIndex(); + if (CPI == -1) + return nullptr; + + assert(MCP != nullptr && "Constant pool info is not available."); + const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI]; + + if (CPE.isMachineConstantPoolEntry()) return nullptr; - return GV; + + return CPE.Val.ConstVal; } bool StaticDataSplitter::partitionStaticDataWithProfiles(MachineFunction &MF) { - int NumChangedJumpTables = 0; + // If any of the static data (jump tables, global variables, constant pools) + // are captured by the analysis, set `Changed` to true. Note this pass won't + // invalidate any analysis pass (see `getAnalysisUsage` above), so the main + // purpose of tracking and conveying the change (to pass manager) is + // informative as opposed to invalidating any analysis results. As an example + // of where this information is useful, `PMDataManager::dumpPassInfo` will + // only dump pass info if a local change happens, otherwise a pass appears as + // "skipped". + bool Changed = false; - const TargetMachine &TM = MF.getTarget(); MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); // Jump table could be used by either terminating instructions or // non-terminating ones, so we walk all instructions and use // `MachineOperand::isJTI()` to identify jump table operands. - // Similarly, `MachineOperand::isCPI()` can identify constant pool usages - // in the same loop. 
+ // Similarly, `MachineOperand::isCPI()` is used to identify constant pool + // usages in the same loop. for (const auto &MBB : MF) { + std::optional Count = MBFI->getBlockProfileCount(&MBB); for (const MachineInstr &I : MBB) { for (const MachineOperand &Op : I.operands()) { - if (!Op.isJTI() && !Op.isGlobal()) + if (!Op.isJTI() && !Op.isGlobal() && !Op.isCPI()) continue; - std::optional Count = MBFI->getBlockProfileCount(&MBB); - if (Op.isJTI()) { assert(MJTI != nullptr && "Jump table info is not available."); const int JTI = Op.getIndex(); @@ -168,15 +194,16 @@ bool StaticDataSplitter::partitionStaticDataWithProfiles(MachineFunction &MF) { if (Count && PSI->isColdCount(*Count)) Hotness = MachineFunctionDataHotness::Cold; - if (MJTI->updateJumpTableEntryHotness(JTI, Hotness)) - ++NumChangedJumpTables; - } else if (const Constant *C = getConstant(Op, TM)) { + Changed |= MJTI->updateJumpTableEntryHotness(JTI, Hotness); + } else if (const Constant *C = + getConstant(Op, MF.getTarget(), MF.getConstantPool())) { SDPI->addConstantProfileCount(C, Count); + Changed = true; } } } } - return NumChangedJumpTables > 0; + return Changed; } const GlobalVariable * @@ -218,7 +245,8 @@ void StaticDataSplitter::annotateStaticDataWithoutProfiles( for (const auto &MBB : MF) for (const MachineInstr &I : MBB) for (const MachineOperand &Op : I.operands()) - if (const Constant *C = getConstant(Op, MF.getTarget())) + if (const Constant *C = + getConstant(Op, MF.getTarget(), MF.getConstantPool())) SDPI->addConstantProfileCount(C, std::nullopt); } diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index dd6d85e3662db..4c20c5dc74d9a 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1068,6 +1068,41 @@ MCSection *TargetLoweringObjectFileELF::getSectionForConstant( return DataRelROSection; } +MCSection *TargetLoweringObjectFileELF::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C, Align &Alignment, + StringRef SectionSuffix) const { + // TODO: Share code between this function and + // MCObjectInfo::initELFMCObjectFileInfo. + if (SectionSuffix.empty()) + return getSectionForConstant(DL, Kind, C, Alignment); + + auto &Context = getContext(); + if (Kind.isMergeableConst4() && MergeableConst4Section) + return Context.getELFSection(".rodata.cst4." + SectionSuffix, + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_MERGE, 4); + if (Kind.isMergeableConst8() && MergeableConst8Section) + return Context.getELFSection(".rodata.cst8." + SectionSuffix, + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_MERGE, 8); + if (Kind.isMergeableConst16() && MergeableConst16Section) + return Context.getELFSection(".rodata.cst16." + SectionSuffix, + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_MERGE, 16); + if (Kind.isMergeableConst32() && MergeableConst32Section) + return Context.getELFSection(".rodata.cst32." + SectionSuffix, + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_MERGE, 32); + if (Kind.isReadOnly()) + return Context.getELFSection(".rodata." + SectionSuffix, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC); + + assert(Kind.isReadOnlyWithRel() && "Unknown section kind"); + return Context.getELFSection(".data.rel.ro." + SectionSuffix, + ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE); +} + /// Returns a unique section for the given machine basic block. 
MCSection *TargetLoweringObjectFileELF::getSectionForMachineBasicBlock( const Function &F, const MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index ff1aee9bda6e5..d29a72a4f6884 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -226,6 +226,12 @@ class AArch64AsmPrinter : public AsmPrinter { } bool runOnMachineFunction(MachineFunction &MF) override { + if (auto *PSIW = getAnalysisIfAvailable()) + PSI = &PSIW->getPSI(); + if (auto *SDPIW = + getAnalysisIfAvailable()) + SDPI = &SDPIW->getStaticDataProfileInfo(); + AArch64FI = MF.getInfo(); STI = &MF.getSubtarget(); diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index cab9bc8678a58..0920c3345ecf3 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -385,6 +385,18 @@ MCSection *TargetLoweringObjectFile::getSectionForConstant( return DataSection; } +MCSection *TargetLoweringObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C, Align &Alignment, + StringRef SectionPrefix) const { + // Fallback to `getSectionForConstant` without `SectionPrefix` parameter if it + // is empty. + if (SectionPrefix.empty()) + return getSectionForConstant(DL, Kind, C, Alignment); + report_fatal_error( + "TargetLoweringObjectFile::getSectionForConstant that " + "accepts SectionPrefix is not implemented for the object file format"); +} + MCSection *TargetLoweringObjectFile::getSectionForMachineBasicBlock( const Function &F, const MachineBasicBlock &MBB, const TargetMachine &TM) const { diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index 79aa898e18bfa..a227afe37d737 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -20,6 +20,7 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" +#include "llvm/Analysis/StaticDataProfileInfo.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineConstantPool.h" @@ -61,6 +62,11 @@ X86AsmPrinter::X86AsmPrinter(TargetMachine &TM, /// runOnMachineFunction - Emit the function body. /// bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { + if (auto *PSIW = getAnalysisIfAvailable()) + PSI = &PSIW->getPSI(); + if (auto *SDPIW = getAnalysisIfAvailable()) + SDPI = &SDPIW->getStaticDataProfileInfo(); + Subtarget = &MF.getSubtarget(); SMShadowTracker.startFunction(MF); diff --git a/llvm/test/CodeGen/AArch64/constant-pool-partition.ll b/llvm/test/CodeGen/AArch64/constant-pool-partition.ll new file mode 100644 index 0000000000000..ab627b02a1bc7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/constant-pool-partition.ll @@ -0,0 +1,172 @@ +; RUN: llc -mtriple=aarch64 -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=true \ +; RUN: -unique-section-names=false \ +; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always + +; Repeat the RUN command above for big-endian systems. +; RUN: llc -mtriple=aarch64_be -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=true \ +; RUN: -unique-section-names=false \ +; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always + +; Tests that constant pool hotness is aggregated across the module. 
The +; static-data-splitter processes data from cold_func first, unprofiled_func +; secondly, and then hot_func. Specifically, tests that +; - If a constant is accessed by hot functions, all constant pools for this +; constant (e.g., from an unprofiled function, or cold function) should have +; `.hot` suffix. For instance, double 0.68 is seen by both @cold_func and +; @hot_func, so two CPI emits (under label LCPI0_0 and LCPI2_0) have `.hot` +; suffix. +; - Similarly if a constant is accessed by both cold function and un-profiled +; function, constant pools for this constant should not have `.unlikely` suffix. + +;; Constant pools for function @cold_func. +; CHECK: .section .rodata.cst8.hot,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .xword 0x3fe5c28f5c28f5c3 // double 0.68000000000000005 +; CHECK-NEXT: .section .rodata.cst8.unlikely,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .xword 0x3fe5eb851eb851ec // double 0.68500000000000005 +; CHECK-NEXT: .section .rodata.cst8,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI0_2: +; CHECK-NEXT: .byte 0 // 0x0 +; CHECK-NEXT: .byte 4 // 0x4 +; CHECK-NEXT: .byte 8 // 0x8 +; CHECK-NEXT: .byte 12 // 0xc +; CHECK-NEXT: .byte 255 // 0xff +; CHECK-NEXT: .byte 255 // 0xff +; CHECK-NEXT: .byte 255 // 0xff +; CHECK-NEXT: .byte 255 // 0xff + +;; Constant pools for function @unprofiled_func +; CHECK: .section .rodata.cst8,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .byte 0 // 0x0 +; CHECK-NEXT: .byte 4 // 0x4 +; CHECK-NEXT: .byte 8 // 0x8 +; CHECK-NEXT: .byte 12 // 0xc +; CHECK-NEXT: .byte 255 // 0xff +; CHECK-NEXT: .byte 255 // 0xff +; CHECK-NEXT: .byte 255 // 0xff +; CHECK-NEXT: .byte 255 // 0xff +; CHECK-NEXT: .section .rodata.cst16,"aM",@progbits,16 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .word 2 // 0x2 +; CHECK-NEXT: .word 3 // 0x3 +; CHECK-NEXT: .word 5 // 0x5 +; CHECK-NEXT: .word 7 // 0x7 +; CHECK-NEXT: .section .rodata.cst16.hot,"aM",@progbits,16 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI1_2: +; CHECK-NEXT: .word 442 // 0x1ba +; CHECK-NEXT: .word 100 // 0x64 +; CHECK-NEXT: .word 0 // 0x0 +; CHECK-NEXT: .word 0 // 0x0 + +;; Constant pools for function @hot_func +; CHECK: .section .rodata.cst8.hot,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .xword 0x3fe5c28f5c28f5c3 // double 0.68000000000000005 +; CHECK-NEXT: .section .rodata.cst16.hot,"aM",@progbits,16 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI2_1: +; CHECK-NEXT: .word 0 // 0x0 +; CHECK-NEXT: .word 100 // 0x64 +; CHECK-NEXT: .word 0 // 0x0 +; CHECK-NEXT: .word 442 // 0x1ba +; CHECK-NEXT: .LCPI2_2: +; CHECK-NEXT: .word 442 // 0x1ba +; CHECK-NEXT: .word 100 // 0x64 +; CHECK-NEXT: .word 0 // 0x0 +; CHECK-NEXT: .word 0 // 0x0 + +;; For global variable @val +;; The section name remains `.rodata.cst32` without hotness prefix because +;; the variable has external linkage and not analyzed. Compiler need symbolized +;; data access profiles to annotate such global variables' hotness. +; CHECK: .section .rodata.cst32,"aM",@progbits,32 +; CHECK-NEXT: .globl val + +define i32 @cold_func(double %x, <16 x i8> %a, <16 x i8> %b) !prof !16 { + %2 = tail call i32 (...) @func_taking_arbitrary_param(double 6.800000e-01) + %num = tail call i32 (...) 
@func_taking_arbitrary_param(double 6.8500000e-01) + %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> ) + %t2 = bitcast <8 x i8> %t1 to <2 x i32> + %3 = extractelement <2 x i32> %t2, i32 1 + %sum = add i32 %2, %3 + %ret = add i32 %sum, %num + ret i32 %ret +} + +declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) +declare i32 @func_taking_arbitrary_param(...) + +define <4 x i1> @unprofiled_func(<16 x i8> %a, <16 x i8> %b) { + %t1 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %a, <16 x i8> %b, <8 x i8> ) + %t2 = bitcast <8 x i8> %t1 to <4 x i16> + %t3 = zext <4 x i16> %t2 to <4 x i32> + %t4 = add <4 x i32> %t3, + %cmp = icmp ule <4 x i32> , %t4 + ret <4 x i1> %cmp +} + +define <4 x i1> @hot_func(i32 %0, <4 x i32> %a) !prof !17 { + %2 = tail call i32 (...) @func_taking_arbitrary_param(double 6.800000e-01) + %b = add <4 x i32> , %a + %c = icmp ule <4 x i32> %b, + ret <4 x i1> %c +} + +@val = unnamed_addr constant i256 1 + +define i32 @main(i32 %0, ptr %1) !prof !16 { + br label %7 + +5: ; preds = %7 + %x = call double @double_func() + %a = call <16 x i8> @vector_func_16i8() + %b = call <16 x i8> @vector_func_16i8() + call void @cold_func(double %x, <16 x i8> %a, <16 x i8> %b) + ret i32 0 + +7: ; preds = %7, %2 + %8 = phi i32 [ 0, %2 ], [ %10, %7 ] + %seed_val = load i256, ptr @val + %9 = call i32 @seed(i256 %seed_val) + call void @hot_func(i32 %9) + %10 = add i32 %8, 1 + %11 = icmp eq i32 %10, 100000 + br i1 %11, label %5, label %7, !prof !18 +} + +declare i32 @seed(i256) +declare double @double_func() +declare <4 x i32> @vector_func() +declare <16 x i8> @vector_func_16i8() + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 1460617} +!5 = !{!"MaxCount", i64 849536} +!6 = !{!"MaxInternalCount", i64 32769} +!7 = !{!"MaxFunctionCount", i64 849536} +!8 = !{!"NumCounts", i64 23784} +!9 = !{!"NumFunctions", i64 3301} +!10 = !{!"IsPartialProfile", i64 0} +!11 = !{!"PartialProfileRatio", double 0.000000e+00} +!12 = !{!"DetailedSummary", !13} +!13 = !{!14, !15} +!14 = !{i32 990000, i64 166, i32 73} +!15 = !{i32 999999, i64 3, i32 1463} +!16 = !{!"function_entry_count", i64 1} +!17 = !{!"function_entry_count", i64 100000} +!18 = !{!"branch_weights", i32 1, i32 99999} diff --git a/llvm/test/CodeGen/X86/constant-pool-partition.ll b/llvm/test/CodeGen/X86/constant-pool-partition.ll new file mode 100644 index 0000000000000..d2c87b7b3fc14 --- /dev/null +++ b/llvm/test/CodeGen/X86/constant-pool-partition.ll @@ -0,0 +1,141 @@ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +; Tests that constant pool hotness is aggregated across the module. The +; static-data-splitter processes data from @cold_func first, two functions +; without profiles secondly, and then @hot_func. Specifically, tests that +; 1. If a constant is accessed by hot functions, all constant pools for this +; constant (e.g., from an unprofiled function, or cold function) should have +; .hot suffix. +; 2. Similarly if a constant is accessed by both cold function and un-profiled +; function, constant pools for this constant should not have .unlikely suffix. 
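+; The llc invocations below vary -function-sections, -data-sections, and
+; -unique-section-names while sharing one set of FileCheck expectations: the
+; hot/unlikely suffix placement should not depend on those options.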
+ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=true -data-sections=true \ +; RUN: -unique-section-names=false \ +; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always + +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=true -data-sections=true \ +; RUN: -unique-section-names=true \ +; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always + +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=false -data-sections=false \ +; RUN: -unique-section-names=false \ +; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always + +;; For function @cold_func +; CHECK: .section .rodata.cst8.hot,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .quad 0x3fe5c28f5c28f5c3 # double 0.68000000000000005 +; CHECK-NEXT: .section .rodata.cst8.unlikely,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI0_1: +; CHECK-NEXT: .quad 0x3eb0000000000000 # double 9.5367431640625E-7 +; CHECK-NEXT: .section .rodata.cst8,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI0_2: +; CHECK-NEXT: .quad 0x3fc0000000000000 # double 0.125 + +;; For function @unprofiled_func_double +; CHECK: .section .rodata.cst8,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .quad 0x3fc0000000000000 # double 0.125 + +;; For function @unprofiled_func_float +; CHECK: .section .rodata.cst4,"aM",@progbits,4 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .long 0x3e000000 # float 0.125 + +;; For function @hot_func +; CHECK: .section .rodata.cst8.hot,"aM",@progbits,8 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI3_0: +; CHECK-NEXT: .quad 0x3fe5c28f5c28f5c3 # double 0.68000000000000005 +; CHECK-NEXT: .section .rodata.cst16.hot,"aM",@progbits,16 +; CHECK-NEXT: .p2align +; CHECK-NEXT: .LCPI3_1: +; CHECK-NEXT: .long 2147483648 # 0x80000000 +; CHECK-NEXT: .long 2147483648 # 0x80000000 +; CHECK-NEXT: .long 2147483648 # 0x80000000 +; CHECK-NEXT: .long 2147483648 # 0x80000000 +; CHECK-NEXT: .LCPI3_2: +; CHECK-NEXT: .long 2147484090 # 0x800001ba +; CHECK-NEXT: .long 2147483748 # 0x80000064 +; CHECK-NEXT: .long 2147483648 # 0x80000000 +; CHECK-NEXT: .long 2147483648 # 0x80000000 + +; CHECK: .section .rodata.cst32,"aM",@progbits,32 +; CHECK-NEXT: .globl val + +define double @cold_func(double %x) !prof !16 { + %2 = tail call i32 (...) @func_taking_arbitrary_param(double 6.800000e-01) + %y = fmul double %x, 0x3EB0000000000000 + %z = fmul double %y, 0x3fc0000000000000 + ret double %z +} + +define double @unprofiled_func_double(double %x) { + %z = fmul double %x, 0x3fc0000000000000 + ret double %z +} + +define float @unprofiled_func_float(float %x) { + %z = fmul float %x, 0x3fc0000000000000 + ret float %z +} + +define <4 x i1> @hot_func(i32 %0, <4 x i32> %a) !prof !17 { + %2 = tail call i32 (...) 
@func_taking_arbitrary_param(double 6.800000e-01) + %b = icmp ule <4 x i32> %a, + ret <4 x i1> %b +} + +@val = unnamed_addr constant i256 1 + +define i32 @main(i32 %0, ptr %1) !prof !16 { + br label %7 + +5: ; preds = %7 + %x = call double @double_func() + call void @cold_func(double %x) + ret i32 0 + +7: ; preds = %7, %2 + %8 = phi i32 [ 0, %2 ], [ %10, %7 ] + %seed_val = load i256, ptr @val + %9 = call i32 @seed(i256 %seed_val) + call void @hot_func(i32 %9) + %10 = add i32 %8, 1 + %11 = icmp eq i32 %10, 100000 + br i1 %11, label %5, label %7, !prof !18 +} + +declare i32 @seed(i256) +declare double @double_func() +declare i32 @func_taking_arbitrary_param(...) + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10, !11, !12} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 1460617} +!5 = !{!"MaxCount", i64 849536} +!6 = !{!"MaxInternalCount", i64 32769} +!7 = !{!"MaxFunctionCount", i64 849536} +!8 = !{!"NumCounts", i64 23784} +!9 = !{!"NumFunctions", i64 3301} +!10 = !{!"IsPartialProfile", i64 0} +!11 = !{!"PartialProfileRatio", double 0.000000e+00} +!12 = !{!"DetailedSummary", !13} +!13 = !{!14, !15} +!14 = !{i32 990000, i64 166, i32 73} +!15 = !{i32 999999, i64 1, i32 1463} +!16 = !{!"function_entry_count", i64 1} +!17 = !{!"function_entry_count", i64 100000} +!18 = !{!"branch_weights", i32 1, i32 99999} From fea6b388055284f37852e615fbf5b40a3ba34249 Mon Sep 17 00:00:00 2001 From: YLChenZ Date: Sun, 30 Mar 2025 13:37:48 +0800 Subject: [PATCH 0031/1029] [llvm-reduce]: print short form, actionable names in the log (#133561) Closes #132696 before the patch like this: ``` ---------------------------- *** Reducing GlobalObjects... ---------------------------- *** Reducing GV Initializers... ---------------------------- *** Reducing GlobalVariables... ---------------------------- ``` after the patch like this: ``` ---------------------------- *** Reducing GlobalObjects (global-objects)... ---------------------------- *** Reducing GV Initializers (global-initializers)... ---------------------------- *** Reducing GlobalVariables (global-variables)... 
---------------------------- ``` --- llvm/tools/llvm-reduce/DeltaManager.cpp | 135 +++++++----------- llvm/tools/llvm-reduce/DeltaPass.h | 24 ++++ llvm/tools/llvm-reduce/DeltaPasses.def | 68 +++++++++ llvm/tools/llvm-reduce/deltas/Delta.cpp | 24 ++-- llvm/tools/llvm-reduce/deltas/Delta.h | 4 +- .../llvm-reduce/deltas/ReduceAliases.cpp | 13 +- llvm/tools/llvm-reduce/deltas/ReduceAliases.h | 4 +- .../llvm-reduce/deltas/ReduceArguments.cpp | 7 +- .../llvm-reduce/deltas/ReduceArguments.h | 2 +- .../llvm-reduce/deltas/ReduceAttributes.cpp | 7 +- .../llvm-reduce/deltas/ReduceAttributes.h | 5 +- .../llvm-reduce/deltas/ReduceBasicBlocks.cpp | 17 +-- .../llvm-reduce/deltas/ReduceBasicBlocks.h | 7 +- .../llvm-reduce/deltas/ReduceDIMetadata.cpp | 7 +- .../llvm-reduce/deltas/ReduceDIMetadata.h | 4 +- .../llvm-reduce/deltas/ReduceDbgRecords.cpp | 8 +- .../llvm-reduce/deltas/ReduceDbgRecords.h | 4 +- .../deltas/ReduceDistinctMetadata.cpp | 10 +- .../deltas/ReduceDistinctMetadata.h | 4 +- .../deltas/ReduceFunctionBodies.cpp | 15 +- .../llvm-reduce/deltas/ReduceFunctionBodies.h | 4 +- .../llvm-reduce/deltas/ReduceFunctions.cpp | 8 +- .../llvm-reduce/deltas/ReduceFunctions.h | 3 +- .../deltas/ReduceGlobalObjects.cpp | 6 +- .../llvm-reduce/deltas/ReduceGlobalObjects.h | 2 +- .../llvm-reduce/deltas/ReduceGlobalValues.cpp | 6 +- .../llvm-reduce/deltas/ReduceGlobalValues.h | 2 +- .../deltas/ReduceGlobalVarInitializers.cpp | 9 +- .../deltas/ReduceGlobalVarInitializers.h | 4 +- .../llvm-reduce/deltas/ReduceGlobalVars.cpp | 8 +- .../llvm-reduce/deltas/ReduceGlobalVars.h | 4 +- .../llvm-reduce/deltas/ReduceIRReferences.cpp | 24 +--- .../llvm-reduce/deltas/ReduceIRReferences.h | 10 +- .../deltas/ReduceInstructionFlags.cpp | 8 +- .../deltas/ReduceInstructionFlags.h | 4 +- .../deltas/ReduceInstructionFlagsMIR.cpp | 8 +- .../deltas/ReduceInstructionFlagsMIR.h | 2 +- .../llvm-reduce/deltas/ReduceInstructions.cpp | 8 +- .../llvm-reduce/deltas/ReduceInstructions.h | 4 +- .../deltas/ReduceInstructionsMIR.cpp | 9 +- .../deltas/ReduceInstructionsMIR.h | 6 +- .../llvm-reduce/deltas/ReduceInvokes.cpp | 7 +- llvm/tools/llvm-reduce/deltas/ReduceInvokes.h | 4 +- .../deltas/ReduceMemoryOperations.cpp | 21 +-- .../deltas/ReduceMemoryOperations.h | 8 +- .../llvm-reduce/deltas/ReduceMetadata.cpp | 13 +- .../tools/llvm-reduce/deltas/ReduceMetadata.h | 6 +- .../llvm-reduce/deltas/ReduceModuleData.cpp | 6 +- .../llvm-reduce/deltas/ReduceModuleData.h | 2 +- .../llvm-reduce/deltas/ReduceOpcodes.cpp | 7 +- llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h | 4 +- .../deltas/ReduceOperandBundles.cpp | 9 +- .../llvm-reduce/deltas/ReduceOperandBundles.h | 5 +- .../llvm-reduce/deltas/ReduceOperands.cpp | 27 +--- .../tools/llvm-reduce/deltas/ReduceOperands.h | 6 +- .../llvm-reduce/deltas/ReduceOperandsSkip.cpp | 7 +- .../llvm-reduce/deltas/ReduceOperandsSkip.h | 2 +- .../deltas/ReduceOperandsToArgs.cpp | 8 +- .../llvm-reduce/deltas/ReduceOperandsToArgs.h | 2 +- .../llvm-reduce/deltas/ReduceRegisterDefs.cpp | 7 +- .../llvm-reduce/deltas/ReduceRegisterDefs.h | 2 +- .../deltas/ReduceRegisterMasks.cpp | 7 +- .../llvm-reduce/deltas/ReduceRegisterMasks.h | 2 +- .../llvm-reduce/deltas/ReduceRegisterUses.cpp | 7 +- .../llvm-reduce/deltas/ReduceRegisterUses.h | 2 +- .../deltas/ReduceSpecialGlobals.cpp | 9 +- .../llvm-reduce/deltas/ReduceSpecialGlobals.h | 2 +- .../deltas/ReduceUsingSimplifyCFG.cpp | 26 ++-- .../deltas/ReduceUsingSimplifyCFG.h | 6 +- .../deltas/ReduceVirtualRegisters.cpp | 10 +- .../deltas/ReduceVirtualRegisters.h | 4 +- 
llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp | 7 +- llvm/tools/llvm-reduce/deltas/RunIRPasses.h | 2 +- .../deltas/SimplifyInstructions.cpp | 6 +- .../llvm-reduce/deltas/SimplifyInstructions.h | 2 +- .../llvm-reduce/deltas/StripDebugInfo.cpp | 7 +- .../tools/llvm-reduce/deltas/StripDebugInfo.h | 2 +- 77 files changed, 303 insertions(+), 459 deletions(-) create mode 100644 llvm/tools/llvm-reduce/DeltaPass.h create mode 100644 llvm/tools/llvm-reduce/DeltaPasses.def diff --git a/llvm/tools/llvm-reduce/DeltaManager.cpp b/llvm/tools/llvm-reduce/DeltaManager.cpp index 624b5306bc71b..5281b1d5aebf2 100644 --- a/llvm/tools/llvm-reduce/DeltaManager.cpp +++ b/llvm/tools/llvm-reduce/DeltaManager.cpp @@ -12,9 +12,8 @@ //===----------------------------------------------------------------------===// #include "DeltaManager.h" -#include "ReducerWorkItem.h" +#include "DeltaPass.h" #include "TestRunner.h" -#include "deltas/Delta.h" #include "deltas/ReduceAliases.h" #include "deltas/ReduceArguments.h" #include "deltas/ReduceAttributes.h" @@ -71,91 +70,56 @@ static cl::list "default, run all delta passes."), cl::cat(LLVMReduceOptions), cl::CommaSeparated); -#define DELTA_PASSES \ - do { \ - DELTA_PASS("strip-debug-info", stripDebugInfoDeltaPass) \ - DELTA_PASS("functions", reduceFunctionsDeltaPass) \ - DELTA_PASS("function-bodies", reduceFunctionBodiesDeltaPass) \ - DELTA_PASS("special-globals", reduceSpecialGlobalsDeltaPass) \ - DELTA_PASS("aliases", reduceAliasesDeltaPass) \ - DELTA_PASS("ifuncs", reduceIFuncsDeltaPass) \ - DELTA_PASS("simplify-conditionals-true", reduceConditionalsTrueDeltaPass) \ - DELTA_PASS("simplify-conditionals-false", \ - reduceConditionalsFalseDeltaPass) \ - DELTA_PASS("invokes", reduceInvokesDeltaPass) \ - DELTA_PASS("unreachable-basic-blocks", \ - reduceUnreachableBasicBlocksDeltaPass) \ - DELTA_PASS("basic-blocks", reduceBasicBlocksDeltaPass) \ - DELTA_PASS("simplify-cfg", reduceUsingSimplifyCFGDeltaPass) \ - DELTA_PASS("function-data", reduceFunctionDataDeltaPass) \ - DELTA_PASS("global-values", reduceGlobalValuesDeltaPass) \ - DELTA_PASS("global-objects", reduceGlobalObjectsDeltaPass) \ - DELTA_PASS("global-initializers", reduceGlobalsInitializersDeltaPass) \ - DELTA_PASS("global-variables", reduceGlobalsDeltaPass) \ - DELTA_PASS("di-metadata", reduceDIMetadataDeltaPass) \ - DELTA_PASS("dbg-records", reduceDbgRecordDeltaPass) \ - DELTA_PASS("distinct-metadata", reduceDistinctMetadataDeltaPass) \ - DELTA_PASS("metadata", reduceMetadataDeltaPass) \ - DELTA_PASS("named-metadata", reduceNamedMetadataDeltaPass) \ - DELTA_PASS("arguments", reduceArgumentsDeltaPass) \ - DELTA_PASS("instructions", reduceInstructionsDeltaPass) \ - DELTA_PASS("simplify-instructions", simplifyInstructionsDeltaPass) \ - DELTA_PASS("ir-passes", runIRPassesDeltaPass) \ - DELTA_PASS("operands-zero", reduceOperandsZeroDeltaPass) \ - DELTA_PASS("operands-one", reduceOperandsOneDeltaPass) \ - DELTA_PASS("operands-nan", reduceOperandsNaNDeltaPass) \ - DELTA_PASS("operands-to-args", reduceOperandsToArgsDeltaPass) \ - DELTA_PASS("operands-skip", reduceOperandsSkipDeltaPass) \ - DELTA_PASS("operand-bundles", reduceOperandBundesDeltaPass) \ - DELTA_PASS("attributes", reduceAttributesDeltaPass) \ - DELTA_PASS("module-data", reduceModuleDataDeltaPass) \ - DELTA_PASS("opcodes", reduceOpcodesDeltaPass) \ - DELTA_PASS("volatile", reduceVolatileInstructionsDeltaPass) \ - DELTA_PASS("atomic-ordering", reduceAtomicOrderingDeltaPass) \ - DELTA_PASS("syncscopes", reduceAtomicSyncScopesDeltaPass) \ - 
DELTA_PASS("instruction-flags", reduceInstructionFlagsDeltaPass) \ - } while (false) - -#define DELTA_PASSES_MIR \ - do { \ - DELTA_PASS("instructions", reduceInstructionsMIRDeltaPass) \ - DELTA_PASS("ir-instruction-references", \ - reduceIRInstructionReferencesDeltaPass) \ - DELTA_PASS("ir-block-references", reduceIRBlockReferencesDeltaPass) \ - DELTA_PASS("ir-function-references", reduceIRFunctionReferencesDeltaPass) \ - DELTA_PASS("instruction-flags", reduceInstructionFlagsMIRDeltaPass) \ - DELTA_PASS("register-uses", reduceRegisterUsesMIRDeltaPass) \ - DELTA_PASS("register-defs", reduceRegisterDefsMIRDeltaPass) \ - DELTA_PASS("register-hints", reduceVirtualRegisterHintsDeltaPass) \ - DELTA_PASS("register-masks", reduceRegisterMasksMIRDeltaPass) \ - } while (false) +// Generate two separate Pass lists: IR_Passes and MIR_Passes +static const DeltaPass IR_Passes[] = { +#undef DELTA_PASS_IR +#undef DELTA_PASS_MIR +#define DELTA_PASS_IR(NAME, FUNC, DESC) {NAME, FUNC, DESC}, +#include "DeltaPasses.def" +#undef DELTA_PASS_IR +}; + +static const DeltaPass MIR_Passes[] = { +#undef DELTA_PASS_IR +#undef DELTA_PASS_MIR +#define DELTA_PASS_MIR(NAME, FUNC, DESC) {NAME, FUNC, DESC}, +#include "DeltaPasses.def" +#undef DELTA_PASS_MIR +}; static void runAllDeltaPasses(TestRunner &Tester, const SmallStringSet &SkipPass) { -#define DELTA_PASS(NAME, FUNC) \ - if (!SkipPass.count(NAME)) { \ - FUNC(Tester); \ - } if (Tester.getProgram().isMIR()) { - DELTA_PASSES_MIR; + for (const DeltaPass &Pass : MIR_Passes) { + if (!SkipPass.count(Pass.Name)) { + runDeltaPass(Tester, Pass); + } + } } else { - DELTA_PASSES; + for (const DeltaPass &Pass : IR_Passes) { + if (!SkipPass.count(Pass.Name)) { + runDeltaPass(Tester, Pass); + } + } } -#undef DELTA_PASS } static void runDeltaPassName(TestRunner &Tester, StringRef PassName) { -#define DELTA_PASS(NAME, FUNC) \ - if (PassName == NAME) { \ - FUNC(Tester); \ - return; \ - } if (Tester.getProgram().isMIR()) { - DELTA_PASSES_MIR; + for (const DeltaPass &Pass : MIR_Passes) { + if (PassName == Pass.Name) { + runDeltaPass(Tester, Pass); + return; + } + } } else { - DELTA_PASSES; + for (const DeltaPass &Pass : IR_Passes) { + if (PassName == Pass.Name) { + runDeltaPass(Tester, Pass); + return; + } + } } -#undef DELTA_PASS // We should have errored on unrecognized passes before trying to run // anything. @@ -164,24 +128,25 @@ static void runDeltaPassName(TestRunner &Tester, StringRef PassName) { void llvm::printDeltaPasses(raw_ostream &OS) { OS << "Delta passes (pass to `--delta-passes=` as a comma separated list):\n"; -#define DELTA_PASS(NAME, FUNC) OS << " " << NAME << "\n"; OS << " IR:\n"; - DELTA_PASSES; + for (const DeltaPass &Pass : IR_Passes) { + OS << " " << Pass.Name << '\n'; + } OS << " MIR:\n"; - DELTA_PASSES_MIR; -#undef DELTA_PASS + for (const DeltaPass &Pass : MIR_Passes) { + OS << " " << Pass.Name << '\n'; + } } // Built a set of available delta passes. 
static void collectPassNames(const TestRunner &Tester, SmallStringSet &NameSet) { -#define DELTA_PASS(NAME, FUNC) NameSet.insert(NAME); - if (Tester.getProgram().isMIR()) { - DELTA_PASSES_MIR; - } else { - DELTA_PASSES; + for (const DeltaPass &Pass : MIR_Passes) { + NameSet.insert(Pass.Name); + } + for (const DeltaPass &Pass : IR_Passes) { + NameSet.insert(Pass.Name); } -#undef DELTA_PASS } /// Verify all requested or skipped passes are valid names, and return them in a diff --git a/llvm/tools/llvm-reduce/DeltaPass.h b/llvm/tools/llvm-reduce/DeltaPass.h new file mode 100644 index 0000000000000..3231570bd23f4 --- /dev/null +++ b/llvm/tools/llvm-reduce/DeltaPass.h @@ -0,0 +1,24 @@ +//===--- DeltaPass.h - Delta Pass Structure --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAPASS_H +#define LLVM_TOOLS_LLVM_REDUCE_DELTAPASS_H + +#include "ReducerWorkItem.h" +#include "deltas/Delta.h" +#include "llvm/ADT/StringRef.h" + +namespace llvm { +struct DeltaPass { + StringRef Name; // e.g., "strip-debug-info" + void (*Func)(Oracle &, ReducerWorkItem &); // e.g., stripDebugInfoDeltaPass + StringRef Desc; // e.g., "Stripping Debug Info" +}; +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-reduce/DeltaPasses.def b/llvm/tools/llvm-reduce/DeltaPasses.def new file mode 100644 index 0000000000000..1b5576b48dcd0 --- /dev/null +++ b/llvm/tools/llvm-reduce/DeltaPasses.def @@ -0,0 +1,68 @@ +//===--- DeltaPasses.def - Delta Pass Definitions --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
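Why DeltaPass::Func is a raw function pointer rather than a function_ref or std::function: the pass tables are static const arrays, so their entries must be stateless and constant-initializable. This is also why the ReduceOperands passes later in this patch stop handing capturing lambdas to runDeltaPass and instead become plain functions that call their helper directly. A minimal sketch with hypothetical stand-in types, showing that only captureless callables convert to such a pointer:

    struct Oracle;   // stand-ins for the llvm-reduce types
    struct WorkItem;

    using PassFn = void (*)(Oracle &, WorkItem &);

    static void realPass(Oracle &, WorkItem &) {}

    int main() {
      PassFn A = realPass;                    // ok: ordinary function
      PassFn B = [](Oracle &, WorkItem &) {}; // ok: captureless lambda
      // int N = 0;
      // PassFn C = [N](Oracle &, WorkItem &) {}; // error: captures state
      (void)A;
      (void)B;
      return 0;
    }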
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + + +#ifndef DELTA_PASS_IR +#define DELTA_PASS_IR(NAME, FUNC, DESC) +#endif +DELTA_PASS_IR("strip-debug-info", stripDebugInfoDeltaPass, "Stripping Debug Info") +DELTA_PASS_IR("functions", reduceFunctionsDeltaPass, "Reducing Functions") +DELTA_PASS_IR("function-bodies", reduceFunctionBodiesDeltaPass, "Reducing Function Bodies") +DELTA_PASS_IR("special-globals", reduceSpecialGlobalsDeltaPass, "Reducing Special Globals") +DELTA_PASS_IR("aliases", reduceAliasesDeltaPass, "Reducing Aliases") +DELTA_PASS_IR("ifuncs", reduceIFuncsDeltaPass, "Reducing Ifuncs") +DELTA_PASS_IR("simplify-conditionals-true", reduceConditionalsTrueDeltaPass, "Reducing conditional branches to true") +DELTA_PASS_IR("simplify-conditionals-false", + reduceConditionalsFalseDeltaPass, "Reducing conditional branches to false") +DELTA_PASS_IR("invokes", reduceInvokesDeltaPass, "Reducing Invokes") +DELTA_PASS_IR("unreachable-basic-blocks", + reduceUnreachableBasicBlocksDeltaPass, "Removing Unreachable Basic Blocks") +DELTA_PASS_IR("basic-blocks", reduceBasicBlocksDeltaPass, "Reducing Basic Blocks") +DELTA_PASS_IR("simplify-cfg", reduceUsingSimplifyCFGDeltaPass, "Reducing using SimplifyCFG") +DELTA_PASS_IR("function-data", reduceFunctionDataDeltaPass, "Reducing Function Data") +DELTA_PASS_IR("global-values", reduceGlobalValuesDeltaPass, "Reducing GlobalValues") +DELTA_PASS_IR("global-objects", reduceGlobalObjectsDeltaPass, "Reducing GlobalObjects") +DELTA_PASS_IR("global-initializers", reduceGlobalsInitializersDeltaPass, "Reducing GV Initializers") +DELTA_PASS_IR("global-variables", reduceGlobalsDeltaPass, "Reducing GlobalVariables") +DELTA_PASS_IR("di-metadata", reduceDIMetadataDeltaPass, "Reducing DIMetadata") +DELTA_PASS_IR("dbg-records", reduceDbgRecordDeltaPass, "Reducing DbgRecords") +DELTA_PASS_IR("distinct-metadata", reduceDistinctMetadataDeltaPass, "Reducing Distinct Metadata") +DELTA_PASS_IR("metadata", reduceMetadataDeltaPass,"Reducing Metadata") +DELTA_PASS_IR("named-metadata", reduceNamedMetadataDeltaPass, "Reducing Named Metadata") +DELTA_PASS_IR("arguments", reduceArgumentsDeltaPass, "Reducing Arguments") +DELTA_PASS_IR("instructions", reduceInstructionsDeltaPass, "Reducing Instructions") +DELTA_PASS_IR("simplify-instructions", simplifyInstructionsDeltaPass, "Simplifying Instructions") +DELTA_PASS_IR("ir-passes", runIRPassesDeltaPass, "Running passes") +DELTA_PASS_IR("operands-zero", reduceOperandsZeroDeltaPass, "Reducing Operands to zero") +DELTA_PASS_IR("operands-one", reduceOperandsOneDeltaPass, "Reducing Operands to one") +DELTA_PASS_IR("operands-nan", reduceOperandsNaNDeltaPass, "Reducing Operands to NaN") +DELTA_PASS_IR("operands-to-args", reduceOperandsToArgsDeltaPass, "Converting operands to function arguments") +DELTA_PASS_IR("operands-skip", reduceOperandsSkipDeltaPass, "Reducing operands by skipping over instructions") +DELTA_PASS_IR("operand-bundles", reduceOperandBundesDeltaPass, "Reducing Operand Bundles") +DELTA_PASS_IR("attributes", reduceAttributesDeltaPass, "Reducing Attributes") +DELTA_PASS_IR("module-data", reduceModuleDataDeltaPass, "Reducing Module Data") +DELTA_PASS_IR("opcodes", reduceOpcodesDeltaPass, "Reducing Opcodes") +DELTA_PASS_IR("volatile", reduceVolatileInstructionsDeltaPass, "Reducing Volatile Instructions") +DELTA_PASS_IR("atomic-ordering", reduceAtomicOrderingDeltaPass, "Reducing Atomic Ordering") +DELTA_PASS_IR("syncscopes", 
reduceAtomicSyncScopesDeltaPass, "Reducing Atomic Sync Scopes") +DELTA_PASS_IR("instruction-flags", reduceInstructionFlagsDeltaPass, "Reducing Instruction Flags") + + +#ifndef DELTA_PASS_MIR +#define DELTA_PASS_MIR(NAME, FUNC, DESC) +#endif +DELTA_PASS_MIR("instructions", reduceInstructionsMIRDeltaPass, "Reducing Instructions") +DELTA_PASS_MIR("ir-instruction-references", + reduceIRInstructionReferencesDeltaPass, "Reducing IR references from instructions") +DELTA_PASS_MIR("ir-block-references", reduceIRBlockReferencesDeltaPass, "Reducing IR references from blocks") +DELTA_PASS_MIR("ir-function-references", reduceIRFunctionReferencesDeltaPass, "Reducing IR references from functions") +DELTA_PASS_MIR("instruction-flags", reduceInstructionFlagsMIRDeltaPass, "Reducing Instruction Flags") +DELTA_PASS_MIR("register-uses", reduceRegisterUsesMIRDeltaPass, "Reducing register uses") +DELTA_PASS_MIR("register-defs", reduceRegisterDefsMIRDeltaPass, "Reducing register defs") +DELTA_PASS_MIR("register-hints", reduceVirtualRegisterHintsDeltaPass, "Reducing virtual register hints from functions") +DELTA_PASS_MIR("register-masks", reduceRegisterMasksMIRDeltaPass, "Reducing register masks") diff --git a/llvm/tools/llvm-reduce/deltas/Delta.cpp b/llvm/tools/llvm-reduce/deltas/Delta.cpp index 6f84b6c09d145..5b9f0330f9c7e 100644 --- a/llvm/tools/llvm-reduce/deltas/Delta.cpp +++ b/llvm/tools/llvm-reduce/deltas/Delta.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "Delta.h" +#include "DeltaPass.h" #include "ReducerWorkItem.h" #include "TestRunner.h" #include "Utils.h" @@ -180,11 +181,10 @@ using SharedTaskQueue = std::deque>>; /// reduces the amount of chunks that are considered interesting by the /// given test. The number of chunks is determined by a preliminary run of the /// reduction pass where no change must be made to the module. -void llvm::runDeltaPass(TestRunner &Test, ReductionFunc ExtractChunksFromModule, - StringRef Message) { +void llvm::runDeltaPass(TestRunner &Test, const DeltaPass &Pass) { assert(!Test.getProgram().verify(&errs()) && "input module is broken before making changes"); - errs() << "*** " << Message << "...\n"; + errs() << "*** " << Pass.Desc << " (" << Pass.Name << ")...\n"; int Targets; { @@ -193,7 +193,7 @@ void llvm::runDeltaPass(TestRunner &Test, ReductionFunc ExtractChunksFromModule, // made. 
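    // A minimal, self-contained sketch of the counting idea used in the
    // block below (MiniOracle and countOnlyPass are hypothetical, not
    // the real llvm::Oracle): an oracle over the single chunk
    // {0, INT_MAX} answers shouldKeep() with "yes" for every query, so
    // the pass runs without mutating anything and count() ends up equal
    // to the number of reduction opportunities.
    #include <cassert>

    struct MiniOracle {
      int Index = 0; // how many times the pass consulted us
      bool shouldKeep() {
        ++Index;
        return true; // keep everything: a pure counting run
      }
      int count() const { return Index; }
    };

    static void countOnlyPass(MiniOracle &O) {
      for (int Feature = 0; Feature < 5; ++Feature)
        if (!O.shouldKeep()) {
          // a real pass would delete Feature here
        }
    }

    int main() {
      MiniOracle Counter;
      countOnlyPass(Counter);
      assert(Counter.count() == 5); // five potential targets discovered
      return 0;
    }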
std::vector AllChunks = {{0, INT_MAX}}; Oracle Counter(AllChunks); - ExtractChunksFromModule(Counter, Test.getProgram()); + Pass.Func(Counter, Test.getProgram()); Targets = Counter.count(); assert(!Test.getProgram().verify(&errs()) && @@ -215,7 +215,7 @@ void llvm::runDeltaPass(TestRunner &Test, ReductionFunc ExtractChunksFromModule, Oracle NoChunksCounter(NoChunks); std::unique_ptr Clone = Test.getProgram().clone(Test.getTargetMachine()); - ExtractChunksFromModule(NoChunksCounter, *Clone); + Pass.Func(NoChunksCounter, *Clone); assert(Targets == NoChunksCounter.count() && "number of chunks changes when reducing"); #endif @@ -281,9 +281,8 @@ void llvm::runDeltaPass(TestRunner &Test, ReductionFunc ExtractChunksFromModule, Chunk ChunkToCheck = *(I + J); TaskQueue.emplace_back(ChunkThreadPool.async( ProcessChunkFromSerializedBitcode, ChunkToCheck, std::ref(Test), - ExtractChunksFromModule, UninterestingChunks, - ChunksStillConsideredInteresting, OriginalBC, - std::ref(AnyReduced))); + Pass.Func, UninterestingChunks, ChunksStillConsideredInteresting, + OriginalBC, std::ref(AnyReduced))); } // Start processing results of the queued tasks. We wait for the first @@ -305,7 +304,7 @@ void llvm::runDeltaPass(TestRunner &Test, ReductionFunc ExtractChunksFromModule, Chunk ChunkToCheck = *(I + NumScheduledTasks); TaskQueue.emplace_back(ChunkThreadPool.async( ProcessChunkFromSerializedBitcode, ChunkToCheck, - std::ref(Test), ExtractChunksFromModule, UninterestingChunks, + std::ref(Test), Pass.Func, UninterestingChunks, ChunksStillConsideredInteresting, OriginalBC, std::ref(AnyReduced))); } @@ -330,10 +329,9 @@ void llvm::runDeltaPass(TestRunner &Test, ReductionFunc ExtractChunksFromModule, // Forward I to the last chunk processed in parallel. I += NumChunksProcessed - 1; } else { - Result = - CheckChunk(*I, Test.getProgram().clone(Test.getTargetMachine()), - Test, ExtractChunksFromModule, UninterestingChunks, - ChunksStillConsideredInteresting); + Result = CheckChunk( + *I, Test.getProgram().clone(Test.getTargetMachine()), Test, + Pass.Func, UninterestingChunks, ChunksStillConsideredInteresting); } if (!Result) diff --git a/llvm/tools/llvm-reduce/deltas/Delta.h b/llvm/tools/llvm-reduce/deltas/Delta.h index 96fcea89484c0..ec2311f067299 100644 --- a/llvm/tools/llvm-reduce/deltas/Delta.h +++ b/llvm/tools/llvm-reduce/deltas/Delta.h @@ -24,6 +24,7 @@ namespace llvm { class TestRunner; +struct DeltaPass; struct Chunk { int Begin; @@ -134,8 +135,7 @@ using ReductionFunc = function_ref; /// /// Other implementations of the Delta Debugging algorithm can also be found in /// the CReduce, Delta, and Lithium projects. -void runDeltaPass(TestRunner &Test, ReductionFunc ExtractChunksFromModule, - StringRef Message); +void runDeltaPass(TestRunner &Test, const DeltaPass &Pass); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceAliases.cpp b/llvm/tools/llvm-reduce/deltas/ReduceAliases.cpp index 2f2df549b6728..00d7ce9bd763d 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceAliases.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceAliases.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceAliases.h" -#include "Delta.h" #include "Utils.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" @@ -22,7 +21,7 @@ using namespace llvm; /// Removes all aliases aren't inside any of the /// desired Chunks. 
-static void extractAliasesFromModule(Oracle &O, ReducerWorkItem &Program) { +void llvm::reduceAliasesDeltaPass(Oracle &O, ReducerWorkItem &Program) { for (auto &GA : make_early_inc_range(Program.getModule().aliases())) { if (!O.shouldKeep()) { GA.replaceAllUsesWith(GA.getAliasee()); @@ -31,7 +30,7 @@ static void extractAliasesFromModule(Oracle &O, ReducerWorkItem &Program) { } } -static void extractIFuncsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceIFuncsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Mod = WorkItem.getModule(); std::vector IFuncs; @@ -43,11 +42,3 @@ static void extractIFuncsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { if (!IFuncs.empty()) lowerGlobalIFuncUsersAsGlobalCtor(Mod, IFuncs); } - -void llvm::reduceAliasesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractAliasesFromModule, "Reducing Aliases"); -} - -void llvm::reduceIFuncsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractIFuncsFromModule, "Reducing Ifuncs"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceAliases.h b/llvm/tools/llvm-reduce/deltas/ReduceAliases.h index 404677d221ca4..ce0b4443d080f 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceAliases.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceAliases.h @@ -17,8 +17,8 @@ #include "Delta.h" namespace llvm { -void reduceAliasesDeltaPass(TestRunner &Test); -void reduceIFuncsDeltaPass(TestRunner &Test); +void reduceAliasesDeltaPass(Oracle &O, ReducerWorkItem &Program); +void reduceIFuncsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp index b16f512ff6166..690cfc9ef4732 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceArguments.h" -#include "Delta.h" #include "Utils.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Constants.h" @@ -112,7 +111,7 @@ static bool allFuncUsersRewritable(const Function &F) { /// Removes out-of-chunk arguments from functions, and modifies their calls /// accordingly. It also removes allocations of out-of-chunk arguments. 
-static void extractArgumentsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceArgumentsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); std::vector InitArgsToKeep; std::vector Funcs; @@ -177,7 +176,3 @@ static void extractArgumentsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { F->eraseFromParent(); } } - -void llvm::reduceArgumentsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractArgumentsFromModule, "Reducing Arguments"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.h b/llvm/tools/llvm-reduce/deltas/ReduceArguments.h index 5adcfe89266bc..cd305451a5713 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.h @@ -20,7 +20,7 @@ #include "llvm/Transforms/Utils/Cloning.h" namespace llvm { -void reduceArgumentsDeltaPass(TestRunner &Test); +void reduceArgumentsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp index 1f497089e18fc..63d7abe61bda7 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceAttributes.h" -#include "Delta.h" #include "TestRunner.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -166,11 +165,7 @@ class AttributeRemapper : public InstVisitor { } // namespace /// Removes out-of-chunk attributes from module. -static void extractAttributesFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceAttributesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { AttributeRemapper R(O, WorkItem.getModule()); R.visit(WorkItem.getModule()); } - -void llvm::reduceAttributesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractAttributesFromModule, "Reducing Attributes"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceAttributes.h b/llvm/tools/llvm-reduce/deltas/ReduceAttributes.h index a2e9955ac5bb4..663f6d8c23a3b 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceAttributes.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceAttributes.h @@ -14,9 +14,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEATTRIBUTES_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEATTRIBUTES_H +#include "Delta.h" + namespace llvm { -class TestRunner; -void reduceAttributesDeltaPass(TestRunner &Test); +void reduceAttributesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp index da363df77d0c0..5656fdda764a4 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp @@ -23,8 +23,8 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" - #include #define DEBUG_TYPE "llvm-reduce" @@ -141,7 +141,7 @@ removeUninterestingBBsFromSwitch(SwitchInst &SwInst, /// Removes out-of-chunk arguments from functions, and modifies their calls /// accordingly. It also removes allocations of out-of-chunk arguments. 
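Every deltas/*.cpp change from here on repeats the same mechanical rewrite, so it is worth spelling out once. Before, each file held a static helper plus a thin public wrapper that bound the progress message; after, the helper body itself becomes the public entry point, and the name and description move into the DeltaPasses.def row. A compilable sketch with a hypothetical pass (the "before" shape is shown in comments only):

    #include <cstdio>

    struct Oracle {
      bool shouldKeep() { return false; } // pretend everything is out-of-chunk
    };
    struct ReducerWorkItem {};

    // Before (sketch):
    //   static void extractFooFromModule(Oracle &O, ReducerWorkItem &W) {...}
    //   void llvm::reduceFooDeltaPass(TestRunner &Test) {
    //     runDeltaPass(Test, extractFooFromModule, "Reducing Foo");
    //   }

    // After (sketch): the helper is promoted to the public entry point.
    void reduceFooDeltaPass(Oracle &O, ReducerWorkItem &) {
      if (!O.shouldKeep())
        std::puts("dropping foo"); // the actual reduction would go here
    }

    int main() {
      Oracle O;
      ReducerWorkItem W;
      reduceFooDeltaPass(O, W);
      return 0;
    }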
-static void extractBasicBlocksFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceBasicBlocksDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { DenseSet BBsToDelete; df_iterator_default_set Reachable; @@ -188,12 +188,8 @@ static void extractBasicBlocksFromModule(Oracle &O, ReducerWorkItem &WorkItem) { } } -void llvm::reduceBasicBlocksDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractBasicBlocksFromModule, "Reducing Basic Blocks"); -} - -static void removeUnreachableBasicBlocksFromModule(Oracle &O, - ReducerWorkItem &WorkItem) { +void llvm::reduceUnreachableBasicBlocksDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { std::vector DeadBlocks; df_iterator_default_set Reachable; @@ -219,8 +215,3 @@ static void removeUnreachableBasicBlocksFromModule(Oracle &O, Reachable.clear(); } } - -void llvm::reduceUnreachableBasicBlocksDeltaPass(TestRunner &Test) { - runDeltaPass(Test, removeUnreachableBasicBlocksFromModule, - "Removing Unreachable Basic Blocks"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.h b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.h index a090d675ef822..b7a3b2867ae35 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.h @@ -14,12 +14,11 @@ #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEBASICBLOCKS_H #include "Delta.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" namespace llvm { -void reduceBasicBlocksDeltaPass(TestRunner &Test); -void reduceUnreachableBasicBlocksDeltaPass(TestRunner &Test); +void reduceBasicBlocksDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +void reduceUnreachableBasicBlocksDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp b/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp index 9dcf722fd1d90..8d1d73785f567 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceDIMetadata.h" -#include "Delta.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -76,7 +75,7 @@ void identifyUninterestingMDNodes(Oracle &O, MDNodeList &MDs) { } } -static void extractDIMetadataFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceDIMetadataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); MDNodeList MDs; @@ -94,7 +93,3 @@ static void extractDIMetadataFromModule(Oracle &O, ReducerWorkItem &WorkItem) { } identifyUninterestingMDNodes(O, MDs); } - -void llvm::reduceDIMetadataDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractDIMetadataFromModule, "Reducing DIMetadata"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.h b/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.h index 379c14a0db200..d9976fc3a2902 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.h @@ -14,10 +14,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDIMETADATA_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDIMETADATA_H -#include "TestRunner.h" +#include "Delta.h" namespace llvm { -void reduceDIMetadataDeltaPass(TestRunner &Test); +void reduceDIMetadataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp 
b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp index 25de659109c9f..4de942d459b69 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.cpp @@ -20,10 +20,12 @@ #include "ReduceDbgRecords.h" #include "Utils.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugProgramInstruction.h" using namespace llvm; -static void extractDbgRecordsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceDbgRecordDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &M = WorkItem.getModule(); for (auto &F : M) @@ -33,7 +35,3 @@ static void extractDbgRecordsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { if (!O.shouldKeep()) DR.eraseFromParent(); } - -void llvm::reduceDbgRecordDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractDbgRecordsFromModule, "Reducing DbgRecords"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h index 07a1e04fceaee..a122465e2a628 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceDbgRecords.h @@ -15,11 +15,9 @@ #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDBGRECORDS_H #include "Delta.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/DebugProgramInstruction.h" namespace llvm { -void reduceDbgRecordDeltaPass(TestRunner &Test); +void reduceDbgRecordDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.cpp b/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.cpp index 0f46409977a33..4b3c5f58fe549 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceDistinctMetadata.h" -#include "Delta.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -118,8 +117,8 @@ static void cleanUpTemporaries(NamedMDNode &NamedNode, MDTuple *TemporaryTuple, } } -static void extractDistinctMetadataFromModule(Oracle &O, - ReducerWorkItem &WorkItem) { +void llvm::reduceDistinctMetadataDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); MDTuple *TemporaryTuple = MDTuple::getDistinct(Program.getContext(), SmallVector{}); @@ -135,8 +134,3 @@ static void extractDistinctMetadataFromModule(Oracle &O, for (NamedMDNode &NamedNode : Program.named_metadata()) cleanUpTemporaries(NamedNode, TemporaryTuple, Program); } - -void llvm::reduceDistinctMetadataDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractDistinctMetadataFromModule, - "Reducing Distinct Metadata"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.h b/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.h index d02e8e6107b75..e7a817c173b07 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceDistinctMetadata.h @@ -14,10 +14,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDISTINCTMETADATA_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDISTINCTMETADATA_H -#include "TestRunner.h" +#include "Delta.h" namespace llvm { -void reduceDistinctMetadataDeltaPass(TestRunner &Test); +void reduceDistinctMetadataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp 
b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp index 21875ba00cf8b..af0ff996a1c13 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceFunctionBodies.h" -#include "Delta.h" #include "Utils.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" @@ -21,8 +20,7 @@ using namespace llvm; /// Removes all the bodies of defined functions that aren't inside any of the /// desired Chunks. -static void extractFunctionBodiesFromModule(Oracle &O, - ReducerWorkItem &WorkItem) { +void llvm::reduceFunctionBodiesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { // Delete out-of-chunk function bodies for (auto &F : WorkItem.getModule()) { if (!F.isDeclaration() && !hasAliasUse(F) && !O.shouldKeep()) { @@ -32,12 +30,7 @@ static void extractFunctionBodiesFromModule(Oracle &O, } } -void llvm::reduceFunctionBodiesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractFunctionBodiesFromModule, - "Reducing Function Bodies"); -} - -static void reduceFunctionData(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceFunctionDataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { for (Function &F : WorkItem.getModule()) { if (F.hasPersonalityFn()) { if (none_of(F, @@ -56,7 +49,3 @@ static void reduceFunctionData(Oracle &O, ReducerWorkItem &WorkItem) { F.setPrologueData(nullptr); } } - -void llvm::reduceFunctionDataDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceFunctionData, "Reducing Function Data"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h index ae738fb1b88e3..720fb6eb26654 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctionBodies.h @@ -17,8 +17,8 @@ #include "Delta.h" namespace llvm { -void reduceFunctionBodiesDeltaPass(TestRunner &Test); -void reduceFunctionDataDeltaPass(TestRunner &Test); +void reduceFunctionBodiesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +void reduceFunctionDataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp index 619811c89202e..44f1e52204f2f 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp @@ -13,9 +13,9 @@ //===----------------------------------------------------------------------===// #include "ReduceFunctions.h" -#include "Delta.h" #include "Utils.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include @@ -23,7 +23,7 @@ using namespace llvm; /// Removes all the Defined Functions /// that aren't inside any of the desired Chunks. -static void extractFunctionsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceFunctionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); // Record all out-of-chunk functions. 
@@ -54,7 +54,3 @@ static void extractFunctionsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { cast(F)->eraseFromParent(); } } - -void llvm::reduceFunctionsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractFunctionsFromModule, "Reducing Functions"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.h b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.h index d3ff0d9511289..6f4e61c8c1e86 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.h @@ -15,10 +15,9 @@ #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEFUNCTIONS_H #include "Delta.h" -#include "llvm/Transforms/Utils/Cloning.h" namespace llvm { -void reduceFunctionsDeltaPass(TestRunner &Test); +void reduceFunctionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp b/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp index 1d1463a055bd8..64bf711f23d52 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.cpp @@ -19,7 +19,7 @@ static bool shouldReduceAlign(GlobalObject &GO) { static bool shouldReduceComdat(GlobalObject &GO) { return GO.hasComdat(); } -static void reduceGOs(Oracle &O, ReducerWorkItem &Program) { +void llvm::reduceGlobalObjectsDeltaPass(Oracle &O, ReducerWorkItem &Program) { for (auto &GO : Program.getModule().global_objects()) { if (shouldReduceSection(GO) && !O.shouldKeep()) GO.setSection(""); @@ -29,7 +29,3 @@ static void reduceGOs(Oracle &O, ReducerWorkItem &Program) { GO.setComdat(nullptr); } } - -void llvm::reduceGlobalObjectsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceGOs, "Reducing GlobalObjects"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.h b/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.h index 35c38a9ecf212..bca061e3b02cb 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalObjects.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void reduceGlobalObjectsDeltaPass(TestRunner &Test); +void reduceGlobalObjectsDeltaPass(Oracle &O, ReducerWorkItem &Program); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp index 6e8c21008502f..577e0f5d16b63 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp @@ -41,7 +41,7 @@ static bool shouldReduceLinkage(GlobalValue &GV) { return !GV.hasExternalLinkage() && !GV.hasAppendingLinkage(); } -static void reduceGVs(Oracle &O, ReducerWorkItem &Program) { +void llvm::reduceGlobalValuesDeltaPass(Oracle &O, ReducerWorkItem &Program) { for (auto &GV : Program.getModule().global_values()) { if (shouldReduceDSOLocal(GV) && !O.shouldKeep()) GV.setDSOLocal(false); @@ -66,7 +66,3 @@ static void reduceGVs(Oracle &O, ReducerWorkItem &Program) { } } } - -void llvm::reduceGlobalValuesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceGVs, "Reducing GlobalValues"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.h b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.h index 19c0707936528..f7dbc90543156 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.h @@ -17,7 +17,7 @@ #include "Delta.h" namespace llvm { -void reduceGlobalValuesDeltaPass(TestRunner &Test); +void reduceGlobalValuesDeltaPass(Oracle &O, 
ReducerWorkItem &Program); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.cpp b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.cpp index 4c7125217f252..e285e6f7ba67f 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.cpp @@ -14,11 +14,14 @@ #include "ReduceGlobalVarInitializers.h" #include "llvm/IR/Constants.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Value.h" +#include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; /// Removes all the Initialized GVs that aren't inside the desired Chunks. -static void extractGVsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceGlobalsInitializersDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { // Drop initializers of out-of-chunk GVs for (auto &GV : WorkItem.getModule().globals()) if (GV.hasInitializer() && !O.shouldKeep()) { @@ -27,7 +30,3 @@ static void extractGVsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { GV.setComdat(nullptr); } } - -void llvm::reduceGlobalsInitializersDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractGVsFromModule, "Reducing GV Initializers"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.h b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.h index 318b29b6ca5e4..b3cb075346897 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVarInitializers.h @@ -15,11 +15,9 @@ #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEGLOBALVARINITIALIZERS_H #include "Delta.h" -#include "llvm/IR/Value.h" -#include "llvm/Transforms/Utils/Cloning.h" namespace llvm { -void reduceGlobalsInitializersDeltaPass(TestRunner &Test); +void reduceGlobalsInitializersDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.cpp b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.cpp index b448081ee1a27..ff5f643935b46 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.cpp @@ -14,6 +14,8 @@ #include "ReduceGlobalVars.h" #include "Utils.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/Value.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -23,7 +25,7 @@ static bool shouldAlwaysKeep(const GlobalVariable &GV) { } /// Removes all the GVs that aren't inside the desired Chunks. 
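The header changes in this stretch follow one include-hygiene rule: a header that only mentions Oracle and ReducerWorkItem by reference needs nothing beyond "Delta.h", so heavyweight includes such as llvm/Transforms/Utils/Cloning.h and llvm/IR/Value.h migrate into the .cpp files that actually use them. A minimal demonstration, with hypothetical types, that a reference-only interface compiles against a forward declaration, the full definition being needed only on the implementation side:

    struct Oracle;                      // forward declaration suffices here
    void reduceFooDeltaPass(Oracle &O); // header-style declaration

    struct Oracle {                     // the definition a .cpp would see
      bool shouldKeep() { return true; }
    };

    void reduceFooDeltaPass(Oracle &O) { (void)O.shouldKeep(); }

    int main() {
      Oracle O;
      reduceFooDeltaPass(O);
      return 0;
    }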
-static void extractGVsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceGlobalsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); // Get GVs inside desired chunks @@ -53,7 +55,3 @@ static void extractGVsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { cast(GV)->eraseFromParent(); } } - -void llvm::reduceGlobalsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractGVsFromModule, "Reducing GlobalVariables"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.h b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.h index 1198dceb45368..eb1a65f2043a9 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalVars.h @@ -15,11 +15,9 @@ #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEGLOBALVARS_H #include "Delta.h" -#include "llvm/IR/Value.h" -#include "llvm/Transforms/Utils/Cloning.h" namespace llvm { -void reduceGlobalsDeltaPass(TestRunner &Test); +void reduceGlobalsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.cpp b/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.cpp index 4bb1eb7db1d09..231883fc1f4f3 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "ReduceIRReferences.h" -#include "Delta.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -37,14 +36,16 @@ static void dropIRReferencesFromInstructions(Oracle &O, MachineFunction &MF) { } } -static void stripIRFromInstructions(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceIRInstructionReferencesDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) dropIRReferencesFromInstructions(O, *MF); } } -static void stripIRFromBlocks(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceIRBlockReferencesDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) { for (MachineBasicBlock &MBB : *MF) { @@ -55,7 +56,8 @@ static void stripIRFromBlocks(Oracle &O, ReducerWorkItem &WorkItem) { } } -static void stripIRFromFunctions(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceIRFunctionReferencesDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (!O.shouldKeep()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) { @@ -67,17 +69,3 @@ static void stripIRFromFunctions(Oracle &O, ReducerWorkItem &WorkItem) { } } } - -void llvm::reduceIRInstructionReferencesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, stripIRFromInstructions, - "Reducing IR references from instructions"); -} - -void llvm::reduceIRBlockReferencesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, stripIRFromBlocks, "Reducing IR references from blocks"); -} - -void llvm::reduceIRFunctionReferencesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, stripIRFromFunctions, - "Reducing IR references from functions"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.h b/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.h index 548559a0775b4..4394602911df1 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.h +++ 
b/llvm/tools/llvm-reduce/deltas/ReduceIRReferences.h @@ -14,17 +14,19 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEIRREFERENCES_MIR_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEIRREFERENCES_MIR_H +#include "Delta.h" + namespace llvm { -class TestRunner; /// Remove IR references from instructions (i.e. from memory operands) -void reduceIRInstructionReferencesDeltaPass(TestRunner &Test); +void reduceIRInstructionReferencesDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem); /// Remove IR BasicBlock references (the block names) -void reduceIRBlockReferencesDeltaPass(TestRunner &Test); +void reduceIRBlockReferencesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); /// Remove IR references from function level fields (e.g. frame object names) -void reduceIRFunctionReferencesDeltaPass(TestRunner &Test); +void reduceIRFunctionReferencesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp index e157747004782..2937550bfec75 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "ReduceInstructionFlags.h" -#include "Delta.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -19,7 +18,8 @@ using namespace llvm; -static void reduceFlagsInModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceInstructionFlagsDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { // Keep this in sync with computeIRComplexityScoreImpl(). for (Function &F : WorkItem.getModule()) { for (Instruction &I : instructions(F)) { @@ -83,7 +83,3 @@ static void reduceFlagsInModule(Oracle &O, ReducerWorkItem &WorkItem) { } } } - -void llvm::reduceInstructionFlagsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceFlagsInModule, "Reducing Instruction Flags"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.h b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.h index 1764c0199da87..005cc8390ab96 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlags.h @@ -9,10 +9,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEINSTRUCTIONFLAGS_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEINSTRUCTIONFLAGS_H -#include "TestRunner.h" +#include "Delta.h" namespace llvm { -void reduceInstructionFlagsDeltaPass(TestRunner &Test); +void reduceInstructionFlagsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.cpp index f2895b31947ec..70dbd85a8da93 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.cpp @@ -14,9 +14,11 @@ #include "ReduceInstructionFlagsMIR.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" + using namespace llvm; -static void removeFlagsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceInstructionFlagsMIRDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) { for (MachineBasicBlock &MBB : *MF) { @@ -29,7 +31,3 @@ static void removeFlagsFromModule(Oracle &O, 
ReducerWorkItem &WorkItem) { } } } - -void llvm::reduceInstructionFlagsMIRDeltaPass(TestRunner &Test) { - runDeltaPass(Test, removeFlagsFromModule, "Reducing Instruction Flags"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.h b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.h index a5a34d275e2f0..77d8eea12a2d6 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionFlagsMIR.h @@ -17,7 +17,7 @@ #include "Delta.h" namespace llvm { -void reduceInstructionFlagsMIRDeltaPass(TestRunner &Test); +void reduceInstructionFlagsMIRDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp index e1b7924594b5e..a906584f75dd7 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.cpp @@ -14,6 +14,8 @@ #include "ReduceInstructions.h" #include "Utils.h" #include "llvm/IR/Constants.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include using namespace llvm; @@ -29,7 +31,7 @@ static bool shouldAlwaysKeep(const Instruction &I) { /// Removes out-of-chunk arguments from functions, and modifies their calls /// accordingly. It also removes allocations of out-of-chunk arguments. -static void extractInstrFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceInstructionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); for (auto &F : Program) { @@ -46,7 +48,3 @@ static void extractInstrFromModule(Oracle &O, ReducerWorkItem &WorkItem) { } } } - -void llvm::reduceInstructionsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractInstrFromModule, "Reducing Instructions"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.h b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.h index 8c13a02cb98f3..ca3b7d521ce77 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructions.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructions.h @@ -15,11 +15,9 @@ #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEINSTRUCTIONS_H #include "Delta.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" namespace llvm { -void reduceInstructionsDeltaPass(TestRunner &Test); +void reduceInstructionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp index 40bc6b180fb88..24975e9f7aaa7 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp @@ -12,8 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceInstructionsMIR.h" -#include "Delta.h" - #include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" @@ -151,13 +149,10 @@ static void extractInstrFromFunction(Oracle &O, MachineFunction &MF) { MI->eraseFromParent(); } -static void extractInstrFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceInstructionsMIRDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (MachineFunction *MF = WorkItem.MMI->getMachineFunction(F)) extractInstrFromFunction(O, *MF); } } - -void 
llvm::reduceInstructionsMIRDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractInstrFromModule, "Reducing Instructions"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.h b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.h index 70e0ac5fcf37f..e07f5636b6c8b 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.h @@ -14,10 +14,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEINSTRUCTIONS_MIR_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEINSTRUCTIONS_MIR_H -namespace llvm { -class TestRunner; +#include "Delta.h" -void reduceInstructionsMIRDeltaPass(TestRunner &Test); +namespace llvm { +void reduceInstructionsMIRDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInvokes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInvokes.cpp index c6425a753df54..ca0fb7156673a 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInvokes.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceInvokes.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "ReduceInvokes.h" -#include "Delta.h" #include "llvm/IR/Instructions.h" #include "llvm/Transforms/Utils/Local.h" @@ -29,13 +28,9 @@ static void reduceInvokesInFunction(Oracle &O, Function &F) { // reduction. } -static void reduceInvokesInModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceInvokesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { for (Function &F : WorkItem.getModule()) { if (F.hasPersonalityFn()) reduceInvokesInFunction(O, F); } } - -void llvm::reduceInvokesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceInvokesInModule, "Reducing Invokes"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInvokes.h b/llvm/tools/llvm-reduce/deltas/ReduceInvokes.h index 9607add166005..4d246c22e6220 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceInvokes.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceInvokes.h @@ -9,10 +9,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEINVOKES_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEINVOKES_H -#include "TestRunner.h" +#include "Delta.h" namespace llvm { -void reduceInvokesDeltaPass(TestRunner &Test); +void reduceInvokesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.cpp b/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.cpp index 8e73ea076034c..4584694550936 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "ReduceMemoryOperations.h" -#include "Delta.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -36,15 +35,12 @@ static void removeVolatileInFunction(Oracle &O, Function &F) { } } -static void removeVolatileInModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceVolatileInstructionsDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (Function &F : WorkItem.getModule()) removeVolatileInFunction(O, F); } -void llvm::reduceVolatileInstructionsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, removeVolatileInModule, "Reducing Volatile Instructions"); -} - static void reduceAtomicSyncScopesInFunction(Oracle &O, Function &F) { for (Instruction &I : instructions(F)) { if (LoadInst *LI = dyn_cast(&I)) { @@ -66,17 +62,12 @@ static void 
reduceAtomicSyncScopesInFunction(Oracle &O, Function &F) { } } -static void reduceAtomicSyncScopesInModule(Oracle &O, +void llvm::reduceAtomicSyncScopesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { for (Function &F : WorkItem.getModule()) reduceAtomicSyncScopesInFunction(O, F); } -void llvm::reduceAtomicSyncScopesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceAtomicSyncScopesInModule, - "Reducing Atomic Sync Scopes"); -} - // TODO: Might be helpful to incrementally relax orders static void reduceAtomicOrderingInFunction(Oracle &O, Function &F) { for (Instruction &I : instructions(F)) { @@ -100,11 +91,7 @@ static void reduceAtomicOrderingInFunction(Oracle &O, Function &F) { } } -static void reduceAtomicOrderingInModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceAtomicOrderingDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { for (Function &F : WorkItem.getModule()) reduceAtomicOrderingInFunction(O, F); } - -void llvm::reduceAtomicOrderingDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceAtomicOrderingInModule, "Reducing Atomic Ordering"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.h b/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.h index ca6a770dff081..46ada3661e31d 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceMemoryOperations.h @@ -9,12 +9,12 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEMEMORYOPERATIONS_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEMEMORYOPERATIONS_H -#include "TestRunner.h" +#include "Delta.h" namespace llvm { -void reduceVolatileInstructionsDeltaPass(TestRunner &Test); -void reduceAtomicSyncScopesDeltaPass(TestRunner &Test); -void reduceAtomicOrderingDeltaPass(TestRunner &Test); +void reduceVolatileInstructionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +void reduceAtomicSyncScopesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +void reduceAtomicOrderingDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp b/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp index 316c74876025a..c0d0163dab5fb 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceMetadata.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceMetadata.h" -#include "Delta.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/InstIterator.h" @@ -48,7 +47,7 @@ static constexpr StringLiteral ListNamedMetadata[] = { }; /// Remove unneeded arguments to named metadata. -static void reduceNamedMetadataOperands(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceNamedMetadataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &M = WorkItem.getModule(); for (NamedMDNode &I : M.named_metadata()) { @@ -77,7 +76,7 @@ static void reduceNamedMetadataOperands(Oracle &O, ReducerWorkItem &WorkItem) { /// Removes all the Named and Unnamed Metadata Nodes, as well as any debug /// functions that aren't inside the desired Chunks. 
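Files such as ReduceMemoryOperations.cpp host several sibling passes (volatile, sync scopes, atomic ordering) and keep a module-walk/per-function split: the public pass only iterates, while a static per-function helper holds the reduction logic. A compilable sketch of that split, with hypothetical toy types standing in for the LLVM ones:

    #include <vector>

    struct Oracle {
      bool shouldKeep() { return false; } // everything is out-of-chunk
    };
    struct Function {
      bool IsVolatile = true;
    };
    struct Module {
      std::vector<Function> Fns = std::vector<Function>(3);
    };

    static void removeVolatileInFunction(Oracle &O, Function &F) {
      if (F.IsVolatile && !O.shouldKeep())
        F.IsVolatile = false; // drop the property when out-of-chunk
    }

    void reduceVolatileDeltaPass(Oracle &O, Module &M) {
      for (Function &F : M.Fns) // public entry point: iterate only
        removeVolatileInFunction(O, F);
    }

    int main() {
      Oracle O;
      Module M;
      reduceVolatileDeltaPass(O, M);
      return 0;
    }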
-static void extractMetadataFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceMetadataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); // Get out-of-chunk Named metadata nodes @@ -122,11 +121,3 @@ static void extractMetadataFromModule(Oracle &O, ReducerWorkItem &WorkItem) { } } } - -void llvm::reduceMetadataDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractMetadataFromModule, "Reducing Metadata"); -} - -void llvm::reduceNamedMetadataDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceNamedMetadataOperands, "Reducing Named Metadata"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceMetadata.h b/llvm/tools/llvm-reduce/deltas/ReduceMetadata.h index f3af31a2759bc..34bf2feb92f74 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceMetadata.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceMetadata.h @@ -14,11 +14,11 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEMETADATA_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEMETADATA_H -#include "TestRunner.h" +#include "Delta.h" namespace llvm { -void reduceMetadataDeltaPass(TestRunner &Test); -void reduceNamedMetadataDeltaPass(TestRunner &Test); +void reduceMetadataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +void reduceNamedMetadataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceModuleData.cpp b/llvm/tools/llvm-reduce/deltas/ReduceModuleData.cpp index 17930abe6dbfe..4aeaef6d8d676 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceModuleData.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceModuleData.cpp @@ -14,7 +14,7 @@ using namespace llvm; -static void clearModuleData(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceModuleDataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); if (!Program.getModuleIdentifier().empty() && !O.shouldKeep()) @@ -25,7 +25,3 @@ static void clearModuleData(Oracle &O, ReducerWorkItem &WorkItem) { if (!Program.getModuleInlineAsm().empty() && !O.shouldKeep()) Program.setModuleInlineAsm(""); } - -void llvm::reduceModuleDataDeltaPass(TestRunner &Test) { - runDeltaPass(Test, clearModuleData, "Reducing Module Data"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceModuleData.h b/llvm/tools/llvm-reduce/deltas/ReduceModuleData.h index 960fe8c6d3a6d..a5eaab9f9c59c 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceModuleData.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceModuleData.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void reduceModuleDataDeltaPass(TestRunner &Test); +void reduceModuleDataDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp index 9fb4fd61c74e7..ceea71f68c932 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceOpcodes.h" -#include "Delta.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -240,7 +239,7 @@ static Value *reduceInstruction(Oracle &O, Module &M, Instruction &I) { return nullptr; } -static void replaceOpcodesInModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceOpcodesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Mod = WorkItem.getModule(); for (Function &F : Mod) { @@ -261,7 +260,3 @@ static void 
replaceOpcodesInModule(Oracle &O, ReducerWorkItem &WorkItem) { } } } - -void llvm::reduceOpcodesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, replaceOpcodesInModule, "Reducing Opcodes"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h index 79edc7f32facf..5861c2571a1bd 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.h @@ -9,10 +9,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPCODES_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPCODES_H -#include "TestRunner.h" +#include "Delta.h" namespace llvm { -void reduceOpcodesDeltaPass(TestRunner &Test); +void reduceOpcodesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp index d2274877f126b..e5d7b187c8107 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceOperandBundles.h" -#include "Delta.h" #include "TestRunner.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -95,8 +94,7 @@ static void maybeRewriteCallWithDifferentBundles( } /// Removes out-of-chunk operand bundles from calls. -static void extractOperandBundesFromModule(Oracle &O, - ReducerWorkItem &WorkItem) { +void llvm::reduceOperandBundesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); OperandBundleRemapper R(O); R.visit(Program); @@ -104,8 +102,3 @@ static void extractOperandBundesFromModule(Oracle &O, for (const auto &I : R.CallsToRefine) maybeRewriteCallWithDifferentBundles(I.first, I.second); } - -void llvm::reduceOperandBundesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractOperandBundesFromModule, - "Reducing Operand Bundles"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.h b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.h index 390b029242536..23af510f7f31c 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.h @@ -14,9 +14,10 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPERANDBUNDLES_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPERANDBUNDLES_H +#include "Delta.h" + namespace llvm { -class TestRunner; -void reduceOperandBundesDeltaPass(TestRunner &Test); +void reduceOperandBundesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp index 5babdc7d0a940..c135f0c9e5c36 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp @@ -83,7 +83,7 @@ static bool switchCaseExists(Use &Op, ConstantInt *CI) { return SI->findCaseValue(CI) != SI->case_default(); } -void llvm::reduceOperandsOneDeltaPass(TestRunner &Test) { +void llvm::reduceOperandsOneDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { auto ReduceValue = [](Use &Op) -> Value * { if (!shouldReduceOperand(Op)) return nullptr; @@ -118,15 +118,10 @@ void llvm::reduceOperandsOneDeltaPass(TestRunner &Test) { return nullptr; }; - runDeltaPass( - Test, - [ReduceValue](Oracle &O, ReducerWorkItem &WorkItem) { - extractOperandsFromModule(O, WorkItem, ReduceValue); - }, - "Reducing Operands to one"); + 
extractOperandsFromModule(O, WorkItem, ReduceValue); } -void llvm::reduceOperandsZeroDeltaPass(TestRunner &Test) { +void llvm::reduceOperandsZeroDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { auto ReduceValue = [](Use &Op) -> Value * { if (!shouldReduceOperand(Op)) return nullptr; @@ -148,15 +143,10 @@ void llvm::reduceOperandsZeroDeltaPass(TestRunner &Test) { // Don't replace existing zeroes. return isZero(Op) ? nullptr : Constant::getNullValue(Op->getType()); }; - runDeltaPass( - Test, - [ReduceValue](Oracle &O, ReducerWorkItem &Program) { - extractOperandsFromModule(O, Program, ReduceValue); - }, - "Reducing Operands to zero"); + extractOperandsFromModule(O, WorkItem, ReduceValue); } -void llvm::reduceOperandsNaNDeltaPass(TestRunner &Test) { +void llvm::reduceOperandsNaNDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { auto ReduceValue = [](Use &Op) -> Value * { Type *Ty = Op->getType(); if (!Ty->isFPOrFPVectorTy()) @@ -176,10 +166,5 @@ void llvm::reduceOperandsNaNDeltaPass(TestRunner &Test) { return ConstantFP::getQNaN(Ty); }; - runDeltaPass( - Test, - [ReduceValue](Oracle &O, ReducerWorkItem &Program) { - extractOperandsFromModule(O, Program, ReduceValue); - }, - "Reducing Operands to NaN"); + extractOperandsFromModule(O, WorkItem, ReduceValue); } diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperands.h b/llvm/tools/llvm-reduce/deltas/ReduceOperands.h index b4a18998f8e20..2c86ba920442b 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperands.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperands.h @@ -12,9 +12,9 @@ #include "Delta.h" namespace llvm { -void reduceOperandsOneDeltaPass(TestRunner &Test); -void reduceOperandsZeroDeltaPass(TestRunner &Test); -void reduceOperandsNaNDeltaPass(TestRunner &Test); +void reduceOperandsOneDeltaPass(Oracle &, ReducerWorkItem &); +void reduceOperandsZeroDeltaPass(Oracle &, ReducerWorkItem &); +void reduceOperandsNaNDeltaPass(Oracle &, ReducerWorkItem &); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.cpp index 2a9d40d8c3c59..2eff3da263d31 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.cpp @@ -194,7 +194,7 @@ opportunities(Function &F, } } -static void extractOperandsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceOperandsSkipDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); for (Function &F : Program.functions()) { @@ -229,8 +229,3 @@ static void extractOperandsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { } } } - -void llvm::reduceOperandsSkipDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractOperandsFromModule, - "Reducing operands by skipping over instructions"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.h b/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.h index 79897011639a2..71047110701fa 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsSkip.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void reduceOperandsSkipDeltaPass(TestRunner &Test); +void reduceOperandsSkipDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif /* LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPERANDSSKIP_H */ diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp index 39302bd09fb49..0d984622bc298 100644 --- 
a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "ReduceOperandsToArgs.h" -#include "Delta.h" #include "Utils.h" #include "llvm/ADT/Sequence.h" #include "llvm/IR/Constants.h" @@ -196,7 +195,7 @@ static void substituteOperandWithArgument(Function *OldF, OldF->eraseFromParent(); } -static void reduceOperandsToArgs(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceOperandsToArgsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); SmallVector OperandsToReduce; @@ -218,8 +217,3 @@ static void reduceOperandsToArgs(Oracle &O, ReducerWorkItem &WorkItem) { substituteOperandWithArgument(&F, OperandsToReduce); } } - -void llvm::reduceOperandsToArgsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceOperandsToArgs, - "Converting operands to function arguments"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.h b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.h index 23043dd60b6ff..5d6e47c56059b 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void reduceOperandsToArgsDeltaPass(TestRunner &Test); +void reduceOperandsToArgsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif /* LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEOPERANDSTOARGS_H */ diff --git a/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.cpp b/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.cpp index 97259649ab858..e9d2e9a7b545f 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.cpp @@ -110,13 +110,10 @@ static void removeDefsFromFunction(Oracle &O, MachineFunction &MF) { } } -static void removeDefsFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceRegisterDefsMIRDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) removeDefsFromFunction(O, *MF); } } - -void llvm::reduceRegisterDefsMIRDeltaPass(TestRunner &Test) { - runDeltaPass(Test, removeDefsFromModule, "Reducing register defs"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.h b/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.h index 031d24125bac8..88ea024722ebc 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceRegisterDefs.h @@ -17,7 +17,7 @@ #include "Delta.h" namespace llvm { -void reduceRegisterDefsMIRDeltaPass(TestRunner &Test); +void reduceRegisterDefsMIRDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.cpp index 953e0e51afd82..f0c4cb991f7c7 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.cpp @@ -60,13 +60,10 @@ static void reduceMasksInFunction(Oracle &O, MachineFunction &MF) { } } -static void reduceMasksInModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceRegisterMasksMIRDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) reduceMasksInFunction(O, *MF); } } - -void 
llvm::reduceRegisterMasksMIRDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceMasksInModule, "Reducing register masks"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.h b/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.h index b8bb109e5c996..aee82a7c89214 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceRegisterMasks.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void reduceRegisterMasksMIRDeltaPass(TestRunner &Test); +void reduceRegisterMasksMIRDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.cpp b/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.cpp index a608935736d1a..6c07f13b52db3 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.cpp @@ -55,13 +55,10 @@ static void removeUsesFromFunction(Oracle &O, MachineFunction &MF) { } } -static void removeUsesFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceRegisterUsesMIRDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) removeUsesFromFunction(O, *MF); } } - -void llvm::reduceRegisterUsesMIRDeltaPass(TestRunner &Test) { - runDeltaPass(Test, removeUsesFromModule, "Reducing register uses"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.h b/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.h index 91ecba488f37c..40f6fcdb694af 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceRegisterUses.h @@ -17,7 +17,7 @@ #include "Delta.h" namespace llvm { -void reduceRegisterUsesMIRDeltaPass(TestRunner &Test); +void reduceRegisterUsesMIRDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp b/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp index aadd038033d5c..9a452d86c58a7 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #include "ReduceSpecialGlobals.h" -#include "Delta.h" #include "Utils.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constants.h" @@ -27,8 +26,7 @@ static StringRef SpecialGlobalNames[] = {"llvm.used", "llvm.compiler.used"}; /// Removes all special globals aren't inside any of the /// desired Chunks. 
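// A minimal sketch of the oracle protocol the special-globals hunk below
// follows, assuming Module::getNamedGlobal() and Oracle::shouldKeep() as
// used elsewhere in this commit; treating "llvm.used" alone is a
// simplification of the SpecialGlobalNames loop.
static void dropSpecialGlobalExample(Oracle &O, ReducerWorkItem &WorkItem) {
  Module &M = WorkItem.getModule();
  if (GlobalVariable *GV = M.getNamedGlobal("llvm.used"))
    if (!O.shouldKeep()) // oracle declined to keep this candidate: drop it
      GV->eraseFromParent();
}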
-static void extractSpecialGlobalsFromModule(Oracle &O, - ReducerWorkItem &WorkItem) { +void llvm::reduceSpecialGlobalsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); for (StringRef Name : SpecialGlobalNames) { @@ -40,8 +38,3 @@ static void extractSpecialGlobalsFromModule(Oracle &O, } } } - -void llvm::reduceSpecialGlobalsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractSpecialGlobalsFromModule, - "Reducing Special Globals"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.h b/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.h index d17790529e06b..8332a2102df97 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceSpecialGlobals.h @@ -20,7 +20,7 @@ #include "Delta.h" namespace llvm { -void reduceSpecialGlobalsDeltaPass(TestRunner &Test); +void reduceSpecialGlobalsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp index c49fcb9855d41..ec37e248da8ed 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp @@ -19,7 +19,8 @@ using namespace llvm; -static void reduceUsingSimplifyCFG(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::reduceUsingSimplifyCFGDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); SmallVector ToSimplify; for (auto &F : Program) @@ -31,9 +32,6 @@ static void reduceUsingSimplifyCFG(Oracle &O, ReducerWorkItem &WorkItem) { simplifyCFG(BB, TTI); } -void llvm::reduceUsingSimplifyCFGDeltaPass(TestRunner &Test) { - runDeltaPass(Test, reduceUsingSimplifyCFG, "Reducing using SimplifyCFG"); -} static void reduceConditionals(Oracle &O, ReducerWorkItem &WorkItem, bool Direction) { Module &M = WorkItem.getModule(); @@ -59,20 +57,12 @@ static void reduceConditionals(Oracle &O, ReducerWorkItem &WorkItem, simplifyCFG(BB, TTI); } -void llvm::reduceConditionalsTrueDeltaPass(TestRunner &Test) { - runDeltaPass( - Test, - [](Oracle &O, ReducerWorkItem &WorkItem) { - reduceConditionals(O, WorkItem, true); - }, - "Reducing conditional branches to true"); +void llvm::reduceConditionalsTrueDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { + reduceConditionals(O, WorkItem, true); } -void llvm::reduceConditionalsFalseDeltaPass(TestRunner &Test) { - runDeltaPass( - Test, - [](Oracle &O, ReducerWorkItem &WorkItem) { - reduceConditionals(O, WorkItem, false); - }, - "Reducing conditional branches to false"); +void llvm::reduceConditionalsFalseDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { + reduceConditionals(O, WorkItem, false); } diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h index 01a14602909b3..48dce275574e9 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.h @@ -17,9 +17,9 @@ #include "Delta.h" namespace llvm { -void reduceUsingSimplifyCFGDeltaPass(TestRunner &Test); -void reduceConditionalsTrueDeltaPass(TestRunner &Test); -void reduceConditionalsFalseDeltaPass(TestRunner &Test); +void reduceUsingSimplifyCFGDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +void reduceConditionalsTrueDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +void reduceConditionalsFalseDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git 
a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp index 3ec9555c0f2f5..ed8121d99130a 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceVirtualRegisters.h" -#include "Delta.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -33,15 +32,10 @@ static void dropRegisterHintsFromFunction(Oracle &O, MachineFunction &MF) { } } -static void dropRegisterHintsFromFunctions(Oracle &O, - ReducerWorkItem &WorkItem) { +void llvm::reduceVirtualRegisterHintsDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { for (const Function &F : WorkItem.getModule()) { if (auto *MF = WorkItem.MMI->getMachineFunction(F)) dropRegisterHintsFromFunction(O, *MF); } } - -void llvm::reduceVirtualRegisterHintsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, dropRegisterHintsFromFunctions, - "Reducing virtual register hints from functions"); -} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h index 405ba31703e54..ff8ba4a004f34 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h @@ -14,11 +14,13 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEVIRTUALREGISTERS_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEVIRTUALREGISTERS_H +#include "Delta.h" + namespace llvm { class TestRunner; /// Remove register allocation hints from virtual registes. -void reduceVirtualRegisterHintsDeltaPass(TestRunner &Test); +void reduceVirtualRegisterHintsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm diff --git a/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp b/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp index f31c5d86dad1e..7d7355db15dd4 100644 --- a/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp +++ b/llvm/tools/llvm-reduce/deltas/RunIRPasses.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "RunIRPasses.h" -#include "Delta.h" #include "llvm/Passes/PassBuilder.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -24,7 +23,7 @@ static cl::opt "simplifycfg,infer-address-spaces)"), cl::cat(LLVMReduceOptions)); -static void runPasses(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::runIRPassesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); LoopAnalysisManager LAM; FunctionAnalysisManager FAM; @@ -47,7 +46,3 @@ static void runPasses(Oracle &O, ReducerWorkItem &WorkItem) { report_fatal_error(std::move(Err), false); MPM.run(Program, MAM); } - -void llvm::runIRPassesDeltaPass(TestRunner &Test) { - runDeltaPass(Test, runPasses, "Running passes"); -} diff --git a/llvm/tools/llvm-reduce/deltas/RunIRPasses.h b/llvm/tools/llvm-reduce/deltas/RunIRPasses.h index f1d4140d5b57f..bd7dd8adf1d89 100644 --- a/llvm/tools/llvm-reduce/deltas/RunIRPasses.h +++ b/llvm/tools/llvm-reduce/deltas/RunIRPasses.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void runIRPassesDeltaPass(TestRunner &Test); +void runIRPassesDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.cpp b/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.cpp index fc21593c5415c..7eb381d3c1905 100644 --- 
a/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.cpp +++ b/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.cpp @@ -19,7 +19,7 @@ using namespace llvm; /// Calls simplifyInstruction in each instruction in functions, and replaces /// their values. -static void extractInstrFromModule(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::simplifyInstructionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { std::vector InstsToDelete; Module &Program = WorkItem.getModule(); @@ -44,7 +44,3 @@ static void extractInstrFromModule(Oracle &O, ReducerWorkItem &WorkItem) { for (Instruction *I : InstToDelete) I->eraseFromParent(); } - -void llvm::simplifyInstructionsDeltaPass(TestRunner &Test) { - runDeltaPass(Test, extractInstrFromModule, "Simplifying Instructions"); -} diff --git a/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.h b/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.h index 215cffcd4d12e..4e4b913e3d191 100644 --- a/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.h +++ b/llvm/tools/llvm-reduce/deltas/SimplifyInstructions.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void simplifyInstructionsDeltaPass(TestRunner &Test); +void simplifyInstructionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif diff --git a/llvm/tools/llvm-reduce/deltas/StripDebugInfo.cpp b/llvm/tools/llvm-reduce/deltas/StripDebugInfo.cpp index c9e1261c366a7..c8077d20fdd29 100644 --- a/llvm/tools/llvm-reduce/deltas/StripDebugInfo.cpp +++ b/llvm/tools/llvm-reduce/deltas/StripDebugInfo.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "StripDebugInfo.h" -#include "Delta.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Metadata.h" @@ -15,7 +14,7 @@ using namespace llvm; /// Removes all aliases aren't inside any of the /// desired Chunks. 
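// The detection step of the hunk below, restated as a standalone helper for
// clarity; it assumes only Module::named_metadata() and llvm::any_of from
// llvm/ADT/STLExtras.h, both visible in the diff.
static bool hasDebugInfoMD(llvm::Module &M) {
  return llvm::any_of(M.named_metadata(), [](llvm::NamedMDNode &NMD) {
    return NMD.getName().starts_with("llvm.dbg.");
  });
}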
-static void stripDebugInfoImpl(Oracle &O, ReducerWorkItem &WorkItem) { +void llvm::stripDebugInfoDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { Module &Program = WorkItem.getModule(); bool HasDebugInfo = any_of(Program.named_metadata(), [](NamedMDNode &NMD) { return NMD.getName().starts_with("llvm.dbg."); @@ -23,7 +22,3 @@ static void stripDebugInfoImpl(Oracle &O, ReducerWorkItem &WorkItem) { if (HasDebugInfo && !O.shouldKeep()) StripDebugInfo(Program); } - -void llvm::stripDebugInfoDeltaPass(TestRunner &Test) { - runDeltaPass(Test, stripDebugInfoImpl, "Stripping Debug Info"); -} diff --git a/llvm/tools/llvm-reduce/deltas/StripDebugInfo.h b/llvm/tools/llvm-reduce/deltas/StripDebugInfo.h index 56be459546e94..b88bb98ce668c 100644 --- a/llvm/tools/llvm-reduce/deltas/StripDebugInfo.h +++ b/llvm/tools/llvm-reduce/deltas/StripDebugInfo.h @@ -12,7 +12,7 @@ #include "Delta.h" namespace llvm { -void stripDebugInfoDeltaPass(TestRunner &Test); +void stripDebugInfoDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); } // namespace llvm #endif From 825ecfed9e08f27ecb65b960bb754f6d300ff625 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 22:39:57 -0700 Subject: [PATCH 0032/1029] [ExecutionEngine] Avoid repeated hash lookups (NFC) (#133615) --- llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp | 3 +-- llvm/lib/ExecutionEngine/Orc/COFFVCRuntimeSupport.cpp | 3 +-- llvm/lib/ExecutionEngine/Orc/Core.cpp | 7 +++---- llvm/lib/ExecutionEngine/Orc/IRPartitionLayer.cpp | 3 +-- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 6 ++---- llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp | 6 ++---- llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp | 3 +-- 7 files changed, 11 insertions(+), 20 deletions(-) diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp index 72e5f701f89a7..abe9ae7a3da16 100644 --- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp @@ -70,8 +70,7 @@ Error EHFrameEdgeFixer::operator()(LinkGraph &G) { // Sort eh-frame blocks into address order to ensure we visit CIEs before // their child FDEs. 
std::vector EHFrameBlocks; - for (auto *B : EHFrame->blocks()) - EHFrameBlocks.push_back(B); + llvm::append_range(EHFrameBlocks, EHFrame->blocks()); llvm::sort(EHFrameBlocks, [](const Block *LHS, const Block *RHS) { return LHS->getAddress() < RHS->getAddress(); }); diff --git a/llvm/lib/ExecutionEngine/Orc/COFFVCRuntimeSupport.cpp b/llvm/lib/ExecutionEngine/Orc/COFFVCRuntimeSupport.cpp index 94f696fa20863..c785381175284 100644 --- a/llvm/lib/ExecutionEngine/Orc/COFFVCRuntimeSupport.cpp +++ b/llvm/lib/ExecutionEngine/Orc/COFFVCRuntimeSupport.cpp @@ -86,8 +86,7 @@ Error COFFVCRuntimeBootstrapper::loadVCRuntime( if (!G) return G.takeError(); - for (auto &Lib : (*G)->getImportedDynamicLibraries()) - ImportedLibraries.push_back(Lib); + llvm::append_range(ImportedLibraries, (*G)->getImportedDynamicLibraries()); JD.addGenerator(std::move(*G)); diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index d6673552e39fd..cbed057950aea 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -127,8 +127,7 @@ void UnsatisfiedSymbolDependencies::log(raw_ostream &OS) const { SymbolsNotFound::SymbolsNotFound(std::shared_ptr SSP, SymbolNameSet Symbols) : SSP(std::move(SSP)) { - for (auto &Sym : Symbols) - this->Symbols.push_back(Sym); + llvm::append_range(this->Symbols, Symbols); assert(!this->Symbols.empty() && "Can not fail to resolve an empty set"); } @@ -2387,8 +2386,8 @@ void ExecutionSession::OL_applyQueryPhase1( // Build the definition generator stack for this JITDylib. runSessionLocked([&] { IPLS->CurDefGeneratorStack.reserve(JD.DefGenerators.size()); - for (auto &DG : reverse(JD.DefGenerators)) - IPLS->CurDefGeneratorStack.push_back(DG); + llvm::append_range(IPLS->CurDefGeneratorStack, + reverse(JD.DefGenerators)); }); // Flag that we've done our initialization. 
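// A self-contained sketch of the idiom the Core.cpp hunk above adopts,
// assuming llvm/ADT/STLExtras.h; the container and element types here are
// hypothetical. llvm::append_range composes with range adaptors such as
// llvm::reverse, so the reversed copy needs no hand-written push_back loop.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <vector>

void pushDeepestLast(const std::vector<int> &Generators,
                     llvm::SmallVectorImpl<int> &Stack) {
  // Deepest generator ends up on top of the stack, matching the reversed
  // iteration order of the original loop.
  llvm::append_range(Stack, llvm::reverse(Generators));
}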
diff --git a/llvm/lib/ExecutionEngine/Orc/IRPartitionLayer.cpp b/llvm/lib/ExecutionEngine/Orc/IRPartitionLayer.cpp index 9ad171beac7fe..1a37469c35d2e 100644 --- a/llvm/lib/ExecutionEngine/Orc/IRPartitionLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IRPartitionLayer.cpp @@ -260,8 +260,7 @@ void IRPartitionLayer::emitPartition( { std::vector HashGVs; HashGVs.reserve(GVsToExtract->size()); - for (const auto *GV : *GVsToExtract) - HashGVs.push_back(GV); + llvm::append_range(HashGVs, *GVsToExtract); llvm::sort(HashGVs, [](const GlobalValue *LHS, const GlobalValue *RHS) { return LHS->getName() < RHS->getName(); }); diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 7d385f4cf2fbb..21ebe82c8a71a 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -57,8 +57,7 @@ Function *addHelperAndWrapper(Module &M, StringRef WrapperName, std::vector HelperArgTypes; for (auto *Arg : HelperPrefixArgs) HelperArgTypes.push_back(Arg->getType()); - for (auto *T : WrapperFnType->params()) - HelperArgTypes.push_back(T); + llvm::append_range(HelperArgTypes, WrapperFnType->params()); auto *HelperFnType = FunctionType::get(WrapperFnType->getReturnType(), HelperArgTypes, false); auto *HelperFn = Function::Create(HelperFnType, GlobalValue::ExternalLinkage, @@ -72,8 +71,7 @@ Function *addHelperAndWrapper(Module &M, StringRef WrapperName, IRBuilder<> IB(EntryBlock); std::vector HelperArgs; - for (auto *Arg : HelperPrefixArgs) - HelperArgs.push_back(Arg); + llvm::append_range(HelperArgs, HelperPrefixArgs); for (auto &Arg : WrapperFn->args()) HelperArgs.push_back(&Arg); auto *HelperResult = IB.CreateCall(HelperFn, HelperArgs); diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 498d438bc25d4..90194d7fcc119 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -1676,10 +1676,8 @@ Error MachOPlatform::MachOPlatformPlugin::prepareSymbolTableRegistration( // those names. 
{ SmallVector SymsToProcess; - for (auto *Sym : G.defined_symbols()) - SymsToProcess.push_back(Sym); - for (auto *Sym : G.absolute_symbols()) - SymsToProcess.push_back(Sym); + llvm::append_range(SymsToProcess, G.defined_symbols()); + llvm::append_range(SymsToProcess, G.absolute_symbols()); for (auto *Sym : SymsToProcess) { if (!Sym->hasName()) diff --git a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp index 53d5e049798aa..3d816785cb76d 100644 --- a/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp +++ b/llvm/lib/ExecutionEngine/Orc/SimpleRemoteEPC.cpp @@ -47,8 +47,7 @@ lookupSymbolsAsyncHelper(EPCGenericDylibManager &DylibMgr, return Complete(R.takeError()); Result.push_back({}); Result.back().reserve(R->size()); - for (auto Addr : *R) - Result.back().push_back(Addr); + llvm::append_range(Result.back(), *R); lookupSymbolsAsyncHelper( DylibMgr, Request.drop_front(), std::move(Result), From 8db434a4815737e4d665aeedd1495369e80ea208 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 29 Mar 2025 22:40:05 -0700 Subject: [PATCH 0033/1029] [bugpoint] Avoid repeated hash lookups (NFC) (#133616) --- llvm/tools/bugpoint/CrashDebugger.cpp | 16 +++------ llvm/tools/bugpoint/OptimizerDriver.cpp | 3 +- llvm/tools/bugpoint/ToolRunner.cpp | 36 +++++++------------ llvm/tools/dsymutil/Reproducer.cpp | 3 +- llvm/tools/llvm-cov/CodeCoverage.cpp | 3 +- .../tools/llvm-debuginfod/llvm-debuginfod.cpp | 3 +- .../llvm-libtool-darwin/DependencyInfo.h | 3 +- llvm/tools/llvm-lipo/llvm-lipo.cpp | 4 +-- llvm/tools/llvm-lto2/llvm-lto2.cpp | 3 +- llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp | 3 +- llvm/tools/llvm-rc/llvm-rc.cpp | 3 +- llvm/tools/llvm-xray/xray-stacks.cpp | 23 +++++------- llvm/tools/lto/lto.cpp | 6 ++-- llvm/tools/obj2yaml/elf2yaml.cpp | 6 ++-- llvm/tools/obj2yaml/macho2yaml.cpp | 7 ++-- 15 files changed, 41 insertions(+), 81 deletions(-) diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp index a7777f778f66e..e2f7e104c58e3 100644 --- a/llvm/tools/bugpoint/CrashDebugger.cpp +++ b/llvm/tools/bugpoint/CrashDebugger.cpp @@ -390,9 +390,7 @@ bool ReduceCrashingFunctionAttributes::TestFuncAttrs( // Pass along the set of attributes that caused the crash. Attrs.clear(); - for (Attribute A : NewAttrs.getFnAttrs()) { - Attrs.push_back(A); - } + llvm::append_range(Attrs, NewAttrs.getFnAttrs()); return true; } return false; @@ -800,8 +798,7 @@ bool ReduceCrashingInstructions::TestInsts( // Make sure to use instruction pointers that point into the now-current // module, and that they don't include any deleted blocks. Insts.clear(); - for (Instruction *Inst : Instructions) - Insts.push_back(Inst); + llvm::append_range(Insts, Instructions); return true; } // It didn't crash, try something else. @@ -870,8 +867,7 @@ bool ReduceCrashingMetadata::TestInsts(std::vector &Insts) { // Make sure to use instruction pointers that point into the now-current // module, and that they don't include any deleted blocks. Insts.clear(); - for (Instruction *I : Instructions) - Insts.push_back(I); + llvm::append_range(Insts, Instructions); return true; } // It didn't crash, try something else. 
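// Sketch of the conversion this commit applies throughout bugpoint, assuming
// llvm/ADT/STLExtras.h; the set and vector here are hypothetical stand-ins
// for the Instructions/Insts pair in the hunk above. append_range accepts
// any iterable range, including set-like containers, so the manual copy
// loop disappears.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include <vector>

void snapshotKeptInsts(const llvm::SmallPtrSet<int *, 8> &Kept,
                       std::vector<int *> &Out) {
  Out.clear();
  llvm::append_range(Out, Kept); // was: for (int *I : Kept) Out.push_back(I);
}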
@@ -1211,8 +1207,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) { assert(Fn && "Could not find function?"); std::vector Attrs; - for (Attribute A : Fn->getAttributes().getFnAttrs()) - Attrs.push_back(A); + llvm::append_range(Attrs, Fn->getAttributes().getFnAttrs()); OldSize += Attrs.size(); Expected Result = @@ -1319,8 +1314,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) { // contribute to the crash, bisect the operands of the remaining ones std::vector NamedMDOps; for (auto &NamedMD : BD.getProgram().named_metadata()) - for (auto *op : NamedMD.operands()) - NamedMDOps.push_back(op); + llvm::append_range(NamedMDOps, NamedMD.operands()); Expected Result = ReduceCrashingNamedMDOps(BD, TestFn).reduceList(NamedMDOps); if (Error E = Result.takeError()) diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp index 0b29a1f17d879..56a0fa4d5ec9e 100644 --- a/llvm/tools/bugpoint/OptimizerDriver.cpp +++ b/llvm/tools/bugpoint/OptimizerDriver.cpp @@ -203,8 +203,7 @@ bool BugDriver::runPasses(Module &Program, } else Args.push_back(tool); - for (unsigned i = 0, e = OptArgs.size(); i != e; ++i) - Args.push_back(OptArgs[i]); + llvm::append_range(Args, OptArgs); // Pin to legacy PM since bugpoint has lots of infra and hacks revolving // around the legacy PM. Args.push_back("-bugpoint-enable-legacy-pm"); diff --git a/llvm/tools/bugpoint/ToolRunner.cpp b/llvm/tools/bugpoint/ToolRunner.cpp index e45c89b746aeb..f2f5966ad9d04 100644 --- a/llvm/tools/bugpoint/ToolRunner.cpp +++ b/llvm/tools/bugpoint/ToolRunner.cpp @@ -181,13 +181,11 @@ Expected LLI::ExecuteProgram(const std::string &Bitcode, } // Add any extra LLI args. - for (unsigned i = 0, e = ToolArgs.size(); i != e; ++i) - LLIArgs.push_back(ToolArgs[i]); + llvm::append_range(LLIArgs, ToolArgs); LLIArgs.push_back(Bitcode); // Add optional parameters to the running program from Argv - for (unsigned i = 0, e = Args.size(); i != e; ++i) - LLIArgs.push_back(Args[i]); + llvm::append_range(LLIArgs, Args); outs() << ""; outs().flush(); @@ -268,13 +266,11 @@ Error CustomCompiler::compileProgram(const std::string &Bitcode, std::vector ProgramArgs; ProgramArgs.push_back(CompilerCommand); - for (const auto &Arg : CompilerArgs) - ProgramArgs.push_back(Arg); + llvm::append_range(ProgramArgs, CompilerArgs); ProgramArgs.push_back(Bitcode); // Add optional parameters to the running program from Argv - for (const auto &Arg : CompilerArgs) - ProgramArgs.push_back(Arg); + llvm::append_range(ProgramArgs, CompilerArgs); if (RunProgramWithTimeout(CompilerCommand, ProgramArgs, "", "", "", Timeout, MemoryLimit)) @@ -317,13 +313,11 @@ Expected CustomExecutor::ExecuteProgram( std::vector ProgramArgs; ProgramArgs.push_back(ExecutionCommand); - for (std::size_t i = 0; i < ExecutorArgs.size(); ++i) - ProgramArgs.push_back(ExecutorArgs[i]); + llvm::append_range(ProgramArgs, ExecutorArgs); ProgramArgs.push_back(Bitcode); // Add optional parameters to the running program from Argv - for (unsigned i = 0, e = Args.size(); i != e; ++i) - ProgramArgs.push_back(Args[i]); + llvm::append_range(ProgramArgs, Args); return RunProgramWithTimeout(ExecutionCommand, ProgramArgs, InputFile, OutputFile, OutputFile, Timeout, MemoryLimit); @@ -447,8 +441,7 @@ Expected LLC::OutputCode(const std::string &Bitcode, LLCArgs.push_back(LLCPath); // Add any extra LLC args. 
- for (unsigned i = 0, e = ToolArgs.size(); i != e; ++i) - LLCArgs.push_back(ToolArgs[i]); + llvm::append_range(LLCArgs, ToolArgs); LLCArgs.push_back("-o"); LLCArgs.push_back(OutputAsmFile); // Output to the Asm file @@ -563,8 +556,7 @@ Expected JIT::ExecuteProgram(const std::string &Bitcode, JITArgs.push_back("-force-interpreter=false"); // Add any extra LLI args. - for (unsigned i = 0, e = ToolArgs.size(); i != e; ++i) - JITArgs.push_back(ToolArgs[i]); + llvm::append_range(JITArgs, ToolArgs); for (unsigned i = 0, e = SharedLibs.size(); i != e; ++i) { JITArgs.push_back("-load"); @@ -572,8 +564,7 @@ Expected JIT::ExecuteProgram(const std::string &Bitcode, } JITArgs.push_back(Bitcode); // Add optional parameters to the running program from Argv - for (unsigned i = 0, e = Args.size(); i != e; ++i) - JITArgs.push_back(Args[i]); + llvm::append_range(JITArgs, Args); outs() << ""; outs().flush(); @@ -674,8 +665,7 @@ Expected CC::ExecuteProgram(const std::string &ProgramFile, // most likely -L and -l options that need to come before other libraries but // after the source. Other options won't be sensitive to placement on the // command line, so this should be safe. - for (unsigned i = 0, e = ArgsForCC.size(); i != e; ++i) - CCArgs.push_back(ArgsForCC[i]); + llvm::append_range(CCArgs, ArgsForCC); CCArgs.push_back("-lm"); // Hard-code the math library... CCArgs.push_back("-O2"); // Optimize the program a bit... @@ -725,8 +715,7 @@ Expected CC::ExecuteProgram(const std::string &ProgramFile, } // Add optional parameters to the running program from Argv - for (unsigned i = 0, e = Args.size(); i != e; ++i) - ProgramArgs.push_back(Args[i]); + llvm::append_range(ProgramArgs, Args); // Now that we have a binary, run it! outs() << ""; @@ -823,8 +812,7 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType, // most likely -L and -l options that need to come before other libraries but // after the source. Other options won't be sensitive to placement on the // command line, so this should be safe. - for (unsigned i = 0, e = ArgsForCC.size(); i != e; ++i) - CCArgs.push_back(ArgsForCC[i]); + llvm::append_range(CCArgs, ArgsForCC); outs() << ""; outs().flush(); diff --git a/llvm/tools/dsymutil/Reproducer.cpp b/llvm/tools/dsymutil/Reproducer.cpp index a6cc10424dc52..31e49cdd0518c 100644 --- a/llvm/tools/dsymutil/Reproducer.cpp +++ b/llvm/tools/dsymutil/Reproducer.cpp @@ -36,8 +36,7 @@ Reproducer::~Reproducer() = default; ReproducerGenerate::ReproducerGenerate(std::error_code &EC, int Argc, char **Argv, bool GenerateOnExit) : Root(createReproducerDir(EC)), GenerateOnExit(GenerateOnExit) { - for (int I = 0; I < Argc; ++I) - Args.push_back(Argv[I]); + llvm::append_range(Args, ArrayRef(Argv, Argc)); if (!Root.empty()) FC = std::make_shared(Root, Root); VFS = FileCollector::createCollectorVFS(vfs::getRealFileSystem(), FC); diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp index 921f283deedc7..a740cdd45b901 100644 --- a/llvm/tools/llvm-cov/CodeCoverage.cpp +++ b/llvm/tools/llvm-cov/CodeCoverage.cpp @@ -588,8 +588,7 @@ void CodeCoverageTool::demangleSymbols(const CoverageMapping &Coverage) { // Invoke the demangler. 
std::vector ArgsV; ArgsV.reserve(ViewOpts.DemanglerOpts.size()); - for (StringRef Arg : ViewOpts.DemanglerOpts) - ArgsV.push_back(Arg); + llvm::append_range(ArgsV, ViewOpts.DemanglerOpts); std::optional Redirects[] = { InputPath.str(), OutputPath.str(), {""}}; std::string ErrMsg; diff --git a/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp b/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp index 2859a36c80b0b..7b85166c1b4ae 100644 --- a/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp +++ b/llvm/tools/llvm-debuginfod/llvm-debuginfod.cpp @@ -126,8 +126,7 @@ int llvm_debuginfod_main(int argc, char **argv, const llvm::ToolContext &) { parseArgs(argc, argv); SmallVector Paths; - for (const std::string &Path : ScanPaths) - Paths.push_back(Path); + llvm::append_range(Paths, ScanPaths); DefaultThreadPool Pool(hardware_concurrency(MaxConcurrency)); DebuginfodLog Log; diff --git a/llvm/tools/llvm-libtool-darwin/DependencyInfo.h b/llvm/tools/llvm-libtool-darwin/DependencyInfo.h index 7b2f94bdbeb81..784ec3f50cd53 100644 --- a/llvm/tools/llvm-libtool-darwin/DependencyInfo.h +++ b/llvm/tools/llvm-libtool-darwin/DependencyInfo.h @@ -50,8 +50,7 @@ class DependencyInfo { // Sort the input by its names. std::vector InputNames; InputNames.reserve(Inputs.size()); - for (const auto &F : Inputs) - InputNames.push_back(F); + llvm::append_range(InputNames, Inputs); llvm::sort(InputNames); for (const auto &In : InputNames) diff --git a/llvm/tools/llvm-lipo/llvm-lipo.cpp b/llvm/tools/llvm-lipo/llvm-lipo.cpp index 3c0197e8b7bac..8c588021391b4 100644 --- a/llvm/tools/llvm-lipo/llvm-lipo.cpp +++ b/llvm/tools/llvm-lipo/llvm-lipo.cpp @@ -249,8 +249,8 @@ static Config parseLipoOptions(ArrayRef ArgsArr) { switch (ActionArgs[0]->getOption().getID()) { case LIPO_verify_arch: - for (auto A : InputArgs.getAllArgValues(LIPO_verify_arch)) - C.VerifyArchList.push_back(A); + llvm::append_range(C.VerifyArchList, + InputArgs.getAllArgValues(LIPO_verify_arch)); if (C.VerifyArchList.empty()) reportError( "verify_arch requires at least one architecture to be specified"); diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 4c9b47d78a1bb..76ff11b8d6412 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -326,8 +326,7 @@ static int run(int argc, char **argv) { Conf.OptLevel = OptLevel - '0'; Conf.Freestanding = EnableFreestanding; - for (auto &PluginFN : PassPlugins) - Conf.PassPlugins.push_back(PluginFN); + llvm::append_range(Conf.PassPlugins, PassPlugins); if (auto Level = CodeGenOpt::parseLevel(CGOptLevel)) { Conf.CGOptLevel = *Level; } else { diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp index 680218e3fc96c..4cb64bdbe8ef9 100644 --- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp +++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -1538,8 +1538,7 @@ int main(int Argc, const char **Argv) { // Initialize the filters for LinePrinter. 
auto propagate = [&](auto &Target, auto &Reference) { - for (std::string &Option : Reference) - Target.push_back(Option); + llvm::append_range(Target, Reference); }; propagate(opts::Filters.ExcludeTypes, opts::pretty::ExcludeTypes); diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp index a77188c462afe..6feadc5f259b3 100644 --- a/llvm/tools/llvm-rc/llvm-rc.cpp +++ b/llvm/tools/llvm-rc/llvm-rc.cpp @@ -266,8 +266,7 @@ void preprocess(StringRef Src, StringRef Dst, const RcOptions &Opts, } } } - for (const auto &S : Opts.PreprocessArgs) - Args.push_back(S); + llvm::append_range(Args, Opts.PreprocessArgs); Args.push_back(Src); Args.push_back("-o"); Args.push_back(Dst); diff --git a/llvm/tools/llvm-xray/xray-stacks.cpp b/llvm/tools/llvm-xray/xray-stacks.cpp index aebca9048d4d4..cbf6faeb32960 100644 --- a/llvm/tools/llvm-xray/xray-stacks.cpp +++ b/llvm/tools/llvm-xray/xray-stacks.cpp @@ -267,15 +267,11 @@ static StackDuration mergeStackDuration(const StackDuration &Left, Data.IntermediateDurations.reserve(Left.IntermediateDurations.size() + Right.IntermediateDurations.size()); // Aggregate the durations. - for (auto duration : Left.TerminalDurations) - Data.TerminalDurations.push_back(duration); - for (auto duration : Right.TerminalDurations) - Data.TerminalDurations.push_back(duration); - - for (auto duration : Left.IntermediateDurations) - Data.IntermediateDurations.push_back(duration); - for (auto duration : Right.IntermediateDurations) - Data.IntermediateDurations.push_back(duration); + llvm::append_range(Data.TerminalDurations, Left.TerminalDurations); + llvm::append_range(Data.TerminalDurations, Right.TerminalDurations); + + llvm::append_range(Data.IntermediateDurations, Left.IntermediateDurations); + llvm::append_range(Data.IntermediateDurations, Right.IntermediateDurations); return Data; } @@ -506,8 +502,7 @@ class StackTrie { for (const auto &RootNodeRange : make_range(map_iterator(Roots.begin(), MapValueFn), map_iterator(Roots.end(), MapValueFn))) { - for (auto *RootNode : RootNodeRange) - RootValues.push_back(RootNode); + llvm::append_range(RootValues, RootNodeRange); } print(OS, FN, RootValues); @@ -565,8 +560,7 @@ class StackTrie { while (!S.empty()) { auto *Top = S.pop_back_val(); printSingleStack(OS, FN, ReportThread, ThreadId, Top); - for (const auto *C : Top->Callees) - S.push_back(C); + llvm::append_range(S, Top->Callees); } } } @@ -641,8 +635,7 @@ class StackTrie { TopStacksByCount.pop_back(); } } - for (const auto *C : Top->Callees) - S.push_back(C); + llvm::append_range(S, Top->Callees); } } diff --git a/llvm/tools/lto/lto.cpp b/llvm/tools/lto/lto.cpp index b377693444189..29219c9114522 100644 --- a/llvm/tools/lto/lto.cpp +++ b/llvm/tools/lto/lto.cpp @@ -475,8 +475,7 @@ void lto_set_debug_options(const char *const *options, int number) { // Need to put each suboption in a null-terminated string before passing to // parseCommandLineOptions(). 
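// Sketch of the pointer+count bridge used in the lto.cpp hunk below,
// assuming llvm/ADT/ArrayRef.h and llvm/ADT/STLExtras.h; the parameter
// names are hypothetical. ArrayRef's deduction guide wraps a raw
// (pointer, size) pair, turning a C-style option array into a range that
// append_range can consume directly.
void collectOptions(const char *const *Options, int Number,
                    std::vector<const char *> &Out) {
  llvm::append_range(Out, llvm::ArrayRef(Options, Number));
}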
std::vector Options; - for (int i = 0; i < number; ++i) - Options.push_back(options[i]); + llvm::append_range(Options, ArrayRef(options, number)); llvm::parseCommandLineOptions(Options); optionParsingState = OptParsingState::Early; @@ -498,8 +497,7 @@ void lto_codegen_debug_options_array(lto_code_gen_t cg, assert(optionParsingState != OptParsingState::Early && "early option processing already happened"); SmallVector Options; - for (int i = 0; i < number; ++i) - Options.push_back(options[i]); + llvm::append_range(Options, ArrayRef(options, number)); unwrap(cg)->setCodeGenDebugOptions(ArrayRef(Options)); } diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp index b1c8032ea2192..c56ed15501b40 100644 --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -1266,8 +1266,7 @@ ELFDumper::dumpSymtabShndxSection(const Elf_Shdr *Shdr) { return EntriesOrErr.takeError(); S->Entries.emplace(); - for (const Elf_Word &E : *EntriesOrErr) - S->Entries->push_back(E); + llvm::append_range(*S->Entries, *EntriesOrErr); return S.release(); } @@ -1490,8 +1489,7 @@ ELFDumper::dumpSymverSection(const Elf_Shdr *Shdr) { return VersionsOrErr.takeError(); S->Entries.emplace(); - for (const Elf_Half &E : *VersionsOrErr) - S->Entries->push_back(E); + llvm::append_range(*S->Entries, *VersionsOrErr); return S.release(); } diff --git a/llvm/tools/obj2yaml/macho2yaml.cpp b/llvm/tools/obj2yaml/macho2yaml.cpp index cdd871e8c1d68..00220123e8189 100644 --- a/llvm/tools/obj2yaml/macho2yaml.cpp +++ b/llvm/tools/obj2yaml/macho2yaml.cpp @@ -364,8 +364,7 @@ void MachODumper::dumpFunctionStarts(std::unique_ptr &Y) { MachOYAML::LinkEditData &LEData = Y->LinkEdit; auto FunctionStarts = Obj.getFunctionStarts(); - for (auto Addr : FunctionStarts) - LEData.FunctionStarts.push_back(Addr); + llvm::append_range(LEData.FunctionStarts, FunctionStarts); } void MachODumper::dumpRebaseOpcodes(std::unique_ptr &Y) { @@ -637,9 +636,7 @@ void MachODumper::dumpChainedFixups(std::unique_ptr &Y) { assert(DC.dataoff < Obj.getData().size()); assert(DC.dataoff + DC.datasize <= Obj.getData().size()); const char *Bytes = Obj.getData().data() + DC.dataoff; - for (size_t Idx = 0; Idx < DC.datasize; Idx++) { - LEData.ChainedFixups.push_back(Bytes[Idx]); - } + llvm::append_range(LEData.ChainedFixups, ArrayRef(Bytes, DC.datasize)); } break; } From a03367fd0135a0fe1be4d6e4b8e0dd59ed332003 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 30 Mar 2025 12:39:47 +0700 Subject: [PATCH 0034/1029] llvm-reduce: Remove trailing whitespace --- llvm/tools/llvm-reduce/DeltaPasses.def | 98 +++++++++++++------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/llvm/tools/llvm-reduce/DeltaPasses.def b/llvm/tools/llvm-reduce/DeltaPasses.def index 1b5576b48dcd0..060daf198c76a 100644 --- a/llvm/tools/llvm-reduce/DeltaPasses.def +++ b/llvm/tools/llvm-reduce/DeltaPasses.def @@ -10,59 +10,59 @@ #ifndef DELTA_PASS_IR #define DELTA_PASS_IR(NAME, FUNC, DESC) #endif -DELTA_PASS_IR("strip-debug-info", stripDebugInfoDeltaPass, "Stripping Debug Info") -DELTA_PASS_IR("functions", reduceFunctionsDeltaPass, "Reducing Functions") -DELTA_PASS_IR("function-bodies", reduceFunctionBodiesDeltaPass, "Reducing Function Bodies") -DELTA_PASS_IR("special-globals", reduceSpecialGlobalsDeltaPass, "Reducing Special Globals") -DELTA_PASS_IR("aliases", reduceAliasesDeltaPass, "Reducing Aliases") -DELTA_PASS_IR("ifuncs", reduceIFuncsDeltaPass, "Reducing Ifuncs") -DELTA_PASS_IR("simplify-conditionals-true", 
reduceConditionalsTrueDeltaPass, "Reducing conditional branches to true") -DELTA_PASS_IR("simplify-conditionals-false", - reduceConditionalsFalseDeltaPass, "Reducing conditional branches to false") -DELTA_PASS_IR("invokes", reduceInvokesDeltaPass, "Reducing Invokes") -DELTA_PASS_IR("unreachable-basic-blocks", - reduceUnreachableBasicBlocksDeltaPass, "Removing Unreachable Basic Blocks") -DELTA_PASS_IR("basic-blocks", reduceBasicBlocksDeltaPass, "Reducing Basic Blocks") -DELTA_PASS_IR("simplify-cfg", reduceUsingSimplifyCFGDeltaPass, "Reducing using SimplifyCFG") -DELTA_PASS_IR("function-data", reduceFunctionDataDeltaPass, "Reducing Function Data") -DELTA_PASS_IR("global-values", reduceGlobalValuesDeltaPass, "Reducing GlobalValues") -DELTA_PASS_IR("global-objects", reduceGlobalObjectsDeltaPass, "Reducing GlobalObjects") -DELTA_PASS_IR("global-initializers", reduceGlobalsInitializersDeltaPass, "Reducing GV Initializers") -DELTA_PASS_IR("global-variables", reduceGlobalsDeltaPass, "Reducing GlobalVariables") -DELTA_PASS_IR("di-metadata", reduceDIMetadataDeltaPass, "Reducing DIMetadata") -DELTA_PASS_IR("dbg-records", reduceDbgRecordDeltaPass, "Reducing DbgRecords") -DELTA_PASS_IR("distinct-metadata", reduceDistinctMetadataDeltaPass, "Reducing Distinct Metadata") -DELTA_PASS_IR("metadata", reduceMetadataDeltaPass,"Reducing Metadata") -DELTA_PASS_IR("named-metadata", reduceNamedMetadataDeltaPass, "Reducing Named Metadata") -DELTA_PASS_IR("arguments", reduceArgumentsDeltaPass, "Reducing Arguments") -DELTA_PASS_IR("instructions", reduceInstructionsDeltaPass, "Reducing Instructions") -DELTA_PASS_IR("simplify-instructions", simplifyInstructionsDeltaPass, "Simplifying Instructions") -DELTA_PASS_IR("ir-passes", runIRPassesDeltaPass, "Running passes") -DELTA_PASS_IR("operands-zero", reduceOperandsZeroDeltaPass, "Reducing Operands to zero") -DELTA_PASS_IR("operands-one", reduceOperandsOneDeltaPass, "Reducing Operands to one") -DELTA_PASS_IR("operands-nan", reduceOperandsNaNDeltaPass, "Reducing Operands to NaN") -DELTA_PASS_IR("operands-to-args", reduceOperandsToArgsDeltaPass, "Converting operands to function arguments") -DELTA_PASS_IR("operands-skip", reduceOperandsSkipDeltaPass, "Reducing operands by skipping over instructions") -DELTA_PASS_IR("operand-bundles", reduceOperandBundesDeltaPass, "Reducing Operand Bundles") -DELTA_PASS_IR("attributes", reduceAttributesDeltaPass, "Reducing Attributes") -DELTA_PASS_IR("module-data", reduceModuleDataDeltaPass, "Reducing Module Data") -DELTA_PASS_IR("opcodes", reduceOpcodesDeltaPass, "Reducing Opcodes") -DELTA_PASS_IR("volatile", reduceVolatileInstructionsDeltaPass, "Reducing Volatile Instructions") -DELTA_PASS_IR("atomic-ordering", reduceAtomicOrderingDeltaPass, "Reducing Atomic Ordering") -DELTA_PASS_IR("syncscopes", reduceAtomicSyncScopesDeltaPass, "Reducing Atomic Sync Scopes") +DELTA_PASS_IR("strip-debug-info", stripDebugInfoDeltaPass, "Stripping Debug Info") +DELTA_PASS_IR("functions", reduceFunctionsDeltaPass, "Reducing Functions") +DELTA_PASS_IR("function-bodies", reduceFunctionBodiesDeltaPass, "Reducing Function Bodies") +DELTA_PASS_IR("special-globals", reduceSpecialGlobalsDeltaPass, "Reducing Special Globals") +DELTA_PASS_IR("aliases", reduceAliasesDeltaPass, "Reducing Aliases") +DELTA_PASS_IR("ifuncs", reduceIFuncsDeltaPass, "Reducing Ifuncs") +DELTA_PASS_IR("simplify-conditionals-true", reduceConditionalsTrueDeltaPass, "Reducing conditional branches to true") +DELTA_PASS_IR("simplify-conditionals-false", + reduceConditionalsFalseDeltaPass, "Reducing 
conditional branches to false") +DELTA_PASS_IR("invokes", reduceInvokesDeltaPass, "Reducing Invokes") +DELTA_PASS_IR("unreachable-basic-blocks", + reduceUnreachableBasicBlocksDeltaPass, "Removing Unreachable Basic Blocks") +DELTA_PASS_IR("basic-blocks", reduceBasicBlocksDeltaPass, "Reducing Basic Blocks") +DELTA_PASS_IR("simplify-cfg", reduceUsingSimplifyCFGDeltaPass, "Reducing using SimplifyCFG") +DELTA_PASS_IR("function-data", reduceFunctionDataDeltaPass, "Reducing Function Data") +DELTA_PASS_IR("global-values", reduceGlobalValuesDeltaPass, "Reducing GlobalValues") +DELTA_PASS_IR("global-objects", reduceGlobalObjectsDeltaPass, "Reducing GlobalObjects") +DELTA_PASS_IR("global-initializers", reduceGlobalsInitializersDeltaPass, "Reducing GV Initializers") +DELTA_PASS_IR("global-variables", reduceGlobalsDeltaPass, "Reducing GlobalVariables") +DELTA_PASS_IR("di-metadata", reduceDIMetadataDeltaPass, "Reducing DIMetadata") +DELTA_PASS_IR("dbg-records", reduceDbgRecordDeltaPass, "Reducing DbgRecords") +DELTA_PASS_IR("distinct-metadata", reduceDistinctMetadataDeltaPass, "Reducing Distinct Metadata") +DELTA_PASS_IR("metadata", reduceMetadataDeltaPass,"Reducing Metadata") +DELTA_PASS_IR("named-metadata", reduceNamedMetadataDeltaPass, "Reducing Named Metadata") +DELTA_PASS_IR("arguments", reduceArgumentsDeltaPass, "Reducing Arguments") +DELTA_PASS_IR("instructions", reduceInstructionsDeltaPass, "Reducing Instructions") +DELTA_PASS_IR("simplify-instructions", simplifyInstructionsDeltaPass, "Simplifying Instructions") +DELTA_PASS_IR("ir-passes", runIRPassesDeltaPass, "Running passes") +DELTA_PASS_IR("operands-zero", reduceOperandsZeroDeltaPass, "Reducing Operands to zero") +DELTA_PASS_IR("operands-one", reduceOperandsOneDeltaPass, "Reducing Operands to one") +DELTA_PASS_IR("operands-nan", reduceOperandsNaNDeltaPass, "Reducing Operands to NaN") +DELTA_PASS_IR("operands-to-args", reduceOperandsToArgsDeltaPass, "Converting operands to function arguments") +DELTA_PASS_IR("operands-skip", reduceOperandsSkipDeltaPass, "Reducing operands by skipping over instructions") +DELTA_PASS_IR("operand-bundles", reduceOperandBundesDeltaPass, "Reducing Operand Bundles") +DELTA_PASS_IR("attributes", reduceAttributesDeltaPass, "Reducing Attributes") +DELTA_PASS_IR("module-data", reduceModuleDataDeltaPass, "Reducing Module Data") +DELTA_PASS_IR("opcodes", reduceOpcodesDeltaPass, "Reducing Opcodes") +DELTA_PASS_IR("volatile", reduceVolatileInstructionsDeltaPass, "Reducing Volatile Instructions") +DELTA_PASS_IR("atomic-ordering", reduceAtomicOrderingDeltaPass, "Reducing Atomic Ordering") +DELTA_PASS_IR("syncscopes", reduceAtomicSyncScopesDeltaPass, "Reducing Atomic Sync Scopes") DELTA_PASS_IR("instruction-flags", reduceInstructionFlagsDeltaPass, "Reducing Instruction Flags") #ifndef DELTA_PASS_MIR #define DELTA_PASS_MIR(NAME, FUNC, DESC) #endif -DELTA_PASS_MIR("instructions", reduceInstructionsMIRDeltaPass, "Reducing Instructions") -DELTA_PASS_MIR("ir-instruction-references", - reduceIRInstructionReferencesDeltaPass, "Reducing IR references from instructions") -DELTA_PASS_MIR("ir-block-references", reduceIRBlockReferencesDeltaPass, "Reducing IR references from blocks") -DELTA_PASS_MIR("ir-function-references", reduceIRFunctionReferencesDeltaPass, "Reducing IR references from functions") -DELTA_PASS_MIR("instruction-flags", reduceInstructionFlagsMIRDeltaPass, "Reducing Instruction Flags") -DELTA_PASS_MIR("register-uses", reduceRegisterUsesMIRDeltaPass, "Reducing register uses") -DELTA_PASS_MIR("register-defs", 
reduceRegisterDefsMIRDeltaPass, "Reducing register defs") -DELTA_PASS_MIR("register-hints", reduceVirtualRegisterHintsDeltaPass, "Reducing virtual register hints from functions") +DELTA_PASS_MIR("instructions", reduceInstructionsMIRDeltaPass, "Reducing Instructions") +DELTA_PASS_MIR("ir-instruction-references", + reduceIRInstructionReferencesDeltaPass, "Reducing IR references from instructions") +DELTA_PASS_MIR("ir-block-references", reduceIRBlockReferencesDeltaPass, "Reducing IR references from blocks") +DELTA_PASS_MIR("ir-function-references", reduceIRFunctionReferencesDeltaPass, "Reducing IR references from functions") +DELTA_PASS_MIR("instruction-flags", reduceInstructionFlagsMIRDeltaPass, "Reducing Instruction Flags") +DELTA_PASS_MIR("register-uses", reduceRegisterUsesMIRDeltaPass, "Reducing register uses") +DELTA_PASS_MIR("register-defs", reduceRegisterDefsMIRDeltaPass, "Reducing register defs") +DELTA_PASS_MIR("register-hints", reduceVirtualRegisterHintsDeltaPass, "Reducing virtual register hints from functions") DELTA_PASS_MIR("register-masks", reduceRegisterMasksMIRDeltaPass, "Reducing register masks") From 057b3c245234e9d9fa3a25fbf04d20e7f3fe2cff Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 30 Mar 2025 14:48:05 +0700 Subject: [PATCH 0035/1029] llvm-reduce: Trim unnecessary includes --- llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp | 2 ++ llvm/tools/llvm-reduce/deltas/ReduceArguments.h | 3 --- llvm/tools/llvm-reduce/deltas/Utils.h | 5 +++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp index 690cfc9ef4732..fdac995af32f6 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.cpp @@ -19,6 +19,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include #include diff --git a/llvm/tools/llvm-reduce/deltas/ReduceArguments.h b/llvm/tools/llvm-reduce/deltas/ReduceArguments.h index cd305451a5713..ceb8d79bc13fd 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceArguments.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceArguments.h @@ -15,9 +15,6 @@ #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEARGUMENTS_H #include "Delta.h" -#include "llvm/IR/Argument.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" namespace llvm { void reduceArgumentsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); diff --git a/llvm/tools/llvm-reduce/deltas/Utils.h b/llvm/tools/llvm-reduce/deltas/Utils.h index e94aee5a91153..8cb4a3ebaf644 100644 --- a/llvm/tools/llvm-reduce/deltas/Utils.h +++ b/llvm/tools/llvm-reduce/deltas/Utils.h @@ -13,11 +13,12 @@ #ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_UTILS_H #define LLVM_TOOLS_LLVM_REDUCE_DELTAS_UTILS_H -#include "llvm/IR/Function.h" -#include "llvm/IR/Value.h" #include "llvm/Support/CommandLine.h" namespace llvm { +class Function; +class Type; +class Value; extern cl::opt Verbose; From 1cb6ba5c60c3ce19785948eb327036a455dd1457 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Sun, 30 Mar 2025 11:56:36 +0200 Subject: [PATCH 0036/1029] [Clang][NFC] Improve const correctness of constraint normalization (#133633) Follow up to #132849 --- clang/include/clang/Sema/Sema.h | 17 +++++++----- clang/include/clang/Sema/SemaConcept.h | 14 +++++----- clang/lib/Sema/SemaConcept.cpp | 36 ++++++++++++++------------ 3 files 
changed, 37 insertions(+), 30 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 066bce61c74c1..c74e709ce06d2 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -14660,7 +14660,8 @@ class Sema final : public SemaBase { bool First = true); const NormalizedConstraint *getNormalizedAssociatedConstraints( - NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints); + const NamedDecl *ConstrainedDecl, + ArrayRef AssociatedConstraints); /// \brief Check whether the given declaration's associated constraints are /// at least as constrained than another declaration's according to the @@ -14670,28 +14671,30 @@ class Sema final : public SemaBase { /// at least constrained than D2, and false otherwise. /// /// \returns true if an error occurred, false otherwise. - bool IsAtLeastAsConstrained(NamedDecl *D1, MutableArrayRef AC1, - NamedDecl *D2, MutableArrayRef AC2, - bool &Result); + bool IsAtLeastAsConstrained(const NamedDecl *D1, + MutableArrayRef AC1, + const NamedDecl *D2, + MutableArrayRef AC2, bool &Result); /// If D1 was not at least as constrained as D2, but would've been if a pair /// of atomic constraints involved had been declared in a concept and not /// repeated in two separate places in code. /// \returns true if such a diagnostic was emitted, false otherwise. bool MaybeEmitAmbiguousAtomicConstraintsDiagnostic( - NamedDecl *D1, ArrayRef AC1, NamedDecl *D2, + const NamedDecl *D1, ArrayRef AC1, const NamedDecl *D2, ArrayRef AC2); private: /// Caches pairs of template-like decls whose associated constraints were /// checked for subsumption and whether or not the first's constraints did in /// fact subsume the second's. - llvm::DenseMap, bool> SubsumptionCache; + llvm::DenseMap, bool> + SubsumptionCache; /// Caches the normalized associated constraints of declarations (concepts or /// constrained declarations). If an error occurred while normalizing the /// associated constraints of the template or concept, nullptr will be cached /// here. 
- llvm::DenseMap NormalizationCache; + llvm::DenseMap NormalizationCache; llvm::ContextualFoldingSet SatisfactionCache; diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h index fda22b779c636..cbb3720c30ee2 100644 --- a/clang/include/clang/Sema/SemaConcept.h +++ b/clang/include/clang/Sema/SemaConcept.h @@ -31,10 +31,10 @@ enum { ConstraintAlignment = 8 }; struct alignas(ConstraintAlignment) AtomicConstraint { const Expr *ConstraintExpr; - NamedDecl *ConstraintDecl; + const NamedDecl *ConstraintDecl; std::optional> ParameterMapping; - AtomicConstraint(const Expr *ConstraintExpr, NamedDecl *ConstraintDecl) + AtomicConstraint(const Expr *ConstraintExpr, const NamedDecl *ConstraintDecl) : ConstraintExpr(ConstraintExpr), ConstraintDecl(ConstraintDecl) {}; bool hasMatchingParameterMapping(ASTContext &C, @@ -114,9 +114,9 @@ struct NormalizedConstraint { private: static std::optional - fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E); + fromConstraintExprs(Sema &S, const NamedDecl *D, ArrayRef E); static std::optional - fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E); + fromConstraintExpr(Sema &S, const NamedDecl *D, const Expr *E); }; struct alignas(ConstraintAlignment) NormalizedConstraintPair { @@ -137,7 +137,7 @@ struct alignas(ConstraintAlignment) FoldExpandedConstraint { }; const NormalizedConstraint *getNormalizedAssociatedConstraints( - Sema &S, NamedDecl *ConstrainedDecl, + Sema &S, const NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints); /// \brief SubsumptionChecker establishes subsumption @@ -149,8 +149,8 @@ class SubsumptionChecker { SubsumptionChecker(Sema &SemaRef, SubsumptionCallable Callable = {}); - std::optional Subsumes(NamedDecl *DP, ArrayRef P, - NamedDecl *DQ, ArrayRef Q); + std::optional Subsumes(const NamedDecl *DP, ArrayRef P, + const NamedDecl *DQ, ArrayRef Q); bool Subsumes(const NormalizedConstraint *P, const NormalizedConstraint *Q); diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index e7e0b4cfb72a7..ebee5994bfed2 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -453,6 +453,7 @@ static ExprResult calculateConstraintSatisfaction( Sema::InstantiatingTemplate Inst( S, AtomicExpr->getBeginLoc(), Sema::InstantiatingTemplate::ConstraintSubstitution{}, + // FIXME: improve const-correctness of InstantiatingTemplate const_cast(Template), Info, AtomicExpr->getSourceRange()); if (Inst.isInvalid()) @@ -1435,9 +1436,9 @@ void Sema::DiagnoseUnsatisfiedConstraint( } } -const NormalizedConstraint * -Sema::getNormalizedAssociatedConstraints( - NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints) { +const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints( + const NamedDecl *ConstrainedDecl, + ArrayRef AssociatedConstraints) { // In case the ConstrainedDecl comes from modules, it is necessary to use // the canonical decl to avoid different atomic constraints with the 'same' // declarations. 
@@ -1461,7 +1462,7 @@ Sema::getNormalizedAssociatedConstraints( } const NormalizedConstraint *clang::getNormalizedAssociatedConstraints( - Sema &S, NamedDecl *ConstrainedDecl, + Sema &S, const NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints) { return S.getNormalizedAssociatedConstraints(ConstrainedDecl, AssociatedConstraints); @@ -1527,7 +1528,8 @@ substituteParameterMappings(Sema &S, NormalizedConstraint &N, Sema::InstantiatingTemplate Inst( S, InstLocBegin, Sema::InstantiatingTemplate::ParameterMappingSubstitution{}, - Atomic.ConstraintDecl, {InstLocBegin, InstLocEnd}); + const_cast(Atomic.ConstraintDecl), + {InstLocBegin, InstLocEnd}); if (Inst.isInvalid()) return true; if (S.SubstTemplateArguments(*Atomic.ParameterMapping, MLTAL, SubstArgs)) @@ -1591,7 +1593,7 @@ NormalizedConstraint &NormalizedConstraint::getRHS() const { } std::optional -NormalizedConstraint::fromConstraintExprs(Sema &S, NamedDecl *D, +NormalizedConstraint::fromConstraintExprs(Sema &S, const NamedDecl *D, ArrayRef E) { assert(E.size() != 0); auto Conjunction = fromConstraintExpr(S, D, E[0]); @@ -1608,7 +1610,8 @@ NormalizedConstraint::fromConstraintExprs(Sema &S, NamedDecl *D, } std::optional -NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { +NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D, + const Expr *E) { assert(E != nullptr); // C++ [temp.constr.normal]p1.1 @@ -1637,8 +1640,9 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { { Sema::InstantiatingTemplate Inst( S, CSE->getExprLoc(), - Sema::InstantiatingTemplate::ConstraintNormalization{}, D, - CSE->getSourceRange()); + Sema::InstantiatingTemplate::ConstraintNormalization{}, + // FIXME: improve const-correctness of InstantiatingTemplate + const_cast(D), CSE->getSourceRange()); if (Inst.isInvalid()) return std::nullopt; // C++ [temp.constr.normal]p1.1 @@ -1726,9 +1730,9 @@ bool FoldExpandedConstraint::AreCompatibleForSubsumption( return false; } -bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, +bool Sema::IsAtLeastAsConstrained(const NamedDecl *D1, MutableArrayRef AC1, - NamedDecl *D2, + const NamedDecl *D2, MutableArrayRef AC2, bool &Result) { #ifndef NDEBUG @@ -1755,7 +1759,7 @@ bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, return false; } - std::pair Key{D1, D2}; + std::pair Key{D1, D2}; auto CacheEntry = SubsumptionCache.find(Key); if (CacheEntry != SubsumptionCache.end()) { Result = CacheEntry->second; @@ -1789,7 +1793,7 @@ bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, } bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic( - NamedDecl *D1, ArrayRef AC1, NamedDecl *D2, + const NamedDecl *D1, ArrayRef AC1, const NamedDecl *D2, ArrayRef AC2) { if (isSFINAEContext()) @@ -2055,7 +2059,7 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) { FormulaType Res; auto Add = [&, this](Clause C) { - // Sort each clause and remove duplicates for faster comparisons + // Sort each clause and remove duplicates for faster comparisons. 
llvm::sort(C); C.erase(llvm::unique(C), C.end()); AddUniqueClauseToFormula(Res, std::move(C)); }; @@ -2102,9 +2106,9 @@ void SubsumptionChecker::AddUniqueClauseToFormula(Formula &F, Clause C) { F.push_back(C); } -std::optional SubsumptionChecker::Subsumes(NamedDecl *DP, +std::optional SubsumptionChecker::Subsumes(const NamedDecl *DP, ArrayRef P, - NamedDecl *DQ, + const NamedDecl *DQ, ArrayRef Q) { const NormalizedConstraint *PNormalized = getNormalizedAssociatedConstraints(SemaRef, DP, P); From faefb70c7a771ae646df3d5defe122cfff2aac7c Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sun, 30 Mar 2025 12:02:27 +0200 Subject: [PATCH 0037/1029] [libc++][NFC] Remove _LIBCPP_DLL_VIS This macro is only ever used inside the definition of the various visibility macros on windows. There, it's defined in multiple places with different expansions, which makes it more confusing than helpful when trying to figure out what macro expands to what. --- libcxx/include/__config | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index 30fe0ef6a3b53..ea51d30dcda99 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -363,25 +363,22 @@ typedef __char32_t char32_t; # endif # if defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) || (defined(__MINGW32__) && !defined(_LIBCPP_BUILDING_LIBRARY)) -# define _LIBCPP_DLL_VIS # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS # define _LIBCPP_OVERRIDABLE_FUNC_VIS # define _LIBCPP_EXPORTED_FROM_ABI # elif defined(_LIBCPP_BUILDING_LIBRARY) -# define _LIBCPP_DLL_VIS __declspec(dllexport) # if defined(__MINGW32__) -# define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS _LIBCPP_DLL_VIS +# define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __declspec(dllexport) # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS # else # define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS -# define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS _LIBCPP_DLL_VIS +# define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS __declspec(dllexport) # endif -# define _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_DLL_VIS +# define _LIBCPP_OVERRIDABLE_FUNC_VIS __declspec(dllexport) # define _LIBCPP_EXPORTED_FROM_ABI __declspec(dllexport) # else -# define _LIBCPP_DLL_VIS __declspec(dllimport) -# define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS _LIBCPP_DLL_VIS +# define _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __declspec(dllimport) # define _LIBCPP_CLASS_TEMPLATE_INSTANTIATION_VIS # define _LIBCPP_OVERRIDABLE_FUNC_VIS # define _LIBCPP_EXPORTED_FROM_ABI __declspec(dllimport) From 10dd404d9fbe505fb189972565c577318a6b577b Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 30 Mar 2025 13:12:18 +0200 Subject: [PATCH 0038/1029] [bazel] Make DeltaPasses.def available for fea6b388055284f37852e615fbf5b40a3ba34249 --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d643a5e969132..072147b7b6150 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -5582,6 +5582,12 @@ binary_alias( binary = ":llvm-readobj", ) +# Workaround inability to put `.def` files into `srcs`. 
+cc_library( + name = "llvm-reduce-defs-lib", + textual_hdrs = glob(["tools/llvm-reduce/*.def"]), +) + cc_binary( name = "llvm-reduce", srcs = glob([ @@ -5609,6 +5615,7 @@ cc_binary( ":TargetParser", ":TransformUtils", ":config", + ":llvm-reduce-defs-lib", ], ) From 4dbcefe3806f9970c0e4f4b08d98df5253517f14 Mon Sep 17 00:00:00 2001 From: Alexander Kornienko Date: Sun, 30 Mar 2025 15:37:43 +0200 Subject: [PATCH 0039/1029] Revert the llvm::append_range change in lib/Target/X86/X86WinEHState.cpp This reverts a single file from ad1ba15ea894ac47b0f2447db191a14ebe1b301d. llvm::append_range in this context fails to compile with recent Clang and libc++: libcxx/include/__algorithm/copy_backward.h:221:68: error: invalid operands to binary expression ('llvm::SuccIterator' and 'long') ... llvm-project/llvm/lib/Target/X86/X86WinEHState.cpp:724:11: note: in instantiation of function template specialization 'llvm::append_range, llvm::iterator_range>>' requested here 724 | llvm::append_range(Worklist, successors(BB)); | ^ --- llvm/lib/Target/X86/X86WinEHState.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 1bcbc7d6e6703..dfdeada476695 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -721,7 +721,8 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // enqueue it's successors to see if we can infer their states. InitialStates.insert({BB, PredState}); FinalStates.insert({BB, PredState}); - llvm::append_range(Worklist, successors(BB)); + for (BasicBlock *SuccBB : successors(BB)) + Worklist.push_back(SuccBB); } // Try to hoist stores from successors. From 9a913a39442d986cf8315f88666f8f5ebd6a4a94 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 30 Mar 2025 22:04:59 +0700 Subject: [PATCH 0040/1029] InlineCostAnnotationPrinter: Fix constructing random TargetTransformInfo (#133637) Query the correct TTI for the current target instead of constructing some random default one. Also query the pass manager for ProfileSummaryInfo. This should only change the printing, not the actual result. --- llvm/lib/Analysis/InlineCost.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index e42b2bd82cf2e..9f193b610328b 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -3295,9 +3295,12 @@ InlineCostAnnotationPrinterPass::run(Function &F, [&](Function &F) -> AssumptionCache & { return FAM.getResult(F); }; - Module *M = F.getParent(); - ProfileSummaryInfo PSI(*M); - TargetTransformInfo TTI(M->getDataLayout()); + + auto &MAMProxy = FAM.getResult(F); + ProfileSummaryInfo *PSI = + MAMProxy.getCachedResult(*F.getParent()); + const TargetTransformInfo &TTI = FAM.getResult(F); + // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. // In the current implementation, the type of InlineParams doesn't matter as // the pass serves only for verification of inliner's decisions. 
@@ -3312,7 +3315,7 @@ InlineCostAnnotationPrinterPass::run(Function &F, continue; OptimizationRemarkEmitter ORE(CalledFunction); InlineCostCallAnalyzer ICCA(*CalledFunction, *CB, Params, TTI, - GetAssumptionCache, nullptr, nullptr, &PSI, + GetAssumptionCache, nullptr, nullptr, PSI, &ORE); ICCA.analyze(); OS << " Analyzing call of " << CalledFunction->getName() From 424c8f9217b7f746eb10c97be7314556c24065cd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 30 Mar 2025 17:30:31 +0100 Subject: [PATCH 0041/1029] [VPlan] Remove dead UF argument from VPTransformState ctor (NFC). --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++-- llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanHelpers.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 4b4a56be19fe5..3fc5e716e3757 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7704,8 +7704,8 @@ DenseMap LoopVectorizationPlanner::executePlan( VPlanTransforms::convertToConcreteRecipes(BestVPlan); // Perform the actual loop transformation. - VPTransformState State(&TTI, BestVF, BestUF, LI, DT, ILV.Builder, &ILV, - &BestVPlan, OrigLoop->getParentLoop(), + VPTransformState State(&TTI, BestVF, LI, DT, ILV.Builder, &ILV, &BestVPlan, + OrigLoop->getParentLoop(), Legal->getWidestInductionType()); #ifdef EXPENSIVE_CHECKS diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 8b53c559f6533..1e2f70e5c103e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -216,7 +216,7 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() { } VPTransformState::VPTransformState(const TargetTransformInfo *TTI, - ElementCount VF, unsigned UF, LoopInfo *LI, + ElementCount VF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan, Loop *CurrentParentLoop, Type *CanonicalIVTy) diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h index 8e5b974d887f4..bebea1915690f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -202,7 +202,7 @@ class VPLane { /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. struct VPTransformState { - VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, + VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan, Loop *CurrentParentLoop, Type *CanonicalIVTy); From 5b65b4d46412f0d5838b0d67b81b6dc03c93908f Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 30 Mar 2025 10:39:43 -0700 Subject: [PATCH 0042/1029] [RISCV] Remove dead code from evaluateTargetFixup AUIPCTarget as a relocatable expression cannot have a SubSym or @-specifier. 
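(Not part of the original commit message; an editorial sketch. Assuming the MCValue accessors that appear in the diff below, the invariant the cleanup relies on looks like this:)

    // An evaluated AUIPC fixup target always has the plain "SymA + constant"
    // form: it can carry neither a subtracted symbol (getSubSym()) nor an
    // @-specifier, so the only guard that can ever fire is the null check
    // on the add-symbol.
    if (!AUIPCTarget.getSymA())
      return false; // not a symbolic target; leave it to relocation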
--- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index b5c66cc1e83f5..37cd79e890263 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -560,12 +560,11 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCAssembler &Asm, } } - if (!AUIPCTarget.getSymA() || AUIPCTarget.getSubSym()) + if (!AUIPCTarget.getSymA()) return false; - const MCSymbolRefExpr *A = AUIPCTarget.getSymA(); - const MCSymbolELF &SA = cast(A->getSymbol()); - if (getSpecifier(A) != RISCVMCExpr::VK_None || SA.isUndefined()) + const MCSymbolELF &SA = cast(*AUIPCTarget.getAddSym()); + if (SA.isUndefined()) return false; bool IsResolved = &SA.getSection() == AUIPCDF->getParent() && From 5715510d00d44d97a0024caf864e649225372281 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 30 Mar 2025 11:09:56 -0700 Subject: [PATCH 0043/1029] [RISCV] Remove unused declarations and getSpecifier. NFC Remove unused declarations after #132569. Simplify some code as we no longer use MCSymbolRefExpr::VariantKind. --- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3 +-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 5 +---- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h | 8 -------- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index c767e1e60f17f..52b38c19873c1 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2104,8 +2104,7 @@ ParseStatus RISCVAsmParser::parsePseudoJumpSymbol(OperandVector &Operands) { if (getParser().parseExpression(Res, E)) return ParseStatus::Failure; - if (Res->getKind() != MCExpr::ExprKind::SymbolRef || - getSpecifier(cast(Res)) == RISCVMCExpr::VK_PLTPCREL) + if (Res->getKind() != MCExpr::ExprKind::SymbolRef) return Error(S, "operand must be a valid jump target"); Res = RISCVMCExpr::create(Res, RISCVMCExpr::VK_CALL, getContext()); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index b27f13e6b95ba..69ad3d936fbbe 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -557,10 +557,7 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, FixupKind = RISCV::fixup_riscv_tlsdesc_call; break; } - } else if ((Kind == MCExpr::SymbolRef && - getSpecifier(cast(Expr)) == - RISCVMCExpr::VK_None) || - Kind == MCExpr::Binary) { + } else if (Kind == MCExpr::SymbolRef || Kind == MCExpr::Binary) { // FIXME: Sub kind binary exprs have chance of underflow. 
if (MIFrm == RISCVII::InstFormatJ) { FixupKind = RISCV::fixup_riscv_jal; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h index 604d2ebc66d1c..fd6993c18d820 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h @@ -49,8 +49,6 @@ class RISCVMCExpr : public MCTargetExpr { const MCExpr *Expr; const Specifier specifier; - int64_t evaluateAsInt64(int64_t Value) const; - explicit RISCVMCExpr(const MCExpr *Expr, Specifier S) : Expr(Expr), specifier(S) {} @@ -77,8 +75,6 @@ class RISCVMCExpr : public MCTargetExpr { return getSubExpr()->findAssociatedFragment(); } - bool evaluateAsConstant(int64_t &Res) const; - static bool classof(const MCExpr *E) { return E->getKind() == MCExpr::Target; } @@ -86,10 +82,6 @@ class RISCVMCExpr : public MCTargetExpr { static std::optional getSpecifierForName(StringRef name); static StringRef getSpecifierName(Specifier Kind); }; - -static inline RISCVMCExpr::Specifier getSpecifier(const MCSymbolRefExpr *SRE) { - return RISCVMCExpr::Specifier(SRE->getKind()); -} } // end namespace llvm. #endif From 1f7f268f304d02f0cea33ab63a21de57ba4a5a3c Mon Sep 17 00:00:00 2001 From: Liqiang TAO Date: Mon, 31 Mar 2025 02:21:19 +0800 Subject: [PATCH 0044/1029] StackProtector: use isInTailCallPosition to verify tail call position (#68997) The issue is caused by [D133860](https://reviews.llvm.org/D133860). The guard would be inserted in the wrong place in some cases, as shown by the test case below. This patch fixes the issue by using `isInTailCallPosition()` to verify whether the tail call is in the right position. --- llvm/lib/CodeGen/StackProtector.cpp | 14 ++++---------- llvm/test/CodeGen/X86/tailcc-ssp.ll | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index e823df3186a54..eb07e5d2bae4b 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -625,18 +626,11 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, HasIRCheck = true; // If we're instrumenting a block with a tail call, the check has to be - // inserted before the call rather than between it and the return. The - // verifier guarantees that a tail call is either directly before the - // return or with a single correct bitcast of the return value in between so - // we don't need to worry about many situations here. + // inserted before the call rather than between it and the return. Instruction *Prev = CheckLoc->getPrevNonDebugInstruction(); - if (Prev && isa(Prev) && cast(Prev)->isTailCall()) - CheckLoc = Prev; - else if (Prev) { - Prev = Prev->getPrevNonDebugInstruction(); - if (Prev && isa(Prev) && cast(Prev)->isTailCall()) + if (auto *CI = dyn_cast_if_present(Prev)) + if (CI->isTailCall() && isInTailCallPosition(*CI, *TM)) CheckLoc = Prev; - } // Generate epilogue instrumentation. 
The epilogue instrumentation can be // function-based or inlined depending on which mechanism the target is diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll index 914af1466147a..5211e4fe9eef9 100644 --- a/llvm/test/CodeGen/X86/tailcc-ssp.ll +++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll @@ -101,3 +101,24 @@ define void @tailcall_unrelated_frame() sspreq { tail call void @bar() ret void } + +declare void @callee() +define void @caller() sspreq { +; WINDOWS-LABEL: caller: +; WINDOWS: callq callee +; WINDOWS: callq callee +; WINDOWS: cmpq __security_cookie(%rip), %rcx +; WINDOWS: jne +; WINDOWS: callq __security_check_cookie + +; LINUX-LABEL: caller: +; LINUX: callq callee@PLT +; LINUX: callq callee@PLT +; LINUX: cmpq +; LINUX: jne +; LINUX: callq __stack_chk_fail@PLT + + tail call void @callee() + call void @callee() + ret void +} From 52639d69acbed0e49fd855c8c04cd9307405e2e6 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Sun, 30 Mar 2025 21:26:23 +0300 Subject: [PATCH 0045/1029] [clang-tidy][NFC][doc] improve "options" sections of `bugprone-` and `modernize-` checks (#133525) Improved "options" sections of `bugprone-` and `modernize-` checks: 1. Added `Options` keyword to be a delimiter between "body" and "options" parts of docs 2. Added default values where they were absent. 3. Improved readability of some default values by converting `1` to `true`. --- .../docs/clang-tidy/checks/bugprone/assert-side-effect.rst | 1 + .../checks/bugprone/capturing-this-in-member-variable.rst | 3 +++ .../docs/clang-tidy/checks/bugprone/signed-char-misuse.rst | 5 ++++- .../clang-tidy/checks/bugprone/suspicious-enum-usage.rst | 1 + .../checks/bugprone/suspicious-stringview-data-usage.rst | 3 +++ .../clang-tidy/checks/bugprone/too-small-loop-variable.rst | 3 +++ .../clang-tidy/checks/bugprone/unhandled-self-assignment.rst | 3 +++ .../checks/bugprone/unintended-char-ostream-output.rst | 3 +++ .../docs/clang-tidy/checks/modernize/avoid-bind.rst | 3 ++- .../docs/clang-tidy/checks/modernize/avoid-c-arrays.rst | 3 +++ .../docs/clang-tidy/checks/modernize/loop-convert.rst | 3 +++ .../docs/clang-tidy/checks/modernize/make-shared.rst | 4 ++-- .../docs/clang-tidy/checks/modernize/make-unique.rst | 4 ++-- .../docs/clang-tidy/checks/modernize/raw-string-literal.rst | 3 +++ .../docs/clang-tidy/checks/modernize/use-emplace.rst | 3 ++- 15 files changed, 38 insertions(+), 7 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assert-side-effect.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assert-side-effect.rst index 1355afae92e4f..3ca712b958d04 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/assert-side-effect.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/assert-side-effect.rst @@ -15,6 +15,7 @@ Options .. option:: AssertMacros A comma-separated list of the names of assert macros to be checked. + Default is `assert,NSAssert,NSCAssert`. .. option:: CheckFunctionCalls diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst index bb75e9239d9b5..b09d7d5fce959 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst @@ -32,6 +32,9 @@ Possible fixes: object types. - passing ``this`` pointer as parameter +Options +------- + ..
option:: FunctionWrapperTypes A semicolon-separated list of names of types. Used to specify function diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst index 72860e8cf2a1d..4edbad5eac81b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/signed-char-misuse.rst @@ -104,13 +104,16 @@ so both arguments will have the same type. return false; } +Options +------- + .. option:: CharTypdefsToIgnore A semicolon-separated list of typedef names. In this list, we can list typedefs for ``char`` or ``signed char``, which will be ignored by the check. This is useful when a typedef introduces an integer alias like ``sal_Int8`` or ``int8_t``. In this case, human misinterpretation is not - an issue. + an issue. Default is an empty string. .. option:: DiagnoseSignedUnsignedCharComparisons diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst index e87172414a23e..94f29ee11ee39 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-enum-usage.rst @@ -71,6 +71,7 @@ Examples: Options ------- + .. option:: StrictMode Default value: 0. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst index 9b38d83601810..de10da21e8442 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.rst @@ -43,6 +43,9 @@ lead to a compilation error due to the explicit nature of the ``std::string`` constructor. Consequently, developers might opt for ``sv.data()`` to resolve the compilation error, albeit introducing potential hazards as discussed. +Options +------- + .. option:: StringViewTypes Option allows users to specify custom string view-like types for analysis. It diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst index 2c3ded952aa02..077abf0af6880 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/too-small-loop-variable.rst @@ -32,6 +32,9 @@ It's recommended to enable the compiler warning `-Wtautological-constant-out-of-range-compare` as well, since check does not inspect compile-time constant loop boundaries to avoid overlaps with the warning. +Options +------- + .. option:: MagnitudeBitsUpperLimit Upper limit for the magnitude bits of the loop variable. If it's set the check diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst index d3cdd5a12fdca..3a6245d2fe35b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unhandled-self-assignment.rst @@ -118,6 +118,9 @@ temporary object into ``this`` (needs a move assignment operator): } }; +Options +------- + .. 
option:: WarnOnlyIfThisHasSuspiciousField When `true`, the check will warn only if the container class of the copy diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst index ea1051847129b..95d02b3e2ddda 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unintended-char-ostream-output.rst @@ -39,6 +39,9 @@ Or cast to char to explicitly indicate that output should be a character. std::cout << static_cast(v); +Options +------- + .. option:: CastTypeName When `CastTypeName` is specified, the fix-it will use `CastTypeName` as the diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-bind.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-bind.rst index 10374daecb660..64e7e95db8800 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-bind.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-bind.rst @@ -50,8 +50,9 @@ Options of every placeholder parameter list. Without this, it is possible for a fix-it to perform an incorrect transformation in the case where the result of the ``bind`` is used in the context of a type erased functor such as ``std::function`` which - allows mismatched arguments. For example: + allows mismatched arguments. Default is `false`. +For example: .. code-block:: c++ diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst index 6a386ecd0fd4b..b7a87bf23967b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/avoid-c-arrays.rst @@ -62,6 +62,9 @@ Similarly, the ``main()`` function is ignored. Its second and third parameters can be either ``char* argv[]`` or ``char** argv``, but cannot be ``std::array<>``. +Options +------- + .. option:: AllowStringArrays When set to `true` (default is `false`), variables of character array type diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/loop-convert.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/loop-convert.rst index 0c423edca1822..3f4783e220501 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/loop-convert.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/loop-convert.rst @@ -140,6 +140,9 @@ however the check can be configured to work without C++20 by specifying a function to reverse a range and optionally the header file where that function lives. +Options +------- + .. option:: UseCxx20ReverseRanges When set to true convert loops when in C++20 or later mode using diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst index 9c1fceaa06000..982138fc5e781 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/make-shared.rst @@ -51,6 +51,6 @@ Options .. option:: IgnoreDefaultInitialization - If set to non-zero, the check does not suggest edits that will transform + If set to `false`, the check does not suggest edits that will transform default initialization into value initialization, as this can cause - performance regressions. Default is `1`. + performance regressions. Default is `true`. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/make-unique.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/make-unique.rst index cd474d352bac0..1aaa8701cd0f1 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/make-unique.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/make-unique.rst @@ -51,6 +51,6 @@ Options .. option:: IgnoreDefaultInitialization - If set to non-zero, the check does not suggest edits that will transform + If set to `false`, the check does not suggest edits that will transform default initialization into value initialization, as this can cause - performance regressions. Default is `1`. + performance regressions. Default is `true`. diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/raw-string-literal.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/raw-string-literal.rst index 6d7589a0011bb..66e50e80fa70b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/raw-string-literal.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/raw-string-literal.rst @@ -45,6 +45,9 @@ An escaped horizontal tab, form feed, or vertical tab prevents the string literal from being converted. The presence of a horizontal tab, form feed or vertical tab in source code is not visually obvious. +Options +------- + .. option:: DelimiterStem Custom delimiter to escape characters in raw string literals. It is used in diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-emplace.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-emplace.rst index f61b93aac7c76..e020ece296475 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-emplace.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-emplace.rst @@ -163,7 +163,8 @@ Options Semicolon-separated list of containers without their template parameters and some ``emplace``-like method of the container. Example: ``vector::emplace_back``. Those methods will be checked for improper use and - the check will report when a temporary is unnecessarily created. + the check will report when a temporary is unnecessarily created. All STL + containers with such member functions are supported by default. Example ^^^^^^^ From 8ecb2f9c4a8e39620935c1933fe1fbd2b96c723a Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Sun, 30 Mar 2025 21:39:32 +0300 Subject: [PATCH 0046/1029] [ADT] Add `DenseMap::insert_range` (#133600) This PR adds `DenseMap::insert_range` to `DenseMap` for consistency with existing `DenseSet::insert_range`, `SmallSet::insert_range` and `std::map::insert_range`. --- llvm/include/llvm/ADT/DenseMap.h | 6 ++++++ llvm/unittests/ADT/DenseMapTest.cpp | 22 ++++++++++++++++++++++ llvm/unittests/ADT/DenseSetTest.cpp | 7 +++++++ 3 files changed, 35 insertions(+) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index f0f992f8eac38..ea9ba6f47ac1a 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -14,6 +14,7 @@ #ifndef LLVM_ADT_DENSEMAP_H #define LLVM_ADT_DENSEMAP_H +#include "llvm/ADT/ADL.h" #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/EpochTracker.h" #include "llvm/Support/AlignOf.h" @@ -302,6 +303,11 @@ class DenseMapBase : public DebugEpochBase { insert(*I); } + /// Inserts range of 'std::pair' values into the map. 
+ template void insert_range(Range &&R) { + insert(adl_begin(R), adl_end(R)); + } + template std::pair insert_or_assign(const KeyT &Key, V &&Val) { auto Ret = try_emplace(Key, std::forward(Val)); diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index d1bbdde8dfc26..a4c045585fc28 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -379,6 +379,28 @@ TEST(DenseMapCustomTest, EqualityComparison) { EXPECT_NE(M1, M3); } +TEST(DenseMapCustomTest, InsertRange) { + DenseMap M; + + std::pair InputVals[3] = {{0, 0}, {0, 1}, {1, 2}}; + M.insert_range(InputVals); + + EXPECT_EQ(M.size(), 2u); + EXPECT_THAT(M, testing::UnorderedElementsAre(testing::Pair(0, 0), + testing::Pair(1, 2))); +} + +TEST(SmallDenseMapCustomTest, InsertRange) { + SmallDenseMap M; + + std::pair InputVals[3] = {{0, 0}, {0, 1}, {1, 2}}; + M.insert_range(InputVals); + + EXPECT_EQ(M.size(), 2u); + EXPECT_THAT(M, testing::UnorderedElementsAre(testing::Pair(0, 0), + testing::Pair(1, 2))); +} + // Test for the default minimum size of a DenseMap TEST(DenseMapCustomTest, DefaultMinReservedSizeTest) { // IF THIS VALUE CHANGE, please update InitialSizeTest, InitFromIterator, and diff --git a/llvm/unittests/ADT/DenseSetTest.cpp b/llvm/unittests/ADT/DenseSetTest.cpp index 5a8ee592ddfc7..a24f99b6bb34f 100644 --- a/llvm/unittests/ADT/DenseSetTest.cpp +++ b/llvm/unittests/ADT/DenseSetTest.cpp @@ -58,6 +58,13 @@ TEST(DenseSetTest, InsertRange) { EXPECT_THAT(set, ::testing::UnorderedElementsAre(1, 2, 3)); } +TEST(SmallDenseSetTest, InsertRange) { + llvm::SmallDenseSet set; + constexpr unsigned Args[] = {9, 7, 8}; + set.insert_range(Args); + EXPECT_THAT(set, ::testing::UnorderedElementsAre(7, 8, 9)); +} + struct TestDenseSetInfo { static inline unsigned getEmptyKey() { return ~0; } static inline unsigned getTombstoneKey() { return ~0U - 1; } From 5a3d4036cff159e32aa4ab1b11fd6a25a50a456c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 30 Mar 2025 12:12:38 -0700 Subject: [PATCH 0047/1029] Move relocation specifiers to AMDGPUMCExpr::Specifier Similar to previous migration done for all other ELF targets. Switch from the confusing `VariantKind` to `Specifier`, which aligns with Arm and IBM AIX's documentation. Moving forward, relocation specifiers should be integrated into AMDGPUMCExpr rather than MCSymbolRefExpr::SubclassData. (Note: the term AMDGPUMCExpr::VariantKind is for expressions without relocation specifiers: https://github.com/llvm/llvm-project/pull/82022 It's up to AMDGPU maintainers to integrate these constants into Specifier. ) Pull Request: https://github.com/llvm/llvm-project/pull/133608 --- llvm/include/llvm/MC/MCExpr.h | 8 ------- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 21 ++++++++++--------- .../MCTargetDesc/AMDGPUELFObjectWriter.cpp | 19 +++++++++-------- .../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 17 ++++++++------- .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 6 +++--- .../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 16 ++++++++++++++ .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 4 ++-- .../AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 1 + 8 files changed, 52 insertions(+), 40 deletions(-) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 5bfbd2d9f8e71..edecfe4dd4112 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -218,14 +218,6 @@ class MCSymbolRefExpr : public MCExpr { VK_WASM_GOT_TLS, // Wasm global index of TLS symbol. VK_WASM_FUNCINDEX, // Wasm function index. 
- VK_AMDGPU_GOTPCREL32_LO, // symbol@gotpcrel32@lo - VK_AMDGPU_GOTPCREL32_HI, // symbol@gotpcrel32@hi - VK_AMDGPU_REL32_LO, // symbol@rel32@lo - VK_AMDGPU_REL32_HI, // symbol@rel32@hi - VK_AMDGPU_REL64, // symbol@rel64 - VK_AMDGPU_ABS32_LO, // symbol@abs32@lo - VK_AMDGPU_ABS32_HI, // symbol@abs32@hi - FirstTargetSpecifier, }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 6fa97d82a668b..3d6b974d1f027 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -17,6 +17,7 @@ #include "AMDGPUAsmPrinter.h" #include "AMDGPUMachineFunction.h" #include "MCTargetDesc/AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" @@ -43,24 +44,24 @@ AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AsmPrinter &ap): Ctx(ctx), ST(st), AP(ap) { } -static MCSymbolRefExpr::VariantKind getVariantKind(unsigned MOFlags) { +static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) { switch (MOFlags) { default: - return MCSymbolRefExpr::VK_None; + return AMDGPUMCExpr::S_None; case SIInstrInfo::MO_GOTPCREL: - return MCSymbolRefExpr::VK_GOTPCREL; + return AMDGPUMCExpr::S_GOTPCREL; case SIInstrInfo::MO_GOTPCREL32_LO: - return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO; + return AMDGPUMCExpr::S_GOTPCREL32_LO; case SIInstrInfo::MO_GOTPCREL32_HI: - return MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI; + return AMDGPUMCExpr::S_GOTPCREL32_HI; case SIInstrInfo::MO_REL32_LO: - return MCSymbolRefExpr::VK_AMDGPU_REL32_LO; + return AMDGPUMCExpr::S_REL32_LO; case SIInstrInfo::MO_REL32_HI: - return MCSymbolRefExpr::VK_AMDGPU_REL32_HI; + return AMDGPUMCExpr::S_REL32_HI; case SIInstrInfo::MO_ABS32_LO: - return MCSymbolRefExpr::VK_AMDGPU_ABS32_LO; + return AMDGPUMCExpr::S_ABS32_LO; case SIInstrInfo::MO_ABS32_HI: - return MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; + return AMDGPUMCExpr::S_ABS32_HI; } } @@ -85,7 +86,7 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, AP.getNameWithPrefix(SymbolName, GV); MCSymbol *Sym = Ctx.getOrCreateSymbol(SymbolName); const MCExpr *Expr = - MCSymbolRefExpr::create(Sym, getVariantKind(MO.getTargetFlags()),Ctx); + MCSymbolRefExpr::create(Sym, getSpecifier(MO.getTargetFlags()), Ctx); int64_t Offset = MO.getOffset(); if (Offset != 0) { Expr = MCBinaryExpr::createAdd(Expr, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 2d960a32339f4..50531af627e4a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -8,6 +8,7 @@ #include "AMDGPUFixupKinds.h" #include "AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUMCExpr.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCValue.h" @@ -45,24 +46,24 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_ABS32_LO; } - switch (Target.getAccessVariant()) { + switch (AMDGPUMCExpr::Specifier(Target.getAccessVariant())) { default: break; - case MCSymbolRefExpr::VK_GOTPCREL: + case AMDGPUMCExpr::S_GOTPCREL: return ELF::R_AMDGPU_GOTPCREL; - case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO: + case AMDGPUMCExpr::S_GOTPCREL32_LO: return ELF::R_AMDGPU_GOTPCREL32_LO; - case MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI: + case AMDGPUMCExpr::S_GOTPCREL32_HI: return 
ELF::R_AMDGPU_GOTPCREL32_HI; - case MCSymbolRefExpr::VK_AMDGPU_REL32_LO: + case AMDGPUMCExpr::S_REL32_LO: return ELF::R_AMDGPU_REL32_LO; - case MCSymbolRefExpr::VK_AMDGPU_REL32_HI: + case AMDGPUMCExpr::S_REL32_HI: return ELF::R_AMDGPU_REL32_HI; - case MCSymbolRefExpr::VK_AMDGPU_REL64: + case AMDGPUMCExpr::S_REL64: return ELF::R_AMDGPU_REL64; - case MCSymbolRefExpr::VK_AMDGPU_ABS32_LO: + case AMDGPUMCExpr::S_ABS32_LO: return ELF::R_AMDGPU_ABS32_LO; - case MCSymbolRefExpr::VK_AMDGPU_ABS32_HI: + case AMDGPUMCExpr::S_ABS32_HI: return ELF::R_AMDGPU_ABS32_HI; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 56c53ed587e9f..6f1d89e500ed3 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCAsmInfo.h" +#include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -16,14 +17,14 @@ using namespace llvm; const MCAsmInfo::VariantKindDesc variantKindDescs[] = { - {MCSymbolRefExpr::VK_GOTPCREL, "gotpcrel"}, - {MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_LO, "gotpcrel32@lo"}, - {MCSymbolRefExpr::VK_AMDGPU_GOTPCREL32_HI, "gotpcrel32@hi"}, - {MCSymbolRefExpr::VK_AMDGPU_REL32_LO, "rel32@lo"}, - {MCSymbolRefExpr::VK_AMDGPU_REL32_HI, "rel32@hi"}, - {MCSymbolRefExpr::VK_AMDGPU_REL64, "rel64"}, - {MCSymbolRefExpr::VK_AMDGPU_ABS32_LO, "abs32@lo"}, - {MCSymbolRefExpr::VK_AMDGPU_ABS32_HI, "abs32@hi"}, + {AMDGPUMCExpr::S_GOTPCREL, "gotpcrel"}, + {AMDGPUMCExpr::S_GOTPCREL32_LO, "gotpcrel32@lo"}, + {AMDGPUMCExpr::S_GOTPCREL32_HI, "gotpcrel32@hi"}, + {AMDGPUMCExpr::S_REL32_LO, "rel32@lo"}, + {AMDGPUMCExpr::S_REL32_HI, "rel32@hi"}, + {AMDGPUMCExpr::S_REL64, "rel64"}, + {AMDGPUMCExpr::S_ABS32_LO, "abs32@lo"}, + {AMDGPUMCExpr::S_ABS32_HI, "abs32@hi"}, }; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 1391ef6dd09e5..1e82ee36dc0eb 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AMDGPUFixupKinds.h" +#include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "Utils/AMDGPUBaseInfo.h" @@ -546,9 +547,8 @@ static bool needsPCRel(const MCExpr *Expr) { switch (Expr->getKind()) { case MCExpr::SymbolRef: { auto *SE = cast(Expr); - MCSymbolRefExpr::VariantKind Kind = SE->getKind(); - return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO && - Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI; + auto Spec = AMDGPU::getSpecifier(SE); + return Spec != AMDGPUMCExpr::S_ABS32_LO && Spec != AMDGPUMCExpr::S_ABS32_HI; } case MCExpr::Binary: { auto *BE = cast(Expr); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index c0167096f022a..f38320ae79858 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -39,6 +39,19 @@ class AMDGPUMCExpr : public MCTargetExpr { AGVK_Occupancy }; + // Relocation specifiers. 
+ enum Specifier { + S_None, + S_GOTPCREL, // symbol@gotpcrel + S_GOTPCREL32_LO, // symbol@gotpcrel32@lo + S_GOTPCREL32_HI, // symbol@gotpcrel32@hi + S_REL32_LO, // symbol@rel32@lo + S_REL32_HI, // symbol@rel32@hi + S_REL64, // symbol@rel64 + S_ABS32_LO, // symbol@abs32@lo + S_ABS32_HI, // symbol@abs32@hi + }; + private: VariantKind Kind; MCContext &Ctx; @@ -113,6 +126,9 @@ void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCExpr *foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx); +static inline AMDGPUMCExpr::Specifier getSpecifier(const MCSymbolRefExpr *SRE) { + return AMDGPUMCExpr::Specifier(SRE->getKind()); +} } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index dbc4c37a77a88..a6c97a02cb959 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -1005,8 +1005,8 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( // It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64. Streamer.emitValue( MCBinaryExpr::createSub( - MCSymbolRefExpr::create(KernelCodeSymbol, - MCSymbolRefExpr::VK_AMDGPU_REL64, Context), + MCSymbolRefExpr::create(KernelCodeSymbol, AMDGPUMCExpr::S_REL64, + Context), MCSymbolRefExpr::create(KernelDescriptorSymbol, Context), Context), sizeof(amdhsa::kernel_descriptor_t::kernel_code_entry_byte_offset)); for (uint32_t i = 0; i < sizeof(amdhsa::kernel_descriptor_t::reserved1); ++i) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index 8195c93d847b0..0d5287443c490 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -13,6 +13,7 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/R600MCTargetDesc.h" #include "R600Defines.h" #include "llvm/MC/MCCodeEmitter.h" From 8a8c89a4c6f92412643f2bbe46a20800197321b0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 30 Mar 2025 12:21:03 -0700 Subject: [PATCH 0048/1029] [AArch64] Use llvm::erase_if (NFC) (#133647) --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 1761f58faf0fe..6bf6ce7167833 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -5552,12 +5552,9 @@ void AArch64FrameLowering::emitRemarks( return; llvm::sort(StackAccesses); - StackAccesses.erase(llvm::remove_if(StackAccesses, - [](const StackAccess &S) { - return S.AccessTypes == - StackAccess::NotAccessed; - }), - StackAccesses.end()); + llvm::erase_if(StackAccesses, [](const StackAccess &S) { + return S.AccessTypes == StackAccess::NotAccessed; + }); SmallVector MixedObjects; SmallVector> HazardPairs; From 1c8647a25a5a7527ea546212ddff56ef88ab27b9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 30 Mar 2025 12:21:21 -0700 Subject: [PATCH 0049/1029] [mlir] Use llvm::hasSingleElement (NFC) (#133648) --- mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp index 
bcfd7ebccd12d..2959d67b366b9 100644 --- a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp @@ -255,8 +255,7 @@ static bool hasOneBranchOpTo(Block &srcBlock, Block &dstBlock) { /// Returns true if the given `block` only contains one `spirv.mlir.merge` op. static bool isMergeBlock(Block &block) { - return !block.empty() && std::next(block.begin()) == block.end() && - isa(block.front()); + return llvm::hasSingleElement(block) && isa(block.front()); } /// Returns true if a `spirv.mlir.merge` op outside the merge block. From 2c73711995e4fe0f706de351eef4122b8cd8a4d7 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 30 Mar 2025 12:21:38 -0700 Subject: [PATCH 0050/1029] [TableGen] Use llvm::append_range (NFC) (#133649) --- llvm/utils/TableGen/DXILEmitter.cpp | 16 ++++------------ llvm/utils/TableGen/DecoderEmitter.cpp | 12 ++++-------- llvm/utils/TableGen/X86DisassemblerTables.cpp | 3 +-- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 0b553c3a3d456..0364b02c2483d 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -113,9 +113,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { ParamTypeRecs.push_back(R->getValueAsDef("result")); - for (const Record *ArgTy : R->getValueAsListOfDefs("arguments")) { - ParamTypeRecs.push_back(ArgTy); - } + llvm::append_range(ParamTypeRecs, R->getValueAsListOfDefs("arguments")); size_t ParamTypeRecsSize = ParamTypeRecs.size(); // Populate OpTypes with return type and parameter types @@ -148,9 +146,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { // Sort records in ascending order of DXIL version ascendingSortByVersion(Recs); - for (const Record *CR : Recs) { - OverloadRecs.push_back(CR); - } + llvm::append_range(OverloadRecs, Recs); // Get stage records Recs = R->getValueAsListOfDefs("stages"); @@ -163,9 +159,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { // Sort records in ascending order of DXIL version ascendingSortByVersion(Recs); - for (const Record *CR : Recs) { - StageRecs.push_back(CR); - } + llvm::append_range(StageRecs, Recs); // Get attribute records Recs = R->getValueAsListOfDefs("attributes"); @@ -173,9 +167,7 @@ DXILOperationDesc::DXILOperationDesc(const Record *R) { // Sort records in ascending order of DXIL version ascendingSortByVersion(Recs); - for (const Record *CR : Recs) { - AttrRecs.push_back(CR); - } + llvm::append_range(AttrRecs, Recs); // Get the operation class OpClass = R->getValueAsDef("OpClass")->getName(); diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index e1344ae54b20e..cf7c02db8842e 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -1342,8 +1342,7 @@ void FilterChooser::emitPredicateTableEntry(DecoderTableInfo &TableInfo, TableInfo.Table.push_back(MCD::OPC_CheckPredicate); // Predicate index. - for (const auto PB : PBytes) - TableInfo.Table.push_back(PB); + llvm::append_range(TableInfo.Table, PBytes); // Push location for NumToSkip backpatching. 
TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); TableInfo.Table.push_back(0); @@ -1402,15 +1401,13 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, raw_svector_ostream S(MaskBytes); if (NeedPositiveMask) { encodeULEB128(PositiveMask.getZExtValue(), S); - for (unsigned i = 0, e = MaskBytes.size(); i != e; ++i) - TableInfo.Table.push_back(MaskBytes[i]); + llvm::append_range(TableInfo.Table, MaskBytes); } else TableInfo.Table.push_back(0); if (NeedNegativeMask) { MaskBytes.clear(); encodeULEB128(NegativeMask.getZExtValue(), S); - for (unsigned i = 0, e = MaskBytes.size(); i != e; ++i) - TableInfo.Table.push_back(MaskBytes[i]); + llvm::append_range(TableInfo.Table, MaskBytes); } else TableInfo.Table.push_back(0); } @@ -1483,8 +1480,7 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, encodeULEB128(DIdx, S); // Decoder index. - for (const auto B : Bytes) - TableInfo.Table.push_back(B); + llvm::append_range(TableInfo.Table, Bytes); if (!HasCompleteDecoder) { // Push location for NumToSkip backpatching. diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp index 5e7983a101e0b..36f752a1ebe63 100644 --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -746,8 +746,7 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2, ModRMDecision.push_back(decision.instructionIDs[index]); break; case MODRM_FULL: - for (unsigned short InstructionID : decision.instructionIDs) - ModRMDecision.push_back(InstructionID); + llvm::append_range(ModRMDecision, decision.instructionIDs); break; } From 06cb7b1e14a117e8fe19b72689c8616c772c0807 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 30 Mar 2025 12:21:59 -0700 Subject: [PATCH 0051/1029] [Transforms] Use llvm::append_range (NFC) (#133650) --- .../Transforms/IPO/MemProfContextDisambiguation.cpp | 11 ++++------- llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 2 +- llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 6 ++---- llvm/lib/Transforms/Utils/CodeLayout.cpp | 2 +- llvm/lib/Transforms/Utils/SampleProfileInference.cpp | 4 ++-- 5 files changed, 10 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index f5ae204426170..df1f6fddeba60 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -2286,8 +2286,7 @@ void CallsiteContextGraph AllCalls; AllCalls.reserve(Node->MatchingCalls.size() + 1); AllCalls.push_back(Node->Call); - AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(), - Node->MatchingCalls.end()); + llvm::append_range(AllCalls, Node->MatchingCalls); // First see if we can partition the calls by callee function, creating new // nodes to host each set of calls calling the same callees. This is @@ -2468,9 +2467,8 @@ bool CallsiteContextGraph::partitionCallsByCallee( // The first call becomes the primary call for this caller node, and the // rest go in the matching calls list. 
Info->Node->setCall(Info->Calls.front()); - Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(), - Info->Calls.begin() + 1, - Info->Calls.end()); + llvm::append_range(Info->Node->MatchingCalls, + llvm::drop_begin(Info->Calls)); // Save the primary call to node correspondence so that we can update // the NonAllocationCallToContextNodeMap, which is being iterated in the // caller of this function. @@ -4117,8 +4115,7 @@ bool CallsiteContextGraph::assignFunctions() { // Ignore original Node if we moved all of its contexts to clones. if (!Node->emptyContextIds()) ClonesWorklist.push_back(Node); - ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), - Node->Clones.end()); + llvm::append_range(ClonesWorklist, Node->Clones); // Now walk through all of the clones of this callsite Node that we need, // and determine the assignment to a corresponding clone of the current diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 82434680b8f23..938aab5879044 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -399,7 +399,7 @@ struct ThreadingPath { void push_back(BasicBlock *BB) { Path.push_back(BB); } void push_front(BasicBlock *BB) { Path.push_front(BB); } void appendExcludingFirst(const PathType &OtherPath) { - Path.insert(Path.end(), OtherPath.begin() + 1, OtherPath.end()); + llvm::append_range(Path, llvm::drop_begin(OtherPath)); } void print(raw_ostream &OS) const { diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 4f7956514b7b5..4c6f6f12d7138 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -3641,14 +3641,12 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, } // Next check all loops nested within L. 
SmallVector Worklist; - Worklist.insert(Worklist.end(), L->getSubLoops().begin(), - L->getSubLoops().end()); + llvm::append_range(Worklist, L->getSubLoops()); while (!Worklist.empty()) { auto *CurLoop = Worklist.pop_back_val(); if (!PSI->isColdBlock(CurLoop->getHeader(), BFI)) return false; - Worklist.insert(Worklist.end(), CurLoop->getSubLoops().begin(), - CurLoop->getSubLoops().end()); + llvm::append_range(Worklist, CurLoop->getSubLoops()); } return true; }; diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index baaad8bb48f33..c76b3afef50c2 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -387,7 +387,7 @@ struct ChainEdge { void appendJump(JumpT *Jump) { Jumps.push_back(Jump); } void moveJumps(ChainEdge *Other) { - Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end()); + llvm::append_range(Jumps, Other->Jumps); Other->Jumps.clear(); Other->Jumps.shrink_to_fit(); } diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp index 54d46117729c9..53bcaa6d3df03 100644 --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -672,8 +672,8 @@ class FlowAdjuster { // Concatenate the two paths std::vector Result; - Result.insert(Result.end(), ForwardPath.begin(), ForwardPath.end()); - Result.insert(Result.end(), BackwardPath.begin(), BackwardPath.end()); + llvm::append_range(Result, ForwardPath); + llvm::append_range(Result, BackwardPath); return Result; } From fd8fb7148674456ce3fb338864c9aa9f576feb22 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 30 Mar 2025 22:20:34 +0100 Subject: [PATCH 0052/1029] [VPlan] Handle scalar casts and blend in isUniformAfterVectorization. Currently should be NFC, but will be used by https://github.com/llvm/llvm-project/pull/117506. --- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index 6ddb88308955f..aa4f446cf06bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -45,7 +45,8 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { return true; if (auto *Rep = dyn_cast(VPV)) return Rep->isUniform(); - if (isa(VPV)) + if (isa(VPV)) return all_of(VPV->getDefiningRecipe()->operands(), isUniformAfterVectorization); if (auto *VPI = dyn_cast(VPV)) From 5f56eaff8b033e5d87818bdc9543c1463cc0a755 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 30 Mar 2025 22:27:44 +0100 Subject: [PATCH 0053/1029] [VPlan] Remove duplicated VPDerivedIVRecipe handling (NFC). Also handled by an earlier, more general if above. 
--- llvm/lib/Transforms/Vectorize/VPlanUtils.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h index aa4f446cf06bc..87c5797d9e452 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h @@ -54,8 +54,6 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) { ((Instruction::isBinaryOp(VPI->getOpcode()) || VPI->getOpcode() == VPInstruction::PtrAdd) && all_of(VPI->operands(), isUniformAfterVectorization)); - if (auto *IV = dyn_cast(VPV)) - return all_of(IV->operands(), isUniformAfterVectorization); // VPExpandSCEVRecipes must be placed in the entry and are alway uniform. return isa(VPV); From 848faf49f793968ae6dee1577f507d4cc68b157e Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Sun, 30 Mar 2025 15:08:40 -0700 Subject: [PATCH 0054/1029] [lldb] Combine disassembler gtest binaries for efficiency (#133539) Each of these executables is 642MB for me locally with split DWARF, and we don't need 3 statically linked gtest binaries when one will do. --- .../unittests/Disassembler/ARM/CMakeLists.txt | 12 --------- lldb/unittests/Disassembler/CMakeLists.txt | 27 ++++++++++++++++--- .../Disassembler/RISCV/CMakeLists.txt | 12 --------- .../unittests/Disassembler/x86/CMakeLists.txt | 12 --------- 4 files changed, 24 insertions(+), 39 deletions(-) delete mode 100644 lldb/unittests/Disassembler/ARM/CMakeLists.txt delete mode 100644 lldb/unittests/Disassembler/RISCV/CMakeLists.txt delete mode 100644 lldb/unittests/Disassembler/x86/CMakeLists.txt diff --git a/lldb/unittests/Disassembler/ARM/CMakeLists.txt b/lldb/unittests/Disassembler/ARM/CMakeLists.txt deleted file mode 100644 index 91af06fa19d6f..0000000000000 --- a/lldb/unittests/Disassembler/ARM/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_lldb_unittest(DisassemblerTests - TestArm64Disassembly.cpp - TestArmv7Disassembly.cpp - LINK_LIBS - lldbCore - lldbSymbol - lldbTarget - lldbPluginDisassemblerLLVMC - lldbPluginProcessUtility - LINK_COMPONENTS - Support - ${LLVM_TARGETS_TO_BUILD}) diff --git a/lldb/unittests/Disassembler/CMakeLists.txt b/lldb/unittests/Disassembler/CMakeLists.txt index 208f1807427f4..81aff5902db74 100644 --- a/lldb/unittests/Disassembler/CMakeLists.txt +++ b/lldb/unittests/Disassembler/CMakeLists.txt @@ -1,11 +1,32 @@ +set(disas_srcs "") + if("ARM" IN_LIST LLVM_TARGETS_TO_BUILD) - add_subdirectory(ARM) + list(APPEND + ARM/TestArm64Disassembly.cpp + ARM/TestArmv7Disassembly.cpp + ) endif() if("X86" IN_LIST LLVM_TARGETS_TO_BUILD) - add_subdirectory(x86) + list(APPEND disas_srcs + x86/TestGetControlFlowKindx86.cpp + ) endif() if("RISCV" IN_LIST LLVM_TARGETS_TO_BUILD) - add_subdirectory(RISCV) + list(APPEND disas_srcs + RISCV/TestMCDisasmInstanceRISCV.cpp + ) endif() + +add_lldb_unittest(DisassemblerTests + ${disas_srcs} + LINK_LIBS + lldbCore + lldbSymbol + lldbTarget + lldbPluginDisassemblerLLVMC + lldbPluginProcessUtility + LINK_COMPONENTS + Support + ${LLVM_TARGETS_TO_BUILD}) diff --git a/lldb/unittests/Disassembler/RISCV/CMakeLists.txt b/lldb/unittests/Disassembler/RISCV/CMakeLists.txt deleted file mode 100644 index 5bcc3e948335c..0000000000000 --- a/lldb/unittests/Disassembler/RISCV/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_lldb_unittest(MCDisasmInstanceRISCVTests - TestMCDisasmInstanceRISCV.cpp - LINK_LIBS - lldbCore - lldbSymbol - lldbTarget - lldbPluginDisassemblerLLVMC - lldbPluginProcessUtility - LINK_COMPONENTS - Support - ${LLVM_TARGETS_TO_BUILD} - ) diff --git 
a/lldb/unittests/Disassembler/x86/CMakeLists.txt b/lldb/unittests/Disassembler/x86/CMakeLists.txt deleted file mode 100644 index 31d84cf5d8365..0000000000000 --- a/lldb/unittests/Disassembler/x86/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -add_lldb_unittest(GetControlFlowKindx86Tests - TestGetControlFlowKindx86.cpp - LINK_LIBS - lldbCore - lldbSymbol - lldbTarget - lldbPluginDisassemblerLLVMC - lldbPluginProcessUtility - LINK_COMPONENTS - Support - ${LLVM_TARGETS_TO_BUILD} - ) From 92e591684576e7244d53722b04e840f28c3f03c3 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Sun, 30 Mar 2025 15:14:48 -0700 Subject: [PATCH 0055/1029] [lldb] Fix cmake logic when no targets are configured Should fix reported lldb-remote-linux-ubuntu bot post-submit failure --- lldb/unittests/Disassembler/CMakeLists.txt | 24 ++++++++++++---------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/lldb/unittests/Disassembler/CMakeLists.txt b/lldb/unittests/Disassembler/CMakeLists.txt index 81aff5902db74..2a76158bf90fd 100644 --- a/lldb/unittests/Disassembler/CMakeLists.txt +++ b/lldb/unittests/Disassembler/CMakeLists.txt @@ -19,14 +19,16 @@ if("RISCV" IN_LIST LLVM_TARGETS_TO_BUILD) ) endif() -add_lldb_unittest(DisassemblerTests - ${disas_srcs} - LINK_LIBS - lldbCore - lldbSymbol - lldbTarget - lldbPluginDisassemblerLLVMC - lldbPluginProcessUtility - LINK_COMPONENTS - Support - ${LLVM_TARGETS_TO_BUILD}) +if (disas_srcs) + add_lldb_unittest(DisassemblerTests + ${disas_srcs} + LINK_LIBS + lldbCore + lldbSymbol + lldbTarget + lldbPluginDisassemblerLLVMC + lldbPluginProcessUtility + LINK_COMPONENTS + Support + ${LLVM_TARGETS_TO_BUILD}) +endif() From 3b3d1a5c261419da864d0883eccd040c2b72e237 Mon Sep 17 00:00:00 2001 From: Nicolas van Kempen Date: Sun, 30 Mar 2025 18:48:19 -0400 Subject: [PATCH 0056/1029] [NFC][clang-tidy] Add type annotations to check_clang_tidy (#133140) ``` > python3 -m mypy --strict clang-tools-extra/test/clang-tidy/check_clang_tidy.py Success: no issues found in 1 source file ``` --- .../test/clang-tidy/check_clang_tidy.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py index 5e39c05f76d86..93c49566a90e3 100755 --- a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py +++ b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py @@ -48,15 +48,16 @@ import re import subprocess import sys +from typing import List, Tuple -def write_file(file_name, text): +def write_file(file_name: str, text: str) -> None: with open(file_name, "w", encoding="utf-8") as f: f.write(text) f.truncate() -def try_run(args, raise_error=True): +def try_run(args: List[str], raise_error: bool = True) -> str: try: process_output = subprocess.check_output(args, stderr=subprocess.STDOUT).decode( errors="ignore" @@ -71,12 +72,12 @@ def try_run(args, raise_error=True): # This class represents the appearance of a message prefix in a file. 
class MessagePrefix: - def __init__(self, label): + def __init__(self, label: str) -> None: self.has_message = False - self.prefixes = [] + self.prefixes: List[str] = [] self.label = label - def check(self, file_check_suffix, input_text): + def check(self, file_check_suffix: str, input_text: str) -> bool: self.prefix = self.label + file_check_suffix self.has_message = self.prefix in input_text if self.has_message: @@ -85,7 +86,7 @@ def check(self, file_check_suffix, input_text): class CheckRunner: - def __init__(self, args, extra_args): + def __init__(self, args: argparse.Namespace, extra_args: List[str]) -> None: self.resource_dir = args.resource_dir self.assume_file_name = args.assume_filename self.input_file_name = args.input_file_name @@ -143,11 +144,11 @@ def __init__(self, args, extra_args): if self.resource_dir is not None: self.clang_extra_args.append("-resource-dir=%s" % self.resource_dir) - def read_input(self): + def read_input(self) -> None: with open(self.input_file_name, "r", encoding="utf-8") as input_file: self.input_text = input_file.read() - def get_prefixes(self): + def get_prefixes(self) -> None: for suffix in self.check_suffix: if suffix and not re.match("^[A-Z0-9\\-]+$", suffix): sys.exit( @@ -189,7 +190,7 @@ def get_prefixes(self): ) assert expect_diagnosis or self.expect_no_diagnosis - def prepare_test_inputs(self): + def prepare_test_inputs(self) -> None: # Remove the contents of the CHECK lines to avoid CHECKs matching on # themselves. We need to keep the comments to preserve line numbers while # avoiding empty lines which could potentially trigger formatting-related @@ -198,7 +199,7 @@ def prepare_test_inputs(self): write_file(self.temp_file_name, cleaned_test) write_file(self.original_file_name, cleaned_test) - def run_clang_tidy(self): + def run_clang_tidy(self) -> str: args = ( [ "clang-tidy", @@ -238,11 +239,11 @@ def run_clang_tidy(self): print("------------------------------------------------------------------") return clang_tidy_output - def check_no_diagnosis(self, clang_tidy_output): + def check_no_diagnosis(self, clang_tidy_output: str) -> None: if clang_tidy_output != "": sys.exit("No diagnostics were expected, but found the ones above") - def check_fixes(self): + def check_fixes(self) -> None: if self.has_check_fixes: try_run( [ @@ -254,7 +255,7 @@ def check_fixes(self): ] ) - def check_messages(self, clang_tidy_output): + def check_messages(self, clang_tidy_output: str) -> None: if self.has_check_messages: messages_file = self.temp_file_name + ".msg" write_file(messages_file, clang_tidy_output) @@ -268,7 +269,7 @@ def check_messages(self, clang_tidy_output): ] ) - def check_notes(self, clang_tidy_output): + def check_notes(self, clang_tidy_output: str) -> None: if self.has_check_notes: notes_file = self.temp_file_name + ".notes" filtered_output = [ @@ -287,7 +288,7 @@ def check_notes(self, clang_tidy_output): ] ) - def run(self): + def run(self) -> None: self.read_input() if self.export_fixes is None: self.get_prefixes() @@ -313,7 +314,7 @@ def run(self): C_STANDARDS = ["c99", ("c11", "c1x"), "c17", ("c23", "c2x"), "c2y"] -def expand_std(std): +def expand_std(std: str) -> List[str]: split_std, or_later, _ = std.partition("-or-later") if not or_later: @@ -335,11 +336,11 @@ def expand_std(std): return [std] -def csv(string): +def csv(string: str) -> List[str]: return string.split(",") -def parse_arguments(): +def parse_arguments() -> Tuple[argparse.Namespace, List[str]]: parser = argparse.ArgumentParser( prog=pathlib.Path(__file__).stem, 
description=__doc__, @@ -374,7 +375,7 @@ def parse_arguments(): return parser.parse_known_args() -def main(): +def main() -> None: args, extra_args = parse_arguments() abbreviated_stds = args.std From 2796e41ade306c3bf0f2e21311dff406bcf65652 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Sun, 30 Mar 2025 15:53:32 -0700 Subject: [PATCH 0057/1029] [lldb] Remove unused Version.h include in Telemetry.cpp (NFC) --- lldb/source/Core/Telemetry.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lldb/source/Core/Telemetry.cpp b/lldb/source/Core/Telemetry.cpp index 62ebdfc027d81..c7789d43c7899 100644 --- a/lldb/source/Core/Telemetry.cpp +++ b/lldb/source/Core/Telemetry.cpp @@ -10,7 +10,6 @@ #include "lldb/Core/Telemetry.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/UUID.h" -#include "lldb/Version/Version.h" #include "lldb/lldb-enumerations.h" #include "lldb/lldb-forward.h" #include "llvm/ADT/StringRef.h" From e5fcbfa2aa8291a57e5eb03cd458935b458c73c0 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sun, 30 Mar 2025 16:02:49 -0700 Subject: [PATCH 0058/1029] [clang-format] Add an option for editing enum trailing commas (#133576) Also refactor the code that removes/replaces a token. --- clang/docs/ClangFormatStyleOptions.rst | 41 ++++++++ clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Format/Format.h | 34 ++++++ clang/lib/Format/Format.cpp | 117 ++++++++++++++------- clang/unittests/Format/ConfigParseTest.cpp | 8 ++ clang/unittests/Format/FormatTest.cpp | 32 ++++++ 6 files changed, 198 insertions(+), 36 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 9ecac68ae72bf..3f8a5f49313b2 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -3976,6 +3976,47 @@ the configuration (without a prefix: ``Auto``). +.. _EnumTrailingComma: + +**EnumTrailingComma** (``EnumTrailingCommaStyle``) :versionbadge:`clang-format 21` :ref:`¶ ` + Insert a comma (if missing) or remove the comma at the end of an ``enum`` + enumerator list. + + .. warning:: + + Setting this option to any value other than ``Leave`` could lead to + incorrect code formatting due to clang-format's lack of complete semantic + information. As such, extra care should be taken to review code changes + made by this option. + + Possible values: + + * ``ETC_Leave`` (in configuration: ``Leave``) + Don't insert or remove trailing commas. + + .. code-block:: c++ + + enum { a, b, c, }; + enum Color { red, green, blue }; + + * ``ETC_Insert`` (in configuration: ``Insert``) + Insert trailing commas. + + .. code-block:: c++ + + enum { a, b, c, }; + enum Color { red, green, blue, }; + + * ``ETC_Remove`` (in configuration: ``Remove``) + Remove trailing commas. + + .. code-block:: c++ + + enum { a, b, c }; + enum Color { red, green, blue }; + + + .. _ExperimentalAutoDetectBinPacking: **ExperimentalAutoDetectBinPacking** (``Boolean``) :versionbadge:`clang-format 3.7` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index e409f206f6eae..d72beb3a479b0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -492,6 +492,8 @@ clang-format - Allow specifying the language (C, C++, or Objective-C) for a ``.h`` file by adding a special comment (e.g. ``// clang-format Language: ObjC``) near the top of the file. +- Add ``EnumTrailingComma`` option for inserting/removing commas at the end of + ``enum`` enumerator lists. 
libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index fec47a248abb4..cea5e257659d6 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2704,6 +2704,39 @@ struct FormatStyle { /// \version 12 EmptyLineBeforeAccessModifierStyle EmptyLineBeforeAccessModifier; + /// Styles for ``enum`` trailing commas. + enum EnumTrailingCommaStyle : int8_t { + /// Don't insert or remove trailing commas. + /// \code + /// enum { a, b, c, }; + /// enum Color { red, green, blue }; + /// \endcode + ETC_Leave, + /// Insert trailing commas. + /// \code + /// enum { a, b, c, }; + /// enum Color { red, green, blue, }; + /// \endcode + ETC_Insert, + /// Remove trailing commas. + /// \code + /// enum { a, b, c }; + /// enum Color { red, green, blue }; + /// \endcode + ETC_Remove, + }; + + /// Insert a comma (if missing) or remove the comma at the end of an ``enum`` + /// enumerator list. + /// \warning + /// Setting this option to any value other than ``Leave`` could lead to + /// incorrect code formatting due to clang-format's lack of complete semantic + /// information. As such, extra care should be taken to review code changes + /// made by this option. + /// \endwarning + /// \version 21 + EnumTrailingCommaStyle EnumTrailingComma; + /// If ``true``, clang-format detects whether function calls and /// definitions are formatted with one parameter per line. /// @@ -5323,6 +5356,7 @@ struct FormatStyle { DisableFormat == R.DisableFormat && EmptyLineAfterAccessModifier == R.EmptyLineAfterAccessModifier && EmptyLineBeforeAccessModifier == R.EmptyLineBeforeAccessModifier && + EnumTrailingComma == R.EnumTrailingComma && ExperimentalAutoDetectBinPacking == R.ExperimentalAutoDetectBinPacking && FixNamespaceComments == R.FixNamespaceComments && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 28aea86139e0d..b74a8631efe0f 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -361,6 +361,15 @@ struct ScalarEnumerationTraits< } }; +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, FormatStyle::EnumTrailingCommaStyle &Value) { + IO.enumCase(Value, "Leave", FormatStyle::ETC_Leave); + IO.enumCase(Value, "Insert", FormatStyle::ETC_Insert); + IO.enumCase(Value, "Remove", FormatStyle::ETC_Remove); + } +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::IndentExternBlockStyle &Value) { @@ -1042,6 +1051,7 @@ template <> struct MappingTraits { Style.EmptyLineAfterAccessModifier); IO.mapOptional("EmptyLineBeforeAccessModifier", Style.EmptyLineBeforeAccessModifier); + IO.mapOptional("EnumTrailingComma", Style.EnumTrailingComma); IO.mapOptional("ExperimentalAutoDetectBinPacking", Style.ExperimentalAutoDetectBinPacking); IO.mapOptional("FixNamespaceComments", Style.FixNamespaceComments); @@ -1558,6 +1568,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.DisableFormat = false; LLVMStyle.EmptyLineAfterAccessModifier = FormatStyle::ELAAMS_Never; LLVMStyle.EmptyLineBeforeAccessModifier = FormatStyle::ELBAMS_LogicalBlock; + LLVMStyle.EnumTrailingComma = FormatStyle::ETC_Leave; LLVMStyle.ExperimentalAutoDetectBinPacking = false; LLVMStyle.FixNamespaceComments = true; LLVMStyle.ForEachMacros.push_back("foreach"); @@ -2203,6 +2214,21 @@ FormatStyle::GetLanguageStyle(FormatStyle::LanguageKind Language) const { namespace { +void replaceToken(const FormatToken &Token, FormatToken *Next, + const 
SourceManager &SourceMgr, tooling::Replacements &Result, + StringRef Text = "") { + const auto &Tok = Token.Tok; + SourceLocation Start; + if (Next && Next->NewlinesBefore == 0 && Next->isNot(tok::eof)) { + Start = Tok.getLocation(); + Next->WhitespaceRange = Token.WhitespaceRange; + } else { + Start = Token.WhitespaceRange.getBegin(); + } + const auto &Range = CharSourceRange::getCharRange(Start, Tok.getEndLoc()); + cantFail(Result.add(tooling::Replacement(SourceMgr, Range, Text))); +} + class ParensRemover : public TokenAnalyzer { public: ParensRemover(const Environment &Env, const FormatStyle &Style) @@ -2229,20 +2255,8 @@ class ParensRemover : public TokenAnalyzer { continue; for (const auto *Token = Line->First; Token && !Token->Finalized; Token = Token->Next) { - if (!Token->Optional || !Token->isOneOf(tok::l_paren, tok::r_paren)) - continue; - auto *Next = Token->Next; - assert(Next && Next->isNot(tok::eof)); - SourceLocation Start; - if (Next->NewlinesBefore == 0) { - Start = Token->Tok.getLocation(); - Next->WhitespaceRange = Token->WhitespaceRange; - } else { - Start = Token->WhitespaceRange.getBegin(); - } - const auto &Range = - CharSourceRange::getCharRange(Start, Token->Tok.getEndLoc()); - cantFail(Result.add(tooling::Replacement(SourceMgr, Range, " "))); + if (Token->Optional && Token->isOneOf(tok::l_paren, tok::r_paren)) + replaceToken(*Token, Token->Next, SourceMgr, Result, " "); } } } @@ -2331,24 +2345,13 @@ class BracesRemover : public TokenAnalyzer { const auto *NextLine = I + 1 == End ? nullptr : I[1]; for (const auto *Token = Line->First; Token && !Token->Finalized; Token = Token->Next) { - if (!Token->Optional) - continue; - if (!Token->isOneOf(tok::l_brace, tok::r_brace)) + if (!Token->Optional || !Token->isOneOf(tok::l_brace, tok::r_brace)) continue; auto *Next = Token->Next; assert(Next || Token == Line->Last); if (!Next && NextLine) Next = NextLine->First; - SourceLocation Start; - if (Next && Next->NewlinesBefore == 0 && Next->isNot(tok::eof)) { - Start = Token->Tok.getLocation(); - Next->WhitespaceRange = Token->WhitespaceRange; - } else { - Start = Token->WhitespaceRange.getBegin(); - } - const auto &Range = - CharSourceRange::getCharRange(Start, Token->Tok.getEndLoc()); - cantFail(Result.add(tooling::Replacement(SourceMgr, Range, ""))); + replaceToken(*Token, Next, SourceMgr, Result); } } } @@ -2400,16 +2403,51 @@ class SemiRemover : public TokenAnalyzer { assert(Next || Token == Line->Last); if (!Next && NextLine) Next = NextLine->First; - SourceLocation Start; - if (Next && Next->NewlinesBefore == 0 && Next->isNot(tok::eof)) { - Start = Token->Tok.getLocation(); - Next->WhitespaceRange = Token->WhitespaceRange; - } else { - Start = Token->WhitespaceRange.getBegin(); + replaceToken(*Token, Next, SourceMgr, Result); + } + } + } +}; + +class EnumTrailingCommaEditor : public TokenAnalyzer { +public: + EnumTrailingCommaEditor(const Environment &Env, const FormatStyle &Style) + : TokenAnalyzer(Env, Style) {} + + std::pair + analyze(TokenAnnotator &Annotator, + SmallVectorImpl &AnnotatedLines, + FormatTokenLexer &Tokens) override { + AffectedRangeMgr.computeAffectedLines(AnnotatedLines); + tooling::Replacements Result; + editEnumTrailingComma(AnnotatedLines, Result); + return {Result, 0}; + } + +private: + void editEnumTrailingComma(SmallVectorImpl &Lines, + tooling::Replacements &Result) { + const auto &SourceMgr = Env.getSourceManager(); + for (auto *Line : Lines) { + if (!Line->Children.empty()) + editEnumTrailingComma(Line->Children, Result); + if 
(!Line->Affected) + continue; + for (const auto *Token = Line->First; Token && !Token->Finalized; + Token = Token->Next) { + if (Token->isNot(TT_EnumRBrace)) + continue; + const auto *BeforeRBrace = Token->getPreviousNonComment(); + assert(BeforeRBrace); + if (BeforeRBrace->is(TT_EnumLBrace)) // Empty braces. + continue; + if (BeforeRBrace->is(tok::comma)) { + if (Style.EnumTrailingComma == FormatStyle::ETC_Remove) + replaceToken(*BeforeRBrace, BeforeRBrace->Next, SourceMgr, Result); + } else if (Style.EnumTrailingComma == FormatStyle::ETC_Insert) { + cantFail(Result.add(tooling::Replacement( + SourceMgr, BeforeRBrace->Tok.getEndLoc(), 0, ","))); } - const auto &Range = - CharSourceRange::getCharRange(Start, Token->Tok.getEndLoc()); - cantFail(Result.add(tooling::Replacement(SourceMgr, Range, ""))); } } } @@ -3812,6 +3850,13 @@ reformat(const FormatStyle &Style, StringRef Code, }); } + if (Style.EnumTrailingComma != FormatStyle::ETC_Leave) { + Passes.emplace_back([&](const Environment &Env) { + return EnumTrailingCommaEditor(Env, Expanded) + .process(/*SkipAnnotation=*/true); + }); + } + if (Style.FixNamespaceComments) { Passes.emplace_back([&](const Environment &Env) { return NamespaceEndCommentsFixer(Env, Expanded).process(); diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 287191d04d885..2b08b794792e9 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -520,6 +520,14 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("EmptyLineBeforeAccessModifier: Always", EmptyLineBeforeAccessModifier, FormatStyle::ELBAMS_Always); + Style.EnumTrailingComma = FormatStyle::ETC_Insert; + CHECK_PARSE("EnumTrailingComma: Leave", EnumTrailingComma, + FormatStyle::ETC_Leave); + CHECK_PARSE("EnumTrailingComma: Insert", EnumTrailingComma, + FormatStyle::ETC_Insert); + CHECK_PARSE("EnumTrailingComma: Remove", EnumTrailingComma, + FormatStyle::ETC_Remove); + Style.AlignAfterOpenBracket = FormatStyle::BAS_AlwaysBreak; CHECK_PARSE("AlignAfterOpenBracket: Align", AlignAfterOpenBracket, FormatStyle::BAS_Align); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 0b90bd360b758..4dfa135120605 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -27902,6 +27902,38 @@ TEST_F(FormatTest, RemoveSemicolon) { verifyFormat("STRUCT(T, B) { int i; };", Style); } +TEST_F(FormatTest, EnumTrailingComma) { + constexpr StringRef Code("enum : int { /**/ };\n" + "enum {\n" + " a,\n" + " b,\n" + " c, //\n" + "};\n" + "enum Color { red, green, blue /**/ };"); + verifyFormat(Code); + + auto Style = getLLVMStyle(); + Style.EnumTrailingComma = FormatStyle::ETC_Insert; + verifyFormat("enum : int { /**/ };\n" + "enum {\n" + " a,\n" + " b,\n" + " c, //\n" + "};\n" + "enum Color { red, green, blue, /**/ };", + Code, Style); + + Style.EnumTrailingComma = FormatStyle::ETC_Remove; + verifyFormat("enum : int { /**/ };\n" + "enum {\n" + " a,\n" + " b,\n" + " c //\n" + "};\n" + "enum Color { red, green, blue /**/ };", + Code, Style); +} + TEST_F(FormatTest, BreakAfterAttributes) { constexpr StringRef Code("[[maybe_unused]] const int i;\n" "[[foo([[]])]] [[maybe_unused]]\n" From 3acccf042ab8a7b7e663bb2b2fac328d9bf65b38 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 30 Mar 2025 16:38:16 -0700 Subject: [PATCH 0059/1029] [MC] Don't print () around $ names This MIPS behavior from edb9d84dcc4824865e86f963e52d67eb50dde7f5 (2010) is 
obsoleted and misleading. This caused confusion in https://reviews.llvm.org/D123702 ([NVPTX] Disable parens for identifiers starting with '$') Note: $tmp was rejected by AsmParser before https://reviews.llvm.org/D75111 (2020) --- llvm/include/llvm/MC/MCAsmInfo.h | 3 - llvm/include/llvm/MC/MCExpr.h | 3 +- llvm/lib/MC/MCExpr.cpp | 14 +---- .../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 2 +- .../Mips/MCTargetDesc/MipsInstPrinter.cpp | 2 +- .../Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 4 +- .../NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp | 4 -- .../Xtensa/MCTargetDesc/XtensaInstPrinter.cpp | 8 +-- .../CodeGen/AArch64/arm64ec-exit-thunks.ll | 44 ++++++------- .../AArch64/arm64ec-hybrid-patchable.ll | 20 +++--- llvm/test/CodeGen/AArch64/pr58516.ll | 2 +- .../AArch64/win-catchpad-nested-cxx.ll | 10 +-- .../wineh-catchret-label-generation.ll | 2 +- llvm/test/CodeGen/AArch64/wineh-try-catch.ll | 6 +- llvm/test/CodeGen/Mips/ehframe-indirect.ll | 2 +- .../Mips/indirect-jump-hazard/long-branch.ll | 8 +-- llvm/test/CodeGen/Mips/jtstat.ll | 20 +++--- .../CodeGen/Mips/load-store-left-right.ll | 8 +-- llvm/test/CodeGen/Mips/longbranch.ll | 20 +++--- llvm/test/CodeGen/Mips/mcount.ll | 4 +- llvm/test/CodeGen/Mips/micromips-mtc-mfc.ll | 4 +- llvm/test/CodeGen/Mips/mips16ex.ll | 2 +- llvm/test/CodeGen/Mips/reloc-jalr.ll | 8 +-- llvm/test/CodeGen/Mips/shrink-wrapping.ll | 8 +-- llvm/test/CodeGen/Mips/unalignedload.ll | 16 ++--- .../xray-mips-attribute-instrumentation.ll | 10 +-- llvm/test/CodeGen/X86/catchpad-reuse.ll | 10 +-- llvm/test/CodeGen/X86/dollar-name.ll | 6 +- .../X86/seh-unwind-inline-asm-codegen.ll | 4 +- llvm/test/CodeGen/X86/stack-coloring-wineh.ll | 4 +- .../CodeGen/X86/win-catchpad-nested-cxx.ll | 14 ++--- llvm/test/CodeGen/X86/win-catchpad.ll | 16 ++--- llvm/test/CodeGen/X86/win-cleanuppad.ll | 10 +-- llvm/test/CodeGen/X86/win-funclet-cfi.ll | 4 +- llvm/test/CodeGen/X86/win32-eh.ll | 4 +- .../X86/windows-seh-EHa-CppCatchDotDotDot.ll | 8 +-- .../CodeGen/X86/windows-seh-EHa-CppDtors01.ll | 4 +- llvm/test/DebugInfo/COFF/jump-table.ll | 4 +- llvm/test/MC/ARM/arm-branches.s | 14 ++--- .../MC/AsmParser/dollars-in-identifiers.s | 2 +- llvm/test/MC/MachO/dollar-identifier.s | 2 +- llvm/test/MC/Mips/expansion-jal-sym-pic.s | 62 +++++++++---------- llvm/test/MC/Mips/macro-div.s | 24 +++---- llvm/test/MC/Mips/macro-divu.s | 6 +- llvm/test/MC/Mips/macro-rem.s | 6 +- llvm/test/MC/Mips/macro-remu.s | 4 +- llvm/test/MC/Mips/mips-fpu-instructions.s | 2 +- llvm/test/MC/Mips/mips1/valid.s | 2 +- llvm/test/MC/Mips/mips2/valid.s | 2 +- llvm/test/MC/Mips/mips32/valid.s | 2 +- llvm/test/MC/Mips/mips32r2/valid.s | 2 +- llvm/test/MC/Mips/mips32r3/valid.s | 2 +- llvm/test/MC/Mips/mips32r5/valid.s | 2 +- llvm/test/MC/Mips/mips32r6/valid.s | 2 +- llvm/test/MC/Mips/mips64r6/valid.s | 2 +- llvm/test/MC/Mips/mips_directives.s | 2 +- .../MC/Mips/reloc-directive-label-offset.s | 10 +-- 57 files changed, 227 insertions(+), 245 deletions(-) diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index d7beebf614516..3134ee02f54be 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -650,9 +650,6 @@ class MCAsmInfo { bool doDwarfFDESymbolsUseAbsDiff() const { return DwarfFDESymbolsUseAbsDiff; } bool useDwarfRegNumForCFI() const { return DwarfRegNumForCFI; } bool useParensForSymbolVariant() const { return UseParensForSymbolVariant; } - bool useParensForDollarSignNames() const { - return UseParensForDollarSignNames; - } bool supportsExtendedDwarfLocDirective() const { return 
SupportsExtendedDwarfLocDirective; } diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index edecfe4dd4112..d6829f2bcc734 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -81,8 +81,7 @@ class MCExpr { /// \name Utility Methods /// @{ - void print(raw_ostream &OS, const MCAsmInfo *MAI, - bool InParens = false) const; + void print(raw_ostream &OS, const MCAsmInfo *MAI) const; void dump() const; /// Returns whether the given symbol is used anywhere in the expression or diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index dd45a94ea892a..253247561354b 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -40,7 +40,7 @@ STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations"); // VariantKind printing and formatting utilize MAI. operator<< (dump and some // target code) specifies MAI as nullptr and should be avoided when MAI is // needed. -void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const { +void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const { switch (getKind()) { case MCExpr::Target: return cast(this)->printImpl(OS, MAI); @@ -75,17 +75,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const { case MCExpr::SymbolRef: { const MCSymbolRefExpr &SRE = cast(*this); const MCSymbol &Sym = SRE.getSymbol(); - // Parenthesize names that start with $ so that they don't look like - // absolute names. - bool UseParens = MAI && MAI->useParensForDollarSignNames() && !InParens && - Sym.getName().starts_with('$'); - - if (UseParens) { - OS << '('; - Sym.print(OS, MAI); - OS << ')'; - } else - Sym.print(OS, MAI); + Sym.print(OS, MAI); const MCSymbolRefExpr::VariantKind Kind = SRE.getKind(); if (Kind != MCSymbolRefExpr::VK_None) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index 7fff2e515b046..678a7be1f2456 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -77,7 +77,7 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { break; } for (const auto *It = Args.begin(); It != Args.end(); ++It) { - (*It)->print(OS, MAI, /*InParens=*/false); + (*It)->print(OS, MAI); if ((It + 1) != Args.end()) OS << ", "; } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp index dc7e887167d30..d743f00da273b 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp @@ -138,7 +138,7 @@ void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } assert(Op.isExpr() && "unknown operand kind in printOperand"); - Op.getExpr()->print(O, &MAI, true); + Op.getExpr()->print(O, &MAI); } void MipsInstPrinter::printJumpOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp index d5eca7b65b2b1..39dc329d80222 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp @@ -45,7 +45,7 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { case MEK_DTPREL: // MEK_DTPREL is used for marking TLS DIEExpr only // and contains a regular sub-expression. 
- getSubExpr()->print(OS, MAI, true); + getSubExpr()->print(OS, MAI); return; case MEK_CALL_HI16: OS << "%call_hi"; @@ -125,7 +125,7 @@ void MipsMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { if (Expr->evaluateAsAbsolute(AbsVal)) OS << AbsVal; else - Expr->print(OS, MAI, true); + Expr->print(OS, MAI); OS << ')'; } diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp index b453024ba3725..614b3214fd275 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp @@ -58,10 +58,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple, UseIntegratedAssembler = false; - // Avoid using parens for identifiers starting with $ - ptxas does - // not expect them. - UseParensForDollarSignNames = false; - // ptxas does not support DWARF `.file fileno directory filename' // syntax as of v11.X. EnableDwarfFileDirectoryDefault = false; diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp index 5f4991b51d246..da7e9098f7544 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaInstPrinter.cpp @@ -100,7 +100,7 @@ void XtensaInstPrinter::printBranchTarget(const MCInst *MI, int OpNum, OS << '+'; OS << Val; } else if (MC.isExpr()) - MC.getExpr()->print(OS, &MAI, true); + MC.getExpr()->print(OS, &MAI); else llvm_unreachable("Invalid operand"); } @@ -115,7 +115,7 @@ void XtensaInstPrinter::printJumpTarget(const MCInst *MI, int OpNum, OS << '+'; OS << Val; } else if (MC.isExpr()) - MC.getExpr()->print(OS, &MAI, true); + MC.getExpr()->print(OS, &MAI); else llvm_unreachable("Invalid operand"); ; @@ -131,7 +131,7 @@ void XtensaInstPrinter::printCallOperand(const MCInst *MI, int OpNum, OS << '+'; OS << Val; } else if (MC.isExpr()) - MC.getExpr()->print(OS, &MAI, true); + MC.getExpr()->print(OS, &MAI); else llvm_unreachable("Invalid operand"); } @@ -149,7 +149,7 @@ void XtensaInstPrinter::printL32RTarget(const MCInst *MI, int OpNum, O << ". 
"; O << Value; } else if (MC.isExpr()) - MC.getExpr()->print(O, &MAI, true); + MC.getExpr()->print(O, &MAI); else llvm_unreachable("Invalid operand"); } diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll index dcc675839b714..cba7a8100930f 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll @@ -35,8 +35,8 @@ declare void @no_op() nounwind; ; CHECK-NEXT: adrp x11, no_op ; CHECK-NEXT: add x11, x11, :lo12:no_op ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$v$v) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$v$v) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -82,8 +82,8 @@ declare i64 @simple_integers(i8, i16, i32, i64) nounwind; ; CHECK-NEXT: adrp x11, simple_integers ; CHECK-NEXT: add x11, x11, :lo12:simple_integers ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$i8$i8i8i8i8) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$i8$i8i8i8i8) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$i8$i8i8i8i8 +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$i8$i8i8i8i8 ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -129,8 +129,8 @@ declare double @simple_floats(float, double) nounwind; ; CHECK-NEXT: adrp x11, simple_floats ; CHECK-NEXT: add x11, x11, :lo12:simple_floats ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$d$fd) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$d$fd) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$d$fd +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$d$fd ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -176,8 +176,8 @@ declare void @has_varargs(...) 
nounwind; ; CHECK-NEXT: adrp x11, has_varargs ; CHECK-NEXT: add x11, x11, :lo12:has_varargs ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$v$varargs) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$v$varargs) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$varargs +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$varargs ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -223,8 +223,8 @@ declare void @has_sret(ptr sret([100 x i8])) nounwind; ; CHECK-NEXT: adrp x11, has_sret ; CHECK-NEXT: add x11, x11, :lo12:has_sret ; CHECK-NEXT: ldr x9, [x9, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m100$v) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m100$v) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m100$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m100$v ; CHECK-NEXT: blr x9 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -271,8 +271,8 @@ declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind; ; CHECK: adrp x11, has_aligned_sret ; CHECK: add x11, x11, :lo12:has_aligned_sret ; CHECK: ldr x9, [x9, :lo12:__os_arm64x_check_icall] -; CHECK: adrp x10, ($iexit_thunk$cdecl$m16$v) -; CHECK: add x10, x10, :lo12:($iexit_thunk$cdecl$m16$v) +; CHECK: adrp x10, $iexit_thunk$cdecl$m16$v +; CHECK: add x10, x10, :lo12:$iexit_thunk$cdecl$m16$v ; CHECK: blr x9 ; CHECK: .seh_startepilogue ; CHECK: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -325,8 +325,8 @@ declare [2 x i8] @small_array([2 x i8], [2 x float]) nounwind; ; CHECK-NEXT: adrp x11, small_array ; CHECK-NEXT: add x11, x11, :lo12:small_array ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m2$m2F8) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m2$m2F8) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m2$m2F8 +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m2$m2F8 ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -382,8 +382,8 @@ declare [3 x i64] @large_array([3 x i64], [2 x double], [2 x [2 x i64]]) nounwin ; CHECK-NEXT: adrp x11, large_array ; CHECK-NEXT: add x11, x11, :lo12:large_array ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m24$m24D16m32) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m24$m24D16m32) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m24$m24D16m32 +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m24$m24D16m32 ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -446,8 +446,8 @@ declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind; ; CHECK-NEXT: adrp x11, simple_struct ; CHECK-NEXT: add x11, x11, :lo12:simple_struct ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m8$i8m8m16m24) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m8$i8m8m16m24) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m8$i8m8m16m24 +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m8$i8m8m16m24 ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -499,8 +499,8 @@ declare <4 x i8> @small_vector(<4 x i8> %0) nounwind; ; CHECK-NEXT: adrp x11, small_vector ; CHECK-NEXT: add x11, x11, :lo12:small_vector ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; 
CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m$m) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m$m) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m$m +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m$m ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -549,8 +549,8 @@ declare <8 x i16> @large_vector(<8 x i16> %0) nounwind; ; CHECK-NEXT: adrp x11, large_vector ; CHECK-NEXT: add x11, x11, :lo12:large_vector ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m16$m16) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m16$m16) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m16$m16 +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m16$m16 ; CHECK-NEXT: blr x8 ; CHECK-NEXT: .seh_startepilogue ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll index 1ed6a273338ab..20ff5fc5bc5e1 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll @@ -81,8 +81,8 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: adrp x11, func ; CHECK-NEXT: add x11, x11, :lo12:func ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$v$v) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$v$v) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v ; CHECK-NEXT: str x11, [sp, #8] ; CHECK-NEXT: blr x8 ; CHECK-NEXT: blr x11 @@ -111,8 +111,8 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: adrp x11, func ; CHECK-NEXT: add x11, x11, :lo12:func ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_dispatch_call] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$i8$v) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$i8$v) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$i8$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$i8$v ; CHECK-NEXT: adrp x9, "#func$hp_target" ; CHECK-NEXT: add x9, x9, :lo12:"#func$hp_target" ; CHECK-NEXT: blr x8 @@ -138,8 +138,8 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: adrp x11, has_varargs ; CHECK-NEXT: add x11, x11, :lo12:has_varargs ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_dispatch_call] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$v$varargs) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$v$varargs) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$varargs +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$varargs ; CHECK-NEXT: adrp x9, "#has_varargs$hp_target" ; CHECK-NEXT: add x9, x9, :lo12:"#has_varargs$hp_target" ; CHECK-NEXT: blr x8 @@ -165,8 +165,8 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: adrp x11, has_sret ; CHECK-NEXT: add x11, x11, :lo12:has_sret ; CHECK-NEXT: ldr x12, [x9, :lo12:__os_arm64x_dispatch_call] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m100$v) -; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m100$v) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$m100$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$m100$v ; CHECK-NEXT: adrp x9, "#has_sret$hp_target" ; CHECK-NEXT: add x9, x9, :lo12:"#has_sret$hp_target" ; CHECK-NEXT: blr x12 @@ -192,8 +192,8 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: adrp x11, exp ; CHECK-NEXT: add x11, x11, :lo12:exp ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_dispatch_call] -; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$v$v) -; CHECK-NEXT: add x10, 
x10, :lo12:($iexit_thunk$cdecl$v$v) +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v ; CHECK-NEXT: adrp x9, "#exp$hp_target" ; CHECK-NEXT: add x9, x9, :lo12:"#exp$hp_target" ; CHECK-NEXT: blr x8 diff --git a/llvm/test/CodeGen/AArch64/pr58516.ll b/llvm/test/CodeGen/AArch64/pr58516.ll index 3361ded48d4e2..d1775a2e707b6 100644 --- a/llvm/test/CodeGen/AArch64/pr58516.ll +++ b/llvm/test/CodeGen/AArch64/pr58516.ll @@ -56,7 +56,7 @@ define void @osfx(ptr %this) comdat personality ptr @__CxxFrameHandler3 { ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_handlerdata -; CHECK-NEXT: .word ($cppxdata$osfx)@IMGREL +; CHECK-NEXT: .word $cppxdata$osfx@IMGREL ; CHECK-NEXT: .section .text,"xr",discard,osfx ; CHECK-NEXT: .seh_endproc ; CHECK-NEXT: .def "?catch$3@?0?osfx@4HA"; diff --git a/llvm/test/CodeGen/AArch64/win-catchpad-nested-cxx.ll b/llvm/test/CodeGen/AArch64/win-catchpad-nested-cxx.ll index 6d0e9d6929709..0203c337cc68b 100644 --- a/llvm/test/CodeGen/AArch64/win-catchpad-nested-cxx.ll +++ b/llvm/test/CodeGen/AArch64/win-catchpad-nested-cxx.ll @@ -45,12 +45,12 @@ handler2: ; CHECK-LABEL: $cppxdata$try_in_catch: ; CHECK-NEXT: .word 429065506 ; CHECK-NEXT: .word 4 -; CHECK-NEXT: .word ($stateUnwindMap$try_in_catch) +; CHECK-NEXT: .word $stateUnwindMap$try_in_catch ; CHECK-NEXT: .word 2 -; CHECK-NEXT: .word ($tryMap$try_in_catch) +; CHECK-NEXT: .word $tryMap$try_in_catch ; ip2state num + ptr ; CHECK-NEXT: .word 7 -; CHECK-NEXT: .word ($ip2state$try_in_catch) +; CHECK-NEXT: .word $ip2state$try_in_catch ; unwindhelp offset ; CHECK-NEXT: .word -16 ; CHECK-NEXT: .word 0 @@ -62,12 +62,12 @@ handler2: ; CHECK-NEXT: .word 0 ; CHECK-NEXT: .word 3 ; CHECK-NEXT: .word 1 -; CHECK-NEXT: .word ($handlerMap$0$try_in_catch) +; CHECK-NEXT: .word $handlerMap$0$try_in_catch ; CHECK-NEXT: .word 2 ; CHECK-NEXT: .word 2 ; CHECK-NEXT: .word 3 ; CHECK-NEXT: .word 1 -; CHECK-NEXT: .word ($handlerMap$1$try_in_catch) +; CHECK-NEXT: .word $handlerMap$1$try_in_catch ; CHECK: $handlerMap$0$try_in_catch: ; CHECK-NEXT: .word 64 diff --git a/llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll b/llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll index 1f30865c98e19..3f7df585c52b4 100644 --- a/llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll +++ b/llvm/test/CodeGen/AArch64/wineh-catchret-label-generation.ll @@ -35,7 +35,7 @@ define fastcc ptr @test_function(i1 %0, ptr %_Fmtfl.i.i, i1 %1) personality ptr ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_handlerdata -; CHECK-NEXT: .word ($cppxdata$test_function)@IMGREL +; CHECK-NEXT: .word $cppxdata$test_function@IMGREL ; CHECK-NEXT: .text ; CHECK-NEXT: .seh_endproc ; CHECK-NEXT: .def "?catch$5@?0?test_function@4HA"; diff --git a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll index c3b5a8968d7bb..e10b05e2488fd 100644 --- a/llvm/test/CodeGen/AArch64/wineh-try-catch.ll +++ b/llvm/test/CodeGen/AArch64/wineh-try-catch.ll @@ -76,11 +76,11 @@ ; CHECK-LABEL: "$cppxdata$?func@@YAHXZ": ; CHECK-NEXT: .word 429065506 // MagicNumber ; CHECK-NEXT: .word 2 // MaxState -; CHECK-NEXT: .word ("$stateUnwindMap$?func@@YAHXZ")@IMGREL // UnwindMap +; CHECK-NEXT: .word "$stateUnwindMap$?func@@YAHXZ"@IMGREL // UnwindMap ; CHECK-NEXT: .word 1 // NumTryBlocks -; CHECK-NEXT: .word ("$tryMap$?func@@YAHXZ")@IMGREL // TryBlockMap +; CHECK-NEXT: .word "$tryMap$?func@@YAHXZ"@IMGREL // TryBlockMap ; CHECK-NEXT: .word 4 // IPMapEntries -; CHECK-NEXT: 
.word ("$ip2state$?func@@YAHXZ")@IMGREL // IPToStateXData +; CHECK-NEXT: .word "$ip2state$?func@@YAHXZ"@IMGREL // IPToStateXData ; CHECK-NEXT: .word -16 // UnwindHelp ; UNWIND: Function: ?func@@YAHXZ (0x0) diff --git a/llvm/test/CodeGen/Mips/ehframe-indirect.ll b/llvm/test/CodeGen/Mips/ehframe-indirect.ll index 1cd2b86a8e158..901095cc6e7f6 100644 --- a/llvm/test/CodeGen/Mips/ehframe-indirect.ll +++ b/llvm/test/CodeGen/Mips/ehframe-indirect.ll @@ -51,7 +51,7 @@ declare void @foo() ; O32: [[PC_LABEL:\$tmp[0-9]+]]: ; N32: [[PC_LABEL:\.Ltmp[0-9]+]]: ; N64: [[PC_LABEL:\.Ltmp[0-9]+]]: -; O32: .4byte ($_ZTISt9exception.DW.stub)-([[PC_LABEL]]) +; O32: .4byte $_ZTISt9exception.DW.stub-[[PC_LABEL]] ; N32: .4byte .L_ZTISt9exception.DW.stub-[[PC_LABEL]] ; N64: .4byte .L_ZTISt9exception.DW.stub-[[PC_LABEL]] ; O32: $_ZTISt9exception.DW.stub: diff --git a/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll b/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll index e8771feefad33..df15658b54f52 100644 --- a/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll +++ b/llvm/test/CodeGen/Mips/indirect-jump-hazard/long-branch.ll @@ -34,9 +34,9 @@ define void @test1(i32 signext %s) { ; O32-PIC-NEXT: # %bb.1: # %entry ; O32-PIC-NEXT: addiu $sp, $sp, -8 ; O32-PIC-NEXT: sw $ra, 0($sp) -; O32-PIC-NEXT: lui $1, %hi(($BB0_4)-($BB0_2)) +; O32-PIC-NEXT: lui $1, %hi($BB0_4-$BB0_2) ; O32-PIC-NEXT: bal $BB0_2 -; O32-PIC-NEXT: addiu $1, $1, %lo(($BB0_4)-($BB0_2)) +; O32-PIC-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) ; O32-PIC-NEXT: $BB0_2: # %entry ; O32-PIC-NEXT: addu $1, $ra, $1 ; O32-PIC-NEXT: lw $ra, 0($sp) @@ -59,8 +59,8 @@ define void @test1(i32 signext %s) { ; O32-R6-PIC-NEXT: # %bb.1: # %entry ; O32-R6-PIC-NEXT: addiu $sp, $sp, -8 ; O32-R6-PIC-NEXT: sw $ra, 0($sp) -; O32-R6-PIC-NEXT: lui $1, %hi(($BB0_4)-($BB0_2)) -; O32-R6-PIC-NEXT: addiu $1, $1, %lo(($BB0_4)-($BB0_2)) +; O32-R6-PIC-NEXT: lui $1, %hi($BB0_4-$BB0_2) +; O32-R6-PIC-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) ; O32-R6-PIC-NEXT: balc $BB0_2 ; O32-R6-PIC-NEXT: $BB0_2: # %entry ; O32-R6-PIC-NEXT: addu $1, $ra, $1 diff --git a/llvm/test/CodeGen/Mips/jtstat.ll b/llvm/test/CodeGen/Mips/jtstat.ll index 21d7aba6aaa61..233ff110f137c 100644 --- a/llvm/test/CodeGen/Mips/jtstat.ll +++ b/llvm/test/CodeGen/Mips/jtstat.ll @@ -59,13 +59,13 @@ sw.epilog: ; preds = %entry, %sw.bb7, %sw ; CHECK-STATIC16: li ${{[0-9]+}}, %hi($JTI{{[0-9]+}}_{{[0-9]+}}) ; CHECK-STATIC16: lw ${{[0-9]+}}, %lo($JTI{{[0-9]+}}_{{[0-9]+}})(${{[0-9]+}}) ; CHECK-STATIC16: $JTI{{[0-9]+}}_{{[0-9]+}}: -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) -; CHECK-STATIC16: .4byte ($BB0_{{[0-9]+}}) +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} +; CHECK-STATIC16: .4byte $BB0_{{[0-9]+}} diff --git a/llvm/test/CodeGen/Mips/load-store-left-right.ll b/llvm/test/CodeGen/Mips/load-store-left-right.ll index 0b7e51cbf7dc6..3f318654d83b8 
100644 --- a/llvm/test/CodeGen/Mips/load-store-left-right.ll +++ b/llvm/test/CodeGen/Mips/load-store-left-right.ll @@ -944,7 +944,7 @@ define void @pass_array_byval() nounwind { ; MIPS32-EL-NEXT: lbu $1, 6($1) ; MIPS32-EL-NEXT: sll $1, $1, 16 ; MIPS32-EL-NEXT: lw $25, %call16(extern_func)($gp) -; MIPS32-EL-NEXT: .reloc ($tmp0), R_MIPS_JALR, extern_func +; MIPS32-EL-NEXT: .reloc $tmp0, R_MIPS_JALR, extern_func ; MIPS32-EL-NEXT: $tmp0: ; MIPS32-EL-NEXT: jalr $25 ; MIPS32-EL-NEXT: or $5, $2, $1 @@ -970,7 +970,7 @@ define void @pass_array_byval() nounwind { ; MIPS32-EB-NEXT: lbu $1, 6($1) ; MIPS32-EB-NEXT: sll $1, $1, 8 ; MIPS32-EB-NEXT: lw $25, %call16(extern_func)($gp) -; MIPS32-EB-NEXT: .reloc ($tmp0), R_MIPS_JALR, extern_func +; MIPS32-EB-NEXT: .reloc $tmp0, R_MIPS_JALR, extern_func ; MIPS32-EB-NEXT: $tmp0: ; MIPS32-EB-NEXT: jalr $25 ; MIPS32-EB-NEXT: or $5, $2, $1 @@ -991,7 +991,7 @@ define void @pass_array_byval() nounwind { ; MIPS32R6-EL-NEXT: sll $3, $3, 16 ; MIPS32R6-EL-NEXT: lw $4, 0($1) ; MIPS32R6-EL-NEXT: lw $25, %call16(extern_func)($gp) -; MIPS32R6-EL-NEXT: .reloc ($tmp0), R_MIPS_JALR, extern_func +; MIPS32R6-EL-NEXT: .reloc $tmp0, R_MIPS_JALR, extern_func ; MIPS32R6-EL-NEXT: $tmp0: ; MIPS32R6-EL-NEXT: jalr $25 ; MIPS32R6-EL-NEXT: or $5, $2, $3 @@ -1013,7 +1013,7 @@ define void @pass_array_byval() nounwind { ; MIPS32R6-EB-NEXT: sll $3, $3, 16 ; MIPS32R6-EB-NEXT: lw $4, 0($1) ; MIPS32R6-EB-NEXT: lw $25, %call16(extern_func)($gp) -; MIPS32R6-EB-NEXT: .reloc ($tmp0), R_MIPS_JALR, extern_func +; MIPS32R6-EB-NEXT: .reloc $tmp0, R_MIPS_JALR, extern_func ; MIPS32R6-EB-NEXT: $tmp0: ; MIPS32R6-EB-NEXT: jalr $25 ; MIPS32R6-EB-NEXT: or $5, $3, $2 diff --git a/llvm/test/CodeGen/Mips/longbranch.ll b/llvm/test/CodeGen/Mips/longbranch.ll index d348f03295811..66ee3859ae448 100644 --- a/llvm/test/CodeGen/Mips/longbranch.ll +++ b/llvm/test/CodeGen/Mips/longbranch.ll @@ -58,9 +58,9 @@ define void @test1(i32 signext %s) { ; O32-PIC-NEXT: # %bb.1: # %entry ; O32-PIC-NEXT: addiu $sp, $sp, -8 ; O32-PIC-NEXT: sw $ra, 0($sp) -; O32-PIC-NEXT: lui $1, %hi(($BB0_4)-($BB0_2)) +; O32-PIC-NEXT: lui $1, %hi($BB0_4-$BB0_2) ; O32-PIC-NEXT: bal $BB0_2 -; O32-PIC-NEXT: addiu $1, $1, %lo(($BB0_4)-($BB0_2)) +; O32-PIC-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) ; O32-PIC-NEXT: $BB0_2: # %entry ; O32-PIC-NEXT: addu $1, $ra, $1 ; O32-PIC-NEXT: lw $ra, 0($sp) @@ -98,8 +98,8 @@ define void @test1(i32 signext %s) { ; O32-R6-PIC-NEXT: # %bb.1: # %entry ; O32-R6-PIC-NEXT: addiu $sp, $sp, -8 ; O32-R6-PIC-NEXT: sw $ra, 0($sp) -; O32-R6-PIC-NEXT: lui $1, %hi(($BB0_4)-($BB0_2)) -; O32-R6-PIC-NEXT: addiu $1, $1, %lo(($BB0_4)-($BB0_2)) +; O32-R6-PIC-NEXT: lui $1, %hi($BB0_4-$BB0_2) +; O32-R6-PIC-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) ; O32-R6-PIC-NEXT: balc $BB0_2 ; O32-R6-PIC-NEXT: $BB0_2: # %entry ; O32-R6-PIC-NEXT: addu $1, $ra, $1 @@ -212,9 +212,9 @@ define void @test1(i32 signext %s) { ; MICROMIPS-NEXT: # %bb.1: # %entry ; MICROMIPS-NEXT: addiu $sp, $sp, -8 ; MICROMIPS-NEXT: sw $ra, 0($sp) -; MICROMIPS-NEXT: lui $1, %hi(($BB0_4)-($BB0_2)) +; MICROMIPS-NEXT: lui $1, %hi($BB0_4-$BB0_2) ; MICROMIPS-NEXT: bal $BB0_2 -; MICROMIPS-NEXT: addiu $1, $1, %lo(($BB0_4)-($BB0_2)) +; MICROMIPS-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) ; MICROMIPS-NEXT: $BB0_2: # %entry ; MICROMIPS-NEXT: addu $1, $ra, $1 ; MICROMIPS-NEXT: lw $ra, 0($sp) @@ -261,8 +261,8 @@ define void @test1(i32 signext %s) { ; MICROMIPSR6PIC-NEXT: # %bb.1: # %entry ; MICROMIPSR6PIC-NEXT: addiu $sp, $sp, -8 ; MICROMIPSR6PIC-NEXT: sw $ra, 0($sp) -; MICROMIPSR6PIC-NEXT: lui $1, 
%hi(($BB0_4)-($BB0_2)) -; MICROMIPSR6PIC-NEXT: addiu $1, $1, %lo(($BB0_4)-($BB0_2)) +; MICROMIPSR6PIC-NEXT: lui $1, %hi($BB0_4-$BB0_2) +; MICROMIPSR6PIC-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) ; MICROMIPSR6PIC-NEXT: balc $BB0_2 ; MICROMIPSR6PIC-NEXT: $BB0_2: # %entry ; MICROMIPSR6PIC-NEXT: addu $1, $ra, $1 @@ -285,9 +285,9 @@ define void @test1(i32 signext %s) { ; NACL-NEXT: # %bb.1: ; NACL-NEXT: addiu $sp, $sp, -8 ; NACL-NEXT: sw $ra, 0($sp) -; NACL-NEXT: lui $1, %hi(($BB0_4)-($BB0_2)) +; NACL-NEXT: lui $1, %hi($BB0_4-$BB0_2) ; NACL-NEXT: bal $BB0_2 -; NACL-NEXT: addiu $1, $1, %lo(($BB0_4)-($BB0_2)) +; NACL-NEXT: addiu $1, $1, %lo($BB0_4-$BB0_2) ; NACL-NEXT: $BB0_2: ; NACL-NEXT: addu $1, $ra, $1 ; NACL-NEXT: lw $ra, 0($sp) diff --git a/llvm/test/CodeGen/Mips/mcount.ll b/llvm/test/CodeGen/Mips/mcount.ll index 41100e6cbeb6f..713666ddee649 100644 --- a/llvm/test/CodeGen/Mips/mcount.ll +++ b/llvm/test/CodeGen/Mips/mcount.ll @@ -40,7 +40,7 @@ define void @foo() { ; MIPS32-PIC-NEXT: addu $gp, $2, $25 ; MIPS32-PIC-NEXT: lw $25, %call16(_mcount)($gp) ; MIPS32-PIC-NEXT: move $1, $ra -; MIPS32-PIC-NEXT: .reloc ($tmp0), R_MIPS_JALR, _mcount +; MIPS32-PIC-NEXT: .reloc $tmp0, R_MIPS_JALR, _mcount ; MIPS32-PIC-NEXT: $tmp0: ; MIPS32-PIC-NEXT: jalr $25 ; MIPS32-PIC-NEXT: addiu $sp, $sp, -8 @@ -107,7 +107,7 @@ define void @foo() { ; MIPS32-MM-PIC-NEXT: addu $gp, $2, $25 ; MIPS32-MM-PIC-NEXT: lw $25, %call16(_mcount)($gp) ; MIPS32-MM-PIC-NEXT: move $1, $ra -; MIPS32-MM-PIC-NEXT: .reloc ($tmp0), R_MICROMIPS_JALR, _mcount +; MIPS32-MM-PIC-NEXT: .reloc $tmp0, R_MICROMIPS_JALR, _mcount ; MIPS32-MM-PIC-NEXT: $tmp0: ; MIPS32-MM-PIC-NEXT: jalr $25 ; MIPS32-MM-PIC-NEXT: addiu $sp, $sp, -8 diff --git a/llvm/test/CodeGen/Mips/micromips-mtc-mfc.ll b/llvm/test/CodeGen/Mips/micromips-mtc-mfc.ll index e23f0760d8d91..66b484b47550f 100644 --- a/llvm/test/CodeGen/Mips/micromips-mtc-mfc.ll +++ b/llvm/test/CodeGen/Mips/micromips-mtc-mfc.ll @@ -12,11 +12,11 @@ define double @foo(double %a, double %b) { ; MM2-NEXT: mthc1 $zero, $f2 # encoding: [0x54,0x02,0x38,0x3b] ; MM2-NEXT: c.ule.d $f12, $f2 # encoding: [0x54,0x4c,0x05,0xfc] ; MM2-NEXT: bc1t $BB0_2 # encoding: [0x43,0xa0,A,A] -; MM2-NEXT: # fixup A - offset: 0, value: ($BB0_2), kind: fixup_MICROMIPS_PC16_S1 +; MM2-NEXT: # fixup A - offset: 0, value: $BB0_2, kind: fixup_MICROMIPS_PC16_S1 ; MM2-NEXT: nop # encoding: [0x00,0x00,0x00,0x00] ; MM2-NEXT: # %bb.1: # %entry ; MM2-NEXT: j $BB0_2 # encoding: [0b110101AA,A,A,A] -; MM2-NEXT: # fixup A - offset: 0, value: ($BB0_2), kind: fixup_MICROMIPS_26_S1 +; MM2-NEXT: # fixup A - offset: 0, value: $BB0_2, kind: fixup_MICROMIPS_26_S1 ; MM2-NEXT: nop # encoding: [0x00,0x00,0x00,0x00] ; MM2-NEXT: $BB0_2: # %return ; MM2-NEXT: jrc $ra # encoding: [0x45,0xbf] diff --git a/llvm/test/CodeGen/Mips/mips16ex.ll b/llvm/test/CodeGen/Mips/mips16ex.ll index 7dbccc7b223bd..fb9a44e767516 100644 --- a/llvm/test/CodeGen/Mips/mips16ex.ll +++ b/llvm/test/CodeGen/Mips/mips16ex.ll @@ -2,7 +2,7 @@ ;16: main: ;16-NEXT: [[TMP:.*]]: -;16-NEXT: .set $func_begin0, ([[TMP]]) +;16-NEXT: .set $func_begin0, [[TMP]] ;16-NEXT: .cfi_startproc ;16-NEXT: .cfi_personality @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1 diff --git a/llvm/test/CodeGen/Mips/reloc-jalr.ll b/llvm/test/CodeGen/Mips/reloc-jalr.ll index 88bbfa7fdfc36..f7cdfbb64c285 100644 --- a/llvm/test/CodeGen/Mips/reloc-jalr.ll +++ b/llvm/test/CodeGen/Mips/reloc-jalr.ll @@ -102,9 +102,9 @@ entry: ; ALL-LABEL: checkCall: ; ALL-NOT: MIPS_JALR call void @foo() -; JALR-32: .reloc 
([[TMPLABEL:\$.+]]), R_MIPS_JALR, foo +; JALR-32: .reloc [[TMPLABEL:\$.+]], R_MIPS_JALR, foo ; JALR-64: .reloc [[TMPLABEL:\..+]], R_MIPS_JALR, foo -; JALR-MM: .reloc ([[TMPLABEL:\$.+]]), R_MICROMIPS_JALR, foo +; JALR-MM: .reloc [[TMPLABEL:\$.+]], R_MICROMIPS_JALR, foo ; NORELOC-NOT: .reloc ; JALR-ALL-NEXT: [[TMPLABEL]]: ; JALR-32R2-NEXT: jalr $25 @@ -121,9 +121,9 @@ entry: ; ALL-LABEL: checkTailCall: ; ALL-NOT: MIPS_JALR tail call void @foo() -; JALR-32: .reloc ([[TMPLABEL:\$.+]]), R_MIPS_JALR, foo +; JALR-32: .reloc [[TMPLABEL:\$.+]], R_MIPS_JALR, foo ; JALR-64: .reloc [[TMPLABEL:\..+]], R_MIPS_JALR, foo -; JALR-MM: .reloc ([[TMPLABEL:\$.+]]), R_MICROMIPS_JALR, foo +; JALR-MM: .reloc [[TMPLABEL:\$.+]], R_MICROMIPS_JALR, foo ; JALR-ALL-NEXT: [[TMPLABEL]]: ; NORELOC-NOT: .reloc ; TAILCALL-32R2-NEXT: jr $25 diff --git a/llvm/test/CodeGen/Mips/shrink-wrapping.ll b/llvm/test/CodeGen/Mips/shrink-wrapping.ll index b08d2f1b64678..8153338253465 100644 --- a/llvm/test/CodeGen/Mips/shrink-wrapping.ll +++ b/llvm/test/CodeGen/Mips/shrink-wrapping.ll @@ -243,9 +243,9 @@ define i32 @foo2(i32 signext %a) { ; SHRINK-WRAP-PIC-NEXT: # %bb.1: ; SHRINK-WRAP-PIC-NEXT: addiu $sp, $sp, -8 ; SHRINK-WRAP-PIC-NEXT: sw $ra, 0($sp) -; SHRINK-WRAP-PIC-NEXT: lui $1, %hi(($BB1_4)-($BB1_2)) +; SHRINK-WRAP-PIC-NEXT: lui $1, %hi($BB1_4-$BB1_2) ; SHRINK-WRAP-PIC-NEXT: bal $BB1_2 -; SHRINK-WRAP-PIC-NEXT: addiu $1, $1, %lo(($BB1_4)-($BB1_2)) +; SHRINK-WRAP-PIC-NEXT: addiu $1, $1, %lo($BB1_4-$BB1_2) ; SHRINK-WRAP-PIC-NEXT: $BB1_2: ; SHRINK-WRAP-PIC-NEXT: addu $1, $ra, $1 ; SHRINK-WRAP-PIC-NEXT: lw $ra, 0($sp) @@ -272,9 +272,9 @@ define i32 @foo2(i32 signext %a) { ; NO-SHRINK-WRAP-PIC-NEXT: # %bb.1: ; NO-SHRINK-WRAP-PIC-NEXT: addiu $sp, $sp, -8 ; NO-SHRINK-WRAP-PIC-NEXT: sw $ra, 0($sp) -; NO-SHRINK-WRAP-PIC-NEXT: lui $1, %hi(($BB1_4)-($BB1_2)) +; NO-SHRINK-WRAP-PIC-NEXT: lui $1, %hi($BB1_4-$BB1_2) ; NO-SHRINK-WRAP-PIC-NEXT: bal $BB1_2 -; NO-SHRINK-WRAP-PIC-NEXT: addiu $1, $1, %lo(($BB1_4)-($BB1_2)) +; NO-SHRINK-WRAP-PIC-NEXT: addiu $1, $1, %lo($BB1_4-$BB1_2) ; NO-SHRINK-WRAP-PIC-NEXT: $BB1_2: ; NO-SHRINK-WRAP-PIC-NEXT: addu $1, $ra, $1 ; NO-SHRINK-WRAP-PIC-NEXT: lw $ra, 0($sp) diff --git a/llvm/test/CodeGen/Mips/unalignedload.ll b/llvm/test/CodeGen/Mips/unalignedload.ll index 912998ab9d038..5c78519e6481f 100644 --- a/llvm/test/CodeGen/Mips/unalignedload.ll +++ b/llvm/test/CodeGen/Mips/unalignedload.ll @@ -26,7 +26,7 @@ define void @bar1() nounwind { ; MIPS32-EL-NEXT: lbu $1, 3($1) ; MIPS32-EL-NEXT: sll $1, $1, 8 ; MIPS32-EL-NEXT: lw $25, %call16(foo2)($gp) -; MIPS32-EL-NEXT: .reloc ($tmp0), R_MIPS_JALR, foo2 +; MIPS32-EL-NEXT: .reloc $tmp0, R_MIPS_JALR, foo2 ; MIPS32-EL-NEXT: $tmp0: ; MIPS32-EL-NEXT: jalr $25 ; MIPS32-EL-NEXT: or $4, $1, $2 @@ -47,7 +47,7 @@ define void @bar1() nounwind { ; MIPS32-EB-NEXT: lbu $1, 2($1) ; MIPS32-EB-NEXT: sll $1, $1, 24 ; MIPS32-EB-NEXT: lw $25, %call16(foo2)($gp) -; MIPS32-EB-NEXT: .reloc ($tmp0), R_MIPS_JALR, foo2 +; MIPS32-EB-NEXT: .reloc $tmp0, R_MIPS_JALR, foo2 ; MIPS32-EB-NEXT: $tmp0: ; MIPS32-EB-NEXT: jalr $25 ; MIPS32-EB-NEXT: or $4, $1, $2 @@ -65,7 +65,7 @@ define void @bar1() nounwind { ; MIPS32R6-EL-NEXT: lw $1, %got(s2)($gp) ; MIPS32R6-EL-NEXT: lhu $4, 2($1) ; MIPS32R6-EL-NEXT: lw $25, %call16(foo2)($gp) -; MIPS32R6-EL-NEXT: .reloc ($tmp0), R_MIPS_JALR, foo2 +; MIPS32R6-EL-NEXT: .reloc $tmp0, R_MIPS_JALR, foo2 ; MIPS32R6-EL-NEXT: $tmp0: ; MIPS32R6-EL-NEXT: jalrc $25 ; MIPS32R6-EL-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -82,7 +82,7 @@ define void @bar1() nounwind { ; 
MIPS32R6-EB-NEXT: lw $1, %got(s2)($gp) ; MIPS32R6-EB-NEXT: lhu $1, 2($1) ; MIPS32R6-EB-NEXT: lw $25, %call16(foo2)($gp) -; MIPS32R6-EB-NEXT: .reloc ($tmp0), R_MIPS_JALR, foo2 +; MIPS32R6-EB-NEXT: .reloc $tmp0, R_MIPS_JALR, foo2 ; MIPS32R6-EB-NEXT: $tmp0: ; MIPS32R6-EB-NEXT: jalr $25 ; MIPS32R6-EB-NEXT: sll $4, $1, 16 @@ -113,7 +113,7 @@ define void @bar2() nounwind { ; MIPS32-EL-NEXT: lbu $1, 6($1) ; MIPS32-EL-NEXT: sll $1, $1, 16 ; MIPS32-EL-NEXT: lw $25, %call16(foo4)($gp) -; MIPS32-EL-NEXT: .reloc ($tmp1), R_MIPS_JALR, foo4 +; MIPS32-EL-NEXT: .reloc $tmp1, R_MIPS_JALR, foo4 ; MIPS32-EL-NEXT: $tmp1: ; MIPS32-EL-NEXT: jalr $25 ; MIPS32-EL-NEXT: or $5, $2, $1 @@ -139,7 +139,7 @@ define void @bar2() nounwind { ; MIPS32-EB-NEXT: lbu $1, 6($1) ; MIPS32-EB-NEXT: sll $1, $1, 8 ; MIPS32-EB-NEXT: lw $25, %call16(foo4)($gp) -; MIPS32-EB-NEXT: .reloc ($tmp1), R_MIPS_JALR, foo4 +; MIPS32-EB-NEXT: .reloc $tmp1, R_MIPS_JALR, foo4 ; MIPS32-EB-NEXT: $tmp1: ; MIPS32-EB-NEXT: jalr $25 ; MIPS32-EB-NEXT: or $5, $2, $1 @@ -160,7 +160,7 @@ define void @bar2() nounwind { ; MIPS32R6-EL-NEXT: sll $3, $3, 16 ; MIPS32R6-EL-NEXT: lw $4, 0($1) ; MIPS32R6-EL-NEXT: lw $25, %call16(foo4)($gp) -; MIPS32R6-EL-NEXT: .reloc ($tmp1), R_MIPS_JALR, foo4 +; MIPS32R6-EL-NEXT: .reloc $tmp1, R_MIPS_JALR, foo4 ; MIPS32R6-EL-NEXT: $tmp1: ; MIPS32R6-EL-NEXT: jalr $25 ; MIPS32R6-EL-NEXT: or $5, $2, $3 @@ -182,7 +182,7 @@ define void @bar2() nounwind { ; MIPS32R6-EB-NEXT: sll $3, $3, 16 ; MIPS32R6-EB-NEXT: lw $4, 0($1) ; MIPS32R6-EB-NEXT: lw $25, %call16(foo4)($gp) -; MIPS32R6-EB-NEXT: .reloc ($tmp1), R_MIPS_JALR, foo4 +; MIPS32R6-EB-NEXT: .reloc $tmp1, R_MIPS_JALR, foo4 ; MIPS32R6-EB-NEXT: $tmp1: ; MIPS32R6-EB-NEXT: jalr $25 ; MIPS32R6-EB-NEXT: or $5, $3, $2 diff --git a/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll b/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll index f49ee02eb6b70..26cea577699f6 100644 --- a/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll +++ b/llvm/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll @@ -53,8 +53,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" ; CHECK-MIPS64-NEXT: .8byte .Lxray_sled_0-[[TMP]] ; CHECK-MIPS64-NEXT: .8byte .Lfunc_begin0-([[TMP]]+8) ; CHECK-MIPS32: [[TMP:\$tmp[0-9]+]]: -; CHECK-MIPS32-NEXT: .4byte ($xray_sled_0)-([[TMP]]) -; CHECK-MIPS32-NEXT: .4byte ($func_begin0)-(([[TMP]])+4) +; CHECK-MIPS32-NEXT: .4byte $xray_sled_0-[[TMP]] +; CHECK-MIPS32-NEXT: .4byte $func_begin0-([[TMP]]+4) ; We test multiple returns in a single function to make sure we're getting all ; of them with XRay instrumentation. 
@@ -135,8 +135,8 @@ NotEqual: ; CHECK-MIPS64: .8byte .Lxray_sled_3 ; CHECK-MIPS64: .8byte .Lxray_sled_4 ; CHECK-MIPS32: [[TMP:\$tmp[0-9]+]]: -; CHECK-MIPS32-NEXT: .4byte ($xray_sled_2)-([[TMP]]) +; CHECK-MIPS32-NEXT: .4byte $xray_sled_2-[[TMP]] ; CHECK-MIPS32: [[TMP:\$tmp[0-9]+]]: -; CHECK-MIPS32-NEXT: .4byte ($xray_sled_3)-([[TMP]]) +; CHECK-MIPS32-NEXT: .4byte $xray_sled_3-[[TMP]] ; CHECK-MIPS32: [[TMP:\$tmp[0-9]+]]: -; CHECK-MIPS32-NEXT: .4byte ($xray_sled_4)-([[TMP]]) +; CHECK-MIPS32-NEXT: .4byte $xray_sled_4-[[TMP]] diff --git a/llvm/test/CodeGen/X86/catchpad-reuse.ll b/llvm/test/CodeGen/X86/catchpad-reuse.ll index 8f30e806ea85a..163980fddf04f 100644 --- a/llvm/test/CodeGen/X86/catchpad-reuse.ll +++ b/llvm/test/CodeGen/X86/catchpad-reuse.ll @@ -19,11 +19,11 @@ ; CHECK: $cppxdata$main: ; CHECK-NEXT: .long 429065506 # MagicNumber ; CHECK-NEXT: .long 4 # MaxState -; CHECK-NEXT: .long ($stateUnwindMap$main)@IMGREL # UnwindMap +; CHECK-NEXT: .long $stateUnwindMap$main@IMGREL # UnwindMap ; CHECK-NEXT: .long 2 # NumTryBlocks -; CHECK-NEXT: .long ($tryMap$main)@IMGREL # TryBlockMap +; CHECK-NEXT: .long $tryMap$main@IMGREL # TryBlockMap ; CHECK-NEXT: .long 5 # IPMapEntries -; CHECK-NEXT: .long ($ip2state$main)@IMGREL # IPToStateXData +; CHECK-NEXT: .long $ip2state$main@IMGREL # IPToStateXData ; CHECK-NEXT: .long 32 # UnwindHelp ; CHECK-NEXT: .long 0 # ESTypeList ; CHECK-NEXT: .long 1 # EHFlags @@ -33,12 +33,12 @@ ; CHECK-NEXT: .long 1 # TryHigh ; CHECK-NEXT: .long 2 # CatchHigh ; CHECK-NEXT: .long 1 # NumCatches -; CHECK-NEXT: .long ($handlerMap$0$main)@IMGREL # HandlerArray +; CHECK-NEXT: .long $handlerMap$0$main@IMGREL # HandlerArray ; CHECK-NEXT: .long 0 # TryLow ; CHECK-NEXT: .long 2 # TryHigh ; CHECK-NEXT: .long 3 # CatchHigh ; CHECK-NEXT: .long 1 # NumCatches -; CHECK-NEXT: .long ($handlerMap$1$main)@IMGREL # HandlerArray +; CHECK-NEXT: .long $handlerMap$1$main@IMGREL # HandlerArray ; CHECK: $handlerMap$0$main: ; CHECK-NEXT: .long 0 # Adjectives diff --git a/llvm/test/CodeGen/X86/dollar-name.ll b/llvm/test/CodeGen/X86/dollar-name.ll index bc8cf5fb46537..fc9d6a77f66e5 100644 --- a/llvm/test/CodeGen/X86/dollar-name.ll +++ b/llvm/test/CodeGen/X86/dollar-name.ll @@ -5,9 +5,9 @@ @"$qux" = external dso_local global i32 define i32 @"$foo"() nounwind { -; CHECK: movl ($bar), -; CHECK: addl ($qux), -; CHECK: calll ($hen) +; CHECK: movl $bar, +; CHECK: addl $qux, +; CHECK: calll $hen %m = load i32, ptr @"$bar" %n = load i32, ptr @"$qux" %t = add i32 %m, %n diff --git a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll index 63a3188aaad7b..2c576df1b7549 100644 --- a/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll +++ b/llvm/test/CodeGen/X86/seh-unwind-inline-asm-codegen.ll @@ -42,11 +42,11 @@ declare dso_local void @printf(ptr, ...) 
; CHECK-LABEL: $cppxdata$test: ; CHECK-NEXT: .long 429065506 # MagicNumber ; CHECK-NEXT: .long 1 # MaxState -; CHECK-NEXT: .long ($stateUnwindMap$test)@IMGREL # UnwindMap +; CHECK-NEXT: .long $stateUnwindMap$test@IMGREL # UnwindMap ; CHECK-NEXT: .long 0 # NumTryBlocks ; CHECK-NEXT: .long 0 # TryBlockMap ; CHECK-NEXT: .long 3 # IPMapEntries -; CHECK-NEXT: .long ($ip2state$test)@IMGREL # IPToStateXData +; CHECK-NEXT: .long $ip2state$test@IMGREL # IPToStateXData ; CHECK-NEXT: .long 40 # UnwindHelp ; CHECK-NEXT: .long 0 # ESTypeList ; CHECK-NEXT: .long 1 # EHFlags diff --git a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll index 198f1bf198620..e2de2ff4a392e 100644 --- a/llvm/test/CodeGen/X86/stack-coloring-wineh.ll +++ b/llvm/test/CodeGen/X86/stack-coloring-wineh.ll @@ -96,7 +96,7 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 { ; X86_64-NEXT: .seh_endepilogue ; X86_64-NEXT: retq ; X86_64-NEXT: .seh_handlerdata -; X86_64-NEXT: .long ($cppxdata$pr66984)@IMGREL +; X86_64-NEXT: .long $cppxdata$pr66984@IMGREL ; X86_64-NEXT: .text ; X86_64-NEXT: .seh_endproc ; X86_64-NEXT: .def "?catch$2@?0?pr66984@4HA"; @@ -124,7 +124,7 @@ define void @pr66984(ptr %arg) personality ptr @__CxxFrameHandler3 { ; X86_64-NEXT: .seh_endepilogue ; X86_64-NEXT: retq # CATCHRET ; X86_64-NEXT: .seh_handlerdata -; X86_64-NEXT: .long ($cppxdata$pr66984)@IMGREL +; X86_64-NEXT: .long $cppxdata$pr66984@IMGREL ; X86_64-NEXT: .text ; X86_64-NEXT: .seh_endproc ; X86_64-NEXT: .def "?dtor$4@?0?pr66984@4HA"; diff --git a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll index b5d914153ffd3..bfb9c43b3fd16 100644 --- a/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll +++ b/llvm/test/CodeGen/X86/win-catchpad-nested-cxx.ll @@ -48,14 +48,14 @@ handler2: ; X64-LABEL: $cppxdata$try_in_catch: ; CHECK-NEXT: .long 429065506 ; CHECK-NEXT: .long 4 -; CHECK-NEXT: .long ($stateUnwindMap$try_in_catch) +; CHECK-NEXT: .long $stateUnwindMap$try_in_catch ; CHECK-NEXT: .long 2 -; CHECK-NEXT: .long ($tryMap$try_in_catch) +; CHECK-NEXT: .long $tryMap$try_in_catch ; ip2state num + ptr ; X86-NEXT: .long 0 ; X86-NEXT: .long 0 ; X64-NEXT: .long 7 -; X64-NEXT: .long ($ip2state$try_in_catch) +; X64-NEXT: .long $ip2state$try_in_catch ; unwindhelp offset ; X64-NEXT: .long 40 ; CHECK-NEXT: .long 0 @@ -67,24 +67,24 @@ handler2: ; X86-NEXT: .long 2 ; X86-NEXT: .long 3 ; X86-NEXT: .long 1 -; X86-NEXT: .long ($handlerMap$0$try_in_catch) +; X86-NEXT: .long $handlerMap$0$try_in_catch ; X86-NEXT: .long 0 ; X86-NEXT: .long 0 ; X86-NEXT: .long 3 ; X86-NEXT: .long 1 -; X86-NEXT: .long ($handlerMap$1$try_in_catch) +; X86-NEXT: .long $handlerMap$1$try_in_catch ; X64-LABEL: $tryMap$try_in_catch: ; X64-NEXT: .long 0 ; X64-NEXT: .long 0 ; X64-NEXT: .long 3 ; X64-NEXT: .long 1 -; X64-NEXT: .long ($handlerMap$0$try_in_catch) +; X64-NEXT: .long $handlerMap$0$try_in_catch ; X64-NEXT: .long 2 ; X64-NEXT: .long 2 ; X64-NEXT: .long 3 ; X64-NEXT: .long 1 -; X64-NEXT: .long ($handlerMap$1$try_in_catch) +; X64-NEXT: .long $handlerMap$1$try_in_catch ; CHECK: $handlerMap$0$try_in_catch: ; CHECK-NEXT: .long 64 diff --git a/llvm/test/CodeGen/X86/win-catchpad.ll b/llvm/test/CodeGen/X86/win-catchpad.ll index ceca37710e9ec..249194610e9f8 100644 --- a/llvm/test/CodeGen/X86/win-catchpad.ll +++ b/llvm/test/CodeGen/X86/win-catchpad.ll @@ -183,11 +183,11 @@ try.cont: ; X64: $cppxdata$try_catch_catch: ; X64-NEXT: .long 429065506 ; X64-NEXT: .long 2 -; X64-NEXT: .long 
($stateUnwindMap$try_catch_catch)@IMGREL +; X64-NEXT: .long $stateUnwindMap$try_catch_catch@IMGREL ; X64-NEXT: .long 1 -; X64-NEXT: .long ($tryMap$try_catch_catch)@IMGREL +; X64-NEXT: .long $tryMap$try_catch_catch@IMGREL ; X64-NEXT: .long 5 -; X64-NEXT: .long ($ip2state$try_catch_catch)@IMGREL +; X64-NEXT: .long $ip2state$try_catch_catch@IMGREL ; X64-NEXT: .long 48 ; X64-NEXT: .long 0 ; X64-NEXT: .long 1 @@ -197,7 +197,7 @@ try.cont: ; X64-NEXT: .long 0 ; X64-NEXT: .long 1 ; X64-NEXT: .long 2 -; X64-NEXT: .long ($handlerMap$0$try_catch_catch)@IMGREL +; X64-NEXT: .long $handlerMap$0$try_catch_catch@IMGREL ; X64: $handlerMap$0$try_catch_catch: ; X64-NEXT: .long 0 @@ -325,11 +325,11 @@ try.cont: ; X64-LABEL: $cppxdata$branch_to_normal_dest: ; X64-NEXT: .long 429065506 ; X64-NEXT: .long 2 -; X64-NEXT: .long ($stateUnwindMap$branch_to_normal_dest)@IMGREL +; X64-NEXT: .long $stateUnwindMap$branch_to_normal_dest@IMGREL ; X64-NEXT: .long 1 -; X64-NEXT: .long ($tryMap$branch_to_normal_dest)@IMGREL +; X64-NEXT: .long $tryMap$branch_to_normal_dest@IMGREL ; X64-NEXT: .long 4 -; X64-NEXT: .long ($ip2state$branch_to_normal_dest)@IMGREL +; X64-NEXT: .long $ip2state$branch_to_normal_dest@IMGREL ; X64-NEXT: .long 40 ; X64-NEXT: .long 0 ; X64-NEXT: .long 1 @@ -345,7 +345,7 @@ try.cont: ; X64-NEXT: .long 0 ; X64-NEXT: .long 1 ; X64-NEXT: .long 1 -; X64-NEXT: .long ($handlerMap$0$branch_to_normal_dest)@IMGREL +; X64-NEXT: .long $handlerMap$0$branch_to_normal_dest@IMGREL ; X64-LABEL: $handlerMap$0$branch_to_normal_dest: ; X64-NEXT: .long 64 diff --git a/llvm/test/CodeGen/X86/win-cleanuppad.ll b/llvm/test/CodeGen/X86/win-cleanuppad.ll index 452f0a8e36d8d..e3f7f5be0049e 100644 --- a/llvm/test/CodeGen/X86/win-cleanuppad.ll +++ b/llvm/test/CodeGen/X86/win-cleanuppad.ll @@ -39,11 +39,11 @@ ehcleanup: ; preds = %entry ; CHECK: $cppxdata$simple_cleanup: ; CHECK-NEXT: .long 429065506 ; CHECK-NEXT: .long 1 -; CHECK-NEXT: .long ($stateUnwindMap$simple_cleanup)@IMGREL +; CHECK-NEXT: .long $stateUnwindMap$simple_cleanup@IMGREL ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long 3 -; CHECK-NEXT: .long ($ip2state$simple_cleanup)@IMGREL +; CHECK-NEXT: .long $ip2state$simple_cleanup@IMGREL ; UnwindHelp offset should match the -2 store above ; CHECK-NEXT: .long 40 ; CHECK-NEXT: .long 0 @@ -114,7 +114,7 @@ cleanup.outer: ; preds = %invoke.cont.1, %c ; X86: L__ehtable$nested_cleanup: ; X86: .long 429065506 ; X86: .long 2 -; X86: .long ($stateUnwindMap$nested_cleanup) +; X86: .long $stateUnwindMap$nested_cleanup ; X86: .long 0 ; X86: .long 0 ; X86: .long 0 @@ -167,11 +167,11 @@ cleanup.outer: ; preds = %invoke.cont.1, %c ; X64: $cppxdata$nested_cleanup: ; X64-NEXT: .long 429065506 ; X64-NEXT: .long 2 -; X64-NEXT: .long ($stateUnwindMap$nested_cleanup)@IMGREL +; X64-NEXT: .long $stateUnwindMap$nested_cleanup@IMGREL ; X64-NEXT: .long 0 ; X64-NEXT: .long 0 ; X64-NEXT: .long 5 -; X64-NEXT: .long ($ip2state$nested_cleanup)@IMGREL +; X64-NEXT: .long $ip2state$nested_cleanup@IMGREL ; X64-NEXT: .long 56 ; X64-NEXT: .long 0 ; X64-NEXT: .long 1 diff --git a/llvm/test/CodeGen/X86/win-funclet-cfi.ll b/llvm/test/CodeGen/X86/win-funclet-cfi.ll index f9a1e2f0d2880..96b55772e05e4 100644 --- a/llvm/test/CodeGen/X86/win-funclet-cfi.ll +++ b/llvm/test/CodeGen/X86/win-funclet-cfi.ll @@ -61,7 +61,7 @@ declare i32 @__CxxFrameHandler3(...) ; Don't emit a reference to the LSDA. 
; CHECK: .seh_handlerdata -; CHECK-NOT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL +; CHECK-NOT: .long "$cppxdata$?f@@YAXXZ"@IMGREL ; CHECK-NEXT: .text ; CHECK: .seh_endproc @@ -92,6 +92,6 @@ declare i32 @__CxxFrameHandler3(...) ; Emit a reference to the LSDA. ; CHECK: .seh_handlerdata -; CHECK-NEXT: .long ("$cppxdata$?f@@YAXXZ")@IMGREL +; CHECK-NEXT: .long "$cppxdata$?f@@YAXXZ"@IMGREL ; CHECK-NEXT: .text ; CHECK: .seh_endproc diff --git a/llvm/test/CodeGen/X86/win32-eh.ll b/llvm/test/CodeGen/X86/win32-eh.ll index d3d19ede546d6..857df9882be47 100644 --- a/llvm/test/CodeGen/X86/win32-eh.ll +++ b/llvm/test/CodeGen/X86/win32-eh.ll @@ -201,9 +201,9 @@ catch: ; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3: ; CHECK-NEXT: .long 429065506 ; CHECK-NEXT: .long 2 -; CHECK-NEXT: .long ($stateUnwindMap$use_CxxFrameHandler3) +; CHECK-NEXT: .long $stateUnwindMap$use_CxxFrameHandler3 ; CHECK-NEXT: .long 1 -; CHECK-NEXT: .long ($tryMap$use_CxxFrameHandler3) +; CHECK-NEXT: .long $tryMap$use_CxxFrameHandler3 ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long 0 ; CHECK-NEXT: .long 0 diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll index 944ffab24a5d1..785c2606186a6 100644 --- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll +++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppCatchDotDotDot.ll @@ -1,10 +1,10 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: "$cppxdata$?crash@@YAXH@Z": -; CHECK: .long ("$stateUnwindMap$?crash@@YAXH@Z") -; CHECK: .long ("$tryMap$?crash@@YAXH@Z")@IMGREL # TryBlockMap +; CHECK: .long "$stateUnwindMap$?crash@@YAXH@Z" +; CHECK: .long "$tryMap$?crash@@YAXH@Z"@IMGREL # TryBlockMap ; CHECK-NEXT: .long 6 # IPMapEntries -; CHECK-NEXT: .long ("$ip2state$?crash@@YAXH@Z") +; CHECK-NEXT: .long "$ip2state$?crash@@YAXH@Z" ; CHECK-LABEL: "$stateUnwindMap$?crash@@YAXH@Z": ; CHECK-NEXT: .long -1 @@ -19,7 +19,7 @@ ; CHECK-NEXT: .long 1 ; CHECK-NEXT: .long 2 ; CHECK-NEXT: .long 1 -; CHECK-NEXT: .long ("$handlerMap$ +; CHECK-NEXT: .long "$handlerMap$ ; CHECK: "$handlerMap$0$?crash@@YAXH@Z" ; CHECK-NEXT: .long 0 diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll index 54c1d838a30fd..6c6e9c3b66804 100644 --- a/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll +++ b/llvm/test/CodeGen/X86/windows-seh-EHa-CppDtors01.ll @@ -1,8 +1,8 @@ ; RUN: llc -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: "$cppxdata$?crash@@YAXH@Z": -; CHECK: .long ("$stateUnwindMap$?crash@@YAXH@Z") -; CHECK: .long ("$ip2state$?crash@@YAXH@Z") +; CHECK: .long "$stateUnwindMap$?crash@@YAXH@Z" +; CHECK: .long "$ip2state$?crash@@YAXH@Z" ; CHECK-LABEL: "$stateUnwindMap$?crash@@YAXH@Z": ; CHECK: .long -1 diff --git a/llvm/test/DebugInfo/COFF/jump-table.ll b/llvm/test/DebugInfo/COFF/jump-table.ll index a8039809c8b77..3eda2438ea88a 100644 --- a/llvm/test/DebugInfo/COFF/jump-table.ll +++ b/llvm/test/DebugInfo/COFF/jump-table.ll @@ -58,7 +58,7 @@ ; CHECK: {{\.?}}LJTI0_0: ; I686-NEXT: .long LBB0_[[#]] ; X64-NEXT: .long .LBB0_[[#]]-.LJTI0_0 -; A32-NEXT: .byte (($MBB0_[[#]])-(.LCPI0_0+4))/2 +; A32-NEXT: .byte ($MBB0_[[#]]-(.LCPI0_0+4))/2 ; A64-NEXT: .byte (.LBB0_[[FIRSTBLOCK:[0-9]+]]-.LBB0_[[FIRSTBLOCK]])>>2 ; NOTE: thumbv7a places the jump tables just after the branch, so check for the other branch now ; A32: .LCPI0_1: @@ -66,7 +66,7 @@ ; CHECK: {{\.?}}LJTI0_1: ; I686-NEXT: .long LBB0_[[#]] ; X64-NEXT: .long .LBB0_[[#]]-.LJTI0_1 -; A32-NEXT: .byte 
(($MBB0_[[#]])-(.LCPI0_1+4))/2 +; A32-NEXT: .byte ($MBB0_[[#]]-(.LCPI0_1+4))/2 ; A64-NEXT: .byte (.LBB0_[[SECONDBLOCK:[0-9]+]]-.LBB0_[[SECONDBLOCK]])>>2 ; Verify CodeView diff --git a/llvm/test/MC/ARM/arm-branches.s b/llvm/test/MC/ARM/arm-branches.s index e18fa5de12584..5af5a28612ac1 100644 --- a/llvm/test/MC/ARM/arm-branches.s +++ b/llvm/test/MC/ARM/arm-branches.s @@ -28,13 +28,13 @@ bl $4 beq $4 + 4 -@ CHECK: b ($foo) @ encoding: [A,A,A,0xea] -@ CHECK: bl ($foo) @ encoding: [A,A,A,0xeb] -@ CHECK: beq ($foo) @ encoding: [A,A,A,0x0a] -@ CHECK: blx ($foo) @ encoding: [A,A,A,0xfa] -@ CHECK: b #($foo)+4 @ encoding: [A,A,A,0xea] -@ CHECK: bl ($4) @ encoding: [A,A,A,0xeb] -@ CHECK: beq #($4)+4 @ encoding: [A,A,A,0x0a] +@ CHECK: b $foo @ encoding: [A,A,A,0xea] +@ CHECK: bl $foo @ encoding: [A,A,A,0xeb] +@ CHECK: beq $foo @ encoding: [A,A,A,0x0a] +@ CHECK: blx $foo @ encoding: [A,A,A,0xfa] +@ CHECK: b #$foo+4 @ encoding: [A,A,A,0xea] +@ CHECK: bl $4 @ encoding: [A,A,A,0xeb] +@ CHECK: beq #$4+4 @ encoding: [A,A,A,0x0a] @------------------------------------------------------------------------------ @ Leading '$' should be allowed to introduce an expression diff --git a/llvm/test/MC/AsmParser/dollars-in-identifiers.s b/llvm/test/MC/AsmParser/dollars-in-identifiers.s index e56959062ad9d..2fd35535d356f 100644 --- a/llvm/test/MC/AsmParser/dollars-in-identifiers.s +++ b/llvm/test/MC/AsmParser/dollars-in-identifiers.s @@ -3,5 +3,5 @@ // CHECK: .globl $foo .globl $foo -// CHECK: .long ($foo) +// CHECK: .long $foo .long ($foo) diff --git a/llvm/test/MC/MachO/dollar-identifier.s b/llvm/test/MC/MachO/dollar-identifier.s index ca6993f7f4040..7eff63354b660 100644 --- a/llvm/test/MC/MachO/dollar-identifier.s +++ b/llvm/test/MC/MachO/dollar-identifier.s @@ -1,4 +1,4 @@ // RUN: llvm-mc -triple x86_64-apple-darwin10 %s | FileCheck %s .long $1 -// CHECK: .long ($1) +// CHECK: .long $1 diff --git a/llvm/test/MC/Mips/expansion-jal-sym-pic.s b/llvm/test/MC/Mips/expansion-jal-sym-pic.s index c7b5ccc1880bd..6f1b7c9d81b42 100644 --- a/llvm/test/MC/Mips/expansion-jal-sym-pic.s +++ b/llvm/test/MC/Mips/expansion-jal-sym-pic.s @@ -55,7 +55,7 @@ local_label: # O32: # fixup A - offset: 0, value: %got(local_label), kind: fixup_Mips_GOT # O32: addiu $25, $25, %lo(local_label) # encoding: [0x27,0x39,A,A] # O32: # fixup A - offset: 0, value: %lo(local_label), kind: fixup_Mips_LO16 -# O32-NEXT: .reloc ($tmp0), R_MIPS_JALR, local_label +# O32-NEXT: .reloc $tmp0, R_MIPS_JALR, local_label # ELF-O32: 8f 99 00 00 lw $25, 0($gp) # ELF-O32-NEXT: R_MIPS_GOT16 .text @@ -68,7 +68,7 @@ local_label: # XO32-NEXT: # fixup A - offset: 0, value: %got(local_label), kind: fixup_Mips_GOT # XO32-NEXT: addiu $25, $25, %lo(local_label) # encoding: [0x27,0x39,A,A] # XO32-NEXT: # fixup A - offset: 0, value: %lo(local_label), kind: fixup_Mips_LO16 -# XO32-NEXT: .reloc ($tmp0), R_MIPS_JALR, local_label +# XO32-NEXT: .reloc $tmp0, R_MIPS_JALR, local_label # ELF-XO32: 8f 99 00 00 lw $25, 0($gp) # ELF-XO32-NEXT: R_MIPS_GOT16 .text @@ -117,7 +117,7 @@ local_label: # O32-MM: # fixup A - offset: 0, value: %got(local_label), kind: fixup_MICROMIPS_GOT16 # O32-MM: addiu $25, $25, %lo(local_label) # encoding: [0x33,0x39,A,A] # O32-MM: # fixup A - offset: 0, value: %lo(local_label), kind: fixup_MICROMIPS_LO16 -# O32-MM-NEXT: .reloc ($tmp0), R_MICROMIPS_JALR, local_label +# O32-MM-NEXT: .reloc $tmp0, R_MICROMIPS_JALR, local_label # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] @@ -212,7 +212,7 @@ local_label: # 
Expanding "jal weak_label": # O32: lw $25, %call16(weak_label)($gp) # encoding: [0x8f,0x99,A,A] # O32: # fixup A - offset: 0, value: %call16(weak_label), kind: fixup_Mips_CALL16 -# O32-NEXT: .reloc ($tmp1), R_MIPS_JALR, weak_label +# O32-NEXT: .reloc $tmp1, R_MIPS_JALR, weak_label # ELF-O32: 8f 99 00 00 lw $25, 0($gp) # ELF-O32-NEXT: R_MIPS_CALL16 weak_label @@ -224,7 +224,7 @@ local_label: # XO32-NEXT: addu $25, $25, $gp # encoding: [0x03,0x3c,0xc8,0x21] # XO32-NEXT: lw $25, %call_lo(weak_label)($25) # encoding: [0x8f,0x39,A,A] # XO32-NEXT: # fixup A - offset: 0, value: %call_lo(weak_label), kind: fixup_Mips_CALL_LO16 -# XO32-NEXT: .reloc ($tmp1), R_MIPS_JALR, weak_label +# XO32-NEXT: .reloc $tmp1, R_MIPS_JALR, weak_label # ELF-XO32: 3c 19 00 00 lui $25, 0 # ELF-XO32-MEXT: R_MIPS_CALL_HI16 weak_label @@ -284,7 +284,7 @@ local_label: # O32-MM: lw $25, %call16(weak_label)($gp) # encoding: [0xff,0x3c,A,A] # O32-MM: # fixup A - offset: 0, value: %call16(weak_label), kind: fixup_MICROMIPS_CALL16 -# O32-MM-NEXT: .reloc ($tmp1), R_MICROMIPS_JALR, weak_label +# O32-MM-NEXT: .reloc $tmp1, R_MICROMIPS_JALR, weak_label # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] @@ -392,7 +392,7 @@ local_label: # Expanding "jal global_label": # O32: lw $25, %call16(global_label)($gp) # encoding: [0x8f,0x99,A,A] # O32-NEXT: # fixup A - offset: 0, value: %call16(global_label), kind: fixup_Mips_CALL16 -# O32-NEXT: .reloc ($tmp2), R_MIPS_JALR, global_label +# O32-NEXT: .reloc $tmp2, R_MIPS_JALR, global_label # ELF-O32: 8f 99 00 00 lw $25, 0($gp) # ELF-O32-NEXT: R_MIPS_CALL16 global_label @@ -404,7 +404,7 @@ local_label: # XO32-NEXT: addu $25, $25, $gp # encoding: [0x03,0x3c,0xc8,0x21] # XO32-NEXT: lw $25, %call_lo(global_label)($25) # encoding: [0x8f,0x39,A,A] # XO32-NEXT: # fixup A - offset: 0, value: %call_lo(global_label), kind: fixup_Mips_CALL_LO16 -# XO32-NEXT: .reloc ($tmp2), R_MIPS_JALR, global_label +# XO32-NEXT: .reloc $tmp2, R_MIPS_JALR, global_label # ELF-XO32: 3c 19 00 00 lui $25, 0 # ELF-XO32-NEXT: R_MIPS_CALL_HI16 global_label @@ -464,7 +464,7 @@ local_label: # O32-MM: lw $25, %call16(global_label)($gp) # encoding: [0xff,0x3c,A,A] # O32-MM-NEXT: # fixup A - offset: 0, value: %call16(global_label), kind: fixup_MICROMIPS_CALL16 -# O32-MM-NEXT: .reloc ($tmp2), R_MICROMIPS_JALR, global_label +# O32-MM-NEXT: .reloc $tmp2, R_MICROMIPS_JALR, global_label # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] @@ -580,7 +580,7 @@ local_label: # XO32-NEXT: # fixup A - offset: 0, value: %got(.text), kind: fixup_Mips_GOT # XO32-NEXT: addiu $25, $25, %lo(.text) # encoding: [0x27,0x39,A,A] # XO32-NEXT: # fixup A - offset: 0, value: %lo(.text), kind: fixup_Mips_LO16 -# XO32-NEXT: .reloc ($tmp3), R_MIPS_JALR, .text +# XO32-NEXT: .reloc $tmp3, R_MIPS_JALR, .text # ELF-XO32: 8f 99 00 00 lw $25, 0($gp) # ELF-XO32-NEXT: R_MIPS_GOT16 .text @@ -623,7 +623,7 @@ local_label: # O32-MM-NEXT: # fixup A - offset: 0, value: %got(.text), kind: fixup_MICROMIPS_GOT16 # O32-MM-NEXT: addiu $25, $25, %lo(.text) # encoding: [0x33,0x39,A,A] # O32-MM-NEXT: # fixup A - offset: 0, value: %lo(.text), kind: fixup_MICROMIPS_LO16 -# O42-MM-NEXT: .reloc ($tmp3), R_MICROMIPS_JALR, .text +# O42-MM-NEXT: .reloc $tmp3, R_MICROMIPS_JALR, .text # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] @@ -689,7 +689,7 @@ local_label: # O32-MM-NEXT: # fixup A - offset: 0, value: %got(.text+8), kind: 
fixup_MICROMIPS_GOT16 # O32-MM-NEXT: addiu $25, $25, %lo(.text+8) # encoding: [0x33,0x39,A,A] # O32-MM-NEXT: # fixup A - offset: 0, value: %lo(.text+8), kind: fixup_MICROMIPS_LO16 -# O42-MM-NEXT: .reloc ($tmp4), R_MICROMIPS_JALR, .text +# O42-MM-NEXT: .reloc $tmp4, R_MICROMIPS_JALR, .text # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] @@ -704,7 +704,7 @@ local_label: # O32-NEXT: # fixup A - offset: 0, value: %got($tmp4), kind: fixup_Mips_GOT # O32-NEXT: addiu $25, $25, %lo($tmp4) # encoding: [0x27,0x39,A,A] # O32-NEXT: # fixup A - offset: 0, value: %lo($tmp4), kind: fixup_Mips_LO16 -# O32-NEXT: .reloc ($tmp5), R_MIPS_JALR, ($tmp4) +# O32-NEXT: .reloc $tmp5, R_MIPS_JALR, $tmp4 # ELF-O32: 8f 99 00 00 lw $25, 0($gp) # ELF-O32-NEXT: R_MIPS_GOT16 .text @@ -717,7 +717,7 @@ local_label: # XO32-NEXT: # fixup A - offset: 0, value: %got($tmp4), kind: fixup_Mips_GOT # XO32-NEXT: addiu $25, $25, %lo($tmp4) # encoding: [0x27,0x39,A,A] # XO32-NEXT: # fixup A - offset: 0, value: %lo($tmp4), kind: fixup_Mips_LO16 -# XO32-NEXT: .reloc ($tmp5), R_MIPS_JALR, ($tmp4) +# XO32-NEXT: .reloc $tmp5, R_MIPS_JALR, $tmp4 # ELF-XO32: 8f 99 00 00 lw $25, 0($gp) # ELF-XO32-NEXT: R_MIPS_GOT16 .text @@ -760,7 +760,7 @@ local_label: # O32-MM-NEXT: # fixup A - offset: 0, value: %got($tmp4), kind: fixup_MICROMIPS_GOT16 # O32-MM-NEXT: addiu $25, $25, %lo($tmp4) # encoding: [0x33,0x39,A,A] # O32-MM-NEXT: # fixup A - offset: 0, value: %lo($tmp4), kind: fixup_MICROMIPS_LO16 -# O32-MM-NEXT: .reloc ($tmp5), R_MICROMIPS_JALR, ($tmp4) +# O32-MM-NEXT: .reloc $tmp5, R_MICROMIPS_JALR, $tmp4 # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] @@ -769,10 +769,10 @@ local_label: jal 1f+8 nop -# O32: lw $25, %got(($tmp4)+8)($gp) # encoding: [0x8f,0x99,A,A] -# O32-NEXT: # fixup A - offset: 0, value: %got(($tmp4)+8), kind: fixup_Mips_GOT -# O32-NEXT: addiu $25, $25, %lo(($tmp4)+8) # encoding: [0x27,0x39,A,A] -# O32-NEXT: # fixup A - offset: 0, value: %lo(($tmp4)+8), kind: fixup_Mips_LO16 +# O32: lw $25, %got($tmp4+8)($gp) # encoding: [0x8f,0x99,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %got($tmp4+8), kind: fixup_Mips_GOT +# O32-NEXT: addiu $25, $25, %lo($tmp4+8) # encoding: [0x27,0x39,A,A] +# O32-NEXT: # fixup A - offset: 0, value: %lo($tmp4+8), kind: fixup_Mips_LO16 # O32-NOT: .reloc # ELF-O32: 8f 99 00 00 lw $25, 0($gp) @@ -782,10 +782,10 @@ local_label: # ELF-O32-NEXT: 03 20 f8 09 jalr $25 # ELF-O32-NEXT: 00 00 00 00 nop -# XO32: lw $25, %got(($tmp4)+8)($gp) # encoding: [0x8f,0x99,A,A] -# XO32-NEXT: # fixup A - offset: 0, value: %got(($tmp4)+8), kind: fixup_Mips_GOT -# XO32-NEXT: addiu $25, $25, %lo(($tmp4)+8) # encoding: [0x27,0x39,A,A] -# XO32-NEXT: # fixup A - offset: 0, value: %lo(($tmp4)+8), kind: fixup_Mips_LO16 +# XO32: lw $25, %got($tmp4+8)($gp) # encoding: [0x8f,0x99,A,A] +# XO32-NEXT: # fixup A - offset: 0, value: %got($tmp4+8), kind: fixup_Mips_GOT +# XO32-NEXT: addiu $25, $25, %lo($tmp4+8) # encoding: [0x27,0x39,A,A] +# XO32-NEXT: # fixup A - offset: 0, value: %lo($tmp4+8), kind: fixup_Mips_LO16 # XO32-NOT: .reloc # ELF-XO32: 8f 99 00 00 lw $25, 0($gp) @@ -829,10 +829,10 @@ local_label: # ELF-XN64-NEXT: 03 20 f8 09 jalr $25 # ELF-XN64-NEXT: R_MIPS_JALR/R_MIPS_NONE/R_MIPS_NONE .Ltmp0 -# O32-MM: lw $25, %got(($tmp4)+8)($gp) # encoding: [0xff,0x3c,A,A] -# O32-MM-NEXT: # fixup A - offset: 0, value: %got(($tmp4)+8), kind: fixup_MICROMIPS_GOT16 -# O32-MM-NEXT: addiu $25, $25, %lo(($tmp4)+8) # encoding: [0x33,0x39,A,A] 
-# O32-MM-NEXT: # fixup A - offset: 0, value: %lo(($tmp4)+8), kind: fixup_MICROMIPS_LO16 +# O32-MM: lw $25, %got($tmp4+8)($gp) # encoding: [0xff,0x3c,A,A] +# O32-MM-NEXT: # fixup A - offset: 0, value: %got($tmp4+8), kind: fixup_MICROMIPS_GOT16 +# O32-MM-NEXT: addiu $25, $25, %lo($tmp4+8) # encoding: [0x33,0x39,A,A] +# O32-MM-NEXT: # fixup A - offset: 0, value: %lo($tmp4+8), kind: fixup_MICROMIPS_LO16 # O32-MM-NOT: .reloc # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] @@ -848,7 +848,7 @@ local_label: # O32-FIXME: # fixup A - offset: 0, value: %got(forward_local), kind: fixup_Mips_GOT # O32-FIXME: addiu $25, $25, %lo(forward_local) # encoding: [0x27,0x39,A,A] # O32-FIXME:: # fixup A - offset: 0, value: %lo(forward_local), kind: fixup_Mips_LO16 -# O32-FIXME: .reloc ($tmp6), R_MIPS_JALR, forward_local +# O32-FIXME: .reloc $tmp6, R_MIPS_JALR, forward_local # ELF-O32: 8f 99 00 00 lw $25, 0($gp) # ELF-O32-NEXT: R_MIPS_GOT16 .text @@ -873,7 +873,7 @@ local_label: # O32-MM-FIXME: # fixup A - offset: 0, value: %got(forward_local), kind: fixup_MICROMIPS_GOT16 # O32-MM-FIXME: addiu $25, $25, %lo(forward_local) # encoding: [0x33,0x39,A,A] # O32-MM-FIXME: # fixup A - offset: 0, value: %lo(forward_local), kind: fixup_MICROMIPS_LO16 -# O32-MM-FIXME: .reloc ($tmp6), R_MIPS_JALR, forward_local +# O32-MM-FIXME: .reloc $tmp6, R_MIPS_JALR, forward_local # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] @@ -887,7 +887,7 @@ local_label: # O32-FIXME: # fixup A - offset: 0, value: %got(forward_local+8), kind: fixup_Mips_GOT # O32-FIXME: addiu $25, $25, %lo(forward_local+8) # encoding: [0x27,0x39,A,A] # O32-FIXME:: # fixup A - offset: 0, value: %lo(forward_local+8), kind: fixup_Mips_LO16 -# O32-FIXME: .reloc ($tmp7), R_MIPS_JALR, forward_local +# O32-FIXME: .reloc $tmp7, R_MIPS_JALR, forward_local # ELF-O32: 8f 99 00 00 lw $25, 0($gp) # ELF-O32-NEXT: R_MIPS_GOT16 .text @@ -912,7 +912,7 @@ local_label: # O32-MM-FIXME: # fixup A - offset: 0, value: %got(forward_local), kind: fixup_MICROMIPS_GOT16 # O32-MM-FIXME: addiu $25, $25, %lo(forward_local) # encoding: [0x33,0x39,A,A] # O32-MM-FIXME: # fixup A - offset: 0, value: %lo(forward_local), kind: fixup_MICROMIPS_LO16 -# O32-MM-FIXME: .reloc ($tmp6), R_MIPS_JALR, forward_local +# O32-MM-FIXME: .reloc $tmp6, R_MIPS_JALR, forward_local # MIPS: jalr $25 # encoding: [0x03,0x20,0xf8,0x09] # MM: jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c] diff --git a/llvm/test/MC/Mips/macro-div.s b/llvm/test/MC/Mips/macro-div.s index 8ce30d745bcf5..884618b667894 100644 --- a/llvm/test/MC/Mips/macro-div.s +++ b/llvm/test/MC/Mips/macro-div.s @@ -5,16 +5,16 @@ div $25,$11 # CHECK-NOTRAP: bnez $11, $tmp0 # encoding: [0x15,0x60,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp0-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: div $zero, $25, $11 # encoding: [0x03,0x2b,0x00,0x1a] # CHECK-NOTRAP: break 7 # encoding: [0x00,0x07,0x00,0x0d] # CHECK-NOTRAP: $tmp0: # CHECK-NOTRAP: addiu $1, $zero, -1 # encoding: [0x24,0x01,0xff,0xff] # CHECK-NOTRAP: bne $11, $1, $tmp1 # encoding: [0x15,0x61,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp1-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00] # CHECK-NOTRAP: bne $25, $1, $tmp1 # encoding: [0x17,0x21,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16 +# 
CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp1-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: nop # encoding: [0x00,0x00,0x00,0x00] # CHECK-NOTRAP: break 6 # encoding: [0x00,0x06,0x00,0x0d] # CHECK-NOTRAP: $tmp1: @@ -23,7 +23,7 @@ # CHECK-TRAP: div $zero, $25, $11 # encoding: [0x03,0x2b,0x00,0x1a] # CHECK-TRAP: addiu $1, $zero, -1 # encoding: [0x24,0x01,0xff,0xff] # CHECK-TRAP: bne $11, $1, $tmp0 # encoding: [0x15,0x61,A,A] -# CHECK-TRAP: # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16 +# CHECK-TRAP: # fixup A - offset: 0, value: $tmp0-4, kind: fixup_Mips_PC16 # CHECK-TRAP: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00] # CHECK-TRAP: teq $25, $1, 6 # encoding: [0x03,0x21,0x01,0xb4] # CHECK-TRAP: $tmp0: @@ -31,16 +31,16 @@ div $24,$12 # CHECK-NOTRAP: bnez $12, $tmp2 # encoding: [0x15,0x80,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp2)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp2-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: div $zero, $24, $12 # encoding: [0x03,0x0c,0x00,0x1a] # CHECK-NOTRAP: break 7 # encoding: [0x00,0x07,0x00,0x0d] # CHECK-NOTRAP: $tmp2: # CHECK-NOTRAP: addiu $1, $zero, -1 # encoding: [0x24,0x01,0xff,0xff] # CHECK-NOTRAP: bne $12, $1, $tmp3 # encoding: [0x15,0x81,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp3)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp3-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00] # CHECK-NOTRAP: bne $24, $1, $tmp3 # encoding: [0x17,0x01,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp3)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp3-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: nop # encoding: [0x00,0x00,0x00,0x00] # CHECK-NOTRAP: break 6 # encoding: [0x00,0x06,0x00,0x0d] # CHECK-NOTRAP: $tmp3: @@ -49,7 +49,7 @@ # CHECK-TRAP: div $zero, $24, $12 # encoding: [0x03,0x0c,0x00,0x1a] # CHECK-TRAP: addiu $1, $zero, -1 # encoding: [0x24,0x01,0xff,0xff] # CHECK-TRAP: bne $12, $1, $tmp1 # encoding: [0x15,0x81,A,A] -# CHECK-TRAP: # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16 +# CHECK-TRAP: # fixup A - offset: 0, value: $tmp1-4, kind: fixup_Mips_PC16 # CHECK-TRAP: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00] # CHECK-TRAP: teq $24, $1, 6 # encoding: [0x03,0x01,0x01,0xb4] # CHECK-TRAP: $tmp1: @@ -127,16 +127,16 @@ div $4,$5,$6 # CHECK-NOTRAP: bnez $6, $tmp4 # encoding: [0x14,0xc0,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp4)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp4-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: div $zero, $5, $6 # encoding: [0x00,0xa6,0x00,0x1a] # CHECK-NOTRAP: break 7 # encoding: [0x00,0x07,0x00,0x0d] # CHECK-NOTRAP: $tmp4: # CHECK-NOTRAP: addiu $1, $zero, -1 # encoding: [0x24,0x01,0xff,0xff] # CHECK-NOTRAP: bne $6, $1, $tmp5 # encoding: [0x14,0xc1,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp5)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp5-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00] # CHECK-NOTRAP: bne $5, $1, $tmp5 # encoding: [0x14,0xa1,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp5)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp5-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: nop # encoding: [0x00,0x00,0x00,0x00] # CHECK-NOTRAP: break 6 # encoding: [0x00,0x06,0x00,0x0d] # CHECK-NOTRAP: $tmp5: @@ -145,7 +145,7 @@ # CHECK-TRAP: div $zero, $5, $6 # encoding: [0x00,0xa6,0x00,0x1a] # 
CHECK-TRAP: addiu $1, $zero, -1 # encoding: [0x24,0x01,0xff,0xff] # CHECK-TRAP: bne $6, $1, $tmp2 # encoding: [0x14,0xc1,A,A] -# CHECK-TRAP: # fixup A - offset: 0, value: ($tmp2)-4, kind: fixup_Mips_PC16 +# CHECK-TRAP: # fixup A - offset: 0, value: $tmp2-4, kind: fixup_Mips_PC16 # CHECK-TRAP: lui $1, 32768 # encoding: [0x3c,0x01,0x80,0x00] # CHECK-TRAP: teq $5, $1, 6 # encoding: [0x00,0xa1,0x01,0xb4] # CHECK-TRAP: $tmp2: diff --git a/llvm/test/MC/Mips/macro-divu.s b/llvm/test/MC/Mips/macro-divu.s index a3e8ae067c747..8b4b3ea4dbec2 100644 --- a/llvm/test/MC/Mips/macro-divu.s +++ b/llvm/test/MC/Mips/macro-divu.s @@ -5,7 +5,7 @@ divu $25,$11 # CHECK-NOTRAP: bnez $11, $tmp0 # encoding: [0x15,0x60,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp0-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: divu $zero, $25, $11 # encoding: [0x03,0x2b,0x00,0x1b] # CHECK-NOTRAP: break 7 # encoding: [0x00,0x07,0x00,0x0d] # CHECK-NOTRAP: $tmp0: @@ -13,7 +13,7 @@ divu $24,$12 # CHECK-NOTRAP: bnez $12, $tmp1 # encoding: [0x15,0x80,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp1-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: divu $zero, $24, $12 # encoding: [0x03,0x0c,0x00,0x1b] # CHECK-NOTRAP: break 7 # encoding: [0x00,0x07,0x00,0x0d] # CHECK-NOTRAP: $tmp1: @@ -30,7 +30,7 @@ divu $4,$5,$6 # CHECK-NOTRAP: bnez $6, $tmp2 # encoding: [0x14,0xc0,A,A] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp2)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp2-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: divu $zero, $5, $6 # encoding: [0x00,0xa6,0x00,0x1b] # CHECK-NOTRAP: break 7 # encoding: [0x00,0x07,0x00,0x0d] # CHECK-NOTRAP: $tmp2: diff --git a/llvm/test/MC/Mips/macro-rem.s b/llvm/test/MC/Mips/macro-rem.s index 40812949664d6..a33c4a098ed69 100644 --- a/llvm/test/MC/Mips/macro-rem.s +++ b/llvm/test/MC/Mips/macro-rem.s @@ -5,16 +5,16 @@ rem $4,$5 # CHECK-NOTRAP: bnez $5, $tmp0 # encoding: [A,A,0xa0,0x14] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp0-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: div $zero, $4, $5 # encoding: [0x1a,0x00,0x85,0x00] # CHECK-NOTRAP: break 7 # encoding: [0x0d,0x00,0x07,0x00] # CHECK-NOTRAP: $tmp0 # CHECK-NOTRAP: addiu $1, $zero, -1 # encoding: [0xff,0xff,0x01,0x24] # CHECK-NOTRAP: bne $5, $1, $tmp1 # encoding: [A,A,0xa1,0x14] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp1-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: lui $1, 32768 # encoding: [0x00,0x80,0x01,0x3c] # CHECK-NOTRAP: bne $4, $1, $tmp1 # encoding: [A,A,0x81,0x14] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp1-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: nop # encoding: [0x00,0x00,0x00,0x00] # CHECK-NOTRAP: break 6 # encoding: [0x0d,0x00,0x06,0x00] # CHECK-NOTRAP: $tmp1 diff --git a/llvm/test/MC/Mips/macro-remu.s b/llvm/test/MC/Mips/macro-remu.s index 5e7b150e2105e..6520d17426419 100644 --- a/llvm/test/MC/Mips/macro-remu.s +++ b/llvm/test/MC/Mips/macro-remu.s @@ -5,7 +5,7 @@ remu $4,$5 # CHECK-NOTRAP: bnez $5, $tmp0 # encoding: [A,A,0xa0,0x14] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp0-4, kind: fixup_Mips_PC16 # 
CHECK-NOTRAP: divu $zero, $4, $5 # encoding: [0x1b,0x00,0x85,0x00] # CHECK-NOTRAP: break 7 # encoding: [0x0d,0x00,0x07,0x00] # CHECK-NOTRAP: mfhi $4 # encoding: [0x10,0x20,0x00,0x00] @@ -82,7 +82,7 @@ remu $4,$5,$6 # CHECK-NOTRAP: bnez $6, $tmp1 # encoding: [A,A,0xc0,0x14] -# CHECK-NOTRAP: # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16 +# CHECK-NOTRAP: # fixup A - offset: 0, value: $tmp1-4, kind: fixup_Mips_PC16 # CHECK-NOTRAP: divu $zero, $5, $6 # encoding: [0x1b,0x00,0xa6,0x00] # CHECK-NOTRAP: break 7 # encoding: [0x0d,0x00,0x07,0x00] # CHECK-NOTRAP: $tmp1 diff --git a/llvm/test/MC/Mips/mips-fpu-instructions.s b/llvm/test/MC/Mips/mips-fpu-instructions.s index 733231afb793c..e740372a5e9f1 100644 --- a/llvm/test/MC/Mips/mips-fpu-instructions.s +++ b/llvm/test/MC/Mips/mips-fpu-instructions.s @@ -141,7 +141,7 @@ # FP move instructions #------------------------------------------------------------------------------ # CHECK: bc1f $BB_1 # encoding: [A,A,0x00,0x45] -# CHECK: # fixup A - offset: 0, value: ($BB_1)-4, kind: fixup_Mips_PC16 +# CHECK: # fixup A - offset: 0, value: $BB_1-4, kind: fixup_Mips_PC16 # CHECK: cfc1 $6, $0 # encoding: [0x00,0x00,0x46,0x44] # CHECK: ctc1 $10, $31 # encoding: [0x00,0xf8,0xca,0x44] diff --git a/llvm/test/MC/Mips/mips1/valid.s b/llvm/test/MC/Mips/mips1/valid.s index a67c93846ac98..95d4312845c34 100644 --- a/llvm/test/MC/Mips/mips1/valid.s +++ b/llvm/test/MC/Mips/mips1/valid.s @@ -52,7 +52,7 @@ a: # CHECK-NEXT: # Date: Sun, 30 Mar 2025 16:42:49 -0700 Subject: [PATCH 0060/1029] [lldb-dap] Swapping to not use FLAG_ENUM and just defining typed enums. (#133622) Small tweak to the previous patch to make the enums in `lldb_dap::protocol` typed to work with types like `llvm::DenseSet` found by ubsan. --- lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp | 8 +- lldb/tools/lldb-dap/Protocol/ProtocolBase.h | 11 +- .../lldb-dap/Protocol/ProtocolRequests.h | 35 ++- lldb/tools/lldb-dap/Protocol/ProtocolTypes.h | 235 +++++++++--------- 4 files changed, 148 insertions(+), 141 deletions(-) diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp index 0d63e37d3eafb..87fd0df018b65 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolBase.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "Protocol/ProtocolBase.h" -#include "lldb/lldb-enumerations.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" @@ -32,8 +31,11 @@ static bool mapRaw(const json::Value &Params, StringLiteral Prop, namespace lldb_dap::protocol { -FLAGS_ENUM(MessageType){eMessageTypeRequest, eMessageTypeResponse, - eMessageTypeEvent}; +enum MessageType : unsigned { + eMessageTypeRequest, + eMessageTypeResponse, + eMessageTypeEvent +}; bool fromJSON(const json::Value &Params, MessageType &M, json::Path P) { auto rawType = Params.getAsString(); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolBase.h b/lldb/tools/lldb-dap/Protocol/ProtocolBase.h index 5ac68e38cb9c4..2c647610de11c 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolBase.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolBase.h @@ -20,7 +20,6 @@ #ifndef LLDB_TOOLS_LLDB_DAP_PROTOCOL_H #define LLDB_TOOLS_LLDB_DAP_PROTOCOL_H -#include "lldb/lldb-enumerations.h" #include "llvm/Support/JSON.h" #include #include @@ -65,11 +64,11 @@ struct Event { llvm::json::Value toJSON(const Event &); bool fromJSON(const llvm::json::Value &, Event &, 
llvm::json::Path); -FLAGS_ENUM(ResponseMessage){ - /// The request was cancelled - eResponseMessageCancelled, - /// The request may be retried once the adapter is in a 'stopped' state - eResponseMessageNotStopped, +enum ResponseMessage : unsigned { + /// The request was cancelled + eResponseMessageCancelled, + /// The request may be retried once the adapter is in a 'stopped' state + eResponseMessageNotStopped, }; /// Response for a request. diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index 116cf8516c52e..927106997953a 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -22,7 +22,6 @@ #include "Protocol/ProtocolBase.h" #include "Protocol/ProtocolTypes.h" -#include "lldb/lldb-enumerations.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/JSON.h" #include @@ -57,26 +56,26 @@ bool fromJSON(const llvm::json::Value &, DisconnectArguments &, using DisconnectResponse = VoidResponse; /// Features supported by DAP clients. -FLAGS_ENUM(ClientFeature){ - eClientFeatureVariableType, - eClientFeatureVariablePaging, - eClientFeatureRunInTerminalRequest, - eClientFeatureMemoryReferences, - eClientFeatureProgressReporting, - eClientFeatureInvalidatedEvent, - eClientFeatureMemoryEvent, - /// Client supports the `argsCanBeInterpretedByShell` attribute on the - /// `runInTerminal` request. - eClientFeatureArgsCanBeInterpretedByShell, - eClientFeatureStartDebuggingRequest, - /// The client will interpret ANSI escape sequences in the display of - /// `OutputEvent.output` and `Variable.value` fields when - /// `Capabilities.supportsANSIStyling` is also enabled. - eClientFeatureANSIStyling, +enum ClientFeature : unsigned { + eClientFeatureVariableType, + eClientFeatureVariablePaging, + eClientFeatureRunInTerminalRequest, + eClientFeatureMemoryReferences, + eClientFeatureProgressReporting, + eClientFeatureInvalidatedEvent, + eClientFeatureMemoryEvent, + /// Client supports the `argsCanBeInterpretedByShell` attribute on the + /// `runInTerminal` request. + eClientFeatureArgsCanBeInterpretedByShell, + eClientFeatureStartDebuggingRequest, + /// The client will interpret ANSI escape sequences in the display of + /// `OutputEvent.output` and `Variable.value` fields when + /// `Capabilities.supportsANSIStyling` is also enabled. + eClientFeatureANSIStyling, }; /// Format of paths reported by the debug adapter. -FLAGS_ENUM(PathFormat){ePatFormatPath, ePathFormatURI}; +enum PathFormat : unsigned { ePatFormatPath, ePathFormatURI }; /// Arguments for `initialize` request. 
struct InitializeRequestArguments { diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 463f9dbbaf4ea..8f38c524ea649 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -20,7 +20,6 @@ #ifndef LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_TYPES_H #define LLDB_TOOLS_LLDB_DAP_PROTOCOL_PROTOCOL_TYPES_H -#include "lldb/lldb-enumerations.h" #include "llvm/ADT/DenseSet.h" #include "llvm/Support/JSON.h" #include @@ -57,8 +56,12 @@ struct ExceptionBreakpointsFilter { }; llvm::json::Value toJSON(const ExceptionBreakpointsFilter &); -FLAGS_ENUM(ColumnType){eColumnTypeString, eColumnTypeNumber, eColumnTypeBoolean, - eColumnTypeTimestamp}; +enum ColumnType : unsigned { + eColumnTypeString, + eColumnTypeNumber, + eColumnTypeBoolean, + eColumnTypeTimestamp +}; /// A ColumnDescriptor specifies what module attribute to show in a column of /// the modules view, how to format it, and what the column’s label should be. @@ -87,23 +90,27 @@ llvm::json::Value toJSON(const ColumnDescriptor &); /// Names of checksum algorithms that may be supported by a debug adapter. /// Values: ‘MD5’, ‘SHA1’, ‘SHA256’, ‘timestamp’. -FLAGS_ENUM(ChecksumAlgorithm){eChecksumAlgorithmMD5, eChecksumAlgorithmSHA1, - eChecksumAlgorithmSHA256, - eChecksumAlgorithmTimestamp}; +enum ChecksumAlgorithm : unsigned { + eChecksumAlgorithmMD5, + eChecksumAlgorithmSHA1, + eChecksumAlgorithmSHA256, + eChecksumAlgorithmTimestamp +}; llvm::json::Value toJSON(const ChecksumAlgorithm &); /// Describes one or more type of breakpoint a BreakpointMode applies to. This /// is a non-exhaustive enumeration and may expand as future breakpoint types /// are added. -FLAGS_ENUM(BreakpointModeApplicability){ - /// In `SourceBreakpoint`'s. - eBreakpointModeApplicabilitySource, - /// In exception breakpoints applied in the `ExceptionFilterOptions`. - eBreakpointModeApplicabilityException, - /// In data breakpoints requested in the `DataBreakpointInfo` request. - eBreakpointModeApplicabilityData, - /// In `InstructionBreakpoint`'s. - eBreakpointModeApplicabilityInstruction}; +enum BreakpointModeApplicability : unsigned { + /// In `SourceBreakpoint`'s. + eBreakpointModeApplicabilitySource, + /// In exception breakpoints applied in the `ExceptionFilterOptions`. + eBreakpointModeApplicabilityException, + /// In data breakpoints requested in the `DataBreakpointInfo` request. + eBreakpointModeApplicabilityData, + /// In `InstructionBreakpoint`'s. + eBreakpointModeApplicabilityInstruction +}; llvm::json::Value toJSON(const BreakpointModeApplicability &); /// A `BreakpointMode` is provided as a option when setting breakpoints on @@ -126,101 +133,101 @@ struct BreakpointMode { llvm::json::Value toJSON(const BreakpointMode &); /// Debug Adapter Features flags supported by lldb-dap. -FLAGS_ENUM(AdapterFeature){ - /// The debug adapter supports ANSI escape sequences in styling of - /// `OutputEvent.output` and `Variable.value` fields. - eAdapterFeatureANSIStyling, - /// The debug adapter supports the `breakpointLocations` request. - eAdapterFeatureBreakpointLocationsRequest, - /// The debug adapter supports the `cancel` request. - eAdapterFeatureCancelRequest, - /// The debug adapter supports the `clipboard` context value in the - /// `evaluate` request. - eAdapterFeatureClipboardContext, - /// The debug adapter supports the `completions` request. - eAdapterFeatureCompletionsRequest, - /// The debug adapter supports conditional breakpoints. 
- eAdapterFeatureConditionalBreakpoints, - /// The debug adapter supports the `configurationDone` request. - eAdapterFeatureConfigurationDoneRequest, - /// The debug adapter supports the `asAddress` and `bytes` fields in the - /// `dataBreakpointInfo` request. - eAdapterFeatureDataBreakpointBytes, - /// The debug adapter supports data breakpoints. - eAdapterFeatureDataBreakpoints, - /// The debug adapter supports the delayed loading of parts of the stack, - /// which requires that both the `startFrame` and `levels` arguments and the - /// `totalFrames` result of the `stackTrace` request are supported. - eAdapterFeatureDelayedStackTraceLoading, - /// The debug adapter supports the `disassemble` request. - eAdapterFeatureDisassembleRequest, - /// The debug adapter supports a (side effect free) `evaluate` request for - /// data hovers. - eAdapterFeatureEvaluateForHovers, - /// The debug adapter supports `filterOptions` as an argument on the - /// `setExceptionBreakpoints` request. - eAdapterFeatureExceptionFilterOptions, - /// The debug adapter supports the `exceptionInfo` request. - eAdapterFeatureExceptionInfoRequest, - /// The debug adapter supports `exceptionOptions` on the - /// `setExceptionBreakpoints` request. - eAdapterFeatureExceptionOptions, - /// The debug adapter supports function breakpoints. - eAdapterFeatureFunctionBreakpoints, - /// The debug adapter supports the `gotoTargets` request. - eAdapterFeatureGotoTargetsRequest, - /// The debug adapter supports breakpoints that break execution after a - /// specified number of hits. - eAdapterFeatureHitConditionalBreakpoints, - /// The debug adapter supports adding breakpoints based on instruction - /// references. - eAdapterFeatureInstructionBreakpoints, - /// The debug adapter supports the `loadedSources` request. - eAdapterFeatureLoadedSourcesRequest, - /// The debug adapter supports log points by interpreting the `logMessage` - /// attribute of the `SourceBreakpoint`. - eAdapterFeatureLogPoints, - /// The debug adapter supports the `modules` request. - eAdapterFeatureModulesRequest, - /// The debug adapter supports the `readMemory` request. - eAdapterFeatureReadMemoryRequest, - /// The debug adapter supports restarting a frame. - eAdapterFeatureRestartFrame, - /// The debug adapter supports the `restart` request. In this case a client - /// should not implement `restart` by terminating and relaunching the - /// adapter but by calling the `restart` request. - eAdapterFeatureRestartRequest, - /// The debug adapter supports the `setExpression` request. - eAdapterFeatureSetExpression, - /// The debug adapter supports setting a variable to a value. - eAdapterFeatureSetVariable, - /// The debug adapter supports the `singleThread` property on the execution - /// requests (`continue`, `next`, `stepIn`, `stepOut`, `reverseContinue`, - /// `stepBack`). - eAdapterFeatureSingleThreadExecutionRequests, - /// The debug adapter supports stepping back via the `stepBack` and - /// `reverseContinue` requests. - eAdapterFeatureStepBack, - /// The debug adapter supports the `stepInTargets` request. - eAdapterFeatureStepInTargetsRequest, - /// The debug adapter supports stepping granularities (argument - /// `granularity`) for the stepping requests. - eAdapterFeatureSteppingGranularity, - /// The debug adapter supports the `terminate` request. - eAdapterFeatureTerminateRequest, - /// The debug adapter supports the `terminateThreads` request. 
- eAdapterFeatureTerminateThreadsRequest, - /// The debug adapter supports the `suspendDebuggee` attribute on the - /// `disconnect` request. - eAdapterFeatureSuspendDebuggee, - /// The debug adapter supports a `format` attribute on the `stackTrace`, - /// `variables`, and `evaluate` requests. - eAdapterFeatureValueFormattingOptions, - /// The debug adapter supports the `writeMemory` request. - eAdapterFeatureWriteMemoryRequest, - /// The debug adapter supports the `terminateDebuggee` attribute on the - /// `disconnect` request. - eAdapterFeatureTerminateDebuggee, +enum AdapterFeature : unsigned { + /// The debug adapter supports ANSI escape sequences in styling of + /// `OutputEvent.output` and `Variable.value` fields. + eAdapterFeatureANSIStyling, + /// The debug adapter supports the `breakpointLocations` request. + eAdapterFeatureBreakpointLocationsRequest, + /// The debug adapter supports the `cancel` request. + eAdapterFeatureCancelRequest, + /// The debug adapter supports the `clipboard` context value in the + /// `evaluate` request. + eAdapterFeatureClipboardContext, + /// The debug adapter supports the `completions` request. + eAdapterFeatureCompletionsRequest, + /// The debug adapter supports conditional breakpoints. + eAdapterFeatureConditionalBreakpoints, + /// The debug adapter supports the `configurationDone` request. + eAdapterFeatureConfigurationDoneRequest, + /// The debug adapter supports the `asAddress` and `bytes` fields in the + /// `dataBreakpointInfo` request. + eAdapterFeatureDataBreakpointBytes, + /// The debug adapter supports data breakpoints. + eAdapterFeatureDataBreakpoints, + /// The debug adapter supports the delayed loading of parts of the stack, + /// which requires that both the `startFrame` and `levels` arguments and the + /// `totalFrames` result of the `stackTrace` request are supported. + eAdapterFeatureDelayedStackTraceLoading, + /// The debug adapter supports the `disassemble` request. + eAdapterFeatureDisassembleRequest, + /// The debug adapter supports a (side effect free) `evaluate` request for + /// data hovers. + eAdapterFeatureEvaluateForHovers, + /// The debug adapter supports `filterOptions` as an argument on the + /// `setExceptionBreakpoints` request. + eAdapterFeatureExceptionFilterOptions, + /// The debug adapter supports the `exceptionInfo` request. + eAdapterFeatureExceptionInfoRequest, + /// The debug adapter supports `exceptionOptions` on the + /// `setExceptionBreakpoints` request. + eAdapterFeatureExceptionOptions, + /// The debug adapter supports function breakpoints. + eAdapterFeatureFunctionBreakpoints, + /// The debug adapter supports the `gotoTargets` request. + eAdapterFeatureGotoTargetsRequest, + /// The debug adapter supports breakpoints that break execution after a + /// specified number of hits. + eAdapterFeatureHitConditionalBreakpoints, + /// The debug adapter supports adding breakpoints based on instruction + /// references. + eAdapterFeatureInstructionBreakpoints, + /// The debug adapter supports the `loadedSources` request. + eAdapterFeatureLoadedSourcesRequest, + /// The debug adapter supports log points by interpreting the `logMessage` + /// attribute of the `SourceBreakpoint`. + eAdapterFeatureLogPoints, + /// The debug adapter supports the `modules` request. + eAdapterFeatureModulesRequest, + /// The debug adapter supports the `readMemory` request. + eAdapterFeatureReadMemoryRequest, + /// The debug adapter supports restarting a frame. 
+  eAdapterFeatureRestartFrame,
+  /// The debug adapter supports the `restart` request. In this case a client
+  /// should not implement `restart` by terminating and relaunching the
+  /// adapter but by calling the `restart` request.
+  eAdapterFeatureRestartRequest,
+  /// The debug adapter supports the `setExpression` request.
+  eAdapterFeatureSetExpression,
+  /// The debug adapter supports setting a variable to a value.
+  eAdapterFeatureSetVariable,
+  /// The debug adapter supports the `singleThread` property on the execution
+  /// requests (`continue`, `next`, `stepIn`, `stepOut`, `reverseContinue`,
+  /// `stepBack`).
+  eAdapterFeatureSingleThreadExecutionRequests,
+  /// The debug adapter supports stepping back via the `stepBack` and
+  /// `reverseContinue` requests.
+  eAdapterFeatureStepBack,
+  /// The debug adapter supports the `stepInTargets` request.
+  eAdapterFeatureStepInTargetsRequest,
+  /// The debug adapter supports stepping granularities (argument
+  /// `granularity`) for the stepping requests.
+  eAdapterFeatureSteppingGranularity,
+  /// The debug adapter supports the `terminate` request.
+  eAdapterFeatureTerminateRequest,
+  /// The debug adapter supports the `terminateThreads` request.
+  eAdapterFeatureTerminateThreadsRequest,
+  /// The debug adapter supports the `suspendDebuggee` attribute on the
+  /// `disconnect` request.
+  eAdapterFeatureSuspendDebuggee,
+  /// The debug adapter supports a `format` attribute on the `stackTrace`,
+  /// `variables`, and `evaluate` requests.
+  eAdapterFeatureValueFormattingOptions,
+  /// The debug adapter supports the `writeMemory` request.
+  eAdapterFeatureWriteMemoryRequest,
+  /// The debug adapter supports the `terminateDebuggee` attribute on the
+  /// `disconnect` request.
+  eAdapterFeatureTerminateDebuggee,
 };

 /// Information about the capabilities of a debug adapter.
@@ -261,10 +268,10 @@ struct Capabilities {
 };
 llvm::json::Value toJSON(const Capabilities &);

-FLAGS_ENUM(PresentationHint){
-    ePresentationHintNormal,
-    ePresentationHintEmphasize,
-    ePresentationHintDeemphasize,
+enum PresentationHint : unsigned {
+  ePresentationHintNormal,
+  ePresentationHintEmphasize,
+  ePresentationHintDeemphasize,
 };

 /// A `Source` is a descriptor for source code. It is returned from the debug

From c9d90f15af0c5ed6ad5c5cd3aa988139a2cc34e4 Mon Sep 17 00:00:00 2001
From: Roman Belenov <103195329+r-belenov@users.noreply.github.com>
Date: Mon, 31 Mar 2025 02:59:53 +0300
Subject: [PATCH 0061/1029] [Exegesis][AArch64] Use more generic cycles counter
 (#133376)

The CPU_CYCLES counter does not work on some AArch64 CPUs; CYCLES is more
generic and is equivalent to CPU_CYCLES where the latter is supported.

The longer story: CPU_CYCLES works only on CPU models explicitly recognized
by libpfm4 (via the pfm_arm_detect_*() functions in
https://sourceforge.net/p/perfmon2/libpfm4/ci/master/tree/lib/pfmlib_arm_armv8.c),
and its name is consistent with ARM documentation. However, the counter is
architectural and is supported on all ARMv8 CPUs; libpfm4 recognizes a
generic PMU on unknown ARMv8 CPUs but does not provide the CPU_CYCLES event
there. Instead, CYCLES is provided (an alias for PERF_COUNT_HW_CPU_CYCLES).
Physically, it is the same event, with code 0x11. On CPU models that libpfm4
does recognize, CYCLES also works, so the change should not introduce a
regression.
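For readers unfamiliar with how libpfm4 resolves event names, the following
minimal standalone sketch (illustrative only, not part of this patch) shows
one way to confirm that an event string such as "CYCLES" encodes on the
host. It assumes libpfm4 and its headers are installed and the program is
linked with -lpfm; the file name check_cycles.cpp is made up for the
example:

  // check_cycles.cpp - probe whether an event name encodes via libpfm4.
  // Build (assumption): c++ check_cycles.cpp -lpfm -o check_cycles
  #include <perfmon/pfmlib.h>
  #include <perfmon/pfmlib_perf_event.h>
  #include <cstdio>
  #include <cstring>

  int main() {
    if (pfm_initialize() != PFM_SUCCESS)
      return 1;
    perf_event_attr Attr;
    std::memset(&Attr, 0, sizeof(Attr));
    pfm_perf_encode_arg_t Arg;
    std::memset(&Arg, 0, sizeof(Arg));
    Arg.attr = &Attr;
    Arg.size = sizeof(Arg);
    // "CYCLES" is the generic alias this patch switches to; unlike
    // CPU_CYCLES it also resolves on ARMv8 models that libpfm4 only
    // recognizes as a generic PMU.
    int Ret = pfm_get_os_event_encoding("CYCLES", PFM_PLM3,
                                        PFM_OS_PERF_EVENT, &Arg);
    if (Ret != PFM_SUCCESS) {
      std::fprintf(stderr, "cannot encode CYCLES: %s\n", pfm_strerror(Ret));
      return 1;
    }
    // Attr now holds the type/config pair that would be handed to
    // perf_event_open for this counter.
    std::printf("type=%u config=0x%llx\n", Attr.type,
                (unsigned long long)Attr.config);
    pfm_terminate();
    return 0;
  }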
--- llvm/lib/Target/AArch64/AArch64PfmCounters.td | 2 +- llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64PfmCounters.td b/llvm/lib/Target/AArch64/AArch64PfmCounters.td index b1d1664e3f1b1..c7132b40ca2fe 100644 --- a/llvm/lib/Target/AArch64/AArch64PfmCounters.td +++ b/llvm/lib/Target/AArch64/AArch64PfmCounters.td @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -def CpuCyclesPfmCounter : PfmCounter<"CPU_CYCLES">; +def CpuCyclesPfmCounter : PfmCounter<"CYCLES">; def DefaultPfmCounters : ProcPfmCounters { let CycleCounter = CpuCyclesPfmCounter; diff --git a/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp b/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp index 71675d9f46739..ca5416eef39d5 100644 --- a/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/AArch64/TargetTest.cpp @@ -65,7 +65,7 @@ TEST_F(AArch64TargetTest, SetRegToConstant) { } TEST_F(AArch64TargetTest, DefaultPfmCounters) { - const std::string Expected = "CPU_CYCLES"; + const std::string Expected = "CYCLES"; EXPECT_EQ(ExegesisTarget_->getPfmCounters("").CycleCounter, Expected); EXPECT_EQ(ExegesisTarget_->getPfmCounters("unknown_cpu").CycleCounter, Expected); From 94122d58fc77079a291a3d008914006cb509d9db Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 31 Mar 2025 08:42:51 +0700 Subject: [PATCH 0062/1029] Lint: Replace -lint-abort-on-error cl::opt with pass parameter (#132933) --- llvm/include/llvm/Analysis/Lint.h | 10 +++++++-- llvm/lib/Analysis/Lint.cpp | 27 ++++++++++++----------- llvm/lib/Passes/PassBuilder.cpp | 6 +++++ llvm/lib/Passes/PassRegistry.def | 6 ++++- llvm/test/Analysis/Lint/abort-on-error.ll | 4 ++-- llvm/test/Analysis/Lint/const-store.ll | 4 ++-- 6 files changed, 37 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/Analysis/Lint.h b/llvm/include/llvm/Analysis/Lint.h index 8dffa1ecb5f38..b0bc0ff4e1d89 100644 --- a/llvm/include/llvm/Analysis/Lint.h +++ b/llvm/include/llvm/Analysis/Lint.h @@ -29,14 +29,20 @@ class Function; /// /// This should only be used for debugging, because it plays games with /// PassManagers and stuff. -void lintModule(const Module &M); +void lintModule(const Module &M, bool AbortOnError = false); // Lint a function. 
-void lintFunction(const Function &F);
+void lintFunction(const Function &F, bool AbortOnError = false);

 class LintPass : public PassInfoMixin<LintPass> {
+  const bool AbortOnError;
+
 public:
+  LintPass(bool AbortOnError) : AbortOnError(AbortOnError) {}
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+  void printPipeline(raw_ostream &OS,
+                     function_ref<StringRef(StringRef)> MapClassName2PassName);
 };

 } // namespace llvm
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index a01672844e0ec..f05e36e2025d4 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -78,11 +78,6 @@
 using namespace llvm;

-static const char LintAbortOnErrorArgName[] = "lint-abort-on-error";
-static cl::opt<bool>
-    LintAbortOnError(LintAbortOnErrorArgName, cl::init(false),
-                     cl::desc("In the Lint pass, abort on errors."));
-
 namespace {
 namespace MemRef {
 static const unsigned Read = 1;
@@ -747,20 +742,26 @@ PreservedAnalyses LintPass::run(Function &F, FunctionAnalysisManager &AM) {
   Lint L(Mod, DL, AA, AC, DT, TLI);
   L.visit(F);
   dbgs() << L.MessagesStr.str();
-  if (LintAbortOnError && !L.MessagesStr.str().empty())
-    report_fatal_error(Twine("Linter found errors, aborting. (enabled by --") +
-                           LintAbortOnErrorArgName + ")",
-                       false);
+  if (AbortOnError && !L.MessagesStr.str().empty())
+    report_fatal_error(
+        "linter found errors, aborting. (enabled by abort-on-error)", false);
   return PreservedAnalyses::all();
 }

+void LintPass::printPipeline(
+    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
+  PassInfoMixin<LintPass>::printPipeline(OS, MapClassName2PassName);
+  if (AbortOnError)
+    OS << "<abort-on-error>";
+}
+
 //===----------------------------------------------------------------------===//
 // Implement the public interfaces to this file...
 //===----------------------------------------------------------------------===//

 /// lintFunction - Check a function for errors, printing messages on stderr.
 ///
-void llvm::lintFunction(const Function &f) {
+void llvm::lintFunction(const Function &f, bool AbortOnError) {
   Function &F = const_cast<Function &>(f);
   assert(!F.isDeclaration() && "Cannot lint external functions");
@@ -775,14 +776,14 @@ void llvm::lintFunction(const Function &f) {
     AA.registerFunctionAnalysis<BasicAA>();
     return AA;
   });
-  LintPass().run(F, FAM);
+  LintPass(AbortOnError).run(F, FAM);
 }

 /// lintModule - Check a module for errors, printing messages on stderr.
 ///
-void llvm::lintModule(const Module &M) {
+void llvm::lintModule(const Module &M, bool AbortOnError) {
   for (const Function &F : M) {
     if (!F.isDeclaration())
-      lintFunction(F);
+      lintFunction(F, AbortOnError);
   }
 }
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 1b37e4a4fe1a3..8646c1f49ac35 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -681,6 +681,12 @@ Expected<HardwareLoopOptions> parseHardwareLoopOptions(StringRef Params) {
   return HardwareLoopOpts;
 }

+/// Parser of parameters for Lint pass.
+Expected<bool> parseLintOptions(StringRef Params) {
+  return PassBuilder::parseSinglePassOption(Params, "abort-on-error",
+                                            "LintPass");
+}
+
 /// Parser of parameters for LoopUnroll pass.
 Expected<LoopUnrollOptions> parseLoopUnrollOptions(StringRef Params) {
   LoopUnrollOptions UnrollOpts;
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 49135c5e1a658..a43be480d6194 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -397,7 +397,6 @@ FUNCTION_PASS("kcfi", KCFIPass())
 FUNCTION_PASS("kernel-info", KernelInfoPrinter(TM))
 FUNCTION_PASS("lcssa", LCSSAPass())
 FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass())
-FUNCTION_PASS("lint", LintPass())
 FUNCTION_PASS("load-store-vectorizer", LoadStoreVectorizerPass())
 FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
 FUNCTION_PASS("loop-distribute", LoopDistributePass())
@@ -543,6 +542,11 @@ FUNCTION_PASS_WITH_PARAMS(
     parseInstCombineOptions,
     "no-use-loop-info;use-loop-info;no-verify-fixpoint;verify-fixpoint;"
     "max-iterations=N")
+FUNCTION_PASS_WITH_PARAMS(
+    "lint", "LintPass",
+    [](bool AbortOnError) { return LintPass(AbortOnError); },
+    parseLintOptions,
+    "abort-on-error")
 FUNCTION_PASS_WITH_PARAMS(
     "loop-unroll", "LoopUnrollPass",
     [](LoopUnrollOptions Opts) { return LoopUnrollPass(Opts); },
diff --git a/llvm/test/Analysis/Lint/abort-on-error.ll b/llvm/test/Analysis/Lint/abort-on-error.ll
index 3efc38aea887c..0bbbcfa9d7418 100644
--- a/llvm/test/Analysis/Lint/abort-on-error.ll
+++ b/llvm/test/Analysis/Lint/abort-on-error.ll
@@ -1,8 +1,8 @@
-; RUN: not opt -passes=lint -disable-output --lint-abort-on-error %s 2>&1 | FileCheck %s
+; RUN: not opt -passes='lint<abort-on-error>' -disable-output %s 2>&1 | FileCheck %s

 ; CHECK: Undefined behavior: Division by zero
 ; CHECK-NEXT: %b = sdiv i32 %a, 0
-; CHECK-NEXT: LLVM ERROR: Linter found errors, aborting. (enabled by --lint-abort-on-error)
+; CHECK-NEXT: LLVM ERROR: linter found errors, aborting. (enabled by abort-on-error)

 define i32 @sdiv_by_zero(i32 %a) {
   %b = sdiv i32 %a, 0
diff --git a/llvm/test/Analysis/Lint/const-store.ll b/llvm/test/Analysis/Lint/const-store.ll
index 030a0be3aecc2..748f752b2975f 100644
--- a/llvm/test/Analysis/Lint/const-store.ll
+++ b/llvm/test/Analysis/Lint/const-store.ll
@@ -1,6 +1,6 @@
-; RUN: not opt --mtriple=amdgcn --passes=lint --lint-abort-on-error %s -disable-output 2>&1 | FileCheck %s
+; RUN: not opt --mtriple=amdgcn --passes='lint<abort-on-error>' %s -disable-output 2>&1 | FileCheck %s
 ; RUN: opt --mtriple=amdgcn --mcpu=gfx1030 --passes=lint %s -disable-output 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: opt --mtriple=x86_64 --passes=lint --lint-abort-on-error %s -disable-output 2>&1 | FileCheck %s --allow-empty --check-prefix=NOERR
+; RUN: opt --mtriple=x86_64 --passes='lint<abort-on-error>' %s -disable-output 2>&1 | FileCheck %s --allow-empty --check-prefix=NOERR
 ; NOERR: {{^$}}

 define amdgpu_kernel void @store_const(ptr addrspace(4) %out, i32 %a, i32 %b) {

From 6257621f41d1deb31cfbfcee993a75991a0bca13 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 30 Mar 2025 18:43:02 -0700
Subject: [PATCH 0063/1029] [llvm] Use llvm::append_range (NFC) (#133658)

---
 llvm/include/llvm/CodeGen/TileShapeInfo.h          |  3 +--
 .../Vectorize/SandboxVectorizer/Legality.h         |  3 +--
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp          |  4 +---
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp          | 15 +++++----------
 llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp  |  3 +--
 llvm/lib/DebugInfo/LogicalView/Core/LVSymbol.cpp   |  3 +--
 .../LogicalView/Readers/LVCodeViewReader.cpp       |  3 +--
 llvm/lib/FileCheck/FileCheck.cpp                   |  6 ++----
 llvm/lib/ObjCopy/COFF/COFFReader.cpp               |  3 +--
 llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp             |  3 +--
 llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp  |  4 +---
 llvm/lib/Support/CommandLine.cpp                   |  4 ++--
 llvm/lib/Support/Debug.cpp                         |  3 +--
 llvm/lib/Support/SuffixTree.cpp                    |  3 +--
 llvm/lib/TableGen/Record.cpp                       |  3 +--
 llvm/utils/not/not.cpp                             |  5 +----
 16 files changed, 22 insertions(+), 46 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TileShapeInfo.h b/llvm/include/llvm/CodeGen/TileShapeInfo.h
index 24f303a7d9d13..9cea327819895 100644
--- a/llvm/include/llvm/CodeGen/TileShapeInfo.h
+++ b/llvm/include/llvm/CodeGen/TileShapeInfo.h
@@ -48,8 +48,7 @@ class ShapeT {
         ColImm(InvalidImmShape) {
     assert(ShapesOperands.size() % 2 == 0 && "Miss row or col!");

-    for (auto *Shape : ShapesOperands)
-      Shapes.push_back(Shape);
+    llvm::append_range(Shapes, ShapesOperands);

     if (MRI)
       deduceImm(MRI);
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index 4ea840c099e70..f74dcecb7e3e6 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -43,8 +43,7 @@ class ShuffleMask {
   static ShuffleMask getIdentity(unsigned Sz) {
     IndicesVecT Indices;
     Indices.reserve(Sz);
-    for (auto Idx : seq<int>(0, (int)Sz))
-      Indices.push_back(Idx);
+    llvm::append_range(Indices, seq<int>(0, (int)Sz));
     return ShuffleMask(std::move(Indices));
   }
   /// \Returns true if the mask is a perfect identity mask with consecutive
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 40e755902b724..b0d9bcc384101 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -3339,10 +3339,8 @@ Error BitcodeReader::parseConstants() {
       if (Record.empty())
        return error("Invalid aggregate record");

-      unsigned Size = Record.size();
       SmallVector<unsigned, 16> Elts;
-      for (unsigned i = 0; i != Size; ++i)
-        Elts.push_back(Record[i]);
+      llvm::append_range(Elts, Record);

       if (isa<StructType>(CurTy)) {
         V = BitcodeConstant::create(
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 34ba25dccc368..49411098d9c0c 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -1209,8 +1209,7 @@ void ModuleBitcodeWriter::writeTypeTable() {
       TypeVals.push_back(TET->getNumTypeParameters());
       for (Type *InnerTy : TET->type_params())
         TypeVals.push_back(VE.getTypeID(InnerTy));
-      for (unsigned IntParam : TET->int_params())
-        TypeVals.push_back(IntParam);
+      llvm::append_range(TypeVals, TET->int_params());
       break;
     }
     case Type::TypedPointerTyID:
@@ -4303,10 +4302,8 @@ static void writeFunctionHeapProfileRecords(
       }
       for (auto Id : CI.StackIdIndices)
         Record.push_back(GetStackIndex(Id));
-      if (!PerModule) {
-        for (auto V : CI.Clones)
-          Record.push_back(V);
-      }
+      if (!PerModule)
+        llvm::append_range(Record, CI.Clones);
       Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_CALLSITE_INFO
                                   : bitc::FS_COMBINED_CALLSITE_INFO,
                         Record, CallsiteAbbrev);
@@ -4326,10 +4323,8 @@
       assert(CallStackCount <= CallStackPos.size());
       Record.push_back(CallStackPos[CallStackCount++]);
     }
-    if (!PerModule) {
-      for (auto V : AI.Versions)
-        Record.push_back(V);
-    }
+    if (!PerModule)
+      llvm::append_range(Record, AI.Versions);
     assert(AI.ContextSizeInfos.empty() ||
            AI.ContextSizeInfos.size() == AI.MIBs.size());
     // Optionally emit the context size information if it exists.
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
index c8789cb959fb7..8050c0efdd7cb 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVOptions.cpp
@@ -445,8 +445,7 @@ void LVPatterns::addGenericPatterns(StringSet<> &Patterns) {
 }

 void LVPatterns::addOffsetPatterns(const LVOffsetSet &Patterns) {
-  for (const LVOffset &Entry : Patterns)
-    OffsetMatchInfo.push_back(Entry);
+  llvm::append_range(OffsetMatchInfo, Patterns);
   if (OffsetMatchInfo.size()) {
     options().setSelectOffsetPattern();
     options().setSelectExecute();
diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVSymbol.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVSymbol.cpp
index 4608fe20cb6df..44d073387206e 100644
--- a/llvm/lib/DebugInfo/LogicalView/Core/LVSymbol.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Core/LVSymbol.cpp
@@ -182,8 +182,7 @@ void LVSymbol::getLocations(LVLocations &LocationList) const {
   if (!Locations)
     return;

-  for (LVLocation *Location : *Locations)
-    LocationList.push_back(Location);
+  llvm::append_range(LocationList, *Locations);
 }

 // Calculate coverage factor.
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
index 8074f1a9fddfb..e5895516b5e77 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
@@ -163,8 +163,7 @@ void LVCodeViewReader::cacheRelocations() {
     const coff_section *CoffSection = getObj().getCOFFSection(Section);

     auto &RM = RelocMap[CoffSection];
-    for (const RelocationRef &Relocacion : Section.relocations())
-      RM.push_back(Relocacion);
+    llvm::append_range(RM, Section.relocations());

     // Sort relocations by address.
     llvm::sort(RM, [](RelocationRef L, RelocationRef R) {
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index 10ca5f4d122bc..71b47a04fd131 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -1640,13 +1640,11 @@ static const char *DefaultCommentPrefixes[] = {"COM", "RUN"};

 static void addDefaultPrefixes(FileCheckRequest &Req) {
   if (Req.CheckPrefixes.empty()) {
-    for (const char *Prefix : DefaultCheckPrefixes)
-      Req.CheckPrefixes.push_back(Prefix);
+    llvm::append_range(Req.CheckPrefixes, DefaultCheckPrefixes);
     Req.IsDefaultCheckPrefix = true;
   }
   if (Req.CommentPrefixes.empty())
-    for (const char *Prefix : DefaultCommentPrefixes)
-      Req.CommentPrefixes.push_back(Prefix);
+    llvm::append_range(Req.CommentPrefixes, DefaultCommentPrefixes);
 }

 struct PrefixMatcher {
diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
index 32aceb805a2a0..62a71d41ded5f 100644
--- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
@@ -70,8 +70,7 @@ Error COFFReader::readSections(Object &Obj) const {
       return E;
     S.setContentsRef(Contents);
     ArrayRef<coff_relocation> Relocs = COFFObj.getRelocations(Sec);
-    for (const coff_relocation &R : Relocs)
-      S.Relocs.push_back(R);
+    llvm::append_range(S.Relocs, Relocs);
    if (Expected<StringRef> NameOrErr = COFFObj.getSectionName(Sec))
       S.Name = *NameOrErr;
     else
diff --git a/llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp b/llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp
index 8ad3021a03428..e6018ebfbec21 100644
--- a/llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp
+++ b/llvm/lib/ObjCopy/XCOFF/XCOFFReader.cpp
@@ -38,8 +38,7 @@ Error XCOFFReader::readSections(Object &Obj) const {
           XCOFFObj.relocations<XCOFFSectionHeader32, XCOFFRelocation32>(Sec);
       if (!Relocations)
         return Relocations.takeError();
-      for (const XCOFFRelocation32 &Rel : Relocations.get())
-        ReadSec.Relocations.push_back(Rel);
+      llvm::append_range(ReadSec.Relocations, Relocations.get());
     }

     Obj.Sections.push_back(std::move(ReadSec));
diff --git a/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp b/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
index 527265410809c..381330b98f711 100644
--- a/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
+++ b/llvm/lib/ObjectYAML/CodeViewYAMLDebugSections.cpp
@@ -735,9 +735,7 @@ Expected<std::shared_ptr<YAMLCoffSymbolRVASubsection>>
 YAMLCoffSymbolRVASubsection::fromCodeViewSubsection(
     const DebugSymbolRVASubsectionRef &Section) {
   auto Result = std::make_shared<YAMLCoffSymbolRVASubsection>();
-  for (const auto &RVA : Section) {
-    Result->RVAs.push_back(RVA);
-  }
+  llvm::append_range(Result->RVAs, Section);
   return Result;
 }

diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index e34a770b1b53e..f1dd39ce133a8 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -2435,8 +2435,8 @@ class CategorizedHelpPrinter : public HelpPrinter {

     // Collect registered option categories into vector in preparation for
     // sorting.
-    for (OptionCategory *Category : GlobalParser->RegisteredOptionCategories)
-      SortedCategories.push_back(Category);
+    llvm::append_range(SortedCategories,
+                       GlobalParser->RegisteredOptionCategories);

     // Sort the different option categories alphabetically.
     assert(SortedCategories.size() > 0 && "No option categories registered!");
diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp
index 98a9ac4722b50..5bb04d0c22998 100644
--- a/llvm/lib/Support/Debug.cpp
+++ b/llvm/lib/Support/Debug.cpp
@@ -73,8 +73,7 @@ void setCurrentDebugType(const char *Type) {

 void setCurrentDebugTypes(const char **Types, unsigned Count) {
   CurrentDebugType->clear();
-  for (size_t T = 0; T < Count; ++T)
-    CurrentDebugType->push_back(Types[T]);
+  llvm::append_range(*CurrentDebugType, ArrayRef(Types, Count));
 }

 } // namespace llvm
diff --git a/llvm/lib/Support/SuffixTree.cpp b/llvm/lib/Support/SuffixTree.cpp
index 5abcead5037f4..b2e606c86dd57 100644
--- a/llvm/lib/Support/SuffixTree.cpp
+++ b/llvm/lib/Support/SuffixTree.cpp
@@ -348,8 +348,7 @@ void SuffixTree::RepeatedSubstringIterator::advance() {
       // Yes. Update the state to reflect this, and then bail out.
       N = Curr;
       RS.Length = Length;
-      for (unsigned StartIdx : RepeatedSubstringStarts)
-        RS.StartIndices.push_back(StartIdx);
+      llvm::append_range(RS.StartIndices, RepeatedSubstringStarts);
       break;
     }
     // At this point, either NewRS is an empty RepeatedSubstring, or it was
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index d182b647aa931..0fdf78976b691 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1015,8 +1015,7 @@ const Init *UnOpInit::Fold(const Record *CurRec, bool IsFinal) const {
         const auto *InnerList = dyn_cast<ListInit>(InnerInit);
         if (!InnerList)
           return std::nullopt;
-        for (const Init *InnerElem : InnerList->getValues())
-          Flattened.push_back(InnerElem);
+        llvm::append_range(Flattened, InnerList->getValues());
       };
       return Flattened;
     };
diff --git a/llvm/utils/not/not.cpp b/llvm/utils/not/not.cpp
index 6ba59190d8ada..6f270cb0f7783 100644
--- a/llvm/utils/not/not.cpp
+++ b/llvm/utils/not/not.cpp
@@ -57,10 +57,7 @@ int main(int argc, const char **argv) {
     return 1;
   }

-  std::vector<StringRef> Argv;
-  Argv.reserve(argc);
-  for (int i = 0; i < argc; ++i)
-    Argv.push_back(argv[i]);
+  SmallVector<StringRef> Argv(ArrayRef(argv, argc));
   std::string ErrMsg;
   int Result =
       sys::ExecuteAndWait(*Program, Argv, std::nullopt, {}, 0, 0, &ErrMsg);

From 65734de9b93bef5b3211298b4fcc5dc79d18d31e Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Mon, 31 Mar 2025 10:26:45 +0800
Subject: [PATCH 0064/1029] [SLP] NFC. Remove the redundant MainOp and AltOp
 find process. (#133642)

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 850895895d44d..a4b0378abc075 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3733,16 +3733,8 @@ class BoUpSLP {
       Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
     }
     if (EntryState == TreeEntry::SplitVectorize) {
-      auto *MainOp =
-          cast<Instruction>(*find_if(Last->Scalars, IsaPred<Instruction>));
-      auto *AltOp = cast<Instruction>(*find_if(Last->Scalars, [=](Value *V) {
-        auto *I = dyn_cast<Instruction>(V);
-        if (!I)
-          return false;
-        InstructionsState LocalS = getSameOpcode({I, MainOp}, *TLI);
-        return !LocalS || LocalS.isAltShuffle();
-      }));
-      Last->setOperations(InstructionsState(MainOp, AltOp));
+      assert(S && "Split nodes must have operations.");
+      Last->setOperations(S);
       SmallPtrSet<Value *, 4> Processed;
       for (Value *V : VL) {
         auto *I = dyn_cast<Instruction>(V);

From dad86f5931453ff3d14ba5adb93855ee780298b2 Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Mon, 31 Mar 2025 13:08:02 +1100
Subject: [PATCH 0065/1029] [ORC] MapperJITLinkMemoryManager should
 deinitialize on abandon, not deallocate.

The JITLinkMemoryManager::InFlightAlloc::abandon method should only abandon
memory for the current allocation, not any other allocations. In
MapperJITLinkMemoryManager this corresponds to the deinitialize operation,
not the deallocate operation (which releases whole slabs of memory that may
be shared by many allocations).

No testcase: This was spotted by inspection. The failing program was linking
concurrently when one linker instance raised an error. Through the call to
abandon, an entire underlying slab was deallocated, resulting in segfaults
in other concurrent links that were sharing that slab.
---
 llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
index 93fe7eeb3ed5b..33734b8253689 100644
--- a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
@@ -43,7 +43,7 @@ class MapperJITLinkMemoryManager::InFlightAlloc
   }

   void abandon(OnAbandonedFunction OnFinalize) override {
-    Parent.Mapper->release({AllocAddr}, std::move(OnFinalize));
+    Parent.Mapper->deinitialize({AllocAddr}, std::move(OnFinalize));
   }

 private:

From c9095aa3103460c967fd5ee5dcc695284793ef3c Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 30 Mar 2025 21:10:08 -0700
Subject: [PATCH 0066/1029] [RISCV] Cleanup assembler predicates after
 #133377. (#133652)

Make isSImm12 look more like isUImm20LUI. Move variables closer to their
use. Fold some function calls into if statements.
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 52 ++++++++++---------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 52b38c19873c1..63d0777e4ff52 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -555,50 +555,55 @@ struct RISCVOperand final : public MCParsedAsmOperand {

   bool isBareSymbol() const {
     int64_t Imm;
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
     // Must be of 'immediate' type but not a constant.
if (!isImm() || evaluateConstantImm(getImm(), Imm)) return false; + + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; return RISCVAsmParser::classifySymbolRef(getImm(), VK) && VK == RISCVMCExpr::VK_None; } bool isCallSymbol() const { int64_t Imm; - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm)) return false; + + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; return RISCVAsmParser::classifySymbolRef(getImm(), VK) && (VK == RISCVMCExpr::VK_CALL || VK == RISCVMCExpr::VK_CALL_PLT); } bool isPseudoJumpSymbol() const { int64_t Imm; - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm)) return false; + + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; return RISCVAsmParser::classifySymbolRef(getImm(), VK) && VK == RISCVMCExpr::VK_CALL; } bool isTPRelAddSymbol() const { int64_t Imm; - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm)) return false; + + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; return RISCVAsmParser::classifySymbolRef(getImm(), VK) && VK == RISCVMCExpr::VK_TPREL_ADD; } bool isTLSDESCCallSymbol() const { int64_t Imm; - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; // Must be of 'immediate' type but not a constant. if (!isImm() || evaluateConstantImm(getImm(), Imm)) return false; + + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; return RISCVAsmParser::classifySymbolRef(getImm(), VK) && VK == RISCVMCExpr::VK_TLSDESC_CALL; } @@ -838,19 +843,17 @@ struct RISCVOperand final : public MCParsedAsmOperand { } bool isSImm12() const { - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; - int64_t Imm; - bool IsValid; if (!isImm()) return false; - bool IsConstantImm = evaluateConstantImm(getImm(), Imm); - if (!IsConstantImm) - IsValid = RISCVAsmParser::classifySymbolRef(getImm(), VK); - else - IsValid = isInt<12>(fixImmediateForRV32(Imm, isRV64Imm())); - return IsValid && - (IsConstantImm || VK == RISCVMCExpr::VK_LO || - VK == RISCVMCExpr::VK_PCREL_LO || VK == RISCVMCExpr::VK_TPREL_LO || + + int64_t Imm; + if (evaluateConstantImm(getImm(), Imm)) + return isInt<12>(fixImmediateForRV32(Imm, isRV64Imm())); + + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; + return RISCVAsmParser::classifySymbolRef(getImm(), VK) && + (VK == RISCVMCExpr::VK_LO || VK == RISCVMCExpr::VK_PCREL_LO || + VK == RISCVMCExpr::VK_TPREL_LO || VK == RISCVMCExpr::VK_TLSDESC_LOAD_LO || VK == RISCVMCExpr::VK_TLSDESC_ADD_LO); } @@ -873,26 +876,27 @@ struct RISCVOperand final : public MCParsedAsmOperand { } bool isUImm20LUI() const { - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; - int64_t Imm; if (!isImm()) return false; - bool IsConstantImm = evaluateConstantImm(getImm(), Imm); - if (IsConstantImm) + + int64_t Imm; + if (evaluateConstantImm(getImm(), Imm)) return isUInt<20>(Imm); + + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; return RISCVAsmParser::classifySymbolRef(getImm(), VK) && (VK == RISCVMCExpr::VK_HI || VK == RISCVMCExpr::VK_TPREL_HI); } bool isUImm20AUIPC() const { - RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; - int64_t Imm; if (!isImm()) return false; - bool IsConstantImm = evaluateConstantImm(getImm(), Imm); - if (IsConstantImm) + + int64_t Imm; + if (evaluateConstantImm(getImm(), Imm)) return isUInt<20>(Imm); + RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; return 
RISCVAsmParser::classifySymbolRef(getImm(), VK) && (VK == RISCVMCExpr::VK_PCREL_HI || VK == RISCVMCExpr::VK_GOT_HI || VK == RISCVMCExpr::VK_TLS_GOT_HI || From 04a67528d303ac4be7943b2ae57222f9c9fd509a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 30 Mar 2025 22:03:14 -0700 Subject: [PATCH 0067/1029] [MC] Simplify MCBinaryExpr/MCUnaryExpr printing by reducing parentheses (#133674) The existing pretty printer generates excessive parentheses for MCBinaryExpr expressions. This update removes unnecessary parentheses of MCBinaryExpr with +/- operators and MCUnaryExpr. Since relocatable expressions only use + and -, this change improves readability in most cases. Examples: - (SymA - SymB) + C now prints as SymA - SymB + C. This updates the output of -fexperimental-relative-c++-abi-vtables for AArch64 and x86 to `.long _ZN1B3fooEv@PLT-_ZTV1B-8` - expr + (MCTargetExpr) now prints as expr + MCTargetExpr, with this change primarily affecting AMDGPUMCExpr. --- llvm/include/llvm/MC/MCExpr.h | 3 +- llvm/lib/MC/MCExpr.cpp | 51 +- .../CodeGen/AMDGPU/agpr-register-count.ll | 14 +- .../AMDGPU/call-alias-register-usage-agpr.ll | 4 +- .../AMDGPU/call-alias-register-usage1.ll | 2 +- .../AMDGPU/call-alias-register-usage2.ll | 2 +- .../AMDGPU/call-alias-register-usage3.ll | 2 +- .../CodeGen/AMDGPU/function-resource-usage.ll | 48 +- ...-knownbits-assign-crash-gh-issue-110930.ll | 14 +- .../multi-call-resource-usage-mcexpr.ll | 8 +- llvm/test/CodeGen/AMDGPU/recursion.ll | 18 +- .../AMDGPU/recursive-resource-usage-mcexpr.ll | 6 +- .../AMDGPU/resource-optimization-remarks.ll | 4 +- .../AMDGPU/unnamed-function-resource-info.ll | 4 +- .../ARM/GlobalISel/arm-isel-globals-pic.ll | 4 +- .../ARM/GlobalISel/thumb-isel-globals-pic.ll | 4 +- llvm/test/CodeGen/ARM/elf-preemption.ll | 4 +- llvm/test/CodeGen/ARM/globals.ll | 2 +- llvm/test/CodeGen/ARM/litpool-licm.ll | 4 +- llvm/test/CodeGen/ARM/load-global.ll | 2 +- llvm/test/CodeGen/ARM/load-global2.ll | 2 +- llvm/test/CodeGen/ARM/plt-relative-reloc.ll | 6 +- llvm/test/CodeGen/ARM/stack-guard-elf.ll | 20 +- llvm/test/CodeGen/ARM/stack-guard-rwpi.ll | 2 +- ...aix-small-local-dynamic-tls-largeaccess.ll | 48 +- .../aix-small-local-exec-tls-largeaccess.ll | 32 +- .../aix-small-local-exec-tls-largeaccess2.ll | 8 +- .../aix-small-tls-globalvarattr-funcattr.ll | 4 +- .../aix-small-tls-globalvarattr-targetattr.ll | 4 +- .../CodeGen/RISCV/dso_local_equivalent.ll | 4 +- llvm/test/CodeGen/RISCV/plt-relative-reloc.ll | 4 +- llvm/test/CodeGen/Thumb2/tpsoft.ll | 2 +- llvm/test/CodeGen/X86/abi-isel.ll | 96 +- llvm/test/CodeGen/X86/atomic-minmax-i6432.ll | 2 +- .../X86/callbr-asm-instr-scheduling.ll | 2 +- .../CodeGen/X86/inline-asm-i-constraint-i1.ll | 2 +- llvm/test/CodeGen/X86/relptr-rodata.ll | 2 +- llvm/test/CodeGen/X86/wineh-coreclr.ll | 84 +- .../CodeGen/X86/x86-64-plt-relative-reloc.ll | 8 +- .../CodeGen/X86/x86-plt-relative-reloc.ll | 6 +- llvm/test/MC/AArch64/elf-reloc-ptrauth.s | 12 +- llvm/test/MC/AMDGPU/expressions.s | 4 +- llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s | 46 +- llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s | 46 +- llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s | 44 +- llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s | 34 +- llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s | 34 +- llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s | 38 +- llvm/test/MC/AMDGPU/mcexpr_amd.s | 4 +- llvm/test/MC/ARM/basic-arm-instructions.s | 4 +- llvm/test/MC/ARM/elf-movt.s | 4 +- llvm/test/MC/ARM/macho-word-reloc-thumb.s | 2 +- llvm/test/MC/AVR/inst-brbc.s | 8 +- llvm/test/MC/AVR/inst-brbs.s | 8 +- 
llvm/test/MC/AVR/inst-brcc.s | 12 +- llvm/test/MC/AVR/inst-brcs.s | 12 +- llvm/test/MC/AVR/inst-breq.s | 12 +- llvm/test/MC/AVR/inst-brge.s | 8 +- llvm/test/MC/AVR/inst-brhc.s | 8 +- llvm/test/MC/AVR/inst-brhs.s | 8 +- llvm/test/MC/AVR/inst-brid.s | 8 +- llvm/test/MC/AVR/inst-brie.s | 8 +- llvm/test/MC/AVR/inst-brlo.s | 8 +- llvm/test/MC/AVR/inst-brlt.s | 8 +- llvm/test/MC/AVR/inst-brmi.s | 8 +- llvm/test/MC/AVR/inst-brne.s | 12 +- llvm/test/MC/AVR/inst-brpl.s | 8 +- llvm/test/MC/AVR/inst-brsh.s | 8 +- llvm/test/MC/AVR/inst-brtc.s | 8 +- llvm/test/MC/AVR/inst-brts.s | 8 +- llvm/test/MC/AVR/inst-brvc.s | 8 +- llvm/test/MC/AVR/inst-brvs.s | 8 +- llvm/test/MC/AVR/inst-rcall.s | 16 +- llvm/test/MC/AVR/inst-rjmp.s | 28 +- llvm/test/MC/AsmParser/directive_fill.s | 2 +- .../test/MC/AsmParser/expr_symbol_modifiers.s | 2 +- llvm/test/MC/COFF/cross-section-relative.ll | 6 +- llvm/test/MC/ELF/reloc-directive.s | 4 +- llvm/test/MC/Lanai/memory.s | 4 +- .../test/MC/MachO/AArch64/cstexpr-gotpcrel.ll | 4 +- llvm/test/MC/Mips/expr1.s | 16 +- llvm/test/MC/Mips/memory-offsets.s | 8 +- llvm/test/MC/PowerPC/ppc32-tls.s | 2 +- llvm/test/MC/RISCV/rvi-pseudos.s | 2 +- llvm/test/MC/SystemZ/insn-good-z196.s | 10 +- llvm/test/MC/SystemZ/insn-good-zEC12.s | 30 +- llvm/test/MC/SystemZ/insn-good.s | 820 +++++++++--------- .../MachO/ARM/symbolized-disassembly.test | 4 +- .../MachO/ARM/symbolized-subtractor.test | 4 +- 89 files changed, 980 insertions(+), 968 deletions(-) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index d6829f2bcc734..12830ee648ae0 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -81,7 +81,8 @@ class MCExpr { /// \name Utility Methods /// @{ - void print(raw_ostream &OS, const MCAsmInfo *MAI) const; + void print(raw_ostream &OS, const MCAsmInfo *MAI, + int SurroundingPrec = 0) const; void dump() const; /// Returns whether the given symbol is used anywhere in the expression or diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 253247561354b..fa5c3dab1f115 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -37,10 +37,22 @@ STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations"); } // end namespace stats } // end anonymous namespace +static int getPrecedence(MCBinaryExpr::Opcode Op) { + switch (Op) { + case MCBinaryExpr::Add: + case MCBinaryExpr::Sub: + return 1; + default: + return 0; + } +} + // VariantKind printing and formatting utilize MAI. operator<< (dump and some // target code) specifies MAI as nullptr and should be avoided when MAI is // needed. -void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const { +void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, + int SurroundingPrec) const { + constexpr int MaxPrec = 9; switch (getKind()) { case MCExpr::Target: return cast(this)->printImpl(OS, MAI); @@ -98,24 +110,26 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const { case MCUnaryExpr::Not: OS << '~'; break; case MCUnaryExpr::Plus: OS << '+'; break; } - bool Binary = UE.getSubExpr()->getKind() == MCExpr::Binary; - if (Binary) OS << "("; - UE.getSubExpr()->print(OS, MAI); - if (Binary) OS << ")"; + UE.getSubExpr()->print(OS, MAI, MaxPrec); return; } case MCExpr::Binary: { const MCBinaryExpr &BE = cast(*this); - - // Only print parens around the LHS if it is non-trivial. - if (isa(BE.getLHS()) || isa(BE.getLHS())) { - BE.getLHS()->print(OS, MAI); - } else { + // We want to avoid redundant parentheses for relocatable expressions like + // a-b+c. 
+ // + // Print '(' if the current operator has lower precedence than the + // surrounding operator, or if the surrounding operator's precedence is + // unknown (set to HighPrecedence). + int Prec = getPrecedence(BE.getOpcode()); + bool Paren = Prec < SurroundingPrec; + if (Paren) OS << '('; - BE.getLHS()->print(OS, MAI); - OS << ')'; - } + // Many operators' precedence is different from C. Set the precedence to + // HighPrecedence for unknown operators. + int SubPrec = Prec ? Prec : MaxPrec; + BE.getLHS()->print(OS, MAI, SubPrec); switch (BE.getOpcode()) { case MCBinaryExpr::Add: @@ -123,6 +137,8 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const { if (const MCConstantExpr *RHSC = dyn_cast(BE.getRHS())) { if (RHSC->getValue() < 0) { OS << RHSC->getValue(); + if (Paren) + OS << ')'; return; } } @@ -150,14 +166,9 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI) const { case MCBinaryExpr::Xor: OS << '^'; break; } - // Only print parens around the LHS if it is non-trivial. - if (isa(BE.getRHS()) || isa(BE.getRHS())) { - BE.getRHS()->print(OS, MAI); - } else { - OS << '('; - BE.getRHS()->print(OS, MAI); + BE.getRHS()->print(OS, MAI, SubPrec + 1); + if (Paren) OS << ')'; - } return; } } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll index 0e16ea10c019a..c7a20055a70d4 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -155,19 +155,19 @@ declare void @undef_func() ; GCN-LABEL: {{^}}kernel_call_undef_func: ; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0) -; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4 +; GFX90A: .amdhsa_accum_offset (((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4)/4)-1)&~65536)&63)+1)*4 ; GCN: .set kernel_call_undef_func.num_vgpr, max(32, amdgpu.max_num_vgpr) ; GCN: .set kernel_call_undef_func.num_agpr, max(0, amdgpu.max_num_agpr) ; GCN: NumVgprs: kernel_call_undef_func.num_vgpr ; GCN: NumAgprs: kernel_call_undef_func.num_agpr ; GCN: TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr) -; GFX908: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4))/4)-1 -; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1 +; GFX908: VGPRBlocks: (alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4)/4)-1 +; GFX90A: VGPRBlocks: (alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8)/8)-1 ; GCN: NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0) -; GFX90A: AccumOffset: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4 -; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) -; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), 
max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) -; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63 +; GFX90A: AccumOffset: ((alignto(max(1, kernel_call_undef_func.num_vgpr), 4)/4)-1+1)*4 +; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.numbered_sgpr+extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) +; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.numbered_sgpr+extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)) +; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: (((alignto(max(1, kernel_call_undef_func.num_vgpr), 4)/4)-1)&~65536)&63 define amdgpu_kernel void @kernel_call_undef_func() #0 { bb: call void @undef_func() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll index 1d49e005234e3..9de6aea9385df 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -9,8 +9,8 @@ ; ALL-LABEL: {{^}}kernel: ; ALL: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0) -; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.numbered_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)) -; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4 +; ALL-NEXT: .amdhsa_next_free_sgpr max(kernel.numbered_sgpr+extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1), 1, 0)-extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1) +; GFX90A-NEXT: .amdhsa_accum_offset (((((alignto(max(1, kernel.num_vgpr), 4)/4)-1)&~65536)&63)+1)*4 ; ALL: .set kernel.num_vgpr, max(41, .Laliasee_default.num_vgpr) ; ALL-NEXT: .set kernel.num_agpr, max(0, .Laliasee_default.num_agpr) diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll index cbc8e7882c45e..fe27859eb0afd 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: {{^}}kernel1: ; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0) -; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.numbered_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)) +; CHECK-NEXT: .amdhsa_next_free_sgpr max(kernel1.numbered_sgpr+extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1), 1, 0)-extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1) ; CHECK: .set kernel1.num_vgpr, max(42, .Laliasee_vgpr32_sgpr76.num_vgpr) ; CHECK-NEXT: .set kernel1.num_agpr, max(0, .Laliasee_vgpr32_sgpr76.num_agpr) diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll index cdefbab93c62d..35b67351e85dd 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel2: ; CHECK: .amdhsa_next_free_vgpr 
max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0) -; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.numbered_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)) +; CHECK-NEXT: .amdhsa_next_free_sgpr max(kernel2.numbered_sgpr+extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1), 1, 0)-extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1) ; CHECK: .set kernel2.num_vgpr, max(41, .Laliasee_vgpr64_sgpr102.num_vgpr) ; CHECK-NEXT: .set kernel2.num_agpr, max(0, .Laliasee_vgpr64_sgpr102.num_agpr) diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll index 43dd0a7233604..3674d740b987b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel3: ; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0) -; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.numbered_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)) +; CHECK-NEXT: .amdhsa_next_free_sgpr max(kernel3.numbered_sgpr+extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1), 1, 0)-extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1) ; CHECK: .set kernel3.num_vgpr, max(41, .Laliasee_vgpr256_sgpr102.num_vgpr) ; CHECK-NEXT: .set kernel3.num_agpr, max(0, .Laliasee_vgpr256_sgpr102.num_agpr) diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll index 512d58d3f996d..e152f2ddd5253 100644 --- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll @@ -24,7 +24,7 @@ define void @use_vcc() #1 { ; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr) ; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr) ; GCN: .set indirect_use_vcc.numbered_sgpr, max(34, use_vcc.numbered_sgpr) -; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size)) +; GCN: .set indirect_use_vcc.private_seg_size, 16+max(use_vcc.private_seg_size) ; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc) ; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch) ; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack) @@ -42,7 +42,7 @@ define void @indirect_use_vcc() #1 { ; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr) ; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr) ; GCN: .set indirect_2level_use_vcc_kernel.numbered_sgpr, max(33, indirect_use_vcc.numbered_sgpr) -; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size)) +; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+max(indirect_use_vcc.private_seg_size) ; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc) ; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, indirect_use_vcc.uses_flat_scratch) ; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack) @@ -78,7 +78,7 @@ define void @use_flat_scratch() #1 { ; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr) ; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr) ; GCN: .set 
indirect_use_flat_scratch.numbered_sgpr, max(34, use_flat_scratch.numbered_sgpr) -; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size)) +; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+max(use_flat_scratch.private_seg_size) ; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc) ; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch) ; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack) @@ -96,7 +96,7 @@ define void @indirect_use_flat_scratch() #1 { ; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr) ; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr) ; GCN: .set indirect_2level_use_flat_scratch_kernel.numbered_sgpr, max(33, indirect_use_flat_scratch.numbered_sgpr) -; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size)) +; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+max(indirect_use_flat_scratch.private_seg_size) ; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc) ; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch) ; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack) @@ -133,7 +133,7 @@ define void @use_10_vgpr() #1 { ; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr) ; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr) ; GCN: .set indirect_use_10_vgpr.numbered_sgpr, max(34, use_10_vgpr.numbered_sgpr) -; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size)) +; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+max(use_10_vgpr.private_seg_size) ; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc) ; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch) ; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack) @@ -151,7 +151,7 @@ define void @indirect_use_10_vgpr() #0 { ; GCN: .set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr) ; GCN: .set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr) ; GCN: .set indirect_2_level_use_10_vgpr.numbered_sgpr, max(33, indirect_use_10_vgpr.numbered_sgpr) -; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size)) +; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+max(indirect_use_10_vgpr.private_seg_size) ; GCN: .set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc) ; GCN: .set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch) ; GCN: .set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack) @@ -187,7 +187,7 @@ define void @use_50_vgpr() #1 { ; GCN: .set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr) ; GCN: .set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr) ; GCN: .set indirect_use_50_vgpr.numbered_sgpr, max(34, use_50_vgpr.numbered_sgpr) -; GCN: .set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size)) +; GCN: .set indirect_use_50_vgpr.private_seg_size, 
16+max(use_50_vgpr.private_seg_size)
 ; GCN: .set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc)
 ; GCN: .set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch)
 ; GCN: .set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack)
@@ -223,7 +223,7 @@ define void @use_80_sgpr() #1 {
 ; GCN: .set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr)
 ; GCN: .set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr)
 ; GCN: .set indirect_use_80_sgpr.numbered_sgpr, max(34, use_80_sgpr.numbered_sgpr)
-; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size))
+; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+max(use_80_sgpr.private_seg_size)
 ; GCN: .set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc)
 ; GCN: .set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch)
 ; GCN: .set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack)
@@ -241,7 +241,7 @@ define void @indirect_use_80_sgpr() #1 {
 ; GCN: .set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr)
 ; GCN: .set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr)
 ; GCN: .set indirect_2_level_use_80_sgpr.numbered_sgpr, max(33, indirect_use_80_sgpr.numbered_sgpr)
-; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size))
+; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+max(indirect_use_80_sgpr.private_seg_size)
 ; GCN: .set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc)
 ; GCN: .set indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch)
 ; GCN: .set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack)
@@ -297,7 +297,7 @@ define void @use_stack1() #1 {
 ; GCN: .set indirect_use_stack.num_vgpr, max(41, use_stack0.num_vgpr)
 ; GCN: .set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr)
 ; GCN: .set indirect_use_stack.numbered_sgpr, max(34, use_stack0.numbered_sgpr)
-; GCN: .set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size))
+; GCN: .set indirect_use_stack.private_seg_size, 80+max(use_stack0.private_seg_size)
 ; GCN: .set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc)
 ; GCN: .set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch)
 ; GCN: .set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack)
@@ -317,7 +317,7 @@ define void @indirect_use_stack() #1 {
 ; GCN: .set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr)
 ; GCN: .set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr)
 ; GCN: .set indirect_2_level_use_stack.numbered_sgpr, max(33, indirect_use_stack.numbered_sgpr)
-; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size))
+; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+max(indirect_use_stack.private_seg_size)
 ; GCN: .set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc)
 ; GCN: .set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch)
 ; GCN: .set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack)
@@ -337,7 +337,7 @@ define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
 ; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr)
 ; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr)
 ; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(52, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr)
-; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
+; GCN: .set multi_call_use_use_stack.private_seg_size, 0+max(use_stack0.private_seg_size, use_stack1.private_seg_size)
 ; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc)
 ; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch)
 ; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack)
@@ -358,7 +358,7 @@ declare void @external() #0
 ; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; GCN: .set multi_call_with_external.num_agpr, max(0, amdgpu.max_num_agpr)
 ; GCN: .set multi_call_with_external.numbered_sgpr, max(52, amdgpu.max_num_sgpr)
-; GCN: .set multi_call_with_external.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
+; GCN: .set multi_call_with_external.private_seg_size, 0+max(use_stack0.private_seg_size, use_stack1.private_seg_size)
 ; GCN: .set multi_call_with_external.uses_vcc, 1
 ; GCN: .set multi_call_with_external.uses_flat_scratch, 1
 ; GCN: .set multi_call_with_external.has_dyn_sized_stack, 1
@@ -378,7 +378,7 @@ define amdgpu_kernel void @multi_call_with_external() #0 {
 ; GCN: .set multi_call_with_external_and_duplicates.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; GCN: .set multi_call_with_external_and_duplicates.num_agpr, max(0, amdgpu.max_num_agpr)
 ; GCN: .set multi_call_with_external_and_duplicates.numbered_sgpr, max(54, amdgpu.max_num_sgpr)
-; GCN: .set multi_call_with_external_and_duplicates.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
+; GCN: .set multi_call_with_external_and_duplicates.private_seg_size, 0+max(use_stack0.private_seg_size, use_stack1.private_seg_size)
 ; GCN: .set multi_call_with_external_and_duplicates.uses_vcc, 1
 ; GCN: .set multi_call_with_external_and_duplicates.uses_flat_scratch, 1
 ; GCN: .set multi_call_with_external_and_duplicates.has_dyn_sized_stack, 1
@@ -467,7 +467,7 @@ ret:
 ; GCN: .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr)
 ; GCN: .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr)
 ; GCN: .set usage_direct_recursion.numbered_sgpr, max(33, direct_recursion_use_stack.numbered_sgpr)
-; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
+; GCN: .set usage_direct_recursion.private_seg_size, 0+max(direct_recursion_use_stack.private_seg_size)
 ; GCN: .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc)
 ; GCN: .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch)
 ; GCN: .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack)
@@ -485,15 +485,15 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; GCN: .set multi_stage_recurse2.num_vgpr, max(43, multi_stage_recurse1.num_vgpr)
 ; GCN: .set multi_stage_recurse2.num_agpr, max(0, multi_stage_recurse1.num_agpr)
 ; GCN: .set multi_stage_recurse2.numbered_sgpr, max(34, multi_stage_recurse1.numbered_sgpr)
-; GCN: .set multi_stage_recurse2.private_seg_size, 16+(max(multi_stage_recurse1.private_seg_size))
+; GCN: .set multi_stage_recurse2.private_seg_size, 16+max(multi_stage_recurse1.private_seg_size)
 ; GCN: .set multi_stage_recurse2.uses_vcc, or(1, multi_stage_recurse1.uses_vcc)
 ; GCN: .set multi_stage_recurse2.uses_flat_scratch, or(0, multi_stage_recurse1.uses_flat_scratch)
 ; GCN: .set multi_stage_recurse2.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN: .set multi_stage_recurse2.has_recursion, or(1, multi_stage_recurse1.has_recursion)
 ; GCN: .set multi_stage_recurse2.has_indirect_call, or(0, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: multi_stage_recurse2.numbered_sgpr+(extrasgprs(multi_stage_recurse2.uses_vcc, multi_stage_recurse2.uses_flat_scratch, 1))
+; GCN: TotalNumSgprs: multi_stage_recurse2.numbered_sgpr+extrasgprs(multi_stage_recurse2.uses_vcc, multi_stage_recurse2.uses_flat_scratch, 1)
 ; GCN: NumVgprs: max(43, multi_stage_recurse1.num_vgpr)
-; GCN: ScratchSize: 16+(max(multi_stage_recurse1.private_seg_size))
+; GCN: ScratchSize: 16+max(multi_stage_recurse1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse1:
 ; GCN: .set multi_stage_recurse1.num_vgpr, max(48, amdgpu.max_num_vgpr)
 ; GCN: .set multi_stage_recurse1.num_agpr, max(0, amdgpu.max_num_agpr)
@@ -522,7 +522,7 @@ define void @multi_stage_recurse2(i32 %val) #2 {
 ; GCN: .set usage_multi_stage_recurse.num_vgpr, max(32, multi_stage_recurse1.num_vgpr)
 ; GCN: .set usage_multi_stage_recurse.num_agpr, max(0, multi_stage_recurse1.num_agpr)
 ; GCN: .set usage_multi_stage_recurse.numbered_sgpr, max(33, multi_stage_recurse1.numbered_sgpr)
-; GCN: .set usage_multi_stage_recurse.private_seg_size, 0+(max(multi_stage_recurse1.private_seg_size))
+; GCN: .set usage_multi_stage_recurse.private_seg_size, 0+max(multi_stage_recurse1.private_seg_size)
 ; GCN: .set usage_multi_stage_recurse.uses_vcc, or(1, multi_stage_recurse1.uses_vcc)
 ; GCN: .set usage_multi_stage_recurse.uses_flat_scratch, or(1, multi_stage_recurse1.uses_flat_scratch)
 ; GCN: .set usage_multi_stage_recurse.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
@@ -540,15 +540,15 @@ define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
 ; GCN: .set multi_stage_recurse_noattr2.num_vgpr, max(41, multi_stage_recurse_noattr1.num_vgpr)
 ; GCN: .set multi_stage_recurse_noattr2.num_agpr, max(0, multi_stage_recurse_noattr1.num_agpr)
 ; GCN: .set multi_stage_recurse_noattr2.numbered_sgpr, max(54, multi_stage_recurse_noattr1.numbered_sgpr)
-; GCN: .set multi_stage_recurse_noattr2.private_seg_size, 16+(max(multi_stage_recurse_noattr1.private_seg_size))
+; GCN: .set multi_stage_recurse_noattr2.private_seg_size, 16+max(multi_stage_recurse_noattr1.private_seg_size)
 ; GCN: .set multi_stage_recurse_noattr2.uses_vcc, or(1, multi_stage_recurse_noattr1.uses_vcc)
 ; GCN: .set multi_stage_recurse_noattr2.uses_flat_scratch, or(0, multi_stage_recurse_noattr1.uses_flat_scratch)
 ; GCN: .set multi_stage_recurse_noattr2.has_dyn_sized_stack, or(0, multi_stage_recurse_noattr1.has_dyn_sized_stack)
 ; GCN: .set multi_stage_recurse_noattr2.has_recursion, or(0, multi_stage_recurse_noattr1.has_recursion)
 ; GCN: .set multi_stage_recurse_noattr2.has_indirect_call, or(0, multi_stage_recurse_noattr1.has_indirect_call)
-; GCN: TotalNumSgprs: multi_stage_recurse_noattr2.numbered_sgpr+(extrasgprs(multi_stage_recurse_noattr2.uses_vcc, multi_stage_recurse_noattr2.uses_flat_scratch, 1))
+; GCN: TotalNumSgprs: multi_stage_recurse_noattr2.numbered_sgpr+extrasgprs(multi_stage_recurse_noattr2.uses_vcc, multi_stage_recurse_noattr2.uses_flat_scratch, 1)
 ; GCN: NumVgprs: max(41, multi_stage_recurse_noattr1.num_vgpr)
-; GCN: ScratchSize: 16+(max(multi_stage_recurse_noattr1.private_seg_size))
+; GCN: ScratchSize: 16+max(multi_stage_recurse_noattr1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse_noattr1:
 ; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, amdgpu.max_num_vgpr)
 ; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, amdgpu.max_num_agpr)
@@ -577,7 +577,7 @@ define void @multi_stage_recurse_noattr2(i32 %val) #0 {
 ; GCN: .set usage_multi_stage_recurse_noattrs.num_vgpr, max(32, multi_stage_recurse_noattr1.num_vgpr)
 ; GCN: .set usage_multi_stage_recurse_noattrs.num_agpr, max(0, multi_stage_recurse_noattr1.num_agpr)
 ; GCN: .set usage_multi_stage_recurse_noattrs.numbered_sgpr, max(33, multi_stage_recurse_noattr1.numbered_sgpr)
-; GCN: .set usage_multi_stage_recurse_noattrs.private_seg_size, 0+(max(multi_stage_recurse_noattr1.private_seg_size))
+; GCN: .set usage_multi_stage_recurse_noattrs.private_seg_size, 0+max(multi_stage_recurse_noattr1.private_seg_size)
 ; GCN: .set usage_multi_stage_recurse_noattrs.uses_vcc, or(1, multi_stage_recurse_noattr1.uses_vcc)
 ; GCN: .set usage_multi_stage_recurse_noattrs.uses_flat_scratch, or(1, multi_stage_recurse_noattr1.uses_flat_scratch)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_dyn_sized_stack, or(0, multi_stage_recurse_noattr1.has_dyn_sized_stack)
@@ -595,7 +595,7 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
 ; GCN: .set multi_call_with_multi_stage_recurse.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr, multi_stage_recurse1.num_vgpr)
 ; GCN: .set multi_call_with_multi_stage_recurse.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr, multi_stage_recurse1.num_agpr)
 ; GCN: .set multi_call_with_multi_stage_recurse.numbered_sgpr, max(53, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr, multi_stage_recurse1.numbered_sgpr)
-; GCN: .set multi_call_with_multi_stage_recurse.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size, multi_stage_recurse1.private_seg_size))
+; GCN: .set multi_call_with_multi_stage_recurse.private_seg_size, 0+max(use_stack0.private_seg_size, use_stack1.private_seg_size, multi_stage_recurse1.private_seg_size)
 ; GCN: .set multi_call_with_multi_stage_recurse.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc, multi_stage_recurse1.uses_vcc)
 ; GCN: .set multi_call_with_multi_stage_recurse.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch, multi_stage_recurse1.uses_flat_scratch)
 ; GCN: .set multi_call_with_multi_stage_recurse.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack, multi_stage_recurse1.has_dyn_sized_stack)
diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 52f380b7f80a3..60bbf4646ee03 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -111,7 +111,7 @@ define void @HU_Start() {
 ; CHECK: .set P_SpawnPlayer.num_vgpr, max(43, G_PlayerReborn.num_vgpr, P_SetThingPosition.num_vgpr, P_SetupPsprites.num_vgpr, HU_Start.num_vgpr)
 ; CHECK: .set P_SpawnPlayer.num_agpr, max(0, G_PlayerReborn.num_agpr, P_SetThingPosition.num_agpr, P_SetupPsprites.num_agpr, HU_Start.num_agpr)
 ; CHECK: .set P_SpawnPlayer.numbered_sgpr, max(84, G_PlayerReborn.numbered_sgpr, P_SetThingPosition.numbered_sgpr, P_SetupPsprites.numbered_sgpr, HU_Start.numbered_sgpr)
-; CHECK: .set P_SpawnPlayer.private_seg_size, 16+(max(G_PlayerReborn.private_seg_size, P_SetThingPosition.private_seg_size, P_SetupPsprites.private_seg_size, HU_Start.private_seg_size))
+; CHECK: .set P_SpawnPlayer.private_seg_size, 16+max(G_PlayerReborn.private_seg_size, P_SetThingPosition.private_seg_size, P_SetupPsprites.private_seg_size, HU_Start.private_seg_size)
 ; CHECK: .set P_SpawnPlayer.uses_vcc, or(1, G_PlayerReborn.uses_vcc, P_SetThingPosition.uses_vcc, P_SetupPsprites.uses_vcc, HU_Start.uses_vcc)
 ; CHECK: .set P_SpawnPlayer.uses_flat_scratch, or(0, G_PlayerReborn.uses_flat_scratch, P_SetThingPosition.uses_flat_scratch, P_SetupPsprites.uses_flat_scratch, HU_Start.uses_flat_scratch)
 ; CHECK: .set P_SpawnPlayer.has_dyn_sized_stack, or(0, G_PlayerReborn.has_dyn_sized_stack, P_SetThingPosition.has_dyn_sized_stack, P_SetupPsprites.has_dyn_sized_stack, HU_Start.has_dyn_sized_stack)
@@ -145,7 +145,7 @@ define void @I_Error(...) {
 ; CHECK: .set G_DoReborn.num_vgpr, max(44, P_RemoveMobj.num_vgpr, P_SpawnMobj.num_vgpr, P_SpawnPlayer.num_vgpr, I_Error.num_vgpr)
 ; CHECK: .set G_DoReborn.num_agpr, max(0, P_RemoveMobj.num_agpr, P_SpawnMobj.num_agpr, P_SpawnPlayer.num_agpr, I_Error.num_agpr)
 ; CHECK: .set G_DoReborn.numbered_sgpr, max(104, P_RemoveMobj.numbered_sgpr, P_SpawnMobj.numbered_sgpr, P_SpawnPlayer.numbered_sgpr, I_Error.numbered_sgpr)
-; CHECK: .set G_DoReborn.private_seg_size, 32+(max(P_RemoveMobj.private_seg_size, P_SpawnMobj.private_seg_size, P_SpawnPlayer.private_seg_size, I_Error.private_seg_size))
+; CHECK: .set G_DoReborn.private_seg_size, 32+max(P_RemoveMobj.private_seg_size, P_SpawnMobj.private_seg_size, P_SpawnPlayer.private_seg_size, I_Error.private_seg_size)
 ; CHECK: .set G_DoReborn.uses_vcc, or(1, P_RemoveMobj.uses_vcc, P_SpawnMobj.uses_vcc, P_SpawnPlayer.uses_vcc, I_Error.uses_vcc)
 ; CHECK: .set G_DoReborn.uses_flat_scratch, or(0, P_RemoveMobj.uses_flat_scratch, P_SpawnMobj.uses_flat_scratch, P_SpawnPlayer.uses_flat_scratch, I_Error.uses_flat_scratch)
 ; CHECK: .set G_DoReborn.has_dyn_sized_stack, or(0, P_RemoveMobj.has_dyn_sized_stack, P_SpawnMobj.has_dyn_sized_stack, P_SpawnPlayer.has_dyn_sized_stack, I_Error.has_dyn_sized_stack)
@@ -219,7 +219,7 @@ define void @F_Ticker() {
 ; CHECK: .set G_CheckDemoStatus.num_vgpr, max(43, I_Quit.num_vgpr, D_AdvanceDemo.num_vgpr, I_Error.num_vgpr)
 ; CHECK: .set G_CheckDemoStatus.num_agpr, max(0, I_Quit.num_agpr, D_AdvanceDemo.num_agpr, I_Error.num_agpr)
 ; CHECK: .set G_CheckDemoStatus.numbered_sgpr, max(84, I_Quit.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, I_Error.numbered_sgpr)
-; CHECK: .set G_CheckDemoStatus.private_seg_size, 32+(max(I_Quit.private_seg_size, D_AdvanceDemo.private_seg_size, I_Error.private_seg_size))
+; CHECK: .set G_CheckDemoStatus.private_seg_size, 32+max(I_Quit.private_seg_size, D_AdvanceDemo.private_seg_size, I_Error.private_seg_size)
 ; CHECK: .set G_CheckDemoStatus.uses_vcc, or(1, I_Quit.uses_vcc, D_AdvanceDemo.uses_vcc, I_Error.uses_vcc)
 ; CHECK: .set G_CheckDemoStatus.uses_flat_scratch, or(0, I_Quit.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, I_Error.uses_flat_scratch)
 ; CHECK: .set G_CheckDemoStatus.has_dyn_sized_stack, or(0, I_Quit.has_dyn_sized_stack, D_AdvanceDemo.has_dyn_sized_stack, I_Error.has_dyn_sized_stack)
@@ -265,7 +265,7 @@ define ptr @P_SaveGameFile() {
 ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
 ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
 ; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
-; CHECK: .set R_FlatNumForName.private_seg_size, 16+(max(I_Error.private_seg_size))
+; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
 ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
 ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
 ; CHECK: .set R_FlatNumForName.has_dyn_sized_stack, or(0, I_Error.has_dyn_sized_stack)
@@ -280,7 +280,7 @@ define i32 @R_FlatNumForName() {
 ; CHECK: .set R_TextureNumForName.num_vgpr, max(42, R_FlatNumForName.num_vgpr)
 ; CHECK: .set R_TextureNumForName.num_agpr, max(0, R_FlatNumForName.num_agpr)
 ; CHECK: .set R_TextureNumForName.numbered_sgpr, max(56, R_FlatNumForName.numbered_sgpr)
-; CHECK: .set R_TextureNumForName.private_seg_size, 16+(max(R_FlatNumForName.private_seg_size))
+; CHECK: .set R_TextureNumForName.private_seg_size, 16+max(R_FlatNumForName.private_seg_size)
 ; CHECK: .set R_TextureNumForName.uses_vcc, or(1, R_FlatNumForName.uses_vcc)
 ; CHECK: .set R_TextureNumForName.uses_flat_scratch, or(0, R_FlatNumForName.uses_flat_scratch)
 ; CHECK: .set R_TextureNumForName.has_dyn_sized_stack, or(0, R_FlatNumForName.has_dyn_sized_stack)
@@ -295,7 +295,7 @@ define i32 @R_TextureNumForName() {
 ; CHECK: .set G_Ticker.num_vgpr, max(47, G_DoReborn.num_vgpr, F_Ticker.num_vgpr, AM_Stop.num_vgpr, F_StartFinale.num_vgpr, D_AdvanceDemo.num_vgpr, R_FlatNumForName.num_vgpr, R_TextureNumForName.num_vgpr, P_TempSaveGameFile.num_vgpr, P_SaveGameFile.num_vgpr, I_Error.num_vgpr)
 ; CHECK: .set G_Ticker.num_agpr, max(0, G_DoReborn.num_agpr, F_Ticker.num_agpr, AM_Stop.num_agpr, F_StartFinale.num_agpr, D_AdvanceDemo.num_agpr, R_FlatNumForName.num_agpr, R_TextureNumForName.num_agpr, P_TempSaveGameFile.num_agpr, P_SaveGameFile.num_agpr, I_Error.num_agpr)
 ; CHECK: .set G_Ticker.numbered_sgpr, max(105, G_DoReborn.numbered_sgpr, F_Ticker.numbered_sgpr, AM_Stop.numbered_sgpr, F_StartFinale.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, R_FlatNumForName.numbered_sgpr, R_TextureNumForName.numbered_sgpr, P_TempSaveGameFile.numbered_sgpr, P_SaveGameFile.numbered_sgpr, I_Error.numbered_sgpr)
-; CHECK: .set G_Ticker.private_seg_size, 48+(max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size))
+; CHECK: .set G_Ticker.private_seg_size, 48+max(G_DoReborn.private_seg_size, F_Ticker.private_seg_size, AM_Stop.private_seg_size, F_StartFinale.private_seg_size, D_AdvanceDemo.private_seg_size, R_FlatNumForName.private_seg_size, R_TextureNumForName.private_seg_size, P_TempSaveGameFile.private_seg_size, P_SaveGameFile.private_seg_size, I_Error.private_seg_size)
 ; CHECK: .set G_Ticker.uses_vcc, or(1, G_DoReborn.uses_vcc, F_Ticker.uses_vcc, AM_Stop.uses_vcc, F_StartFinale.uses_vcc, D_AdvanceDemo.uses_vcc, R_FlatNumForName.uses_vcc, R_TextureNumForName.uses_vcc, P_TempSaveGameFile.uses_vcc, P_SaveGameFile.uses_vcc, I_Error.uses_vcc)
 ; CHECK: .set G_Ticker.uses_flat_scratch, or(0, G_DoReborn.uses_flat_scratch, F_Ticker.uses_flat_scratch, AM_Stop.uses_flat_scratch, F_StartFinale.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, R_FlatNumForName.uses_flat_scratch, R_TextureNumForName.uses_flat_scratch, P_TempSaveGameFile.uses_flat_scratch, P_SaveGameFile.uses_flat_scratch, I_Error.uses_flat_scratch)
 ; CHECK: .set G_Ticker.has_dyn_sized_stack, or(0, G_DoReborn.has_dyn_sized_stack, F_Ticker.has_dyn_sized_stack, AM_Stop.has_dyn_sized_stack, F_StartFinale.has_dyn_sized_stack, D_AdvanceDemo.has_dyn_sized_stack, R_FlatNumForName.has_dyn_sized_stack, R_TextureNumForName.has_dyn_sized_stack, P_TempSaveGameFile.has_dyn_sized_stack, P_SaveGameFile.has_dyn_sized_stack, I_Error.has_dyn_sized_stack)
@@ -319,7 +319,7 @@ define void @G_Ticker() {
 ; CHECK: .set RunTic.num_vgpr, max(47, G_CheckDemoStatus.num_vgpr, D_AdvanceDemo.num_vgpr, G_Ticker.num_vgpr)
 ; CHECK: .set RunTic.num_agpr, max(0, G_CheckDemoStatus.num_agpr, D_AdvanceDemo.num_agpr, G_Ticker.num_agpr)
 ; CHECK: .set RunTic.numbered_sgpr, max(105, G_CheckDemoStatus.numbered_sgpr, D_AdvanceDemo.numbered_sgpr, G_Ticker.numbered_sgpr)
-; CHECK: .set RunTic.private_seg_size, 32+(max(G_CheckDemoStatus.private_seg_size, D_AdvanceDemo.private_seg_size, G_Ticker.private_seg_size))
+; CHECK: .set RunTic.private_seg_size, 32+max(G_CheckDemoStatus.private_seg_size, D_AdvanceDemo.private_seg_size, G_Ticker.private_seg_size)
 ; CHECK: .set RunTic.uses_vcc, or(1, G_CheckDemoStatus.uses_vcc, D_AdvanceDemo.uses_vcc, G_Ticker.uses_vcc)
 ; CHECK: .set RunTic.uses_flat_scratch, or(0, G_CheckDemoStatus.uses_flat_scratch, D_AdvanceDemo.uses_flat_scratch, G_Ticker.uses_flat_scratch)
 ; CHECK: .set RunTic.has_dyn_sized_stack, or(0, G_CheckDemoStatus.has_dyn_sized_stack, D_AdvanceDemo.has_dyn_sized_stack, G_Ticker.has_dyn_sized_stack)
diff --git a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
index e150231e3d9e1..7a810d0067c17 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
@@ -20,7 +20,7 @@ entry:
 ; CHECK: .set baz.num_vgpr, max(49, qux.num_vgpr)
 ; CHECK: .set baz.num_agpr, max(0, qux.num_agpr)
 ; CHECK: .set baz.numbered_sgpr, max(34, qux.numbered_sgpr)
-; CHECK: .set baz.private_seg_size, 16+(max(qux.private_seg_size))
+; CHECK: .set baz.private_seg_size, 16+max(qux.private_seg_size)
 ; CHECK: .set baz.uses_vcc, or(0, qux.uses_vcc)
 ; CHECK: .set baz.uses_flat_scratch, or(0, qux.uses_flat_scratch)
 ; CHECK: .set baz.has_dyn_sized_stack, or(0, qux.has_dyn_sized_stack)
@@ -37,7 +37,7 @@ entry:
 ; CHECK: .set bar.num_vgpr, max(65, baz.num_vgpr, qux.num_vgpr)
 ; CHECK: .set bar.num_agpr, max(0, baz.num_agpr, qux.num_agpr)
 ; CHECK: .set bar.numbered_sgpr, max(34, baz.numbered_sgpr, qux.numbered_sgpr)
-; CHECK: .set bar.private_seg_size, 16+(max(baz.private_seg_size, qux.private_seg_size))
+; CHECK: .set bar.private_seg_size, 16+max(baz.private_seg_size, qux.private_seg_size)
 ; CHECK: .set bar.uses_vcc, or(0, baz.uses_vcc, qux.uses_vcc)
 ; CHECK: .set bar.uses_flat_scratch, or(0, baz.uses_flat_scratch, qux.uses_flat_scratch)
 ; CHECK: .set bar.has_dyn_sized_stack, or(0, baz.has_dyn_sized_stack, qux.has_dyn_sized_stack)
@@ -56,7 +56,7 @@ entry:
 ; CHECK: .set foo.num_vgpr, max(38, bar.num_vgpr)
 ; CHECK: .set foo.num_agpr, max(0, bar.num_agpr)
 ; CHECK: .set foo.numbered_sgpr, max(34, bar.numbered_sgpr)
-; CHECK: .set foo.private_seg_size, 16+(max(bar.private_seg_size))
+; CHECK: .set foo.private_seg_size, 16+max(bar.private_seg_size)
 ; CHECK: .set foo.uses_vcc, or(0, bar.uses_vcc)
 ; CHECK: .set foo.uses_flat_scratch, or(0, bar.uses_flat_scratch)
 ; CHECK: .set foo.has_dyn_sized_stack, or(0, bar.has_dyn_sized_stack)
@@ -73,7 +73,7 @@ entry:
 ; CHECK: .set usefoo.num_vgpr, max(32, foo.num_vgpr)
 ; CHECK: .set usefoo.num_agpr, max(0, foo.num_agpr)
 ; CHECK: .set usefoo.numbered_sgpr, max(33, foo.numbered_sgpr)
-; CHECK: .set usefoo.private_seg_size, 0+(max(foo.private_seg_size))
+; CHECK: .set usefoo.private_seg_size, 0+max(foo.private_seg_size)
 ; CHECK: .set usefoo.uses_vcc, or(0, foo.uses_vcc)
 ; CHECK: .set usefoo.uses_flat_scratch, or(1, foo.uses_flat_scratch)
 ; CHECK: .set usefoo.has_dyn_sized_stack, or(0, foo.has_dyn_sized_stack)
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index 0ff0ca1e54f6f..ff92db746b062 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -3,7 +3,7 @@
 ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
 ; CHECK-LABEL: {{^}}recursive:
-; CHECK: .set recursive.private_seg_size, 16+(max(16384))
+; CHECK: .set recursive.private_seg_size, 16+max(16384)
 ; CHECK: ScratchSize: 16
 ; V5-LABEL: {{^}}recursive:
@@ -22,7 +22,7 @@ define void @tail_recursive() {
 ret void
 }
-; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
+; CHECK: .set calls_tail_recursive.private_seg_size, 0+max(tail_recursive.private_seg_size)
 define void @calls_tail_recursive() norecurse {
 tail call void @tail_recursive()
 ret void
@@ -41,10 +41,10 @@ define void @tail_recursive_with_stack() {
 ; For an arbitrary recursive call, report a large number for unknown stack
 ; usage for code object v4 and older
 ; CHECK-LABEL: {{^}}calls_recursive:
-; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
+; CHECK: .set calls_recursive.private_seg_size, 0+max(16384, recursive.private_seg_size)
 ;
 ; V5-LABEL: {{^}}calls_recursive:
-; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size))
+; V5: .set calls_recursive.private_seg_size, 0+max(recursive.private_seg_size)
 ; V5: .set calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack)
 define amdgpu_kernel void @calls_recursive() {
 call void @recursive()
@@ -54,7 +54,7 @@ define amdgpu_kernel void @calls_recursive() {
 ; Make sure we do not report a huge stack size for tail recursive
 ; functions
 ; CHECK-LABEL: {{^}}kernel_indirectly_calls_tail_recursive:
-; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+(max(calls_tail_recursive.private_seg_size))
+; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+max(calls_tail_recursive.private_seg_size)
 define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
 call void @calls_tail_recursive()
 ret void
@@ -65,10 +65,10 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
 ; in the kernel.
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
-; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size))
+; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+max(16384, tail_recursive.private_seg_size)
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive:
-; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
+; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+max(tail_recursive.private_seg_size)
 ; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion)
 define amdgpu_kernel void @kernel_calls_tail_recursive() {
 call void @tail_recursive()
@@ -76,10 +76,10 @@ define amdgpu_kernel void @kernel_calls_tail_recursive() {
 }
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size))
+; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+max(16384, tail_recursive_with_stack.private_seg_size)
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size))
+; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+max(tail_recursive_with_stack.private_seg_size)
 ; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, tail_recursive_with_stack.has_dyn_sized_stack)
 define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
 call void @tail_recursive_with_stack()
diff --git a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
index ac6bd9a4ae8a6..3093349bff37c 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
@@ -15,7 +15,7 @@
 ; CHECK: .set baz.num_vgpr, max(61, qux.num_vgpr)
 ; CHECK: .set baz.num_agpr, max(0, qux.num_agpr)
 ; CHECK: .set baz.numbered_sgpr, max(51, qux.numbered_sgpr)
-; CHECK: .set baz.private_seg_size, 16+(max(qux.private_seg_size))
+; CHECK: .set baz.private_seg_size, 16+max(qux.private_seg_size)
 ; CHECK: .set baz.uses_vcc, or(1, qux.uses_vcc)
 ; CHECK: .set baz.uses_flat_scratch, or(0, qux.uses_flat_scratch)
 ; CHECK: .set baz.has_dyn_sized_stack, or(0, qux.has_dyn_sized_stack)
@@ -26,7 +26,7 @@
 ; CHECK: .set bar.num_vgpr, max(51, baz.num_vgpr)
 ; CHECK: .set bar.num_agpr, max(0, baz.num_agpr)
 ; CHECK: .set bar.numbered_sgpr, max(61, baz.numbered_sgpr)
-; CHECK: .set bar.private_seg_size, 16+(max(baz.private_seg_size))
+; CHECK: .set bar.private_seg_size, 16+max(baz.private_seg_size)
 ; CHECK: .set bar.uses_vcc, or(1, baz.uses_vcc)
 ; CHECK: .set bar.uses_flat_scratch, or(0, baz.uses_flat_scratch)
 ; CHECK: .set bar.has_dyn_sized_stack, or(0, baz.has_dyn_sized_stack)
@@ -80,7 +80,7 @@ entry:
 ; CHECK: .set usefoo.num_vgpr, max(32, foo.num_vgpr)
 ; CHECK: .set usefoo.num_agpr, max(0, foo.num_agpr)
 ; CHECK: .set usefoo.numbered_sgpr, max(33, foo.numbered_sgpr)
-; CHECK: .set usefoo.private_seg_size, 0+(max(foo.private_seg_size))
+; CHECK: .set usefoo.private_seg_size, 0+max(foo.private_seg_size)
 ; CHECK: .set usefoo.uses_vcc, or(1, foo.uses_vcc)
 ; CHECK: .set usefoo.uses_flat_scratch, or(1, foo.uses_flat_scratch)
 ; CHECK: .set usefoo.has_dyn_sized_stack, or(0, foo.has_dyn_sized_stack)
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index ef91be9366b02..afb77ed190896 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -146,7 +146,7 @@ define void @empty_func() !dbg !8 {
 ; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
 ; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
 ; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_call.numbered_sgpr+extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0
@@ -164,7 +164,7 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
 ; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
 ; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
 ; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(test_indirect_w_static_stack.numbered_sgpr+extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index c9fbd369e062d..cf5b95a729974 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -19,7 +19,7 @@ entry:
 ; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
 ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
 ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
-; CHECK: .set __unnamed_2.private_seg_size, 16+(max(__unnamed_1.private_seg_size))
+; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
 ; CHECK: .set __unnamed_2.uses_vcc, or(0, __unnamed_1.uses_vcc)
 ; CHECK: .set __unnamed_2.uses_flat_scratch, or(0, __unnamed_1.uses_flat_scratch)
 ; CHECK: .set __unnamed_2.has_dyn_sized_stack, or(0, __unnamed_1.has_dyn_sized_stack)
@@ -35,7 +35,7 @@ entry:
 ; CHECK: .set use.num_vgpr, max(32, __unnamed_1.num_vgpr, __unnamed_2.num_vgpr)
 ; CHECK: .set use.num_agpr, max(0, __unnamed_1.num_agpr, __unnamed_2.num_agpr)
 ; CHECK: .set use.numbered_sgpr, max(33, __unnamed_1.numbered_sgpr, __unnamed_2.numbered_sgpr)
-; CHECK: .set use.private_seg_size, 0+(max(__unnamed_1.private_seg_size, __unnamed_2.private_seg_size))
+; CHECK: .set use.private_seg_size, 0+max(__unnamed_1.private_seg_size, __unnamed_2.private_seg_size)
 ; CHECK: .set use.uses_vcc, or(0, __unnamed_1.uses_vcc, __unnamed_2.uses_vcc)
 ; CHECK: .set use.uses_flat_scratch, or(1, __unnamed_1.uses_flat_scratch, __unnamed_2.uses_flat_scratch)
 ; CHECK: .set use.has_dyn_sized_stack, or(0, __unnamed_1.has_dyn_sized_stack, __unnamed_2.has_dyn_sized_stack)
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-isel-globals-pic.ll b/llvm/test/CodeGen/ARM/GlobalISel/arm-isel-globals-pic.ll
index 80d687ba0f53d..c8adcaac19b7f 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-isel-globals-pic.ll
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-isel-globals-pic.ll
@@ -41,7 +41,7 @@ define i32 @test_external_global() {
 ; CHECK-NEXT: bx lr
 ; ELF: [[LABEL]]:
 ; ELF: [[TMPLABEL:.L[[:alnum:]_]+]]:
-; ELF: .long external_global(GOT_PREL)-(([[ANCHOR]]+8)-[[TMPLABEL]])
+; ELF: .long external_global(GOT_PREL)-([[ANCHOR]]+8-[[TMPLABEL]])
 ; DARWIN-NOMOVT: [[LABEL]]:
 ; DARWIN-NOMOVT: .long L_external_global$non_lazy_ptr-([[ANCHOR]]+8)
 ; DARWIN-NOMOVT-NOT: .long L_external_global
@@ -88,7 +88,7 @@ define i32 @test_external_constant() {
 ; CHECK-NEXT: bx lr
 ; ELF: [[LABEL]]:
 ; ELF: [[TMPLABEL:.L[[:alnum:]_]+]]:
-; ELF: .long external_constant(GOT_PREL)-(([[ANCHOR]]+8)-[[TMPLABEL]])
+; ELF: .long external_constant(GOT_PREL)-([[ANCHOR]]+8-[[TMPLABEL]])
 ; DARWIN-NOMOVT: [[LABEL]]:
 ; DARWIN-NOMOVT: .long L_external_constant$non_lazy_ptr-([[ANCHOR]]+8)
 ; DARWIN-NOMOVT-NOT: .long L_external_constant
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/thumb-isel-globals-pic.ll b/llvm/test/CodeGen/ARM/GlobalISel/thumb-isel-globals-pic.ll
index e6828a52f2941..a4cc43f4dd814 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/thumb-isel-globals-pic.ll
+++ b/llvm/test/CodeGen/ARM/GlobalISel/thumb-isel-globals-pic.ll
@@ -41,7 +41,7 @@ define i32 @test_external_global() {
 ; CHECK-NEXT: bx lr
 ; ELF: [[LABEL]]:
 ; ELF: [[TMPLABEL:.L[[:alnum:]_]+]]:
-; ELF: .long external_global(GOT_PREL)-(([[ANCHOR]]+4)-[[TMPLABEL]])
+; ELF: .long external_global(GOT_PREL)-([[ANCHOR]]+4-[[TMPLABEL]])
 ; DARWIN-NOMOVT: [[LABEL]]:
 ; DARWIN-NOMOVT: .long L_external_global$non_lazy_ptr-([[ANCHOR]]+4)
 ; DARWIN-NOMOVT-NOT: .long L_external_global
@@ -88,7 +88,7 @@ define i32 @test_external_constant() {
 ; CHECK-NEXT: bx lr
 ; ELF: [[LABEL]]:
 ; ELF: [[TMPLABEL:.L[[:alnum:]_]+]]:
-; ELF: .long external_constant(GOT_PREL)-(([[ANCHOR]]+4)-[[TMPLABEL]])
+; ELF: .long external_constant(GOT_PREL)-([[ANCHOR]]+4-[[TMPLABEL]])
 ; DARWIN-NOMOVT: [[LABEL]]:
 ; DARWIN-NOMOVT: .long L_external_constant$non_lazy_ptr-([[ANCHOR]]+4)
 ; DARWIN-NOMOVT-NOT: .long L_external_constant
diff --git a/llvm/test/CodeGen/ARM/elf-preemption.ll b/llvm/test/CodeGen/ARM/elf-preemption.ll
index efb1c6aa26817..154c29c1c029c 100644
--- a/llvm/test/CodeGen/ARM/elf-preemption.ll
+++ b/llvm/test/CodeGen/ARM/elf-preemption.ll
@@ -22,7 +22,7 @@ define ptr @get_preemptable_var() nounwind {
 ; PIC-NEXT: @ %bb.1:
 ; PIC-NEXT: .LCPI0_0:
 ; PIC-NEXT: .Ltmp0:
-; PIC-NEXT: .long preemptable_var(GOT_PREL)-((.LPC0_0+8)-.Ltmp0)
+; PIC-NEXT: .long preemptable_var(GOT_PREL)-(.LPC0_0+8-.Ltmp0)
 ret ptr @preemptable_var
 }
@@ -127,7 +127,7 @@ define dso_preemptable ptr @preemptable_func() nounwind {
 ; PIC-NEXT: @ %bb.1:
 ; PIC-NEXT: .LCPI5_0:
 ; PIC-NEXT: .Ltmp1:
-; PIC-NEXT: .long preemptable_func(GOT_PREL)-((.LPC5_0+8)-.Ltmp1)
+; PIC-NEXT: .long preemptable_func(GOT_PREL)-(.LPC5_0+8-.Ltmp1)
 ret ptr @preemptable_func
 }
diff --git a/llvm/test/CodeGen/ARM/globals.ll b/llvm/test/CodeGen/ARM/globals.ll
index 3a36d16d53501..acd4655720b00 100644
--- a/llvm/test/CodeGen/ARM/globals.ll
+++ b/llvm/test/CodeGen/ARM/globals.ll
@@ -69,4 +69,4 @@ define i32 @test1() {
 ; LinuxPIC: .p2align 2
 ; LinuxPIC: .LCPI0_0:
 ; LinuxPIC: .Ltmp0:
-; LinuxPIC: .long G(GOT_PREL)-((.LPC0_0+8)-.Ltmp0)
+; LinuxPIC: .long G(GOT_PREL)-(.LPC0_0+8-.Ltmp0)
diff --git a/llvm/test/CodeGen/ARM/litpool-licm.ll b/llvm/test/CodeGen/ARM/litpool-licm.ll
index f1a029b83f831..bcc15e93f4947 100644
--- a/llvm/test/CodeGen/ARM/litpool-licm.ll
+++ b/llvm/test/CodeGen/ARM/litpool-licm.ll
@@ -15,11 +15,11 @@ define void @func(i32 %n) {
 ; CHECK: [[CP1]]:
 ; CHECK-NEXT: [[CP1_TMP:.Ltmp[0-9]+]]:
-; CHECK-NEXT: .long var(TLSGD)-(([[PCPOS1]]+4)-[[CP1_TMP]])
+; CHECK-NEXT: .long var(TLSGD)-([[PCPOS1]]+4-[[CP1_TMP]])
 ; CHECK: [[CP2]]:
 ; CHECK-NEXT: [[CP2_TMP:.Ltmp[0-9]+]]:
-; CHECK-NEXT: .long var(TLSGD)-(([[PCPOS2]]+4)-[[CP2_TMP]])
+; CHECK-NEXT: .long var(TLSGD)-([[PCPOS2]]+4-[[CP2_TMP]])
 entry:
 br label %loop
diff --git a/llvm/test/CodeGen/ARM/load-global.ll b/llvm/test/CodeGen/ARM/load-global.ll
index 0d370a495d2f5..01f5b5793949e 100644
--- a/llvm/test/CodeGen/ARM/load-global.ll
+++ b/llvm/test/CodeGen/ARM/load-global.ll
@@ -43,7 +43,7 @@ define i32 @test1() {
 ; LINUX: ldr r0, .LCPI0_0
 ; LINUX: ldr r0, [pc, r0]
 ; LINUX: ldr r0, [r0]
-; LINUX: .long G(GOT_PREL)-((.LPC0_0+8)-.Ltmp0)
+; LINUX: .long G(GOT_PREL)-(.LPC0_0+8-.Ltmp0)
 ; LINUX_T: ldr r0, .LCPI0_0
 ; LINUX_T: add r0, pc
diff --git a/llvm/test/CodeGen/ARM/load-global2.ll b/llvm/test/CodeGen/ARM/load-global2.ll
index 08a8f4280d3b8..0ea2476388c42 100644
--- a/llvm/test/CodeGen/ARM/load-global2.ll
+++ b/llvm/test/CodeGen/ARM/load-global2.ll
@@ -28,7 +28,7 @@ define signext i8 @foo() {
 ; LINUX-PIC-NEXT: @ %bb.3:
 ; LINUX-PIC-NEXT: .LCPI0_0:
 ; LINUX-PIC-NEXT: .Ltmp0:
-; LINUX-PIC-NEXT: .long x(GOT_PREL)-((.LPC0_0+8)-.Ltmp0)
+; LINUX-PIC-NEXT: .long x(GOT_PREL)-(.LPC0_0+8-.Ltmp0)
 entry:
 %0 = load i8, ptr @x
 %tobool = icmp eq i8 %0, 0
diff --git a/llvm/test/CodeGen/ARM/plt-relative-reloc.ll b/llvm/test/CodeGen/ARM/plt-relative-reloc.ll
index 414a48e5aaaed..ede891900e6d0 100644
--- a/llvm/test/CodeGen/ARM/plt-relative-reloc.ll
+++ b/llvm/test/CodeGen/ARM/plt-relative-reloc.ll
@@ -11,6 +11,6 @@ declare void @fn2() unnamed_addr
 declare void @fn3()
 ; CHECK: .long 0
-; CHECK-NEXT: .long (fn1(prel31)-vtable)-4
-; CHECK-NEXT: .long (fn2(prel31)-vtable)-4
-; CHECK-NEXT: .long (fn3-vtable)-4
+; CHECK-NEXT: .long fn1(prel31)-vtable-4
+; CHECK-NEXT: .long fn2(prel31)-vtable-4
+; CHECK-NEXT: .long fn3-vtable-4
diff --git a/llvm/test/CodeGen/ARM/stack-guard-elf.ll b/llvm/test/CodeGen/ARM/stack-guard-elf.ll
index d0e5db7e5711b..eb40b33a5eeb2 100644
--- a/llvm/test/CodeGen/ARM/stack-guard-elf.ll
+++ b/llvm/test/CodeGen/ARM/stack-guard-elf.ll
@@ -43,10 +43,10 @@ define i32 @test1() #0 {
 ; CHECK-NEXT: @ %bb.2:
 ; CHECK-NEXT: .LCPI0_0:
 ; CHECK-NEXT: .Ltmp0:
-; CHECK-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+8)-.Ltmp0)
+; CHECK-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_0+8-.Ltmp0)
 ; CHECK-NEXT: .LCPI0_1:
 ; CHECK-NEXT: .Ltmp1:
-; CHECK-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+8)-.Ltmp1)
+; CHECK-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_1+8-.Ltmp1)
 ;
 ; THUMB1-LABEL: test1:
 ; THUMB1: @ %bb.0:
@@ -88,10 +88,10 @@ define i32 @test1() #0 {
 ; THUMB1-NEXT: @ %bb.3:
 ; THUMB1-NEXT: .LCPI0_0:
 ; THUMB1-NEXT: .Ltmp0:
-; THUMB1-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0)
+; THUMB1-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_0+4-.Ltmp0)
 ; THUMB1-NEXT: .LCPI0_1:
 ; THUMB1-NEXT: .Ltmp1:
-; THUMB1-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1)
+; THUMB1-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_1+4-.Ltmp1)
 ;
 ; THUMB1-PIC-LABEL: test1:
 ; THUMB1-PIC: @ %bb.0:
@@ -133,10 +133,10 @@ define i32 @test1() #0 {
 ; THUMB1-PIC-NEXT: @ %bb.3:
 ; THUMB1-PIC-NEXT: .LCPI0_0:
 ; THUMB1-PIC-NEXT: .Ltmp0:
-; THUMB1-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0)
+; THUMB1-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_0+4-.Ltmp0)
 ; THUMB1-PIC-NEXT: .LCPI0_1:
 ; THUMB1-PIC-NEXT: .Ltmp1:
-; THUMB1-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1)
+; THUMB1-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_1+4-.Ltmp1)
 ;
 ; THUMB2-LABEL: test1:
 ; THUMB2: @ %bb.0:
@@ -169,10 +169,10 @@ define i32 @test1() #0 {
 ; THUMB2-NEXT: @ %bb.2:
 ; THUMB2-NEXT: .LCPI0_0:
 ; THUMB2-NEXT: .Ltmp0:
-; THUMB2-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0)
+; THUMB2-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_0+4-.Ltmp0)
 ; THUMB2-NEXT: .LCPI0_1:
 ; THUMB2-NEXT: .Ltmp1:
-; THUMB2-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1)
+; THUMB2-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_1+4-.Ltmp1)
 ;
 ; THUMB2-PIC-LABEL: test1:
 ; THUMB2-PIC: @ %bb.0:
@@ -205,10 +205,10 @@ define i32 @test1() #0 {
 ; THUMB2-PIC-NEXT: @ %bb.2:
 ; THUMB2-PIC-NEXT: .LCPI0_0:
 ; THUMB2-PIC-NEXT: .Ltmp0:
-; THUMB2-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0)
+; THUMB2-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_0+4-.Ltmp0)
 ; THUMB2-PIC-NEXT: .LCPI0_1:
 ; THUMB2-PIC-NEXT: .Ltmp1:
-; THUMB2-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1)
+; THUMB2-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_1+4-.Ltmp1)
 %a1 = alloca [256 x i32], align 4
 call void @foo(ptr %a1) #3
 ret i32 0
diff --git a/llvm/test/CodeGen/ARM/stack-guard-rwpi.ll b/llvm/test/CodeGen/ARM/stack-guard-rwpi.ll
index f4ae9ca0b1d97..14a0c244497d4 100644
--- a/llvm/test/CodeGen/ARM/stack-guard-rwpi.ll
+++ b/llvm/test/CodeGen/ARM/stack-guard-rwpi.ll
@@ -16,7 +16,7 @@
 ; PIC: ldr {{r[0-9]+}}, .LCPI0_0
 ; PIC: .LCPI0_0:
 ; PIC-NEXT: .Ltmp0:
-; PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+8)-.Ltmp0)
+; PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-(.LPC0_0+8-.Ltmp0)
 define dso_local i32 @foo(i32 %t) nounwind sspstrong {
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll
index 44d62124ac58d..742d50a0f4912 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-dynamic-tls-largeaccess.ll
@@ -45,12 +45,12 @@ define signext i32 @test1() {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, ElementIntTLSv1[TL]@ld(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 2
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, ElementIntTLSv1[TL]@ld+24(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, ElementIntTLS4[TL]@ld+328-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, ElementIntTLS2[TL]@ld+320-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 3
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, (ElementIntTLS3[TL]@ld+324)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, ElementIntTLS3[TL]@ld+324-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 88
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, (ElementIntTLS5[TL]@ld+332)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r4, ElementIntTLS5[TL]@ld+332-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 102
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r0, 16(r1)
@@ -70,12 +70,12 @@ define signext i32 @test1() {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, ElementIntTLSv1[TL]@ld(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r4, 2
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, ElementIntTLSv1[TL]@ld+24(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, ElementIntTLS4[TL]@ld+328-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, ElementIntTLS2[TL]@ld+320-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r4, 3
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, (ElementIntTLS3[TL]@ld+324)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, ElementIntTLS3[TL]@ld+324-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r4, 88
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, (ElementIntTLS5[TL]@ld+332)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r4, ElementIntTLS5[TL]@ld+332-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 102
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: ld r0, 16(r1)
@@ -124,7 +124,7 @@ define i64 @test2() {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: mr r6, r3
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 212
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 203
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r4, (ElementLongTLS2[TL]@ld+1200)-131072(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r4, ElementLongTLS2[TL]@ld+1200-131072(r6)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r4, L..C1(r2) # target-flags(ppc-tlsgd) @MyTLSGDVar
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r3, ElementLongTLS6[UL]@ld+424(r6)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r3, L..C2(r2) # target-flags(ppc-tlsgdm) @MyTLSGDVar
@@ -133,10 +133,10 @@ define i64 @test2() {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r4, 440(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 6
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 100
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r3, (ElementLongTLS3[TL]@ld+2000)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r3, ElementLongTLS3[TL]@ld+2000-196608(r6)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 882
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r4, (ElementLongTLS4[TL]@ld+6800)-196608(r6)
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r3, (ElementLongTLS5[TL]@ld+8400)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r4, ElementLongTLS4[TL]@ld+6800-196608(r6)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: std r3, ElementLongTLS5[TL]@ld+8400-196608(r6)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 1191
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r0, 16(r1)
@@ -157,7 +157,7 @@ define i64 @test2() {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: ld r4, L..C1@l(r4)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r3, ElementLongTLS6[UL]@ld+424(r6)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 203
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r3, (ElementLongTLS2[TL]@ld+1200)-131072(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r3, ElementLongTLS2[TL]@ld+1200-131072(r6)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: addis r3, L..C2@u(r2)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: ld r3, L..C2@l(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: bla .__tls_get_addr[PR]
@@ -165,10 +165,10 @@ define i64 @test2() {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r4, 440(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 6
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r4, 100
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r3, (ElementLongTLS3[TL]@ld+2000)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r3, ElementLongTLS3[TL]@ld+2000-196608(r6)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 882
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r4, (ElementLongTLS4[TL]@ld+6800)-196608(r6)
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r3, (ElementLongTLS5[TL]@ld+8400)-196608(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r4, ElementLongTLS4[TL]@ld+6800-196608(r6)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: std r3, ElementLongTLS5[TL]@ld+8400-196608(r6)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 1191
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: addi r1, r1, 48
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: ld r0, 16(r1)
@@ -214,13 +214,13 @@ define signext i32 @test3() {
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: bla .__tls_get_mod[PR]
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r5, 2
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r4, 1
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r6, (ElementIntTLS3[TL]@ld+324)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r6, ElementIntTLS3[TL]@ld+324-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: ld r6, L..C3(r2) # target-flags(ppc-tlsld) @ElementIntTLSv2
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, ElementIntTLS2[TL]@ld+320-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r5, 88
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, (ElementIntTLS5[TL]@ld+332)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, ElementIntTLS5[TL]@ld+332-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r5, 4
-; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, ElementIntTLS4[TL]@ld+328-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stwux r4, r3, r6
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: stw r5, 24(r3)
 ; SMALL-LOCAL-DYNAMIC-SMALLCM64-NEXT: li r3, 102
@@ -241,13 +241,13 @@ define signext i32 @test3() {
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: bla .__tls_get_mod[PR]
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r5, 2
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r4, 1
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, (ElementIntTLS2[TL]@ld+320)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, ElementIntTLS2[TL]@ld+320-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r5, 3
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, (ElementIntTLS3[TL]@ld+324)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, ElementIntTLS3[TL]@ld+324-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r5, 88
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, (ElementIntTLS5[TL]@ld+332)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, ElementIntTLS5[TL]@ld+332-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r5, 4
-; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, (ElementIntTLS4[TL]@ld+328)-65536(r3)
+; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, ElementIntTLS4[TL]@ld+328-65536(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stwux r4, r3, r6
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: stw r5, 24(r3)
 ; SMALL-LOCAL-DYNAMIC-LARGECM64-NEXT: li r3, 102
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
index 91013af7a3188..a6d1fa1328290 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess.ll
@@ -30,12 +30,12 @@ define signext i32 @StoreArrays1() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLSv1[TL]@le(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 2
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le+328-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le+320-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 3
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le+324-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 88
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le+332-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr
 ;
@@ -46,12 +46,12 @@ define signext i32 @StoreArrays1() {
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLSv1[TL]@le(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLSv1[TL]@le+24(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le+328-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le+320-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 3
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS3[TL]@le+324-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le+332-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 102
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr
 entry:
@@ -90,36 +90,36 @@ define signext i32 @StoreArrays2() {
 ; SMALL-LOCAL-EXEC-SMALLCM64: # %bb.0: # %entry
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 2
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 1
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS2[TL]@le+320-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 3
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS3[TL]@le+324-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r4, L..C0(r2) # target-flags(ppc-tprel) @mySmallLocalExecTLSv2
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: add r4, r13, r4
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 0(r4)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 4
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, 24(r4)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 88
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r3, mySmallLocalExecTLS4[TL]@le+328-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 102
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: stw r4, mySmallLocalExecTLS5[TL]@le+332-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: blr
 ;
 ; SMALL-LOCAL-EXEC-LARGECM64-LABEL: StoreArrays2:
 ; SMALL-LOCAL-EXEC-LARGECM64: # %bb.0: # %entry
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 2
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 3
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS2[TL]@le+320)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS2[TL]@le+320-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addis r3, L..C0@u(r2)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r3, L..C0@l(r3)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS3[TL]@le+324)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLS3[TL]@le+324-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 1
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: add r3, r13, r3
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 0(r3)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 4
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, 24(r3)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 88
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, (mySmallLocalExecTLS4[TL]@le+328)-65536(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, (mySmallLocalExecTLS5[TL]@le+332)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r4, mySmallLocalExecTLS4[TL]@le+328-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: stw r3, mySmallLocalExecTLS5[TL]@le+332-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 102
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
index f7b99461be5f3..7a6db3273421f 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-local-exec-tls-largeaccess2.ll
@@ -42,8 +42,8 @@ define i64 @StoreLargeAccess1() {
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r4, 100
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS3[TL]@le+2000(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 882
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, (mySmallLocalExecTLS4[TL]@le+6800)-65536(r13)
-; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r4, mySmallLocalExecTLS4[TL]@le+6800-65536(r13)
+; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: std r3, mySmallLocalExecTLS5[TL]@le+8400-65536(r13)
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: li r3, 1191
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: addi r1, r1, 48
 ; SMALL-LOCAL-EXEC-SMALLCM64-NEXT: ld r0, 16(r1)
@@ -70,8 +70,8 @@ define i64 @StoreLargeAccess1() {
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r4, 100
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS3[TL]@le+2000(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 882
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r4, (mySmallLocalExecTLS4[TL]@le+6800)-65536(r13)
-; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, (mySmallLocalExecTLS5[TL]@le+8400)-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r4, mySmallLocalExecTLS4[TL]@le+6800-65536(r13)
+; SMALL-LOCAL-EXEC-LARGECM64-NEXT: std r3, mySmallLocalExecTLS5[TL]@le+8400-65536(r13)
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: li r3, 1191
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: addi r1, r1, 48
 ; SMALL-LOCAL-EXEC-LARGECM64-NEXT: ld r0, 16(r1)
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll
index 91a2283897f33..c9a9f36bd1634 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-funcattr.ll
@@ -27,9 +27,9 @@ define i64 @StoreLargeAccess1() #1 {
 ; COMMONCM-NEXT: stdx r5, r3, r4
 ; COMMONCM-NEXT: li r3, 55
 ; COMMONCM-NEXT: li r4, 64
-; COMMONCM-NEXT: std r3, (mySmallTLS2[TL]@le+696)-65536(r13)
+; COMMONCM-NEXT: std r3, mySmallTLS2[TL]@le+696-65536(r13)
 ; COMMONCM-NEXT: li r3, 142
-; COMMONCM-NEXT: std r4, (mySmallTLS3[TL]@le+20000)-131072(r13)
+; COMMONCM-NEXT: std r4, mySmallTLS3[TL]@le+20000-131072(r13)
 ; COMMONCM-NEXT: blr
 entry:
 %tls0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS)
diff --git a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll
index 1e4a3b9bcc47c..3029c85bb5fa7 100644
--- a/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-small-tls-globalvarattr-targetattr.ll
@@ -29,9 +29,9 @@ define i64 @StoreLargeAccess1() {
 ; COMMONCM-NEXT: stdx r5, r3, r4
 ; COMMONCM-NEXT: li r3, 55
 ; COMMONCM-NEXT: li r4, 64
-; COMMONCM-NEXT: std r3, (mySmallTLS2[TL]@le+696)-65536(r13)
+; COMMONCM-NEXT: std r3, mySmallTLS2[TL]@le+696-65536(r13)
 ; COMMONCM-NEXT: li r3, 142
-; COMMONCM-NEXT: std r4, (mySmallTLS3[TL]@le+20000)-131072(r13)
+; COMMONCM-NEXT: std r4, mySmallTLS3[TL]@le+20000-131072(r13)
 ; COMMONCM-NEXT: blr
 entry:
 %tls0 = tail call align 8 ptr @llvm.threadlocal.address.p0(ptr align 8 @mySmallTLS)
diff --git a/llvm/test/CodeGen/RISCV/dso_local_equivalent.ll b/llvm/test/CodeGen/RISCV/dso_local_equivalent.ll
index 1ee8b1f78110b..e5e69b7bfe13b 100644
--- a/llvm/test/CodeGen/RISCV/dso_local_equivalent.ll
+++ b/llvm/test/CodeGen/RISCV/dso_local_equivalent.ll
@@ -26,9 +26,9 @@ declare void @extern_func()
 ; CHECK-NEXT: .word 0 # 0x0
 ; CHECK-NEXT: .word %pltpcrel(f0)
 ; CHECK-NEXT: .word %pltpcrel(f1+4)
-; CHECK-NEXT: .word (f2-_ZTV1B)-8
+; CHECK-NEXT: .word f2-_ZTV1B-8
 ; CHECK-NEXT: .word %pltpcrel(f3+12)
-; CHECK-NEXT: .word (f4-_ZTV1B)-8
+; CHECK-NEXT: .word f4-_ZTV1B-8
 ; CHECK-NEXT: .size _ZTV1B, 28
 declare void @f0()
 declare void @f1()
diff --git a/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll b/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll
index a432fc5e7e530..d2dceb773b2e9 100644
--- a/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll
+++ b/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll
@@ -16,6 +16,6 @@ declare void @fn3()
 ; CHECK-NEXT: .word 0 # 0x0
 ; CHECK-NEXT: .word %pltpcrel(fn1)
 ; CHECK-NEXT: .word %pltpcrel(fn2+4)
-; CHECK-NEXT: .word (fn3-vtable)-4
-; CHECK-NEXT: .word (global4-vtable)-4
+; CHECK-NEXT: .word fn3-vtable-4
+; CHECK-NEXT: .word global4-vtable-4
 ; CHECK-NEXT: .size vtable, 20
diff --git a/llvm/test/CodeGen/Thumb2/tpsoft.ll b/llvm/test/CodeGen/Thumb2/tpsoft.ll
index 2454bd0a98580..7222fcfa532c5 100644
--- a/llvm/test/CodeGen/Thumb2/tpsoft.ll
+++ b/llvm/test/CodeGen/Thumb2/tpsoft.ll
@@ -47,7 +47,7 @@ define arm_aapcs_vfpcc i32 @main() nounwind {
 ; ELFASM-NEXT: @ %bb.4:
 ; ELFASM-NEXT: .LCPI0_0:
 ; ELFASM-NEXT: .Ltmp0:
-; ELFASM-NEXT: .long i(GOTTPOFF)-((.LPC0_0+4)-.Ltmp0)
+; ELFASM-NEXT: .long i(GOTTPOFF)-(.LPC0_0+4-.Ltmp0)
 entry:
 %0 = load i32, ptr @i, align 4
 switch i32 %0, label %bb2 [
diff --git a/llvm/test/CodeGen/X86/abi-isel.ll b/llvm/test/CodeGen/X86/abi-isel.ll
index 9f0d89f20ceb4..2ac392c729d19 100644
--- a/llvm/test/CodeGen/X86/abi-isel.ll
+++ b/llvm/test/CodeGen/X86/abi-isel.ll
@@ -1656,8 +1656,8 @@ define dso_local void @qux03() nounwind {
 ; DARWIN-32-PIC-NEXT: calll L18$pb
 ; DARWIN-32-PIC-NEXT: L18$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: movl (_dsrc-L18$pb)+64(%eax), %ecx
-; DARWIN-32-PIC-NEXT: movl %ecx, (_ddst-L18$pb)+64(%eax)
+; DARWIN-32-PIC-NEXT: movl _dsrc-L18$pb+64(%eax), %ecx
+; DARWIN-32-PIC-NEXT: movl %ecx, _ddst-L18$pb+64(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: qux03:
@@ -1727,7 +1727,7 @@ define dso_local void @qux04() nounwind {
 ; DARWIN-32-PIC-NEXT: calll L19$pb
 ; DARWIN-32-PIC-NEXT: L19$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: leal (_ddst-L19$pb)+64(%eax), %ecx
+; DARWIN-32-PIC-NEXT: leal _ddst-L19$pb+64(%eax), %ecx
 ; DARWIN-32-PIC-NEXT: movl %ecx, _dptr-L19$pb(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
@@ -1807,7 +1807,7 @@ define dso_local void @qux05() nounwind {
 ; DARWIN-32-PIC-NEXT: calll L20$pb
 ; DARWIN-32-PIC-NEXT: L20$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: movl (_dsrc-L20$pb)+64(%eax), %ecx
+; DARWIN-32-PIC-NEXT: movl _dsrc-L20$pb+64(%eax), %ecx
 ; DARWIN-32-PIC-NEXT: movl _dptr-L20$pb(%eax), %eax
 ; DARWIN-32-PIC-NEXT: movl %ecx, 64(%eax)
 ; DARWIN-32-PIC-NEXT: retl
@@ -1888,8 +1888,8 @@ define dso_local void @qux06() nounwind {
 ; DARWIN-32-PIC-NEXT: calll L21$pb
 ; DARWIN-32-PIC-NEXT: L21$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: movl (_lsrc-L21$pb)+64(%eax), %ecx
-; DARWIN-32-PIC-NEXT: movl %ecx, (_ldst-L21$pb)+64(%eax)
+; DARWIN-32-PIC-NEXT: movl _lsrc-L21$pb+64(%eax), %ecx
+; DARWIN-32-PIC-NEXT: movl %ecx, _ldst-L21$pb+64(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: qux06:
@@ -1959,7 +1959,7 @@ define dso_local void @qux07() nounwind {
 ; DARWIN-32-PIC-NEXT: calll L22$pb
 ; DARWIN-32-PIC-NEXT: L22$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: leal (_ldst-L22$pb)+64(%eax), %ecx
+; DARWIN-32-PIC-NEXT: leal _ldst-L22$pb+64(%eax), %ecx
 ; DARWIN-32-PIC-NEXT: movl %ecx, _lptr-L22$pb(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
@@ -2039,7 +2039,7 @@ define dso_local void @qux08() nounwind {
 ; DARWIN-32-PIC-NEXT: calll L23$pb
 ; DARWIN-32-PIC-NEXT: L23$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: movl (_lsrc-L23$pb)+64(%eax), %ecx
+; DARWIN-32-PIC-NEXT: movl _lsrc-L23$pb+64(%eax), %ecx
 ; DARWIN-32-PIC-NEXT: movl _lptr-L23$pb(%eax), %eax
 ; DARWIN-32-PIC-NEXT: movl %ecx, 64(%eax)
 ; DARWIN-32-PIC-NEXT: retl
@@ -3887,8 +3887,8 @@ define dso_local void @off03(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L42$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: movl (_dsrc-L42$pb)+64(%eax,%ecx,4), %edx
-; DARWIN-32-PIC-NEXT: movl %edx, (_ddst-L42$pb)+64(%eax,%ecx,4)
+; DARWIN-32-PIC-NEXT: movl _dsrc-L42$pb+64(%eax,%ecx,4), %edx
+; DARWIN-32-PIC-NEXT: movl %edx, _ddst-L42$pb+64(%eax,%ecx,4)
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: off03:
@@ -3977,7 +3977,7 @@ define dso_local void @off04(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L43$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: leal (_ddst-L43$pb)+64(%eax,%ecx,4), %ecx
+; DARWIN-32-PIC-NEXT: leal _ddst-L43$pb+64(%eax,%ecx,4), %ecx
 ; DARWIN-32-PIC-NEXT: movl %ecx, _dptr-L43$pb(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
@@ -4068,7 +4068,7 @@ define dso_local void @off05(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L44$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: movl (_dsrc-L44$pb)+64(%eax,%ecx,4), %edx
+; DARWIN-32-PIC-NEXT: movl _dsrc-L44$pb+64(%eax,%ecx,4), %edx
 ; DARWIN-32-PIC-NEXT: movl _dptr-L44$pb(%eax), %eax
 ; DARWIN-32-PIC-NEXT: movl %edx, 64(%eax,%ecx,4)
 ; DARWIN-32-PIC-NEXT: retl
@@ -4161,8 +4161,8 @@ define dso_local void @off06(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L45$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: movl (_lsrc-L45$pb)+64(%eax,%ecx,4), %edx
-; DARWIN-32-PIC-NEXT: movl %edx, (_ldst-L45$pb)+64(%eax,%ecx,4)
+; DARWIN-32-PIC-NEXT: movl _lsrc-L45$pb+64(%eax,%ecx,4), %edx
+; DARWIN-32-PIC-NEXT: movl %edx, _ldst-L45$pb+64(%eax,%ecx,4)
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: off06:
@@ -4251,7 +4251,7 @@ define dso_local void @off07(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L46$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: leal (_ldst-L46$pb)+64(%eax,%ecx,4), %ecx
+; DARWIN-32-PIC-NEXT: leal _ldst-L46$pb+64(%eax,%ecx,4), %ecx
 ; DARWIN-32-PIC-NEXT: movl %ecx, _lptr-L46$pb(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
@@ -4342,7 +4342,7 @@ define dso_local void @off08(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L47$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: movl (_lsrc-L47$pb)+64(%eax,%ecx,4), %edx
+; DARWIN-32-PIC-NEXT: movl _lsrc-L47$pb+64(%eax,%ecx,4), %edx
 ; DARWIN-32-PIC-NEXT: movl _lptr-L47$pb(%eax), %eax
 ; DARWIN-32-PIC-NEXT: movl %edx, 64(%eax,%ecx,4)
 ; DARWIN-32-PIC-NEXT: retl
@@ -4711,8 +4711,8 @@ define dso_local void @moo03(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: calll L51$pb
 ; DARWIN-32-PIC-NEXT: L51$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: movl (_dsrc-L51$pb)+262144(%eax), %ecx
-; DARWIN-32-PIC-NEXT: movl %ecx, (_ddst-L51$pb)+262144(%eax)
+; DARWIN-32-PIC-NEXT: movl _dsrc-L51$pb+262144(%eax), %ecx
+; DARWIN-32-PIC-NEXT: movl %ecx, _ddst-L51$pb+262144(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: moo03:
@@ -4782,7 +4782,7 @@ define dso_local void @moo04(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: calll L52$pb
 ; DARWIN-32-PIC-NEXT: L52$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: leal (_ddst-L52$pb)+262144(%eax), %ecx
+; DARWIN-32-PIC-NEXT: leal _ddst-L52$pb+262144(%eax), %ecx
 ; DARWIN-32-PIC-NEXT: movl %ecx, _dptr-L52$pb(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
@@ -4862,7 +4862,7 @@ define dso_local void @moo05(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: calll L53$pb
 ; DARWIN-32-PIC-NEXT: L53$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: movl (_dsrc-L53$pb)+262144(%eax), %ecx
+; DARWIN-32-PIC-NEXT: movl _dsrc-L53$pb+262144(%eax), %ecx
 ; DARWIN-32-PIC-NEXT: movl _dptr-L53$pb(%eax), %eax
 ; DARWIN-32-PIC-NEXT: movl %ecx, 262144(%eax)
 ; DARWIN-32-PIC-NEXT: retl
@@ -4943,8 +4943,8 @@ define dso_local void @moo06(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: calll L54$pb
 ; DARWIN-32-PIC-NEXT: L54$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
-; DARWIN-32-PIC-NEXT: movl (_lsrc-L54$pb)+262144(%eax), %ecx
-; DARWIN-32-PIC-NEXT: movl %ecx, (_ldst-L54$pb)+262144(%eax)
+; DARWIN-32-PIC-NEXT: movl _lsrc-L54$pb+262144(%eax), %ecx
+; DARWIN-32-PIC-NEXT: movl %ecx, _ldst-L54$pb+262144(%eax)
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: moo06:
@@ -5014,7 +5014,7 @@ define dso_local void @moo07(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: calll L55$pb
 ;
DARWIN-32-PIC-NEXT: L55$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_ldst-L55$pb)+262144(%eax), %ecx +; DARWIN-32-PIC-NEXT: leal _ldst-L55$pb+262144(%eax), %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _lptr-L55$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; @@ -5094,7 +5094,7 @@ define dso_local void @moo08(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: calll L56$pb ; DARWIN-32-PIC-NEXT: L56$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: movl (_lsrc-L56$pb)+262144(%eax), %ecx +; DARWIN-32-PIC-NEXT: movl _lsrc-L56$pb+262144(%eax), %ecx ; DARWIN-32-PIC-NEXT: movl _lptr-L56$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl %ecx, 262144(%eax) ; DARWIN-32-PIC-NEXT: retl @@ -5488,8 +5488,8 @@ define dso_local void @big03(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L60$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: movl (_dsrc-L60$pb)+262144(%eax,%ecx,4), %edx -; DARWIN-32-PIC-NEXT: movl %edx, (_ddst-L60$pb)+262144(%eax,%ecx,4) +; DARWIN-32-PIC-NEXT: movl _dsrc-L60$pb+262144(%eax,%ecx,4), %edx +; DARWIN-32-PIC-NEXT: movl %edx, _ddst-L60$pb+262144(%eax,%ecx,4) ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: big03: @@ -5578,7 +5578,7 @@ define dso_local void @big04(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L61$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal (_ddst-L61$pb)+262144(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal _ddst-L61$pb+262144(%eax,%ecx,4), %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _dptr-L61$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; @@ -5669,7 +5669,7 @@ define dso_local void @big05(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L62$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: movl (_dsrc-L62$pb)+262144(%eax,%ecx,4), %edx +; DARWIN-32-PIC-NEXT: movl _dsrc-L62$pb+262144(%eax,%ecx,4), %edx ; DARWIN-32-PIC-NEXT: movl _dptr-L62$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl %edx, 262144(%eax,%ecx,4) ; DARWIN-32-PIC-NEXT: retl @@ -5762,8 +5762,8 @@ define dso_local void @big06(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L63$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: movl (_lsrc-L63$pb)+262144(%eax,%ecx,4), %edx -; DARWIN-32-PIC-NEXT: movl %edx, (_ldst-L63$pb)+262144(%eax,%ecx,4) +; DARWIN-32-PIC-NEXT: movl _lsrc-L63$pb+262144(%eax,%ecx,4), %edx +; DARWIN-32-PIC-NEXT: movl %edx, _ldst-L63$pb+262144(%eax,%ecx,4) ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: big06: @@ -5852,7 +5852,7 @@ define dso_local void @big07(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L64$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal (_ldst-L64$pb)+262144(%eax,%ecx,4), %ecx +; DARWIN-32-PIC-NEXT: leal _ldst-L64$pb+262144(%eax,%ecx,4), %ecx ; DARWIN-32-PIC-NEXT: movl %ecx, _lptr-L64$pb(%eax) ; DARWIN-32-PIC-NEXT: retl ; @@ -5943,7 +5943,7 @@ define dso_local void @big08(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L65$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: movl (_lsrc-L65$pb)+262144(%eax,%ecx,4), %edx +; DARWIN-32-PIC-NEXT: movl _lsrc-L65$pb+262144(%eax,%ecx,4), %edx ; DARWIN-32-PIC-NEXT: movl _lptr-L65$pb(%eax), %eax ; DARWIN-32-PIC-NEXT: movl %edx, 262144(%eax,%ecx,4) ; DARWIN-32-PIC-NEXT: retl @@ -7787,7 +7787,7 @@ define dso_local ptr @bat03() nounwind { ; DARWIN-32-PIC-NEXT: calll L93$pb ; DARWIN-32-PIC-NEXT: L93$pb: ; 
DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_dsrc-L93$pb)+64(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _dsrc-L93$pb+64(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bat03: @@ -7850,7 +7850,7 @@ define dso_local ptr @bat04() nounwind { ; DARWIN-32-PIC-NEXT: calll L94$pb ; DARWIN-32-PIC-NEXT: L94$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_ddst-L94$pb)+64(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _ddst-L94$pb+64(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bat04: @@ -7988,7 +7988,7 @@ define dso_local ptr @bat06() nounwind { ; DARWIN-32-PIC-NEXT: calll L96$pb ; DARWIN-32-PIC-NEXT: L96$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_lsrc-L96$pb)+64(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _lsrc-L96$pb+64(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bat06: @@ -8051,7 +8051,7 @@ define dso_local ptr @bat07() nounwind { ; DARWIN-32-PIC-NEXT: calll L97$pb ; DARWIN-32-PIC-NEXT: L97$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_ldst-L97$pb)+64(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _ldst-L97$pb+64(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bat07: @@ -8485,7 +8485,7 @@ define dso_local ptr @bam03() nounwind { ; DARWIN-32-PIC-NEXT: calll L103$pb ; DARWIN-32-PIC-NEXT: L103$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_dsrc-L103$pb)+262144(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _dsrc-L103$pb+262144(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bam03: @@ -8548,7 +8548,7 @@ define dso_local ptr @bam04() nounwind { ; DARWIN-32-PIC-NEXT: calll L104$pb ; DARWIN-32-PIC-NEXT: L104$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_ddst-L104$pb)+262144(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _ddst-L104$pb+262144(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bam04: @@ -8686,7 +8686,7 @@ define dso_local ptr @bam06() nounwind { ; DARWIN-32-PIC-NEXT: calll L106$pb ; DARWIN-32-PIC-NEXT: L106$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_lsrc-L106$pb)+262144(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _lsrc-L106$pb+262144(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bam06: @@ -8749,7 +8749,7 @@ define dso_local ptr @bam07() nounwind { ; DARWIN-32-PIC-NEXT: calll L107$pb ; DARWIN-32-PIC-NEXT: L107$pb: ; DARWIN-32-PIC-NEXT: popl %eax -; DARWIN-32-PIC-NEXT: leal (_ldst-L107$pb)+262144(%eax), %eax +; DARWIN-32-PIC-NEXT: leal _ldst-L107$pb+262144(%eax), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: bam07: @@ -9294,7 +9294,7 @@ define dso_local ptr @cat03(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L114$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal (_dsrc-L114$pb)+64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal _dsrc-L114$pb+64(%eax,%ecx,4), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat03: @@ -9368,7 +9368,7 @@ define dso_local ptr @cat04(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L115$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; DARWIN-32-PIC-NEXT: leal (_ddst-L115$pb)+64(%eax,%ecx,4), %eax +; DARWIN-32-PIC-NEXT: leal _ddst-L115$pb+64(%eax,%ecx,4), %eax ; DARWIN-32-PIC-NEXT: retl ; ; DARWIN-64-STATIC-LABEL: cat04: @@ -9523,7 +9523,7 @@ define dso_local ptr @cat06(i64 %i) nounwind { ; DARWIN-32-PIC-NEXT: L117$pb: ; DARWIN-32-PIC-NEXT: popl %eax ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
-; DARWIN-32-PIC-NEXT: leal (_lsrc-L117$pb)+64(%eax,%ecx,4), %eax
+; DARWIN-32-PIC-NEXT: leal _lsrc-L117$pb+64(%eax,%ecx,4), %eax
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: cat06:
@@ -9597,7 +9597,7 @@ define dso_local ptr @cat07(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L118$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: leal (_ldst-L118$pb)+64(%eax,%ecx,4), %eax
+; DARWIN-32-PIC-NEXT: leal _ldst-L118$pb+64(%eax,%ecx,4), %eax
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: cat07:
@@ -10153,7 +10153,7 @@ define dso_local ptr @cam03(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L125$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: leal (_dsrc-L125$pb)+262144(%eax,%ecx,4), %eax
+; DARWIN-32-PIC-NEXT: leal _dsrc-L125$pb+262144(%eax,%ecx,4), %eax
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: cam03:
@@ -10227,7 +10227,7 @@ define dso_local ptr @cam04(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L126$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: leal (_ddst-L126$pb)+262144(%eax,%ecx,4), %eax
+; DARWIN-32-PIC-NEXT: leal _ddst-L126$pb+262144(%eax,%ecx,4), %eax
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: cam04:
@@ -10382,7 +10382,7 @@ define dso_local ptr @cam06(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L128$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: leal (_lsrc-L128$pb)+262144(%eax,%ecx,4), %eax
+; DARWIN-32-PIC-NEXT: leal _lsrc-L128$pb+262144(%eax,%ecx,4), %eax
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: cam06:
@@ -10456,7 +10456,7 @@ define dso_local ptr @cam07(i64 %i) nounwind {
 ; DARWIN-32-PIC-NEXT: L129$pb:
 ; DARWIN-32-PIC-NEXT: popl %eax
 ; DARWIN-32-PIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; DARWIN-32-PIC-NEXT: leal (_ldst-L129$pb)+262144(%eax,%ecx,4), %eax
+; DARWIN-32-PIC-NEXT: leal _ldst-L129$pb+262144(%eax,%ecx,4), %eax
 ; DARWIN-32-PIC-NEXT: retl
 ;
 ; DARWIN-64-STATIC-LABEL: cam07:
diff --git a/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll b/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll
index f8cbd0a6a9ee0..362135cb1808b 100644
--- a/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll
+++ b/llvm/test/CodeGen/X86/atomic-minmax-i6432.ll
@@ -265,7 +265,7 @@ define void @tf_bug(ptr %ptr) nounwind {
 ; PIC-NEXT: L4$pb:
 ; PIC-NEXT: popl %edi
 ; PIC-NEXT: movl {{[0-9]+}}(%esp), %esi
-; PIC-NEXT: movl (_id-L4$pb)+4(%edi), %edx
+; PIC-NEXT: movl _id-L4$pb+4(%edi), %edx
 ; PIC-NEXT: movl _id-L4$pb(%edi), %eax
 ; PIC-NEXT: .p2align 4
 ; PIC-NEXT: LBB4_1: ## %atomicrmw.start
diff --git a/llvm/test/CodeGen/X86/callbr-asm-instr-scheduling.ll b/llvm/test/CodeGen/X86/callbr-asm-instr-scheduling.ll
index 01f3a6fcab1fb..1d1a010b95573 100644
--- a/llvm/test/CodeGen/X86/callbr-asm-instr-scheduling.ll
+++ b/llvm/test/CodeGen/X86/callbr-asm-instr-scheduling.ll
@@ -36,7 +36,7 @@ define i64 @early_ioremap_pmd(i64 %addr) {
 ; CHECK-NEXT: .Ltmp0:
 ; CHECK-NEXT: jmp .Ltmp1
 ; CHECK-NEXT: .Ltmp2:
-; CHECK-NEXT: .zero (-(((.Ltmp3-.Ltmp4)-(.Ltmp2-.Ltmp0))>0))*((.Ltmp3-.Ltmp4)-(.Ltmp2-.Ltmp0)),144
+; CHECK-NEXT: .zero -((.Ltmp3-.Ltmp4-(.Ltmp2-.Ltmp0))>0)*(.Ltmp3-.Ltmp4-(.Ltmp2-.Ltmp0)),144
 ; CHECK-NEXT: .Ltmp5:
 entry:
 %0 = tail call i64 asm sideeffect "mov %cr3,$0\0A\09", "=r,=*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i64) nonnull @__force_order)
diff --git a/llvm/test/CodeGen/X86/inline-asm-i-constraint-i1.ll b/llvm/test/CodeGen/X86/inline-asm-i-constraint-i1.ll
index 4be7d18f8e66f..02cfec9fdfba6 100644
--- a/llvm/test/CodeGen/X86/inline-asm-i-constraint-i1.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-i-constraint-i1.ll
@@ -2,7 +2,7 @@
 ; Make sure that boolean immediates are properly (zero) extended.
 ; CHECK: .Ltmp[[N:[0-9]+]]:
-; CHECK-NEXT: .quad (42+1)-.Ltmp[[N]]
+; CHECK-NEXT: .quad 42+1-.Ltmp[[N]]
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/X86/relptr-rodata.ll b/llvm/test/CodeGen/X86/relptr-rodata.ll
index 0ca60b8db157a..878151efccc3b 100644
--- a/llvm/test/CodeGen/X86/relptr-rodata.ll
+++ b/llvm/test/CodeGen/X86/relptr-rodata.ll
@@ -24,7 +24,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NEXT: .globl obj
 ; CHECK: obj:
 ; CHECK: .long 0
-; CHECK: .long (hidden_func-obj)-4
+; CHECK: .long hidden_func-obj-4
 declare hidden void @hidden_func()
diff --git a/llvm/test/CodeGen/X86/wineh-coreclr.ll b/llvm/test/CodeGen/X86/wineh-coreclr.ll
index d30f14e272fcb..baf5eaa29d281 100644
--- a/llvm/test/CodeGen/X86/wineh-coreclr.ll
+++ b/llvm/test/CodeGen/X86/wineh-coreclr.ll
@@ -166,9 +166,9 @@ tail:
 ; Clause 1: call f(2) is guarded by catch1
 ; CHECK-NEXT: .long 0
 ; ^ flags (0 => catch handler)
-; CHECK-NEXT: .long ([[test1_before_f2]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_before_f2]]-[[test1_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test1_after_f2]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_after_f2]]-[[test1_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test1_catch1]]-[[test1_begin]]
 ; ^ offset of start of handler
@@ -179,9 +179,9 @@ tail:
 ; Clause 2: call f(2) is also guarded by catch2
 ; CHECK-NEXT: .long 0
 ; ^ flags (0 => catch handler)
-; CHECK-NEXT: .long ([[test1_before_f2]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_before_f2]]-[[test1_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test1_after_f2]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_after_f2]]-[[test1_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test1_catch2]]-[[test1_begin]]
 ; ^ offset of start of handler
@@ -192,9 +192,9 @@ tail:
 ; Clause 3: calls f(1) and f(2) are guarded by finally
 ; CHECK-NEXT: .long 2
 ; ^ flags (2 => finally handler)
-; CHECK-NEXT: .long ([[test1_before_f1]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_before_f1]]-[[test1_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test1_after_f2]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_after_f2]]-[[test1_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
 ; ^ offset of start of handler
@@ -208,9 +208,9 @@ tail:
 ; is the main function, not that funclet.
 ; CHECK-NEXT: .long 10
 ; ^ flags (2 => finally handler | 8 => duplicate)
-; CHECK-NEXT: .long ([[test1_before_f3]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_before_f3]]-[[test1_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test1_after_f3]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_after_f3]]-[[test1_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
 ; ^ offset of start of handler
@@ -221,9 +221,9 @@ tail:
 ; Clause 5: call f(5) is guarded by fault
 ; CHECK-NEXT: .long 4
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test1_before_f5]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_before_f5]]-[[test1_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test1_after_f5]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_after_f5]]-[[test1_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test1_fault]]-[[test1_begin]]
 ; ^ offset of start of handler
@@ -237,9 +237,9 @@ tail:
 ; is the main function, not that funclet.
 ; CHECK-NEXT: .long 10
 ; ^ flags (2 => finally handler | 8 => duplicate)
-; CHECK-NEXT: .long ([[test1_before_f4]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_before_f4]]-[[test1_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test1_after_f5]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_after_f5]]-[[test1_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
 ; ^ offset of start of handler
@@ -253,9 +253,9 @@ tail:
 ; is the main function, not that funclet.
 ; CHECK-NEXT: .long 10
 ; ^ flags (2 => finally handler | 8 => duplicate)
-; CHECK-NEXT: .long ([[test1_before_f6]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_before_f6]]-[[test1_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test1_after_f6]]-[[test1_begin]])+1
+; CHECK-NEXT: .long [[test1_after_f6]]-[[test1_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test1_finally]]-[[test1_begin]]
 ; ^ offset of start of handler
@@ -343,9 +343,9 @@ unreachable:
 ; Clause 1: call f(1) is guarded by fault
 ; CHECK-NEXT: .long 4
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test2_before_f1]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_before_f1]]-[[test2_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test2_after_f1]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_after_f1]]-[[test2_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test2_fault]]-[[test2_begin]]
 ; ^ offset of start of handler
@@ -356,9 +356,9 @@ unreachable:
 ; Clause 2: call f(1) is also guarded by catch2
 ; CHECK-NEXT: .long 0
 ; ^ flags (0 => catch handler)
-; CHECK-NEXT: .long ([[test2_before_f1]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_before_f1]]-[[test2_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test2_after_f1]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_after_f1]]-[[test2_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test2_catch2]]-[[test2_begin]]
 ; ^ offset of start of handler
@@ -369,9 +369,9 @@ unreachable:
 ; Clause 3: calls f(2) is guarded by catch1
 ; CHECK-NEXT: .long 0
 ; ^ flags (0 => catch handler)
-; CHECK-NEXT: .long ([[test2_before_f2]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_before_f2]]-[[test2_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test2_after_f2]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_after_f2]]-[[test2_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test2_catch1]]-[[test2_begin]]
 ; ^ offset of start of handler
@@ -385,9 +385,9 @@ unreachable:
 ; is the main function, not that funclet.
 ; CHECK-NEXT: .long 8
 ; ^ flags (0 => catch handler | 8 => duplicate)
-; CHECK-NEXT: .long ([[test2_before_f2]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_before_f2]]-[[test2_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test2_after_f2]]-[[test2_begin]])+1
+; CHECK-NEXT: .long [[test2_after_f2]]-[[test2_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test2_catch2]]-[[test2_begin]]
 ; ^ offset of start of handler
@@ -559,9 +559,9 @@ unreachable:
 ; Clause 1: call f(1) is guarded by fault1
 ; CHECK-NEXT: .long 4
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test3_before_f1]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f1]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f1]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f1]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault1]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -572,9 +572,9 @@ unreachable:
 ; Clause 3: call f(6) is guarded by catch1
 ; CHECK-NEXT: .long 0
 ; ^ flags (0 => catch handler)
-; CHECK-NEXT: .long ([[test3_before_f6]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f6]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f6]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f6]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_catch1]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -585,9 +585,9 @@ unreachable:
 ; Clause 3: call f(6) is also guarded by catch2
 ; CHECK-NEXT: .long 0
 ; ^ flags (0 => catch handler)
-; CHECK-NEXT: .long ([[test3_before_f6]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f6]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f6]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f6]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_catch2]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -601,9 +601,9 @@ unreachable:
 ; is fault1, not that funclet.
 ; CHECK-NEXT: .long 12
 ; ^ flags (4 => fault handler | 8 => duplicate)
-; CHECK-NEXT: .long ([[test3_before_f7]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f7]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f7]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f7]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -614,9 +614,9 @@ unreachable:
 ; Clause 5: call f(4) is guarded by fault4
 ; CHECK-NEXT: .long 4
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test3_before_f4]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f4]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f4]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f4]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault4]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -630,9 +630,9 @@ unreachable:
 ; is fault1, not that funclet.
 ; CHECK-NEXT: .long 12
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test3_before_f4]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f4]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f4]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f4]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -643,9 +643,9 @@ unreachable:
 ; Clause 7: call f(3) is guarded by fault3
 ; CHECK-NEXT: .long 4
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test3_before_f3]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f3]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f3]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f3]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault3]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -659,9 +659,9 @@ unreachable:
 ; is fault1, not that funclet.
 ; CHECK-NEXT: .long 12
 ; ^ flags (4 => fault handler | 8 => duplicate)
-; CHECK-NEXT: .long ([[test3_before_f3]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f3]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f3]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f3]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -672,9 +672,9 @@ unreachable:
 ; Clause 9: call f(2) is guarded by fault2
 ; CHECK-NEXT: .long 4
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test3_before_f2]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f2]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f2]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f2]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault2]]-[[test3_begin]]
 ; ^ offset of start of handler
@@ -685,9 +685,9 @@ unreachable:
 ; Clause 10: call f(2) is guarded by fault5
 ; CHECK-NEXT: .long 4
 ; ^ flags (4 => fault handler)
-; CHECK-NEXT: .long ([[test3_before_f2]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_before_f2]]-[[test3_begin]]+1
 ; ^ offset of start of clause
-; CHECK-NEXT: .long ([[test3_after_f2]]-[[test3_begin]])+1
+; CHECK-NEXT: .long [[test3_after_f2]]-[[test3_begin]]+1
 ; ^ offset of end of clause
 ; CHECK-NEXT: .long [[test3_fault5]]-[[test3_begin]]
 ; ^ offset of start of handler
diff --git a/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll b/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll
index f949c83efd03f..54736c94af248 100644
--- a/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll
+++ b/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll
@@ -13,7 +13,7 @@ declare void @fn3()
 @global4 = external unnamed_addr global i8
 ; CHECK: .long 0
-; CHECK-NEXT: .long (fn1@PLT-vtable)-4
-; CHECK-NEXT: .long (fn2@PLT-vtable)-4
-; CHECK-NEXT: .long (fn3-vtable)-4
-; CHECK-NEXT: .long (global4-vtable)-4
+; CHECK-NEXT: .long fn1@PLT-vtable-4
+; CHECK-NEXT: .long fn2@PLT-vtable-4
+; CHECK-NEXT: .long fn3-vtable-4
+; CHECK-NEXT: .long global4-vtable-4
diff --git a/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll b/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll
index 8c86cd29d1c81..d5e80285b160d 100644
--- a/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll
+++ b/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll
@@ -11,6 +11,6 @@ declare void @fn2() unnamed_addr
 declare void @fn3()
 ; CHECK: .long 0
-; CHECK-NEXT: .long (fn1@PLT-vtable)-4
-; CHECK-NEXT: .long (fn2@PLT-vtable)-4
-; CHECK-NEXT: .long (fn3-vtable)-4
+; CHECK-NEXT: .long fn1@PLT-vtable-4
+; CHECK-NEXT: .long fn2@PLT-vtable-4
+; CHECK-NEXT: .long fn3-vtable-4
diff --git a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s
index 057b298a0a0df..bed85bcc5798b 100644
--- a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s
+++ b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s
@@ -91,25 +91,25 @@ _g9:
 .quad ("_g 7" + 7)@AUTH(ia,16)
 .quad 0
-// ASM: .xword (_g9@AUTH(ia,42))-(_g8@AUTH(ia,42))
+// ASM: .xword _g9@AUTH(ia,42)-_g8@AUTH(ia,42)
 .quad _g9@AUTH(ia,42) - _g8@AUTH(ia,42)
 .quad 0
 .ifdef ASMONLY
-// ASM: .xword (_g10@AUTH(ia,42))+1
+// ASM: .xword _g10@AUTH(ia,42)+1
 .quad _g10@AUTH(ia,42) + 1
-// ASM: .xword 1+(_g11@AUTH(ia,42))
+// ASM: .xword 1+_g11@AUTH(ia,42)
 .quad 1 + _g11@AUTH(ia,42)
-// ASM: .xword (1+(_g12@AUTH(ia,42)))+1
+// ASM: .xword 1+_g12@AUTH(ia,42)+1
 .quad 1 + _g12@AUTH(ia,42) + 1
-// ASM: .xword (_g13@AUTH(ia,42))+(_g14@AUTH(ia,42))
+// ASM: .xword _g13@AUTH(ia,42)+_g14@AUTH(ia,42)
 .quad _g13@AUTH(ia,42) + _g14@AUTH(ia,42)
-// ASM: .xword (_g9@AUTH(ia,42))-_g8
+// ASM: .xword _g9@AUTH(ia,42)-_g8
 .quad _g9@AUTH(ia,42) - _g8
 .quad 0
diff --git a/llvm/test/MC/AMDGPU/expressions.s b/llvm/test/MC/AMDGPU/expressions.s
index f917347a3bd79..d0ef0d5f93736 100644
--- a/llvm/test/MC/AMDGPU/expressions.s
+++ b/llvm/test/MC/AMDGPU/expressions.s
@@ -269,8 +269,8 @@ BB1:
 v_nop_e64
 BB2:
 s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1
-// VI: s_sub_u32 vcc_lo, vcc_lo, (BB2+4)-BB1 ; encoding: [0x6a,0xff,0xea,0x80,A,A,A,A]
-// VI-NEXT: ; fixup A - offset: 4, value: (BB2+4)-BB1, kind: FK_Data_4
+// VI: s_sub_u32 vcc_lo, vcc_lo, BB2+4-BB1 ; encoding: [0x6a,0xff,0xea,0x80,A,A,A,A]
+// VI-NEXT: ; fixup A - offset: 4, value: BB2+4-BB1, kind: FK_Data_4
 s_add_u32 vcc_lo, vcc_lo, (BB2-BB1)&4294967295
 // VI: s_add_u32 vcc_lo, vcc_lo, (BB2-BB1)&4294967295 ; encoding: [0x6a,0xff,0x6a,0x80,A,A,A,A]
 // VI-NEXT: ; fixup A - offset: 4, value: (BB2-BB1)&4294967295, kind: FK_Data_4
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
index 7f5240d649b7f..ac06e6177d321 100644
--- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
@@ -115,35 +115,35 @@ expr_defined:
 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
 // ASM-NEXT: .amdhsa_wavefront_size32 1
-// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&128)>>7
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&256)>>8
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&512)>>9
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1024)>>10
-// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&6144)>>11
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&6144)>>11
 // ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4
 // ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5
 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean
 // ASM-NEXT: .amdhsa_reserve_flat_scratch defined_boolean
 // ASM-NEXT: .amdhsa_reserve_xnack_mask 1
-// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12
-// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14
-// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16
-// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18
-// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21
-// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23
-// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26
-// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29
-// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30
-// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&786432)>>18
+// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&8388608)>>23
+// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&67108864)>>26
+// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&536870912)>>29
+// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&1073741824)>>30
+// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&2147483648)>>31
 // ASM-NEXT: .amdhsa_shared_vgpr_count 0
-// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&16777216)>>24
-// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&33554432)>>25
-// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&67108864)>>26
-// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&134217728)>>27
-// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&268435456)>>28
-// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&536870912)>>29
-// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1073741824)>>30
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1073741824)>>30
 // ASM-NEXT: .end_amdhsa_kernel
 // ASM: .set defined_value, 41
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
index b5b8a58b09a7f..8490f9bde2425 100644
--- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
@@ -113,34 +113,34 @@ expr_defined:
 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
 // ASM-NEXT: .amdhsa_wavefront_size32 1
-// ASM-NEXT: .amdhsa_enable_private_segment ((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&1
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&128)>>7
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&256)>>8
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&512)>>9
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&1024)>>10
-// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&6144)>>11
+// ASM-NEXT: .amdhsa_enable_private_segment ((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&6144)>>11
 // ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4
 // ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5
 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean
-// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12
-// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14
-// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16
-// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18
-// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21
-// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23
-// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26
-// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29
-// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30
-// ASM-NEXT: .amdhsa_forward_progress
(((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&786432)>>18 +// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&2097152)>>21 +// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&8388608)>>23 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&536870912)>>29 +// ASM-NEXT: 
.amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((3769368576|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&2147483648)>>31 // ASM-NEXT: .amdhsa_shared_vgpr_count 0 // ASM-NEXT: .amdhsa_inst_pref_size 0 -// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&16777216)>>24 -// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&33554432)>>25 -// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&67108864)>>26 -// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&134217728)>>27 -// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&268435456)>>28 -// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&536870912)>>29 -// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&1073741824)>>30 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&134217728)>>27 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&1073741824)>>30 // ASM-NEXT: .end_amdhsa_kernel // ASM: .set defined_value, 41 diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s index 0efd323ae9a34..ab1a5891ab22c 100644 --- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx12.s @@ -117,32 +117,32 @@ expr_defined: // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 // ASM-NEXT: .amdhsa_wavefront_size32 1 -// ASM-NEXT: .amdhsa_enable_private_segment ((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&1 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&128)>>7 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&256)>>8 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&512)>>9 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&1024)>>10 -// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&6144)>>11 +// ASM-NEXT: .amdhsa_enable_private_segment ((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&6144)>>11 // ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4 // ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean -// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12 -// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14 -// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16 -// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18 -// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26 -// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29 -// ASM-NEXT: .amdhsa_memory_ordered 
(((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30 -// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31 +// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&786432)>>18 +// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&67108864)>>26 +// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&536870912)>>29 +// ASM-NEXT: .amdhsa_memory_ordered 
(((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&1073741824)>>30 +// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&2147483648)>>31 // ASM-NEXT: .amdhsa_inst_pref_size (((defined_value+6)<<4)&4080)>>4 -// ASM-NEXT: .amdhsa_round_robin_scheduling (((((((((((((((((((((3758882816|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~2097152))|(defined_boolean<<21))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21 -// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&16777216)>>24 -// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&33554432)>>25 -// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&67108864)>>26 -// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&134217728)>>27 -// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&268435456)>>28 -// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&536870912)>>29 -// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~1))|defined_boolean)&(~62))&1073741824)>>30 +// ASM-NEXT: .amdhsa_round_robin_scheduling (((((((((((((((((((((3758882816|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~67108864)|(defined_boolean<<26))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~2147483648)|(defined_boolean<<31))&~2097152)|(defined_boolean<<21))&~63)|((alignto(max(defined_value+4, 1), 8)/8)-1))&~960)&2097152)>>21 +// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&16777216)>>24 +// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&33554432)>>25 +// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&67108864)>>26 +// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow 
(((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&134217728)>>27 +// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&268435456)>>28 +// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&536870912)>>29 +// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~1)|defined_boolean)&~62)&1073741824)>>30 // ASM-NEXT: .end_amdhsa_kernel // ASM: .set defined_value, 41 diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s index 485f48c695c4d..9fae37c38735f 100644 --- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s +++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx7.s @@ -106,29 +106,29 @@ expr_defined: // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0 -// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&128)>>7 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y 
(((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&256)>>8 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&512)>>9 -// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1024)>>10 -// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&6144)>>11 +// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&128)>>7 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&256)>>8 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z 
(((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&512)>>9 +// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1024)>>10 +// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&6144)>>11 // ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4 // ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean // ASM-NEXT: .amdhsa_reserve_flat_scratch defined_boolean -// ASM-NEXT: .amdhsa_float_round_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 0)), 1), 8))/8)-1)<<6))&12288)>>12 -// ASM-NEXT: .amdhsa_float_round_mode_16_64 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 0)), 1), 8))/8)-1)<<6))&49152)>>14 -// ASM-NEXT: .amdhsa_float_denorm_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 0)), 1), 8))/8)-1)<<6))&196608)>>16 -// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 0)), 1), 8))/8)-1)<<6))&786432)>>18 +// ASM-NEXT: .amdhsa_float_round_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 0), 1), 8)/8)-1)<<6))&12288)>>12 +// ASM-NEXT: .amdhsa_float_round_mode_16_64 
((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 0), 1), 8)/8)-1)<<6))&49152)>>14 +// ASM-NEXT: .amdhsa_float_denorm_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 0), 1), 8)/8)-1)<<6))&196608)>>16 +// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 ((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 0), 1), 8)/8)-1)<<6))&786432)>>18 // ASM-NEXT: .amdhsa_dx10_clamp 1 // ASM-NEXT: .amdhsa_ieee_mode 1 -// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&16777216)>>24 -// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&33554432)>>25 -// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&67108864)>>26 -// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&134217728)>>27 -// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow 
(((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&268435456)>>28
-// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&536870912)>>29
-// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1073741824)>>30
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1073741824)>>30
 // ASM-NEXT: .end_amdhsa_kernel
 // ASM: .set defined_value, 41
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s
index 0d2e066113ee8..4b6cb01c18d8f 100644
--- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx8.s
@@ -107,30 +107,30 @@ expr_defined:
 // ASM-NEXT: .amdhsa_user_sgpr_dispatch_id 0
 // ASM-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0
 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&128)>>7
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&256)>>8
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&512)>>9
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1024)>>10
-// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&6144)>>11
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&128)>>7
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&6144)>>11
 // ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4
 // ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5
 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean
 // ASM-NEXT: .amdhsa_reserve_flat_scratch defined_boolean
 // ASM-NEXT: .amdhsa_reserve_xnack_mask 1
-// ASM-NEXT: .amdhsa_float_round_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&12288)>>12
-// ASM-NEXT: .amdhsa_float_round_mode_16_64 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&49152)>>14
-// ASM-NEXT: .amdhsa_float_denorm_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&196608)>>16
-// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 ((((((((((((11272192|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~63))|(((alignto(max(defined_value+4, 1), 4))/4)-1))&(~960))|((((alignto(max((defined_value+5)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&786432)>>18
+// ASM-NEXT: .amdhsa_float_round_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 ((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 ((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 ((((((((((((11272192|(defined_2_bits<<12))&~49152)|(defined_2_bits<<14))&~196608)|(defined_2_bits<<16))&~786432)|(defined_2_bits<<18))&~63)|((alignto(max(defined_value+4, 1), 4)/4)-1))&~960)|(((alignto(max(defined_value+5+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&786432)>>18
 // ASM-NEXT: .amdhsa_dx10_clamp 1
 // ASM-NEXT: .amdhsa_ieee_mode 1
-// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&16777216)>>24
-// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&33554432)>>25
-// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&67108864)>>26
-// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&134217728)>>27
-// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&268435456)>>28
-// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&536870912)>>29
-// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&1073741824)>>30
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((((((((((((((((((((((((128|(defined_2_bits<<11))&~128)|(defined_boolean<<7))&~256)|(defined_boolean<<8))&~512)|(defined_boolean<<9))&~1024)|(defined_boolean<<10))&~16777216)|(defined_boolean<<24))&~33554432)|(defined_boolean<<25))&~67108864)|(defined_boolean<<26))&~134217728)|(defined_boolean<<27))&~268435456)|(defined_boolean<<28))&~536870912)|(defined_boolean<<29))&~1073741824)|(defined_boolean<<30))&~62)&1073741824)>>30
 // ASM-NEXT: .end_amdhsa_kernel
 // ASM: .set defined_value, 41
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s
index 88b5e23a6f2c5..4b750d4d0fcf6 100644
--- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx90a.s
@@ -80,33 +80,33 @@ expr_defined:
 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 0
 // ASM-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 0
 // ASM-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((128|defined_boolean)&(~62))&1
+// ASM-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset ((128|defined_boolean)&~62)&1
 // ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((128|defined_boolean)&(~62))&256)>>8
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((128|defined_boolean)&(~62))&512)>>9
-// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((128|defined_boolean)&(~62))&1024)>>10
-// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((128|defined_boolean)&(~62))&6144)>>11
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_y (((128|defined_boolean)&~62)&256)>>8
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_id_z (((128|defined_boolean)&~62)&512)>>9
+// ASM-NEXT: .amdhsa_system_sgpr_workgroup_info (((128|defined_boolean)&~62)&1024)>>10
+// ASM-NEXT: .amdhsa_system_vgpr_workitem_id (((128|defined_boolean)&~62)&6144)>>11
 // ASM-NEXT: .amdhsa_next_free_vgpr defined_boolean+1
 // ASM-NEXT: .amdhsa_next_free_sgpr defined_boolean+2
 // ASM-NEXT: .amdhsa_accum_offset 4
 // ASM-NEXT: .amdhsa_reserve_vcc defined_boolean
 // ASM-NEXT: .amdhsa_reserve_flat_scratch defined_boolean
 // ASM-NEXT: .amdhsa_reserve_xnack_mask 1
-// ASM-NEXT: .amdhsa_float_round_mode_32 ((((((((((9175040|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(((alignto(max(defined_boolean+1, 1), 8))/8)-1))&(~960))|((((alignto(max((defined_boolean+2)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&12288)>>12
-// ASM-NEXT: .amdhsa_float_round_mode_16_64 ((((((((((9175040|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(((alignto(max(defined_boolean+1, 1), 8))/8)-1))&(~960))|((((alignto(max((defined_boolean+2)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&49152)>>14
-// ASM-NEXT: .amdhsa_float_denorm_mode_32 ((((((((((9175040|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(((alignto(max(defined_boolean+1, 1), 8))/8)-1))&(~960))|((((alignto(max((defined_boolean+2)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&196608)>>16
+// ASM-NEXT: .amdhsa_float_round_mode_32 ((((((((((9175040|(defined_boolean<<21))&~8388608)|(defined_boolean<<23))&~67108864)|(defined_boolean<<26))&~63)|((alignto(max(defined_boolean+1, 1), 8)/8)-1))&~960)|(((alignto(max(defined_boolean+2+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 ((((((((((9175040|(defined_boolean<<21))&~8388608)|(defined_boolean<<23))&~67108864)|(defined_boolean<<26))&~63)|((alignto(max(defined_boolean+1, 1), 8)/8)-1))&~960)|(((alignto(max(defined_boolean+2+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 ((((((((((9175040|(defined_boolean<<21))&~8388608)|(defined_boolean<<23))&~67108864)|(defined_boolean<<26))&~63)|((alignto(max(defined_boolean+1, 1), 8)/8)-1))&~960)|(((alignto(max(defined_boolean+2+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&196608)>>16
 // ASM-NEXT: .amdhsa_float_denorm_mode_16_64 3
-// ASM-NEXT: .amdhsa_dx10_clamp ((((((((((9175040|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(((alignto(max(defined_boolean+1, 1), 8))/8)-1))&(~960))|((((alignto(max((defined_boolean+2)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&2097152)>>21
-// ASM-NEXT: .amdhsa_ieee_mode ((((((((((9175040|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(((alignto(max(defined_boolean+1, 1), 8))/8)-1))&(~960))|((((alignto(max((defined_boolean+2)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&8388608)>>23
-// ASM-NEXT: .amdhsa_fp16_overflow ((((((((((9175040|(defined_boolean<<21))&(~8388608))|(defined_boolean<<23))&(~67108864))|(defined_boolean<<26))&(~63))|(((alignto(max(defined_boolean+1, 1), 8))/8)-1))&(~960))|((((alignto(max((defined_boolean+2)+(extrasgprs(defined_boolean, defined_boolean, 1)), 1), 8))/8)-1)<<6))&67108864)>>26
-// ASM-NEXT: .amdhsa_tg_split (((defined_boolean<<16)&(~63))&65536)>>16
-// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((128|defined_boolean)&(~62))&16777216)>>24
-// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((128|defined_boolean)&(~62))&33554432)>>25
-// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((128|defined_boolean)&(~62))&67108864)>>26
-// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((128|defined_boolean)&(~62))&134217728)>>27
-// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((128|defined_boolean)&(~62))&268435456)>>28
-// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((128|defined_boolean)&(~62))&536870912)>>29
-// ASM-NEXT: .amdhsa_exception_int_div_zero (((128|defined_boolean)&(~62))&1073741824)>>30
+// ASM-NEXT: .amdhsa_dx10_clamp ((((((((((9175040|(defined_boolean<<21))&~8388608)|(defined_boolean<<23))&~67108864)|(defined_boolean<<26))&~63)|((alignto(max(defined_boolean+1, 1), 8)/8)-1))&~960)|(((alignto(max(defined_boolean+2+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode ((((((((((9175040|(defined_boolean<<21))&~8388608)|(defined_boolean<<23))&~67108864)|(defined_boolean<<26))&~63)|((alignto(max(defined_boolean+1, 1), 8)/8)-1))&~960)|(((alignto(max(defined_boolean+2+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&8388608)>>23
+// ASM-NEXT: .amdhsa_fp16_overflow ((((((((((9175040|(defined_boolean<<21))&~8388608)|(defined_boolean<<23))&~67108864)|(defined_boolean<<26))&~63)|((alignto(max(defined_boolean+1, 1), 8)/8)-1))&~960)|(((alignto(max(defined_boolean+2+extrasgprs(defined_boolean, defined_boolean, 1), 1), 8)/8)-1)<<6))&67108864)>>26
+// ASM-NEXT: .amdhsa_tg_split (((defined_boolean<<16)&~63)&65536)>>16
+// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((128|defined_boolean)&~62)&16777216)>>24
+// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((128|defined_boolean)&~62)&33554432)>>25
+// ASM-NEXT: .amdhsa_exception_fp_ieee_div_zero (((128|defined_boolean)&~62)&67108864)>>26
+// ASM-NEXT: .amdhsa_exception_fp_ieee_overflow (((128|defined_boolean)&~62)&134217728)>>27
+// ASM-NEXT: .amdhsa_exception_fp_ieee_underflow (((128|defined_boolean)&~62)&268435456)>>28
+// ASM-NEXT: .amdhsa_exception_fp_ieee_inexact (((128|defined_boolean)&~62)&536870912)>>29
+// ASM-NEXT: .amdhsa_exception_int_div_zero (((128|defined_boolean)&~62)&1073741824)>>30
 // ASM-NEXT: .end_amdhsa_kernel
 // ASM: .set defined_boolean, 1
diff --git a/llvm/test/MC/AMDGPU/mcexpr_amd.s b/llvm/test/MC/AMDGPU/mcexpr_amd.s
index a9639c3acc305..d7340bb5fd2ed 100644
--- a/llvm/test/MC/AMDGPU/mcexpr_amd.s
+++ b/llvm/test/MC/AMDGPU/mcexpr_amd.s
@@ -74,7 +74,7 @@
 .set max_neg_number, max(neg_one)
 // ASM: .set max_with_subexpr, 3
-// ASM: .set max_as_subexpr, 1+(max(4, 3, five))
+// ASM: .set max_as_subexpr, 1+max(4, 3, five)
 // ASM: .set max_recursive_subexpr, max(max(1, four), 3, max_expression_all)
 .set max_with_subexpr, max(((one | 3) << 3) / 8)
@@ -112,7 +112,7 @@
 .set or_with_or_sym, or(or, 4, 3, one, two)
 // ASM: .set or_with_subexpr, 3
-// ASM: .set or_as_subexpr, 1+(or(4, 3, five))
+// ASM: .set or_as_subexpr, 1+or(4, 3, five)
 // ASM: .set or_recursive_subexpr, or(or(1, four), 3, or_expression_all)
 .set or_with_subexpr, or(((one | 3) << 3) / 8)
diff --git a/llvm/test/MC/ARM/basic-arm-instructions.s b/llvm/test/MC/ARM/basic-arm-instructions.s
index 9f3a5cd4afa79..4c62e8f34c3cf 100644
--- a/llvm/test/MC/ARM/basic-arm-instructions.s
+++ b/llvm/test/MC/ARM/basic-arm-instructions.s
@@ -289,8 +289,8 @@ Lforward:
 @ CHECK: addseq r0, pc, #-1073741824 @ encoding: [0x03,0x01,0x9f,0x02]
 @ CHECK: Ltmp0:
 @ CHECK-NEXT: Ltmp1:
-@ CHECK-NEXT: adr r0, (Ltmp1+8)+(Lback-Ltmp0) @ encoding: [A,A,0x0f'A',0xe2'A']
-@ CHECK-NEXT: @ fixup A - offset: 0, value: (Ltmp1+8)+(Lback-Ltmp0), kind: fixup_arm_adr_pcrel_12
+@ CHECK-NEXT: adr r0, Ltmp1+8+(Lback-Ltmp0) @ encoding: [A,A,0x0f'A',0xe2'A']
+@ CHECK-NEXT: @ fixup A - offset: 0, value: Ltmp1+8+(Lback-Ltmp0), kind: fixup_arm_adr_pcrel_12
 @ Test right shift by 32, which is encoded as 0
 add r3, r1, r2, lsr #32
diff --git a/llvm/test/MC/ARM/elf-movt.s b/llvm/test/MC/ARM/elf-movt.s
index 72dad26d06664..3a9f162f7f842 100644
--- a/llvm/test/MC/ARM/elf-movt.s
+++ b/llvm/test/MC/ARM/elf-movt.s
@@ -26,8 +26,8 @@ bar:
 @ ASM-NEXT: movt r0, :upper16:(GOT-(.LPC0_2+8))
 @ ASM: movw r0, :lower16:(extern_symbol+1234)
 @ ASM-NEXT: movt r0, :upper16:(extern_symbol+1234)
-@ ASM: movw r0, :lower16:((foo-bar)+1234)
-@ ASM-NEXT: movt r0, :upper16:((foo-bar)+1234)
+@ ASM: movw r0, :lower16:(foo-bar+1234)
+@ ASM-NEXT: movt r0, :upper16:(foo-bar+1234)
 @OBJ: Disassembly of section .text:
 @OBJ-EMPTY:
diff --git a/llvm/test/MC/ARM/macho-word-reloc-thumb.s b/llvm/test/MC/ARM/macho-word-reloc-thumb.s
index bd98f6b33f974..a76684d35d83a 100644
--- a/llvm/test/MC/ARM/macho-word-reloc-thumb.s
+++ b/llvm/test/MC/ARM/macho-word-reloc-thumb.s
@@ -4,7 +4,7 @@
 @ ARM relocatable object files try to look like they're pre-linked, so the
 @ offsets in the instructions are a best-guess. I suspect the "-3" should b
-@ CHECK: movw r1, :lower16:((_bar-8)-3)
+@ CHECK: movw r1, :lower16:(_bar-8-3)
 @ [...]
 @ CHECK: .long {{[0-9]*[13579]}}
diff --git a/llvm/test/MC/AVR/inst-brbc.s b/llvm/test/MC/AVR/inst-brbc.s
index 4edbbfd858024..6aa6ed0863e0c 100644
--- a/llvm/test/MC/AVR/inst-brbc.s
+++ b/llvm/test/MC/AVR/inst-brbc.s
@@ -10,11 +10,11 @@ foo:
 .short 0xf74c
 .short 0xf4c7
-; CHECK: brvc (.Ltmp0+8)+2 ; encoding: [0bAAAAA011,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel
 ;
-; CHECK: brcc (.Ltmp1-16)+2 ; encoding: [0bAAAAA000,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-16)+2, kind: fixup_7_pcrel
+; CHECK: brvc .Ltmp0+8+2 ; encoding: [0bAAAAA011,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+8+2, kind: fixup_7_pcrel
 ;
+; CHECK: brcc .Ltmp1-16+2 ; encoding: [0bAAAAA000,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1-16+2, kind: fixup_7_pcrel
 ; INST-LABEL: <foo>:
 ; INST-NEXT: 23 f4 brvc .+8
diff --git a/llvm/test/MC/AVR/inst-brbs.s b/llvm/test/MC/AVR/inst-brbs.s
index 3f4b134aef682..abadd10a134f8 100644
--- a/llvm/test/MC/AVR/inst-brbs.s
+++ b/llvm/test/MC/AVR/inst-brbs.s
@@ -10,10 +10,10 @@ foo:
 .short 0xf34c
 .short 0xf077
-; CHECK: brvs (.Ltmp0+8)+2 ; encoding: [0bAAAAA011,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel
-; CHECK: brcs (.Ltmp1-12)+2 ; encoding: [0bAAAAA000,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-12)+2, kind: fixup_7_pcrel
+; CHECK: brvs .Ltmp0+8+2 ; encoding: [0bAAAAA011,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+8+2, kind: fixup_7_pcrel
+; CHECK: brcs .Ltmp1-12+2 ; encoding: [0bAAAAA000,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1-12+2, kind: fixup_7_pcrel
 ; INST-LABEL: <foo>:
 ; INST-NEXT: 23 f0 brvs .+8
diff --git a/llvm/test/MC/AVR/inst-brcc.s b/llvm/test/MC/AVR/inst-brcc.s
index dd1b2b11a6d30..90d7dc3eca1b9 100644
--- a/llvm/test/MC/AVR/inst-brcc.s
+++ b/llvm/test/MC/AVR/inst-brcc.s
@@ -12,12 +12,12 @@ foo:
 bar:
-; CHECK: brcc (.Ltmp0+66)+2 ; encoding: [0bAAAAA000,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+66)+2, kind: fixup_7_pcrel
-; CHECK: brcc (.Ltmp1-22)+2 ; encoding: [0bAAAAA000,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-22)+2, kind: fixup_7_pcrel
-; CHECK: brcc (.Ltmp2+66)+2 ; encoding: [0bAAAAA000,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+66)+2, kind: fixup_7_pcrel
+; CHECK: brcc .Ltmp0+66+2 ; encoding: [0bAAAAA000,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+66+2, kind: fixup_7_pcrel
+; CHECK: brcc .Ltmp1-22+2 ; encoding: [0bAAAAA000,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1-22+2, kind: fixup_7_pcrel
+; CHECK: brcc .Ltmp2+66+2 ; encoding: [0bAAAAA000,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp2+66+2, kind: fixup_7_pcrel
 ; CHECK: brcc bar ; encoding: [0bAAAAA000,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brcs.s b/llvm/test/MC/AVR/inst-brcs.s
index 3fafccdb49257..74cc9a1c97805 100644
--- a/llvm/test/MC/AVR/inst-brcs.s
+++ b/llvm/test/MC/AVR/inst-brcs.s
@@ -12,12 +12,12 @@ foo:
 bar:
-; CHECK: brcs (.Ltmp0+8)+2 ; encoding: [0bAAAAA000,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+8)+2, kind: fixup_7_pcrel
-; CHECK: brcs (.Ltmp1+4)+2 ; encoding: [0bAAAAA000,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+4)+2, kind: fixup_7_pcrel
-; CHECK: brcs (.Ltmp2+8)+2 ; encoding: [0bAAAAA000,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+8)+2, kind: fixup_7_pcrel
+; CHECK: brcs .Ltmp0+8+2 ; encoding: [0bAAAAA000,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+8+2, kind: fixup_7_pcrel
+; CHECK: brcs .Ltmp1+4+2 ; encoding: [0bAAAAA000,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+4+2, kind: fixup_7_pcrel
+; CHECK: brcs .Ltmp2+8+2 ; encoding: [0bAAAAA000,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp2+8+2, kind: fixup_7_pcrel
 ; CHECK: brcs bar ; encoding: [0bAAAAA000,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-breq.s b/llvm/test/MC/AVR/inst-breq.s
index 7a6eac6f01ad0..51bc6192e05ed 100644
--- a/llvm/test/MC/AVR/inst-breq.s
+++ b/llvm/test/MC/AVR/inst-breq.s
@@ -12,12 +12,12 @@ foo:
 bar:
-; CHECK: breq (.Ltmp0-18)+2 ; encoding: [0bAAAAA001,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-18)+2, kind: fixup_7_pcrel
-; CHECK: breq (.Ltmp1-12)+2 ; encoding: [0bAAAAA001,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-12)+2, kind: fixup_7_pcrel
-; CHECK: brbs 1, (.Ltmp2-18)+2 ; encoding: [0bAAAAA001,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2-18)+2, kind: fixup_7_pcrel
+; CHECK: breq .Ltmp0-18+2 ; encoding: [0bAAAAA001,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0-18+2, kind: fixup_7_pcrel
+; CHECK: breq .Ltmp1-12+2 ; encoding: [0bAAAAA001,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1-12+2, kind: fixup_7_pcrel
+; CHECK: brbs 1, .Ltmp2-18+2 ; encoding: [0bAAAAA001,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp2-18+2, kind: fixup_7_pcrel
 ; CHECK: brbs 1, bar ; encoding: [0bAAAAA001,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brge.s b/llvm/test/MC/AVR/inst-brge.s
index 6cf79db4dbd65..904f4a496e777 100644
--- a/llvm/test/MC/AVR/inst-brge.s
+++ b/llvm/test/MC/AVR/inst-brge.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brge (.Ltmp0+50)+2 ; encoding: [0bAAAAA100,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+50)+2, kind: fixup_7_pcrel
-; CHECK: brge (.Ltmp1+42)+2 ; encoding: [0bAAAAA100,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+42)+2, kind: fixup_7_pcrel
+; CHECK: brge .Ltmp0+50+2 ; encoding: [0bAAAAA100,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+50+2, kind: fixup_7_pcrel
+; CHECK: brge .Ltmp1+42+2 ; encoding: [0bAAAAA100,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+42+2, kind: fixup_7_pcrel
 ; CHECK: brge bar ; encoding: [0bAAAAA100,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brhc.s b/llvm/test/MC/AVR/inst-brhc.s
index 924895e4bf5df..77052e664d389 100644
--- a/llvm/test/MC/AVR/inst-brhc.s
+++ b/llvm/test/MC/AVR/inst-brhc.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brhc (.Ltmp0+12)+2 ; encoding: [0bAAAAA101,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+12)+2, kind: fixup_7_pcrel
-; CHECK: brhc (.Ltmp1+14)+2 ; encoding: [0bAAAAA101,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+14)+2, kind: fixup_7_pcrel
+; CHECK: brhc .Ltmp0+12+2 ; encoding: [0bAAAAA101,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+12+2, kind: fixup_7_pcrel
+; CHECK: brhc .Ltmp1+14+2 ; encoding: [0bAAAAA101,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+14+2, kind: fixup_7_pcrel
 ; CHECK: brhc bar ; encoding: [0bAAAAA101,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brhs.s b/llvm/test/MC/AVR/inst-brhs.s
index 9704ce5e7e5ac..b4c55cafd5de9 100644
--- a/llvm/test/MC/AVR/inst-brhs.s
+++ b/llvm/test/MC/AVR/inst-brhs.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brhs (.Ltmp0-66)+2 ; encoding: [0bAAAAA101,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-66)+2, kind: fixup_7_pcrel
-; CHECK: brhs (.Ltmp1+14)+2 ; encoding: [0bAAAAA101,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+14)+2, kind: fixup_7_pcrel
+; CHECK: brhs .Ltmp0-66+2 ; encoding: [0bAAAAA101,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0-66+2, kind: fixup_7_pcrel
+; CHECK: brhs .Ltmp1+14+2 ; encoding: [0bAAAAA101,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+14+2, kind: fixup_7_pcrel
 ; CHECK: brhs bar ; encoding: [0bAAAAA101,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brid.s b/llvm/test/MC/AVR/inst-brid.s
index e03c293677887..4cf1869dc1b2c 100644
--- a/llvm/test/MC/AVR/inst-brid.s
+++ b/llvm/test/MC/AVR/inst-brid.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brid (.Ltmp0+42)+2 ; encoding: [0bAAAAA111,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+42)+2, kind: fixup_7_pcrel
-; CHECK: brid (.Ltmp1+62)+2 ; encoding: [0bAAAAA111,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+62)+2, kind: fixup_7_pcrel
+; CHECK: brid .Ltmp0+42+2 ; encoding: [0bAAAAA111,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+42+2, kind: fixup_7_pcrel
+; CHECK: brid .Ltmp1+62+2 ; encoding: [0bAAAAA111,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+62+2, kind: fixup_7_pcrel
 ; CHECK: brid bar ; encoding: [0bAAAAA111,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brie.s b/llvm/test/MC/AVR/inst-brie.s
index 74b724b20bd9e..7c7e97c2b201e 100644
--- a/llvm/test/MC/AVR/inst-brie.s
+++ b/llvm/test/MC/AVR/inst-brie.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brie (.Ltmp0+20)+2 ; encoding: [0bAAAAA111,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+20)+2, kind: fixup_7_pcrel
-; CHECK: brie (.Ltmp1+40)+2 ; encoding: [0bAAAAA111,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+40)+2, kind: fixup_7_pcrel
+; CHECK: brie .Ltmp0+20+2 ; encoding: [0bAAAAA111,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+20+2, kind: fixup_7_pcrel
+; CHECK: brie .Ltmp1+40+2 ; encoding: [0bAAAAA111,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+40+2, kind: fixup_7_pcrel
 ; CHECK: brie bar ; encoding: [0bAAAAA111,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brlo.s b/llvm/test/MC/AVR/inst-brlo.s
index 2726d943e0e78..9523fd5695a99 100644
--- a/llvm/test/MC/AVR/inst-brlo.s
+++ b/llvm/test/MC/AVR/inst-brlo.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brlo (.Ltmp0+12)+2 ; encoding: [0bAAAAA000,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+12)+2, kind: fixup_7_pcrel
-; CHECK: brlo (.Ltmp1+28)+2 ; encoding: [0bAAAAA000,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+28)+2, kind: fixup_7_pcrel
+; CHECK: brlo .Ltmp0+12+2 ; encoding: [0bAAAAA000,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+12+2, kind: fixup_7_pcrel
+; CHECK: brlo .Ltmp1+28+2 ; encoding: [0bAAAAA000,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+28+2, kind: fixup_7_pcrel
 ; CHECK: brlo bar ; encoding: [0bAAAAA000,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brlt.s b/llvm/test/MC/AVR/inst-brlt.s
index 299a873963e5b..c309310909fa7 100644
--- a/llvm/test/MC/AVR/inst-brlt.s
+++ b/llvm/test/MC/AVR/inst-brlt.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brlt (.Ltmp0+16)+2 ; encoding: [0bAAAAA100,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+16)+2, kind: fixup_7_pcrel
-; CHECK: brlt (.Ltmp1+2)+2 ; encoding: [0bAAAAA100,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+2)+2, kind: fixup_7_pcrel
+; CHECK: brlt .Ltmp0+16+2 ; encoding: [0bAAAAA100,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+16+2, kind: fixup_7_pcrel
+; CHECK: brlt .Ltmp1+2+2 ; encoding: [0bAAAAA100,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+2+2, kind: fixup_7_pcrel
 ; CHECK: brlt bar ; encoding: [0bAAAAA100,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brmi.s b/llvm/test/MC/AVR/inst-brmi.s
index 96f7e484f465f..ec60bc4a14f1c 100644
--- a/llvm/test/MC/AVR/inst-brmi.s
+++ b/llvm/test/MC/AVR/inst-brmi.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brmi (.Ltmp0+66)+2 ; encoding: [0bAAAAA010,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+66)+2, kind: fixup_7_pcrel
-; CHECK: brmi (.Ltmp1+58)+2 ; encoding: [0bAAAAA010,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+58)+2, kind: fixup_7_pcrel
+; CHECK: brmi .Ltmp0+66+2 ; encoding: [0bAAAAA010,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+66+2, kind: fixup_7_pcrel
+; CHECK: brmi .Ltmp1+58+2 ; encoding: [0bAAAAA010,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+58+2, kind: fixup_7_pcrel
 ; CHECK: brmi bar ; encoding: [0bAAAAA010,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brne.s b/llvm/test/MC/AVR/inst-brne.s
index ab89d516681d3..2a424a3593247 100644
--- a/llvm/test/MC/AVR/inst-brne.s
+++ b/llvm/test/MC/AVR/inst-brne.s
@@ -12,12 +12,12 @@ foo:
 bar:
-; CHECK: brne (.Ltmp0+10)+2 ; encoding: [0bAAAAA001,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+10)+2, kind: fixup_7_pcrel
-; CHECK: brne (.Ltmp1+2)+2 ; encoding: [0bAAAAA001,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+2)+2, kind: fixup_7_pcrel
-; CHECK: brbc 1, (.Ltmp2+10)+2 ; encoding: [0bAAAAA001,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+10)+2, kind: fixup_7_pcrel
+; CHECK: brne .Ltmp0+10+2 ; encoding: [0bAAAAA001,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+10+2, kind: fixup_7_pcrel
+; CHECK: brne .Ltmp1+2+2 ; encoding: [0bAAAAA001,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+2+2, kind: fixup_7_pcrel
+; CHECK: brbc 1, .Ltmp2+10+2 ; encoding: [0bAAAAA001,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp2+10+2, kind: fixup_7_pcrel
 ; CHECK: brbc 1, bar ; encoding: [0bAAAAA001,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brpl.s b/llvm/test/MC/AVR/inst-brpl.s
index cd2f697ae8f20..d752f34ee606b 100644
--- a/llvm/test/MC/AVR/inst-brpl.s
+++ b/llvm/test/MC/AVR/inst-brpl.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brpl (.Ltmp0-12)+2 ; encoding: [0bAAAAA010,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-12)+2, kind: fixup_7_pcrel
-; CHECK: brpl (.Ltmp1+18)+2 ; encoding: [0bAAAAA010,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+18)+2, kind: fixup_7_pcrel
+; CHECK: brpl .Ltmp0-12+2 ; encoding: [0bAAAAA010,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0-12+2, kind: fixup_7_pcrel
+; CHECK: brpl .Ltmp1+18+2 ; encoding: [0bAAAAA010,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+18+2, kind: fixup_7_pcrel
 ; CHECK: brpl bar ; encoding: [0bAAAAA010,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brsh.s b/llvm/test/MC/AVR/inst-brsh.s
index b066c917f72ae..95a6a52acb60c 100644
--- a/llvm/test/MC/AVR/inst-brsh.s
+++ b/llvm/test/MC/AVR/inst-brsh.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brsh (.Ltmp0+32)+2 ; encoding: [0bAAAAA000,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+32)+2, kind: fixup_7_pcrel
-; CHECK: brsh (.Ltmp1+70)+2 ; encoding: [0bAAAAA000,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+70)+2, kind: fixup_7_pcrel
+; CHECK: brsh .Ltmp0+32+2 ; encoding: [0bAAAAA000,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+32+2, kind: fixup_7_pcrel
+; CHECK: brsh .Ltmp1+70+2 ; encoding: [0bAAAAA000,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+70+2, kind: fixup_7_pcrel
 ; CHECK: brsh bar ; encoding: [0bAAAAA000,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brtc.s b/llvm/test/MC/AVR/inst-brtc.s
index 64421df10baf5..d8704dc6f345d 100644
--- a/llvm/test/MC/AVR/inst-brtc.s
+++ b/llvm/test/MC/AVR/inst-brtc.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brtc (.Ltmp0+52)+2 ; encoding: [0bAAAAA110,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+52)+2, kind: fixup_7_pcrel
-; CHECK: brtc (.Ltmp1+50)+2 ; encoding: [0bAAAAA110,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+50)+2, kind: fixup_7_pcrel
+; CHECK: brtc .Ltmp0+52+2 ; encoding: [0bAAAAA110,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+52+2, kind: fixup_7_pcrel
+; CHECK: brtc .Ltmp1+50+2 ; encoding: [0bAAAAA110,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+50+2, kind: fixup_7_pcrel
 ; CHECK: brtc bar ; encoding: [0bAAAAA110,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brts.s b/llvm/test/MC/AVR/inst-brts.s
index bb02b6f3d475d..976f23ff8c208 100644
--- a/llvm/test/MC/AVR/inst-brts.s
+++ b/llvm/test/MC/AVR/inst-brts.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brts (.Ltmp0+18)+2 ; encoding: [0bAAAAA110,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+18)+2, kind: fixup_7_pcrel
-; CHECK: brts (.Ltmp1+22)+2 ; encoding: [0bAAAAA110,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+22)+2, kind: fixup_7_pcrel
+; CHECK: brts .Ltmp0+18+2 ; encoding: [0bAAAAA110,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+18+2, kind: fixup_7_pcrel
+; CHECK: brts .Ltmp1+22+2 ; encoding: [0bAAAAA110,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+22+2, kind: fixup_7_pcrel
 ; CHECK: brts bar ; encoding: [0bAAAAA110,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brvc.s b/llvm/test/MC/AVR/inst-brvc.s
index 52b9f3b9b403c..766146cc57aaf 100644
--- a/llvm/test/MC/AVR/inst-brvc.s
+++ b/llvm/test/MC/AVR/inst-brvc.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brvc (.Ltmp0-28)+2 ; encoding: [0bAAAAA011,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0-28)+2, kind: fixup_7_pcrel
-; CHECK: brvc (.Ltmp1-62)+2 ; encoding: [0bAAAAA011,0b111101AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-62)+2, kind: fixup_7_pcrel
+; CHECK: brvc .Ltmp0-28+2 ; encoding: [0bAAAAA011,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0-28+2, kind: fixup_7_pcrel
+; CHECK: brvc .Ltmp1-62+2 ; encoding: [0bAAAAA011,0b111101AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1-62+2, kind: fixup_7_pcrel
 ; CHECK: brvc bar ; encoding: [0bAAAAA011,0b111101AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-brvs.s b/llvm/test/MC/AVR/inst-brvs.s
index 10382a8e6fd67..9ebe9c3181ee9 100644
--- a/llvm/test/MC/AVR/inst-brvs.s
+++ b/llvm/test/MC/AVR/inst-brvs.s
@@ -11,10 +11,10 @@ foo:
 bar:
-; CHECK: brvs (.Ltmp0+18)+2 ; encoding: [0bAAAAA011,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+18)+2, kind: fixup_7_pcrel
-; CHECK: brvs (.Ltmp1+32)+2 ; encoding: [0bAAAAA011,0b111100AA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1+32)+2, kind: fixup_7_pcrel
+; CHECK: brvs .Ltmp0+18+2 ; encoding: [0bAAAAA011,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+18+2, kind: fixup_7_pcrel
+; CHECK: brvs .Ltmp1+32+2 ; encoding: [0bAAAAA011,0b111100AA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1+32+2, kind: fixup_7_pcrel
 ; CHECK: brvs bar ; encoding: [0bAAAAA011,0b111100AA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: bar, kind: fixup_7_pcrel
diff --git a/llvm/test/MC/AVR/inst-rcall.s b/llvm/test/MC/AVR/inst-rcall.s
index 34c2ef86366c5..d0a9e6b7b0463 100644
--- a/llvm/test/MC/AVR/inst-rcall.s
+++ b/llvm/test/MC/AVR/inst-rcall.s
@@ -11,14 +11,14 @@ foo:
 rcall .+46
 .short 0xdfea
-; CHECK: rcall (.Ltmp0+0)+2 ; encoding: [A,0b1101AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+0)+2, kind: fixup_13_pcrel
-; CHECK: rcall (.Ltmp1-8)+2 ; encoding: [A,0b1101AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-8)+2, kind: fixup_13_pcrel
-; CHECK: rcall (.Ltmp2+12)+2 ; encoding: [A,0b1101AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+12)+2, kind: fixup_13_pcrel
-; CHECK: rcall (.Ltmp3+46)+2 ; encoding: [A,0b1101AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp3+46)+2, kind: fixup_13_pcrel
+; CHECK: rcall .Ltmp0+0+2 ; encoding: [A,0b1101AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+0+2, kind: fixup_13_pcrel
+; CHECK: rcall .Ltmp1-8+2 ; encoding: [A,0b1101AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1-8+2, kind: fixup_13_pcrel
+; CHECK: rcall .Ltmp2+12+2 ; encoding: [A,0b1101AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp2+12+2, kind: fixup_13_pcrel
+; CHECK: rcall .Ltmp3+46+2 ; encoding: [A,0b1101AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp3+46+2, kind: fixup_13_pcrel
 ; INST-LABEL: <foo>:
 ; INST-NEXT: 00 d0 rcall .+0
diff --git a/llvm/test/MC/AVR/inst-rjmp.s b/llvm/test/MC/AVR/inst-rjmp.s
index cf2a9d106f3d1..8971ff7ddcd8a 100644
--- a/llvm/test/MC/AVR/inst-rjmp.s
+++ b/llvm/test/MC/AVR/inst-rjmp.s
@@ -21,26 +21,26 @@ x:
 .short 0xc00f
 rjmp .+4094
-; CHECK: rjmp (.Ltmp0+2)+2 ; encoding: [A,0b1100AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp0+2)+2, kind: fixup_13_pcrel
-; CHECK: rjmp (.Ltmp1-2)+2 ; encoding: [A,0b1100AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp1-2)+2, kind: fixup_13_pcrel
+; CHECK: rjmp .Ltmp0+2+2 ; encoding: [A,0b1100AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp0+2+2, kind: fixup_13_pcrel
+; CHECK: rjmp .Ltmp1-2+2 ; encoding: [A,0b1100AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp1-2+2, kind: fixup_13_pcrel
 ; CHECK: rjmp foo ; encoding: [A,0b1100AAAA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: foo, kind: fixup_13_pcrel
-; CHECK: rjmp (.Ltmp2+8)+2 ; encoding: [A,0b1100AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp2+8)+2, kind: fixup_13_pcrel
+; CHECK: rjmp .Ltmp2+8+2 ; encoding: [A,0b1100AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp2+8+2, kind: fixup_13_pcrel
 ; CHECK: rjmp end ; encoding: [A,0b1100AAAA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: end, kind: fixup_13_pcrel
-; CHECK: rjmp (.Ltmp3+0)+2 ; encoding: [A,0b1100AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp3+0)+2, kind: fixup_13_pcrel
-; CHECK: rjmp (.Ltmp4-4)+2 ; encoding: [A,0b1100AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp4-4)+2, kind: fixup_13_pcrel
-; CHECK: rjmp (.Ltmp5-6)+2 ; encoding: [A,0b1100AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp5-6)+2, kind: fixup_13_pcrel
+; CHECK: rjmp .Ltmp3+0+2 ; encoding: [A,0b1100AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp3+0+2, kind: fixup_13_pcrel
+; CHECK: rjmp .Ltmp4-4+2 ; encoding: [A,0b1100AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp4-4+2, kind: fixup_13_pcrel
+; CHECK: rjmp .Ltmp5-6+2 ; encoding: [A,0b1100AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp5-6+2, kind: fixup_13_pcrel
 ; CHECK: rjmp x ; encoding: [A,0b1100AAAA]
 ; CHECK-NEXT: ; fixup A - offset: 0, value: x, kind: fixup_13_pcrel
-; CHECK: rjmp (.Ltmp6+4094)+2 ; encoding: [A,0b1100AAAA]
-; CHECK-NEXT: ; fixup A - offset: 0, value: (.Ltmp6+4094)+2, kind: fixup_13_pcrel
+; CHECK: rjmp .Ltmp6+4094+2 ; encoding: [A,0b1100AAAA]
+; CHECK-NEXT: ; fixup A - offset: 0, value: .Ltmp6+4094+2, kind: fixup_13_pcrel
 ; INST-LABEL: <foo>:
 ; INST-NEXT: 01 c0 rjmp .+2
diff --git a/llvm/test/MC/AsmParser/directive_fill.s b/llvm/test/MC/AsmParser/directive_fill.s
index a34112542b053..fd0a6056c9456 100644
--- a/llvm/test/MC/AsmParser/directive_fill.s
+++ b/llvm/test/MC/AsmParser/directive_fill.s
@@ -72,7 +72,7 @@ TEST12:
 .fill TEST11 - TEST12, 4, 0x12345678
 # CHECK: TEST13
-# CHECK: .fill (TEST11-TEST12)+i, 4, 0x12345678
+# CHECK: .fill TEST11-TEST12+i, 4, 0x12345678
 # OBJ-ERRS: [[@LINE+2]]:8: error: expected assembly-time absolute expression
 TEST13:
 .fill TEST11 - TEST12+i, 4, 0x12345678
diff --git a/llvm/test/MC/AsmParser/expr_symbol_modifiers.s b/llvm/test/MC/AsmParser/expr_symbol_modifiers.s
index 7371c97cbdf4c..e465cf83d5ebe 100644
--- a/llvm/test/MC/AsmParser/expr_symbol_modifiers.s
+++ b/llvm/test/MC/AsmParser/expr_symbol_modifiers.s
@@ -10,5 +10,5 @@
 .long a + 4@GOTPCREL
 // CHECK: .long a@GOTPCREL+b@GOTPCREL
 .long (a + b)@GOTPCREL
-// CHECK: .long (10+b@GOTPCREL)+4
+// CHECK: .long 10+b@GOTPCREL+4
 .long 10 + b + 4@GOTPCREL
diff --git a/llvm/test/MC/COFF/cross-section-relative.ll b/llvm/test/MC/COFF/cross-section-relative.ll
index 1b11a4659fee0..0f27aacd4915c 100644
--- a/llvm/test/MC/COFF/cross-section-relative.ll
+++ b/llvm/test/MC/COFF/cross-section-relative.ll
@@ -11,11 +11,11 @@
 ;;;; cross-section relative relocations
-; CHECK: .quad (g3-t1)+4
+; CHECK: .quad g3-t1+4
 @t1 = global i64 add(i64 sub(i64 ptrtoint(ptr @g3 to i64), i64 ptrtoint(ptr @t1 to i64)), i64 4), section ".fix"
 ; CHECK: .quad g3-t2
 @t2 = global i64 sub(i64 ptrtoint(ptr @g3 to i64), i64 ptrtoint(ptr @t2 to i64)), section ".fix"
-; CHECK: .quad (g3-t3)-4
+; CHECK: .quad g3-t3-4
 @t3 = global i64 sub(i64 sub(i64 ptrtoint(ptr @g3 to i64), i64 ptrtoint(ptr @t3 to i64)), i64 4), section ".fix"
 ; CHECK: .long g3-t4
 @t4 = global i32 trunc(i64 sub(i64 ptrtoint(ptr @g3 to i64), i64 ptrtoint(ptr @t4 to i64)) to i32), section ".fix"
@@ -32,7 +32,7 @@
 %struct.EEType = type { [2 x i8], i64, i32}
-; CHECK: .long (g3-t7)-16
+; CHECK: .long g3-t7-16
 @t7 = global %struct.EEType {
 [2 x i8] c"\01\02",
 i64 256,
diff --git a/llvm/test/MC/ELF/reloc-directive.s b/llvm/test/MC/ELF/reloc-directive.s
index a4658f938d0d3..f4121ef071810 100644
--- a/llvm/test/MC/ELF/reloc-directive.s
+++ b/llvm/test/MC/ELF/reloc-directive.s
@@ -4,12 +4,12 @@
 # RUN: llvm-readobj -r %t | FileCheck %s
 # ASM: .Ltmp0:
-# ASM-NEXT: .reloc (.Ltmp0+3)-2, R_X86_64_NONE, foo
+# ASM-NEXT: .reloc .Ltmp0+3-2, R_X86_64_NONE, foo
 # ASM-NEXT: .Ltmp1:
 # ASM-NEXT: .reloc .Ltmp1-1, R_X86_64_NONE, foo
 # ASM-NEXT: .Ltmp2:
 # ASM-NEXT: .reloc 2+.Ltmp2, R_X86_64_NONE, foo
-# ASM-NEXT: .reloc (1+foo)+3, R_X86_64_NONE, data+1
+# ASM-NEXT: .reloc 1+foo+3, R_X86_64_NONE, data+1
 # ASM-NEXT: .Ltmp3:
 # ASM-NEXT: .reloc .Ltmp3, BFD_RELOC_NONE, unused
diff --git a/llvm/test/MC/Lanai/memory.s b/llvm/test/MC/Lanai/memory.s
index 398cb8e123711..41dc8fba7bf29 100644
--- a/llvm/test/MC/Lanai/memory.s
+++ b/llvm/test/MC/Lanai/memory.s
@@ -239,9 +239,9 @@
 mov hi(l+4), %r7
 ! CHECK: encoding: [0x03,0x81,A,A]
-! CHECK-NEXT: fixup A - offset: 0, value: (hi(l))+4, kind: FIXUP_LANAI_HI16{{$}}
+! CHECK-NEXT: fixup A - offset: 0, value: hi(l)+4, kind: FIXUP_LANAI_HI16{{$}}
 ! CHECK-NEXT:
 ! CHECK-NEXT:
-! CHECK-NEXT:
+! CHECK-NEXT:
diff --git a/llvm/test/MC/MachO/AArch64/cstexpr-gotpcrel.ll b/llvm/test/MC/MachO/AArch64/cstexpr-gotpcrel.ll
index 3681ed5351839..53f43e68ac794 100644
--- a/llvm/test/MC/MachO/AArch64/cstexpr-gotpcrel.ll
+++ b/llvm/test/MC/MachO/AArch64/cstexpr-gotpcrel.ll
@@ -49,7 +49,7 @@
 ; supported on x86-64 but not on ARM64
 ; CHECK: .long 5
-; CHECK-NEXT: .long ((l_extgotequiv-_table)-44)+24
+; CHECK-NEXT: .long l_extgotequiv-_table-44+24
 %struct.data { i32 4, %struct.anon { i32 5,
 i32 add (i32 trunc (i64 sub (i64 ptrtoint (ptr @extgotequiv to i64),
 i64 ptrtoint (ptr getelementptr inbounds ([4 x %struct.data], ptr @table, i32 0, i64 3, i32 1, i32 1) to i64))
@@ -67,7 +67,7 @@ to i32)
 ; CHECK-LABEL: _deltaplus:
-; CHECK: .long (l_localgotequiv-_deltaplus)+55
+; CHECK: .long l_localgotequiv-_deltaplus+55
 @deltaplus = global i32 add (i32 trunc (i64 sub (i64 ptrtoint (ptr @localgotequiv to i64),
 i64 ptrtoint (ptr @deltaplus to i64))
 to i32), i32 55)
diff --git a/llvm/test/MC/Mips/expr1.s b/llvm/test/MC/Mips/expr1.s
index 7293fc11b23bd..f707091bed7bd 100644
--- a/llvm/test/MC/Mips/expr1.s
+++ b/llvm/test/MC/Mips/expr1.s
@@ -9,11 +9,11 @@
 # 32R2-EL: lw $4, %lo(foo)($4) # encoding: [A,A,0x84,0x8c]
 # 32R2-EL: # fixup A - offset: 0, value: %lo(foo), kind: fixup_Mips_LO16
 # 32R2-EL: lw $4, 56($4) # encoding: [0x38,0x00,0x84,0x8c]
-# 32R2-EL: lui $1, %hi(foo+(%lo(8))) # encoding: [A,A,0x01,0x3c]
-# 32R2-EL: # fixup A - offset: 0, value: %hi(foo+(%lo(8))), kind: fixup_Mips_HI16
 # 32R2-EL: addu $1, $1, $4 # encoding: [0x21,0x08,0x24,0x00]
-# 32R2-EL: lw $4, %lo(foo+(%lo(8)))($1) # encoding: [A,A,0x24,0x8c]
-# 32R2-EL: # fixup A - offset: 0, value: %lo(foo+(%lo(8))), kind: fixup_Mips_LO16
+# 32R2-EL: lui $1, %hi(foo+%lo(8)) # encoding: [A,A,0x01,0x3c]
+# 32R2-EL: # fixup A - offset: 0, value: %hi(foo+%lo(8)), kind: fixup_Mips_HI16
 # 32R2-EL: addu $1, $1, $4 # encoding: [0x21,0x08,0x24,0x00]
+# 32R2-EL: lw $4, %lo(foo+%lo(8))($1) # encoding: [A,A,0x24,0x8c]
+# 32R2-EL: # fixup A - offset: 0, value: %lo(foo+%lo(8)), kind: fixup_Mips_LO16
 # 32R2-EL: lw $4, %lo(12+foo)($4) # encoding: [A,A,0x84,0x8c]
 # 32R2-EL: # fixup A - offset: 0, value: %lo(12+foo), kind: fixup_Mips_LO16
 # 32R2-EL: lw $4, 10($4) # encoding: [0x0a,0x00,0x84,0x8c]
@@ -27,11 +27,11 @@
 # MM-32R2-EL: lw $4, %lo(foo)($4) # encoding: [0x84'A',0xfc'A',0x00,0x00]
 # MM-32R2-EL: # fixup A - offset: 0, value: %lo(foo), kind: fixup_MICROMIPS_LO16
 # MM-32R2-EL: lw $4, 56($4) # encoding: [0x84,0xfc,0x38,0x00]
-# MM-32R2-EL: lui $1, %hi(foo+(%lo(8))) # encoding: [0xa1'A',0x41'A',0x00,0x00]
-# MM-32R2-EL: # fixup A - offset: 0, value: %hi(foo+(%lo(8))), kind: fixup_MICROMIPS_HI16
 # MM-32R2-EL: addu $1, $1, $4 # encoding: [0x81,0x00,0x50,0x09]
-# MM-32R2-EL: lw $4, %lo(foo+(%lo(8)))($1) # encoding: [0x81'A',0xfc'A',0x00,0x00]
-# MM-32R2-EL: # fixup A - offset: 0, value: %lo(foo+(%lo(8))), kind: fixup_MICROMIPS_LO16
+# MM-32R2-EL: lui $1, %hi(foo+%lo(8)) # encoding: [0xa1'A',0x41'A',0x00,0x00]
+# MM-32R2-EL: # fixup A - offset: 0, value: %hi(foo+%lo(8)), kind: fixup_MICROMIPS_HI16
 # MM-32R2-EL: addu $1, $1, $4 # encoding: [0x81,0x00,0x50,0x09]
+# MM-32R2-EL: lw $4, %lo(foo+%lo(8))($1) # encoding: [0x81'A',0xfc'A',0x00,0x00]
+# MM-32R2-EL: # fixup A - offset: 0, value: %lo(foo+%lo(8)), kind: fixup_MICROMIPS_LO16
 # MM-32R2-EL: lw $4, %lo(12+foo)($4) # encoding: [0x84'A',0xfc'A',0x00,0x00]
 # MM-32R2-EL: # fixup A - offset: 0, value: %lo(12+foo), kind: fixup_MICROMIPS_LO16
 # MM-32R2-EL: lw $4, 10($4) # encoding: [0x84,0xfc,0x0a,0x00]
diff --git a/llvm/test/MC/Mips/memory-offsets.s b/llvm/test/MC/Mips/memory-offsets.s
index 895d8c6ba5fcf..7f5a59152eee6 100644
--- a/llvm/test/MC/Mips/memory-offsets.s
+++ b/llvm/test/MC/Mips/memory-offsets.s
@@ -25,9 +25,9 @@ __start:
 lw $31, (8 * 4) % (8 * 31)($29) # CHECK: lw $ra, 32($sp) # encoding: [0x8f,0xbf,0x00,0x20]
 lw $31, (8 * 4) % (8)($29) # CHECK: lw $ra, 0($sp) # encoding: [0x8f,0xbf,0x00,0x00]
 lw $31, (8 * 4) + (8 * 31) ($29) # CHECK: lw $ra, 280($sp) # encoding: [0x8f,0xbf,0x01,0x18]
- lw $31, (8*4) + (8*31) + (8*32 + __start) ($29) # CHECK: lui $ra, %hi((248+((8*32)+__start))+32) # encoding: [0x3c,0x1f,A,A]
- # CHECK: # fixup A - offset: 0, value: %hi((248+((8*32)+__start))+32), kind: fixup_Mips_HI16
+ lw $31, (8*4) + (8*31) + (8*32 + __start) ($29) # CHECK: lui $ra, %hi(248+((8*32)+__start)+32) # encoding: [0x3c,0x1f,A,A]
+ # CHECK: # fixup A - offset: 0, value: %hi(248+((8*32)+__start)+32), kind: fixup_Mips_HI16
 # CHECK: addu $ra, $ra, $sp # encoding: [0x03,0xfd,0xf8,0x21]
- # CHECK: lw $ra, %lo((248+((8*32)+__start))+32)($ra) # encoding: [0x8f,0xff,A,A]
- # CHECK: # fixup A - offset: 0, value: %lo((248+((8*32)+__start))+32), kind: fixup_Mips_LO16
+ # CHECK: lw $ra, %lo(248+((8*32)+__start)+32)($ra) # encoding: [0x8f,0xff,A,A]
+ # CHECK: # fixup A - offset: 0, value: %lo(248+((8*32)+__start)+32), kind: fixup_Mips_LO16
 .end __start
diff --git a/llvm/test/MC/PowerPC/ppc32-tls.s b/llvm/test/MC/PowerPC/ppc32-tls.s
index 3acf40e57f7f3..7413b43166aee 100644
--- a/llvm/test/MC/PowerPC/ppc32-tls.s
+++ b/llvm/test/MC/PowerPC/ppc32-tls.s
@@ -16,7 +16,7 @@
 bl __tls_get_addr(d@tlsld)@PLT+32768
 bl __tls_get_addr+32768(e@tlsld)@plt # gcc -fPIC
 ## These are not present in the wild, but just to test we can parse them.
-# ASM: bl __tls_get_addr(f@tlsld)@PLT+1+(-2)
+# ASM: bl __tls_get_addr(f@tlsld)@PLT+1+-2
 bl __tls_get_addr+1(f@tlsld)@PLT+-2
 # ASM: bl __tls_get_addr(g@tlsld)@PLT+1+(y-x)
 x:
diff --git a/llvm/test/MC/RISCV/rvi-pseudos.s b/llvm/test/MC/RISCV/rvi-pseudos.s
index a79381a82521d..6c5307ca418ee 100644
--- a/llvm/test/MC/RISCV/rvi-pseudos.s
+++ b/llvm/test/MC/RISCV/rvi-pseudos.s
@@ -220,7 +220,7 @@ sw a3, zero, a4
 ## Check that a complex expression can be simplified and matched.
 # CHECK: .Lpcrel_hi36:
-# CHECK: auipc a5, %pcrel_hi((255+a_symbol)-4)
+# CHECK: auipc a5, %pcrel_hi(255+a_symbol-4)
 # CHECK: addi a5, a5, %pcrel_lo(.Lpcrel_hi36)
 lla a5, (0xFF + a_symbol) - 4
diff --git a/llvm/test/MC/SystemZ/insn-good-z196.s b/llvm/test/MC/SystemZ/insn-good-z196.s
index d2a7724d3a9a2..5f9812d900de7 100644
--- a/llvm/test/MC/SystemZ/insn-good-z196.s
+++ b/llvm/test/MC/SystemZ/insn-good-z196.s
@@ -245,16 +245,16 @@
 axtra %f8, %f8, %f8, 8
 #CHECK: brcth %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xcc,0x06,A,A,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL
 brcth %r0, -0x100000000
 #CHECK: brcth %r0, .[[LAB:L.*]]-2 # encoding: [0xcc,0x06,A,A,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL
 brcth %r0, -2
 #CHECK: brcth %r0, .[[LAB:L.*]] # encoding: [0xcc,0x06,A,A,A,A]
 #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL
 brcth %r0, 0
 #CHECK: brcth %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xcc,0x06,A,A,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL
 brcth %r0, 0xfffffffe
 #CHECK: brcth %r0, foo # encoding: [0xcc,0x06,A,A,A,A]
@@ -266,9 +266,9 @@
 brcth %r15,foo
 #CHECK: brcth %r3, bar+100 # encoding: [0xcc,0x36,A,A,A,A]
-# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL
+# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL
 #CHECK: brcth %r4, bar+100 # encoding: [0xcc,0x46,A,A,A,A]
-# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL
+# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL
 brcth %r3,bar+100
 brcth %r4,bar+100
diff --git a/llvm/test/MC/SystemZ/insn-good-zEC12.s b/llvm/test/MC/SystemZ/insn-good-zEC12.s
index a564491c6c36f..18577786e06f9 100644
--- a/llvm/test/MC/SystemZ/insn-good-zEC12.s
+++ b/llvm/test/MC/SystemZ/insn-good-zEC12.s
@@ -3,16 +3,16 @@
 # RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch10 -show-encoding %s | FileCheck %s
 #CHECK: bpp 0, .[[LAB:L.*]]-65536, 0 # encoding: [0xc7,0x00,0x00,0x00,A,A]
-#CHECK: fixup A - offset: 4, value: (.[[LAB]]-65536)+4, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 4, value: .[[LAB]]-65536+4, kind: FK_390_PC16DBL
 bpp 0, -0x10000, 0
 #CHECK: bpp 0, .[[LAB:L.*]]-2, 0 # encoding: [0xc7,0x00,0x00,0x00,A,A]
-#CHECK: fixup A - offset: 4, value: (.[[LAB]]-2)+4, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 4, value: .[[LAB]]-2+4, kind: FK_390_PC16DBL
 bpp 0, -2, 0
 #CHECK: bpp 0, .[[LAB:L.*]], 0 # encoding: [0xc7,0x00,0x00,0x00,A,A]
 #CHECK: fixup A - offset: 4, value: .[[LAB]]+4, kind: FK_390_PC16DBL
 bpp 0, 0, 0
 #CHECK: bpp 0, .[[LAB:L.*]]+65534, 0 # encoding: [0xc7,0x00,0x00,0x00,A,A]
-#CHECK: fixup A - offset: 4, value: (.[[LAB]]+65534)+4, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 4, value: .[[LAB]]+65534+4, kind: FK_390_PC16DBL
 bpp 0, 0xfffe, 0
 #CHECK: bpp 0, foo, 4095(%r3) # encoding: [0xc7,0x00,0x3f,0xff,A,A]
@@ -24,9 +24,9 @@
 bpp 15, foo, 1(%r11)
 #CHECK: bpp 3, bar+100, 4095 # encoding: [0xc7,0x30,0x0f,0xff,A,A]
-#CHECK: fixup A - offset: 4, value: (bar+100)+4, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 4, value: bar+100+4, kind: FK_390_PC16DBL
 #CHECK: bpp 4, bar+100, 1 # encoding: [0xc7,0x40,0x00,0x01,A,A]
-#CHECK: fixup A - offset: 4, value: (bar+100)+4, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 4, value: bar+100+4, kind: FK_390_PC16DBL
 bpp 3, bar+100, 4095
 bpp 4, bar+100, 1
@@ -40,11 +40,11 @@
 bpp 8, frob@PLT, 0
 #CHECK: bprp 0, .[[LABA:L.*]]-4096, .[[LABB:L.*]] # encoding: [0xc5,0b0000AAAA,A,B,B,B]
-#CHECK: fixup A - offset: 1, value: (.[[LABA]]-4096)+1, kind: FK_390_PC12DBL
+#CHECK: fixup A - offset: 1, value: .[[LABA]]-4096+1, kind: FK_390_PC12DBL
 #CHECK: fixup B - offset: 3, value: .[[LABB]]+3, kind: FK_390_PC24DBL
 bprp 0, -0x1000, 0
 #CHECK: bprp 0, .[[LABA:L.*]]-2, .[[LABB:L.*]] # encoding: [0xc5,0b0000AAAA,A,B,B,B]
-#CHECK: fixup A - offset: 1, value: (.[[LABA]]-2)+1, kind: FK_390_PC12DBL
+#CHECK: fixup A - offset: 1, value: .[[LABA]]-2+1, kind: FK_390_PC12DBL
 #CHECK: fixup B - offset: 3, value: .[[LABB]]+3, kind: FK_390_PC24DBL
 bprp 0, -2, 0
 #CHECK: bprp 0, .[[LABA:L.*]], .[[LABB:L.*]] # encoding: [0xc5,0b0000AAAA,A,B,B,B]
@@ -52,16 +52,16 @@
 #CHECK: fixup B - offset: 3, value: .[[LABB]]+3, kind: FK_390_PC24DBL
 bprp 0, 0, 0
 #CHECK: bprp 0, .[[LABA:L.*]]+4094, .[[LABB:L.*]] # encoding: [0xc5,0b0000AAAA,A,B,B,B]
-#CHECK: fixup A - offset: 1, value: (.[[LABA]]+4094)+1, kind: FK_390_PC12DBL
+#CHECK: fixup A - offset: 1, value: .[[LABA]]+4094+1, kind: FK_390_PC12DBL
 #CHECK: fixup B - offset: 3, value: .[[LABB]]+3, kind: FK_390_PC24DBL
 bprp 0, 0xffe, 0
 #CHECK: bprp 15, .[[LABA:L.*]], .[[LABB:L.*]]-16777216 # encoding: [0xc5,0b1111AAAA,A,B,B,B]
 #CHECK: fixup A - offset: 1, value: .[[LABA]]+1, kind: FK_390_PC12DBL
-#CHECK: fixup B - offset: 3, value: (.[[LABB]]-16777216)+3, kind: FK_390_PC24DBL
+#CHECK: fixup B - offset: 3, value: .[[LABB]]-16777216+3, kind: FK_390_PC24DBL
 bprp 15, 0, -0x1000000
 #CHECK: bprp 15, .[[LABA:L.*]], .[[LABB:L.*]]-2 # encoding: [0xc5,0b1111AAAA,A,B,B,B]
 #CHECK: fixup A - offset: 1, value: .[[LABA]]+1, kind: FK_390_PC12DBL
-#CHECK: fixup B - offset: 3, value: (.[[LABB]]-2)+3, kind: FK_390_PC24DBL
+#CHECK: fixup B - offset: 3, value: .[[LABB]]-2+3, kind: FK_390_PC24DBL
 bprp 15, 0, -2
 #CHECK: bprp 15, .[[LABA:L.*]], .[[LABB:L.*]] # encoding: [0xc5,0b1111AAAA,A,B,B,B]
 #CHECK: fixup A - offset: 1, value: .[[LABA]]+1, kind: FK_390_PC12DBL
@@ -69,7 +69,7 @@
 bprp 15, 0, 0
 #CHECK: bprp 15, .[[LABA:L.*]], .[[LABB:L.*]]+16777214 # encoding: [0xc5,0b1111AAAA,A,B,B,B]
 #CHECK: fixup A - offset: 1, value: .[[LABA]]+1, kind: FK_390_PC12DBL
-#CHECK: fixup B - offset: 3, value: (.[[LABB]]+16777214)+3, kind: FK_390_PC24DBL
+#CHECK: fixup B - offset: 3, value: .[[LABB]]+16777214+3, kind: FK_390_PC24DBL
 bprp 15, 0, 0xfffffe
 #CHECK: bprp 1, branch, target # encoding: [0xc5,0b0001AAAA,A,B,B,B]
@@ -87,14 +87,14 @@
 bprp 3, branch, target
 #CHECK: bprp 4, branch+100, target # encoding: [0xc5,0b0100AAAA,A,B,B,B]
-#CHECK: fixup A - offset: 1, value: (branch+100)+1, kind: FK_390_PC12DBL
+#CHECK: fixup A - offset: 1, value: branch+100+1, kind: FK_390_PC12DBL
 #CHECK: fixup B - offset: 3, value: target+3, kind: FK_390_PC24DBL
 #CHECK: bprp 5, branch, target+100 # encoding: [0xc5,0b0101AAAA,A,B,B,B]
 #CHECK: fixup A - offset: 1, value: branch+1, kind: FK_390_PC12DBL
-#CHECK: fixup B - offset: 3, value: (target+100)+3, kind: FK_390_PC24DBL
+#CHECK: fixup B - offset: 3, value: target+100+3, kind: FK_390_PC24DBL
 #CHECK: bprp 6, branch+100, target+100 # encoding: [0xc5,0b0110AAAA,A,B,B,B]
-#CHECK: fixup A - offset: 1, value: (branch+100)+1, kind: FK_390_PC12DBL
-#CHECK: fixup B - offset: 3, value: (target+100)+3, kind: FK_390_PC24DBL
+#CHECK: fixup A - offset: 1, value: branch+100+1, kind: FK_390_PC12DBL
+#CHECK: fixup B - offset: 3, value: target+100+3, kind: FK_390_PC24DBL
 bprp 4, branch+100, target
 bprp 5, branch, target+100
diff --git a/llvm/test/MC/SystemZ/insn-good.s b/llvm/test/MC/SystemZ/insn-good.s
index 4567c029c572e..bd0db54e27ad1 100644
--- a/llvm/test/MC/SystemZ/insn-good.s
+++ b/llvm/test/MC/SystemZ/insn-good.s
@@ -1224,16 +1224,16 @@
 bzr %r5
 #CHECK: bras %r0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL
 #CHECK: bras %r0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL
 bras %r0, -0x10000
 jas %r0, -0x10000
 #CHECK: bras %r0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL
 #CHECK: bras %r0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL
 bras %r0, -2
 jas %r0, -2
@@ -1245,9 +1245,9 @@
 jas %r0, 0
 #CHECK: bras %r0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL
 #CHECK: bras %r0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL
 bras %r0, 0xfffe
 jas %r0, 0xfffe
@@ -1271,17 +1271,17 @@
 jas %r15,foo
 #CHECK: bras %r0, bar+100 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL
 #CHECK: bras %r14, bar+100 # encoding: [0xa7,0xe5,A,A]
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL
 #CHECK: bras %r15, bar+100 # encoding: [0xa7,0xf5,A,A]
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL
 #CHECK: bras %r0, bar+100 # encoding: [0xa7,0x05,A,A]
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL
 #CHECK: bras %r14, bar+100 # encoding: [0xa7,0xe5,A,A]
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL
 #CHECK: bras %r15, bar+100 # encoding: [0xa7,0xf5,A,A]
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL
+#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL
 bras %r0,bar+100
 bras %r14,bar+100
 bras %r15,bar+100
@@ -1309,21 +1309,21 @@
 jas %r15,bar@PLT
 #CHECK: brasl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x05,A,A,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL
 #CHECK: brasl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x05,A,A,A,A]
-#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
+#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL
 brasl %r0, -0x100000000
 jasl %r0, -0x100000000
 #CHECK: brasl %r0, .[[LAB:L.*]]-4294967296 # encoding:
[0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL #CHECK: brasl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL brasl %r0, .-0x100000000 jasl %r0, .-0x100000000 #CHECK: brasl %r0, .[[LAB:L.*]]-2 # encoding: [0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL #CHECK: brasl %r0, .[[LAB:L.*]]-2 # encoding: [0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL brasl %r0, -2 jasl %r0, -2 #CHECK: brasl %r0, .[[LAB:L.*]] # encoding: [0xc0,0x05,A,A,A,A] @@ -1333,9 +1333,9 @@ brasl %r0, 0 jasl %r0, 0 #CHECK: brasl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL #CHECK: brasl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL brasl %r0, 0xfffffffe jasl %r0, 0xfffffffe @@ -1359,17 +1359,17 @@ jasl %r15,foo #CHECK: brasl %r0, bar+100 # encoding: [0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: brasl %r14, bar+100 # encoding: [0xc0,0xe5,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: brasl %r15, bar+100 # encoding: [0xc0,0xf5,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: brasl %r0, bar+100 # encoding: [0xc0,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: brasl %r14, bar+100 # encoding: [0xc0,0xe5,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: brasl %r15, bar+100 # encoding: [0xc0,0xf5,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL brasl %r0,bar+100 brasl %r14,bar+100 brasl %r15,bar+100 @@ -1397,16 +1397,16 @@ jasl %r15,bar@PLT #CHECK: brc 0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL #CHECK: jnop .[[LAB:L.*]]-65536 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL brc 0, -0x10000 jnop -0x10000 #CHECK: brc 0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL 
#CHECK: jnop .[[LAB:L.*]]-2 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL brc 0, -2 jnop -2 @@ -1418,9 +1418,9 @@ jnop 0 #CHECK: brc 0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL #CHECK: jnop .[[LAB:L.*]]+65534 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL brc 0, 0xfffe jnop 0xfffe @@ -1622,114 +1622,114 @@ bru foo #CHECK: brc 0, bar+100 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jnop bar+100 # encoding: [0xa7,0x04,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL brc 0, bar+100 jnop bar+100 #CHECK: jo bar+100 # encoding: [0xa7,0x14,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jo bar+100 # encoding: [0xa7,0x14,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jo bar+100 bro bar+100 #CHECK: jh bar+100 # encoding: [0xa7,0x24,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jh bar+100 # encoding: [0xa7,0x24,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jh bar+100 brh bar+100 #CHECK: jnle bar+100 # encoding: [0xa7,0x34,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jnle bar+100 # encoding: [0xa7,0x34,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jnle bar+100 brnle bar+100 #CHECK: jl bar+100 # encoding: [0xa7,0x44,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jl bar+100 # encoding: [0xa7,0x44,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jl bar+100 brl bar+100 #CHECK: jnhe bar+100 # encoding: [0xa7,0x54,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jnhe bar+100 # encoding: [0xa7,0x54,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jnhe bar+100 brnhe bar+100 #CHECK: jlh bar+100 # encoding: [0xa7,0x64,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jlh bar+100 # encoding: [0xa7,0x64,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jlh bar+100 brlh bar+100 
#CHECK: jne bar+100 # encoding: [0xa7,0x74,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jne bar+100 # encoding: [0xa7,0x74,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jne bar+100 brne bar+100 #CHECK: je bar+100 # encoding: [0xa7,0x84,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: je bar+100 # encoding: [0xa7,0x84,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL je bar+100 bre bar+100 #CHECK: jnlh bar+100 # encoding: [0xa7,0x94,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jnlh bar+100 # encoding: [0xa7,0x94,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jnlh bar+100 brnlh bar+100 #CHECK: jhe bar+100 # encoding: [0xa7,0xa4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jhe bar+100 # encoding: [0xa7,0xa4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jhe bar+100 brhe bar+100 #CHECK: jnl bar+100 # encoding: [0xa7,0xb4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jnl bar+100 # encoding: [0xa7,0xb4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jnl bar+100 brnl bar+100 #CHECK: jle bar+100 # encoding: [0xa7,0xc4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jle bar+100 # encoding: [0xa7,0xc4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jle bar+100 brle bar+100 #CHECK: jnh bar+100 # encoding: [0xa7,0xd4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jnh bar+100 # encoding: [0xa7,0xd4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jnh bar+100 brnh bar+100 #CHECK: jno bar+100 # encoding: [0xa7,0xe4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: jno bar+100 # encoding: [0xa7,0xe4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL jno bar+100 brno bar+100 #CHECK: j bar+100 # encoding: [0xa7,0xf4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: j bar+100 # encoding: [0xa7,0xf4,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: 
bar+100+2, kind: FK_390_PC16DBL j bar+100 bru bar+100 @@ -1846,15 +1846,15 @@ bru bar@PLT #CHECK: brcl 0, .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL #CHECK: jgnop .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL brcl 0, -0x100000000 jgnop -0x100000000 #CHECK: brcl 0, .[[LAB:L.*]]-2 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL #CHECK: jgnop .[[LAB:L.*]]-2 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL brcl 0, -2 jgnop -2 #CHECK: brcl 0, .[[LAB:L.*]] # encoding: [0xc0,0x04,A,A,A,A] @@ -1864,9 +1864,9 @@ brcl 0, 0 jgnop 0 #CHECK: brcl 0, .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL #CHECK: jgnop .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL brcl 0, 0xfffffffe jgnop 0xfffffffe @@ -2064,114 +2064,114 @@ brul foo #CHECK: brcl 0, bar+100 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgnop bar+100 # encoding: [0xc0,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL brcl 0, bar+100 jgnop bar+100 #CHECK: jgo bar+100 # encoding: [0xc0,0x14,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgo bar+100 # encoding: [0xc0,0x14,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgo bar+100 brol bar+100 #CHECK: jgh bar+100 # encoding: [0xc0,0x24,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgh bar+100 # encoding: [0xc0,0x24,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgh bar+100 brhl bar+100 #CHECK: jgnle bar+100 # encoding: [0xc0,0x34,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgnle bar+100 # encoding: [0xc0,0x34,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgnle bar+100 brnlel bar+100 #CHECK: jgl bar+100 # encoding: [0xc0,0x44,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgl bar+100 # 
encoding: [0xc0,0x44,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgl bar+100 brll bar+100 #CHECK: jgnhe bar+100 # encoding: [0xc0,0x54,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgnhe bar+100 # encoding: [0xc0,0x54,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgnhe bar+100 brnhel bar+100 #CHECK: jglh bar+100 # encoding: [0xc0,0x64,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jglh bar+100 # encoding: [0xc0,0x64,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jglh bar+100 brlhl bar+100 #CHECK: jgne bar+100 # encoding: [0xc0,0x74,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgne bar+100 # encoding: [0xc0,0x74,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgne bar+100 brnel bar+100 #CHECK: jge bar+100 # encoding: [0xc0,0x84,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jge bar+100 # encoding: [0xc0,0x84,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jge bar+100 brel bar+100 #CHECK: jgnlh bar+100 # encoding: [0xc0,0x94,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgnlh bar+100 # encoding: [0xc0,0x94,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgnlh bar+100 brnlhl bar+100 #CHECK: jghe bar+100 # encoding: [0xc0,0xa4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jghe bar+100 # encoding: [0xc0,0xa4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jghe bar+100 brhel bar+100 #CHECK: jgnl bar+100 # encoding: [0xc0,0xb4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgnl bar+100 # encoding: [0xc0,0xb4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgnl bar+100 brnll bar+100 #CHECK: jgle bar+100 # encoding: [0xc0,0xc4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgle bar+100 # encoding: [0xc0,0xc4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgle bar+100 brlel bar+100 #CHECK: jgnh bar+100 # encoding: 
[0xc0,0xd4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgnh bar+100 # encoding: [0xc0,0xd4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgnh bar+100 brnhl bar+100 #CHECK: jgno bar+100 # encoding: [0xc0,0xe4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jgno bar+100 # encoding: [0xc0,0xe4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jgno bar+100 brnol bar+100 #CHECK: jg bar+100 # encoding: [0xc0,0xf4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: jg bar+100 # encoding: [0xc0,0xf4,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL jg bar+100 brul bar+100 @@ -2346,15 +2346,15 @@ bctgr %r15,%r9 #CHECK: brct %r0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x06,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL #CHECK: brct %r0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x06,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL brct %r0, -0x10000 jct %r0, -0x10000 #CHECK: brct %r0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x06,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL #CHECK: brct %r0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x06,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL brct %r0, -2 jct %r0, -2 #CHECK: brct %r0, .[[LAB:L.*]] # encoding: [0xa7,0x06,A,A] @@ -2364,9 +2364,9 @@ brct %r0, 0 jct %r0, 0 #CHECK: brct %r0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x06,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL #CHECK: brct %r0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x06,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL brct %r0, 0xfffe jct %r0, 0xfffe #CHECK: brct %r15, .[[LAB:L.*]] # encoding: [0xa7,0xf6,A,A] @@ -2377,15 +2377,15 @@ jct %r15, 0 #CHECK: brctg %r0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x07,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL #CHECK: brctg %r0, .[[LAB:L.*]]-65536 # encoding: [0xa7,0x07,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL brctg %r0, -0x10000 jctg %r0, -0x10000 #CHECK: brctg %r0, .[[LAB:L.*]]-2 # encoding: [0xa7,0x07,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL #CHECK: brctg %r0, .[[LAB:L.*]]-2 # 
encoding: [0xa7,0x07,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL brctg %r0, -2 jctg %r0, -2 #CHECK: brctg %r0, .[[LAB:L.*]] # encoding: [0xa7,0x07,A,A] @@ -2395,9 +2395,9 @@ brctg %r0, 0 jctg %r0, 0 #CHECK: brctg %r0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x07,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL #CHECK: brctg %r0, .[[LAB:L.*]]+65534 # encoding: [0xa7,0x07,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL brctg %r0, 0xfffe jctg %r0, 0xfffe #CHECK: brctg %r15, .[[LAB:L.*]] # encoding: [0xa7,0xf7,A,A] @@ -2460,15 +2460,15 @@ bxhg %r0,%r0,524287(%r15) #CHECK: brxh %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL #CHECK: brxh %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL brxh %r0,%r2, -0x10000 jxh %r0,%r2, -0x10000 #CHECK: brxh %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL #CHECK: brxh %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL brxh %r0, %r2, -2 jxh %r0, %r2, -2 #CHECK: brxh %r0, %r2, .[[LAB:L.*]] # encoding: [0x84,0x02,A,A] @@ -2478,9 +2478,9 @@ brxh %r0,%r2, 0 jxh %r0,%r2, 0 #CHECK: brxh %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL #CHECK: brxh %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL brxh %r0,%r2, 0xfffe jxh %r0,%r2, 0xfffe @@ -2504,17 +2504,17 @@ jxh %r15,%r2,foo #CHECK: brxh %r0, %r2, bar+100 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxh %r14, %r2, bar+100 # encoding: [0x84,0xe2,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxh %r15, %r2, bar+100 # encoding: [0x84,0xf2,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxh %r0, %r2, bar+100 # encoding: [0x84,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxh %r14, %r2, bar+100 # encoding: [0x84,0xe2,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxh %r15, %r2, bar+100 # encoding: [0x84,0xf2,A,A] 
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL brxh %r0,%r2,bar+100 brxh %r14,%r2,bar+100 brxh %r15,%r2,bar+100 @@ -2542,15 +2542,15 @@ jxh %r15,%r2,bar@PLT #CHECK: brxhg %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL #CHECK: brxhg %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL brxhg %r0,%r2, -0x10000 jxhg %r0,%r2, -0x10000 #CHECK: brxhg %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL #CHECK: brxhg %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL brxhg %r0, %r2, -2 jxhg %r0, %r2, -2 #CHECK: brxhg %r0, %r2, .[[LAB:L.*]] # encoding: [0xec,0x02,A,A,0x00,0x44] @@ -2560,9 +2560,9 @@ brxhg %r0,%r2, 0 jxhg %r0,%r2, 0 #CHECK: brxhg %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL #CHECK: brxhg %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL brxhg %r0,%r2, 0xfffe jxhg %r0,%r2, 0xfffe @@ -2586,17 +2586,17 @@ jxhg %r15,%r2,foo #CHECK: brxhg %r0, %r2, bar+100 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxhg %r14, %r2, bar+100 # encoding: [0xec,0xe2,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxhg %r15, %r2, bar+100 # encoding: [0xec,0xf2,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxhg %r0, %r2, bar+100 # encoding: [0xec,0x02,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxhg %r14, %r2, bar+100 # encoding: [0xec,0xe2,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxhg %r15, %r2, bar+100 # encoding: [0xec,0xf2,A,A,0x00,0x44] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL brxhg %r0,%r2,bar+100 brxhg %r14,%r2,bar+100 brxhg %r15,%r2,bar+100 @@ -2675,15 +2675,15 @@ bxleg %r0,%r0,524287(%r15) #CHECK: brxle %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL #CHECK: 
brxle %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL brxle %r0,%r2, -0x10000 jxle %r0,%r2, -0x10000 #CHECK: brxle %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL #CHECK: brxle %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL brxle %r0, %r2, -2 jxle %r0, %r2, -2 #CHECK: brxle %r0, %r2, .[[LAB:L.*]] # encoding: [0x85,0x02,A,A] @@ -2693,9 +2693,9 @@ brxle %r0,%r2, 0 jxle %r0,%r2, 0 #CHECK: brxle %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL #CHECK: brxle %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL brxle %r0,%r2, 0xfffe jxle %r0,%r2, 0xfffe @@ -2719,17 +2719,17 @@ jxle %r15,%r2,foo #CHECK: brxle %r0, %r2, bar+100 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxle %r14, %r2, bar+100 # encoding: [0x85,0xe2,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxle %r15, %r2, bar+100 # encoding: [0x85,0xf2,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxle %r0, %r2, bar+100 # encoding: [0x85,0x02,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxle %r14, %r2, bar+100 # encoding: [0x85,0xe2,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxle %r15, %r2, bar+100 # encoding: [0x85,0xf2,A,A] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL brxle %r0,%r2,bar+100 brxle %r14,%r2,bar+100 brxle %r15,%r2,bar+100 @@ -2757,15 +2757,15 @@ jxle %r15,%r2,bar@PLT #CHECK: brxlg %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL #CHECK: brxlg %r0, %r2, .[[LAB:L.*]]-65536 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL brxlg %r0,%r2, -0x10000 jxleg %r0,%r2, -0x10000 #CHECK: brxlg %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL #CHECK: brxlg %r0, %r2, .[[LAB:L.*]]-2 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: 
(.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL brxlg %r0, %r2, -2 jxleg %r0, %r2, -2 #CHECK: brxlg %r0, %r2, .[[LAB:L.*]] # encoding: [0xec,0x02,A,A,0x00,0x45] @@ -2775,9 +2775,9 @@ brxlg %r0,%r2, 0 jxleg %r0,%r2, 0 #CHECK: brxlg %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL #CHECK: brxlg %r0, %r2, .[[LAB:L.*]]+65534 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL brxlg %r0,%r2, 0xfffe jxleg %r0,%r2, 0xfffe @@ -2801,17 +2801,17 @@ jxleg %r15,%r2,foo #CHECK: brxlg %r0, %r2, bar+100 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxlg %r14, %r2, bar+100 # encoding: [0xec,0xe2,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxlg %r15, %r2, bar+100 # encoding: [0xec,0xf2,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxlg %r0, %r2, bar+100 # encoding: [0xec,0x02,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxlg %r14, %r2, bar+100 # encoding: [0xec,0xe2,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL #CHECK: brxlg %r15, %r2, bar+100 # encoding: [0xec,0xf2,A,A,0x00,0x45] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL brxlg %r0,%r2,bar+100 brxlg %r14,%r2,bar+100 brxlg %r15,%r2,bar+100 @@ -3415,16 +3415,16 @@ cgfr %r7,%r8 #CHECK: cgfrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x0c,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL cgfrl %r0, -0x100000000 #CHECK: cgfrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x0c,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL cgfrl %r0, -2 #CHECK: cgfrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x0c,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL cgfrl %r0, 0 #CHECK: cgfrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x0c,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL cgfrl %r0, 0xfffffffe #CHECK: cgfrl %r0, foo # encoding: [0xc6,0x0c,A,A,A,A] @@ -3436,9 +3436,9 @@ cgfrl %r15,foo #CHECK: cgfrl %r3, bar+100 # encoding: [0xc6,0x3c,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: cgfrl %r4, bar+100 # encoding: [0xc6,0x4c,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: 
FK_390_PC32DBL cgfrl %r3,bar+100 cgfrl %r4,bar+100 @@ -3490,16 +3490,16 @@ cghi %r0, foo #CHECK: cghrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL cghrl %r0, -0x100000000 #CHECK: cghrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL cghrl %r0, -2 #CHECK: cghrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL cghrl %r0, 0 #CHECK: cghrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL cghrl %r0, 0xfffffffe #CHECK: cghrl %r0, foo # encoding: [0xc6,0x04,A,A,A,A] @@ -3511,9 +3511,9 @@ cghrl %r15,foo #CHECK: cghrl %r3, bar+100 # encoding: [0xc6,0x34,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: cghrl %r4, bar+100 # encoding: [0xc6,0x44,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL cghrl %r3,bar+100 cghrl %r4,bar+100 @@ -3647,16 +3647,16 @@ cgij %r7, -1, 0, 0 #CHECK: cgij %r1, -66, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x10,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL cgij %r1, -66, 0, -0x10000 #CHECK: cgij %r1, -66, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x10,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL cgij %r1, -66, 0, -2 #CHECK: cgij %r1, -66, 0, .[[LAB:L.*]] # encoding: [0xec,0x10,A,A,0xbe,0x7c] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL cgij %r1, -66, 0, 0 #CHECK: cgij %r1, -66, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x10,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL cgij %r1, -66, 0, 0xfffe #CHECK: cgij %r1, -66, 0, foo # encoding: [0xec,0x10,A,A,0xbe,0x7c] @@ -3760,55 +3760,55 @@ cgij %r1, -66, 15, foo #CHECK: cgij %r1, -66, 0, bar+100 # encoding: [0xec,0x10,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgij %r1, -66, 0, bar+100 #CHECK: cgijh %r1, -66, bar+100 # encoding: [0xec,0x12,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijh %r1, -66, bar+100 #CHECK: cgijnle %r1, -66, bar+100 # encoding: [0xec,0x12,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijnle %r1, -66, bar+100 #CHECK: cgijl %r1, -66, bar+100 # encoding: [0xec,0x14,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijl %r1, -66, bar+100 #CHECK: cgijnhe %r1, -66, 
bar+100 # encoding: [0xec,0x14,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijnhe %r1, -66, bar+100 #CHECK: cgijlh %r1, -66, bar+100 # encoding: [0xec,0x16,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijlh %r1, -66, bar+100 #CHECK: cgijne %r1, -66, bar+100 # encoding: [0xec,0x16,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijne %r1, -66, bar+100 #CHECK: cgije %r1, -66, bar+100 # encoding: [0xec,0x18,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgije %r1, -66, bar+100 #CHECK: cgijnlh %r1, -66, bar+100 # encoding: [0xec,0x18,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijnlh %r1, -66, bar+100 #CHECK: cgijhe %r1, -66, bar+100 # encoding: [0xec,0x1a,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijhe %r1, -66, bar+100 #CHECK: cgijnl %r1, -66, bar+100 # encoding: [0xec,0x1a,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijnl %r1, -66, bar+100 #CHECK: cgijle %r1, -66, bar+100 # encoding: [0xec,0x1c,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijle %r1, -66, bar+100 #CHECK: cgijnh %r1, -66, bar+100 # encoding: [0xec,0x1c,A,A,0xbe,0x7c] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgijnh %r1, -66, bar+100 #CHECK: cgij %r1, -66, 0, bar@PLT # encoding: [0xec,0x10,A,A,0xbe,0x7c] @@ -3997,16 +3997,16 @@ cgrj %r7,%r8,0,0 #CHECK: cgrj %r1, %r2, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x12,A,A,0x00,0x64] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL cgrj %r1, %r2, 0, -0x10000 #CHECK: cgrj %r1, %r2, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x12,A,A,0x00,0x64] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL cgrj %r1, %r2, 0, -2 #CHECK: cgrj %r1, %r2, 0, .[[LAB:L.*]] # encoding: [0xec,0x12,A,A,0x00,0x64] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL cgrj %r1, %r2, 0, 0 #CHECK: cgrj %r1, %r2, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x12,A,A,0x00,0x64] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL cgrj %r1, %r2, 0, 0xfffe #CHECK: cgrj %r1, %r2, 0, foo # encoding: [0xec,0x12,A,A,0x00,0x64] @@ -4110,55 +4110,55 @@ cgrj %r1, %r2, 15, foo #CHECK: cgrj %r1, %r2, 0, bar+100 # encoding: [0xec,0x12,A,A,0x00,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrj %r1, %r2, 0, bar+100 #CHECK: cgrjh %r1, %r2, bar+100 # encoding: 
[0xec,0x12,A,A,0x20,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjh %r1, %r2, bar+100 #CHECK: cgrjnle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x20,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjnle %r1, %r2, bar+100 #CHECK: cgrjl %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjl %r1, %r2, bar+100 #CHECK: cgrjnhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjnhe %r1, %r2, bar+100 #CHECK: cgrjlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjlh %r1, %r2, bar+100 #CHECK: cgrjne %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjne %r1, %r2, bar+100 #CHECK: cgrje %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrje %r1, %r2, bar+100 #CHECK: cgrjnlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjnlh %r1, %r2, bar+100 #CHECK: cgrjhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjhe %r1, %r2, bar+100 #CHECK: cgrjnl %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjnl %r1, %r2, bar+100 #CHECK: cgrjle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjle %r1, %r2, bar+100 #CHECK: cgrjnh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x64] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cgrjnh %r1, %r2, bar+100 #CHECK: cgrj %r1, %r2, 0, bar@PLT # encoding: [0xec,0x12,A,A,0x00,0x64] @@ -4214,16 +4214,16 @@ cgrjnh %r1, %r2, bar@PLT #CHECK: cgrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x08,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL cgrl %r0, -0x100000000 #CHECK: cgrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x08,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL cgrl %r0, -2 #CHECK: cgrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x08,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL cgrl %r0, 0 #CHECK: cgrl 
%r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x08,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL cgrl %r0, 0xfffffffe #CHECK: cgrl %r0, foo # encoding: [0xc6,0x08,A,A,A,A] @@ -4235,9 +4235,9 @@ cgrl %r15,foo #CHECK: cgrl %r3, bar+100 # encoding: [0xc6,0x38,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: cgrl %r4, bar+100 # encoding: [0xc6,0x48,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL cgrl %r3,bar+100 cgrl %r4,bar+100 @@ -4377,16 +4377,16 @@ chi %r0, foo #CHECK: chrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL chrl %r0, -0x100000000 #CHECK: chrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL chrl %r0, -2 #CHECK: chrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x05,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL chrl %r0, 0 #CHECK: chrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL chrl %r0, 0xfffffffe #CHECK: chrl %r0, foo # encoding: [0xc6,0x05,A,A,A,A] @@ -4398,9 +4398,9 @@ chrl %r15,foo #CHECK: chrl %r3, bar+100 # encoding: [0xc6,0x35,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: chrl %r4, bar+100 # encoding: [0xc6,0x45,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL chrl %r3,bar+100 chrl %r4,bar+100 @@ -4556,16 +4556,16 @@ cij %r7, -1, 0, 0 #CHECK: cij %r1, -66, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x10,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL cij %r1, -66, 0, -0x10000 #CHECK: cij %r1, -66, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x10,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL cij %r1, -66, 0, -2 #CHECK: cij %r1, -66, 0, .[[LAB:L.*]] # encoding: [0xec,0x10,A,A,0xbe,0x7e] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL cij %r1, -66, 0, 0 #CHECK: cij %r1, -66, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x10,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL cij %r1, -66, 0, 0xfffe #CHECK: cij %r1, -66, 0, foo # encoding: [0xec,0x10,A,A,0xbe,0x7e] @@ -4669,55 +4669,55 @@ cij %r1, -66, 15, foo #CHECK: cij %r1, -66, 0, bar+100 # encoding: [0xec,0x10,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cij %r1, -66, 0, bar+100 #CHECK: cijh %r1, -66, bar+100 # encoding: [0xec,0x12,A,A,0xbe,0x7e] 
-#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijh %r1, -66, bar+100 #CHECK: cijnle %r1, -66, bar+100 # encoding: [0xec,0x12,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijnle %r1, -66, bar+100 #CHECK: cijl %r1, -66, bar+100 # encoding: [0xec,0x14,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijl %r1, -66, bar+100 #CHECK: cijnhe %r1, -66, bar+100 # encoding: [0xec,0x14,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijnhe %r1, -66, bar+100 #CHECK: cijlh %r1, -66, bar+100 # encoding: [0xec,0x16,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijlh %r1, -66, bar+100 #CHECK: cijne %r1, -66, bar+100 # encoding: [0xec,0x16,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijne %r1, -66, bar+100 #CHECK: cije %r1, -66, bar+100 # encoding: [0xec,0x18,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cije %r1, -66, bar+100 #CHECK: cijnlh %r1, -66, bar+100 # encoding: [0xec,0x18,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijnlh %r1, -66, bar+100 #CHECK: cijhe %r1, -66, bar+100 # encoding: [0xec,0x1a,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijhe %r1, -66, bar+100 #CHECK: cijnl %r1, -66, bar+100 # encoding: [0xec,0x1a,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijnl %r1, -66, bar+100 #CHECK: cijle %r1, -66, bar+100 # encoding: [0xec,0x1c,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijle %r1, -66, bar+100 #CHECK: cijnh %r1, -66, bar+100 # encoding: [0xec,0x1c,A,A,0xbe,0x7e] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL cijnh %r1, -66, bar+100 #CHECK: cij %r1, -66, 0, bar@PLT # encoding: [0xec,0x10,A,A,0xbe,0x7e] @@ -5025,16 +5025,16 @@ clgfr %r7,%r8 #CHECK: clgfrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x0e,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL clgfrl %r0, -0x100000000 #CHECK: clgfrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x0e,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL clgfrl %r0, -2 #CHECK: clgfrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x0e,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL clgfrl %r0, 0 #CHECK: clgfrl %r0, .[[LAB:L.*]]+4294967294 # encoding: 
[0xc6,0x0e,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL clgfrl %r0, 0xfffffffe #CHECK: clgfrl %r0, foo # encoding: [0xc6,0x0e,A,A,A,A] @@ -5046,9 +5046,9 @@ clgfrl %r15,foo #CHECK: clgfrl %r3, bar+100 # encoding: [0xc6,0x3e,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: clgfrl %r4, bar+100 # encoding: [0xc6,0x4e,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL clgfrl %r3,bar+100 clgfrl %r4,bar+100 @@ -5062,16 +5062,16 @@ clgfrl %r8,frob@PLT #CHECK: clghrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x06,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL clghrl %r0, -0x100000000 #CHECK: clghrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x06,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL clghrl %r0, -2 #CHECK: clghrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x06,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL clghrl %r0, 0 #CHECK: clghrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x06,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL clghrl %r0, 0xfffffffe #CHECK: clghrl %r0, foo # encoding: [0xc6,0x06,A,A,A,A] @@ -5083,9 +5083,9 @@ clghrl %r15,foo #CHECK: clghrl %r3, bar+100 # encoding: [0xc6,0x36,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: clghrl %r4, bar+100 # encoding: [0xc6,0x46,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL clghrl %r3,bar+100 clghrl %r4,bar+100 @@ -5205,16 +5205,16 @@ clgij %r15, 0, 0, 0 #CHECK: clgij %r1, 193, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x10,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL clgij %r1, 193, 0, -0x10000 #CHECK: clgij %r1, 193, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x10,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL clgij %r1, 193, 0, -2 #CHECK: clgij %r1, 193, 0, .[[LAB:L.*]] # encoding: [0xec,0x10,A,A,0xc1,0x7d] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL clgij %r1, 193, 0, 0 #CHECK: clgij %r1, 193, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x10,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL clgij %r1, 193, 0, 0xfffe #CHECK: clgij %r1, 193, 0, foo # encoding: [0xec,0x10,A,A,0xc1,0x7d] @@ -5318,55 +5318,55 @@ clgij %r1, 193, 15, foo #CHECK: clgij %r1, 193, 0, bar+100 # encoding: [0xec,0x10,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgij %r1, 193, 0, bar+100 #CHECK: clgijh %r1, 193, bar+100 # 
encoding: [0xec,0x12,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijh %r1, 193, bar+100 #CHECK: clgijnle %r1, 193, bar+100 # encoding: [0xec,0x12,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijnle %r1, 193, bar+100 #CHECK: clgijl %r1, 193, bar+100 # encoding: [0xec,0x14,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijl %r1, 193, bar+100 #CHECK: clgijnhe %r1, 193, bar+100 # encoding: [0xec,0x14,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijnhe %r1, 193, bar+100 #CHECK: clgijlh %r1, 193, bar+100 # encoding: [0xec,0x16,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijlh %r1, 193, bar+100 #CHECK: clgijne %r1, 193, bar+100 # encoding: [0xec,0x16,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijne %r1, 193, bar+100 #CHECK: clgije %r1, 193, bar+100 # encoding: [0xec,0x18,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgije %r1, 193, bar+100 #CHECK: clgijnlh %r1, 193, bar+100 # encoding: [0xec,0x18,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijnlh %r1, 193, bar+100 #CHECK: clgijhe %r1, 193, bar+100 # encoding: [0xec,0x1a,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijhe %r1, 193, bar+100 #CHECK: clgijnl %r1, 193, bar+100 # encoding: [0xec,0x1a,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijnl %r1, 193, bar+100 #CHECK: clgijle %r1, 193, bar+100 # encoding: [0xec,0x1c,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijle %r1, 193, bar+100 #CHECK: clgijnh %r1, 193, bar+100 # encoding: [0xec,0x1c,A,A,0xc1,0x7d] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgijnh %r1, 193, bar+100 #CHECK: clgij %r1, 193, 0, bar@PLT # encoding: [0xec,0x10,A,A,0xc1,0x7d] @@ -5553,16 +5553,16 @@ clgrj %r7,%r8,0,0 #CHECK: clgrj %r1, %r2, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x12,A,A,0x00,0x65] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL clgrj %r1, %r2, 0, -0x10000 #CHECK: clgrj %r1, %r2, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x12,A,A,0x00,0x65] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL clgrj %r1, %r2, 0, -2 #CHECK: clgrj %r1, %r2, 0, .[[LAB:L.*]] # encoding: [0xec,0x12,A,A,0x00,0x65] #CHECK: fixup A - offset: 
2, value: .[[LAB]]+2, kind: FK_390_PC16DBL clgrj %r1, %r2, 0, 0 #CHECK: clgrj %r1, %r2, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x12,A,A,0x00,0x65] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL clgrj %r1, %r2, 0, 0xfffe #CHECK: clgrj %r1, %r2, 0, foo # encoding: [0xec,0x12,A,A,0x00,0x65] @@ -5666,55 +5666,55 @@ clgrj %r1, %r2, 15, foo #CHECK: clgrj %r1, %r2, 0, bar+100 # encoding: [0xec,0x12,A,A,0x00,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrj %r1, %r2, 0, bar+100 #CHECK: clgrjh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x20,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjh %r1, %r2, bar+100 #CHECK: clgrjnle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x20,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjnle %r1, %r2, bar+100 #CHECK: clgrjl %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjl %r1, %r2, bar+100 #CHECK: clgrjnhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjnhe %r1, %r2, bar+100 #CHECK: clgrjlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjlh %r1, %r2, bar+100 #CHECK: clgrjne %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjne %r1, %r2, bar+100 #CHECK: clgrje %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrje %r1, %r2, bar+100 #CHECK: clgrjnlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjnlh %r1, %r2, bar+100 #CHECK: clgrjhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjhe %r1, %r2, bar+100 #CHECK: clgrjnl %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjnl %r1, %r2, bar+100 #CHECK: clgrjle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjle %r1, %r2, bar+100 #CHECK: clgrjnh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x65] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clgrjnh %r1, %r2, bar+100 #CHECK: clgrj 
%r1, %r2, 0, bar@PLT # encoding: [0xec,0x12,A,A,0x00,0x65] @@ -5770,16 +5770,16 @@ clgrjnh %r1, %r2, bar@PLT #CHECK: clgrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x0a,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL clgrl %r0, -0x100000000 #CHECK: clgrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x0a,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL clgrl %r0, -2 #CHECK: clgrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x0a,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL clgrl %r0, 0 #CHECK: clgrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x0a,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL clgrl %r0, 0xfffffffe #CHECK: clgrl %r0, foo # encoding: [0xc6,0x0a,A,A,A,A] @@ -5791,9 +5791,9 @@ clgrl %r15,foo #CHECK: clgrl %r3, bar+100 # encoding: [0xc6,0x3a,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: clgrl %r4, bar+100 # encoding: [0xc6,0x4a,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL clgrl %r3,bar+100 clgrl %r4,bar+100 @@ -5823,16 +5823,16 @@ clhhsi 4095(%r15), 42 #CHECK: clhrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x07,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL clhrl %r0, -0x100000000 #CHECK: clhrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x07,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL clhrl %r0, -2 #CHECK: clhrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x07,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL clhrl %r0, 0 #CHECK: clhrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x07,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL clhrl %r0, 0xfffffffe #CHECK: clhrl %r0, foo # encoding: [0xc6,0x07,A,A,A,A] @@ -5844,9 +5844,9 @@ clhrl %r15,foo #CHECK: clhrl %r3, bar+100 # encoding: [0xc6,0x37,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: clhrl %r4, bar+100 # encoding: [0xc6,0x47,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL clhrl %r3,bar+100 clhrl %r4,bar+100 @@ -5966,16 +5966,16 @@ clij %r15, 0, 0, 0 #CHECK: clij %r1, 193, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x10,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL clij %r1, 193, 0, -0x10000 #CHECK: clij %r1, 193, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x10,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL clij %r1, 193, 0, -2 #CHECK: clij %r1, 193, 0, .[[LAB:L.*]] # encoding: 
[0xec,0x10,A,A,0xc1,0x7f] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL clij %r1, 193, 0, 0 #CHECK: clij %r1, 193, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x10,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL clij %r1, 193, 0, 0xfffe #CHECK: clij %r1, 193, 0, foo # encoding: [0xec,0x10,A,A,0xc1,0x7f] @@ -6079,55 +6079,55 @@ clij %r1, 193, 15, foo #CHECK: clij %r1, 193, 0, bar+100 # encoding: [0xec,0x10,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clij %r1, 193, 0, bar+100 #CHECK: clijh %r1, 193, bar+100 # encoding: [0xec,0x12,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijh %r1, 193, bar+100 #CHECK: clijnle %r1, 193, bar+100 # encoding: [0xec,0x12,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijnle %r1, 193, bar+100 #CHECK: clijl %r1, 193, bar+100 # encoding: [0xec,0x14,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijl %r1, 193, bar+100 #CHECK: clijnhe %r1, 193, bar+100 # encoding: [0xec,0x14,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijnhe %r1, 193, bar+100 #CHECK: clijlh %r1, 193, bar+100 # encoding: [0xec,0x16,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijlh %r1, 193, bar+100 #CHECK: clijne %r1, 193, bar+100 # encoding: [0xec,0x16,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijne %r1, 193, bar+100 #CHECK: clije %r1, 193, bar+100 # encoding: [0xec,0x18,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clije %r1, 193, bar+100 #CHECK: clijnlh %r1, 193, bar+100 # encoding: [0xec,0x18,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijnlh %r1, 193, bar+100 #CHECK: clijhe %r1, 193, bar+100 # encoding: [0xec,0x1a,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijhe %r1, 193, bar+100 #CHECK: clijnl %r1, 193, bar+100 # encoding: [0xec,0x1a,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijnl %r1, 193, bar+100 #CHECK: clijle %r1, 193, bar+100 # encoding: [0xec,0x1c,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijle %r1, 193, bar+100 #CHECK: clijnh %r1, 193, bar+100 # encoding: [0xec,0x1c,A,A,0xc1,0x7f] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clijnh %r1, 193, 
bar+100 #CHECK: clij %r1, 193, 0, bar@PLT # encoding: [0xec,0x10,A,A,0xc1,0x7f] @@ -6398,16 +6398,16 @@ clrj %r7,%r8,0,0 #CHECK: clrj %r1, %r2, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x12,A,A,0x00,0x77] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL clrj %r1, %r2, 0, -0x10000 #CHECK: clrj %r1, %r2, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x12,A,A,0x00,0x77] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL clrj %r1, %r2, 0, -2 #CHECK: clrj %r1, %r2, 0, .[[LAB:L.*]] # encoding: [0xec,0x12,A,A,0x00,0x77] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL clrj %r1, %r2, 0, 0 #CHECK: clrj %r1, %r2, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x12,A,A,0x00,0x77] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL clrj %r1, %r2, 0, 0xfffe #CHECK: clrj %r1, %r2, 0, foo # encoding: [0xec,0x12,A,A,0x00,0x77] @@ -6511,55 +6511,55 @@ clrj %r1, %r2, 15, foo #CHECK: clrj %r1, %r2, 0, bar+100 # encoding: [0xec,0x12,A,A,0x00,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrj %r1, %r2, 0, bar+100 #CHECK: clrjh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x20,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjh %r1, %r2, bar+100 #CHECK: clrjnle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x20,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjnle %r1, %r2, bar+100 #CHECK: clrjl %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjl %r1, %r2, bar+100 #CHECK: clrjnhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjnhe %r1, %r2, bar+100 #CHECK: clrjlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjlh %r1, %r2, bar+100 #CHECK: clrjne %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjne %r1, %r2, bar+100 #CHECK: clrje %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrje %r1, %r2, bar+100 #CHECK: clrjnlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjnlh %r1, %r2, bar+100 #CHECK: clrjhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjhe %r1, %r2, bar+100 #CHECK: clrjnl 
%r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjnl %r1, %r2, bar+100 #CHECK: clrjle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjle %r1, %r2, bar+100 #CHECK: clrjnh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x77] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL clrjnh %r1, %r2, bar+100 #CHECK: clrj %r1, %r2, 0, bar@PLT # encoding: [0xec,0x12,A,A,0x00,0x77] @@ -6615,16 +6615,16 @@ clrjnh %r1, %r2, bar@PLT #CHECK: clrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x0f,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL clrl %r0, -0x100000000 #CHECK: clrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x0f,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL clrl %r0, -2 #CHECK: clrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x0f,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL clrl %r0, 0 #CHECK: clrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x0f,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL clrl %r0, 0xfffffffe #CHECK: clrl %r0, foo # encoding: [0xc6,0x0f,A,A,A,A] @@ -6636,9 +6636,9 @@ clrl %r15,foo #CHECK: clrl %r3, bar+100 # encoding: [0xc6,0x3f,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: clrl %r4, bar+100 # encoding: [0xc6,0x4f,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL clrl %r3,bar+100 clrl %r4,bar+100 @@ -6883,16 +6883,16 @@ crj %r7,%r8,0,0 #CHECK: crj %r1, %r2, 0, .[[LAB:L.*]]-65536 # encoding: [0xec,0x12,A,A,0x00,0x76] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-65536)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-65536+2, kind: FK_390_PC16DBL crj %r1, %r2, 0, -0x10000 #CHECK: crj %r1, %r2, 0, .[[LAB:L.*]]-2 # encoding: [0xec,0x12,A,A,0x00,0x76] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC16DBL crj %r1, %r2, 0, -2 #CHECK: crj %r1, %r2, 0, .[[LAB:L.*]] # encoding: [0xec,0x12,A,A,0x00,0x76] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC16DBL crj %r1, %r2, 0, 0 #CHECK: crj %r1, %r2, 0, .[[LAB:L.*]]+65534 # encoding: [0xec,0x12,A,A,0x00,0x76] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+65534)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+65534+2, kind: FK_390_PC16DBL crj %r1, %r2, 0, 0xfffe #CHECK: crj %r1, %r2, 0, foo # encoding: [0xec,0x12,A,A,0x00,0x76] @@ -6996,55 +6996,55 @@ crj %r1, %r2, 15, foo #CHECK: crj %r1, %r2, 0, bar+100 # encoding: [0xec,0x12,A,A,0x00,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crj %r1, %r2, 0, bar+100 #CHECK: crjh %r1, %r2, bar+100 # 
encoding: [0xec,0x12,A,A,0x20,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjh %r1, %r2, bar+100 #CHECK: crjnle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x20,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjnle %r1, %r2, bar+100 #CHECK: crjl %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjl %r1, %r2, bar+100 #CHECK: crjnhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x40,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjnhe %r1, %r2, bar+100 #CHECK: crjlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjlh %r1, %r2, bar+100 #CHECK: crjne %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x60,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjne %r1, %r2, bar+100 #CHECK: crje %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crje %r1, %r2, bar+100 #CHECK: crjnlh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0x80,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjnlh %r1, %r2, bar+100 #CHECK: crjhe %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjhe %r1, %r2, bar+100 #CHECK: crjnl %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xa0,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjnl %r1, %r2, bar+100 #CHECK: crjle %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjle %r1, %r2, bar+100 #CHECK: crjnh %r1, %r2, bar+100 # encoding: [0xec,0x12,A,A,0xc0,0x76] -#CHECK: fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC16DBL +#CHECK: fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC16DBL crjnh %r1, %r2, bar+100 #CHECK: crj %r1, %r2, 0, bar@PLT # encoding: [0xec,0x12,A,A,0x00,0x76] @@ -7100,16 +7100,16 @@ crjnh %r1, %r2, bar@PLT #CHECK: crl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x0d,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL crl %r0, -0x100000000 #CHECK: crl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x0d,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL crl %r0, -2 #CHECK: crl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x0d,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL crl %r0, 0 #CHECK: crl %r0, 
.[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x0d,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL crl %r0, 0xfffffffe #CHECK: crl %r0, foo # encoding: [0xc6,0x0d,A,A,A,A] @@ -7121,9 +7121,9 @@ crl %r15,foo #CHECK: crl %r3, bar+100 # encoding: [0xc6,0x3d,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: crl %r4, bar+100 # encoding: [0xc6,0x4d,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL crl %r3,bar+100 crl %r4,bar+100 @@ -8355,16 +8355,16 @@ ex %r15, 0 #CHECK: exrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x00,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL exrl %r0, -0x100000000 #CHECK: exrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x00,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL exrl %r0, -2 #CHECK: exrl %r0, .[[LAB:L.*]] # encoding: [0xc6,0x00,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL exrl %r0, 0 #CHECK: exrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x00,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL exrl %r0, 0xfffffffe #CHECK: exrl %r0, foo # encoding: [0xc6,0x00,A,A,A,A] @@ -8376,9 +8376,9 @@ exrl %r15,foo #CHECK: exrl %r3, bar+100 # encoding: [0xc6,0x30,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: exrl %r4, bar+100 # encoding: [0xc6,0x40,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL exrl %r3,bar+100 exrl %r4,bar+100 @@ -9056,16 +9056,16 @@ lamy %a0,%a0,524287(%r15) #CHECK: larl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc0,0x00,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL larl %r0, -0x100000000 #CHECK: larl %r0, .[[LAB:L.*]]-2 # encoding: [0xc0,0x00,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL larl %r0, -2 #CHECK: larl %r0, .[[LAB:L.*]] # encoding: [0xc0,0x00,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL larl %r0, 0 #CHECK: larl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc0,0x00,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL larl %r0, 0xfffffffe #CHECK: larl %r0, foo # encoding: [0xc0,0x00,A,A,A,A] @@ -9077,9 +9077,9 @@ larl %r15,foo #CHECK: larl %r3, bar+100 # encoding: [0xc0,0x30,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: larl %r4, bar+100 # encoding: [0xc0,0x40,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL larl 
%r3,bar+100 larl %r4,bar+100 @@ -9755,16 +9755,16 @@ lgfr %r15, %r0 #CHECK: lgfrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x0c,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL lgfrl %r0, -0x100000000 #CHECK: lgfrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x0c,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL lgfrl %r0, -2 #CHECK: lgfrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x0c,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL lgfrl %r0, 0 #CHECK: lgfrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x0c,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL lgfrl %r0, 0xfffffffe #CHECK: lgfrl %r0, foo # encoding: [0xc4,0x0c,A,A,A,A] @@ -9776,9 +9776,9 @@ lgfrl %r15,foo #CHECK: lgfrl %r3, bar+100 # encoding: [0xc4,0x3c,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: lgfrl %r4, bar+100 # encoding: [0xc4,0x4c,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL lgfrl %r3,bar+100 lgfrl %r4,bar+100 @@ -9839,16 +9839,16 @@ lghr %r15, %r0 #CHECK: lghrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL lghrl %r0, -0x100000000 #CHECK: lghrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL lghrl %r0, -2 #CHECK: lghrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x04,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL lghrl %r0, 0 #CHECK: lghrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x04,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL lghrl %r0, 0xfffffffe #CHECK: lghrl %r0, foo # encoding: [0xc4,0x04,A,A,A,A] @@ -9860,9 +9860,9 @@ lghrl %r15,foo #CHECK: lghrl %r3, bar+100 # encoding: [0xc4,0x34,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: lghrl %r4, bar+100 # encoding: [0xc4,0x44,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL lghrl %r3,bar+100 lghrl %r4,bar+100 @@ -9886,16 +9886,16 @@ lgr %r15,%r9 #CHECK: lgrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x08,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL lgrl %r0, -0x100000000 #CHECK: lgrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x08,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL lgrl %r0, -2 #CHECK: lgrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x08,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: 
FK_390_PC32DBL lgrl %r0, 0 #CHECK: lgrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x08,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL lgrl %r0, 0xfffffffe #CHECK: lgrl %r0, foo # encoding: [0xc4,0x08,A,A,A,A] @@ -9907,9 +9907,9 @@ lgrl %r15,foo #CHECK: lgrl %r3, bar+100 # encoding: [0xc4,0x38,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: lgrl %r4, bar+100 # encoding: [0xc4,0x48,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL lgrl %r3,bar+100 lgrl %r4,bar+100 @@ -9963,16 +9963,16 @@ lhr %r15, %r0 #CHECK: lhrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL lhrl %r0, -0x100000000 #CHECK: lhrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL lhrl %r0, -2 #CHECK: lhrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x05,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL lhrl %r0, 0 #CHECK: lhrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x05,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL lhrl %r0, 0xfffffffe #CHECK: lhrl %r0, foo # encoding: [0xc4,0x05,A,A,A,A] @@ -9984,9 +9984,9 @@ lhrl %r15,foo #CHECK: lhrl %r3, bar+100 # encoding: [0xc4,0x35,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: lhrl %r4, bar+100 # encoding: [0xc4,0x45,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL lhrl %r3,bar+100 lhrl %r4,bar+100 @@ -10114,16 +10114,16 @@ llgfr %r15, %r0 #CHECK: llgfrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x0e,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL llgfrl %r0, -0x100000000 #CHECK: llgfrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x0e,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL llgfrl %r0, -2 #CHECK: llgfrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x0e,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL llgfrl %r0, 0 #CHECK: llgfrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x0e,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL llgfrl %r0, 0xfffffffe #CHECK: llgfrl %r0, foo # encoding: [0xc4,0x0e,A,A,A,A] @@ -10135,9 +10135,9 @@ llgfrl %r15,foo #CHECK: llgfrl %r3, bar+100 # encoding: [0xc4,0x3e,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: llgfrl %r4, bar+100 # encoding: [0xc4,0x4e,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: 
FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL llgfrl %r3,bar+100 llgfrl %r4,bar+100 @@ -10181,16 +10181,16 @@ llghr %r15, %r0 #CHECK: llghrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x06,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL llghrl %r0, -0x100000000 #CHECK: llghrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x06,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL llghrl %r0, -2 #CHECK: llghrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x06,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL llghrl %r0, 0 #CHECK: llghrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x06,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL llghrl %r0, 0xfffffffe #CHECK: llghrl %r0, foo # encoding: [0xc4,0x06,A,A,A,A] @@ -10202,9 +10202,9 @@ llghrl %r15,foo #CHECK: llghrl %r3, bar+100 # encoding: [0xc4,0x36,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: llghrl %r4, bar+100 # encoding: [0xc4,0x46,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL llghrl %r3,bar+100 llghrl %r4,bar+100 @@ -10278,16 +10278,16 @@ llhr %r15, %r0 #CHECK: llhrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x02,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL llhrl %r0, -0x100000000 #CHECK: llhrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x02,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL llhrl %r0, -2 #CHECK: llhrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x02,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL llhrl %r0, 0 #CHECK: llhrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x02,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL llhrl %r0, 0xfffffffe #CHECK: llhrl %r0, foo # encoding: [0xc4,0x02,A,A,A,A] @@ -10299,9 +10299,9 @@ llhrl %r15,foo #CHECK: llhrl %r3, bar+100 # encoding: [0xc4,0x32,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: llhrl %r4, bar+100 # encoding: [0xc4,0x42,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL llhrl %r3,bar+100 llhrl %r4,bar+100 @@ -10867,16 +10867,16 @@ lrer %f15, %f15 #CHECK: lrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x0d,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL lrl %r0, -0x100000000 #CHECK: lrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x0d,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL lrl %r0, -2 #CHECK: lrl %r0, 
.[[LAB:L.*]] # encoding: [0xc4,0x0d,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL lrl %r0, 0 #CHECK: lrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x0d,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL lrl %r0, 0xfffffffe #CHECK: lrl %r0, foo # encoding: [0xc4,0x0d,A,A,A,A] @@ -10888,9 +10888,9 @@ lrl %r15,foo #CHECK: lrl %r3, bar+100 # encoding: [0xc4,0x3d,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: lrl %r4, bar+100 # encoding: [0xc4,0x4d,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL lrl %r3,bar+100 lrl %r4,bar+100 @@ -13452,16 +13452,16 @@ pfd 15, 0 #CHECK: pfdrl 0, .[[LAB:L.*]]-4294967296 # encoding: [0xc6,0x02,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL pfdrl 0, -0x100000000 #CHECK: pfdrl 0, .[[LAB:L.*]]-2 # encoding: [0xc6,0x02,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL pfdrl 0, -2 #CHECK: pfdrl 0, .[[LAB:L.*]] # encoding: [0xc6,0x02,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL pfdrl 0, 0 #CHECK: pfdrl 0, .[[LAB:L.*]]+4294967294 # encoding: [0xc6,0x02,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL pfdrl 0, 0xfffffffe #CHECK: pfdrl 0, foo # encoding: [0xc6,0x02,A,A,A,A] @@ -13473,9 +13473,9 @@ pfdrl 15, foo #CHECK: pfdrl 3, bar+100 # encoding: [0xc6,0x32,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: pfdrl 4, bar+100 # encoding: [0xc6,0x42,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL pfdrl 3, bar+100 pfdrl 4, bar+100 @@ -15632,16 +15632,16 @@ stg %r15, 0 #CHECK: stgrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x0b,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL stgrl %r0, -0x100000000 #CHECK: stgrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x0b,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL stgrl %r0, -2 #CHECK: stgrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x0b,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL stgrl %r0, 0 #CHECK: stgrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x0b,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL stgrl %r0, 0xfffffffe #CHECK: stgrl %r0, foo # encoding: [0xc4,0x0b,A,A,A,A] @@ -15653,9 +15653,9 @@ stgrl %r15,foo #CHECK: stgrl %r3, bar+100 # encoding: [0xc4,0x3b,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: stgrl %r4, bar+100 # encoding: 
[0xc4,0x4b,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL stgrl %r3,bar+100 stgrl %r4,bar+100 @@ -15685,16 +15685,16 @@ sth %r15, 0 #CHECK: sthrl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x07,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL sthrl %r0, -0x100000000 #CHECK: sthrl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x07,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL sthrl %r0, -2 #CHECK: sthrl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x07,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL sthrl %r0, 0 #CHECK: sthrl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x07,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL sthrl %r0, 0xfffffffe #CHECK: sthrl %r0, foo # encoding: [0xc4,0x07,A,A,A,A] @@ -15706,9 +15706,9 @@ sthrl %r15,foo #CHECK: sthrl %r3, bar+100 # encoding: [0xc4,0x37,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: sthrl %r4, bar+100 # encoding: [0xc4,0x47,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL sthrl %r3,bar+100 sthrl %r4,bar+100 @@ -15968,16 +15968,16 @@ strag 4095(%r15), 0 #CHECK: strl %r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x0f,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-4294967296+2, kind: FK_390_PC32DBL strl %r0, -0x100000000 #CHECK: strl %r0, .[[LAB:L.*]]-2 # encoding: [0xc4,0x0f,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]-2)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]-2+2, kind: FK_390_PC32DBL strl %r0, -2 #CHECK: strl %r0, .[[LAB:L.*]] # encoding: [0xc4,0x0f,A,A,A,A] #CHECK: fixup A - offset: 2, value: .[[LAB]]+2, kind: FK_390_PC32DBL strl %r0, 0 #CHECK: strl %r0, .[[LAB:L.*]]+4294967294 # encoding: [0xc4,0x0f,A,A,A,A] -#CHECK: fixup A - offset: 2, value: (.[[LAB]]+4294967294)+2, kind: FK_390_PC32DBL +#CHECK: fixup A - offset: 2, value: .[[LAB]]+4294967294+2, kind: FK_390_PC32DBL strl %r0, 0xfffffffe #CHECK: strl %r0, foo # encoding: [0xc4,0x0f,A,A,A,A] @@ -15989,9 +15989,9 @@ strl %r15,foo #CHECK: strl %r3, bar+100 # encoding: [0xc4,0x3f,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL #CHECK: strl %r4, bar+100 # encoding: [0xc4,0x4f,A,A,A,A] -# fixup A - offset: 2, value: (bar+100)+2, kind: FK_390_PC32DBL +# fixup A - offset: 2, value: bar+100+2, kind: FK_390_PC32DBL strl %r3,bar+100 strl %r4,bar+100 diff --git a/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-disassembly.test b/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-disassembly.test index 730a000a373ff..f86188206b3be 100644 --- a/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-disassembly.test +++ b/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-disassembly.test @@ -1,8 +1,8 @@ // RUN: llvm-objdump -d -m --no-show-raw-insn --full-leading-addr --print-imm-hex %p/Inputs/hello.obj.macho-arm | FileCheck %s --check-prefix=OBJ 
// RUN: llvm-objdump -d -m --no-show-raw-insn --full-leading-addr --print-imm-hex %p/Inputs/hello.exe.macho-arm | FileCheck %s --check-prefix=EXE -OBJ: 00000006 40 f2 24 03 movw r3, :lower16:((54-14)-4) -OBJ: 0000000a c0 f2 00 03 movt r3, :upper16:((54-14)-4) +OBJ: 00000006 40 f2 24 03 movw r3, :lower16:(54-14-4) +OBJ: 0000000a c0 f2 00 03 movt r3, :upper16:(54-14-4) OBJ: 00000024 ff f7 ec ff bl _printf EXE: 0000bfa8 00 f0 28 e8 blx 0xbffc @ symbol stub for: _printf diff --git a/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-subtractor.test b/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-subtractor.test index 65df2a984cd02..bf452c2948a00 100644 --- a/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-subtractor.test +++ b/llvm/test/tools/llvm-objdump/MachO/ARM/symbolized-subtractor.test @@ -11,5 +11,5 @@ PCinst: .section __TEXT,__cstring,cstring_literals Str: .asciz "Hello world\n" -# CHECK: movw r3, :lower16:((Str-PCinst)-4) -# CHECK: movt r3, :upper16:((Str-PCinst)-4) +# CHECK: movw r3, :lower16:(Str-PCinst-4) +# CHECK: movt r3, :upper16:(Str-PCinst-4) From c6b3fd799962577488a0ecbdac806f4753e14d3c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 30 Mar 2025 22:27:47 -0700 Subject: [PATCH 0068/1029] [MC] maybeParseSectionType: test CommentString instead of AllowAtInIdentifier Rework https://reviews.llvm.org/D31026 AllowAtInIdentifier is a misnomer: it should be false for ELF targets, but is currently true as a hack to parse expr@specifier. --- llvm/lib/MC/MCParser/ELFAsmParser.cpp | 6 +++--- llvm/test/MC/ELF/gnu-type-diagnostics.s | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 11e122bcaac23..c94ddfa087fd3 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -401,10 +401,10 @@ bool ELFAsmParser::maybeParseSectionType(StringRef &TypeName) { Lex(); if (L.isNot(AsmToken::At) && L.isNot(AsmToken::Percent) && L.isNot(AsmToken::String)) { - if (L.getAllowAtInIdentifier()) - return TokError("expected '@', '%' or \"\""); - else + if (getContext().getAsmInfo()->getCommentString().starts_with('@')) return TokError("expected '%' or \"\""); + else + return TokError("expected '@', '%' or \"\""); } if (!L.is(AsmToken::String)) Lex(); diff --git a/llvm/test/MC/ELF/gnu-type-diagnostics.s b/llvm/test/MC/ELF/gnu-type-diagnostics.s index 23c144fee1db8..dbbfc5a173dec 100644 --- a/llvm/test/MC/ELF/gnu-type-diagnostics.s +++ b/llvm/test/MC/ELF/gnu-type-diagnostics.s @@ -15,4 +15,5 @@ // CHECK: .type symbol 32 // CHECK: ^ - +.section "foo", "a", !progbits +// CHECK: [[#@LINE-1]]:22: error: expected '@', '%' or "" From 6fb674174ecc1d27b0d303c89c666d949f7afee5 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 30 Mar 2025 22:33:37 -0700 Subject: [PATCH 0069/1029] [RISCV] Fix the operand types for shift instructions in RISCVInstrInfoSFB.td. NFC Due to a copy paste mistake we used simm12 instead of the correct type. This doesn't matter in practice because we only generate these instructions with C++ code and we expand them before the AsmPrinter. 
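A minimal illustration of why the operand class matters (an assumed example,
not taken from the upstream tests): shift amounts are unsigned and bounded by
the register width, which is exactly what uimmlog2xlen (0 to XLEN-1) and
uimm5 (0 to 31, for the 32-bit *W variants) encode, whereas simm12 nominally
admits values such as -1 that no shift encoding can represent:

    slli  a0, a1, 63   # valid on RV64: shamt must lie in [0, 63]
    slliw a0, a1, 31   # valid: *W shifts take a 5-bit shamt in [0, 31]
    slli  a0, a1, -1   # rejected by the assembler: shamt is unsigned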
--- llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 16cc0e5a61f0b..32f533b8f1146 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -98,17 +98,17 @@ def PseudoCCADDI : Pseudo<(outs GPR:$dst), ReadSFBALU]>; def PseudoCCSLLI : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, + GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; def PseudoCCSRLI : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, + GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; def PseudoCCSRAI : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, + GPR:$falsev, GPR:$rs1, uimmlog2xlen:$shamt), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; def PseudoCCANDI : Pseudo<(outs GPR:$dst), @@ -161,17 +161,17 @@ def PseudoCCADDIW : Pseudo<(outs GPR:$dst), ReadSFBALU]>; def PseudoCCSLLIW : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, + GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; def PseudoCCSRLIW : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, + GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; def PseudoCCSRAIW : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, - GPR:$falsev, GPR:$rs1, simm12:$rs2), []>, + GPR:$falsev, GPR:$rs1, uimm5:$shamt), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; From 60199ee5396c2242c7d3cfa882e28312e6895c2a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 30 Mar 2025 22:57:25 -0700 Subject: [PATCH 0070/1029] [clang] Use DenseMap::insert_range (NFC) (#133655) --- clang/lib/AST/VTableBuilder.cpp | 6 ++---- clang/lib/CodeGen/CodeGenModule.cpp | 3 +-- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 4 ++-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp index 18893b996b5d6..6c97b8718c65e 100644 --- a/clang/lib/AST/VTableBuilder.cpp +++ b/clang/lib/AST/VTableBuilder.cpp @@ -3736,8 +3736,7 @@ void MicrosoftVTableContext::computeVTableRelatedInformation( } } - MethodVFTableLocations.insert(NewMethodLocations.begin(), - NewMethodLocations.end()); + MethodVFTableLocations.insert_range(NewMethodLocations); if (Context.getLangOpts().DumpVTableLayouts) dumpMethodLocations(RD, NewMethodLocations, llvm::outs()); } @@ -3824,8 +3823,7 @@ const VirtualBaseInfo &MicrosoftVTableContext::computeVBTableRelatedInformation( // virtual bases come first so that the layout is the same. const VirtualBaseInfo &BaseInfo = computeVBTableRelatedInformation(VBPtrBase); - VBI->VBTableIndices.insert(BaseInfo.VBTableIndices.begin(), - BaseInfo.VBTableIndices.end()); + VBI->VBTableIndices.insert_range(BaseInfo.VBTableIndices); } // New vbases are added to the end of the vbtable. 
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 5dbd50be6ca1a..43345da268868 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -853,8 +853,7 @@ void CodeGenModule::Release() { if (CXX20ModuleInits && Primary && !Primary->isHeaderLikeModule()) EmitModuleInitializers(Primary); EmitDeferred(); - DeferredDecls.insert(EmittedDeferredDecls.begin(), - EmittedDeferredDecls.end()); + DeferredDecls.insert_range(EmittedDeferredDecls); EmittedDeferredDecls.clear(); EmitVTablesOpportunistically(); applyGlobalValReplacements(); diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 52dec2013a24f..52d922abbcaec 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -1099,7 +1099,7 @@ Expected getSymbolsFromBitcode(MemoryBufferRef Buffer, OffloadKind Kind, // If the file gets extracted we update the table with the new symbols. if (ShouldExtract) - Syms.insert(std::begin(TmpSyms), std::end(TmpSyms)); + Syms.insert_range(TmpSyms); return ShouldExtract; } @@ -1154,7 +1154,7 @@ Expected getSymbolsFromObject(const ObjectFile &Obj, OffloadKind Kind, // If the file gets extracted we update the table with the new symbols. if (ShouldExtract) - Syms.insert(std::begin(TmpSyms), std::end(TmpSyms)); + Syms.insert_range(TmpSyms); return ShouldExtract; } From 2fc08d4c31285ecec64fda7e0833d583503f10d0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 30 Mar 2025 22:57:45 -0700 Subject: [PATCH 0071/1029] [Vectorize] Use DenseMap::insert_range (NFC) (#133656) --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3fc5e716e3757..dd392056a07ee 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5415,7 +5415,7 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { if (!isScalarAfterVectorization(&I, VF) && !VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && computePredInstDiscount(&I, ScalarCosts, VF) >= 0) { - ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); + ScalarCostsVF.insert_range(ScalarCosts); // Check if we decided to scalarize a call. If so, update the widening // decision of the call to CM_Scalarize with the computed scalar cost. 
for (const auto &[I, Cost] : ScalarCosts) { From fff8f035ac781a60f42f5ca88bdcdcdf5ed74d76 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 30 Mar 2025 22:58:03 -0700 Subject: [PATCH 0072/1029] [polly] Use DenseMap::insert_range (NFC) (#133657) --- polly/include/polly/CodeGen/IRBuilder.h | 2 +- polly/lib/CodeGen/BlockGenerators.cpp | 8 ++++---- polly/lib/CodeGen/IslNodeBuilder.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/polly/include/polly/CodeGen/IRBuilder.h b/polly/include/polly/CodeGen/IRBuilder.h index 6641ac9a0c068..5a111e9a2cb7c 100644 --- a/polly/include/polly/CodeGen/IRBuilder.h +++ b/polly/include/polly/CodeGen/IRBuilder.h @@ -79,7 +79,7 @@ class ScopAnnotator { void addAlternativeAliasBases( llvm::DenseMap, llvm::AssertingVH> &NewMap) { - AlternativeAliasBases.insert(NewMap.begin(), NewMap.end()); + AlternativeAliasBases.insert_range(NewMap); } /// Delete the set of alternative alias bases diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp index b2e3b5d32fbe2..cf2cc65e0f042 100644 --- a/polly/lib/CodeGen/BlockGenerators.cpp +++ b/polly/lib/CodeGen/BlockGenerators.cpp @@ -76,8 +76,8 @@ Value *BlockGenerator::trySynthesizeNewValue(ScopStmt &Stmt, Value *Old, return nullptr; ValueMapT VTV; - VTV.insert(BBMap.begin(), BBMap.end()); - VTV.insert(GlobalMap.begin(), GlobalMap.end()); + VTV.insert_range(BBMap); + VTV.insert_range(GlobalMap); Scop &S = *Stmt.getParent(); const DataLayout &DL = S.getFunction().getDataLayout(); @@ -1131,7 +1131,7 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT <S, // Remember value in case it is visible after this subregion. if (isDominatingSubregionExit(DT, R, BB)) - ValueMap.insert(RegionMap.begin(), RegionMap.end()); + ValueMap.insert_range(RegionMap); } // Now create a new dedicated region exit block and add it to the region map. @@ -1164,7 +1164,7 @@ void RegionGenerator::copyStmt(ScopStmt &Stmt, LoopToScevMapT <S, Instruction *BICopy = BBCopyEnd->getTerminator(); ValueMapT &RegionMap = RegionMaps[BBCopyStart]; - RegionMap.insert(StartBlockMap.begin(), StartBlockMap.end()); + RegionMap.insert_range(StartBlockMap); Builder.SetInsertPoint(BICopy); copyInstScalar(Stmt, TI, RegionMap, LTS); diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp index ca497927e2976..e818dab4f9c0c 100644 --- a/polly/lib/CodeGen/IslNodeBuilder.cpp +++ b/polly/lib/CodeGen/IslNodeBuilder.cpp @@ -895,7 +895,7 @@ void IslNodeBuilder::createUser(__isl_take isl_ast_node *User) { Id = isl_ast_expr_get_id(StmtExpr); isl_ast_expr_free(StmtExpr); - LTS.insert(OutsideLoopIterations.begin(), OutsideLoopIterations.end()); + LTS.insert_range(OutsideLoopIterations); Stmt = (ScopStmt *)isl_id_get_user(Id); auto *NewAccesses = createNewAccesses(Stmt, User); From d63cc4c87689a79d25521c9aa2ce4a335e5984e3 Mon Sep 17 00:00:00 2001 From: T-Gruber <100079402+T-Gruber@users.noreply.github.com> Date: Mon, 31 Mar 2025 08:44:28 +0200 Subject: [PATCH 0073/1029] [analyzer] Unknown array lvalue element in Store (#133381) Remove the early return for BaseRegions of type ElementRegion. Return meaningful MemRegionVal for these cases as well. 
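A minimal illustration, mirroring the lvalue_elements.c test added below (clang_analyzer_dump is the debug.ExprInspection helper; the symbol names are illustrative):

  void clang_analyzer_dump(int *);
  extern int unknown_index;
  extern int matrix[3][3];

  void demo(void) {
    // Previously the analyzer bailed out and produced UnknownVal here; with
    // this change it produces a nested element region along the lines of
    // &Element{Element{matrix,reg_$1,int[3]},0 S64b,int}.
    clang_analyzer_dump(&matrix[unknown_index][0]);
  }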
Previous discussion: https://discourse.llvm.org/t/lvalueelement-returns-unknownval-for-multi-dimensional-arrays/85476 --- clang/lib/StaticAnalyzer/Core/Store.cpp | 6 +--- .../ArrayBound/assumption-reporting.c | 7 +---- clang/test/Analysis/lvalue_elements.c | 31 +++++++++++++++++++ 3 files changed, 33 insertions(+), 11 deletions(-) create mode 100644 clang/test/Analysis/lvalue_elements.c diff --git a/clang/lib/StaticAnalyzer/Core/Store.cpp b/clang/lib/StaticAnalyzer/Core/Store.cpp index 5f30fae4b7047..da6885ecd0ec5 100644 --- a/clang/lib/StaticAnalyzer/Core/Store.cpp +++ b/clang/lib/StaticAnalyzer/Core/Store.cpp @@ -511,13 +511,9 @@ SVal StoreManager::getLValueElement(QualType elementType, NonLoc Offset, // Only allow non-integer offsets if the base region has no offset itself. // FIXME: This is a somewhat arbitrary restriction. We should be using // SValBuilder here to add the two offsets without checking their types. - if (!isa(Offset)) { - if (isa(BaseRegion->StripCasts())) - return UnknownVal(); - + if (!isa(Offset)) return loc::MemRegionVal(MRMgr.getElementRegion( elementType, Offset, cast(ElemR->getSuperRegion()), Ctx)); - } const llvm::APSInt& OffI = Offset.castAs().getValue(); assert(BaseIdxI.isSigned()); diff --git a/clang/test/Analysis/ArrayBound/assumption-reporting.c b/clang/test/Analysis/ArrayBound/assumption-reporting.c index d687886ada1ae..535e623baa815 100644 --- a/clang/test/Analysis/ArrayBound/assumption-reporting.c +++ b/clang/test/Analysis/ArrayBound/assumption-reporting.c @@ -39,14 +39,9 @@ int assumingBothPointerToMiddle(int arg) { // will speak about the "byte offset" measured from the beginning of the TenElements. int *p = TenElements + 2; int a = p[arg]; - // FIXME: The following note does not appear: - // {{Assuming byte offset is non-negative and less than 40, the extent of 'TenElements'}} - // It seems that the analyzer "gives up" modeling this pointer arithmetics - // and says that `p[arg]` is just an UnknownVal (instead of calculating that - // it's equivalent to `TenElements[2+arg]`). + // expected-note@-1 {{Assuming byte offset is non-negative and less than 40, the extent of 'TenElements'}} int b = TenElements[arg]; // This is normal access, and only the lower bound is new. 
- // expected-note@-1 {{Assuming index is non-negative}} int c = TenElements[arg + 10]; // expected-warning@-1 {{Out of bound access to memory after the end of 'TenElements'}} // expected-note@-2 {{Access of 'TenElements' at an overflowing index, while it holds only 10 'int' elements}} diff --git a/clang/test/Analysis/lvalue_elements.c b/clang/test/Analysis/lvalue_elements.c new file mode 100644 index 0000000000000..73b9c037d80d2 --- /dev/null +++ b/clang/test/Analysis/lvalue_elements.c @@ -0,0 +1,31 @@ +// RUN: %clang_analyze_cc1 -std=c11 -analyzer-checker=debug.ExprInspection -verify %s + +void clang_analyzer_dump(int*); + +const int const_index = 1; +extern int unknown_index; +extern int array[3]; +extern int matrix[3][3]; + +int main(){ + + // expected-warning@+1 {{&Element{array,1 S64b,int}}} + clang_analyzer_dump(&array[const_index]); + + // expected-warning@+1 {{&Element{array,reg_$1,int}}} + clang_analyzer_dump(&array[unknown_index]); + + // expected-warning@+1 {{&Element{Element{matrix,1 S64b,int[3]},1 S64b,int}}} + clang_analyzer_dump(&matrix[const_index][const_index]); + + // expected-warning@+1 {{&Element{Element{matrix,reg_$1,int[3]},1 S64b,int}}} + clang_analyzer_dump(&matrix[unknown_index][const_index]); + + // expected-warning@+1 {{&Element{Element{matrix,1 S64b,int[3]},reg_$1,int}}} + clang_analyzer_dump(&matrix[const_index][unknown_index]); + + // expected-warning@+1 {{&Element{Element{matrix,reg_$1,int[3]},reg_$1,int}}} + clang_analyzer_dump(&matrix[unknown_index][unknown_index]); + + return 0; +} From 809f857d2c8edffe1dac317982b68a467710f877 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 31 Mar 2025 07:55:48 +0100 Subject: [PATCH 0074/1029] [VPlan] Support early-exit loops in optimizeForVFAndUF. (#131539) Update optimizeForVFAndUF to support early-exit loops by handling BranchOnCond(Or(..., CanonicalIV == TripCount)) via SCEV PR: https://github.com/llvm/llvm-project/pull/131539 --- .../Transforms/Vectorize/VPlanTransforms.cpp | 78 ++++++++++++++----- ...or-loop-backedge-elimination-early-exit.ll | 48 ++++-------- 2 files changed, 75 insertions(+), 51 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 8852540aec931..3ebd844d6a5a1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1163,35 +1163,75 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, return MadeChange; } -/// Try to simplify the branch condition of \p Plan. This may restrict the -/// resulting plan to \p BestVF and \p BestUF. -static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, - unsigned BestUF, - PredicatedScalarEvolution &PSE) { - VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); - VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); - auto *Term = &ExitingVPBB->back(); - // Try to simplify the branch condition if TC <= VF * UF when preparing to - // execute the plan for the main vector loop. We only do this if the - // terminator is: - // 1. BranchOnCount, or - // 2. BranchOnCond where the input is Not(ActiveLaneMask). +/// Return true if \p Cond is known to be true for given \p BestVF and \p +/// BestUF. 
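/// Illustration (not part of the patch): with a scalar trip count of 16 and
/// VF=8, UF=2, the early-exit latch terminator has the shape
///   BranchOnCond(Or(any-of(early.exit.mask), CanonicalIV.next == VectorTC))
/// and the second Or operand is provable: SCEV knows TC == VF * UF == 16, so
/// the compare folds to true, the Or folds to true, and the vector loop
/// region executes exactly once.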
+static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, + ElementCount BestVF, unsigned BestUF, + ScalarEvolution &SE) { using namespace llvm::VPlanPatternMatch; - if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) && - !match(Term, - m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) + if (match(Cond, m_Binary(m_VPValue(), m_VPValue()))) + return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF, + &SE](VPValue *C) { + return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, SE); + }); + + auto *CanIV = Plan.getCanonicalIV(); + if (!match(Cond, m_Binary( + m_Specific(CanIV->getBackedgeValue()), + m_Specific(&Plan.getVectorTripCount()))) || + cast(Cond->getDefiningRecipe())->getPredicate() != + CmpInst::ICMP_EQ) return false; - ScalarEvolution &SE = *PSE.getSE(); + // The compare checks CanIV + VFxUF == vector trip count. The vector trip + // count is not conveniently available as SCEV so far, so we compare directly + // against the original trip count. This is stricter than necessary, as we + // will only return true if the trip count == vector trip count. + // TODO: Use SCEV for vector trip count once available, to cover cases where + // vector trip count == UF * VF, but original trip count != UF * VF. const SCEV *TripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); assert(!isa(TripCount) && "Trip count SCEV must be computable"); ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF); const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); - if (TripCount->isZero() || - !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) + return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C); +} + +/// Try to simplify the branch condition of \p Plan. This may restrict the +/// resulting plan to \p BestVF and \p BestUF. +static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, + unsigned BestUF, + PredicatedScalarEvolution &PSE) { + VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); + auto *Term = &ExitingVPBB->back(); + VPValue *Cond; + ScalarEvolution &SE = *PSE.getSE(); + using namespace llvm::VPlanPatternMatch; + if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || + match(Term, m_BranchOnCond( + m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { + // Try to simplify the branch condition if TC <= VF * UF when the latch + // terminator is BranchOnCount or BranchOnCond where the input is + // Not(ActiveLaneMask). + const SCEV *TripCount = + vputils::getSCEVExprForVPValue(Plan.getTripCount(), SE); + assert(!isa(TripCount) && + "Trip count SCEV must be computable"); + ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF); + const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); + if (TripCount->isZero() || + !SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) + return false; + } else if (match(Term, m_BranchOnCond(m_VPValue(Cond)))) { + // For BranchOnCond, check if we can prove the condition to be true using VF + // and UF. + if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, SE)) + return false; + } else { return false; + } // The vector loop region only executes once. 
If possible, completely remove // the region, otherwise replace the terminator controlling the latch with diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index de4b265b155b6..e29b15b8991e0 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -55,16 +55,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2: [[VECTOR_PH]]: ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0 ; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0 ; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 ; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]]) -; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]] ; VF8UF2: [[MIDDLE_SPLIT]]: ; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: @@ -83,7 +79,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2: [[LOOP_LATCH]]: ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] ; VF8UF2: [[EXIT]]: ; VF8UF2-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] ; VF8UF2-NEXT: ret i8 [[RES]] @@ -95,16 +91,12 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF16UF1: [[VECTOR_PH]]: ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0 ; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) -; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]] ; VF16UF1: [[MIDDLE_SPLIT]]: ; 
VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: @@ -123,7 +115,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF16UF1: [[LOOP_LATCH]]: ; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 ; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP0:![0-9]+]] ; VF16UF1: [[EXIT]]: ; VF16UF1-NEXT: [[RES:%.*]] = phi i8 [ 0, %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ 0, %[[VECTOR_EARLY_EXIT]] ] ; VF16UF1-NEXT: ret i8 [[RES]] @@ -198,23 +190,19 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2: [[VECTOR_PH]]: ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] ; VF8UF2: [[VECTOR_BODY]]: -; VF8UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0 ; VF8UF2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0 ; VF8UF2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 ; VF8UF2-NEXT: [[TMP3:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD]], zeroinitializer -; VF8UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; VF8UF2-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP3]]) -; VF8UF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF8UF2-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; VF8UF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF8UF2-NEXT: br label %[[MIDDLE_SPLIT:.*]] ; VF8UF2: [[MIDDLE_SPLIT]]: ; VF8UF2-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: ; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF8UF2: [[VECTOR_EARLY_EXIT]]: ; VF8UF2-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP3]], i1 true) -; VF8UF2-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] +; VF8UF2-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]] ; VF8UF2-NEXT: br label %[[EXIT]] ; VF8UF2: [[SCALAR_PH]]: ; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] @@ -228,9 +216,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2: [[LOOP_LATCH]]: ; VF8UF2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 ; VF8UF2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; VF8UF2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; VF8UF2: [[EXIT]]: -; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ] +; VF8UF2-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ] ; VF8UF2-NEXT: ret i64 [[RES]] ; ; VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside( @@ -240,23 +228,19 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF16UF1: [[VECTOR_PH]]: ; 
VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; VF16UF1: [[VECTOR_BODY]]: -; VF16UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; VF16UF1-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0 ; VF16UF1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P_SRC]], i32 0 ; VF16UF1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; VF16UF1-NEXT: [[TMP3:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], zeroinitializer -; VF16UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; VF16UF1-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]]) -; VF16UF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF16UF1-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] -; VF16UF1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF16UF1-NEXT: br label %[[MIDDLE_SPLIT:.*]] ; VF16UF1: [[MIDDLE_SPLIT]]: ; VF16UF1-NEXT: br i1 [[TMP4]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: ; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] ; VF16UF1: [[VECTOR_EARLY_EXIT]]: ; VF16UF1-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> [[TMP3]], i1 true) -; VF16UF1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] +; VF16UF1-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]] ; VF16UF1-NEXT: br label %[[EXIT]] ; VF16UF1: [[SCALAR_PH]]: ; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] @@ -270,9 +254,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF16UF1: [[LOOP_LATCH]]: ; VF16UF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV1]], 1 ; VF16UF1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; VF16UF1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; VF16UF1: [[EXIT]]: -; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ] +; VF16UF1-NEXT: [[RES:%.*]] = phi i64 [ [[IV1]], %[[LOOP_HEADER]] ], [ 1, %[[LOOP_LATCH]] ], [ 1, %[[MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VECTOR_EARLY_EXIT]] ] ; VF16UF1-NEXT: ret i64 [[RES]] ; entry: From 842b57b77520abf202999946d3bb01b5dcabb179 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Mon, 31 Mar 2025 10:03:39 +0200 Subject: [PATCH 0075/1029] Reland [MS][clang] Add support for vector deleting destructors (#133451) Whereas it is UB in terms of the standard to delete an array of objects via pointer whose static type doesn't match its dynamic type, MSVC supports an extension allowing to do it. Aside from array deletion not working correctly in the mentioned case, currently not having this extension implemented causes clang to generate code that is not compatible with the code generated by MSVC, because clang always puts scalar deleting destructor to the vftable. This PR aims to resolve these problems. It was reverted due to link time errors in chromium with sanitizer coverage enabled, which is fixed by https://github.com/llvm/llvm-project/pull/131929 . The second commit of this PR also contains a fix for a runtime failure in chromium reported in https://github.com/llvm/llvm-project/pull/126240#issuecomment-2730216384 . 
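As a rough sketch of what the extension covers (the types and the helper here are illustrative, not from this change):

  struct S { virtual ~S(); };
  struct T : S { int Payload; ~T() override; };

  void destroy(S *Arr) {
    // UB by the standard when Arr really points at a T array, but MSVC
    // dispatches through the single vftable slot to the vector deleting
    // destructor, passing an implicit int whose bit 0 requests operator
    // delete and whose bit 1 selects the array path (which walks the array
    // cookie to find the element count).
    delete[] Arr;
  }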
Fixes https://github.com/llvm/llvm-project/issues/19772 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/VTableBuilder.h | 6 +- clang/include/clang/Basic/ABI.h | 9 +- clang/lib/AST/ItaniumMangle.cpp | 2 + clang/lib/AST/MicrosoftMangle.cpp | 22 +-- clang/lib/AST/VTableBuilder.cpp | 19 ++- clang/lib/CodeGen/CGCXX.cpp | 37 ++++- clang/lib/CodeGen/CGCXXABI.cpp | 14 ++ clang/lib/CodeGen/CGCXXABI.h | 7 + clang/lib/CodeGen/CGClass.cpp | 77 ++++++++- clang/lib/CodeGen/CGDebugInfo.cpp | 3 +- clang/lib/CodeGen/CGExprCXX.cpp | 42 ++++- clang/lib/CodeGen/CGVTables.cpp | 3 +- clang/lib/CodeGen/CodeGenModule.cpp | 48 ++++++ clang/lib/CodeGen/CodeGenModule.h | 6 + clang/lib/CodeGen/ItaniumCXXABI.cpp | 6 +- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 49 ++++-- .../CodeGenCXX/debug-info-windows-dtor.cpp | 2 +- clang/test/CodeGenCXX/dllexport.cpp | 2 +- .../microsoft-abi-extern-template.cpp | 2 +- .../CodeGenCXX/microsoft-abi-structors.cpp | 5 +- .../test/CodeGenCXX/microsoft-abi-thunks.cpp | 3 +- .../CodeGenCXX/microsoft-abi-vftables.cpp | 20 +-- .../microsoft-abi-virtual-inheritance.cpp | 17 +- ...multiple-nonvirtual-inheritance-vdtors.cpp | 18 +-- .../microsoft-abi-vtables-return-thunks.cpp | 2 +- ...crosoft-abi-vtables-single-inheritance.cpp | 20 +-- ...-vtables-virtual-inheritance-vtordisps.cpp | 30 ++-- ...rosoft-abi-vtables-virtual-inheritance.cpp | 18 +-- .../CodeGenCXX/microsoft-no-rtti-data.cpp | 2 +- .../microsoft-vector-deleting-dtors.cpp | 152 ++++++++++++++++++ clang/test/CodeGenCXX/vtable-consteval.cpp | 4 +- clang/test/Modules/vtable-windows.cppm | 2 +- clang/test/Profile/cxx-abc-deleting-dtor.cpp | 9 +- 34 files changed, 534 insertions(+), 125 deletions(-) create mode 100644 clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index d72beb3a479b0..4b8e09d051616 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -434,6 +434,7 @@ Windows Support - Clang now can process the `i128` and `ui128` integeral suffixes when MSVC extensions are enabled. This allows for properly processing ``intsafe.h`` in the Windows SDK. +- Clang now supports MSVC vector deleting destructors (GH19772). LoongArch Support ^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/VTableBuilder.h b/clang/include/clang/AST/VTableBuilder.h index a5de41dbc22f1..e1efe8cddcc5e 100644 --- a/clang/include/clang/AST/VTableBuilder.h +++ b/clang/include/clang/AST/VTableBuilder.h @@ -150,7 +150,7 @@ class VTableComponent { bool isRTTIKind() const { return isRTTIKind(getKind()); } - GlobalDecl getGlobalDecl() const { + GlobalDecl getGlobalDecl(bool HasVectorDeletingDtors) const { assert(isUsedFunctionPointerKind() && "GlobalDecl can be created only from virtual function"); @@ -161,7 +161,9 @@ class VTableComponent { case CK_CompleteDtorPointer: return GlobalDecl(DtorDecl, CXXDtorType::Dtor_Complete); case CK_DeletingDtorPointer: - return GlobalDecl(DtorDecl, CXXDtorType::Dtor_Deleting); + return GlobalDecl(DtorDecl, (HasVectorDeletingDtors) + ? CXXDtorType::Dtor_VectorDeleting + : CXXDtorType::Dtor_Deleting); case CK_VCallOffset: case CK_VBaseOffset: case CK_OffsetToTop: diff --git a/clang/include/clang/Basic/ABI.h b/clang/include/clang/Basic/ABI.h index 231bad799a42c..48969e4f295c3 100644 --- a/clang/include/clang/Basic/ABI.h +++ b/clang/include/clang/Basic/ABI.h @@ -31,10 +31,11 @@ enum CXXCtorType { /// C++ destructor types. 
enum CXXDtorType { - Dtor_Deleting, ///< Deleting dtor - Dtor_Complete, ///< Complete object dtor - Dtor_Base, ///< Base object dtor - Dtor_Comdat ///< The COMDAT used for dtors + Dtor_Deleting, ///< Deleting dtor + Dtor_Complete, ///< Complete object dtor + Dtor_Base, ///< Base object dtor + Dtor_Comdat, ///< The COMDAT used for dtors + Dtor_VectorDeleting ///< Vector deleting dtor }; } // end namespace clang diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 981cdb3c806b1..49a04861ae25d 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -6004,6 +6004,8 @@ void CXXNameMangler::mangleCXXDtorType(CXXDtorType T) { case Dtor_Comdat: Out << "D5"; break; + case Dtor_VectorDeleting: + llvm_unreachable("Itanium ABI does not use vector deleting dtors"); } } diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 15de407e122d8..7e964124a9fec 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -1484,8 +1484,9 @@ void MicrosoftCXXNameMangler::mangleCXXDtorType(CXXDtorType T) { // ::= ?_G # scalar deleting destructor case Dtor_Deleting: Out << "?_G"; return; // ::= ?_E # vector deleting destructor - // FIXME: Add a vector deleting dtor type. It goes in the vtable, so we need - // it. + case Dtor_VectorDeleting: + Out << "?_E"; + return; case Dtor_Comdat: llvm_unreachable("not expecting a COMDAT"); } @@ -2886,9 +2887,12 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T, // ::= @ # structors (they have no declared return type) if (IsStructor) { if (isa(D) && isStructorDecl(D)) { - // The scalar deleting destructor takes an extra int argument which is not - // reflected in the AST. - if (StructorType == Dtor_Deleting) { + // The deleting destructors take an extra argument of type int that + // indicates whether the storage for the object should be deleted and + // whether a single object or an array of objects is being destroyed. This + // extra argument is not reflected in the AST. + if (StructorType == Dtor_Deleting || + StructorType == Dtor_VectorDeleting) { Out << (PointersAre64Bit ? "PEAXI@Z" : "PAXI@Z"); return; } @@ -3861,10 +3865,10 @@ void MicrosoftMangleContextImpl::mangleCXXDtorThunk(const CXXDestructorDecl *DD, const ThunkInfo &Thunk, bool /*ElideOverrideInfo*/, raw_ostream &Out) { - // FIXME: Actually, the dtor thunk should be emitted for vector deleting - // dtors rather than scalar deleting dtors. Just use the vector deleting dtor - // mangling manually until we support both deleting dtor types. - assert(Type == Dtor_Deleting); + // The dtor thunk should use vector deleting dtor mangling, however as an + // optimization we may end up emitting only scalar deleting dtor body, so just + // use the vector deleting dtor mangling manually. 
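// For example (illustrative, derived from the rules above): for a class S,
// ??_GS@@UAEPAXI@Z names the scalar deleting destructor and
// ??_ES@@UAEPAXI@Z the vector deleting destructor; both return void* (PAX)
// and take the implicit integer flag parameter (the trailing I).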
+ assert(Type == Dtor_Deleting || Type == Dtor_VectorDeleting); msvc_hashing_ostream MHO(Out); MicrosoftCXXNameMangler Mangler(*this, MHO, DD, Type); Mangler.getStream() << "??_E"; diff --git a/clang/lib/AST/VTableBuilder.cpp b/clang/lib/AST/VTableBuilder.cpp index 6c97b8718c65e..21f9d343c6ee7 100644 --- a/clang/lib/AST/VTableBuilder.cpp +++ b/clang/lib/AST/VTableBuilder.cpp @@ -1735,8 +1735,8 @@ void ItaniumVTableBuilder::LayoutPrimaryAndSecondaryVTables( const CXXMethodDecl *MD = I.first; const MethodInfo &MI = I.second; if (const CXXDestructorDecl *DD = dyn_cast(MD)) { - MethodVTableIndices[GlobalDecl(DD, Dtor_Complete)] - = MI.VTableIndex - AddressPoint; + MethodVTableIndices[GlobalDecl(DD, Dtor_Complete)] = + MI.VTableIndex - AddressPoint; MethodVTableIndices[GlobalDecl(DD, Dtor_Deleting)] = MI.VTableIndex + 1 - AddressPoint; } else { @@ -2657,7 +2657,11 @@ class VFTableBuilder { MethodVFTableLocation Loc(MI.VBTableIndex, WhichVFPtr.getVBaseWithVPtr(), WhichVFPtr.NonVirtualOffset, MI.VFTableIndex); if (const CXXDestructorDecl *DD = dyn_cast(MD)) { - MethodVFTableLocations[GlobalDecl(DD, Dtor_Deleting)] = Loc; + // In Microsoft ABI vftable always references vector deleting dtor. + CXXDtorType DtorTy = Context.getTargetInfo().getCXXABI().isMicrosoft() + ? Dtor_VectorDeleting + : Dtor_Deleting; + MethodVFTableLocations[GlobalDecl(DD, DtorTy)] = Loc; } else { MethodVFTableLocations[MD] = Loc; } @@ -3287,7 +3291,10 @@ void VFTableBuilder::dumpLayout(raw_ostream &Out) { const CXXDestructorDecl *DD = Component.getDestructorDecl(); DD->printQualifiedName(Out); - Out << "() [scalar deleting]"; + if (Context.getTargetInfo().getCXXABI().isMicrosoft()) + Out << "() [vector deleting]"; + else + Out << "() [scalar deleting]"; if (DD->isPureVirtual()) Out << " [pure]"; @@ -3757,7 +3764,7 @@ void MicrosoftVTableContext::dumpMethodLocations( PredefinedIdentKind::PrettyFunctionNoVirtual, MD); if (isa(MD)) { - IndicesMap[I.second] = MethodName + " [scalar deleting]"; + IndicesMap[I.second] = MethodName + " [vector deleting]"; } else { IndicesMap[I.second] = MethodName; } @@ -3873,7 +3880,7 @@ MicrosoftVTableContext::getMethodVFTableLocation(GlobalDecl GD) { assert(hasVtableSlot(cast(GD.getDecl())) && "Only use this method for virtual methods or dtors"); if (isa(GD.getDecl())) - assert(GD.getDtorType() == Dtor_Deleting); + assert(GD.getDtorType() == Dtor_VectorDeleting); GD = GD.getCanonicalDecl(); diff --git a/clang/lib/CodeGen/CGCXX.cpp b/clang/lib/CodeGen/CGCXX.cpp index 78a7b021855b7..6f47e24eed5b3 100644 --- a/clang/lib/CodeGen/CGCXX.cpp +++ b/clang/lib/CodeGen/CGCXX.cpp @@ -175,7 +175,6 @@ bool CodeGenModule::TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D) { // requires explicit comdat support in the IL. if (llvm::GlobalValue::isWeakForLinker(TargetLinkage)) return true; - // Create the alias with no name. auto *Alias = llvm::GlobalAlias::create(AliasValueType, 0, Linkage, "", Aliasee, &getModule()); @@ -201,6 +200,42 @@ bool CodeGenModule::TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D) { return false; } +/// Emit a definition as a global alias for another definition, unconditionally. 
+void CodeGenModule::EmitDefinitionAsAlias(GlobalDecl AliasDecl, + GlobalDecl TargetDecl) { + + llvm::Type *AliasValueType = getTypes().GetFunctionType(AliasDecl); + + StringRef MangledName = getMangledName(AliasDecl); + llvm::GlobalValue *Entry = GetGlobalValue(MangledName); + if (Entry && !Entry->isDeclaration()) + return; + auto *Aliasee = cast(GetAddrOfGlobal(TargetDecl)); + + // Determine the linkage type for the alias. + llvm::GlobalValue::LinkageTypes Linkage = getFunctionLinkage(AliasDecl); + + // Create the alias with no name. + auto *Alias = llvm::GlobalAlias::create(AliasValueType, 0, Linkage, "", + Aliasee, &getModule()); + // Destructors are always unnamed_addr. + Alias->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); + + if (Entry) { + assert(Entry->getValueType() == AliasValueType && + Entry->getAddressSpace() == Alias->getAddressSpace() && + "declaration exists with different type"); + Alias->takeName(Entry); + Entry->replaceAllUsesWith(Alias); + Entry->eraseFromParent(); + } else { + Alias->setName(MangledName); + } + + // Set any additional necessary attributes for the alias. + SetCommonAttributes(AliasDecl, Alias); +} + llvm::Function *CodeGenModule::codegenCXXStructor(GlobalDecl GD) { const CGFunctionInfo &FnInfo = getTypes().arrangeCXXStructorDeclaration(GD); auto *Fn = cast( diff --git a/clang/lib/CodeGen/CGCXXABI.cpp b/clang/lib/CodeGen/CGCXXABI.cpp index fd35f2adfa2d2..9f77fbec21380 100644 --- a/clang/lib/CodeGen/CGCXXABI.cpp +++ b/clang/lib/CodeGen/CGCXXABI.cpp @@ -272,6 +272,20 @@ void CGCXXABI::ReadArrayCookie(CodeGenFunction &CGF, Address ptr, numElements = readArrayCookieImpl(CGF, allocAddr, cookieSize); } +void CGCXXABI::ReadArrayCookie(CodeGenFunction &CGF, Address ptr, + QualType eltTy, llvm::Value *&numElements, + llvm::Value *&allocPtr, CharUnits &cookieSize) { + assert(eltTy.isDestructedType()); + + // Derive a char* in the same address space as the pointer. + ptr = ptr.withElementType(CGF.Int8Ty); + + cookieSize = getArrayCookieSizeImpl(eltTy); + Address allocAddr = CGF.Builder.CreateConstInBoundsByteGEP(ptr, -cookieSize); + allocPtr = allocAddr.emitRawPointer(CGF); + numElements = readArrayCookieImpl(CGF, allocAddr, cookieSize); +} + llvm::Value *CGCXXABI::readArrayCookieImpl(CodeGenFunction &CGF, Address ptr, CharUnits cookieSize) { diff --git a/clang/lib/CodeGen/CGCXXABI.h b/clang/lib/CodeGen/CGCXXABI.h index 687ff7fb84444..148a7ba6df7e6 100644 --- a/clang/lib/CodeGen/CGCXXABI.h +++ b/clang/lib/CodeGen/CGCXXABI.h @@ -275,6 +275,7 @@ class CGCXXABI { virtual CatchTypeInfo getCatchAllTypeInfo(); virtual bool shouldTypeidBeNullChecked(QualType SrcRecordTy) = 0; + virtual bool hasVectorDeletingDtors() = 0; virtual void EmitBadTypeidCall(CodeGenFunction &CGF) = 0; virtual llvm::Value *EmitTypeid(CodeGenFunction &CGF, QualType SrcRecordTy, Address ThisPtr, @@ -575,6 +576,12 @@ class CGCXXABI { QualType ElementType, llvm::Value *&NumElements, llvm::Value *&AllocPtr, CharUnits &CookieSize); + /// Reads the array cookie associated with the given pointer, + /// that should have one. + void ReadArrayCookie(CodeGenFunction &CGF, Address Ptr, QualType ElementType, + llvm::Value *&NumElements, llvm::Value *&AllocPtr, + CharUnits &CookieSize); + /// Return whether the given global decl needs a VTT parameter. 
virtual bool NeedsVTTParameter(GlobalDecl GD); diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 98c93b5bb4883..f508930cc9f2b 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -1432,6 +1432,70 @@ static bool CanSkipVTablePointerInitialization(CodeGenFunction &CGF, return true; } +static void EmitConditionalArrayDtorCall(const CXXDestructorDecl *DD, + CodeGenFunction &CGF, + llvm::Value *ShouldDeleteCondition) { + Address ThisPtr = CGF.LoadCXXThisAddress(); + llvm::BasicBlock *ScalarBB = CGF.createBasicBlock("dtor.scalar"); + llvm::BasicBlock *callDeleteBB = + CGF.createBasicBlock("dtor.call_delete_after_array_destroy"); + llvm::BasicBlock *VectorBB = CGF.createBasicBlock("dtor.vector"); + auto *CondTy = cast(ShouldDeleteCondition->getType()); + llvm::Value *CheckTheBitForArrayDestroy = CGF.Builder.CreateAnd( + ShouldDeleteCondition, llvm::ConstantInt::get(CondTy, 2)); + llvm::Value *ShouldDestroyArray = + CGF.Builder.CreateIsNull(CheckTheBitForArrayDestroy); + CGF.Builder.CreateCondBr(ShouldDestroyArray, ScalarBB, VectorBB); + + CGF.EmitBlock(VectorBB); + + llvm::Value *numElements = nullptr; + llvm::Value *allocatedPtr = nullptr; + CharUnits cookieSize; + QualType EltTy = DD->getThisType()->getPointeeType(); + CGF.CGM.getCXXABI().ReadArrayCookie(CGF, ThisPtr, EltTy, numElements, + allocatedPtr, cookieSize); + + // Destroy the elements. + QualType::DestructionKind dtorKind = EltTy.isDestructedType(); + + assert(dtorKind); + assert(numElements && "no element count for a type with a destructor!"); + + CharUnits elementSize = CGF.getContext().getTypeSizeInChars(EltTy); + CharUnits elementAlign = + ThisPtr.getAlignment().alignmentOfArrayElement(elementSize); + + llvm::Value *arrayBegin = ThisPtr.emitRawPointer(CGF); + llvm::Value *arrayEnd = CGF.Builder.CreateInBoundsGEP( + ThisPtr.getElementType(), arrayBegin, numElements, "delete.end"); + + // We already checked that the array is not 0-length before entering vector + // deleting dtor. + CGF.emitArrayDestroy(arrayBegin, arrayEnd, EltTy, elementAlign, + CGF.getDestroyer(dtorKind), + /*checkZeroLength*/ false, CGF.needsEHCleanup(dtorKind)); + + llvm::BasicBlock *VectorBBCont = CGF.createBasicBlock("dtor.vector.cont"); + CGF.EmitBlock(VectorBBCont); + + llvm::Value *CheckTheBitForDeleteCall = CGF.Builder.CreateAnd( + ShouldDeleteCondition, llvm::ConstantInt::get(CondTy, 1)); + + llvm::Value *ShouldCallDelete = + CGF.Builder.CreateIsNull(CheckTheBitForDeleteCall); + CGF.Builder.CreateCondBr(ShouldCallDelete, CGF.ReturnBlock.getBlock(), + callDeleteBB); + CGF.EmitBlock(callDeleteBB); + const CXXDestructorDecl *Dtor = cast(CGF.CurCodeDecl); + const CXXRecordDecl *ClassDecl = Dtor->getParent(); + CGF.EmitDeleteCall(Dtor->getOperatorDelete(), allocatedPtr, + CGF.getContext().getTagDeclType(ClassDecl)); + + CGF.EmitBranchThroughCleanup(CGF.ReturnBlock); + CGF.EmitBlock(ScalarBB); +} + /// EmitDestructorBody - Emits the body of the current destructor. void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) { const CXXDestructorDecl *Dtor = cast(CurGD.getDecl()); @@ -1461,7 +1525,9 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) { // outside of the function-try-block, which means it's always // possible to delegate the destructor body to the complete // destructor. Do so. 
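// Aside (behavioral sketch of EmitConditionalArrayDtorCall above, not the
// literal IR it emits):
//   if (Flags & 2) {                  // array bit, set by delete[]
//     n = <element count from the array cookie in front of `this`>;
//     <destroy elements this[0..n)>;
//     if (Flags & 1) operator delete(<allocation start, i.e. the cookie>);
//   } else {
//     <run the usual scalar deleting path>;
//     if (Flags & 1) operator delete(this);
//   }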
- if (DtorType == Dtor_Deleting) { + if (DtorType == Dtor_Deleting || DtorType == Dtor_VectorDeleting) { + if (CXXStructorImplicitParamValue && DtorType == Dtor_VectorDeleting) + EmitConditionalArrayDtorCall(Dtor, *this, CXXStructorImplicitParamValue); RunCleanupsScope DtorEpilogue(*this); EnterDtorCleanups(Dtor, Dtor_Deleting); if (HaveInsertPoint()) { @@ -1490,6 +1556,8 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) { switch (DtorType) { case Dtor_Comdat: llvm_unreachable("not expecting a COMDAT"); case Dtor_Deleting: llvm_unreachable("already handled deleting case"); + case Dtor_VectorDeleting: + llvm_unreachable("already handled vector deleting case"); case Dtor_Complete: assert((Body || getTarget().getCXXABI().isMicrosoft()) && @@ -1572,7 +1640,6 @@ namespace { return CGF.EmitScalarExpr(ThisArg); return CGF.LoadCXXThis(); } - /// Call the operator delete associated with the current destructor. struct CallDtorDelete final : EHScopeStack::Cleanup { CallDtorDelete() {} @@ -1591,8 +1658,10 @@ namespace { bool ReturnAfterDelete) { llvm::BasicBlock *callDeleteBB = CGF.createBasicBlock("dtor.call_delete"); llvm::BasicBlock *continueBB = CGF.createBasicBlock("dtor.continue"); - llvm::Value *ShouldCallDelete - = CGF.Builder.CreateIsNull(ShouldDeleteCondition); + auto *CondTy = cast(ShouldDeleteCondition->getType()); + llvm::Value *CheckTheBit = CGF.Builder.CreateAnd( + ShouldDeleteCondition, llvm::ConstantInt::get(CondTy, 1)); + llvm::Value *ShouldCallDelete = CGF.Builder.CreateIsNull(CheckTheBit); CGF.Builder.CreateCondBr(ShouldCallDelete, continueBB, callDeleteBB); CGF.EmitBlock(callDeleteBB); diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index ba0dec99d6ae8..52aa956121d73 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -2119,7 +2119,8 @@ llvm::DISubprogram *CGDebugInfo::CreateCXXMemberFunction( // Emit MS ABI vftable information. There is only one entry for the // deleting dtor. const auto *DD = dyn_cast(Method); - GlobalDecl GD = DD ? GlobalDecl(DD, Dtor_Deleting) : GlobalDecl(Method); + GlobalDecl GD = + DD ? GlobalDecl(DD, Dtor_VectorDeleting) : GlobalDecl(Method); MethodVFTableLocation ML = CGM.getMicrosoftVTableContext().getMethodVFTableLocation(GD); VIndex = ML.Index; diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 5d96959065dd9..5c11c0bceade7 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1209,6 +1209,8 @@ void CodeGenFunction::EmitNewArrayInitializer( EmitCXXAggrConstructorCall(Ctor, NumElements, CurPtr, CCE, /*NewPointerIsChecked*/true, CCE->requiresZeroInitialization()); + if (CGM.getCXXABI().hasVectorDeletingDtors()) + CGM.requireVectorDestructorDefinition(Ctor->getParent()); return; } @@ -1912,10 +1914,8 @@ static void EmitDestroyingObjectDelete(CodeGenFunction &CGF, /// Emit the code for deleting a single object. /// \return \c true if we started emitting UnconditionalDeleteBlock, \c false /// if not. 
-static bool EmitObjectDelete(CodeGenFunction &CGF, - const CXXDeleteExpr *DE, - Address Ptr, - QualType ElementType, +static bool EmitObjectDelete(CodeGenFunction &CGF, const CXXDeleteExpr *DE, + Address Ptr, QualType ElementType, llvm::BasicBlock *UnconditionalDeleteBlock) { // C++11 [expr.delete]p3: // If the static type of the object to be deleted is different from its @@ -2131,6 +2131,40 @@ void CodeGenFunction::EmitCXXDeleteExpr(const CXXDeleteExpr *E) { assert(ConvertTypeForMem(DeleteTy) == Ptr.getElementType()); + if (E->isArrayForm() && CGM.getCXXABI().hasVectorDeletingDtors()) { + if (auto *RD = DeleteTy->getAsCXXRecordDecl()) { + auto *Dtor = RD->getDestructor(); + if (Dtor && Dtor->isVirtual()) { + llvm::Value *NumElements = nullptr; + llvm::Value *AllocatedPtr = nullptr; + CharUnits CookieSize; + llvm::BasicBlock *bodyBB = createBasicBlock("vdtor.call"); + llvm::BasicBlock *doneBB = createBasicBlock("vdtor.nocall"); + // Check array cookie to see if the array has 0 length. Don't call + // the destructor in that case. + CGM.getCXXABI().ReadArrayCookie(*this, Ptr, E, DeleteTy, NumElements, + AllocatedPtr, CookieSize); + + auto *CondTy = cast(NumElements->getType()); + llvm::Value *isEmpty = Builder.CreateICmpEQ( + NumElements, llvm::ConstantInt::get(CondTy, 0)); + Builder.CreateCondBr(isEmpty, doneBB, bodyBB); + + // Delete cookie for empty array. + const FunctionDecl *operatorDelete = E->getOperatorDelete(); + EmitBlock(doneBB); + EmitDeleteCall(operatorDelete, AllocatedPtr, DeleteTy, NumElements, + CookieSize); + EmitBranch(DeleteEnd); + + EmitBlock(bodyBB); + if (!EmitObjectDelete(*this, E, Ptr, DeleteTy, DeleteEnd)) + EmitBlock(DeleteEnd); + return; + } + } + } + if (E->isArrayForm()) { EmitArrayDelete(*this, E, Ptr, DeleteTy); EmitBlock(DeleteEnd); diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index c7b36957b2e57..dcd1fa77fa834 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -769,7 +769,8 @@ void CodeGenVTables::addVTableComponent(ConstantArrayBuilder &builder, case VTableComponent::CK_FunctionPointer: case VTableComponent::CK_CompleteDtorPointer: case VTableComponent::CK_DeletingDtorPointer: { - GlobalDecl GD = component.getGlobalDecl(); + GlobalDecl GD = + component.getGlobalDecl(CGM.getCXXABI().hasVectorDeletingDtors()); const bool IsThunk = nextVTableThunkIndex < layout.vtable_thunks().size() && diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 43345da268868..8f9cf965af2b9 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -7937,3 +7937,51 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) { NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx); } + +bool CodeGenModule::classNeedsVectorDestructor(const CXXRecordDecl *RD) { + CXXDestructorDecl *Dtor = RD->getDestructor(); + // The compiler can't know if new[]/delete[] will be used outside of the DLL, + // so just force vector deleting destructor emission if dllexport is present. + // This matches MSVC behavior. 
+ if (Dtor && Dtor->isVirtual() && Dtor->isDefined() && + Dtor->hasAttr()) + return true; + + assert(getCXXABI().hasVectorDeletingDtors()); + return RequireVectorDeletingDtor.count(RD); +} + +void CodeGenModule::requireVectorDestructorDefinition(const CXXRecordDecl *RD) { + assert(getCXXABI().hasVectorDeletingDtors()); + RequireVectorDeletingDtor.insert(RD); + + // To reduce code size in general case we lazily emit scalar deleting + // destructor definition and an alias from vector deleting destructor to + // scalar deleting destructor. It may happen that we first emitted the scalar + // deleting destructor definition and the alias and then discovered that the + // definition of the vector deleting destructor is required. Then we need to + // remove the alias and the scalar deleting destructor and queue vector + // deleting destructor body for emission. Check if that is the case. + CXXDestructorDecl *DtorD = RD->getDestructor(); + GlobalDecl ScalarDtorGD(DtorD, Dtor_Deleting); + StringRef MangledName = getMangledName(ScalarDtorGD); + llvm::GlobalValue *Entry = GetGlobalValue(MangledName); + if (Entry && !Entry->isDeclaration()) { + GlobalDecl VectorDtorGD(DtorD, Dtor_VectorDeleting); + StringRef VDName = getMangledName(VectorDtorGD); + llvm::GlobalValue *VDEntry = GetGlobalValue(VDName); + // It exists and it should be an alias. + assert(VDEntry && isa(VDEntry)); + auto *NewFn = llvm::Function::Create( + cast(VDEntry->getValueType()), + llvm::Function::ExternalLinkage, VDName, &getModule()); + SetFunctionAttributes(VectorDtorGD, NewFn, /*IsIncompleteFunction*/ false, + /*IsThunk*/ false); + NewFn->takeName(VDEntry); + VDEntry->replaceAllUsesWith(NewFn); + VDEntry->eraseFromParent(); + Entry->replaceAllUsesWith(NewFn); + Entry->eraseFromParent(); + addDeferredDeclToEmit(VectorDtorGD); + } +} diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 46de3d868f901..2cf15e24180b3 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -528,6 +528,9 @@ class CodeGenModule : public CodeGenTypeCache { /// that we don't re-emit the initializer. llvm::DenseMap DelayedCXXInitPosition; + /// To remember which types did require a vector deleting dtor. + llvm::SmallPtrSet RequireVectorDeletingDtor; + typedef std::pair GlobalInitData; @@ -1542,6 +1545,7 @@ class CodeGenModule : public CodeGenTypeCache { void EmitGlobal(GlobalDecl D); bool TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D); + void EmitDefinitionAsAlias(GlobalDecl Alias, GlobalDecl Target); llvm::GlobalValue *GetGlobalValue(StringRef Ref); @@ -1809,6 +1813,8 @@ class CodeGenModule : public CodeGenTypeCache { // behavior. So projects like the Linux kernel can rely on it. 
return !getLangOpts().CPlusPlus; } + void requireVectorDestructorDefinition(const CXXRecordDecl *RD); + bool classNeedsVectorDestructor(const CXXRecordDecl *RD); private: bool shouldDropDLLAttribute(const Decl *D, const llvm::GlobalValue *GV) const; diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 77e995b4c933a..38e3a63ebfb11 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -90,6 +90,8 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI { case Dtor_Comdat: llvm_unreachable("emitting dtor comdat as function?"); + case Dtor_VectorDeleting: + llvm_unreachable("unexpected dtor kind for this ABI"); } llvm_unreachable("bad dtor kind"); } @@ -179,6 +181,7 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI { } bool shouldTypeidBeNullChecked(QualType SrcRecordTy) override; + bool hasVectorDeletingDtors() override { return false; } void EmitBadTypeidCall(CodeGenFunction &CGF) override; llvm::Value *EmitTypeid(CodeGenFunction &CGF, QualType SrcRecordTy, Address ThisPtr, @@ -448,7 +451,8 @@ class ItaniumCXXABI : public CodeGen::CGCXXABI { if (!IsInlined) continue; - StringRef Name = CGM.getMangledName(VtableComponent.getGlobalDecl()); + StringRef Name = CGM.getMangledName( + VtableComponent.getGlobalDecl(/*HasVectorDeletingDtors=*/false)); auto *Entry = CGM.GetGlobalValue(Name); // This checks if virtual inline function has already been emitted. // Note that it is possible that this inline function would be emitted diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 40371d99e23e1..464d4370284fb 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -70,8 +70,8 @@ class MicrosoftCXXABI : public CGCXXABI { switch (GD.getDtorType()) { case Dtor_Complete: case Dtor_Deleting: + case Dtor_VectorDeleting: return true; - case Dtor_Base: return false; @@ -145,6 +145,7 @@ class MicrosoftCXXABI : public CGCXXABI { } bool shouldTypeidBeNullChecked(QualType SrcRecordTy) override; + bool hasVectorDeletingDtors() override { return true; } void EmitBadTypeidCall(CodeGenFunction &CGF) override; llvm::Value *EmitTypeid(CodeGenFunction &CGF, QualType SrcRecordTy, Address ThisPtr, @@ -260,7 +261,7 @@ class MicrosoftCXXABI : public CGCXXABI { // There's only Dtor_Deleting in vftable but it shares the this // adjustment with the base one, so look up the deleting one instead. 
- LookupGD = GlobalDecl(DD, Dtor_Deleting); + LookupGD = GlobalDecl(DD, Dtor_VectorDeleting); } MethodVFTableLocation ML = CGM.getMicrosoftVTableContext().getMethodVFTableLocation(LookupGD); @@ -342,8 +343,8 @@ class MicrosoftCXXABI : public CGCXXABI { void adjustCallArgsForDestructorThunk(CodeGenFunction &CGF, GlobalDecl GD, CallArgList &CallArgs) override { - assert(GD.getDtorType() == Dtor_Deleting && - "Only deleting destructor thunks are available in this ABI"); + assert(GD.getDtorType() == Dtor_VectorDeleting && + "Only vector deleting destructor thunks are available in this ABI"); CallArgs.add(RValue::get(getStructorImplicitParamValue(CGF)), getContext().IntTy); } @@ -1090,7 +1091,8 @@ bool MicrosoftCXXABI::HasThisReturn(GlobalDecl GD) const { static bool isDeletingDtor(GlobalDecl GD) { return isa(GD.getDecl()) && - GD.getDtorType() == Dtor_Deleting; + (GD.getDtorType() == Dtor_Deleting || + GD.getDtorType() == Dtor_VectorDeleting); } bool MicrosoftCXXABI::hasMostDerivedReturn(GlobalDecl GD) const { @@ -1343,7 +1345,8 @@ MicrosoftCXXABI::buildStructorSignature(GlobalDecl GD, AddedStructorArgCounts Added; // TODO: 'for base' flag if (isa(GD.getDecl()) && - GD.getDtorType() == Dtor_Deleting) { + (GD.getDtorType() == Dtor_Deleting || + GD.getDtorType() == Dtor_VectorDeleting)) { // The scalar deleting destructor takes an implicit int parameter. ArgTys.push_back(getContext().IntTy); ++Added.Suffix; @@ -1375,7 +1378,7 @@ void MicrosoftCXXABI::setCXXDestructorDLLStorage(llvm::GlobalValue *GV, CXXDtorType DT) const { // Deleting destructor variants are never imported or exported. Give them the // default storage class. - if (DT == Dtor_Deleting) { + if (DT == Dtor_Deleting || DT == Dtor_VectorDeleting) { GV->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass); } else { const NamedDecl *ND = Dtor; @@ -1409,6 +1412,12 @@ llvm::GlobalValue::LinkageTypes MicrosoftCXXABI::getCXXDestructorLinkage( // and are emitted everywhere they are used. They are internal if the class // is internal. return llvm::GlobalValue::LinkOnceODRLinkage; + case Dtor_VectorDeleting: + // Use the weak, non-ODR linkage for vector deleting destructors to block + // inlining. This enables an MS ABI code-size saving optimization that + // allows us to avoid emitting array deletion code when arrays of a given + // type are not allocated within the final linkage unit. + return llvm::GlobalValue::WeakAnyLinkage; case Dtor_Comdat: llvm_unreachable("MS C++ ABI does not support comdat dtors"); } @@ -1440,7 +1449,7 @@ MicrosoftCXXABI::getVirtualFunctionPrologueThisAdjustment(GlobalDecl GD) { // There's no Dtor_Base in vftable but it shares the this adjustment with // the deleting one, so look it up instead. - GD = GlobalDecl(DD, Dtor_Deleting); + GD = GlobalDecl(DD, Dtor_VectorDeleting); } MethodVFTableLocation ML = @@ -1489,7 +1498,7 @@ Address MicrosoftCXXABI::adjustThisArgumentForVirtualFunctionCall( // There's only Dtor_Deleting in vftable but it shares the this adjustment // with the base one, so look up the deleting one instead. 
- LookupGD = GlobalDecl(DD, Dtor_Deleting); + LookupGD = GlobalDecl(DD, Dtor_VectorDeleting); } MethodVFTableLocation ML = CGM.getMicrosoftVTableContext().getMethodVFTableLocation(LookupGD); @@ -2002,20 +2011,20 @@ llvm::Value *MicrosoftCXXABI::EmitVirtualDestructorCall( auto *D = dyn_cast(E); assert((CE != nullptr) ^ (D != nullptr)); assert(CE == nullptr || CE->arg_begin() == CE->arg_end()); - assert(DtorType == Dtor_Deleting || DtorType == Dtor_Complete); + assert(DtorType == Dtor_VectorDeleting || DtorType == Dtor_Complete || + DtorType == Dtor_Deleting); // We have only one destructor in the vftable but can get both behaviors // by passing an implicit int parameter. - GlobalDecl GD(Dtor, Dtor_Deleting); + GlobalDecl GD(Dtor, Dtor_VectorDeleting); const CGFunctionInfo *FInfo = &CGM.getTypes().arrangeCXXStructorDeclaration(GD); llvm::FunctionType *Ty = CGF.CGM.getTypes().GetFunctionType(*FInfo); CGCallee Callee = CGCallee::forVirtual(CE, GD, This, Ty); ASTContext &Context = getContext(); - llvm::Value *ImplicitParam = llvm::ConstantInt::get( - llvm::IntegerType::getInt32Ty(CGF.getLLVMContext()), - DtorType == Dtor_Deleting); + uint32_t Flags = ((D && D->isArrayForm()) << 1) | (DtorType == Dtor_Deleting); + llvm::Value *ImplicitParam = CGF.Builder.getInt32(Flags); QualType ThisTy; if (CE) { @@ -4056,6 +4065,18 @@ void MicrosoftCXXABI::emitCXXStructor(GlobalDecl GD) { if (GD.getDtorType() == Dtor_Base && !CGM.TryEmitBaseDestructorAsAlias(dtor)) return; + if (GD.getDtorType() == Dtor_VectorDeleting && + !CGM.classNeedsVectorDestructor(dtor->getParent())) { + // Create GlobalDecl object with the correct type for the scalar + // deleting destructor. + GlobalDecl ScalarDtorGD(dtor, Dtor_Deleting); + + // Emit an alias from the vector deleting destructor to the scalar deleting + // destructor. 
+ CGM.EmitDefinitionAsAlias(GD, ScalarDtorGD); + return; + } + llvm::Function *Fn = CGM.codegenCXXStructor(GD); if (Fn->isWeakForLinker()) Fn->setComdat(CGM.getModule().getOrInsertComdat(Fn->getName())); diff --git a/clang/test/CodeGenCXX/debug-info-windows-dtor.cpp b/clang/test/CodeGenCXX/debug-info-windows-dtor.cpp index beea56ce7368b..ffef45b9f7d1b 100644 --- a/clang/test/CodeGenCXX/debug-info-windows-dtor.cpp +++ b/clang/test/CodeGenCXX/debug-info-windows-dtor.cpp @@ -16,7 +16,7 @@ struct AB: A, B { template struct AB; // CHECK: define {{.*}}@"??_E?$AB@H@@W3AEPAXI@Z"({{.*}} !dbg [[THUNK_VEC_DEL_DTOR:![0-9]*]] -// CHECK: call {{.*}}@"??_G?$AB@H@@UAEPAXI@Z"({{.*}}) #{{[0-9]*}}, !dbg [[THUNK_LOC:![0-9]*]] +// CHECK: call {{.*}}@"??_E?$AB@H@@UAEPAXI@Z"({{.*}}) #{{[0-9]*}}, !dbg [[THUNK_LOC:![0-9]*]] // CHECK: define // CHECK: [[THUNK_VEC_DEL_DTOR]] = distinct !DISubprogram diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp index c8ac526f4cbe3..16eaac75e702f 100644 --- a/clang/test/CodeGenCXX/dllexport.cpp +++ b/clang/test/CodeGenCXX/dllexport.cpp @@ -631,7 +631,7 @@ struct __declspec(dllexport) Y { struct __declspec(dllexport) Z { virtual ~Z() {} }; // The scalar deleting dtor does not get exported: -// M32-DAG: define linkonce_odr dso_local x86_thiscallcc ptr @"??_GZ@@UAEPAXI@Z" +// M32-DAG: define weak dso_local x86_thiscallcc ptr @"??_EZ@@UAEPAXI@Z" // The user-defined dtor does get exported: diff --git a/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp b/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp index ea12aa64ae305..67df330bc3263 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-extern-template.cpp @@ -4,7 +4,7 @@ // own copy the vftable when emitting the available externally constructor. // CHECK: @"??_7?$Foo@H@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ -// CHECK-SAME: ptr @"??_G?$Foo@H@@UEAAPEAXI@Z" +// CHECK-SAME: ptr @"??_E?$Foo@H@@UEAAPEAXI@Z" // CHECK-SAME: ] }, comdat // CHECK-LABEL: define dso_local noundef ptr @"?f@@YAPEAU?$Foo@H@@XZ"() diff --git a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp index 07abc3d065e5e..2ff7391ec8c8f 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-structors.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-structors.cpp @@ -52,7 +52,8 @@ struct C { // DTORS: store ptr %{{.*}}, ptr %[[RETVAL:retval]] // DTORS: %[[SHOULD_DELETE_VALUE:[0-9a-z._]+]] = load i32, ptr %[[SHOULD_DELETE_VAR]] // DTORS: call x86_thiscallcc void @"??1C@basic@@UAE@XZ"(ptr {{[^,]*}} %[[THIS:[0-9a-z]+]]) -// DTORS-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[SHOULD_DELETE_VALUE]], 0 +// DTORS-NEXT: %[[AND:[0-9]+]] = and i32 %[[SHOULD_DELETE_VALUE]], 1 +// DTORS-NEXT: %[[CONDITION:[0-9]+]] = icmp eq i32 %[[AND]], 0 // DTORS-NEXT: br i1 %[[CONDITION]], label %[[CONTINUE_LABEL:[0-9a-z._]+]], label %[[CALL_DELETE_LABEL:[0-9a-z._]+]] // // DTORS: [[CALL_DELETE_LABEL]] @@ -166,7 +167,7 @@ void foo() { // DTORS2-LABEL: define linkonce_odr dso_local x86_thiscallcc ptr @"??_EC@dtor_in_second_nvbase@@W3AEPAXI@Z"(ptr %this, i32 %should_call_delete) // Do an adjustment from B* to C*. 
// DTORS2: getelementptr i8, ptr %{{.*}}, i32 -4 -// DTORS2: %[[CALL:.*]] = tail call x86_thiscallcc ptr @"??_GC@dtor_in_second_nvbase@@UAEPAXI@Z" +// DTORS2: %[[CALL:.*]] = tail call x86_thiscallcc ptr @"??_EC@dtor_in_second_nvbase@@UAEPAXI@Z" // DTORS2: ret ptr %[[CALL]] } diff --git a/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp b/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp index 38aa81253ccad..83ec158ff7f51 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-thunks.cpp @@ -63,8 +63,7 @@ C::C() {} // Emits vftable and forces thunk generation. // CODEGEN-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_EC@@W3AEPAXI@Z"(ptr noundef %this, i32 noundef %should_call_delete) {{.*}} comdat // CODEGEN: getelementptr i8, ptr {{.*}}, i32 -4 -// FIXME: should actually call _EC, not _GC. -// CODEGEN: call x86_thiscallcc noundef ptr @"??_GC@@UAEPAXI@Z" +// CODEGEN: call x86_thiscallcc noundef ptr @"??_EC@@UAEPAXI@Z" // CODEGEN: ret // CODEGEN-LABEL: define linkonce_odr dso_local x86_thiscallcc void @"?public_f@C@@W3AEXXZ"(ptr diff --git a/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp b/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp index bc278bdb847fc..7ceb15e40e582 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-vftables.cpp @@ -8,38 +8,38 @@ struct S { virtual ~S(); } s; -// RTTI-DAG: [[VTABLE_S:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4S@@6B@", ptr @"??_GS@@UAEPAXI@Z"] }, comdat($"??_7S@@6B@") +// RTTI-DAG: [[VTABLE_S:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4S@@6B@", ptr @"??_ES@@UAEPAXI@Z"] }, comdat($"??_7S@@6B@") // RTTI-DAG: @"??_7S@@6B@" = unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_S]], i32 0, i32 0, i32 1) -// NO-RTTI-DAG: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GS@@UAEPAXI@Z"] } +// NO-RTTI-DAG: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_ES@@UAEPAXI@Z"] } struct __declspec(dllimport) U { virtual ~U(); } u; -// RTTI-DAG: [[VTABLE_U:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4U@@6B@", ptr @"??_GU@@UAEPAXI@Z"] } +// RTTI-DAG: [[VTABLE_U:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4U@@6B@", ptr @"??_EU@@UAEPAXI@Z"] } // RTTI-DAG: @"??_SU@@6B@" = unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_U]], i32 0, i32 0, i32 1) -// NO-RTTI-DAG: @"??_SU@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GU@@UAEPAXI@Z"] } +// NO-RTTI-DAG: @"??_SU@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_EU@@UAEPAXI@Z"] } struct __declspec(dllexport) V { virtual ~V(); } v; -// RTTI-DAG: [[VTABLE_V:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4V@@6B@", ptr @"??_GV@@UAEPAXI@Z"] }, comdat($"??_7V@@6B@") +// RTTI-DAG: [[VTABLE_V:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4V@@6B@", ptr @"??_EV@@UAEPAXI@Z"] }, comdat($"??_7V@@6B@") // RTTI-DAG: @"??_7V@@6B@" = dllexport unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_V]], i32 0, i32 0, i32 1) -// NO-RTTI-DAG: @"??_7V@@6B@" = weak_odr dllexport unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GV@@UAEPAXI@Z"] } +// NO-RTTI-DAG: @"??_7V@@6B@" = weak_odr dllexport unnamed_addr constant { [1 x ptr] } { [1 x 
ptr] [ptr @"??_EV@@UAEPAXI@Z"] } namespace { struct W { virtual ~W() {} } w; } -// RTTI-DAG: [[VTABLE_W:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4W@?A0x{{[^@]*}}@@6B@", ptr @"??_GW@?A0x{{[^@]*}}@@UAEPAXI@Z"] } +// RTTI-DAG: [[VTABLE_W:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4W@?A0x{{[^@]*}}@@6B@", ptr @"??_EW@?A0x{{[^@]*}}@@UAEPAXI@Z"] } // RTTI-DAG: @"??_7W@?A0x{{[^@]*}}@@6B@" = internal unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_W]], i32 0, i32 0, i32 1) -// NO-RTTI-DAG: @"??_7W@?A0x{{[^@]*}}@@6B@" = internal unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GW@?A0x{{[^@]*}}@@UAEPAXI@Z"] } +// NO-RTTI-DAG: @"??_7W@?A0x{{[^@]*}}@@6B@" = internal unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_EW@?A0x{{[^@]*}}@@UAEPAXI@Z"] } struct X {}; template struct Y : virtual X { @@ -49,7 +49,7 @@ template struct Y : virtual X { extern template class Y; template Y::Y(); -// RTTI-DAG: [[VTABLE_Y:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4?$Y@H@@6B@", ptr @"??_G?$Y@H@@UAEPAXI@Z"] }, comdat($"??_7?$Y@H@@6B@") +// RTTI-DAG: [[VTABLE_Y:@.*]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4?$Y@H@@6B@", ptr @"??_E?$Y@H@@UAEPAXI@Z"] }, comdat($"??_7?$Y@H@@6B@") // RTTI-DAG: @"??_7?$Y@H@@6B@" = unnamed_addr alias ptr, getelementptr inbounds ({ [2 x ptr] }, ptr [[VTABLE_Y]], i32 0, i32 0, i32 1) -// NO-RTTI-DAG: @"??_7?$Y@H@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_G?$Y@H@@UAEPAXI@Z"] }, comdat +// NO-RTTI-DAG: @"??_7?$Y@H@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_E?$Y@H@@UAEPAXI@Z"] }, comdat diff --git a/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp b/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp index b54775f6c5dd0..7e9dce18b2797 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp @@ -80,6 +80,15 @@ B::~B() { // CHECK2: call x86_thiscallcc void @"??1VBase@@UAE@XZ"(ptr {{[^,]*}} %[[VBASE_i8]]) // CHECK2: ret + // CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??0B@test2@@QAE@XZ" + // CHECK2: (ptr {{[^,]*}} returned align 4 dereferenceable(4) %this, i32 noundef %is_most_derived) + // CHECK2: call x86_thiscallcc noundef ptr @"??0A@test2@@QAE@XZ"(ptr {{[^,]*}} %{{.*}}) + // CHECK2: ret + + // CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_GD@pr36921@@UAEPAXI@Z"( + // CHECK2: %[[THIS_RELOAD:.*]] = load ptr, ptr + // CHECK2: %[[THIS_ADJ_i8:.*]] = getelementptr inbounds i8, ptr %[[THIS_RELOAD]], i32 -4 + // CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_GB@@UAEPAXI@Z" // CHECK2: store ptr %{{.*}}, ptr %[[THIS_ADDR:.*]], align 4 // CHECK2: %[[THIS_i8:.*]] = getelementptr inbounds i8, ptr %[[THIS_PARAM_i8:.*]], i32 -8 @@ -293,11 +302,6 @@ void callC() { C x; } // CHECK: call x86_thiscallcc noundef ptr @"??0A@test2@@QAE@XZ"(ptr {{[^,]*}} %{{.*}}) // CHECK: ret -// CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??0B@test2@@QAE@XZ" -// CHECK2: (ptr {{[^,]*}} returned align 4 dereferenceable(4) %this, i32 noundef %is_most_derived) -// CHECK2: call x86_thiscallcc noundef ptr @"??0A@test2@@QAE@XZ"(ptr {{[^,]*}} %{{.*}}) -// CHECK2: ret - } namespace test3 { @@ -480,9 +484,6 @@ struct B { struct C : virtual B {}; struct D : virtual A, C {}; D d; -// 
CHECK2-LABEL: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??_GD@pr36921@@UAEPAXI@Z"( -// CHECK2: %[[THIS_RELOAD:.*]] = load ptr, ptr -// CHECK2: %[[THIS_ADJ_i8:.*]] = getelementptr inbounds i8, ptr %[[THIS_RELOAD]], i32 -4 } namespace issue_60465 { diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp index a407766f8ed9f..74150b0ecb535 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-multiple-nonvirtual-inheritance-vdtors.cpp @@ -12,18 +12,18 @@ struct B { struct C : A, B { // CHECK-LABEL: VFTable for 'A' in 'C' (2 entries). - // CHECK-NEXT: 0 | C::~C() [scalar deleting] + // CHECK-NEXT: 0 | C::~C() [vector deleting] // CHECK-NEXT: 1 | void A::z1() // CHECK-LABEL: VFTable for 'B' in 'C' (1 entry). - // CHECK-NEXT: 0 | C::~C() [scalar deleting] + // CHECK-NEXT: 0 | C::~C() [vector deleting] // CHECK-NEXT: [this adjustment: -4 non-virtual] // CHECK-LABEL: Thunks for 'C::~C()' (1 entry). // CHECK-NEXT: 0 | [this adjustment: -4 non-virtual] // CHECK-LABEL: VFTable indices for 'C' (1 entry). - // CHECK-NEXT: 0 | C::~C() [scalar deleting] + // CHECK-NEXT: 0 | C::~C() [vector deleting] virtual ~C(); }; @@ -41,7 +41,7 @@ struct E : D, B { // CHECK-NEXT: 0 | void D::z4() // CHECK-LABEL: VFTable for 'B' in 'E' (1 entry). - // CHECK-NEXT: 0 | E::~E() [scalar deleting] + // CHECK-NEXT: 0 | E::~E() [vector deleting] // CHECK-NEXT: [this adjustment: -4 non-virtual] // CHECK-LABEL: Thunks for 'E::~E()' (1 entry). @@ -49,7 +49,7 @@ struct E : D, B { // CHECK-LABEL: VFTable indices for 'E' (1 entry). // CHECK-NEXT: -- accessible via vfptr at offset 4 -- - // CHECK-NEXT: 0 | E::~E() [scalar deleting] + // CHECK-NEXT: 0 | E::~E() [vector deleting] }; void build_vftable(E *obj) { delete obj; } @@ -61,7 +61,7 @@ struct F : D, B { // CHECK-NEXT: 0 | void D::z4() // CHECK-LABEL: VFTable for 'B' in 'F' (1 entry). - // CHECK-NEXT: 0 | F::~F() [scalar deleting] + // CHECK-NEXT: 0 | F::~F() [vector deleting] // CHECK-NEXT: [this adjustment: -4 non-virtual] // CHECK-LABEL: Thunks for 'F::~F()' (1 entry). @@ -69,7 +69,7 @@ struct F : D, B { // CHECK-LABEL: VFTable indices for 'F' (1 entry). // CHECK-NEXT: -- accessible via vfptr at offset 4 -- - // CHECK-NEXT: 0 | F::~F() [scalar deleting] + // CHECK-NEXT: 0 | F::~F() [vector deleting] }; void build_vftable(F *obj) { delete obj; } @@ -79,7 +79,7 @@ struct G : F { // CHECK-NEXT: 0 | void D::z4() // CHECK-LABEL: VFTable for 'B' in 'F' in 'G' (1 entry). - // CHECK-NEXT: 0 | G::~G() [scalar deleting] + // CHECK-NEXT: 0 | G::~G() [vector deleting] // CHECK-NEXT: [this adjustment: -4 non-virtual] // CHECK-LABEL: Thunks for 'G::~G()' (1 entry). @@ -87,7 +87,7 @@ struct G : F { // CHECK-LABEL: VFTable indices for 'G' (1 entry). // CHECK-NEXT: -- accessible via vfptr at offset 4 -- - // CHECK-NEXT: 0 | G::~G() [scalar deleting] + // CHECK-NEXT: 0 | G::~G() [vector deleting] virtual ~G(); }; diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp index 5030a5dcd2a50..1a589370d3a74 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-return-thunks.cpp @@ -213,6 +213,6 @@ struct C : virtual B { C *f(); }; C c; // VFTABLES-LABEL: VFTable indices for 'pr34302::C' (2 entries). 
// VFTABLES-NEXT: -- accessible via vbtable index 1, vfptr at offset 0 -- -// VFTABLES-NEXT: 0 | pr34302::C::~C() [scalar deleting] +// VFTABLES-NEXT: 0 | pr34302::C::~C() [vector deleting] // VFTABLES-NEXT: 2 | C *pr34302::C::f() } diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp index b0bf927d38f7c..c95202e8cc253 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-single-inheritance.cpp @@ -44,10 +44,10 @@ void use(B *obj) { obj->f(); } struct C { // CHECK-LABEL: VFTable for 'C' (2 entries) - // CHECK-NEXT: 0 | C::~C() [scalar deleting] + // CHECK-NEXT: 0 | C::~C() [vector deleting] // CHECK-NEXT: 1 | void C::f() // CHECK-LABEL: VFTable indices for 'C' (2 entries). - // CHECK-NEXT: 0 | C::~C() [scalar deleting] + // CHECK-NEXT: 0 | C::~C() [vector deleting] // CHECK-NEXT: 1 | void C::f() virtual ~C(); @@ -60,10 +60,10 @@ void use(C *obj) { obj->f(); } struct D { // CHECK-LABEL: VFTable for 'D' (2 entries) // CHECK-NEXT: 0 | void D::f() - // CHECK-NEXT: 1 | D::~D() [scalar deleting] + // CHECK-NEXT: 1 | D::~D() [vector deleting] // CHECK-LABEL: VFTable indices for 'D' (2 entries) // CHECK-NEXT: 0 | void D::f() - // CHECK-NEXT: 1 | D::~D() [scalar deleting] + // CHECK-NEXT: 1 | D::~D() [vector deleting] virtual void f(); virtual ~D(); @@ -77,10 +77,10 @@ struct E : A { // CHECK-NEXT: 0 | void A::f() // CHECK-NEXT: 1 | void A::g() // CHECK-NEXT: 2 | void A::h() - // CHECK-NEXT: 3 | E::~E() [scalar deleting] + // CHECK-NEXT: 3 | E::~E() [vector deleting] // CHECK-NEXT: 4 | void E::i() // CHECK-LABEL: VFTable indices for 'E' (2 entries). - // CHECK-NEXT: 3 | E::~E() [scalar deleting] + // CHECK-NEXT: 3 | E::~E() [vector deleting] // CHECK-NEXT: 4 | void E::i() // ~E would be the key method, but it isn't used, and MS ABI has no key @@ -98,10 +98,10 @@ struct F : A { // CHECK-NEXT: 1 | void A::g() // CHECK-NEXT: 2 | void A::h() // CHECK-NEXT: 3 | void F::i() - // CHECK-NEXT: 4 | F::~F() [scalar deleting] + // CHECK-NEXT: 4 | F::~F() [vector deleting] // CHECK-LABEL: VFTable indices for 'F' (2 entries). // CHECK-NEXT: 3 | void F::i() - // CHECK-NEXT: 4 | F::~F() [scalar deleting] + // CHECK-NEXT: 4 | F::~F() [vector deleting] virtual void i(); virtual ~F(); @@ -115,12 +115,12 @@ struct G : E { // CHECK-NEXT: 0 | void G::f() // CHECK-NEXT: 1 | void A::g() // CHECK-NEXT: 2 | void A::h() - // CHECK-NEXT: 3 | G::~G() [scalar deleting] + // CHECK-NEXT: 3 | G::~G() [vector deleting] // CHECK-NEXT: 4 | void E::i() // CHECK-NEXT: 5 | void G::j() // CHECK-LABEL: VFTable indices for 'G' (3 entries). // CHECK-NEXT: 0 | void G::f() - // CHECK-NEXT: 3 | G::~G() [scalar deleting] + // CHECK-NEXT: 3 | G::~G() [vector deleting] // CHECK-NEXT: 5 | void G::j() virtual void f(); // overrides A::f() diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp index c5ce69f5cbcac..be9f281560dcf 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance-vtordisps.cpp @@ -57,7 +57,7 @@ struct A : virtual V1 { // CHECK-LABEL: VFTable for 'V1' in 'simple::A' (2 entries). 
// CHECK-NEXT: 0 | void simple::A::f() // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] - // CHECK-NEXT: 1 | simple::A::~A() [scalar deleting] + // CHECK-NEXT: 1 | simple::A::~A() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: Thunks for 'simple::A::~A()' (1 entry). @@ -79,7 +79,7 @@ void use(A *obj) { obj->f(); } struct B : virtual V3 { // CHECK-LABEL: VFTable for 'Z' in 'V3' in 'simple::B' (2 entries). // CHECK-NEXT: 0 | void Z::g() - // CHECK-NEXT: 1 | simple::B::~B() [scalar deleting] + // CHECK-NEXT: 1 | simple::B::~B() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: Thunks for 'simple::B::~B()' (1 entry). @@ -88,7 +88,7 @@ struct B : virtual V3 { // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::B' (2 entries). // CHECK-NEXT: 0 | void simple::B::f() // CHECK-NEXT: [this adjustment: vtordisp at -12, 0 non-virtual] - // CHECK-NEXT: 1 | simple::B::~B() [scalar deleting] + // CHECK-NEXT: 1 | simple::B::~B() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -12, -8 non-virtual] // CHECK-LABEL: Thunks for 'simple::B::~B()' (1 entry). @@ -115,7 +115,7 @@ void use(B *obj) { obj->f(); } struct C : virtual V4 { // CHECK-LABEL: VFTable for 'Z' in 'V4' in 'simple::C' (2 entries). // CHECK-NEXT: 0 | void Z::g() - // CHECK-NEXT: 1 | simple::C::~C() [scalar deleting] + // CHECK-NEXT: 1 | simple::C::~C() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: Thunks for 'simple::C::~C()' (1 entry). @@ -124,7 +124,7 @@ struct C : virtual V4 { // CHECK-LABEL: VFTable for 'V1' in 'V4' in 'simple::C' (2 entries). // CHECK-NEXT: 0 | void simple::C::f() // CHECK-NEXT: [this adjustment: vtordisp at -12, 0 non-virtual] - // CHECK-NEXT: 1 | simple::C::~C() [scalar deleting] + // CHECK-NEXT: 1 | simple::C::~C() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -12, -8 non-virtual] // CHECK-LABEL: Thunks for 'simple::C::~C()' (1 entry). @@ -136,7 +136,7 @@ struct C : virtual V4 { // CHECK-LABEL: VFTable for 'V2' in 'V4' in 'simple::C' (2 entries). // CHECK-NEXT: 0 | void simple::C::f() // CHECK-NEXT: [this adjustment: vtordisp at -16, -4 non-virtual] - // CHECK-NEXT: 1 | simple::C::~C() [scalar deleting] + // CHECK-NEXT: 1 | simple::C::~C() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -16, -12 non-virtual] // CHECK-LABEL: Thunks for 'simple::C::~C()' (1 entry). @@ -162,7 +162,7 @@ class D : B { // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::B' in 'simple::D' (2 entries). // CHECK-NEXT: 0 | void simple::B::f() // CHECK-NEXT: [this adjustment: vtordisp at -12, -4 non-virtual] - // CHECK-NEXT: 1 | simple::D::~D() [scalar deleting] + // CHECK-NEXT: 1 | simple::D::~D() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -12, -8 non-virtual] D(); int z; @@ -180,12 +180,12 @@ struct F : virtual E { // CHECK-LABEL: VFTable for 'Z' in 'V3' in 'simple::E' in 'simple::F' (2 entries). // CHECK-NEXT: 0 | void simple::F::g() // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] - // CHECK-NEXT: 1 | simple::F::~F() [scalar deleting] + // CHECK-NEXT: 1 | simple::F::~F() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::E' in 'simple::F' (2 entries). 
// CHECK-NEXT: 0 | void simple::E::f() - // CHECK-NEXT: 1 | simple::F::~F() [scalar deleting] + // CHECK-NEXT: 1 | simple::F::~F() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -12, -8 non-virtual] F(); @@ -202,12 +202,12 @@ struct G : F { // CHECK-LABEL: VFTable for 'Z' in 'V3' in 'simple::E' in 'simple::F' in 'simple::G' (2 entries). // CHECK-NEXT: 0 | void simple::F::g() // CHECK-NEXT: [this adjustment: vtordisp at -4, -4 non-virtual] - // CHECK-NEXT: 1 | simple::G::~G() [scalar deleting] + // CHECK-NEXT: 1 | simple::G::~G() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: VFTable for 'V2' in 'V3' in 'simple::E' in 'simple::F' in 'simple::G' (2 entries). // CHECK-NEXT: 0 | void simple::E::f() - // CHECK-NEXT: 1 | simple::G::~G() [scalar deleting] + // CHECK-NEXT: 1 | simple::G::~G() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -12, -8 non-virtual] G(); @@ -240,7 +240,7 @@ struct A : virtual simple::A { // CHECK-NEXT: 0 | void simple::A::f() // CHECK-NEXT: [this adjustment: vtordisp at -4, vbptr at 8 to the left, // CHECK-NEXT: vboffset at 8 in the vbtable, 8 non-virtual] - // CHECK-NEXT: 1 | extended::A::~A() [scalar deleting] + // CHECK-NEXT: 1 | extended::A::~A() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: Thunks for 'void simple::A::f()' (1 entry). @@ -265,7 +265,7 @@ struct B : virtual simple::A { // CHECK-LABEL: VFTable for 'V1' in 'simple::A' in 'extended::B' (2 entries). // ... - // CHECK: 1 | extended::B::~B() [scalar deleting] + // CHECK: 1 | extended::B::~B() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: Thunks for 'void simple::A::f()' (1 entry). @@ -353,7 +353,7 @@ struct G : virtual simple::A { // CHECK-NEXT: 0 | void simple::A::f() // CHECK-NEXT: [this adjustment: vtordisp at -4, vbptr at 8 to the left, // CHECK-NEXT: vboffset at 8 in the vbtable, 8 non-virtual] - // CHECK-NEXT: 1 | extended::G::~G() [scalar deleting] + // CHECK-NEXT: 1 | extended::G::~G() [vector deleting] // CHECK-NEXT: [this adjustment: vtordisp at -4, 0 non-virtual] // CHECK-LABEL: Thunks for 'void simple::A::f()' (1 entry). @@ -374,7 +374,7 @@ void use(G *obj) { obj->g(); } struct H : Z, A { // CHECK-LABEL: VFTable for 'Z' in 'extended::H' (2 entries). // CHECK-NEXT: 0 | void Z::g() - // CHECK-NEXT: 1 | extended::H::~H() [scalar deleting] + // CHECK-NEXT: 1 | extended::H::~H() [vector deleting] // CHECK-LABEL: VFTable for 'V1' in 'simple::A' in 'extended::A' in 'extended::H' (2 entries). // CHECK-NEXT: 0 | void simple::A::f() diff --git a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp index 257ba270291c8..e5e6ea5f42c1c 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-vtables-virtual-inheritance.cpp @@ -492,7 +492,7 @@ struct X { struct Y : virtual X { // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::Y' (2 entries). - // CHECK-NEXT: 0 | vdtors::Y::~Y() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::Y::~Y() [vector deleting] // CHECK-NEXT: 1 | void vdtors::X::zzz() // CHECK-NOT: Thunks for 'vdtors::Y::~Y()' @@ -515,7 +515,7 @@ struct U : virtual W { // CHECK-NEXT: 0 | void vdtors::Z::z() // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::W' in 'vdtors::U' (2 entries). 
- // CHECK-NEXT: 0 | vdtors::U::~U() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::U::~U() [vector deleting] // CHECK-NEXT: [this adjustment: -4 non-virtual] // CHECK-NEXT: 1 | void vdtors::X::zzz() @@ -524,7 +524,7 @@ struct U : virtual W { // CHECK-LABEL: VFTable indices for 'vdtors::U' (1 entry). // CHECK-NEXT: -- accessible via vbtable index 1, vfptr at offset 4 -- - // CHECK-NEXT: 0 | vdtors::U::~U() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::U::~U() [vector deleting] virtual ~U(); }; @@ -536,7 +536,7 @@ struct V : virtual W { // CHECK-NEXT: 0 | void vdtors::Z::z() // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::W' in 'vdtors::V' (2 entries). - // CHECK-NEXT: 0 | vdtors::V::~V() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::V::~V() [vector deleting] // CHECK-NEXT: [this adjustment: -4 non-virtual] // CHECK-NEXT: 1 | void vdtors::X::zzz() @@ -545,7 +545,7 @@ struct V : virtual W { // CHECK-LABEL: VFTable indices for 'vdtors::V' (1 entry). // CHECK-NEXT: -- accessible via vbtable index 1, vfptr at offset 4 -- - // CHECK-NEXT: 0 | vdtors::V::~V() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::V::~V() [vector deleting] }; V v; @@ -557,7 +557,7 @@ struct T : virtual X { struct P : T, Y { // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::T' in 'vdtors::P' (2 entries). - // CHECK-NEXT: 0 | vdtors::P::~P() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::P::~P() [vector deleting] // CHECK-NEXT: 1 | void vdtors::X::zzz() // CHECK-NOT: Thunks for 'vdtors::P::~P()' @@ -574,18 +574,18 @@ struct Q { // PR19172: Yet another diamond we miscompiled. struct R : virtual Q, X { // CHECK-LABEL: VFTable for 'vdtors::Q' in 'vdtors::R' (1 entry). - // CHECK-NEXT: 0 | vdtors::R::~R() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::R::~R() [vector deleting] // CHECK-NEXT: [this adjustment: -8 non-virtual] // CHECK-LABEL: Thunks for 'vdtors::R::~R()' (1 entry). // CHECK-NEXT: 0 | [this adjustment: -8 non-virtual] // CHECK-LABEL: VFTable for 'vdtors::X' in 'vdtors::R' (2 entries). - // CHECK-NEXT: 0 | vdtors::R::~R() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::R::~R() [vector deleting] // CHECK-NEXT: 1 | void vdtors::X::zzz() // CHECK-LABEL: VFTable indices for 'vdtors::R' (1 entry). - // CHECK-NEXT: 0 | vdtors::R::~R() [scalar deleting] + // CHECK-NEXT: 0 | vdtors::R::~R() [vector deleting] virtual ~R(); }; diff --git a/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp b/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp index 069f0226ab948..c8e374e51a031 100644 --- a/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp +++ b/clang/test/CodeGenCXX/microsoft-no-rtti-data.cpp @@ -2,7 +2,7 @@ // vftable shouldn't have RTTI data in it. 
// CHECK-NOT: @"??_R4S@@6B@" -// CHECK: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_GS@@UAEPAXI@Z"] }, comdat +// CHECK: @"??_7S@@6B@" = linkonce_odr unnamed_addr constant { [1 x ptr] } { [1 x ptr] [ptr @"??_ES@@UAEPAXI@Z"] }, comdat struct type_info; namespace std { using ::type_info; } diff --git a/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp new file mode 100644 index 0000000000000..ebff4f6a851b0 --- /dev/null +++ b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp @@ -0,0 +1,152 @@ +// RUN: %clang_cc1 -emit-llvm %s -triple=x86_64-pc-windows-msvc -o - | FileCheck --check-prefixes=X64,CHECK %s +// RUN: %clang_cc1 -emit-llvm %s -triple=i386-pc-windows-msvc -o - | FileCheck --check-prefixes=X86,CHECK %s + +struct Bird { + virtual ~Bird(); +}; + +struct Parrot : public Bird { +// X64: @[[ParrotVtable:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Parrot@@6B@", ptr @"??_EParrot@@UEAAPEAXI@Z"] }, comdat($"??_7Parrot@@6B@") +// X86: @[[ParrotVtable:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Parrot@@6B@", ptr @"??_EParrot@@UAEPAXI@Z"] }, comdat($"??_7Parrot@@6B@") +// X64: @[[Bird:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Bird@@6B@", ptr @"??_EBird@@UEAAPEAXI@Z"] }, comdat($"??_7Bird@@6B@") +// X86: @[[Bird:[0-9]+]] = private unnamed_addr constant { [2 x ptr] } { [2 x ptr] [ptr @"??_R4Bird@@6B@", ptr @"??_EBird@@UAEPAXI@Z"] }, comdat($"??_7Bird@@6B@") + virtual ~Parrot() {} +}; + +Bird::~Bird() {} + +// For the weird bird we first emit scalar deleting destructor, then find out +// that we need vector deleting destructor and remove the alias. +struct JustAWeirdBird { + virtual ~JustAWeirdBird() {} + + bool doSmth(int n) { + JustAWeirdBird *c = new JustAWeirdBird[n]; + + delete[] c; + return true; + } +}; + +// Vector deleting dtor for Bird is an alias because no new Bird[] expressions +// in the TU. +// X64: @"??_EBird@@UEAAPEAXI@Z" = weak dso_local unnamed_addr alias ptr (ptr, i32), ptr @"??_GBird@@UEAAPEAXI@Z" +// X86: @"??_EBird@@UAEPAXI@Z" = weak dso_local unnamed_addr alias ptr (ptr, i32), ptr @"??_GBird@@UAEPAXI@Z" +// No scalar destructor for Parrot. +// CHECK-NOT: @"??_GParrot" +// No vector destructor definition for Bird. +// CHECK-NOT: define{{.*}}@"??_EBird" +// No scalar deleting dtor for JustAWeirdBird. 
+// CHECK-NOT: @"??_GJustAWeirdBird" + +void dealloc(Bird *p) { + delete[] p; +} + +Bird* alloc() { + Parrot* P = new Parrot[38]; + return P; +} + +void bar() { + dealloc(alloc()); + + JustAWeirdBird B; + B.doSmth(38); +} + +// CHECK-LABEL: define dso_local void @{{.*}}dealloc{{.*}}( +// CHECK-SAME: ptr noundef %[[PTR:.*]]) +// CHECK: entry: +// CHECK-NEXT: %[[PTRADDR:.*]] = alloca ptr +// CHECK-NEXT: store ptr %[[PTR]], ptr %[[PTRADDR]] +// CHECK-NEXT: %[[LPTR:.*]] = load ptr, ptr %[[PTRADDR]] +// CHECK-NEXT: %[[ISNULL:.*]] = icmp eq ptr %[[LPTR]], null +// CHECK-NEXT: br i1 %[[ISNULL]], label %delete.end, label %delete.notnull +// CHECK: delete.notnull: +// X64-NEXT: %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LPTR]], i64 -8 +// X86-NEXT: %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LPTR]], i32 -4 +// X64-NEXT: %[[HOWMANY:.*]] = load i64, ptr %[[COOKIEGEP]] +// X86-NEXT: %[[HOWMANY:.*]] = load i32, ptr %[[COOKIEGEP]] +// X64-NEXT: %[[ISNOELEM:.*]] = icmp eq i64 %2, 0 +// X86-NEXT: %[[ISNOELEM:.*]] = icmp eq i32 %2, 0 +// CHECK-NEXT: br i1 %[[ISNOELEM]], label %vdtor.nocall, label %vdtor.call +// CHECK: vdtor.nocall: +// X64-NEXT: %[[HOWMANYBYTES:.*]] = mul i64 8, %[[HOWMANY]] +// X86-NEXT: %[[HOWMANYBYTES:.*]] = mul i32 4, %[[HOWMANY]] +// X64-NEXT: %[[ADDCOOKIESIZE:.*]] = add i64 %[[HOWMANYBYTES]], 8 +// X86-NEXT: %[[ADDCOOKIESIZE:.*]] = add i32 %[[HOWMANYBYTES]], 4 +// X64-NEXT: call void @"??_V@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef %[[ADDCOOKIESIZE]]) +// X86-NEXT: call void @"??_V@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef %[[ADDCOOKIESIZE]]) +// CHECK-NEXT: br label %delete.end +// CHECK: vdtor.call: +// CHECK-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[LPTR]] +// CHECK-NEXT: %[[FPGEP:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i64 0 +// CHECK-NEXT: %[[FPLOAD:.*]] = load ptr, ptr %[[FPGEP]] +// X64-NEXT: %[[CALL:.*]] = call noundef ptr %[[FPLOAD]](ptr noundef nonnull align 8 dereferenceable(8) %[[LPTR]], i32 noundef 3) +// X86-NEXT: %[[CALL:.*]] = call x86_thiscallcc noundef ptr %[[FPLOAD]](ptr noundef nonnull align 4 dereferenceable(4) %[[LPTR]], i32 noundef 3) +// CHECK-NEXT: br label %delete.end +// CHECK: delete.end: +// CHECK-NEXT: ret void + +// Vector dtor definition for Parrot. 
+// X64-LABEL: define weak dso_local noundef ptr @"??_EParrot@@UEAAPEAXI@Z"( +// X64-SAME: ptr {{.*}} %[[THIS:.*]], i32 {{.*}} %[[IMPLICIT_PARAM:.*]]) unnamed_addr +// X86-LABEL: define weak dso_local x86_thiscallcc noundef ptr @"??_EParrot@@UAEPAXI@Z"( +// X86-SAME: ptr noundef nonnull align 4 dereferenceable(4) %[[THIS:.*]], i32 noundef %[[IMPLICIT_PARAM:.*]]) unnamed_addr +// CHECK: entry: +// CHECK-NEXT: %[[RET:.*]] = alloca ptr +// CHECK-NEXT: %[[IPADDR:.*]] = alloca i32 +// CHECK-NEXT: %[[THISADDR:.*]] = alloca ptr +// CHECK-NEXT: store i32 %[[IMPLICIT_PARAM]], ptr %[[IPADDR]] +// CHECK-NEXT: store ptr %[[THIS]], ptr %[[THISADDR]] +// CHECK-NEXT: %[[LTHIS:.*]] = load ptr, ptr %[[THISADDR]] +// CHECK-NEXT: store ptr %[[LTHIS]], ptr %[[RET]] +// CHECK-NEXT: %[[LIP:.*]] = load i32, ptr %[[IPADDR]] +// CHECK-NEXT: %[[SECONDBIT:.*]] = and i32 %[[LIP]], 2 +// CHECK-NEXT: %[[ISSECONDBITZERO:.*]] = icmp eq i32 %[[SECONDBIT]], 0 +// CHECK-NEXT: br i1 %[[ISSECONDBITZERO:.*]], label %dtor.scalar, label %dtor.vector +// CHECK: dtor.vector: +// X64-NEXT: %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LTHIS]], i64 -8 +// X86-NEXT: %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[LTHIS]], i32 -4 +// X64-NEXT: %[[HOWMANY:.*]] = load i64, ptr %[[COOKIEGEP]] +// X86-NEXT: %[[HOWMANY:.*]] = load i32, ptr %[[COOKIEGEP]] +// X64-NEXT: %[[END:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[LTHIS]], i64 %[[HOWMANY]] +// X86-NEXT: %[[END:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[LTHIS]], i32 %[[HOWMANY]] +// CHECK-NEXT: br label %arraydestroy.body +// CHECK: arraydestroy.body: +// CHECK-NEXT: %[[PASTELEM:.*]] = phi ptr [ %delete.end, %dtor.vector ], [ %arraydestroy.element, %arraydestroy.body ] +// X64-NEXT: %[[CURELEM:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[PASTELEM]], i64 -1 +// X86-NEXT: %[[CURELEM:.*]] = getelementptr inbounds %struct.Parrot, ptr %[[PASTELEM]], i32 -1 +// X64-NEXT: call void @"??1Parrot@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(8) %[[CURELEM]]) +// X86-NEXT: call x86_thiscallcc void @"??1Parrot@@UAE@XZ"(ptr noundef nonnull align 4 dereferenceable(4) %[[CURELEM]]) +// CHECK-NEXT: %[[DONE:.*]] = icmp eq ptr %[[CURELEM]], %[[LTHIS]] +// CHECK-NEXT: br i1 %[[DONE]], label %arraydestroy.done3, label %arraydestroy.body +// CHECK: arraydestroy.done3: +// CHECK-NEXT: br label %dtor.vector.cont +// CHECK: dtor.vector.cont: +// CHECK-NEXT: %[[FIRSTBIT:.*]] = and i32 %[[LIP]], 1 +// CHECK-NEXT: %[[ISFIRSTBITZERO:.*]] = icmp eq i32 %[[FIRSTBIT]], 0 +// CHECK-NEXT: br i1 %[[ISFIRSTBITZERO]], label %dtor.continue, label %dtor.call_delete_after_array_destroy +// CHECK: dtor.call_delete_after_array_destroy: +// X64-NEXT: call void @"??3@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef 8) +// X86-NEXT: call void @"??3@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef 4) +// CHECK-NEXT: br label %dtor.continue +// CHECK: dtor.scalar: +// X64-NEXT: call void @"??1Parrot@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(8) %[[LTHIS]]) +// X86-NEXT: call x86_thiscallcc void @"??1Parrot@@UAE@XZ"(ptr noundef nonnull align 4 dereferenceable(4) %[[LTHIS]]) +// CHECK-NEXT: %[[FIRSTBIT:.*]] = and i32 %[[LIP]], 1 +// CHECK-NEXT: %[[ISFIRSTBITZERO:.*]] = icmp eq i32 %[[FIRSTBIT]], 0 +// CHECK-NEXT: br i1 %[[ISFIRSTBITZERO]], label %dtor.continue, label %dtor.call_delete +// CHECK: dtor.call_delete: +// X64-NEXT: call void @"??3@YAXPEAX_K@Z"(ptr noundef %[[LTHIS]], i64 noundef 8) +// X86-NEXT: call void @"??3@YAXPAXI@Z"(ptr noundef %[[LTHIS]], i32 noundef 
4) +// CHECK-NEXT: br label %dtor.continue +// CHECK: dtor.continue: +// CHECK-NEXT: %[[LOADRET:.*]] = load ptr, ptr %[[RET]] +// CHECK-NEXT: ret ptr %[[LOADRET]] + +// X64: define weak dso_local noundef ptr @"??_EJustAWeirdBird@@UEAAPEAXI@Z"( +// X64-SAME: ptr noundef nonnull align 8 dereferenceable(8) %this, i32 noundef %should_call_delete) +// X86: define weak dso_local x86_thiscallcc noundef ptr @"??_EJustAWeirdBird@@UAEPAXI@Z"( +// X86-SAME: ptr noundef nonnull align 4 dereferenceable(4) %this, i32 noundef %should_call_delete) unnamed_addr diff --git a/clang/test/CodeGenCXX/vtable-consteval.cpp b/clang/test/CodeGenCXX/vtable-consteval.cpp index 1454f6fde357d..220143465c574 100644 --- a/clang/test/CodeGenCXX/vtable-consteval.cpp +++ b/clang/test/CodeGenCXX/vtable-consteval.cpp @@ -26,7 +26,7 @@ struct B { B b; // ITANIUM-DAG: @_ZTV1C = {{.*}} constant { [4 x ptr] } {{.*}} null, ptr @_ZTI1C, ptr @_ZN1CD1Ev, ptr @_ZN1CD0Ev -// MSABI-DAG: @[[C_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4C@@6B@", ptr @"??_GC@@UEAAPEAXI@Z" +// MSABI-DAG: @[[C_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4C@@6B@", ptr @"??_EC@@UEAAPEAXI@Z" struct C { virtual ~C() = default; virtual consteval C &operator=(const C&) = default; @@ -36,7 +36,7 @@ struct C { C c; // ITANIUM-DAG: @_ZTV1D = {{.*}} constant { [4 x ptr] } {{.*}} null, ptr @_ZTI1D, ptr @_ZN1DD1Ev, ptr @_ZN1DD0Ev -// MSABI-DAG: @[[D_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4D@@6B@", ptr @"??_GD@@UEAAPEAXI@Z" +// MSABI-DAG: @[[D_VFTABLE:.*]] = {{.*}} constant { [2 x ptr] } {{.*}} @"??_R4D@@6B@", ptr @"??_ED@@UEAAPEAXI@Z" struct D : C {}; // ITANIUM-DAG: @d = {{.*}}global { ptr } { {{.*}} @_ZTV1D, // MSABI-DAG: @"?d@@3UD@@A" = {{.*}}global { ptr } { ptr @"??_7D@@6B@" } diff --git a/clang/test/Modules/vtable-windows.cppm b/clang/test/Modules/vtable-windows.cppm index dbde24c8a9bdd..e45e32d6b4d60 100644 --- a/clang/test/Modules/vtable-windows.cppm +++ b/clang/test/Modules/vtable-windows.cppm @@ -23,4 +23,4 @@ void test() { // Check that the virtual table is an unnamed_addr constant in comdat that can // be merged with the virtual table with other TUs. 
-// CHECK: unnamed_addr constant {{.*}}[ptr @"??_R4Fruit@@6B@", ptr @"??_GFruit@@UAEPAXI@Z", ptr @"?eval@Fruit@@UAEXXZ"{{.*}}comdat($"??_7Fruit@@6B@")
+// CHECK: unnamed_addr constant {{.*}}[ptr @"??_R4Fruit@@6B@", ptr @"??_EFruit@@UAEPAXI@Z", ptr @"?eval@Fruit@@UAEXXZ"{{.*}}comdat($"??_7Fruit@@6B@")
diff --git a/clang/test/Profile/cxx-abc-deleting-dtor.cpp b/clang/test/Profile/cxx-abc-deleting-dtor.cpp
index c65a8e8013c35..7c2a5bbc93af3 100644
--- a/clang/test/Profile/cxx-abc-deleting-dtor.cpp
+++ b/clang/test/Profile/cxx-abc-deleting-dtor.cpp
@@ -24,16 +24,15 @@ DerivedABC *useABCVTable() { return new DerivedABC(); }
 // MSVC: @"__profn_??1ABC@@{{.*}}" =
 // MSVC-NOT: @"__profn_??_G{{.*}}" =
 
-// MSVC-LABEL: define linkonce_odr dso_local noundef ptr @"??_GDerivedABC@@UEAAPEAXI@Z"(ptr {{[^,]*}} %this, {{.*}})
-// MSVC-NOT: call void @llvm.instrprof.increment({{.*}})
-// MSVC: call void @"??1DerivedABC@@UEAA@XZ"({{.*}})
-// MSVC: ret void
-
 // MSVC-LABEL: define linkonce_odr dso_local noundef ptr @"??_GABC@@UEAAPEAXI@Z"(ptr {{[^,]*}} %this, {{.*}})
 // MSVC-NOT: call void @llvm.instrprof.increment({{.*}})
 // MSVC: call void @llvm.trap()
 // MSVC-NEXT: unreachable
 
+// MSVC-LABEL: define linkonce_odr dso_local noundef ptr @"??_GDerivedABC@@UEAAPEAXI@Z"(ptr {{[^,]*}} %this, {{.*}})
+// MSVC-NOT: call void @llvm.instrprof.increment({{.*}})
+// MSVC: call void @"??1DerivedABC@@UEAA@XZ"({{.*}})
+
 // MSVC-LABEL: define linkonce_odr dso_local void @"??1DerivedABC@@UEAA@XZ"({{.*}})
 // MSVC: call void @llvm.instrprof.increment({{.*}})
 // MSVC: call void @"??1ABC@@UEAA@XZ"({{.*}})

From 5c65a321778b99f745d193629975fb6ced34fe07 Mon Sep 17 00:00:00 2001
From: MingYan <99472920+NexMing@users.noreply.github.com>
Date: Mon, 31 Mar 2025 16:13:46 +0800
Subject: [PATCH 0076/1029] [RISCV] Vectorize phi for loop carried @llvm.vp.reduce.* (#131974)

LLVM vector predication reduction intrinsics return a scalar result, but on
RISC-V, vector reduction instructions write the result into the first element
of a vector register. So when a reduction in a loop uses a scalar phi, we end
up with unnecessary scalar moves:

```asm
loop:
    vmv.s.x v8, zero
    vredsum.vs v8, v10, v8
    vmv.x.s a0, v8
```

This mainly affects vector predication reductions. This patch teaches
RISCVCodeGenPrepare to vectorize any scalar phi that feeds into a vector
predication reduction, converting:

```llvm
vector.body:
  %red.phi = phi i32 [ ..., %entry ], [ %red, %vector.body ]
  %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
```

to

```llvm
vector.body:
  %red.phi = phi <vscale x 4 x i32> [ ..., %entry ], [ %acc.vec, %vector.body ]
  %phi.scalar = extractelement <vscale x 4 x i32> %red.phi, i64 0
  %acc = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %phi.scalar, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %acc.vec = insertelement <vscale x 4 x i32> poison, i32 %acc, i64 0
```

This eliminates the scalar -> vector -> scalar crossing during instruction
selection.
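For illustration, a simplified sketch of the loop we expect after the
transform, using the register assignments from the vp_reduce_add test below
(the exact sequence is in the CHECK lines of riscv-codegenprepare-asm.ll):
the scalar moves are gone from the loop body, the start value is written into
element 0 of v8 once on entry, and the scalar result is read out once on exit.

```asm
    vmv.s.x v8, zero          # start value materialized once, before the loop
loop:
    vle32.v v10, (a4)         # load the next EVL-sized chunk
    vredsum.vs v8, v10, v8    # accumulate directly into v8; no scalar moves
    bnez a2, loop
    vmv.x.s a0, v8            # read the scalar result once, after the loop
```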
---------

Co-authored-by: yanming
---
 llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp |  10 +-
 .../RISCV/rvv/riscv-codegenprepare-asm.ll     | 456 +++++++++++++++++
 .../CodeGen/RISCV/rvv/riscv-codegenprepare.ll | 484 ++++++++++++++++++
 3 files changed, 946 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 5be5345cca73a..b5cb05f30fb26 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -113,9 +113,10 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
 // vfredosum.vs v8, v8, v10
 // vfmv.f.s fa0, v8
 //
-// This mainly affects ordered fadd reductions, since other types of reduction
-// typically use element-wise vectorisation in the loop body. This tries to
-// vectorize any scalar phis that feed into a fadd reduction:
+// This mainly affects ordered fadd reductions and VP reductions that have a
+// scalar start value, since other types of reduction typically use element-wise
+// vectorisation in the loop body. This tries to vectorize any scalar phis that
+// feed into these reductions:
 //
 // loop:
 // %phi = phi <float> [ ..., %entry ], [ %acc, %loop ]
@@ -137,7 +138,8 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   if (expandVPStrideLoad(I))
     return true;
 
-  if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd)
+  if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd &&
+      !isa<VPReductionIntrinsic>(&I))
     return false;
 
   auto *PHI = dyn_cast<PHINode>(I.getOperand(0));
diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
index 3bbdd1a257fdb..4e5f6e0f65489 100644
--- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare-asm.ll
@@ -42,3 +42,459 @@ vector.body:
 exit:
   ret float %acc
 }
+
+define i32 @vp_reduce_add(ptr %a) {
+; CHECK-LABEL: vp_reduce_add:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: li a1, 0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: li a2, 1024
+; CHECK-NEXT: .LBB1_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma
+; CHECK-NEXT: slli a4, a1, 2
+; CHECK-NEXT: add a4, a0, a4
+; CHECK-NEXT: vle32.v v10, (a4)
+; CHECK-NEXT: sub a2, a2, a3
+; CHECK-NEXT: vredsum.vs v8, v10, v8
+; CHECK-NEXT: add a1, a1, a3
+; CHECK-NEXT: bnez a2, .LBB1_1
+; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT: vmv.x.s a0, v8
+; CHECK-NEXT: ret
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ]
+  %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ]
+  %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ]
+  %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true)
+  %evl2 = zext i32 %evl to i64
+  %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind
+  %wide.load = tail call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, <vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)
+  %remaining.trip.count = sub nuw i64 %trip.count, %evl2
+  %next.ind = add i64 %scalar.ind, %evl2
+  %m = icmp eq i64 %remaining.trip.count, 0
+  br i1 %m, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+  ret i32 %red
+}
+
+define i32 @vp_reduce_and(ptr %a) {
+; CHECK-LABEL: 
vp_reduce_and: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: lui a2, 524288 +; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB2_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vredand.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB2_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_or(ptr %a) { +; CHECK-LABEL: vp_reduce_or: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB3_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vredor.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB3_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_xor(ptr %a) { +; CHECK-LABEL: vp_reduce_xor: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB4_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; 
CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vredxor.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB4_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_smax(ptr %a) { +; CHECK-LABEL: vp_reduce_smax: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: lui a2, 524288 +; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB5_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vredmax.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB5_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_smin(ptr %a) { +; CHECK-LABEL: vp_reduce_smin: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: lui a2, 524288 +; CHECK-NEXT: addi a2, a2, -1 +; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB6_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vredmin.vs v8, v10, v8 +; 
CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB6_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 2147483647, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_umax(ptr %a) { +; CHECK-LABEL: vp_reduce_umax: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB7_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vredmaxu.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB7_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_umin(ptr %a) { +; CHECK-LABEL: vp_reduce_umin: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: lui a2, 524288 +; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, a2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB8_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vredminu.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB8_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, 
%entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define float @vp_reduce_fadd(ptr %a) { +; CHECK-LABEL: vp_reduce_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB9_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vfredosum.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB9_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call float @llvm.vp.reduce.fadd.nxv4f32(float %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret float %red +} + +define float @vp_reduce_fmax(ptr %a) { +; CHECK-LABEL: vp_reduce_fmax: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB10_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vfredmax.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB10_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 
%trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call float @llvm.vp.reduce.fmax.nxv4f32(float %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret float %red +} + +define float @vp_reduce_fmin(ptr %a) { +; CHECK-LABEL: vp_reduce_fmin: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: .LBB11_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a3, a2, e32, m2, ta, ma +; CHECK-NEXT: slli a4, a1, 2 +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: sub a2, a2, a3 +; CHECK-NEXT: vfredmin.vs v8, v10, v8 +; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: bnez a2, .LBB11_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: vfmv.f.s fa0, v8 +; CHECK-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call float @llvm.vp.reduce.fmin.nxv4f32(float %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret float %red +} diff --git a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll index 006fc269050b0..8967fb8bf01ac 100644 --- a/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll +++ b/llvm/test/CodeGen/RISCV/rvv/riscv-codegenprepare.ll @@ -44,3 +44,487 @@ vector.body: exit: ret float %acc } + +define i32 @vp_reduce_add(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_add( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = 
extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.add.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_and(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_and( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr 
%a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.and.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_or(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_or( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.or.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_xor(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_xor( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail 
call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.xor.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_smax(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_smax( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + 
%trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.smax.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_smin(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_smin( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 2147483647, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 2147483647, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.smin.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_umax(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_umax( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 0, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ 0, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.umax.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define i32 @vp_reduce_umin(ptr %a) { +; CHECK-LABEL: define i32 @vp_reduce_umin( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, i32 -2147483648, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4i32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; 
CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, i32 [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi i32 [ -2147483648, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds i32, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call i32 @llvm.vp.reduce.umin.nxv4i32(i32 %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret i32 %red +} + +define float @vp_reduce_fadd(ptr %a) { +; CHECK-LABEL: define float @vp_reduce_fadd( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fadd.nxv4f32(float [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, float [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret float [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call float @llvm.vp.reduce.fadd.nxv4f32(float %red.phi, %wide.load, splat (i1 
true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret float %red +} + +define float @vp_reduce_fmax(ptr %a) { +; CHECK-LABEL: define float @vp_reduce_fmax( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fmax.nxv4f32(float [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, float [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret float [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call float @llvm.vp.reduce.fmax.nxv4f32(float %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret float %red +} + +define float @vp_reduce_fmin(ptr %a) { +; CHECK-LABEL: define float @vp_reduce_fmin( +; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[TRIP_COUNT:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ], [ [[REMAINING_TRIP_COUNT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[SCALAR_IND:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXT_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ insertelement ( poison, float 0.000000e+00, i64 0), [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TRIP_COUNT]], i32 4, i1 true) +; CHECK-NEXT: [[EVL2:%.*]] = zext i32 
[[EVL]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[SCALAR_IND]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = tail call @llvm.vp.load.nxv4f32.p0(ptr [[ARRAYIDX6]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement [[TMP0]], i64 0 +; CHECK-NEXT: [[RED:%.*]] = tail call float @llvm.vp.reduce.fmin.nxv4f32(float [[TMP1]], [[WIDE_LOAD]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[REMAINING_TRIP_COUNT]] = sub nuw i64 [[TRIP_COUNT]], [[EVL2]] +; CHECK-NEXT: [[NEXT_IND]] = add i64 [[SCALAR_IND]], [[EVL2]] +; CHECK-NEXT: [[M:%.*]] = icmp eq i64 [[REMAINING_TRIP_COUNT]], 0 +; CHECK-NEXT: [[TMP2]] = insertelement poison, float [[RED]], i64 0 +; CHECK-NEXT: br i1 [[M]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret float [[RED]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %trip.count = phi i64 [ 1024, %entry ], [ %remaining.trip.count, %vector.body ] + %scalar.ind = phi i64 [ 0, %entry ], [ %next.ind, %vector.body ] + %red.phi = phi float [ 0.000000e+00, %entry ], [ %red, %vector.body ] + %evl = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %trip.count, i32 4, i1 true) + %evl2 = zext i32 %evl to i64 + %arrayidx6 = getelementptr inbounds float, ptr %a, i64 %scalar.ind + %wide.load = tail call @llvm.vp.load.nxv4f32.p0(ptr %arrayidx6, splat (i1 true), i32 %evl) + %red = tail call float @llvm.vp.reduce.fmin.nxv4f32(float %red.phi, %wide.load, splat (i1 true), i32 %evl) + %remaining.trip.count = sub nuw i64 %trip.count, %evl2 + %next.ind = add i64 %scalar.ind, %evl2 + %m = icmp eq i64 %remaining.trip.count, 0 + br i1 %m, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret float %red +} From 9d61eaa9ecd9a46d22a8a4efc67d31b9abba3616 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 31 Mar 2025 11:45:11 +0200 Subject: [PATCH 0077/1029] [lldb] Make GetRowForFunctionOffset compatible with discontinuous functions (#133250) The function had special handling for -1, but that is incompatible with functions whose entry point is not the first address. Use std::nullopt instead. --- lldb/include/lldb/Symbol/UnwindPlan.h | 11 ++++---- .../lldb/Target/RegisterContextUnwind.h | 21 ++++++++-------- lldb/source/Symbol/UnwindPlan.cpp | 7 +++--- lldb/source/Target/RegisterContextUnwind.cpp | 25 ++++++++++--------- 4 files changed, 33 insertions(+), 31 deletions(-) diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h index db9aade93b6ba..9adda27b8f928 100644 --- a/lldb/include/lldb/Symbol/UnwindPlan.h +++ b/lldb/include/lldb/Symbol/UnwindPlan.h @@ -467,11 +467,12 @@ class UnwindPlan { void InsertRow(Row row, bool replace_existing = false); // Returns a pointer to the best row for the given offset into the function's - // instructions. If offset is -1 it indicates that the function start is - // unknown - the final row in the UnwindPlan is returned. In practice, the - // UnwindPlan for a function with no known start address will be the - // architectural default UnwindPlan which will only have one row. - const UnwindPlan::Row *GetRowForFunctionOffset(int offset) const; + // instructions. If offset is std::nullopt it indicates that the function + // start is unknown - the final row in the UnwindPlan is returned. In + // practice, the UnwindPlan for a function with no known start address will be + // the architectural default UnwindPlan which will only have one row. 
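+  //
+  // A minimal usage sketch (`plan` and the 0x14 offset are hypothetical and
+  // only illustrate the new std::nullopt contract):
+  //   const UnwindPlan::Row *row = plan.GetRowForFunctionOffset(0x14);
+  //   row = plan.GetRowForFunctionOffset(std::nullopt); // takes the final row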
+  const UnwindPlan::Row *
+  GetRowForFunctionOffset(std::optional<int> offset) const;
 
   lldb::RegisterKind GetRegisterKind() const { return m_register_kind; }
 
diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h
index 6cd918fedc003..c4ae29e657bfb 100644
--- a/lldb/include/lldb/Target/RegisterContextUnwind.h
+++ b/lldb/include/lldb/Target/RegisterContextUnwind.h
@@ -228,18 +228,17 @@ class RegisterContextUnwind : public lldb_private::RegisterContext {
   lldb_private::Address m_start_pc;
   lldb_private::Address m_current_pc;
 
-  int m_current_offset; // how far into the function we've executed; -1 if
-                        // unknown
-  // 0 if no instructions have been executed yet.
-
-  // 0 if no instructions have been executed yet.
-  // On architectures where the return address on the stack points
-  // to the instruction after the CALL, this value will have 1
-  // subtracted from it. Else a function that ends in a CALL will
-  // have an offset pointing into the next function's address range.
+  /// How far into the function we've executed. 0 if no instructions have been
+  /// executed yet, std::nullopt if unknown.
+  std::optional<int> m_current_offset;
+
+  // How far into the function we've executed. 0 if no instructions have been
+  // executed yet, std::nullopt if unknown. On architectures where the return
+  // address on the stack points to the instruction after the CALL, this value
+  // will have 1 subtracted from it. Otherwise, a function that ends in a CALL
+  // will have an offset pointing into the next function's address range.
   // m_current_pc has the actual address of the "current" pc.
-  int m_current_offset_backed_up_one; // how far into the function we've
-                                      // executed; -1 if unknown
+  std::optional<int> m_current_offset_backed_up_one;
 
   bool m_behaves_like_zeroth_frame; // this frame behaves like frame zero
 
diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp
index 48089cbdecd97..f2846eb927bf8 100644
--- a/lldb/source/Symbol/UnwindPlan.cpp
+++ b/lldb/source/Symbol/UnwindPlan.cpp
@@ -417,9 +417,10 @@ void UnwindPlan::InsertRow(Row row, bool replace_existing) {
   }
 }
 
-const UnwindPlan::Row *UnwindPlan::GetRowForFunctionOffset(int offset) const {
-  auto it = offset == -1 ? m_row_list.end()
-                         : llvm::upper_bound(m_row_list, offset, RowLess());
+const UnwindPlan::Row *
+UnwindPlan::GetRowForFunctionOffset(std::optional<int> offset) const {
+  auto it = offset ? llvm::upper_bound(m_row_list, *offset, RowLess())
+                   : m_row_list.end();
   if (it == m_row_list.begin())
     return nullptr;
   // upper_bound returns the row strictly greater than our desired offset, which
diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index a035c57fbfc1c..cb3d7ee479890 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -94,8 +94,9 @@ bool RegisterContextUnwind::IsUnwindPlanValidForCurrentPC(
     return true;
  }
 
-  // if m_current_offset <= 0, we've got nothing else to try
-  if (m_current_offset <= 0)
+  // If we don't have an offset, or we're at the start of the function, we've
+  // got nothing else to try.
+  if (!m_current_offset || m_current_offset == 0)
     return false;
 
   // check pc - 1 to see if it's valid
@@ -198,8 +199,8 @@ void RegisterContextUnwind::InitializeZerothFrame() {
     m_current_offset_backed_up_one = m_current_offset;
   } else {
     m_start_pc = m_current_pc;
-    m_current_offset = -1;
-    m_current_offset_backed_up_one = -1;
+    m_current_offset = std::nullopt;
+    m_current_offset_backed_up_one = std::nullopt;
   }
 
   // We've set m_frame_type and m_sym_ctx before these calls.
@@ -437,8 +438,8 @@ void RegisterContextUnwind::InitializeNonZerothFrame() {
       m_frame_type = eNormalFrame;
     }
     m_all_registers_available = false;
-    m_current_offset = -1;
-    m_current_offset_backed_up_one = -1;
+    m_current_offset = std::nullopt;
+    m_current_offset_backed_up_one = std::nullopt;
     RegisterKind row_register_kind = m_full_unwind_plan_sp->GetRegisterKind();
     if (const UnwindPlan::Row *row =
             m_full_unwind_plan_sp->GetRowForFunctionOffset(0)) {
@@ -569,16 +570,16 @@ void RegisterContextUnwind::InitializeNonZerothFrame() {
     m_current_offset = pc - m_start_pc.GetLoadAddress(&process->GetTarget());
     m_current_offset_backed_up_one = m_current_offset;
     if (decr_pc_and_recompute_addr_range &&
-        m_current_offset_backed_up_one > 0) {
-      m_current_offset_backed_up_one--;
+        m_current_offset_backed_up_one != 0) {
+      --*m_current_offset_backed_up_one;
       if (m_sym_ctx_valid) {
         m_current_pc.SetLoadAddress(pc - 1, &process->GetTarget());
       }
     }
   } else {
     m_start_pc = m_current_pc;
-    m_current_offset = -1;
-    m_current_offset_backed_up_one = -1;
+    m_current_offset = std::nullopt;
+    m_current_offset_backed_up_one = std::nullopt;
   }
 
   if (IsTrapHandlerSymbol(process, m_sym_ctx)) {
@@ -746,7 +747,7 @@ bool RegisterContextUnwind::BehavesLikeZerothFrame() const {
 // 2. m_sym_ctx should already be filled in, and
 // 3. m_current_pc should have the current pc value for this frame
 // 4. m_current_offset_backed_up_one should have the current byte offset into
-//    the function, maybe backed up by 1, -1 if unknown
+//    the function, maybe backed up by 1, std::nullopt if unknown
 
 UnwindPlanSP RegisterContextUnwind::GetFastUnwindPlanForFrame() {
   UnwindPlanSP unwind_plan_sp;
@@ -790,7 +791,7 @@ UnwindPlanSP RegisterContextUnwind::GetFastUnwindPlanForFrame() {
 // 2. m_sym_ctx should already be filled in, and
 // 3. m_current_pc should have the current pc value for this frame
 // 4. m_current_offset_backed_up_one should have the current byte offset into
-//    the function, maybe backed up by 1, -1 if unknown
+//    the function, maybe backed up by 1, std::nullopt if unknown
 
 UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() {
   UnwindPlanSP unwind_plan_sp;

From 0ec94983c4cf9183f0768d6e76b363e5ebc6b255 Mon Sep 17 00:00:00 2001
From: Zhaoxin Yang
Date: Mon, 31 Mar 2025 17:47:50 +0800
Subject: [PATCH 0078/1029] [lld][LoongArch] Relax TLSDESC code sequence
 (#123677)

Relax the TLSDESC code sequence.

Original code sequence:
* pcalau12i $a0, %desc_pc_hi20(sym_desc)
* addi.d    $a0, $a0, %desc_pc_lo12(sym_desc)
* ld.d      $ra, $a0, %desc_ld(sym_desc)
* jirl      $ra, $ra, %desc_call(sym_desc)

This sequence cannot be converted to LE/IE yet, but it can be relaxed to:
* pcaddi    $a0, %desc_pcrel_20(sym_desc)
* ld.d      $ra, $a0, %desc_ld(sym_desc)
* jirl      $ra, $ra, %desc_call(sym_desc)

TODO: The transition from TLSDESC GD/LD to IE/LE will be implemented in a
future patch.
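As an encoding-level illustration of the rewrite, a minimal standalone
sketch (the relaxToPcaddi name and the 0x18000000 PCADDI opcode with
`op | si20 << 5 | rd` layout are assumptions made here for exposition;
the in-tree logic lives in relaxPCHi20Lo12 and finalizeRelax below):

#include <cstdint>
#include <optional>

constexpr uint32_t PCADDI = 0x18000000; // assumed opcode; op | si20 << 5 | rd

// Folds `pcalau12i rd, %hi` + `addi.d rd, rd, %lo` into one `pcaddi rd, si20`
// when the byte displacement to the TLS descriptor's GOT slot fits the
// signed 20-bit word offset.
std::optional<uint32_t> relaxToPcaddi(uint64_t pc, uint64_t gotSlot,
                                      uint32_t rd) {
  int64_t delta = static_cast<int64_t>(gotSlot - pc);
  if (delta % 4 != 0 || delta < -(1 << 21) || delta >= (1 << 21))
    return std::nullopt; // misaligned or out of pcaddi range
  uint32_t si20 = static_cast<uint32_t>(delta >> 2) & 0xfffff;
  return PCADDI | (si20 << 5) | (rd & 0x1f);
}

The test below exercises exactly this: %desc_pcrel_20(d) resolves to word
displacements such as 16445 and 16453 in the GD64 and IE64 checks.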
--- lld/ELF/Arch/LoongArch.cpp | 19 +- lld/test/ELF/loongarch-relax-tlsdesc.s | 280 +++++++++++++++++++++++++ 2 files changed, 297 insertions(+), 2 deletions(-) create mode 100644 lld/test/ELF/loongarch-relax-tlsdesc.s diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 709b31ed4e01a..4edc625b05cb0 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -766,9 +766,12 @@ static bool isPairRelaxable(ArrayRef relocs, size_t i) { // Relax code sequence. // From: // pcalau12i $a0, %pc_hi20(sym) | %ld_pc_hi20(sym) | %gd_pc_hi20(sym) +// | %desc_pc_hi20(sym) // addi.w/d $a0, $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym) +// | %desc_pc_lo12(sym) // To: -// pcaddi $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym) +// pcaddi $a0, %pc_lo12(sym) | %got_pc_lo12(sym) | %got_pc_lo12(sym) +// | %desc_pcrel_20(sym) // // From: // pcalau12i $a0, %got_pc_hi20(sym_got) @@ -786,7 +789,9 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, (rHi20.type == R_LARCH_TLS_GD_PC_HI20 && rLo12.type == R_LARCH_GOT_PC_LO12) || (rHi20.type == R_LARCH_TLS_LD_PC_HI20 && - rLo12.type == R_LARCH_GOT_PC_LO12))) + rLo12.type == R_LARCH_GOT_PC_LO12) || + (rHi20.type == R_LARCH_TLS_DESC_PC_HI20 && + rLo12.type == R_LARCH_TLS_DESC_PC_LO12))) return; // GOT references to absolute symbols can't be relaxed to use pcaddi in @@ -808,6 +813,8 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, dest = rHi20.sym->getVA(ctx); else if (rHi20.expr == RE_LOONGARCH_TLSGD_PAGE_PC) dest = ctx.in.got->getGlobalDynAddr(*rHi20.sym); + else if (rHi20.expr == RE_LOONGARCH_TLSDESC_PAGE_PC) + dest = ctx.in.got->getTlsDescAddr(*rHi20.sym); else { Err(ctx) << getErrorLoc(ctx, (const uint8_t *)loc) << "unknown expr (" << rHi20.expr << ") against symbol " << rHi20.sym @@ -841,6 +848,8 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i, sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_GD_PCREL20_S2; else if (rHi20.type == R_LARCH_TLS_LD_PC_HI20) sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_LD_PCREL20_S2; + else if (rHi20.type == R_LARCH_TLS_DESC_PC_HI20) + sec.relaxAux->relocTypes[i + 2] = R_LARCH_TLS_DESC_PCREL20_S2; else sec.relaxAux->relocTypes[i + 2] = R_LARCH_PCREL20_S2; sec.relaxAux->writes.push_back(insn(PCADDI, getD5(nextInsn), 0, 0)); @@ -947,6 +956,7 @@ static bool relax(Ctx &ctx, InputSection &sec) { case R_LARCH_GOT_PC_HI20: case R_LARCH_TLS_GD_PC_HI20: case R_LARCH_TLS_LD_PC_HI20: + case R_LARCH_TLS_DESC_PC_HI20: // The overflow check for i+2 will be carried out in isPairRelaxable. 
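       // (Pairing sketch added for readers of this patch: relocs[i] is the
       // hi20 reloc and relocs[i + 2] the matching lo12 reloc on the next
       // instruction; each half counts as relaxable only when it is followed
       // by its own R_LARCH_RELAX marker at i + 1 / i + 3.)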
if (isPairRelaxable(relocs, i)) relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove); @@ -1081,6 +1091,11 @@ void LoongArch::finalizeRelax(int passes) const { write32le(p, aux.writes[writesIdx++]); r.expr = R_TLSGD_PC; break; + case R_LARCH_TLS_DESC_PCREL20_S2: + skip = 4; + write32le(p, aux.writes[writesIdx++]); + r.expr = R_TLSDESC_PC; + break; default: llvm_unreachable("unsupported type"); } diff --git a/lld/test/ELF/loongarch-relax-tlsdesc.s b/lld/test/ELF/loongarch-relax-tlsdesc.s new file mode 100644 index 0000000000000..9ce7c5881ca96 --- /dev/null +++ b/lld/test/ELF/loongarch-relax-tlsdesc.s @@ -0,0 +1,280 @@ +# REQUIRES: loongarch +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=loongarch64 -mattr=+relax a.s -o a.64.o +# RUN: llvm-mc -filetype=obj -triple=loongarch64 -mattr=+relax c.s -o c.64.o +# RUN: ld.lld --relax -shared -soname=c.64.so c.64.o -o c.64.so + +## Test the TLSDESC relaxation. +# RUN: ld.lld --relax -shared -z now a.64.o c.64.o -o a.64.so +# RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -dr -h a.64.so | FileCheck %s --check-prefix=GD64 + +## FIXME: The transition from TLSDESC to IE/LE has not yet been implemented. +## Keep the dynamic relocations and hand them over to dynamic linker. + +# RUN: ld.lld --relax -e 0 -z now a.64.o c.64.o -o a.64.le +# RUN: llvm-readobj -r -x .got a.64.le | FileCheck --check-prefix=LE64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -d -h a.64.le | FileCheck %s --check-prefix=LE64 + +# RUN: ld.lld --no-relax -e 0 -z now a.64.o c.64.o -o a.64.le.norelax +# RUN: llvm-objdump --no-show-raw-insn -d -h a.64.le.norelax | FileCheck %s --check-prefix=LE64-NORELAX + +# RUN: ld.lld --relax -e 0 -z now a.64.o c.64.so -o a.64.ie +# RUN: llvm-readobj -r -x .got a.64.ie | FileCheck --check-prefix=IE64-RELA %s +# RUN: llvm-objdump --no-show-raw-insn -d -h a.64.ie | FileCheck %s --check-prefix=IE64 + +# RUN: ld.lld --no-relax -e 0 -z now a.64.o c.64.so -o a.64.ie.norelax +# RUN: llvm-objdump --no-show-raw-insn -d -h a.64.ie.norelax | FileCheck %s --check-prefix=IE64-NORELAX + +# GD64-RELA: .rela.dyn { +# GD64-RELA-NEXT: 0x20460 R_LARCH_TLS_DESC64 - 0x7FF +# GD64-RELA-NEXT: 0x20430 R_LARCH_TLS_DESC64 a 0x0 +# GD64-RELA-NEXT: 0x20440 R_LARCH_TLS_DESC64 c 0x0 +# GD64-RELA-NEXT: 0x20450 R_LARCH_TLS_DESC64 d 0x0 +# GD64-RELA-NEXT: } +# GD64-RELA: Hex dump of section '.got': +# GD64-RELA-NEXT: 0x00020430 00000000 00000000 00000000 00000000 . +# GD64-RELA-NEXT: 0x00020440 00000000 00000000 00000000 00000000 . +# GD64-RELA-NEXT: 0x00020450 00000000 00000000 00000000 00000000 . +# GD64-RELA-NEXT: 0x00020460 00000000 00000000 00000000 00000000 . + +# GD64: .got 00000040 0000000000020430 + +## &.got[a]-. = 0x20430 - 0x10318 = 16454<<2 +# GD64: 10318: pcaddi $a0, 16454 +# GD64-NEXT: ld.d $ra, $a0, 0 +# GD64-NEXT: jirl $ra, $ra, 0 +# GD64-NEXT: add.d $a1, $a0, $tp + +## &.got[b]-. = 0x20430+48 - 0x10328: 0x10 pages, page offset 0x460 +## R_LARCH_RELAX does not appear in pairs. No relaxation. +# GD64: 10328: pcalau12i $a0, 16 +# GD64-NEXT: addi.d $a0, $a0, 1120 +# GD64-NEXT: ld.d $ra, $a0, 0 +# GD64-NEXT: jirl $ra, $ra, 0 +# GD64-NEXT: add.d $a2, $a0, $tp + +## &.got[c]-. = 0x20430+16 - 0x1033c: 0x10 pages, page offset 0x440 +## Without R_LARCH_RELAX relocation. No relaxation. 
+# GD64: 1033c: pcalau12i $a0, 16 +# GD64-NEXT: addi.d $t0, $zero, 0 +# GD64-NEXT: addi.d $a0, $a0, 1088 +# GD64-NEXT: addi.d $t0, $t0, 1 +# GD64-NEXT: ld.d $ra, $a0, 0 +# GD64-NEXT: addi.d $t0, $t0, 1 +# GD64-NEXT: jirl $ra, $ra, 0 +# GD64-NEXT: add.d $a3, $a0, $tp + +## &.got[d]-. = 0x20430+32 - 0x1035c = 16445<<2 +# GD64: 1035c: pcaddi $a0, 16445 +# GD64-NEXT: ld.d $ra, $a0, 0 +# GD64-NEXT: jirl $ra, $ra, 0 +# GD64-NEXT: add.d $a4, $a0, $tp + +# LE64-RELA: .rela.dyn { +# LE64-RELA-NEXT: 0x30280 R_LARCH_TLS_DESC64 - 0x8 +# LE64-RELA-NEXT: 0x30290 R_LARCH_TLS_DESC64 - 0x800 +# LE64-RELA-NEXT: 0x302A0 R_LARCH_TLS_DESC64 - 0x1000 +# LE64-RELA-NEXT: 0x302B0 R_LARCH_TLS_DESC64 - 0x7FF +# LE64-RELA-NEXT: } +# LE64-RELA: Hex dump of section '.got': +# LE64-RELA-NEXT: 0x00030280 00000000 00000000 00000000 00000000 . +# LE64-RELA-NEXT: 0x00030290 00000000 00000000 00000000 00000000 . +# LE64-RELA-NEXT: 0x000302a0 00000000 00000000 00000000 00000000 . +# LE64-RELA-NEXT: 0x000302b0 00000000 00000000 00000000 00000000 . + +# LE64: .got 00000040 0000000000030280 + +## &.got[a]-. = 0x30280 - 0x20228 = 16406<<2 +# LE64: 20228: pcaddi $a0, 16406 +# LE64-NEXT: ld.d $ra, $a0, 0 +# LE64-NEXT: jirl $ra, $ra, 0 +# LE64-NEXT: add.d $a1, $a0, $tp + +## &.got[b]-. = 0x30280+48 - 0x20238: 0x10 pages, page offset 0x2b0 +## R_LARCH_RELAX does not appear in pairs. No relaxation. +# LE64: 20238: pcalau12i $a0, 16 +# LE64-NEXT: addi.d $a0, $a0, 688 +# LE64-NEXT: ld.d $ra, $a0, 0 +# LE64-NEXT: jirl $ra, $ra, 0 +# LE64-NEXT: add.d $a2, $a0, $tp + +## &.got[c]-. = 0x30280+16 - 0x2024c: 0x10 pages, page offset 0x290 +## Without R_LARCH_RELAX relocation. No relaxation. +# LE64: 2024c: pcalau12i $a0, 16 +# LE64-NEXT: addi.d $t0, $zero, 0 +# LE64-NEXT: addi.d $a0, $a0, 656 +# LE64-NEXT: addi.d $t0, $t0, 1 +# LE64-NEXT: ld.d $ra, $a0, 0 +# LE64-NEXT: addi.d $t0, $t0, 1 +# LE64-NEXT: jirl $ra, $ra, 0 +# LE64-NEXT: add.d $a3, $a0, $tp + +## &.got[d]-. = 0x30280+32 - 0x2026c = 16397<<2 +# LE64: 2026c: pcaddi $a0, 16397 +# LE64-NEXT: ld.d $ra, $a0, 0 +# LE64-NEXT: jirl $ra, $ra, 0 +# LE64-NEXT: add.d $a4, $a0, $tp + +# LE64-NORELAX: .got 00000040 0000000000030288 + +## &.got[a]-. = 0x30288 - 0x20228 = 0x10 pages, page offset 0x288 +# LE64-NORELAX: 20228: pcalau12i $a0, 16 +# LE64-NORELAX-NEXT: addi.d $a0, $a0, 648 +# LE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# LE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# LE64-NORELAX-NEXT: add.d $a1, $a0, $tp + +## &.got[b]-. = 0x30288+48 - 0x2023c: 0x10 pages, page offset 0x2b8 +## R_LARCH_RELAX does not appear in pairs. No relaxation. +# LE64-NORELAX: 2023c: pcalau12i $a0, 16 +# LE64-NORELAX-NEXT: addi.d $a0, $a0, 696 +# LE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# LE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# LE64-NORELAX-NEXT: add.d $a2, $a0, $tp + +## &.got[c]-. = 0x30288+16 - 0x20250: 0x10 pages, page offset 0x298 +## Without R_LARCH_RELAX relocation. No relaxation. +# LE64-NORELAX: 20250: pcalau12i $a0, 16 +# LE64-NORELAX-NEXT: addi.d $t0, $zero, 0 +# LE64-NORELAX-NEXT: addi.d $a0, $a0, 664 +# LE64-NORELAX-NEXT: addi.d $t0, $t0, 1 +# LE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# LE64-NORELAX-NEXT: addi.d $t0, $t0, 1 +# LE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# LE64-NORELAX-NEXT: add.d $a3, $a0, $tp + +## &.got[d]-. 
= 0x30288+32 - 0x20270: 0x10 pages, page offset 0x2a8 +# LE64-NORELAX: 20270: pcalau12i $a0, 16 +# LE64-NORELAX-NEXT: addi.d $a0, $a0, 680 +# LE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# LE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# LE64-NORELAX-NEXT: add.d $a4, $a0, $tp + +# IE64-RELA: .rela.dyn { +# IE64-RELA-NEXT: 0x30430 R_LARCH_TLS_DESC64 - 0x8 +# IE64-RELA-NEXT: 0x30460 R_LARCH_TLS_DESC64 - 0x7FF +# IE64-RELA-NEXT: 0x30440 R_LARCH_TLS_DESC64 c 0x0 +# IE64-RELA-NEXT: 0x30450 R_LARCH_TLS_DESC64 d 0x0 +# IE64-RELA-NEXT: } +# IE64-RELA: Hex dump of section '.got': +# IE64-RELA-NEXT: 0x00030430 00000000 00000000 00000000 00000000 . +# IE64-RELA-NEXT: 0x00030440 00000000 00000000 00000000 00000000 . +# IE64-RELA-NEXT: 0x00030450 00000000 00000000 00000000 00000000 . +# IE64-RELA-NEXT: 0x00030460 00000000 00000000 00000000 00000000 . + +# IE64: .got 00000040 0000000000030430 + +## a and b are optimized to use LE. c and d are optimized to IE. +## &.got[a]-. = 0x30430 - 0x202f8 = 16462<<2 +# IE64: 202f8: pcaddi $a0, 16462 +# IE64-NEXT: ld.d $ra, $a0, 0 +# IE64-NEXT: jirl $ra, $ra, 0 +# IE64-NEXT: add.d $a1, $a0, $tp + +## &.got[b]-. = 0x30430+48 - 0x20308: 0x10 pages, page offset 0x460 +## R_LARCH_RELAX does not appear in pairs. No relaxation. +# IE64: 20308: pcalau12i $a0, 16 +# IE64-NEXT: addi.d $a0, $a0, 1120 +# IE64-NEXT: ld.d $ra, $a0, 0 +# IE64-NEXT: jirl $ra, $ra, 0 +# IE64-NEXT: add.d $a2, $a0, $tp + +## &.got[c]-. = 0x30430+16 - 0x2031c: 0x10 pages, page offset 0x440 +## Without R_LARCH_RELAX relocation. No relaxation. +# IE64: 2031c: pcalau12i $a0, 16 +# IE64-NEXT: addi.d $t0, $zero, 0 +# IE64-NEXT: addi.d $a0, $a0, 1088 +# IE64-NEXT: addi.d $t0, $t0, 1 +# IE64-NEXT: ld.d $ra, $a0, 0 +# IE64-NEXT: addi.d $t0, $t0, 1 +# IE64-NEXT: jirl $ra, $ra, 0 +# IE64-NEXT: add.d $a3, $a0, $tp + +## &.got[d]-. = 0x30430+32 - 0x2033c = 16453<<2 +# IE64: 2033c: pcaddi $a0, 16453 +# IE64-NEXT: ld.d $ra, $a0, 0 +# IE64-NEXT: jirl $ra, $ra, 0 +# IE64-NEXT: add.d $a4, $a0, $tp + +# IE64-NORELAX: .got 00000040 0000000000030438 + +## &.got[a]-. = 0x30438 - 0x202f8 = 0x10 pages, page offset 0x438 +# IE64-NORELAX: 202f8: pcalau12i $a0, 16 +# IE64-NORELAX-NEXT: addi.d $a0, $a0, 1080 +# IE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# IE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# IE64-NORELAX-NEXT: add.d $a1, $a0, $tp + +## &.got[b]-. = 0x30438+48 - 0x2030c: 0x10 pages, page offset 0x468 +## R_LARCH_RELAX does not appear in pairs. No relaxation. +# IE64-NORELAX: 2030c: pcalau12i $a0, 16 +# IE64-NORELAX-NEXT: addi.d $a0, $a0, 1128 +# IE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# IE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# IE64-NORELAX-NEXT: add.d $a2, $a0, $tp + +## &.got[c]-. = 0x30438+16 - 0x20320: 0x10 pages, page offset 0x448 +## Without R_LARCH_RELAX relocation. No relaxation. +# IE64-NORELAX: 20320: pcalau12i $a0, 16 +# IE64-NORELAX-NEXT: addi.d $t0, $zero, 0 +# IE64-NORELAX-NEXT: addi.d $a0, $a0, 1096 +# IE64-NORELAX-NEXT: addi.d $t0, $t0, 1 +# IE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# IE64-NORELAX-NEXT: addi.d $t0, $t0, 1 +# IE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# IE64-NORELAX-NEXT: add.d $a3, $a0, $tp + +## &.got[d]-. = 0x30438+32 - 0x20340: 0x10 pages, page offset 0x458 +# IE64-NORELAX: 20340: pcalau12i $a0, 16 +# IE64-NORELAX-NEXT: addi.d $a0, $a0, 1112 +# IE64-NORELAX-NEXT: ld.d $ra, $a0, 0 +# IE64-NORELAX-NEXT: jirl $ra, $ra, 0 +# IE64-NORELAX-NEXT: add.d $a4, $a0, $tp + +#--- a.s +la.tls.desc $a0, a +add.d $a1, $a0, $tp + +# ADDI.D does not have R_LARCH_RELAX. No relaxation. 
+pcalau12i $a0, %desc_pc_hi20(b) +.reloc .-4, R_LARCH_RELAX, 0 +addi.d $a0, $a0, %desc_pc_lo12(b) +ld.d $ra, $a0, %desc_ld(b) +jirl $ra, $ra, %desc_call(b) +add.d $a2, $a0, $tp + +# TLSDESC to LE. No relaxation. +pcalau12i $a0, %desc_pc_hi20(c) +addi.d $t0, $zero, 0 +addi.d $a0, $a0, %desc_pc_lo12(c) +addi.d $t0, $t0, 1 +ld.d $ra, $a0, %desc_ld(c) +addi.d $t0, $t0, 1 +jirl $ra, $ra, %desc_call(c) +add.d $a3, $a0, $tp + +# PCALAU12I and ADDI.D have R_LARCH_RELAX. We perform relaxation. +pcalau12i $a0, %desc_pc_hi20(d) +.reloc .-4, R_LARCH_RELAX, 0 +addi.d $a0, $a0, %desc_pc_lo12(d) +.reloc .-4, R_LARCH_RELAX, 0 +ld.d $ra, $a0, %desc_ld(d) +jirl $ra, $ra, %desc_call(d) +add.d $a4, $a0, $tp + +.section .tbss,"awT",@nobits +.globl a +.zero 8 +a: +.zero 2039 ## Place b at 0x7ff +b: +.zero 1 + +#--- c.s +.section .tbss,"awT",@nobits +.globl c, d +c: +.zero 2048 ## Place d at 0x1000 +d: +.zero 4 From 1dee12531d6070170ed1e90a654b744d365cc56e Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Mon, 31 Mar 2025 12:51:45 +0200 Subject: [PATCH 0079/1029] [mlir][mpi] Lowering MPI_Allreduce (#133133) Lowering of mpi.all_reduce to LLVM function call --- mlir/include/mlir/Dialect/MPI/IR/MPI.td | 5 - mlir/include/mlir/Dialect/MPI/IR/MPIOps.td | 2 +- mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp | 223 ++++++++++++++++-- .../MPIToLLVM/{ops.mlir => mpitollvm.mlir} | 42 +++- .../Dialect/MPI/{ops.mlir => mpiops.mlir} | 8 +- 5 files changed, 243 insertions(+), 37 deletions(-) rename mlir/test/Conversion/MPIToLLVM/{ops.mlir => mpitollvm.mlir} (78%) rename mlir/test/Dialect/MPI/{ops.mlir => mpiops.mlir} (88%) diff --git a/mlir/include/mlir/Dialect/MPI/IR/MPI.td b/mlir/include/mlir/Dialect/MPI/IR/MPI.td index 7c84443e5520d..f2837e71df060 100644 --- a/mlir/include/mlir/Dialect/MPI/IR/MPI.td +++ b/mlir/include/mlir/Dialect/MPI/IR/MPI.td @@ -246,12 +246,7 @@ def MPI_OpClassEnum : I32EnumAttr<"MPI_OpClassEnum", "MPI operation class", [ MPI_OpMaxloc, MPI_OpReplace ]> { - let genSpecializedAttr = 0; let cppNamespace = "::mlir::mpi"; } -def MPI_OpClassAttr : EnumAttr { - let assemblyFormat = "`<` $value `>`"; -} - #endif // MLIR_DIALECT_MPI_IR_MPI_TD diff --git a/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td b/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td index db28bd09678f8..a8267b115b9e6 100644 --- a/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td +++ b/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td @@ -244,7 +244,7 @@ def MPI_AllReduceOp : MPI_Op<"allreduce", []> { let arguments = ( ins AnyMemRef : $sendbuf, AnyMemRef : $recvbuf, - MPI_OpClassAttr : $op + MPI_OpClassEnum : $op ); let results = (outs Optional:$retval); diff --git a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp index d91f9512ccb8f..4e0f59305a647 100644 --- a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp +++ b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp @@ -47,6 +47,22 @@ static LLVM::LLVMFuncOp getOrDefineFunction(ModuleOp &moduleOp, moduleOp, loc, rewriter, name, name, type, LLVM::Linkage::External); } +std::pair getRawPtrAndSize(const Location loc, + ConversionPatternRewriter &rewriter, + Value memRef, Type elType) { + Type ptrType = LLVM::LLVMPointerType::get(rewriter.getContext()); + Value dataPtr = + rewriter.create(loc, ptrType, memRef, 1); + Value offset = rewriter.create( + loc, rewriter.getI64Type(), memRef, 2); + Value resPtr = + rewriter.create(loc, ptrType, elType, dataPtr, offset); + Value size = rewriter.create(loc, memRef, + ArrayRef{3, 0}); + size = rewriter.create(loc, rewriter.getI32Type(), size); + return 
{resPtr, size}; +} + /// When lowering the mpi dialect to functions calls certain details /// differ between various MPI implementations. This class will provide /// these in a generic way, depending on the MPI implementation that got @@ -77,6 +93,12 @@ class MPIImplTraits { /// type. virtual Value getDataType(const Location loc, ConversionPatternRewriter &rewriter, Type type) = 0; + + /// Gets or creates an MPI_Op value which corresponds to the given + /// enum value. + virtual Value getMPIOp(const Location loc, + ConversionPatternRewriter &rewriter, + mpi::MPI_OpClassEnum opAttr) = 0; }; //===----------------------------------------------------------------------===// @@ -94,6 +116,20 @@ class MPICHImplTraits : public MPIImplTraits { static constexpr int MPI_UINT16_T = 0x4c00023c; static constexpr int MPI_UINT32_T = 0x4c00043d; static constexpr int MPI_UINT64_T = 0x4c00083e; + static constexpr int MPI_MAX = 0x58000001; + static constexpr int MPI_MIN = 0x58000002; + static constexpr int MPI_SUM = 0x58000003; + static constexpr int MPI_PROD = 0x58000004; + static constexpr int MPI_LAND = 0x58000005; + static constexpr int MPI_BAND = 0x58000006; + static constexpr int MPI_LOR = 0x58000007; + static constexpr int MPI_BOR = 0x58000008; + static constexpr int MPI_LXOR = 0x58000009; + static constexpr int MPI_BXOR = 0x5800000a; + static constexpr int MPI_MINLOC = 0x5800000b; + static constexpr int MPI_MAXLOC = 0x5800000c; + static constexpr int MPI_REPLACE = 0x5800000d; + static constexpr int MPI_NO_OP = 0x5800000e; public: using MPIImplTraits::MPIImplTraits; @@ -136,6 +172,56 @@ class MPICHImplTraits : public MPIImplTraits { assert(false && "unsupported type"); return rewriter.create(loc, rewriter.getI32Type(), mtype); } + + Value getMPIOp(const Location loc, ConversionPatternRewriter &rewriter, + mpi::MPI_OpClassEnum opAttr) override { + int32_t op = MPI_NO_OP; + switch (opAttr) { + case mpi::MPI_OpClassEnum::MPI_OP_NULL: + op = MPI_NO_OP; + break; + case mpi::MPI_OpClassEnum::MPI_MAX: + op = MPI_MAX; + break; + case mpi::MPI_OpClassEnum::MPI_MIN: + op = MPI_MIN; + break; + case mpi::MPI_OpClassEnum::MPI_SUM: + op = MPI_SUM; + break; + case mpi::MPI_OpClassEnum::MPI_PROD: + op = MPI_PROD; + break; + case mpi::MPI_OpClassEnum::MPI_LAND: + op = MPI_LAND; + break; + case mpi::MPI_OpClassEnum::MPI_BAND: + op = MPI_BAND; + break; + case mpi::MPI_OpClassEnum::MPI_LOR: + op = MPI_LOR; + break; + case mpi::MPI_OpClassEnum::MPI_BOR: + op = MPI_BOR; + break; + case mpi::MPI_OpClassEnum::MPI_LXOR: + op = MPI_LXOR; + break; + case mpi::MPI_OpClassEnum::MPI_BXOR: + op = MPI_BXOR; + break; + case mpi::MPI_OpClassEnum::MPI_MINLOC: + op = MPI_MINLOC; + break; + case mpi::MPI_OpClassEnum::MPI_MAXLOC: + op = MPI_MAXLOC; + break; + case mpi::MPI_OpClassEnum::MPI_REPLACE: + op = MPI_REPLACE; + break; + } + return rewriter.create(loc, rewriter.getI32Type(), op); + } }; //===----------------------------------------------------------------------===// @@ -205,15 +291,74 @@ class OMPIImplTraits : public MPIImplTraits { auto context = rewriter.getContext(); // get external opaque struct pointer type - auto commStructT = + auto typeStructT = LLVM::LLVMStructType::getOpaque("ompi_predefined_datatype_t", context); // make sure global op definition exists - getOrDefineExternalStruct(loc, rewriter, mtype, commStructT); + getOrDefineExternalStruct(loc, rewriter, mtype, typeStructT); // get address of symbol return rewriter.create( loc, LLVM::LLVMPointerType::get(context), SymbolRefAttr::get(context, mtype)); } + + Value 
getMPIOp(const Location loc, ConversionPatternRewriter &rewriter, + mpi::MPI_OpClassEnum opAttr) override { + StringRef op; + switch (opAttr) { + case mpi::MPI_OpClassEnum::MPI_OP_NULL: + op = "ompi_mpi_no_op"; + break; + case mpi::MPI_OpClassEnum::MPI_MAX: + op = "ompi_mpi_max"; + break; + case mpi::MPI_OpClassEnum::MPI_MIN: + op = "ompi_mpi_min"; + break; + case mpi::MPI_OpClassEnum::MPI_SUM: + op = "ompi_mpi_sum"; + break; + case mpi::MPI_OpClassEnum::MPI_PROD: + op = "ompi_mpi_prod"; + break; + case mpi::MPI_OpClassEnum::MPI_LAND: + op = "ompi_mpi_land"; + break; + case mpi::MPI_OpClassEnum::MPI_BAND: + op = "ompi_mpi_band"; + break; + case mpi::MPI_OpClassEnum::MPI_LOR: + op = "ompi_mpi_lor"; + break; + case mpi::MPI_OpClassEnum::MPI_BOR: + op = "ompi_mpi_bor"; + break; + case mpi::MPI_OpClassEnum::MPI_LXOR: + op = "ompi_mpi_lxor"; + break; + case mpi::MPI_OpClassEnum::MPI_BXOR: + op = "ompi_mpi_bxor"; + break; + case mpi::MPI_OpClassEnum::MPI_MINLOC: + op = "ompi_mpi_minloc"; + break; + case mpi::MPI_OpClassEnum::MPI_MAXLOC: + op = "ompi_mpi_maxloc"; + break; + case mpi::MPI_OpClassEnum::MPI_REPLACE: + op = "ompi_mpi_replace"; + break; + } + auto context = rewriter.getContext(); + // get external opaque struct pointer type + auto opStructT = + LLVM::LLVMStructType::getOpaque("ompi_predefined_op_t", context); + // make sure global op definition exists + getOrDefineExternalStruct(loc, rewriter, op, opStructT); + // get address of symbol + return rewriter.create( + loc, LLVM::LLVMPointerType::get(context), + SymbolRefAttr::get(context, op)); + } }; std::unique_ptr MPIImplTraits::get(ModuleOp &moduleOp) { @@ -365,8 +510,6 @@ struct SendOpLowering : public ConvertOpToLLVMPattern { Location loc = op.getLoc(); MLIRContext *context = rewriter.getContext(); Type i32 = rewriter.getI32Type(); - Type i64 = rewriter.getI64Type(); - Value memRef = adaptor.getRef(); Type elemType = op.getRef().getType().getElementType(); // ptrType `!llvm.ptr` @@ -376,14 +519,8 @@ struct SendOpLowering : public ConvertOpToLLVMPattern { auto moduleOp = op->getParentOfType(); // get MPI_COMM_WORLD, dataType and pointer - Value dataPtr = - rewriter.create(loc, ptrType, memRef, 1); - Value offset = rewriter.create(loc, i64, memRef, 2); - dataPtr = - rewriter.create(loc, ptrType, elemType, dataPtr, offset); - Value size = rewriter.create(loc, memRef, - ArrayRef{3, 0}); - size = rewriter.create(loc, i32, size); + auto [dataPtr, size] = + getRawPtrAndSize(loc, rewriter, adaptor.getRef(), elemType); auto mpiTraits = MPIImplTraits::get(moduleOp); Value dataType = mpiTraits->getDataType(loc, rewriter, elemType); Value commWorld = mpiTraits->getCommWorld(loc, rewriter); @@ -425,7 +562,6 @@ struct RecvOpLowering : public ConvertOpToLLVMPattern { MLIRContext *context = rewriter.getContext(); Type i32 = rewriter.getI32Type(); Type i64 = rewriter.getI64Type(); - Value memRef = adaptor.getRef(); Type elemType = op.getRef().getType().getElementType(); // ptrType `!llvm.ptr` @@ -435,14 +571,8 @@ struct RecvOpLowering : public ConvertOpToLLVMPattern { auto moduleOp = op->getParentOfType(); // get MPI_COMM_WORLD, dataType, status_ignore and pointer - Value dataPtr = - rewriter.create(loc, ptrType, memRef, 1); - Value offset = rewriter.create(loc, i64, memRef, 2); - dataPtr = - rewriter.create(loc, ptrType, elemType, dataPtr, offset); - Value size = rewriter.create(loc, memRef, - ArrayRef{3, 0}); - size = rewriter.create(loc, i32, size); + auto [dataPtr, size] = + getRawPtrAndSize(loc, rewriter, adaptor.getRef(), elemType); auto 
     auto mpiTraits = MPIImplTraits::get(moduleOp);
     Value dataType = mpiTraits->getDataType(loc, rewriter, elemType);
     Value commWorld = mpiTraits->getCommWorld(loc, rewriter);
@@ -474,6 +604,55 @@ struct RecvOpLowering : public ConvertOpToLLVMPattern<mpi::RecvOp> {
   }
 };
 
+//===----------------------------------------------------------------------===//
+// AllReduceOpLowering
+//===----------------------------------------------------------------------===//
+
+struct AllReduceOpLowering : public ConvertOpToLLVMPattern<mpi::AllReduceOp> {
+  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(mpi::AllReduceOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op.getLoc();
+    MLIRContext *context = rewriter.getContext();
+    Type i32 = rewriter.getI32Type();
+    Type elemType = op.getSendbuf().getType().getElementType();
+
+    // ptrType `!llvm.ptr`
+    Type ptrType = LLVM::LLVMPointerType::get(context);
+    auto moduleOp = op->getParentOfType<ModuleOp>();
+    auto mpiTraits = MPIImplTraits::get(moduleOp);
+    auto [sendPtr, sendSize] =
+        getRawPtrAndSize(loc, rewriter, adaptor.getSendbuf(), elemType);
+    auto [recvPtr, recvSize] =
+        getRawPtrAndSize(loc, rewriter, adaptor.getRecvbuf(), elemType);
+    Value dataType = mpiTraits->getDataType(loc, rewriter, elemType);
+    Value mpiOp = mpiTraits->getMPIOp(loc, rewriter, op.getOp());
+    Value commWorld = mpiTraits->getCommWorld(loc, rewriter);
+    // 'int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+    //                    MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)'
+    auto funcType = LLVM::LLVMFunctionType::get(
+        i32, {ptrType, ptrType, i32, dataType.getType(), mpiOp.getType(),
+              commWorld.getType()});
+    // get or create function declaration:
+    LLVM::LLVMFuncOp funcDecl =
+        getOrDefineFunction(moduleOp, loc, rewriter, "MPI_Allreduce", funcType);
+
+    // replace op with function call
+    auto funcCall = rewriter.create<LLVM::CallOp>(
+        loc, funcDecl,
+        ValueRange{sendPtr, recvPtr, sendSize, dataType, mpiOp, commWorld});
+
+    if (op.getRetval())
+      rewriter.replaceOp(op, funcCall.getResult());
+    else
+      rewriter.eraseOp(op);
+
+    return success();
+  }
+};
+
 //===----------------------------------------------------------------------===//
 // ConvertToLLVMPatternInterface implementation
 //===----------------------------------------------------------------------===//
@@ -498,7 +677,7 @@ struct FuncToLLVMDialectInterface : public ConvertToLLVMPatternInterface {
 void mpi::populateMPIToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                               RewritePatternSet &patterns) {
   patterns.add<CommRankOpLowering, FinalizeOpLowering,
-               SendOpLowering, RecvOpLowering>(converter);
+               SendOpLowering, RecvOpLowering, AllReduceOpLowering>(converter);
 }
 
 void mpi::registerConvertMPIToLLVMInterface(DialectRegistry &registry) {
diff --git a/mlir/test/Conversion/MPIToLLVM/ops.mlir b/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir
similarity index 78%
rename from mlir/test/Conversion/MPIToLLVM/ops.mlir
rename to mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir
index 3c1b344efd50b..b630ce3a23f30 100644
--- a/mlir/test/Conversion/MPIToLLVM/ops.mlir
+++ b/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir
@@ -1,13 +1,13 @@
 // RUN: mlir-opt -split-input-file -convert-to-llvm %s | FileCheck %s
 
 // COM: Test MPICH ABI
-// CHECK: module attributes {mpi.dlti = #dlti.map<"MPI:Implementation" = "MPICH">} {
+// CHECK: module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} {
 // CHECK: llvm.func @MPI_Finalize() -> i32
 // CHECK: llvm.func @MPI_Recv(!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32
 // CHECK: llvm.func @MPI_Send(!llvm.ptr, i32, i32, i32, i32, i32) -> i32
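 // COM: Implementation note (illustrative; the values are taken from the
 // COM: allreduce checks below): MPICH encodes predefined MPI_Op handles as
 // COM: i32 magic numbers, so the MPI_SUM handle lowers to a plain constant:
 // COM:   %op = llvm.mlir.constant(1476395011 : i32) : i32
 // COM: OpenMPI instead models each predefined op as an external global of
 // COM: type !llvm.struct<"ompi_predefined_op_t", opaque>, so the same
 // COM: reduction takes the address of a symbol:
 // COM:   %op = llvm.mlir.addressof @ompi_mpi_sum : !llvm.ptr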
// CHECK: llvm.func @MPI_Comm_rank(i32, !llvm.ptr) -> i32 // CHECK: llvm.func @MPI_Init(!llvm.ptr, !llvm.ptr) -> i32 -module attributes { mpi.dlti = #dlti.map<"MPI:Implementation" = "MPICH"> } { +module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: llvm.func @mpi_test_mpich([[varg0:%.+]]: !llvm.ptr, [[varg1:%.+]]: !llvm.ptr, [[varg2:%.+]]: i64, [[varg3:%.+]]: i64, [[varg4:%.+]]: i64) { func.func @mpi_test_mpich(%arg0: memref<100xf32>) { @@ -73,7 +73,23 @@ module attributes { mpi.dlti = #dlti.map<"MPI:Implementation" = "MPICH"> } { // CHECK: [[v48:%.*]] = llvm.call @MPI_Recv([[v41]], [[v43]], [[v44]], [[v12]], [[v12]], [[v45]], [[v47]]) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32 %2 = mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval - // CHECK: [[v49:%.*]] = llvm.call @MPI_Finalize() : () -> i32 + // CHECK: [[v49:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v50:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v51:%.*]] = llvm.getelementptr [[v49]][[[v50]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + // CHECK: [[v52:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v53:%.*]] = llvm.trunc [[v52]] : i64 to i32 + // CHECK: [[v54:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v55:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v56:%.*]] = llvm.getelementptr [[v54]][[[v55]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + // CHECK: [[v57:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v58:%.*]] = llvm.trunc [[v57]] : i64 to i32 + // CHECK: [[v59:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 + // CHECK: [[v60:%.*]] = llvm.mlir.constant(1476395011 : i32) : i32 + // CHECK: [[v61:%.*]] = llvm.mlir.constant(1140850688 : i32) : i32 + // CHECK: [[v62:%.*]] = llvm.call @MPI_Allreduce([[v51]], [[v56]], [[v53]], [[v59]], [[v60]], [[v61]]) : (!llvm.ptr, !llvm.ptr, i32, i32, i32, i32) -> i32 + mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> + + // CHECK: llvm.call @MPI_Finalize() : () -> i32 %3 = mpi.finalize : !mpi.retval return @@ -83,7 +99,7 @@ module attributes { mpi.dlti = #dlti.map<"MPI:Implementation" = "MPICH"> } { // ----- // COM: Test OpenMPI ABI -// CHECK: module attributes {mpi.dlti = #dlti.map<"MPI:Implementation" = "OpenMPI">} { +// CHECK: module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI">} { // CHECK: llvm.func @MPI_Finalize() -> i32 // CHECK: llvm.func @MPI_Recv(!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr, !llvm.ptr) -> i32 // CHECK: llvm.func @MPI_Send(!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr) -> i32 @@ -91,7 +107,7 @@ module attributes { mpi.dlti = #dlti.map<"MPI:Implementation" = "MPICH"> } { // CHECK: llvm.func @MPI_Comm_rank(!llvm.ptr, !llvm.ptr) -> i32 // CHECK: llvm.mlir.global external @ompi_mpi_comm_world() {addr_space = 0 : i32} : !llvm.struct<"ompi_communicator_t", opaque> // CHECK: llvm.func @MPI_Init(!llvm.ptr, !llvm.ptr) -> i32 -module attributes { mpi.dlti = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { +module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: llvm.func @mpi_test_openmpi([[varg0:%.+]]: !llvm.ptr, [[varg1:%.+]]: 
!llvm.ptr, [[varg2:%.+]]: i64, [[varg3:%.+]]: i64, [[varg4:%.+]]: i64) { func.func @mpi_test_openmpi(%arg0: memref<100xf32>) { @@ -157,6 +173,22 @@ module attributes { mpi.dlti = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v48:%.*]] = llvm.call @MPI_Recv([[v41]], [[v43]], [[v44]], [[v12]], [[v12]], [[v45]], [[v47]]) : (!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr, !llvm.ptr) -> i32 %2 = mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval + // CHECK: [[v49:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v50:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v51:%.*]] = llvm.getelementptr [[v49]][[[v50]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + // CHECK: [[v52:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v53:%.*]] = llvm.trunc [[v52]] : i64 to i32 + // CHECK: [[v54:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v55:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v56:%.*]] = llvm.getelementptr [[v54]][[[v55]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + // CHECK: [[v57:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v58:%.*]] = llvm.trunc [[v57]] : i64 to i32 + // CHECK: [[v59:%.*]] = llvm.mlir.addressof @ompi_mpi_float : !llvm.ptr + // CHECK: [[v60:%.*]] = llvm.mlir.addressof @ompi_mpi_sum : !llvm.ptr + // CHECK: [[v61:%.*]] = llvm.mlir.addressof @ompi_mpi_comm_world : !llvm.ptr + // CHECK: [[v62:%.*]] = llvm.call @MPI_Allreduce([[v51]], [[v56]], [[v53]], [[v59]], [[v60]], [[v61]]) : (!llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 + mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> + // CHECK: [[v49:%.*]] = llvm.call @MPI_Finalize() : () -> i32 %3 = mpi.finalize : !mpi.retval diff --git a/mlir/test/Dialect/MPI/ops.mlir b/mlir/test/Dialect/MPI/mpiops.mlir similarity index 88% rename from mlir/test/Dialect/MPI/ops.mlir rename to mlir/test/Dialect/MPI/mpiops.mlir index f23a7e18a2ee9..fb4333611a246 100644 --- a/mlir/test/Dialect/MPI/ops.mlir +++ b/mlir/test/Dialect/MPI/mpiops.mlir @@ -48,11 +48,11 @@ func.func @mpi_test(%ref : memref<100xf32>) -> () { // CHECK-NEXT: %5 = mpi.barrier : !mpi.retval %err7 = mpi.barrier : !mpi.retval - // CHECK-NEXT: mpi.allreduce(%arg0, %arg0, ) : memref<100xf32>, memref<100xf32> - mpi.allreduce(%ref, %ref, ) : memref<100xf32>, memref<100xf32> + // CHECK-NEXT: mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> + mpi.allreduce(%ref, %ref, MPI_SUM) : memref<100xf32>, memref<100xf32> - // CHECK-NEXT: mpi.allreduce(%arg0, %arg0, ) : memref<100xf32>, memref<100xf32> -> !mpi.retval - %err8 = mpi.allreduce(%ref, %ref, ) : memref<100xf32>, memref<100xf32> -> !mpi.retval + // CHECK-NEXT: mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> -> !mpi.retval + %err8 = mpi.allreduce(%ref, %ref, MPI_SUM) : memref<100xf32>, memref<100xf32> -> !mpi.retval // CHECK-NEXT: %7 = mpi.finalize : !mpi.retval %rval = mpi.finalize : !mpi.retval From 606e0b4806b92481e8be35f4e0cfccd0074c6ee7 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Mon, 31 Mar 2025 12:56:09 +0200 Subject: [PATCH 0080/1029] [ARM64EC] Add support for function aliases on ARM64EC (#132295) Required for mingw-w64, 
which uses the alias attribute in its CRT.

Follows ARM64EC mangling rules by mangling the alias symbol and emitting
an unmangled anti-dependency alias.

Since metadata is not allowed on GlobalAlias objects, extend
arm64ec_unmangled_name to support multiple unmangled names and attach the
alias anti-dependency name to the target function's metadata.
---
 .../AArch64/AArch64Arm64ECCallLowering.cpp    | 37 ++++++++++++----
 llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 26 ++++++------
 llvm/test/CodeGen/AArch64/arm64ec-alias.ll    | 42 +++++++++++++++++++
 llvm/test/CodeGen/AArch64/dllexport.ll        | 16 +++----
 4 files changed, 92 insertions(+), 29 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/arm64ec-alias.ll

diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index 066d62b3d4b4b..9553a44fb317e 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -627,10 +627,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
       Function::Create(Arm64Ty, GlobalValue::WeakODRLinkage, 0, ThunkName, M);
   GuestExit->setComdat(M->getOrInsertComdat(ThunkName));
   GuestExit->setSection(".wowthk$aa");
-  GuestExit->setMetadata(
+  GuestExit->addMetadata(
       "arm64ec_unmangled_name",
-      MDNode::get(M->getContext(),
-                  MDString::get(M->getContext(), F->getName())));
+      *MDNode::get(M->getContext(),
+                   MDString::get(M->getContext(), F->getName())));
   GuestExit->setMetadata(
       "arm64ec_ecmangled_name",
       MDNode::get(M->getContext(),
@@ -803,6 +803,23 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
   DispatchFnGlobal =
       M->getOrInsertGlobal("__os_arm64x_dispatch_call", DispatchFnPtrType);
 
+  // Mangle names of function aliases and add the alias name to
+  // arm64ec_unmangled_name metadata to ensure a weak anti-dependency symbol is
+  // emitted for the alias as well. Do this early, before handling
+  // hybrid_patchable functions, to avoid mangling their aliases.
+  for (GlobalAlias &A : Mod.aliases()) {
+    auto F = dyn_cast_or_null<Function>(A.getAliaseeObject());
+    if (!F)
+      continue;
+    if (std::optional<std::string> MangledName =
+            getArm64ECMangledFunctionName(A.getName().str())) {
+      F->addMetadata("arm64ec_unmangled_name",
+                     *MDNode::get(M->getContext(),
+                                  MDString::get(M->getContext(), A.getName())));
+      A.setName(MangledName.value());
+    }
+  }
+
   DenseMap<GlobalAlias *, GlobalAlias *> FnsMap;
   SetVector<GlobalAlias *> PatchableFns;
 
@@ -837,20 +854,24 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
       // emitGlobalAlias to emit the right alias.
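       // Two aliases end up referring to the implementation: `A` keeps the
       // original unmangled name, while `AM` carries the ARM64EC-mangled
       // name. Uses coming from other GlobalAliases are rebound to the
       // mangled alias first, so alias-of-alias chains resolve through it;
       // every remaining use then goes through the unmangled alias.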
auto *A = GlobalAlias::create(GlobalValue::LinkOnceODRLinkage, OrigName, &F); + auto *AM = GlobalAlias::create(GlobalValue::LinkOnceODRLinkage, + MangledName.value(), &F); + F.replaceUsesWithIf(AM, + [](Use &U) { return isa(U.getUser()); }); F.replaceAllUsesWith(A); F.setMetadata("arm64ec_exp_name", MDNode::get(M->getContext(), MDString::get(M->getContext(), "EXP+" + MangledName.value()))); A->setAliasee(&F); + AM->setAliasee(&F); if (F.hasDLLExportStorageClass()) { A->setDLLStorageClass(GlobalValue::DLLExportStorageClass); F.setDLLStorageClass(GlobalValue::DefaultStorageClass); } - FnsMap[A] = GlobalAlias::create(GlobalValue::LinkOnceODRLinkage, - MangledName.value(), &F); + FnsMap[A] = AM; PatchableFns.insert(A); } } @@ -928,9 +949,9 @@ bool AArch64Arm64ECCallLowering::processFunction( if (!F.hasLocalLinkage() || F.hasAddressTaken()) { if (std::optional MangledName = getArm64ECMangledFunctionName(F.getName().str())) { - F.setMetadata("arm64ec_unmangled_name", - MDNode::get(M->getContext(), - MDString::get(M->getContext(), F.getName()))); + F.addMetadata("arm64ec_unmangled_name", + *MDNode::get(M->getContext(), + MDString::get(M->getContext(), F.getName()))); if (F.hasComdat() && F.getComdat()->getName() == F.getName()) { Comdat *MangledComdat = M->getOrInsertComdat(MangledName.value()); SmallVector ComdatUsers = diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index d29a72a4f6884..db0652bc5949c 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -45,6 +45,7 @@ #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -1387,22 +1388,21 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() { return Sym; }; - if (MCSymbol *UnmangledSym = - getSymbolFromMetadata("arm64ec_unmangled_name")) { - MCSymbol *ECMangledSym = getSymbolFromMetadata("arm64ec_ecmangled_name"); - - if (ECMangledSym) { - // An external function, emit the alias from the unmangled symbol to - // mangled symbol name and the alias from the mangled symbol to guest - // exit thunk. + SmallVector UnmangledNames; + MF->getFunction().getMetadata("arm64ec_unmangled_name", UnmangledNames); + for (MDNode *Node : UnmangledNames) { + StringRef NameStr = cast(Node->getOperand(0))->getString(); + MCSymbol *UnmangledSym = MMI->getContext().getOrCreateSymbol(NameStr); + if (std::optional MangledName = + getArm64ECMangledFunctionName(UnmangledSym->getName())) { + MCSymbol *ECMangledSym = + MMI->getContext().getOrCreateSymbol(*MangledName); emitFunctionAlias(UnmangledSym, ECMangledSym); - emitFunctionAlias(ECMangledSym, CurrentFnSym); - } else { - // A function implementation, emit the alias from the unmangled symbol - // to mangled symbol name. 
- emitFunctionAlias(UnmangledSym, CurrentFnSym); } } + if (MCSymbol *ECMangledSym = + getSymbolFromMetadata("arm64ec_ecmangled_name")) + emitFunctionAlias(ECMangledSym, CurrentFnSym); } } diff --git a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll new file mode 100644 index 0000000000000..03cc873136940 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple arm64ec-windows-msvc -filetype asm -o - %s | FileCheck %s + +define void @func() { + ret void +} + +define dso_local void @patchable_func() hybrid_patchable { + ret void +} + +@func_alias = alias void (), ptr @func +@func_alias2 = alias void (), ptr @func_alias +@patchable_alias = alias void (), ptr @patchable_func + +; CHECK: .weak_anti_dep func_alias +; CHECK-NEXT: .set func_alias, "#func_alias" +; CHECK-NEXT: .weak_anti_dep func_alias2 +; CHECK-NEXT: .set func_alias2, "#func_alias2" +; CHECK-NEXT: .weak_anti_dep func +; CHECK-NEXT: .set func, "#func" +; CHECK: .weak_anti_dep patchable_alias +; CHECK-NEXT: .set patchable_alias, "#patchable_alias" + +; CHECK: .globl "#func_alias" +; CHECK-NEXT: .def "#func_alias"; +; CHECK-NEXT: .scl 2; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .set "#func_alias", "#func" +; CHECK-NEXT: .globl "#func_alias2" +; CHECK-NEXT: .def "#func_alias2"; +; CHECK-NEXT: .scl 2; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .set "#func_alias2", "#func_alias" + +; CHECK: .globl "#patchable_alias" +; CHECK-NEXT: .def "#patchable_alias"; +; CHECK-NEXT: .scl 2; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .set "#patchable_alias", "#patchable_func" diff --git a/llvm/test/CodeGen/AArch64/dllexport.ll b/llvm/test/CodeGen/AArch64/dllexport.ll index 580fb5fd9e79e..e15fc0a928b66 100644 --- a/llvm/test/CodeGen/AArch64/dllexport.ll +++ b/llvm/test/CodeGen/AArch64/dllexport.ll @@ -88,10 +88,10 @@ define weak_odr dllexport void @l() { ; CHECK-GNU-EC: .ascii " -export:o,data" ; CHECK-GNU-EC: .ascii " -export:p,data" ; CHECK-GNU-EC: .ascii " -export:q,data" -; CHECK-GNU-EC: .ascii " -export:r" -; CHECK-GNU-EC: .ascii " -export:s" -; CHECK-GNU-EC: .ascii " -export:t" -; CHECK-GNU-EC: .ascii " -export:u" +; CHECK-GNU-EC: .ascii " -export:#r,EXPORTAS,r" +; CHECK-GNU-EC: .ascii " -export:#s,EXPORTAS,s" +; CHECK-GNU-EC: .ascii " -export:#t,EXPORTAS,t" +; CHECK-GNU-EC: .ascii " -export:#u,EXPORTAS,u" ; CHECK-MSVC-EC-NOT: /EXPORT:f ; CHECK-MSVC-EC-NOT: /EXPORT:#f,EXPORTAS,f ; CHECK-MSVC-EC: .ascii " /EXPORT:#g,EXPORTAS,g" @@ -106,7 +106,7 @@ define weak_odr dllexport void @l() { ; CHECK-MSVC-EC: .ascii " /EXPORT:o,DATA" ; CHECK-MSVC-EC: .ascii " /EXPORT:p,DATA" ; CHECK-MSVC-EC: .ascii " /EXPORT:q,DATA" -; CHECK-MSVC-EC: .ascii " /EXPORT:r" -; CHECK-MSVC-EC: .ascii " /EXPORT:s" -; CHECK-MSVC-EC: .ascii " /EXPORT:t" -; CHECK-MSVC-EC: .ascii " /EXPORT:u" +; CHECK-MSVC-EC: .ascii " /EXPORT:#r,EXPORTAS,r" +; CHECK-MSVC-EC: .ascii " /EXPORT:#s,EXPORTAS,s" +; CHECK-MSVC-EC: .ascii " /EXPORT:#t,EXPORTAS,t" +; CHECK-MSVC-EC: .ascii " /EXPORT:#u,EXPORTAS,u" From f82283a84ed897f06a1aaac028accbad0d5057c7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 31 Mar 2025 18:06:37 +0700 Subject: [PATCH 0081/1029] llvm-reduce: Use 80 dashes for section separator in status printing (#133686) --- llvm/tools/llvm-reduce/deltas/Delta.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-reduce/deltas/Delta.cpp b/llvm/tools/llvm-reduce/deltas/Delta.cpp index 
5b9f0330f9c7e..d4106b0243aea 100644
--- a/llvm/tools/llvm-reduce/deltas/Delta.cpp
+++ b/llvm/tools/llvm-reduce/deltas/Delta.cpp
@@ -63,6 +63,10 @@ static cl::opt<unsigned> NumJobs(
 unsigned NumJobs = 1;
 #endif
 
+static StringLiteral SeparatorLine =
+    "--------------------------------------------------------------------------"
+    "------\n";
+
 /// Splits Chunks in half and prints them.
 /// If unable to split (when chunk size is 1) returns false.
 static bool increaseGranularity(std::vector<Chunk> &Chunks) {
@@ -223,7 +227,7 @@ void llvm::runDeltaPass(TestRunner &Test, const DeltaPass &Pass) {
   if (!Targets) {
     if (Verbose)
       errs() << "\nNothing to reduce\n";
-    errs() << "----------------------------\n";
+    errs() << SeparatorLine;
     return;
   }
 
@@ -359,5 +363,5 @@ void llvm::runDeltaPass(TestRunner &Test, const DeltaPass &Pass) {
   }
   if (Verbose)
     errs() << "Couldn't increase anymore.\n";
-  errs() << "----------------------------\n";
+  errs() << SeparatorLine;
 }

From 78777a204ad9a3f17f04f90040f88855f47aa50f Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 31 Mar 2025 07:28:44 -0400
Subject: [PATCH 0082/1029] [LV]Split store-load forward distance analysis from
 other checks, NFC (#121156)

The patch splits the store-load forwarding distance analysis from other
dependency analysis in LAA. Currently it supports only power-of-2
distances, required to support non-power-of-2 distances in the future.

Part of #100755
---
 .../llvm/Analysis/LoopAccessAnalysis.h        | 23 ++++++++-
 .../Vectorize/LoopVectorizationLegality.h     | 14 +++++-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 47 ++++++++++---------
 .../Transforms/Vectorize/LoopVectorize.cpp    | 15 ++++--
 .../safe-with-dep-distance.ll                 |  2 +-
 .../stride-access-dependence.ll               |  2 +-
 6 files changed, 72 insertions(+), 31 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cb6f47e3a76be..f715e0ec8dbb4 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -216,6 +216,21 @@ class MemoryDepChecker {
     return MaxSafeVectorWidthInBits;
   }
 
+  /// Return true if there are no store-load forwarding dependencies.
+  bool isSafeForAnyStoreLoadForwardDistances() const {
+    return MaxStoreLoadForwardSafeDistanceInBits ==
+           std::numeric_limits<uint64_t>::max();
+  }
+
+  /// Return the safe power-of-2 number of elements, which do not prevent
+  /// store-load forwarding, multiplied by the size of the elements in bits.
+  uint64_t getStoreLoadForwardSafeDistanceInBits() const {
+    assert(!isSafeForAnyStoreLoadForwardDistances() &&
+           "Expected the distance, that prevents store-load forwarding, to be "
+           "set.");
+    return MaxStoreLoadForwardSafeDistanceInBits;
+  }
+
   /// In some cases when the dependency check fails we can still
   /// vectorize the loop with a dynamic array access check.
   bool shouldRetryWithRuntimeCheck() const {
@@ -304,6 +319,11 @@ class MemoryDepChecker {
   /// restrictive.
   uint64_t MaxSafeVectorWidthInBits = -1U;
 
+  /// Maximum power-of-2 number of elements, which do not prevent store-load
+  /// forwarding, multiplied by the size of the elements in bits.
+  uint64_t MaxStoreLoadForwardSafeDistanceInBits =
+      std::numeric_limits<uint64_t>::max();
+
   /// If we see a non-constant dependence distance we can still try to
   /// vectorize this loop with runtime checks.
   bool FoundNonConstantDistanceDependence = false;
@@ -357,7 +377,8 @@ class MemoryDepChecker {
   ///
   /// \return false if we shouldn't vectorize at all or avoid larger
   /// vectorization factors by limiting MinDepDistBytes.
-  bool couldPreventStoreLoadForward(uint64_t Distance, uint64_t TypeByteSize);
+  bool couldPreventStoreLoadForward(uint64_t Distance, uint64_t TypeByteSize,
+                                    unsigned CommonStride = 0);
 
   /// Updates the current safety status with \p S. We can go from Safe to
   /// either PossiblySafeWithRtChecks or Unsafe and from
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index c3a04f9373dbe..d654ac3ec9273 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -382,7 +382,8 @@ class LoopVectorizationLegality {
   const LoopAccessInfo *getLAI() const { return LAI; }
 
   bool isSafeForAnyVectorWidth() const {
-    return LAI->getDepChecker().isSafeForAnyVectorWidth();
+    return LAI->getDepChecker().isSafeForAnyVectorWidth() &&
+           LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
   }
 
   uint64_t getMaxSafeVectorWidthInBits() const {
@@ -406,6 +407,17 @@ class LoopVectorizationLegality {
     return hasUncountableEarlyExit() ? getUncountableEdge()->second : nullptr;
   }
 
+  /// Return true if there are no store-load forwarding dependencies.
+  bool isSafeForAnyStoreLoadForwardDistances() const {
+    return LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
+  }
+
+  /// Return the safe power-of-2 number of elements, which do not prevent
+  /// store-load forwarding and are safe to operate on simultaneously.
+  uint64_t getMaxStoreLoadForwardSafeDistanceInBits() const {
+    return LAI->getDepChecker().getStoreLoadForwardSafeDistanceInBits();
+  }
+
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 7f1b5dc3890a9..dd7b796fd0fdf 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1740,7 +1740,8 @@ bool MemoryDepChecker::Dependence::isForward() const {
 }
 
 bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
-                                                    uint64_t TypeByteSize) {
+                                                    uint64_t TypeByteSize,
+                                                    unsigned CommonStride) {
   // If loads occur at a distance that is not a multiple of a feasible vector
   // factor store-load forwarding does not take place.
   // Positive dependences might cause troubles because vectorizing them might
   // prevent store-load forwarding making vectorized code run a lot slower.
@@ -1755,31 +1756,38 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
   // cause any slowdowns.
   const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
   // Maximum vector factor.
-  uint64_t MaxVFWithoutSLForwardIssues = std::min(
-      VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
+  uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
+      std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+               MaxStoreLoadForwardSafeDistanceInBits);
 
   // Compute the smallest VF at which the store and load would be misaligned.
-  for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
-       VF *= 2) {
+  for (uint64_t VF = 2 * TypeByteSize;
+       VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
     // If the number of vector iterations between the store and the load is
    // small we could incur conflicts.
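    // For example, with TypeByteSize == 4 and Distance == 24 the check below
    // accepts VF == 8 (24 is a multiple of 8) but trips on VF == 16: 24 is
    // not a multiple of 16 and only one vector iteration (24 / 16) separates
    // the store from the load, so the maximum safe power-of-2 width gets
    // clamped to 8 bytes, i.e. two elements.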
if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) { - MaxVFWithoutSLForwardIssues = (VF >> 1); + MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1); break; } } - if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) { + if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) { LLVM_DEBUG( dbgs() << "LAA: Distance " << Distance << " that could cause a store-load forwarding conflict\n"); return true; } - if (MaxVFWithoutSLForwardIssues < MinDepDistBytes && - MaxVFWithoutSLForwardIssues != - VectorizerParams::MaxVectorWidth * TypeByteSize) - MinDepDistBytes = MaxVFWithoutSLForwardIssues; + if (CommonStride && + MaxVFWithoutSLForwardIssuesPowerOf2 < + MaxStoreLoadForwardSafeDistanceInBits && + MaxVFWithoutSLForwardIssuesPowerOf2 != + VectorizerParams::MaxVectorWidth * TypeByteSize) { + uint64_t MaxVF = MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride; + uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; + MaxStoreLoadForwardSafeDistanceInBits = + std::min(MaxStoreLoadForwardSafeDistanceInBits, MaxVFInBits); + } return false; } @@ -2227,20 +2235,10 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, std::min(static_cast(MinDistance), MinDepDistBytes); bool IsTrueDataDependence = (!AIsWrite && BIsWrite); - uint64_t MinDepDistBytesOld = MinDepDistBytes; if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist && - couldPreventStoreLoadForward(MinDistance, TypeByteSize)) { - // Sanity check that we didn't update MinDepDistBytes when calling - // couldPreventStoreLoadForward - assert(MinDepDistBytes == MinDepDistBytesOld && - "An update to MinDepDistBytes requires an update to " - "MaxSafeVectorWidthInBits"); - (void)MinDepDistBytesOld; + couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride)) return Dependence::BackwardVectorizableButPreventsForwarding; - } - // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits - // since there is a backwards dependency. uint64_t MaxVF = MinDepDistBytes / *CommonStride; LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance << " with max VF = " << MaxVF << '\n'); @@ -3005,6 +3003,11 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { if (!DC.isSafeForAnyVectorWidth()) OS << " with a maximum safe vector width of " << DC.getMaxSafeVectorWidthInBits() << " bits"; + if (!DC.isSafeForAnyStoreLoadForwardDistances()) { + uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits(); + OS << ", with a maximum safe store-load forward width of " << SLDist + << " bits"; + } if (PtrRtChecking->Need) OS << " with run-time checks"; OS << "\n"; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index dd392056a07ee..c3520dc95f8b4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3815,13 +3815,18 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from // the memory accesses that is most restrictive (involved in the smallest // dependence distance). 
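   // For instance, with i32 accesses and a maximum safe VF of 8 elements
   // this yields 8 * 4 * 8 == 256 bits.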
- unsigned MaxSafeElements = - llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + unsigned MaxSafeElementsPowerOf2 = + bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType); + if (!Legal->isSafeForAnyStoreLoadForwardDistances()) { + unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits(); + MaxSafeElementsPowerOf2 = + std::min(MaxSafeElementsPowerOf2, SLDist / WidestType); + } + auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2); + auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2); - auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements); - auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements); if (!Legal->isSafeForAnyVectorWidth()) - this->MaxSafeElements = MaxSafeElements; + this->MaxSafeElements = MaxSafeElementsPowerOf2; LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF << ".\n"); diff --git a/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll b/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll index efa3100464759..8e249b36f6445 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/safe-with-dep-distance.ll @@ -4,7 +4,7 @@ ; for (i = 0; i < n; i++) ; A[i + 4] = A[i] * 2; -; CHECK: Memory dependences are safe with a maximum safe vector width of 64 bits +; CHECK: Memory dependences are safe with a maximum safe vector width of 64 bits, with a maximum safe store-load forward width of 64 bits target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" diff --git a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll index ef19e173b6599..335ad67faee04 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/stride-access-dependence.ll @@ -276,7 +276,7 @@ for.body: ; preds = %entry, %for.body define void @vectorizable_Read_Write(ptr nocapture %A) { ; CHECK-LABEL: 'vectorizable_Read_Write' ; CHECK-NEXT: for.body: -; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 64 bits, with a maximum safe store-load forward width of 64 bits ; CHECK-NEXT: Dependences: ; CHECK-NEXT: BackwardVectorizable: ; CHECK-NEXT: %0 = load i32, ptr %arrayidx, align 4 -> From f4d25c498a20d73b9d3e4828023486a7b2591f38 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Mon, 31 Mar 2025 12:40:41 +0100 Subject: [PATCH 0083/1029] [LV][NFC] Regenerate some SVE tests using --filter-out-after option (#132174) I recently added a new option to update_test_checks.py that can filter out all CHECK lines after a certain point. We usually don't care about checking for the original scalar loop after the vector loop because it doesn't change. Cutting out unnecessary CHECK lines makes the files smaller and hopefully the tests run quicker. 
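For reference, a typical regeneration run looks like this (an illustrative
invocation; the script records its arguments in each test's "UTC_ARGS:"
comment, visible in the NOTE lines of the updated files below):

  llvm/utils/update_test_checks.py --filter-out-after "^scalar.ph:" \
      llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll

CHECK lines that would be generated at or after the first output line
matching the regular expression (here, the start of the scalar preheader
block) are simply omitted.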
--- .../AArch64/sve-cond-inv-loads.ll | 64 +---- .../AArch64/sve-epilog-vect-reductions.ll | 13 +- .../sve-epilog-vect-strict-reductions.ll | 13 +- .../LoopVectorize/AArch64/sve-epilog-vect.ll | 52 +--- .../LoopVectorize/AArch64/sve-fneg.ll | 18 +- .../AArch64/sve-gather-scatter.ll | 82 +----- .../LoopVectorize/AArch64/sve-inductions.ll | 21 +- .../AArch64/sve-interleaved-accesses.ll | 239 +----------------- .../sve-interleaved-masked-accesses.ll | 167 +----------- .../LoopVectorize/AArch64/sve-inv-store.ll | 31 +-- .../LoopVectorize/AArch64/sve-multiexit.ll | 36 +-- .../sve-runtime-check-size-based-threshold.ll | 20 +- .../AArch64/sve-tail-folding-forced.ll | 13 +- .../AArch64/sve-tail-folding-optsize.ll | 18 +- .../sve-tail-folding-overflow-checks.ll | 16 +- .../AArch64/sve-tail-folding-unroll.ll | 31 +-- .../LoopVectorize/AArch64/sve-tail-folding.ll | 150 +---------- .../AArch64/sve-vector-reverse.ll | 34 +-- .../AArch64/sve2-histcnt-epilogue.ll | 15 +- .../AArch64/sve2-histcnt-too-many-deps.ll | 25 +- .../LoopVectorize/AArch64/sve2-histcnt.ll | 176 +------------ 21 files changed, 21 insertions(+), 1213 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index ab5600a2dc3a6..67f50832603b7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -S %s -o - | FileCheck %s @@ -34,26 +34,6 @@ define void @cond_inv_load_i32i32i16(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[I_07]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP10]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[INV]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP11]] to i32 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_07]] -; CHECK-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -112,25 +92,6 @@ define void @cond_inv_load_f64f64f64(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[COND]], i64 [[I_08]] -; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt double [[TMP9]], 4.000000e-01 -; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; CHECK: if.then: -; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[INV]], align 8 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] -; CHECK-NEXT: store double [[TMP10]], ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -192,29 +153,6 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture readonl ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP12]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 168 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll index d59814a04d8df..e5633462973ad 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^for.body:" ; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve 
-epilogue-vectorization-force-VF=2 -S | FileCheck %s ; @@ -73,17 +73,6 @@ define i64 @int_reduction_add(ptr %a, i64 %N) { ; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 5, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ADD]] = add i64 [[TMP29]], [[SUM]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ], [ [[TMP28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[ADD_LCSSA]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll index 808a2da55b8d8..7e49c69266d8c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^for.body:" ; RUN: opt < %s -passes=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -epilogue-vectorization-force-VF=2 -S | FileCheck %s ; @@ -68,17 +68,6 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) { ; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi float [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0xFFFFFFFFE0000000, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD]] = fadd float [[TMP26]], [[SUM_07]] -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret float [[ADD_LCSSA]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll index ef830170ed609..5d0e6f72309cf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -1,4 +1,4 @@ -; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^for.body:" --filter-out-after "^loop:" ; REQUIRES: asserts ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -epilogue-vectorization-minimum-VF=0 -prefer-predicate-over-epilogue=scalar-epilogue \ ; RUN: -debug-only=loop-vectorize -force-target-instruction-cost=1 -S 2>%t | FileCheck %s --check-prefix=CHECK @@ -83,14 +83,6 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; ; CHECK-VF8-LABEL: @main_vf_vscale_x_16( ; CHECK-VF8-NEXT: iter.check: @@ -144,14 +136,6 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF8: for.body: -; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-VF8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] -; CHECK-VF8-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 -; CHECK-VF8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF8-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024 -; CHECK-VF8-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK-VF8: exit: -; CHECK-VF8-NEXT: ret void ; entry: br label %for.body @@ -236,14 +220,6 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i64 1, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; ; CHECK-VF8-LABEL: @main_vf_vscale_x_2( ; CHECK-VF8-NEXT: iter.check: @@ -297,14 +273,6 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-VF8-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF8: for.body: -; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] 
-; CHECK-VF8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] -; CHECK-VF8-NEXT: store i64 1, ptr [[ARRAYIDX]], align 1 -; CHECK-VF8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF8-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024 -; CHECK-VF8-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK-VF8: exit: -; CHECK-VF8-NEXT: ret void ; entry: br label %for.body @@ -391,15 +359,6 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 1 -; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 10000 -; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; ; CHECK-VF8-LABEL: @test_pr57912_pointer_induction( ; CHECK-VF8-NEXT: iter.check: @@ -456,15 +415,6 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END1]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK]] ] ; CHECK-VF8-NEXT: br label [[LOOP:%.*]] ; CHECK-VF8: loop: -; CHECK-VF8-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-VF8-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-VF8-NEXT: store i8 0, ptr [[PTR_IV]], align 1 -; CHECK-VF8-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1 -; CHECK-VF8-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-VF8-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 10000 -; CHECK-VF8-NEXT: br i1 [[CMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK-VF8: exit: -; CHECK-VF8-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll index ef3d5b59632dc..cfb96b4f5a61f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize,dce -prefer-predicate-over-epilogue=scalar-epilogue \ ; RUN: -enable-epilogue-vectorization=false < %s -S | FileCheck %s @@ -59,22 +59,6 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[FNEG:%.*]] = fneg half [[TMP24]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store half [[FNEG]], ptr [[ARRAYIDX2]], align 2 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: %cmp6 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index d328124e6c783..663cf4173cc91 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue -S %s -force-target-instruction-cost=1 -o - | FileCheck %s @@ -31,21 +31,6 @@ define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[TMP10]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -99,22 +84,6 @@ define void @scatter_nxv4i32_ind32(ptr noalias nocapture %a, ptr noalias nocaptu ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IDXPROM4]] -; CHECK-NEXT: store float [[TMP10]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -165,23 +134,6 @@ define void @scatter_inv_nxv4i32(ptr noalias nocapture %inv, ptr noalias nocaptu ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: store i32 3, ptr [[INV]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -236,24 +188,6 @@ define void @gather_inv_nxv4i32(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[TMP8]], 3 -; CHECK-NEXT: br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; CHECK: if.then: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[INV]], align 4 -; CHECK-NEXT: store i32 [[TMP9]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -323,20 +257,6 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX_IDX:%.*]] = shl i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[ARRAYIDX_IDX]] -; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store float [[TMP16]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll index 0cd3870914283..2f90b5a332bdc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize,dce,instcombine -force-target-instruction-cost=1 \ ; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s -S | FileCheck %s @@ -45,25 +45,6 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[AND:%.*]] = and i64 [[I_08]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[AND]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_08]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_08]] -; CHECK-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index c890cb71d34be..8bbda981895ac 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -mattr=+sve -scalable-vectorization=on -runtime-memory-check-threshold=24 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -52,11 +52,6 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -139,11 +134,6 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -230,11 +220,6 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -308,12 +293,6 @@ define i32 @test_struct_load6(ptr %S) #1 { ; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[SUB14_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[SUB14_LCSSA]] ; entry: br label %for.body @@ -418,11 +397,6 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP11:![0-9]+]] ; entry: br label %for.body @@ -488,21 +462,6 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ poison, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: 
for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[LOAD]], 1 -; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[LSHR]] -; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; entry: br label %for.body @@ -574,21 +533,6 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[LOAD]], 1 -; CHECK-NEXT: [[LSHR:%.*]] = lshr exact i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[LSHR]] -; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP15:![0-9]+]] ; entry: br label %for.body @@ -660,11 +604,6 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -726,11 +665,6 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP19:![0-9]+]] ; entry: br label %for.body @@ -815,15 +749,6 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: store i32 
[[ADD_LCSSA]], ptr @SA, align 4 -; CHECK-NEXT: store float [[ADD3_LCSSA]], ptr @SB, align 4 -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; entry: br label %for.body @@ -908,19 +833,6 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 -; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4 -; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_Y]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP23:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -996,22 +908,6 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 { ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP17]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP21:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 -; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[P_I_X]], align 4 -; CHECK-NEXT: store i32 [[TMP20]], ptr [[P_I_Y]], align 4 -; CHECK-NEXT: [[TMP21]] = add nsw i32 [[TMP20]], [[S]] -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP25:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret i32 [[TMP21]] ; entry: br label %for.body @@ -1086,21 +982,6 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 -; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8 -; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4 -; CHECK-NEXT: store i32 [[TMP17]], ptr [[P_I_Y]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 
[[I]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP27:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -1178,25 +1059,6 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP18]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[S:%.*]] = phi i32 [ [[TMP23:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I_PLUS_1:%.*]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 -; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 -; CHECK-NEXT: [[P_I_PLUS_1_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I_PLUS_1]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[P_I_X]], align 4 -; CHECK-NEXT: store i32 [[TMP21]], ptr [[P_I_PLUS_1_Y]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[P_I_Y]], align 4 -; CHECK-NEXT: [[TMP23]] = add nsw i32 [[TMP22]], [[S]] -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END:%.*]], !llvm.loop [[LOOP29:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret i32 [[TMP23]] ; entry: br label %for.body @@ -1279,20 +1141,6 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I_PLUS_1:%.*]] = or disjoint i64 [[I]], 1 -; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] -; CHECK-NEXT: [[A_I_PLUS_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_PLUS_1]] -; CHECK-NEXT: store i32 [[Y]], ptr [[A_I]], align 4 -; CHECK-NEXT: store i32 [[Z]], ptr [[A_I_PLUS_1]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP31:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -1375,23 +1223,6 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, 
ptr [[A]], i64 [[I]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]] -; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr i8, ptr [[TMP19]], i64 -4 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]] -; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr i8, ptr [[TMP20]], i64 -12 -; CHECK-NEXT: store i32 [[X]], ptr [[A_I_MINUS_1]], align 4 -; CHECK-NEXT: store i32 [[Y]], ptr [[A_I_MINUS_3]], align 4 -; CHECK-NEXT: store i32 [[Z]], ptr [[A_I]], align 4 -; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 2 -; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP33:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -1492,32 +1323,6 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[TMP33:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ], [ [[I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP33]] to i32 -; CHECK-NEXT: [[I1]] = add nuw nsw i64 [[I]], 1 -; CHECK-NEXT: [[IV1:%.*]] = or disjoint i64 [[IV]], 1 -; CHECK-NEXT: [[IV2]] = add nuw nsw i64 [[IV]], 2 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV1]] -; CHECK-NEXT: [[LOAD1:%.*]] = load i16, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[LOAD1]] to i32 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[IV2]] -; CHECK-NEXT: [[LOAD2]] = load i16, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[LOAD2]] to i32 -; CHECK-NEXT: [[MUL01:%.*]] = mul nsw i32 [[CONV]], [[CONV1]] -; CHECK-NEXT: [[MUL012:%.*]] = mul nsw i32 [[MUL01]], [[CONV2]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I]] -; CHECK-NEXT: store i32 [[MUL012]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[LOOP]], !llvm.loop [[LOOP40:![0-9]+]] -; CHECK: end: -; CHECK-NEXT: ret void ; entry: %.pre = load i16, ptr %a @@ -1613,43 +1418,6 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 
[[INDVARS_IV]] -; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 -; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 -; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 -; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] -; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 -; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 -; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 -; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 -; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 -; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 -; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 -; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 -; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 -; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 -; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 -; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] -; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 -; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -1768,11 +1536,6 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index f2e0c9be2defe..9d6b691f3ed31 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 2 ; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+sve -force-vector-width=16 -force-vector-interleave=1 
-scalable-vectorization=on -prefer-predicate-over-epilogue=scalar-epilogue %s 2>&1 | FileCheck %s -check-prefix=SCALAR_TAIL_FOLDING ; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+sve -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on -prefer-predicate-over-epilogue=predicate-dont-vectorize %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_TAIL_FOLDING @@ -70,36 +70,6 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: scalar.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; SCALAR_TAIL_FOLDING: for.body: -; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: if.then: -; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP18]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP20]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP19]], i8 [[TMP21]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP22]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I]], ptr [[ARRAYIDX6]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP23]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB]], ptr [[ARRAYIDX11]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: for.inc: -; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; SCALAR_TAIL_FOLDING: for.end: -; SCALAR_TAIL_FOLDING-NEXT: ret void ; ; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided1 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { @@ -148,15 +118,6 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: -; 
PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; PREDICATED_TAIL_FOLDING: for.body: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] -; PREDICATED_TAIL_FOLDING: if.then: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; PREDICATED_TAIL_FOLDING: for.inc: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; PREDICATED_TAIL_FOLDING: for.end: -; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard to i32 @@ -247,28 +208,6 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: scalar.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; SCALAR_TAIL_FOLDING: for.body: -; SCALAR_TAIL_FOLDING-NEXT: [[IX_012:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_012]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP15]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_012]], [[CONV]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: if.then: -; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 2, ptr [[ARRAYIDX3]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: for.inc: -; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_012]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; SCALAR_TAIL_FOLDING: for.end: -; SCALAR_TAIL_FOLDING-NEXT: ret void ; ; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided2 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readnone captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -310,15 +249,6 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; PREDICATED_TAIL_FOLDING: for.body: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] -; PREDICATED_TAIL_FOLDING: if.then: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; PREDICATED_TAIL_FOLDING: for.inc: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; PREDICATED_TAIL_FOLDING: for.end: -; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard to i32 @@ -408,33 +338,6 @@ define dso_local void 
@masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: scalar.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; SCALAR_TAIL_FOLDING: for.body: -; SCALAR_TAIL_FOLDING-NEXT: [[IX_018:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_018]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_018]], [[CONV]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; SCALAR_TAIL_FOLDING: if.then: -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: br label [[IF_END]] -; SCALAR_TAIL_FOLDING: if.end: -; SCALAR_TAIL_FOLDING-NEXT: [[CMP4:%.*]] = icmp samesign ugt i32 [[IX_018]], [[CONV3]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP4]], label [[IF_THEN6:%.*]], label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: if.then6: -; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP17]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 2, ptr [[ARRAYIDX7]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: for.inc: -; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_018]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; SCALAR_TAIL_FOLDING: for.end: -; SCALAR_TAIL_FOLDING-NEXT: ret void ; ; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided3 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readnone captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD1:%.*]], i8 zeroext [[GUARD2:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -481,19 +384,6 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; PREDICATED_TAIL_FOLDING: for.body: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[IF_END:%.*]] -; PREDICATED_TAIL_FOLDING: if.then: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[IF_END]] -; PREDICATED_TAIL_FOLDING: if.end: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN6:%.*]], label [[FOR_INC:%.*]] -; PREDICATED_TAIL_FOLDING: if.then6: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; PREDICATED_TAIL_FOLDING: for.inc: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; PREDICATED_TAIL_FOLDING: for.end: -; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard1 to i32 @@ -613,52 +503,6 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; 
SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: scalar.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; SCALAR_TAIL_FOLDING: for.body: -; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: if.then: -; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 -; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 -; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) -; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] -; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) -; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] -; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; SCALAR_TAIL_FOLDING: for.inc: 
-; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; SCALAR_TAIL_FOLDING: for.end: -; SCALAR_TAIL_FOLDING-NEXT: ret void ; ; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -721,15 +565,6 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] -; PREDICATED_TAIL_FOLDING: for.body: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] -; PREDICATED_TAIL_FOLDING: if.then: -; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] -; PREDICATED_TAIL_FOLDING: for.inc: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] -; PREDICATED_TAIL_FOLDING: for.end: -; PREDICATED_TAIL_FOLDING-NEXT: ret void ; entry: %conv = zext i8 %guard to i32 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll index 4136a9f9e7938..8b009f1c91373 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inv-store.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize -S -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -35,18 +35,6 @@ define void @inv_store_i16(ptr noalias %dst, ptr noalias readonly %src, i64 %N) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY14:%.*]] -; CHECK: for.body14: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY14]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[LD:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: store i16 [[LD]], ptr [[DST]], align 2 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_INC24]], label [[FOR_BODY14]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.inc24: -; CHECK-NEXT: ret void ; entry: br label %for.body14 @@ -96,23 +84,6 @@ define void @cond_inv_store_i32(ptr noalias %dst, ptr noalias readonly %src, i64 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_09:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[I_09]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 0 -; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] -; CHECK: if.then: -; CHECK-NEXT: store i32 [[TMP11]], ptr [[DST]], align 4 -; CHECK-NEXT: br label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_09]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll index 846e9e6e82da5..bc4533f3011cb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize < %s -S -o - | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -58,22 +58,6 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ] -; CHECK-NEXT: [[COND_0:%.*]] = icmp eq i32 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[COND_0]], label [[EXIT:%.*]], label [[FOR_BODY]] -; CHECK: for.body: -; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] -; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[A_GEP]], align 4 -; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]] -; CHECK-NEXT: store i32 [[LV]], ptr [[B_GEP]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 -; CHECK-NEXT: [[COND_1:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[COND_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; entry: br label %loop.header @@ -148,24 +132,6 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] -; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ] -; CHECK-NEXT: [[COND_0:%.*]] = icmp eq i32 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[COND_0]], label [[EXIT_0:%.*]], label [[FOR_BODY]] -; CHECK: for.body: -; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]] -; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[A_GEP]], align 4 -; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr inbounds i32, ptr 
[[B]], i32 [[IV]] -; CHECK-NEXT: store i32 [[LV]], ptr [[B_GEP]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 1 -; CHECK-NEXT: [[COND_1:%.*]] = icmp ult i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[COND_1]], label [[LOOP_HEADER]], label [[EXIT_1:%.*]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: exit.0: -; CHECK-NEXT: ret i32 1 -; CHECK: exit.1: -; CHECK-NEXT: ret i32 2 ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll index 6affb8ca8e7bd..2749b47325cbe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize -mattr=+sve -prefer-predicate-over-epilogue=scalar-epilogue -S %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -89,24 +89,6 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i64, ptr [[SRC_1]], i64 [[IV]] -; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i64, ptr [[SRC_2]], i64 [[IV]] -; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[GEP_SRC_1]], align 8 -; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[GEP_SRC_2]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[L_1]], [[L_2]] -; CHECK-NEXT: [[GEP_DST_1:%.*]] = getelementptr i64, ptr [[DST_1]], i64 [[IV]] -; CHECK-NEXT: [[GEP_DST_2:%.*]] = getelementptr i64, ptr [[DST_2]], i64 [[IV]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_1]], align 8 -; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP_DST_2]], align 8 -; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: ret void ; entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index 95aad199ac765..25403599977cb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; REQUIRES: asserts ; RUN: opt -S -passes=loop-vectorize -debug-only=loop-vectorize < %s 2>%t | FileCheck %s ; RUN: cat %t | FileCheck %s --check-prefix=VPLANS @@ -77,17 +77,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll index 3f4caeca5d452..63bb485e7f085 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -38,22 +38,6 @@ define void @trip1025_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapt ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[I_06:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[I_06]] -; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i64 [[TMP16]], 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[I_06]] -; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP17]], [[MUL]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_06]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 1025 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index f61d473a12ddb..b39c47cc7906d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S < %s | FileCheck %s target triple = "aarch64" @@ -34,13 +34,6 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, 
label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void ; entry: %cmp6.not = icmp eq i32 %N, 0 @@ -98,13 +91,6 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void ; entry: %cmp6.not = icmp eq i64 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index f4c9d783a3329..ce761913ea0fc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -S -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-interleave=4 -force-vector-width=4 < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -80,17 +80,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -206,24 +195,6 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[COND_GEP:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[INDEX]] -; CHECK-NEXT: [[COND_I32:%.*]] = load i32, ptr [[COND_GEP]], align 4 -; CHECK-NEXT: [[COND_I1:%.*]] = icmp ne i32 [[COND_I32]], 0 -; CHECK-NEXT: br i1 [[COND_I1]], label [[DO_STORE:%.*]], label [[WHILE_END]] -; CHECK: do.store: -; CHECK-NEXT: 
[[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; CHECK-NEXT: br label [[WHILE_END]] -; CHECK: while.end: -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 314ae92c45240..aab4f33f87c0f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; RUN: opt -S -hints-allow-reordering=false -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -41,17 +41,6 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -99,17 +88,6 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -165,19 +143,6 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] 
], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -245,19 +210,6 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 4 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -316,21 +268,6 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[IND]], i64 [[INDEX]] -; CHECK-NEXT: [[IND_VAL:%.*]] = load i32, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[IND_VAL]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IND_VAL]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP3]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -391,18 +328,6 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) # ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[SRC]], align 4 -; CHECK-NEXT: [[ARRAYIDX:%.*]] 
= getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: @@ -470,26 +395,6 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP20]], 0 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]] -; CHECK: if.then: -; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SRC]], align 4 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: -; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP21]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: @@ -558,18 +463,6 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: store i32 [[VAL]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK: for.end: -; CHECK-NEXT: ret void ; entry: @@ -629,21 +522,6 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, ptr [[DST]], i64 
[[INDEX]] -; CHECK-NEXT: [[VAL1:%.*]] = load float, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[VAL2:%.*]] = load float, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[RES:%.*]] = fdiv float [[VAL1]], [[VAL2]] -; CHECK-NEXT: store float [[RES]], ptr [[GEP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -707,21 +585,6 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[RES:%.*]] = udiv i32 [[VAL1]], [[VAL2]] -; CHECK-NEXT: store i32 [[RES]], ptr [[GEP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body @@ -770,17 +633,6 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP23:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void ; entry: br label %while.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll index 9567123ab8ae3..592dc1c4efd47 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" ; This is the loop in c++ being vectorize in this file with ;vector.reverse ; #pragma clang loop vectorize_width(8, scalable) interleave_count(2) @@ -58,22 +58,6 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; 
CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I_08]] = add nsw i64 [[I_08_IN]], -1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] -; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP26]], 1.000000e+00 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] -; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX1]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] ; entry: %cmp7 = icmp sgt i64 %N, 0 @@ -153,22 +137,6 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I_09]] = add nsw i64 [[I_09_IN]], -1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[I_09]] -; CHECK-NEXT: [[TMP29:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[TMP29]], 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[I_09]] -; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; entry: %cmp8 = icmp sgt i64 %N, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll index 84fc963833cf2..54ba0a8c4d6bc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^for.body:" --version 3 ; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -epilogue-vectorization-minimum-VF=4 -debug-only=loop-vectorize -force-vector-interleave=1 -S 2>&1 | FileCheck %s ; REQUIRES: asserts @@ -69,19 +69,6 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = 
phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP13]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll index c430e72cea703..b292e43046731 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 3 ; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=1 -max-dependences=2 -debug-only=loop-vectorize,loop-accesses -S 2>&1 | FileCheck %s ; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -force-vector-interleave=1 -debug-only=loop-vectorize,loop-accesses -S 2>&1 | FileCheck %s --check-prefix=NORMAL_DEP_LIMIT ; REQUIRES: asserts @@ -99,29 +99,6 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; NORMAL_DEP_LIMIT-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; NORMAL_DEP_LIMIT: scalar.ph: -; NORMAL_DEP_LIMIT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; NORMAL_DEP_LIMIT-NEXT: br label [[FOR_BODY1:%.*]] -; NORMAL_DEP_LIMIT: for.body: -; NORMAL_DEP_LIMIT-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; NORMAL_DEP_LIMIT-NEXT: [[GEP_INDICES1:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] -; NORMAL_DEP_LIMIT-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES1]], align 4 -; NORMAL_DEP_LIMIT-NEXT: [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64 -; NORMAL_DEP_LIMIT-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; NORMAL_DEP_LIMIT-NEXT: [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4 -; NORMAL_DEP_LIMIT-NEXT: [[INC:%.*]] = add nsw i32 [[L_BUCKET]], 1 -; NORMAL_DEP_LIMIT-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4 -; NORMAL_DEP_LIMIT-NEXT: [[IDX_ADDR:%.*]] = getelementptr inbounds i32, ptr 
[[ARRAY]], i64 [[IV1]] -; NORMAL_DEP_LIMIT-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV1]] to i32 -; NORMAL_DEP_LIMIT-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4 -; NORMAL_DEP_LIMIT-NEXT: [[GEP_OTHER:%.*]] = getelementptr inbounds i32, ptr [[OTHER]], i64 [[IV1]] -; NORMAL_DEP_LIMIT-NEXT: [[L_OTHER:%.*]] = load i32, ptr [[GEP_OTHER]], align 4 -; NORMAL_DEP_LIMIT-NEXT: [[ADD_OTHER:%.*]] = add i32 [[L_OTHER]], [[IV_TRUNC]] -; NORMAL_DEP_LIMIT-NEXT: store i32 [[ADD_OTHER]], ptr [[GEP_OTHER]], align 4 -; NORMAL_DEP_LIMIT-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; NORMAL_DEP_LIMIT-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; NORMAL_DEP_LIMIT-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] -; NORMAL_DEP_LIMIT: for.exit: -; NORMAL_DEP_LIMIT-NEXT: ret void ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 3b00312959d8a..dd1f77582e0be 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 3 ; RUN: opt < %s -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s ; REQUIRES: asserts @@ -56,22 +56,6 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP13]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -122,22 +106,6 @@ define void @simple_histogram_inc_param(ptr noalias %buckets, ptr readonly %indi ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] 
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP13]], [[INCVAL]] -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -188,22 +156,6 @@ define void @simple_histogram_sub(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP13]], -1 -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -257,29 +209,6 @@ define void @conditional_histogram(ptr noalias %buckets, ptr readonly %indices, ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[NEXT:%.*]] ] -; CHECK-NEXT: [[CONDIDX:%.*]] = getelementptr inbounds i32, ptr [[CONDS]], i64 [[IV1]] -; CHECK-NEXT: [[CONDDATA:%.*]] = load i32, ptr [[CONDIDX]], align 4 -; CHECK-NEXT: [[IFCOND:%.*]] = icmp sgt i32 [[CONDDATA]], 5100 -; CHECK-NEXT: br i1 [[IFCOND]], label [[IFTRUE:%.*]], label [[NEXT]] -; CHECK: iftrue: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP15]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX3]], align 4 -; CHECK-NEXT: br label [[NEXT]] -; CHECK: next: -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; 
CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -339,22 +268,6 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i8 [[TMP1]], 1 -; CHECK-NEXT: store i8 [[INC]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -499,22 +412,6 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP13]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -569,22 +466,6 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 [[IV]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 
[[TMP9]] to i64 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, i64 [[IDXPROM5]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP10]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -639,22 +520,6 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY1:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] -; CHECK-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 -; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[L_IDX]] to i64 -; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds [[SOMESTRUCT]], ptr [[DATA_STRUCT]], i64 1, i32 0, i64 [[IDXPROM5]] -; CHECK-NEXT: [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[L_BUCKET]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4 -; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -705,11 +570,6 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: ret void ; entry: br label %for.body @@ -779,25 +639,6 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP17]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP18]], 1 -; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[IDX_ADDR:%.*]] = getelementptr inbounds i32, ptr 
[[ARRAY]], i64 [[IV]]
-; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.exit:
-; CHECK-NEXT: ret void
;
entry:
  br label %for.body
@@ -892,21 +733,6 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i64, ptr [[INDICES]], i64 [[IV]]
-; CHECK-NEXT: [[L_IDX:%.*]] = load i64, ptr [[GEP_INDICES]], align 4
-; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds i64, ptr [[BUCKETS]], i64 [[L_IDX]]
-; CHECK-NEXT: [[L_BUCKET:%.*]] = load i64, ptr [[GEP_BUCKET]], align 4
-; CHECK-NEXT: [[INC:%.*]] = add nsw i64 [[L_BUCKET]], 1
-; CHECK-NEXT: store i64 [[INC]], ptr [[GEP_BUCKET]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK: for.exit:
-; CHECK-NEXT: ret void
;
entry:
  br label %for.body

From aad9630e42d70b4cbfd6bc544576bd96844e737d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 31 Mar 2025 13:23:06 +0100
Subject: [PATCH 0084/1029] [X86] combineINSERT_SUBVECTOR - pull out common
 variables. NFC. (#133705)

Reduces diff for an updated version of #133083
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 72977923bac2b..76de7e888d985 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58823,6 +58823,8 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   uint64_t IdxVal = N->getConstantOperandVal(2);
   MVT SubVecVT = SubVec.getSimpleValueType();
+  int VecNumElts = OpVT.getVectorNumElements();
+  int SubVecNumElts = SubVecVT.getVectorNumElements();
   if (Vec.isUndef() && SubVec.isUndef())
     return DAG.getUNDEF(OpVT);
@@ -58882,10 +58884,9 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
       (IdxVal != 0 ||
        !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
+    SDValue ExtSrc = SubVec.getOperand(0);
     int ExtIdxVal = SubVec.getConstantOperandVal(1);
     if (ExtIdxVal != 0) {
-      int VecNumElts = OpVT.getVectorNumElements();
-      int SubVecNumElts = SubVecVT.getVectorNumElements();
       SmallVector<int, 64> Mask(VecNumElts);
       // First create an identity shuffle mask.
       for (int i = 0; i != VecNumElts; ++i)
@@ -58893,8 +58894,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
       // Now insert the extracted portion.
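      // (Shuffle-mask entries of VecNumElts and above select from the
      // shuffle's second operand, so the `+ VecNumElts` in the loop below
      // redirects the inserted lanes to ExtSrc, starting at element
      // ExtIdxVal.)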
for (int i = 0; i != SubVecNumElts; ++i)
         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
-
-      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+      return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask);
     }
   }
@@ -58942,7 +58942,7 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   // If we're splatting the lower half subvector of a full vector load into the
   // upper half, attempt to create a subvector broadcast.
   // TODO: Drop hasOneUse checks.
-  if (IdxVal == (OpVT.getVectorNumElements() / 2) &&
+  if ((int)IdxVal == (VecNumElts / 2) &&
       Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits()) &&
       (Vec.hasOneUse() || SubVec.hasOneUse())) {
     auto *VecLd = dyn_cast<LoadSDNode>(Vec);

From 2e54b4f9ea3fb7e83b5fc5fe19235ea82f0d4549 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 31 Mar 2025 13:58:52 +0100
Subject: [PATCH 0085/1029] [ARM] Silence signed comparison warning. NFC

After f4ec179bf5295f92aa0346392a58fad54f9b458e, AbsImm is no longer
signed and cannot be < 0.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index d2f9ec982ae01..a4713311e2b3e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -19696,7 +19696,7 @@ bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
   if (Subtarget->isThumb2())
     return ARM_AM::getT2SOImmVal(AbsImm) != -1;
   // Thumb1 only has 8-bit unsigned immediate.
-  return AbsImm >= 0 && AbsImm <= 255;
+  return AbsImm <= 255;
 }

 // Return false to prevent folding

From b9b9addae6c79c72394c8260106f8ba5281e9747 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 31 Mar 2025 14:22:49 +0100
Subject: [PATCH 0086/1029] [AArch64] Add bitcast + extend tests.
NFC --- llvm/test/CodeGen/AArch64/bitcast-extend.ll | 273 ++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/bitcast-extend.ll diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll new file mode 100644 index 0000000000000..195c740022d10 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll @@ -0,0 +1,273 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +define <4 x i16> @z_i32_v4i16(i32 %x) { +; CHECK-SD-LABEL: z_i32_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: z_i32_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: ushll v0.8h, v2.8b, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %b = bitcast i32 %x to <4 x i8> + %e = zext <4 x i8> %b to <4 x i16> + ret <4 x i16> %e +} + +define <4 x i32> @z_i32_v4i32(i32 %x) { +; CHECK-SD-LABEL: z_i32_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: movi v1.2d, #0x0000ff000000ff +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: z_i32_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: uxtb w8, w8 +; CHECK-GI-NEXT: fmov w10, s2 +; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: uxtb w9, w9 +; CHECK-GI-NEXT: uxtb w10, w10 +; CHECK-GI-NEXT: uxtb w11, w11 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: ret + %b = bitcast i32 %x to <4 x i8> + %e = zext <4 x i8> %b to <4 x i32> + ret <4 x i32> %e +} + +define <4 x i64> @z_i32_v4i64(i32 %x) { +; CHECK-SD-LABEL: z_i32_v4i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: movi v1.2d, #0x000000000000ff +; CHECK-SD-NEXT: umov w8, v0.b[2] +; CHECK-SD-NEXT: umov w9, v0.b[0] +; CHECK-SD-NEXT: umov w10, v0.b[3] +; CHECK-SD-NEXT: umov w11, v0.b[1] +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: fmov s2, w8 +; CHECK-SD-NEXT: mov v0.s[1], w11 +; CHECK-SD-NEXT: mov v2.s[1], w10 +; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 +; CHECK-SD-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-SD-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: z_i32_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: ubfx x8, x8, #0, #8 +; CHECK-GI-NEXT: fmov 
w9, s1 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: ubfx x9, x9, #0, #8 +; CHECK-GI-NEXT: ubfx x8, x8, #0, #8 +; CHECK-GI-NEXT: mov v1.d[0], x9 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: ubfx x9, x9, #0, #8 +; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: ret + %b = bitcast i32 %x to <4 x i8> + %e = zext <4 x i8> %b to <4 x i64> + ret <4 x i64> %e +} + +define <4 x i16> @s_i32_v4i16(i32 %x) { +; CHECK-SD-LABEL: s_i32_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: s_i32_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov b1, v0.b[1] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] +; CHECK-GI-NEXT: mov b3, v0.b[2] +; CHECK-GI-NEXT: mov b0, v0.b[3] +; CHECK-GI-NEXT: mov v2.b[1], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: sshll v0.8h, v2.8b, #0 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %b = bitcast i32 %x to <4 x i8> + %e = sext <4 x i8> %b to <4 x i16> + ret <4 x i16> %e +} + +define <4 x i32> @s_i32_v4i32(i32 %x) { +; CHECK-SD-LABEL: s_i32_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: s_i32_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: fmov w10, s2 +; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: sxtb w9, w9 +; CHECK-GI-NEXT: sxtb w10, w10 +; CHECK-GI-NEXT: sxtb w11, w11 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.h[1], w10 +; CHECK-GI-NEXT: mov v1.h[1], w11 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: ret + %b = bitcast i32 %x to <4 x i8> + %e = sext <4 x i8> %b to <4 x i32> + ret <4 x i32> %e +} + +define <4 x i64> @s_i32_v4i64(i32 %x) { +; CHECK-SD-LABEL: s_i32_v4i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: umov w8, v0.b[2] +; CHECK-SD-NEXT: umov w9, v0.b[0] +; CHECK-SD-NEXT: umov w10, v0.b[3] +; CHECK-SD-NEXT: umov w11, v0.b[1] +; CHECK-SD-NEXT: fmov s0, w9 +; CHECK-SD-NEXT: fmov s1, w8 +; CHECK-SD-NEXT: mov v0.s[1], w11 +; CHECK-SD-NEXT: mov v1.s[1], w10 +; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-SD-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56 +; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56 +; CHECK-SD-NEXT: sshr v0.2d, v0.2d, #56 +; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #56 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: s_i32_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: mov b1, v0.b[2] +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov b2, v0.b[1] +; CHECK-GI-NEXT: mov b3, v0.b[3] +; CHECK-GI-NEXT: sxtb x8, w8 +; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: sxtb x9, w9 +; CHECK-GI-NEXT: sxtb x8, w8 +; CHECK-GI-NEXT: mov v1.d[0], x9 +; CHECK-GI-NEXT: fmov w9, s3 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; 
CHECK-GI-NEXT: sxtb x9, w9
+; CHECK-GI-NEXT: mov v1.d[1], x9
+; CHECK-GI-NEXT: ret
+ %b = bitcast i32 %x to <4 x i8>
+ %e = sext <4 x i8> %b to <4 x i64>
+ ret <4 x i64> %e
+}
+
+define void @extractbitcastext(i32 %bytes, ptr %output) {
+; CHECK-LABEL: extractbitcastext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: ushll v1.2d, v0.2s, #0
+; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-NEXT: stp q1, q0, [x1]
+; CHECK-NEXT: ret
+ %conv = sext i32 %bytes to i64
+ %b0 = bitcast i64 %conv to <8 x i8>
+ %b1 = zext <8 x i8> %b0 to <8 x i16>
+ %shuffle.i = shufflevector <8 x i16> %b1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %z2 = zext nneg <4 x i16> %shuffle.i to <4 x i32>
+ %shuffle.i23 = shufflevector <4 x i32> %z2, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+ %z3 = zext nneg <2 x i32> %shuffle.i23 to <2 x i64>
+ %shuffle.i24 = shufflevector <4 x i32> %z2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %z4 = zext nneg <2 x i32> %shuffle.i24 to <2 x i64>
+ store <2 x i64> %z3, ptr %output, align 8
+ %add.ptr = getelementptr inbounds nuw i8, ptr %output, i64 16
+ store <2 x i64> %z4, ptr %add.ptr, align 8
+ ret void
+}
+
+define void @extractbitcastext_s(i32 %bytes, ptr %output) {
+; CHECK-LABEL: extractbitcastext_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sxtw x8, w0
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: sshll v1.2d, v0.2s, #0
+; CHECK-NEXT: sshll2 v0.2d, v0.4s, #0
+; CHECK-NEXT: stp q1, q0, [x1]
+; CHECK-NEXT: ret
+ %conv = sext i32 %bytes to i64
+ %b0 = bitcast i64 %conv to <8 x i8>
+ %b1 = sext <8 x i8> %b0 to <8 x i16>
+ %shuffle.i = shufflevector <8 x i16> %b1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %s2 = sext <4 x i16> %shuffle.i to <4 x i32>
+ %shuffle.i23 = shufflevector <4 x i32> %s2, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
+ %s3 = sext <2 x i32> %shuffle.i23 to <2 x i64>
+ %shuffle.i24 = shufflevector <4 x i32> %s2, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+ %s4 = sext <2 x i32> %shuffle.i24 to <2 x i64>
+ store <2 x i64> %s3, ptr %output, align 8
+ %add.ptr = getelementptr inbounds nuw i8, ptr %output, i64 16
+ store <2 x i64> %s4, ptr %add.ptr, align 8
+ ret void
+}
+
+

From 9b32f3d0966723f443328d8b49c149bde6725899 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 31 Mar 2025 14:32:43 +0100
Subject: [PATCH 0087/1029] [DAG] visitEXTRACT_SUBVECTOR - don't return early
 on failure of EXTRACT_SUBVECTOR(INSERT_SUBVECTOR()) -> BITCAST fold (#133695)

Always allow later folds to try to match as well.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  36 +-
 .../AMDGPU/load-local-redundant-copies.ll     |  45 +--
 .../vector-interleaved-store-i16-stride-5.ll  |  12 +-
 .../vector-interleaved-store-i16-stride-7.ll  | 200 +++++----
 .../vector-interleaved-store-i8-stride-5.ll   |  24 +-
 .../vector-interleaved-store-i8-stride-7.ll   | 380 +++++++++---------
 6 files changed, 335 insertions(+), 362 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4487b9d510cc7..dc5c5f38e3bd8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25532,26 +25532,24 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
       // Handle only simple case where vector being inserted and vector
       // being extracted are of same size.
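      // A rough sketch of this hunk's intent, using only names visible in the
      // diff below: the old code hit `return SDValue()` as soon as NVT and
      // SmallVT differed in size, abandoning the whole combine; the new code
      // nests the fold under `if (NVT.bitsEq(SmallVT))`, so a size mismatch
      // now falls through to the narrowExtractedVectorBinOp fold at the
      // bottom of the hunk instead of giving up early.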
EVT SmallVT = V.getOperand(1).getValueType(); - if (!NVT.bitsEq(SmallVT)) - return SDValue(); - - // Combine: - // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) - // Into: - // indices are equal or bit offsets are equal => V1 - // otherwise => (extract_subvec V1, ExtIdx) - uint64_t InsIdx = V.getConstantOperandVal(2); - if (InsIdx * SmallVT.getScalarSizeInBits() == - ExtIdx * NVT.getScalarSizeInBits()) { - if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT)) - return SDValue(); - - return DAG.getBitcast(NVT, V.getOperand(1)); + if (NVT.bitsEq(SmallVT)) { + // Combine: + // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) + // Into: + // indices are equal or bit offsets are equal => V1 + // otherwise => (extract_subvec V1, ExtIdx) + uint64_t InsIdx = V.getConstantOperandVal(2); + if (InsIdx * SmallVT.getScalarSizeInBits() == + ExtIdx * NVT.getScalarSizeInBits()) { + if (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT)) + return DAG.getBitcast(NVT, V.getOperand(1)); + } else { + return DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, NVT, + DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), + N->getOperand(1)); + } } - return DAG.getNode( - ISD::EXTRACT_SUBVECTOR, DL, NVT, - DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)), - N->getOperand(1)); } if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations)) diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll index 64d7f93760fd5..a6ce512164b89 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -66,39 +66,38 @@ define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, ptr addrspace(8) ; CHECK-NEXT: s_mov_b32 s6, s4 ; CHECK-NEXT: s_mov_b32 s5, s3 ; CHECK-NEXT: s_mov_b32 s4, s2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, 20, v1 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 16, v1 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 12, v1 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 8, v1 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 4, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 12, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 8, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 20, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1 ; CHECK-NEXT: v_mov_b32_e32 v9, s0 -; CHECK-NEXT: v_add_i32_e32 v10, vcc, 20, v2 -; CHECK-NEXT: v_add_i32_e32 v11, vcc, 16, v2 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 12, v2 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, 8, v2 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v7, v3 -; CHECK-NEXT: ds_read_b32 v6, v4 -; CHECK-NEXT: ds_read_b32 v5, v5 -; CHECK-NEXT: ds_read_b32 v4, v8 -; CHECK-NEXT: ds_read_b32 v8, v0 +; CHECK-NEXT: ds_read_b32 v5, v3 +; CHECK-NEXT: ds_read_b32 v4, v4 +; CHECK-NEXT: ds_read_b32 v8, v6 +; CHECK-NEXT: ds_read_b32 v7, v7 +; CHECK-NEXT: ds_read_b32 v6, v0 ; CHECK-NEXT: ds_read_b32 v3, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2 -; CHECK-NEXT: v_add_i32_e32 v13, vcc, 4, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 4, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 20, v2 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, 16, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc ; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc -; 
CHECK-NEXT: ds_read_b32 v0, v11 ; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: ds_read_b32 v5, v1 -; CHECK-NEXT: ds_read_b32 v4, v12 -; CHECK-NEXT: ds_read_b32 v3, v13 +; CHECK-NEXT: ds_read_b32 v4, v11 +; CHECK-NEXT: ds_read_b32 v3, v0 +; CHECK-NEXT: ds_read_b32 v1, v1 +; CHECK-NEXT: ds_read_b32 v0, v12 +; CHECK-NEXT: ds_read_b32 v5, v10 ; CHECK-NEXT: ds_read_b32 v2, v2 -; CHECK-NEXT: ds_read_b32 v1, v10 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) +; CHECK-NEXT: s_waitcnt lgkmcnt(2) ; CHECK-NEXT: exp mrt0 off, off, off, off -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc ; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc ; CHECK-NEXT: s_endpgm %load1 = load <6 x float>, ptr addrspace(3) %arg5, align 4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 0df63422b5d84..e4fa594f3dd72 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -449,9 +449,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovq %xmm0, 32(%r9) -; AVX512-NEXT: vmovdqa %ymm1, (%r9) +; AVX512-NEXT: vmovdqa %ymm2, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -476,9 +475,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovq %xmm0, 32(%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -504,9 +502,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovq %xmm0, 32(%r9) -; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9) +; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -531,9 +528,8 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, 32(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9) +; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index bc08f57e5faac..e4e013446f7a5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -1380,29 +1380,28 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) -; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] +; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2)) +; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm2 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512-NEXT: vpsrld $16, %xmm6, %xmm1 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7] +; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] -; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1)) -; AVX512-NEXT: 
vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512-NEXT: vmovdqa %ymm8, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1420,6 +1419,37 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0] +; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm11 +; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u],zero,zero,zero,zero,ymm7[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[20,21,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6)) ; AVX512-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1430,41 +1460,9 @@ define void @store_i16_stride7_vf8(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1)) -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm4 -; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & mem) | zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1505,29 +1503,28 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 ; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) -; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] +; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2)) +; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm2 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512DQ-NEXT: vpsrld $16, %xmm6, %xmm1 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7] +; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1)) -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa %ymm8, 64(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1545,6 +1542,37 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] 
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm11 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u],zero,zero,zero,zero,ymm7[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[20,21,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6)) ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1555,41 +1583,9 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermq 
{{.*#+}} ymm2 = ymm6[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & mem) | zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 75f8469c266b1..39f8a93a7b77a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -725,9 +725,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 ; AVX512-NEXT: vmovq %xmm1, 32(%r9) -; AVX512-NEXT: 
vmovdqa %ymm0, (%r9) +; AVX512-NEXT: vmovdqa %ymm3, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -756,9 +755,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vmovd %eax, %xmm1 ; AVX512-FCP-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovq %xmm1, 32(%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -787,9 +785,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vmovd %eax, %xmm1 ; AVX512DQ-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovq %xmm1, 32(%r9) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%r9) +; AVX512DQ-NEXT: vmovdqa %ymm3, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -818,9 +815,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vmovd %eax, %xmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 ^ (mem & (xmm1 ^ xmm0)) -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, 32(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -852,9 +848,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: movw $132, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovq %xmm0, 32(%r9) -; AVX512BW-NEXT: vmovdqa %ymm1, (%r9) +; AVX512BW-NEXT: vmovdqa %ymm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -886,9 +881,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FCP-NEXT: movw $132, %ax ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovq %xmm0, 32(%r9) -; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -920,9 +914,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: movw $132, %ax ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovq %xmm0, 32(%r9) -; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%r9) +; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -954,9 +947,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-FCP-NEXT: movw $132, %ax ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 32(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 328d55ca8d627..b82e663528398 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -2098,10 +2098,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2164,10 +2163,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2235,10 +2233,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512DQ-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2301,10 +2298,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2314,80 +2310,79 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm4 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm0 -; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] 
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,6,7,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] +; AVX512BW-NEXT: vpor %ymm5, %ymm4, %ymm4 +; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28] +; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; 
AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,7,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] ; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm1, %ymm3, %ymm3 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm6 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,3,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[3,1,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero +; 
AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero,zero,ymm8[25] -; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512BW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm1 -; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 -; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa %xmm5, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} +; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa %xmm6, 96(%rax) +; 
AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2417,43 +2412,42 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm5, %xmm4 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[1,3,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm7 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] ; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6] -; AVX512BW-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6] +; AVX512BW-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] ; AVX512BW-FCP-NEXT: vpermd %zmm3, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = 
zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zero,zero,zmm5[1,9],zero,zero,zero,zero,zero,zmm5[18,26],zero,zero,zero,zero,zero,zmm5[19,27],zero,zero,zero,zero,zero,zmm5[20,28],zero,zero,zero,zero,zero,zmm5[33,37],zero,zero,zero,zero,zero,zmm5[34,38],zero,zero,zero,zero,zero,zmm5[51,55],zero,zero,zero,zero,zero,zmm5[56,60],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm2 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2463,80 +2457,79 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm0 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} 
xmm5 = xmm1[0,1,2,3,6,7,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,3,2] +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28] +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 
%zmm4, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,7,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] ; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm5, %xmm3 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm2[1,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm5 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm6 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,3,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[3,19],zero,zero,zero,zero,zero,ymm3[28,20],zero,zero,zero,zero,zero,ymm3[29,21],zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm4[3,1,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm3 {%k1} -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero,zero,ymm8[25] -; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] -; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm6, %ymm4 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm1 -; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: 
vmovdqu8 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa %xmm5, 96(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa %xmm6, 96(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2566,43 +2559,42 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm5, %xmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] -; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] ; AVX512DQ-BW-FCP-NEXT: vpermd %zmm3, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = 
zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zmm1[33,37],zero,zero,zero,zero,zero,zmm1[34,38],zero,zero,zero,zero,zero,zmm1[51,55],zero,zero,zero,zero,zero,zmm1[56,60],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zmm5[0,8],zero,zero,zero,zero,zero,zmm5[1,9],zero,zero,zero,zero,zero,zmm5[18,26],zero,zero,zero,zero,zero,zmm5[19,27],zero,zero,zero,zero,zero,zmm5[20,28],zero,zero,zero,zero,zero,zmm5[33,37],zero,zero,zero,zero,zero,zmm5[34,38],zero,zero,zero,zero,zero,zmm5[51,55],zero,zero,zero,zero,zero,zmm5[56,60],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 From cb54026d92191e347629265d4082f5cc2cc28020 Mon Sep 17 00:00:00 2001 From: sstwcw Date: Mon, 31 Mar 2025 13:53:23 +0000 Subject: [PATCH 0088/1029] [clang-format] Recognize wait fork in Verilog (#132042) before ```Verilog wait fork ; wait fork ; wait fork ; ``` after ```Verilog wait fork; wait fork; wait fork; ``` The `wait fork` statement should 
not start a block. Previously the formatter treated the `fork` part as the start of a new block. Now the problem is fixed. --- clang/lib/Format/FormatToken.h | 117 ++++++++----------- clang/unittests/Format/FormatTestVerilog.cpp | 2 + 2 files changed, 53 insertions(+), 66 deletions(-) diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 3808872d227a9..a5c2388bb143d 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -1243,6 +1243,7 @@ struct AdditionalKeywords { kw_unique0 = &IdentTable.get("unique0"); kw_uwire = &IdentTable.get("uwire"); kw_vectored = &IdentTable.get("vectored"); + kw_wait = &IdentTable.get("wait"); kw_wand = &IdentTable.get("wand"); kw_weak0 = &IdentTable.get("weak0"); kw_weak1 = &IdentTable.get("weak1"); @@ -1299,70 +1300,49 @@ struct AdditionalKeywords { // treatment like `showcancelled` or they should be treated as identifiers // like `int` and `logic`. VerilogExtraKeywords = std::unordered_set( - {kw_always, kw_always_comb, - kw_always_ff, kw_always_latch, - kw_assert, kw_assign, - kw_assume, kw_automatic, - kw_before, kw_begin, - kw_bins, kw_binsof, - kw_casex, kw_casez, - kw_celldefine, kw_checker, - kw_clocking, kw_constraint, - kw_cover, kw_covergroup, - kw_coverpoint, kw_disable, - kw_dist, kw_edge, - kw_end, kw_endcase, - kw_endchecker, kw_endclass, - kw_endclocking, kw_endfunction, - kw_endgenerate, kw_endgroup, - kw_endinterface, kw_endmodule, - kw_endpackage, kw_endprimitive, - kw_endprogram, kw_endproperty, - kw_endsequence, kw_endspecify, - kw_endtable, kw_endtask, - kw_extends, kw_final, - kw_foreach, kw_forever, - kw_fork, kw_function, - kw_generate, kw_highz0, - kw_highz1, kw_iff, - kw_ifnone, kw_ignore_bins, - kw_illegal_bins, kw_implements, - kw_import, kw_initial, - kw_inout, kw_input, - kw_inside, kw_interconnect, - kw_interface, kw_intersect, - kw_join, kw_join_any, - kw_join_none, kw_large, - kw_let, kw_local, - kw_localparam, kw_macromodule, - kw_matches, kw_medium, - kw_negedge, kw_output, - kw_package, kw_packed, - kw_parameter, kw_posedge, - kw_primitive, kw_priority, - kw_program, kw_property, - kw_pull0, kw_pull1, - kw_pure, kw_rand, - kw_randc, kw_randcase, - kw_randsequence, kw_ref, - kw_repeat, kw_sample, - kw_scalared, kw_sequence, - kw_small, kw_soft, - kw_solve, kw_specify, - kw_specparam, kw_strong0, - kw_strong1, kw_supply0, - kw_supply1, kw_table, - kw_tagged, kw_task, - kw_tri, kw_tri0, - kw_tri1, kw_triand, - kw_trior, kw_trireg, - kw_unique, kw_unique0, - kw_uwire, kw_var, - kw_vectored, kw_wand, - kw_weak0, kw_weak1, - kw_wildcard, kw_wire, - kw_with, kw_wor, - kw_verilogHash, kw_verilogHashHash}); + {kw_always, kw_always_comb, kw_always_ff, + kw_always_latch, kw_assert, kw_assign, + kw_assume, kw_automatic, kw_before, + kw_begin, kw_bins, kw_binsof, + kw_casex, kw_casez, kw_celldefine, + kw_checker, kw_clocking, kw_constraint, + kw_cover, kw_covergroup, kw_coverpoint, + kw_disable, kw_dist, kw_edge, + kw_end, kw_endcase, kw_endchecker, + kw_endclass, kw_endclocking, kw_endfunction, + kw_endgenerate, kw_endgroup, kw_endinterface, + kw_endmodule, kw_endpackage, kw_endprimitive, + kw_endprogram, kw_endproperty, kw_endsequence, + kw_endspecify, kw_endtable, kw_endtask, + kw_extends, kw_final, kw_foreach, + kw_forever, kw_fork, kw_function, + kw_generate, kw_highz0, kw_highz1, + kw_iff, kw_ifnone, kw_ignore_bins, + kw_illegal_bins, kw_implements, kw_import, + kw_initial, kw_inout, kw_input, + kw_inside, kw_interconnect, kw_interface, + kw_intersect, kw_join, kw_join_any, 
+         kw_join_none, kw_large, kw_let,
+         kw_local, kw_localparam, kw_macromodule,
+         kw_matches, kw_medium, kw_negedge,
+         kw_output, kw_package, kw_packed,
+         kw_parameter, kw_posedge, kw_primitive,
+         kw_priority, kw_program, kw_property,
+         kw_pull0, kw_pull1, kw_pure,
+         kw_rand, kw_randc, kw_randcase,
+         kw_randsequence, kw_ref, kw_repeat,
+         kw_sample, kw_scalared, kw_sequence,
+         kw_small, kw_soft, kw_solve,
+         kw_specify, kw_specparam, kw_strong0,
+         kw_strong1, kw_supply0, kw_supply1,
+         kw_table, kw_tagged, kw_task,
+         kw_tri, kw_tri0, kw_tri1,
+         kw_triand, kw_trior, kw_trireg,
+         kw_unique, kw_unique0, kw_uwire,
+         kw_var, kw_vectored, kw_wait,
+         kw_wand, kw_weak0, kw_weak1,
+         kw_wildcard, kw_wire, kw_with,
+         kw_wor, kw_verilogHash, kw_verilogHashHash});

    TableGenExtraKeywords = std::unordered_set<IdentifierInfo *>({
        kw_assert,
@@ -1614,6 +1594,7 @@ struct AdditionalKeywords {
   IdentifierInfo *kw_unique0;
   IdentifierInfo *kw_uwire;
   IdentifierInfo *kw_vectored;
+  IdentifierInfo *kw_wait;
   IdentifierInfo *kw_wand;
   IdentifierInfo *kw_weak0;
   IdentifierInfo *kw_weak1;
@@ -1849,8 +1830,12 @@ struct AdditionalKeywords {
   /// Returns whether \p Tok is a Verilog keyword that opens a block.
   bool isVerilogBegin(const FormatToken &Tok) const {
     // `table` is not included since it needs to be treated specially.
-    return !Tok.endsSequence(kw_fork, kw_disable) &&
-           Tok.isOneOf(kw_begin, kw_fork, kw_generate, kw_specify);
+    if (Tok.isOneOf(kw_begin, kw_generate, kw_specify))
+      return true;
+    if (Tok.isNot(kw_fork))
+      return false;
+    const auto *Prev = Tok.getPreviousNonComment();
+    return !(Prev && Prev->isOneOf(kw_disable, kw_wait));
   }

   /// Returns whether \p Tok is a Verilog keyword that closes a block.
diff --git a/clang/unittests/Format/FormatTestVerilog.cpp b/clang/unittests/Format/FormatTestVerilog.cpp
index e4a14ff754d1a..5c50ae6fcfac8 100644
--- a/clang/unittests/Format/FormatTestVerilog.cpp
+++ b/clang/unittests/Format/FormatTestVerilog.cpp
@@ -160,6 +160,8 @@ TEST_F(FormatTestVerilog, Block) {
   // Test that 'disable fork' and 'rand join' don't get mistaken as blocks.
   verifyFormat("disable fork;\n"
                "x = x;");
+  verifyFormat("wait fork;\n"
+               "x = x;");
   verifyFormat("rand join x x;\n"
                "x = x;");
   // The begin keyword should not be indented if it is too long to fit on the

From ab7cee8a0ecf29fdb47c64c8d431a694d63390d2 Mon Sep 17 00:00:00 2001
From: sstwcw
Date: Tue, 25 Mar 2025 14:10:08 +0000
Subject: [PATCH 0089/1029] [clang-format] Handle C++ keywords in other
 languages better (#132941)

There is some code to make sure that C++ keywords that are identifiers
in the other languages are not treated as keywords. Right now, the kind
is set to identifier, and the identifier info is cleared. The latter is
probably so that the code for identifying C++ structures does not
recognize those structures by mistake when formatting a language that
does not have those structures. But we did not find an instance where a
language can actually contain such a sequence of tokens and the code
then tries to parse the structure as if it were C++, using the
identifier info instead of the token kind, without checking the
language setting.

However, there are places where the code checks whether the identifier
info field is null or not. They are places where an identifier and a
keyword are treated the same way, for example the name of a function in
JavaScript. This patch removes the lines that clear the identifier
info. This way, a C++ keyword gets treated in the same way as an
identifier in those places.
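A minimal sketch of the resulting lexer logic (paraphrased from the diff
below, not the exact clang-format source):

```C++
// Sketch of the JavaScript branch after this patch: the token kind still
// becomes tok::identifier, but the IdentifierInfo is intentionally kept,
// so code paths that accept "identifier or keyword" (such as JavaScript
// function names) still recognize these tokens.
if (Style.isJavaScript() &&
    FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator)) {
  FormatTok->Tok.setKind(tok::identifier);
  // Previously also: FormatTok->Tok.setIdentifierInfo(nullptr);
}
```

The before/after examples show the effect.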
JavaScript New ```JavaScript async function union( myparamnameiswaytooloooong) { } ``` Old ```JavaScript async function union( myparamnameiswaytooloooong) { } ``` Java New ```Java enum union { ABC, CDE } ``` Old ```Java enum union { ABC, CDE } ``` --- clang/lib/Format/FormatTokenLexer.cpp | 3 -- clang/unittests/Format/FormatTestJS.cpp | 42 +++++++++++++++-------- clang/unittests/Format/FormatTestJava.cpp | 2 ++ 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index eed54a11684b5..014b10b206d90 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -1306,15 +1306,12 @@ FormatToken *FormatTokenLexer::getNextToken() { FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete, tok::kw_operator)) { FormatTok->Tok.setKind(tok::identifier); - FormatTok->Tok.setIdentifierInfo(nullptr); } else if (Style.isJavaScript() && FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator)) { FormatTok->Tok.setKind(tok::identifier); - FormatTok->Tok.setIdentifierInfo(nullptr); } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) { FormatTok->Tok.setKind(tok::identifier); - FormatTok->Tok.setIdentifierInfo(nullptr); } } else if (FormatTok->is(tok::greatergreater)) { FormatTok->Tok.setKind(tok::greater); diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp index 78c9f887a159b..3dae67fbcdfcb 100644 --- a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -828,12 +828,18 @@ TEST_F(FormatTestJS, AsyncFunctions) { "} "); // clang-format must not insert breaks between async and function, otherwise // automatic semicolon insertion may trigger (in particular in a class body). + auto Style = getGoogleJSStyleWithColumns(10); verifyFormat("async function\n" "hello(\n" " myparamnameiswaytooloooong) {\n" "}", "async function hello(myparamnameiswaytooloooong) {}", - getGoogleJSStyleWithColumns(10)); + Style); + verifyFormat("async function\n" + "union(\n" + " myparamnameiswaytooloooong) {\n" + "}", + Style); verifyFormat("class C {\n" " async hello(\n" " myparamnameiswaytooloooong) {\n" @@ -841,7 +847,7 @@ TEST_F(FormatTestJS, AsyncFunctions) { "}", "class C {\n" " async hello(myparamnameiswaytooloooong) {} }", - getGoogleJSStyleWithColumns(10)); + Style); verifyFormat("async function* f() {\n" " yield fetch(x);\n" "}"); @@ -1338,15 +1344,16 @@ TEST_F(FormatTestJS, WrapRespectsAutomaticSemicolonInsertion) { // The following statements must not wrap, as otherwise the program meaning // would change due to automatic semicolon insertion. // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.9.1. - verifyFormat("return aaaaa;", getGoogleJSStyleWithColumns(10)); - verifyFormat("yield aaaaa;", getGoogleJSStyleWithColumns(10)); - verifyFormat("return /* hello! */ aaaaa;", getGoogleJSStyleWithColumns(10)); - verifyFormat("continue aaaaa;", getGoogleJSStyleWithColumns(10)); - verifyFormat("continue /* hello! */ aaaaa;", getGoogleJSStyleWithColumns(10)); - verifyFormat("break aaaaa;", getGoogleJSStyleWithColumns(10)); - verifyFormat("throw aaaaa;", getGoogleJSStyleWithColumns(10)); - verifyFormat("aaaaaaaaa++;", getGoogleJSStyleWithColumns(10)); - verifyFormat("aaaaaaaaa--;", getGoogleJSStyleWithColumns(10)); + auto Style =getGoogleJSStyleWithColumns(10); + verifyFormat("return aaaaa;", Style); + verifyFormat("yield aaaaa;", Style); + verifyFormat("return /* hello! 
*/ aaaaa;", Style); + verifyFormat("continue aaaaa;", Style); + verifyFormat("continue /* hello! */ aaaaa;", Style); + verifyFormat("break aaaaa;", Style); + verifyFormat("throw aaaaa;", Style); + verifyFormat("aaaaaaaaa++;", Style); + verifyFormat("aaaaaaaaa--;", Style); verifyFormat("return [\n" " aaa\n" "];", @@ -1366,12 +1373,13 @@ TEST_F(FormatTestJS, WrapRespectsAutomaticSemicolonInsertion) { // Ideally the foo() bit should be indented relative to the async function(). verifyFormat("async function\n" "foo() {}", - getGoogleJSStyleWithColumns(10)); - verifyFormat("await theReckoning;", getGoogleJSStyleWithColumns(10)); - verifyFormat("some['a']['b']", getGoogleJSStyleWithColumns(10)); + Style); + verifyFormat("await theReckoning;", Style); + verifyFormat("some['a']['b']", Style); + verifyFormat("union['a']['b']", Style); verifyFormat("x = (a['a']\n" " ['b']);", - getGoogleJSStyleWithColumns(10)); + Style); verifyFormat("function f() {\n" " return foo.bar(\n" " (param): param is {\n" @@ -2500,6 +2508,10 @@ TEST_F(FormatTestJS, NonNullAssertionOperator) { TEST_F(FormatTestJS, CppKeywords) { // Make sure we don't mess stuff up because of C++ keywords. verifyFormat("return operator && (aa);"); + verifyFormat("enum operator {\n" + " A = 1,\n" + " B\n" + "}"); // .. or QT ones. verifyFormat("const slots: Slot[];"); // use the "!" assertion operator to validate that clang-format understands diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp index 33998bc7ff858..e01c1d6d7e684 100644 --- a/clang/unittests/Format/FormatTestJava.cpp +++ b/clang/unittests/Format/FormatTestJava.cpp @@ -158,6 +158,8 @@ TEST_F(FormatTestJava, AnonymousClasses) { TEST_F(FormatTestJava, EnumDeclarations) { verifyFormat("enum SomeThing { ABC, CDE }"); + // A C++ keyword should not mess things up. 
+ verifyFormat("enum union { ABC, CDE }"); verifyFormat("enum SomeThing {\n" " ABC,\n" " CDE,\n" From 0794d5cfba4b78b1bf7980a5f9434382a697df23 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 31 Mar 2025 14:56:29 +0100 Subject: [PATCH 0090/1029] [clang][Sema] Fix typo in 'offsetof' diagnostics (#133448) Before: ``` offset of on non-POD type ``` After: ``` offsetof on non-POD type ``` --------- Co-authored-by: Aaron Ballman --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 4 ++-- clang/test/SemaCXX/ms_struct.cpp | 4 ++-- clang/test/SemaCXX/offsetof-0x.cpp | 2 +- clang/test/SemaCXX/offsetof.cpp | 14 +++++++------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 1e900437d41ce..b03926db8170a 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7031,10 +7031,10 @@ def err_offsetof_incomplete_type : Error< def err_offsetof_record_type : Error< "offsetof requires struct, union, or class type, %0 invalid">; def err_offsetof_array_type : Error<"offsetof requires array type, %0 invalid">; -def ext_offsetof_non_pod_type : ExtWarn<"offset of on non-POD type %0">, +def ext_offsetof_non_pod_type : ExtWarn<"'offsetof' on non-POD type %0">, InGroup; def ext_offsetof_non_standardlayout_type : ExtWarn< - "offset of on non-standard-layout type %0">, InGroup; + "'offsetof' on non-standard-layout type %0">, InGroup; def err_offsetof_bitfield : Error<"cannot compute offset of bit-field %0">; def err_offsetof_field_of_virtual_base : Error< "invalid application of 'offsetof' to a field of a virtual base">; diff --git a/clang/test/SemaCXX/ms_struct.cpp b/clang/test/SemaCXX/ms_struct.cpp index 995e424d1f876..409350f2606a9 100644 --- a/clang/test/SemaCXX/ms_struct.cpp +++ b/clang/test/SemaCXX/ms_struct.cpp @@ -25,7 +25,7 @@ struct B : public A { static_assert(__builtin_offsetof(B, d) == 12, "We can't allocate the bitfield into the padding under ms_struct"); -// expected-warning@-2 {{offset of on non-standard-layout type 'B'}} +// expected-warning@-2 {{'offsetof' on non-standard-layout type 'B'}} struct C { #ifdef TEST_FOR_ERROR @@ -39,5 +39,5 @@ struct C { static_assert(__builtin_offsetof(C, n) == 8, "long long field in ms_struct should be 8-byte aligned"); -// expected-warning@-2 {{offset of on non-standard-layout type 'C'}} +// expected-warning@-2 {{'offsetof' on non-standard-layout type 'C'}} diff --git a/clang/test/SemaCXX/offsetof-0x.cpp b/clang/test/SemaCXX/offsetof-0x.cpp index a3fe2fbbad72d..d8d417b6885c4 100644 --- a/clang/test/SemaCXX/offsetof-0x.cpp +++ b/clang/test/SemaCXX/offsetof-0x.cpp @@ -11,7 +11,7 @@ struct P { }; void f() { - int i = __builtin_offsetof(P, fieldThatPointsToANonPODType.m); // expected-warning{{offset of on non-standard-layout type 'P'}} + int i = __builtin_offsetof(P, fieldThatPointsToANonPODType.m); // expected-warning{{'offsetof' on non-standard-layout type 'P'}} } struct StandardLayout { diff --git a/clang/test/SemaCXX/offsetof.cpp b/clang/test/SemaCXX/offsetof.cpp index 1722b91fafc86..367a907f03775 100644 --- a/clang/test/SemaCXX/offsetof.cpp +++ b/clang/test/SemaCXX/offsetof.cpp @@ -11,12 +11,12 @@ struct P { }; void f() { - int i = __builtin_offsetof(P, fieldThatPointsToANonPODType.m); // expected-warning{{offset of on non-POD type 'P'}} + int i = __builtin_offsetof(P, fieldThatPointsToANonPODType.m); // expected-warning{{'offsetof' on non-POD type 'P'}} } struct Base { int x; }; 
struct Derived : Base { int y; }; -int o = __builtin_offsetof(Derived, x); // expected-warning{{offset of on non-POD type}} +int o = __builtin_offsetof(Derived, x); // expected-warning{{'offsetof' on non-POD type}} const int o2 = sizeof(__builtin_offsetof(Derived, x)); @@ -51,9 +51,9 @@ struct Derived2 : public Base1, public Base2 { int z; }; -int derived1[__builtin_offsetof(Derived2, x) == 0? 1 : -1]; // expected-warning{{offset of on non-POD type 'Derived2'}} -int derived2[__builtin_offsetof(Derived2, y) == 4? 1 : -1]; // expected-warning{{offset of on non-POD type 'Derived2'}} -int derived3[__builtin_offsetof(Derived2, z) == 8? 1 : -1]; // expected-warning{{offset of on non-POD type 'Derived2'}} +int derived1[__builtin_offsetof(Derived2, x) == 0? 1 : -1]; // expected-warning{{'offsetof' on non-POD type 'Derived2'}} +int derived2[__builtin_offsetof(Derived2, y) == 4? 1 : -1]; // expected-warning{{'offsetof' on non-POD type 'Derived2'}} +int derived3[__builtin_offsetof(Derived2, z) == 8? 1 : -1]; // expected-warning{{'offsetof' on non-POD type 'Derived2'}} // offsetof referring to anonymous struct in base. // PR7769 @@ -66,7 +66,7 @@ struct foo { struct bar : public foo { }; -int anonstruct[__builtin_offsetof(bar, x) == 0 ? 1 : -1]; // expected-warning{{offset of on non-POD type 'bar'}} +int anonstruct[__builtin_offsetof(bar, x) == 0 ? 1 : -1]; // expected-warning{{'offsetof' on non-POD type 'bar'}} struct LtoRCheck { @@ -81,7 +81,7 @@ struct Base { int Field; }; struct Derived : virtual Base { - void Fun() { (void)__builtin_offsetof(Derived, Field); } // expected-warning {{offset of on non-POD type}} \ + void Fun() { (void)__builtin_offsetof(Derived, Field); } // expected-warning {{'offsetof' on non-POD type}} \ expected-error {{invalid application of 'offsetof' to a field of a virtual base}} }; } From 96efb21e8865e8f43192979991fdccf08f1a6da4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 31 Mar 2025 13:55:38 +0100 Subject: [PATCH 0091/1029] [X86] Add regression test for insert_subvector(x,extract_subvector(broadcast)) pattern identified in #133083 Infinite loop check --- .../CodeGen/X86/insert-subvector-broadcast.ll | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 llvm/test/CodeGen/X86/insert-subvector-broadcast.ll diff --git a/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll new file mode 100644 index 0000000000000..47cd752ef80a4 --- /dev/null +++ b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=skx | FileCheck %s + +define void @insert_subvector_broadcast_as_blend() { +; CHECK-LABEL: insert_subvector_broadcast_as_blend: +; CHECK: # %bb.0: +; CHECK-NEXT: movq (%rax), %rax +; CHECK-NEXT: incq %rax +; CHECK-NEXT: vpbroadcastq %rax, %zmm0 +; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm1 +; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 +; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k1 +; CHECK-NEXT: kunpckbw %k0, %k1, %k1 +; CHECK-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; CHECK-NEXT: vmovdqa %xmm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %load4 = load i64, ptr poison, align 32 + %add = add i64 %load4, 1 + %insertelement5 
= insertelement <16 x i64> zeroinitializer, i64 %add, i64 1
  %shufflevector = shufflevector <16 x i64> %insertelement5, <16 x i64> poison, <16 x i32>
  %icmp6 = icmp slt <16 x i64> %shufflevector,
  %shufflevector7 = shufflevector <16 x i1> poison, <16 x i1> %icmp6, <16 x i32>
  %zext = zext <16 x i1> %shufflevector7 to <16 x i8>
  store <16 x i8> %zext, ptr poison, align 32
  ret void
}

From 8d69e953b5a894c215b0cff7c6b0687b2b492318 Mon Sep 17 00:00:00 2001
From: Stefan Pintilie
Date: Mon, 31 Mar 2025 10:02:12 -0400
Subject: [PATCH 0092/1029] [RISCV] Add combine for shadd family of
 instructions. (#130829)

For example, in the following situation:

%6:gpr = SLLI %2:gpr, 2
%7:gpr = ADDI killed %6:gpr, 24
%8:gpr = ADD %0:gpr, %7:gpr

If we swap the two add instructions, we can merge the shift and add.
The final code will look something like this:

%7 = SH2ADD %0, %2
%8 = ADDI %7, 24
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  73 ++++++-
 .../CodeGen/RISCV/reassoc-shl-addi-add.ll     | 189 ++++++++++++++++++
 2 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 70ec57798db71..0dc62ef04ec0f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/ValueTypes.h"
@@ -79,6 +80,12 @@ static cl::opt
                  "use for creating a floating-point immediate value"),
         cl::init(2));

+static cl::opt<bool>
+    ReassocShlAddiAdd("reassoc-shl-addi-add", cl::Hidden,
+                      cl::desc("Swap add and addi in cases where the add may "
+                               "be combined with a shift"),
+                      cl::init(true));
+
 RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                                          const RISCVSubtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -14441,6 +14448,67 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT));
 }

+// Check if this SDValue is an add immediate that is fed by a shift of 1, 2,
+// or 3.
+static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other,
+                                     SelectionDAG &DAG) {
+  using namespace llvm::SDPatternMatch;
+
+  // Looking for a reg-reg add and not an addi.
+  if (isa<ConstantSDNode>(N->getOperand(1)))
+    return SDValue();
+
+  // Based on testing it seems that performance degrades if the ADDI has
+  // more than 2 uses.
+  if (AddI->use_size() > 2)
+    return SDValue();
+
+  APInt AddVal;
+  SDValue SHLVal;
+  if (!sd_match(AddI, m_Add(m_Value(SHLVal), m_ConstInt(AddVal))))
+    return SDValue();
+
+  APInt VShift;
+  if (!sd_match(SHLVal, m_BinOp(ISD::SHL, m_Value(), m_ConstInt(VShift))))
+    return SDValue();
+
+  if (VShift.slt(1) || VShift.sgt(3))
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  // The shift must be positive but the add can be signed.
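+  // Illustrative example (hypothetical values): with VShift == 2 and
+  // AddVal == 24, the DAG ((x << 2) + 24) + y is rebuilt as the node
+  // SH2ADD(x, y) == (x << 2) + y followed by ADDI(..., 24), which computes
+  // the same value with the shift folded into the add.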
+ uint64_t ShlConst = VShift.getZExtValue(); + int64_t AddConst = AddVal.getSExtValue(); + + SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), + DAG.getConstant(ShlConst, DL, VT), Other); + return DAG.getNode(ISD::ADD, DL, VT, SHADD, + DAG.getSignedConstant(AddConst, DL, VT)); +} + +// Optimize (add (add (shl x, c0), c1), y) -> +// (ADDI (SH*ADD y, x), c1), if c0 equals to [1|2|3]. +static SDValue combineShlAddIAdd(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + // Perform this optimization only in the zba extension. + if (!ReassocShlAddiAdd || !Subtarget.hasStdExtZba()) + return SDValue(); + + // Skip for vector types and larger types. + EVT VT = N->getValueType(0); + if (VT != Subtarget.getXLenVT()) + return SDValue(); + + SDValue AddI = N->getOperand(0); + SDValue Other = N->getOperand(1); + if (SDValue V = combineShlAddIAddImpl(N, AddI, Other, DAG)) + return V; + if (SDValue V = combineShlAddIAddImpl(N, Other, AddI, DAG)) + return V; + return SDValue(); +} + // Combine a constant select operand into its use: // // (and (select cond, -1, c), x) @@ -14682,9 +14750,12 @@ static SDValue performADDCombine(SDNode *N, return V; if (SDValue V = transformAddImmMulImm(N, DAG, Subtarget)) return V; - if (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer()) + if (!DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer()) { if (SDValue V = transformAddShlImm(N, DAG, Subtarget)) return V; + if (SDValue V = combineShlAddIAdd(N, DAG, Subtarget)) + return V; + } if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) diff --git a/llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll b/llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll new file mode 100644 index 0000000000000..88ab1c0c3eaef --- /dev/null +++ b/llvm/test/CodeGen/RISCV/reassoc-shl-addi-add.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32-unknown-elf -mattr=+zba %s -o - | FileCheck %s + +declare i32 @callee1(i32 noundef) +declare i32 @callee2(i32 noundef, i32 noundef) +declare i32 @callee(i32 noundef, i32 noundef, i32 noundef, i32 noundef) + +define void @t1(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 { +; CHECK-LABEL: t1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sh2add a2, a0, a2 +; CHECK-NEXT: sh2add a1, a0, a1 +; CHECK-NEXT: addi a1, a1, 45 +; CHECK-NEXT: addi a2, a2, 45 +; CHECK-NEXT: sh2add a3, a0, a3 +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: tail callee +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, 45 + %add1 = add nsw i32 %add, %b + %add3 = add nsw i32 %add, %c + %add5 = add nsw i32 %shl, %d + %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5) + ret void +} + +define void @t2(i32 noundef %a, i32 noundef %b, i32 noundef %c) #0 { +; CHECK-LABEL: t2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a5, a0, 42 +; CHECK-NEXT: add a4, a5, a1 +; CHECK-NEXT: add a3, a5, a2 +; CHECK-NEXT: mv a1, a5 +; CHECK-NEXT: mv a2, a4 +; CHECK-NEXT: tail callee +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, 42 + %add4 = add nsw i32 %add, %b + %add7 = add nsw i32 %add, %c + %call = tail call i32 @callee(i32 noundef %shl, i32 noundef %add, i32 noundef %add4, i32 noundef %add7) + ret void +} + +define void @t3(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e) #0 { +; CHECK-LABEL: 
t3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a5, a0, 42 +; CHECK-NEXT: add a0, a5, a1 +; CHECK-NEXT: add a1, a5, a2 +; CHECK-NEXT: add a2, a5, a3 +; CHECK-NEXT: add a3, a5, a4 +; CHECK-NEXT: tail callee +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, 42 + %add1 = add nsw i32 %add, %b + %add2 = add nsw i32 %add, %c + %add3 = add nsw i32 %add, %d + %add4 = add nsw i32 %add, %e + %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add2, i32 noundef %add3, i32 noundef %add4) + ret void +} + +define void @t4(i32 noundef %a, i32 noundef %b) #0 { +; CHECK-LABEL: t4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sh2add a0, a0, a1 +; CHECK-NEXT: addi a0, a0, 42 +; CHECK-NEXT: tail callee1 +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, 42 + %add1 = add nsw i32 %add, %b + %call = tail call i32 @callee1(i32 noundef %add1) + ret void +} + +define void @t5(i32 noundef %a, i32 noundef %b, i32 noundef %c) #0 { +; CHECK-LABEL: t5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sh2add a2, a0, a2 +; CHECK-NEXT: sh2add a0, a0, a1 +; CHECK-NEXT: addi a0, a0, 42 +; CHECK-NEXT: addi a1, a2, 42 +; CHECK-NEXT: tail callee2 +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, 42 + %add1 = add nsw i32 %add, %b + %add2 = add nsw i32 %add, %c + %call = tail call i32 @callee2(i32 noundef %add1, i32 noundef %add2) + ret void +} + +define void @t6(i32 noundef %a, i32 noundef %b) #0 { +; CHECK-LABEL: t6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slli a2, a0, 2 +; CHECK-NEXT: sh2add a0, a0, a1 +; CHECK-NEXT: addi a0, a0, 42 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: tail callee +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, 42 + %add1 = add nsw i32 %add, %b + %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %shl, i32 noundef %shl, i32 noundef %shl) + ret void +} + +define void @t7(i32 noundef %a, i32 noundef %b) #0 { +; CHECK-LABEL: t7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a2, a0, 42 +; CHECK-NEXT: add a0, a2, a1 +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: tail callee +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, 42 + %add1 = add nsw i32 %add, %b + %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add, i32 noundef %add, i32 noundef %add) + ret void +} + +define void @t8(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 { +; CHECK-LABEL: t8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sh3add a2, a0, a2 +; CHECK-NEXT: sh3add a1, a0, a1 +; CHECK-NEXT: lui a4, 1 +; CHECK-NEXT: addi a4, a4, 1307 +; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: sh3add a3, a0, a3 +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: tail callee +entry: + %shl = shl i32 %a, 3 + %add = add nsw i32 %shl, 5403 + %add1 = add nsw i32 %add, %b + %add3 = add nsw i32 %add, %c + %add5 = add nsw i32 %shl, %d + %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5) + ret void +} + +define void @t9(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 { +; CHECK-LABEL: t9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sh2add a2, a0, a2 +; CHECK-NEXT: sh2add a1, a0, a1 +; CHECK-NEXT: addi a1, a1, -42 +; CHECK-NEXT: addi a2, a2, -42 +; CHECK-NEXT: sh2add a3, a0, a3 +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: tail callee +entry: + %shl = shl i32 %a, 2 + %add = add nsw i32 %shl, -42 + %add1 = add nsw i32 %add, %b + %add3 = add nsw i32 %add, %c + %add5 = add nsw i32 %shl, %d + %call = 
tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5)
  ret void
}

define void @t10(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d) #0 {
; CHECK-LABEL: t10:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: tail callee
entry:
  %shl = shl i32 %a, -2
  %add = add nsw i32 %shl, 42
  %add1 = add nsw i32 %add, %b
  %add3 = add nsw i32 %add, %c
  %add5 = add nsw i32 %shl, %d
  %call = tail call i32 @callee(i32 noundef %add1, i32 noundef %add1, i32 noundef %add3, i32 noundef %add5)
  ret void
}

attributes #0 = { nounwind optsize }

From c7572ae213d215d54a10c8a03db75cc01f6291bd Mon Sep 17 00:00:00 2001
From: Phoebe Wang
Date: Mon, 31 Mar 2025 22:05:50 +0800
Subject: [PATCH 0093/1029] [X86][AVX10] Re-target mavx10.1 and emit warning
 for mavx10.x-256/512 and m[no-]evex512 (#132542)

The 256-bit maximum vector register size control was removed from the
AVX10 whitepaper, ref: https://cdrdv2.intel.com/v1/dl/getContent/784343

- Re-target m[no-]avx10.1 to enable AVX10.1 with 512-bit maximum vector
  register size;
- Emit warning for mavx10.x-256, noting AVX10/256 is not supported;
- Emit warning for mavx10.x-512, noting to use m[no-]avx10.x instead;
- Emit warning for m[no-]evex512, noting AVX10/256 is not supported;

This patch only changes Clang driver behavior. The features
avx10.x-256/512 remain unchanged and will be removed in the next
release.
---
 clang/docs/ReleaseNotes.rst                      | 11 +++++---
 clang/include/clang/Driver/Options.td            |  6 ++---
 clang/lib/Driver/ToolChains/Arch/X86.cpp         | 19 ++++++++++---
 clang/test/Driver/x86-target-features.c          | 34 +++++++++++++-----------
 4 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4b8e09d051616..daad01919ecd4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -413,10 +413,13 @@ Hexagon Support
 X86 Support
 ^^^^^^^^^^^

-- Disable ``-m[no-]avx10.1`` and switch ``-m[no-]avx10.2`` to alias of 512 bit
-  options.
-- Change ``-mno-avx10.1-512`` to alias of ``-mno-avx10.1-256`` to disable both
-  256 and 512 bit instructions.
+- The 256-bit maximum vector register size control was removed from
+  `AVX10 whitepaper <https://cdrdv2.intel.com/v1/dl/getContent/784343>`_.
+  * Re-target ``m[no-]avx10.1`` to enable AVX10.1 with 512-bit maximum vector register size.
+  * Emit warning for ``mavx10.x-256``, noting AVX10/256 is not supported.
+  * Emit warning for ``mavx10.x-512``, noting to use ``m[no-]avx10.x`` instead.
+  * Emit warning for ``m[no-]evex512``, noting AVX10/256 is not supported.
+  * The features avx10.x-256/512 keep unchanged and will be removed in the next release.
Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3af072242d039..89cb03cc33b98 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6409,11 +6409,11 @@ def mavx10_1_256 : Flag<["-"], "mavx10.1-256">, Group, Group; def mavx10_1_512 : Flag<["-"], "mavx10.1-512">, Group; def mno_avx10_1_512 : Flag<["-"], "mno-avx10.1-512">, Alias; -def mavx10_1 : Flag<["-"], "mavx10.1">, Flags<[Unsupported]>; -def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Flags<[Unsupported]>; +def mavx10_1 : Flag<["-"], "mavx10.1">, Group; +def mno_avx10_1 : Flag<["-"], "mno-avx10.1">, Group; def mavx10_2_256 : Flag<["-"], "mavx10.2-256">, Group; def mavx10_2_512 : Flag<["-"], "mavx10.2-512">, Group; -def mavx10_2 : Flag<["-"], "mavx10.2">, Alias; +def mavx10_2 : Flag<["-"], "mavx10.2">, Group; def mno_avx10_2 : Flag<["-"], "mno-avx10.2">, Group; def mavx2 : Flag<["-"], "mavx2">, Group; def mno_avx2 : Flag<["-"], "mno-avx2">, Group; diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 47c2c3e23f9fd..429b041c9c513 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -243,10 +243,18 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, assert((Version == "1" || Version == "2") && "Invalid AVX10 feature name."); if (Width == "") { - assert(IsNegative && "Only negative options can omit width."); - Features.push_back(Args.MakeArgString("-" + Name + "-256")); + if (IsNegative) + Features.push_back(Args.MakeArgString("-" + Name + "-256")); + else + Features.push_back(Args.MakeArgString("+" + Name + "-512")); } else { - assert((Width == "256" || Width == "512") && "Invalid vector length."); + if (Width == "512") + D.Diag(diag::warn_drv_deprecated_arg) << Name << 1 << Name.drop_back(4); + else if (Width == "256") + D.Diag(diag::warn_drv_deprecated_custom) + << Name << "because AVX10/256 is not supported and will be removed"; + else + assert((Width == "256" || Width == "512") && "Invalid vector length."); Features.push_back(Args.MakeArgString((IsNegative ? 
"-" : "+") + Name)); } } @@ -275,6 +283,11 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getSpelling() << Triple.getTriple(); + if (A->getOption().matches(options::OPT_mevex512) || + A->getOption().matches(options::OPT_mno_evex512)) + D.Diag(diag::warn_drv_deprecated_custom) + << Name << "because AVX10/256 is not supported and will be removed"; + if (A->getOption().matches(options::OPT_mapx_features_EQ) || A->getOption().matches(options::OPT_mno_apx_features_EQ)) { diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 18361251dcebc..6416a34898e78 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -390,15 +390,12 @@ // AVXVNNIINT16: "-target-feature" "+avxvnniint16" // NO-AVXVNNIINT16: "-target-feature" "-avxvnniint16" -// RUN: %clang --target=i386 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EVEX512 %s -// RUN: %clang --target=i386 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-EVEX512 %s -// EVEX512: "-target-feature" "+evex512" -// NO-EVEX512: "-target-feature" "-evex512" - -// RUN: not %clang --target=i386 -march=i386 -mavx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=UNSUPPORT-AVX10 %s -// RUN: not %clang --target=i386 -march=i386 -mno-avx10.1 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=UNSUPPORT-AVX10 %s -// RUN: %clang --target=i386 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_256 %s -// RUN: %clang --target=i386 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_512 %s +// RUN: %clang --target=i386 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=EVEX512,WARN-EVEX512 %s +// RUN: %clang --target=i386 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=NO-EVEX512,WARN-EVEX512 %s +// RUN: %clang --target=i386 -march=i386 -mavx10.1 %s -### -o %t.o 2>&1 -Werror | FileCheck -check-prefix=AVX10_1_512 %s +// RUN: %clang --target=i386 -march=i386 -mno-avx10.1 %s -### -o %t.o 2>&1 -Werror | FileCheck -check-prefix=NO-AVX10_1 %s +// RUN: %clang --target=i386 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_1_256,WARN-AVX10-256 %s +// RUN: %clang --target=i386 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_1_512,WARN-AVX10-512 %s // RUN: %clang --target=i386 -mavx10.1-256 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_512 %s // RUN: %clang --target=i386 -mavx10.1-512 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_1_256 %s // RUN: not %clang --target=i386 -march=i386 -mavx10.1-128 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s @@ -406,15 +403,20 @@ // RUN: not %clang --target=i386 -march=i386 -mavx10.1024-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BAD-AVX10 %s // RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mavx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s // RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mno-avx512f %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-AVX512 %s -// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s -// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10-EVEX512 %s -// RUN: %clang --target=i386 -mavx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_512 %s -// RUN: %clang --target=i386 
-mno-avx10.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX10_2 %s -// RUN: %clang --target=i386 -mavx10.2-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_256 %s -// RUN: %clang --target=i386 -mavx10.2-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX10_2_512 %s +// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mevex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10-EVEX512,WARN-EVEX512 %s +// RUN: %clang --target=i386 -march=i386 -mavx10.1-256 -mno-evex512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10-EVEX512,WARN-EVEX512 %s +// RUN: %clang --target=i386 -mavx10.2 %s -### -o %t.o 2>&1 -Werror | FileCheck -check-prefix=AVX10_2_512 %s +// RUN: %clang --target=i386 -mno-avx10.2 %s -### -o %t.o 2>&1 -Werror | FileCheck -check-prefix=NO-AVX10_2 %s +// RUN: %clang --target=i386 -mavx10.2-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_256,WARN-AVX10-256 %s +// RUN: %clang --target=i386 -mavx10.2-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_512,WARN-AVX10-512 %s // RUN: %clang --target=i386 -mavx10.2-256 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_256,AVX10_1_512 %s // RUN: %clang --target=i386 -mavx10.2-512 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_512,AVX10_1_256 %s -// UNSUPPORT-AVX10: error: unsupported option '-m{{.*}}avx10.1' for target 'i386' +// WARN-EVEX512: warning: argument '{{.*}}evex512' is deprecated, because AVX10/256 is not supported and will be removed [-Wdeprecated] +// WARN-AVX10-256: warning: argument 'avx10.{{.*}}-256' is deprecated, because AVX10/256 is not supported and will be removed [-Wdeprecated] +// WARN-AVX10-512: warning: argument 'avx10.{{.*}}-512' is deprecated, use 'avx10.{{.*}}' instead [-Wdeprecated] +// EVEX512: "-target-feature" "+evex512" +// NO-EVEX512: "-target-feature" "-evex512" +// NO-AVX10_1: "-target-feature" "-avx10.1-256" // NO-AVX10_2: "-target-feature" "-avx10.2-256" // AVX10_2_256: "-target-feature" "+avx10.2-256" // AVX10_2_512: "-target-feature" "+avx10.2-512" From 8078665bca1e16e33a09aea0310102077d429ada Mon Sep 17 00:00:00 2001 From: Tejas Vipin Date: Mon, 31 Mar 2025 19:36:28 +0530 Subject: [PATCH 0094/1029] [libc][math][c23] Add hypotf16 function (#131991) Implement hypot for Float16 along with tests. 
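The implementation promotes the float16 inputs to single precision, where
the squares are exact, and then corrects the rounding of the last bit. A
rough sketch of the core idea, leaving out the NaN/infinity handling, the
wide-exponent shortcut, and the rounding-correction step that the real
code below adds (a compiler with `_Float16` support is assumed):

```C++
#include <cmath>

// Sketch only, not the libc implementation: a float16 significand has
// 11 bits, so each 22-bit product x*x and y*y fits exactly in float's
// 24-bit significand; only the sum and the sqrt round, which is why the
// real code applies a correction to the last bit afterwards.
_Float16 hypotf16_sketch(_Float16 x, _Float16 y) {
  float xf = static_cast<float>(x);
  float yf = static_cast<float>(y);
  return static_cast<_Float16>(std::sqrt(xf * xf + yf * yf));
}
```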
--- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/headers/math/index.rst | 2 +- libc/include/math.yaml | 8 ++ libc/src/__support/FPUtil/Hypot.h | 8 +- libc/src/__support/FPUtil/cast.h | 3 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 16 ++++ libc/src/math/generic/hypotf16.cpp | 89 +++++++++++++++++++ libc/src/math/hypotf16.h | 21 +++++ libc/test/src/math/CMakeLists.txt | 11 +++ libc/test/src/math/HypotTest.h | 2 +- libc/test/src/math/exhaustive/CMakeLists.txt | 18 ++++ .../src/math/exhaustive/hypotf16_test.cpp | 67 ++++++++++++++ libc/test/src/math/hypotf16_test.cpp | 21 +++++ .../math/performance_testing/CMakeLists.txt | 12 +++ .../performance_testing/hypotf16_perf.cpp | 16 ++++ libc/test/src/math/smoke/CMakeLists.txt | 13 +++ libc/test/src/math/smoke/hypotf16_test.cpp | 17 ++++ 18 files changed, 321 insertions(+), 5 deletions(-) create mode 100644 libc/src/math/generic/hypotf16.cpp create mode 100644 libc/src/math/hypotf16.h create mode 100644 libc/test/src/math/exhaustive/hypotf16_test.cpp create mode 100644 libc/test/src/math/hypotf16_test.cpp create mode 100644 libc/test/src/math/performance_testing/hypotf16_perf.cpp create mode 100644 libc/test/src/math/smoke/hypotf16_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 648e3d5ac5281..09c8c18c04a59 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -706,6 +706,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.fromfpf16 libc.src.math.fromfpxf16 libc.src.math.getpayloadf16 + libc.src.math.hypotf16 libc.src.math.ilogbf16 libc.src.math.iscanonicalf16 libc.src.math.issignalingf16 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index 23d010e2ab5d7..df2650065f882 100644 --- a/libc/docs/headers/math/index.rst +++ b/libc/docs/headers/math/index.rst @@ -305,7 +305,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | fsqrt | N/A | |check| | |check| | N/A | |check|\* | 7.12.14.6 | F.10.11 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| hypot | |check| | |check| | | | | 7.12.7.4 | F.10.4.4 | +| hypot | |check| | |check| | | |check| | | 7.12.7.4 | F.10.4.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | lgamma | | | | | | 7.12.8.3 | F.10.5.3 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/math.yaml b/libc/include/math.yaml index 3a06bcfc4f43e..133f9a6c034ec 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -1395,6 +1395,14 @@ functions: arguments: - type: float - type: float + - name: hypotf16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: ilogb standards: - stdc diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h index 6aa808446d6d9..94da259cd42f0 100644 --- a/libc/src/__support/FPUtil/Hypot.h +++ b/libc/src/__support/FPUtil/Hypot.h @@ -30,7 +30,7 @@ LIBC_INLINE 
T find_leading_one(T mant, int &shift_length) { if (mant > 0) { shift_length = (sizeof(mant) * 8) - 1 - cpp::countl_zero(mant); } - return T(1) << shift_length; + return static_cast((T(1) << shift_length)); } } // namespace internal @@ -207,8 +207,10 @@ LIBC_INLINE T hypot(T x, T y) { for (StorageType current_bit = leading_one >> 1; current_bit; current_bit >>= 1) { - r = (r << 1) + ((tail_bits & current_bit) ? 1 : 0); - StorageType tmp = (y_new << 1) + current_bit; // 2*y_new(n - 1) + 2^(-n) + r = static_cast((r << 1)) + + ((tail_bits & current_bit) ? 1 : 0); + StorageType tmp = static_cast((y_new << 1)) + + current_bit; // 2*y_new(n - 1) + 2^(-n) if (r >= tmp) { r -= tmp; y_new += current_bit; diff --git a/libc/src/__support/FPUtil/cast.h b/libc/src/__support/FPUtil/cast.h index 126f3852137b7..7578bb42b18f1 100644 --- a/libc/src/__support/FPUtil/cast.h +++ b/libc/src/__support/FPUtil/cast.h @@ -18,6 +18,9 @@ namespace LIBC_NAMESPACE::fputil { +// TODO: Add optimization for known good targets with fast +// float to float16 conversion: +// https://github.com/llvm/llvm-project/issues/133517 template LIBC_INLINE constexpr cpp::enable_if_t && cpp::is_floating_point_v, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 9fec978ece2bd..88fb73f856e82 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -313,6 +313,7 @@ add_math_entrypoint_object(getpayloadf128) add_math_entrypoint_object(hypot) add_math_entrypoint_object(hypotf) +add_math_entrypoint_object(hypotf16) add_math_entrypoint_object(ilogb) add_math_entrypoint_object(ilogbf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index d3454803df377..de74729465ee7 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -3105,6 +3105,22 @@ add_entrypoint_object( libc.src.__support.macros.optimization ) +add_entrypoint_object( + hypotf16 + SRCS + hypotf16.cpp + HDRS + ../hypotf16.h + DEPENDS + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types +) + add_entrypoint_object( fdim SRCS diff --git a/libc/src/math/generic/hypotf16.cpp b/libc/src/math/generic/hypotf16.cpp new file mode 100644 index 0000000000000..8f80986204b27 --- /dev/null +++ b/libc/src/math/generic/hypotf16.cpp @@ -0,0 +1,89 @@ +//===-- Implementation of hypotf16 function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/hypotf16.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+// For targets where conversion from float to float16 has to be
+// emulated, fputil::hypot<float16> is faster
+LLVM_LIBC_FUNCTION(float16, hypotf16, (float16 x, float16 y)) {
+  using FloatBits = fputil::FPBits<float>;
+  using FPBits = fputil::FPBits<float16>;
+
+  FPBits x_abs = FPBits(x).abs();
+  FPBits y_abs = FPBits(y).abs();
+
+  bool x_abs_larger = x_abs.uintval() >= y_abs.uintval();
+
+  FPBits a_bits = x_abs_larger ? x_abs : y_abs;
+  FPBits b_bits = x_abs_larger ? y_abs : x_abs;
+
+  uint16_t a_u = a_bits.uintval();
+  uint16_t b_u = b_bits.uintval();
+
+  // Note: replacing `a_u >= FPBits::EXP_MASK` with `a_bits.is_inf_or_nan()`
+  // generates extra exponent bit masking instructions on x86-64.
+  if (LIBC_UNLIKELY(a_u >= FPBits::EXP_MASK)) {
+    // x or y is inf or nan
+    if (a_bits.is_signaling_nan() || b_bits.is_signaling_nan()) {
+      fputil::raise_except_if_required(FE_INVALID);
+      return FPBits::quiet_nan().get_val();
+    }
+    if (a_bits.is_inf() || b_bits.is_inf())
+      return FPBits::inf().get_val();
+    return a_bits.get_val();
+  }
+
+  if (LIBC_UNLIKELY(a_u - b_u >=
+                    static_cast<uint16_t>((FPBits::FRACTION_LEN + 2)
+                                          << FPBits::FRACTION_LEN)))
+    return x_abs.get_val() + y_abs.get_val();
+
+  float af = fputil::cast<float>(a_bits.get_val());
+  float bf = fputil::cast<float>(b_bits.get_val());
+
+  // These squares are exact.
+  float a_sq = af * af;
+  float sum_sq = fputil::multiply_add(bf, bf, a_sq);
+
+  FloatBits result(fputil::sqrt<float>(sum_sq));
+  uint32_t r_u = result.uintval();
+
+  // If any of the sticky bits of the result are non-zero, except the LSB, then
+  // the rounded result is correct.
+  if (LIBC_UNLIKELY(((r_u + 1) & 0x0000'0FFE) == 0)) {
+    float r_d = result.get_val();
+
+    // Perform rounding correction.
+    float sum_sq_lo = fputil::multiply_add(bf, bf, a_sq - sum_sq);
+    float err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq);
+
+    if (err > 0) {
+      r_u |= 1;
+    } else if ((err < 0) && (r_u & 1) == 0) {
+      r_u -= 1;
+    } else if ((r_u & 0x0000'1FFF) == 0) {
+      // The rounded result is exact.
+      fputil::clear_except_if_required(FE_INEXACT);
+    }
+    return fputil::cast<float16>(FloatBits(r_u).get_val());
+  }
+
+  return fputil::cast<float16>(result.get_val());
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/hypotf16.h b/libc/src/math/hypotf16.h
new file mode 100644
index 0000000000000..2d37c61b4ee7b
--- /dev/null
+++ b/libc/src/math/hypotf16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for hypotf16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_HYPOTF16_H +#define LLVM_LIBC_SRC_MATH_HYPOTF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 hypotf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_HYPOTF16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 9a73f9fe07597..514c01834c1a4 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1701,6 +1701,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + hypotf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + hypotf16_test.cpp + DEPENDS + libc.src.math.hypotf16 +) + add_fp_unittest( nextafter_test SUITE diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h index fd0c1b394b8f7..dc73581e67ff0 100644 --- a/libc/test/src/math/HypotTest.h +++ b/libc/test/src/math/HypotTest.h @@ -73,7 +73,7 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { constexpr StorageType COUNT = 10'001; for (unsigned scale = 0; scale < 4; ++scale) { StorageType max_value = MAX_SUBNORMAL << scale; - StorageType step = (max_value - MIN_SUBNORMAL) / COUNT; + StorageType step = (max_value - MIN_SUBNORMAL) / COUNT + 1; for (int signs = 0; signs < 4; ++signs) { for (StorageType v = MIN_SUBNORMAL, w = max_value; v <= max_value && w >= MIN_SUBNORMAL; v += step, w -= step) { diff --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt index b1927dbc19a3b..551f449c9c8db 100644 --- a/libc/test/src/math/exhaustive/CMakeLists.txt +++ b/libc/test/src/math/exhaustive/CMakeLists.txt @@ -314,6 +314,24 @@ add_fp_unittest( -lpthread ) +add_fp_unittest( + hypotf16_test + NO_RUN_POSTBUILD + NEED_MPFR + SUITE + libc_math_exhaustive_tests + SRCS + hypotf16_test.cpp + COMPILE_OPTIONS + ${libc_opt_high_flag} + DEPENDS + .exhaustive_test + libc.src.math.hypotf16 + libc.src.__support.FPUtil.fp_bits + LINK_LIBRARIES + -lpthread +) + add_fp_unittest( fmod_generic_impl_test NO_RUN_POSTBUILD diff --git a/libc/test/src/math/exhaustive/hypotf16_test.cpp b/libc/test/src/math/exhaustive/hypotf16_test.cpp new file mode 100644 index 0000000000000..f79041e6dbd77 --- /dev/null +++ b/libc/test/src/math/exhaustive/hypotf16_test.cpp @@ -0,0 +1,67 @@ +//===-- Exhaustive test for hypotf16 --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "exhaustive_test.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/Hypot.h" +#include "src/math/hypotf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +struct Hypotf16Checker : public virtual LIBC_NAMESPACE::testing::Test { + using FloatType = float16; + using FPBits = LIBC_NAMESPACE::fputil::FPBits; + using StorageType = typename FPBits::StorageType; + + uint64_t check(uint16_t x_start, uint16_t x_stop, uint16_t y_start, + uint16_t y_stop, mpfr::RoundingMode rounding) { + mpfr::ForceRoundingMode r(rounding); + if (!r.success) + return true; + uint16_t xbits = x_start; + uint64_t failed = 0; + do { + float16 x = FPBits(xbits).get_val(); + uint16_t ybits = xbits; + do { + float16 y = FPBits(ybits).get_val(); + bool correct = TEST_FP_EQ(LIBC_NAMESPACE::fputil::hypot(x, y), + LIBC_NAMESPACE::hypotf16(x, y)); + // Using MPFR will be much slower. + // mpfr::BinaryInput input{x, y}; + // bool correct = TEST_MPFR_MATCH_ROUNDING_SILENTLY( + // mpfr::Operation::Hypot, input, LIBC_NAMESPACE::hypotf16(x, y), + // 0.5, + // rounding); + failed += (!correct); + } while (ybits++ < y_stop); + } while (xbits++ < x_stop); + return failed; + } +}; + +using LlvmLibcHypotf16ExhaustiveTest = + LlvmLibcExhaustiveMathTest; + +// Range of both inputs: [0, inf] +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7C00U; + +TEST_F(LlvmLibcHypotf16ExhaustiveTest, PositiveRange) { + test_full_range_all_roundings(POS_START, POS_STOP, POS_START, POS_STOP); +} + +// Range of both inputs: [-0, -inf] +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xFC00U; + +TEST_F(LlvmLibcHypotf16ExhaustiveTest, NegativeRange) { + test_full_range_all_roundings(NEG_START, NEG_STOP, NEG_START, NEG_STOP); +} diff --git a/libc/test/src/math/hypotf16_test.cpp b/libc/test/src/math/hypotf16_test.cpp new file mode 100644 index 0000000000000..37d57471a3c74 --- /dev/null +++ b/libc/test/src/math/hypotf16_test.cpp @@ -0,0 +1,21 @@ +//===-- Unittests for hypotf16 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "HypotTest.h" + +#include "src/math/hypotf16.h" + +using LlvmLibcHypotf16Test = HypotTestTemplate; + +TEST_F(LlvmLibcHypotf16Test, SubnormalRange) { + test_subnormal_range(&LIBC_NAMESPACE::hypotf16); +} + +TEST_F(LlvmLibcHypotf16Test, NormalRange) { + test_normal_range(&LIBC_NAMESPACE::hypotf16); +} diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt index 838ed9e957ca7..d8f87e04c15d8 100644 --- a/libc/test/src/math/performance_testing/CMakeLists.txt +++ b/libc/test/src/math/performance_testing/CMakeLists.txt @@ -340,6 +340,18 @@ add_perf_binary( -fno-builtin ) +add_perf_binary( + hypotf16_perf + SRCS + hypotf16_perf.cpp + DEPENDS + .binary_op_single_output_diff + libc.src.math.hypotf16 + libc.src.__support.FPUtil.fp_bits + COMPILE_OPTIONS + -fno-builtin +) + add_perf_binary( hypotf_perf SRCS diff --git a/libc/test/src/math/performance_testing/hypotf16_perf.cpp b/libc/test/src/math/performance_testing/hypotf16_perf.cpp new file mode 100644 index 0000000000000..b53a9042171a6 --- /dev/null +++ b/libc/test/src/math/performance_testing/hypotf16_perf.cpp @@ -0,0 +1,16 @@ +//===-- Differential test for hypotf16 ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BinaryOpSingleOutputPerf.h" + +#include "src/__support/FPUtil/Hypot.h" +#include "src/math/hypotf16.h" + +BINARY_OP_SINGLE_OUTPUT_PERF(float16, float16, LIBC_NAMESPACE::hypotf16, + LIBC_NAMESPACE::fputil::hypot, + "hypotf16_perf.log") diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index b7601735c1109..a8c602b388504 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3121,6 +3121,19 @@ add_fp_unittest( libc.src.__support.macros.properties.architectures ) +add_fp_unittest( + hypotf16_test + SUITE + libc-math-smoke-tests + SRCS + hypotf16_test.cpp + HDRS + HypotTest.h + DEPENDS + libc.src.math.hypotf16 + libc.src.__support.FPUtil.fp_bits +) + add_fp_unittest( hypot_test SUITE diff --git a/libc/test/src/math/smoke/hypotf16_test.cpp b/libc/test/src/math/smoke/hypotf16_test.cpp new file mode 100644 index 0000000000000..b48b0930431de --- /dev/null +++ b/libc/test/src/math/smoke/hypotf16_test.cpp @@ -0,0 +1,17 @@ +//===-- Unittests for hypotf16 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "HypotTest.h"
+
+#include "src/math/hypotf16.h"
+
+using LlvmLibcHypotf16Test = HypotTestTemplate<float16>;
+
+TEST_F(LlvmLibcHypotf16Test, SpecialNumbers) {
+  test_special_numbers(&LIBC_NAMESPACE::hypotf16);
+}

From a61cc1b99a50d832c650132cc9956320bfe594f5 Mon Sep 17 00:00:00 2001
From: Brox Chen
Date: Mon, 31 Mar 2025 10:18:25 -0400
Subject: [PATCH 0095/1029] [AMDGPU][True16][CodeGen] Skip combineDpp with t16 instructions (#128918)

We only emit v_mov_b32/64_dpp. Don't combine t16 instructions with mov
dpp. Update the test inputs to be legal.

It is future work to emit v_mov_b16_dpp, and then update GCNDPPCombine
to combine it with the 16-bit instructions.
---
 llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp     |   5 +
 .../CodeGen/AMDGPU/dpp_combine-true16.mir    |  27 ++++
 llvm/test/CodeGen/AMDGPU/dpp_combine.ll      |  34 ++++-
 llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir | 124 ++++++++++++++++++
 4 files changed, 186 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index b22babb4a00d8..5439ea2f59111 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -215,6 +215,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
   bool HasVOP3DPP = ST->hasVOP3DPP();
   auto OrigOp = OrigMI.getOpcode();
+  if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) {
+    LLVM_DEBUG(
+        dbgs() << "  failed: Did not expect any 16-bit uses of dpp values\n");
+    return nullptr;
+  }
   auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
   if (DPPOp == -1) {
     LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir
new file mode 100644
index 0000000000000..792acda60620e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine-true16.mir
@@ -0,0 +1,27 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
+# XUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
+
+# FIXME-TRUE16 add gfx1200 runline when we have those true16 instructions supported
+
+---
+
+# V_MOV_B16_t16_e64_dpp is unsupported to combine
+# GCN-LABEL: name: vop3_u16
+# GCN: %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
+name: vop3_u16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    %0:vgpr_16 = COPY $vgpr0
+    %1:vgpr_16 = COPY $vgpr1
+    %2:vgpr_16 = COPY $vgpr2
+    %3:vgpr_16 = IMPLICIT_DEF
+    %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
+    %5:vgpr_16 = V_ADD_NC_U16_t16_e64 0, %4, 0, %3, 0, 0, implicit $exec
+    %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
+    %7:vgpr_16 = V_ADD_NC_U16_t16_e64 4, %6, 8, %5, 0, 0, implicit $exec
+...
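+#
+# With +real-true16 enabled, createDPPInst() gives up on any true16 opcode
+# before getDPPOp() is consulted, so the two V_MOV_B16_t16_e64_dpp defs above
+# are expected to survive the pass unchanged.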
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll index 5162092f78aca..926c2a3f12aab 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll @@ -1,7 +1,9 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN -; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16 ; GCN-LABEL: {{^}}dpp_add: ; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]], @@ -63,6 +65,30 @@ define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) { ret void } +; It is not expected to see a sequence of v_mov_b32_dpp feeding into a 16 bit instruction +; GCN-LABEL: {{^}}dpp_fadd_f16: +; GFX9GFX10: global_load_{{dword|b32}} [[V:v[0-9]+]], +; GFX9GFX10: v_add_f16_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}} +; GFX11-TRUE16: v_mov_b32_dpp {{v[0-9]+}}, {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX11-TRUE16: v_add_f16_e32 +; GFX11-FAKE16: global_load_{{dword|b32}} [[V:v[0-9]+]], +; GFX11-FAKE16: v_add_f16_e64_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +define amdgpu_kernel void @dpp_fadd_f16(ptr addrspace(1) %arg) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id + %load = load i32, ptr addrspace(1) %gep + %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0 + %tmp01 = trunc i32 %tmp0 to i16 + %tmp1 = bitcast i16 %tmp01 to half + %tt = trunc i32 %load to i16 + %t = bitcast i16 %tt to half + %add = fadd half %tmp1, %t + %tmp2 = bitcast half %add to i16 + %tmp3 = zext i16 %tmp2 to i32 + store i32 %tmp3, ptr addrspace(1) %gep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0 declare float @llvm.ceil.f32(float) diff --git a/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir b/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir new file mode 100644 index 0000000000000..8f63f6c8cb1c6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vopc_dpp-true16.mir @@ -0,0 +1,124 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN + +--- + +name: vopc +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; 
GCN-LABEL: name: vopc + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec + ; GCN-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec + ; GCN-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc + ; GCN-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + + %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec + + %10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMPX_GT_U32_nosdst_e64 %10, %0, implicit-def $exec, implicit $mode, implicit $exec + + %11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec + + %13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %14:sgpr_32 = V_CMP_NGE_F32_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec + + %17:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + %18:sgpr_32 = V_CMP_NGE_F32_e64 0, %17, 0, %0, 0, implicit $mode, implicit $exec + %19:sgpr_32 = S_AND_B32 %18, 10101, implicit-def $scc + + %20:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec + V_CMP_LT_I32_e32 %0, %20, implicit-def $vcc, implicit $exec + +... 
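+
+# In the 32-bit cases above the combine still fires: each V_MOV_B32_dpp is
+# folded into its VOPC user (including the operand-swapped
+# V_CMP_LT_I32 -> V_CMP_GT_I32 case), except for V_CMPX_GT_U32_nosdst_e64,
+# where the mov is kept.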
+--- + +# V_MOV_B16_t16_e64_dpp is unsupported to combine +name: vopc_16 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16 + + ; GCN-LABEL: name: vopc_16 + ; GCN: liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY $vgpr0_lo16 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY $vgpr1_hi16 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY $vgpr255_hi16 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: V_CMPX_EQ_I16_t16_nosdst_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]], 0, implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp1]], 0, [[COPY]], 0, implicit-def $vcc_lo, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp2:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_t16_e64 1, [[V_MOV_B16_t16_e64_dpp2]], 0, [[COPY]], 1, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp3:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, [[V_CMP_NGE_F16_t16_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + %0:vgpr_16 = COPY $vgpr0_lo16 + %1:vgpr_16 = COPY $vgpr1_hi16 + %2:vgpr_16 = COPY $vgpr255_hi16 + %3:vgpr_16 = IMPLICIT_DEF + + %5:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + V_CMPX_EQ_I16_t16_nosdst_e64 0, %5, 0, %0, 0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec + + %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + %7:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %6, 0, %0, 0, implicit-def $vcc, implicit $mode, implicit $exec + + %8:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + %9:sgpr_32 = V_CMP_GE_F16_t16_e64 1, %8, 0, %0, 1, 0, implicit $mode, implicit $exec + + %15:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec + %16:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, %16, 0, %0, 0, 0, implicit $mode, implicit $exec + +... 
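+
+# None of the V_MOV_B16_t16_e64_dpp defs above are folded into their 16-bit
+# VOPC users: with +real-true16 the DPP combine rejects true16 opcodes
+# outright.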
+--- + +name: mask_not_full +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GCN-LABEL: name: mask_not_full + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]].lo16, 0, [[COPY1]].hi16, 0, 1, 15, 14, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]].lo16, 0, implicit-def $vcc_lo, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[COPY1]], 1, 13, 15, 1, implicit $exec + ; GCN-NEXT: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F32_e64 1, [[V_MOV_B32_dpp]], 0, [[COPY]], 1, implicit $mode, implicit $exec + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %2.lo16, 0, %1.hi16, 0, 1, 15, 14, 1, implicit $exec + %99:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %4, 0, %0.lo16, 0, implicit-def $vcc, implicit $mode, implicit $exec + + %5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 13, 15, 1, implicit $exec + %6:sgpr_32 = V_CMP_GE_F32_e64 1, %5, 0, %0, 1, implicit $mode, implicit $exec + +... From 4007de00a0574141695ace7a8d34aaf740a2c2e4 Mon Sep 17 00:00:00 2001 From: Devon Loehr Date: Mon, 31 Mar 2025 10:28:53 -0400 Subject: [PATCH 0096/1029] Enable unnecessary-virtual-specifier by default (#133265) This turns on the unnecessary-virtual-specifier warning in general, but disables it when building LLVM. It also tweaks the warning description to be slightly more accurate. Background: I've been working on cleaning up this warning in two codebases: LLVM and chromium (plus its dependencies). The chromium cleanup has been straightforward. Git archaeology shows that there are two reasons for the warnings: classes to which `final` was added after they were initially committed, and classes with virtual destructors that nobody remarks on. Presumably the latter case is because people are just very used to destructors being virtual. The LLVM cleanup was more surprising: I discovered that we have an [old policy](https://llvm.org/docs/CodingStandards.html#provide-a-virtual-method-anchor-for-classes-in-headers) about including out-of-line virtual functions in every class with a vtable, even `final` ones. This means our codebase has many virtual "anchor" functions which do nothing except control where the vtable is emitted, and which trigger the warning. I looked into alternatives to satisfy the policy, such as using destructors instead of introducing a new function, but it wasn't clear if they had larger implications. Overall, it seems like the warning is genuinely useful in most codebases (evidenced by chromium and its dependencies), and LLVM is an unusual case. Therefore we should enable the warning by default, and turn it off only for LLVM builds. 
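
To make the new default concrete, here is a minimal illustration (not part of
this patch; the type names are invented) of what the warning does and does not
flag:

  struct Base {
    virtual ~Base() = default;
    virtual void run();
  };

  // A 'final' class introducing a brand-new virtual method: flagged, since
  // nothing can ever override it.
  struct Widget final {
    virtual void draw();
  };

  // Overriding an inherited virtual inside a 'final' class is not flagged.
  struct Derived final : Base {
    void run() override;
  };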
---
 clang/include/clang/Basic/DiagnosticGroups.td          | 7 +++----
 clang/include/clang/Basic/DiagnosticSemaKinds.td       | 2 +-
 .../WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp     | 2 +-
 clang/test/CXX/class/p2-0x.cpp                         | 2 +-
 clang/test/SemaCXX/MicrosoftExtensions.cpp             | 1 +
 clang/test/SemaCXX/warn-final-dtor-non-final-class.cpp | 5 +++--
 llvm/cmake/modules/HandleLLVMOptions.cmake             | 6 ++++++
 7 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index b9f08d96151c9..e6e9ebbc2c304 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -377,13 +377,12 @@ def CXX11WarnSuggestOverride : DiagGroup<"suggest-override">;
 def WarnUnnecessaryVirtualSpecifier : DiagGroup<"unnecessary-virtual-specifier"> {
   code Documentation = [{
 Warns when a ``final`` class contains a virtual method (including virtual
-destructors). Since ``final`` classes cannot be subclassed, their methods
-cannot be overridden, and hence the ``virtual`` specifier is useless.
+destructors) that does not override anything. Since ``final`` classes cannot be
+subclassed, their methods cannot be overridden, so there is no point in
+introducing new ``virtual`` methods.
 
 The warning also detects virtual methods in classes whose destructor is
 ``final``, for the same reason.
-
-The warning does not fire on virtual methods which are also marked ``override``.
   }];
 }
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b03926db8170a..5e45482584946 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -2733,7 +2733,7 @@ def note_final_dtor_non_final_class_silence : Note<
   "mark %0 as '%select{final|sealed}1' to silence this warning">;
 def warn_unnecessary_virtual_specifier : Warning<
   "virtual method %0 is inside a 'final' class and can never be overridden">,
-  InGroup<WarnUnnecessaryVirtualSpecifier>, DefaultIgnore;
+  InGroup<WarnUnnecessaryVirtualSpecifier>;
 
 // C++11 attributes
 def err_repeat_attribute : Error<"%0 attribute cannot be repeated">;
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
index 4209db14eaa52..106091b240af6 100644
--- a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.RefCntblBaseVirtualDtor -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.RefCntblBaseVirtualDtor -verify %s -Wno-unnecessary-virtual-specifier
 
 #include "mock-types.h"
 
diff --git a/clang/test/CXX/class/p2-0x.cpp b/clang/test/CXX/class/p2-0x.cpp
index 5b39e0ada7e2c..2043486457baf 100644
--- a/clang/test/CXX/class/p2-0x.cpp
+++ b/clang/test/CXX/class/p2-0x.cpp
@@ -28,7 +28,7 @@ struct C : A { }; // expected-error {{base 'A' is marked 'final'}}
 
 namespace Test4 {
 
-struct A final { virtual void func() = 0; }; // expected-warning {{abstract class is marked 'final'}} expected-note {{unimplemented pure virtual method 'func' in 'A'}}
+struct A final { virtual void func() = 0; }; // expected-warning {{abstract class is marked 'final'}} expected-note {{unimplemented pure virtual method 'func' in 'A'}} expected-warning {{virtual method 'func' is inside a 'final' class}}
 
 struct B { virtual void func() = 0; }; // expected-note {{unimplemented pure virtual
method 'func' in 'C'}} struct C final : B { }; // expected-warning {{abstract class is marked 'final'}} diff --git a/clang/test/SemaCXX/MicrosoftExtensions.cpp b/clang/test/SemaCXX/MicrosoftExtensions.cpp index 7454a01158f6b..9f6939c1681c9 100644 --- a/clang/test/SemaCXX/MicrosoftExtensions.cpp +++ b/clang/test/SemaCXX/MicrosoftExtensions.cpp @@ -470,6 +470,7 @@ struct InheritFromSealed : SealedType {}; class SealedDestructor { // expected-note {{mark 'SealedDestructor' as 'sealed' to silence this warning}} // expected-warning@+1 {{'sealed' keyword is a Microsoft extension}} virtual ~SealedDestructor() sealed; // expected-warning {{class with destructor marked 'sealed' cannot be inherited from}} + // expected-warning@-1 {{virtual method '~SealedDestructor' is inside a 'final' class}} }; // expected-warning@+1 {{'abstract' keyword is a Microsoft extension}} diff --git a/clang/test/SemaCXX/warn-final-dtor-non-final-class.cpp b/clang/test/SemaCXX/warn-final-dtor-non-final-class.cpp index a96aa4436e818..c9c8c11e1d7ff 100644 --- a/clang/test/SemaCXX/warn-final-dtor-non-final-class.cpp +++ b/clang/test/SemaCXX/warn-final-dtor-non-final-class.cpp @@ -1,5 +1,6 @@ -// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s -Wfinal-dtor-non-final-class -// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s -Wfinal-dtor-non-final-class -fdiagnostics-parseable-fixits 2>&1 | FileCheck %s +// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s -Wfinal-dtor-non-final-class -Wno-unnecessary-virtual-specifier +// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s -Wfinal-dtor-non-final-class -Wno-unnecessary-virtual-specifier \ +// RUN: -fdiagnostics-parseable-fixits 2>&1 | FileCheck %s class A { ~A(); diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 185c9b63aada3..f50f60ec0023f 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -690,6 +690,12 @@ endif( LLVM_COMPILER_IS_GCC_COMPATIBLE OR CMAKE_CXX_COMPILER_ID MATCHES "XL" ) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") append("-Werror=unguarded-availability-new" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 21.0) + # LLVM has a policy of including virtual "anchor" functions to control + # where the vtable is emitted. In `final` classes, these are exactly what + # this warning detects: unnecessary virtual methods. + append("-Wno-unnecessary-virtual-specifier" CMAKE_CXX_FLAGS) + endif() endif() if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND LLVM_ENABLE_LTO) From 68947342b75cc71f3ac9041d11db086d8d074336 Mon Sep 17 00:00:00 2001 From: Tom Tromey Date: Mon, 31 Mar 2025 08:42:21 -0600 Subject: [PATCH 0097/1029] Add support for fixed-point types (#129596) This adds DWARF generation for fixed-point types. This feature is needed by Ada. Note that a pre-existing GNU extension is used in one case. This has been emitted by GCC for years, and is needed because standard DWARF is otherwise incapable of representing these types. 
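
As an illustrative sketch (not from this patch; the type name and bit widths
are invented), a frontend could describe an Ada 'delta 0.0625' type with the
new DIBuilder hooks, and DWARF then carries the scale in DW_AT_binary_scale:

  // A 16-bit signed fixed-point value, interpreted as StoredInt * 2^-4.
  llvm::DIBuilder DIB(M);
  llvm::DIFixedPointType *FPTy = DIB.createBinaryFixedPointType(
      "fixed_delta", /*SizeInBits=*/16, /*AlignInBits=*/16,
      llvm::dwarf::DW_ATE_signed_fixed, llvm::DINode::FlagZero,
      /*Factor=*/-4);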
--- llvm/docs/LangRef.rst | 29 +++++ llvm/include/llvm-c/DebugInfo.h | 1 + llvm/include/llvm/AsmParser/LLToken.h | 1 + llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/DIBuilder.h | 36 ++++++ llvm/include/llvm/IR/DebugInfoMetadata.h | 138 ++++++++++++++++++++- llvm/include/llvm/IR/Metadata.def | 1 + llvm/lib/AsmParser/LLLexer.cpp | 5 + llvm/lib/AsmParser/LLParser.cpp | 51 ++++++++ llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 33 +++++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 32 +++++ llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 28 ++++- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h | 1 + llvm/lib/IR/AsmWriter.cpp | 29 +++++ llvm/lib/IR/DIBuilder.cpp | 31 +++++ llvm/lib/IR/DebugInfoMetadata.cpp | 43 +++++++ llvm/lib/IR/LLVMContextImpl.h | 37 ++++++ llvm/lib/IR/Verifier.cpp | 19 +++ llvm/test/Bitcode/fixedpoint_type.ll | 29 +++++ llvm/unittests/IR/DebugInfoTest.cpp | 34 +++++ 20 files changed, 577 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Bitcode/fixedpoint_type.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 806874fa76b2e..34a6bb8f13d6b 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -6212,6 +6212,35 @@ following: DW_ATE_unsigned = 7 DW_ATE_unsigned_char = 8 +.. _DIFixedPointType: + +DIFixedPointType +"""""""""""""""" + +``DIFixedPointType`` nodes represent fixed-point types. A fixed-point +type is conceptually an integer with a scale factor. +``DIFixedPointType`` is derived from ``DIBasicType`` and inherits its +attributes. However, only certain encodings are accepted: + +.. code-block:: text + + DW_ATE_signed_fixed = 13 + DW_ATE_unsigned_fixed = 14 + +There are three kinds of fixed-point type: binary, where the scale +factor is a power of 2; decimal, where the scale factor is a power of +10; and rational, where the scale factor is an arbitrary rational +number. + +.. code-block:: text + + !0 = !DIFixedPointType(name: "decimal", size: 8, encoding: DW_ATE_signed_fixed, + kind: Decimal, factor: -4) + !1 = !DIFixedPointType(name: "binary", size: 8, encoding: DW_ATE_unsigned_fixed, + kind: Binary, factor: -16) + !2 = !DIFixedPointType(name: "rational", size: 8, encoding: DW_ATE_signed_fixed, + kind: Rational, numerator: 1234, denominator: 5678) + .. _DISubroutineType: DISubroutineType diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h index 30bbaa4d34161..9fbe31d2629bd 100644 --- a/llvm/include/llvm-c/DebugInfo.h +++ b/llvm/include/llvm-c/DebugInfo.h @@ -173,6 +173,7 @@ enum { LLVMDISubrangeMetadataKind, LLVMDIEnumeratorMetadataKind, LLVMDIBasicTypeMetadataKind, + LLVMDIFixedPointTypeMetadataKind, LLVMDIDerivedTypeMetadataKind, LLVMDICompositeTypeMetadataKind, LLVMDISubroutineTypeMetadataKind, diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 81b9929b1fab8..a8f9c71781701 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -494,6 +494,7 @@ enum Kind { DwarfCC, // DW_CC_foo EmissionKind, // lineTablesOnly NameTableKind, // GNU + FixedPointKind, // Fixed point DwarfOp, // DW_OP_foo DIFlag, // DIFlagFoo DISPFlag, // DISPFlagFoo diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index ec2535ac85966..92b6e68d9d0a7 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -386,6 +386,7 @@ enum MetadataCodes { METADATA_ARG_LIST = 46, // [n x [type num, value num]] METADATA_ASSIGN_ID = 47, // [distinct, ...] 
METADATA_SUBRANGE_TYPE = 48, // [distinct, ...] + METADATA_FIXED_POINT_TYPE = 49, // [distinct, ...] }; // The constants block (CONSTANTS_BLOCK_ID) describes emission for each diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index b63e564dfd36b..8e62b810ff147 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -215,6 +215,42 @@ namespace llvm { DINode::DIFlags Flags = DINode::FlagZero, uint32_t NumExtraInhabitants = 0); + /// Create debugging information entry for a binary fixed-point type. + /// \param Name Type name. + /// \param Encoding DWARF encoding code, either + /// dwarf::DW_ATE_signed_fixed or DW_ATE_unsigned_fixed. + /// \param Flags Optional DWARF attributes, e.g., DW_AT_endianity. + /// \param Factor Binary scale factor. + DIFixedPointType * + createBinaryFixedPointType(StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + DINode::DIFlags Flags, int Factor); + + /// Create debugging information entry for a decimal fixed-point type. + /// \param Name Type name. + /// \param Encoding DWARF encoding code, either + /// dwarf::DW_ATE_signed_fixed or DW_ATE_unsigned_fixed. + /// \param Flags Optional DWARF attributes, e.g., DW_AT_endianity. + /// \param Factor Decimal scale factor. + DIFixedPointType * + createDecimalFixedPointType(StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + DINode::DIFlags Flags, int Factor); + + /// Create debugging information entry for an arbitrary rational + /// fixed-point type. + /// \param Name Type name. + /// \param Encoding DWARF encoding code, either + /// dwarf::DW_ATE_signed_fixed or DW_ATE_unsigned_fixed. + /// \param Flags Optional DWARF attributes, e.g., DW_AT_endianity. + /// \param Numerator Numerator of scale factor. + /// \param Denominator Denominator of scale factor. + DIFixedPointType * + createRationalFixedPointType(StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + DINode::DIFlags Flags, APInt Numerator, + APInt Denominator); + /// Create debugging information entry for a string /// type. /// \param Name Type name. 
diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 62a59ddaee599..174ff09f56bdf 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -199,6 +199,7 @@ class DINode : public MDNode { case DISubrangeKind: case DIEnumeratorKind: case DIBasicTypeKind: + case DIFixedPointTypeKind: case DIStringTypeKind: case DISubrangeTypeKind: case DIDerivedTypeKind: @@ -547,6 +548,7 @@ class DIScope : public DINode { default: return false; case DIBasicTypeKind: + case DIFixedPointTypeKind: case DIStringTypeKind: case DISubrangeTypeKind: case DIDerivedTypeKind: @@ -806,6 +808,7 @@ class DIType : public DIScope { default: return false; case DIBasicTypeKind: + case DIFixedPointTypeKind: case DIStringTypeKind: case DISubrangeTypeKind: case DIDerivedTypeKind: @@ -826,6 +829,7 @@ class DIBasicType : public DIType { unsigned Encoding; +protected: DIBasicType(LLVMContext &C, StorageType Storage, unsigned Tag, uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, uint32_t NumExtraInhabitants, DIFlags Flags, @@ -833,6 +837,13 @@ class DIBasicType : public DIType { : DIType(C, DIBasicTypeKind, Storage, Tag, 0, SizeInBits, AlignInBits, 0, NumExtraInhabitants, Flags, Ops), Encoding(Encoding) {} + DIBasicType(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Tag, + uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + uint32_t NumExtraInhabitants, DIFlags Flags, + ArrayRef Ops) + : DIType(C, ID, Storage, Tag, 0, SizeInBits, AlignInBits, 0, + NumExtraInhabitants, Flags, Ops), + Encoding(Encoding) {} ~DIBasicType() = default; static DIBasicType *getImpl(LLVMContext &Context, unsigned Tag, @@ -897,7 +908,132 @@ class DIBasicType : public DIType { std::optional getSignedness() const; static bool classof(const Metadata *MD) { - return MD->getMetadataID() == DIBasicTypeKind; + return MD->getMetadataID() == DIBasicTypeKind || + MD->getMetadataID() == DIFixedPointTypeKind; + } +}; + +/// Fixed-point type. +class DIFixedPointType : public DIBasicType { + friend class LLVMContextImpl; + friend class MDNode; + + // Actually FixedPointKind. + unsigned Kind; + // Used for binary and decimal. + int Factor; + // Used for rational. 
+ APInt Numerator; + APInt Denominator; + + DIFixedPointType(LLVMContext &C, StorageType Storage, unsigned Tag, + uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + DIFlags Flags, unsigned Kind, int Factor, + ArrayRef Ops) + : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, SizeInBits, + AlignInBits, Encoding, 0, Flags, Ops), + Kind(Kind), Factor(Factor) { + assert(Kind == FixedPointBinary || Kind == FixedPointDecimal); + } + DIFixedPointType(LLVMContext &C, StorageType Storage, unsigned Tag, + uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + DIFlags Flags, unsigned Kind, APInt Numerator, + APInt Denominator, ArrayRef Ops) + : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, SizeInBits, + AlignInBits, Encoding, 0, Flags, Ops), + Kind(Kind), Factor(0), Numerator(Numerator), Denominator(Denominator) { + assert(Kind == FixedPointRational); + } + DIFixedPointType(LLVMContext &C, StorageType Storage, unsigned Tag, + uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, + APInt Denominator, ArrayRef Ops) + : DIBasicType(C, DIFixedPointTypeKind, Storage, Tag, SizeInBits, + AlignInBits, Encoding, 0, Flags, Ops), + Kind(Kind), Factor(Factor), Numerator(Numerator), + Denominator(Denominator) {} + ~DIFixedPointType() = default; + + static DIFixedPointType * + getImpl(LLVMContext &Context, unsigned Tag, StringRef Name, + uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, + APInt Denominator, StorageType Storage, bool ShouldCreate = true) { + return getImpl(Context, Tag, getCanonicalMDString(Context, Name), + SizeInBits, AlignInBits, Encoding, Flags, Kind, Factor, + Numerator, Denominator, Storage, ShouldCreate); + } + static DIFixedPointType * + getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, + uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding, + DIFlags Flags, unsigned Kind, int Factor, APInt Numerator, + APInt Denominator, StorageType Storage, bool ShouldCreate = true); + + TempDIFixedPointType cloneImpl() const { + return getTemporary(getContext(), getTag(), getName(), getSizeInBits(), + getAlignInBits(), getEncoding(), getFlags(), Kind, + Factor, Numerator, Denominator); + } + +public: + enum FixedPointKind : unsigned { + /// Scale factor 2^Factor. + FixedPointBinary, + /// Scale factor 10^Factor. + FixedPointDecimal, + /// Arbitrary rational scale factor. 
+ FixedPointRational, + LastFixedPointKind = FixedPointRational, + }; + + static std::optional getFixedPointKind(StringRef Str); + static const char *fixedPointKindString(FixedPointKind); + + DEFINE_MDNODE_GET(DIFixedPointType, + (unsigned Tag, MDString *Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, + unsigned Kind, int Factor, APInt Numerator, + APInt Denominator), + (Tag, Name, SizeInBits, AlignInBits, Encoding, Flags, Kind, + Factor, Numerator, Denominator)) + DEFINE_MDNODE_GET(DIFixedPointType, + (unsigned Tag, StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, DIFlags Flags, + unsigned Kind, int Factor, APInt Numerator, + APInt Denominator), + (Tag, Name, SizeInBits, AlignInBits, Encoding, Flags, Kind, + Factor, Numerator, Denominator)) + + TempDIFixedPointType clone() const { return cloneImpl(); } + + bool isBinary() const { return Kind == FixedPointBinary; } + bool isDecimal() const { return Kind == FixedPointDecimal; } + bool isRational() const { return Kind == FixedPointRational; } + + bool isSigned() const; + + FixedPointKind getKind() const { return static_cast(Kind); } + + int getFactorRaw() const { return Factor; } + int getFactor() const { + assert(Kind == FixedPointBinary || Kind == FixedPointDecimal); + return Factor; + } + + const APInt &getNumeratorRaw() const { return Numerator; } + const APInt &getNumerator() const { + assert(Kind == FixedPointRational); + return Numerator; + } + + const APInt &getDenominatorRaw() const { return Denominator; } + const APInt &getDenominator() const { + assert(Kind == FixedPointRational); + return Denominator; + } + + static bool classof(const Metadata *MD) { + return MD->getMetadataID() == DIFixedPointTypeKind; } }; diff --git a/llvm/include/llvm/IR/Metadata.def b/llvm/include/llvm/IR/Metadata.def index 7cb257fefbc38..511bf48707f00 100644 --- a/llvm/include/llvm/IR/Metadata.def +++ b/llvm/include/llvm/IR/Metadata.def @@ -119,6 +119,7 @@ HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DICommonBlock) HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIStringType) HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIGenericSubrange) HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DISubrangeType) +HANDLE_SPECIALIZED_MDNODE_LEAF_UNIQUABLE(DIFixedPointType) #undef HANDLE_METADATA #undef HANDLE_METADATA_LEAF diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index fd0a50d25e714..4d25b12c9ab06 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -1024,6 +1024,11 @@ lltok::Kind LLLexer::LexIdentifier() { return lltok::NameTableKind; } + if (Keyword == "Binary" || Keyword == "Decimal" || Keyword == "Rational") { + StrVal.assign(Keyword.begin(), Keyword.end()); + return lltok::FixedPointKind; + } + // Check for [us]0x[0-9A-Fa-f]+ which are Hexadecimal constant generated by // the CFE to avoid forcing it to deal with 64-bit numbers. 
if ((TokStart[0] == 'u' || TokStart[0] == 's') && diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 960119bab0933..b7ebffbeb7187 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -4751,6 +4751,11 @@ struct EmissionKindField : public MDUnsignedField { EmissionKindField() : MDUnsignedField(0, DICompileUnit::LastEmissionKind) {} }; +struct FixedPointKindField : public MDUnsignedField { + FixedPointKindField() + : MDUnsignedField(0, DIFixedPointType::LastFixedPointKind) {} +}; + struct NameTableKindField : public MDUnsignedField { NameTableKindField() : MDUnsignedField( @@ -4994,6 +4999,25 @@ bool LLParser::parseMDField(LocTy Loc, StringRef Name, return false; } +template <> +bool LLParser::parseMDField(LocTy Loc, StringRef Name, + FixedPointKindField &Result) { + if (Lex.getKind() == lltok::APSInt) + return parseMDField(Loc, Name, static_cast(Result)); + + if (Lex.getKind() != lltok::FixedPointKind) + return tokError("expected fixed-point kind"); + + auto Kind = DIFixedPointType::getFixedPointKind(Lex.getStrVal()); + if (!Kind) + return tokError("invalid fixed-point kind" + Twine(" '") + Lex.getStrVal() + + "'"); + assert(*Kind <= Result.Max && "Expected valid fixed-point kind"); + Result.assign(*Kind); + Lex.Lex(); + return false; +} + template <> bool LLParser::parseMDField(LocTy Loc, StringRef Name, NameTableKindField &Result) { @@ -5516,6 +5540,33 @@ bool LLParser::parseDIBasicType(MDNode *&Result, bool IsDistinct) { return false; } +/// parseDIFixedPointType: +/// ::= !DIFixedPointType(tag: DW_TAG_base_type, name: "xyz", size: 32, +/// align: 32, encoding: DW_ATE_signed_fixed, +/// flags: 0, kind: Rational, factor: 3, numerator: 1, +/// denominator: 8) +bool LLParser::parseDIFixedPointType(MDNode *&Result, bool IsDistinct) { +#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \ + OPTIONAL(tag, DwarfTagField, (dwarf::DW_TAG_base_type)); \ + OPTIONAL(name, MDStringField, ); \ + OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \ + OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \ + OPTIONAL(encoding, DwarfAttEncodingField, ); \ + OPTIONAL(flags, DIFlagField, ); \ + OPTIONAL(kind, FixedPointKindField, ); \ + OPTIONAL(factor, MDSignedField, ); \ + OPTIONAL(numerator, MDAPSIntField, ); \ + OPTIONAL(denominator, MDAPSIntField, ); + PARSE_MD_FIELDS(); +#undef VISIT_MD_FIELDS + + Result = GET_OR_DISTINCT(DIFixedPointType, + (Context, tag.Val, name.Val, size.Val, align.Val, + encoding.Val, flags.Val, kind.Val, factor.Val, + numerator.Val, denominator.Val)); + return false; +} + /// parseDIStringType: /// ::= !DIStringType(name: "character(4)", size: 32, align: 32) bool LLParser::parseDIStringType(MDNode *&Result, bool IsDistinct) { diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index e87e5bde63d82..4879569200549 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1542,6 +1542,39 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( NextMetadataNo++; break; } + case bitc::METADATA_FIXED_POINT_TYPE: { + if (Record.size() < 11) + return error("Invalid record"); + + IsDistinct = Record[0]; + DINode::DIFlags Flags = static_cast(Record[6]); + + size_t Offset = 9; + + auto ReadWideInt = [&]() { + uint64_t Encoded = Record[Offset++]; + unsigned NumWords = Encoded >> 32; + unsigned BitWidth = Encoded & 0xffffffff; + auto Value = readWideAPInt(ArrayRef(&Record[Offset], NumWords), BitWidth); + Offset += NumWords; + return 
Value; + }; + + APInt Numerator = ReadWideInt(); + APInt Denominator = ReadWideInt(); + + if (Offset != Record.size()) + return error("Invalid record"); + + MetadataList.assignValue( + GET_OR_DISTINCT(DIFixedPointType, + (Context, Record[1], getMDString(Record[2]), Record[3], + Record[4], Record[5], Flags, Record[7], Record[8], + Numerator, Denominator)), + NextMetadataNo); + NextMetadataNo++; + break; + } case bitc::METADATA_STRING_TYPE: { if (Record.size() > 9 || Record.size() < 8) return error("Invalid record"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 49411098d9c0c..4a0db9d76f44a 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -323,6 +323,9 @@ class ModuleBitcodeWriter : public ModuleBitcodeWriterBase { SmallVectorImpl &Record, unsigned Abbrev); void writeDIBasicType(const DIBasicType *N, SmallVectorImpl &Record, unsigned Abbrev); + void writeDIFixedPointType(const DIFixedPointType *N, + SmallVectorImpl &Record, + unsigned Abbrev); void writeDIStringType(const DIStringType *N, SmallVectorImpl &Record, unsigned Abbrev); void writeDIDerivedType(const DIDerivedType *N, @@ -1887,6 +1890,35 @@ void ModuleBitcodeWriter::writeDIBasicType(const DIBasicType *N, Record.clear(); } +void ModuleBitcodeWriter::writeDIFixedPointType( + const DIFixedPointType *N, SmallVectorImpl &Record, + unsigned Abbrev) { + Record.push_back(N->isDistinct()); + Record.push_back(N->getTag()); + Record.push_back(VE.getMetadataOrNullID(N->getRawName())); + Record.push_back(N->getSizeInBits()); + Record.push_back(N->getAlignInBits()); + Record.push_back(N->getEncoding()); + Record.push_back(N->getFlags()); + Record.push_back(N->getKind()); + Record.push_back(N->getFactorRaw()); + + auto WriteWideInt = [&](const APInt &Value) { + // Write an encoded word that holds the number of active words and + // the number of bits. + uint64_t NumWords = Value.getActiveWords(); + uint64_t Encoded = (NumWords << 32) | Value.getBitWidth(); + Record.push_back(Encoded); + emitWideAPInt(Record, Value); + }; + + WriteWideInt(N->getNumeratorRaw()); + WriteWideInt(N->getDenominatorRaw()); + + Stream.EmitRecord(bitc::METADATA_FIXED_POINT_TYPE, Record, Abbrev); + Record.clear(); +} + void ModuleBitcodeWriter::writeDIStringType(const DIStringType *N, SmallVectorImpl &Record, unsigned Abbrev) { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 081828ea358af..2723b1f55ccaa 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -615,7 +615,9 @@ DIE *DwarfUnit::createTypeDIE(const DIScope *Context, DIE &ContextDIE, return &TyDIE; } construct(CTy); - } else if (auto *BT = dyn_cast(Ty)) + } else if (auto *FPT = dyn_cast(Ty)) + construct(FPT); + else if (auto *BT = dyn_cast(Ty)) construct(BT); else if (auto *ST = dyn_cast(Ty)) construct(ST); @@ -760,6 +762,30 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIBasicType *BTy) { NumExtraInhabitants); } +void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIFixedPointType *BTy) { + // Base type handling. 
+ constructTypeDIE(Buffer, static_cast(BTy)); + + if (BTy->isBinary()) + addSInt(Buffer, dwarf::DW_AT_binary_scale, dwarf::DW_FORM_sdata, + BTy->getFactor()); + else if (BTy->isDecimal()) + addSInt(Buffer, dwarf::DW_AT_decimal_scale, dwarf::DW_FORM_sdata, + BTy->getFactor()); + else { + assert(BTy->isRational()); + DIE *ContextDIE = getOrCreateContextDIE(BTy->getScope()); + DIE &Constant = createAndAddDIE(dwarf::DW_TAG_constant, *ContextDIE); + + addInt(Constant, dwarf::DW_AT_GNU_numerator, BTy->getNumerator(), + !BTy->isSigned()); + addInt(Constant, dwarf::DW_AT_GNU_denominator, BTy->getDenominator(), + !BTy->isSigned()); + + addDIEEntry(Buffer, dwarf::DW_AT_small, Constant); + } +} + void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) { // Get core information. StringRef Name = STy->getName(); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 5b0da7b09d31c..055d7173daec5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -343,6 +343,7 @@ class DwarfUnit : public DIEUnit { void addIntAsBlock(DIE &Die, dwarf::Attribute Attribute, const APInt &Val); void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy); + void constructTypeDIE(DIE &Buffer, const DIFixedPointType *BTy); void constructTypeDIE(DIE &Buffer, const DIStringType *BTy); void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy); void constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 79547b299a903..5f0a9cdfb941a 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1894,6 +1894,7 @@ struct MDFieldPrinter { void printEmissionKind(StringRef Name, DICompileUnit::DebugEmissionKind EK); void printNameTableKind(StringRef Name, DICompileUnit::DebugNameTableKind NTK); + void printFixedPointKind(StringRef Name, DIFixedPointType::FixedPointKind V); }; } // end anonymous namespace @@ -2030,6 +2031,11 @@ void MDFieldPrinter::printNameTableKind(StringRef Name, Out << FS << Name << ": " << DICompileUnit::nameTableKindString(NTK); } +void MDFieldPrinter::printFixedPointKind(StringRef Name, + DIFixedPointType::FixedPointKind V) { + Out << FS << Name << ": " << DIFixedPointType::fixedPointKindString(V); +} + template void MDFieldPrinter::printDwarfEnum(StringRef Name, IntTy Value, Stringifier toString, bool ShouldSkipZero) { @@ -2199,6 +2205,29 @@ static void writeDIBasicType(raw_ostream &Out, const DIBasicType *N, Out << ")"; } +static void writeDIFixedPointType(raw_ostream &Out, const DIFixedPointType *N, + AsmWriterContext &) { + Out << "!DIFixedPointType("; + MDFieldPrinter Printer(Out); + if (N->getTag() != dwarf::DW_TAG_base_type) + Printer.printTag(N); + Printer.printString("name", N->getName()); + Printer.printInt("size", N->getSizeInBits()); + Printer.printInt("align", N->getAlignInBits()); + Printer.printDwarfEnum("encoding", N->getEncoding(), + dwarf::AttributeEncodingString); + Printer.printDIFlags("flags", N->getFlags()); + Printer.printFixedPointKind("kind", N->getKind()); + if (N->isRational()) { + bool IsUnsigned = !N->isSigned(); + Printer.printAPInt("numerator", N->getNumerator(), IsUnsigned, false); + Printer.printAPInt("denominator", N->getDenominator(), IsUnsigned, false); + } else { + Printer.printInt("factor", N->getFactor()); + } + Out << ")"; +} + static void writeDIStringType(raw_ostream &Out, const DIStringType *N, AsmWriterContext &WriterCtx) { Out << "!DIStringType("; diff --git 
a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 3c1fd433fb948..d9cc49fdad89c 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -272,6 +272,37 @@ DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits, 0, Encoding, NumExtraInhabitants, Flags); } +DIFixedPointType * +DIBuilder::createBinaryFixedPointType(StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + DINode::DIFlags Flags, int Factor) { + return DIFixedPointType::get(VMContext, dwarf::DW_TAG_base_type, Name, + SizeInBits, AlignInBits, Encoding, Flags, + DIFixedPointType::FixedPointBinary, Factor, + APInt(), APInt()); +} + +DIFixedPointType * +DIBuilder::createDecimalFixedPointType(StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + DINode::DIFlags Flags, int Factor) { + return DIFixedPointType::get(VMContext, dwarf::DW_TAG_base_type, Name, + SizeInBits, AlignInBits, Encoding, Flags, + DIFixedPointType::FixedPointDecimal, Factor, + APInt(), APInt()); +} + +DIFixedPointType * +DIBuilder::createRationalFixedPointType(StringRef Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, + DINode::DIFlags Flags, APInt Numerator, + APInt Denominator) { + return DIFixedPointType::get(VMContext, dwarf::DW_TAG_base_type, Name, + SizeInBits, AlignInBits, Encoding, Flags, + DIFixedPointType::FixedPointRational, 0, + Numerator, Denominator); +} + DIStringType *DIBuilder::createStringType(StringRef Name, uint64_t SizeInBits) { assert(!Name.empty() && "Unable to create type without name"); return DIStringType::get(VMContext, dwarf::DW_TAG_string_type, Name, diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index ae3d79fc17a59..f8c24d896df32 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -721,15 +721,58 @@ std::optional DIBasicType::getSignedness() const { switch (getEncoding()) { case dwarf::DW_ATE_signed: case dwarf::DW_ATE_signed_char: + case dwarf::DW_ATE_signed_fixed: return Signedness::Signed; case dwarf::DW_ATE_unsigned: case dwarf::DW_ATE_unsigned_char: + case dwarf::DW_ATE_unsigned_fixed: return Signedness::Unsigned; default: return std::nullopt; } } +DIFixedPointType * +DIFixedPointType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, + uint64_t SizeInBits, uint32_t AlignInBits, + unsigned Encoding, DIFlags Flags, unsigned Kind, + int Factor, APInt Numerator, APInt Denominator, + StorageType Storage, bool ShouldCreate) { + DEFINE_GETIMPL_LOOKUP(DIFixedPointType, + (Tag, Name, SizeInBits, AlignInBits, Encoding, Flags, + Kind, Factor, Numerator, Denominator)); + Metadata *Ops[] = {nullptr, nullptr, Name}; + DEFINE_GETIMPL_STORE(DIFixedPointType, + (Tag, SizeInBits, AlignInBits, Encoding, Flags, Kind, + Factor, Numerator, Denominator), + Ops); +} + +bool DIFixedPointType::isSigned() const { + return getEncoding() == dwarf::DW_ATE_signed_fixed; +} + +std::optional +DIFixedPointType::getFixedPointKind(StringRef Str) { + return StringSwitch>(Str) + .Case("Binary", FixedPointBinary) + .Case("Decimal", FixedPointDecimal) + .Case("Rational", FixedPointRational) + .Default(std::nullopt); +} + +const char *DIFixedPointType::fixedPointKindString(FixedPointKind V) { + switch (V) { + case FixedPointBinary: + return "Binary"; + case FixedPointDecimal: + return "Decimal"; + case FixedPointRational: + return "Rational"; + } + return nullptr; +} + DIStringType *DIStringType::getImpl(LLVMContext &Context, unsigned Tag, MDString *Name, Metadata 
*StringLength, Metadata *StringLengthExp, diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h index a18cf6f205623..efabe40fab192 100644 --- a/llvm/lib/IR/LLVMContextImpl.h +++ b/llvm/lib/IR/LLVMContextImpl.h @@ -494,6 +494,43 @@ template <> struct MDNodeKeyImpl { } }; +template <> struct MDNodeKeyImpl { + unsigned Tag; + MDString *Name; + uint64_t SizeInBits; + uint32_t AlignInBits; + unsigned Encoding; + unsigned Flags; + unsigned Kind; + int Factor; + APInt Numerator; + APInt Denominator; + + MDNodeKeyImpl(unsigned Tag, MDString *Name, uint64_t SizeInBits, + uint32_t AlignInBits, unsigned Encoding, unsigned Flags, + unsigned Kind, int Factor, APInt Numerator, APInt Denominator) + : Tag(Tag), Name(Name), SizeInBits(SizeInBits), AlignInBits(AlignInBits), + Encoding(Encoding), Flags(Flags), Kind(Kind), Factor(Factor), + Numerator(Numerator), Denominator(Denominator) {} + MDNodeKeyImpl(const DIFixedPointType *N) + : Tag(N->getTag()), Name(N->getRawName()), SizeInBits(N->getSizeInBits()), + AlignInBits(N->getAlignInBits()), Encoding(N->getEncoding()), + Flags(N->getFlags()), Kind(N->getKind()), Factor(N->getFactorRaw()), + Numerator(N->getNumeratorRaw()), Denominator(N->getDenominatorRaw()) {} + + bool isKeyOf(const DIFixedPointType *RHS) const { + return Name == RHS->getRawName() && SizeInBits == RHS->getSizeInBits() && + AlignInBits == RHS->getAlignInBits() && Kind == RHS->getKind() && + (RHS->isRational() ? (Numerator == RHS->getNumerator() && + Denominator == RHS->getDenominator()) + : Factor == RHS->getFactor()); + } + + unsigned getHashValue() const { + return hash_combine(Name, Flags, Kind, Factor, Numerator, Denominator); + } +}; + template <> struct MDNodeKeyImpl { unsigned Tag; MDString *Name; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index ed86a10c3a25f..95dd3aa86b428 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -1239,6 +1239,25 @@ void Verifier::visitDIBasicType(const DIBasicType &N) { "invalid tag", &N); } +void Verifier::visitDIFixedPointType(const DIFixedPointType &N) { + visitDIBasicType(N); + + CheckDI(N.getTag() == dwarf::DW_TAG_base_type, "invalid tag", &N); + CheckDI(N.getEncoding() == dwarf::DW_ATE_signed_fixed || + N.getEncoding() == dwarf::DW_ATE_unsigned_fixed, + "invalid encoding", &N); + CheckDI(N.getKind() == DIFixedPointType::FixedPointBinary || + N.getKind() == DIFixedPointType::FixedPointDecimal || + N.getKind() == DIFixedPointType::FixedPointRational, + "invalid kind", &N); + CheckDI(N.getKind() != DIFixedPointType::FixedPointRational || + N.getFactorRaw() == 0, + "factor should be 0 for rationals", &N); + CheckDI(N.getKind() == DIFixedPointType::FixedPointRational || + (N.getNumeratorRaw() == 0 && N.getDenominatorRaw() == 0), + "numerator and denominator should be 0 for non-rationals", &N); +} + void Verifier::visitDIStringType(const DIStringType &N) { CheckDI(N.getTag() == dwarf::DW_TAG_string_type, "invalid tag", &N); CheckDI(!(N.isBigEndian() && N.isLittleEndian()), "has conflicting flags", diff --git a/llvm/test/Bitcode/fixedpoint_type.ll b/llvm/test/Bitcode/fixedpoint_type.ll new file mode 100644 index 0000000000000..bbe1fdac9a4e6 --- /dev/null +++ b/llvm/test/Bitcode/fixedpoint_type.ll @@ -0,0 +1,29 @@ +;; This test checks generation of DIFixedPointType. + +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s + +;; Test whether DIFixedPointType is generated. 
+; CHECK: !DIFixedPointType(name: "fp__decimal", size: 32, align: 32, encoding: DW_ATE_signed_fixed, kind: Decimal, factor: -4) +; CHECK: !DIFixedPointType(name: "fp__rational", size: 32, align: 32, encoding: DW_ATE_unsigned_fixed, kind: Rational, numerator: 1234, denominator: 5678) +; CHECK: !DIFixedPointType(name: "fp__binary", size: 64, encoding: DW_ATE_unsigned_fixed, kind: Binary, factor: -16) + +; ModuleID = 'fixedpoint_type.ll' +source_filename = "/dir/fixedpoint_type.adb" + +!llvm.module.flags = !{!0, !1} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = distinct !DICompileUnit(language: DW_LANG_Ada95, file: !3, producer: "GNAT/LLVM", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !4, imports: !4) +!3 = !DIFile(filename: "fixedpoint_type.adb", directory: "/dir") +!4 = !{} +!5 = !{!11, !12, !13} +!6 = distinct !DISubprogram(name: "fp", scope: !3, file: !3, line: 1, type: !7, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !4, retainedNodes: !9) +!7 = !DISubroutineType(types: !8) +!8 = !{null} +!9 = !{!10} +!10 = !DILocalVariable(name: "x", scope: !6, file: !3, line: 3, type: !11, align: 32) +!11 = !DIFixedPointType(name: "fp__decimal", size: 32, align: 32, encoding: DW_ATE_signed_fixed, kind: Decimal, factor: -4) +!12 = !DIFixedPointType(name: "fp__rational", size: 32, align: 32, encoding: DW_ATE_unsigned_fixed, kind: Rational, numerator: 1234, denominator: 5678) +!13 = !DIFixedPointType(name: "fp__binary", size: 64, align: 0, encoding: DW_ATE_unsigned_fixed, kind: Binary, factor: -16) diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index d019823a5548d..8748371ae4b47 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -501,6 +501,40 @@ TEST(DIBuilder, DIEnumerator) { EXPECT_FALSE(E2); } +TEST(DIBuilder, FixedPointType) { + LLVMContext Ctx; + std::unique_ptr M(new Module("MyModule", Ctx)); + DIBuilder DIB(*M); + + DIFixedPointType *Ty = DIB.createBinaryFixedPointType( + {}, 32, 0, dwarf::DW_ATE_signed_fixed, DINode::FlagZero, -4); + EXPECT_TRUE(Ty); + EXPECT_TRUE(Ty->getKind() == DIFixedPointType::FixedPointBinary); + EXPECT_TRUE(Ty->getFactor() == -4); + EXPECT_TRUE(Ty->getEncoding() == dwarf::DW_ATE_signed_fixed); + EXPECT_TRUE(Ty->getTag() == dwarf::DW_TAG_base_type); + + Ty = DIB.createDecimalFixedPointType({}, 32, 0, dwarf::DW_ATE_unsigned_fixed, + DINode::FlagZero, -7); + EXPECT_TRUE(Ty); + EXPECT_TRUE(Ty->getKind() == DIFixedPointType::FixedPointDecimal); + EXPECT_TRUE(Ty->getFactor() == -7); + EXPECT_TRUE(Ty->getEncoding() == dwarf::DW_ATE_unsigned_fixed); + EXPECT_TRUE(Ty->getTag() == dwarf::DW_TAG_base_type); + + APSInt Num(APInt(32, 1)); + APSInt Denom(APInt(33, 72)); + Ty = DIB.createRationalFixedPointType({}, 32, 0, dwarf::DW_ATE_unsigned_fixed, + DINode::FlagZero, Num, Denom); + EXPECT_TRUE(Ty); + EXPECT_TRUE(Ty->getKind() == DIFixedPointType::FixedPointRational); + EXPECT_TRUE(Ty->getFactorRaw() == 0); + EXPECT_TRUE(Ty->getNumerator() == Num); + EXPECT_TRUE(Ty->getDenominator() == Denom); + EXPECT_TRUE(Ty->getEncoding() == dwarf::DW_ATE_unsigned_fixed); + EXPECT_TRUE(Ty->getTag() == dwarf::DW_TAG_base_type); +} + TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) { LLVMContext C; std::unique_ptr M = parseIR(C, R"( From 74b7abf15452574808834c0a08dd2af6bada2648 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 31 Mar 2025 08:10:34 -0700 Subject: [PATCH 0098/1029] 
[IRBuilder] Add new overload for CreateIntrinsic (#131942) Add a new `CreateIntrinsic` overload with no `Types`, useful for creating calls to non-overloaded intrinsics that don't need additional mangling. --- clang/lib/CodeGen/CGHLSLBuiltins.cpp | 14 +++-- clang/lib/CodeGen/CGHLSLRuntime.cpp | 4 +- llvm/include/llvm/IR/IRBuilder.h | 8 +++ llvm/lib/CodeGen/SafeStack.cpp | 2 +- llvm/lib/CodeGen/StackProtector.cpp | 4 +- llvm/lib/IR/AutoUpgrade.cpp | 52 +++++++++---------- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- .../AMDGPU/AMDGPUAsanInstrumentation.cpp | 2 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 11 ++-- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 2 +- .../AMDGPU/AMDGPULowerKernelArguments.cpp | 2 +- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 3 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 6 +-- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 18 +++---- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 +-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 10 ++-- llvm/lib/Target/ARM/MVETailPredication.cpp | 2 +- llvm/lib/Target/Hexagon/HexagonGenExtract.cpp | 2 +- .../Target/Hexagon/HexagonISelLowering.cpp | 4 +- .../Target/Hexagon/HexagonVectorCombine.cpp | 2 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 2 +- llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 6 +-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 4 +- llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp | 2 +- .../Target/X86/X86InstCombineIntrinsic.cpp | 4 +- llvm/lib/Target/X86/X86LowerAMXType.cpp | 20 +++---- llvm/lib/Target/X86/X86WinEHState.cpp | 6 +-- .../Target/XCore/XCoreLowerThreadLocal.cpp | 2 +- llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 2 +- .../Instrumentation/BoundsChecking.cpp | 4 +- .../Instrumentation/HWAddressSanitizer.cpp | 3 +- llvm/lib/Transforms/Instrumentation/KCFI.cpp | 2 +- .../Instrumentation/MemorySanitizer.cpp | 4 +- .../Instrumentation/PGOInstrumentation.cpp | 7 ++- .../Instrumentation/ThreadSanitizer.cpp | 2 +- llvm/lib/Transforms/Scalar/SROA.cpp | 2 +- llvm/lib/Transforms/Utils/GuardUtils.cpp | 2 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 4 +- llvm/unittests/IR/IRBuilderTest.cpp | 44 +++++++++------- llvm/unittests/Transforms/Utils/LocalTest.cpp | 4 +- 41 files changed, 147 insertions(+), 141 deletions(-) diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 5709594a34826..136ea47451fed 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -66,13 +66,13 @@ static Value *handleHlslClip(const CallExpr *E, CodeGenFunction *CGF) { CMP = CGF->Builder.CreateIntrinsic( CGF->Builder.getInt1Ty(), CGF->CGM.getHLSLRuntime().getAnyIntrinsic(), {FCompInst}); - } else + } else { CMP = CGF->Builder.CreateFCmpOLT(Op0, FZeroConst); + } - if (CGF->CGM.getTarget().getTriple().isDXIL()) - LastInstr = - CGF->Builder.CreateIntrinsic(CGF->VoidTy, Intrinsic::dx_discard, {CMP}); - else if (CGF->CGM.getTarget().getTriple().isSPIRV()) { + if (CGF->CGM.getTarget().getTriple().isDXIL()) { + LastInstr = CGF->Builder.CreateIntrinsic(Intrinsic::dx_discard, {CMP}); + } else if (CGF->CGM.getTarget().getTriple().isSPIRV()) { BasicBlock *LT0 = CGF->createBasicBlock("lt0", CGF->CurFn); BasicBlock *End = CGF->createBasicBlock("end", CGF->CurFn); @@ -80,7 +80,7 @@ static Value *handleHlslClip(const CallExpr *E, CodeGenFunction *CGF) { CGF->Builder.SetInsertPoint(LT0); - CGF->Builder.CreateIntrinsic(CGF->VoidTy, Intrinsic::spv_discard, {}); + CGF->Builder.CreateIntrinsic(Intrinsic::spv_discard, {}); 
LastInstr = CGF->Builder.CreateBr(End); CGF->Builder.SetInsertPoint(End); @@ -109,7 +109,6 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { Value *HighBits = nullptr; if (CGF->CGM.getTarget().getTriple().isDXIL()) { - llvm::Type *RetElementTy = CGF->Int32Ty; if (auto *Op0VecTy = E->getArg(0)->getType()->getAs()) RetElementTy = llvm::VectorType::get( @@ -121,7 +120,6 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { LowBits = CGF->Builder.CreateExtractValue(CI, 0); HighBits = CGF->Builder.CreateExtractValue(CI, 1); - } else { // For Non DXIL targets we generate the instructions. diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 0e859dd4a0b1d..3b1810b62a2cd 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -385,8 +385,8 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, SmallVector OB; if (CGM.shouldEmitConvergenceTokens()) { assert(EntryFn->isConvergent()); - llvm::Value *I = B.CreateIntrinsic( - llvm::Intrinsic::experimental_convergence_entry, {}, {}); + llvm::Value *I = + B.CreateIntrinsic(llvm::Intrinsic::experimental_convergence_entry, {}); llvm::Value *bundleArgs[] = {I}; OB.emplace_back("convergencectrl", bundleArgs); } diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 750a99cc50dd7..07660e93253da 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -1004,6 +1004,14 @@ class IRBuilderBase { ArrayRef Args, FMFSource FMFSource = {}, const Twine &Name = ""); + /// Create a call to non-overloaded intrinsic \p ID with \p Args. If + /// \p FMFSource is provided, copy fast-math-flags from that instruction to + /// the intrinsic. + CallInst *CreateIntrinsic(Intrinsic::ID ID, ArrayRef Args, + FMFSource FMFSource = {}, const Twine &Name = "") { + return CreateIntrinsic(ID, /*Types=*/{}, Args, FMFSource, Name); + } + /// Create call to the minnum intrinsic. 
Value *CreateMinNum(Value *LHS, Value *RHS, FMFSource FMFSource = {}, const Twine &Name = "") { diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index 74e9d945c1885..da229f86f24ce 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -367,7 +367,7 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { if (!StackGuardVar) { TL.insertSSPDeclarations(*M); - return IRB.CreateIntrinsic(Intrinsic::stackguard, {}, {}); + return IRB.CreateIntrinsic(Intrinsic::stackguard, {}); } return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard"); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index eb07e5d2bae4b..9d2147f7395d8 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -543,7 +543,7 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M, if (SupportsSelectionDAGSP) *SupportsSelectionDAGSP = true; TLI->insertSSPDeclarations(*M); - return B.CreateIntrinsic(Intrinsic::stackguard, {}, {}); + return B.CreateIntrinsic(Intrinsic::stackguard, {}); } /// Insert code into the entry block that stores the stack guard @@ -564,7 +564,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc, AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot"); Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP); - B.CreateIntrinsic(Intrinsic::stackprotector, {}, {GuardSlot, AI}); + B.CreateIntrinsic(Intrinsic::stackprotector, {GuardSlot, AI}); return SupportsSelectionDAGSP; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d18df7527fffd..963fb1b6ad8c0 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1767,7 +1767,7 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, if (!IndexForm) std::swap(Args[0], Args[1]); - Value *V = Builder.CreateIntrinsic(IID, {}, Args); + Value *V = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Builder.CreateBitCast(CI.getArgOperand(1), Ty); @@ -2022,8 +2022,8 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, // Replace a masked intrinsic with an older unmasked intrinsic. 
static Value *upgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { - Value *Rep = Builder.CreateIntrinsic( - IID, {}, {CI.getArgOperand(0), CI.getArgOperand(1)}); + Value *Rep = + Builder.CreateIntrinsic(IID, {CI.getArgOperand(0), CI.getArgOperand(1)}); return emitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); } @@ -2280,7 +2280,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, SmallVector Args(CI.args()); Args.pop_back(); Args.pop_back(); - Rep = Builder.CreateIntrinsic(IID, {}, Args); + Rep = Builder.CreateIntrinsic(IID, Args); unsigned NumArgs = CI.arg_size(); Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, CI.getArgOperand(NumArgs - 2)); @@ -2510,7 +2510,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, : Intrinsic::x86_avx512_sqrt_pd_512; Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)}; - Rep = Builder.CreateIntrinsic(IID, {}, Args); + Rep = Builder.CreateIntrinsic(IID, Args); } else { Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(), {CI->getArgOperand(0)}); @@ -2637,8 +2637,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, break; } - Rep = Builder.CreateIntrinsic(IID, {}, - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = + Builder.CreateIntrinsic(IID, {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); @@ -2660,8 +2660,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else llvm_unreachable("Unexpected intrinsic"); - Rep = Builder.CreateIntrinsic(IID, {}, - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = + Builder.CreateIntrinsic(IID, {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.cmp.p")) { SmallVector Args(CI->args()); @@ -2689,7 +2689,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, std::swap(Mask, Args.back()); Args.push_back(Mask); - Rep = Builder.CreateIntrinsic(IID, {}, Args); + Rep = Builder.CreateIntrinsic(IID, Args); } else if (Name.starts_with("avx512.mask.cmp.")) { // Integer compare intrinsics. 
unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -2905,7 +2905,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name == "sse42.crc32.64.8") { Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); - Rep = Builder.CreateIntrinsic(Intrinsic::x86_sse42_crc32_32_8, {}, + Rep = Builder.CreateIntrinsic(Intrinsic::x86_sse42_crc32_32_8, {Trunc0, CI->getArgOperand(1)}); Rep = Builder.CreateZExt(Rep, CI->getType(), ""); } else if (Name.starts_with("avx.vbroadcast.s") || @@ -3395,7 +3395,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_add_pd_512; Rep = Builder.CreateIntrinsic( - IID, {}, + IID, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3411,7 +3411,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_div_pd_512; Rep = Builder.CreateIntrinsic( - IID, {}, + IID, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3427,7 +3427,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_mul_pd_512; Rep = Builder.CreateIntrinsic( - IID, {}, + IID, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3443,7 +3443,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_sub_pd_512; Rep = Builder.CreateIntrinsic( - IID, {}, + IID, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3461,7 +3461,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; Rep = Builder.CreateIntrinsic( - IID, {}, + IID, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); @@ -3759,7 +3759,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_vfmadd_f64; else IID = Intrinsic::x86_avx512_vfmadd_f32; - Rep = Builder.CreateIntrinsic(IID, {}, Ops); + Rep = Builder.CreateIntrinsic(IID, Ops); } else { Rep = Builder.CreateFMA(A, B, C); } @@ -3812,7 +3812,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_vfmadd_pd_512; - Rep = Builder.CreateIntrinsic(IID, {}, {A, B, C, CI->getArgOperand(4)}); + Rep = Builder.CreateIntrinsic(IID, {A, B, C, CI->getArgOperand(4)}); } else { Rep = Builder.CreateFMA(A, B, C); } @@ -3840,7 +3840,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateIntrinsic(IID, {}, Ops); + Rep = Builder.CreateIntrinsic(IID, Ops); } else if (Name.starts_with("avx512.mask.vfmaddsub.p") || Name.starts_with("avx512.mask3.vfmaddsub.p") || Name.starts_with("avx512.maskz.vfmaddsub.p") || @@ -3863,7 +3863,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (IsSubAdd) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = 
Builder.CreateIntrinsic(IID, {}, Ops); + Rep = Builder.CreateIntrinsic(IID, Ops); } else { int NumElts = cast(CI->getType())->getNumElements(); @@ -3914,7 +3914,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3)}; - Rep = Builder.CreateIntrinsic(IID, {}, Args); + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); @@ -3941,7 +3941,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateIntrinsic(IID, {}, Args); + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -3976,7 +3976,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateIntrinsic(IID, {}, Args); + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4005,7 +4005,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateIntrinsic(IID, {}, Args); + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4027,7 +4027,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, // Make a call with 3 operands. Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Value *NewCall = Builder.CreateIntrinsic(IID, {}, Args); + Value *NewCall = Builder.CreateIntrinsic(IID, Args); // Extract the second result and store it. Value *Data = Builder.CreateExtractValue(NewCall, 1); @@ -4095,7 +4095,7 @@ static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Args[1] = Builder.CreateIntrinsic( Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); - return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr, + return Builder.CreateIntrinsic(NewID, Args, /*FMFSource=*/nullptr, CI->getName()); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1c8e3afdfd718..b3335d8710a65 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -28095,7 +28095,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, IsAcquire ? 
Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; Value *LoHi = - Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi"); + Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); @@ -28125,7 +28125,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { - Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {}); + Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e320b0e653ad4..86c2fcf8ae2c1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1476,7 +1476,7 @@ static std::optional instCombineRDFFR(InstCombiner &IC, auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {II.getType()}, {AllPat}); auto *RDFFR = - IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); + IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {PTrue}); RDFFR->takeName(&II); return IC.replaceInstUsesWith(II, RDFFR); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp index 6554863e08c91..19e2a6a27020d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsanInstrumentation.cpp @@ -73,7 +73,7 @@ static Instruction *genAMDGPUReportBlock(Module &M, IRBuilder<> &IRB, Trm = SplitBlockAndInsertIfThen(Cond, Trm, false); IRB.SetInsertPoint(Trm); - return IRB.CreateIntrinsic(Intrinsic::amdgcn_unreachable, {}, {}); + return IRB.CreateIntrinsic(Intrinsic::amdgcn_unreachable, {}); } static Value *createSlowPathCmp(Module &M, IRBuilder<> &IRB, Type *IntptrTy, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 76b1775f0d096..0a163f8dc7f6b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -666,7 +666,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // Record I's original position as the entry block. PixelEntryBB = I.getParent(); - Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {}); + Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}); Instruction *const NonHelperTerminator = SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr); @@ -698,15 +698,14 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I, // using the mbcnt intrinsic. 
Value *Mbcnt; if (ST.isWave32()) { - Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, - {Ballot, B.getInt32(0)}); + Mbcnt = + B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {Ballot, B.getInt32(0)}); } else { Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty); Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty); - Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {ExtractLo, B.getInt32(0)}); - Mbcnt = - B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt}); + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {ExtractHi, Mbcnt}); } Function *F = I.getFunction(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 0b7c5236ce4f5..9c482aeb3ea5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1034,7 +1034,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast( if (!HasFP32DenormalFlush && !NumIsOne) return nullptr; - return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den}); + return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den}); } Value *AMDGPUCodeGenPrepareImpl::visitFDivElement( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 09412d1b0f1cc..a4e6768b4630d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -304,7 +304,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { return false; CallInst *KernArgSegment = - Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {}, + Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, nullptr, F.getName() + ".kernarg.segment"); KernArgSegment->addRetAttr(Attribute::NonNull); KernArgSegment->addRetAttr( diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 55497c837ee23..3c08d1edb4991 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -532,8 +532,7 @@ class AMDGPULowerModuleLDS { auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); IRBuilder<> Builder(&*InsertAt); - It->second = - Builder.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {}); + It->second = Builder.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}); } return It->second; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 98a70c0dbb912..94ecb6ba9a2b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -1068,9 +1068,9 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { if (!IsAMDHSA) { CallInst *LocalSizeY = - Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {}, {}); + Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {}); CallInst *LocalSizeZ = - Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {}, {}); + Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {}); ST.makeLIDRangeMetadata(LocalSizeY); ST.makeLIDRangeMetadata(LocalSizeZ); @@ -1113,7 +1113,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { // } hsa_kernel_dispatch_packet_t // CallInst *DispatchPtr = - Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {}); + 
Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}); DispatchPtr->addRetAttr(Attribute::NoAlias); DispatchPtr->addRetAttr(Attribute::NonNull); F.removeFnAttr("amdgpu-no-dispatch-ptr"); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 70274a8101f89..cc0d374c99254 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -787,9 +787,9 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, DebugLoc FirstDL = getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram()); IRB.SetCurrentDebugLocation(FirstDL); - Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}, {}); - Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}, {}); - Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}, {}); + Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {}); + Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {}); + Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {}); Value *XYOr = IRB.CreateOr(WIdx, WIdy); Value *XYZOr = IRB.CreateOr(XYOr, WIdz); Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0)); @@ -854,7 +854,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, "Dynamic LDS size query is only supported for CO V5 and later."); // Get size from hidden dyn_lds_size argument of kernel Value *ImplicitArg = - IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {}); + IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}); Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP( ImplicitArg->getType(), ImplicitArg, {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)}); @@ -870,7 +870,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, // Create a call to malloc function which does device global memory allocation // with size equals to all LDS global accesses size in this kernel. Value *ReturnAddress = - IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, {IRB.getInt32(0)}); + IRB.CreateIntrinsic(Intrinsic::returnaddress, {IRB.getInt32(0)}); FunctionCallee MallocFunc = M.getOrInsertFunction( StringRef("__asan_malloc_impl"), FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false)); @@ -896,7 +896,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock); XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock); - IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {}); + IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}); // Load malloc pointer from Sw LDS. 
Value *LoadMallocPtr = @@ -925,7 +925,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, // Cond Free Block IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin()); - IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {}); + IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}); IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock); // Free Block @@ -936,7 +936,7 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, StringRef("__asan_free_impl"), FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); Value *ReturnAddr = - IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); + IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0)); Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); @@ -1070,7 +1070,7 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses( SetVector LDSInstructions; getLDSMemoryInstructions(Func, LDSInstructions); - auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {}); + auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}); GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable; GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable; auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c8645850fe111..56149bcd8a839 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17373,8 +17373,8 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate( Value *LoadedShared = nullptr; if (FullFlatEmulation) { - CallInst *IsShared = Builder.CreateIntrinsic( - Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared"); + CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, + {Addr}, nullptr, "is.shared"); Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); Builder.SetInsertPoint(SharedBB); Value *CastToLocal = Builder.CreateAddrSpaceCast( @@ -17389,8 +17389,8 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate( Builder.SetInsertPoint(CheckPrivateBB); } - CallInst *IsPrivate = Builder.CreateIntrinsic( - Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private"); + CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private, + {Addr}, nullptr, "is.private"); Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB); Builder.SetInsertPoint(PrivateBB); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a4713311e2b3e..2290ac2728c6d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21212,7 +21212,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; - return Builder.CreateIntrinsic(Intrinsic::arm_mcr, {}, args); + return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args); } else { // Instead of using barriers, atomic accesses on these subtargets use // libcalls. @@ -21222,7 +21222,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? 
ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); - return Builder.CreateIntrinsic(Intrinsic::arm_dmb, {}, CDomain); + return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain); } } @@ -21477,7 +21477,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; Value *LoHi = - Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi"); + Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); @@ -21502,7 +21502,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { if (!Subtarget->hasV7Ops()) return; - Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}, {}); + Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}); } Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -21523,7 +21523,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); if (!Subtarget->isLittle()) std::swap(Lo, Hi); - return Builder.CreateIntrinsic(Int, {}, {Lo, Hi, Addr}); + return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 98209f0cbe24f..bb07d79c9374a 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -399,7 +399,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; } - Value *VCTPCall = Builder.CreateIntrinsic(VCTPID, {}, Processed); + Value *VCTPCall = Builder.CreateIntrinsic(VCTPID, Processed); ActiveLaneMask->replaceAllUsesWith(VCTPCall); // Add the incoming value to the new phi. diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index b44519a1286d0..0f0788616860e 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -212,7 +212,7 @@ bool HexagonGenExtract::convert(Instruction *In) { Intrinsic::ID IntId = (BW == 32) ? 
Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Value *NewIn = - IRB.CreateIntrinsic(IntId, {}, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); + IRB.CreateIntrinsic(IntId, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); if (SL != 0) NewIn = IRB.CreateShl(NewIn, SL, CSL->getName()); In->replaceAllUsesWith(NewIn); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 1710488e4e292..4c479ac41be12 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3854,7 +3854,7 @@ Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder, : Intrinsic::hexagon_L4_loadd_locked; Value *Call = - Builder.CreateIntrinsic(IntID, {}, Addr, /*FMFSource=*/nullptr, "larx"); + Builder.CreateIntrinsic(IntID, Addr, /*FMFSource=*/nullptr, "larx"); return Builder.CreateBitCast(Call, ValueTy); } @@ -3876,7 +3876,7 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Val = Builder.CreateBitCast(Val, CastTy); - Value *Call = Builder.CreateIntrinsic(IntID, {}, {Addr, Val}, + Value *Call = Builder.CreateIntrinsic(IntID, {Addr, Val}, /*FMFSource=*/nullptr, "stcx"); Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), ""); Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext())); diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 0760d712f9afd..d89bc41b910d0 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -2392,7 +2392,7 @@ auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo, Type *Int64Ty = Type::getInt64Ty(F.getContext()); Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst"); Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst"); - Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb, {}, + Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb, {Hi64, Lo64, Amt}, /*FMFSource=*/nullptr, "cup"); return Builder.CreateBitCast(Call, Lo->getType(), "cst"); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ab78f33f5a630..e2ce5e4fc17e1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12429,7 +12429,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, //===----------------------------------------------------------------------===// static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) { - return Builder.CreateIntrinsic(Id, {}, {}); + return Builder.CreateIntrinsic(Id, {}); } // The mappings for emitLeading/TrailingFence is taken from diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 900bb1a8a46d2..6afbac5f7c3cb 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1197,7 +1197,7 @@ void SPIRVEmitIntrinsics::preprocessUndefs(IRBuilder<> &B) { setInsertPointSkippingPhis(B, I); BPrepared = true; } - auto *IntrUndef = B.CreateIntrinsic(Intrinsic::spv_undef, {}, {}); + auto *IntrUndef = B.CreateIntrinsic(Intrinsic::spv_undef, {}); Worklist.push(IntrUndef); I->replaceUsesOfWith(Op, IntrUndef); AggrConsts[IntrUndef] = AggrUndef; @@ -1309,7 +1309,7 @@ Instruction *SPIRVEmitIntrinsics::visitCallInst(CallInst &Call) { IRBuilder<> B(Call.getParent()); B.SetInsertPoint(&Call); - 
B.CreateIntrinsic(Intrinsic::spv_inline_asm, {}, {Args}); + B.CreateIntrinsic(Intrinsic::spv_inline_asm, {Args}); return &Call; } @@ -1806,7 +1806,7 @@ Instruction *SPIRVEmitIntrinsics::visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { Instruction *SPIRVEmitIntrinsics::visitUnreachableInst(UnreachableInst &I) { IRBuilder<> B(I.getParent()); B.SetInsertPoint(&I); - B.CreateIntrinsic(Intrinsic::spv_unreachable, {}, {}); + B.CreateIntrinsic(Intrinsic::spv_unreachable, {}); return &I; } diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 439828d9759a5..bb71da49316f3 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -1847,9 +1847,9 @@ void SPIRVGlobalRegistry::buildAssignType(IRBuilder<> &B, Type *Ty, SmallVector ArgMDs{ MDNode::get(Ctx, ValueAsMetadata::getConstant(OfType)), MDString::get(Ctx, Arg->getName())}; - B.CreateIntrinsic(Intrinsic::spv_value_md, {}, + B.CreateIntrinsic(Intrinsic::spv_value_md, {MetadataAsValue::get(Ctx, MDTuple::get(Ctx, ArgMDs))}); - AssignCI = B.CreateIntrinsic(Intrinsic::fake_use, {}, {Arg}); + AssignCI = B.CreateIntrinsic(Intrinsic::fake_use, {Arg}); } else { AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type, {Arg->getType()}, OfType, Arg, {}, B); diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp index 32b416ab1970c..7ead4c82fb7e6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp @@ -615,7 +615,7 @@ class SPIRVStructurizer : public FunctionPass { getSpirvLoopControlOperandsFromLoopMetadata(L); for (unsigned Imm : LoopControlImms) Args.emplace_back(llvm::ConstantInt::get(Builder.getInt32Ty(), Imm)); - Builder.CreateIntrinsic(Intrinsic::spv_loop_merge, {}, {Args}); + Builder.CreateIntrinsic(Intrinsic::spv_loop_merge, {Args}); Modified = true; } diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index a44c583a1ca51..c4d349044fe80 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -1869,7 +1869,7 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI. 
if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { Value *Args[] = {Op0, CILength, CIIndex}; - return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args); + return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, Args); } } @@ -1966,7 +1966,7 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, Constant *CIIndex = ConstantInt::get(IntTy8, Index, false); Value *Args[] = {Op0, Op1, CILength, CIIndex}; - return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args); + return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, Args); } return nullptr; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index eacdc8a3f639a..54f5977fe76eb 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -380,7 +380,7 @@ void X86LowerAMXType::combineLoadBitcast(LoadInst *LD, BitCastInst *Bitcast) { std::array Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = - Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); Bitcast->replaceAllUsesWith(NewInst); } @@ -405,7 +405,7 @@ void X86LowerAMXType::combineBitcastStore(BitCastInst *Bitcast, StoreInst *ST) { Value *Stride = Builder.getInt64(64); Value *I8Ptr = ST->getOperand(1); std::array Args = {Row, Col, I8Ptr, Stride, Tile}; - Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, Args); if (Bitcast->hasOneUse()) return; // %13 = bitcast x86_amx %src to <256 x i32> @@ -455,7 +455,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { std::tie(Row, Col) = SC->getShape(II, OpNo); std::array Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = - Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); Bitcast->replaceAllUsesWith(NewInst); } else { // %2 = bitcast x86_amx %src to <256 x i32> @@ -472,7 +472,7 @@ bool X86LowerAMXType::transformBitcast(BitCastInst *Bitcast) { Value *Row = II->getOperand(0); Value *Col = II->getOperand(1); std::array Args = {Row, Col, I8Ptr, Stride, Src}; - Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, Args); Value *NewInst = Builder.CreateLoad(Bitcast->getType(), AllocaAddr); Bitcast->replaceAllUsesWith(NewInst); } @@ -612,7 +612,7 @@ static Instruction *createTileStore(Instruction *TileDef, Value *Ptr) { std::array Args = {Row, Col, Ptr, Stride, TileDef}; Instruction *TileStore = - Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, Args); return TileStore; } @@ -643,7 +643,7 @@ static void replaceWithTileLoad(Use &U, Value *Ptr, bool IsPHI = false) { std::array Args = {Row, Col, Ptr, Stride}; Value *TileLoad = - Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); UserI->replaceUsesOfWith(V, TileLoad); } @@ -1124,7 +1124,7 @@ bool X86LowerAMXCast::combineCastStore(IntrinsicInst *Cast, StoreInst *ST) { Value *Stride = Builder.CreateSExt(Col, Builder.getInt64Ty()); Value *I8Ptr = Builder.CreateBitCast(ST->getOperand(1), Builder.getPtrTy()); std::array Args = {Row, Col, I8Ptr, Stride, Tile}; - Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, {}, Args); 
+ Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, Args); return true; } @@ -1169,7 +1169,7 @@ bool X86LowerAMXCast::combineLoadCast(IntrinsicInst *Cast, LoadInst *LD) { std::array Args = {Row, Col, I8Ptr, Stride}; Value *NewInst = - Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); Cast->replaceAllUsesWith(NewInst); return EraseLoad; @@ -1357,7 +1357,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { std::array Args = { Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty())}; Value *NewInst = - Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tileloadd64_internal, Args); AMXCast->replaceAllUsesWith(NewInst); AMXCast->eraseFromParent(); } else { @@ -1376,7 +1376,7 @@ bool X86LowerAMXCast::transformAMXCast(IntrinsicInst *AMXCast) { Value *Col = II->getOperand(1); std::array Args = { Row, Col, I8Ptr, Builder.CreateSExt(Col, Builder.getInt64Ty()), Src}; - Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, {}, Args); + Builder.CreateIntrinsic(Intrinsic::x86_tilestored64_internal, Args); Value *NewInst = Builder.CreateLoad(AMXCast->getType(), AllocaAddr); AMXCast->replaceAllUsesWith(NewInst); AMXCast->eraseFromParent(); diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index dfdeada476695..27111fce45662 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -374,7 +374,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { } Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { - return Builder.CreateIntrinsic(Intrinsic::x86_seh_lsda, {}, F); + return Builder.CreateIntrinsic(Intrinsic::x86_seh_lsda, F); } /// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls @@ -649,13 +649,13 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // that it can recover the original frame pointer. IRBuilder<> Builder(RegNode->getNextNode()); Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy()); - Builder.CreateIntrinsic(Intrinsic::x86_seh_ehregnode, {}, {RegNodeI8}); + Builder.CreateIntrinsic(Intrinsic::x86_seh_ehregnode, {RegNodeI8}); if (EHGuardNode) { IRBuilder<> Builder(EHGuardNode->getNextNode()); Value *EHGuardNodeI8 = Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy()); - Builder.CreateIntrinsic(Intrinsic::x86_seh_ehguard, {}, {EHGuardNodeI8}); + Builder.CreateIntrinsic(Intrinsic::x86_seh_ehguard, {EHGuardNodeI8}); } // Calculate state numbers. 
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 31528bf1f0fae..3870e80f9559b 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -156,7 +156,7 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) { for (User *U : Users) { Instruction *Inst = cast(U); IRBuilder<> Builder(Inst); - Value *ThreadID = Builder.CreateIntrinsic(Intrinsic::xcore_getid, {}, {}); + Value *ThreadID = Builder.CreateIntrinsic(Intrinsic::xcore_getid, {}); Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV, {Builder.getInt64(0), ThreadID}); U->replaceUsesOfWith(GV, Addr); diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index 2d884078940cc..a848eac6f3e4c 100644 --- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -126,7 +126,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) { IRBuilder<> IRBTest(TestBB); Value *Test = IRBTest.CreateIntrinsic( - Intrinsic::type_test, {}, + Intrinsic::type_test, {&Addr, MetadataAsValue::get(Ctx, ConstantAsMetadata::get(CaseTypeId))}); BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB); diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 14c331b3b748e..9239ae8741afb 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -111,10 +111,10 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal, static CallInst *InsertTrap(BuilderTy &IRB, bool DebugTrapBB, std::optional GuardKind) { if (!DebugTrapBB) - return IRB.CreateIntrinsic(Intrinsic::trap, {}, {}); + return IRB.CreateIntrinsic(Intrinsic::trap, {}); return IRB.CreateIntrinsic( - Intrinsic::ubsantrap, {}, + Intrinsic::ubsantrap, ConstantInt::get(IRB.getInt8Ty(), GuardKind.has_value() ? GuardKind.value() diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 65bb9c33e1772..61dfc6411fc3a 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1007,14 +1007,13 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow : Intrinsic::hwasan_check_memaccess_fixedshadow, - {}, {Ptr, ConstantInt::get(Int32Ty, AccessInfo), ConstantInt::get(Int64Ty, Mapping.offset())}); } else { IRB.CreateIntrinsic( UseShortGranules ? 
Intrinsic::hwasan_check_memaccess_shortgranules : Intrinsic::hwasan_check_memaccess, - {}, {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); + {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); } } diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp index 38fc99429122d..bfed678854943 100644 --- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp +++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp @@ -109,7 +109,7 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) { Instruction *ThenTerm = SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights); Builder.SetInsertPoint(ThenTerm); - Builder.CreateIntrinsic(Intrinsic::debugtrap, {}, {}); + Builder.CreateIntrinsic(Intrinsic::debugtrap, {}); ++NumKCFIChecks; } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 612aaa5dbd43c..1cea53f695292 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1208,7 +1208,7 @@ struct MemorySanitizerVisitor : public InstVisitor { MS.initializeCallbacks(*F.getParent(), TLI); FnPrologueEnd = IRBuilder<>(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHIIt()) - .CreateIntrinsic(Intrinsic::donothing, {}, {}); + .CreateIntrinsic(Intrinsic::donothing, {}); if (MS.CompileKernel) { IRBuilder<> IRB(FnPrologueEnd); @@ -3550,7 +3550,7 @@ struct MemorySanitizerVisitor : public InstVisitor { } Value *S = IRB.CreateIntrinsic(getSignedPackIntrinsic(I.getIntrinsicID()), - {}, {S1_ext, S2_ext}, /*FMFSource=*/nullptr, + {S1_ext, S2_ext}, /*FMFSource=*/nullptr, "_msprop_vector_pack"); if (MMXEltSizeInBits) S = IRB.CreateBitCast(S, getShadowTy(&I)); diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 1fca354e3825f..7c73c16db02c8 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -956,7 +956,7 @@ void FunctionInstrumenter::instrument() { // llvm.instrprof.cover(i8* , i64 , i32 , // i32 ) Builder.CreateIntrinsic( - Intrinsic::instrprof_cover, {}, + Intrinsic::instrprof_cover, {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -1013,7 +1013,7 @@ void FunctionInstrumenter::instrument() { IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); // llvm.instrprof.timestamp(i8* , i64 , i32 , // i32 ) - Builder.CreateIntrinsic(Intrinsic::instrprof_timestamp, {}, + Builder.CreateIntrinsic(Intrinsic::instrprof_timestamp, {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)}); @@ -1028,7 +1028,6 @@ void FunctionInstrumenter::instrument() { // i32 ) Builder.CreateIntrinsic(PGOBlockCoverage ? 
Intrinsic::instrprof_cover : Intrinsic::instrprof_increment, - {}, {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I++)}); @@ -1772,7 +1771,7 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { auto *NormalizedFuncNameVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, PointerType::get(M->getContext(), 0)); - Builder.CreateIntrinsic(Intrinsic::instrprof_increment_step, {}, + Builder.CreateIntrinsic(Intrinsic::instrprof_increment_step, {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 2b403b695c1d2..1811d145f9907 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -573,7 +573,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, InstrumentationIRBuilder IRB(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHIIt()); Value *ReturnAddress = - IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); + IRB.CreateIntrinsic(Intrinsic::returnaddress, IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 86be20c799a68..4e444d8d4cefc 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3876,7 +3876,7 @@ class AggLoadStoreRewriter : public InstVisitor { for (Instruction *I : FakeUses) { IRB.SetInsertPoint(I); for (auto *V : Components) - IRB.CreateIntrinsic(Intrinsic::fake_use, {}, {V}); + IRB.CreateIntrinsic(Intrinsic::fake_use, {V}); I->eraseFromParent(); } } diff --git a/llvm/lib/Transforms/Utils/GuardUtils.cpp b/llvm/lib/Transforms/Utils/GuardUtils.cpp index dfcfddaca145f..46ad951d0a812 100644 --- a/llvm/lib/Transforms/Utils/GuardUtils.cpp +++ b/llvm/lib/Transforms/Utils/GuardUtils.cpp @@ -71,7 +71,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic, // guard's condition. IRBuilder<> B(CheckBI); auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition, - {}, {}, nullptr, "widenable_cond"); + {}, nullptr, "widenable_cond"); CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC, "exiplicit_guard_cond")); assert(isWidenableBranch(CheckBI) && "Branch must be widenable."); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 1404867fda6bc..131fbe654c11c 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2159,7 +2159,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // call. if (IsUnsafeClaimRV) { Builder.SetInsertPoint(II); - Builder.CreateIntrinsic(Intrinsic::objc_release, {}, RetOpnd); + Builder.CreateIntrinsic(Intrinsic::objc_release, RetOpnd); } II->eraseFromParent(); InsertRetainCall = false; @@ -2193,7 +2193,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // matching autoreleaseRV or an annotated call in the callee. Emit a call // to objc_retain. 
Builder.SetInsertPoint(RI); - Builder.CreateIntrinsic(Intrinsic::objc_retain, {}, RetOpnd); + Builder.CreateIntrinsic(Intrinsic::objc_retain, RetOpnd); } } } diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index e9e9d7b11a36c..b7eb0af728331 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -81,7 +81,12 @@ TEST_F(IRBuilderTest, Intrinsics) { II = cast(Result); EXPECT_EQ(II->getIntrinsicID(), Intrinsic::maximum); - Result = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}, {}); + Result = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, + ArrayRef{}, {}); + II = cast(Result); + EXPECT_EQ(II->getIntrinsicID(), Intrinsic::readcyclecounter); + + Result = Builder.CreateIntrinsic(Intrinsic::readcyclecounter, {}); II = cast(Result); EXPECT_EQ(II->getIntrinsicID(), Intrinsic::readcyclecounter); @@ -134,7 +139,7 @@ TEST_F(IRBuilderTest, Intrinsics) { EXPECT_FALSE(II->hasNoNaNs()); Result = Builder.CreateIntrinsic( - Intrinsic::set_rounding, {}, + Intrinsic::set_rounding, {Builder.getInt32(static_cast(RoundingMode::TowardZero))}); II = cast(Result); EXPECT_EQ(II->getIntrinsicID(), Intrinsic::set_rounding); @@ -174,17 +179,17 @@ TEST_F(IRBuilderTest, IntrinsicsWithScalableVectors) { Type *DstVecTy = VectorType::get(Builder.getInt32Ty(), 4, true); Type *PredTy = VectorType::get(Builder.getInt1Ty(), 4, true); - SmallVector ArgTys; - ArgTys.push_back(UndefValue::get(DstVecTy)); - ArgTys.push_back(UndefValue::get(PredTy)); - ArgTys.push_back(UndefValue::get(SrcVecTy)); + SmallVector Args; + Args.push_back(UndefValue::get(DstVecTy)); + Args.push_back(UndefValue::get(PredTy)); + Args.push_back(UndefValue::get(SrcVecTy)); - Call = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fcvtzs_i32f16, {}, - ArgTys, nullptr, "aarch64.sve.fcvtzs.i32f16"); + Call = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_fcvtzs_i32f16, Args, + nullptr, "aarch64.sve.fcvtzs.i32f16"); FTy = Call->getFunctionType(); EXPECT_EQ(FTy->getReturnType(), DstVecTy); - for (unsigned i = 0; i != ArgTys.size(); ++i) - EXPECT_EQ(FTy->getParamType(i), ArgTys[i]->getType()); + for (unsigned i = 0; i != Args.size(); ++i) + EXPECT_EQ(FTy->getParamType(i), Args[i]->getType()); // Test scalable flag isn't dropped for intrinsic defined with // LLVMScalarOrSameVectorWidth. 
@@ -193,19 +198,18 @@ TEST_F(IRBuilderTest, IntrinsicsWithScalableVectors) { Type *PtrToVecTy = Builder.getPtrTy(); PredTy = VectorType::get(Builder.getInt1Ty(), 4, true); - ArgTys.clear(); - ArgTys.push_back(UndefValue::get(PtrToVecTy)); - ArgTys.push_back(UndefValue::get(Builder.getInt32Ty())); - ArgTys.push_back(UndefValue::get(PredTy)); - ArgTys.push_back(UndefValue::get(VecTy)); + Args.clear(); + Args.push_back(UndefValue::get(PtrToVecTy)); + Args.push_back(UndefValue::get(Builder.getInt32Ty())); + Args.push_back(UndefValue::get(PredTy)); + Args.push_back(UndefValue::get(VecTy)); - Call = Builder.CreateIntrinsic(Intrinsic::masked_load, - {VecTy, PtrToVecTy}, ArgTys, - nullptr, "masked.load"); + Call = Builder.CreateIntrinsic(Intrinsic::masked_load, {VecTy, PtrToVecTy}, + Args, nullptr, "masked.load"); FTy = Call->getFunctionType(); EXPECT_EQ(FTy->getReturnType(), VecTy); - for (unsigned i = 0; i != ArgTys.size(); ++i) - EXPECT_EQ(FTy->getParamType(i), ArgTys[i]->getType()); + for (unsigned i = 0; i != Args.size(); ++i) + EXPECT_EQ(FTy->getParamType(i), Args[i]->getType()); } TEST_F(IRBuilderTest, CreateVScale) { diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 3b3c45d969971..8d8f991e9ea49 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1243,8 +1243,8 @@ TEST(Local, CanReplaceOperandWithVariable) { // immarg. Type *PtrPtr = B.getPtrTy(0); Value *Alloca = B.CreateAlloca(PtrPtr, (unsigned)0); - CallInst *GCRoot = B.CreateIntrinsic(Intrinsic::gcroot, {}, - {Alloca, Constant::getNullValue(PtrPtr)}); + CallInst *GCRoot = B.CreateIntrinsic( + Intrinsic::gcroot, {Alloca, Constant::getNullValue(PtrPtr)}); EXPECT_TRUE(canReplaceOperandWithVariable(GCRoot, 0)); // Alloca EXPECT_FALSE(canReplaceOperandWithVariable(GCRoot, 1)); EXPECT_FALSE(canReplaceOperandWithVariable(GCRoot, 2));

From 50949ebf523cc09cc911a12691fb79b6ac97102a Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Mon, 31 Mar 2025 08:19:41 -0700
Subject: [PATCH 0099/1029] [lldb] Expose the Target API mutex through the SB API (#133295)

Expose the target API mutex through the SB API. This is motivated by lldb-dap, which is built on top of the SB API and needs a way to execute a series of SB API calls in an atomic manner (see #131242). We can solve this problem by either introducing an additional layer of locking at the DAP level or by exposing the existing locking at the SB API level. This patch implements the second approach.

This was discussed in an RFC on Discourse [0]. The original implementation exposed a move-only lock rather than a mutex [1], which doesn't work well with SWIG 4.0 [2]. This implements the alternative solution of exposing the mutex rather than the lock. The SBMutex conforms to the BasicLockable requirement [3] (which is why the methods are called `lock` and `unlock` rather than Lock and Unlock) so it can be used with `std::lock_guard` and `std::unique_lock`.
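For illustration only (this sketch is not part of the patch): because SBMutex is BasicLockable, a client can make a series of SB API calls atomic by holding the target's API mutex for the duration of a scope. The helper name DoAtomicWork is hypothetical; SBTarget::GetAPIMutex and SBMutex::lock/unlock are the APIs added below.

    #include "lldb/API/SBMutex.h"
    #include "lldb/API/SBTarget.h"
    #include <mutex>

    // Hypothetical helper: holds the target's API mutex so the enclosed
    // SB API calls execute without interleaving from other threads.
    static void DoAtomicWork(lldb::SBTarget &target) {
      lldb::SBMutex mutex = target.GetAPIMutex();
      // SBMutex satisfies BasicLockable, so std::lock_guard locks it on
      // construction and unlocks it when the guard goes out of scope.
      std::lock_guard<lldb::SBMutex> guard(mutex);
      // ... perform SB API calls on `target` here ...
    }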
[0]: https://discourse.llvm.org/t/rfc-exposing-the-target-api-lock-through-the-sb-api/85215/6 [1]: https://github.com/llvm/llvm-project/pull/131404 [2]: https://discourse.llvm.org/t/rfc-bumping-the-minimum-swig-version-to-4-1-0/85377/9 [3]: https://en.cppreference.com/w/cpp/named_req/BasicLockable --- lldb/bindings/interface/SBMutexExtensions.i | 12 ++++ lldb/bindings/interfaces.swig | 4 +- lldb/include/lldb/API/LLDB.h | 1 + lldb/include/lldb/API/SBDefines.h | 1 + lldb/include/lldb/API/SBMutex.h | 45 ++++++++++++++ lldb/include/lldb/API/SBTarget.h | 4 +- lldb/source/API/CMakeLists.txt | 1 + lldb/source/API/SBMutex.cpp | 60 +++++++++++++++++++ lldb/source/API/SBTarget.cpp | 16 +++-- .../API/python_api/target/TestTargetAPI.py | 24 ++++++++ lldb/unittests/API/CMakeLists.txt | 1 + lldb/unittests/API/SBMutexTest.cpp | 57 ++++++++++++++++++ 12 files changed, 220 insertions(+), 6 deletions(-) create mode 100644 lldb/bindings/interface/SBMutexExtensions.i create mode 100644 lldb/include/lldb/API/SBMutex.h create mode 100644 lldb/source/API/SBMutex.cpp create mode 100644 lldb/unittests/API/SBMutexTest.cpp diff --git a/lldb/bindings/interface/SBMutexExtensions.i b/lldb/bindings/interface/SBMutexExtensions.i new file mode 100644 index 0000000000000..32d3fee468697 --- /dev/null +++ b/lldb/bindings/interface/SBMutexExtensions.i @@ -0,0 +1,12 @@ +%extend lldb::SBMutex { +#ifdef SWIGPYTHON + %pythoncode %{ + def __enter__(self): + self.lock() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.unlock() + %} +#endif +} diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index 6da56e4e0fa52..e71ed136f20e6 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -51,6 +51,7 @@ %include "./interface/SBMemoryRegionInfoListDocstrings.i" %include "./interface/SBModuleDocstrings.i" %include "./interface/SBModuleSpecDocstrings.i" +%include "./interface/SBMutexExtensions.i" %include "./interface/SBPlatformDocstrings.i" %include "./interface/SBProcessDocstrings.i" %include "./interface/SBProcessInfoDocstrings.i" @@ -121,8 +122,8 @@ %include "lldb/API/SBHostOS.h" %include "lldb/API/SBInstruction.h" %include "lldb/API/SBInstructionList.h" -%include "lldb/API/SBLanguages.h" %include "lldb/API/SBLanguageRuntime.h" +%include "lldb/API/SBLanguages.h" %include "lldb/API/SBLaunchInfo.h" %include "lldb/API/SBLineEntry.h" %include "lldb/API/SBListener.h" @@ -130,6 +131,7 @@ %include "lldb/API/SBMemoryRegionInfoList.h" %include "lldb/API/SBModule.h" %include "lldb/API/SBModuleSpec.h" +%include "lldb/API/SBMutex.h" %include "lldb/API/SBPlatform.h" %include "lldb/API/SBProcess.h" %include "lldb/API/SBProcessInfo.h" diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index 126fcef31b416..6485f35302a1c 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -50,6 +50,7 @@ #include "lldb/API/SBMemoryRegionInfoList.h" #include "lldb/API/SBModule.h" #include "lldb/API/SBModuleSpec.h" +#include "lldb/API/SBMutex.h" #include "lldb/API/SBPlatform.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBProcessInfo.h" diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h index ed5a80da117a5..85f6bbeea5bf9 100644 --- a/lldb/include/lldb/API/SBDefines.h +++ b/lldb/include/lldb/API/SBDefines.h @@ -89,6 +89,7 @@ class LLDB_API SBMemoryRegionInfoList; class LLDB_API SBModule; class LLDB_API SBModuleSpec; class LLDB_API SBModuleSpecList; +class LLDB_API SBMutex; class LLDB_API SBPlatform; class 
LLDB_API SBPlatformConnectOptions; class LLDB_API SBPlatformShellCommand; diff --git a/lldb/include/lldb/API/SBMutex.h b/lldb/include/lldb/API/SBMutex.h new file mode 100644 index 0000000000000..717d5f86cbc1c --- /dev/null +++ b/lldb/include/lldb/API/SBMutex.h @@ -0,0 +1,45 @@ +//===-- SBMutex.h ---------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBMUTEX_H +#define LLDB_API_SBMUTEX_H + +#include "lldb/API/SBDefines.h" +#include "lldb/lldb-forward.h" +#include <mutex> + +namespace lldb { + +class LLDB_API SBMutex { +public: + SBMutex(); + SBMutex(const SBMutex &rhs); + const SBMutex &operator=(const SBMutex &rhs); + ~SBMutex(); + + /// Returns true if this lock has ownership of the underlying mutex. + bool IsValid() const; + + /// Blocking operation that takes ownership of this lock. + void lock() const; + + /// Releases ownership of this lock. + void unlock() const; + +private: + // Private constructor used by SBTarget to create the Target API mutex. + // Requires a friend declaration. + SBMutex(lldb::TargetSP target_sp); + friend class SBTarget; + + std::shared_ptr<std::recursive_mutex> m_opaque_sp; +}; + +} // namespace lldb + +#endif diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index bb912ab41d0fe..17735fdca6559 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -342,7 +342,7 @@ class LLDB_API SBTarget { uint32_t GetAddressByteSize(); const char *GetTriple(); - + const char *GetABIName(); const char *GetLabel() const; @@ -946,6 +946,8 @@ class LLDB_API SBTarget { /// An error if a Trace already exists or the trace couldn't be created. lldb::SBTrace CreateTrace(SBError &error); + lldb::SBMutex GetAPIMutex() const; + protected: friend class SBAddress; friend class SBAddressRange; diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index 48d5cde5bf592..3bc569608e458 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -81,6 +81,7 @@ add_lldb_library(liblldb SHARED ${option_framework} SBMemoryRegionInfoList.cpp SBModule.cpp SBModuleSpec.cpp + SBMutex.cpp SBPlatform.cpp SBProcess.cpp SBProcessInfo.cpp diff --git a/lldb/source/API/SBMutex.cpp b/lldb/source/API/SBMutex.cpp new file mode 100644 index 0000000000000..445076b5a9174 --- /dev/null +++ b/lldb/source/API/SBMutex.cpp @@ -0,0 +1,60 @@ +//===-- SBMutex.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBMutex.h" +#include "lldb/Target/Target.h" +#include "lldb/Utility/Instrumentation.h" +#include "lldb/lldb-forward.h" +#include <memory> +#include <mutex> + +using namespace lldb; +using namespace lldb_private; + +SBMutex::SBMutex() : m_opaque_sp(std::make_shared<std::recursive_mutex>()) { + LLDB_INSTRUMENT_VA(this); +} + +SBMutex::SBMutex(const SBMutex &rhs) : m_opaque_sp(rhs.m_opaque_sp) { + LLDB_INSTRUMENT_VA(this); +} + +const SBMutex &SBMutex::operator=(const SBMutex &rhs) { + LLDB_INSTRUMENT_VA(this); + + m_opaque_sp = rhs.m_opaque_sp; + return *this; +} + +SBMutex::SBMutex(lldb::TargetSP target_sp) + : m_opaque_sp(std::shared_ptr<std::recursive_mutex>( + target_sp, &target_sp->GetAPIMutex())) { + LLDB_INSTRUMENT_VA(this, target_sp); +} + +SBMutex::~SBMutex() { LLDB_INSTRUMENT_VA(this); } + +bool SBMutex::IsValid() const { + LLDB_INSTRUMENT_VA(this); + + return static_cast<bool>(m_opaque_sp); +} + +void SBMutex::lock() const { + LLDB_INSTRUMENT_VA(this); + + if (m_opaque_sp) + m_opaque_sp->lock(); +} + +void SBMutex::unlock() const { + LLDB_INSTRUMENT_VA(this); + + if (m_opaque_sp) + m_opaque_sp->unlock(); +} diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index dd9caa724ea36..0fed1bbfed6a7 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -7,10 +7,6 @@ //===----------------------------------------------------------------------===// #include "lldb/API/SBTarget.h" -#include "lldb/Utility/Instrumentation.h" -#include "lldb/Utility/LLDBLog.h" -#include "lldb/lldb-public.h" - #include "lldb/API/SBBreakpoint.h" #include "lldb/API/SBDebugger.h" #include "lldb/API/SBEnvironment.h" @@ -20,6 +16,7 @@ #include "lldb/API/SBListener.h" #include "lldb/API/SBModule.h" #include "lldb/API/SBModuleSpec.h" +#include "lldb/API/SBMutex.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBSourceManager.h" #include "lldb/API/SBStream.h" @@ -58,11 +55,14 @@ #include "lldb/Utility/ArchSpec.h" #include "lldb/Utility/Args.h" #include "lldb/Utility/FileSpec.h" +#include "lldb/Utility/Instrumentation.h" +#include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/ProcessInfo.h" #include "lldb/Utility/RegularExpression.h" #include "lldb/ValueObject/ValueObjectConstResult.h" #include "lldb/ValueObject/ValueObjectList.h" #include "lldb/ValueObject/ValueObjectVariable.h" +#include "lldb/lldb-public.h" #include "Commands/CommandObjectBreakpoint.h" #include "lldb/Interpreter/CommandReturnObject.h" @@ -2439,3 +2439,11 @@ lldb::SBTrace SBTarget::CreateTrace(lldb::SBError &error) { } return SBTrace(); } + +lldb::SBMutex SBTarget::GetAPIMutex() const { + LLDB_INSTRUMENT_VA(this); + + if (TargetSP target_sp = GetSP()) + return lldb::SBMutex(target_sp); + return lldb::SBMutex(); +} diff --git a/lldb/test/API/python_api/target/TestTargetAPI.py b/lldb/test/API/python_api/target/TestTargetAPI.py index 155a25b576b03..67b9d192bc625 100644 --- a/lldb/test/API/python_api/target/TestTargetAPI.py +++ b/lldb/test/API/python_api/target/TestTargetAPI.py @@ -537,3 +537,27 @@ def test_setting_selected_target_with_invalid_target(self): """Make sure we don't crash when trying to select invalid target.""" target = lldb.SBTarget() self.dbg.SetSelectedTarget(target) + + @no_debug_info_test + def test_get_api_mutex(self): + """Make sure we can lock and unlock the API mutex from Python.""" + target = self.dbg.GetDummyTarget() + + mutex = target.GetAPIMutex() +
self.assertTrue(mutex.IsValid()) + mutex.lock() + # The API call below doesn't actually matter, it's just there to + # confirm we don't block on the API lock. + target.BreakpointCreateByName("foo", "bar") + mutex.unlock() + + @no_debug_info_test + def test_get_api_mutex_with_statement(self): + """Make sure we can lock and unlock the API mutex using a with-statement from Python.""" + target = self.dbg.GetDummyTarget() + + with target.GetAPIMutex() as mutex: + self.assertTrue(mutex.IsValid()) + # The API call below doesn't actually matter, it's just there to + # confirm we don't block on the API lock. + target.BreakpointCreateByName("foo", "bar") diff --git a/lldb/unittests/API/CMakeLists.txt b/lldb/unittests/API/CMakeLists.txt index fe2ff684a5d92..8bdc806878239 100644 --- a/lldb/unittests/API/CMakeLists.txt +++ b/lldb/unittests/API/CMakeLists.txt @@ -1,6 +1,7 @@ add_lldb_unittest(APITests SBCommandInterpreterTest.cpp SBLineEntryTest.cpp + SBMutexTest.cpp LINK_LIBS liblldb diff --git a/lldb/unittests/API/SBMutexTest.cpp b/lldb/unittests/API/SBMutexTest.cpp new file mode 100644 index 0000000000000..0b888c2725aa9 --- /dev/null +++ b/lldb/unittests/API/SBMutexTest.cpp @@ -0,0 +1,57 @@ +//===-- SBMutexTest.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Use the umbrella header for -Wdocumentation. +#include "lldb/API/LLDB.h" + +#include "TestingSupport/SubsystemRAII.h" +#include "lldb/API/SBDebugger.h" +#include "lldb/API/SBTarget.h" +#include "gtest/gtest.h" +#include <atomic> +#include <chrono> +#include <future> +#include <mutex> + +using namespace lldb; +using namespace lldb_private; + +class SBMutexTest : public testing::Test { +protected: + void SetUp() override { debugger = SBDebugger::Create(); } + void TearDown() override { SBDebugger::Destroy(debugger); } + + SubsystemRAII<lldb::SBDebugger> subsystems; + SBDebugger debugger; +}; + +TEST_F(SBMutexTest, LockTest) { + lldb::SBTarget target = debugger.GetDummyTarget(); + + std::future<void> f; + { + std::atomic<bool> locked = false; + lldb::SBMutex lock = target.GetAPIMutex(); + std::lock_guard<lldb::SBMutex> lock_guard(lock); + ASSERT_FALSE(locked.exchange(true)); + + f = std::async(std::launch::async, [&]() { + ASSERT_TRUE(locked); + target.BreakpointCreateByName("foo", "bar"); + ASSERT_FALSE(locked); + }); + ASSERT_TRUE(f.valid()); + + // Wait 500ms to confirm the thread is blocked. + auto status = f.wait_for(std::chrono::milliseconds(500)); + ASSERT_EQ(status, std::future_status::timeout); + + ASSERT_TRUE(locked.exchange(false)); + } + f.wait(); +} From 945c494e2c3c078e26ff521ef3e9455e0ff764ac Mon Sep 17 00:00:00 2001 From: Yuval Deutscher Date: Mon, 31 Mar 2025 18:20:40 +0300 Subject: [PATCH 0100/1029] [lldb] Use correct path for lldb-server executable (#131519) Hey, This solves an issue where running lldb-server-20 with a non-absolute path (for example, when it's installed into `/usr/bin` and the user runs it as `lldb-server-20 ...` and not `/usr/bin/lldb-server-20 ...`) fails with `error: spawn_process failed: execve failed: No such file or directory`. The underlying issue is that when run that way, it attempts to execute a binary named `lldb-server-20` from its current directory.
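For illustration, the failure boils down to execve semantics rather than anything LLDB-specific; a tiny standalone reproduction (hypothetical, not code from this patch):

    #include <cstdio>
    #include <unistd.h>

    int main() {
      // A pathname containing no slash is NOT searched in PATH by
      // execv/execve; the kernel resolves it relative to the current
      // working directory.
      char *const argv[] = {const_cast<char *>("lldb-server-20"), nullptr};
      execv("lldb-server-20", argv); // effectively ./lldb-server-20
      std::perror("execv");          // e.g. "No such file or directory"
      return 1;
    }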
This is also a mild security hazard because lldb-server is often being run as root in the directory /tmp, meaning that an unprivileged user can create the file /tmp/lldb-server-20 and lldb-server will execute it as root. (Although, well, it's a debugging server we're talking about, so that may not be a real concern.) I haven't previously contributed to this project; if you want me to change anything in the code please don't hesitate to let me know. --- lldb/tools/lldb-server/lldb-platform.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp index 880b45b989b9c..51174a0f443c3 100644 --- a/lldb/tools/lldb-server/lldb-platform.cpp +++ b/lldb/tools/lldb-server/lldb-platform.cpp @@ -31,6 +31,7 @@ #include "Plugins/Process/gdb-remote/ProcessGDBRemoteLog.h" #include "lldb/Host/ConnectionFileDescriptor.h" #include "lldb/Host/HostGetOpt.h" +#include "lldb/Host/HostInfo.h" #include "lldb/Host/MainLoop.h" #include "lldb/Host/OptionParser.h" #include "lldb/Host/Socket.h" @@ -256,8 +257,9 @@ static void client_handle(GDBRemoteCommunicationServerPlatform &platform, printf("Disconnected.\n"); } -static Status spawn_process(const char *progname, const Socket *conn_socket, - uint16_t gdb_port, const lldb_private::Args &args, +static Status spawn_process(const char *progname, const FileSpec &prog, + const Socket *conn_socket, uint16_t gdb_port, + const lldb_private::Args &args, const std::string &log_file, const StringRef log_channels, MainLoop &main_loop) { Status error; @@ -267,9 +269,10 @@ static Status spawn_process(const char *progname, const Socket *conn_socket, ProcessLaunchInfo launch_info; - FileSpec self_spec(progname, FileSpec::Style::native); - launch_info.SetExecutableFile(self_spec, true); + launch_info.SetExecutableFile(prog, false); + launch_info.SetArg0(progname); Args &self_args = launch_info.GetArguments(); + self_args.AppendArgument(progname); self_args.AppendArgument(llvm::StringRef("platform")); self_args.AppendArgument(llvm::StringRef("--child-platform-fd")); self_args.AppendArgument(llvm::to_string(shared_socket.GetSendableFD())); @@ -551,9 +554,10 @@ int main_platform(int argc, char *argv[]) { log_channels, &main_loop, &platform_handles](std::unique_ptr<Socket> sock_up) { printf("Connection established.\n"); - Status error = spawn_process(progname, sock_up.get(), - gdbserver_port, inferior_arguments, - log_file, log_channels, main_loop); + Status error = spawn_process( + progname, HostInfo::GetProgramFileSpec(), sock_up.get(), + gdbserver_port, inferior_arguments, log_file, log_channels, + main_loop); if (error.Fail()) { Log *log = GetLog(LLDBLog::Platform); LLDB_LOGF(log, "spawn_process failed: %s", error.AsCString()); From b91f978647c1468ae3dd469971a8185c0b788830 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 31 Mar 2025 15:20:59 +0000 Subject: [PATCH 0101/1029] [gn build] Port 50949ebf523c --- llvm/utils/gn/secondary/lldb/source/API/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn index 5e833cae6a4b1..cf6e3515d0c09 100644 --- a/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/API/BUILD.gn @@ -80,6 +80,7 @@ target(liblldb_type, "liblldb") { "SBMemoryRegionInfoList.cpp", "SBModule.cpp", "SBModuleSpec.cpp", + "SBMutex.cpp", "SBPlatform.cpp", "SBProcess.cpp", "SBProcessInfo.cpp", From
c20bea09c25214e15cf704ea3749894b6d531ce1 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 31 Mar 2025 16:24:45 +0100 Subject: [PATCH 0102/1029] [LV] Regen a test with UTC (#133432) --- .../runtime-checks-difference.ll | 214 +++++++++--------- 1 file changed, 113 insertions(+), 101 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll index ecdc4ed416d47..618f64315553d 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-checks-difference.ll @@ -1,18 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "vector.ph" --version 5 ; RUN: opt %s -passes=loop-vectorize -hoist-runtime-checks=false -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" define void @same_step_and_size(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: @same_step_and_size( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 -; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck -; CHECK: vector.memcheck: +; CHECK-LABEL: define void @same_step_and_size( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[A2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; entry: br label %loop @@ -33,16 +35,17 @@ exit: } define void @same_step_and_size_no_dominance_between_accesses(ptr %a, ptr %b, i64 %n, i64 %x) { -; CHECK-LABEL: @same_step_and_size_no_dominance_between_accesses( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64 -; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck -; CHECK: vector.memcheck: +; CHECK-LABEL: define void @same_step_and_size_no_dominance_between_accesses( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]], i64 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[B2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; entry: br label %loop @@ -72,19 +75,20 @@ exit: } define void @different_steps_and_different_access_sizes(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: 
@different_steps_and_different_access_sizes( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck -; CHECK: vector.memcheck: -; CHECK-NEXT: [[N_SHL_2:%.]] = shl i64 %n, 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]] -; CHECK-NEXT: [[N_SHL_1:%.]] = shl i64 %n, 1 -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr %a, i64 [[N_SHL_1]] -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr %b, [[SCEVGEP4]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr %a, [[SCEVGEP]] +; CHECK-LABEL: define void @different_steps_and_different_access_sizes( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[N]], 1 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; entry: br label %loop @@ -106,17 +110,18 @@ exit: } define void @steps_match_but_different_access_sizes_1(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: @steps_match_but_different_access_sizes_1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 -; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck -; CHECK: vector.memcheck: +; CHECK-LABEL: define void @steps_match_but_different_access_sizes_1( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[B1]], -2 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[A2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; entry: br label %loop @@ -140,17 +145,18 @@ exit: ; Same as @steps_match_but_different_access_sizes_1, but with source and sink ; accesses flipped. 
define void @steps_match_but_different_access_sizes_2(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: @steps_match_but_different_access_sizes_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B:%.*]] to i64 -; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A:%.*]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck -; CHECK: vector.memcheck: +; CHECK-LABEL: define void @steps_match_but_different_access_sizes_2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A1]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[B2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; entry: br label %loop @@ -175,26 +181,29 @@ exit: ; one of the add-recs used is invariant in the inner loop. ; Test case for PR57315. define void @nested_loop_outer_iv_addrec_invariant_in_inner1(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner1( -; CHECK: entry: -; CHECK-NEXT: [[N_SHL_2:%.]] = shl i64 %n, 2 -; CHECK-NEXT: [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]] -; CHECK-NEXT: br label %outer - -; CHECK: outer.header: -; CHECK: [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2 -; CHECK-NEXT: [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]] -; CHECK-NEXT: [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4 -; CHECK-NEXT: [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]] -; CHECK: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck - -; CHECK: vector.memcheck: -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]] +; CHECK-LABEL: define void @nested_loop_outer_iv_addrec_invariant_in_inner1( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; 
CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; + + entry: br label %outer.header @@ -225,26 +234,29 @@ exit: ; Same as @nested_loop_outer_iv_addrec_invariant_in_inner1 but with dependence ; sink and source swapped. define void @nested_loop_outer_iv_addrec_invariant_in_inner2(ptr %a, ptr %b, i64 %n) { -; CHECK-LABEL: @nested_loop_outer_iv_addrec_invariant_in_inner2( -; CHECK: entry: -; CHECK-NEXT: [[N_SHL_2:%.]] = shl i64 %n, 2 -; CHECK-NEXT: [[B_GEP_UPPER:%.*]] = getelementptr i8, ptr %b, i64 [[N_SHL_2]] -; CHECK-NEXT: br label %outer - -; CHECK: outer.header: -; CHECK: [[OUTER_IV_SHL_2:%.]] = shl i64 %outer.iv, 2 -; CHECK-NEXT: [[A_GEP_UPPER:%.*]] = getelementptr nuw i8, ptr %a, i64 [[OUTER_IV_SHL_2]] -; CHECK-NEXT: [[OUTER_IV_4:%.]] = add i64 [[OUTER_IV_SHL_2]], 4 -; CHECK-NEXT: [[A_GEP_UPPER_4:%.*]] = getelementptr i8, ptr %a, i64 [[OUTER_IV_4]] -; CHECK: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %scalar.ph, label %vector.memcheck - -; CHECK: vector.memcheck: -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr %b, [[A_GEP_UPPER_4]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A_GEP_UPPER]], [[B_GEP_UPPER]] +; CHECK-LABEL: define void @nested_loop_outer_iv_addrec_invariant_in_inner2( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[N]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP0]] +; CHECK-NEXT: br label %[[OUTER_HEADER:.*]] +; CHECK: [[OUTER_HEADER]]: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[OUTER_IV]], 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 4 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP2]] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[OUTER_IV]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %scalar.ph, label %vector.ph +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; + + entry: br label %outer.header @@ -276,20 +288,21 @@ exit: ; of the outer loop as start value. It is sufficient to subtract the start ; values (%dst, %src) of the outer AddRecs. 
define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %m, i64 noundef %n) { -; CHECK-LABEL: @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64 -; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST:%.*]] to i64 +; CHECK-LABEL: define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec( +; CHECK-SAME: ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i64 noundef [[M:%.*]], i64 noundef [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[DST1]], [[SRC2]] -; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] -; CHECK: outer.loop: -; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ] +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] -; CHECK: vector.memcheck: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[SUB]], 16 -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[DIFF_CHECK]], [[SCALAR_PH]], [[VECTOR_PH:label %.*]] ; entry: br label %outer.loop @@ -321,15 +334,16 @@ outer.exit: } define void @use_diff_checks_when_retrying_with_rt_checks(i64 %off, ptr %dst, ptr %src) { -; CHECK-LABEL: @use_diff_checks_when_retrying_with_rt_checks( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr %src to i64 -; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr %dst to i64 -; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.memcheck -; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 %off, -8 +; CHECK-LABEL: define void @use_diff_checks_when_retrying_with_rt_checks( +; CHECK-SAME: i64 [[OFF:%.*]], ptr [[DST:%.*]], ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: br i1 false, [[SCALAR_PH:label %.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[OFF]], -8 ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 %off, 3 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[OFF]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[DST1]], [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[TMP2]], [[SRC2]] ; CHECK-NEXT: [[DIFF_CHECK3:%.*]] = icmp ult i64 [[TMP3]], 32 @@ -346,9 +360,7 @@ define void @use_diff_checks_when_retrying_with_rt_checks(i64 %off, ptr %dst, pt ; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[SRC2]] ; CHECK-NEXT: [[DIFF_CHECK8:%.*]] = icmp ult i64 [[TMP9]], 32 ; CHECK-NEXT: [[CONFLICT_RDX9:%.*]] = or i1 [[CONFLICT_RDX7]], [[DIFF_CHECK8]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], label %scalar.ph, label %vector.ph -; CHECK: vector.ph: -; CHECK-NEXT: br label %vector.body +; CHECK-NEXT: br i1 [[CONFLICT_RDX9]], [[SCALAR_PH]], 
[[VECTOR_PH:label %.*]] ; entry: br label %loop From bd862a459d75ef137235853c994dea97a0bc7794 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 31 Mar 2025 16:25:00 +0100 Subject: [PATCH 0103/1029] [AMDGPU] Add subtarget feature for v_lshl_add_u64. NFC. (#133723) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +++++++++- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 ++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 2 +- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 84619dd656f35..6963b24dd8a5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1269,6 +1269,10 @@ def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32 "Use a block size of 32 for dynamic VGPR allocation (default is 16)" >; +def FeatureLshlAddU64Inst + : SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true", + "Has v_lshl_add_u64 instruction">; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -1622,7 +1626,8 @@ def FeatureISAVersion9_4_Common : FeatureSet< FeatureAtomicFMinFMaxF64FlatInsts, FeatureAgentScopeFineGrainedRemoteMemoryAtomics, FeatureMemoryAtomicFAddF32DenormalSupport, - FeatureFlatBufferGlobalAtomicFaddF64Inst + FeatureFlatBufferGlobalAtomicFaddF64Inst, + FeatureLshlAddU64Inst, ]>; def FeatureISAVersion9_5_Common : FeatureSet< @@ -2554,6 +2559,9 @@ def HasXF32Insts : Predicate<"Subtarget->hasXF32Insts()">, def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">, AssemblerPredicate<(all_of FeatureAshrPkInsts)>; +def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">, + AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>; + // Include AMDGPU TD files include "SISchedule.td" include "GCNProcessors.td" diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 7384278d81cc1..301e4c0275ad4 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -257,6 +257,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMinimum3Maximum3F32 = false; bool HasMinimum3Maximum3F16 = false; bool HasMinimum3Maximum3PKF16 = false; + bool HasLshlAddU64Inst = false; bool RequiresCOV6 = false; @@ -1140,7 +1141,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasMovB64() const { return GFX940Insts; } - bool hasLshlAddB64() const { return GFX940Insts; } + bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } bool enableSIScheduler() const { return EnableSIScheduler; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 56149bcd8a839..96c113cc5d24c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5246,7 +5246,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); - if (IsAdd && ST.hasLshlAddB64()) { + if (IsAdd && ST.hasLshlAddU64Inst()) { auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), Dest.getReg()) .add(Src0) diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 14da3447a2256..9feb5df2f9203 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -679,7 +679,7 @@ defm V_LSHL_OR_B32 : VOP3Inst 
<"v_lshl_or_b32", VOP3_Profile>; let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0, From 799e9053641a6478d3144866a97737b37b87c260 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 31 Mar 2025 08:29:20 -0700 Subject: [PATCH 0104/1029] [lldb] Create a default rate limit constant in Progress (NFC) (#133506) In #133211, Greg suggested making the rate limit configurable through a setting. Although adding the setting is easy, the two places where we currently use rate limiting aren't tied to a particular debugger. Although it'd be possible to hook up, given how few progress events currently implement rate limiting, I don't think it's worth threading this through, if that's even possible. I still think it's a good idea to be consistent and make it easy to pick the same rate limiting value, so I've moved it into a constant in the Progress class. --- lldb/include/lldb/Core/Progress.h | 4 ++++ lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp | 2 +- .../Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lldb/include/lldb/Core/Progress.h b/lldb/include/lldb/Core/Progress.h index 3003568e8946b..93e34084d7ec1 100644 --- a/lldb/include/lldb/Core/Progress.h +++ b/lldb/include/lldb/Core/Progress.h @@ -115,6 +115,10 @@ class Progress { /// Used to indicate a non-deterministic progress report static constexpr uint64_t kNonDeterministicTotal = UINT64_MAX; + /// The default report time for high frequency progress reports. + static constexpr std::chrono::milliseconds kDefaultHighFrequencyReportTime = + std::chrono::milliseconds(20); + private: void ReportProgress(); static std::atomic g_id; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp index 6f2c45e74132c..047967a30d098 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/ManualDWARFIndex.cpp @@ -81,7 +81,7 @@ void ManualDWARFIndex::Index() { const uint64_t total_progress = units_to_index.size() * 2 + 8; Progress progress("Manually indexing DWARF", module_desc.GetData(), total_progress, /*debugger=*/nullptr, - /*minimum_report_time=*/std::chrono::milliseconds(20)); + Progress::kDefaultHighFrequencyReportTime); // Share one thread pool across operations to avoid the overhead of // recreating the threads. diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index e346d588a449f..ce351274b4576 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -724,7 +724,7 @@ void SymbolFileDWARFDebugMap::ForEachSymbolFile( const size_t num_oso_idxs = m_compile_unit_infos.size(); Progress progress(std::move(description), "", num_oso_idxs, /*debugger=*/nullptr, - /*minimum_report_time=*/std::chrono::milliseconds(20)); + Progress::kDefaultHighFrequencyReportTime); for (uint32_t oso_idx = 0; oso_idx < num_oso_idxs; ++oso_idx) { if (SymbolFileDWARF *oso_dwarf = GetSymbolFileByOSOIndex(oso_idx)) { progress.Increment(oso_idx, oso_dwarf->GetObjectFile() From ea06f7f96fb1ce5a77439cf1a26f97c2f2488648 Mon Sep 17 00:00:00 2001 From: Paul Bowen-Huggett Date: Mon, 31 Mar 2025 17:51:34 +0200 Subject: [PATCH 0105/1029] [RISCV] For RV32C, disassembly of c.slli should fail when immediate > 31 (#133713) Fixes #133712. 
The change causes `c.slli` instructions whose immediate has bit 5 set to be rejected when disassembling RV32C. Added a test to exhaustively cover c.slli for 32-bit targets. A minor tweak to make the debug output a little more readable. The spec (version 20240411) says: > For RV32C, shamt[5] must be zero; the code points with shamt[5]=1 are designated for custom extensions. For RV32C and RV64C, the shift amount must be non-zero; the code points with shamt=0 are HINTs. For all base ISAs, the code points with rd=x0 are HINTs, except those with shamt[5]=1 in RV32C. --- .../RISCV/Disassembler/RISCVDisassembler.cpp | 12 +- llvm/test/MC/Disassembler/RISCV/c_slli.txt | 3119 +++++++++++++++++ 2 files changed, 3127 insertions(+), 4 deletions(-) create mode 100644 llvm/test/MC/Disassembler/RISCV/c_slli.txt diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b46b72b4b73e9..b22a4a7246c23 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -558,8 +558,12 @@ static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, uint32_t Insn, const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createReg(RISCV::X0)); Inst.addOperand(Inst.getOperand(0)); - uint32_t UImm6 = - fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + + uint32_t UImm6 = fieldFromInstruction(Insn, 12, 1) << 5; + // On RV32C, uimm[5]=1 is reserved for custom extensions. + if (UImm6 != 0 && Decoder->getSubtargetInfo().hasFeature(RISCV::Feature32Bit)) + return MCDisassembler::Fail; + UImm6 |= fieldFromInstruction(Insn, 2, 5); [[maybe_unused]] DecodeStatus Result = decodeUImmOperand<6>(Inst, UImm6, Address, Decoder); assert(Result == MCDisassembler::Success && "Invalid immediate"); @@ -784,7 +788,7 @@ DecodeStatus RISCVDisassembler::getInstruction16(MCInst &MI, uint64_t &Size, if (!Entry.haveContainedFeatures(STI.getFeatureBits())) continue; - LLVM_DEBUG(dbgs() << "Trying " << Entry.Desc << "table:\n"); + LLVM_DEBUG(dbgs() << "Trying " << Entry.Desc << " table:\n"); DecodeStatus Result = decodeInstruction(Entry.Table, MI, Insn, Address, this, STI); if (Result == MCDisassembler::Fail) @@ -820,7 +824,7 @@ DecodeStatus RISCVDisassembler::getInstruction48(MCInst &MI, uint64_t &Size, if (!Entry.haveContainedFeatures(STI.getFeatureBits())) continue; - LLVM_DEBUG(dbgs() << "Trying " << Entry.Desc << "table:\n"); + LLVM_DEBUG(dbgs() << "Trying " << Entry.Desc << " table:\n"); DecodeStatus Result = decodeInstruction(Entry.Table, MI, Insn, Address, this, STI); if (Result == MCDisassembler::Fail) diff --git a/llvm/test/MC/Disassembler/RISCV/c_slli.txt b/llvm/test/MC/Disassembler/RISCV/c_slli.txt new file mode 100644 index 0000000000000..6e8dc4c05147f --- /dev/null +++ b/llvm/test/MC/Disassembler/RISCV/c_slli.txt @@ -0,0 +1,3119 @@ +# This test verifies the disassembler's behavior for all C.SLLI opcodes, +# including the RV32C-reserved encodings with shamt[5]=1 +# +# RUN: llvm-mc --disassemble -triple=riscv32 -mattr=+c -M no-aliases \ +# RUN: --show-encoding < %s | FileCheck --check-prefixes=GOOD %s +# +# RUN: llvm-mc --disassemble -triple=riscv64 -mattr=+c -M no-aliases \ +# RUN: --show-encoding < %s | FileCheck --check-prefixes=GOOD,GOOD64 %s +# +# RUN: llvm-mc --disassemble -triple=riscv32 -mattr=+c -M no-aliases \ +# RUN: --show-encoding < %s 2>&1 | FileCheck --check-prefix=BAD32 %s +# +# RUN: llvm-mc --disassemble -triple=riscv32 -mattr=+c -mattr=+no-rvc-hints \ +# RUN: -M
no-aliases --show-encoding < %s 2>&1 | \ +# RUN: FileCheck --check-prefix=NOHINTS %s + +0x06 0x00 # GOOD: c.slli zero, 1 +0x06 0x00 # NOHINTS: invalid instruction encoding +0x0A 0x00 # GOOD: c.slli zero, 2 +0x0A 0x00 # NOHINTS: invalid instruction encoding +0x0E 0x00 # GOOD: c.slli zero, 3 +0x0E 0x00 # NOHINTS: invalid instruction encoding +0x12 0x00 # GOOD: c.slli zero, 4 +0x12 0x00 # NOHINTS: invalid instruction encoding +0x16 0x00 # GOOD: c.slli zero, 5 +0x16 0x00 # NOHINTS: invalid instruction encoding +0x1A 0x00 # GOOD: c.slli zero, 6 +0x1A 0x00 # NOHINTS: invalid instruction encoding +0x1E 0x00 # GOOD: c.slli zero, 7 +0x1E 0x00 # NOHINTS: invalid instruction encoding +0x22 0x00 # GOOD: c.slli zero, 8 +0x22 0x00 # NOHINTS: invalid instruction encoding +0x26 0x00 # GOOD: c.slli zero, 9 +0x26 0x00 # NOHINTS: invalid instruction encoding +0x2A 0x00 # GOOD: c.slli zero, 10 +0x2A 0x00 # NOHINTS: invalid instruction encoding +0x2E 0x00 # GOOD: c.slli zero, 11 +0x2E 0x00 # NOHINTS: invalid instruction encoding +0x32 0x00 # GOOD: c.slli zero, 12 +0x32 0x00 # NOHINTS: invalid instruction encoding +0x36 0x00 # GOOD: c.slli zero, 13 +0x36 0x00 # NOHINTS: invalid instruction encoding +0x3A 0x00 # GOOD: c.slli zero, 14 +0x3A 0x00 # NOHINTS: invalid instruction encoding +0x3E 0x00 # GOOD: c.slli zero, 15 +0x3E 0x00 # NOHINTS: invalid instruction encoding +0x42 0x00 # GOOD: c.slli zero, 16 +0x42 0x00 # NOHINTS: invalid instruction encoding +0x46 0x00 # GOOD: c.slli zero, 17 +0x46 0x00 # NOHINTS: invalid instruction encoding +0x4A 0x00 # GOOD: c.slli zero, 18 +0x4A 0x00 # NOHINTS: invalid instruction encoding +0x4E 0x00 # GOOD: c.slli zero, 19 +0x4E 0x00 # NOHINTS: invalid instruction encoding +0x52 0x00 # GOOD: c.slli zero, 20 +0x52 0x00 # NOHINTS: invalid instruction encoding +0x56 0x00 # GOOD: c.slli zero, 21 +0x56 0x00 # NOHINTS: invalid instruction encoding +0x5A 0x00 # GOOD: c.slli zero, 22 +0x5A 0x00 # NOHINTS: invalid instruction encoding +0x5E 0x00 # GOOD: c.slli zero, 23 +0x5E 0x00 # NOHINTS: invalid instruction encoding +0x62 0x00 # GOOD: c.slli zero, 24 +0x62 0x00 # NOHINTS: invalid instruction encoding +0x66 0x00 # GOOD: c.slli zero, 25 +0x66 0x00 # NOHINTS: invalid instruction encoding +0x6A 0x00 # GOOD: c.slli zero, 26 +0x6A 0x00 # NOHINTS: invalid instruction encoding +0x6E 0x00 # GOOD: c.slli zero, 27 +0x6E 0x00 # NOHINTS: invalid instruction encoding +0x72 0x00 # GOOD: c.slli zero, 28 +0x72 0x00 # NOHINTS: invalid instruction encoding +0x76 0x00 # GOOD: c.slli zero, 29 +0x76 0x00 # NOHINTS: invalid instruction encoding +0x7A 0x00 # GOOD: c.slli zero, 30 +0x7A 0x00 # NOHINTS: invalid instruction encoding +0x7E 0x00 # GOOD: c.slli zero, 31 +0x7E 0x00 # NOHINTS: invalid instruction encoding +0x02 0x10 # BAD32: invalid instruction encoding +0x02 0x10 # GOOD64: c.slli zero, 32 +0x02 0x10 # NOHINTS: invalid instruction encoding +0x06 0x10 # BAD32: invalid instruction encoding +0x06 0x10 # GOOD64: c.slli zero, 33 +0x06 0x10 # NOHINTS: invalid instruction encoding +0x0A 0x10 # BAD32: invalid instruction encoding +0x0A 0x10 # GOOD64: c.slli zero, 34 +0x0A 0x10 # NOHINTS: invalid instruction encoding +0x0E 0x10 # BAD32: invalid instruction encoding +0x0E 0x10 # GOOD64: c.slli zero, 35 +0x0E 0x10 # NOHINTS: invalid instruction encoding +0x12 0x10 # BAD32: invalid instruction encoding +0x12 0x10 # GOOD64: c.slli zero, 36 +0x12 0x10 # NOHINTS: invalid instruction encoding +0x16 0x10 # BAD32: invalid instruction encoding +0x16 0x10 # GOOD64: c.slli zero, 37 +0x16 0x10 # NOHINTS: invalid 
instruction encoding +0x1A 0x10 # BAD32: invalid instruction encoding +0x1A 0x10 # GOOD64: c.slli zero, 38 +0x1A 0x10 # NOHINTS: invalid instruction encoding +0x1E 0x10 # BAD32: invalid instruction encoding +0x1E 0x10 # GOOD64: c.slli zero, 39 +0x1E 0x10 # NOHINTS: invalid instruction encoding +0x22 0x10 # BAD32: invalid instruction encoding +0x22 0x10 # GOOD64: c.slli zero, 40 +0x22 0x10 # NOHINTS: invalid instruction encoding +0x26 0x10 # BAD32: invalid instruction encoding +0x26 0x10 # GOOD64: c.slli zero, 41 +0x26 0x10 # NOHINTS: invalid instruction encoding +0x2A 0x10 # BAD32: invalid instruction encoding +0x2A 0x10 # GOOD64: c.slli zero, 42 +0x2A 0x10 # NOHINTS: invalid instruction encoding +0x2E 0x10 # BAD32: invalid instruction encoding +0x2E 0x10 # GOOD64: c.slli zero, 43 +0x2E 0x10 # NOHINTS: invalid instruction encoding +0x32 0x10 # BAD32: invalid instruction encoding +0x32 0x10 # GOOD64: c.slli zero, 44 +0x32 0x10 # NOHINTS: invalid instruction encoding +0x36 0x10 # BAD32: invalid instruction encoding +0x36 0x10 # GOOD64: c.slli zero, 45 +0x36 0x10 # NOHINTS: invalid instruction encoding +0x3A 0x10 # BAD32: invalid instruction encoding +0x3A 0x10 # GOOD64: c.slli zero, 46 +0x3A 0x10 # NOHINTS: invalid instruction encoding +0x3E 0x10 # BAD32: invalid instruction encoding +0x3E 0x10 # GOOD64: c.slli zero, 47 +0x3E 0x10 # NOHINTS: invalid instruction encoding +0x42 0x10 # BAD32: invalid instruction encoding +0x42 0x10 # GOOD64: c.slli zero, 48 +0x42 0x10 # NOHINTS: invalid instruction encoding +0x46 0x10 # BAD32: invalid instruction encoding +0x46 0x10 # GOOD64: c.slli zero, 49 +0x46 0x10 # NOHINTS: invalid instruction encoding +0x4A 0x10 # BAD32: invalid instruction encoding +0x4A 0x10 # GOOD64: c.slli zero, 50 +0x4A 0x10 # NOHINTS: invalid instruction encoding +0x4E 0x10 # BAD32: invalid instruction encoding +0x4E 0x10 # GOOD64: c.slli zero, 51 +0x4E 0x10 # NOHINTS: invalid instruction encoding +0x52 0x10 # BAD32: invalid instruction encoding +0x52 0x10 # GOOD64: c.slli zero, 52 +0x52 0x10 # NOHINTS: invalid instruction encoding +0x56 0x10 # BAD32: invalid instruction encoding +0x56 0x10 # GOOD64: c.slli zero, 53 +0x56 0x10 # NOHINTS: invalid instruction encoding +0x5A 0x10 # BAD32: invalid instruction encoding +0x5A 0x10 # GOOD64: c.slli zero, 54 +0x5A 0x10 # NOHINTS: invalid instruction encoding +0x5E 0x10 # BAD32: invalid instruction encoding +0x5E 0x10 # GOOD64: c.slli zero, 55 +0x5E 0x10 # NOHINTS: invalid instruction encoding +0x62 0x10 # BAD32: invalid instruction encoding +0x62 0x10 # GOOD64: c.slli zero, 56 +0x62 0x10 # NOHINTS: invalid instruction encoding +0x66 0x10 # BAD32: invalid instruction encoding +0x66 0x10 # GOOD64: c.slli zero, 57 +0x66 0x10 # NOHINTS: invalid instruction encoding +0x6A 0x10 # BAD32: invalid instruction encoding +0x6A 0x10 # GOOD64: c.slli zero, 58 +0x6A 0x10 # NOHINTS: invalid instruction encoding +0x6E 0x10 # BAD32: invalid instruction encoding +0x6E 0x10 # GOOD64: c.slli zero, 59 +0x6E 0x10 # NOHINTS: invalid instruction encoding +0x72 0x10 # BAD32: invalid instruction encoding +0x72 0x10 # GOOD64: c.slli zero, 60 +0x72 0x10 # NOHINTS: invalid instruction encoding +0x76 0x10 # BAD32: invalid instruction encoding +0x76 0x10 # GOOD64: c.slli zero, 61 +0x76 0x10 # NOHINTS: invalid instruction encoding +0x7A 0x10 # BAD32: invalid instruction encoding +0x7A 0x10 # GOOD64: c.slli zero, 62 +0x7A 0x10 # NOHINTS: invalid instruction encoding +0x7E 0x10 # BAD32: invalid instruction encoding +0x7E 0x10 # GOOD64: c.slli zero, 63 +0x7E 0x10 # NOHINTS: 
invalid instruction encoding +0x86 0x00 # GOOD: c.slli ra, 1 +0x8A 0x00 # GOOD: c.slli ra, 2 +0x8E 0x00 # GOOD: c.slli ra, 3 +0x92 0x00 # GOOD: c.slli ra, 4 +0x96 0x00 # GOOD: c.slli ra, 5 +0x9A 0x00 # GOOD: c.slli ra, 6 +0x9E 0x00 # GOOD: c.slli ra, 7 +0xA2 0x00 # GOOD: c.slli ra, 8 +0xA6 0x00 # GOOD: c.slli ra, 9 +0xAA 0x00 # GOOD: c.slli ra, 10 +0xAE 0x00 # GOOD: c.slli ra, 11 +0xB2 0x00 # GOOD: c.slli ra, 12 +0xB6 0x00 # GOOD: c.slli ra, 13 +0xBA 0x00 # GOOD: c.slli ra, 14 +0xBE 0x00 # GOOD: c.slli ra, 15 +0xC2 0x00 # GOOD: c.slli ra, 16 +0xC6 0x00 # GOOD: c.slli ra, 17 +0xCA 0x00 # GOOD: c.slli ra, 18 +0xCE 0x00 # GOOD: c.slli ra, 19 +0xD2 0x00 # GOOD: c.slli ra, 20 +0xD6 0x00 # GOOD: c.slli ra, 21 +0xDA 0x00 # GOOD: c.slli ra, 22 +0xDE 0x00 # GOOD: c.slli ra, 23 +0xE2 0x00 # GOOD: c.slli ra, 24 +0xE6 0x00 # GOOD: c.slli ra, 25 +0xEA 0x00 # GOOD: c.slli ra, 26 +0xEE 0x00 # GOOD: c.slli ra, 27 +0xF2 0x00 # GOOD: c.slli ra, 28 +0xF6 0x00 # GOOD: c.slli ra, 29 +0xFA 0x00 # GOOD: c.slli ra, 30 +0xFE 0x00 # GOOD: c.slli ra, 31 +0x82 0x10 # BAD32: invalid instruction encoding +0x82 0x10 # GOOD64: c.slli ra, 32 +0x86 0x10 # BAD32: invalid instruction encoding +0x86 0x10 # GOOD64: c.slli ra, 33 +0x8A 0x10 # BAD32: invalid instruction encoding +0x8A 0x10 # GOOD64: c.slli ra, 34 +0x8E 0x10 # BAD32: invalid instruction encoding +0x8E 0x10 # GOOD64: c.slli ra, 35 +0x92 0x10 # BAD32: invalid instruction encoding +0x92 0x10 # GOOD64: c.slli ra, 36 +0x96 0x10 # BAD32: invalid instruction encoding +0x96 0x10 # GOOD64: c.slli ra, 37 +0x9A 0x10 # BAD32: invalid instruction encoding +0x9A 0x10 # GOOD64: c.slli ra, 38 +0x9E 0x10 # BAD32: invalid instruction encoding +0x9E 0x10 # GOOD64: c.slli ra, 39 +0xA2 0x10 # BAD32: invalid instruction encoding +0xA2 0x10 # GOOD64: c.slli ra, 40 +0xA6 0x10 # BAD32: invalid instruction encoding +0xA6 0x10 # GOOD64: c.slli ra, 41 +0xAA 0x10 # BAD32: invalid instruction encoding +0xAA 0x10 # GOOD64: c.slli ra, 42 +0xAE 0x10 # BAD32: invalid instruction encoding +0xAE 0x10 # GOOD64: c.slli ra, 43 +0xB2 0x10 # BAD32: invalid instruction encoding +0xB2 0x10 # GOOD64: c.slli ra, 44 +0xB6 0x10 # BAD32: invalid instruction encoding +0xB6 0x10 # GOOD64: c.slli ra, 45 +0xBA 0x10 # BAD32: invalid instruction encoding +0xBA 0x10 # GOOD64: c.slli ra, 46 +0xBE 0x10 # BAD32: invalid instruction encoding +0xBE 0x10 # GOOD64: c.slli ra, 47 +0xC2 0x10 # BAD32: invalid instruction encoding +0xC2 0x10 # GOOD64: c.slli ra, 48 +0xC6 0x10 # BAD32: invalid instruction encoding +0xC6 0x10 # GOOD64: c.slli ra, 49 +0xCA 0x10 # BAD32: invalid instruction encoding +0xCA 0x10 # GOOD64: c.slli ra, 50 +0xCE 0x10 # BAD32: invalid instruction encoding +0xCE 0x10 # GOOD64: c.slli ra, 51 +0xD2 0x10 # BAD32: invalid instruction encoding +0xD2 0x10 # GOOD64: c.slli ra, 52 +0xD6 0x10 # BAD32: invalid instruction encoding +0xD6 0x10 # GOOD64: c.slli ra, 53 +0xDA 0x10 # BAD32: invalid instruction encoding +0xDA 0x10 # GOOD64: c.slli ra, 54 +0xDE 0x10 # BAD32: invalid instruction encoding +0xDE 0x10 # GOOD64: c.slli ra, 55 +0xE2 0x10 # BAD32: invalid instruction encoding +0xE2 0x10 # GOOD64: c.slli ra, 56 +0xE6 0x10 # BAD32: invalid instruction encoding +0xE6 0x10 # GOOD64: c.slli ra, 57 +0xEA 0x10 # BAD32: invalid instruction encoding +0xEA 0x10 # GOOD64: c.slli ra, 58 +0xEE 0x10 # BAD32: invalid instruction encoding +0xEE 0x10 # GOOD64: c.slli ra, 59 +0xF2 0x10 # BAD32: invalid instruction encoding +0xF2 0x10 # GOOD64: c.slli ra, 60 +0xF6 0x10 # BAD32: invalid instruction encoding +0xF6 0x10 # GOOD64: 
c.slli ra, 61 +0xFA 0x10 # BAD32: invalid instruction encoding +0xFA 0x10 # GOOD64: c.slli ra, 62 +0xFE 0x10 # BAD32: invalid instruction encoding +0xFE 0x10 # GOOD64: c.slli ra, 63 +0x06 0x01 # GOOD: c.slli sp, 1 +0x0A 0x01 # GOOD: c.slli sp, 2 +0x0E 0x01 # GOOD: c.slli sp, 3 +0x12 0x01 # GOOD: c.slli sp, 4 +0x16 0x01 # GOOD: c.slli sp, 5 +0x1A 0x01 # GOOD: c.slli sp, 6 +0x1E 0x01 # GOOD: c.slli sp, 7 +0x22 0x01 # GOOD: c.slli sp, 8 +0x26 0x01 # GOOD: c.slli sp, 9 +0x2A 0x01 # GOOD: c.slli sp, 10 +0x2E 0x01 # GOOD: c.slli sp, 11 +0x32 0x01 # GOOD: c.slli sp, 12 +0x36 0x01 # GOOD: c.slli sp, 13 +0x3A 0x01 # GOOD: c.slli sp, 14 +0x3E 0x01 # GOOD: c.slli sp, 15 +0x42 0x01 # GOOD: c.slli sp, 16 +0x46 0x01 # GOOD: c.slli sp, 17 +0x4A 0x01 # GOOD: c.slli sp, 18 +0x4E 0x01 # GOOD: c.slli sp, 19 +0x52 0x01 # GOOD: c.slli sp, 20 +0x56 0x01 # GOOD: c.slli sp, 21 +0x5A 0x01 # GOOD: c.slli sp, 22 +0x5E 0x01 # GOOD: c.slli sp, 23 +0x62 0x01 # GOOD: c.slli sp, 24 +0x66 0x01 # GOOD: c.slli sp, 25 +0x6A 0x01 # GOOD: c.slli sp, 26 +0x6E 0x01 # GOOD: c.slli sp, 27 +0x72 0x01 # GOOD: c.slli sp, 28 +0x76 0x01 # GOOD: c.slli sp, 29 +0x7A 0x01 # GOOD: c.slli sp, 30 +0x7E 0x01 # GOOD: c.slli sp, 31 +0x02 0x11 # BAD32: invalid instruction encoding +0x02 0x11 # GOOD64: c.slli sp, 32 +0x06 0x11 # BAD32: invalid instruction encoding +0x06 0x11 # GOOD64: c.slli sp, 33 +0x0A 0x11 # BAD32: invalid instruction encoding +0x0A 0x11 # GOOD64: c.slli sp, 34 +0x0E 0x11 # BAD32: invalid instruction encoding +0x0E 0x11 # GOOD64: c.slli sp, 35 +0x12 0x11 # BAD32: invalid instruction encoding +0x12 0x11 # GOOD64: c.slli sp, 36 +0x16 0x11 # BAD32: invalid instruction encoding +0x16 0x11 # GOOD64: c.slli sp, 37 +0x1A 0x11 # BAD32: invalid instruction encoding +0x1A 0x11 # GOOD64: c.slli sp, 38 +0x1E 0x11 # BAD32: invalid instruction encoding +0x1E 0x11 # GOOD64: c.slli sp, 39 +0x22 0x11 # BAD32: invalid instruction encoding +0x22 0x11 # GOOD64: c.slli sp, 40 +0x26 0x11 # BAD32: invalid instruction encoding +0x26 0x11 # GOOD64: c.slli sp, 41 +0x2A 0x11 # BAD32: invalid instruction encoding +0x2A 0x11 # GOOD64: c.slli sp, 42 +0x2E 0x11 # BAD32: invalid instruction encoding +0x2E 0x11 # GOOD64: c.slli sp, 43 +0x32 0x11 # BAD32: invalid instruction encoding +0x32 0x11 # GOOD64: c.slli sp, 44 +0x36 0x11 # BAD32: invalid instruction encoding +0x36 0x11 # GOOD64: c.slli sp, 45 +0x3A 0x11 # BAD32: invalid instruction encoding +0x3A 0x11 # GOOD64: c.slli sp, 46 +0x3E 0x11 # BAD32: invalid instruction encoding +0x3E 0x11 # GOOD64: c.slli sp, 47 +0x42 0x11 # BAD32: invalid instruction encoding +0x42 0x11 # GOOD64: c.slli sp, 48 +0x46 0x11 # BAD32: invalid instruction encoding +0x46 0x11 # GOOD64: c.slli sp, 49 +0x4A 0x11 # BAD32: invalid instruction encoding +0x4A 0x11 # GOOD64: c.slli sp, 50 +0x4E 0x11 # BAD32: invalid instruction encoding +0x4E 0x11 # GOOD64: c.slli sp, 51 +0x52 0x11 # BAD32: invalid instruction encoding +0x52 0x11 # GOOD64: c.slli sp, 52 +0x56 0x11 # BAD32: invalid instruction encoding +0x56 0x11 # GOOD64: c.slli sp, 53 +0x5A 0x11 # BAD32: invalid instruction encoding +0x5A 0x11 # GOOD64: c.slli sp, 54 +0x5E 0x11 # BAD32: invalid instruction encoding +0x5E 0x11 # GOOD64: c.slli sp, 55 +0x62 0x11 # BAD32: invalid instruction encoding +0x62 0x11 # GOOD64: c.slli sp, 56 +0x66 0x11 # BAD32: invalid instruction encoding +0x66 0x11 # GOOD64: c.slli sp, 57 +0x6A 0x11 # BAD32: invalid instruction encoding +0x6A 0x11 # GOOD64: c.slli sp, 58 +0x6E 0x11 # BAD32: invalid instruction encoding +0x6E 0x11 # GOOD64: c.slli sp, 59 +0x72 
0x11 # BAD32: invalid instruction encoding +0x72 0x11 # GOOD64: c.slli sp, 60 +0x76 0x11 # BAD32: invalid instruction encoding +0x76 0x11 # GOOD64: c.slli sp, 61 +0x7A 0x11 # BAD32: invalid instruction encoding +0x7A 0x11 # GOOD64: c.slli sp, 62 +0x7E 0x11 # BAD32: invalid instruction encoding +0x7E 0x11 # GOOD64: c.slli sp, 63 +0x86 0x01 # GOOD: c.slli gp, 1 +0x8A 0x01 # GOOD: c.slli gp, 2 +0x8E 0x01 # GOOD: c.slli gp, 3 +0x92 0x01 # GOOD: c.slli gp, 4 +0x96 0x01 # GOOD: c.slli gp, 5 +0x9A 0x01 # GOOD: c.slli gp, 6 +0x9E 0x01 # GOOD: c.slli gp, 7 +0xA2 0x01 # GOOD: c.slli gp, 8 +0xA6 0x01 # GOOD: c.slli gp, 9 +0xAA 0x01 # GOOD: c.slli gp, 10 +0xAE 0x01 # GOOD: c.slli gp, 11 +0xB2 0x01 # GOOD: c.slli gp, 12 +0xB6 0x01 # GOOD: c.slli gp, 13 +0xBA 0x01 # GOOD: c.slli gp, 14 +0xBE 0x01 # GOOD: c.slli gp, 15 +0xC2 0x01 # GOOD: c.slli gp, 16 +0xC6 0x01 # GOOD: c.slli gp, 17 +0xCA 0x01 # GOOD: c.slli gp, 18 +0xCE 0x01 # GOOD: c.slli gp, 19 +0xD2 0x01 # GOOD: c.slli gp, 20 +0xD6 0x01 # GOOD: c.slli gp, 21 +0xDA 0x01 # GOOD: c.slli gp, 22 +0xDE 0x01 # GOOD: c.slli gp, 23 +0xE2 0x01 # GOOD: c.slli gp, 24 +0xE6 0x01 # GOOD: c.slli gp, 25 +0xEA 0x01 # GOOD: c.slli gp, 26 +0xEE 0x01 # GOOD: c.slli gp, 27 +0xF2 0x01 # GOOD: c.slli gp, 28 +0xF6 0x01 # GOOD: c.slli gp, 29 +0xFA 0x01 # GOOD: c.slli gp, 30 +0xFE 0x01 # GOOD: c.slli gp, 31 +0x82 0x11 # BAD32: invalid instruction encoding +0x82 0x11 # GOOD64: c.slli gp, 32 +0x86 0x11 # BAD32: invalid instruction encoding +0x86 0x11 # GOOD64: c.slli gp, 33 +0x8A 0x11 # BAD32: invalid instruction encoding +0x8A 0x11 # GOOD64: c.slli gp, 34 +0x8E 0x11 # BAD32: invalid instruction encoding +0x8E 0x11 # GOOD64: c.slli gp, 35 +0x92 0x11 # BAD32: invalid instruction encoding +0x92 0x11 # GOOD64: c.slli gp, 36 +0x96 0x11 # BAD32: invalid instruction encoding +0x96 0x11 # GOOD64: c.slli gp, 37 +0x9A 0x11 # BAD32: invalid instruction encoding +0x9A 0x11 # GOOD64: c.slli gp, 38 +0x9E 0x11 # BAD32: invalid instruction encoding +0x9E 0x11 # GOOD64: c.slli gp, 39 +0xA2 0x11 # BAD32: invalid instruction encoding +0xA2 0x11 # GOOD64: c.slli gp, 40 +0xA6 0x11 # BAD32: invalid instruction encoding +0xA6 0x11 # GOOD64: c.slli gp, 41 +0xAA 0x11 # BAD32: invalid instruction encoding +0xAA 0x11 # GOOD64: c.slli gp, 42 +0xAE 0x11 # BAD32: invalid instruction encoding +0xAE 0x11 # GOOD64: c.slli gp, 43 +0xB2 0x11 # BAD32: invalid instruction encoding +0xB2 0x11 # GOOD64: c.slli gp, 44 +0xB6 0x11 # BAD32: invalid instruction encoding +0xB6 0x11 # GOOD64: c.slli gp, 45 +0xBA 0x11 # BAD32: invalid instruction encoding +0xBA 0x11 # GOOD64: c.slli gp, 46 +0xBE 0x11 # BAD32: invalid instruction encoding +0xBE 0x11 # GOOD64: c.slli gp, 47 +0xC2 0x11 # BAD32: invalid instruction encoding +0xC2 0x11 # GOOD64: c.slli gp, 48 +0xC6 0x11 # BAD32: invalid instruction encoding +0xC6 0x11 # GOOD64: c.slli gp, 49 +0xCA 0x11 # BAD32: invalid instruction encoding +0xCA 0x11 # GOOD64: c.slli gp, 50 +0xCE 0x11 # BAD32: invalid instruction encoding +0xCE 0x11 # GOOD64: c.slli gp, 51 +0xD2 0x11 # BAD32: invalid instruction encoding +0xD2 0x11 # GOOD64: c.slli gp, 52 +0xD6 0x11 # BAD32: invalid instruction encoding +0xD6 0x11 # GOOD64: c.slli gp, 53 +0xDA 0x11 # BAD32: invalid instruction encoding +0xDA 0x11 # GOOD64: c.slli gp, 54 +0xDE 0x11 # BAD32: invalid instruction encoding +0xDE 0x11 # GOOD64: c.slli gp, 55 +0xE2 0x11 # BAD32: invalid instruction encoding +0xE2 0x11 # GOOD64: c.slli gp, 56 +0xE6 0x11 # BAD32: invalid instruction encoding +0xE6 0x11 # GOOD64: c.slli gp, 57 +0xEA 0x11 # BAD32: 
+0xEA 0x11 # GOOD64: c.slli gp, 58
+0xEE 0x11 # BAD32: invalid instruction encoding
+0xEE 0x11 # GOOD64: c.slli gp, 59
+0xF2 0x11 # BAD32: invalid instruction encoding
+0xF2 0x11 # GOOD64: c.slli gp, 60
+0xF6 0x11 # BAD32: invalid instruction encoding
+0xF6 0x11 # GOOD64: c.slli gp, 61
+0xFA 0x11 # BAD32: invalid instruction encoding
+0xFA 0x11 # GOOD64: c.slli gp, 62
+0xFE 0x11 # BAD32: invalid instruction encoding
+0xFE 0x11 # GOOD64: c.slli gp, 63
+0x06 0x02 # GOOD: c.slli tp, 1
+0x0A 0x02 # GOOD: c.slli tp, 2
+0x0E 0x02 # GOOD: c.slli tp, 3
+0x12 0x02 # GOOD: c.slli tp, 4
+0x16 0x02 # GOOD: c.slli tp, 5
+0x1A 0x02 # GOOD: c.slli tp, 6
+0x1E 0x02 # GOOD: c.slli tp, 7
+0x22 0x02 # GOOD: c.slli tp, 8
+0x26 0x02 # GOOD: c.slli tp, 9
+0x2A 0x02 # GOOD: c.slli tp, 10
+0x2E 0x02 # GOOD: c.slli tp, 11
+0x32 0x02 # GOOD: c.slli tp, 12
+0x36 0x02 # GOOD: c.slli tp, 13
+0x3A 0x02 # GOOD: c.slli tp, 14
+0x3E 0x02 # GOOD: c.slli tp, 15
+0x42 0x02 # GOOD: c.slli tp, 16
+0x46 0x02 # GOOD: c.slli tp, 17
+0x4A 0x02 # GOOD: c.slli tp, 18
+0x4E 0x02 # GOOD: c.slli tp, 19
+0x52 0x02 # GOOD: c.slli tp, 20
+0x56 0x02 # GOOD: c.slli tp, 21
+0x5A 0x02 # GOOD: c.slli tp, 22
+0x5E 0x02 # GOOD: c.slli tp, 23
+0x62 0x02 # GOOD: c.slli tp, 24
+0x66 0x02 # GOOD: c.slli tp, 25
+0x6A 0x02 # GOOD: c.slli tp, 26
+0x6E 0x02 # GOOD: c.slli tp, 27
+0x72 0x02 # GOOD: c.slli tp, 28
+0x76 0x02 # GOOD: c.slli tp, 29
+0x7A 0x02 # GOOD: c.slli tp, 30
+0x7E 0x02 # GOOD: c.slli tp, 31
+0x02 0x12 # BAD32: invalid instruction encoding
+0x02 0x12 # GOOD64: c.slli tp, 32
+0x06 0x12 # BAD32: invalid instruction encoding
+0x06 0x12 # GOOD64: c.slli tp, 33
+0x0A 0x12 # BAD32: invalid instruction encoding
+0x0A 0x12 # GOOD64: c.slli tp, 34
+0x0E 0x12 # BAD32: invalid instruction encoding
+0x0E 0x12 # GOOD64: c.slli tp, 35
+0x12 0x12 # BAD32: invalid instruction encoding
+0x12 0x12 # GOOD64: c.slli tp, 36
+0x16 0x12 # BAD32: invalid instruction encoding
+0x16 0x12 # GOOD64: c.slli tp, 37
+0x1A 0x12 # BAD32: invalid instruction encoding
+0x1A 0x12 # GOOD64: c.slli tp, 38
+0x1E 0x12 # BAD32: invalid instruction encoding
+0x1E 0x12 # GOOD64: c.slli tp, 39
+0x22 0x12 # BAD32: invalid instruction encoding
+0x22 0x12 # GOOD64: c.slli tp, 40
+0x26 0x12 # BAD32: invalid instruction encoding
+0x26 0x12 # GOOD64: c.slli tp, 41
+0x2A 0x12 # BAD32: invalid instruction encoding
+0x2A 0x12 # GOOD64: c.slli tp, 42
+0x2E 0x12 # BAD32: invalid instruction encoding
+0x2E 0x12 # GOOD64: c.slli tp, 43
+0x32 0x12 # BAD32: invalid instruction encoding
+0x32 0x12 # GOOD64: c.slli tp, 44
+0x36 0x12 # BAD32: invalid instruction encoding
+0x36 0x12 # GOOD64: c.slli tp, 45
+0x3A 0x12 # BAD32: invalid instruction encoding
+0x3A 0x12 # GOOD64: c.slli tp, 46
+0x3E 0x12 # BAD32: invalid instruction encoding
+0x3E 0x12 # GOOD64: c.slli tp, 47
+0x42 0x12 # BAD32: invalid instruction encoding
+0x42 0x12 # GOOD64: c.slli tp, 48
+0x46 0x12 # BAD32: invalid instruction encoding
+0x46 0x12 # GOOD64: c.slli tp, 49
+0x4A 0x12 # BAD32: invalid instruction encoding
+0x4A 0x12 # GOOD64: c.slli tp, 50
+0x4E 0x12 # BAD32: invalid instruction encoding
+0x4E 0x12 # GOOD64: c.slli tp, 51
+0x52 0x12 # BAD32: invalid instruction encoding
+0x52 0x12 # GOOD64: c.slli tp, 52
+0x56 0x12 # BAD32: invalid instruction encoding
+0x56 0x12 # GOOD64: c.slli tp, 53
+0x5A 0x12 # BAD32: invalid instruction encoding
+0x5A 0x12 # GOOD64: c.slli tp, 54
+0x5E 0x12 # BAD32: invalid instruction encoding
+0x5E 0x12 # GOOD64: c.slli tp, 55
+0x62 0x12 # BAD32: invalid instruction encoding
+0x62 0x12 # GOOD64: c.slli tp, 56
+0x66 0x12 # BAD32: invalid instruction encoding
+0x66 0x12 # GOOD64: c.slli tp, 57
+0x6A 0x12 # BAD32: invalid instruction encoding
+0x6A 0x12 # GOOD64: c.slli tp, 58
+0x6E 0x12 # BAD32: invalid instruction encoding
+0x6E 0x12 # GOOD64: c.slli tp, 59
+0x72 0x12 # BAD32: invalid instruction encoding
+0x72 0x12 # GOOD64: c.slli tp, 60
+0x76 0x12 # BAD32: invalid instruction encoding
+0x76 0x12 # GOOD64: c.slli tp, 61
+0x7A 0x12 # BAD32: invalid instruction encoding
+0x7A 0x12 # GOOD64: c.slli tp, 62
+0x7E 0x12 # BAD32: invalid instruction encoding
+0x7E 0x12 # GOOD64: c.slli tp, 63
+0x86 0x02 # GOOD: c.slli t0, 1
+0x8A 0x02 # GOOD: c.slli t0, 2
+0x8E 0x02 # GOOD: c.slli t0, 3
+0x92 0x02 # GOOD: c.slli t0, 4
+0x96 0x02 # GOOD: c.slli t0, 5
+0x9A 0x02 # GOOD: c.slli t0, 6
+0x9E 0x02 # GOOD: c.slli t0, 7
+0xA2 0x02 # GOOD: c.slli t0, 8
+0xA6 0x02 # GOOD: c.slli t0, 9
+0xAA 0x02 # GOOD: c.slli t0, 10
+0xAE 0x02 # GOOD: c.slli t0, 11
+0xB2 0x02 # GOOD: c.slli t0, 12
+0xB6 0x02 # GOOD: c.slli t0, 13
+0xBA 0x02 # GOOD: c.slli t0, 14
+0xBE 0x02 # GOOD: c.slli t0, 15
+0xC2 0x02 # GOOD: c.slli t0, 16
+0xC6 0x02 # GOOD: c.slli t0, 17
+0xCA 0x02 # GOOD: c.slli t0, 18
+0xCE 0x02 # GOOD: c.slli t0, 19
+0xD2 0x02 # GOOD: c.slli t0, 20
+0xD6 0x02 # GOOD: c.slli t0, 21
+0xDA 0x02 # GOOD: c.slli t0, 22
+0xDE 0x02 # GOOD: c.slli t0, 23
+0xE2 0x02 # GOOD: c.slli t0, 24
+0xE6 0x02 # GOOD: c.slli t0, 25
+0xEA 0x02 # GOOD: c.slli t0, 26
+0xEE 0x02 # GOOD: c.slli t0, 27
+0xF2 0x02 # GOOD: c.slli t0, 28
+0xF6 0x02 # GOOD: c.slli t0, 29
+0xFA 0x02 # GOOD: c.slli t0, 30
+0xFE 0x02 # GOOD: c.slli t0, 31
+0x82 0x12 # BAD32: invalid instruction encoding
+0x82 0x12 # GOOD64: c.slli t0, 32
+0x86 0x12 # BAD32: invalid instruction encoding
+0x86 0x12 # GOOD64: c.slli t0, 33
+0x8A 0x12 # BAD32: invalid instruction encoding
+0x8A 0x12 # GOOD64: c.slli t0, 34
+0x8E 0x12 # BAD32: invalid instruction encoding
+0x8E 0x12 # GOOD64: c.slli t0, 35
+0x92 0x12 # BAD32: invalid instruction encoding
+0x92 0x12 # GOOD64: c.slli t0, 36
+0x96 0x12 # BAD32: invalid instruction encoding
+0x96 0x12 # GOOD64: c.slli t0, 37
+0x9A 0x12 # BAD32: invalid instruction encoding
+0x9A 0x12 # GOOD64: c.slli t0, 38
+0x9E 0x12 # BAD32: invalid instruction encoding
+0x9E 0x12 # GOOD64: c.slli t0, 39
+0xA2 0x12 # BAD32: invalid instruction encoding
+0xA2 0x12 # GOOD64: c.slli t0, 40
+0xA6 0x12 # BAD32: invalid instruction encoding
+0xA6 0x12 # GOOD64: c.slli t0, 41
+0xAA 0x12 # BAD32: invalid instruction encoding
+0xAA 0x12 # GOOD64: c.slli t0, 42
+0xAE 0x12 # BAD32: invalid instruction encoding
+0xAE 0x12 # GOOD64: c.slli t0, 43
+0xB2 0x12 # BAD32: invalid instruction encoding
+0xB2 0x12 # GOOD64: c.slli t0, 44
+0xB6 0x12 # BAD32: invalid instruction encoding
+0xB6 0x12 # GOOD64: c.slli t0, 45
+0xBA 0x12 # BAD32: invalid instruction encoding
+0xBA 0x12 # GOOD64: c.slli t0, 46
+0xBE 0x12 # BAD32: invalid instruction encoding
+0xBE 0x12 # GOOD64: c.slli t0, 47
+0xC2 0x12 # BAD32: invalid instruction encoding
+0xC2 0x12 # GOOD64: c.slli t0, 48
+0xC6 0x12 # BAD32: invalid instruction encoding
+0xC6 0x12 # GOOD64: c.slli t0, 49
+0xCA 0x12 # BAD32: invalid instruction encoding
+0xCA 0x12 # GOOD64: c.slli t0, 50
+0xCE 0x12 # BAD32: invalid instruction encoding
+0xCE 0x12 # GOOD64: c.slli t0, 51
+0xD2 0x12 # BAD32: invalid instruction encoding
+0xD2 0x12 # GOOD64: c.slli t0, 52
+0xD6 0x12 # BAD32: invalid instruction encoding
+0xD6 0x12 # GOOD64: c.slli t0, 53
+0xDA 0x12 # BAD32: invalid instruction encoding
+0xDA 0x12 # GOOD64: c.slli t0, 54
+0xDE 0x12 # BAD32: invalid instruction encoding
+0xDE 0x12 # GOOD64: c.slli t0, 55
+0xE2 0x12 # BAD32: invalid instruction encoding
+0xE2 0x12 # GOOD64: c.slli t0, 56
+0xE6 0x12 # BAD32: invalid instruction encoding
+0xE6 0x12 # GOOD64: c.slli t0, 57
+0xEA 0x12 # BAD32: invalid instruction encoding
+0xEA 0x12 # GOOD64: c.slli t0, 58
+0xEE 0x12 # BAD32: invalid instruction encoding
+0xEE 0x12 # GOOD64: c.slli t0, 59
+0xF2 0x12 # BAD32: invalid instruction encoding
+0xF2 0x12 # GOOD64: c.slli t0, 60
+0xF6 0x12 # BAD32: invalid instruction encoding
+0xF6 0x12 # GOOD64: c.slli t0, 61
+0xFA 0x12 # BAD32: invalid instruction encoding
+0xFA 0x12 # GOOD64: c.slli t0, 62
+0xFE 0x12 # BAD32: invalid instruction encoding
+0xFE 0x12 # GOOD64: c.slli t0, 63
+0x06 0x03 # GOOD: c.slli t1, 1
+0x0A 0x03 # GOOD: c.slli t1, 2
+0x0E 0x03 # GOOD: c.slli t1, 3
+0x12 0x03 # GOOD: c.slli t1, 4
+0x16 0x03 # GOOD: c.slli t1, 5
+0x1A 0x03 # GOOD: c.slli t1, 6
+0x1E 0x03 # GOOD: c.slli t1, 7
+0x22 0x03 # GOOD: c.slli t1, 8
+0x26 0x03 # GOOD: c.slli t1, 9
+0x2A 0x03 # GOOD: c.slli t1, 10
+0x2E 0x03 # GOOD: c.slli t1, 11
+0x32 0x03 # GOOD: c.slli t1, 12
+0x36 0x03 # GOOD: c.slli t1, 13
+0x3A 0x03 # GOOD: c.slli t1, 14
+0x3E 0x03 # GOOD: c.slli t1, 15
+0x42 0x03 # GOOD: c.slli t1, 16
+0x46 0x03 # GOOD: c.slli t1, 17
+0x4A 0x03 # GOOD: c.slli t1, 18
+0x4E 0x03 # GOOD: c.slli t1, 19
+0x52 0x03 # GOOD: c.slli t1, 20
+0x56 0x03 # GOOD: c.slli t1, 21
+0x5A 0x03 # GOOD: c.slli t1, 22
+0x5E 0x03 # GOOD: c.slli t1, 23
+0x62 0x03 # GOOD: c.slli t1, 24
+0x66 0x03 # GOOD: c.slli t1, 25
+0x6A 0x03 # GOOD: c.slli t1, 26
+0x6E 0x03 # GOOD: c.slli t1, 27
+0x72 0x03 # GOOD: c.slli t1, 28
+0x76 0x03 # GOOD: c.slli t1, 29
+0x7A 0x03 # GOOD: c.slli t1, 30
+0x7E 0x03 # GOOD: c.slli t1, 31
+0x02 0x13 # BAD32: invalid instruction encoding
+0x02 0x13 # GOOD64: c.slli t1, 32
+0x06 0x13 # BAD32: invalid instruction encoding
+0x06 0x13 # GOOD64: c.slli t1, 33
+0x0A 0x13 # BAD32: invalid instruction encoding
+0x0A 0x13 # GOOD64: c.slli t1, 34
+0x0E 0x13 # BAD32: invalid instruction encoding
+0x0E 0x13 # GOOD64: c.slli t1, 35
+0x12 0x13 # BAD32: invalid instruction encoding
+0x12 0x13 # GOOD64: c.slli t1, 36
+0x16 0x13 # BAD32: invalid instruction encoding
+0x16 0x13 # GOOD64: c.slli t1, 37
+0x1A 0x13 # BAD32: invalid instruction encoding
+0x1A 0x13 # GOOD64: c.slli t1, 38
+0x1E 0x13 # BAD32: invalid instruction encoding
+0x1E 0x13 # GOOD64: c.slli t1, 39
+0x22 0x13 # BAD32: invalid instruction encoding
+0x22 0x13 # GOOD64: c.slli t1, 40
+0x26 0x13 # BAD32: invalid instruction encoding
+0x26 0x13 # GOOD64: c.slli t1, 41
+0x2A 0x13 # BAD32: invalid instruction encoding
+0x2A 0x13 # GOOD64: c.slli t1, 42
+0x2E 0x13 # BAD32: invalid instruction encoding
+0x2E 0x13 # GOOD64: c.slli t1, 43
+0x32 0x13 # BAD32: invalid instruction encoding
+0x32 0x13 # GOOD64: c.slli t1, 44
+0x36 0x13 # BAD32: invalid instruction encoding
+0x36 0x13 # GOOD64: c.slli t1, 45
+0x3A 0x13 # BAD32: invalid instruction encoding
+0x3A 0x13 # GOOD64: c.slli t1, 46
+0x3E 0x13 # BAD32: invalid instruction encoding
+0x3E 0x13 # GOOD64: c.slli t1, 47
+0x42 0x13 # BAD32: invalid instruction encoding
+0x42 0x13 # GOOD64: c.slli t1, 48
+0x46 0x13 # BAD32: invalid instruction encoding
+0x46 0x13 # GOOD64: c.slli t1, 49
+0x4A 0x13 # BAD32: invalid instruction encoding
+0x4A 0x13 # GOOD64: c.slli t1, 50
+0x4E 0x13 # BAD32: invalid instruction encoding
+0x4E 0x13 # GOOD64: c.slli t1, 51
+0x52 0x13 # BAD32: invalid instruction encoding
+0x52 0x13 # GOOD64: c.slli t1, 52
+0x56 0x13 # BAD32: invalid instruction encoding
+0x56 0x13 # GOOD64: c.slli t1, 53
+0x5A 0x13 # BAD32: invalid instruction encoding
+0x5A 0x13 # GOOD64: c.slli t1, 54
+0x5E 0x13 # BAD32: invalid instruction encoding
+0x5E 0x13 # GOOD64: c.slli t1, 55
+0x62 0x13 # BAD32: invalid instruction encoding
+0x62 0x13 # GOOD64: c.slli t1, 56
+0x66 0x13 # BAD32: invalid instruction encoding
+0x66 0x13 # GOOD64: c.slli t1, 57
+0x6A 0x13 # BAD32: invalid instruction encoding
+0x6A 0x13 # GOOD64: c.slli t1, 58
+0x6E 0x13 # BAD32: invalid instruction encoding
+0x6E 0x13 # GOOD64: c.slli t1, 59
+0x72 0x13 # BAD32: invalid instruction encoding
+0x72 0x13 # GOOD64: c.slli t1, 60
+0x76 0x13 # BAD32: invalid instruction encoding
+0x76 0x13 # GOOD64: c.slli t1, 61
+0x7A 0x13 # BAD32: invalid instruction encoding
+0x7A 0x13 # GOOD64: c.slli t1, 62
+0x7E 0x13 # BAD32: invalid instruction encoding
+0x7E 0x13 # GOOD64: c.slli t1, 63
+0x86 0x03 # GOOD: c.slli t2, 1
+0x8A 0x03 # GOOD: c.slli t2, 2
+0x8E 0x03 # GOOD: c.slli t2, 3
+0x92 0x03 # GOOD: c.slli t2, 4
+0x96 0x03 # GOOD: c.slli t2, 5
+0x9A 0x03 # GOOD: c.slli t2, 6
+0x9E 0x03 # GOOD: c.slli t2, 7
+0xA2 0x03 # GOOD: c.slli t2, 8
+0xA6 0x03 # GOOD: c.slli t2, 9
+0xAA 0x03 # GOOD: c.slli t2, 10
+0xAE 0x03 # GOOD: c.slli t2, 11
+0xB2 0x03 # GOOD: c.slli t2, 12
+0xB6 0x03 # GOOD: c.slli t2, 13
+0xBA 0x03 # GOOD: c.slli t2, 14
+0xBE 0x03 # GOOD: c.slli t2, 15
+0xC2 0x03 # GOOD: c.slli t2, 16
+0xC6 0x03 # GOOD: c.slli t2, 17
+0xCA 0x03 # GOOD: c.slli t2, 18
+0xCE 0x03 # GOOD: c.slli t2, 19
+0xD2 0x03 # GOOD: c.slli t2, 20
+0xD6 0x03 # GOOD: c.slli t2, 21
+0xDA 0x03 # GOOD: c.slli t2, 22
+0xDE 0x03 # GOOD: c.slli t2, 23
+0xE2 0x03 # GOOD: c.slli t2, 24
+0xE6 0x03 # GOOD: c.slli t2, 25
+0xEA 0x03 # GOOD: c.slli t2, 26
+0xEE 0x03 # GOOD: c.slli t2, 27
+0xF2 0x03 # GOOD: c.slli t2, 28
+0xF6 0x03 # GOOD: c.slli t2, 29
+0xFA 0x03 # GOOD: c.slli t2, 30
+0xFE 0x03 # GOOD: c.slli t2, 31
+0x82 0x13 # BAD32: invalid instruction encoding
+0x82 0x13 # GOOD64: c.slli t2, 32
+0x86 0x13 # BAD32: invalid instruction encoding
+0x86 0x13 # GOOD64: c.slli t2, 33
+0x8A 0x13 # BAD32: invalid instruction encoding
+0x8A 0x13 # GOOD64: c.slli t2, 34
+0x8E 0x13 # BAD32: invalid instruction encoding
+0x8E 0x13 # GOOD64: c.slli t2, 35
+0x92 0x13 # BAD32: invalid instruction encoding
+0x92 0x13 # GOOD64: c.slli t2, 36
+0x96 0x13 # BAD32: invalid instruction encoding
+0x96 0x13 # GOOD64: c.slli t2, 37
+0x9A 0x13 # BAD32: invalid instruction encoding
+0x9A 0x13 # GOOD64: c.slli t2, 38
+0x9E 0x13 # BAD32: invalid instruction encoding
+0x9E 0x13 # GOOD64: c.slli t2, 39
+0xA2 0x13 # BAD32: invalid instruction encoding
+0xA2 0x13 # GOOD64: c.slli t2, 40
+0xA6 0x13 # BAD32: invalid instruction encoding
+0xA6 0x13 # GOOD64: c.slli t2, 41
+0xAA 0x13 # BAD32: invalid instruction encoding
+0xAA 0x13 # GOOD64: c.slli t2, 42
+0xAE 0x13 # BAD32: invalid instruction encoding
+0xAE 0x13 # GOOD64: c.slli t2, 43
+0xB2 0x13 # BAD32: invalid instruction encoding
+0xB2 0x13 # GOOD64: c.slli t2, 44
+0xB6 0x13 # BAD32: invalid instruction encoding
+0xB6 0x13 # GOOD64: c.slli t2, 45
+0xBA 0x13 # BAD32: invalid instruction encoding
+0xBA 0x13 # GOOD64: c.slli t2, 46
+0xBE 0x13 # BAD32: invalid instruction encoding
+0xBE 0x13 # GOOD64: c.slli t2, 47
+0xC2 0x13 # BAD32: invalid instruction encoding
+0xC2 0x13 # GOOD64: c.slli t2, 48
+0xC6 0x13 # BAD32: invalid instruction encoding
+0xC6 0x13 # GOOD64: c.slli t2, 49
+0xCA 0x13 # BAD32: invalid instruction encoding
+0xCA 0x13 # GOOD64: c.slli t2, 50
+0xCE 0x13 # BAD32: invalid instruction encoding
+0xCE 0x13 # GOOD64: c.slli t2, 51
+0xD2 0x13 # BAD32: invalid instruction encoding
+0xD2 0x13 # GOOD64: c.slli t2, 52
+0xD6 0x13 # BAD32: invalid instruction encoding
+0xD6 0x13 # GOOD64: c.slli t2, 53
+0xDA 0x13 # BAD32: invalid instruction encoding
+0xDA 0x13 # GOOD64: c.slli t2, 54
+0xDE 0x13 # BAD32: invalid instruction encoding
+0xDE 0x13 # GOOD64: c.slli t2, 55
+0xE2 0x13 # BAD32: invalid instruction encoding
+0xE2 0x13 # GOOD64: c.slli t2, 56
+0xE6 0x13 # BAD32: invalid instruction encoding
+0xE6 0x13 # GOOD64: c.slli t2, 57
+0xEA 0x13 # BAD32: invalid instruction encoding
+0xEA 0x13 # GOOD64: c.slli t2, 58
+0xEE 0x13 # BAD32: invalid instruction encoding
+0xEE 0x13 # GOOD64: c.slli t2, 59
+0xF2 0x13 # BAD32: invalid instruction encoding
+0xF2 0x13 # GOOD64: c.slli t2, 60
+0xF6 0x13 # BAD32: invalid instruction encoding
+0xF6 0x13 # GOOD64: c.slli t2, 61
+0xFA 0x13 # BAD32: invalid instruction encoding
+0xFA 0x13 # GOOD64: c.slli t2, 62
+0xFE 0x13 # BAD32: invalid instruction encoding
+0xFE 0x13 # GOOD64: c.slli t2, 63
+0x06 0x04 # GOOD: c.slli s0, 1
+0x0A 0x04 # GOOD: c.slli s0, 2
+0x0E 0x04 # GOOD: c.slli s0, 3
+0x12 0x04 # GOOD: c.slli s0, 4
+0x16 0x04 # GOOD: c.slli s0, 5
+0x1A 0x04 # GOOD: c.slli s0, 6
+0x1E 0x04 # GOOD: c.slli s0, 7
+0x22 0x04 # GOOD: c.slli s0, 8
+0x26 0x04 # GOOD: c.slli s0, 9
+0x2A 0x04 # GOOD: c.slli s0, 10
+0x2E 0x04 # GOOD: c.slli s0, 11
+0x32 0x04 # GOOD: c.slli s0, 12
+0x36 0x04 # GOOD: c.slli s0, 13
+0x3A 0x04 # GOOD: c.slli s0, 14
+0x3E 0x04 # GOOD: c.slli s0, 15
+0x42 0x04 # GOOD: c.slli s0, 16
+0x46 0x04 # GOOD: c.slli s0, 17
+0x4A 0x04 # GOOD: c.slli s0, 18
+0x4E 0x04 # GOOD: c.slli s0, 19
+0x52 0x04 # GOOD: c.slli s0, 20
+0x56 0x04 # GOOD: c.slli s0, 21
+0x5A 0x04 # GOOD: c.slli s0, 22
+0x5E 0x04 # GOOD: c.slli s0, 23
+0x62 0x04 # GOOD: c.slli s0, 24
+0x66 0x04 # GOOD: c.slli s0, 25
+0x6A 0x04 # GOOD: c.slli s0, 26
+0x6E 0x04 # GOOD: c.slli s0, 27
+0x72 0x04 # GOOD: c.slli s0, 28
+0x76 0x04 # GOOD: c.slli s0, 29
+0x7A 0x04 # GOOD: c.slli s0, 30
+0x7E 0x04 # GOOD: c.slli s0, 31
+0x02 0x14 # BAD32: invalid instruction encoding
+0x02 0x14 # GOOD64: c.slli s0, 32
+0x06 0x14 # BAD32: invalid instruction encoding
+0x06 0x14 # GOOD64: c.slli s0, 33
+0x0A 0x14 # BAD32: invalid instruction encoding
+0x0A 0x14 # GOOD64: c.slli s0, 34
+0x0E 0x14 # BAD32: invalid instruction encoding
+0x0E 0x14 # GOOD64: c.slli s0, 35
+0x12 0x14 # BAD32: invalid instruction encoding
+0x12 0x14 # GOOD64: c.slli s0, 36
+0x16 0x14 # BAD32: invalid instruction encoding
+0x16 0x14 # GOOD64: c.slli s0, 37
+0x1A 0x14 # BAD32: invalid instruction encoding
+0x1A 0x14 # GOOD64: c.slli s0, 38
+0x1E 0x14 # BAD32: invalid instruction encoding
+0x1E 0x14 # GOOD64: c.slli s0, 39
+0x22 0x14 # BAD32: invalid instruction encoding
+0x22 0x14 # GOOD64: c.slli s0, 40
+0x26 0x14 # BAD32: invalid instruction encoding
+0x26 0x14 # GOOD64: c.slli s0, 41
+0x2A 0x14 # BAD32: invalid instruction encoding
+0x2A 0x14 # GOOD64: c.slli s0, 42
+0x2E 0x14 # BAD32: invalid instruction encoding
+0x2E 0x14 # GOOD64: c.slli s0, 43
+0x32 0x14 # BAD32: invalid instruction encoding
+0x32 0x14 # GOOD64: c.slli s0, 44
+0x36 0x14 # BAD32: invalid instruction encoding
+0x36 0x14 # GOOD64: c.slli s0, 45
+0x3A 0x14 # BAD32: invalid instruction encoding
+0x3A 0x14 # GOOD64: c.slli s0, 46
+0x3E 0x14 # BAD32: invalid instruction encoding
+0x3E 0x14 # GOOD64: c.slli s0, 47
+0x42 0x14 # BAD32: invalid instruction encoding
+0x42 0x14 # GOOD64: c.slli s0, 48
+0x46 0x14 # BAD32: invalid instruction encoding
+0x46 0x14 # GOOD64: c.slli s0, 49
+0x4A 0x14 # BAD32: invalid instruction encoding
+0x4A 0x14 # GOOD64: c.slli s0, 50
+0x4E 0x14 # BAD32: invalid instruction encoding
+0x4E 0x14 # GOOD64: c.slli s0, 51
+0x52 0x14 # BAD32: invalid instruction encoding
+0x52 0x14 # GOOD64: c.slli s0, 52
+0x56 0x14 # BAD32: invalid instruction encoding
+0x56 0x14 # GOOD64: c.slli s0, 53
+0x5A 0x14 # BAD32: invalid instruction encoding
+0x5A 0x14 # GOOD64: c.slli s0, 54
+0x5E 0x14 # BAD32: invalid instruction encoding
+0x5E 0x14 # GOOD64: c.slli s0, 55
+0x62 0x14 # BAD32: invalid instruction encoding
+0x62 0x14 # GOOD64: c.slli s0, 56
+0x66 0x14 # BAD32: invalid instruction encoding
+0x66 0x14 # GOOD64: c.slli s0, 57
+0x6A 0x14 # BAD32: invalid instruction encoding
+0x6A 0x14 # GOOD64: c.slli s0, 58
+0x6E 0x14 # BAD32: invalid instruction encoding
+0x6E 0x14 # GOOD64: c.slli s0, 59
+0x72 0x14 # BAD32: invalid instruction encoding
+0x72 0x14 # GOOD64: c.slli s0, 60
+0x76 0x14 # BAD32: invalid instruction encoding
+0x76 0x14 # GOOD64: c.slli s0, 61
+0x7A 0x14 # BAD32: invalid instruction encoding
+0x7A 0x14 # GOOD64: c.slli s0, 62
+0x7E 0x14 # BAD32: invalid instruction encoding
+0x7E 0x14 # GOOD64: c.slli s0, 63
+0x86 0x04 # GOOD: c.slli s1, 1
+0x8A 0x04 # GOOD: c.slli s1, 2
+0x8E 0x04 # GOOD: c.slli s1, 3
+0x92 0x04 # GOOD: c.slli s1, 4
+0x96 0x04 # GOOD: c.slli s1, 5
+0x9A 0x04 # GOOD: c.slli s1, 6
+0x9E 0x04 # GOOD: c.slli s1, 7
+0xA2 0x04 # GOOD: c.slli s1, 8
+0xA6 0x04 # GOOD: c.slli s1, 9
+0xAA 0x04 # GOOD: c.slli s1, 10
+0xAE 0x04 # GOOD: c.slli s1, 11
+0xB2 0x04 # GOOD: c.slli s1, 12
+0xB6 0x04 # GOOD: c.slli s1, 13
+0xBA 0x04 # GOOD: c.slli s1, 14
+0xBE 0x04 # GOOD: c.slli s1, 15
+0xC2 0x04 # GOOD: c.slli s1, 16
+0xC6 0x04 # GOOD: c.slli s1, 17
+0xCA 0x04 # GOOD: c.slli s1, 18
+0xCE 0x04 # GOOD: c.slli s1, 19
+0xD2 0x04 # GOOD: c.slli s1, 20
+0xD6 0x04 # GOOD: c.slli s1, 21
+0xDA 0x04 # GOOD: c.slli s1, 22
+0xDE 0x04 # GOOD: c.slli s1, 23
+0xE2 0x04 # GOOD: c.slli s1, 24
+0xE6 0x04 # GOOD: c.slli s1, 25
+0xEA 0x04 # GOOD: c.slli s1, 26
+0xEE 0x04 # GOOD: c.slli s1, 27
+0xF2 0x04 # GOOD: c.slli s1, 28
+0xF6 0x04 # GOOD: c.slli s1, 29
+0xFA 0x04 # GOOD: c.slli s1, 30
+0xFE 0x04 # GOOD: c.slli s1, 31
+0x82 0x14 # BAD32: invalid instruction encoding
+0x82 0x14 # GOOD64: c.slli s1, 32
+0x86 0x14 # BAD32: invalid instruction encoding
+0x86 0x14 # GOOD64: c.slli s1, 33
+0x8A 0x14 # BAD32: invalid instruction encoding
+0x8A 0x14 # GOOD64: c.slli s1, 34
+0x8E 0x14 # BAD32: invalid instruction encoding
+0x8E 0x14 # GOOD64: c.slli s1, 35
+0x92 0x14 # BAD32: invalid instruction encoding
+0x92 0x14 # GOOD64: c.slli s1, 36
+0x96 0x14 # BAD32: invalid instruction encoding
+0x96 0x14 # GOOD64: c.slli s1, 37
+0x9A 0x14 # BAD32: invalid instruction encoding
+0x9A 0x14 # GOOD64: c.slli s1, 38
+0x9E 0x14 # BAD32: invalid instruction encoding
+0x9E 0x14 # GOOD64: c.slli s1, 39
+0xA2 0x14 # BAD32: invalid instruction encoding
+0xA2 0x14 # GOOD64: c.slli s1, 40
+0xA6 0x14 # BAD32: invalid instruction encoding
+0xA6 0x14 # GOOD64: c.slli s1, 41
+0xAA 0x14 # BAD32: invalid instruction encoding
+0xAA 0x14 # GOOD64: c.slli s1, 42
+0xAE 0x14 # BAD32: invalid instruction encoding
+0xAE 0x14 # GOOD64: c.slli s1, 43
+0xB2 0x14 # BAD32: invalid instruction encoding
+0xB2 0x14 # GOOD64: c.slli s1, 44
+0xB6 0x14 # BAD32: invalid instruction encoding
+0xB6 0x14 # GOOD64: c.slli s1, 45
+0xBA 0x14 # BAD32: invalid instruction encoding
+0xBA 0x14 # GOOD64: c.slli s1, 46
+0xBE 0x14 # BAD32: invalid instruction encoding
+0xBE 0x14 # GOOD64: c.slli s1, 47
+0xC2 0x14 # BAD32: invalid instruction encoding
+0xC2 0x14 # GOOD64: c.slli s1, 48
+0xC6 0x14 # BAD32: invalid instruction encoding
+0xC6 0x14 # GOOD64: c.slli s1, 49
+0xCA 0x14 # BAD32: invalid instruction encoding
+0xCA 0x14 # GOOD64: c.slli s1, 50
+0xCE 0x14 # BAD32: invalid instruction encoding
+0xCE 0x14 # GOOD64: c.slli s1, 51
+0xD2 0x14 # BAD32: invalid instruction encoding
+0xD2 0x14 # GOOD64: c.slli s1, 52
+0xD6 0x14 # BAD32: invalid instruction encoding
+0xD6 0x14 # GOOD64: c.slli s1, 53
+0xDA 0x14 # BAD32: invalid instruction encoding
+0xDA 0x14 # GOOD64: c.slli s1, 54
+0xDE 0x14 # BAD32: invalid instruction encoding
+0xDE 0x14 # GOOD64: c.slli s1, 55
+0xE2 0x14 # BAD32: invalid instruction encoding
+0xE2 0x14 # GOOD64: c.slli s1, 56
+0xE6 0x14 # BAD32: invalid instruction encoding
+0xE6 0x14 # GOOD64: c.slli s1, 57
+0xEA 0x14 # BAD32: invalid instruction encoding
+0xEA 0x14 # GOOD64: c.slli s1, 58
+0xEE 0x14 # BAD32: invalid instruction encoding
+0xEE 0x14 # GOOD64: c.slli s1, 59
+0xF2 0x14 # BAD32: invalid instruction encoding
+0xF2 0x14 # GOOD64: c.slli s1, 60
+0xF6 0x14 # BAD32: invalid instruction encoding
+0xF6 0x14 # GOOD64: c.slli s1, 61
+0xFA 0x14 # BAD32: invalid instruction encoding
+0xFA 0x14 # GOOD64: c.slli s1, 62
+0xFE 0x14 # BAD32: invalid instruction encoding
+0xFE 0x14 # GOOD64: c.slli s1, 63
+0x06 0x05 # GOOD: c.slli a0, 1
+0x0A 0x05 # GOOD: c.slli a0, 2
+0x0E 0x05 # GOOD: c.slli a0, 3
+0x12 0x05 # GOOD: c.slli a0, 4
+0x16 0x05 # GOOD: c.slli a0, 5
+0x1A 0x05 # GOOD: c.slli a0, 6
+0x1E 0x05 # GOOD: c.slli a0, 7
+0x22 0x05 # GOOD: c.slli a0, 8
+0x26 0x05 # GOOD: c.slli a0, 9
+0x2A 0x05 # GOOD: c.slli a0, 10
+0x2E 0x05 # GOOD: c.slli a0, 11
+0x32 0x05 # GOOD: c.slli a0, 12
+0x36 0x05 # GOOD: c.slli a0, 13
+0x3A 0x05 # GOOD: c.slli a0, 14
+0x3E 0x05 # GOOD: c.slli a0, 15
+0x42 0x05 # GOOD: c.slli a0, 16
+0x46 0x05 # GOOD: c.slli a0, 17
+0x4A 0x05 # GOOD: c.slli a0, 18
+0x4E 0x05 # GOOD: c.slli a0, 19
+0x52 0x05 # GOOD: c.slli a0, 20
+0x56 0x05 # GOOD: c.slli a0, 21
+0x5A 0x05 # GOOD: c.slli a0, 22
+0x5E 0x05 # GOOD: c.slli a0, 23
+0x62 0x05 # GOOD: c.slli a0, 24
+0x66 0x05 # GOOD: c.slli a0, 25
+0x6A 0x05 # GOOD: c.slli a0, 26
+0x6E 0x05 # GOOD: c.slli a0, 27
+0x72 0x05 # GOOD: c.slli a0, 28
+0x76 0x05 # GOOD: c.slli a0, 29
+0x7A 0x05 # GOOD: c.slli a0, 30
+0x7E 0x05 # GOOD: c.slli a0, 31
+0x02 0x15 # BAD32: invalid instruction encoding
+0x02 0x15 # GOOD64: c.slli a0, 32
+0x06 0x15 # BAD32: invalid instruction encoding
+0x06 0x15 # GOOD64: c.slli a0, 33
+0x0A 0x15 # BAD32: invalid instruction encoding
+0x0A 0x15 # GOOD64: c.slli a0, 34
+0x0E 0x15 # BAD32: invalid instruction encoding
+0x0E 0x15 # GOOD64: c.slli a0, 35
+0x12 0x15 # BAD32: invalid instruction encoding
+0x12 0x15 # GOOD64: c.slli a0, 36
+0x16 0x15 # BAD32: invalid instruction encoding
+0x16 0x15 # GOOD64: c.slli a0, 37
+0x1A 0x15 # BAD32: invalid instruction encoding
+0x1A 0x15 # GOOD64: c.slli a0, 38
+0x1E 0x15 # BAD32: invalid instruction encoding
+0x1E 0x15 # GOOD64: c.slli a0, 39
+0x22 0x15 # BAD32: invalid instruction encoding
+0x22 0x15 # GOOD64: c.slli a0, 40
+0x26 0x15 # BAD32: invalid instruction encoding
+0x26 0x15 # GOOD64: c.slli a0, 41
+0x2A 0x15 # BAD32: invalid instruction encoding
+0x2A 0x15 # GOOD64: c.slli a0, 42
+0x2E 0x15 # BAD32: invalid instruction encoding
+0x2E 0x15 # GOOD64: c.slli a0, 43
+0x32 0x15 # BAD32: invalid instruction encoding
+0x32 0x15 # GOOD64: c.slli a0, 44
+0x36 0x15 # BAD32: invalid instruction encoding
+0x36 0x15 # GOOD64: c.slli a0, 45
+0x3A 0x15 # BAD32: invalid instruction encoding
+0x3A 0x15 # GOOD64: c.slli a0, 46
+0x3E 0x15 # BAD32: invalid instruction encoding
+0x3E 0x15 # GOOD64: c.slli a0, 47
+0x42 0x15 # BAD32: invalid instruction encoding
+0x42 0x15 # GOOD64: c.slli a0, 48
+0x46 0x15 # BAD32: invalid instruction encoding
+0x46 0x15 # GOOD64: c.slli a0, 49
+0x4A 0x15 # BAD32: invalid instruction encoding
+0x4A 0x15 # GOOD64: c.slli a0, 50
+0x4E 0x15 # BAD32: invalid instruction encoding
+0x4E 0x15 # GOOD64: c.slli a0, 51
+0x52 0x15 # BAD32: invalid instruction encoding
+0x52 0x15 # GOOD64: c.slli a0, 52
+0x56 0x15 # BAD32: invalid instruction encoding
+0x56 0x15 # GOOD64: c.slli a0, 53
+0x5A 0x15 # BAD32: invalid instruction encoding
+0x5A 0x15 # GOOD64: c.slli a0, 54
+0x5E 0x15 # BAD32: invalid instruction encoding
+0x5E 0x15 # GOOD64: c.slli a0, 55
+0x62 0x15 # BAD32: invalid instruction encoding
+0x62 0x15 # GOOD64: c.slli a0, 56
+0x66 0x15 # BAD32: invalid instruction encoding
+0x66 0x15 # GOOD64: c.slli a0, 57
+0x6A 0x15 # BAD32: invalid instruction encoding
+0x6A 0x15 # GOOD64: c.slli a0, 58
+0x6E 0x15 # BAD32: invalid instruction encoding
+0x6E 0x15 # GOOD64: c.slli a0, 59
+0x72 0x15 # BAD32: invalid instruction encoding
+0x72 0x15 # GOOD64: c.slli a0, 60
+0x76 0x15 # BAD32: invalid instruction encoding
+0x76 0x15 # GOOD64: c.slli a0, 61
+0x7A 0x15 # BAD32: invalid instruction encoding
+0x7A 0x15 # GOOD64: c.slli a0, 62
+0x7E 0x15 # BAD32: invalid instruction encoding
+0x7E 0x15 # GOOD64: c.slli a0, 63
+0x86 0x05 # GOOD: c.slli a1, 1
+0x8A 0x05 # GOOD: c.slli a1, 2
+0x8E 0x05 # GOOD: c.slli a1, 3
+0x92 0x05 # GOOD: c.slli a1, 4
+0x96 0x05 # GOOD: c.slli a1, 5
+0x9A 0x05 # GOOD: c.slli a1, 6
+0x9E 0x05 # GOOD: c.slli a1, 7
+0xA2 0x05 # GOOD: c.slli a1, 8
+0xA6 0x05 # GOOD: c.slli a1, 9
+0xAA 0x05 # GOOD: c.slli a1, 10
+0xAE 0x05 # GOOD: c.slli a1, 11
+0xB2 0x05 # GOOD: c.slli a1, 12
+0xB6 0x05 # GOOD: c.slli a1, 13
+0xBA 0x05 # GOOD: c.slli a1, 14
+0xBE 0x05 # GOOD: c.slli a1, 15
+0xC2 0x05 # GOOD: c.slli a1, 16
+0xC6 0x05 # GOOD: c.slli a1, 17
+0xCA 0x05 # GOOD: c.slli a1, 18
+0xCE 0x05 # GOOD: c.slli a1, 19
+0xD2 0x05 # GOOD: c.slli a1, 20
+0xD6 0x05 # GOOD: c.slli a1, 21
+0xDA 0x05 # GOOD: c.slli a1, 22
+0xDE 0x05 # GOOD: c.slli a1, 23
+0xE2 0x05 # GOOD: c.slli a1, 24
+0xE6 0x05 # GOOD: c.slli a1, 25
+0xEA 0x05 # GOOD: c.slli a1, 26
+0xEE 0x05 # GOOD: c.slli a1, 27
+0xF2 0x05 # GOOD: c.slli a1, 28
+0xF6 0x05 # GOOD: c.slli a1, 29
+0xFA 0x05 # GOOD: c.slli a1, 30
+0xFE 0x05 # GOOD: c.slli a1, 31
+0x82 0x15 # BAD32: invalid instruction encoding
+0x82 0x15 # GOOD64: c.slli a1, 32
+0x86 0x15 # BAD32: invalid instruction encoding
+0x86 0x15 # GOOD64: c.slli a1, 33
+0x8A 0x15 # BAD32: invalid instruction encoding
+0x8A 0x15 # GOOD64: c.slli a1, 34
+0x8E 0x15 # BAD32: invalid instruction encoding
+0x8E 0x15 # GOOD64: c.slli a1, 35
+0x92 0x15 # BAD32: invalid instruction encoding
+0x92 0x15 # GOOD64: c.slli a1, 36
+0x96 0x15 # BAD32: invalid instruction encoding
+0x96 0x15 # GOOD64: c.slli a1, 37
+0x9A 0x15 # BAD32: invalid instruction encoding
+0x9A 0x15 # GOOD64: c.slli a1, 38
+0x9E 0x15 # BAD32: invalid instruction encoding
+0x9E 0x15 # GOOD64: c.slli a1, 39
+0xA2 0x15 # BAD32: invalid instruction encoding
+0xA2 0x15 # GOOD64: c.slli a1, 40
+0xA6 0x15 # BAD32: invalid instruction encoding
+0xA6 0x15 # GOOD64: c.slli a1, 41
+0xAA 0x15 # BAD32: invalid instruction encoding
+0xAA 0x15 # GOOD64: c.slli a1, 42
+0xAE 0x15 # BAD32: invalid instruction encoding
+0xAE 0x15 # GOOD64: c.slli a1, 43
+0xB2 0x15 # BAD32: invalid instruction encoding
+0xB2 0x15 # GOOD64: c.slli a1, 44
+0xB6 0x15 # BAD32: invalid instruction encoding
+0xB6 0x15 # GOOD64: c.slli a1, 45
+0xBA 0x15 # BAD32: invalid instruction encoding
+0xBA 0x15 # GOOD64: c.slli a1, 46
+0xBE 0x15 # BAD32: invalid instruction encoding
+0xBE 0x15 # GOOD64: c.slli a1, 47
+0xC2 0x15 # BAD32: invalid instruction encoding
+0xC2 0x15 # GOOD64: c.slli a1, 48
+0xC6 0x15 # BAD32: invalid instruction encoding
+0xC6 0x15 # GOOD64: c.slli a1, 49
+0xCA 0x15 # BAD32: invalid instruction encoding
+0xCA 0x15 # GOOD64: c.slli a1, 50
+0xCE 0x15 # BAD32: invalid instruction encoding
+0xCE 0x15 # GOOD64: c.slli a1, 51
+0xD2 0x15 # BAD32: invalid instruction encoding
+0xD2 0x15 # GOOD64: c.slli a1, 52
+0xD6 0x15 # BAD32: invalid instruction encoding
+0xD6 0x15 # GOOD64: c.slli a1, 53
+0xDA 0x15 # BAD32: invalid instruction encoding
+0xDA 0x15 # GOOD64: c.slli a1, 54
+0xDE 0x15 # BAD32: invalid instruction encoding
+0xDE 0x15 # GOOD64: c.slli a1, 55
+0xE2 0x15 # BAD32: invalid instruction encoding
+0xE2 0x15 # GOOD64: c.slli a1, 56
+0xE6 0x15 # BAD32: invalid instruction encoding
+0xE6 0x15 # GOOD64: c.slli a1, 57
+0xEA 0x15 # BAD32: invalid instruction encoding
+0xEA 0x15 # GOOD64: c.slli a1, 58
+0xEE 0x15 # BAD32: invalid instruction encoding
+0xEE 0x15 # GOOD64: c.slli a1, 59
+0xF2 0x15 # BAD32: invalid instruction encoding
+0xF2 0x15 # GOOD64: c.slli a1, 60
+0xF6 0x15 # BAD32: invalid instruction encoding
+0xF6 0x15 # GOOD64: c.slli a1, 61
+0xFA 0x15 # BAD32: invalid instruction encoding
+0xFA 0x15 # GOOD64: c.slli a1, 62
+0xFE 0x15 # BAD32: invalid instruction encoding
+0xFE 0x15 # GOOD64: c.slli a1, 63
+0x06 0x06 # GOOD: c.slli a2, 1
+0x0A 0x06 # GOOD: c.slli a2, 2
+0x0E 0x06 # GOOD: c.slli a2, 3
+0x12 0x06 # GOOD: c.slli a2, 4
+0x16 0x06 # GOOD: c.slli a2, 5
+0x1A 0x06 # GOOD: c.slli a2, 6
+0x1E 0x06 # GOOD: c.slli a2, 7
+0x22 0x06 # GOOD: c.slli a2, 8
+0x26 0x06 # GOOD: c.slli a2, 9
+0x2A 0x06 # GOOD: c.slli a2, 10
+0x2E 0x06 # GOOD: c.slli a2, 11
+0x32 0x06 # GOOD: c.slli a2, 12
+0x36 0x06 # GOOD: c.slli a2, 13
+0x3A 0x06 # GOOD: c.slli a2, 14
+0x3E 0x06 # GOOD: c.slli a2, 15
+0x42 0x06 # GOOD: c.slli a2, 16
+0x46 0x06 # GOOD: c.slli a2, 17
+0x4A 0x06 # GOOD: c.slli a2, 18
+0x4E 0x06 # GOOD: c.slli a2, 19
+0x52 0x06 # GOOD: c.slli a2, 20
+0x56 0x06 # GOOD: c.slli a2, 21
+0x5A 0x06 # GOOD: c.slli a2, 22
+0x5E 0x06 # GOOD: c.slli a2, 23
+0x62 0x06 # GOOD: c.slli a2, 24
+0x66 0x06 # GOOD: c.slli a2, 25
+0x6A 0x06 # GOOD: c.slli a2, 26
+0x6E 0x06 # GOOD: c.slli a2, 27
+0x72 0x06 # GOOD: c.slli a2, 28
+0x76 0x06 # GOOD: c.slli a2, 29
+0x7A 0x06 # GOOD: c.slli a2, 30
+0x7E 0x06 # GOOD: c.slli a2, 31
+0x02 0x16 # BAD32: invalid instruction encoding
+0x02 0x16 # GOOD64: c.slli a2, 32
+0x06 0x16 # BAD32: invalid instruction encoding
+0x06 0x16 # GOOD64: c.slli a2, 33
+0x0A 0x16 # BAD32: invalid instruction encoding
+0x0A 0x16 # GOOD64: c.slli a2, 34
+0x0E 0x16 # BAD32: invalid instruction encoding
+0x0E 0x16 # GOOD64: c.slli a2, 35
+0x12 0x16 # BAD32: invalid instruction encoding
+0x12 0x16 # GOOD64: c.slli a2, 36
+0x16 0x16 # BAD32: invalid instruction encoding
+0x16 0x16 # GOOD64: c.slli a2, 37
+0x1A 0x16 # BAD32: invalid instruction encoding
+0x1A 0x16 # GOOD64: c.slli a2, 38
+0x1E 0x16 # BAD32: invalid instruction encoding
+0x1E 0x16 # GOOD64: c.slli a2, 39
+0x22 0x16 # BAD32: invalid instruction encoding
+0x22 0x16 # GOOD64: c.slli a2, 40
+0x26 0x16 # BAD32: invalid instruction encoding
+0x26 0x16 # GOOD64: c.slli a2, 41
+0x2A 0x16 # BAD32: invalid instruction encoding
+0x2A 0x16 # GOOD64: c.slli a2, 42
+0x2E 0x16 # BAD32: invalid instruction encoding
+0x2E 0x16 # GOOD64: c.slli a2, 43
+0x32 0x16 # BAD32: invalid instruction encoding
+0x32 0x16 # GOOD64: c.slli a2, 44
+0x36 0x16 # BAD32: invalid instruction encoding
+0x36 0x16 # GOOD64: c.slli a2, 45
+0x3A 0x16 # BAD32: invalid instruction encoding
+0x3A 0x16 # GOOD64: c.slli a2, 46
+0x3E 0x16 # BAD32: invalid instruction encoding
+0x3E 0x16 # GOOD64: c.slli a2, 47
+0x42 0x16 # BAD32: invalid instruction encoding
+0x42 0x16 # GOOD64: c.slli a2, 48
+0x46 0x16 # BAD32: invalid instruction encoding
+0x46 0x16 # GOOD64: c.slli a2, 49
+0x4A 0x16 # BAD32: invalid instruction encoding
+0x4A 0x16 # GOOD64: c.slli a2, 50
+0x4E 0x16 # BAD32: invalid instruction encoding
+0x4E 0x16 # GOOD64: c.slli a2, 51
+0x52 0x16 # BAD32: invalid instruction encoding
+0x52 0x16 # GOOD64: c.slli a2, 52
+0x56 0x16 # BAD32: invalid instruction encoding
+0x56 0x16 # GOOD64: c.slli a2, 53
+0x5A 0x16 # BAD32: invalid instruction encoding
+0x5A 0x16 # GOOD64: c.slli a2, 54
+0x5E 0x16 # BAD32: invalid instruction encoding
+0x5E 0x16 # GOOD64: c.slli a2, 55
+0x62 0x16 # BAD32: invalid instruction encoding
+0x62 0x16 # GOOD64: c.slli a2, 56
+0x66 0x16 # BAD32: invalid instruction encoding
+0x66 0x16 # GOOD64: c.slli a2, 57
+0x6A 0x16 # BAD32: invalid instruction encoding
+0x6A 0x16 # GOOD64: c.slli a2, 58
+0x6E 0x16 # BAD32: invalid instruction encoding
+0x6E 0x16 # GOOD64: c.slli a2, 59
+0x72 0x16 # BAD32: invalid instruction encoding
+0x72 0x16 # GOOD64: c.slli a2, 60
+0x76 0x16 # BAD32: invalid instruction encoding
+0x76 0x16 # GOOD64: c.slli a2, 61
+0x7A 0x16 # BAD32: invalid instruction encoding
+0x7A 0x16 # GOOD64: c.slli a2, 62
+0x7E 0x16 # BAD32: invalid instruction encoding
+0x7E 0x16 # GOOD64: c.slli a2, 63
+0x86 0x06 # GOOD: c.slli a3, 1
+0x8A 0x06 # GOOD: c.slli a3, 2
+0x8E 0x06 # GOOD: c.slli a3, 3
+0x92 0x06 # GOOD: c.slli a3, 4
+0x96 0x06 # GOOD: c.slli a3, 5
+0x9A 0x06 # GOOD: c.slli a3, 6
+0x9E 0x06 # GOOD: c.slli a3, 7
+0xA2 0x06 # GOOD: c.slli a3, 8
+0xA6 0x06 # GOOD: c.slli a3, 9
+0xAA 0x06 # GOOD: c.slli a3, 10
+0xAE 0x06 # GOOD: c.slli a3, 11
+0xB2 0x06 # GOOD: c.slli a3, 12
+0xB6 0x06 # GOOD: c.slli a3, 13
+0xBA 0x06 # GOOD: c.slli a3, 14
+0xBE 0x06 # GOOD: c.slli a3, 15
+0xC2 0x06 # GOOD: c.slli a3, 16
+0xC6 0x06 # GOOD: c.slli a3, 17
+0xCA 0x06 # GOOD: c.slli a3, 18
+0xCE 0x06 # GOOD: c.slli a3, 19
+0xD2 0x06 # GOOD: c.slli a3, 20
+0xD6 0x06 # GOOD: c.slli a3, 21
+0xDA 0x06 # GOOD: c.slli a3, 22
+0xDE 0x06 # GOOD: c.slli a3, 23
+0xE2 0x06 # GOOD: c.slli a3, 24
+0xE6 0x06 # GOOD: c.slli a3, 25
+0xEA 0x06 # GOOD: c.slli a3, 26
+0xEE 0x06 # GOOD: c.slli a3, 27
+0xF2 0x06 # GOOD: c.slli a3, 28
+0xF6 0x06 # GOOD: c.slli a3, 29
+0xFA 0x06 # GOOD: c.slli a3, 30
+0xFE 0x06 # GOOD: c.slli a3, 31
+0x82 0x16 # BAD32: invalid instruction encoding
+0x82 0x16 # GOOD64: c.slli a3, 32
+0x86 0x16 # BAD32: invalid instruction encoding
+0x86 0x16 # GOOD64: c.slli a3, 33
+0x8A 0x16 # BAD32: invalid instruction encoding
+0x8A 0x16 # GOOD64: c.slli a3, 34
+0x8E 0x16 # BAD32: invalid instruction encoding
+0x8E 0x16 # GOOD64: c.slli a3, 35
+0x92 0x16 # BAD32: invalid instruction encoding
+0x92 0x16 # GOOD64: c.slli a3, 36
+0x96 0x16 # BAD32: invalid instruction encoding
+0x96 0x16 # GOOD64: c.slli a3, 37
+0x9A 0x16 # BAD32: invalid instruction encoding
+0x9A 0x16 # GOOD64: c.slli a3, 38
+0x9E 0x16 # BAD32: invalid instruction encoding
+0x9E 0x16 # GOOD64: c.slli a3, 39
+0xA2 0x16 # BAD32: invalid instruction encoding
+0xA2 0x16 # GOOD64: c.slli a3, 40
+0xA6 0x16 # BAD32: invalid instruction encoding
+0xA6 0x16 # GOOD64: c.slli a3, 41
+0xAA 0x16 # BAD32: invalid instruction encoding
+0xAA 0x16 # GOOD64: c.slli a3, 42
+0xAE 0x16 # BAD32: invalid instruction encoding
+0xAE 0x16 # GOOD64: c.slli a3, 43
+0xB2 0x16 # BAD32: invalid instruction encoding
+0xB2 0x16 # GOOD64: c.slli a3, 44
+0xB6 0x16 # BAD32: invalid instruction encoding
+0xB6 0x16 # GOOD64: c.slli a3, 45
+0xBA 0x16 # BAD32: invalid instruction encoding
+0xBA 0x16 # GOOD64: c.slli a3, 46
+0xBE 0x16 # BAD32: invalid instruction encoding
+0xBE 0x16 # GOOD64: c.slli a3, 47
+0xC2 0x16 # BAD32: invalid instruction encoding
+0xC2 0x16 # GOOD64: c.slli a3, 48
+0xC6 0x16 # BAD32: invalid instruction encoding
+0xC6 0x16 # GOOD64: c.slli a3, 49
+0xCA 0x16 # BAD32: invalid instruction encoding
+0xCA 0x16 # GOOD64: c.slli a3, 50
+0xCE 0x16 # BAD32: invalid instruction encoding
+0xCE 0x16 # GOOD64: c.slli a3, 51
+0xD2 0x16 # BAD32: invalid instruction encoding
+0xD2 0x16 # GOOD64: c.slli a3, 52
+0xD6 0x16 # BAD32: invalid instruction encoding
+0xD6 0x16 # GOOD64: c.slli a3, 53
+0xDA 0x16 # BAD32: invalid instruction encoding
+0xDA 0x16 # GOOD64: c.slli a3, 54
+0xDE 0x16 # BAD32: invalid instruction encoding
+0xDE 0x16 # GOOD64: c.slli a3, 55
+0xE2 0x16 # BAD32: invalid instruction encoding
+0xE2 0x16 # GOOD64: c.slli a3, 56
+0xE6 0x16 # BAD32: invalid instruction encoding
+0xE6 0x16 # GOOD64: c.slli a3, 57
+0xEA 0x16 # BAD32: invalid instruction encoding
+0xEA 0x16 # GOOD64: c.slli a3, 58
+0xEE 0x16 # BAD32: invalid instruction encoding
+0xEE 0x16 # GOOD64: c.slli a3, 59
+0xF2 0x16 # BAD32: invalid instruction encoding
+0xF2 0x16 # GOOD64: c.slli a3, 60
+0xF6 0x16 # BAD32: invalid instruction encoding
+0xF6 0x16 # GOOD64: c.slli a3, 61
+0xFA 0x16 # BAD32: invalid instruction encoding
+0xFA 0x16 # GOOD64: c.slli a3, 62
+0xFE 0x16 # BAD32: invalid instruction encoding
+0xFE 0x16 # GOOD64: c.slli a3, 63
+0x06 0x07 # GOOD: c.slli a4, 1
+0x0A 0x07 # GOOD: c.slli a4, 2
+0x0E 0x07 # GOOD: c.slli a4, 3
+0x12 0x07 # GOOD: c.slli a4, 4
+0x16 0x07 # GOOD: c.slli a4, 5
+0x1A 0x07 # GOOD: c.slli a4, 6
+0x1E 0x07 # GOOD: c.slli a4, 7
+0x22 0x07 # GOOD: c.slli a4, 8
+0x26 0x07 # GOOD: c.slli a4, 9
+0x2A 0x07 # GOOD: c.slli a4, 10
+0x2E 0x07 # GOOD: c.slli a4, 11
+0x32 0x07 # GOOD: c.slli a4, 12
+0x36 0x07 # GOOD: c.slli a4, 13
+0x3A 0x07 # GOOD: c.slli a4, 14
+0x3E 0x07 # GOOD: c.slli a4, 15
+0x42 0x07 # GOOD: c.slli a4, 16
+0x46 0x07 # GOOD: c.slli a4, 17
+0x4A 0x07 # GOOD: c.slli a4, 18
+0x4E 0x07 # GOOD: c.slli a4, 19
+0x52 0x07 # GOOD: c.slli a4, 20
+0x56 0x07 # GOOD: c.slli a4, 21
+0x5A 0x07 # GOOD: c.slli a4, 22
+0x5E 0x07 # GOOD: c.slli a4, 23
+0x62 0x07 # GOOD: c.slli a4, 24
+0x66 0x07 # GOOD: c.slli a4, 25
+0x6A 0x07 # GOOD: c.slli a4, 26
+0x6E 0x07 # GOOD: c.slli a4, 27
+0x72 0x07 # GOOD: c.slli a4, 28
+0x76 0x07 # GOOD: c.slli a4, 29
+0x7A 0x07 # GOOD: c.slli a4, 30
+0x7E 0x07 # GOOD: c.slli a4, 31
+0x02 0x17 # BAD32: invalid instruction encoding
+0x02 0x17 # GOOD64: c.slli a4, 32
+0x06 0x17 # BAD32: invalid instruction encoding
+0x06 0x17 # GOOD64: c.slli a4, 33
+0x0A 0x17 # BAD32: invalid instruction encoding
+0x0A 0x17 # GOOD64: c.slli a4, 34
+0x0E 0x17 # BAD32: invalid instruction encoding
+0x0E 0x17 # GOOD64: c.slli a4, 35
+0x12 0x17 # BAD32: invalid instruction encoding
+0x12 0x17 # GOOD64: c.slli a4, 36
+0x16 0x17 # BAD32: invalid instruction encoding
+0x16 0x17 # GOOD64: c.slli a4, 37
+0x1A 0x17 # BAD32: invalid instruction encoding
+0x1A 0x17 # GOOD64: c.slli a4, 38
+0x1E 0x17 # BAD32: invalid instruction encoding
+0x1E 0x17 # GOOD64: c.slli a4, 39
+0x22 0x17 # BAD32: invalid instruction encoding
+0x22 0x17 # GOOD64: c.slli a4, 40
+0x26 0x17 # BAD32: invalid instruction encoding
+0x26 0x17 # GOOD64: c.slli a4, 41
+0x2A 0x17 # BAD32: invalid instruction encoding
+0x2A 0x17 # GOOD64: c.slli a4, 42
+0x2E 0x17 # BAD32: invalid instruction encoding
+0x2E 0x17 # GOOD64: c.slli a4, 43
+0x32 0x17 # BAD32: invalid instruction encoding
+0x32 0x17 # GOOD64: c.slli a4, 44
+0x36 0x17 # BAD32: invalid instruction encoding
+0x36 0x17 # GOOD64: c.slli a4, 45
+0x3A 0x17 # BAD32: invalid instruction encoding
+0x3A 0x17 # GOOD64: c.slli a4, 46
+0x3E 0x17 # BAD32: invalid instruction encoding
+0x3E 0x17 # GOOD64: c.slli a4, 47
+0x42 0x17 # BAD32: invalid instruction encoding
+0x42 0x17 # GOOD64: c.slli a4, 48
+0x46 0x17 # BAD32: invalid instruction encoding
+0x46 0x17 # GOOD64: c.slli a4, 49
+0x4A 0x17 # BAD32: invalid instruction encoding
+0x4A 0x17 # GOOD64: c.slli a4, 50
+0x4E 0x17 # BAD32: invalid instruction encoding
+0x4E 0x17 # GOOD64: c.slli a4, 51
+0x52 0x17 # BAD32: invalid instruction encoding
+0x52 0x17 # GOOD64: c.slli a4, 52
+0x56 0x17 # BAD32: invalid instruction encoding
+0x56 0x17 # GOOD64: c.slli a4, 53
+0x5A 0x17 # BAD32: invalid instruction encoding
+0x5A 0x17 # GOOD64: c.slli a4, 54
+0x5E 0x17 # BAD32: invalid instruction encoding
+0x5E 0x17 # GOOD64: c.slli a4, 55
+0x62 0x17 # BAD32: invalid instruction encoding
+0x62 0x17 # GOOD64: c.slli a4, 56
+0x66 0x17 # BAD32: invalid instruction encoding
+0x66 0x17 # GOOD64: c.slli a4, 57
+0x6A 0x17 # BAD32: invalid instruction encoding
+0x6A 0x17 # GOOD64: c.slli a4, 58
+0x6E 0x17 # BAD32: invalid instruction encoding
+0x6E 0x17 # GOOD64: c.slli a4, 59
+0x72 0x17 # BAD32: invalid instruction encoding
+0x72 0x17 # GOOD64: c.slli a4, 60
+0x76 0x17 # BAD32: invalid instruction encoding
+0x76 0x17 # GOOD64: c.slli a4, 61
+0x7A 0x17 # BAD32: invalid instruction encoding
+0x7A 0x17 # GOOD64: c.slli a4, 62
+0x7E 0x17 # BAD32: invalid instruction encoding
+0x7E 0x17 # GOOD64: c.slli a4, 63
+0x86 0x07 # GOOD: c.slli a5, 1
+0x8A 0x07 # GOOD: c.slli a5, 2
+0x8E 0x07 # GOOD: c.slli a5, 3
+0x92 0x07 # GOOD: c.slli a5, 4
+0x96 0x07 # GOOD: c.slli a5, 5
+0x9A 0x07 # GOOD: c.slli a5, 6
+0x9E 0x07 # GOOD: c.slli a5, 7
+0xA2 0x07 # GOOD: c.slli a5, 8
+0xA6 0x07 # GOOD: c.slli a5, 9
+0xAA 0x07 # GOOD: c.slli a5, 10
+0xAE 0x07 # GOOD: c.slli a5, 11
+0xB2 0x07 # GOOD: c.slli a5, 12
+0xB6 0x07 # GOOD: c.slli a5, 13
+0xBA 0x07 # GOOD: c.slli a5, 14
+0xBE 0x07 # GOOD: c.slli a5, 15
+0xC2 0x07 # GOOD: c.slli a5, 16
+0xC6 0x07 # GOOD: c.slli a5, 17
+0xCA 0x07 # GOOD: c.slli a5, 18
+0xCE 0x07 # GOOD: c.slli a5, 19
+0xD2 0x07 # GOOD: c.slli a5, 20
+0xD6 0x07 # GOOD: c.slli a5, 21
+0xDA 0x07 # GOOD: c.slli a5, 22
+0xDE 0x07 # GOOD: c.slli a5, 23
+0xE2 0x07 # GOOD: c.slli a5, 24
+0xE6 0x07 # GOOD: c.slli a5, 25
+0xEA 0x07 # GOOD: c.slli a5, 26
+0xEE 0x07 # GOOD: c.slli a5, 27
+0xF2 0x07 # GOOD: c.slli a5, 28
+0xF6 0x07 # GOOD: c.slli a5, 29
+0xFA 0x07 # GOOD: c.slli a5, 30
+0xFE 0x07 # GOOD: c.slli a5, 31
+0x82 0x17 # BAD32: invalid instruction encoding
+0x82 0x17 # GOOD64: c.slli a5, 32
+0x86 0x17 # BAD32: invalid instruction encoding
+0x86 0x17 # GOOD64: c.slli a5, 33
+0x8A 0x17 # BAD32: invalid instruction encoding
+0x8A 0x17 # GOOD64: c.slli a5, 34
+0x8E 0x17 # BAD32: invalid instruction encoding
+0x8E 0x17 # GOOD64: c.slli a5, 35
+0x92 0x17 # BAD32: invalid instruction encoding
+0x92 0x17 # GOOD64: c.slli a5, 36
+0x96 0x17 # BAD32: invalid instruction encoding
+0x96 0x17 # GOOD64: c.slli a5, 37
+0x9A 0x17 # BAD32: invalid instruction encoding
+0x9A 0x17 # GOOD64: c.slli a5, 38
+0x9E 0x17 # BAD32: invalid instruction encoding
+0x9E 0x17 # GOOD64: c.slli a5, 39
+0xA2 0x17 # BAD32: invalid instruction encoding
+0xA2 0x17 # GOOD64: c.slli a5, 40
+0xA6 0x17 # BAD32: invalid instruction encoding
+0xA6 0x17 # GOOD64: c.slli a5, 41
+0xAA 0x17 # BAD32: invalid instruction encoding
+0xAA 0x17 # GOOD64: c.slli a5, 42
+0xAE 0x17 # BAD32: invalid instruction encoding
+0xAE 0x17 # GOOD64: c.slli a5, 43
+0xB2 0x17 # BAD32: invalid instruction encoding
+0xB2 0x17 # GOOD64: c.slli a5, 44
+0xB6 0x17 # BAD32: invalid instruction encoding
+0xB6 0x17 # GOOD64: c.slli a5, 45
+0xBA 0x17 # BAD32: invalid instruction encoding
+0xBA 0x17 # GOOD64: c.slli a5, 46
+0xBE 0x17 # BAD32: invalid instruction encoding
+0xBE 0x17 # GOOD64: c.slli a5, 47
+0xC2 0x17 # BAD32: invalid instruction encoding
+0xC2 0x17 # GOOD64: c.slli a5, 48
+0xC6 0x17 # BAD32: invalid instruction encoding
+0xC6 0x17 # GOOD64: c.slli a5, 49
+0xCA 0x17 # BAD32: invalid instruction encoding
+0xCA 0x17 # GOOD64: c.slli a5, 50
+0xCE 0x17 # BAD32: invalid instruction encoding
+0xCE 0x17 # GOOD64: c.slli a5, 51
+0xD2 0x17 # BAD32: invalid instruction encoding
+0xD2 0x17 # GOOD64: c.slli a5, 52
+0xD6 0x17 # BAD32: invalid instruction encoding
+0xD6 0x17 # GOOD64: c.slli a5, 53
+0xDA 0x17 # BAD32: invalid instruction encoding
+0xDA 0x17 # GOOD64: c.slli a5, 54
+0xDE 0x17 # BAD32: invalid instruction encoding
+0xDE 0x17 # GOOD64: c.slli a5, 55
+0xE2 0x17 # BAD32: invalid instruction encoding
+0xE2 0x17 # GOOD64: c.slli a5, 56
+0xE6 0x17 # BAD32: invalid instruction encoding
+0xE6 0x17 # GOOD64: c.slli a5, 57
+0xEA 0x17 # BAD32: invalid instruction encoding
+0xEA 0x17 # GOOD64: c.slli a5, 58
+0xEE 0x17 # BAD32: invalid instruction encoding
+0xEE 0x17 # GOOD64: c.slli a5, 59
+0xF2 0x17 # BAD32: invalid instruction encoding
+0xF2 0x17 # GOOD64: c.slli a5, 60
+0xF6 0x17 # BAD32: invalid instruction encoding
+0xF6 0x17 # GOOD64: c.slli a5, 61
+0xFA 0x17 # BAD32: invalid instruction encoding
+0xFA 0x17 # GOOD64: c.slli a5, 62
+0xFE 0x17 # BAD32: invalid instruction encoding
+0xFE 0x17 # GOOD64: c.slli a5, 63
+0x06 0x08 # GOOD: c.slli a6, 1
+0x0A 0x08 # GOOD: c.slli a6, 2
+0x0E 0x08 # GOOD: c.slli a6, 3
+0x12 0x08 # GOOD: c.slli a6, 4
+0x16 0x08 # GOOD: c.slli a6, 5
+0x1A 0x08 # GOOD: c.slli a6, 6
+0x1E 0x08 # GOOD: c.slli a6, 7
+0x22 0x08 # GOOD: c.slli a6, 8
+0x26 0x08 # GOOD: c.slli a6, 9
+0x2A 0x08 # GOOD: c.slli a6, 10
+0x2E 0x08 # GOOD: c.slli a6, 11
+0x32 0x08 # GOOD: c.slli a6, 12
+0x36 0x08 # GOOD: c.slli a6, 13
+0x3A 0x08 # GOOD: c.slli a6, 14
+0x3E 0x08 # GOOD: c.slli a6, 15
+0x42 0x08 # GOOD: c.slli a6, 16
+0x46 0x08 # GOOD: c.slli a6, 17
+0x4A 0x08 # GOOD: c.slli a6, 18
+0x4E 0x08 # GOOD: c.slli a6, 19
+0x52 0x08 # GOOD: c.slli a6, 20
+0x56 0x08 # GOOD: c.slli a6, 21
+0x5A 0x08 # GOOD: c.slli a6, 22
+0x5E 0x08 # GOOD: c.slli a6, 23
+0x62 0x08 # GOOD: c.slli a6, 24
+0x66 0x08 # GOOD: c.slli a6, 25
+0x6A 0x08 # GOOD: c.slli a6, 26
+0x6E 0x08 # GOOD: c.slli a6, 27
+0x72 0x08 # GOOD: c.slli a6, 28
+0x76 0x08 # GOOD: c.slli a6, 29
+0x7A 0x08 # GOOD: c.slli a6, 30
+0x7E 0x08 # GOOD: c.slli a6, 31
+0x02 0x18 # BAD32: invalid instruction encoding
+0x02 0x18 # GOOD64: c.slli a6, 32
+0x06 0x18 # BAD32: invalid instruction encoding
+0x06 0x18 # GOOD64: c.slli a6, 33
+0x0A 0x18 # BAD32: invalid instruction encoding
+0x0A 0x18 # GOOD64: c.slli a6, 34
+0x0E 0x18 # BAD32: invalid instruction encoding
+0x0E 0x18 # GOOD64: c.slli a6, 35
+0x12 0x18 # BAD32: invalid instruction encoding
+0x12 0x18 # GOOD64: c.slli a6, 36
+0x16 0x18 # BAD32: invalid instruction encoding
+0x16 0x18 # GOOD64: c.slli a6, 37
+0x1A 0x18 # BAD32: invalid instruction encoding
+0x1A 0x18 # GOOD64: c.slli a6, 38
+0x1E 0x18 # BAD32: invalid instruction encoding
+0x1E 0x18 # GOOD64: c.slli a6, 39
+0x22 0x18 # BAD32: invalid instruction encoding
+0x22 0x18 # GOOD64: c.slli a6, 40
+0x26 0x18 # BAD32: invalid instruction encoding
+0x26 0x18 # GOOD64: c.slli a6, 41
+0x2A 0x18 # BAD32: invalid instruction encoding
+0x2A 0x18 # GOOD64: c.slli a6, 42
+0x2E 0x18 # BAD32: invalid instruction encoding
+0x2E 0x18 # GOOD64: c.slli a6, 43
+0x32 0x18 # BAD32: invalid instruction encoding
+0x32 0x18 # GOOD64: c.slli a6, 44
+0x36 0x18 # BAD32: invalid instruction encoding
+0x36 0x18 # GOOD64: c.slli a6, 45
+0x3A 0x18 # BAD32: invalid instruction encoding
+0x3A 0x18 # GOOD64: c.slli a6, 46
+0x3E 0x18 # BAD32: invalid instruction encoding
+0x3E 0x18 # GOOD64: c.slli a6, 47
+0x42 0x18 # BAD32: invalid instruction encoding
+0x42 0x18 # GOOD64: c.slli a6, 48
+0x46 0x18 # BAD32: invalid instruction encoding
+0x46 0x18 # GOOD64: c.slli a6, 49
+0x4A 0x18 # BAD32: invalid instruction encoding
+0x4A 0x18 # GOOD64: c.slli a6, 50
+0x4E 0x18 # BAD32: invalid instruction encoding
+0x4E 0x18 # GOOD64: c.slli a6, 51
+0x52 0x18 # BAD32: invalid instruction encoding
+0x52 0x18 # GOOD64: c.slli a6, 52
+0x56 0x18 # BAD32: invalid instruction encoding
+0x56 0x18 # GOOD64: c.slli a6, 53
+0x5A 0x18 # BAD32: invalid instruction encoding
+0x5A 0x18 # GOOD64: c.slli a6, 54
+0x5E 0x18 # BAD32: invalid instruction encoding
+0x5E 0x18 # GOOD64: c.slli a6, 55
+0x62 0x18 # BAD32: invalid instruction encoding
+0x62 0x18 # GOOD64: c.slli a6, 56
+0x66 0x18 # BAD32: invalid instruction encoding
+0x66 0x18 # GOOD64: c.slli a6, 57
+0x6A 0x18 # BAD32: invalid instruction encoding
+0x6A 0x18 # GOOD64: c.slli a6, 58
+0x6E 0x18 # BAD32: invalid instruction encoding
+0x6E 0x18 # GOOD64: c.slli a6, 59
+0x72 0x18 # BAD32: invalid instruction encoding
+0x72 0x18 # GOOD64: c.slli a6, 60
+0x76 0x18 # BAD32: invalid instruction encoding
+0x76 0x18 # GOOD64: c.slli a6, 61
+0x7A 0x18 # BAD32: invalid instruction encoding
+0x7A 0x18 # GOOD64: c.slli a6, 62
+0x7E 0x18 # BAD32: invalid instruction encoding
+0x7E 0x18 # GOOD64: c.slli a6, 63
+0x86 0x08 # GOOD: c.slli a7, 1
+0x8A 0x08 # GOOD: c.slli a7, 2
+0x8E 0x08 # GOOD: c.slli a7, 3
+0x92 0x08 # GOOD: c.slli a7, 4
+0x96 0x08 # GOOD: c.slli a7, 5
+0x9A 0x08 # GOOD: c.slli a7, 6
+0x9E 0x08 # GOOD: c.slli a7, 7
+0xA2 0x08 # GOOD: c.slli a7, 8
+0xA6 0x08 # GOOD: c.slli a7, 9
+0xAA 0x08 # GOOD: c.slli a7, 10
+0xAE 0x08 # GOOD: c.slli a7, 11
+0xB2 0x08 # GOOD: c.slli a7, 12
+0xB6 0x08 # GOOD: c.slli a7, 13
+0xBA 0x08 # GOOD: c.slli a7, 14
+0xBE 0x08 # GOOD: c.slli a7, 15
+0xC2 0x08 # GOOD: c.slli a7, 16
+0xC6 0x08 # GOOD: c.slli a7, 17
+0xCA 0x08 # GOOD: c.slli a7, 18
+0xCE 0x08 # GOOD: c.slli a7, 19
+0xD2 0x08 # GOOD: c.slli a7, 20
+0xD6 0x08 # GOOD: c.slli a7, 21
+0xDA 0x08 # GOOD: c.slli a7, 22
+0xDE 0x08 # GOOD: c.slli a7, 23
+0xE2 0x08 # GOOD: c.slli a7, 24
+0xE6 0x08 # GOOD: c.slli a7, 25
+0xEA 0x08 # GOOD: c.slli a7, 26
+0xEE 0x08 # GOOD: c.slli a7, 27
+0xF2 0x08 # GOOD: c.slli a7, 28
+0xF6 0x08 # GOOD: c.slli a7, 29
+0xFA 0x08 # GOOD: c.slli a7, 30
+0xFE 0x08 # GOOD: c.slli a7, 31
+0x82 0x18 # BAD32: invalid instruction encoding
+0x82 0x18 # GOOD64: c.slli a7, 32
+0x86 0x18 # BAD32: invalid instruction encoding
+0x86 0x18 # GOOD64: c.slli a7, 33
+0x8A 0x18 # BAD32: invalid instruction encoding
+0x8A 0x18 # GOOD64: c.slli a7, 34
+0x8E 0x18 # BAD32: invalid instruction encoding
+0x8E 0x18 # GOOD64: c.slli a7, 35
+0x92 0x18 # BAD32: invalid instruction encoding
+0x92 0x18 # GOOD64: c.slli a7, 36
+0x96 0x18 # BAD32: invalid instruction encoding
+0x96 0x18 # GOOD64: c.slli a7, 37
+0x9A 0x18 # BAD32: invalid instruction encoding
+0x9A 0x18 # GOOD64: c.slli a7, 38
+0x9E 0x18 # BAD32: invalid instruction encoding
+0x9E 0x18 # GOOD64: c.slli a7, 39
+0xA2 0x18 # BAD32: invalid instruction encoding
+0xA2 0x18 # GOOD64: c.slli a7, 40
+0xA6 0x18 # BAD32: invalid instruction encoding
+0xA6 0x18 # GOOD64: c.slli a7, 41
+0xAA 0x18 # BAD32: invalid instruction encoding
+0xAA 0x18 # GOOD64: c.slli a7, 42
+0xAE 0x18 # BAD32: invalid instruction encoding
+0xAE 0x18 # GOOD64: c.slli a7, 43
+0xB2 0x18 # BAD32: invalid instruction encoding
+0xB2 0x18 # GOOD64: c.slli a7, 44
+0xB6 0x18 # BAD32: invalid instruction encoding
+0xB6 0x18 # GOOD64: c.slli a7, 45
+0xBA 0x18 # BAD32: invalid instruction encoding
+0xBA 0x18 # GOOD64: c.slli a7, 46
+0xBE 0x18 # BAD32: invalid instruction encoding
+0xBE 0x18 # GOOD64: c.slli a7, 47
+0xC2 0x18 # BAD32: invalid instruction encoding
+0xC2 0x18 # GOOD64: c.slli a7, 48
+0xC6 0x18 # BAD32: invalid instruction encoding
+0xC6 0x18 # GOOD64: c.slli a7, 49
+0xCA 0x18 # BAD32: invalid instruction encoding
+0xCA 0x18 # GOOD64: c.slli a7, 50
+0xCE 0x18 # BAD32: invalid instruction encoding
+0xCE 0x18 # GOOD64: c.slli a7, 51
+0xD2 0x18 # BAD32: invalid instruction encoding
+0xD2 0x18 # GOOD64: c.slli a7, 52
+0xD6 0x18 # BAD32: invalid instruction encoding
+0xD6 0x18 # GOOD64: c.slli a7, 53
+0xDA 0x18 # BAD32: invalid instruction encoding
+0xDA 0x18 # GOOD64: c.slli a7, 54
+0xDE 0x18 # BAD32: invalid instruction encoding
+0xDE 0x18 # GOOD64: c.slli a7, 55
+0xE2 0x18 # BAD32: invalid instruction encoding
+0xE2 0x18 # GOOD64: c.slli a7, 56
+0xE6 0x18 # BAD32: invalid instruction encoding
+0xE6 0x18 # GOOD64: c.slli a7, 57
+0xEA 0x18 # BAD32: invalid instruction encoding
+0xEA 0x18 # GOOD64: c.slli a7, 58
+0xEE 0x18 # BAD32: invalid instruction encoding
+0xEE 0x18 # GOOD64: c.slli a7, 59
+0xF2 0x18 # BAD32: invalid instruction encoding
+0xF2 0x18 # GOOD64: c.slli a7, 60
+0xF6 0x18 # BAD32: invalid instruction encoding
+0xF6 0x18 # GOOD64: c.slli a7, 61
+0xFA 0x18 # BAD32: invalid instruction encoding
+0xFA 0x18 # GOOD64: c.slli a7, 62
+0xFE 0x18 # BAD32: invalid instruction encoding
+0xFE 0x18 # GOOD64: c.slli a7, 63
+0x06 0x09 # GOOD: c.slli s2, 1
+0x0A 0x09 # GOOD: c.slli s2, 2
+0x0E 0x09 # GOOD: c.slli s2, 3
+0x12 0x09 # GOOD: c.slli s2, 4
+0x16 0x09 # GOOD: c.slli s2, 5
+0x1A 0x09 # GOOD: c.slli s2, 6
+0x1E 0x09 # GOOD: c.slli s2, 7
+0x22 0x09 # GOOD: c.slli s2, 8
+0x26 0x09 # GOOD: c.slli s2, 9
+0x2A 0x09 # GOOD: c.slli s2, 10
+0x2E 0x09 # GOOD: c.slli s2, 11
+0x32 0x09 # GOOD: c.slli s2, 12
+0x36 0x09 # GOOD: c.slli s2, 13
+0x3A 0x09 # GOOD: c.slli s2, 14
+0x3E 0x09 # GOOD: c.slli s2, 15
+0x42 0x09 # GOOD: c.slli s2, 16
+0x46 0x09 # GOOD: c.slli s2, 17
+0x4A 0x09 # GOOD: c.slli s2, 18
+0x4E 0x09 # GOOD: c.slli s2, 19
+0x52 0x09 # GOOD: c.slli s2, 20
+0x56 0x09 # GOOD: c.slli s2, 21
+0x5A 0x09 # GOOD: c.slli s2, 22
+0x5E 0x09 # GOOD: c.slli s2, 23
+0x62 0x09 # GOOD: c.slli s2, 24
+0x66 0x09 # GOOD: c.slli s2, 25
+0x6A 0x09 # GOOD: c.slli s2, 26
+0x6E 0x09 # GOOD: c.slli s2, 27
+0x72 0x09 # GOOD: c.slli s2, 28
+0x76 0x09 # GOOD: c.slli s2, 29
+0x7A 0x09 # GOOD: c.slli s2, 30
+0x7E 0x09 # GOOD: c.slli s2, 31
+0x02 0x19 # BAD32: invalid instruction encoding
+0x02 0x19 # GOOD64: c.slli s2, 32
+0x06 0x19 # BAD32: invalid instruction encoding
+0x06 0x19 # GOOD64: c.slli s2, 33
+0x0A 0x19 # BAD32: invalid instruction encoding
+0x0A 0x19 # GOOD64: c.slli s2, 34
+0x0E 0x19 # BAD32: invalid instruction encoding
+0x0E 0x19 # GOOD64: c.slli s2, 35
+0x12 0x19 # BAD32: invalid instruction encoding
+0x12 0x19 # GOOD64: c.slli s2, 36
+0x16 0x19 # BAD32: invalid instruction encoding
+0x16 0x19 # GOOD64: c.slli s2, 37
+0x1A 0x19 # BAD32: invalid instruction encoding
+0x1A 0x19 # GOOD64: c.slli s2, 38
+0x1E 0x19 # BAD32: invalid instruction encoding
+0x1E 0x19 # GOOD64: c.slli s2, 39
+0x22 0x19 # BAD32: invalid instruction encoding
+0x22 0x19 # GOOD64: c.slli s2, 40
+0x26 0x19 # BAD32: invalid instruction encoding
+0x26 0x19 # GOOD64: c.slli s2, 41
+0x2A 0x19 # BAD32: invalid instruction encoding
+0x2A 0x19 # GOOD64: c.slli s2, 42
+0x2E 0x19 # BAD32: invalid instruction encoding
+0x2E 0x19 # GOOD64: c.slli s2, 43
+0x32 0x19 # BAD32: invalid instruction encoding
+0x32 0x19 # GOOD64: c.slli s2, 44
+0x36 0x19 # BAD32: invalid instruction encoding
+0x36 0x19 # GOOD64: c.slli s2, 45
+0x3A 0x19 # BAD32: invalid instruction encoding
+0x3A 0x19 # GOOD64: c.slli s2, 46
+0x3E 0x19 # BAD32: invalid instruction encoding
+0x3E 0x19 # GOOD64: c.slli s2, 47
+0x42 0x19 # BAD32: invalid instruction encoding
+0x42 0x19 # GOOD64: c.slli s2, 48
+0x46 0x19 # BAD32: invalid instruction encoding
+0x46 0x19 # GOOD64: c.slli s2, 49
+0x4A 0x19 # BAD32: invalid instruction encoding
+0x4A 0x19 # GOOD64: c.slli s2, 50
+0x4E 0x19 # BAD32: invalid instruction encoding
+0x4E 0x19 # GOOD64: c.slli s2, 51
+0x52 0x19 # BAD32: invalid instruction encoding
+0x52 0x19 # GOOD64: c.slli s2, 52
+0x56 0x19 # BAD32: invalid instruction encoding
+0x56 0x19 # GOOD64: c.slli s2, 53
+0x5A 0x19 # BAD32: invalid instruction encoding
+0x5A 0x19 # GOOD64: c.slli s2, 54
+0x5E 0x19 # BAD32: invalid instruction encoding
+0x5E 0x19 # GOOD64: c.slli s2, 55
+0x62 0x19 # BAD32: invalid instruction encoding
+0x62 0x19 # GOOD64: c.slli s2, 56
+0x66 0x19 # BAD32: invalid instruction encoding
+0x66 0x19 # GOOD64: c.slli s2, 57
+0x6A 0x19 # BAD32: invalid instruction encoding
+0x6A 0x19 # GOOD64: c.slli s2, 58
+0x6E 0x19 # BAD32: invalid instruction encoding
+0x6E 0x19 # GOOD64: c.slli s2, 59
+0x72 0x19 # BAD32: invalid instruction encoding
+0x72 0x19 # GOOD64: c.slli s2, 60
+0x76 0x19 # BAD32: invalid instruction encoding
+0x76 0x19 # GOOD64: c.slli s2, 61
+0x7A 0x19 # BAD32: invalid instruction encoding
+0x7A 0x19 # GOOD64: c.slli s2, 62
+0x7E 0x19 # BAD32: invalid instruction encoding
+0x7E 0x19 # GOOD64: c.slli s2, 63
+0x86 0x09 # GOOD: c.slli s3, 1
+0x8A 0x09 # GOOD: c.slli s3, 2
+0x8E 0x09 # GOOD: c.slli s3, 3
+0x92 0x09 # GOOD: c.slli s3, 4
+0x96 0x09 # GOOD: c.slli s3, 5
+0x9A 0x09 # GOOD: c.slli s3, 6
+0x9E 0x09 # GOOD: c.slli s3, 7
+0xA2 0x09 # GOOD: c.slli s3, 8
+0xA6 0x09 # GOOD: c.slli s3, 9
+0xAA 0x09 # GOOD: c.slli s3, 10
+0xAE 0x09 # GOOD: c.slli s3, 11
+0xB2 0x09 # GOOD: c.slli s3, 12
+0xB6 0x09 # GOOD: c.slli s3, 13
+0xBA 0x09 # GOOD: c.slli s3, 14
+0xBE 0x09 # GOOD: c.slli s3, 15
+0xC2 0x09 # GOOD: c.slli s3, 16
+0xC6 0x09 # GOOD: c.slli s3, 17
+0xCA 0x09 # GOOD: c.slli s3, 18
+0xCE 0x09 # GOOD: c.slli s3, 19
+0xD2 0x09 # GOOD: c.slli s3, 20
+0xD6 0x09 # GOOD: c.slli s3, 21
+0xDA 0x09 # GOOD: c.slli s3, 22
+0xDE 0x09 # GOOD: c.slli s3, 23
+0xE2 0x09 # GOOD: c.slli s3, 24
+0xE6 0x09 # GOOD: c.slli s3, 25
+0xEA 0x09 # GOOD: c.slli s3, 26
+0xEE 0x09 # GOOD: c.slli s3, 27
+0xF2 0x09 # GOOD: c.slli s3, 28
+0xF6 0x09 # GOOD: c.slli s3, 29
+0xFA 0x09 # GOOD: c.slli s3, 30
+0xFE 0x09 # GOOD: c.slli s3, 31
+0x82 0x19 # BAD32: invalid instruction encoding
+0x82 0x19 # GOOD64: c.slli s3, 32
+0x86 0x19 # BAD32: invalid instruction encoding
+0x86 0x19 # GOOD64: c.slli s3, 33
+0x8A 0x19 # BAD32: invalid instruction encoding
+0x8A 0x19 # GOOD64: c.slli s3, 34
+0x8E 0x19 # BAD32: invalid instruction encoding
+0x8E 0x19 # GOOD64: c.slli s3, 35
+0x92 0x19 # BAD32: invalid instruction encoding
+0x92 0x19 # GOOD64: c.slli s3, 36
+0x96 0x19 # BAD32: invalid instruction encoding
+0x96 0x19 # GOOD64: c.slli s3, 37
+0x9A 0x19 # BAD32: invalid instruction encoding
+0x9A 0x19 # GOOD64: c.slli s3, 38
+0x9E 0x19 # BAD32: invalid instruction encoding
+0x9E 0x19 # GOOD64: c.slli s3, 39
+0xA2 0x19 # BAD32: invalid instruction encoding
+0xA2 0x19 # GOOD64: c.slli s3, 40
+0xA6 0x19 # BAD32: invalid instruction encoding
+0xA6 0x19 # GOOD64: c.slli s3, 41
+0xAA 0x19 # BAD32: invalid instruction encoding
+0xAA 0x19 # GOOD64: c.slli s3, 42
+0xAE 0x19 # BAD32: invalid instruction encoding
+0xAE 0x19 # GOOD64: c.slli s3, 43
+0xB2 0x19 # BAD32: invalid instruction encoding
+0xB2 0x19 # GOOD64: c.slli s3, 44
+0xB6 0x19 # BAD32: invalid instruction encoding
+0xB6 0x19 # GOOD64: c.slli s3, 45
+0xBA 0x19 # BAD32: invalid instruction encoding
+0xBA 0x19 # GOOD64: c.slli s3, 46
+0xBE 0x19 # BAD32: invalid instruction encoding
+0xBE 0x19 # GOOD64: c.slli s3, 47
+0xC2 0x19 # BAD32: invalid instruction encoding
+0xC2 0x19 # GOOD64: c.slli s3, 48
+0xC6 0x19 # BAD32: invalid instruction encoding
+0xC6 0x19 # GOOD64: c.slli s3, 49
+0xCA 0x19 # BAD32: invalid instruction encoding
+0xCA 0x19 # GOOD64: c.slli s3, 50
+0xCE 0x19 # BAD32: invalid instruction encoding
+0xCE 0x19 # GOOD64: c.slli s3, 51
+0xD2 0x19 # BAD32: invalid instruction encoding
+0xD2 0x19 # GOOD64: c.slli s3, 52
+0xD6 0x19 # BAD32: invalid instruction encoding
+0xD6 0x19 # GOOD64: c.slli s3, 53
+0xDA 0x19 # BAD32: invalid instruction encoding
+0xDA 0x19 # GOOD64: c.slli s3, 54
+0xDE 0x19 # BAD32: invalid instruction encoding
+0xDE 0x19 # GOOD64: c.slli s3, 55
+0xE2 0x19 # BAD32: invalid instruction encoding
+0xE2 0x19 # GOOD64: c.slli s3, 56
+0xE6 0x19 # BAD32: invalid instruction encoding
+0xE6 0x19 # GOOD64: c.slli s3, 57
+0xEA 0x19 # BAD32: invalid instruction encoding
+0xEA 0x19 # GOOD64: c.slli s3, 58
+0xEE 0x19 # BAD32: invalid instruction encoding
+0xEE 0x19 # GOOD64: c.slli s3, 59
+0xF2 0x19 # BAD32: invalid instruction encoding
+0xF2 0x19 # GOOD64: c.slli s3, 60
+0xF6 0x19 # BAD32: invalid instruction encoding
+0xF6 0x19 # GOOD64: c.slli s3, 61
+0xFA 0x19 # BAD32: invalid instruction encoding
+0xFA 0x19 # GOOD64: c.slli s3, 62
+0xFE 0x19 # BAD32: invalid instruction encoding
+0xFE 0x19 # GOOD64: c.slli s3, 63
+0x06 0x0A # GOOD: c.slli s4, 1
+0x0A 0x0A # GOOD: c.slli s4, 2
+0x0E 0x0A # GOOD: c.slli s4, 3
+0x12 0x0A # GOOD: c.slli s4, 4
+0x16 0x0A # GOOD: c.slli s4, 5
+0x1A 0x0A # GOOD: c.slli s4, 6
+0x1E 0x0A # GOOD: c.slli s4, 7
+0x22 0x0A # GOOD: c.slli s4, 8
+0x26 0x0A # GOOD: c.slli s4, 9
+0x2A 0x0A # GOOD: c.slli s4, 10
+0x2E 0x0A # GOOD: c.slli s4, 11
+0x32 0x0A # GOOD: c.slli s4, 12
+0x36 0x0A # GOOD: c.slli s4, 13
+0x3A 0x0A # GOOD: c.slli s4, 14
+0x3E 0x0A # GOOD: c.slli s4, 15
+0x42 0x0A # GOOD: c.slli s4, 16
+0x46 0x0A # GOOD: c.slli s4, 17
+0x4A 0x0A # GOOD: c.slli s4, 18
+0x4E 0x0A # GOOD: c.slli s4, 19
+0x52 0x0A # GOOD: c.slli s4, 20
+0x56 0x0A # GOOD: c.slli s4, 21
21 +0x5A 0x0A # GOOD: c.slli s4, 22 +0x5E 0x0A # GOOD: c.slli s4, 23 +0x62 0x0A # GOOD: c.slli s4, 24 +0x66 0x0A # GOOD: c.slli s4, 25 +0x6A 0x0A # GOOD: c.slli s4, 26 +0x6E 0x0A # GOOD: c.slli s4, 27 +0x72 0x0A # GOOD: c.slli s4, 28 +0x76 0x0A # GOOD: c.slli s4, 29 +0x7A 0x0A # GOOD: c.slli s4, 30 +0x7E 0x0A # GOOD: c.slli s4, 31 +0x02 0x1A # BAD32: invalid instruction encoding +0x02 0x1A # GOOD64: c.slli s4, 32 +0x06 0x1A # BAD32: invalid instruction encoding +0x06 0x1A # GOOD64: c.slli s4, 33 +0x0A 0x1A # BAD32: invalid instruction encoding +0x0A 0x1A # GOOD64: c.slli s4, 34 +0x0E 0x1A # BAD32: invalid instruction encoding +0x0E 0x1A # GOOD64: c.slli s4, 35 +0x12 0x1A # BAD32: invalid instruction encoding +0x12 0x1A # GOOD64: c.slli s4, 36 +0x16 0x1A # BAD32: invalid instruction encoding +0x16 0x1A # GOOD64: c.slli s4, 37 +0x1A 0x1A # BAD32: invalid instruction encoding +0x1A 0x1A # GOOD64: c.slli s4, 38 +0x1E 0x1A # BAD32: invalid instruction encoding +0x1E 0x1A # GOOD64: c.slli s4, 39 +0x22 0x1A # BAD32: invalid instruction encoding +0x22 0x1A # GOOD64: c.slli s4, 40 +0x26 0x1A # BAD32: invalid instruction encoding +0x26 0x1A # GOOD64: c.slli s4, 41 +0x2A 0x1A # BAD32: invalid instruction encoding +0x2A 0x1A # GOOD64: c.slli s4, 42 +0x2E 0x1A # BAD32: invalid instruction encoding +0x2E 0x1A # GOOD64: c.slli s4, 43 +0x32 0x1A # BAD32: invalid instruction encoding +0x32 0x1A # GOOD64: c.slli s4, 44 +0x36 0x1A # BAD32: invalid instruction encoding +0x36 0x1A # GOOD64: c.slli s4, 45 +0x3A 0x1A # BAD32: invalid instruction encoding +0x3A 0x1A # GOOD64: c.slli s4, 46 +0x3E 0x1A # BAD32: invalid instruction encoding +0x3E 0x1A # GOOD64: c.slli s4, 47 +0x42 0x1A # BAD32: invalid instruction encoding +0x42 0x1A # GOOD64: c.slli s4, 48 +0x46 0x1A # BAD32: invalid instruction encoding +0x46 0x1A # GOOD64: c.slli s4, 49 +0x4A 0x1A # BAD32: invalid instruction encoding +0x4A 0x1A # GOOD64: c.slli s4, 50 +0x4E 0x1A # BAD32: invalid instruction encoding +0x4E 0x1A # GOOD64: c.slli s4, 51 +0x52 0x1A # BAD32: invalid instruction encoding +0x52 0x1A # GOOD64: c.slli s4, 52 +0x56 0x1A # BAD32: invalid instruction encoding +0x56 0x1A # GOOD64: c.slli s4, 53 +0x5A 0x1A # BAD32: invalid instruction encoding +0x5A 0x1A # GOOD64: c.slli s4, 54 +0x5E 0x1A # BAD32: invalid instruction encoding +0x5E 0x1A # GOOD64: c.slli s4, 55 +0x62 0x1A # BAD32: invalid instruction encoding +0x62 0x1A # GOOD64: c.slli s4, 56 +0x66 0x1A # BAD32: invalid instruction encoding +0x66 0x1A # GOOD64: c.slli s4, 57 +0x6A 0x1A # BAD32: invalid instruction encoding +0x6A 0x1A # GOOD64: c.slli s4, 58 +0x6E 0x1A # BAD32: invalid instruction encoding +0x6E 0x1A # GOOD64: c.slli s4, 59 +0x72 0x1A # BAD32: invalid instruction encoding +0x72 0x1A # GOOD64: c.slli s4, 60 +0x76 0x1A # BAD32: invalid instruction encoding +0x76 0x1A # GOOD64: c.slli s4, 61 +0x7A 0x1A # BAD32: invalid instruction encoding +0x7A 0x1A # GOOD64: c.slli s4, 62 +0x7E 0x1A # BAD32: invalid instruction encoding +0x7E 0x1A # GOOD64: c.slli s4, 63 +0x86 0x0A # GOOD: c.slli s5, 1 +0x8A 0x0A # GOOD: c.slli s5, 2 +0x8E 0x0A # GOOD: c.slli s5, 3 +0x92 0x0A # GOOD: c.slli s5, 4 +0x96 0x0A # GOOD: c.slli s5, 5 +0x9A 0x0A # GOOD: c.slli s5, 6 +0x9E 0x0A # GOOD: c.slli s5, 7 +0xA2 0x0A # GOOD: c.slli s5, 8 +0xA6 0x0A # GOOD: c.slli s5, 9 +0xAA 0x0A # GOOD: c.slli s5, 10 +0xAE 0x0A # GOOD: c.slli s5, 11 +0xB2 0x0A # GOOD: c.slli s5, 12 +0xB6 0x0A # GOOD: c.slli s5, 13 +0xBA 0x0A # GOOD: c.slli s5, 14 +0xBE 0x0A # GOOD: c.slli s5, 15 +0xC2 0x0A # GOOD: c.slli s5, 16 +0xC6 0x0A # 
GOOD: c.slli s5, 17 +0xCA 0x0A # GOOD: c.slli s5, 18 +0xCE 0x0A # GOOD: c.slli s5, 19 +0xD2 0x0A # GOOD: c.slli s5, 20 +0xD6 0x0A # GOOD: c.slli s5, 21 +0xDA 0x0A # GOOD: c.slli s5, 22 +0xDE 0x0A # GOOD: c.slli s5, 23 +0xE2 0x0A # GOOD: c.slli s5, 24 +0xE6 0x0A # GOOD: c.slli s5, 25 +0xEA 0x0A # GOOD: c.slli s5, 26 +0xEE 0x0A # GOOD: c.slli s5, 27 +0xF2 0x0A # GOOD: c.slli s5, 28 +0xF6 0x0A # GOOD: c.slli s5, 29 +0xFA 0x0A # GOOD: c.slli s5, 30 +0xFE 0x0A # GOOD: c.slli s5, 31 +0x82 0x1A # BAD32: invalid instruction encoding +0x82 0x1A # GOOD64: c.slli s5, 32 +0x86 0x1A # BAD32: invalid instruction encoding +0x86 0x1A # GOOD64: c.slli s5, 33 +0x8A 0x1A # BAD32: invalid instruction encoding +0x8A 0x1A # GOOD64: c.slli s5, 34 +0x8E 0x1A # BAD32: invalid instruction encoding +0x8E 0x1A # GOOD64: c.slli s5, 35 +0x92 0x1A # BAD32: invalid instruction encoding +0x92 0x1A # GOOD64: c.slli s5, 36 +0x96 0x1A # BAD32: invalid instruction encoding +0x96 0x1A # GOOD64: c.slli s5, 37 +0x9A 0x1A # BAD32: invalid instruction encoding +0x9A 0x1A # GOOD64: c.slli s5, 38 +0x9E 0x1A # BAD32: invalid instruction encoding +0x9E 0x1A # GOOD64: c.slli s5, 39 +0xA2 0x1A # BAD32: invalid instruction encoding +0xA2 0x1A # GOOD64: c.slli s5, 40 +0xA6 0x1A # BAD32: invalid instruction encoding +0xA6 0x1A # GOOD64: c.slli s5, 41 +0xAA 0x1A # BAD32: invalid instruction encoding +0xAA 0x1A # GOOD64: c.slli s5, 42 +0xAE 0x1A # BAD32: invalid instruction encoding +0xAE 0x1A # GOOD64: c.slli s5, 43 +0xB2 0x1A # BAD32: invalid instruction encoding +0xB2 0x1A # GOOD64: c.slli s5, 44 +0xB6 0x1A # BAD32: invalid instruction encoding +0xB6 0x1A # GOOD64: c.slli s5, 45 +0xBA 0x1A # BAD32: invalid instruction encoding +0xBA 0x1A # GOOD64: c.slli s5, 46 +0xBE 0x1A # BAD32: invalid instruction encoding +0xBE 0x1A # GOOD64: c.slli s5, 47 +0xC2 0x1A # BAD32: invalid instruction encoding +0xC2 0x1A # GOOD64: c.slli s5, 48 +0xC6 0x1A # BAD32: invalid instruction encoding +0xC6 0x1A # GOOD64: c.slli s5, 49 +0xCA 0x1A # BAD32: invalid instruction encoding +0xCA 0x1A # GOOD64: c.slli s5, 50 +0xCE 0x1A # BAD32: invalid instruction encoding +0xCE 0x1A # GOOD64: c.slli s5, 51 +0xD2 0x1A # BAD32: invalid instruction encoding +0xD2 0x1A # GOOD64: c.slli s5, 52 +0xD6 0x1A # BAD32: invalid instruction encoding +0xD6 0x1A # GOOD64: c.slli s5, 53 +0xDA 0x1A # BAD32: invalid instruction encoding +0xDA 0x1A # GOOD64: c.slli s5, 54 +0xDE 0x1A # BAD32: invalid instruction encoding +0xDE 0x1A # GOOD64: c.slli s5, 55 +0xE2 0x1A # BAD32: invalid instruction encoding +0xE2 0x1A # GOOD64: c.slli s5, 56 +0xE6 0x1A # BAD32: invalid instruction encoding +0xE6 0x1A # GOOD64: c.slli s5, 57 +0xEA 0x1A # BAD32: invalid instruction encoding +0xEA 0x1A # GOOD64: c.slli s5, 58 +0xEE 0x1A # BAD32: invalid instruction encoding +0xEE 0x1A # GOOD64: c.slli s5, 59 +0xF2 0x1A # BAD32: invalid instruction encoding +0xF2 0x1A # GOOD64: c.slli s5, 60 +0xF6 0x1A # BAD32: invalid instruction encoding +0xF6 0x1A # GOOD64: c.slli s5, 61 +0xFA 0x1A # BAD32: invalid instruction encoding +0xFA 0x1A # GOOD64: c.slli s5, 62 +0xFE 0x1A # BAD32: invalid instruction encoding +0xFE 0x1A # GOOD64: c.slli s5, 63 +0x06 0x0B # GOOD: c.slli s6, 1 +0x0A 0x0B # GOOD: c.slli s6, 2 +0x0E 0x0B # GOOD: c.slli s6, 3 +0x12 0x0B # GOOD: c.slli s6, 4 +0x16 0x0B # GOOD: c.slli s6, 5 +0x1A 0x0B # GOOD: c.slli s6, 6 +0x1E 0x0B # GOOD: c.slli s6, 7 +0x22 0x0B # GOOD: c.slli s6, 8 +0x26 0x0B # GOOD: c.slli s6, 9 +0x2A 0x0B # GOOD: c.slli s6, 10 +0x2E 0x0B # GOOD: c.slli s6, 11 +0x32 0x0B # GOOD: c.slli s6, 
12 +0x36 0x0B # GOOD: c.slli s6, 13 +0x3A 0x0B # GOOD: c.slli s6, 14 +0x3E 0x0B # GOOD: c.slli s6, 15 +0x42 0x0B # GOOD: c.slli s6, 16 +0x46 0x0B # GOOD: c.slli s6, 17 +0x4A 0x0B # GOOD: c.slli s6, 18 +0x4E 0x0B # GOOD: c.slli s6, 19 +0x52 0x0B # GOOD: c.slli s6, 20 +0x56 0x0B # GOOD: c.slli s6, 21 +0x5A 0x0B # GOOD: c.slli s6, 22 +0x5E 0x0B # GOOD: c.slli s6, 23 +0x62 0x0B # GOOD: c.slli s6, 24 +0x66 0x0B # GOOD: c.slli s6, 25 +0x6A 0x0B # GOOD: c.slli s6, 26 +0x6E 0x0B # GOOD: c.slli s6, 27 +0x72 0x0B # GOOD: c.slli s6, 28 +0x76 0x0B # GOOD: c.slli s6, 29 +0x7A 0x0B # GOOD: c.slli s6, 30 +0x7E 0x0B # GOOD: c.slli s6, 31 +0x02 0x1B # BAD32: invalid instruction encoding +0x02 0x1B # GOOD64: c.slli s6, 32 +0x06 0x1B # BAD32: invalid instruction encoding +0x06 0x1B # GOOD64: c.slli s6, 33 +0x0A 0x1B # BAD32: invalid instruction encoding +0x0A 0x1B # GOOD64: c.slli s6, 34 +0x0E 0x1B # BAD32: invalid instruction encoding +0x0E 0x1B # GOOD64: c.slli s6, 35 +0x12 0x1B # BAD32: invalid instruction encoding +0x12 0x1B # GOOD64: c.slli s6, 36 +0x16 0x1B # BAD32: invalid instruction encoding +0x16 0x1B # GOOD64: c.slli s6, 37 +0x1A 0x1B # BAD32: invalid instruction encoding +0x1A 0x1B # GOOD64: c.slli s6, 38 +0x1E 0x1B # BAD32: invalid instruction encoding +0x1E 0x1B # GOOD64: c.slli s6, 39 +0x22 0x1B # BAD32: invalid instruction encoding +0x22 0x1B # GOOD64: c.slli s6, 40 +0x26 0x1B # BAD32: invalid instruction encoding +0x26 0x1B # GOOD64: c.slli s6, 41 +0x2A 0x1B # BAD32: invalid instruction encoding +0x2A 0x1B # GOOD64: c.slli s6, 42 +0x2E 0x1B # BAD32: invalid instruction encoding +0x2E 0x1B # GOOD64: c.slli s6, 43 +0x32 0x1B # BAD32: invalid instruction encoding +0x32 0x1B # GOOD64: c.slli s6, 44 +0x36 0x1B # BAD32: invalid instruction encoding +0x36 0x1B # GOOD64: c.slli s6, 45 +0x3A 0x1B # BAD32: invalid instruction encoding +0x3A 0x1B # GOOD64: c.slli s6, 46 +0x3E 0x1B # BAD32: invalid instruction encoding +0x3E 0x1B # GOOD64: c.slli s6, 47 +0x42 0x1B # BAD32: invalid instruction encoding +0x42 0x1B # GOOD64: c.slli s6, 48 +0x46 0x1B # BAD32: invalid instruction encoding +0x46 0x1B # GOOD64: c.slli s6, 49 +0x4A 0x1B # BAD32: invalid instruction encoding +0x4A 0x1B # GOOD64: c.slli s6, 50 +0x4E 0x1B # BAD32: invalid instruction encoding +0x4E 0x1B # GOOD64: c.slli s6, 51 +0x52 0x1B # BAD32: invalid instruction encoding +0x52 0x1B # GOOD64: c.slli s6, 52 +0x56 0x1B # BAD32: invalid instruction encoding +0x56 0x1B # GOOD64: c.slli s6, 53 +0x5A 0x1B # BAD32: invalid instruction encoding +0x5A 0x1B # GOOD64: c.slli s6, 54 +0x5E 0x1B # BAD32: invalid instruction encoding +0x5E 0x1B # GOOD64: c.slli s6, 55 +0x62 0x1B # BAD32: invalid instruction encoding +0x62 0x1B # GOOD64: c.slli s6, 56 +0x66 0x1B # BAD32: invalid instruction encoding +0x66 0x1B # GOOD64: c.slli s6, 57 +0x6A 0x1B # BAD32: invalid instruction encoding +0x6A 0x1B # GOOD64: c.slli s6, 58 +0x6E 0x1B # BAD32: invalid instruction encoding +0x6E 0x1B # GOOD64: c.slli s6, 59 +0x72 0x1B # BAD32: invalid instruction encoding +0x72 0x1B # GOOD64: c.slli s6, 60 +0x76 0x1B # BAD32: invalid instruction encoding +0x76 0x1B # GOOD64: c.slli s6, 61 +0x7A 0x1B # BAD32: invalid instruction encoding +0x7A 0x1B # GOOD64: c.slli s6, 62 +0x7E 0x1B # BAD32: invalid instruction encoding +0x7E 0x1B # GOOD64: c.slli s6, 63 +0x86 0x0B # GOOD: c.slli s7, 1 +0x8A 0x0B # GOOD: c.slli s7, 2 +0x8E 0x0B # GOOD: c.slli s7, 3 +0x92 0x0B # GOOD: c.slli s7, 4 +0x96 0x0B # GOOD: c.slli s7, 5 +0x9A 0x0B # GOOD: c.slli s7, 6 +0x9E 0x0B # GOOD: c.slli s7, 7 +0xA2 0x0B # 
GOOD: c.slli s7, 8 +0xA6 0x0B # GOOD: c.slli s7, 9 +0xAA 0x0B # GOOD: c.slli s7, 10 +0xAE 0x0B # GOOD: c.slli s7, 11 +0xB2 0x0B # GOOD: c.slli s7, 12 +0xB6 0x0B # GOOD: c.slli s7, 13 +0xBA 0x0B # GOOD: c.slli s7, 14 +0xBE 0x0B # GOOD: c.slli s7, 15 +0xC2 0x0B # GOOD: c.slli s7, 16 +0xC6 0x0B # GOOD: c.slli s7, 17 +0xCA 0x0B # GOOD: c.slli s7, 18 +0xCE 0x0B # GOOD: c.slli s7, 19 +0xD2 0x0B # GOOD: c.slli s7, 20 +0xD6 0x0B # GOOD: c.slli s7, 21 +0xDA 0x0B # GOOD: c.slli s7, 22 +0xDE 0x0B # GOOD: c.slli s7, 23 +0xE2 0x0B # GOOD: c.slli s7, 24 +0xE6 0x0B # GOOD: c.slli s7, 25 +0xEA 0x0B # GOOD: c.slli s7, 26 +0xEE 0x0B # GOOD: c.slli s7, 27 +0xF2 0x0B # GOOD: c.slli s7, 28 +0xF6 0x0B # GOOD: c.slli s7, 29 +0xFA 0x0B # GOOD: c.slli s7, 30 +0xFE 0x0B # GOOD: c.slli s7, 31 +0x82 0x1B # BAD32: invalid instruction encoding +0x82 0x1B # GOOD64: c.slli s7, 32 +0x86 0x1B # BAD32: invalid instruction encoding +0x86 0x1B # GOOD64: c.slli s7, 33 +0x8A 0x1B # BAD32: invalid instruction encoding +0x8A 0x1B # GOOD64: c.slli s7, 34 +0x8E 0x1B # BAD32: invalid instruction encoding +0x8E 0x1B # GOOD64: c.slli s7, 35 +0x92 0x1B # BAD32: invalid instruction encoding +0x92 0x1B # GOOD64: c.slli s7, 36 +0x96 0x1B # BAD32: invalid instruction encoding +0x96 0x1B # GOOD64: c.slli s7, 37 +0x9A 0x1B # BAD32: invalid instruction encoding +0x9A 0x1B # GOOD64: c.slli s7, 38 +0x9E 0x1B # BAD32: invalid instruction encoding +0x9E 0x1B # GOOD64: c.slli s7, 39 +0xA2 0x1B # BAD32: invalid instruction encoding +0xA2 0x1B # GOOD64: c.slli s7, 40 +0xA6 0x1B # BAD32: invalid instruction encoding +0xA6 0x1B # GOOD64: c.slli s7, 41 +0xAA 0x1B # BAD32: invalid instruction encoding +0xAA 0x1B # GOOD64: c.slli s7, 42 +0xAE 0x1B # BAD32: invalid instruction encoding +0xAE 0x1B # GOOD64: c.slli s7, 43 +0xB2 0x1B # BAD32: invalid instruction encoding +0xB2 0x1B # GOOD64: c.slli s7, 44 +0xB6 0x1B # BAD32: invalid instruction encoding +0xB6 0x1B # GOOD64: c.slli s7, 45 +0xBA 0x1B # BAD32: invalid instruction encoding +0xBA 0x1B # GOOD64: c.slli s7, 46 +0xBE 0x1B # BAD32: invalid instruction encoding +0xBE 0x1B # GOOD64: c.slli s7, 47 +0xC2 0x1B # BAD32: invalid instruction encoding +0xC2 0x1B # GOOD64: c.slli s7, 48 +0xC6 0x1B # BAD32: invalid instruction encoding +0xC6 0x1B # GOOD64: c.slli s7, 49 +0xCA 0x1B # BAD32: invalid instruction encoding +0xCA 0x1B # GOOD64: c.slli s7, 50 +0xCE 0x1B # BAD32: invalid instruction encoding +0xCE 0x1B # GOOD64: c.slli s7, 51 +0xD2 0x1B # BAD32: invalid instruction encoding +0xD2 0x1B # GOOD64: c.slli s7, 52 +0xD6 0x1B # BAD32: invalid instruction encoding +0xD6 0x1B # GOOD64: c.slli s7, 53 +0xDA 0x1B # BAD32: invalid instruction encoding +0xDA 0x1B # GOOD64: c.slli s7, 54 +0xDE 0x1B # BAD32: invalid instruction encoding +0xDE 0x1B # GOOD64: c.slli s7, 55 +0xE2 0x1B # BAD32: invalid instruction encoding +0xE2 0x1B # GOOD64: c.slli s7, 56 +0xE6 0x1B # BAD32: invalid instruction encoding +0xE6 0x1B # GOOD64: c.slli s7, 57 +0xEA 0x1B # BAD32: invalid instruction encoding +0xEA 0x1B # GOOD64: c.slli s7, 58 +0xEE 0x1B # BAD32: invalid instruction encoding +0xEE 0x1B # GOOD64: c.slli s7, 59 +0xF2 0x1B # BAD32: invalid instruction encoding +0xF2 0x1B # GOOD64: c.slli s7, 60 +0xF6 0x1B # BAD32: invalid instruction encoding +0xF6 0x1B # GOOD64: c.slli s7, 61 +0xFA 0x1B # BAD32: invalid instruction encoding +0xFA 0x1B # GOOD64: c.slli s7, 62 +0xFE 0x1B # BAD32: invalid instruction encoding +0xFE 0x1B # GOOD64: c.slli s7, 63 +0x06 0x0C # GOOD: c.slli s8, 1 +0x0A 0x0C # GOOD: c.slli s8, 2 +0x0E 0x0C # GOOD: c.slli 
s8, 3 +0x12 0x0C # GOOD: c.slli s8, 4 +0x16 0x0C # GOOD: c.slli s8, 5 +0x1A 0x0C # GOOD: c.slli s8, 6 +0x1E 0x0C # GOOD: c.slli s8, 7 +0x22 0x0C # GOOD: c.slli s8, 8 +0x26 0x0C # GOOD: c.slli s8, 9 +0x2A 0x0C # GOOD: c.slli s8, 10 +0x2E 0x0C # GOOD: c.slli s8, 11 +0x32 0x0C # GOOD: c.slli s8, 12 +0x36 0x0C # GOOD: c.slli s8, 13 +0x3A 0x0C # GOOD: c.slli s8, 14 +0x3E 0x0C # GOOD: c.slli s8, 15 +0x42 0x0C # GOOD: c.slli s8, 16 +0x46 0x0C # GOOD: c.slli s8, 17 +0x4A 0x0C # GOOD: c.slli s8, 18 +0x4E 0x0C # GOOD: c.slli s8, 19 +0x52 0x0C # GOOD: c.slli s8, 20 +0x56 0x0C # GOOD: c.slli s8, 21 +0x5A 0x0C # GOOD: c.slli s8, 22 +0x5E 0x0C # GOOD: c.slli s8, 23 +0x62 0x0C # GOOD: c.slli s8, 24 +0x66 0x0C # GOOD: c.slli s8, 25 +0x6A 0x0C # GOOD: c.slli s8, 26 +0x6E 0x0C # GOOD: c.slli s8, 27 +0x72 0x0C # GOOD: c.slli s8, 28 +0x76 0x0C # GOOD: c.slli s8, 29 +0x7A 0x0C # GOOD: c.slli s8, 30 +0x7E 0x0C # GOOD: c.slli s8, 31 +0x02 0x1C # BAD32: invalid instruction encoding +0x02 0x1C # GOOD64: c.slli s8, 32 +0x06 0x1C # BAD32: invalid instruction encoding +0x06 0x1C # GOOD64: c.slli s8, 33 +0x0A 0x1C # BAD32: invalid instruction encoding +0x0A 0x1C # GOOD64: c.slli s8, 34 +0x0E 0x1C # BAD32: invalid instruction encoding +0x0E 0x1C # GOOD64: c.slli s8, 35 +0x12 0x1C # BAD32: invalid instruction encoding +0x12 0x1C # GOOD64: c.slli s8, 36 +0x16 0x1C # BAD32: invalid instruction encoding +0x16 0x1C # GOOD64: c.slli s8, 37 +0x1A 0x1C # BAD32: invalid instruction encoding +0x1A 0x1C # GOOD64: c.slli s8, 38 +0x1E 0x1C # BAD32: invalid instruction encoding +0x1E 0x1C # GOOD64: c.slli s8, 39 +0x22 0x1C # BAD32: invalid instruction encoding +0x22 0x1C # GOOD64: c.slli s8, 40 +0x26 0x1C # BAD32: invalid instruction encoding +0x26 0x1C # GOOD64: c.slli s8, 41 +0x2A 0x1C # BAD32: invalid instruction encoding +0x2A 0x1C # GOOD64: c.slli s8, 42 +0x2E 0x1C # BAD32: invalid instruction encoding +0x2E 0x1C # GOOD64: c.slli s8, 43 +0x32 0x1C # BAD32: invalid instruction encoding +0x32 0x1C # GOOD64: c.slli s8, 44 +0x36 0x1C # BAD32: invalid instruction encoding +0x36 0x1C # GOOD64: c.slli s8, 45 +0x3A 0x1C # BAD32: invalid instruction encoding +0x3A 0x1C # GOOD64: c.slli s8, 46 +0x3E 0x1C # BAD32: invalid instruction encoding +0x3E 0x1C # GOOD64: c.slli s8, 47 +0x42 0x1C # BAD32: invalid instruction encoding +0x42 0x1C # GOOD64: c.slli s8, 48 +0x46 0x1C # BAD32: invalid instruction encoding +0x46 0x1C # GOOD64: c.slli s8, 49 +0x4A 0x1C # BAD32: invalid instruction encoding +0x4A 0x1C # GOOD64: c.slli s8, 50 +0x4E 0x1C # BAD32: invalid instruction encoding +0x4E 0x1C # GOOD64: c.slli s8, 51 +0x52 0x1C # BAD32: invalid instruction encoding +0x52 0x1C # GOOD64: c.slli s8, 52 +0x56 0x1C # BAD32: invalid instruction encoding +0x56 0x1C # GOOD64: c.slli s8, 53 +0x5A 0x1C # BAD32: invalid instruction encoding +0x5A 0x1C # GOOD64: c.slli s8, 54 +0x5E 0x1C # BAD32: invalid instruction encoding +0x5E 0x1C # GOOD64: c.slli s8, 55 +0x62 0x1C # BAD32: invalid instruction encoding +0x62 0x1C # GOOD64: c.slli s8, 56 +0x66 0x1C # BAD32: invalid instruction encoding +0x66 0x1C # GOOD64: c.slli s8, 57 +0x6A 0x1C # BAD32: invalid instruction encoding +0x6A 0x1C # GOOD64: c.slli s8, 58 +0x6E 0x1C # BAD32: invalid instruction encoding +0x6E 0x1C # GOOD64: c.slli s8, 59 +0x72 0x1C # BAD32: invalid instruction encoding +0x72 0x1C # GOOD64: c.slli s8, 60 +0x76 0x1C # BAD32: invalid instruction encoding +0x76 0x1C # GOOD64: c.slli s8, 61 +0x7A 0x1C # BAD32: invalid instruction encoding +0x7A 0x1C # GOOD64: c.slli s8, 62 +0x7E 0x1C # BAD32: 
invalid instruction encoding +0x7E 0x1C # GOOD64: c.slli s8, 63 +0x86 0x0C # GOOD: c.slli s9, 1 +0x8A 0x0C # GOOD: c.slli s9, 2 +0x8E 0x0C # GOOD: c.slli s9, 3 +0x92 0x0C # GOOD: c.slli s9, 4 +0x96 0x0C # GOOD: c.slli s9, 5 +0x9A 0x0C # GOOD: c.slli s9, 6 +0x9E 0x0C # GOOD: c.slli s9, 7 +0xA2 0x0C # GOOD: c.slli s9, 8 +0xA6 0x0C # GOOD: c.slli s9, 9 +0xAA 0x0C # GOOD: c.slli s9, 10 +0xAE 0x0C # GOOD: c.slli s9, 11 +0xB2 0x0C # GOOD: c.slli s9, 12 +0xB6 0x0C # GOOD: c.slli s9, 13 +0xBA 0x0C # GOOD: c.slli s9, 14 +0xBE 0x0C # GOOD: c.slli s9, 15 +0xC2 0x0C # GOOD: c.slli s9, 16 +0xC6 0x0C # GOOD: c.slli s9, 17 +0xCA 0x0C # GOOD: c.slli s9, 18 +0xCE 0x0C # GOOD: c.slli s9, 19 +0xD2 0x0C # GOOD: c.slli s9, 20 +0xD6 0x0C # GOOD: c.slli s9, 21 +0xDA 0x0C # GOOD: c.slli s9, 22 +0xDE 0x0C # GOOD: c.slli s9, 23 +0xE2 0x0C # GOOD: c.slli s9, 24 +0xE6 0x0C # GOOD: c.slli s9, 25 +0xEA 0x0C # GOOD: c.slli s9, 26 +0xEE 0x0C # GOOD: c.slli s9, 27 +0xF2 0x0C # GOOD: c.slli s9, 28 +0xF6 0x0C # GOOD: c.slli s9, 29 +0xFA 0x0C # GOOD: c.slli s9, 30 +0xFE 0x0C # GOOD: c.slli s9, 31 +0x82 0x1C # BAD32: invalid instruction encoding +0x82 0x1C # GOOD64: c.slli s9, 32 +0x86 0x1C # BAD32: invalid instruction encoding +0x86 0x1C # GOOD64: c.slli s9, 33 +0x8A 0x1C # BAD32: invalid instruction encoding +0x8A 0x1C # GOOD64: c.slli s9, 34 +0x8E 0x1C # BAD32: invalid instruction encoding +0x8E 0x1C # GOOD64: c.slli s9, 35 +0x92 0x1C # BAD32: invalid instruction encoding +0x92 0x1C # GOOD64: c.slli s9, 36 +0x96 0x1C # BAD32: invalid instruction encoding +0x96 0x1C # GOOD64: c.slli s9, 37 +0x9A 0x1C # BAD32: invalid instruction encoding +0x9A 0x1C # GOOD64: c.slli s9, 38 +0x9E 0x1C # BAD32: invalid instruction encoding +0x9E 0x1C # GOOD64: c.slli s9, 39 +0xA2 0x1C # BAD32: invalid instruction encoding +0xA2 0x1C # GOOD64: c.slli s9, 40 +0xA6 0x1C # BAD32: invalid instruction encoding +0xA6 0x1C # GOOD64: c.slli s9, 41 +0xAA 0x1C # BAD32: invalid instruction encoding +0xAA 0x1C # GOOD64: c.slli s9, 42 +0xAE 0x1C # BAD32: invalid instruction encoding +0xAE 0x1C # GOOD64: c.slli s9, 43 +0xB2 0x1C # BAD32: invalid instruction encoding +0xB2 0x1C # GOOD64: c.slli s9, 44 +0xB6 0x1C # BAD32: invalid instruction encoding +0xB6 0x1C # GOOD64: c.slli s9, 45 +0xBA 0x1C # BAD32: invalid instruction encoding +0xBA 0x1C # GOOD64: c.slli s9, 46 +0xBE 0x1C # BAD32: invalid instruction encoding +0xBE 0x1C # GOOD64: c.slli s9, 47 +0xC2 0x1C # BAD32: invalid instruction encoding +0xC2 0x1C # GOOD64: c.slli s9, 48 +0xC6 0x1C # BAD32: invalid instruction encoding +0xC6 0x1C # GOOD64: c.slli s9, 49 +0xCA 0x1C # BAD32: invalid instruction encoding +0xCA 0x1C # GOOD64: c.slli s9, 50 +0xCE 0x1C # BAD32: invalid instruction encoding +0xCE 0x1C # GOOD64: c.slli s9, 51 +0xD2 0x1C # BAD32: invalid instruction encoding +0xD2 0x1C # GOOD64: c.slli s9, 52 +0xD6 0x1C # BAD32: invalid instruction encoding +0xD6 0x1C # GOOD64: c.slli s9, 53 +0xDA 0x1C # BAD32: invalid instruction encoding +0xDA 0x1C # GOOD64: c.slli s9, 54 +0xDE 0x1C # BAD32: invalid instruction encoding +0xDE 0x1C # GOOD64: c.slli s9, 55 +0xE2 0x1C # BAD32: invalid instruction encoding +0xE2 0x1C # GOOD64: c.slli s9, 56 +0xE6 0x1C # BAD32: invalid instruction encoding +0xE6 0x1C # GOOD64: c.slli s9, 57 +0xEA 0x1C # BAD32: invalid instruction encoding +0xEA 0x1C # GOOD64: c.slli s9, 58 +0xEE 0x1C # BAD32: invalid instruction encoding +0xEE 0x1C # GOOD64: c.slli s9, 59 +0xF2 0x1C # BAD32: invalid instruction encoding +0xF2 0x1C # GOOD64: c.slli s9, 60 +0xF6 0x1C # BAD32: invalid instruction 
encoding +0xF6 0x1C # GOOD64: c.slli s9, 61 +0xFA 0x1C # BAD32: invalid instruction encoding +0xFA 0x1C # GOOD64: c.slli s9, 62 +0xFE 0x1C # BAD32: invalid instruction encoding +0xFE 0x1C # GOOD64: c.slli s9, 63 +0x06 0x0D # GOOD: c.slli s10, 1 +0x0A 0x0D # GOOD: c.slli s10, 2 +0x0E 0x0D # GOOD: c.slli s10, 3 +0x12 0x0D # GOOD: c.slli s10, 4 +0x16 0x0D # GOOD: c.slli s10, 5 +0x1A 0x0D # GOOD: c.slli s10, 6 +0x1E 0x0D # GOOD: c.slli s10, 7 +0x22 0x0D # GOOD: c.slli s10, 8 +0x26 0x0D # GOOD: c.slli s10, 9 +0x2A 0x0D # GOOD: c.slli s10, 10 +0x2E 0x0D # GOOD: c.slli s10, 11 +0x32 0x0D # GOOD: c.slli s10, 12 +0x36 0x0D # GOOD: c.slli s10, 13 +0x3A 0x0D # GOOD: c.slli s10, 14 +0x3E 0x0D # GOOD: c.slli s10, 15 +0x42 0x0D # GOOD: c.slli s10, 16 +0x46 0x0D # GOOD: c.slli s10, 17 +0x4A 0x0D # GOOD: c.slli s10, 18 +0x4E 0x0D # GOOD: c.slli s10, 19 +0x52 0x0D # GOOD: c.slli s10, 20 +0x56 0x0D # GOOD: c.slli s10, 21 +0x5A 0x0D # GOOD: c.slli s10, 22 +0x5E 0x0D # GOOD: c.slli s10, 23 +0x62 0x0D # GOOD: c.slli s10, 24 +0x66 0x0D # GOOD: c.slli s10, 25 +0x6A 0x0D # GOOD: c.slli s10, 26 +0x6E 0x0D # GOOD: c.slli s10, 27 +0x72 0x0D # GOOD: c.slli s10, 28 +0x76 0x0D # GOOD: c.slli s10, 29 +0x7A 0x0D # GOOD: c.slli s10, 30 +0x7E 0x0D # GOOD: c.slli s10, 31 +0x02 0x1D # BAD32: invalid instruction encoding +0x02 0x1D # GOOD64: c.slli s10, 32 +0x06 0x1D # BAD32: invalid instruction encoding +0x06 0x1D # GOOD64: c.slli s10, 33 +0x0A 0x1D # BAD32: invalid instruction encoding +0x0A 0x1D # GOOD64: c.slli s10, 34 +0x0E 0x1D # BAD32: invalid instruction encoding +0x0E 0x1D # GOOD64: c.slli s10, 35 +0x12 0x1D # BAD32: invalid instruction encoding +0x12 0x1D # GOOD64: c.slli s10, 36 +0x16 0x1D # BAD32: invalid instruction encoding +0x16 0x1D # GOOD64: c.slli s10, 37 +0x1A 0x1D # BAD32: invalid instruction encoding +0x1A 0x1D # GOOD64: c.slli s10, 38 +0x1E 0x1D # BAD32: invalid instruction encoding +0x1E 0x1D # GOOD64: c.slli s10, 39 +0x22 0x1D # BAD32: invalid instruction encoding +0x22 0x1D # GOOD64: c.slli s10, 40 +0x26 0x1D # BAD32: invalid instruction encoding +0x26 0x1D # GOOD64: c.slli s10, 41 +0x2A 0x1D # BAD32: invalid instruction encoding +0x2A 0x1D # GOOD64: c.slli s10, 42 +0x2E 0x1D # BAD32: invalid instruction encoding +0x2E 0x1D # GOOD64: c.slli s10, 43 +0x32 0x1D # BAD32: invalid instruction encoding +0x32 0x1D # GOOD64: c.slli s10, 44 +0x36 0x1D # BAD32: invalid instruction encoding +0x36 0x1D # GOOD64: c.slli s10, 45 +0x3A 0x1D # BAD32: invalid instruction encoding +0x3A 0x1D # GOOD64: c.slli s10, 46 +0x3E 0x1D # BAD32: invalid instruction encoding +0x3E 0x1D # GOOD64: c.slli s10, 47 +0x42 0x1D # BAD32: invalid instruction encoding +0x42 0x1D # GOOD64: c.slli s10, 48 +0x46 0x1D # BAD32: invalid instruction encoding +0x46 0x1D # GOOD64: c.slli s10, 49 +0x4A 0x1D # BAD32: invalid instruction encoding +0x4A 0x1D # GOOD64: c.slli s10, 50 +0x4E 0x1D # BAD32: invalid instruction encoding +0x4E 0x1D # GOOD64: c.slli s10, 51 +0x52 0x1D # BAD32: invalid instruction encoding +0x52 0x1D # GOOD64: c.slli s10, 52 +0x56 0x1D # BAD32: invalid instruction encoding +0x56 0x1D # GOOD64: c.slli s10, 53 +0x5A 0x1D # BAD32: invalid instruction encoding +0x5A 0x1D # GOOD64: c.slli s10, 54 +0x5E 0x1D # BAD32: invalid instruction encoding +0x5E 0x1D # GOOD64: c.slli s10, 55 +0x62 0x1D # BAD32: invalid instruction encoding +0x62 0x1D # GOOD64: c.slli s10, 56 +0x66 0x1D # BAD32: invalid instruction encoding +0x66 0x1D # GOOD64: c.slli s10, 57 +0x6A 0x1D # BAD32: invalid instruction encoding +0x6A 0x1D # GOOD64: c.slli s10, 58 
+0x6E 0x1D # BAD32: invalid instruction encoding +0x6E 0x1D # GOOD64: c.slli s10, 59 +0x72 0x1D # BAD32: invalid instruction encoding +0x72 0x1D # GOOD64: c.slli s10, 60 +0x76 0x1D # BAD32: invalid instruction encoding +0x76 0x1D # GOOD64: c.slli s10, 61 +0x7A 0x1D # BAD32: invalid instruction encoding +0x7A 0x1D # GOOD64: c.slli s10, 62 +0x7E 0x1D # BAD32: invalid instruction encoding +0x7E 0x1D # GOOD64: c.slli s10, 63 +0x86 0x0D # GOOD: c.slli s11, 1 +0x8A 0x0D # GOOD: c.slli s11, 2 +0x8E 0x0D # GOOD: c.slli s11, 3 +0x92 0x0D # GOOD: c.slli s11, 4 +0x96 0x0D # GOOD: c.slli s11, 5 +0x9A 0x0D # GOOD: c.slli s11, 6 +0x9E 0x0D # GOOD: c.slli s11, 7 +0xA2 0x0D # GOOD: c.slli s11, 8 +0xA6 0x0D # GOOD: c.slli s11, 9 +0xAA 0x0D # GOOD: c.slli s11, 10 +0xAE 0x0D # GOOD: c.slli s11, 11 +0xB2 0x0D # GOOD: c.slli s11, 12 +0xB6 0x0D # GOOD: c.slli s11, 13 +0xBA 0x0D # GOOD: c.slli s11, 14 +0xBE 0x0D # GOOD: c.slli s11, 15 +0xC2 0x0D # GOOD: c.slli s11, 16 +0xC6 0x0D # GOOD: c.slli s11, 17 +0xCA 0x0D # GOOD: c.slli s11, 18 +0xCE 0x0D # GOOD: c.slli s11, 19 +0xD2 0x0D # GOOD: c.slli s11, 20 +0xD6 0x0D # GOOD: c.slli s11, 21 +0xDA 0x0D # GOOD: c.slli s11, 22 +0xDE 0x0D # GOOD: c.slli s11, 23 +0xE2 0x0D # GOOD: c.slli s11, 24 +0xE6 0x0D # GOOD: c.slli s11, 25 +0xEA 0x0D # GOOD: c.slli s11, 26 +0xEE 0x0D # GOOD: c.slli s11, 27 +0xF2 0x0D # GOOD: c.slli s11, 28 +0xF6 0x0D # GOOD: c.slli s11, 29 +0xFA 0x0D # GOOD: c.slli s11, 30 +0xFE 0x0D # GOOD: c.slli s11, 31 +0x82 0x1D # BAD32: invalid instruction encoding +0x82 0x1D # GOOD64: c.slli s11, 32 +0x86 0x1D # BAD32: invalid instruction encoding +0x86 0x1D # GOOD64: c.slli s11, 33 +0x8A 0x1D # BAD32: invalid instruction encoding +0x8A 0x1D # GOOD64: c.slli s11, 34 +0x8E 0x1D # BAD32: invalid instruction encoding +0x8E 0x1D # GOOD64: c.slli s11, 35 +0x92 0x1D # BAD32: invalid instruction encoding +0x92 0x1D # GOOD64: c.slli s11, 36 +0x96 0x1D # BAD32: invalid instruction encoding +0x96 0x1D # GOOD64: c.slli s11, 37 +0x9A 0x1D # BAD32: invalid instruction encoding +0x9A 0x1D # GOOD64: c.slli s11, 38 +0x9E 0x1D # BAD32: invalid instruction encoding +0x9E 0x1D # GOOD64: c.slli s11, 39 +0xA2 0x1D # BAD32: invalid instruction encoding +0xA2 0x1D # GOOD64: c.slli s11, 40 +0xA6 0x1D # BAD32: invalid instruction encoding +0xA6 0x1D # GOOD64: c.slli s11, 41 +0xAA 0x1D # BAD32: invalid instruction encoding +0xAA 0x1D # GOOD64: c.slli s11, 42 +0xAE 0x1D # BAD32: invalid instruction encoding +0xAE 0x1D # GOOD64: c.slli s11, 43 +0xB2 0x1D # BAD32: invalid instruction encoding +0xB2 0x1D # GOOD64: c.slli s11, 44 +0xB6 0x1D # BAD32: invalid instruction encoding +0xB6 0x1D # GOOD64: c.slli s11, 45 +0xBA 0x1D # BAD32: invalid instruction encoding +0xBA 0x1D # GOOD64: c.slli s11, 46 +0xBE 0x1D # BAD32: invalid instruction encoding +0xBE 0x1D # GOOD64: c.slli s11, 47 +0xC2 0x1D # BAD32: invalid instruction encoding +0xC2 0x1D # GOOD64: c.slli s11, 48 +0xC6 0x1D # BAD32: invalid instruction encoding +0xC6 0x1D # GOOD64: c.slli s11, 49 +0xCA 0x1D # BAD32: invalid instruction encoding +0xCA 0x1D # GOOD64: c.slli s11, 50 +0xCE 0x1D # BAD32: invalid instruction encoding +0xCE 0x1D # GOOD64: c.slli s11, 51 +0xD2 0x1D # BAD32: invalid instruction encoding +0xD2 0x1D # GOOD64: c.slli s11, 52 +0xD6 0x1D # BAD32: invalid instruction encoding +0xD6 0x1D # GOOD64: c.slli s11, 53 +0xDA 0x1D # BAD32: invalid instruction encoding +0xDA 0x1D # GOOD64: c.slli s11, 54 +0xDE 0x1D # BAD32: invalid instruction encoding +0xDE 0x1D # GOOD64: c.slli s11, 55 +0xE2 0x1D # BAD32: invalid instruction 
encoding +0xE2 0x1D # GOOD64: c.slli s11, 56 +0xE6 0x1D # BAD32: invalid instruction encoding +0xE6 0x1D # GOOD64: c.slli s11, 57 +0xEA 0x1D # BAD32: invalid instruction encoding +0xEA 0x1D # GOOD64: c.slli s11, 58 +0xEE 0x1D # BAD32: invalid instruction encoding +0xEE 0x1D # GOOD64: c.slli s11, 59 +0xF2 0x1D # BAD32: invalid instruction encoding +0xF2 0x1D # GOOD64: c.slli s11, 60 +0xF6 0x1D # BAD32: invalid instruction encoding +0xF6 0x1D # GOOD64: c.slli s11, 61 +0xFA 0x1D # BAD32: invalid instruction encoding +0xFA 0x1D # GOOD64: c.slli s11, 62 +0xFE 0x1D # BAD32: invalid instruction encoding +0xFE 0x1D # GOOD64: c.slli s11, 63 +0x06 0x0E # GOOD: c.slli t3, 1 +0x0A 0x0E # GOOD: c.slli t3, 2 +0x0E 0x0E # GOOD: c.slli t3, 3 +0x12 0x0E # GOOD: c.slli t3, 4 +0x16 0x0E # GOOD: c.slli t3, 5 +0x1A 0x0E # GOOD: c.slli t3, 6 +0x1E 0x0E # GOOD: c.slli t3, 7 +0x22 0x0E # GOOD: c.slli t3, 8 +0x26 0x0E # GOOD: c.slli t3, 9 +0x2A 0x0E # GOOD: c.slli t3, 10 +0x2E 0x0E # GOOD: c.slli t3, 11 +0x32 0x0E # GOOD: c.slli t3, 12 +0x36 0x0E # GOOD: c.slli t3, 13 +0x3A 0x0E # GOOD: c.slli t3, 14 +0x3E 0x0E # GOOD: c.slli t3, 15 +0x42 0x0E # GOOD: c.slli t3, 16 +0x46 0x0E # GOOD: c.slli t3, 17 +0x4A 0x0E # GOOD: c.slli t3, 18 +0x4E 0x0E # GOOD: c.slli t3, 19 +0x52 0x0E # GOOD: c.slli t3, 20 +0x56 0x0E # GOOD: c.slli t3, 21 +0x5A 0x0E # GOOD: c.slli t3, 22 +0x5E 0x0E # GOOD: c.slli t3, 23 +0x62 0x0E # GOOD: c.slli t3, 24 +0x66 0x0E # GOOD: c.slli t3, 25 +0x6A 0x0E # GOOD: c.slli t3, 26 +0x6E 0x0E # GOOD: c.slli t3, 27 +0x72 0x0E # GOOD: c.slli t3, 28 +0x76 0x0E # GOOD: c.slli t3, 29 +0x7A 0x0E # GOOD: c.slli t3, 30 +0x7E 0x0E # GOOD: c.slli t3, 31 +0x02 0x1E # BAD32: invalid instruction encoding +0x02 0x1E # GOOD64: c.slli t3, 32 +0x06 0x1E # BAD32: invalid instruction encoding +0x06 0x1E # GOOD64: c.slli t3, 33 +0x0A 0x1E # BAD32: invalid instruction encoding +0x0A 0x1E # GOOD64: c.slli t3, 34 +0x0E 0x1E # BAD32: invalid instruction encoding +0x0E 0x1E # GOOD64: c.slli t3, 35 +0x12 0x1E # BAD32: invalid instruction encoding +0x12 0x1E # GOOD64: c.slli t3, 36 +0x16 0x1E # BAD32: invalid instruction encoding +0x16 0x1E # GOOD64: c.slli t3, 37 +0x1A 0x1E # BAD32: invalid instruction encoding +0x1A 0x1E # GOOD64: c.slli t3, 38 +0x1E 0x1E # BAD32: invalid instruction encoding +0x1E 0x1E # GOOD64: c.slli t3, 39 +0x22 0x1E # BAD32: invalid instruction encoding +0x22 0x1E # GOOD64: c.slli t3, 40 +0x26 0x1E # BAD32: invalid instruction encoding +0x26 0x1E # GOOD64: c.slli t3, 41 +0x2A 0x1E # BAD32: invalid instruction encoding +0x2A 0x1E # GOOD64: c.slli t3, 42 +0x2E 0x1E # BAD32: invalid instruction encoding +0x2E 0x1E # GOOD64: c.slli t3, 43 +0x32 0x1E # BAD32: invalid instruction encoding +0x32 0x1E # GOOD64: c.slli t3, 44 +0x36 0x1E # BAD32: invalid instruction encoding +0x36 0x1E # GOOD64: c.slli t3, 45 +0x3A 0x1E # BAD32: invalid instruction encoding +0x3A 0x1E # GOOD64: c.slli t3, 46 +0x3E 0x1E # BAD32: invalid instruction encoding +0x3E 0x1E # GOOD64: c.slli t3, 47 +0x42 0x1E # BAD32: invalid instruction encoding +0x42 0x1E # GOOD64: c.slli t3, 48 +0x46 0x1E # BAD32: invalid instruction encoding +0x46 0x1E # GOOD64: c.slli t3, 49 +0x4A 0x1E # BAD32: invalid instruction encoding +0x4A 0x1E # GOOD64: c.slli t3, 50 +0x4E 0x1E # BAD32: invalid instruction encoding +0x4E 0x1E # GOOD64: c.slli t3, 51 +0x52 0x1E # BAD32: invalid instruction encoding +0x52 0x1E # GOOD64: c.slli t3, 52 +0x56 0x1E # BAD32: invalid instruction encoding +0x56 0x1E # GOOD64: c.slli t3, 53 +0x5A 0x1E # BAD32: invalid instruction encoding 
+0x5A 0x1E # GOOD64: c.slli t3, 54 +0x5E 0x1E # BAD32: invalid instruction encoding +0x5E 0x1E # GOOD64: c.slli t3, 55 +0x62 0x1E # BAD32: invalid instruction encoding +0x62 0x1E # GOOD64: c.slli t3, 56 +0x66 0x1E # BAD32: invalid instruction encoding +0x66 0x1E # GOOD64: c.slli t3, 57 +0x6A 0x1E # BAD32: invalid instruction encoding +0x6A 0x1E # GOOD64: c.slli t3, 58 +0x6E 0x1E # BAD32: invalid instruction encoding +0x6E 0x1E # GOOD64: c.slli t3, 59 +0x72 0x1E # BAD32: invalid instruction encoding +0x72 0x1E # GOOD64: c.slli t3, 60 +0x76 0x1E # BAD32: invalid instruction encoding +0x76 0x1E # GOOD64: c.slli t3, 61 +0x7A 0x1E # BAD32: invalid instruction encoding +0x7A 0x1E # GOOD64: c.slli t3, 62 +0x7E 0x1E # BAD32: invalid instruction encoding +0x7E 0x1E # GOOD64: c.slli t3, 63 +0x86 0x0E # GOOD: c.slli t4, 1 +0x8A 0x0E # GOOD: c.slli t4, 2 +0x8E 0x0E # GOOD: c.slli t4, 3 +0x92 0x0E # GOOD: c.slli t4, 4 +0x96 0x0E # GOOD: c.slli t4, 5 +0x9A 0x0E # GOOD: c.slli t4, 6 +0x9E 0x0E # GOOD: c.slli t4, 7 +0xA2 0x0E # GOOD: c.slli t4, 8 +0xA6 0x0E # GOOD: c.slli t4, 9 +0xAA 0x0E # GOOD: c.slli t4, 10 +0xAE 0x0E # GOOD: c.slli t4, 11 +0xB2 0x0E # GOOD: c.slli t4, 12 +0xB6 0x0E # GOOD: c.slli t4, 13 +0xBA 0x0E # GOOD: c.slli t4, 14 +0xBE 0x0E # GOOD: c.slli t4, 15 +0xC2 0x0E # GOOD: c.slli t4, 16 +0xC6 0x0E # GOOD: c.slli t4, 17 +0xCA 0x0E # GOOD: c.slli t4, 18 +0xCE 0x0E # GOOD: c.slli t4, 19 +0xD2 0x0E # GOOD: c.slli t4, 20 +0xD6 0x0E # GOOD: c.slli t4, 21 +0xDA 0x0E # GOOD: c.slli t4, 22 +0xDE 0x0E # GOOD: c.slli t4, 23 +0xE2 0x0E # GOOD: c.slli t4, 24 +0xE6 0x0E # GOOD: c.slli t4, 25 +0xEA 0x0E # GOOD: c.slli t4, 26 +0xEE 0x0E # GOOD: c.slli t4, 27 +0xF2 0x0E # GOOD: c.slli t4, 28 +0xF6 0x0E # GOOD: c.slli t4, 29 +0xFA 0x0E # GOOD: c.slli t4, 30 +0xFE 0x0E # GOOD: c.slli t4, 31 +0x82 0x1E # BAD32: invalid instruction encoding +0x82 0x1E # GOOD64: c.slli t4, 32 +0x86 0x1E # BAD32: invalid instruction encoding +0x86 0x1E # GOOD64: c.slli t4, 33 +0x8A 0x1E # BAD32: invalid instruction encoding +0x8A 0x1E # GOOD64: c.slli t4, 34 +0x8E 0x1E # BAD32: invalid instruction encoding +0x8E 0x1E # GOOD64: c.slli t4, 35 +0x92 0x1E # BAD32: invalid instruction encoding +0x92 0x1E # GOOD64: c.slli t4, 36 +0x96 0x1E # BAD32: invalid instruction encoding +0x96 0x1E # GOOD64: c.slli t4, 37 +0x9A 0x1E # BAD32: invalid instruction encoding +0x9A 0x1E # GOOD64: c.slli t4, 38 +0x9E 0x1E # BAD32: invalid instruction encoding +0x9E 0x1E # GOOD64: c.slli t4, 39 +0xA2 0x1E # BAD32: invalid instruction encoding +0xA2 0x1E # GOOD64: c.slli t4, 40 +0xA6 0x1E # BAD32: invalid instruction encoding +0xA6 0x1E # GOOD64: c.slli t4, 41 +0xAA 0x1E # BAD32: invalid instruction encoding +0xAA 0x1E # GOOD64: c.slli t4, 42 +0xAE 0x1E # BAD32: invalid instruction encoding +0xAE 0x1E # GOOD64: c.slli t4, 43 +0xB2 0x1E # BAD32: invalid instruction encoding +0xB2 0x1E # GOOD64: c.slli t4, 44 +0xB6 0x1E # BAD32: invalid instruction encoding +0xB6 0x1E # GOOD64: c.slli t4, 45 +0xBA 0x1E # BAD32: invalid instruction encoding +0xBA 0x1E # GOOD64: c.slli t4, 46 +0xBE 0x1E # BAD32: invalid instruction encoding +0xBE 0x1E # GOOD64: c.slli t4, 47 +0xC2 0x1E # BAD32: invalid instruction encoding +0xC2 0x1E # GOOD64: c.slli t4, 48 +0xC6 0x1E # BAD32: invalid instruction encoding +0xC6 0x1E # GOOD64: c.slli t4, 49 +0xCA 0x1E # BAD32: invalid instruction encoding +0xCA 0x1E # GOOD64: c.slli t4, 50 +0xCE 0x1E # BAD32: invalid instruction encoding +0xCE 0x1E # GOOD64: c.slli t4, 51 +0xD2 0x1E # BAD32: invalid instruction encoding +0xD2 0x1E # GOOD64: 
c.slli t4, 52 +0xD6 0x1E # BAD32: invalid instruction encoding +0xD6 0x1E # GOOD64: c.slli t4, 53 +0xDA 0x1E # BAD32: invalid instruction encoding +0xDA 0x1E # GOOD64: c.slli t4, 54 +0xDE 0x1E # BAD32: invalid instruction encoding +0xDE 0x1E # GOOD64: c.slli t4, 55 +0xE2 0x1E # BAD32: invalid instruction encoding +0xE2 0x1E # GOOD64: c.slli t4, 56 +0xE6 0x1E # BAD32: invalid instruction encoding +0xE6 0x1E # GOOD64: c.slli t4, 57 +0xEA 0x1E # BAD32: invalid instruction encoding +0xEA 0x1E # GOOD64: c.slli t4, 58 +0xEE 0x1E # BAD32: invalid instruction encoding +0xEE 0x1E # GOOD64: c.slli t4, 59 +0xF2 0x1E # BAD32: invalid instruction encoding +0xF2 0x1E # GOOD64: c.slli t4, 60 +0xF6 0x1E # BAD32: invalid instruction encoding +0xF6 0x1E # GOOD64: c.slli t4, 61 +0xFA 0x1E # BAD32: invalid instruction encoding +0xFA 0x1E # GOOD64: c.slli t4, 62 +0xFE 0x1E # BAD32: invalid instruction encoding +0xFE 0x1E # GOOD64: c.slli t4, 63 +0x06 0x0F # GOOD: c.slli t5, 1 +0x0A 0x0F # GOOD: c.slli t5, 2 +0x0E 0x0F # GOOD: c.slli t5, 3 +0x12 0x0F # GOOD: c.slli t5, 4 +0x16 0x0F # GOOD: c.slli t5, 5 +0x1A 0x0F # GOOD: c.slli t5, 6 +0x1E 0x0F # GOOD: c.slli t5, 7 +0x22 0x0F # GOOD: c.slli t5, 8 +0x26 0x0F # GOOD: c.slli t5, 9 +0x2A 0x0F # GOOD: c.slli t5, 10 +0x2E 0x0F # GOOD: c.slli t5, 11 +0x32 0x0F # GOOD: c.slli t5, 12 +0x36 0x0F # GOOD: c.slli t5, 13 +0x3A 0x0F # GOOD: c.slli t5, 14 +0x3E 0x0F # GOOD: c.slli t5, 15 +0x42 0x0F # GOOD: c.slli t5, 16 +0x46 0x0F # GOOD: c.slli t5, 17 +0x4A 0x0F # GOOD: c.slli t5, 18 +0x4E 0x0F # GOOD: c.slli t5, 19 +0x52 0x0F # GOOD: c.slli t5, 20 +0x56 0x0F # GOOD: c.slli t5, 21 +0x5A 0x0F # GOOD: c.slli t5, 22 +0x5E 0x0F # GOOD: c.slli t5, 23 +0x62 0x0F # GOOD: c.slli t5, 24 +0x66 0x0F # GOOD: c.slli t5, 25 +0x6A 0x0F # GOOD: c.slli t5, 26 +0x6E 0x0F # GOOD: c.slli t5, 27 +0x72 0x0F # GOOD: c.slli t5, 28 +0x76 0x0F # GOOD: c.slli t5, 29 +0x7A 0x0F # GOOD: c.slli t5, 30 +0x7E 0x0F # GOOD: c.slli t5, 31 +0x02 0x1F # BAD32: invalid instruction encoding +0x02 0x1F # GOOD64: c.slli t5, 32 +0x06 0x1F # BAD32: invalid instruction encoding +0x06 0x1F # GOOD64: c.slli t5, 33 +0x0A 0x1F # BAD32: invalid instruction encoding +0x0A 0x1F # GOOD64: c.slli t5, 34 +0x0E 0x1F # BAD32: invalid instruction encoding +0x0E 0x1F # GOOD64: c.slli t5, 35 +0x12 0x1F # BAD32: invalid instruction encoding +0x12 0x1F # GOOD64: c.slli t5, 36 +0x16 0x1F # BAD32: invalid instruction encoding +0x16 0x1F # GOOD64: c.slli t5, 37 +0x1A 0x1F # BAD32: invalid instruction encoding +0x1A 0x1F # GOOD64: c.slli t5, 38 +0x1E 0x1F # BAD32: invalid instruction encoding +0x1E 0x1F # GOOD64: c.slli t5, 39 +0x22 0x1F # BAD32: invalid instruction encoding +0x22 0x1F # GOOD64: c.slli t5, 40 +0x26 0x1F # BAD32: invalid instruction encoding +0x26 0x1F # GOOD64: c.slli t5, 41 +0x2A 0x1F # BAD32: invalid instruction encoding +0x2A 0x1F # GOOD64: c.slli t5, 42 +0x2E 0x1F # BAD32: invalid instruction encoding +0x2E 0x1F # GOOD64: c.slli t5, 43 +0x32 0x1F # BAD32: invalid instruction encoding +0x32 0x1F # GOOD64: c.slli t5, 44 +0x36 0x1F # BAD32: invalid instruction encoding +0x36 0x1F # GOOD64: c.slli t5, 45 +0x3A 0x1F # BAD32: invalid instruction encoding +0x3A 0x1F # GOOD64: c.slli t5, 46 +0x3E 0x1F # BAD32: invalid instruction encoding +0x3E 0x1F # GOOD64: c.slli t5, 47 +0x42 0x1F # BAD32: invalid instruction encoding +0x42 0x1F # GOOD64: c.slli t5, 48 +0x46 0x1F # BAD32: invalid instruction encoding +0x46 0x1F # GOOD64: c.slli t5, 49 +0x4A 0x1F # BAD32: invalid instruction encoding +0x4A 0x1F # GOOD64: c.slli t5, 50 +0x4E 
0x1F # BAD32: invalid instruction encoding +0x4E 0x1F # GOOD64: c.slli t5, 51 +0x52 0x1F # BAD32: invalid instruction encoding +0x52 0x1F # GOOD64: c.slli t5, 52 +0x56 0x1F # BAD32: invalid instruction encoding +0x56 0x1F # GOOD64: c.slli t5, 53 +0x5A 0x1F # BAD32: invalid instruction encoding +0x5A 0x1F # GOOD64: c.slli t5, 54 +0x5E 0x1F # BAD32: invalid instruction encoding +0x5E 0x1F # GOOD64: c.slli t5, 55 +0x62 0x1F # BAD32: invalid instruction encoding +0x62 0x1F # GOOD64: c.slli t5, 56 +0x66 0x1F # BAD32: invalid instruction encoding +0x66 0x1F # GOOD64: c.slli t5, 57 +0x6A 0x1F # BAD32: invalid instruction encoding +0x6A 0x1F # GOOD64: c.slli t5, 58 +0x6E 0x1F # BAD32: invalid instruction encoding +0x6E 0x1F # GOOD64: c.slli t5, 59 +0x72 0x1F # BAD32: invalid instruction encoding +0x72 0x1F # GOOD64: c.slli t5, 60 +0x76 0x1F # BAD32: invalid instruction encoding +0x76 0x1F # GOOD64: c.slli t5, 61 +0x7A 0x1F # BAD32: invalid instruction encoding +0x7A 0x1F # GOOD64: c.slli t5, 62 +0x7E 0x1F # BAD32: invalid instruction encoding +0x7E 0x1F # GOOD64: c.slli t5, 63 +0x86 0x0F # GOOD: c.slli t6, 1 +0x8A 0x0F # GOOD: c.slli t6, 2 +0x8E 0x0F # GOOD: c.slli t6, 3 +0x92 0x0F # GOOD: c.slli t6, 4 +0x96 0x0F # GOOD: c.slli t6, 5 +0x9A 0x0F # GOOD: c.slli t6, 6 +0x9E 0x0F # GOOD: c.slli t6, 7 +0xA2 0x0F # GOOD: c.slli t6, 8 +0xA6 0x0F # GOOD: c.slli t6, 9 +0xAA 0x0F # GOOD: c.slli t6, 10 +0xAE 0x0F # GOOD: c.slli t6, 11 +0xB2 0x0F # GOOD: c.slli t6, 12 +0xB6 0x0F # GOOD: c.slli t6, 13 +0xBA 0x0F # GOOD: c.slli t6, 14 +0xBE 0x0F # GOOD: c.slli t6, 15 +0xC2 0x0F # GOOD: c.slli t6, 16 +0xC6 0x0F # GOOD: c.slli t6, 17 +0xCA 0x0F # GOOD: c.slli t6, 18 +0xCE 0x0F # GOOD: c.slli t6, 19 +0xD2 0x0F # GOOD: c.slli t6, 20 +0xD6 0x0F # GOOD: c.slli t6, 21 +0xDA 0x0F # GOOD: c.slli t6, 22 +0xDE 0x0F # GOOD: c.slli t6, 23 +0xE2 0x0F # GOOD: c.slli t6, 24 +0xE6 0x0F # GOOD: c.slli t6, 25 +0xEA 0x0F # GOOD: c.slli t6, 26 +0xEE 0x0F # GOOD: c.slli t6, 27 +0xF2 0x0F # GOOD: c.slli t6, 28 +0xF6 0x0F # GOOD: c.slli t6, 29 +0xFA 0x0F # GOOD: c.slli t6, 30 +0xFE 0x0F # GOOD: c.slli t6, 31 +0x82 0x1F # BAD32: invalid instruction encoding +0x82 0x1F # GOOD64: c.slli t6, 32 +0x86 0x1F # BAD32: invalid instruction encoding +0x86 0x1F # GOOD64: c.slli t6, 33 +0x8A 0x1F # BAD32: invalid instruction encoding +0x8A 0x1F # GOOD64: c.slli t6, 34 +0x8E 0x1F # BAD32: invalid instruction encoding +0x8E 0x1F # GOOD64: c.slli t6, 35 +0x92 0x1F # BAD32: invalid instruction encoding +0x92 0x1F # GOOD64: c.slli t6, 36 +0x96 0x1F # BAD32: invalid instruction encoding +0x96 0x1F # GOOD64: c.slli t6, 37 +0x9A 0x1F # BAD32: invalid instruction encoding +0x9A 0x1F # GOOD64: c.slli t6, 38 +0x9E 0x1F # BAD32: invalid instruction encoding +0x9E 0x1F # GOOD64: c.slli t6, 39 +0xA2 0x1F # BAD32: invalid instruction encoding +0xA2 0x1F # GOOD64: c.slli t6, 40 +0xA6 0x1F # BAD32: invalid instruction encoding +0xA6 0x1F # GOOD64: c.slli t6, 41 +0xAA 0x1F # BAD32: invalid instruction encoding +0xAA 0x1F # GOOD64: c.slli t6, 42 +0xAE 0x1F # BAD32: invalid instruction encoding +0xAE 0x1F # GOOD64: c.slli t6, 43 +0xB2 0x1F # BAD32: invalid instruction encoding +0xB2 0x1F # GOOD64: c.slli t6, 44 +0xB6 0x1F # BAD32: invalid instruction encoding +0xB6 0x1F # GOOD64: c.slli t6, 45 +0xBA 0x1F # BAD32: invalid instruction encoding +0xBA 0x1F # GOOD64: c.slli t6, 46 +0xBE 0x1F # BAD32: invalid instruction encoding +0xBE 0x1F # GOOD64: c.slli t6, 47 +0xC2 0x1F # BAD32: invalid instruction encoding +0xC2 0x1F # GOOD64: c.slli t6, 48 +0xC6 0x1F # BAD32: 
invalid instruction encoding +0xC6 0x1F # GOOD64: c.slli t6, 49 +0xCA 0x1F # BAD32: invalid instruction encoding +0xCA 0x1F # GOOD64: c.slli t6, 50 +0xCE 0x1F # BAD32: invalid instruction encoding +0xCE 0x1F # GOOD64: c.slli t6, 51 +0xD2 0x1F # BAD32: invalid instruction encoding +0xD2 0x1F # GOOD64: c.slli t6, 52 +0xD6 0x1F # BAD32: invalid instruction encoding +0xD6 0x1F # GOOD64: c.slli t6, 53 +0xDA 0x1F # BAD32: invalid instruction encoding +0xDA 0x1F # GOOD64: c.slli t6, 54 +0xDE 0x1F # BAD32: invalid instruction encoding +0xDE 0x1F # GOOD64: c.slli t6, 55 +0xE2 0x1F # BAD32: invalid instruction encoding +0xE2 0x1F # GOOD64: c.slli t6, 56 +0xE6 0x1F # BAD32: invalid instruction encoding +0xE6 0x1F # GOOD64: c.slli t6, 57 +0xEA 0x1F # BAD32: invalid instruction encoding +0xEA 0x1F # GOOD64: c.slli t6, 58 +0xEE 0x1F # BAD32: invalid instruction encoding +0xEE 0x1F # GOOD64: c.slli t6, 59 +0xF2 0x1F # BAD32: invalid instruction encoding +0xF2 0x1F # GOOD64: c.slli t6, 60 +0xF6 0x1F # BAD32: invalid instruction encoding +0xF6 0x1F # GOOD64: c.slli t6, 61 +0xFA 0x1F # BAD32: invalid instruction encoding +0xFA 0x1F # GOOD64: c.slli t6, 62 +0xFE 0x1F # BAD32: invalid instruction encoding +0xFE 0x1F # GOOD64: c.slli t6, 63 From 3fd0eaae52503ee2bbdffc75753acc4bcc72fe60 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 31 Mar 2025 16:54:04 +0100 Subject: [PATCH 0106/1029] [libclc][amdgpu] Implement native_exp2 via AMD builtin (#133696) This came up during a discussion on #129679, which has been split out as a preparatory commit. An example of the AMDGPU codegen is: define <2 x float> @_Z10native_expDv2_f(<2 x float> %val) { %mul = fmul afn <2 x float> %val, splat (float 0x3FF7154760000000) %0 = extractelement <2 x float> %mul, i64 0 %1 = tail call float @llvm.amdgcn.exp2.f32(float %0) %vecinit.i = insertelement <2 x float> poison, float %1, i64 0 %2 = extractelement <2 x float> %mul, i64 1 %3 = tail call float @llvm.amdgcn.exp2.f32(float %2) %vecinit2.i = insertelement <2 x float> %vecinit.i, float %3, i64 1 ret <2 x float> %vecinit2.i } define <2 x float> @_Z11native_exp2Dv2_f(<2 x float> %x) { %0 = extractelement <2 x float> %x, i64 0 %1 = tail call float @llvm.amdgcn.exp2.f32(float %0) %vecinit = insertelement <2 x float> poison, float %1, i64 0 %2 = extractelement <2 x float> %x, i64 1 %3 = tail call float @llvm.amdgcn.exp2.f32(float %2) %vecinit2 = insertelement <2 x float> %vecinit, float %3, i64 1 ret <2 x float> %vecinit2 } --- libclc/amdgpu/lib/SOURCES | 1 + libclc/amdgpu/lib/math/native_exp2.cl | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 libclc/amdgpu/lib/math/native_exp2.cl diff --git a/libclc/amdgpu/lib/SOURCES b/libclc/amdgpu/lib/SOURCES index d7782a2ae14dc..ed5e45a37c18d 100644 --- a/libclc/amdgpu/lib/SOURCES +++ b/libclc/amdgpu/lib/SOURCES @@ -1,4 +1,5 @@ math/native_exp.cl +math/native_exp2.cl math/native_log.cl math/native_log10.cl math/half_exp.cl diff --git a/libclc/amdgpu/lib/math/native_exp2.cl b/libclc/amdgpu/lib/math/native_exp2.cl new file mode 100644 index 0000000000000..39ae914b19634 --- /dev/null +++ b/libclc/amdgpu/lib/math/native_exp2.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+
+_CLC_OVERLOAD _CLC_DEF float native_exp2(float val) {
+  return __builtin_amdgcn_exp2f(val);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, native_exp2, float)
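[Editorial note: the fmul by 0x3FF7154760000000 in the native_exp codegen quoted in the commit message above is multiplication by log2(e); native_exp(x) is emitted as exp2(x * log2(e)), so both entry points funnel into the same llvm.amdgcn.exp2 intrinsic. A small standalone C++ check of that identity; the constant name and test values are illustrative only and are not part of the patch:]

```
#include <cmath>
#include <cstdio>
#include <initializer_list>

int main() {
  // log2(e) rounded to float; widened to double this has the bit pattern
  // 0x3FF7154760000000, the constant that appears in the fmul above.
  const float Log2E = 0x1.715476p+0f;
  for (float X : {0.0f, 1.0f, -2.5f, 10.0f})
    std::printf("exp(%g) = %g  exp2(x*log2e) = %g\n", X, std::exp(X),
                std::exp2(X * Log2E));
  return 0;
}
```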
From 87602f6d03ada10d0de7f1440320ffec4eb86790 Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Mon, 31 Mar 2025 16:55:23 +0100
Subject: [PATCH 0107/1029] [libclc] Fix unresolved reference to missing table
 (#133691)

Splitting the 'ln_tbl' into two in db98e292 wasn't done thoroughly
enough: some references to the old table still remained. This commit
fixes the unresolved references by updating them to the new split
tables.
---
 libclc/clc/include/clc/math/tables.h       | 1 -
 libclc/clc/lib/generic/math/clc_log_base.h | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h
index f2118082b1bdb..3120a18cc996e 100644
--- a/libclc/clc/include/clc/math/tables.h
+++ b/libclc/clc/include/clc/math/tables.h
@@ -78,7 +78,6 @@ CLC_TABLE_FUNCTION_DECL(float, exp_tbl_ep_tail);
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-TABLE_FUNCTION_DECL(double2, ln_tbl);
 CLC_TABLE_FUNCTION_DECL(double, ln_tbl_lo);
 CLC_TABLE_FUNCTION_DECL(double, ln_tbl_hi);
 CLC_TABLE_FUNCTION_DECL(double, atan_jby256_tbl_head);
diff --git a/libclc/clc/lib/generic/math/clc_log_base.h b/libclc/clc/lib/generic/math/clc_log_base.h
index 7d084c037f67e..9418535db827d 100644
--- a/libclc/clc/lib/generic/math/clc_log_base.h
+++ b/libclc/clc/lib/generic/math/clc_log_base.h
@@ -261,9 +261,8 @@ __clc_log(double x)
   int index = __clc_as_int2(ux).hi >> 13;
   index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1);
 
-  double2 tv = USE_TABLE(ln_tbl, index - 64);
-  double z1 = tv.s0;
-  double q = tv.s1;
+  double z1 = USE_TABLE(ln_tbl_lo, index - 64);
+  double q = USE_TABLE(ln_tbl_hi, index - 64);
 
   double f1 = index * 0x1.0p-7;
   double f2 = f - f1;

From c180e249d0013474d502cd779ec65b33cf7e9468 Mon Sep 17 00:00:00 2001
From: 3405691582
Date: Mon, 31 Mar 2025 12:17:55 -0400
Subject: [PATCH 0108/1029] Fix crash lowering stack guard on OpenBSD/aarch64.
 (#125416)

TargetLoweringBase::getIRStackGuard refers to a platform-specific guard
variable. Before this change, TargetLoweringBase::getSDagStackGuard
referred to a different variable, so SelectionDAGBuilder's
getLoadStackGuard did not get memory operands. However,
AArch64InstrInfo::expandPostRAPseudo assumes that the passed
MachineInstr has nonzero memoperands, causing a segfault.

We have two possible options here: either disable the LOAD_STACK_GUARD
node entirely in AArch64TargetLowering::useLoadStackGuardNode, or make
the platform-specific values match across TargetLoweringBase. Here, we
try the latter.
---
 llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 3da66a4113334..91ae9040581ee 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2005,6 +2005,9 @@ void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
 // Currently only support "standard" __stack_chk_guard.
 // TODO: add LOAD_STACK_GUARD support.
 Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {
+  if (getTargetMachine().getTargetTriple().isOSOpenBSD()) {
+    return M.getNamedValue("__guard_local");
+  }
   return M.getNamedValue("__stack_chk_guard");
 }
 
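[Editorial note: for context on the fix above, the IR-level hook already special-cases OpenBSD, and the change makes the SelectionDAG-level hook agree with it. The following is a rough, paraphrased sketch of that pre-existing IR-level logic, not the verbatim upstream code; the real TargetLoweringBase::getIRStackGuard also sets hidden visibility on the global and may differ in detail:]

```
// Sketch only, paraphrased from memory of TargetLoweringBase::getIRStackGuard.
Value *TargetLoweringBase::getIRStackGuard(IRBuilderBase &IRB) const {
  if (getTargetMachine().getTargetTriple().isOSOpenBSD()) {
    Module &M = *IRB.GetInsertBlock()->getParent()->getParent();
    PointerType *PtrTy = PointerType::getUnqual(M.getContext());
    // OpenBSD provides __guard_local rather than __stack_chk_guard, which is
    // why getSDagStackGuard must special-case it as well: both hooks have to
    // name the same symbol for the guard load to get its memory operands.
    return M.getOrInsertGlobal("__guard_local", PtrTy);
  }
  return nullptr; // Other targets fall back to __stack_chk_guard.
}
```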
From 66b0b0466bbd995146aadaf2cd18de5476c19941 Mon Sep 17 00:00:00 2001
From: Han-Chung Wang
Date: Mon, 31 Mar 2025 09:29:54 -0700
Subject: [PATCH 0109/1029] [MLIR][NFC] Fix incomplete boundary comments.
 (#133516)

I observed that we have boundary comments in the codebase like:

```
//===----------------------------------------------------------------------===//
// ...
//===----------------------------------------------------------------------===//
```

I also observed that some boundary comments are incomplete, like:

```
//===----------------------------------------------------------------------===//
// ...
...
```

The revision is generated by a script that completes the boundary
comments.

Signed-off-by: hanhanW
---
 mlir/include/mlir-c/Rewrite.h                  |  2 ++
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td   |  4 ++++
 .../mlir/Dialect/PDLInterp/IR/PDLInterpOps.td  |  2 ++
 mlir/include/mlir/IR/AsmState.h                |  2 ++
 mlir/include/mlir/IR/BuiltinAttributes.h       |  2 ++
 mlir/include/mlir/IR/BuiltinTypes.td           | 18 ++++++++++++++++++
 mlir/include/mlir/IR/CommonAttrConstraints.td  |  6 ++++++
 mlir/include/mlir/IR/CommonTypeConstraints.td  |  2 ++
 mlir/include/mlir/IR/EnumAttr.td               |  1 +
 mlir/include/mlir/IR/OpDefinition.h            | 10 ++++++++++
 mlir/include/mlir/IR/Properties.td             |  3 +++
 mlir/include/mlir/IR/TypeRange.h               |  3 +++
 mlir/include/mlir/IR/ValueRange.h              |  6 ++++++
 .../mlir/Interfaces/SideEffectInterfaces.td    |  1 +
 mlir/include/mlir/Pass/PassOptions.h           |  3 +++
 mlir/include/mlir/Tools/PDLL/AST/Nodes.h       | 10 ++++++++++
 mlir/include/mlir/Tools/PDLL/AST/Types.h       |  2 ++
 mlir/lib/Analysis/CallGraph.cpp                |  1 +
 mlir/lib/AsmParser/AsmParserState.cpp          |  2 ++
 mlir/lib/AsmParser/Parser.cpp                  |  2 ++
 mlir/lib/Bytecode/Reader/BytecodeReader.cpp    |  5 +++++
 mlir/lib/Bytecode/Writer/BytecodeWriter.cpp    |  6 ++++++
 mlir/lib/CAPI/Transforms/Rewrite.cpp           |  2 ++
 .../Conversion/PDLToPDLInterp/Predicate.cpp    |  4 ++++
 mlir/lib/Conversion/PDLToPDLInterp/Predicate.h | 14 ++++++++++++++
 .../Conversion/PDLToPDLInterp/PredicateTree.h  |  4 ++++
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp     |  1 +
 mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp       |  2 ++
 mlir/lib/IR/BuiltinAttributes.cpp              |  5 +++++
 mlir/lib/IR/BuiltinDialectBytecode.cpp         |  1 +
 mlir/lib/IR/OperationSupport.cpp               |  6 ++++++
 mlir/lib/IR/PDL/PDLPatternMatch.cpp            |  1 +
 mlir/lib/IR/PatternMatch.cpp                   |  5 +++++
 mlir/lib/IR/SymbolTable.cpp                    |  4 ++++
 mlir/lib/IR/TypeRange.cpp                      |  1 +
 mlir/lib/IR/ValueRange.cpp                     |  4 ++++
 mlir/lib/Pass/PassRegistry.cpp                 |  3 +++
 mlir/lib/Rewrite/ByteCode.cpp                  |  2 ++
 mlir/lib/TableGen/CodeGenHelpers.cpp           |  3 +++
 mlir/lib/Tools/PDLL/AST/Nodes.cpp              |  3 +++
 mlir/lib/Tools/PDLL/AST/Types.cpp              |  2 ++
 mlir/lib/Tools/PDLL/Parser/Parser.cpp          |  7 +++++++
 mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp   |  8 ++++++++
 .../Tools/mlir-pdll-lsp-server/LSPServer.cpp   | 10 ++++++++++
 mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp |  5 +++++
 .../lib/Transforms/Utils/DialectConversion.cpp |  7 +++++++
 mlir/lib/Transforms/Utils/Inliner.cpp          |  1 +
 mlir/lib/Transforms/Utils/RegionUtils.cpp      |  2 ++
 mlir/test/lib/Dialect/Test/TestEnumDefs.td     |  1 +
 mlir/test/lib/Dialect/Test/TestOpDefs.cpp      |  8 ++++++++
 mlir/test/lib/Dialect/Test/TestOps.td          | 10 ++++++++++
 mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp   |  3 +++
 mlir/test/lib/Dialect/Test/TestOpsSyntax.td    |  4 ++++
 mlir/test/lib/Dialect/Test/TestPatterns.cpp    |  2 ++
 mlir/test/mlir-tblgen/op-format-invalid.td     | 10 ++++++++++
 mlir/test/mlir-tblgen/op-format-spec.td        |  8 ++++++++
 mlir/test/mlir-tblgen/op-format.td             |  1 +
 mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp    |  5 +++++
 mlir/tools/mlir-tblgen/FormatGen.cpp           |  1 +
 mlir/tools/mlir-tblgen/OpFormatGen.cpp         |  4 ++++
 60 files changed, 257 insertions(+)

diff --git a/mlir/include/mlir-c/Rewrite.h b/mlir/include/mlir-c/Rewrite.h
index d8f2275b61532..61d3446317550 100644
--- a/mlir/include/mlir-c/Rewrite.h
+++ b/mlir/include/mlir-c/Rewrite.h
@@ -48,6 +48,7 @@ mlirRewriterBaseGetContext(MlirRewriterBase rewriter);
 
 //===----------------------------------------------------------------------===//
 /// Insertion points methods
+//===----------------------------------------------------------------------===//
 
 // These do not include functions using Block::iterator or Region::iterator, as
 // they are not exposed by the C API yet. Similarly for methods using
@@ -101,6 +102,7 @@ mlirRewriterBaseGetBlock(MlirRewriterBase rewriter);
 
 //===----------------------------------------------------------------------===//
 /// Block and operation creation/insertion/cloning
+//===----------------------------------------------------------------------===//
 
 // These functions do not include the IRMapper, as it is not yet exposed by the
 // C API.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 18177b9e24f7d..8945466f5ef5b 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -145,6 +145,7 @@ class ROCDL_DimGetterFunctionOp
   : ROCDL_IntrPure1Op<"mbcnt." # mnemonic>,
@@ -205,6 +206,7 @@ def ROCDL_ReadlaneOp : ROCDL_IntrOp<"readlane", [], [0], [AllTypesMatch<["res",
 
 //===----------------------------------------------------------------------===//
 // Thread index and Block index
+//===----------------------------------------------------------------------===//
 
 def ROCDL_ThreadIdXOp : ROCDL_SpecialIdRegisterOp<"workitem.id.x">;
 def ROCDL_ThreadIdYOp : ROCDL_SpecialIdRegisterOp<"workitem.id.y">;
@@ -216,6 +218,7 @@ def ROCDL_BlockIdZOp : ROCDL_SpecialIdRegisterOp<"workgroup.id.z">;
 
 //===----------------------------------------------------------------------===//
 // Thread range and Block range
+//===----------------------------------------------------------------------===//
 
 def ROCDL_BlockDimXOp : ROCDL_DimGetterFunctionOp<"workgroup.dim.x",
                                                   "__ockl_get_local_size", 0>;
@@ -237,6 +240,7 @@ def ROCDL_GridDimZOp : ROCDL_DimGetterFunctionOp<"grid.dim.z",
 
 //===----------------------------------------------------------------------===//
 // Synchronization primitives
+//===----------------------------------------------------------------------===//
 
 // Emits the waintcnt instruction. The bitfield's semantics depend
 // on the target chipset
diff --git a/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td b/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td
index 901acc0e6733b..b8ad86b62e827 100644
--- a/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td
+++ b/mlir/include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td
@@ -60,6 +60,7 @@ class PDLInterp_Op traits = []> :
 
 //===----------------------------------------------------------------------===//
 // PDLInterp_PredicateOp
+//===----------------------------------------------------------------------===//
 
 // Check operations evaluate a predicate on a positional value and then
 // conditionally branch on the result.
@@ -70,6 +71,7 @@ class PDLInterp_PredicateOp traits = []> :
 
 //===----------------------------------------------------------------------===//
 // PDLInterp_SwitchOp
+//===----------------------------------------------------------------------===//
 
 // Switch operations evaluate a predicate on a positional value and then
 // conditionally branch on the result.
diff --git a/mlir/include/mlir/IR/AsmState.h b/mlir/include/mlir/IR/AsmState.h
index edbd3bb6fc15d..5e9311742bd94 100644
--- a/mlir/include/mlir/IR/AsmState.h
+++ b/mlir/include/mlir/IR/AsmState.h
@@ -81,6 +81,7 @@ class AsmStateImpl;
 
 //===----------------------------------------------------------------------===//
 // Resource Entry
+//===----------------------------------------------------------------------===//
 
 class HeapAsmResourceBlob;
 
@@ -330,6 +331,7 @@ class AsmParsedResourceEntry {
 
 //===----------------------------------------------------------------------===//
 // Resource Parser/Printer
+//===----------------------------------------------------------------------===//
 
 /// This class represents an instance of a resource parser. This class should be
 /// implemented by non-dialect clients that want to inject additional resources
diff --git a/mlir/include/mlir/IR/BuiltinAttributes.h b/mlir/include/mlir/IR/BuiltinAttributes.h
index 901df3a25a46f..67fab7ebc13ba 100644
--- a/mlir/include/mlir/IR/BuiltinAttributes.h
+++ b/mlir/include/mlir/IR/BuiltinAttributes.h
@@ -717,6 +717,7 @@ using DenseResourceElementsHandle = DialectResourceBlobHandle;
 namespace mlir {
 //===----------------------------------------------------------------------===//
 // DenseArrayAttr
+//===----------------------------------------------------------------------===//
 
 namespace detail {
 /// Base class for DenseArrayAttr that is instantiated and specialized for each
@@ -772,6 +773,7 @@ using DenseF64ArrayAttr = detail::DenseArrayAttrImpl;
 
 //===----------------------------------------------------------------------===//
 // DenseResourceElementsAttr
+//===----------------------------------------------------------------------===//
 
 namespace detail {
 /// Base class for DenseResourceElementsAttr that is instantiated and
diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td
index af474b3e3ec47..bcd76d928cf83 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.td
+++ b/mlir/include/mlir/IR/BuiltinTypes.td
@@ -98,6 +98,7 @@ class Builtin_CachedFloatType
 {
   let summary = "8-bit floating point with 2 bit mantissa";
@@ -119,6 +120,7 @@ def Builtin_Float8E5M2 : Builtin_FloatType<"Float8E5M2", "f8E5M2"> {
 //===----------------------------------------------------------------------===//
 // Float8E4M3Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float8E4M3 : Builtin_FloatType<"Float8E4M3", "f8E4M3"> {
   let summary = "8-bit floating point with 3 bit mantissa";
@@ -138,6 +140,7 @@ def Builtin_Float8E4M3 : Builtin_FloatType<"Float8E4M3", "f8E4M3"> {
 //===----------------------------------------------------------------------===//
 // Float8E4M3FNType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float8E4M3FN : Builtin_FloatType<"Float8E4M3FN", "f8E4M3FN"> {
   let summary = "8-bit floating point with 3 bit mantissa";
@@ -160,6 +163,7 @@ def Builtin_Float8E4M3FN : Builtin_FloatType<"Float8E4M3FN", "f8E4M3FN"> {
 //===----------------------------------------------------------------------===//
 // Float8E5M2FNUZType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float8E5M2FNUZ : Builtin_FloatType<"Float8E5M2FNUZ", "f8E5M2FNUZ"> {
   let summary = "8-bit floating point with 2 bit mantissa";
@@ -182,6 +186,7 @@ def Builtin_Float8E5M2FNUZ : Builtin_FloatType<"Float8E5M2FNUZ", "f8E5M2FNUZ"> {
 //===----------------------------------------------------------------------===//
 // Float8E4M3FNUZType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float8E4M3FNUZ : Builtin_FloatType<"Float8E4M3FNUZ", "f8E4M3FNUZ"> {
   let summary = "8-bit floating point with 3 bit mantissa";
@@ -204,6 +209,7 @@ def Builtin_Float8E4M3FNUZ : Builtin_FloatType<"Float8E4M3FNUZ", "f8E4M3FNUZ"> {
 //===----------------------------------------------------------------------===//
 // Float8E4M3B11FNUZType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float8E4M3B11FNUZ : Builtin_FloatType<"Float8E4M3B11FNUZ", "f8E4M3B11FNUZ"> {
   let summary = "8-bit floating point with 3 bit mantissa";
@@ -226,6 +232,7 @@ def Builtin_Float8E4M3B11FNUZ : Builtin_FloatType<"Float8E4M3B11FNUZ", "f8E4M3B1
 //===----------------------------------------------------------------------===//
 // Float8E3M4Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float8E3M4 : Builtin_FloatType<"Float8E3M4", "f8E3M4"> {
   let summary = "8-bit floating point with 3 bits exponent and 4 bit mantissa";
@@ -245,6 +252,7 @@ def Builtin_Float8E3M4 : Builtin_FloatType<"Float8E3M4", "f8E3M4"> {
 //===----------------------------------------------------------------------===//
 // Float4E2M1FNType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float4E2M1FN : Builtin_FloatType<"Float4E2M1FN", "f4E2M1FN"> {
   let summary = "4-bit floating point with 2-bit exponent and 1-bit mantissa";
@@ -266,6 +274,7 @@ def Builtin_Float4E2M1FN : Builtin_FloatType<"Float4E2M1FN", "f4E2M1FN"> {
 //===----------------------------------------------------------------------===//
 // Float6E2M3FNType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float6E2M3FN : Builtin_FloatType<"Float6E2M3FN", "f6E2M3FN"> {
   let summary = "6-bit floating point with 2-bit exponent and 3-bit mantissa";
@@ -287,6 +296,7 @@ def Builtin_Float6E2M3FN : Builtin_FloatType<"Float6E2M3FN", "f6E2M3FN"> {
 //===----------------------------------------------------------------------===//
 // Float6E3M2FNType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float6E3M2FN : Builtin_FloatType<"Float6E3M2FN", "f6E3M2FN"> {
   let summary = "6-bit floating point with 3-bit exponent and 2-bit mantissa";
@@ -308,6 +318,7 @@ def Builtin_Float6E3M2FN : Builtin_FloatType<"Float6E3M2FN", "f6E3M2FN"> {
 //===----------------------------------------------------------------------===//
 // Float8E8M0FNUType
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float8E8M0FNU : Builtin_FloatType<"Float8E8M0FNU", "f8E8M0FNU"> {
   let summary = "8-bit floating point with 8-bit exponent, no mantissa or sign";
@@ -331,6 +342,7 @@ def Builtin_Float8E8M0FNU : Builtin_FloatType<"Float8E8M0FNU", "f8E8M0FNU"> {
 //===----------------------------------------------------------------------===//
 // BFloat16Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_BFloat16 : Builtin_CachedFloatType<"BFloat16", "bf16",
     /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> {
@@ -339,6 +351,7 @@ def Builtin_BFloat16 : Builtin_CachedFloatType<"BFloat16", "bf16",
 // Float16Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float16 : Builtin_CachedFloatType<"Float16", "f16",
     /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> {
@@ -347,6 +360,7 @@ def Builtin_Float16 : Builtin_CachedFloatType<"Float16", "f16",
 //===----------------------------------------------------------------------===//
 // FloatTF32Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_FloatTF32 : Builtin_CachedFloatType<"FloatTF32", "tf32"> {
   let summary = "TF32 floating-point type";
@@ -354,6 +368,7 @@ def Builtin_FloatTF32 : Builtin_CachedFloatType<"FloatTF32", "tf32"> {
 //===----------------------------------------------------------------------===//
 // Float32Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float32 : Builtin_CachedFloatType<"Float32", "f32",
     /*declaredInterfaceMethods=*/["scaleElementBitwidth"]> {
@@ -362,6 +377,7 @@ def Builtin_Float32 : Builtin_CachedFloatType<"Float32", "f32",
 //===----------------------------------------------------------------------===//
 // Float64Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float64 : Builtin_CachedFloatType<"Float64", "f64"> {
   let summary = "64-bit floating-point type";
@@ -369,6 +385,7 @@ def Builtin_Float64 : Builtin_CachedFloatType<"Float64", "f64"> {
 //===----------------------------------------------------------------------===//
 // Float80Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float80 : Builtin_CachedFloatType<"Float80", "f80"> {
   let summary = "80-bit floating-point type";
@@ -376,6 +393,7 @@ def Builtin_Float80 : Builtin_CachedFloatType<"Float80", "f80"> {
 //===----------------------------------------------------------------------===//
 // Float128Type
+//===----------------------------------------------------------------------===//
 
 def Builtin_Float128 : Builtin_CachedFloatType<"Float128", "f128"> {
   let summary = "128-bit floating-point type";
diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td
index 2beb1e8110afe..e91a13fea5c7f 100644
--- a/mlir/include/mlir/IR/CommonAttrConstraints.td
+++ b/mlir/include/mlir/IR/CommonAttrConstraints.td
@@ -23,6 +23,7 @@ include "mlir/IR/DialectBase.td"
 
 //===----------------------------------------------------------------------===//
 // Base attribute definition
+//===----------------------------------------------------------------------===//
 
 // Base class for all attributes.
 class Attr :
@@ -91,6 +92,7 @@ class DialectAttr :
 //===----------------------------------------------------------------------===//
 // Attribute modifier definition
+//===----------------------------------------------------------------------===//
 
 // Decorates an attribute to have an (unvalidated) default value if not present.
class DefaultValuedAttr : @@ -150,6 +152,7 @@ class DefaultValuedOptionalStrAttr //===----------------------------------------------------------------------===// // Primitive attribute kinds +//===----------------------------------------------------------------------===// // A generic attribute that must be constructed around a specific buildable type // `attrValType`. Backed by MLIR attribute kind `attrKind`. @@ -395,6 +398,7 @@ def UnitAttr : Attr($_self)">, "unit attrib //===----------------------------------------------------------------------===// // Composite attribute kinds +//===----------------------------------------------------------------------===// class DictionaryAttrBase : Attr { @@ -681,6 +685,7 @@ def FlatSymbolRefArrayAttr : //===----------------------------------------------------------------------===// // Derive attribute kinds +//===----------------------------------------------------------------------===// // DerivedAttr are attributes whose value is computed from properties // of the operation. They do not require additional storage and are @@ -714,6 +719,7 @@ class DerivedTypeAttr : DerivedAttr<"::mlir::Type", body> { //===----------------------------------------------------------------------===// // Constant attribute kinds +//===----------------------------------------------------------------------===// // Represents a constant attribute of specific Attr type. A constant // attribute can be specified only of attributes that have a constant diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td index 601517717978e..e6f17ded4628b 100644 --- a/mlir/include/mlir/IR/CommonTypeConstraints.td +++ b/mlir/include/mlir/IR/CommonTypeConstraints.td @@ -679,6 +679,7 @@ def AnyShaped: ShapedContainerType<[AnyType], IsShapedTypePred, "shaped", //===----------------------------------------------------------------------===// // Tensor types. +//===----------------------------------------------------------------------===// // Unranked tensor type whose element type is from the given `allowedTypes` // list, and which additionally satisfies an optional list of predicates. @@ -755,6 +756,7 @@ def AnyStaticShapeTensor : StaticShapeTensorOf<[AnyType]>; //===----------------------------------------------------------------------===// // Memref type. +//===----------------------------------------------------------------------===// // Any unranked memref whose element type is from the given `allowedTypes` list. class UnrankedMemRefOf allowedTypes> : diff --git a/mlir/include/mlir/IR/EnumAttr.td b/mlir/include/mlir/IR/EnumAttr.td index e5406546b1950..931126a155fbb 100644 --- a/mlir/include/mlir/IR/EnumAttr.td +++ b/mlir/include/mlir/IR/EnumAttr.td @@ -13,6 +13,7 @@ include "mlir/IR/AttrTypeBase.td" //===----------------------------------------------------------------------===// // Enum attribute kinds +//===----------------------------------------------------------------------===// // Additional information for an enum case. 
class EnumCase { diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h index 4fad61580b31a..237d48756c749 100644 --- a/mlir/include/mlir/IR/OpDefinition.h +++ b/mlir/include/mlir/IR/OpDefinition.h @@ -385,6 +385,7 @@ class TraitBase { //===----------------------------------------------------------------------===// // Operand Traits +//===----------------------------------------------------------------------===// namespace detail { /// Utility trait base that provides accessors for derived traits that have @@ -514,6 +515,7 @@ class VariadicOperands //===----------------------------------------------------------------------===// // Region Traits +//===----------------------------------------------------------------------===// /// This class provides verification for ops that are known to have zero /// regions. @@ -606,6 +608,7 @@ class VariadicRegions //===----------------------------------------------------------------------===// // Result Traits +//===----------------------------------------------------------------------===// /// This class provides return value APIs for ops that are known to have /// zero results. @@ -757,6 +760,7 @@ class VariadicResults //===----------------------------------------------------------------------===// // Terminator Traits +//===----------------------------------------------------------------------===// /// This class indicates that the regions associated with this op don't have /// terminators. @@ -868,6 +872,7 @@ class VariadicSuccessors //===----------------------------------------------------------------------===// // SingleBlock +//===----------------------------------------------------------------------===// /// This class provides APIs and verifiers for ops with regions having a single /// block. @@ -949,6 +954,7 @@ struct SingleBlock : public TraitBase { //===----------------------------------------------------------------------===// // SingleBlockImplicitTerminator +//===----------------------------------------------------------------------===// /// This class provides APIs and verifiers for ops with regions having a single /// block that must terminate with `TerminatorOpType`. @@ -1034,6 +1040,7 @@ struct hasSingleBlockImplicitTerminator { //===----------------------------------------------------------------------===// // Misc Traits +//===----------------------------------------------------------------------===// /// This class provides verification for ops that are known to have the same /// operand shape: all operands are scalars, vectors/tensors of the same @@ -1514,6 +1521,7 @@ bool hasElementwiseMappableTraits(Operation *op); namespace op_definition_impl { //===----------------------------------------------------------------------===// // Trait Existence +//===----------------------------------------------------------------------===// /// Returns true if this given Trait ID matches the IDs of any of the provided /// trait types `Traits`. @@ -1532,6 +1540,7 @@ inline bool hasTrait<>(TypeID traitID) { //===----------------------------------------------------------------------===// // Trait Folding +//===----------------------------------------------------------------------===// /// Trait to check if T provides a 'foldTrait' method for single result /// operations. 
@@ -1604,6 +1613,7 @@ static LogicalResult foldTraits(Operation *op, ArrayRef operands, //===----------------------------------------------------------------------===// // Trait Verification +//===----------------------------------------------------------------------===// /// Trait to check if T provides a `verifyTrait` method. template diff --git a/mlir/include/mlir/IR/Properties.td b/mlir/include/mlir/IR/Properties.td index 212b85876c8df..8bd8343790402 100644 --- a/mlir/include/mlir/IR/Properties.td +++ b/mlir/include/mlir/IR/Properties.td @@ -210,6 +210,7 @@ defvar writeMlirBytecodeWithConvertToAttribute = [{ //===----------------------------------------------------------------------===// // Primitive property kinds +//===----------------------------------------------------------------------===// // Any kind of integer stored as properties. class IntProp : @@ -360,6 +361,7 @@ def UnitProperty : _cls_UnitProp, Deprecated<"moved to shorter name UnitProp">; //===----------------------------------------------------------------------===// // Property field overwrites +//===----------------------------------------------------------------------===// /// Class for giving a property a default value. /// This doesn't change anything about the property other than giving it a default @@ -419,6 +421,7 @@ class ConfinedProperty //===----------------------------------------------------------------------===// // Primitive property combinators +//===----------------------------------------------------------------------===// /// Create a variable named `name` of `prop`'s storage type that is initialized /// to the correct default value, if there is one. diff --git a/mlir/include/mlir/IR/TypeRange.h b/mlir/include/mlir/IR/TypeRange.h index 9c2fbb3884188..3fb58d78617c0 100644 --- a/mlir/include/mlir/IR/TypeRange.h +++ b/mlir/include/mlir/IR/TypeRange.h @@ -23,6 +23,7 @@ namespace mlir { //===----------------------------------------------------------------------===// // TypeRange +//===----------------------------------------------------------------------===// /// This class provides an abstraction over the various different ranges of /// value types. In many cases, this prevents the need to explicitly materialize @@ -82,6 +83,7 @@ inline raw_ostream &operator<<(raw_ostream &os, const TypeRange &types) { //===----------------------------------------------------------------------===// // TypeRangeRange +//===----------------------------------------------------------------------===// using TypeRangeRangeIterator = llvm::mapped_iterator::iterator, @@ -111,6 +113,7 @@ class TypeRangeRange : public llvm::iterator_range { //===----------------------------------------------------------------------===// // ValueTypeRange +//===----------------------------------------------------------------------===// /// This class implements iteration on the types of a given range of values. template diff --git a/mlir/include/mlir/IR/ValueRange.h b/mlir/include/mlir/IR/ValueRange.h index a807b77ad077f..0c33e2b596b98 100644 --- a/mlir/include/mlir/IR/ValueRange.h +++ b/mlir/include/mlir/IR/ValueRange.h @@ -36,6 +36,7 @@ class MutableOperandRangeRange; //===----------------------------------------------------------------------===// // OperandRange +//===----------------------------------------------------------------------===// /// This class implements the operand iterators for the Operation class. 
class OperandRange final : public llvm::detail::indexed_accessor_range_base< @@ -73,6 +74,7 @@ class OperandRange final : public llvm::detail::indexed_accessor_range_base< //===----------------------------------------------------------------------===// // OperandRangeRange +//===----------------------------------------------------------------------===// /// This class represents a contiguous range of operand ranges, e.g. from a /// VariadicOfVariadic operand group. @@ -109,6 +111,7 @@ class OperandRangeRange final //===----------------------------------------------------------------------===// // MutableOperandRange +//===----------------------------------------------------------------------===// /// This class provides a mutable adaptor for a range of operands. It allows for /// setting, inserting, and erasing operands from the given range. @@ -196,6 +199,7 @@ class MutableOperandRange { //===----------------------------------------------------------------------===// // MutableOperandRangeRange +//===----------------------------------------------------------------------===// /// This class represents a contiguous range of mutable operand ranges, e.g. /// from a VariadicOfVariadic operand group. @@ -235,6 +239,7 @@ class MutableOperandRangeRange final //===----------------------------------------------------------------------===// // ResultRange +//===----------------------------------------------------------------------===// /// This class implements the result iterators for the Operation class. class ResultRange final @@ -368,6 +373,7 @@ class ResultRange::UseIterator final //===----------------------------------------------------------------------===// // ValueRange +//===----------------------------------------------------------------------===// /// This class provides an abstraction over the different types of ranges over /// Values. In many cases, this prevents the need to explicitly materialize a diff --git a/mlir/include/mlir/Interfaces/SideEffectInterfaces.td b/mlir/include/mlir/Interfaces/SideEffectInterfaces.td index b2ab4fee9d29c..b292174fccb36 100644 --- a/mlir/include/mlir/Interfaces/SideEffectInterfaces.td +++ b/mlir/include/mlir/Interfaces/SideEffectInterfaces.td @@ -45,6 +45,7 @@ class MemoryEffects effects = []> //===----------------------------------------------------------------------===// // Effects +//===----------------------------------------------------------------------===// // The following effect indicates that the operation allocates from some // resource. 
An 'allocate' effect implies only allocation of the resource, and diff --git a/mlir/include/mlir/Pass/PassOptions.h b/mlir/include/mlir/Pass/PassOptions.h index b5a9c25e3baca..68588279e2f5a 100644 --- a/mlir/include/mlir/Pass/PassOptions.h +++ b/mlir/include/mlir/Pass/PassOptions.h @@ -406,6 +406,7 @@ namespace llvm { namespace cl { //===----------------------------------------------------------------------===// // std::vector+SmallVector +//===----------------------------------------------------------------------===// namespace detail { template @@ -470,6 +471,7 @@ class parser> //===----------------------------------------------------------------------===// // OpPassManager: OptionValue +//===----------------------------------------------------------------------===// template <> struct OptionValue final : GenericOptionValue { @@ -514,6 +516,7 @@ struct OptionValue final : GenericOptionValue { //===----------------------------------------------------------------------===// // OpPassManager: Parser +//===----------------------------------------------------------------------===// extern template class basic_parser; diff --git a/mlir/include/mlir/Tools/PDLL/AST/Nodes.h b/mlir/include/mlir/Tools/PDLL/AST/Nodes.h index aed2562e4d30d..f174ac2f476f6 100644 --- a/mlir/include/mlir/Tools/PDLL/AST/Nodes.h +++ b/mlir/include/mlir/Tools/PDLL/AST/Nodes.h @@ -247,6 +247,7 @@ class OpRewriteStmt : public Stmt { //===----------------------------------------------------------------------===// // EraseStmt +//===----------------------------------------------------------------------===// /// This statement represents the `erase` statement in PDLL. This statement /// erases the given root operation, corresponding roughly to the @@ -261,6 +262,7 @@ class EraseStmt final : public Node::NodeBase { //===----------------------------------------------------------------------===// // ReplaceStmt +//===----------------------------------------------------------------------===// /// This statement represents the `replace` statement in PDLL. This statement /// replaces the given root operation with a set of values, corresponding roughly @@ -292,6 +294,7 @@ class ReplaceStmt final : public Node::NodeBase, //===----------------------------------------------------------------------===// // RewriteStmt +//===----------------------------------------------------------------------===// /// This statement represents an operation rewrite that contains a block of /// nested rewrite commands. This allows for building more complex operation @@ -478,6 +481,7 @@ class MemberAccessExpr : public Node::NodeBase { //===----------------------------------------------------------------------===// // AllResultsMemberAccessExpr +//===----------------------------------------------------------------------===// /// This class represents an instance of MemberAccessExpr that references all /// results of an operation. @@ -742,6 +746,7 @@ class CoreConstraintDecl : public ConstraintDecl { //===----------------------------------------------------------------------===// // AttrConstraintDecl +//===----------------------------------------------------------------------===// /// The class represents an Attribute constraint, and constrains a variable to /// be an Attribute.
@@ -765,6 +770,7 @@ class AttrConstraintDecl //===----------------------------------------------------------------------===// // OpConstraintDecl +//===----------------------------------------------------------------------===// /// The class represents an Operation constraint, and constrains a variable to /// be an Operation. @@ -790,6 +796,7 @@ class OpConstraintDecl //===----------------------------------------------------------------------===// // TypeConstraintDecl +//===----------------------------------------------------------------------===// /// The class represents a Type constraint, and constrains a variable to be a /// Type. @@ -804,6 +811,7 @@ class TypeConstraintDecl //===----------------------------------------------------------------------===// // TypeRangeConstraintDecl +//===----------------------------------------------------------------------===// /// The class represents a TypeRange constraint, and constrains a variable to be /// a TypeRange. @@ -818,6 +826,7 @@ class TypeRangeConstraintDecl //===----------------------------------------------------------------------===// // ValueConstraintDecl +//===----------------------------------------------------------------------===// /// The class represents a Value constraint, and constrains a variable to be a /// Value. @@ -840,6 +849,7 @@ class ValueConstraintDecl //===----------------------------------------------------------------------===// // ValueRangeConstraintDecl +//===----------------------------------------------------------------------===// /// The class represents a ValueRange constraint, and constrains a variable to /// be a ValueRange. diff --git a/mlir/include/mlir/Tools/PDLL/AST/Types.h b/mlir/include/mlir/Tools/PDLL/AST/Types.h index 89c8e193ddc32..08d15bd764dfe 100644 --- a/mlir/include/mlir/Tools/PDLL/AST/Types.h +++ b/mlir/include/mlir/Tools/PDLL/AST/Types.h @@ -198,6 +198,7 @@ class RangeType : public Type::TypeBase { //===----------------------------------------------------------------------===// // TypeRangeType +//===----------------------------------------------------------------------===// /// This class represents a PDLL type that corresponds to an mlir::TypeRange. class TypeRangeType : public RangeType { @@ -213,6 +214,7 @@ class TypeRangeType : public RangeType { //===----------------------------------------------------------------------===// // ValueRangeType +//===----------------------------------------------------------------------===// /// This class represents a PDLL type that corresponds to an mlir::ValueRange. class ValueRangeType : public RangeType { diff --git a/mlir/lib/Analysis/CallGraph.cpp b/mlir/lib/Analysis/CallGraph.cpp index 780c7caee767c..d6fe62d8e58d6 100644 --- a/mlir/lib/Analysis/CallGraph.cpp +++ b/mlir/lib/Analysis/CallGraph.cpp @@ -173,6 +173,7 @@ void CallGraph::eraseNode(CallGraphNode *node) { //===----------------------------------------------------------------------===// // Printing +//===----------------------------------------------------------------------===// /// Dump the graph in a human readable format. 
void CallGraph::dump() const { print(llvm::errs()); } diff --git a/mlir/lib/AsmParser/AsmParserState.cpp b/mlir/lib/AsmParser/AsmParserState.cpp index 9b2b686aee782..2684b42516670 100644 --- a/mlir/lib/AsmParser/AsmParserState.cpp +++ b/mlir/lib/AsmParser/AsmParserState.cpp @@ -119,6 +119,7 @@ AsmParserState &AsmParserState::operator=(AsmParserState &&other) { //===----------------------------------------------------------------------===// // Access State +//===----------------------------------------------------------------------===// auto AsmParserState::getBlockDefs() const -> iterator_range { return llvm::make_pointee_range(llvm::ArrayRef(impl->blocks)); @@ -216,6 +217,7 @@ SMRange AsmParserState::convertIdLocToRange(SMLoc loc) { //===----------------------------------------------------------------------===// // Populate State +//===----------------------------------------------------------------------===// void AsmParserState::initialize(Operation *topLevelOp) { startOperationDefinition(topLevelOp->getName()); diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp index 168231af9b410..6240fdc833501 100644 --- a/mlir/lib/AsmParser/Parser.cpp +++ b/mlir/lib/AsmParser/Parser.cpp @@ -435,6 +435,7 @@ ParseResult Parser::parseOptionalKeywordOrString(std::string *result) { //===----------------------------------------------------------------------===// // Resource Parsing +//===----------------------------------------------------------------------===// FailureOr Parser::parseResourceHandle(const OpAsmDialectInterface *dialect, @@ -478,6 +479,7 @@ Parser::parseResourceHandle(Dialect *dialect) { //===----------------------------------------------------------------------===// // Code Completion +//===----------------------------------------------------------------------===// ParseResult Parser::codeCompleteDialectName() { state.codeCompleteContext->completeDialectName(); diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp index 1204f1c069b1e..0f2057cb32ff1 100644 --- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp +++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp @@ -1733,6 +1733,7 @@ LogicalResult BytecodeReader::Impl::parseVersion(EncodingReader &reader) { //===----------------------------------------------------------------------===// // Dialect Section +//===----------------------------------------------------------------------===// LogicalResult BytecodeDialect::load(const DialectReader &reader, MLIRContext *ctx) { @@ -1874,6 +1875,7 @@ BytecodeReader::Impl::parseOpName(EncodingReader &reader, //===----------------------------------------------------------------------===// // Resource Section +//===----------------------------------------------------------------------===// LogicalResult BytecodeReader::Impl::parseResourceSection( EncodingReader &reader, std::optional> resourceData, @@ -1902,6 +1904,7 @@ LogicalResult BytecodeReader::Impl::parseResourceSection( //===----------------------------------------------------------------------===// // UseListOrder Helpers +//===----------------------------------------------------------------------===// FailureOr BytecodeReader::Impl::parseUseListOrderForRange(EncodingReader &reader, @@ -2060,6 +2063,7 @@ LogicalResult BytecodeReader::Impl::processUseLists(Operation *topLevelOp) { //===----------------------------------------------------------------------===// // IR Section +//===----------------------------------------------------------------------===// LogicalResult 
BytecodeReader::Impl::parseIRSection(ArrayRef sectionData, @@ -2460,6 +2464,7 @@ LogicalResult BytecodeReader::Impl::parseBlockArguments(EncodingReader &reader, //===----------------------------------------------------------------------===// // Value Processing +//===----------------------------------------------------------------------===// Value BytecodeReader::Impl::parseOperand(EncodingReader &reader) { std::vector &values = valueScopes.back().values; diff --git a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp index cc5aaed416512..404d361a50c1f 100644 --- a/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp +++ b/mlir/lib/Bytecode/Writer/BytecodeWriter.cpp @@ -772,6 +772,7 @@ LogicalResult BytecodeWriter::write(Operation *rootOp, raw_ostream &os) { //===----------------------------------------------------------------------===// // Dialects +//===----------------------------------------------------------------------===// /// Write the given entries in contiguous groups with the same parent dialect. /// Each dialect sub-group is encoded with the parent dialect and number of @@ -855,6 +856,7 @@ void BytecodeWriter::writeDialectSection(EncodingEmitter &emitter) { //===----------------------------------------------------------------------===// // Attributes and Types +//===----------------------------------------------------------------------===// void BytecodeWriter::writeAttrTypeSection(EncodingEmitter &emitter) { EncodingEmitter attrTypeEmitter; @@ -936,6 +938,7 @@ void BytecodeWriter::writeAttrTypeSection(EncodingEmitter &emitter) { //===----------------------------------------------------------------------===// // Operations +//===----------------------------------------------------------------------===// LogicalResult BytecodeWriter::writeBlock(EncodingEmitter &emitter, Block *block) { @@ -1215,6 +1218,7 @@ LogicalResult BytecodeWriter::writeIRSection(EncodingEmitter &emitter, //===----------------------------------------------------------------------===// // Resources +//===----------------------------------------------------------------------===// namespace { /// This class represents a resource builder implementation for the MLIR @@ -1327,6 +1331,7 @@ void BytecodeWriter::writeResourceSection(Operation *op, //===----------------------------------------------------------------------===// // Strings +//===----------------------------------------------------------------------===// void BytecodeWriter::writeStringSection(EncodingEmitter &emitter) { EncodingEmitter stringEmitter; @@ -1336,6 +1341,7 @@ void BytecodeWriter::writeStringSection(EncodingEmitter &emitter) { //===----------------------------------------------------------------------===// // Properties +//===----------------------------------------------------------------------===// void BytecodeWriter::writePropertiesSection(EncodingEmitter &emitter) { EncodingEmitter propertiesEmitter; diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index c4717ca613319..a4df97f7beace 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -29,6 +29,7 @@ MlirContext mlirRewriterBaseGetContext(MlirRewriterBase rewriter) { //===----------------------------------------------------------------------===// /// Insertion points methods +//===----------------------------------------------------------------------===// void mlirRewriterBaseClearInsertionPoint(MlirRewriterBase rewriter) { unwrap(rewriter)->clearInsertionPoint(); @@ -69,6 
+70,7 @@ MlirBlock mlirRewriterBaseGetBlock(MlirRewriterBase rewriter) { //===----------------------------------------------------------------------===// /// Block and operation creation/insertion/cloning +//===----------------------------------------------------------------------===// MlirBlock mlirRewriterBaseCreateBlockBefore(MlirRewriterBase rewriter, MlirBlock insertBefore, diff --git a/mlir/lib/Conversion/PDLToPDLInterp/Predicate.cpp b/mlir/lib/Conversion/PDLToPDLInterp/Predicate.cpp index a12f3171e7afa..92524723754af 100644 --- a/mlir/lib/Conversion/PDLToPDLInterp/Predicate.cpp +++ b/mlir/lib/Conversion/PDLToPDLInterp/Predicate.cpp @@ -26,6 +26,7 @@ unsigned Position::getOperationDepth() const { //===----------------------------------------------------------------------===// // AttributePosition +//===----------------------------------------------------------------------===// AttributePosition::AttributePosition(const KeyTy &key) : Base(key) { parent = key.first; @@ -33,6 +34,7 @@ AttributePosition::AttributePosition(const KeyTy &key) : Base(key) { //===----------------------------------------------------------------------===// // OperandPosition +//===----------------------------------------------------------------------===// OperandPosition::OperandPosition(const KeyTy &key) : Base(key) { parent = key.first; @@ -40,6 +42,7 @@ OperandPosition::OperandPosition(const KeyTy &key) : Base(key) { //===----------------------------------------------------------------------===// // OperandGroupPosition +//===----------------------------------------------------------------------===// OperandGroupPosition::OperandGroupPosition(const KeyTy &key) : Base(key) { parent = std::get<0>(key); @@ -47,6 +50,7 @@ OperandGroupPosition::OperandGroupPosition(const KeyTy &key) : Base(key) { //===----------------------------------------------------------------------===// // OperationPosition +//===----------------------------------------------------------------------===// bool OperationPosition::isOperandDefiningOp() const { return isa_and_nonnull(parent); diff --git a/mlir/lib/Conversion/PDLToPDLInterp/Predicate.h b/mlir/lib/Conversion/PDLToPDLInterp/Predicate.h index 5ad2c477573a5..12a752d4e7dbf 100644 --- a/mlir/lib/Conversion/PDLToPDLInterp/Predicate.h +++ b/mlir/lib/Conversion/PDLToPDLInterp/Predicate.h @@ -166,6 +166,7 @@ class Position : public StorageUniquer::BaseStorage { //===----------------------------------------------------------------------===// // AttributePosition +//===----------------------------------------------------------------------===// /// A position describing an attribute of an operation. struct AttributePosition @@ -180,6 +181,7 @@ struct AttributePosition //===----------------------------------------------------------------------===// // AttributeLiteralPosition +//===----------------------------------------------------------------------===// /// A position describing a literal attribute. struct AttributeLiteralPosition @@ -190,6 +192,7 @@ struct AttributeLiteralPosition //===----------------------------------------------------------------------===// // ForEachPosition +//===----------------------------------------------------------------------===// /// A position describing an iterative choice of an operation. 
struct ForEachPosition : public PredicateBase emitError, //===----------------------------------------------------------------------===// // DataLayoutTypeInterface +//===----------------------------------------------------------------------===// llvm::TypeSize LLVMArrayType::getTypeSizeInBits(const DataLayout &dataLayout, @@ -255,6 +256,7 @@ LLVMFunctionType::verify(function_ref emitError, //===----------------------------------------------------------------------===// // DataLayoutTypeInterface +//===----------------------------------------------------------------------===// constexpr const static uint64_t kDefaultPointerSizeBits = 64; constexpr const static uint64_t kDefaultPointerAlignment = 8; diff --git a/mlir/lib/IR/BuiltinAttributes.cpp b/mlir/lib/IR/BuiltinAttributes.cpp index 112e3f376bd41..daf79dc5de981 100644 --- a/mlir/lib/IR/BuiltinAttributes.cpp +++ b/mlir/lib/IR/BuiltinAttributes.cpp @@ -600,6 +600,7 @@ static bool hasSameElementsOrSplat(ShapedType type, const Values &values) { //===----------------------------------------------------------------------===// // AttributeElementIterator +//===----------------------------------------------------------------------===// DenseElementsAttr::AttributeElementIterator::AttributeElementIterator( DenseElementsAttr attr, size_t index) @@ -647,6 +648,7 @@ Attribute DenseElementsAttr::AttributeElementIterator::operator*() const { //===----------------------------------------------------------------------===// // BoolElementIterator +//===----------------------------------------------------------------------===// DenseElementsAttr::BoolElementIterator::BoolElementIterator( DenseElementsAttr attr, size_t dataIndex) @@ -659,6 +661,7 @@ bool DenseElementsAttr::BoolElementIterator::operator*() const { //===----------------------------------------------------------------------===// // IntElementIterator +//===----------------------------------------------------------------------===// DenseElementsAttr::IntElementIterator::IntElementIterator( DenseElementsAttr attr, size_t dataIndex) @@ -674,6 +677,7 @@ APInt DenseElementsAttr::IntElementIterator::operator*() const { //===----------------------------------------------------------------------===// // ComplexIntElementIterator +//===----------------------------------------------------------------------===// DenseElementsAttr::ComplexIntElementIterator::ComplexIntElementIterator( DenseElementsAttr attr, size_t dataIndex) @@ -1552,6 +1556,7 @@ ArrayRef DenseResourceElementsAttr::getData() { //===----------------------------------------------------------------------===// // DenseResourceElementsAttrBase +//===----------------------------------------------------------------------===// namespace { /// Instantiations of this class provide utilities for interacting with native diff --git a/mlir/lib/IR/BuiltinDialectBytecode.cpp b/mlir/lib/IR/BuiltinDialectBytecode.cpp index 6095c6bcb2ce3..31aff47ceaa67 100644 --- a/mlir/lib/IR/BuiltinDialectBytecode.cpp +++ b/mlir/lib/IR/BuiltinDialectBytecode.cpp @@ -29,6 +29,7 @@ namespace { //===----------------------------------------------------------------------===// // Utility functions +//===----------------------------------------------------------------------===// // TODO: Move these to separate file. 
diff --git a/mlir/lib/IR/OperationSupport.cpp b/mlir/lib/IR/OperationSupport.cpp index 1b2cda19de1e8..7c9e6c89d4d8e 100644 --- a/mlir/lib/IR/OperationSupport.cpp +++ b/mlir/lib/IR/OperationSupport.cpp @@ -383,6 +383,7 @@ MutableArrayRef detail::OperandStorage::resize(Operation *owner, //===----------------------------------------------------------------------===// // OperandRange +//===----------------------------------------------------------------------===// unsigned OperandRange::getBeginOperandIndex() const { assert(!empty() && "range must not be empty"); @@ -395,6 +396,7 @@ OperandRangeRange OperandRange::split(DenseI32ArrayAttr segmentSizes) const { //===----------------------------------------------------------------------===// // OperandRangeRange +//===----------------------------------------------------------------------===// OperandRangeRange::OperandRangeRange(OperandRange operands, Attribute operandSegments) @@ -419,6 +421,7 @@ OperandRange OperandRangeRange::dereference(const OwnerT &object, //===----------------------------------------------------------------------===// // MutableOperandRange +//===----------------------------------------------------------------------===// /// Construct a new mutable range from the given operand, operand start index, /// and range length. @@ -542,6 +545,7 @@ MutableArrayRef::iterator MutableOperandRange::end() const { //===----------------------------------------------------------------------===// // MutableOperandRangeRange +//===----------------------------------------------------------------------===// MutableOperandRangeRange::MutableOperandRangeRange( const MutableOperandRange &operands, NamedAttribute operandSegmentAttr) @@ -571,6 +575,7 @@ MutableOperandRange MutableOperandRangeRange::dereference(const OwnerT &object, //===----------------------------------------------------------------------===// // ResultRange +//===----------------------------------------------------------------------===// ResultRange::ResultRange(OpResult result) : ResultRange(static_cast(Value(result).getImpl()), @@ -637,6 +642,7 @@ void ResultRange::replaceUsesWithIf( //===----------------------------------------------------------------------===// // ValueRange +//===----------------------------------------------------------------------===// ValueRange::ValueRange(ArrayRef values) : ValueRange(values.data(), values.size()) {} diff --git a/mlir/lib/IR/PDL/PDLPatternMatch.cpp b/mlir/lib/IR/PDL/PDLPatternMatch.cpp index da07cc462a5a1..0d05153bf7dab 100644 --- a/mlir/lib/IR/PDL/PDLPatternMatch.cpp +++ b/mlir/lib/IR/PDL/PDLPatternMatch.cpp @@ -113,6 +113,7 @@ void PDLPatternModule::attachConfigToPatterns(ModuleOp module, //===----------------------------------------------------------------------===// // Function Registry +//===----------------------------------------------------------------------===// void PDLPatternModule::registerConstraintFunction( StringRef name, PDLConstraintFunction constraintFn) { diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index 3e3c06bebf142..1e6084822a99a 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -35,6 +35,7 @@ unsigned short PatternBenefit::getBenefit() const { //===----------------------------------------------------------------------===// // OperationName Root Constructors +//===----------------------------------------------------------------------===// Pattern::Pattern(StringRef rootName, PatternBenefit benefit, MLIRContext *context, ArrayRef generatedNames) @@ -43,6 
+44,7 @@ Pattern::Pattern(StringRef rootName, PatternBenefit benefit, //===----------------------------------------------------------------------===// // MatchAnyOpTypeTag Root Constructors +//===----------------------------------------------------------------------===// Pattern::Pattern(MatchAnyOpTypeTag tag, PatternBenefit benefit, MLIRContext *context, ArrayRef generatedNames) @@ -50,6 +52,7 @@ Pattern::Pattern(MatchAnyOpTypeTag tag, PatternBenefit benefit, //===----------------------------------------------------------------------===// // MatchInterfaceOpTypeTag Root Constructors +//===----------------------------------------------------------------------===// Pattern::Pattern(MatchInterfaceOpTypeTag tag, TypeID interfaceID, PatternBenefit benefit, MLIRContext *context, @@ -59,6 +62,7 @@ Pattern::Pattern(MatchInterfaceOpTypeTag tag, TypeID interfaceID, //===----------------------------------------------------------------------===// // MatchTraitOpTypeTag Root Constructors +//===----------------------------------------------------------------------===// Pattern::Pattern(MatchTraitOpTypeTag tag, TypeID traitID, PatternBenefit benefit, MLIRContext *context, @@ -68,6 +72,7 @@ Pattern::Pattern(MatchTraitOpTypeTag tag, TypeID traitID, //===----------------------------------------------------------------------===// // General Constructors +//===----------------------------------------------------------------------===// Pattern::Pattern(const void *rootValue, RootKind rootKind, ArrayRef generatedNames, PatternBenefit benefit, diff --git a/mlir/lib/IR/SymbolTable.cpp b/mlir/lib/IR/SymbolTable.cpp index 71adfc467611b..71422191c1455 100644 --- a/mlir/lib/IR/SymbolTable.cpp +++ b/mlir/lib/IR/SymbolTable.cpp @@ -759,6 +759,7 @@ static bool isReferencePrefixOf(SymbolRefAttr subRef, SymbolRefAttr ref) { //===----------------------------------------------------------------------===// // SymbolTable::getSymbolUses +//===----------------------------------------------------------------------===// /// The implementation of SymbolTable::getSymbolUses below. template @@ -789,6 +790,7 @@ auto SymbolTable::getSymbolUses(Region *from) -> std::optional { //===----------------------------------------------------------------------===// // SymbolTable::getSymbolUses +//===----------------------------------------------------------------------===// /// The implementation of SymbolTable::getSymbolUses below. template @@ -828,6 +830,7 @@ auto SymbolTable::getSymbolUses(Operation *symbol, Region *from) //===----------------------------------------------------------------------===// // SymbolTable::symbolKnownUseEmpty +//===----------------------------------------------------------------------===// /// The implementation of SymbolTable::symbolKnownUseEmpty below. template @@ -863,6 +866,7 @@ bool SymbolTable::symbolKnownUseEmpty(Operation *symbol, Region *from) { //===----------------------------------------------------------------------===// // SymbolTable::replaceAllSymbolUses +//===----------------------------------------------------------------------===// /// Generates a new symbol reference attribute with a new leaf reference. 
static SymbolRefAttr generateNewRefAttr(SymbolRefAttr oldAttr, diff --git a/mlir/lib/IR/TypeRange.cpp b/mlir/lib/IR/TypeRange.cpp index f8878303727d4..26ffc0d5485de 100644 --- a/mlir/lib/IR/TypeRange.cpp +++ b/mlir/lib/IR/TypeRange.cpp @@ -13,6 +13,7 @@ using namespace mlir; //===----------------------------------------------------------------------===// // TypeRange +//===----------------------------------------------------------------------===// TypeRange::TypeRange(ArrayRef types) : TypeRange(types.data(), types.size()) { diff --git a/mlir/lib/IR/ValueRange.cpp b/mlir/lib/IR/ValueRange.cpp index bd2e10098e61d..a76c0c1f83cf1 100644 --- a/mlir/lib/IR/ValueRange.cpp +++ b/mlir/lib/IR/ValueRange.cpp @@ -13,6 +13,7 @@ using namespace mlir; //===----------------------------------------------------------------------===// // TypeRangeRange +//===----------------------------------------------------------------------===// TypeRangeRange OperandRangeRange::getTypes() const { return TypeRangeRange(*this); @@ -22,6 +23,7 @@ TypeRangeRange OperandRangeRange::getType() const { return getTypes(); } //===----------------------------------------------------------------------===// // OperandRange +//===----------------------------------------------------------------------===// OperandRange::type_range OperandRange::getTypes() const { return {begin(), end()}; @@ -31,6 +33,7 @@ OperandRange::type_range OperandRange::getType() const { return getTypes(); } //===----------------------------------------------------------------------===// // ResultRange +//===----------------------------------------------------------------------===// ResultRange::type_range ResultRange::getTypes() const { return {begin(), end()}; @@ -40,6 +43,7 @@ ResultRange::type_range ResultRange::getType() const { return getTypes(); } //===----------------------------------------------------------------------===// // ValueRange +//===----------------------------------------------------------------------===// ValueRange::type_range ValueRange::getTypes() const { return {begin(), end()}; } diff --git a/mlir/lib/Pass/PassRegistry.cpp b/mlir/lib/Pass/PassRegistry.cpp index ece2fdaed0dfd..7c294f08a32bb 100644 --- a/mlir/lib/Pass/PassRegistry.cpp +++ b/mlir/lib/Pass/PassRegistry.cpp @@ -418,6 +418,7 @@ size_t detail::PassOptions::getOptionWidth() const { //===----------------------------------------------------------------------===// // OpPassManager: OptionValue +//===----------------------------------------------------------------------===// llvm::cl::OptionValue::OptionValue() = default; llvm::cl::OptionValue::OptionValue( @@ -470,6 +471,7 @@ void llvm::cl::OptionValue::anchor() {} //===----------------------------------------------------------------------===// // OpPassManager: Parser +//===----------------------------------------------------------------------===// namespace llvm { namespace cl { @@ -1028,6 +1030,7 @@ LogicalResult PassPipelineCLParser::addToPipeline( //===----------------------------------------------------------------------===// // PassNameCLParser +//===----------------------------------------------------------------------===// /// Construct a pass pipeline parser with the given command line description. 
PassNameCLParser::PassNameCLParser(StringRef arg, StringRef description) diff --git a/mlir/lib/Rewrite/ByteCode.cpp b/mlir/lib/Rewrite/ByteCode.cpp index d2d9ec2787a28..26510f976efc9 100644 --- a/mlir/lib/Rewrite/ByteCode.cpp +++ b/mlir/lib/Rewrite/ByteCode.cpp @@ -179,6 +179,7 @@ static constexpr ByteCodeField kInferTypesMarker = //===----------------------------------------------------------------------===// // Generator +//===----------------------------------------------------------------------===// namespace { struct ByteCodeLiveRange; @@ -1086,6 +1087,7 @@ void PDLByteCode::initializeMutableState(PDLByteCodeMutableState &state) const { //===----------------------------------------------------------------------===// // ByteCode Execution +//===----------------------------------------------------------------------===// namespace { /// This class is an instantiation of the PDLResultList that provides access to diff --git a/mlir/lib/TableGen/CodeGenHelpers.cpp b/mlir/lib/TableGen/CodeGenHelpers.cpp index 747af1ce5a4d3..0a07071fea217 100644 --- a/mlir/lib/TableGen/CodeGenHelpers.cpp +++ b/mlir/lib/TableGen/CodeGenHelpers.cpp @@ -65,6 +65,7 @@ void StaticVerifierFunctionEmitter::emitPatternConstraints( //===----------------------------------------------------------------------===// // Constraint Getters +//===----------------------------------------------------------------------===// StringRef StaticVerifierFunctionEmitter::getTypeConstraintFn( const Constraint &constraint) const { @@ -100,6 +101,7 @@ StringRef StaticVerifierFunctionEmitter::getRegionConstraintFn( //===----------------------------------------------------------------------===// // Constraint Emission +//===----------------------------------------------------------------------===// /// Code templates for emitting type, attribute, successor, and region /// constraints. Each of these templates require the following arguments: @@ -234,6 +236,7 @@ void StaticVerifierFunctionEmitter::emitPatternConstraints() { //===----------------------------------------------------------------------===// // Constraint Uniquing +//===----------------------------------------------------------------------===// /// An attribute constraint that references anything other than itself and the /// current op cannot be generically extracted into a function. 
Most diff --git a/mlir/lib/Tools/PDLL/AST/Nodes.cpp b/mlir/lib/Tools/PDLL/AST/Nodes.cpp index 654ff24454cb1..ee2fe0fb9e3c3 100644 --- a/mlir/lib/Tools/PDLL/AST/Nodes.cpp +++ b/mlir/lib/Tools/PDLL/AST/Nodes.cpp @@ -214,6 +214,7 @@ LetStmt *LetStmt::create(Context &ctx, SMRange loc, VariableDecl *varDecl) { //===----------------------------------------------------------------------===// // EraseStmt +//===----------------------------------------------------------------------===// EraseStmt *EraseStmt::create(Context &ctx, SMRange loc, Expr *rootOp) { return new (ctx.getAllocator().Allocate()) EraseStmt(loc, rootOp); @@ -221,6 +222,7 @@ EraseStmt *EraseStmt::create(Context &ctx, SMRange loc, Expr *rootOp) { //===----------------------------------------------------------------------===// // ReplaceStmt +//===----------------------------------------------------------------------===// ReplaceStmt *ReplaceStmt::create(Context &ctx, SMRange loc, Expr *rootOp, ArrayRef replExprs) { @@ -235,6 +237,7 @@ ReplaceStmt *ReplaceStmt::create(Context &ctx, SMRange loc, Expr *rootOp, //===----------------------------------------------------------------------===// // RewriteStmt +//===----------------------------------------------------------------------===// RewriteStmt *RewriteStmt::create(Context &ctx, SMRange loc, Expr *rootOp, CompoundStmt *rewriteBody) { diff --git a/mlir/lib/Tools/PDLL/AST/Types.cpp b/mlir/lib/Tools/PDLL/AST/Types.cpp index 081f85d69a2f6..1468ac2a280d5 100644 --- a/mlir/lib/Tools/PDLL/AST/Types.cpp +++ b/mlir/lib/Tools/PDLL/AST/Types.cpp @@ -103,6 +103,7 @@ Type RangeType::getElementType() const { //===----------------------------------------------------------------------===// // TypeRangeType +//===----------------------------------------------------------------------===// bool TypeRangeType::classof(Type type) { RangeType range = mlir::dyn_cast(type); @@ -116,6 +117,7 @@ TypeRangeType TypeRangeType::get(Context &context) { //===----------------------------------------------------------------------===// // ValueRangeType +//===----------------------------------------------------------------------===// bool ValueRangeType::classof(Type type) { RangeType range = mlir::dyn_cast(type); diff --git a/mlir/lib/Tools/PDLL/Parser/Parser.cpp b/mlir/lib/Tools/PDLL/Parser/Parser.cpp index 575fb4aacd947..c0e2252bdebc5 100644 --- a/mlir/lib/Tools/PDLL/Parser/Parser.cpp +++ b/mlir/lib/Tools/PDLL/Parser/Parser.cpp @@ -762,6 +762,7 @@ LogicalResult Parser::convertTupleExpressionTo( //===----------------------------------------------------------------------===// // Directives +//===----------------------------------------------------------------------===// LogicalResult Parser::parseDirective(SmallVectorImpl &decls) { StringRef directive = curToken.getSpelling(); @@ -1021,6 +1022,7 @@ Parser::createODSNativePDLLConstraintDecl(const tblgen::Constraint &constraint, //===----------------------------------------------------------------------===// // Decls +//===----------------------------------------------------------------------===// FailureOr Parser::parseTopLevelDecl() { FailureOr decl; @@ -1786,6 +1788,7 @@ FailureOr Parser::parseArgOrResultConstraint() { //===----------------------------------------------------------------------===// // Exprs +//===----------------------------------------------------------------------===// FailureOr Parser::parseExpr() { if (curToken.is(Token::underscore)) @@ -2249,6 +2252,7 @@ FailureOr Parser::parseUnderscoreExpr() { 
//===----------------------------------------------------------------------===// // Stmts +//===----------------------------------------------------------------------===// FailureOr Parser::parseStmt(bool expectTerminalSemicolon) { FailureOr stmt; @@ -2482,6 +2486,7 @@ FailureOr Parser::parseRewriteStmt() { //===----------------------------------------------------------------------===// // Decls +//===----------------------------------------------------------------------===// ast::CallableDecl *Parser::tryExtractCallableDecl(ast::Node *node) { // Unwrap reference expressions. @@ -2681,6 +2686,7 @@ Parser::validateTypeRangeConstraintExpr(const ast::Expr *typeExpr) { //===----------------------------------------------------------------------===// // Exprs +//===----------------------------------------------------------------------===// FailureOr Parser::createCallExpr(SMRange loc, ast::Expr *parentExpr, @@ -3057,6 +3063,7 @@ Parser::createTupleExpr(SMRange loc, ArrayRef elements, //===----------------------------------------------------------------------===// // Stmts +//===----------------------------------------------------------------------===// FailureOr Parser::createEraseStmt(SMRange loc, ast::Expr *rootOp) { diff --git a/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp b/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp index 0f23366f6fe80..250412359d753 100644 --- a/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp +++ b/mlir/lib/Tools/mlir-lsp-server/LSPServer.cpp @@ -101,6 +101,7 @@ struct LSPServer { //===----------------------------------------------------------------------===// // Initialization +//===----------------------------------------------------------------------===// void LSPServer::onInitialize(const InitializeParams &params, Callback reply) { @@ -160,6 +161,7 @@ void LSPServer::onShutdown(const NoParams &, Callback reply) { //===----------------------------------------------------------------------===// // Document Change +//===----------------------------------------------------------------------===// void LSPServer::onDocumentDidOpen(const DidOpenTextDocumentParams &params) { PublishDiagnosticsParams diagParams(params.textDocument.uri, @@ -200,6 +202,7 @@ void LSPServer::onDocumentDidChange(const DidChangeTextDocumentParams &params) { //===----------------------------------------------------------------------===// // Definitions and References +//===----------------------------------------------------------------------===// void LSPServer::onGoToDefinition(const TextDocumentPositionParams &params, Callback> reply) { @@ -217,6 +220,7 @@ void LSPServer::onReference(const ReferenceParams &params, //===----------------------------------------------------------------------===// // Hover +//===----------------------------------------------------------------------===// void LSPServer::onHover(const TextDocumentPositionParams &params, Callback> reply) { @@ -225,6 +229,7 @@ void LSPServer::onHover(const TextDocumentPositionParams &params, //===----------------------------------------------------------------------===// // Document Symbols +//===----------------------------------------------------------------------===// void LSPServer::onDocumentSymbol(const DocumentSymbolParams &params, Callback> reply) { @@ -235,6 +240,7 @@ void LSPServer::onDocumentSymbol(const DocumentSymbolParams &params, //===----------------------------------------------------------------------===// // Code Completion +//===----------------------------------------------------------------------===// void LSPServer::onCompletion(const CompletionParams 
¶ms, Callback reply) { @@ -243,6 +249,7 @@ void LSPServer::onCompletion(const CompletionParams ¶ms, //===----------------------------------------------------------------------===// // Code Action +//===----------------------------------------------------------------------===// void LSPServer::onCodeAction(const CodeActionParams ¶ms, Callback reply) { @@ -267,6 +274,7 @@ void LSPServer::onCodeAction(const CodeActionParams ¶ms, //===----------------------------------------------------------------------===// // Bytecode +//===----------------------------------------------------------------------===// void LSPServer::onConvertFromBytecode( const MLIRConvertBytecodeParams ¶ms, diff --git a/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp b/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp index f02372367e38c..c15f17025e3c1 100644 --- a/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp +++ b/mlir/lib/Tools/mlir-pdll-lsp-server/LSPServer.cpp @@ -114,6 +114,7 @@ struct LSPServer { //===----------------------------------------------------------------------===// // Initialization +//===----------------------------------------------------------------------===// void LSPServer::onInitialize(const InitializeParams ¶ms, Callback reply) { @@ -164,6 +165,7 @@ void LSPServer::onShutdown(const NoParams &, Callback reply) { //===----------------------------------------------------------------------===// // Document Change +//===----------------------------------------------------------------------===// void LSPServer::onDocumentDidOpen(const DidOpenTextDocumentParams ¶ms) { PublishDiagnosticsParams diagParams(params.textDocument.uri, @@ -198,6 +200,7 @@ void LSPServer::onDocumentDidChange(const DidChangeTextDocumentParams ¶ms) { //===----------------------------------------------------------------------===// // Definitions and References +//===----------------------------------------------------------------------===// void LSPServer::onGoToDefinition(const TextDocumentPositionParams ¶ms, Callback> reply) { @@ -215,6 +218,7 @@ void LSPServer::onReference(const ReferenceParams ¶ms, //===----------------------------------------------------------------------===// // DocumentLink +//===----------------------------------------------------------------------===// void LSPServer::onDocumentLink(const DocumentLinkParams ¶ms, Callback> reply) { @@ -225,6 +229,7 @@ void LSPServer::onDocumentLink(const DocumentLinkParams ¶ms, //===----------------------------------------------------------------------===// // Hover +//===----------------------------------------------------------------------===// void LSPServer::onHover(const TextDocumentPositionParams ¶ms, Callback> reply) { @@ -233,6 +238,7 @@ void LSPServer::onHover(const TextDocumentPositionParams ¶ms, //===----------------------------------------------------------------------===// // Document Symbols +//===----------------------------------------------------------------------===// void LSPServer::onDocumentSymbol(const DocumentSymbolParams ¶ms, Callback> reply) { @@ -243,6 +249,7 @@ void LSPServer::onDocumentSymbol(const DocumentSymbolParams ¶ms, //===----------------------------------------------------------------------===// // Code Completion +//===----------------------------------------------------------------------===// void LSPServer::onCompletion(const CompletionParams ¶ms, Callback reply) { @@ -251,6 +258,7 @@ void LSPServer::onCompletion(const CompletionParams ¶ms, //===----------------------------------------------------------------------===// // Signature 
Help +//===----------------------------------------------------------------------===// void LSPServer::onSignatureHelp(const TextDocumentPositionParams ¶ms, Callback reply) { @@ -259,6 +267,7 @@ void LSPServer::onSignatureHelp(const TextDocumentPositionParams ¶ms, //===----------------------------------------------------------------------===// // Inlay Hints +//===----------------------------------------------------------------------===// void LSPServer::onInlayHint(const InlayHintsParams ¶ms, Callback> reply) { @@ -269,6 +278,7 @@ void LSPServer::onInlayHint(const InlayHintsParams ¶ms, //===----------------------------------------------------------------------===// // PDLL ViewOutput +//===----------------------------------------------------------------------===// void LSPServer::onPDLLViewOutput( const PDLLViewOutputParams ¶ms, diff --git a/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp b/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp index b62f68db9d60f..993eea376bc26 100644 --- a/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp +++ b/mlir/lib/Tools/tblgen-lsp-server/LSPServer.cpp @@ -82,6 +82,7 @@ struct LSPServer { //===----------------------------------------------------------------------===// // Initialization +//===----------------------------------------------------------------------===// void LSPServer::onInitialize(const InitializeParams ¶ms, Callback reply) { @@ -116,6 +117,7 @@ void LSPServer::onShutdown(const NoParams &, Callback reply) { //===----------------------------------------------------------------------===// // Document Change +//===----------------------------------------------------------------------===// void LSPServer::onDocumentDidOpen(const DidOpenTextDocumentParams ¶ms) { PublishDiagnosticsParams diagParams(params.textDocument.uri, @@ -150,6 +152,7 @@ void LSPServer::onDocumentDidChange(const DidChangeTextDocumentParams ¶ms) { //===----------------------------------------------------------------------===// // Definitions and References +//===----------------------------------------------------------------------===// void LSPServer::onGoToDefinition(const TextDocumentPositionParams ¶ms, Callback> reply) { @@ -167,6 +170,7 @@ void LSPServer::onReference(const ReferenceParams ¶ms, //===----------------------------------------------------------------------===// // DocumentLink +//===----------------------------------------------------------------------===// void LSPServer::onDocumentLink(const DocumentLinkParams ¶ms, Callback> reply) { @@ -177,6 +181,7 @@ void LSPServer::onDocumentLink(const DocumentLinkParams ¶ms, //===----------------------------------------------------------------------===// // Hover +//===----------------------------------------------------------------------===// void LSPServer::onHover(const TextDocumentPositionParams ¶ms, Callback> reply) { diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index b9475a7cc95a8..444c505b64232 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1198,6 +1198,7 @@ void ConversionPatternRewriterImpl::applyRewrites() { //===----------------------------------------------------------------------===// // State Management +//===----------------------------------------------------------------------===// RewriterState ConversionPatternRewriterImpl::getCurrentState() { return RewriterState(rewrites.size(), ignoredOps.size(), replacedOps.size()); @@ -1288,6 +1289,7 @@ bool 
ConversionPatternRewriterImpl::wasOpReplaced(Operation *op) const { //===----------------------------------------------------------------------===// // Type Conversion +//===----------------------------------------------------------------------===// FailureOr ConversionPatternRewriterImpl::convertRegionTypes( ConversionPatternRewriter &rewriter, Region *region, @@ -1502,6 +1504,7 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( //===----------------------------------------------------------------------===// // Rewriter Notification Hooks +//===----------------------------------------------------------------------===// void ConversionPatternRewriterImpl::notifyOperationInserted( Operation *op, OpBuilder::InsertPoint previous) { @@ -2336,6 +2339,7 @@ LogicalResult OperationLegalizer::legalizePatternRootUpdates( //===----------------------------------------------------------------------===// // Cost Model +//===----------------------------------------------------------------------===// void OperationLegalizer::buildLegalizationGraph( LegalizationPatterns &anyOpLegalizerPatterns, @@ -3355,6 +3359,7 @@ void mlir::registerConversionPDLFunctions(RewritePatternSet &patterns) { //===----------------------------------------------------------------------===// // Partial Conversion +//===----------------------------------------------------------------------===// LogicalResult mlir::applyPartialConversion( ArrayRef ops, const ConversionTarget &target, @@ -3372,6 +3377,7 @@ mlir::applyPartialConversion(Operation *op, const ConversionTarget &target, //===----------------------------------------------------------------------===// // Full Conversion +//===----------------------------------------------------------------------===// LogicalResult mlir::applyFullConversion(ArrayRef ops, const ConversionTarget &target, @@ -3390,6 +3396,7 @@ LogicalResult mlir::applyFullConversion(Operation *op, //===----------------------------------------------------------------------===// // Analysis Conversion +//===----------------------------------------------------------------------===// /// Find a common IsolatedFromAbove ancestor of the given ops. 
If at least one /// op is a top-level module op (which is expected to be isolated from above), diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp index 756f5e379e7dd..f511504594cfa 100644 --- a/mlir/lib/Transforms/Utils/Inliner.cpp +++ b/mlir/lib/Transforms/Utils/Inliner.cpp @@ -65,6 +65,7 @@ static void walkReferencedSymbolNodes( //===----------------------------------------------------------------------===// // CGUseList +//===----------------------------------------------------------------------===// namespace { /// This struct tracks the uses of callgraph nodes that can be dropped when diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp index 18e079d153161..fc9492efa5805 100644 --- a/mlir/lib/Transforms/Utils/RegionUtils.cpp +++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp @@ -489,6 +489,7 @@ LogicalResult mlir::runRegionDCE(RewriterBase &rewriter, //===----------------------------------------------------------------------===// // BlockEquivalenceData +//===----------------------------------------------------------------------===// namespace { /// This class contains the information for comparing the equivalencies of two @@ -557,6 +558,7 @@ unsigned BlockEquivalenceData::getOrderOf(Value value) const { //===----------------------------------------------------------------------===// // BlockMergeCluster +//===----------------------------------------------------------------------===// namespace { /// This class represents a cluster of blocks to be merged together. diff --git a/mlir/test/lib/Dialect/Test/TestEnumDefs.td b/mlir/test/lib/Dialect/Test/TestEnumDefs.td index 7441ea5a9726b..5b785a600aad2 100644 --- a/mlir/test/lib/Dialect/Test/TestEnumDefs.td +++ b/mlir/test/lib/Dialect/Test/TestEnumDefs.td @@ -77,6 +77,7 @@ def TestBitEnumVerticalBar //===----------------------------------------------------------------------===// // Test Patterns (Multi-result Ops) +//===----------------------------------------------------------------------===// def MultiResultOpKind1: I64EnumAttrCase<"kind1", 1>; def MultiResultOpKind2: I64EnumAttrCase<"kind2", 2>; diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp index f6b8a0005f285..454a12bac9ab3 100644 --- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp +++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp @@ -730,6 +730,7 @@ LogicalResult TestVerifiersOp::verifyRegions() { //===----------------------------------------------------------------------===// // TestWithBoundsOp +//===----------------------------------------------------------------------===// void TestWithBoundsOp::inferResultRanges(ArrayRef argRanges, SetIntRangeFn setResultRanges) { @@ -738,6 +739,7 @@ void TestWithBoundsOp::inferResultRanges(ArrayRef argRanges, //===----------------------------------------------------------------------===// // TestWithBoundsRegionOp +//===----------------------------------------------------------------------===// ParseResult TestWithBoundsRegionOp::parse(OpAsmParser &parser, OperationState &result) { @@ -771,6 +773,7 @@ void TestWithBoundsRegionOp::inferResultRanges( //===----------------------------------------------------------------------===// // TestIncrementOp +//===----------------------------------------------------------------------===// void TestIncrementOp::inferResultRanges(ArrayRef argRanges, SetIntRangeFn setResultRanges) { @@ -783,6 +786,7 @@ void TestIncrementOp::inferResultRanges(ArrayRef argRanges, 
//===----------------------------------------------------------------------===// // TestReflectBoundsOp +//===----------------------------------------------------------------------===// void TestReflectBoundsOp::inferResultRanges( ArrayRef argRanges, SetIntRangeFn setResultRanges) { @@ -1124,6 +1128,7 @@ void ReadBufferOp::getEffects( //===----------------------------------------------------------------------===// // TestCallAndStoreOp +//===----------------------------------------------------------------------===// CallInterfaceCallable TestCallAndStoreOp::getCallableForCallee() { return getCallee(); @@ -1143,6 +1148,7 @@ MutableOperandRange TestCallAndStoreOp::getArgOperandsMutable() { //===----------------------------------------------------------------------===// // TestCallOnDeviceOp +//===----------------------------------------------------------------------===// CallInterfaceCallable TestCallOnDeviceOp::getCallableForCallee() { return getCallee(); @@ -1162,6 +1168,7 @@ MutableOperandRange TestCallOnDeviceOp::getArgOperandsMutable() { //===----------------------------------------------------------------------===// // TestStoreWithARegion +//===----------------------------------------------------------------------===// void TestStoreWithARegion::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { @@ -1173,6 +1180,7 @@ void TestStoreWithARegion::getSuccessorRegions( //===----------------------------------------------------------------------===// // TestStoreWithALoopRegion +//===----------------------------------------------------------------------===// void TestStoreWithALoopRegion::getSuccessorRegions( RegionBranchPoint point, SmallVectorImpl ®ions) { diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index f653e4465cfef..d8024145e711f 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -1462,6 +1462,7 @@ def TestDialectCanonicalizerOp : TEST_Op<"dialect_canonicalizable"> { //===----------------------------------------------------------------------===// // Test Patterns (Symbol Binding) +//===----------------------------------------------------------------------===// // Test symbol binding. def OpSymbolBindingA : TEST_Op<"symbol_binding_a", []> { @@ -1503,6 +1504,7 @@ def : Pat<(OpSymbolBindingNoResult:$op $operand), //===----------------------------------------------------------------------===// // Test Patterns (Attributes) +//===----------------------------------------------------------------------===// // Test matching against op attributes. 
def OpAttrMatch1 : TEST_Op<"match_op_attribute1"> { @@ -1618,6 +1620,7 @@ def : Pattern< //===----------------------------------------------------------------------===// // Test Patterns (Variadic Ops) +//===----------------------------------------------------------------------===// def OneVResOneVOperandOp1 : TEST_Op<"one_variadic_out_one_variadic_in1"> { let arguments = (ins Variadic); @@ -1803,6 +1806,7 @@ def : Pat< //===----------------------------------------------------------------------===// // Test Patterns (either) +//===----------------------------------------------------------------------===// def TestEitherOpA : TEST_Op<"either_op_a"> { let arguments = (ins AnyInteger:$arg0, AnyInteger:$arg1, AnyInteger:$arg2); @@ -1845,6 +1849,7 @@ def : Pat<(TestEitherOpB (either (TestEitherHelperOpA I32:$either_helper_0), //===----------------------------------------------------------------------===// // Test Patterns (Location) +//===----------------------------------------------------------------------===// // Test that we can specify locations for generated ops. def : Pat<(TestLocationSrcOp:$res1 @@ -1863,6 +1868,7 @@ def : Pat<(TestLocationSrcNoResOp:$loc //===----------------------------------------------------------------------===// // Test Patterns (Type Builders) +//===----------------------------------------------------------------------===// def SourceOp : TEST_Op<"source_op"> { let arguments = (ins AnyInteger:$arg, AnyI32Attr:$tag); @@ -1913,6 +1919,7 @@ def : Pat<(SourceOp $val, ConstantAttr:$attr), //===----------------------------------------------------------------------===// // Test Patterns (Trailing Directives) +//===----------------------------------------------------------------------===// // Test that we can specify both `location` and `returnType` directives. def : Pat<(SourceOp $val, ConstantAttr:$attr), @@ -2079,6 +2086,7 @@ def ParseB64BytesOp : TEST_Op<"parse_b64"> { //===----------------------------------------------------------------------===// // Test region argument list parsing. +//===----------------------------------------------------------------------===// def IsolatedRegionOp : TEST_Op<"isolated_region", [IsolatedFromAbove]> { let summary = "isolated region operation"; @@ -2139,6 +2147,7 @@ def AffineScopeOp : TEST_Op<"affine_scope", [AffineScope]> { //===----------------------------------------------------------------------===// // Custom printer/parser +//===----------------------------------------------------------------------===// def CustomDimensionListAttrOp : TEST_Op<"custom_dimension_list_attr"> { let description = [{ @@ -2164,6 +2173,7 @@ def OptionalCustomAttrOp : TEST_Op<"optional_custom_attr"> { //===----------------------------------------------------------------------===// // Test OpAsmInterface. 
+//===----------------------------------------------------------------------===// def AsmInterfaceOp : TEST_Op<"asm_interface_op"> { let results = (outs AnyType:$first, Variadic:$middle_results, diff --git a/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp b/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp index 664951f2a11bb..7a9f07030215d 100644 --- a/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp +++ b/mlir/test/lib/Dialect/Test/TestOpsSyntax.cpp @@ -21,6 +21,7 @@ using namespace test; //===----------------------------------------------------------------------===// // Parsing +//===----------------------------------------------------------------------===// static ParseResult parseCustomOptionalOperand( OpAsmParser &parser, @@ -155,6 +156,7 @@ static ParseResult parseCustomDirectiveOptionalOperandRef( //===----------------------------------------------------------------------===// // Printing +//===----------------------------------------------------------------------===// static void printCustomOptionalOperand(OpAsmPrinter &printer, Operation *, Value optOperand) { @@ -291,6 +293,7 @@ ::llvm::LogicalResult FormatInferType2Op::inferReturnTypes( //===----------------------------------------------------------------------===// // Test WrapRegionOp - wrapping op exercising `parseGenericOperation()`. +//===----------------------------------------------------------------------===// ParseResult WrappingRegionOp::parse(OpAsmParser &parser, OperationState &result) { diff --git a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td index 9c199f0c3b6fc..d9003428e3746 100644 --- a/mlir/test/lib/Dialect/Test/TestOpsSyntax.td +++ b/mlir/test/lib/Dialect/Test/TestOpsSyntax.td @@ -475,6 +475,7 @@ def FormatQualifiedNestedType : TEST_Op<"format_qual_cpmd_nested_type"> { //===----------------------------------------------------------------------===// // Custom Directives +//===----------------------------------------------------------------------===// def FormatCustomDirectiveOperands : TEST_Op<"format_custom_directive_operands", [AttrSizedOperandSegments]> { @@ -599,6 +600,7 @@ def FormatLiteralFollowingOptionalGroup //===----------------------------------------------------------------------===// // AllTypesMatch type inference +//===----------------------------------------------------------------------===// def FormatAllTypesMatchVarOp : TEST_Op<"format_all_types_match_var", [ AllTypesMatch<["value1", "value2", "result"]> @@ -618,6 +620,7 @@ def FormatAllTypesMatchAttrOp : TEST_Op<"format_all_types_match_attr", [ //===----------------------------------------------------------------------===// // TypesMatchWith type inference +//===----------------------------------------------------------------------===// def FormatTypesMatchVarOp : TEST_Op<"format_types_match_var", [ TypesMatchWith<"result type matches operand", "value", "result", "$_self"> @@ -655,6 +658,7 @@ def FormatTypesMatchContextOp : TEST_Op<"format_types_match_context", [ //===----------------------------------------------------------------------===// // InferTypeOpInterface type inference in assembly format +//===----------------------------------------------------------------------===// def FormatInferTypeOp : TEST_Op<"format_infer_type", [InferTypeOpInterface]> { let results = (outs AnyType); diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index bfdcaf431eeff..db02a122872d9 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ 
b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -772,6 +772,7 @@ void TestDerivedAttributeDriver::runOnOperation() { namespace { //===----------------------------------------------------------------------===// // Region-Block Rewrite Testing +//===----------------------------------------------------------------------===// /// This pattern applies a signature conversion to a block inside a detached /// region. @@ -958,6 +959,7 @@ struct TestUndoPropertiesModification : public ConversionPattern { //===----------------------------------------------------------------------===// // Type-Conversion Rewrite Testing +//===----------------------------------------------------------------------===// /// This patterns erases a region operation that has had a type conversion. struct TestDropOpSignatureConversion : public ConversionPattern { diff --git a/mlir/test/mlir-tblgen/op-format-invalid.td b/mlir/test/mlir-tblgen/op-format-invalid.td index 3461f14fa5f01..2f29543f67381 100644 --- a/mlir/test/mlir-tblgen/op-format-invalid.td +++ b/mlir/test/mlir-tblgen/op-format-invalid.td @@ -19,6 +19,7 @@ class TestFormat_Op traits = []> //===----------------------------------------------------------------------===// // attr-dict +//===----------------------------------------------------------------------===// // CHECK: error: 'attr-dict' directive not found def DirectiveAttrDictInvalidA : TestFormat_Op<[{ @@ -38,6 +39,7 @@ def DirectiveAttrDictInvalidD : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // custom +//===----------------------------------------------------------------------===// // CHECK: error: expected '<' before custom directive name def DirectiveCustomInvalidA : TestFormat_Op<[{ @@ -70,6 +72,7 @@ def DirectiveCustomInvalidH : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // functional-type +//===----------------------------------------------------------------------===// // CHECK: error: 'functional-type' is only valid as a top-level directive def DirectiveFunctionalTypeInvalidA : TestFormat_Op<[{ @@ -98,6 +101,7 @@ def DirectiveFunctionalTypeInvalidF : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // operands +//===----------------------------------------------------------------------===// // CHECK: error: 'operands' directive creates overlap in format def DirectiveOperandsInvalidA : TestFormat_Op<[{ @@ -110,6 +114,7 @@ def DirectiveOperandsInvalidB : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // ref +//===----------------------------------------------------------------------===// // CHECK: error: 'ref' is only valid within a `custom` directive def DirectiveRefInvalidA : TestFormat_Op<[{ @@ -193,6 +198,7 @@ def DirectiveRefInvalidO : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // regions +//===----------------------------------------------------------------------===// // CHECK: error: 'regions' directive creates overlap in format def DirectiveRegionsInvalidA : TestFormat_Op<[{ @@ -218,6 +224,7 @@ def DirectiveRegionsInvalidD : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // results +//===----------------------------------------------------------------------===// // CHECK: error: 'results' directive can can only be used as a child to a 'type' directive def DirectiveResultsInvalidA : 
TestFormat_Op<[{ @@ -226,6 +233,7 @@ def DirectiveResultsInvalidA : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // successors +//===----------------------------------------------------------------------===// // CHECK: error: 'successors' is only valid as a top-level directive def DirectiveSuccessorsInvalidA : TestFormat_Op<[{ @@ -234,6 +242,7 @@ def DirectiveSuccessorsInvalidA : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // type +//===----------------------------------------------------------------------===// // CHECK: error: expected '(' before argument list def DirectiveTypeInvalidA : TestFormat_Op<[{ @@ -250,6 +259,7 @@ def DirectiveTypeInvalidC : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // functional-type/type operands +//===----------------------------------------------------------------------===// // CHECK: error: literals may only be used in the top-level section of the format def DirectiveTypeZOperandInvalidA : TestFormat_Op<[{ diff --git a/mlir/test/mlir-tblgen/op-format-spec.td b/mlir/test/mlir-tblgen/op-format-spec.td index ad2a055bc78b0..02bf65609b21a 100644 --- a/mlir/test/mlir-tblgen/op-format-spec.td +++ b/mlir/test/mlir-tblgen/op-format-spec.td @@ -19,6 +19,7 @@ class TestFormat_Op traits = []> //===----------------------------------------------------------------------===// // attr-dict +//===----------------------------------------------------------------------===// // CHECK-NOT: error def DirectiveAttrDictValidA : TestFormat_Op<[{ @@ -30,6 +31,7 @@ def DirectiveAttrDictValidB : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // custom +//===----------------------------------------------------------------------===// // CHECK-NOT: error def DirectiveCustomValidA : TestFormat_Op<[{ @@ -50,6 +52,7 @@ def DirectiveCustomValidE : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // functional-type +//===----------------------------------------------------------------------===// // CHECK-NOT: error def DirectiveFunctionalTypeValid : TestFormat_Op<[{ @@ -58,6 +61,7 @@ def DirectiveFunctionalTypeValid : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // operands +//===----------------------------------------------------------------------===// // CHECK-NOT: error: def DirectiveOperandsValid : TestFormat_Op<[{ @@ -66,6 +70,7 @@ def DirectiveOperandsValid : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // regions +//===----------------------------------------------------------------------===// // CHECK-NOT: error: def DirectiveRegionsValid : TestFormat_Op<[{ @@ -74,6 +79,7 @@ def DirectiveRegionsValid : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // results +//===----------------------------------------------------------------------===// // CHECK-NOT: error: def DirectiveResultsInvalidA : TestFormat_Op<[{ @@ -82,6 +88,7 @@ def DirectiveResultsInvalidA : TestFormat_Op<[{ //===----------------------------------------------------------------------===// // successors +//===----------------------------------------------------------------------===// // CHECK-NOT: error: def DirectiveSuccessorsInvalidA : TestFormat_Op<[{ @@ -90,6 +97,7 @@ def DirectiveSuccessorsInvalidA : 
TestFormat_Op<[{ //===----------------------------------------------------------------------===// // type +//===----------------------------------------------------------------------===// // CHECK-NOT: error: def DirectiveTypeValid : TestFormat_Op<[{ diff --git a/mlir/test/mlir-tblgen/op-format.td b/mlir/test/mlir-tblgen/op-format.td index 73f9315f6bcfe..09e068b91a40b 100644 --- a/mlir/test/mlir-tblgen/op-format.td +++ b/mlir/test/mlir-tblgen/op-format.td @@ -16,6 +16,7 @@ class TestFormat_Op traits = []> //===----------------------------------------------------------------------===// // custom +//===----------------------------------------------------------------------===// // CHECK-LABEL: CustomStringLiteralA::parse // CHECK: parseFoo({{.*}}, parser.getBuilder().getI1Type()) diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 4ab3e9250c29d..cf0d827942949 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -444,6 +444,7 @@ void DefGen::emitInterfaceMethods() { //===----------------------------------------------------------------------===// // Builder Emission +//===----------------------------------------------------------------------===// SmallVector DefGen::getBuilderParams(std::initializer_list prefix) const { @@ -546,6 +547,7 @@ void DefGen::emitCheckedCustomBuilder(const AttrOrTypeBuilder &builder) { //===----------------------------------------------------------------------===// // Interface Method Emission +//===----------------------------------------------------------------------===// void DefGen::emitTraitMethods(const InterfaceTrait &trait) { // Get the set of methods that should always be declared. @@ -577,6 +579,7 @@ void DefGen::emitTraitMethod(const InterfaceMethod &method) { //===----------------------------------------------------------------------===// // Storage Class Emission +//===----------------------------------------------------------------------===// void DefGen::emitStorageConstructor() { Constructor *ctor = @@ -1079,6 +1082,7 @@ bool {0}(::mlir::Type type) { //===----------------------------------------------------------------------===// // AttrDef +//===----------------------------------------------------------------------===// static llvm::cl::OptionCategory attrdefGenCat("Options for -gen-attrdef-*"); static llvm::cl::opt @@ -1101,6 +1105,7 @@ static mlir::GenRegistration //===----------------------------------------------------------------------===// // TypeDef +//===----------------------------------------------------------------------===// static llvm::cl::OptionCategory typedefGenCat("Options for -gen-typedef-*"); static llvm::cl::opt diff --git a/mlir/tools/mlir-tblgen/FormatGen.cpp b/mlir/tools/mlir-tblgen/FormatGen.cpp index d145f3e5a23dd..dd9b41bc90aef 100644 --- a/mlir/tools/mlir-tblgen/FormatGen.cpp +++ b/mlir/tools/mlir-tblgen/FormatGen.cpp @@ -223,6 +223,7 @@ FailureOr> FormatParser::parse() { //===----------------------------------------------------------------------===// // Element Parsing +//===----------------------------------------------------------------------===// FailureOr FormatParser::parseElement(Context ctx) { if (curToken.is(FormatToken::literal)) diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 3a7a7aaf3a5dd..ca2c1d4a8ad04 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -35,6 +35,7 @@ using llvm::StringMap; 
//===----------------------------------------------------------------------===// // VariableElement +//===----------------------------------------------------------------------===// namespace { /// This class represents an instance of an op variable element. A variable @@ -140,6 +141,7 @@ struct AttributeLikeVariable : public VariableElement { //===----------------------------------------------------------------------===// // DirectiveElement +//===----------------------------------------------------------------------===// namespace { /// This class represents the `operands` directive. This directive represents @@ -424,6 +426,7 @@ struct OperationFormat { //===----------------------------------------------------------------------===// // Parser Gen +//===----------------------------------------------------------------------===// /// Returns true if we can format the given attribute as an enum in the /// parser format. @@ -1951,6 +1954,7 @@ void OperationFormat::genParserVariadicSegmentResolution(Operator &op, //===----------------------------------------------------------------------===// // PrinterGen +//===----------------------------------------------------------------------===// /// The code snippet used to generate a printer call for a region of an // operation that has the SingleBlockImplicitTerminator trait. From 94b04b411903e97bd228c6bdbdb845c29f6de6a1 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 31 Mar 2025 09:40:33 -0700 Subject: [PATCH 0110/1029] [lldb] Include the version in the lldbassert error message (#133740) Include the LLDB version in the lldbassert error message, and prompt users to include it in the bug report. The majority of users that bother filing a bug report just copy-paste the stack trace and often forget to include this important detail. By putting it after the backtrace and before the prompt, I'm hoping it'll get copy-pasted in. 
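To make the new ordering concrete, here is a minimal standalone sketch (not lldb code; the message, backtrace, version, and prompt strings are made-up placeholders) of the formatv call this patch switches Debugger::AssertCallback to:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/FormatVariadic.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    // Placeholder values standing in for the real arguments.
    llvm::StringRef Message = "lldbassert(<expr>) failed";
    llvm::StringRef Backtrace = "frame #0 ...\nframe #1 ...\n";
    llvm::StringRef Version = "lldb version <x.y.z>"; // stand-in for GetVersion()
    llvm::StringRef Prompt = "Please file a bug report ...";
    // The version lands between the backtrace and the prompt, so copy-pasting
    // the trace naturally carries the version along.
    llvm::outs() << llvm::formatv("{0}\n{1}{2}\n{3}", Message, Backtrace,
                                  Version, Prompt)
                 << '\n';
  }
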
rdar://146793016 --- lldb/source/Core/CMakeLists.txt | 1 + lldb/source/Core/Debugger.cpp | 5 +++-- lldb/source/Utility/LLDBAssert.cpp | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index e8bdb0613b3ff..0a08da0fec230 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -70,6 +70,7 @@ add_lldb_library(lldbCore lldbTarget lldbUtility lldbValueObject + lldbVersion lldbPluginCPlusPlusLanguage lldbPluginObjCLanguage ${LLDB_CURSES_LIBS} diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index ec7f841320217..51029f91eb12d 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -1525,8 +1525,9 @@ bool Debugger::FormatDisassemblerAddress(const FormatEntity::Entry *format, void Debugger::AssertCallback(llvm::StringRef message, llvm::StringRef backtrace, llvm::StringRef prompt) { - Debugger::ReportError( - llvm::formatv("{0}\n{1}{2}", message, backtrace, prompt).str()); + Debugger::ReportError(llvm::formatv("{0}\n{1}{2}\n{3}", message, backtrace, + GetVersion(), prompt) + .str()); } void Debugger::SetLoggingCallback(lldb::LogOutputCallback log_callback, diff --git a/lldb/source/Utility/LLDBAssert.cpp b/lldb/source/Utility/LLDBAssert.cpp index d7adb52f95fa4..b84c581ccf822 100644 --- a/lldb/source/Utility/LLDBAssert.cpp +++ b/lldb/source/Utility/LLDBAssert.cpp @@ -54,8 +54,8 @@ void _lldb_assert(bool expression, const char *expr_text, const char *func, expr_text, func, file, line) .str(), buffer, - "Please file a bug report against lldb reporting this failure log, and " - "as many details as possible"); + "Please file a bug report against lldb and include the backtrace, the " + "version and as many details as possible."); } void SetLLDBAssertCallback(LLDBAssertCallback callback) { From c5b3fe209408c89c1ca21f103a8fd45fb48c3138 Mon Sep 17 00:00:00 2001 From: Alan Zhao Date: Mon, 31 Mar 2025 09:42:34 -0700 Subject: [PATCH 0111/1029] [clang] Automatically add the `returns_twice` attribute to certain functions even if `-fno-builtin` is set (#133511) Certain functions require the `returns_twice` attribute in order to produce correct codegen. However, `-fno-builtin` removes all knowledge of functions that require this attribute, so this PR modifies Clang to add the `returns_twice` attribute even if `-fno-builtin` is set. This behavior is also consistent with what GCC does. It's not (easily) possible to get the builtin information from `Builtins.td` because `-fno-builtin` causes Clang to never initialize any builtins, so functions never get tokenized as functions/builtins that require `returns_twice`. Therefore, the most straightforward solution is to explicitly hard code the function names that require `returns_twice`. Fixes #122840 --- clang/lib/CodeGen/CGCall.cpp | 9 +++++++++ clang/test/CodeGen/2003-08-20-vfork-bug.c | 5 ++++- clang/test/CodeGen/setjmp.c | 15 +++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 3a1db4f1a7a90..3cefa3b0c585c 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2607,6 +2607,15 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, if (shouldDisableTailCalls()) FuncAttrs.addAttribute("disable-tail-calls", "true"); + // These functions require the returns_twice attribute for correct codegen, + // but the attribute may not be added if -fno-builtin is specified. 
We + // explicitly add that attribute here. + static const llvm::StringSet<> ReturnsTwiceFn{ + "_setjmpex", "setjmp", "_setjmp", "vfork", + "sigsetjmp", "__sigsetjmp", "savectx", "getcontext"}; + if (ReturnsTwiceFn.contains(Name)) + FuncAttrs.addAttribute(llvm::Attribute::ReturnsTwice); + // CPU/feature overrides. addDefaultFunctionDefinitionAttributes // handles these separately to set them based on the global defaults. GetCPUAndFeaturesAttributes(CalleeInfo.getCalleeDecl(), FuncAttrs); diff --git a/clang/test/CodeGen/2003-08-20-vfork-bug.c b/clang/test/CodeGen/2003-08-20-vfork-bug.c index 4966ab20904d4..438604f321da3 100644 --- a/clang/test/CodeGen/2003-08-20-vfork-bug.c +++ b/clang/test/CodeGen/2003-08-20-vfork-bug.c @@ -1,5 +1,8 @@ -// RUN: %clang_cc1 -emit-llvm %s -o /dev/null +// RUN: %clang_cc1 -x c %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c %s -triple x86_64-linux-gnu -emit-llvm -fno-builtin -o - | FileCheck %s +// CHECK: ; Function Attrs: returns_twice +// CHECK-NEXT: declare {{.*}} @vfork( extern int vfork(void); void test() { vfork(); diff --git a/clang/test/CodeGen/setjmp.c b/clang/test/CodeGen/setjmp.c index 77dde35e69cae..00341e459941a 100644 --- a/clang/test/CodeGen/setjmp.c +++ b/clang/test/CodeGen/setjmp.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -x c %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -x c %s -triple x86_64-linux-gnu -emit-llvm -fno-builtin -o - | FileCheck %s // RUN: %clang_cc1 -x c++ %s -triple x86_64-linux-gnu -emit-llvm -o - | FileCheck %s #ifdef __cplusplus @@ -6,13 +7,17 @@ extern "C" { #endif struct __jmp_buf_tag { int n; }; +struct __ucontext_t_tag { int n; }; int setjmp(struct __jmp_buf_tag*); int sigsetjmp(struct __jmp_buf_tag*, int); int _setjmp(struct __jmp_buf_tag*); int __sigsetjmp(struct __jmp_buf_tag*, int); +int _setjmpex(struct __jmp_buf_tag* env); +int getcontext(struct __ucontext_t_tag*); typedef struct __jmp_buf_tag jmp_buf[1]; typedef struct __jmp_buf_tag sigjmp_buf[1]; +typedef struct __ucontext_t_tag ucontext_t[1]; #ifdef __cplusplus } @@ -20,6 +25,7 @@ typedef struct __jmp_buf_tag sigjmp_buf[1]; void f(void) { jmp_buf jb; + ucontext_t ut; // CHECK: call {{.*}}@setjmp( setjmp(jb); // CHECK: call {{.*}}@sigsetjmp( @@ -28,6 +34,10 @@ void f(void) { _setjmp(jb); // CHECK: call {{.*}}@__sigsetjmp( __sigsetjmp(jb, 0); + // CHECK: call {{.*}}@_setjmpex( + _setjmpex(jb); + // CHECK: call {{.*}}@getcontext( + getcontext(ut); } // CHECK: ; Function Attrs: returns_twice @@ -42,3 +52,8 @@ void f(void) { // CHECK: ; Function Attrs: returns_twice // CHECK-NEXT: declare {{.*}} @__sigsetjmp( +// CHECK: ; Function Attrs: returns_twice +// CHECK-NEXT: declare {{.*}} @_setjmpex( + +// CHECK: ; Function Attrs: returns_twice +// CHECK-NEXT: declare {{.*}} @getcontext( From 9cdab16da99ad9fdb823853fbc634008229e284f Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 31 Mar 2025 17:44:39 +0100 Subject: [PATCH 0112/1029] [AArch64] Remove CODE llc run lines from costmodel tests. NFC The code is already tested in CodeGen/AArch64 tests such as neon-perm.ll and the select- tests. 
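For reference, a cost-model test like the ones below can be driven by hand; this is a sketch, not part of the patch. The paths are placeholders, and note that the analysis pass name carries angle brackets (print<cost-model>):

  # Print costs for all cost kinds, mirroring the updated RUN lines.
  opt -mtriple=aarch64--linux-gnu -passes='print<cost-model>' -cost-kind=all \
      -disable-output llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll 2>&1

  # The CHECK lines are autogenerated; assuming the script accepts --opt-binary,
  # they can be regenerated with:
  python3 llvm/utils/update_analyze_test_checks.py --opt-binary=/path/to/opt \
      llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
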
--- .../CostModel/AArch64/shuffle-transpose.ll | 196 ++--- .../CostModel/AArch64/vector-select.ll | 762 +++++++----------- 2 files changed, 373 insertions(+), 585 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll index 6680f70aafe4d..35cfa888d5d76 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll @@ -1,220 +1,220 @@ -; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefix=COST -; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -; COST-LABEL: trn1.v8i8 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> -; CODE-LABEL: trn1.v8i8 -; CODE: trn1 v0.8b, v0.8b, v1.8b define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { +; CHECK-LABEL: 'trn1.v8i8' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0 +; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 } -; COST-LABEL: trn2.v8i8 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> -; CODE-LABEL: trn2.v8i8 -; CODE: trn2 v0.8b, v0.8b, v1.8b define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { +; CHECK-LABEL: 'trn2.v8i8' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0 +; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 } -; COST-LABEL: trn1.v16i8 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> -; CODE-LABEL: trn1.v16i8 -; CODE: trn1 v0.16b, v0.16b, v1.16b define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { +; CHECK-LABEL: 'trn1.v16i8' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0 +; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 } -; COST-LABEL: trn2.v16i8 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> -; CODE-LABEL: trn2.v16i8 -; CODE: trn2 v0.16b, v0.16b, v1.16b define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { +; CHECK-LABEL: 'trn2.v16i8' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0 +; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 } -; COST-LABEL: trn1.v4i16 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> -; CODE-LABEL: trn1.v4i16 -; CODE: trn1 v0.4h, v0.4h, v1.4h define <4 x 
i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { +; CHECK-LABEL: 'trn1.v4i16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %tmp0 +; %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 } -; COST-LABEL: trn2.v4i16 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> -; CODE-LABEL: trn2.v4i16 -; CODE: trn2 v0.4h, v0.4h, v1.4h define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { +; CHECK-LABEL: 'trn2.v4i16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %tmp0 +; %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 } -; COST-LABEL: trn1.v8i16 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> -; CODE-LABEL: trn1.v8i16 -; CODE: trn1 v0.8h, v0.8h, v1.8h define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { +; CHECK-LABEL: 'trn1.v8i16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0 +; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 } -; COST-LABEL: trn2.v8i16 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> -; CODE-LABEL: trn2.v8i16 -; CODE: trn2 v0.8h, v0.8h, v1.8h define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { +; CHECK-LABEL: 'trn2.v8i16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0 +; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 } -; COST-LABEL: trn1.v2i32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> -; CODE-LABEL: trn1.v2i32 -; CODE: zip1 v0.2s, v0.2s, v1.2s define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { +; CHECK-LABEL: 'trn1.v2i32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %tmp0 +; %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ret <2 x i32> %tmp0 } -; COST-LABEL: trn2.v2i32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> -; CODE-LABEL: trn2.v2i32 -; CODE: zip2 v0.2s, v0.2s, v1.2s define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { +; CHECK-LABEL: 'trn2.v2i32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %tmp0 +; %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ret <2 x i32> %tmp0 } -; COST-LABEL: trn1.v4i32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> -; CODE-LABEL: trn1.v4i32 -; CODE: trn1 v0.4s, v0.4s, v1.4s define 
<4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: 'trn1.v4i32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %tmp0 +; %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 } -; COST-LABEL: trn2.v4i32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> -; CODE-LABEL: trn2.v4i32 -; CODE: trn2 v0.4s, v0.4s, v1.4s define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: 'trn2.v4i32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %tmp0 +; %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 } -; COST-LABEL: trn1.v2i64 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> -; CODE-LABEL: trn1.v2i64 -; CODE: zip1 v0.2d, v0.2d, v1.2d define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) { +; CHECK-LABEL: 'trn1.v2i64' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %tmp0 +; %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ret <2 x i64> %tmp0 } -; COST-LABEL: trn2.v2i64 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> -; CODE-LABEL: trn2.v2i64 -; CODE: zip2 v0.2d, v0.2d, v1.2d define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { +; CHECK-LABEL: 'trn2.v2i64' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %tmp0 +; %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ret <2 x i64> %tmp0 } -; COST-LABEL: trn1.v2f32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> -; CODE-LABEL: trn1.v2f32 -; CODE: zip1 v0.2s, v0.2s, v1.2s define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { +; CHECK-LABEL: 'trn1.v2f32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %tmp0 +; %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ret <2 x float> %tmp0 } -; COST-LABEL: trn2.v2f32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> -; CODE-LABEL: trn2.v2f32 -; CODE: zip2 v0.2s, v0.2s, v1.2s define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { +; CHECK-LABEL: 'trn2.v2f32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %tmp0 +; %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ret <2 x float> %tmp0 } -; COST-LABEL: trn1.v4f32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> -; CODE-LABEL: 
trn1.v4f32 -; CODE: trn1 v0.4s, v0.4s, v1.4s define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { +; CHECK-LABEL: 'trn1.v4f32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %tmp0 +; %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 } -; COST-LABEL: trn2.v4f32 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> -; CODE-LABEL: trn2.v4f32 -; CODE: trn2 v0.4s, v0.4s, v1.4s define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { +; CHECK-LABEL: 'trn2.v4f32' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %tmp0 +; %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 } -; COST-LABEL: trn1.v2f64 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> -; CODE-LABEL: trn1.v2f64 -; CODE: zip1 v0.2d, v0.2d, v1.2d define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) { +; CHECK-LABEL: 'trn1.v2f64' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %tmp0 +; %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ret <2 x double> %tmp0 } -; COST-LABEL: trn2.v2f64 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> -; CODE-LABEL: trn2.v2f64 -; CODE: zip2 v0.2d, v0.2d, v1.2d define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { +; CHECK-LABEL: 'trn2.v2f64' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %tmp0 +; %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ret <2 x double> %tmp0 } -; COST-LABEL: trn1.v4f16 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> -; CODE-LABEL: trn1.v4f16 -; CODE: trn1 v0.4h, v0.4h, v1.4h define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { +; CHECK-LABEL: 'trn1.v4f16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %tmp0 +; %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 } -; COST-LABEL: trn2.v4f16 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> -; CODE-LABEL: trn2.v4f16 -; CODE: trn2 v0.4h, v0.4h, v1.4h define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { +; CHECK-LABEL: 'trn2.v4f16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %tmp0 +; %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 } -; COST-LABEL: trn1.v8f16 
-; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> -; CODE-LABEL: trn1.v8f16 -; CODE: trn1 v0.8h, v0.8h, v1.8h define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { +; CHECK-LABEL: 'trn1.v8f16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %tmp0 +; %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 } -; COST-LABEL: trn2.v8f16 -; COST: Found an estimated cost of 1 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> -; CODE-LABEL: trn2.v8f16 -; CODE: trn2 v0.8h, v0.8h, v1.8h define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { +; CHECK-LABEL: 'trn2.v8f16' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %tmp0 +; %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 } diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll index cf57479c39598..e35eabfb35f58 100644 --- a/llvm/test/Analysis/CostModel/AArch64/vector-select.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vector-select.ll @@ -1,172 +1,116 @@ -; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=COST,COST-NOFP16 -; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" 2>&1 -disable-output -mattr=+fullfp16 | FileCheck %s --check-prefixes=COST,COST-FULLFP16 -; RUN: llc < %s -mtriple=aarch64--linux-gnu -mattr=+fullfp16 | FileCheck %s --check-prefix=CODE +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefixes=COST,COST-NOFP16 +; RUN: opt < %s -mtriple=aarch64--linux-gnu -passes="print" -cost-kind=all 2>&1 -disable-output -mattr=+fullfp16 | FileCheck %s --check-prefixes=COST,COST-FULLFP16 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -; COST-LABEL: v8i8_select_eq -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = icmp eq <8 x i8> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x i8> %a, <8 x i8> %c - -; CODE-LABEL: v8i8_select_eq -; CODE: bb.0 -; CODE-NEXT: cmeq v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret - define <8 x i8> @v8i8_select_eq(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; COST-LABEL: 'v8i8_select_eq' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = icmp eq <8 x i8> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <8 x i1> %cmp.1, <8 x i8> %a, <8 x i8> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %s.1 +; %cmp.1 = icmp eq <8 x i8> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x i8> %a, <8 x i8> %c ret <8 x i8> %s.1 } -; COST-LABEL: v16i8_select_sgt -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = icmp sgt <16 x i8> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <16 x i1> %cmp.1, <16 x i8> %a, <16 x 
i8> %c - -; CODE-LABEL: v16i8_select_sgt -; CODE: bb.0 -; CODE-NEXT: cmgt v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret - define <16 x i8> @v16i8_select_sgt(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; COST-LABEL: 'v16i8_select_sgt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = icmp sgt <16 x i8> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <16 x i1> %cmp.1, <16 x i8> %a, <16 x i8> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %s.1 +; %cmp.1 = icmp sgt <16 x i8> %a, %b %s.1 = select <16 x i1> %cmp.1, <16 x i8> %a, <16 x i8> %c ret <16 x i8> %s.1 } -; COST-LABEL: v4i16_select_ne -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = icmp ne <4 x i16> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x i16> %a, <4 x i16> %c - -; CODE-LABEL: v4i16_select_ne -; CODE: bb.0 -; CODE-NEXT: cmeq v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: bit v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret - define <4 x i16> @v4i16_select_ne(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) { +; COST-LABEL: 'v4i16_select_ne' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = icmp ne <4 x i16> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x i16> %a, <4 x i16> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %s.1 +; %cmp.1 = icmp ne <4 x i16> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x i16> %a, <4 x i16> %c ret <4 x i16> %s.1 } -; COST-LABEL: v8i16_select_ugt -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = icmp ugt <8 x i16> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x i16> %a, <8 x i16> %c - -; CODE-LABEL: v8i16_select_ugt -; CODE: bb.0 -; CODE-NEXT: cmhi v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret - define <8 x i16> @v8i16_select_ugt(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { +; COST-LABEL: 'v8i16_select_ugt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = icmp ugt <8 x i16> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <8 x i1> %cmp.1, <8 x i16> %a, <8 x i16> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %s.1 +; %cmp.1 = icmp ugt <8 x i16> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x i16> %a, <8 x i16> %c ret <8 x i16> %s.1 } -; COST-LABEL: v2i32_select_ule -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = icmp ule <2 x i32> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x i32> %a, <2 x i32> %c - -; CODE-LABEL: v2i32_select_ule -; CODE: bb.0 -; CODE-NEXT: cmhs v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret - define <2 x i32> @v2i32_select_ule(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; COST-LABEL: 'v2i32_select_ule' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = icmp ule <2 x i32> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x i32> %a, <2 x i32> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %s.1 +; %cmp.1 = icmp ule <2 x i32> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x 
i32> %a, <2 x i32> %c ret <2 x i32> %s.1 } -; COST-LABEL: v4i32_select_ult -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = icmp ult <4 x i32> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x i32> %a, <4 x i32> %c - -; CODE-LABEL: v4i32_select_ult -; CODE: bb.0 -; CODE-NEXT: cmhi v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret - define <4 x i32> @v4i32_select_ult(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +; COST-LABEL: 'v4i32_select_ult' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = icmp ult <4 x i32> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x i32> %a, <4 x i32> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %s.1 +; %cmp.1 = icmp ult <4 x i32> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x i32> %a, <4 x i32> %c ret <4 x i32> %s.1 } -; COST-LABEL: v2i64_select_sle -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = icmp sle <2 x i64> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x i64> %a, <2 x i64> %c - -; CODE-LABEL: v2i64_select_sle -; CODE: bb.0 -; CODE-NEXT: cmge v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret - define <2 x i64> @v2i64_select_sle(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { +; COST-LABEL: 'v2i64_select_sle' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = icmp sle <2 x i64> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x i64> %a, <2 x i64> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %s.1 +; %cmp.1 = icmp sle <2 x i64> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x i64> %a, <2 x i64> %c ret <2 x i64> %s.1 } -; COST-LABEL: v3i64_select_sle -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cmp.1 = icmp sle <3 x i64> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <3 x i1> %cmp.1, <3 x i64> %a, <3 x i64> %c - -; CODE-LABEL: v3i64_select_sle -; CODE: bb.0 -; CODE: mov -; CODE: mov -; CODE: mov -; CODE: cmge -; CODE: ldr -; CODE: bif -; CODE: cmge -; CODE: bif -; CODE: ext -; CODE: ret - define <3 x i64> @v3i64_select_sle(<3 x i64> %a, <3 x i64> %b, <3 x i64> %c) { +; COST-LABEL: 'v3i64_select_sle' +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = icmp sle <3 x i64> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <3 x i1> %cmp.1, <3 x i64> %a, <3 x i64> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <3 x i64> %s.1 +; %cmp.1 = icmp sle <3 x i64> %a, %b %s.1 = select <3 x i1> %cmp.1, <3 x i64> %a, <3 x i64> %c ret <3 x i64> %s.1 } -; COST-LABEL: v2i64_select_no_cmp -; COST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b - -; CODE-LABEL: v2i64_select_no_cmp -; CODE: bb.0 -; CODE-NEXT: ushll v{{.+}}.2d, v{{.+}}.2s, #0 -; CODE-NEXT: shl v{{.+}}.2d, v{{.+}}.2d, #63 -; CODE-NEXT: cmlt v{{.+}}.2d, v{{.+}}.2d, #0 -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret - define <2 x i64> @v2i64_select_no_cmp(<2 x i64> %a, <2 x i64> %b, <2 x i1> %cond) { +; COST-LABEL: 
'v2i64_select_no_cmp' +; COST-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %s.1 +; %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b ret <2 x i64> %s.1 } define <4 x half> @v4f16_select_ogt(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -; COST-LABEL: v4f16_select_ogt -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cmp.1 = fcmp ogt <4 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ogt <4 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; -; CODE-LABEL: v4f16_select_ogt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v4f16_select_ogt' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <4 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v4f16_select_ogt' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ogt <4 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 ; %cmp.1 = fcmp ogt <4 x half> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c @@ -174,17 +118,15 @@ define <4 x half> @v4f16_select_ogt(<4 x half> %a, <4 x half> %b, <4 x half> %c) } define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -; COST-LABEL: v8f16_select_ogt -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp ogt <8 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ogt <8 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; -; CODE-LABEL: v8f16_select_ogt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v8f16_select_ogt' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ogt <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v8f16_select_ogt' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ogt <8 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 
for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; %cmp.1 = fcmp ogt <8 x half> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -192,15 +134,10 @@ define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) } define <2 x float> @v2f32_select_ogt(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_ogt -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ogt <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c -; -; CODE-LABEL: v2f32_select_ogt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-LABEL: 'v2f32_select_ogt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ogt <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 ; %cmp.1 = fcmp ogt <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -208,15 +145,10 @@ define <2 x float> @v2f32_select_ogt(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_ogt(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_ogt -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ogt <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c -; -; CODE-LABEL: v4f32_select_ogt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_ogt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ogt <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 ; %cmp.1 = fcmp ogt <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -224,15 +156,10 @@ define <4 x float> @v4f32_select_ogt(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_ogt(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_ogt -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ogt <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_ogt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_ogt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ogt <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp ogt <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c @@ -240,17 +167,15 @@ define <2 x 
double> @v2f64_select_ogt(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x half> @v4f16_select_oge(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -; COST-LABEL: v4f16_select_oge -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cmp.1 = fcmp oge <4 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oge <4 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; -; CODE-LABEL: v4f16_select_oge -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v4f16_select_oge' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <4 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v4f16_select_oge' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oge <4 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 ; %cmp.1 = fcmp oge <4 x half> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c @@ -258,17 +183,15 @@ define <4 x half> @v4f16_select_oge(<4 x half> %a, <4 x half> %b, <4 x half> %c) } define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -; COST-LABEL: v8f16_select_oge -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp oge <8 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oge <8 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; -; CODE-LABEL: v8f16_select_oge -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v8f16_select_oge' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oge <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v8f16_select_oge' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oge <8 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; %cmp.1 = fcmp oge <8 x half> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c 
@@ -276,15 +199,10 @@ define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) } define <2 x float> @v2f32_select_oge(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_oge -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oge <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c -; -; CODE-LABEL: v2f32_select_oge -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-LABEL: 'v2f32_select_oge' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oge <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 ; %cmp.1 = fcmp oge <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -292,15 +210,10 @@ define <2 x float> @v2f32_select_oge(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_oge(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_oge -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oge <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c -; -; CODE-LABEL: v4f32_select_oge -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_oge' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oge <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 ; %cmp.1 = fcmp oge <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -308,15 +221,10 @@ define <4 x float> @v4f32_select_oge(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_oge(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_oge -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oge <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_oge -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_oge' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oge <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp oge <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c @@ -324,17 +232,15 @@ define <2 x double> @v2f64_select_oge(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x half> @v4f16_select_olt(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -; COST-LABEL: v4f16_select_olt -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cmp.1 = fcmp 
olt <4 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp olt <4 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; -; CODE-LABEL: v4f16_select_olt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v4f16_select_olt' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <4 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v4f16_select_olt' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp olt <4 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 ; %cmp.1 = fcmp olt <4 x half> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c @@ -342,17 +248,15 @@ define <4 x half> @v4f16_select_olt(<4 x half> %a, <4 x half> %b, <4 x half> %c) } define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -; COST-LABEL: v8f16_select_olt -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp olt <8 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp olt <8 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; -; CODE-LABEL: v8f16_select_olt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v8f16_select_olt' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp olt <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v8f16_select_olt' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp olt <8 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; %cmp.1 = fcmp olt <8 x half> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -360,15 +264,10 @@ define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) } define <2 x float> @v2f32_select_olt(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_olt -; COST-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %cmp.1 = fcmp olt <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c -; -; CODE-LABEL: v2f32_select_olt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-LABEL: 'v2f32_select_olt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp olt <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 ; %cmp.1 = fcmp olt <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -376,15 +275,10 @@ define <2 x float> @v2f32_select_olt(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_olt(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_olt -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp olt <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c -; -; CODE-LABEL: v4f32_select_olt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_olt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp olt <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 ; %cmp.1 = fcmp olt <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -392,15 +286,10 @@ define <4 x float> @v4f32_select_olt(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_olt(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_olt -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp olt <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_olt -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_olt' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp olt <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp olt <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c @@ -408,17 +297,15 @@ define <2 x double> @v2f64_select_olt(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x half> @v4f16_select_ole(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -; COST-LABEL: v4f16_select_ole -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cmp.1 = fcmp ole <4 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ole <4 x half> %a, %b 
-; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; -; CODE-LABEL: v4f16_select_ole -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v4f16_select_ole' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <4 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v4f16_select_ole' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ole <4 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 ; %cmp.1 = fcmp ole <4 x half> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c @@ -426,17 +313,15 @@ define <4 x half> @v4f16_select_ole(<4 x half> %a, <4 x half> %b, <4 x half> %c) } define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -; COST-LABEL: v8f16_select_ole -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp ole <8 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ole <8 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; -; CODE-LABEL: v8f16_select_ole -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v8f16_select_ole' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp ole <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v8f16_select_ole' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ole <8 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; %cmp.1 = fcmp ole <8 x half> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -444,15 +329,10 @@ define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) } define <2 x float> @v2f32_select_ole(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_ole -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ole <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c -; -; CODE-LABEL: v2f32_select_ole -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.2s, 
v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-LABEL: 'v2f32_select_ole' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ole <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 ; %cmp.1 = fcmp ole <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -460,15 +340,10 @@ define <2 x float> @v2f32_select_ole(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_ole(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_ole -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ole <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c -; -; CODE-LABEL: v4f32_select_ole -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_ole' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ole <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 ; %cmp.1 = fcmp ole <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -476,15 +351,10 @@ define <4 x float> @v4f32_select_ole(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_ole(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_ole -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ole <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_ole -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_ole' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ole <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp ole <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c @@ -492,17 +362,15 @@ define <2 x double> @v2f64_select_ole(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x half> @v4f16_select_oeq(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -; COST-LABEL: v4f16_select_oeq -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cmp.1 = fcmp oeq <4 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oeq <4 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; -; CODE-LABEL: v4f16_select_oeq -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: bif 
v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v4f16_select_oeq' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <4 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v4f16_select_oeq' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oeq <4 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 ; %cmp.1 = fcmp oeq <4 x half> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c @@ -510,17 +378,15 @@ define <4 x half> @v4f16_select_oeq(<4 x half> %a, <4 x half> %b, <4 x half> %c) } define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -; COST-LABEL: v8f16_select_oeq -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp oeq <8 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oeq <8 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; -; CODE-LABEL: v8f16_select_oeq -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v8f16_select_oeq' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp oeq <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v8f16_select_oeq' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oeq <8 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; %cmp.1 = fcmp oeq <8 x half> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -528,15 +394,10 @@ define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) } define <2 x float> @v2f32_select_oeq(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_oeq -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oeq <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c -; -; CODE-LABEL: v2f32_select_oeq -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-LABEL: 'v2f32_select_oeq' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oeq <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = 
select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 ; %cmp.1 = fcmp oeq <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -544,15 +405,10 @@ define <2 x float> @v2f32_select_oeq(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_oeq(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_oeq -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oeq <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c -; -; CODE-LABEL: v4f32_select_oeq -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_oeq' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oeq <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 ; %cmp.1 = fcmp oeq <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -560,15 +416,10 @@ define <4 x float> @v4f32_select_oeq(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_oeq(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_oeq -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oeq <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_oeq -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_oeq' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp oeq <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp oeq <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c @@ -576,19 +427,15 @@ define <2 x double> @v2f64_select_oeq(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x half> @v4f16_select_one(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -; COST-LABEL: v4f16_select_one -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cmp.1 = fcmp one <4 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp one <4 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; -; CODE-LABEL: v4f16_select_one -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: fcmgt v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: orr v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v4f16_select_one' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 
SizeLat:1 for: %cmp.1 = fcmp one <4 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v4f16_select_one' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp one <4 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 ; %cmp.1 = fcmp one <4 x half> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c @@ -596,19 +443,15 @@ define <4 x half> @v4f16_select_one(<4 x half> %a, <4 x half> %b, <4 x half> %c) } define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -; COST-LABEL: v8f16_select_one -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp one <8 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp one <8 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; -; CODE-LABEL: v8f16_select_one -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: fcmgt v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: orr v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v8f16_select_one' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp one <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v8f16_select_one' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp one <8 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; %cmp.1 = fcmp one <8 x half> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -616,17 +459,12 @@ define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) } define <2 x float> @v2f32_select_one(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_one -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp one <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c - -; CODE-LABEL: v2f32_select_one -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: fcmgt v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: orr v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-LABEL: 'v2f32_select_one' +; COST-NEXT: Cost Model: 
Found costs of 1 for: %cmp.1 = fcmp one <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 +; + %cmp.1 = fcmp one <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -634,17 +472,12 @@ define <2 x float> @v2f32_select_one(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_one(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_one -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp one <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c - -; CODE-LABEL: v4f32_select_one -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: fcmgt v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: orr v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_one' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp one <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 +; + %cmp.1 = fcmp one <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -652,17 +485,10 @@ define <4 x float> @v4f32_select_one(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_one(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_one -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp one <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_one -; CODE: bb.0 -; CODE-NEXT: fcmgt v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: fcmgt v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: orr v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_one' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp one <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp one <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c @@ -670,17 +496,15 @@ define <2 x double> @v2f64_select_one(<2 x double> %a, <2 x double> %b, <2 x dou } define <4 x half> @v4f16_select_une(<4 x half> %a, <4 x half> %b, <4 x half> %c) { -; COST-LABEL: v4f16_select_une -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cmp.1 = fcmp une <4 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp une <4 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 
x i1> %cmp.1, <4 x half> %a, <4 x half> %c -; -; CODE-LABEL: v4f16_select_une -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.4h, v{{.+}}.4h, v{{.+}}.4h -; CODE-NEXT: bit v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v4f16_select_une' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <4 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v4f16_select_une' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp une <4 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x half> %s.1 ; %cmp.1 = fcmp une <4 x half> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x half> %a, <4 x half> %c @@ -688,17 +512,15 @@ define <4 x half> @v4f16_select_une(<4 x half> %a, <4 x half> %b, <4 x half> %c) } define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) { -; COST-LABEL: v8f16_select_une -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp une <8 x half> %a, %b -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp une <8 x half> %a, %b -; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c -; -; CODE-LABEL: v8f16_select_une -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.8h, v{{.+}}.8h, v{{.+}}.8h -; CODE-NEXT: bit v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-NOFP16-LABEL: 'v8f16_select_une' +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %cmp.1 = fcmp une <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-NOFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 +; +; COST-FULLFP16-LABEL: 'v8f16_select_une' +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp une <8 x half> %a, %b +; COST-FULLFP16-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c +; COST-FULLFP16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x half> %s.1 ; %cmp.1 = fcmp une <8 x half> %a, %b %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -706,15 +528,10 @@ define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) } define <2 x float> @v2f32_select_une(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_une -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp une <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c -; -; CODE-LABEL: v2f32_select_une -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: bit v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; 
COST-LABEL: 'v2f32_select_une' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp une <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 ; %cmp.1 = fcmp une <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -722,15 +539,10 @@ define <2 x float> @v2f32_select_une(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_une(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_une -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp une <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c -; -; CODE-LABEL: v4f32_select_une -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: bit v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_une' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp une <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 ; %cmp.1 = fcmp une <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -738,15 +550,10 @@ define <4 x float> @v4f32_select_une(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_une(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_une -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp une <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_une -; CODE: bb.0 -; CODE-NEXT: fcmeq v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: bit v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_une' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp une <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of 1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp une <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c @@ -754,17 +561,10 @@ define <2 x double> @v2f64_select_une(<2 x double> %a, <2 x double> %b, <2 x dou } define <2 x float> @v2f32_select_ord(<2 x float> %a, <2 x float> %b, <2 x float> %c) { -; COST-LABEL: v2f32_select_ord -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ord <2 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c -; -; CODE-LABEL: v2f32_select_ord -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: fcmgt v{{.+}}.2s, v{{.+}}.2s, v{{.+}}.2s -; CODE-NEXT: orr v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: bif v{{.+}}.8b, v{{.+}}.8b, v{{.+}}.8b -; CODE-NEXT: ret +; COST-LABEL: 'v2f32_select_ord' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ord <2 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = 
select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x float> %s.1 ; %cmp.1 = fcmp ord <2 x float> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x float> %a, <2 x float> %c @@ -772,17 +572,12 @@ define <2 x float> @v2f32_select_ord(<2 x float> %a, <2 x float> %b, <2 x float> } define <4 x float> @v4f32_select_ord(<4 x float> %a, <4 x float> %b, <4 x float> %c) { -; COST-LABEL: v4f32_select_ord -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ord <4 x float> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c - -; CODE-LABEL: v4f32_select_ord -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: fcmgt v{{.+}}.4s, v{{.+}}.4s, v{{.+}}.4s -; CODE-NEXT: orr v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v4f32_select_ord' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ord <4 x float> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x float> %s.1 +; + %cmp.1 = fcmp ord <4 x float> %a, %b %s.1 = select <4 x i1> %cmp.1, <4 x float> %a, <4 x float> %c @@ -790,17 +585,10 @@ define <4 x float> @v4f32_select_ord(<4 x float> %a, <4 x float> %b, <4 x float> } define <2 x double> @v2f64_select_ord(<2 x double> %a, <2 x double> %b, <2 x double> %c) { -; COST-LABEL: v2f64_select_ord -; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ord <2 x double> %a, %b -; COST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c -; -; CODE-LABEL: v2f64_select_ord -; CODE: bb.0 -; CODE-NEXT: fcmge v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: fcmgt v{{.+}}.2d, v{{.+}}.2d, v{{.+}}.2d -; CODE-NEXT: orr v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: bif v{{.+}}.16b, v{{.+}}.16b, v{{.+}}.16b -; CODE-NEXT: ret +; COST-LABEL: 'v2f64_select_ord' +; COST-NEXT: Cost Model: Found costs of 1 for: %cmp.1 = fcmp ord <2 x double> %a, %b +; COST-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c +; COST-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x double> %s.1 ; %cmp.1 = fcmp ord <2 x double> %a, %b %s.1 = select <2 x i1> %cmp.1, <2 x double> %a, <2 x double> %c From 11dd7d98a6ecd2374289b6a217e358e503d4778a Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 31 Mar 2025 18:53:01 +0200 Subject: [PATCH 0113/1029] [clang][bytecode] Reject constexpr-unknown values from comparisons (#133701) --- clang/lib/AST/ByteCode/Interp.cpp | 18 +++-- clang/lib/AST/ByteCode/Interp.h | 7 ++ .../ByteCode/codegen-constexpr-unknown.cpp | 72 +++++++++++++++++++ .../AST/ByteCode/codegen-mutable-read.cpp | 36 ---------- 4 files changed, 92 insertions(+), 41 deletions(-) create mode 100644 clang/test/AST/ByteCode/codegen-constexpr-unknown.cpp delete mode 100644 clang/test/AST/ByteCode/codegen-mutable-read.cpp diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 187477713bef8..0acfe01a42410 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ 
b/clang/lib/AST/ByteCode/Interp.cpp @@ -302,6 +302,17 @@ void cleanupAfterFunctionCall(InterpState &S, CodePtr OpPC, TYPE_SWITCH(Ty, S.Stk.discard<T>()); } +// FIXME: Instead of using this fairly expensive test, we should +// just mark constexpr-unknown values when creating them. +bool isConstexprUnknown(const Pointer &P) { + if (!P.isBlockPointer()) + return false; + if (P.isDummy()) + return false; + const VarDecl *VD = P.block()->getDescriptor()->asVarDecl(); + return VD && VD->hasLocalStorage(); +} + bool CheckBCPResult(InterpState &S, const Pointer &Ptr) { if (Ptr.isDummy()) return false; @@ -607,11 +618,8 @@ bool CheckMutable(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { // variables in Compiler.cpp:visitDeclRef. Revisiting a so far // unknown variable will get the same EvalID and we end up allowing // reads from mutable members of it. - if (!S.inConstantContext()) { - if (const VarDecl *VD = Ptr.block()->getDescriptor()->asVarDecl(); - VD && VD->hasLocalStorage()) - return false; - } + if (!S.inConstantContext() && isConstexprUnknown(Ptr)) + return false; return true; } diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index ee4139fbc9530..938077a9f10ae 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -173,6 +173,8 @@ static bool handleOverflow(InterpState &S, CodePtr OpPC, const T &SrcValue) { bool handleFixedPointOverflow(InterpState &S, CodePtr OpPC, const FixedPoint &FP); +bool isConstexprUnknown(const Pointer &P); + enum class ShiftDir { Left, Right }; /// Checks if the shift operation is legal. @@ -1062,6 +1064,11 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { } } + if (!S.inConstantContext()) { + if (isConstexprUnknown(LHS) || isConstexprUnknown(RHS)) + return false; + } + if (Pointer::hasSameBase(LHS, RHS)) { unsigned VL = LHS.getByteOffset(); unsigned VR = RHS.getByteOffset(); diff --git a/clang/test/AST/ByteCode/codegen-constexpr-unknown.cpp b/clang/test/AST/ByteCode/codegen-constexpr-unknown.cpp new file mode 100644 index 0000000000000..f62117d5f7bec --- /dev/null +++ b/clang/test/AST/ByteCode/codegen-constexpr-unknown.cpp @@ -0,0 +1,72 @@ +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -fcxx-exceptions -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -fcxx-exceptions -o - %s -fexperimental-new-constant-interpreter | FileCheck %s + + +/// In the if expression below, the read from s.i should fail. +/// If it doesn't, and we actually read the value 0, the call to +/// func() will always occur, resulting in a runtime failure. + +struct S { + mutable int i = 0; +}; + +void func() { + __builtin_abort(); +}; + +void setI(const S &s) { + s.i = 12; +} + +int main() { + const S s; + + setI(s); + + if (s.i == 0) + func(); + + return 0; +} + +// CHECK: define dso_local noundef i32 @main() +// CHECK: br +// CHECK: if.then +// CHECK: if.end +// CHECK: ret i32 0 + + +/// Similarly, here we revisit the BindingDecl. +struct F { int x; }; +int main2() { + const F const s{99}; + const auto& [r1] = s; + if (&r1 != &s.x) + __builtin_abort(); + return 0; +} +// CHECK: define dso_local noundef i32 @_Z5main2v() +// CHECK: br +// CHECK: if.then +// CHECK: if.end +// CHECK: ret i32 0 + +/// The comparison here should work and return 0.
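+/// A local variable and an extern global can never be the same object, so the
+/// branch with the throw is dead and no landing pad (lpad) or eh.resume should
+/// be emitted; see the CHECK-NOT lines below.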
+class X { +public: + X(); + X(const X&); + X(const volatile X &); + ~X(); +}; +extern X OuterX; +X test24() { + X x; + if (&x == &OuterX) + throw 0; + return x; +} + +// CHECK: define dso_local void @_Z6test24v +// CHECK-NOT: lpad +// CHECK-NOT: eh.resume diff --git a/clang/test/AST/ByteCode/codegen-mutable-read.cpp b/clang/test/AST/ByteCode/codegen-mutable-read.cpp deleted file mode 100644 index afa46d34b0673..0000000000000 --- a/clang/test/AST/ByteCode/codegen-mutable-read.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s - - -/// In the if expression below, the read from s.i should fail. -/// If it doesn't, and we actually read the value 0, the call to -/// func() will always occur, resuliting in a runtime failure. - -struct S { - mutable int i = 0; -}; - -void func() { - __builtin_abort(); -}; - -void setI(const S &s) { - s.i = 12; -} - -int main() { - const S s; - - setI(s); - - if (s.i == 0) - func(); - - return 0; -} - -// CHECK: define dso_local noundef i32 @main() -// CHECK: br -// CHECK: if.then -// CHECK: if.end -// CHECK: ret i32 0 From a0e1e680d28c4ef5f87be948d1d223fbfda2950c Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 31 Mar 2025 18:53:12 +0200 Subject: [PATCH 0114/1029] [clang][bytecode] Return Invalid() on non-constexpr builtins (#133700) So the diagnostic output matches with the current interpreter --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- clang/test/AST/ByteCode/builtin-functions.cpp | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 4d125e4c202d2..3029314ddbad8 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2125,7 +2125,7 @@ static bool interp__builtin_memchr(InterpState &S, CodePtr OpPC, bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *Call, uint32_t BuiltinID) { if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) - return false; + return Invalid(S, OpPC); const InterpFrame *Frame = S.Current; diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 8408286314bb8..40f7a18119751 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -1591,3 +1591,12 @@ namespace WMemChr { constexpr bool c = !wcschr(L"hello", L'h'); // both-error {{constant expression}} \ // both-note {{non-constexpr function 'wcschr' cannot be used in a constant expression}} } + +namespace Invalid { + constexpr int test() { // both-error {{never produces a constant expression}} + __builtin_abort(); // both-note 2{{subexpression not valid in a constant expression}} + return 0; + } + static_assert(test() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} +} From f0b3bdd6dfa035653de5ead7b8d0582a8c0c158e Mon Sep 17 00:00:00 2001 From: David Peixotto Date: Mon, 31 Mar 2025 09:53:46 -0700 Subject: [PATCH 0115/1029] [lldb] Remove raw access to PluginInstances vector (#132884) Remove raw access to PluginInstances vector This commit modifies the PluginInstances class to remove direct access to the m_instances vector. Instead, we expose a new `GetSnapshot` method that returns a copy of the current state of the instances vector. 
All external iteration over the instances is updated to use the new method. The motivation for the change is to allow modifying the way we store instances without having to change all the clients. This is a preliminary change to allow enabling/disabling of plugins, in which case we want to iterate over only enabled plugins. We also considered using a custom iterator that wraps the vector iterator and can skip over disabled instances. That works, but the iterator code is a bit messy with all the templates and typedefs needed to make a compliant iterator. --- lldb/source/Core/PluginManager.cpp | 168 ++++++++++++++--------------- 1 file changed, 84 insertions(+), 84 deletions(-) diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index 80c9465f9af72..95eb940efcef2 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -226,30 +226,26 @@ template <typename Instance> class PluginInstances { } typename Instance::CallbackType GetCallbackAtIndex(uint32_t idx) { - if (Instance *instance = GetInstanceAtIndex(idx)) + if (const Instance *instance = GetInstanceAtIndex(idx)) return instance->create_callback; return nullptr; } llvm::StringRef GetDescriptionAtIndex(uint32_t idx) { - if (Instance *instance = GetInstanceAtIndex(idx)) + if (const Instance *instance = GetInstanceAtIndex(idx)) return instance->description; return ""; } llvm::StringRef GetNameAtIndex(uint32_t idx) { - if (Instance *instance = GetInstanceAtIndex(idx)) + if (const Instance *instance = GetInstanceAtIndex(idx)) return instance->name; return ""; } typename Instance::CallbackType GetCallbackForName(llvm::StringRef name) { - if (name.empty()) - return nullptr; - for (auto &instance : m_instances) { - if (name == instance.name) - return instance.create_callback; - } + if (const Instance *instance = GetInstanceForName(name)) + return instance->create_callback; return nullptr; } @@ -260,12 +256,33 @@ template <typename Instance> class PluginInstances { } } - const std::vector<Instance> &GetInstances() const { return m_instances; } - std::vector<Instance> &GetInstances() { return m_instances; } + // Return a copy of all the enabled instances. + // Note that this is a copy of the internal state so modifications + // to the returned instances will not be reflected back to instances + // stored by the PluginInstances object.
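+ // For illustration, a caller-side sketch of the new pattern (with a
+ // hypothetical accessor name):
+ //   for (const auto &instance : GetFooInstances().GetSnapshot())
+ //     handle(instance);
+ // rather than holding a reference into the internal vector.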
+ std::vector GetSnapshot() { return m_instances; } + + const Instance *GetInstanceAtIndex(uint32_t idx) { + uint32_t count = 0; + + return FindEnabledInstance( + [&](const Instance &instance) { return count++ == idx; }); + } + + const Instance *GetInstanceForName(llvm::StringRef name) { + if (name.empty()) + return nullptr; - Instance *GetInstanceAtIndex(uint32_t idx) { - if (idx < m_instances.size()) - return &m_instances[idx]; + return FindEnabledInstance( + [&](const Instance &instance) { return instance.name == name; }); + } + + const Instance * + FindEnabledInstance(std::function predicate) const { + for (const auto &instance : m_instances) { + if (predicate(instance)) + return &instance; + } return nullptr; } @@ -571,17 +588,15 @@ PluginManager::GetLanguageRuntimeCreateCallbackAtIndex(uint32_t idx) { LanguageRuntimeGetCommandObject PluginManager::GetLanguageRuntimeGetCommandObjectAtIndex(uint32_t idx) { - const auto &instances = GetLanguageRuntimeInstances().GetInstances(); - if (idx < instances.size()) - return instances[idx].command_callback; + if (auto instance = GetLanguageRuntimeInstances().GetInstanceAtIndex(idx)) + return instance->command_callback; return nullptr; } LanguageRuntimeGetExceptionPrecondition PluginManager::GetLanguageRuntimeGetExceptionPreconditionAtIndex(uint32_t idx) { - const auto &instances = GetLanguageRuntimeInstances().GetInstances(); - if (idx < instances.size()) - return instances[idx].precondition_callback; + if (auto instance = GetLanguageRuntimeInstances().GetInstanceAtIndex(idx)) + return instance->precondition_callback; return nullptr; } @@ -643,12 +658,7 @@ bool PluginManager::IsRegisteredObjectFilePluginName(llvm::StringRef name) { if (name.empty()) return false; - const auto &instances = GetObjectFileInstances().GetInstances(); - for (auto &instance : instances) { - if (instance.name == name) - return true; - } - return false; + return GetObjectFileInstances().GetInstanceForName(name) != nullptr; } bool PluginManager::RegisterPlugin( @@ -674,29 +684,24 @@ PluginManager::GetObjectFileCreateCallbackAtIndex(uint32_t idx) { ObjectFileCreateMemoryInstance PluginManager::GetObjectFileCreateMemoryCallbackAtIndex(uint32_t idx) { - const auto &instances = GetObjectFileInstances().GetInstances(); - if (idx < instances.size()) - return instances[idx].create_memory_callback; + if (auto instance = GetObjectFileInstances().GetInstanceAtIndex(idx)) + return instance->create_memory_callback; return nullptr; } ObjectFileGetModuleSpecifications PluginManager::GetObjectFileGetModuleSpecificationsCallbackAtIndex( uint32_t idx) { - const auto &instances = GetObjectFileInstances().GetInstances(); - if (idx < instances.size()) - return instances[idx].get_module_specifications; + if (auto instance = GetObjectFileInstances().GetInstanceAtIndex(idx)) + return instance->get_module_specifications; return nullptr; } ObjectFileCreateMemoryInstance PluginManager::GetObjectFileCreateMemoryCallbackForPluginName( llvm::StringRef name) { - const auto &instances = GetObjectFileInstances().GetInstances(); - for (auto &instance : instances) { - if (instance.name == name) - return instance.create_memory_callback; - } + if (auto instance = GetObjectFileInstances().GetInstanceForName(name)) + return instance->create_memory_callback; return nullptr; } @@ -729,7 +734,7 @@ Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp, // Fall back to object plugins. 
const auto &plugin_name = options.GetPluginName().value_or(""); - auto &instances = GetObjectFileInstances().GetInstances(); + auto instances = GetObjectFileInstances().GetSnapshot(); for (auto &instance : instances) { if (plugin_name.empty() || instance.name == plugin_name) { if (instance.save_core && instance.save_core(process_sp, options, error)) @@ -791,18 +796,16 @@ PluginManager::GetObjectContainerCreateCallbackAtIndex(uint32_t idx) { ObjectContainerCreateMemoryInstance PluginManager::GetObjectContainerCreateMemoryCallbackAtIndex(uint32_t idx) { - const auto &instances = GetObjectContainerInstances().GetInstances(); - if (idx < instances.size()) - return instances[idx].create_memory_callback; + if (auto instance = GetObjectContainerInstances().GetInstanceAtIndex(idx)) + return instance->create_memory_callback; return nullptr; } ObjectFileGetModuleSpecifications PluginManager::GetObjectContainerGetModuleSpecificationsCallbackAtIndex( uint32_t idx) { - const auto &instances = GetObjectContainerInstances().GetInstances(); - if (idx < instances.size()) - return instances[idx].get_module_specifications; + if (auto instance = GetObjectContainerInstances().GetInstanceAtIndex(idx)) + return instance->get_module_specifications; return nullptr; } @@ -849,7 +852,7 @@ PluginManager::GetPlatformCreateCallbackForPluginName(llvm::StringRef name) { void PluginManager::AutoCompletePlatformName(llvm::StringRef name, CompletionRequest &request) { - for (const auto &instance : GetPlatformInstances().GetInstances()) { + for (const auto &instance : GetPlatformInstances().GetSnapshot()) { if (instance.name.starts_with(name)) request.AddCompletion(instance.name); } @@ -897,7 +900,7 @@ PluginManager::GetProcessCreateCallbackForPluginName(llvm::StringRef name) { void PluginManager::AutoCompleteProcessName(llvm::StringRef name, CompletionRequest &request) { - for (const auto &instance : GetProcessInstances().GetInstances()) { + for (const auto &instance : GetProcessInstances().GetSnapshot()) { if (instance.name.starts_with(name)) request.AddCompletion(instance.name, instance.description); } @@ -935,11 +938,11 @@ bool PluginManager::UnregisterPlugin( lldb::RegisterTypeBuilderSP PluginManager::GetRegisterTypeBuilder(Target &target) { - const auto &instances = GetRegisterTypeBuilderInstances().GetInstances(); // We assume that RegisterTypeBuilderClang is the only instance of this plugin // type and is always present. 
- assert(instances.size()); - return instances[0].create_callback(target); + auto instance = GetRegisterTypeBuilderInstances().GetInstanceAtIndex(0); + assert(instance); + return instance->create_callback(target); } #pragma mark ScriptInterpreter @@ -984,7 +987,7 @@ PluginManager::GetScriptInterpreterCreateCallbackAtIndex(uint32_t idx) { lldb::ScriptInterpreterSP PluginManager::GetScriptInterpreterForLanguage(lldb::ScriptLanguage script_lang, Debugger &debugger) { - const auto &instances = GetScriptInterpreterInstances().GetInstances(); + const auto instances = GetScriptInterpreterInstances().GetSnapshot(); ScriptInterpreterCreateInstance none_instance = nullptr; for (const auto &instance : instances) { if (instance.language == lldb::eScriptLanguageNone) @@ -1046,13 +1049,12 @@ PluginManager::GetStructuredDataPluginCreateCallbackAtIndex(uint32_t idx) { StructuredDataFilterLaunchInfo PluginManager::GetStructuredDataFilterCallbackAtIndex( uint32_t idx, bool &iteration_complete) { - const auto &instances = GetStructuredDataPluginInstances().GetInstances(); - if (idx < instances.size()) { + if (auto instance = + GetStructuredDataPluginInstances().GetInstanceAtIndex(idx)) { iteration_complete = false; - return instances[idx].filter_callback; - } else { - iteration_complete = true; + return instance->filter_callback; } + iteration_complete = true; return nullptr; } @@ -1167,7 +1169,7 @@ PluginManager::GetSymbolLocatorCreateCallbackAtIndex(uint32_t idx) { ModuleSpec PluginManager::LocateExecutableObjectFile(const ModuleSpec &module_spec) { - auto &instances = GetSymbolLocatorInstances().GetInstances(); + auto instances = GetSymbolLocatorInstances().GetSnapshot(); for (auto &instance : instances) { if (instance.locate_executable_object_file) { std::optional result = @@ -1181,7 +1183,7 @@ PluginManager::LocateExecutableObjectFile(const ModuleSpec &module_spec) { FileSpec PluginManager::LocateExecutableSymbolFile( const ModuleSpec &module_spec, const FileSpecList &default_search_paths) { - auto &instances = GetSymbolLocatorInstances().GetInstances(); + auto instances = GetSymbolLocatorInstances().GetSnapshot(); for (auto &instance : instances) { if (instance.locate_executable_symbol_file) { std::optional result = instance.locate_executable_symbol_file( @@ -1197,7 +1199,7 @@ bool PluginManager::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, Status &error, bool force_lookup, bool copy_executable) { - auto &instances = GetSymbolLocatorInstances().GetInstances(); + auto instances = GetSymbolLocatorInstances().GetSnapshot(); for (auto &instance : instances) { if (instance.download_object_symbol_file) { if (instance.download_object_symbol_file(module_spec, error, force_lookup, @@ -1211,7 +1213,7 @@ bool PluginManager::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, FileSpec PluginManager::FindSymbolFileInBundle(const FileSpec &symfile_bundle, const UUID *uuid, const ArchSpec *arch) { - auto &instances = GetSymbolLocatorInstances().GetInstances(); + auto instances = GetSymbolLocatorInstances().GetSnapshot(); for (auto &instance : instances) { if (instance.find_symbol_file_in_bundle) { std::optional result = @@ -1272,21 +1274,20 @@ PluginManager::GetTraceCreateCallback(llvm::StringRef plugin_name) { TraceCreateInstanceForLiveProcess PluginManager::GetTraceCreateCallbackForLiveProcess(llvm::StringRef plugin_name) { - for (const TraceInstance &instance : GetTracePluginInstances().GetInstances()) - if (instance.name == plugin_name) - return instance.create_callback_for_live_process; + if (auto 
instance = GetTracePluginInstances().GetInstanceForName(plugin_name)) + return instance->create_callback_for_live_process; + return nullptr; } llvm::StringRef PluginManager::GetTraceSchema(llvm::StringRef plugin_name) { - for (const TraceInstance &instance : GetTracePluginInstances().GetInstances()) - if (instance.name == plugin_name) - return instance.schema; + if (auto instance = GetTracePluginInstances().GetInstanceForName(plugin_name)) + return instance->schema; return llvm::StringRef(); } llvm::StringRef PluginManager::GetTraceSchema(size_t index) { - if (TraceInstance *instance = + if (const TraceInstance *instance = GetTracePluginInstances().GetInstanceAtIndex(index)) return instance->schema; return llvm::StringRef(); @@ -1335,7 +1336,7 @@ bool PluginManager::UnregisterPlugin( ThreadTraceExportCommandCreator PluginManager::GetThreadTraceExportCommandCreatorAtIndex(uint32_t index) { - if (TraceExporterInstance *instance = + if (const TraceExporterInstance *instance = GetTraceExporterInstances().GetInstanceAtIndex(index)) return instance->create_thread_trace_export_command; return nullptr; @@ -1438,9 +1439,9 @@ bool PluginManager::UnregisterPlugin( InstrumentationRuntimeGetType PluginManager::GetInstrumentationRuntimeGetTypeCallbackAtIndex(uint32_t idx) { - const auto &instances = GetInstrumentationRuntimeInstances().GetInstances(); - if (idx < instances.size()) - return instances[idx].get_type_callback; + if (auto instance = + GetInstrumentationRuntimeInstances().GetInstanceAtIndex(idx)) + return instance->get_type_callback; return nullptr; } @@ -1493,7 +1494,7 @@ PluginManager::GetTypeSystemCreateCallbackAtIndex(uint32_t idx) { } LanguageSet PluginManager::GetAllTypeSystemSupportedLanguagesForTypes() { - const auto &instances = GetTypeSystemInstances().GetInstances(); + const auto instances = GetTypeSystemInstances().GetSnapshot(); LanguageSet all; for (unsigned i = 0; i < instances.size(); ++i) all.bitvector |= instances[i].supported_languages_for_types.bitvector; @@ -1501,7 +1502,7 @@ LanguageSet PluginManager::GetAllTypeSystemSupportedLanguagesForTypes() { } LanguageSet PluginManager::GetAllTypeSystemSupportedLanguagesForExpressions() { - const auto &instances = GetTypeSystemInstances().GetInstances(); + const auto instances = GetTypeSystemInstances().GetSnapshot(); LanguageSet all; for (unsigned i = 0; i < instances.size(); ++i) all.bitvector |= instances[i].supported_languages_for_expressions.bitvector; @@ -1545,7 +1546,7 @@ bool PluginManager::UnregisterPlugin( } uint32_t PluginManager::GetNumScriptedInterfaces() { - return GetScriptedInterfaceInstances().GetInstances().size(); + return GetScriptedInterfaceInstances().GetSnapshot().size(); } llvm::StringRef PluginManager::GetScriptedInterfaceNameAtIndex(uint32_t index) { @@ -1559,17 +1560,16 @@ PluginManager::GetScriptedInterfaceDescriptionAtIndex(uint32_t index) { lldb::ScriptLanguage PluginManager::GetScriptedInterfaceLanguageAtIndex(uint32_t idx) { - const auto &instances = GetScriptedInterfaceInstances().GetInstances(); - return idx < instances.size() ? 
instances[idx].language - : ScriptLanguage::eScriptLanguageNone; + if (auto instance = GetScriptedInterfaceInstances().GetInstanceAtIndex(idx)) + return instance->language; + return ScriptLanguage::eScriptLanguageNone; } ScriptedInterfaceUsages PluginManager::GetScriptedInterfaceUsagesAtIndex(uint32_t idx) { - const auto &instances = GetScriptedInterfaceInstances().GetInstances(); - if (idx >= instances.size()) - return {}; - return instances[idx].usages; + if (auto instance = GetScriptedInterfaceInstances().GetInstanceAtIndex(idx)) + return instance->usages; + return {}; } #pragma mark REPL @@ -1606,13 +1606,13 @@ REPLCreateInstance PluginManager::GetREPLCreateCallbackAtIndex(uint32_t idx) { } LanguageSet PluginManager::GetREPLSupportedLanguagesAtIndex(uint32_t idx) { - const auto &instances = GetREPLInstances().GetInstances(); - return idx < instances.size() ? instances[idx].supported_languages - : LanguageSet(); + if (auto instance = GetREPLInstances().GetInstanceAtIndex(idx)) + return instance->supported_languages; + return LanguageSet(); } LanguageSet PluginManager::GetREPLAllTypeSystemSupportedLanguages() { - const auto &instances = GetREPLInstances().GetInstances(); + const auto instances = GetREPLInstances().GetSnapshot(); LanguageSet all; for (unsigned i = 0; i < instances.size(); ++i) all.bitvector |= instances[i].supported_languages.bitvector; From 514f984a8d28abf095df0a293294a40a2e811b8f Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 31 Mar 2025 09:55:07 -0700 Subject: [PATCH 0116/1029] [CIR][NFC] Fix warnings in ClangIR code (#133134) This fixes unused variable warnings that have crept into the ClangIR code. In some cases the variable will be needed later, but all unused variables are being removed here. They can be reintroduced when they are needed. 
--- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 3 +-- clang/lib/CIR/CodeGen/CIRGenFunction.h | 4 ---- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 20 ++----------------- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h | 7 ++----- 5 files changed, 6 insertions(+), 30 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 8fd09b4cfefeb..f01e03a89981d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -170,7 +170,7 @@ mlir::Value CIRGenFunction::evaluateExprAsBool(const Expr *e) { SourceLocation loc = e->getExprLoc(); assert(!cir::MissingFeatures::pgoUse()); - if (const MemberPointerType *MPT = e->getType()->getAs()) { + if (e->getType()->getAs()) { cgm.errorNYI(e->getSourceRange(), "evaluateExprAsBool: member pointer type"); return createDummyValue(getLoc(loc), boolTy); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 52bd3b2933744..2cf92dfbf3a5b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1366,8 +1366,7 @@ mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( const mlir::Location loc = cgf.getLoc(e->getSourceRange()); if (auto kind = e->getKind(); kind == UETT_SizeOf || kind == UETT_DataSizeOf) { - if (const VariableArrayType *variableArrTy = - cgf.getContext().getAsVariableArrayType(typeToSize)) { + if (cgf.getContext().getAsVariableArrayType(typeToSize)) { cgf.getCIRGenModule().errorNYI(e->getSourceRange(), "sizeof operator for VariableArrayType", e->getStmtClassName()); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 87d10ff4cd954..3b8171eea9ee0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -243,10 +243,6 @@ class CIRGenFunction : public CIRGenTypeCache { // class is upstreamed. CIRGenFunction &cgf; - // Block containing cleanup code for things initialized in this lexical - // context (scope). - mlir::Block *cleanupBlock = nullptr; - // Points to the scope entry block. This is useful, for instance, for // helping to insert allocas before finalizing any recursive CodeGen from // switches. 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 1c2b9ad05a132..b19be53947f99 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -126,21 +126,6 @@ static mlir::Value emitToMemory(mlir::ConversionPatternRewriter &rewriter, return value; } -static mlir::Value -emitCirAttrToMemory(mlir::Operation *parentOp, mlir::Attribute attr, - mlir::ConversionPatternRewriter &rewriter, - const mlir::TypeConverter *converter, - mlir::DataLayout const &dataLayout) { - - mlir::Value loweredValue = - lowerCirAttrAsValue(parentOp, attr, rewriter, converter); - if (auto boolAttr = mlir::dyn_cast(attr)) { - return emitToMemory(rewriter, dataLayout, boolAttr.getType(), loweredValue); - } - - return loweredValue; -} - mlir::LLVM::Linkage convertLinkage(cir::GlobalLinkageKind linkage) { using CIR = cir::GlobalLinkageKind; using LLVM = mlir::LLVM::Linkage; @@ -261,7 +246,7 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { mlir::Location loc = parentOp->getLoc(); mlir::Value result; - if (auto zeros = attr.getTrailingZerosNum()) { + if (attr.hasTrailingZeros()) { mlir::Type arrayTy = attr.getType(); result = rewriter.create( loc, converter->convertType(arrayTy)); @@ -1251,13 +1236,12 @@ void ConvertCIRToLLVMPass::runOnOperation() { patterns.add(converter, patterns.getContext(), dl); patterns.add(converter, patterns.getContext(), dl); patterns.add(converter, patterns.getContext(), dl); - patterns.add(converter, patterns.getContext(), - dl); patterns.add< // clang-format off CIRToLLVMBinOpLowering, CIRToLLVMBrCondOpLowering, CIRToLLVMBrOpLowering, + CIRToLLVMConstantOpLowering, CIRToLLVMFuncOpLowering, CIRToLLVMTrapOpLowering, CIRToLLVMUnaryOpLowering diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index ef0bb2deaccdf..b2926e75d1303 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -113,13 +113,10 @@ class CIRToLLVMStoreOpLowering class CIRToLLVMConstantOpLowering : public mlir::OpConversionPattern { - mlir::DataLayout const &dataLayout; - public: CIRToLLVMConstantOpLowering(const mlir::TypeConverter &typeConverter, - mlir::MLIRContext *context, - mlir::DataLayout const &dataLayout) - : OpConversionPattern(typeConverter, context), dataLayout(dataLayout) { + mlir::MLIRContext *context) + : OpConversionPattern(typeConverter, context) { setHasBoundedRewriteRecursion(); } From dcc2faecd8aebc64eb541aebe0005ecceffef558 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Mon, 31 Mar 2025 10:05:59 -0700 Subject: [PATCH 0117/1029] [HLSL] Fix codegen to support classes in `cbuffer` (#132828) Fixes #132309 --- clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp | 26 ++++++------ clang/test/CodeGenHLSL/cbuffer.hlsl | 41 +++++++++++++++++-- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp index e0f5b0f59ef40..b546b6dd574ff 100644 --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp @@ -52,11 +52,11 @@ static unsigned getScalarOrVectorSizeInBytes(llvm::Type *Ty) { namespace clang { namespace CodeGen { -// Creates a layout type for given struct with HLSL constant buffer layout -// taking into account PackOffsets, if provided. 
+// Creates a layout type for given struct or class with HLSL constant buffer +// layout taking into account PackOffsets, if provided. // Previously created layout types are cached by CGHLSLRuntime. // -// The function iterates over all fields of the StructType (including base +// The function iterates over all fields of the record type (including base // classes) and calls layoutField to converts each field to its corresponding // LLVM type and to calculate its HLSL constant buffer layout. Any embedded // structs (or arrays of structs) are converted to target layout types as well. @@ -67,12 +67,11 @@ namespace CodeGen { // -1 value instead. These elements must be placed at the end of the layout // after all of the elements with specific offset. llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( - const RecordType *StructType, - const llvm::SmallVector *PackOffsets) { + const RecordType *RT, const llvm::SmallVector *PackOffsets) { // check if we already have the layout type for this struct if (llvm::TargetExtType *Ty = - CGM.getHLSLRuntime().getHLSLBufferLayoutType(StructType)) + CGM.getHLSLRuntime().getHLSLBufferLayoutType(RT)) return Ty; SmallVector Layout; @@ -87,7 +86,7 @@ llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( // iterate over all fields of the record, including fields on base classes llvm::SmallVector RecordTypes; - RecordTypes.push_back(StructType); + RecordTypes.push_back(RT); while (RecordTypes.back()->getAsCXXRecordDecl()->getNumBases()) { CXXRecordDecl *D = RecordTypes.back()->getAsCXXRecordDecl(); assert(D->getNumBases() == 1 && @@ -148,7 +147,7 @@ llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( // create the layout struct type; anonymous struct have empty name but // non-empty qualified name - const CXXRecordDecl *Decl = StructType->getAsCXXRecordDecl(); + const CXXRecordDecl *Decl = RT->getAsCXXRecordDecl(); std::string Name = Decl->getName().empty() ? "anon" : Decl->getQualifiedNameAsString(); llvm::StructType *StructTy = @@ -158,7 +157,7 @@ llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType( llvm::TargetExtType *NewLayoutTy = llvm::TargetExtType::get( CGM.getLLVMContext(), LayoutTypeName, {StructTy}, Layout); if (NewLayoutTy) - CGM.getHLSLRuntime().addHLSLBufferLayoutType(StructType, NewLayoutTy); + CGM.getHLSLRuntime().addHLSLBufferLayoutType(RT, NewLayoutTy); return NewLayoutTy; } @@ -202,9 +201,9 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, } // For array of structures, create a new array with a layout type // instead of the structure type. - if (Ty->isStructureType()) { + if (Ty->isStructureOrClassType()) { llvm::Type *NewTy = - cast(createLayoutType(Ty->getAsStructureType())); + cast(createLayoutType(Ty->getAs())); if (!NewTy) return false; assert(isa(NewTy) && "expected target type"); @@ -220,9 +219,10 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, ArrayStride = llvm::alignTo(ElemSize, CBufferRowSizeInBytes); ElemOffset = (Packoffset != -1) ? 
Packoffset : NextRowOffset; - } else if (FieldTy->isStructureType()) { + } else if (FieldTy->isStructureOrClassType()) { // Create a layout type for the structure - ElemLayoutTy = createLayoutType(FieldTy->getAsStructureType()); + ElemLayoutTy = + createLayoutType(cast(FieldTy->getAs())); if (!ElemLayoutTy) return false; assert(isa(ElemLayoutTy) && "expected target type"); diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl b/clang/test/CodeGenHLSL/cbuffer.hlsl index 98948ea6811e3..db06cea808b62 100644 --- a/clang/test/CodeGenHLSL/cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer.hlsl @@ -13,6 +13,12 @@ // CHECK: %C = type <{ i32, target("dx.Layout", %A, 8, 0) }> // CHECK: %__cblayout_D = type <{ [2 x [3 x target("dx.Layout", %B, 14, 0, 8)]] }> +// CHECK: %__cblayout_CBClasses = type <{ target("dx.Layout", %K, 4, 0), target("dx.Layout", %L, 8, 0, 4), +// CHECK-SAME: target("dx.Layout", %M, 68, 0), [10 x target("dx.Layout", %K, 4, 0)] }> +// CHECK: %K = type <{ float }> +// CHECK: %L = type <{ float, float }> +// CHECK: %M = type <{ [5 x target("dx.Layout", %K, 4, 0)] }> + // CHECK: %__cblayout_CBMix = type <{ [2 x target("dx.Layout", %Test, 8, 0, 4)], float, [3 x [2 x <2 x float>]], float, // CHECK-SAME: target("dx.Layout", %anon, 4, 0), double, target("dx.Layout", %anon.0, 8, 0), float, <1 x double>, i16 }> @@ -133,6 +139,33 @@ cbuffer CBStructs { uint16_t3 f; }; + +class K { + float i; +}; + +class L : K { + float j; +}; + +class M { + K array[5]; +}; + +cbuffer CBClasses { + K k; + L l; + M m; + K ka[10]; +}; + +// CHECK: @CBClasses.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBClasses, +// CHECK-SAME: 260, 0, 16, 32, 112)) +// CHECK: @k = external addrspace(2) global target("dx.Layout", %K, 4, 0), align 4 +// CHECK: @l = external addrspace(2) global target("dx.Layout", %L, 8, 0, 4), align 4 +// CHECK: @m = external addrspace(2) global target("dx.Layout", %M, 68, 0), align 4 +// CHECK: @ka = external addrspace(2) global [10 x target("dx.Layout", %K, 4, 0)], align 4 + struct Test { float a, b; }; @@ -237,7 +270,7 @@ RWBuffer Buf; [numthreads(4,1,1)] void main() { - Buf[0] = a1 + b1.z + c1[2] + a.f1.y + f1 + B1[0].x + B10.z + D1.B2; + Buf[0] = a1 + b1.z + c1[2] + a.f1.y + f1 + B1[0].x + ka[2].i + B10.z + D1.B2; } // CHECK: define internal void @_GLOBAL__sub_I_cbuffer.hlsl() @@ -245,8 +278,8 @@ void main() { // CHECK-NEXT: call void @_init_resource_CBScalars.cb() // CHECK-NEXT: call void @_init_resource_CBArrays.cb() -// CHECK: !hlsl.cbs = !{![[CBSCALARS:[0-9]+]], ![[CBVECTORS:[0-9]+]], ![[CBARRAYS:[0-9]+]], ![[CBSTRUCTS:[0-9]+]], ![[CBMIX:[0-9]+]], -// CHECK-SAME: ![[CB_A:[0-9]+]], ![[CB_B:[0-9]+]], ![[CB_C:[0-9]+]]} +// CHECK: !hlsl.cbs = !{![[CBSCALARS:[0-9]+]], ![[CBVECTORS:[0-9]+]], ![[CBARRAYS:[0-9]+]], ![[CBSTRUCTS:[0-9]+]], ![[CBCLASSES:[0-9]+]], +// CHECK-SAME: ![[CBMIX:[0-9]+]], ![[CB_A:[0-9]+]], ![[CB_B:[0-9]+]], ![[CB_C:[0-9]+]]} // CHECK: ![[CBSCALARS]] = !{ptr @CBScalars.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4, // CHECK-SAME: ptr addrspace(2) @a5, ptr addrspace(2) @a6, ptr addrspace(2) @a7, ptr addrspace(2) @a8} @@ -260,6 +293,8 @@ void main() { // CHECK: ![[CBSTRUCTS]] = !{ptr @CBStructs.cb, ptr addrspace(2) @a, ptr addrspace(2) @b, ptr addrspace(2) @c, ptr addrspace(2) @array_of_A, // CHECK-SAME: ptr addrspace(2) @d, ptr addrspace(2) @e, ptr addrspace(2) @f} +// CHECK: ![[CBCLASSES]] = !{ptr @CBClasses.cb, ptr addrspace(2) @k, ptr addrspace(2) @l, ptr addrspace(2) @m, ptr addrspace(2) @ka} + // CHECK: 
![[CBMIX]] = !{ptr @CBMix.cb, ptr addrspace(2) @test, ptr addrspace(2) @f1, ptr addrspace(2) @f2, ptr addrspace(2) @f3, // CHECK-SAME: ptr addrspace(2) @f4, ptr addrspace(2) @f5, ptr addrspace(2) @f6, ptr addrspace(2) @f7, ptr addrspace(2) @f8, ptr addrspace(2) @f9} From 42d1a1cb849195304dc93004f5808ef82904cd89 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Mon, 31 Mar 2025 10:20:40 -0700 Subject: [PATCH 0118/1029] [WebAssembly] Fix signatures of frexpf family of libcalls (#133289) Fixes: https://github.com/emscripten-core/emscripten/issues/23997 --- .../WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 94d80f19e48af..ce795d3dedc6a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -278,9 +278,9 @@ struct RuntimeLibcallSignatureTable { Table[RTLIB::LDEXP_F32] = f32_func_f32_i32; Table[RTLIB::LDEXP_F64] = f64_func_f64_i32; Table[RTLIB::LDEXP_F128] = i64_i64_func_i64_i64_i32; - Table[RTLIB::FREXP_F32] = f32_func_f32_i32; - Table[RTLIB::FREXP_F64] = f64_func_f64_i32; - Table[RTLIB::FREXP_F128] = i64_i64_func_i64_i64_i32; + Table[RTLIB::FREXP_F32] = f32_func_f32_iPTR; + Table[RTLIB::FREXP_F64] = f64_func_f64_iPTR; + Table[RTLIB::FREXP_F128] = i64_i64_func_i64_i64_iPTR; Table[RTLIB::MODF_F32] = f32_func_f32_iPTR; Table[RTLIB::MODF_F64] = f64_func_f64_iPTR; Table[RTLIB::MODF_F128] = i64_i64_func_i64_i64_iPTR; From 8b06da16827829c9b142f591303d3c66aab926b9 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 31 Mar 2025 10:24:30 -0700 Subject: [PATCH 0119/1029] [mlir][memref] Improve runtime verification for `memref.subview` (#132545) This commit addresses a TODO in the runtime verification of `memref.subview`. Each dimension is now verified: the offset must be in-bounds and the slice must not run out-of-bounds. This commit aligns runtime verification with static op verification (which was improved in #133086). --- .../Transforms/RuntimeOpVerification.cpp | 73 ++++++++++--------- .../MemRef/subview-runtime-verification.mlir | 42 +++++++---- 2 files changed, 66 insertions(+), 49 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp index 134e8b5efcfdf..4537977226087 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp @@ -327,47 +327,52 @@ struct ReinterpretCastOpInterface } }; -/// Verifies that the linear bounds of a subview op are within the linear bounds -/// of the base memref: low >= baseLow && high <= baseHigh -/// TODO: This is not yet a full runtime verification of subview. For example, -/// consider: -/// %m = memref.alloc(%c10, %c10) : memref<10x10xf32> -/// memref.subview %m[%c0, %c0][%c20, %c2][%c1, %c1] -/// : memref to memref -/// The subview is in-bounds of the entire base memref but the first dimension -/// is out-of-bounds. Future work would verify the bounds on a per-dimension -/// basis. 
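+/// Verifies, for each dimension of a subview, that the offset and the last
+/// accessed position, offset + (size - 1) * stride, both lie within the
+/// bounds of the corresponding source dimension.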
struct SubViewOpInterface : public RuntimeVerifiableOpInterface::ExternalModel<SubViewOpInterface, SubViewOp> { void generateRuntimeVerification(Operation *op, OpBuilder &builder, Location loc) const { auto subView = cast<SubViewOp>(op); - auto baseMemref = cast<TypedValue<BaseMemRefType>>(subView.getSource()); - auto resultMemref = cast<TypedValue<BaseMemRefType>>(subView.getResult()); + MemRefType sourceType = subView.getSource().getType(); - builder.setInsertionPointAfter(op); - - // Compute the linear bounds of the base memref - auto [baseLow, baseHigh] = computeLinearBounds(builder, loc, baseMemref); - - // Compute the linear bounds of the resulting memref - auto [low, high] = computeLinearBounds(builder, loc, resultMemref); - - // Check low >= baseLow - auto geLow = builder.createOrFold<arith::CmpIOp>( - loc, arith::CmpIPredicate::sge, low, baseLow); - - // Check high <= baseHigh - auto leHigh = builder.createOrFold<arith::CmpIOp>( - loc, arith::CmpIPredicate::sle, high, baseHigh); - - auto assertCond = builder.createOrFold<arith::AndIOp>(loc, geLow, leHigh); - - builder.create<cf::AssertOp>( - loc, assertCond, - RuntimeVerifiableOpInterface::generateErrorMessage( - op, "subview is out-of-bounds of the base memref")); + // For each dimension, assert that: + // 0 <= offset < dim_size + // 0 <= offset + (size - 1) * stride < dim_size + Value zero = builder.create<arith::ConstantIndexOp>(loc, 0); + Value one = builder.create<arith::ConstantIndexOp>(loc, 1); + auto metadataOp = + builder.create<ExtractStridedMetadataOp>(loc, subView.getSource()); + for (int64_t i = 0, e = sourceType.getRank(); i < e; ++i) { + Value offset = getValueOrCreateConstantIndexOp( + builder, loc, subView.getMixedOffsets()[i]); + Value size = getValueOrCreateConstantIndexOp(builder, loc, + subView.getMixedSizes()[i]); + Value stride = getValueOrCreateConstantIndexOp( + builder, loc, subView.getMixedStrides()[i]); + + // Verify that offset is in-bounds. + Value dimSize = metadataOp.getSizes()[i]; + Value offsetInBounds = + generateInBoundsCheck(builder, loc, offset, zero, dimSize); + builder.create<cf::AssertOp>( + loc, offsetInBounds, + RuntimeVerifiableOpInterface::generateErrorMessage( + op, "offset " + std::to_string(i) + " is out-of-bounds")); + + // Verify that slice does not run out-of-bounds.
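+ // The last position accessed along this dimension is
+ // offset + (size - 1) * stride; it must also lie within [0, dim_size).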
+ Value sizeMinusOne = builder.create<arith::SubIOp>(loc, size, one); + Value sizeMinusOneTimesStride = + builder.create<arith::MulIOp>(loc, sizeMinusOne, stride); + Value lastPos = + builder.create<arith::AddIOp>(loc, offset, sizeMinusOneTimesStride); + Value lastPosInBounds = + generateInBoundsCheck(builder, loc, lastPos, zero, dimSize); + builder.create<cf::AssertOp>( + loc, lastPosInBounds, + RuntimeVerifiableOpInterface::generateErrorMessage( + op, "subview runs out-of-bounds along dimension " + + std::to_string(i))); + } } }; diff --git a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir index 3cac37a082c30..ec7e4085f2fa5 100644 --- a/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir +++ b/mlir/test/Integration/Dialect/MemRef/subview-runtime-verification.mlir @@ -39,38 +39,50 @@ func.func @main() { %alloca_4 = memref.alloca() : memref<4x4xf32> %alloca_4_dyn = memref.cast %alloca_4 : memref<4x4xf32> to memref - // Offset is out-of-bounds and slice runs out-of-bounds // CHECK: ERROR: Runtime op verification failed - // CHECK-NEXT: "memref.subview" - // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref, index, index, index) -> memref> + // CHECK-NEXT: ^ offset 0 is out-of-bounds + // CHECK-NEXT: Location: loc({{.*}}) + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref, index, index, index) -> memref> + // CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0 + // CHECK-NEXT: Location: loc({{.*}}) func.call @subview_dynamic_rank_reduce(%alloca_4_dyn, %5, %5, %1) : (memref, index, index, index) -> () - // Offset is out-of-bounds + // Offset is out-of-bounds and slice runs out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>> + // CHECK-NEXT: ^ offset 0 is out-of-bounds + // CHECK-NEXT: Location: loc({{.*}}) // CHECK: ERROR: Runtime op verification failed - // CHECK-NEXT: "memref.subview" - // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>> + // CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0 + // CHECK-NEXT: Location: loc({{.*}}) func.call @subview(%alloca, %1) : (memref<1xf32>, index) -> () - // Offset is out-of-bounds + // Offset is out-of-bounds and slice runs out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>> + // CHECK-NEXT: ^ offset 0 is out-of-bounds + // CHECK-NEXT: Location: loc({{.*}}) // CHECK: ERROR: Runtime op verification failed //
CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref<1xf32>, index) -> memref<1xf32, strided<[1], offset: ?>> + // CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0 // CHECK-NEXT: Location: loc({{.*}}) func.call @subview(%alloca, %n1) : (memref<1xf32>, index) -> () - // Size is out-of-bounds + // Slice runs out-of-bounds due to size // CHECK: ERROR: Runtime op verification failed - // CHECK-NEXT: "memref.subview" - // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref, index, index, index) -> memref> + // CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0 // CHECK-NEXT: Location: loc({{.*}}) func.call @subview_dynamic(%alloca_4_dyn, %0, %5, %1) : (memref, index, index, index) -> () - // Stride is out-of-bounds + // Slice runs out-of-bounds due to stride // CHECK: ERROR: Runtime op verification failed - // CHECK-NEXT: "memref.subview" - // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: "memref.subview"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) <{operandSegmentSizes = array, static_offsets = array, static_sizes = array, static_strides = array}> : (memref, index, index, index) -> memref> + // CHECK-NEXT: ^ subview runs out-of-bounds along dimension 0 // CHECK-NEXT: Location: loc({{.*}}) func.call @subview_dynamic(%alloca_4_dyn, %0, %4, %4) : (memref, index, index, index) -> () From e4b9486056fab7a262fdafbe70acf393c9767d12 Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Mon, 31 Mar 2025 10:26:51 -0700 Subject: [PATCH 0120/1029] [HLSL][RootSignature] Implement parsing of a DescriptorTable with empty clauses (#133302) - defines the Parser class and an initial set of helper methods to support consuming tokens. functionality is demonstrated through a simple empty descriptor table test case - defines an initial in-memory representation of a DescriptorTable - implements a test harness that will be used to validate the correct diagnostics are generated. 
it will construct a dummy preprocessor with a diagnostics consumer to do so Implements the first part of https://github.com/llvm/llvm-project/issues/126569 --- .../clang/Basic/DiagnosticParseKinds.td | 4 + .../clang/Lex/HLSLRootSignatureTokenKinds.def | 23 +- .../include/clang/Lex/LexHLSLRootSignature.h | 15 +- .../clang/Parse/ParseHLSLRootSignature.h | 107 ++++++++ clang/lib/Parse/CMakeLists.txt | 1 + clang/lib/Parse/ParseHLSLRootSignature.cpp | 166 ++++++++++++ clang/unittests/CMakeLists.txt | 1 + .../Lex/LexHLSLRootSignatureTest.cpp | 4 +- clang/unittests/Parse/CMakeLists.txt | 23 ++ .../Parse/ParseHLSLRootSignatureTest.cpp | 245 ++++++++++++++++++ .../llvm/Frontend/HLSL/HLSLRootSignature.h | 44 ++++ 11 files changed, 620 insertions(+), 13 deletions(-) create mode 100644 clang/include/clang/Parse/ParseHLSLRootSignature.h create mode 100644 clang/lib/Parse/ParseHLSLRootSignature.cpp create mode 100644 clang/unittests/Parse/CMakeLists.txt create mode 100644 clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp create mode 100644 llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 86c361b4dbcf7..2582e1e5ef0f6 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1830,4 +1830,8 @@ def err_hlsl_virtual_function def err_hlsl_virtual_inheritance : Error<"virtual inheritance is unsupported in HLSL">; +// HLSL Root Signature diagnostic messages +def err_hlsl_unexpected_end_of_params + : Error<"expected %0 to denote end of parameters, or, another valid parameter of %1">; + } // end of Parser diagnostics diff --git a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def index e6df763920430..c514d3456146a 100644 --- a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def +++ b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def @@ -14,16 +14,16 @@ //===----------------------------------------------------------------------===// #ifndef TOK -#define TOK(X) +#define TOK(X, SPELLING) #endif #ifndef PUNCTUATOR -#define PUNCTUATOR(X,Y) TOK(pu_ ## X) +#define PUNCTUATOR(X,Y) TOK(pu_ ## X, Y) #endif #ifndef KEYWORD -#define KEYWORD(X) TOK(kw_ ## X) +#define KEYWORD(X) TOK(kw_ ## X, #X) #endif #ifndef ENUM -#define ENUM(NAME, LIT) TOK(en_ ## NAME) +#define ENUM(NAME, LIT) TOK(en_ ## NAME, LIT) #endif // Defines the various types of enum @@ -49,15 +49,15 @@ #endif // General Tokens: -TOK(invalid) -TOK(end_of_stream) -TOK(int_literal) +TOK(invalid, "invalid identifier") +TOK(end_of_stream, "end of stream") +TOK(int_literal, "integer literal") // Register Tokens: -TOK(bReg) -TOK(tReg) -TOK(uReg) -TOK(sReg) +TOK(bReg, "b register") +TOK(tReg, "t register") +TOK(uReg, "u register") +TOK(sReg, "s register") // Punctuators: PUNCTUATOR(l_paren, '(') @@ -69,6 +69,7 @@ PUNCTUATOR(minus, '-') // RootElement Keywords: +KEYWORD(RootSignature) // used only for diagnostic messaging KEYWORD(DescriptorTable) // DescriptorTable Keywords: diff --git a/clang/include/clang/Lex/LexHLSLRootSignature.h b/clang/include/clang/Lex/LexHLSLRootSignature.h index 21c44e0351d9e..a7e1f782b767f 100644 --- a/clang/include/clang/Lex/LexHLSLRootSignature.h +++ b/clang/include/clang/Lex/LexHLSLRootSignature.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_LEX_LEXHLSLROOTSIGNATURE_H #define LLVM_CLANG_LEX_LEXHLSLROOTSIGNATURE_H +#include "clang/Basic/Diagnostic.h" #include
"clang/Basic/SourceLocation.h" #include "llvm/ADT/SmallVector.h" @@ -24,7 +25,7 @@ namespace hlsl { struct RootSignatureToken { enum Kind { -#define TOK(X) X, +#define TOK(X, SPELLING) X, #include "clang/Lex/HLSLRootSignatureTokenKinds.def" }; @@ -43,6 +44,18 @@ struct RootSignatureToken { }; using TokenKind = enum RootSignatureToken::Kind; +inline const DiagnosticBuilder &operator<<(const DiagnosticBuilder &DB, + const TokenKind Kind) { + switch (Kind) { +#define TOK(X, SPELLING) \ + case TokenKind::X: \ + DB << SPELLING; \ + break; +#include "clang/Lex/HLSLRootSignatureTokenKinds.def" + } + return DB; +} + class RootSignatureLexer { public: RootSignatureLexer(StringRef Signature, clang::SourceLocation SourceLoc) diff --git a/clang/include/clang/Parse/ParseHLSLRootSignature.h b/clang/include/clang/Parse/ParseHLSLRootSignature.h new file mode 100644 index 0000000000000..43b41315b88b5 --- /dev/null +++ b/clang/include/clang/Parse/ParseHLSLRootSignature.h @@ -0,0 +1,107 @@ +//===--- ParseHLSLRootSignature.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the RootSignatureParser interface. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H +#define LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H + +#include "clang/Basic/DiagnosticParse.h" +#include "clang/Lex/LexHLSLRootSignature.h" +#include "clang/Lex/Preprocessor.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" + +#include "llvm/Frontend/HLSL/HLSLRootSignature.h" + +namespace clang { +namespace hlsl { + +class RootSignatureParser { +public: + RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, clang::Preprocessor &PP); + + /// Consumes tokens from the Lexer and constructs the in-memory + /// representations of the RootElements. Tokens are consumed until an + /// error is encountered or the end of the buffer. + /// + /// Returns true if a parsing error is encountered. + bool parse(); + +private: + DiagnosticsEngine &getDiags() { return PP.getDiagnostics(); } + + // All private Parse.* methods follow a similar pattern: + // - Each method will start with an assert to denote what the CurToken is + // expected to be and will parse from that token forward + // + // - Therefore, it is the callers responsibility to ensure that you are + // at the correct CurToken. This should be done with the pattern of: + // + // if (TryConsumeExpectedToken(TokenKind)) + // if (Parse.*()) + // return true; + // + // or, + // + // if (ConsumeExpectedToken(TokenKind, ...)) + // return true; + // if (Parse.*()) + // return true; + // + // - All methods return true if a parsing error is encountered. 
It is the + // caller's responsibility to propagate this error up, or deal with it + // otherwise + // + // - An error will be raised if the following tokens are not what is + // expected, or, there is a lexing error + + /// Root Element parse methods: + bool parseDescriptorTable(); + bool parseDescriptorTableClause(); + + /// Invoke the Lexer to consume a token and update CurToken with the result + void consumeNextToken() { CurToken = Lexer.ConsumeToken(); } + + /// Return true if the next token is one of the expected kinds + bool peekExpectedToken(TokenKind Expected); + bool peekExpectedToken(ArrayRef<TokenKind> AnyExpected); + + /// Consumes the next token and reports an error if it is not of the expected + /// kind. + /// + /// Returns true if there was an error reported. + bool consumeExpectedToken(TokenKind Expected, + unsigned DiagID = diag::err_expected, + TokenKind Context = TokenKind::invalid); + + /// Peek if the next token is of the expected kind and if it is then consume + /// it. + /// + /// Returns true if it successfully matches the expected kind and the token + /// was consumed. + bool tryConsumeExpectedToken(TokenKind Expected); + bool tryConsumeExpectedToken(ArrayRef<TokenKind> Expected); + +private: + SmallVector<RootElement> &Elements; + RootSignatureLexer &Lexer; + + clang::Preprocessor &PP; + + RootSignatureToken CurToken; +}; + +} // namespace hlsl +} // namespace clang + +#endif // LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H diff --git a/clang/lib/Parse/CMakeLists.txt b/clang/lib/Parse/CMakeLists.txt index 22e902f7e1bc5..00fde537bb9c6 100644 --- a/clang/lib/Parse/CMakeLists.txt +++ b/clang/lib/Parse/CMakeLists.txt @@ -14,6 +14,7 @@ add_clang_library(clangParse ParseExpr.cpp ParseExprCXX.cpp ParseHLSL.cpp + ParseHLSLRootSignature.cpp ParseInit.cpp ParseObjc.cpp ParseOpenMP.cpp diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp new file mode 100644 index 0000000000000..33caca5fa1c82 --- /dev/null +++ b/clang/lib/Parse/ParseHLSLRootSignature.cpp @@ -0,0 +1,166 @@ +//=== ParseHLSLRootSignature.cpp - Parse Root Signature -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Parse/ParseHLSLRootSignature.h" + +#include "llvm/Support/raw_ostream.h" + +using namespace llvm::hlsl::rootsig; + +namespace clang { +namespace hlsl { + +RootSignatureParser::RootSignatureParser(SmallVector<RootElement> &Elements, + RootSignatureLexer &Lexer, + Preprocessor &PP) + : Elements(Elements), Lexer(Lexer), PP(PP), CurToken(SourceLocation()) {} + +bool RootSignatureParser::parse() { + // Iterate as many RootElements as possible + while (tryConsumeExpectedToken(TokenKind::kw_DescriptorTable)) { + // Dispatch onto parser method.
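+ // Currently only DescriptorTable is handled; additional root elements
+ // are expected to be dispatched from this switch as they are implemented.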
+ // We guard against the unreachable here as we just ensured that CurToken + // will be one of the kinds in the while condition + switch (CurToken.Kind) { + case TokenKind::kw_DescriptorTable: + if (parseDescriptorTable()) + return true; + break; + default: + llvm_unreachable("Switch for consumed token was not provided"); + } + + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } + + if (!tryConsumeExpectedToken(TokenKind::end_of_stream)) { + getDiags().Report(CurToken.TokLoc, diag::err_hlsl_unexpected_end_of_params) + << /*expected=*/TokenKind::end_of_stream + << /*param of=*/TokenKind::kw_RootSignature; + return true; + } + return false; +} + +bool RootSignatureParser::parseDescriptorTable() { + assert(CurToken.Kind == TokenKind::kw_DescriptorTable && + "Expects to only be invoked starting at given keyword"); + + DescriptorTable Table; + + if (consumeExpectedToken(TokenKind::pu_l_paren, diag::err_expected_after, + CurToken.Kind)) + return true; + + // Iterate as many Clauses as possible + while (tryConsumeExpectedToken({TokenKind::kw_CBV, TokenKind::kw_SRV, + TokenKind::kw_UAV, TokenKind::kw_Sampler})) { + if (parseDescriptorTableClause()) + return true; + + Table.NumClauses++; + + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } + + if (!tryConsumeExpectedToken(TokenKind::pu_r_paren)) { + getDiags().Report(CurToken.TokLoc, diag::err_hlsl_unexpected_end_of_params) + << /*expected=*/TokenKind::pu_r_paren + << /*param of=*/TokenKind::kw_DescriptorTable; + return true; + } + + Elements.push_back(Table); + return false; +} + +bool RootSignatureParser::parseDescriptorTableClause() { + assert((CurToken.Kind == TokenKind::kw_CBV || + CurToken.Kind == TokenKind::kw_SRV || + CurToken.Kind == TokenKind::kw_UAV || + CurToken.Kind == TokenKind::kw_Sampler) && + "Expects to only be invoked starting at given keyword"); + + DescriptorTableClause Clause; + switch (CurToken.Kind) { + default: + llvm_unreachable("Switch for consumed token was not provided"); + case TokenKind::kw_CBV: + Clause.Type = ClauseType::CBuffer; + break; + case TokenKind::kw_SRV: + Clause.Type = ClauseType::SRV; + break; + case TokenKind::kw_UAV: + Clause.Type = ClauseType::UAV; + break; + case TokenKind::kw_Sampler: + Clause.Type = ClauseType::Sampler; + break; + } + + if (consumeExpectedToken(TokenKind::pu_l_paren, diag::err_expected_after, + CurToken.Kind)) + return true; + + if (consumeExpectedToken(TokenKind::pu_r_paren, diag::err_expected_after, + CurToken.Kind)) + return true; + + Elements.push_back(Clause); + return false; +} + +bool RootSignatureParser::peekExpectedToken(TokenKind Expected) { + return peekExpectedToken(ArrayRef{Expected}); +} + +bool RootSignatureParser::peekExpectedToken(ArrayRef AnyExpected) { + RootSignatureToken Result = Lexer.PeekNextToken(); + return llvm::is_contained(AnyExpected, Result.Kind); +} + +bool RootSignatureParser::consumeExpectedToken(TokenKind Expected, + unsigned DiagID, + TokenKind Context) { + if (tryConsumeExpectedToken(Expected)) + return false; + + // Report unexpected token kind error + DiagnosticBuilder DB = getDiags().Report(CurToken.TokLoc, DiagID); + switch (DiagID) { + case diag::err_expected: + DB << Expected; + break; + case diag::err_expected_either: + case diag::err_expected_after: + DB << Expected << Context; + break; + default: + break; + } + return true; +} + +bool RootSignatureParser::tryConsumeExpectedToken(TokenKind Expected) { + return tryConsumeExpectedToken(ArrayRef{Expected}); +} + +bool 
RootSignatureParser::tryConsumeExpectedToken( + ArrayRef AnyExpected) { + // If not the expected token just return + if (!peekExpectedToken(AnyExpected)) + return false; + consumeNextToken(); + return true; +} + +} // namespace hlsl +} // namespace clang diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index 85d265426ec80..9b3ce8aa7de73 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -25,6 +25,7 @@ endfunction() add_subdirectory(Basic) add_subdirectory(Lex) +add_subdirectory(Parse) add_subdirectory(Driver) if(CLANG_ENABLE_STATIC_ANALYZER) add_subdirectory(Analysis) diff --git a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp index d72a842922f98..04af01ef97dea 100644 --- a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp +++ b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp @@ -85,6 +85,8 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { (),|=+- + RootSignature + DescriptorTable CBV SRV UAV Sampler @@ -113,7 +115,7 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { SmallVector Tokens; SmallVector Expected = { -#define TOK(NAME) hlsl::TokenKind::NAME, +#define TOK(NAME, SPELLING) hlsl::TokenKind::NAME, #include "clang/Lex/HLSLRootSignatureTokenKinds.def" }; diff --git a/clang/unittests/Parse/CMakeLists.txt b/clang/unittests/Parse/CMakeLists.txt new file mode 100644 index 0000000000000..eeb58174568cd --- /dev/null +++ b/clang/unittests/Parse/CMakeLists.txt @@ -0,0 +1,23 @@ +set(LLVM_LINK_COMPONENTS + Support + ) +add_clang_unittest(ParseTests + ParseHLSLRootSignatureTest.cpp + ) +clang_target_link_libraries(ParseTests + PRIVATE + clangAST + clangASTMatchers + clangBasic + clangFrontend + clangParse + clangSema + clangSerialization + clangTooling + ) +target_link_libraries(ParseTests + PRIVATE + LLVMTestingAnnotations + LLVMTestingSupport + clangTesting + ) diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp new file mode 100644 index 0000000000000..acdf455a5d6aa --- /dev/null +++ b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp @@ -0,0 +1,245 @@ +//=== ParseHLSLRootSignatureTest.cpp - Parse Root Signature tests ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/HeaderSearchOptions.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/ModuleLoader.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/PreprocessorOptions.h" + +#include "clang/Lex/LexHLSLRootSignature.h" +#include "clang/Parse/ParseHLSLRootSignature.h" +#include "gtest/gtest.h" + +using namespace clang; +using namespace llvm::hlsl::rootsig; + +namespace { + +// Diagnostic helper for helper tests +class ExpectedDiagConsumer : public DiagnosticConsumer { + virtual void anchor() {} + + void HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, + const Diagnostic &Info) override { + if (!FirstDiag || !ExpectedDiagID.has_value()) { + Satisfied = false; + return; + } + FirstDiag = false; + + Satisfied = ExpectedDiagID.value() == Info.getID(); + } + + bool FirstDiag = true; + bool Satisfied = false; + std::optional ExpectedDiagID; + +public: + void setNoDiag() { + Satisfied = true; + ExpectedDiagID = std::nullopt; + } + + void setExpected(unsigned DiagID) { + Satisfied = false; + ExpectedDiagID = DiagID; + } + + bool isSatisfied() { return Satisfied; } +}; + +// The test fixture. +class ParseHLSLRootSignatureTest : public ::testing::Test { +protected: + ParseHLSLRootSignatureTest() + : FileMgr(FileMgrOpts), DiagID(new DiagnosticIDs()), + Consumer(new ExpectedDiagConsumer()), + Diags(DiagID, new DiagnosticOptions, Consumer), + SourceMgr(Diags, FileMgr), TargetOpts(new TargetOptions) { + // This is an arbitrarily chosen target triple to create the target info. 
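+    // Any triple should do here; "dxil" is a natural fit since the root
+    // signature grammar comes from HLSL.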
+ TargetOpts->Triple = "dxil"; + Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts); + } + + std::unique_ptr createPP(StringRef Source, + TrivialModuleLoader &ModLoader) { + std::unique_ptr Buf = + llvm::MemoryBuffer::getMemBuffer(Source); + SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); + + HeaderSearchOptions SearchOpts; + HeaderSearch HeaderInfo(SearchOpts, SourceMgr, Diags, LangOpts, + Target.get()); + std::unique_ptr PP = std::make_unique( + std::make_shared(), Diags, LangOpts, SourceMgr, + HeaderInfo, ModLoader, + /*IILookup =*/nullptr, + /*OwnsHeaderSearch =*/false); + PP->Initialize(*Target); + PP->EnterMainSourceFile(); + return PP; + } + + FileSystemOptions FileMgrOpts; + FileManager FileMgr; + IntrusiveRefCntPtr DiagID; + ExpectedDiagConsumer *Consumer; + DiagnosticsEngine Diags; + SourceManager SourceMgr; + LangOptions LangOpts; + std::shared_ptr TargetOpts; + IntrusiveRefCntPtr Target; +}; + +// Valid Parser Tests + +TEST_F(ParseHLSLRootSignatureTest, ValidParseEmptyTest) { + const llvm::StringLiteral Source = R"cc()cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test no diagnostics produced + Consumer->setNoDiag(); + + ASSERT_FALSE(Parser.parse()); + ASSERT_EQ((int)Elements.size(), 0); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, ValidParseDTClausesTest) { + const llvm::StringLiteral Source = R"cc( + DescriptorTable( + CBV(), + SRV(), + Sampler(), + UAV() + ), + DescriptorTable() + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test no diagnostics produced + Consumer->setNoDiag(); + + ASSERT_FALSE(Parser.parse()); + + // First Descriptor Table with 4 elements + RootElement Elem = Elements[0]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::CBuffer); + + Elem = Elements[1]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::SRV); + + Elem = Elements[2]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::Sampler); + + Elem = Elements[3]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::UAV); + + Elem = Elements[4]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).NumClauses, (uint32_t)4); + + // Empty Descriptor Table + Elem = Elements[5]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).NumClauses, 0u); + ASSERT_TRUE(Consumer->isSatisfied()); +} + +// Invalid Parser Tests + +TEST_F(ParseHLSLRootSignatureTest, InvalidParseUnexpectedTokenTest) { + const llvm::StringLiteral Source = R"cc( + DescriptorTable() + space + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, 
InvalidParseInvalidTokenTest) { + const llvm::StringLiteral Source = R"cc( + notAnIdentifier + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test correct diagnostic produced - invalid token + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, InvalidParseUnexpectedEndOfStreamTest) { + const llvm::StringLiteral Source = R"cc( + DescriptorTable + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test correct diagnostic produced - end of stream + Consumer->setExpected(diag::err_expected_after); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +} // anonymous namespace diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h new file mode 100644 index 0000000000000..c1b67844c747f --- /dev/null +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h @@ -0,0 +1,44 @@ +//===- HLSLRootSignature.h - HLSL Root Signature helper objects -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file contains helper objects for working with HLSL Root +/// Signatures. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H +#define LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H + +#include "llvm/Support/DXILABI.h" +#include + +namespace llvm { +namespace hlsl { +namespace rootsig { + +// Definitions of the in-memory data layout structures + +// Models the end of a descriptor table and stores its visibility +struct DescriptorTable { + uint32_t NumClauses = 0; // The number of clauses in the table +}; + +// Models DTClause : CBV | SRV | UAV | Sampler, by collecting like parameters +using ClauseType = llvm::dxil::ResourceClass; +struct DescriptorTableClause { + ClauseType Type; +}; + +// Models RootElement : DescriptorTable | DescriptorTableClause +using RootElement = std::variant; + +} // namespace rootsig +} // namespace hlsl +} // namespace llvm + +#endif // LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H From 5edf12738441d54239a5043e727b4dac709728c7 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 31 Mar 2025 10:28:55 -0700 Subject: [PATCH 0121/1029] [mlir][memref] Verify out-of-bounds access for memref.subview (#133086) * Improve the verifier of `memref.subview` to detect out-of-bounds extractions. * Improve the documentation of `memref.subview` to make clear that out-of-bounds extractions are not allowed. Rewrite examples to use the new `strided<>` notation instead of `affine_map` layout maps. Also remove all unrelated operations (`memref.alloc`) from the examples. * Fix various test cases where `memref.subview` ops ran out-of-bounds. 
* Update canonicalizations patterns to ensure that they do not fold IR if it would generate IR that no longer verifies. Related discussion on Discourse: https://discourse.llvm.org/t/out-of-bounds-semantics-of-memref-subview/85293 This is a re-upload of #131876, which was reverted due to failing GPU tests. These tests were faulty and fixed in #133051. --- .../mlir/Dialect/MemRef/IR/MemRefOps.td | 133 ++++++------------ .../mlir/Interfaces/ViewLikeInterface.h | 17 +-- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 13 +- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 8 +- .../expand-then-convert-to-llvm.mlir | 18 +-- mlir/test/Dialect/Linalg/promote.mlir | 28 ++-- mlir/test/Dialect/MemRef/canonicalize.mlir | 22 ++- .../MemRef/expand-strided-metadata.mlir | 66 ++++----- .../Dialect/MemRef/fold-memref-alias-ops.mlir | 14 +- mlir/test/Dialect/MemRef/invalid.mlir | 16 +++ mlir/test/Dialect/MemRef/subview.mlir | 2 +- mlir/test/Transforms/canonicalize.mlir | 72 +++++----- mlir/test/Transforms/compose-subview.mlir | 8 +- 13 files changed, 203 insertions(+), 214 deletions(-) diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 134cca5800918..3edc2433c85ea 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -1859,11 +1859,11 @@ def SubViewOp : MemRef_OpWithOffsetSizesAndStrides<"subview", [ ]> { let summary = "memref subview operation"; let description = [{ - The "subview" operation converts a memref type to another memref type - which represents a reduced-size view of the original memref as specified by - the operation's offsets, sizes and strides arguments. + The `subview` operation converts a memref type to a memref type which + represents a reduced-size view of the original memref as specified by the + operation's offsets, sizes and strides arguments. - The SubView operation supports the following arguments: + The `subview` operation supports the following arguments: * source: the "base" memref on which to create a "view" memref. * offsets: memref-rank number of offsets into the "base" memref at which to @@ -1876,118 +1876,73 @@ def SubViewOp : MemRef_OpWithOffsetSizesAndStrides<"subview", [ The representation based on offsets, sizes and strides support a partially-static specification via attributes specified through the `static_offsets`, `static_sizes` and `static_strides` arguments. A special - sentinel value ShapedType::kDynamic encodes that the corresponding entry has - a dynamic value. + sentinel value `ShapedType::kDynamic` encodes that the corresponding entry + has a dynamic value. - A subview operation may additionally reduce the rank of the resulting view - by removing dimensions that are statically known to be of size 1. + A `subview` operation may additionally reduce the rank of the resulting + view by removing dimensions that are statically known to be of size 1. + + In the absence of rank reductions, the resulting memref type is computed + as follows: + ``` + result_sizes[i] = size_operands[i] + result_strides[i] = src_strides[i] * stride_operands[i] + result_offset = src_offset + dot_product(offset_operands, src_strides) + ``` + + The offset, size and stride operands must be in-bounds with respect to the + source memref. When possible, the static operation verifier will detect + out-of-bounds subviews. Subviews that cannot be confirmed to be in-bounds + or out-of-bounds based on compile-time information are valid. 
However, + performing an out-of-bounds subview at runtime is undefined behavior. Example 1: ```mlir - %0 = memref.alloc() : memref<64x4xf32, affine_map<(d0, d1) -> (d0 * 4 + d1)>> - - // Create a sub-view of "base" memref '%0' with offset arguments '%c0', - // dynamic sizes for each dimension, and stride arguments '%c1'. - %1 = memref.subview %0[%c0, %c0][%size0, %size1][%c1, %c1] - : memref<64x4xf32, affine_map<(d0, d1) -> (d0 * 4 + d1)>> to - memref (d0 * s1 + d1 + s0)>> + // Subview of static memref with strided layout at static offsets, sizes + // and strides. + %1 = memref.subview %0[4, 2][8, 2][3, 2] + : memref<64x4xf32, strided<[7, 9], offset: 91>> to + memref<8x2xf32, strided<[21, 18], offset: 137>> ``` Example 2: ```mlir - %0 = memref.alloc() : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> - - // Create a sub-view of "base" memref '%0' with dynamic offsets, sizes, + // Subview of static memref with identity layout at dynamic offsets, sizes // and strides. - // Note that dynamic offsets are represented by the linearized dynamic - // offset symbol 's0' in the subview memref layout map, and that the - // dynamic strides operands, after being applied to the base memref - // strides in each dimension, are represented in the view memref layout - // map as symbols 's1', 's2' and 's3'. - %1 = memref.subview %0[%i, %j, %k][%size0, %size1, %size2][%x, %y, %z] - : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> to - memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> + %1 = memref.subview %0[%off0, %off1][%sz0, %sz1][%str0, %str1] + : memref<64x4xf32> to memref> ``` Example 3: ```mlir - %0 = memref.alloc() : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> - - // Subview with constant offsets, sizes and strides. - %1 = memref.subview %0[0, 2, 0][4, 4, 4][1, 1, 1] - : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> to - memref<4x4x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2 + 8)>> + // Subview of dynamic memref with strided layout at dynamic offsets and + // strides, but static sizes. + %1 = memref.subview %0[%off0, %off1][4, 4][%str0, %str1] + : memref> to + memref<4x4xf32, strided<[?, ?], offset: ?>> ``` Example 4: ```mlir - %0 = memref.alloc(%arg0, %arg1) : memref - - // Subview with constant size, but dynamic offsets and - // strides. The resulting memref has a static shape, but if the - // base memref has an affine map to describe the layout, the result - // memref also uses an affine map to describe the layout. The - // strides of the result memref is computed as follows: - // - // Let #map1 represents the layout of the base memref, and #map2 - // represents the layout of the result memref. A #mapsubview can be - // constructed to map an index from the result memref to the base - // memref (note that the description below uses more convenient - // naming for symbols, while in affine maps, symbols are - // represented as unsigned numbers that identify that symbol in the - // given affine map. - // - // #mapsubview = (d0, d1)[o0, o1, t0, t1] -> (d0 * t0 + o0, d1 * t1 + o1) - // - // where, o0, o1, ... are offsets, and t0, t1, ... are strides. 
Then, - // - // #map2 = #map1.compose(#mapsubview) - // - // If the layout map is represented as - // - // #map1 = (d0, d1)[s0, s1, s2] -> (d0 * s1 + d1 * s2 + s0) - // - // then, - // - // #map2 = (d0, d1)[s0, s1, s2, o0, o1, t0, t1] -> - // (d0 * s1 * t0 + d1 * s2 * t1 + o0 * s1 + o1 * s2 + s0) - // - // Representing this canonically - // - // #map2 = (d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0) - // - // where, r0 = o0 * s1 + o1 * s2 + s0, r1 = s1 * t0, r2 = s2 * t1. - %1 = memref.subview %0[%i, %j][4, 4][%x, %y] : - : memref (d0 * s1 + d1 * s2 + s0)>> to - memref<4x4xf32, affine_map<(d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0)>> - - // Note that the subview op does not guarantee that the result - // memref is "inbounds" w.r.t to base memref. It is upto the client - // to ensure that the subview is accessed in a manner that is - // in-bounds. + // Rank-reducing subviews. + %1 = memref.subview %0[0, 0, 0][1, 16, 4][1, 1, 1] + : memref<8x16x4xf32> to memref<16x4xf32> + %3 = memref.subview %2[3, 4, 2][1, 6, 3][1, 1, 1] + : memref<8x16x4xf32> to memref<6x3xf32, strided<[4, 1], offset: 210>> ``` Example 5: ```mlir - // Rank-reducing subview. - %1 = memref.subview %0[0, 0, 0][1, 16, 4][1, 1, 1] : - memref<8x16x4xf32> to memref<16x4xf32> - - // Original layout: - // (d0, d1, d2) -> (64 * d0 + 16 * d1 + d2) - // Subviewed layout: - // (d0, d1, d2) -> (64 * (d0 + 3) + 4 * (d1 + 4) + d2 + 2) = (64 * d0 + 4 * d1 + d2 + 210) - // After rank reducing: - // (d0, d1) -> (4 * d0 + d1 + 210) - %3 = memref.subview %2[3, 4, 2][1, 6, 3][1, 1, 1] : - memref<8x16x4xf32> to memref<6x3xf32, strided<[4, 1], offset: 210>> + // Identity subview. The subview is the full source memref. + %1 = memref.subview %0[0, 0, 0] [8, 16, 4] [1, 1, 1] + : memref<8x16x4xf32> to memref<8x16x4xf32> ``` + }]; let arguments = (ins AnyMemRef:$source, diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.h b/mlir/include/mlir/Interfaces/ViewLikeInterface.h index e74326dba7c80..14427a97a5502 100644 --- a/mlir/include/mlir/Interfaces/ViewLikeInterface.h +++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.h @@ -76,8 +76,7 @@ SliceBoundsVerificationResult verifyInBoundsSlice( /// returns the new result type of the op, based on the new offsets, sizes and /// strides. `CastOpFunc` is used to generate a cast op if the result type of /// the op has changed. -template +template class OpWithOffsetSizesAndStridesConstantArgumentFolder final : public OpRewritePattern { public: @@ -95,14 +94,12 @@ class OpWithOffsetSizesAndStridesConstantArgumentFolder final failed(foldDynamicIndexList(mixedStrides))) return failure(); - if (CheckInBounds) { - // Pattern does not apply if the produced op would not verify. - SliceBoundsVerificationResult sliceResult = verifyInBoundsSlice( - cast(op.getSource().getType()).getShape(), mixedOffsets, - mixedSizes, mixedStrides); - if (!sliceResult.isValid) - return failure(); - } + // Pattern does not apply if the produced op would not verify. + SliceBoundsVerificationResult sliceResult = verifyInBoundsSlice( + cast(op.getSource().getType()).getShape(), mixedOffsets, + mixedSizes, mixedStrides); + if (!sliceResult.isValid) + return failure(); // Compute the new result type. 
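    // The new type is inferred from the folded offsets, sizes and strides;
    // entries that became constants show up as static values in the layout.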
auto resultType = diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 59434dccc117b..123666848f83a 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -2977,6 +2977,9 @@ static LogicalResult produceSubViewErrorMsg(SliceVerificationResult result, LogicalResult SubViewOp::verify() { MemRefType baseType = getSourceType(); MemRefType subViewType = getType(); + ArrayRef staticOffsets = getStaticOffsets(); + ArrayRef staticSizes = getStaticSizes(); + ArrayRef staticStrides = getStaticStrides(); // The base memref and the view memref should be in the same memory space. if (baseType.getMemorySpace() != subViewType.getMemorySpace()) @@ -2991,7 +2994,7 @@ LogicalResult SubViewOp::verify() { // Compute the expected result type, assuming that there are no rank // reductions. MemRefType expectedType = SubViewOp::inferResultType( - baseType, getStaticOffsets(), getStaticSizes(), getStaticStrides()); + baseType, staticOffsets, staticSizes, staticStrides); // Verify all properties of a shaped type: rank, element type and dimension // sizes. This takes into account potential rank reductions. @@ -3025,6 +3028,14 @@ LogicalResult SubViewOp::verify() { return produceSubViewErrorMsg(SliceVerificationResult::LayoutMismatch, *this, expectedType); + // Verify that offsets, sizes, strides do not run out-of-bounds with respect + // to the base memref. + SliceBoundsVerificationResult boundsResult = + verifyInBoundsSlice(baseType.getShape(), staticOffsets, staticSizes, + staticStrides, /*generateErrorMessage=*/true); + if (!boundsResult.isValid) + return getOperation()->emitError(boundsResult.errorMessage); + return success(); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 5f8493de991f3..d589f627d896e 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -2617,10 +2617,10 @@ struct SliceCanonicalizer { void ExtractSliceOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add, - ExtractSliceOpCastFolder>(context); + results.add< + OpWithOffsetSizesAndStridesConstantArgumentFolder< + ExtractSliceOp, SliceReturnTypeCanonicalizer, SliceCanonicalizer>, + ExtractSliceOpCastFolder>(context); } // diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir index 5517eafb588e8..fe91d26d5a251 100644 --- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir @@ -192,7 +192,7 @@ func.func @subview_const_stride(%0 : memref<64x4xf32, strided<[4, 1], offset: 0> // CHECK-LABEL: func @subview_const_stride_and_offset( // CHECK-SAME: %[[MEM:.*]]: memref<{{.*}}> -func.func @subview_const_stride_and_offset(%0 : memref<64x4xf32, strided<[4, 1], offset: 0>>) -> memref<62x3xf32, strided<[4, 1], offset: 8>> { +func.func @subview_const_stride_and_offset(%0 : memref<64x8xf32, strided<[8, 1], offset: 0>>) -> memref<62x3xf32, strided<[8, 1], offset: 2>> { // The last "insertvalue" that populates the memref descriptor from the function arguments. 
// CHECK: %[[MEMREF:.*]] = builtin.unrealized_conversion_cast %[[MEM]] @@ -201,21 +201,21 @@ func.func @subview_const_stride_and_offset(%0 : memref<64x4xf32, strided<[4, 1], // CHECK: %[[DESC:.*]] = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[DESC0:.*]] = llvm.insertvalue %[[BASE]], %[[DESC]][0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[DESC1:.*]] = llvm.insertvalue %[[BASE_ALIGNED]], %[[DESC0]][1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // CHECK: %[[CST_OFF:.*]] = llvm.mlir.constant(8 : index) : i64 + // CHECK: %[[CST_OFF:.*]] = llvm.mlir.constant(2 : index) : i64 // CHECK: %[[DESC2:.*]] = llvm.insertvalue %[[CST_OFF]], %[[DESC1]][2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[CST_SIZE0:.*]] = llvm.mlir.constant(62 : index) : i64 // CHECK: %[[DESC3:.*]] = llvm.insertvalue %[[CST_SIZE0]], %[[DESC2]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - // CHECK: %[[CST_STRIDE0:.*]] = llvm.mlir.constant(4 : index) : i64 + // CHECK: %[[CST_STRIDE0:.*]] = llvm.mlir.constant(8 : index) : i64 // CHECK: %[[DESC4:.*]] = llvm.insertvalue %[[CST_STRIDE0]], %[[DESC3]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[CST_SIZE1:.*]] = llvm.mlir.constant(3 : index) : i64 // CHECK: %[[DESC5:.*]] = llvm.insertvalue %[[CST_SIZE1]], %[[DESC4]][3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[CST_STRIDE1:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK: %[[DESC6:.*]] = llvm.insertvalue %[[CST_STRIDE1]], %[[DESC5]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = memref.subview %0[0, 8][62, 3][1, 1] : - memref<64x4xf32, strided<[4, 1], offset: 0>> - to memref<62x3xf32, strided<[4, 1], offset: 8>> - return %1 : memref<62x3xf32, strided<[4, 1], offset: 8>> + %1 = memref.subview %0[0, 2][62, 3][1, 1] : + memref<64x8xf32, strided<[8, 1], offset: 0>> + to memref<62x3xf32, strided<[8, 1], offset: 2>> + return %1 : memref<62x3xf32, strided<[8, 1], offset: 2>> } // ----- @@ -238,7 +238,7 @@ func.func @subview_mixed_static_dynamic(%0 : memref<64x4xf32, strided<[4, 1], of // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[DESCSTRIDE0]] : i64 to index // CHECK: %[[DESCSTRIDE0_V2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64 // CHECK: %[[OFF0:.*]] = llvm.mul %[[ARG1]], %[[STRIDE0]] overflow : i64 - // CHECK: %[[BASE_OFF:.*]] = llvm.mlir.constant(8 : index) : i64 + // CHECK: %[[BASE_OFF:.*]] = llvm.mlir.constant(2 : index) : i64 // CHECK: %[[OFF2:.*]] = llvm.add %[[OFF0]], %[[BASE_OFF]] : i64 // CHECK: %[[TMP:.*]] = builtin.unrealized_conversion_cast %[[OFF2]] : i64 to index // CHECK: %[[OFF2:.*]] = builtin.unrealized_conversion_cast %[[TMP]] : index to i64 @@ -253,7 +253,7 @@ func.func @subview_mixed_static_dynamic(%0 : memref<64x4xf32, strided<[4, 1], of // CHECK: %[[CST_STRIDE1:.*]] = llvm.mlir.constant(1 : index) : i64 // CHECK: %[[DESC6:.*]] = llvm.insertvalue %[[CST_STRIDE1]], %[[DESC5]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> - %1 = memref.subview %0[%arg1, 8][62, %arg2][%arg0, 1] : + %1 = memref.subview %0[%arg1, 2][62, %arg2][%arg0, 1] : memref<64x4xf32, strided<[4, 1], offset: 0>> to memref<62x?xf32, strided<[?, 1], offset: ?>> return %1 : memref<62x?xf32, strided<[?, 1], offset: ?>> diff --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir index 
00b8c649b82c3..bab606c3a8169 100644 --- a/mlir/test/Dialect/Linalg/promote.mlir +++ b/mlir/test/Dialect/Linalg/promote.mlir @@ -287,18 +287,18 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func.func @linalg_generic_update_all_function_inputs_outputs( - // CHECK-SAME: %[[VAL_0:.*]]: memref<3x4xf32, 1>, - // CHECK-SAME: %[[VAL_1:.*]]: memref<3x4xf32, 1>) -> memref<3x4xf32, 1> { -func.func @linalg_generic_update_all_function_inputs_outputs(%arg0: memref<3x4xf32, 1>, %arg1: memref<3x4xf32, 1>) -> memref<3x4xf32, 1> { - // CHECK: %[[VAL_2:.*]] = memref.alloc() {alignment = 64 : i64} : memref<3x4xf32, 1> - // CHECK: %[[VAL_3:.*]] = memref.subview %[[VAL_0]][0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> - // CHECK: %[[VAL_4:.*]] = memref.subview %[[VAL_1]][0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> - // CHECK: %[[VAL_5:.*]] = memref.subview %[[VAL_2]][0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> - - %alloc = memref.alloc() {alignment = 64 : i64} : memref<3x4xf32, 1> - %subview = memref.subview %arg0[0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> - %subview_0 = memref.subview %arg1[0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> - %subview_1 = memref.subview %alloc[0, 0] [4, 3] [1, 1] : memref<3x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + // CHECK-SAME: %[[VAL_0:.*]]: memref<8x4xf32, 1>, + // CHECK-SAME: %[[VAL_1:.*]]: memref<8x4xf32, 1>) -> memref<8x4xf32, 1> { +func.func @linalg_generic_update_all_function_inputs_outputs(%arg0: memref<8x4xf32, 1>, %arg1: memref<8x4xf32, 1>) -> memref<8x4xf32, 1> { + // CHECK: %[[VAL_2:.*]] = memref.alloc() {alignment = 64 : i64} : memref<8x4xf32, 1> + // CHECK: %[[VAL_3:.*]] = memref.subview %[[VAL_0]][0, 0] [4, 3] [1, 1] : memref<8x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + // CHECK: %[[VAL_4:.*]] = memref.subview %[[VAL_1]][0, 0] [4, 3] [1, 1] : memref<8x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + // CHECK: %[[VAL_5:.*]] = memref.subview %[[VAL_2]][0, 0] [4, 3] [1, 1] : memref<8x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + + %alloc = memref.alloc() {alignment = 64 : i64} : memref<8x4xf32, 1> + %subview = memref.subview %arg0[0, 0] [4, 3] [1, 1] : memref<8x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + %subview_0 = memref.subview %arg1[0, 0] [4, 3] [1, 1] : memref<8x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> + %subview_1 = memref.subview %alloc[0, 0] [4, 3] [1, 1] : memref<8x4xf32, 1> to memref<4x3xf32, strided<[4, 1]>, 1> // CHECK: %[[VAL_6:.*]] = arith.constant 0 : index // CHECK: %[[VAL_7:.*]] = arith.constant 4 : index @@ -376,10 +376,10 @@ func.func @linalg_generic_update_all_function_inputs_outputs(%arg0: memref<3x4xf // CHECK: memref.dealloc %[[VAL_22]] : memref<48xi8, #gpu.address_space> // CHECK: memref.dealloc %[[VAL_41]] : memref<48xi8, #gpu.address_space> // CHECK: memref.dealloc %[[VAL_60]] : memref<48xi8, #gpu.address_space> - // CHECK: return %[[VAL_2]] : memref<3x4xf32, 1> + // CHECK: return %[[VAL_2]] : memref<8x4xf32, 1> // CHECK: } - return %alloc : memref<3x4xf32, 1> + return %alloc : memref<8x4xf32, 1> } diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir index 02110bc2892d0..5d8a7d3f64e8f 100644 --- a/mlir/test/Dialect/MemRef/canonicalize.mlir +++ b/mlir/test/Dialect/MemRef/canonicalize.mlir @@ -635,9 +635,9 @@ func.func 
@fold_no_op_subview(%arg0 : memref<20x42xf32>) -> memref<20x42xf32, st // ----- -func.func @no_fold_subview_with_non_zero_offset(%arg0 : memref<20x42xf32>) -> memref<20x42xf32, strided<[42, 1], offset: 1>> { - %0 = memref.subview %arg0[0, 1] [20, 42] [1, 1] : memref<20x42xf32> to memref<20x42xf32, strided<[42, 1], offset: 1>> - return %0 : memref<20x42xf32, strided<[42, 1], offset: 1>> +func.func @no_fold_subview_with_non_zero_offset(%arg0 : memref<20x42xf32>) -> memref<20x41xf32, strided<[42, 1], offset: 1>> { + %0 = memref.subview %arg0[0, 1] [20, 41] [1, 1] : memref<20x42xf32> to memref<20x41xf32, strided<[42, 1], offset: 1>> + return %0 : memref<20x41xf32, strided<[42, 1], offset: 1>> } // CHECK-LABEL: func @no_fold_subview_with_non_zero_offset( // CHECK: %[[SUBVIEW:.+]] = memref.subview @@ -645,9 +645,9 @@ func.func @no_fold_subview_with_non_zero_offset(%arg0 : memref<20x42xf32>) -> me // ----- -func.func @no_fold_subview_with_non_unit_stride(%arg0 : memref<20x42xf32>) -> memref<20x42xf32, strided<[42, 2]>> { - %0 = memref.subview %arg0[0, 0] [20, 42] [1, 2] : memref<20x42xf32> to memref<20x42xf32, strided<[42, 2]>> - return %0 : memref<20x42xf32, strided<[42, 2]>> +func.func @no_fold_subview_with_non_unit_stride(%arg0 : memref<20x42xf32>) -> memref<20x5xf32, strided<[42, 2]>> { + %0 = memref.subview %arg0[0, 0] [20, 5] [1, 2] : memref<20x42xf32> to memref<20x5xf32, strided<[42, 2]>> + return %0 : memref<20x5xf32, strided<[42, 2]>> } // CHECK-LABEL: func @no_fold_subview_with_non_unit_stride( // CHECK: %[[SUBVIEW:.+]] = memref.subview @@ -655,6 +655,16 @@ func.func @no_fold_subview_with_non_unit_stride(%arg0 : memref<20x42xf32>) -> me // ----- +// CHECK-LABEL: func @no_fold_invalid_dynamic_slice +// CHECK: memref.subview %arg0[2] [%{{.*}}] [1] : memref<10xf32> to memref> +func.func @no_fold_invalid_dynamic_slice(%arg0: memref<10xf32>) -> memref> { + %c11 = arith.constant 11 : index + %0 = memref.subview %arg0 [2][%c11][1] : memref<10xf32> to memref> + func.return %0 : memref> +} + +// ----- + func.func @no_fold_dynamic_no_op_subview(%arg0 : memref) -> memref> { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index diff --git a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir index 647731db439c0..1e6b0111fa4c7 100644 --- a/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir +++ b/mlir/test/Dialect/MemRef/expand-strided-metadata.mlir @@ -119,39 +119,39 @@ func.func @extract_strided_metadata_of_subview(%base: memref<5x4xf32>) // when dynamic sizes are involved. // See extract_strided_metadata_of_subview for an explanation of the actual // expansion. 
-// Orig strides: [64, 4, 1] +// Orig strides: [384, 24, 1] // Sub strides: [1, 1, 1] -// => New strides: [64, 4, 1] +// => New strides: [384, 24, 1] // // Orig offset: 0 // Sub offsets: [3, 4, 2] -// => Final offset: 3 * 64 + 4 * 4 + 2 * 1 + 0 == 210 +// => Final offset: 3 * 384 + 4 * 24 + 2 * 1 + 0 == 1250 // // Final sizes == subview sizes == [%size, 6, 3] // // CHECK-LABEL: func @extract_strided_metadata_of_subview_with_dynamic_size -// CHECK-SAME: (%[[ARG:.*]]: memref<8x16x4xf32>, +// CHECK-SAME: (%[[ARG:.*]]: memref<8x16x24xf32>, // CHECK-SAME: %[[DYN_SIZE:.*]]: index) // -// CHECK-DAG: %[[C210:.*]] = arith.constant 210 : index -// CHECK-DAG: %[[C64:.*]] = arith.constant 64 : index +// CHECK-DAG: %[[C1250:.*]] = arith.constant 1250 : index +// CHECK-DAG: %[[C384:.*]] = arith.constant 384 : index // CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C24:.*]] = arith.constant 24 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // // CHECK-DAG: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:3, %[[STRIDES:.*]]:3 = memref.extract_strided_metadata %[[ARG]] // -// CHECK: return %[[BASE]], %[[C210]], %[[DYN_SIZE]], %[[C6]], %[[C3]], %[[C64]], %[[C4]], %[[C1]] +// CHECK: return %[[BASE]], %[[C1250]], %[[DYN_SIZE]], %[[C6]], %[[C3]], %[[C384]], %[[C24]], %[[C1]] func.func @extract_strided_metadata_of_subview_with_dynamic_size( - %base: memref<8x16x4xf32>, %size: index) + %base: memref<8x16x24xf32>, %size: index) -> (memref, index, index, index, index, index, index, index) { %subview = memref.subview %base[3, 4, 2][%size, 6, 3][1, 1, 1] : - memref<8x16x4xf32> to memref> + memref<8x16x24xf32> to memref> %base_buffer, %offset, %sizes:3, %strides:3 = memref.extract_strided_metadata %subview : - memref> + memref> -> memref, index, index, index, index, index, index, index return %base_buffer, %offset, %sizes#0, %sizes#1, %sizes#2, %strides#0, %strides#1, %strides#2 : @@ -167,37 +167,37 @@ func.func @extract_strided_metadata_of_subview_with_dynamic_size( // See extract_strided_metadata_of_subview for an explanation of the actual // expansion. 
// -// Orig strides: [64, 4, 1] +// Orig strides: [384, 24, 1] // Sub strides: [1, 1, 1] -// => New strides: [64, 4, 1] -// Final strides == filterOutReducedDim(new strides, 0) == [4 , 1] +// => New strides: [384, 24, 1] +// Final strides == filterOutReducedDim(new strides, 0) == [24 , 1] // // Orig offset: 0 // Sub offsets: [3, 4, 2] -// => Final offset: 3 * 64 + 4 * 4 + 2 * 1 + 0 == 210 +// => Final offset: 3 * 384 + 4 * 24 + 2 * 1 + 0 == 1250 // // Final sizes == filterOutReducedDim(subview sizes, 0) == [6, 3] // // CHECK-LABEL: func @extract_strided_metadata_of_rank_reduced_subview -// CHECK-SAME: (%[[ARG:.*]]: memref<8x16x4xf32>) +// CHECK-SAME: (%[[ARG:.*]]: memref<8x16x24xf32>) // -// CHECK-DAG: %[[C210:.*]] = arith.constant 210 : index +// CHECK-DAG: %[[C1250:.*]] = arith.constant 1250 : index // CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[C24:.*]] = arith.constant 24 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // // CHECK-DAG: %[[BASE:.*]], %[[OFFSET:.*]], %[[SIZES:.*]]:3, %[[STRIDES:.*]]:3 = memref.extract_strided_metadata %[[ARG]] // -// CHECK: return %[[BASE]], %[[C210]], %[[C6]], %[[C3]], %[[C4]], %[[C1]] -func.func @extract_strided_metadata_of_rank_reduced_subview(%base: memref<8x16x4xf32>) +// CHECK: return %[[BASE]], %[[C1250]], %[[C6]], %[[C3]], %[[C24]], %[[C1]] +func.func @extract_strided_metadata_of_rank_reduced_subview(%base: memref<8x16x24xf32>) -> (memref, index, index, index, index, index) { %subview = memref.subview %base[3, 4, 2][1, 6, 3][1, 1, 1] : - memref<8x16x4xf32> to memref<6x3xf32, strided<[4, 1], offset: 210>> + memref<8x16x24xf32> to memref<6x3xf32, strided<[24, 1], offset: 1250>> %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %subview : - memref<6x3xf32, strided<[4,1], offset: 210>> + memref<6x3xf32, strided<[24, 1], offset: 1250>> -> memref, index, index, index, index, index return %base_buffer, %offset, %sizes#0, %sizes#1, %strides#0, %strides#1 : @@ -215,21 +215,21 @@ func.func @extract_strided_metadata_of_rank_reduced_subview(%base: memref<8x16x4 // See extract_strided_metadata_of_subview for an explanation of the actual // expansion. 
// -// Orig strides: [64, 4, 1] +// Orig strides: [384, 24, 1] // Sub strides: [1, %stride, 1] -// => New strides: [64, 4 * %stride, 1] -// Final strides == filterOutReducedDim(new strides, 0) == [4 * %stride , 1] +// => New strides: [384, 24 * %stride, 1] +// Final strides == filterOutReducedDim(new strides, 0) == [24 * %stride , 1] // // Orig offset: 0 // Sub offsets: [3, 4, 2] -// => Final offset: 3 * 64 + 4 * 4 + 2 * 1 + 0 == 210 +// => Final offset: 3 * 384 + 4 * 24 + 2 * 1 + 0 == 1250 // -// CHECK-DAG: #[[$STRIDE1_MAP:.*]] = affine_map<()[s0] -> (s0 * 4)> +// CHECK-DAG: #[[$STRIDE1_MAP:.*]] = affine_map<()[s0] -> (s0 * 24)> // CHECK-LABEL: func @extract_strided_metadata_of_rank_reduced_subview_w_variable_strides -// CHECK-SAME: (%[[ARG:.*]]: memref<8x16x4xf32>, +// CHECK-SAME: (%[[ARG:.*]]: memref<8x16x24xf32>, // CHECK-SAME: %[[DYN_STRIDE:.*]]: index) // -// CHECK-DAG: %[[C210:.*]] = arith.constant 210 : index +// CHECK-DAG: %[[C1250:.*]] = arith.constant 1250 : index // CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index @@ -238,16 +238,16 @@ func.func @extract_strided_metadata_of_rank_reduced_subview(%base: memref<8x16x4 // // CHECK-DAG: %[[DIM1_STRIDE:.*]] = affine.apply #[[$STRIDE1_MAP]]()[%[[DYN_STRIDE]]] // -// CHECK: return %[[BASE]], %[[C210]], %[[C6]], %[[C3]], %[[DIM1_STRIDE]], %[[C1]] +// CHECK: return %[[BASE]], %[[C1250]], %[[C6]], %[[C3]], %[[DIM1_STRIDE]], %[[C1]] func.func @extract_strided_metadata_of_rank_reduced_subview_w_variable_strides( - %base: memref<8x16x4xf32>, %stride: index) + %base: memref<8x16x24xf32>, %stride: index) -> (memref, index, index, index, index, index) { %subview = memref.subview %base[3, 4, 2][1, 6, 3][1, %stride, 1] : - memref<8x16x4xf32> to memref<6x3xf32, strided<[?, 1], offset: 210>> + memref<8x16x24xf32> to memref<6x3xf32, strided<[?, 1], offset: 1250>> %base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %subview : - memref<6x3xf32, strided<[?, 1], offset: 210>> + memref<6x3xf32, strided<[?, 1], offset: 1250>> -> memref, index, index, index, index, index return %base_buffer, %offset, %sizes#0, %sizes#1, %strides#0, %strides#1 : diff --git a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir index 327cacf7d9a20..067cdb5c5fd20 100644 --- a/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir +++ b/mlir/test/Dialect/MemRef/fold-memref-alias-ops.mlir @@ -632,17 +632,17 @@ func.func @fold_static_stride_subview_with_affine_load_store_collapse_shape_with // CHECK: #[[$map:.*]] = affine_map<()[s0] -> (s0 + 2)> // CHECK-LABEL: func @subview_of_subview( -// CHECK-SAME: %[[m:.*]]: memref<1x1024xf32, 3>, %[[pos:.*]]: index +// CHECK-SAME: %[[m:.*]]: memref<8x1024xf32, 3>, %[[pos:.*]]: index // CHECK: %[[add:.*]] = affine.apply #[[$map]]()[%arg1] -// CHECK: memref.subview %arg0[4, %[[add]]] [1, 1] [1, 1] : memref<1x1024xf32, 3> to memref, 3> -func.func @subview_of_subview(%m: memref<1x1024xf32, 3>, %pos: index) +// CHECK: memref.subview %arg0[4, %[[add]]] [1, 1] [1, 1] : memref<8x1024xf32, 3> to memref, 3> +func.func @subview_of_subview(%m: memref<8x1024xf32, 3>, %pos: index) -> memref, 3> { - %0 = memref.subview %m[3, %pos] [1, 2] [1, 1] - : memref<1x1024xf32, 3> - to memref<1x2xf32, strided<[1024, 1], offset: ?>, 3> + %0 = memref.subview %m[3, %pos] [5, 7] [1, 1] + : memref<8x1024xf32, 3> + to memref<5x7xf32, strided<[1024, 1], offset: ?>, 3> %1 = memref.subview %0[1, 
2] [1, 1] [1, 1] - : memref<1x2xf32, strided<[1024, 1], offset: ?>, 3> + : memref<5x7xf32, strided<[1024, 1], offset: ?>, 3> to memref, 3> return %1 : memref, 3> } diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir index f72ad48245f81..34fc4775924e7 100644 --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -723,6 +723,22 @@ func.func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { // ----- +func.func @invalid_subview(%arg0: memref<10xf32>) { + // expected-error@+1 {{offset 0 is out-of-bounds: 10 >= 10}} + %0 = memref.subview %arg0 [10][1][1] : memref<10xf32> to memref<1xf32, strided<[1], offset: 10>> + return +} + +// ----- + +func.func @invalid_subview(%arg0: memref<9xf32>) { + // expected-error@+1 {{slice along dimension 0 runs out-of-bounds: 9 >= 9}} + %0 = memref.subview %arg0 [3][4][2] : memref<9xf32> to memref<4xf32, strided<[2], offset: 3>> + return +} + +// ----- + func.func @invalid_rank_reducing_subview(%arg0 : index, %arg1 : index, %arg2 : index) { %0 = memref.alloc() : memref<8x16x4xf32> // expected-error@+1 {{expected result type to be 'memref<8x16x4xf32, strided<[64, 4, 1]>>' or a rank-reduced version. (mismatch of result sizes)}} diff --git a/mlir/test/Dialect/MemRef/subview.mlir b/mlir/test/Dialect/MemRef/subview.mlir index 135a1124066e4..fd8aaaf86b2d8 100644 --- a/mlir/test/Dialect/MemRef/subview.mlir +++ b/mlir/test/Dialect/MemRef/subview.mlir @@ -90,7 +90,7 @@ func.func @memref_subview(%arg0 : index, %arg1 : index, %arg2 : index) { // CHECK: memref.subview %{{.*}}[0, 0, 0] [1, 16, 4] [1, 1, 1] : memref<8x16x4xf32> to memref<16x4xf32> %21 = memref.subview %20[0, 0, 0][1, 16, 4][1, 1, 1] : memref<8x16x4xf32> to memref<16x4xf32> - %22 = memref.subview %20[3, 4, 2][1, 6, 3][1, 1, 1] : memref<8x16x4xf32> to memref<6x3xf32, strided<[4, 1], offset: 210>> + %22 = memref.subview %20[3, 4, 1][1, 6, 3][1, 1, 1] : memref<8x16x4xf32> to memref<6x3xf32, strided<[4, 1], offset: 209>> %23 = memref.alloc() : memref %78 = memref.subview %23[] [] [] : memref to memref diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir index 9b74362b6ee75..8e02c06a0a293 100644 --- a/mlir/test/Transforms/canonicalize.mlir +++ b/mlir/test/Transforms/canonicalize.mlir @@ -752,83 +752,83 @@ func.func @subview(%arg0 : index, %arg1 : index) -> (index, index) { %c15 = arith.constant 15 : index // CHECK: %[[ALLOC0:.*]] = memref.alloc() - %0 = memref.alloc() : memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> + %0 = memref.alloc() : memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> // Test: subview with constant base memref and constant operands is folded. // Note that the subview uses the base memrefs layout map because it used // zero offset and unit stride arguments. // CHECK: memref.subview %[[ALLOC0]][0, 0, 0] [7, 11, 2] [1, 1, 1] : - // CHECK-SAME: memref<8x16x4xf32, strided<[64, 4, 1]>> - // CHECK-SAME: to memref<7x11x2xf32, strided<[64, 4, 1]>> + // CHECK-SAME: memref<128x96x64xf32, strided<[6144, 64, 1]>> + // CHECK-SAME: to memref<7x11x2xf32, strided<[6144, 64, 1]>> %1 = memref.subview %0[%c0, %c0, %c0] [%c7, %c11, %c2] [%c1, %c1, %c1] - : memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> to + : memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> to memref> %v0 = memref.load %1[%c0, %c0, %c0] : memref> // Test: subview with one dynamic operand can also be folded. 
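  // Only the offset becomes dynamic in the result layout (offset: ?); the
  // folded sizes and the unit strides stay static.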
// CHECK: memref.subview %[[ALLOC0]][0, %[[ARG0]], 0] [7, 11, 15] [1, 1, 1] : - // CHECK-SAME: memref<8x16x4xf32, strided<[64, 4, 1]>> - // CHECK-SAME: to memref<7x11x15xf32, strided<[64, 4, 1], offset: ?>> + // CHECK-SAME: memref<128x96x64xf32, strided<[6144, 64, 1]>> + // CHECK-SAME: to memref<7x11x15xf32, strided<[6144, 64, 1], offset: ?>> %2 = memref.subview %0[%c0, %arg0, %c0] [%c7, %c11, %c15] [%c1, %c1, %c1] - : memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> to + : memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> to memref> memref.store %v0, %2[%c0, %c0, %c0] : memref> // CHECK: %[[ALLOC1:.*]] = memref.alloc(%[[ARG0]]) %3 = memref.alloc(%arg0) : memref> // Test: subview with constant operands but dynamic base memref is folded as long as the strides and offset of the base memref are static. - // CHECK: memref.subview %[[ALLOC1]][0, 0, 0] [7, 11, 15] [1, 1, 1] : + // CHECK: memref.subview %[[ALLOC1]][0, 0, 0] [7, 11, 2] [1, 1, 1] : // CHECK-SAME: memref> - // CHECK-SAME: to memref<7x11x15xf32, strided<[64, 4, 1]>> - %4 = memref.subview %3[%c0, %c0, %c0] [%c7, %c11, %c15] [%c1, %c1, %c1] + // CHECK-SAME: to memref<7x11x2xf32, strided<[64, 4, 1]>> + %4 = memref.subview %3[%c0, %c0, %c0] [%c7, %c11, %c2] [%c1, %c1, %c1] : memref> to memref> memref.store %v0, %4[%c0, %c0, %c0] : memref> // Test: subview offset operands are folded correctly w.r.t. base strides. // CHECK: memref.subview %[[ALLOC0]][1, 2, 7] [7, 11, 2] [1, 1, 1] : - // CHECK-SAME: memref<8x16x4xf32, strided<[64, 4, 1]>> to - // CHECK-SAME: memref<7x11x2xf32, strided<[64, 4, 1], offset: 79>> + // CHECK-SAME: memref<128x96x64xf32, strided<[6144, 64, 1]>> to + // CHECK-SAME: memref<7x11x2xf32, strided<[6144, 64, 1], offset: 6279>> %5 = memref.subview %0[%c1, %c2, %c7] [%c7, %c11, %c2] [%c1, %c1, %c1] - : memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> to + : memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> to memref> memref.store %v0, %5[%c0, %c0, %c0] : memref> // Test: subview stride operands are folded correctly w.r.t. base strides. 
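  // The result strides are the element-wise products of the subview strides
  // and the base strides: [2, 7, 11] * [6144, 64, 1] == [12288, 448, 11].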
// CHECK: memref.subview %[[ALLOC0]][0, 0, 0] [7, 11, 2] [2, 7, 11] : - // CHECK-SAME: memref<8x16x4xf32, strided<[64, 4, 1]>> - // CHECK-SAME: to memref<7x11x2xf32, strided<[128, 28, 11]>> + // CHECK-SAME: memref<128x96x64xf32, strided<[6144, 64, 1]>> + // CHECK-SAME: to memref<7x11x2xf32, strided<[12288, 448, 11]>> %6 = memref.subview %0[%c0, %c0, %c0] [%c7, %c11, %c2] [%c2, %c7, %c11] - : memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> to + : memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> to memref> memref.store %v0, %6[%c0, %c0, %c0] : memref> // Test: subview shape are folded, but offsets and strides are not even if base memref is static // CHECK: memref.subview %[[ALLOC0]][%[[ARG0]], %[[ARG0]], %[[ARG0]]] [7, 11, 2] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] : - // CHECK-SAME: memref<8x16x4xf32, strided<[64, 4, 1]>> to + // CHECK-SAME: memref<128x96x64xf32, strided<[6144, 64, 1]>> to // CHECK-SAME: memref<7x11x2xf32, strided<[?, ?, ?], offset: ?>> %10 = memref.subview %0[%arg0, %arg0, %arg0] [%c7, %c11, %c2] [%arg1, %arg1, %arg1] : - memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> to + memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> to memref> memref.store %v0, %10[%arg1, %arg1, %arg1] : memref> // Test: subview strides are folded, but offsets and shape are not even if base memref is static // CHECK: memref.subview %[[ALLOC0]][%[[ARG0]], %[[ARG0]], %[[ARG0]]] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] [2, 7, 11] : - // CHECK-SAME: memref<8x16x4xf32, strided<[64, 4, 1]>> to - // CHECK-SAME: memref> + // CHECK-SAME: memref<128x96x64xf32, strided<[6144, 64, 1]>> to + // CHECK-SAME: memref> %11 = memref.subview %0[%arg0, %arg0, %arg0] [%arg1, %arg1, %arg1] [%c2, %c7, %c11] : - memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> to + memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> to memref> memref.store %v0, %11[%arg0, %arg0, %arg0] : memref> // Test: subview offsets are folded, but strides and shape are not even if base memref is static // CHECK: memref.subview %[[ALLOC0]][1, 2, 7] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] [%[[ARG0]], %[[ARG0]], %[[ARG0]]] : - // CHECK-SAME: memref<8x16x4xf32, strided<[64, 4, 1]>> to - // CHECK-SAME: memref> + // CHECK-SAME: memref<128x96x64xf32, strided<[6144, 64, 1]>> to + // CHECK-SAME: memref> %13 = memref.subview %0[%c1, %c2, %c7] [%arg1, %arg1, %arg1] [%arg0, %arg0, %arg0] : - memref<8x16x4xf32, strided<[64, 4, 1], offset: 0>> to + memref<128x96x64xf32, strided<[6144, 64, 1], offset: 0>> to memref> memref.store %v0, %13[%arg1, %arg1, %arg1] : memref> @@ -862,27 +862,27 @@ func.func @subview(%arg0 : index, %arg1 : index) -> (index, index) { memref> memref.store %v0, %17[%arg0, %arg0, %arg0] : memref> - // CHECK: %[[ALLOC3:.*]] = memref.alloc() : memref<12x4xf32> - %18 = memref.alloc() : memref<12x4xf32> + // CHECK: %[[ALLOC3:.*]] = memref.alloc() : memref<128x64xf32> + %18 = memref.alloc() : memref<128x64xf32> %c4 = arith.constant 4 : index // TEST: subview strides are maintained when sizes are folded // CHECK: memref.subview %[[ALLOC3]][%arg1, %arg1] [2, 4] [1, 1] : - // CHECK-SAME: memref<12x4xf32> to - // CHECK-SAME: memref<2x4xf32, strided<[4, 1], offset: ?> + // CHECK-SAME: memref<128x64xf32> to + // CHECK-SAME: memref<2x4xf32, strided<[64, 1], offset: ?> %19 = memref.subview %18[%arg1, %arg1] [%c2, %c4] [1, 1] : - memref<12x4xf32> to - memref> - memref.store %v0, %19[%arg1, %arg1] : memref> + memref<128x64xf32> to + memref> + memref.store %v0, %19[%arg1, %arg1] : memref> // TEST: subview strides and sizes are maintained when offsets 
are folded // CHECK: memref.subview %[[ALLOC3]][2, 4] [12, 4] [1, 1] : - // CHECK-SAME: memref<12x4xf32> to - // CHECK-SAME: memref<12x4xf32, strided<[4, 1], offset: 12>> + // CHECK-SAME: memref<128x64xf32> to + // CHECK-SAME: memref<12x4xf32, strided<[64, 1], offset: 132>> %20 = memref.subview %18[%c2, %c4] [12, 4] [1, 1] : - memref<12x4xf32> to - memref<12x4xf32, strided<[4, 1], offset: ?>> - memref.store %v0, %20[%arg1, %arg1] : memref<12x4xf32, strided<[4, 1], offset: ?>> + memref<128x64xf32> to + memref<12x4xf32, strided<[64, 1], offset: ?>> + memref.store %v0, %20[%arg1, %arg1] : memref<12x4xf32, strided<[64, 1], offset: ?>> // Test: dim on subview is rewritten to size operand. %7 = memref.dim %4, %c0 : memref> diff --git a/mlir/test/Transforms/compose-subview.mlir b/mlir/test/Transforms/compose-subview.mlir index 22ffd836c68ed..53fbb8a356def 100644 --- a/mlir/test/Transforms/compose-subview.mlir +++ b/mlir/test/Transforms/compose-subview.mlir @@ -53,10 +53,10 @@ func.func @subview_strided(%input: memref<4x1024xf32>) -> memref<1x128xf32, stri // ----- // CHECK-LABEL: func.func @subview_strided( -// CHECK-SAME: %[[VAL_0:.*]]: memref<4x1024xf32>) -> memref<1x64xf32, strided<[4096, 4], offset: 4480>> { -func.func @subview_strided(%input: memref<4x1024xf32>) -> memref<1x64xf32, strided<[4096, 4], offset: 4480>> { - // CHECK: %[[VAL_1:.*]] = memref.subview %[[VAL_0]][4, 384] [1, 64] [4, 4] : memref<4x1024xf32> to memref<1x64xf32, strided<[4096, 4], offset: 4480>> - %0 = memref.subview %input[2, 256] [2, 256] [2, 2] : memref<4x1024xf32> to memref<2x256xf32, strided<[2048, 2], offset: 2304>> +// CHECK-SAME: %[[VAL_0:.*]]: memref<8x1024xf32>) -> memref<1x64xf32, strided<[4096, 4], offset: 4480>> { +func.func @subview_strided(%input: memref<8x1024xf32>) -> memref<1x64xf32, strided<[4096, 4], offset: 4480>> { + // CHECK: %[[VAL_1:.*]] = memref.subview %[[VAL_0]][4, 384] [1, 64] [4, 4] : memref<8x1024xf32> to memref<1x64xf32, strided<[4096, 4], offset: 4480>> + %0 = memref.subview %input[2, 256] [2, 256] [2, 2] : memref<8x1024xf32> to memref<2x256xf32, strided<[2048, 2], offset: 2304>> %1 = memref.subview %0[1, 64] [1, 64] [2, 2] : memref<2x256xf32, strided<[2048, 2], offset: 2304>> to memref<1x64xf32, strided<[4096, 4], offset: 4480>> return %1 : memref<1x64xf32, strided<[4096, 4], offset: 4480>> } From e9a3ea2218b754a96be4f44240c3f8ee9cbd26c9 Mon Sep 17 00:00:00 2001 From: Dominik Steenken Date: Mon, 31 Mar 2025 19:30:06 +0200 Subject: [PATCH 0122/1029] [SystemZ, DebugInfo] Instrument SystemZ backend passes for Instr-Ref DebugInfo (#133061) This PR instruments the optimization passes in the SystemZ backend with calls to `MachineFunction::substituteDebugValuesForInst` where instruction substitutions are made to instructions that may compute tracked values. Tests are also added for each of the substitutions that were inserted. Details on the individual passes follow. ### systemz-copy-physregs When a copy targets an access register, we redirect the copy via an auxiliary register. This leads to the final result being written by a newly inserted SAR instruction, rather than the original MI, so we need to update the debug value tracking to account for this. ### systemz-long-branch This pass relaxes relative branch instructions based on the actual locations of blocks. Only one of the branch instructions qualifies for debug value tracking: BRCT, i.e. branch-relative-on-count, which subtracts 1 from a register and branches if the result is not zero. 
This is relaxed into an add-immediate and a conditional branch, so any
`debug-instr-number` present must move to the add-immediate instruction.

### systemz-post-rewrite

This pass replaces `LOCRMux` and `SELRMux` pseudoinstructions with either
the real versions of those instructions, or with branching programs that
implement the intent of the pseudo. In all these cases, any
`debug-instr-number` attached to the pseudo needs to be reallocated to the
appropriate instruction in the result, either LOCR, SELR, or a COPY.

### systemz-elim-compare

Similar to systemz-long-branch, for this pass, only a few substitutions are
necessary, since it mainly deals with conditional branch instructions. The
only exceptions are again branch-relative-on-count, which modifies a counter
as part of the instruction, as well as any of the load instructions that are
affected.
---
 .../Target/SystemZ/SystemZCopyPhysRegs.cpp    |  6 +-
 .../lib/Target/SystemZ/SystemZElimCompare.cpp |  9 +++
 llvm/lib/Target/SystemZ/SystemZLongBranch.cpp | 13 ++--
 .../lib/Target/SystemZ/SystemZPostRewrite.cpp | 33 ++++++----
 .../SystemZ/Large/debug-instrref-brct.py      | 32 +++++++++
 .../SystemZ/debug-instrref-copyphysregs.mir   | 22 +++++++
 .../SystemZ/debug-instrref-elimcompare.mir    | 65 +++++++++++++++++++
 .../SystemZ/debug-instrref-postrewrite.mir    | 24 +++++++
 8 files changed, 188 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/CodeGen/SystemZ/Large/debug-instrref-brct.py
 create mode 100644 llvm/test/CodeGen/SystemZ/debug-instrref-copyphysregs.mir
 create mode 100644 llvm/test/CodeGen/SystemZ/debug-instrref-elimcompare.mir
 create mode 100644 llvm/test/CodeGen/SystemZ/debug-instrref-postrewrite.mir

diff --git a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
index 8979ce4386607..a6cf0f57aaf06 100644
--- a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
@@ -75,6 +75,7 @@ bool SystemZCopyPhysRegs::visitMBB(MachineBasicBlock &MBB) {
     DebugLoc DL = MI->getDebugLoc();
     Register SrcReg = MI->getOperand(1).getReg();
     Register DstReg = MI->getOperand(0).getReg();
+
     if (DstReg.isVirtual() &&
         (SrcReg == SystemZ::CC || SystemZ::AR32BitRegClass.contains(SrcReg))) {
       Register Tmp = MRI->createVirtualRegister(&SystemZ::GR32BitRegClass);
@@ -89,7 +90,10 @@ bool SystemZCopyPhysRegs::visitMBB(MachineBasicBlock &MBB) {
              SystemZ::AR32BitRegClass.contains(DstReg)) {
       Register Tmp = MRI->createVirtualRegister(&SystemZ::GR32BitRegClass);
       MI->getOperand(0).setReg(Tmp);
-      BuildMI(MBB, MBBI, DL, TII->get(SystemZ::SAR), DstReg).addReg(Tmp);
+      MachineInstr *NMI =
+          BuildMI(MBB, MBBI, DL, TII->get(SystemZ::SAR), DstReg).addReg(Tmp);
+      // SAR now writes the final value to DstReg, so update debug values.
+      MBB.getParent()->substituteDebugValuesForInst(*MI, *NMI);
       Modified = true;
     }
   }
diff --git a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
index 9f4d4aaa68fa3..789365fb9e311 100644
--- a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -227,6 +227,9 @@ bool SystemZElimCompare::convertToBRCT(
   // this is not necessary there.
   if (BRCT != SystemZ::BRCTH)
     MIB.addReg(SystemZ::CC, RegState::ImplicitDefine | RegState::Dead);
+  // The debug instr tracking for the counter now used by BRCT needs to be
+  // updated.
+ MI.getParent()->getParent()->substituteDebugValuesForInst(MI, *MIB); MI.eraseFromParent(); return true; } @@ -268,6 +271,9 @@ bool SystemZElimCompare::convertToLoadAndTrap( .add(MI.getOperand(1)) .add(MI.getOperand(2)) .add(MI.getOperand(3)); + // The debug instr tracking for the load target now used by the load-and-trap + // needs to be updated. + MI.getParent()->getParent()->substituteDebugValuesForInst(MI, *Branch); MI.eraseFromParent(); return true; } @@ -288,6 +294,9 @@ bool SystemZElimCompare::convertToLoadAndTest( for (const auto &MO : MI.operands()) MIB.add(MO); MIB.setMemRefs(MI.memoperands()); + // The debug instr tracking for the load target now needs to be updated + // because the load has moved to a new instruction. + MI.getParent()->getParent()->substituteDebugValuesForInst(MI, *MIB); MI.eraseFromParent(); // Mark instruction as not raising an FP exception if applicable. We already diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp index 36d76235398ed..f19b932f3c731 100644 --- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -374,16 +374,19 @@ void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode) { MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); - BuildMI(*MBB, MI, DL, TII->get(AddOpcode)) - .add(MI->getOperand(0)) - .add(MI->getOperand(1)) - .addImm(-1); + MachineInstr *AddImm = BuildMI(*MBB, MI, DL, TII->get(AddOpcode)) + .add(MI->getOperand(0)) + .add(MI->getOperand(1)) + .addImm(-1); MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL)) .addImm(SystemZ::CCMASK_ICMP) .addImm(SystemZ::CCMASK_CMP_NE) .add(MI->getOperand(2)); // The implicit use of CC is a killing use. BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo()); + // The result of the BRANCH ON COUNT MI is the new count in register 0, so the + // debug tracking needs to go to the result of the Add immediate. + MBB->getParent()->substituteDebugValuesForInst(*MI, *AddImm); MI->eraseFromParent(); } @@ -402,6 +405,8 @@ void SystemZLongBranch::splitCompareBranch(MachineInstr *MI, .add(MI->getOperand(3)); // The implicit use of CC is a killing use. BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo()); + // Since we are replacing branches that did not compute any value, no debug + // value substitution is necessary. MI->eraseFromParent(); } diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp index 4b16bcf95d51c..ffeba87795625 100644 --- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp +++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" using namespace llvm; @@ -108,15 +109,19 @@ void SystemZPostRewrite::selectSELRMux(MachineBasicBlock &MBB, bool DestIsHigh = SystemZ::isHighReg(DestReg); bool Src1IsHigh = SystemZ::isHighReg(Src1Reg); bool Src2IsHigh = SystemZ::isHighReg(Src2Reg); + // A copy instruction that we might create, held here for the purpose of + // debug instr value tracking. + MachineInstr *CopyInst = nullptr; // In rare cases both sources are the same register (after // machine-cse). This must be handled as it may lead to wrong-code (after // machine-cp) if the kill flag on Src1 isn't cleared (with // expandCondMove()).
if (Src1Reg == Src2Reg) { - BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), - TII->get(SystemZ::COPY), DestReg) - .addReg(Src1Reg, getRegState(Src1MO) & getRegState(Src2MO)); + CopyInst = BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(Src1Reg, getRegState(Src1MO) & getRegState(Src2MO)); + MBB.getParent()->substituteDebugValuesForInst(*MBBI, *CopyInst, 1); MBBI->eraseFromParent(); return; } @@ -126,21 +131,24 @@ void SystemZPostRewrite::selectSELRMux(MachineBasicBlock &MBB, // first. But only if this doesn't clobber the other source. if (DestReg != Src1Reg && DestReg != Src2Reg) { if (DestIsHigh != Src1IsHigh) { - BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), - TII->get(SystemZ::COPY), DestReg) - .addReg(Src1Reg, getRegState(Src1MO)); + CopyInst = BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(Src1Reg, getRegState(Src1MO)); Src1MO.setReg(DestReg); Src1Reg = DestReg; Src1IsHigh = DestIsHigh; } else if (DestIsHigh != Src2IsHigh) { - BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), - TII->get(SystemZ::COPY), DestReg) - .addReg(Src2Reg, getRegState(Src2MO)); + CopyInst = BuildMI(*MBBI->getParent(), MBBI, MBBI->getDebugLoc(), + TII->get(SystemZ::COPY), DestReg) + .addReg(Src2Reg, getRegState(Src2MO)); Src2MO.setReg(DestReg); Src2Reg = DestReg; Src2IsHigh = DestIsHigh; } } + // If a copy instruction was inserted, record the debug value substitution. + if (CopyInst) + MBB.getParent()->substituteDebugValuesForInst(*MBBI, *CopyInst, 1); // If the destination (now) matches one source, prefer this to be first. if (DestReg != Src1Reg && DestReg == Src2Reg) { @@ -204,8 +212,11 @@ bool SystemZPostRewrite::expandCondMove(MachineBasicBlock &MBB, // In MoveMBB, emit an instruction to move SrcReg into DestReg, // then fall through to RestMBB.
- BuildMI(*MoveMBB, MoveMBB->end(), DL, TII->get(SystemZ::COPY), DestReg) - .addReg(MI.getOperand(2).getReg(), getRegState(MI.getOperand(2))); + MachineInstr *CopyInst = + BuildMI(*MoveMBB, MoveMBB->end(), DL, TII->get(SystemZ::COPY), DestReg) + .addReg(MI.getOperand(2).getReg(), getRegState(MI.getOperand(2))); + // Record the debug value substitution for CopyInst. + MBB.getParent()->substituteDebugValuesForInst(*MBBI, *CopyInst, 1); MoveMBB->addSuccessor(RestMBB); NextMBBI = MBB.end(); diff --git a/llvm/test/CodeGen/SystemZ/Large/debug-instrref-brct.py b/llvm/test/CodeGen/SystemZ/Large/debug-instrref-brct.py new file mode 100644 index 0000000000000..36c3836bfca59 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/Large/debug-instrref-brct.py @@ -0,0 +1,32 @@ +# RUN: %python %s | llc -mtriple=s390x-linux-gnu -x mir --run-pass=systemz-long-branch \ +# RUN: | FileCheck %s + +# CHECK: debugValueSubstitutions: +# CHECK: - { srcinst: 1, srcop: 0, dstinst: 3, dstop: 0, subreg: 0 } +# CHECK: - { srcinst: 1, srcop: 3, dstinst: 3, dstop: 3, subreg: 0 } +# CHECK-NEXT: constants: [] +# CHECK: $r3l = AHI $r3l, -1 +# CHECK-NEXT: BRCL 14, 6, %bb.2 +print(" name: main") +print(" alignment: 16") +print(" tracksRegLiveness: true") +print(" liveins: ") +print(" - { reg: '$r1d', virtual-reg: '' }") +print(" - { reg: '$r2d', virtual-reg: '' }") +print(" - { reg: '$r3l', virtual-reg: '' }") +print(" - { reg: '$r4l', virtual-reg: '' }") +print(" debugValueSubstitutions: []") +print(" body: |") +print(" bb.0:") +print(" liveins: $r3l, $r4l, $r2d, $r3d") +print(" $r3l = BRCT $r3l, %bb.2, implicit-def $cc, debug-instr-number 1") +print(" J %bb.1, debug-instr-number 2") +print(" bb.1:") +print(" liveins: $r1d, $r2d") +for i in range(0, 8192): + print(" $r1d = LGR $r2d") + print(" $r2d = LGR $r1d") +print(" Return implicit $r2d") +print(" bb.2:") +print(" liveins: $r4l") +print(" Return implicit $r4l") diff --git a/llvm/test/CodeGen/SystemZ/debug-instrref-copyphysregs.mir b/llvm/test/CodeGen/SystemZ/debug-instrref-copyphysregs.mir new file mode 100644 index 0000000000000..ef0c4810731d6 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/debug-instrref-copyphysregs.mir @@ -0,0 +1,22 @@ +# Check that the backend properly tracks debug-instr-references across the +# copy-physregs pass. +# +# RUN: llc %s -mtriple=s390x-linux-gnu -run-pass=systemz-copy-physregs \ +# RUN: -o - 2>&1 | FileCheck %s + +# COPY 1: Copy VirtReg to AR +# COPY 2: Copy AR to VirtReg +# COPY 3: Copy CC to VirtReg +# CHECK: name: foo +# CHECK: debugValueSubstitutions: +# these are the correct substitutions +# CHECK-NEXT: - { srcinst: 1, srcop: 0, dstinst: 4, dstop: 0, subreg: 0 } +# we also need to make sure that these are the only substitutions +# CHECK-NEXT: constants: [] +name: foo +body: | + bb.0: + liveins: $a1 + COPY def $a1, %1:gr32bit, debug-instr-number 1 + COPY def %2:gr32bit, $a1, debug-instr-number 2 + COPY def %3:gr32bit, $cc, debug-instr-number 3 diff --git a/llvm/test/CodeGen/SystemZ/debug-instrref-elimcompare.mir b/llvm/test/CodeGen/SystemZ/debug-instrref-elimcompare.mir new file mode 100644 index 0000000000000..9382b7ad18fca --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/debug-instrref-elimcompare.mir @@ -0,0 +1,65 @@ +# Check that the backend properly tracks debug-instr-references across the +# elim-compare pass.
+# +# RUN: llc %s -mtriple=s390x-linux-gnu -mcpu=z14 -run-pass=systemz-elim-compare \ +# RUN: -o - 2>&1 | FileCheck %s + +# bb.0 - elimination of CHI, modification of BRC, no substitutions +# bb.1 - elimination of CHI, replacement of LR with LTR, one substitution +# bb.2 - elimination of L and CHI, modification of CondTrap into LAT, one substitution +# CHECK: name: foo +# CHECK: debugValueSubstitutions: +# these are the correct substitutions +# CHECK-NEXT: - { srcinst: 5, srcop: 0, dstinst: 13, dstop: 0, subreg: 0 } +# CHECK-NEXT: - { srcinst: 7, srcop: 0, dstinst: 9, dstop: 0, subreg: 0 } +# CHECK-NEXT: - { srcinst: 10, srcop: 0, dstinst: 14, dstop: 0, subreg: 0 } +# we also need to make sure that these are the only substitutions +# CHECK-NEXT: constants: [] +--- +name: foo +tracksRegLiveness: true +liveins: + - { reg: '$r2l', virtual-reg: '' } + - { reg: '$r3l', virtual-reg: '' } + - { reg: '$r4l', virtual-reg: '' } + - { reg: '$r5d', virtual-reg: '' } +debugValueSubstitutions: [] +body: | + bb.0: + successors: %bb.1(0x80000000) + liveins: $r2l, $r3l, $r4l, $r5d + + renamable $r3l = nsw AR killed renamable $r3l, renamable $r2l, implicit-def dead $cc, debug-instr-number 1 + CHI renamable $r3l, 0, implicit-def $cc, debug-instr-number 2 + BRC 14, 12, %bb.1, implicit $cc, debug-instr-number 3 + + bb.1: + successors: %bb.2(0x80000000) + liveins: $r2l, $r3l, $r4l, $r5d + + CHI renamable $r2l, 0, implicit-def $cc, debug-instr-number 4 + renamable $r3l = LR renamable $r2l, debug-instr-number 5 + BRC 14, 8, %bb.2, implicit killed $cc, debug-instr-number 6 + + bb.2: + successors: %bb.3(0x80000000) + liveins: $r2l, $r3l, $r4l, $r5d + + renamable $r2l = L killed renamable $r5d, 0, $noreg, debug-instr-number 7 + CHI renamable $r2l, 0, implicit-def $cc, debug-instr-number 8 + CondTrap 14, 8, implicit killed $cc, debug-instr-number 9 + J %bb.3 + + bb.3: + successors: %bb.4(0x80000000) + liveins: $r2l, $r3l, $r4l, $r5d + + renamable $r3l = L renamable $r5d, 0, $noreg, debug-instr-number 10 + CHI renamable $r3l, 0, implicit-def $cc, debug-instr-number 11 + BRC 14, 8, %bb.4, implicit killed $cc, debug-instr-number 12 + + bb.4: + $r2l = LHI 2 + Return implicit $r2l + +... diff --git a/llvm/test/CodeGen/SystemZ/debug-instrref-postrewrite.mir b/llvm/test/CodeGen/SystemZ/debug-instrref-postrewrite.mir new file mode 100644 index 0000000000000..a0bb2c1b9ed83 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/debug-instrref-postrewrite.mir @@ -0,0 +1,24 @@ +# Check that the backend properly tracks debug-instr-references across the +# post-rewrite pass.
+# +# RUN: llc %s -mtriple=s390x-linux-gnu -run-pass=systemz-post-rewrite \ +# RUN: -o - 2>&1 | FileCheck %s + +# SELRMux 1: simple replace with copy +# SELRMux 2: simple mutation into selfhr +# SELRMux 3: replace with if-then-else without prior copy +# SELRMux 4: replace with if-then-else with prior copy +# CHECK: name: foo +# CHECK: debugValueSubstitutions: +# CHECK-NEXT: - { srcinst: 1, srcop: 0, dstinst: 5, dstop: 0, subreg: 0 } +# CHECK-NEXT: - { srcinst: 3, srcop: 0, dstinst: 6, dstop: 0, subreg: 0 } +# CHECK-NEXT: - { srcinst: 4, srcop: 0, dstinst: 7, dstop: 0, subreg: 0 } +# CHECK-NEXT: - { srcinst: 4, srcop: 0, dstinst: 8, dstop: 0, subreg: 0 } +name: foo +body: | + bb.0: + liveins: $r2h, $r3h, $r2l, $r3l, $cc + SELRMux def $r2h, renamable $r3l, renamable $r3l, 1, 2, implicit $cc, debug-instr-number 1 + SELRMux def $r1h, renamable $r2h, renamable $r3h, 1, 2, implicit $cc, debug-instr-number 2 + SELRMux def $r2h, renamable $r2h, renamable $r3l, 1, 2, implicit $cc, debug-instr-number 3 + SELRMux def $r1h, renamable $r2l, renamable $r3l, 1, 2, implicit $cc, debug-instr-number 4 From bc3b1b06c6e59a0de5b4b3607816a9255ca01df9 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 31 Mar 2025 10:38:55 -0700 Subject: [PATCH 0123/1029] [mlir][memref] Fix build after #132545 (#133760) There was a typo in the error message. --- mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp index 4537977226087..cd92026562da9 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp @@ -370,7 +370,7 @@ struct SubViewOpInterface builder.create<cf::AssertOp>( loc, lastPosInBounds, RuntimeVerifiableOpInterface::generateErrorMessage( - op, "Subview runs out-of-bounds along dimension" + + op, "subview runs out-of-bounds along dimension " + std::to_string(i))); } } From 7eb99b85995a49972f7a3d5c68cf3543014a787b Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Mon, 31 Mar 2025 17:33:57 +0000 Subject: [PATCH 0124/1029] [AIX][PGO] Add testcase for D136192 Reviewed By: mandlebug Differential Revision: https://reviews.llvm.org/D136192 --- .../AIX/pgo-lto-bcdtor-function-section.test | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100755 compiler-rt/test/profile/AIX/pgo-lto-bcdtor-function-section.test diff --git a/compiler-rt/test/profile/AIX/pgo-lto-bcdtor-function-section.test b/compiler-rt/test/profile/AIX/pgo-lto-bcdtor-function-section.test new file mode 100755 index 0000000000000..36170716b56c2 --- /dev/null +++ b/compiler-rt/test/profile/AIX/pgo-lto-bcdtor-function-section.test @@ -0,0 +1,119 @@ +// RUN: split-file %s %t +// RUN: cd %t +// + +//--- foo.c +int foo() { return 3; } + +//--- main.c +int foo(); +int main() { return foo() - 3; } + +// # no LTO +// ## PGO, with and without function-sections, and all permutations of -bcdtors +// RUN: %clang_pgogen -O2 -c -fno-function-sections foo.c +// RUN: %clang_pgogen -O2 -c -fno-function-sections main.c +// +// RUN: %clang_pgogen -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=BOTH +// +// RUN: %clang_pgogen -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=BOTH

// RUN:
%clang_pgogen -O2 -c -ffunction-sections foo.c +// RUN: %clang_pgogen -O2 -c -ffunction-sections main.c +// +// RUN: %clang_pgogen -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=BOTH +// +// RUN: %clang_pgogen -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=BOTH + +// ## no PGO at compile step, but PGO at link step. +// RUN: %clang -O2 -c foo.c +// RUN: %clang -O2 -c main.c +// +// RUN: %clang_pgogen -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=NONE +// +// RUN: %clang_pgogen -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=NONE + +// # LTO, with and without function-sections, and all permutations of -bcdtors +// ## LTO one file, no PGO at compile, PGO at link +// RUN: %clang -O2 -c foo.c +// RUN: %clang -O2 -c -flto main.c +// +// RUN: %clang_pgogen -flto -fno-function-sections -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=NONE +// +// RUN: %clang_pgogen -flto -fno-function-sections -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=NONE +// +// RUN: %clang_pgogen -flto -ffunction-sections -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=NONE +// +// RUN: %clang_pgogen -flto -ffunction-sections -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=NONE + +// ## LTO one file, PGO at compile and link +// RUN: %clang -O2 -c -fno-function-sections foo.c +// RUN: %clang_pgogen -O2 -c -flto main.c +// RUN: %clang_pgogen -flto -fno-function-sections -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=MAIN + +// RUN: %clang -O2 -c -flto foo.c +// RUN: %clang_pgogen -O2 -c -fno-function-sections main.c +// RUN: %clang_pgogen -flto -fno-function-sections -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=MAIN + +// RUN: %clang -O2 -c -flto foo.c +// RUN: %clang_pgogen -O2 -c -ffunction-sections main.c +// RUN: %clang_pgogen -flto -ffunction-sections -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=MAIN + +// RUN: %clang -O2 -c -ffunction-sections foo.c +// RUN: %clang_pgogen -O2 -c -flto main.c +// RUN: %clang_pgogen -flto -ffunction-sections -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=MAIN + +// ## LTO and PGO both files +// RUN: %clang_pgogen -O2 -c -flto foo.c +// RUN: %clang_pgogen -O2 -c -flto main.c +// +// RUN: %clang_pgogen -flto -fno-function-sections -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s 
-check-prefix=BOTH +// +// RUN: %clang_pgogen -flto -fno-function-sections -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=BOTH +// +// RUN: %clang_pgogen -flto -ffunction-sections -Wl,-bcdtors:all foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=BOTH +// +// RUN: %clang_pgogen -flto -ffunction-sections -Wl,-bcdtors:mbr foo.o main.o +// RUN: rm -f default* && %run a.out +// RUN: llvm-profdata show --all-functions default* | FileCheck %s -check-prefix=BOTH + +// BOTH-DAG: foo: +// BOTH-DAG: main: +// MAIN-NOT: foo: +// MAIN: main: +// MAIN-NOT: foo: +// NONE: Total functions: 0 From 143c37123b93a3dbd0fafd0296516ac1ab2afc36 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Mon, 31 Mar 2025 19:41:29 +0200 Subject: [PATCH 0125/1029] [CIR] Upstream zero init for global variables (#133100) This change adds zero initialization for global variables --- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 24 ++++++++ clang/lib/CIR/CodeGen/CIRGenModule.cpp | 7 ++- clang/test/CIR/CodeGen/array.cpp | 8 +-- clang/test/CIR/Lowering/array.cpp | 8 +-- clang/test/CIR/Lowering/global-var-simple.cpp | 58 +++++++++---------- clang/test/CIR/Lowering/hello.c | 2 +- clang/test/CIR/global-var-simple.cpp | 52 ++++++++--------- 7 files changed, 92 insertions(+), 67 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index ac7658276ec37..8b17cb7446afa 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -67,6 +67,30 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return create<cir::ConstantOp>(loc, attr.getType(), attr); } + mlir::TypedAttr getConstNullPtrAttr(mlir::Type t) { + assert(mlir::isa<cir::PointerType>(t) && "expected cir.ptr"); + return getConstPtrAttr(t, 0); + } + + mlir::TypedAttr getZeroAttr(mlir::Type t) { + return cir::ZeroAttr::get(getContext(), t); + } + + mlir::TypedAttr getZeroInitAttr(mlir::Type ty) { + if (mlir::isa<cir::IntType>(ty)) + return cir::IntAttr::get(ty, 0); + if (cir::isAnyFloatingPointType(ty)) + return cir::FPAttr::getZero(ty); + if (auto arrTy = mlir::dyn_cast<cir::ArrayType>(ty)) + return getZeroAttr(arrTy); + if (auto ptrTy = mlir::dyn_cast<cir::PointerType>(ty)) + return getConstNullPtrAttr(ptrTy); + if (mlir::isa<cir::BoolType>(ty)) { + return getCIRBoolAttr(false); + } + llvm_unreachable("Zero initializer for given type is NYI"); + } + cir::ConstantOp getBool(bool state, mlir::Location loc) { return create<cir::ConstantOp>(loc, getBoolTy(), getCIRBoolAttr(state)); } diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 9776a4e09f9e0..2a37d6c7d1888 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -140,17 +140,20 @@ void CIRGenModule::emitGlobalVarDefinition(const clang::VarDecl *vd, // certain constant expressions is implemented for now.
const VarDecl *initDecl; const Expr *initExpr = vd->getAnyInitializer(initDecl); + mlir::Attribute initializer; if (initExpr) { - mlir::Attribute initializer; if (APValue *value = initDecl->evaluateValue()) { ConstantEmitter emitter(*this); initializer = emitter.tryEmitPrivateForMemory(*value, astTy); } else { errorNYI(initExpr->getSourceRange(), "non-constant initializer"); } - varOp.setInitialValueAttr(initializer); + } else { + initializer = builder.getZeroInitAttr(convertType(astTy)); } + varOp.setInitialValueAttr(initializer); + // Set CIR's linkage type as appropriate. cir::GlobalLinkageKind linkage = getCIRLinkageVarDefinition(vd, /*IsConstant=*/false); diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp index a59880352e050..1e74275eab058 100644 --- a/clang/test/CIR/CodeGen/array.cpp +++ b/clang/test/CIR/CodeGen/array.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - 2>&1 | FileCheck %s int a[10]; -// CHECK: cir.global external @a : !cir.array<!s32i x 10> +// CHECK: cir.global external @a = #cir.zero : !cir.array<!s32i x 10> int aa[10][5]; -// CHECK: cir.global external @aa : !cir.array<!cir.array<!s32i x 5> x 10> +// CHECK: cir.global external @aa = #cir.zero : !cir.array<!cir.array<!s32i x 5> x 10> extern int b[10]; -// CHECK: cir.global external @b : !cir.array<!s32i x 10> +// CHECK: cir.global external @b = #cir.zero : !cir.array<!s32i x 10> extern int bb[10][5]; -// CHECK: cir.global external @bb : !cir.array<!cir.array<!s32i x 5> x 10> +// CHECK: cir.global external @bb = #cir.zero : !cir.array<!cir.array<!s32i x 5> x 10> int c[10] = {}; // CHECK: cir.global external @c = #cir.zero : !cir.array<!s32i x 10> diff --git a/clang/test/CIR/Lowering/array.cpp b/clang/test/CIR/Lowering/array.cpp index 763980b9124a3..4fb996aefe79e 100644 --- a/clang/test/CIR/Lowering/array.cpp +++ b/clang/test/CIR/Lowering/array.cpp @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - 2>&1 | FileCheck %s int a[10]; -// CHECK: @a = external dso_local global [10 x i32] +// CHECK: @a = dso_local global [10 x i32] zeroinitializer int aa[10][5]; -// CHECK: @aa = external dso_local global [10 x [5 x i32]] +// CHECK: @aa = dso_local global [10 x [5 x i32]] zeroinitializer extern int b[10]; -// CHECK: @b = external dso_local global [10 x i32] +// CHECK: @b = dso_local global [10 x i32] zeroinitializer extern int bb[10][5]; -// CHECK: @bb = external dso_local global [10 x [5 x i32]] +// CHECK: @bb = dso_local global [10 x [5 x i32]] zeroinitializer int c[10] = {}; // CHECK: @c = dso_local global [10 x i32] zeroinitializer diff --git a/clang/test/CIR/Lowering/global-var-simple.cpp b/clang/test/CIR/Lowering/global-var-simple.cpp index ab8c6660a311b..33b418430d478 100644 --- a/clang/test/CIR/Lowering/global-var-simple.cpp +++ b/clang/test/CIR/Lowering/global-var-simple.cpp @@ -1,21 +1,19 @@ // Global variables of integral types -// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s -// Note: Currently unsupported features include default zero-initialization -// and alignment. The fact that "external" is only printed for globals -// without an initializer is a quirk of the LLVM AsmWriter. +// Note: Currently unsupported features include alignment.
char c; -// CHECK: @c = external dso_local global i8 +// CHECK: @c = dso_local global i8 0 signed char sc; -// CHECK: @sc = external dso_local global i8 +// CHECK: @sc = dso_local global i8 0 unsigned char uc; -// CHECK: @uc = external dso_local global i8 +// CHECK: @uc = dso_local global i8 0 short ss; -// CHECK: @ss = external dso_local global i16 +// CHECK: @ss = dso_local global i16 0 unsigned short us = 100; // CHECK: @us = dso_local global i16 100 @@ -24,82 +22,82 @@ int si = 42; // CHECK: @si = dso_local global i32 42 unsigned ui; -// CHECK: @ui = external dso_local global i32 +// CHECK: @ui = dso_local global i32 0 long sl; -// CHECK: @sl = external dso_local global i64 +// CHECK: @sl = dso_local global i64 0 unsigned long ul; -// CHECK: @ul = external dso_local global i64 +// CHECK: @ul = dso_local global i64 0 long long sll; -// CHECK: @sll = external dso_local global i64 +// CHECK: @sll = dso_local global i64 0 unsigned long long ull = 123456; // CHECK: @ull = dso_local global i64 123456 __int128 s128; -// CHECK: @s128 = external dso_local global i128 +// CHECK: @s128 = dso_local global i128 0 unsigned __int128 u128; -// CHECK: @u128 = external dso_local global i128 +// CHECK: @u128 = dso_local global i128 0 wchar_t wc; -// CHECK: @wc = external dso_local global i32 +// CHECK: @wc = dso_local global i32 0 char8_t c8; -// CHECK: @c8 = external dso_local global i8 +// CHECK: @c8 = dso_local global i8 0 char16_t c16; -// CHECK: @c16 = external dso_local global i16 +// CHECK: @c16 = dso_local global i16 0 char32_t c32; -// CHECK: @c32 = external dso_local global i32 +// CHECK: @c32 = dso_local global i32 0 _BitInt(20) sb20; -// CHECK: @sb20 = external dso_local global i20 +// CHECK: @sb20 = dso_local global i20 0 unsigned _BitInt(48) ub48; -// CHECK: @ub48 = external dso_local global i48 +// CHECK: @ub48 = dso_local global i48 0 bool boolfalse = false; // CHECK: @boolfalse = dso_local global i8 0 _Float16 f16; -// CHECK: @f16 = external dso_local global half +// CHECK: @f16 = dso_local global half __bf16 bf16; -// CHECK: @bf16 = external dso_local global bfloat +// CHECK: @bf16 = dso_local global bfloat float f; -// CHECK: @f = external dso_local global float +// CHECK: @f = dso_local global float 0.000000e+00 double d = 1.25; // CHECK: @d = dso_local global double 1.250000e+00 long double ld; -// CHECK: @ld = external dso_local global x86_fp80 +// CHECK: @ld = dso_local global x86_fp80 0xK00 __float128 f128; -// CHECK: @f128 = external dso_local global fp128 +// CHECK: @f128 = dso_local global fp128 0xL00 void *vp; -// CHECK: @vp = external dso_local global ptr{{$}} +// CHECK: @vp = dso_local global ptr null int *ip = 0; // CHECK: @ip = dso_local global ptr null double *dp; -// CHECK: @dp = external dso_local global ptr{{$}} +// CHECK: @dp = dso_local global ptr null char **cpp; -// CHECK: @cpp = external dso_local global ptr{{$}} +// CHECK: @cpp = dso_local global ptr null void (*fp)(); -// CHECK: @fp = external dso_local global ptr{{$}} +// CHECK: @fp = dso_local global ptr null int (*fpii)(int) = 0; // CHECK: @fpii = dso_local global ptr null void (*fpvar)(int, ...); -// CHECK: @fpvar = external dso_local global ptr{{$}} +// CHECK: @fpvar = dso_local global ptr null diff --git a/clang/test/CIR/Lowering/hello.c b/clang/test/CIR/Lowering/hello.c index ff78b6e6f6a5e..f45beafdcb533 100644 --- a/clang/test/CIR/Lowering/hello.c +++ b/clang/test/CIR/Lowering/hello.c @@ -3,7 +3,7 @@ int a; -// CHECK: @a = external dso_local global i32 +// CHECK: @a = dso_local global i32 0 int b = 2; 
diff --git a/clang/test/CIR/global-var-simple.cpp b/clang/test/CIR/global-var-simple.cpp index 020ef5f09c650..9a52925303504 100644 --- a/clang/test/CIR/global-var-simple.cpp +++ b/clang/test/CIR/global-var-simple.cpp @@ -2,16 +2,16 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o - | FileCheck %s char c; -// CHECK: cir.global external @c : !s8i +// CHECK: cir.global external @c = #cir.int<0> : !s8i signed char sc; -// CHECK: cir.global external @sc : !s8i +// CHECK: cir.global external @sc = #cir.int<0> : !s8i unsigned char uc; -// CHECK: cir.global external @uc : !u8i +// CHECK: cir.global external @uc = #cir.int<0> : !u8i short ss; -// CHECK: cir.global external @ss : !s16i +// CHECK: cir.global external @ss = #cir.int<0> : !s16i unsigned short us = 100; // CHECK: cir.global external @us = #cir.int<100> : !u16i @@ -20,82 +20,82 @@ int si = 42; // CHECK: cir.global external @si = #cir.int<42> : !s32i unsigned ui; -// CHECK: cir.global external @ui : !u32i +// CHECK: cir.global external @ui = #cir.int<0> : !u32i long sl; -// CHECK: cir.global external @sl : !s64i +// CHECK: cir.global external @sl = #cir.int<0> : !s64i unsigned long ul; -// CHECK: cir.global external @ul : !u64i +// CHECK: cir.global external @ul = #cir.int<0> : !u64i long long sll; -// CHECK: cir.global external @sll : !s64i +// CHECK: cir.global external @sll = #cir.int<0> : !s64i unsigned long long ull = 123456; // CHECK: cir.global external @ull = #cir.int<123456> : !u64i __int128 s128; -// CHECK: cir.global external @s128 : !s128i +// CHECK: cir.global external @s128 = #cir.int<0> : !s128i unsigned __int128 u128; -// CHECK: cir.global external @u128 : !u128i +// CHECK: cir.global external @u128 = #cir.int<0> : !u128i wchar_t wc; -// CHECK: cir.global external @wc : !s32i +// CHECK: cir.global external @wc = #cir.int<0> : !s32i char8_t c8; -// CHECK: cir.global external @c8 : !u8i +// CHECK: cir.global external @c8 = #cir.int<0> : !u8i char16_t c16; -// CHECK: cir.global external @c16 : !u16i +// CHECK: cir.global external @c16 = #cir.int<0> : !u16i char32_t c32; -// CHECK: cir.global external @c32 : !u32i +// CHECK: cir.global external @c32 = #cir.int<0> : !u32i _BitInt(20) sb20; -// CHECK: cir.global external @sb20 : !cir.int +// CHECK: cir.global external @sb20 = #cir.int<0> : !cir.int unsigned _BitInt(48) ub48; -// CHECK: cir.global external @ub48 : !cir.int +// CHECK: cir.global external @ub48 = #cir.int<0> : !cir.int bool boolfalse = false; // CHECK: cir.global external @boolfalse = #false _Float16 f16; -// CHECK: cir.global external @f16 : !cir.f16 +// CHECK: cir.global external @f16 = #cir.fp<0.000000e+00> : !cir.f16 __bf16 bf16; -// CHECK: cir.global external @bf16 : !cir.bf16 +// CHECK: cir.global external @bf16 = #cir.fp<0.000000e+00> : !cir.bf16 float f; -// CHECK: cir.global external @f : !cir.float +// CHECK: cir.global external @f = #cir.fp<0.000000e+00> : !cir.float double d = 1.25; // CHECK: cir.global external @d = #cir.fp<1.250000e+00> : !cir.double long double ld; -// CHECK: cir.global external @ld : !cir.long_double +// CHECK: cir.global external @ld = #cir.fp<0.000000e+00> : !cir.long_double __float128 f128; -// CHECK: cir.global external @f128 : !cir.f128 +// CHECK: cir.global external @f128 = #cir.fp<0.000000e+00> : !cir.f128 void *vp; -// CHECK: cir.global external @vp : !cir.ptr +// CHECK: cir.global external @vp = #cir.ptr : !cir.ptr int *ip = 0; // CHECK: cir.global external @ip = #cir.ptr : !cir.ptr double *dp; -// CHECK: cir.global external @dp : 
!cir.ptr +// CHECK: cir.global external @dp = #cir.ptr : !cir.ptr char **cpp; -// CHECK: cir.global external @cpp : !cir.ptr> +// CHECK: cir.global external @cpp = #cir.ptr : !cir.ptr> void (*fp)(); -// CHECK: cir.global external @fp : !cir.ptr> +// CHECK: cir.global external @fp = #cir.ptr : !cir.ptr> int (*fpii)(int) = 0; // CHECK: cir.global external @fpii = #cir.ptr : !cir.ptr !s32i>> void (*fpvar)(int, ...); -// CHECK: cir.global external @fpvar : !cir.ptr> +// CHECK: cir.global external @fpvar = #cir.ptr : !cir.ptr> From 2d7add6e2e56baf46504a8a22dec42b61f63360f Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Mon, 31 Mar 2025 10:44:40 -0700 Subject: [PATCH 0126/1029] [LLD][ELF] Allow memory region in OVERLAY (#133540) This allows the contents of OVERLAYs to be attributed to memory regions. This is the only clean way to overlap VMAs in linker scripts that choose to primarily use memory regions to lay out addresses. This also simplifies OVERLAY expansion to better match GNU LD. Expressions for the first section's LMA and VMA are not generated if the user did not provide them. This allows the LMA/VMA offset to be preserved across multiple overlays in the same region, as with regular sections. Closes #129816 --- lld/ELF/LinkerScript.cpp | 15 +++++++++++- lld/ELF/LinkerScript.h | 1 + lld/ELF/OutputSections.h | 1 + lld/ELF/ScriptParser.cpp | 23 ++++++++++-------- lld/test/ELF/linkerscript/overlay.test | 33 ++++++++++++++++++++++++++ 5 files changed, 62 insertions(+), 11 deletions(-) diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index e19823f2ea752..8149513d821e4 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -182,7 +182,18 @@ void LinkerScript::expandMemoryRegions(uint64_t size) { void LinkerScript::expandOutputSection(uint64_t size) { state->outSec->size += size; - expandMemoryRegions(size); + size_t regionSize = size; + if (state->outSec->inOverlay) { + // Expand the overlay if necessary, and expand the region by the + // corresponding amount. + if (state->outSec->size > state->overlaySize) { + regionSize = state->outSec->size - state->overlaySize; + state->overlaySize = state->outSec->size; + } else { + regionSize = 0; + } + } + expandMemoryRegions(regionSize); } void LinkerScript::setDot(Expr e, const Twine &loc, bool inSec) { @@ -1218,6 +1229,8 @@ bool LinkerScript::assignOffsets(OutputSection *sec) { // We can call this method multiple times during the creation of // thunks and want to start over calculation each time. sec->size = 0; + if (sec->firstInOverlay) + state->overlaySize = 0; // We visited SectionsCommands from processSectionCommands to // layout sections. Now, we visit SectionsCommands again to fix diff --git a/lld/ELF/LinkerScript.h b/lld/ELF/LinkerScript.h index 0a2dda13f4ef8..80c4f564afabc 100644 --- a/lld/ELF/LinkerScript.h +++ b/lld/ELF/LinkerScript.h @@ -311,6 +311,7 @@ class LinkerScript final { MemoryRegion *lmaRegion = nullptr; uint64_t lmaOffset = 0; uint64_t tbssAddr = 0; + uint64_t overlaySize; }; Ctx &ctx; diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h index 3ab36a21ce488..77654f85ca78c 100644 --- a/lld/ELF/OutputSections.h +++ b/lld/ELF/OutputSections.h @@ -102,6 +102,7 @@ class OutputSection final : public SectionBase { bool expressionsUseSymbols = false; bool usedInExpression = false; bool inOverlay = false; + bool firstInOverlay = false; // Tracks whether the section has ever had an input section added to it, even // if the section was later removed (e.g. 
because it is a synthetic section diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 4c52bfda7a70e..4345b7bac1173 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -561,37 +561,40 @@ void ScriptParser::readSearchDir() { // https://sourceware.org/binutils/docs/ld/Overlay-Description.html#Overlay-Description SmallVector ScriptParser::readOverlay() { Expr addrExpr; - if (consume(":")) { - addrExpr = [s = ctx.script] { return s->getDot(); }; - } else { + if (!consume(":")) { addrExpr = readExpr(); expect(":"); } - // When AT is omitted, LMA should equal VMA. script->getDot() when evaluating - // lmaExpr will ensure this, even if the start address is specified. - Expr lmaExpr = consume("AT") ? readParenExpr() - : [s = ctx.script] { return s->getDot(); }; + Expr lmaExpr = consume("AT") ? readParenExpr() : Expr{}; expect("{"); SmallVector v; OutputSection *prev = nullptr; while (!errCount(ctx) && !consume("}")) { // VA is the same for all sections. The LMAs are consecutive in memory - // starting from the base load address specified. + // starting from the base load address. OutputDesc *osd = readOverlaySectionDescription(); osd->osec.addrExpr = addrExpr; if (prev) { osd->osec.lmaExpr = [=] { return prev->getLMA() + prev->size; }; } else { osd->osec.lmaExpr = lmaExpr; - // Use first section address for subsequent sections as initial addrExpr - // can be DOT. Ensure the first section, even if empty, is not discarded. + // Use first section address for subsequent sections. Ensure the first + // section, even if empty, is not discarded. osd->osec.usedInExpression = true; addrExpr = [=]() -> ExprValue { return {&osd->osec, false, 0, ""}; }; } v.push_back(osd); prev = &osd->osec; } + if (!v.empty()) + static_cast<OutputDesc *>(v.front())->osec.firstInOverlay = true; + if (consume(">")) { + StringRef regionName = readName(); + for (SectionCommand *od : v) + static_cast<OutputDesc *>(od)->osec.memoryRegionName = + std::string(regionName); + } // According to the specification, at the end of the overlay, the location counter should be equal to the overlay base address plus size of the diff --git a/lld/test/ELF/linkerscript/overlay.test b/lld/test/ELF/linkerscript/overlay.test index 7c64303b45659..e230134ad5541 100644 --- a/lld/test/ELF/linkerscript/overlay.test +++ b/lld/test/ELF/linkerscript/overlay.test @@ -41,6 +41,23 @@ # ERR2-NEXT:>>> .out.aaa { *(.aaa) } > AX AT>FLASH # ERR2-NEXT:>>> ^ +# RUN: ld.lld a.o -T region.t -o region +# RUN: llvm-readelf --sections -l region | FileCheck --check-prefix=REGION %s + +# REGION: Name Type Address Off Size +# REGION: .big1 PROGBITS 0000000000001000 001000 000008 +# REGION-NEXT: .small1 PROGBITS 0000000000001000 002000 000004 +# REGION: .big2 PROGBITS 0000000000001008 002008 000008 +# REGION-NEXT: .small2 PROGBITS 0000000000001008 003008 000004 +# REGION-NEXT: .text PROGBITS 0000000000001010 003010 000001 + +# REGION: Program Headers: +# REGION: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# REGION-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000008 0x000008 R 0x1000 +# REGION-NEXT: LOAD 0x002000 0x0000000000001000 0x0000000000001008 0x000010 0x000010 R 0x1000 +# REGION-NEXT: LOAD 0x003008 0x0000000000001008 0x0000000000001018 0x000004 0x000004 R 0x1000 +# REGION-NEXT: LOAD 0x003010 0x0000000000001010 0x0000000000001020 0x000001 0x000001 R E 0x1000 + #--- a.s .globl _start _start: @@ -76,6 +93,22 @@ SECTIONS { .text : { *(.text) } } +#--- region.t +MEMORY { region : ORIGIN = 0x1000, LENGTH = 0x1000 } +SECTIONS { +##
Memory region instead of explicit address. + OVERLAY : { + .big1 { *(.big1) } + .small1 { *(.small1) } + } >region + OVERLAY : { + .big2 { *(.big2) } + .small2 { *(.small2) } + } >region + .text : { *(.text) } >region + /DISCARD/ : { *(.big* .small*) } +} + #--- err1.t SECTIONS { OVERLAY 0x1000 : AT ( 0x2000 ) { From bfd8cc0a3e82c5e6345a66dd5db5242accb6874b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 31 Mar 2025 10:31:18 -0700 Subject: [PATCH 0127/1029] [SLP]Fix a check for the whole register use Need to check the value type, not the return type, of the instructions, when doing the analysis for the whole register use to prevent a compiler crash. Fixes #133751 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 3 +- .../X86/cmp-values-non-full-registers.ll | 41 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/cmp-values-non-full-registers.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a4b0378abc075..0679eac176584 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8854,7 +8854,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. if ((UserTreeIdx.UserTE && UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) || - !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) { + !hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), + VL.size())) { LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " "for nodes with padding.\n"); auto Invalid = ScheduleBundle::invalid(); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp-values-non-full-registers.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp-values-non-full-registers.ll new file mode 100644 index 0000000000000..35b49944541b2 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp-values-non-full-registers.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-grtev4-linux-gnu < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[BB1:.*]], label %[[BB2:.*]] +; CHECK: [[BB2]]: +; CHECK-NEXT: [[CMP14_US:%.*]] = fcmp ogt x86_fp80 0xK00000000000000000000, 0xK00000000000000000000 +; CHECK-NEXT: [[IMAX_2_US:%.*]] = select i1 [[CMP14_US]], i64 0, i64 0 +; CHECK-NEXT: [[JMAX_2_US:%.*]] = select i1 [[CMP14_US]], i64 0, i64 0 +; CHECK-NEXT: [[CMP13_US:%.*]] = fcmp olt x86_fp80 0xK00000000000000000000, 0xK00000000000000000000 +; CHECK-NEXT: [[IMIN_2_US:%.*]] = select i1 [[CMP13_US]], i64 0, i64 0 +; CHECK-NEXT: [[JMIN_2_US:%.*]] = select i1 [[CMP13_US]], i64 0, i64 0 +; CHECK-NEXT: br label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[STOREMERGE64:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IMIN_2_US]], %[[BB2]] ] +; CHECK-NEXT: [[STOREMERGE63:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[JMIN_2_US]], %[[BB2]] ] +; CHECK-NEXT: [[STOREMERGE62:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IMAX_2_US]], %[[BB2]] ] +; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[JMAX_2_US]], %[[BB2]] ] +; CHECK-NEXT: ret void +; +entry: + br i1 false, label %bb1, label %bb2 + +bb2: + %cmp14.us = fcmp ogt x86_fp80 0xK00000000000000000000, 0xK00000000000000000000 + %imax.2.us = select i1 %cmp14.us, i64 0, i64 0
%jmax.2.us = select i1 %cmp14.us, i64 0, i64 0 + %cmp13.us = fcmp olt x86_fp80 0xK00000000000000000000, 0xK00000000000000000000 + %imin.2.us = select i1 %cmp13.us, i64 0, i64 0 + %jmin.2.us = select i1 %cmp13.us, i64 0, i64 0 + br label %bb1 + +bb1: + %storemerge64 = phi i64 [ 0, %entry ], [ %imin.2.us, %bb2 ] + %storemerge63 = phi i64 [ 0, %entry ], [ %jmin.2.us, %bb2 ] + %storemerge62 = phi i64 [ 0, %entry ], [ %imax.2.us, %bb2 ] + %storemerge = phi i64 [ 0, %entry ], [ %jmax.2.us, %bb2 ] + ret void +} From 980d66caae62de9b56422a2fdce3f535c2ab325f Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 31 Mar 2025 10:58:24 -0700 Subject: [PATCH 0128/1029] [llvm-exegesis] Error Out If Perf Counter is Not Fully Enabled (#132892) Perf counters can be multiplexed if there are too many that need to be scheduled on a core at the same time (and they exceed the available PMUs). Other processes (especially system ones in certain environments, not commonly on Desktop Linux from what I've seen) can also interfere. This will impact the measurement fidelity as the counter is not actually counting cycles/uops the entire time. This patch makes it so that we error out in these cases so the user gets a visible indication things have gone wrong rather than things failing silently. --- llvm/tools/llvm-exegesis/lib/Error.cpp | 10 ++++++++++ llvm/tools/llvm-exegesis/lib/Error.h | 12 ++++++++++++ llvm/tools/llvm-exegesis/lib/PerfHelper.cpp | 16 ++++++++++++---- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/llvm/tools/llvm-exegesis/lib/Error.cpp b/llvm/tools/llvm-exegesis/lib/Error.cpp index 2eee3f2f54c59..2908df25ddb1a 100644 --- a/llvm/tools/llvm-exegesis/lib/Error.cpp +++ b/llvm/tools/llvm-exegesis/lib/Error.cpp @@ -49,5 +49,15 @@ void SnippetSignal::log(raw_ostream &OS) const { #endif // LLVM_ON_UNIX } +char PerfCounterNotFullyEnabled::ID; + +std::error_code PerfCounterNotFullyEnabled::convertToErrorCode() const { + return inconvertibleErrorCode(); +} + +void PerfCounterNotFullyEnabled::log(raw_ostream &OS) const { + OS << "The perf counter was not scheduled on the CPU the entire time."; +} + } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/Error.h b/llvm/tools/llvm-exegesis/lib/Error.h index 4a3e48997f24f..9b71fe8f56897 100644 --- a/llvm/tools/llvm-exegesis/lib/Error.h +++ b/llvm/tools/llvm-exegesis/lib/Error.h @@ -76,6 +76,18 @@ class SnippetSignal : public SnippetExecutionFailure { int SignalNumber; }; +// A class representing a case where a perf counter was only partially +// scheduled, most likely due to perf counter contention. 
+struct PerfCounterNotFullyEnabled + : public ErrorInfo<PerfCounterNotFullyEnabled> { + static char ID; + PerfCounterNotFullyEnabled() {} + + void log(raw_ostream &OS) const override; + + std::error_code convertToErrorCode() const override; +}; + } // namespace exegesis } // namespace llvm diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp index 3f3288ceb1e4f..585ef0624ca82 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "PerfHelper.h" +#include "Error.h" #include "llvm/Config/config.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" @@ -117,6 +118,8 @@ void ConfiguredEvent::initRealEvent(const pid_t ProcessID, const int GroupFD) { const int CPU = -1; const uint32_t Flags = 0; perf_event_attr AttrCopy = *Event.attribute(); + AttrCopy.read_format = + PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; FileDescriptor = perf_event_open(&AttrCopy, ProcessID, CPU, GroupFD, Flags); if (FileDescriptor == -1) { errs() << "Unable to open event. ERRNO: " << strerror(errno) @@ -132,15 +135,20 @@ void ConfiguredEvent::initRealEvent(const pid_t ProcessID, const int GroupFD) { Expected<SmallVector<int64_t, 4>> ConfiguredEvent::readOrError(StringRef /*unused*/) const { - int64_t Count = 0; - ssize_t ReadSize = ::read(FileDescriptor, &Count, sizeof(Count)); + int64_t EventInfo[3] = {0, 0, 0}; + ssize_t ReadSize = ::read(FileDescriptor, &EventInfo, sizeof(EventInfo)); - if (ReadSize != sizeof(Count)) + if (ReadSize != sizeof(EventInfo)) return make_error<StringError>("Failed to read event counter", errc::io_error); + int64_t EventTimeEnabled = EventInfo[1]; + int64_t EventTimeRunning = EventInfo[2]; + if (EventTimeEnabled != EventTimeRunning) + return make_error<PerfCounterNotFullyEnabled>(); + SmallVector<int64_t, 4> Result; - Result.push_back(Count); + Result.push_back(EventInfo[0]); return Result; } From 668edb43a0e7d8fb13564d6966c250b93b2130db Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 31 Mar 2025 19:09:03 +0100 Subject: [PATCH 0129/1029] [AArch64] Update more costmodel tests with -cost-kind=all. NFC --- .../Analysis/CostModel/AArch64/bitreverse.ll | 66 +++--- llvm/test/Analysis/CostModel/AArch64/bswap.ll | 32 +-- .../AArch64/cost-scalable-vector-gep.ll | 18 +- llvm/test/Analysis/CostModel/AArch64/ctlz.ll | 74 +++--- llvm/test/Analysis/CostModel/AArch64/ctpop.ll | 82 +++---- llvm/test/Analysis/CostModel/AArch64/cttz.ll | 74 +++--- .../Analysis/CostModel/AArch64/cttz_elts.ll | 170 +++++++------- .../CostModel/AArch64/no-sve-no-neon.ll | 12 +- llvm/test/Analysis/CostModel/AArch64/store.ll | 8 +- .../Analysis/CostModel/AArch64/vec3-ops.ll | 220 +++++++++--------- .../CostModel/AArch64/vector-reverse.ll | 40 ++-- 11 files changed, 399 insertions(+), 397 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/bitreverse.ll b/llvm/test/Analysis/CostModel/AArch64/bitreverse.ll index 44e14bc78424d..b2184e8ed3fe6 100644 --- a/llvm/test/Analysis/CostModel/AArch64/bitreverse.ll +++ b/llvm/test/Analysis/CostModel/AArch64/bitreverse.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s +; RUN: opt < %s -mtriple=aarch64-linux-gnu -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s ; Verify the cost of scalar bitreverse instructions.
@@ -12,8 +12,8 @@ declare i64 @llvm.bitreverse.i64(i64) define i64 @var_bitreverse_i64(i64 %a) { ; CHECK-LABEL: 'var_bitreverse_i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse +; CHECK-NEXT: Cost Model: Found costs of 1 for: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %bitreverse ; %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ret i64 %bitreverse @@ -21,8 +21,8 @@ define i64 @var_bitreverse_i64(i64 %a) { define i32 @var_bitreverse_i32(i32 %a) { ; CHECK-LABEL: 'var_bitreverse_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse +; CHECK-NEXT: Cost Model: Found costs of 1 for: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %bitreverse ; %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ret i32 %bitreverse @@ -30,8 +30,8 @@ define i32 @var_bitreverse_i32(i32 %a) { define i16 @var_bitreverse_i16(i16 %a) { ; CHECK-LABEL: 'var_bitreverse_i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse +; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %bitreverse ; %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ret i16 %bitreverse @@ -39,8 +39,8 @@ define i16 @var_bitreverse_i16(i16 %a) { define i8 @var_bitreverse_i8(i8 %a) { ; CHECK-LABEL: 'var_bitreverse_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse +; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %bitreverse ; %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ret i8 %bitreverse @@ -65,8 +65,8 @@ declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) define <1 x i64> @var_bitreverse_v1i64(<1 x i64> %a) { ; CHECK-LABEL: 'var_bitreverse_v1i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %a) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <1 x i64> %bitreverse +; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %a) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <1 x i64> %bitreverse ; %bitreverse = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %a) ret <1 x i64> %bitreverse @@ -74,8 +74,8 @@ define <1 x i64> @var_bitreverse_v1i64(<1 x i64> %a) { define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) { ; CHECK-LABEL: 'var_bitreverse_v2i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) -; CHECK-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %bitreverse
 ;
   %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
   ret <2 x i64> %bitreverse
@@ -83,8 +83,8 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
 
 define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v4i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %bitreverse
 ;
   %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
   ret <4 x i64> %bitreverse
@@ -92,8 +92,8 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 
 define <2 x i32> @var_bitreverse_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v2i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %bitreverse
 ;
   %bitreverse = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %a)
   ret <2 x i32> %bitreverse
@@ -101,8 +101,8 @@ define <2 x i32> @var_bitreverse_v2i32(<2 x i32> %a) {
 
 define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v4i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %bitreverse
 ;
   %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
   ret <4 x i32> %bitreverse
@@ -110,8 +110,8 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
 
 define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v8i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %bitreverse
 ;
   %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
   ret <8 x i32> %bitreverse
@@ -119,8 +119,8 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 
 define <4 x i16> @var_bitreverse_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v4i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %bitreverse
 ;
   %bitreverse = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %a)
   ret <4 x i16> %bitreverse
@@ -128,8 +128,8 @@ define <4 x i16> @var_bitreverse_v4i16(<4 x i16> %a) {
 
 define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v8i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %bitreverse
 ;
   %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
   ret <8 x i16> %bitreverse
@@ -137,8 +137,8 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
 
 define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v16i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %bitreverse
 ;
   %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
   ret <16 x i16> %bitreverse
@@ -146,8 +146,8 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 
 define <8 x i8> @var_bitreverse_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v8i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bitreverse = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %bitreverse
 ;
   %bitreverse = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
   ret <8 x i8> %bitreverse
@@ -155,8 +155,8 @@ define <8 x i8> @var_bitreverse_v8i8(<8 x i8> %a) {
 
 define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v16i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %bitreverse
 ;
   %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
   ret <16 x i8> %bitreverse
@@ -164,8 +164,8 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
 
 define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: 'var_bitreverse_v32i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %bitreverse
 ;
   %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
   ret <32 x i8> %bitreverse
diff --git a/llvm/test/Analysis/CostModel/AArch64/bswap.ll b/llvm/test/Analysis/CostModel/AArch64/bswap.ll
index 6cb2975a12b82..8dad1f218577a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/bswap.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/bswap.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64--linux-gnu < %s | FileCheck %s
 
 ; Verify the cost of bswap instructions.
 
@@ -22,10 +22,10 @@ declare <4 x i48> @llvm.bswap.v4i48(<4 x i48>)
 
 define void @scalar() {
 ; CHECK-LABEL: 'scalar'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b16 = call i16 @llvm.bswap.i16(i16 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b32 = call i32 @llvm.bswap.i32(i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b64 = call i64 @llvm.bswap.i64(i64 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %b16 = call i16 @llvm.bswap.i16(i16 undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %b32 = call i32 @llvm.bswap.i32(i32 undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %b64 = call i64 @llvm.bswap.i64(i64 undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %b16 = call i16 @llvm.bswap.i16(i16 undef)
   %b32 = call i32 @llvm.bswap.i32(i32 undef)
@@ -35,17 +35,17 @@ define void @scalar() {
 
 define void @neon() {
 ; CHECK-LABEL: 'neon'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16 = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v16i16 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i32 = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8i32 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v2i64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %v4i64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %v3i32 = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> undef)
+; CHECK-NEXT: Cost Model: Found costs of 12 for: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i16 = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> undef)
   %v8i16 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/cost-scalable-vector-gep.ll b/llvm/test/Analysis/CostModel/AArch64/cost-scalable-vector-gep.ll
index 7c16ab41bf369..11af2c270c383 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cost-scalable-vector-gep.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cost-scalable-vector-gep.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ; This regression test is verifying that a GEP instruction performed on a
 ; scalable vector does not produce a 'assumption that TypeSize is not scalable'
@@ -9,11 +9,11 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @gep_scalable_types(ptr %ptr) {
 ; CHECK-LABEL: 'gep_scalable_types'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr , ptr %ptr, i32 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr [2 x ], ptr %ptr, i32 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr target("aarch64.svcount"), ptr %ptr, i32 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep4 = getelementptr [2 x target("aarch64.svcount")], ptr %ptr, i32 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep1 = getelementptr , ptr %ptr, i32 2
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep2 = getelementptr [2 x ], ptr %ptr, i32 2
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep3 = getelementptr target("aarch64.svcount"), ptr %ptr, i32 2
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %gep4 = getelementptr [2 x target("aarch64.svcount")], ptr %ptr, i32 2
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %gep1 = getelementptr , ptr %ptr, i32 2
   %gep2 = getelementptr [2 x ], ptr %ptr, i32 2
@@ -24,9 +24,9 @@ define void @gep_scalable_types(ptr %ptr) {
 
 define ptr @sext_gep(ptr %p, i32 %a) {
 ; CHECK-LABEL: 'sext_gep'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %b = sext i32 %a to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = getelementptr , ptr %p, i64 %b
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret ptr %r
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %b = sext i32 %a to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %r = getelementptr , ptr %p, i64 %b
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret ptr %r
 ;
   %b = sext i32 %a to i64
   %r = getelementptr , ptr %p, i64 %b
diff --git a/llvm/test/Analysis/CostModel/AArch64/ctlz.ll b/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
index 913a81b148c60..4ad359d4d2c68 100644
--- a/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/ctlz.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=aarch64 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64 -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s
 
 ; Verify the cost of scalar ctlz instructions.
 
@@ -8,8 +8,8 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define i64 @test_ctlz_i64(i64 %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %ctlz
 ;
   %ctlz = call i64 @llvm.ctlz.i64(i64 %a)
   ret i64 %ctlz
@@ -18,8 +18,8 @@ define i64 @test_ctlz_i64(i64 %a) {
 define i32 @test_ctlz_i32(i32 %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %ctlz
 ;
   %ctlz = call i32 @llvm.ctlz.i32(i32 %a)
   ret i32 %ctlz
@@ -28,8 +28,8 @@ define i32 @test_ctlz_i32(i32 %a) {
 define i16 @test_ctlz_i16(i16 %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %ctlz
 ;
   %ctlz = call i16 @llvm.ctlz.i16(i16 %a)
   ret i16 %ctlz
@@ -38,8 +38,8 @@ define i16 @test_ctlz_i16(i16 %a) {
 define i8 @test_ctlz_i8(i8 %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %ctlz
 ;
   %ctlz = call i8 @llvm.ctlz.i8(i8 %a)
   ret i8 %ctlz
@@ -55,8 +55,8 @@ declare i8 @llvm.ctlz.i8(i8)
 define <2 x i64> @test_ctlz_v2i64(<2 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v2i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctlz
 ;
   %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false)
   ret <2 x i64> %ctlz
@@ -65,8 +65,8 @@ define <2 x i64> @test_ctlz_v2i64(<2 x i64> %a) {
 define <2 x i32> @test_ctlz_v2i32(<2 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v2i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %ctlz
 ;
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
   ret <2 x i32> %ctlz
@@ -75,8 +75,8 @@ define <2 x i32> @test_ctlz_v2i32(<2 x i32> %a) {
 define <4 x i32> @test_ctlz_v4i32(<4 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v4i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %ctlz
 ;
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
   ret <4 x i32> %ctlz
@@ -85,8 +85,8 @@ define <4 x i32> @test_ctlz_v4i32(<4 x i32> %a) {
 define <2 x i16> @test_ctlz_v2i16(<2 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v2i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i16> %ctlz
 ;
   %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %a, i1 false)
   ret <2 x i16> %ctlz
@@ -95,8 +95,8 @@ define <2 x i16> @test_ctlz_v2i16(<2 x i16> %a) {
 define <4 x i16> @test_ctlz_v4i16(<4 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v4i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %ctlz
 ;
   %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
   ret <4 x i16> %ctlz
@@ -105,8 +105,8 @@ define <4 x i16> @test_ctlz_v4i16(<4 x i16> %a) {
 define <8 x i16> @test_ctlz_v8i16(<8 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v8i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %ctlz
 ;
   %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
   ret <8 x i16> %ctlz
@@ -115,8 +115,8 @@ define <8 x i16> @test_ctlz_v8i16(<8 x i16> %a) {
 define <2 x i8> @test_ctlz_v2i8(<2 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v2i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i8> %ctlz
 ;
   %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %a, i1 false)
   ret <2 x i8> %ctlz
@@ -125,8 +125,8 @@ define <2 x i8> @test_ctlz_v2i8(<2 x i8> %a) {
 define <4 x i8> @test_ctlz_v4i8(<4 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v4i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %ctlz
 ;
   %ctlz = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %a, i1 false)
   ret <4 x i8> %ctlz
@@ -135,8 +135,8 @@ define <4 x i8> @test_ctlz_v4i8(<4 x i8> %a) {
 define <8 x i8> @test_ctlz_v8i8(<8 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v8i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %ctlz
 ;
   %ctlz = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
   ret <8 x i8> %ctlz
@@ -145,8 +145,8 @@ define <8 x i8> @test_ctlz_v8i8(<8 x i8> %a) {
 define <16 x i8> @test_ctlz_v16i8(<16 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_ctlz_v16i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %ctlz
 ;
   %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
   ret <16 x i8> %ctlz
@@ -154,8 +154,8 @@ define <16 x i8> @test_ctlz_v16i8(<16 x i8> %a) {
 
 define <4 x i64> @test_ctlz_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: 'test_ctlz_v4i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 20 for: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %ctlz
 ;
   %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false)
   ret <4 x i64> %ctlz
@@ -163,8 +163,8 @@ define <4 x i64> @test_ctlz_v4i64(<4 x i64> %a) {
 
 define <8 x i32> @test_ctlz_v8i32(<8 x i32> %a) {
 ; CHECK-LABEL: 'test_ctlz_v8i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %ctlz
 ;
   %ctlz = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false)
   ret <8 x i32> %ctlz
@@ -172,8 +172,8 @@ define <8 x i32> @test_ctlz_v8i32(<8 x i32> %a) {
 
 define <16 x i16> @test_ctlz_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: 'test_ctlz_v16i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %ctlz
 ;
   %ctlz = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false)
   ret <16 x i16> %ctlz
@@ -181,8 +181,8 @@ define <16 x i16> @test_ctlz_v16i16(<16 x i16> %a) {
 
 define <32 x i8> @test_ctlz_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: 'test_ctlz_v32i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctlz
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %ctlz
 ;
   %ctlz = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false)
   ret <32 x i8> %ctlz
diff --git a/llvm/test/Analysis/CostModel/AArch64/ctpop.ll b/llvm/test/Analysis/CostModel/AArch64/ctpop.ll
index ba1033076e372..013432991f5ae 100644
--- a/llvm/test/Analysis/CostModel/AArch64/ctpop.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/ctpop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=aarch64 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64 -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s
 
 ; Verify the cost of scalar ctpop instructions.
 
@@ -7,8 +7,8 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define i64 @test_ctpop_i64(i64 %a) {
 ; CHECK-LABEL: 'test_ctpop_i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %ctpop
 ;
   %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
   ret i64 %ctpop
@@ -16,8 +16,8 @@ define i64 @test_ctpop_i64(i64 %a) {
 
 define i32 @test_ctpop_i32(i32 %a) {
 ; CHECK-LABEL: 'test_ctpop_i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 5 for: %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %ctpop
 ;
   %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
   ret i32 %ctpop
@@ -25,8 +25,8 @@ define i32 @test_ctpop_i32(i32 %a) {
 
 define i16 @test_ctpop_i16(i16 %a) {
 ; CHECK-LABEL: 'test_ctpop_i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 5 for: %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %ctpop
 ;
   %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
   ret i16 %ctpop
@@ -34,8 +34,8 @@ define i16 @test_ctpop_i16(i16 %a) {
 
 define i8 @test_ctpop_i8(i8 %a) {
 ; CHECK-LABEL: 'test_ctpop_i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 5 for: %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %ctpop
 ;
   %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
   ret i8 %ctpop
@@ -50,8 +50,8 @@ declare i8 @llvm.ctpop.i8(i8)
 
 define <2 x i64> @test_ctpop_v2i64(<2 x i64> %a) {
 ; CHECK-LABEL: 'test_ctpop_v2i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop
 ;
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   ret <2 x i64> %ctpop
@@ -59,8 +59,8 @@ define <2 x i64> @test_ctpop_v2i64(<2 x i64> %a) {
 
 define <2 x i32> @test_ctpop_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: 'test_ctpop_v2i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %ctpop
 ;
   %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a)
   ret <2 x i32> %ctpop
@@ -68,8 +68,8 @@ define <2 x i32> @test_ctpop_v2i32(<2 x i32> %a) {
 
 define <4 x i32> @test_ctpop_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: 'test_ctpop_v4i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %ctpop
 ;
   %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
   ret <4 x i32> %ctpop
@@ -77,8 +77,8 @@ define <4 x i32> @test_ctpop_v4i32(<4 x i32> %a) {
 
 define <2 x i16> @test_ctpop_v2i16(<2 x i16> %a) {
 ; CHECK-LABEL: 'test_ctpop_v2i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i16> %ctpop
 ;
   %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %a)
   ret <2 x i16> %ctpop
@@ -86,8 +86,8 @@ define <2 x i16> @test_ctpop_v2i16(<2 x i16> %a) {
 
 define <4 x i16> @test_ctpop_v4i16(<4 x i16> %a) {
 ; CHECK-LABEL: 'test_ctpop_v4i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %ctpop
 ;
   %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %a)
   ret <4 x i16> %ctpop
@@ -95,8 +95,8 @@ define <4 x i16> @test_ctpop_v4i16(<4 x i16> %a) {
 
 define <8 x i16> @test_ctpop_v8i16(<8 x i16> %a) {
 ; CHECK-LABEL: 'test_ctpop_v8i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %ctpop
 ;
   %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
   ret <8 x i16> %ctpop
@@ -104,8 +104,8 @@ define <8 x i16> @test_ctpop_v8i16(<8 x i16> %a) {
 
 define <2 x i8> @test_ctpop_v2i8(<2 x i8> %a) {
 ; CHECK-LABEL: 'test_ctpop_v2i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i8> %ctpop
 ;
   %ctpop = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %a)
   ret <2 x i8> %ctpop
@@ -113,8 +113,8 @@ define <2 x i8> @test_ctpop_v2i8(<2 x i8> %a) {
 
 define <4 x i8> @test_ctpop_v4i8(<4 x i8> %a) {
 ; CHECK-LABEL: 'test_ctpop_v4i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %ctpop
 ;
   %ctpop = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %a)
   ret <4 x i8> %ctpop
@@ -122,8 +122,8 @@ define <4 x i8> @test_ctpop_v4i8(<4 x i8> %a) {
 
 define <8 x i8> @test_ctpop_v8i8(<8 x i8> %a) {
 ; CHECK-LABEL: 'test_ctpop_v8i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctpop = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %ctpop
 ;
   %ctpop = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
   ret <8 x i8> %ctpop
@@ -131,8 +131,8 @@ define <8 x i8> @test_ctpop_v8i8(<8 x i8> %a) {
 
 define <16 x i8> @test_ctpop_v16i8(<16 x i8> %a) {
 ; CHECK-LABEL: 'test_ctpop_v16i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %ctpop
 ;
   %ctpop = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
   ret <16 x i8> %ctpop
@@ -140,8 +140,8 @@ define <16 x i8> @test_ctpop_v16i8(<16 x i8> %a) {
 
 define <4 x i64> @test_ctpop_v4i64(<4 x i64> %a) {
 ; CHECK-LABEL: 'test_ctpop_v4i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 8 for: %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %ctpop
 ;
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
   ret <4 x i64> %ctpop
@@ -149,8 +149,8 @@ define <4 x i64> @test_ctpop_v4i64(<4 x i64> %a) {
 
 define <8 x i32> @test_ctpop_v8i32(<8 x i32> %a) {
 ; CHECK-LABEL: 'test_ctpop_v8i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 6 for: %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %ctpop
 ;
   %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
   ret <8 x i32> %ctpop
@@ -158,8 +158,8 @@ define <8 x i32> @test_ctpop_v8i32(<8 x i32> %a) {
 
 define <16 x i16> @test_ctpop_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: 'test_ctpop_v16i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %ctpop
 ;
   %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
   ret <16 x i16> %ctpop
@@ -167,8 +167,8 @@ define <16 x i16> @test_ctpop_v16i16(<16 x i16> %a) {
 
 define <32 x i8> @test_ctpop_v32i8(<32 x i8> %a) {
 ; CHECK-LABEL: 'test_ctpop_v32i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %ctpop
 ;
   %ctpop = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
   ret <32 x i8> %ctpop
@@ -176,8 +176,8 @@ define <32 x i8> @test_ctpop_v32i8(<32 x i8> %a) {
 
 define i64 @test_ctpop_noneon_i64(i64 %a) "target-features"="-fp-armv8,-neon" {
 ; CHECK-LABEL: 'test_ctpop_noneon_i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 12 for: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %ctpop
 ;
   %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
   ret i64 %ctpop
@@ -185,8 +185,8 @@ define i64 @test_ctpop_noneon_i64(i64 %a) "target-features"="-fp-armv8,-neon" {
 
 define <2 x i64> @test_ctpop_noneon_v2i64(<2 x i64> %a) "target-features"="-fp-armv8,-neon" {
 ; CHECK-LABEL: 'test_ctpop_noneon_v2i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctpop
+; CHECK-NEXT: Cost Model: Found costs of 24 for: %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %ctpop
 ;
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
   ret <2 x i64> %ctpop
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz.ll b/llvm/test/Analysis/CostModel/AArch64/cttz.ll
index 5947e251d8a06..021eb73234f31 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=aarch64 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64 -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s
 
 ; Verify the cost of scalar ctlz instructions.
 
@@ -8,8 +8,8 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define i64 @test_cttz_i64(i64 %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %cttz
 ;
   %cttz = call i64 @llvm.cttz.i64(i64 %a)
   ret i64 %cttz
@@ -18,8 +18,8 @@ define i64 @test_cttz_i64(i64 %a) {
 define i32 @test_cttz_i32(i32 %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %cttz
 ;
   %cttz = call i32 @llvm.cttz.i32(i32 %a)
   ret i32 %cttz
@@ -28,8 +28,8 @@ define i32 @test_cttz_i32(i32 %a) {
 define i16 @test_cttz_i16(i16 %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %cttz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i16 %cttz
 ;
   %cttz = call i16 @llvm.cttz.i16(i16 %a)
   ret i16 %cttz
@@ -38,8 +38,8 @@ define i16 @test_cttz_i16(i16 %a) {
 define i8 @test_cttz_i8(i8 %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %cttz
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i8 %cttz
 ;
   %cttz = call i8 @llvm.cttz.i8(i8 %a)
   ret i8 %cttz
@@ -55,8 +55,8 @@ declare i8 @llvm.cttz.i8(i8)
 define <2 x i64> @test_cttz_v2i64(<2 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %cttz
 ;
   %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true)
   ret <2 x i64> %cttz
@@ -65,8 +65,8 @@ define <2 x i64> @test_cttz_v2i64(<2 x i64> %a) {
 define <2 x i32> @test_cttz_v2i32(<2 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i32> %cttz
 ;
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true)
   ret <2 x i32> %cttz
@@ -75,8 +75,8 @@ define <2 x i32> @test_cttz_v2i32(<2 x i32> %a) {
 define <4 x i32> @test_cttz_v4i32(<4 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 20 for: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %cttz
 ;
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true)
   ret <4 x i32> %cttz
@@ -85,8 +85,8 @@ define <4 x i32> @test_cttz_v4i32(<4 x i32> %a) {
 define <2 x i16> @test_cttz_v2i16(<2 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i16> %cttz
 ;
   %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true)
   ret <2 x i16> %cttz
@@ -95,8 +95,8 @@ define <2 x i16> @test_cttz_v2i16(<2 x i16> %a) {
 define <4 x i16> @test_cttz_v4i16(<4 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 20 for: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i16> %cttz
 ;
   %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true)
   ret <4 x i16> %cttz
@@ -105,8 +105,8 @@ define <4 x i16> @test_cttz_v4i16(<4 x i16> %a) {
 define <8 x i16> @test_cttz_v8i16(<8 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 40 for: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %cttz
 ;
   %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true)
   ret <8 x i16> %cttz
@@ -115,8 +115,8 @@ define <8 x i16> @test_cttz_v8i16(<8 x i16> %a) {
 define <2 x i8> @test_cttz_v2i8(<2 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v2i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 10 for: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i8> %cttz
 ;
   %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true)
   ret <2 x i8> %cttz
@@ -125,8 +125,8 @@ define <2 x i8> @test_cttz_v2i8(<2 x i8> %a) {
 define <4 x i8> @test_cttz_v4i8(<4 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 20 for: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i8> %cttz
 ;
   %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true)
   ret <4 x i8> %cttz
@@ -135,8 +135,8 @@ define <4 x i8> @test_cttz_v4i8(<4 x i8> %a) {
 define <8 x i8> @test_cttz_v8i8(<8 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 40 for: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %cttz
 ;
   %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true)
   ret <8 x i8> %cttz
@@ -145,8 +145,8 @@ define <8 x i8> @test_cttz_v8i8(<8 x i8> %a) {
 define <16 x i8> @test_cttz_v16i8(<16 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v16i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 80 for: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %cttz
 ;
   %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true)
   ret <16 x i8> %cttz
@@ -155,8 +155,8 @@ define <16 x i8> @test_cttz_v16i8(<16 x i8> %a) {
 define <4 x i64> @test_cttz_v4i64(<4 x i64> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v4i64'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 20 for: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i64> %cttz
 ;
   %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true)
   ret <4 x i64> %cttz
@@ -165,8 +165,8 @@ define <4 x i64> @test_cttz_v4i64(<4 x i64> %a) {
 define <8 x i32> @test_cttz_v8i32(<8 x i32> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v8i32'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 40 for: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i32> %cttz
 ;
   %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true)
   ret <8 x i32> %cttz
@@ -175,8 +175,8 @@ define <8 x i32> @test_cttz_v8i32(<8 x i32> %a) {
 define <16 x i16> @test_cttz_v16i16(<16 x i16> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v16i16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 80 for: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i16> %cttz
 ;
   %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true)
   ret <16 x i16> %cttz
@@ -185,8 +185,8 @@ define <16 x i16> @test_cttz_v16i16(<16 x i16> %a) {
 define <32 x i8> @test_cttz_v32i8(<32 x i8> %a) {
 ;
 ; CHECK-LABEL: 'test_cttz_v32i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz
+; CHECK-NEXT: Cost Model: Found costs of 160 for: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <32 x i8> %cttz
 ;
   %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true)
   ret <32 x i8> %cttz
diff --git a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
index 5a2d08a17fca4..15d09e00a4ee5 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cttz_elts.ll
@@ -1,50 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
 
 define void @foo_no_vscale_range() {
 ; CHECK-LABEL: 'foo_no_vscale_range'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of Invalid for: %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:96 CodeSize:37 Lat:37 SizeLat:37 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:21 Lat:21 SizeLat:21 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:9 Lat:9 SizeLat:9 for: %res.i64.v32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:9 Lat:9 SizeLat:9 for: %res.i32.v32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:96 CodeSize:37 Lat:37 SizeLat:37 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:21 Lat:21 SizeLat:21 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.v16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:9 Lat:9 SizeLat:9 for: %res.i64.v32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.v16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:9 Lat:9 SizeLat:9 for: %res.i32.v32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.v32i1(<32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %res.i64.nxv1i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv1i1(<vscale x 1 x i1> undef, i1 true)
   %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
@@ -97,27 +97,27 @@ define void @foo_no_vscale_range() {
 
 define void @foo_vscale_range_1_16() vscale_range(1,16) {
 ; CHECK-LABEL: 'foo_vscale_range_1_16'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 false)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:13 Lat:13 SizeLat:13 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:13 Lat:13 SizeLat:13 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> undef, i1 true)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:13 Lat:13 SizeLat:13 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1(<vscale x 32 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1(<vscale x 2 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1(<vscale x 4 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> undef, i1 false)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32
@llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:13 Lat:13 SizeLat:13 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) @@ -146,27 +146,27 @@ define void @foo_vscale_range_1_16() vscale_range(1,16) { define void @foo_vscale_range_1_16384() vscale_range(1,16384) { ; CHECK-LABEL: 'foo_vscale_range_1_16384' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv8i1.nzip = 
call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:21 Lat:21 SizeLat:21 for: %res.i64.nxv32i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:21 Lat:21 SizeLat:21 for: %res.i32.nxv32i1.zip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 true) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv2i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv4i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv8i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i64.nxv16i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:21 Lat:21 SizeLat:21 for: %res.i64.nxv32i1.nzip = call i64 @llvm.experimental.cttz.elts.i64.nxv32i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv2i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv2i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv4i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv4i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv8i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %res.i32.nxv16i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of RThru:48 CodeSize:21 Lat:21 SizeLat:21 for: %res.i32.nxv32i1.nzip = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( undef, i1 false) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %res.i64.nxv2i1.zip = call i64 
@llvm.experimental.cttz.elts.i64.nxv2i1( undef, i1 true) %res.i64.nxv4i1.zip = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1( undef, i1 true) diff --git a/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll b/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll index 3100f4f8a3f66..20b83bec6cf49 100644 --- a/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll +++ b/llvm/test/Analysis/CostModel/AArch64/no-sve-no-neon.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -mattr=-neon < %s -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK-NONEON -; RUN: opt -mattr=+sve,-neon < %s -passes="print" 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK-WITHSVE +; RUN: opt -mattr=-neon < %s -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK-NONEON +; RUN: opt -mattr=+sve,-neon < %s -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK-WITHSVE target triple = "aarch64-unknown-linux-gnu" target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @uitofp() { ; CHECK-NONEON-LABEL: 'uitofp' -; CHECK-NONEON-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %conv = uitofp <16 x i64> undef to <16 x float> -; CHECK-NONEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NONEON-NEXT: Cost Model: Found costs of RThru:48 CodeSize:1 Lat:1 SizeLat:1 for: %conv = uitofp <16 x i64> undef to <16 x float> +; CHECK-NONEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-WITHSVE-LABEL: 'uitofp' -; CHECK-WITHSVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %conv = uitofp <16 x i64> undef to <16 x float> -; CHECK-WITHSVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-WITHSVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %conv = uitofp <16 x i64> undef to <16 x float> +; CHECK-WITHSVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %conv = uitofp <16 x i64> undef to <16 x float> ret void diff --git a/llvm/test/Analysis/CostModel/AArch64/store.ll b/llvm/test/Analysis/CostModel/AArch64/store.ll index aae4afd331e9d..4452a34b37072 100644 --- a/llvm/test/Analysis/CostModel/AArch64/store.ll +++ b/llvm/test/Analysis/CostModel/AArch64/store.ll @@ -3,10 +3,12 @@ ; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64-unknown -mattr=slow-misaligned-128store | FileCheck %s --check-prefix=SLOW_MISALIGNED_128_STORE target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-v256:32:256-a0:0:32-n32-S32" + +; If FeatureSlowMisaligned128Store is set, we penalize 128-bit stores. +; The unlegalized 256-bit stores are further penalized when legalized down +; to 128-bit stores. + define void @getMemoryOpCost() { - ; If FeatureSlowMisaligned128Store is set, we penalize 128-bit stores. - ; The unlegalized 256-bit stores are further penalized when legalized down - ; to 128-bit stores. 
; CHECK-LABEL: 'getMemoryOpCost' ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <4 x i64> undef, ptr undef, align 4 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <8 x i32> undef, ptr undef, align 4 diff --git a/llvm/test/Analysis/CostModel/AArch64/vec3-ops.ll b/llvm/test/Analysis/CostModel/AArch64/vec3-ops.ll index cf80bd677228a..6bcf3c705ef6f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/vec3-ops.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vec3-ops.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=arm64-apple-macosx < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=arm64-apple-macosx < %s | FileCheck %s define void @vec3_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x i32>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i32> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp uge <3 x i32> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i32> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sel = select <3 x i1> %cmp, <3 x i32> %add, <3 x i32> %sub -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x i32> %sel, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x i32>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i32> %l, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %cmp = icmp uge <3 x i32> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i32> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sel = select <3 x i1> %cmp, <3 x i32> %add, <3 x i32> %sub +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x i32> %sel, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i32>, ptr %src, align 1 %add = add <3 x i32> %l, %b @@ -22,13 +22,13 @@ define void @vec3_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { define void @vec3_i32_default_alignment(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i32_default_alignment' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l = load <3 x i32>, ptr %src, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i32> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp uge <3 x i32> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i32> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sel = select <3 x i1> %cmp, <3 x i32> %add, <3 x i32> %sub -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <3 x i32> %sel, ptr %dst, align 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x 
i32>, ptr %src, align 16 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i32> %l, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %cmp = icmp uge <3 x i32> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i32> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sel = select <3 x i1> %cmp, <3 x i32> %add, <3 x i32> %sub +; CHECK-NEXT: Cost Model: Found costs of 1 for: store <3 x i32> %sel, ptr %dst, align 16 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i32>, ptr %src %add = add <3 x i32> %l, %b @@ -41,13 +41,13 @@ define void @vec3_i32_default_alignment(<3 x i32> %a, <3 x i32> %b, ptr %src, pt define void @vec3_i16(<3 x i16> %a, <3 x i16> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x i16>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i16> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp uge <3 x i16> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i16> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sel = select <3 x i1> %cmp, <3 x i16> %add, <3 x i16> %sub -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x i16> %sel, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x i16>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i16> %l, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %cmp = icmp uge <3 x i16> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i16> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sel = select <3 x i1> %cmp, <3 x i16> %add, <3 x i16> %sub +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x i16> %sel, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i16>, ptr %src, align 1 %add = add <3 x i16> %l, %b @@ -60,10 +60,10 @@ define void @vec3_i16(<3 x i16> %a, <3 x i16> %b, ptr %src, ptr %dst) { define void @vec7_i16(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec7_i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <7 x i16>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <7 x i16> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <7 x i16> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <7 x i16>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <7 x i16> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: store <7 x i16> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <7 x i16>, ptr %src, align 1 %add = add <7 x i16> %l, %l @@ -73,10 +73,10 @@ define void @vec7_i16(ptr %src, ptr %dst) { define void @vec6_i16(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec6_i16' -; CHECK-NEXT: Cost Model: Found an estimated cost 
of 2 for instruction: %l = load <6 x i16>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <6 x i16> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <6 x i16> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <6 x i16>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <6 x i16> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <6 x i16> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <6 x i16>, ptr %src, align 1 %add = add <6 x i16> %l, %l @@ -86,10 +86,10 @@ define void @vec6_i16(ptr %src, ptr %dst) { define void @vec5_i16(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec5_i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <5 x i16>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <5 x i16> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <5 x i16> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <5 x i16>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <5 x i16> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <5 x i16> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <5 x i16>, ptr %src, align 1 %add = add <5 x i16> %l, %l @@ -99,13 +99,13 @@ define void @vec5_i16(ptr %src, ptr %dst) { define void @vec3_i16_zext_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i16_zext_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x i16>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l.ext = zext <3 x i16> %l to <3 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i32> %l.ext, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i32> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub.trunc = trunc <3 x i32> %sub to <3 x i16> -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x i16> %sub.trunc, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x i16>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %l.ext = zext <3 x i16> %l to <3 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i32> %l.ext, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i32> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub.trunc = trunc <3 x i32> %sub to <3 x i16> +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x i16> %sub.trunc, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i16>, ptr %src, align 1 %l.ext = zext <3 x i16> %l to <3 x 
i32> @@ -118,13 +118,13 @@ define void @vec3_i16_zext_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { define void @vec3_i8(<3 x i8> %a, <3 x i8> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %l = load <3 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i8> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp uge <3 x i8> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i8> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sel = select <3 x i1> %cmp, <3 x i8> %add, <3 x i8> %sub -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <3 x i8> %sel, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i8> %l, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %cmp = icmp uge <3 x i8> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i8> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sel = select <3 x i1> %cmp, <3 x i8> %add, <3 x i8> %sub +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x i8> %sel, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i8>, ptr %src, align 1 %add = add <3 x i8> %l, %b @@ -137,13 +137,13 @@ define void @vec3_i8(<3 x i8> %a, <3 x i8> %b, ptr %src, ptr %dst) { define void @vec3_i8_zext_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i8_zext_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %l = load <3 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l.ext = zext <3 x i8> %l to <3 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i32> %l.ext, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i32> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub.trunc = trunc <3 x i32> %sub to <3 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <3 x i8> %sub.trunc, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %l.ext = zext <3 x i8> %l to <3 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i32> %l.ext, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i32> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub.trunc = trunc <3 x i32> %sub to <3 x i8> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x i8> %sub.trunc, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i8>, ptr %src, align 1 %l.ext = zext <3 x i8> %l to <3 x i32> @@ -156,13 +156,13 @@ define void @vec3_i8_zext_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { define void @vec3_i8_sext_i32(<3 x i32> %a, <3 
x i32> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i8_sext_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %l = load <3 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %l.ext = sext <3 x i8> %l to <3 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i32> %l.ext, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i32> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub.trunc = trunc <3 x i32> %sub to <3 x i8> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <3 x i8> %sub.trunc, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %l.ext = sext <3 x i8> %l to <3 x i32> +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i32> %l.ext, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i32> %add, %a +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub.trunc = trunc <3 x i32> %sub to <3 x i8> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x i8> %sub.trunc, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i8>, ptr %src, align 1 %l.ext = sext <3 x i8> %l to <3 x i32> @@ -175,11 +175,11 @@ define void @vec3_i8_sext_i32(<3 x i32> %a, <3 x i32> %b, ptr %src, ptr %dst) { define void @vec3_i30(<3 x i30> %a, <3 x i30> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_i30' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %l = load <3 x i30>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <3 x i30> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <3 x i30> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <3 x i30> %sub, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x i30>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <3 x i30> %l, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <3 x i30> %add, %a +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x i30> %sub, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x i30>, ptr %src, align 1 %add = add <3 x i30> %l, %b @@ -190,11 +190,11 @@ define void @vec3_i30(<3 x i30> %a, <3 x i30> %b, ptr %src, ptr %dst) { define void @vec3_float(<3 x float> %a, <3 x float> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_float' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x float>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = fadd <3 x float> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = fsub <3 x float> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x float> %sub, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x float>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %add = fadd <3 x float> %l, %b +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %sub = fsub <3 x float> %add, %a +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x float> %sub, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x float>, ptr %src, align 1 %add = fadd <3 x float> %l, %b @@ -205,11 +205,11 @@ define void @vec3_float(<3 x float> %a, <3 x float> %b, ptr %src, ptr %dst) { define void @vec3_half(<3 x half> %a, <3 x half> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec3_half' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <3 x half>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %add = fadd <3 x half> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sub = fsub <3 x half> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <3 x half> %sub, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <3 x half>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %add = fadd <3 x half> %l, %b +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %sub = fsub <3 x half> %add, %a +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <3 x half> %sub, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <3 x half>, ptr %src, align 1 %add = fadd <3 x half> %l, %b @@ -220,10 +220,10 @@ define void @vec3_half(<3 x half> %a, <3 x half> %b, ptr %src, ptr %dst) { define void @vec15_i8(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec15_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %l = load <15 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <15 x i8> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <15 x i8> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <15 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <15 x i8> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: store <15 x i8> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <15 x i8>, ptr %src, align 1 %add = add <15 x i8> %l, %l @@ -233,10 +233,10 @@ define void @vec15_i8(ptr %src, ptr %dst) { define void @vec14_i8(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec14_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <14 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <14 x i8> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <14 x i8> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <14 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <14 x i8> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: store <14 x i8> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <14 x i8>, ptr %src, align 1 %add = add <14 x i8> %l, %l @@ -246,10 +246,10 @@ define void @vec14_i8(ptr %src, ptr %dst) { define void @vec13_i8(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec13_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <13 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <13 x i8> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <13 x i8> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <13 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <13 x i8> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: store <13 x i8> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <13 x i8>, ptr %src, align 1 %add = add <13 x i8> %l, %l @@ -259,10 +259,10 @@ define void @vec13_i8(ptr %src, ptr %dst) { define void @vec12_i8(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec12_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <12 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <12 x i8> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <12 x i8> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <12 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <12 x i8> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <12 x i8> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <12 x i8>, ptr %src, align 1 %add = add <12 x i8> %l, %l @@ -272,10 +272,10 @@ define void @vec12_i8(ptr %src, ptr %dst) { define void @vec11_i8(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec11_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <11 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <11 x i8> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <11 x i8> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <11 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <11 x i8> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: store <11 x i8> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = 
load <11 x i8>, ptr %src, align 1 %add = add <11 x i8> %l, %l @@ -285,11 +285,11 @@ define void @vec11_i8(ptr %src, ptr %dst) { define void @vec7_i8(<7 x i8> %a, <7 x i8> %b, ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec7_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %l = load <7 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <7 x i8> %l, %b -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sub = sub <7 x i8> %add, %a -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: store <7 x i8> %sub, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <7 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <7 x i8> %l, %b +; CHECK-NEXT: Cost Model: Found costs of 1 for: %sub = sub <7 x i8> %add, %a +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: store <7 x i8> %sub, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <7 x i8>, ptr %src, align 1 %add = add <7 x i8> %l, %b @@ -300,10 +300,10 @@ define void @vec7_i8(<7 x i8> %a, <7 x i8> %b, ptr %src, ptr %dst) { define void @vec6_i8(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec6_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <6 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <6 x i8> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <6 x i8> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <6 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <6 x i8> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <6 x i8> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <6 x i8>, ptr %src, align 1 %add = add <6 x i8> %l, %l @@ -313,10 +313,10 @@ define void @vec6_i8(ptr %src, ptr %dst) { define void @vec5_i8(ptr %src, ptr %dst) { ; CHECK-LABEL: 'vec5_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %l = load <5 x i8>, ptr %src, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add = add <5 x i8> %l, %l -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <5 x i8> %add, ptr %dst, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %l = load <5 x i8>, ptr %src, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add = add <5 x i8> %l, %l +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: store <5 x i8> %add, ptr %dst, align 1 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %l = load <5 x i8>, ptr %src, align 1 %add = add <5 x i8> %l, %l diff --git a/llvm/test/Analysis/CostModel/AArch64/vector-reverse.ll b/llvm/test/Analysis/CostModel/AArch64/vector-reverse.ll index c0ba310c257a4..3f02d53163603 100644 --- 
a/llvm/test/Analysis/CostModel/AArch64/vector-reverse.ll +++ b/llvm/test/Analysis/CostModel/AArch64/vector-reverse.ll @@ -1,29 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; Check getIntrinsicInstrCost in BasicTTIImpl.h for vector.reverse +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +; Check getIntrinsicInstrCost in BasicTTIImpl.h for vector.reverse target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -define void @vector_reverse() #0{ +define void @vector_reverse() #0 { ; CHECK-LABEL: 'vector_reverse' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 2 for: %1 = call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %2 = call <32 x i8> @llvm.vector.reverse.v32i8(<32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 2 for: %3 = call <8 x i16> @llvm.vector.reverse.v8i16(<8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %4 = call <16 x i16> @llvm.vector.reverse.v16i16(<16 x i16> undef) +; 
CHECK-NEXT: Cost Model: Found costs of 2 for: %5 = call <4 x i32> @llvm.vector.reverse.v4i32(<4 x i32> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %6 = call <8 x i32> @llvm.vector.reverse.v8i32(<8 x i32> undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %7 = call <2 x i64> @llvm.vector.reverse.v2i64(<2 x i64> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %8 = call <4 x i64> @llvm.vector.reverse.v4i64(<4 x i64> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %9 = call <8 x half> @llvm.vector.reverse.v8f16(<8 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %10 = call <16 x half> @llvm.vector.reverse.v16f16(<16 x half> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %11 = call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %12 = call <8 x float> @llvm.vector.reverse.v8f32(<8 x float> undef)
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %13 = call <2 x double> @llvm.vector.reverse.v2f64(<2 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %14 = call <4 x double> @llvm.vector.reverse.v4f64(<4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of 2 for: %15 = call <8 x bfloat> @llvm.vector.reverse.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %16 = call <16 x bfloat> @llvm.vector.reverse.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   call <16 x i8> @llvm.vector.reverse.v16i8(<16 x i8> undef)

From 5877bef3854d5dc5eaeb308636bcc31b12240563 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Mon, 31 Mar 2025 19:19:14 +0100
Subject: [PATCH 0130/1029] [LAA] Remove unneeded findValue calls (NFC).

Use findLeader directly instead of going through findValue and
getLeaderValue. This is simpler and more efficient.

---
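For reference, a minimal standalone sketch of the EquivalenceClasses calls
involved; the element type and values here are illustrative, but the members
used are exactly the ones this patch touches, and findLeader(V) performs the
same walk as member_begin(findValue(getLeaderValue(V))) in a single lookup:

  #include "llvm/ADT/EquivalenceClasses.h"
  #include <cstdio>

  int main() {
    llvm::EquivalenceClasses<int> EC;
    EC.unionSets(1, 2);
    EC.unionSets(2, 3); // {1, 2, 3} now form one equivalence class.

    // Old pattern: resolve the leader value, then re-find its iterator.
    auto LeaderI = EC.findValue(EC.getLeaderValue(2));
    for (auto MI = EC.member_begin(LeaderI), ME = EC.member_end(); MI != ME;
         ++MI)
      std::printf("%d\n", *MI);

    // New pattern: one call reaches the same members of 2's class.
    for (auto MI = EC.findLeader(2), ME = EC.member_end(); MI != ME; ++MI)
      std::printf("%d\n", *MI);
    return 0;
  }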
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index dd7b796fd0fdf..e7d6984caeba3 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -520,14 +520,13 @@ void RuntimePointerChecking::groupChecks(
                                            Pointers[I].IsWritePtr);

     SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
-    auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));

     // Because DepCands is constructed by visiting accesses in the order in
     // which they appear in alias sets (which is deterministic) and the
     // iteration order within an equivalence class member is only dependent on
     // the order in which unions and insertions are performed on the
     // equivalence class, the iteration order is deterministic.
-    for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
+    for (auto MI = DepCands.findLeader(Access), ME = DepCands.member_end();
          MI != ME; ++MI) {
       auto PointerI = PositionMap.find(MI->getPointer());
       assert(PointerI != PositionMap.end() &&
@@ -2264,13 +2263,9 @@ bool MemoryDepChecker::areDepsSafe(const DepCandidates &AccessSets,
     if (Visited.count(CurAccess))
       continue;

-    // Get the relevant memory access set.
-    EquivalenceClasses<MemAccessInfo>::iterator I =
-        AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
-
     // Check accesses within this set.
     EquivalenceClasses<MemAccessInfo>::member_iterator AI =
-        AccessSets.member_begin(I);
+        AccessSets.findLeader(CurAccess);
     EquivalenceClasses<MemAccessInfo>::member_iterator AE =
         AccessSets.member_end();

From cc2b4326142e6df6755472edaf264a0af4fe599a Mon Sep 17 00:00:00 2001
From: Farzon Lotfi
Date: Mon, 31 Mar 2025 14:21:22 -0400
Subject: [PATCH 0131/1029] [Clang][Cmake] fix libtool duplicate member name
 warnings (#133619)

Fixes #133199.

PR #132252 created a second file that shared a `.cpp` base name in
`clang/lib/CodeGen/CMakeLists.txt`. For example, there were two
`AMDGPU.cpp` files: one in `TargetBuiltins` and the other in `Targets`.
Even though these were in different directories, `libtool` warns that it
might not distinguish them because they share the same base name.

There are two potential fixes. The easy fix is to rename one of them and
keep one cmake file. That solution, though, doesn't future-proof this
problem in the event of a third `.cpp`, and it seems teams want to just
use the target name
(https://github.com/llvm/llvm-project/pull/132252#issuecomment-2758178483).

The alternative fix is to separate the cmake files into their own
subdirectories. I chose to create static libraries. It might have been
possible to build an OBJECT library, but I only saw examples of this in
compiler-rt and test directories, so I assumed there was a reason it
wasn't used.
---
 clang/lib/CodeGen/CMakeLists.txt              | 49 +++++--------
 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp   |  2 +-
 .../lib/CodeGen/TargetBuiltins/CMakeLists.txt | 19 +++++++
 clang/lib/CodeGen/Targets/CMakeLists.txt      | 35 +++++++++++++
 4 files changed, 67 insertions(+), 38 deletions(-)
 create mode 100644 clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt
 create mode 100644 clang/lib/CodeGen/Targets/CMakeLists.txt

diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index ebe2fbd7db295..cdf9f909a3675 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -116,45 +116,8 @@ add_clang_library(clangCodeGen
   PatternInit.cpp
   SanitizerMetadata.cpp
   SwiftCallingConv.cpp
-  TargetBuiltins/ARM.cpp
-  TargetBuiltins/AMDGPU.cpp
-  TargetBuiltins/Hexagon.cpp
-  TargetBuiltins/NVPTX.cpp
-  TargetBuiltins/PPC.cpp
-  TargetBuiltins/RISCV.cpp
-  TargetBuiltins/SPIR.cpp
-  TargetBuiltins/SystemZ.cpp
-  TargetBuiltins/WebAssembly.cpp
-  TargetBuiltins/X86.cpp
   TargetInfo.cpp
-  Targets/AArch64.cpp
-  Targets/AMDGPU.cpp
-  Targets/ARC.cpp
-  Targets/ARM.cpp
-  Targets/AVR.cpp
-  Targets/BPF.cpp
-  Targets/CSKY.cpp
-  Targets/DirectX.cpp
-  Targets/Hexagon.cpp
-  Targets/Lanai.cpp
-  Targets/LoongArch.cpp
-  Targets/M68k.cpp
-  Targets/MSP430.cpp
-  Targets/Mips.cpp
-  Targets/NVPTX.cpp
-  Targets/PNaCl.cpp
-  Targets/PPC.cpp
-  Targets/RISCV.cpp
-  Targets/SPIR.cpp
-  Targets/Sparc.cpp
-  Targets/SystemZ.cpp
-  Targets/TCE.cpp
-  Targets/VE.cpp
-  Targets/WebAssembly.cpp
-  Targets/X86.cpp
-  Targets/XCore.cpp
   VarBypassDetector.cpp
-
   DEPENDS
   vt_gen
   intrinsics_gen
@@ -170,4 +133,16 @@ add_clang_library(clangCodeGen
   clangFrontend
   clangLex
   clangSerialization
+  clangCodeGenTargetBuiltins
+  clangCodeGenTargets
+  )
+
+  target_include_directories(clangCodeGen
+  PUBLIC
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/TargetBuiltins
+  ${CMAKE_CURRENT_SOURCE_DIR}/Targets
   )
+
+  add_subdirectory(TargetBuiltins)
+  add_subdirectory(Targets)
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..577fee05d4af6 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1,4 +1,4 @@
-//===------- AMDCPU.cpp - Emit LLVM Code for builtins ---------------------===//
+//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt b/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt
new file mode 100644
index 0000000000000..8526c063b4593
--- /dev/null
+++ b/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt
@@ -0,0 +1,19 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+add_clang_library(clangCodeGenTargetBuiltins STATIC
+  ARM.cpp
+  AMDGPU.cpp
+  Hexagon.cpp
+  NVPTX.cpp
+  PPC.cpp
+  RISCV.cpp
+  SPIR.cpp
+  SystemZ.cpp
+  WebAssembly.cpp
+  X86.cpp
+)
+
+target_link_libraries(clangCodeGenTargetBuiltins
+  PRIVATE
+  clangCodeGen
+)
diff --git a/clang/lib/CodeGen/Targets/CMakeLists.txt b/clang/lib/CodeGen/Targets/CMakeLists.txt
new file mode 100644
index 0000000000000..fd79b6191b379
--- /dev/null
+++ b/clang/lib/CodeGen/Targets/CMakeLists.txt
@@ -0,0 +1,35 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+add_clang_library(clangCodeGenTargets STATIC
+  AArch64.cpp
+  AMDGPU.cpp
+  ARC.cpp
+  ARM.cpp
+  AVR.cpp
+  BPF.cpp
+  CSKY.cpp
+  DirectX.cpp
+  Hexagon.cpp
+  Lanai.cpp
+  LoongArch.cpp
+  M68k.cpp
+  MSP430.cpp
+  Mips.cpp
+  NVPTX.cpp
+  PNaCl.cpp
+  PPC.cpp
+  RISCV.cpp
+  SPIR.cpp
+  Sparc.cpp
+  SystemZ.cpp
+  TCE.cpp
+  VE.cpp
+  WebAssembly.cpp
+  X86.cpp
+  XCore.cpp
+)
+
+target_link_libraries(clangCodeGenTargets
+  PRIVATE
+  clangCodeGen
+)

From 514536731f7adb05ab6ffaaf9f2912b8df418b75 Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya
Date: Mon, 31 Mar 2025 11:29:09 -0700
Subject: [PATCH 0132/1029] [bazel] Add missing dep after
 e4b9486056fab7a262fdafbe70acf393c9767d12

---
 utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index c1e96047c6256..989d944fb5aa2 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1224,6 +1224,7 @@ cc_library(
         ":lex",
         ":parse_attr_gen",
         ":sema",
+        "//llvm:FrontendHLSL",
         "//llvm:FrontendOpenMP",
         "//llvm:MC",
         "//llvm:MCParser",

From a415b7f86e8c50e4e789d4651f3231d5078c5859 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Mon, 31 Mar 2025 13:36:32 -0500
Subject: [PATCH 0133/1029] [WebAssembly] Add more lowerings for
 wide-arithmetic (#132430)

This commit is the result of investigation and discussion on
WebAssembly/wide-arithmetic#6 where alternatives to the `i64.add128`
instruction were discussed but ultimately deferred to a future proposal.
In spite of this, though, I wanted to apply a few minor changes to the
LLVM backend here for when `wide-arithmetic` is enabled:

* A lowering for the `ISD::UADDO` node is added which uses `add128`
  where the upper bits of the two operands are constant zeros and the
  result of the 128-bit addition is the result of the overflowing
  addition.
* The high bits of an `I64_ADD128` node are now flagged as "known zero"
  if the upper bits of the inputs are also zero, assisting this `UADDO`
  lowering to ensure the backend knows that the carry result is a 1-bit
  result.

A few tests were then added to showcase various lowerings for various
operations that can be done with wide-arithmetic. They don't all
optimize especially well at this time, but I wanted to add them as a
reference here regardless, to have them on hand for future evaluations
if necessary.
---
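For reference, the unsigned-overflow pattern that the new `UADDO` lowering
targets, shown as a minimal IR sketch; the function name is illustrative,
but the shape mirrors the `add_wide_u` test added below:

  declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64)

  define { i64, i1 } @uaddo_sketch(i64 %a, i64 %b) {
    ; With wide-arithmetic enabled, this call selects to a single
    ; i64.add128 whose upper halves are constant zeros, so the high half
    ; of the result is the 1-bit carry (the i1 overflow flag).
    %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
    ret { i64, i1 } %pair
  }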
They don't all optimize super well at this time but I wanted to add them as a reference here regardless to have them on-hand for future evaluations if necessary. --- .../WebAssembly/WebAssemblyISelLowering.cpp | 44 +++++- .../WebAssembly/WebAssemblyISelLowering.h | 1 + .../CodeGen/WebAssembly/wide-arithmetic.ll | 134 ++++++++++++++++++ 3 files changed, 176 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 9ae46e709d823..82d3b8e292e60 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -170,6 +170,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::SUB, MVT::i128, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); } if (Subtarget->hasNontrappingFPToInt()) @@ -1109,6 +1110,18 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode( } } } + + // For 128-bit addition if the upper bits are all zero then it's known that + // the upper bits of the result will have all bits guaranteed zero except the + // first. + case WebAssemblyISD::I64_ADD128: + if (Op.getResNo() == 1) { + SDValue LHS_HI = Op.getOperand(1); + SDValue RHS_HI = Op.getOperand(3); + if (isNullConstant(LHS_HI) && isNullConstant(RHS_HI)) + Known.Zero.setBitsFrom(1); + } + break; } } @@ -1678,6 +1691,8 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: return LowerMUL_LOHI(Op, DAG); + case ISD::UADDO: + return LowerUADDO(Op, DAG); } } @@ -1794,10 +1809,33 @@ SDValue WebAssemblyTargetLowering::LowerMUL_LOHI(SDValue Op, } SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); - SDValue Hi = + SDValue Lo = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::i64, MVT::i64), LHS, RHS); - SDValue Lo(Hi.getNode(), 1); - SDValue Ops[] = {Hi, Lo}; + SDValue Hi(Lo.getNode(), 1); + SDValue Ops[] = {Lo, Hi}; + return DAG.getMergeValues(Ops, DL); +} + +// Lowers `UADDO` intrinsics to an `i64.add128` instruction when it's enabled. +// +// This enables generating a single wasm instruction for this operation where +// the upper half of both operands are constant zeros. The upper half of the +// result is then whether the overflow happened. 
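+//
+// Rough sketch of the lowering (illustrative, not verbatim output):
+//   (uaddo %a, %b)
+// becomes
+//   (%lo, %hi) = I64_ADD128 %a, 0, %b, 0
+// where %lo is the 64-bit sum and %hi, known to be 0 or 1 thanks to the
+// computeKnownBitsForTargetNode() update above, is truncated to form the
+// i32 overflow flag.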
+SDValue WebAssemblyTargetLowering::LowerUADDO(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->hasWideArithmetic()); + assert(Op.getValueType() == MVT::i64); + assert(Op.getOpcode() == ISD::UADDO); + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue Result = + DAG.getNode(WebAssemblyISD::I64_ADD128, DL, + DAG.getVTList(MVT::i64, MVT::i64), LHS, Zero, RHS, Zero); + SDValue CarryI64(Result.getNode(), 1); + SDValue CarryI32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, CarryI64); + SDValue Ops[] = {Result, CarryI32}; return DAG.getMergeValues(Ops, DL); } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 90d31e38a7076..72401a7a259c0 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -133,6 +133,7 @@ class WebAssemblyTargetLowering final : public TargetLowering { SDValue LowerStore(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL_LOHI(SDValue Op, SelectionDAG &DAG) const; SDValue Replace128Op(SDNode *N, SelectionDAG &DAG) const; + SDValue LowerUADDO(SDValue Op, SelectionDAG &DAG) const; // Custom DAG combine hooks SDValue diff --git a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll index deff551d0eabd..71974b012a2b6 100644 --- a/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll +++ b/llvm/test/CodeGen/WebAssembly/wide-arithmetic.ll @@ -130,3 +130,137 @@ define i64 @mul_i128_only_lo(i128 %a, i128 %b) { %d = trunc i128 %c to i64 ret i64 %d } + +declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) +declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) + +; This is a codegen test to see the effect of overflowing adds on signed +; integers with wide-arithmetic enabled. At this time it doesn't actually +; generate anything differently than without wide-arithmetic but this has also +; been useful for evaluating the proposal. +define { i64, i1 } @add_wide_s(i64 %a, i64 %b) { +; CHECK-LABEL: add_wide_s: +; CHECK: .functype add_wide_s (i32, i64, i64) -> () +; CHECK-NEXT: .local i64 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i64.add +; CHECK-NEXT: local.tee 3 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: i64.lt_s +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.lt_s +; CHECK-NEXT: i32.xor +; CHECK-NEXT: i32.store8 8 +; CHECK-NEXT: # fallthrough-return + %pair = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) + ret { i64, i1 } %pair +} + +define { i64, i1 } @add_wide_u(i64 %a, i64 %b) { +; CHECK-LABEL: add_wide_u: +; CHECK: .functype add_wide_u (i32, i64, i64) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: i64.add128 +; CHECK-NEXT: local.set 1 +; CHECK-NEXT: local.set 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.store8 8 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) + ret { i64, i1 } %pair +} + +; This is a model of a hypothetical `i64.add_wide3_u` instruction using LLVM +; intrinsics. 
In theory this should optimize better (to the equivalent below) +; but it doesn't currently. +define { i64, i64 } @add_wide3_u_via_intrinsics(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: add_wide3_u_via_intrinsics: +; CHECK: .functype add_wide3_u_via_intrinsics (i32, i64, i64, i64) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: i64.add128 +; CHECK-NEXT: local.set 2 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: i64.add128 +; CHECK-NEXT: local.set 1 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.add +; CHECK-NEXT: i64.store 8 +; CHECK-NEXT: # fallthrough-return + %pair = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) + %t0 = extractvalue { i64, i1 } %pair, 0 + %carry1 = extractvalue { i64, i1 } %pair, 1 + + %pair2 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %t0, i64 %c) + %ret1 = extractvalue { i64, i1 } %pair2, 0 + %carry2 = extractvalue { i64, i1 } %pair2, 1 + + %carry1_64 = zext i1 %carry1 to i64 + %carry2_64 = zext i1 %carry2 to i64 + %ret2 = add i64 %carry1_64, %carry2_64 + + %r0 = insertvalue { i64, i64 } poison, i64 %ret1, 0 + %r1 = insertvalue { i64, i64 } %r0, i64 %ret2, 1 + ret { i64, i64 } %r1 +} + +; This is a model of a hypothetical `i64.add_wide3_u` instruction using 128-bit +; integer addition. This optimizes better than the above currently. +define { i64, i64 } @add_wide3_u_via_i128(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: add_wide3_u_via_i128: +; CHECK: .functype add_wide3_u_via_i128 (i32, i64, i64, i64) -> () +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: i64.add128 +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i64.const 0 +; CHECK-NEXT: i64.add128 +; CHECK-NEXT: local.set 1 +; CHECK-NEXT: local.set 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.store 8 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: # fallthrough-return + %a128 = zext i64 %a to i128 + %b128 = zext i64 %b to i128 + %c128 = zext i64 %c to i128 + %t0 = add i128 %a128, %b128 + %t1 = add i128 %t0, %c128 + %result = trunc i128 %t1 to i64 + %t2 = lshr i128 %t1, 64 + %carry = trunc i128 %t2 to i64 + + %ret0 = insertvalue { i64, i64 } poison, i64 %result, 0 + %ret1 = insertvalue { i64, i64 } %ret0, i64 %carry, 1 + ret { i64, i64 } %ret1 +} From c1bf5e62877cf3a769886505a687def47cd65e05 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Mon, 31 Mar 2025 14:39:20 -0400 Subject: [PATCH 0134/1029] [DirectX] Fix build break caused by bitcode writer changes (#133770) commit: https://github.com/llvm/llvm-project/commit/68947342b75cc71f3ac9041d11db086d8d074336 added a new `writeDIFixedPointType` function. However, `DIFixedPointType` is not supported in DXIL so we need to add a fail fast case for this to fix the build. 
this change fixes a build break introduced by https://github.com/llvm/llvm-project/pull/129596 --- llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index ea33ee5759cb7..2a736c91c05c7 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -237,6 +237,11 @@ class DXILBitcodeWriter { SmallVectorImpl &Record, unsigned Abbrev); void writeDIBasicType(const DIBasicType *N, SmallVectorImpl &Record, unsigned Abbrev); + void writeDIFixedPointType(const DIFixedPointType *N, + SmallVectorImpl &Record, + unsigned Abbrev) { + llvm_unreachable("DXIL cannot contain DIFixedPointType Nodes"); + } void writeDIStringType(const DIStringType *N, SmallVectorImpl &Record, unsigned Abbrev) { llvm_unreachable("DXIL cannot contain DIStringType Nodes"); From 0ac8cb1b3df724f549a62f6b277745af3d50b23a Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Mon, 31 Mar 2025 11:41:43 -0700 Subject: [PATCH 0135/1029] [flang] Recognize fir.pack_array in LoopVersioning. (#133191) This change enables LoopVersioning when `fir.pack_array` is met in the def-use chain. It fixes a couple of huge performance regressions caused by enabling `-frepack-arrays`. --- flang/docs/ArrayRepacking.md | 2 + .../Optimizer/Transforms/LoopVersioning.cpp | 31 ++++++++++---- .../loop-versioning-with-repack-arrays.fir | 40 +++++++++++++++++++ 3 files changed, 65 insertions(+), 8 deletions(-) create mode 100644 flang/test/Transforms/loop-versioning-with-repack-arrays.fir diff --git a/flang/docs/ArrayRepacking.md b/flang/docs/ArrayRepacking.md index f22e26ce49738..87cfc5d1bb4bc 100755 --- a/flang/docs/ArrayRepacking.md +++ b/flang/docs/ArrayRepacking.md @@ -432,6 +432,8 @@ There is an existing optimization pass (controlled via `-f[no-]version-loops-for The array repacking is targeting better data cache utilization, and is not intended to enable more unit-strided vectorization for the assumed-shape arrays. At the same time, combining array repacking with the loop versioning may provide better performance for programs where the actual array arguments are non-contiguous, but then their repacked copies can be accessed using unit strides. +It is suggested that the LoopVersioning pass is run before the lowering of `fir.pack_array` and `fir.unpack_array` operations, and recognizes `fir.pack_array` on the path from `fir.declare` to the function entry block argument. The pass generates the dynamic contiguity checks, and multiversions the loops. In case the repacking actually happens, the most optimal versions of the loops are executed. + In cases where `fir.pack_array` is statically known to produce a copy that is contiguous in the innermost dimension, the loop versioning pass can skip the generation of the dynamic checks. 
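+
+For example (an illustrative scenario): when a caller passes a non-contiguous section such as `x(1:n:2, :)` to an assumed-shape dummy argument, `fir.pack_array` creates a contiguous copy, and the loop version selected by the dynamic check can then access that copy with unit stride in the innermost dimension.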
 ### Driver: user options
---

diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
index 1f3495569c9dd..42e149bb3dba2 100644
--- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
+++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp
@@ -184,13 +184,28 @@ getRankAndElementSize(const fir::KindMapping &kindMap,
   return {0, 0};
 }

-/// if a value comes from a fir.declare, follow it to the original source,
-/// otherwise return the value
-static mlir::Value unwrapFirDeclare(mlir::Value val) {
-  // fir.declare is for source code variables. We don't have declares of
-  // declares
-  if (fir::DeclareOp declare = val.getDefiningOp<fir::DeclareOp>())
-    return declare.getMemref();
+/// If a value comes from a fir.declare of fir.pack_array,
+/// follow it to the original source, otherwise return the value.
+static mlir::Value unwrapPassThroughOps(mlir::Value val) {
+  // Instead of unwrapping fir.declare, we may try to start
+  // the analysis in this pass from fir.declare's instead
+  // of the function entry block arguments. This way the loop
+  // versioning would work even after FIR inlining.
+  while (true) {
+    if (fir::DeclareOp declare = val.getDefiningOp<fir::DeclareOp>()) {
+      val = declare.getMemref();
+      continue;
+    }
+    // fir.pack_array might be met before fir.declare - this is how
+    // it is originally generated.
+    // It might also be met after fir.declare - after the optimization
+    // passes that sink fir.pack_array closer to the uses.
+    if (auto packArray = val.getDefiningOp<fir::PackArrayOp>()) {
+      val = packArray.getArray();
+      continue;
+    }
+    break;
+  }
   return val;
 }

@@ -242,7 +257,7 @@ static mlir::Value unwrapReboxOp(mlir::Value val) {
 /// normalize a value (removing fir.declare and fir.rebox) so that we can
 /// more conveniently spot values which came from function arguments
 static mlir::Value normaliseVal(mlir::Value val) {
-  return unwrapFirDeclare(unwrapReboxOp(val));
+  return unwrapPassThroughOps(unwrapReboxOp(val));
 }

 /// some FIR operations accept a fir.shape, a fir.shift or a fir.shapeshift.

diff --git a/flang/test/Transforms/loop-versioning-with-repack-arrays.fir b/flang/test/Transforms/loop-versioning-with-repack-arrays.fir
new file mode 100644
index 0000000000000..7a2cac4b56a24
--- /dev/null
+++ b/flang/test/Transforms/loop-versioning-with-repack-arrays.fir
@@ -0,0 +1,40 @@
+// RUN: fir-opt --loop-versioning %s | FileCheck %s
+
+// Check that LoopVersioning kicks in when there is fir.pack_array
+// in between fir.declare and the block argument.
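+// The CHECK lines below verify that the pass looks through
+// fir.declare(fir.pack_array(%arg0)) to the entry block argument, emits a
+// dynamic unit-stride check (byte stride equal to 4 for f32), and versions
+// the loop: fir.coordinate_of in the contiguous version, fir.array_coor in
+// the fallback.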
+ +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { + func.func @_QPtest(%arg0: !fir.box> {fir.bindc_name = "x"}) { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %cst = arith.constant 1.000000e+00 : f32 + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 stack whole : (!fir.box>) -> !fir.box> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtestEx"} : (!fir.box>, !fir.dscope) -> !fir.box> + %3 = fir.rebox %2 : (!fir.box>) -> !fir.box> + %4:3 = fir.box_dims %3, %c0 : (!fir.box>, index) -> (index, index, index) + fir.do_loop %arg1 = %c1 to %4#1 step %c1 unordered { + %5 = fir.array_coor %2 %arg1 : (!fir.box>, index) -> !fir.ref + fir.store %cst to %5 : !fir.ref + } + fir.unpack_array %1 to %arg0 stack : !fir.box> + return + } +} +// CHECK-LABEL: func.func @_QPtest( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_5:.*]] = fir.pack_array %[[VAL_0]] stack whole : (!fir.box>) -> !fir.box> +// CHECK: %[[VAL_6:.*]] = fir.declare %[[VAL_5]] dummy_scope %{{.*}} {uniq_name = "_QFtestEx"} : (!fir.box>, !fir.dscope) -> !fir.box> +// CHECK: %[[VAL_10:.*]]:3 = fir.box_dims %[[VAL_6]], %{{.*}} : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_11:.*]] = arith.constant 4 : index +// CHECK: %[[VAL_12:.*]] = arith.cmpi eq, %[[VAL_10]]#2, %[[VAL_11]] : index +// CHECK: fir.if %[[VAL_12]] { +// CHECK: fir.do_loop {{.*}} { +// CHECK: fir.coordinate_of {{.*}} : (!fir.ref>, index) -> !fir.ref +// CHECK: } +// CHECK: } else { +// CHECK: fir.do_loop {{.*}} { +// CHECK: fir.array_coor {{.*}} : (!fir.box>, index) -> !fir.ref +// CHECK: } +// CHECK: } +// CHECK: fir.unpack_array %[[VAL_5]] to %[[VAL_0]] stack : !fir.box> From 5f268d04f9898cb0f8d4a1371a7b22dc3c35e5fc Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Mon, 31 Mar 2025 11:42:17 -0700 Subject: [PATCH 0136/1029] [flang] Code generation for fir.pack/unpack_array. (#132080) The code generation relies on `ShallowCopyDirect` runtime to copy data between the original and the temporary arrays (both directions). The allocations are done by the compiler generated code. The heap allocations could have been passed to `ShallowCopy` runtime, but I decided to expose the allocations so that the temporary descriptor passed to `ShallowCopyDirect` has `nocapture` - maybe this will be better for LLVM optimizations. 
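
As a rough illustration (simplified from the tests added below), a dummy
repacked with

    %1 = fir.pack_array %arg0 heap innermost
        : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>>

expands into an is_present guard, a contiguity-and-non-empty check, the
allocation of a temporary (fir.allocmem or fir.alloca), a
_FortranAShallowCopyDirect call into that temporary, and a fir.rebox that
reattaches the original lower bounds. fir.unpack_array expands into the
reverse _FortranAShallowCopyDirect copy plus fir.freemem for heap
temporaries.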
--- .../flang/Optimizer/Builder/FIRBuilder.h | 54 + .../flang/Optimizer/CodeGen/CGPasses.td | 11 + .../include/flang/Optimizer/CodeGen/CodeGen.h | 1 + flang/lib/Optimizer/Builder/FIRBuilder.cpp | 93 ++ flang/lib/Optimizer/Builder/HLFIRTools.cpp | 26 +- flang/lib/Optimizer/CodeGen/CMakeLists.txt | 1 + .../Optimizer/CodeGen/LowerRepackArrays.cpp | 330 +++++ .../HLFIR/Transforms/BufferizeHLFIR.cpp | 76 +- flang/lib/Optimizer/Passes/Pipelines.cpp | 1 + flang/test/Driver/bbc-mlir-pass-pipeline.f90 | 1 + .../test/Driver/mlir-debug-pass-pipeline.f90 | 1 + flang/test/Driver/mlir-pass-pipeline.f90 | 1 + flang/test/Fir/basic-program.fir | 1 + flang/test/HLFIR/elemental-codegen.fir | 6 +- flang/test/Transforms/lower-repack-arrays.fir | 1141 +++++++++++++++++ 15 files changed, 1665 insertions(+), 79 deletions(-) create mode 100644 flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp create mode 100644 flang/test/Transforms/lower-repack-arrays.fir diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index 003b4358572c1..1583cfb3f5b51 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -268,6 +268,40 @@ class FirOpBuilder : public mlir::OpBuilder, public mlir::OpBuilder::Listener { mlir::ValueRange lenParams = {}, llvm::ArrayRef attrs = {}); + /// Sample genDeclare callback for createArrayTemp() below. + /// It creates fir.declare operation using the given operands. + /// \p memref is the base of the allocated temporary, + /// which may be !fir.ref> or !fir.ref>. + static mlir::Value genTempDeclareOp(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value memref, + llvm::StringRef name, mlir::Value shape, + llvm::ArrayRef typeParams, + fir::FortranVariableFlagsAttr attrs); + + /// Create a temporary array with the given \p arrayType, + /// \p shape, \p extents and \p typeParams. An optional + /// \p polymorphicMold specifies the entity which dynamic type + /// has to be used for the allocation. + /// \p genDeclare callback generates a declare operation + /// for the created temporary. FIR passes may use genTempDeclareOp() + /// function above that creates fir.declare. + /// HLFIR passes may provide their own callback that generates + /// hlfir.declare. Some passes may provide a callback that + /// just passes through the base of the temporary. + /// If \p useStack is true, the function will try to do the allocation + /// in stack memory (which is not always possible currently). + /// The first return value is the base of the temporary object, + /// which may be !fir.ref> or !fir.ref>. + /// The second return value is true, if the actual allocation + /// was done in heap memory. + std::pair + createArrayTemp(mlir::Location loc, fir::SequenceType arrayType, + mlir::Value shape, llvm::ArrayRef extents, + llvm::ArrayRef typeParams, + const std::function &genDeclare, + mlir::Value polymorphicMold, bool useStack = false, + llvm::StringRef tmpName = ".tmp.array"); + /// Create an LLVM stack save intrinsic op. Returns the saved stack pointer. /// The stack address space is fetched from the data layout of the current /// module. @@ -596,6 +630,15 @@ class FirOpBuilder : public mlir::OpBuilder, public mlir::OpBuilder::Listener { return result; } + /// Compare two pointer-like values using the given predicate. 
+ mlir::Value genPtrCompare(mlir::Location loc, + mlir::arith::CmpIPredicate predicate, + mlir::Value ptr1, mlir::Value ptr2) { + ptr1 = createConvert(loc, getIndexType(), ptr1); + ptr2 = createConvert(loc, getIndexType(), ptr2); + return create(loc, predicate, ptr1, ptr2); + } + private: /// Set attributes (e.g. FastMathAttr) to \p op operation /// based on the current attributes setting. @@ -850,6 +893,17 @@ llvm::SmallVector deduceOptimalExtents(mlir::ValueRange extents1, /// %result1 = arith.select %p4, %c0, %e1 : index llvm::SmallVector updateRuntimeExtentsForEmptyArrays( fir::FirOpBuilder &builder, mlir::Location loc, mlir::ValueRange extents); + +/// Given \p box of type fir::BaseBoxType representing an array, +/// the function generates code to fetch the lower bounds, +/// the extents and the strides from the box. The values are returned via +/// \p lbounds, \p extents and \p strides. +void genDimInfoFromBox(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value box, + llvm::SmallVectorImpl *lbounds, + llvm::SmallVectorImpl *extents, + llvm::SmallVectorImpl *strides); + } // namespace fir::factory #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H diff --git a/flang/include/flang/Optimizer/CodeGen/CGPasses.td b/flang/include/flang/Optimizer/CodeGen/CGPasses.td index 2e097faec5403..df0ecf5540776 100644 --- a/flang/include/flang/Optimizer/CodeGen/CGPasses.td +++ b/flang/include/flang/Optimizer/CodeGen/CGPasses.td @@ -99,4 +99,15 @@ def BoxedProcedurePass : Pass<"boxed-procedure", "mlir::ModuleOp"> { ]; } +def LowerRepackArraysPass : Pass<"lower-repack-arrays", "mlir::ModuleOp"> { + let summary = "Convert fir.pack/unpack_array to other FIR operations"; + let description = [{ + Convert fir.pack/unpack_array operations to other FIR operations + and Fortran runtime calls that implement the semantics + of packing/unpacking. + }]; + let dependentDialects = ["fir::FIROpsDialect", "mlir::arith::ArithDialect", + "mlir::func::FuncDialect"]; +} + #endif // FORTRAN_OPTIMIZER_CODEGEN_FIR_PASSES diff --git a/flang/include/flang/Optimizer/CodeGen/CodeGen.h b/flang/include/flang/Optimizer/CodeGen/CodeGen.h index 255b1950c8425..0398d0f248e08 100644 --- a/flang/include/flang/Optimizer/CodeGen/CodeGen.h +++ b/flang/include/flang/Optimizer/CodeGen/CodeGen.h @@ -26,6 +26,7 @@ struct NameUniquer; #define GEN_PASS_DECL_CODEGENREWRITE #define GEN_PASS_DECL_TARGETREWRITEPASS #define GEN_PASS_DECL_BOXEDPROCEDUREPASS +#define GEN_PASS_DECL_LOWERREPACKARRAYSPASS #include "flang/Optimizer/CodeGen/CGPasses.h.inc" /// FIR to LLVM translation pass options. 
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index b7f8a8d3a9d56..fdc155ef2ef18 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -11,6 +11,7 @@ #include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" #include "flang/Optimizer/Builder/MutableBox.h" +#include "flang/Optimizer/Builder/Runtime/Allocatable.h" #include "flang/Optimizer/Builder/Runtime/Assign.h" #include "flang/Optimizer/Builder/Runtime/Derived.h" #include "flang/Optimizer/Builder/Todo.h" @@ -362,6 +363,72 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary( name, dynamicLength, dynamicShape, attrs); } +std::pair fir::FirOpBuilder::createArrayTemp( + mlir::Location loc, fir::SequenceType arrayType, mlir::Value shape, + llvm::ArrayRef extents, llvm::ArrayRef typeParams, + const std::function &genDeclare, + mlir::Value polymorphicMold, bool useStack, llvm::StringRef tmpName) { + if (polymorphicMold) { + // Create *allocated* polymorphic temporary using the dynamic type + // of the mold and the provided shape/extents. The created temporary + // array will be written element per element, that is why it has to be + // allocated. + mlir::Type boxHeapType = fir::HeapType::get(arrayType); + mlir::Value alloc = fir::factory::genNullBoxStorage( + *this, loc, fir::ClassType::get(boxHeapType)); + fir::FortranVariableFlagsAttr declAttrs = + fir::FortranVariableFlagsAttr::get( + getContext(), fir::FortranVariableFlagsEnum::allocatable); + + mlir::Value base = genDeclare(*this, loc, alloc, tmpName, + /*shape=*/nullptr, typeParams, declAttrs); + + int rank = extents.size(); + fir::runtime::genAllocatableApplyMold(*this, loc, alloc, polymorphicMold, + rank); + if (!extents.empty()) { + mlir::Type idxTy = getIndexType(); + mlir::Value one = createIntegerConstant(loc, idxTy, 1); + unsigned dim = 0; + for (mlir::Value extent : extents) { + mlir::Value dimIndex = createIntegerConstant(loc, idxTy, dim++); + fir::runtime::genAllocatableSetBounds(*this, loc, alloc, dimIndex, one, + extent); + } + } + if (!typeParams.empty()) { + // We should call AllocatableSetDerivedLength() here. + // TODO: does the mold provide the length parameters or + // the operation itself or should they be in sync? 
+ TODO(loc, "polymorphic type with length parameters"); + } + fir::runtime::genAllocatableAllocate(*this, loc, alloc); + + return {base, /*isHeapAllocation=*/true}; + } + mlir::Value allocmem; + if (useStack) + allocmem = createTemporary(loc, arrayType, tmpName, extents, typeParams); + else + allocmem = + createHeapTemporary(loc, arrayType, tmpName, extents, typeParams); + mlir::Value base = genDeclare(*this, loc, allocmem, tmpName, shape, + typeParams, fir::FortranVariableFlagsAttr{}); + return {base, !useStack}; +} + +mlir::Value fir::FirOpBuilder::genTempDeclareOp( + fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value memref, + llvm::StringRef name, mlir::Value shape, + llvm::ArrayRef typeParams, + fir::FortranVariableFlagsAttr fortranAttrs) { + auto nameAttr = mlir::StringAttr::get(builder.getContext(), name); + return builder.create(loc, memref.getType(), memref, shape, + typeParams, + /*dummy_scope=*/nullptr, nameAttr, + fortranAttrs, cuf::DataAttributeAttr{}); +} + mlir::Value fir::FirOpBuilder::genStackSave(mlir::Location loc) { mlir::Type voidPtr = mlir::LLVM::LLVMPointerType::get( getContext(), fir::factory::getAllocaAddressSpace(&getDataLayout())); @@ -1825,3 +1892,29 @@ llvm::SmallVector fir::factory::updateRuntimeExtentsForEmptyArrays( } return newExtents; } + +void fir::factory::genDimInfoFromBox( + fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value box, + llvm::SmallVectorImpl *lbounds, + llvm::SmallVectorImpl *extents, + llvm::SmallVectorImpl *strides) { + auto boxType = mlir::dyn_cast(box.getType()); + assert(boxType && "must be a box"); + if (!lbounds && !extents && !strides) + return; + + unsigned rank = fir::getBoxRank(boxType); + assert(rank != 0 && "must be an array of known rank"); + mlir::Type idxTy = builder.getIndexType(); + for (unsigned i = 0; i < rank; ++i) { + mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); + auto dimInfo = + builder.create(loc, idxTy, idxTy, idxTy, box, dim); + if (lbounds) + lbounds->push_back(dimInfo.getLowerBound()); + if (extents) + extents->push_back(dimInfo.getExtent()); + if (strides) + strides->push_back(dimInfo.getByteStride()); + } +} diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 85fd742db6beb..06a3e177da1d0 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -95,24 +95,6 @@ getExplicitLbounds(fir::FortranVariableOpInterface var) { return {}; } -static void -genLboundsAndExtentsFromBox(mlir::Location loc, fir::FirOpBuilder &builder, - hlfir::Entity boxEntity, - llvm::SmallVectorImpl &lbounds, - llvm::SmallVectorImpl *extents) { - assert(mlir::isa(boxEntity.getType()) && "must be a box"); - mlir::Type idxTy = builder.getIndexType(); - const int rank = boxEntity.getRank(); - for (int i = 0; i < rank; ++i) { - mlir::Value dim = builder.createIntegerConstant(loc, idxTy, i); - auto dimInfo = builder.create(loc, idxTy, idxTy, idxTy, - boxEntity, dim); - lbounds.push_back(dimInfo.getLowerBound()); - if (extents) - extents->push_back(dimInfo.getExtent()); - } -} - static llvm::SmallVector getNonDefaultLowerBounds(mlir::Location loc, fir::FirOpBuilder &builder, hlfir::Entity entity) { @@ -128,8 +110,8 @@ getNonDefaultLowerBounds(mlir::Location loc, fir::FirOpBuilder &builder, if (entity.isMutableBox()) entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); llvm::SmallVector lowerBounds; - genLboundsAndExtentsFromBox(loc, builder, entity, lowerBounds, - /*extents=*/nullptr); + 
fir::factory::genDimInfoFromBox(builder, loc, entity, &lowerBounds,
+                                  /*extents=*/nullptr, /*strides=*/nullptr);
   return lowerBounds;
 }

@@ -1149,8 +1131,8 @@ static fir::ExtendedValue translateVariableToExtendedValue(
       variable.mayHaveNonDefaultLowerBounds()) {
     // This special case avoids generating two sets of identical
     // fir.box_dim to get both the lower bounds and extents.
-    genLboundsAndExtentsFromBox(loc, builder, variable, nonDefaultLbounds,
-                                &extents);
+    fir::factory::genDimInfoFromBox(builder, loc, variable, &nonDefaultLbounds,
+                                    &extents, /*strides=*/nullptr);
   } else {
     extents = getVariableExtents(loc, builder, variable);
     nonDefaultLbounds = getNonDefaultLowerBounds(loc, builder, variable);

diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
index 553c20bb85d38..f730c7fd03948 100644
--- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -4,6 +4,7 @@ add_flang_library(FIRCodeGen
   CodeGen.cpp
   CodeGenOpenMP.cpp
   FIROpPatterns.cpp
+  LowerRepackArrays.cpp
   PreCGRewrite.cpp
   TBAABuilder.cpp
   Target.cpp

diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
new file mode 100644
index 0000000000000..0acc034c47152
--- /dev/null
+++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp
@@ -0,0 +1,330 @@
+//===-- LowerRepackArrays.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass expands fir.pack_array and fir.unpack_array operations
+/// into sequences of other FIR operations and Fortran runtime calls.
+/// The pass uses structured control flow FIR operations such
+/// as fir.if, so its placement in the pipeline should guarantee
+/// further lowering of these operations.
+///
+/// A fir.pack_array operation is converted into a sequence of checks
+/// identifying whether an array needs to be copied into a contiguous
+/// temporary. When the checks pass, a new memory allocation is done
+/// for the temporary array (in either stack or heap memory).
+/// If `fir.pack_array` does not have the no_copy attribute, then
+/// the original array is shallow-copied into the temporary.
+///
+/// A fir.unpack_array operation is converted into a check
+/// of whether the original and the temporary arrays are different
+/// memory. When the check passes, the temporary array might be
+/// shallow-copied into the original array, and then the temporary
+/// array is deallocated (if it was allocated in stack memory,
+/// then there is no explicit deallocation).
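+///
+/// A condensed sketch of the fir.pack_array expansion (simplified; see
+/// flang/test/Transforms/lower-repack-arrays.fir for the exact form):
+///
+///   %res = fir.if %is_present -> (!fir.box<...>) {
+///     %do_pack = !is_contiguous(%box) && !is_empty(%box)
+///     fir.if %do_pack -> (!fir.box<...>) {
+///       // allocate the temporary, ShallowCopyDirect(temp, %box),
+///       // fir.rebox the temporary with the original lower bounds
+///     } else { /* original box */ }
+///   } else { /* original box */ }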
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/CodeGen/CodeGen.h"
+
+#include "flang/Optimizer/Builder/Character.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/MutableBox.h"
+#include "flang/Optimizer/Builder/Runtime/Allocatable.h"
+#include "flang/Optimizer/Builder/Runtime/Transformational.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace fir {
+#define GEN_PASS_DEF_LOWERREPACKARRAYSPASS
+#include "flang/Optimizer/CodeGen/CGPasses.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "lower-repack-arrays"
+
+namespace {
+class PackArrayConversion : public mlir::OpRewritePattern<fir::PackArrayOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::PackArrayOp op,
+                  mlir::PatternRewriter &rewriter) const override;
+
+private:
+  static constexpr llvm::StringRef bufferName = ".repacked";
+
+  // Return value of fir::BaseBoxType that represents a temporary
+  // array created for the original box with given extents and
+  // type parameters. The new box has the default lower bounds.
+  // If useStack is true, then the temporary will be allocated
+  // in stack memory (when possible).
+  static mlir::Value allocateTempBuffer(fir::FirOpBuilder &builder,
+                                        mlir::Location loc, bool useStack,
+                                        mlir::Value origBox,
+                                        llvm::ArrayRef<mlir::Value> extents,
+                                        llvm::ArrayRef<mlir::Value> typeParams);
+
+  // Generate value of fir::BaseBoxType that represents the result
+  // of the given fir.pack_array operation. The original box
+  // is assumed to be present (though, it may represent an empty array).
+  static mlir::FailureOr<mlir::Value> genRepackedBox(fir::FirOpBuilder &builder,
+                                                     mlir::Location loc,
+                                                     fir::PackArrayOp packOp);
+};
+
+class UnpackArrayConversion
+    : public mlir::OpRewritePattern<fir::UnpackArrayOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(fir::UnpackArrayOp op,
+                  mlir::PatternRewriter &rewriter) const override;
+};
+} // anonymous namespace
+
+// Return true iff for the given original boxed array we can
+// allocate temporary memory in stack memory.
+// This function is used to synchronize allocation/deallocation
+// implied by fir.pack_array and fir.unpack_array, because
+// the presence of the stack attribute does not automatically
+// mean that the allocation is actually done in stack memory.
+// For example, we always do the heap allocation for polymorphic
+// types using Fortran runtime.
+// Adding the polymorphic mold to fir.alloca and then using
+// Fortran runtime to compute the allocation size could probably
+// resolve this limitation.
+static bool canAllocateTempOnStack(mlir::Value box) {
+  return !fir::isPolymorphicType(box.getType());
+}
+
+mlir::LogicalResult
+PackArrayConversion::matchAndRewrite(fir::PackArrayOp op,
+                                     mlir::PatternRewriter &rewriter) const {
+  mlir::Location loc = op.getLoc();
+  fir::FirOpBuilder builder(rewriter, op.getOperation());
+  if (op.getMaxSize() || op.getMaxElementSize() || op.getMinStride())
+    TODO(loc, "fir.pack_array with constraints");
+  if (op.getHeuristics() != fir::PackArrayHeuristics::None)
+    TODO(loc, "fir.pack_array with heuristics");
+
+  mlir::Value box = op.getArray();
+  auto boxType = mlir::cast<fir::BaseBoxType>(box.getType());
+
+  // For now we have to always check if the box is present.
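+  // (An absent box can occur, for example, when the repacked
+  // dummy argument is OPTIONAL.)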
+  auto isPresent =
+      builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), box);
+
+  fir::IfOp ifOp = builder.create<fir::IfOp>(loc, boxType, isPresent,
+                                             /*withElseRegion=*/true);
+  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+  // The box is present.
+  auto newBox = genRepackedBox(builder, loc, op);
+  if (mlir::failed(newBox))
+    return newBox;
+  builder.create<fir::ResultOp>(loc, *newBox);
+
+  // The box is not present. Return original box.
+  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+  builder.create<fir::ResultOp>(loc, box);
+
+  rewriter.replaceOp(op, ifOp.getResult(0));
+  return mlir::success();
+}
+
+mlir::Value PackArrayConversion::allocateTempBuffer(
+    fir::FirOpBuilder &builder, mlir::Location loc, bool useStack,
+    mlir::Value origBox, llvm::ArrayRef<mlir::Value> extents,
+    llvm::ArrayRef<mlir::Value> typeParams) {
+  auto tempType = mlir::cast<fir::SequenceType>(
+      fir::extractSequenceType(origBox.getType()));
+  assert(tempType.getDimension() == extents.size() &&
+         "number of extents does not match the rank");
+
+  mlir::Value shape = builder.genShape(loc, extents);
+  auto [base, isHeapAllocation] = builder.createArrayTemp(
+      loc, tempType, shape, extents, typeParams,
+      fir::FirOpBuilder::genTempDeclareOp,
+      fir::isPolymorphicType(origBox.getType()) ? origBox : nullptr, useStack,
+      bufferName);
+  // Make sure canAllocateTempOnStack() can recognize when
+  // the temporary is actually allocated on the stack
+  // by createArrayTemp(). Otherwise, we may miss dynamic
+  // deallocation when lowering fir.unpack_array.
+  if (useStack && canAllocateTempOnStack(origBox))
+    assert(!isHeapAllocation && "temp must have been allocated on the stack");
+
+  if (isHeapAllocation)
+    if (auto baseType = mlir::dyn_cast<fir::ReferenceType>(base.getType()))
+      if (mlir::isa<fir::BaseBoxType>(baseType.getEleTy()))
+        return builder.create<fir::LoadOp>(loc, base);
+
+  mlir::Type ptrType = base.getType();
+  mlir::Type tempBoxType = fir::BoxType::get(mlir::isa<fir::HeapType>(ptrType)
+                                                 ? ptrType
+                                                 : fir::unwrapRefType(ptrType));
+  mlir::Value newBox =
+      builder.createBox(loc, tempBoxType, base, shape, /*slice=*/nullptr,
+                        typeParams, /*tdesc=*/nullptr);
+  return newBox;
+}
+
+mlir::FailureOr<mlir::Value>
+PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder,
+                                    mlir::Location loc, fir::PackArrayOp op) {
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  mlir::Value box = op.getArray();
+  llvm::SmallVector<mlir::Value> typeParams(op.getTypeparams().begin(),
+                                            op.getTypeparams().end());
+  auto boxType = mlir::cast<fir::BaseBoxType>(box.getType());
+  mlir::Type indexType = builder.getIndexType();
+
+  // If type parameters are not specified by fir.pack_array,
+  // figure out how many of them we need to read from the box.
+  unsigned numTypeParams = 0;
+  if (typeParams.size() == 0) {
+    if (auto recordType =
+            mlir::dyn_cast<fir::RecordType>(boxType.unwrapInnerType()))
+      if (recordType.getNumLenParams() != 0)
+        TODO(loc,
+             "allocating temporary for a parameterized derived type array");
+
+    if (auto charType =
+            mlir::dyn_cast<fir::CharacterType>(boxType.unwrapInnerType())) {
+      if (charType.hasDynamicLen()) {
+        // Read one length parameter from the box.
+        numTypeParams = 1;
+      } else {
+        // Place the constant length into typeParams.
+        mlir::Value length =
+            builder.createIntegerConstant(loc, indexType, charType.getLen());
+        typeParams.push_back(length);
+      }
+    }
+  }
+
+  // Create a temporary iff the original is not contiguous and is not empty.
+  auto isNotContiguous = builder.genNot(
+      loc, builder.create<fir::IsContiguousBoxOp>(loc, box, op.getInnermost()));
+  auto dataAddr =
+      builder.create<fir::BoxAddrOp>(loc, fir::boxMemRefType(boxType), box);
+  auto isNotEmpty =
+      builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), dataAddr);
+  auto doPack =
+      builder.create<mlir::arith::AndIOp>(loc, isNotContiguous, isNotEmpty);
+
+  fir::IfOp ifOp =
+      builder.create<fir::IfOp>(loc, boxType, doPack, /*withElseRegion=*/true);
+
+  // Return original box.
+  builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+  builder.create<fir::ResultOp>(loc, box);
+
+  // Create a new box.
+  builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+
+  // Get lower bounds and extents from the box.
+  llvm::SmallVector<mlir::Value> lbounds, extents;
+  fir::factory::genDimInfoFromBox(builder, loc, box, &lbounds, &extents,
+                                  /*strides=*/nullptr);
+  // Get the type parameters from the box, if needed.
+  llvm::SmallVector<mlir::Value> assumedTypeParams;
+  if (numTypeParams != 0) {
+    if (auto charType =
+            mlir::dyn_cast<fir::CharacterType>(boxType.unwrapInnerType()))
+      if (charType.hasDynamicLen()) {
+        fir::factory::CharacterExprHelper charHelper(builder, loc);
+        mlir::Value len = charHelper.readLengthFromBox(box, charType);
+        typeParams.push_back(builder.createConvert(loc, indexType, len));
+      }
+
+    if (numTypeParams != typeParams.size())
+      return emitError(loc) << "failed to compute the type parameters for "
+                            << op.getOperation() << '\n';
+  }
+
+  mlir::Value tempBox =
+      allocateTempBuffer(builder, loc, op.getStack(), box, extents, typeParams);
+  if (!op.getNoCopy())
+    fir::runtime::genShallowCopy(builder, loc, tempBox, box,
+                                 /*resultIsAllocated=*/true);
+
+  // Set lower bounds after the original box.
+  mlir::Value shift = builder.genShift(loc, lbounds);
+  tempBox = builder.create<fir::ReboxOp>(loc, boxType, tempBox, shift,
+                                         /*slice=*/nullptr);
+  builder.create<fir::ResultOp>(loc, tempBox);
+
+  return ifOp.getResult(0);
+}
+
+mlir::LogicalResult
+UnpackArrayConversion::matchAndRewrite(fir::UnpackArrayOp op,
+                                       mlir::PatternRewriter &rewriter) const {
+  mlir::Location loc = op.getLoc();
+  fir::FirOpBuilder builder(rewriter, op.getOperation());
+  mlir::Type predicateType = builder.getI1Type();
+  mlir::Value tempBox = op.getTemp();
+  mlir::Value originalBox = op.getOriginal();
+
+  // For now we have to always check if the box is present.
+  auto isPresent =
+      builder.create<fir::IsPresentOp>(loc, predicateType, originalBox);
+
+  builder.genIfThen(loc, isPresent).genThen([&]() {
+    mlir::Type addrType =
+        fir::HeapType::get(fir::extractSequenceType(tempBox.getType()));
+    mlir::Value tempAddr =
+        builder.create<fir::BoxAddrOp>(loc, addrType, tempBox);
+    mlir::Value originalAddr =
+        builder.create<fir::BoxAddrOp>(loc, addrType, originalBox);
+
+    auto isNotSame = builder.genPtrCompare(loc, mlir::arith::CmpIPredicate::ne,
+                                           tempAddr, originalAddr);
+    builder.genIfThen(loc, isNotSame).genThen([&]() {
+      // Copy from temporary to the original.
+      if (!op.getNoCopy())
+        fir::runtime::genShallowCopy(builder, loc, originalBox, tempBox,
+                                     /*resultIsAllocated=*/true);
+
+      // Deallocate, if it was allocated in heap.
+      // Note that the stack attribute does not always mean
+      // that the allocation was actually done in stack memory.
+      // There are currently cases where we delegate the allocation
+      // to the runtime that uses heap memory, even when the stack
+      // attribute is set on fir.pack_array.
+      if (!op.getStack() || !canAllocateTempOnStack(originalBox))
+        builder.create<fir::FreeMemOp>(loc, tempAddr);
+    });
+  });
+  rewriter.eraseOp(op);
+  return mlir::success();
+}
+
+namespace {
+class LowerRepackArraysPass
+    : public fir::impl::LowerRepackArraysPassBase<LowerRepackArraysPass> {
+public:
+  using LowerRepackArraysPassBase<
+      LowerRepackArraysPass>::LowerRepackArraysPassBase;
+
+  void runOnOperation() override final {
+    auto *context = &getContext();
+    mlir::ModuleOp module = getOperation();
+    mlir::RewritePatternSet patterns(context);
+    patterns.insert<PackArrayConversion>(context);
+    patterns.insert<UnpackArrayConversion>(context);
+    mlir::GreedyRewriteConfig config;
+    config.enableRegionSimplification =
+        mlir::GreedySimplifyRegionLevel::Disabled;
+    (void)applyPatternsGreedily(module, std::move(patterns), config);
+  }
+};
+
+} // anonymous namespace

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
index 30e7ef7890953..8a36214def167 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp
@@ -105,60 +105,27 @@ static mlir::Value getBufferizedExprMustFreeFlag(mlir::Value bufferizedExpr) {
 static std::pair<hlfir::Entity, mlir::Value>
 createArrayTemp(mlir::Location loc, fir::FirOpBuilder &builder,
                 mlir::Type exprType, mlir::Value shape,
-                mlir::ValueRange extents, mlir::ValueRange lenParams,
+                llvm::ArrayRef<mlir::Value> extents,
+                llvm::ArrayRef<mlir::Value> lenParams,
                 std::optional<hlfir::Entity> polymorphicMold) {
-  mlir::Type sequenceType = hlfir::getFortranElementOrSequenceType(exprType);
-  llvm::StringRef tmpName{".tmp.array"};
-
-  if (polymorphicMold) {
-    // Create *allocated* polymorphic temporary using the dynamic type
-    // of the mold and the provided shape/extents. The created temporary
-    // array will be written element per element, that is why it has to be
-    // allocated.
-    mlir::Type boxHeapType = fir::HeapType::get(sequenceType);
-    mlir::Value alloc = fir::factory::genNullBoxStorage(
-        builder, loc, fir::ClassType::get(boxHeapType));
-    mlir::Value isHeapAlloc = builder.createBool(loc, true);
-    fir::FortranVariableFlagsAttr declAttrs =
-        fir::FortranVariableFlagsAttr::get(
-            builder.getContext(), fir::FortranVariableFlagsEnum::allocatable);
-
+  auto sequenceType = mlir::cast<fir::SequenceType>(
+      hlfir::getFortranElementOrSequenceType(exprType));
+
+  auto genTempDeclareOp =
+      [](fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value memref,
+         llvm::StringRef name, mlir::Value shape,
+         llvm::ArrayRef<mlir::Value> typeParams,
+         fir::FortranVariableFlagsAttr attrs) -> mlir::Value {
     auto declareOp =
-        builder.create<hlfir::DeclareOp>(loc, alloc, tmpName,
-                                         /*shape=*/nullptr, lenParams,
-                                         /*dummy_scope=*/nullptr, declAttrs);
-
-    int rank = extents.size();
-    fir::runtime::genAllocatableApplyMold(builder, loc, alloc,
-                                          polymorphicMold->getFirBase(), rank);
-    if (!extents.empty()) {
-      mlir::Type idxTy = builder.getIndexType();
-      mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
-      unsigned dim = 0;
-      for (mlir::Value extent : extents) {
-        mlir::Value dimIndex = builder.createIntegerConstant(loc, idxTy, dim++);
-        fir::runtime::genAllocatableSetBounds(builder, loc, alloc, dimIndex,
-                                              one, extent);
-      }
-    }
-    if (!lenParams.empty()) {
-      // We should call AllocatableSetDerivedLength() here.
-      // TODO: does the mold provide the length parameters or
-      // the operation itself or should they be in sync?
- TODO(loc, "polymorphic type with length parameters in HLFIR"); - } - fir::runtime::genAllocatableAllocate(builder, loc, alloc); - - return {hlfir::Entity{declareOp.getBase()}, isHeapAlloc}; - } + builder.create(loc, memref, name, shape, typeParams, + /*dummy_scope=*/nullptr, attrs); + return declareOp.getBase(); + }; - mlir::Value allocmem = builder.createHeapTemporary(loc, sequenceType, tmpName, - extents, lenParams); - auto declareOp = builder.create( - loc, allocmem, tmpName, shape, lenParams, - /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); - mlir::Value trueVal = builder.createBool(loc, true); - return {hlfir::Entity{declareOp.getBase()}, trueVal}; + auto [base, isHeapAlloc] = builder.createArrayTemp( + loc, sequenceType, shape, extents, lenParams, genTempDeclareOp, + polymorphicMold ? polymorphicMold->getFirBase() : nullptr); + return {hlfir::Entity{base}, builder.createBool(loc, isHeapAlloc)}; } /// Copy \p source into a new temporary and package the temporary into a @@ -786,9 +753,10 @@ struct ElementalOpConversion if (adaptor.getMold()) mold = getBufferizedExprStorage(adaptor.getMold()); auto extents = hlfir::getIndexExtents(loc, builder, shape); - auto [temp, cleanup] = - createArrayTemp(loc, builder, elemental.getType(), shape, extents, - adaptor.getTypeparams(), mold); + llvm::SmallVector typeParams(adaptor.getTypeparams().begin(), + adaptor.getTypeparams().end()); + auto [temp, cleanup] = createArrayTemp(loc, builder, elemental.getType(), + shape, extents, typeParams, mold); // If the box load is needed, we'd better place it outside // of the loop nest. temp = derefPointersAndAllocatables(loc, builder, temp); diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp index 3aea021e596f6..6ec19556625bc 100644 --- a/flang/lib/Optimizer/Passes/Pipelines.cpp +++ b/flang/lib/Optimizer/Passes/Pipelines.cpp @@ -198,6 +198,7 @@ void createDefaultFIROptimizerPassPipeline(mlir::PassManager &pm, pm.addPass(fir::createPolymorphicOpConversion()); pm.addPass(fir::createAssumedRankOpConversion()); + pm.addPass(fir::createLowerRepackArraysPass()); // Expand FIR operations that may use SCF dialect for their // implementation. This is a mandatory pass. pm.addPass(fir::createSimplifyFIROperations( diff --git a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 b/flang/test/Driver/bbc-mlir-pass-pipeline.f90 index 276ef818622a1..137c19608c38f 100644 --- a/flang/test/Driver/bbc-mlir-pass-pipeline.f90 +++ b/flang/test/Driver/bbc-mlir-pass-pipeline.f90 @@ -47,6 +47,7 @@ ! CHECK-NEXT: PolymorphicOpConversion ! CHECK-NEXT: AssumedRankOpConversion +! CHECK-NEXT: LowerRepackArraysPass ! CHECK-NEXT: SimplifyFIROperations ! CHECK-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] diff --git a/flang/test/Driver/mlir-debug-pass-pipeline.f90 b/flang/test/Driver/mlir-debug-pass-pipeline.f90 index 70fa0cf5ae47c..42a71b2d6adc3 100644 --- a/flang/test/Driver/mlir-debug-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-debug-pass-pipeline.f90 @@ -77,6 +77,7 @@ ! ALL-NEXT: PolymorphicOpConversion ! ALL-NEXT: AssumedRankOpConversion +! ALL-NEXT: LowerRepackArraysPass ! ALL-NEXT: SimplifyFIROperations ! 
ALL-NEXT: Pipeline Collection : ['fir.global', 'func.func', 'omp.declare_reduction', 'omp.private'] diff --git a/flang/test/Driver/mlir-pass-pipeline.f90 b/flang/test/Driver/mlir-pass-pipeline.f90 index 852764be1f136..45370895db397 100644 --- a/flang/test/Driver/mlir-pass-pipeline.f90 +++ b/flang/test/Driver/mlir-pass-pipeline.f90 @@ -101,6 +101,7 @@ ! ALL-NEXT: PolymorphicOpConversion ! ALL-NEXT: AssumedRankOpConversion +! ALL-NEXT: LowerRepackArraysPass ! ALL-NEXT: SimplifyFIROperations ! O2-NEXT: AddAliasTags diff --git a/flang/test/Fir/basic-program.fir b/flang/test/Fir/basic-program.fir index 90bff80da1915..ded42886aad44 100644 --- a/flang/test/Fir/basic-program.fir +++ b/flang/test/Fir/basic-program.fir @@ -99,6 +99,7 @@ func.func @_QQmain() { // PASSES-NEXT: PolymorphicOpConversion // PASSES-NEXT: AssumedRankOpConversion +// PASSES-NEXT: LowerRepackArraysPass // PASSES-NEXT: SimplifyFIROperations // PASSES-NEXT: AddAliasTags diff --git a/flang/test/HLFIR/elemental-codegen.fir b/flang/test/HLFIR/elemental-codegen.fir index 2443217f557f8..c05c05cfa0413 100644 --- a/flang/test/HLFIR/elemental-codegen.fir +++ b/flang/test/HLFIR/elemental-codegen.fir @@ -166,7 +166,6 @@ func.func @test_polymorphic(%arg0: !fir.class> {fir.bindc_ // CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_9]], %[[VAL_9]] : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_8]](%[[VAL_10]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.class>>> // CHECK: fir.store %[[VAL_11]] to %[[VAL_4]] : !fir.ref>>>> -// CHECK: %[[VAL_12:.*]] = arith.constant true // CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = ".tmp.array"} : (!fir.ref>>>>) -> (!fir.ref>>>>, !fir.ref>>>>) // CHECK: %[[RANK:.*]] = arith.constant 2 : i32 // CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> @@ -193,6 +192,7 @@ func.func @test_polymorphic(%arg0: !fir.class> {fir.bindc_ // CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_31]] : (!fir.ref>) -> !fir.ref // CHECK: %[[VAL_38:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_36]], %[[VAL_34]], %[[VAL_35]], %[[VAL_37]], %[[VAL_33]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_12:.*]] = arith.constant true // CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_13]]#0 : !fir.ref>>>> // CHECK: %[[VAL_40:.*]] = arith.constant 1 : index // CHECK: fir.do_loop %[[VAL_41:.*]] = %[[VAL_40]] to %[[EX1]] step %[[VAL_40]] unordered { @@ -250,7 +250,6 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_10]], %[[VAL_10]] : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_9]](%[[VAL_11]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.class>>> // CHECK: fir.store %[[VAL_12]] to %[[VAL_5]] : !fir.ref>>>> -// CHECK: %[[VAL_13:.*]] = arith.constant true // CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_5]] {fortran_attrs = #fir.var_attrs, uniq_name = ".tmp.array"} : (!fir.ref>>>>) -> (!fir.ref>>>>, !fir.ref>>>>) // CHECK: %[[VAL_15:.*]] = arith.constant 2 : i32 // CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>>) -> !fir.ref> @@ -277,6 +276,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_32]] : (!fir.ref>) -> !fir.ref // CHECK: %[[VAL_39:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_37]], %[[VAL_35]], %[[VAL_36]], 
%[[VAL_38]], %[[VAL_34]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_13:.*]] = arith.constant true // CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref>>>> // CHECK: %[[VAL_41:.*]] = arith.constant 1 : index // CHECK: fir.do_loop %[[VAL_42:.*]] = %[[VAL_41]] to %[[VAL_3]] step %[[VAL_41]] unordered { @@ -303,7 +303,6 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_60:.*]] = fir.shape %[[VAL_59]], %[[VAL_59]] : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_61:.*]] = fir.embox %[[VAL_58]](%[[VAL_60]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.class>>> // CHECK: fir.store %[[VAL_61]] to %[[VAL_4]] : !fir.ref>>>> -// CHECK: %[[VAL_62:.*]] = arith.constant true // CHECK: %[[VAL_63:.*]]:2 = hlfir.declare %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = ".tmp.array"} : (!fir.ref>>>>) -> (!fir.ref>>>>, !fir.ref>>>>) // CHECK: %[[VAL_64:.*]] = arith.constant 2 : i32 // CHECK: %[[VAL_65:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> @@ -330,6 +329,7 @@ func.func @test_polymorphic_expr(%arg0: !fir.class> {fir.b // CHECK: %[[VAL_86:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>>>>) -> !fir.ref> // CHECK: %[[VAL_87:.*]] = fir.convert %[[VAL_81]] : (!fir.ref>) -> !fir.ref // CHECK: %[[VAL_88:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_86]], %[[VAL_84]], %[[VAL_85]], %[[VAL_87]], %[[VAL_83]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_62:.*]] = arith.constant true // CHECK: %[[VAL_89:.*]] = fir.load %[[VAL_63]]#0 : !fir.ref>>>> // CHECK: %[[VAL_90:.*]] = arith.constant 1 : index // CHECK: fir.do_loop %[[VAL_91:.*]] = %[[VAL_90]] to %[[VAL_3]] step %[[VAL_90]] unordered { diff --git a/flang/test/Transforms/lower-repack-arrays.fir b/flang/test/Transforms/lower-repack-arrays.fir new file mode 100644 index 0000000000000..7317d8f49f074 --- /dev/null +++ b/flang/test/Transforms/lower-repack-arrays.fir @@ -0,0 +1,1141 @@ +// RUN: fir-opt --lower-repack-arrays %s | FileCheck %s +// Test trivial type array repacking. 
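+// With `heap innermost`, the fir.pack_array below is expected to expand to
+// an is_present guard, an innermost-contiguity check combined with a
+// non-null base address check, a fir.allocmem'ed temporary, a
+// _FortranAShallowCopyDirect copy into it, and a fir.rebox carrying the
+// original lower bounds; fir.unpack_array copies back and frees the
+// temporary.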
+func.func @_QPtest1(%arg0: !fir.box> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost : (!fir.box>) -> !fir.box> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest1Ex"} : (!fir.box>, !fir.dscope) -> !fir.box> + fir.unpack_array %1 to %arg0 heap : !fir.box> + return +} +// CHECK-LABEL: func.func @_QPtest1( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.box>) { +// CHECK: %[[VAL_9:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>) -> i1 +// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.ref>> +// CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref>>) -> i1 +// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>) { +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} +// CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> +// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: fir.result %[[VAL_14]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: %[[VAL_27:.*]] = fir.declare %[[VAL_8]] dummy_scope %[[VAL_6]] {uniq_name = "_QFtest1Ex"} : (!fir.box>, !fir.dscope) -> !fir.box> +// CHECK: %[[VAL_28:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: fir.if %[[VAL_28]] { +// CHECK: %[[VAL_29:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_31:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap>) -> index +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap>) -> index +// CHECK: %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_30]], 
%[[VAL_32]] : index +// CHECK: fir.if %[[VAL_33]] { +// CHECK: %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_34]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_35]], %[[VAL_36]], %[[VAL_37]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_29]] : !fir.heap> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test 'stack whole' repacking. +func.func @_QPtest1_whole(%arg0: !fir.box> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 stack whole : (!fir.box>) -> !fir.box> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest1_wholeEx"} : (!fir.box>, !fir.dscope) -> !fir.box> + fir.unpack_array %1 to %arg0 stack : !fir.box> + return +} +// CHECK-LABEL: func.func @_QPtest1_whole( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.box>) { +// CHECK: %[[VAL_9:.*]] = fir.is_contiguous_box %[[VAL_0]] whole : (!fir.box>) -> i1 +// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.ref>> +// CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref>>) -> i1 +// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>) { +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]] = fir.alloca !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} +// CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>, !fir.shape<2>) -> !fir.ref> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>, !fir.shift<2>) -> !fir.box> +// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: fir.result %[[VAL_14]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: 
%[[VAL_27:.*]] = fir.declare %[[VAL_8]] dummy_scope %[[VAL_6]] {uniq_name = "_QFtest1_wholeEx"} : (!fir.box>, !fir.dscope) -> !fir.box> +// CHECK: %[[VAL_28:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: fir.if %[[VAL_28]] { +// CHECK: %[[VAL_29:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_31:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap>) -> index +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap>) -> index +// CHECK: %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_32]] : index +// CHECK: fir.if %[[VAL_33]] { +// CHECK: %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_34]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_35]], %[[VAL_36]], %[[VAL_37]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test unpacking with no_copy. +func.func @_QPtest1_in(%arg0: !fir.box> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost : (!fir.box>) -> !fir.box> + %2 = fir.declare %1 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest1_inEx"} : (!fir.box>, !fir.dscope) -> !fir.box> + fir.unpack_array %1 to %arg0 heap no_copy : !fir.box> + return +} +// CHECK-LABEL: func.func @_QPtest1_in( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant false +// CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_6:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_7:.*]] = fir.if %[[VAL_6]] -> (!fir.box>) { +// CHECK: %[[VAL_8:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>) -> i1 +// CHECK: %[[VAL_9:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_4]] : i1 +// CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.is_present %[[VAL_10]] : (!fir.ref>>) -> i1 +// CHECK: %[[VAL_12:.*]] = arith.andi %[[VAL_9]], %[[VAL_11]] : i1 +// CHECK: %[[VAL_13:.*]] = fir.if %[[VAL_12]] -> (!fir.box>) { +// CHECK: %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} +// CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_20:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call 
@_FortranAShallowCopyDirect(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_24:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_25:.*]] = fir.rebox %[[VAL_19]](%[[VAL_24]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> +// CHECK: fir.result %[[VAL_25]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: fir.result %[[VAL_13]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: %[[VAL_26:.*]] = fir.declare %[[VAL_7]] dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest1_inEx"} : (!fir.box>, !fir.dscope) -> !fir.box> +// CHECK: %[[VAL_27:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: fir.if %[[VAL_27]] { +// CHECK: %[[VAL_28:.*]] = fir.box_addr %[[VAL_7]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_30:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.heap>) -> index +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap>) -> index +// CHECK: %[[VAL_32:.*]] = arith.cmpi ne, %[[VAL_29]], %[[VAL_31]] : index +// CHECK: fir.if %[[VAL_32]] { +// CHECK: fir.freemem %[[VAL_28]] : !fir.heap> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test packing with no_copy. +func.func @_QPtest1_out(%arg0: !fir.box> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost no_copy : (!fir.box>) -> !fir.box> + %2 = fir.declare %1 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest1_outEx"} : (!fir.box>, !fir.dscope) -> !fir.box> + fir.unpack_array %1 to %arg0 heap : !fir.box> + return +} +// CHECK-LABEL: func.func @_QPtest1_out( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant false +// CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_6:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_7:.*]] = fir.if %[[VAL_6]] -> (!fir.box>) { +// CHECK: %[[VAL_8:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>) -> i1 +// CHECK: %[[VAL_9:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_4]] : i1 +// CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.is_present %[[VAL_10]] : (!fir.ref>>) -> i1 +// CHECK: %[[VAL_12:.*]] = arith.andi %[[VAL_9]], %[[VAL_11]] : i1 +// CHECK: %[[VAL_13:.*]] = fir.if %[[VAL_12]] -> (!fir.box>) { +// CHECK: %[[VAL_14:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_2]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} +// CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_20:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) 
-> !fir.shift<2> +// CHECK: %[[VAL_21:.*]] = fir.rebox %[[VAL_19]](%[[VAL_20]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> +// CHECK: fir.result %[[VAL_21]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: fir.result %[[VAL_13]] : !fir.box> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box> +// CHECK: } +// CHECK: %[[VAL_22:.*]] = fir.declare %[[VAL_7]] dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest1_outEx"} : (!fir.box>, !fir.dscope) -> !fir.box> +// CHECK: %[[VAL_23:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>) -> i1 +// CHECK: fir.if %[[VAL_23]] { +// CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_7]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_26:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>) -> !fir.heap> +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (!fir.heap>) -> index +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (!fir.heap>) -> index +// CHECK: %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_25]], %[[VAL_27]] : index +// CHECK: fir.if %[[VAL_28]] { +// CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_7]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_24]] : !fir.heap> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test character array with dynamic length and heap allocation +func.func @_QPtest2(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.box>> {fir.bindc_name = "x"}) { + %c0_i32 = arith.constant 0 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QFtest2En"} : (!fir.ref, !fir.dscope) -> !fir.ref + %2 = fir.load %1 : !fir.ref + %3 = arith.cmpi sgt, %2, %c0_i32 : i32 + %4 = arith.select %3, %2, %c0_i32 : i32 + %5 = fir.pack_array %arg1 heap innermost typeparams %4 : (!fir.box>>, i32) -> !fir.box>> + %6 = fir.declare %5 typeparams %4 dummy_scope %0 {uniq_name = "_QFtest2Ex"} : (!fir.box>>, i32, !fir.dscope) -> !fir.box>> + fir.unpack_array %5 to %arg1 heap : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest2( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant false +// CHECK: %[[VAL_7:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_9:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest2En"} : (!fir.ref, !fir.dscope) -> !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref +// CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_7]] : i32 +// CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_7]] : i32 +// CHECK: %[[VAL_13:.*]] = fir.is_present %[[VAL_1]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>>) { +// CHECK: %[[VAL_15:.*]] = fir.is_contiguous_box %[[VAL_1]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_16:.*]] = arith.cmpi 
eq, %[[VAL_15]], %[[VAL_6]] : i1 +// CHECK: %[[VAL_17:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_18:.*]] = fir.is_present %[[VAL_17]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_16]], %[[VAL_18]] : i1 +// CHECK: %[[VAL_20:.*]] = fir.if %[[VAL_19]] -> (!fir.box>>) { +// CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_22:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_24:.*]] = fir.allocmem !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked", uniq_name = ""} +// CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.heap>> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.box>>> +// CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_1]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_20]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_1]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_33:.*]] = fir.declare %[[VAL_14]] typeparams %[[VAL_12]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest2Ex"} : (!fir.box>>, i32, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_34:.*]] = fir.is_present %[[VAL_1]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_34]] { +// CHECK: %[[VAL_35:.*]] = fir.box_addr %[[VAL_14]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_37:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_36]], %[[VAL_38]] : index +// CHECK: fir.if %[[VAL_39]] { +// CHECK: %[[VAL_40:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_14]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_40]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_41]], %[[VAL_42]], %[[VAL_43]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_35]] : !fir.heap>> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test character array with dynamic length and stack allocation +func.func @_QPtest2_stack(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.box>> {fir.bindc_name = "x"}) { + %c0_i32 = arith.constant 0 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.declare %arg0 dummy_scope %0 {uniq_name = 
"_QFtest2_stackEn"} : (!fir.ref, !fir.dscope) -> !fir.ref + %2 = fir.load %1 : !fir.ref + %3 = arith.cmpi sgt, %2, %c0_i32 : i32 + %4 = arith.select %3, %2, %c0_i32 : i32 + %5 = fir.pack_array %arg1 stack innermost typeparams %4 : (!fir.box>>, i32) -> !fir.box>> + %6 = fir.declare %5 typeparams %4 dummy_scope %0 {uniq_name = "_QFtest2_stackEx"} : (!fir.box>>, i32, !fir.dscope) -> !fir.box>> + fir.unpack_array %5 to %arg1 stack : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest2_stack( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant false +// CHECK: %[[VAL_7:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_9:.*]] = fir.declare %[[VAL_0]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest2_stackEn"} : (!fir.ref, !fir.dscope) -> !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref +// CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_7]] : i32 +// CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_7]] : i32 +// CHECK: %[[VAL_13:.*]] = fir.is_present %[[VAL_1]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>>) { +// CHECK: %[[VAL_15:.*]] = fir.is_contiguous_box %[[VAL_1]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_15]], %[[VAL_6]] : i1 +// CHECK: %[[VAL_17:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_18:.*]] = fir.is_present %[[VAL_17]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_16]], %[[VAL_18]] : i1 +// CHECK: %[[VAL_20:.*]] = fir.if %[[VAL_19]] -> (!fir.box>>) { +// CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_22:.*]]:3 = fir.box_dims %[[VAL_1]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_24:.*]] = fir.alloca !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked"} +// CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, i32) -> !fir.ref>> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.ref>>, !fir.shape<2>, i32) -> !fir.box>> +// CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_1]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_20]] : 
!fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_1]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_33:.*]] = fir.declare %[[VAL_14]] typeparams %[[VAL_12]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest2_stackEx"} : (!fir.box>>, i32, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_34:.*]] = fir.is_present %[[VAL_1]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_34]] { +// CHECK: %[[VAL_35:.*]] = fir.box_addr %[[VAL_14]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_37:.*]] = fir.box_addr %[[VAL_1]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_35]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_36]], %[[VAL_38]] : index +// CHECK: fir.if %[[VAL_39]] { +// CHECK: %[[VAL_40:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_14]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_40]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_41]], %[[VAL_42]], %[[VAL_43]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test character array with assumed length and heap allocation. +func.func @_QPtest3(%arg0: !fir.box>> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost : (!fir.box>>) -> !fir.box>> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest3Ex"} : (!fir.box>>, !fir.dscope) -> !fir.box>> + fir.unpack_array %1 to %arg0 heap : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest3( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.box>>) { +// CHECK: %[[VAL_9:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>>) { +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>>) -> index +// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} +// CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.heap>>, 
!fir.shape<2>, index) -> !fir.box>>> +// CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_14]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_8]] dummy_scope %[[VAL_6]] {uniq_name = "_QFtest3Ex"} : (!fir.box>>, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_29:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_29]] { +// CHECK: %[[VAL_30:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_32:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index +// CHECK: fir.if %[[VAL_34]] { +// CHECK: %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_8]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_35]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_36]], %[[VAL_37]], %[[VAL_38]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_30]] : !fir.heap>> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test character array with assumed length and stack allocation. 
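+// Same structure as @_QPtest3 above, except that the temporary is created
+// with fir.alloca instead of fir.allocmem (the element size is still taken
+// from fir.box_elesize), and the unpack path copies back without a
+// fir.freemem, since the storage is on the stack.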
+func.func @_QPtest3_stack(%arg0: !fir.box>> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 stack innermost : (!fir.box>>) -> !fir.box>> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest3_stackEx"} : (!fir.box>>, !fir.dscope) -> !fir.box>> + fir.unpack_array %1 to %arg0 stack : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest3_stack( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.box>>) { +// CHECK: %[[VAL_9:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>>) { +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>>) -> index +// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} +// CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.ref>>, !fir.shape<2>, index) -> !fir.box>> +// CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_14]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_8]] dummy_scope %[[VAL_6]] {uniq_name = "_QFtest3_stackEx"} : (!fir.box>>, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_29:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_29]] { +// CHECK: %[[VAL_30:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_32:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.heap>> +// CHECK: 
%[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index +// CHECK: fir.if %[[VAL_34]] { +// CHECK: %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_8]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_35]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_36]], %[[VAL_37]], %[[VAL_38]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test character array with constant length and heap allocation. +func.func @_QPtest4(%arg0: !fir.box>> {fir.bindc_name = "x"}) { + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost : (!fir.box>>) -> !fir.box>> + %2 = fir.declare %1 typeparams %c10 dummy_scope %0 {uniq_name = "_QFtest4Ex"} : (!fir.box>>, index, !fir.dscope) -> !fir.box>> + fir.unpack_array %1 to %arg0 heap : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest4( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_8:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_9:.*]] = fir.if %[[VAL_8]] -> (!fir.box>>) { +// CHECK: %[[VAL_10:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_11:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_13:.*]] = fir.is_present %[[VAL_12]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_14:.*]] = arith.andi %[[VAL_11]], %[[VAL_13]] : i1 +// CHECK: %[[VAL_15:.*]] = fir.if %[[VAL_14]] -> (!fir.box>>) { +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>, %[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked", uniq_name = ""} +// CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> 
+// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_15]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_9]] typeparams %[[VAL_6]] dummy_scope %[[VAL_7]] {uniq_name = "_QFtest4Ex"} : (!fir.box>>, index, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_29:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_29]] { +// CHECK: %[[VAL_30:.*]] = fir.box_addr %[[VAL_9]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_32:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index +// CHECK: fir.if %[[VAL_34]] { +// CHECK: %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_9]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_35]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_36]], %[[VAL_37]], %[[VAL_38]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_30]] : !fir.heap>> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test character array with constant length and stack allocation. +func.func @_QPtest4_stack(%arg0: !fir.box>> {fir.bindc_name = "x"}) { + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 stack innermost : (!fir.box>>) -> !fir.box>> + %2 = fir.declare %1 typeparams %c10 dummy_scope %0 {uniq_name = "_QFtest4_stackEx"} : (!fir.box>>, index, !fir.dscope) -> !fir.box>> + fir.unpack_array %1 to %arg0 stack : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest4_stack( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_8:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_9:.*]] = fir.if %[[VAL_8]] -> (!fir.box>>) { +// CHECK: %[[VAL_10:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_11:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_13:.*]] = fir.is_present %[[VAL_12]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_14:.*]] = arith.andi %[[VAL_11]], %[[VAL_13]] : i1 +// CHECK: %[[VAL_15:.*]] = fir.if %[[VAL_14]] -> (!fir.box>>) { +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>, 
%[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked"} +// CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_15]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_9]] typeparams %[[VAL_6]] dummy_scope %[[VAL_7]] {uniq_name = "_QFtest4_stackEx"} : (!fir.box>>, index, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_29:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_29]] { +// CHECK: %[[VAL_30:.*]] = fir.box_addr %[[VAL_9]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_32:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_34:.*]] = arith.cmpi ne, %[[VAL_31]], %[[VAL_33]] : index +// CHECK: fir.if %[[VAL_34]] { +// CHECK: %[[VAL_35:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_9]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_35]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_36]], %[[VAL_37]], %[[VAL_38]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test derived type array with heap allocation. 
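+// The shallow-copy pattern is unchanged for derived types: the CHECK lines
+// below only expect _FortranAShallowCopyDirect calls, i.e. the repack copy
+// is presumably not meant to invoke defined assignment or to deep-copy
+// allocatable components.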
+func.func @_QPtest5(%arg0: !fir.box>> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost : (!fir.box>>) -> !fir.box>> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest5Ex"} : (!fir.box>>, !fir.dscope) -> !fir.box>> + fir.unpack_array %1 to %arg0 heap : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest5( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.box>>) { +// CHECK: %[[VAL_9:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>>) { +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} +// CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>) -> !fir.heap>> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_14]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_27:.*]] = fir.declare %[[VAL_8]] dummy_scope %[[VAL_6]] {uniq_name = "_QFtest5Ex"} : (!fir.box>>, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_28:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_28]] { +// CHECK: %[[VAL_29:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_31:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_33:.*]] 
= arith.cmpi ne, %[[VAL_30]], %[[VAL_32]] : index +// CHECK: fir.if %[[VAL_33]] { +// CHECK: %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_34]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_35]], %[[VAL_36]], %[[VAL_37]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_29]] : !fir.heap>> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test derived type array with stack allocation. +func.func @_QPtest5_stack(%arg0: !fir.box>> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 stack innermost : (!fir.box>>) -> !fir.box>> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest5_stackEx"} : (!fir.box>>, !fir.dscope) -> !fir.box>> + fir.unpack_array %1 to %arg0 stack : !fir.box>> + return +} +// CHECK-LABEL: func.func @_QPtest5_stack( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant false +// CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_7:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_8:.*]] = fir.if %[[VAL_7]] -> (!fir.box>>) { +// CHECK: %[[VAL_9:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.box>>) -> i1 +// CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_5]] : i1 +// CHECK: %[[VAL_11:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.ref>>> +// CHECK: %[[VAL_12:.*]] = fir.is_present %[[VAL_11]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1 +// CHECK: %[[VAL_14:.*]] = fir.if %[[VAL_13]] -> (!fir.box>>) { +// CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_16:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_3]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]] = fir.alloca !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} +// CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>) -> !fir.ref>> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: fir.result %[[VAL_14]] : !fir.box>> +// 
CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.box>> +// CHECK: } +// CHECK: %[[VAL_27:.*]] = fir.declare %[[VAL_8]] dummy_scope %[[VAL_6]] {uniq_name = "_QFtest5_stackEx"} : (!fir.box>>, !fir.dscope) -> !fir.box>> +// CHECK: %[[VAL_28:.*]] = fir.is_present %[[VAL_0]] : (!fir.box>>) -> i1 +// CHECK: fir.if %[[VAL_28]] { +// CHECK: %[[VAL_29:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_31:.*]] = fir.box_addr %[[VAL_0]] : (!fir.box>>) -> !fir.heap>> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_33:.*]] = arith.cmpi ne, %[[VAL_30]], %[[VAL_32]] : index +// CHECK: fir.if %[[VAL_33]] { +// CHECK: %[[VAL_34:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_8]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_34]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_35]], %[[VAL_36]], %[[VAL_37]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test polymorphic type array with heap allocation. +func.func @_QPtest6(%arg0: !fir.class>> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost : (!fir.class>>) -> !fir.class>> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest6Ex"} : (!fir.class>>, !fir.dscope) -> !fir.class>> + fir.unpack_array %1 to %arg0 heap : !fir.class>> + return +} +// CHECK-LABEL: func.func @_QPtest6( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant false +// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.class>>> +// CHECK: %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_9:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>>) -> i1 +// CHECK: %[[VAL_10:.*]] = fir.if %[[VAL_9]] -> (!fir.class>>) { +// CHECK: %[[VAL_11:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.class>>) -> i1 +// CHECK: %[[VAL_12:.*]] = arith.cmpi eq, %[[VAL_11]], %[[VAL_6]] : i1 +// CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>>) -> !fir.ref>>> +// CHECK: %[[VAL_14:.*]] = fir.is_present %[[VAL_13]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_12]], %[[VAL_14]] : i1 +// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (!fir.class>>) { +// CHECK: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.class>>, index) -> (index, index, index) +// CHECK: %[[VAL_18:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class>>, index) -> (index, index, index) +// CHECK: %[[VAL_19:.*]] = fir.zero_bits !fir.heap>> +// CHECK: %[[VAL_20:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_19]](%[[VAL_20]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.class>>> +// CHECK: fir.store %[[VAL_21]] to %[[VAL_7]] : !fir.ref>>>> +// CHECK: %[[VAL_22:.*]] = fir.declare %[[VAL_7]] {fortran_attrs = #fir.var_attrs, uniq_name = ".repacked"} : (!fir.ref>>>>) -> !fir.ref>>>> +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_7]] : 
(!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.class>>) -> !fir.box +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_23]], %[[VAL_24]], %[[VAL_3]]) : (!fir.ref>, !fir.box, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_5]] : (index) -> i32 +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_17]]#1 : (index) -> i64 +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_25]], %[[VAL_26]], %[[VAL_27]], %[[VAL_28]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_18]]#1 : (index) -> i64 +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_29]], %[[VAL_30]], %[[VAL_31]], %[[VAL_32]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_33:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_34:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref>) -> !fir.ref +// CHECK: %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref>>>> +// CHECK: %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_0]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_39]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_40]], %[[VAL_41]], %[[VAL_42]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_43:.*]] = fir.shift %[[VAL_17]]#0, %[[VAL_18]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_44:.*]] = fir.rebox %[[VAL_38]](%[[VAL_43]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> +// CHECK: fir.result %[[VAL_44]] : !fir.class>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class>> +// CHECK: } +// CHECK: fir.result %[[VAL_16]] : !fir.class>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class>> +// CHECK: } +// CHECK: %[[VAL_45:.*]] = fir.declare %[[VAL_10]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest6Ex"} : (!fir.class>>, !fir.dscope) -> !fir.class>> +// CHECK: %[[VAL_46:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>>) -> i1 +// CHECK: fir.if %[[VAL_46]] { +// CHECK: %[[VAL_47:.*]] = fir.box_addr %[[VAL_10]] : (!fir.class>>) -> !fir.heap>> +// CHECK: %[[VAL_49:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>>) -> !fir.heap>> +// CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_47]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_51:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_50]] : index +// CHECK: fir.if %[[VAL_51]] { +// CHECK: %[[VAL_52:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_53:.*]] = fir.convert %[[VAL_0]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_10]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_52]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_53]], 
%[[VAL_54]], %[[VAL_55]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_47]] : !fir.heap>> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test polymorphic type array with requested stack allocation. +// The actual allocation is done in heap memory. +func.func @_QPtest6_stack(%arg0: !fir.class>> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 stack innermost : (!fir.class>>) -> !fir.class>> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest6_stackEx"} : (!fir.class>>, !fir.dscope) -> !fir.class>> + fir.unpack_array %1 to %arg0 stack : !fir.class>> + return +} +// CHECK-LABEL: func.func @_QPtest6_stack( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class>> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant false +// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.class>>> +// CHECK: %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_9:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>>) -> i1 +// CHECK: %[[VAL_10:.*]] = fir.if %[[VAL_9]] -> (!fir.class>>) { +// CHECK: %[[VAL_11:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.class>>) -> i1 +// CHECK: %[[VAL_12:.*]] = arith.cmpi eq, %[[VAL_11]], %[[VAL_6]] : i1 +// CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>>) -> !fir.ref>>> +// CHECK: %[[VAL_14:.*]] = fir.is_present %[[VAL_13]] : (!fir.ref>>>) -> i1 +// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_12]], %[[VAL_14]] : i1 +// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (!fir.class>>) { +// CHECK: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.class>>, index) -> (index, index, index) +// CHECK: %[[VAL_18:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class>>, index) -> (index, index, index) +// CHECK: %[[VAL_19:.*]] = fir.zero_bits !fir.heap>> +// CHECK: %[[VAL_20:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_19]](%[[VAL_20]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.class>>> +// CHECK: fir.store %[[VAL_21]] to %[[VAL_7]] : !fir.ref>>>> +// CHECK: %[[VAL_22:.*]] = fir.declare %[[VAL_7]] {fortran_attrs = #fir.var_attrs, uniq_name = ".repacked"} : (!fir.ref>>>>) -> !fir.ref>>>> +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.class>>) -> !fir.box +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_23]], %[[VAL_24]], %[[VAL_3]]) : (!fir.ref>, !fir.box, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_5]] : (index) -> i32 +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_17]]#1 : (index) -> i64 +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_25]], %[[VAL_26]], %[[VAL_27]], %[[VAL_28]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_18]]#1 : (index) -> i64 +// CHECK: fir.call 
@_FortranAAllocatableSetBounds(%[[VAL_29]], %[[VAL_30]], %[[VAL_31]], %[[VAL_32]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_33:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_34:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref>) -> !fir.ref +// CHECK: %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref>>>> +// CHECK: %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_0]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_39]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_40]], %[[VAL_41]], %[[VAL_42]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_43:.*]] = fir.shift %[[VAL_17]]#0, %[[VAL_18]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_44:.*]] = fir.rebox %[[VAL_38]](%[[VAL_43]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> +// CHECK: fir.result %[[VAL_44]] : !fir.class>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class>> +// CHECK: } +// CHECK: fir.result %[[VAL_16]] : !fir.class>> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class>> +// CHECK: } +// CHECK: %[[VAL_45:.*]] = fir.declare %[[VAL_10]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest6_stackEx"} : (!fir.class>>, !fir.dscope) -> !fir.class>> +// CHECK: %[[VAL_46:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>>) -> i1 +// CHECK: fir.if %[[VAL_46]] { +// CHECK: %[[VAL_47:.*]] = fir.box_addr %[[VAL_10]] : (!fir.class>>) -> !fir.heap>> +// CHECK: %[[VAL_49:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>>) -> !fir.heap>> +// CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_47]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (!fir.heap>>) -> index +// CHECK: %[[VAL_51:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_50]] : index +// CHECK: fir.if %[[VAL_51]] { +// CHECK: %[[VAL_52:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_53:.*]] = fir.convert %[[VAL_0]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_10]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_52]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_53]], %[[VAL_54]], %[[VAL_55]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_47]] : !fir.heap>> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test unlimited polymorphic type array with heap allocation. 
+func.func @_QPtest7(%arg0: !fir.class> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 heap innermost : (!fir.class>) -> !fir.class> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest7Ex"} : (!fir.class>, !fir.dscope) -> !fir.class> + fir.unpack_array %1 to %arg0 heap : !fir.class> + return +} +// CHECK-LABEL: func.func @_QPtest7( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant false +// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.class>> +// CHECK: %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_9:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>) -> i1 +// CHECK: %[[VAL_10:.*]] = fir.if %[[VAL_9]] -> (!fir.class>) { +// CHECK: %[[VAL_11:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.class>) -> i1 +// CHECK: %[[VAL_12:.*]] = arith.cmpi eq, %[[VAL_11]], %[[VAL_6]] : i1 +// CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>) -> !fir.ref>> +// CHECK: %[[VAL_14:.*]] = fir.is_present %[[VAL_13]] : (!fir.ref>>) -> i1 +// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_12]], %[[VAL_14]] : i1 +// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (!fir.class>) { +// CHECK: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.class>, index) -> (index, index, index) +// CHECK: %[[VAL_18:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class>, index) -> (index, index, index) +// CHECK: %[[VAL_19:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_20:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_19]](%[[VAL_20]]) : (!fir.heap>, !fir.shape<2>) -> !fir.class>> +// CHECK: fir.store %[[VAL_21]] to %[[VAL_7]] : !fir.ref>>> +// CHECK: %[[VAL_22:.*]] = fir.declare %[[VAL_7]] {fortran_attrs = #fir.var_attrs, uniq_name = ".repacked"} : (!fir.ref>>>) -> !fir.ref>>> +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.class>) -> !fir.box +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_23]], %[[VAL_24]], %[[VAL_3]]) : (!fir.ref>, !fir.box, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_5]] : (index) -> i32 +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_17]]#1 : (index) -> i64 +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_25]], %[[VAL_26]], %[[VAL_27]], %[[VAL_28]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_18]]#1 : (index) -> i64 +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_29]], %[[VAL_30]], %[[VAL_31]], %[[VAL_32]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_33:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_34:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : (!fir.ref>) -> 
!fir.ref +// CHECK: %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref>>> +// CHECK: %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_0]] : (!fir.class>) -> !fir.box +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_39]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_40]], %[[VAL_41]], %[[VAL_42]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_43:.*]] = fir.shift %[[VAL_17]]#0, %[[VAL_18]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_44:.*]] = fir.rebox %[[VAL_38]](%[[VAL_43]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> +// CHECK: fir.result %[[VAL_44]] : !fir.class> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class> +// CHECK: } +// CHECK: fir.result %[[VAL_16]] : !fir.class> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class> +// CHECK: } +// CHECK: %[[VAL_45:.*]] = fir.declare %[[VAL_10]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest7Ex"} : (!fir.class>, !fir.dscope) -> !fir.class> +// CHECK: %[[VAL_46:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>) -> i1 +// CHECK: fir.if %[[VAL_46]] { +// CHECK: %[[VAL_47:.*]] = fir.box_addr %[[VAL_10]] : (!fir.class>) -> !fir.heap> +// CHECK: %[[VAL_49:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>) -> !fir.heap> +// CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_47]] : (!fir.heap>) -> index +// CHECK: %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (!fir.heap>) -> index +// CHECK: %[[VAL_51:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_50]] : index +// CHECK: fir.if %[[VAL_51]] { +// CHECK: %[[VAL_52:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_53:.*]] = fir.convert %[[VAL_0]] : (!fir.class>) -> !fir.box +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_10]] : (!fir.class>) -> !fir.box +// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_52]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_53]], %[[VAL_54]], %[[VAL_55]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_47]] : !fir.heap> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } + +// Test unlimited polymorphic type array with requested stack allocation. +// The actual allocation is done in heap memory. 
+func.func @_QPtest7_stack(%arg0: !fir.class> {fir.bindc_name = "x"}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.pack_array %arg0 stack innermost : (!fir.class>) -> !fir.class> + %2 = fir.declare %1 dummy_scope %0 {uniq_name = "_QFtest7Ex"} : (!fir.class>, !fir.dscope) -> !fir.class> + fir.unpack_array %1 to %arg0 stack : !fir.class> + return +} +// CHECK-LABEL: func.func @_QPtest7_stack( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class> {fir.bindc_name = "x"}) { +// CHECK: %[[VAL_1:.*]] = arith.constant +// CHECK: %[[VAL_2:.*]] = arith.constant +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant false +// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.class>> +// CHECK: %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_9:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>) -> i1 +// CHECK: %[[VAL_10:.*]] = fir.if %[[VAL_9]] -> (!fir.class>) { +// CHECK: %[[VAL_11:.*]] = fir.is_contiguous_box %[[VAL_0]] innermost : (!fir.class>) -> i1 +// CHECK: %[[VAL_12:.*]] = arith.cmpi eq, %[[VAL_11]], %[[VAL_6]] : i1 +// CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>) -> !fir.ref>> +// CHECK: %[[VAL_14:.*]] = fir.is_present %[[VAL_13]] : (!fir.ref>>) -> i1 +// CHECK: %[[VAL_15:.*]] = arith.andi %[[VAL_12]], %[[VAL_14]] : i1 +// CHECK: %[[VAL_16:.*]] = fir.if %[[VAL_15]] -> (!fir.class>) { +// CHECK: %[[VAL_17:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_5]] : (!fir.class>, index) -> (index, index, index) +// CHECK: %[[VAL_18:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_4]] : (!fir.class>, index) -> (index, index, index) +// CHECK: %[[VAL_19:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_20:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_19]](%[[VAL_20]]) : (!fir.heap>, !fir.shape<2>) -> !fir.class>> +// CHECK: fir.store %[[VAL_21]] to %[[VAL_7]] : !fir.ref>>> +// CHECK: %[[VAL_22:.*]] = fir.declare %[[VAL_7]] {fortran_attrs = #fir.var_attrs, uniq_name = ".repacked"} : (!fir.ref>>>) -> !fir.ref>>> +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.class>) -> !fir.box +// CHECK: fir.call @_FortranAAllocatableApplyMold(%[[VAL_23]], %[[VAL_24]], %[[VAL_3]]) : (!fir.ref>, !fir.box, i32) -> () +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_5]] : (index) -> i32 +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_17]]#1 : (index) -> i64 +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_25]], %[[VAL_26]], %[[VAL_27]], %[[VAL_28]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_18]]#1 : (index) -> i64 +// CHECK: fir.call @_FortranAAllocatableSetBounds(%[[VAL_29]], %[[VAL_30]], %[[VAL_31]], %[[VAL_32]]) : (!fir.ref>, i32, i64, i64) -> () +// CHECK: %[[VAL_33:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_34:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_33]] : 
(!fir.ref>) -> !fir.ref +// CHECK: %[[VAL_37:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_35]], %[[VAL_6]], %[[VAL_34]], %[[VAL_36]], %[[VAL_2]]) : (!fir.ref>, i1, !fir.box, !fir.ref, i32) -> i32 +// CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_22]] : !fir.ref>>> +// CHECK: %[[VAL_39:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_0]] : (!fir.class>) -> !fir.box +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_39]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_40]], %[[VAL_41]], %[[VAL_42]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: %[[VAL_43:.*]] = fir.shift %[[VAL_17]]#0, %[[VAL_18]]#0 : (index, index) -> !fir.shift<2> +// CHECK: %[[VAL_44:.*]] = fir.rebox %[[VAL_38]](%[[VAL_43]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> +// CHECK: fir.result %[[VAL_44]] : !fir.class> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class> +// CHECK: } +// CHECK: fir.result %[[VAL_16]] : !fir.class> +// CHECK: } else { +// CHECK: fir.result %[[VAL_0]] : !fir.class> +// CHECK: } +// CHECK: %[[VAL_45:.*]] = fir.declare %[[VAL_10]] dummy_scope %[[VAL_8]] {uniq_name = "_QFtest7Ex"} : (!fir.class>, !fir.dscope) -> !fir.class> +// CHECK: %[[VAL_46:.*]] = fir.is_present %[[VAL_0]] : (!fir.class>) -> i1 +// CHECK: fir.if %[[VAL_46]] { +// CHECK: %[[VAL_47:.*]] = fir.box_addr %[[VAL_10]] : (!fir.class>) -> !fir.heap> +// CHECK: %[[VAL_49:.*]] = fir.box_addr %[[VAL_0]] : (!fir.class>) -> !fir.heap> +// CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_47]] : (!fir.heap>) -> index +// CHECK: %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (!fir.heap>) -> index +// CHECK: %[[VAL_51:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_50]] : index +// CHECK: fir.if %[[VAL_51]] { +// CHECK: %[[VAL_52:.*]] = fir.address_of(@{{_QQcl.*}} +// CHECK: %[[VAL_53:.*]] = fir.convert %[[VAL_0]] : (!fir.class>) -> !fir.box +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_10]] : (!fir.class>) -> !fir.box +// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_52]] : (!fir.ref>) -> !fir.ref +// CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_53]], %[[VAL_54]], %[[VAL_55]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () +// CHECK: fir.freemem %[[VAL_47]] : !fir.heap> +// CHECK: } +// CHECK: } +// CHECK: return +// CHECK: } From 76e7bddf9d36074e0d67c21c5d67c2171abd70c4 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 31 Mar 2025 20:46:04 +0200 Subject: [PATCH 0137/1029] [libc++][NFC] Replace __attribute__((__const__)) with [[__gnu__::__const__]] --- libcxx/include/__system_error/error_category.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/include/__system_error/error_category.h b/libcxx/include/__system_error/error_category.h index bfe7bc24a5d3d..7233e22110447 100644 --- a/libcxx/include/__system_error/error_category.h +++ b/libcxx/include/__system_error/error_category.h @@ -67,8 +67,8 @@ class _LIBCPP_HIDDEN __do_message : public error_category { string message(int __ev) const override; }; -__attribute__((__const__)) _LIBCPP_EXPORTED_FROM_ABI const error_category& generic_category() _NOEXCEPT; -__attribute__((__const__)) _LIBCPP_EXPORTED_FROM_ABI const error_category& system_category() _NOEXCEPT; +[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI const error_category& generic_category() _NOEXCEPT; +[[__gnu__::__const__]] _LIBCPP_EXPORTED_FROM_ABI const error_category& system_category() _NOEXCEPT; _LIBCPP_END_NAMESPACE_STD 
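A note on the attribute in the patch above: [[__gnu__::__const__]] is the standard C++ attribute spelling of GCC's __attribute__((__const__)), and GCC and Clang treat the two identically. The attribute promises that the function's return value depends only on its arguments (here, on nothing at all), so the compiler is free to fold repeated calls into a single call. The double-underscore spellings (__gnu__, __const__) are reserved identifiers that user code cannot redefine as macros, which matters for a header shipped into arbitrary translation units. Below is a minimal sketch of the semantics, assuming GCC or Clang; my_category is a hypothetical stand-in for the declarations touched by the patch, not libc++ code.

#include <system_error>

// Hypothetical analogue of generic_category(): every call returns a
// reference to the same object, which is what makes the const attribute a
// truthful promise for this function.
[[__gnu__::__const__]] const std::error_category &my_category() noexcept {
  return std::generic_category();
}

bool compare_categories() {
  // With the attribute visible, GCC and Clang may evaluate my_category()
  // once and reuse the result for both operands of the comparison; the
  // function returns true either way.
  return &my_category() == &my_category();
}
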
From 0bdc9e6d080009dc87b9458181c5a41cc13f26ae Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 31 Mar 2025 11:46:23 -0700 Subject: [PATCH 0138/1029] [lldb-dap] Replace GetBreakpointLabel with kDAPBreakpointLabel constant (NFC) (#133746) Replace GetBreakpointLabel with kDAPBreakpointLabel constant to avoid an unnecessary function call. --- lldb/tools/lldb-dap/Breakpoint.cpp | 4 +--- lldb/tools/lldb-dap/BreakpointBase.cpp | 16 ---------------- lldb/tools/lldb-dap/BreakpointBase.h | 16 +++++++++++++++- lldb/tools/lldb-dap/ExceptionBreakpoint.cpp | 4 +--- .../Handler/InitializeRequestHandler.cpp | 8 ++++---- 5 files changed, 21 insertions(+), 27 deletions(-) diff --git a/lldb/tools/lldb-dap/Breakpoint.cpp b/lldb/tools/lldb-dap/Breakpoint.cpp index b3bfa61595a82..eba534dcc51c7 100644 --- a/lldb/tools/lldb-dap/Breakpoint.cpp +++ b/lldb/tools/lldb-dap/Breakpoint.cpp @@ -72,9 +72,7 @@ void Breakpoint::CreateJsonObject(llvm::json::Object &object) { bool Breakpoint::MatchesName(const char *name) { return bp.MatchesName(name); } void Breakpoint::SetBreakpoint() { - // See comments in BreakpointBase::GetBreakpointLabel() for details of why - // we add a label to our breakpoints. - bp.AddName(GetBreakpointLabel()); + bp.AddName(kDAPBreakpointLabel); if (!condition.empty()) SetCondition(); if (!hitCondition.empty()) diff --git a/lldb/tools/lldb-dap/BreakpointBase.cpp b/lldb/tools/lldb-dap/BreakpointBase.cpp index 7979bac098766..15fecaf691199 100644 --- a/lldb/tools/lldb-dap/BreakpointBase.cpp +++ b/lldb/tools/lldb-dap/BreakpointBase.cpp @@ -26,19 +26,3 @@ void BreakpointBase::UpdateBreakpoint(const BreakpointBase &request_bp) { SetHitCondition(); } } - -const char *BreakpointBase::GetBreakpointLabel() { - // Breakpoints in LLDB can have names added to them which are kind of like - // labels or categories. All breakpoints that are set through the IDE UI get - // sent through the various DAP set*Breakpoint packets, and these - // breakpoints will be labeled with this name so if breakpoint update events - // come in for breakpoints that the IDE doesn't know about, like if a - // breakpoint is set manually using the debugger console, we won't report any - // updates on them and confused the IDE. This function gets called by all of - // the breakpoint classes after they set breakpoints to mark a breakpoint as - // a UI breakpoint. We can later check a lldb::SBBreakpoint object that comes - // in via LLDB breakpoint changed events and check the breakpoint by calling - // "bool lldb::SBBreakpoint::MatchesName(const char *)" to check if a - // breakpoint in one of the UI breakpoints that we should report changes for. - return "dap"; -} diff --git a/lldb/tools/lldb-dap/BreakpointBase.h b/lldb/tools/lldb-dap/BreakpointBase.h index 3c248dd1736d0..0b036dd1985b3 100644 --- a/lldb/tools/lldb-dap/BreakpointBase.h +++ b/lldb/tools/lldb-dap/BreakpointBase.h @@ -10,6 +10,7 @@ #define LLDB_TOOLS_LLDB_DAP_BREAKPOINTBASE_H #include "DAPForward.h" +#include "llvm/ADT/StringRef.h" #include namespace lldb_dap { @@ -34,7 +35,20 @@ struct BreakpointBase { void UpdateBreakpoint(const BreakpointBase &request_bp); - static const char *GetBreakpointLabel(); + /// Breakpoints in LLDB can have names added to them which are kind of like + /// labels or categories. 
All breakpoints that are set through DAP get sent
+  /// through the various DAP set*Breakpoint packets, and these breakpoints will
+  /// be labeled with this name so if breakpoint update events come in for
+  /// breakpoints that the client doesn't know about, like if a breakpoint is
+  /// set manually using the debugger console, we won't report any updates on
+  /// them and confuse the client. This label gets added by all of the
+  /// breakpoint classes after they set breakpoints to mark a breakpoint as a
+  /// DAP breakpoint. We can later check an lldb::SBBreakpoint object that comes
+  /// in via LLDB breakpoint changed events and check the breakpoint by calling
+  /// "bool lldb::SBBreakpoint::MatchesName(const char *)" to check whether a
+  /// breakpoint is one of the DAP breakpoints that we should report changes
+  /// for.
+  static constexpr const char *kDAPBreakpointLabel = "dap";
 };

 } // namespace lldb_dap
diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
index 0fb865c19e574..15aee55ad923e 100644
--- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
+++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp
@@ -20,9 +20,7 @@ void ExceptionBreakpoint::SetBreakpoint() {
   bool throw_value = filter.find("_throw") != std::string::npos;
   bp = dap.target.BreakpointCreateForException(language, catch_value,
                                                throw_value);
-  // See comments in BreakpointBase::GetBreakpointLabel() for details of why
-  // we add a label to our breakpoints.
-  bp.AddName(BreakpointBase::GetBreakpointLabel());
+  bp.AddName(BreakpointBase::kDAPBreakpointLabel);
 }

 void ExceptionBreakpoint::ClearBreakpoint() {
diff --git a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
index e7c565aad13a3..b4250cd6becb3 100644
--- a/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
+++ b/lldb/tools/lldb-dap/Handler/InitializeRequestHandler.cpp
@@ -197,13 +197,13 @@ static void EventThreadFunction(DAP &dap) {
           auto bp = Breakpoint(
               dap, lldb::SBBreakpoint::GetBreakpointFromEvent(event));
           // If the breakpoint was set through DAP, it will have the
-          // BreakpointBase::GetBreakpointLabel() label. Regardless
-          // of whether locations were added, removed, or resolved, the
-          // breakpoint isn't going away and the reason is always "changed".
+          // BreakpointBase::kDAPBreakpointLabel label. Regardless of whether
+          // locations were added, removed, or resolved, the breakpoint isn't
+          // going away and the reason is always "changed".
           if ((event_type & lldb::eBreakpointEventTypeLocationsAdded ||
                event_type & lldb::eBreakpointEventTypeLocationsRemoved ||
                event_type & lldb::eBreakpointEventTypeLocationsResolved) &&
-              bp.MatchesName(BreakpointBase::GetBreakpointLabel())) {
+              bp.MatchesName(BreakpointBase::kDAPBreakpointLabel)) {
             // As the DAP client already knows the path of this breakpoint, we
             // don't need to send it back as part of the "changed" event. This
             // avoids sending paths that should be source mapped. Note that

From 2653eb52d1c700740a96ae61484bb1899e783c82 Mon Sep 17 00:00:00 2001
From: David Green
Date: Mon, 31 Mar 2025 19:49:39 +0100
Subject: [PATCH 0139/1029] [AArch64] Add -cost-kind=all coverage for sve cost tests.
NFC --- .../CostModel/AArch64/sve-arith-fp.ll | 116 +- .../Analysis/CostModel/AArch64/sve-arith.ll | 46 +- .../Analysis/CostModel/AArch64/sve-cast.ll | 3630 ++++++++--------- .../Analysis/CostModel/AArch64/sve-cmpsel.ll | 93 +- .../Analysis/CostModel/AArch64/sve-div.ll | 594 +-- .../Analysis/CostModel/AArch64/sve-ext.ll | 44 +- .../Analysis/CostModel/AArch64/sve-fpext.ll | 25 +- .../Analysis/CostModel/AArch64/sve-fptoi.ll | 148 +- .../Analysis/CostModel/AArch64/sve-fptrunc.ll | 22 +- .../CostModel/AArch64/sve-insert-extract.ll | 246 +- .../Analysis/CostModel/AArch64/sve-itofp.ll | 196 +- .../Analysis/CostModel/AArch64/sve-ldst.ll | 140 +- .../Analysis/CostModel/AArch64/sve-math.ll | 2 +- .../Analysis/CostModel/AArch64/sve-min-max.ll | 2 +- .../Analysis/CostModel/AArch64/sve-rem.ll | 610 +-- .../AArch64/sve-shuffle-broadcast.ll | 55 +- .../Analysis/CostModel/AArch64/sve-trunc.ll | 68 +- .../Analysis/CostModel/AArch64/sve-vscale.ll | 10 +- 18 files changed, 3036 insertions(+), 3011 deletions(-) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll index 770d3087b0752..dc95eacca28d4 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll @@ -1,20 +1,20 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -enable-no-nans-fp-math -passes="print" 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s +; RUN: opt < %s -enable-no-nans-fp-math -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+fullfp16 -mattr=+sve | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @fadd() { ; CHECK-LABEL: 'fadd' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fadd undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fadd undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 
CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fadd undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V4F16 = fadd undef, undef %V8F16 = fadd undef, undef @@ -33,16 +33,16 @@ define void @fadd() { define void @fsub() { ; CHECK-LABEL: 'fsub' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fsub undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V1F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fsub undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fsub undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V4F16 = fsub undef, undef %V8F16 = fsub undef, undef @@ -61,16 +61,16 @@ define void @fsub() { define void @fneg() { ; CHECK-LABEL: 'fneg' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16F16 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F32 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F64 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F64 = fneg undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 
for: %V2F16 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fneg undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2F16 = fneg undef %V4F16 = fneg undef @@ -89,15 +89,15 @@ define void @fneg() { define void @fmul() { ; CHECK-LABEL: 'fmul' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fmul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fmul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V4F16 = fmul undef, undef %V8F16 = fmul undef, undef @@ -115,15 +115,15 @@ define void @fmul() { define void @fdiv() { ; CHECK-LABEL: 'fdiv' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16F16 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = fdiv undef, undef 
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8F32 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F64 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F64 = fdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = fdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V4F16 = fdiv undef, undef %V8F16 = fdiv undef, undef @@ -141,15 +141,15 @@ define void @fdiv() { define void @frem() { ; CHECK-LABEL: 'frem' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F16 = frem undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F16 = frem undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V16F16 = frem undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F32 = frem undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F32 = frem undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V8F32 = frem undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V2F64 = frem undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %V4F64 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V16F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F32 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F64 = frem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V4F16 = frem undef, undef %V8F16 = frem undef, undef diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith.ll index 46450e68f40e2..75af1df08594f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-arith.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -aarch64-sve-vector-bits-min=128 | FileCheck %s -D#VBITS=128 +; RUN: opt < %s -passes="print" -cost-kind=all 2>&1 -disable-output -aarch64-sve-vector-bits-min=128 | FileCheck %s -D#VBITS=128 target triple = "aarch64-unknown-linux-gnu" define void @scalable_sdiv() #0 { ; CHECK-LABEL: 'scalable_sdiv' -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %sdiv_nxv16i8 = sdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %sdiv_nxv8i16 = sdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sdiv_nxv4i32 = sdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sdiv_nxv2i64 = sdiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %sdiv_nxv16i8 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %sdiv_nxv8i16 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %sdiv_nxv4i32 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %sdiv_nxv2i64 = sdiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: %sdiv_nxv16i8 = sdiv undef, undef @@ -22,11 +22,11 @@ entry: define void @scalable_udiv() #0 { ; CHECK-LABEL: 'scalable_udiv' -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %udiv_nxv16i8 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %udiv_nxv8i16 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %udiv_nxv4i32 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %udiv_nxv2i64 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %udiv_nxv16i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %udiv_nxv8i16 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %udiv_nxv4i32 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %udiv_nxv2i64 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: %udiv_nxv16i8 = udiv undef, undef @@ -39,12 +39,12 @@ entry: define void @scalable_mul() #0 { ; CHECK-LABEL: 'scalable_mul' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv16i8 = mul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv8i16 = mul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv4i32 = mul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mul_nxv2i64 = mul undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %mul_nxv1i64 = mul undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 1 for: %mul_nxv16i8 = mul undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %mul_nxv8i16 = mul undef, undef +; 
CHECK-NEXT: Cost Model: Found costs of 1 for: %mul_nxv4i32 = mul undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %mul_nxv2i64 = mul undef, undef +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %mul_nxv1i64 = mul undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: %mul_nxv16i8 = mul undef, undef @@ -58,12 +58,12 @@ entry: define void @scalable_add() #0 { ; CHECK-LABEL: 'scalable_add' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv16i8 = add undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv8i16 = add undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv4i32 = add undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %add_nxv2i64 = add undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %add_nxv1i64 = add undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add_nxv16i8 = add undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add_nxv8i16 = add undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add_nxv4i32 = add undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %add_nxv2i64 = add undef, undef +; CHECK-NEXT: Cost Model: Found costs of Invalid for: %add_nxv1i64 = add undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; entry: %add_nxv16i8 = add undef, undef diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll b/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll index bab8e53999baa..cfb130eb5ec32 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-cast.ll @@ -1,295 +1,295 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve %s | FileCheck --check-prefixes=CHECK,CHECK-SVE %s -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -force-streaming-compatible %s | FileCheck --check-prefixes=CHECK,SVE128-NO-NEON %s -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck --check-prefixes=CHECK,FIXED-MIN-256 %s -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=2048 %s | FileCheck --check-prefixes=CHECK,FIXED-MIN-2048 %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve %s | FileCheck --check-prefixes=CHECK,CHECK-SVE %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -force-streaming-compatible %s | FileCheck --check-prefixes=CHECK,SVE128-NO-NEON %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck --check-prefixes=CHECK,FIXED-MIN-256 %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=2048 %s | FileCheck --check-prefixes=CHECK,FIXED-MIN-2048 %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @ext() { ; CHECK-SVE-LABEL: 'ext' -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i1 undef to i8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: %r1 = zext i1 undef to i8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i1 undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i1 undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i1 undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i1 undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i1 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i1 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = sext i8 undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = zext i8 undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = sext i8 undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = zext i8 undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r13 = sext i8 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r14 = zext i8 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r17 = sext i16 undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r18 = zext i16 undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r19 = sext i16 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r20 = zext i16 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r24 = sext i32 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r25 = zext i32 undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i16 = sext <2 x i8> undef to <2 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i16 = zext <2 x i8> undef to <2 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i32 = sext <2 x i8> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i32 = zext <2 x i8> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = sext <2 x i8> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i64 = zext <2 x i8> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i32 = sext <2 x i16> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i32 = zext <2 x i16> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = sext <2 x i16> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i64 = zext <2 x i16> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = sext <2 x i32> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i32i64 = zext <2 x i32> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i16 = sext <4 x i8> undef to <4 x i16> -; CHECK-SVE-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %z4i8i16 = zext <4 x i8> undef to <4 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = sext <4 x i8> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i32 = zext <4 x i8> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s4i8i64 = sext <4 x i8> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %z4i8i64 = zext <4 x i8> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = sext <4 x i16> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i16i32 = zext <4 x i16> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s4i16i64 = sext <4 x i16> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %z4i16i64 = zext <4 x i16> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4i32i64 = sext <4 x i32> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z4i32i64 = zext <4 x i32> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = sext <8 x i8> undef to <8 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i8i16 = zext <8 x i8> undef to <8 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s8i8i32 = sext <8 x i8> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %z8i8i32 = zext <8 x i8> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s8i8i64 = sext <8 x i8> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %z8i8i64 = zext <8 x i8> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i16i32 = sext <8 x i16> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z8i16i32 = zext <8 x i16> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s8i16i64 = sext <8 x i16> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %z8i16i64 = zext <8 x i16> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8i32i64 = sext <8 x i32> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z8i32i64 = zext <8 x i32> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s16i8i16 = sext <16 x i8> undef to <16 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z16i8i16 = zext <16 x i8> undef to <16 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s16i8i32 = sext <16 x i8> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %z16i8i32 = zext <16 x i8> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %s16i8i64 = sext <16 x i8> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %z16i8i64 = zext <16 x i8> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %s16i16i32 = sext <16 x i16> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z16i16i32 = zext <16 x i16> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %s16i16i64 = sext <16 x i16> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %z16i16i64 = zext <16 x i16> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s16i32i64 = sext <16 x i32> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %z16i32i64 = zext <16 x i32> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs 
of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE128-NO-NEON-LABEL: 'ext'
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i1 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i1 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i1 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i1 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i1 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i1 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i1 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i1 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = sext i8 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = zext i8 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = sext i8 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = zext i8 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r13 = sext i8 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r14 = zext i8 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r17 = sext i16 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r18 = zext i16 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r19 = sext i16 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r20 = zext i16 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r24 = sext i32 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r25 = zext i32 undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; FIXED-MIN-256-LABEL: 'ext'
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i1 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i1 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i1 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i1 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i1 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i1 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i1 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i1 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = sext i8 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = zext i8 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = sext i8 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = zext i8 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r13 = sext i8 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r14 = zext i8 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r17 = sext i16 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r18 = zext i16 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r19 = sext i16 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r20 = zext i16 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r24 = sext i32 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r25 = zext i32 undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; FIXED-MIN-2048-LABEL: 'ext'
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i1 undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i1 undef to i8
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i1 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i1 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i1 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i1 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i1 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i1 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = sext i8 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = zext i8 undef to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = sext i8 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = zext i8 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r13 = sext i8 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r14 = zext i8 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r17 = sext i16 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r18 = zext i16 undef to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r19 = sext i16 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r20 = zext i16 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r24 = sext i32 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r25 = zext i32 undef to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i1 undef to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i1 undef to i8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i1 undef to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i1 undef to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i1 undef to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i1 undef to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i1 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i1 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r9 = sext i8 undef to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i8 undef to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r11 = sext i8 undef to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r12 = zext i8 undef to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r13 = sext i8 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r14 = zext i8 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r17 = sext i16 undef to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r18 = zext i16 undef to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r19 = sext i16 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r20 = zext i16 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r24 = sext i32 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i32 undef to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i16 = sext <2 x i8> undef to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i16 = zext <2 x i8> undef to <2 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i32 = sext <2 x i8> undef to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i32 = zext <2 x i8> undef to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = sext <2 x i8> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i8i64 = zext <2 x i8> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i32 = sext <2 x i16> undef to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i16i32 = zext <2 x i16> undef to <2 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = sext <2 x i16> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i16i64 = zext <2 x i16> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = sext <2 x i32> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z2i32i64 = zext <2 x i32> undef to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i16 = sext <4 x i8> undef to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i16 = zext <4 x i8> undef to <4 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = sext <4 x i8> undef to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i32 = zext <4 x i8> undef to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i64 = sext <4 x i8> undef to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i8i64 = zext <4 x i8> undef to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = sext <4 x i16> undef to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i16i32 = zext <4 x i16> undef to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i64 = sext <4 x i16> undef to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i16i64 = zext <4 x i16> undef to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = sext <4 x i32> undef to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z4i32i64 = zext <4 x i32> undef to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = sext <8 x i8> undef to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i16 = zext <8 x i8> undef to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i32 = sext <8 x i8> undef to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i32 = zext <8 x i8> undef to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i8i64 = sext <8 x i8> undef to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i8i64 = zext <8 x i8> undef to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = sext <8 x i16> undef to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i16i32 = zext <8 x i16> undef to <8 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i16i64 = sext <8 x i16> undef to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i16i64 = zext <8 x i16> undef to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s8i32i64 = sext <8 x i32> undef to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z8i32i64 = zext <8 x i32> undef to <8 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = sext <16 x i8> undef to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i16 = zext <16 x i8> undef to <16 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i32 = sext <16 x i8> undef to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i32 = zext <16 x i8> undef to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i8i64 = sext <16 x i8> undef to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i8i64 = zext <16 x i8> undef to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i16i32 = sext <16 x i16> undef to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i16i32 = zext <16 x i16> undef to <16 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i16i64 = sext <16 x i16> undef to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i16i64 = zext <16 x i16> undef to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s16i32i64 = sext <16 x i32> undef to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %z16i32i64 = zext <16 x i32> undef to <16 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %r0 = sext i1 undef to i8
 %r1 = zext i1 undef to i8
@@ -368,152 +368,152 @@ define void @ext() {
 define void @trunc() {
 ; CHECK-SVE-LABEL: 'trunc'
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE128-NO-NEON-LABEL: 'trunc'
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; FIXED-MIN-256-LABEL: 'trunc'
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i16 = trunc <2 x i16> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i32 = trunc <2 x i32> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = trunc <2 x i64> undef to <2 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i16i32 = trunc <2 x i32> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = trunc <2 x i64> undef to <2 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = trunc <2 x i64> undef to <2 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i16 = trunc <4 x i16> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = trunc <4 x i32> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i64 = trunc <4 x i64> undef to <4 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = trunc <4 x i32> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i16i64 = trunc <4 x i64> undef to <4 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i32i64 = trunc <4 x i64> undef to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = trunc <8 x i16> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i8i32 = trunc <8 x i32> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i8i64 = trunc <8 x i64> undef to <8 x i8>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i16i32 = trunc <8 x i32> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i16i64 = trunc <8 x i64> undef to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i32i64 = trunc <8 x i64> undef to <8 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i16 = trunc <16 x i16> undef to <16 x i8>
-;
FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i32 = trunc <16 x i32> undef to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i64 = trunc <16 x i64> undef to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i16i32 = trunc <16 x i32> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i16i64 = trunc <16 x i64> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i32i64 = trunc <16 x i64> undef to <16 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> undef to <16 x 
i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; FIXED-MIN-2048-LABEL: 'trunc' -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = trunc i8 undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i16 undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i16 undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r21 = trunc i32 undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r22 = trunc i32 undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r23 = trunc i32 undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r26 = trunc i64 undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r27 = trunc i64 undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r28 = trunc i64 undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r29 = trunc i64 undef to i32 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i16 = trunc <2 x i16> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i8i32 = trunc <2 x i32> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i8i64 = trunc <2 x i64> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s2i16i32 = trunc <2 x i32> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i16i64 = trunc <2 x i64> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s2i32i64 = trunc <2 x i64> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i16 = trunc <4 x i16> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i8i32 = trunc <4 x i32> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i8i64 = trunc <4 x i64> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4i16i32 = trunc <4 x i32> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i16i64 = trunc <4 x i64> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s4i32i64 = trunc <4 x i64> undef to <4 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8i8i16 = trunc <8 x i16> undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i8i32 = trunc <8 x i32> 
undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i8i64 = trunc <8 x i64> undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i16i32 = trunc <8 x i32> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i16i64 = trunc <8 x i64> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s8i32i64 = trunc <8 x i64> undef to <8 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i16 = trunc <16 x i16> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i32 = trunc <16 x i32> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i8i64 = trunc <16 x i64> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i16i32 = trunc <16 x i32> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i16i64 = trunc <16 x i64> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %s16i32i64 = trunc <16 x i64> undef to <16 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r8 = trunc i8 undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i16 undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r16 = trunc i16 undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r21 = trunc i32 undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r22 = trunc i32 undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r23 = trunc i32 undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r26 = trunc i64 undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r27 = trunc i64 undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r28 = trunc i64 undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r29 = trunc i64 undef to i32 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i8i16 = trunc <2 x i16> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i8i32 = trunc <2 x i32> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i8i64 = trunc <2 x i64> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s2i16i32 = trunc <2 x i32> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i16i64 = trunc <2 x i64> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s2i32i64 = trunc <2 x i64> undef to <2 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i8i16 = trunc <4 x i16> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i8i32 = trunc <4 x i32> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i8i64 = trunc <4 x i64> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %s4i16i32 = trunc <4 x i32> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i16i64 = trunc <4 x i64> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s4i32i64 = trunc <4 x i64> undef to <4 x i32> +; FIXED-MIN-2048-NEXT: 
Cost Model: Found costs of 1 for: %s8i8i16 = trunc <8 x i16> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i8i32 = trunc <8 x i32> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i8i64 = trunc <8 x i64> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i16i32 = trunc <8 x i32> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i16i64 = trunc <8 x i64> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s8i32i64 = trunc <8 x i64> undef to <8 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i16 = trunc <16 x i16> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i32 = trunc <16 x i32> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i8i64 = trunc <16 x i64> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i16i32 = trunc <16 x i32> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i16i64 = trunc <16 x i64> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %s16i32i64 = trunc <16 x i64> undef to <16 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %r8 = trunc i8 undef to i1 %r15 = trunc i16 undef to i1 @@ -558,856 +558,856 @@ define void @trunc() { define i32 @casts_no_users() { ; CHECK-SVE-LABEL: 'casts_no_users' -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %r48 = fptoui double undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r87 = fpext <4 x float> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r88 = fpext <8 x float> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r89 = 
fpext <16 x float> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: 
%r118 = fptoui <4 x float> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r175 = sitofp 
<2 x i16> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 9 for 
instruction: %r204 = uitofp <4 x i16> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r226 = uitofp <8 x i32> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r227 = sitofp <8 x i32> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: 
Found an estimated cost of 21 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r246 = uitofp <16 x i32> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r247 = sitofp <16 x i32> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double> -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi 
double undef to i16 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64 +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> undef to <4 x float> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> undef to <8 x float> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> undef to <16 x float> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> undef to <4 x double> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> undef to <8 x double> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> undef to <16 x double> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> 
undef to <2 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> undef to <2 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> undef to <2 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> undef to <2 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> undef to <2 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> undef to <2 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> undef to <2 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> undef to <2 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> undef to <4 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> undef to <4 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> undef to <4 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> undef to <4 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> undef to <4 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> undef to <4 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> undef to <4 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> undef to <4 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> undef to <4 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> undef to <4 x i1> +; 
CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> undef to <4 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> undef to <4 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> undef to <4 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> undef to <4 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> undef to <4 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> undef to <4 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> undef to <4 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> undef to <4 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> undef to <8 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:41 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> undef to <8 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> undef to <8 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> undef to <8 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> undef to <8 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> undef to <8 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> undef to <8 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> undef to <8 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> undef to <8 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> undef to <8 x i64> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> undef to <8 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> undef to <8 x i1> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> undef to <8 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> undef to <8 x i8> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> undef to <8 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> undef to <8 x i16> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> undef to <8 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> undef to <8 x i32> +; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x 
double> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> undef to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:83 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> undef to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:87 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:23 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:22 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> undef to <2 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> undef to <4 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> undef to <8 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:19 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:72 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> undef to <16 x float>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:39 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:38 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SVE128-NO-NEON-LABEL: 'casts_no_users'
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r226 = uitofp <8 x i32> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r227 = sitofp <8 x i32> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r246 = uitofp <16 x i32> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r247 = sitofp <16 x i32> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r82 = fptrunc <4 x double> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r87 = fpext <4 x float> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r98 = fptoui <2 x float> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r99 = fptosi <2 x float> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r100 = fptoui <2 x double> undef to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r101 = fptosi <2 x double> undef to <2 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r102 = fptoui <2 x double> undef to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r103 = fptosi <2 x double> undef to <2 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r104 = fptoui <2 x double> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r105 = fptosi <2 x double> undef to <2 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r106 = fptoui <2 x double> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r107 = fptosi <2 x double> undef to <2 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x float> undef to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x float> undef to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x float> undef to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x float> undef to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x float> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x float> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x float> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x float> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r120 = fptoui <4 x double> undef to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r121 = fptosi <4 x double> undef to <4 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r122 = fptoui <4 x double> undef to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r123 = fptosi <4 x double> undef to <4 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r124 = fptoui <4 x double> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r125 = fptosi <4 x double> undef to <4 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r126 = fptoui <4 x double> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r127 = fptosi <4 x double> undef to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r128 = fptoui <4 x double> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r129 = fptosi <4 x double> undef to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x float> undef to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x float> undef to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x float> undef to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x float> undef to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r134 = fptoui <8 x float> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r135 = fptosi <8 x float> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x float> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x float> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> undef to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> undef to <8 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> undef to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> undef to <8 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> undef to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> undef to <8 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> undef to <8 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> undef to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> undef to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> undef to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> undef to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> undef to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> undef to <16 x i1>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> undef to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> undef to <16 x i8>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> undef to <16 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> undef to <16 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> undef to <16 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r174 = uitofp <2 x i16> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r175 = sitofp <2 x i16> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r178 = uitofp <2 x i64> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r179 = sitofp <2 x i64> undef to <2 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r184 = uitofp <2 x i16> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r185 = sitofp <2 x i16> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r186 = uitofp <2 x i32> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r187 = sitofp <2 x i32> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r194 = uitofp <4 x i16> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r195 = sitofp <4 x i16> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r198 = uitofp <4 x i64> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r199 = sitofp <4 x i64> undef to <4 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r200 = uitofp <4 x i1> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r201 = sitofp <4 x i1> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r204 = uitofp <4 x i16> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r205 = sitofp <4 x i16> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r206 = uitofp <4 x i32> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r207 = sitofp <4 x i32> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r208 = uitofp <4 x i64> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r209 = sitofp <4 x i64> undef to <4 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r210 = uitofp <8 x i1> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r211 = sitofp <8 x i1> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r214 = uitofp <8 x i16> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r215 = sitofp <8 x i16> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r216 = uitofp <8 x i32> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r217 = sitofp <8 x i32> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> undef to <8 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> undef to <8 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = sitofp <16 x i1> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> undef to <16 x float>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> undef to <16 x double>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; FIXED-MIN-256-LABEL: 'casts_no_users'
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi float undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r87 = fpext <4 x float> undef to <4 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r88 = fpext <8 x float> undef to <8 x double>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r89 = fpext <16 x float> undef to <16 x double>
-;
FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32> -; 
FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r145 = fptosi <8 x double> undef to <8 x 
i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %r173 = sitofp <2 x i8> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %r201 = sitofp <4 x i1> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r216 = uitofp <8 x i32> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r226 = uitofp <8 x i32> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r227 = sitofp <8 x i32> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r246 = uitofp <16 x i32> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r247 = sitofp <16 x i32> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to 
i64 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r82 = fptrunc <4 x double> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r83 = fptrunc <8 x double> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r84 = fptrunc <16 x double> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x 
float> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r87 = fpext <4 x float> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r88 = fpext <8 x float> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r89 = fpext <16 x float> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> undef to <2 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> undef to <2 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r103 = fptosi <2 x double> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> undef to <2 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> undef to <2 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 
SizeLat:1 for: %r115 = fptosi <4 x float> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x float> undef to <4 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x float> undef to <4 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r120 = fptoui <4 x double> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r121 = fptosi <4 x double> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r122 = fptoui <4 x double> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r123 = fptosi <4 x double> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r124 = fptoui <4 x double> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r125 = fptosi <4 x double> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r126 = fptoui <4 x double> undef to <4 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r127 = fptosi <4 x double> undef to <4 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r128 = fptoui <4 x double> undef to <4 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r129 = fptosi <4 x double> undef to <4 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x float> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x float> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x float> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x float> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x float> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x float> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x float> undef to <8 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x float> undef to <8 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x float> undef to <8 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x float> undef to <8 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r140 = fptoui <8 x double> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r141 = fptosi <8 x double> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r142 = fptoui <8 x double> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r143 = fptosi <8 x double> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r144 = fptoui <8 x double> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r145 = fptosi <8 x double> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r146 = fptoui <8 x double> undef to <8 x 
i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r147 = fptosi <8 x double> undef to <8 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r148 = fptoui <8 x double> undef to <8 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r149 = fptosi <8 x double> undef to <8 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x float> undef to <16 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x float> undef to <16 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x float> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x float> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x float> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x float> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x float> undef to <16 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x float> undef to <16 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x float> undef to <16 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x float> undef to <16 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r160 = fptoui <16 x double> undef to <16 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r161 = fptosi <16 x double> undef to <16 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r162 = fptoui <16 x double> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r163 = fptosi <16 x double> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r164 = fptoui <16 x double> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r165 = fptosi <16 x double> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r166 = fptoui <16 x double> undef to <16 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r167 = fptosi <16 x double> undef to <16 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r168 = fptoui <16 x double> undef to <16 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r169 = fptosi <16 x double> undef to <16 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: 
Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> undef to <2 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r198 = uitofp <4 x i64> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r199 = sitofp <4 x i64> undef to <4 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r200 = uitofp <4 x i1> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r201 = sitofp <4 x i1> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp 
<4 x i8> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r204 = uitofp <4 x i16> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r205 = sitofp <4 x i16> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r206 = uitofp <4 x i32> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r207 = sitofp <4 x i32> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r208 = uitofp <4 x i64> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r209 = sitofp <4 x i64> undef to <4 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r210 = uitofp <8 x i1> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r211 = sitofp <8 x i1> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r214 = uitofp <8 x i16> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r215 = sitofp <8 x i16> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r216 = uitofp <8 x i32> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r217 = sitofp <8 x i32> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r218 = uitofp <8 x i64> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r219 = sitofp <8 x i64> undef to <8 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r220 = uitofp <8 x i1> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r221 = sitofp <8 x i1> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r224 = uitofp <8 x i16> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r225 = sitofp <8 x i16> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r226 = uitofp <8 x i32> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r227 = sitofp <8 x i32> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r228 = uitofp <8 x i64> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r229 = sitofp <8 x i64> undef to <8 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r230 = uitofp <16 x i1> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r231 = 
sitofp <16 x i1> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r234 = uitofp <16 x i16> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r235 = sitofp <16 x i16> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r236 = uitofp <16 x i32> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r237 = sitofp <16 x i32> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r238 = uitofp <16 x i64> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r239 = sitofp <16 x i64> undef to <16 x float> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r240 = uitofp <16 x i1> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r241 = sitofp <16 x i1> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r244 = uitofp <16 x i16> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r245 = sitofp <16 x i16> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r246 = uitofp <16 x i32> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r247 = sitofp <16 x i32> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r248 = uitofp <16 x i64> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r249 = sitofp <16 x i64> undef to <16 x double> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; ; FIXED-MIN-2048-LABEL: 'casts_no_users' -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui float undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi float undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui float undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi float undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui float undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi float undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui float undef to i32 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %r37 = fptosi float undef to i32 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui float undef to i64 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi float undef to i64 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r40 = fptoui double undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r41 = fptosi double undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r42 = fptoui double undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r43 = fptosi double undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r44 = fptoui double undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r45 = fptosi double undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r46 = fptoui double undef to i32 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r47 = fptosi double undef to i32 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r48 = fptoui double undef to i64 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r49 = fptosi double undef to i64 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r50 = sitofp i1 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r51 = uitofp i1 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r52 = sitofp i1 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r53 = uitofp i1 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r54 = sitofp i8 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r55 = uitofp i8 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r56 = sitofp i8 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r57 = uitofp i8 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r58 = sitofp i16 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r59 = uitofp i16 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r60 = sitofp i16 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r61 = uitofp i16 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r62 = sitofp i32 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r63 = uitofp i32 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r64 = sitofp i32 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r65 = uitofp i32 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r66 = sitofp i64 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r67 = uitofp i64 undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %r68 = sitofp i64 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r69 = uitofp i64 undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r80 = fptrunc double undef to float -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r81 = fptrunc <2 x double> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r82 = fptrunc <4 x double> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r83 = fptrunc <8 x double> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r84 = fptrunc <16 x double> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r85 = fpext float undef to double -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r86 = fpext <2 x float> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r87 = fpext <4 x float> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r88 = fpext <8 x float> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r89 = fpext <16 x float> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8> -; 
FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %r161 = fptosi <16 x double> undef to <16 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r173 = sitofp <2 x i8> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r174 = uitofp <2 x i16> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r175 = sitofp <2 x i16> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r176 = uitofp <2 x i32> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r177 = sitofp <2 x i32> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r178 = uitofp <2 x i64> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r179 = sitofp <2 x i64> undef to <2 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double> -; 
FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r193 = sitofp <4 x i8> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r194 = uitofp <4 x i16> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r195 = sitofp <4 x i16> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r196 = uitofp <4 x i32> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r197 = sitofp <4 x i32> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r198 = uitofp <4 x i64> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r199 = sitofp <4 x i64> undef to <4 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r213 = sitofp <8 x i8> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r214 = uitofp <8 x i16> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r215 = sitofp <8 x i16> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r216 = uitofp <8 x 
i32> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r217 = sitofp <8 x i32> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r218 = uitofp <8 x i64> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r219 = sitofp <8 x i64> undef to <8 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r226 = uitofp <8 x i32> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r227 = sitofp <8 x i32> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r233 = sitofp <16 x i8> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r234 = uitofp <16 x i16> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r235 = sitofp <16 x i16> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r236 = uitofp <16 x i32> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r237 = sitofp <16 x i32> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r238 = uitofp <16 x i64> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r239 = sitofp <16 x i64> undef to <16 x float> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r246 = uitofp <16 x i32> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r247 = sitofp <16 x i32> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui float undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi float undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui float undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi float undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui float undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi float undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui float undef to i32 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi float undef to i32 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui float undef to i64 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi float undef to i64 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r40 = fptoui double undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r41 = fptosi double undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r42 = fptoui double undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r43 = fptosi double undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r44 = fptoui double undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r45 = fptosi double undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r46 = fptoui double undef to i32 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r47 = fptosi double undef to i32 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r48 = fptoui double undef to i64 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r49 = fptosi double undef to i64 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r50 = sitofp i1 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r51 = uitofp i1 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r52 = sitofp i1 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r53 = uitofp i1 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r54 = sitofp i8 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r55 = uitofp i8 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r56 = sitofp i8 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r57 = uitofp i8 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r58 = sitofp i16 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r59 = uitofp i16 
undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r60 = sitofp i16 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r61 = uitofp i16 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r62 = sitofp i32 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r63 = uitofp i32 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r64 = sitofp i32 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r65 = uitofp i32 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r66 = sitofp i64 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r67 = uitofp i64 undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r68 = sitofp i64 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r69 = uitofp i64 undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r80 = fptrunc double undef to float +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r81 = fptrunc <2 x double> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r82 = fptrunc <4 x double> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r83 = fptrunc <8 x double> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r84 = fptrunc <16 x double> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r85 = fpext float undef to double +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r86 = fpext <2 x float> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r87 = fpext <4 x float> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r88 = fpext <8 x float> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r89 = fpext <16 x float> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x float> undef to <2 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x float> undef to <2 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x float> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x float> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x float> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x float> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x float> undef to <2 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x float> undef to <2 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x float> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x float> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r100 = fptoui <2 x double> undef to <2 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r101 = fptosi <2 x double> undef to <2 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r102 = fptoui <2 x double> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 
SizeLat:1 for: %r103 = fptosi <2 x double> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r104 = fptoui <2 x double> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r105 = fptosi <2 x double> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r106 = fptoui <2 x double> undef to <2 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r107 = fptosi <2 x double> undef to <2 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r108 = fptoui <2 x double> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r109 = fptosi <2 x double> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r110 = fptoui <4 x float> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %r111 = fptosi <4 x float> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r112 = fptoui <4 x float> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r113 = fptosi <4 x float> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r114 = fptoui <4 x float> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r115 = fptosi <4 x float> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x float> undef to <4 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x float> undef to <4 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x float> undef to <4 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x float> undef to <4 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r120 = fptoui <4 x double> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r121 = fptosi <4 x double> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r122 = fptoui <4 x double> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r123 = fptosi <4 x double> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r124 = fptoui <4 x double> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r125 = fptosi <4 x double> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r126 = fptoui <4 x double> undef to <4 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r127 = fptosi <4 x double> undef to <4 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r128 = fptoui <4 x double> undef to <4 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r129 = fptosi <4 x double> undef to <4 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x float> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x float> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x float> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x float> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: 
Found costs of 1 for: %r134 = fptoui <8 x float> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x float> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x float> undef to <8 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x float> undef to <8 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r138 = fptoui <8 x float> undef to <8 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r139 = fptosi <8 x float> undef to <8 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r140 = fptoui <8 x double> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r141 = fptosi <8 x double> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r142 = fptoui <8 x double> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r143 = fptosi <8 x double> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r144 = fptoui <8 x double> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r145 = fptosi <8 x double> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r146 = fptoui <8 x double> undef to <8 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r147 = fptosi <8 x double> undef to <8 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r148 = fptoui <8 x double> undef to <8 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r149 = fptosi <8 x double> undef to <8 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x float> undef to <16 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x float> undef to <16 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x float> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x float> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x float> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x float> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r156 = fptoui <16 x float> undef to <16 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r157 = fptosi <16 x float> undef to <16 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r158 = fptoui <16 x float> undef to <16 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r159 = fptosi <16 x float> undef to <16 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r160 = fptoui <16 x double> undef to <16 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r161 = fptosi <16 x double> undef to <16 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r162 = fptoui <16 x double> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r163 = fptosi <16 x double> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r164 = fptoui <16 x double> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r165 = fptosi <16 x double> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r166 = fptoui <16 x double> undef to <16 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r167 = fptosi <16 x double> undef to <16 x i32> +; 
FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r168 = fptoui <16 x double> undef to <16 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r169 = fptosi <16 x double> undef to <16 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r170 = uitofp <2 x i1> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r171 = sitofp <2 x i1> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r172 = uitofp <2 x i8> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r173 = sitofp <2 x i8> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r174 = uitofp <2 x i16> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r175 = sitofp <2 x i16> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r176 = uitofp <2 x i32> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r177 = sitofp <2 x i32> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r178 = uitofp <2 x i64> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r179 = sitofp <2 x i64> undef to <2 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r180 = uitofp <2 x i1> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r181 = sitofp <2 x i1> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r182 = uitofp <2 x i8> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r183 = sitofp <2 x i8> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r184 = uitofp <2 x i16> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r185 = sitofp <2 x i16> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r186 = uitofp <2 x i32> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r187 = sitofp <2 x i32> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r188 = uitofp <2 x i64> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r189 = sitofp <2 x i64> undef to <2 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r190 = uitofp <4 x i1> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r191 = sitofp <4 x i1> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r192 = uitofp <4 x i8> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r193 = sitofp <4 x i8> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r194 = uitofp <4 x i16> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r195 = sitofp <4 x i16> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r196 = uitofp <4 x i32> undef to <4 x float> +; 
FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r197 = sitofp <4 x i32> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r198 = uitofp <4 x i64> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r199 = sitofp <4 x i64> undef to <4 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r200 = uitofp <4 x i1> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r201 = sitofp <4 x i1> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r202 = uitofp <4 x i8> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r203 = sitofp <4 x i8> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r204 = uitofp <4 x i16> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r205 = sitofp <4 x i16> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r206 = uitofp <4 x i32> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r207 = sitofp <4 x i32> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r208 = uitofp <4 x i64> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r209 = sitofp <4 x i64> undef to <4 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r210 = uitofp <8 x i1> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r211 = sitofp <8 x i1> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r212 = uitofp <8 x i8> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r213 = sitofp <8 x i8> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r214 = uitofp <8 x i16> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r215 = sitofp <8 x i16> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r216 = uitofp <8 x i32> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r217 = sitofp <8 x i32> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r218 = uitofp <8 x i64> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r219 = sitofp <8 x i64> undef to <8 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r220 = uitofp <8 x i1> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r221 = sitofp <8 x i1> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r222 = uitofp <8 x i8> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r223 = sitofp <8 x i8> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r224 = uitofp <8 x i16> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r225 = sitofp <8 x i16> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r226 = uitofp <8 x i32> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r227 = sitofp <8 x i32> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r228 = uitofp <8 x i64> undef to <8 x double> +; FIXED-MIN-2048-NEXT: 
Cost Model: Found costs of 1 for: %r229 = sitofp <8 x i64> undef to <8 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r230 = uitofp <16 x i1> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r231 = sitofp <16 x i1> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r232 = uitofp <16 x i8> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r233 = sitofp <16 x i8> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r234 = uitofp <16 x i16> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r235 = sitofp <16 x i16> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r236 = uitofp <16 x i32> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r237 = sitofp <16 x i32> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r238 = uitofp <16 x i64> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r239 = sitofp <16 x i64> undef to <16 x float> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r240 = uitofp <16 x i1> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r241 = sitofp <16 x i1> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r242 = uitofp <16 x i8> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r243 = sitofp <16 x i8> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r244 = uitofp <16 x i16> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r245 = sitofp <16 x i16> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r246 = uitofp <16 x i32> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r247 = sitofp <16 x i32> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r248 = uitofp <16 x i64> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r249 = sitofp <16 x i64> undef to <16 x double> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %r30 = fptoui float undef to i1 %r31 = fptosi float undef to i1 @@ -1639,62 +1639,62 @@ define i32 @casts_no_users() { define i32 @casts_with_users(i8 %a, i16 %b, i32 %c, i64 %d, i1 %e) { ; CHECK-LABEL: 'casts_with_users' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r0 = sext i8 %a to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r1 = sext i8 %a to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = sext i8 %a to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r3 = sext i16 %b to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r4 = sext i16 %b to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r5 = sext i32 %c to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %r0, ptr undef, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %r1, ptr undef, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r2, ptr 
undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %r3, ptr undef, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r4, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r5, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r6 = zext i8 %a to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r7 = zext i8 %a to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = zext i8 %a to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r9 = zext i16 %b to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = zext i16 %b to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = zext i32 %c to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %r6, ptr undef, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %r7, ptr undef, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r8, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %r9, ptr undef, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r10, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r11, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r12 = trunc i64 %d to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r13 = trunc i64 %d to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r14 = trunc i64 %d to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r15 = trunc i32 %c to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r16 = trunc i32 %c to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r17 = trunc i16 %b to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tie.trunc.8 = add i8 %r14, %r16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tie.trunc.8.1 = add i8 %tie.trunc.8, %r17 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tie.trunc.16 = add i16 %r13, %r15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %free.trunc.16.8 = trunc i16 %r6 to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %free.trunc.32.8 = trunc i32 %r7 to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %free.trunc.32.16 = trunc i32 %r9 to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %free.trunc.64.8 = trunc i64 %r8 to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %free.trunc.64.16 = trunc i64 %r10 to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %free.trunc.64.32 = trunc i64 %r11 to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r18 = sext i1 %e to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r19 = sext i1 %e to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r20 = sext i1 %e to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r21 = sext i1 
%e to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r22 = zext i1 %e to i8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r23 = zext i1 %e to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r24 = zext i1 %e to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r25 = zext i1 %e to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 %r18, ptr undef, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %r19, ptr undef, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %r20, ptr undef, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r21, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 %r22, ptr undef, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %r23, ptr undef, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %r24, ptr undef, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i64 %r25, ptr undef, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r12 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r0 = sext i8 %a to i16 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r1 = sext i8 %a to i32 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r2 = sext i8 %a to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r3 = sext i16 %b to i32 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r4 = sext i16 %b to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r5 = sext i32 %c to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r0, ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r1, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r2, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r3, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r4, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r5, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r6 = zext i8 %a to i16 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r7 = zext i8 %a to i32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r8 = zext i8 %a to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r9 = zext i16 %b to i32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r10 = zext i16 %b to i64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r11 = zext i32 %c to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r6, ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r7, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r8, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r9, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r10, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r11, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r12 = trunc i64 %d to i32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r13 = trunc i64 %d to i16 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r14 = trunc i64 %d to i8 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r15 = trunc i32 %c to i16 +; CHECK-NEXT: Cost 
Model: Found costs of 0 for: %r16 = trunc i32 %c to i8 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r17 = trunc i16 %b to i8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tie.trunc.8 = add i8 %r14, %r16 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tie.trunc.8.1 = add i8 %tie.trunc.8, %r17 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %tie.trunc.16 = add i16 %r13, %r15 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %free.trunc.16.8 = trunc i16 %r6 to i8 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %free.trunc.32.8 = trunc i32 %r7 to i8 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %free.trunc.32.16 = trunc i32 %r9 to i16 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %free.trunc.64.8 = trunc i64 %r8 to i8 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %free.trunc.64.16 = trunc i64 %r10 to i16 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %free.trunc.64.32 = trunc i64 %r11 to i32 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r18 = sext i1 %e to i8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r19 = sext i1 %e to i16 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r20 = sext i1 %e to i32 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r21 = sext i1 %e to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r22 = zext i1 %e to i8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r23 = zext i1 %e to i16 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %r24 = zext i1 %e to i32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %r25 = zext i1 %e to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r18, ptr undef, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r19, ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r20, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r21, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r22, ptr undef, align 1 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r23, ptr undef, align 2 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r24, ptr undef, align 4 +; CHECK-NEXT: Cost Model: Found costs of 1 for: store i64 %r25, ptr undef, align 8 +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r12 ; %r0 = sext i8 %a to i16 %r1 = sext i8 %a to i32 @@ -1760,15 +1760,15 @@ define i32 @casts_with_users(i8 %a, i16 %b, i32 %c, i64 %d, i1 %e) { define i32 @bitcasts() { ; CHECK-LABEL: 'bitcasts' -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a = bitcast i32 undef to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %b = bitcast float undef to float -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = bitcast i32 undef to float -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %d = bitcast float undef to i32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %e = bitcast i64 undef to double -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f = bitcast double undef to i64 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %g = bitcast half undef to i16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %h = bitcast i16 undef to half -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; CHECK-NEXT: Cost Model: Found costs of 0 for: %a = bitcast i32 undef to i32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %b = bitcast float undef 
to float +; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = bitcast i32 undef to float +; CHECK-NEXT: Cost Model: Found costs of 1 for: %d = bitcast float undef to i32 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %e = bitcast i64 undef to double +; CHECK-NEXT: Cost Model: Found costs of 1 for: %f = bitcast double undef to i64 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %g = bitcast half undef to i16 +; CHECK-NEXT: Cost Model: Found costs of 1 for: %h = bitcast i16 undef to half +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef ; %a = bitcast i32 undef to i32 %b = bitcast float undef to float @@ -1783,184 +1783,184 @@ define i32 @bitcasts() { define i32 @load_extends() #0 { ; CHECK-SVE-LABEL: 'load_extends' -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, ptr undef, align 1 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, ptr undef, align 2 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, ptr undef, align 4 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %loadv2i8 = load <2 x i8>, ptr undef, align 2 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4i8 = load <4 x i8>, ptr undef, align 4 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, ptr undef, align 8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %loadv2i16 = load <2 x i16>, ptr undef, align 4 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, ptr undef, align 8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, ptr undef, align 8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i32 = load <4 x i32>, ptr undef, align 16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv2i32 = load , ptr undef, align 8 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv4i32 = load , ptr undef, align 16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i8 %loadi8 to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i8 %loadi8 to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = sext i16 %loadi16 to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = zext i16 %loadi16 to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = sext i32 %loadi32 to i64 -; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = zext i32 %loadi32 to i64 -; CHECK-SVE-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi8 = load i8, ptr undef, align 1
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi16 = load i16, ptr undef, align 2
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi32 = load i32, ptr undef, align 4
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i8 = load <2 x i8>, ptr undef, align 2
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i8 = load <4 x i8>, ptr undef, align 4
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv8i8 = load <8 x i8>, ptr undef, align 8
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i16 = load <2 x i16>, ptr undef, align 4
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i16 = load <4 x i16>, ptr undef, align 8
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i32 = load <2 x i32>, ptr undef, align 8
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i32 = load <4 x i32>, ptr undef, align 16
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i8 %loadi8 to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i8 %loadi8 to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i8 %loadi8 to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i8 %loadi8 to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i8 %loadi8 to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i8 %loadi8 to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i16 %loadi16 to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i16 %loadi16 to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r8 = sext i16 %loadi16 to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r9 = zext i16 %loadi16 to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r10 = sext i32 %loadi32 to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %r11 = zext i32 %loadi32 to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 0 for: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; SVE128-NO-NEON-LABEL: 'load_extends'
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, ptr undef, align 1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, ptr undef, align 2
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, ptr undef, align 4
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, ptr undef, align 2
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, ptr undef, align 4
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, ptr undef, align 8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, ptr undef, align 4
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, ptr undef, align 8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, ptr undef, align 8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i32 = load <4 x i32>, ptr undef, align 16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i8 %loadi8 to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i8 %loadi8 to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = sext i16 %loadi16 to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = zext i16 %loadi16 to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = sext i32 %loadi32 to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = zext i32 %loadi32 to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi8 = load i8, ptr undef, align 1
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi16 = load i16, ptr undef, align 2
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi32 = load i32, ptr undef, align 4
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i8 = load <2 x i8>, ptr undef, align 2
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i8 = load <4 x i8>, ptr undef, align 4
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv8i8 = load <8 x i8>, ptr undef, align 8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i16 = load <2 x i16>, ptr undef, align 4
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i16 = load <4 x i16>, ptr undef, align 8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i32 = load <2 x i32>, ptr undef, align 8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i32 = load <4 x i32>, ptr undef, align 16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i8 %loadi8 to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i8 %loadi8 to i16
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i8 %loadi8 to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i8 %loadi8 to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i8 %loadi8 to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i8 %loadi8 to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i16 %loadi16 to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i16 %loadi16 to i32
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r8 = sext i16 %loadi16 to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r9 = zext i16 %loadi16 to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r10 = sext i32 %loadi32 to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %r11 = zext i32 %loadi32 to i64
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 0 for: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; FIXED-MIN-256-LABEL: 'load_extends'
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, ptr undef, align 1
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, ptr undef, align 2
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, ptr undef, align 4
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, ptr undef, align 2
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, ptr undef, align 4
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, ptr undef, align 8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, ptr undef, align 4
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, ptr undef, align 8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, ptr undef, align 8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i32 = load <4 x i32>, ptr undef, align 16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i8 %loadi8 to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i8 %loadi8 to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = sext i16 %loadi16 to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = zext i16 %loadi16 to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = sext i32 %loadi32 to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = zext i32 %loadi32 to i64
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi8 = load i8, ptr undef, align 1
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi16 = load i16, ptr undef, align 2
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi32 = load i32, ptr undef, align 4
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i8 = load <2 x i8>, ptr undef, align 2
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i8 = load <4 x i8>, ptr undef, align 4
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv8i8 = load <8 x i8>, ptr undef, align 8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i16 = load <2 x i16>, ptr undef, align 4
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i16 = load <4 x i16>, ptr undef, align 8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i32 = load <2 x i32>, ptr undef, align 8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i32 = load <4 x i32>, ptr undef, align 16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i8 %loadi8 to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i8 %loadi8 to i16
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i8 %loadi8 to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i8 %loadi8 to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i8 %loadi8 to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i8 %loadi8 to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i16 %loadi16 to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i16 %loadi16 to i32
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r8 = sext i16 %loadi16 to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r9 = zext i16 %loadi16 to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r10 = sext i32 %loadi32 to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %r11 = zext i32 %loadi32 to i64
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of 0 for: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
 ; FIXED-MIN-2048-LABEL: 'load_extends'
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, ptr undef, align 1
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, ptr undef, align 2
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, ptr undef, align 4
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i8 = load <2 x i8>, ptr undef, align 2
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i8 = load <4 x i8>, ptr undef, align 4
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, ptr undef, align 8
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, ptr undef, align 4
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, ptr undef, align 8
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, ptr undef, align 8
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i32 = load <4 x i32>, ptr undef, align 16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadnxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = sext i8 %loadi8 to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = zext i8 %loadi8 to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = sext i8 %loadi8 to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = zext i8 %loadi8 to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r6 = sext i16 %loadi16 to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r7 = zext i16 %loadi16 to i32
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r8 = sext i16 %loadi16 to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r9 = zext i16 %loadi16 to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r10 = sext i32 %loadi32 to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r11 = zext i32 %loadi32 to i64
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
-; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi8 = load i8, ptr undef, align 1
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi16 = load i16, ptr undef, align 2
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadi32 = load i32, ptr undef, align 4
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i8 = load <2 x i8>, ptr undef, align 2
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i8 = load <4 x i8>, ptr undef, align 4
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv8i8 = load <8 x i8>, ptr undef, align 8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i16 = load <2 x i16>, ptr undef, align 4
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i16 = load <4 x i16>, ptr undef, align 8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv2i32 = load <2 x i32>, ptr undef, align 8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadv4i32 = load <4 x i32>, ptr undef, align 16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %loadnxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r0 = sext i8 %loadi8 to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r1 = zext i8 %loadi8 to i16
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r2 = sext i8 %loadi8 to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r3 = zext i8 %loadi8 to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r4 = sext i8 %loadi8 to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r5 = zext i8 %loadi8 to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r6 = sext i16 %loadi16 to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r7 = zext i16 %loadi16 to i32
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r8 = sext i16 %loadi16 to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r9 = zext i16 %loadi16 to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r10 = sext i32 %loadi32 to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %r11 = zext i32 %loadi32 to i64
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v0 = sext <8 x i8> %loadv8i8 to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v1 = zext <8 x i8> %loadv8i8 to <8 x i16>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v2 = sext <4 x i8> %loadv4i8 to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v3 = zext <4 x i8> %loadv4i8 to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v4 = sext <2 x i8> %loadv2i8 to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v5 = zext <2 x i8> %loadv2i8 to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v6 = sext <4 x i16> %loadv4i16 to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v7 = zext <4 x i16> %loadv4i16 to <4 x i32>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v8 = sext <2 x i16> %loadv2i16 to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v9 = zext <2 x i16> %loadv2i16 to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v10 = sext <2 x i32> %loadv2i32 to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %v11 = zext <2 x i32> %loadv2i32 to <2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %v12 = sext <4 x i32> %loadv4i32 to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %v13 = zext <4 x i32> %loadv4i32 to <4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %v14 = sext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 0 for: %v15 = zext <vscale x 2 x i32> %loadnxv2i32 to <vscale x 2 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v16 = sext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %v17 = zext <vscale x 4 x i32> %loadnxv4i32 to <vscale x 4 x i64>
+; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %loadi8 = load i8, ptr undef
   %loadi16 = load i16, ptr undef
@@ -2012,19 +2012,19 @@ define i32 @load_extends() #0 {
 
 define i32 @store_truncs() {
 ; CHECK-LABEL: 'store_truncs'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = trunc i64 undef to i8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 %r0, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = trunc i64 undef to i16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %r1, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r2 = trunc i64 undef to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %r2, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r3 = trunc i32 undef to i8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 %r3, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r4 = trunc i32 undef to i16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %r4, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r5 = trunc i16 undef to i8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i8 %r5, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r0 = trunc i64 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r0, ptr undef, align 1
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r1 = trunc i64 undef to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r1, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r2 = trunc i64 undef to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store i32 %r2, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r3 = trunc i32 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r3, ptr undef, align 1
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r4 = trunc i32 undef to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store i16 %r4, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %r5 = trunc i16 undef to i8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store i8 %r5, ptr undef, align 1
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 undef
 ;
   %r0 = trunc i64 undef to i8
   store i8 %r0, ptr undef
@@ -2043,23 +2043,23 @@ define i32 @store_truncs() {
 
 define void @extend_extract() {
 ; CHECK-LABEL: 'extend_extract'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = extractelement <8 x i8> undef, i32 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e16 = extractelement <8 x i16> undef, i32 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e32 = extractelement <8 x i32> undef, i32 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8_16 = sext i8 %e8 to i16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8_16 = zext i8 %e8 to i16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8_32 = sext i8 %e8 to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8_32 = zext i8 %e8 to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8_64 = sext i8 %e8 to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %z8_64 = zext i8 %e8 to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16_32 = sext i16 %e16 to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z16_32 = zext i16 %e16 to i32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s16_64 = sext i16 %e16 to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %z16_64 = zext i16 %e16 to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s32_64 = sext i32 %e32 to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %z32_64 = zext i32 %e32 to i64
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: call void @use(i16 %s8_16, i16 %z8_16, i32 %s8_32, i32 %z8_32, i64 %s8_64, i64 %z8_64, i32 %s16_32, i32 %z16_32, i64 %s16_64, i64 %z16_64, i64 %s32_64, i64 %z32_64)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e8 = extractelement <8 x i8> undef, i32 1
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e16 = extractelement <8 x i16> undef, i32 1
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %e32 = extractelement <8 x i32> undef, i32 1
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8_16 = sext i8 %e8 to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z8_16 = zext i8 %e8 to i16
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8_32 = sext i8 %e8 to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z8_32 = zext i8 %e8 to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s8_64 = sext i8 %e8 to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %z8_64 = zext i8 %e8 to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s16_32 = sext i16 %e16 to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %z16_32 = zext i16 %e16 to i32
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s16_64 = sext i16 %e16 to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %z16_64 = zext i16 %e16 to i64
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %s32_64 = sext i32 %e32 to i64
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %z32_64 = zext i32 %e32 to i64
+; CHECK-NEXT: Cost Model: Found costs of 13 for: call void @use(i16 %s8_16, i16 %z8_16, i32 %s8_32, i32 %z8_32, i64 %s8_64, i64 %z8_64, i32 %s16_32, i32 %z16_32, i64 %s16_64, i64 %z16_64, i64 %s32_64, i64 %z32_64)
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %e8 = extractelement <8 x i8> undef, i32 1
   %e16 = extractelement <8 x i16> undef, i32 1
@@ -2084,296 +2084,296 @@ declare void @use(i16, i16, i32, i32, i64, i64, i32, i32, i64, i64, i64, i64)
 
 define void @fp16cast() {
 ; CHECK-SVE-LABEL: 'fp16cast'
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui half undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi half undef to i1
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui half undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi half undef to i8
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui half undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi half undef to i16
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui half undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi half undef to i32
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui half undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi half undef to i64
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x half> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x half> undef to <2 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x half> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x half> undef to <2 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x half> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x half> undef to <4 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x half> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r253 = sitofp <8 x i8> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r254 = uitofp <8 x i16> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r255 = sitofp <8 x i16> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r256 = uitofp <8 x i32> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r257 = sitofp <8 x i32> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r258 = uitofp <8 x i64> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r259 = sitofp <8 x i64> undef to <8 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r260 = uitofp <16 x i1> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r261 = sitofp <16 x i1> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r262 = uitofp <16 x i8> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r263 = sitofp <16 x i8> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r264 = uitofp <16 x i16> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r265 = sitofp <16 x i16> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r266 = uitofp <16 x i32> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r267 = sitofp <16 x i32> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %r268 = uitofp <16 x i64> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %r269 = sitofp <16 x i64> undef to <16 x half>
-; CHECK-SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> undef to <4 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> undef to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> undef to <8 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> undef to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:81 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> undef to <16 x i1>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> undef to <16 x i8>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> undef to <16 x i16>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:86 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half>
+; CHECK-SVE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; SVE128-NO-NEON-LABEL: 'fp16cast'
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui half undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi half undef to i1
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui half undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi half undef to i8
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui half undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi half undef to i16
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui half undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi half undef to i32
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui half undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi half undef to i64
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x half> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x half> undef to <2 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x half> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x half> undef to <2 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x half> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x half> undef to <4 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x half> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r253 = sitofp <8 x i8> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r254 = uitofp <8 x i16> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r255 = sitofp <8 x i16> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r256 = uitofp <8 x i32> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r257 = sitofp <8 x i32> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r258 = uitofp <8 x i64> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r259 = sitofp <8 x i64> undef to <8 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r260 = uitofp <16 x i1> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r261 = sitofp <16 x i1> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r262 = uitofp <16 x i8> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r263 = sitofp <16 x i8> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r264 = uitofp <16 x i16> undef to <16 x half>
-; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 2 for
instruction: %r265 = sitofp <16 x i16> undef to <16 x half> -; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r266 = uitofp <16 x i32> undef to <16 x half> -; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r267 = sitofp <16 x i32> undef to <16 x half> -; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r268 = uitofp <16 x i64> undef to <16 x half> -; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r269 = sitofp <16 x i64> undef to <16 x half> -; SVE128-NO-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64 +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r98 = fptoui <2 x half> undef to <2 x i64> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r99 = fptosi <2 x half> undef to <2 x i64> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r116 = fptoui <4 x half> undef to <4 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r117 = fptosi <4 x half> undef to <4 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r118 = fptoui <4 x half> undef to <4 x i64> +; 
SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r119 = fptosi <4 x half> undef to <4 x i64> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r130 = fptoui <8 x half> undef to <8 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r131 = fptosi <8 x half> undef to <8 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r132 = fptoui <8 x half> undef to <8 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r133 = fptosi <8 x half> undef to <8 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r136 = fptoui <8 x half> undef to <8 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r137 = fptosi <8 x half> undef to <8 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r150 = fptoui <16 x half> undef to <16 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r151 = fptosi <16 x half> undef to <16 x i1> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r152 = fptoui <16 x half> undef to <16 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r153 = fptosi <16 x half> undef to <16 x i8> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r154 = fptoui <16 x half> undef to <16 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r155 = fptosi <16 x half> undef to <16 x i16> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r256 = uitofp <8 x i32> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 
Lat:1 SizeLat:1 for: %r257 = sitofp <8 x i32> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r260 = uitofp <16 x i1> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r261 = sitofp <16 x i1> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r264 = uitofp <16 x i16> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r265 = sitofp <16 x i16> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half> +; SVE128-NO-NEON-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; FIXED-MIN-256-LABEL: 'fp16cast' -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui half undef to i1 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi half undef to i1 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui half undef to i8 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi half undef to i8 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui half undef to i16 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi half undef to i16 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui half undef to i32 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi half undef to i32 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui half undef to i64 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi half undef to i64 -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x half> undef to <2 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x half> undef to <2 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x half> undef to <2 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x half> undef to <2 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%r94 = fptoui <2 x half> undef to <2 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x half> undef to <4 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x half> undef to <4 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r152 = fptoui <16 x half> undef 
to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r253 = sitofp <8 x i8> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r254 = uitofp <8 x i16> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r255 = sitofp <8 x i16> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r256 = uitofp <8 x i32> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r257 = sitofp <8 x i32> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r258 = uitofp <8 x i64> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r259 = sitofp <8 x i64> undef to <8 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r260 = uitofp <16 x i1> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r261 = sitofp <16 x i1> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r262 = uitofp <16 x i8> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r263 = sitofp <16 x i8> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r264 = uitofp <16 x i16> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r265 = sitofp <16 x i16> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r266 = uitofp <16 x i32> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r267 = sitofp <16 x i32> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r268 = uitofp <16 x i64> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r269 = sitofp <16 x i64> undef to <16 x half> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; FIXED-MIN-256-NEXT: 
Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64 +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> undef to <2 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x half> undef to <4 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x half> undef to <4 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x 
i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x half> undef to <8 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x half> undef to <8 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r138 = fptoui <8 x half> undef to <8 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r139 = fptosi <8 x half> undef to <8 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x half> undef to <16 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x half> undef to <16 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x half> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x half> undef to <16 x i8> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x half> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x half> undef to <16 x i16> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r156 = fptoui <16 x half> undef to <16 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r157 = fptosi <16 x half> undef to <16 x i32> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r158 = fptoui <16 x half> undef to <16 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r159 = fptosi <16 x half> undef to <16 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r256 = uitofp <8 x i32> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r257 = sitofp <8 x i32> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r258 = uitofp <8 x i64> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r259 = sitofp <8 x i64> undef to <8 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r260 = uitofp <16 x i1> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r261 = sitofp <16 x i1> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r264 = uitofp <16 x 
i16> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of 1 for: %r265 = sitofp <16 x i16> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r266 = uitofp <16 x i32> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r267 = sitofp <16 x i32> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r268 = uitofp <16 x i64> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %r269 = sitofp <16 x i64> undef to <16 x half> +; FIXED-MIN-256-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; FIXED-MIN-2048-LABEL: 'fp16cast' -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r30 = fptoui half undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r31 = fptosi half undef to i1 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r32 = fptoui half undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r33 = fptosi half undef to i8 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r34 = fptoui half undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r35 = fptosi half undef to i16 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r36 = fptoui half undef to i32 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r37 = fptosi half undef to i32 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r38 = fptoui half undef to i64 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r39 = fptosi half undef to i64 -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r90 = fptoui <2 x half> undef to <2 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r91 = fptosi <2 x half> undef to <2 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r92 = fptoui <2 x half> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r93 = fptosi <2 x half> undef to <2 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r94 = fptoui <2 x half> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = 
fptoui <4 x half> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x half> undef to <4 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x half> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %r250 = uitofp <8 x i1> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r253 = sitofp <8 x i8> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r254 = uitofp <8 x i16> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r255 = sitofp <8 x i16> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r256 = uitofp <8 x i32> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r257 = sitofp <8 x i32> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r258 = uitofp <8 x i64> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r259 = sitofp <8 x i64> undef to <8 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r260 = uitofp <16 x i1> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r261 = sitofp <16 x i1> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r262 = uitofp <16 x i8> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r263 = sitofp <16 x i8> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r264 = uitofp <16 x i16> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r265 = sitofp <16 x i16> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r266 = uitofp <16 x i32> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r267 = sitofp <16 x i32> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r268 = uitofp <16 x i64> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r269 = sitofp <16 x i64> undef to <16 x half> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r30 = fptoui half undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r31 = fptosi half undef to i1 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r32 = fptoui half undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r33 = fptosi half undef to i8 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r34 = fptoui half undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r35 = fptosi half undef to i16 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r36 = fptoui half undef to i32 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r37 = fptosi half undef to i32 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r38 = fptoui half undef to i64 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r39 = fptosi half undef to i64 +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r90 = fptoui <2 x half> 
undef to <2 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r91 = fptosi <2 x half> undef to <2 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r92 = fptoui <2 x half> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r93 = fptosi <2 x half> undef to <2 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r94 = fptoui <2 x half> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r95 = fptosi <2 x half> undef to <2 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r96 = fptoui <2 x half> undef to <2 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r97 = fptosi <2 x half> undef to <2 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r98 = fptoui <2 x half> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %r99 = fptosi <2 x half> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r110 = fptoui <4 x half> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r111 = fptosi <4 x half> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r112 = fptoui <4 x half> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r113 = fptosi <4 x half> undef to <4 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r114 = fptoui <4 x half> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r115 = fptosi <4 x half> undef to <4 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r116 = fptoui <4 x half> undef to <4 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r117 = fptosi <4 x half> undef to <4 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r118 = fptoui <4 x half> undef to <4 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r119 = fptosi <4 x half> undef to <4 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r130 = fptoui <8 x half> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %r131 = fptosi <8 x half> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r132 = fptoui <8 x half> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r133 = fptosi <8 x half> undef to <8 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r134 = fptoui <8 x half> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r135 = fptosi <8 x half> undef to <8 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r136 = fptoui <8 x half> undef to <8 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r137 = fptosi <8 x half> undef to <8 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r138 = fptoui <8 x half> undef to <8 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r139 = fptosi <8 x half> undef to <8 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r150 = fptoui <16 x half> undef to <16 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r151 = fptosi <16 x half> undef to <16 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r152 = fptoui <16 x half> undef to 
<16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r153 = fptosi <16 x half> undef to <16 x i8> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r154 = fptoui <16 x half> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r155 = fptosi <16 x half> undef to <16 x i16> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r156 = fptoui <16 x half> undef to <16 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r157 = fptosi <16 x half> undef to <16 x i32> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r158 = fptoui <16 x half> undef to <16 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r159 = fptosi <16 x half> undef to <16 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r250 = uitofp <8 x i1> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r251 = sitofp <8 x i1> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r252 = uitofp <8 x i8> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r253 = sitofp <8 x i8> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r254 = uitofp <8 x i16> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r255 = sitofp <8 x i16> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r256 = uitofp <8 x i32> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r257 = sitofp <8 x i32> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r258 = uitofp <8 x i64> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r259 = sitofp <8 x i64> undef to <8 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r260 = uitofp <16 x i1> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r261 = sitofp <16 x i1> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r262 = uitofp <16 x i8> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %r263 = sitofp <16 x i8> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r264 = uitofp <16 x i16> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r265 = sitofp <16 x i16> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r266 = uitofp <16 x i32> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r267 = sitofp <16 x i32> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r268 = uitofp <16 x i64> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of 1 for: %r269 = sitofp <16 x i64> undef to <16 x half> +; FIXED-MIN-2048-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %r30 = fptoui half undef to i1 %r31 = fptosi half undef to i1 diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll b/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll index 46af4a023e681..9a0e5c3eb7964 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-cmpsel.ll @@ -1,14 +1,17 @@ -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s +; NOTE: Assertions have been 
autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; Check icmp for legal integer vectors. define void @cmp_legal_int() { ; CHECK-LABEL: 'cmp_legal_int' -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp ne undef, undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp ne undef, undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp ne undef, undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %1 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %2 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %3 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %4 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; %1 = icmp ne undef, undef %2 = icmp ne undef, undef %3 = icmp ne undef, undef @@ -19,8 +22,9 @@ define void @cmp_legal_int() { ; Check icmp for an illegal integer vector. define @cmp_nxv4i64() { ; CHECK-LABEL: 'cmp_nxv4i64' -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = icmp ne undef, undef -; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret %res +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret %res +; %res = icmp ne undef, undef ret %res } @@ -28,10 +32,12 @@ define @cmp_nxv4i64() { ; Check icmp for legal predicate vectors. define void @cmp_legal_pred() { ; CHECK-LABEL: 'cmp_legal_pred' -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp ne undef, undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp ne undef, undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp ne undef, undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %1 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %2 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %3 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of 1 for: %4 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; %1 = icmp ne undef, undef %2 = icmp ne undef, undef %3 = icmp ne undef, undef @@ -42,8 +48,9 @@ define void @cmp_legal_pred() { ; Check icmp for an illegal predicate vector. 
 define @cmp_nxv32i1() {
 ; CHECK-LABEL: 'cmp_nxv32i1'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = icmp ne undef, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret %res
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = icmp ne undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret %res
+;
 %res = icmp ne undef, undef
 ret %res
 }
@@ -51,10 +58,12 @@ define @cmp_nxv32i1() {
 ; Check fcmp for legal FP vectors
 define void @cmp_legal_fp() #0 {
 ; CHECK-LABEL: 'cmp_legal_fp'
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %1 = fcmp oge undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %2 = fcmp oge undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %3 = fcmp oge undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %4 = fcmp oge undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %1 = fcmp oge undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %2 = fcmp oge undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %3 = fcmp oge undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %4 = fcmp oge undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
 %1 = fcmp oge undef, undef
 %2 = fcmp oge undef, undef
 %3 = fcmp oge undef, undef
@@ -65,8 +74,9 @@ define void @cmp_legal_fp() #0 {
 ; Check fcmp for an illegal FP vector
 define @cmp_nxv16f16() {
 ; CHECK-LABEL: 'cmp_nxv16f16'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = fcmp oge undef, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret %res
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = fcmp oge undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret %res
+;
 %res = fcmp oge undef, undef
 ret %res
 }
@@ -74,10 +84,12 @@ define @cmp_nxv16f16() {
 ; Check select for legal integer vectors
 define void @sel_legal_int() {
 ; CHECK-LABEL: 'sel_legal_int'
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %1 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %2 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %3 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %4 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %1 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %2 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %3 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %4 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
 %1 = select undef, undef, undef
 %2 = select undef, undef, undef
 %3 = select undef, undef, undef
@@ -88,8 +100,9 @@ define void @sel_legal_int() {
 ; Check select for an illegal integer vector
 define @sel_nxv16i16() {
 ; CHECK-LABEL: 'sel_nxv16i16'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret %res
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret %res
+;
 %res = select undef, undef, undef
 ret %res
 }
@@ -97,10 +110,12 @@ define @sel_nxv16i16() {
 ; Check select for a legal FP vector
 define void @sel_legal_fp() #0 {
 ; CHECK-LABEL: 'sel_legal_fp'
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %1 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %2 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %3 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %4 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %1 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %2 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %3 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %4 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
 %1 = select undef, undef, undef
 %2 = select undef, undef, undef
 %3 = select undef, undef, undef
@@ -111,8 +126,9 @@ define void @sel_legal_fp() #0 {
 ; Check select for an illegal FP vector
 define @sel_nxv8f32() {
 ; CHECK-LABEL: 'sel_nxv8f32'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret %res
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret %res
+;
 %res = select undef, undef, undef
 ret %res
 }
@@ -120,10 +136,12 @@ define @sel_nxv8f32() {
 ; Check select for a legal predicate vector
 define void @sel_legal_pred() {
 ; CHECK-LABEL: 'sel_legal_pred'
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %1 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %2 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %3 = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %4 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %1 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %2 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %3 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %4 = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
 %1 = select undef, undef, undef
 %2 = select undef, undef, undef
 %3 = select undef, undef, undef
@@ -134,8 +152,9 @@ define void @sel_legal_pred() {
 ; Check select for an illegal predicate vector
 define @sel_nxv32i1() {
 ; CHECK-LABEL: 'sel_nxv32i1'
-; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = select undef, undef, undef
-; CHECK: Cost Model: Found an estimated cost of 0 for instruction: ret %res
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %res = select undef, undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret %res
+;
 %res = select undef, undef, undef
 ret %res
 }
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
index c38cdcd8b9a34..480c3146a210d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll
@@ -1,47 +1,47 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 define void @sdiv() {
 ; CHECK-LABEL: 'sdiv'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = sdiv <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = sdiv <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i64 = sdiv <8 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = sdiv <2 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = sdiv <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = sdiv <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = sdiv <16 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i16 = sdiv <2 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i16 = sdiv <4 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = sdiv <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = sdiv <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = sdiv <32 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i8 = sdiv <2 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i8 = sdiv <4 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i8 = sdiv <8 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = sdiv <16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = sdiv <32 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = sdiv <64 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i64 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i64 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i64 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i32 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i32 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i32 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV16i32 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i16 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i16 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i16 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i16 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NV32i16 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i8 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i8 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i8 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i8 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NV32i8 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %NV64i8 = sdiv undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i64 = sdiv <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i32 = sdiv <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i64 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i32 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %V2i64 = sdiv <2 x i64> undef, undef
 %V4i64 = sdiv <4 x i64> undef, undef
@@ -84,43 +84,43 @@ define void @sdiv() {
 define void @udiv() {
 ; CHECK-LABEL: 'udiv'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2i64 = udiv <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i64 = udiv <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i64 = udiv <8 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = udiv <2 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i32 = udiv <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i32 = udiv <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = udiv <16 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i16 = udiv <2 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i16 = udiv <4 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = udiv <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = udiv <16 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = udiv <32 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i8 = udiv <2 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i8 = udiv <4 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i8 = udiv <8 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = udiv <16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = udiv <32 x i8> undef, undef
-; CHECK-NEXT: Cost Model:
Found an estimated cost of 64 for instruction: %V64i8 = udiv <64 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i64 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i64 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i64 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i32 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i32 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i32 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV16i32 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i16 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i16 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i16 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i16 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NV32i16 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i8 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i8 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i8 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i8 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %NV32i8 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %NV64i8 = udiv undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = udiv <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i64 = udiv <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = udiv <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = udiv <2 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = udiv <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i32 = udiv <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = udiv <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = udiv <2 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = udiv <4 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = udiv <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = udiv <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = udiv <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = udiv <2 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 
for: %V4i8 = udiv <4 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = udiv <8 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = udiv <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = udiv <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = udiv <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i64 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i32 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = udiv undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = udiv <2 x i64> undef, undef %V4i64 = udiv <4 x i64> undef, undef @@ -163,43 +163,43 @@ define void @udiv() { define void @sdiv_uniformconst() { ; CHECK-LABEL: 'sdiv_uniformconst' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2i64 = sdiv <2 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 7) -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i64 = sdiv undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV4i64 = sdiv undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV8i64 = sdiv undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i32 = sdiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i32 = sdiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i32 = sdiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV16i32 = sdiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i16 = sdiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i16 = sdiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i16 = sdiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV16i16 = sdiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV32i16 = sdiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i8 = sdiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i8 = sdiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i8 = sdiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV16i8 = sdiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV32i8 = sdiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV64i8 = sdiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 
= sdiv <4 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = sdiv undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = sdiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = sdiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 
CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = sdiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = sdiv <2 x i64> undef, splat (i64 7) %V4i64 = sdiv <4 x i64> undef, splat (i64 7) @@ -242,43 +242,43 @@ define void @sdiv_uniformconst() { define void @udiv_uniformconst() { ; CHECK-LABEL: 'udiv_uniformconst' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = udiv <2 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i64 = udiv undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV4i64 = udiv undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV8i64 = udiv undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i32 = udiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %NV4i32 = udiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i32 = udiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i32 = udiv undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i16 = udiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i16 = udiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i16 = udiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV16i16 = udiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV32i16 = udiv undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i8 = udiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i8 = udiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i8 = udiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV16i8 = udiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV32i8 = udiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV64i8 = udiv undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = udiv <2 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = udiv <4 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = udiv <8 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = udiv <2 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = udiv <4 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = udiv <8 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = udiv <16 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = udiv <2 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = udiv <4 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = udiv <8 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = udiv <16 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = udiv <32 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = udiv <2 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = udiv <4 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = udiv <8 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 
SizeLat:4 for: %V16i8 = udiv <16 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = udiv <32 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = udiv <64 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i64 = udiv undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = udiv undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = udiv undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i32 = udiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i32 = udiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = udiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = udiv undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i16 = udiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i16 = udiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i16 = udiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = udiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = udiv undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i8 = udiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i8 = udiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i8 = udiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV16i8 = udiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = udiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = udiv undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = udiv <2 x i64> undef, splat (i64 7) %V4i64 = udiv <4 x i64> undef, splat (i64 7) @@ -321,43 +321,43 @@ define void @udiv_uniformconst() { define void @sdiv_uniformconstpow2() { ; CHECK-LABEL: 'sdiv_uniformconstpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i64 = sdiv <2 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i8 = sdiv <4 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i64 = sdiv undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i64 = sdiv undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i64 = sdiv undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i32 = sdiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i32 = sdiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i32 = sdiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV16i32 = sdiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i16 = sdiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i16 = sdiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV8i16 = sdiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV16i16 = sdiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV32i16 = sdiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i8 = sdiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i8 = sdiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV8i8 = sdiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV16i8 = sdiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV32i8 = sdiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV64i8 = sdiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 16) +; 
CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i64 = sdiv undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i32 = sdiv undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV16i16 = sdiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV32i8 = sdiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: 
Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = sdiv <2 x i64> undef, splat (i64 16) %V4i64 = sdiv <4 x i64> undef, splat (i64 16) @@ -400,43 +400,43 @@ define void @sdiv_uniformconstpow2() { define void @udiv_uniformconstpow2() { ; CHECK-LABEL: 'udiv_uniformconstpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = udiv <2 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i64 = udiv undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i64 = udiv undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i64 = udiv undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i32 = udiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV4i32 = udiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV8i32 = udiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV16i32 = udiv undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i16 = udiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV4i16 = udiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV8i16 = udiv undef, splat (i16 16) -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %NV16i16 = udiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV32i16 = udiv undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i8 = udiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV4i8 = udiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV8i8 = udiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV16i8 = udiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV32i8 = udiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV64i8 = udiv undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = udiv <2 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = udiv <4 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i64 = udiv <8 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = udiv <2 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = udiv <4 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = udiv <8 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i32 = udiv <16 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = udiv <2 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = udiv <4 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = udiv <8 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = udiv <16 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V32i16 = udiv <32 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = udiv <2 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = udiv <4 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = udiv <8 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = udiv <16 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = udiv <32 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V64i8 = udiv <64 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = udiv undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = udiv undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i64 = udiv undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = udiv undef, splat (i32 16) +; 
CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = udiv undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = udiv undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV16i32 = udiv undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = udiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = udiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = udiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = udiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV32i16 = udiv undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = udiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = udiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = udiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = udiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = udiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV64i8 = udiv undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = udiv <2 x i64> undef, splat (i64 16) %V4i64 = udiv <4 x i64> undef, splat (i64 16) @@ -479,43 +479,43 @@ define void @udiv_uniformconstpow2() { define void @sdiv_uniformconstnegpow2() { ; CHECK-LABEL: 'sdiv_uniformconstnegpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i8 = sdiv <4 x i8> 
undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i64 = sdiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV4i64 = sdiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV8i64 = sdiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV16i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV16i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV32i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV16i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV32i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV64i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 
Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = sdiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs 
of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) %V4i64 = sdiv <4 x i64> undef, splat (i64 -16) @@ -558,43 +558,43 @@ define void @sdiv_uniformconstnegpow2() { define void @udiv_uniformconstnegpow2() { ; CHECK-LABEL: 'udiv_uniformconstnegpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i64 = udiv <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4i64 = udiv <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8i64 = udiv <8 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i32 = udiv <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = udiv <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = udiv <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = udiv <16 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i16 = udiv <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i16 = udiv <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = udiv <32 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2i8 = udiv <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i8 = udiv <4 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i8 = udiv <8 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = udiv <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = udiv <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = udiv <64 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i64 = udiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV4i64 = udiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV8i64 = udiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i32 = udiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i32 = udiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i32 = udiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i32 = udiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i16 = udiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i16 = udiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i16 = udiv undef, 
splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV16i16 = udiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV32i16 = udiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i8 = udiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i8 = udiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i8 = udiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV16i8 = udiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV32i8 = udiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV64i8 = udiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = udiv <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = udiv <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = udiv <8 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = udiv <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = udiv <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = udiv <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = udiv <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = udiv <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = udiv <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = udiv <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = udiv <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = udiv <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = udiv <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = udiv <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = udiv <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = udiv <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = udiv <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = udiv <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i64 = udiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = udiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of 
RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = udiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i32 = udiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i32 = udiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = udiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = udiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i16 = udiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i16 = udiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i16 = udiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = udiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = udiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i8 = udiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i8 = udiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i8 = udiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV16i8 = udiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = udiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = udiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = udiv <2 x i64> undef, splat (i64 -16) %V4i64 = udiv <4 x i64> undef, splat (i64 -16) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll b/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll index cbb05620a9270..b8876540900b6 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-ext.ll @@ -1,31 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" define void @sve_ext() { ; CHECK-LABEL: 'sve_ext' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %zext_nxv16_i8_to_i16 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %zext_nxv16_i8_to_i32 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %zext_nxv16_i8_to_i64 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %zext_nxv8_i16_to_i32 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %zext_nxv8_i16_to_i64 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %zext_nxv4_i32_to_i64 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %zext_nxv4_i8_to_i64 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %zext_nxv8_i8_to_i32 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %zext_nxv4_i16_to_i64 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: 
%zext_nxv8_i8_to_i64 = zext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_nxv16_i8_to_i16 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %sext_nxv16_i8_to_i32 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %sext_nxv16_i8_to_i64 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_nxv8_i16_to_i32 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %sext_nxv8_i16_to_i64 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext_nxv4_i32_to_i64 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_nxv4_i8_to_i64 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_nxv8_i8_to_i32 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext_nxv4_i16_to_i64 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %sext_nxv8_i8_to_i64 = sext undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i16 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i32 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv16_i8_to_i64 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i16_to_i32 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i16_to_i64 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i32_to_i64 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i8_to_i64 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i8_to_i32 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv4_i16_to_i64 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zext_nxv8_i8_to_i64 = zext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i16 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i32 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv16_i8_to_i64 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i16_to_i32 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i16_to_i64 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i32_to_i64 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i8_to_i64 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i8_to_i32 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv4_i16_to_i64 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 
CodeSize:1 Lat:1 SizeLat:1 for: %sext_nxv8_i8_to_i64 = sext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %zext_nxv16_i8_to_i16 = zext undef to %zext_nxv16_i8_to_i32 = zext undef to diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll index b31e30fa52fe3..117315cc8b710 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-fpext.ll @@ -1,19 +1,22 @@ -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" define void @sve_fpext() { - ;CHECK-LABEL: 'sve_fpext' - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_to_f32 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4_f16_to_f32 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8_f16_to_f32 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_to_f64 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4_f16_to_f64 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8_f16_to_f64 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f32_to_f64 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4_f32_to_f64 = fpext undef to - ;CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8_f32_to_f64 = fpext undef to +; CHECK-LABEL: 'sve_fpext' +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f32 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_to_f32 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f32 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_to_f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_to_f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_to_f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f32_to_f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f32_to_f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f32_to_f64 = fpext undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; %nxv2_f16_to_f32 = fpext undef to %nxv4_f16_to_f32 = fpext undef to %nxv8_f16_to_f32 = fpext undef to diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll index 3448a165881d1..06ed58dc0ca25 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptoi.ll @@ -1,84 +1,84 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 -; RUN: opt 
-passes="print" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -o - -S < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" define void @sve-fptoi() { ; CHECK-LABEL: 'sve-fptoi' -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f16_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f16_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f16_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f16_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f16_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f16_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f32_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f32_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f32_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f32_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f32_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %nv1f32_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv1f64_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv1f64_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv1f64_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv1f64_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv1f64_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv1f64_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f16_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f16_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f16_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f16_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f16_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f16_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f32_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f32_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f32_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f32_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f32_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f32_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f64_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %nv2f64_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f64_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f64_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f64_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2f64_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f16_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f16_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f16_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f16_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv4f16_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv4f16_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f32_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f32_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f32_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4f32_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv4f32_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv4f32_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4f64_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4f64_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4f64_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4f64_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4f64_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4f64_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv8f16_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv8f16_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv8f16_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv8f16_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nv8f16_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %nv8f16_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv8f32_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv8f32_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv8f32_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv8f32_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %nv8f32_to_si64 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nv8f32_to_ui64 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8f64_to_si8 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8f64_to_ui8 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8f64_to_si16 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8f64_to_ui16 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv8f64_to_si32 = fptosi undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv8f64_to_ui32 = fptoui undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f16_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:1 Lat:1 SizeLat:1 for: %nv1f32_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv1f64_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f16_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: 
%nv2f32_to_si16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f32_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2f64_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f16_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f16_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f16_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_si16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4f32_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f32_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f32_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4f64_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8f16_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8f16_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f16_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si16 
= fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_si64 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f32_to_ui64 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si8 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui8 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si16 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui16 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_si32 = fptosi undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8f64_to_ui32 = fptoui undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %nv1f16_to_si8 = fptosi undef to %nv1f16_to_ui8 = fptoui undef to diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll index 020d34deb8c69..a17c6ce42d7d7 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-fptrunc.ll @@ -1,21 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -S -o - < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" define void @sve_fptruncs() { ; CHECK-LABEL: 'sve_fptruncs' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f32 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4_f16_from_f32 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv8_f16_from_f32 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f16_from_f64 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f16_from_f64 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nxv8_f16_from_f64 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2_f32_from_f64 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nxv4_f32_from_f64 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nxv8_f32_from_f64 = fptrunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv4_f16_from_f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f32 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f16_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 
CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f16_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f16_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nxv2_f32_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nxv4_f32_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nxv8_f32_from_f64 = fptrunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %nxv2_f16_from_f32 = fptrunc undef to %nxv4_f16_from_f32 = fptrunc undef to diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll index e6357dacf95d8..5fcb1184045e3 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll @@ -1,42 +1,42 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -passes="print" 2>&1 -disable-output -S < %s | FileCheck --check-prefix=CHECK-DEFAULT %s -; RUN: opt -aarch64-insert-extract-base-cost=0 -passes="print" 2>&1 -disable-output -S < %s | FileCheck --check-prefix=CHECK-LOW %s -; RUN: opt -aarch64-insert-extract-base-cost=100000 -passes="print" 2>&1 -disable-output -S < %s | FileCheck --check-prefix=CHECK-HIGH %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -S < %s | FileCheck --check-prefix=CHECK-DEFAULT %s +; RUN: opt -aarch64-insert-extract-base-cost=0 -passes="print" -cost-kind=all 2>&1 -disable-output -S < %s | FileCheck --check-prefix=CHECK-LOW %s +; RUN: opt -aarch64-insert-extract-base-cost=100000 -passes="print" -cost-kind=all 2>&1 -disable-output -S < %s | FileCheck --check-prefix=CHECK-HIGH %s target triple = "aarch64-unknown-linux-gnu" target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @ins_el0() #0 { ; CHECK-DEFAULT-LABEL: 'ins_el0' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %vi1 = insertelement zeroinitializer, i1 false, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v0 = insertelement zeroinitializer, i8 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1 = insertelement zeroinitializer, i16 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost 
Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2 = insertelement zeroinitializer, i32 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v3 = insertelement zeroinitializer, i64 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of 0 for: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of 0 for: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-LOW-LABEL: 'ins_el0' -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %vi1 = insertelement zeroinitializer, i1 false, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v0 = insertelement zeroinitializer, i8 0, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v1 = insertelement zeroinitializer, i16 0, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v2 = insertelement zeroinitializer, i32 0, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v3 = insertelement zeroinitializer, i64 0, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of 0 for: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of 0 for: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-HIGH-LABEL: 'ins_el0' -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100001 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100001 CodeSize:2 Lat:100001 SizeLat:100001 for: %vi1 = insertelement zeroinitializer, i1 false, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v0 = insertelement zeroinitializer, i8 0, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v1 = insertelement zeroinitializer, i16 0, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v2 = insertelement zeroinitializer, i32 0, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v3 = insertelement zeroinitializer, i64 0, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of 0 for: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of 0 for: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %vi1 = insertelement zeroinitializer, i1 0, i64 0 %v0 = insertelement zeroinitializer, i8 0, i64 0 @@ -50,34 +50,34 @@ define void @ins_el0() #0 { define void @ins_el1() #0 { ; CHECK-DEFAULT-LABEL: 'ins_el1' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %vi1 = insertelement zeroinitializer, i1 false, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v0 = insertelement zeroinitializer, i8 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1 = insertelement zeroinitializer, i16 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2 = insertelement zeroinitializer, i32 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v3 = insertelement zeroinitializer, i64 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: 
Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-LOW-LABEL: 'ins_el1' -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %vi1 = insertelement zeroinitializer, i1 false, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v0 = insertelement zeroinitializer, i8 0, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v1 = insertelement zeroinitializer, i16 0, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v2 = insertelement zeroinitializer, i32 0, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v3 = insertelement zeroinitializer, i64 0, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-HIGH-LABEL: 'ins_el1' -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100001 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100001 CodeSize:2 Lat:100001 SizeLat:100001 for: %vi1 = insertelement zeroinitializer, i1 false, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v0 = insertelement zeroinitializer, i8 
0, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v1 = insertelement zeroinitializer, i16 0, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v2 = insertelement zeroinitializer, i32 0, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v3 = insertelement zeroinitializer, i64 0, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %vi1 = insertelement zeroinitializer, i1 0, i64 1 %v0 = insertelement zeroinitializer, i8 0, i64 1 @@ -92,34 +92,34 @@ define void @ins_el1() #0 { define void @ext_el0() #0 { ; CHECK-DEFAULT-LABEL: 'ext_el0' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %vi1 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v0 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v3 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of 0 for: %v4 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of 0 for: %v5 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-LOW-LABEL: 'ext_el0' -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vi1 = extractelement zeroinitializer, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement zeroinitializer, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement zeroinitializer, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement zeroinitializer, i64 
0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement zeroinitializer, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement zeroinitializer, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement zeroinitializer, i64 0 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %vi1 = extractelement zeroinitializer, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v0 = extractelement zeroinitializer, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v1 = extractelement zeroinitializer, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v2 = extractelement zeroinitializer, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v3 = extractelement zeroinitializer, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of 0 for: %v4 = extractelement zeroinitializer, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of 0 for: %v5 = extractelement zeroinitializer, i64 0 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-HIGH-LABEL: 'ext_el0' -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100001 for instruction: %vi1 = extractelement zeroinitializer, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement zeroinitializer, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement zeroinitializer, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement zeroinitializer, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = extractelement zeroinitializer, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement zeroinitializer, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement zeroinitializer, i64 0 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100001 CodeSize:2 Lat:100001 SizeLat:100001 for: %vi1 = extractelement zeroinitializer, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v0 = extractelement zeroinitializer, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v1 = extractelement zeroinitializer, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v2 = extractelement zeroinitializer, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v3 = extractelement zeroinitializer, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of 0 for: %v4 = extractelement zeroinitializer, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of 0 for: %v5 = extractelement zeroinitializer, i64 0 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %vi1 = extractelement zeroinitializer, i64 0 %v0 = extractelement zeroinitializer, i64 0 @@ -133,34 +133,34 @@ define void @ext_el0() #0 { 
define void @ext_el1() #0 { ; CHECK-DEFAULT-LABEL: 'ext_el1' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %vi1 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v0 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v3 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v5 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-LOW-LABEL: 'ext_el1' -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vi1 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %vi1 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v0 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v1 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v2 = extractelement zeroinitializer, i64 1 +; 
CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v3 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v4 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v5 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-HIGH-LABEL: 'ext_el1' -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100001 for instruction: %vi1 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v4 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v5 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100001 CodeSize:2 Lat:100001 SizeLat:100001 for: %vi1 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v0 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v1 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v2 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v3 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v4 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v5 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %vi1 = extractelement zeroinitializer, i64 1 %v0 = extractelement zeroinitializer, i64 1 @@ -176,34 +176,34 @@ define void @ext_el1() #0 { ; Test the behaviour in the presence of a CPU-specific override in AArch64Subtarget (via attribute set). 
define void @test_override_cpu_given() #1 { ; CHECK-DEFAULT-LABEL: 'test_override_cpu_given' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:3 CodeSize:2 Lat:3 SizeLat:3 for: %vi1 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v0 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v3 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v4 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v5 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-LOW-LABEL: 'test_override_cpu_given' -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %vi1 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v0 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v1 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v3 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = extractelement zeroinitializer, i64 1 -; CHECK-LOW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:1 CodeSize:2 Lat:1 SizeLat:1 for: %vi1 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v0 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v1 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: 
%v2 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v3 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v4 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:0 SizeLat:0 for: %v5 = extractelement zeroinitializer, i64 1 +; CHECK-LOW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-HIGH-LABEL: 'test_override_cpu_given' -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100001 for instruction: %vi1 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v0 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v1 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v2 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v3 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v4 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 100000 for instruction: %v5 = extractelement zeroinitializer, i64 1 -; CHECK-HIGH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100001 CodeSize:2 Lat:100001 SizeLat:100001 for: %vi1 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v0 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v1 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v2 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v3 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v4 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:100000 CodeSize:1 Lat:100000 SizeLat:100000 for: %v5 = extractelement zeroinitializer, i64 1 +; CHECK-HIGH-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %vi1 = extractelement zeroinitializer, i64 1 %v0 = extractelement zeroinitializer, i64 1 diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-itofp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-itofp.ll index 960afd8af8383..8a85973a2afd1 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-itofp.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-itofp.ll @@ -1,108 +1,108 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -o - -S < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple aarch64-linux-gnu -mattr=+sve -o - -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" define void @sve-itofp() { ; 
CHECK-LABEL: 'sve-itofp' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv2si8_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv2ui8_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si16_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui16_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si32_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui32_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si64_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui64_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv2si8_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv2ui8_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si16_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui16_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si32_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui32_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si64_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui64_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv2si8_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv2ui8_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si16_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui16_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si32_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui32_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2si64_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv2ui64_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv4si8_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv4ui8_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4si16_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4ui16_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4si32_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4ui32_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4si64_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4ui64_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%nv4si8_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv4ui8_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4si16_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4ui16_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4si32_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv4ui32_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4si64_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv4ui64_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv4si8_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv4ui8_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv4si16_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv4ui16_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv4si32_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv4ui32_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv4si64_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv4ui64_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv8si8_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv8ui8_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv8si16_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nv8ui16_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv8si32_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %nv8ui32_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8si64_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8ui64_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8si8_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %nv8ui8_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv8si16_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv8ui16_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv8si32_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv8ui32_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv8si64_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv8ui64_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %nv8si8_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for 
instruction: %nv8ui8_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %nv8si16_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %nv8ui16_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nv8si32_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nv8ui32_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv8si64_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv8ui64_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv16si8_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv16ui8_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv16si16_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nv16ui16_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv16si32_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %nv16ui32_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nv16si64_to_f16 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %nv16ui64_to_f16 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %nv16si8_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %nv16ui8_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nv16si16_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nv16ui16_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv16si32_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nv16ui32_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nv16si64_to_f32 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %nv16ui64_to_f32 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nv16si8_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %nv16ui8_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %nv16si16_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %nv16ui16_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %nv16si32_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %nv16ui32_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nv16si64_to_f64 = sitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nv16ui64_to_f64 = uitofp poison to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv2si8_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found 
costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv2ui8_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si16_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui16_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si32_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui32_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si64_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui64_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv2si8_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv2ui8_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si16_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui16_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si32_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui32_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si64_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui64_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv2si8_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv2ui8_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si16_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui16_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si32_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui32_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2si64_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv2ui64_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si8_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui8_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4si16_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4ui16_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4si32_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4ui32_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si64_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui64_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si8_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui8_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4si16_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4ui16_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4si32_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv4ui32_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si64_to_f32 = sitofp poison to +; 
CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui64_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si8_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui8_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si16_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui16_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si32_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui32_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv4si64_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv4ui64_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si8_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui8_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8si16_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %nv8ui16_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si32_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui32_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si64_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui64_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si8_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui8_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si16_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui16_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si32_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui32_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si64_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui64_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si8_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:21 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui8_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si16_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui16_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si32_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui32_to_f64 = uitofp poison 
to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8si64_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv8ui64_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si8_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui8_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si16_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui16_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si32_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui32_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si64_to_f16 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui64_to_f16 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si8_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui8_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si16_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui16_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si32_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui32_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si64_to_f32 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui64_to_f32 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si8_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:43 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui8_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si16_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui16_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si32_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui32_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv16si64_to_f64 = sitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %nv16ui64_to_f64 = uitofp poison to +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %nv2si8_to_f16 = sitofp poison to diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll b/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll index 225c1ebe60b64..6be5397e8e0e3 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-ldst.ll @@ -1,18 +1,18 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_analyze_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: opt -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @scalable_loads() {
 ; CHECK-LABEL: 'scalable_loads'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.nxv8i8 = load <vscale x 8 x i8>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv32i8 = load <vscale x 32 x i8>, ptr undef, align 32
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv1i64 = load <vscale x 1 x i64>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res.nxv32i1 = load <vscale x 32 x i1>, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %res.nxv16i1 = load <vscale x 16 x i1>, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: %res.nxv4i1 = load <vscale x 4 x i1>, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %res.nxv8i8 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %res.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %res.nxv32i8 = load <vscale x 32 x i8>, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:Invalid Lat:4 SizeLat:Invalid for: %res.nxv1i64 = load <vscale x 1 x i64>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:4 SizeLat:2 for: %res.nxv32i1 = load <vscale x 32 x i1>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %res.nxv16i1 = load <vscale x 16 x i1>, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:Invalid Lat:4 SizeLat:Invalid for: %res.nxv4i1 = load <vscale x 4 x i1>, ptr undef, align 1
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 %res.nxv8i8 = load <vscale x 8 x i8>, ptr undef
 %res.nxv16i8 = load <vscale x 16 x i8>, ptr undef
@@ -26,14 +26,14 @@ define void @scalable_loads() {
 
 define void @scalable_stores() {
 ; CHECK-LABEL: 'scalable_stores'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 8 x i8> undef, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i8> undef, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 32 x i8> undef, ptr undef, align 32
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 1 x i64> undef, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <vscale x 32 x i1> undef, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <vscale x 16 x i1> undef, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: store <vscale x 4 x i1> undef, ptr undef, align 1
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <vscale x 8 x i8> undef, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <vscale x 16 x i8> undef, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <vscale x 32 x i8> undef, ptr undef, align 32
+; CHECK-NEXT: Cost Model: Found costs of Invalid for: store <vscale x 1 x i64> undef, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:2 Lat:1 SizeLat:2 for: store <vscale x 32 x i1> undef, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 1 for: store <vscale x 16 x i1> undef, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found costs of Invalid for: store <vscale x 4 x i1> undef, ptr undef, align 1
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 store <vscale x 8 x i8> undef, ptr undef
 store <vscale x 16 x i8> undef, ptr undef
@@ -47,59 +47,59 @@ define void @scalable_stores() {
 
 define void @scalable_ext_loads() {
 ; CHECK-LABEL: 'scalable_ext_loads'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %zext.nxv16i8to16 = zext <vscale x 16 x i8> %load.nxv16i8 to <vscale x 16 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8 = load <vscale x 8 x i8>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv8i8to16 = zext <vscale x 8 x i8> %load.nxv8i8 to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i8 = load <vscale x 4 x i8>, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv4i8to32 = zext <vscale x 4 x i8> %load.nxv4i8 to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv2i8 = load <vscale x 2 x i8>, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv2i8to64 = zext <vscale x 2 x i8> %load.nxv2i8 to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i16 = load <vscale x 8 x i16>, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %zext.nxv8i16to32 = zext <vscale x 8 x i16> %load.nxv8i16 to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i16 = load <vscale x 4 x i16>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv4i16to32 = zext <vscale x 4 x i16> %load.nxv4i16 to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv2i16 = load <vscale x 2 x i16>, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv2i16to64 = zext <vscale x 2 x i16> %load.nxv2i16 to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %zext.nxv4i32to64 = zext <vscale x 4 x i32> %load.nxv4i32 to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %zext.nxv2i32to64 = zext <vscale x 2 x i32> %load.nxv2i32 to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zext.nxv4i8to64 = zext <vscale x 4 x i8> %load.nxv4i8.2 to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zext.nxv4i16to64 = zext <vscale x 4 x i16> %load.nxv4i16.2 to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zext.nxv8i8to32 = zext <vscale x 8 x i8> %load.nxv8i8.2 to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %zext.nxv8i8to64 = zext <vscale x 8 x i8> %load.nxv8i8.3 to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv16i8to16 = sext <vscale x 16 x i8> %load2.nxv16i8 to <vscale x 16 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8 = load <vscale x 8 x i8>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv8i8to16 = sext <vscale x 8 x i8> %load2.nxv8i8 to <vscale x 8 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i8 = load <vscale x 4 x i8>, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv4i8to32 = sext <vscale x 4 x i8> %load2.nxv4i8 to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv2i8 = load <vscale x 2 x i8>, ptr undef, align 2
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv2i8to64 = sext <vscale x 2 x i8> %load2.nxv2i8 to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i16 = load <vscale x 8 x i16>, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv8i16to32 = sext <vscale x 8 x i16> %load2.nxv8i16 to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i16 = load <vscale x 4 x i16>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv4i16to32 = sext <vscale x 4 x i16> %load2.nxv4i16 to <vscale x 4 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv2i16 = load <vscale x 2 x i16>, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv2i16to64 = sext <vscale x 2 x i16> %load2.nxv2i16 to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sext.nxv4i32to64 = sext <vscale x 4 x i32> %load2.nxv4i32 to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sext.nxv2i32to64 = sext <vscale x 2 x i32> %load2.nxv2i32 to <vscale x 2 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef, align 4
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext.nxv4i8to64 = sext <vscale x 4 x i8> %load2.nxv4i8.2 to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext.nxv4i16to64 = sext <vscale x 4 x i16> %load2.nxv4i16.2 to <vscale x 4 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sext.nxv8i8to32 = sext <vscale x 8 x i8> %load2.nxv8i8.2 to <vscale x 8 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load2.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef, align 8
-; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sext.nxv8i8to64 = sext <vscale x 8 x i8> %load2.nxv8i8.3 to <vscale x 8 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv16i8to16 = zext <vscale x 16 x i8> %load.nxv16i8 to <vscale x 16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv8i8 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv8i8to16 = zext <vscale x 8 x i8> %load.nxv8i8 to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv4i8 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv4i8to32 = zext <vscale x 4 x i8> %load.nxv4i8 to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv2i8 = load <vscale x 2 x i8>, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv2i8to64 = zext <vscale x 2 x i8> %load.nxv2i8 to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv8i16 = load <vscale x 8 x i16>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv8i16to32 = zext <vscale x 8 x i16> %load.nxv8i16 to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv4i16 = load <vscale x 4 x i16>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv4i16to32 = zext <vscale x 4 x i16> %load.nxv4i16 to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv2i16 = load <vscale x 2 x i16>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv2i16to64 = zext <vscale x 2 x i16> %load.nxv2i16 to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv4i32to64 = zext <vscale x 4 x i32> %load.nxv4i32 to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %zext.nxv2i32to64 = zext <vscale x 2 x i32> %load.nxv2i32 to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %zext.nxv4i8to64 = zext <vscale x 4 x i8> %load.nxv4i8.2 to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %zext.nxv4i16to64 = zext <vscale x 4 x i16> %load.nxv4i16.2 to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %zext.nxv8i8to32 = zext <vscale x 8 x i8> %load.nxv8i8.2 to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zext.nxv8i8to64 = zext <vscale x 8 x i8> %load.nxv8i8.3 to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv16i8 = load <vscale x 16 x i8>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv16i8to16 = sext <vscale x 16 x i8> %load2.nxv16i8 to <vscale x 16 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv8i8 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv8i8to16 = sext <vscale x 8 x i8> %load2.nxv8i8 to <vscale x 8 x i16>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv4i8 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv4i8to32 = sext <vscale x 4 x i8> %load2.nxv4i8 to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv2i8 = load <vscale x 2 x i8>, ptr undef, align 2
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv2i8to64 = sext <vscale x 2 x i8> %load2.nxv2i8 to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv8i16 = load <vscale x 8 x i16>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv8i16to32 = sext <vscale x 8 x i16> %load2.nxv8i16 to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv4i16 = load <vscale x 4 x i16>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv4i16to32 = sext <vscale x 4 x i16> %load2.nxv4i16 to <vscale x 4 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv2i16 = load <vscale x 2 x i16>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv2i16to64 = sext <vscale x 2 x i16> %load2.nxv2i16 to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv4i32 = load <vscale x 4 x i32>, ptr undef, align 16
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv4i32to64 = sext <vscale x 4 x i32> %load2.nxv4i32 to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv2i32 = load <vscale x 2 x i32>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 0 for: %sext.nxv2i32to64 = sext <vscale x 2 x i32> %load2.nxv2i32 to <vscale x 2 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv4i8.2 = load <vscale x 4 x i8>, ptr undef, align 4
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sext.nxv4i8to64 = sext <vscale x 4 x i8> %load2.nxv4i8.2 to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv4i16.2 = load <vscale x 4 x i16>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sext.nxv4i16to64 = sext <vscale x 4 x i16> %load2.nxv4i16.2 to <vscale x 4 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv8i8.2 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of 1 for: %sext.nxv8i8to32 = sext <vscale x 8 x i8> %load2.nxv8i8.2 to <vscale x 8 x i32>
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:4 SizeLat:1 for: %load2.nxv8i8.3 = load <vscale x 8 x i8>, ptr undef, align 8
+; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sext.nxv8i8to64 = sext <vscale x 8 x i8> %load2.nxv8i8.3 to <vscale x 8 x i64>
+; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-math.ll b/llvm/test/Analysis/CostModel/AArch64/sve-math.ll
index 17cedbb7a712f..c75f266e0983e 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-math.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-math.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt -mtriple=aarch64-- -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all < %s | FileCheck %s
+; RUN: opt -mtriple=aarch64-- -mattr=+sve -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-min-max.ll b/llvm/test/Analysis/CostModel/AArch64/sve-min-max.ll
index 0d637e0b603de..829ce127493ed 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-min-max.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-min-max.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=all | FileCheck %s
+; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sve -passes="print<cost-model>" -cost-kind=all 2>&1 -disable-output | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll
index 9d8f43cd99368..e2488735de4b5 100644
---
a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll @@ -1,48 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes="print" 2>&1 -disable-output | FileCheck %s +; RUN: opt < %s -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes="print" -cost-kind=all 2>&1 -disable-output | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @srem() { ; CHECK-LABEL: 'srem' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i64 = srem <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i64 = srem <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i64 = srem <8 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = srem <2 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i32 = srem <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i32 = srem <16 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = srem <2 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = srem <4 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i16 = srem <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i16 = srem <16 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = srem <32 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i8 = srem <2 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4i8 = srem <4 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i8 = srem <8 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = srem <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32i8 = srem <32 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64i8 = srem <64 x i8> undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i64 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV4i64 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV8i64 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i32 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i32 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i32 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i32 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i16 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i16 = srem undef, 
undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i16 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV16i16 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %NV32i16 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i8 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i8 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i8 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NV16i8 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NV32i8 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %NV64i8 = srem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = srem <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = srem <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i64 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: 
%NV2i32 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i32 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i16 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i16 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i8 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i8 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = srem <2 x i64> undef, undef %V4i64 = srem <4 x i64> undef, undef @@ -86,44 +86,44 @@ define void @srem() { define void @urem() { ; CHECK-LABEL: 'urem' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i64 = urem <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i64 = urem <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i64 = urem <8 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2i32 = urem <2 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i32 = urem <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = urem <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i32 = urem <16 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = urem <2 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = urem <4 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i16 = urem <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i16 = urem <16 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32i16 = urem <32 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i8 = urem <2 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4i8 = urem <4 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i8 = urem <8 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i8 = urem <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32i8 = urem <32 x i8> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V64i8 = urem <64 x i8> undef, undef -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i64 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV4i64 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV8i64 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i32 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i32 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i32 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %NV16i32 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i16 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i16 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i16 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV16i16 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %NV32i16 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV2i8 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i8 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i8 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %NV16i8 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %NV32i8 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %NV64i8 = urem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = urem <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = urem <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = urem <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = urem <2 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = urem <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = urem <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = urem <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = urem <2 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = urem <4 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = urem <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = urem <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = urem <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 
for: %V2i8 = urem <2 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = urem <4 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = urem <8 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = urem <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = urem <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = urem <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i64 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i32 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i32 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i16 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i16 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:40 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV2i8 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i8 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:18 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:36 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:72 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = urem undef, undef +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = urem <2 x i64> undef, undef %V4i64 = urem <4 x i64> undef, undef @@ -167,44 +167,44 @@ define void @urem() { define void @srem_uniformconst() { ; CHECK-LABEL: 'srem_uniformconst' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost 
of 24 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, splat (i128 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i64 = srem undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV4i64 = srem undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV8i64 = srem undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i32 = srem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i32 = srem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i32 = srem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV16i32 = srem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i16 = srem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i16 = srem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i16 = srem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV16i16 = srem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV32i16 = srem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i8 = srem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i8 = srem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i8 = srem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV16i8 = srem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV32i8 = srem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV64i8 = srem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:8 
CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = srem undef, splat (i128 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = srem undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = srem undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = srem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = srem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = srem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = srem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = srem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem undef, splat (i16 7) +; 
CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = srem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = srem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = srem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = srem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = srem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = srem <2 x i64> undef, splat (i64 7) %V4i64 = srem <4 x i64> undef, splat (i64 7) @@ -248,44 +248,44 @@ define void @srem_uniformconst() { define void @urem_uniformconst() { ; CHECK-LABEL: 'urem_uniformconst' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, splat (i128 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i64 = 
urem undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV4i64 = urem undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV8i64 = urem undef, splat (i64 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i32 = urem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV4i32 = urem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV8i32 = urem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV16i32 = urem undef, splat (i32 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i16 = urem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV4i16 = urem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV8i16 = urem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV16i16 = urem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV32i16 = urem undef, splat (i16 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i8 = urem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV4i8 = urem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV8i8 = urem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV16i8 = urem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV32i8 = urem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV64i8 = urem undef, splat (i8 7) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = urem <2 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = urem <4 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = urem <8 x i64> undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = urem <2 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = urem <4 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = urem <8 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = urem <16 x i32> undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = urem <2 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = urem <4 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = urem <8 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = urem <16 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = urem <32 x i16> undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 
CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = urem <2 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = urem <4 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = urem <8 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = urem <16 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = urem <32 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = urem <64 x i8> undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = urem undef, splat (i128 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = urem undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = urem undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = urem undef, splat (i64 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = urem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = urem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = urem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = urem undef, splat (i32 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = urem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = urem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = urem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = urem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = urem undef, splat (i16 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = urem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = urem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = urem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = urem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = urem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = urem undef, splat (i8 7) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = urem <2 x i64> undef, splat (i64 7) %V4i64 = urem <4 x i64> undef, splat (i64 7) @@ -329,44 +329,44 @@ define void @urem_uniformconst() { define void @srem_uniformconstpow2() { ; CHECK-LABEL: 'srem_uniformconstpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 16 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, splat (i128 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i64 = srem undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV4i64 = srem undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV8i64 = srem undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i32 = srem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i32 = srem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i32 = srem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV16i32 = srem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i16 = srem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i16 = srem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV8i16 = srem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV16i16 = srem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV32i16 = srem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV2i8 = srem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i8 = srem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %NV8i8 = srem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV16i8 = srem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV32i8 = srem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %NV64i8 = srem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = srem <2 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i32 = srem <2 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = srem <4 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i16 = srem <2 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = srem <4 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i16 = srem <8 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i8 = srem <2 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = srem <4 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i8 = srem <8 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i8 = srem <16 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = srem undef, splat (i128 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = srem undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i64 = srem undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = srem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = srem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i32 = srem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = srem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = srem undef, 
splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV16i16 = srem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = srem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = srem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = srem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV32i8 = srem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = srem <2 x i64> undef, splat (i64 16) %V4i64 = srem <4 x i64> undef, splat (i64 16) @@ -410,44 +410,44 @@ define void @srem_uniformconstpow2() { define void @urem_uniformconstpow2() { ; CHECK-LABEL: 'urem_uniformconstpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8i8 = urem <8 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, 
splat (i128 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i64 = urem undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV4i64 = urem undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV8i64 = urem undef, splat (i64 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i32 = urem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV4i32 = urem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV8i32 = urem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV16i32 = urem undef, splat (i32 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i16 = urem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV4i16 = urem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV8i16 = urem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV16i16 = urem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV32i16 = urem undef, splat (i16 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV2i8 = urem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV4i8 = urem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV8i8 = urem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NV16i8 = urem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NV32i8 = urem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %NV64i8 = urem undef, splat (i8 16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = urem <2 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = urem <4 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i64 = urem <8 x i64> undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = urem <2 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = urem <4 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = urem <8 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i32 = urem <16 x i32> undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = urem <2 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = urem <4 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = urem <8 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = urem <16 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V32i16 = urem <32 x i16> undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs 
of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = urem <2 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = urem <4 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = urem <8 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = urem <16 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = urem <32 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V64i8 = urem <64 x i8> undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = urem undef, splat (i128 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = urem undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = urem undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i64 = urem undef, splat (i64 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = urem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = urem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = urem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV16i32 = urem undef, splat (i32 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = urem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = urem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = urem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = urem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV32i16 = urem undef, splat (i16 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = urem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = urem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = urem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = urem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = urem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV64i8 = urem undef, splat (i8 16) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = urem <2 x i64> undef, splat (i64 16) %V4i64 = urem <4 x i64> undef, splat (i64 16) @@ -491,44 +491,44 @@ define void @urem_uniformconstpow2() { define void @srem_uniformconstnegpow2() { ; CHECK-LABEL: 'srem_uniformconstnegpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2i64 = srem <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i64 = srem <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i64 = srem <8 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %V2i32 = srem <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = srem <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = srem <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = srem <16 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i16 = srem <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i16 = srem <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = srem <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = srem <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = srem <32 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2i8 = srem <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i8 = srem <4 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i8 = srem <8 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i8 = srem <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32i8 = srem <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64i8 = srem <64 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = srem undef, splat (i128 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i64 = srem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV4i64 = srem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV8i64 = srem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV8i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV16i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV16i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV32i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV2i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV4i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NV8i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %NV16i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %NV32i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %NV64i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = srem undef, splat (i128 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = srem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = srem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = srem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = srem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = srem undef, splat (i32 -16) 
+; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = srem <2 x i64> undef, splat (i64 -16) %V4i64 = srem <4 x i64> undef, splat (i64 -16) @@ -572,44 +572,44 @@ define void @srem_uniformconstnegpow2() { define void @urem_uniformconstnegpow2() { ; CHECK-LABEL: 'urem_uniformconstnegpow2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2i64 = urem <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4i64 = urem <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i64 = urem <8 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i32 = urem <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = urem <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = urem <16 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i16 = urem <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i16 = urem <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = urem <32 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2i8 = urem <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i8 = urem <4 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i8 = urem <8 x i8> 
undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i8 = urem <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32i8 = urem <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64i8 = urem <64 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NV2i128 = urem undef, splat (i128 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i64 = urem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV4i64 = urem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV8i64 = urem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i32 = urem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV4i32 = urem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV8i32 = urem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV16i32 = urem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i16 = urem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV4i16 = urem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV8i16 = urem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV16i16 = urem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV32i16 = urem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV2i8 = urem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV4i8 = urem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV8i8 = urem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %NV16i8 = urem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %NV32i8 = urem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %NV64i8 = urem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = urem <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = urem <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = urem <8 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = urem <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = urem <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = urem <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = urem <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: 
%V2i16 = urem <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = urem <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = urem <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = urem <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = urem <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = urem <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = urem <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = urem <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = urem <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = urem <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = urem <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = urem undef, splat (i128 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = urem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = urem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = urem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = urem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = urem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = urem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = urem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = urem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = urem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = urem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = urem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = urem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = urem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = urem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = urem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = urem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = urem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:24 
CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = urem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = urem <2 x i64> undef, splat (i64 -16) %V4i64 = urem <4 x i64> undef, splat (i64 -16) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/AArch64/sve-shuffle-broadcast.ll index a2526d9f5591a..729e04a18c9de 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-shuffle-broadcast.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-shuffle-broadcast.ll @@ -1,36 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 ; Check getShuffleCost for SK_BroadCast with scalable vector -; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s +; RUN: opt -passes="print" -cost-kind=all 2>&1 -disable-output -mtriple=aarch64--linux-gnu -mattr=sve < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @broadcast() #0{ ; CHECK-LABEL: 'broadcast' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %16 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = shufflevector undef, undef, 
zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %zero = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %1 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %2 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %3 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %4 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %5 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %6 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %7 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %8 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %9 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %10 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %11 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %12 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %13 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %14 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %15 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %16 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %17 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %18 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %19 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 2 for: %20 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %21 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %22 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %23 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of 1 for: %24 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void +; ; CHECK-NETX: Cost Model: Found an estimated cost of 0 for instruction: ret void %zero = shufflevector undef, undef, zeroinitializer diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll index e754d264c1b41..397b73753e680 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll +++ 
b/llvm/test/Analysis/CostModel/AArch64/sve-trunc.ll @@ -1,43 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes="print" 2>&1 -disable-output < %s | FileCheck %s +; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes="print" -cost-kind=all 2>&1 -disable-output < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @sve_truncs() { ; CHECK-LABEL: 'sve_truncs' -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv2i8_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv2i16_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv2i32_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv2i64_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv4i8_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv4i16_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv4i32_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %trunc_nxv4i64_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv8i8_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %trunc_nxv8i16_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %trunc_nxv8i32_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %trunc_nxv8i64_to_i1 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv2i16_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv2i32_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv2i64_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv2i32_to_i16 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv2i64_to_i16 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv2i64_to_i32 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv4i16_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv4i32_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_nxv4i64_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv4i32_to_i16 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_nxv4i64_to_i16 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_nxv4i64_to_i32 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %trunc_nxv8i16_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_nxv8i32_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_nxv8i64_to_i8 = trunc undef to -; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %trunc_nxv8i32_to_i16 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_nxv8i64_to_i16 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %trunc_nxv16i16_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %trunc_nxv16i32_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %trunc_nxv16i64_to_i8 = trunc undef to -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i8_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i16_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i32_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv2i64_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i8_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i16_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i32_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv4i64_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i8_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i16_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i32_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:11 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i1 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i16_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i32_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i32_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv2i64_to_i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i16_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i32_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv4i32_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv4i64_to_i32 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 0 for: %trunc_nxv8i16_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv8i32_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv8i32_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 
CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv8i64_to_i16 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of 1 for: %trunc_nxv16i16_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv16i32_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %trunc_nxv16i64_to_i8 = trunc undef to +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %trunc_nxv2i8_to_i1 = trunc undef to %trunc_nxv2i16_to_i1 = trunc undef to diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-vscale.ll b/llvm/test/Analysis/CostModel/AArch64/sve-vscale.ll index d85546166625e..ea1ec36918b59 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-vscale.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-vscale.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes="print" 2>&1 -disable-output < %s | FileCheck %s +; RUN: opt -mtriple=aarch64-linux-gnu -mattr=+sve -passes="print" -cost-kind=all 2>&1 -disable-output < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define i32 @vscale32() { ; CHECK-LABEL: 'vscale32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = call i32 @llvm.vscale.i32() -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %c +; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = call i32 @llvm.vscale.i32() +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %c ; %c = call i32 @llvm.vscale.i32() ret i32 %c @@ -14,8 +14,8 @@ define i32 @vscale32() { define i64 @vscale64() { ; CHECK-LABEL: 'vscale64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c = call i64 @llvm.vscale.i64() -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %c +; CHECK-NEXT: Cost Model: Found costs of 1 for: %c = call i64 @llvm.vscale.i64() +; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %c ; %c = call i64 @llvm.vscale.i64() ret i64 %c From b46c60272ef4163d3174cfd0fb48ab1cf63b4f62 Mon Sep 17 00:00:00 2001 From: Sandeep Dasgupta Date: Mon, 31 Mar 2025 11:50:32 -0700 Subject: [PATCH 0140/1029] Remove unused function from StripFuncQuantTypes (#121594) `StripFuncQuantTypes::isLegalType` is unused and can be safely removed. --- mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp index de834fed90e42..4009faa21576d 100644 --- a/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp +++ b/mlir/lib/Dialect/Quant/Transforms/StripFuncQuantTypes.cpp @@ -65,14 +65,6 @@ class QuantizedTypeConverter : public TypeConverter { class StripFuncQuantTypes : public impl::StripFuncQuantTypesBase { - // Return whether a type is considered legal when occurring in the header of - // a function or as an operand to a 'return' op. 
- static bool isLegalType(Type type) { - if (auto tensorType = dyn_cast(type)) - return isLegalType(tensorType.getElementType()); - return !isa(type); - } - public: void runOnOperation() override { From 8e390dedd71d0c2bcbe8775aee2e234ef7a5b787 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 31 Mar 2025 20:11:00 +0100 Subject: [PATCH 0141/1029] [EquivalenceClasses] Replace findValue with contains (NFC). Replace remaining use of findValue with more compact and limited contains(). --- llvm/include/llvm/ADT/EquivalenceClasses.h | 7 +++---- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index c375d6e77b12a..f9c7819f18806 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -179,10 +179,9 @@ class EquivalenceClasses { return member_iterator(nullptr); } - /// findValue - Return an iterator to the specified value. If it does not - /// exist, end() is returned. - iterator findValue(const ElemTy &V) const { - return TheMapping.find(V); + /// Returns true if \p V is contained an equivalence class. + bool contains(const ElemTy &V) const { + return TheMapping.find(V) != TheMapping.end(); } /// getLeaderValue - Return the leader for the specified value that is in the diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e7d6984caeba3..47ff31b9a0525 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1230,7 +1230,7 @@ bool AccessAnalysis::canCheckPtrAtRT( [this](const Value *Ptr) { MemAccessInfo AccessWrite(const_cast(Ptr), true); - return DepCands.findValue(AccessWrite) == DepCands.end(); + return !DepCands.contains(AccessWrite); })) && "Can only skip updating CanDoRT below, if all entries in AS " "are reads or there is at most 1 entry"); From 5a3079421ba7645b8c604709de397c6091611f2a Mon Sep 17 00:00:00 2001 From: AnastasiyaChernikova Date: Mon, 31 Mar 2025 22:13:46 +0300 Subject: [PATCH 0142/1029] Add RISC-V support information to readme (#132699) --- llvm/docs/CommandGuide/llvm-exegesis.rst | 9 +++++---- llvm/tools/llvm-exegesis/README.md | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst index 8266d891a5e6b..821d5f7e8305f 100644 --- a/llvm/docs/CommandGuide/llvm-exegesis.rst +++ b/llvm/docs/CommandGuide/llvm-exegesis.rst @@ -34,10 +34,11 @@ SUPPORTED PLATFORMS ------------------- :program:`llvm-exegesis` currently only supports X86 (64-bit only), ARM -(AArch64 only, snippet generation is sparse), MIPS, and PowerPC (PowerPC64LE -only) on Linux for benchmarking. Not all benchmarking functionality is -guaranteed to work on every platform. :program:`llvm-exegesis` also has a -separate analysis mode that is supported on every platform that LLVM is. +(AArch64 only, snippet generation is sparse), MIPS, PowerPC (PowerPC64LE +only) and RISC-V (RV64I/E and RV32I/E) on Linux for benchmarking. Not all +benchmarking functionality is guaranteed to work on every platform. +:program:`llvm-exegesis` also has a separate analysis mode that is supported on +every platform that LLVM is. 
To enable benchmarking in llvm-exegesis, LLVM must be configured and built with `LLVM_ENABLE_LIBPFM` enabled, as :program:`llvm-exegesis` depends on libpfm4 diff --git a/llvm/tools/llvm-exegesis/README.md b/llvm/tools/llvm-exegesis/README.md index deb0f230f032f..aabc4de1f3ab4 100644 --- a/llvm/tools/llvm-exegesis/README.md +++ b/llvm/tools/llvm-exegesis/README.md @@ -32,6 +32,9 @@ architectures: e.g. pseudo instructions and most register classes are not supported. * MIPS * PowerPC (PowerPC64LE only) +* RISC-V + * RV64I/E, RV32I/E and extensions supported by LLVM's RISC-V backend with + some limitations. Note that not all benchmarking functionality is guaranteed to work on all platforms. From 89c25c54f96623e0e973b678a24f572fd204e132 Mon Sep 17 00:00:00 2001 From: Andrei Safronov Date: Mon, 31 Mar 2025 22:24:59 +0300 Subject: [PATCH 0143/1029] [Xtensa] Implement windowed register call ABI. (#130001) Implement the base windowed register call ABI. By default, use a rotation window of 8 registers. --- llvm/lib/Target/Xtensa/XtensaCallingConv.td | 20 +- .../lib/Target/Xtensa/XtensaFrameLowering.cpp | 236 +++++++++++++----- llvm/lib/Target/Xtensa/XtensaFrameLowering.h | 1 + llvm/lib/Target/Xtensa/XtensaISelLowering.cpp | 77 +++++- llvm/lib/Target/Xtensa/XtensaISelLowering.h | 5 + llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp | 18 +- llvm/lib/Target/Xtensa/XtensaInstrInfo.h | 4 +- llvm/lib/Target/Xtensa/XtensaInstrInfo.td | 22 +- .../Target/Xtensa/XtensaMachineFunctionInfo.h | 4 + llvm/lib/Target/Xtensa/XtensaOperators.td | 11 + llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp | 12 +- llvm/lib/Target/Xtensa/XtensaSubtarget.h | 12 +- llvm/test/CodeGen/Xtensa/aligned_alloc.ll | 32 +++ .../CodeGen/Xtensa/calling-conv-windowed.ll | 103 ++++++++ llvm/test/CodeGen/Xtensa/callw.ll | 52 ++++ llvm/test/CodeGen/Xtensa/saverestore.ll | 84 +++++-- 16 files changed, 565 insertions(+), 128 deletions(-) create mode 100644 llvm/test/CodeGen/Xtensa/aligned_alloc.ll create mode 100644 llvm/test/CodeGen/Xtensa/calling-conv-windowed.ll create mode 100644 llvm/test/CodeGen/Xtensa/callw.ll diff --git a/llvm/lib/Target/Xtensa/XtensaCallingConv.td b/llvm/lib/Target/Xtensa/XtensaCallingConv.td index a348b4c890b22..2c48f8f86cafb 100644 --- a/llvm/lib/Target/Xtensa/XtensaCallingConv.td +++ b/llvm/lib/Target/Xtensa/XtensaCallingConv.td @@ -9,16 +9,30 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Xtensa return value calling convention +// Xtensa base calling convention //===----------------------------------------------------------------------===// +// Xtensa return value def RetCC_Xtensa : CallingConv<[ // First two return values go in a2, a3, a4, a5 CCIfType<[i32], CCAssignToReg<[A2, A3, A4, A5]>>, CCIfType<[i64], CCAssignToRegWithShadow<[A2, A4], [A3, A5]>> ]>; +// Callee-saved register lists +def CSR_Xtensa : CalleeSavedRegs<(add A0, A12, A13, A14, A15)>; + //===----------------------------------------------------------------------===// -// Callee-saved register lists. +// Xtensa windowed calling convention. Currently by default implemented +// rotation window by 8 registers.
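+// For example, after a CALL8 the callee's a2-a7 alias the caller's a10-a15,
+// so a value the callee returns in its a2 appears to the caller in a10;
+// RetCCW8_Xtensa below encodes this caller-side view of the return registers.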
//===----------------------------------------------------------------------===// +// Xtensa return value for 8 registers window +def RetCCW8_Xtensa : CallingConv<[ + //First two return values go in a10, a11, a12, a13 + CCIfType<[i32], CCAssignToReg<[A10, A11, A12, A13]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[A10, A12], [A11, A13]>> +]>; -def CSR_Xtensa : CalleeSavedRegs<(add A0, A12, A13, A14, A15)>; +// Callee-saved register lists for rotation window by 8 registers +def CSRW8_Xtensa : CalleeSavedRegs<(add)> { + let OtherPreserved = (add A0, SP, A2, A3, A4, A5, A6, A7); +} diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp index d09aac613f623..cf9a2a052978d 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp @@ -23,10 +23,16 @@ using namespace llvm; +// Minimum frame = reg save area (4 words) plus static chain (1 word) +// and the total number of words must be a multiple of 128 bits. +// Width of a word, in units (bytes). +#define UNITS_PER_WORD 4 +#define MIN_FRAME_SIZE (8 * UNITS_PER_WORD) + XtensaFrameLowering::XtensaFrameLowering(const XtensaSubtarget &STI) : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(4), 0, Align(4)), - TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {} + STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {} bool XtensaFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -43,6 +49,7 @@ void XtensaFrameLowering::emitPrologue(MachineFunction &MF, MCRegister SP = Xtensa::SP; MCRegister FP = TRI->getFrameRegister(MF); const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo(); + XtensaMachineFunctionInfo *XtensaFI = MF.getInfo(); // First, compute final stack size. uint64_t StackSize = MFI.getStackSize(); @@ -51,76 +58,153 @@ void XtensaFrameLowering::emitPrologue(MachineFunction &MF, // Round up StackSize to 16*N StackSize += (16 - StackSize) & 0xf; - // No need to allocate space on the stack. - if (StackSize == 0 && !MFI.adjustsStack()) - return; - - // Adjust stack. - TII.adjustStackPtr(SP, -StackSize, MBB, MBBI); - - // emit ".cfi_def_cfa_offset StackSize" - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize)); - BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - const std::vector &CSI = MFI.getCalleeSavedInfo(); - - if (!CSI.empty()) { - // Find the instruction past the last instruction that saves a - // callee-saved register to the stack. The callee-saved store - // instructions are placed at the begin of basic block, so - // iterate over instruction sequence and check that - // save instructions are placed correctly. - for (unsigned i = 0, e = CSI.size(); i < e; ++i) { -#ifndef NDEBUG - const CalleeSavedInfo &Info = CSI[i]; - int FI = Info.getFrameIdx(); - int StoreFI = 0; + if (STI.isWindowedABI()) { + StackSize += 32; + uint64_t MaxAlignment = MFI.getMaxAlign().value(); + if (MaxAlignment > 32) + StackSize += MaxAlignment; + + if (StackSize <= 32760) { + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::ENTRY)) + .addReg(SP) + .addImm(StackSize); + } else { + // Use a8 as a temporary since a0-a7 may be live. 
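+      // (In the windowed ABI the incoming a0 holds the return address, a1 is
+      // the stack pointer, and a2-a7 may carry arguments, so a8 is the first
+      // register that can safely be clobbered here.)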
+ MCRegister TmpReg = Xtensa::A8; + + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::ENTRY)) + .addReg(SP) + .addImm(MIN_FRAME_SIZE); + TII.loadImmediate(MBB, MBBI, &TmpReg, StackSize - MIN_FRAME_SIZE); + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::SUB), TmpReg) + .addReg(SP) + .addReg(TmpReg); + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::MOVSP), SP).addReg(TmpReg); + } - // Checking that the instruction is exactly as expected - bool IsStoreInst = false; - if (MBBI->getOpcode() == TargetOpcode::COPY && Info.isSpilledToReg()) { - Register DstReg = MBBI->getOperand(0).getReg(); - Register Reg = MBBI->getOperand(1).getReg(); - IsStoreInst = Info.getDstReg() == DstReg.asMCReg() && - Info.getReg() == Reg.asMCReg(); - } else { - Register Reg = TII.isStoreToStackSlot(*MBBI, StoreFI); - IsStoreInst = Reg.asMCReg() == Info.getReg() && StoreFI == FI; - } - assert(IsStoreInst && - "Unexpected callee-saved register store instruction"); -#endif - ++MBBI; + // Calculate how much is needed to have the correct alignment. + // Change offset to: alignment + difference. + // For example, in case of alignment of 128: + // diff_to_128_aligned_address = (128 - (SP & 127)) + // new_offset = SP + diff_to_128_aligned_address + // This is safe to do because we increased the stack size by MaxAlignment. + MCRegister Reg, RegMisAlign; + if (MaxAlignment > 32) { + TII.loadImmediate(MBB, MBBI, &RegMisAlign, MaxAlignment - 1); + TII.loadImmediate(MBB, MBBI, &Reg, MaxAlignment); + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::AND)) + .addReg(RegMisAlign, RegState::Define) + .addReg(FP) + .addReg(RegMisAlign); + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::SUB), RegMisAlign) + .addReg(Reg) + .addReg(RegMisAlign); + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::ADD), SP) + .addReg(SP) + .addReg(RegMisAlign, RegState::Kill); } - // Iterate over list of callee-saved registers and emit .cfi_offset - // directives. - for (const auto &I : CSI) { - int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); - MCRegister Reg = I.getReg(); + // Store FP register in A8, because FP may be used to pass function + // arguments + if (XtensaFI->isSaveFrameRegister()) { + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::OR), Xtensa::A8) + .addReg(FP) + .addReg(FP); + } - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( - nullptr, MRI->getDwarfRegNum(Reg, 1), Offset)); + // if framepointer enabled, set it to point to the stack pointer. + if (hasFP(MF)) { + // Insert instruction "move $fp, $sp" at this location. + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::OR), FP) + .addReg(SP) + .addReg(SP) + .setMIFlag(MachineInstr::FrameSetup); + + MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa( + nullptr, MRI->getDwarfRegNum(FP, true), StackSize); + unsigned CFIIndex = MF.addFrameInst(Inst); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else { + // emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } - } + } else { + // No need to allocate space on the stack. + if (StackSize == 0 && !MFI.adjustsStack()) + return; - // if framepointer enabled, set it to point to the stack pointer. - if (hasFP(MF)) { - // Insert instruction "move $fp, $sp" at this location. 
- BuildMI(MBB, MBBI, DL, TII.get(Xtensa::OR), FP) - .addReg(SP) - .addReg(SP) - .setMIFlag(MachineInstr::FrameSetup); - - // emit ".cfi_def_cfa_register $fp" - unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( - nullptr, MRI->getDwarfRegNum(FP, true))); + // Adjust stack. + TII.adjustStackPtr(SP, -StackSize, MBB, MBBI); + + // emit ".cfi_def_cfa_offset StackSize" + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + + const std::vector &CSI = MFI.getCalleeSavedInfo(); + + if (!CSI.empty()) { + // Find the instruction past the last instruction that saves a + // callee-saved register to the stack. The callee-saved store + // instructions are placed at the begin of basic block, so + // iterate over instruction sequence and check that + // save instructions are placed correctly. + for (unsigned i = 0, e = CSI.size(); i < e; ++i) { +#ifndef NDEBUG + const CalleeSavedInfo &Info = CSI[i]; + int FI = Info.getFrameIdx(); + int StoreFI = 0; + + // Checking that the instruction is exactly as expected + bool IsStoreInst = false; + if (MBBI->getOpcode() == TargetOpcode::COPY && Info.isSpilledToReg()) { + Register DstReg = MBBI->getOperand(0).getReg(); + Register Reg = MBBI->getOperand(1).getReg(); + IsStoreInst = Info.getDstReg() == DstReg.asMCReg() && + Info.getReg() == Reg.asMCReg(); + } else { + Register Reg = TII.isStoreToStackSlot(*MBBI, StoreFI); + IsStoreInst = Reg.asMCReg() == Info.getReg() && StoreFI == FI; + } + assert(IsStoreInst && + "Unexpected callee-saved register store instruction"); +#endif + ++MBBI; + } + + // Iterate over list of callee-saved registers and emit .cfi_offset + // directives. + for (const auto &I : CSI) { + int64_t Offset = MFI.getObjectOffset(I.getFrameIdx()); + MCRegister Reg = I.getReg(); + + unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset( + nullptr, MRI->getDwarfRegNum(Reg, 1), Offset)); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + } + + // if framepointer enabled, set it to point to the stack pointer. + if (hasFP(MF)) { + // Insert instruction "move $fp, $sp" at this location. + BuildMI(MBB, MBBI, DL, TII.get(Xtensa::OR), FP) + .addReg(SP) + .addReg(SP) + .setMIFlag(MachineInstr::FrameSetup); + + // emit ".cfi_def_cfa_register $fp" + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createDefCfaRegister( + nullptr, MRI->getDwarfRegNum(FP, true))); + BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } } if (StackSize != PrevStackSize) { @@ -179,10 +263,22 @@ void XtensaFrameLowering::emitEpilogue(MachineFunction &MF, "Unexpected callee-saved register restore instruction"); #endif } - - BuildMI(MBB, I, DL, TII.get(Xtensa::OR), SP).addReg(FP).addReg(FP); + if (STI.isWindowedABI()) { + // In most architectures, we need to explicitly restore the stack pointer + // before returning. + // + // For Xtensa Windowed Register option, it is not needed to explicitly + // restore the stack pointer. Reason being is that on function return, + // the window of the caller (including the old stack pointer) gets + // restored anyways. 
+ } else { + BuildMI(MBB, I, DL, TII.get(Xtensa::OR), SP).addReg(FP).addReg(FP); + } } + if (STI.isWindowedABI()) + return; + // Get the number of bytes from FrameInfo uint64_t StackSize = MFI.getStackSize(); @@ -199,6 +295,9 @@ bool XtensaFrameLowering::spillCalleeSavedRegisters( MachineFunction *MF = MBB.getParent(); MachineBasicBlock &EntryBlock = *(MF->begin()); + if (STI.isWindowedABI()) + return true; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { // Add the callee-saved register as live-in. Do not add if the register is // A0 and return address is taken, because it will be implemented in @@ -224,6 +323,8 @@ bool XtensaFrameLowering::spillCalleeSavedRegisters( bool XtensaFrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { + if (STI.isWindowedABI()) + return true; return TargetFrameLowering::restoreCalleeSavedRegisters(MBB, MI, CSI, TRI); } @@ -231,9 +332,6 @@ bool XtensaFrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock::iterator XtensaFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - const XtensaInstrInfo &TII = - *static_cast(MF.getSubtarget().getInstrInfo()); - if (!hasReservedCallFrame(MF)) { int64_t Amount = I->getOperand(0).getImm(); @@ -249,7 +347,11 @@ MachineBasicBlock::iterator XtensaFrameLowering::eliminateCallFramePseudoInstr( void XtensaFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const { - unsigned FP = TRI->getFrameRegister(MF); + MCRegister FP = TRI->getFrameRegister(MF); + + if (STI.isWindowedABI()) { + return; + } TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.h b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h index 3f946e1ea730f..f0095b8774154 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.h +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h @@ -18,6 +18,7 @@ class XtensaInstrInfo; class XtensaRegisterInfo; class XtensaFrameLowering : public TargetFrameLowering { + const XtensaSubtarget &STI; const XtensaInstrInfo &TII; const XtensaRegisterInfo *TRI; diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp index 57f0cbbc36c24..d4ee2ca72ad38 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp @@ -42,6 +42,15 @@ static bool isLongCall(const char *str) { return true; } +// The calling conventions in XtensaCallingConv.td are described in terms of the +// callee's register window. This function translates registers to the +// corresponding caller window register.
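+// For example, after a CALL8 the callee's A2 is the caller's A10, so the
+// helper below maps A2-A7 to A10-A15 and leaves all other registers
+// unchanged.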
+static unsigned toCallerWindow(unsigned Reg) { + if (Reg >= Xtensa::A2 && Reg <= Xtensa::A7) + return Reg - Xtensa::A2 + Xtensa::A10; + return Reg; +} + XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM, const XtensaSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -339,7 +348,18 @@ SDValue XtensaTargetLowering::LowerFormalArguments( // Transform the arguments stored on // physical registers into virtual ones - Register Reg = MF.addLiveIn(VA.getLocReg(), &Xtensa::ARRegClass); + Register Reg = 0; + MCRegister FrameReg = Subtarget.getRegisterInfo()->getFrameRegister(MF); + + // Argument passed in FrameReg in Windowed ABI we save in A8 (in + // emitPrologue), so load argument from A8 + if (Subtarget.isWindowedABI() && (VA.getLocReg() == FrameReg)) { + Reg = MF.addLiveIn(Xtensa::A8, &Xtensa::ARRegClass); + XtensaFI->setSaveFrameRegister(); + } else { + Reg = MF.addLiveIn(VA.getLocReg(), &Xtensa::ARRegClass); + } + SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8 or 16-bit value, it has been passed promoted @@ -538,6 +558,8 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Glue; for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) { unsigned Reg = RegsToPass[I].first; + if (Subtarget.isWindowedABI()) + Reg = toCallerWindow(Reg); Chain = DAG.getCopyToReg(Chain, DL, Reg, RegsToPass[I].second, Glue); Glue = Chain.getValue(1); } @@ -587,6 +609,8 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI, // known live into the call. for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) { unsigned Reg = RegsToPass[I].first; + if (Subtarget.isWindowedABI()) + Reg = toCallerWindow(Reg); Ops.push_back(DAG.getRegister(Reg, RegsToPass[I].second.getValueType())); } @@ -595,7 +619,9 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(Glue); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(XtensaISD::CALL, DL, NodeTys, Ops); + Chain = DAG.getNode(Subtarget.isWindowedABI() ? XtensaISD::CALLW8 + : XtensaISD::CALL, + DL, NodeTys, Ops); Glue = Chain.getValue(1); // Mark the end of the call, which is glued to the call itself. @@ -606,7 +632,8 @@ XtensaTargetLowering::LowerCall(CallLoweringInfo &CLI, // Assign locations to each value returned by this call. SmallVector RetLocs; CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext()); - RetCCInfo.AnalyzeCallResult(Ins, RetCC_Xtensa); + RetCCInfo.AnalyzeCallResult(Ins, Subtarget.isWindowedABI() ? RetCCW8_Xtensa + : RetCC_Xtensa); // Copy all of the result registers out of their specified physreg. for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) { @@ -648,7 +675,9 @@ XtensaTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, SDValue Glue; // Quick exit for void returns if (RetLocs.empty()) - return DAG.getNode(XtensaISD::RET, DL, MVT::Other, Chain); + return DAG.getNode(Subtarget.isWindowedABI() ? XtensaISD::RETW + : XtensaISD::RET, + DL, MVT::Other, Chain); // Copy the result values into the output registers. SmallVector RetOps; @@ -672,7 +701,9 @@ XtensaTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (Glue.getNode()) RetOps.push_back(Glue); - return DAG.getNode(XtensaISD::RET, DL, MVT::Other, RetOps); + return DAG.getNode(Subtarget.isWindowedABI() ? 
XtensaISD::RETW + : XtensaISD::RET, + DL, MVT::Other, RetOps); } static unsigned getBranchOpcode(ISD::CondCode Cond) { @@ -864,8 +895,14 @@ SDValue XtensaTargetLowering::LowerSTACKSAVE(SDValue Op, SDValue XtensaTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const { - return DAG.getCopyToReg(Op.getOperand(0), SDLoc(Op), Xtensa::SP, - Op.getOperand(1)); + SDValue Chain = Op.getOperand(0); + SDValue NewSP = Op.getOperand(1); + + if (Subtarget.isWindowedABI()) { + return DAG.getNode(XtensaISD::MOVSP, SDLoc(Op), MVT::Other, Chain, NewSP); + } + + return DAG.getCopyToReg(Chain, SDLoc(Op), Xtensa::SP, NewSP); } SDValue XtensaTargetLowering::LowerFRAMEADDR(SDValue Op, @@ -884,7 +921,7 @@ SDValue XtensaTargetLowering::LowerFRAMEADDR(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); - Register FrameRegister = Subtarget.getRegisterInfo()->getFrameRegister(MF); + MCRegister FrameRegister = Subtarget.getRegisterInfo()->getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameRegister, VT); return FrameAddr; @@ -903,10 +940,15 @@ SDValue XtensaTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue SizeRoundUp = DAG.getNode(ISD::AND, DL, VT, SizeTmp, DAG.getSignedConstant(~31, DL, MVT::i32)); - unsigned SPReg = Xtensa::SP; + MCRegister SPReg = Xtensa::SP; SDValue SP = DAG.getCopyFromReg(Chain, DL, SPReg, VT); SDValue NewSP = DAG.getNode(ISD::SUB, DL, VT, SP, SizeRoundUp); // Value - Chain = DAG.getCopyToReg(SP.getValue(1), DL, SPReg, NewSP); // Output chain + if (Subtarget.isWindowedABI()) { + Chain = DAG.getNode(XtensaISD::MOVSP, SDLoc(Op), MVT::Other, SP.getValue(1), + NewSP); + } else { + Chain = DAG.getCopyToReg(SP.getValue(1), DL, SPReg, NewSP); // Output chain + } SDValue NewVal = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i32); Chain = NewVal.getValue(1); @@ -1230,12 +1272,18 @@ const char *XtensaTargetLowering::getTargetNodeName(unsigned Opcode) const { return "XtensaISD::BR_JT"; case XtensaISD::CALL: return "XtensaISD::CALL"; + case XtensaISD::CALLW8: + return "XtensaISD::CALLW8"; case XtensaISD::EXTUI: return "XtensaISD::EXTUI"; + case XtensaISD::MOVSP: + return "XtensaISD::MOVSP"; case XtensaISD::PCREL_WRAPPER: return "XtensaISD::PCREL_WRAPPER"; case XtensaISD::RET: return "XtensaISD::RET"; + case XtensaISD::RETW: + return "XtensaISD::RETW"; case XtensaISD::SELECT_CC: return "XtensaISD::SELECT_CC"; case XtensaISD::SRCL: @@ -1339,6 +1387,15 @@ MachineBasicBlock *XtensaTargetLowering::EmitInstrWithCustomInserter( } return MBB; } + case Xtensa::MOVSP_P: { + MachineOperand &NewSP = MI.getOperand(0); + + BuildMI(*MBB, MI, DL, TII.get(Xtensa::MOVSP), Xtensa::SP) + .addReg(NewSP.getReg()); + MI.eraseFromParent(); + + return MBB; + } default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.h b/llvm/lib/Target/Xtensa/XtensaISelLowering.h index a959299d8ca6a..c7d4f41b1f08e 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelLowering.h +++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.h @@ -29,16 +29,21 @@ enum { // is the target address. The arguments start at operand 2. // There is an optional glue operand at the end. CALL, + // Call with rotation window by 8 registers + CALLW8, // Extract unsigned immediate. Operand 0 is value, operand 1 // is bit position of the field [0..31], operand 2 is bit size // of the field [1..16] EXTUI, + MOVSP, + // Wraps a TargetGlobalAddress that should be loaded using PC-relative // accesses. Operand 0 is the address. 
PCREL_WRAPPER, RET, + RETW, // Select with condition operator - This selects between a true value and // a false value (ops #2 and #3) based on the boolean result of comparing diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index da2883e1902ca..005532b864c41 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -76,7 +76,7 @@ Register XtensaInstrInfo::isStoreToStackSlot(const MachineInstr &MI, } /// Adjust SP by Amount bytes. -void XtensaInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, +void XtensaInstrInfo::adjustStackPtr(MCRegister SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); @@ -88,21 +88,25 @@ void XtensaInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, const TargetRegisterClass *RC = &Xtensa::ARRegClass; // create virtual reg to store immediate - unsigned Reg = RegInfo.createVirtualRegister(RC); + MCRegister Reg = RegInfo.createVirtualRegister(RC); if (isInt<8>(Amount)) { // addi sp, sp, amount BuildMI(MBB, I, DL, get(Xtensa::ADDI), Reg).addReg(SP).addImm(Amount); } else { // Expand immediate that doesn't fit in 8-bit. - unsigned Reg1; + MCRegister Reg1; loadImmediate(MBB, I, &Reg1, Amount); BuildMI(MBB, I, DL, get(Xtensa::ADD), Reg) .addReg(SP) .addReg(Reg1, RegState::Kill); } - BuildMI(MBB, I, DL, get(Xtensa::OR), SP) - .addReg(Reg, RegState::Kill) - .addReg(Reg, RegState::Kill); + if (STI.isWindowedABI()) { + BuildMI(MBB, I, DL, get(Xtensa::MOVSP), SP).addReg(Reg, RegState::Kill); + } else { + BuildMI(MBB, I, DL, get(Xtensa::OR), SP) + .addReg(Reg, RegState::Kill) + .addReg(Reg, RegState::Kill); + } } void XtensaInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -156,7 +160,7 @@ void XtensaInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC, void XtensaInstrInfo::loadImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned *Reg, int64_t Value) const { + MCRegister *Reg, int64_t Value) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = &Xtensa::ARRegClass; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h index 9bb2114427146..1808cb36d8a9b 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.h +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.h @@ -35,7 +35,7 @@ class XtensaInstrInfo : public XtensaGenInstrInfo { public: XtensaInstrInfo(const XtensaSubtarget &STI); - void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, + void adjustStackPtr(MCRegister SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; unsigned getInstSizeInBytes(const MachineInstr &MI) const override; @@ -73,7 +73,7 @@ class XtensaInstrInfo : public XtensaGenInstrInfo { // Emit code before MBBI in MI to move immediate value Value into // physical register Reg. 
void loadImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned *Reg, int64_t Value) const; + MCRegister *Reg, int64_t Value) const; bool reverseBranchCondition(SmallVectorImpl &Cond) const override; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td index 1f397e3ecac35..2de19f62e14c5 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.td +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.td @@ -709,22 +709,40 @@ let isCall = 1, Defs = [A0] in { } } +// Windowed call patterns. Currently rotation +// window by 8 is implemented. +def : Pat<(Xtensa_callw8 (i32 tglobaladdr:$dst)), + (CALL8 tglobaladdr:$dst)>; +def : Pat<(Xtensa_callw8 (i32 texternalsym:$dst)), + (CALL8 texternalsym:$dst)>; +def : Pat<(Xtensa_callw8 AR:$dst), + (CALLX8 AR:$dst)>; + def MOVSP : RRR_Inst<0x00, 0x00, 0x00, (outs AR:$t), (ins AR:$s), "movsp\t$t, $s", []>, Requires<[HasWindowed]> { let r = 0x01; } +// Use this pseudo operation instead of getCopyToReg function to +// update SP register. +let usesCustomInserter = 1, Defs = [SP], Predicates = [HasWindowed] in { + def MOVSP_P : Pseudo<(outs), (ins AR:$s), + "!movsp_p\tsp, $s", [(Xtensa_movsp AR:$s)]>; +} + let isReturn = 1, isTerminator = 1, isBarrier = 1, Uses = [A0] in { def RETW_N : RRRN_Inst<0x0D, (outs), (ins), - "retw.n", []>, Requires<[HasWindowed, HasDensity]> { + "retw.n", [(Xtensa_retw)]>, + Requires<[HasWindowed, HasDensity]> { let r = 0x0F; let s = 0; let t = 1; } def RETW : CALLX_Inst<0x00, 0x00, 0x00, (outs), (ins), - "retw", []>, Requires<[HasWindowed]> { + "retw", [(Xtensa_retw)]>, + Requires<[HasWindowed]> { let m = 0x2; let n = 0x1; let s = 0; diff --git a/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h b/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h index c430562091ba7..bc051d9ca14fa 100644 --- a/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h +++ b/llvm/lib/Target/Xtensa/XtensaMachineFunctionInfo.h @@ -27,6 +27,7 @@ class XtensaMachineFunctionInfo : public MachineFunctionInfo { unsigned VarArgsFirstGPR; int VarArgsOnStackFrameIndex; int VarArgsInRegsFrameIndex; + bool SaveFrameRegister = false; public: explicit XtensaMachineFunctionInfo(const Function &F, @@ -50,6 +51,9 @@ class XtensaMachineFunctionInfo : public MachineFunctionInfo { // Get and set the frame index of the first stack vararg. 
int getVarArgsInRegsFrameIndex() const { return VarArgsInRegsFrameIndex; } void setVarArgsInRegsFrameIndex(int FI) { VarArgsInRegsFrameIndex = FI; } + + bool isSaveFrameRegister() const { return SaveFrameRegister; } + void setSaveFrameRegister() { SaveFrameRegister = true; } }; } // namespace llvm diff --git a/llvm/lib/Target/Xtensa/XtensaOperators.td b/llvm/lib/Target/Xtensa/XtensaOperators.td index 3dd73b44f336a..12b81fccec479 100644 --- a/llvm/lib/Target/Xtensa/XtensaOperators.td +++ b/llvm/lib/Target/Xtensa/XtensaOperators.td @@ -31,15 +31,23 @@ def SDT_XtensaSRC : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCi def SDT_XtensaEXTUI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def SDT_XtensaMOVSP : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; + //===----------------------------------------------------------------------===// // Node definitions //===----------------------------------------------------------------------===// def Xtensa_call: SDNode<"XtensaISD::CALL", SDT_XtensaCall, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; +def Xtensa_callw8: SDNode<"XtensaISD::CALLW8", SDT_XtensaCall, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; + def Xtensa_ret: SDNode<"XtensaISD::RET", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def Xtensa_retw: SDNode<"XtensaISD::RETW", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + def Xtensa_pcrel_wrapper: SDNode<"XtensaISD::PCREL_WRAPPER", SDT_XtensaWrapPtr, []>; def Xtensa_callseq_start: SDNode<"ISD::CALLSEQ_START", SDT_XtensaCallSeqStart, @@ -59,3 +67,6 @@ def Xtensa_srcl: SDNode<"XtensaISD::SRCL", SDT_XtensaSRC>; def Xtensa_srcr: SDNode<"XtensaISD::SRCR", SDT_XtensaSRC>; def Xtensa_extui: SDNode<"XtensaISD::EXTUI", SDT_XtensaEXTUI>; + +def Xtensa_movsp: SDNode<"XtensaISD::MOVSP", SDT_XtensaMOVSP, + [SDNPHasChain, SDNPSideEffect, SDNPInGlue]>; diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp index 4a8bafc540df0..74633050861c2 100644 --- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp @@ -34,13 +34,14 @@ XtensaRegisterInfo::XtensaRegisterInfo(const XtensaSubtarget &STI) const uint16_t * XtensaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - return CSR_Xtensa_SaveList; + return Subtarget.isWindowedABI() ? CSRW8_Xtensa_SaveList + : CSR_Xtensa_SaveList; } const uint32_t * XtensaRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const { - return CSR_Xtensa_RegMask; + return Subtarget.isWindowedABI() ? CSRW8_Xtensa_RegMask : CSR_Xtensa_RegMask; } BitVector XtensaRegisterInfo::getReservedRegs(const MachineFunction &MF) const { @@ -82,7 +83,7 @@ bool XtensaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // 4. Locations for eh data registers. // Everything else is referenced relative to whatever register // getFrameRegister() returns. 
- unsigned FrameReg; + MCRegister FrameReg; if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)) FrameReg = Xtensa::SP; else @@ -107,7 +108,7 @@ bool XtensaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = II->getDebugLoc(); unsigned ADD = Xtensa::ADD; - unsigned Reg; + MCRegister Reg; const XtensaInstrInfo &TII = *static_cast( MBB.getParent()->getSubtarget().getInstrInfo()); @@ -129,5 +130,6 @@ bool XtensaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, Register XtensaRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); - return TFI->hasFP(MF) ? Xtensa::A15 : Xtensa::SP; + return TFI->hasFP(MF) ? (Subtarget.isWindowedABI() ? Xtensa::A7 : Xtensa::A15) + : Xtensa::SP; } diff --git a/llvm/lib/Target/Xtensa/XtensaSubtarget.h b/llvm/lib/Target/Xtensa/XtensaSubtarget.h index 770f73905b337..05c0c07e93a96 100644 --- a/llvm/lib/Target/Xtensa/XtensaSubtarget.h +++ b/llvm/lib/Target/Xtensa/XtensaSubtarget.h @@ -30,17 +30,17 @@ class StringRef; class XtensaSubtarget : public XtensaGenSubtargetInfo { private: +// Bool members corresponding to the SubtargetFeatures defined in tablegen +#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ + bool ATTRIBUTE = DEFAULT; +#include "XtensaGenSubtargetInfo.inc" + const Triple &TargetTriple; XtensaInstrInfo InstrInfo; XtensaTargetLowering TLInfo; SelectionDAGTargetInfo TSInfo; XtensaFrameLowering FrameLowering; -// Bool members corresponding to the SubtargetFeatures defined in tablegen -#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \ - bool ATTRIBUTE = DEFAULT; -#include "XtensaGenSubtargetInfo.inc" - XtensaSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); public: @@ -72,6 +72,8 @@ class XtensaSubtarget : public XtensaGenSubtargetInfo { bool hasBoolean() const { return HasBoolean; } + bool isWindowedABI() const { return hasWindowed(); } + // Automatically generated by tblgen. 
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); }; diff --git a/llvm/test/CodeGen/Xtensa/aligned_alloc.ll b/llvm/test/CodeGen/Xtensa/aligned_alloc.ll new file mode 100644 index 0000000000000..ebb24d9272ddc --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/aligned_alloc.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=xtensa -O0 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=XTENSA + +define i8 @loadi8_128(i8 %a) { +; XTENSA-LABEL: loadi8_128: +; XTENSA: .cfi_startproc +; XTENSA-NEXT: # %bb.0: +; XTENSA-NEXT: addi a8, a1, -128 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: .cfi_def_cfa_offset 128 +; XTENSA-NEXT: s32i a0, a1, 124 # 4-byte Folded Spill +; XTENSA-NEXT: .cfi_offset a0, -4 +; XTENSA-NEXT: addi a2, a1, 0 +; XTENSA-NEXT: movi a3, 0 +; XTENSA-NEXT: movi a4, 64 +; XTENSA-NEXT: l32r a8, .LCPI0_0 +; XTENSA-NEXT: callx0 a8 +; XTENSA-NEXT: l8ui a2, a1, 0 +; XTENSA-NEXT: l32i a0, a1, 124 # 4-byte Folded Reload +; XTENSA-NEXT: movi a8, 128 +; XTENSA-NEXT: add a8, a1, a8 +; XTENSA-NEXT: or a1, a8, a8 +; XTENSA-NEXT: ret + %aligned = alloca i8, align 128 + call void @llvm.memset.p0.i64(ptr noundef nonnull align 64 dereferenceable(64) %aligned, i8 0, i64 64, i1 false) + %1 = load i8, ptr %aligned, align 128 + ret i8 %1 +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) diff --git a/llvm/test/CodeGen/Xtensa/calling-conv-windowed.ll b/llvm/test/CodeGen/Xtensa/calling-conv-windowed.ll new file mode 100644 index 0000000000000..2b6b018019c52 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/calling-conv-windowed.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=xtensa -O1 -mattr=+windowed -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=XTENSA + +; Check placement of first 6 arguments in registers and 7th argument on stack +define dso_local i32 @test1(i32 noundef %a, i32 noundef %b, i32 noundef %c, i32 noundef %d, i32 noundef %e, i32 noundef %f, ptr nocapture noundef readonly byval(i32) align 4 %p) { +; XTENSA-LABEL: test1: +; XTENSA: .cfi_startproc +; XTENSA-NEXT: # %bb.0: +; XTENSA-NEXT: entry a1, 32 +; XTENSA-NEXT: .cfi_def_cfa_offset 32 +; XTENSA-NEXT: add a8, a7, a2 +; XTENSA-NEXT: l32i a9, a1, 32 +; XTENSA-NEXT: add a2, a8, a9 +; XTENSA-NEXT: retw + %l = load i32, ptr %p, align 4 + %sum = add nsw i32 %f, %a + %1 = add nsw i32 %sum, %l + ret i32 %1 +} + +; Check placement of second i64 argument in registers +define dso_local i32 @test2(i32 noundef %a, i64 noundef %b, i32 noundef %c) { +; XTENSA-LABEL: test2: +; XTENSA: .cfi_startproc +; XTENSA-NEXT: # %bb.0: +; XTENSA-NEXT: entry a1, 32 +; XTENSA-NEXT: .cfi_def_cfa_offset 32 +; XTENSA-NEXT: add a8, a6, a2 +; XTENSA-NEXT: add a2, a8, a4 +; XTENSA-NEXT: retw + %tr = trunc i64 %b to i32 + %sum = add nsw i32 %c, %a + %1 = add nsw i32 %sum, %tr + ret i32 %1 +} + +; Check placement of first argument typeof i8 in register +define dso_local i32 @test3(i8 noundef signext %a, i64 noundef %b, i32 noundef %c) { +; XTENSA-LABEL: test3: +; XTENSA: .cfi_startproc +; XTENSA-NEXT: # %bb.0: +; XTENSA-NEXT: entry a1, 32 +; XTENSA-NEXT: .cfi_def_cfa_offset 32 +; XTENSA-NEXT: add a8, a2, a6 +; XTENSA-NEXT: add a2, a8, a4 +; XTENSA-NEXT: retw + %tr = trunc i64 %b to i32 + %se = sext i8 %a to i32 + %sum = add nsw i32 
%se, %c + %1 = add nsw i32 %sum, %tr + ret i32 %1 +} + +; Check placement of 4th argument typeof i64 on stack +define dso_local i32 @test4(i8 noundef signext %a, i64 noundef %b, i32 noundef %c, ptr nocapture noundef readonly byval(i64) align 8 %p) { +; XTENSA-LABEL: test4: +; XTENSA: .cfi_startproc +; XTENSA-NEXT: # %bb.0: +; XTENSA-NEXT: entry a1, 32 +; XTENSA-NEXT: .cfi_def_cfa_offset 32 +; XTENSA-NEXT: add a8, a2, a6 +; XTENSA-NEXT: add a8, a8, a4 +; XTENSA-NEXT: l32i a9, a1, 32 +; XTENSA-NEXT: add a2, a8, a9 +; XTENSA-NEXT: retw + %l = load i64, ptr %p, align 8 + %tr1 = trunc i64 %b to i32 + %tr2 = trunc i64 %l to i32 + %se = sext i8 %a to i32 + %sum1 = add nsw i32 %se, %c + %sum2 = add nsw i32 %sum1, %tr1 + %1 = add nsw i32 %sum2, %tr2 + ret i32 %1 +} + +; Check placement of 128 bit structure on registers +define dso_local i32 @test5([4 x i32] %a, i32 noundef %b) { +; XTENSA-LABEL: test5: +; XTENSA: .cfi_startproc +; XTENSA-NEXT: # %bb.0: +; XTENSA-NEXT: entry a1, 32 +; XTENSA-NEXT: .cfi_def_cfa_offset 32 +; XTENSA-NEXT: add a2, a2, a6 +; XTENSA-NEXT: retw + %ev = extractvalue [4 x i32] %a, 0 + %1 = add nsw i32 %ev, %b + ret i32 %1 +} + +; Check placement of 128 bit structure on stack +define dso_local i32 @test6(i32 noundef %a, [4 x i32] %b) { +; XTENSA-LABEL: test6: +; XTENSA: .cfi_startproc +; XTENSA-NEXT: # %bb.0: +; XTENSA-NEXT: entry a1, 32 +; XTENSA-NEXT: .cfi_def_cfa_offset 32 +; XTENSA-NEXT: add a2, a3, a2 +; XTENSA-NEXT: retw + %ev = extractvalue [4 x i32] %b, 0 + %1 = add nsw i32 %ev, %a + ret i32 %1 +} diff --git a/llvm/test/CodeGen/Xtensa/callw.ll b/llvm/test/CodeGen/Xtensa/callw.ll new file mode 100644 index 0000000000000..21549bcf22678 --- /dev/null +++ b/llvm/test/CodeGen/Xtensa/callw.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=xtensa -mattr=+windowed < %s | FileCheck %s + +declare i32 @external_function(i32) + +define i32 @test_call_external(i32 %a) nounwind { +; CHECK-LABEL: test_call_external: +; CHECK: # %bb.0: +; CHECK-NEXT: entry a1, 32 +; CHECK-NEXT: l32r a8, .LCPI0_0 +; CHECK-NEXT: or a10, a2, a2 +; CHECK-NEXT: callx8 a8 +; CHECK-NEXT: or a2, a10, a10 +; CHECK-NEXT: retw + %1 = call i32 @external_function(i32 %a) + ret i32 %1 +} + +define i32 @defined_function(i32 %a) nounwind { +; CHECK-LABEL: defined_function: +; CHECK: # %bb.0: +; CHECK-NEXT: entry a1, 32 +; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: retw + %1 = add i32 %a, 1 + ret i32 %1 +} + +define i32 @test_call_defined(i32 %a) nounwind { +; CHECK-LABEL: test_call_defined: +; CHECK: # %bb.0: +; CHECK-NEXT: entry a1, 32 +; CHECK-NEXT: l32r a8, .LCPI2_0 +; CHECK-NEXT: or a10, a2, a2 +; CHECK-NEXT: callx8 a8 +; CHECK-NEXT: or a2, a10, a10 +; CHECK-NEXT: retw + %1 = call i32 @defined_function(i32 %a) nounwind + ret i32 %1 +} + +define i32 @test_call_indirect(ptr %a, i32 %b) nounwind { +; CHECK-LABEL: test_call_indirect: +; CHECK: # %bb.0: +; CHECK-NEXT: entry a1, 32 +; CHECK-NEXT: or a10, a3, a3 +; CHECK-NEXT: callx8 a2 +; CHECK-NEXT: or a2, a10, a10 +; CHECK-NEXT: retw + %1 = call i32 %a(i32 %b) + ret i32 %1 +} diff --git a/llvm/test/CodeGen/Xtensa/saverestore.ll b/llvm/test/CodeGen/Xtensa/saverestore.ll index 69c8b16ab601d..82c0dab1a680a 100644 --- a/llvm/test/CodeGen/Xtensa/saverestore.ll +++ b/llvm/test/CodeGen/Xtensa/saverestore.ll @@ -1,4 +1,6 @@ -; RUN: llc --mtriple=xtensa < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc 
--mtriple=xtensa < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc --mtriple=xtensa --mattr=+windowed < %s | FileCheck -check-prefixes=CHECK-WINDOWED %s declare ptr @llvm.stacksave() @@ -7,32 +9,60 @@ declare void @llvm.stackrestore(ptr) declare void @use_addr(ptr) define void @test_saverestore(i64 %n) { -; CHECK: # %bb.0: -; CHECK-NEXT: addi a8, a1, -16 -; CHECK-NEXT: or a1, a8, a8 -; CHECK: s32i a0, a1, 8 -; CHECK-NEXT: s32i a12, a1, 4 -; CHECK-NEXT: s32i a15, a1, 0 -; CHECK: or a15, a1, a1 -; CHECK: addi a8, a2, 3 -; CHECK-NEXT: movi a9, -4 -; CHECK-NEXT: and a8, a8, a9 -; CHECK-NEXT: addi a8, a8, 31 -; CHECK-NEXT: movi a9, -32 -; CHECK-NEXT: and a8, a8, a9 -; CHECK-NEXT: or a12, a1, a1 -; CHECK-NEXT: sub a1, a1, a8 -; CHECK-NEXT: or a2, a1, a1 -; CHECK-NEXT: l32r a8, .LCPI0_0 -; CHECK-NEXT: callx0 a8 -; CHECK-NEXT: or a1, a12, a12 -; CHECK-NEXT: or a1, a15, a15 -; CHECK-NEXT: l32i a15, a1, 0 -; CHECK-NEXT: l32i a12, a1, 4 -; CHECK-NEXT: l32i a0, a1, 8 -; CHECK-NEXT: addi a8, a1, 16 -; CHECK-NEXT: or a1, a8, a8 -; CHECK-NEXT: ret +; CHECK-LABEL: test_saverestore: +; CHECK: .cfi_startproc +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: addi a8, a1, -16 +; CHECK-NEXT: or a1, a8, a8 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: s32i a0, a1, 8 # 4-byte Folded Spill +; CHECK-NEXT: s32i a12, a1, 4 # 4-byte Folded Spill +; CHECK-NEXT: s32i a15, a1, 0 # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset a0, -4 +; CHECK-NEXT: .cfi_offset a12, -8 +; CHECK-NEXT: .cfi_offset a15, -12 +; CHECK-NEXT: or a15, a1, a1 +; CHECK-NEXT: .cfi_def_cfa_register a15 +; CHECK-NEXT: addi a8, a2, 3 +; CHECK-NEXT: movi a9, -4 +; CHECK-NEXT: and a8, a8, a9 +; CHECK-NEXT: addi a8, a8, 31 +; CHECK-NEXT: movi a9, -32 +; CHECK-NEXT: and a8, a8, a9 +; CHECK-NEXT: or a12, a1, a1 +; CHECK-NEXT: sub a1, a1, a8 +; CHECK-NEXT: or a2, a1, a1 +; CHECK-NEXT: l32r a8, .LCPI0_0 +; CHECK-NEXT: callx0 a8 +; CHECK-NEXT: or a1, a12, a12 +; CHECK-NEXT: or a1, a15, a15 +; CHECK-NEXT: l32i a15, a1, 0 # 4-byte Folded Reload +; CHECK-NEXT: l32i a12, a1, 4 # 4-byte Folded Reload +; CHECK-NEXT: l32i a0, a1, 8 # 4-byte Folded Reload +; CHECK-NEXT: addi a8, a1, 16 +; CHECK-NEXT: or a1, a8, a8 +; CHECK-NEXT: ret +; +; CHECK-WINDOWED-LABEL: test_saverestore: +; CHECK-WINDOWED: .cfi_startproc +; CHECK-WINDOWED-NEXT: # %bb.0: +; CHECK-WINDOWED-NEXT: entry a1, 32 +; CHECK-WINDOWED-NEXT: or a7, a1, a1 +; CHECK-WINDOWED-NEXT: .cfi_def_cfa a7, 32 +; CHECK-WINDOWED-NEXT: addi a8, a2, 3 +; CHECK-WINDOWED-NEXT: movi a9, -4 +; CHECK-WINDOWED-NEXT: and a8, a8, a9 +; CHECK-WINDOWED-NEXT: addi a8, a8, 31 +; CHECK-WINDOWED-NEXT: movi a9, -32 +; CHECK-WINDOWED-NEXT: and a8, a8, a9 +; CHECK-WINDOWED-NEXT: or a6, a1, a1 +; CHECK-WINDOWED-NEXT: sub a8, a1, a8 +; CHECK-WINDOWED-NEXT: movsp a1, a8 +; CHECK-WINDOWED-NEXT: or a10, a1, a1 +; CHECK-WINDOWED-NEXT: l32r a8, .LCPI0_0 +; CHECK-WINDOWED-NEXT: callx8 a8 +; CHECK-WINDOWED-NEXT: movsp a1, a6 +; CHECK-WINDOWED-NEXT: retw %sp = call ptr @llvm.stacksave.p0() %addr = alloca i8, i64 %n From b01e5b23dd880e9686cc4151c7d1b1737cbdd98e Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Mon, 31 Mar 2025 12:26:25 -0700 Subject: [PATCH 0144/1029] [ctxprof][nfc] Refactor `__llvm_ctx_profile_start_context` (#133744) Most of the functionality will be reused with the auto-root detection mechanism (which is introduced subsequently in PR #133147). 
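For reviewers, the semantics being moved are easiest to see distilled. The sketch below is an annotated restatement of the helper added in this diff (names and calls are as in the change; the comments are editorial, not part of the patch). The key behavior: only one thread at a time collects under a given root, and contenders are diverted to the scratch context.

```
// Annotated restatement of the extracted helper (see the diff below).
ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
                                      uint32_t Counters, uint32_t Callsites) {
  IsUnderContext = true;
  // Every entry is counted, even entries that end up on the scratch context.
  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
                                __sanitizer::memory_order_relaxed);
  if (!Root->FirstMemBlock)
    setupContext(Root, Guid, Counters, Callsites); // first activation
  if (Root->Taken.TryLock()) {
    // This thread now owns the root: record into the real context tree.
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // Another thread is already collecting under this root; use scratch.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}
```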
--- .../lib/ctx_profile/CtxInstrProfiling.cpp | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp index da291e0bbabdd..10a6a8c1f71e5 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp @@ -277,6 +277,25 @@ ContextRoot *FunctionData::getOrAllocateContextRoot() { return Root; } +ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid, + uint32_t Counters, uint32_t Callsites) + SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + IsUnderContext = true; + __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1, + __sanitizer::memory_order_relaxed); + if (!Root->FirstMemBlock) { + setupContext(Root, Guid, Counters, Callsites); + } + if (Root->Taken.TryLock()) { + __llvm_ctx_profile_current_context_root = Root; + onContextEnter(*Root->FirstNode); + return Root->FirstNode; + } + // If this thread couldn't take the lock, return scratch context. + __llvm_ctx_profile_current_context_root = nullptr; + return TheScratchContext; +} + ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid, uint32_t NumCounters) { @@ -369,24 +388,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee, ContextNode *__llvm_ctx_profile_start_context( FunctionData *FData, GUID Guid, uint32_t Counters, uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { - IsUnderContext = true; - - auto *Root = FData->getOrAllocateContextRoot(); - - __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1, - __sanitizer::memory_order_relaxed); - - if (!Root->FirstMemBlock) { - setupContext(Root, Guid, Counters, Callsites); - } - if (Root->Taken.TryLock()) { - __llvm_ctx_profile_current_context_root = Root; - onContextEnter(*Root->FirstNode); - return Root->FirstNode; - } - // If this thread couldn't take the lock, return scratch context. - __llvm_ctx_profile_current_context_root = nullptr; - return TheScratchContext; + return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid, + Counters, Callsites); } void __llvm_ctx_profile_release_context(FunctionData *FData) From 616f447fc84bdc7655117f1b303d895dc3b93e4d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 31 Mar 2025 20:38:12 +0100 Subject: [PATCH 0145/1029] Revert "[EquivalenceClasses] Replace findValue with contains (NFC)." Breaks clang builds. This reverts commit 8e390dedd71d0c2bcbe8775aee2e234ef7a5b787. --- llvm/include/llvm/ADT/EquivalenceClasses.h | 7 ++++--- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index f9c7819f18806..c375d6e77b12a 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -179,9 +179,10 @@ class EquivalenceClasses { return member_iterator(nullptr); } - /// Returns true if \p V is contained an equivalence class. - bool contains(const ElemTy &V) const { - return TheMapping.find(V) != TheMapping.end(); + /// findValue - Return an iterator to the specified value. If it does not + /// exist, end() is returned. 
+ iterator findValue(const ElemTy &V) const { + return TheMapping.find(V); + } /// getLeaderValue - Return the leader for the specified value that is in the diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 47ff31b9a0525..e7d6984caeba3 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1230,7 +1230,7 @@ bool AccessAnalysis::canCheckPtrAtRT( [this](const Value *Ptr) { MemAccessInfo AccessWrite(const_cast<Value *>(Ptr), true); - return !DepCands.contains(AccessWrite); + return DepCands.findValue(AccessWrite) == DepCands.end(); })) && "Can only skip updating CanDoRT below, if all entries in AS " "are reads or there is at most 1 entry"); From 225f6ddb32f7ac56b7f66b47d99fdcb54f2843ca Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Mon, 31 Mar 2025 12:49:54 -0700 Subject: [PATCH 0146/1029] [ctxprof][nfc] Remove redundant `SANITIZER_NO_THREAD_SAFETY_ANALYSIS` (#133784) With the refactoring in PR #133744, `__llvm_ctx_profile_start_context` doesn't need to be marked `SANITIZER_NO_THREAD_SAFETY_ANALYSIS` because `tryStartContextGivenRoot` (where the bulk of the logic went) is. --- compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp index 10a6a8c1f71e5..e08d555c61ff7 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp @@ -385,9 +385,9 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee, return Ret; } -ContextNode *__llvm_ctx_profile_start_context( - FunctionData *FData, GUID Guid, uint32_t Counters, - uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { +ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid, + uint32_t Counters, + uint32_t Callsites) { return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid, Counters, Callsites); } void __llvm_ctx_profile_release_context(FunctionData *FData) From c63246645eeb3fddcf86b36e815c30e8b2af6d44 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 31 Mar 2025 14:58:06 -0500 Subject: [PATCH 0147/1029] [WebAssembly] Add a missing `break` statement (#133783) This fixes an issue introduced in #132430 where a `break;` statement was accidentally missing, causing unintended fall-through. --- llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 82d3b8e292e60..794db887bd073 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1109,6 +1109,7 @@ void WebAssemblyTargetLowering::computeKnownBitsForTargetNode( break; } } + break; } // For 128-bit addition if the upper bits are all zero then it's known that From b739a3cb651dd4af2f1a47fe1f0427fe2d9460ef Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 31 Mar 2025 21:01:28 +0100 Subject: [PATCH 0148/1029] [VPlan] Add m_Deferred. NFC (#133736) This copies over the implementation of m_Deferred which allows matching values that were bound in the pattern, and uses it for the (X && Y) || (X && !Y) -> X simplification.
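A hedged usage sketch: the m_Mul lines restate the doc comment added in the header below, and the final snippet matches the VPlanTransforms.cpp change in this patch.

```
// m_Specific(X) snapshots X when the pattern is constructed -- before
// m_VPValue(X) has bound anything -- so this does not do what it looks like:
//   match(&R, m_Mul(m_VPValue(X), m_Specific(X)));  // incorrect
// m_Deferred(X) stores a reference to X and compares at match time instead:
//   match(&R, m_Mul(m_VPValue(X), m_Deferred(X)));  // correct
// That is what lets the simplification drop its manual X == X1 / Y == Y1
// checks:
VPValue *X, *Y;
if (match(&R, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)),
                           m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y))))))
  R.getVPSingleValue()->replaceAllUsesWith(X); // (X && Y) || (X && !Y) -> X
```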
--- .../Transforms/Vectorize/VPlanPatternMatch.h | 18 ++++++++++++++++++ .../Transforms/Vectorize/VPlanTransforms.cpp | 5 ++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 3b45894ebfbce..2cd23efcf3eab 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -66,6 +66,24 @@ struct specificval_ty { inline specificval_ty m_Specific(const VPValue *VPV) { return VPV; } +/// Stores a reference to the VPValue *, not the VPValue * itself, +/// thus can be used in commutative matchers. +struct deferredval_ty { + VPValue *const &Val; + + deferredval_ty(VPValue *const &V) : Val(V) {} + + bool match(VPValue *const V) const { return V == Val; } +}; + +/// Like m_Specific(), but works if the specific value to match is determined +/// as part of the same match() expression. For example: +/// m_Mul(m_VPValue(X), m_Specific(X)) is incorrect, because m_Specific() will +/// bind X before the pattern match starts. +/// m_Mul(m_VPValue(X), m_Deferred(X)) is correct, and will check against +/// whichever value m_VPValue(X) populated. +inline deferredval_ty m_Deferred(VPValue *const &V) { return V; } + /// Match a specified integer value or vector of all elements of that /// value. \p BitWidth optionally specifies the bitwidth the matched constant /// must have. If it is 0, the matched constant can have any bitwidth. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3ebd844d6a5a1..b0aaf7870c542 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1053,11 +1053,10 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X // && (Y || Z) and (X || !X) into true. This requires queuing newly created // recipes to be visited during simplification. - VPValue *X, *Y, *X1, *Y1; + VPValue *X, *Y; if (match(&R, m_c_BinaryOr(m_LogicalAnd(m_VPValue(X), m_VPValue(Y)), - m_LogicalAnd(m_VPValue(X1), m_Not(m_VPValue(Y1))))) && - X == X1 && Y == Y1) { + m_LogicalAnd(m_Deferred(X), m_Not(m_Deferred(Y)))))) { R.getVPSingleValue()->replaceAllUsesWith(X); R.eraseFromParent(); return; From 0b31f08537746beff4d5e0df44221cbe5a9237c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 31 Mar 2025 13:17:21 -0700 Subject: [PATCH 0149/1029] [flang][cuda] Add support for NV_CUDAFOR_DEVICE_IS_MANAGED (#133778) Add support for the environment variable `NV_CUDAFOR_DEVICE_IS_MANAGED` as described in the documentation: https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#controlling-device-data-is-managed. This mainly switches device allocation to managed allocation.
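For readers skimming the diff, the net effect on allocation is small. A minimal sketch, simplified from the allocator.cpp and memory.cpp changes below (the CUDA_REPORT_IF_ERROR wrapping is elided here):

```
// With NV_CUDAFOR_DEVICE_IS_MANAGED=1 in the environment, "device"
// allocations become managed allocations; otherwise behavior is unchanged.
void *CUFAllocDevice(std::size_t sizeInBytes) {
  void *p;
  if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged)
    cudaMallocManaged(&p, sizeInBytes, cudaMemAttachGlobal);
  else
    cudaMalloc(&p, sizeInBytes);
  return p;
}
```

Only the exact values 0 and 1 are accepted by the parser; anything else is reported to stderr and ignored.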
--- flang-rt/include/flang-rt/runtime/environment.h | 1 + flang-rt/lib/cuda/allocator.cpp | 8 +++++++- flang-rt/lib/cuda/memory.cpp | 8 +++++++- flang-rt/lib/runtime/environment.cpp | 13 +++++++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h index 142add432b5f7..ca6c2a7d44484 100644 --- a/flang-rt/include/flang-rt/runtime/environment.h +++ b/flang-rt/include/flang-rt/runtime/environment.h @@ -59,6 +59,7 @@ struct ExecutionEnvironment { // CUDA related variables std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE + bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED }; RT_OFFLOAD_VAR_GROUP_BEGIN diff --git a/flang-rt/lib/cuda/allocator.cpp b/flang-rt/lib/cuda/allocator.cpp index 4199bf04b33f0..d6529957bc939 100644 --- a/flang-rt/lib/cuda/allocator.cpp +++ b/flang-rt/lib/cuda/allocator.cpp @@ -9,6 +9,7 @@ #include "flang/Runtime/CUDA/allocator.h" #include "flang-rt/runtime/allocator-registry.h" #include "flang-rt/runtime/derived.h" +#include "flang-rt/runtime/environment.h" #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/type-info.h" @@ -43,7 +44,12 @@ void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } void *CUFAllocDevice(std::size_t sizeInBytes) { void *p; - CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); + if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) { + CUDA_REPORT_IF_ERROR( + cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); + } else { + CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); + } return p; } diff --git a/flang-rt/lib/cuda/memory.cpp b/flang-rt/lib/cuda/memory.cpp index adc24ff223729..766f6847946cb 100644 --- a/flang-rt/lib/cuda/memory.cpp +++ b/flang-rt/lib/cuda/memory.cpp @@ -9,6 +9,7 @@ #include "flang/Runtime/CUDA/memory.h" #include "flang-rt/runtime/assign-impl.h" #include "flang-rt/runtime/descriptor.h" +#include "flang-rt/runtime/environment.h" #include "flang-rt/runtime/terminator.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" @@ -26,7 +27,12 @@ void *RTDEF(CUFMemAlloc)( void *ptr = nullptr; if (bytes != 0) { if (type == kMemTypeDevice) { - CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes)); + if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) { + CUDA_REPORT_IF_ERROR( + cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal)); + } else { + CUDA_REPORT_IF_ERROR(cudaMalloc((void **)&ptr, bytes)); + } } else if (type == kMemTypeManaged || type == kMemTypeUnified) { CUDA_REPORT_IF_ERROR( cudaMallocManaged((void **)&ptr, bytes, cudaMemAttachGlobal)); diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 15380ba148df5..cf2c65dd4fac0 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -155,6 +155,19 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], } } + if (auto *x{std::getenv("NV_CUDAFOR_DEVICE_IS_MANAGED")}) { + char *end; + auto n{std::strtol(x, &end, 10)}; + if (n >= 0 && n <= 1 && *end == '\0') { + cudaDeviceIsManaged = n != 0; + } else { + std::fprintf(stderr, + "Fortran runtime: NV_CUDAFOR_DEVICE_IS_MANAGED=%s is invalid; " + "ignored\n", + x); + } + } + // TODO: Set RP/ROUND='PROCESSOR_DEFINED' from environment } From 4e8fbc60710ebec25d9d456eccf28678e04415c9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 31 Mar 2025 21:23:35 +0100 Subject: [PATCH 
0150/1029] [LV] Add epilogue vectorization tests for FindLastIV reductions. Add missing test coverage for #126836. --- .../AArch64/epilog-iv-select-cmp.ll | 122 +++++++++++++++++ .../LoopVectorize/epilog-iv-select-cmp.ll | 125 ++++++++++++++++-- 2 files changed, 236 insertions(+), 11 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll new file mode 100644 index 0000000000000..25404964d8058 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macosx -S %s | FileCheck %s + +define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { +; CHECK-LABEL: define i8 @select_icmp_var_start( +; CHECK-SAME: ptr [[A:%.*]], i8 [[N:%.*]], i8 [[START:%.*]]) { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 32 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ splat (i8 -128), %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i8> [ splat (i8 -128), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i8> [[VEC_IND]], splat (i8 16) +; CHECK-NEXT: [[INDEX4:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[INDEX4]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD]], splat (i8 3) +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <16 x i8> [[WIDE_LOAD3]], splat (i8 3) +; CHECK-NEXT: [[TMP10]] = select <16 x i1> [[TMP17]], <16 x i8> [[VEC_IND]], <16 x i8> [[VEC_PHI]] +; CHECK-NEXT: [[TMP11]] = select <16 x i1> [[TMP23]], <16 x i8> [[STEP_ADD]], <16 x i8> [[VEC_PHI2]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[STEP_ADD]], splat (i8 16) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[RDX_MINMAX]]) +; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP13]], -128 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[START]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i8 -128, i8 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8 +; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[N_VEC5]] to i8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <8 x i8> poison, i8 [[TMP15]], i64 0 +; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <8 x i8> [[DOTSPLATINSERT10]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX6:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <8 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <8 x i8> [ [[DOTSPLAT11]], %[[VEC_EPILOG_PH]] ], [ [[TMP20:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = trunc i32 [[INDEX6]] to i8 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, ptr [[TMP18]], align 8 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <8 x i8> [[WIDE_LOAD12]], splat (i8 3) +; CHECK-NEXT: [[TMP20]] = select <8 x i1> [[TMP19]], <8 x i8> [[VEC_IND7]], <8 x i8> [[VEC_PHI9]] +; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i32 [[INDEX6]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <8 x i8> [[VEC_IND7]], splat (i8 8) +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT13]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; 
CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]]) +; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128 +; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[START]] +; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]] +; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i8 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX18]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP1]], align 8 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3 +; CHECK-NEXT: [[SEL]] = select i1 [[C]], i8 [[IV1]], i8 [[RDX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[SEL_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i8 [ %start, %entry ], [ %sel, %loop ] + %gep = getelementptr inbounds i8, ptr %a, i8 %iv + %l = load i8, ptr %gep, align 8 + %c = icmp eq i8 %l, 3 + %sel = select i1 %c, i8 %iv, i8 %rdx + %iv.next = add nuw nsw i8 %iv, 1 + %ec = icmp eq i8 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i8 %sel +} diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index c007d6baeb53c..ee154ea5a169a 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -epilogue-vectorization-force-VF=4 -S < %s | FileCheck %s define i64 @select_icmp_const(ptr %a, i64 %n) { @@ -208,13 +208,116 @@ loop: exit: ret i64 %sel } -;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -;. 
+ +define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { +; CHECK-LABEL: define i8 @select_icmp_var_start( +; CHECK-SAME: ptr [[A:%.*]], i8 [[N:%.*]], i8 [[START:%.*]]) { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i8 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ splat (i8 -128), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[INDEX]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], splat (i8 3) +; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP7]], <4 x i8> [[VEC_IND]], <4 x i8> [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[START]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i8 -128, i8 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4 
+; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[N_VEC3]] to i8 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i8> [[DOTSPLAT]], +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i8> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i8> [ [[DOTSPLAT9]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX4]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD10]], splat (i8 3) +; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP16]], <4 x i8> [[VEC_IND5]], <4 x i8> [[VEC_PHI7]] +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i32 [[INDEX4]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i8> [[VEC_IND5]], splat (i8 4) +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT11]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]]) +; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128 +; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[START]] +; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi i8 [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i8 [ [[BC_MERGE_RDX16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SEL:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i8 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP]], align 8 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 3 +; CHECK-NEXT: [[SEL]] = select i1 [[C]], i8 [[IV]], i8 [[RDX]] +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: 
[[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i8 [[SEL_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i8 [ %start, %entry ], [ %sel, %loop ] + %gep = getelementptr inbounds i8, ptr %a, i8 %iv + %l = load i8, ptr %gep, align 8 + %c = icmp eq i8 %l, 3 + %sel = select i1 %c, i8 %iv, i8 %rdx + %iv.next = add nuw nsw i8 %iv, 1 + %ec = icmp eq i8 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i8 %sel +} From 6afe5e5d1a6dcbf7ba83acf4c57b964f58c364a1 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 31 Mar 2025 21:28:39 +0100 Subject: [PATCH 0151/1029] [LV][EVL] Peek through combination tail-folded + predicated masks (#133430) If a recipe was predicated and tail folded at the same time, it will have a mask like EMIT vp<%header-mask> = icmp ule canonical-iv, backedge-tc EMIT vp<%mask> = logical-and vp<%header-mask>, vp<%pred-mask> When converting to an EVL recipe, if the mask isn't exactly just the header-mask we copy the whole logical-and. We can remove this redundant logical-and (because it's now covered by EVL) and just use vp<%pred-mask> instead. This lets us remove the widened canonical IV in more places. --- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 4 ++++ ...vectorize-force-tail-with-evl-cond-reduction.ll | 12 +----------- ...ctorize-force-tail-with-evl-masked-loadstore.ll | 14 ++------------ ...orize-force-tail-with-evl-reverse-load-store.ll | 11 ++--------- 4 files changed, 9 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b0aaf7870c542..9a041c83438dc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1895,6 +1895,10 @@ static VPRecipeBase *createEVLRecipe(VPValue *HeaderMask, using namespace llvm::VPlanPatternMatch; auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * { assert(OrigMask && "Unmasked recipe when folding tail"); + // HeaderMask will be handled using EVL. + VPValue *Mask; + if (match(OrigMask, m_LogicalAnd(m_Specific(HeaderMask), m_VPValue(Mask)))) + return Mask; return HeaderMask == OrigMask ? 
nullptr : OrigMask; }; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index a40255c031619..354cc91c6f01f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -348,11 +348,8 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] ; IF-EVL-INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP7]] ; IF-EVL-INLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-INLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-INLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 -; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL-INLOOP: vector.body: ; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -360,18 +357,11 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) -; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = add zeroinitializer, [[TMP14]] -; IF-EVL-INLOOP-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP15]] -; IF-EVL-INLOOP-NEXT: [[TMP16:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-INLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt [[VP_OP_LOAD]], splat (i32 3) -; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select [[TMP16]], [[TMP19]], zeroinitializer -; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[VP_OP_LOAD]], [[TMP20]], i32 [[TMP12]]) +; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[VP_OP_LOAD]], [[TMP19]], i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] ; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll index bf3f01343eb24..73c14f86e2782 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll @@ -24,33 +24,23 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP6]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1 ; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP9:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP9]], i32 4, i1 true) -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] -; IF-EVL-NEXT: [[TMP14:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP16]], splat (i1 true), i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp ne [[VP_OP_LOAD]], zeroinitializer -; IF-EVL-NEXT: [[TMP18:%.*]] = select [[TMP14]], [[TMP17]], zeroinitializer ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 0 -; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[TMP18]], i32 [[TMP10]]) +; IF-EVL-NEXT: [[VP_OP_LOAD3:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[TMP17]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[VP_OP:%.*]] = add [[VP_OP_LOAD]], [[VP_OP_LOAD3]] -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP20]], [[TMP18]], i32 [[TMP10]]) +; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_OP]], ptr align 4 [[TMP20]], [[TMP17]], i32 [[TMP10]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index b6d92caa46ab0..695af0d241159 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -132,25 +132,18 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; 
IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[EVL_BASED_IV]], i64 0 -; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; IF-EVL-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv4i64() -; IF-EVL-NEXT: [[TMP9:%.*]] = add zeroinitializer, [[TMP8]] -; IF-EVL-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP9]] -; IF-EVL-NEXT: [[TMP10:%.*]] = icmp ule [[VEC_IV]], splat (i64 1023) ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1 ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]] ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100) -; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP10]], [[TMP14]], zeroinitializer ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]] ; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]] ; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP15]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] @@ -160,7 +153,7 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] ; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP15]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP14]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK6]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] From a8d2d169c7add4b0106ae76e186cf815c0b84825 Mon Sep 17 00:00:00 2001 From: Tom Yang Date: Mon, 31 Mar 2025 13:29:31 -0700 Subject: [PATCH 0152/1029] Parallelize module loading in POSIX dyld code (#130912) This patch improves LLDB launch time on Linux machines for **preload scenarios**, particularly for executables with a lot of shared library dependencies (or modules). 
Specifically: * Launching a binary with `target.preload-symbols = true` * Attaching to a process with `target.preload-symbols = true`. It's completely controlled by a new flag added in the first commit `plugin.dynamic-loader.posix-dyld.parallel-module-load`, which *defaults to false*. This was inspired by similar work on Darwin #110646. Some rough numbers to showcase the perf improvement, run on a very beefy machine: * Executable with ~5600 modules: baseline 45s, improvement 15s * Executable with ~3800 modules: baseline 25s, improvement 10s * Executable with ~6650 modules: baseline 67s, improvement 20s * Executable with ~12500 modules: baseline 185s, improvement 85s * Executable with ~14700 modules: baseline 235s, improvement 120s A lot of targets we deal with have a *ton* of modules, and unfortunately we're unable to convince other folks to reduce the number of modules, so performance improvements like this can be very impactful for user experience. This patch achieves the performance improvement by parallelizing `DynamicLoaderPOSIXDYLD::RefreshModules` for the launch scenario, and `DynamicLoaderPOSIXDYLD::LoadAllCurrentModules` for the attach scenario. The commits have some context on their specific changes as well -- hopefully this helps the review. # More context on implementation We discovered the bottlenecks via `perf record -g -p <pid>` on a Linux machine. With an executable known to have 1000s of shared library dependencies, I ran ``` (lldb) b main (lldb) r # taking a while ``` and showed the resulting perf trace (snippet shown) ``` Samples: 85K of event 'cycles:P', Event count (approx.): 54615855812 Children Self Command Shared Object Symbol - 93.54% 0.00% intern-state libc.so.6 [.] clone3 clone3 start_thread lldb_private::HostNativeThreadBase::ThreadCreateTrampoline(void*) r std::_Function_handler::_M_invoke(std::_Any_data const&) lldb_private::Process::RunPrivateStateThread(bool) n - lldb_private::Process::HandlePrivateEvent(std::shared_ptr<lldb_private::Event>&) - 93.54% lldb_private::Process::ShouldBroadcastEvent(lldb_private::Event*) - 93.54% lldb_private::ThreadList::ShouldStop(lldb_private::Event*) - lldb_private::Thread::ShouldStop(lldb_private::Event*) * - 93.53% lldb_private::StopInfoBreakpoint::ShouldStopSynchronous(lldb_private::Event*) t - 93.52% lldb_private::BreakpointSite::ShouldStop(lldb_private::StoppointCallbackContext*) i lldb_private::BreakpointLocationCollection::ShouldStop(lldb_private::StoppointCallbackContext*) k lldb_private::BreakpointLocation::ShouldStop(lldb_private::StoppointCallbackContext*) b lldb_private::BreakpointOptions::InvokeCallback(lldb_private::StoppointCallbackContext*, unsigned long, unsigned long) i DynamicLoaderPOSIXDYLD::RendezvousBreakpointHit(void*, lldb_private::StoppointCallbackContext*, unsigned long, unsigned lo - DynamicLoaderPOSIXDYLD::RefreshModules() O - 93.42% DynamicLoaderPOSIXDYLD::RefreshModules()::$_0::operator()(DYLDRendezvous::SOEntry const&) const u - 93.40% DynamicLoaderPOSIXDYLD::LoadModuleAtAddress(lldb_private::FileSpec const&, unsigned long, unsigned long, bools - lldb_private::DynamicLoader::LoadModuleAtAddress(lldb_private::FileSpec const&, unsigned long, unsigned long, boos - 83.90% lldb_private::DynamicLoader::FindModuleViaTarget(lldb_private::FileSpec const&) o - 83.01% lldb_private::Target::GetOrCreateModule(lldb_private::ModuleSpec const&, bool, lldb_private::Status* - 77.89% lldb_private::Module::PreloadSymbols() - 44.06% lldb_private::Symtab::PreloadSymbols() - 43.66% lldb_private::Symtab::InitNameIndexes() ... 
``` We saw that the majority of time was spent in `RefreshModules`, with the main culprit within it being `LoadModuleAtAddress`, which eventually calls `PreloadSymbols`. At first, `DynamicLoaderPOSIXDYLD::LoadModuleAtAddress` appears fairly independent -- most of it deals with different files and then getting or creating Modules from these files. The portions that aren't independent seem to deal with ModuleLists, which appear concurrency safe. There were members of `DynamicLoaderPOSIXDYLD` I had to synchronize though: namely `m_loaded_modules` which `DynamicLoaderPOSIXDYLD` maintains to map its loaded modules to their link addresses. Without synchronizing this, I ran into SEGFAULTS and other issues when running `check-lldb`. I also locked the assignment and comparison of `m_interpreter_module`, which may be unnecessary. # Alternate implementations When creating this patch, another implementation I considered was directly background-ing the call to `Module::PreloadSymbols` in `Target::GetOrCreateModule`. It would have the added benefit of working across platforms generically, and appeared to be concurrency safe. It was done via `Debugger::GetThreadPool().async` directly. However, there were a ton of concurrency issues, so I abandoned that approach for now. # Testing With the feature active, I tested via `ninja check-lldb` on both Debug and Release builds several times (~5 or 6 altogether?), and didn't spot additional failing or flaky tests. I also tested manually on several different binaries, some with around 14000 modules, but just basic operations: launching, reaching main, setting breakpoints, stepping, showing some backtraces. I've also tested with the flag off just to make sure things behave properly synchronously. --- lldb/include/lldb/Target/Target.h | 2 + .../POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp | 147 +++++++++++++----- .../POSIX-DYLD/DynamicLoaderPOSIXDYLD.h | 17 +- lldb/source/Target/Target.cpp | 6 + lldb/source/Target/TargetProperties.td | 3 + 5 files changed, 133 insertions(+), 42 deletions(-) diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 3cdbe9221a0bc..29183cc267721 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -118,6 +118,8 @@ class TargetProperties : public Properties { llvm::StringRef GetLaunchWorkingDirectory() const; + bool GetParallelModuleLoad() const; + const char *GetDisassemblyFlavor() const; const char *GetDisassemblyCPU() const; diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp index 53ba11ac21bd3..326b6910b5267 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp @@ -10,6 +10,7 @@ #include "DynamicLoaderPOSIXDYLD.h" #include "lldb/Breakpoint/BreakpointLocation.h" +#include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" #include "lldb/Core/ModuleSpec.h" #include "lldb/Core/PluginManager.h" @@ -25,6 +26,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/ProcessInfo.h" +#include "llvm/Support/ThreadPool.h" #include #include @@ -184,16 +186,37 @@ void DynamicLoaderPOSIXDYLD::DidLaunch() { Status DynamicLoaderPOSIXDYLD::CanLoadImage() { return Status(); } +void DynamicLoaderPOSIXDYLD::SetLoadedModule(const ModuleSP &module_sp, + addr_t link_map_addr) { + llvm::sys::ScopedWriter lock(m_loaded_modules_rw_mutex); 
m_loaded_modules[module_sp] = link_map_addr; +} + +void DynamicLoaderPOSIXDYLD::UnloadModule(const ModuleSP &module_sp) { + llvm::sys::ScopedWriter lock(m_loaded_modules_rw_mutex); + m_loaded_modules.erase(module_sp); +} + +std::optional +DynamicLoaderPOSIXDYLD::GetLoadedModuleLinkAddr(const ModuleSP &module_sp) { + llvm::sys::ScopedReader lock(m_loaded_modules_rw_mutex); + auto it = m_loaded_modules.find(module_sp); + if (it != m_loaded_modules.end()) + return it->second; + return std::nullopt; +} + void DynamicLoaderPOSIXDYLD::UpdateLoadedSections(ModuleSP module, addr_t link_map_addr, addr_t base_addr, bool base_addr_is_offset) { - m_loaded_modules[module] = link_map_addr; + SetLoadedModule(module, link_map_addr); + UpdateLoadedSectionsCommon(module, base_addr, base_addr_is_offset); } void DynamicLoaderPOSIXDYLD::UnloadSections(const ModuleSP module) { - m_loaded_modules.erase(module); + UnloadModule(module); UnloadSectionsCommon(module); } @@ -401,7 +424,7 @@ void DynamicLoaderPOSIXDYLD::RefreshModules() { // The rendezvous class doesn't enumerate the main module, so track that // ourselves here. ModuleSP executable = GetTargetExecutable(); - m_loaded_modules[executable] = m_rendezvous.GetLinkMapAddress(); + SetLoadedModule(executable, m_rendezvous.GetLinkMapAddress()); DYLDRendezvous::iterator I; DYLDRendezvous::iterator E; @@ -423,34 +446,70 @@ void DynamicLoaderPOSIXDYLD::RefreshModules() { E = m_rendezvous.end(); m_initial_modules_added = true; } - for (; I != E; ++I) { - // Don't load a duplicate copy of ld.so if we have already loaded it - // earlier in LoadInterpreterModule. If we instead loaded then unloaded it - // later, the section information for ld.so would be removed. That - // information is required for placing breakpoints on Arm/Thumb systems. - if ((m_interpreter_module.lock() != nullptr) && - (I->base_addr == m_interpreter_base)) - continue; - - ModuleSP module_sp = - LoadModuleAtAddress(I->file_spec, I->link_addr, I->base_addr, true); - if (!module_sp.get()) - continue; - - if (module_sp->GetObjectFile()->GetBaseAddress().GetLoadAddress( - &m_process->GetTarget()) == m_interpreter_base) { - ModuleSP interpreter_sp = m_interpreter_module.lock(); - if (m_interpreter_module.lock() == nullptr) { - m_interpreter_module = module_sp; - } else if (module_sp == interpreter_sp) { - // Module already loaded. - continue; - } - } - loaded_modules.AppendIfNeeded(module_sp); - new_modules.Append(module_sp); + // Synchronize reading and writing of `m_interpreter_module`. + std::mutex interpreter_module_mutex; + // We should be able to take SOEntry as reference since the data + // exists for the duration of this call in `m_rendezvous`. + auto load_module_fn = + [this, &loaded_modules, &new_modules, + &interpreter_module_mutex](const DYLDRendezvous::SOEntry &so_entry) { + // Don't load a duplicate copy of ld.so if we have already loaded it + // earlier in LoadInterpreterModule. If we instead loaded then + // unloaded it later, the section information for ld.so would be + // removed. That information is required for placing breakpoints on + // Arm/Thumb systems. + { + // `m_interpreter_module` may be modified by another thread at the + // same time, so we guard the access here. 
+ std::lock_guard lock(interpreter_module_mutex); + if ((m_interpreter_module.lock() != nullptr) && + (so_entry.base_addr == m_interpreter_base)) + return; + } + + ModuleSP module_sp = LoadModuleAtAddress( + so_entry.file_spec, so_entry.link_addr, so_entry.base_addr, true); + if (!module_sp.get()) + return; + + { + // `m_interpreter_module` may be modified by another thread at the + // same time, so we guard the access here. + std::lock_guard lock(interpreter_module_mutex); + // Set the interpreter module, if this is the interpreter. + if (module_sp->GetObjectFile()->GetBaseAddress().GetLoadAddress( + &m_process->GetTarget()) == m_interpreter_base) { + ModuleSP interpreter_sp = m_interpreter_module.lock(); + if (m_interpreter_module.lock() == nullptr) { + m_interpreter_module = module_sp; + } else if (module_sp == interpreter_sp) { + // Module already loaded. + return; + } + } + } + + // Note: in a multi-threaded environment, these module lists may be + // appended to out-of-order. This is fine, since there's no + // expectation for `loaded_modules` or `new_modules` to be in any + // particular order, and appending to each module list is thread-safe. + // Also, `new_modules` is only used for the `ModulesDidLoad` call at + // the end of this function. + loaded_modules.AppendIfNeeded(module_sp); + new_modules.Append(module_sp); + }; + + if (m_process->GetTarget().GetParallelModuleLoad()) { + llvm::ThreadPoolTaskGroup task_group(Debugger::GetThreadPool()); + for (; I != E; ++I) + task_group.async(load_module_fn, *I); + task_group.wait(); + } else { + for (; I != E; ++I) + load_module_fn(*I); } + m_process->GetTarget().ModulesDidLoad(new_modules); } @@ -636,7 +695,7 @@ void DynamicLoaderPOSIXDYLD::LoadAllCurrentModules() { // The rendezvous class doesn't enumerate the main module, so track that // ourselves here. 
ModuleSP executable = GetTargetExecutable(); - m_loaded_modules[executable] = m_rendezvous.GetLinkMapAddress(); + SetLoadedModule(executable, m_rendezvous.GetLinkMapAddress()); std::vector module_names; for (I = m_rendezvous.begin(), E = m_rendezvous.end(); I != E; ++I) @@ -644,19 +703,31 @@ void DynamicLoaderPOSIXDYLD::LoadAllCurrentModules() { m_process->PrefetchModuleSpecs( module_names, m_process->GetTarget().GetArchitecture().GetTriple()); - for (I = m_rendezvous.begin(), E = m_rendezvous.end(); I != E; ++I) { - ModuleSP module_sp = - LoadModuleAtAddress(I->file_spec, I->link_addr, I->base_addr, true); + auto load_module_fn = [this, &module_list, + &log](const DYLDRendezvous::SOEntry &so_entry) { + ModuleSP module_sp = LoadModuleAtAddress( + so_entry.file_spec, so_entry.link_addr, so_entry.base_addr, true); if (module_sp.get()) { LLDB_LOG(log, "LoadAllCurrentModules loading module: {0}", - I->file_spec.GetFilename()); + so_entry.file_spec.GetFilename()); module_list.Append(module_sp); } else { Log *log = GetLog(LLDBLog::DynamicLoader); LLDB_LOGF( log, "DynamicLoaderPOSIXDYLD::%s failed loading module %s at 0x%" PRIx64, - __FUNCTION__, I->file_spec.GetPath().c_str(), I->base_addr); + __FUNCTION__, so_entry.file_spec.GetPath().c_str(), + so_entry.base_addr); + } + }; + if (m_process->GetTarget().GetParallelModuleLoad()) { + llvm::ThreadPoolTaskGroup task_group(Debugger::GetThreadPool()); + for (I = m_rendezvous.begin(), E = m_rendezvous.end(); I != E; ++I) + task_group.async(load_module_fn, *I); + task_group.wait(); + } else { + for (I = m_rendezvous.begin(), E = m_rendezvous.end(); I != E; ++I) { + load_module_fn(*I); } } @@ -728,15 +799,15 @@ DynamicLoaderPOSIXDYLD::GetThreadLocalData(const lldb::ModuleSP module_sp, const lldb::ThreadSP thread, lldb::addr_t tls_file_addr) { Log *log = GetLog(LLDBLog::DynamicLoader); - auto it = m_loaded_modules.find(module_sp); - if (it == m_loaded_modules.end()) { + std::optional link_map_addr_opt = GetLoadedModuleLinkAddr(module_sp); + if (!link_map_addr_opt.has_value()) { LLDB_LOGF( log, "GetThreadLocalData error: module(%s) not found in loaded modules", module_sp->GetObjectName().AsCString()); return LLDB_INVALID_ADDRESS; } - addr_t link_map = it->second; + addr_t link_map = link_map_addr_opt.value(); if (link_map == LLDB_INVALID_ADDRESS || link_map == 0) { LLDB_LOGF(log, "GetThreadLocalData error: invalid link map address=0x%" PRIx64, diff --git a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h index bde334aaca40b..6efb92673a13c 100644 --- a/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h @@ -93,10 +93,6 @@ class DynamicLoaderPOSIXDYLD : public lldb_private::DynamicLoader { /// Contains the pointer to the interpret module, if loaded. std::weak_ptr m_interpreter_module; - /// Loaded module list. (link map for each module) - std::map> - m_loaded_modules; - /// Returns true if the process is for a core file. bool IsCoreFile() const; @@ -180,6 +176,19 @@ class DynamicLoaderPOSIXDYLD : public lldb_private::DynamicLoader { DynamicLoaderPOSIXDYLD(const DynamicLoaderPOSIXDYLD &) = delete; const DynamicLoaderPOSIXDYLD & operator=(const DynamicLoaderPOSIXDYLD &) = delete; + + /// Loaded module list. (link map for each module) + /// This may be accessed in a multi-threaded context. Use the accessor methods + /// to access `m_loaded_modules` safely. 
+ std::map> + m_loaded_modules; + llvm::sys::RWMutex m_loaded_modules_rw_mutex; + + void SetLoadedModule(const lldb::ModuleSP &module_sp, + lldb::addr_t link_map_addr); + void UnloadModule(const lldb::ModuleSP &module_sp); + std::optional + GetLoadedModuleLinkAddr(const lldb::ModuleSP &module_sp); }; #endif // LLDB_SOURCE_PLUGINS_DYNAMICLOADER_POSIX_DYLD_DYNAMICLOADERPOSIXDYLD_H diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index c26bca546891e..09c0c0b8a5db0 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -4488,6 +4488,12 @@ llvm::StringRef TargetProperties::GetLaunchWorkingDirectory() const { idx, g_target_properties[idx].default_cstr_value); } +bool TargetProperties::GetParallelModuleLoad() const { + const uint32_t idx = ePropertyParallelModuleLoad; + return GetPropertyAtIndexAs( + idx, g_target_properties[idx].default_uint_value != 0); +} + const char *TargetProperties::GetDisassemblyFlavor() const { const uint32_t idx = ePropertyDisassemblyFlavor; const char *return_value; diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index 38a345dfd8849..3940ac00a2bd9 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -217,6 +217,9 @@ let Definition = "target" in { "launched. If you change this setting, the new value will only apply to " "subsequent launches. Commands that take an explicit working directory " "will override this setting.">; + def ParallelModuleLoad: Property<"parallel-module-load", "Boolean">, + DefaultTrue, + Desc<"Enable loading of modules in parallel for the dynamic loader.">; } let Definition = "process_experimental" in { From 5e2860a8d375ded2d2912894e380fefc8cb1f23a Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Mon, 31 Mar 2025 13:38:09 -0700 Subject: [PATCH 0153/1029] Revert "[HLSL][RootSignature] Implement parsing of a DescriptorTable with empty clauses" (#133790) Reverts llvm/llvm-project#133302 Reverting to inspect build failures that were introduced from use of the `clang::Preprocessor` in unit testing, as well as, the warning about an unused declaration. See linked issue for failures. 
--- .../clang/Basic/DiagnosticParseKinds.td | 4 - .../clang/Lex/HLSLRootSignatureTokenKinds.def | 23 +- .../include/clang/Lex/LexHLSLRootSignature.h | 15 +- .../clang/Parse/ParseHLSLRootSignature.h | 107 -------- clang/lib/Parse/CMakeLists.txt | 1 - clang/lib/Parse/ParseHLSLRootSignature.cpp | 166 ------------ clang/unittests/CMakeLists.txt | 1 - .../Lex/LexHLSLRootSignatureTest.cpp | 4 +- clang/unittests/Parse/CMakeLists.txt | 23 -- .../Parse/ParseHLSLRootSignatureTest.cpp | 245 ------------------ .../llvm/Frontend/HLSL/HLSLRootSignature.h | 44 ---- 11 files changed, 13 insertions(+), 620 deletions(-) delete mode 100644 clang/include/clang/Parse/ParseHLSLRootSignature.h delete mode 100644 clang/lib/Parse/ParseHLSLRootSignature.cpp delete mode 100644 clang/unittests/Parse/CMakeLists.txt delete mode 100644 clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp delete mode 100644 llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 2582e1e5ef0f6..86c361b4dbcf7 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1830,8 +1830,4 @@ def err_hlsl_virtual_function def err_hlsl_virtual_inheritance : Error<"virtual inheritance is unsupported in HLSL">; -// HLSL Root Siganture diagnostic messages -def err_hlsl_unexpected_end_of_params - : Error<"expected %0 to denote end of parameters, or, another valid parameter of %1">; - } // end of Parser diagnostics diff --git a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def index c514d3456146a..e6df763920430 100644 --- a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def +++ b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def @@ -14,16 +14,16 @@ //===----------------------------------------------------------------------===// #ifndef TOK -#define TOK(X, SPELLING) +#define TOK(X) #endif #ifndef PUNCTUATOR -#define PUNCTUATOR(X,Y) TOK(pu_ ## X, Y) +#define PUNCTUATOR(X,Y) TOK(pu_ ## X) #endif #ifndef KEYWORD -#define KEYWORD(X) TOK(kw_ ## X, #X) +#define KEYWORD(X) TOK(kw_ ## X) #endif #ifndef ENUM -#define ENUM(NAME, LIT) TOK(en_ ## NAME, LIT) +#define ENUM(NAME, LIT) TOK(en_ ## NAME) #endif // Defines the various types of enum @@ -49,15 +49,15 @@ #endif // General Tokens: -TOK(invalid, "invalid identifier") -TOK(end_of_stream, "end of stream") -TOK(int_literal, "integer literal") +TOK(invalid) +TOK(end_of_stream) +TOK(int_literal) // Register Tokens: -TOK(bReg, "b register") -TOK(tReg, "t register") -TOK(uReg, "u register") -TOK(sReg, "s register") +TOK(bReg) +TOK(tReg) +TOK(uReg) +TOK(sReg) // Punctuators: PUNCTUATOR(l_paren, '(') @@ -69,7 +69,6 @@ PUNCTUATOR(plus, '+') PUNCTUATOR(minus, '-') // RootElement Keywords: -KEYWORD(RootSignature) // used only for diagnostic messaging KEYWORD(DescriptorTable) // DescriptorTable Keywords: diff --git a/clang/include/clang/Lex/LexHLSLRootSignature.h b/clang/include/clang/Lex/LexHLSLRootSignature.h index a7e1f782b767f..21c44e0351d9e 100644 --- a/clang/include/clang/Lex/LexHLSLRootSignature.h +++ b/clang/include/clang/Lex/LexHLSLRootSignature.h @@ -13,7 +13,6 @@ #ifndef LLVM_CLANG_LEX_LEXHLSLROOTSIGNATURE_H #define LLVM_CLANG_LEX_LEXHLSLROOTSIGNATURE_H -#include "clang/Basic/Diagnostic.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/SmallVector.h" @@ -25,7 +24,7 @@ namespace hlsl { struct RootSignatureToken { enum Kind { -#define TOK(X, SPELLING) 
X, +#define TOK(X) X, #include "clang/Lex/HLSLRootSignatureTokenKinds.def" }; @@ -44,18 +43,6 @@ struct RootSignatureToken { }; using TokenKind = enum RootSignatureToken::Kind; -inline const DiagnosticBuilder &operator<<(const DiagnosticBuilder &DB, - const TokenKind Kind) { - switch (Kind) { -#define TOK(X, SPELLING) \ - case TokenKind::X: \ - DB << SPELLING; \ - break; -#include "clang/Lex/HLSLRootSignatureTokenKinds.def" - } - return DB; -} - class RootSignatureLexer { public: RootSignatureLexer(StringRef Signature, clang::SourceLocation SourceLoc) diff --git a/clang/include/clang/Parse/ParseHLSLRootSignature.h b/clang/include/clang/Parse/ParseHLSLRootSignature.h deleted file mode 100644 index 43b41315b88b5..0000000000000 --- a/clang/include/clang/Parse/ParseHLSLRootSignature.h +++ /dev/null @@ -1,107 +0,0 @@ -//===--- ParseHLSLRootSignature.h -------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the RootSignatureParser interface. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H -#define LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H - -#include "clang/Basic/DiagnosticParse.h" -#include "clang/Lex/LexHLSLRootSignature.h" -#include "clang/Lex/Preprocessor.h" - -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" - -#include "llvm/Frontend/HLSL/HLSLRootSignature.h" - -namespace clang { -namespace hlsl { - -class RootSignatureParser { -public: - RootSignatureParser(SmallVector &Elements, - RootSignatureLexer &Lexer, clang::Preprocessor &PP); - - /// Consumes tokens from the Lexer and constructs the in-memory - /// representations of the RootElements. Tokens are consumed until an - /// error is encountered or the end of the buffer. - /// - /// Returns true if a parsing error is encountered. - bool parse(); - -private: - DiagnosticsEngine &getDiags() { return PP.getDiagnostics(); } - - // All private Parse.* methods follow a similar pattern: - // - Each method will start with an assert to denote what the CurToken is - // expected to be and will parse from that token forward - // - // - Therefore, it is the callers responsibility to ensure that you are - // at the correct CurToken. This should be done with the pattern of: - // - // if (TryConsumeExpectedToken(TokenKind)) - // if (Parse.*()) - // return true; - // - // or, - // - // if (ConsumeExpectedToken(TokenKind, ...)) - // return true; - // if (Parse.*()) - // return true; - // - // - All methods return true if a parsing error is encountered. 
It is the - // callers responsibility to propogate this error up, or deal with it - // otherwise - // - // - An error will be raised if the proceeding tokens are not what is - // expected, or, there is a lexing error - - /// Root Element parse methods: - bool parseDescriptorTable(); - bool parseDescriptorTableClause(); - - /// Invoke the Lexer to consume a token and update CurToken with the result - void consumeNextToken() { CurToken = Lexer.ConsumeToken(); } - - /// Return true if the next token one of the expected kinds - bool peekExpectedToken(TokenKind Expected); - bool peekExpectedToken(ArrayRef AnyExpected); - - /// Consumes the next token and report an error if it is not of the expected - /// kind. - /// - /// Returns true if there was an error reported. - bool consumeExpectedToken(TokenKind Expected, - unsigned DiagID = diag::err_expected, - TokenKind Context = TokenKind::invalid); - - /// Peek if the next token is of the expected kind and if it is then consume - /// it. - /// - /// Returns true if it successfully matches the expected kind and the token - /// was consumed. - bool tryConsumeExpectedToken(TokenKind Expected); - bool tryConsumeExpectedToken(ArrayRef Expected); - -private: - SmallVector &Elements; - RootSignatureLexer &Lexer; - - clang::Preprocessor &PP; - - RootSignatureToken CurToken; -}; - -} // namespace hlsl -} // namespace clang - -#endif // LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H diff --git a/clang/lib/Parse/CMakeLists.txt b/clang/lib/Parse/CMakeLists.txt index 00fde537bb9c6..22e902f7e1bc5 100644 --- a/clang/lib/Parse/CMakeLists.txt +++ b/clang/lib/Parse/CMakeLists.txt @@ -14,7 +14,6 @@ add_clang_library(clangParse ParseExpr.cpp ParseExprCXX.cpp ParseHLSL.cpp - ParseHLSLRootSignature.cpp ParseInit.cpp ParseObjc.cpp ParseOpenMP.cpp diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp deleted file mode 100644 index 33caca5fa1c82..0000000000000 --- a/clang/lib/Parse/ParseHLSLRootSignature.cpp +++ /dev/null @@ -1,166 +0,0 @@ -//=== ParseHLSLRootSignature.cpp - Parse Root Signature -------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang/Parse/ParseHLSLRootSignature.h" - -#include "llvm/Support/raw_ostream.h" - -using namespace llvm::hlsl::rootsig; - -namespace clang { -namespace hlsl { - -RootSignatureParser::RootSignatureParser(SmallVector &Elements, - RootSignatureLexer &Lexer, - Preprocessor &PP) - : Elements(Elements), Lexer(Lexer), PP(PP), CurToken(SourceLocation()) {} - -bool RootSignatureParser::parse() { - // Iterate as many RootElements as possible - while (tryConsumeExpectedToken(TokenKind::kw_DescriptorTable)) { - // Dispatch onto parser method. 
- // We guard against the unreachable here as we just ensured that CurToken - // will be one of the kinds in the while condition - switch (CurToken.Kind) { - case TokenKind::kw_DescriptorTable: - if (parseDescriptorTable()) - return true; - break; - default: - llvm_unreachable("Switch for consumed token was not provided"); - } - - if (!tryConsumeExpectedToken(TokenKind::pu_comma)) - break; - } - - if (!tryConsumeExpectedToken(TokenKind::end_of_stream)) { - getDiags().Report(CurToken.TokLoc, diag::err_hlsl_unexpected_end_of_params) - << /*expected=*/TokenKind::end_of_stream - << /*param of=*/TokenKind::kw_RootSignature; - return true; - } - return false; -} - -bool RootSignatureParser::parseDescriptorTable() { - assert(CurToken.Kind == TokenKind::kw_DescriptorTable && - "Expects to only be invoked starting at given keyword"); - - DescriptorTable Table; - - if (consumeExpectedToken(TokenKind::pu_l_paren, diag::err_expected_after, - CurToken.Kind)) - return true; - - // Iterate as many Clauses as possible - while (tryConsumeExpectedToken({TokenKind::kw_CBV, TokenKind::kw_SRV, - TokenKind::kw_UAV, TokenKind::kw_Sampler})) { - if (parseDescriptorTableClause()) - return true; - - Table.NumClauses++; - - if (!tryConsumeExpectedToken(TokenKind::pu_comma)) - break; - } - - if (!tryConsumeExpectedToken(TokenKind::pu_r_paren)) { - getDiags().Report(CurToken.TokLoc, diag::err_hlsl_unexpected_end_of_params) - << /*expected=*/TokenKind::pu_r_paren - << /*param of=*/TokenKind::kw_DescriptorTable; - return true; - } - - Elements.push_back(Table); - return false; -} - -bool RootSignatureParser::parseDescriptorTableClause() { - assert((CurToken.Kind == TokenKind::kw_CBV || - CurToken.Kind == TokenKind::kw_SRV || - CurToken.Kind == TokenKind::kw_UAV || - CurToken.Kind == TokenKind::kw_Sampler) && - "Expects to only be invoked starting at given keyword"); - - DescriptorTableClause Clause; - switch (CurToken.Kind) { - default: - llvm_unreachable("Switch for consumed token was not provided"); - case TokenKind::kw_CBV: - Clause.Type = ClauseType::CBuffer; - break; - case TokenKind::kw_SRV: - Clause.Type = ClauseType::SRV; - break; - case TokenKind::kw_UAV: - Clause.Type = ClauseType::UAV; - break; - case TokenKind::kw_Sampler: - Clause.Type = ClauseType::Sampler; - break; - } - - if (consumeExpectedToken(TokenKind::pu_l_paren, diag::err_expected_after, - CurToken.Kind)) - return true; - - if (consumeExpectedToken(TokenKind::pu_r_paren, diag::err_expected_after, - CurToken.Kind)) - return true; - - Elements.push_back(Clause); - return false; -} - -bool RootSignatureParser::peekExpectedToken(TokenKind Expected) { - return peekExpectedToken(ArrayRef{Expected}); -} - -bool RootSignatureParser::peekExpectedToken(ArrayRef AnyExpected) { - RootSignatureToken Result = Lexer.PeekNextToken(); - return llvm::is_contained(AnyExpected, Result.Kind); -} - -bool RootSignatureParser::consumeExpectedToken(TokenKind Expected, - unsigned DiagID, - TokenKind Context) { - if (tryConsumeExpectedToken(Expected)) - return false; - - // Report unexpected token kind error - DiagnosticBuilder DB = getDiags().Report(CurToken.TokLoc, DiagID); - switch (DiagID) { - case diag::err_expected: - DB << Expected; - break; - case diag::err_expected_either: - case diag::err_expected_after: - DB << Expected << Context; - break; - default: - break; - } - return true; -} - -bool RootSignatureParser::tryConsumeExpectedToken(TokenKind Expected) { - return tryConsumeExpectedToken(ArrayRef{Expected}); -} - -bool 
RootSignatureParser::tryConsumeExpectedToken( - ArrayRef AnyExpected) { - // If not the expected token just return - if (!peekExpectedToken(AnyExpected)) - return false; - consumeNextToken(); - return true; -} - -} // namespace hlsl -} // namespace clang diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index 9b3ce8aa7de73..85d265426ec80 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -25,7 +25,6 @@ endfunction() add_subdirectory(Basic) add_subdirectory(Lex) -add_subdirectory(Parse) add_subdirectory(Driver) if(CLANG_ENABLE_STATIC_ANALYZER) add_subdirectory(Analysis) diff --git a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp index 04af01ef97dea..d72a842922f98 100644 --- a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp +++ b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp @@ -85,8 +85,6 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { (),|=+- - RootSignature - DescriptorTable CBV SRV UAV Sampler @@ -115,7 +113,7 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { SmallVector Tokens; SmallVector Expected = { -#define TOK(NAME, SPELLING) hlsl::TokenKind::NAME, +#define TOK(NAME) hlsl::TokenKind::NAME, #include "clang/Lex/HLSLRootSignatureTokenKinds.def" }; diff --git a/clang/unittests/Parse/CMakeLists.txt b/clang/unittests/Parse/CMakeLists.txt deleted file mode 100644 index eeb58174568cd..0000000000000 --- a/clang/unittests/Parse/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Support - ) -add_clang_unittest(ParseTests - ParseHLSLRootSignatureTest.cpp - ) -clang_target_link_libraries(ParseTests - PRIVATE - clangAST - clangASTMatchers - clangBasic - clangFrontend - clangParse - clangSema - clangSerialization - clangTooling - ) -target_link_libraries(ParseTests - PRIVATE - LLVMTestingAnnotations - LLVMTestingSupport - clangTesting - ) diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp deleted file mode 100644 index acdf455a5d6aa..0000000000000 --- a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp +++ /dev/null @@ -1,245 +0,0 @@ -//=== ParseHLSLRootSignatureTest.cpp - Parse Root Signature tests ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "clang/Basic/Diagnostic.h" -#include "clang/Basic/DiagnosticOptions.h" -#include "clang/Basic/FileManager.h" -#include "clang/Basic/LangOptions.h" -#include "clang/Basic/SourceLocation.h" -#include "clang/Basic/SourceManager.h" -#include "clang/Basic/TargetInfo.h" -#include "clang/Lex/HeaderSearch.h" -#include "clang/Lex/HeaderSearchOptions.h" -#include "clang/Lex/Lexer.h" -#include "clang/Lex/ModuleLoader.h" -#include "clang/Lex/Preprocessor.h" -#include "clang/Lex/PreprocessorOptions.h" - -#include "clang/Lex/LexHLSLRootSignature.h" -#include "clang/Parse/ParseHLSLRootSignature.h" -#include "gtest/gtest.h" - -using namespace clang; -using namespace llvm::hlsl::rootsig; - -namespace { - -// Diagnostic helper for helper tests -class ExpectedDiagConsumer : public DiagnosticConsumer { - virtual void anchor() {} - - void HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, - const Diagnostic &Info) override { - if (!FirstDiag || !ExpectedDiagID.has_value()) { - Satisfied = false; - return; - } - FirstDiag = false; - - Satisfied = ExpectedDiagID.value() == Info.getID(); - } - - bool FirstDiag = true; - bool Satisfied = false; - std::optional ExpectedDiagID; - -public: - void setNoDiag() { - Satisfied = true; - ExpectedDiagID = std::nullopt; - } - - void setExpected(unsigned DiagID) { - Satisfied = false; - ExpectedDiagID = DiagID; - } - - bool isSatisfied() { return Satisfied; } -}; - -// The test fixture. -class ParseHLSLRootSignatureTest : public ::testing::Test { -protected: - ParseHLSLRootSignatureTest() - : FileMgr(FileMgrOpts), DiagID(new DiagnosticIDs()), - Consumer(new ExpectedDiagConsumer()), - Diags(DiagID, new DiagnosticOptions, Consumer), - SourceMgr(Diags, FileMgr), TargetOpts(new TargetOptions) { - // This is an arbitrarily chosen target triple to create the target info. 
- TargetOpts->Triple = "dxil"; - Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts); - } - - std::unique_ptr createPP(StringRef Source, - TrivialModuleLoader &ModLoader) { - std::unique_ptr Buf = - llvm::MemoryBuffer::getMemBuffer(Source); - SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); - - HeaderSearchOptions SearchOpts; - HeaderSearch HeaderInfo(SearchOpts, SourceMgr, Diags, LangOpts, - Target.get()); - std::unique_ptr PP = std::make_unique( - std::make_shared(), Diags, LangOpts, SourceMgr, - HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); - PP->Initialize(*Target); - PP->EnterMainSourceFile(); - return PP; - } - - FileSystemOptions FileMgrOpts; - FileManager FileMgr; - IntrusiveRefCntPtr DiagID; - ExpectedDiagConsumer *Consumer; - DiagnosticsEngine Diags; - SourceManager SourceMgr; - LangOptions LangOpts; - std::shared_ptr TargetOpts; - IntrusiveRefCntPtr Target; -}; - -// Valid Parser Tests - -TEST_F(ParseHLSLRootSignatureTest, ValidParseEmptyTest) { - const llvm::StringLiteral Source = R"cc()cc"; - - TrivialModuleLoader ModLoader; - auto PP = createPP(Source, ModLoader); - auto TokLoc = SourceLocation(); - - hlsl::RootSignatureLexer Lexer(Source, TokLoc); - SmallVector Elements; - hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); - - // Test no diagnostics produced - Consumer->setNoDiag(); - - ASSERT_FALSE(Parser.parse()); - ASSERT_EQ((int)Elements.size(), 0); - - ASSERT_TRUE(Consumer->isSatisfied()); -} - -TEST_F(ParseHLSLRootSignatureTest, ValidParseDTClausesTest) { - const llvm::StringLiteral Source = R"cc( - DescriptorTable( - CBV(), - SRV(), - Sampler(), - UAV() - ), - DescriptorTable() - )cc"; - - TrivialModuleLoader ModLoader; - auto PP = createPP(Source, ModLoader); - auto TokLoc = SourceLocation(); - - hlsl::RootSignatureLexer Lexer(Source, TokLoc); - SmallVector Elements; - hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); - - // Test no diagnostics produced - Consumer->setNoDiag(); - - ASSERT_FALSE(Parser.parse()); - - // First Descriptor Table with 4 elements - RootElement Elem = Elements[0]; - ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::CBuffer); - - Elem = Elements[1]; - ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::SRV); - - Elem = Elements[2]; - ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::Sampler); - - Elem = Elements[3]; - ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::UAV); - - Elem = Elements[4]; - ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).NumClauses, (uint32_t)4); - - // Empty Descriptor Table - Elem = Elements[5]; - ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).NumClauses, 0u); - ASSERT_TRUE(Consumer->isSatisfied()); -} - -// Invalid Parser Tests - -TEST_F(ParseHLSLRootSignatureTest, InvalidParseUnexpectedTokenTest) { - const llvm::StringLiteral Source = R"cc( - DescriptorTable() - space - )cc"; - - TrivialModuleLoader ModLoader; - auto PP = createPP(Source, ModLoader); - auto TokLoc = SourceLocation(); - - hlsl::RootSignatureLexer Lexer(Source, TokLoc); - SmallVector Elements; - hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); - - // Test correct diagnostic produced - Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); - ASSERT_TRUE(Parser.parse()); - - ASSERT_TRUE(Consumer->isSatisfied()); -} - -TEST_F(ParseHLSLRootSignatureTest, 
InvalidParseInvalidTokenTest) { - const llvm::StringLiteral Source = R"cc( - notAnIdentifier - )cc"; - - TrivialModuleLoader ModLoader; - auto PP = createPP(Source, ModLoader); - auto TokLoc = SourceLocation(); - - hlsl::RootSignatureLexer Lexer(Source, TokLoc); - SmallVector Elements; - hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); - - // Test correct diagnostic produced - invalid token - Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); - ASSERT_TRUE(Parser.parse()); - - ASSERT_TRUE(Consumer->isSatisfied()); -} - -TEST_F(ParseHLSLRootSignatureTest, InvalidParseUnexpectedEndOfStreamTest) { - const llvm::StringLiteral Source = R"cc( - DescriptorTable - )cc"; - - TrivialModuleLoader ModLoader; - auto PP = createPP(Source, ModLoader); - auto TokLoc = SourceLocation(); - - hlsl::RootSignatureLexer Lexer(Source, TokLoc); - SmallVector Elements; - hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); - - // Test correct diagnostic produced - end of stream - Consumer->setExpected(diag::err_expected_after); - ASSERT_TRUE(Parser.parse()); - - ASSERT_TRUE(Consumer->isSatisfied()); -} - -} // anonymous namespace diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h deleted file mode 100644 index c1b67844c747f..0000000000000 --- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h +++ /dev/null @@ -1,44 +0,0 @@ -//===- HLSLRootSignature.h - HLSL Root Signature helper objects -----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file This file contains helper objects for working with HLSL Root -/// Signatures. -/// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H -#define LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H - -#include "llvm/Support/DXILABI.h" -#include - -namespace llvm { -namespace hlsl { -namespace rootsig { - -// Definitions of the in-memory data layout structures - -// Models the end of a descriptor table and stores its visibility -struct DescriptorTable { - uint32_t NumClauses = 0; // The number of clauses in the table -}; - -// Models DTClause : CBV | SRV | UAV | Sampler, by collecting like parameters -using ClauseType = llvm::dxil::ResourceClass; -struct DescriptorTableClause { - ClauseType Type; -}; - -// Models RootElement : DescriptorTable | DescriptorTableClause -using RootElement = std::variant; - -} // namespace rootsig -} // namespace hlsl -} // namespace llvm - -#endif // LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H From baacd1287bfb17608068485c2554dd4455ac29a0 Mon Sep 17 00:00:00 2001 From: Sandeep Dasgupta Date: Mon, 31 Mar 2025 13:45:43 -0700 Subject: [PATCH 0154/1029] Fix printing of `mlirUniformQuantizedSubChannelTypeGetNumBlockSizes` in 32-bit machine. 
(#133763)

Fixes the issue reported in
https://github.com/llvm/llvm-project/pull/120172#issuecomment-2763212827

cc @mgorny
---
 mlir/test/CAPI/quant.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/CAPI/quant.c b/mlir/test/CAPI/quant.c
index 30f376ebeb112..be4595fd39e81 100644
--- a/mlir/test/CAPI/quant.c
+++ b/mlir/test/CAPI/quant.c
@@ -268,7 +268,7 @@ void testUniformSubChannelType(MlirContext ctx) {
            mlirTypeIsNull(illegalSubChannel));

   // CHECK: num dims: 2
-  fprintf(stderr, "num dims: %" PRId64 "\n",
+  fprintf(stderr, "num dims: %" PRIdPTR "\n",
           mlirUniformQuantizedSubChannelTypeGetNumBlockSizes(subChannel));

   // CHECK: axis-block-size-pair[0]: 0:1

From eefefb5da798a5e51b864235007e89043fcea16a Mon Sep 17 00:00:00 2001
From: Sandeep Dasgupta
Date: Mon, 31 Mar 2025 13:45:54 -0700
Subject: [PATCH 0155/1029] Fix sub-channel quantized type documentation
 (#133765)

Fixes the issue reported in
https://github.com/llvm/llvm-project/pull/120172#issuecomment-2748367578
---
 mlir/include/mlir/Dialect/Quant/IR/QuantBase.td | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/include/mlir/Dialect/Quant/IR/QuantBase.td b/mlir/include/mlir/Dialect/Quant/IR/QuantBase.td
index 0d97889960019..23bf5cf15e256 100644
--- a/mlir/include/mlir/Dialect/Quant/IR/QuantBase.td
+++ b/mlir/include/mlir/Dialect/Quant/IR/QuantBase.td
@@ -279,6 +279,7 @@ def Quant_Dialect : Dialect {
     // Correct. The quantized type now includes 3 scale values, matching the
     // size of dimension 1 of the result tensor.
     %result = quant.qcast %input : tensor to tensor>
+    ```

    ## Sub-channel quantization integrity

From cb7c223625f950c5ad14fed39aad8b358874fcf0 Mon Sep 17 00:00:00 2001
From: Paul Osmialowski
Date: Mon, 31 Mar 2025 21:55:58 +0100
Subject: [PATCH 0156/1029] [clang][driver] Fix -fveclib=ArmPL issue: with
 -nostdlib do not link against libm (#133578)

Although combining -fveclib=ArmPL with -nostdlib is a rare situation, it
should still be supported correctly and should result in libm not being
linked.
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 8 +++++---
 clang/test/Driver/fveclib.c                | 6 ++++++
 flang/test/Driver/fveclib.f90              | 6 ++++++
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 41cfa3d2e4f8c..5aac20e1cdf44 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -515,7 +515,7 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs,
   //
   // 1. On Linux, link only when actually needed.
   //
-  // 2. Prefer libm functions over libamath.
+  // 2. Prefer libm functions over libamath (when no -nostdlib in use).
   //
   // 3. Link against libm to resolve libamath dependencies.
// @@ -523,9 +523,11 @@ void tools::AddLinkerInputs(const ToolChain &TC, const InputInfoList &Inputs, CmdArgs.push_back(Args.MakeArgString("--push-state")); CmdArgs.push_back(Args.MakeArgString("--as-needed")); } - CmdArgs.push_back(Args.MakeArgString("-lm")); + if (!Args.hasArg(options::OPT_nostdlib)) + CmdArgs.push_back(Args.MakeArgString("-lm")); CmdArgs.push_back(Args.MakeArgString("-lamath")); - CmdArgs.push_back(Args.MakeArgString("-lm")); + if (!Args.hasArg(options::OPT_nostdlib)) + CmdArgs.push_back(Args.MakeArgString("-lm")); if (Triple.isOSLinux()) CmdArgs.push_back(Args.MakeArgString("--pop-state")); addArchSpecificRPath(TC, Args, CmdArgs); diff --git a/clang/test/Driver/fveclib.c b/clang/test/Driver/fveclib.c index 7d0985c4dd4f4..78b5316b67e47 100644 --- a/clang/test/Driver/fveclib.c +++ b/clang/test/Driver/fveclib.c @@ -116,11 +116,17 @@ /// Verify that vectorized routines library is being linked in. // RUN: %clang -### --target=aarch64-pc-windows-msvc -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-MSVC %s // RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-LINUX %s +// RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL %s -nostdlib 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-NOSTDLIB-LINUX %s // RUN: %clang -### --target=aarch64-linux-gnu -fveclib=ArmPL %s -lamath 2>&1 | FileCheck --check-prefix=CHECK-LINKING-AMATH-BEFORE-ARMPL-LINUX %s // RUN: %clang -### --target=arm64-apple-darwin -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-DARWIN %s +// RUN: %clang -### --target=arm64-apple-darwin -fveclib=ArmPL -nostdlib %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-NOSTDLIB-DARWIN %s // RUN: %clang -### --target=arm64-apple-darwin -fveclib=ArmPL %s -lamath 2>&1 | FileCheck --check-prefix=CHECK-LINKING-AMATH-BEFORE-ARMPL-DARWIN %s // CHECK-LINKING-ARMPL-LINUX: "--push-state" "--as-needed" "-lm" "-lamath" "-lm" "--pop-state" +// CHECK-LINKING-ARMPL-NOSTDLIB-LINUX: "--push-state" "--as-needed" "-lamath" "--pop-state" +// CHECK-LINKING-ARMPL-NOSTDLIB-LINUX-NOT: "-lm" // CHECK-LINKING-ARMPL-DARWIN: "-lm" "-lamath" "-lm" +// CHECK-LINKING-ARMPL-NOSTDLIB-DARWIN: "-lamath" +// CHECK-LINKING-ARMPL-NOSTDLIB-DARWIN-NOT: "-lm" // CHECK-LINKING-ARMPL-MSVC: "--dependent-lib=amath" // CHECK-LINKING-AMATH-BEFORE-ARMPL-LINUX: "-lamath" {{.*}}"--push-state" "--as-needed" "-lm" "-lamath" "-lm" "--pop-state" // CHECK-LINKING-AMATH-BEFORE-ARMPL-DARWIN: "-lamath" {{.*}}"-lm" "-lamath" "-lm" diff --git a/flang/test/Driver/fveclib.f90 b/flang/test/Driver/fveclib.f90 index 490ce974724a6..7c2540b91ba79 100644 --- a/flang/test/Driver/fveclib.f90 +++ b/flang/test/Driver/fveclib.f90 @@ -33,11 +33,17 @@ ! RUN: %flang -### --target=aarch64-pc-windows-msvc -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-MSVC %s ! RUN: %flang -### --target=aarch64-linux-gnu -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-LINUX %s +! RUN: %flang -### --target=aarch64-linux-gnu -fveclib=ArmPL -nostdlib %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-NOSTDLIB-LINUX %s ! RUN: %flang -### --target=aarch64-linux-gnu -fveclib=ArmPL %s -lamath 2>&1 | FileCheck --check-prefix=CHECK-LINKING-AMATH-BEFORE-ARMPL-LINUX %s ! RUN: %flang -### --target=arm64-apple-darwin -fveclib=ArmPL %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-DARWIN %s +! 
RUN: %flang -### --target=arm64-apple-darwin -fveclib=ArmPL -nostdlib %s 2>&1 | FileCheck --check-prefix=CHECK-LINKING-ARMPL-NOSTDLIB-DARWIN %s ! RUN: %flang -### --target=arm64-apple-darwin -fveclib=ArmPL %s -lamath 2>&1 | FileCheck --check-prefix=CHECK-LINKING-AMATH-BEFORE-ARMPL-DARWIN %s ! CHECK-LINKING-ARMPL-LINUX: "--push-state" "--as-needed" "-lm" "-lamath" "-lm" "--pop-state" +! CHECK-LINKING-ARMPL-NOSTDLIB-LINUX: "--push-state" "--as-needed" "-lamath" "--pop-state" +! CHECK-LINKING-ARMPL-NOSTDLIB-LINUX-NOT: "-lm" ! CHECK-LINKING-ARMPL-DARWIN: "-lm" "-lamath" "-lm" +! CHECK-LINKING-ARMPL-NOSTDLIB-DARWIN: "-lamath" +! CHECK-LINKING-ARMPL-NOSTDLIB-DARWIN-NOT: "-lm" ! CHECK-LINKING-ARMPL-MSVC: "--dependent-lib=amath" ! CHECK-LINKING-AMATH-BEFORE-ARMPL-LINUX: "-lamath" {{.*}}"--push-state" "--as-needed" "-lm" "-lamath" "-lm" "--pop-state" ! CHECK-LINKING-AMATH-BEFORE-ARMPL-DARWIN: "-lamath" {{.*}}"-lm" "-lamath" "-lm" From bdae91b08b5b7fcd13f55db7f36ccaebd5c0b571 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Mon, 31 Mar 2025 17:00:38 -0400 Subject: [PATCH 0157/1029] Revert "[Clang][Cmake] fix libtool duplicate member name warnings" (#133795) Reverts llvm/llvm-project#133619 --- clang/lib/CodeGen/CMakeLists.txt | 49 ++++++++++++++----- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 2 +- .../lib/CodeGen/TargetBuiltins/CMakeLists.txt | 19 ------- clang/lib/CodeGen/Targets/CMakeLists.txt | 35 ------------- 4 files changed, 38 insertions(+), 67 deletions(-) delete mode 100644 clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt delete mode 100644 clang/lib/CodeGen/Targets/CMakeLists.txt diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index cdf9f909a3675..ebe2fbd7db295 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -116,8 +116,45 @@ add_clang_library(clangCodeGen PatternInit.cpp SanitizerMetadata.cpp SwiftCallingConv.cpp + TargetBuiltins/ARM.cpp + TargetBuiltins/AMDGPU.cpp + TargetBuiltins/Hexagon.cpp + TargetBuiltins/NVPTX.cpp + TargetBuiltins/PPC.cpp + TargetBuiltins/RISCV.cpp + TargetBuiltins/SPIR.cpp + TargetBuiltins/SystemZ.cpp + TargetBuiltins/WebAssembly.cpp + TargetBuiltins/X86.cpp TargetInfo.cpp + Targets/AArch64.cpp + Targets/AMDGPU.cpp + Targets/ARC.cpp + Targets/ARM.cpp + Targets/AVR.cpp + Targets/BPF.cpp + Targets/CSKY.cpp + Targets/DirectX.cpp + Targets/Hexagon.cpp + Targets/Lanai.cpp + Targets/LoongArch.cpp + Targets/M68k.cpp + Targets/MSP430.cpp + Targets/Mips.cpp + Targets/NVPTX.cpp + Targets/PNaCl.cpp + Targets/PPC.cpp + Targets/RISCV.cpp + Targets/SPIR.cpp + Targets/Sparc.cpp + Targets/SystemZ.cpp + Targets/TCE.cpp + Targets/VE.cpp + Targets/WebAssembly.cpp + Targets/X86.cpp + Targets/XCore.cpp VarBypassDetector.cpp + DEPENDS vt_gen intrinsics_gen @@ -133,16 +170,4 @@ add_clang_library(clangCodeGen clangFrontend clangLex clangSerialization - clangCodeGenTargetBuiltins - clangCodeGenTargets - ) - - target_include_directories(clangCodeGen - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/TargetBuiltins - ${CMAKE_CURRENT_SOURCE_DIR}/Targets ) - - add_subdirectory(TargetBuiltins) - add_subdirectory(Targets) diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 577fee05d4af6..b56b739094ff3 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1,4 +1,4 @@ -//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===// +//===------- AMDCPU.cpp - Emit LLVM Code for 
builtins ---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt b/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt deleted file mode 100644 index 8526c063b4593..0000000000000 --- a/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) - -add_clang_library(clangCodeGenTargetBuiltins STATIC - ARM.cpp - AMDGPU.cpp - Hexagon.cpp - NVPTX.cpp - PPC.cpp - RISCV.cpp - SPIR.cpp - SystemZ.cpp - WebAssembly.cpp - X86.cpp -) - -target_link_libraries(clangCodeGenTargetBuiltins - PRIVATE - clangCodeGen -) diff --git a/clang/lib/CodeGen/Targets/CMakeLists.txt b/clang/lib/CodeGen/Targets/CMakeLists.txt deleted file mode 100644 index fd79b6191b379..0000000000000 --- a/clang/lib/CodeGen/Targets/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) - -add_clang_library(clangCodeGenTargets STATIC - AArch64.cpp - AMDGPU.cpp - ARC.cpp - ARM.cpp - AVR.cpp - BPF.cpp - CSKY.cpp - DirectX.cpp - Hexagon.cpp - Lanai.cpp - LoongArch.cpp - M68k.cpp - MSP430.cpp - Mips.cpp - NVPTX.cpp - PNaCl.cpp - PPC.cpp - RISCV.cpp - SPIR.cpp - Sparc.cpp - SystemZ.cpp - TCE.cpp - VE.cpp - WebAssembly.cpp - X86.cpp - XCore.cpp -) - -target_link_libraries(clangCodeGenTargets - PRIVATE - clangCodeGen -) From 32f24029c72dae175c9e2cc81f931f065a2ba347 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 31 Mar 2025 22:27:39 +0100 Subject: [PATCH 0158/1029] Reapply "[EquivalenceClasses] Replace findValue with contains (NFC)." This reverts the revert commit 616f447fc84bdc7655117f1b303d895dc3b93e4d. It includes updates to remaining users in Polly and Clang, to avoid failures when building those projects. --- .../Analysis/FlowSensitive/SimplifyConstraints.cpp | 12 +++++------- llvm/include/llvm/ADT/EquivalenceClasses.h | 7 +++---- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 2 +- polly/lib/Analysis/ScopBuilder.cpp | 3 +-- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp b/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp index cc20202768b92..02ec0d0213300 100644 --- a/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp +++ b/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp @@ -64,9 +64,9 @@ projectToLeaders(const llvm::DenseSet &Atoms, // `LeaderIt`. 
static llvm::SmallVector atomsInEquivalenceClass(const llvm::EquivalenceClasses &EquivalentAtoms, - llvm::EquivalenceClasses::iterator LeaderIt) { + const Atom &At) { llvm::SmallVector Result; - for (auto MemberIt = EquivalentAtoms.member_begin(LeaderIt); + for (auto MemberIt = EquivalentAtoms.findLeader(At); MemberIt != EquivalentAtoms.member_end(); ++MemberIt) Result.push_back(*MemberIt); return Result; @@ -159,19 +159,17 @@ void simplifyConstraints(llvm::SetVector &Constraints, if (TrueAtoms.contains(At) || FalseAtoms.contains(At)) continue; llvm::SmallVector Atoms = - atomsInEquivalenceClass(EquivalentAtoms, It); + atomsInEquivalenceClass(EquivalentAtoms, At); if (Atoms.size() == 1) continue; std::sort(Atoms.begin(), Atoms.end()); Info->EquivalentAtoms.push_back(std::move(Atoms)); } for (Atom At : TrueAtoms) - Info->TrueAtoms.append(atomsInEquivalenceClass( - EquivalentAtoms, EquivalentAtoms.findValue(At))); + Info->TrueAtoms.append(atomsInEquivalenceClass(EquivalentAtoms, At)); std::sort(Info->TrueAtoms.begin(), Info->TrueAtoms.end()); for (Atom At : FalseAtoms) - Info->FalseAtoms.append(atomsInEquivalenceClass( - EquivalentAtoms, EquivalentAtoms.findValue(At))); + Info->FalseAtoms.append(atomsInEquivalenceClass(EquivalentAtoms, At)); std::sort(Info->FalseAtoms.begin(), Info->FalseAtoms.end()); } } diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index c375d6e77b12a..f9c7819f18806 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -179,10 +179,9 @@ class EquivalenceClasses { return member_iterator(nullptr); } - /// findValue - Return an iterator to the specified value. If it does not - /// exist, end() is returned. - iterator findValue(const ElemTy &V) const { - return TheMapping.find(V); + /// Returns true if \p V is contained an equivalence class. + bool contains(const ElemTy &V) const { + return TheMapping.find(V) != TheMapping.end(); } /// getLeaderValue - Return the leader for the specified value that is in the diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e7d6984caeba3..47ff31b9a0525 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1230,7 +1230,7 @@ bool AccessAnalysis::canCheckPtrAtRT( [this](const Value *Ptr) { MemAccessInfo AccessWrite(const_cast(Ptr), true); - return DepCands.findValue(AccessWrite) == DepCands.end(); + return !DepCands.contains(AccessWrite); })) && "Can only skip updating CanDoRT below, if all entries in AS " "are reads or there is at most 1 entry"); diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 351eab7f93710..c0babb85f5c46 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -1856,8 +1856,7 @@ static void joinOperandTree(EquivalenceClasses &UnionFind, continue; // Check if OpInst is in the BB and is a modeled instruction. 
- auto OpVal = UnionFind.findValue(OpInst); - if (OpVal == UnionFind.end()) + if (!UnionFind.contains(OpInst)) continue; UnionFind.unionSets(Inst, OpInst); From f30c6a047d73322ce7f4fba4367eadc49565827d Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Mon, 31 Mar 2025 14:38:58 -0700 Subject: [PATCH 0159/1029] [bazel] Format BUILD files with buildifier (#133802) --- utils/bazel/llvm-project-overlay/bolt/BUILD.bazel | 2 +- .../clang-tools-extra/include-cleaner/BUILD.bazel | 2 +- utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel | 4 ++-- .../llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel | 4 ++-- utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 4 ++-- .../bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel index 7595404cb10a9..b5cd6dbcbd4f9 100644 --- a/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/bolt/BUILD.bazel @@ -70,8 +70,8 @@ cc_binary( ":Profile", ":Rewrite", ":RuntimeLibs", - ":TargetConfig", ":TargetAArch64", + ":TargetConfig", ":TargetX86", ":Utils", "//llvm:AllTargetsAsmParsers", diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel index 5b210ad80c874..e34fbe55cb781 100644 --- a/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/include-cleaner/BUILD.bazel @@ -119,6 +119,7 @@ expand_template( lit_test( name = "%s.test" % src, srcs = [src], + args = ["-svv"], data = glob(["test/Inputs/**/*"]) + [ "test/lit.cfg.py", "test/lit.site.cfg.py", @@ -127,7 +128,6 @@ expand_template( "//llvm:count", "//llvm:not", ], - args = ["-svv"], ) for src in glob(["test/*.cpp"]) ] diff --git a/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel b/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel index 765cde42dd062..bcf947a230144 100644 --- a/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/compiler-rt/BUILD.bazel @@ -67,11 +67,11 @@ cc_library( "lib/orc/error.h", "lib/orc/executor_address.h", "lib/orc/executor_symbol_def.h", - "lib/orc/rtti.h", "lib/orc/interval_map.h", "lib/orc/interval_set.h", "lib/orc/jit_dispatch.h", "lib/orc/record_section_tracker.h", + "lib/orc/rtti.h", "lib/orc/simple_packed_serialization.h", "lib/orc/stl_extras.h", "lib/orc/string_pool.h", @@ -86,8 +86,8 @@ cc_library( srcs = [ "lib/orc/debug.cpp", "lib/orc/dlfcn_wrapper.cpp", - "lib/orc/rtti.cpp", "lib/orc/log_error_to_stderr.cpp", + "lib/orc/rtti.cpp", "lib/orc/run_program_wrapper.cpp", ] + select({ "@platforms//os:macos": [ diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel index ee27d6398cee5..cf25376878347 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel @@ -209,12 +209,12 @@ math_test( # name = "f16fma", # hdrs = ["FmaTest.h"], # ) -# +# # math_test( # name = "f16fmaf", # hdrs = ["FmaTest.h"], # ) -# +# # math_test( # name = "f16fmal", # hdrs = ["FmaTest.h"], diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 
ba1dd4b4f60e2..f29cc028c52c0 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -664,10 +664,10 @@ cc_library( "//mlir:SPIRVDialect", "//mlir:SideEffectInterfaces", "//mlir:Support", - "//mlir:TransformUtils", - "//mlir:Transforms", "//mlir:TransformDialect", "//mlir:TransformDialectInterfaces", + "//mlir:TransformUtils", + "//mlir:Transforms", ], ) diff --git a/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel index adc2a000786aa..b8d136c174bd4 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel @@ -9,13 +9,13 @@ package(default_visibility = ["//visibility:public"]) name = "%s.test" % src, srcs = [src], data = [ + "Vector/vector-sink-transform.mlir", "//llvm:llvm-symbolizer", "//mlir:mlir-opt", "//mlir:mlir-pdll", "//mlir:mlir-reduce", "//mlir:mlir-translate", "//mlir/test:lit_data", - "Vector/vector-sink-transform.mlir", ] + glob([ "IRDL/*.irdl.mlir", "Linalg/td/*.mlir", From 6ff33edcdc29be049829934399b2fb2585252439 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Mon, 31 Mar 2025 14:59:41 -0700 Subject: [PATCH 0160/1029] [alpha.webkit.NoUnretainedMemberChecker] Ignore system-header-defined ivar / property of a forward declared type (#133755) Prior to this PR, we were emitting warnings for Objective-C ivars and properties if the forward declaration of the type appeared first in a non-system header. This PR fixes the checker so tha we'd ignore ivars and properties defined for a forward declared type. --- .../StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp | 4 ++++ clang/test/Analysis/Checkers/WebKit/unretained-members.mm | 2 ++ 2 files changed, 6 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp index 89df1a725ab92..a003fc200727c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp @@ -133,6 +133,8 @@ class RawPtrRefMemberChecker void visitIvarDecl(const ObjCContainerDecl *CD, const ObjCIvarDecl *Ivar) const { + if (BR->getSourceManager().isInSystemHeader(Ivar->getLocation())) + return; auto QT = Ivar->getType(); const Type *IvarType = QT.getTypePtrOrNull(); if (!IvarType) @@ -154,6 +156,8 @@ class RawPtrRefMemberChecker void visitObjCPropertyDecl(const ObjCContainerDecl *CD, const ObjCPropertyDecl *PD) const { + if (BR->getSourceManager().isInSystemHeader(PD->getLocation())) + return; auto QT = PD->getType(); const Type *PropType = QT.getTypePtrOrNull(); if (!PropType) diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members.mm index 92d70a94427c0..fff1f8ede091b 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-members.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-members.mm @@ -1,5 +1,7 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUnretainedMemberChecker -verify %s +@class SystemObject; + #include "objc-mock-types.h" #include "mock-system-header.h" From 7793bae97d2bad36d870c6df438a6fc034f15112 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 31 Mar 2025 15:30:05 -0700 Subject: [PATCH 0161/1029] [workflows] Add missing -y option to apt-get for abi tests (#133337) --- 
 .github/workflows/libclang-abi-tests.yml | 4 ++--
 .github/workflows/llvm-tests.yml         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 27cb7c4f75513..4d47c07f42205 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -104,7 +104,7 @@ jobs:
       - name: Install abi-compliance-checker
         run: |
           sudo apt-get update
-          sudo apt-get install abi-dumper autoconf pkg-config
+          sudo apt-get install -y abi-dumper autoconf pkg-config
       - name: Install universal-ctags
         run: |
           git clone https://github.com/universal-ctags/ctags.git
@@ -157,7 +157,7 @@ jobs:
       - name: Install abi-compliance-checker
         run: |
           sudo apt-get update
-          sudo apt-get install abi-compliance-checker
+          sudo apt-get install -y abi-compliance-checker
       - name: Compare ABI
         run: |
           for lib in ${{ needs.abi-dump-setup.outputs.ABI_LIBS }}; do

diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml
index 3b44ef6dca502..a9bd8db462cf7 100644
--- a/.github/workflows/llvm-tests.yml
+++ b/.github/workflows/llvm-tests.yml
@@ -92,7 +92,7 @@ jobs:
       - name: Install abi-compliance-checker
         run: |
           sudo apt-get update
-          sudo apt-get install abi-dumper autoconf pkg-config
+          sudo apt-get -y install abi-dumper autoconf pkg-config
       - name: Install universal-ctags
         run: |
           git clone https://github.com/universal-ctags/ctags.git
@@ -166,7 +166,7 @@ jobs:
       - name: Install abi-compliance-checker
         run: |
           sudo apt-get update
-          sudo apt-get install abi-compliance-checker
+          sudo apt-get -y install abi-compliance-checker
       - name: Compare ABI
         run: |
           if [ -s symbol-list/llvm.symbols ]; then

From 4492632432190ed8ab3bc39ff8ee5ba9a89256cf Mon Sep 17 00:00:00 2001
From: John Harrison
Date: Mon, 31 Mar 2025 15:51:07 -0700
Subject: [PATCH 0162/1029] [lldb-dap] Do not take ownership of stdin. (#133811)

There isn't any benefit to taking ownership of stdin, and it may cause
issues if `Transport` is deallocated.
---
 lldb/tools/lldb-dap/lldb-dap.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index 062c3a5f989f3..b91c62e921428 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -571,9 +571,9 @@ int main(int argc, char *argv[]) {
   }

   lldb::IOObjectSP input = std::make_shared<NativeFile>(
-      fileno(stdin), File::eOpenOptionReadOnly, true);
+      fileno(stdin), File::eOpenOptionReadOnly, NativeFile::Unowned);
   lldb::IOObjectSP output = std::make_shared<NativeFile>(
-      stdout_fd, File::eOpenOptionWriteOnly, false);
+      stdout_fd, File::eOpenOptionWriteOnly, NativeFile::Unowned);
   constexpr llvm::StringLiteral client_name = "stdin/stdout";
   Transport transport(client_name, log.get(), input, output);

From 40c859a704399c04c74311bdd25144a78e2eb093 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 31 Mar 2025 15:58:36 -0700
Subject: [PATCH 0163/1029] [TableGen] Use size returned by encodeULEB128 to
 simplify some code. NFC (#133750)

We can use the length to insert all the bytes at once instead of
partially decoding them to insert one byte at a time.
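
Concretely, `encodeULEB128` (from `llvm/Support/LEB128.h`) returns the number
of bytes it wrote, so the emitter can append the whole encoding in one call
instead of re-scanning continuation bits. A minimal sketch of the idiom, not
the exact emitter code -- `appendULEB128` is an illustrative helper introduced
here, not part of this patch:

```cpp
#include "llvm/Support/LEB128.h"

#include <cstdint>
#include <vector>

// Append a ULEB128-encoded value to a byte table in one call. The length
// returned by encodeULEB128 replaces the old loop that re-checked each
// byte's continuation bit (a byte >= 128 means "more bytes follow").
static void appendULEB128(std::vector<uint8_t> &Table, uint64_t Value) {
  uint8_t Buffer[16];
  unsigned Len = llvm::encodeULEB128(Value, Buffer);
  Table.insert(Table.end(), Buffer, Buffer + Len);
}
```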
--- llvm/utils/TableGen/DecoderEmitter.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index cf7c02db8842e..ecf9c84f86a6d 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -1430,16 +1430,12 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, unsigned NumBits = Islands[I - 1].NumBits; assert(isUInt<8>(NumBits) && "NumBits overflowed uint8 table entry!"); TableInfo.Table.push_back(MCD::OPC_CheckField); - uint8_t Buffer[16], *P; - encodeULEB128(Islands[I - 1].StartBit, Buffer); - for (P = Buffer; *P >= 128; ++P) - TableInfo.Table.push_back(*P); - TableInfo.Table.push_back(*P); + uint8_t Buffer[16]; + unsigned Len = encodeULEB128(Islands[I - 1].StartBit, Buffer); + TableInfo.Table.insert(TableInfo.Table.end(), Buffer, Buffer + Len); TableInfo.Table.push_back(NumBits); - encodeULEB128(Islands[I - 1].FieldVal, Buffer); - for (P = Buffer; *P >= 128; ++P) - TableInfo.Table.push_back(*P); - TableInfo.Table.push_back(*P); + Len = encodeULEB128(Islands[I - 1].FieldVal, Buffer); + TableInfo.Table.insert(TableInfo.Table.end(), Buffer, Buffer + Len); // Push location for NumToSkip backpatching. TableInfo.FixupStack.back().push_back(TableInfo.Table.size()); // The fixup is always 24-bits, so go ahead and allocate the space @@ -1469,11 +1465,9 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, TableInfo.Table.push_back(HasCompleteDecoder ? MCD::OPC_Decode : MCD::OPC_TryDecode); NumEncodingsSupported++; - uint8_t Buffer[16], *p; - encodeULEB128(Opc.Opcode, Buffer); - for (p = Buffer; *p >= 128; ++p) - TableInfo.Table.push_back(*p); - TableInfo.Table.push_back(*p); + uint8_t Buffer[16]; + unsigned Len = encodeULEB128(Opc.Opcode, Buffer); + TableInfo.Table.insert(TableInfo.Table.end(), Buffer, Buffer + Len); SmallString<16> Bytes; raw_svector_ostream S(Bytes); From 46457ed1dfbfaf4ccc9245813450ba3fd561f067 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 31 Mar 2025 16:04:31 -0700 Subject: [PATCH 0164/1029] [lldb] Convert Breakpoint & Watchpoints structs to classes (NFC) (#133780) Convert Breakpoint & Watchpoints structs to classes to provide proper access control. This is in preparation for adopting SBMutex to protect the underlying SBBreakpoint and SBWatchpoint. 
--- lldb/tools/lldb-dap/Breakpoint.cpp | 30 ++++++----- lldb/tools/lldb-dap/Breakpoint.h | 14 +++-- lldb/tools/lldb-dap/BreakpointBase.cpp | 14 ++--- lldb/tools/lldb-dap/BreakpointBase.h | 25 +++++---- lldb/tools/lldb-dap/DAP.cpp | 8 +-- lldb/tools/lldb-dap/DAP.h | 2 +- lldb/tools/lldb-dap/DAPForward.h | 16 +++--- lldb/tools/lldb-dap/ExceptionBreakpoint.cpp | 18 +++---- lldb/tools/lldb-dap/ExceptionBreakpoint.h | 27 ++++++---- lldb/tools/lldb-dap/FunctionBreakpoint.cpp | 6 +-- lldb/tools/lldb-dap/FunctionBreakpoint.h | 12 +++-- .../Handler/ExceptionInfoRequestHandler.cpp | 4 +- .../Handler/SetBreakpointsRequestHandler.cpp | 7 +-- .../SetDataBreakpointsRequestHandler.cpp | 4 +- .../SetExceptionBreakpointsRequestHandler.cpp | 4 +- .../SetFunctionBreakpointsRequestHandler.cpp | 8 +-- ...etInstructionBreakpointsRequestHandler.cpp | 6 +-- lldb/tools/lldb-dap/InstructionBreakpoint.cpp | 12 ++--- lldb/tools/lldb-dap/InstructionBreakpoint.h | 19 ++++--- lldb/tools/lldb-dap/JSONUtils.cpp | 8 +-- lldb/tools/lldb-dap/SourceBreakpoint.cpp | 50 +++++++++--------- lldb/tools/lldb-dap/SourceBreakpoint.h | 52 ++++++++++--------- lldb/tools/lldb-dap/Watchpoint.cpp | 27 +++++----- lldb/tools/lldb-dap/Watchpoint.h | 22 ++++---- 24 files changed, 217 insertions(+), 178 deletions(-) diff --git a/lldb/tools/lldb-dap/Breakpoint.cpp b/lldb/tools/lldb-dap/Breakpoint.cpp index eba534dcc51c7..e02f62076f935 100644 --- a/lldb/tools/lldb-dap/Breakpoint.cpp +++ b/lldb/tools/lldb-dap/Breakpoint.cpp @@ -19,21 +19,21 @@ using namespace lldb_dap; -void Breakpoint::SetCondition() { bp.SetCondition(condition.c_str()); } +void Breakpoint::SetCondition() { m_bp.SetCondition(m_condition.c_str()); } void Breakpoint::SetHitCondition() { uint64_t hitCount = 0; - if (llvm::to_integer(hitCondition, hitCount)) - bp.SetIgnoreCount(hitCount - 1); + if (llvm::to_integer(m_hit_condition, hitCount)) + m_bp.SetIgnoreCount(hitCount - 1); } void Breakpoint::CreateJsonObject(llvm::json::Object &object) { // Each breakpoint location is treated as a separate breakpoint for VS code. // They don't have the notion of a single breakpoint with multiple locations. - if (!bp.IsValid()) + if (!m_bp.IsValid()) return; - object.try_emplace("verified", bp.GetNumResolvedLocations() > 0); - object.try_emplace("id", bp.GetID()); + object.try_emplace("verified", m_bp.GetNumResolvedLocations() > 0); + object.try_emplace("id", m_bp.GetID()); // VS Code DAP doesn't currently allow one breakpoint to have multiple // locations so we just report the first one. If we report all locations // then the IDE starts showing the wrong line numbers and locations for @@ -43,20 +43,20 @@ void Breakpoint::CreateJsonObject(llvm::json::Object &object) { // this as the breakpoint location since it will have a complete location // that is at least loaded in the current process. lldb::SBBreakpointLocation bp_loc; - const auto num_locs = bp.GetNumLocations(); + const auto num_locs = m_bp.GetNumLocations(); for (size_t i = 0; i < num_locs; ++i) { - bp_loc = bp.GetLocationAtIndex(i); + bp_loc = m_bp.GetLocationAtIndex(i); if (bp_loc.IsResolved()) break; } // If not locations are resolved, use the first location. 
if (!bp_loc.IsResolved()) - bp_loc = bp.GetLocationAtIndex(0); + bp_loc = m_bp.GetLocationAtIndex(0); auto bp_addr = bp_loc.GetAddress(); if (bp_addr.IsValid()) { std::string formatted_addr = - "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(bp.GetTarget())); + "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(m_bp.GetTarget())); object.try_emplace("instructionReference", formatted_addr); auto line_entry = bp_addr.GetLineEntry(); const auto line = line_entry.GetLine(); @@ -69,12 +69,14 @@ void Breakpoint::CreateJsonObject(llvm::json::Object &object) { } } -bool Breakpoint::MatchesName(const char *name) { return bp.MatchesName(name); } +bool Breakpoint::MatchesName(const char *name) { + return m_bp.MatchesName(name); +} void Breakpoint::SetBreakpoint() { - bp.AddName(kDAPBreakpointLabel); - if (!condition.empty()) + m_bp.AddName(kDAPBreakpointLabel); + if (!m_condition.empty()) SetCondition(); - if (!hitCondition.empty()) + if (!m_hit_condition.empty()) SetHitCondition(); } diff --git a/lldb/tools/lldb-dap/Breakpoint.h b/lldb/tools/lldb-dap/Breakpoint.h index a726f27e59ee0..580017125af44 100644 --- a/lldb/tools/lldb-dap/Breakpoint.h +++ b/lldb/tools/lldb-dap/Breakpoint.h @@ -15,12 +15,12 @@ namespace lldb_dap { -struct Breakpoint : public BreakpointBase { - // The LLDB breakpoint associated wit this source breakpoint - lldb::SBBreakpoint bp; - +class Breakpoint : public BreakpointBase { +public: Breakpoint(DAP &d, const llvm::json::Object &obj) : BreakpointBase(d, obj) {} - Breakpoint(DAP &d, lldb::SBBreakpoint bp) : BreakpointBase(d), bp(bp) {} + Breakpoint(DAP &d, lldb::SBBreakpoint bp) : BreakpointBase(d), m_bp(bp) {} + + lldb::break_id_t GetID() const { return m_bp.GetID(); } void SetCondition() override; void SetHitCondition() override; @@ -28,6 +28,10 @@ struct Breakpoint : public BreakpointBase { bool MatchesName(const char *name); void SetBreakpoint(); + +protected: + /// The LLDB breakpoint associated wit this source breakpoint. + lldb::SBBreakpoint m_bp; }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/BreakpointBase.cpp b/lldb/tools/lldb-dap/BreakpointBase.cpp index 15fecaf691199..331ce8efee9bc 100644 --- a/lldb/tools/lldb-dap/BreakpointBase.cpp +++ b/lldb/tools/lldb-dap/BreakpointBase.cpp @@ -13,16 +13,18 @@ using namespace lldb_dap; BreakpointBase::BreakpointBase(DAP &d, const llvm::json::Object &obj) - : dap(d), condition(std::string(GetString(obj, "condition").value_or(""))), - hitCondition(std::string(GetString(obj, "hitCondition").value_or(""))) {} + : m_dap(d), + m_condition(std::string(GetString(obj, "condition").value_or(""))), + m_hit_condition( + std::string(GetString(obj, "hitCondition").value_or(""))) {} void BreakpointBase::UpdateBreakpoint(const BreakpointBase &request_bp) { - if (condition != request_bp.condition) { - condition = request_bp.condition; + if (m_condition != request_bp.m_condition) { + m_condition = request_bp.m_condition; SetCondition(); } - if (hitCondition != request_bp.hitCondition) { - hitCondition = request_bp.hitCondition; + if (m_hit_condition != request_bp.m_hit_condition) { + m_hit_condition = request_bp.m_hit_condition; SetHitCondition(); } } diff --git a/lldb/tools/lldb-dap/BreakpointBase.h b/lldb/tools/lldb-dap/BreakpointBase.h index 0b036dd1985b3..4c13326624831 100644 --- a/lldb/tools/lldb-dap/BreakpointBase.h +++ b/lldb/tools/lldb-dap/BreakpointBase.h @@ -15,17 +15,9 @@ namespace lldb_dap { -struct BreakpointBase { - // Associated DAP session. - DAP &dap; - - // An optional expression for conditional breakpoints. 
- std::string condition; - // An optional expression that controls how many hits of the breakpoint are - // ignored. The backend is expected to interpret the expression as needed - std::string hitCondition; - - explicit BreakpointBase(DAP &d) : dap(d) {} +class BreakpointBase { +public: + explicit BreakpointBase(DAP &d) : m_dap(d) {} BreakpointBase(DAP &d, const llvm::json::Object &obj); virtual ~BreakpointBase() = default; @@ -49,6 +41,17 @@ struct BreakpointBase { /// breakpoint in one of the DAP breakpoints that we should report changes /// for. static constexpr const char *kDAPBreakpointLabel = "dap"; + +protected: + /// Associated DAP session. + DAP &m_dap; + + /// An optional expression for conditional breakpoints. + std::string m_condition; + + /// An optional expression that controls how many hits of the breakpoint are + /// ignored. The backend is expected to interpret the expression as needed + std::string m_hit_condition; }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 512cabdf77880..8951384212f11 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -162,7 +162,7 @@ void DAP::PopulateExceptionBreakpoints() { }); } -ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) { +ExceptionBreakpoint *DAP::GetExceptionBreakpoint(llvm::StringRef filter) { // PopulateExceptionBreakpoints() is called after g_dap.debugger is created // in a request-initialize. // @@ -181,7 +181,7 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const std::string &filter) { PopulateExceptionBreakpoints(); for (auto &bp : *exception_breakpoints) { - if (bp.filter == filter) + if (bp.GetFilter() == filter) return &bp; } return nullptr; @@ -192,7 +192,7 @@ ExceptionBreakpoint *DAP::GetExceptionBreakpoint(const lldb::break_id_t bp_id) { PopulateExceptionBreakpoints(); for (auto &bp : *exception_breakpoints) { - if (bp.bp.GetID() == bp_id) + if (bp.GetID() == bp_id) return &bp; } return nullptr; @@ -1066,7 +1066,7 @@ void DAP::SetThreadFormat(llvm::StringRef format) { InstructionBreakpoint * DAP::GetInstructionBreakpoint(const lldb::break_id_t bp_id) { for (auto &bp : instruction_breakpoints) { - if (bp.second.bp.GetID() == bp_id) + if (bp.second.GetID() == bp_id) return &bp.second; } return nullptr; diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index 6689980806047..4357bdd5cc80f 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -236,7 +236,7 @@ struct DAP { void operator=(const DAP &rhs) = delete; /// @} - ExceptionBreakpoint *GetExceptionBreakpoint(const std::string &filter); + ExceptionBreakpoint *GetExceptionBreakpoint(llvm::StringRef filter); ExceptionBreakpoint *GetExceptionBreakpoint(const lldb::break_id_t bp_id); /// Redirect stdout and stderr fo the IDE's console output. 
diff --git a/lldb/tools/lldb-dap/DAPForward.h b/lldb/tools/lldb-dap/DAPForward.h index 58e034ed1cc77..6620d5fd33642 100644 --- a/lldb/tools/lldb-dap/DAPForward.h +++ b/lldb/tools/lldb-dap/DAPForward.h @@ -12,16 +12,16 @@ // IWYU pragma: begin_exports namespace lldb_dap { -struct BreakpointBase; -struct ExceptionBreakpoint; -struct FunctionBreakpoint; -struct SourceBreakpoint; -struct Watchpoint; -struct InstructionBreakpoint; -struct DAP; -class Log; class BaseRequestHandler; +class BreakpointBase; +class ExceptionBreakpoint; +class FunctionBreakpoint; +class InstructionBreakpoint; +class Log; class ResponseHandler; +class SourceBreakpoint; +class Watchpoint; +struct DAP; } // namespace lldb_dap namespace lldb { diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp index 15aee55ad923e..d8109daf89129 100644 --- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp @@ -14,20 +14,20 @@ namespace lldb_dap { void ExceptionBreakpoint::SetBreakpoint() { - if (bp.IsValid()) + if (m_bp.IsValid()) return; - bool catch_value = filter.find("_catch") != std::string::npos; - bool throw_value = filter.find("_throw") != std::string::npos; - bp = dap.target.BreakpointCreateForException(language, catch_value, - throw_value); - bp.AddName(BreakpointBase::kDAPBreakpointLabel); + bool catch_value = m_filter.find("_catch") != std::string::npos; + bool throw_value = m_filter.find("_throw") != std::string::npos; + m_bp = m_dap.target.BreakpointCreateForException(m_language, catch_value, + throw_value); + m_bp.AddName(BreakpointBase::kDAPBreakpointLabel); } void ExceptionBreakpoint::ClearBreakpoint() { - if (!bp.IsValid()) + if (!m_bp.IsValid()) return; - dap.target.BreakpointDelete(bp.GetID()); - bp = lldb::SBBreakpoint(); + m_dap.target.BreakpointDelete(m_bp.GetID()); + m_bp = lldb::SBBreakpoint(); } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.h b/lldb/tools/lldb-dap/ExceptionBreakpoint.h index b83c5ef777352..319b472a89a34 100644 --- a/lldb/tools/lldb-dap/ExceptionBreakpoint.h +++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.h @@ -12,25 +12,34 @@ #include "DAPForward.h" #include "lldb/API/SBBreakpoint.h" #include "lldb/lldb-enumerations.h" +#include "llvm/ADT/StringRef.h" #include #include namespace lldb_dap { -struct ExceptionBreakpoint { - DAP &dap; - std::string filter; - std::string label; - lldb::LanguageType language; - bool default_value = false; - lldb::SBBreakpoint bp; +class ExceptionBreakpoint { +public: ExceptionBreakpoint(DAP &d, std::string f, std::string l, lldb::LanguageType lang) - : dap(d), filter(std::move(f)), label(std::move(l)), language(lang), - bp() {} + : m_dap(d), m_filter(std::move(f)), m_label(std::move(l)), + m_language(lang), m_bp() {} void SetBreakpoint(); void ClearBreakpoint(); + + lldb::break_id_t GetID() const { return m_bp.GetID(); } + llvm::StringRef GetFilter() const { return m_filter; } + llvm::StringRef GetLabel() const { return m_label; } + + static constexpr bool kDefaultValue = false; + +protected: + DAP &m_dap; + std::string m_filter; + std::string m_label; + lldb::LanguageType m_language; + lldb::SBBreakpoint m_bp; }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp index cafae32b662f2..2fb6e8fafc2fa 100644 --- a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp @@ -14,12 +14,12 @@ namespace lldb_dap { 
FunctionBreakpoint::FunctionBreakpoint(DAP &d, const llvm::json::Object &obj) : Breakpoint(d, obj), - functionName(std::string(GetString(obj, "name").value_or(""))) {} + m_function_name(std::string(GetString(obj, "name").value_or(""))) {} void FunctionBreakpoint::SetBreakpoint() { - if (functionName.empty()) + if (m_function_name.empty()) return; - bp = dap.target.BreakpointCreateByName(functionName.c_str()); + m_bp = m_dap.target.BreakpointCreateByName(m_function_name.c_str()); Breakpoint::SetBreakpoint(); } diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.h b/lldb/tools/lldb-dap/FunctionBreakpoint.h index 93f0b93b35291..7100360cd7ec1 100644 --- a/lldb/tools/lldb-dap/FunctionBreakpoint.h +++ b/lldb/tools/lldb-dap/FunctionBreakpoint.h @@ -14,13 +14,17 @@ namespace lldb_dap { -struct FunctionBreakpoint : public Breakpoint { - std::string functionName; - +class FunctionBreakpoint : public Breakpoint { +public: FunctionBreakpoint(DAP &dap, const llvm::json::Object &obj); - // Set this breakpoint in LLDB as a new breakpoint + /// Set this breakpoint in LLDB as a new breakpoint. void SetBreakpoint(); + + llvm::StringRef GetFunctionName() const { return m_function_name; } + +protected: + std::string m_function_name; }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp index 2f4d4efd1b189..924ea63ed1593 100644 --- a/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/ExceptionInfoRequestHandler.cpp @@ -125,8 +125,8 @@ void ExceptionInfoRequestHandler::operator()( else if (stopReason == lldb::eStopReasonBreakpoint) { ExceptionBreakpoint *exc_bp = dap.GetExceptionBPFromStopReason(thread); if (exc_bp) { - EmplaceSafeString(body, "exceptionId", exc_bp->filter); - EmplaceSafeString(body, "description", exc_bp->label); + EmplaceSafeString(body, "exceptionId", exc_bp->GetFilter()); + EmplaceSafeString(body, "description", exc_bp->GetLabel()); } else { body.try_emplace("exceptionId", "exception"); } diff --git a/lldb/tools/lldb-dap/Handler/SetBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetBreakpointsRequestHandler.cpp index 5ca2c9c01965e..dc0368852101f 100644 --- a/lldb/tools/lldb-dap/Handler/SetBreakpointsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetBreakpointsRequestHandler.cpp @@ -143,7 +143,8 @@ void SetBreakpointsRequestHandler::operator()( const auto *bp_obj = bp.getAsObject(); if (bp_obj) { SourceBreakpoint src_bp(dap, *bp_obj); - std::pair bp_pos(src_bp.line, src_bp.column); + std::pair bp_pos(src_bp.GetLine(), + src_bp.GetColumn()); request_bps.try_emplace(bp_pos, src_bp); const auto [iv, inserted] = dap.source_breakpoints[path].try_emplace(bp_pos, src_bp); @@ -153,7 +154,7 @@ void SetBreakpointsRequestHandler::operator()( else iv->getSecond().UpdateBreakpoint(src_bp); AppendBreakpoint(&iv->getSecond(), response_breakpoints, path, - src_bp.line); + src_bp.GetLine()); } } } @@ -167,7 +168,7 @@ void SetBreakpointsRequestHandler::operator()( auto request_pos = request_bps.find(old_bp.first); if (request_pos == request_bps.end()) { // This breakpoint no longer exists in this source file, delete it - dap.target.BreakpointDelete(old_bp.second.bp.GetID()); + dap.target.BreakpointDelete(old_bp.second.GetID()); old_src_bp_pos->second.erase(old_bp.first); } } diff --git a/lldb/tools/lldb-dap/Handler/SetDataBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetDataBreakpointsRequestHandler.cpp index 
87310131255e1..365c9f0d722d4 100644 --- a/lldb/tools/lldb-dap/Handler/SetDataBreakpointsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetDataBreakpointsRequestHandler.cpp @@ -97,9 +97,9 @@ void SetDataBreakpointsRequestHandler::operator()( // backward. std::set addresses; for (auto iter = watchpoints.rbegin(); iter != watchpoints.rend(); ++iter) { - if (addresses.count(iter->addr) == 0) { + if (addresses.count(iter->GetAddress()) == 0) { iter->SetWatchpoint(); - addresses.insert(iter->addr); + addresses.insert(iter->GetAddress()); } } for (auto wp : watchpoints) diff --git a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp index 8be5d870a070f..09d4fea2a9a22 100644 --- a/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetExceptionBreakpointsRequestHandler.cpp @@ -70,9 +70,9 @@ void SetExceptionBreakpointsRequestHandler::operator()( const auto *filters = arguments->getArray("filters"); // Keep a list of any exception breakpoint filter names that weren't set // so we can clear any exception breakpoints if needed. - std::set unset_filters; + std::set unset_filters; for (const auto &bp : *dap.exception_breakpoints) - unset_filters.insert(bp.filter); + unset_filters.insert(bp.GetFilter()); for (const auto &value : *filters) { const auto filter = GetAsString(value); diff --git a/lldb/tools/lldb-dap/Handler/SetFunctionBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetFunctionBreakpointsRequestHandler.cpp index 945df68936bac..c45dc0d0d6553 100644 --- a/lldb/tools/lldb-dap/Handler/SetFunctionBreakpointsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetFunctionBreakpointsRequestHandler.cpp @@ -110,15 +110,15 @@ void SetFunctionBreakpointsRequestHandler::operator()( if (!bp_obj) continue; FunctionBreakpoint fn_bp(dap, *bp_obj); - const auto [it, inserted] = - dap.function_breakpoints.try_emplace(fn_bp.functionName, dap, *bp_obj); + const auto [it, inserted] = dap.function_breakpoints.try_emplace( + fn_bp.GetFunctionName(), dap, *bp_obj); if (inserted) it->second.SetBreakpoint(); else it->second.UpdateBreakpoint(fn_bp); AppendBreakpoint(&it->second, response_breakpoints); - seen.erase(fn_bp.functionName); + seen.erase(fn_bp.GetFunctionName()); } // Remove any breakpoints that are no longer in our list @@ -126,7 +126,7 @@ void SetFunctionBreakpointsRequestHandler::operator()( auto fn_bp = dap.function_breakpoints.find(name); if (fn_bp == dap.function_breakpoints.end()) continue; - dap.target.BreakpointDelete(fn_bp->second.bp.GetID()); + dap.target.BreakpointDelete(fn_bp->second.GetID()); dap.function_breakpoints.erase(name); } diff --git a/lldb/tools/lldb-dap/Handler/SetInstructionBreakpointsRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetInstructionBreakpointsRequestHandler.cpp index b1e47942de8e6..4e555ad605a26 100644 --- a/lldb/tools/lldb-dap/Handler/SetInstructionBreakpointsRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetInstructionBreakpointsRequestHandler.cpp @@ -223,20 +223,20 @@ void SetInstructionBreakpointsRequestHandler::operator()( // Read instruction breakpoint request. 
InstructionBreakpoint inst_bp(dap, *bp_obj); const auto [iv, inserted] = dap.instruction_breakpoints.try_emplace( - inst_bp.instructionAddressReference, dap, *bp_obj); + inst_bp.GetInstructionAddressReference(), dap, *bp_obj); if (inserted) iv->second.SetBreakpoint(); else iv->second.UpdateBreakpoint(inst_bp); AppendBreakpoint(&iv->second, response_breakpoints); - seen.erase(inst_bp.instructionAddressReference); + seen.erase(inst_bp.GetInstructionAddressReference()); } for (const auto &addr : seen) { auto inst_bp = dap.instruction_breakpoints.find(addr); if (inst_bp == dap.instruction_breakpoints.end()) continue; - dap.target.BreakpointDelete(inst_bp->second.bp.GetID()); + dap.target.BreakpointDelete(inst_bp->second.GetID()); dap.instruction_breakpoints.erase(addr); } diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.cpp b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp index 710787625ec58..dfdc6319ac9e8 100644 --- a/lldb/tools/lldb-dap/InstructionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp @@ -16,19 +16,19 @@ namespace lldb_dap { -// Instruction Breakpoint InstructionBreakpoint::InstructionBreakpoint(DAP &d, const llvm::json::Object &obj) - : Breakpoint(d, obj), instructionAddressReference(LLDB_INVALID_ADDRESS), - offset(GetInteger(obj, "offset").value_or(0)) { + : Breakpoint(d, obj), m_instruction_address_reference(LLDB_INVALID_ADDRESS), + m_offset(GetInteger(obj, "offset").value_or(0)) { GetString(obj, "instructionReference") .value_or("") - .getAsInteger(0, instructionAddressReference); - instructionAddressReference += offset; + .getAsInteger(0, m_instruction_address_reference); + m_instruction_address_reference += m_offset; } void InstructionBreakpoint::SetBreakpoint() { - bp = dap.target.BreakpointCreateByAddress(instructionAddressReference); + m_bp = + m_dap.target.BreakpointCreateByAddress(m_instruction_address_reference); Breakpoint::SetBreakpoint(); } diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.h b/lldb/tools/lldb-dap/InstructionBreakpoint.h index b2e66a9db9e20..6ed980e00d038 100644 --- a/lldb/tools/lldb-dap/InstructionBreakpoint.h +++ b/lldb/tools/lldb-dap/InstructionBreakpoint.h @@ -17,16 +17,21 @@ namespace lldb_dap { -// Instruction Breakpoint -struct InstructionBreakpoint : public Breakpoint { - - lldb::addr_t instructionAddressReference; - int32_t offset; - +/// Instruction Breakpoint +class InstructionBreakpoint : public Breakpoint { +public: InstructionBreakpoint(DAP &d, const llvm::json::Object &obj); - // Set instruction breakpoint in LLDB as a new breakpoint + /// Set instruction breakpoint in LLDB as a new breakpoint. 
void SetBreakpoint(); + + lldb::addr_t GetInstructionAddressReference() const { + return m_instruction_address_reference; + } + +protected: + lldb::addr_t m_instruction_address_reference; + int32_t m_offset; }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 9773b91a35a45..590137e48199d 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -560,9 +560,9 @@ llvm::json::Object CreateEventObject(const llvm::StringRef event_name) { protocol::ExceptionBreakpointsFilter CreateExceptionBreakpointFilter(const ExceptionBreakpoint &bp) { protocol::ExceptionBreakpointsFilter filter; - filter.filter = bp.filter; - filter.label = bp.label; - filter.defaultState = bp.default_value; + filter.filter = bp.GetFilter(); + filter.label = bp.GetLabel(); + filter.defaultState = ExceptionBreakpoint::kDefaultValue; return filter; } @@ -940,7 +940,7 @@ llvm::json::Value CreateThreadStopped(DAP &dap, lldb::SBThread &thread, ExceptionBreakpoint *exc_bp = dap.GetExceptionBPFromStopReason(thread); if (exc_bp) { body.try_emplace("reason", "exception"); - EmplaceSafeString(body, "description", exc_bp->label); + EmplaceSafeString(body, "description", exc_bp->GetLabel()); } else { InstructionBreakpoint *inst_bp = dap.GetInstructionBPFromStopReason(thread); diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.cpp b/lldb/tools/lldb-dap/SourceBreakpoint.cpp index 4c6b36119c84f..150fa6af44d3a 100644 --- a/lldb/tools/lldb-dap/SourceBreakpoint.cpp +++ b/lldb/tools/lldb-dap/SourceBreakpoint.cpp @@ -26,24 +26,24 @@ namespace lldb_dap { SourceBreakpoint::SourceBreakpoint(DAP &dap, const llvm::json::Object &obj) : Breakpoint(dap, obj), - logMessage(GetString(obj, "logMessage").value_or("").str()), - line( + m_log_message(GetString(obj, "logMessage").value_or("").str()), + m_line( GetInteger(obj, "line").value_or(LLDB_INVALID_LINE_NUMBER)), - column(GetInteger(obj, "column") - .value_or(LLDB_INVALID_COLUMN_NUMBER)) {} + m_column(GetInteger(obj, "column") + .value_or(LLDB_INVALID_COLUMN_NUMBER)) {} void SourceBreakpoint::SetBreakpoint(const llvm::StringRef source_path) { lldb::SBFileSpecList module_list; - bp = dap.target.BreakpointCreateByLocation(source_path.str().c_str(), line, - column, 0, module_list); - if (!logMessage.empty()) + m_bp = m_dap.target.BreakpointCreateByLocation( + source_path.str().c_str(), m_line, m_column, 0, module_list); + if (!m_log_message.empty()) SetLogMessage(); Breakpoint::SetBreakpoint(); } void SourceBreakpoint::UpdateBreakpoint(const SourceBreakpoint &request_bp) { - if (logMessage != request_bp.logMessage) { - logMessage = request_bp.logMessage; + if (m_log_message != request_bp.m_log_message) { + m_log_message = request_bp.m_log_message; SetLogMessage(); } BreakpointBase::UpdateBreakpoint(request_bp); @@ -52,13 +52,13 @@ void SourceBreakpoint::UpdateBreakpoint(const SourceBreakpoint &request_bp) { lldb::SBError SourceBreakpoint::AppendLogMessagePart(llvm::StringRef part, bool is_expr) { if (is_expr) { - logMessageParts.emplace_back(part, is_expr); + m_log_message_parts.emplace_back(part, is_expr); } else { std::string formatted; lldb::SBError error = FormatLogText(part, formatted); if (error.Fail()) return error; - logMessageParts.emplace_back(formatted, is_expr); + m_log_message_parts.emplace_back(formatted, is_expr); } return lldb::SBError(); } @@ -195,7 +195,7 @@ lldb::SBError SourceBreakpoint::FormatLogText(llvm::StringRef text, // The function tries to parse logMessage into a list of 
LogMessageParts // for easy later access in BreakpointHitCallback. void SourceBreakpoint::SetLogMessage() { - logMessageParts.clear(); + m_log_message_parts.clear(); // Contains unmatched open curly braces indices. std::vector unmatched_curly_braces; @@ -209,10 +209,10 @@ void SourceBreakpoint::SetLogMessage() { // Part1 - parse matched_curly_braces_ranges. // locating all curly braced expression ranges in logMessage. // The algorithm takes care of nested and imbalanced curly braces. - for (size_t i = 0; i < logMessage.size(); ++i) { - if (logMessage[i] == '{') { + for (size_t i = 0; i < m_log_message.size(); ++i) { + if (m_log_message[i] == '{') { unmatched_curly_braces.push_back(i); - } else if (logMessage[i] == '}') { + } else if (m_log_message[i] == '}') { if (unmatched_curly_braces.empty()) // Nothing to match. continue; @@ -252,7 +252,7 @@ void SourceBreakpoint::SetLogMessage() { size_t raw_text_len = curly_braces_range.first - last_raw_text_start; if (raw_text_len > 0) { error = AppendLogMessagePart( - llvm::StringRef(logMessage.c_str() + last_raw_text_start, + llvm::StringRef(m_log_message.c_str() + last_raw_text_start, raw_text_len), /*is_expr=*/false); if (error.Fail()) { @@ -265,7 +265,7 @@ void SourceBreakpoint::SetLogMessage() { assert(curly_braces_range.second > curly_braces_range.first); size_t expr_len = curly_braces_range.second - curly_braces_range.first - 1; error = AppendLogMessagePart( - llvm::StringRef(logMessage.c_str() + curly_braces_range.first + 1, + llvm::StringRef(m_log_message.c_str() + curly_braces_range.first + 1, expr_len), /*is_expr=*/true); if (error.Fail()) { @@ -277,10 +277,10 @@ void SourceBreakpoint::SetLogMessage() { } // Trailing raw text after close curly brace. assert(last_raw_text_start >= 0); - if (logMessage.size() > (size_t)last_raw_text_start) { + if (m_log_message.size() > (size_t)last_raw_text_start) { error = AppendLogMessagePart( - llvm::StringRef(logMessage.c_str() + last_raw_text_start, - logMessage.size() - last_raw_text_start), + llvm::StringRef(m_log_message.c_str() + last_raw_text_start, + m_log_message.size() - last_raw_text_start), /*is_expr=*/false); if (error.Fail()) { NotifyLogMessageError(error.GetCString()); @@ -288,13 +288,13 @@ void SourceBreakpoint::SetLogMessage() { } } - bp.SetCallback(BreakpointHitCallback, this); + m_bp.SetCallback(BreakpointHitCallback, this); } void SourceBreakpoint::NotifyLogMessageError(llvm::StringRef error) { std::string message = "Log message has error: "; message += error; - dap.SendOutput(OutputType::Console, message); + m_dap.SendOutput(OutputType::Console, message); } /*static*/ @@ -309,7 +309,7 @@ bool SourceBreakpoint::BreakpointHitCallback( std::string output; for (const SourceBreakpoint::LogMessagePart &messagePart : - bp->logMessageParts) { + bp->m_log_message_parts) { if (messagePart.is_expr) { // Try local frame variables first before fall back to expression // evaluation @@ -320,7 +320,7 @@ bool SourceBreakpoint::BreakpointHitCallback( if (value.GetError().Fail()) value = frame.EvaluateExpression(expr); output += - VariableDescription(value, bp->dap.enable_auto_variable_summaries) + VariableDescription(value, bp->m_dap.enable_auto_variable_summaries) .display_value; } else { output += messagePart.text; @@ -328,7 +328,7 @@ bool SourceBreakpoint::BreakpointHitCallback( } if (!output.empty() && output.back() != '\n') output.push_back('\n'); // Ensure log message has line break. 
- bp->dap.SendOutput(OutputType::Console, output.c_str()); + bp->m_dap.SendOutput(OutputType::Console, output.c_str()); // Do not stop. return false; diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.h b/lldb/tools/lldb-dap/SourceBreakpoint.h index 064bd29d9fc79..d01411547d12a 100644 --- a/lldb/tools/lldb-dap/SourceBreakpoint.h +++ b/lldb/tools/lldb-dap/SourceBreakpoint.h @@ -19,23 +19,8 @@ namespace lldb_dap { -struct SourceBreakpoint : public Breakpoint { - // logMessage part can be either a raw text or an expression. - struct LogMessagePart { - LogMessagePart(llvm::StringRef text, bool is_expr) - : text(text), is_expr(is_expr) {} - std::string text; - bool is_expr; - }; - // If this attribute exists and is non-empty, the backend must not 'break' - // (stop) but log the message instead. Expressions within {} are - // interpolated. - std::string logMessage; - std::vector logMessageParts; - - uint32_t line; ///< The source line of the breakpoint or logpoint - uint32_t column; ///< An optional source column of the breakpoint - +class SourceBreakpoint : public Breakpoint { +public: SourceBreakpoint(DAP &d, const llvm::json::Object &obj); // Set this breakpoint in LLDB as a new breakpoint @@ -52,14 +37,33 @@ struct SourceBreakpoint : public Breakpoint { static bool BreakpointHitCallback(void *baton, lldb::SBProcess &process, lldb::SBThread &thread, lldb::SBBreakpointLocation &location); -}; -inline bool operator<(const SourceBreakpoint &lhs, - const SourceBreakpoint &rhs) { - if (lhs.line == rhs.line) - return lhs.column < rhs.column; - return lhs.line < rhs.line; -} + inline bool operator<(const SourceBreakpoint &rhs) { + if (m_line == rhs.m_line) + return m_column < rhs.m_column; + return m_line < rhs.m_line; + } + + uint32_t GetLine() const { return m_line; } + uint32_t GetColumn() const { return m_column; } + +protected: + // logMessage part can be either a raw text or an expression. + struct LogMessagePart { + LogMessagePart(llvm::StringRef text, bool is_expr) + : text(text), is_expr(is_expr) {} + std::string text; + bool is_expr; + }; + // If this attribute exists and is non-empty, the backend must not 'break' + // (stop) but log the message instead. Expressions within {} are + // interpolated. 
+ std::string m_log_message; + std::vector m_log_message_parts; + + uint32_t m_line; ///< The source line of the breakpoint or logpoint + uint32_t m_column; ///< An optional source column of the breakpoint +}; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Watchpoint.cpp b/lldb/tools/lldb-dap/Watchpoint.cpp index 8681057c8d3f2..a94cbcdbc4122 100644 --- a/lldb/tools/lldb-dap/Watchpoint.cpp +++ b/lldb/tools/lldb-dap/Watchpoint.cpp @@ -23,36 +23,37 @@ Watchpoint::Watchpoint(DAP &d, const llvm::json::Object &obj) llvm::StringRef dataId = GetString(obj, "dataId").value_or(""); std::string accessType = GetString(obj, "accessType").value_or("").str(); auto [addr_str, size_str] = dataId.split('/'); - llvm::to_integer(addr_str, addr, 16); - llvm::to_integer(size_str, size); - options.SetWatchpointTypeRead(accessType != "write"); + llvm::to_integer(addr_str, m_addr, 16); + llvm::to_integer(size_str, m_size); + m_options.SetWatchpointTypeRead(accessType != "write"); if (accessType != "read") - options.SetWatchpointTypeWrite(lldb::eWatchpointWriteTypeOnModify); + m_options.SetWatchpointTypeWrite(lldb::eWatchpointWriteTypeOnModify); } -void Watchpoint::SetCondition() { wp.SetCondition(condition.c_str()); } +void Watchpoint::SetCondition() { m_wp.SetCondition(m_condition.c_str()); } void Watchpoint::SetHitCondition() { uint64_t hitCount = 0; - if (llvm::to_integer(hitCondition, hitCount)) - wp.SetIgnoreCount(hitCount - 1); + if (llvm::to_integer(m_hit_condition, hitCount)) + m_wp.SetIgnoreCount(hitCount - 1); } void Watchpoint::CreateJsonObject(llvm::json::Object &object) { - if (!error.IsValid() || error.Fail()) { + if (!m_error.IsValid() || m_error.Fail()) { object.try_emplace("verified", false); - if (error.Fail()) - EmplaceSafeString(object, "message", error.GetCString()); + if (m_error.Fail()) + EmplaceSafeString(object, "message", m_error.GetCString()); } else { object.try_emplace("verified", true); } } void Watchpoint::SetWatchpoint() { - wp = dap.target.WatchpointCreateByAddress(addr, size, options, error); - if (!condition.empty()) + m_wp = m_dap.target.WatchpointCreateByAddress(m_addr, m_size, m_options, + m_error); + if (!m_condition.empty()) SetCondition(); - if (!hitCondition.empty()) + if (!m_hit_condition.empty()) SetHitCondition(); } } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Watchpoint.h b/lldb/tools/lldb-dap/Watchpoint.h index 77cea67bb9781..bf52b41281f29 100644 --- a/lldb/tools/lldb-dap/Watchpoint.h +++ b/lldb/tools/lldb-dap/Watchpoint.h @@ -19,22 +19,26 @@ namespace lldb_dap { -struct Watchpoint : public BreakpointBase { - lldb::addr_t addr; - size_t size; - lldb::SBWatchpointOptions options; - // The LLDB breakpoint associated wit this watchpoint. - lldb::SBWatchpoint wp; - lldb::SBError error; - +class Watchpoint : public BreakpointBase { +public: Watchpoint(DAP &d, const llvm::json::Object &obj); - Watchpoint(DAP &d, lldb::SBWatchpoint wp) : BreakpointBase(d), wp(wp) {} + Watchpoint(DAP &d, lldb::SBWatchpoint wp) : BreakpointBase(d), m_wp(wp) {} void SetCondition() override; void SetHitCondition() override; void CreateJsonObject(llvm::json::Object &object) override; void SetWatchpoint(); + + lldb::addr_t GetAddress() const { return m_addr; } + +protected: + lldb::addr_t m_addr; + size_t m_size; + lldb::SBWatchpointOptions m_options; + /// The LLDB breakpoint associated wit this watchpoint. 
+ lldb::SBWatchpoint m_wp; + lldb::SBError m_error; }; } // namespace lldb_dap From 5d1f27f349f5b63646c714e8c26cc5716a4b5abc Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Mon, 31 Mar 2025 16:06:51 -0700 Subject: [PATCH 0165/1029] GlobalISel: neg (and x, 1) --> SIGN_EXTEND_INREG x, 1 (#131367) The pattern ```LLVM %shl = shl i32 %x, 31 %ashr = ashr i32 %shl, 31 ``` would be combined to `G_SEXT_INREG %x, 1` by GlobalISel. However InstCombine normalizes this pattern to: ```LLVM %and = and i32 %x, 1 %neg = sub i32 0, %and ``` This adds a combiner for this variant as well. --- .../include/llvm/Target/GlobalISel/Combine.td | 13 ++- .../combine-neg-and-one-to-sext-inreg.mir | 81 +++++++++++++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-neg-and-one-to-sext-inreg.mir diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 1a967fe56b7b0..deed9315c72d8 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -747,6 +747,17 @@ def shl_ashr_to_sext_inreg : GICombineRule< (apply [{ Helper.applyAshShlToSextInreg(*${root}, ${info});}]) >; +// Fold sub 0, (and x, 1) -> sext_inreg x, 1 +def neg_and_one_to_sext_inreg : GICombineRule< + (defs root:$dst), + (match (G_AND $and, $x, 1), + (G_SUB $dst, 0, $and), + [{ return MRI.hasOneNonDBGUse(${and}.getReg()) && + Helper.isLegalOrBeforeLegalizer( + {TargetOpcode::G_SEXT_INREG, {MRI.getType(${x}.getReg())}}); }]), + (apply (G_SEXT_INREG $dst, $x, 1)) +>; + // Fold and(and(x, C1), C2) -> C1&C2 ? and(x, C1&C2) : 0 def overlapping_and: GICombineRule < (defs root:$root, build_fn_matchinfo:$info), @@ -2013,7 +2024,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, undef_combines, identity_combines, phi_combines, simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big, reassocs, ptr_add_immed_chain, cmp_combines, - shl_ashr_to_sext_inreg, sext_inreg_of_load, + shl_ashr_to_sext_inreg, neg_and_one_to_sext_inreg, sext_inreg_of_load, width_reduction_combines, select_combines, known_bits_simplifications, trunc_shift, not_cmp_fold, opt_brcond_by_inverting_cond, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-neg-and-one-to-sext-inreg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-neg-and-one-to-sext-inreg.mir new file mode 100644 index 0000000000000..194a15f940da7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-neg-and-one-to-sext-inreg.mir @@ -0,0 +1,81 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -o - -mtriple aarch64-- -run-pass=aarch64-prelegalizer-combiner %s | FileCheck %s +--- +name: test_combine_neg_and_one_to_sext_inreg_s32 +body: | + bb.1: + liveins: $w0 + ; CHECK-LABEL: name: test_combine_neg_and_one_to_sext_inreg_s32 + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: %res:_(s32) = G_SEXT_INREG %x, 1 + ; CHECK-NEXT: $w0 = COPY %res(s32) + %x:_(s32) = COPY $w0 + %one:_(s32) = G_CONSTANT i32 1 + %zero:_(s32) = G_CONSTANT i32 0 + %and:_(s32) = G_AND %x:_, %one:_ + %res:_(s32) = G_SUB %zero:_, %and:_ + $w0 = COPY %res:_(s32) +...
+--- +name: test_combine_neg_and_one_to_sext_inreg_s64 +body: | + bb.1: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_neg_and_one_to_sext_inreg_s64 + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %res:_(s64) = G_SEXT_INREG %x, 1 + ; CHECK-NEXT: $x0 = COPY %res(s64) + %x:_(s64) = COPY $x0 + %one:_(s64) = G_CONSTANT i64 1 + %zero:_(s64) = G_CONSTANT i64 0 + %and:_(s64) = G_AND %x:_, %one:_ + %res:_(s64) = G_SUB %zero:_, %and:_ + $x0 = COPY %res:_(s64) +... +--- +name: test_combine_neg_and_one_to_sext_inreg_multiuse_should_not_transform +body: | + bb.1: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_neg_and_one_to_sext_inreg_multiuse_should_not_transform + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %one:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: %zero:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: %and:_(s64) = G_AND %x, %one + ; CHECK-NEXT: %res:_(s64) = G_SUB %zero, %and + ; CHECK-NEXT: $x0 = COPY %res(s64) + ; CHECK-NEXT: $x1 = COPY %and(s64) + %x:_(s64) = COPY $x0 + %one:_(s64) = G_CONSTANT i64 1 + %zero:_(s64) = G_CONSTANT i64 0 + %and:_(s64) = G_AND %x:_, %one:_ + %res:_(s64) = G_SUB %zero:_, %and:_ + $x0 = COPY %res:_(s64) + $x1 = COPY %and:_(s64) +... +--- +name: test_combine_neg_and_one_to_sext_inreg_v2i32 +body: | + bb.1: + liveins: $d0 + ; CHECK-LABEL: name: test_combine_neg_and_one_to_sext_inreg_v2i32 + ; CHECK: liveins: $d0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s32>) = COPY $d0 + ; CHECK-NEXT: %res:_(<2 x s32>) = G_SEXT_INREG %x, 1 + ; CHECK-NEXT: $d0 = COPY %res(<2 x s32>) + %x:_(<2 x s32>) = COPY $d0 + %one:_(s32) = G_CONSTANT i32 1 + %ones:_(<2 x s32>) = G_BUILD_VECTOR %one, %one + %zero:_(s32) = G_CONSTANT i32 0 + %zeros:_(<2 x s32>) = G_BUILD_VECTOR %zero, %zero + %and:_(<2 x s32>) = G_AND %x:_, %ones:_ + %res:_(<2 x s32>) = G_SUB %zeros:_, %and:_ + $d0 = COPY %res:_(<2 x s32>) +... From 348374028970c956f2e49ab7553b495d7408ccd9 Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Mon, 31 Mar 2025 16:13:46 -0700 Subject: [PATCH 0166/1029] Reland "Symbolize line zero as if no source info is available (#124846)" (#133798) This relands commits 23aca2f88dd5d2447e69496c89c3ed42a56f9c31 and 1b15a89a23c631a8e2d096dad4afe456970572c0. https://github.com/llvm/llvm-project/pull/128619 makes the symbolizer always use debug info when available, so we can reland this change. --- llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 2 +- llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 3 ++- llvm/test/tools/llvm-symbolizer/skip-line-zero.s | 4 ++-- llvm/test/tools/llvm-symbolizer/sym-verbose.test | 5 ++--- offload/test/sanitizer/kernel_crash_many.c | 14 +++++++------- offload/test/sanitizer/kernel_trap.c | 1 - offload/test/sanitizer/kernel_trap.cpp | 4 ++-- offload/test/sanitizer/kernel_trap_many.c | 2 +- 8 files changed, 17 insertions(+), 18 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index e76e518ef8595..9c5d43dd4fb77 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1870,7 +1870,7 @@ DWARFContext::getInliningInfoForAddress(object::SectionedAddress Address, LineTable->getFileLineInfoForAddress( {Address.Address, Address.SectionIndex}, Spec.ApproximateLine, CU->getCompilationDir(), Spec.FLIKind, Frame); - } else { + } else if (CallLine != 0) { // Otherwise, use call file, call line and call column from // previous DIE in inlined chain.
if (LineTable) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 55a3bfa459c3c..62bf3d4ecaaf0 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -1539,7 +1539,8 @@ bool DWARFDebugLine::LineTable::getFileLineInfoForAddress( return false; // Take file number and line/column from the row. const auto &Row = Rows[RowIndex]; - if (!getFileNameByIndex(Row.File, CompDir, Kind, Result.FileName)) + if (Row.Line == 0 || + !getFileNameByIndex(Row.File, CompDir, Kind, Result.FileName)) return false; Result.Line = Row.Line; Result.Column = Row.Column; diff --git a/llvm/test/tools/llvm-symbolizer/skip-line-zero.s b/llvm/test/tools/llvm-symbolizer/skip-line-zero.s index e9fbea558e0eb..74dfb5cdc1aae 100644 --- a/llvm/test/tools/llvm-symbolizer/skip-line-zero.s +++ b/llvm/test/tools/llvm-symbolizer/skip-line-zero.s @@ -20,13 +20,13 @@ ## Check that without '--skip-line-zero', line zero is displayed for a line-table entry which has no source correspondence. # RUN: llvm-symbolizer --obj=%t.o -f=none 0x16d4 | FileCheck --strict-whitespace --match-full-lines --check-prefix=DISABLE %s -# DISABLE:main.c:0:0 +# DISABLE:??:0:0 ## Check that the '--skip-line-zero' does not cross sequence boundaries. ## If it fails to find in the current sequence then line zero is returned for the queried address. # RUN: llvm-symbolizer --obj=%t.o -f=none --skip-line-zero 0x16c0 | FileCheck --strict-whitespace --match-full-lines --check-prefix=FAIL-ACROSS-SEQ %s -# FAIL-ACROSS-SEQ:main.c:0:0 +# FAIL-ACROSS-SEQ:??:0:0 ## Check that with '--skip-line-zero', the last non-zero line in the current sequence is displayed. # RUN: llvm-symbolizer --obj=%t.o -f=none --skip-line-zero 0x1717 | FileCheck --strict-whitespace --match-full-lines --check-prefix=WITHIN-SEQ %s diff --git a/llvm/test/tools/llvm-symbolizer/sym-verbose.test b/llvm/test/tools/llvm-symbolizer/sym-verbose.test index 831fd6c7f0507..224c317f558a1 100644 --- a/llvm/test/tools/llvm-symbolizer/sym-verbose.test +++ b/llvm/test/tools/llvm-symbolizer/sym-verbose.test @@ -50,13 +50,12 @@ CHECK-NEXT: Column: 0 CHECK: 0x4005ad CHECK-NEXT: foo -CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c +CHECK-NEXT: Filename: ?? 
CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c CHECK-NEXT: Function start line: 4 CHECK-NEXT: Function start address: 0x400590 CHECK-NEXT: Line: 0 -CHECK-NEXT: Column: 30 -CHECK-NEXT: Discriminator: 4 +CHECK-NEXT: Column: 0 CHECK-NEXT: main CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c CHECK-NEXT: Function start filename: /tmp{{[\\/]}}discrim.c diff --git a/offload/test/sanitizer/kernel_crash_many.c b/offload/test/sanitizer/kernel_crash_many.c index f1d17ca2b76e2..9fd8af48f51fe 100644 --- a/offload/test/sanitizer/kernel_crash_many.c +++ b/offload/test/sanitizer/kernel_crash_many.c @@ -37,36 +37,36 @@ int main(void) { // CHECK: Kernel 1: {{.*}} (__omp_offloading_{{.*}}_main_l22) // CHECK: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_crash_many.c: +// DEBUG: main {{.*}}kernel_crash_many.c // // CHECK: Kernel 2: {{.*}} (__omp_offloading_{{.*}}_main_l22) // CHECK: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_crash_many.c: +// DEBUG: main {{.*}}kernel_crash_many.c // // CHECK: Kernel 3: {{.*}} (__omp_offloading_{{.*}}_main_l22) // CHECK: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_crash_many.c: +// DEBUG: main {{.*}}kernel_crash_many.c // // CHECK: Kernel 4: {{.*}} (__omp_offloading_{{.*}}_main_l22) // CHECK: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_crash_many.c: +// DEBUG: main {{.*}}kernel_crash_many.c // // CHECK: Kernel 5: {{.*}} (__omp_offloading_{{.*}}_main_l22) // CHECK: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_crash_many.c: +// DEBUG: main {{.*}}kernel_crash_many.c // // CHECK: Kernel 6: {{.*}} (__omp_offloading_{{.*}}_main_l22) // CHECK: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_crash_many.c: +// DEBUG: main {{.*}}kernel_crash_many.c // // CHECK: Kernel 7: {{.*}} (__omp_offloading_{{.*}}_main_l22) // CHECK: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_crash_many.c: +// DEBUG: main {{.*}}kernel_crash_many.c // // CHECK-NOT: Kernel {{[[0-9]]+}}: diff --git a/offload/test/sanitizer/kernel_trap.c b/offload/test/sanitizer/kernel_trap.c index 7a166bfda5ee9..3a531bd74c980 100644 --- a/offload/test/sanitizer/kernel_trap.c +++ b/offload/test/sanitizer/kernel_trap.c @@ -39,5 +39,4 @@ int main(void) { // CHECK: OFFLOAD ERROR: Kernel 'omp target in main @ 30 (__omp_offloading_{{.*}}_main_l30)' // CHECK: OFFLOAD ERROR: execution interrupted by hardware trap instruction // TRACE: launchKernel -// TRACE: main // clang-format on diff --git a/offload/test/sanitizer/kernel_trap.cpp b/offload/test/sanitizer/kernel_trap.cpp index c67b3857fabba..44858be6cd3f6 100644 --- a/offload/test/sanitizer/kernel_trap.cpp +++ b/offload/test/sanitizer/kernel_trap.cpp @@ -47,6 +47,6 @@ int main(void) { // TRACE: launchKernel // NDEBG: cxx_function_name(int, S*) // NDEBG: main -// DEBUG: cxx_function_name(int, S*) {{.*}}kernel_trap.cpp: -// DEBUG: main {{.*}}kernel_trap.cpp: +// DEBUG: cxx_function_name(int, S*) {{.*}}kernel_trap.cpp +// DEBUG: main {{.*}}kernel_trap.cpp // clang-format on diff --git a/offload/test/sanitizer/kernel_trap_many.c b/offload/test/sanitizer/kernel_trap_many.c index f2e63794168b2..061c0fe225d4b 100644 --- a/offload/test/sanitizer/kernel_trap_many.c +++ b/offload/test/sanitizer/kernel_trap_many.c @@ -32,4 +32,4 @@ int main(void) { // TRACE: OFFLOAD ERROR: execution interrupted by hardware trap instruction // TRACE: launchKernel // NDEBG: main -// DEBUG: main {{.*}}kernel_trap_many.c: +// DEBUG: main {{.*}}kernel_trap_many.c From 5b8d8bb90a2c9674364e35803e45370c35c35c4a Mon Sep 17 00:00:00 
2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 06:24:59 +0700 Subject: [PATCH 0167/1029] Inliner: Fix missing test coverage for incompatible gc rejection (#133708) --- .../Inline/no-inline-incompatible-gc.ll | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 llvm/test/Transforms/Inline/no-inline-incompatible-gc.ll diff --git a/llvm/test/Transforms/Inline/no-inline-incompatible-gc.ll b/llvm/test/Transforms/Inline/no-inline-incompatible-gc.ll new file mode 100644 index 0000000000000..531801df7cc46 --- /dev/null +++ b/llvm/test/Transforms/Inline/no-inline-incompatible-gc.ll @@ -0,0 +1,140 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes='cgscc(inline)' -pass-remarks=inline -pass-remarks-missed=inline < %s 2> %t.err | FileCheck %s +; RUN: FileCheck -implicit-check-not=remark -check-prefix=REMARK %s < %t.err + +; REMARK: remark: :0:0: 'callee_with_gc' inlined into 'caller_no_gc' +; REMARK-NEXT: remark: :0:0: 'callee_with_gc' inlined into 'caller_same_gc' +; REMARK-NEXT: remark: :0:0: 'callee_with_gc' is not inlined into 'caller_incompatible_gc': incompatible GC +; REMARK-NEXT: remark: :0:0: 'callee_with_gc' inlined into 'caller_inline_first_caller' +; REMARK-NEXT: remark: :0:0: 'callee_with_other_gc' is not inlined into 'caller_inline_first_caller': incompatible GC +; REMARK-NEXT: remark: :0:0: 'callee_with_gc' inlined into 'caller_inline_second_caller' +; REMARK-NEXT: remark: :0:0: 'callee_with_other_gc' is not inlined into 'caller_inline_second_caller': incompatible GC + +%IntArray = type { i32, [0 x ptr] } + +; Callee gc propagates to the caller +define i32 @caller_no_gc() { +; CHECK-LABEL: define i32 @caller_no_gc() gc "example" { +; CHECK-NEXT: [[ROOT_I:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: call void @llvm.gcroot(ptr [[ROOT_I]], ptr null) +; CHECK-NEXT: [[OBJ_I:%.*]] = call ptr @h() +; CHECK-NEXT: store ptr [[OBJ_I]], ptr [[ROOT_I]], align 8 +; CHECK-NEXT: [[LENGTH_I:%.*]] = load i32, ptr [[OBJ_I]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: ret i32 [[LENGTH_I]] +; + %x = call i32 @callee_with_gc() + ret i32 %x +} + +; Inline of matching gc allowed. 
+define i32 @caller_same_gc() gc "example" { +; CHECK-LABEL: define i32 @caller_same_gc() gc "example" { +; CHECK-NEXT: [[ROOT_I:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: call void @llvm.gcroot(ptr [[ROOT_I]], ptr null) +; CHECK-NEXT: [[OBJ_I:%.*]] = call ptr @h() +; CHECK-NEXT: store ptr [[OBJ_I]], ptr [[ROOT_I]], align 8 +; CHECK-NEXT: [[LENGTH_I:%.*]] = load i32, ptr [[OBJ_I]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: ret i32 [[LENGTH_I]] +; + %x = call i32 @callee_with_gc() + ret i32 %x +} + +; Reject inline with mismatched gc +define i32 @caller_incompatible_gc() gc "incompatible" { +; CHECK-LABEL: define i32 @caller_incompatible_gc() gc "incompatible" { +; CHECK-NEXT: [[X:%.*]] = call i32 @callee_with_gc() +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @callee_with_gc() + ret i32 %x +} + +define i32 @callee_with_gc() gc "example" { +; CHECK-LABEL: define i32 @callee_with_gc() gc "example" { +; CHECK-NEXT: [[ROOT:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @llvm.gcroot(ptr [[ROOT]], ptr null) +; CHECK-NEXT: [[OBJ:%.*]] = call ptr @h() +; CHECK-NEXT: store ptr [[OBJ]], ptr [[ROOT]], align 8 +; CHECK-NEXT: [[LENGTH_PTR:%.*]] = getelementptr [[INTARRAY:%.*]], ptr [[OBJ]], i32 0, i32 0 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_PTR]], align 4 +; CHECK-NEXT: ret i32 [[LENGTH]] +; + %root = alloca ptr, align 8 + call void @llvm.gcroot(ptr %root, ptr null) + %obj = call ptr @h() + store ptr %obj, ptr %root, align 8 + %Length.ptr = getelementptr %IntArray, ptr %obj, i32 0, i32 0 + %Length = load i32, ptr %Length.ptr, align 4 + ret i32 %Length +} + +define i32 @callee_with_other_gc() gc "other-example" { +; CHECK-LABEL: define i32 @callee_with_other_gc() gc "other-example" { +; CHECK-NEXT: [[ROOT:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @llvm.gcroot(ptr [[ROOT]], ptr null) +; CHECK-NEXT: [[OBJ:%.*]] = call ptr @h() +; CHECK-NEXT: store ptr [[OBJ]], ptr [[ROOT]], align 8 +; CHECK-NEXT: [[LENGTH_PTR:%.*]] = getelementptr [[INTARRAY:%.*]], ptr [[OBJ]], i32 0, i32 0 +; CHECK-NEXT: [[LENGTH:%.*]] = load i32, ptr [[LENGTH_PTR]], align 4 +; CHECK-NEXT: ret i32 [[LENGTH]] +; + %root = alloca ptr, align 8 + call void @llvm.gcroot(ptr %root, ptr null) + %obj = call ptr @h() + store ptr %obj, ptr %root, align 8 + %Length.ptr = getelementptr %IntArray, ptr %obj, i32 0, i32 0 + %Length = load i32, ptr %Length.ptr, align 4 + ret i32 %Length +} + +; After inlining the first call, inline is blocked of the second call +; since the gc type propagates to the caller. 
+define i32 @caller_inline_first_caller() { +; CHECK-LABEL: define i32 @caller_inline_first_caller() gc "example" { +; CHECK-NEXT: [[ROOT_I:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: call void @llvm.gcroot(ptr [[ROOT_I]], ptr null) +; CHECK-NEXT: [[OBJ_I:%.*]] = call ptr @h() +; CHECK-NEXT: store ptr [[OBJ_I]], ptr [[ROOT_I]], align 8 +; CHECK-NEXT: [[LENGTH_I:%.*]] = load i32, ptr [[OBJ_I]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: [[Y:%.*]] = call i32 @callee_with_other_gc() +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LENGTH_I]], [[Y]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %x = call i32 @callee_with_gc() + %y = call i32 @callee_with_other_gc() + %add = add i32 %x, %y + ret i32 %add +} + +; We can't inline the first call due to the incompatible gc, but can +; inline the second +define i32 @caller_inline_second_caller() gc "example" { +; CHECK-LABEL: define i32 @caller_inline_second_caller() gc "example" { +; CHECK-NEXT: [[ROOT_I:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: call void @llvm.gcroot(ptr [[ROOT_I]], ptr null) +; CHECK-NEXT: [[OBJ_I:%.*]] = call ptr @h() +; CHECK-NEXT: store ptr [[OBJ_I]], ptr [[ROOT_I]], align 8 +; CHECK-NEXT: [[LENGTH_I:%.*]] = load i32, ptr [[OBJ_I]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[ROOT_I]]) +; CHECK-NEXT: [[Y:%.*]] = call i32 @callee_with_other_gc() +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LENGTH_I]], [[Y]] +; CHECK-NEXT: ret i32 [[ADD]] +; + %x = call i32 @callee_with_gc() + %y = call i32 @callee_with_other_gc() + %add = add i32 %x, %y + ret i32 %add +} + +declare ptr @h() + +declare void @llvm.gcroot(ptr, ptr) #0 +attributes #0 = { nounwind } From cf6a452cc734e00a1207514bb1fc2b123e31bb5f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 31 Mar 2025 19:33:40 -0400 Subject: [PATCH 0168/1029] [SLP]Fix same/alternate analysis in split node analysis for compares getSameOpcode in some cases may consider 2 compares as having the same opcode, even though previously they were considered as alternate. It may happen because getSameOpcode loses info about previous instructions and their states. We need to use the isAlternateInstruction function instead for the correct analysis.
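As a hypothetical illustration (not one of the committed tests), consider:

```LLVM
; With MainOp "fcmp olt" and AltOp "fcmp ogt" recorded for a split node, a
; fresh getSameOpcode query over {%c0, %c1} can report a single compatible
; opcode, treating ogt as olt with swapped operands, and so contradict the
; alternate split the node was originally built with.
%c0 = fcmp olt float %a, %b
%c1 = fcmp ogt float %c, %d
```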
Reviewers: RKSimon, hiraditya Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/133769 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 7 +++-- .../X86/vector-reductions-logical.ll | 14 ++++----- .../SLPVectorizer/X86/reduction-logical.ll | 18 ++++++------ .../SLPVectorizer/revec-reduction-logical.ll | 29 ++++++++++++------- 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0679eac176584..18c896767b6d2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9013,8 +9013,11 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, Op1Indices.set(Idx); continue; } - InstructionsState NewS = getSameOpcode({LocalState.getMainOp(), I}, *TLI); - if (NewS && !NewS.isAltShuffle()) { + if ((LocalState.getAltOpcode() != LocalState.getOpcode() && + I->getOpcode() == LocalState.getOpcode()) || + (LocalState.getAltOpcode() == LocalState.getOpcode() && + !isAlternateInstruction(I, LocalState.getMainOp(), + LocalState.getAltOp(), *TLI))) { Op1.push_back(V); Op1Indices.set(Idx); continue; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll index 035f3f145bf8d..ee51e467c5b8e 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -86,10 +86,9 @@ return: define float @test_merge_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_merge_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[T:%.*]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x float> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[T]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 @@ -401,10 +400,9 @@ return: define float @test_merge_anyof_v4si(<4 x i32> %t) { ; CHECK-LABEL: @test_merge_anyof_v4si( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[T:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <8 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[T:%.*]], splat (i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[T]], splat (i32 255) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP0]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP5]], 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index e0b3ff714162f..81da11dc42e88 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -207,10 +207,10 @@ define i1 @logical_and_icmp_subvec(<4 x i32> %x) { define i1 @logical_and_icmp_clamp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4) ; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]]) ; CHECK-NEXT: ret i1 [[TMP6]] @@ -239,12 +239,12 @@ define i1 @logical_and_icmp_clamp(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_extra_use_cmp(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_extra_use_cmp( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP4]], i32 6 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 ; CHECK-NEXT: call void @use1(i1 [[TMP5]]) +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP8]], <4 x i1> [[TMP1]], i64 4) ; CHECK-NEXT: [[TMP6:%.*]] = freeze <8 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP6]]) ; CHECK-NEXT: ret i1 [[TMP7]] diff --git a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll index 25f161e9f1276..250c60a61fea1 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-reduction-logical.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=x86_64 -S | FileCheck %s --check-prefixes=CHECK,X86 %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-revec -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s --check-prefixes=CHECK,AARCH64 %} define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_diff_preds( @@ -28,14 +28,23 @@ define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> 
[[X:%.*]], <4 x i32> poison, <8 x i32> 
-; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], 
-; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], 
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> 
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
-; CHECK-NEXT: ret i1 [[TMP6]]
+; X86-LABEL: @logical_and_icmp_clamp(
+; X86-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], splat (i32 42)
+; X86-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[X]], splat (i32 17)
+; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <8 x i32> 
+; X86-NEXT: [[TMP4:%.*]] = call <8 x i1> @llvm.vector.insert.v8i1.v4i1(<8 x i1> [[TMP3]], <4 x i1> [[TMP1]], i64 4)
+; X86-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
+; X86-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
+; X86-NEXT: ret i1 [[TMP6]]
+;
+; AARCH64-LABEL: @logical_and_icmp_clamp(
+; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> 
+; AARCH64-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP1]], 
+; AARCH64-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], 
+; AARCH64-NEXT: [[TMP4:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> [[TMP3]], <8 x i32> 
+; AARCH64-NEXT: [[TMP5:%.*]] = freeze <8 x i1> [[TMP4]]
+; AARCH64-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP5]])
+; AARCH64-NEXT: ret i1 [[TMP6]]
 ;
  %x0 = extractelement <4 x i32> %x, i32 0
  %x1 = extractelement <4 x i32> %x, i32 1
From 66db3ccd8c370e56d8f34a55f8cb137dd21b7ced Mon Sep 17 00:00:00 2001
From: Jakub Kuderski
Date: Mon, 31 Mar 2025 19:56:46 -0400
Subject: [PATCH 0169/1029] [mlir] Update vector return types for
 `.getMixed`* methods (NFC) (#133821)

Drop small size to make vector types match the generic helper
`getMixedValues` in `StaticValueUtils.h`.

This saves some needless vector copies. I didn't find any local
variables that need updating.
---
 mlir/include/mlir/Interfaces/ViewLikeInterface.td | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.td b/mlir/include/mlir/Interfaces/ViewLikeInterface.td
index 9397f271e1bc6..d1401c238381e 100644
--- a/mlir/include/mlir/Interfaces/ViewLikeInterface.td
+++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.td
@@ -160,7 +160,7 @@ def OffsetSizeAndStrideOpInterface : OpInterface<"OffsetSizeAndStrideOpInterface
       /*desc=*/[{
         Return a vector of all the static or dynamic offsets of the op.
       }],
-      /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult, 4>",
+      /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult>",
       /*methodName=*/"getMixedOffsets",
       /*args=*/(ins),
       /*methodBody=*/"",
@@ -174,7 +174,7 @@ def OffsetSizeAndStrideOpInterface : OpInterface<"OffsetSizeAndStrideOpInterface
       /*desc=*/[{
         Return a vector of all the static or dynamic sizes of the op.
       }],
-      /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult, 4>",
+      /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult>",
       /*methodName=*/"getMixedSizes",
       /*args=*/(ins),
       /*methodBody=*/"",
@@ -188,7 +188,7 @@ def OffsetSizeAndStrideOpInterface : OpInterface<"OffsetSizeAndStrideOpInterface
       /*desc=*/[{
         Return a vector of all the static or dynamic strides of the op. 
}],
-      /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult, 4>",
+      /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult>",
       /*methodName=*/"getMixedStrides",
       /*args=*/(ins),
       /*methodBody=*/"",
From de053bb4b0db64aebdff7719ff6ce75487f6ba5d Mon Sep 17 00:00:00 2001
From: YunQiang Su
Date: Tue, 1 Apr 2025 08:00:22 +0800
Subject: [PATCH 0170/1029] LLVM/Test: Add vectorizing testcases for
 fminimumnum and fmaximumnum (#133690)

Vectorizing of fminimumnum and fmaximumnum is not supported yet. Let's
add the testcases for them now; we will update them when support is
added.
---
 .../Transforms/LoopVectorize/fminimumnum.ll   | 489 ++++++++++++++++++
 1 file changed, 489 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/fminimumnum.ll

diff --git a/llvm/test/Transforms/LoopVectorize/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/fminimumnum.ll
new file mode 100644
index 0000000000000..66ad9e9a0e5dd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/fminimumnum.ll
@@ -0,0 +1,489 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
+; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v" -S < %s | FileCheck %s --check-prefix=RV64
+; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s --check-prefix=ARM64
+; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s --check-prefix=X64
+
+@input1_f32 = global [4096 x float] zeroinitializer, align 4
+@input2_f32 = global [4096 x float] zeroinitializer, align 4
+@output_f32 = global [4096 x float] zeroinitializer, align 4
+@input1_f64 = global [4096 x double] zeroinitializer, align 8
+@input2_f64 = global [4096 x double] zeroinitializer, align 8
+@output_f64 = global [4096 x double] zeroinitializer, align 8
+@input1_f16 = global [4096 x half] zeroinitializer, align 2
+@input2_f16 = global [4096 x half] zeroinitializer, align 2
+@output_f16 = global [4096 x half] zeroinitializer, align 2
+
+define void @f32min() {
+; RV64-LABEL: define void @f32min(
+; RV64-SAME: ) #[[ATTR0:[0-9]+]] {
+; RV64-NEXT: [[ENTRY:.*]]:
+; RV64-NEXT: br label %[[FOR_BODY:.*]]
+; RV64: [[FOR_COND_CLEANUP:.*]]:
+; RV64-NEXT: ret void
+; RV64: [[FOR_BODY]]:
+; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; RV64-NEXT: [[TMP16:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP14]], float [[TMP15]])
+; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
+; RV64-NEXT: store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
+; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
+; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+; ARM64-LABEL: define void @f32min(
+; ARM64-SAME: ) #[[ATTR0:[0-9]+]] {
+; ARM64-NEXT: [[ENTRY:.*]]:
+; ARM64-NEXT: br label %[[FOR_BODY:.*]]
+; ARM64: [[FOR_COND_CLEANUP:.*]]:
+; ARM64-NEXT: ret void
+; ARM64: [[FOR_BODY]]:
+; ARM64-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; ARM64-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]]) +; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 +; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; X64-LABEL: define void @f32min() { +; X64-NEXT: [[ENTRY:.*]]: +; X64-NEXT: br label %[[FOR_BODY:.*]] +; X64: [[FOR_COND_CLEANUP:.*]]: +; X64-NEXT: ret void +; X64: [[FOR_BODY]]: +; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; X64-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]]) +; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 +; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4 + %2 = tail call float @llvm.minimumnum.f32(float %0, float %1) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv + store float %2, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare float @llvm.minimumnum.f32(float, float) + +define void @f32max() { +; RV64-LABEL: define void @f32max( +; RV64-SAME: ) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP:.*]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: 
[[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; RV64-NEXT: [[TMP16:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP14]], float [[TMP15]]) +; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: store float [[TMP16]], ptr [[ARRAYIDX4]], align 4 +; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; ARM64-LABEL: define void @f32max( +; ARM64-SAME: ) #[[ATTR0]] { +; ARM64-NEXT: [[ENTRY:.*]]: +; ARM64-NEXT: br label %[[FOR_BODY:.*]] +; ARM64: [[FOR_COND_CLEANUP:.*]]: +; ARM64-NEXT: ret void +; ARM64: [[FOR_BODY]]: +; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; ARM64-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]]) +; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 +; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; X64-LABEL: define void @f32max() { +; X64-NEXT: [[ENTRY:.*]]: +; X64-NEXT: br label %[[FOR_BODY:.*]] +; X64: [[FOR_COND_CLEANUP:.*]]: +; X64-NEXT: ret void +; X64: [[FOR_BODY]]: +; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; X64-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]]) +; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 +; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x 
float], ptr @input2_f32, i64 0, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4 + %2 = tail call float @llvm.maximumnum.f32(float %0, float %1) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv + store float %2, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare float @llvm.maximumnum.f32(float, float) + +define void @f64min() { +; RV64-LABEL: define void @f64min( +; RV64-SAME: ) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP:.*]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; RV64-NEXT: [[TMP16:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP14]], double [[TMP15]]) +; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: store double [[TMP16]], ptr [[ARRAYIDX4]], align 8 +; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; ARM64-LABEL: define void @f64min( +; ARM64-SAME: ) #[[ATTR0]] { +; ARM64-NEXT: [[ENTRY:.*]]: +; ARM64-NEXT: br label %[[FOR_BODY:.*]] +; ARM64: [[FOR_COND_CLEANUP:.*]]: +; ARM64-NEXT: ret void +; ARM64: [[FOR_BODY]]: +; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; ARM64-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]]) +; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 +; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; X64-LABEL: define void @f64min() { +; X64-NEXT: [[ENTRY:.*]]: +; X64-NEXT: br label %[[FOR_BODY:.*]] +; X64: [[FOR_COND_CLEANUP:.*]]: +; X64-NEXT: ret void +; X64: [[FOR_BODY]]: +; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x 
double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; X64-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]]) +; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 +; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv + %0 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv + %1 = load double, ptr %arrayidx2, align 8 + %2 = tail call double @llvm.minimumnum.f64(double %0, double %1) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv + store double %2, ptr %arrayidx4, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare double @llvm.minimumnum.f64(double, double) + +define void @f64max() { +; RV64-LABEL: define void @f64max( +; RV64-SAME: ) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP:.*]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; RV64-NEXT: [[TMP16:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP14]], double [[TMP15]]) +; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: store double [[TMP16]], ptr [[ARRAYIDX4]], align 8 +; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; ARM64-LABEL: define void @f64max( +; ARM64-SAME: ) #[[ATTR0]] { +; ARM64-NEXT: [[ENTRY:.*]]: +; ARM64-NEXT: br label %[[FOR_BODY:.*]] +; ARM64: [[FOR_COND_CLEANUP:.*]]: +; ARM64-NEXT: ret void +; ARM64: [[FOR_BODY]]: +; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; ARM64-NEXT: [[TMP14:%.*]] = tail call 
double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]]) +; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 +; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; X64-LABEL: define void @f64max() { +; X64-NEXT: [[ENTRY:.*]]: +; X64-NEXT: br label %[[FOR_BODY:.*]] +; X64: [[FOR_COND_CLEANUP:.*]]: +; X64-NEXT: ret void +; X64: [[FOR_BODY]]: +; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; X64-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]]) +; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 +; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv + %0 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv + %1 = load double, ptr %arrayidx2, align 8 + %2 = tail call double @llvm.maximumnum.f64(double %0, double %1) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv + store double %2, ptr %arrayidx4, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare double @llvm.maximumnum.f64(double, double) + +define void @f16min() { +; RV64-LABEL: define void @f16min( +; RV64-SAME: ) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP:.*]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; RV64-NEXT: [[TMP16:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP14]], half [[TMP15]]) +; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: store half [[TMP16]], 
ptr [[ARRAYIDX4]], align 2 +; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; ARM64-LABEL: define void @f16min( +; ARM64-SAME: ) #[[ATTR0]] { +; ARM64-NEXT: [[ENTRY:.*]]: +; ARM64-NEXT: br label %[[FOR_BODY:.*]] +; ARM64: [[FOR_COND_CLEANUP:.*]]: +; ARM64-NEXT: ret void +; ARM64: [[FOR_BODY]]: +; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; ARM64-NEXT: [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]]) +; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 +; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; X64-LABEL: define void @f16min() { +; X64-NEXT: [[ENTRY:.*]]: +; X64-NEXT: br label %[[FOR_BODY:.*]] +; X64: [[FOR_COND_CLEANUP:.*]]: +; X64-NEXT: ret void +; X64: [[FOR_BODY]]: +; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; X64-NEXT: [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]]) +; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 +; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv + %0 = load half, ptr %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv + %1 = load half, ptr %arrayidx2, align 2 + %2 = tail call half @llvm.minimumnum.f16(half %0, half %1) + %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv + store half %2, ptr %arrayidx4, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare half @llvm.minimumnum.f16(half, half) + +define void @f16max() { 
+; RV64-LABEL: define void @f16max( +; RV64-SAME: ) #[[ATTR0]] { +; RV64-NEXT: [[ENTRY:.*]]: +; RV64-NEXT: br label %[[FOR_BODY:.*]] +; RV64: [[FOR_COND_CLEANUP:.*]]: +; RV64-NEXT: ret void +; RV64: [[FOR_BODY]]: +; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; RV64-NEXT: [[TMP16:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP14]], half [[TMP15]]) +; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] +; RV64-NEXT: store half [[TMP16]], ptr [[ARRAYIDX4]], align 2 +; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; ARM64-LABEL: define void @f16max( +; ARM64-SAME: ) #[[ATTR0]] { +; ARM64-NEXT: [[ENTRY:.*]]: +; ARM64-NEXT: br label %[[FOR_BODY:.*]] +; ARM64: [[FOR_COND_CLEANUP:.*]]: +; ARM64-NEXT: ret void +; ARM64: [[FOR_BODY]]: +; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; ARM64-NEXT: [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]]) +; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] +; ARM64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 +; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] +; +; X64-LABEL: define void @f16max() { +; X64-NEXT: [[ENTRY:.*]]: +; X64-NEXT: br label %[[FOR_BODY:.*]] +; X64: [[FOR_COND_CLEANUP:.*]]: +; X64-NEXT: ret void +; X64: [[FOR_BODY]]: +; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; X64-NEXT: [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]]) +; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] +; X64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 +; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; X64-NEXT: br i1 [[EXITCOND_NOT]], label 
%[[FOR_COND_CLEANUP]], label %[[FOR_BODY]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv
+ %0 = load half, ptr %arrayidx, align 2
+ %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv
+ %1 = load half, ptr %arrayidx2, align 2
+ %2 = tail call half @llvm.maximumnum.f16(half %0, half %1)
+ %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv
+ store half %2, ptr %arrayidx4, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 4096
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare half @llvm.maximumnum.f16(half, half)
From 02837acaaf2cfdfcbf77e4a7f6629575edb6ffb4 Mon Sep 17 00:00:00 2001
From: Mariusz Borsa
Date: Mon, 31 Mar 2025 17:06:41 -0700
Subject: [PATCH 0171/1029] [Sanitizers][Darwin][Test] Remove
 community-incompliant internal link from sources (#133187)

The malloc_zone.cpp test currently fails on Darwin hosts, in
SanitizerCommon tests with lsan enabled. We need to XFAIL this test to
buy time to investigate this failure. Also, we're trying to bring the
number of tests failing on Darwin bots to 0, to get a clearer signal of
any new failures.

rdar://145873843

Co-authored-by: Mariusz Borsa
---
 .../test/sanitizer_common/TestCases/Darwin/malloc_zone.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/sanitizer_common/TestCases/Darwin/malloc_zone.cpp b/compiler-rt/test/sanitizer_common/TestCases/Darwin/malloc_zone.cpp
index ffe2100634f57..5aa087fb4ca12 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Darwin/malloc_zone.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Darwin/malloc_zone.cpp
@@ -17,7 +17,7 @@
 // UBSan does not install a malloc zone.
 // XFAIL: ubsan
 //
-// Currently fails on darwin/lsan rdar://145873843
+// Currently fails on darwin/lsan
 // XFAIL: darwin && lsan

 #include
From 091051fb7f48dd9f1d3f119aa7dcbd38d9d6f076 Mon Sep 17 00:00:00 2001
From: "Mikhail R. Gadelha"
Date: Mon, 31 Mar 2025 21:13:44 -0300
Subject: [PATCH 0172/1029] [libc] Add myself as maintainer of the riscv port
 (#133757)

Co-authored-by: Joseph Huber
---
 libc/Maintainers.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libc/Maintainers.rst b/libc/Maintainers.rst
index 7991397532da5..8f3c251a0edb1 100644
--- a/libc/Maintainers.rst
+++ b/libc/Maintainers.rst
@@ -49,4 +49,9 @@ UEFI
 | Tristan Ross
 | tristan.ross\@midstall.com (email), `RossComputerGuy `_ (github)

+RISC-V
+------
+| Mikhail R. Gadelha
+| mikhail\@igalia.com (email), `mikhailramalho `_ (github)
+
 .. TODO: add "Inactive Maintainers" section when needed.
From 0248d277cabab370b48114cc62aff393b273971b Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu"
Date: Mon, 31 Mar 2025 20:28:29 -0400
Subject: [PATCH 0173/1029] Reland [HIP] fix host min/max in header (#133590)

CUDA defines min/max functions for the host in the global namespace.
The HIP header needs to define them too, to be compatible. Currently
only min/max(int, int) is defined. This causes wrong results for
arguments that are out of range for int.

This patch defines host min/max functions to be compatible with CUDA.
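For example (an editorial illustration, not code from this patch; the
values are arbitrary): with only min(int, int) visible on the host,

  long long a = 5000000000LL, b = 2000000000LL;
  long long m = min(a, b); // both arguments are implicitly truncated
                           // to int, so m is not 2000000000

still compiles, but computes the minimum of the truncated values.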
Since some HIP apps define min/max functions by themselves, the newly
added min/max functions are under the control of the macro
`__HIP_DEFINE_EXTENDED_HOST_MIN_MAX__`, which is 0 by default. In the
future, this will change to 1 by default after most existing HIP apps
adopt this change.

The patch also allows users to define
`__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__` to disable host max/min
in the global namespace.

min/max functions with mixed signed/unsigned integer parameters are not
defined unless `__HIP_DEFINE_MIXED_HOST_MIN_MAX__` is defined.

Fixes: SWDEV-446564
---
 clang/lib/Headers/__clang_hip_math.h | 86 ++++++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
index f6c06eaf4afe0..759e742c9d012 100644
--- a/clang/lib/Headers/__clang_hip_math.h
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -1311,15 +1311,89 @@ float min(float __x, float __y) { return __builtin_fminf(__x, __y); }
 __DEVICE__
 double min(double __x, double __y) { return __builtin_fmin(__x, __y); }
 
-#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
-__host__ inline static int min(int __arg1, int __arg2) {
-  return __arg1 < __arg2 ? __arg1 : __arg2;
+// Define host min/max functions.
+#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && \
+    !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__)
+
+// TODO: make this default to 1 after existing HIP apps adopt this change.
+#ifndef __HIP_DEFINE_EXTENDED_HOST_MIN_MAX__
+#define __HIP_DEFINE_EXTENDED_HOST_MIN_MAX__ 0
+#endif
+
+#ifndef __HIP_DEFINE_MIXED_HOST_MIN_MAX__
+#define __HIP_DEFINE_MIXED_HOST_MIN_MAX__ 0
+#endif
+
+#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS")
+#define DEFINE_MIN_MAX_FUNCTIONS(ret_type, type1, type2) \
+  inline ret_type min(const type1 __a, const type2 __b) { \
+    return (__a < __b) ? __a : __b; \
+  } \
+  inline ret_type max(const type1 __a, const type2 __b) { \
+    return (__a > __b) ? __a : __b; \
+  }
+
+// Define min and max functions for same type comparisons
+DEFINE_MIN_MAX_FUNCTIONS(int, int, int)
+
+#if __HIP_DEFINE_EXTENDED_HOST_MIN_MAX__
+DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, unsigned int)
+DEFINE_MIN_MAX_FUNCTIONS(long, long, long)
+DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, unsigned long)
+DEFINE_MIN_MAX_FUNCTIONS(long long, long long, long long)
+DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long,
+                         unsigned long long)
+#endif // if __HIP_DEFINE_EXTENDED_HOST_MIN_MAX__
+
+// The host min/max functions below accept mixed signed/unsigned integer
+// parameters and perform unsigned comparisons, which may produce unexpected
+// results if a signed integer was passed unintentionally. To avoid this
+// happening silently, these overloaded functions are not defined by default.
+// However, for compatibility with CUDA, they will be defined if users define
+// __HIP_DEFINE_MIXED_HOST_MIN_MAX__.
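+// For example (illustrative values): min(-1, 1u) resolves to
+// min(int, unsigned int); the comparison converts -1 to UINT_MAX, so the
+// call returns 1u rather than the arithmetically smaller -1.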
+#if __HIP_DEFINE_MIXED_HOST_MIN_MAX__ +DEFINE_MIN_MAX_FUNCTIONS(unsigned int, int, unsigned int) +DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, int) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long, long, unsigned long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, long long, unsigned long long) +DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, long long) +#endif // if __HIP_DEFINE_MIXED_HOST_MIN_MAX__ + +// Floating-point comparisons using built-in functions +#if __HIP_DEFINE_EXTENDED_HOST_MIN_MAX__ +inline float min(float const __a, float const __b) { + return __builtin_fminf(__a, __b); +} +inline double min(double const __a, double const __b) { + return __builtin_fmin(__a, __b); +} +inline double min(float const __a, double const __b) { + return __builtin_fmin(__a, __b); +} +inline double min(double const __a, float const __b) { + return __builtin_fmin(__a, __b); } -__host__ inline static int max(int __arg1, int __arg2) { - return __arg1 > __arg2 ? __arg1 : __arg2; +inline float max(float const __a, float const __b) { + return __builtin_fmaxf(__a, __b); +} +inline double max(double const __a, double const __b) { + return __builtin_fmax(__a, __b); } -#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) +inline double max(float const __a, double const __b) { + return __builtin_fmax(__a, __b); +} +inline double max(double const __a, float const __b) { + return __builtin_fmax(__a, __b); +} +#endif // if __HIP_DEFINE_EXTENDED_HOST_MIN_MAX__ + +#pragma pop_macro("DEFINE_MIN_MAX_FUNCTIONS") + +#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && + // !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__) #endif #pragma pop_macro("__DEVICE__") From f77f2b9c566858b3c6605ab02f4bbd56000f732f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 07:34:31 +0700 Subject: [PATCH 0174/1029] llvm-reduce: Try to preserve instruction metadata as argument attributes (#133557) Fixes #131825 --- llvm/include/llvm/IR/Attributes.h | 6 ++ llvm/lib/IR/Attributes.cpp | 32 ++++++++ ...operands-to-args-metadata-to-attributes.ll | 77 +++++++++++++++++++ .../deltas/ReduceOperandsToArgs.cpp | 11 ++- 4 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-reduce/reduce-operands-to-args-metadata-to-attributes.ll diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index d6533b9bcbea1..5252f26f398d2 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -42,6 +42,7 @@ class ConstantRangeList; class FoldingSetNodeID; class Function; class LLVMContext; +class Instruction; class Type; class raw_ostream; enum FPClassTest : unsigned; @@ -1285,6 +1286,11 @@ class AttrBuilder { /// Add initializes attribute. AttrBuilder &addInitializesAttr(const ConstantRangeList &CRL); + /// Add 0 or more parameter attributes which are equivalent to metadata + /// attached to \p I. e.g. !align -> align. This assumes the argument type is + /// the same as the original instruction and the attribute is compatible. 
+  AttrBuilder &addFromEquivalentMetadata(const Instruction &I);
+
   ArrayRef<Attribute> attrs() const { return Attrs; }
 
   bool operator==(const AttrBuilder &B) const;
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 8da1dfe914818..8cb8b0d927afd 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -2291,6 +2291,38 @@ AttrBuilder &AttrBuilder::addInitializesAttr(const ConstantRangeList &CRL) {
   return addConstantRangeListAttr(Attribute::Initializes, CRL.rangesRef());
 }
 
+AttrBuilder &AttrBuilder::addFromEquivalentMetadata(const Instruction &I) {
+  if (const MDNode *NonNull = I.getMetadata(LLVMContext::MD_nonnull))
+    addAttribute(Attribute::NonNull);
+
+  if (const MDNode *NoUndef = I.getMetadata(LLVMContext::MD_noundef))
+    addAttribute(Attribute::NoUndef);
+
+  if (const MDNode *Align = I.getMetadata(LLVMContext::MD_align)) {
+    ConstantInt *CI = mdconst::extract<ConstantInt>(Align->getOperand(0));
+    addAlignmentAttr(CI->getZExtValue());
+  }
+
+  if (const MDNode *Dereferenceable =
+          I.getMetadata(LLVMContext::MD_dereferenceable)) {
+    ConstantInt *CI =
+        mdconst::extract<ConstantInt>(Dereferenceable->getOperand(0));
+    addDereferenceableAttr(CI->getZExtValue());
+  }
+
+  if (const MDNode *DereferenceableOrNull =
+          I.getMetadata(LLVMContext::MD_dereferenceable_or_null)) {
+    ConstantInt *CI =
+        mdconst::extract<ConstantInt>(DereferenceableOrNull->getOperand(0));
+    addDereferenceableAttr(CI->getZExtValue());
+  }
+
+  if (const MDNode *Range = I.getMetadata(LLVMContext::MD_range))
+    addRangeAttr(getConstantRangeFromMetadata(*Range));
+
+  return *this;
+}
+
 AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
   // TODO: Could make this O(n) as we're merging two sorted lists.
   for (const auto &I : B.attrs())
diff --git a/llvm/test/tools/llvm-reduce/reduce-operands-to-args-metadata-to-attributes.ll b/llvm/test/tools/llvm-reduce/reduce-operands-to-args-metadata-to-attributes.ll
new file mode 100644
index 0000000000000..913ba9d3218fd
--- /dev/null
+++ b/llvm/test/tools/llvm-reduce/reduce-operands-to-args-metadata-to-attributes.ll
@@ -0,0 +1,77 @@
+; Check that equivalent parameter attributes are introduced when
+; moving instructions with metadata to arguments.
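+; For example, the !nonnull annotation on the load in @use_nonnull below is
+; expected to become a nonnull attribute on the new pointer argument.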
+ +; RUN: llvm-reduce %s -o %t --abort-on-invalid-reduction --delta-passes=operands-to-args --test FileCheck --test-arg %s --test-arg --check-prefix=INTERESTING --test-arg --input-file +; RUN: FileCheck --input-file %t --check-prefix=REDUCED %s + +; INTERESTING-LABEL: define ptr @use_nonnull( +; REDUCED-LABEL: define ptr @use_nonnull(ptr nonnull %nonnull) { +define ptr @use_nonnull() { + %nonnull = load ptr, ptr null, !nonnull !0 + ret ptr %nonnull +} + +; INTERESTING-LABEL: define void @use_noundef( +; REDUCED-LABEL: define void @use_noundef(ptr noundef %noundef, <2 x ptr> noundef %noundef_vec) { +define void @use_noundef() { + %noundef = load ptr, ptr null, !noundef !0 + %noundef_vec = load <2 x ptr>, ptr null, !noundef !0 + store ptr %noundef, ptr null + store <2 x ptr> %noundef_vec, ptr null + ret void +} + +; INTERESTING-LABEL: define ptr @use_align( +; REDUCED-LABEL: define ptr @use_align(ptr align 16 %align) { +define ptr @use_align() { + %align = load ptr, ptr null, !align !1 + ret ptr %align +} + +; INTERESTING-LABEL: define ptr @use_dereferenceable( +; REDUCED-LABEL: define ptr @use_dereferenceable(ptr dereferenceable(12345) %deref) { +define ptr @use_dereferenceable() { + %deref = load ptr, ptr null, !dereferenceable !2 + ret ptr %deref +} + +; INTERESTING-LABEL: define ptr @use_dereferenceable_or_null( +; REDUCED-LABEL: define ptr @use_dereferenceable_or_null(ptr dereferenceable(77777) %deref) { +define ptr @use_dereferenceable_or_null() { + %deref = load ptr, ptr null, !dereferenceable_or_null !3 + ret ptr %deref +} + +; INTERESTING-LABEL: define void @use_range( +; REDUCED-LABEL: define void @use_range(i32 range(i32 8, 25) %simple_range, i32 range(i32 8, 420) %disjoint_range, i32 range(i32 42, 0) %wrapping_range, <2 x i32> range(i32 8, 25) %vector_range) { +define void @use_range() { + %simple_range = load i32, ptr null, !range !4 + %disjoint_range = load i32, ptr null, !range !5 + %wrapping_range = load i32, ptr null, !range !6 + %vector_range = load <2 x i32>, ptr null, !range !4 + store i32 %simple_range, ptr null + store i32 %disjoint_range, ptr null + store i32 %wrapping_range, ptr null + store <2 x i32> %vector_range, ptr null + ret void +} + +; INTERESTING-LABEL: define void @use_noundef_range( +; REDUCED-LABEL: define void @use_noundef_range(i32 noundef range(i32 8, 25) %load, <2 x i32> noundef range(i32 8, 25) %load_vec) { +define void @use_noundef_range() { + %load = load i32, ptr null, !range !4, !noundef !0 + %load_vec = load <2 x i32>, ptr null, !range !4, !noundef !0 + store i32 %load, ptr null + store <2 x i32> %load_vec, ptr null + ret void +} + + + +!0 = !{} +!1 = !{i64 16} +!2 = !{i64 12345} +!3 = !{i64 77777} +!4 = !{i32 8, i32 25} +!5 = !{i32 8, i32 25, i32 69, i32 420} +!6 = !{i32 42, i32 0} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp index 0d984622bc298..3548130d3276a 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp @@ -159,6 +159,8 @@ static void substituteOperandWithArgument(Function *OldF, VMap[&OldArg] = &NewArg; // Add mapping to VMap } + LLVMContext &Ctx = OldF->getContext(); + // Adjust the new parameters. ValueToValueMapTy OldValMap; for (auto Z : zip_first(UniqueValues, drop_begin(NewF->args(), ArgOffset))) { @@ -175,9 +177,16 @@ static void substituteOperandWithArgument(Function *OldF, // Replace the actual operands. 
   for (Use *Op : OpsToReplace) {
-    Value *NewArg = OldValMap.lookup(Op->get());
+    Argument *NewArg = cast<Argument>(OldValMap.lookup(Op->get()));
     auto *NewUser = cast<Instruction>(VMap.lookup(Op->getUser()));
 
+    // Try to preserve any information contained in metadata annotations as
+    // the equivalent parameter attributes if possible.
+    if (auto *MDSrcInst = dyn_cast<Instruction>(Op)) {
+      AttrBuilder AB(Ctx);
+      NewArg->addAttrs(AB.addFromEquivalentMetadata(*MDSrcInst));
+    }
+
     if (PHINode *NewPhi = dyn_cast<PHINode>(NewUser)) {
       PHINode *OldPhi = cast<PHINode>(Op->getUser());
       BasicBlock *OldBB = OldPhi->getIncomingBlock(*Op);
From f9282475b305c0d2428640fa6586fe70f9b9f8d6 Mon Sep 17 00:00:00 2001
From: YunQiang Su
Date: Tue, 1 Apr 2025 08:48:10 +0800
Subject: [PATCH 0175/1029] Revert "LLVM/Test: Add vectorizing testcases for
 fminimumnum and fmaximumnum (#133690)"

This reverts commit de053bb4b0db64aebdff7719ff6ce75487f6ba5d.
---
 .../Transforms/LoopVectorize/fminimumnum.ll   | 489 ------------------
 1 file changed, 489 deletions(-)
 delete mode 100644 llvm/test/Transforms/LoopVectorize/fminimumnum.ll

diff --git a/llvm/test/Transforms/LoopVectorize/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/fminimumnum.ll
deleted file mode 100644
index 66ad9e9a0e5dd..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/fminimumnum.ll
+++ /dev/null
@@ -1,489 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
-; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v" -S < %s | FileCheck %s --check-prefix=RV64
-; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s --check-prefix=ARM64
-; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s --check-prefix=X64
-
-@input1_f32 = global [4096 x float] zeroinitializer, align 4
-@input2_f32 = global [4096 x float] zeroinitializer, align 4
-@output_f32 = global [4096 x float] zeroinitializer, align 4
-@input1_f64 = global [4096 x double] zeroinitializer, align 8
-@input2_f64 = global [4096 x double] zeroinitializer, align 8
-@output_f64 = global [4096 x double] zeroinitializer, align 8
-@input1_f16 = global [4096 x half] zeroinitializer, align 2
-@input2_f16 = global [4096 x half] zeroinitializer, align 2
-@output_f16 = global [4096 x half] zeroinitializer, align 2
-
-define void @f32min() {
-; RV64-LABEL: define void @f32min(
-; RV64-SAME: ) #[[ATTR0:[0-9]+]] {
-; RV64-NEXT: [[ENTRY:.*]]:
-; RV64-NEXT: br label %[[FOR_BODY:.*]]
-; RV64: [[FOR_COND_CLEANUP:.*]]:
-; RV64-NEXT: ret void
-; RV64: [[FOR_BODY]]:
-; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
-; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4
-; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
-; RV64-NEXT: [[TMP16:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP14]], float [[TMP15]])
-; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]]
-; RV64-NEXT: store float [[TMP16]], ptr [[ARRAYIDX4]], align 4
-; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096
-; RV64-NEXT: br i1 
[[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; ARM64-LABEL: define void @f32min( -; ARM64-SAME: ) #[[ATTR0:[0-9]+]] { -; ARM64-NEXT: [[ENTRY:.*]]: -; ARM64-NEXT: br label %[[FOR_BODY:.*]] -; ARM64: [[FOR_COND_CLEANUP:.*]]: -; ARM64-NEXT: ret void -; ARM64: [[FOR_BODY]]: -; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; ARM64-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]]) -; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 -; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; X64-LABEL: define void @f32min() { -; X64-NEXT: [[ENTRY:.*]]: -; X64-NEXT: br label %[[FOR_BODY:.*]] -; X64: [[FOR_COND_CLEANUP:.*]]: -; X64-NEXT: ret void -; X64: [[FOR_BODY]]: -; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; X64-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]]) -; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 -; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv - %1 = load float, ptr %arrayidx2, align 4 - %2 = tail call float @llvm.minimumnum.f32(float %0, float %1) - %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv - store float %2, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} - -declare float @llvm.minimumnum.f32(float, float) - -define void @f32max() { -; RV64-LABEL: define void @f32max( -; RV64-SAME: ) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: 
[[FOR_COND_CLEANUP:.*]]: -; RV64-NEXT: ret void -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; RV64-NEXT: [[TMP16:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP14]], float [[TMP15]]) -; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: store float [[TMP16]], ptr [[ARRAYIDX4]], align 4 -; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; ARM64-LABEL: define void @f32max( -; ARM64-SAME: ) #[[ATTR0]] { -; ARM64-NEXT: [[ENTRY:.*]]: -; ARM64-NEXT: br label %[[FOR_BODY:.*]] -; ARM64: [[FOR_COND_CLEANUP:.*]]: -; ARM64-NEXT: ret void -; ARM64: [[FOR_BODY]]: -; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; ARM64-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]]) -; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 -; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; X64-LABEL: define void @f32max() { -; X64-NEXT: [[ENTRY:.*]]: -; X64-NEXT: br label %[[FOR_BODY:.*]] -; X64: [[FOR_COND_CLEANUP:.*]]: -; X64-NEXT: ret void -; X64: [[FOR_BODY]]: -; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; X64-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]]) -; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: store float [[TMP14]], ptr [[ARRAYIDX4]], align 4 -; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret 
void - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds nuw [4096 x float], ptr @input1_f32, i64 0, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr @input2_f32, i64 0, i64 %indvars.iv - %1 = load float, ptr %arrayidx2, align 4 - %2 = tail call float @llvm.maximumnum.f32(float %0, float %1) - %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr @output_f32, i64 0, i64 %indvars.iv - store float %2, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} - -declare float @llvm.maximumnum.f32(float, float) - -define void @f64min() { -; RV64-LABEL: define void @f64min( -; RV64-SAME: ) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP:.*]]: -; RV64-NEXT: ret void -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 -; RV64-NEXT: [[TMP16:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP14]], double [[TMP15]]) -; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: store double [[TMP16]], ptr [[ARRAYIDX4]], align 8 -; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; ARM64-LABEL: define void @f64min( -; ARM64-SAME: ) #[[ATTR0]] { -; ARM64-NEXT: [[ENTRY:.*]]: -; ARM64-NEXT: br label %[[FOR_BODY:.*]] -; ARM64: [[FOR_COND_CLEANUP:.*]]: -; ARM64-NEXT: ret void -; ARM64: [[FOR_BODY]]: -; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 -; ARM64-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]]) -; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 -; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; X64-LABEL: define void @f64min() { -; X64-NEXT: [[ENTRY:.*]]: -; X64-NEXT: br label %[[FOR_BODY:.*]] -; X64: [[FOR_COND_CLEANUP:.*]]: -; X64-NEXT: ret void -; X64: [[FOR_BODY]]: -; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] 
], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 -; X64-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]]) -; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 -; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv - %0 = load double, ptr %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv - %1 = load double, ptr %arrayidx2, align 8 - %2 = tail call double @llvm.minimumnum.f64(double %0, double %1) - %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv - store double %2, ptr %arrayidx4, align 8 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} - -declare double @llvm.minimumnum.f64(double, double) - -define void @f64max() { -; RV64-LABEL: define void @f64max( -; RV64-SAME: ) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP:.*]]: -; RV64-NEXT: ret void -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 -; RV64-NEXT: [[TMP16:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP14]], double [[TMP15]]) -; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: store double [[TMP16]], ptr [[ARRAYIDX4]], align 8 -; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; ARM64-LABEL: define void @f64max( -; ARM64-SAME: ) #[[ATTR0]] { -; ARM64-NEXT: [[ENTRY:.*]]: -; ARM64-NEXT: br label %[[FOR_BODY:.*]] -; ARM64: [[FOR_COND_CLEANUP:.*]]: -; ARM64-NEXT: ret void -; ARM64: [[FOR_BODY]]: -; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] -; 
ARM64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 -; ARM64-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]]) -; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 -; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; X64-LABEL: define void @f64max() { -; X64-NEXT: [[ENTRY:.*]]: -; X64-NEXT: br label %[[FOR_BODY:.*]] -; X64: [[FOR_COND_CLEANUP:.*]]: -; X64-NEXT: ret void -; X64: [[FOR_BODY]]: -; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP12:%.*]] = load double, ptr [[ARRAYIDX]], align 8 -; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 -; X64-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]]) -; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: store double [[TMP14]], ptr [[ARRAYIDX4]], align 8 -; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds nuw [4096 x double], ptr @input1_f64, i64 0, i64 %indvars.iv - %0 = load double, ptr %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr @input2_f64, i64 0, i64 %indvars.iv - %1 = load double, ptr %arrayidx2, align 8 - %2 = tail call double @llvm.maximumnum.f64(double %0, double %1) - %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr @output_f64, i64 0, i64 %indvars.iv - store double %2, ptr %arrayidx4, align 8 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} - -declare double @llvm.maximumnum.f64(double, double) - -define void @f16min() { -; RV64-LABEL: define void @f16min( -; RV64-SAME: ) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP:.*]]: -; RV64-NEXT: ret void -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: 
[[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 -; RV64-NEXT: [[TMP16:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP14]], half [[TMP15]]) -; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: store half [[TMP16]], ptr [[ARRAYIDX4]], align 2 -; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; ARM64-LABEL: define void @f16min( -; ARM64-SAME: ) #[[ATTR0]] { -; ARM64-NEXT: [[ENTRY:.*]]: -; ARM64-NEXT: br label %[[FOR_BODY:.*]] -; ARM64: [[FOR_COND_CLEANUP:.*]]: -; ARM64-NEXT: ret void -; ARM64: [[FOR_BODY]]: -; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 -; ARM64-NEXT: [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]]) -; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 -; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; X64-LABEL: define void @f16min() { -; X64-NEXT: [[ENTRY:.*]]: -; X64-NEXT: br label %[[FOR_BODY:.*]] -; X64: [[FOR_COND_CLEANUP:.*]]: -; X64-NEXT: ret void -; X64: [[FOR_BODY]]: -; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 -; X64-NEXT: [[TMP10:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP8]], half [[TMP9]]) -; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 -; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv - %0 = load half, ptr %arrayidx, align 2 - %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv - %1 = load half, ptr %arrayidx2, align 2 - %2 = tail call half @llvm.minimumnum.f16(half %0, half %1) - %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 
0, i64 %indvars.iv - store half %2, ptr %arrayidx4, align 2 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} - -declare half @llvm.minimumnum.f16(half, half) - -define void @f16max() { -; RV64-LABEL: define void @f16max( -; RV64-SAME: ) #[[ATTR0]] { -; RV64-NEXT: [[ENTRY:.*]]: -; RV64-NEXT: br label %[[FOR_BODY:.*]] -; RV64: [[FOR_COND_CLEANUP:.*]]: -; RV64-NEXT: ret void -; RV64: [[FOR_BODY]]: -; RV64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; RV64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP14:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; RV64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: [[TMP15:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 -; RV64-NEXT: [[TMP16:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP14]], half [[TMP15]]) -; RV64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] -; RV64-NEXT: store half [[TMP16]], ptr [[ARRAYIDX4]], align 2 -; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; RV64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; RV64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; ARM64-LABEL: define void @f16max( -; ARM64-SAME: ) #[[ATTR0]] { -; ARM64-NEXT: [[ENTRY:.*]]: -; ARM64-NEXT: br label %[[FOR_BODY:.*]] -; ARM64: [[FOR_COND_CLEANUP:.*]]: -; ARM64-NEXT: ret void -; ARM64: [[FOR_BODY]]: -; ARM64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; ARM64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; ARM64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 -; ARM64-NEXT: [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]]) -; ARM64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 [[INDVARS_IV]] -; ARM64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 -; ARM64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; ARM64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; ARM64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -; X64-LABEL: define void @f16max() { -; X64-NEXT: [[ENTRY:.*]]: -; X64-NEXT: br label %[[FOR_BODY:.*]] -; X64: [[FOR_COND_CLEANUP:.*]]: -; X64-NEXT: ret void -; X64: [[FOR_BODY]]: -; X64-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; X64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP8:%.*]] = load half, ptr [[ARRAYIDX]], align 2 -; X64-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: [[TMP9:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 -; X64-NEXT: [[TMP10:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP8]], half [[TMP9]]) -; X64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], 
ptr @output_f16, i64 0, i64 [[INDVARS_IV]] -; X64-NEXT: store half [[TMP10]], ptr [[ARRAYIDX4]], align 2 -; X64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; X64-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; X64-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]] -; -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %entry, %for.body - %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds nuw [4096 x half], ptr @input1_f16, i64 0, i64 %indvars.iv - %0 = load half, ptr %arrayidx, align 2 - %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr @input2_f16, i64 0, i64 %indvars.iv - %1 = load half, ptr %arrayidx2, align 2 - %2 = tail call half @llvm.maximumnum.f16(half %0, half %1) - %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr @output_f16, i64 0, i64 %indvars.iv - store half %2, ptr %arrayidx4, align 2 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, 4096 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} - -declare half @llvm.maximumnum.f16(half, half) From 508a6b2e01069f12150321ec779b3d30d4e76a6e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 31 Mar 2025 17:59:02 -0700 Subject: [PATCH 0176/1029] [RISCV] Use decodeUImmLog2XLenNonZeroOperand in decodeRVCInstrRdRs1UImm. NFC (#133759) decodeUImmLog2XLenNonZeroOperand already contains the uimm5 check for RV32 so we can reuse it. This makes C_SLLI_HINT code more similar to the tblgen code for C_SLLI. --- .../RISCV/Disassembler/RISCVDisassembler.cpp | 26 ++++++++----------- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 2 +- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b22a4a7246c23..cda34ac01d7c0 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -488,9 +488,10 @@ static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); +static DecodeStatus +decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn, uint64_t Address, @@ -553,21 +554,16 @@ static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, uint32_t Insn, return MCDisassembler::Success; } -static DecodeStatus decodeRVCInstrRdRs1UImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { +static DecodeStatus +decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createReg(RISCV::X0)); Inst.addOperand(Inst.getOperand(0)); - uint32_t UImm6 = fieldFromInstruction(Insn, 12, 1) << 5; - // On RV32C, uimm[5]=1 is reserved for custom extensions. 
- if (UImm6 != 0 && Decoder->getSubtargetInfo().hasFeature(RISCV::Feature32Bit)) - return MCDisassembler::Fail; - UImm6 |= fieldFromInstruction(Insn, 2, 5); - [[maybe_unused]] DecodeStatus Result = - decodeUImmOperand<6>(Inst, UImm6, Address, Decoder); - assert(Result == MCDisassembler::Success && "Invalid immediate"); - return MCDisassembler::Success; + uint32_t UImm6 = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + return decodeUImmLog2XLenNonZeroOperand(Inst, UImm6, Address, Decoder); } static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 199d056986dc2..eafd2844a691c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -661,7 +661,7 @@ def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdRs1UImm"; + let DecoderMethod = "decodeRVCInstrRdRs1UImmLog2XLenNonZero"; } def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), From b2d272ccfb9407ded7e54d1eabd5b5743aa9dd1b Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Mon, 31 Mar 2025 18:31:33 -0700 Subject: [PATCH 0177/1029] [BOLT][X86] Fix getTargetSymbol() (#133834) In 96e5ee2, I inadvertently broke the way non-trivial symbol references got updated from non-optimized code. The breakage was a consequence of `getTargetSymbol(MCExpr *)` not returning a symbol when the parameter was a binary expression. Fix `getTargetSymbol()` to cover such cases. --- bolt/include/bolt/Core/MCPlusBuilder.h | 6 +++- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 12 ++----- bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp | 10 +----- bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 6 +--- bolt/test/X86/lite-mode-target-expr.s | 33 +++++++++++++++++++ 5 files changed, 42 insertions(+), 25 deletions(-) create mode 100644 bolt/test/X86/lite-mode-target-expr.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 6860f021eb849..1458d36d4813a 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1266,7 +1266,11 @@ class MCPlusBuilder { /// Return MCSymbol extracted from the expression. 
  virtual const MCSymbol *getTargetSymbol(const MCExpr *Expr) const {
-    if (auto *SymbolRefExpr = dyn_cast<MCSymbolRefExpr>(Expr))
+    if (auto *BinaryExpr = dyn_cast<MCBinaryExpr>(Expr))
+      return getTargetSymbol(BinaryExpr->getLHS());
+
+    auto *SymbolRefExpr = dyn_cast<MCSymbolRefExpr>(Expr);
+    if (SymbolRefExpr && SymbolRefExpr->getKind() == MCSymbolRefExpr::VK_None)
       return &SymbolRefExpr->getSymbol();
 
     return nullptr;
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index d238a1df5c7d7..0fd127bfeba41 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -862,20 +862,12 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     if (AArchExpr && AArchExpr->getSubExpr())
       return getTargetSymbol(AArchExpr->getSubExpr());
 
-    auto *BinExpr = dyn_cast<MCBinaryExpr>(Expr);
-    if (BinExpr)
-      return getTargetSymbol(BinExpr->getLHS());
-
-    auto *SymExpr = dyn_cast<MCSymbolRefExpr>(Expr);
-    if (SymExpr && SymExpr->getKind() == MCSymbolRefExpr::VK_None)
-      return &SymExpr->getSymbol();
-
-    return nullptr;
+    return MCPlusBuilder::getTargetSymbol(Expr);
   }
 
   const MCSymbol *getTargetSymbol(const MCInst &Inst,
                                   unsigned OpNum = 0) const override {
-    if (!getSymbolRefOperandNum(Inst, OpNum))
+    if (!OpNum && !getSymbolRefOperandNum(Inst, OpNum))
       return nullptr;
 
     const MCOperand &Op = Inst.getOperand(OpNum);
diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index edbbdc491aee4..4320c679acd54 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -338,15 +338,7 @@ class RISCVMCPlusBuilder : public MCPlusBuilder {
     if (RISCVExpr && RISCVExpr->getSubExpr())
       return getTargetSymbol(RISCVExpr->getSubExpr());
 
-    auto *BinExpr = dyn_cast<MCBinaryExpr>(Expr);
-    if (BinExpr)
-      return getTargetSymbol(BinExpr->getLHS());
-
-    auto *SymExpr = dyn_cast<MCSymbolRefExpr>(Expr);
-    if (SymExpr && SymExpr->getKind() == MCSymbolRefExpr::VK_None)
-      return &SymExpr->getSymbol();
-
-    return nullptr;
+    return MCPlusBuilder::getTargetSymbol(Expr);
   }
 
   const MCSymbol *getTargetSymbol(const MCInst &Inst,
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 4016ffe18dc02..0b2617600f5c0 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1796,11 +1796,7 @@ class X86MCPlusBuilder : public MCPlusBuilder {
     if (!Op.isExpr())
       return nullptr;
 
-    auto *SymExpr = dyn_cast<MCSymbolRefExpr>(Op.getExpr());
-    if (!SymExpr || SymExpr->getKind() != MCSymbolRefExpr::VK_None)
-      return nullptr;
-
-    return &SymExpr->getSymbol();
+    return MCPlusBuilder::getTargetSymbol(Op.getExpr());
   }
 
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
diff --git a/bolt/test/X86/lite-mode-target-expr.s b/bolt/test/X86/lite-mode-target-expr.s
new file mode 100644
index 0000000000000..5480748c20f05
--- /dev/null
+++ b/bolt/test/X86/lite-mode-target-expr.s
@@ -0,0 +1,33 @@
+## Check that llvm-bolt properly updates references in unoptimized code when
+## such references are non-trivial expressions.
+
+# RUN: %clang %cflags %s -o %t.exe -Wl,-q -no-pie
+# RUN: llvm-bolt %t.exe -o %t.bolt --funcs=_start
+# RUN: llvm-objdump -d --disassemble-symbols=_start %t.bolt > %t.out
+# RUN: llvm-objdump -d --disassemble-symbols=cold %t.bolt >> %t.out
+# RUN: FileCheck %s < %t.out
+
+## _start() will be optimized and assigned a new address.
+# CHECK: [[#%x,ADDR:]] <_start>:
+
+## cold() is not optimized, but references to _start are updated.
+# CHECK-LABEL: <cold>:
+# CHECK-NEXT: movl $0x[[#ADDR - 1]], %ecx
+# CHECK-NEXT: movl $0x[[#ADDR]], %ecx
+# CHECK-NEXT: movl $0x[[#ADDR + 1]], %ecx
+
+  .text
+  .globl cold
+  .type cold, %function
+cold:
+  movl $_start-1, %ecx
+  movl $_start, %ecx
+  movl $_start+1, %ecx
+  ret
+  .size cold, .-cold
+
+  .globl _start
+  .type _start, %function
+_start:
+  ret
+  .size _start, .-_start

From 0b8c8ed04211dae629811f24e6033e5c2185508f Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Mon, 31 Mar 2025 19:36:05 -0700
Subject: [PATCH 0178/1029] [lldb] Fix use-after-free in SBMutexTest (#133840)

The `locked` variable can be accessed from the asynchronous thread
until the call to f.wait() completes. However, the variable is scoped
in a lexical block that ends before that, leading to a use-after-free.
---
 lldb/unittests/API/SBMutexTest.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lldb/unittests/API/SBMutexTest.cpp b/lldb/unittests/API/SBMutexTest.cpp
index 0b888c2725aa9..aafad59d58c17 100644
--- a/lldb/unittests/API/SBMutexTest.cpp
+++ b/lldb/unittests/API/SBMutexTest.cpp
@@ -32,10 +32,9 @@ class SBMutexTest : public testing::Test {
 
 TEST_F(SBMutexTest, LockTest) {
   lldb::SBTarget target = debugger.GetDummyTarget();
-
+  std::atomic<bool> locked = false;
   std::future<void> f;
   {
-    std::atomic<bool> locked = false;
     lldb::SBMutex lock = target.GetAPIMutex();
     std::lock_guard lock_guard(lock);
     ASSERT_FALSE(locked.exchange(true));

From a417a868cd2dad41765e43715379a54289f7da67 Mon Sep 17 00:00:00 2001
From: John Harrison
Date: Mon, 31 Mar 2025 19:50:36 -0700
Subject: [PATCH 0179/1029] [lldb-dap] Enable runInTerminal tests on macOS.
 (#133824)

These tests are currently filtered on macOS if you're on an M1 (or
newer) device. These tests do work on macOS, for me at least on an M1
Max with macOS 15.3.2 and Xcode 16.2. Enabling them again, but if we
have CI problems with them we can disable them again.
---
 .../restart/TestDAP_restart_runInTerminal.py |  4 ++--
 .../runInTerminal/TestDAP_runInTerminal.py   | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
index 5a9938c25c2c8..a94c9860c1508 100644
--- a/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
+++ b/lldb/test/API/tools/lldb-dap/restart/TestDAP_restart_runInTerminal.py
@@ -21,7 +21,7 @@ def isTestSupported(self):
             return False
 
     @skipIfWindows
-    @skipIf(archs=["arm"])  # Always times out on buildbot
+    @skipIf(oslist=["linux"], archs=["arm"])  # Always times out on buildbot
     def test_basic_functionality(self):
         """
         Test basic restarting functionality when the process is running in
@@ -61,7 +61,7 @@ def test_basic_functionality(self):
         )
 
     @skipIfWindows
-    @skipIf(archs=["arm"])  # Always times out on buildbot
+    @skipIf(oslist=["linux"], archs=["arm"])  # Always times out on buildbot
     def test_stopOnEntry(self):
         """
         Check that stopOnEntry works correctly when using runInTerminal.
diff --git a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py index 9141565ac1b9b..9aab7ca3293db 100644 --- a/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py +++ b/lldb/test/API/tools/lldb-dap/runInTerminal/TestDAP_runInTerminal.py @@ -44,7 +44,7 @@ def isTestSupported(self): return False @skipIfWindows - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_runInTerminal(self): if not self.isTestSupported(): return @@ -90,7 +90,7 @@ def test_runInTerminal(self): env = self.dap_server.request_evaluate("foo")["body"]["result"] self.assertIn("bar", env) - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_runInTerminalWithObjectEnv(self): if not self.isTestSupported(): return @@ -114,7 +114,7 @@ def test_runInTerminalWithObjectEnv(self): self.assertEqual("BAR", request_envs["FOO"]) @skipIfWindows - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_runInTerminalInvalidTarget(self): if not self.isTestSupported(): return @@ -133,7 +133,7 @@ def test_runInTerminalInvalidTarget(self): ) @skipIfWindows - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_missingArgInRunInTerminalLauncher(self): if not self.isTestSupported(): return @@ -148,7 +148,7 @@ def test_missingArgInRunInTerminalLauncher(self): ) @skipIfWindows - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_FakeAttachedRunInTerminalLauncherWithInvalidProgram(self): if not self.isTestSupported(): return @@ -175,7 +175,7 @@ def test_FakeAttachedRunInTerminalLauncherWithInvalidProgram(self): self.assertIn("No such file or directory", stderr) @skipIfWindows - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_FakeAttachedRunInTerminalLauncherWithValidProgram(self): if not self.isTestSupported(): return @@ -202,7 +202,7 @@ def test_FakeAttachedRunInTerminalLauncherWithValidProgram(self): self.assertIn("foo", stdout) @skipIfWindows - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_FakeAttachedRunInTerminalLauncherAndCheckEnvironment(self): if not self.isTestSupported(): return @@ -223,7 +223,7 @@ def test_FakeAttachedRunInTerminalLauncherAndCheckEnvironment(self): self.assertIn("FOO=BAR", stdout) @skipIfWindows - @skipIf(archs=no_match(["x86_64"])) + @skipIf(oslist=["linux"], archs=no_match(["x86_64"])) def test_NonAttachedRunInTerminalLauncher(self): if not self.isTestSupported(): return From 145b4a39504b88a695f1f85f4d9da991bb9a2656 Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Tue, 1 Apr 2025 11:18:17 +0800 Subject: [PATCH 0180/1029] [AMDGPU][CodeGenPrepare] Narrow 64 bit math to 32 bit if profitable (#130577) For Add, Sub, Mul with Int64 type, if profitable, then do: 1. Trunc operands to Int32 type 2. Apply 32 bit Add/Sub/Mul 3. 
Zext to Int64 type
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  84 +++++++
 .../AMDGPU/amdgpu-codegenprepare-mul24.ll     |   5 +-
 .../atomic_optimizations_global_pointer.ll    |  52 ++--
 .../CodeGen/AMDGPU/narrow_math_for_and.ll     | 231 ++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll  |   9 +-
 5 files changed, 347 insertions(+), 34 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 9c482aeb3ea5c..eb5c160670992 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1561,6 +1561,87 @@ void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
   llvm_unreachable("not a division");
 }
 
+Type *findSmallestLegalBits(Instruction *I, int OrigBit, int MaxBitsNeeded,
+                            const TargetLowering *TLI, const DataLayout &DL) {
+  if (MaxBitsNeeded >= OrigBit)
+    return nullptr;
+
+  Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
+  while (OrigBit > MaxBitsNeeded) {
+    if (TLI->isOperationLegalOrCustom(
+            TLI->InstructionOpcodeToISD(I->getOpcode()),
+            TLI->getValueType(DL, NewType, true)))
+      return NewType;
+
+    MaxBitsNeeded *= 2;
+    NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
+  }
+  return nullptr;
+}
+
+static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI,
+                                      const TargetTransformInfo &TTI,
+                                      const DataLayout &DL) {
+  unsigned Opc = I->getOpcode();
+  Type *OldType = I->getType();
+
+  if (Opc != Instruction::Add && Opc != Instruction::Mul)
+    return false;
+
+  unsigned OrigBit = OldType->getScalarSizeInBits();
+  unsigned MaxBitsNeeded = OrigBit;
+
+  switch (Opc) {
+  case Instruction::Add:
+    MaxBitsNeeded = KnownBits::add(computeKnownBits(I->getOperand(0), DL),
+                                   computeKnownBits(I->getOperand(1), DL))
+                        .countMaxActiveBits();
+    break;
+  case Instruction::Mul:
+    MaxBitsNeeded = KnownBits::mul(computeKnownBits(I->getOperand(0), DL),
+                                   computeKnownBits(I->getOperand(1), DL))
+                        .countMaxActiveBits();
+    break;
+  default:
+    llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
+                     "Instruction::Mul.");
+  }
+
+  MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
+  Type *NewType = findSmallestLegalBits(I, OrigBit, MaxBitsNeeded, TLI, DL);
+
+  if (!NewType)
+    return false;
+
+  // Old cost
+  InstructionCost OldCost =
+      TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
+  // New cost of new op
+  InstructionCost NewCost =
+      TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
+  // New cost of narrowing 2 operands (use trunc)
+  NewCost += 2 * TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
+                                      TTI.getCastContextHint(I),
+                                      TTI::TCK_RecipThroughput);
+  // New cost of zext narrowed result to original type
+  NewCost +=
+      TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
+                           TTI.getCastContextHint(I), TTI::TCK_RecipThroughput);
+  if (NewCost >= OldCost)
+    return false;
+
+  IRBuilder<> Builder(I);
+  Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
+  Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
+  Value *Arith =
+      Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
+
+  Value *Zext = Builder.CreateZExt(Arith, OldType);
+  I->replaceAllUsesWith(Zext);
+  I->eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
   if (foldBinOpIntoSelect(I))
     return true;
@@ -1645,6 +1726,9 @@ bool 
AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { } } + Changed = tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), + TM.getTargetTransformInfo(F), DL); + return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll index 296b817bc8f75..d7c35a8b007c6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll @@ -414,7 +414,10 @@ define i64 @umul24_i64_2(i64 %lhs, i64 %rhs) { ; DISABLED-LABEL: @umul24_i64_2( ; DISABLED-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 65535 ; DISABLED-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 65535 -; DISABLED-NEXT: [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]] +; DISABLED-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32 +; DISABLED-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32 +; DISABLED-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]] +; DISABLED-NEXT: [[MUL:%.*]] = zext i32 [[TMP3]] to i64 ; DISABLED-NEXT: ret i64 [[MUL]] ; %lhs24 = and i64 %lhs, 65535 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 62083b3e67ab6..e2dfcf55b7856 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1823,22 +1823,22 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s9, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB3_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] +; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1264-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264-NEXT: s_wait_alu 0xfffe +; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 -; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 -; GFX1264-NEXT: v_mov_b32_e32 v1, s7 +; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 @@ -1860,20 +1860,19 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-LABEL: add_i64_constant: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-NEXT: s_mov_b32 s7, exec_lo -; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: s_mov_b32 s4, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 +; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 +; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_dual_mov_b32 
v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 @@ -1881,8 +1880,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB3_2: -; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 @@ -5372,22 +5370,22 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b32 s9, 0 -; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB9_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] +; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1264-NEXT: v_mov_b32_e32 v1, 0 +; GFX1264-NEXT: s_wait_alu 0xfffe +; GFX1264-NEXT: s_mul_i32 s6, s6, 5 ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 -; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 -; GFX1264-NEXT: v_mov_b32_e32 v1, s7 +; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 @@ -5412,20 +5410,19 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-LABEL: sub_i64_constant: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-NEXT: s_mov_b32 s7, exec_lo -; GFX1232-NEXT: s_mov_b32 s5, 0 -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo +; GFX1232-NEXT: s_mov_b32 s4, exec_lo +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 +; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 +; GFX1232-NEXT: s_mul_i32 s5, s5, 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 @@ -5433,8 +5430,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB9_2: -; GFX1232-NEXT: s_wait_alu 0xfffe -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll 
b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
new file mode 100644
index 0000000000000..3f49b1e550595
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
+
+define i64 @narrow_add(i64 %a, i64 %b) {
+; CHECK-LABEL: narrow_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483647
+  %zext1 = and i64 %b, 2147483647
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define i64 @narrow_add_1(i64 %a, i64 %b) {
+; CHECK-LABEL: narrow_add_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483647
+  %zext1 = and i64 %b, 2147483648
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: narrow_add_vec:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v4
+; CHECK-NEXT: v_and_b32_e32 v2, 30, v2
+; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v2, s0, v2, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
+  %zext1 = and <2 x i64> %b, <i64 2147483647, i64 2147483646>
+  %add = add <2 x i64> %zext0, %zext1
+  ret <2 x i64> %add
+}
+
+define <2 x i32> @narrow_add_vec_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: narrow_add_vec_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0x3fff, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0x4000, v0
+; CHECK-NEXT: v_and_b32_e32 v3, 0x4001, v3
+; CHECK-NEXT: v_and_b32_e32 v2, 0x4000, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; CHECK-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_pk_add_u16 v1, v0, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xc000, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i32> %a, <i32 16384, i32 16383>
+  %zext1 = and <2 x i32> %b, <i32 16384, i32 16385>
+  %add = add <2 x i32> %zext0, %zext1
+  ret <2 x i32> %add
+}
+
+define i64 @narrow_mul(i64 %a, i64 %b) {
+; CHECK-LABEL: narrow_mul:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 2, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483647
+  %zext1 = and i64 %b, 2
+  %mul = mul i64 %zext0, %zext1
+  ret i64 %mul
+}
+
+define i64 @narrow_mul_1(i64 %a, i64 %b) {
+; CHECK-LABEL: 
narrow_mul_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0xf73594, v0
+; CHECK-NEXT: v_and_b32_e32 v2, 0x100, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, v1, v2
+; CHECK-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 16201108
+  %zext1 = and i64 %b, 256
+  %mul = mul i64 %zext0, %zext1
+  ret i64 %mul
+}
+
+define <2 x i64> @narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: narrow_mul_vec:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x2d48aff, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x50, v4
+; CHECK-NEXT: v_and_b32_e32 v3, 50, v2
+; CHECK-NEXT: v_and_b32_e32 v4, 20, v6
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_mul_u32_u24_e32 v2, v3, v4
+; CHECK-NEXT: v_mul_hi_u32_u24_e32 v3, v3, v4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i64> %a, <i64 47483647, i64 50>
+  %zext1 = and <2 x i64> %b, <i64 80, i64 20>
+  %mul = mul <2 x i64> %zext0, %zext1
+  ret <2 x i64> %mul
+}
+
+define <2 x i32> @narrow_add_mul_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: narrow_add_mul_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 0x4000, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0x4000, v0
+; CHECK-NEXT: v_and_b32_e32 v2, 3, v2
+; CHECK-NEXT: v_and_b32_e32 v3, 2, v3
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, v0, v2
+; CHECK-NEXT: v_mul_u32_u24_e32 v1, v1, v3
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i32> %a, <i32 16384, i32 16384>
+  %zext1 = and <2 x i32> %b, <i32 3, i32 2>
+  %mul = mul <2 x i32> %zext0, %zext1
+  ret <2 x i32> %mul
+}
+
+define i64 @no_narrow_add(i64 %a, i64 %b) {
+; CHECK-LABEL: no_narrow_add:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x80000000, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483648
+  %zext1 = and i64 %b, 2147483648
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define i64 @no_narrow_add_1(i64 %a, i64 %b) {
+; CHECK-LABEL: no_narrow_add_1:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v1, 1, v2
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 4294967295
+  %zext1 = and i64 %b, 1
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define <2 x i64> @no_narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: no_narrow_add_vec:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT: v_and_b32_e32 v1, 0x80000000, v4
+; CHECK-NEXT: v_and_b32_e32 v2, 30, v2
+; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_add_co_u32 v2, s0, v2, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %zext0 = and <2 x i64> %a, + %zext1 = and <2 x i64> %b, + %add = add <2 x i64> %zext0, %zext1 + ret <2 x i64> %add +} + +define i64 @no_narrow_mul(i64 %a, i64 %b) { +; CHECK-LABEL: no_narrow_mul: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0 +; CHECK-NEXT: v_and_b32_e32 v1, 2, v2 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %zext0 = and i64 %a, 2147483648 + %zext1 = and i64 %b, 2 + %mul = mul i64 %zext0, %zext1 + ret i64 %mul +} + +define <2 x i64> @no_narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 { +; CHECK-LABEL: no_narrow_mul_vec: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v1, 0x8000, v0 +; CHECK-NEXT: v_and_b32_e32 v3, 0x20000, v4 +; CHECK-NEXT: v_and_b32_e32 v4, 50, v2 +; CHECK-NEXT: v_and_b32_e32 v5, 20, v6 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; CHECK-NEXT: v_mul_u32_u24_e32 v0, v1, v3 +; CHECK-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v3 +; CHECK-NEXT: v_mul_u32_u24_e32 v2, v4, v5 +; CHECK-NEXT: v_mul_hi_u32_u24_e32 v3, v4, v5 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %zext0 = and <2 x i64> %a, + %zext1 = and <2 x i64> %b, + %mul = mul <2 x i64> %zext0, %zext1 + ret <2 x i64> %mul +} diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 4290590e99711..4eb7761bfbddd 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -508,17 +508,16 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 1 -; SI-NEXT: s_add_u32 s4, s2, 0x3e7 -; SI-NEXT: s_addc_u32 s5, 0, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_and_b32 s2, s2, 0xff +; SI-NEXT: s_addk_i32 s2, 0x3e7 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; From 71cf59219162be67392b435dfcb9b280e1ff8681 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 31 Mar 2025 20:23:34 -0700 Subject: [PATCH 0181/1029] [IR] Fix -Wunused-but-set-variable --- llvm/lib/IR/Attributes.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 8cb8b0d927afd..33ac8bfaf4e7c 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2292,10 +2292,10 @@ AttrBuilder &AttrBuilder::addInitializesAttr(const ConstantRangeList &CRL) { } AttrBuilder &AttrBuilder::addFromEquivalentMetadata(const Instruction &I) { - if (const MDNode *NonNull = I.getMetadata(LLVMContext::MD_nonnull)) + if (I.hasMetadata(LLVMContext::MD_nonnull)) addAttribute(Attribute::NonNull); - if (const MDNode *NoUndef = I.getMetadata(LLVMContext::MD_noundef)) + if (I.hasMetadata(LLVMContext::MD_noundef)) 
addAttribute(Attribute::NoUndef); if (const MDNode *Align = I.getMetadata(LLVMContext::MD_align)) { From dd862356e20d2d7e0d0356dff5bd80623c14febc Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 31 Mar 2025 20:44:29 -0700 Subject: [PATCH 0182/1029] AsmPrinter: Remove ELF's special lowerRelativeReference for unnamed_addr function https://reviews.llvm.org/D17938 introduced lowerRelativeReference to give ConstantExpr sub (A-B) special semantics in ELF: when `A` is an `unnamed_addr` function, create a PLT-generating relocation. This was intended for C++ relative vtables, but C++ relative vtable ended up using DSOLocalEquivalent (lowerDSOLocalEquivalent). This special treatment of `unnamed_addr` seems unusual. Let's remove it. Only COFF needs an overload to generate a @IMGREL32 relocation specifier (llvm/test/MC/COFF/cross-section-relative.ll). Pull Request: https://github.com/llvm/llvm-project/pull/132684 --- .../CodeGen/TargetLoweringObjectFileImpl.h | 4 ---- .../CodeGen/TargetLoweringObjectFileImpl.cpp | 18 ------------------ ...plt-relative-reloc.ll => relative-reloc.ll} | 5 +++-- ...plt-relative-reloc.ll => relative-reloc.ll} | 5 +++-- ...-relative-reloc.ll => relative-reloc-32.ll} | 4 ++-- ...-relative-reloc.ll => relative-reloc-64.ll} | 5 +++-- 6 files changed, 11 insertions(+), 30 deletions(-) rename llvm/test/CodeGen/ARM/{plt-relative-reloc.ll => relative-reloc.ll} (78%) rename llvm/test/CodeGen/RISCV/{plt-relative-reloc.ll => relative-reloc.ll} (84%) rename llvm/test/CodeGen/X86/{x86-plt-relative-reloc.ll => relative-reloc-32.ll} (89%) rename llvm/test/CodeGen/X86/{x86-64-plt-relative-reloc.ll => relative-reloc-64.ll} (84%) diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index 8b0e5798d1b61..f035d81e85ddb 100644 --- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -125,10 +125,6 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile { lowerSymbolDifference(const MCSymbol *LHS, const MCSymbol *RHS, int64_t Addend, std::optional PCRelativeOffset) const; - const MCExpr *lowerRelativeReference(const GlobalValue *LHS, - const GlobalValue *RHS, int64_t Addend, - std::optional PCRelativeOffset, - const TargetMachine &TM) const override; const MCExpr *lowerDSOLocalEquivalent(const MCSymbol *LHS, const MCSymbol *RHS, int64_t Addend, diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 4c20c5dc74d9a..c9415292e88f7 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1233,24 +1233,6 @@ const MCExpr *TargetLoweringObjectFileELF::lowerSymbolDifference( return Res; } -const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference( - const GlobalValue *LHS, const GlobalValue *RHS, int64_t Addend, - std::optional PCRelativeOffset, const TargetMachine &TM) const { - // We may only use a PLT-relative relocation to refer to unnamed_addr - // functions. - if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) - return nullptr; - - // Basic correctness checks. 
- if (LHS->getType()->getPointerAddressSpace() != 0 || - RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || - RHS->isThreadLocal()) - return nullptr; - - return lowerSymbolDifference(TM.getSymbol(LHS), TM.getSymbol(RHS), Addend, - PCRelativeOffset); -} - // Reference the PLT entry of a function, optionally with a subtrahend (`RHS`). const MCExpr *TargetLoweringObjectFileELF::lowerDSOLocalEquivalent( const MCSymbol *LHS, const MCSymbol *RHS, int64_t Addend, diff --git a/llvm/test/CodeGen/ARM/plt-relative-reloc.ll b/llvm/test/CodeGen/ARM/relative-reloc.ll similarity index 78% rename from llvm/test/CodeGen/ARM/plt-relative-reloc.ll rename to llvm/test/CodeGen/ARM/relative-reloc.ll index ede891900e6d0..65053726e66bf 100644 --- a/llvm/test/CodeGen/ARM/plt-relative-reloc.ll +++ b/llvm/test/CodeGen/ARM/relative-reloc.ll @@ -10,7 +10,8 @@ declare void @fn1() unnamed_addr declare void @fn2() unnamed_addr declare void @fn3() +;; Create a PC-relative relocation that the linker might decline if the addend symbol is preemptible. ; CHECK: .long 0 -; CHECK-NEXT: .long fn1(prel31)-vtable-4 -; CHECK-NEXT: .long fn2(prel31)-vtable-4 +; CHECK-NEXT: .long fn1-vtable-4 +; CHECK-NEXT: .long fn2-vtable-4 ; CHECK-NEXT: .long fn3-vtable-4 diff --git a/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll b/llvm/test/CodeGen/RISCV/relative-reloc.ll similarity index 84% rename from llvm/test/CodeGen/RISCV/plt-relative-reloc.ll rename to llvm/test/CodeGen/RISCV/relative-reloc.ll index d2dceb773b2e9..6c94b9fce9308 100644 --- a/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll +++ b/llvm/test/CodeGen/RISCV/relative-reloc.ll @@ -12,10 +12,11 @@ declare void @fn2() unnamed_addr declare void @fn3() @global4 = external unnamed_addr global i8 +;; Create a PC-relative relocation that the linker might decline if the addend symbol is preemptible. ; CHECK: vtable: ; CHECK-NEXT: .word 0 # 0x0 -; CHECK-NEXT: .word %pltpcrel(fn1) -; CHECK-NEXT: .word %pltpcrel(fn2+4) +; CHECK-NEXT: .word fn1-vtable-4 +; CHECK-NEXT: .word fn2-vtable-4 ; CHECK-NEXT: .word fn3-vtable-4 ; CHECK-NEXT: .word global4-vtable-4 ; CHECK-NEXT: .size vtable, 20 diff --git a/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll b/llvm/test/CodeGen/X86/relative-reloc-32.ll similarity index 89% rename from llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll rename to llvm/test/CodeGen/X86/relative-reloc-32.ll index d5e80285b160d..7d0b1fd546a00 100644 --- a/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll +++ b/llvm/test/CodeGen/X86/relative-reloc-32.ll @@ -11,6 +11,6 @@ declare void @fn2() unnamed_addr declare void @fn3() ; CHECK: .long 0 -; CHECK-NEXT: .long fn1@PLT-vtable-4 -; CHECK-NEXT: .long fn2@PLT-vtable-4 +; CHECK-NEXT: .long fn1-vtable-4 +; CHECK-NEXT: .long fn2-vtable-4 ; CHECK-NEXT: .long fn3-vtable-4 diff --git a/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll b/llvm/test/CodeGen/X86/relative-reloc-64.ll similarity index 84% rename from llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll rename to llvm/test/CodeGen/X86/relative-reloc-64.ll index 54736c94af248..6f88edfa075b8 100644 --- a/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll +++ b/llvm/test/CodeGen/X86/relative-reloc-64.ll @@ -12,8 +12,9 @@ declare void @fn2() unnamed_addr declare void @fn3() @global4 = external unnamed_addr global i8 +;; Create a PC-relative relocation that the linker might decline if the addend symbol is preemptible. 
; CHECK: .long 0 -; CHECK-NEXT: .long fn1@PLT-vtable-4 -; CHECK-NEXT: .long fn2@PLT-vtable-4 +; CHECK-NEXT: .long fn1-vtable-4 +; CHECK-NEXT: .long fn2-vtable-4 ; CHECK-NEXT: .long fn3-vtable-4 ; CHECK-NEXT: .long global4-vtable-4 From 27b49288f7678d19cbda31904b6b5dbaa86124e7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 31 Mar 2025 20:47:08 -0700 Subject: [PATCH 0183/1029] [RISCV] Add exhaustive diassember tests for c.slli64. NFC (#133820) The c.slli encoding with a shift of 0 is c.slli64 for RV128 and a hint for RV32 and RV64. Add a test for this encoding to the exhaustive c.slli test. --- llvm/test/MC/Disassembler/RISCV/c_slli.txt | 96 ++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/llvm/test/MC/Disassembler/RISCV/c_slli.txt b/llvm/test/MC/Disassembler/RISCV/c_slli.txt index 6e8dc4c05147f..98ff2b132cc85 100644 --- a/llvm/test/MC/Disassembler/RISCV/c_slli.txt +++ b/llvm/test/MC/Disassembler/RISCV/c_slli.txt @@ -14,6 +14,8 @@ # RUN: -M no-aliases --show-encoding < %s 2>&1 | \ # RUN: FileCheck --check-prefix=NOHINTS %s +0x02 0x00 # GOOD: c.slli64 zero +0x02 0x00 # NOHINTS: invalid instruction encoding 0x06 0x00 # GOOD: c.slli zero, 1 0x06 0x00 # NOHINTS: invalid instruction encoding 0x0A 0x00 # GOOD: c.slli zero, 2 @@ -172,6 +174,10 @@ 0x7E 0x10 # BAD32: invalid instruction encoding 0x7E 0x10 # GOOD64: c.slli zero, 63 0x7E 0x10 # NOHINTS: invalid instruction encoding +# GOOD: c.slli64 ra +# NOHINTS: invalid instruction encoding +0x82 0x00 +0x86 0x00 # GOOD: c.slli ra, 1 0x86 0x00 # GOOD: c.slli ra, 1 0x8A 0x00 # GOOD: c.slli ra, 2 0x8E 0x00 # GOOD: c.slli ra, 3 @@ -267,6 +273,9 @@ 0xFA 0x10 # GOOD64: c.slli ra, 62 0xFE 0x10 # BAD32: invalid instruction encoding 0xFE 0x10 # GOOD64: c.slli ra, 63 +# GOOD: c.slli64 sp +# NOHINTS: invalid instruction encoding +0x02 0x01 0x06 0x01 # GOOD: c.slli sp, 1 0x0A 0x01 # GOOD: c.slli sp, 2 0x0E 0x01 # GOOD: c.slli sp, 3 @@ -362,6 +371,9 @@ 0x7A 0x11 # GOOD64: c.slli sp, 62 0x7E 0x11 # BAD32: invalid instruction encoding 0x7E 0x11 # GOOD64: c.slli sp, 63 +# GOOD: c.slli64 gp +# NOHINTS: invalid instruction encoding +0x82 0x01 0x86 0x01 # GOOD: c.slli gp, 1 0x8A 0x01 # GOOD: c.slli gp, 2 0x8E 0x01 # GOOD: c.slli gp, 3 @@ -457,6 +469,9 @@ 0xFA 0x11 # GOOD64: c.slli gp, 62 0xFE 0x11 # BAD32: invalid instruction encoding 0xFE 0x11 # GOOD64: c.slli gp, 63 +# GOOD: c.slli64 tp +# NOHINTS: invalid instruction encoding +0x02 0x02 0x06 0x02 # GOOD: c.slli tp, 1 0x0A 0x02 # GOOD: c.slli tp, 2 0x0E 0x02 # GOOD: c.slli tp, 3 @@ -552,6 +567,9 @@ 0x7A 0x12 # GOOD64: c.slli tp, 62 0x7E 0x12 # BAD32: invalid instruction encoding 0x7E 0x12 # GOOD64: c.slli tp, 63 +# GOOD: c.slli64 t0 +# NOHINTS: invalid instruction encoding +0x82 0x02 0x86 0x02 # GOOD: c.slli t0, 1 0x8A 0x02 # GOOD: c.slli t0, 2 0x8E 0x02 # GOOD: c.slli t0, 3 @@ -647,6 +665,9 @@ 0xFA 0x12 # GOOD64: c.slli t0, 62 0xFE 0x12 # BAD32: invalid instruction encoding 0xFE 0x12 # GOOD64: c.slli t0, 63 +# GOOD: c.slli64 t1 +# NOHINTS: invalid instruction encoding +0x02 0x03 0x06 0x03 # GOOD: c.slli t1, 1 0x0A 0x03 # GOOD: c.slli t1, 2 0x0E 0x03 # GOOD: c.slli t1, 3 @@ -742,6 +763,9 @@ 0x7A 0x13 # GOOD64: c.slli t1, 62 0x7E 0x13 # BAD32: invalid instruction encoding 0x7E 0x13 # GOOD64: c.slli t1, 63 +# GOOD: c.slli64 t2 +# NOHINTS: invalid instruction encoding +0x82 0x03 0x86 0x03 # GOOD: c.slli t2, 1 0x8A 0x03 # GOOD: c.slli t2, 2 0x8E 0x03 # GOOD: c.slli t2, 3 @@ -837,6 +861,9 @@ 0xFA 0x13 # GOOD64: c.slli t2, 62 0xFE 0x13 # BAD32: invalid instruction encoding 0xFE 0x13 # GOOD64: 
c.slli t2, 63 +# GOOD: c.slli64 s0 +# NOHINTS: invalid instruction encoding +0x02 0x04 0x06 0x04 # GOOD: c.slli s0, 1 0x0A 0x04 # GOOD: c.slli s0, 2 0x0E 0x04 # GOOD: c.slli s0, 3 @@ -932,6 +959,9 @@ 0x7A 0x14 # GOOD64: c.slli s0, 62 0x7E 0x14 # BAD32: invalid instruction encoding 0x7E 0x14 # GOOD64: c.slli s0, 63 +# GOOD: c.slli64 s1 +# NOHINTS: invalid instruction encoding +0x82 0x04 0x86 0x04 # GOOD: c.slli s1, 1 0x8A 0x04 # GOOD: c.slli s1, 2 0x8E 0x04 # GOOD: c.slli s1, 3 @@ -1027,6 +1057,9 @@ 0xFA 0x14 # GOOD64: c.slli s1, 62 0xFE 0x14 # BAD32: invalid instruction encoding 0xFE 0x14 # GOOD64: c.slli s1, 63 +# GOOD: c.slli64 a0 +# NOHINTS: invalid instruction encoding +0x02 0x05 0x06 0x05 # GOOD: c.slli a0, 1 0x0A 0x05 # GOOD: c.slli a0, 2 0x0E 0x05 # GOOD: c.slli a0, 3 @@ -1122,6 +1155,9 @@ 0x7A 0x15 # GOOD64: c.slli a0, 62 0x7E 0x15 # BAD32: invalid instruction encoding 0x7E 0x15 # GOOD64: c.slli a0, 63 +# GOOD: c.slli64 a1 +# NOHINTS: invalid instruction encoding +0x82 0x05 0x86 0x05 # GOOD: c.slli a1, 1 0x8A 0x05 # GOOD: c.slli a1, 2 0x8E 0x05 # GOOD: c.slli a1, 3 @@ -1217,6 +1253,9 @@ 0xFA 0x15 # GOOD64: c.slli a1, 62 0xFE 0x15 # BAD32: invalid instruction encoding 0xFE 0x15 # GOOD64: c.slli a1, 63 +# GOOD: c.slli64 a2 +# NOHINTS: invalid instruction encoding +0x02 0x06 0x06 0x06 # GOOD: c.slli a2, 1 0x0A 0x06 # GOOD: c.slli a2, 2 0x0E 0x06 # GOOD: c.slli a2, 3 @@ -1312,6 +1351,9 @@ 0x7A 0x16 # GOOD64: c.slli a2, 62 0x7E 0x16 # BAD32: invalid instruction encoding 0x7E 0x16 # GOOD64: c.slli a2, 63 +# GOOD: c.slli64 a3 +# NOHINTS: invalid instruction encoding +0x82 0x06 0x86 0x06 # GOOD: c.slli a3, 1 0x8A 0x06 # GOOD: c.slli a3, 2 0x8E 0x06 # GOOD: c.slli a3, 3 @@ -1407,6 +1449,9 @@ 0xFA 0x16 # GOOD64: c.slli a3, 62 0xFE 0x16 # BAD32: invalid instruction encoding 0xFE 0x16 # GOOD64: c.slli a3, 63 +# GOOD: c.slli64 a4 +# NOHINTS: invalid instruction encoding +0x02 0x07 0x06 0x07 # GOOD: c.slli a4, 1 0x0A 0x07 # GOOD: c.slli a4, 2 0x0E 0x07 # GOOD: c.slli a4, 3 @@ -1502,6 +1547,9 @@ 0x7A 0x17 # GOOD64: c.slli a4, 62 0x7E 0x17 # BAD32: invalid instruction encoding 0x7E 0x17 # GOOD64: c.slli a4, 63 +# GOOD: c.slli64 a5 +# NOHINTS: invalid instruction encoding +0x82 0x07 0x86 0x07 # GOOD: c.slli a5, 1 0x8A 0x07 # GOOD: c.slli a5, 2 0x8E 0x07 # GOOD: c.slli a5, 3 @@ -1597,6 +1645,9 @@ 0xFA 0x17 # GOOD64: c.slli a5, 62 0xFE 0x17 # BAD32: invalid instruction encoding 0xFE 0x17 # GOOD64: c.slli a5, 63 +# GOOD: c.slli64 a6 +# NOHINTS: invalid instruction encoding +0x02 0x08 0x06 0x08 # GOOD: c.slli a6, 1 0x0A 0x08 # GOOD: c.slli a6, 2 0x0E 0x08 # GOOD: c.slli a6, 3 @@ -1692,6 +1743,9 @@ 0x7A 0x18 # GOOD64: c.slli a6, 62 0x7E 0x18 # BAD32: invalid instruction encoding 0x7E 0x18 # GOOD64: c.slli a6, 63 +# GOOD: c.slli64 a7 +# NOHINTS: invalid instruction encoding +0x82 0x08 0x86 0x08 # GOOD: c.slli a7, 1 0x8A 0x08 # GOOD: c.slli a7, 2 0x8E 0x08 # GOOD: c.slli a7, 3 @@ -1787,6 +1841,9 @@ 0xFA 0x18 # GOOD64: c.slli a7, 62 0xFE 0x18 # BAD32: invalid instruction encoding 0xFE 0x18 # GOOD64: c.slli a7, 63 +# GOOD: c.slli64 s2 +# NOHINTS: invalid instruction encoding +0x02 0x09 0x06 0x09 # GOOD: c.slli s2, 1 0x0A 0x09 # GOOD: c.slli s2, 2 0x0E 0x09 # GOOD: c.slli s2, 3 @@ -1882,6 +1939,9 @@ 0x7A 0x19 # GOOD64: c.slli s2, 62 0x7E 0x19 # BAD32: invalid instruction encoding 0x7E 0x19 # GOOD64: c.slli s2, 63 +# GOOD: c.slli64 s3 +# NOHINTS: invalid instruction encoding +0x82 0x09 0x86 0x09 # GOOD: c.slli s3, 1 0x8A 0x09 # GOOD: c.slli s3, 2 0x8E 0x09 # GOOD: c.slli s3, 3 @@ -1977,6 +2037,9 @@ 0xFA 0x19 
# GOOD64: c.slli s3, 62 0xFE 0x19 # BAD32: invalid instruction encoding 0xFE 0x19 # GOOD64: c.slli s3, 63 +# GOOD: c.slli64 s4 +# NOHINTS: invalid instruction encoding +0x02 0x0A 0x06 0x0A # GOOD: c.slli s4, 1 0x0A 0x0A # GOOD: c.slli s4, 2 0x0E 0x0A # GOOD: c.slli s4, 3 @@ -2072,6 +2135,9 @@ 0x7A 0x1A # GOOD64: c.slli s4, 62 0x7E 0x1A # BAD32: invalid instruction encoding 0x7E 0x1A # GOOD64: c.slli s4, 63 +# GOOD: c.slli64 s5 +# NOHINTS: invalid instruction encoding +0x82 0x0A 0x86 0x0A # GOOD: c.slli s5, 1 0x8A 0x0A # GOOD: c.slli s5, 2 0x8E 0x0A # GOOD: c.slli s5, 3 @@ -2167,6 +2233,9 @@ 0xFA 0x1A # GOOD64: c.slli s5, 62 0xFE 0x1A # BAD32: invalid instruction encoding 0xFE 0x1A # GOOD64: c.slli s5, 63 +# GOOD: c.slli64 s6 +# NOHINTS: invalid instruction encoding +0x02 0x0B 0x06 0x0B # GOOD: c.slli s6, 1 0x0A 0x0B # GOOD: c.slli s6, 2 0x0E 0x0B # GOOD: c.slli s6, 3 @@ -2262,6 +2331,9 @@ 0x7A 0x1B # GOOD64: c.slli s6, 62 0x7E 0x1B # BAD32: invalid instruction encoding 0x7E 0x1B # GOOD64: c.slli s6, 63 +# GOOD: c.slli64 s7 +# NOHINTS: invalid instruction encoding +0x82 0x0B 0x86 0x0B # GOOD: c.slli s7, 1 0x8A 0x0B # GOOD: c.slli s7, 2 0x8E 0x0B # GOOD: c.slli s7, 3 @@ -2357,6 +2429,9 @@ 0xFA 0x1B # GOOD64: c.slli s7, 62 0xFE 0x1B # BAD32: invalid instruction encoding 0xFE 0x1B # GOOD64: c.slli s7, 63 +# GOOD: c.slli64 s8 +# NOHINTS: invalid instruction encoding +0x02 0x0C 0x06 0x0C # GOOD: c.slli s8, 1 0x0A 0x0C # GOOD: c.slli s8, 2 0x0E 0x0C # GOOD: c.slli s8, 3 @@ -2452,6 +2527,9 @@ 0x7A 0x1C # GOOD64: c.slli s8, 62 0x7E 0x1C # BAD32: invalid instruction encoding 0x7E 0x1C # GOOD64: c.slli s8, 63 +# GOOD: c.slli64 s9 +# NOHINTS: invalid instruction encoding +0x82 0x0C 0x86 0x0C # GOOD: c.slli s9, 1 0x8A 0x0C # GOOD: c.slli s9, 2 0x8E 0x0C # GOOD: c.slli s9, 3 @@ -2547,6 +2625,9 @@ 0xFA 0x1C # GOOD64: c.slli s9, 62 0xFE 0x1C # BAD32: invalid instruction encoding 0xFE 0x1C # GOOD64: c.slli s9, 63 +# GOOD: c.slli64 s10 +# NOHINTS: invalid instruction encoding +0x02 0x0D 0x06 0x0D # GOOD: c.slli s10, 1 0x0A 0x0D # GOOD: c.slli s10, 2 0x0E 0x0D # GOOD: c.slli s10, 3 @@ -2642,6 +2723,9 @@ 0x7A 0x1D # GOOD64: c.slli s10, 62 0x7E 0x1D # BAD32: invalid instruction encoding 0x7E 0x1D # GOOD64: c.slli s10, 63 +# GOOD: c.slli64 s11 +# NOHINTS: invalid instruction encoding +0x82 0x0D 0x86 0x0D # GOOD: c.slli s11, 1 0x8A 0x0D # GOOD: c.slli s11, 2 0x8E 0x0D # GOOD: c.slli s11, 3 @@ -2737,6 +2821,9 @@ 0xFA 0x1D # GOOD64: c.slli s11, 62 0xFE 0x1D # BAD32: invalid instruction encoding 0xFE 0x1D # GOOD64: c.slli s11, 63 +# GOOD: c.slli64 t3 +# NOHINTS: invalid instruction encoding +0x02 0x0E 0x06 0x0E # GOOD: c.slli t3, 1 0x0A 0x0E # GOOD: c.slli t3, 2 0x0E 0x0E # GOOD: c.slli t3, 3 @@ -2832,6 +2919,9 @@ 0x7A 0x1E # GOOD64: c.slli t3, 62 0x7E 0x1E # BAD32: invalid instruction encoding 0x7E 0x1E # GOOD64: c.slli t3, 63 +# GOOD: c.slli64 t4 +# NOHINTS: invalid instruction encoding +0x82 0x0E 0x86 0x0E # GOOD: c.slli t4, 1 0x8A 0x0E # GOOD: c.slli t4, 2 0x8E 0x0E # GOOD: c.slli t4, 3 @@ -2927,6 +3017,9 @@ 0xFA 0x1E # GOOD64: c.slli t4, 62 0xFE 0x1E # BAD32: invalid instruction encoding 0xFE 0x1E # GOOD64: c.slli t4, 63 +# GOOD: c.slli64 t5 +# NOHINTS: invalid instruction encoding +0x02 0x0F 0x06 0x0F # GOOD: c.slli t5, 1 0x0A 0x0F # GOOD: c.slli t5, 2 0x0E 0x0F # GOOD: c.slli t5, 3 @@ -3022,6 +3115,9 @@ 0x7A 0x1F # GOOD64: c.slli t5, 62 0x7E 0x1F # BAD32: invalid instruction encoding 0x7E 0x1F # GOOD64: c.slli t5, 63 +# GOOD: c.slli64 t6 +# NOHINTS: invalid instruction encoding +0x82 0x0F 0x86 0x0F # GOOD: 
c.slli t6, 1 0x8A 0x0F # GOOD: c.slli t6, 2 0x8E 0x0F # GOOD: c.slli t6, 3 From ea68b228816dfbe27f3e1ba1149116587758d56c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 31 Mar 2025 20:49:51 -0700 Subject: [PATCH 0184/1029] [RISCV] Prevent disassembling RVC hint instructions with x16-x31 for RVE. (#133805) We can't ignore the return value form the GPR decode function, as it contains the RVE check. --- .../RISCV/Disassembler/RISCVDisassembler.cpp | 38 ++++++++++++------- llvm/test/MC/RISCV/rve-invalid.s | 26 +++++++++---- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index cda34ac01d7c0..4e6d2b642c4ce 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -523,13 +523,13 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; uint32_t Rd = fieldFromInstruction(Insn, 7, 5); - [[maybe_unused]] DecodeStatus Result = - DecodeGPRNoX0RegisterClass(Inst, Rd, Address, Decoder); - assert(Result == MCDisassembler::Success && "Invalid register"); + if (!Check(S, DecodeGPRNoX0RegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; Inst.addOperand(Inst.getOperand(0)); Inst.addOperand(MCOperand::createImm(0)); - return MCDisassembler::Success; + return S; } static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, @@ -569,34 +569,44 @@ decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, static DecodeStatus decodeRVCInstrRdRs2(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; uint32_t Rd = fieldFromInstruction(Insn, 7, 5); uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5); - DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); - DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder); - return MCDisassembler::Success; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder))) + return MCDisassembler::Fail; + return S; } static DecodeStatus decodeRVCInstrRdRs1Rs2(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; uint32_t Rd = fieldFromInstruction(Insn, 7, 5); uint32_t Rs2 = fieldFromInstruction(Insn, 2, 5); - DecodeGPRRegisterClass(Inst, Rd, Address, Decoder); + if (!Check(S, DecodeGPRRegisterClass(Inst, Rd, Address, Decoder))) + return MCDisassembler::Fail; Inst.addOperand(Inst.getOperand(0)); - DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder); - return MCDisassembler::Success; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rs2, Address, Decoder))) + return MCDisassembler::Fail; + return S; } static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; uint32_t Rd1 = fieldFromInstruction(Insn, 7, 5); uint32_t Rs1 = fieldFromInstruction(Insn, 15, 5); uint32_t Rd2 = fieldFromInstruction(Insn, 20, 5); uint32_t UImm2 = fieldFromInstruction(Insn, 25, 2); - DecodeGPRRegisterClass(Inst, Rd1, Address, Decoder); - DecodeGPRRegisterClass(Inst, Rd2, Address, Decoder); - DecodeGPRRegisterClass(Inst, Rs1, Address, Decoder); 
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rd1, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rd2, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rs1, Address, Decoder))) + return MCDisassembler::Fail; [[maybe_unused]] DecodeStatus Result = decodeUImmOperand<2>(Inst, UImm2, Address, Decoder); assert(Result == MCDisassembler::Success && "Invalid immediate"); @@ -610,7 +620,7 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, else Inst.addOperand(MCOperand::createImm(4)); - return MCDisassembler::Success; + return S; } static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, diff --git a/llvm/test/MC/RISCV/rve-invalid.s b/llvm/test/MC/RISCV/rve-invalid.s index 95dc156f250a3..0b1e8961dc89d 100644 --- a/llvm/test/MC/RISCV/rve-invalid.s +++ b/llvm/test/MC/RISCV/rve-invalid.s @@ -1,16 +1,17 @@ -# RUN: not llvm-mc -triple riscv32 -mattr=+e < %s 2>&1 | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \ -# RUN: | llvm-objdump --mattr=+e -M no-aliases -d -r - \ +# RUN: not llvm-mc -triple riscv32 -mattr=+e,+zca < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zca < %s \ +# RUN: | llvm-objdump --mattr=+e,+zca -M no-aliases -d -r - \ # RUN: | FileCheck -check-prefix=CHECK-DIS %s -# RUN: not llvm-mc -triple riscv64 -mattr=+e < %s 2>&1 | FileCheck %s -# RUN: llvm-mc -filetype=obj -triple=riscv64 < %s \ -# RUN: | llvm-objdump --mattr=+e -M no-aliases -d -r - \ +# RUN: not llvm-mc -triple riscv64 -mattr=+e,+zca < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zca < %s \ +# RUN: | llvm-objdump --mattr=+e,+zca -M no-aliases -d -r - \ # RUN: | FileCheck -check-prefix=CHECK-DIS %s # Perform a simple check that registers x16-x31 (and the equivalent ABI names) # are rejected for RV32E/RV64E, when both assembling and disassembling. - +.option push +.option exact # CHECK-DIS: 00001837 # CHECK: :[[@LINE+1]]:5: error: invalid operand for instruction lui x16, 1 @@ -108,3 +109,14 @@ auipc t5, 31 # CHECK-DIS: 00020f97 # CHECK: :[[@LINE+1]]:7: error: invalid operand for instruction auipc t6, 32 +.option pop + +# CHECK-DIS: 0f81 +# CHECK: :[[@LINE+1]]:8: error: register must be a GPR excluding zero (x0) +c.addi x31, 0 +# CHECK-DIS: 9846 +# CHECK: :[[@LINE+1]]:7: error: register must be a GPR excluding zero (x0) +c.add x16, x17 +# CHECK-DIS: 8046 +# CHECK: :[[@LINE+1]]:10: error: register must be a GPR excluding zero (x0) +c.mv x0, x17 From 386aca4a3c9ed55c8fe2d9738dff4bcf57fb4f10 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 31 Mar 2025 20:54:19 -0700 Subject: [PATCH 0185/1029] [RISCV] Correct disassembly of cm.push/pop for RVE. (#133816) We shouldn't disassemble any encoding that refers to registers x16-x31 with RV32E. 
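
For reference, a minimal standalone sketch of the constraint (illustrative
only: the rlist-to-register mapping is paraphrased from the Zc spec, and the
literal values 4 and 7 are assumptions about the RISCVZC::RA and
RISCVZC::RA_S0_S2 enumerators, not values copied from the tree):

  #include <cstdint>
  #include <cstdio>

  // The Zcmp/Xqccmp {rlist} field names saved registers cumulatively:
  //   4 -> {ra}, 5 -> {ra, s0}, 6 -> {ra, s0-s1}, 7 -> {ra, s0-s2},
  //   ..., 15 -> {ra, s0-s11}.
  // From {ra, s0-s2} upward the list includes s2 (x18) or higher, and
  // RV32E/RV64E only provide x0-x15, so one extra comparison suffices.
  static bool isValidRlist(uint32_t Rlist, bool IsRVE) {
    const uint32_t RA = 4;       // assumed value of RISCVZC::RA
    const uint32_t RA_S0_S2 = 7; // assumed value of RISCVZC::RA_S0_S2
    if (Rlist < RA)
      return false;              // encodings 0-3 are reserved
    if (IsRVE && Rlist >= RA_S0_S2)
      return false;              // would name s2..s11 (x18..x27)
    return Rlist <= 15;          // rlist is a 4-bit field
  }

  int main() {
    for (uint32_t R = 0; R < 16; ++R)
      std::printf("rlist %2u: base %d rve %d\n", R,
                  isValidRlist(R, false), isValidRlist(R, true));
  }

The decoder changes below apply the analogous comparisons once they have
queried FeatureStdExtE from the subtarget.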
--- .../RISCV/Disassembler/RISCVDisassembler.cpp | 18 ++++++++++------ llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s | 21 +++++++++++++++++++ llvm/test/MC/RISCV/rv32e-zcmp-invalid.s | 18 ++++++++++++++++ llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s | 21 +++++++++++++++++++ llvm/test/MC/RISCV/rv64e-zcmp-invalid.s | 18 ++++++++++++++++ 5 files changed, 90 insertions(+), 6 deletions(-) create mode 100644 llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s create mode 100644 llvm/test/MC/RISCV/rv32e-zcmp-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s create mode 100644 llvm/test/MC/RISCV/rv64e-zcmp-invalid.s diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 4e6d2b642c4ce..fe1ab6523a68b 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -506,10 +506,12 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, const MCDisassembler *Decoder); static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, - uint64_t Address, const void *Decoder); + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeZcmpSpimm(MCInst &Inst, uint32_t Imm, uint64_t Address, const void *Decoder); @@ -624,16 +626,20 @@ static DecodeStatus decodeXTHeadMemPair(MCInst &Inst, uint32_t Insn, } static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, - uint64_t Address, const void *Decoder) { - if (Imm < RISCVZC::RA) + uint64_t Address, + const MCDisassembler *Decoder) { + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); + if (Imm < RISCVZC::RA || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(Imm)); return MCDisassembler::Success; } static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, - uint64_t Address, const void *Decoder) { - if (Imm < RISCVZC::RA_S0) + uint64_t Address, + const MCDisassembler *Decoder) { + bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); + if (Imm < RISCVZC::RA_S0 || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) return MCDisassembler::Fail; Inst.addOperand(MCOperand::createImm(Imm)); return MCDisassembler::Success; diff --git a/llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s b/llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s new file mode 100644 index 0000000000000..6c3ef3000e77e --- /dev/null +++ b/llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s @@ -0,0 +1,21 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+e,+experimental-xqccmp < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-xqccmp < %s \ +# RUN: | llvm-objdump --mattr=+e,+experimental-xqccmp -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefix=CHECK-DIS %s + +# Perform a simple check that registers x16-x31 (and the equivalent ABI names) +# are rejected for RV32E, when both assembling and disassembling. 
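+#
+# For example (encoding details inferred from the Zc/Xqccmp specs, not
+# stated in this patch): 0xb872 decodes as qc.cm.push with rlist=7, i.e.
+# {ra, s0-s2}; s2 is x18, which RV32E does not provide, so the decoder
+# rejects it and the CHECK-DIS lines match only the raw encoding bytes.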
+ + +# CHECK-DIS: b872 +# CHECK: :[[@LINE+1]]:19: error: invalid register +qc.cm.push {ra,s0-s2}, -16 +# CHECK-DIS: be72 +# CHECK: :[[@LINE+1]]:21: error: invalid register +qc.cm.popret {ra,s0-s2}, 16 +# CHECK-DIS: ba72 +# CHECK: :[[@LINE+1]]:21: error: register list must end with '}' +qc.cm.pop {x1, x8-x9, x18}, 16 +# CHECK-DIS: b972 +# CHECK: :[[@LINE+1]]:24: error: register list must end with '}' +qc.cm.pushfp {x1, x8-x9, x18}, -16 diff --git a/llvm/test/MC/RISCV/rv32e-zcmp-invalid.s b/llvm/test/MC/RISCV/rv32e-zcmp-invalid.s new file mode 100644 index 0000000000000..eaf6b350c2341 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32e-zcmp-invalid.s @@ -0,0 +1,18 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+e,+zcmp < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+zcmp < %s \ +# RUN: | llvm-objdump --mattr=+e,+zcmp -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefix=CHECK-DIS %s + +# Perform a simple check that registers x16-x31 (and the equivalent ABI names) +# are rejected for RV32E, when both assembling and disassembling. + + +# CHECK-DIS: b872 +# CHECK: :[[@LINE+1]]:16: error: invalid register +cm.push {ra,s0-s2}, -16 +# CHECK-DIS: be72 +# CHECK: :[[@LINE+1]]:18: error: invalid register +cm.popret {ra,s0-s2}, 16 +# CHECK-DIS: ba72 +# CHECK: :[[@LINE+1]]:18: error: register list must end with '}' +cm.pop {x1, x8-x9, x18}, 16 diff --git a/llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s b/llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s new file mode 100644 index 0000000000000..f34ce83448070 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s @@ -0,0 +1,21 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+e,+experimental-xqccmp < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-xqccmp < %s \ +# RUN: | llvm-objdump --mattr=+e,+experimental-xqccmp -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefix=CHECK-DIS %s + +# Perform a simple check that registers x16-x31 (and the equivalent ABI names) +# are rejected for RV64E, when both assembling and disassembling. + + +# CHECK-DIS: b872 +# CHECK: :[[@LINE+1]]:19: error: invalid register +qc.cm.push {ra,s0-s2}, -32 +# CHECK-DIS: be72 +# CHECK: :[[@LINE+1]]:21: error: invalid register +qc.cm.popret {ra,s0-s2}, 32 +# CHECK-DIS: ba72 +# CHECK: :[[@LINE+1]]:21: error: register list must end with '}' +qc.cm.pop {x1, x8-x9, x18}, 32 +# CHECK-DIS: b972 +# CHECK: :[[@LINE+1]]:24: error: register list must end with '}' +qc.cm.pushfp {x1, x8-x9, x18}, -32 diff --git a/llvm/test/MC/RISCV/rv64e-zcmp-invalid.s b/llvm/test/MC/RISCV/rv64e-zcmp-invalid.s new file mode 100644 index 0000000000000..e99721d96a17c --- /dev/null +++ b/llvm/test/MC/RISCV/rv64e-zcmp-invalid.s @@ -0,0 +1,18 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+e,+zcmp < %s 2>&1 | FileCheck %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+zcmp < %s \ +# RUN: | llvm-objdump --mattr=+e,+zcmp -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefix=CHECK-DIS %s + +# Perform a simple check that registers x16-x31 (and the equivalent ABI names) +# are rejected for RV64E, when both assembling and disassembling. 
+ + +# CHECK-DIS: b872 +# CHECK: :[[@LINE+1]]:16: error: invalid register +cm.push {ra,s0-s2}, -32 +# CHECK-DIS: be72 +# CHECK: :[[@LINE+1]]:18: error: invalid register +cm.popret {ra,s0-s2}, 32 +# CHECK-DIS: ba72 +# CHECK: :[[@LINE+1]]:18: error: register list must end with '}' +cm.pop {x1, x8-x9, x18}, 32 From b3c7d5951673cf45150f80744a89866c6646eb71 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 31 Mar 2025 21:03:26 -0700 Subject: [PATCH 0186/1029] [lld] Use DenseMap::insert_range (NFC) (#133845) --- lld/COFF/DebugTypes.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lld/COFF/DebugTypes.cpp b/lld/COFF/DebugTypes.cpp index d4d80bfd92efb..28e6993e3eeaa 100644 --- a/lld/COFF/DebugTypes.cpp +++ b/lld/COFF/DebugTypes.cpp @@ -1187,8 +1187,7 @@ void TypeMerger::mergeTypesWithGHash() { // Build a global map of from function ID to function type. for (TpiSource *source : ctx.tpiSourceList) { - for (auto idToType : source->funcIdToType) - funcIdToType.insert(idToType); + funcIdToType.insert_range(source->funcIdToType); source->funcIdToType.clear(); } From e3adf6bbfc72de043cffb3144079a9eb85e9ca40 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 31 Mar 2025 21:49:07 -0700 Subject: [PATCH 0187/1029] [RISCV] Use decodeCLUIImmOperand when disassembling C_LUI_HINT. (#133789) This correctly rejects imm==0 and prints 1048575 instead of -1. I've modified the test to only have each hex pattern once with different check lines before it. This ensures we don't have more invalid messages printed than we're checking for. --- .../RISCV/Disassembler/RISCVDisassembler.cpp | 29 +- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 4 +- .../MC/Disassembler/RISCV/c_lui_disasm.txt | 8080 ++++++++++++----- 3 files changed, 6047 insertions(+), 2066 deletions(-) diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index fe1ab6523a68b..5f268006c6fdd 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -484,9 +484,13 @@ static DecodeStatus decodeRVCInstrRdRs1ImmZero(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder); +static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn, + uint64_t Address, + const MCDisassembler *Decoder); + +static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, uint32_t Insn, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, @@ -544,18 +548,27 @@ static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, return MCDisassembler::Success; } -static DecodeStatus decodeRVCInstrRdSImm(MCInst &Inst, uint32_t Insn, - uint64_t Address, - const MCDisassembler *Decoder) { +static DecodeStatus decodeRVCInstrRdSImm6(MCInst &Inst, uint32_t Insn, + uint64_t Address, + const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createReg(RISCV::X0)); - uint32_t SImm6 = + uint32_t Imm = fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); [[maybe_unused]] DecodeStatus Result = - decodeSImmOperand<6>(Inst, SImm6, Address, Decoder); + decodeSImmOperand<6>(Inst, Imm, Address, Decoder); assert(Result == MCDisassembler::Success && "Invalid immediate"); return MCDisassembler::Success; } +static DecodeStatus decodeRVCInstrRdCLUIImm(MCInst &Inst, 
uint32_t Insn, + uint64_t Address, + const MCDisassembler *Decoder) { + Inst.addOperand(MCOperand::createReg(RISCV::X0)); + uint32_t Imm = + fieldFromInstruction(Insn, 12, 1) << 5 | fieldFromInstruction(Insn, 2, 5); + return decodeCLUIImmOperand(Inst, Imm, Address, Decoder); +} + static DecodeStatus decodeRVCInstrRdRs1UImmLog2XLenNonZero(MCInst &Inst, uint32_t Insn, uint64_t Address, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index eafd2844a691c..718d95aa1a4bc 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -629,7 +629,7 @@ def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm), "c.li", "$rd, $imm">, Sched<[WriteIALU]> { let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdSImm"; + let DecoderMethod = "decodeRVCInstrRdSImm6"; } def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd), @@ -637,7 +637,7 @@ def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd), "c.lui", "$rd, $imm">, Sched<[WriteIALU]> { let Inst{11-7} = 0; - let DecoderMethod = "decodeRVCInstrRdSImm"; + let DecoderMethod = "decodeRVCInstrRdCLUIImm"; } def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2), diff --git a/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt b/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt index e0002acd5ac6f..17889c15cbf95 100644 --- a/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt +++ b/llvm/test/MC/Disassembler/RISCV/c_lui_disasm.txt @@ -15,2059 +15,6027 @@ # RUN: -M no-aliases --show-encoding < %s 2>&1 | \ # RUN: FileCheck --check-prefix=NOHINTS %s -0x01 0x60 # BAD: invalid instruction encoding -0x01 0x60 # NOHINTS: invalid instruction encoding -0x05 0x60 # GOOD: c.lui zero, 1 -0x05 0x60 # NOHINTS: invalid instruction encoding -0x09 0x60 # GOOD: c.lui zero, 2 -0x09 0x60 # NOHINTS: invalid instruction encoding -0x0D 0x60 # GOOD: c.lui zero, 3 -0x0D 0x60 # NOHINTS: invalid instruction encoding -0x11 0x60 # GOOD: c.lui zero, 4 -0x11 0x60 # NOHINTS: invalid instruction encoding -0x15 0x60 # GOOD: c.lui zero, 5 -0x15 0x60 # NOHINTS: invalid instruction encoding -0x19 0x60 # GOOD: c.lui zero, 6 -0x19 0x60 # NOHINTS: invalid instruction encoding -0x1D 0x60 # GOOD: c.lui zero, 7 -0x1D 0x60 # NOHINTS: invalid instruction encoding -0x21 0x60 # GOOD: c.lui zero, 8 -0x21 0x60 # NOHINTS: invalid instruction encoding -0x25 0x60 # GOOD: c.lui zero, 9 -0x25 0x60 # NOHINTS: invalid instruction encoding -0x29 0x60 # GOOD: c.lui zero, 10 -0x29 0x60 # NOHINTS: invalid instruction encoding -0x2D 0x60 # GOOD: c.lui zero, 11 -0x2D 0x60 # NOHINTS: invalid instruction encoding -0x31 0x60 # GOOD: c.lui zero, 12 -0x31 0x60 # NOHINTS: invalid instruction encoding -0x35 0x60 # GOOD: c.lui zero, 13 -0x35 0x60 # NOHINTS: invalid instruction encoding -0x39 0x60 # GOOD: c.lui zero, 14 -0x39 0x60 # NOHINTS: invalid instruction encoding -0x3D 0x60 # GOOD: c.lui zero, 15 -0x3D 0x60 # NOHINTS: invalid instruction encoding -0x41 0x60 # GOOD: c.lui zero, 16 -0x41 0x60 # NOHINTS: invalid instruction encoding -0x45 0x60 # GOOD: c.lui zero, 17 -0x45 0x60 # NOHINTS: invalid instruction encoding -0x49 0x60 # GOOD: c.lui zero, 18 -0x49 0x60 # NOHINTS: invalid instruction encoding -0x4D 0x60 # GOOD: c.lui zero, 19 -0x4D 0x60 # NOHINTS: invalid instruction encoding -0x51 0x60 # GOOD: c.lui zero, 20 -0x51 0x60 # NOHINTS: invalid instruction encoding -0x55 0x60 # GOOD: c.lui zero, 21 -0x55 0x60 # NOHINTS: invalid instruction encoding -0x59 0x60 # GOOD: c.lui zero, 22 -0x59 0x60 
# NOHINTS: invalid instruction encoding -0x5D 0x60 # GOOD: c.lui zero, 23 -0x5D 0x60 # NOHINTS: invalid instruction encoding -0x61 0x60 # GOOD: c.lui zero, 24 -0x61 0x60 # NOHINTS: invalid instruction encoding -0x65 0x60 # GOOD: c.lui zero, 25 -0x65 0x60 # NOHINTS: invalid instruction encoding -0x69 0x60 # GOOD: c.lui zero, 26 -0x69 0x60 # NOHINTS: invalid instruction encoding -0x6D 0x60 # GOOD: c.lui zero, 27 -0x6D 0x60 # NOHINTS: invalid instruction encoding -0x71 0x60 # GOOD: c.lui zero, 28 -0x71 0x60 # NOHINTS: invalid instruction encoding -0x75 0x60 # GOOD: c.lui zero, 29 -0x75 0x60 # NOHINTS: invalid instruction encoding -0x79 0x60 # GOOD: c.lui zero, 30 -0x79 0x60 # NOHINTS: invalid instruction encoding -0x7D 0x60 # GOOD: c.lui zero, 31 -0x7D 0x60 # NOHINTS: invalid instruction encoding -0x01 0x70 # GOOD: c.lui zero, -32 -0x01 0x70 # NOHINTS: invalid instruction encoding -0x05 0x70 # GOOD: c.lui zero, -31 -0x05 0x70 # NOHINTS: invalid instruction encoding -0x09 0x70 # GOOD: c.lui zero, -30 -0x09 0x70 # NOHINTS: invalid instruction encoding -0x0D 0x70 # GOOD: c.lui zero, -29 -0x0D 0x70 # NOHINTS: invalid instruction encoding -0x11 0x70 # GOOD: c.lui zero, -28 -0x11 0x70 # NOHINTS: invalid instruction encoding -0x15 0x70 # GOOD: c.lui zero, -27 -0x15 0x70 # NOHINTS: invalid instruction encoding -0x19 0x70 # GOOD: c.lui zero, -26 -0x19 0x70 # NOHINTS: invalid instruction encoding -0x1D 0x70 # GOOD: c.lui zero, -25 -0x1D 0x70 # NOHINTS: invalid instruction encoding -0x21 0x70 # GOOD: c.lui zero, -24 -0x21 0x70 # NOHINTS: invalid instruction encoding -0x25 0x70 # GOOD: c.lui zero, -23 -0x25 0x70 # NOHINTS: invalid instruction encoding -0x29 0x70 # GOOD: c.lui zero, -22 -0x29 0x70 # NOHINTS: invalid instruction encoding -0x2D 0x70 # GOOD: c.lui zero, -21 -0x2D 0x70 # NOHINTS: invalid instruction encoding -0x31 0x70 # GOOD: c.lui zero, -20 -0x31 0x70 # NOHINTS: invalid instruction encoding -0x35 0x70 # GOOD: c.lui zero, -19 -0x35 0x70 # NOHINTS: invalid instruction encoding -0x39 0x70 # GOOD: c.lui zero, -18 -0x39 0x70 # NOHINTS: invalid instruction encoding -0x3D 0x70 # GOOD: c.lui zero, -17 -0x3D 0x70 # NOHINTS: invalid instruction encoding -0x41 0x70 # GOOD: c.lui zero, -16 -0x41 0x70 # NOHINTS: invalid instruction encoding -0x45 0x70 # GOOD: c.lui zero, -15 -0x45 0x70 # NOHINTS: invalid instruction encoding -0x49 0x70 # GOOD: c.lui zero, -14 -0x49 0x70 # NOHINTS: invalid instruction encoding -0x4D 0x70 # GOOD: c.lui zero, -13 -0x4D 0x70 # NOHINTS: invalid instruction encoding -0x51 0x70 # GOOD: c.lui zero, -12 -0x51 0x70 # NOHINTS: invalid instruction encoding -0x55 0x70 # GOOD: c.lui zero, -11 -0x55 0x70 # NOHINTS: invalid instruction encoding -0x59 0x70 # GOOD: c.lui zero, -10 -0x59 0x70 # NOHINTS: invalid instruction encoding -0x5D 0x70 # GOOD: c.lui zero, -9 -0x5D 0x70 # NOHINTS: invalid instruction encoding -0x61 0x70 # GOOD: c.lui zero, -8 -0x61 0x70 # NOHINTS: invalid instruction encoding -0x65 0x70 # GOOD: c.lui zero, -7 -0x65 0x70 # NOHINTS: invalid instruction encoding -0x69 0x70 # GOOD: c.lui zero, -6 -0x69 0x70 # NOHINTS: invalid instruction encoding -0x6D 0x70 # GOOD: c.lui zero, -5 -0x6D 0x70 # NOHINTS: invalid instruction encoding -0x71 0x70 # GOOD: c.lui zero, -4 -0x71 0x70 # NOHINTS: invalid instruction encoding -0x75 0x70 # GOOD: c.lui zero, -3 -0x75 0x70 # NOHINTS: invalid instruction encoding -0x79 0x70 # GOOD: c.lui zero, -2 -0x79 0x70 # NOHINTS: invalid instruction encoding -0x7D 0x70 # GOOD: c.lui zero, -1 -0x7D 0x70 # NOHINTS: invalid instruction encoding -0x81 
0x60 # BAD: invalid instruction encoding -0x81 0x60 # MOP: c.mop.1 -0x85 0x60 # GOOD: c.lui ra, 1 -0x89 0x60 # GOOD: c.lui ra, 2 -0x8D 0x60 # GOOD: c.lui ra, 3 -0x91 0x60 # GOOD: c.lui ra, 4 -0x95 0x60 # GOOD: c.lui ra, 5 -0x99 0x60 # GOOD: c.lui ra, 6 -0x9D 0x60 # GOOD: c.lui ra, 7 -0xA1 0x60 # GOOD: c.lui ra, 8 -0xA5 0x60 # GOOD: c.lui ra, 9 -0xA9 0x60 # GOOD: c.lui ra, 10 -0xAD 0x60 # GOOD: c.lui ra, 11 -0xB1 0x60 # GOOD: c.lui ra, 12 -0xB5 0x60 # GOOD: c.lui ra, 13 -0xB9 0x60 # GOOD: c.lui ra, 14 -0xBD 0x60 # GOOD: c.lui ra, 15 -0xC1 0x60 # GOOD: c.lui ra, 16 -0xC5 0x60 # GOOD: c.lui ra, 17 -0xC9 0x60 # GOOD: c.lui ra, 18 -0xCD 0x60 # GOOD: c.lui ra, 19 -0xD1 0x60 # GOOD: c.lui ra, 20 -0xD5 0x60 # GOOD: c.lui ra, 21 -0xD9 0x60 # GOOD: c.lui ra, 22 -0xDD 0x60 # GOOD: c.lui ra, 23 -0xE1 0x60 # GOOD: c.lui ra, 24 -0xE5 0x60 # GOOD: c.lui ra, 25 -0xE9 0x60 # GOOD: c.lui ra, 26 -0xED 0x60 # GOOD: c.lui ra, 27 -0xF1 0x60 # GOOD: c.lui ra, 28 -0xF5 0x60 # GOOD: c.lui ra, 29 -0xF9 0x60 # GOOD: c.lui ra, 30 -0xFD 0x60 # GOOD: c.lui ra, 31 -0x81 0x70 # GOOD: c.lui ra, 1048544 -0x85 0x70 # GOOD: c.lui ra, 1048545 -0x89 0x70 # GOOD: c.lui ra, 1048546 -0x8D 0x70 # GOOD: c.lui ra, 1048547 -0x91 0x70 # GOOD: c.lui ra, 1048548 -0x95 0x70 # GOOD: c.lui ra, 1048549 -0x99 0x70 # GOOD: c.lui ra, 1048550 -0x9D 0x70 # GOOD: c.lui ra, 1048551 -0xA1 0x70 # GOOD: c.lui ra, 1048552 -0xA5 0x70 # GOOD: c.lui ra, 1048553 -0xA9 0x70 # GOOD: c.lui ra, 1048554 -0xAD 0x70 # GOOD: c.lui ra, 1048555 -0xB1 0x70 # GOOD: c.lui ra, 1048556 -0xB5 0x70 # GOOD: c.lui ra, 1048557 -0xB9 0x70 # GOOD: c.lui ra, 1048558 -0xBD 0x70 # GOOD: c.lui ra, 1048559 -0xC1 0x70 # GOOD: c.lui ra, 1048560 -0xC5 0x70 # GOOD: c.lui ra, 1048561 -0xC9 0x70 # GOOD: c.lui ra, 1048562 -0xCD 0x70 # GOOD: c.lui ra, 1048563 -0xD1 0x70 # GOOD: c.lui ra, 1048564 -0xD5 0x70 # GOOD: c.lui ra, 1048565 -0xD9 0x70 # GOOD: c.lui ra, 1048566 -0xDD 0x70 # GOOD: c.lui ra, 1048567 -0xE1 0x70 # GOOD: c.lui ra, 1048568 -0xE5 0x70 # GOOD: c.lui ra, 1048569 -0xE9 0x70 # GOOD: c.lui ra, 1048570 -0xED 0x70 # GOOD: c.lui ra, 1048571 -0xF1 0x70 # GOOD: c.lui ra, 1048572 -0xF5 0x70 # GOOD: c.lui ra, 1048573 -0xF9 0x70 # GOOD: c.lui ra, 1048574 -0xFD 0x70 # GOOD: c.lui ra, 1048575 -0x81 0x61 # BAD: invalid instruction encoding -0x81 0x61 # MOP: c.mop.3 -0x85 0x61 # GOOD: c.lui gp, 1 -0x89 0x61 # GOOD: c.lui gp, 2 -0x8D 0x61 # GOOD: c.lui gp, 3 -0x91 0x61 # GOOD: c.lui gp, 4 -0x95 0x61 # GOOD: c.lui gp, 5 -0x99 0x61 # GOOD: c.lui gp, 6 -0x9D 0x61 # GOOD: c.lui gp, 7 -0xA1 0x61 # GOOD: c.lui gp, 8 -0xA5 0x61 # GOOD: c.lui gp, 9 -0xA9 0x61 # GOOD: c.lui gp, 10 -0xAD 0x61 # GOOD: c.lui gp, 11 -0xB1 0x61 # GOOD: c.lui gp, 12 -0xB5 0x61 # GOOD: c.lui gp, 13 -0xB9 0x61 # GOOD: c.lui gp, 14 -0xBD 0x61 # GOOD: c.lui gp, 15 -0xC1 0x61 # GOOD: c.lui gp, 16 -0xC5 0x61 # GOOD: c.lui gp, 17 -0xC9 0x61 # GOOD: c.lui gp, 18 -0xCD 0x61 # GOOD: c.lui gp, 19 -0xD1 0x61 # GOOD: c.lui gp, 20 -0xD5 0x61 # GOOD: c.lui gp, 21 -0xD9 0x61 # GOOD: c.lui gp, 22 -0xDD 0x61 # GOOD: c.lui gp, 23 -0xE1 0x61 # GOOD: c.lui gp, 24 -0xE5 0x61 # GOOD: c.lui gp, 25 -0xE9 0x61 # GOOD: c.lui gp, 26 -0xED 0x61 # GOOD: c.lui gp, 27 -0xF1 0x61 # GOOD: c.lui gp, 28 -0xF5 0x61 # GOOD: c.lui gp, 29 -0xF9 0x61 # GOOD: c.lui gp, 30 -0xFD 0x61 # GOOD: c.lui gp, 31 -0x81 0x71 # GOOD: c.lui gp, 1048544 -0x85 0x71 # GOOD: c.lui gp, 1048545 -0x89 0x71 # GOOD: c.lui gp, 1048546 -0x8D 0x71 # GOOD: c.lui gp, 1048547 -0x91 0x71 # GOOD: c.lui gp, 1048548 -0x95 0x71 # GOOD: c.lui gp, 1048549 -0x99 0x71 # GOOD: c.lui gp, 1048550 -0x9D 
0x71 # GOOD: c.lui gp, 1048551 -0xA1 0x71 # GOOD: c.lui gp, 1048552 -0xA5 0x71 # GOOD: c.lui gp, 1048553 -0xA9 0x71 # GOOD: c.lui gp, 1048554 -0xAD 0x71 # GOOD: c.lui gp, 1048555 -0xB1 0x71 # GOOD: c.lui gp, 1048556 -0xB5 0x71 # GOOD: c.lui gp, 1048557 -0xB9 0x71 # GOOD: c.lui gp, 1048558 -0xBD 0x71 # GOOD: c.lui gp, 1048559 -0xC1 0x71 # GOOD: c.lui gp, 1048560 -0xC5 0x71 # GOOD: c.lui gp, 1048561 -0xC9 0x71 # GOOD: c.lui gp, 1048562 -0xCD 0x71 # GOOD: c.lui gp, 1048563 -0xD1 0x71 # GOOD: c.lui gp, 1048564 -0xD5 0x71 # GOOD: c.lui gp, 1048565 -0xD9 0x71 # GOOD: c.lui gp, 1048566 -0xDD 0x71 # GOOD: c.lui gp, 1048567 -0xE1 0x71 # GOOD: c.lui gp, 1048568 -0xE5 0x71 # GOOD: c.lui gp, 1048569 -0xE9 0x71 # GOOD: c.lui gp, 1048570 -0xED 0x71 # GOOD: c.lui gp, 1048571 -0xF1 0x71 # GOOD: c.lui gp, 1048572 -0xF5 0x71 # GOOD: c.lui gp, 1048573 -0xF9 0x71 # GOOD: c.lui gp, 1048574 -0xFD 0x71 # GOOD: c.lui gp, 1048575 -0x01 0x62 # BAD: invalid instruction encoding -0x05 0x62 # GOOD: c.lui tp, 1 -0x09 0x62 # GOOD: c.lui tp, 2 -0x0D 0x62 # GOOD: c.lui tp, 3 -0x11 0x62 # GOOD: c.lui tp, 4 -0x15 0x62 # GOOD: c.lui tp, 5 -0x19 0x62 # GOOD: c.lui tp, 6 -0x1D 0x62 # GOOD: c.lui tp, 7 -0x21 0x62 # GOOD: c.lui tp, 8 -0x25 0x62 # GOOD: c.lui tp, 9 -0x29 0x62 # GOOD: c.lui tp, 10 -0x2D 0x62 # GOOD: c.lui tp, 11 -0x31 0x62 # GOOD: c.lui tp, 12 -0x35 0x62 # GOOD: c.lui tp, 13 -0x39 0x62 # GOOD: c.lui tp, 14 -0x3D 0x62 # GOOD: c.lui tp, 15 -0x41 0x62 # GOOD: c.lui tp, 16 -0x45 0x62 # GOOD: c.lui tp, 17 -0x49 0x62 # GOOD: c.lui tp, 18 -0x4D 0x62 # GOOD: c.lui tp, 19 -0x51 0x62 # GOOD: c.lui tp, 20 -0x55 0x62 # GOOD: c.lui tp, 21 -0x59 0x62 # GOOD: c.lui tp, 22 -0x5D 0x62 # GOOD: c.lui tp, 23 -0x61 0x62 # GOOD: c.lui tp, 24 -0x65 0x62 # GOOD: c.lui tp, 25 -0x69 0x62 # GOOD: c.lui tp, 26 -0x6D 0x62 # GOOD: c.lui tp, 27 -0x71 0x62 # GOOD: c.lui tp, 28 -0x75 0x62 # GOOD: c.lui tp, 29 -0x79 0x62 # GOOD: c.lui tp, 30 -0x7D 0x62 # GOOD: c.lui tp, 31 -0x01 0x72 # GOOD: c.lui tp, 1048544 -0x05 0x72 # GOOD: c.lui tp, 1048545 -0x09 0x72 # GOOD: c.lui tp, 1048546 -0x0D 0x72 # GOOD: c.lui tp, 1048547 -0x11 0x72 # GOOD: c.lui tp, 1048548 -0x15 0x72 # GOOD: c.lui tp, 1048549 -0x19 0x72 # GOOD: c.lui tp, 1048550 -0x1D 0x72 # GOOD: c.lui tp, 1048551 -0x21 0x72 # GOOD: c.lui tp, 1048552 -0x25 0x72 # GOOD: c.lui tp, 1048553 -0x29 0x72 # GOOD: c.lui tp, 1048554 -0x2D 0x72 # GOOD: c.lui tp, 1048555 -0x31 0x72 # GOOD: c.lui tp, 1048556 -0x35 0x72 # GOOD: c.lui tp, 1048557 -0x39 0x72 # GOOD: c.lui tp, 1048558 -0x3D 0x72 # GOOD: c.lui tp, 1048559 -0x41 0x72 # GOOD: c.lui tp, 1048560 -0x45 0x72 # GOOD: c.lui tp, 1048561 -0x49 0x72 # GOOD: c.lui tp, 1048562 -0x4D 0x72 # GOOD: c.lui tp, 1048563 -0x51 0x72 # GOOD: c.lui tp, 1048564 -0x55 0x72 # GOOD: c.lui tp, 1048565 -0x59 0x72 # GOOD: c.lui tp, 1048566 -0x5D 0x72 # GOOD: c.lui tp, 1048567 -0x61 0x72 # GOOD: c.lui tp, 1048568 -0x65 0x72 # GOOD: c.lui tp, 1048569 -0x69 0x72 # GOOD: c.lui tp, 1048570 -0x6D 0x72 # GOOD: c.lui tp, 1048571 -0x71 0x72 # GOOD: c.lui tp, 1048572 -0x75 0x72 # GOOD: c.lui tp, 1048573 -0x79 0x72 # GOOD: c.lui tp, 1048574 -0x7D 0x72 # GOOD: c.lui tp, 1048575 -0x81 0x62 # BAD: invalid instruction encoding -0x81 0x62 # MOP: c.mop.5 -0x85 0x62 # GOOD: c.lui t0, 1 -0x89 0x62 # GOOD: c.lui t0, 2 -0x8D 0x62 # GOOD: c.lui t0, 3 -0x91 0x62 # GOOD: c.lui t0, 4 -0x95 0x62 # GOOD: c.lui t0, 5 -0x99 0x62 # GOOD: c.lui t0, 6 -0x9D 0x62 # GOOD: c.lui t0, 7 -0xA1 0x62 # GOOD: c.lui t0, 8 -0xA5 0x62 # GOOD: c.lui t0, 9 -0xA9 0x62 # GOOD: c.lui t0, 10 -0xAD 0x62 # GOOD: c.lui t0, 11 -0xB1 
0x62 # GOOD: c.lui t0, 12 -0xB5 0x62 # GOOD: c.lui t0, 13 -0xB9 0x62 # GOOD: c.lui t0, 14 -0xBD 0x62 # GOOD: c.lui t0, 15 -0xC1 0x62 # GOOD: c.lui t0, 16 -0xC5 0x62 # GOOD: c.lui t0, 17 -0xC9 0x62 # GOOD: c.lui t0, 18 -0xCD 0x62 # GOOD: c.lui t0, 19 -0xD1 0x62 # GOOD: c.lui t0, 20 -0xD5 0x62 # GOOD: c.lui t0, 21 -0xD9 0x62 # GOOD: c.lui t0, 22 -0xDD 0x62 # GOOD: c.lui t0, 23 -0xE1 0x62 # GOOD: c.lui t0, 24 -0xE5 0x62 # GOOD: c.lui t0, 25 -0xE9 0x62 # GOOD: c.lui t0, 26 -0xED 0x62 # GOOD: c.lui t0, 27 -0xF1 0x62 # GOOD: c.lui t0, 28 -0xF5 0x62 # GOOD: c.lui t0, 29 -0xF9 0x62 # GOOD: c.lui t0, 30 -0xFD 0x62 # GOOD: c.lui t0, 31 -0x81 0x72 # GOOD: c.lui t0, 1048544 -0x85 0x72 # GOOD: c.lui t0, 1048545 -0x89 0x72 # GOOD: c.lui t0, 1048546 -0x8D 0x72 # GOOD: c.lui t0, 1048547 -0x91 0x72 # GOOD: c.lui t0, 1048548 -0x95 0x72 # GOOD: c.lui t0, 1048549 -0x99 0x72 # GOOD: c.lui t0, 1048550 -0x9D 0x72 # GOOD: c.lui t0, 1048551 -0xA1 0x72 # GOOD: c.lui t0, 1048552 -0xA5 0x72 # GOOD: c.lui t0, 1048553 -0xA9 0x72 # GOOD: c.lui t0, 1048554 -0xAD 0x72 # GOOD: c.lui t0, 1048555 -0xB1 0x72 # GOOD: c.lui t0, 1048556 -0xB5 0x72 # GOOD: c.lui t0, 1048557 -0xB9 0x72 # GOOD: c.lui t0, 1048558 -0xBD 0x72 # GOOD: c.lui t0, 1048559 -0xC1 0x72 # GOOD: c.lui t0, 1048560 -0xC5 0x72 # GOOD: c.lui t0, 1048561 -0xC9 0x72 # GOOD: c.lui t0, 1048562 -0xCD 0x72 # GOOD: c.lui t0, 1048563 -0xD1 0x72 # GOOD: c.lui t0, 1048564 -0xD5 0x72 # GOOD: c.lui t0, 1048565 -0xD9 0x72 # GOOD: c.lui t0, 1048566 -0xDD 0x72 # GOOD: c.lui t0, 1048567 -0xE1 0x72 # GOOD: c.lui t0, 1048568 -0xE5 0x72 # GOOD: c.lui t0, 1048569 -0xE9 0x72 # GOOD: c.lui t0, 1048570 -0xED 0x72 # GOOD: c.lui t0, 1048571 -0xF1 0x72 # GOOD: c.lui t0, 1048572 -0xF5 0x72 # GOOD: c.lui t0, 1048573 -0xF9 0x72 # GOOD: c.lui t0, 1048574 -0xFD 0x72 # GOOD: c.lui t0, 1048575 -0x01 0x63 # BAD: invalid instruction encoding -0x05 0x63 # GOOD: c.lui t1, 1 -0x09 0x63 # GOOD: c.lui t1, 2 -0x0D 0x63 # GOOD: c.lui t1, 3 -0x11 0x63 # GOOD: c.lui t1, 4 -0x15 0x63 # GOOD: c.lui t1, 5 -0x19 0x63 # GOOD: c.lui t1, 6 -0x1D 0x63 # GOOD: c.lui t1, 7 -0x21 0x63 # GOOD: c.lui t1, 8 -0x25 0x63 # GOOD: c.lui t1, 9 -0x29 0x63 # GOOD: c.lui t1, 10 -0x2D 0x63 # GOOD: c.lui t1, 11 -0x31 0x63 # GOOD: c.lui t1, 12 -0x35 0x63 # GOOD: c.lui t1, 13 -0x39 0x63 # GOOD: c.lui t1, 14 -0x3D 0x63 # GOOD: c.lui t1, 15 -0x41 0x63 # GOOD: c.lui t1, 16 -0x45 0x63 # GOOD: c.lui t1, 17 -0x49 0x63 # GOOD: c.lui t1, 18 -0x4D 0x63 # GOOD: c.lui t1, 19 -0x51 0x63 # GOOD: c.lui t1, 20 -0x55 0x63 # GOOD: c.lui t1, 21 -0x59 0x63 # GOOD: c.lui t1, 22 -0x5D 0x63 # GOOD: c.lui t1, 23 -0x61 0x63 # GOOD: c.lui t1, 24 -0x65 0x63 # GOOD: c.lui t1, 25 -0x69 0x63 # GOOD: c.lui t1, 26 -0x6D 0x63 # GOOD: c.lui t1, 27 -0x71 0x63 # GOOD: c.lui t1, 28 -0x75 0x63 # GOOD: c.lui t1, 29 -0x79 0x63 # GOOD: c.lui t1, 30 -0x7D 0x63 # GOOD: c.lui t1, 31 -0x01 0x73 # GOOD: c.lui t1, 1048544 -0x05 0x73 # GOOD: c.lui t1, 1048545 -0x09 0x73 # GOOD: c.lui t1, 1048546 -0x0D 0x73 # GOOD: c.lui t1, 1048547 -0x11 0x73 # GOOD: c.lui t1, 1048548 -0x15 0x73 # GOOD: c.lui t1, 1048549 -0x19 0x73 # GOOD: c.lui t1, 1048550 -0x1D 0x73 # GOOD: c.lui t1, 1048551 -0x21 0x73 # GOOD: c.lui t1, 1048552 -0x25 0x73 # GOOD: c.lui t1, 1048553 -0x29 0x73 # GOOD: c.lui t1, 1048554 -0x2D 0x73 # GOOD: c.lui t1, 1048555 -0x31 0x73 # GOOD: c.lui t1, 1048556 -0x35 0x73 # GOOD: c.lui t1, 1048557 -0x39 0x73 # GOOD: c.lui t1, 1048558 -0x3D 0x73 # GOOD: c.lui t1, 1048559 -0x41 0x73 # GOOD: c.lui t1, 1048560 -0x45 0x73 # GOOD: c.lui t1, 1048561 -0x49 0x73 # GOOD: c.lui t1, 1048562 
-0x4D 0x73 # GOOD: c.lui t1, 1048563 -0x51 0x73 # GOOD: c.lui t1, 1048564 -0x55 0x73 # GOOD: c.lui t1, 1048565 -0x59 0x73 # GOOD: c.lui t1, 1048566 -0x5D 0x73 # GOOD: c.lui t1, 1048567 -0x61 0x73 # GOOD: c.lui t1, 1048568 -0x65 0x73 # GOOD: c.lui t1, 1048569 -0x69 0x73 # GOOD: c.lui t1, 1048570 -0x6D 0x73 # GOOD: c.lui t1, 1048571 -0x71 0x73 # GOOD: c.lui t1, 1048572 -0x75 0x73 # GOOD: c.lui t1, 1048573 -0x79 0x73 # GOOD: c.lui t1, 1048574 -0x7D 0x73 # GOOD: c.lui t1, 1048575 -0x81 0x63 # BAD: invalid instruction encoding -0x81 0x63 # MOP: c.mop.7 -0x85 0x63 # GOOD: c.lui t2, 1 -0x89 0x63 # GOOD: c.lui t2, 2 -0x8D 0x63 # GOOD: c.lui t2, 3 -0x91 0x63 # GOOD: c.lui t2, 4 -0x95 0x63 # GOOD: c.lui t2, 5 -0x99 0x63 # GOOD: c.lui t2, 6 -0x9D 0x63 # GOOD: c.lui t2, 7 -0xA1 0x63 # GOOD: c.lui t2, 8 -0xA5 0x63 # GOOD: c.lui t2, 9 -0xA9 0x63 # GOOD: c.lui t2, 10 -0xAD 0x63 # GOOD: c.lui t2, 11 -0xB1 0x63 # GOOD: c.lui t2, 12 -0xB5 0x63 # GOOD: c.lui t2, 13 -0xB9 0x63 # GOOD: c.lui t2, 14 -0xBD 0x63 # GOOD: c.lui t2, 15 -0xC1 0x63 # GOOD: c.lui t2, 16 -0xC5 0x63 # GOOD: c.lui t2, 17 -0xC9 0x63 # GOOD: c.lui t2, 18 -0xCD 0x63 # GOOD: c.lui t2, 19 -0xD1 0x63 # GOOD: c.lui t2, 20 -0xD5 0x63 # GOOD: c.lui t2, 21 -0xD9 0x63 # GOOD: c.lui t2, 22 -0xDD 0x63 # GOOD: c.lui t2, 23 -0xE1 0x63 # GOOD: c.lui t2, 24 -0xE5 0x63 # GOOD: c.lui t2, 25 -0xE9 0x63 # GOOD: c.lui t2, 26 -0xED 0x63 # GOOD: c.lui t2, 27 -0xF1 0x63 # GOOD: c.lui t2, 28 -0xF5 0x63 # GOOD: c.lui t2, 29 -0xF9 0x63 # GOOD: c.lui t2, 30 -0xFD 0x63 # GOOD: c.lui t2, 31 -0x81 0x73 # GOOD: c.lui t2, 1048544 -0x85 0x73 # GOOD: c.lui t2, 1048545 -0x89 0x73 # GOOD: c.lui t2, 1048546 -0x8D 0x73 # GOOD: c.lui t2, 1048547 -0x91 0x73 # GOOD: c.lui t2, 1048548 -0x95 0x73 # GOOD: c.lui t2, 1048549 -0x99 0x73 # GOOD: c.lui t2, 1048550 -0x9D 0x73 # GOOD: c.lui t2, 1048551 -0xA1 0x73 # GOOD: c.lui t2, 1048552 -0xA5 0x73 # GOOD: c.lui t2, 1048553 -0xA9 0x73 # GOOD: c.lui t2, 1048554 -0xAD 0x73 # GOOD: c.lui t2, 1048555 -0xB1 0x73 # GOOD: c.lui t2, 1048556 -0xB5 0x73 # GOOD: c.lui t2, 1048557 -0xB9 0x73 # GOOD: c.lui t2, 1048558 -0xBD 0x73 # GOOD: c.lui t2, 1048559 -0xC1 0x73 # GOOD: c.lui t2, 1048560 -0xC5 0x73 # GOOD: c.lui t2, 1048561 -0xC9 0x73 # GOOD: c.lui t2, 1048562 -0xCD 0x73 # GOOD: c.lui t2, 1048563 -0xD1 0x73 # GOOD: c.lui t2, 1048564 -0xD5 0x73 # GOOD: c.lui t2, 1048565 -0xD9 0x73 # GOOD: c.lui t2, 1048566 -0xDD 0x73 # GOOD: c.lui t2, 1048567 -0xE1 0x73 # GOOD: c.lui t2, 1048568 -0xE5 0x73 # GOOD: c.lui t2, 1048569 -0xE9 0x73 # GOOD: c.lui t2, 1048570 -0xED 0x73 # GOOD: c.lui t2, 1048571 -0xF1 0x73 # GOOD: c.lui t2, 1048572 -0xF5 0x73 # GOOD: c.lui t2, 1048573 -0xF9 0x73 # GOOD: c.lui t2, 1048574 -0xFD 0x73 # GOOD: c.lui t2, 1048575 -0x01 0x64 # BAD: invalid instruction encoding -0x05 0x64 # GOOD: c.lui s0, 1 -0x09 0x64 # GOOD: c.lui s0, 2 -0x0D 0x64 # GOOD: c.lui s0, 3 -0x11 0x64 # GOOD: c.lui s0, 4 -0x15 0x64 # GOOD: c.lui s0, 5 -0x19 0x64 # GOOD: c.lui s0, 6 -0x1D 0x64 # GOOD: c.lui s0, 7 -0x21 0x64 # GOOD: c.lui s0, 8 -0x25 0x64 # GOOD: c.lui s0, 9 -0x29 0x64 # GOOD: c.lui s0, 10 -0x2D 0x64 # GOOD: c.lui s0, 11 -0x31 0x64 # GOOD: c.lui s0, 12 -0x35 0x64 # GOOD: c.lui s0, 13 -0x39 0x64 # GOOD: c.lui s0, 14 -0x3D 0x64 # GOOD: c.lui s0, 15 -0x41 0x64 # GOOD: c.lui s0, 16 -0x45 0x64 # GOOD: c.lui s0, 17 -0x49 0x64 # GOOD: c.lui s0, 18 -0x4D 0x64 # GOOD: c.lui s0, 19 -0x51 0x64 # GOOD: c.lui s0, 20 -0x55 0x64 # GOOD: c.lui s0, 21 -0x59 0x64 # GOOD: c.lui s0, 22 -0x5D 0x64 # GOOD: c.lui s0, 23 -0x61 0x64 # GOOD: c.lui s0, 24 -0x65 0x64 # GOOD: c.lui 
-0x69 0x64 # GOOD: c.lui s0, 26
-0x6D 0x64 # GOOD: c.lui s0, 27
-0x71 0x64 # GOOD: c.lui s0, 28
-0x75 0x64 # GOOD: c.lui s0, 29
-0x79 0x64 # GOOD: c.lui s0, 30
-0x7D 0x64 # GOOD: c.lui s0, 31
-0x01 0x74 # GOOD: c.lui s0, 1048544
-0x05 0x74 # GOOD: c.lui s0, 1048545
-0x09 0x74 # GOOD: c.lui s0, 1048546
-0x0D 0x74 # GOOD: c.lui s0, 1048547
-0x11 0x74 # GOOD: c.lui s0, 1048548
-0x15 0x74 # GOOD: c.lui s0, 1048549
-0x19 0x74 # GOOD: c.lui s0, 1048550
-0x1D 0x74 # GOOD: c.lui s0, 1048551
-0x21 0x74 # GOOD: c.lui s0, 1048552
-0x25 0x74 # GOOD: c.lui s0, 1048553
-0x29 0x74 # GOOD: c.lui s0, 1048554
-0x2D 0x74 # GOOD: c.lui s0, 1048555
-0x31 0x74 # GOOD: c.lui s0, 1048556
-0x35 0x74 # GOOD: c.lui s0, 1048557
-0x39 0x74 # GOOD: c.lui s0, 1048558
-0x3D 0x74 # GOOD: c.lui s0, 1048559
-0x41 0x74 # GOOD: c.lui s0, 1048560
-0x45 0x74 # GOOD: c.lui s0, 1048561
-0x49 0x74 # GOOD: c.lui s0, 1048562
-0x4D 0x74 # GOOD: c.lui s0, 1048563
-0x51 0x74 # GOOD: c.lui s0, 1048564
-0x55 0x74 # GOOD: c.lui s0, 1048565
-0x59 0x74 # GOOD: c.lui s0, 1048566
-0x5D 0x74 # GOOD: c.lui s0, 1048567
-0x61 0x74 # GOOD: c.lui s0, 1048568
-0x65 0x74 # GOOD: c.lui s0, 1048569
-0x69 0x74 # GOOD: c.lui s0, 1048570
-0x6D 0x74 # GOOD: c.lui s0, 1048571
-0x71 0x74 # GOOD: c.lui s0, 1048572
-0x75 0x74 # GOOD: c.lui s0, 1048573
-0x79 0x74 # GOOD: c.lui s0, 1048574
-0x7D 0x74 # GOOD: c.lui s0, 1048575
-0x81 0x64 # BAD: invalid instruction encoding
-0x81 0x64 # MOP: c.mop.9
-0x85 0x64 # GOOD: c.lui s1, 1
-0x89 0x64 # GOOD: c.lui s1, 2
-0x8D 0x64 # GOOD: c.lui s1, 3
-0x91 0x64 # GOOD: c.lui s1, 4
-0x95 0x64 # GOOD: c.lui s1, 5
-0x99 0x64 # GOOD: c.lui s1, 6
-0x9D 0x64 # GOOD: c.lui s1, 7
-0xA1 0x64 # GOOD: c.lui s1, 8
-0xA5 0x64 # GOOD: c.lui s1, 9
-0xA9 0x64 # GOOD: c.lui s1, 10
-0xAD 0x64 # GOOD: c.lui s1, 11
-0xB1 0x64 # GOOD: c.lui s1, 12
-0xB5 0x64 # GOOD: c.lui s1, 13
-0xB9 0x64 # GOOD: c.lui s1, 14
-0xBD 0x64 # GOOD: c.lui s1, 15
-0xC1 0x64 # GOOD: c.lui s1, 16
-0xC5 0x64 # GOOD: c.lui s1, 17
-0xC9 0x64 # GOOD: c.lui s1, 18
-0xCD 0x64 # GOOD: c.lui s1, 19
-0xD1 0x64 # GOOD: c.lui s1, 20
-0xD5 0x64 # GOOD: c.lui s1, 21
-0xD9 0x64 # GOOD: c.lui s1, 22
-0xDD 0x64 # GOOD: c.lui s1, 23
-0xE1 0x64 # GOOD: c.lui s1, 24
-0xE5 0x64 # GOOD: c.lui s1, 25
-0xE9 0x64 # GOOD: c.lui s1, 26
-0xED 0x64 # GOOD: c.lui s1, 27
-0xF1 0x64 # GOOD: c.lui s1, 28
-0xF5 0x64 # GOOD: c.lui s1, 29
-0xF9 0x64 # GOOD: c.lui s1, 30
-0xFD 0x64 # GOOD: c.lui s1, 31
-0x81 0x74 # GOOD: c.lui s1, 1048544
-0x85 0x74 # GOOD: c.lui s1, 1048545
-0x89 0x74 # GOOD: c.lui s1, 1048546
-0x8D 0x74 # GOOD: c.lui s1, 1048547
-0x91 0x74 # GOOD: c.lui s1, 1048548
-0x95 0x74 # GOOD: c.lui s1, 1048549
-0x99 0x74 # GOOD: c.lui s1, 1048550
-0x9D 0x74 # GOOD: c.lui s1, 1048551
-0xA1 0x74 # GOOD: c.lui s1, 1048552
-0xA5 0x74 # GOOD: c.lui s1, 1048553
-0xA9 0x74 # GOOD: c.lui s1, 1048554
-0xAD 0x74 # GOOD: c.lui s1, 1048555
-0xB1 0x74 # GOOD: c.lui s1, 1048556
-0xB5 0x74 # GOOD: c.lui s1, 1048557
-0xB9 0x74 # GOOD: c.lui s1, 1048558
-0xBD 0x74 # GOOD: c.lui s1, 1048559
-0xC1 0x74 # GOOD: c.lui s1, 1048560
-0xC5 0x74 # GOOD: c.lui s1, 1048561
-0xC9 0x74 # GOOD: c.lui s1, 1048562
-0xCD 0x74 # GOOD: c.lui s1, 1048563
-0xD1 0x74 # GOOD: c.lui s1, 1048564
-0xD5 0x74 # GOOD: c.lui s1, 1048565
-0xD9 0x74 # GOOD: c.lui s1, 1048566
-0xDD 0x74 # GOOD: c.lui s1, 1048567
-0xE1 0x74 # GOOD: c.lui s1, 1048568
-0xE5 0x74 # GOOD: c.lui s1, 1048569
-0xE9 0x74 # GOOD: c.lui s1, 1048570
-0xED 0x74 # GOOD: c.lui s1, 1048571
-0xF1 0x74 # GOOD: c.lui s1, 1048572
-0xF5 0x74 # GOOD: c.lui s1, 1048573
-0xF9 0x74 # GOOD: c.lui s1, 1048574
-0xFD 0x74 # GOOD: c.lui s1, 1048575
-0x01 0x65 # BAD: invalid instruction encoding
-0x05 0x65 # GOOD: c.lui a0, 1
-0x09 0x65 # GOOD: c.lui a0, 2
-0x0D 0x65 # GOOD: c.lui a0, 3
-0x11 0x65 # GOOD: c.lui a0, 4
-0x15 0x65 # GOOD: c.lui a0, 5
-0x19 0x65 # GOOD: c.lui a0, 6
-0x1D 0x65 # GOOD: c.lui a0, 7
-0x21 0x65 # GOOD: c.lui a0, 8
-0x25 0x65 # GOOD: c.lui a0, 9
-0x29 0x65 # GOOD: c.lui a0, 10
-0x2D 0x65 # GOOD: c.lui a0, 11
-0x31 0x65 # GOOD: c.lui a0, 12
-0x35 0x65 # GOOD: c.lui a0, 13
-0x39 0x65 # GOOD: c.lui a0, 14
-0x3D 0x65 # GOOD: c.lui a0, 15
-0x41 0x65 # GOOD: c.lui a0, 16
-0x45 0x65 # GOOD: c.lui a0, 17
-0x49 0x65 # GOOD: c.lui a0, 18
-0x4D 0x65 # GOOD: c.lui a0, 19
-0x51 0x65 # GOOD: c.lui a0, 20
-0x55 0x65 # GOOD: c.lui a0, 21
-0x59 0x65 # GOOD: c.lui a0, 22
-0x5D 0x65 # GOOD: c.lui a0, 23
-0x61 0x65 # GOOD: c.lui a0, 24
-0x65 0x65 # GOOD: c.lui a0, 25
-0x69 0x65 # GOOD: c.lui a0, 26
-0x6D 0x65 # GOOD: c.lui a0, 27
-0x71 0x65 # GOOD: c.lui a0, 28
-0x75 0x65 # GOOD: c.lui a0, 29
-0x79 0x65 # GOOD: c.lui a0, 30
-0x7D 0x65 # GOOD: c.lui a0, 31
-0x01 0x75 # GOOD: c.lui a0, 1048544
-0x05 0x75 # GOOD: c.lui a0, 1048545
-0x09 0x75 # GOOD: c.lui a0, 1048546
-0x0D 0x75 # GOOD: c.lui a0, 1048547
-0x11 0x75 # GOOD: c.lui a0, 1048548
-0x15 0x75 # GOOD: c.lui a0, 1048549
-0x19 0x75 # GOOD: c.lui a0, 1048550
-0x1D 0x75 # GOOD: c.lui a0, 1048551
-0x21 0x75 # GOOD: c.lui a0, 1048552
-0x25 0x75 # GOOD: c.lui a0, 1048553
-0x29 0x75 # GOOD: c.lui a0, 1048554
-0x2D 0x75 # GOOD: c.lui a0, 1048555
-0x31 0x75 # GOOD: c.lui a0, 1048556
-0x35 0x75 # GOOD: c.lui a0, 1048557
-0x39 0x75 # GOOD: c.lui a0, 1048558
-0x3D 0x75 # GOOD: c.lui a0, 1048559
-0x41 0x75 # GOOD: c.lui a0, 1048560
-0x45 0x75 # GOOD: c.lui a0, 1048561
-0x49 0x75 # GOOD: c.lui a0, 1048562
-0x4D 0x75 # GOOD: c.lui a0, 1048563
-0x51 0x75 # GOOD: c.lui a0, 1048564
-0x55 0x75 # GOOD: c.lui a0, 1048565
-0x59 0x75 # GOOD: c.lui a0, 1048566
-0x5D 0x75 # GOOD: c.lui a0, 1048567
-0x61 0x75 # GOOD: c.lui a0, 1048568
-0x65 0x75 # GOOD: c.lui a0, 1048569
-0x69 0x75 # GOOD: c.lui a0, 1048570
-0x6D 0x75 # GOOD: c.lui a0, 1048571
-0x71 0x75 # GOOD: c.lui a0, 1048572
-0x75 0x75 # GOOD: c.lui a0, 1048573
-0x79 0x75 # GOOD: c.lui a0, 1048574
-0x7D 0x75 # GOOD: c.lui a0, 1048575
-0x81 0x65 # BAD: invalid instruction encoding
-0x81 0x65 # MOP: c.mop.11
-0x85 0x65 # GOOD: c.lui a1, 1
-0x89 0x65 # GOOD: c.lui a1, 2
-0x8D 0x65 # GOOD: c.lui a1, 3
-0x91 0x65 # GOOD: c.lui a1, 4
-0x95 0x65 # GOOD: c.lui a1, 5
-0x99 0x65 # GOOD: c.lui a1, 6
-0x9D 0x65 # GOOD: c.lui a1, 7
-0xA1 0x65 # GOOD: c.lui a1, 8
-0xA5 0x65 # GOOD: c.lui a1, 9
-0xA9 0x65 # GOOD: c.lui a1, 10
-0xAD 0x65 # GOOD: c.lui a1, 11
-0xB1 0x65 # GOOD: c.lui a1, 12
-0xB5 0x65 # GOOD: c.lui a1, 13
-0xB9 0x65 # GOOD: c.lui a1, 14
-0xBD 0x65 # GOOD: c.lui a1, 15
-0xC1 0x65 # GOOD: c.lui a1, 16
-0xC5 0x65 # GOOD: c.lui a1, 17
-0xC9 0x65 # GOOD: c.lui a1, 18
-0xCD 0x65 # GOOD: c.lui a1, 19
-0xD1 0x65 # GOOD: c.lui a1, 20
-0xD5 0x65 # GOOD: c.lui a1, 21
-0xD9 0x65 # GOOD: c.lui a1, 22
-0xDD 0x65 # GOOD: c.lui a1, 23
-0xE1 0x65 # GOOD: c.lui a1, 24
-0xE5 0x65 # GOOD: c.lui a1, 25
-0xE9 0x65 # GOOD: c.lui a1, 26
-0xED 0x65 # GOOD: c.lui a1, 27
-0xF1 0x65 # GOOD: c.lui a1, 28
-0xF5 0x65 # GOOD: c.lui a1, 29
-0xF9 0x65 # GOOD: c.lui a1, 30
-0xFD 0x65 # GOOD: c.lui a1, 31
-0x81 0x75 # GOOD: c.lui a1, 1048544
-0x85 0x75 # GOOD: c.lui a1, 1048545
-0x89 0x75 # GOOD: c.lui a1, 1048546
-0x8D 0x75 # GOOD: c.lui a1, 1048547
-0x91 0x75 # GOOD: c.lui a1, 1048548
-0x95 0x75 # GOOD: c.lui a1, 1048549
-0x99 0x75 # GOOD: c.lui a1, 1048550
-0x9D 0x75 # GOOD: c.lui a1, 1048551
-0xA1 0x75 # GOOD: c.lui a1, 1048552
-0xA5 0x75 # GOOD: c.lui a1, 1048553
-0xA9 0x75 # GOOD: c.lui a1, 1048554
-0xAD 0x75 # GOOD: c.lui a1, 1048555
-0xB1 0x75 # GOOD: c.lui a1, 1048556
-0xB5 0x75 # GOOD: c.lui a1, 1048557
-0xB9 0x75 # GOOD: c.lui a1, 1048558
-0xBD 0x75 # GOOD: c.lui a1, 1048559
-0xC1 0x75 # GOOD: c.lui a1, 1048560
-0xC5 0x75 # GOOD: c.lui a1, 1048561
-0xC9 0x75 # GOOD: c.lui a1, 1048562
-0xCD 0x75 # GOOD: c.lui a1, 1048563
-0xD1 0x75 # GOOD: c.lui a1, 1048564
-0xD5 0x75 # GOOD: c.lui a1, 1048565
-0xD9 0x75 # GOOD: c.lui a1, 1048566
-0xDD 0x75 # GOOD: c.lui a1, 1048567
-0xE1 0x75 # GOOD: c.lui a1, 1048568
-0xE5 0x75 # GOOD: c.lui a1, 1048569
-0xE9 0x75 # GOOD: c.lui a1, 1048570
-0xED 0x75 # GOOD: c.lui a1, 1048571
-0xF1 0x75 # GOOD: c.lui a1, 1048572
-0xF5 0x75 # GOOD: c.lui a1, 1048573
-0xF9 0x75 # GOOD: c.lui a1, 1048574
-0xFD 0x75 # GOOD: c.lui a1, 1048575
-0x01 0x66 # BAD: invalid instruction encoding
-0x05 0x66 # GOOD: c.lui a2, 1
-0x09 0x66 # GOOD: c.lui a2, 2
-0x0D 0x66 # GOOD: c.lui a2, 3
-0x11 0x66 # GOOD: c.lui a2, 4
-0x15 0x66 # GOOD: c.lui a2, 5
-0x19 0x66 # GOOD: c.lui a2, 6
-0x1D 0x66 # GOOD: c.lui a2, 7
-0x21 0x66 # GOOD: c.lui a2, 8
-0x25 0x66 # GOOD: c.lui a2, 9
-0x29 0x66 # GOOD: c.lui a2, 10
-0x2D 0x66 # GOOD: c.lui a2, 11
-0x31 0x66 # GOOD: c.lui a2, 12
-0x35 0x66 # GOOD: c.lui a2, 13
-0x39 0x66 # GOOD: c.lui a2, 14
-0x3D 0x66 # GOOD: c.lui a2, 15
-0x41 0x66 # GOOD: c.lui a2, 16
-0x45 0x66 # GOOD: c.lui a2, 17
-0x49 0x66 # GOOD: c.lui a2, 18
-0x4D 0x66 # GOOD: c.lui a2, 19
-0x51 0x66 # GOOD: c.lui a2, 20
-0x55 0x66 # GOOD: c.lui a2, 21
-0x59 0x66 # GOOD: c.lui a2, 22
-0x5D 0x66 # GOOD: c.lui a2, 23
-0x61 0x66 # GOOD: c.lui a2, 24
-0x65 0x66 # GOOD: c.lui a2, 25
-0x69 0x66 # GOOD: c.lui a2, 26
-0x6D 0x66 # GOOD: c.lui a2, 27
-0x71 0x66 # GOOD: c.lui a2, 28
-0x75 0x66 # GOOD: c.lui a2, 29
-0x79 0x66 # GOOD: c.lui a2, 30
-0x7D 0x66 # GOOD: c.lui a2, 31
-0x01 0x76 # GOOD: c.lui a2, 1048544
-0x05 0x76 # GOOD: c.lui a2, 1048545
-0x09 0x76 # GOOD: c.lui a2, 1048546
-0x0D 0x76 # GOOD: c.lui a2, 1048547
-0x11 0x76 # GOOD: c.lui a2, 1048548
-0x15 0x76 # GOOD: c.lui a2, 1048549
-0x19 0x76 # GOOD: c.lui a2, 1048550
-0x1D 0x76 # GOOD: c.lui a2, 1048551
-0x21 0x76 # GOOD: c.lui a2, 1048552
-0x25 0x76 # GOOD: c.lui a2, 1048553
-0x29 0x76 # GOOD: c.lui a2, 1048554
-0x2D 0x76 # GOOD: c.lui a2, 1048555
-0x31 0x76 # GOOD: c.lui a2, 1048556
-0x35 0x76 # GOOD: c.lui a2, 1048557
-0x39 0x76 # GOOD: c.lui a2, 1048558
-0x3D 0x76 # GOOD: c.lui a2, 1048559
-0x41 0x76 # GOOD: c.lui a2, 1048560
-0x45 0x76 # GOOD: c.lui a2, 1048561
-0x49 0x76 # GOOD: c.lui a2, 1048562
-0x4D 0x76 # GOOD: c.lui a2, 1048563
-0x51 0x76 # GOOD: c.lui a2, 1048564
-0x55 0x76 # GOOD: c.lui a2, 1048565
-0x59 0x76 # GOOD: c.lui a2, 1048566
-0x5D 0x76 # GOOD: c.lui a2, 1048567
-0x61 0x76 # GOOD: c.lui a2, 1048568
-0x65 0x76 # GOOD: c.lui a2, 1048569
-0x69 0x76 # GOOD: c.lui a2, 1048570
-0x6D 0x76 # GOOD: c.lui a2, 1048571
-0x71 0x76 # GOOD: c.lui a2, 1048572
-0x75 0x76 # GOOD: c.lui a2, 1048573
-0x79 0x76 # GOOD: c.lui a2, 1048574
-0x7D 0x76 # GOOD: c.lui a2, 1048575
-0x81 0x66 # BAD: invalid instruction encoding
-0x81 0x66 # MOP: c.mop.13
-0x85 0x66 # GOOD: c.lui a3, 1
-0x89 0x66 # GOOD: c.lui a3, 2
-0x8D 0x66 # GOOD: c.lui a3, 3
-0x91 0x66 # GOOD: c.lui a3, 4
-0x95 0x66 # GOOD: c.lui a3, 5
-0x99 0x66 # GOOD: c.lui a3, 6
-0x9D 0x66 # GOOD: c.lui a3, 7
-0xA1 0x66 # GOOD: c.lui a3, 8
-0xA5 0x66 # GOOD: c.lui a3, 9
-0xA9 0x66 # GOOD: c.lui a3, 10
-0xAD 0x66 # GOOD: c.lui a3, 11
-0xB1 0x66 # GOOD: c.lui a3, 12
-0xB5 0x66 # GOOD: c.lui a3, 13
-0xB9 0x66 # GOOD: c.lui a3, 14
-0xBD 0x66 # GOOD: c.lui a3, 15
-0xC1 0x66 # GOOD: c.lui a3, 16
-0xC5 0x66 # GOOD: c.lui a3, 17
-0xC9 0x66 # GOOD: c.lui a3, 18
-0xCD 0x66 # GOOD: c.lui a3, 19
-0xD1 0x66 # GOOD: c.lui a3, 20
-0xD5 0x66 # GOOD: c.lui a3, 21
-0xD9 0x66 # GOOD: c.lui a3, 22
-0xDD 0x66 # GOOD: c.lui a3, 23
-0xE1 0x66 # GOOD: c.lui a3, 24
-0xE5 0x66 # GOOD: c.lui a3, 25
-0xE9 0x66 # GOOD: c.lui a3, 26
-0xED 0x66 # GOOD: c.lui a3, 27
-0xF1 0x66 # GOOD: c.lui a3, 28
-0xF5 0x66 # GOOD: c.lui a3, 29
-0xF9 0x66 # GOOD: c.lui a3, 30
-0xFD 0x66 # GOOD: c.lui a3, 31
-0x81 0x76 # GOOD: c.lui a3, 1048544
-0x85 0x76 # GOOD: c.lui a3, 1048545
-0x89 0x76 # GOOD: c.lui a3, 1048546
-0x8D 0x76 # GOOD: c.lui a3, 1048547
-0x91 0x76 # GOOD: c.lui a3, 1048548
-0x95 0x76 # GOOD: c.lui a3, 1048549
-0x99 0x76 # GOOD: c.lui a3, 1048550
-0x9D 0x76 # GOOD: c.lui a3, 1048551
-0xA1 0x76 # GOOD: c.lui a3, 1048552
-0xA5 0x76 # GOOD: c.lui a3, 1048553
-0xA9 0x76 # GOOD: c.lui a3, 1048554
-0xAD 0x76 # GOOD: c.lui a3, 1048555
-0xB1 0x76 # GOOD: c.lui a3, 1048556
-0xB5 0x76 # GOOD: c.lui a3, 1048557
-0xB9 0x76 # GOOD: c.lui a3, 1048558
-0xBD 0x76 # GOOD: c.lui a3, 1048559
-0xC1 0x76 # GOOD: c.lui a3, 1048560
-0xC5 0x76 # GOOD: c.lui a3, 1048561
-0xC9 0x76 # GOOD: c.lui a3, 1048562
-0xCD 0x76 # GOOD: c.lui a3, 1048563
-0xD1 0x76 # GOOD: c.lui a3, 1048564
-0xD5 0x76 # GOOD: c.lui a3, 1048565
-0xD9 0x76 # GOOD: c.lui a3, 1048566
-0xDD 0x76 # GOOD: c.lui a3, 1048567
-0xE1 0x76 # GOOD: c.lui a3, 1048568
-0xE5 0x76 # GOOD: c.lui a3, 1048569
-0xE9 0x76 # GOOD: c.lui a3, 1048570
-0xED 0x76 # GOOD: c.lui a3, 1048571
-0xF1 0x76 # GOOD: c.lui a3, 1048572
-0xF5 0x76 # GOOD: c.lui a3, 1048573
-0xF9 0x76 # GOOD: c.lui a3, 1048574
-0xFD 0x76 # GOOD: c.lui a3, 1048575
-0x01 0x67 # BAD: invalid instruction encoding
-0x05 0x67 # GOOD: c.lui a4, 1
-0x09 0x67 # GOOD: c.lui a4, 2
-0x0D 0x67 # GOOD: c.lui a4, 3
-0x11 0x67 # GOOD: c.lui a4, 4
-0x15 0x67 # GOOD: c.lui a4, 5
-0x19 0x67 # GOOD: c.lui a4, 6
-0x1D 0x67 # GOOD: c.lui a4, 7
-0x21 0x67 # GOOD: c.lui a4, 8
-0x25 0x67 # GOOD: c.lui a4, 9
-0x29 0x67 # GOOD: c.lui a4, 10
-0x2D 0x67 # GOOD: c.lui a4, 11
-0x31 0x67 # GOOD: c.lui a4, 12
-0x35 0x67 # GOOD: c.lui a4, 13
-0x39 0x67 # GOOD: c.lui a4, 14
-0x3D 0x67 # GOOD: c.lui a4, 15
-0x41 0x67 # GOOD: c.lui a4, 16
-0x45 0x67 # GOOD: c.lui a4, 17
-0x49 0x67 # GOOD: c.lui a4, 18
-0x4D 0x67 # GOOD: c.lui a4, 19
-0x51 0x67 # GOOD: c.lui a4, 20
-0x55 0x67 # GOOD: c.lui a4, 21
-0x59 0x67 # GOOD: c.lui a4, 22
-0x5D 0x67 # GOOD: c.lui a4, 23
-0x61 0x67 # GOOD: c.lui a4, 24
-0x65 0x67 # GOOD: c.lui a4, 25
-0x69 0x67 # GOOD: c.lui a4, 26
-0x6D 0x67 # GOOD: c.lui a4, 27
-0x71 0x67 # GOOD: c.lui a4, 28
-0x75 0x67 # GOOD: c.lui a4, 29
-0x79 0x67 # GOOD: c.lui a4, 30
-0x7D 0x67 # GOOD: c.lui a4, 31
-0x01 0x77 # GOOD: c.lui a4, 1048544
-0x05 0x77 # GOOD: c.lui a4, 1048545
-0x09 0x77 # GOOD: c.lui a4, 1048546
-0x0D 0x77 # GOOD: c.lui a4, 1048547
-0x11 0x77 # GOOD: c.lui a4, 1048548
-0x15 0x77 # GOOD: c.lui a4, 1048549
-0x19 0x77 # GOOD: c.lui a4, 1048550
-0x1D 0x77 # GOOD: c.lui a4, 1048551
-0x21 0x77 # GOOD: c.lui a4, 1048552
-0x25 0x77 # GOOD: c.lui a4, 1048553
-0x29 0x77 # GOOD: c.lui a4, 1048554
-0x2D 0x77 # GOOD: c.lui a4, 1048555
-0x31 0x77 # GOOD: c.lui a4, 1048556
-0x35 0x77 # GOOD: c.lui a4, 1048557
-0x39 0x77 # GOOD: c.lui a4, 1048558
-0x3D 0x77 # GOOD: c.lui a4, 1048559
-0x41 0x77 # GOOD: c.lui a4, 1048560
-0x45 0x77 # GOOD: c.lui a4, 1048561
-0x49 0x77 # GOOD: c.lui a4, 1048562
-0x4D 0x77 # GOOD: c.lui a4, 1048563
-0x51 0x77 # GOOD: c.lui a4, 1048564
-0x55 0x77 # GOOD: c.lui a4, 1048565
-0x59 0x77 # GOOD: c.lui a4, 1048566
-0x5D 0x77 # GOOD: c.lui a4, 1048567
-0x61 0x77 # GOOD: c.lui a4, 1048568
-0x65 0x77 # GOOD: c.lui a4, 1048569
-0x69 0x77 # GOOD: c.lui a4, 1048570
-0x6D 0x77 # GOOD: c.lui a4, 1048571
-0x71 0x77 # GOOD: c.lui a4, 1048572
-0x75 0x77 # GOOD: c.lui a4, 1048573
-0x79 0x77 # GOOD: c.lui a4, 1048574
-0x7D 0x77 # GOOD: c.lui a4, 1048575
-0x81 0x67 # BAD: invalid instruction encoding
-0x81 0x67 # MOP: c.mop.15
-0x85 0x67 # GOOD: c.lui a5, 1
-0x89 0x67 # GOOD: c.lui a5, 2
-0x8D 0x67 # GOOD: c.lui a5, 3
-0x91 0x67 # GOOD: c.lui a5, 4
-0x95 0x67 # GOOD: c.lui a5, 5
-0x99 0x67 # GOOD: c.lui a5, 6
-0x9D 0x67 # GOOD: c.lui a5, 7
-0xA1 0x67 # GOOD: c.lui a5, 8
-0xA5 0x67 # GOOD: c.lui a5, 9
-0xA9 0x67 # GOOD: c.lui a5, 10
-0xAD 0x67 # GOOD: c.lui a5, 11
-0xB1 0x67 # GOOD: c.lui a5, 12
-0xB5 0x67 # GOOD: c.lui a5, 13
-0xB9 0x67 # GOOD: c.lui a5, 14
-0xBD 0x67 # GOOD: c.lui a5, 15
-0xC1 0x67 # GOOD: c.lui a5, 16
-0xC5 0x67 # GOOD: c.lui a5, 17
-0xC9 0x67 # GOOD: c.lui a5, 18
-0xCD 0x67 # GOOD: c.lui a5, 19
-0xD1 0x67 # GOOD: c.lui a5, 20
-0xD5 0x67 # GOOD: c.lui a5, 21
-0xD9 0x67 # GOOD: c.lui a5, 22
-0xDD 0x67 # GOOD: c.lui a5, 23
-0xE1 0x67 # GOOD: c.lui a5, 24
-0xE5 0x67 # GOOD: c.lui a5, 25
-0xE9 0x67 # GOOD: c.lui a5, 26
-0xED 0x67 # GOOD: c.lui a5, 27
-0xF1 0x67 # GOOD: c.lui a5, 28
-0xF5 0x67 # GOOD: c.lui a5, 29
-0xF9 0x67 # GOOD: c.lui a5, 30
-0xFD 0x67 # GOOD: c.lui a5, 31
-0x81 0x77 # GOOD: c.lui a5, 1048544
-0x85 0x77 # GOOD: c.lui a5, 1048545
-0x89 0x77 # GOOD: c.lui a5, 1048546
-0x8D 0x77 # GOOD: c.lui a5, 1048547
-0x91 0x77 # GOOD: c.lui a5, 1048548
-0x95 0x77 # GOOD: c.lui a5, 1048549
-0x99 0x77 # GOOD: c.lui a5, 1048550
-0x9D 0x77 # GOOD: c.lui a5, 1048551
-0xA1 0x77 # GOOD: c.lui a5, 1048552
-0xA5 0x77 # GOOD: c.lui a5, 1048553
-0xA9 0x77 # GOOD: c.lui a5, 1048554
-0xAD 0x77 # GOOD: c.lui a5, 1048555
-0xB1 0x77 # GOOD: c.lui a5, 1048556
-0xB5 0x77 # GOOD: c.lui a5, 1048557
-0xB9 0x77 # GOOD: c.lui a5, 1048558
-0xBD 0x77 # GOOD: c.lui a5, 1048559
-0xC1 0x77 # GOOD: c.lui a5, 1048560
-0xC5 0x77 # GOOD: c.lui a5, 1048561
-0xC9 0x77 # GOOD: c.lui a5, 1048562
-0xCD 0x77 # GOOD: c.lui a5, 1048563
-0xD1 0x77 # GOOD: c.lui a5, 1048564
-0xD5 0x77 # GOOD: c.lui a5, 1048565
-0xD9 0x77 # GOOD: c.lui a5, 1048566
-0xDD 0x77 # GOOD: c.lui a5, 1048567
-0xE1 0x77 # GOOD: c.lui a5, 1048568
-0xE5 0x77 # GOOD: c.lui a5, 1048569
-0xE9 0x77 # GOOD: c.lui a5, 1048570
-0xED 0x77 # GOOD: c.lui a5, 1048571
-0xF1 0x77 # GOOD: c.lui a5, 1048572
-0xF5 0x77 # GOOD: c.lui a5, 1048573
-0xF9 0x77 # GOOD: c.lui a5, 1048574
-0xFD 0x77 # GOOD: c.lui a5, 1048575
-0x01 0x68 # BAD: invalid instruction encoding
-0x05 0x68 # GOOD: c.lui a6, 1
-0x09 0x68 # GOOD: c.lui a6, 2
-0x0D 0x68 # GOOD: c.lui a6, 3
-0x11 0x68 # GOOD: c.lui a6, 4
-0x15 0x68 # GOOD: c.lui a6, 5
-0x19 0x68 # GOOD: c.lui a6, 6
-0x1D 0x68 # GOOD: c.lui a6, 7
-0x21 0x68 # GOOD: c.lui a6, 8
-0x25 0x68 # GOOD: c.lui a6, 9
-0x29 0x68 # GOOD: c.lui a6, 10
-0x2D 0x68 # GOOD: c.lui a6, 11
-0x31 0x68 # GOOD: c.lui a6, 12
-0x35 0x68 # GOOD: c.lui a6, 13
-0x39 0x68 # GOOD: c.lui a6, 14
-0x3D 0x68 # GOOD: c.lui a6, 15
-0x41 0x68 # GOOD: c.lui a6, 16
-0x45 0x68 # GOOD: c.lui a6, 17
-0x49 0x68 # GOOD: c.lui a6, 18
-0x4D 0x68 # GOOD: c.lui a6, 19
-0x51 0x68 # GOOD: c.lui a6, 20
-0x55 0x68 # GOOD: c.lui a6, 21
-0x59 0x68 # GOOD: c.lui a6, 22
-0x5D 0x68 # GOOD: c.lui a6, 23
-0x61 0x68 # GOOD: c.lui a6, 24
-0x65 0x68 # GOOD: c.lui a6, 25
-0x69 0x68 # GOOD: c.lui a6, 26
-0x6D 0x68 # GOOD: c.lui a6, 27
-0x71 0x68 # GOOD: c.lui a6, 28
-0x75 0x68 # GOOD: c.lui a6, 29
-0x79 0x68 # GOOD: c.lui a6, 30
-0x7D 0x68 # GOOD: c.lui a6, 31
-0x01 0x78 # GOOD: c.lui a6, 1048544
-0x05 0x78 # GOOD: c.lui a6, 1048545
-0x09 0x78 # GOOD: c.lui a6, 1048546
-0x0D 0x78 # GOOD: c.lui a6, 1048547
-0x11 0x78 # GOOD: c.lui a6, 1048548
-0x15 0x78 # GOOD: c.lui a6, 1048549
-0x19 0x78 # GOOD: c.lui a6, 1048550
-0x1D 0x78 # GOOD: c.lui a6, 1048551
-0x21 0x78 # GOOD: c.lui a6, 1048552
-0x25 0x78 # GOOD: c.lui a6, 1048553
-0x29 0x78 # GOOD: c.lui a6, 1048554
-0x2D 0x78 # GOOD: c.lui a6, 1048555
-0x31 0x78 # GOOD: c.lui a6, 1048556
-0x35 0x78 # GOOD: c.lui a6, 1048557
-0x39 0x78 # GOOD: c.lui a6, 1048558
-0x3D 0x78 # GOOD: c.lui a6, 1048559
-0x41 0x78 # GOOD: c.lui a6, 1048560
-0x45 0x78 # GOOD: c.lui a6, 1048561
-0x49 0x78 # GOOD: c.lui a6, 1048562
-0x4D 0x78 # GOOD: c.lui a6, 1048563
-0x51 0x78 # GOOD: c.lui a6, 1048564
-0x55 0x78 # GOOD: c.lui a6, 1048565
-0x59 0x78 # GOOD: c.lui a6, 1048566
-0x5D 0x78 # GOOD: c.lui a6, 1048567
-0x61 0x78 # GOOD: c.lui a6, 1048568
-0x65 0x78 # GOOD: c.lui a6, 1048569
-0x69 0x78 # GOOD: c.lui a6, 1048570
-0x6D 0x78 # GOOD: c.lui a6, 1048571
-0x71 0x78 # GOOD: c.lui a6, 1048572
-0x75 0x78 # GOOD: c.lui a6, 1048573
-0x79 0x78 # GOOD: c.lui a6, 1048574
-0x7D 0x78 # GOOD: c.lui a6, 1048575
-0x81 0x68 # BAD: invalid instruction encoding
-0x85 0x68 # GOOD: c.lui a7, 1
-0x89 0x68 # GOOD: c.lui a7, 2
-0x8D 0x68 # GOOD: c.lui a7, 3
-0x91 0x68 # GOOD: c.lui a7, 4
-0x95 0x68 # GOOD: c.lui a7, 5
-0x99 0x68 # GOOD: c.lui a7, 6
-0x9D 0x68 # GOOD: c.lui a7, 7
-0xA1 0x68 # GOOD: c.lui a7, 8
-0xA5 0x68 # GOOD: c.lui a7, 9
-0xA9 0x68 # GOOD: c.lui a7, 10
-0xAD 0x68 # GOOD: c.lui a7, 11
-0xB1 0x68 # GOOD: c.lui a7, 12
-0xB5 0x68 # GOOD: c.lui a7, 13
-0xB9 0x68 # GOOD: c.lui a7, 14
-0xBD 0x68 # GOOD: c.lui a7, 15
-0xC1 0x68 # GOOD: c.lui a7, 16
-0xC5 0x68 # GOOD: c.lui a7, 17
-0xC9 0x68 # GOOD: c.lui a7, 18
-0xCD 0x68 # GOOD: c.lui a7, 19
-0xD1 0x68 # GOOD: c.lui a7, 20
-0xD5 0x68 # GOOD: c.lui a7, 21
-0xD9 0x68 # GOOD: c.lui a7, 22
-0xDD 0x68 # GOOD: c.lui a7, 23
-0xE1 0x68 # GOOD: c.lui a7, 24
-0xE5 0x68 # GOOD: c.lui a7, 25
-0xE9 0x68 # GOOD: c.lui a7, 26
-0xED 0x68 # GOOD: c.lui a7, 27
-0xF1 0x68 # GOOD: c.lui a7, 28
-0xF5 0x68 # GOOD: c.lui a7, 29
-0xF9 0x68 # GOOD: c.lui a7, 30
-0xFD 0x68 # GOOD: c.lui a7, 31
-0x81 0x78 # GOOD: c.lui a7, 1048544
-0x85 0x78 # GOOD: c.lui a7, 1048545
-0x89 0x78 # GOOD: c.lui a7, 1048546
-0x8D 0x78 # GOOD: c.lui a7, 1048547
-0x91 0x78 # GOOD: c.lui a7, 1048548
-0x95 0x78 # GOOD: c.lui a7, 1048549
-0x99 0x78 # GOOD: c.lui a7, 1048550
-0x9D 0x78 # GOOD: c.lui a7, 1048551
-0xA1 0x78 # GOOD: c.lui a7, 1048552
-0xA5 0x78 # GOOD: c.lui a7, 1048553
-0xA9 0x78 # GOOD: c.lui a7, 1048554
-0xAD 0x78 # GOOD: c.lui a7, 1048555
-0xB1 0x78 # GOOD: c.lui a7, 1048556
-0xB5 0x78 # GOOD: c.lui a7, 1048557
-0xB9 0x78 # GOOD: c.lui a7, 1048558
-0xBD 0x78 # GOOD: c.lui a7, 1048559
-0xC1 0x78 # GOOD: c.lui a7, 1048560
-0xC5 0x78 # GOOD: c.lui a7, 1048561
-0xC9 0x78 # GOOD: c.lui a7, 1048562
-0xCD 0x78 # GOOD: c.lui a7, 1048563
-0xD1 0x78 # GOOD: c.lui a7, 1048564
-0xD5 0x78 # GOOD: c.lui a7, 1048565
-0xD9 0x78 # GOOD: c.lui a7, 1048566
-0xDD 0x78 # GOOD: c.lui a7, 1048567
-0xE1 0x78 # GOOD: c.lui a7, 1048568
-0xE5 0x78 # GOOD: c.lui a7, 1048569
-0xE9 0x78 # GOOD: c.lui a7, 1048570
-0xED 0x78 # GOOD: c.lui a7, 1048571
-0xF1 0x78 # GOOD: c.lui a7, 1048572
-0xF5 0x78 # GOOD: c.lui a7, 1048573
-0xF9 0x78 # GOOD: c.lui a7, 1048574
-0xFD 0x78 # GOOD: c.lui a7, 1048575
-0x01 0x69 # BAD: invalid instruction encoding
-0x05 0x69 # GOOD: c.lui s2, 1
-0x09 0x69 # GOOD: c.lui s2, 2
-0x0D 0x69 # GOOD: c.lui s2, 3
-0x11 0x69 # GOOD: c.lui s2, 4
-0x15 0x69 # GOOD: c.lui s2, 5
-0x19 0x69 # GOOD: c.lui s2, 6
-0x1D 0x69 # GOOD: c.lui s2, 7
-0x21 0x69 # GOOD: c.lui s2, 8
-0x25 0x69 # GOOD: c.lui s2, 9
-0x29 0x69 # GOOD: c.lui s2, 10
-0x2D 0x69 # GOOD: c.lui s2, 11
-0x31 0x69 # GOOD: c.lui s2, 12
-0x35 0x69 # GOOD: c.lui s2, 13
-0x39 0x69 # GOOD: c.lui s2, 14
-0x3D 0x69 # GOOD: c.lui s2, 15
-0x41 0x69 # GOOD: c.lui s2, 16
-0x45 0x69 # GOOD: c.lui s2, 17
-0x49 0x69 # GOOD: c.lui s2, 18
-0x4D 0x69 # GOOD: c.lui s2, 19
-0x51 0x69 # GOOD: c.lui s2, 20
-0x55 0x69 # GOOD: c.lui s2, 21
-0x59 0x69 # GOOD: c.lui s2, 22
-0x5D 0x69 # GOOD: c.lui s2, 23
-0x61 0x69 # GOOD: c.lui s2, 24
-0x65 0x69 # GOOD: c.lui s2, 25
-0x69 0x69 # GOOD: c.lui s2, 26
-0x6D 0x69 # GOOD: c.lui s2, 27
-0x71 0x69 # GOOD: c.lui s2, 28
-0x75 0x69 # GOOD: c.lui s2, 29
-0x79 0x69 # GOOD: c.lui s2, 30
-0x7D 0x69 # GOOD: c.lui s2, 31
-0x01 0x79 # GOOD: c.lui s2, 1048544
-0x05 0x79 # GOOD: c.lui s2, 1048545
-0x09 0x79 # GOOD: c.lui s2, 1048546
-0x0D 0x79 # GOOD: c.lui s2, 1048547
-0x11 0x79 # GOOD: c.lui s2, 1048548
-0x15 0x79 # GOOD: c.lui s2, 1048549
-0x19 0x79 # GOOD: c.lui s2, 1048550
-0x1D 0x79 # GOOD: c.lui s2, 1048551
-0x21 0x79 # GOOD: c.lui s2, 1048552
-0x25 0x79 # GOOD: c.lui s2, 1048553
-0x29 0x79 # GOOD: c.lui s2, 1048554
-0x2D 0x79 # GOOD: c.lui s2, 1048555
-0x31 0x79 # GOOD: c.lui s2, 1048556
-0x35 0x79 # GOOD: c.lui s2, 1048557
-0x39 0x79 # GOOD: c.lui s2, 1048558
-0x3D 0x79 # GOOD: c.lui s2, 1048559
-0x41 0x79 # GOOD: c.lui s2, 1048560
-0x45 0x79 # GOOD: c.lui s2, 1048561
-0x49 0x79 # GOOD: c.lui s2, 1048562
-0x4D 0x79 # GOOD: c.lui s2, 1048563
-0x51 0x79 # GOOD: c.lui s2, 1048564
-0x55 0x79 # GOOD: c.lui s2, 1048565
-0x59 0x79 # GOOD: c.lui s2, 1048566
-0x5D 0x79 # GOOD: c.lui s2, 1048567
-0x61 0x79 # GOOD: c.lui s2, 1048568
-0x65 0x79 # GOOD: c.lui s2, 1048569
-0x69 0x79 # GOOD: c.lui s2, 1048570
-0x6D 0x79 # GOOD: c.lui s2, 1048571
-0x71 0x79 # GOOD: c.lui s2, 1048572
-0x75 0x79 # GOOD: c.lui s2, 1048573
-0x79 0x79 # GOOD: c.lui s2, 1048574
-0x7D 0x79 # GOOD: c.lui s2, 1048575
-0x81 0x69 # BAD: invalid instruction encoding
-0x85 0x69 # GOOD: c.lui s3, 1
-0x89 0x69 # GOOD: c.lui s3, 2
-0x8D 0x69 # GOOD: c.lui s3, 3
-0x91 0x69 # GOOD: c.lui s3, 4
-0x95 0x69 # GOOD: c.lui s3, 5
-0x99 0x69 # GOOD: c.lui s3, 6
-0x9D 0x69 # GOOD: c.lui s3, 7
-0xA1 0x69 # GOOD: c.lui s3, 8
-0xA5 0x69 # GOOD: c.lui s3, 9
-0xA9 0x69 # GOOD: c.lui s3, 10
-0xAD 0x69 # GOOD: c.lui s3, 11
-0xB1 0x69 # GOOD: c.lui s3, 12
-0xB5 0x69 # GOOD: c.lui s3, 13
-0xB9 0x69 # GOOD: c.lui s3, 14
-0xBD 0x69 # GOOD: c.lui s3, 15
-0xC1 0x69 # GOOD: c.lui s3, 16
-0xC5 0x69 # GOOD: c.lui s3, 17
-0xC9 0x69 # GOOD: c.lui s3, 18
-0xCD 0x69 # GOOD: c.lui s3, 19
-0xD1 0x69 # GOOD: c.lui s3, 20
-0xD5 0x69 # GOOD: c.lui s3, 21
-0xD9 0x69 # GOOD: c.lui s3, 22
-0xDD 0x69 # GOOD: c.lui s3, 23
-0xE1 0x69 # GOOD: c.lui s3, 24
-0xE5 0x69 # GOOD: c.lui s3, 25
-0xE9 0x69 # GOOD: c.lui s3, 26
-0xED 0x69 # GOOD: c.lui s3, 27
-0xF1 0x69 # GOOD: c.lui s3, 28
-0xF5 0x69 # GOOD: c.lui s3, 29
-0xF9 0x69 # GOOD: c.lui s3, 30
-0xFD 0x69 # GOOD: c.lui s3, 31
-0x81 0x79 # GOOD: c.lui s3, 1048544
-0x85 0x79 # GOOD: c.lui s3, 1048545
-0x89 0x79 # GOOD: c.lui s3, 1048546
-0x8D 0x79 # GOOD: c.lui s3, 1048547
-0x91 0x79 # GOOD: c.lui s3, 1048548
-0x95 0x79 # GOOD: c.lui s3, 1048549
-0x99 0x79 # GOOD: c.lui s3, 1048550
-0x9D 0x79 # GOOD: c.lui s3, 1048551
-0xA1 0x79 # GOOD: c.lui s3, 1048552
-0xA5 0x79 # GOOD: c.lui s3, 1048553
-0xA9 0x79 # GOOD: c.lui s3, 1048554
-0xAD 0x79 # GOOD: c.lui s3, 1048555
-0xB1 0x79 # GOOD: c.lui s3, 1048556
-0xB5 0x79 # GOOD: c.lui s3, 1048557
-0xB9 0x79 # GOOD: c.lui s3, 1048558
-0xBD 0x79 # GOOD: c.lui s3, 1048559
-0xC1 0x79 # GOOD: c.lui s3, 1048560
-0xC5 0x79 # GOOD: c.lui s3, 1048561
-0xC9 0x79 # GOOD: c.lui s3, 1048562
-0xCD 0x79 # GOOD: c.lui s3, 1048563
-0xD1 0x79 # GOOD: c.lui s3, 1048564
-0xD5 0x79 # GOOD: c.lui s3, 1048565
-0xD9 0x79 # GOOD: c.lui s3, 1048566
-0xDD 0x79 # GOOD: c.lui s3, 1048567
-0xE1 0x79 # GOOD: c.lui s3, 1048568
-0xE5 0x79 # GOOD: c.lui s3, 1048569
-0xE9 0x79 # GOOD: c.lui s3, 1048570
-0xED 0x79 # GOOD: c.lui s3, 1048571
-0xF1 0x79 # GOOD: c.lui s3, 1048572
-0xF5 0x79 # GOOD: c.lui s3, 1048573
-0xF9 0x79 # GOOD: c.lui s3, 1048574
-0xFD 0x79 # GOOD: c.lui s3, 1048575
-0x01 0x6A # BAD: invalid instruction encoding
-0x05 0x6A # GOOD: c.lui s4, 1
-0x09 0x6A # GOOD: c.lui s4, 2
-0x0D 0x6A # GOOD: c.lui s4, 3
-0x11 0x6A # GOOD: c.lui s4, 4
-0x15 0x6A # GOOD: c.lui s4, 5
-0x19 0x6A # GOOD: c.lui s4, 6
-0x1D 0x6A # GOOD: c.lui s4, 7
-0x21 0x6A # GOOD: c.lui s4, 8
-0x25 0x6A # GOOD: c.lui s4, 9
-0x29 0x6A # GOOD: c.lui s4, 10
-0x2D 0x6A # GOOD: c.lui s4, 11
-0x31 0x6A # GOOD: c.lui s4, 12
-0x35 0x6A # GOOD: c.lui s4, 13
-0x39 0x6A # GOOD: c.lui s4, 14
-0x3D 0x6A # GOOD: c.lui s4, 15
-0x41 0x6A # GOOD: c.lui s4, 16
-0x45 0x6A # GOOD: c.lui s4, 17
-0x49 0x6A # GOOD: c.lui s4, 18
-0x4D 0x6A # GOOD: c.lui s4, 19
-0x51 0x6A # GOOD: c.lui s4, 20
-0x55 0x6A # GOOD: c.lui s4, 21
-0x59 0x6A # GOOD: c.lui s4, 22
-0x5D 0x6A # GOOD: c.lui s4, 23
-0x61 0x6A # GOOD: c.lui s4, 24
-0x65 0x6A # GOOD: c.lui s4, 25
-0x69 0x6A # GOOD: c.lui s4, 26
-0x6D 0x6A # GOOD: c.lui s4, 27
-0x71 0x6A # GOOD: c.lui s4, 28
-0x75 0x6A # GOOD: c.lui s4, 29
-0x79 0x6A # GOOD: c.lui s4, 30
-0x7D 0x6A # GOOD: c.lui s4, 31
-0x01 0x7A # GOOD: c.lui s4, 1048544
-0x05 0x7A # GOOD: c.lui s4, 1048545
-0x09 0x7A # GOOD: c.lui s4, 1048546
-0x0D 0x7A # GOOD: c.lui s4, 1048547
-0x11 0x7A # GOOD: c.lui s4, 1048548
-0x15 0x7A # GOOD: c.lui s4, 1048549
-0x19 0x7A # GOOD: c.lui s4, 1048550
-0x1D 0x7A # GOOD: c.lui s4, 1048551
-0x21 0x7A # GOOD: c.lui s4, 1048552
-0x25 0x7A # GOOD: c.lui s4, 1048553
-0x29 0x7A # GOOD: c.lui s4, 1048554
-0x2D 0x7A # GOOD: c.lui s4, 1048555
-0x31 0x7A # GOOD: c.lui s4, 1048556
-0x35 0x7A # GOOD: c.lui s4, 1048557
-0x39 0x7A # GOOD: c.lui s4, 1048558
-0x3D 0x7A # GOOD: c.lui s4, 1048559
-0x41 0x7A # GOOD: c.lui s4, 1048560
-0x45 0x7A # GOOD: c.lui s4, 1048561
-0x49 0x7A # GOOD: c.lui s4, 1048562
-0x4D 0x7A # GOOD: c.lui s4, 1048563
-0x51 0x7A # GOOD: c.lui s4, 1048564
-0x55 0x7A # GOOD: c.lui s4, 1048565
-0x59 0x7A # GOOD: c.lui s4, 1048566
-0x5D 0x7A # GOOD: c.lui s4, 1048567
-0x61 0x7A # GOOD: c.lui s4, 1048568
-0x65 0x7A # GOOD: c.lui s4, 1048569
-0x69 0x7A # GOOD: c.lui s4, 1048570
-0x6D 0x7A # GOOD: c.lui s4, 1048571
-0x71 0x7A # GOOD: c.lui s4, 1048572
-0x75 0x7A # GOOD: c.lui s4, 1048573
-0x79 0x7A # GOOD: c.lui s4, 1048574
-0x7D 0x7A # GOOD: c.lui s4, 1048575
-0x81 0x6A # BAD: invalid instruction encoding
-0x85 0x6A # GOOD: c.lui s5, 1
-0x89 0x6A # GOOD: c.lui s5, 2
-0x8D 0x6A # GOOD: c.lui s5, 3
-0x91 0x6A # GOOD: c.lui s5, 4
-0x95 0x6A # GOOD: c.lui s5, 5
-0x99 0x6A # GOOD: c.lui s5, 6
-0x9D 0x6A # GOOD: c.lui s5, 7
-0xA1 0x6A # GOOD: c.lui s5, 8
-0xA5 0x6A # GOOD: c.lui s5, 9
-0xA9 0x6A # GOOD: c.lui s5, 10
-0xAD 0x6A # GOOD: c.lui s5, 11
-0xB1 0x6A # GOOD: c.lui s5, 12
-0xB5 0x6A # GOOD: c.lui s5, 13
-0xB9 0x6A # GOOD: c.lui s5, 14
-0xBD 0x6A # GOOD: c.lui s5, 15
-0xC1 0x6A # GOOD: c.lui s5, 16
-0xC5 0x6A # GOOD: c.lui s5, 17
-0xC9 0x6A # GOOD: c.lui s5, 18
-0xCD 0x6A # GOOD: c.lui s5, 19
-0xD1 0x6A # GOOD: c.lui s5, 20
-0xD5 0x6A # GOOD: c.lui s5, 21
-0xD9 0x6A # GOOD: c.lui s5, 22
-0xDD 0x6A # GOOD: c.lui s5, 23
-0xE1 0x6A # GOOD: c.lui s5, 24
-0xE5 0x6A # GOOD: c.lui s5, 25
-0xE9 0x6A # GOOD: c.lui s5, 26
-0xED 0x6A # GOOD: c.lui s5, 27
-0xF1 0x6A # GOOD: c.lui s5, 28
-0xF5 0x6A # GOOD: c.lui s5, 29
-0xF9 0x6A # GOOD: c.lui s5, 30
-0xFD 0x6A # GOOD: c.lui s5, 31
-0x81 0x7A # GOOD: c.lui s5, 1048544
-0x85 0x7A # GOOD: c.lui s5, 1048545
-0x89 0x7A # GOOD: c.lui s5, 1048546
-0x8D 0x7A # GOOD: c.lui s5, 1048547
-0x91 0x7A # GOOD: c.lui s5, 1048548
-0x95 0x7A # GOOD: c.lui s5, 1048549
-0x99 0x7A # GOOD: c.lui s5, 1048550
-0x9D 0x7A # GOOD: c.lui s5, 1048551
-0xA1 0x7A # GOOD: c.lui s5, 1048552
-0xA5 0x7A # GOOD: c.lui s5, 1048553
-0xA9 0x7A # GOOD: c.lui s5, 1048554
-0xAD 0x7A # GOOD: c.lui s5, 1048555
-0xB1 0x7A # GOOD: c.lui s5, 1048556
-0xB5 0x7A # GOOD: c.lui s5, 1048557
-0xB9 0x7A # GOOD: c.lui s5, 1048558
-0xBD 0x7A # GOOD: c.lui s5, 1048559
-0xC1 0x7A # GOOD: c.lui s5, 1048560
-0xC5 0x7A # GOOD: c.lui s5, 1048561
-0xC9 0x7A # GOOD: c.lui s5, 1048562
-0xCD 0x7A # GOOD: c.lui s5, 1048563
-0xD1 0x7A # GOOD: c.lui s5, 1048564
-0xD5 0x7A # GOOD: c.lui s5, 1048565
-0xD9 0x7A # GOOD: c.lui s5, 1048566
-0xDD 0x7A # GOOD: c.lui s5, 1048567
-0xE1 0x7A # GOOD: c.lui s5, 1048568
-0xE5 0x7A # GOOD: c.lui s5, 1048569
-0xE9 0x7A # GOOD: c.lui s5, 1048570
-0xED 0x7A # GOOD: c.lui s5, 1048571
-0xF1 0x7A # GOOD: c.lui s5, 1048572
-0xF5 0x7A # GOOD: c.lui s5, 1048573
-0xF9 0x7A # GOOD: c.lui s5, 1048574
-0xFD 0x7A # GOOD: c.lui s5, 1048575
-0x01 0x6B # BAD: invalid instruction encoding
-0x05 0x6B # GOOD: c.lui s6, 1
-0x09 0x6B # GOOD: c.lui s6, 2
-0x0D 0x6B # GOOD: c.lui s6, 3
-0x11 0x6B # GOOD: c.lui s6, 4
-0x15 0x6B # GOOD: c.lui s6, 5
-0x19 0x6B # GOOD: c.lui s6, 6
-0x1D 0x6B # GOOD: c.lui s6, 7
-0x21 0x6B # GOOD: c.lui s6, 8
-0x25 0x6B # GOOD: c.lui s6, 9
-0x29 0x6B # GOOD: c.lui s6, 10
-0x2D 0x6B # GOOD: c.lui s6, 11
-0x31 0x6B # GOOD: c.lui s6, 12
-0x35 0x6B # GOOD: c.lui s6, 13
-0x39 0x6B # GOOD: c.lui s6, 14
-0x3D 0x6B # GOOD: c.lui s6, 15
-0x41 0x6B # GOOD: c.lui s6, 16
-0x45 0x6B # GOOD: c.lui s6, 17
-0x49 0x6B # GOOD: c.lui s6, 18
-0x4D 0x6B # GOOD: c.lui s6, 19
-0x51 0x6B # GOOD: c.lui s6, 20
-0x55 0x6B # GOOD: c.lui s6, 21
-0x59 0x6B # GOOD: c.lui s6, 22
-0x5D 0x6B # GOOD: c.lui s6, 23
-0x61 0x6B # GOOD: c.lui s6, 24
-0x65 0x6B # GOOD: c.lui s6, 25
-0x69 0x6B # GOOD: c.lui s6, 26
-0x6D 0x6B # GOOD: c.lui s6, 27
-0x71 0x6B # GOOD: c.lui s6, 28
-0x75 0x6B # GOOD: c.lui s6, 29
-0x79 0x6B # GOOD: c.lui s6, 30
-0x7D 0x6B # GOOD: c.lui s6, 31
-0x01 0x7B # GOOD: c.lui s6, 1048544
-0x05 0x7B # GOOD: c.lui s6, 1048545
-0x09 0x7B # GOOD: c.lui s6, 1048546
-0x0D 0x7B # GOOD: c.lui s6, 1048547
-0x11 0x7B # GOOD: c.lui s6, 1048548
-0x15 0x7B # GOOD: c.lui s6, 1048549
-0x19 0x7B # GOOD: c.lui s6, 1048550
-0x1D 0x7B # GOOD: c.lui s6, 1048551
-0x21 0x7B # GOOD: c.lui s6, 1048552
-0x25 0x7B # GOOD: c.lui s6, 1048553
-0x29 0x7B # GOOD: c.lui s6, 1048554
-0x2D 0x7B # GOOD: c.lui s6, 1048555
-0x31 0x7B # GOOD: c.lui s6, 1048556
-0x35 0x7B # GOOD: c.lui s6, 1048557
-0x39 0x7B # GOOD: c.lui s6, 1048558
-0x3D 0x7B # GOOD: c.lui s6, 1048559
-0x41 0x7B # GOOD: c.lui s6, 1048560
-0x45 0x7B # GOOD: c.lui s6, 1048561
-0x49 0x7B # GOOD: c.lui s6, 1048562
-0x4D 0x7B # GOOD: c.lui s6, 1048563
-0x51 0x7B # GOOD: c.lui s6, 1048564
-0x55 0x7B # GOOD: c.lui s6, 1048565
-0x59 0x7B # GOOD: c.lui s6, 1048566
-0x5D 0x7B # GOOD: c.lui s6, 1048567
-0x61 0x7B # GOOD: c.lui s6, 1048568
-0x65 0x7B # GOOD: c.lui s6, 1048569
-0x69 0x7B # GOOD: c.lui s6, 1048570
-0x6D 0x7B # GOOD: c.lui s6, 1048571
-0x71 0x7B # GOOD: c.lui s6, 1048572
-0x75 0x7B # GOOD: c.lui s6, 1048573
-0x79 0x7B # GOOD: c.lui s6, 1048574
-0x7D 0x7B # GOOD: c.lui s6, 1048575
-0x81 0x6B # BAD: invalid instruction encoding
-0x85 0x6B # GOOD: c.lui s7, 1
-0x89 0x6B # GOOD: c.lui s7, 2
-0x8D 0x6B # GOOD: c.lui s7, 3
-0x91 0x6B # GOOD: c.lui s7, 4
-0x95 0x6B # GOOD: c.lui s7, 5
-0x99 0x6B # GOOD: c.lui s7, 6
-0x9D 0x6B # GOOD: c.lui s7, 7
-0xA1 0x6B # GOOD: c.lui s7, 8
-0xA5 0x6B # GOOD: c.lui s7, 9
-0xA9 0x6B # GOOD: c.lui s7, 10
-0xAD 0x6B # GOOD: c.lui s7, 11
-0xB1 0x6B # GOOD: c.lui s7, 12
-0xB5 0x6B # GOOD: c.lui s7, 13
-0xB9 0x6B # GOOD: c.lui s7, 14
-0xBD 0x6B # GOOD: c.lui s7, 15
-0xC1 0x6B # GOOD: c.lui s7, 16
-0xC5 0x6B # GOOD: c.lui s7, 17
-0xC9 0x6B # GOOD: c.lui s7, 18
-0xCD 0x6B # GOOD: c.lui s7, 19
-0xD1 0x6B # GOOD: c.lui s7, 20
-0xD5 0x6B # GOOD: c.lui s7, 21
-0xD9 0x6B # GOOD: c.lui s7, 22
-0xDD 0x6B # GOOD: c.lui s7, 23
-0xE1 0x6B # GOOD: c.lui s7, 24
-0xE5 0x6B # GOOD: c.lui s7, 25
-0xE9 0x6B # GOOD: c.lui s7, 26
-0xED 0x6B # GOOD: c.lui s7, 27
-0xF1 0x6B # GOOD: c.lui s7, 28
-0xF5 0x6B # GOOD: c.lui s7, 29
-0xF9 0x6B # GOOD: c.lui s7, 30
-0xFD 0x6B # GOOD: c.lui s7, 31
-0x81 0x7B # GOOD: c.lui s7, 1048544
-0x85 0x7B # GOOD: c.lui s7, 1048545
-0x89 0x7B # GOOD: c.lui s7, 1048546
-0x8D 0x7B # GOOD: c.lui s7, 1048547
-0x91 0x7B # GOOD: c.lui s7, 1048548
-0x95 0x7B # GOOD: c.lui s7, 1048549
-0x99 0x7B # GOOD: c.lui s7, 1048550
-0x9D 0x7B # GOOD: c.lui s7, 1048551
-0xA1 0x7B # GOOD: c.lui s7, 1048552
-0xA5 0x7B # GOOD: c.lui s7, 1048553
-0xA9 0x7B # GOOD: c.lui s7, 1048554
-0xAD 0x7B # GOOD: c.lui s7, 1048555
-0xB1 0x7B # GOOD: c.lui s7, 1048556
-0xB5 0x7B # GOOD: c.lui s7, 1048557
-0xB9 0x7B # GOOD: c.lui s7, 1048558
-0xBD 0x7B # GOOD: c.lui s7, 1048559
-0xC1 0x7B # GOOD: c.lui s7, 1048560
-0xC5 0x7B # GOOD: c.lui s7, 1048561
-0xC9 0x7B # GOOD: c.lui s7, 1048562
-0xCD 0x7B # GOOD: c.lui s7, 1048563
-0xD1 0x7B # GOOD: c.lui s7, 1048564
-0xD5 0x7B # GOOD: c.lui s7, 1048565
-0xD9 0x7B # GOOD: c.lui s7, 1048566
-0xDD 0x7B # GOOD: c.lui s7, 1048567
-0xE1 0x7B # GOOD: c.lui s7, 1048568
-0xE5 0x7B # GOOD: c.lui s7, 1048569
-0xE9 0x7B # GOOD: c.lui s7, 1048570
-0xED 0x7B # GOOD: c.lui s7, 1048571
-0xF1 0x7B # GOOD: c.lui s7, 1048572
-0xF5 0x7B # GOOD: c.lui s7, 1048573
-0xF9 0x7B # GOOD: c.lui s7, 1048574
-0xFD 0x7B # GOOD: c.lui s7, 1048575
-0x01 0x6C # BAD: invalid instruction encoding
-0x05 0x6C # GOOD: c.lui s8, 1
-0x09 0x6C # GOOD: c.lui s8, 2
-0x0D 0x6C # GOOD: c.lui s8, 3
-0x11 0x6C # GOOD: c.lui s8, 4
-0x15 0x6C # GOOD: c.lui s8, 5
-0x19 0x6C # GOOD: c.lui s8, 6
-0x1D 0x6C # GOOD: c.lui s8, 7
-0x21 0x6C # GOOD: c.lui s8, 8
-0x25 0x6C # GOOD: c.lui s8, 9
-0x29 0x6C # GOOD: c.lui s8, 10
-0x2D 0x6C # GOOD: c.lui s8, 11
-0x31 0x6C # GOOD: c.lui s8, 12
-0x35 0x6C # GOOD: c.lui s8, 13
-0x39 0x6C # GOOD: c.lui s8, 14
-0x3D 0x6C # GOOD: c.lui s8, 15
-0x41 0x6C # GOOD: c.lui s8, 16
-0x45 0x6C # GOOD: c.lui s8, 17
-0x49 0x6C # GOOD: c.lui s8, 18
-0x4D 0x6C # GOOD: c.lui s8, 19
-0x51 0x6C # GOOD: c.lui s8, 20
-0x55 0x6C # GOOD: c.lui s8, 21
-0x59 0x6C # GOOD: c.lui s8, 22
-0x5D 0x6C # GOOD: c.lui s8, 23
-0x61 0x6C # GOOD: c.lui s8, 24
-0x65 0x6C # GOOD: c.lui s8, 25
-0x69 0x6C # GOOD: c.lui s8, 26
-0x6D 0x6C # GOOD: c.lui s8, 27
-0x71 0x6C # GOOD: c.lui s8, 28
-0x75 0x6C # GOOD: c.lui s8, 29
-0x79 0x6C # GOOD: c.lui s8, 30
-0x7D 0x6C # GOOD: c.lui s8, 31
-0x01 0x7C # GOOD: c.lui s8, 1048544
-0x05 0x7C # GOOD: c.lui s8, 1048545
-0x09 0x7C # GOOD: c.lui s8, 1048546
-0x0D 0x7C # GOOD: c.lui s8, 1048547
-0x11 0x7C # GOOD: c.lui s8, 1048548
-0x15 0x7C # GOOD: c.lui s8, 1048549
-0x19 0x7C # GOOD: c.lui s8, 1048550
-0x1D 0x7C # GOOD: c.lui s8, 1048551
-0x21 0x7C # GOOD: c.lui s8, 1048552
-0x25 0x7C # GOOD: c.lui s8, 1048553
-0x29 0x7C # GOOD: c.lui s8, 1048554
-0x2D 0x7C # GOOD: c.lui s8, 1048555
-0x31 0x7C # GOOD: c.lui s8, 1048556
-0x35 0x7C # GOOD: c.lui s8, 1048557
-0x39 0x7C # GOOD: c.lui s8, 1048558
-0x3D 0x7C # GOOD: c.lui s8, 1048559
-0x41 0x7C # GOOD: c.lui s8, 1048560
-0x45 0x7C # GOOD: c.lui s8, 1048561
-0x49 0x7C # GOOD: c.lui s8, 1048562
-0x4D 0x7C # GOOD: c.lui s8, 1048563
-0x51 0x7C # GOOD: c.lui s8, 1048564
-0x55 0x7C # GOOD: c.lui s8, 1048565
-0x59 0x7C # GOOD: c.lui s8, 1048566
-0x5D 0x7C # GOOD: c.lui s8, 1048567
-0x61 0x7C # GOOD: c.lui s8, 1048568
-0x65 0x7C # GOOD: c.lui s8, 1048569
-0x69 0x7C # GOOD: c.lui s8, 1048570
-0x6D 0x7C # GOOD: c.lui s8, 1048571
-0x71 0x7C # GOOD: c.lui s8, 1048572
-0x75 0x7C # GOOD: c.lui s8, 1048573
-0x79 0x7C # GOOD: c.lui s8, 1048574
-0x7D 0x7C # GOOD: c.lui s8, 1048575
-0x81 0x6C # BAD: invalid instruction encoding
-0x85 0x6C # GOOD: c.lui s9, 1
-0x89 0x6C # GOOD: c.lui s9, 2
-0x8D 0x6C # GOOD: c.lui s9, 3
-0x91 0x6C # GOOD: c.lui s9, 4
-0x95 0x6C # GOOD: c.lui s9, 5
-0x99 0x6C # GOOD: c.lui s9, 6
-0x9D 0x6C # GOOD: c.lui s9, 7
-0xA1 0x6C # GOOD: c.lui s9, 8
-0xA5 0x6C # GOOD: c.lui s9, 9
-0xA9 0x6C # GOOD: c.lui s9, 10
-0xAD 0x6C # GOOD: c.lui s9, 11
-0xB1 0x6C # GOOD: c.lui s9, 12
-0xB5 0x6C # GOOD: c.lui s9, 13
-0xB9 0x6C # GOOD: c.lui s9, 14
-0xBD 0x6C # GOOD: c.lui s9, 15
-0xC1 0x6C # GOOD: c.lui s9, 16
-0xC5 0x6C # GOOD: c.lui s9, 17
-0xC9 0x6C # GOOD: c.lui s9, 18
-0xCD 0x6C # GOOD: c.lui s9, 19
-0xD1 0x6C # GOOD: c.lui s9, 20
-0xD5 0x6C # GOOD: c.lui s9, 21
-0xD9 0x6C # GOOD: c.lui s9, 22
-0xDD 0x6C # GOOD: c.lui s9, 23
-0xE1 0x6C # GOOD: c.lui s9, 24
-0xE5 0x6C # GOOD: c.lui s9, 25
-0xE9 0x6C # GOOD: c.lui s9, 26
-0xED 0x6C # GOOD: c.lui s9, 27
-0xF1 0x6C # GOOD: c.lui s9, 28
-0xF5 0x6C # GOOD: c.lui s9, 29
-0xF9 0x6C # GOOD: c.lui s9, 30
-0xFD 0x6C # GOOD: c.lui s9, 31
-0x81 0x7C # GOOD: c.lui s9, 1048544
-0x85 0x7C # GOOD: c.lui s9, 1048545
-0x89 0x7C # GOOD: c.lui s9, 1048546
-0x8D 0x7C # GOOD: c.lui s9, 1048547
-0x91 0x7C # GOOD: c.lui s9, 1048548
-0x95 0x7C # GOOD: c.lui s9, 1048549
-0x99 0x7C # GOOD: c.lui s9, 1048550
-0x9D 0x7C # GOOD: c.lui s9, 1048551
-0xA1 0x7C # GOOD: c.lui s9, 1048552
-0xA5 0x7C # GOOD: c.lui s9, 1048553
-0xA9 0x7C # GOOD: c.lui s9, 1048554
-0xAD 0x7C # GOOD: c.lui s9, 1048555
-0xB1 0x7C # GOOD: c.lui s9, 1048556
-0xB5 0x7C # GOOD: c.lui s9, 1048557
-0xB9 0x7C # GOOD: c.lui s9, 1048558
-0xBD 0x7C # GOOD: c.lui s9, 1048559
-0xC1 0x7C # GOOD: c.lui s9, 1048560
-0xC5 0x7C # GOOD: c.lui s9, 1048561
-0xC9 0x7C # GOOD: c.lui s9, 1048562
-0xCD 0x7C # GOOD: c.lui s9, 1048563
-0xD1 0x7C # GOOD: c.lui s9, 1048564
-0xD5 0x7C # GOOD: c.lui s9, 1048565
-0xD9 0x7C # GOOD: c.lui s9, 1048566
-0xDD 0x7C # GOOD: c.lui s9, 1048567
-0xE1 0x7C # GOOD: c.lui s9, 1048568
-0xE5 0x7C # GOOD: c.lui s9, 1048569
-0xE9 0x7C # GOOD: c.lui s9, 1048570
-0xED 0x7C # GOOD: c.lui s9, 1048571
-0xF1 0x7C # GOOD: c.lui s9, 1048572
-0xF5 0x7C # GOOD: c.lui s9, 1048573
-0xF9 0x7C # GOOD: c.lui s9, 1048574
-0xFD 0x7C # GOOD: c.lui s9, 1048575
-0x01 0x6D # BAD: invalid instruction encoding
-0x05 0x6D # GOOD: c.lui s10, 1
-0x09 0x6D # GOOD: c.lui s10, 2
-0x0D 0x6D # GOOD: c.lui s10, 3
-0x11 0x6D # GOOD: c.lui s10, 4
-0x15 0x6D # GOOD: c.lui s10, 5
-0x19 0x6D # GOOD: c.lui s10, 6
-0x1D 0x6D # GOOD: c.lui s10, 7
-0x21 0x6D # GOOD: c.lui s10, 8
-0x25 0x6D # GOOD: c.lui s10, 9
-0x29 0x6D # GOOD: c.lui s10, 10
-0x2D 0x6D # GOOD: c.lui s10, 11
-0x31 0x6D # GOOD: c.lui s10, 12
-0x35 0x6D # GOOD: c.lui s10, 13
-0x39 0x6D # GOOD: c.lui s10, 14
-0x3D 0x6D # GOOD: c.lui s10, 15
-0x41 0x6D # GOOD: c.lui s10, 16
-0x45 0x6D # GOOD: c.lui s10, 17
-0x49 0x6D # GOOD: c.lui s10, 18
-0x4D 0x6D # GOOD: c.lui s10, 19
-0x51 0x6D # GOOD: c.lui s10, 20
-0x55 0x6D # GOOD: c.lui s10, 21
-0x59 0x6D # GOOD: c.lui s10, 22
-0x5D 0x6D # GOOD: c.lui s10, 23
-0x61 0x6D # GOOD: c.lui s10, 24
-0x65 0x6D # GOOD: c.lui s10, 25
-0x69 0x6D # GOOD: c.lui s10, 26
-0x6D 0x6D # GOOD: c.lui s10, 27
-0x71 0x6D # GOOD: c.lui s10, 28
-0x75 0x6D # GOOD: c.lui s10, 29
-0x79 0x6D # GOOD: c.lui s10, 30
-0x7D 0x6D # GOOD: c.lui s10, 31
-0x01 0x7D # GOOD: c.lui s10, 1048544
-0x05 0x7D # GOOD: c.lui s10, 1048545
-0x09 0x7D # GOOD: c.lui s10, 1048546
-0x0D 0x7D # GOOD: c.lui s10, 1048547
-0x11 0x7D # GOOD: c.lui s10, 1048548
-0x15 0x7D # GOOD: c.lui s10, 1048549
-0x19 0x7D # GOOD: c.lui s10, 1048550
-0x1D 0x7D # GOOD: c.lui s10, 1048551
-0x21 0x7D # GOOD: c.lui s10, 1048552
-0x25 0x7D # GOOD: c.lui s10, 1048553
-0x29 0x7D # GOOD: c.lui s10, 1048554
-0x2D 0x7D # GOOD: c.lui s10, 1048555
-0x31 0x7D # GOOD: c.lui s10, 1048556
-0x35 0x7D # GOOD: c.lui s10, 1048557
-0x39 0x7D # GOOD: c.lui s10, 1048558
-0x3D 0x7D # GOOD: c.lui s10, 1048559
-0x41 0x7D # GOOD: c.lui s10, 1048560
-0x45 0x7D # GOOD: c.lui s10, 1048561
-0x49 0x7D # GOOD: c.lui s10, 1048562
-0x4D 0x7D # GOOD: c.lui s10, 1048563
-0x51 0x7D # GOOD: c.lui s10, 1048564
-0x55 0x7D # GOOD: c.lui s10, 1048565
-0x59 0x7D # GOOD: c.lui s10, 1048566
-0x5D 0x7D # GOOD: c.lui s10, 1048567
-0x61 0x7D # GOOD: c.lui s10, 1048568
-0x65 0x7D # GOOD: c.lui s10, 1048569
-0x69 0x7D # GOOD: c.lui s10, 1048570
-0x6D 0x7D # GOOD: c.lui s10, 1048571
-0x71 0x7D # GOOD: c.lui s10, 1048572
-0x75 0x7D # GOOD: c.lui s10, 1048573
-0x79 0x7D # GOOD: c.lui s10, 1048574
-0x7D 0x7D # GOOD: c.lui s10, 1048575
-0x81 0x6D # BAD: invalid instruction encoding
-0x85 0x6D # GOOD: c.lui s11, 1
-0x89 0x6D # GOOD: c.lui s11, 2
-0x8D 0x6D # GOOD: c.lui s11, 3
-0x91 0x6D # GOOD: c.lui s11, 4
-0x95 0x6D # GOOD: c.lui s11, 5
-0x99 0x6D # GOOD: c.lui s11, 6
-0x9D 0x6D # GOOD: c.lui s11, 7
-0xA1 0x6D # GOOD: c.lui s11, 8
-0xA5 0x6D # GOOD: c.lui s11, 9
-0xA9 0x6D # GOOD: c.lui s11, 10
-0xAD 0x6D # GOOD: c.lui s11, 11
-0xB1 0x6D # GOOD: c.lui s11, 12
-0xB5 0x6D # GOOD: c.lui s11, 13
-0xB9 0x6D # GOOD: c.lui s11, 14
-0xBD 0x6D # GOOD: c.lui s11, 15
-0xC1 0x6D # GOOD: c.lui s11, 16
-0xC5 0x6D # GOOD: c.lui s11, 17
-0xC9 0x6D # GOOD: c.lui s11, 18
-0xCD 0x6D # GOOD: c.lui s11, 19
-0xD1 0x6D # GOOD: c.lui s11, 20
-0xD5 0x6D # GOOD: c.lui s11, 21
-0xD9 0x6D # GOOD: c.lui s11, 22
-0xDD 0x6D # GOOD: c.lui s11, 23
-0xE1 0x6D # GOOD: c.lui s11, 24
-0xE5 0x6D # GOOD: c.lui s11, 25
-0xE9 0x6D # GOOD: c.lui s11, 26
-0xED 0x6D # GOOD: c.lui s11, 27
-0xF1 0x6D # GOOD: c.lui s11, 28
-0xF5 0x6D # GOOD: c.lui s11, 29
-0xF9 0x6D # GOOD: c.lui s11, 30
-0xFD 0x6D # GOOD: c.lui s11, 31
-0x81 0x7D # GOOD: c.lui s11, 1048544
-0x85 0x7D # GOOD: c.lui s11, 1048545
-0x89 0x7D # GOOD: c.lui s11, 1048546
-0x8D 0x7D # GOOD: c.lui s11, 1048547
-0x91 0x7D # GOOD: c.lui s11, 1048548
-0x95 0x7D # GOOD: c.lui s11, 1048549
-0x99 0x7D # GOOD: c.lui s11, 1048550
-0x9D 0x7D # GOOD: c.lui s11, 1048551
-0xA1 0x7D # GOOD: c.lui s11, 1048552
-0xA5 0x7D # GOOD: c.lui s11, 1048553
-0xA9 0x7D # GOOD: c.lui s11, 1048554
-0xAD 0x7D # GOOD: c.lui s11, 1048555
-0xB1 0x7D # GOOD: c.lui s11, 1048556
-0xB5 0x7D # GOOD: c.lui s11, 1048557
-0xB9 0x7D # GOOD: c.lui s11, 1048558
-0xBD 0x7D # GOOD: c.lui s11, 1048559
-0xC1 0x7D # GOOD: c.lui s11, 1048560
-0xC5 0x7D # GOOD: c.lui s11, 1048561
-0xC9 0x7D # GOOD: c.lui s11, 1048562
-0xCD 0x7D # GOOD: c.lui s11, 1048563
-0xD1 0x7D # GOOD: c.lui s11, 1048564
-0xD5 0x7D # GOOD: c.lui s11, 1048565
-0xD9 0x7D # GOOD: c.lui s11, 1048566
-0xDD 0x7D # GOOD: c.lui s11, 1048567
-0xE1 0x7D # GOOD: c.lui s11, 1048568
-0xE5 0x7D # GOOD: c.lui s11, 1048569
-0xE9 0x7D # GOOD: c.lui s11, 1048570
-0xED 0x7D # GOOD: c.lui s11, 1048571
-0xF1 0x7D # GOOD: c.lui s11, 1048572
-0xF5 0x7D # GOOD: c.lui s11, 1048573
-0xF9 0x7D # GOOD: c.lui s11, 1048574
-0xFD 0x7D # GOOD: c.lui s11, 1048575
-0x01 0x6E # BAD: invalid instruction encoding
-0x05 0x6E # GOOD: c.lui t3, 1
-0x09 0x6E # GOOD: c.lui t3, 2
-0x0D 0x6E # GOOD: c.lui t3, 3
-0x11 0x6E # GOOD: c.lui t3, 4
-0x15 0x6E # GOOD: c.lui t3, 5
-0x19 0x6E # GOOD: c.lui t3, 6
-0x1D 0x6E # GOOD: c.lui t3, 7
-0x21 0x6E # GOOD: c.lui t3, 8
-0x25 0x6E # GOOD: c.lui t3, 9
-0x29 0x6E # GOOD: c.lui t3, 10
-0x2D 0x6E # GOOD: c.lui t3, 11
-0x31 0x6E # GOOD: c.lui t3, 12
-0x35 0x6E # GOOD: c.lui t3, 13
-0x39 0x6E # GOOD: c.lui t3, 14
-0x3D 0x6E # GOOD: c.lui t3, 15
-0x41 0x6E # GOOD: c.lui t3, 16
-0x45 0x6E # GOOD: c.lui t3, 17
-0x49 0x6E # GOOD: c.lui t3, 18
-0x4D 0x6E # GOOD: c.lui t3, 19
-0x51 0x6E # GOOD: c.lui t3, 20
-0x55 0x6E # GOOD: c.lui t3, 21
-0x59 0x6E # GOOD: c.lui t3, 22
-0x5D 0x6E # GOOD: c.lui t3, 23
-0x61 0x6E # GOOD: c.lui t3, 24
-0x65 0x6E # GOOD: c.lui t3, 25
-0x69 0x6E # GOOD: c.lui t3, 26
-0x6D 0x6E # GOOD: c.lui t3, 27
-0x71 0x6E # GOOD: c.lui t3, 28
-0x75 0x6E # GOOD: c.lui t3, 29
-0x79 0x6E # GOOD: c.lui t3, 30
-0x7D 0x6E # GOOD: c.lui t3, 31
-0x01 0x7E # GOOD: c.lui t3, 1048544
-0x05 0x7E # GOOD: c.lui t3, 1048545
-0x09 0x7E # GOOD: c.lui t3, 1048546
-0x0D 0x7E # GOOD: c.lui t3, 1048547
-0x11 0x7E # GOOD: c.lui t3, 1048548
-0x15 0x7E # GOOD: c.lui t3, 1048549
-0x19 0x7E # GOOD: c.lui t3, 1048550
-0x1D 0x7E # GOOD: c.lui t3, 1048551
-0x21 0x7E # GOOD: c.lui t3, 1048552
-0x25 0x7E # GOOD: c.lui t3, 1048553
-0x29 0x7E # GOOD: c.lui t3, 1048554
-0x2D 0x7E # GOOD: c.lui t3, 1048555
-0x31 0x7E # GOOD: c.lui t3, 1048556
-0x35 0x7E # GOOD: c.lui t3, 1048557
-0x39 0x7E # GOOD: c.lui t3, 1048558
-0x3D 0x7E # GOOD: c.lui t3, 1048559
-0x41 0x7E # GOOD: c.lui t3, 1048560
-0x45 0x7E # GOOD: c.lui t3, 1048561
-0x49 0x7E # GOOD: c.lui t3, 1048562
-0x4D 0x7E # GOOD: c.lui t3, 1048563
-0x51 0x7E # GOOD: c.lui t3, 1048564
-0x55 0x7E # GOOD: c.lui t3, 1048565
-0x59 0x7E # GOOD: c.lui t3, 1048566
-0x5D 0x7E # GOOD: c.lui t3, 1048567
-0x61 0x7E # GOOD: c.lui t3, 1048568
-0x65 0x7E # GOOD: c.lui t3, 1048569
-0x69 0x7E # GOOD: c.lui t3, 1048570
-0x6D 0x7E # GOOD: c.lui t3, 1048571
-0x71 0x7E # GOOD: c.lui t3, 1048572
-0x75 0x7E # GOOD: c.lui t3, 1048573
-0x79 0x7E # GOOD: c.lui t3, 1048574
-0x7D 0x7E # GOOD: c.lui t3, 1048575
-0x81 0x6E # BAD: invalid instruction encoding
-0x85 0x6E # GOOD: c.lui t4, 1
-0x89 0x6E # GOOD: c.lui t4, 2
-0x8D 0x6E # GOOD: c.lui t4, 3
-0x91 0x6E # GOOD: c.lui t4, 4
-0x95 0x6E # GOOD: c.lui t4, 5
-0x99 0x6E # GOOD: c.lui t4, 6
-0x9D 0x6E # GOOD: c.lui t4, 7
-0xA1 0x6E # GOOD: c.lui t4, 8
-0xA5 0x6E # GOOD: c.lui t4, 9
-0xA9 0x6E # GOOD: c.lui t4, 10
-0xAD 0x6E # GOOD: c.lui t4, 11
-0xB1 0x6E # GOOD: c.lui t4, 12
-0xB5 0x6E # GOOD: c.lui t4, 13
-0xB9 0x6E # GOOD: c.lui t4, 14
-0xBD 0x6E # GOOD: c.lui t4, 15
-0xC1 0x6E # GOOD: c.lui t4, 16
-0xC5 0x6E # GOOD: c.lui t4, 17
-0xC9 0x6E # GOOD: c.lui t4, 18
-0xCD 0x6E # GOOD: c.lui t4, 19
-0xD1 0x6E # GOOD: c.lui t4, 20
-0xD5 0x6E # GOOD: c.lui t4, 21
-0xD9 0x6E # GOOD: c.lui t4, 22
-0xDD 0x6E # GOOD: c.lui t4, 23
-0xE1 0x6E # GOOD: c.lui t4, 24
-0xE5 0x6E # GOOD: c.lui t4, 25
-0xE9 0x6E # GOOD: c.lui t4, 26
-0xED 0x6E # GOOD: c.lui t4, 27
-0xF1 0x6E # GOOD: c.lui t4, 28
-0xF5 0x6E # GOOD: c.lui t4, 29
-0xF9 0x6E # GOOD: c.lui t4, 30
-0xFD 0x6E # GOOD: c.lui t4, 31
-0x81 0x7E # GOOD: c.lui t4, 1048544
-0x85 0x7E # GOOD: c.lui t4, 1048545
-0x89 0x7E # GOOD: c.lui t4, 1048546
-0x8D 0x7E # GOOD: c.lui t4, 1048547
-0x91 0x7E # GOOD: c.lui t4, 1048548
-0x95 0x7E # GOOD: c.lui t4, 1048549
-0x99 0x7E # GOOD: c.lui t4, 1048550
-0x9D 0x7E # GOOD: c.lui t4, 1048551
-0xA1 0x7E # GOOD: c.lui t4, 1048552
-0xA5 0x7E # GOOD: c.lui t4, 1048553
-0xA9 0x7E # GOOD: c.lui t4, 1048554
-0xAD 0x7E # GOOD: c.lui t4, 1048555
-0xB1 0x7E # GOOD: c.lui t4, 1048556
-0xB5 0x7E # GOOD: c.lui t4, 1048557
-0xB9 0x7E # GOOD: c.lui t4, 1048558
-0xBD 0x7E # GOOD: c.lui t4, 1048559
-0xC1 0x7E # GOOD: c.lui t4, 1048560
-0xC5 0x7E # GOOD: c.lui t4, 1048561
-0xC9 0x7E # GOOD: c.lui t4, 1048562
-0xCD 0x7E # GOOD: c.lui t4, 1048563
-0xD1 0x7E # GOOD: c.lui t4, 1048564
-0xD5 0x7E # GOOD: c.lui t4, 1048565
-0xD9 0x7E # GOOD: c.lui t4, 1048566
-0xDD 0x7E # GOOD: c.lui t4, 1048567
-0xE1 0x7E # GOOD: c.lui t4, 1048568
-0xE5 0x7E # GOOD: c.lui t4, 1048569
-0xE9 0x7E # GOOD: c.lui t4, 1048570
-0xED 0x7E # GOOD: c.lui t4, 1048571
-0xF1 0x7E # GOOD: c.lui t4, 1048572
-0xF5 0x7E # GOOD: c.lui t4, 1048573
-0xF9 0x7E # GOOD: c.lui t4, 1048574
-0xFD 0x7E # GOOD: c.lui t4, 1048575
-0x01 0x6F # BAD: invalid instruction encoding
-0x05 0x6F # GOOD: c.lui t5, 1
-0x09 0x6F # GOOD: c.lui t5, 2
-0x0D 0x6F # GOOD: c.lui t5, 3
-0x11 0x6F # GOOD: c.lui t5, 4
-0x15 0x6F # GOOD: c.lui t5, 5
-0x19 0x6F # GOOD: c.lui t5, 6
-0x1D 0x6F # GOOD: c.lui t5, 7
-0x21 0x6F # GOOD: c.lui t5, 8
-0x25 0x6F # GOOD: c.lui t5, 9
-0x29 0x6F # GOOD: c.lui t5, 10
-0x2D 0x6F # GOOD: c.lui t5, 11
-0x31 0x6F # GOOD: c.lui t5, 12
-0x35 0x6F # GOOD: c.lui t5, 13
-0x39 0x6F # GOOD: c.lui t5, 14
-0x3D 0x6F # GOOD: c.lui t5, 15
-0x41 0x6F # GOOD: c.lui t5, 16
-0x45 0x6F # GOOD: c.lui t5, 17
-0x49 0x6F # GOOD: c.lui t5, 18
-0x4D 0x6F # GOOD: c.lui t5, 19
-0x51 0x6F # GOOD: c.lui t5, 20
-0x55 0x6F # GOOD: c.lui t5, 21
-0x59 0x6F # GOOD: c.lui t5, 22
-0x5D 0x6F # GOOD: c.lui t5, 23
-0x61 0x6F # GOOD: c.lui t5, 24
-0x65 0x6F # GOOD: c.lui t5, 25
-0x69 0x6F # GOOD: c.lui t5, 26
-0x6D 0x6F # GOOD: c.lui t5, 27
-0x71 0x6F # GOOD: c.lui t5, 28
-0x75 0x6F # GOOD: c.lui t5, 29
-0x79 0x6F # GOOD: c.lui t5, 30
-0x7D 0x6F # GOOD: c.lui t5, 31
-0x01 0x7F # GOOD: c.lui t5, 1048544
-0x05 0x7F # GOOD: c.lui t5, 1048545
-0x09 0x7F # GOOD: c.lui t5, 1048546
-0x0D 0x7F # GOOD: c.lui t5, 1048547
-0x11 0x7F # GOOD: c.lui t5, 1048548
-0x15 0x7F # GOOD: c.lui t5, 1048549
-0x19 0x7F # GOOD: c.lui t5, 1048550
-0x1D 0x7F # GOOD: c.lui t5, 1048551
-0x21 0x7F # GOOD: c.lui t5, 1048552
-0x25 0x7F # GOOD: c.lui t5, 1048553
-0x29 0x7F # GOOD: c.lui t5, 1048554
-0x2D 0x7F # GOOD: c.lui t5, 1048555
-0x31 0x7F # GOOD: c.lui t5, 1048556
-0x35 0x7F # GOOD: c.lui t5, 1048557
-0x39 0x7F # GOOD: c.lui t5, 1048558
-0x3D 0x7F # GOOD: c.lui t5, 1048559
-0x41 0x7F # GOOD: c.lui t5, 1048560
-0x45 0x7F # GOOD: c.lui t5, 1048561
-0x49 0x7F # GOOD: c.lui t5, 1048562
-0x4D 0x7F # GOOD: c.lui t5, 1048563
-0x51 0x7F # GOOD: c.lui t5, 1048564
-0x55 0x7F # GOOD: c.lui t5, 1048565
-0x59 0x7F # GOOD: c.lui t5, 1048566
-0x5D 0x7F # GOOD: c.lui t5, 1048567
-0x61 0x7F # GOOD: c.lui t5, 1048568
-0x65 0x7F # GOOD: c.lui t5, 1048569
-0x69 0x7F # GOOD: c.lui t5, 1048570
-0x6D 0x7F # GOOD: c.lui t5, 1048571
-0x71 0x7F # GOOD: c.lui t5, 1048572
-0x75 0x7F # GOOD: c.lui t5, 1048573
-0x79 0x7F # GOOD: c.lui t5, 1048574
-0x7D 0x7F # GOOD: c.lui t5, 1048575
-0x81 0x6F # BAD: invalid instruction encoding
-0x85 0x6F # GOOD: c.lui t6, 1
-0x89 0x6F # GOOD: c.lui t6, 2
-0x8D 0x6F # GOOD: c.lui t6, 3
-0x91 0x6F # GOOD: c.lui t6, 4
-0x95 0x6F # GOOD: c.lui t6, 5
-0x99 0x6F # GOOD: c.lui t6, 6
-0x9D 0x6F # GOOD: c.lui t6, 7
-0xA1 0x6F # GOOD: c.lui t6, 8
-0xA5 0x6F # GOOD: c.lui t6, 9
-0xA9 0x6F # GOOD: c.lui t6, 10
-0xAD 0x6F # GOOD: c.lui t6, 11
-0xB1 0x6F # GOOD: c.lui t6, 12
-0xB5 0x6F # GOOD: c.lui t6, 13
-0xB9 0x6F # GOOD: c.lui t6, 14
-0xBD 0x6F # GOOD: c.lui t6, 15
-0xC1 0x6F # GOOD: c.lui t6, 16
-0xC5 0x6F # GOOD: c.lui t6, 17
-0xC9 0x6F # GOOD: c.lui t6, 18
-0xCD 0x6F # GOOD: c.lui t6, 19
-0xD1 0x6F # GOOD: c.lui t6, 20
-0xD5 0x6F # GOOD: c.lui t6, 21
-0xD9 0x6F # GOOD: c.lui t6, 22
-0xDD 0x6F # GOOD: c.lui t6, 23
-0xE1 0x6F # GOOD: c.lui t6, 24
-0xE5 0x6F # GOOD: c.lui t6, 25
-0xE9 0x6F # GOOD: c.lui t6, 26
-0xED 0x6F # GOOD: c.lui t6, 27
-0xF1 0x6F # GOOD: c.lui t6, 28
-0xF5 0x6F # GOOD: c.lui t6, 29
-0xF9 0x6F # GOOD: c.lui t6, 30
-0xFD 0x6F # GOOD: c.lui t6, 31
-0x81 0x7F # GOOD: c.lui t6, 1048544
-0x85 0x7F # GOOD: c.lui t6, 1048545
-0x89 0x7F # GOOD: c.lui t6, 1048546
-0x8D 0x7F # GOOD: c.lui t6, 1048547
-0x91 0x7F # GOOD: c.lui t6, 1048548
-0x95 0x7F # GOOD: c.lui t6, 1048549
-0x99 0x7F # GOOD: c.lui t6, 1048550
-0x9D 0x7F # GOOD: c.lui t6, 1048551
-0xA1 0x7F # GOOD: c.lui t6, 1048552
-0xA5 0x7F # GOOD: c.lui t6, 1048553
-0xA9 0x7F # GOOD: c.lui t6, 1048554
-0xAD 0x7F # GOOD: c.lui t6, 1048555
-0xB1 0x7F # GOOD: c.lui t6, 1048556
-0xB5 0x7F # GOOD: c.lui t6, 1048557
-0xB9 0x7F # GOOD: c.lui t6, 1048558
-0xBD 0x7F # GOOD: c.lui t6, 1048559
-0xC1 0x7F # GOOD: c.lui t6, 1048560
-0xC5 0x7F # GOOD: c.lui t6, 1048561
-0xC9 0x7F # GOOD: c.lui t6, 1048562
-0xCD 0x7F # GOOD: c.lui t6, 1048563
-0xD1 0x7F # GOOD: c.lui t6, 1048564
-0xD5 0x7F # GOOD: c.lui t6, 1048565
-0xD9 0x7F # GOOD: c.lui t6, 1048566
-0xDD 0x7F # GOOD: c.lui t6, 1048567
-0xE1 0x7F # GOOD: c.lui t6, 1048568
-0xE5 0x7F # GOOD: c.lui t6, 1048569
-0xE9 0x7F # GOOD: c.lui t6, 1048570
-0xED 0x7F # GOOD: c.lui t6, 1048571
-0xF1 0x7F # GOOD: c.lui t6, 1048572
-0xF5 0x7F # GOOD: c.lui t6, 1048573
-0xF9 0x7F # GOOD: c.lui t6, 1048574
-0xFD 0x7F # GOOD: c.lui t6, 1048575
+# BAD: invalid instruction encoding
+# NOHINTS: invalid instruction encoding
+0x01 0x60
+
+# GOOD: c.lui zero, 1
+# NOHINTS: invalid instruction encoding
+0x05 0x60
+
+# GOOD: c.lui zero, 2
+# NOHINTS: invalid instruction encoding
+0x09 0x60
+
+# GOOD: c.lui zero, 3
+# NOHINTS: invalid instruction encoding
+0x0D 0x60
+
+# GOOD: c.lui zero, 4
+# NOHINTS: invalid instruction encoding
+0x11 0x60
+
+# GOOD: c.lui zero, 5
+# NOHINTS: invalid instruction encoding
+0x15 0x60
+
+# GOOD: c.lui zero, 6
+# NOHINTS: invalid instruction encoding
+0x19 0x60
+
+# GOOD: c.lui zero, 7
+# NOHINTS: invalid instruction encoding
+0x1D 0x60
+
+# GOOD: c.lui zero, 8
+# NOHINTS: invalid instruction encoding
+0x21 0x60
+
+# GOOD: c.lui zero, 9
+# NOHINTS: invalid instruction encoding
+0x25 0x60
+
+# GOOD: c.lui zero, 10
+# NOHINTS: invalid instruction encoding
+0x29 0x60
+
+# GOOD: c.lui zero, 11
+# NOHINTS: invalid instruction encoding
+0x2D 0x60
+
+# GOOD: c.lui zero, 12
+# NOHINTS: invalid instruction encoding
+0x31 0x60
+
+# GOOD: c.lui zero, 13
+# NOHINTS: invalid instruction encoding
+0x35 0x60
+
+# GOOD: c.lui zero, 14
+# NOHINTS: invalid instruction encoding
+0x39 0x60
+
+# GOOD: c.lui zero, 15
+# NOHINTS: invalid instruction encoding
+0x3D 0x60
+
+# GOOD: c.lui zero, 16
+# NOHINTS: invalid instruction encoding
+0x41 0x60
+
+# GOOD: c.lui zero, 17
+# NOHINTS: invalid instruction encoding
+0x45 0x60
+
+# GOOD: c.lui zero, 18
+# NOHINTS: invalid instruction encoding
+0x49 0x60
+
+# GOOD: c.lui zero, 19
+# NOHINTS: invalid instruction encoding
+0x4D 0x60
+
+# GOOD: c.lui zero, 20
+# NOHINTS: invalid instruction encoding
+0x51 0x60
+
+# GOOD: c.lui zero, 21
+# NOHINTS: invalid instruction encoding
+0x55 0x60
+
+# GOOD: c.lui zero, 22
+# NOHINTS: invalid instruction encoding
+0x59 0x60
+
+# GOOD: c.lui zero, 23
+# NOHINTS: invalid instruction encoding
+0x5D 0x60
+
+# GOOD: c.lui zero, 24
+# NOHINTS: invalid instruction encoding
+0x61 0x60
+
+# GOOD: c.lui zero, 25
+# NOHINTS: invalid instruction encoding
+0x65 0x60
+
+# GOOD: c.lui zero, 26
+# NOHINTS: invalid instruction encoding
+0x69 0x60
+
+# GOOD: c.lui zero, 27
+# NOHINTS: invalid instruction encoding
+0x6D 0x60
+
+# GOOD: c.lui zero, 28
+# NOHINTS: invalid instruction encoding
+0x71 0x60
+
+# GOOD: c.lui zero, 29
+# NOHINTS: invalid instruction encoding
+0x75 0x60
+
+# GOOD: c.lui zero, 30
+# NOHINTS: invalid instruction encoding
+0x79 0x60
+
+# GOOD: c.lui zero, 31
+# NOHINTS: invalid instruction encoding
+0x7D 0x60
+
+# GOOD: c.lui zero, 1048544
+# NOHINTS: invalid instruction encoding
+0x01 0x70
+
+# GOOD: c.lui zero, 1048545
+# NOHINTS: invalid instruction encoding
+0x05 0x70
+
+# GOOD: c.lui zero, 1048546
+# NOHINTS: invalid instruction encoding
+0x09 0x70
+
+# GOOD: c.lui zero, 1048547
+# NOHINTS: invalid instruction encoding
+0x0D 0x70
+
+# GOOD: c.lui zero, 1048548
+# NOHINTS: invalid instruction encoding
+0x11 0x70
+
+# GOOD: c.lui zero, 1048549
+# NOHINTS: invalid instruction encoding
+0x15 0x70
+
+# GOOD: c.lui zero, 1048550
+# NOHINTS: invalid instruction encoding
+0x19 0x70
+
+# GOOD: c.lui zero, 1048551
+# NOHINTS: invalid instruction encoding
+0x1D 0x70
+
+# GOOD: c.lui zero, 1048552
+# NOHINTS: invalid instruction encoding
+0x21 0x70
+
+# GOOD: c.lui zero, 1048553
+# NOHINTS: invalid instruction encoding
+0x25 0x70
+
+# GOOD: c.lui zero, 1048554
+# NOHINTS: invalid instruction encoding
+0x29 0x70
+
+# GOOD: c.lui zero, 1048555
+# NOHINTS: invalid instruction encoding
+0x2D 0x70
+
+# GOOD: c.lui zero, 1048556
+# NOHINTS: invalid instruction encoding
+0x31 0x70
+
+# GOOD: c.lui zero, 1048557
+# NOHINTS: invalid instruction encoding
+0x35 0x70
+
+# GOOD: c.lui zero, 1048558
+# NOHINTS: invalid instruction encoding
+0x39 0x70
+
+# GOOD: c.lui zero, 1048559
+# NOHINTS: invalid instruction encoding
+0x3D 0x70
+
+# GOOD: c.lui zero, 1048560
+# NOHINTS: invalid instruction encoding
+0x41 0x70
+
+# GOOD: c.lui zero, 1048561
+# NOHINTS: invalid instruction encoding
+0x45 0x70
+
+# GOOD: c.lui zero, 1048562
+# NOHINTS: invalid instruction encoding
+0x49 0x70
+
+# GOOD: c.lui zero, 1048563
+# NOHINTS: invalid instruction encoding
+0x4D 0x70
+
+# GOOD: c.lui zero, 1048564
+# NOHINTS: invalid instruction encoding
+0x51 0x70
+
+# GOOD: c.lui zero, 1048565
+# NOHINTS: invalid instruction encoding
+0x55 0x70
+
+# GOOD: c.lui zero, 1048566
+# NOHINTS: invalid instruction encoding
+0x59 0x70
+
+# GOOD: c.lui zero, 1048567
+# NOHINTS: invalid instruction encoding
+0x5D 0x70
+
+# GOOD: c.lui zero, 1048568
+# NOHINTS: invalid instruction encoding
+0x61 0x70
+
+# GOOD: c.lui zero, 1048569
+# NOHINTS: invalid instruction encoding
+0x65 0x70
+
+# GOOD: c.lui zero, 1048570
+# NOHINTS: invalid instruction encoding
+0x69 0x70
+
+# GOOD: c.lui zero, 1048571
+# NOHINTS: invalid instruction encoding
+0x6D 0x70
+
+# GOOD: c.lui zero, 1048572
+# NOHINTS: invalid instruction encoding
+0x71 0x70
+
+# GOOD: c.lui zero, 1048573
+# NOHINTS: invalid instruction encoding
+0x75 0x70
+
+# GOOD: c.lui zero, 1048574
+# NOHINTS: invalid instruction encoding
+0x79 0x70
+
+# GOOD: c.lui zero, 1048575
+# NOHINTS: invalid instruction encoding
+0x7D 0x70
+
+# BAD: invalid instruction encoding
+# MOP: c.mop.1
+0x81 0x60
+
+# GOOD: c.lui ra, 1
+0x85 0x60
+
+# GOOD: c.lui ra, 2
+0x89 0x60
+
+# GOOD: c.lui ra, 3
+0x8D 0x60
+
+# GOOD: c.lui ra, 4
+0x91 0x60
+
+# GOOD: c.lui ra, 5
+0x95 0x60
+
+# GOOD: c.lui ra, 6
+0x99 0x60
+
+# GOOD: c.lui ra, 7
+0x9D 0x60
+
+# GOOD: c.lui ra, 8
+0xA1 0x60
+
+# GOOD: c.lui ra, 9
+0xA5 0x60
+
+# GOOD: c.lui ra, 10
+0xA9 0x60
+
+# GOOD: c.lui ra, 11
+0xAD 0x60
+
+# GOOD: c.lui ra, 12
+0xB1 0x60
+
+# GOOD: c.lui ra, 13
+0xB5 0x60
+
+# GOOD: c.lui ra, 14
+0xB9 0x60
+
+# GOOD: c.lui ra, 15
+0xBD 0x60
+
+# GOOD: c.lui ra, 16
+0xC1 0x60
+
+# GOOD: c.lui ra, 17
+0xC5 0x60
+
+# GOOD: c.lui ra, 18
+0xC9 0x60
+
+# GOOD: c.lui ra, 19
+0xCD 0x60
+
+# GOOD: c.lui ra, 20
+0xD1 0x60
+
+# GOOD: c.lui ra, 21
+0xD5 0x60
+
+# GOOD: c.lui ra, 22
+0xD9 0x60
+
+# GOOD: c.lui ra, 23
+0xDD 0x60
+
+# GOOD: c.lui ra, 24
+0xE1 0x60
+
+# GOOD: c.lui ra, 25
+0xE5 0x60
+
+# GOOD: c.lui ra, 26
+0xE9 0x60
+
+# GOOD: c.lui ra, 27
+0xED 0x60
+
+# GOOD: c.lui ra, 28
+0xF1 0x60
+
+# GOOD: c.lui ra, 29
+0xF5 0x60
+
+# GOOD: c.lui ra, 30
+0xF9 0x60
+
+# GOOD: c.lui ra, 31
+0xFD 0x60
+
+# GOOD: c.lui ra, 1048544
+0x81 0x70
+
+# GOOD: c.lui ra, 1048545
+0x85 0x70
+
+# GOOD: c.lui ra, 1048546
+0x89 0x70
+
+# GOOD: c.lui ra, 1048547
+0x8D 0x70
+
+# GOOD: c.lui ra, 1048548
+0x91 0x70
+
+# GOOD: c.lui ra, 1048549
+0x95 0x70
+
+# GOOD: c.lui ra, 1048550
+0x99 0x70
+
+# GOOD: c.lui ra, 1048551
+0x9D 0x70
+
+# GOOD: c.lui ra, 1048552
+0xA1 0x70
+
+# GOOD: c.lui ra, 1048553
+0xA5 0x70
+
+# GOOD: c.lui ra, 1048554
+0xA9 0x70
+
+# GOOD: c.lui ra, 1048555
+0xAD 0x70
+
+# GOOD: c.lui ra, 1048556
+0xB1 0x70
+
+# GOOD: c.lui ra, 1048557
+0xB5 0x70
+
+# GOOD: c.lui ra, 1048558
+0xB9 0x70
+
+# GOOD: c.lui ra, 1048559
+0xBD 0x70
+
+# GOOD: c.lui ra, 1048560
+0xC1 0x70
+
+# GOOD: c.lui ra, 1048561
+0xC5 0x70
+
+# GOOD: c.lui ra, 1048562
+0xC9 0x70
+
+# GOOD: c.lui ra, 1048563
+0xCD 0x70
+
+# GOOD: c.lui ra, 1048564
+0xD1 0x70
+
+# GOOD: c.lui ra, 1048565
+0xD5 0x70
+
+# GOOD: c.lui ra, 1048566
+0xD9 0x70
+
+# GOOD: c.lui ra, 1048567
+0xDD 0x70
+
+# GOOD: c.lui ra, 1048568
+0xE1 0x70
+
+# GOOD: c.lui ra, 1048569
+0xE5 0x70
+
+# GOOD: c.lui ra, 1048570
+0xE9 0x70
+
+# GOOD: c.lui ra, 1048571
+0xED 0x70
+
+# GOOD: c.lui ra, 1048572
+0xF1 0x70
+
+# GOOD: c.lui ra, 1048573
+0xF5 0x70
+
+# GOOD: c.lui ra, 1048574
+0xF9 0x70
+
+# GOOD: c.lui ra, 1048575
+0xFD 0x70
+
+# BAD: invalid instruction encoding
+# MOP: c.mop.3
+0x81 0x61
+
+# GOOD: c.lui gp, 1
+0x85 0x61
+
+# GOOD: c.lui gp, 2
+0x89 0x61
+
+# GOOD: c.lui gp, 3
+0x8D 0x61
+
+# GOOD: c.lui gp, 4
+0x91 0x61
+
+# GOOD: c.lui gp, 5
+0x95 0x61
+
+# GOOD: c.lui gp, 6
+0x99 0x61
+
+# GOOD: c.lui gp, 7
c.lui gp, 7 +0x9D 0x61 + +# GOOD: c.lui gp, 8 +0xA1 0x61 + +# GOOD: c.lui gp, 9 +0xA5 0x61 + +# GOOD: c.lui gp, 10 +0xA9 0x61 + +# GOOD: c.lui gp, 11 +0xAD 0x61 + +# GOOD: c.lui gp, 12 +0xB1 0x61 + +# GOOD: c.lui gp, 13 +0xB5 0x61 + +# GOOD: c.lui gp, 14 +0xB9 0x61 + +# GOOD: c.lui gp, 15 +0xBD 0x61 + +# GOOD: c.lui gp, 16 +0xC1 0x61 + +# GOOD: c.lui gp, 17 +0xC5 0x61 + +# GOOD: c.lui gp, 18 +0xC9 0x61 + +# GOOD: c.lui gp, 19 +0xCD 0x61 + +# GOOD: c.lui gp, 20 +0xD1 0x61 + +# GOOD: c.lui gp, 21 +0xD5 0x61 + +# GOOD: c.lui gp, 22 +0xD9 0x61 + +# GOOD: c.lui gp, 23 +0xDD 0x61 + +# GOOD: c.lui gp, 24 +0xE1 0x61 + +# GOOD: c.lui gp, 25 +0xE5 0x61 + +# GOOD: c.lui gp, 26 +0xE9 0x61 + +# GOOD: c.lui gp, 27 +0xED 0x61 + +# GOOD: c.lui gp, 28 +0xF1 0x61 + +# GOOD: c.lui gp, 29 +0xF5 0x61 + +# GOOD: c.lui gp, 30 +0xF9 0x61 + +# GOOD: c.lui gp, 31 +0xFD 0x61 + +# GOOD: c.lui gp, 1048544 +0x81 0x71 + +# GOOD: c.lui gp, 1048545 +0x85 0x71 + +# GOOD: c.lui gp, 1048546 +0x89 0x71 + +# GOOD: c.lui gp, 1048547 +0x8D 0x71 + +# GOOD: c.lui gp, 1048548 +0x91 0x71 + +# GOOD: c.lui gp, 1048549 +0x95 0x71 + +# GOOD: c.lui gp, 1048550 +0x99 0x71 + +# GOOD: c.lui gp, 1048551 +0x9D 0x71 + +# GOOD: c.lui gp, 1048552 +0xA1 0x71 + +# GOOD: c.lui gp, 1048553 +0xA5 0x71 + +# GOOD: c.lui gp, 1048554 +0xA9 0x71 + +# GOOD: c.lui gp, 1048555 +0xAD 0x71 + +# GOOD: c.lui gp, 1048556 +0xB1 0x71 + +# GOOD: c.lui gp, 1048557 +0xB5 0x71 + +# GOOD: c.lui gp, 1048558 +0xB9 0x71 + +# GOOD: c.lui gp, 1048559 +0xBD 0x71 + +# GOOD: c.lui gp, 1048560 +0xC1 0x71 + +# GOOD: c.lui gp, 1048561 +0xC5 0x71 + +# GOOD: c.lui gp, 1048562 +0xC9 0x71 + +# GOOD: c.lui gp, 1048563 +0xCD 0x71 + +# GOOD: c.lui gp, 1048564 +0xD1 0x71 + +# GOOD: c.lui gp, 1048565 +0xD5 0x71 + +# GOOD: c.lui gp, 1048566 +0xD9 0x71 + +# GOOD: c.lui gp, 1048567 +0xDD 0x71 + +# GOOD: c.lui gp, 1048568 +0xE1 0x71 + +# GOOD: c.lui gp, 1048569 +0xE5 0x71 + +# GOOD: c.lui gp, 1048570 +0xE9 0x71 + +# GOOD: c.lui gp, 1048571 +0xED 0x71 + +# GOOD: c.lui gp, 1048572 +0xF1 0x71 + +# GOOD: c.lui gp, 1048573 +0xF5 0x71 + +# GOOD: c.lui gp, 1048574 +0xF9 0x71 + +# GOOD: c.lui gp, 1048575 +0xFD 0x71 + +# BAD: invalid instruction encoding +0x01 0x62 + +# GOOD: c.lui tp, 1 +0x05 0x62 + +# GOOD: c.lui tp, 2 +0x09 0x62 + +# GOOD: c.lui tp, 3 +0x0D 0x62 + +# GOOD: c.lui tp, 4 +0x11 0x62 + +# GOOD: c.lui tp, 5 +0x15 0x62 + +# GOOD: c.lui tp, 6 +0x19 0x62 + +# GOOD: c.lui tp, 7 +0x1D 0x62 + +# GOOD: c.lui tp, 8 +0x21 0x62 + +# GOOD: c.lui tp, 9 +0x25 0x62 + +# GOOD: c.lui tp, 10 +0x29 0x62 + +# GOOD: c.lui tp, 11 +0x2D 0x62 + +# GOOD: c.lui tp, 12 +0x31 0x62 + +# GOOD: c.lui tp, 13 +0x35 0x62 + +# GOOD: c.lui tp, 14 +0x39 0x62 + +# GOOD: c.lui tp, 15 +0x3D 0x62 + +# GOOD: c.lui tp, 16 +0x41 0x62 + +# GOOD: c.lui tp, 17 +0x45 0x62 + +# GOOD: c.lui tp, 18 +0x49 0x62 + +# GOOD: c.lui tp, 19 +0x4D 0x62 + +# GOOD: c.lui tp, 20 +0x51 0x62 + +# GOOD: c.lui tp, 21 +0x55 0x62 + +# GOOD: c.lui tp, 22 +0x59 0x62 + +# GOOD: c.lui tp, 23 +0x5D 0x62 + +# GOOD: c.lui tp, 24 +0x61 0x62 + +# GOOD: c.lui tp, 25 +0x65 0x62 + +# GOOD: c.lui tp, 26 +0x69 0x62 + +# GOOD: c.lui tp, 27 +0x6D 0x62 + +# GOOD: c.lui tp, 28 +0x71 0x62 + +# GOOD: c.lui tp, 29 +0x75 0x62 + +# GOOD: c.lui tp, 30 +0x79 0x62 + +# GOOD: c.lui tp, 31 +0x7D 0x62 + +# GOOD: c.lui tp, 1048544 +0x01 0x72 + +# GOOD: c.lui tp, 1048545 +0x05 0x72 + +# GOOD: c.lui tp, 1048546 +0x09 0x72 + +# GOOD: c.lui tp, 1048547 +0x0D 0x72 + +# GOOD: c.lui tp, 1048548 +0x11 0x72 + +# GOOD: c.lui tp, 1048549 +0x15 0x72 + +# GOOD: c.lui tp, 1048550 +0x19 0x72 + +# 
GOOD: c.lui tp, 1048551 +0x1D 0x72 + +# GOOD: c.lui tp, 1048552 +0x21 0x72 + +# GOOD: c.lui tp, 1048553 +0x25 0x72 + +# GOOD: c.lui tp, 1048554 +0x29 0x72 + +# GOOD: c.lui tp, 1048555 +0x2D 0x72 + +# GOOD: c.lui tp, 1048556 +0x31 0x72 + +# GOOD: c.lui tp, 1048557 +0x35 0x72 + +# GOOD: c.lui tp, 1048558 +0x39 0x72 + +# GOOD: c.lui tp, 1048559 +0x3D 0x72 + +# GOOD: c.lui tp, 1048560 +0x41 0x72 + +# GOOD: c.lui tp, 1048561 +0x45 0x72 + +# GOOD: c.lui tp, 1048562 +0x49 0x72 + +# GOOD: c.lui tp, 1048563 +0x4D 0x72 + +# GOOD: c.lui tp, 1048564 +0x51 0x72 + +# GOOD: c.lui tp, 1048565 +0x55 0x72 + +# GOOD: c.lui tp, 1048566 +0x59 0x72 + +# GOOD: c.lui tp, 1048567 +0x5D 0x72 + +# GOOD: c.lui tp, 1048568 +0x61 0x72 + +# GOOD: c.lui tp, 1048569 +0x65 0x72 + +# GOOD: c.lui tp, 1048570 +0x69 0x72 + +# GOOD: c.lui tp, 1048571 +0x6D 0x72 + +# GOOD: c.lui tp, 1048572 +0x71 0x72 + +# GOOD: c.lui tp, 1048573 +0x75 0x72 + +# GOOD: c.lui tp, 1048574 +0x79 0x72 + +# GOOD: c.lui tp, 1048575 +0x7D 0x72 + +# BAD: invalid instruction encoding +# MOP: c.mop.5 +0x81 0x62 + +# GOOD: c.lui t0, 1 +0x85 0x62 + +# GOOD: c.lui t0, 2 +0x89 0x62 + +# GOOD: c.lui t0, 3 +0x8D 0x62 + +# GOOD: c.lui t0, 4 +0x91 0x62 + +# GOOD: c.lui t0, 5 +0x95 0x62 + +# GOOD: c.lui t0, 6 +0x99 0x62 + +# GOOD: c.lui t0, 7 +0x9D 0x62 + +# GOOD: c.lui t0, 8 +0xA1 0x62 + +# GOOD: c.lui t0, 9 +0xA5 0x62 + +# GOOD: c.lui t0, 10 +0xA9 0x62 + +# GOOD: c.lui t0, 11 +0xAD 0x62 + +# GOOD: c.lui t0, 12 +0xB1 0x62 + +# GOOD: c.lui t0, 13 +0xB5 0x62 + +# GOOD: c.lui t0, 14 +0xB9 0x62 + +# GOOD: c.lui t0, 15 +0xBD 0x62 + +# GOOD: c.lui t0, 16 +0xC1 0x62 + +# GOOD: c.lui t0, 17 +0xC5 0x62 + +# GOOD: c.lui t0, 18 +0xC9 0x62 + +# GOOD: c.lui t0, 19 +0xCD 0x62 + +# GOOD: c.lui t0, 20 +0xD1 0x62 + +# GOOD: c.lui t0, 21 +0xD5 0x62 + +# GOOD: c.lui t0, 22 +0xD9 0x62 + +# GOOD: c.lui t0, 23 +0xDD 0x62 + +# GOOD: c.lui t0, 24 +0xE1 0x62 + +# GOOD: c.lui t0, 25 +0xE5 0x62 + +# GOOD: c.lui t0, 26 +0xE9 0x62 + +# GOOD: c.lui t0, 27 +0xED 0x62 + +# GOOD: c.lui t0, 28 +0xF1 0x62 + +# GOOD: c.lui t0, 29 +0xF5 0x62 + +# GOOD: c.lui t0, 30 +0xF9 0x62 + +# GOOD: c.lui t0, 31 +0xFD 0x62 + +# GOOD: c.lui t0, 1048544 +0x81 0x72 + +# GOOD: c.lui t0, 1048545 +0x85 0x72 + +# GOOD: c.lui t0, 1048546 +0x89 0x72 + +# GOOD: c.lui t0, 1048547 +0x8D 0x72 + +# GOOD: c.lui t0, 1048548 +0x91 0x72 + +# GOOD: c.lui t0, 1048549 +0x95 0x72 + +# GOOD: c.lui t0, 1048550 +0x99 0x72 + +# GOOD: c.lui t0, 1048551 +0x9D 0x72 + +# GOOD: c.lui t0, 1048552 +0xA1 0x72 + +# GOOD: c.lui t0, 1048553 +0xA5 0x72 + +# GOOD: c.lui t0, 1048554 +0xA9 0x72 + +# GOOD: c.lui t0, 1048555 +0xAD 0x72 + +# GOOD: c.lui t0, 1048556 +0xB1 0x72 + +# GOOD: c.lui t0, 1048557 +0xB5 0x72 + +# GOOD: c.lui t0, 1048558 +0xB9 0x72 + +# GOOD: c.lui t0, 1048559 +0xBD 0x72 + +# GOOD: c.lui t0, 1048560 +0xC1 0x72 + +# GOOD: c.lui t0, 1048561 +0xC5 0x72 + +# GOOD: c.lui t0, 1048562 +0xC9 0x72 + +# GOOD: c.lui t0, 1048563 +0xCD 0x72 + +# GOOD: c.lui t0, 1048564 +0xD1 0x72 + +# GOOD: c.lui t0, 1048565 +0xD5 0x72 + +# GOOD: c.lui t0, 1048566 +0xD9 0x72 + +# GOOD: c.lui t0, 1048567 +0xDD 0x72 + +# GOOD: c.lui t0, 1048568 +0xE1 0x72 + +# GOOD: c.lui t0, 1048569 +0xE5 0x72 + +# GOOD: c.lui t0, 1048570 +0xE9 0x72 + +# GOOD: c.lui t0, 1048571 +0xED 0x72 + +# GOOD: c.lui t0, 1048572 +0xF1 0x72 + +# GOOD: c.lui t0, 1048573 +0xF5 0x72 + +# GOOD: c.lui t0, 1048574 +0xF9 0x72 + +# GOOD: c.lui t0, 1048575 +0xFD 0x72 + +# BAD: invalid instruction encoding +0x01 0x63 + +# GOOD: c.lui t1, 1 +0x05 0x63 + +# GOOD: c.lui t1, 2 +0x09 0x63 + +# GOOD: c.lui 
t1, 3 +0x0D 0x63 + +# GOOD: c.lui t1, 4 +0x11 0x63 + +# GOOD: c.lui t1, 5 +0x15 0x63 + +# GOOD: c.lui t1, 6 +0x19 0x63 + +# GOOD: c.lui t1, 7 +0x1D 0x63 + +# GOOD: c.lui t1, 8 +0x21 0x63 + +# GOOD: c.lui t1, 9 +0x25 0x63 + +# GOOD: c.lui t1, 10 +0x29 0x63 + +# GOOD: c.lui t1, 11 +0x2D 0x63 + +# GOOD: c.lui t1, 12 +0x31 0x63 + +# GOOD: c.lui t1, 13 +0x35 0x63 + +# GOOD: c.lui t1, 14 +0x39 0x63 + +# GOOD: c.lui t1, 15 +0x3D 0x63 + +# GOOD: c.lui t1, 16 +0x41 0x63 + +# GOOD: c.lui t1, 17 +0x45 0x63 + +# GOOD: c.lui t1, 18 +0x49 0x63 + +# GOOD: c.lui t1, 19 +0x4D 0x63 + +# GOOD: c.lui t1, 20 +0x51 0x63 + +# GOOD: c.lui t1, 21 +0x55 0x63 + +# GOOD: c.lui t1, 22 +0x59 0x63 + +# GOOD: c.lui t1, 23 +0x5D 0x63 + +# GOOD: c.lui t1, 24 +0x61 0x63 + +# GOOD: c.lui t1, 25 +0x65 0x63 + +# GOOD: c.lui t1, 26 +0x69 0x63 + +# GOOD: c.lui t1, 27 +0x6D 0x63 + +# GOOD: c.lui t1, 28 +0x71 0x63 + +# GOOD: c.lui t1, 29 +0x75 0x63 + +# GOOD: c.lui t1, 30 +0x79 0x63 + +# GOOD: c.lui t1, 31 +0x7D 0x63 + +# GOOD: c.lui t1, 1048544 +0x01 0x73 + +# GOOD: c.lui t1, 1048545 +0x05 0x73 + +# GOOD: c.lui t1, 1048546 +0x09 0x73 + +# GOOD: c.lui t1, 1048547 +0x0D 0x73 + +# GOOD: c.lui t1, 1048548 +0x11 0x73 + +# GOOD: c.lui t1, 1048549 +0x15 0x73 + +# GOOD: c.lui t1, 1048550 +0x19 0x73 + +# GOOD: c.lui t1, 1048551 +0x1D 0x73 + +# GOOD: c.lui t1, 1048552 +0x21 0x73 + +# GOOD: c.lui t1, 1048553 +0x25 0x73 + +# GOOD: c.lui t1, 1048554 +0x29 0x73 + +# GOOD: c.lui t1, 1048555 +0x2D 0x73 + +# GOOD: c.lui t1, 1048556 +0x31 0x73 + +# GOOD: c.lui t1, 1048557 +0x35 0x73 + +# GOOD: c.lui t1, 1048558 +0x39 0x73 + +# GOOD: c.lui t1, 1048559 +0x3D 0x73 + +# GOOD: c.lui t1, 1048560 +0x41 0x73 + +# GOOD: c.lui t1, 1048561 +0x45 0x73 + +# GOOD: c.lui t1, 1048562 +0x49 0x73 + +# GOOD: c.lui t1, 1048563 +0x4D 0x73 + +# GOOD: c.lui t1, 1048564 +0x51 0x73 + +# GOOD: c.lui t1, 1048565 +0x55 0x73 + +# GOOD: c.lui t1, 1048566 +0x59 0x73 + +# GOOD: c.lui t1, 1048567 +0x5D 0x73 + +# GOOD: c.lui t1, 1048568 +0x61 0x73 + +# GOOD: c.lui t1, 1048569 +0x65 0x73 + +# GOOD: c.lui t1, 1048570 +0x69 0x73 + +# GOOD: c.lui t1, 1048571 +0x6D 0x73 + +# GOOD: c.lui t1, 1048572 +0x71 0x73 + +# GOOD: c.lui t1, 1048573 +0x75 0x73 + +# GOOD: c.lui t1, 1048574 +0x79 0x73 + +# GOOD: c.lui t1, 1048575 +0x7D 0x73 + +# BAD: invalid instruction encoding +# MOP: c.mop.7 +0x81 0x63 + +# GOOD: c.lui t2, 1 +0x85 0x63 + +# GOOD: c.lui t2, 2 +0x89 0x63 + +# GOOD: c.lui t2, 3 +0x8D 0x63 + +# GOOD: c.lui t2, 4 +0x91 0x63 + +# GOOD: c.lui t2, 5 +0x95 0x63 + +# GOOD: c.lui t2, 6 +0x99 0x63 + +# GOOD: c.lui t2, 7 +0x9D 0x63 + +# GOOD: c.lui t2, 8 +0xA1 0x63 + +# GOOD: c.lui t2, 9 +0xA5 0x63 + +# GOOD: c.lui t2, 10 +0xA9 0x63 + +# GOOD: c.lui t2, 11 +0xAD 0x63 + +# GOOD: c.lui t2, 12 +0xB1 0x63 + +# GOOD: c.lui t2, 13 +0xB5 0x63 + +# GOOD: c.lui t2, 14 +0xB9 0x63 + +# GOOD: c.lui t2, 15 +0xBD 0x63 + +# GOOD: c.lui t2, 16 +0xC1 0x63 + +# GOOD: c.lui t2, 17 +0xC5 0x63 + +# GOOD: c.lui t2, 18 +0xC9 0x63 + +# GOOD: c.lui t2, 19 +0xCD 0x63 + +# GOOD: c.lui t2, 20 +0xD1 0x63 + +# GOOD: c.lui t2, 21 +0xD5 0x63 + +# GOOD: c.lui t2, 22 +0xD9 0x63 + +# GOOD: c.lui t2, 23 +0xDD 0x63 + +# GOOD: c.lui t2, 24 +0xE1 0x63 + +# GOOD: c.lui t2, 25 +0xE5 0x63 + +# GOOD: c.lui t2, 26 +0xE9 0x63 + +# GOOD: c.lui t2, 27 +0xED 0x63 + +# GOOD: c.lui t2, 28 +0xF1 0x63 + +# GOOD: c.lui t2, 29 +0xF5 0x63 + +# GOOD: c.lui t2, 30 +0xF9 0x63 + +# GOOD: c.lui t2, 31 +0xFD 0x63 + +# GOOD: c.lui t2, 1048544 +0x81 0x73 + +# GOOD: c.lui t2, 1048545 +0x85 0x73 + +# GOOD: c.lui t2, 1048546 +0x89 0x73 + +# GOOD: c.lui t2, 
1048547 +0x8D 0x73 + +# GOOD: c.lui t2, 1048548 +0x91 0x73 + +# GOOD: c.lui t2, 1048549 +0x95 0x73 + +# GOOD: c.lui t2, 1048550 +0x99 0x73 + +# GOOD: c.lui t2, 1048551 +0x9D 0x73 + +# GOOD: c.lui t2, 1048552 +0xA1 0x73 + +# GOOD: c.lui t2, 1048553 +0xA5 0x73 + +# GOOD: c.lui t2, 1048554 +0xA9 0x73 + +# GOOD: c.lui t2, 1048555 +0xAD 0x73 + +# GOOD: c.lui t2, 1048556 +0xB1 0x73 + +# GOOD: c.lui t2, 1048557 +0xB5 0x73 + +# GOOD: c.lui t2, 1048558 +0xB9 0x73 + +# GOOD: c.lui t2, 1048559 +0xBD 0x73 + +# GOOD: c.lui t2, 1048560 +0xC1 0x73 + +# GOOD: c.lui t2, 1048561 +0xC5 0x73 + +# GOOD: c.lui t2, 1048562 +0xC9 0x73 + +# GOOD: c.lui t2, 1048563 +0xCD 0x73 + +# GOOD: c.lui t2, 1048564 +0xD1 0x73 + +# GOOD: c.lui t2, 1048565 +0xD5 0x73 + +# GOOD: c.lui t2, 1048566 +0xD9 0x73 + +# GOOD: c.lui t2, 1048567 +0xDD 0x73 + +# GOOD: c.lui t2, 1048568 +0xE1 0x73 + +# GOOD: c.lui t2, 1048569 +0xE5 0x73 + +# GOOD: c.lui t2, 1048570 +0xE9 0x73 + +# GOOD: c.lui t2, 1048571 +0xED 0x73 + +# GOOD: c.lui t2, 1048572 +0xF1 0x73 + +# GOOD: c.lui t2, 1048573 +0xF5 0x73 + +# GOOD: c.lui t2, 1048574 +0xF9 0x73 + +# GOOD: c.lui t2, 1048575 +0xFD 0x73 + +# BAD: invalid instruction encoding +0x01 0x64 + +# GOOD: c.lui s0, 1 +0x05 0x64 + +# GOOD: c.lui s0, 2 +0x09 0x64 + +# GOOD: c.lui s0, 3 +0x0D 0x64 + +# GOOD: c.lui s0, 4 +0x11 0x64 + +# GOOD: c.lui s0, 5 +0x15 0x64 + +# GOOD: c.lui s0, 6 +0x19 0x64 + +# GOOD: c.lui s0, 7 +0x1D 0x64 + +# GOOD: c.lui s0, 8 +0x21 0x64 + +# GOOD: c.lui s0, 9 +0x25 0x64 + +# GOOD: c.lui s0, 10 +0x29 0x64 + +# GOOD: c.lui s0, 11 +0x2D 0x64 + +# GOOD: c.lui s0, 12 +0x31 0x64 + +# GOOD: c.lui s0, 13 +0x35 0x64 + +# GOOD: c.lui s0, 14 +0x39 0x64 + +# GOOD: c.lui s0, 15 +0x3D 0x64 + +# GOOD: c.lui s0, 16 +0x41 0x64 + +# GOOD: c.lui s0, 17 +0x45 0x64 + +# GOOD: c.lui s0, 18 +0x49 0x64 + +# GOOD: c.lui s0, 19 +0x4D 0x64 + +# GOOD: c.lui s0, 20 +0x51 0x64 + +# GOOD: c.lui s0, 21 +0x55 0x64 + +# GOOD: c.lui s0, 22 +0x59 0x64 + +# GOOD: c.lui s0, 23 +0x5D 0x64 + +# GOOD: c.lui s0, 24 +0x61 0x64 + +# GOOD: c.lui s0, 25 +0x65 0x64 + +# GOOD: c.lui s0, 26 +0x69 0x64 + +# GOOD: c.lui s0, 27 +0x6D 0x64 + +# GOOD: c.lui s0, 28 +0x71 0x64 + +# GOOD: c.lui s0, 29 +0x75 0x64 + +# GOOD: c.lui s0, 30 +0x79 0x64 + +# GOOD: c.lui s0, 31 +0x7D 0x64 + +# GOOD: c.lui s0, 1048544 +0x01 0x74 + +# GOOD: c.lui s0, 1048545 +0x05 0x74 + +# GOOD: c.lui s0, 1048546 +0x09 0x74 + +# GOOD: c.lui s0, 1048547 +0x0D 0x74 + +# GOOD: c.lui s0, 1048548 +0x11 0x74 + +# GOOD: c.lui s0, 1048549 +0x15 0x74 + +# GOOD: c.lui s0, 1048550 +0x19 0x74 + +# GOOD: c.lui s0, 1048551 +0x1D 0x74 + +# GOOD: c.lui s0, 1048552 +0x21 0x74 + +# GOOD: c.lui s0, 1048553 +0x25 0x74 + +# GOOD: c.lui s0, 1048554 +0x29 0x74 + +# GOOD: c.lui s0, 1048555 +0x2D 0x74 + +# GOOD: c.lui s0, 1048556 +0x31 0x74 + +# GOOD: c.lui s0, 1048557 +0x35 0x74 + +# GOOD: c.lui s0, 1048558 +0x39 0x74 + +# GOOD: c.lui s0, 1048559 +0x3D 0x74 + +# GOOD: c.lui s0, 1048560 +0x41 0x74 + +# GOOD: c.lui s0, 1048561 +0x45 0x74 + +# GOOD: c.lui s0, 1048562 +0x49 0x74 + +# GOOD: c.lui s0, 1048563 +0x4D 0x74 + +# GOOD: c.lui s0, 1048564 +0x51 0x74 + +# GOOD: c.lui s0, 1048565 +0x55 0x74 + +# GOOD: c.lui s0, 1048566 +0x59 0x74 + +# GOOD: c.lui s0, 1048567 +0x5D 0x74 + +# GOOD: c.lui s0, 1048568 +0x61 0x74 + +# GOOD: c.lui s0, 1048569 +0x65 0x74 + +# GOOD: c.lui s0, 1048570 +0x69 0x74 + +# GOOD: c.lui s0, 1048571 +0x6D 0x74 + +# GOOD: c.lui s0, 1048572 +0x71 0x74 + +# GOOD: c.lui s0, 1048573 +0x75 0x74 + +# GOOD: c.lui s0, 1048574 +0x79 0x74 + +# GOOD: c.lui s0, 1048575 +0x7D 0x74 + +# BAD: 
invalid instruction encoding +# MOP: c.mop.9 +0x81 0x64 + +# GOOD: c.lui s1, 1 +0x85 0x64 + +# GOOD: c.lui s1, 2 +0x89 0x64 + +# GOOD: c.lui s1, 3 +0x8D 0x64 + +# GOOD: c.lui s1, 4 +0x91 0x64 + +# GOOD: c.lui s1, 5 +0x95 0x64 + +# GOOD: c.lui s1, 6 +0x99 0x64 + +# GOOD: c.lui s1, 7 +0x9D 0x64 + +# GOOD: c.lui s1, 8 +0xA1 0x64 + +# GOOD: c.lui s1, 9 +0xA5 0x64 + +# GOOD: c.lui s1, 10 +0xA9 0x64 + +# GOOD: c.lui s1, 11 +0xAD 0x64 + +# GOOD: c.lui s1, 12 +0xB1 0x64 + +# GOOD: c.lui s1, 13 +0xB5 0x64 + +# GOOD: c.lui s1, 14 +0xB9 0x64 + +# GOOD: c.lui s1, 15 +0xBD 0x64 + +# GOOD: c.lui s1, 16 +0xC1 0x64 + +# GOOD: c.lui s1, 17 +0xC5 0x64 + +# GOOD: c.lui s1, 18 +0xC9 0x64 + +# GOOD: c.lui s1, 19 +0xCD 0x64 + +# GOOD: c.lui s1, 20 +0xD1 0x64 + +# GOOD: c.lui s1, 21 +0xD5 0x64 + +# GOOD: c.lui s1, 22 +0xD9 0x64 + +# GOOD: c.lui s1, 23 +0xDD 0x64 + +# GOOD: c.lui s1, 24 +0xE1 0x64 + +# GOOD: c.lui s1, 25 +0xE5 0x64 + +# GOOD: c.lui s1, 26 +0xE9 0x64 + +# GOOD: c.lui s1, 27 +0xED 0x64 + +# GOOD: c.lui s1, 28 +0xF1 0x64 + +# GOOD: c.lui s1, 29 +0xF5 0x64 + +# GOOD: c.lui s1, 30 +0xF9 0x64 + +# GOOD: c.lui s1, 31 +0xFD 0x64 + +# GOOD: c.lui s1, 1048544 +0x81 0x74 + +# GOOD: c.lui s1, 1048545 +0x85 0x74 + +# GOOD: c.lui s1, 1048546 +0x89 0x74 + +# GOOD: c.lui s1, 1048547 +0x8D 0x74 + +# GOOD: c.lui s1, 1048548 +0x91 0x74 + +# GOOD: c.lui s1, 1048549 +0x95 0x74 + +# GOOD: c.lui s1, 1048550 +0x99 0x74 + +# GOOD: c.lui s1, 1048551 +0x9D 0x74 + +# GOOD: c.lui s1, 1048552 +0xA1 0x74 + +# GOOD: c.lui s1, 1048553 +0xA5 0x74 + +# GOOD: c.lui s1, 1048554 +0xA9 0x74 + +# GOOD: c.lui s1, 1048555 +0xAD 0x74 + +# GOOD: c.lui s1, 1048556 +0xB1 0x74 + +# GOOD: c.lui s1, 1048557 +0xB5 0x74 + +# GOOD: c.lui s1, 1048558 +0xB9 0x74 + +# GOOD: c.lui s1, 1048559 +0xBD 0x74 + +# GOOD: c.lui s1, 1048560 +0xC1 0x74 + +# GOOD: c.lui s1, 1048561 +0xC5 0x74 + +# GOOD: c.lui s1, 1048562 +0xC9 0x74 + +# GOOD: c.lui s1, 1048563 +0xCD 0x74 + +# GOOD: c.lui s1, 1048564 +0xD1 0x74 + +# GOOD: c.lui s1, 1048565 +0xD5 0x74 + +# GOOD: c.lui s1, 1048566 +0xD9 0x74 + +# GOOD: c.lui s1, 1048567 +0xDD 0x74 + +# GOOD: c.lui s1, 1048568 +0xE1 0x74 + +# GOOD: c.lui s1, 1048569 +0xE5 0x74 + +# GOOD: c.lui s1, 1048570 +0xE9 0x74 + +# GOOD: c.lui s1, 1048571 +0xED 0x74 + +# GOOD: c.lui s1, 1048572 +0xF1 0x74 + +# GOOD: c.lui s1, 1048573 +0xF5 0x74 + +# GOOD: c.lui s1, 1048574 +0xF9 0x74 + +# GOOD: c.lui s1, 1048575 +0xFD 0x74 + +# BAD: invalid instruction encoding +0x01 0x65 + +# GOOD: c.lui a0, 1 +0x05 0x65 + +# GOOD: c.lui a0, 2 +0x09 0x65 + +# GOOD: c.lui a0, 3 +0x0D 0x65 + +# GOOD: c.lui a0, 4 +0x11 0x65 + +# GOOD: c.lui a0, 5 +0x15 0x65 + +# GOOD: c.lui a0, 6 +0x19 0x65 + +# GOOD: c.lui a0, 7 +0x1D 0x65 + +# GOOD: c.lui a0, 8 +0x21 0x65 + +# GOOD: c.lui a0, 9 +0x25 0x65 + +# GOOD: c.lui a0, 10 +0x29 0x65 + +# GOOD: c.lui a0, 11 +0x2D 0x65 + +# GOOD: c.lui a0, 12 +0x31 0x65 + +# GOOD: c.lui a0, 13 +0x35 0x65 + +# GOOD: c.lui a0, 14 +0x39 0x65 + +# GOOD: c.lui a0, 15 +0x3D 0x65 + +# GOOD: c.lui a0, 16 +0x41 0x65 + +# GOOD: c.lui a0, 17 +0x45 0x65 + +# GOOD: c.lui a0, 18 +0x49 0x65 + +# GOOD: c.lui a0, 19 +0x4D 0x65 + +# GOOD: c.lui a0, 20 +0x51 0x65 + +# GOOD: c.lui a0, 21 +0x55 0x65 + +# GOOD: c.lui a0, 22 +0x59 0x65 + +# GOOD: c.lui a0, 23 +0x5D 0x65 + +# GOOD: c.lui a0, 24 +0x61 0x65 + +# GOOD: c.lui a0, 25 +0x65 0x65 + +# GOOD: c.lui a0, 26 +0x69 0x65 + +# GOOD: c.lui a0, 27 +0x6D 0x65 + +# GOOD: c.lui a0, 28 +0x71 0x65 + +# GOOD: c.lui a0, 29 +0x75 0x65 + +# GOOD: c.lui a0, 30 +0x79 0x65 + +# GOOD: c.lui a0, 31 +0x7D 0x65 + +# GOOD: c.lui 
a0, 1048544 +0x01 0x75 + +# GOOD: c.lui a0, 1048545 +0x05 0x75 + +# GOOD: c.lui a0, 1048546 +0x09 0x75 + +# GOOD: c.lui a0, 1048547 +0x0D 0x75 + +# GOOD: c.lui a0, 1048548 +0x11 0x75 + +# GOOD: c.lui a0, 1048549 +0x15 0x75 + +# GOOD: c.lui a0, 1048550 +0x19 0x75 + +# GOOD: c.lui a0, 1048551 +0x1D 0x75 + +# GOOD: c.lui a0, 1048552 +0x21 0x75 + +# GOOD: c.lui a0, 1048553 +0x25 0x75 + +# GOOD: c.lui a0, 1048554 +0x29 0x75 + +# GOOD: c.lui a0, 1048555 +0x2D 0x75 + +# GOOD: c.lui a0, 1048556 +0x31 0x75 + +# GOOD: c.lui a0, 1048557 +0x35 0x75 + +# GOOD: c.lui a0, 1048558 +0x39 0x75 + +# GOOD: c.lui a0, 1048559 +0x3D 0x75 + +# GOOD: c.lui a0, 1048560 +0x41 0x75 + +# GOOD: c.lui a0, 1048561 +0x45 0x75 + +# GOOD: c.lui a0, 1048562 +0x49 0x75 + +# GOOD: c.lui a0, 1048563 +0x4D 0x75 + +# GOOD: c.lui a0, 1048564 +0x51 0x75 + +# GOOD: c.lui a0, 1048565 +0x55 0x75 + +# GOOD: c.lui a0, 1048566 +0x59 0x75 + +# GOOD: c.lui a0, 1048567 +0x5D 0x75 + +# GOOD: c.lui a0, 1048568 +0x61 0x75 + +# GOOD: c.lui a0, 1048569 +0x65 0x75 + +# GOOD: c.lui a0, 1048570 +0x69 0x75 + +# GOOD: c.lui a0, 1048571 +0x6D 0x75 + +# GOOD: c.lui a0, 1048572 +0x71 0x75 + +# GOOD: c.lui a0, 1048573 +0x75 0x75 + +# GOOD: c.lui a0, 1048574 +0x79 0x75 + +# GOOD: c.lui a0, 1048575 +0x7D 0x75 + +# BAD: invalid instruction encoding +# MOP: c.mop.11 +0x81 0x65 + +# GOOD: c.lui a1, 1 +0x85 0x65 + +# GOOD: c.lui a1, 2 +0x89 0x65 + +# GOOD: c.lui a1, 3 +0x8D 0x65 + +# GOOD: c.lui a1, 4 +0x91 0x65 + +# GOOD: c.lui a1, 5 +0x95 0x65 + +# GOOD: c.lui a1, 6 +0x99 0x65 + +# GOOD: c.lui a1, 7 +0x9D 0x65 + +# GOOD: c.lui a1, 8 +0xA1 0x65 + +# GOOD: c.lui a1, 9 +0xA5 0x65 + +# GOOD: c.lui a1, 10 +0xA9 0x65 + +# GOOD: c.lui a1, 11 +0xAD 0x65 + +# GOOD: c.lui a1, 12 +0xB1 0x65 + +# GOOD: c.lui a1, 13 +0xB5 0x65 + +# GOOD: c.lui a1, 14 +0xB9 0x65 + +# GOOD: c.lui a1, 15 +0xBD 0x65 + +# GOOD: c.lui a1, 16 +0xC1 0x65 + +# GOOD: c.lui a1, 17 +0xC5 0x65 + +# GOOD: c.lui a1, 18 +0xC9 0x65 + +# GOOD: c.lui a1, 19 +0xCD 0x65 + +# GOOD: c.lui a1, 20 +0xD1 0x65 + +# GOOD: c.lui a1, 21 +0xD5 0x65 + +# GOOD: c.lui a1, 22 +0xD9 0x65 + +# GOOD: c.lui a1, 23 +0xDD 0x65 + +# GOOD: c.lui a1, 24 +0xE1 0x65 + +# GOOD: c.lui a1, 25 +0xE5 0x65 + +# GOOD: c.lui a1, 26 +0xE9 0x65 + +# GOOD: c.lui a1, 27 +0xED 0x65 + +# GOOD: c.lui a1, 28 +0xF1 0x65 + +# GOOD: c.lui a1, 29 +0xF5 0x65 + +# GOOD: c.lui a1, 30 +0xF9 0x65 + +# GOOD: c.lui a1, 31 +0xFD 0x65 + +# GOOD: c.lui a1, 1048544 +0x81 0x75 + +# GOOD: c.lui a1, 1048545 +0x85 0x75 + +# GOOD: c.lui a1, 1048546 +0x89 0x75 + +# GOOD: c.lui a1, 1048547 +0x8D 0x75 + +# GOOD: c.lui a1, 1048548 +0x91 0x75 + +# GOOD: c.lui a1, 1048549 +0x95 0x75 + +# GOOD: c.lui a1, 1048550 +0x99 0x75 + +# GOOD: c.lui a1, 1048551 +0x9D 0x75 + +# GOOD: c.lui a1, 1048552 +0xA1 0x75 + +# GOOD: c.lui a1, 1048553 +0xA5 0x75 + +# GOOD: c.lui a1, 1048554 +0xA9 0x75 + +# GOOD: c.lui a1, 1048555 +0xAD 0x75 + +# GOOD: c.lui a1, 1048556 +0xB1 0x75 + +# GOOD: c.lui a1, 1048557 +0xB5 0x75 + +# GOOD: c.lui a1, 1048558 +0xB9 0x75 + +# GOOD: c.lui a1, 1048559 +0xBD 0x75 + +# GOOD: c.lui a1, 1048560 +0xC1 0x75 + +# GOOD: c.lui a1, 1048561 +0xC5 0x75 + +# GOOD: c.lui a1, 1048562 +0xC9 0x75 + +# GOOD: c.lui a1, 1048563 +0xCD 0x75 + +# GOOD: c.lui a1, 1048564 +0xD1 0x75 + +# GOOD: c.lui a1, 1048565 +0xD5 0x75 + +# GOOD: c.lui a1, 1048566 +0xD9 0x75 + +# GOOD: c.lui a1, 1048567 +0xDD 0x75 + +# GOOD: c.lui a1, 1048568 +0xE1 0x75 + +# GOOD: c.lui a1, 1048569 +0xE5 0x75 + +# GOOD: c.lui a1, 1048570 +0xE9 0x75 + +# GOOD: c.lui a1, 1048571 +0xED 0x75 + +# GOOD: c.lui a1, 1048572 
+0xF1 0x75 + +# GOOD: c.lui a1, 1048573 +0xF5 0x75 + +# GOOD: c.lui a1, 1048574 +0xF9 0x75 + +# GOOD: c.lui a1, 1048575 +0xFD 0x75 + +# BAD: invalid instruction encoding +0x01 0x66 + +# GOOD: c.lui a2, 1 +0x05 0x66 + +# GOOD: c.lui a2, 2 +0x09 0x66 + +# GOOD: c.lui a2, 3 +0x0D 0x66 + +# GOOD: c.lui a2, 4 +0x11 0x66 + +# GOOD: c.lui a2, 5 +0x15 0x66 + +# GOOD: c.lui a2, 6 +0x19 0x66 + +# GOOD: c.lui a2, 7 +0x1D 0x66 + +# GOOD: c.lui a2, 8 +0x21 0x66 + +# GOOD: c.lui a2, 9 +0x25 0x66 + +# GOOD: c.lui a2, 10 +0x29 0x66 + +# GOOD: c.lui a2, 11 +0x2D 0x66 + +# GOOD: c.lui a2, 12 +0x31 0x66 + +# GOOD: c.lui a2, 13 +0x35 0x66 + +# GOOD: c.lui a2, 14 +0x39 0x66 + +# GOOD: c.lui a2, 15 +0x3D 0x66 + +# GOOD: c.lui a2, 16 +0x41 0x66 + +# GOOD: c.lui a2, 17 +0x45 0x66 + +# GOOD: c.lui a2, 18 +0x49 0x66 + +# GOOD: c.lui a2, 19 +0x4D 0x66 + +# GOOD: c.lui a2, 20 +0x51 0x66 + +# GOOD: c.lui a2, 21 +0x55 0x66 + +# GOOD: c.lui a2, 22 +0x59 0x66 + +# GOOD: c.lui a2, 23 +0x5D 0x66 + +# GOOD: c.lui a2, 24 +0x61 0x66 + +# GOOD: c.lui a2, 25 +0x65 0x66 + +# GOOD: c.lui a2, 26 +0x69 0x66 + +# GOOD: c.lui a2, 27 +0x6D 0x66 + +# GOOD: c.lui a2, 28 +0x71 0x66 + +# GOOD: c.lui a2, 29 +0x75 0x66 + +# GOOD: c.lui a2, 30 +0x79 0x66 + +# GOOD: c.lui a2, 31 +0x7D 0x66 + +# GOOD: c.lui a2, 1048544 +0x01 0x76 + +# GOOD: c.lui a2, 1048545 +0x05 0x76 + +# GOOD: c.lui a2, 1048546 +0x09 0x76 + +# GOOD: c.lui a2, 1048547 +0x0D 0x76 + +# GOOD: c.lui a2, 1048548 +0x11 0x76 + +# GOOD: c.lui a2, 1048549 +0x15 0x76 + +# GOOD: c.lui a2, 1048550 +0x19 0x76 + +# GOOD: c.lui a2, 1048551 +0x1D 0x76 + +# GOOD: c.lui a2, 1048552 +0x21 0x76 + +# GOOD: c.lui a2, 1048553 +0x25 0x76 + +# GOOD: c.lui a2, 1048554 +0x29 0x76 + +# GOOD: c.lui a2, 1048555 +0x2D 0x76 + +# GOOD: c.lui a2, 1048556 +0x31 0x76 + +# GOOD: c.lui a2, 1048557 +0x35 0x76 + +# GOOD: c.lui a2, 1048558 +0x39 0x76 + +# GOOD: c.lui a2, 1048559 +0x3D 0x76 + +# GOOD: c.lui a2, 1048560 +0x41 0x76 + +# GOOD: c.lui a2, 1048561 +0x45 0x76 + +# GOOD: c.lui a2, 1048562 +0x49 0x76 + +# GOOD: c.lui a2, 1048563 +0x4D 0x76 + +# GOOD: c.lui a2, 1048564 +0x51 0x76 + +# GOOD: c.lui a2, 1048565 +0x55 0x76 + +# GOOD: c.lui a2, 1048566 +0x59 0x76 + +# GOOD: c.lui a2, 1048567 +0x5D 0x76 + +# GOOD: c.lui a2, 1048568 +0x61 0x76 + +# GOOD: c.lui a2, 1048569 +0x65 0x76 + +# GOOD: c.lui a2, 1048570 +0x69 0x76 + +# GOOD: c.lui a2, 1048571 +0x6D 0x76 + +# GOOD: c.lui a2, 1048572 +0x71 0x76 + +# GOOD: c.lui a2, 1048573 +0x75 0x76 + +# GOOD: c.lui a2, 1048574 +0x79 0x76 + +# GOOD: c.lui a2, 1048575 +0x7D 0x76 + +# BAD: invalid instruction encoding +# MOP: c.mop.13 +0x81 0x66 + +# GOOD: c.lui a3, 1 +0x85 0x66 + +# GOOD: c.lui a3, 2 +0x89 0x66 + +# GOOD: c.lui a3, 3 +0x8D 0x66 + +# GOOD: c.lui a3, 4 +0x91 0x66 + +# GOOD: c.lui a3, 5 +0x95 0x66 + +# GOOD: c.lui a3, 6 +0x99 0x66 + +# GOOD: c.lui a3, 7 +0x9D 0x66 + +# GOOD: c.lui a3, 8 +0xA1 0x66 + +# GOOD: c.lui a3, 9 +0xA5 0x66 + +# GOOD: c.lui a3, 10 +0xA9 0x66 + +# GOOD: c.lui a3, 11 +0xAD 0x66 + +# GOOD: c.lui a3, 12 +0xB1 0x66 + +# GOOD: c.lui a3, 13 +0xB5 0x66 + +# GOOD: c.lui a3, 14 +0xB9 0x66 + +# GOOD: c.lui a3, 15 +0xBD 0x66 + +# GOOD: c.lui a3, 16 +0xC1 0x66 + +# GOOD: c.lui a3, 17 +0xC5 0x66 + +# GOOD: c.lui a3, 18 +0xC9 0x66 + +# GOOD: c.lui a3, 19 +0xCD 0x66 + +# GOOD: c.lui a3, 20 +0xD1 0x66 + +# GOOD: c.lui a3, 21 +0xD5 0x66 + +# GOOD: c.lui a3, 22 +0xD9 0x66 + +# GOOD: c.lui a3, 23 +0xDD 0x66 + +# GOOD: c.lui a3, 24 +0xE1 0x66 + +# GOOD: c.lui a3, 25 +0xE5 0x66 + +# GOOD: c.lui a3, 26 +0xE9 0x66 + +# GOOD: c.lui a3, 27 +0xED 0x66 + +# GOOD: 
c.lui a3, 28 +0xF1 0x66 + +# GOOD: c.lui a3, 29 +0xF5 0x66 + +# GOOD: c.lui a3, 30 +0xF9 0x66 + +# GOOD: c.lui a3, 31 +0xFD 0x66 + +# GOOD: c.lui a3, 1048544 +0x81 0x76 + +# GOOD: c.lui a3, 1048545 +0x85 0x76 + +# GOOD: c.lui a3, 1048546 +0x89 0x76 + +# GOOD: c.lui a3, 1048547 +0x8D 0x76 + +# GOOD: c.lui a3, 1048548 +0x91 0x76 + +# GOOD: c.lui a3, 1048549 +0x95 0x76 + +# GOOD: c.lui a3, 1048550 +0x99 0x76 + +# GOOD: c.lui a3, 1048551 +0x9D 0x76 + +# GOOD: c.lui a3, 1048552 +0xA1 0x76 + +# GOOD: c.lui a3, 1048553 +0xA5 0x76 + +# GOOD: c.lui a3, 1048554 +0xA9 0x76 + +# GOOD: c.lui a3, 1048555 +0xAD 0x76 + +# GOOD: c.lui a3, 1048556 +0xB1 0x76 + +# GOOD: c.lui a3, 1048557 +0xB5 0x76 + +# GOOD: c.lui a3, 1048558 +0xB9 0x76 + +# GOOD: c.lui a3, 1048559 +0xBD 0x76 + +# GOOD: c.lui a3, 1048560 +0xC1 0x76 + +# GOOD: c.lui a3, 1048561 +0xC5 0x76 + +# GOOD: c.lui a3, 1048562 +0xC9 0x76 + +# GOOD: c.lui a3, 1048563 +0xCD 0x76 + +# GOOD: c.lui a3, 1048564 +0xD1 0x76 + +# GOOD: c.lui a3, 1048565 +0xD5 0x76 + +# GOOD: c.lui a3, 1048566 +0xD9 0x76 + +# GOOD: c.lui a3, 1048567 +0xDD 0x76 + +# GOOD: c.lui a3, 1048568 +0xE1 0x76 + +# GOOD: c.lui a3, 1048569 +0xE5 0x76 + +# GOOD: c.lui a3, 1048570 +0xE9 0x76 + +# GOOD: c.lui a3, 1048571 +0xED 0x76 + +# GOOD: c.lui a3, 1048572 +0xF1 0x76 + +# GOOD: c.lui a3, 1048573 +0xF5 0x76 + +# GOOD: c.lui a3, 1048574 +0xF9 0x76 + +# GOOD: c.lui a3, 1048575 +0xFD 0x76 + +# BAD: invalid instruction encoding +0x01 0x67 + +# GOOD: c.lui a4, 1 +0x05 0x67 + +# GOOD: c.lui a4, 2 +0x09 0x67 + +# GOOD: c.lui a4, 3 +0x0D 0x67 + +# GOOD: c.lui a4, 4 +0x11 0x67 + +# GOOD: c.lui a4, 5 +0x15 0x67 + +# GOOD: c.lui a4, 6 +0x19 0x67 + +# GOOD: c.lui a4, 7 +0x1D 0x67 + +# GOOD: c.lui a4, 8 +0x21 0x67 + +# GOOD: c.lui a4, 9 +0x25 0x67 + +# GOOD: c.lui a4, 10 +0x29 0x67 + +# GOOD: c.lui a4, 11 +0x2D 0x67 + +# GOOD: c.lui a4, 12 +0x31 0x67 + +# GOOD: c.lui a4, 13 +0x35 0x67 + +# GOOD: c.lui a4, 14 +0x39 0x67 + +# GOOD: c.lui a4, 15 +0x3D 0x67 + +# GOOD: c.lui a4, 16 +0x41 0x67 + +# GOOD: c.lui a4, 17 +0x45 0x67 + +# GOOD: c.lui a4, 18 +0x49 0x67 + +# GOOD: c.lui a4, 19 +0x4D 0x67 + +# GOOD: c.lui a4, 20 +0x51 0x67 + +# GOOD: c.lui a4, 21 +0x55 0x67 + +# GOOD: c.lui a4, 22 +0x59 0x67 + +# GOOD: c.lui a4, 23 +0x5D 0x67 + +# GOOD: c.lui a4, 24 +0x61 0x67 + +# GOOD: c.lui a4, 25 +0x65 0x67 + +# GOOD: c.lui a4, 26 +0x69 0x67 + +# GOOD: c.lui a4, 27 +0x6D 0x67 + +# GOOD: c.lui a4, 28 +0x71 0x67 + +# GOOD: c.lui a4, 29 +0x75 0x67 + +# GOOD: c.lui a4, 30 +0x79 0x67 + +# GOOD: c.lui a4, 31 +0x7D 0x67 + +# GOOD: c.lui a4, 1048544 +0x01 0x77 + +# GOOD: c.lui a4, 1048545 +0x05 0x77 + +# GOOD: c.lui a4, 1048546 +0x09 0x77 + +# GOOD: c.lui a4, 1048547 +0x0D 0x77 + +# GOOD: c.lui a4, 1048548 +0x11 0x77 + +# GOOD: c.lui a4, 1048549 +0x15 0x77 + +# GOOD: c.lui a4, 1048550 +0x19 0x77 + +# GOOD: c.lui a4, 1048551 +0x1D 0x77 + +# GOOD: c.lui a4, 1048552 +0x21 0x77 + +# GOOD: c.lui a4, 1048553 +0x25 0x77 + +# GOOD: c.lui a4, 1048554 +0x29 0x77 + +# GOOD: c.lui a4, 1048555 +0x2D 0x77 + +# GOOD: c.lui a4, 1048556 +0x31 0x77 + +# GOOD: c.lui a4, 1048557 +0x35 0x77 + +# GOOD: c.lui a4, 1048558 +0x39 0x77 + +# GOOD: c.lui a4, 1048559 +0x3D 0x77 + +# GOOD: c.lui a4, 1048560 +0x41 0x77 + +# GOOD: c.lui a4, 1048561 +0x45 0x77 + +# GOOD: c.lui a4, 1048562 +0x49 0x77 + +# GOOD: c.lui a4, 1048563 +0x4D 0x77 + +# GOOD: c.lui a4, 1048564 +0x51 0x77 + +# GOOD: c.lui a4, 1048565 +0x55 0x77 + +# GOOD: c.lui a4, 1048566 +0x59 0x77 + +# GOOD: c.lui a4, 1048567 +0x5D 0x77 + +# GOOD: c.lui a4, 1048568 +0x61 0x77 + +# GOOD: c.lui 
a4, 1048569 +0x65 0x77 + +# GOOD: c.lui a4, 1048570 +0x69 0x77 + +# GOOD: c.lui a4, 1048571 +0x6D 0x77 + +# GOOD: c.lui a4, 1048572 +0x71 0x77 + +# GOOD: c.lui a4, 1048573 +0x75 0x77 + +# GOOD: c.lui a4, 1048574 +0x79 0x77 + +# GOOD: c.lui a4, 1048575 +0x7D 0x77 + +# BAD: invalid instruction encoding +# MOP: c.mop.15 +0x81 0x67 + +# GOOD: c.lui a5, 1 +0x85 0x67 + +# GOOD: c.lui a5, 2 +0x89 0x67 + +# GOOD: c.lui a5, 3 +0x8D 0x67 + +# GOOD: c.lui a5, 4 +0x91 0x67 + +# GOOD: c.lui a5, 5 +0x95 0x67 + +# GOOD: c.lui a5, 6 +0x99 0x67 + +# GOOD: c.lui a5, 7 +0x9D 0x67 + +# GOOD: c.lui a5, 8 +0xA1 0x67 + +# GOOD: c.lui a5, 9 +0xA5 0x67 + +# GOOD: c.lui a5, 10 +0xA9 0x67 + +# GOOD: c.lui a5, 11 +0xAD 0x67 + +# GOOD: c.lui a5, 12 +0xB1 0x67 + +# GOOD: c.lui a5, 13 +0xB5 0x67 + +# GOOD: c.lui a5, 14 +0xB9 0x67 + +# GOOD: c.lui a5, 15 +0xBD 0x67 + +# GOOD: c.lui a5, 16 +0xC1 0x67 + +# GOOD: c.lui a5, 17 +0xC5 0x67 + +# GOOD: c.lui a5, 18 +0xC9 0x67 + +# GOOD: c.lui a5, 19 +0xCD 0x67 + +# GOOD: c.lui a5, 20 +0xD1 0x67 + +# GOOD: c.lui a5, 21 +0xD5 0x67 + +# GOOD: c.lui a5, 22 +0xD9 0x67 + +# GOOD: c.lui a5, 23 +0xDD 0x67 + +# GOOD: c.lui a5, 24 +0xE1 0x67 + +# GOOD: c.lui a5, 25 +0xE5 0x67 + +# GOOD: c.lui a5, 26 +0xE9 0x67 + +# GOOD: c.lui a5, 27 +0xED 0x67 + +# GOOD: c.lui a5, 28 +0xF1 0x67 + +# GOOD: c.lui a5, 29 +0xF5 0x67 + +# GOOD: c.lui a5, 30 +0xF9 0x67 + +# GOOD: c.lui a5, 31 +0xFD 0x67 + +# GOOD: c.lui a5, 1048544 +0x81 0x77 + +# GOOD: c.lui a5, 1048545 +0x85 0x77 + +# GOOD: c.lui a5, 1048546 +0x89 0x77 + +# GOOD: c.lui a5, 1048547 +0x8D 0x77 + +# GOOD: c.lui a5, 1048548 +0x91 0x77 + +# GOOD: c.lui a5, 1048549 +0x95 0x77 + +# GOOD: c.lui a5, 1048550 +0x99 0x77 + +# GOOD: c.lui a5, 1048551 +0x9D 0x77 + +# GOOD: c.lui a5, 1048552 +0xA1 0x77 + +# GOOD: c.lui a5, 1048553 +0xA5 0x77 + +# GOOD: c.lui a5, 1048554 +0xA9 0x77 + +# GOOD: c.lui a5, 1048555 +0xAD 0x77 + +# GOOD: c.lui a5, 1048556 +0xB1 0x77 + +# GOOD: c.lui a5, 1048557 +0xB5 0x77 + +# GOOD: c.lui a5, 1048558 +0xB9 0x77 + +# GOOD: c.lui a5, 1048559 +0xBD 0x77 + +# GOOD: c.lui a5, 1048560 +0xC1 0x77 + +# GOOD: c.lui a5, 1048561 +0xC5 0x77 + +# GOOD: c.lui a5, 1048562 +0xC9 0x77 + +# GOOD: c.lui a5, 1048563 +0xCD 0x77 + +# GOOD: c.lui a5, 1048564 +0xD1 0x77 + +# GOOD: c.lui a5, 1048565 +0xD5 0x77 + +# GOOD: c.lui a5, 1048566 +0xD9 0x77 + +# GOOD: c.lui a5, 1048567 +0xDD 0x77 + +# GOOD: c.lui a5, 1048568 +0xE1 0x77 + +# GOOD: c.lui a5, 1048569 +0xE5 0x77 + +# GOOD: c.lui a5, 1048570 +0xE9 0x77 + +# GOOD: c.lui a5, 1048571 +0xED 0x77 + +# GOOD: c.lui a5, 1048572 +0xF1 0x77 + +# GOOD: c.lui a5, 1048573 +0xF5 0x77 + +# GOOD: c.lui a5, 1048574 +0xF9 0x77 + +# GOOD: c.lui a5, 1048575 +0xFD 0x77 + +# BAD: invalid instruction encoding +0x01 0x68 + +# GOOD: c.lui a6, 1 +0x05 0x68 + +# GOOD: c.lui a6, 2 +0x09 0x68 + +# GOOD: c.lui a6, 3 +0x0D 0x68 + +# GOOD: c.lui a6, 4 +0x11 0x68 + +# GOOD: c.lui a6, 5 +0x15 0x68 + +# GOOD: c.lui a6, 6 +0x19 0x68 + +# GOOD: c.lui a6, 7 +0x1D 0x68 + +# GOOD: c.lui a6, 8 +0x21 0x68 + +# GOOD: c.lui a6, 9 +0x25 0x68 + +# GOOD: c.lui a6, 10 +0x29 0x68 + +# GOOD: c.lui a6, 11 +0x2D 0x68 + +# GOOD: c.lui a6, 12 +0x31 0x68 + +# GOOD: c.lui a6, 13 +0x35 0x68 + +# GOOD: c.lui a6, 14 +0x39 0x68 + +# GOOD: c.lui a6, 15 +0x3D 0x68 + +# GOOD: c.lui a6, 16 +0x41 0x68 + +# GOOD: c.lui a6, 17 +0x45 0x68 + +# GOOD: c.lui a6, 18 +0x49 0x68 + +# GOOD: c.lui a6, 19 +0x4D 0x68 + +# GOOD: c.lui a6, 20 +0x51 0x68 + +# GOOD: c.lui a6, 21 +0x55 0x68 + +# GOOD: c.lui a6, 22 +0x59 0x68 + +# GOOD: c.lui a6, 23 +0x5D 0x68 + +# GOOD: c.lui a6, 
24 +0x61 0x68 + +# GOOD: c.lui a6, 25 +0x65 0x68 + +# GOOD: c.lui a6, 26 +0x69 0x68 + +# GOOD: c.lui a6, 27 +0x6D 0x68 + +# GOOD: c.lui a6, 28 +0x71 0x68 + +# GOOD: c.lui a6, 29 +0x75 0x68 + +# GOOD: c.lui a6, 30 +0x79 0x68 + +# GOOD: c.lui a6, 31 +0x7D 0x68 + +# GOOD: c.lui a6, 1048544 +0x01 0x78 + +# GOOD: c.lui a6, 1048545 +0x05 0x78 + +# GOOD: c.lui a6, 1048546 +0x09 0x78 + +# GOOD: c.lui a6, 1048547 +0x0D 0x78 + +# GOOD: c.lui a6, 1048548 +0x11 0x78 + +# GOOD: c.lui a6, 1048549 +0x15 0x78 + +# GOOD: c.lui a6, 1048550 +0x19 0x78 + +# GOOD: c.lui a6, 1048551 +0x1D 0x78 + +# GOOD: c.lui a6, 1048552 +0x21 0x78 + +# GOOD: c.lui a6, 1048553 +0x25 0x78 + +# GOOD: c.lui a6, 1048554 +0x29 0x78 + +# GOOD: c.lui a6, 1048555 +0x2D 0x78 + +# GOOD: c.lui a6, 1048556 +0x31 0x78 + +# GOOD: c.lui a6, 1048557 +0x35 0x78 + +# GOOD: c.lui a6, 1048558 +0x39 0x78 + +# GOOD: c.lui a6, 1048559 +0x3D 0x78 + +# GOOD: c.lui a6, 1048560 +0x41 0x78 + +# GOOD: c.lui a6, 1048561 +0x45 0x78 + +# GOOD: c.lui a6, 1048562 +0x49 0x78 + +# GOOD: c.lui a6, 1048563 +0x4D 0x78 + +# GOOD: c.lui a6, 1048564 +0x51 0x78 + +# GOOD: c.lui a6, 1048565 +0x55 0x78 + +# GOOD: c.lui a6, 1048566 +0x59 0x78 + +# GOOD: c.lui a6, 1048567 +0x5D 0x78 + +# GOOD: c.lui a6, 1048568 +0x61 0x78 + +# GOOD: c.lui a6, 1048569 +0x65 0x78 + +# GOOD: c.lui a6, 1048570 +0x69 0x78 + +# GOOD: c.lui a6, 1048571 +0x6D 0x78 + +# GOOD: c.lui a6, 1048572 +0x71 0x78 + +# GOOD: c.lui a6, 1048573 +0x75 0x78 + +# GOOD: c.lui a6, 1048574 +0x79 0x78 + +# GOOD: c.lui a6, 1048575 +0x7D 0x78 + +# BAD: invalid instruction encoding +0x81 0x68 + +# GOOD: c.lui a7, 1 +0x85 0x68 + +# GOOD: c.lui a7, 2 +0x89 0x68 + +# GOOD: c.lui a7, 3 +0x8D 0x68 + +# GOOD: c.lui a7, 4 +0x91 0x68 + +# GOOD: c.lui a7, 5 +0x95 0x68 + +# GOOD: c.lui a7, 6 +0x99 0x68 + +# GOOD: c.lui a7, 7 +0x9D 0x68 + +# GOOD: c.lui a7, 8 +0xA1 0x68 + +# GOOD: c.lui a7, 9 +0xA5 0x68 + +# GOOD: c.lui a7, 10 +0xA9 0x68 + +# GOOD: c.lui a7, 11 +0xAD 0x68 + +# GOOD: c.lui a7, 12 +0xB1 0x68 + +# GOOD: c.lui a7, 13 +0xB5 0x68 + +# GOOD: c.lui a7, 14 +0xB9 0x68 + +# GOOD: c.lui a7, 15 +0xBD 0x68 + +# GOOD: c.lui a7, 16 +0xC1 0x68 + +# GOOD: c.lui a7, 17 +0xC5 0x68 + +# GOOD: c.lui a7, 18 +0xC9 0x68 + +# GOOD: c.lui a7, 19 +0xCD 0x68 + +# GOOD: c.lui a7, 20 +0xD1 0x68 + +# GOOD: c.lui a7, 21 +0xD5 0x68 + +# GOOD: c.lui a7, 22 +0xD9 0x68 + +# GOOD: c.lui a7, 23 +0xDD 0x68 + +# GOOD: c.lui a7, 24 +0xE1 0x68 + +# GOOD: c.lui a7, 25 +0xE5 0x68 + +# GOOD: c.lui a7, 26 +0xE9 0x68 + +# GOOD: c.lui a7, 27 +0xED 0x68 + +# GOOD: c.lui a7, 28 +0xF1 0x68 + +# GOOD: c.lui a7, 29 +0xF5 0x68 + +# GOOD: c.lui a7, 30 +0xF9 0x68 + +# GOOD: c.lui a7, 31 +0xFD 0x68 + +# GOOD: c.lui a7, 1048544 +0x81 0x78 + +# GOOD: c.lui a7, 1048545 +0x85 0x78 + +# GOOD: c.lui a7, 1048546 +0x89 0x78 + +# GOOD: c.lui a7, 1048547 +0x8D 0x78 + +# GOOD: c.lui a7, 1048548 +0x91 0x78 + +# GOOD: c.lui a7, 1048549 +0x95 0x78 + +# GOOD: c.lui a7, 1048550 +0x99 0x78 + +# GOOD: c.lui a7, 1048551 +0x9D 0x78 + +# GOOD: c.lui a7, 1048552 +0xA1 0x78 + +# GOOD: c.lui a7, 1048553 +0xA5 0x78 + +# GOOD: c.lui a7, 1048554 +0xA9 0x78 + +# GOOD: c.lui a7, 1048555 +0xAD 0x78 + +# GOOD: c.lui a7, 1048556 +0xB1 0x78 + +# GOOD: c.lui a7, 1048557 +0xB5 0x78 + +# GOOD: c.lui a7, 1048558 +0xB9 0x78 + +# GOOD: c.lui a7, 1048559 +0xBD 0x78 + +# GOOD: c.lui a7, 1048560 +0xC1 0x78 + +# GOOD: c.lui a7, 1048561 +0xC5 0x78 + +# GOOD: c.lui a7, 1048562 +0xC9 0x78 + +# GOOD: c.lui a7, 1048563 +0xCD 0x78 + +# GOOD: c.lui a7, 1048564 +0xD1 0x78 + +# GOOD: c.lui a7, 1048565 +0xD5 0x78 + +# 
GOOD: c.lui a7, 1048566 +0xD9 0x78 + +# GOOD: c.lui a7, 1048567 +0xDD 0x78 + +# GOOD: c.lui a7, 1048568 +0xE1 0x78 + +# GOOD: c.lui a7, 1048569 +0xE5 0x78 + +# GOOD: c.lui a7, 1048570 +0xE9 0x78 + +# GOOD: c.lui a7, 1048571 +0xED 0x78 + +# GOOD: c.lui a7, 1048572 +0xF1 0x78 + +# GOOD: c.lui a7, 1048573 +0xF5 0x78 + +# GOOD: c.lui a7, 1048574 +0xF9 0x78 + +# GOOD: c.lui a7, 1048575 +0xFD 0x78 + +# BAD: invalid instruction encoding +0x01 0x69 + +# GOOD: c.lui s2, 1 +0x05 0x69 + +# GOOD: c.lui s2, 2 +0x09 0x69 + +# GOOD: c.lui s2, 3 +0x0D 0x69 + +# GOOD: c.lui s2, 4 +0x11 0x69 + +# GOOD: c.lui s2, 5 +0x15 0x69 + +# GOOD: c.lui s2, 6 +0x19 0x69 + +# GOOD: c.lui s2, 7 +0x1D 0x69 + +# GOOD: c.lui s2, 8 +0x21 0x69 + +# GOOD: c.lui s2, 9 +0x25 0x69 + +# GOOD: c.lui s2, 10 +0x29 0x69 + +# GOOD: c.lui s2, 11 +0x2D 0x69 + +# GOOD: c.lui s2, 12 +0x31 0x69 + +# GOOD: c.lui s2, 13 +0x35 0x69 + +# GOOD: c.lui s2, 14 +0x39 0x69 + +# GOOD: c.lui s2, 15 +0x3D 0x69 + +# GOOD: c.lui s2, 16 +0x41 0x69 + +# GOOD: c.lui s2, 17 +0x45 0x69 + +# GOOD: c.lui s2, 18 +0x49 0x69 + +# GOOD: c.lui s2, 19 +0x4D 0x69 + +# GOOD: c.lui s2, 20 +0x51 0x69 + +# GOOD: c.lui s2, 21 +0x55 0x69 + +# GOOD: c.lui s2, 22 +0x59 0x69 + +# GOOD: c.lui s2, 23 +0x5D 0x69 + +# GOOD: c.lui s2, 24 +0x61 0x69 + +# GOOD: c.lui s2, 25 +0x65 0x69 + +# GOOD: c.lui s2, 26 +0x69 0x69 + +# GOOD: c.lui s2, 27 +0x6D 0x69 + +# GOOD: c.lui s2, 28 +0x71 0x69 + +# GOOD: c.lui s2, 29 +0x75 0x69 + +# GOOD: c.lui s2, 30 +0x79 0x69 + +# GOOD: c.lui s2, 31 +0x7D 0x69 + +# GOOD: c.lui s2, 1048544 +0x01 0x79 + +# GOOD: c.lui s2, 1048545 +0x05 0x79 + +# GOOD: c.lui s2, 1048546 +0x09 0x79 + +# GOOD: c.lui s2, 1048547 +0x0D 0x79 + +# GOOD: c.lui s2, 1048548 +0x11 0x79 + +# GOOD: c.lui s2, 1048549 +0x15 0x79 + +# GOOD: c.lui s2, 1048550 +0x19 0x79 + +# GOOD: c.lui s2, 1048551 +0x1D 0x79 + +# GOOD: c.lui s2, 1048552 +0x21 0x79 + +# GOOD: c.lui s2, 1048553 +0x25 0x79 + +# GOOD: c.lui s2, 1048554 +0x29 0x79 + +# GOOD: c.lui s2, 1048555 +0x2D 0x79 + +# GOOD: c.lui s2, 1048556 +0x31 0x79 + +# GOOD: c.lui s2, 1048557 +0x35 0x79 + +# GOOD: c.lui s2, 1048558 +0x39 0x79 + +# GOOD: c.lui s2, 1048559 +0x3D 0x79 + +# GOOD: c.lui s2, 1048560 +0x41 0x79 + +# GOOD: c.lui s2, 1048561 +0x45 0x79 + +# GOOD: c.lui s2, 1048562 +0x49 0x79 + +# GOOD: c.lui s2, 1048563 +0x4D 0x79 + +# GOOD: c.lui s2, 1048564 +0x51 0x79 + +# GOOD: c.lui s2, 1048565 +0x55 0x79 + +# GOOD: c.lui s2, 1048566 +0x59 0x79 + +# GOOD: c.lui s2, 1048567 +0x5D 0x79 + +# GOOD: c.lui s2, 1048568 +0x61 0x79 + +# GOOD: c.lui s2, 1048569 +0x65 0x79 + +# GOOD: c.lui s2, 1048570 +0x69 0x79 + +# GOOD: c.lui s2, 1048571 +0x6D 0x79 + +# GOOD: c.lui s2, 1048572 +0x71 0x79 + +# GOOD: c.lui s2, 1048573 +0x75 0x79 + +# GOOD: c.lui s2, 1048574 +0x79 0x79 + +# GOOD: c.lui s2, 1048575 +0x7D 0x79 + +# BAD: invalid instruction encoding +0x81 0x69 + +# GOOD: c.lui s3, 1 +0x85 0x69 + +# GOOD: c.lui s3, 2 +0x89 0x69 + +# GOOD: c.lui s3, 3 +0x8D 0x69 + +# GOOD: c.lui s3, 4 +0x91 0x69 + +# GOOD: c.lui s3, 5 +0x95 0x69 + +# GOOD: c.lui s3, 6 +0x99 0x69 + +# GOOD: c.lui s3, 7 +0x9D 0x69 + +# GOOD: c.lui s3, 8 +0xA1 0x69 + +# GOOD: c.lui s3, 9 +0xA5 0x69 + +# GOOD: c.lui s3, 10 +0xA9 0x69 + +# GOOD: c.lui s3, 11 +0xAD 0x69 + +# GOOD: c.lui s3, 12 +0xB1 0x69 + +# GOOD: c.lui s3, 13 +0xB5 0x69 + +# GOOD: c.lui s3, 14 +0xB9 0x69 + +# GOOD: c.lui s3, 15 +0xBD 0x69 + +# GOOD: c.lui s3, 16 +0xC1 0x69 + +# GOOD: c.lui s3, 17 +0xC5 0x69 + +# GOOD: c.lui s3, 18 +0xC9 0x69 + +# GOOD: c.lui s3, 19 +0xCD 0x69 + +# GOOD: c.lui s3, 20 +0xD1 0x69 + +# GOOD: 
c.lui s3, 21 +0xD5 0x69 + +# GOOD: c.lui s3, 22 +0xD9 0x69 + +# GOOD: c.lui s3, 23 +0xDD 0x69 + +# GOOD: c.lui s3, 24 +0xE1 0x69 + +# GOOD: c.lui s3, 25 +0xE5 0x69 + +# GOOD: c.lui s3, 26 +0xE9 0x69 + +# GOOD: c.lui s3, 27 +0xED 0x69 + +# GOOD: c.lui s3, 28 +0xF1 0x69 + +# GOOD: c.lui s3, 29 +0xF5 0x69 + +# GOOD: c.lui s3, 30 +0xF9 0x69 + +# GOOD: c.lui s3, 31 +0xFD 0x69 + +# GOOD: c.lui s3, 1048544 +0x81 0x79 + +# GOOD: c.lui s3, 1048545 +0x85 0x79 + +# GOOD: c.lui s3, 1048546 +0x89 0x79 + +# GOOD: c.lui s3, 1048547 +0x8D 0x79 + +# GOOD: c.lui s3, 1048548 +0x91 0x79 + +# GOOD: c.lui s3, 1048549 +0x95 0x79 + +# GOOD: c.lui s3, 1048550 +0x99 0x79 + +# GOOD: c.lui s3, 1048551 +0x9D 0x79 + +# GOOD: c.lui s3, 1048552 +0xA1 0x79 + +# GOOD: c.lui s3, 1048553 +0xA5 0x79 + +# GOOD: c.lui s3, 1048554 +0xA9 0x79 + +# GOOD: c.lui s3, 1048555 +0xAD 0x79 + +# GOOD: c.lui s3, 1048556 +0xB1 0x79 + +# GOOD: c.lui s3, 1048557 +0xB5 0x79 + +# GOOD: c.lui s3, 1048558 +0xB9 0x79 + +# GOOD: c.lui s3, 1048559 +0xBD 0x79 + +# GOOD: c.lui s3, 1048560 +0xC1 0x79 + +# GOOD: c.lui s3, 1048561 +0xC5 0x79 + +# GOOD: c.lui s3, 1048562 +0xC9 0x79 + +# GOOD: c.lui s3, 1048563 +0xCD 0x79 + +# GOOD: c.lui s3, 1048564 +0xD1 0x79 + +# GOOD: c.lui s3, 1048565 +0xD5 0x79 + +# GOOD: c.lui s3, 1048566 +0xD9 0x79 + +# GOOD: c.lui s3, 1048567 +0xDD 0x79 + +# GOOD: c.lui s3, 1048568 +0xE1 0x79 + +# GOOD: c.lui s3, 1048569 +0xE5 0x79 + +# GOOD: c.lui s3, 1048570 +0xE9 0x79 + +# GOOD: c.lui s3, 1048571 +0xED 0x79 + +# GOOD: c.lui s3, 1048572 +0xF1 0x79 + +# GOOD: c.lui s3, 1048573 +0xF5 0x79 + +# GOOD: c.lui s3, 1048574 +0xF9 0x79 + +# GOOD: c.lui s3, 1048575 +0xFD 0x79 + +# BAD: invalid instruction encoding +0x01 0x6A + +# GOOD: c.lui s4, 1 +0x05 0x6A + +# GOOD: c.lui s4, 2 +0x09 0x6A + +# GOOD: c.lui s4, 3 +0x0D 0x6A + +# GOOD: c.lui s4, 4 +0x11 0x6A + +# GOOD: c.lui s4, 5 +0x15 0x6A + +# GOOD: c.lui s4, 6 +0x19 0x6A + +# GOOD: c.lui s4, 7 +0x1D 0x6A + +# GOOD: c.lui s4, 8 +0x21 0x6A + +# GOOD: c.lui s4, 9 +0x25 0x6A + +# GOOD: c.lui s4, 10 +0x29 0x6A + +# GOOD: c.lui s4, 11 +0x2D 0x6A + +# GOOD: c.lui s4, 12 +0x31 0x6A + +# GOOD: c.lui s4, 13 +0x35 0x6A + +# GOOD: c.lui s4, 14 +0x39 0x6A + +# GOOD: c.lui s4, 15 +0x3D 0x6A + +# GOOD: c.lui s4, 16 +0x41 0x6A + +# GOOD: c.lui s4, 17 +0x45 0x6A + +# GOOD: c.lui s4, 18 +0x49 0x6A + +# GOOD: c.lui s4, 19 +0x4D 0x6A + +# GOOD: c.lui s4, 20 +0x51 0x6A + +# GOOD: c.lui s4, 21 +0x55 0x6A + +# GOOD: c.lui s4, 22 +0x59 0x6A + +# GOOD: c.lui s4, 23 +0x5D 0x6A + +# GOOD: c.lui s4, 24 +0x61 0x6A + +# GOOD: c.lui s4, 25 +0x65 0x6A + +# GOOD: c.lui s4, 26 +0x69 0x6A + +# GOOD: c.lui s4, 27 +0x6D 0x6A + +# GOOD: c.lui s4, 28 +0x71 0x6A + +# GOOD: c.lui s4, 29 +0x75 0x6A + +# GOOD: c.lui s4, 30 +0x79 0x6A + +# GOOD: c.lui s4, 31 +0x7D 0x6A + +# GOOD: c.lui s4, 1048544 +0x01 0x7A + +# GOOD: c.lui s4, 1048545 +0x05 0x7A + +# GOOD: c.lui s4, 1048546 +0x09 0x7A + +# GOOD: c.lui s4, 1048547 +0x0D 0x7A + +# GOOD: c.lui s4, 1048548 +0x11 0x7A + +# GOOD: c.lui s4, 1048549 +0x15 0x7A + +# GOOD: c.lui s4, 1048550 +0x19 0x7A + +# GOOD: c.lui s4, 1048551 +0x1D 0x7A + +# GOOD: c.lui s4, 1048552 +0x21 0x7A + +# GOOD: c.lui s4, 1048553 +0x25 0x7A + +# GOOD: c.lui s4, 1048554 +0x29 0x7A + +# GOOD: c.lui s4, 1048555 +0x2D 0x7A + +# GOOD: c.lui s4, 1048556 +0x31 0x7A + +# GOOD: c.lui s4, 1048557 +0x35 0x7A + +# GOOD: c.lui s4, 1048558 +0x39 0x7A + +# GOOD: c.lui s4, 1048559 +0x3D 0x7A + +# GOOD: c.lui s4, 1048560 +0x41 0x7A + +# GOOD: c.lui s4, 1048561 +0x45 0x7A + +# GOOD: c.lui s4, 1048562 +0x49 0x7A + +# GOOD: 
c.lui s4, 1048563 +0x4D 0x7A + +# GOOD: c.lui s4, 1048564 +0x51 0x7A + +# GOOD: c.lui s4, 1048565 +0x55 0x7A + +# GOOD: c.lui s4, 1048566 +0x59 0x7A + +# GOOD: c.lui s4, 1048567 +0x5D 0x7A + +# GOOD: c.lui s4, 1048568 +0x61 0x7A + +# GOOD: c.lui s4, 1048569 +0x65 0x7A + +# GOOD: c.lui s4, 1048570 +0x69 0x7A + +# GOOD: c.lui s4, 1048571 +0x6D 0x7A + +# GOOD: c.lui s4, 1048572 +0x71 0x7A + +# GOOD: c.lui s4, 1048573 +0x75 0x7A + +# GOOD: c.lui s4, 1048574 +0x79 0x7A + +# GOOD: c.lui s4, 1048575 +0x7D 0x7A + +# BAD: invalid instruction encoding +0x81 0x6A + +# GOOD: c.lui s5, 1 +0x85 0x6A + +# GOOD: c.lui s5, 2 +0x89 0x6A + +# GOOD: c.lui s5, 3 +0x8D 0x6A + +# GOOD: c.lui s5, 4 +0x91 0x6A + +# GOOD: c.lui s5, 5 +0x95 0x6A + +# GOOD: c.lui s5, 6 +0x99 0x6A + +# GOOD: c.lui s5, 7 +0x9D 0x6A + +# GOOD: c.lui s5, 8 +0xA1 0x6A + +# GOOD: c.lui s5, 9 +0xA5 0x6A + +# GOOD: c.lui s5, 10 +0xA9 0x6A + +# GOOD: c.lui s5, 11 +0xAD 0x6A + +# GOOD: c.lui s5, 12 +0xB1 0x6A + +# GOOD: c.lui s5, 13 +0xB5 0x6A + +# GOOD: c.lui s5, 14 +0xB9 0x6A + +# GOOD: c.lui s5, 15 +0xBD 0x6A + +# GOOD: c.lui s5, 16 +0xC1 0x6A + +# GOOD: c.lui s5, 17 +0xC5 0x6A + +# GOOD: c.lui s5, 18 +0xC9 0x6A + +# GOOD: c.lui s5, 19 +0xCD 0x6A + +# GOOD: c.lui s5, 20 +0xD1 0x6A + +# GOOD: c.lui s5, 21 +0xD5 0x6A + +# GOOD: c.lui s5, 22 +0xD9 0x6A + +# GOOD: c.lui s5, 23 +0xDD 0x6A + +# GOOD: c.lui s5, 24 +0xE1 0x6A + +# GOOD: c.lui s5, 25 +0xE5 0x6A + +# GOOD: c.lui s5, 26 +0xE9 0x6A + +# GOOD: c.lui s5, 27 +0xED 0x6A + +# GOOD: c.lui s5, 28 +0xF1 0x6A + +# GOOD: c.lui s5, 29 +0xF5 0x6A + +# GOOD: c.lui s5, 30 +0xF9 0x6A + +# GOOD: c.lui s5, 31 +0xFD 0x6A + +# GOOD: c.lui s5, 1048544 +0x81 0x7A + +# GOOD: c.lui s5, 1048545 +0x85 0x7A + +# GOOD: c.lui s5, 1048546 +0x89 0x7A + +# GOOD: c.lui s5, 1048547 +0x8D 0x7A + +# GOOD: c.lui s5, 1048548 +0x91 0x7A + +# GOOD: c.lui s5, 1048549 +0x95 0x7A + +# GOOD: c.lui s5, 1048550 +0x99 0x7A + +# GOOD: c.lui s5, 1048551 +0x9D 0x7A + +# GOOD: c.lui s5, 1048552 +0xA1 0x7A + +# GOOD: c.lui s5, 1048553 +0xA5 0x7A + +# GOOD: c.lui s5, 1048554 +0xA9 0x7A + +# GOOD: c.lui s5, 1048555 +0xAD 0x7A + +# GOOD: c.lui s5, 1048556 +0xB1 0x7A + +# GOOD: c.lui s5, 1048557 +0xB5 0x7A + +# GOOD: c.lui s5, 1048558 +0xB9 0x7A + +# GOOD: c.lui s5, 1048559 +0xBD 0x7A + +# GOOD: c.lui s5, 1048560 +0xC1 0x7A + +# GOOD: c.lui s5, 1048561 +0xC5 0x7A + +# GOOD: c.lui s5, 1048562 +0xC9 0x7A + +# GOOD: c.lui s5, 1048563 +0xCD 0x7A + +# GOOD: c.lui s5, 1048564 +0xD1 0x7A + +# GOOD: c.lui s5, 1048565 +0xD5 0x7A + +# GOOD: c.lui s5, 1048566 +0xD9 0x7A + +# GOOD: c.lui s5, 1048567 +0xDD 0x7A + +# GOOD: c.lui s5, 1048568 +0xE1 0x7A + +# GOOD: c.lui s5, 1048569 +0xE5 0x7A + +# GOOD: c.lui s5, 1048570 +0xE9 0x7A + +# GOOD: c.lui s5, 1048571 +0xED 0x7A + +# GOOD: c.lui s5, 1048572 +0xF1 0x7A + +# GOOD: c.lui s5, 1048573 +0xF5 0x7A + +# GOOD: c.lui s5, 1048574 +0xF9 0x7A + +# GOOD: c.lui s5, 1048575 +0xFD 0x7A + +# BAD: invalid instruction encoding +0x01 0x6B + +# GOOD: c.lui s6, 1 +0x05 0x6B + +# GOOD: c.lui s6, 2 +0x09 0x6B + +# GOOD: c.lui s6, 3 +0x0D 0x6B + +# GOOD: c.lui s6, 4 +0x11 0x6B + +# GOOD: c.lui s6, 5 +0x15 0x6B + +# GOOD: c.lui s6, 6 +0x19 0x6B + +# GOOD: c.lui s6, 7 +0x1D 0x6B + +# GOOD: c.lui s6, 8 +0x21 0x6B + +# GOOD: c.lui s6, 9 +0x25 0x6B + +# GOOD: c.lui s6, 10 +0x29 0x6B + +# GOOD: c.lui s6, 11 +0x2D 0x6B + +# GOOD: c.lui s6, 12 +0x31 0x6B + +# GOOD: c.lui s6, 13 +0x35 0x6B + +# GOOD: c.lui s6, 14 +0x39 0x6B + +# GOOD: c.lui s6, 15 +0x3D 0x6B + +# GOOD: c.lui s6, 16 +0x41 0x6B + +# GOOD: c.lui s6, 17 +0x45 0x6B + 
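The checks above and below all instantiate one CI-format pattern, so a short standalone sketch may help when reading them. This is not LLVM's RISCVDisassembler and none of it is taken from the patch; it is a minimal decoder, written against the RVC encoding as these checks exercise it, for the three behaviors the prefixes distinguish: plain c.lui, the rd=x0 hint slots (flagged invalid under NOHINTS), and the nzimm=0 slots that Zcmop repurposes as c.mop.N (the MOP lines).

// decode_clui.cpp -- editorial sketch, not LLVM's disassembler; all names
// here are hypothetical. c.lui is CI format: op=01 in bits 1:0, funct3=011
// in bits 15:13, rd in bits 11:7, and a nonzero 6-bit immediate assembled
// from bit 12 (high) and bits 6:2 (low).
#include <cstdint>
#include <cstdio>

int main() {
  // The test lists bytes little-endian, so "0x29 0x60" is halfword 0x6029.
  const uint16_t Insts[] = {0x6029, 0x7001, 0x6081, 0x6201};
  for (uint16_t Inst : Insts) {
    if ((Inst & 0x3) != 0x1 || ((Inst >> 13) & 0x7) != 0x3) {
      printf("0x%04x: not in the c.lui encoding space\n", Inst);
      continue;
    }
    unsigned Rd = (Inst >> 7) & 0x1f; // rd=x2 encodes c.addi16sp; not handled
    int Imm = (((Inst >> 12) & 0x1) << 5) | ((Inst >> 2) & 0x1f);
    Imm = (Imm ^ 0x20) - 0x20;        // sign-extend the 6-bit field
    unsigned Printed = Imm & 0xfffff; // printed as a 20-bit lui-style operand
    if (Imm == 0)                     // nzimm=0 slots are reserved...
      printf("0x%04x: c.mop.N (Zcmop) or invalid: %s\n", Inst,
             (Rd & 1) && Rd <= 15 ? "c.mop" : "invalid");
    else if (Rd == 0)                 // rd=x0 is a hint, rejected by NOHINTS
      printf("0x%04x: c.lui zero, %u (hint)\n", Inst, Printed);
    else
      printf("0x%04x: c.lui x%u, %u\n", Inst, Rd, Printed);
  }
  return 0;
}

Compiled and run, the sketch reproduces four representative cases from the test: 0x6029 as the c.lui zero, 10 hint, 0x7001 as the c.lui zero, 1048544 hint, 0x6081 as a Zcmop slot (c.mop.1), and 0x6201 (would-be c.lui tp, 0) as invalid.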
+# GOOD: c.lui s6, 18 +0x49 0x6B + +# GOOD: c.lui s6, 19 +0x4D 0x6B + +# GOOD: c.lui s6, 20 +0x51 0x6B + +# GOOD: c.lui s6, 21 +0x55 0x6B + +# GOOD: c.lui s6, 22 +0x59 0x6B + +# GOOD: c.lui s6, 23 +0x5D 0x6B + +# GOOD: c.lui s6, 24 +0x61 0x6B + +# GOOD: c.lui s6, 25 +0x65 0x6B + +# GOOD: c.lui s6, 26 +0x69 0x6B + +# GOOD: c.lui s6, 27 +0x6D 0x6B + +# GOOD: c.lui s6, 28 +0x71 0x6B + +# GOOD: c.lui s6, 29 +0x75 0x6B + +# GOOD: c.lui s6, 30 +0x79 0x6B + +# GOOD: c.lui s6, 31 +0x7D 0x6B + +# GOOD: c.lui s6, 1048544 +0x01 0x7B + +# GOOD: c.lui s6, 1048545 +0x05 0x7B + +# GOOD: c.lui s6, 1048546 +0x09 0x7B + +# GOOD: c.lui s6, 1048547 +0x0D 0x7B + +# GOOD: c.lui s6, 1048548 +0x11 0x7B + +# GOOD: c.lui s6, 1048549 +0x15 0x7B + +# GOOD: c.lui s6, 1048550 +0x19 0x7B + +# GOOD: c.lui s6, 1048551 +0x1D 0x7B + +# GOOD: c.lui s6, 1048552 +0x21 0x7B + +# GOOD: c.lui s6, 1048553 +0x25 0x7B + +# GOOD: c.lui s6, 1048554 +0x29 0x7B + +# GOOD: c.lui s6, 1048555 +0x2D 0x7B + +# GOOD: c.lui s6, 1048556 +0x31 0x7B + +# GOOD: c.lui s6, 1048557 +0x35 0x7B + +# GOOD: c.lui s6, 1048558 +0x39 0x7B + +# GOOD: c.lui s6, 1048559 +0x3D 0x7B + +# GOOD: c.lui s6, 1048560 +0x41 0x7B + +# GOOD: c.lui s6, 1048561 +0x45 0x7B + +# GOOD: c.lui s6, 1048562 +0x49 0x7B + +# GOOD: c.lui s6, 1048563 +0x4D 0x7B + +# GOOD: c.lui s6, 1048564 +0x51 0x7B + +# GOOD: c.lui s6, 1048565 +0x55 0x7B + +# GOOD: c.lui s6, 1048566 +0x59 0x7B + +# GOOD: c.lui s6, 1048567 +0x5D 0x7B + +# GOOD: c.lui s6, 1048568 +0x61 0x7B + +# GOOD: c.lui s6, 1048569 +0x65 0x7B + +# GOOD: c.lui s6, 1048570 +0x69 0x7B + +# GOOD: c.lui s6, 1048571 +0x6D 0x7B + +# GOOD: c.lui s6, 1048572 +0x71 0x7B + +# GOOD: c.lui s6, 1048573 +0x75 0x7B + +# GOOD: c.lui s6, 1048574 +0x79 0x7B + +# GOOD: c.lui s6, 1048575 +0x7D 0x7B + +# BAD: invalid instruction encoding +0x81 0x6B + +# GOOD: c.lui s7, 1 +0x85 0x6B + +# GOOD: c.lui s7, 2 +0x89 0x6B + +# GOOD: c.lui s7, 3 +0x8D 0x6B + +# GOOD: c.lui s7, 4 +0x91 0x6B + +# GOOD: c.lui s7, 5 +0x95 0x6B + +# GOOD: c.lui s7, 6 +0x99 0x6B + +# GOOD: c.lui s7, 7 +0x9D 0x6B + +# GOOD: c.lui s7, 8 +0xA1 0x6B + +# GOOD: c.lui s7, 9 +0xA5 0x6B + +# GOOD: c.lui s7, 10 +0xA9 0x6B + +# GOOD: c.lui s7, 11 +0xAD 0x6B + +# GOOD: c.lui s7, 12 +0xB1 0x6B + +# GOOD: c.lui s7, 13 +0xB5 0x6B + +# GOOD: c.lui s7, 14 +0xB9 0x6B + +# GOOD: c.lui s7, 15 +0xBD 0x6B + +# GOOD: c.lui s7, 16 +0xC1 0x6B + +# GOOD: c.lui s7, 17 +0xC5 0x6B + +# GOOD: c.lui s7, 18 +0xC9 0x6B + +# GOOD: c.lui s7, 19 +0xCD 0x6B + +# GOOD: c.lui s7, 20 +0xD1 0x6B + +# GOOD: c.lui s7, 21 +0xD5 0x6B + +# GOOD: c.lui s7, 22 +0xD9 0x6B + +# GOOD: c.lui s7, 23 +0xDD 0x6B + +# GOOD: c.lui s7, 24 +0xE1 0x6B + +# GOOD: c.lui s7, 25 +0xE5 0x6B + +# GOOD: c.lui s7, 26 +0xE9 0x6B + +# GOOD: c.lui s7, 27 +0xED 0x6B + +# GOOD: c.lui s7, 28 +0xF1 0x6B + +# GOOD: c.lui s7, 29 +0xF5 0x6B + +# GOOD: c.lui s7, 30 +0xF9 0x6B + +# GOOD: c.lui s7, 31 +0xFD 0x6B + +# GOOD: c.lui s7, 1048544 +0x81 0x7B + +# GOOD: c.lui s7, 1048545 +0x85 0x7B + +# GOOD: c.lui s7, 1048546 +0x89 0x7B + +# GOOD: c.lui s7, 1048547 +0x8D 0x7B + +# GOOD: c.lui s7, 1048548 +0x91 0x7B + +# GOOD: c.lui s7, 1048549 +0x95 0x7B + +# GOOD: c.lui s7, 1048550 +0x99 0x7B + +# GOOD: c.lui s7, 1048551 +0x9D 0x7B + +# GOOD: c.lui s7, 1048552 +0xA1 0x7B + +# GOOD: c.lui s7, 1048553 +0xA5 0x7B + +# GOOD: c.lui s7, 1048554 +0xA9 0x7B + +# GOOD: c.lui s7, 1048555 +0xAD 0x7B + +# GOOD: c.lui s7, 1048556 +0xB1 0x7B + +# GOOD: c.lui s7, 1048557 +0xB5 0x7B + +# GOOD: c.lui s7, 1048558 +0xB9 0x7B + +# GOOD: c.lui s7, 1048559 +0xBD 0x7B + +# GOOD: c.lui s7, 
1048560 +0xC1 0x7B + +# GOOD: c.lui s7, 1048561 +0xC5 0x7B + +# GOOD: c.lui s7, 1048562 +0xC9 0x7B + +# GOOD: c.lui s7, 1048563 +0xCD 0x7B + +# GOOD: c.lui s7, 1048564 +0xD1 0x7B + +# GOOD: c.lui s7, 1048565 +0xD5 0x7B + +# GOOD: c.lui s7, 1048566 +0xD9 0x7B + +# GOOD: c.lui s7, 1048567 +0xDD 0x7B + +# GOOD: c.lui s7, 1048568 +0xE1 0x7B + +# GOOD: c.lui s7, 1048569 +0xE5 0x7B + +# GOOD: c.lui s7, 1048570 +0xE9 0x7B + +# GOOD: c.lui s7, 1048571 +0xED 0x7B + +# GOOD: c.lui s7, 1048572 +0xF1 0x7B + +# GOOD: c.lui s7, 1048573 +0xF5 0x7B + +# GOOD: c.lui s7, 1048574 +0xF9 0x7B + +# GOOD: c.lui s7, 1048575 +0xFD 0x7B + +# BAD: invalid instruction encoding +0x01 0x6C + +# GOOD: c.lui s8, 1 +0x05 0x6C + +# GOOD: c.lui s8, 2 +0x09 0x6C + +# GOOD: c.lui s8, 3 +0x0D 0x6C + +# GOOD: c.lui s8, 4 +0x11 0x6C + +# GOOD: c.lui s8, 5 +0x15 0x6C + +# GOOD: c.lui s8, 6 +0x19 0x6C + +# GOOD: c.lui s8, 7 +0x1D 0x6C + +# GOOD: c.lui s8, 8 +0x21 0x6C + +# GOOD: c.lui s8, 9 +0x25 0x6C + +# GOOD: c.lui s8, 10 +0x29 0x6C + +# GOOD: c.lui s8, 11 +0x2D 0x6C + +# GOOD: c.lui s8, 12 +0x31 0x6C + +# GOOD: c.lui s8, 13 +0x35 0x6C + +# GOOD: c.lui s8, 14 +0x39 0x6C + +# GOOD: c.lui s8, 15 +0x3D 0x6C + +# GOOD: c.lui s8, 16 +0x41 0x6C + +# GOOD: c.lui s8, 17 +0x45 0x6C + +# GOOD: c.lui s8, 18 +0x49 0x6C + +# GOOD: c.lui s8, 19 +0x4D 0x6C + +# GOOD: c.lui s8, 20 +0x51 0x6C + +# GOOD: c.lui s8, 21 +0x55 0x6C + +# GOOD: c.lui s8, 22 +0x59 0x6C + +# GOOD: c.lui s8, 23 +0x5D 0x6C + +# GOOD: c.lui s8, 24 +0x61 0x6C + +# GOOD: c.lui s8, 25 +0x65 0x6C + +# GOOD: c.lui s8, 26 +0x69 0x6C + +# GOOD: c.lui s8, 27 +0x6D 0x6C + +# GOOD: c.lui s8, 28 +0x71 0x6C + +# GOOD: c.lui s8, 29 +0x75 0x6C + +# GOOD: c.lui s8, 30 +0x79 0x6C + +# GOOD: c.lui s8, 31 +0x7D 0x6C + +# GOOD: c.lui s8, 1048544 +0x01 0x7C + +# GOOD: c.lui s8, 1048545 +0x05 0x7C + +# GOOD: c.lui s8, 1048546 +0x09 0x7C + +# GOOD: c.lui s8, 1048547 +0x0D 0x7C + +# GOOD: c.lui s8, 1048548 +0x11 0x7C + +# GOOD: c.lui s8, 1048549 +0x15 0x7C + +# GOOD: c.lui s8, 1048550 +0x19 0x7C + +# GOOD: c.lui s8, 1048551 +0x1D 0x7C + +# GOOD: c.lui s8, 1048552 +0x21 0x7C + +# GOOD: c.lui s8, 1048553 +0x25 0x7C + +# GOOD: c.lui s8, 1048554 +0x29 0x7C + +# GOOD: c.lui s8, 1048555 +0x2D 0x7C + +# GOOD: c.lui s8, 1048556 +0x31 0x7C + +# GOOD: c.lui s8, 1048557 +0x35 0x7C + +# GOOD: c.lui s8, 1048558 +0x39 0x7C + +# GOOD: c.lui s8, 1048559 +0x3D 0x7C + +# GOOD: c.lui s8, 1048560 +0x41 0x7C + +# GOOD: c.lui s8, 1048561 +0x45 0x7C + +# GOOD: c.lui s8, 1048562 +0x49 0x7C + +# GOOD: c.lui s8, 1048563 +0x4D 0x7C + +# GOOD: c.lui s8, 1048564 +0x51 0x7C + +# GOOD: c.lui s8, 1048565 +0x55 0x7C + +# GOOD: c.lui s8, 1048566 +0x59 0x7C + +# GOOD: c.lui s8, 1048567 +0x5D 0x7C + +# GOOD: c.lui s8, 1048568 +0x61 0x7C + +# GOOD: c.lui s8, 1048569 +0x65 0x7C + +# GOOD: c.lui s8, 1048570 +0x69 0x7C + +# GOOD: c.lui s8, 1048571 +0x6D 0x7C + +# GOOD: c.lui s8, 1048572 +0x71 0x7C + +# GOOD: c.lui s8, 1048573 +0x75 0x7C + +# GOOD: c.lui s8, 1048574 +0x79 0x7C + +# GOOD: c.lui s8, 1048575 +0x7D 0x7C + +# BAD: invalid instruction encoding +0x81 0x6C + +# GOOD: c.lui s9, 1 +0x85 0x6C + +# GOOD: c.lui s9, 2 +0x89 0x6C + +# GOOD: c.lui s9, 3 +0x8D 0x6C + +# GOOD: c.lui s9, 4 +0x91 0x6C + +# GOOD: c.lui s9, 5 +0x95 0x6C + +# GOOD: c.lui s9, 6 +0x99 0x6C + +# GOOD: c.lui s9, 7 +0x9D 0x6C + +# GOOD: c.lui s9, 8 +0xA1 0x6C + +# GOOD: c.lui s9, 9 +0xA5 0x6C + +# GOOD: c.lui s9, 10 +0xA9 0x6C + +# GOOD: c.lui s9, 11 +0xAD 0x6C + +# GOOD: c.lui s9, 12 +0xB1 0x6C + +# GOOD: c.lui s9, 13 +0xB5 0x6C + +# GOOD: c.lui s9, 14 +0xB9 
0x6C + +# GOOD: c.lui s9, 15 +0xBD 0x6C + +# GOOD: c.lui s9, 16 +0xC1 0x6C + +# GOOD: c.lui s9, 17 +0xC5 0x6C + +# GOOD: c.lui s9, 18 +0xC9 0x6C + +# GOOD: c.lui s9, 19 +0xCD 0x6C + +# GOOD: c.lui s9, 20 +0xD1 0x6C + +# GOOD: c.lui s9, 21 +0xD5 0x6C + +# GOOD: c.lui s9, 22 +0xD9 0x6C + +# GOOD: c.lui s9, 23 +0xDD 0x6C + +# GOOD: c.lui s9, 24 +0xE1 0x6C + +# GOOD: c.lui s9, 25 +0xE5 0x6C + +# GOOD: c.lui s9, 26 +0xE9 0x6C + +# GOOD: c.lui s9, 27 +0xED 0x6C + +# GOOD: c.lui s9, 28 +0xF1 0x6C + +# GOOD: c.lui s9, 29 +0xF5 0x6C + +# GOOD: c.lui s9, 30 +0xF9 0x6C + +# GOOD: c.lui s9, 31 +0xFD 0x6C + +# GOOD: c.lui s9, 1048544 +0x81 0x7C + +# GOOD: c.lui s9, 1048545 +0x85 0x7C + +# GOOD: c.lui s9, 1048546 +0x89 0x7C + +# GOOD: c.lui s9, 1048547 +0x8D 0x7C + +# GOOD: c.lui s9, 1048548 +0x91 0x7C + +# GOOD: c.lui s9, 1048549 +0x95 0x7C + +# GOOD: c.lui s9, 1048550 +0x99 0x7C + +# GOOD: c.lui s9, 1048551 +0x9D 0x7C + +# GOOD: c.lui s9, 1048552 +0xA1 0x7C + +# GOOD: c.lui s9, 1048553 +0xA5 0x7C + +# GOOD: c.lui s9, 1048554 +0xA9 0x7C + +# GOOD: c.lui s9, 1048555 +0xAD 0x7C + +# GOOD: c.lui s9, 1048556 +0xB1 0x7C + +# GOOD: c.lui s9, 1048557 +0xB5 0x7C + +# GOOD: c.lui s9, 1048558 +0xB9 0x7C + +# GOOD: c.lui s9, 1048559 +0xBD 0x7C + +# GOOD: c.lui s9, 1048560 +0xC1 0x7C + +# GOOD: c.lui s9, 1048561 +0xC5 0x7C + +# GOOD: c.lui s9, 1048562 +0xC9 0x7C + +# GOOD: c.lui s9, 1048563 +0xCD 0x7C + +# GOOD: c.lui s9, 1048564 +0xD1 0x7C + +# GOOD: c.lui s9, 1048565 +0xD5 0x7C + +# GOOD: c.lui s9, 1048566 +0xD9 0x7C + +# GOOD: c.lui s9, 1048567 +0xDD 0x7C + +# GOOD: c.lui s9, 1048568 +0xE1 0x7C + +# GOOD: c.lui s9, 1048569 +0xE5 0x7C + +# GOOD: c.lui s9, 1048570 +0xE9 0x7C + +# GOOD: c.lui s9, 1048571 +0xED 0x7C + +# GOOD: c.lui s9, 1048572 +0xF1 0x7C + +# GOOD: c.lui s9, 1048573 +0xF5 0x7C + +# GOOD: c.lui s9, 1048574 +0xF9 0x7C + +# GOOD: c.lui s9, 1048575 +0xFD 0x7C + +# BAD: invalid instruction encoding +0x01 0x6D + +# GOOD: c.lui s10, 1 +0x05 0x6D + +# GOOD: c.lui s10, 2 +0x09 0x6D + +# GOOD: c.lui s10, 3 +0x0D 0x6D + +# GOOD: c.lui s10, 4 +0x11 0x6D + +# GOOD: c.lui s10, 5 +0x15 0x6D + +# GOOD: c.lui s10, 6 +0x19 0x6D + +# GOOD: c.lui s10, 7 +0x1D 0x6D + +# GOOD: c.lui s10, 8 +0x21 0x6D + +# GOOD: c.lui s10, 9 +0x25 0x6D + +# GOOD: c.lui s10, 10 +0x29 0x6D + +# GOOD: c.lui s10, 11 +0x2D 0x6D + +# GOOD: c.lui s10, 12 +0x31 0x6D + +# GOOD: c.lui s10, 13 +0x35 0x6D + +# GOOD: c.lui s10, 14 +0x39 0x6D + +# GOOD: c.lui s10, 15 +0x3D 0x6D + +# GOOD: c.lui s10, 16 +0x41 0x6D + +# GOOD: c.lui s10, 17 +0x45 0x6D + +# GOOD: c.lui s10, 18 +0x49 0x6D + +# GOOD: c.lui s10, 19 +0x4D 0x6D + +# GOOD: c.lui s10, 20 +0x51 0x6D + +# GOOD: c.lui s10, 21 +0x55 0x6D + +# GOOD: c.lui s10, 22 +0x59 0x6D + +# GOOD: c.lui s10, 23 +0x5D 0x6D + +# GOOD: c.lui s10, 24 +0x61 0x6D + +# GOOD: c.lui s10, 25 +0x65 0x6D + +# GOOD: c.lui s10, 26 +0x69 0x6D + +# GOOD: c.lui s10, 27 +0x6D 0x6D + +# GOOD: c.lui s10, 28 +0x71 0x6D + +# GOOD: c.lui s10, 29 +0x75 0x6D + +# GOOD: c.lui s10, 30 +0x79 0x6D + +# GOOD: c.lui s10, 31 +0x7D 0x6D + +# GOOD: c.lui s10, 1048544 +0x01 0x7D + +# GOOD: c.lui s10, 1048545 +0x05 0x7D + +# GOOD: c.lui s10, 1048546 +0x09 0x7D + +# GOOD: c.lui s10, 1048547 +0x0D 0x7D + +# GOOD: c.lui s10, 1048548 +0x11 0x7D + +# GOOD: c.lui s10, 1048549 +0x15 0x7D + +# GOOD: c.lui s10, 1048550 +0x19 0x7D + +# GOOD: c.lui s10, 1048551 +0x1D 0x7D + +# GOOD: c.lui s10, 1048552 +0x21 0x7D + +# GOOD: c.lui s10, 1048553 +0x25 0x7D + +# GOOD: c.lui s10, 1048554 +0x29 0x7D + +# GOOD: c.lui s10, 1048555 +0x2D 0x7D + +# GOOD: c.lui s10, 
1048556 +0x31 0x7D + +# GOOD: c.lui s10, 1048557 +0x35 0x7D + +# GOOD: c.lui s10, 1048558 +0x39 0x7D + +# GOOD: c.lui s10, 1048559 +0x3D 0x7D + +# GOOD: c.lui s10, 1048560 +0x41 0x7D + +# GOOD: c.lui s10, 1048561 +0x45 0x7D + +# GOOD: c.lui s10, 1048562 +0x49 0x7D + +# GOOD: c.lui s10, 1048563 +0x4D 0x7D + +# GOOD: c.lui s10, 1048564 +0x51 0x7D + +# GOOD: c.lui s10, 1048565 +0x55 0x7D + +# GOOD: c.lui s10, 1048566 +0x59 0x7D + +# GOOD: c.lui s10, 1048567 +0x5D 0x7D + +# GOOD: c.lui s10, 1048568 +0x61 0x7D + +# GOOD: c.lui s10, 1048569 +0x65 0x7D + +# GOOD: c.lui s10, 1048570 +0x69 0x7D + +# GOOD: c.lui s10, 1048571 +0x6D 0x7D + +# GOOD: c.lui s10, 1048572 +0x71 0x7D + +# GOOD: c.lui s10, 1048573 +0x75 0x7D + +# GOOD: c.lui s10, 1048574 +0x79 0x7D + +# GOOD: c.lui s10, 1048575 +0x7D 0x7D + +# BAD: invalid instruction encoding +0x81 0x6D + +# GOOD: c.lui s11, 1 +0x85 0x6D + +# GOOD: c.lui s11, 2 +0x89 0x6D + +# GOOD: c.lui s11, 3 +0x8D 0x6D + +# GOOD: c.lui s11, 4 +0x91 0x6D + +# GOOD: c.lui s11, 5 +0x95 0x6D + +# GOOD: c.lui s11, 6 +0x99 0x6D + +# GOOD: c.lui s11, 7 +0x9D 0x6D + +# GOOD: c.lui s11, 8 +0xA1 0x6D + +# GOOD: c.lui s11, 9 +0xA5 0x6D + +# GOOD: c.lui s11, 10 +0xA9 0x6D + +# GOOD: c.lui s11, 11 +0xAD 0x6D + +# GOOD: c.lui s11, 12 +0xB1 0x6D + +# GOOD: c.lui s11, 13 +0xB5 0x6D + +# GOOD: c.lui s11, 14 +0xB9 0x6D + +# GOOD: c.lui s11, 15 +0xBD 0x6D + +# GOOD: c.lui s11, 16 +0xC1 0x6D + +# GOOD: c.lui s11, 17 +0xC5 0x6D + +# GOOD: c.lui s11, 18 +0xC9 0x6D + +# GOOD: c.lui s11, 19 +0xCD 0x6D + +# GOOD: c.lui s11, 20 +0xD1 0x6D + +# GOOD: c.lui s11, 21 +0xD5 0x6D + +# GOOD: c.lui s11, 22 +0xD9 0x6D + +# GOOD: c.lui s11, 23 +0xDD 0x6D + +# GOOD: c.lui s11, 24 +0xE1 0x6D + +# GOOD: c.lui s11, 25 +0xE5 0x6D + +# GOOD: c.lui s11, 26 +0xE9 0x6D + +# GOOD: c.lui s11, 27 +0xED 0x6D + +# GOOD: c.lui s11, 28 +0xF1 0x6D + +# GOOD: c.lui s11, 29 +0xF5 0x6D + +# GOOD: c.lui s11, 30 +0xF9 0x6D + +# GOOD: c.lui s11, 31 +0xFD 0x6D + +# GOOD: c.lui s11, 1048544 +0x81 0x7D + +# GOOD: c.lui s11, 1048545 +0x85 0x7D + +# GOOD: c.lui s11, 1048546 +0x89 0x7D + +# GOOD: c.lui s11, 1048547 +0x8D 0x7D + +# GOOD: c.lui s11, 1048548 +0x91 0x7D + +# GOOD: c.lui s11, 1048549 +0x95 0x7D + +# GOOD: c.lui s11, 1048550 +0x99 0x7D + +# GOOD: c.lui s11, 1048551 +0x9D 0x7D + +# GOOD: c.lui s11, 1048552 +0xA1 0x7D + +# GOOD: c.lui s11, 1048553 +0xA5 0x7D + +# GOOD: c.lui s11, 1048554 +0xA9 0x7D + +# GOOD: c.lui s11, 1048555 +0xAD 0x7D + +# GOOD: c.lui s11, 1048556 +0xB1 0x7D + +# GOOD: c.lui s11, 1048557 +0xB5 0x7D + +# GOOD: c.lui s11, 1048558 +0xB9 0x7D + +# GOOD: c.lui s11, 1048559 +0xBD 0x7D + +# GOOD: c.lui s11, 1048560 +0xC1 0x7D + +# GOOD: c.lui s11, 1048561 +0xC5 0x7D + +# GOOD: c.lui s11, 1048562 +0xC9 0x7D + +# GOOD: c.lui s11, 1048563 +0xCD 0x7D + +# GOOD: c.lui s11, 1048564 +0xD1 0x7D + +# GOOD: c.lui s11, 1048565 +0xD5 0x7D + +# GOOD: c.lui s11, 1048566 +0xD9 0x7D + +# GOOD: c.lui s11, 1048567 +0xDD 0x7D + +# GOOD: c.lui s11, 1048568 +0xE1 0x7D + +# GOOD: c.lui s11, 1048569 +0xE5 0x7D + +# GOOD: c.lui s11, 1048570 +0xE9 0x7D + +# GOOD: c.lui s11, 1048571 +0xED 0x7D + +# GOOD: c.lui s11, 1048572 +0xF1 0x7D + +# GOOD: c.lui s11, 1048573 +0xF5 0x7D + +# GOOD: c.lui s11, 1048574 +0xF9 0x7D + +# GOOD: c.lui s11, 1048575 +0xFD 0x7D + +# BAD: invalid instruction encoding +0x01 0x6E + +# GOOD: c.lui t3, 1 +0x05 0x6E + +# GOOD: c.lui t3, 2 +0x09 0x6E + +# GOOD: c.lui t3, 3 +0x0D 0x6E + +# GOOD: c.lui t3, 4 +0x11 0x6E + +# GOOD: c.lui t3, 5 +0x15 0x6E + +# GOOD: c.lui t3, 6 +0x19 0x6E + +# GOOD: c.lui t3, 7 +0x1D 0x6E 
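One detail worth flagging, as an editorial gloss rather than anything the test states: each register's immediate sequence jumps from 31 straight to 1048544 because the 6-bit nzimm field is sign-extended and then printed as a 20-bit lui-style operand, i.e. printed = nzimm mod 2^20, so nzimm in {-32, ..., -1} prints as {1048544, ..., 1048575}.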
+ +# GOOD: c.lui t3, 8 +0x21 0x6E + +# GOOD: c.lui t3, 9 +0x25 0x6E + +# GOOD: c.lui t3, 10 +0x29 0x6E + +# GOOD: c.lui t3, 11 +0x2D 0x6E + +# GOOD: c.lui t3, 12 +0x31 0x6E + +# GOOD: c.lui t3, 13 +0x35 0x6E + +# GOOD: c.lui t3, 14 +0x39 0x6E + +# GOOD: c.lui t3, 15 +0x3D 0x6E + +# GOOD: c.lui t3, 16 +0x41 0x6E + +# GOOD: c.lui t3, 17 +0x45 0x6E + +# GOOD: c.lui t3, 18 +0x49 0x6E + +# GOOD: c.lui t3, 19 +0x4D 0x6E + +# GOOD: c.lui t3, 20 +0x51 0x6E + +# GOOD: c.lui t3, 21 +0x55 0x6E + +# GOOD: c.lui t3, 22 +0x59 0x6E + +# GOOD: c.lui t3, 23 +0x5D 0x6E + +# GOOD: c.lui t3, 24 +0x61 0x6E + +# GOOD: c.lui t3, 25 +0x65 0x6E + +# GOOD: c.lui t3, 26 +0x69 0x6E + +# GOOD: c.lui t3, 27 +0x6D 0x6E + +# GOOD: c.lui t3, 28 +0x71 0x6E + +# GOOD: c.lui t3, 29 +0x75 0x6E + +# GOOD: c.lui t3, 30 +0x79 0x6E + +# GOOD: c.lui t3, 31 +0x7D 0x6E + +# GOOD: c.lui t3, 1048544 +0x01 0x7E + +# GOOD: c.lui t3, 1048545 +0x05 0x7E + +# GOOD: c.lui t3, 1048546 +0x09 0x7E + +# GOOD: c.lui t3, 1048547 +0x0D 0x7E + +# GOOD: c.lui t3, 1048548 +0x11 0x7E + +# GOOD: c.lui t3, 1048549 +0x15 0x7E + +# GOOD: c.lui t3, 1048550 +0x19 0x7E + +# GOOD: c.lui t3, 1048551 +0x1D 0x7E + +# GOOD: c.lui t3, 1048552 +0x21 0x7E + +# GOOD: c.lui t3, 1048553 +0x25 0x7E + +# GOOD: c.lui t3, 1048554 +0x29 0x7E + +# GOOD: c.lui t3, 1048555 +0x2D 0x7E + +# GOOD: c.lui t3, 1048556 +0x31 0x7E + +# GOOD: c.lui t3, 1048557 +0x35 0x7E + +# GOOD: c.lui t3, 1048558 +0x39 0x7E + +# GOOD: c.lui t3, 1048559 +0x3D 0x7E + +# GOOD: c.lui t3, 1048560 +0x41 0x7E + +# GOOD: c.lui t3, 1048561 +0x45 0x7E + +# GOOD: c.lui t3, 1048562 +0x49 0x7E + +# GOOD: c.lui t3, 1048563 +0x4D 0x7E + +# GOOD: c.lui t3, 1048564 +0x51 0x7E + +# GOOD: c.lui t3, 1048565 +0x55 0x7E + +# GOOD: c.lui t3, 1048566 +0x59 0x7E + +# GOOD: c.lui t3, 1048567 +0x5D 0x7E + +# GOOD: c.lui t3, 1048568 +0x61 0x7E + +# GOOD: c.lui t3, 1048569 +0x65 0x7E + +# GOOD: c.lui t3, 1048570 +0x69 0x7E + +# GOOD: c.lui t3, 1048571 +0x6D 0x7E + +# GOOD: c.lui t3, 1048572 +0x71 0x7E + +# GOOD: c.lui t3, 1048573 +0x75 0x7E + +# GOOD: c.lui t3, 1048574 +0x79 0x7E + +# GOOD: c.lui t3, 1048575 +0x7D 0x7E + +# BAD: invalid instruction encoding +0x81 0x6E + +# GOOD: c.lui t4, 1 +0x85 0x6E + +# GOOD: c.lui t4, 2 +0x89 0x6E + +# GOOD: c.lui t4, 3 +0x8D 0x6E + +# GOOD: c.lui t4, 4 +0x91 0x6E + +# GOOD: c.lui t4, 5 +0x95 0x6E + +# GOOD: c.lui t4, 6 +0x99 0x6E + +# GOOD: c.lui t4, 7 +0x9D 0x6E + +# GOOD: c.lui t4, 8 +0xA1 0x6E + +# GOOD: c.lui t4, 9 +0xA5 0x6E + +# GOOD: c.lui t4, 10 +0xA9 0x6E + +# GOOD: c.lui t4, 11 +0xAD 0x6E + +# GOOD: c.lui t4, 12 +0xB1 0x6E + +# GOOD: c.lui t4, 13 +0xB5 0x6E + +# GOOD: c.lui t4, 14 +0xB9 0x6E + +# GOOD: c.lui t4, 15 +0xBD 0x6E + +# GOOD: c.lui t4, 16 +0xC1 0x6E + +# GOOD: c.lui t4, 17 +0xC5 0x6E + +# GOOD: c.lui t4, 18 +0xC9 0x6E + +# GOOD: c.lui t4, 19 +0xCD 0x6E + +# GOOD: c.lui t4, 20 +0xD1 0x6E + +# GOOD: c.lui t4, 21 +0xD5 0x6E + +# GOOD: c.lui t4, 22 +0xD9 0x6E + +# GOOD: c.lui t4, 23 +0xDD 0x6E + +# GOOD: c.lui t4, 24 +0xE1 0x6E + +# GOOD: c.lui t4, 25 +0xE5 0x6E + +# GOOD: c.lui t4, 26 +0xE9 0x6E + +# GOOD: c.lui t4, 27 +0xED 0x6E + +# GOOD: c.lui t4, 28 +0xF1 0x6E + +# GOOD: c.lui t4, 29 +0xF5 0x6E + +# GOOD: c.lui t4, 30 +0xF9 0x6E + +# GOOD: c.lui t4, 31 +0xFD 0x6E + +# GOOD: c.lui t4, 1048544 +0x81 0x7E + +# GOOD: c.lui t4, 1048545 +0x85 0x7E + +# GOOD: c.lui t4, 1048546 +0x89 0x7E + +# GOOD: c.lui t4, 1048547 +0x8D 0x7E + +# GOOD: c.lui t4, 1048548 +0x91 0x7E + +# GOOD: c.lui t4, 1048549 +0x95 0x7E + +# GOOD: c.lui t4, 1048550 +0x99 0x7E + +# GOOD: c.lui t4, 1048551 
+0x9D 0x7E + +# GOOD: c.lui t4, 1048552 +0xA1 0x7E + +# GOOD: c.lui t4, 1048553 +0xA5 0x7E + +# GOOD: c.lui t4, 1048554 +0xA9 0x7E + +# GOOD: c.lui t4, 1048555 +0xAD 0x7E + +# GOOD: c.lui t4, 1048556 +0xB1 0x7E + +# GOOD: c.lui t4, 1048557 +0xB5 0x7E + +# GOOD: c.lui t4, 1048558 +0xB9 0x7E + +# GOOD: c.lui t4, 1048559 +0xBD 0x7E + +# GOOD: c.lui t4, 1048560 +0xC1 0x7E + +# GOOD: c.lui t4, 1048561 +0xC5 0x7E + +# GOOD: c.lui t4, 1048562 +0xC9 0x7E + +# GOOD: c.lui t4, 1048563 +0xCD 0x7E + +# GOOD: c.lui t4, 1048564 +0xD1 0x7E + +# GOOD: c.lui t4, 1048565 +0xD5 0x7E + +# GOOD: c.lui t4, 1048566 +0xD9 0x7E + +# GOOD: c.lui t4, 1048567 +0xDD 0x7E + +# GOOD: c.lui t4, 1048568 +0xE1 0x7E + +# GOOD: c.lui t4, 1048569 +0xE5 0x7E + +# GOOD: c.lui t4, 1048570 +0xE9 0x7E + +# GOOD: c.lui t4, 1048571 +0xED 0x7E + +# GOOD: c.lui t4, 1048572 +0xF1 0x7E + +# GOOD: c.lui t4, 1048573 +0xF5 0x7E + +# GOOD: c.lui t4, 1048574 +0xF9 0x7E + +# GOOD: c.lui t4, 1048575 +0xFD 0x7E + +# BAD: invalid instruction encoding +0x01 0x6F + +# GOOD: c.lui t5, 1 +0x05 0x6F + +# GOOD: c.lui t5, 2 +0x09 0x6F + +# GOOD: c.lui t5, 3 +0x0D 0x6F + +# GOOD: c.lui t5, 4 +0x11 0x6F + +# GOOD: c.lui t5, 5 +0x15 0x6F + +# GOOD: c.lui t5, 6 +0x19 0x6F + +# GOOD: c.lui t5, 7 +0x1D 0x6F + +# GOOD: c.lui t5, 8 +0x21 0x6F + +# GOOD: c.lui t5, 9 +0x25 0x6F + +# GOOD: c.lui t5, 10 +0x29 0x6F + +# GOOD: c.lui t5, 11 +0x2D 0x6F + +# GOOD: c.lui t5, 12 +0x31 0x6F + +# GOOD: c.lui t5, 13 +0x35 0x6F + +# GOOD: c.lui t5, 14 +0x39 0x6F + +# GOOD: c.lui t5, 15 +0x3D 0x6F + +# GOOD: c.lui t5, 16 +0x41 0x6F + +# GOOD: c.lui t5, 17 +0x45 0x6F + +# GOOD: c.lui t5, 18 +0x49 0x6F + +# GOOD: c.lui t5, 19 +0x4D 0x6F + +# GOOD: c.lui t5, 20 +0x51 0x6F + +# GOOD: c.lui t5, 21 +0x55 0x6F + +# GOOD: c.lui t5, 22 +0x59 0x6F + +# GOOD: c.lui t5, 23 +0x5D 0x6F + +# GOOD: c.lui t5, 24 +0x61 0x6F + +# GOOD: c.lui t5, 25 +0x65 0x6F + +# GOOD: c.lui t5, 26 +0x69 0x6F + +# GOOD: c.lui t5, 27 +0x6D 0x6F + +# GOOD: c.lui t5, 28 +0x71 0x6F + +# GOOD: c.lui t5, 29 +0x75 0x6F + +# GOOD: c.lui t5, 30 +0x79 0x6F + +# GOOD: c.lui t5, 31 +0x7D 0x6F + +# GOOD: c.lui t5, 1048544 +0x01 0x7F + +# GOOD: c.lui t5, 1048545 +0x05 0x7F + +# GOOD: c.lui t5, 1048546 +0x09 0x7F + +# GOOD: c.lui t5, 1048547 +0x0D 0x7F + +# GOOD: c.lui t5, 1048548 +0x11 0x7F + +# GOOD: c.lui t5, 1048549 +0x15 0x7F + +# GOOD: c.lui t5, 1048550 +0x19 0x7F + +# GOOD: c.lui t5, 1048551 +0x1D 0x7F + +# GOOD: c.lui t5, 1048552 +0x21 0x7F + +# GOOD: c.lui t5, 1048553 +0x25 0x7F + +# GOOD: c.lui t5, 1048554 +0x29 0x7F + +# GOOD: c.lui t5, 1048555 +0x2D 0x7F + +# GOOD: c.lui t5, 1048556 +0x31 0x7F + +# GOOD: c.lui t5, 1048557 +0x35 0x7F + +# GOOD: c.lui t5, 1048558 +0x39 0x7F + +# GOOD: c.lui t5, 1048559 +0x3D 0x7F + +# GOOD: c.lui t5, 1048560 +0x41 0x7F + +# GOOD: c.lui t5, 1048561 +0x45 0x7F + +# GOOD: c.lui t5, 1048562 +0x49 0x7F + +# GOOD: c.lui t5, 1048563 +0x4D 0x7F + +# GOOD: c.lui t5, 1048564 +0x51 0x7F + +# GOOD: c.lui t5, 1048565 +0x55 0x7F + +# GOOD: c.lui t5, 1048566 +0x59 0x7F + +# GOOD: c.lui t5, 1048567 +0x5D 0x7F + +# GOOD: c.lui t5, 1048568 +0x61 0x7F + +# GOOD: c.lui t5, 1048569 +0x65 0x7F + +# GOOD: c.lui t5, 1048570 +0x69 0x7F + +# GOOD: c.lui t5, 1048571 +0x6D 0x7F + +# GOOD: c.lui t5, 1048572 +0x71 0x7F + +# GOOD: c.lui t5, 1048573 +0x75 0x7F + +# GOOD: c.lui t5, 1048574 +0x79 0x7F + +# GOOD: c.lui t5, 1048575 +0x7D 0x7F + +# BAD: invalid instruction encoding +0x81 0x6F + +# GOOD: c.lui t6, 1 +0x85 0x6F + +# GOOD: c.lui t6, 2 +0x89 0x6F + +# GOOD: c.lui t6, 3 +0x8D 0x6F + +# GOOD: c.lui t6, 4 
+0x91 0x6F + +# GOOD: c.lui t6, 5 +0x95 0x6F + +# GOOD: c.lui t6, 6 +0x99 0x6F + +# GOOD: c.lui t6, 7 +0x9D 0x6F + +# GOOD: c.lui t6, 8 +0xA1 0x6F + +# GOOD: c.lui t6, 9 +0xA5 0x6F + +# GOOD: c.lui t6, 10 +0xA9 0x6F + +# GOOD: c.lui t6, 11 +0xAD 0x6F + +# GOOD: c.lui t6, 12 +0xB1 0x6F + +# GOOD: c.lui t6, 13 +0xB5 0x6F + +# GOOD: c.lui t6, 14 +0xB9 0x6F + +# GOOD: c.lui t6, 15 +0xBD 0x6F + +# GOOD: c.lui t6, 16 +0xC1 0x6F + +# GOOD: c.lui t6, 17 +0xC5 0x6F + +# GOOD: c.lui t6, 18 +0xC9 0x6F + +# GOOD: c.lui t6, 19 +0xCD 0x6F + +# GOOD: c.lui t6, 20 +0xD1 0x6F + +# GOOD: c.lui t6, 21 +0xD5 0x6F + +# GOOD: c.lui t6, 22 +0xD9 0x6F + +# GOOD: c.lui t6, 23 +0xDD 0x6F + +# GOOD: c.lui t6, 24 +0xE1 0x6F + +# GOOD: c.lui t6, 25 +0xE5 0x6F + +# GOOD: c.lui t6, 26 +0xE9 0x6F + +# GOOD: c.lui t6, 27 +0xED 0x6F + +# GOOD: c.lui t6, 28 +0xF1 0x6F + +# GOOD: c.lui t6, 29 +0xF5 0x6F + +# GOOD: c.lui t6, 30 +0xF9 0x6F + +# GOOD: c.lui t6, 31 +0xFD 0x6F + +# GOOD: c.lui t6, 1048544 +0x81 0x7F + +# GOOD: c.lui t6, 1048545 +0x85 0x7F + +# GOOD: c.lui t6, 1048546 +0x89 0x7F + +# GOOD: c.lui t6, 1048547 +0x8D 0x7F + +# GOOD: c.lui t6, 1048548 +0x91 0x7F + +# GOOD: c.lui t6, 1048549 +0x95 0x7F + +# GOOD: c.lui t6, 1048550 +0x99 0x7F + +# GOOD: c.lui t6, 1048551 +0x9D 0x7F + +# GOOD: c.lui t6, 1048552 +0xA1 0x7F + +# GOOD: c.lui t6, 1048553 +0xA5 0x7F + +# GOOD: c.lui t6, 1048554 +0xA9 0x7F + +# GOOD: c.lui t6, 1048555 +0xAD 0x7F + +# GOOD: c.lui t6, 1048556 +0xB1 0x7F + +# GOOD: c.lui t6, 1048557 +0xB5 0x7F + +# GOOD: c.lui t6, 1048558 +0xB9 0x7F + +# GOOD: c.lui t6, 1048559 +0xBD 0x7F + +# GOOD: c.lui t6, 1048560 +0xC1 0x7F + +# GOOD: c.lui t6, 1048561 +0xC5 0x7F + +# GOOD: c.lui t6, 1048562 +0xC9 0x7F + +# GOOD: c.lui t6, 1048563 +0xCD 0x7F + +# GOOD: c.lui t6, 1048564 +0xD1 0x7F + +# GOOD: c.lui t6, 1048565 +0xD5 0x7F + +# GOOD: c.lui t6, 1048566 +0xD9 0x7F + +# GOOD: c.lui t6, 1048567 +0xDD 0x7F + +# GOOD: c.lui t6, 1048568 +0xE1 0x7F + +# GOOD: c.lui t6, 1048569 +0xE5 0x7F + +# GOOD: c.lui t6, 1048570 +0xE9 0x7F + +# GOOD: c.lui t6, 1048571 +0xED 0x7F + +# GOOD: c.lui t6, 1048572 +0xF1 0x7F + +# GOOD: c.lui t6, 1048573 +0xF5 0x7F + +# GOOD: c.lui t6, 1048574 +0xF9 0x7F + +# GOOD: c.lui t6, 1048575 +0xFD 0x7F + From eb2aba4a648c055533db10f5348e483ead4561ec Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 31 Mar 2025 21:53:02 -0700 Subject: [PATCH 0188/1029] [RISCV] Remove extra call to MatchRegisterName in parseRegListCommon. NFC Update RegEnd after each call to MatchRegisterName instead of calling it again.
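In outline, the fix caches the matcher's result in `RegEnd` at each call site so each register name is matched only once. A minimal self-contained C++ sketch of the pattern (the matcher below is a hypothetical stand-in, not the real TableGen-generated `MatchRegisterName`):

```cpp
#include <cassert>
#include <string>

// Hypothetical stand-in for the TableGen-generated matcher: returns a
// nonzero register number on a match and 0 (invalid) otherwise.
static unsigned matchRegisterName(const std::string &Name) {
  return Name == "x18" ? 18u : 0u;
}

int main() {
  std::string EndName = "x18";
  // Cache the match result in RegEnd instead of matching once for the
  // check and once more afterwards to initialize RegEnd.
  unsigned RegEnd = matchRegisterName(EndName);
  assert(RegEnd != 0 && "invalid register");
  assert(RegEnd == 18 && "register list pair must start from 'x18'");
  return 0;
}
```

Because `RegEnd` is now assigned at each call site, the trailing standalone `RegEnd = MatchRegisterName(EndName);` in the diff below becomes redundant and is deleted.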
--- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 63d0777e4ff52..c1670326143e3 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2636,7 +2636,8 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, if (getLexer().isNot(AsmToken::Identifier)) return Error(getLoc(), "invalid register"); StringRef EndName = getLexer().getTok().getIdentifier(); - if (MatchRegisterName(EndName) != RISCV::X18) + RegEnd = MatchRegisterName(EndName); + if (RegEnd != RISCV::X18) return Error(getLoc(), "second contiguous registers pair of register list " "must start from 'x18'"); @@ -2647,11 +2648,11 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, if (getLexer().isNot(AsmToken::Identifier)) return Error(getLoc(), "invalid register"); EndName = getLexer().getTok().getIdentifier(); - if (!MatchRegisterName(EndName)) + RegEnd = MatchRegisterName(EndName); + if (!RegEnd) return Error(getLoc(), "invalid register"); getLexer().Lex(); } - RegEnd = MatchRegisterName(EndName); } } From ee3c892b3570281698170130a435f9c6b32c3ef5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 31 Mar 2025 22:11:06 -0700 Subject: [PATCH 0189/1029] [clang-tidy] Use DenseMap::insert_range (NFC) (#133844) We can safely switch to insert_range here because SyntheticStmtSourceMap starts out empty in the constructor. Also TheCFG->synthetic_stmts() comes from DenseMap, so we know that the keys are unique. That is, operator[] and insert are equivalent in this particular case. --- clang-tools-extra/clang-tidy/utils/ExprSequence.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp index 145a5fe378b3e..685277d8bfbca 100644 --- a/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExprSequence.cpp @@ -84,9 +84,7 @@ getAllInitListForms(const InitListExpr *InitList) { ExprSequence::ExprSequence(const CFG *TheCFG, const Stmt *Root, ASTContext *TheContext) : Context(TheContext), Root(Root) { - for (const auto &SyntheticStmt : TheCFG->synthetic_stmts()) { - SyntheticStmtSourceMap[SyntheticStmt.first] = SyntheticStmt.second; - } + SyntheticStmtSourceMap.insert_range(TheCFG->synthetic_stmts()); } bool ExprSequence::inSequence(const Stmt *Before, const Stmt *After) const { From 4d68cf384df6be405598ec23be0e23e0837db3a3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 31 Mar 2025 22:11:22 -0700 Subject: [PATCH 0190/1029] [lldb] Use DenseMap::insert_range (NFC) (#133846) --- lldb/source/Plugins/ABI/X86/ABIX86.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/source/Plugins/ABI/X86/ABIX86.cpp b/lldb/source/Plugins/ABI/X86/ABIX86.cpp index 3c55a6ea17e8f..db170700d3f65 100644 --- a/lldb/source/Plugins/ABI/X86/ABIX86.cpp +++ b/lldb/source/Plugins/ABI/X86/ABIX86.cpp @@ -191,7 +191,7 @@ BaseRegToRegsMap makeBaseRegMap(bool is64bit) { // higher YMM registers (specific to amd64) YMM(8), YMM(9), YMM(10), YMM(11), YMM(12), YMM(13), YMM(14), YMM(15)}}; - out.insert(amd64_regs.begin(), amd64_regs.end()); + out.insert_range(amd64_regs); } return out; From 2de7b6ca4e978e5acb9530ed5495d490fb3c9501 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 31 Mar 2025 22:11:34 -0700 
Subject: [PATCH 0191/1029] [ExecutionEngine] Use DenseMap::insert_range (NFC) (#133847) We can safely switch to insert_range here because LR starts out empty. Also, *Result is a DenseMap, so we know that the keys are unique. --- llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp index 941a9bdae7059..edb29d99f47bb 100644 --- a/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LinkGraphLinkingLayer.cpp @@ -108,8 +108,7 @@ class LinkGraphLinkingLayer::JITLinkCtx final : public JITLinkContext { LookupContinuation->run(Result.takeError()); else { AsyncLookupResult LR; - for (auto &KV : *Result) - LR[KV.first] = KV.second; + LR.insert_range(*Result); LookupContinuation->run(std::move(LR)); } }; From 091dcb8fc2b6ccb88c2975076e94f3cb6530db46 Mon Sep 17 00:00:00 2001 From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com> Date: Tue, 1 Apr 2025 11:35:44 +0530 Subject: [PATCH 0192/1029] [Flang] Make a private copy for the common block variables in copyin clause (#111359) Fixes: https://github.com/llvm/llvm-project/issues/82949 --- flang/lib/Lower/Bridge.cpp | 11 ++++++-- flang/test/Lower/OpenMP/copyin.f90 | 42 ++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 6e6e88a32517c..178f3e066fb2b 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -948,13 +948,20 @@ class FirConverter : public Fortran::lower::AbstractConverter { std::function insertSymbols = [&](const Fortran::semantics::Symbol &oriSymbol, bool collectSymbol) { - if (collectSymbol && oriSymbol.test(flag)) + if (collectSymbol && oriSymbol.test(flag)) { symbolSet.insert(&oriSymbol); - else if (checkHostAssociatedSymbols) + } else if (const auto *commonDetails = + oriSymbol.detailsIf< + Fortran::semantics::CommonBlockDetails>()) { + for (const auto &mem : commonDetails->objects()) + if (collectSymbol && mem->test(flag)) + symbolSet.insert(&(*mem).GetUltimate()); + } else if (checkHostAssociatedSymbols) { if (const auto *details{ oriSymbol .detailsIf()}) insertSymbols(details->symbol(), true); + } }; insertSymbols(sym, collectSymbols); }; diff --git a/flang/test/Lower/OpenMP/copyin.f90 b/flang/test/Lower/OpenMP/copyin.f90 index 5424c978e1da9..be70f90ca3e60 100644 --- a/flang/test/Lower/OpenMP/copyin.f90 +++ b/flang/test/Lower/OpenMP/copyin.f90 @@ -480,6 +480,48 @@ subroutine allocatable2() !$omp end parallel end subroutine +! CHECK-LABEL: func.func @_QPcommon_3() { +! [...] +! CHECK: omp.parallel { +! CHECK: %[[VAL_22:.*]] = omp.threadprivate %[[VAL_0:.*]] : !fir.ref> -> !fir.ref> +! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22:.*]] : (!fir.ref>) -> !fir.ref> +! CHECK: %[[VAL_24:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_25:.*]] = fir.coordinate_of %[[VAL_23:.*]], %[[VAL_24:.*]] : (!fir.ref>, index) -> !fir.ref +! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25:.*]] : (!fir.ref) -> !fir.ref +! CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_26:.*]] {uniq_name = "_QFcommon_3Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_22:.*]] : (!fir.ref>) -> !fir.ref> +! CHECK: %[[VAL_29:.*]] = arith.constant 4 : index +! CHECK: %[[VAL_30:.*]] = fir.coordinate_of %[[VAL_28:.*]], %[[VAL_29:.*]] : (!fir.ref>, index) -> !fir.ref +! 
CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30:.*]] : (!fir.ref) -> !fir.ref +! CHECK: %[[VAL_32:.*]]:2 = hlfir.declare %[[VAL_31:.*]] {uniq_name = "_QFcommon_3Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_22:.*]] : (!fir.ref>) -> !fir.ref> +! CHECK: %[[VAL_34:.*]] = arith.constant 8 : index +! CHECK: %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_33:.*]], %[[VAL_34:.*]] : (!fir.ref>, index) -> !fir.ref +! CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_35:.*]] : (!fir.ref) -> !fir.ref>> +! CHECK: %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_36:.*]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFcommon_3Earr"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) +! CHECK: %[[VAL_38:.*]] = fir.load %[[VAL_16:.*]]#0 : !fir.ref +! CHECK: hlfir.assign %[[VAL_38:.*]] to %[[VAL_27:.*]]#0 : i32, !fir.ref +! CHECK: %[[VAL_39:.*]] = fir.load %[[VAL_21:.*]]#0 : !fir.ref +! CHECK: hlfir.assign %[[VAL_39:.*]] to %[[VAL_32:.*]]#0 : i32, !fir.ref +! CHECK: %[[VAL_40:.*]] = fir.load %[[VAL_11:.*]]#0 : !fir.ref>> +! CHECK: fir.store %[[VAL_40:.*]] to %[[VAL_37:.*]]#0 : !fir.ref>> +! CHECK: omp.barrier +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine common_3() + integer :: x, y + integer, pointer :: arr + common /c3/ x, y, arr + !$omp threadprivate(/c3/) + + !$omp parallel copyin(/c3/) + call sub_3() + !$omp end parallel +end subroutine + ! CHECK: func.func @_QPallocatable3() { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFallocatable3Ea) : !fir.ref>> ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFallocatable3Ea"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) From bae3577002b6bda92837723a06a4ca5c498d300f Mon Sep 17 00:00:00 2001 From: Jean-Didier PAILLEUX Date: Tue, 1 Apr 2025 08:07:26 +0200 Subject: [PATCH 0193/1029] [flang] Define ERF, ERFC and ERFC_SCALED intrinsics with Q and D prefix (#125217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `ERF`, `ERFC` and `ERFC_SCALED` intrinsics prefixed by `Q` and `D` are missing. Codes such as `CP2K`(https://github.com/cp2k/cp2k) and `TurboRVB`(https://github.com/sissaschool/turborvb) use these intrinsics as defined in the GNU standard and here: https://www.ibm.com/docs/fr/xl-fortran-aix/16.1.0?topic=reference-intrinsic-procedures These intrinsics are based on the existing intrinsics but apply a restriction on the type kind. - `DERF`, `DERFC` and `DERFC_SCALED` are for double precision only. - `QERF`, `QERFC` and `QERFC_SCALED` are for quad precision only.
--- flang/docs/Intrinsics.md | 10 +++++-- flang/lib/Evaluate/intrinsics.cpp | 30 ++++++++++++++++++- flang/test/Lower/Intrinsics/erf.f90 | 16 ++++++++++ flang/test/Lower/Intrinsics/erf_real16.f90 | 4 ++- flang/test/Lower/Intrinsics/erfc.f90 | 10 +++++++ flang/test/Lower/Intrinsics/erfc_real16.f90 | 4 ++- flang/test/Lower/Intrinsics/erfc_scaled.f90 | 11 +++++++ .../Lower/Intrinsics/erfc_scaled_real16.f90 | 9 ++++++ flang/test/Semantics/erf.f90 | 29 ++++++++++++++++++ flang/test/Semantics/erfc.f90 | 29 ++++++++++++++++++ flang/test/Semantics/erfc_scaled.f90 | 29 ++++++++++++++++++ 11 files changed, 176 insertions(+), 5 deletions(-) create mode 100644 flang/test/Lower/Intrinsics/erf.f90 create mode 100644 flang/test/Lower/Intrinsics/erfc_scaled_real16.f90 create mode 100644 flang/test/Semantics/erf.f90 create mode 100644 flang/test/Semantics/erfc.f90 create mode 100644 flang/test/Semantics/erfc_scaled.f90 diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index c5c45c2f87d35..b09de8ee77645 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -241,8 +241,14 @@ BESSEL_Y0(REAL(k) X) -> REAL(k) BESSEL_Y1(REAL(k) X) -> REAL(k) BESSEL_YN(INTEGER(n) N, REAL(k) X) -> REAL(k) ERF(REAL(k) X) -> REAL(k) +DERF(REAL(8) X) -> REAL(8) +QERF(REAL(16) X) -> REAL(16) ERFC(REAL(k) X) -> REAL(k) +DERFC(REAL(8) X) -> REAL(8) +QERFC(REAL(16) X) -> REAL(16) ERFC_SCALED(REAL(k) X) -> REAL(k) +DERFC_SCALED(REAL(8) X) -> REAL(8) +QERFC_SCALED(REAL(16) X) -> REAL(16) FRACTION(REAL(k) X) -> REAL(k) GAMMA(REAL(k) X) -> REAL(k) HYPOT(REAL(k) X, REAL(k) Y) -> REAL(k) = SQRT(X*X+Y*Y) without spurious overflow @@ -810,7 +816,7 @@ otherwise an error message will be produced by f18 when attempting to fold relat | C/C++ Host Type | Intrinsic Functions with Host Standard C++ Library Based Folding Support | | --- | --- | -| float, double and long double | ACOS, ACOSH, ASINH, ATAN, ATAN2, ATANH, COS, COSH, ERF, ERFC, EXP, GAMMA, HYPOT, LOG, LOG10, LOG_GAMMA, MOD, SIN, SQRT, SINH, SQRT, TAN, TANH | +| float, double and long double | ACOS, ACOSH, ASINH, ATAN, ATAN2, ATANH, COS, COSH, DERF, DERFC, ERF, ERFC, EXP, GAMMA, HYPOT, LOG, LOG10, LOG_GAMMA, MOD, QERF, QERFC, SIN, SQRT, SINH, SQRT, TAN, TANH | | std::complex for float, double and long double| ACOS, ACOSH, ASIN, ASINH, ATAN, ATANH, COS, COSH, EXP, LOG, SIN, SINH, SQRT, TAN, TANH | On top of the default usage of C++ standard library functions for folding described @@ -829,7 +835,7 @@ types related to host float and double types. | C/C++ Host Type | Additional Intrinsic Function Folding Support with Libpgmath (Optional) | | --- | --- | -|float and double| BESSEL_J0, BESSEL_J1, BESSEL_JN (elemental only), BESSEL_Y0, BESSEL_Y1, BESSEL_Yn (elemental only), ERFC_SCALED | +|float and double| BESSEL_J0, BESSEL_J1, BESSEL_JN (elemental only), BESSEL_Y0, BESSEL_Y1, BESSEL_Yn (elemental only), DERFC_SCALED, ERFC_SCALED, QERFC_SCALED | Libpgmath comes in three variants (precise, relaxed and fast). So far, only the precise version is used for intrinsic function folding in f18. It guarantees the greatest numerical precision. 
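As context for the folding tables above: the entries folded via the host C++ standard library, `ERF`/`ERFC` and the new `DERF`, `DERFC`, `QERF` and `QERFC`, bottom out in the `<cmath>` functions `std::erf` and `std::erfc` (available since C++11), while the `ERFC_SCALED` family relies on libpgmath. A minimal sketch of those host calls, shown only as an illustration of the underlying library functions, not flang's actual folding code:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // Host-library-based constant folding of the erf family reduces to
  // calls like these; <cmath> provides erf/erfc since C++11.
  double x = 1.0;
  std::printf("erf(%g)  = %.17g\n", x, std::erf(x));
  std::printf("erfc(%g) = %.17g\n", x, std::erfc(x));
  return 0;
}
```

The `D`/`Q` prefixed forms only tighten the accepted argument kind; the folded math itself is unchanged.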
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index ae77dc8d11f44..2f34b12ca80bf 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -8,6 +8,7 @@ #include "flang/Evaluate/intrinsics.h" #include "flang/Common/enum-set.h" +#include "flang/Common/float128.h" #include "flang/Common/idioms.h" #include "flang/Evaluate/check-expression.h" #include "flang/Evaluate/common.h" @@ -83,7 +84,7 @@ static constexpr CategorySet AnyType{IntrinsicType | DerivedType}; ENUM_CLASS(KindCode, none, defaultIntegerKind, defaultRealKind, // is also the default COMPLEX kind - doublePrecision, defaultCharKind, defaultLogicalKind, + doublePrecision, quadPrecision, defaultCharKind, defaultLogicalKind, greaterOrEqualToKind, // match kind value greater than or equal to a single // explicit kind value any, // matches any kind value; each instance is independent @@ -139,6 +140,7 @@ static constexpr TypePattern DoublePrecision{ RealType, KindCode::doublePrecision}; static constexpr TypePattern DoublePrecisionComplex{ ComplexType, KindCode::doublePrecision}; +static constexpr TypePattern QuadPrecision{RealType, KindCode::quadPrecision}; static constexpr TypePattern SubscriptInt{IntType, KindCode::subscript}; // Match any kind of some intrinsic or derived types @@ -1199,6 +1201,9 @@ static const SpecificIntrinsicInterface specificIntrinsicFunction[]{ DoublePrecision}, "dim"}, {{"derf", {{"x", DoublePrecision}}, DoublePrecision}, "erf"}, + {{"derfc", {{"x", DoublePrecision}}, DoublePrecision}, "erfc"}, + {{"derfc_scaled", {{"x", DoublePrecision}}, DoublePrecision}, + "erfc_scaled"}, {{"dexp", {{"x", DoublePrecision}}, DoublePrecision}, "exp"}, {{"dfloat", {{"a", AnyInt}}, DoublePrecision}, "real", true}, {{"dim", {{"x", DefaultReal}, {"y", DefaultReal}}, DefaultReal}}, @@ -1299,6 +1304,9 @@ static const SpecificIntrinsicInterface specificIntrinsicFunction[]{ "min", true, true}, {{"mod", {{"a", DefaultInt}, {"p", DefaultInt}}, DefaultInt}}, {{"nint", {{"a", DefaultReal}}, DefaultInt}}, + {{"qerf", {{"x", QuadPrecision}}, QuadPrecision}, "erf"}, + {{"qerfc", {{"x", QuadPrecision}}, QuadPrecision}, "erfc"}, + {{"qerfc_scaled", {{"x", QuadPrecision}}, QuadPrecision}, "erfc_scaled"}, {{"sign", {{"a", DefaultReal}, {"b", DefaultReal}}, DefaultReal}}, {{"sin", {{"x", DefaultReal}}, DefaultReal}}, {{"sinh", {{"x", DefaultReal}}, DefaultReal}}, @@ -2033,6 +2041,9 @@ std::optional IntrinsicInterface::Match( case KindCode::doublePrecision: argOk = type->kind() == defaults.doublePrecisionKind(); break; + case KindCode::quadPrecision: + argOk = type->kind() == defaults.quadPrecisionKind(); + break; case KindCode::defaultCharKind: argOk = type->kind() == defaults.GetDefaultKind(TypeCategory::Character); break; @@ -2343,6 +2354,18 @@ std::optional IntrinsicInterface::Match( CHECK(FloatingType.test(*category)); resultType = DynamicType{*category, defaults.doublePrecisionKind()}; break; + case KindCode::quadPrecision: + CHECK(result.categorySet == CategorySet{*category}); + CHECK(FloatingType.test(*category)); + resultType = DynamicType{*category, defaults.quadPrecisionKind()}; + if (!context.targetCharacteristics().CanSupportType( + *category, defaults.quadPrecisionKind())) { + messages.Say( + "%s(KIND=%jd) type not supported on this target."_err_en_US, + parser::ToUpperCaseLetters(EnumToString(*category)), + defaults.quadPrecisionKind()); + } + break; case KindCode::defaultLogicalKind: CHECK(result.categorySet == LogicalType); CHECK(*category == 
TypeCategory::Logical); @@ -3341,6 +3364,7 @@ static DynamicType GetReturnType(const SpecificIntrinsicInterface &interface, case KindCode::defaultIntegerKind: break; case KindCode::doublePrecision: + case KindCode::quadPrecision: case KindCode::defaultRealKind: category = TypeCategory::Real; break; @@ -3349,6 +3373,8 @@ static DynamicType GetReturnType(const SpecificIntrinsicInterface &interface, } int kind{interface.result.kindCode == KindCode::doublePrecision ? defaults.doublePrecisionKind() + : interface.result.kindCode == KindCode::quadPrecision + ? defaults.quadPrecisionKind() : defaults.GetDefaultKind(category)}; return DynamicType{category, kind}; } @@ -3589,6 +3615,8 @@ DynamicType IntrinsicProcTable::Implementation::GetSpecificType( TypeCategory category{set.LeastElement().value()}; if (pattern.kindCode == KindCode::doublePrecision) { return DynamicType{category, defaults_.doublePrecisionKind()}; + } else if (pattern.kindCode == KindCode::quadPrecision) { + return DynamicType{category, defaults_.quadPrecisionKind()}; } else if (category == TypeCategory::Character) { // All character arguments to specific intrinsic functions are // assumed-length. diff --git a/flang/test/Lower/Intrinsics/erf.f90 b/flang/test/Lower/Intrinsics/erf.f90 new file mode 100644 index 0000000000000..b76ea1746d3df --- /dev/null +++ b/flang/test/Lower/Intrinsics/erf.f90 @@ -0,0 +1,16 @@ +! RUN: bbc -emit-fir %s -o - --math-runtime=fast | FileCheck --check-prefixes=ALL,FAST %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=fast %s -o - | FileCheck --check-prefixes=ALL,FAST %s +! RUN: bbc -emit-fir %s -o - --math-runtime=relaxed | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=relaxed %s -o - | FileCheck --check-prefixes=ALL,RELAXED %s +! RUN: bbc -emit-fir %s -o - --math-runtime=precise | FileCheck --check-prefixes=ALL,PRECISE %s +! RUN: %flang_fc1 -emit-fir -mllvm -math-runtime=precise %s -o - | FileCheck --check-prefixes=ALL,PRECISE %s + +function dtest_real8(x) + real(8) :: x, dtest_real8 + dtest_real8 = derf(x) +end function + +! ALL-LABEL: @_QPdtest_real8 +! FAST: {{%[A-Za-z0-9._]+}} = math.erf {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.erf {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @erf({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 diff --git a/flang/test/Lower/Intrinsics/erf_real16.f90 b/flang/test/Lower/Intrinsics/erf_real16.f90 index da40816946171..e9cc6175c1284 100644 --- a/flang/test/Lower/Intrinsics/erf_real16.f90 +++ b/flang/test/Lower/Intrinsics/erf_real16.f90 @@ -4,6 +4,8 @@ ! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s ! CHECK: fir.call @_FortranAErfF128({{.*}}){{.*}}: (f128) -> f128 - real(16) :: a, b +! CHECK: fir.call @_FortranAErfF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b, c b = erf(a) + c = qerf(a) end diff --git a/flang/test/Lower/Intrinsics/erfc.f90 b/flang/test/Lower/Intrinsics/erfc.f90 index 164e958bb2912..c02e252445fc0 100644 --- a/flang/test/Lower/Intrinsics/erfc.f90 +++ b/flang/test/Lower/Intrinsics/erfc.f90 @@ -24,3 +24,13 @@ function test_real8(x) ! FAST: {{%[A-Za-z0-9._]+}} = math.erfc {{%[A-Za-z0-9._]+}} {{.*}}: f64 ! RELAXED: {{%[A-Za-z0-9._]+}} = math.erfc {{%[A-Za-z0-9._]+}} {{.*}}: f64 ! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @erfc({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 + +function dtest_real8(x) + real(8) :: x, dtest_real8 + dtest_real8 = derfc(x) +end function + +! ALL-LABEL: @_QPdtest_real8 +! 
FAST: {{%[A-Za-z0-9._]+}} = math.erfc {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! RELAXED: {{%[A-Za-z0-9._]+}} = math.erfc {{%[A-Za-z0-9._]+}} {{.*}}: f64 +! PRECISE: {{%[A-Za-z0-9._]+}} = fir.call @erfc({{%[A-Za-z0-9._]+}}) {{.*}}: (f64) -> f64 diff --git a/flang/test/Lower/Intrinsics/erfc_real16.f90 b/flang/test/Lower/Intrinsics/erfc_real16.f90 index 7e3daa27768c7..d63c4d80df043 100644 --- a/flang/test/Lower/Intrinsics/erfc_real16.f90 +++ b/flang/test/Lower/Intrinsics/erfc_real16.f90 @@ -4,6 +4,8 @@ ! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s ! CHECK: fir.call @_FortranAErfcF128({{.*}}){{.*}}: (f128) -> f128 - real(16) :: a, b +! CHECK: fir.call @_FortranAErfcF128({{.*}}){{.*}}: (f128) -> f128 + real(16) :: a, b, c b = erfc(a) + c = qerfc(a) end diff --git a/flang/test/Lower/Intrinsics/erfc_scaled.f90 b/flang/test/Lower/Intrinsics/erfc_scaled.f90 index ab5e90cb2409e..f30f316176f38 100644 --- a/flang/test/Lower/Intrinsics/erfc_scaled.f90 +++ b/flang/test/Lower/Intrinsics/erfc_scaled.f90 @@ -21,3 +21,14 @@ function erfc_scaled8(x) ! CHECK: %[[a1:.*]] = fir.load %[[x]] : !fir.ref ! CHECK: %{{.*}} = fir.call @_FortranAErfcScaled8(%[[a1]]) {{.*}}: (f64) -> f64 end function erfc_scaled8 + + +! CHECK-LABEL: func @_QPderfc_scaled8( +! CHECK-SAME: %[[x:[^:]+]]: !fir.ref{{.*}}) -> f64 +function derfc_scaled8(x) + real(kind=8) :: derfc_scaled8 + real(kind=8) :: x + derfc_scaled8 = derfc_scaled(x); +! CHECK: %[[a1:.*]] = fir.load %[[x]] : !fir.ref +! CHECK: %{{.*}} = fir.call @_FortranAErfcScaled8(%[[a1]]) {{.*}}: (f64) -> f64 +end function derfc_scaled8 diff --git a/flang/test/Lower/Intrinsics/erfc_scaled_real16.f90 b/flang/test/Lower/Intrinsics/erfc_scaled_real16.f90 new file mode 100644 index 0000000000000..15c22e6142611 --- /dev/null +++ b/flang/test/Lower/Intrinsics/erfc_scaled_real16.f90 @@ -0,0 +1,9 @@ +! REQUIRES: flang-supports-f128-math +! RUN: bbc -emit-fir %s -o - | FileCheck %s +! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s + +! CHECK: fir.call @_FortranAErfcScaled16({{.*}}) {{.*}}: (f128) -> f128 + real(16) :: a, b + b = qerfc_scaled(a) +end diff --git a/flang/test/Semantics/erf.f90 b/flang/test/Semantics/erf.f90 new file mode 100644 index 0000000000000..591b4c31992d1 --- /dev/null +++ b/flang/test/Semantics/erf.f90 @@ -0,0 +1,29 @@ +! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck --check-prefix=ERROR %s + +function derf8_error4(x) + real(kind=8) :: derf8_error4 + real(kind=4) :: x + derf8_error4 = derf(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(4)' +end function derf8_error4 + +function derf8_error16(x) + real(kind=8) :: derf8_error16 + real(kind=16) :: x + derf8_error16 = derf(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(16)' +end function derf8_error16 + +function qerf16_error4(x) + real(kind=16) :: qerf16_error4 + real(kind=4) :: x + qerf16_error4 = qerf(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(4)' +end function qerf16_error4 + +function qerf16_error8(x) + real(kind=16) :: qerf16_error8 + real(kind=8) :: x + qerf16_error8 = qerf(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(8)' +end function qerf16_error8 diff --git a/flang/test/Semantics/erfc.f90 b/flang/test/Semantics/erfc.f90 new file mode 100644 index 0000000000000..ae3273bcc7e31 --- /dev/null +++ b/flang/test/Semantics/erfc.f90 @@ -0,0 +1,29 @@ +! 
RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck --check-prefix=ERROR %s + +function derfc8_error4(x) + real(kind=8) :: derfc8_error4 + real(kind=4) :: x + derfc8_error4 = derfc(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(4)' +end function derfc8_error4 + +function derfc8_error16(x) + real(kind=8) :: derfc8_error16 + real(kind=16) :: x + derfc8_error16 = derfc(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(16)' +end function derfc8_error16 + +function qerfc16_error4(x) + real(kind=16) :: qerfc16_error4 + real(kind=4) :: x + qerfc16_error4 = qerfc(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(4)' +end function qerfc16_error4 + +function qerfc16_error8(x) + real(kind=16) :: qerfc16_error8 + real(kind=8) :: x + qerfc16_error8 = qerfc(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(8)' +end function qerfc16_error8 diff --git a/flang/test/Semantics/erfc_scaled.f90 b/flang/test/Semantics/erfc_scaled.f90 new file mode 100644 index 0000000000000..5e6cd502c7db7 --- /dev/null +++ b/flang/test/Semantics/erfc_scaled.f90 @@ -0,0 +1,29 @@ +! RUN: not %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck --check-prefix=ERROR %s + +function derfc_scaled8_error4(x) + real(kind=8) :: derfc_scaled8_error4 + real(kind=4) :: x + derfc_scaled8_error4 = derfc_scaled(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(4)' +end function derfc_scaled8_error4 + +function derfc_scaled8_error16(x) + real(kind=8) :: derfc_scaled8_error16 + real(kind=16) :: x + derfc_scaled8_error16 = derfc_scaled(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(16)' +end function derfc_scaled8_error16 + +function qerfc_scaled16_error4(x) + real(kind=16) :: qerfc_scaled16_error4 + real(kind=4) :: x + qerfc_scaled16_error4 = qerfc_scaled(x); +! ERROR: Actual argument for 'x=' has bad type or kind 'REAL(4)' +end function qerfc_scaled16_error4 + +function qerfc_scaled16_error8(x) + real(kind=16) :: qerfc_scaled16_error8 + real(kind=8) :: x + qerfc_scaled16_error8 = qerfc_scaled(x); +! 
ERROR: Actual argument for 'x=' has bad type or kind 'REAL(8)' +end function qerfc_scaled16_error8 From d3be29642fa65e5ade434d860cfcc193f8278d4e Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Mon, 31 Mar 2025 23:16:41 -0700 Subject: [PATCH 0194/1029] [clang-format] Correctly annotate pointer/reference in _Generic (#133673) Fix #133663 --- clang/lib/Format/TokenAnnotator.cpp | 4 ++++ clang/unittests/Format/TokenAnnotatorTest.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index d87b3a6088bd8..dfb59e8d6f420 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1417,6 +1417,10 @@ class AnnotatingParser { } } else if (Contexts.back().ContextType == Context::C11GenericSelection) { Tok->setType(TT_GenericSelectionColon); + auto *Prev = Tok->getPreviousNonComment(); + assert(Prev); + if (Prev->isPointerOrReference()) + Prev->setFinalizedType(TT_PointerOrReference); } else if (CurrentToken && CurrentToken->is(tok::numeric_constant)) { Tok->setType(TT_BitFieldColon); } else if (Contexts.size() == 1 && diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index ac5e979aea071..af9fd574b068c 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -363,6 +363,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { ASSERT_EQ(Tokens.size(), 20u) << Tokens; EXPECT_TOKEN(Tokens[14], tok::star, TT_PointerOrReference); + Tokens = annotate("#define foo(x) _Generic(x, bar *: 1, default: 0)"); + ASSERT_EQ(Tokens.size(), 20u) << Tokens; + EXPECT_TOKEN(Tokens[11], tok::star, TT_PointerOrReference); + Tokens = annotate("Thingy kConfig = {\n" " 1,\n" " (uint16_t)(kScale * height_pixels),\n" From fe3e9c2b46504e7b197245b3946abfec563e77d4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 31 Mar 2025 23:17:44 -0700 Subject: [PATCH 0195/1029] [Analysis] Avoid repeated hash lookups (NFC) (#133045) --- llvm/lib/Analysis/ScalarEvolution.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 361206719287a..14f9a1bec8939 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15440,9 +15440,9 @@ void ScalarEvolution::LoopGuards::collectFromBlock( // Bail out, unless we have a non-wrapping, monotonic range. if (ExactRegion.isWrappedSet() || ExactRegion.isFullSet()) return false; - auto I = RewriteMap.find(LHSUnknown); - const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : LHSUnknown; - RewriteMap[LHSUnknown] = SE.getUMaxExpr( + auto [I, Inserted] = RewriteMap.try_emplace(LHSUnknown); + const SCEV *RewrittenLHS = Inserted ? LHSUnknown : I->second; + I->second = SE.getUMaxExpr( SE.getConstant(ExactRegion.getUnsignedMin()), SE.getUMinExpr(RewrittenLHS, SE.getConstant(ExactRegion.getUnsignedMax()))); From aa889ed129ff26d9341c50a9eaba4db728ca6212 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Mon, 31 Mar 2025 23:31:17 -0700 Subject: [PATCH 0196/1029] [lldb] Fix statusline terminal resizing Simplify and fix the logic to clear the old statusline when the terminal window dimensions have changed. I accidentally broke the terminal resizing behavior when addressing code review feedback. I'd really like to figure out a way to test this. 
PExpect isn't a good fit for this, because I really need to check the result, rather than the control characters, as the latter doesn't tell me whether any part of the old statusline is still visible. --- lldb/include/lldb/Core/Statusline.h | 15 +------ lldb/source/Core/Statusline.cpp | 64 ++++++++++------------------- 2 files changed, 23 insertions(+), 56 deletions(-) diff --git a/lldb/include/lldb/Core/Statusline.h b/lldb/include/lldb/Core/Statusline.h index c1449f0f69081..521b9f2526f6b 100644 --- a/lldb/include/lldb/Core/Statusline.h +++ b/lldb/include/lldb/Core/Statusline.h @@ -10,8 +10,6 @@ #define LLDB_CORE_STATUSLINE_H #include "lldb/lldb-forward.h" -#include "llvm/ADT/StringRef.h" -#include #include #include @@ -34,10 +32,6 @@ class Statusline { /// Inform the statusline that the terminal dimensions have changed. void TerminalSizeChanged(); -protected: - /// Pad and trim the given string to fit to the given width. - static std::string TrimAndPad(std::string str, size_t width); - private: /// Draw the statusline with the given text. void Draw(std::string msg); @@ -46,20 +40,15 @@ class Statusline { void UpdateTerminalProperties(); enum ScrollWindowMode { - ScrollWindowExtend, - ScrollWindowShrink, + EnableStatusline, + DisableStatusline, }; /// Set the scroll window for the given mode. void UpdateScrollWindow(ScrollWindowMode mode); - /// Clear the statusline (without redrawing the background). - void Reset(); - Debugger &m_debugger; std::string m_last_str; - - volatile std::sig_atomic_t m_terminal_size_has_changed = 1; uint64_t m_terminal_width = 0; uint64_t m_terminal_height = 0; }; diff --git a/lldb/source/Core/Statusline.cpp b/lldb/source/Core/Statusline.cpp index c01388eb7e7b5..c18fbb6c5561e 100644 --- a/lldb/source/Core/Statusline.cpp +++ b/lldb/source/Core/Statusline.cpp @@ -23,7 +23,8 @@ #define ANSI_SAVE_CURSOR ESCAPE "7" #define ANSI_RESTORE_CURSOR ESCAPE "8" #define ANSI_CLEAR_BELOW ESCAPE "[J" -#define ANSI_CLEAR_LINE "\r\x1B[2K" +#define ANSI_CURSOR_DOWN ESCAPE "[B" +#define ANSI_CLEAR_LINE ESCAPE "[2K" #define ANSI_SET_SCROLL_ROWS ESCAPE "[0;%ur" #define ANSI_TO_START_OF_ROW ESCAPE "[%u;0f" #define ANSI_UP_ROWS ESCAPE "[%dA" @@ -31,12 +32,16 @@ using namespace lldb; using namespace lldb_private; -Statusline::Statusline(Debugger &debugger) : m_debugger(debugger) { Enable(); } +Statusline::Statusline(Debugger &debugger) + : m_debugger(debugger), m_terminal_width(m_debugger.GetTerminalWidth()), + m_terminal_height(m_debugger.GetTerminalHeight()) { + Enable(); +} Statusline::~Statusline() { Disable(); } void Statusline::TerminalSizeChanged() { - m_terminal_size_has_changed = 1; + UpdateTerminalProperties(); // This definitely isn't signal safe, but the best we can do, until we // have proper signal-catching thread. @@ -44,20 +49,16 @@ void Statusline::TerminalSizeChanged() { } void Statusline::Enable() { - UpdateTerminalProperties(); - // Reduce the scroll window to make space for the status bar below. - UpdateScrollWindow(ScrollWindowShrink); + UpdateScrollWindow(EnableStatusline); // Draw the statusline. - Redraw(); + Redraw(/*update=*/true); } void Statusline::Disable() { - UpdateTerminalProperties(); - // Extend the scroll window to cover the status bar. 
- UpdateScrollWindow(ScrollWindowExtend); + UpdateScrollWindow(DisableStatusline); } void Statusline::Draw(std::string str) { @@ -65,8 +66,6 @@ void Statusline::Draw(std::string str) { if (!stream_sp) return; - UpdateTerminalProperties(); - m_last_str = str; str = ansi::TrimAndPad(str, m_terminal_width); @@ -80,58 +79,37 @@ void Statusline::Draw(std::string str) { locked_stream << ANSI_RESTORE_CURSOR; } -void Statusline::Reset() { - lldb::LockableStreamFileSP stream_sp = m_debugger.GetOutputStreamSP(); - if (!stream_sp) - return; - - LockedStreamFile locked_stream = stream_sp->Lock(); - locked_stream << ANSI_SAVE_CURSOR; - locked_stream.Printf(ANSI_TO_START_OF_ROW, - static_cast(m_terminal_height)); - locked_stream << ANSI_CLEAR_LINE; - locked_stream << ANSI_RESTORE_CURSOR; -} - void Statusline::UpdateTerminalProperties() { - if (m_terminal_size_has_changed == 0) - return; - - // Clear the previous statusline using the previous dimensions. - Reset(); - + UpdateScrollWindow(DisableStatusline); m_terminal_width = m_debugger.GetTerminalWidth(); m_terminal_height = m_debugger.GetTerminalHeight(); - - // Set the scroll window based on the new terminal height. - UpdateScrollWindow(ScrollWindowShrink); - - // Clear the flag. - m_terminal_size_has_changed = 0; + UpdateScrollWindow(EnableStatusline); } void Statusline::UpdateScrollWindow(ScrollWindowMode mode) { + assert(m_terminal_width != 0 && m_terminal_height != 0); + lldb::LockableStreamFileSP stream_sp = m_debugger.GetOutputStreamSP(); if (!stream_sp) return; const unsigned scroll_height = - (mode == ScrollWindowExtend) ? m_terminal_height : m_terminal_height - 1; + (mode == DisableStatusline) ? m_terminal_height : m_terminal_height - 1; LockedStreamFile locked_stream = stream_sp->Lock(); locked_stream << ANSI_SAVE_CURSOR; locked_stream.Printf(ANSI_SET_SCROLL_ROWS, scroll_height); locked_stream << ANSI_RESTORE_CURSOR; switch (mode) { - case ScrollWindowExtend: - // Clear the screen below to hide the old statusline. - locked_stream << ANSI_CLEAR_BELOW; - break; - case ScrollWindowShrink: + case EnableStatusline: // Move everything on the screen up. locked_stream.Printf(ANSI_UP_ROWS, 1); locked_stream << '\n'; break; + case DisableStatusline: + // Clear the screen below to hide the old statusline. 
+ locked_stream << ANSI_CLEAR_BELOW; + break; } } From 49f080afc4466ddf415d7fc7e98989c0bd07d8ea Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Tue, 1 Apr 2025 08:58:55 +0200 Subject: [PATCH 0197/1029] [mlir][mpi] Mandatory Communicator (#133280) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This replaces #125361 - communicator is mandatory - new mpi.comm_world - new mpi.comm_split - lowering and test --------- Co-authored-by: Sergio Sánchez Ramírez --- mlir/include/mlir/Dialect/MPI/IR/MPIOps.td | 132 ++++++++++++------ mlir/include/mlir/Dialect/MPI/IR/MPITypes.td | 11 ++ mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp | 132 +++++++++++++++--- mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp | 24 +++- mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir | 116 +++++++++------ .../MeshToMPI/convert-mesh-to-mpi.mlir | 62 ++++---- mlir/test/Dialect/MPI/mpiops.mlir | 87 +++++++----- 7 files changed, 389 insertions(+), 175 deletions(-) diff --git a/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td b/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td index a8267b115b9e6..d78aa92d201e7 100644 --- a/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td +++ b/mlir/include/mlir/Dialect/MPI/IR/MPIOps.td @@ -37,26 +37,41 @@ def MPI_InitOp : MPI_Op<"init", []> { let assemblyFormat = "attr-dict (`:` type($retval)^)?"; } +//===----------------------------------------------------------------------===// +// CommWorldOp +//===----------------------------------------------------------------------===// + +def MPI_CommWorldOp : MPI_Op<"comm_world", []> { + let summary = "Get the World communicator, equivalent to `MPI_COMM_WORLD`"; + let description = [{ + This operation returns the predefined MPI_COMM_WORLD communicator. + }]; + + let results = (outs MPI_Comm : $comm); + + let assemblyFormat = "attr-dict `:` type(results)"; +} + //===----------------------------------------------------------------------===// // CommRankOp //===----------------------------------------------------------------------===// def MPI_CommRankOp : MPI_Op<"comm_rank", []> { let summary = "Get the current rank, equivalent to " - "`MPI_Comm_rank(MPI_COMM_WORLD, &rank)`"; + "`MPI_Comm_rank(comm, &rank)`"; let description = [{ - Communicators other than `MPI_COMM_WORLD` are not supported for now. - This operation can optionally return an `!mpi.retval` value that can be used to check for errors. }]; + let arguments = (ins MPI_Comm : $comm); + let results = ( outs Optional : $retval, I32 : $rank ); - let assemblyFormat = "attr-dict `:` type(results)"; + let assemblyFormat = "`(` $comm `)` attr-dict `:` type(results)"; } //===----------------------------------------------------------------------===// // CommSizeOp //===----------------------------------------------------------------------===// def MPI_CommSizeOp : MPI_Op<"comm_size", []> { let summary = "Get the size of the group associated to the communicator, " - "equivalent to `MPI_Comm_size(MPI_COMM_WORLD, &size)`"; + "equivalent to `MPI_Comm_size(comm, &size)`"; let description = [{ - Communicators other than `MPI_COMM_WORLD` are not supported for now. - This operation can optionally return an `!mpi.retval` value that can be used to check for errors.
}]; + let arguments = (ins MPI_Comm : $comm); + let results = ( outs Optional : $retval, I32 : $size ); - let assemblyFormat = "attr-dict `:` type(results)"; + let assemblyFormat = "`(` $comm `)` attr-dict `:` type(results)"; +} + +//===----------------------------------------------------------------------===// +// CommSplitOp +//===----------------------------------------------------------------------===// + +def MPI_CommSplitOp : MPI_Op<"comm_split", []> { + let summary = "Partition the group associated with the given communicator into " + "disjoint subgroups"; + let description = [{ + This operation splits the communicator into multiple sub-communicators. + The color value determines the group of processes that will be part of the + new communicator. The key value determines the rank of the calling process + in the new communicator. + + This operation can optionally return an `!mpi.retval` value that can be used + to check for errors. + }]; + + let arguments = (ins MPI_Comm : $comm, I32 : $color, I32 : $key); + + let results = ( + outs Optional : $retval, + MPI_Comm : $newcomm + ); + + let assemblyFormat = "`(` $comm `,` $color `,` $key `)` attr-dict `:` " + "type(results)"; } //===----------------------------------------------------------------------===// @@ -87,14 +130,12 @@ def MPI_CommSizeOp : MPI_Op<"comm_size", []> { def MPI_SendOp : MPI_Op<"send", []> { let summary = - "Equivalent to `MPI_Send(ptr, size, dtype, dest, tag, MPI_COMM_WORLD)`"; + "Equivalent to `MPI_Send(ptr, size, dtype, dest, tag, comm)`"; let description = [{ MPI_Send performs a blocking send of `size` elements of type `dtype` to rank `dest`. The `tag` value and communicator enables the library to determine the matching of multiple sends and receives between the same ranks. - Communicators other than `MPI_COMM_WORLD` are not supported for now. - This operation can optionally return an `!mpi.retval` value that can be used to check for errors. }]; @@ -102,12 +143,13 @@ def MPI_SendOp : MPI_Op<"send", []> { let arguments = ( ins AnyMemRef : $ref, I32 : $tag, - I32 : $dest + I32 : $dest, + MPI_Comm : $comm ); let results = (outs Optional:$retval); - let assemblyFormat = "`(` $ref `,` $tag `,` $dest `)` attr-dict `:` " + let assemblyFormat = "`(` $ref `,` $tag `,` $dest `,` $comm `)` attr-dict `:` " "type($ref) `,` type($tag) `,` type($dest)" "(`->` type($retval)^)?"; let hasCanonicalizer = 1; @@ -119,15 +161,13 @@ def MPI_SendOp : MPI_Op<"send", []> { def MPI_ISendOp : MPI_Op<"isend", []> { let summary = - "Equivalent to `MPI_Isend(ptr, size, dtype, dest, tag, MPI_COMM_WORLD)`"; + "Equivalent to `MPI_Isend(ptr, size, dtype, dest, tag, comm)`"; let description = [{ MPI_Isend begins a non-blocking send of `size` elements of type `dtype` to rank `dest`. The `tag` value and communicator enables the library to determine the matching of multiple sends and receives between the same ranks. - Communicators other than `MPI_COMM_WORLD` are not supported for now. - This operation can optionally return an `!mpi.retval` value that can be used to check for errors. 
}]; @@ -135,7 +175,8 @@ def MPI_ISendOp : MPI_Op<"isend", []> { let arguments = ( ins AnyMemRef : $ref, I32 : $tag, - I32 : $rank + I32 : $dest, + MPI_Comm : $comm ); let results = ( @@ -143,8 +184,8 @@ def MPI_ISendOp : MPI_Op<"isend", []> { MPI_Request : $req ); - let assemblyFormat = "`(` $ref `,` $tag `,` $rank `)` attr-dict " - "`:` type($ref) `,` type($tag) `,` type($rank) " + let assemblyFormat = "`(` $ref `,` $tag `,` $dest `,` $comm`)` attr-dict " + "`:` type($ref) `,` type($tag) `,` type($dest) " "`->` type(results)"; let hasCanonicalizer = 1; } @@ -155,14 +196,13 @@ def MPI_ISendOp : MPI_Op<"isend", []> { def MPI_RecvOp : MPI_Op<"recv", []> { let summary = "Equivalent to `MPI_Recv(ptr, size, dtype, source, tag, " - "MPI_COMM_WORLD, MPI_STATUS_IGNORE)`"; + "comm, MPI_STATUS_IGNORE)`"; let description = [{ MPI_Recv performs a blocking receive of `size` elements of type `dtype` from rank `source`. The `tag` value and communicator enables the library to determine the matching of multiple sends and receives between the same ranks. - Communicators other than `MPI_COMM_WORLD` are not supported for now. The MPI_Status is set to `MPI_STATUS_IGNORE`, as the status object is not yet ported to MLIR. @@ -172,13 +212,14 @@ def MPI_RecvOp : MPI_Op<"recv", []> { let arguments = ( ins AnyMemRef : $ref, - I32 : $tag, I32 : $source + I32 : $tag, I32 : $source, + MPI_Comm : $comm ); let results = (outs Optional:$retval); - let assemblyFormat = "`(` $ref `,` $tag `,` $source `)` attr-dict `:` " - "type($ref) `,` type($tag) `,` type($source)" + let assemblyFormat = "`(` $ref `,` $tag `,` $source `,` $comm `)` attr-dict" + " `:` type($ref) `,` type($tag) `,` type($source) " "(`->` type($retval)^)?"; let hasCanonicalizer = 1; } @@ -188,16 +229,14 @@ def MPI_RecvOp : MPI_Op<"recv", []> { //===----------------------------------------------------------------------===// def MPI_IRecvOp : MPI_Op<"irecv", []> { - let summary = "Equivalent to `MPI_Irecv(ptr, size, dtype, dest, tag, " - "MPI_COMM_WORLD, &req)`"; + let summary = "Equivalent to `MPI_Irecv(ptr, size, dtype, source, tag, " + "comm, &req)`"; let description = [{ MPI_Irecv begins a non-blocking receive of `size` elements of type `dtype` - from rank `dest`. The `tag` value and communicator enables the library to + from rank `source`. The `tag` value and communicator enables the library to determine the matching of multiple sends and receives between the same ranks. - Communicators other than `MPI_COMM_WORLD` are not supported for now. - This operation can optionally return an `!mpi.retval` value that can be used to check for errors. 
}]; @@ -205,7 +244,8 @@ def MPI_IRecvOp : MPI_Op<"irecv", []> { let arguments = ( ins AnyMemRef : $ref, I32 : $tag, - I32 : $rank + I32 : $source, + MPI_Comm : $comm ); let results = ( @@ -213,9 +253,9 @@ def MPI_IRecvOp : MPI_Op<"irecv", []> { MPI_Request : $req ); - let assemblyFormat = "`(` $ref `,` $tag `,` $rank `)` attr-dict `:`" - "type($ref) `,` type($tag) `,` type($rank) `->`" - "type(results)"; + let assemblyFormat = "`(` $ref `,` $tag `,` $source `,` $comm`)` attr-dict " + "`:` type($ref) `,` type($tag) `,` type($source)" + "`->` type(results)"; let hasCanonicalizer = 1; } @@ -224,8 +264,7 @@ def MPI_IRecvOp : MPI_Op<"irecv", []> { //===----------------------------------------------------------------------===// def MPI_AllReduceOp : MPI_Op<"allreduce", []> { - let summary = "Equivalent to `MPI_Allreduce(sendbuf, recvbuf, op, " - "MPI_COMM_WORLD)`"; + let summary = "Equivalent to `MPI_Allreduce(sendbuf, recvbuf, op, comm)`"; let description = [{ MPI_Allreduce performs a reduction operation on the values in the sendbuf array and stores the result in the recvbuf array. The operation is @@ -235,8 +274,6 @@ def MPI_AllReduceOp : MPI_Op<"allreduce", []> { Currently only the `MPI_Op` predefined in the standard (e.g. `MPI_SUM`) are supported. - Communicators other than `MPI_COMM_WORLD` are not supported for now. - This operation can optionally return an `!mpi.retval` value that can be used to check for errors. }]; @@ -244,13 +281,14 @@ def MPI_AllReduceOp : MPI_Op<"allreduce", []> { let arguments = ( ins AnyMemRef : $sendbuf, AnyMemRef : $recvbuf, - MPI_OpClassEnum : $op + MPI_OpClassEnum : $op, + MPI_Comm : $comm ); let results = (outs Optional:$retval); - let assemblyFormat = "`(` $sendbuf `,` $recvbuf `,` $op `)` attr-dict `:`" - "type($sendbuf) `,` type($recvbuf)" + let assemblyFormat = "`(` $sendbuf `,` $recvbuf `,` $op `,` $comm `)` " + "attr-dict `:` type($sendbuf) `,` type($recvbuf) " "(`->` type($retval)^)?"; } @@ -259,20 +297,23 @@ def MPI_AllReduceOp : MPI_Op<"allreduce", []> { //===----------------------------------------------------------------------===// def MPI_Barrier : MPI_Op<"barrier", []> { - let summary = "Equivalent to `MPI_Barrier(MPI_COMM_WORLD)`"; + let summary = "Equivalent to `MPI_Barrier(comm)`"; let description = [{ MPI_Barrier blocks execution until all processes in the communicator have reached this routine. - Communicators other than `MPI_COMM_WORLD` are not supported for now. - This operation can optionally return an `!mpi.retval` value that can be used to check for errors. }]; + let arguments = (ins MPI_Comm : $comm); + let results = (outs Optional:$retval); - let assemblyFormat = "attr-dict (`:` type($retval) ^)?"; + let assemblyFormat = [{ + `(` $comm `)` attr-dict + (`->` type($retval)^)? 
+ }]; } //===----------------------------------------------------------------------===// @@ -295,8 +336,7 @@ def MPI_Wait : MPI_Op<"wait", []> { let results = (outs Optional:$retval); - let assemblyFormat = "`(` $req `)` attr-dict `:` type($req) " - "(`->` type($retval) ^)?"; + let assemblyFormat = "`(` $req `)` attr-dict `:` type($req) (`->` type($retval) ^)?"; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/MPI/IR/MPITypes.td b/mlir/include/mlir/Dialect/MPI/IR/MPITypes.td index a55d30e778e22..adc35a70b5837 100644 --- a/mlir/include/mlir/Dialect/MPI/IR/MPITypes.td +++ b/mlir/include/mlir/Dialect/MPI/IR/MPITypes.td @@ -40,6 +40,17 @@ def MPI_Retval : MPI_Type<"Retval", "retval"> { }]; } +//===----------------------------------------------------------------------===// +// mpi::CommType +//===----------------------------------------------------------------------===// + +def MPI_Comm : MPI_Type<"Comm", "comm"> { + let summary = "MPI communicator handler"; + let description = [{ + This type represents a handler for the MPI communicator. + }]; +} + //===----------------------------------------------------------------------===// // mpi::RequestType //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp index 4e0f59305a647..9df5e992e8ebd 100644 --- a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp +++ b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp @@ -83,9 +83,17 @@ class MPIImplTraits { ModuleOp &getModuleOp() { return moduleOp; } /// Gets or creates MPI_COMM_WORLD as a Value. + /// Different MPI implementations have different communicator types. + /// Using i64 as a portable, intermediate type. + /// Appropriate cast needs to take place before calling MPI functions. virtual Value getCommWorld(const Location loc, ConversionPatternRewriter &rewriter) = 0; + /// Type converter provides i64 type for communicator type. + /// Converts to native type, which might be ptr or int or whatever. + virtual Value castComm(const Location loc, + ConversionPatternRewriter &rewriter, Value comm) = 0; + /// Get the MPI_STATUS_IGNORE value (typically a pointer type). 
virtual intptr_t getStatusIgnore() = 0; @@ -139,10 +147,15 @@ class MPICHImplTraits : public MPIImplTraits { Value getCommWorld(const Location loc, ConversionPatternRewriter &rewriter) override { static constexpr int MPI_COMM_WORLD = 0x44000000; - return rewriter.create(loc, rewriter.getI32Type(), + return rewriter.create(loc, rewriter.getI64Type(), MPI_COMM_WORLD); } + Value castComm(const Location loc, ConversionPatternRewriter &rewriter, + Value comm) override { + return rewriter.create(loc, rewriter.getI32Type(), comm); + } + intptr_t getStatusIgnore() override { return 1; } Value getDataType(const Location loc, ConversionPatternRewriter &rewriter, @@ -256,9 +269,16 @@ class OMPIImplTraits : public MPIImplTraits { getOrDefineExternalStruct(loc, rewriter, name, commStructT); // get address of symbol - return rewriter.create( + auto comm = rewriter.create( loc, LLVM::LLVMPointerType::get(context), SymbolRefAttr::get(context, name)); + return rewriter.create(loc, rewriter.getI64Type(), comm); + } + + Value castComm(const Location loc, ConversionPatternRewriter &rewriter, + Value comm) override { + return rewriter.create( + loc, LLVM::LLVMPointerType::get(rewriter.getContext()), comm); } intptr_t getStatusIgnore() override { return 0; } @@ -440,6 +460,78 @@ struct FinalizeOpLowering : public ConvertOpToLLVMPattern { } }; +//===----------------------------------------------------------------------===// +// CommWorldOpLowering +//===----------------------------------------------------------------------===// + +struct CommWorldOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(mpi::CommWorldOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // grab a reference to the global module op: + auto moduleOp = op->getParentOfType(); + auto mpiTraits = MPIImplTraits::get(moduleOp); + // get MPI_COMM_WORLD + rewriter.replaceOp(op, mpiTraits->getCommWorld(op.getLoc(), rewriter)); + + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// CommSplitOpLowering +//===----------------------------------------------------------------------===// + +struct CommSplitOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(mpi::CommSplitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // grab a reference to the global module op: + auto moduleOp = op->getParentOfType(); + auto mpiTraits = MPIImplTraits::get(moduleOp); + Type i32 = rewriter.getI32Type(); + Type ptrType = LLVM::LLVMPointerType::get(op->getContext()); + Location loc = op.getLoc(); + + // get communicator + Value comm = mpiTraits->castComm(loc, rewriter, adaptor.getComm()); + auto one = rewriter.create(loc, i32, 1); + auto outPtr = + rewriter.create(loc, ptrType, comm.getType(), one); + + // int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm * newcomm) + auto funcType = + LLVM::LLVMFunctionType::get(i32, {comm.getType(), i32, i32, ptrType}); + // get or create function declaration: + LLVM::LLVMFuncOp funcDecl = getOrDefineFunction(moduleOp, loc, rewriter, + "MPI_Comm_split", funcType); + + auto callOp = rewriter.create( + loc, funcDecl, + ValueRange{comm, adaptor.getColor(), adaptor.getKey(), + outPtr.getRes()}); + + // load the communicator into a register + auto res = rewriter.create(loc, i32, outPtr.getResult()); + + // if retval is checked, replace 
uses of retval with the results from the + // call op + SmallVector replacements; + if (op.getRetval()) + replacements.push_back(callOp.getResult()); + + // replace op + replacements.push_back(res.getRes()); + rewriter.replaceOp(op, replacements); + + return success(); + } +}; + //===----------------------------------------------------------------------===// // CommRankOpLowering //===----------------------------------------------------------------------===// @@ -462,21 +554,21 @@ struct CommRankOpLowering : public ConvertOpToLLVMPattern { auto moduleOp = op->getParentOfType(); auto mpiTraits = MPIImplTraits::get(moduleOp); - // get MPI_COMM_WORLD - Value commWorld = mpiTraits->getCommWorld(loc, rewriter); + // get communicator + Value comm = mpiTraits->castComm(loc, rewriter, adaptor.getComm()); // LLVM Function type representing `i32 MPI_Comm_rank(ptr, ptr)` auto rankFuncType = - LLVM::LLVMFunctionType::get(i32, {commWorld.getType(), ptrType}); + LLVM::LLVMFunctionType::get(i32, {comm.getType(), ptrType}); // get or create function declaration: LLVM::LLVMFuncOp initDecl = getOrDefineFunction( moduleOp, loc, rewriter, "MPI_Comm_rank", rankFuncType); - // replace init with function call + // replace with function call auto one = rewriter.create(loc, i32, 1); auto rankptr = rewriter.create(loc, ptrType, i32, one); auto callOp = rewriter.create( - loc, initDecl, ValueRange{commWorld, rankptr.getRes()}); + loc, initDecl, ValueRange{comm, rankptr.getRes()}); // load the rank into a register auto loadedRank = @@ -523,12 +615,12 @@ struct SendOpLowering : public ConvertOpToLLVMPattern { getRawPtrAndSize(loc, rewriter, adaptor.getRef(), elemType); auto mpiTraits = MPIImplTraits::get(moduleOp); Value dataType = mpiTraits->getDataType(loc, rewriter, elemType); - Value commWorld = mpiTraits->getCommWorld(loc, rewriter); + Value comm = mpiTraits->castComm(loc, rewriter, adaptor.getComm()); // LLVM Function type representing `i32 MPI_send(data, count, datatype, dst, // tag, comm)` auto funcType = LLVM::LLVMFunctionType::get( - i32, {ptrType, i32, dataType.getType(), i32, i32, commWorld.getType()}); + i32, {ptrType, i32, dataType.getType(), i32, i32, comm.getType()}); // get or create function declaration: LLVM::LLVMFuncOp funcDecl = getOrDefineFunction(moduleOp, loc, rewriter, "MPI_Send", funcType); @@ -537,7 +629,7 @@ struct SendOpLowering : public ConvertOpToLLVMPattern { auto funcCall = rewriter.create( loc, funcDecl, ValueRange{dataPtr, size, dataType, adaptor.getDest(), adaptor.getTag(), - commWorld}); + comm}); if (op.getRetval()) rewriter.replaceOp(op, funcCall.getResult()); else @@ -575,7 +667,7 @@ struct RecvOpLowering : public ConvertOpToLLVMPattern { getRawPtrAndSize(loc, rewriter, adaptor.getRef(), elemType); auto mpiTraits = MPIImplTraits::get(moduleOp); Value dataType = mpiTraits->getDataType(loc, rewriter, elemType); - Value commWorld = mpiTraits->getCommWorld(loc, rewriter); + Value comm = mpiTraits->castComm(loc, rewriter, adaptor.getComm()); Value statusIgnore = rewriter.create( loc, i64, mpiTraits->getStatusIgnore()); statusIgnore = @@ -585,7 +677,7 @@ struct RecvOpLowering : public ConvertOpToLLVMPattern { // tag, comm)` auto funcType = LLVM::LLVMFunctionType::get(i32, {ptrType, i32, dataType.getType(), i32, - i32, commWorld.getType(), ptrType}); + i32, comm.getType(), ptrType}); // get or create function declaration: LLVM::LLVMFuncOp funcDecl = getOrDefineFunction(moduleOp, loc, rewriter, "MPI_Recv", funcType); @@ -594,7 +686,7 @@ struct RecvOpLowering : public 
ConvertOpToLLVMPattern { auto funcCall = rewriter.create( loc, funcDecl, ValueRange{dataPtr, size, dataType, adaptor.getSource(), - adaptor.getTag(), commWorld, statusIgnore}); + adaptor.getTag(), comm, statusIgnore}); if (op.getRetval()) rewriter.replaceOp(op, funcCall.getResult()); else @@ -629,7 +721,8 @@ struct AllReduceOpLowering : public ConvertOpToLLVMPattern { getRawPtrAndSize(loc, rewriter, adaptor.getRecvbuf(), elemType); Value dataType = mpiTraits->getDataType(loc, rewriter, elemType); Value mpiOp = mpiTraits->getMPIOp(loc, rewriter, op.getOp()); - Value commWorld = mpiTraits->getCommWorld(loc, rewriter); + Value commWorld = mpiTraits->castComm(loc, rewriter, adaptor.getComm()); + // 'int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, // MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)' auto funcType = LLVM::LLVMFunctionType::get( @@ -676,8 +769,15 @@ struct FuncToLLVMDialectInterface : public ConvertToLLVMPatternInterface { void mpi::populateMPIToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { - patterns.add(converter); + // Using i64 as a portable, intermediate type for !mpi.comm. + // It would be nicer to somehow get the right type directly, but TLDI is not + // available here. + converter.addConversion([](mpi::CommType type) { + return IntegerType::get(type.getContext(), 64); + }); + patterns.add(converter); } void mpi::registerConvertMPIToLLVMInterface(DialectRegistry ®istry) { diff --git a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp index 87c2938e4e52b..cafbf835de22f 100644 --- a/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp +++ b/mlir/lib/Conversion/MeshToMPI/MeshToMPI.cpp @@ -310,11 +310,16 @@ class ConvertProcessLinearIndexOp } // Otherwise call create mpi::CommRankOp - auto rank = rewriter - .create( - loc, TypeRange{mpi::RetvalType::get(op->getContext()), - rewriter.getI32Type()}) - .getRank(); + auto ctx = op.getContext(); + Value commWorld = + rewriter.create(loc, mpi::CommType::get(ctx)); + auto rank = + rewriter + .create( + loc, + TypeRange{mpi::RetvalType::get(ctx), rewriter.getI32Type()}, + commWorld) + .getRank(); rewriter.replaceOpWithNewOp(op, rewriter.getIndexType(), rank); return success(); @@ -652,6 +657,9 @@ struct ConvertUpdateHaloOp : public OpConversionPattern { auto upperSendOffset = rewriter.create( loc, upperRecvOffset, toValue(haloSizes[currHaloDim * 2])); + Value commWorld = rewriter.create( + loc, mpi::CommType::get(op->getContext())); + // Make sure we send/recv in a way that does not lead to a dead-lock. // The current approach is by far not optimal, this should be at least // be a red-black pattern or using MPI_sendrecv. @@ -680,7 +688,8 @@ struct ConvertUpdateHaloOp : public OpConversionPattern { auto subview = builder.create( loc, array, offsets, dimSizes, strides); builder.create(loc, subview, buffer); - builder.create(loc, TypeRange{}, buffer, tag, to); + builder.create(loc, TypeRange{}, buffer, tag, to, + commWorld); builder.create(loc); }); // if has neighbor: receive halo data into buffer and copy to array @@ -688,7 +697,8 @@ struct ConvertUpdateHaloOp : public OpConversionPattern { loc, hasFrom, [&](OpBuilder &builder, Location loc) { offsets[dim] = upperHalo ? 
OpFoldResult(upperRecvOffset) : OpFoldResult(lowerRecvOffset); - builder.create(loc, TypeRange{}, buffer, tag, from); + builder.create(loc, TypeRange{}, buffer, tag, from, + commWorld); auto subview = builder.create( loc, array, offsets, dimSizes, strides); builder.create(loc, buffer, subview); diff --git a/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir b/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir index b630ce3a23f30..174f7c79b9d50 100644 --- a/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir +++ b/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir @@ -3,6 +3,7 @@ // COM: Test MPICH ABI // CHECK: module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: llvm.func @MPI_Finalize() -> i32 +// CHECK: llvm.func @MPI_Comm_split(i32, i32, i32, !llvm.ptr) -> i32 // CHECK: llvm.func @MPI_Recv(!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32 // CHECK: llvm.func @MPI_Send(!llvm.ptr, i32, i32, i32, i32, i32) -> i32 // CHECK: llvm.func @MPI_Comm_rank(i32, !llvm.ptr) -> i32 @@ -22,11 +23,14 @@ module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: [[v7:%.*]] = llvm.call @MPI_Init([[v6]], [[v6]]) : (!llvm.ptr, !llvm.ptr) -> i32 %0 = mpi.init : !mpi.retval - // CHECK: [[v8:%.*]] = llvm.mlir.constant(1140850688 : i32) : i32 + // CHECK: [[comm:%.*]] = llvm.mlir.constant(1140850688 : i64) : i64 + %comm = mpi.comm_world : !mpi.comm + + // CHECK: [[v8:%.*]] = llvm.trunc [[comm]] : i64 to i32 // CHECK: [[v9:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: [[v10:%.*]] = llvm.alloca [[v9]] x i32 : (i32) -> !llvm.ptr // CHECK: [[v11:%.*]] = llvm.call @MPI_Comm_rank([[v8]], [[v10]]) : (i32, !llvm.ptr) -> i32 - %retval, %rank = mpi.comm_rank : !mpi.retval, i32 + %retval, %rank = mpi.comm_rank(%comm) : !mpi.retval, i32 // CHECK: [[v12:%.*]] = llvm.load [[v10]] : !llvm.ptr -> i32 // CHECK: [[v13:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -35,9 +39,9 @@ module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: [[v16:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v17:%.*]] = llvm.trunc [[v16]] : i64 to i32 // CHECK: [[v18:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 - // CHECK: [[v19:%.*]] = llvm.mlir.constant(1140850688 : i32) : i32 - // CHECK: [[v20:%.*]] = llvm.call @MPI_Send([[v15]], [[v17]], [[v18]], [[v12]], [[v12]], [[v19]]) : (!llvm.ptr, i32, i32, i32, i32, i32) -> i32 - mpi.send(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 + // CHECK: [[comm_1:%.*]] = llvm.trunc [[comm]] : i64 to i32 + // CHECK: [[v20:%.*]] = llvm.call @MPI_Send([[v15]], [[v17]], [[v18]], [[v12]], [[v12]], [[comm_1]]) : (!llvm.ptr, i32, i32, i32, i32, i32) -> i32 + mpi.send(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 // CHECK: [[v21:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v22:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -45,9 +49,9 @@ module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: [[v24:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v25:%.*]] = llvm.trunc [[v24]] : i64 to i32 // CHECK: [[v26:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 - // CHECK: [[v27:%.*]] = llvm.mlir.constant(1140850688 : i32) : i32 - // CHECK: [[v28:%.*]] = llvm.call @MPI_Send([[v23]], [[v25]], [[v26]], 
[[v12]], [[v12]], [[v27]]) : (!llvm.ptr, i32, i32, i32, i32, i32) -> i32 - %1 = mpi.send(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval + // CHECK: [[comm_2:%.*]] = llvm.trunc [[comm]] : i64 to i32 + // CHECK: [[v28:%.*]] = llvm.call @MPI_Send([[v23]], [[v25]], [[v26]], [[v12]], [[v12]], [[comm_2]]) : (!llvm.ptr, i32, i32, i32, i32, i32) -> i32 + %1 = mpi.send(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval // CHECK: [[v29:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v30:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -55,11 +59,11 @@ module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: [[v32:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v33:%.*]] = llvm.trunc [[v32]] : i64 to i32 // CHECK: [[v34:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 - // CHECK: [[v35:%.*]] = llvm.mlir.constant(1140850688 : i32) : i32 + // CHECK: [[comm_3:%.*]] = llvm.trunc [[comm]] : i64 to i32 // CHECK: [[v36:%.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: [[v37:%.*]] = llvm.inttoptr [[v36]] : i64 to !llvm.ptr - // CHECK: [[v38:%.*]] = llvm.call @MPI_Recv([[v31]], [[v33]], [[v34]], [[v12]], [[v12]], [[v35]], [[v37]]) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32 - mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 + // CHECK: [[v38:%.*]] = llvm.call @MPI_Recv([[v31]], [[v33]], [[v34]], [[v12]], [[v12]], [[comm_3]], [[v37]]) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32 + mpi.recv(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 // CHECK: [[v39:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v40:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -67,27 +71,38 @@ module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: [[v42:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v43:%.*]] = llvm.trunc [[v42]] : i64 to i32 // CHECK: [[v44:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 - // CHECK: [[v45:%.*]] = llvm.mlir.constant(1140850688 : i32) : i32 + // CHECK: [[comm_4:%.*]] = llvm.trunc [[comm]] : i64 to i32 // CHECK: [[v46:%.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: [[v47:%.*]] = llvm.inttoptr [[v46]] : i64 to !llvm.ptr - // CHECK: [[v48:%.*]] = llvm.call @MPI_Recv([[v41]], [[v43]], [[v44]], [[v12]], [[v12]], [[v45]], [[v47]]) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32 - %2 = mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval - - // CHECK: [[v49:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: [[v50:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: [[v51:%.*]] = llvm.getelementptr [[v49]][[[v50]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - // CHECK: [[v52:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: [[v53:%.*]] = llvm.trunc [[v52]] : i64 to i32 - // CHECK: [[v54:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: [[v55:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x 
i64>)> - // CHECK: [[v56:%.*]] = llvm.getelementptr [[v54]][[[v55]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - // CHECK: [[v57:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> - // CHECK: [[v58:%.*]] = llvm.trunc [[v57]] : i64 to i32 - // CHECK: [[v59:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 - // CHECK: [[v60:%.*]] = llvm.mlir.constant(1476395011 : i32) : i32 - // CHECK: [[v61:%.*]] = llvm.mlir.constant(1140850688 : i32) : i32 - // CHECK: [[v62:%.*]] = llvm.call @MPI_Allreduce([[v51]], [[v56]], [[v53]], [[v59]], [[v60]], [[v61]]) : (!llvm.ptr, !llvm.ptr, i32, i32, i32, i32) -> i32 - mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> + // CHECK: [[v48:%.*]] = llvm.call @MPI_Recv([[v41]], [[v43]], [[v44]], [[v12]], [[v12]], [[comm_4]], [[v47]]) : (!llvm.ptr, i32, i32, i32, i32, i32, !llvm.ptr) -> i32 + %2 = mpi.recv(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval + + // CHECK: [[v51:%.*]] = llvm.mlir.constant(10 : i32) : i32 + %color = arith.constant 10 : i32 + // CHECK: [[v52:%.*]] = llvm.mlir.constant(22 : i32) : i32 + %key = arith.constant 22 : i32 + // CHECK: [[v53:%.*]] = llvm.trunc [[comm]] : i64 to i32 + // CHECK: [[v54:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: [[v55:%.*]] = llvm.alloca [[v54]] x i32 : (i32) -> !llvm.ptr + // CHECK: [[v56:%.*]] = llvm.call @MPI_Comm_split([[v53]], [[v51]], [[v52]], [[v55]]) : (i32, i32, i32, !llvm.ptr) -> i32 + // CHECK: [[v57:%.*]] = llvm.load [[v55]] : !llvm.ptr -> i32 + %split = mpi.comm_split(%comm, %color, %key) : !mpi.comm + + // CHECK: [[v59:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v60:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v61:%.*]] = llvm.getelementptr [[v59]][[[v60]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + // CHECK: [[v62:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v63:%.*]] = llvm.trunc [[v62]] : i64 to i32 + // CHECK: [[v64:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v65:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v66:%.*]] = llvm.getelementptr [[v64]][[[v65]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + // CHECK: [[v67:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // CHECK: [[v68:%.*]] = llvm.trunc [[v67]] : i64 to i32 + // CHECK: [[v69:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 + // CHECK: [[v70:%.*]] = llvm.mlir.constant(1476395011 : i32) : i32 + // CHECK: [[v71:%.*]] = llvm.trunc [[comm]] : i64 to i32 + // CHECK: [[v72:%.*]] = llvm.call @MPI_Allreduce([[v61]], [[v66]], [[v63]], [[v69]], [[v70]], [[v71]]) : (!llvm.ptr, !llvm.ptr, i32, i32, i32, i32) -> i32 + mpi.allreduce(%arg0, %arg0, MPI_SUM, %comm) : memref<100xf32>, memref<100xf32> // CHECK: llvm.call @MPI_Finalize() : () -> i32 %3 = mpi.finalize : !mpi.retval @@ -101,6 +116,7 @@ module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // COM: Test OpenMPI ABI // CHECK: module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI">} { // CHECK: llvm.func @MPI_Finalize() -> i32 +// CHECK: llvm.func @MPI_Comm_split(!llvm.ptr, i32, i32, !llvm.ptr) -> i32 // CHECK: llvm.func @MPI_Recv(!llvm.ptr, i32, !llvm.ptr, i32, i32, 
!llvm.ptr, !llvm.ptr) -> i32 // CHECK: llvm.func @MPI_Send(!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr) -> i32 // CHECK: llvm.mlir.global external @ompi_mpi_float() {addr_space = 0 : i32} : !llvm.struct<"ompi_predefined_datatype_t", opaque> @@ -122,11 +138,14 @@ module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v7:%.*]] = llvm.call @MPI_Init([[v6]], [[v6]]) : (!llvm.ptr, !llvm.ptr) -> i32 %0 = mpi.init : !mpi.retval + %comm = mpi.comm_world : !mpi.comm // CHECK: [[v8:%.*]] = llvm.mlir.addressof @ompi_mpi_comm_world : !llvm.ptr + // CHECK: [[comm:%.*]] = llvm.ptrtoint [[v8]] : !llvm.ptr to i64 + // CHECK: [[comm_1:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr // CHECK: [[v9:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: [[v10:%.*]] = llvm.alloca [[v9]] x i32 : (i32) -> !llvm.ptr - // CHECK: [[v11:%.*]] = llvm.call @MPI_Comm_rank([[v8]], [[v10]]) : (!llvm.ptr, !llvm.ptr) -> i32 - %retval, %rank = mpi.comm_rank : !mpi.retval, i32 + // CHECK: [[v11:%.*]] = llvm.call @MPI_Comm_rank([[comm_1]], [[v10]]) : (!llvm.ptr, !llvm.ptr) -> i32 + %retval, %rank = mpi.comm_rank(%comm) : !mpi.retval, i32 // CHECK: [[v12:%.*]] = llvm.load [[v10]] : !llvm.ptr -> i32 // CHECK: [[v13:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -135,9 +154,9 @@ module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v16:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v17:%.*]] = llvm.trunc [[v16]] : i64 to i32 // CHECK: [[v18:%.*]] = llvm.mlir.addressof @ompi_mpi_float : !llvm.ptr - // CHECK: [[v19:%.*]] = llvm.mlir.addressof @ompi_mpi_comm_world : !llvm.ptr + // CHECK: [[v19:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr // CHECK: [[v20:%.*]] = llvm.call @MPI_Send([[v15]], [[v17]], [[v18]], [[v12]], [[v12]], [[v19]]) : (!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr) -> i32 - mpi.send(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 + mpi.send(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 // CHECK: [[v21:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v22:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -145,9 +164,9 @@ module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v24:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v25:%.*]] = llvm.trunc [[v24]] : i64 to i32 // CHECK: [[v26:%.*]] = llvm.mlir.addressof @ompi_mpi_float : !llvm.ptr - // CHECK: [[v27:%.*]] = llvm.mlir.addressof @ompi_mpi_comm_world : !llvm.ptr + // CHECK: [[v27:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr // CHECK: [[v28:%.*]] = llvm.call @MPI_Send([[v23]], [[v25]], [[v26]], [[v12]], [[v12]], [[v27]]) : (!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr) -> i32 - %1 = mpi.send(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval + %1 = mpi.send(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval // CHECK: [[v29:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v30:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -155,11 +174,11 @@ module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v32:%.*]] = llvm.extractvalue [[v5]][3, 0] : 
!llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v33:%.*]] = llvm.trunc [[v32]] : i64 to i32 // CHECK: [[v34:%.*]] = llvm.mlir.addressof @ompi_mpi_float : !llvm.ptr - // CHECK: [[v35:%.*]] = llvm.mlir.addressof @ompi_mpi_comm_world : !llvm.ptr + // CHECK: [[v35:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr // CHECK: [[v36:%.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: [[v37:%.*]] = llvm.inttoptr [[v36]] : i64 to !llvm.ptr // CHECK: [[v38:%.*]] = llvm.call @MPI_Recv([[v31]], [[v33]], [[v34]], [[v12]], [[v12]], [[v35]], [[v37]]) : (!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr, !llvm.ptr) -> i32 - mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 + mpi.recv(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 // CHECK: [[v39:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v40:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -167,11 +186,11 @@ module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v42:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v43:%.*]] = llvm.trunc [[v42]] : i64 to i32 // CHECK: [[v44:%.*]] = llvm.mlir.addressof @ompi_mpi_float : !llvm.ptr - // CHECK: [[v45:%.*]] = llvm.mlir.addressof @ompi_mpi_comm_world : !llvm.ptr + // CHECK: [[v45:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr // CHECK: [[v46:%.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK: [[v47:%.*]] = llvm.inttoptr [[v46]] : i64 to !llvm.ptr // CHECK: [[v48:%.*]] = llvm.call @MPI_Recv([[v41]], [[v43]], [[v44]], [[v12]], [[v12]], [[v45]], [[v47]]) : (!llvm.ptr, i32, !llvm.ptr, i32, i32, !llvm.ptr, !llvm.ptr) -> i32 - %2 = mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval + %2 = mpi.recv(%arg0, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval // CHECK: [[v49:%.*]] = llvm.extractvalue [[v5]][1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v50:%.*]] = llvm.extractvalue [[v5]][2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> @@ -185,11 +204,22 @@ module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v58:%.*]] = llvm.trunc [[v57]] : i64 to i32 // CHECK: [[v59:%.*]] = llvm.mlir.addressof @ompi_mpi_float : !llvm.ptr // CHECK: [[v60:%.*]] = llvm.mlir.addressof @ompi_mpi_sum : !llvm.ptr - // CHECK: [[v61:%.*]] = llvm.mlir.addressof @ompi_mpi_comm_world : !llvm.ptr + // CHECK: [[v61:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr // CHECK: [[v62:%.*]] = llvm.call @MPI_Allreduce([[v51]], [[v56]], [[v53]], [[v59]], [[v60]], [[v61]]) : (!llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 - mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> + mpi.allreduce(%arg0, %arg0, MPI_SUM, %comm) : memref<100xf32>, memref<100xf32> + + // CHECK: [[v71:%.*]] = llvm.mlir.constant(10 : i32) : i32 + %color = arith.constant 10 : i32 + // CHECK: [[v72:%.*]] = llvm.mlir.constant(22 : i32) : i32 + %key = arith.constant 22 : i32 + // CHECK: [[v73:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr + // CHECK: [[v74:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: [[v75:%.*]] = llvm.alloca [[v74]] x !llvm.ptr : (i32) -> !llvm.ptr + // CHECK: [[v76:%.*]] = llvm.call @MPI_Comm_split([[v73]], [[v71]], [[v72]], [[v75]]) : (!llvm.ptr, i32, i32, !llvm.ptr) -> i32 + // CHECK: [[v77:%.*]] = llvm.load [[v75]] : !llvm.ptr -> i32 
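// (Editorial note, not part of the test: unlike the MPICH block above, which truncates the i64 communicator to i32 and allocas an i32 slot, OpenMPI models a communicator as !llvm.ptr, so MPI_Comm_split's newcomm out-parameter is allocated as a pointer here.)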
+ %split = mpi.comm_split(%comm, %color, %key) : !mpi.comm - // CHECK: [[v49:%.*]] = llvm.call @MPI_Finalize() : () -> i32 + // CHECK: llvm.call @MPI_Finalize() : () -> i32 %3 = mpi.finalize : !mpi.retval return diff --git a/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir b/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir index 4e60c6f0d4e44..23756bb66928d 100644 --- a/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir +++ b/mlir/test/Conversion/MeshToMPI/convert-mesh-to-mpi.mlir @@ -4,7 +4,7 @@ // CHECK: mesh.mesh @mesh0 mesh.mesh @mesh0(shape = 3x4x5) func.func @process_multi_index() -> (index, index, index) { - // CHECK: mpi.comm_rank : !mpi.retval, i32 + // CHECK: mpi.comm_rank // CHECK-DAG: %[[v4:.*]] = arith.remsi // CHECK-DAG: %[[v0:.*]] = arith.remsi // CHECK-DAG: %[[v1:.*]] = arith.remsi @@ -15,7 +15,7 @@ func.func @process_multi_index() -> (index, index, index) { // CHECK-LABEL: func @process_linear_index func.func @process_linear_index() -> index { - // CHECK: %[[RES:.*]], %[[rank:.*]] = mpi.comm_rank : !mpi.retval, i32 + // CHECK: %[[RES:.*]], %[[rank:.*]] = mpi.comm_rank // CHECK: %[[cast:.*]] = arith.index_cast %[[rank]] : i32 to index %0 = mesh.process_linear_index on @mesh0 : index // CHECK: return %[[cast]] : index @@ -113,17 +113,17 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 1> } { // CHECK: [[vc91_i32:%.*]] = arith.constant 91 : i32 // CHECK-NEXT: [[vc0_i32:%.*]] = arith.constant 0 : i32 // CHECK-NEXT: [[vc2_i32:%.*]] = arith.constant 2 : i32 + // CHECK-NEXT: [[v0:%.*]] = mpi.comm_world : !mpi.comm // CHECK-NEXT: [[valloc:%.*]] = memref.alloc() : memref<2x120x120xi8> - // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[varg0]][118, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8 - // CHECK-NEXT: memref.copy [[vsubview]], [[valloc]] : memref<2x120x120xi8 - // CHECK-SAME: to memref<2x120x120xi8> - // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc2_i32]]) : memref<2x120x120xi8>, i32, i32 - // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc0_i32]]) : memref<2x120x120xi8>, i32, i32 - // CHECK-NEXT: [[vsubview_0:%.*]] = memref.subview [[varg0]][0, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8 - // CHECK-NEXT: memref.copy [[valloc]], [[vsubview_0]] : memref<2x120x120xi8> to memref<2x120x120xi8 + // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[varg0]][118, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 1699200>> + // CHECK-NEXT: memref.copy [[vsubview]], [[valloc]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 1699200>> to memref<2x120x120xi8> + // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc2_i32]], [[v0]]) : memref<2x120x120xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc0_i32]], [[v0]]) : memref<2x120x120xi8>, i32, i32 + // CHECK-NEXT: [[vsubview_0:%.*]] = memref.subview [[varg0]][0, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1]>> + // CHECK-NEXT: memref.copy [[valloc]], [[vsubview_0]] : memref<2x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1]>> // CHECK-NEXT: memref.dealloc [[valloc]] : memref<2x120x120xi8> %res = mesh.update_halo %arg0 on @mesh0 split_axes = [[0]] halo_sizes = [2, 0] : memref<120x120x120xi8> - // CHECK: return [[res:%.*]] : memref<120x120x120xi8> + // CHECK: return [[varg0]] : memref<120x120x120xi8> return %res : memref<120x120x120xi8> } } @@ -140,41 +140,44 @@ module 
attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 24> } { // CHECK-NEXT: [[vc91_i32:%.*]] = arith.constant 91 : i32 // CHECK-NEXT: [[vc4_i32:%.*]] = arith.constant 4 : i32 // CHECK-NEXT: [[vc44_i32:%.*]] = arith.constant 44 : i32 + // CHECK-NEXT: [[v0:%.*]] = mpi.comm_world : !mpi.comm // CHECK-NEXT: [[valloc:%.*]] = memref.alloc() : memref<117x113x5xi8> // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[varg0]][1, 3, 109] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> // CHECK-NEXT: memref.copy [[vsubview]], [[valloc]] : memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> to memref<117x113x5xi8> - // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]]) : memref<117x113x5xi8>, i32, i32 - // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]]) : memref<117x113x5xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]], [[v0]]) : memref<117x113x5xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]], [[v0]]) : memref<117x113x5xi8>, i32, i32 // CHECK-NEXT: [[vsubview_0:%.*]] = memref.subview [[varg0]][1, 3, 0] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>> // CHECK-NEXT: memref.copy [[valloc]], [[vsubview_0]] : memref<117x113x5xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>> // CHECK-NEXT: memref.dealloc [[valloc]] : memref<117x113x5xi8> // CHECK-NEXT: [[valloc_1:%.*]] = memref.alloc() : memref<117x113x6xi8> // CHECK-NEXT: [[vsubview_2:%.*]] = memref.subview [[varg0]][1, 3, 5] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> // CHECK-NEXT: memref.copy [[vsubview_2]], [[valloc_1]] : memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> to memref<117x113x6xi8> - // CHECK-NEXT: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]]) : memref<117x113x6xi8>, i32, i32 - // CHECK-NEXT: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]]) : memref<117x113x6xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]], [[v0]]) : memref<117x113x6xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]], [[v0]]) : memref<117x113x6xi8>, i32, i32 // CHECK-NEXT: [[vsubview_3:%.*]] = memref.subview [[varg0]][1, 3, 114] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>> // CHECK-NEXT: memref.copy [[valloc_1]], [[vsubview_3]] : memref<117x113x6xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>> // CHECK-NEXT: memref.dealloc [[valloc_1]] : memref<117x113x6xi8> + // CHECK-NEXT: [[v1:%.*]] = mpi.comm_world : !mpi.comm // CHECK-NEXT: [[valloc_4:%.*]] = memref.alloc() : memref<117x3x120xi8> // CHECK-NEXT: [[vsubview_5:%.*]] = memref.subview [[varg0]][1, 113, 0] [117, 3, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> // CHECK-NEXT: memref.copy [[vsubview_5]], [[valloc_4]] : memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> to memref<117x3x120xi8> - // CHECK-NEXT: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]]) : memref<117x3x120xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]], [[v1]]) : memref<117x3x120xi8>, i32, i32 // CHECK-NEXT: memref.dealloc [[valloc_4]] : memref<117x3x120xi8> // CHECK-NEXT: [[valloc_6:%.*]] = memref.alloc() : memref<117x4x120xi8> - // CHECK-NEXT: mpi.recv([[valloc_6]], 
[[vc91_i32]], [[vc29_i32]]) : memref<117x4x120xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc_6]], [[vc91_i32]], [[vc29_i32]], [[v1]]) : memref<117x4x120xi8>, i32, i32 // CHECK-NEXT: [[vsubview_7:%.*]] = memref.subview [[varg0]][1, 116, 0] [117, 4, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>> // CHECK-NEXT: memref.copy [[valloc_6]], [[vsubview_7]] : memref<117x4x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>> // CHECK-NEXT: memref.dealloc [[valloc_6]] : memref<117x4x120xi8> + // CHECK-NEXT: [[v2:%.*]] = mpi.comm_world : !mpi.comm // CHECK-NEXT: [[valloc_8:%.*]] = memref.alloc() : memref<1x120x120xi8> - // CHECK-NEXT: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]]) : memref<1x120x120xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]], [[v2]]) : memref<1x120x120xi8>, i32, i32 // CHECK-NEXT: [[vsubview_9:%.*]] = memref.subview [[varg0]][0, 0, 0] [1, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>> // CHECK-NEXT: memref.copy [[valloc_8]], [[vsubview_9]] : memref<1x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>> // CHECK-NEXT: memref.dealloc [[valloc_8]] : memref<1x120x120xi8> // CHECK-NEXT: [[valloc_10:%.*]] = memref.alloc() : memref<2x120x120xi8> // CHECK-NEXT: [[vsubview_11:%.*]] = memref.subview [[varg0]][1, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> // CHECK-NEXT: memref.copy [[vsubview_11]], [[valloc_10]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> to memref<2x120x120xi8> - // CHECK-NEXT: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]]) : memref<2x120x120xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]], [[v2]]) : memref<2x120x120xi8>, i32, i32 // CHECK-NEXT: memref.dealloc [[valloc_10]] : memref<2x120x120xi8> %res = mesh.update_halo %arg0 on @mesh0 split_axes = [[2], [1], [0]] halo_sizes = [1, 2, 3, 4, 5, 6] : memref<120x120x120xi8> // CHECK: return [[varg0]] : memref<120x120x120xi8> @@ -191,45 +194,48 @@ module attributes { mpi.dlti = #dlti.map<"MPI:comm_world_rank" = 24> } { // CHECK-NEXT: [[vc4_i32:%.*]] = arith.constant 4 : i32 // CHECK-NEXT: [[vc91_i32:%.*]] = arith.constant 91 : i32 // CHECK-NEXT: [[v0:%.*]] = bufferization.to_memref [[varg0]] : tensor<120x120x120xi8> to memref<120x120x120xi8> + // CHECK-NEXT: [[v1:%.*]] = mpi.comm_world : !mpi.comm // CHECK-NEXT: [[valloc:%.*]] = memref.alloc() : memref<117x113x5xi8> // CHECK-NEXT: [[vsubview:%.*]] = memref.subview [[v0]][1, 3, 109] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> // CHECK-NEXT: memref.copy [[vsubview]], [[valloc]] : memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14869>> to memref<117x113x5xi8> - // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]]) : memref<117x113x5xi8>, i32, i32 - // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]]) : memref<117x113x5xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc]], [[vc91_i32]], [[vc44_i32]], [[v1]]) : memref<117x113x5xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc]], [[vc91_i32]], [[vc4_i32]], [[v1]]) : memref<117x113x5xi8>, i32, i32 // CHECK-NEXT: [[vsubview_0:%.*]] = memref.subview [[v0]][1, 3, 0] [117, 113, 5] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>> // CHECK-NEXT: memref.copy [[valloc]], [[vsubview_0]] : 
memref<117x113x5xi8> to memref<117x113x5xi8, strided<[14400, 120, 1], offset: 14760>> // CHECK-NEXT: memref.dealloc [[valloc]] : memref<117x113x5xi8> // CHECK-NEXT: [[valloc_1:%.*]] = memref.alloc() : memref<117x113x6xi8> // CHECK-NEXT: [[vsubview_2:%.*]] = memref.subview [[v0]][1, 3, 5] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> // CHECK-NEXT: memref.copy [[vsubview_2]], [[valloc_1]] : memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14765>> to memref<117x113x6xi8> - // CHECK-NEXT: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]]) : memref<117x113x6xi8>, i32, i32 - // CHECK-NEXT: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]]) : memref<117x113x6xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc_1]], [[vc91_i32]], [[vc4_i32]], [[v1]]) : memref<117x113x6xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc_1]], [[vc91_i32]], [[vc44_i32]], [[v1]]) : memref<117x113x6xi8>, i32, i32 // CHECK-NEXT: [[vsubview_3:%.*]] = memref.subview [[v0]][1, 3, 114] [117, 113, 6] [1, 1, 1] : memref<120x120x120xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>> // CHECK-NEXT: memref.copy [[valloc_1]], [[vsubview_3]] : memref<117x113x6xi8> to memref<117x113x6xi8, strided<[14400, 120, 1], offset: 14874>> // CHECK-NEXT: memref.dealloc [[valloc_1]] : memref<117x113x6xi8> + // CHECK-NEXT: [[v2:%.*]] = mpi.comm_world : !mpi.comm // CHECK-NEXT: [[valloc_4:%.*]] = memref.alloc() : memref<117x3x120xi8> // CHECK-NEXT: [[vsubview_5:%.*]] = memref.subview [[v0]][1, 113, 0] [117, 3, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> // CHECK-NEXT: memref.copy [[vsubview_5]], [[valloc_4]] : memref<117x3x120xi8, strided<[14400, 120, 1], offset: 27960>> to memref<117x3x120xi8> - // CHECK-NEXT: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]]) : memref<117x3x120xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc_4]], [[vc91_i32]], [[vc29_i32]], [[v2]]) : memref<117x3x120xi8>, i32, i32 // CHECK-NEXT: memref.dealloc [[valloc_4]] : memref<117x3x120xi8> // CHECK-NEXT: [[valloc_6:%.*]] = memref.alloc() : memref<117x4x120xi8> - // CHECK-NEXT: mpi.recv([[valloc_6]], [[vc91_i32]], [[vc29_i32]]) : memref<117x4x120xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc_6]], [[vc91_i32]], [[vc29_i32]], [[v2]]) : memref<117x4x120xi8>, i32, i32 // CHECK-NEXT: [[vsubview_7:%.*]] = memref.subview [[v0]][1, 116, 0] [117, 4, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>> // CHECK-NEXT: memref.copy [[valloc_6]], [[vsubview_7]] : memref<117x4x120xi8> to memref<117x4x120xi8, strided<[14400, 120, 1], offset: 28320>> // CHECK-NEXT: memref.dealloc [[valloc_6]] : memref<117x4x120xi8> + // CHECK-NEXT: [[v3:%.*]] = mpi.comm_world : !mpi.comm // CHECK-NEXT: [[valloc_8:%.*]] = memref.alloc() : memref<1x120x120xi8> - // CHECK-NEXT: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]]) : memref<1x120x120xi8>, i32, i32 + // CHECK-NEXT: mpi.recv([[valloc_8]], [[vc91_i32]], [[vc23_i32]], [[v3]]) : memref<1x120x120xi8>, i32, i32 // CHECK-NEXT: [[vsubview_9:%.*]] = memref.subview [[v0]][0, 0, 0] [1, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>> // CHECK-NEXT: memref.copy [[valloc_8]], [[vsubview_9]] : memref<1x120x120xi8> to memref<1x120x120xi8, strided<[14400, 120, 1]>> // CHECK-NEXT: memref.dealloc [[valloc_8]] : memref<1x120x120xi8> // CHECK-NEXT: [[valloc_10:%.*]] = memref.alloc() : memref<2x120x120xi8> // CHECK-NEXT: 
[[vsubview_11:%.*]] = memref.subview [[v0]][1, 0, 0] [2, 120, 120] [1, 1, 1] : memref<120x120x120xi8> to memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> // CHECK-NEXT: memref.copy [[vsubview_11]], [[valloc_10]] : memref<2x120x120xi8, strided<[14400, 120, 1], offset: 14400>> to memref<2x120x120xi8> - // CHECK-NEXT: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]]) : memref<2x120x120xi8>, i32, i32 + // CHECK-NEXT: mpi.send([[valloc_10]], [[vc91_i32]], [[vc23_i32]], [[v3]]) : memref<2x120x120xi8>, i32, i32 // CHECK-NEXT: memref.dealloc [[valloc_10]] : memref<2x120x120xi8> - // CHECK-NEXT: [[v1:%.*]] = bufferization.to_tensor [[v0]] restrict writable : memref<120x120x120xi8> to tensor<120x120x120xi8> + // CHECK-NEXT: [[v4:%.*]] = bufferization.to_tensor [[v0]] restrict writable : memref<120x120x120xi8> to tensor<120x120x120xi8> %res = mesh.update_halo %arg0 on @mesh0 split_axes = [[2], [1], [0]] halo_sizes = [1, 2, 3, 4, 5, 6] : tensor<120x120x120xi8> - // CHECK: return [[v1]] : tensor<120x120x120xi8> + // CHECK-NEXT: return [[v4]] : tensor<120x120x120xi8> return %res : tensor<120x120x120xi8> } } diff --git a/mlir/test/Dialect/MPI/mpiops.mlir b/mlir/test/Dialect/MPI/mpiops.mlir index fb4333611a246..ef457628fe2c4 100644 --- a/mlir/test/Dialect/MPI/mpiops.mlir +++ b/mlir/test/Dialect/MPI/mpiops.mlir @@ -1,66 +1,83 @@ // RUN: mlir-opt %s | mlir-opt | FileCheck %s +// CHECK-LABEL: func.func @mpi_test( +// CHECK-SAME: [[varg0:%.*]]: memref<100xf32>) { func.func @mpi_test(%ref : memref<100xf32>) -> () { // Note: the !mpi.retval result is optional on all operations except mpi.error_class - // CHECK: %0 = mpi.init : !mpi.retval + // CHECK-NEXT: [[v0:%.*]] = mpi.init : !mpi.retval %err = mpi.init : !mpi.retval - // CHECK-NEXT: %retval, %rank = mpi.comm_rank : !mpi.retval, i32 - %retval, %rank = mpi.comm_rank : !mpi.retval, i32 + // CHECK-NEXT: [[v1:%.*]] = mpi.comm_world : !mpi.comm + %comm = mpi.comm_world : !mpi.comm - // CHECK-NEXT: %retval_0, %size = mpi.comm_size : !mpi.retval, i32 - %retval_0, %size = mpi.comm_size : !mpi.retval, i32 + // CHECK-NEXT: [[vrank:%.*]] = mpi.comm_rank([[v1]]) : i32 + %rank = mpi.comm_rank(%comm) : i32 - // CHECK-NEXT: mpi.send(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 - mpi.send(%ref, %rank, %rank) : memref<100xf32>, i32, i32 + // CHECK-NEXT: [[vretval:%.*]], [[vrank_0:%.*]] = mpi.comm_rank([[v1]]) : !mpi.retval, i32 + %retval, %rank_1 = mpi.comm_rank(%comm) : !mpi.retval, i32 - // CHECK-NEXT: %1 = mpi.send(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval - %err2 = mpi.send(%ref, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval + // CHECK-NEXT: [[vsize:%.*]] = mpi.comm_size([[v1]]) : i32 + %size = mpi.comm_size(%comm) : i32 - // CHECK-NEXT: mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 - mpi.recv(%ref, %rank, %rank) : memref<100xf32>, i32, i32 + // CHECK-NEXT: [[vretval_1:%.*]], [[vsize_2:%.*]] = mpi.comm_size([[v1]]) : !mpi.retval, i32 + %retval_0, %size_1 = mpi.comm_size(%comm) : !mpi.retval, i32 - // CHECK-NEXT: %2 = mpi.recv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval - %err3 = mpi.recv(%ref, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval + // CHECK-NEXT: [[vnewcomm:%.*]] = mpi.comm_split([[v1]], [[vrank]], [[vrank]]) : !mpi.comm + %new_comm = mpi.comm_split(%comm, %rank, %rank) : !mpi.comm - // CHECK-NEXT: %req = mpi.isend(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.request - %req = mpi.isend(%ref, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.request + // 
CHECK-NEXT: [[vretval_3:%.*]], [[vnewcomm_4:%.*]] = mpi.comm_split([[v1]], [[vrank]], [[vrank]]) : !mpi.retval, !mpi.comm + %retval_1, %new_comm_1 = mpi.comm_split(%comm, %rank, %rank) : !mpi.retval, !mpi.comm - // CHECK-NEXT: %retval_1, %req_2 = mpi.isend(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request - %err4, %req2 = mpi.isend(%ref, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request + // CHECK-NEXT: mpi.send([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 + mpi.send(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 - // CHECK-NEXT: %req_3 = mpi.irecv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.request - %req3 = mpi.irecv(%ref, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.request + // CHECK-NEXT: [[v2:%.*]] = mpi.send([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 -> !mpi.retval + %retval_2 = mpi.send(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval - // CHECK-NEXT: %retval_4, %req_5 = mpi.irecv(%arg0, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request - %err5, %req4 = mpi.irecv(%ref, %rank, %rank) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request + // CHECK-NEXT: mpi.recv([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 + mpi.recv(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 - // CHECK-NEXT: mpi.wait(%req) : !mpi.request - mpi.wait(%req) : !mpi.request + // CHECK-NEXT: [[v3:%.*]] = mpi.recv([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 -> !mpi.retval + %retval_3 = mpi.recv(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval - // CHECK-NEXT: %3 = mpi.wait(%req_2) : !mpi.request -> !mpi.retval + // CHECK-NEXT: [[vretval_5:%.*]], [[vreq:%.*]] = mpi.isend([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request + %err4, %req2 = mpi.isend(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request + + // CHECK-NEXT: [[vreq_6:%.*]] = mpi.isend([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 -> !mpi.request + %req1 = mpi.isend(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.request + + // CHECK-NEXT: [[vreq_7:%.*]] = mpi.irecv([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 -> !mpi.request + %req3 = mpi.irecv(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.request + + // CHECK-NEXT: [[vretval_8:%.*]], [[vreq_9:%.*]] = mpi.irecv([[varg0]], [[vrank]], [[vrank]], [[v1]]) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request + %err5, %req4 = mpi.irecv(%ref, %rank, %rank, %comm) : memref<100xf32>, i32, i32 -> !mpi.retval, !mpi.request + + // CHECK-NEXT: mpi.wait([[vreq_9]]) : !mpi.request + mpi.wait(%req4) : !mpi.request + + // CHECK-NEXT: [[v4:%.*]] = mpi.wait([[vreq]]) : !mpi.request -> !mpi.retval %err6 = mpi.wait(%req2) : !mpi.request -> !mpi.retval - // CHECK-NEXT: mpi.barrier : !mpi.retval - mpi.barrier : !mpi.retval + // CHECK-NEXT: mpi.barrier([[v1]]) + mpi.barrier(%comm) - // CHECK-NEXT: %5 = mpi.barrier : !mpi.retval - %err7 = mpi.barrier : !mpi.retval + // CHECK-NEXT: [[v5:%.*]] = mpi.barrier([[v1]]) -> !mpi.retval + %err7 = mpi.barrier(%comm) -> !mpi.retval - // CHECK-NEXT: mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> - mpi.allreduce(%ref, %ref, MPI_SUM) : memref<100xf32>, memref<100xf32> + // CHECK-NEXT: [[v6:%.*]] = mpi.allreduce([[varg0]], [[varg0]], MPI_SUM, [[v1]]) : memref<100xf32>, 
memref<100xf32> -> !mpi.retval + %err8 = mpi.allreduce(%ref, %ref, MPI_SUM, %comm) : memref<100xf32>, memref<100xf32> -> !mpi.retval - // CHECK-NEXT: mpi.allreduce(%arg0, %arg0, MPI_SUM) : memref<100xf32>, memref<100xf32> -> !mpi.retval - %err8 = mpi.allreduce(%ref, %ref, MPI_SUM) : memref<100xf32>, memref<100xf32> -> !mpi.retval + // CHECK-NEXT: mpi.allreduce([[varg0]], [[varg0]], MPI_SUM, [[v1]]) : memref<100xf32>, memref<100xf32> + mpi.allreduce(%ref, %ref, MPI_SUM, %comm) : memref<100xf32>, memref<100xf32> - // CHECK-NEXT: %7 = mpi.finalize : !mpi.retval + // CHECK-NEXT: [[v7:%.*]] = mpi.finalize : !mpi.retval %rval = mpi.finalize : !mpi.retval - // CHECK-NEXT: %8 = mpi.retval_check %retval = : i1 + // CHECK-NEXT: [[v8:%.*]] = mpi.retval_check [[vretval:%.*]] = : i1 %res = mpi.retval_check %retval = : i1 - // CHECK-NEXT: %9 = mpi.error_class %0 : !mpi.retval + // CHECK-NEXT: [[v9:%.*]] = mpi.error_class [[v0]] : !mpi.retval %errclass = mpi.error_class %err : !mpi.retval // CHECK-NEXT: return From 7267dbfe1032f5ebd698403848fab4bbfcbe0b19 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 1 Apr 2025 09:00:46 +0200 Subject: [PATCH 0198/1029] [clang][bytecode] Fix comparing the addresses of union members (#133852) Union members get the same address, so we can't just use `Pointer::getByteOffset()`. --- clang/lib/AST/ByteCode/Interp.h | 11 ++++++++- clang/lib/AST/ByteCode/Pointer.cpp | 33 ++++++++++++++++++++++++++ clang/lib/AST/ByteCode/Pointer.h | 4 +++- clang/test/AST/ByteCode/unions.cpp | 38 ++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 938077a9f10ae..6fe1d4b1f95ae 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1070,9 +1070,18 @@ inline bool CmpHelperEQ(InterpState &S, CodePtr OpPC, CompareFn Fn) { } if (Pointer::hasSameBase(LHS, RHS)) { + if (LHS.inUnion() && RHS.inUnion()) { + // If the pointers point into a union, things are a little more + // complicated since the offset we save in interp::Pointer can't be used + // to compare the pointers directly. + size_t A = LHS.computeOffsetForComparison(); + size_t B = RHS.computeOffsetForComparison(); + S.Stk.push(BoolT::from(Fn(Compare(A, B)))); + return true; + } + unsigned VL = LHS.getByteOffset(); unsigned VR = RHS.getByteOffset(); - // In our Pointer class, a pointer to an array and a pointer to the first // element in the same array are NOT equal. They have the same Base value, // but a different Offset. This is a pretty rare case, so we fix this here diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index 79b47c26992ae..5b522610a22f1 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -339,6 +339,39 @@ void Pointer::print(llvm::raw_ostream &OS) const { } } +/// Compute an integer that can be used to compare this pointer to +/// another one. +size_t Pointer::computeOffsetForComparison() const { + if (!isBlockPointer()) + return Offset; + + size_t Result = 0; + Pointer P = *this; + while (!P.isRoot()) { + if (P.isArrayRoot()) { + P = P.getBase(); + continue; + } + if (P.isArrayElement()) { + P = P.expand(); + Result += (P.getIndex() * P.elemSize()); + P = P.getArray(); + continue; + } + + if (const Record *R = P.getBase().getRecord(); R && R->isUnion()) { + // Direct child of a union - all have offset 0. 
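// (Editorial illustration, not part of the patch: given `union { int a; int c; } U;`, both members live at offset 0, so the walk steps to the union parent without accumulating anything; this is what lets `&U.a == &U.c` hold in the unions.cpp tests below, while non-union fields still accumulate distinct offsets via `Result += P.getInlineDesc()->Offset`.)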
+ P = P.getBase(); + continue; + } + + Result += P.getInlineDesc()->Offset; + P = P.getBase(); + } + + return Result; +} + std::string Pointer::toDiagnosticString(const ASTContext &Ctx) const { if (isZero()) return "nullptr"; diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index fd33ee9955f55..988237d39fff4 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -417,7 +417,7 @@ class Pointer { return false; } bool inUnion() const { - if (isBlockPointer()) + if (isBlockPointer() && asBlockPointer().Base >= sizeof(InlineDescriptor)) return getInlineDesc()->InUnion; return false; }; @@ -727,6 +727,8 @@ class Pointer { /// Prints the pointer. void print(llvm::raw_ostream &OS) const; + size_t computeOffsetForComparison() const; + private: friend class Block; friend class DeadBlock; diff --git a/clang/test/AST/ByteCode/unions.cpp b/clang/test/AST/ByteCode/unions.cpp index 66b8389606b85..3911a2b2f7dde 100644 --- a/clang/test/AST/ByteCode/unions.cpp +++ b/clang/test/AST/ByteCode/unions.cpp @@ -600,3 +600,41 @@ namespace MoveOrAssignOp { static_assert(foo()); } #endif + +namespace AddressComparison { + union { + int a; + int c; + } U; + static_assert(__builtin_addressof(U.a) == (void*)__builtin_addressof(U.c)); + static_assert(&U.a == &U.c); + + + struct { + union { + struct { + int a; + int b; + } a; + struct { + int b; + int a; + }b; + } u; + int b; + } S; + + static_assert(&S.u.a.a == &S.u.b.b); + static_assert(&S.u.a.b != &S.u.b.b); + static_assert(&S.u.a.b == &S.u.b.b); // both-error {{failed}} + + + union { + int a[2]; + int b[2]; + } U2; + + static_assert(&U2.a[0] == &U2.b[0]); + static_assert(&U2.a[0] != &U2.b[1]); + static_assert(&U2.a[0] == &U2.b[1]); // both-error {{failed}} +} From 36978fadb8e14c944b71fa63b876012cb2c444c2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 1 Apr 2025 00:06:04 -0700 Subject: [PATCH 0199/1029] [MC] Add UseAtForSpecifier Some ELF targets don't use @ for relocation specifiers. We should not report `error: invalid variant` when @ is used. Attempt to make expr@specifier parsing less hacky. --- llvm/include/llvm/MC/MCAsmInfo.h | 12 +++++++---- llvm/include/llvm/MC/MCParser/MCAsmLexer.h | 1 + llvm/lib/MC/MCExpr.cpp | 2 +- llvm/lib/MC/MCParser/AsmLexer.cpp | 5 ++++- llvm/lib/MC/MCParser/AsmParser.cpp | 13 ++++++------ llvm/lib/MC/MCParser/ELFAsmParser.cpp | 8 ++++++++ .../Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 6 ++++-- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 20 +++++++++++-------- .../RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp | 1 + llvm/test/MC/RISCV/pseudo-jump-invalid.s | 3 ++- llvm/test/MC/RISCV/rv32i-aliases-invalid.s | 2 +- llvm/test/MC/RISCV/rv64i-aliases-invalid.s | 2 +- 12 files changed, 50 insertions(+), 25 deletions(-) diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 3134ee02f54be..6714abac5c726 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -377,9 +377,12 @@ class MCAsmInfo { /// names in .cfi_* directives. Defaults to false. bool DwarfRegNumForCFI = false; - /// True if target uses parens to indicate the symbol variant instead of @. - /// For example, foo(plt) instead of foo@plt. Defaults to false. - bool UseParensForSymbolVariant = false; + /// True if target uses @ (expr@specifier) for relocation specifiers. + bool UseAtForSpecifier = true; + + /// (ARM-specific) Uses parens for relocation specifier in data + /// directives, e.g. .word foo(got). 
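/// (Editorial note, not part of the patch: targets that keep UseAtForSpecifier write `foo@plt`, while ARM writes `foo(plt)`; the ARM ELF and COFF-GNU AsmInfo constructors below set UseAtForSpecifier = false and UseParensForSpecifier = true accordingly.)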
+ bool UseParensForSpecifier = false; /// True if the target uses parens for symbol names starting with /// '$' character to distinguish them from absolute names. @@ -649,7 +652,8 @@ class MCAsmInfo { bool doDwarfFDESymbolsUseAbsDiff() const { return DwarfFDESymbolsUseAbsDiff; } bool useDwarfRegNumForCFI() const { return DwarfRegNumForCFI; } - bool useParensForSymbolVariant() const { return UseParensForSymbolVariant; } + bool useAtForSpecifier() const { return UseAtForSpecifier; } + bool useParensForSpecifier() const { return UseParensForSpecifier; } bool supportsExtendedDwarfLocDirective() const { return SupportsExtendedDwarfLocDirective; } diff --git a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h index 9affb1f980bb0..61b89b9a103f4 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h @@ -15,6 +15,7 @@ #include #include #include +#include namespace llvm { diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index fa5c3dab1f115..773df74291064 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -93,7 +93,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, if (Kind != MCSymbolRefExpr::VK_None) { if (!MAI) // should only be used by dump() OS << "@'; - else if (MAI->useParensForSymbolVariant()) // ARM + else if (MAI->useParensForSpecifier()) // ARM OS << '(' << MAI->getSpecifierName(Kind) << ')'; else OS << '@' << MAI->getSpecifierName(Kind); diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index 23836438027c0..8715f94d51fe5 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -32,7 +32,10 @@ using namespace llvm; AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { - AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@"); + // For COFF targets, this is true, while for ELF targets, it should be false. + // Currently, @specifier parsing depends on '@' being included in the token. + AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") && + MAI.useAtForSpecifier(); LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 65a38009a8488..17417f292e053 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -1191,9 +1191,9 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, return false; } } - // Parse symbol variant + // Parse an optional relocation specifier. std::pair Split; - if (!MAI.useParensForSymbolVariant()) { + if (MAI.useAtForSpecifier()) { if (FirstTokenKind == AsmToken::String) { if (Lexer.is(AsmToken::At)) { Lex(); // eat @ @@ -1207,8 +1207,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, } else { Split = Identifier.split('@'); } - } else if (Lexer.is(AsmToken::LParen)) { - Lex(); // eat '('. 
+ } else if (MAI.useParensForSpecifier() && + parseOptionalToken(AsmToken::LParen)) { StringRef VName; parseIdentifier(VName); if (parseRParen()) @@ -1231,7 +1231,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, if (MaybeVariant) { SymbolName = Split.first; Variant = MCSymbolRefExpr::VariantKind(*MaybeVariant); - } else if (MAI.doesAllowAtInName() && !MAI.useParensForSymbolVariant()) { + } else if (MAI.doesAllowAtInName()) { Variant = MCSymbolRefExpr::VK_None; } else { return Error(SMLoc::getFromPointer(Split.second.begin()), @@ -1463,7 +1463,8 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) { // As a special case, we support 'a op b @ modifier' by rewriting the // expression to include the modifier. This is inefficient, but in general we // expect users to use 'a@modifier op b'. - if (parseOptionalToken(AsmToken::At)) { + if (Ctx.getAsmInfo()->useAtForSpecifier() && + parseOptionalToken(AsmToken::At)) { if (Lexer.isNot(AsmToken::Identifier)) return TokError("unexpected symbol modifier following '@'"); diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index c94ddfa087fd3..70550d269002b 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" @@ -741,6 +742,13 @@ bool ELFAsmParser::parseDirectiveType(StringRef, SMLoc) { // Handle the identifier as the key symbol. MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + bool AllowAt = getLexer().getAllowAtInIdentifier(); + if (!AllowAt && + !getContext().getAsmInfo()->getCommentString().starts_with("@")) + getLexer().setAllowAtInIdentifier(true); + auto _ = + make_scope_exit([&]() { getLexer().setAllowAtInIdentifier(AllowAt); }); + // NOTE the comma is optional in all cases. It is only documented as being // optional in the first case, however, GAS will silently treat the comma as // optional in all cases. 
Furthermore, although the documentation states that diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index f38b73a784632..789f7ec09d759 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -101,7 +101,8 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) { } // foo(plt) instead of foo@plt - UseParensForSymbolVariant = true; + UseAtForSpecifier = false; + UseParensForSpecifier = true; initializeVariantKinds(variantKindDescs); } @@ -148,7 +149,8 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() { SupportsDebugInformation = true; ExceptionsType = ExceptionHandling::WinEH; WinEHEncodingType = WinEH::EncodingType::Itanium; - UseParensForSymbolVariant = true; + UseAtForSpecifier = false; + UseParensForSpecifier = true; DwarfRegNumForCFI = false; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index c1670326143e3..d65eaac3716a1 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2079,19 +2079,23 @@ ParseStatus RISCVAsmParser::parseCallSymbol(OperandVector &Operands) { if (getLexer().getKind() != AsmToken::Identifier) return ParseStatus::NoMatch; + std::string Identifier(getTok().getIdentifier()); - // Avoid parsing the register in `call rd, foo` as a call symbol. - if (getLexer().peekTok().getKind() != AsmToken::EndOfStatement) + if (getLexer().peekTok().is(AsmToken::At)) { + Lex(); + Lex(); + StringRef PLT; + if (getParser().parseIdentifier(PLT) || PLT != "plt") + return ParseStatus::Failure; + } else if (!getLexer().peekTok().is(AsmToken::EndOfStatement)) { + // Avoid parsing the register in `call rd, foo` as a call symbol. 
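      // For illustration (a sketch based on the tests updated below):
      //   call foo@plt   -> accepted; specifier pinned to RISCVMCExpr::VK_CALL_PLT
      //   call foo@xxx   -> ParseStatus::Failure, since only "plt" is recognized
      //   call rd, foo   -> NoMatch here, so `rd` still parses as a register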
return ParseStatus::NoMatch; - - StringRef Identifier; - if (getParser().parseIdentifier(Identifier)) - return ParseStatus::Failure; + } else { + Lex(); + } SMLoc E = SMLoc::getFromPointer(S.getPointer() + Identifier.size()); - RISCVMCExpr::Specifier Kind = RISCVMCExpr::VK_CALL_PLT; - (void)Identifier.consume_back("@plt"); MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); Res = MCSymbolRefExpr::create(Sym, getContext()); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp index 7e9b312d3c25e..d1e8ec9d6b54a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCAsmInfo.cpp @@ -26,6 +26,7 @@ RISCVMCAsmInfo::RISCVMCAsmInfo(const Triple &TT) { AlignmentIsInBytes = false; SupportsDebugInformation = true; ExceptionsType = ExceptionHandling::DwarfCFI; + UseAtForSpecifier = false; Data16bitsDirective = "\t.half\t"; Data32bitsDirective = "\t.word\t"; } diff --git a/llvm/test/MC/RISCV/pseudo-jump-invalid.s b/llvm/test/MC/RISCV/pseudo-jump-invalid.s index 834b5a186b007..18640b6617ea3 100644 --- a/llvm/test/MC/RISCV/pseudo-jump-invalid.s +++ b/llvm/test/MC/RISCV/pseudo-jump-invalid.s @@ -1,5 +1,6 @@ # RUN: not llvm-mc -triple riscv32 < %s 2>&1 | FileCheck %s jump 1234, x31 # CHECK: :[[@LINE]]:6: error: operand must be a valid jump target -jump foo@plt, x31 # CHECK: :[[@LINE]]:10: error: invalid variant 'plt' +jump foo@plt, x31 # CHECK: :[[@LINE]]:9: error: unexpected token jump %pcrel_lo(1234), x31 # CHECK: :[[@LINE]]:6: error: unknown token in expression +jump foo@xxx # CHECK: :[[@LINE]]:9: error: unexpected token diff --git a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s index 7f54fe720ea48..63bc1fa09a4a2 100644 --- a/llvm/test/MC/RISCV/rv32i-aliases-invalid.s +++ b/llvm/test/MC/RISCV/rv32i-aliases-invalid.s @@ -32,7 +32,7 @@ lla x1, %hi(1234) # CHECK: :[[@LINE]]:9: error: operand either must be a bare sy lla x1, %lo(1234) # CHECK: :[[@LINE]]:9: error: operand either must be a bare symbol name or an immediate integer in the range [-2147483648, 4294967295] lla x1, %hi(foo) # CHECK: :[[@LINE]]:9: error: operand either must be a bare symbol name or an immediate integer in the range [-2147483648, 4294967295] lla x1, %lo(foo) # CHECK: :[[@LINE]]:9: error: operand either must be a bare symbol name or an immediate integer in the range [-2147483648, 4294967295] -lla a2, foo@plt # CHECK: :[[@LINE]]:17: error: '@plt' operand not valid for instruction +lla a2, foo@plt # CHECK: :[[@LINE]]:12: error: unexpected token negw x1, x2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} sext.w x3, x4 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} diff --git a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s index 1bd4e78007c83..cc35346cb8801 100644 --- a/llvm/test/MC/RISCV/rv64i-aliases-invalid.s +++ b/llvm/test/MC/RISCV/rv64i-aliases-invalid.s @@ -26,7 +26,7 @@ lla x1, %lo(1234) # CHECK: :[[@LINE]]:9: error: operand either must be a constan lla x1, %hi(foo) # CHECK: :[[@LINE]]:9: error: operand either must be a constant 64-bit integer or a bare symbol name lla x1, %lo(foo) # CHECK: :[[@LINE]]:9: error: operand either must be a constant 64-bit integer or a bare symbol name lla a1, foo+foo # CHECK: :[[@LINE]]:9: error: operand either must be a constant 64-bit integer or a bare symbol name -lla a2, foo@plt # 
CHECK: :[[@LINE]]:17: error: '@plt' operand not valid for instruction +lla a2, foo@plt # CHECK: :[[@LINE]]:12: error: unexpected token rdinstreth x29 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV32I Base Instruction Set{{$}} rdcycleh x27 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV32I Base Instruction Set{{$}} From 6b647de031a7d590663a791a503525f21cb98d03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ningning=20Shi=28=E5=8F=B2=E5=AE=81=E5=AE=81=29?= Date: Tue, 1 Apr 2025 15:23:34 +0800 Subject: [PATCH 0200/1029] [NFC] Remove the unused hasMinSize() (#133838) The 'hasOptSize()' is 'hasFnAttribute(Attribute::OptimizeForSize) || hasMinSize()', so we don't need another 'hasMinSize()'. --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++---- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 2 +- .../Transforms/Instrumentation/PGOForceFunctionAttrs.cpp | 2 +- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 +-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b3335d8710a65..e0be0d83f7513 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16947,8 +16947,7 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( // transform unless the conversion is in a loop block guaranteed to execute // and we are not optimizing for size. Function *F = I->getParent()->getParent(); - if (!L || L->getHeader() != I->getParent() || F->hasMinSize() || - F->hasOptSize()) + if (!L || L->getHeader() != I->getParent() || F->hasOptSize()) return false; auto *SrcTy = dyn_cast(I->getOperand(0)->getType()); @@ -28524,8 +28523,7 @@ bool AArch64TargetLowering::shouldLocalize( if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64) break; auto APF = MI.getOperand(1).getFPImm()->getValueAPF(); - bool OptForSize = - MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize(); + bool OptForSize = MF.getFunction().hasOptSize(); if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()), OptForSize)) return true; // Constant should be cheap. diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 56b1639143d8b..e88f33d6859ec 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -1332,7 +1332,7 @@ bool RISCVLegalizerInfo::legalizeCustom( const Function &F = MF.getFunction(); // TODO: if PSI and BFI are present, add " || // llvm::shouldOptForSize(*CurMBB, PSI, BFI)". - bool ShouldOptForSize = F.hasOptSize() || F.hasMinSize(); + bool ShouldOptForSize = F.hasOptSize(); const ConstantInt *ConstVal = MI.getOperand(1).getCImm(); if (!shouldBeInConstantPool(ConstVal->getValue(), ShouldOptForSize)) return true; diff --git a/llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp b/llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp index 450c191a896da..8f5540de6c16b 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOForceFunctionAttrs.cpp @@ -20,7 +20,7 @@ static bool shouldRunOnFunction(Function &F, ProfileSummaryInfo &PSI, if (F.isDeclaration()) return false; // Respect existing attributes. 
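  // Why the extra check was redundant, paraphrasing the commit message's
  // reading of Function.h (a sketch, not the verbatim header):
  //   bool hasOptSize() const {
  //     return hasFnAttribute(Attribute::OptimizeForSize) || hasMinSize();
  //   }
  // Hence `F.hasOptSize() || F.hasMinSize()` always equals `F.hasOptSize()`.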
- if (F.hasOptNone() || F.hasOptSize() || F.hasMinSize()) + if (F.hasOptNone() || F.hasOptSize()) return false; if (F.hasFnAttribute(Attribute::Cold)) return true; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c3520dc95f8b4..55cc801e91452 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4729,8 +4729,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } - if (OrigLoop->getHeader()->getParent()->hasOptSize() || - OrigLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; From 5ff8c036063d83c6eff495de7709b12875113d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Csan=C3=A1d=20Hajd=C3=BA?= Date: Tue, 1 Apr 2025 09:35:27 +0200 Subject: [PATCH 0201/1029] [AArch64] Bugfix when using execute-only and memtag sanitizer together (#133084) Support for execute-only code generation (#125687) introduced a bug in the case where the memtag sanitizer is used in a module containing a mix of execute-only and non-execute-only functions. The bug is caused by using `return` instead of `break` to short-circuit a loop, which meant that the rest of the function dealing with memtag sanitizer logic wasn't run. --- .../MCTargetDesc/AArch64ELFStreamer.cpp | 16 +++++++++++----- llvm/test/MC/AArch64/execute-only-memtag.ll | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 llvm/test/MC/AArch64/execute-only-memtag.ll diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 98bd102d8f4c1..b12a12436db81 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -511,11 +511,17 @@ void AArch64TargetELFStreamer::finish() { })) { auto *Text = static_cast(Ctx.getObjectFileInfo()->getTextSection()); - for (auto &F : *Text) - if (auto *DF = dyn_cast(&F)) - if (!DF->getContents().empty()) - return; - Text->setFlags(Text->getFlags() | ELF::SHF_AARCH64_PURECODE); + bool Empty = true; + for (auto &F : *Text) { + if (auto *DF = dyn_cast(&F)) { + if (!DF->getContents().empty()) { + Empty = false; + break; + } + } + } + if (Empty) + Text->setFlags(Text->getFlags() | ELF::SHF_AARCH64_PURECODE); } MCSectionELF *MemtagSec = nullptr; diff --git a/llvm/test/MC/AArch64/execute-only-memtag.ll b/llvm/test/MC/AArch64/execute-only-memtag.ll new file mode 100644 index 0000000000000..02daf3179101f --- /dev/null +++ b/llvm/test/MC/AArch64/execute-only-memtag.ll @@ -0,0 +1,18 @@ +; RUN: llc %s -mtriple=aarch64-linux-android31 -filetype=obj -o %t.o +; RUN: llvm-readelf -r %t.o | FileCheck %s + +; CHECK: Relocation section '.rela.memtag.globals.static' at offset {{.*}} contains 1 entries: +; CHECK-NEXT: Type {{.*}} Symbol's Name +; CHECK-NEXT: R_AARCH64_NONE {{.*}} global + +@global = global i32 1, sanitize_memtag + +define void @foo() { + ret void +} + +define void @bar() #0 { + ret void +} + +attributes #0 = { "target-features"="+execute-only" } From c5afcfe0bb44067b2cd050ed9cff311eada9cc37 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 1 Apr 2025 08:39:29 +0100 Subject: [PATCH 0202/1029] [X86] combineINSERT_SUBVECTOR - fold insert_subvector(base,extract_subvector(broadcast)) -> blend shuffle(base,broadcast) 
(REAPPLIED) (#133724) If the broadcast is already the full vector width, try to prefer a blend/vshuff64x2 over a vector insertion which is usually lower latency (and sometimes a lower uop count), and reduces changes in vector sizes that can interfere with further combines. Updated version of #133083 - which lead to infinite loops due to shuffle lowering recreating the INSERT_SUBVECTOR pattern, this variant creates the BLENDI/SHUF128 nodes directly. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++++++++++++++++++ .../any_extend_vector_inreg_of_broadcast.ll | 24 +++++++++---------- ...d_vector_inreg_of_broadcast_from_memory.ll | 16 ++++++------- .../CodeGen/X86/insert-subvector-broadcast.ll | 2 +- .../vector-interleaved-store-i32-stride-5.ll | 2 +- 5 files changed, 46 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 76de7e888d985..5fff78f7a173a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -58896,6 +58896,30 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; return DAG.getVectorShuffle(OpVT, dl, Vec, ExtSrc, Mask); } + // If we're broadcasting, see if we can use a blend instead of + // extract/insert pair. Ensure that the subvector is aligned with the + // insertion/extractions. + if ((ExtIdxVal % SubVecNumElts) == 0 && (IdxVal % SubVecNumElts) == 0 && + (ExtSrc.getOpcode() == X86ISD::VBROADCAST || + ExtSrc.getOpcode() == X86ISD::VBROADCAST_LOAD || + (ExtSrc.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && + cast(ExtSrc)->getMemoryVT() == SubVecVT))) { + if (OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + uint64_t BlendMask = IdxVal == 0 ? 0x0F : 0xF0; + SDValue Blend = DAG.getNode( + X86ISD::BLENDI, dl, MVT::v8f32, DAG.getBitcast(MVT::v8f32, Vec), + DAG.getBitcast(MVT::v8f32, ExtSrc), + DAG.getTargetConstant(BlendMask, dl, MVT::i8)); + return DAG.getBitcast(OpVT, Blend); + } else if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) { + SDValue Lo = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? ExtSrc : Vec); + SDValue Hi = DAG.getBitcast(MVT::v8f64, IdxVal == 0 ? Vec : ExtSrc); + SDValue Shuffle = + DAG.getNode(X86ISD::SHUF128, dl, MVT::v8f64, Lo, Hi, + getV4X86ShuffleImm8ForMask({0, 1, 2, 3}, dl, DAG)); + return DAG.getBitcast(OpVT, Shuffle); + } + } } // Match concat_vector style patterns. 
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 8125e062e7ffd..6f4e7abda8b00 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -2573,7 +2573,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -2590,7 +2590,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,15,3,4,15,6,7,15,9,10,15,12,13,15] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -2835,7 +2835,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -2852,7 +2852,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -2868,7 +2868,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3096,7 +3096,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3113,7 +3113,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3129,7 +3129,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3612,7 +3612,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3628,7 +3628,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3864,7 +3864,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -3880,7 +3880,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 305509ca7fc3f..52f856befa130 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -2239,7 +2239,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) @@ -2253,7 +2253,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) @@ -2267,7 +2267,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -2458,7 +2458,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa %ymm1, 32(%rdx) @@ -2472,7 +2472,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rdx) @@ -2486,7 +2486,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,15,12,13,14] ; AVX512BW-NEXT: vpbroadcastb (%rdi), %ymm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3095,7 +3095,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx) @@ -3107,7 +3107,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx) diff --git a/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll index 47cd752ef80a4..9b35857804022 100644 --- a/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/insert-subvector-broadcast.ll @@ -9,7 +9,7 @@ define void @insert_subvector_broadcast_as_blend() { ; CHECK-NEXT: vpbroadcastq %rax, %zmm0 ; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm1 +; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k0 ; CHECK-NEXT: vpcmpltq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k1 ; CHECK-NEXT: kunpckbw %k0, %k1, %k1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index e88a651d29cef..89330122fa239 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -350,7 +350,7 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6],ymm5[7] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = mem[0,1,0,1] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 +; 
AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3],ymm7[4],ymm4[5,6,7] ; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,u,2,u,u,u,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7] From 64d493f987dc24b3d7e45daade9b0e8bfa1cc471 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 1 Apr 2025 08:45:46 +0100 Subject: [PATCH 0203/1029] [EquivalenceClasses] Return ECValue directly from insert (NFC). Removes a redundant lookup in the mapping.: --- .../FlowSensitive/SimplifyConstraints.cpp | 2 +- llvm/include/llvm/ADT/EquivalenceClasses.h | 23 ++++++++++++------- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 2 +- llvm/lib/Transforms/Utils/SplitModule.cpp | 2 +- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp b/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp index 02ec0d0213300..bbd73ef9b2f03 100644 --- a/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp +++ b/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp @@ -155,7 +155,7 @@ void simplifyConstraints(llvm::SetVector &Constraints, It != End; ++It) { if (!It->isLeader()) continue; - Atom At = *EquivalentAtoms.findLeader(It); + Atom At = *EquivalentAtoms.findLeader(*It); if (TrueAtoms.contains(At) || FalseAtoms.contains(At)) continue; llvm::SmallVector Atoms = diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index f9c7819f18806..be46a4445c3f2 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -175,6 +175,11 @@ class EquivalenceClasses { // Only leaders provide anything to iterate over. return member_iterator(I->isLeader() ? &*I : nullptr); } + member_iterator member_begin(const ECValue &ECV) const { + // Only leaders provide anything to iterate over. + return member_iterator(ECV.getLeader()); + } + member_iterator member_end() const { return member_iterator(nullptr); } @@ -216,26 +221,28 @@ class EquivalenceClasses { /// insert - Insert a new value into the union/find set, ignoring the request /// if the value already exists. - iterator insert(const ElemTy &Data) { - return TheMapping.insert(ECValue(Data)).first; + const ECValue &insert(const ElemTy &Data) { + return *TheMapping.insert(ECValue(Data)).first; } /// findLeader - Given a value in the set, return a member iterator for the /// equivalence class it is in. This does the path-compression part that /// makes union-find "union findy". This returns an end iterator if the value /// is not in the equivalence class. - member_iterator findLeader(iterator I) const { - if (I == TheMapping.end()) return member_end(); - return member_iterator(I->getLeader()); - } member_iterator findLeader(const ElemTy &V) const { - return findLeader(TheMapping.find(V)); + auto I = TheMapping.find(V); + if (I == TheMapping.end()) + return member_iterator(nullptr); + return findLeader(*I); + } + member_iterator findLeader(const ECValue &ECV) const { + return member_iterator(ECV.getLeader()); } /// union - Merge the two equivalence sets for the specified values, inserting /// them if they do not already exist in the equivalence set. 
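  /// A usage sketch under the new API (illustrative only):
  ///   EquivalenceClasses<int> EC;
  ///   const auto &A = EC.insert(1);  // insert() now hands back the ECValue
  ///   const auto &B = EC.insert(2);
  ///   EC.unionSets(EC.findLeader(A), EC.findLeader(B)); // no re-lookup of 1, 2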
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2) { - iterator V1I = insert(V1), V2I = insert(V2); + const ECValue &V1I = insert(V1), &V2I = insert(V2); return unionSets(findLeader(V1I), findLeader(V2I)); } member_iterator unionSets(member_iterator L1, member_iterator L2) { diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 6cc5d1dc78d99..299b3a9162389 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -2239,7 +2239,7 @@ bool LowerTypeTestsModule::lower() { auto Ins = TypeIdUsers.insert({TypeId, {}}); if (Ins.second) { // Add the type identifier to the equivalence class. - GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId); + auto &GCI = GlobalClasses.insert(TypeId); GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI); // Add the referenced globals to the type identifier's equivalence class. diff --git a/llvm/lib/Transforms/Utils/SplitModule.cpp b/llvm/lib/Transforms/Utils/SplitModule.cpp index 4f174fbe48b75..07956f3a191c4 100644 --- a/llvm/lib/Transforms/Utils/SplitModule.cpp +++ b/llvm/lib/Transforms/Utils/SplitModule.cpp @@ -203,7 +203,7 @@ static void findPartitions(Module &M, ClusterIDMapType &ClusterIDMap, << "\n"); for (ClusterMapType::member_iterator MI = - GVtoClusterMap.findLeader(I.second); + GVtoClusterMap.findLeader(*I.second); MI != GVtoClusterMap.member_end(); ++MI) { if (!Visited.insert(*MI).second) continue; From 14c50986405731030b001d85cc8cd4a2c9f86388 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 14:48:14 +0700 Subject: [PATCH 0204/1029] llvm-reduce: Use takeName when moving arguments in operands-to-args (#133851) --- llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp index 3548130d3276a..4bae588d60c14 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp @@ -155,8 +155,8 @@ static void substituteOperandWithArgument(Function *OldF, Argument &OldArg = std::get<0>(Z); Argument &NewArg = std::get<1>(Z); - NewArg.setName(OldArg.getName()); // Copy the name over... - VMap[&OldArg] = &NewArg; // Add mapping to VMap + NewArg.takeName(&OldArg); // Copy the name over... 
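    // The behavioral difference, for illustration: setName(OldArg.getName())
    // copies the string and leaves OldArg named, while Value::takeName
    // transfers it, leaving OldArg unnamed and NewArg with the original name.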
+ VMap[&OldArg] = &NewArg; // Add mapping to VMap } LLVMContext &Ctx = OldF->getContext(); From fd0785e67ca66fb7f59ae504b637285d27d4e5c9 Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Tue, 1 Apr 2025 00:49:05 -0700 Subject: [PATCH 0205/1029] [mlir][tosa] Reorder ERF op to align with TOSA spec (#133814) Minor non-functional change of the dialect to better align with the operator order from the TOSA specification: https://www.mlplatform.org/tosa/tosa_spec.html Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 52 ++++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 1ba44db02f25f..75f167afd9dd0 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -499,21 +499,16 @@ def Tosa_ClampOp : Tosa_ElementwiseUnaryOp<"clamp"> { } //===----------------------------------------------------------------------===// -// Operator: sigmoid +// Operator: erf //===----------------------------------------------------------------------===// -def Tosa_SigmoidOp : Tosa_ElementwiseUnaryOp<"sigmoid"> { - let summary = "Computes elementwise sigmoid of input."; +def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf"> { + let summary = "Computes gauss error function of input."; let description = [{ - Applies the sigmoid logistic function to each element of the input tensor: - $ sigmoid(x) = \frac{1}{1 + e^{-x}} $. - - For quantized integer data types, the TABLE operator should be used instead. - Each implementation may choose an appropriate TABLE given the scale and zero - point of the input data. Eight or sixteen bit precision tables may be used - based on the input tensor to the sigmoid function. The sigmoid table has 513 - entries each of 16-bit precision and covering the input range -16.0 to +16.0 - in steps of 1/16. + Gauss error function: $ erf(x) = \frac{2}{\sqrt{\pi}} \int_{0}^{x} e^{-t^2} dt $ + For quantized integer data types, the TABLE operator should be used instead + with the following definition. The ERF table has 513 entries each of + 16-bit precision and covering the input range -4.0 to +4.0 in steps of 1/64. }]; let arguments = (ins @@ -528,21 +523,26 @@ def Tosa_SigmoidOp : Tosa_ElementwiseUnaryOp<"sigmoid"> { Profile<[Tosa_PRO_FP]>, Extension<[Tosa_EXT_BF16]>, ]; + + let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; } //===----------------------------------------------------------------------===// -// Operator: tanh +// Operator: sigmoid //===----------------------------------------------------------------------===// -def Tosa_TanhOp : Tosa_ElementwiseUnaryOp<"tanh"> { - let summary = "Computes elementwise hyperbolic tangent of input."; +def Tosa_SigmoidOp : Tosa_ElementwiseUnaryOp<"sigmoid"> { + let summary = "Computes elementwise sigmoid of input."; let description = [{ - Parameterized hyperbolic tangent: $ tanh(x) = \frac{1 - e^{-2x}}{1 + e^{-2x}} $. + Applies the sigmoid logistic function to each element of the input tensor: + $ sigmoid(x) = \frac{1}{1 + e^{-x}} $. For quantized integer data types, the TABLE operator should be used instead. Each implementation may choose an appropriate TABLE given the scale and zero point of the input data. Eight or sixteen bit precision tables may be used - based on the input tensor to the tanh function. + based on the input tensor to the sigmoid function. 
The sigmoid table has 513 + entries each of 16-bit precision and covering the input range -16.0 to +16.0 + in steps of 1/16. }]; let arguments = (ins @@ -560,16 +560,18 @@ def Tosa_TanhOp : Tosa_ElementwiseUnaryOp<"tanh"> { } //===----------------------------------------------------------------------===// -// Operator: erf +// Operator: tanh //===----------------------------------------------------------------------===// -def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf"> { - let summary = "Computes gauss error function of input."; +def Tosa_TanhOp : Tosa_ElementwiseUnaryOp<"tanh"> { + let summary = "Computes elementwise hyperbolic tangent of input."; let description = [{ - Gauss error function: $ erf(x) = \frac{2}{\sqrt{\pi}} \int_{0}^{x} e^{-t^2} dt $ - For quantized integer data types, the TABLE operator should be used instead - with the following definition. The ERF table has 513 entries each of - 16-bit precision and covering the input range -4.0 to +4.0 in steps of 1/64. + Parameterized hyperbolic tangent: $ tanh(x) = \frac{1 - e^{-2x}}{1 + e^{-2x}} $. + + For quantized integer data types, the TABLE operator should be used instead. + Each implementation may choose an appropriate TABLE given the scale and zero + point of the input data. Eight or sixteen bit precision tables may be used + based on the input tensor to the tanh function. }]; let arguments = (ins @@ -584,8 +586,6 @@ def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf"> { Profile<[Tosa_PRO_FP]>, Extension<[Tosa_EXT_BF16]>, ]; - - let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; } //===----------------------------------------------------------------------===// From 7a2b160e76e23d8fa62750af20e1e25f08803784 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 1 Apr 2025 09:19:50 +0100 Subject: [PATCH 0206/1029] [libclc] Move rootn to the CLC library; optimize (#133735) The function was already nominally in the CLC namespace; this commit just moves it over. This commit also vectorizes the builtin to avoid scalarization. 
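A sketch of what avoiding scalarization means here (illustrative code; the
helper name is hypothetical): previously a vector call decomposed into one
scalar evaluation per lane, while the new clc_rootn.inc below is instantiated
once per gentype so comparisons and selects stay vector-wide:

    // Before (conceptually): one scalar __clc_rootn call per lane
    float4 rootn_scalarized(float4 x, int4 ny) {
      return (float4)(__clc_rootn(x.s0, ny.s0), __clc_rootn(x.s1, ny.s1),
                      __clc_rootn(x.s2, ny.s2), __clc_rootn(x.s3, ny.s3));
    }
    // After: a single vector body where branches become lane-wise selects,
    // e.g. expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;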
--- .../include/clc}/math/clc_rootn.h | 11 +- libclc/clc/lib/generic/SOURCES | 1 + libclc/clc/lib/generic/math/clc_rootn.cl | 21 + libclc/clc/lib/generic/math/clc_rootn.inc | 403 ++++++++++++++++++ libclc/clspv/lib/SOURCES | 1 - libclc/generic/include/clc/math/rootn.h | 6 +- libclc/generic/include/clc/math/rootn.inc | 9 - libclc/generic/include/math/clc_rootn.inc | 9 - libclc/generic/lib/SOURCES | 1 - libclc/generic/lib/math/clc_rootn.cl | 389 ----------------- libclc/generic/lib/math/rootn.cl | 6 +- libclc/generic/lib/math/rootn.inc | 11 - libclc/spirv/lib/SOURCES | 1 - 13 files changed, 443 insertions(+), 426 deletions(-) rename libclc/{generic/include => clc/include/clc}/math/clc_rootn.h (64%) create mode 100644 libclc/clc/lib/generic/math/clc_rootn.cl create mode 100644 libclc/clc/lib/generic/math/clc_rootn.inc delete mode 100644 libclc/generic/include/clc/math/rootn.inc delete mode 100644 libclc/generic/include/math/clc_rootn.inc delete mode 100644 libclc/generic/lib/math/clc_rootn.cl delete mode 100644 libclc/generic/lib/math/rootn.inc diff --git a/libclc/generic/include/math/clc_rootn.h b/libclc/clc/include/clc/math/clc_rootn.h similarity index 64% rename from libclc/generic/include/math/clc_rootn.h rename to libclc/clc/include/clc/math/clc_rootn.h index e4c9bd847302b..b1e69025d97a2 100644 --- a/libclc/generic/include/math/clc_rootn.h +++ b/libclc/clc/include/clc/math/clc_rootn.h @@ -6,6 +6,15 @@ // //===----------------------------------------------------------------------===// -#define __CLC_BODY +#ifndef __CLC_MATH_CLC_ROOTN_H__ +#define __CLC_MATH_CLC_ROOTN_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_rootn + #include + #undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_ROOTN_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index 3c648edd52a7c..6a1ba95362220 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -52,6 +52,7 @@ math/clc_powr.cl math/clc_remainder.cl math/clc_remquo.cl math/clc_rint.cl +math/clc_rootn.cl math/clc_round.cl math/clc_rsqrt.cl math/clc_sincos_helpers.cl diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl new file mode 100644 index 0000000000000..da397cf66da62 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_rootn.cl @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_rootn.inc b/libclc/clc/lib/generic/math/clc_rootn.inc new file mode 100644 index 0000000000000..4fa56bdd84abe --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_rootn.inc @@ -0,0 +1,403 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Computes pow using log and exp +// +// x^y = exp(y * log(x)) +// +// We take care not to lose precision in the intermediate steps. +// +// When computing log, calculate it in splits: +// +// r = f * (p_invead + p_inv_tail) +// r = rh + rt +// +// Calculate log polynomial using r, in end addition, do: +// +// poly = poly + ((rh-r) + rt) +// +// lth = -r +// ltt = ((xexp * log2_t) - poly) + logT +// lt = lth + ltt +// +// lh = (xexp * log2_h) + logH +// l = lh + lt +// +// Calculate final log answer as gh and gt: +// +// gh = l & higher-half bits +// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh)) +// +// yh = y & higher-half bits +// yt = y - yh +// +// Before entering computation of exp: +// +// vs = ((yt*gt + yt*gh) + yh*gt) +// v = vs + yh*gh +// vt = ((yh*gh - v) + vs) +// +// In calculation of exp, add vt to r that is used for poly. +// +// At the end of exp, do: +// +// ((((expT * poly) + expT) + expH*poly) + expH) +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, + __CLC_INTN ny) { + __CLC_GENTYPE y = MATH_RECIP(__CLC_CONVERT_GENTYPE(ny)); + + __CLC_INTN ix = __CLC_AS_INTN(x); + __CLC_INTN ax = ix & EXSIGNBIT_SP32; + __CLC_INTN xpos = ix == ax; + + __CLC_INTN iy = __CLC_AS_INTN(y); + __CLC_INTN ay = iy & EXSIGNBIT_SP32; + __CLC_INTN ypos = iy == ay; + + // Extra precise log calculation + // First handle case that x is close to 1 + __CLC_GENTYPE r = 1.0f - __CLC_AS_GENTYPE(ax); + __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f; + __CLC_GENTYPE r2 = r * r; + + // Coefficients are just 1/3, 1/4, 1/5 and 1/6 + __CLC_GENTYPE poly = __clc_mad( + r, + __clc_mad(r, + __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), + 0x1.99999ap-3f), + 0x1.000000p-2f), + 0x1.555556p-2f); + + poly *= r2 * r; + + __CLC_GENTYPE lth_near1 = -r2 * 0.5f; + __CLC_GENTYPE ltt_near1 = -poly; + __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1; + __CLC_GENTYPE lh_near1 = -r; + __CLC_GENTYPE l_near1 = lh_near1 + lt_near1; + + // Computations for x not near 1 + __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; + __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m); + __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f); + __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253); + __CLC_INTN c = m == -127; + __CLC_INTN ixn = c ? ixs : ax; + __CLC_GENTYPE mfn = c ? mfs : mf; + + __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); + + // F - Y + __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) - + __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32)); + + indx = indx >> 16; + __CLC_GENTYPE rh = f * USE_TABLE(log_inv_tbl_ep_head, indx); + __CLC_GENTYPE rt = f * USE_TABLE(log_inv_tbl_ep_tail, indx); + ; + r = rh + rt; + + poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * + (r * r); + poly += (rh - r) + rt; + + const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 + const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 + __CLC_GENTYPE lth = -r; + __CLC_GENTYPE ltt = + __clc_mad(mfn, LOG2_TAIL, -poly) + USE_TABLE(loge_tbl_hi, indx); + __CLC_GENTYPE lt = lth + ltt; + __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, USE_TABLE(loge_tbl_lo, indx)); + __CLC_GENTYPE l = lh + lt; + + // Select near 1 or not + lth = near1 ? 
lth_near1 : lth; + ltt = near1 ? ltt_near1 : ltt; + lt = near1 ? lt_near1 : lt; + lh = near1 ? lh_near1 : lh; + l = near1 ? l_near1 : l; + + __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000); + __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); + + __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000); + + __CLC_GENTYPE fny = __CLC_CONVERT_GENTYPE(ny); + __CLC_GENTYPE fnyh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(fny) & 0xfffff000); + __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh)); + __CLC_GENTYPE yt = + MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny); + + __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt)); + __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s); + __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s; + + // Extra precise exp of ylogx + const __CLC_GENTYPE R_64_BY_LOG2 = + 0x1.715476p+6f; // 64/log2 : 92.332482616893657 + __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2); + __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n); + + __CLC_INTN j = n & 0x3f; + m = n >> 6; + __CLC_INTN m2 = m << EXPSHIFTBITS_SP32; + + // log2/64 lead: 0.0108032227 + const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f; + // log2/64 tail: 0.0000272020388 + const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f; + r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) + + ylogx_t; + + // Truncated Taylor series for e^r + poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, + 0x1.000000p-1f), + r * r, r); + + __CLC_GENTYPE exph = USE_TABLE(exp_tbl_ep_head, j); + __CLC_GENTYPE expt = USE_TABLE(exp_tbl_ep_tail, j); + + __CLC_GENTYPE expylogx = + __clc_mad(exph, poly, __clc_mad(expt, poly, expt)) + exph; + __CLC_GENTYPE sexpylogx = + __clc_fp32_subnormals_supported() + ? expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149)) + : 0.0f; + + __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2); + expylogx = m < -125 ? sexpylogx : texpylogx; + + // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 + expylogx = ((ylogx > 0x1.62e430p+6f) | + (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) + ? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32) + : expylogx; + + // Result is 0 if ylogx < -149*log2 + expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; + + // Classify y: + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + __CLC_INTN inty = 2 - (ny & 1); + + __CLC_GENTYPE signval = + __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32)); + expylogx = ((inty == 1) & !xpos) ? signval : expylogx; + __CLC_INTN ret = __CLC_AS_INTN(expylogx); + + // Corner case handling + __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32; + __CLC_BIT_INTN x_is_pinf = ix == (__CLC_INTN)PINFBITPATT_SP32; + + ret = (!xpos & (inty == 2)) ? __CLC_AS_INTN((__CLC_GENTYPE)__CLC_GENTYPE_NAN) + : ret; + __CLC_INTN xinf = + xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32; + ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; + ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret; + ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret; + __CLC_INTN xzero = xpos ? 0 : (__CLC_INTN)0x80000000; + ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; + ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret; + ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_INTN)0x80000000 : ret; + ret = (x_is_pinf & !ypos) ? 0 : ret; + ret = (x_is_pinf & ypos) ? 
PINFBITPATT_SP32 : ret; + ret = ax > PINFBITPATT_SP32 ? ix : ret; + ret = ny == 0 ? __CLC_AS_INTN((__CLC_GENTYPE)__CLC_GENTYPE_NAN) : ret; + + return __CLC_AS_GENTYPE(ret); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, + __CLC_INTN ny) { + const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08; + const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01; + + __CLC_GENTYPE dny = __CLC_CONVERT_GENTYPE(ny); + __CLC_GENTYPE y = 1.0 / dny; + + __CLC_LONGN ux = __CLC_AS_LONGN(x); + __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x)); + __CLC_BIT_INTN xpos = ax == ux; + + __CLC_LONGN uy = __CLC_AS_LONGN(y); + __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y)); + __CLC_BIT_INTN ypos = ay == uy; + + // Extended precision log + __CLC_GENTYPE v, vt; + { + __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023; + __CLC_INTN mask_exp_1023 = exp == -1023; + __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp); + __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL; + + __CLC_LONGN temp_ux = + __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0); + exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; + __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp); + __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; + + xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp; + mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa; + + __CLC_LONGN rax = (mantissa & 0x000ff00000000000) + + ((mantissa & 0x0000080000000000) << 1); + __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44); + + __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L); + __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L); + __CLC_GENTYPE f = F - Y; + __CLC_GENTYPE log_h = USE_TABLE(log_f_inv_tbl_head, index); + __CLC_GENTYPE log_t = USE_TABLE(log_f_inv_tbl_tail, index); + __CLC_GENTYPE f_inv = (log_h + log_t) * f; + __CLC_GENTYPE r1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L); + __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t); + __CLC_GENTYPE r = r1 + r2; + + __CLC_GENTYPE poly = __clc_fma( + r, + __clc_fma(r, + __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), + 1.0 / 4.0), + 1.0 / 3.0); + poly = poly * r * r * r; + + __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1; + __CLC_GENTYPE poly0h = r1 + hr1r1; + __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1; + poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t; + + log_h = USE_TABLE(powlog_tbl_head, index); + log_t = USE_TABLE(powlog_tbl_tail, index); + + __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly; + __CLC_GENTYPE resT = resT_t - poly0h; + __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h); + __CLC_GENTYPE resT_h = poly0h; + + __CLC_GENTYPE H = resT + resH; + __CLC_GENTYPE H_h = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L); + __CLC_GENTYPE T = + (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); + H = H_h; + + __CLC_GENTYPE y_head = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L); + __CLC_GENTYPE y_tail = y - y_head; + + __CLC_GENTYPE fnyh = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(dny) & 0xfffffffffff00000); + __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh)); + y_tail = __clc_fma(-fnyt, y_head, __clc_fma(-fnyh, y_head, 1.0)) / dny; + + __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T)); + v = __clc_fma(y_head, H, temp); + vt = __clc_fma(y_head, H, -v) + temp; + } + + // Now calculate exp of (v,vt) + + __CLC_GENTYPE 
expv; + { + const __CLC_GENTYPE max_exp_arg = 709.782712893384; + const __CLC_GENTYPE min_exp_arg = -745.1332191019411; + const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366; + const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081; + const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10; + + __CLC_GENTYPE temp = v * sixtyfour_by_lnof2; + __CLC_INTN n = __CLC_CONVERT_INTN(temp); + __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n); + __CLC_INTN j = n & 0x0000003f; + __CLC_INTN m = n >> 6; + + __CLC_GENTYPE f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE f = f1 + f2; + + __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v); + __CLC_GENTYPE r2 = dn * lnof2_by_64_tail; + __CLC_GENTYPE r = (r1 + r2) + vt; + + __CLC_GENTYPE q = + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, 1.38889490863777199667e-03, + 8.33336798434219616221e-03), + 4.16666666662260795726e-02), + 1.66666666665260878863e-01), + 5.00000000000000008883e-01); + q = __clc_fma(r * r, q, r); + + expv = __clc_fma(f, q, f2) + f1; + expv = __clc_ldexp(expv, m); + + expv = v > max_exp_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)0x7FF0000000000000L) + : expv; + expv = v < min_exp_arg ? 0.0 : expv; + } + + // See whether y is an integer. + // inty = 0 means not an integer. + // inty = 1 means odd integer. + // inty = 2 means even integer. + + __CLC_LONGN inty = __CLC_CONVERT_LONGN(2 - (ny & 1)); + + expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; + + __CLC_LONGN ret = __CLC_AS_LONGN(expv); + + // Now all the edge cases + __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64; + __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64; + ret = (!xpos & (inty == 2)) ? __CLC_AS_LONGN((__CLC_GENTYPE)__CLC_GENTYPE_NAN) + : ret; + __CLC_LONGN xinf = + xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64; + ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; + ret = + ((ax == 0L) & !ypos & (inty == 2)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret; + ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret; + __CLC_LONGN xzero = xpos ? 0L : (__CLC_LONGN)0x8000000000000000L; + ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; + ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret; + ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L + : ret; + ret = (x_is_pinf & !ypos) ? 0L : ret; + ret = (x_is_pinf & ypos) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret; + ret = ax > (__CLC_LONGN)PINFBITPATT_DP64 ? ux : ret; + ret = __CLC_CONVERT_LONGN(ny == 0) + ? 
__CLC_AS_LONGN((__CLC_GENTYPE)__CLC_GENTYPE_NAN) + : ret; + return __CLC_AS_GENTYPE(ret); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, + __CLC_INTN y) { + return __CLC_CONVERT_GENTYPE(__clc_rootn(__CLC_CONVERT_FLOATN(x), y)); +} + +#endif diff --git a/libclc/clspv/lib/SOURCES b/libclc/clspv/lib/SOURCES index c0a84f2a554f0..1c86fb0fbc8e8 100644 --- a/libclc/clspv/lib/SOURCES +++ b/libclc/clspv/lib/SOURCES @@ -17,7 +17,6 @@ subnormal_config.cl ../../generic/lib/math/atanpi.cl ../../generic/lib/math/cbrt.cl ../../generic/lib/math/clc_exp10.cl -../../generic/lib/math/clc_rootn.cl ../../generic/lib/math/clc_tan.cl ../../generic/lib/math/clc_tanpi.cl ../../generic/lib/math/cos.cl diff --git a/libclc/generic/include/clc/math/rootn.h b/libclc/generic/include/clc/math/rootn.h index fecbec1d2d8fb..41df8cd9a8210 100644 --- a/libclc/generic/include/clc/math/rootn.h +++ b/libclc/generic/include/clc/math/rootn.h @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// -#define __CLC_BODY +#define __CLC_BODY +#define __CLC_FUNCTION rootn + #include + #undef __CLC_BODY +#undef __CLC_FUNCTION diff --git a/libclc/generic/include/clc/math/rootn.inc b/libclc/generic/include/clc/math/rootn.inc deleted file mode 100644 index 667b904a14b75..0000000000000 --- a/libclc/generic/include/clc/math/rootn.inc +++ /dev/null @@ -1,9 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE rootn(__CLC_GENTYPE a, __CLC_INTN b); diff --git a/libclc/generic/include/math/clc_rootn.inc b/libclc/generic/include/math/clc_rootn.inc deleted file mode 100644 index 391f9069c7e46..0000000000000 --- a/libclc/generic/include/math/clc_rootn.inc +++ /dev/null @@ -1,9 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE a, __CLC_INTN b); diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index 1c3aff16035c5..a93444af0c954 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -162,7 +162,6 @@ math/powr.cl math/remainder.cl math/remquo.cl math/rint.cl -math/clc_rootn.cl math/rootn.cl math/round.cl math/rsqrt.cl diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl deleted file mode 100644 index f642792e1275f..0000000000000 --- a/libclc/generic/lib/math/clc_rootn.cl +++ /dev/null @@ -1,389 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// compute pow using log and exp -// x^y = exp(y * log(x)) -// -// we take care not to lose precision in the intermediate steps -// -// When computing log, calculate it in splits, -// -// r = f * (p_invead + p_inv_tail) -// r = rh + rt -// -// calculate log polynomial using r, in end addition, do -// poly = poly + ((rh-r) + rt) -// -// lth = -r -// ltt = ((xexp * log2_t) - poly) + logT -// lt = lth + ltt -// -// lh = (xexp * log2_h) + logH -// l = lh + lt -// -// Calculate final log answer as gh and gt, -// gh = l & higher-half bits -// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh)) -// -// yh = y & higher-half bits -// yt = y - yh -// -// Before entering computation of exp, -// vs = ((yt*gt + yt*gh) + yh*gt) -// v = vs + yh*gh -// vt = ((yh*gh - v) + vs) -// -// In calculation of exp, add vt to r that is used for poly -// At the end of exp, do -// ((((expT * poly) + expT) + expH*poly) + expH) - -_CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { - float y = MATH_RECIP((float)ny); - - int ix = __clc_as_int(x); - int ax = ix & EXSIGNBIT_SP32; - int xpos = ix == ax; - - int iy = __clc_as_int(y); - int ay = iy & EXSIGNBIT_SP32; - int ypos = iy == ay; - - // Extra precise log calculation - // First handle case that x is close to 1 - float r = 1.0f - __clc_as_float(ax); - int near1 = __clc_fabs(r) < 0x1.0p-4f; - float r2 = r * r; - - // Coefficients are just 1/3, 1/4, 1/5 and 1/6 - float poly = __clc_mad( - r, - __clc_mad(r, - __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f), - 0x1.99999ap-3f), - 0x1.000000p-2f), - 0x1.555556p-2f); - - poly *= r2 * r; - - float lth_near1 = -r2 * 0.5f; - float ltt_near1 = -poly; - float lt_near1 = lth_near1 + ltt_near1; - float lh_near1 = -r; - float l_near1 = lh_near1 + lt_near1; - - // Computations for x not near 1 - int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; - float mf = (float)m; - int ixs = __clc_as_int(__clc_as_float(ax | 0x3f800000) - 1.0f); - float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); - int c = m == -127; - int ixn = c ? ixs : ax; - float mfn = c ? mfs : mf; - - int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); - - // F - Y - float f = __clc_as_float(0x3f000000 | indx) - - __clc_as_float(0x3f000000 | (ixn & MANTBITS_SP32)); - - indx = indx >> 16; - float2 tv; - tv.s0 = USE_TABLE(log_inv_tbl_ep_head, indx); - tv.s1 = USE_TABLE(log_inv_tbl_ep_tail, indx); - float rh = f * tv.s0; - float rt = f * tv.s1; - r = rh + rt; - - poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * - (r * r); - poly += (rh - r) + rt; - - const float LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 - const float LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - tv.s0 = USE_TABLE(loge_tbl_lo, indx); - tv.s1 = USE_TABLE(loge_tbl_hi, indx); - float lth = -r; - float ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + tv.s1; - float lt = lth + ltt; - float lh = __clc_mad(mfn, LOG2_HEAD, tv.s0); - float l = lh + lt; - - // Select near 1 or not - lth = near1 ? lth_near1 : lth; - ltt = near1 ? ltt_near1 : ltt; - lt = near1 ? lt_near1 : lt; - lh = near1 ? lh_near1 : lh; - l = near1 ? 
l_near1 : l; - - float gh = __clc_as_float(__clc_as_int(l) & 0xfffff000); - float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - - float yh = __clc_as_float(iy & 0xfffff000); - - float fny = (float)ny; - float fnyh = __clc_as_float(__clc_as_int(fny) & 0xfffff000); - float fnyt = (float)(ny - (int)fnyh); - float yt = MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny); - - float ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt)); - float ylogx = __clc_mad(yh, gh, ylogx_s); - float ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s; - - // Extra precise exp of ylogx - const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 - int n = __clc_convert_int(ylogx * R_64_BY_LOG2); - float nf = (float)n; - - int j = n & 0x3f; - m = n >> 6; - int m2 = m << EXPSHIFTBITS_SP32; - - // log2/64 lead: 0.0108032227 - const float R_LOG2_BY_64_LD = 0x1.620000p-7f; - // log2/64 tail: 0.0000272020388 - const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; - r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) + - ylogx_t; - - // Truncated Taylor series for e^r - poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, - 0x1.000000p-1f), - r * r, r); - - tv.s0 = USE_TABLE(exp_tbl_ep_head, j); - tv.s1 = USE_TABLE(exp_tbl_ep_tail, j); - - float expylogx = - __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = __clc_fp32_subnormals_supported() - ? expylogx * __clc_as_float(0x1 << (m + 149)) - : 0.0f; - - float texpylogx = __clc_as_float(__clc_as_int(expylogx) + m2); - expylogx = m < -125 ? sexpylogx : texpylogx; - - // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 - expylogx = ((ylogx > 0x1.62e430p+6f) | - (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) - ? __clc_as_float(PINFBITPATT_SP32) - : expylogx; - - // Result is 0 if ylogx < -149*log2 - expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx; - - // Classify y: - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty = 2 - (ny & 1); - - float signval = __clc_as_float((__clc_as_uint(expylogx) ^ SIGNBIT_SP32)); - expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = __clc_as_int(expylogx); - - // Corner case handling - ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret; - int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; - ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret; - ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret; - int xzero = xpos ? 0 : 0x80000000; - ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret; - ret = - ((ix == NINFBITPATT_SP32) & ypos & (inty == 1)) ? NINFBITPATT_SP32 : ret; - ret = ((ix == NINFBITPATT_SP32) & !ypos & (inty == 1)) ? 0x80000000 : ret; - ret = ((ix == PINFBITPATT_SP32) & !ypos) ? 0 : ret; - ret = ((ix == PINFBITPATT_SP32) & ypos) ? PINFBITPATT_SP32 : ret; - ret = ax > PINFBITPATT_SP32 ? ix : ret; - ret = ny == 0 ? 
QNANBITPATT_SP32 : ret; - - return __clc_as_float(ret); -} -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_rootn, float, int) - -#ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { - const double real_log2_tail = 5.76999904754328540596e-08; - const double real_log2_lead = 6.93147122859954833984e-01; - - double dny = (double)ny; - double y = 1.0 / dny; - - long ux = __clc_as_long(x); - long ax = ux & (~SIGNBIT_DP64); - int xpos = ax == ux; - - long uy = __clc_as_long(y); - long ay = uy & (~SIGNBIT_DP64); - int ypos = ay == uy; - - // Extended precision log - double v, vt; - { - int exp = (int)(ax >> 52) - 1023; - int mask_exp_1023 = exp == -1023; - double xexp = (double)exp; - long mantissa = ax & 0x000FFFFFFFFFFFFFL; - - long temp_ux = - __clc_as_long(__clc_as_double(0x3ff0000000000000L | mantissa) - 1.0); - exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; - double xexp1 = (double)exp; - long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; - - xexp = mask_exp_1023 ? xexp1 : xexp; - mantissa = mask_exp_1023 ? mantissa1 : mantissa; - - long rax = (mantissa & 0x000ff00000000000) + - ((mantissa & 0x0000080000000000) << 1); - int index = rax >> 44; - - double F = __clc_as_double(rax | 0x3FE0000000000000L); - double Y = __clc_as_double(mantissa | 0x3FE0000000000000L); - double f = F - Y; - double log_h = USE_TABLE(log_f_inv_tbl_head, index); - double log_t = USE_TABLE(log_f_inv_tbl_tail, index); - double f_inv = (log_h + log_t) * f; - double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L); - double r2 = __clc_fma(-F, r1, f) * (log_h + log_t); - double r = r1 + r2; - - double poly = __clc_fma( - r, - __clc_fma(r, - __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0), - 1.0 / 4.0), - 1.0 / 3.0); - poly = poly * r * r * r; - - double hr1r1 = 0.5 * r1 * r1; - double poly0h = r1 + hr1r1; - double poly0t = r1 - poly0h + hr1r1; - poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t; - - log_h = USE_TABLE(powlog_tbl_head, index); - log_t = USE_TABLE(powlog_tbl_tail, index); - - double resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly; - double resT = resT_t - poly0h; - double resH = __clc_fma(xexp, real_log2_lead, log_h); - double resT_h = poly0h; - - double H = resT + resH; - double H_h = __clc_as_double(__clc_as_long(H) & 0xfffffffff8000000L); - double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); - H = H_h; - - double y_head = __clc_as_double(uy & 0xfffffffff8000000L); - double y_tail = y - y_head; - - double fnyh = __clc_as_double(__clc_as_long(dny) & 0xfffffffffff00000); - double fnyt = (double)(ny - (int)fnyh); - y_tail = __clc_fma(-fnyt, y_head, __clc_fma(-fnyh, y_head, 1.0)) / dny; - - double temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T)); - v = __clc_fma(y_head, H, temp); - vt = __clc_fma(y_head, H, -v) + temp; - } - - // Now calculate exp of (v,vt) - - double expv; - { - const double max_exp_arg = 709.782712893384; - const double min_exp_arg = -745.1332191019411; - const double sixtyfour_by_lnof2 = 92.33248261689366; - const double lnof2_by_64_head = 0.010830424260348081; - const double lnof2_by_64_tail = -4.359010638708991e-10; - - double temp = v * sixtyfour_by_lnof2; - int n = (int)temp; - double dn = (double)n; - int j = n & 0x0000003f; - int m = n >> 6; - - double f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - double f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); - double f = f1 + f2; - - double r1 = __clc_fma(dn, -lnof2_by_64_head, v); - double r2 = dn * 
lnof2_by_64_tail; - double r = (r1 + r2) + vt; - - double q = - __clc_fma(r, - __clc_fma(r, - __clc_fma(r, - __clc_fma(r, 1.38889490863777199667e-03, - 8.33336798434219616221e-03), - 4.16666666662260795726e-02), - 1.66666666665260878863e-01), - 5.00000000000000008883e-01); - q = __clc_fma(r * r, q, r); - - expv = __clc_fma(f, q, f2) + f1; - expv = ldexp(expv, m); - - expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv; - expv = v < min_exp_arg ? 0.0 : expv; - } - - // See whether y is an integer. - // inty = 0 means not an integer. - // inty = 1 means odd integer. - // inty = 2 means even integer. - - int inty = 2 - (ny & 1); - - expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; - - long ret = __clc_as_long(expv); - - // Now all the edge cases - ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret; - long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64; - ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret; - ret = ((ax == 0L) & !ypos & (inty == 2)) ? PINFBITPATT_DP64 : ret; - ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret; - long xzero = xpos ? 0L : 0x8000000000000000L; - ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret; - ret = - ((ux == NINFBITPATT_DP64) & ypos & (inty == 1)) ? NINFBITPATT_DP64 : ret; - ret = ((ux == NINFBITPATT_DP64) & !ypos & (inty == 1)) ? 0x8000000000000000L - : ret; - ret = ((ux == PINFBITPATT_DP64) & !ypos) ? 0L : ret; - ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; - ret = ax > PINFBITPATT_DP64 ? ux : ret; - ret = ny == 0 ? QNANBITPATT_DP64 : ret; - return __clc_as_double(ret); -} -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int) -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_OVERLOAD _CLC_DEF half __clc_rootn(half x, int y) { - return (half)__clc_rootn((float)x, y); -} - -_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_rootn, half, int); - -#endif diff --git a/libclc/generic/lib/math/rootn.cl b/libclc/generic/lib/math/rootn.cl index 92c3de46843c5..b14248636e149 100644 --- a/libclc/generic/lib/math/rootn.cl +++ b/libclc/generic/lib/math/rootn.cl @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include +#include -#include - -#define __CLC_BODY +#define FUNCTION rootn +#define __CLC_BODY #include diff --git a/libclc/generic/lib/math/rootn.inc b/libclc/generic/lib/math/rootn.inc deleted file mode 100644 index a01e72017ac22..0000000000000 --- a/libclc/generic/lib/math/rootn.inc +++ /dev/null @@ -1,11 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rootn(__CLC_GENTYPE x, __CLC_INTN y) {
-  return __clc_rootn(x, y);
-}
diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES
index 921a257814817..ad9f44a6149ae 100644
--- a/libclc/spirv/lib/SOURCES
+++ b/libclc/spirv/lib/SOURCES
@@ -63,7 +63,6 @@ math/fma.cl
 ../../generic/lib/math/powr.cl
 ../../generic/lib/math/remainder.cl
 ../../generic/lib/math/remquo.cl
-../../generic/lib/math/clc_rootn.cl
 ../../generic/lib/math/rootn.cl
 ../../generic/lib/math/sin.cl
 ../../generic/lib/math/sincos.cl

From ad48fffb53003333456b327a2a3f22bd81a77a00 Mon Sep 17 00:00:00 2001
From: Fraser Cormack
Date: Tue, 1 Apr 2025 09:20:54 +0100
Subject: [PATCH 0207/1029] [libclc] Move several 'native' builtins to CLC library (#129679)

This commit moves the 'native' builtins that use asm statements to
generate LLVM intrinsics to the CLC library. In doing so it converts
them to use the appropriate elementwise builtin to generate the same
intrinsic; there are no codegen changes to any target except for AMDGPU
targets, where `native_log` is no longer custom implemented and instead
uses the clang elementwise builtin.

This work forms part of #127196, and indeed with this commit there are
no 'generic' builtins using/abusing asm statements - the remaining
builtins are specific to the amdgpu and r600 targets.
---
 libclc/CMakeLists.txt                         | 21 ++++++++--
 libclc/amdgpu/lib/SOURCES                     |  4 --
 libclc/amdgpu/lib/math/native_log.inc         | 11 -----
 libclc/clc/include/clc/math/clc_native_cos.h  | 22 ++++++++++
 libclc/clc/include/clc/math/clc_native_exp.h  | 22 ++++++++++
 libclc/clc/include/clc/math/clc_native_exp2.h | 22 ++++++++++
 libclc/clc/include/clc/math/clc_native_log.h  | 22 ++++++++++
 .../clc/include/clc/math/clc_native_log10.h   | 22 ++++++++++
 libclc/clc/include/clc/math/clc_native_log2.h | 22 ++++++++++
 .../clc/include/clc/math/clc_native_rsqrt.h   | 22 ++++++++++
 libclc/clc/include/clc/math/clc_native_sin.h  | 22 ++++++++++
 libclc/clc/include/clc/math/clc_native_sqrt.h | 22 ++++++++++
 libclc/clc/include/clc/math/unary_intrin.inc  | 42 -------------------
 libclc/clc/lib/amdgpu/SOURCES                 |  3 ++
 .../lib/amdgpu/math/clc_native_exp.cl}        |  6 ++-
 .../lib/amdgpu/math/clc_native_exp.inc}       |  4 +-
 .../lib/amdgpu/math/clc_native_exp2.cl}       |  6 +--
 .../lib/amdgpu/math/clc_native_log10.cl}      |  6 ++-
 .../lib/amdgpu/math/clc_native_log10.inc}     |  4 +-
 libclc/clc/lib/generic/SOURCES                |  9 ++++
 libclc/clc/lib/generic/math/clc_native_cos.cl | 16 +++++++
 libclc/clc/lib/generic/math/clc_native_exp.cl | 16 +++++++
 .../clc/lib/generic/math/clc_native_exp2.cl   | 16 +++++++
 libclc/clc/lib/generic/math/clc_native_log.cl | 16 +++++++
 .../clc/lib/generic/math/clc_native_log10.cl  | 16 +++++++
 .../clc/lib/generic/math/clc_native_log2.cl   | 16 +++++++
 .../lib/generic/math/clc_native_rsqrt.cl}     |  5 ++-
 .../lib/generic/math/clc_native_rsqrt.inc}    |  4 +-
 libclc/clc/lib/generic/math/clc_native_sin.cl | 16 +++++++
 .../clc/lib/generic/math/clc_native_sqrt.cl   | 16 +++++++
 libclc/clc/lib/r600/SOURCES                   |  1 +
 .../lib/r600/math/clc_native_rsqrt.cl}        |  9 ++--
 libclc/generic/lib/math/native_cos.cl         |  7 ++--
 libclc/generic/lib/math/native_exp.cl         |  7 ++--
 libclc/generic/lib/math/native_exp2.cl        |  7 ++--
 libclc/generic/lib/math/native_log.cl         |  7 ++--
 libclc/generic/lib/math/native_log10.cl       |  7 ++--
 libclc/generic/lib/math/native_log2.cl        |  6 ++-
 libclc/generic/lib/math/native_rsqrt.cl       |  5 ++-
libclc/generic/lib/math/native_sin.cl | 7 ++-- libclc/generic/lib/math/native_sqrt.cl | 7 ++-- .../lib/math/native_unary_intrinsic.inc | 26 ------------ libclc/r600/lib/SOURCES | 1 - 43 files changed, 416 insertions(+), 132 deletions(-) delete mode 100644 libclc/amdgpu/lib/math/native_log.inc create mode 100644 libclc/clc/include/clc/math/clc_native_cos.h create mode 100644 libclc/clc/include/clc/math/clc_native_exp.h create mode 100644 libclc/clc/include/clc/math/clc_native_exp2.h create mode 100644 libclc/clc/include/clc/math/clc_native_log.h create mode 100644 libclc/clc/include/clc/math/clc_native_log10.h create mode 100644 libclc/clc/include/clc/math/clc_native_log2.h create mode 100644 libclc/clc/include/clc/math/clc_native_rsqrt.h create mode 100644 libclc/clc/include/clc/math/clc_native_sin.h create mode 100644 libclc/clc/include/clc/math/clc_native_sqrt.h delete mode 100644 libclc/clc/include/clc/math/unary_intrin.inc rename libclc/{amdgpu/lib/math/native_log10.cl => clc/lib/amdgpu/math/clc_native_exp.cl} (74%) rename libclc/{amdgpu/lib/math/native_exp.inc => clc/lib/amdgpu/math/clc_native_exp.inc} (75%) rename libclc/{amdgpu/lib/math/native_exp2.cl => clc/lib/amdgpu/math/clc_native_exp2.cl} (72%) rename libclc/{amdgpu/lib/math/native_log.cl => clc/lib/amdgpu/math/clc_native_log10.cl} (74%) rename libclc/{amdgpu/lib/math/native_log10.inc => clc/lib/amdgpu/math/clc_native_log10.inc} (73%) create mode 100644 libclc/clc/lib/generic/math/clc_native_cos.cl create mode 100644 libclc/clc/lib/generic/math/clc_native_exp.cl create mode 100644 libclc/clc/lib/generic/math/clc_native_exp2.cl create mode 100644 libclc/clc/lib/generic/math/clc_native_log.cl create mode 100644 libclc/clc/lib/generic/math/clc_native_log10.cl create mode 100644 libclc/clc/lib/generic/math/clc_native_log2.cl rename libclc/{amdgpu/lib/math/native_exp.cl => clc/lib/generic/math/clc_native_rsqrt.cl} (79%) rename libclc/{generic/lib/math/native_rsqrt.inc => clc/lib/generic/math/clc_native_rsqrt.inc} (76%) create mode 100644 libclc/clc/lib/generic/math/clc_native_sin.cl create mode 100644 libclc/clc/lib/generic/math/clc_native_sqrt.cl rename libclc/{r600/lib/math/native_rsqrt.cl => clc/lib/r600/math/clc_native_rsqrt.cl} (65%) delete mode 100644 libclc/generic/lib/math/native_unary_intrinsic.inc diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 59a70a200c95c..efe7f5804e8fb 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -263,6 +263,23 @@ if ( clspv-- IN_LIST LIBCLC_TARGETS_TO_BUILD OR clspv64-- IN_LIST LIBCLC_TARGETS endif() set_source_files_properties( + # CLC builtins + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_cos.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_exp2.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_exp.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log10.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log2.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_rsqrt.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_sin.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_sqrt.cl + # Target-specific CLC builtins + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/amdgpu/math/clc_native_exp2.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/amdgpu/math/clc_native_exp.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/amdgpu/math/clc_native_log10.cl + # Target-specific OpenCL builtins + 
${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/r600/math/clc_native_rsqrt.cl + # OpenCL builtins ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/math/native_cos.cl ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/math/native_divide.cl ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/math/native_exp.cl @@ -277,10 +294,6 @@ set_source_files_properties( ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/math/native_sin.cl ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/math/native_sqrt.cl ${CMAKE_CURRENT_SOURCE_DIR}/generic/lib/math/native_tan.cl - ${CMAKE_CURRENT_SOURCE_DIR}/amdgpu/lib/math/native_exp.cl - ${CMAKE_CURRENT_SOURCE_DIR}/amdgpu/lib/math/native_log.cl - ${CMAKE_CURRENT_SOURCE_DIR}/amdgpu/lib/math/native_log10.cl - ${CMAKE_CURRENT_SOURCE_DIR}/r600/lib/math/native_rsqrt.cl PROPERTIES COMPILE_OPTIONS -fapprox-func ) diff --git a/libclc/amdgpu/lib/SOURCES b/libclc/amdgpu/lib/SOURCES index ed5e45a37c18d..ab5da40711aa4 100644 --- a/libclc/amdgpu/lib/SOURCES +++ b/libclc/amdgpu/lib/SOURCES @@ -1,7 +1,3 @@ -math/native_exp.cl -math/native_exp2.cl -math/native_log.cl -math/native_log10.cl math/half_exp.cl math/half_exp10.cl math/half_exp2.cl diff --git a/libclc/amdgpu/lib/math/native_log.inc b/libclc/amdgpu/lib/math/native_log.inc deleted file mode 100644 index 820e4929f02cf..0000000000000 --- a/libclc/amdgpu/lib/math/native_log.inc +++ /dev/null @@ -1,11 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_log(__CLC_GENTYPE val) { - return native_log2(val) * (1.0f / M_LOG2E_F); -} diff --git a/libclc/clc/include/clc/math/clc_native_cos.h b/libclc/clc/include/clc/math/clc_native_cos.h new file mode 100644 index 0000000000000..8a580f13ad2aa --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_cos.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_COS_H__ +#define __CLC_MATH_CLC_NATIVE_COS_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_cos +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_COS_H__ diff --git a/libclc/clc/include/clc/math/clc_native_exp.h b/libclc/clc/include/clc/math/clc_native_exp.h new file mode 100644 index 0000000000000..48a1be616ea3e --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_exp.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_EXP_H__ +#define __CLC_MATH_CLC_NATIVE_EXP_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_exp +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_EXP_H__ diff --git a/libclc/clc/include/clc/math/clc_native_exp2.h b/libclc/clc/include/clc/math/clc_native_exp2.h new file mode 100644 index 0000000000000..bc0b32d6212b5 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_exp2.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_EXP2_H__ +#define __CLC_MATH_CLC_NATIVE_EXP2_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_exp2 +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_EXP2_H__ diff --git a/libclc/clc/include/clc/math/clc_native_log.h b/libclc/clc/include/clc/math/clc_native_log.h new file mode 100644 index 0000000000000..ea0362503f670 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_log.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_LOG_H__ +#define __CLC_MATH_CLC_NATIVE_LOG_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_LOG_H__ diff --git a/libclc/clc/include/clc/math/clc_native_log10.h b/libclc/clc/include/clc/math/clc_native_log10.h new file mode 100644 index 0000000000000..c5cceeeba2952 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_log10.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_LOG10_H__ +#define __CLC_MATH_CLC_NATIVE_LOG10_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log10 +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_LOG10_H__ diff --git a/libclc/clc/include/clc/math/clc_native_log2.h b/libclc/clc/include/clc/math/clc_native_log2.h new file mode 100644 index 0000000000000..25375970cedc0 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_log2.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_LOG2_H__ +#define __CLC_MATH_CLC_NATIVE_LOG2_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log2 +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_LOG2_H__ diff --git a/libclc/clc/include/clc/math/clc_native_rsqrt.h b/libclc/clc/include/clc/math/clc_native_rsqrt.h new file mode 100644 index 0000000000000..59fd2134107db --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_rsqrt.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_RSQRT_H__ +#define __CLC_MATH_CLC_NATIVE_RSQRT_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_rsqrt +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_RSQRT_H__ diff --git a/libclc/clc/include/clc/math/clc_native_sin.h b/libclc/clc/include/clc/math/clc_native_sin.h new file mode 100644 index 0000000000000..878e19882ae6b --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_sin.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_SIN_H__ +#define __CLC_MATH_CLC_NATIVE_SIN_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_sin +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_SIN_H__ diff --git a/libclc/clc/include/clc/math/clc_native_sqrt.h b/libclc/clc/include/clc/math/clc_native_sqrt.h new file mode 100644 index 0000000000000..afff77c89b22f --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_sqrt.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_SQRT_H__ +#define __CLC_MATH_CLC_NATIVE_SQRT_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_sqrt +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_SQRT_H__ diff --git a/libclc/clc/include/clc/math/unary_intrin.inc b/libclc/clc/include/clc/math/unary_intrin.inc deleted file mode 100644 index 8028470114b8e..0000000000000 --- a/libclc/clc/include/clc/math/unary_intrin.inc +++ /dev/null @@ -1,42 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include - -_CLC_OVERLOAD float __CLC_FUNCTION(float f) __asm(__CLC_INTRINSIC ".f32"); -_CLC_OVERLOAD float2 __CLC_FUNCTION(float2 f) __asm(__CLC_INTRINSIC ".v2f32"); -_CLC_OVERLOAD float3 __CLC_FUNCTION(float3 f) __asm(__CLC_INTRINSIC ".v3f32"); -_CLC_OVERLOAD float4 __CLC_FUNCTION(float4 f) __asm(__CLC_INTRINSIC ".v4f32"); -_CLC_OVERLOAD float8 __CLC_FUNCTION(float8 f) __asm(__CLC_INTRINSIC ".v8f32"); -_CLC_OVERLOAD float16 __CLC_FUNCTION(float16 f) __asm(__CLC_INTRINSIC - ".v16f32"); - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_OVERLOAD double __CLC_FUNCTION(double d) __asm(__CLC_INTRINSIC ".f64"); -_CLC_OVERLOAD double2 __CLC_FUNCTION(double2 d) __asm(__CLC_INTRINSIC ".v2f64"); -_CLC_OVERLOAD double3 __CLC_FUNCTION(double3 d) __asm(__CLC_INTRINSIC ".v3f64"); -_CLC_OVERLOAD double4 __CLC_FUNCTION(double4 d) __asm(__CLC_INTRINSIC ".v4f64"); -_CLC_OVERLOAD double8 __CLC_FUNCTION(double8 d) __asm(__CLC_INTRINSIC ".v8f64"); -_CLC_OVERLOAD double16 __CLC_FUNCTION(double16 d) __asm(__CLC_INTRINSIC - ".v16f64"); -#endif - -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -_CLC_OVERLOAD half __CLC_FUNCTION(half d) __asm(__CLC_INTRINSIC ".f16"); -_CLC_OVERLOAD half2 __CLC_FUNCTION(half2 d) __asm(__CLC_INTRINSIC ".v2f16"); -_CLC_OVERLOAD half3 __CLC_FUNCTION(half3 d) __asm(__CLC_INTRINSIC ".v3f16"); -_CLC_OVERLOAD half4 __CLC_FUNCTION(half4 d) __asm(__CLC_INTRINSIC ".v4f16"); -_CLC_OVERLOAD half8 __CLC_FUNCTION(half8 d) __asm(__CLC_INTRINSIC ".v8f16"); -_CLC_OVERLOAD half16 __CLC_FUNCTION(half16 d) __asm(__CLC_INTRINSIC ".v16f16"); -#endif - -#undef __CLC_FUNCTION -#undef __CLC_INTRINSIC diff --git a/libclc/clc/lib/amdgpu/SOURCES b/libclc/clc/lib/amdgpu/SOURCES index fd64a862021e8..31e07b608c4cc 100644 --- a/libclc/clc/lib/amdgpu/SOURCES +++ b/libclc/clc/lib/amdgpu/SOURCES @@ -1 +1,4 @@ +math/clc_native_exp2.cl +math/clc_native_exp.cl +math/clc_native_log10.cl math/clc_sqrt_fp64.cl diff --git a/libclc/amdgpu/lib/math/native_log10.cl b/libclc/clc/lib/amdgpu/math/clc_native_exp.cl similarity index 74% rename from libclc/amdgpu/lib/math/native_log10.cl rename to libclc/clc/lib/amdgpu/math/clc_native_exp.cl index 7cbe1f98988d5..591ecb0ac00b5 100644 --- a/libclc/amdgpu/lib/math/native_log10.cl +++ b/libclc/clc/lib/amdgpu/math/clc_native_exp.cl @@ -6,8 +6,10 @@ // //===----------------------------------------------------------------------===// -#include +#include +#include +#include -#define __CLC_BODY +#define __CLC_BODY #define __FLOAT_ONLY #include diff --git a/libclc/amdgpu/lib/math/native_exp.inc b/libclc/clc/lib/amdgpu/math/clc_native_exp.inc similarity index 75% rename from libclc/amdgpu/lib/math/native_exp.inc rename to libclc/clc/lib/amdgpu/math/clc_native_exp.inc index d7dbd888d2988..cf2d48c4054ed 100644 --- a/libclc/amdgpu/lib/math/native_exp.inc +++ b/libclc/clc/lib/amdgpu/math/clc_native_exp.inc @@ -6,6 +6,6 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_exp(__CLC_GENTYPE val) { - return native_exp2(val * M_LOG2E_F); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_exp(__CLC_GENTYPE val) { + return __clc_native_exp2(val * M_LOG2E_F); } diff --git a/libclc/amdgpu/lib/math/native_exp2.cl b/libclc/clc/lib/amdgpu/math/clc_native_exp2.cl similarity index 72% rename from libclc/amdgpu/lib/math/native_exp2.cl 
rename to libclc/clc/lib/amdgpu/math/clc_native_exp2.cl index 39ae914b19634..76b1850fce574 100644 --- a/libclc/amdgpu/lib/math/native_exp2.cl +++ b/libclc/clc/lib/amdgpu/math/clc_native_exp2.cl @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -#include #include +#include -_CLC_OVERLOAD _CLC_DEF float native_exp2(float val) { +_CLC_OVERLOAD _CLC_DEF float __clc_native_exp2(float val) { return __builtin_amdgcn_exp2f(val); } -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, native_exp2, float) +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_native_exp2, float) diff --git a/libclc/amdgpu/lib/math/native_log.cl b/libclc/clc/lib/amdgpu/math/clc_native_log10.cl similarity index 74% rename from libclc/amdgpu/lib/math/native_log.cl rename to libclc/clc/lib/amdgpu/math/clc_native_log10.cl index ce3258bb5b37b..0668a635d24d7 100644 --- a/libclc/amdgpu/lib/math/native_log.cl +++ b/libclc/clc/lib/amdgpu/math/clc_native_log10.cl @@ -6,8 +6,10 @@ // //===----------------------------------------------------------------------===// -#include +#include +#include +#include -#define __CLC_BODY +#define __CLC_BODY #define __FLOAT_ONLY #include diff --git a/libclc/amdgpu/lib/math/native_log10.inc b/libclc/clc/lib/amdgpu/math/clc_native_log10.inc similarity index 73% rename from libclc/amdgpu/lib/math/native_log10.inc rename to libclc/clc/lib/amdgpu/math/clc_native_log10.inc index 2aef43a8ed1a5..c91d698609793 100644 --- a/libclc/amdgpu/lib/math/native_log10.inc +++ b/libclc/clc/lib/amdgpu/math/clc_native_log10.inc @@ -6,6 +6,6 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_log10(__CLC_GENTYPE val) { - return native_log2(val) * (M_LN2_F / M_LN10_F); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_log10(__CLC_GENTYPE val) { + return __clc_native_log2(val) * (M_LN2_F / M_LN10_F); } diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index 6a1ba95362220..c31963c59e950 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -45,6 +45,15 @@ math/clc_log2.cl math/clc_mad.cl math/clc_modf.cl math/clc_nan.cl +math/clc_native_cos.cl +math/clc_native_exp.cl +math/clc_native_exp2.cl +math/clc_native_log.cl +math/clc_native_log10.cl +math/clc_native_log2.cl +math/clc_native_rsqrt.cl +math/clc_native_sin.cl +math/clc_native_sqrt.cl math/clc_nextafter.cl math/clc_pow.cl math/clc_pown.cl diff --git a/libclc/clc/lib/generic/math/clc_native_cos.cl b/libclc/clc/lib/generic/math/clc_native_cos.cl new file mode 100644 index 0000000000000..de56fdec48d24 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_cos.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_cos +#define __CLC_FUNCTION(x) __builtin_elementwise_cos +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_native_exp.cl b/libclc/clc/lib/generic/math/clc_native_exp.cl new file mode 100644 index 0000000000000..400270a6163a4 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_exp.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_exp +#define __CLC_FUNCTION(x) __builtin_elementwise_exp +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_native_exp2.cl b/libclc/clc/lib/generic/math/clc_native_exp2.cl new file mode 100644 index 0000000000000..427d901fcdb19 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_exp2.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_exp2 +#define __CLC_FUNCTION(x) __builtin_elementwise_exp2 +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_native_log.cl b/libclc/clc/lib/generic/math/clc_native_log.cl new file mode 100644 index 0000000000000..85f188b654282 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_log.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_log +#define __CLC_FUNCTION(x) __builtin_elementwise_log +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_native_log10.cl b/libclc/clc/lib/generic/math/clc_native_log10.cl new file mode 100644 index 0000000000000..624018e4481bf --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_log10.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_log10 +#define __CLC_FUNCTION(x) __builtin_elementwise_log10 +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_native_log2.cl b/libclc/clc/lib/generic/math/clc_native_log2.cl new file mode 100644 index 0000000000000..2c8c18e61ca5d --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_log2.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_log2 +#define __CLC_FUNCTION(x) __builtin_elementwise_log2 +#define __CLC_BODY + +#include diff --git a/libclc/amdgpu/lib/math/native_exp.cl b/libclc/clc/lib/generic/math/clc_native_rsqrt.cl similarity index 79% rename from libclc/amdgpu/lib/math/native_exp.cl rename to libclc/clc/lib/generic/math/clc_native_rsqrt.cl index e62b79d4ec9fa..d5e6fcdae491f 100644 --- a/libclc/amdgpu/lib/math/native_exp.cl +++ b/libclc/clc/lib/generic/math/clc_native_rsqrt.cl @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include +#include +#include -#define __CLC_BODY +#define __CLC_BODY #define __FLOAT_ONLY #include diff --git a/libclc/generic/lib/math/native_rsqrt.inc b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc similarity index 76% rename from libclc/generic/lib/math/native_rsqrt.inc rename to libclc/clc/lib/generic/math/clc_native_rsqrt.inc index 058209bcb8a15..7a3b0b2af2721 100644 --- a/libclc/generic/lib/math/native_rsqrt.inc +++ b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc @@ -6,6 +6,6 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_rsqrt(__CLC_GENTYPE val) { - return 1.0f / native_sqrt(val); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_rsqrt(__CLC_GENTYPE val) { + return 1.0f / __clc_native_sqrt(val); } diff --git a/libclc/clc/lib/generic/math/clc_native_sin.cl b/libclc/clc/lib/generic/math/clc_native_sin.cl new file mode 100644 index 0000000000000..22b988bf4375f --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_sin.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_sin +#define __CLC_FUNCTION(x) __builtin_elementwise_sin +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_native_sqrt.cl b/libclc/clc/lib/generic/math/clc_native_sqrt.cl new file mode 100644 index 0000000000000..ed022ef1fee1f --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_sqrt.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define FUNCTION __clc_native_sqrt +#define __CLC_FUNCTION(x) __builtin_elementwise_sqrt +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/r600/SOURCES b/libclc/clc/lib/r600/SOURCES index 8f66107e0454e..8d5caf167aa4e 100644 --- a/libclc/clc/lib/r600/SOURCES +++ b/libclc/clc/lib/r600/SOURCES @@ -1 +1,2 @@ +math/clc_native_rsqrt.cl math/clc_rsqrt_override.cl diff --git a/libclc/r600/lib/math/native_rsqrt.cl b/libclc/clc/lib/r600/math/clc_native_rsqrt.cl similarity index 65% rename from libclc/r600/lib/math/native_rsqrt.cl rename to libclc/clc/lib/r600/math/clc_native_rsqrt.cl index e916147c3e057..ee09814eb1e76 100644 --- a/libclc/r600/lib/math/native_rsqrt.cl +++ b/libclc/clc/lib/r600/math/clc_native_rsqrt.cl @@ -6,12 +6,11 @@ // //===----------------------------------------------------------------------===// -#include #include +#include -_CLC_OVERLOAD _CLC_DEF float native_rsqrt(float x) -{ - return __builtin_r600_recipsqrt_ieeef(x); +_CLC_OVERLOAD _CLC_DEF float __clc_native_rsqrt(float x) { + return __builtin_r600_recipsqrt_ieeef(x); } -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, native_rsqrt, float); +_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_native_rsqrt, float); diff --git a/libclc/generic/lib/math/native_cos.cl b/libclc/generic/lib/math/native_cos.cl index d81a067f11044..0f4d46c1d73e4 100644 --- a/libclc/generic/lib/math/native_cos.cl +++ b/libclc/generic/lib/math/native_cos.cl @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC cos - -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_cos +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_exp.cl b/libclc/generic/lib/math/native_exp.cl index 8f4531343415c..c1d08ec2c7e1f 100644 --- a/libclc/generic/lib/math/native_exp.cl +++ b/libclc/generic/lib/math/native_exp.cl @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC exp - -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_exp +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_exp2.cl b/libclc/generic/lib/math/native_exp2.cl index ecde4a6761d22..ceb570733b974 100644 --- a/libclc/generic/lib/math/native_exp2.cl +++ b/libclc/generic/lib/math/native_exp2.cl @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC exp2 - -#define __CLC_BODY #define 
__FLOAT_ONLY +#define FUNCTION native_exp2 +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_log.cl b/libclc/generic/lib/math/native_log.cl index 5731e09d21c9c..adc2ff495f8b5 100644 --- a/libclc/generic/lib/math/native_log.cl +++ b/libclc/generic/lib/math/native_log.cl @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC log - -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_log +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_log10.cl b/libclc/generic/lib/math/native_log10.cl index eab7a6f14d035..f63292124f3b7 100644 --- a/libclc/generic/lib/math/native_log10.cl +++ b/libclc/generic/lib/math/native_log10.cl @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC log10 - -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_log10 +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_log2.cl b/libclc/generic/lib/math/native_log2.cl index 0db4be0b5e083..6b079872b1e0a 100644 --- a/libclc/generic/lib/math/native_log2.cl +++ b/libclc/generic/lib/math/native_log2.cl @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC log2 -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_log2 +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_rsqrt.cl b/libclc/generic/lib/math/native_rsqrt.cl index 14430c04fb72d..cb49b2d1d6706 100644 --- a/libclc/generic/lib/math/native_rsqrt.cl +++ b/libclc/generic/lib/math/native_rsqrt.cl @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_rsqrt +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_sin.cl b/libclc/generic/lib/math/native_sin.cl index 0e2ced09fa2dd..50265b3936272 100644 --- a/libclc/generic/lib/math/native_sin.cl +++ b/libclc/generic/lib/math/native_sin.cl @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC sin - -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_sin +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_sqrt.cl b/libclc/generic/lib/math/native_sqrt.cl index 1b668e5976ef7..4cd022e8bbeba 100644 --- a/libclc/generic/lib/math/native_sqrt.cl +++ b/libclc/generic/lib/math/native_sqrt.cl @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_NATIVE_INTRINSIC sqrt - -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_sqrt +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_unary_intrinsic.inc b/libclc/generic/lib/math/native_unary_intrinsic.inc deleted file mode 100644 index c118ec095692f..0000000000000 --- a/libclc/generic/lib/math/native_unary_intrinsic.inc +++ /dev/null @@ -1,26 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include 
-
-#ifdef __CLC_SCALAR
-#define __CLC_FUNCTION __CLC_XCONCAT(__clc_native_, __CLC_NATIVE_INTRINSIC)
-#define __CLC_INTRINSIC "llvm." __CLC_XSTR(__CLC_NATIVE_INTRINSIC)
-
-#undef cl_khr_fp64
-#include 
-
-#endif
-
-#define __CLC_FUNCTION __CLC_XCONCAT(native_, __CLC_NATIVE_INTRINSIC)
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE val) {
-  return __CLC_XCONCAT(__clc_native_, __CLC_NATIVE_INTRINSIC)(val);
-}
-
-#undef __CLC_FUNCTION
diff --git a/libclc/r600/lib/SOURCES b/libclc/r600/lib/SOURCES
index cad45785dc483..4342ac38201c1 100644
--- a/libclc/r600/lib/SOURCES
+++ b/libclc/r600/lib/SOURCES
@@ -1,6 +1,5 @@
 math/fmax.cl
 math/fmin.cl
-math/native_rsqrt.cl
 synchronization/barrier.cl
 workitem/get_global_offset.cl
 workitem/get_group_id.cl

From 1cf6786e322ddc787d793dbb48d59b4f9827fef3 Mon Sep 17 00:00:00 2001
From: Longsheng Mou
Date: Tue, 1 Apr 2025 16:27:43 +0800
Subject: [PATCH 0208/1029] [mlir] Improve error handling for dense attribute
 parsing in complex types (#133220)

- For splat dense attributes, the number of parsed elements must be 2.
- For non-splat dense attributes, the number of parsed elements must be
  twice the number of elements in the type.

Fixes #132859.
---
 mlir/lib/AsmParser/AttributeParser.cpp       | 13 +++++++++++++
 mlir/test/IR/invalid-builtin-attributes.mlir | 15 +++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/mlir/lib/AsmParser/AttributeParser.cpp b/mlir/lib/AsmParser/AttributeParser.cpp
index 52ab736bac03a..93a24dee29ad2 100644
--- a/mlir/lib/AsmParser/AttributeParser.cpp
+++ b/mlir/lib/AsmParser/AttributeParser.cpp
@@ -566,6 +566,19 @@ DenseElementsAttr TensorLiteralParser::getAttr(SMLoc loc, ShapedType type) {
   if (ComplexType complexTy = dyn_cast<ComplexType>(eltType)) {
     eltType = complexTy.getElementType();
     isComplex = true;
+    // Complex types have 2 elements.
+    if (shape.empty() && storage.size() != 2) {
+      p.emitError(loc) << "parsed " << storage.size() << " elements, but type ("
+                       << complexTy << ") expected 2 elements";
+      return nullptr;
+    }
+    if (!shape.empty() &&
+        storage.size() != static_cast<size_t>(type.getNumElements()) * 2) {
+      p.emitError(loc) << "parsed " << storage.size() << " elements, but type ("
+                       << type << ") expected " << type.getNumElements() * 2
+                       << " elements";
+      return nullptr;
+    }
   }
 
   // Handle integer and index types.
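[Editor's note: the two checks added above reduce to a single counting rule. The sketch below is an editorial illustration only; the function name and signature are invented for this note and appear nowhere in the patch. A splat literal must supply exactly one real/imaginary pair, and a non-splat literal one pair per element of the shaped type.]

```cpp
#include <cstddef>
#include <cstdint>

// Dense literals for complex element types store real/imag values
// interleaved, so the valid storage size is fixed by the shape.
bool complexStorageCountIsValid(bool isSplat, int64_t numElements,
                                std::size_t storageSize) {
  if (isSplat)
    return storageSize == 2; // a single (real, imag) pair splatted everywhere
  return storageSize == static_cast<std::size_t>(numElements) * 2;
}
```

[The test diff below exercises exactly these failure modes: a splat given one element, and non-splat literals supplying two and three elements against a type expecting four.]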
diff --git a/mlir/test/IR/invalid-builtin-attributes.mlir b/mlir/test/IR/invalid-builtin-attributes.mlir
index d2c11536404ea..58d4940eaf874 100644
--- a/mlir/test/IR/invalid-builtin-attributes.mlir
+++ b/mlir/test/IR/invalid-builtin-attributes.mlir
@@ -63,6 +63,21 @@ func.func @elementsattr_toolarge1() -> () {
 
 // -----
 
+// expected-error@+1 {{parsed 1 elements, but type ('complex') expected 2 elements}}
+#attr = dense<0> : tensor<2xcomplex>
+
+// -----
+
+// expected-error@+1 {{parsed 2 elements, but type ('tensor<2xcomplex>') expected 4 elements}}
+#attr = dense<[0, 1]> : tensor<2xcomplex>
+
+// -----
+
+// expected-error@+1 {{parsed 3 elements, but type ('tensor<2xcomplex>') expected 4 elements}}
+#attr = dense<[0, (0, 1)]> : tensor<2xcomplex>
+
+// -----
+
 func.func @elementsattr_toolarge2() -> () {
   "foo"(){bar = dense<[-777]> : tensor<1xi8>} : () -> () // expected-error {{integer constant out of range}}
 }

From af0b0ce665e2a36dd60c2aa70e78cc54cf20b7e3 Mon Sep 17 00:00:00 2001
From: Valery Pykhtin
Date: Tue, 1 Apr 2025 10:27:58 +0200
Subject: [PATCH 0209/1029] [AMDGPU] Fix SIFoldOperandsImpl::tryFoldZeroHighBits
 when src1 is a non-register operand. (#133761)

This happens when a constant is propagated into a `V_AND 0xFFFF, reg`
instruction.

Fixes failures like:
```
llc: /github/llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h:366: llvm::Register llvm::MachineOperand::getReg() const: Assertion `isReg() && "This is not a register operand!"' failed.
Stack dump:
0.	Program arguments: /github/llvm-project/build/Debug/bin/llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -run-pass si-fold-operands /github/llvm-project/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir -o -
1.	Running pass 'Function Pass Manager' on module '/github/llvm-project/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir'.
2.	Running pass 'SI Fold Operands' on function '@test_tryFoldZeroHighBits_skips_nonreg'
...
#12 0x00007f5a55005cfc llvm::MachineOperand::getReg() const /github/llvm-project/llvm/include/llvm/CodeGen/MachineOperand.h:0:5
#13 0x00007f5a555c6bf5 (anonymous namespace)::SIFoldOperandsImpl::tryFoldZeroHighBits(llvm::MachineInstr&) const /github/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp:1459:36
#14 0x00007f5a555c63ad (anonymous namespace)::SIFoldOperandsImpl::run(llvm::MachineFunction&) /github/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp:2455:11
#15 0x00007f5a555c6780 (anonymous namespace)::SIFoldOperandsLegacy::runOnMachineFunction
```
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp          |  2 +-
 .../fold-zero-high-bits-skips-non-reg.mir          | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cc15dd7cb495c..46bd5d8044c45 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1453,7 +1453,7 @@ bool SIFoldOperandsImpl::tryFoldZeroHighBits(MachineInstr &MI) const {
     return false;
 
   std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(MI.getOperand(1));
-  if (!Src0Imm || *Src0Imm != 0xffff)
+  if (!Src0Imm || *Src0Imm != 0xffff || !MI.getOperand(2).isReg())
     return false;
 
   Register Src1 = MI.getOperand(2).getReg();
diff --git a/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir
new file mode 100644
index 0000000000000..b1aa88969c5bb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-zero-high-bits-skips-non-reg.mir
@@ -0,0 +1,17 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -run-pass si-fold-operands %s -o - | FileCheck %s
+---
+name: test_tryFoldZeroHighBits_skips_nonreg
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: test_tryFoldZeroHighBits_skips_nonreg
+    ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1
+    ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 65535, 0, implicit $exec
+    ; CHECK-NEXT: S_NOP 0, implicit [[V_AND_B32_e64_]]
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1
+    %2:vgpr_32 = V_AND_B32_e64 65535, %1.sub0, implicit $exec
+    S_NOP 0, implicit %2
+...

From 9e5bfbf77db0945f59c0d18012a8e6d43c711b3a Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 1 Apr 2025 09:28:33 +0100
Subject: [PATCH 0210/1029] [EquivalenceClasses] Update member_begin to take
 ECValue (NFC).

Remove a level of indirection and update code to use range-based for
loops.
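
A minimal sketch of the idiom this enables, assuming an already-populated EquivalenceClasses<int> (the member_begin overload taking the ECValue is the one added in this patch; the surrounding code is illustrative):

```
// Sketch only: iterate equivalence classes without iterator indirection.
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void printClasses(const EquivalenceClasses<int> &EC) {
  for (const auto &E : EC) { // range-based loop over the ECValues
    if (!E.isLeader())
      continue; // only leaders enumerate a class
    for (auto MI = EC.member_begin(E); MI != EC.member_end(); ++MI)
      errs() << *MI << ' ';
    errs() << '\n';
  }
}
```

Passing the ECValue directly avoids dereferencing an iterator at every call site, which is what lets the callers below switch to range-based for loops.
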
--- .../Analysis/FlowSensitive/SimplifyConstraints.cpp | 7 +++---- llvm/include/llvm/ADT/EquivalenceClasses.h | 8 ++------ llvm/lib/Analysis/VectorUtils.cpp | 10 ++++++---- .../Target/AArch64/AArch64A57FPLoadBalancing.cpp | 6 ++++-- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 6 +++--- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 4 ++-- llvm/lib/Transforms/Scalar/Float2Int.cpp | 14 +++++++------- llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 11 +++++------ llvm/lib/Transforms/Utils/SplitModule.cpp | 2 +- llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp | 5 ++--- 10 files changed, 35 insertions(+), 38 deletions(-) diff --git a/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp b/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp index bbd73ef9b2f03..69a90334c9df5 100644 --- a/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp +++ b/clang/lib/Analysis/FlowSensitive/SimplifyConstraints.cpp @@ -151,11 +151,10 @@ void simplifyConstraints(llvm::SetVector &Constraints, } if (Info) { - for (auto It = EquivalentAtoms.begin(), End = EquivalentAtoms.end(); - It != End; ++It) { - if (!It->isLeader()) + for (const auto &E : EquivalentAtoms) { + if (!E.isLeader()) continue; - Atom At = *EquivalentAtoms.findLeader(*It); + Atom At = *EquivalentAtoms.findLeader(E); if (TrueAtoms.contains(At) || FalseAtoms.contains(At)) continue; llvm::SmallVector Atoms = diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index be46a4445c3f2..4f692052847aa 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -148,7 +148,7 @@ class EquivalenceClasses { TheMapping.clear(); for (iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) if (I->isLeader()) { - member_iterator MI = RHS.member_begin(I); + member_iterator MI = RHS.member_begin(*I); member_iterator LeaderIt = member_begin(insert(*MI)); for (++MI; MI != member_end(); ++MI) unionSets(LeaderIt, member_begin(insert(*MI))); @@ -171,13 +171,9 @@ class EquivalenceClasses { /// member_* Iterate over the members of an equivalence class. class member_iterator; - member_iterator member_begin(iterator I) const { - // Only leaders provide anything to iterate over. - return member_iterator(I->isLeader() ? &*I : nullptr); - } member_iterator member_begin(const ECValue &ECV) const { // Only leaders provide anything to iterate over. - return member_iterator(ECV.getLeader()); + return member_iterator(ECV.isLeader() ? ECV.getLeader() : nullptr); } member_iterator member_end() const { diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 91ba68fe03324..f57186589a325 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -843,9 +843,11 @@ llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, if (U->getType()->isIntegerTy() && DBits.count(U) == 0) DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL; - for (auto I = ECs.begin(), E = ECs.end(); I != E; ++I) { + for (const auto &E : ECs) { + if (!E.isLeader()) + continue; uint64_t LeaderDemandedBits = 0; - for (Value *M : llvm::make_range(ECs.member_begin(I), ECs.member_end())) + for (Value *M : make_range(ECs.member_begin(E), ECs.member_end())) LeaderDemandedBits |= DBits[M]; uint64_t MinBW = llvm::bit_width(LeaderDemandedBits); @@ -857,7 +859,7 @@ llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, // indvars. // If we are required to shrink a PHI, abandon this entire equivalence class. 
bool Abort = false; - for (Value *M : llvm::make_range(ECs.member_begin(I), ECs.member_end())) + for (Value *M : make_range(ECs.member_begin(E), ECs.member_end())) if (isa(M) && MinBW < M->getType()->getScalarSizeInBits()) { Abort = true; break; @@ -865,7 +867,7 @@ llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, if (Abort) continue; - for (Value *M : llvm::make_range(ECs.member_begin(I), ECs.member_end())) { + for (Value *M : make_range(ECs.member_begin(E), ECs.member_end())) { auto *MI = dyn_cast(M); if (!MI) continue; diff --git a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index c27ec8e6dc6b3..c6d40cb00b252 100644 --- a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -367,8 +367,10 @@ bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) { // Convert the EquivalenceClasses to a simpler set of sets. std::vector > V; - for (auto I = EC.begin(), E = EC.end(); I != E; ++I) { - std::vector Cs(EC.member_begin(I), EC.member_end()); + for (const auto &E : EC) { + if (!E.isLeader()) + continue; + std::vector Cs(EC.member_begin(E), EC.member_end()); if (Cs.empty()) continue; V.push_back(std::move(Cs)); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 212b16cb3da6e..4a700bd213ed5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -1016,12 +1016,12 @@ void RecursiveSearchSplitting::setupWorkList() { }); } - for (auto I = NodeEC.begin(), E = NodeEC.end(); I != E; ++I) { - if (!I->isLeader()) + for (const auto &Node : NodeEC) { + if (!Node.isLeader()) continue; BitVector Cluster = SG.createNodesBitVector(); - for (auto MI = NodeEC.member_begin(I); MI != NodeEC.member_end(); ++MI) { + for (auto MI = NodeEC.member_begin(Node); MI != NodeEC.member_end(); ++MI) { const SplitGraph::Node &N = SG.getNode(*MI); if (N.isGraphEntryPoint()) N.getDependencies(Cluster); diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 299b3a9162389..11f9f0271395b 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -2350,7 +2350,7 @@ bool LowerTypeTestsModule::lower() { ++NumTypeIdDisjointSets; unsigned MaxUniqueId = 0; - for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I); + for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(*I); MI != GlobalClasses.member_end(); ++MI) { if (auto *MD = dyn_cast_if_present(*MI)) MaxUniqueId = std::max(MaxUniqueId, TypeIdInfo[MD].UniqueId); @@ -2368,7 +2368,7 @@ bool LowerTypeTestsModule::lower() { std::vector Globals; std::vector ICallBranchFunnels; for (GlobalClassesTy::member_iterator MI = - GlobalClasses.member_begin(S.first); + GlobalClasses.member_begin(*S.first); MI != GlobalClasses.member_end(); ++MI) { if (isa(*MI)) TypeIds.push_back(cast(*MI)); diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp index 9d23c89943009..85c376c564d35 100644 --- a/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -311,14 +311,15 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) { bool MadeChange = false; // Iterate over every disjoint partition of the def-use graph. 
- for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) { + for (const auto &E : ECs) { + if (!E.isLeader()) + continue; ConstantRange R(MaxIntegerBW + 1, false); bool Fail = false; Type *ConvertedToTy = nullptr; // For every member of the partition, union all the ranges together. - for (auto MI = ECs.member_begin(It), ME = ECs.member_end(); - MI != ME; ++MI) { + for (auto MI = ECs.member_begin(E), ME = ECs.member_end(); MI != ME; ++MI) { Instruction *I = *MI; auto SeenI = SeenInsts.find(I); if (SeenI == SeenInsts.end()) @@ -348,8 +349,8 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) { // If the set was empty, or we failed, or the range is poisonous, // bail out. - if (ECs.member_begin(It) == ECs.member_end() || Fail || - R.isFullSet() || R.isSignWrappedSet()) + if (ECs.member_begin(E) == ECs.member_end() || Fail || R.isFullSet() || + R.isSignWrappedSet()) continue; assert(ConvertedToTy && "Must have set the convertedtoty by this point!"); @@ -388,8 +389,7 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) { } } - for (auto MI = ECs.member_begin(It), ME = ECs.member_end(); - MI != ME; ++MI) + for (auto MI = ECs.member_begin(E), ME = ECs.member_end(); MI != ME; ++MI) convert(*MI, Ty); MadeChange = true; } diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index efbd1b89aca8f..5f03d854b51e6 100644 --- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -385,14 +385,13 @@ class InstPartitionContainer { // Merge the member of an equivalence class into its class leader. This // makes the members empty. - for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); - I != E; ++I) { - if (!I->isLeader()) + for (const auto &C : ToBeMerged) { + if (!C.isLeader()) continue; - auto PartI = I->getData(); - for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)), - ToBeMerged.member_end())) { + auto PartI = C.getData(); + for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(C)), + ToBeMerged.member_end())) { PartJ->moveTo(*PartI); } } diff --git a/llvm/lib/Transforms/Utils/SplitModule.cpp b/llvm/lib/Transforms/Utils/SplitModule.cpp index 07956f3a191c4..507e3c7a42737 100644 --- a/llvm/lib/Transforms/Utils/SplitModule.cpp +++ b/llvm/lib/Transforms/Utils/SplitModule.cpp @@ -182,7 +182,7 @@ static void findPartitions(Module &M, ClusterIDMapType &ClusterIDMap, I != E; ++I) if (I->isLeader()) Sets.push_back( - std::make_pair(std::distance(GVtoClusterMap.member_begin(I), + std::make_pair(std::distance(GVtoClusterMap.member_begin(*I), GVtoClusterMap.member_end()), I)); diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp index 770494405810d..feaa3602f88cb 100644 --- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp @@ -581,10 +581,9 @@ CombineRuleOperandTypeChecker::getRuleEqClasses() const { if (DebugTypeInfer) { errs() << "Final Type Equivalence Classes: "; - for (auto ClassIt = TECs.begin(); ClassIt != TECs.end(); ++ClassIt) { + for (const auto &Class : TECs) { // only print non-empty classes. 
-    if (auto MembIt = TECs.member_begin(ClassIt);
-        MembIt != TECs.member_end()) {
+    if (auto MembIt = TECs.member_begin(Class); MembIt != TECs.member_end()) {
       errs() << '[';
       StringRef Sep = "";
       for (; MembIt != TECs.member_end(); ++MembIt) {

From 6c3adaafe3f2139fba5ef3865cbcbba93dbab645 Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Tue, 1 Apr 2025 09:45:16 +0100
Subject: [PATCH 0211/1029] [AARCH64][Neon] switch to using bitcasts in
 arm_neon.h where appropriate (#127043)

Currently arm_neon.h emits C-style casts to do vector type casts. This
relies on implicit conversion between vector types to be enabled, which
is currently deprecated behaviour and will soon disappear. To ensure
NEON code keeps working afterwards, this patch changes all these vector
type casts into bitcasts.

Co-authored-by: Momchil Velikov
---
 clang/include/clang/Basic/TargetBuiltins.h    |     4 +
 clang/include/clang/Basic/arm_neon.td         |    68 +-
 clang/lib/CodeGen/CodeGenFunction.h           |     8 +-
 clang/lib/CodeGen/TargetBuiltins/ARM.cpp      |   102 +-
 .../CodeGen/AArch64/bf16-dotprod-intrinsics.c |   384 +-
 .../CodeGen/AArch64/bf16-getset-intrinsics.c  |    30 +-
 .../AArch64/bf16-reinterpret-intrinsics.c     |   452 +-
 .../fp8-intrinsics/acle_neon_fp8_cvt.c        |    44 +-
 .../fp8-intrinsics/acle_neon_fp8_fdot.c       |    84 +-
 .../fp8-intrinsics/acle_neon_fp8_fmla.c       |    84 +-
 .../acle_neon_fp8_reinterpret.c               |   158 +-
 clang/test/CodeGen/AArch64/neon-2velem.c      |  1826 +-
 clang/test/CodeGen/AArch64/neon-extract.c     |   373 +-
 clang/test/CodeGen/AArch64/neon-fma.c         |   146 +-
 clang/test/CodeGen/AArch64/neon-fp16fml.c     |  1426 +-
 .../AArch64/neon-intrinsics-constrained.c     |  1862 +-
 clang/test/CodeGen/AArch64/neon-intrinsics.c  | 26255 ++++++++-----
 .../CodeGen/AArch64/neon-ldst-one-rcpc3.c     |    40 +-
 clang/test/CodeGen/AArch64/neon-ldst-one.c    |  8535 ++---
 .../CodeGen/AArch64/neon-misc-constrained.c   |   111 +-
 clang/test/CodeGen/AArch64/neon-misc.c        |  4130 +-
 clang/test/CodeGen/AArch64/neon-perm.c        |  2877 +-
 .../neon-scalar-x-indexed-elem-constrained.c  |   308 +-
 .../AArch64/neon-scalar-x-indexed-elem.c      |   653 +-
 clang/test/CodeGen/AArch64/neon-vcmla.c       |  1314 +-
 clang/test/CodeGen/AArch64/poly-add.c         |     2 +-
 clang/test/CodeGen/AArch64/poly128.c          |    56 +-
 clang/test/CodeGen/AArch64/poly64.c           |   781 +-
 .../CodeGen/AArch64/v8.1a-neon-intrinsics.c   |    98 +-
 .../v8.2a-neon-intrinsics-constrained.c       |   902 +-
 .../AArch64/v8.2a-neon-intrinsics-generic.c   |   288 +-
 .../CodeGen/AArch64/v8.2a-neon-intrinsics.c   |  1184 +-
 .../AArch64/v8.5a-neon-frint3264-intrinsic.c  |   251 +-
 .../CodeGen/AArch64/v8.6a-neon-intrinsics.c   |   232 +-
 .../CodeGen/arm-bf16-dotprod-intrinsics.c     |   386 +-
 .../test/CodeGen/arm-bf16-getset-intrinsics.c |    32 +-
 .../test/CodeGen/arm-neon-directed-rounding.c |   347 +-
 clang/test/CodeGen/arm-neon-fma.c             |    66 +-
 clang/test/CodeGen/arm-neon-numeric-maxmin.c  |    62 +-
 clang/test/CodeGen/arm-neon-vcvtX.c           |   114 +-
 clang/test/CodeGen/arm-neon-vst.c             |  4138 +-
 clang/test/CodeGen/arm64-vrnd-constrained.c   |   219 +-
 clang/test/CodeGen/arm64-vrnd.c               |   121 +-
 clang/test/CodeGen/arm64_vcreate.c            |    21 +-
 clang/test/CodeGen/arm64_vdupq_n_f64.c        |    96 +-
 clang/test/CodeGen/arm_neon_intrinsics.c      | 31749 ++++++++++------
 clang/utils/TableGen/NeonEmitter.cpp          |    28 +-
 .../v8.2a-neon-intrinsics-constrained.ll      |   276 +
 48 files changed, 56329 insertions(+), 36394 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll

diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 4781054240b5b..c1ba65064f159 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -263,6 +263,10 @@ namespace clang { EltType ET = getEltType(); return ET == Poly8 || ET == Poly16 || ET == Poly64; } + bool isFloatingPoint() const { + EltType ET = getEltType(); + return ET == Float16 || ET == Float32 || ET == Float64 || ET == BFloat16; + } bool isUnsigned() const { return (Flags & UnsignedFlag) != 0; } bool isQuad() const { return (Flags & QuadFlag) != 0; } unsigned getEltSizeInBits() const { diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 3e73dd054933f..ab0051efe5159 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -31,8 +31,8 @@ def OP_MLAL : Op<(op "+", $p0, (call "vmull", $p1, $p2))>; def OP_MULLHi : Op<(call "vmull", (call "vget_high", $p0), (call "vget_high", $p1))>; def OP_MULLHi_P64 : Op<(call "vmull", - (cast "poly64_t", (call "vget_high", $p0)), - (cast "poly64_t", (call "vget_high", $p1)))>; + (bitcast "poly64_t", (call "vget_high", $p0)), + (bitcast "poly64_t", (call "vget_high", $p1)))>; def OP_MULLHi_N : Op<(call "vmull_n", (call "vget_high", $p0), $p1)>; def OP_MLALHi : Op<(call "vmlal", $p0, (call "vget_high", $p1), (call "vget_high", $p2))>; @@ -95,11 +95,11 @@ def OP_TRN2 : Op<(shuffle $p0, $p1, (interleave def OP_ZIP2 : Op<(shuffle $p0, $p1, (highhalf (interleave mask0, mask1)))>; def OP_UZP2 : Op<(shuffle $p0, $p1, (add (decimate (rotl mask0, 1), 2), (decimate (rotl mask1, 1), 2)))>; -def OP_EQ : Op<(cast "R", (op "==", $p0, $p1))>; -def OP_GE : Op<(cast "R", (op ">=", $p0, $p1))>; -def OP_LE : Op<(cast "R", (op "<=", $p0, $p1))>; -def OP_GT : Op<(cast "R", (op ">", $p0, $p1))>; -def OP_LT : Op<(cast "R", (op "<", $p0, $p1))>; +def OP_EQ : Op<(bitcast "R", (op "==", $p0, $p1))>; +def OP_GE : Op<(bitcast "R", (op ">=", $p0, $p1))>; +def OP_LE : Op<(bitcast "R", (op "<=", $p0, $p1))>; +def OP_GT : Op<(bitcast "R", (op ">", $p0, $p1))>; +def OP_LT : Op<(bitcast "R", (op "<", $p0, $p1))>; def OP_NEG : Op<(op "-", $p0)>; def OP_NOT : Op<(op "~", $p0)>; def OP_AND : Op<(op "&", $p0, $p1)>; @@ -108,20 +108,20 @@ def OP_XOR : Op<(op "^", $p0, $p1)>; def OP_ANDN : Op<(op "&", $p0, (op "~", $p1))>; def OP_ORN : Op<(op "|", $p0, (op "~", $p1))>; def OP_CAST : LOp<[(save_temp $promote, $p0), - (cast "R", $promote)]>; + (bitcast "R", $promote)]>; def OP_HI : Op<(shuffle $p0, $p0, (highhalf mask0))>; def OP_LO : Op<(shuffle $p0, $p0, (lowhalf mask0))>; def OP_CONC : Op<(shuffle $p0, $p1, (add mask0, mask1))>; def OP_DUP : Op<(dup $p0)>; def OP_DUP_LN : Op<(call_mangled "splat_lane", $p0, $p1)>; -def OP_SEL : Op<(cast "R", (op "|", - (op "&", $p0, (cast $p0, $p1)), - (op "&", (op "~", $p0), (cast $p0, $p2))))>; +def OP_SEL : Op<(bitcast "R", (op "|", + (op "&", $p0, (bitcast $p0, $p1)), + (op "&", (op "~", $p0), (bitcast $p0, $p2))))>; def OP_REV16 : Op<(shuffle $p0, $p0, (rev 16, mask0))>; def OP_REV32 : Op<(shuffle $p0, $p0, (rev 32, mask0))>; def OP_REV64 : Op<(shuffle $p0, $p0, (rev 64, mask0))>; def OP_XTN : Op<(call "vcombine", $p0, (call "vmovn", $p1))>; -def OP_SQXTUN : Op<(call "vcombine", (cast $p0, "U", $p0), +def OP_SQXTUN : Op<(call "vcombine", (bitcast $p0, "U", $p0), (call "vqmovun", $p1))>; def OP_QXTN : Op<(call "vcombine", $p0, (call "vqmovn", $p1))>; def OP_VCVT_NA_HI_F16 : Op<(call "vcombine", $p0, (call "vcvt_f16_f32", $p1))>; @@ -129,12 +129,12 @@ def OP_VCVT_NA_HI_F32 : Op<(call "vcombine", $p0, (call "vcvt_f32_f64", $p1))>; def OP_VCVT_EX_HI_F32 : Op<(call "vcvt_f32_f16", (call "vget_high", 
$p0))>; def OP_VCVT_EX_HI_F64 : Op<(call "vcvt_f64_f32", (call "vget_high", $p0))>; def OP_VCVTX_HI : Op<(call "vcombine", $p0, (call "vcvtx_f32", $p1))>; -def OP_REINT : Op<(cast "R", $p0)>; +def OP_REINT : Op<(bitcast "R", $p0)>; def OP_ADDHNHi : Op<(call "vcombine", $p0, (call "vaddhn", $p1, $p2))>; def OP_RADDHNHi : Op<(call "vcombine", $p0, (call "vraddhn", $p1, $p2))>; def OP_SUBHNHi : Op<(call "vcombine", $p0, (call "vsubhn", $p1, $p2))>; def OP_RSUBHNHi : Op<(call "vcombine", $p0, (call "vrsubhn", $p1, $p2))>; -def OP_ABDL : Op<(cast "R", (call "vmovl", (cast $p0, "U", +def OP_ABDL : Op<(bitcast "R", (call "vmovl", (bitcast $p0, "U", (call "vabd", $p0, $p1))))>; def OP_ABDLHi : Op<(call "vabdl", (call "vget_high", $p0), (call "vget_high", $p1))>; @@ -152,15 +152,15 @@ def OP_QDMLSLHi : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1), (call "vget_high", $p2))>; def OP_QDMLSLHi_N : Op<(call "vqdmlsl_n", $p0, (call "vget_high", $p1), $p2)>; def OP_DIV : Op<(op "/", $p0, $p1)>; -def OP_LONG_HI : Op<(cast "R", (call (name_replace "_high_", "_"), +def OP_LONG_HI : Op<(bitcast "R", (call (name_replace "_high_", "_"), (call "vget_high", $p0), $p1))>; -def OP_NARROW_HI : Op<(cast "R", (call "vcombine", - (cast "R", "H", $p0), - (cast "R", "H", +def OP_NARROW_HI : Op<(bitcast "R", (call "vcombine", + (bitcast "R", "H", $p0), + (bitcast "R", "H", (call (name_replace "_high_", "_"), $p1, $p2))))>; def OP_MOVL_HI : LOp<[(save_temp $a1, (call "vget_high", $p0)), - (cast "R", + (bitcast "R", (call "vshll_n", $a1, (literal "int32_t", "0")))]>; def OP_COPY_LN : Op<(call "vset_lane", (call "vget_lane", $p2, $p3), $p0, $p1)>; def OP_SCALAR_MUL_LN : Op<(op "*", $p0, (call "vget_lane", $p1, $p2))>; @@ -221,18 +221,18 @@ def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1, def OP_USDOT_LN : Op<(call "vusdot", $p0, $p1, - (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)))>; + (bitcast "8", "S", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)))>; def OP_USDOT_LNQ : Op<(call "vusdot", $p0, $p1, - (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)))>; + (bitcast "8", "S", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)))>; // sudot splats the second vector and then calls vusdot def OP_SUDOT_LN : Op<(call "vusdot", $p0, - (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>; + (bitcast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>; def OP_SUDOT_LNQ : Op<(call "vusdot", $p0, - (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>; + (bitcast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>; def OP_BFDOT_LN : Op<(call "vbfdot", $p0, $p1, @@ -263,7 +263,7 @@ def OP_VCVT_BF16_F32_A32 : Op<(call "__a32_vcvt_bf16", $p0)>; def OP_VCVT_BF16_F32_LO_A32 - : Op<(call "vcombine", (cast "bfloat16x4_t", (literal "uint64_t", "0ULL")), + : Op<(call "vcombine", (bitcast "bfloat16x4_t", (literal "uint64_t", "0ULL")), (call "__a32_vcvt_bf16", $p0))>; def OP_VCVT_BF16_F32_HI_A32 : Op<(call "vcombine", (call "__a32_vcvt_bf16", $p1), @@ -924,12 +924,12 @@ def CFMLE : SOpInst<"vcle", "U..", "lUldQdQlQUl", OP_LE>; def CFMGT : SOpInst<"vcgt", "U..", "lUldQdQlQUl", OP_GT>; def CFMLT : SOpInst<"vclt", "U..", "lUldQdQlQUl", OP_LT>; -def CMEQ : SInst<"vceqz", "U.", +def CMEQ : SInst<"vceqz", "U(.!)", "csilfUcUsUiUlPcPlQcQsQiQlQfQUcQUsQUiQUlQPcdQdQPl">; -def CMGE : SInst<"vcgez", "U.", "csilfdQcQsQiQlQfQd">; -def CMLE : 
SInst<"vclez", "U.", "csilfdQcQsQiQlQfQd">; -def CMGT : SInst<"vcgtz", "U.", "csilfdQcQsQiQlQfQd">; -def CMLT : SInst<"vcltz", "U.", "csilfdQcQsQiQlQfQd">; +def CMGE : SInst<"vcgez", "U(.!)", "csilfdQcQsQiQlQfQd">; +def CMLE : SInst<"vclez", "U(.!)", "csilfdQcQsQiQlQfQd">; +def CMGT : SInst<"vcgtz", "U(.!)", "csilfdQcQsQiQlQfQd">; +def CMLT : SInst<"vcltz", "U(.!)", "csilfdQcQsQiQlQfQd">; //////////////////////////////////////////////////////////////////////////////// // Max/Min Integer @@ -1667,11 +1667,11 @@ let TargetGuard = "fullfp16,neon" in { // ARMv8.2-A FP16 one-operand vector intrinsics. // Comparison - def CMEQH : SInst<"vceqz", "U.", "hQh">; - def CMGEH : SInst<"vcgez", "U.", "hQh">; - def CMGTH : SInst<"vcgtz", "U.", "hQh">; - def CMLEH : SInst<"vclez", "U.", "hQh">; - def CMLTH : SInst<"vcltz", "U.", "hQh">; + def CMEQH : SInst<"vceqz", "U(.!)", "hQh">; + def CMGEH : SInst<"vcgez", "U(.!)", "hQh">; + def CMGTH : SInst<"vcgtz", "U(.!)", "hQh">; + def CMLEH : SInst<"vclez", "U(.!)", "hQh">; + def CMLTH : SInst<"vcltz", "U(.!)", "hQh">; // Vector conversion def VCVT_F16 : SInst<"vcvt_f16", "F(.!)", "sUsQsQUs">; diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index ca00a0e8c6cf4..dd73d3b3a75f3 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4694,10 +4694,10 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitTargetBuiltinExpr(unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue); - llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty, - const llvm::CmpInst::Predicate Fp, - const llvm::CmpInst::Predicate Ip, - const llvm::Twine &Name = ""); + llvm::Value * + EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty, + const llvm::CmpInst::Predicate Pred, + const llvm::Twine &Name = ""); llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue, llvm::Triple::ArchType Arch); diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 63b3d17f97f85..afe25b5418424 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -1750,8 +1750,9 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( // Determine the type of this overloaded NEON intrinsic. NeonTypeFlags Type(NeonTypeConst->getZExtValue()); - bool Usgn = Type.isUnsigned(); - bool Quad = Type.isQuad(); + const bool Usgn = Type.isUnsigned(); + const bool Quad = Type.isQuad(); + const bool Floating = Type.isFloatingPoint(); const bool HasLegalHalfType = getTarget().hasLegalHalfType(); const bool AllowBFloatArgsAndRet = getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); @@ -1852,24 +1853,28 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( } case NEON::BI__builtin_neon_vceqz_v: case NEON::BI__builtin_neon_vceqzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ, - ICmpInst::ICMP_EQ, "vceqz"); + return EmitAArch64CompareBuiltinExpr( + Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz"); case NEON::BI__builtin_neon_vcgez_v: case NEON::BI__builtin_neon_vcgezq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE, - ICmpInst::ICMP_SGE, "vcgez"); + return EmitAArch64CompareBuiltinExpr( + Ops[0], Ty, Floating ? 
ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE, + "vcgez"); case NEON::BI__builtin_neon_vclez_v: case NEON::BI__builtin_neon_vclezq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE, - ICmpInst::ICMP_SLE, "vclez"); + return EmitAArch64CompareBuiltinExpr( + Ops[0], Ty, Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE, + "vclez"); case NEON::BI__builtin_neon_vcgtz_v: case NEON::BI__builtin_neon_vcgtzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT, - ICmpInst::ICMP_SGT, "vcgtz"); + return EmitAArch64CompareBuiltinExpr( + Ops[0], Ty, Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT, + "vcgtz"); case NEON::BI__builtin_neon_vcltz_v: case NEON::BI__builtin_neon_vcltzq_v: - return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT, - ICmpInst::ICMP_SLT, "vcltz"); + return EmitAArch64CompareBuiltinExpr( + Ops[0], Ty, Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT, + "vcltz"); case NEON::BI__builtin_neon_vclz_v: case NEON::BI__builtin_neon_vclzq_v: // We generate target-independent intrinsic, which needs a second argument @@ -2432,28 +2437,32 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( return Builder.CreateBitCast(Result, ResultType, NameHint); } -Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr( - Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp, - const CmpInst::Predicate Ip, const Twine &Name) { - llvm::Type *OTy = Op->getType(); - - // FIXME: this is utterly horrific. We should not be looking at previous - // codegen context to find out what needs doing. Unfortunately TableGen - // currently gives us exactly the same calls for vceqz_f32 and vceqz_s32 - // (etc). - if (BitCastInst *BI = dyn_cast(Op)) - OTy = BI->getOperand(0)->getType(); - - Op = Builder.CreateBitCast(Op, OTy); - if (OTy->getScalarType()->isFloatingPointTy()) { - if (Fp == CmpInst::FCMP_OEQ) - Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy)); +Value * +CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty, + const CmpInst::Predicate Pred, + const Twine &Name) { + + if (isa(Ty)) { + // Vector types are cast to i8 vectors. Recover original type. 
+ Op = Builder.CreateBitCast(Op, Ty); + } + + if (CmpInst::isFPPredicate(Pred)) { + if (Pred == CmpInst::FCMP_OEQ) + Op = Builder.CreateFCmp(Pred, Op, Constant::getNullValue(Op->getType())); else - Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy)); + Op = Builder.CreateFCmpS(Pred, Op, Constant::getNullValue(Op->getType())); } else { - Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy)); + Op = Builder.CreateICmp(Pred, Op, Constant::getNullValue(Op->getType())); } - return Builder.CreateSExt(Op, Ty, Name); + + llvm::Type *ResTy = Ty; + if (auto *VTy = dyn_cast(Ty)) + ResTy = FixedVectorType::get( + IntegerType::get(getLLVMContext(), VTy->getScalarSizeInBits()), + VTy->getNumElements()); + + return Builder.CreateSExt(Op, ResTy, Name); } static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef Ops, @@ -5955,45 +5964,66 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateFAdd(Op0, Op1, "vpaddd"); } case NEON::BI__builtin_neon_vceqzd_s64: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType(getContext())), + ICmpInst::ICMP_EQ, "vceqz"); case NEON::BI__builtin_neon_vceqzd_f64: case NEON::BI__builtin_neon_vceqzs_f32: case NEON::BI__builtin_neon_vceqzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), - ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz"); + ICmpInst::FCMP_OEQ, "vceqz"); case NEON::BI__builtin_neon_vcgezd_s64: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType(getContext())), + ICmpInst::ICMP_SGE, "vcgez"); case NEON::BI__builtin_neon_vcgezd_f64: case NEON::BI__builtin_neon_vcgezs_f32: case NEON::BI__builtin_neon_vcgezh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), - ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez"); + ICmpInst::FCMP_OGE, "vcgez"); case NEON::BI__builtin_neon_vclezd_s64: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType(getContext())), + ICmpInst::ICMP_SLE, "vclez"); case NEON::BI__builtin_neon_vclezd_f64: case NEON::BI__builtin_neon_vclezs_f32: case NEON::BI__builtin_neon_vclezh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), - ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez"); + ICmpInst::FCMP_OLE, "vclez"); case NEON::BI__builtin_neon_vcgtzd_s64: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType(getContext())), + ICmpInst::ICMP_SGT, "vcgtz"); case NEON::BI__builtin_neon_vcgtzd_f64: case NEON::BI__builtin_neon_vcgtzs_f32: case NEON::BI__builtin_neon_vcgtzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), - ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz"); + ICmpInst::FCMP_OGT, "vcgtz"); case NEON::BI__builtin_neon_vcltzd_s64: + Ops.push_back(EmitScalarExpr(E->getArg(0))); + return EmitAArch64CompareBuiltinExpr( + Ops[0], ConvertType(E->getCallReturnType(getContext())), + ICmpInst::ICMP_SLT, "vcltz"); + case NEON::BI__builtin_neon_vcltzd_f64: case NEON::BI__builtin_neon_vcltzs_f32: case 
NEON::BI__builtin_neon_vcltzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), - ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz"); + ICmpInst::FCMP_OLT, "vcltz"); case NEON::BI__builtin_neon_vceqzd_u64: { Ops.push_back(EmitScalarExpr(E->getArg(0))); diff --git a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c index 877d83c0fa395..2097495b3baee 100644 --- a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ -// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -8,10 +8,16 @@ // CHECK-LABEL: @test_vbfdot_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[VBFDOT_I]], <4 x bfloat> [[VBFDOT1_I]], <4 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { @@ -20,10 +26,16 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { // CHECK-LABEL: @test_vbfdotq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <16 x i8> 
[[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[VBFDOT_I]], <8 x bfloat> [[VBFDOT1_I]], <8 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ @@ -32,19 +44,24 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ // CHECK-LABEL: @test_vbfdot_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_128:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-NEXT: [[__REINT1_128:%.*]] = alloca <2 x float>, align 8 -// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_128]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_128]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer -// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_128]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_128]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[R:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[VBFDOT_I]], <4 x bfloat> [[VBFDOT1_I]], <4 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ @@ -53,19 +70,24 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ // CHECK-LABEL: @test_vbfdotq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_130:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: [[__REINT1_130:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store 
<8 x bfloat> [[B:%.*]], ptr [[__REINT_130]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_130]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> -// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_130]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_130]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP4]], <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[VBFDOT_I]], <8 x bfloat> [[VBFDOT1_I]], <8 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -74,19 +96,24 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b // CHECK-LABEL: @test_vbfdot_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_132:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: [[__REINT1_132:%.*]] = alloca <2 x float>, align 8 -// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_132]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_132]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> -// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_132]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_132]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> 
[[TMP3]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP4]], <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[R:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[VBFDOT_I]], <4 x bfloat> [[VBFDOT1_I]], <4 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { @@ -95,19 +122,24 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) // CHECK-LABEL: @test_vbfdotq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_126:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-NEXT: [[__REINT1_126:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_126]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_126]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer -// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_126]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_126]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32> 
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[VBFDOT_I]], <8 x bfloat> [[VBFDOT1_I]], <8 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { @@ -116,12 +148,20 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) // CHECK-LABEL: @test_vbfmmlaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMMLAQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMMLAQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMMLAQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[VBFMMLAQ_F32_I]], <8 x bfloat> [[VBFMMLAQ_F321_I]], <8 x bfloat> [[VBFMMLAQ_F322_I]]) // CHECK-NEXT: [[VBFMMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMMLAQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); @@ -129,12 +169,20 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALBQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[VBFMLALBQ_F32_I]], <8 x bfloat> [[VBFMLALBQ_F321_I]], <8 x bfloat> [[VBFMLALBQ_F322_I]]) // CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALBQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); @@ -142,12 +190,20 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlaltq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALTQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALTQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[VBFMLALTQ_F32_I]], <8 x bfloat> [[VBFMLALTQ_F321_I]], <8 x bfloat> [[VBFMLALTQ_F322_I]]) // CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALTQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); @@ -157,26 +213,34 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> 
[[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGET_LANE10]], i32 2 +// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGET_LANE16]], i32 3 +// CHECK-NEXT: [[VGET_LANE22:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> [[VECINIT18]], bfloat [[VGET_LANE22]], i32 4 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGET_LANE28]], i32 5 +// CHECK-NEXT: [[VGET_LANE34:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT36:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE34]], i32 6 +// CHECK-NEXT: [[VGET_LANE40:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGET_LANE40]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALBQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x 
float> [[VBFMLALBQ_F32_I]], <8 x bfloat> [[VBFMLALBQ_F321_I]], <8 x bfloat> [[VBFMLALBQ_F322_I]]) // CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALBQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); @@ -186,26 +250,34 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-NEXT: entry: // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2 -// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4 -// CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5 +// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE4]], i32 1 +// CHECK-NEXT: [[VGETQ_LANE10:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGETQ_LANE10]], i32 2 +// CHECK-NEXT: [[VGETQ_LANE16:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGETQ_LANE16]], i32 3 +// CHECK-NEXT: [[VGETQ_LANE22:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> [[VECINIT18]], bfloat [[VGETQ_LANE22]], i32 4 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6 -// CHECK-NEXT: [[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGETQ_LANE28]], i32 5 +// CHECK-NEXT: [[VGETQ_LANE34:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT36:%.*]] = 
insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE34]], i32 6 +// CHECK-NEXT: [[VGETQ_LANE40:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGETQ_LANE40]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALBQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[VBFMLALBQ_F32_I]], <8 x bfloat> [[VBFMLALBQ_F321_I]], <8 x bfloat> [[VBFMLALBQ_F322_I]]) // CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALBQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); @@ -215,26 +287,34 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGET_LANE10]], i32 2 +// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGET_LANE16]], i32 3 +// CHECK-NEXT: [[VGET_LANE22:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> 
[[VECINIT18]], bfloat [[VGET_LANE22]], i32 4 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGET_LANE28]], i32 5 +// CHECK-NEXT: [[VGET_LANE34:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT36:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE34]], i32 6 +// CHECK-NEXT: [[VGET_LANE40:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGET_LANE40]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALTQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALTQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[VBFMLALTQ_F32_I]], <8 x bfloat> [[VBFMLALTQ_F321_I]], <8 x bfloat> [[VBFMLALTQ_F322_I]]) // CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALTQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); @@ -244,26 +324,34 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-NEXT: entry: // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2 -// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = 
extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4 -// CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5 +// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE4]], i32 1 +// CHECK-NEXT: [[VGETQ_LANE10:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGETQ_LANE10]], i32 2 +// CHECK-NEXT: [[VGETQ_LANE16:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGETQ_LANE16]], i32 3 +// CHECK-NEXT: [[VGETQ_LANE22:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> [[VECINIT18]], bfloat [[VGETQ_LANE22]], i32 4 // CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6 -// CHECK-NEXT: [[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGETQ_LANE28]], i32 5 +// CHECK-NEXT: [[VGETQ_LANE34:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT36:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE34]], i32 6 +// CHECK-NEXT: [[VGETQ_LANE40:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGETQ_LANE40]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALTQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALTQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[VBFMLALTQ_F32_I]], <8 x bfloat> [[VBFMLALTQ_F321_I]], <8 x bfloat> [[VBFMLALTQ_F322_I]]) // CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALTQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x 
float> [[TMP7]] // float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_laneq_f32(r, a, b, 3); diff --git a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c index 9da2cd5af3221..d54e56697f8b8 100644 --- a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ -// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -45,9 +45,10 @@ bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { // CHECK-LABEL: @test_vdup_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x bfloat> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP2]], <4 x i32> // CHECK-NEXT: ret <4 x bfloat> [[LANE]] // bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { @@ -56,9 +57,10 @@ bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vdupq_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x bfloat> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP2]], <8 x i32> // CHECK-NEXT: ret <8 x bfloat> [[LANE]] // bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { @@ -67,9 +69,10 @@ bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vdup_laneq_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x bfloat> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> // CHECK-NEXT: ret <4 x bfloat> [[LANE]] // bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { @@ -78,9 +81,10 @@ bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { // CHECK-LABEL: @test_vdupq_laneq_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x 
bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x bfloat> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <8 x i32> // CHECK-NEXT: ret <8 x bfloat> [[LANE]] // bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) { diff --git a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c index 2b271ac88462b..88f2305e2782c 100644 --- a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c @@ -1,333 +1,413 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck %s // REQUIRES: aarch64-registered-target #include <arm_neon.h> -// CHECK-LABEL: @test_vreinterpret_bf16_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat> -// CHECK-NEXT: ret <4 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x bfloat> +// CHECK-NEXT: ret <4 x bfloat> [[TMP1]] // bfloat16x4_t test_vreinterpret_bf16_s8(int8x8_t a) { return vreinterpret_bf16_s8(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_s16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_s16(int16x4_t a) { return vreinterpret_bf16_s16(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_s32(int32x2_t a) { return vreinterpret_bf16_s32(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <4 x bfloat> -// CHECK-NEXT: ret <4 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x bfloat> +// CHECK-NEXT: ret <4 x bfloat> [[TMP1]] // bfloat16x4_t test_vreinterpret_bf16_f32(float32x2_t a) { return vreinterpret_bf16_f32(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <8 x i8> [[A:%.*]] to <4 x bfloat> -// CHECK-NEXT: ret <4 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x bfloat> +// CHECK-NEXT: ret <4 x bfloat> [[TMP1]] // bfloat16x4_t test_vreinterpret_bf16_u8(uint8x8_t a) { return vreinterpret_bf16_u8(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_u16(uint16x4_t a) { return vreinterpret_bf16_u16(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_u32(uint32x2_t a) { return vreinterpret_bf16_u32(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_p8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat> -// CHECK-NEXT: ret <4 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x bfloat> +// CHECK-NEXT: ret <4 x bfloat> [[TMP1]] // bfloat16x4_t test_vreinterpret_bf16_p8(poly8x8_t a) { return vreinterpret_bf16_p8(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_p16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_p16(poly16x4_t a) { return vreinterpret_bf16_p16(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_u64(uint64x1_t a) { return vreinterpret_bf16_u64(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> 
[[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_s64(int64x1_t a) { return vreinterpret_bf16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat> -// CHECK-NEXT: ret <8 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x bfloat> +// CHECK-NEXT: ret <8 x bfloat> [[TMP1]] // bfloat16x8_t test_vreinterpretq_bf16_s8(int8x16_t a) { return vreinterpretq_bf16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_s16(int16x8_t a) { return vreinterpretq_bf16_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_s32(int32x4_t a) { return vreinterpretq_bf16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <8 x bfloat> -// CHECK-NEXT: ret <8 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x bfloat> +// CHECK-NEXT: ret <8 x bfloat> [[TMP1]] // bfloat16x8_t test_vreinterpretq_bf16_f32(float32x4_t a) { return vreinterpretq_bf16_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat> -// CHECK-NEXT: ret <8 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x bfloat> +// CHECK-NEXT: ret <8 x bfloat> [[TMP1]] // bfloat16x8_t test_vreinterpretq_bf16_u8(uint8x16_t a) { return vreinterpretq_bf16_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t 
test_vreinterpretq_bf16_u16(uint16x8_t a) { return vreinterpretq_bf16_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_u32(uint32x4_t a) { return vreinterpretq_bf16_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat> -// CHECK-NEXT: ret <8 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x bfloat> +// CHECK-NEXT: ret <8 x bfloat> [[TMP1]] // bfloat16x8_t test_vreinterpretq_bf16_p8(poly8x16_t a) { return vreinterpretq_bf16_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_p16(poly16x8_t a) { return vreinterpretq_bf16_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_u64(uint64x2_t a) { return vreinterpretq_bf16_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_s64(int64x2_t a) { return vreinterpretq_bf16_s64(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_p64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_p64(poly64x1_t a) { return vreinterpret_bf16_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p64( +// CHECK-SAME: <2 x i64> noundef 
[[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_p64(poly64x2_t a) { return vreinterpretq_bf16_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p128( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p128( +// CHECK-SAME: i128 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_p128(poly128_t a) { return vreinterpretq_bf16_p128(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <4 x bfloat> -// CHECK-NEXT: ret <4 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <4 x bfloat> +// CHECK-NEXT: ret <4 x bfloat> [[TMP1]] // bfloat16x4_t test_vreinterpret_bf16_f64(float64x1_t a) { return vreinterpret_bf16_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <8 x bfloat> -// CHECK-NEXT: ret <8 x bfloat> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x bfloat> +// CHECK-NEXT: ret <8 x bfloat> [[TMP1]] // bfloat16x8_t test_vreinterpretq_bf16_f64(float64x2_t a) { return vreinterpretq_bf16_f64(a); } -// CHECK-LABEL: @test_vreinterpret_s8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] // int8x8_t test_vreinterpret_s8_bf16(bfloat16x4_t a) { return vreinterpret_s8_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_s16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> // CHECK-NEXT: ret <4 x i16> [[TMP0]] // int16x4_t test_vreinterpret_s16_bf16(bfloat16x4_t a) { return vreinterpret_s16_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_s32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x i32> +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_bf16( +// CHECK-SAME: <4 x bfloat> noundef 
[[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32> // CHECK-NEXT: ret <2 x i32> [[TMP0]] // int32x2_t test_vreinterpret_s32_bf16(bfloat16x4_t a) { return vreinterpret_s32_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_f32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x float> -// CHECK-NEXT: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] // float32x2_t test_vreinterpret_f32_bf16(bfloat16x4_t a) { return vreinterpret_f32_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] // uint8x8_t test_vreinterpret_u8_bf16(bfloat16x4_t a) { return vreinterpret_u8_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> // CHECK-NEXT: ret <4 x i16> [[TMP0]] // uint16x4_t test_vreinterpret_u16_bf16(bfloat16x4_t a) { return vreinterpret_u16_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x i32> +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32> // CHECK-NEXT: ret <2 x i32> [[TMP0]] // uint32x2_t test_vreinterpret_u32_bf16(bfloat16x4_t a) { return vreinterpret_u32_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] // poly8x8_t test_vreinterpret_p8_bf16(bfloat16x4_t a) { return vreinterpret_p8_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_p16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> // CHECK-NEXT: ret <4 x i16> [[TMP0]] // 
poly16x4_t test_vreinterpret_p16_bf16(bfloat16x4_t a) { return vreinterpret_p16_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64> -// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // uint64x1_t test_vreinterpret_u64_bf16(bfloat16x4_t a) { return vreinterpret_u64_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_s64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64> -// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // int64x1_t test_vreinterpret_s64_bf16(bfloat16x4_t a) { return vreinterpret_s64_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_p64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64> -// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // poly64x1_t test_vreinterpret_p64_bf16(bfloat16x4_t a) { return vreinterpret_p64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] // int8x16_t test_vreinterpretq_s8_bf16(bfloat16x8_t a) { return vreinterpretq_s8_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vreinterpretq_s16_bf16(bfloat16x8_t a) { return vreinterpretq_s16_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x i32> +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // int32x4_t test_vreinterpretq_s32_bf16(bfloat16x8_t a) { return vreinterpretq_s32_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x float> -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] // float32x4_t test_vreinterpretq_f32_bf16(bfloat16x8_t a) { return vreinterpretq_f32_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] // uint8x16_t test_vreinterpretq_u8_bf16(bfloat16x8_t a) { return vreinterpretq_u8_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[TMP0]] // uint16x8_t test_vreinterpretq_u16_bf16(bfloat16x8_t a) { return vreinterpretq_u16_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x i32> +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vreinterpretq_u32_bf16(bfloat16x8_t a) { return vreinterpretq_u32_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] // poly8x16_t test_vreinterpretq_p8_bf16(bfloat16x8_t a) { return vreinterpretq_p8_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> 
[[TMP0]] // poly16x8_t test_vreinterpretq_p16_bf16(bfloat16x8_t a) { return vreinterpretq_p16_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[TMP0]] // uint64x2_t test_vreinterpretq_u64_bf16(bfloat16x8_t a) { return vreinterpretq_u64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[TMP0]] // int64x2_t test_vreinterpretq_s64_bf16(bfloat16x8_t a) { return vreinterpretq_s64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[TMP0]] // poly64x2_t test_vreinterpretq_p64_bf16(bfloat16x8_t a) { return vreinterpretq_p64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p128_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to i128 +// CHECK-LABEL: define dso_local i128 @test_vreinterpretq_p128_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to i128 // CHECK-NEXT: ret i128 [[TMP0]] // poly128_t test_vreinterpretq_p128_bf16(bfloat16x8_t a) { return vreinterpretq_p128_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] // float64x1_t test_vreinterpret_f64_bf16(bfloat16x4_t a) { return vreinterpret_f64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x double> -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] // float64x2_t test_vreinterpretq_f64_bf16(bfloat16x8_t a) { return vreinterpretq_f64_bf16(a); 
} diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c index 4305b840f2a05..9913c54567719 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s @@ -274,19 +274,27 @@ mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn, // CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm( // CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) +// CHECK-NEXT: [[VFCVTN_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VFCVTN1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VFCVTN_I]], <4 x half> [[VFCVTN1_I]]) // CHECK-NEXT: ret <8 x i8> [[VFCVTN2_I]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m( // CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8> -// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <4 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <4 x i16> +// CHECK-CXX-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) +// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-CXX-NEXT: [[VFCVTN1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VFCVTN_I]], <4 x half> [[VFCVTN1_I]]) // CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN2_I]] // mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) { @@ -296,19 +304,27 @@ mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) { // CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm( // CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]]) +// CHECK-NEXT: [[VFCVTN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VFCVTN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VFCVTN_I]], <8 x half> [[VFCVTN1_I]]) // CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m( // CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8> -// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]]) +// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-CXX-NEXT: [[VFCVTN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VFCVTN_I]], <8 x half> [[VFCVTN1_I]]) // CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]] // mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) { diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c index 4d2f5d550c4dc..44db59df6c1c4 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: 
%clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -O3 -Werror -Wall -S -o /dev/null %s @@ -12,17 +12,21 @@ // CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16( // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]]) +// CHECK-NEXT: [[FDOT2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[FDOT2_I]], <8 x i8> [[VN]], <8 x i8> [[VM]]) // CHECK-NEXT: ret <4 x half> [[FDOT21_I]] // // CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z13test_vdot_f1613__Float16x4_t13__Mfloat8x8_tS0_m( // CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <4 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-CXX-NEXT: [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]]) +// CHECK-CXX-NEXT: [[FDOT2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-CXX-NEXT: [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[FDOT2_I]], <8 x i8> [[VN]], <8 x i8> [[VM]]) // CHECK-CXX-NEXT: ret <4 x half> [[FDOT21_I]] // float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) { @@ -32,17 +36,21 @@ float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 
noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-NEXT: [[FDOT2_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[FDOT2_I]], <16 x i8> [[VN]], <16 x i8> [[VM]]) // CHECK-NEXT: ret <8 x half> [[FDOT21_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z14test_vdotq_f1613__Float16x8_t14__Mfloat8x16_tS0_m( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-CXX-NEXT: [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-CXX-NEXT: [[FDOT2_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-CXX-NEXT: [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[FDOT2_I]], <16 x i8> [[VN]], <16 x i8> [[VM]]) // CHECK-CXX-NEXT: ret <8 x half> [[FDOT21_I]] // float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) { @@ -52,21 +60,23 @@ float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm // CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16( // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3) +// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP2]], i32 3) // CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]] // // CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z18test_vdot_lane_f1613__Float16x4_t13__Mfloat8x8_tS0_m( // CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x 
half> [[VD]] to <8 x i8> -// CHECK-CXX-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <4 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-CXX-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3) +// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP2]], i32 3) // CHECK-CXX-NEXT: ret <4 x half> [[FDOT2_LANE1]] // float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) { @@ -76,18 +86,20 @@ float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, f // CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16( // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> // CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7) // CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]] // // CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z19test_vdot_laneq_f1613__Float16x4_t13__Mfloat8x8_t14__Mfloat8x16_tm( // CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <4 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> // CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7) // CHECK-CXX-NEXT: ret <4 x half> [[FDOT2_LANE1]] // @@ -98,21 +110,23 @@ float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> 
poison, <8 x i8> [[VM]], i64 0) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3) +// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP2]], i32 3) // CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z19test_vdotq_lane_f1613__Float16x8_t14__Mfloat8x16_t13__Mfloat8x8_tm( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> -// CHECK-CXX-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3) +// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP2]], i32 3) // CHECK-CXX-NEXT: ret <8 x half> [[FDOT2_LANE1]] // float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) { @@ -122,18 +136,20 @@ float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) // CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z20test_vdotq_laneq_f1613__Float16x8_t14__Mfloat8x16_tS0_m( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> 
[[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) -// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) // CHECK-CXX-NEXT: ret <8 x half> [[FDOT2_LANE1]] // diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c index 736538073cb39..d4f074a92b05b 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -S -o /dev/null %s @@ -11,17 +11,21 @@ // CHECK-LABEL: define dso_local <8 x half> @test_vmlalb( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-NEXT: [[VMLAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VMLAL_I]], <16 x i8> [[VN]], <16 x i8> [[VM]]) // CHECK-NEXT: ret <8 x half> [[VMLAL1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalb13__Float16x8_t14__Mfloat8x16_tS0_m( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) 
#[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-CXX-NEXT: [[VMLAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-CXX-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VMLAL_I]], <16 x i8> [[VN]], <16 x i8> [[VM]]) // CHECK-CXX-NEXT: ret <8 x half> [[VMLAL1_I]] // float16x8_t test_vmlalb(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) { @@ -31,17 +35,21 @@ float16x8_t test_vmlalb(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t // CHECK-LABEL: define dso_local <8 x half> @test_vmlalt( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-NEXT: [[VMLAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VMLAL_I]], <16 x i8> [[VN]], <16 x i8> [[VM]]) // CHECK-NEXT: ret <8 x half> [[VMLAL1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalt13__Float16x8_t14__Mfloat8x16_tS0_m( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-CXX-NEXT: [[VMLAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-CXX-NEXT: [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VMLAL_I]], <16 x i8> [[VN]], <16 x i8> [[VM]]) // CHECK-CXX-NEXT: ret <8 x half> [[VMLAL1_I]] // float16x8_t test_vmlalt(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) { @@ -123,21 +131,23 @@ float32x4_t test_vmlalltt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_ // CHECK-LABEL: define dso_local <8 x half> @test_vmlalb_lane( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x 
i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 0) +// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP2]], i32 0) // CHECK-NEXT: ret <8 x half> [[VMLAL_LANE1]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z16test_vmlalb_lane13__Float16x8_t14__Mfloat8x16_t13__Mfloat8x8_tm( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> -// CHECK-CXX-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-CXX-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 0) +// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-CXX-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP2]], i32 0) // CHECK-CXX-NEXT: ret <8 x half> [[VMLAL_LANE1]] // float16x8_t test_vmlalb_lane(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) { @@ -147,18 +157,20 @@ float16x8_t test_vmlalb_lane(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fp // CHECK-LABEL: define dso_local <8 x half> @test_vmlalb_laneq( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 0) // CHECK-NEXT: ret <8 x half> [[VMLAL_LANE1]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z17test_vmlalb_laneq13__Float16x8_t14__Mfloat8x16_tS0_m( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> +// 
CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-CXX-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 0) // CHECK-CXX-NEXT: ret <8 x half> [[VMLAL_LANE1]] // @@ -169,21 +181,23 @@ float16x8_t test_vmlalb_laneq(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, // CHECK-LABEL: define dso_local <8 x half> @test_vmlalt_lane( // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 7) +// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP2]], i32 7) // CHECK-NEXT: ret <8 x half> [[VMLAL_LANE1]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z16test_vmlalt_lane13__Float16x8_t14__Mfloat8x16_t13__Mfloat8x8_tm( // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8> -// CHECK-CXX-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-CXX-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0) // CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-CXX-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 7) +// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-CXX-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP2]], i32 7) // CHECK-CXX-NEXT: ret <8 x half> [[VMLAL_LANE1]] // float16x8_t test_vmlalt_lane(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) { @@ -193,18 +207,20 @@ float16x8_t test_vmlalt_lane(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fp // 
CHECK-LABEL: define dso_local <8 x half> @test_vmlalt_laneq(
// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
// CHECK-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 15)
// CHECK-NEXT: ret <8 x half> [[VMLAL_LANE1]]
//
// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z17test_vmlalt_laneq13__Float16x8_t14__Mfloat8x16_tS0_m(
// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
// CHECK-CXX-NEXT: [[ENTRY:.*:]]
-// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <8 x i16>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-CXX-NEXT: [[VMLAL_LANE:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
// CHECK-CXX-NEXT: [[VMLAL_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VMLAL_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 15)
// CHECK-CXX-NEXT: ret <8 x half> [[VMLAL_LANE1]]
//
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_reinterpret.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_reinterpret.c
index 201d4dbbe34ad..adf5fceb9ceb9 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_reinterpret.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_reinterpret.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
#include <arm_neon.h>
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s -check-prefix CHECK-CXX
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -S -o /dev/null %s
@@ -23,14 +23,16 @@ poly8x8_t test_vreinterpret_p8_mf8(mfloat8x8_t v) {
// CHECK-LABEL: 
define dso_local <1 x i64> @test_vreinterpret_p64_mf8( // CHECK-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x i64> -// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // // CHECK-CXX-LABEL: define dso_local noundef <1 x i64> @_Z25test_vreinterpret_p64_mf813__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x i64> -// CHECK-CXX-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-CXX-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-CXX-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // poly64x1_t test_vreinterpret_p64_mf8(mfloat8x8_t v) { return vreinterpret_p64_mf8(v); @@ -182,14 +184,16 @@ int8x16_t test_vreinterpretq_s8_mf8(mfloat8x16_t v) { // CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_mf8( // CHECK-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <2 x double> -// CHECK-NEXT: ret <2 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local noundef <2 x double> @_Z26test_vreinterpretq_f64_mf814__Mfloat8x16_t( // CHECK-CXX-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <2 x double> -// CHECK-CXX-NEXT: ret <2 x double> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <2 x i64> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-CXX-NEXT: ret <2 x double> [[TMP1]] // float64x2_t test_vreinterpretq_f64_mf8(mfloat8x16_t v) { return vreinterpretq_f64_mf8(v); @@ -197,14 +201,16 @@ float64x2_t test_vreinterpretq_f64_mf8(mfloat8x16_t v) { // CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_mf8( // CHECK-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <4 x float> -// CHECK-NEXT: ret <4 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z26test_vreinterpretq_f32_mf814__Mfloat8x16_t( // CHECK-CXX-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <4 x float> -// CHECK-CXX-NEXT: ret <4 x float> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <4 x i32> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-CXX-NEXT: ret <4 x float> [[TMP1]] // float32x4_t test_vreinterpretq_f32_mf8(mfloat8x16_t v) { return vreinterpretq_f32_mf8(v); @@ -212,14 +218,16 @@ float32x4_t test_vreinterpretq_f32_mf8(mfloat8x16_t v) { // CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_mf8( // CHECK-SAME: <16 x i8> 
[[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <8 x half> -// CHECK-NEXT: ret <8 x half> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vreinterpretq_f16_mf814__Mfloat8x16_t( // CHECK-CXX-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <8 x half> -// CHECK-CXX-NEXT: ret <8 x half> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[V]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half> +// CHECK-CXX-NEXT: ret <8 x half> [[TMP1]] // float16x8_t test_vreinterpretq_f16_mf8(mfloat8x16_t v) { return vreinterpretq_f16_mf8(v); @@ -386,14 +394,16 @@ mfloat8x16_t test_vreinterpretq_mf8_s8(int8x16_t v) { // CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_mf8_f64( // CHECK-SAME: <2 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V]] to <16 x i8> -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vreinterpretq_mf8_f6413__Float64x2_t( // CHECK-CXX-SAME: <2 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V]] to <16 x i8> -// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V]] to <2 x i64> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP1]] // mfloat8x16_t test_vreinterpretq_mf8_f64(float64x2_t v) { return vreinterpretq_mf8_f64(v); @@ -401,14 +411,16 @@ mfloat8x16_t test_vreinterpretq_mf8_f64(float64x2_t v) { // CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_mf8_f32( // CHECK-SAME: <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vreinterpretq_mf8_f3213__Float32x4_t( // CHECK-CXX-SAME: <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP1]] // mfloat8x16_t test_vreinterpretq_mf8_f32(float32x4_t v) { return vreinterpretq_mf8_f32(v); @@ -416,14 +428,16 @@ mfloat8x16_t test_vreinterpretq_mf8_f32(float32x4_t v) { // CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_mf8_f16( // CHECK-SAME: <8 x half> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[V]] to <16 x i8> -// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <8 x half> [[V]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vreinterpretq_mf8_f1613__Float16x8_t( // CHECK-CXX-SAME: <8 x half> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[V]] to <16 x i8> -// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[V]] to <8 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP1]] // mfloat8x16_t test_vreinterpretq_mf8_f16(float16x8_t v) { return vreinterpretq_mf8_f16(v); @@ -519,14 +533,16 @@ uint32x2_t test_vreinterpret_u32_mf8(mfloat8x8_t v) { // CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_mf8( // CHECK-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x i64> -// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // // CHECK-CXX-LABEL: define dso_local noundef <1 x i64> @_Z25test_vreinterpret_u64_mf813__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x i64> -// CHECK-CXX-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-CXX-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-CXX-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // uint64x1_t test_vreinterpret_u64_mf8(mfloat8x8_t v) { return vreinterpret_u64_mf8(v); @@ -562,14 +578,18 @@ int8x8_t test_vreinterpret_s8_mf8(mfloat8x8_t v) { // CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_mf8( // CHECK-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local noundef <1 x double> @_Z25test_vreinterpret_f64_mf813__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x double> -// CHECK-CXX-NEXT: ret <1 x double> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-CXX-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-CXX-NEXT: ret <1 x double> [[TMP1]] // float64x1_t test_vreinterpret_f64_mf8(mfloat8x8_t v) { return vreinterpret_f64_mf8(v); @@ -577,14 +597,16 @@ float64x1_t test_vreinterpret_f64_mf8(mfloat8x8_t v) { // CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_mf8( // CHECK-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // 
CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <2 x float> -// CHECK-NEXT: ret <2 x float> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z25test_vreinterpret_f32_mf813__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <2 x float> -// CHECK-CXX-NEXT: ret <2 x float> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <2 x i32> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-CXX-NEXT: ret <2 x float> [[TMP1]] // float32x2_t test_vreinterpret_f32_mf8(mfloat8x8_t v) { return vreinterpret_f32_mf8(v); @@ -592,14 +614,16 @@ float32x2_t test_vreinterpret_f32_mf8(mfloat8x8_t v) { // CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_mf8( // CHECK-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <4 x half> -// CHECK-NEXT: ret <4 x half> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z25test_vreinterpret_f16_mf813__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <4 x half> -// CHECK-CXX-NEXT: ret <4 x half> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <4 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half> +// CHECK-CXX-NEXT: ret <4 x half> [[TMP1]] // float16x4_t test_vreinterpret_f16_mf8(mfloat8x8_t v) { return vreinterpret_f16_mf8(v); @@ -622,14 +646,16 @@ int32x2_t test_vreinterpret_s32_mf8(mfloat8x8_t v) { // CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_mf8( // CHECK-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x i64> -// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // // CHECK-CXX-LABEL: define dso_local noundef <1 x i64> @_Z25test_vreinterpret_s64_mf813__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to <1 x i64> -// CHECK-CXX-NEXT: ret <1 x i64> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[V]] to i64 +// CHECK-CXX-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-CXX-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] // int64x1_t test_vreinterpret_s64_mf8(mfloat8x8_t v) { return vreinterpret_s64_mf8(v); @@ -751,14 +777,18 @@ mfloat8x8_t test_vreinterpret_mf8_s8(int8x8_t v) { // CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_mf8_f64( // CHECK-SAME: <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V]] to <8 x i8> -// CHECK-NEXT: ret <8 x i8> 
[[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z25test_vreinterpret_mf8_f6413__Float64x1_t( // CHECK-CXX-SAME: <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V]] to <8 x i8> -// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V]] to i64 +// CHECK-CXX-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-CXX-NEXT: ret <8 x i8> [[TMP1]] // mfloat8x8_t test_vreinterpret_mf8_f64(float64x1_t v) { return vreinterpret_mf8_f64(v); @@ -766,14 +796,16 @@ mfloat8x8_t test_vreinterpret_mf8_f64(float64x1_t v) { // CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_mf8_f32( // CHECK-SAME: <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z25test_vreinterpret_mf8_f3213__Float32x2_t( // CHECK-CXX-SAME: <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-CXX-NEXT: ret <8 x i8> [[TMP1]] // mfloat8x8_t test_vreinterpret_mf8_f32(float32x2_t v) { return vreinterpret_mf8_f32(v); @@ -781,14 +813,16 @@ mfloat8x8_t test_vreinterpret_mf8_f32(float32x2_t v) { // CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_mf8_f16( // CHECK-SAME: <4 x half> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[V]] to <8 x i8> -// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[V]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z25test_vreinterpret_mf8_f1613__Float16x4_t( // CHECK-CXX-SAME: <4 x half> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[V]] to <8 x i8> -// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[V]] to <4 x i16> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-CXX-NEXT: ret <8 x i8> [[TMP1]] // mfloat8x8_t test_vreinterpret_mf8_f16(float16x4_t v) { return vreinterpret_mf8_f16(v); diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c b/clang/test/CodeGen/AArch64/neon-2velem.c index 75bdeb92fd9ca..2bc7212cde9f8 100644 --- a/clang/test/CodeGen/AArch64/neon-2velem.c +++ b/clang/test/CodeGen/AArch64/neon-2velem.c @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s

// REQUIRES: aarch64-registered-target || arm-registered-target

@@ -407,13 +407,16 @@ uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {

// CHECK-LABEL: @test_vfma_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
@@ -423,13 +426,16 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfmaq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
@@ -439,15 +445,18 @@ float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfma_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
//
float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 3);
@@ -455,15 +464,18 @@ float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {

// CHECK-LABEL: @test_vfmaq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 3);
@@ -471,14 +483,17 @@ float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {

// CHECK-LABEL: @test_vfms_lane_f32(
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
@@ -488,14 +503,17 @@ float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfmsq_lane_f32(
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
@@ -505,16 +523,19 @@ float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfms_laneq_f32(
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]])
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
//
float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 3);
@@ -522,16 +543,19 @@ float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {

// CHECK-LABEL: @test_vfmsq_laneq_f32(
// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]])
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 3);
@@ -539,13 +563,17 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {

// CHECK-LABEL: @test_vfmaq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to i64
+// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
// CHECK-NEXT: ret <2 x double> [[FMLA2]]
//
@@ -555,15 +583,18 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {

// CHECK-LABEL: @test_vfmaq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x
i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]]) +// CHECK-NEXT: ret <2 x double> [[TMP9]] // float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmaq_laneq_f64(a, b, v, 1); @@ -571,14 +602,18 @@ float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { // CHECK-LABEL: @test_vfmsq_lane_f64( // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to i64 +// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]]) // CHECK-NEXT: ret <2 x double> [[FMLA2]] // @@ -588,16 +623,19 @@ float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { // CHECK-LABEL: @test_vfmsq_laneq_f64( // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> -// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) -// 
CHECK-NEXT: ret <2 x double> [[TMP6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]]) +// CHECK-NEXT: ret <2 x double> [[TMP9]] // float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmsq_laneq_f64(a, b, v, 1); @@ -653,7 +691,9 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -668,7 +708,9 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -683,7 +725,9 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -698,7 +742,9 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t 
a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -714,7 +760,9 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -730,7 +778,9 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -746,7 +796,9 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -762,7 +814,9 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -777,7 +831,9 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -792,7 +848,9 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -807,7 +865,9 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -822,7 +882,9 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] 
to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -838,7 +900,9 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -854,7 +918,9 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -870,7 +936,9 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -886,7 +954,9 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 
x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -901,7 +971,9 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -916,7 +988,9 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -931,7 +1005,9 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -946,7 +1022,9 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -962,7 +1040,9 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] 
= shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -978,7 +1058,9 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -994,7 +1076,9 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -1010,7 +1094,9 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -1025,7 +1111,9 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x 
i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1040,7 +1128,9 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1055,7 +1145,9 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1070,7 +1162,9 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1086,7 +1180,9 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: 
[[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1102,7 +1198,9 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1118,7 +1216,9 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1134,7 +1234,9 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1149,7 +1251,9 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // 
CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { @@ -1163,7 +1267,9 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { @@ -1177,7 +1283,9 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -1191,7 +1299,9 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -1206,7 +1316,9 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { @@ -1221,7 +1333,9 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { @@ -1236,7 +1350,9 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { @@ -1251,7 +1367,9 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { @@ -1265,7 +1383,9 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { @@ -1279,7 +1399,9 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x 
i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { @@ -1293,7 +1415,9 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -1307,7 +1431,9 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -1322,7 +1448,9 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { @@ -1337,7 +1465,9 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> 
[[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { @@ -1352,7 +1482,9 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { @@ -1367,7 +1499,9 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { @@ -1382,8 +1516,11 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -1398,8 +1535,11 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) 
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1415,8 +1555,11 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1432,8 +1575,11 @@ int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1448,8 +1594,11 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -1464,8 +1613,11 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1481,8 +1633,11 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1498,8 +1653,11 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1513,9 +1671,12 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
   return vqdmull_lane_s16(a, v, 3);
@@ -1528,9 +1689,12 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
   return vqdmull_lane_s32(a, v, 1);
@@ -1543,9 +1707,12 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
   return vqdmull_laneq_s16(a, v, 3);
@@ -1558,9 +1725,12 @@ int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
   return vqdmull_laneq_s32(a, v, 3);
@@ -1574,9 +1744,12 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
   return vqdmull_high_lane_s16(a, v, 3);
@@ -1590,9 +1763,12 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
   return vqdmull_high_lane_s32(a, v, 1);
@@ -1606,9 +1782,12 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
   return vqdmull_high_laneq_s16(a, v, 7);
@@ -1622,9 +1801,12 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
   return vqdmull_high_laneq_s32(a, v, 3);
@@ -1736,9 +1918,10 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vmul_lane_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x float> [[MUL]]
 //
@@ -1749,14 +1932,18 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
 // CHECK-LABEL: @test_vmul_lane_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT:
[[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 -// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] -// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to i64 +// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to double +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP5]], i32 0 +// CHECK-NEXT: [[TMP6:%.*]] = fmul double [[TMP4]], [[EXTRACT]] +// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP7]] // float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { return vmul_lane_f64(a, v, 0); @@ -1764,9 +1951,10 @@ float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { // CHECK-LABEL: @test_vmulq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // @@ -1776,9 +1964,11 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP2]], <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // @@ -1788,9 +1978,10 @@ float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { // CHECK-LABEL: @test_vmul_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> 
[[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // @@ -1800,14 +1991,17 @@ float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { // CHECK-LABEL: @test_vmul_laneq_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] -// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to double +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 +// CHECK-NEXT: [[TMP6:%.*]] = fmul double [[TMP4]], [[EXTRACT]] +// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP7]] // float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) { return vmul_laneq_f64(a, v, 1); @@ -1815,9 +2009,10 @@ float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) { // CHECK-LABEL: @test_vmulq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // @@ -1827,9 +2022,10 @@ float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x 
double> [[TMP2]], <2 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // @@ -1839,12 +2035,17 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { // CHECK-LABEL: @test_vmulx_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { @@ -1853,12 +2054,17 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulxq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { @@ -1867,12 +2073,18 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { 
// CHECK-LABEL: @test_vmulxq_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[LANE]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { @@ -1881,12 +2093,17 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { // CHECK-LABEL: @test_vmulx_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { @@ -1895,12 +2112,17 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f32( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { @@ -1909,12 +2131,17 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP2]], <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[LANE]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { @@ -2323,13 +2550,16 @@ uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) { // CHECK-LABEL: @test_vfma_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]]) // CHECK-NEXT: ret <2 x float> [[FMLA2]] // @@ -2339,13 +2569,16 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { // CHECK-LABEL: @test_vfmaq_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]]) // CHECK-NEXT: ret <4 x float> [[FMLA2]] // @@ -2355,15 +2588,18 @@ float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { // CHECK-LABEL: @test_vfma_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[TMP5:%.*]] = 
bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) -// CHECK-NEXT: ret <2 x float> [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]]) +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vfma_laneq_f32(a, b, v, 0); @@ -2371,15 +2607,18 @@ float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { // CHECK-LABEL: @test_vfmaq_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) -// CHECK-NEXT: ret <4 x float> [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]]) +// CHECK-NEXT: ret <4 x float> [[TMP9]] // float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmaq_laneq_f32(a, b, v, 0); @@ -2387,14 +2626,17 @@ float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) // CHECK-LABEL: @test_vfms_lane_f32_0( // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> 
[[A:%.*]] to <2 x i32> // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> // CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]]) // CHECK-NEXT: ret <2 x float> [[FMLA2]] // @@ -2404,14 +2646,17 @@ float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { // CHECK-LABEL: @test_vfmsq_lane_f32_0( // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]]) // CHECK-NEXT: ret <4 x float> [[FMLA2]] // @@ -2421,16 +2666,19 @@ float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { // CHECK-LABEL: @test_vfms_laneq_f32_0( // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x 
float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]]) -// CHECK-NEXT: ret <2 x float> [[TMP6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP7]], <2 x float> [[TMP6]]) +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vfms_laneq_f32(a, b, v, 0); @@ -2438,16 +2686,19 @@ float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { // CHECK-LABEL: @test_vfmsq_laneq_f32_0( // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]]) -// CHECK-NEXT: ret <4 x float> [[TMP6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP8]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP7]], <4 x float> [[TMP6]]) +// CHECK-NEXT: ret <4 x float> [[TMP9]] // 
float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vfmsq_laneq_f32(a, b, v, 0); @@ -2455,15 +2706,18 @@ float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) // CHECK-LABEL: @test_vfmaq_laneq_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) -// CHECK-NEXT: ret <2 x double> [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]]) +// CHECK-NEXT: ret <2 x double> [[TMP9]] // float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmaq_laneq_f64(a, b, v, 0); @@ -2471,16 +2725,19 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) // CHECK-LABEL: @test_vfmsq_laneq_f64_0( // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64> // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]]) -// CHECK-NEXT: ret <2 x double> [[TMP6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> 
[[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP8]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP7]], <2 x double> [[TMP6]]) +// CHECK-NEXT: ret <2 x double> [[TMP9]] // float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { return vfmsq_laneq_f64(a, b, v, 0); @@ -2493,7 +2750,9 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2508,7 +2767,9 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2523,7 +2784,9 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2538,7 +2801,9 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 
x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2554,7 +2819,9 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2570,7 +2837,9 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2586,7 +2855,9 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2602,7 +2873,9 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// 
CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2617,7 +2890,9 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2632,7 +2907,9 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2647,7 +2924,9 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2662,7 +2941,9 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> 
[[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2678,7 +2959,9 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2694,7 +2977,9 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2710,7 +2995,9 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2726,7 +3013,9 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2741,7 +3030,9 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2756,7 +3047,9 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2771,7 +3064,9 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2786,7 +3081,9 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2802,7 +3099,9 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2818,7 +3117,9 @@ int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2834,7 +3135,9 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -2850,7 +3153,9 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -2865,7 +3170,9 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2880,7 +3187,9 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2895,7 +3204,9 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2910,7 +3221,9 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2926,7 +3239,9 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2942,7 +3257,9 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2958,7 +3275,9 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -2974,7 +3293,9 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -2989,7 +3310,9 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -3003,7 +3326,9 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -3017,7 +3342,9 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
@@ -3031,7 +3358,9 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
@@ -3046,7 +3375,9 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -3061,7 +3392,9 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -3076,7 +3409,9 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
@@ -3091,7 +3426,9 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
@@ -3105,7 +3442,9 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -3119,7 +3458,9 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -3133,7 +3474,9 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
@@ -3147,7 +3490,9 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
@@ -3162,7 +3507,9 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -3177,7 +3524,9 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -3192,7 +3541,9 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
 //
 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
@@ -3207,7 +3558,9 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
 //
 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
@@ -3222,8 +3575,11 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3238,8 +3594,11 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3255,8 +3614,11 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
 //
 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3272,8 +3634,11 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
 //
 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3288,8 +3653,11 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3304,8 +3672,11 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3321,8 +3692,11 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
 //
 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3338,8 +3712,11 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]])
 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
 //
 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3353,9 +3730,12 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
   return vqdmull_lane_s16(a, v, 0);
@@ -3368,9 +3748,12 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
   return vqdmull_lane_s32(a, v, 0);
@@ -3383,9 +3766,12 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
   return vqdmull_laneq_s16(a, v, 0);
@@ -3398,9 +3784,12 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
   return vqdmull_laneq_s32(a, v, 0);
@@ -3414,9 +3803,12 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
   return vqdmull_high_lane_s16(a, v, 0);
@@ -3430,9 +3822,12 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
   return vqdmull_high_lane_s32(a, v, 0);
@@ -3446,9 +3841,12 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
 //
 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
   return vqdmull_high_laneq_s16(a, v, 0);
@@ -3462,9 +3860,12 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
 //
 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
   return vqdmull_high_laneq_s32(a, v, 0);
@@ -3576,9 +3977,10 @@ int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
 // CHECK-LABEL: @test_vmul_lane_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x float> [[MUL]]
 //
@@ -3588,9 +3990,10 @@ float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
 // CHECK-LABEL: @test_vmulq_lane_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x float> [[MUL]]
 //
@@ -3600,9 +4003,10 @@ float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
 // CHECK-LABEL: @test_vmul_laneq_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x float> [[MUL]]
 //
@@ -3612,14 +4016,17 @@ float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
 // CHECK-LABEL: @test_vmul_laneq_f64_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to double
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT: [[TMP6:%.*]] = fmul double [[TMP4]], [[EXTRACT]]
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP7]]
 //
 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
   return vmul_laneq_f64(a, v, 0);
@@ -3627,9 +4034,10 @@ float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> zeroinitializer
 // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x float> [[MUL]]
 //
@@ -3639,9 +4047,10 @@ float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP2]], <2 x i32> zeroinitializer
 // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x double> [[MUL]]
 //
@@ -3651,12 +4060,17 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
 // CHECK-LABEL: @test_vmulx_lane_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]])
 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
@@ -3665,12 +4079,17 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]])
 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
@@ -3679,12 +4098,18 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[LANE]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]])
 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
@@ -3693,12 +4118,17 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]])
 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
@@ -3707,12 +4137,17 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]])
 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
@@ -3721,12 +4156,17 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[A:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x double> [[LANE]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]])
 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
@@ -3742,7 +4182,9 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
 //
 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
@@ -3756,7 +4198,9 @@ int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
 //
 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
@@ -3772,7 +4216,9 @@ int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]])
 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
 //
 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
@@ -3786,7 +4232,9 @@ uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]])
 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
 //
 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
@@ -3802,9 +4250,12 @@ uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
 //
 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
   return vqdmull_high_n_s16(a, b);
@@ -3817,9 +4268,12 @@ int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]])
 // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to
<16 x i8> -// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { return vqdmull_high_n_s32(a, b); @@ -3834,7 +4288,9 @@ int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -3849,7 +4305,9 @@ int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -3866,7 +4324,9 @@ int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -3881,7 +4341,9 @@ uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -3899,8 +4361,11 @@ uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]] // int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { @@ -3915,8 +4380,11 @@ int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]] // int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { @@ -3932,7 +4400,9 @@ int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -3947,7 
+4417,9 @@ int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -3964,7 +4436,9 @@ int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -3979,7 +4453,9 @@ uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -3997,8 +4473,11 @@ uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]] // int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { @@ -4013,8 +4492,11 @@ int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]] // int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) { @@ -4060,11 +4542,17 @@ float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) -// CHECK-NEXT: ret <2 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]]) +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { return vfma_n_f32(a, b, n); @@ -4073,11 +4561,20 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { // CHECK-LABEL: @test_vfma_n_f64( // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) -// CHECK-NEXT: ret <1 x double> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to i64 +// CHECK-NEXT: [[__P1_ADDR_I2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to i64 +// CHECK-NEXT: [[__P2_ADDR_I3_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I2_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I3_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]]) +// CHECK-NEXT: ret <1 x double> [[TMP9]] // float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) { return vfma_n_f64(a, b, n); @@ -4089,11 +4586,17 @@ float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) +// CHECK-NEXT: ret <4 x float> [[TMP9]] // float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { return vfmaq_n_f32(a, b, n); @@ -4104,11 +4607,17 @@ float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]] // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0 // 
CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) -// CHECK-NEXT: ret <2 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]]) +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { return vfms_n_f32(a, b, n); @@ -4118,11 +4627,20 @@ float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) { // CHECK-NEXT: entry: // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]] // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) -// CHECK-NEXT: ret <1 x double> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to i64 +// CHECK-NEXT: [[__P1_ADDR_I2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to i64 +// CHECK-NEXT: [[__P2_ADDR_I3_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I2_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I3_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]]) +// CHECK-NEXT: ret <1 x double> [[TMP9]] // float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) { return vfms_n_f64(a, b, n); @@ -4135,11 +4653,17 @@ float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) { // CHECK-NEXT: 
[[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) +// CHECK-NEXT: ret <4 x float> [[TMP9]] // float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { return vfmsq_n_f32(a, b, n); @@ -4261,7 +4785,9 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] // int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { @@ -4274,7 +4800,9 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] // int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { @@ -4289,7 +4817,9 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> 
[[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] // uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { @@ -4302,7 +4832,9 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] // uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { @@ -4317,9 +4849,12 @@ uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]]) // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) { return vqdmull_n_s16(a, b); @@ -4331,9 +4866,12 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]]) // CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> -// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] // int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { return 
vqdmull_n_s32(a, b); @@ -4347,9 +4885,12 @@ int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) // CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] // int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { return vqdmulh_n_s16(a, b); @@ -4367,9 +4908,12 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { // CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> -// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) +// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) // CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { return vqdmulhq_n_s16(a, b); @@ -4381,9 +4925,12 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) // CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] // int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { return vqdmulh_n_s32(a, b); @@ -4397,9 +4944,12 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast 
<4 x i32> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) +// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) // CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { return vqdmulhq_n_s32(a, b); @@ -4413,9 +4963,12 @@ int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) // CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] // int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); @@ -4433,9 +4986,12 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { // CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> -// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) +// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) // CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] // int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); @@ -4447,9 +5003,12 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) // CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] // int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); @@ -4463,9 +5022,12 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) +// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) // CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] // int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); @@ -4595,7 +5157,9 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -4609,7 +5173,9 @@ int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -4625,7 +5191,9 @@ int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 
3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -4639,7 +5207,9 @@ uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -4656,8 +5226,11 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]] // int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { @@ -4671,8 +5244,11 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x 
i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]] // int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { @@ -4803,7 +5379,9 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -4817,7 +5395,9 @@ int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -4833,7 +5413,9 @@ int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -4847,7 +5429,9 @@ uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> 
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -4864,8 +5448,11 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]] // int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { @@ -4879,8 +5466,11 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]] // int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { @@ -4999,8 +5589,11 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = 
bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5015,8 +5608,11 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5032,8 +5628,11 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5049,8 +5648,11 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> 
[[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { @@ -5169,8 +5771,11 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5185,8 +5790,11 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5202,8 +5810,11 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> 
[[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5219,8 +5830,11 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { @@ -5443,8 +6057,11 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5459,8 +6076,11 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5476,8 +6096,11 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5493,8 +6116,11 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { @@ -5613,8 +6239,11 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: 
[[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5629,8 +6258,11 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5646,8 +6278,11 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5663,8 +6298,11 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> // CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> // CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] 
to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { diff --git a/clang/test/CodeGen/AArch64/neon-extract.c b/clang/test/CodeGen/AArch64/neon-extract.c index e5699f813131f..75dba0d93406a 100644 --- a/clang/test/CodeGen/AArch64/neon-extract.c +++ b/clang/test/CodeGen/AArch64/neon-extract.c @@ -1,246 +1,329 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target #include <arm_neon.h> -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_s8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 { -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> -// CHECK: ret <8 x i8> [[VEXT]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vext_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> +// CHECK-NEXT: ret <8 x i8> [[VEXT]] +// int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) { return vext_s8(a, b, 2); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_s16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -// CHECK: ret <4 x i16> [[VEXT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vext_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i16> [[VEXT]] +// int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) { return vext_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} <2 x i32> @test_vext_s32(<2 x i32> noundef %a, <2 x i32> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2> -// CHECK: ret <2 x i32> [[VEXT]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vext_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +//
CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i32> [[VEXT]] +// int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) { return vext_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_s64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[VEXT]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vext_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[VEXT]] +// int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) { return vext_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_s8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 { -// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> -// CHECK: ret <16 x i8> [[VEXT]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> +// CHECK-NEXT: ret <16 x i8> [[VEXT]] +// int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) { return vextq_s8(a, b, 2); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_s16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> -// CHECK: ret <8 x i16> [[VEXT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> +// CHECK-NEXT: ret <8 x i16> [[VEXT]] +// int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) { return vextq_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} <4 x i32> @test_vextq_s32(<4 x i32> noundef %a, <4 x i32> noundef %b) #0 { -// CHECK:
[[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4> -// CHECK: ret <4 x i32> [[VEXT]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vextq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4> +// CHECK-NEXT: ret <4 x i32> [[VEXT]] +// int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) { return vextq_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_s64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2> -// CHECK: ret <2 x i64> [[VEXT]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i64> [[VEXT]] +// int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) { return vextq_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_u8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 { -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> -// CHECK: ret <8 x i8> [[VEXT]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vext_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> +// CHECK-NEXT: ret <8 x i8> [[VEXT]] +// uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) { return vext_u8(a, b, 2); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_u16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -// CHECK: ret <4 x i16> [[VEXT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vext_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x
i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i16> [[VEXT]] +// uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) { return vext_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} <2 x i32> @test_vext_u32(<2 x i32> noundef %a, <2 x i32> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2> -// CHECK: ret <2 x i32> [[VEXT]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vext_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i32> [[VEXT]] +// uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) { return vext_u32(a, b, 1); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_u64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[VEXT]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vext_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[VEXT]] +// uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) { return vext_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_u8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 { -// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> -// CHECK: ret <16 x i8> [[VEXT]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> +// CHECK-NEXT: ret <16 x i8> [[VEXT]] +// uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, 2); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_u16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -//
CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> -// CHECK: ret <8 x i16> [[VEXT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> +// CHECK-NEXT: ret <8 x i16> [[VEXT]] +// uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) { return vextq_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} <4 x i32> @test_vextq_u32(<4 x i32> noundef %a, <4 x i32> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4> -// CHECK: ret <4 x i32> [[VEXT]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vextq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4> +// CHECK-NEXT: ret <4 x i32> [[VEXT]] +// uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) { return vextq_u32(a, b, 1); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_u64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2> -// CHECK: ret <2 x i64> [[VEXT]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x i64> [[VEXT]] +// uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) { return vextq_u64(a, b, 1); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vext_f32(<2 x float> noundef %a, <2 x float> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -//
CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2> -// CHECK: ret <2 x float> [[VEXT]] +// CHECK-LABEL: define dso_local <2 x float> @test_vext_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x float> [[VEXT]] +// float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) { return vext_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} <1 x double> @test_vext_f64(<1 x double> noundef %a, <1 x double> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK: [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x double> [[VEXT]] +// CHECK-LABEL: define dso_local <1 x double> @test_vext_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x double> [[VEXT]] +// float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) { return vext_f64(a, b, 0); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vextq_f32(<4 x float> noundef %a, <4 x float> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4> -// CHECK: ret <4 x float> [[VEXT]] +// CHECK-LABEL: define dso_local <4 x float> @test_vextq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +//
CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 2, i32 3, i32 4> +// CHECK-NEXT: ret <4 x float> [[VEXT]] +// float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) { return vextq_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} <2 x double> @test_vextq_f64(<2 x double> noundef %a, <2 x double> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 2> -// CHECK: ret <2 x double> [[VEXT]] +// CHECK-LABEL: define dso_local <2 x double> @test_vextq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 2> +// CHECK-NEXT: ret <2 x double> [[VEXT]] +// float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) { return vextq_f64(a, b, 1); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_p8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 { -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> -// CHECK: ret <8 x i8> [[VEXT]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vext_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9> +// CHECK-NEXT: ret <8 x i8> [[VEXT]] +// poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) { return vext_p8(a, b, 2); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_p16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> -// CHECK: ret <4 x i16> [[VEXT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vext_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6> +// CHECK-NEXT: ret <4 x i16> [[VEXT]] +// poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) { return vext_p16(a, b, 3); } -//
CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_p8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 { -// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> -// CHECK: ret <16 x i8> [[VEXT]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17> +// CHECK-NEXT: ret <16 x i8> [[VEXT]] +// poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) { return vextq_p8(a, b, 2); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_p16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> -// CHECK: ret <8 x i16> [[VEXT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10> +// CHECK-NEXT: ret <8 x i16> [[VEXT]] +// poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) { return vextq_p16(a, b, 3); } diff --git a/clang/test/CodeGen/AArch64/neon-fma.c b/clang/test/CodeGen/AArch64/neon-fma.c index b87c531b8b231..06531ce0a372b 100644 --- a/clang/test/CodeGen/AArch64/neon-fma.c +++ b/clang/test/CodeGen/AArch64/neon-fma.c @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -64,9 +64,10 @@ float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32_0 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[ADD]] @@ -78,9 +79,10 @@ float32x2_t
test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32_0 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[ADD]] @@ -92,9 +94,10 @@ float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32_0 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[ADD]] @@ -106,9 +109,10 @@ float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32_0 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[ADD]] @@ -120,9 +124,10 @@ float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) // CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32_0 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[SUB]] @@ -134,9 +139,10 @@ float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32_0 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[SUB]] @@ -148,9 +154,10 @@ float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32_0 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[SUB]] @@ -162,9 +169,10 @@ float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32_0 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> 
[[TMP2]], <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[SUB]] @@ -176,9 +184,10 @@ float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) // CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> <i32 1, i32 1> // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[ADD]] @@ -190,9 +199,10 @@ float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[ADD]] @@ -204,9 +214,10 @@ float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> <i32 3, i32 3> // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[ADD]] @@ -218,9 +229,10 @@ float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT:
[[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[ADD]] @@ -232,9 +244,10 @@ float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> <i32 1, i32 1> // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[SUB]] @@ -246,9 +259,10 @@ float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[SUB]] @@ -259,9 +273,10 @@ float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float>
[[TMP2]], <2 x i32> <i32 3, i32 3> // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <2 x float> [[SUB]] @@ -273,9 +288,10 @@ float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[SUB]] @@ -289,11 +305,17 @@ float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) -// CHECK-NEXT: ret <2 x double> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]]) +// CHECK-NEXT: ret <2 x double> [[TMP9]] // float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { return vfmaq_n_f64(a, b, c); @@ -305,11 +327,17 @@ float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[B]] // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
-// CHECK-NEXT: ret <2 x double> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]]) +// CHECK-NEXT: ret <2 x double> [[TMP9]] // float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { return vfmsq_n_f64(a, b, c); diff --git a/clang/test/CodeGen/AArch64/neon-fp16fml.c b/clang/test/CodeGen/AArch64/neon-fp16fml.c index 976045d6e79f3..0f69dbaa0f4d6 100644 --- a/clang/test/CodeGen/AArch64/neon-fp16fml.c +++ b/clang/test/CodeGen/AArch64/neon-fp16fml.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target @@ -12,10 +12,16 @@ // CHECK-LABEL: @test_vfmlal_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VFMLAL_LOW_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VFMLAL_LOW1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_LOW2_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[VFMLAL_LOW_I]], <4 x half> [[VFMLAL_LOW1_I]], <4 x half> [[VFMLAL_LOW2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -24,10 +30,16 @@ float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlsl_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> 
@llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VFMLSL_LOW_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VFMLSL_LOW1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_LOW2_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[VFMLSL_LOW_I]], <4 x half> [[VFMLSL_LOW1_I]], <4 x half> [[VFMLSL_LOW2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -36,10 +48,16 @@ float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlal_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VFMLAL_HIGH_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VFMLAL_HIGH1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_HIGH2_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[VFMLAL_HIGH_I]], <4 x half> [[VFMLAL_HIGH1_I]], <4 x half> [[VFMLAL_HIGH2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -48,10 +66,16 @@ float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlsl_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x 
i8> +// CHECK-NEXT: [[VFMLSL_HIGH_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VFMLSL_HIGH1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_HIGH2_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[VFMLSL_HIGH_I]], <4 x half> [[VFMLSL_HIGH1_I]], <4 x half> [[VFMLSL_HIGH2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] // float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -60,10 +84,16 @@ float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlalq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VFMLAL_LOW_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VFMLAL_LOW1_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_LOW2_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[VFMLAL_LOW_I]], <8 x half> [[VFMLAL_LOW1_I]], <8 x half> [[VFMLAL_LOW2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -72,10 +102,16 @@ float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: @test_vfmlslq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VFMLSL_LOW_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VFMLSL_LOW1_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_LOW2_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[VFMLSL_LOW_I]], <8 x half> [[VFMLSL_LOW1_I]], <8 x half> [[VFMLSL_LOW2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] // float32x4_t 
test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -84,10 +120,16 @@ float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: @test_vfmlalq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VFMLAL_HIGH_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VFMLAL_HIGH1_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_HIGH2_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[VFMLAL_HIGH_I]], <8 x half> [[VFMLAL_HIGH1_I]], <8 x half> [[VFMLAL_HIGH2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -96,10 +138,16 @@ float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: @test_vfmlslq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VFMLSL_HIGH_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VFMLSL_HIGH1_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_HIGH2_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[VFMLSL_HIGH_I]], <8 x half> [[VFMLSL_HIGH1_I]], <8 x half> [[VFMLSL_HIGH2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] // float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -110,42 +158,32 @@ float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: @test_vfmlal_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, 
align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> 
[[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLAL_LOW_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLAL_LOW1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_LOW2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[VFMLAL_LOW_I]], <4 x half> [[VFMLAL_LOW1_I]], <4 x half> [[VFMLAL_LOW2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -154,42 +192,32 @@ float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c // CHECK-LABEL: @test_vfmlal_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// 
CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLAL_HIGH_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLAL_HIGH1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_HIGH2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[VFMLAL_HIGH_I]], <4 x half> [[VFMLAL_HIGH1_I]], <4 x half> [[VFMLAL_HIGH2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -198,74 +226,48 @@ float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t // CHECK-LABEL: @test_vfmlalq_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 -// 
CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], 
i32 6 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE39:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGET_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE49:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGET_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE59:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGET_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE69:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGET_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLAL_LOW_I:%.*]] = bitcast 
<16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLAL_LOW1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_LOW2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[VFMLAL_LOW_I]], <8 x half> [[VFMLAL_LOW1_I]], <8 x half> [[VFMLAL_LOW2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -274,74 +276,48 @@ float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t // CHECK-LABEL: @test_vfmlalq_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// 
CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE39:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 +// 
CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGET_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE49:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGET_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE59:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGET_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE69:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGET_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLAL_HIGH_I:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLAL_HIGH1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_HIGH2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[VFMLAL_HIGH_I]], <8 x half> [[VFMLAL_HIGH1_I]], <8 x half> [[VFMLAL_HIGH2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -350,42 +326,32 @@ float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t // CHECK-LABEL: @test_vfmlal_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 
16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLAL_LOW_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLAL_LOW1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_LOW2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[VFMLAL_LOW_I]], <4 x half> [[VFMLAL_LOW1_I]], <4 x half> [[VFMLAL_LOW2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t 
test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -394,42 +360,32 @@ float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t // CHECK-LABEL: @test_vfmlal_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: 
[[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLAL_HIGH_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLAL_HIGH1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_HIGH2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[VFMLAL_HIGH_I]], <4 x half> [[VFMLAL_HIGH1_I]], <4 x half> [[VFMLAL_HIGH2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -438,74 +394,48 @@ float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t // CHECK-LABEL: @test_vfmlalq_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 
x i16> [[TMP2]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE39:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGETQ_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE49:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGETQ_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE59:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGETQ_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE69:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGETQ_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLAL_LOW_I:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLAL_LOW1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_LOW2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[VFMLAL_LOW_I]], <8 x half> [[VFMLAL_LOW1_I]], <8 x half> [[VFMLAL_LOW2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -514,74 +444,48 @@ float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t // CHECK-LABEL: @test_vfmlalq_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x 
half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = 
extractelement <8 x i16> [[TMP10]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE39:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGETQ_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE49:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGETQ_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE59:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGETQ_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: 
[[TMP14:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE69:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGETQ_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLAL_HIGH_I:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLAL_HIGH1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_HIGH2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[VFMLAL_HIGH_I]], <8 x half> [[VFMLAL_HIGH1_I]], <8 x half> [[VFMLAL_HIGH2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -590,42 +494,32 @@ float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_ // CHECK-LABEL: @test_vfmlsl_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store 
<4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLSL_LOW_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLSL_LOW1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_LOW2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[VFMLSL_LOW_I]], <4 x half> [[VFMLSL_LOW1_I]], <4 x half> [[VFMLSL_LOW2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -634,42 +528,32 @@ float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c // CHECK-LABEL: @test_vfmlsl_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x 
i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> 
[[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLSL_HIGH_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLSL_HIGH1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_HIGH2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[VFMLSL_HIGH_I]], <4 x half> [[VFMLSL_HIGH1_I]], <4 x half> [[VFMLSL_HIGH2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] // float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -678,74 +562,48 @@ float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t // CHECK-LABEL: @test_vfmlslq_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: 
[[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: 
[[TMP8:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE39:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGET_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE49:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGET_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE59:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGET_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE69:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGET_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLSL_LOW_I:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLSL_LOW1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_LOW2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[VFMLSL_LOW_I]], <8 x half> [[VFMLSL_LOW1_I]], <8 x half> [[VFMLSL_LOW2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] // float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -754,74 +612,48 @@ float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t // CHECK-LABEL: @test_vfmlslq_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load 
<4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[C:%.*]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], 
i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGET_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE19:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGET_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE29:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGET_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE39:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGET_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE49:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGET_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE59:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGET_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[VGET_LANE69:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGET_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLSL_HIGH_I:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLSL_HIGH1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_HIGH2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: 
[[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[VFMLSL_HIGH_I]], <8 x half> [[VFMLSL_HIGH1_I]], <8 x half> [[VFMLSL_HIGH2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] // float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -830,42 +662,32 @@ float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t // CHECK-LABEL: @test_vfmlsl_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLSL_LOW_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLSL_LOW1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_LOW2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[VFMLSL_LOW_I]], <4 x half> [[VFMLSL_LOW1_I]], <4 x half> [[VFMLSL_LOW2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -874,42 +696,32 @@ float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t // CHECK-LABEL: @test_vfmlsl_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: 
store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <4 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <4 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT32]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> +// CHECK-NEXT: [[VFMLSL_HIGH_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> +// CHECK-NEXT: [[VFMLSL_HIGH1_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_HIGH2_I:%.*]] = bitcast <8 x i8> [[TMP13]] to <4 x half> +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[VFMLSL_HIGH_I]], <4 x half> [[VFMLSL_HIGH1_I]], <4 x half> [[VFMLSL_HIGH2_I]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] // float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -918,74 +730,48 @@ float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t // CHECK-LABEL: @test_vfmlslq_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca 
i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr 
[[__REINT_85044]], align 16 -// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE39:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGETQ_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE49:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGETQ_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE59:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGETQ_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x 
half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE69:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGETQ_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLSL_LOW_I:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLSL_LOW1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_LOW2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[VFMLSL_LOW_I]], <8 x half> [[VFMLSL_LOW1_I]], <8 x half> [[VFMLSL_LOW2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] // float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -994,74 +780,48 @@ float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t // CHECK-LABEL: @test_vfmlslq_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[C:%.*]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// 
CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE9:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 
[[VGETQ_LANE9]] to half +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE19:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[VGETQ_LANE19]] to half +// CHECK-NEXT: [[VECINIT22:%.*]] = insertelement <8 x half> [[VECINIT12]], half [[TMP5]], i32 2 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE29:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[VGETQ_LANE29]] to half +// CHECK-NEXT: [[VECINIT32:%.*]] = insertelement <8 x half> [[VECINIT22]], half [[TMP7]], i32 3 +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE39:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 +// CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[VGETQ_LANE39]] to half +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x half> [[VECINIT32]], half [[TMP9]], i32 4 +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE49:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7 +// CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[VGETQ_LANE49]] to half +// CHECK-NEXT: [[VECINIT52:%.*]] = insertelement <8 x half> [[VECINIT42]], half [[TMP11]], i32 5 +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE59:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7 +// CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[VGETQ_LANE59]] to half +// CHECK-NEXT: [[VECINIT62:%.*]] = insertelement <8 x half> [[VECINIT52]], half [[TMP13]], i32 6 +// CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[VGETQ_LANE69:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7 +// CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[VGETQ_LANE69]] to half +// CHECK-NEXT: [[VECINIT72:%.*]] = insertelement <8 x half> [[VECINIT62]], half [[TMP15]], i32 7 +// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT72]] to <8 x i16> +// CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP16]] to <16 x i8> +// CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP17]] to <16 x i8> +// CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x i16> [[TMP18]] to <16 x i8> +// CHECK-NEXT: [[VFMLSL_HIGH_I:%.*]] = bitcast <16 x i8> [[TMP19]] to <4 x float> +// CHECK-NEXT: [[VFMLSL_HIGH1_I:%.*]] = bitcast <16 x i8> [[TMP20]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_HIGH2_I:%.*]] = bitcast <16 x i8> [[TMP21]] to <8 x half> +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[VFMLSL_HIGH_I]], <8 x half> [[VFMLSL_HIGH1_I]], <8 x half> [[VFMLSL_HIGH2_I]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] // float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c index 15ae7eea820e8..ba32cfb7f3bae 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c @@ -1,12 +1,13 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ // RUN: -disable-O0-optnone 
\
-// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
+// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck --check-prefixes=UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -disable-O0-optnone \
 // RUN: -ffp-exception-behavior=strict \
-// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
+// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck --check-prefixes=CONSTRAINED %s
 
 // REQUIRES: aarch64-registered-target
 
@@ -14,804 +15,1759 @@ #include <arm_neon.h>
-// COMMON-LABEL: test_vadd_f32
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
-// CONSTRAINED: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vadd_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vadd_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3:[0-9]+]]
+// CONSTRAINED-NEXT: ret <2 x float> [[ADD_I]]
+//
 float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
   return vadd_f32(v1, v2);
 }
 
-// COMMON-LABEL: test_vaddq_f32
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
-// CONSTRAINED: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vaddq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vaddq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[ADD_I]]
+//
 float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
   return vaddq_f32(v1, v2);
 }
 
-// COMMON-LABEL: test_vsub_f32
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
-// CONSTRAINED: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float>
[[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vsub_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vsub_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) { return vsub_f32(v1, v2); } -// COMMON-LABEL: test_vsubq_f32 -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2 -// CONSTRAINED: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vsubq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vsubq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) { return vsubq_f32(v1, v2); } -// COMMON-LABEL: test_vsubq_f64 -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2 -// CONSTRAINED: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vsubq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vsubq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) { return vsubq_f64(v1, v2); } -// COMMON-LABEL: test_vmul_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v1, <2 x float> %v2, metadata 
!"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[MUL_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmul_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[MUL_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmul_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[MUL_I]] +// float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) { return vmul_f32(v1, v2); } -// COMMON-LABEL: test_vmulq_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2 -// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[MUL_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmulq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[MUL_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmulq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[MUL_I]] +// float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) { return vmulq_f32(v1, v2); } -// COMMON-LABEL: test_vmulq_f64 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[MUL_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmulq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[MUL_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmulq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[MUL_I]] +// float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) { return vmulq_f64(v1, v2); } -// COMMON-LABEL: test_vmla_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> 
@llvm.experimental.constrained.fmul.v2f32(<2 x float> %v2, <2 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %v1, <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[ADD_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmla_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmla_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[V1]], <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vmla_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlaq_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v2, <4 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %v1, <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[ADD_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlaq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlaq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> [[V1]], <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[ADD_I]] +// float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vmlaq_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlaq_f64 -// 
UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v2, <2 x double> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]] -// CONSTRAINED: [[ADD_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %v1, <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[ADD_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlaq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlaq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> [[V1]], <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[ADD_I]] +// float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vmlaq_f64(v1, v2, v3); } -// COMMON-LABEL: test_vmls_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v2, <2 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %v1, <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmls_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmls_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> [[V1]], <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t 
test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vmls_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlsq_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v2, <4 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %v1, <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlsq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlsq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> [[V1]], <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vmlsq_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlsq_f64 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v2, <2 x double> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]] -// CONSTRAINED: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %v1, <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlsq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlsq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> [[V1]], <2 x double> [[MUL_I]], 
metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vmlsq_f64(v1, v2, v3); } -// COMMON-LABEL: test_vfma_f32 -// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> -// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1) -// CONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[TMP3]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vfma_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <2 x float> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vfma_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[TMP9]] +// float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vfma_f32(v1, v2, v3); } -// COMMON-LABEL: test_vfmaq_f32 -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> -// UNCONSTRAINED: [[TMP3:%.*]] = call <4 x float> 
@llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1) -// CONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[TMP3]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmaq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmaq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[TMP9]] +// float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vfmaq_f32(v1, v2, v3); } -// COMMON-LABEL: test_vfmaq_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> -// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1) -// CONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[TMP3]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmaq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> 
noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmaq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[TMP9]] +// float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vfmaq_f64(v1, v2, v3); } -// COMMON-LABEL: test_vfms_f32 -// COMMONIR: [[SUB_I:%.*]] = fneg <2 x float> %v2 -// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> -// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1) -// CONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[TMP3]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vfms_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <2 x i32> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32> +// UNCONSTRAINED-NEXT: 
[[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <2 x float> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vfms_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <2 x i32> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[TMP9]] +// float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vfms_f32(v1, v2, v3); } -// COMMON-LABEL: test_vfmsq_f32 -// COMMONIR: [[SUB_I:%.*]] = fneg <4 x float> %v2 -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> -// UNCONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1) -// CONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[TMP3]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmsq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// 
UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmsq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[TMP9]] +// float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vfmsq_f32(v1, v2, v3); } -// COMMON-LABEL: test_vfmsq_f64 -// COMMONIR: [[SUB_I:%.*]] = fneg <2 x double> %v2 -// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> -// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1) -// CONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[TMP3]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmsq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <2 x i64> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x 
double> [[TMP8]], <2 x double> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmsq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <2 x i64> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[TMP9]] +// float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vfmsq_f64(v1, v2, v3); } -// COMMON-LABEL: test_vdivq_f64 -// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2 -// CONSTRAINED: [[DIV_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[DIV_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vdivq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <2 x double> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[DIV_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vdivq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[DIV_I]] +// float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) { return vdivq_f64(v1, v2); } -// COMMON-LABEL: test_vdivq_f32 -// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2 -// CONSTRAINED: [[DIV_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[DIV_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vdivq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[DIV_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vdivq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// 
CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[DIV_I]] +// float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) { return vdivq_f32(v1, v2); } -// COMMON-LABEL: test_vdiv_f32 -// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2 -// CONSTRAINED: [[DIV_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[DIV_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vdiv_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[DIV_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vdiv_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[DIV_I]] +// float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) { return vdiv_f32(v1, v2); } -// COMMON-LABEL: test_vceq_f32 -// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2 -// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"oeq", metadata !"fpexcept.strict") -// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// COMMONIR: ret <2 x i32> [[SEXT_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vceq_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vceq_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) { return vceq_f32(v1, v2); } -// COMMON-LABEL: test_vceq_f64 -// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b -// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oeq", metadata !"fpexcept.strict") -// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// COMMONIR: ret <1 x i64> [[SEXT_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vceq_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// 
UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <1 x double> [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vceq_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) { return vceq_f64(a, b); } -// COMMON-LABEL: test_vceqq_f32 -// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2 -// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"oeq", metadata !"fpexcept.strict") -// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// COMMONIR: ret <4 x i32> [[SEXT_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vceqq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vceqq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) { return vceqq_f32(v1, v2); } -// COMMON-LABEL: test_vceqq_f64 -// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2 -// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"oeq", metadata !"fpexcept.strict") -// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// COMMONIR: ret <2 x i64> [[SEXT_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vceqq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x double> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vceqq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t 
test_vceqq_f64(float64x2_t v1, float64x2_t v2) { return vceqq_f64(v1, v2); } -// COMMON-LABEL: test_vcge_f32 -// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2 -// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"oge", metadata !"fpexcept.strict") -// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// COMMONIR: ret <2 x i32> [[SEXT_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcge_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcge_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) { return vcge_f32(v1, v2); } -// COMMON-LABEL: test_vcge_f64 -// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b -// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oge", metadata !"fpexcept.strict") -// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// COMMONIR: ret <1 x i64> [[SEXT_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcge_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <1 x double> [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcge_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) { return vcge_f64(a, b); } -// COMMON-LABEL: test_vcgeq_f32 -// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2 -// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"oge", metadata !"fpexcept.strict") -// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// COMMONIR: ret <4 x i32> [[SEXT_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgeq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: 
[[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgeq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
   return vcgeq_f32(v1, v2);
 }

-// COMMON-LABEL: test_vcgeq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
   return vcgeq_f64(v1, v2);
 }

-// COMMON-LABEL: test_vcle_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
   return vcle_f32(v1, v2);
 }

-// COMMON-LABEL: test_vcle_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
   return vcle_f64(a, b);
 }

-// COMMON-LABEL: test_vcleq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcleq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcleq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
   return vcleq_f32(v1, v2);
 }

-// COMMON-LABEL: test_vcleq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcleq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcleq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
   return vcleq_f64(v1, v2);
 }

-// COMMON-LABEL: test_vcgt_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcgt_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcgt_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
   return vcgt_f32(v1, v2);
 }

-// COMMON-LABEL: test_vcgt_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcgt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcgt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) {
   return vcgt_f64(a, b);
 }

-// COMMON-LABEL: test_vcgtq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgtq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgtq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
   return vcgtq_f32(v1, v2);
 }

-// COMMON-LABEL: test_vcgtq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
   return vcgtq_f64(v1, v2);
 }

-// COMMON-LABEL: test_vclt_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
   return vclt_f32(v1, v2);
 }

-// COMMON-LABEL: test_vclt_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vclt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vclt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) {
   return vclt_f64(a, b);
 }

-// COMMON-LABEL: test_vcltq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcltq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcltq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
   return vcltq_f32(v1, v2);
 }

-// COMMON-LABEL: test_vcltq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcltq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcltq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
   return vcltq_f64(v1, v2);
 }

-// COMMON-LABEL: test_vpadds_f32
-// COMMONIR: [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0
-// COMMONIR: [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1
-// UNCONSTRAINED: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
-// CONSTRAINED: [[VPADDD_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LANE0_I]], float [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict"
-// COMMONIR: ret float [[VPADDD_I]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vpadds_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// UNCONSTRAINED-NEXT: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
+// UNCONSTRAINED-NEXT: ret float [[VPADDD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vpadds_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// CONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// CONSTRAINED-NEXT: [[VPADDD_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LANE0_I]], float [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[VPADDD_I]]
+//
 float32_t test_vpadds_f32(float32x2_t a) {
   return vpadds_f32(a);
 }

-// COMMON-LABEL: test_vpaddd_f64
-// COMMONIR: [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0
-// COMMONIR: [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1
-// UNCONSTRAINED: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
-// CONSTRAINED: [[VPADDD_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LANE0_I]], double [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[VPADDD_I]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vpaddd_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// UNCONSTRAINED-NEXT: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
+// UNCONSTRAINED-NEXT: ret double [[VPADDD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vpaddd_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// CONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// CONSTRAINED-NEXT: [[VPADDD_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LANE0_I]], double [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[VPADDD_I]]
+//
 float64_t test_vpaddd_f64(float64x2_t a) {
   return vpaddd_f64(a);
 }

-// COMMON-LABEL: test_vcvts_f32_s32
-// UNCONSTRAINED: [[TMP0:%.*]] = sitofp i32 %a to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret float [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_s32(
+// UNCONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = sitofp i32 [[A]] to float
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_s32(
+// CONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
 float32_t test_vcvts_f32_s32(int32_t a) {
   return vcvts_f32_s32(a);
 }

-// COMMON-LABEL: test_vcvtd_f64_s64
-// UNCONSTRAINED: [[TMP0:%.*]] = sitofp i64 %a to double
-// CONSTRAINED: [[TMP0:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// UNCONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = sitofp i64 [[A]] to double
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// CONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
 float64_t test_vcvtd_f64_s64(int64_t a) {
   return vcvtd_f64_s64(a);
 }

-// COMMON-LABEL: test_vcvts_f32_u32
-// UNCONSTRAINED: [[TMP0:%.*]] = uitofp i32 %a to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret float [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_u32(
+// UNCONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = uitofp i32 [[A]] to float
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_u32(
+// CONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
 float32_t test_vcvts_f32_u32(uint32_t a) {
   return vcvts_f32_u32(a);
 }

 // XXX should verify the type of registers
-// COMMON-LABEL: test_vcvtd_f64_u64
-// UNCONSTRAINED: [[TMP0:%.*]] = uitofp i64 %a to double
-// CONSTRAINED: [[TMP0:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// UNCONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = uitofp i64 [[A]] to double
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// CONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
 float64_t test_vcvtd_f64_u64(uint64_t a) {
   return vcvtd_f64_u64(a);
 }

-// COMMON-LABEL: test_vceqs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vceqs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vceqs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float [[A]], float [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vceqs_f32(float32_t a, float32_t b) {
   return (uint32_t)vceqs_f32(a, b);
 }

-// COMMON-LABEL: test_vceqd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vceqd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vceqd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[A]], double [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
 uint64_t test_vceqd_f64(float64_t a, float64_t b) {
   return (uint64_t)vceqd_f64(a, b);
 }

-// COMMON-LABEL: test_vceqzs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCEQZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vceqzs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCEQZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vceqzs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float [[A]], float 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCEQZ_I]]
+//
 uint32_t test_vceqzs_f32(float32_t a) {
   return (uint32_t)vceqzs_f32(a);
 }

-// COMMON-LABEL: test_vceqzd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCEQZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vceqzd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCEQZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vceqzd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[A]], double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCEQZ_I]]
+//
 uint64_t test_vceqzd_f64(float64_t a) {
   return (uint64_t)vceqzd_f64(a);
 }

-// COMMON-LABEL: test_vcges_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcges_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcges_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vcges_f32(float32_t a, float32_t b) {
   return (uint32_t)vcges_f32(a, b);
 }

-// COMMON-LABEL: test_vcged_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcged_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcged_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
 uint64_t test_vcged_f64(float64_t a, float64_t b) {
   return (uint64_t)vcged_f64(a, b);
 }

-// COMMON-LABEL: test_vcgezs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCGEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgezs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCGEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcgezs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCGEZ_I]]
+//
 uint32_t test_vcgezs_f32(float32_t a) {
   return (uint32_t)vcgezs_f32(a);
 }

-// COMMON-LABEL: test_vcgezd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCGEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgezd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCGEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcgezd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCGEZ_I]]
+//
 uint64_t test_vcgezd_f64(float64_t a) {
   return (uint64_t)vcgezd_f64(a);
 }

-// COMMON-LABEL: test_vcgts_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgts_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcgts_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vcgts_f32(float32_t a, float32_t b) {
   return (uint32_t)vcgts_f32(a, b);
 }

-// COMMON-LABEL: test_vcgtd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgtd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcgtd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
 uint64_t test_vcgtd_f64(float64_t a, float64_t b) {
   return (uint64_t)vcgtd_f64(a, b);
 }

-// COMMON-LABEL: test_vcgtzs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCGTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgtzs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCGTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcgtzs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCGTZ_I]]
+//
 uint32_t test_vcgtzs_f32(float32_t a) {
   return (uint32_t)vcgtzs_f32(a);
 }

-// COMMON-LABEL: test_vcgtzd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCGTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgtzd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCGTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcgtzd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCGTZ_I]]
+//
 uint64_t test_vcgtzd_f64(float64_t a) {
   return (uint64_t)vcgtzd_f64(a);
 }

-// COMMON-LABEL: test_vcles_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcles_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcles_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vcles_f32(float32_t a, float32_t b) {
   return (uint32_t)vcles_f32(a, b);
 }

-// COMMON-LABEL: test_vcled_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcled_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcled_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
 uint64_t test_vcled_f64(float64_t a, float64_t b) {
   return (uint64_t)vcled_f64(a, b);
 }

-// COMMON-LABEL: test_vclezs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCLEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vclezs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCLEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vclezs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCLEZ_I]]
+//
 uint32_t test_vclezs_f32(float32_t a) {
   return (uint32_t)vclezs_f32(a);
 }

-// COMMON-LABEL: test_vclezd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCLEZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vclezd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCLEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vclezd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCLEZ_I]]
+//
 uint64_t test_vclezd_f64(float64_t a) {
   return (uint64_t)vclezd_f64(a);
 }

-// COMMON-LABEL: test_vclts_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vclts_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vclts_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vclts_f32(float32_t a, float32_t b) {
   return (uint32_t)vclts_f32(a, b);
 }

-// COMMON-LABEL: test_vcltd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcltd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcltd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
 uint64_t test_vcltd_f64(float64_t a, float64_t b) {
   return (uint64_t)vcltd_f64(a, b);
 }

-// COMMON-LABEL: test_vcltzs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCLTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcltzs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCLTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcltzs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCLTZ_I]]
+//
 uint32_t test_vcltzs_f32(float32_t a) {
   return (uint32_t)vcltzs_f32(a);
 }

-// COMMON-LABEL: test_vcltzd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCLTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcltzd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCLTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcltzd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCLTZ_I]]
+//
 uint64_t test_vcltzd_f64(float64_t a) {
   return (uint64_t)vcltzd_f64(a);
 }

-// COMMON-LABEL: test_vadd_f64
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <1 x double> %a, %b
-// CONSTRAINED: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
 float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
   return vadd_f64(a, b);
 }

-// COMMON-LABEL: test_vmul_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %a, %b
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[MUL_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[MUL_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[MUL_I]]
+//
 float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
   return vmul_f64(a, b);
 }

-// COMMON-LABEL: test_vdiv_f64
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
-// CONSTRAINED: [[DIV_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[DIV_I]]
+//
 float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
   return vdiv_f64(a, b);
 }

-// COMMON-LABEL: test_vmla_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %b, <1 x double> %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
-// CONSTRAINED: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> %a, <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[B]], <1 x double> [[C]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> [[A]], <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
 float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vmla_f64(a, b, c);
 }

-// COMMON-LABEL: test_vmls_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %b, <1 x double> %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
-// CONSTRAINED: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> %a, <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[B]], <1 x double> [[C]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> [[A]], <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
 float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vmls_f64(a, b, c);
 }

-// COMMON-LABEL: test_vfma_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a)
-// CONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// UNCONSTRAINED-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
+// UNCONSTRAINED-NEXT: [[__P2_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP9]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CONSTRAINED-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
+// CONSTRAINED-NEXT: [[__P2_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP9]]
+//
 float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vfma_f64(a, b, c);
 }

-// COMMON-LABEL: test_vfms_f64
-// COMMONIR: [[SUB_I:%.*]] = fneg <1 x double> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a)
-// CONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to i64
+// UNCONSTRAINED-NEXT: [[__P1_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
+// UNCONSTRAINED-NEXT: [[__P2_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP9]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__P0_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to i64
+// CONSTRAINED-NEXT: [[__P1_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
+// CONSTRAINED-NEXT: [[__P2_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP9]]
+//
 float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vfms_f64(a, b, c);
 }

-// COMMON-LABEL: test_vsub_f64
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <1 x double> %a, %b
-// CONSTRAINED: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
 float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
   return vsub_f64(a, b);
 }

-// COMMON-LABEL: test_vcvt_s64_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %a)
-// COMMONIR: ret <1 x i64> [[TMP1]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
 int64x1_t test_vcvt_s64_f64(float64x1_t a) {
   return vcvt_s64_f64(a);
 }

-// COMMON-LABEL: test_vcvt_u64_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %a)
-// COMMONIR: ret <1 x i64> [[TMP1]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]]) #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
 uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
   return vcvt_u64_f64(a);
 }

-// COMMON-LABEL: test_vcvt_f64_s64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// UNCONSTRAINED:
[[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double> -// CONSTRAINED: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VCVT_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64( +// UNCONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// UNCONSTRAINED-NEXT: [[VCVT_I:%.*]] = sitofp <1 x i64> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64( +// CONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CONSTRAINED-NEXT: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]] +// float64x1_t test_vcvt_f64_s64(int64x1_t a) { return vcvt_f64_s64(a); } -// COMMON-LABEL: test_vcvt_f64_u64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// UNCONSTRAINED: [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double> -// CONSTRAINED: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VCVT_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64( +// UNCONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// UNCONSTRAINED-NEXT: [[VCVT_I:%.*]] = uitofp <1 x i64> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64( +// CONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CONSTRAINED-NEXT: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]] +// float64x1_t test_vcvt_f64_u64(uint64x1_t a) { return vcvt_f64_u64(a); } -// COMMON-LABEL: test_vrnda_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// UNCONSTRAINED: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a) -// CONSTRAINED: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> %a, metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VRNDA1_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnda_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[VRNDA_I]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDA1_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnda_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CONSTRAINED-NEXT: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> [[VRNDA_I]], metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VRNDA1_I]] +// float64x1_t test_vrnda_f64(float64x1_t a) { return vrnda_f64(a); } -// COMMON-LABEL: test_vrndp_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// UNCONSTRAINED: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a) -// CONSTRAINED: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> %a, metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VRNDP1_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndp_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[VRNDP_I]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDP1_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndp_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CONSTRAINED-NEXT: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> [[VRNDP_I]], metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VRNDP1_I]] +// float64x1_t test_vrndp_f64(float64x1_t a) { return vrndp_f64(a); } -// COMMON-LABEL: test_vrndm_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// UNCONSTRAINED: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a) -// CONSTRAINED: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> %a, metadata !"fpexcept.strict") -// COMMONIR: 
ret <1 x double> [[VRNDM1_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndm_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[VRNDM_I]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDM1_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndm_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CONSTRAINED-NEXT: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> [[VRNDM_I]], metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VRNDM1_I]] +// float64x1_t test_vrndm_f64(float64x1_t a) { return vrndm_f64(a); } -// COMMON-LABEL: test_vrndx_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// UNCONSTRAINED: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a) -// CONSTRAINED: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VRNDX1_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndx_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[VRNDX_I]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDX1_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndx_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CONSTRAINED-NEXT: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double> [[VRNDX_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VRNDX1_I]] +// float64x1_t test_vrndx_f64(float64x1_t a) { 
return vrndx_f64(a); } -// COMMON-LABEL: test_vrnd_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// UNCONSTRAINED: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a) -// CONSTRAINED: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> %a, metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VRNDZ1_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnd_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[VRNDZ_I]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDZ1_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnd_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CONSTRAINED-NEXT: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> [[VRNDZ_I]], metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VRNDZ1_I]] +// float64x1_t test_vrnd_f64(float64x1_t a) { return vrnd_f64(a); } -// COMMON-LABEL: test_vrndi_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a) -// CONSTRAINED: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VRNDI1_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndi_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[VRNDI_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[VRNDI_V_I]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDI_V1_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndi_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x 
i8> +// CONSTRAINED-NEXT: [[VRNDI_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double> [[VRNDI_V_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VRNDI_V1_I]] +// float64x1_t test_vrndi_f64(float64x1_t a) { return vrndi_f64(a); } -// COMMON-LABEL: test_vsqrt_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a) -// CONSTRAINED: [[VSQRT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <1 x double> [[VSQRT_I]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vsqrt_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[TMP2]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[VSQRT_I]] +// +// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vsqrt_f64( +// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CONSTRAINED-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double> [[TMP2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <1 x double> [[VSQRT_I]] +// float64x1_t test_vsqrt_f64(float64x1_t a) { return vsqrt_f64(a); } diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c index 271ae056308d2..791f0a1a29409 100644 --- a/clang/test/CodeGen/AArch64/neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c @@ -1,17418 +1,23567 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ // RUN: -disable-O0-optnone \ // RUN: -flax-vector-conversions=none -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target #include -// CHECK-LABEL: @test_vadd_s8( -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2 -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vadd_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) { return 
vadd_s8(v1, v2); } -// CHECK-LABEL: @test_vadd_s16( -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2 -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vadd_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) { return vadd_s16(v1, v2); } -// CHECK-LABEL: @test_vadd_s32( -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2 -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vadd_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) { return vadd_s32(v1, v2); } -// CHECK-LABEL: @test_vadd_s64( -// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2 -// CHECK: ret <1 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vadd_s64( +// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <1 x i64> [[ADD_I]] +// int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) { return vadd_s64(v1, v2); } -// CHECK-LABEL: @test_vadd_f32( -// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2 -// CHECK: ret <2 x float> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vadd_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) { return vadd_f32(v1, v2); } -// CHECK-LABEL: @test_vadd_u8( -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2 -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vadd_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) { return vadd_u8(v1, v2); } -// CHECK-LABEL: @test_vadd_u16( -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2 -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vadd_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) { return vadd_u16(v1, v2); } -// CHECK-LABEL: @test_vadd_u32( -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2 -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vadd_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) { return vadd_u32(v1, v2); } -// CHECK-LABEL: @test_vadd_u64( -// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2 -// CHECK: ret <1 x i64> [[ADD_I]] +// CHECK-LABEL: 
define dso_local <1 x i64> @test_vadd_u64( +// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <1 x i64> [[ADD_I]] +// uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) { return vadd_u64(v1, v2); } -// CHECK-LABEL: @test_vaddq_s8( -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2 -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) { return vaddq_s8(v1, v2); } -// CHECK-LABEL: @test_vaddq_s16( -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2 -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) { return vaddq_s16(v1, v2); } -// CHECK-LABEL: @test_vaddq_s32( -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2 -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) { return vaddq_s32(v1, v2); } -// CHECK-LABEL: @test_vaddq_s64( -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2 -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) { return vaddq_s64(v1, v2); } -// CHECK-LABEL: @test_vaddq_f32( -// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2 -// CHECK: ret <4 x float> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vaddq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x float> [[ADD_I]] +// float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) { return vaddq_f32(v1, v2); } -// CHECK-LABEL: @test_vaddq_f64( -// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2 -// CHECK: ret <2 x double> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vaddq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x double> [[ADD_I]] +// float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) { return vaddq_f64(v1, v2); } -// CHECK-LABEL: @test_vaddq_u8( -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2 -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> 
noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) { return vaddq_u8(v1, v2); } -// CHECK-LABEL: @test_vaddq_u16( -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2 -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) { return vaddq_u16(v1, v2); } -// CHECK-LABEL: @test_vaddq_u32( -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2 -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) { return vaddq_u32(v1, v2); } -// CHECK-LABEL: @test_vaddq_u64( -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2 -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) { return vaddq_u64(v1, v2); } -// CHECK-LABEL: @test_vsub_s8( -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2 -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsub_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) { return vsub_s8(v1, v2); } -// CHECK-LABEL: @test_vsub_s16( -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2 -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsub_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) { return vsub_s16(v1, v2); } -// CHECK-LABEL: @test_vsub_s32( -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2 -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsub_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) { return vsub_s32(v1, v2); } -// CHECK-LABEL: @test_vsub_s64( -// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2 -// CHECK: ret <1 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsub_s64( +// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <1 x i64> [[SUB_I]] 
+// int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) { return vsub_s64(v1, v2); } -// CHECK-LABEL: @test_vsub_f32( -// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2 -// CHECK: ret <2 x float> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vsub_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) { return vsub_f32(v1, v2); } -// CHECK-LABEL: @test_vsub_u8( -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2 -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsub_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) { return vsub_u8(v1, v2); } -// CHECK-LABEL: @test_vsub_u16( -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2 -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsub_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) { return vsub_u16(v1, v2); } -// CHECK-LABEL: @test_vsub_u32( -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2 -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsub_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) { return vsub_u32(v1, v2); } -// CHECK-LABEL: @test_vsub_u64( -// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2 -// CHECK: ret <1 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsub_u64( +// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <1 x i64> [[SUB_I]] +// uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) { return vsub_u64(v1, v2); } -// CHECK-LABEL: @test_vsubq_s8( -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2 -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsubq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) { return vsubq_s8(v1, v2); } -// CHECK-LABEL: @test_vsubq_s16( -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2 -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsubq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) { return vsubq_s16(v1, v2); } -// CHECK-LABEL: @test_vsubq_s32( -// CHECK: [[SUB_I:%.*]] = sub 
<4 x i32> %v1, %v2 -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsubq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vsubq_s32(int32x4_t v1, int32x4_t v2) { return vsubq_s32(v1, v2); } -// CHECK-LABEL: @test_vsubq_s64( -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2 -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsubq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) { return vsubq_s64(v1, v2); } -// CHECK-LABEL: @test_vsubq_f32( -// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2 -// CHECK: ret <4 x float> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vsubq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) { return vsubq_f32(v1, v2); } -// CHECK-LABEL: @test_vsubq_f64( -// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2 -// CHECK: ret <2 x double> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vsubq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x double> [[SUB_I]] +// float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) { return vsubq_f64(v1, v2); } -// CHECK-LABEL: @test_vsubq_u8( -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2 -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsubq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) { return vsubq_u8(v1, v2); } -// CHECK-LABEL: @test_vsubq_u16( -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2 -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsubq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) { return vsubq_u16(v1, v2); } -// CHECK-LABEL: @test_vsubq_u32( -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2 -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsubq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) { return vsubq_u32(v1, v2); } -// CHECK-LABEL: @test_vsubq_u64( -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2 -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local 
<2 x i64> @test_vsubq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) { return vsubq_u64(v1, v2); } -// CHECK-LABEL: @test_vmul_s8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2 -// CHECK: ret <8 x i8> [[MUL_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i8> [[MUL_I]] +// int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) { return vmul_s8(v1, v2); } -// CHECK-LABEL: @test_vmul_s16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2 -// CHECK: ret <4 x i16> [[MUL_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmul_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i16> [[MUL_I]] +// int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) { return vmul_s16(v1, v2); } -// CHECK-LABEL: @test_vmul_s32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2 -// CHECK: ret <2 x i32> [[MUL_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmul_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i32> [[MUL_I]] +// int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) { return vmul_s32(v1, v2); } -// CHECK-LABEL: @test_vmul_f32( -// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2 -// CHECK: ret <2 x float> [[MUL_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vmul_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x float> [[MUL_I]] +// float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) { return vmul_f32(v1, v2); } -// CHECK-LABEL: @test_vmul_u8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2 -// CHECK: ret <8 x i8> [[MUL_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i8> [[MUL_I]] +// uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) { return vmul_u8(v1, v2); } -// CHECK-LABEL: @test_vmul_u16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2 -// CHECK: ret <4 x i16> [[MUL_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmul_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i16> [[MUL_I]] +// uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) { return vmul_u16(v1, v2); } -// CHECK-LABEL: @test_vmul_u32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2 -// CHECK: ret <2 x i32> [[MUL_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmul_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[MUL_I:%.*]] = mul <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x i32> [[MUL_I]] +// uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) { return vmul_u32(v1, v2); } -// CHECK-LABEL: @test_vmulq_s8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2 -// CHECK: ret <16 x i8> [[MUL_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <16 x i8> [[MUL_I]] +// int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) { return vmulq_s8(v1, v2); } -// CHECK-LABEL: @test_vmulq_s16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2 -// CHECK: ret <8 x i16> [[MUL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmulq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i16> [[MUL_I]] +// int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) { return vmulq_s16(v1, v2); } -// CHECK-LABEL: @test_vmulq_s32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2 -// CHECK: ret <4 x i32> [[MUL_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmulq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i32> [[MUL_I]] +// int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) { return vmulq_s32(v1, v2); } -// CHECK-LABEL: @test_vmulq_u8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2 -// CHECK: ret <16 x i8> [[MUL_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: ret <16 x i8> [[MUL_I]] +// uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) { return vmulq_u8(v1, v2); } -// CHECK-LABEL: @test_vmulq_u16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2 -// CHECK: ret <8 x i16> [[MUL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmulq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: ret <8 x i16> [[MUL_I]] +// uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) { return vmulq_u16(v1, v2); } -// CHECK-LABEL: @test_vmulq_u32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2 -// CHECK: ret <4 x i32> [[MUL_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmulq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x i32> [[MUL_I]] +// uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) { return vmulq_u32(v1, v2); } -// CHECK-LABEL: @test_vmulq_f32( -// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2 -// CHECK: ret <4 x float> [[MUL_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vmulq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: ret <4 x float> [[MUL_I]] +// float32x4_t test_vmulq_f32(float32x4_t 
v1, float32x4_t v2) { return vmulq_f32(v1, v2); } -// CHECK-LABEL: @test_vmulq_f64( -// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2 -// CHECK: ret <2 x double> [[MUL_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vmulq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: ret <2 x double> [[MUL_I]] +// float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) { return vmulq_f64(v1, v2); } -// CHECK-LABEL: @test_vmul_p8( -// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VMUL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_p8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VMUL_V_I]] +// poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) { return vmul_p8(v1, v2); } -// CHECK-LABEL: @test_vmulq_p8( -// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2) -// CHECK: ret <16 x i8> [[VMULQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_p8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]]) +// CHECK-NEXT: ret <16 x i8> [[VMULQ_V_I]] +// poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) { return vmulq_p8(v1, v2); } -// CHECK-LABEL: @test_vmla_s8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { return vmla_s8(v1, v2, v3); } -// CHECK-LABEL: @test_vmla_s16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]] -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[MUL_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { return (int8x8_t)vmla_s16(v1, v2, v3); } -// CHECK-LABEL: @test_vmla_s32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmla_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { return vmla_s32(v1, v2, v3); } -// CHECK-LABEL: @test_vmla_f32( -// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]] -// CHECK: ret <2 x float> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vmla_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[MUL_I]] +// CHECK-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vmla_f32(v1, v2, v3); } -// CHECK-LABEL: @test_vmla_u8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) { return vmla_u8(v1, v2, v3); } -// CHECK-LABEL: @test_vmla_u16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmla_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) { return vmla_u16(v1, v2, v3); } -// CHECK-LABEL: @test_vmla_u32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmla_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) { return vmla_u32(v1, v2, v3); } -// CHECK-LABEL: @test_vmlaq_s8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3 -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmlaq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[MUL_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) { return vmlaq_s8(v1, 
 v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlaq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
   return vmlaq_s16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlaq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
   return vmlaq_s32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlaq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]]
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlaq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
 float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
   return vmlaq_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlaq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
 uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
   return vmlaq_u8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlaq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
   return vmlaq_u16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlaq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
   return vmlaq_u32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlaq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]]
-// CHECK: ret <2 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmlaq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x double> [[ADD_I]]
+//
 float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
   return vmlaq_f64(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmls_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
 int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
   return vmls_s8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmls_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
   return (int8x8_t)vmls_s16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmls_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmls_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
 int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
   return vmls_s32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmls_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]]
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmls_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
 float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
   return vmls_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmls_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
 uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
   return vmls_u8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmls_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmls_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
 uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
   return vmls_u16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmls_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmls_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
 uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
   return vmls_u32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
 int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
   return vmlsq_s8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
   return vmlsq_s16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
   return vmlsq_s32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]]
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
 float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
   return vmlsq_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
 uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
   return vmlsq_u8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
   return vmlsq_u16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
   return vmlsq_u32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vmlsq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]]
-// CHECK: ret <2 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmlsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x double> [[SUB_I]]
+//
 float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
   return vmlsq_f64(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vfma_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vfma_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]])
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
+//
 float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
   return vfma_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vfmaq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]])
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
+//
 float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
   return vfmaq_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vfmaq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1)
-// CHECK: ret <2 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]])
+// CHECK-NEXT: ret <2 x double> [[TMP9]]
+//
 float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
   return vfmaq_f64(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vfms_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]])
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
+//
 float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
   return vfms_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vfmsq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vfmsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]])
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
+//
 float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
   return vfmsq_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vfmsq_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x double> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1)
-// CHECK: ret <2 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vfmsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x double> [[TMP6]])
+// CHECK-NEXT: ret <2 x double> [[TMP9]]
+//
 float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
   return vfmsq_f64(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vdivq_f64(
-// CHECK: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[DIV_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vdivq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[DIV_I]]
+//
 float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
   return vdivq_f64(v1, v2);
 }
 
-// CHECK-LABEL: @test_vdivq_f32(
-// CHECK: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[DIV_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vdivq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[DIV_I]]
+//
 float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
   return vdivq_f32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vdiv_f32(
-// CHECK: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[DIV_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vdiv_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[DIV_I]]
+//
 float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
   return vdiv_f32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vaba_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
 int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
   return vaba_s8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vaba_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v2, <4 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
 int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
   return vaba_s16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vaba_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v2, <2 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
 int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
   return vaba_s32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vaba_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
 uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
   return vaba_u8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vaba_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v2, <4 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
 uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
   return vaba_u16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vaba_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v2, <2 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
 uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
   return vaba_u32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vabaq_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
 int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
   return vabaq_s8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vabaq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v2, <8 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
   return vabaq_s16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vabaq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v2, <4 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
   return vabaq_s32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vabaq_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
 uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
   return vabaq_u8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vabaq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v2, <8 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
   return vabaq_u16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vabaq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v2, <4 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
   return vabaq_u32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vabd_s8(
-// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
+//
 int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
   return vabd_s8(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: ret <4 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
+//
 int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
   return vabd_s16(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: ret <2 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
+//
 int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
   return vabd_s32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabd_u8(
-// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
+//
 uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
   return vabd_u8(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: ret <4 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
+//
 uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
   return vabd_u16(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: ret <2 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
+//
 uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
   return vabd_u32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %v1, <2 x float> %v2)
-// CHECK: ret <2 x float> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vabd_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]])
+// CHECK-NEXT: ret <2 x float> [[VABD2_I]]
+//
 float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
   return vabd_f32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_s8(
-// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
+//
 int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
   return vabdq_s8(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: ret <8 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
+//
 int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
   return vabdq_s16(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: ret <4 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
+//
 int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
   return vabdq_s32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_u8(
-// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
+//
 uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vabdq_u8(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: ret <8 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
+//
 uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vabdq_u16(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: ret <4 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
+//
 uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vabdq_u32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %v1, <4 x float> %v2)
-// CHECK: ret <4 x float> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vabdq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]])
+// CHECK-NEXT: ret <4 x float> [[VABD2_I]]
+//
 float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
   return vabdq_f32(v1, v2);
 }
 
-// CHECK-LABEL: @test_vabdq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %v1, <2 x double> %v2)
-// CHECK: ret <2 x double> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vabdq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]])
+// CHECK-NEXT: ret <2 x double> [[VABD2_I]]
+//
 float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
   return vabdq_f64(v1, v2);
 }
 
-// CHECK-LABEL: @test_vbsl_s8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], [[V3]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
 int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
   return vbsl_s8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP4]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP4]]
+//
 int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
   return (int8x8_t)vbsl_s16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vbsl_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i32> [[VBSL5_I]]
+//
 int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) {
   return vbsl_s32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_s64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]], <1 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
 int64x1_t test_vbsl_s64(uint64x1_t v1, int64x1_t v2, int64x1_t v3) {
   return vbsl_s64(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_u8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], [[V3]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
 uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
   return vbsl_u8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vbsl_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i16> [[VBSL5_I]]
+//
 uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
   return vbsl_u16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vbsl_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i32> [[VBSL5_I]]
+//
 uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
   return vbsl_u32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_u64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]], <1 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
 uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
   return vbsl_u64(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, [[VBSL1_I]]
-// CHECK: [[TMP4:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP5]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vbsl_f32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[VBSL_I]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP5]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP6]]
+//
 float32x2_t test_vbsl_f32(uint32x2_t v1, float32x2_t v2, float32x2_t v3) {
   return vbsl_f32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, [[VBSL1_I]]
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vbsl_f64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x double> noundef [[V2:%.*]], <1 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V2]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V3]] to i64
+// CHECK-NEXT: [[__P2_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP5:%.*]] = xor <1 x i64> [[VBSL_I]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP5]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP6]]
+//
 float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) {
   return vbsl_f64(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_p8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_p8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], [[V3]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
 poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) {
   return vbsl_p8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbsl_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vbsl_p16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[V3]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i16> [[VBSL5_I]]
+//
 poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
   return vbsl_p16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbslq_s8(
-// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <16 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], [[V3]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]]
+//
 int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
   return vbslq_s8(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbslq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <8 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]]
+//
 int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
   return vbslq_s16(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbslq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vbslq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i32> [[VBSL5_I]]
+//
 int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
   return vbslq_s32(v1, v2, v3);
 }
 
-// CHECK-LABEL: @test_vbslq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1,
splat (i64 -1) -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3 -// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <2 x i64> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]], <2 x i64> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[V3]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], splat (i64 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]] +// int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) { return vbslq_s64(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_u8( -// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1) -// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 -// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] -// CHECK: ret <16 x i8> [[VBSL2_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1) +// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], [[V3]] +// CHECK-NEXT: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]] +// uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) { return vbslq_u8(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1) -// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3 -// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <8 x i16> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], splat (i16 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or 
<8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]] +// uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) { return vbslq_u16(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1) -// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3 -// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <4 x i32> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vbslq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[V3]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], splat (i32 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <4 x i32> [[VBSL5_I]] +// int32x4_t test_vbslq_u32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) { return vbslq_s32(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2 -// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1) -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3 -// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <2 x i64> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]], <2 x i64> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[V3]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], splat (i64 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]] +// uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) { return vbslq_u64(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> -// CHECK: 
[[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1) -// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] -// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP4]] +// CHECK-LABEL: define dso_local <4 x float> @test_vbslq_f32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[VBSL_I]], splat (i32 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP5]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP6]] +// float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) { return vbslq_f32(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_p8( -// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1) -// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 -// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] -// CHECK: ret <16 x i8> [[VBSL2_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_p8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1) +// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], [[V3]] +// CHECK-NEXT: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]] +// poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) { return vbslq_p8(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1) -// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3 -// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <8 x i16> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_p16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> 
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[V3]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], splat (i16 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]] +// poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) { return vbslq_p16(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1) -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] -// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double> -// CHECK: ret <2 x double> [[TMP4]] +// CHECK-LABEL: define dso_local <2 x double> @test_vbslq_f64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i64> [[VBSL_I]], splat (i64 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP5]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP6]] +// float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) { return vbslq_f64(v1, v2, v3); } -// CHECK-LABEL: @test_vrecps_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: ret <2 x float> [[VRECPS_V2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrecps_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// 
CHECK-NEXT: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) +// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) { return vrecps_f32(v1, v2); } -// CHECK-LABEL: @test_vrecpsq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VRECPSQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrecpsq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) +// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) { return vrecpsq_f32(v1, v2); } -// CHECK-LABEL: @test_vrecpsq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x double> [[VRECPSQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrecpsq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[VRECPSQ_V_I]], <2 x double> [[VRECPSQ_V1_I]]) +// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP5]] +// float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) { return vrecpsq_f64(v1, v2); } -// CHECK-LABEL: @test_vrsqrts_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VRSQRTS_V2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrsqrts_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) +// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) { return vrsqrts_f32(v1, v2); } -// CHECK-LABEL: @test_vrsqrtsq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrsqrtsq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) +// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) { return vrsqrtsq_f32(v1, v2); } -// CHECK-LABEL: @test_vrsqrtsq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast 
<2 x double> %v2 to <16 x i8> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x double> [[VRSQRTSQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrsqrtsq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[VRSQRTSQ_V_I]], <2 x double> [[VRSQRTSQ_V1_I]]) +// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP5]] +// float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) { return vrsqrtsq_f64(v1, v2); } -// CHECK-LABEL: @test_vcage_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: ret <2 x i32> [[VCAGE_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcage_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCAGE_V2_I]] +// uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) { return vcage_f32(v1, v2); } -// CHECK-LABEL: @test_vcage_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x i64> [[VCAGE_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcage_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCAGE_V_I]], <1 x double> [[VCAGE_V1_I]]) +// CHECK-NEXT: ret <1 x i64> [[VCAGE_V2_I]] +// uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) { return vcage_f64(a, b); } -// CHECK-LABEL: @test_vcageq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcageq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCAGEQ_V2_I]] +// uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) { return vcageq_f32(v1, v2); } -// CHECK-LABEL: @test_vcageq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: ret <2 x i64> [[VCAGEQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcageq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCAGEQ_V_I]], <2 x double> [[VCAGEQ_V1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCAGEQ_V2_I]] +// uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) { return vcageq_f64(v1, v2); } -// CHECK-LABEL: @test_vcagt_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: ret <2 x i32> [[VCAGT_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcagt_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCAGT_V2_I]] +// uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) { return vcagt_f32(v1, v2); } -// CHECK-LABEL: @test_vcagt_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x i64> [[VCAGT_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcagt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCAGT_V_I]], <1 x double> [[VCAGT_V1_I]]) +// CHECK-NEXT: ret <1 x i64> [[VCAGT_V2_I]] +// uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) { return vcagt_f64(a, b); } -// CHECK-LABEL: @test_vcagtq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcagtq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCAGTQ_V2_I]] +// uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) { return vcagtq_f32(v1, v2); } -// CHECK-LABEL: @test_vcagtq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x 
double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: ret <2 x i64> [[VCAGTQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcagtq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCAGTQ_V_I]], <2 x double> [[VCAGTQ_V1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCAGTQ_V2_I]] +// uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) { return vcagtq_f64(v1, v2); } -// CHECK-LABEL: @test_vcale_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) -// CHECK: ret <2 x i32> [[VCALE_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcale_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCALE_V2_I]] +// uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) { return vcale_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. 
} -// CHECK-LABEL: @test_vcale_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) -// CHECK: ret <1 x i64> [[VCALE_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcale_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCALE_V_I]], <1 x double> [[VCALE_V1_I]]) +// CHECK-NEXT: ret <1 x i64> [[VCALE_V2_I]] +// uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) { return vcale_f64(a, b); } -// CHECK-LABEL: @test_vcaleq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) -// CHECK: ret <4 x i32> [[VCALEQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcaleq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCALEQ_V2_I]] +// uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) { return vcaleq_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. 
} -// CHECK-LABEL: @test_vcaleq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) -// CHECK: ret <2 x i64> [[VCALEQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcaleq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCALEQ_V_I]], <2 x double> [[VCALEQ_V1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCALEQ_V2_I]] +// uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) { return vcaleq_f64(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. } -// CHECK-LABEL: @test_vcalt_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) -// CHECK: ret <2 x i32> [[VCALT_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcalt_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V1]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCALT_V2_I]] +// uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) { return vcalt_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. 
} -// CHECK-LABEL: @test_vcalt_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) -// CHECK: ret <1 x i64> [[VCALT_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcalt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCALT_V_I]], <1 x double> [[VCALT_V1_I]]) +// CHECK-NEXT: ret <1 x i64> [[VCALT_V2_I]] +// uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) { return vcalt_f64(a, b); } -// CHECK-LABEL: @test_vcaltq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) -// CHECK: ret <4 x i32> [[VCALTQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcaltq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V1]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCALTQ_V2_I]] +// uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) { return vcaltq_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. 
} -// CHECK-LABEL: @test_vcaltq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) -// CHECK: ret <2 x i64> [[VCALTQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcaltq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V1]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCALTQ_V_I]], <2 x double> [[VCALTQ_V1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCALTQ_V2_I]] +// uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) { return vcaltq_f64(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. } -// CHECK-LABEL: @test_vtst_s8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) { return vtst_s8(v1, v2); } -// CHECK-LABEL: @test_vtst_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) { return vtst_s16(v1, v2); } -// CHECK-LABEL: @test_vtst_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 
x i32> -// CHECK: ret <2 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtst_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VTST_I]] +// uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) { return vtst_s32(v1, v2); } -// CHECK-LABEL: @test_vtst_u8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) { return vtst_u8(v1, v2); } -// CHECK-LABEL: @test_vtst_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) { return vtst_u16(v1, v2); } -// CHECK-LABEL: @test_vtst_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> -// CHECK: ret <2 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtst_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VTST_I]] +// uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) { return vtst_u32(v1, v2); } -// CHECK-LABEL: @test_vtstq_s8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) { return vtstq_s8(v1, v2); } -// CHECK-LABEL: @test_vtstq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) { return vtstq_s16(v1, v2); } -// CHECK-LABEL: @test_vtstq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> -// CHECK: ret <4 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vtstq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VTST_I]] +// uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) { return vtstq_s32(v1, v2); } -// CHECK-LABEL: @test_vtstq_u8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 -// 
CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) { return vtstq_u8(v1, v2); } -// CHECK-LABEL: @test_vtstq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) { return vtstq_u16(v1, v2); } -// CHECK-LABEL: @test_vtstq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> -// CHECK: ret <4 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vtstq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VTST_I]] +// uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) { return vtstq_u32(v1, v2); } -// CHECK-LABEL: @test_vtstq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> -// CHECK: ret <2 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { 
+// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VTST_I]] +// uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) { return vtstq_s64(v1, v2); } -// CHECK-LABEL: @test_vtstq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> -// CHECK: ret <2 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VTST_I]] +// uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) { return vtstq_u64(v1, v2); } -// CHECK-LABEL: @test_vtst_p8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_p8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) { return vtst_p8(v1, v2); } -// CHECK-LABEL: @test_vtst_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_p16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer +// CHECK-NEXT: 
[[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) { return vtst_p16(v1, v2); } -// CHECK-LABEL: @test_vtstq_p8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_p8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) { return vtstq_p8(v1, v2); } -// CHECK-LABEL: @test_vtstq_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_p16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) { return vtstq_p16(v1, v2); } -// CHECK-LABEL: @test_vtst_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> -// CHECK: ret <1 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VTST_I]] +// uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) { return vtst_s64(a, b); } -// CHECK-LABEL: @test_vtst_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer -// CHECK: 
[[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> -// CHECK: ret <1 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VTST_I]] +// uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) { return vtst_u64(a, b); } -// CHECK-LABEL: @test_vceq_s8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) { return vceq_s8(v1, v2); } -// CHECK-LABEL: @test_vceq_s16( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) { return vceq_s16(v1, v2); } -// CHECK-LABEL: @test_vceq_s32( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) { return vceq_s32(v1, v2); } -// CHECK-LABEL: @test_vceq_s64( -// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) { return vceq_s64(a, b); } -// CHECK-LABEL: @test_vceq_u64( -// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// 
CHECK-LABEL: define dso_local <1 x i64> @test_vceq_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) { return vceq_u64(a, b); } -// CHECK-LABEL: @test_vceq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) { return vceq_f32(v1, v2); } -// CHECK-LABEL: @test_vceq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) { return vceq_f64(a, b); } -// CHECK-LABEL: @test_vceq_u8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) { return vceq_u8(v1, v2); } -// CHECK-LABEL: @test_vceq_u16( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) { return vceq_u16(v1, v2); } -// CHECK-LABEL: @test_vceq_u32( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t 
v2) { return vceq_u32(v1, v2); } -// CHECK-LABEL: @test_vceq_p8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_p8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) { return vceq_p8(v1, v2); } -// CHECK-LABEL: @test_vceqq_s8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) { return vceqq_s8(v1, v2); } -// CHECK-LABEL: @test_vceqq_s16( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) { return vceqq_s16(v1, v2); } -// CHECK-LABEL: @test_vceqq_s32( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) { return vceqq_s32(v1, v2); } -// CHECK-LABEL: @test_vceqq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) { return vceqq_f32(v1, v2); } -// CHECK-LABEL: @test_vceqq_u8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) { return vceqq_u8(v1, v2); } -// CHECK-LABEL: @test_vceqq_u16( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) { return vceqq_u16(v1, v2); } -// CHECK-LABEL: @test_vceqq_u32( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) { return vceqq_u32(v1, v2); } -// CHECK-LABEL: @test_vceqq_p8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_p8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) { return vceqq_p8(v1, v2); } -// CHECK-LABEL: @test_vceqq_s64( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) { return vceqq_s64(v1, v2); } -// CHECK-LABEL: @test_vceqq_u64( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) { return vceqq_u64(v1, v2); } -// CHECK-LABEL: @test_vceqq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2 -// CHECK: 
[[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) { return vceqq_f64(v1, v2); } -// CHECK-LABEL: @test_vcge_s8( -// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcge_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) { return vcge_s8(v1, v2); } -// CHECK-LABEL: @test_vcge_s16( -// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) { return vcge_s16(v1, v2); } -// CHECK-LABEL: @test_vcge_s32( -// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) { return vcge_s32(v1, v2); } -// CHECK-LABEL: @test_vcge_s64( -// CHECK: [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) { return vcge_s64(a, b); } -// CHECK-LABEL: @test_vcge_u64( -// CHECK: [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// 
CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) { return vcge_u64(a, b); } -// CHECK-LABEL: @test_vcge_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) { return vcge_f32(v1, v2); } -// CHECK-LABEL: @test_vcge_f64( -// CHECK: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) { return vcge_f64(a, b); } -// CHECK-LABEL: @test_vcge_u8( -// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcge_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) { return vcge_u8(v1, v2); } -// CHECK-LABEL: @test_vcge_u16( -// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) { return vcge_u16(v1, v2); } -// CHECK-LABEL: @test_vcge_u32( -// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) { return vcge_u32(v1, v2); } -// CHECK-LABEL: @test_vcgeq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgeq_s8( +// CHECK-SAME: <16 
x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
 uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) {
   return vcgeq_s8(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) {
   return vcgeq_s16(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) {
   return vcgeq_s32(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
   return vcgeq_f32(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcgeq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
 uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vcgeq_u8(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vcgeq_u16(v1, v2);
 }
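 // Note: each comparison intrinsic in this block lowers to the same
 // two-instruction IR shape that the regenerated CHECK lines spell out in
 // full: a vector compare followed by a sign-extension of the <N x i1> mask
 // back to the element width. A sketch of that shape, using the vcgeq_u16
 // operands from above (the float variants use fcmp oge instead of icmp):
 //   %cmp = icmp uge <8 x i16> %v1, %v2
 //   %res = sext <8 x i1> %cmp to <8 x i16>   ; lanes become all-ones / all-zeros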
-// CHECK-LABEL: @test_vcgeq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vcgeq_u32(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) {
   return vcgeq_s64(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) {
   return vcgeq_u64(v1, v2);
 }
-// CHECK-LABEL: @test_vcgeq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
   return vcgeq_f64(v1, v2);
 }
-// CHECK-LABEL: @test_vcle_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
 // Notes about vcle:
 // LE condition predicate implemented as GE, so check reversed operands.
 // Using registers other than v0, v1 is possible, but would be odd.
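 // (At the IR level checked in this file, the LE predicate is still emitted
 // directly, with the operands in source order; a sketch of what the updated
 // checks for vcle_s8 expect:
 //   %cmp = icmp sle <8 x i8> %v1, %v2
 //   %res = sext <8 x i1> %cmp to <8 x i8>
 // The GE-with-swapped-operands form described above only appears once the
 // backend selects the actual compare instruction.)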
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcle_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) { return vcle_s8(v1, v2); } -// CHECK-LABEL: @test_vcle_s16( -// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) { return vcle_s16(v1, v2); } -// CHECK-LABEL: @test_vcle_s32( -// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) { return vcle_s32(v1, v2); } -// CHECK-LABEL: @test_vcle_s64( -// CHECK: [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) { return vcle_s64(a, b); } -// CHECK-LABEL: @test_vcle_u64( -// CHECK: [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) { return vcle_u64(a, b); } -// CHECK-LABEL: @test_vcle_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) { 
return vcle_f32(v1, v2); } -// CHECK-LABEL: @test_vcle_f64( -// CHECK: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) { return vcle_f64(a, b); } -// CHECK-LABEL: @test_vcle_u8( -// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcle_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) { return vcle_u8(v1, v2); } -// CHECK-LABEL: @test_vcle_u16( -// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) { return vcle_u16(v1, v2); } -// CHECK-LABEL: @test_vcle_u32( -// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) { return vcle_u32(v1, v2); } -// CHECK-LABEL: @test_vcleq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcleq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) { return vcleq_s8(v1, v2); } -// CHECK-LABEL: @test_vcleq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) { return vcleq_s16(v1, v2); } -// CHECK-LABEL: @test_vcleq_s32( -// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) { return vcleq_s32(v1, v2); } -// CHECK-LABEL: @test_vcleq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) { return vcleq_f32(v1, v2); } -// CHECK-LABEL: @test_vcleq_u8( -// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcleq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) { return vcleq_u8(v1, v2); } -// CHECK-LABEL: @test_vcleq_u16( -// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) { return vcleq_u16(v1, v2); } -// CHECK-LABEL: @test_vcleq_u32( -// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) { return vcleq_u32(v1, v2); } -// CHECK-LABEL: @test_vcleq_s64( -// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2 -// CHECK: 
[[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) { return vcleq_s64(v1, v2); } -// CHECK-LABEL: @test_vcleq_u64( -// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) { return vcleq_u64(v1, v2); } -// CHECK-LABEL: @test_vcleq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) { return vcleq_f64(v1, v2); } -// CHECK-LABEL: @test_vcgt_s8( -// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgt_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) { return vcgt_s8(v1, v2); } -// CHECK-LABEL: @test_vcgt_s16( -// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) { return vcgt_s16(v1, v2); } -// CHECK-LABEL: @test_vcgt_s32( -// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> 
[[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) { return vcgt_s32(v1, v2); } -// CHECK-LABEL: @test_vcgt_s64( -// CHECK: [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) { return vcgt_s64(a, b); } -// CHECK-LABEL: @test_vcgt_u64( -// CHECK: [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) { return vcgt_u64(a, b); } -// CHECK-LABEL: @test_vcgt_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) { return vcgt_f32(v1, v2); } -// CHECK-LABEL: @test_vcgt_f64( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) { return vcgt_f64(a, b); } -// CHECK-LABEL: @test_vcgt_u8( -// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgt_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) { return vcgt_u8(v1, v2); } -// CHECK-LABEL: @test_vcgt_u16( -// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_u16( +// 
CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) { return vcgt_u16(v1, v2); } -// CHECK-LABEL: @test_vcgt_u32( -// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) { return vcgt_u32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) { return vcgtq_s8(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) { return vcgtq_s16(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s32( -// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) { return vcgtq_s32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) { return 
vcgtq_f32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_u8( -// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) { return vcgtq_u8(v1, v2); } -// CHECK-LABEL: @test_vcgtq_u16( -// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) { return vcgtq_u16(v1, v2); } -// CHECK-LABEL: @test_vcgtq_u32( -// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) { return vcgtq_u32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s64( -// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) { return vcgtq_s64(v1, v2); } -// CHECK-LABEL: @test_vcgtq_u64( -// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) { return vcgtq_u64(v1, v2); } -// CHECK-LABEL: @test_vcgtq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) { return vcgtq_f64(v1, v2); } -// CHECK-LABEL: @test_vclt_s8( -// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] // Notes about vclt: // LT condition predicate implemented as GT, so check reversed operands. // Using registers other than v0, v1 are possible, but would be odd. +// CHECK-LABEL: define dso_local <8 x i8> @test_vclt_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) { return vclt_s8(v1, v2); } -// CHECK-LABEL: @test_vclt_s16( -// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) { return vclt_s16(v1, v2); } -// CHECK-LABEL: @test_vclt_s32( -// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) { return vclt_s32(v1, v2); } -// CHECK-LABEL: @test_vclt_s64( -// CHECK: [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) { return vclt_s64(a, b); } -// CHECK-LABEL: @test_vclt_u64( -// CHECK: [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) 
{ return vclt_u64(a, b); } -// CHECK-LABEL: @test_vclt_f32( -// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) { return vclt_f32(v1, v2); } -// CHECK-LABEL: @test_vclt_f64( -// CHECK: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) { return vclt_f64(a, b); } -// CHECK-LABEL: @test_vclt_u8( -// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclt_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) { return vclt_u8(v1, v2); } -// CHECK-LABEL: @test_vclt_u16( -// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) { return vclt_u16(v1, v2); } -// CHECK-LABEL: @test_vclt_u32( -// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) { return vclt_u32(v1, v2); } -// CHECK-LABEL: @test_vcltq_s8( -// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcltq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) { return vcltq_s8(v1, v2); } -// CHECK-LABEL: @test_vcltq_s16( -// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) { return vcltq_s16(v1, v2); } -// CHECK-LABEL: @test_vcltq_s32( -// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) { return vcltq_s32(v1, v2); } -// CHECK-LABEL: @test_vcltq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) { return vcltq_f32(v1, v2); } -// CHECK-LABEL: @test_vcltq_u8( -// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcltq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) { return vcltq_u8(v1, v2); } -// CHECK-LABEL: @test_vcltq_u16( -// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) { return vcltq_u16(v1, v2); } -// CHECK-LABEL: @test_vcltq_u32( -// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, 
%v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) { return vcltq_u32(v1, v2); } -// CHECK-LABEL: @test_vcltq_s64( -// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) { return vcltq_s64(v1, v2); } -// CHECK-LABEL: @test_vcltq_u64( -// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) { return vcltq_u64(v1, v2); } -// CHECK-LABEL: @test_vcltq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) { return vcltq_f64(v1, v2); } -// CHECK-LABEL: @test_vhadd_s8( -// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VHADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]] +// int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) { return vhadd_s8(v1, v2); } -// CHECK-LABEL: @test_vhadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x 
i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) +// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) { return vhadd_s16(v1, v2); } -// CHECK-LABEL: @test_vhadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) +// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) { return vhadd_s32(v1, v2); } -// CHECK-LABEL: @test_vhadd_u8( -// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VHADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]] +// uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) { return vhadd_u8(v1, v2); } -// CHECK-LABEL: @test_vhadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) +// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) { return vhadd_u16(v1, v2); } -// CHECK-LABEL: @test_vhadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) +// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) { return vhadd_u32(v1, v2); } -// CHECK-LABEL: @test_vhaddq_s8( -// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) -// CHECK: ret <16 x i8> [[VHADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]]) +// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]] +// int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) { return vhaddq_s8(v1, v2); } -// CHECK-LABEL: @test_vhaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) +// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) { return vhaddq_s16(v1, v2); } -// CHECK-LABEL: 
@test_vhaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) +// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) { return vhaddq_s32(v1, v2); } -// CHECK-LABEL: @test_vhaddq_u8( -// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) -// CHECK: ret <16 x i8> [[VHADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]]) +// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]] +// uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) { return vhaddq_u8(v1, v2); } -// CHECK-LABEL: @test_vhaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) +// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) { return vhaddq_u16(v1, v2); } -// CHECK-LABEL: @test_vhaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> 
[[VHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) +// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) { return vhaddq_u32(v1, v2); } -// CHECK-LABEL: @test_vhsub_s8( -// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VHSUB_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vhsub_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]] +// int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) { return vhsub_s8(v1, v2); } -// CHECK-LABEL: @test_vhsub_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %v1, <4 x i16> %v2) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHSUB_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vhsub_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) +// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) { return vhsub_s16(v1, v2); } -// CHECK-LABEL: @test_vhsub_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %v1, <2 x i32> %v2) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHSUB_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vhsub_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) +// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) { return vhsub_s32(v1, v2); } -// CHECK-LABEL: @test_vhsub_u8( -// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VHSUB_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vhsub_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]] +// uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) { return vhsub_u8(v1, v2); } -// CHECK-LABEL: @test_vhsub_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %v1, <4 x i16> %v2) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHSUB_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vhsub_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) +// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) { return vhsub_u16(v1, v2); } -// CHECK-LABEL: @test_vhsub_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %v1, <2 x i32> %v2) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHSUB_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vhsub_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) +// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t 
test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) { return vhsub_u32(v1, v2); } -// CHECK-LABEL: @test_vhsubq_s8( -// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) -// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vhsubq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]]) +// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]] +// int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) { return vhsubq_s8(v1, v2); } -// CHECK-LABEL: @test_vhsubq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %v1, <8 x i16> %v2) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vhsubq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) { return vhsubq_s16(v1, v2); } -// CHECK-LABEL: @test_vhsubq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %v1, <4 x i32> %v2) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vhsubq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) { return vhsubq_s32(v1, v2); } -// CHECK-LABEL: @test_vhsubq_u8( -// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) -// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vhsubq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], 
<16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]]) +// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]] +// uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) { return vhsubq_u8(v1, v2); } -// CHECK-LABEL: @test_vhsubq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %v1, <8 x i16> %v2) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vhsubq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) { return vhsubq_u16(v1, v2); } -// CHECK-LABEL: @test_vhsubq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %v1, <4 x i32> %v2) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vhsubq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) { return vhsubq_u32(v1, v2); } -// CHECK-LABEL: @test_vrhadd_s8( -// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VRHADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]] +// int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) { return vrhadd_s8(v1, v2); } -// CHECK-LABEL: @test_vrhadd_s16( -// 
CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) -// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRHADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrhadd_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) +// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) { return vrhadd_s16(v1, v2); } -// CHECK-LABEL: @test_vrhadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) -// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRHADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) +// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) { return vrhadd_s32(v1, v2); } -// CHECK-LABEL: @test_vrhadd_u8( -// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VRHADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]] +// uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) { return vrhadd_u8(v1, v2); } -// CHECK-LABEL: @test_vrhadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) -// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRHADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x 
i16> @test_vrhadd_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) +// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) { return vrhadd_u16(v1, v2); } -// CHECK-LABEL: @test_vrhadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) -// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRHADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V2]] to <8 x i8> +// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) +// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) { return vrhadd_u32(v1, v2); } -// CHECK-LABEL: @test_vrhaddq_s8( -// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) -// CHECK: ret <16 x i8> [[VRHADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]]) +// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]] +// int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) { return vrhaddq_s8(v1, v2); } -// CHECK-LABEL: @test_vrhaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) -// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x 
i16> +// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) +// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) { return vrhaddq_s16(v1, v2); } -// CHECK-LABEL: @test_vrhaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) -// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) +// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) { return vrhaddq_s32(v1, v2); } -// CHECK-LABEL: @test_vrhaddq_u8( -// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) -// CHECK: ret <16 x i8> [[VRHADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]]) +// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]] +// uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) { return vrhaddq_u8(v1, v2); } -// CHECK-LABEL: @test_vrhaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2) -// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) +// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8> 
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) { return vrhaddq_u16(v1, v2); } -// CHECK-LABEL: @test_vrhaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2) -// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V2]] to <16 x i8> +// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) +// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) { return vrhaddq_u32(v1, v2); } -// CHECK-LABEL: @test_vqadd_s8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]] +// int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { return vqadd_s8(a, b); } -// CHECK-LABEL: @test_vqadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqadd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { return vqadd_s16(a, b); } -// CHECK-LABEL: @test_vqadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqadd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { return vqadd_s32(a, b); } -// CHECK-LABEL: @test_vqadd_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQADD_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqadd_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { return vqadd_s64(a, b); } -// CHECK-LABEL: @test_vqadd_u8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqadd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]] +// uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { return vqadd_u8(a, b); } -// CHECK-LABEL: @test_vqadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { return vqadd_u16(a, b); } -// CHECK-LABEL: @test_vqadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { return vqadd_u32(a, b); } -// CHECK-LABEL: @test_vqadd_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQADD_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqadd_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { return vqadd_u64(a, b); } -// CHECK-LABEL: @test_vqaddq_s8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQADDQ_V_I]] +// CHECK-LABEL: 
define dso_local <16 x i8> @test_vqaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]] +// int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) { return vqaddq_s8(a, b); } -// CHECK-LABEL: @test_vqaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) { return vqaddq_s16(a, b); } -// CHECK-LABEL: @test_vqaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) { return vqaddq_s32(a, b); } -// CHECK-LABEL: @test_vqaddq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQADDQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) { return vqaddq_s64(a, b); } -// CHECK-LABEL: @test_vqaddq_u8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]] +// uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) { return vqaddq_u8(a, b); } -// CHECK-LABEL: @test_vqaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) { return vqaddq_u16(a, b); } -// CHECK-LABEL: @test_vqaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> 
[[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) { return vqaddq_u32(a, b); } -// CHECK-LABEL: @test_vqaddq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQADDQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqaddq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) { return vqaddq_u64(a, b); } -// CHECK-LABEL: @test_vqsub_s8( -// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQSUB_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqsub_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]] +// int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); } -// CHECK-LABEL: @test_vqsub_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqsub_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) +// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { return vqsub_s16(a, b); } -// CHECK-LABEL: @test_vqsub_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> 
%b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqsub_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) +// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); } -// CHECK-LABEL: @test_vqsub_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSUB_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqsub_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) +// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); } -// CHECK-LABEL: @test_vqsub_u8( -// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQSUB_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqsub_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]] +// uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); } -// CHECK-LABEL: @test_vqsub_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqsub_u16( +// CHECK-SAME: <4 x i16> noundef 
[[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) +// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); } -// CHECK-LABEL: @test_vqsub_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqsub_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) +// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); } -// CHECK-LABEL: @test_vqsub_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSUB_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqsub_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) +// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) { return vqsub_u64(a, b); } -// CHECK-LABEL: @test_vqsubq_s8( -// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x 
i8> %b) -// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqsubq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]] +// int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); } -// CHECK-LABEL: @test_vqsubq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqsubq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) +// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { return vqsubq_s16(a, b); } -// CHECK-LABEL: @test_vqsubq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqsubq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) +// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { return vqsubq_s32(a, b); } -// CHECK-LABEL: @test_vqsubq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqsubq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] 
{ +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) +// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); } -// CHECK-LABEL: @test_vqsubq_u8( -// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqsubq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]] +// uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); } -// CHECK-LABEL: @test_vqsubq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqsubq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) +// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { return vqsubq_u16(a, b); } -// CHECK-LABEL: @test_vqsubq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqsubq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) +// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { return vqsubq_u32(a, b); } -// CHECK-LABEL: @test_vqsubq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqsubq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) +// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); } -// CHECK-LABEL: @test_vshl_s8( -// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]] +// int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) { return vshl_s8(a, b); } -// CHECK-LABEL: @test_vshl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { return vshl_s16(a, b); } -// CHECK-LABEL: @test_vshl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// 
CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { return vshl_s32(a, b); } -// CHECK-LABEL: @test_vshl_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { return vshl_s64(a, b); } -// CHECK-LABEL: @test_vshl_u8( -// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]] +// uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); } -// CHECK-LABEL: @test_vshl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> 
noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) { return vshl_u16(a, b); } -// CHECK-LABEL: @test_vshl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) { return vshl_u32(a, b); } -// CHECK-LABEL: @test_vshl_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) { return vshl_u64(a, b); } -// CHECK-LABEL: @test_vshlq_s8( -// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VSHLQ_V_I]] +// CHECK-LABEL: define 
dso_local <16 x i8> @test_vshlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]] +// int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) { return vshlq_s8(a, b); } -// CHECK-LABEL: @test_vshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) +// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) { return vshlq_s16(a, b); } -// CHECK-LABEL: @test_vshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) +// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) { return vshlq_s32(a, b); } -// CHECK-LABEL: @test_vshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) +// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) { return vshlq_s64(a, b); } -// CHECK-LABEL: @test_vshlq_u8( -// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VSHLQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]] +// uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) { return vshlq_u8(a, b); } -// CHECK-LABEL: @test_vshlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) +// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) { return vshlq_u16(a, b); } -// CHECK-LABEL: @test_vshlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) +// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) { return vshlq_u32(a, b); } -// CHECK-LABEL: @test_vshlq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) +// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) { return vshlq_u64(a, b); } -// CHECK-LABEL: @test_vqshl_s8( -// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]] +// int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) { return vqshl_s8(a, b); } -// CHECK-LABEL: @test_vqshl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) { return vqshl_s16(a, b); } -// CHECK-LABEL: @test_vqshl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: 
[[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) { return vqshl_s32(a, b); } -// CHECK-LABEL: @test_vqshl_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) { return vqshl_s64(a, b); } -// CHECK-LABEL: @test_vqshl_u8( -// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]] +// uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) { return vqshl_u8(a, b); } -// CHECK-LABEL: @test_vqshl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to 
<8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) { return vqshl_u16(a, b); } -// CHECK-LABEL: @test_vqshl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) { return vqshl_u32(a, b); } -// CHECK-LABEL: @test_vqshl_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) { return vqshl_u64(a, b); } -// CHECK-LABEL: @test_vqshlq_s8( -// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_s8( +// CHECK-SAME: <16 x i8> noundef 
[[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]] +// int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) { return vqshlq_s8(a, b); } -// CHECK-LABEL: @test_vqshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) { return vqshlq_s16(a, b); } -// CHECK-LABEL: @test_vqshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) { return vqshlq_s32(a, b); } -// CHECK-LABEL: @test_vqshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
   return vqshlq_s64(a, b);
 }
-// CHECK-LABEL: @test_vqshlq_u8(
-// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]]
+//
 uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqshlq_u8(a, b);
 }
-// CHECK-LABEL: @test_vqshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqshlq_u16(a, b);
 }
-// CHECK-LABEL: @test_vqshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqshlq_u32(a, b);
 }
-// CHECK-LABEL: @test_vqshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]])
+// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqshlq_u64(a, b);
 }
-// CHECK-LABEL: @test_vrshl_s8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
   return vrshl_s8(a, b);
 }
-// CHECK-LABEL: @test_vrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
   return vrshl_s16(a, b);
 }
-// CHECK-LABEL: @test_vrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
   return vrshl_s32(a, b);
 }
-// CHECK-LABEL: @test_vrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
   return vrshl_s64(a, b);
 }
-// CHECK-LABEL: @test_vrshl_u8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
 uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
   return vrshl_u8(a, b);
 }
-// CHECK-LABEL: @test_vrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
   return vrshl_u16(a, b);
 }
-// CHECK-LABEL: @test_vrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
   return vrshl_u32(a, b);
 }
-// CHECK-LABEL: @test_vrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
   return vrshl_u64(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_s8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
 int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
   return vrshlq_s8(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
   return vrshlq_s16(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
   return vrshlq_s32(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
   return vrshlq_s64(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_u8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
 uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vrshlq_u8(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vrshlq_u16(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vrshlq_u32(a, b);
 }
-// CHECK-LABEL: @test_vrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vrshlq_u64(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_s8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
 int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
   return vqrshl_s8(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
   return vqrshl_s16(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
   return vqrshl_s32(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
   return vqrshl_s64(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_u8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
 uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
   return vqrshl_u8(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
   return vqrshl_u16(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
   return vqrshl_u32(a, b);
 }
-// CHECK-LABEL: @test_vqrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]])
+// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
   return vqrshl_u64(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_s8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
 int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
   return vqrshlq_s8(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
   return vqrshlq_s16(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
   return vqrshlq_s32(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
   return vqrshlq_s64(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_u8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
 uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqrshlq_u8(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqrshlq_u16(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqrshlq_u32(a, b);
 }
-// CHECK-LABEL: @test_vqrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]])
+// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqrshlq_u64(a, b);
 }
-// CHECK-LABEL: @test_vsli_n_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
 poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) {
   return vsli_n_p64(a, b, 0);
 }
-// CHECK-LABEL: @test_vsliq_n_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0)
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0)
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
 poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) {
   return vsliq_n_p64(a, b, 0);
 }
-// CHECK-LABEL: @test_vmax_s8(
-// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmax_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMAX_I]]
+//
 int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
   return vmax_s8(a, b);
 }
-// CHECK-LABEL: @test_vmax_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmax_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VMAX2_I]]
+//
 int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
   return vmax_s16(a, b);
 }
-// CHECK-LABEL: @test_vmax_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmax_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VMAX2_I]]
+//
 int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
   return vmax_s32(a, b);
 }
-// CHECK-LABEL: @test_vmax_u8(
-// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmax_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMAX_I]]
+//
 uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
   return vmax_u8(a, b);
 }
-// CHECK-LABEL: @test_vmax_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmax_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VMAX2_I]]
+//
 uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
   return vmax_u16(a, b);
 }
-// CHECK-LABEL: @test_vmax_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmax_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VMAX2_I]]
+//
 uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
   return vmax_u32(a, b);
 }
-// CHECK-LABEL: @test_vmax_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmax_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]])
+// CHECK-NEXT: ret <2 x float> [[VMAX2_I]]
+//
 float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
   return vmax_f32(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_s8(
-// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmaxq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMAX_I]]
+//
 int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
   return vmaxq_s8(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmaxq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMAX2_I]]
+//
 int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
   return vmaxq_s16(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmaxq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMAX2_I]]
+//
 int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
   return vmaxq_s32(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_u8(
-// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmaxq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMAX_I]]
+//
 uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
   return vmaxq_u8(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmaxq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMAX2_I]]
+//
 uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
   return vmaxq_u16(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmaxq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMAX2_I]]
+//
 uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
   return vmaxq_u32(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmaxq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]])
+// CHECK-NEXT: ret <4 x float> [[VMAX2_I]]
+//
 float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
   return vmaxq_f32(a, b);
 }
-// CHECK-LABEL: @test_vmaxq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmaxq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[VMAX_I]], <2 x double> [[VMAX1_I]])
+// CHECK-NEXT: ret <2 x double> [[VMAX2_I]]
+//
 float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) {
   return vmaxq_f64(a, b);
 }
-// CHECK-LABEL: @test_vmin_s8(
-// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmin_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMIN_I]]
+//
 int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
   return vmin_s8(a, b);
 }
-// CHECK-LABEL: @test_vmin_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmin_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VMIN2_I]]
+//
 int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
   return vmin_s16(a, b);
 }
-// CHECK-LABEL: @test_vmin_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmin_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VMIN2_I]]
+//
 int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
   return vmin_s32(a, b);
 }
-// CHECK-LABEL: @test_vmin_u8(
-// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmin_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMIN_I]]
+//
 uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
   return vmin_u8(a, b);
 }
-// CHECK-LABEL: @test_vmin_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmin_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VMIN2_I]]
+//
 uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); }
-// CHECK-LABEL: @test_vmin_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmin_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VMIN2_I]]
+//
 uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); }
-// CHECK-LABEL: @test_vmin_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmin_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]])
+// CHECK-NEXT: ret <2 x float> [[VMIN2_I]]
+//
 float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); }
-// CHECK-LABEL: @test_vminq_s8(
-// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vminq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMIN_I]]
+//
 int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); }
-// CHECK-LABEL: @test_vminq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vminq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMIN2_I]]
+//
 int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); }
-// CHECK-LABEL: @test_vminq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vminq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMIN2_I]]
+//
 int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); }
-// CHECK-LABEL: @test_vminq_u8(
-// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vminq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMIN_I]]
+//
 uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); }
-// CHECK-LABEL: @test_vminq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vminq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMIN2_I]]
+//
 uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); }
-// CHECK-LABEL: @test_vminq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vminq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMIN2_I]]
+//
 uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); }
-// CHECK-LABEL: @test_vminq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vminq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]])
+// CHECK-NEXT: ret <4 x float> [[VMIN2_I]]
+//
 float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); }
-// CHECK-LABEL: @test_vminq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vminq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[VMIN_I]], <2 x double> [[VMIN1_I]])
+// CHECK-NEXT: ret <2 x double> [[VMIN2_I]]
+//
 float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) { return vminq_f64(a, b); }
-// CHECK-LABEL: @test_vmaxnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmaxnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[VMAXNM_I]], <2 x float> [[VMAXNM1_I]])
+// CHECK-NEXT: ret <2 x float> [[VMAXNM2_I]]
+//
 float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { return vmaxnm_f32(a, b); }
-// CHECK-LABEL: @test_vmaxnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmaxnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[VMAXNM_I]], <4 x float> [[VMAXNM1_I]])
+// CHECK-NEXT: ret <4 x float> [[VMAXNM2_I]]
+//
 float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { return vmaxnmq_f32(a, b); }
-// CHECK-LABEL: @test_vmaxnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmaxnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[VMAXNM_I]], <2 x double> [[VMAXNM1_I]])
+// CHECK-NEXT: ret <2 x double> [[VMAXNM2_I]]
+//
 float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) { return vmaxnmq_f64(a, b); }
-// CHECK-LABEL: @test_vminnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vminnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[VMINNM_I]], <2 x float> [[VMINNM1_I]])
+// CHECK-NEXT: ret <2 x float> [[VMINNM2_I]]
+//
 float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { return vminnm_f32(a, b); }
-// CHECK-LABEL: @test_vminnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vminnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[VMINNM_I]], <4 x float> [[VMINNM1_I]])
+// CHECK-NEXT: ret <4 x float> [[VMINNM2_I]]
+//
 float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { return vminnmq_f32(a, b); }
-// CHECK-LABEL: @test_vminnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vminnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[VMINNM_I]], <2 x double> [[VMINNM1_I]])
+// CHECK-NEXT: ret <2 x double> [[VMINNM2_I]]
+//
 float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) { return vminnmq_f64(a, b); }
-// CHECK-LABEL: @test_vpmax_s8(
-// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmax_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMAX_I]]
+//
 int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); }
-// CHECK-LABEL: @test_vpmax_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmax_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VPMAX2_I]]
+//
 int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); }
-// CHECK-LABEL: @test_vpmax_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmax_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VPMAX2_I]]
+//
 int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); }
-// CHECK-LABEL: @test_vpmax_u8(
-// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmax_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMAX_I]]
+//
 uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); }
-// CHECK-LABEL: @test_vpmax_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmax_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VPMAX2_I]]
+//
 uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); }
-// CHECK-LABEL: @test_vpmax_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmax_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VPMAX2_I]]
+//
 uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); }
-// CHECK-LABEL: @test_vpmax_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpmax_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> [[VPMAX1_I]])
+// CHECK-NEXT: ret <2 x float> [[VPMAX2_I]]
+//
 float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); }
-// CHECK-LABEL: @test_vpmaxq_s8(
-// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpmaxq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMAX_I]]
+//
 int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) { return vpmaxq_s8(a, b); }
-// CHECK-LABEL: @test_vpmaxq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpmaxq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VPMAX2_I]]
+//
 int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) { return vpmaxq_s16(a, b); }
-// CHECK-LABEL: @test_vpmaxq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpmaxq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VPMAX2_I]]
+//
 int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) { return vpmaxq_s32(a, b); }
-// CHECK-LABEL: @test_vpmaxq_u8(
-// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMAX_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpmaxq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMAX_I]]
+//
 uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) { return vpmaxq_u8(a, b); }
-// CHECK-LABEL: @test_vpmaxq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpmaxq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VPMAX2_I]]
+//
 uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) { return vpmaxq_u16(a, b); }
-// CHECK-LABEL: @test_vpmaxq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpmaxq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VPMAX2_I]]
+//
 uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) { return vpmaxq_u32(a, b); }
-// CHECK-LABEL: @test_vpmaxq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpmaxq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[VPMAX_I]], <4 x float> [[VPMAX1_I]])
+// CHECK-NEXT: ret <4 x float> [[VPMAX2_I]]
+//
 float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) { return vpmaxq_f32(a, b); }
-// CHECK-LABEL: @test_vpmaxq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMAX2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpmaxq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[VPMAX_I]], <2 x double> [[VPMAX1_I]])
+// CHECK-NEXT: ret <2 x double> [[VPMAX2_I]]
+//
 float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) { return vpmaxq_f64(a, b); }
-// CHECK-LABEL: @test_vpmin_s8(
-// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmin_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMIN_I]]
+//
 int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); }
-// CHECK-LABEL: @test_vpmin_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmin_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VPMIN2_I]]
+//
 int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); }
-// CHECK-LABEL: @test_vpmin_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmin_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VPMIN2_I]]
+//
 int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); }
-// CHECK-LABEL: @test_vpmin_u8(
-// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpmin_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPMIN_I]]
+//
 uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); }
-// CHECK-LABEL: @test_vpmin_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpmin_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VPMIN2_I]]
+//
 uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); }
-// CHECK-LABEL: @test_vpmin_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpmin_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VPMIN2_I]]
+//
 uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); }
-// CHECK-LABEL: @test_vpmin_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpmin_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]])
+// CHECK-NEXT: ret <2 x float> [[VPMIN2_I]]
+//
 float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); }
-// CHECK-LABEL: @test_vpminq_s8(
-// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpminq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMIN_I]]
+//
 int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) { return vpminq_s8(a, b); }
-// CHECK-LABEL: @test_vpminq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpminq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VPMIN2_I]]
+//
 int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) { return vpminq_s16(a, b); }
-// CHECK-LABEL: @test_vpminq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpminq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VPMIN2_I]]
+//
 int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) { return vpminq_s32(a, b); }
-// CHECK-LABEL: @test_vpminq_u8(
-// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VPMIN_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vpminq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VPMIN_I]]
+//
 uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) { return vpminq_u8(a, b); }
-// CHECK-LABEL: @test_vpminq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpminq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VPMIN2_I]]
+//
 uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) { return vpminq_u16(a, b); }
-// CHECK-LABEL: @test_vpminq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpminq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VPMIN2_I]]
+//
 uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) { return vpminq_u32(a, b); }
-// CHECK-LABEL: @test_vpminq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpminq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[VPMIN_I]], <4 x float> [[VPMIN1_I]])
+// CHECK-NEXT: ret <4 x float> [[VPMIN2_I]]
+//
 float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) { return vpminq_f32(a, b); }
-// CHECK-LABEL: @test_vpminq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMIN2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpminq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[VPMIN_I]], <2 x double> [[VPMIN1_I]])
+// CHECK-NEXT: ret <2 x double> [[VPMIN2_I]]
+//
 float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) { return vpminq_f64(a, b); }
-// CHECK-LABEL: @test_vpmaxnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpmaxnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VPMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VPMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[VPMAXNM_I]], <2 x float> [[VPMAXNM1_I]])
+// CHECK-NEXT: ret <2 x float> [[VPMAXNM2_I]]
+//
 float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) { return vpmaxnm_f32(a, b); }
-// CHECK-LABEL: @test_vpmaxnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMAXNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpmaxnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[VPMAXNM_I]], <4 x float> [[VPMAXNM1_I]])
+// CHECK-NEXT: ret <4 x float> [[VPMAXNM2_I]]
+//
 float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) { return vpmaxnmq_f32(a, b); }
-// CHECK-LABEL: @test_vpmaxnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMAXNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpmaxnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[VPMAXNM_I]], <2 x double> [[VPMAXNM1_I]])
+// CHECK-NEXT: ret <2 x double> [[VPMAXNM2_I]]
+//
 float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) { return vpmaxnmq_f64(a, b); }
-// CHECK-LABEL: @test_vpminnm_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x float> [[VPMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpminnm_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VPMINNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VPMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[VPMINNM_I]], <2 x float> [[VPMINNM1_I]])
+// CHECK-NEXT: ret <2 x float> [[VPMINNM2_I]]
+//
 float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) { return vpminnm_f32(a, b); }
-// CHECK-LABEL: @test_vpminnmq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x float> [[VPMINNM2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vpminnmq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[VPMINNM_I]], <4 x float> [[VPMINNM1_I]])
+// CHECK-NEXT: ret <4 x float> [[VPMINNM2_I]]
+//
 float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) { return vpminnmq_f32(a, b); }
-// CHECK-LABEL: @test_vpminnmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %a, <2 x double> %b)
-// CHECK: ret <2 x double> [[VPMINNM2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vpminnmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[VPMINNM_I]], <2 x double> [[VPMINNM1_I]])
+// CHECK-NEXT: ret <2 x double> [[VPMINNM2_I]]
+//
 float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) { return vpminnmq_f64(a, b); }
-// CHECK-LABEL: @test_vpadd_s8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
 int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { return vpadd_s8(a, b); }
-// CHECK-LABEL: @test_vpadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { return vpadd_s16(a, b); }
-// CHECK-LABEL: @test_vpadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { return vpadd_s32(a, b); }
-// CHECK-LABEL: @test_vpadd_u8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vpadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
 uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { return vpadd_u8(a, b); }
-// CHECK-LABEL: @test_vpadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { return vpadd_u16(a, b); }
-// CHECK-LABEL: @test_vpadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]])
+// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { return vpadd_u32(a, b); }
-// CHECK-LABEL: @test_vpadd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VPADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vpadd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) +// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { return vpadd_f32(a, b); } -// CHECK-LABEL: @test_vpaddq_s8( -// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPADDQ_V_I]] +// int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) { return vpaddq_s8(a, b); } -// CHECK-LABEL: @test_vpaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) +// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) { return vpaddq_s16(a, b); } -// CHECK-LABEL: @test_vpaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) +// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) { return vpaddq_s32(a, b); } -// CHECK-LABEL: @test_vpaddq_u8( -// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPADDQ_V_I]] +// uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) { return vpaddq_u8(a, b); } -// CHECK-LABEL: @test_vpaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) +// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vpaddq_u16(a, b); } -// CHECK-LABEL: @test_vpaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) +// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) { return vpaddq_u32(a, b); } -// CHECK-LABEL: 
@test_vpaddq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vpaddq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> [[VPADDQ_V_I]], <4 x float> [[VPADDQ_V1_I]]) +// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) { return vpaddq_f32(a, b); } -// CHECK-LABEL: @test_vpaddq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x double> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vpaddq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> [[VPADDQ_V_I]], <2 x double> [[VPADDQ_V1_I]]) +// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP5]] +// float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) { return vpaddq_f64(a, b); } -// CHECK-LABEL: @test_vqdmulh_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQDMULH_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqdmulh_s16( +// CHECK-SAME: <4 x i16> 
noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) +// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) { return vqdmulh_s16(a, b); } -// CHECK-LABEL: @test_vqdmulh_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQDMULH_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqdmulh_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) +// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) { return vqdmulh_s32(a, b); } -// CHECK-LABEL: @test_vqdmulhq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqdmulhq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) +// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) { return vqdmulhq_s16(a, b); } -// CHECK-LABEL: @test_vqdmulhq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> 
%b to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmulhq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) +// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) { return vqdmulhq_s32(a, b); } -// CHECK-LABEL: @test_vqrdmulh_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqrdmulh_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) +// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_s16(a, b); } -// CHECK-LABEL: @test_vqrdmulh_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqrdmulh_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) +// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> 
[[VQRDMULH_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_s32(a, b); } -// CHECK-LABEL: @test_vqrdmulhq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqrdmulhq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) +// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { return vqrdmulhq_s16(a, b); } -// CHECK-LABEL: @test_vqrdmulhq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqrdmulhq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) +// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); } -// CHECK-LABEL: @test_vmulx_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VMULX2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vmulx_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) +// CHECK-NEXT: ret <2 x float> [[VMULX2_I]] +// float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) { return vmulx_f32(a, b); } -// CHECK-LABEL: @test_vmulxq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VMULX2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vmulxq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) +// CHECK-NEXT: ret <4 x float> [[VMULX2_I]] +// float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) { return vmulxq_f32(a, b); } -// CHECK-LABEL: @test_vmulxq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VMULX2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vmulxq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) +// CHECK-NEXT: ret <2 x double> [[VMULX2_I]] +// float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) { return vmulxq_f64(a, b); } -// CHECK-LABEL: @test_vshl_n_s8( -// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHL_N]] +// int8x8_t test_vshl_n_s8(int8x8_t a) { return vshl_n_s8(a, 3); } -// CHECK-LABEL: @test_vshl_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to 
<4 x i16> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHL_N]] +// int16x4_t test_vshl_n_s16(int16x4_t a) { return vshl_n_s16(a, 3); } -// CHECK-LABEL: @test_vshl_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHL_N]] +// int32x2_t test_vshl_n_s32(int32x2_t a) { return vshl_n_s32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s8( -// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHL_N]] +// int8x16_t test_vshlq_n_s8(int8x16_t a) { return vshlq_n_s8(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHL_N]] +// int16x8_t test_vshlq_n_s16(int16x8_t a) { return vshlq_n_s16(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSHL_N]] +// int32x4_t test_vshlq_n_s32(int32x4_t a) { return vshlq_n_s32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3) -// CHECK: ret <2 x i64> [[VSHL_N]] +// CHECK-LABEL: define dso_local 
<2 x i64> @test_vshlq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3) +// CHECK-NEXT: ret <2 x i64> [[VSHL_N]] +// int64x2_t test_vshlq_n_s64(int64x2_t a) { return vshlq_n_s64(a, 3); } -// CHECK-LABEL: @test_vshl_n_u8( -// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHL_N]] +// uint8x8_t test_vshl_n_u8(uint8x8_t a) { return vshl_n_u8(a, 3); } -// CHECK-LABEL: @test_vshl_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHL_N]] +// uint16x4_t test_vshl_n_u16(uint16x4_t a) { return vshl_n_u16(a, 3); } -// CHECK-LABEL: @test_vshl_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHL_N]] +// uint32x2_t test_vshl_n_u32(uint32x2_t a) { return vshl_n_u32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u8( -// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHL_N]] +// uint8x16_t test_vshlq_n_u8(uint8x16_t a) { return vshlq_n_u8(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHL_N]] +// uint16x8_t test_vshlq_n_u16(uint16x8_t a) { return 
vshlq_n_u16(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSHL_N]] +// uint32x4_t test_vshlq_n_u32(uint32x4_t a) { return vshlq_n_u32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3) -// CHECK: ret <2 x i64> [[VSHL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3) +// CHECK-NEXT: ret <2 x i64> [[VSHL_N]] +// uint64x2_t test_vshlq_n_u64(uint64x2_t a) { return vshlq_n_u64(a, 3); } -// CHECK-LABEL: @test_vshr_n_s8( -// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshr_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHR_N]] +// int8x8_t test_vshr_n_s8(int8x8_t a) { return vshr_n_s8(a, 3); } -// CHECK-LABEL: @test_vshr_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshr_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHR_N]] +// int16x4_t test_vshr_n_s16(int16x4_t a) { return vshr_n_s16(a, 3); } -// CHECK-LABEL: @test_vshr_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshr_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHR_N]] +// int32x2_t test_vshr_n_s32(int32x2_t a) { return vshr_n_s32(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s8( -// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHR_N]] 
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHR_N]] +// int8x16_t test_vshrq_n_s8(int8x16_t a) { return vshrq_n_s8(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshrq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHR_N]] +// int16x8_t test_vshrq_n_s16(int16x8_t a) { return vshrq_n_s16(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshrq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSHR_N]] +// int32x4_t test_vshrq_n_s32(int32x4_t a) { return vshrq_n_s32(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 3) -// CHECK: ret <2 x i64> [[VSHR_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshrq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 3) +// CHECK-NEXT: ret <2 x i64> [[VSHR_N]] +// int64x2_t test_vshrq_n_s64(int64x2_t a) { return vshrq_n_s64(a, 3); } -// CHECK-LABEL: @test_vshr_n_u8( -// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshr_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHR_N]] +// uint8x8_t test_vshr_n_u8(uint8x8_t a) { return vshr_n_u8(a, 3); } -// CHECK-LABEL: @test_vshr_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshr_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x 
i16> +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHR_N]] +// uint16x4_t test_vshr_n_u16(uint16x4_t a) { return vshr_n_u16(a, 3); } -// CHECK-LABEL: @test_vshr_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshr_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHR_N]] +// uint32x2_t test_vshr_n_u32(uint32x2_t a) { return vshr_n_u32(a, 3); } -// CHECK-LABEL: @test_vshrq_n_u8( -// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHR_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshrq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHR_N]] +// uint8x16_t test_vshrq_n_u8(uint8x16_t a) { return vshrq_n_u8(a, 3); } -// CHECK-LABEL: @test_vshrq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshrq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHR_N]] +// uint16x8_t test_vshrq_n_u16(uint16x8_t a) { return vshrq_n_u16(a, 3); } -// CHECK-LABEL: @test_vshrq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshrq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSHR_N]] +// uint32x4_t test_vshrq_n_u32(uint32x4_t a) { return vshrq_n_u32(a, 3); } -// CHECK-LABEL: @test_vshrq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 3) -// CHECK: ret <2 x i64> [[VSHR_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshrq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 3) +// CHECK-NEXT: ret <2 x i64> [[VSHR_N]] +// 
uint64x2_t test_vshrq_n_u64(uint64x2_t a) { return vshrq_n_u64(a, 3); } -// CHECK-LABEL: @test_vsra_n_s8( -// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, splat (i8 3) -// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsra_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i8> [[B]], splat (i8 3) +// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]] +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vsra_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 3) -// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i16> [[TMP4]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsra_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i16> [[TMP4]] +// int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { return vsra_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vsra_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 3) -// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i32> [[TMP4]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsra_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i32> [[TMP4]] +// int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { return vsra_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_s8( -// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, splat (i8 3) -// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsraq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <16 x i8> [[B]], splat (i8 3) +// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t 
test_vsraq_n_s8(int8x16_t a, int8x16_t b) { return vsraq_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 3) -// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <8 x i16> [[TMP4]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsraq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { return vsraq_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 3) -// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i32> [[TMP4]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsraq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { return vsraq_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 3) -// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i64> [[TMP4]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsraq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i64> [[TMP4]] +// int64x2_t test_vsraq_n_s64(int64x2_t a, 
int64x2_t b) { return vsraq_n_s64(a, b, 3); } -// CHECK-LABEL: @test_vsra_n_u8( -// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, splat (i8 3) -// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsra_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i8> [[B]], splat (i8 3) +// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]] +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) { return vsra_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vsra_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 3) -// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i16> [[TMP4]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsra_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i16> [[TMP4]] +// uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) { return vsra_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vsra_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 3) -// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i32> [[TMP4]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsra_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i32> [[TMP4]] +// uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) { return vsra_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_u8( -// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, splat (i8 3) -// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsraq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <16 x i8> [[B]], splat (i8 3) +// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t 
test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vsraq_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 3) -// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <8 x i16> [[TMP4]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsraq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vsraq_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 3) -// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i32> [[TMP4]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsraq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vsraq_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vsraq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 3) -// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i64> [[TMP4]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsraq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 3) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i64> [[TMP4]] +// uint64x2_t 
test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vsraq_n_u64(a, b, 3); } -// CHECK-LABEL: @test_vrshr_n_s8( -// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -3)) -// CHECK: ret <8 x i8> [[VRSHR_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrshr_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -3)) +// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]] +// int8x8_t test_vrshr_n_s8(int8x8_t a) { return vrshr_n_s8(a, 3); } -// CHECK-LABEL: @test_vrshr_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) -// CHECK: ret <4 x i16> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrshr_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) +// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]] +// int16x4_t test_vrshr_n_s16(int16x4_t a) { return vrshr_n_s16(a, 3); } -// CHECK-LABEL: @test_vrshr_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) -// CHECK: ret <2 x i32> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrshr_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) +// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]] +// int32x2_t test_vrshr_n_s32(int32x2_t a) { return vrshr_n_s32(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_s8( -// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -3)) -// CHECK: ret <16 x i8> [[VRSHR_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -3)) +// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]] +// int8x16_t test_vrshrq_n_s8(int8x16_t a) { return vrshrq_n_s8(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) -// CHECK: ret <8 x i16> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to 
<8 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) +// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]] +// int16x8_t test_vrshrq_n_s16(int16x8_t a) { return vrshrq_n_s16(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) -// CHECK: ret <4 x i32> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) +// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]] +// int32x4_t test_vrshrq_n_s32(int32x4_t a) { return vrshrq_n_s32(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) -// CHECK: ret <2 x i64> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vrshrq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) +// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]] +// int64x2_t test_vrshrq_n_s64(int64x2_t a) { return vrshrq_n_s64(a, 3); } -// CHECK-LABEL: @test_vrshr_n_u8( -// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -3)) -// CHECK: ret <8 x i8> [[VRSHR_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrshr_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -3)) +// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]] +// uint8x8_t test_vrshr_n_u8(uint8x8_t a) { return vrshr_n_u8(a, 3); } -// CHECK-LABEL: @test_vrshr_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) -// CHECK: ret <4 x i16> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrshr_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) +// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]] +// uint16x4_t test_vrshr_n_u16(uint16x4_t a) { return vrshr_n_u16(a, 3); } -// CHECK-LABEL: @test_vrshr_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) -// CHECK: ret <2 x i32> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrshr_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) +// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]] +// uint32x2_t test_vrshr_n_u32(uint32x2_t a) { return vrshr_n_u32(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_u8( -// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -3)) -// CHECK: ret <16 x i8> [[VRSHR_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -3)) +// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]] +// uint8x16_t test_vrshrq_n_u8(uint8x16_t a) { return vrshrq_n_u8(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) -// CHECK: ret <8 x i16> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) +// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]] +// uint16x8_t test_vrshrq_n_u16(uint16x8_t a) { return vrshrq_n_u16(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) -// CHECK: ret <4 x i32> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) +// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]] +// uint32x4_t test_vrshrq_n_u32(uint32x4_t a) { return vrshrq_n_u32(a, 3); } -// CHECK-LABEL: @test_vrshrq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) -// CHECK: ret <2 x i64> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vrshrq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) +// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]] +// uint64x2_t test_vrshrq_n_u64(uint64x2_t a) { return vrshrq_n_u64(a, 3); } -// CHECK-LABEL: @test_vrsra_n_s8( -// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -3)) -// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrsra_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -3)) +// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VRSHR_N]] +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { return vrsra_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vrsra_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <4 x i16> [[TMP3]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrsra_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <4 x i16> [[TMP3]] +// int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { return vrsra_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vrsra_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <2 x i32> [[TMP3]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrsra_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], 
[[VRSHR_N1]] +// CHECK-NEXT: ret <2 x i32> [[TMP3]] +// int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { return vrsra_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_s8( -// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -3)) -// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrsraq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -3)) +// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VRSHR_N]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <8 x i16> [[TMP3]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrsraq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrsraq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_s64( -// CHECK: [[TMP0:%.*]] = 
bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <2 x i64> [[TMP3]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vrsraq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { return vrsraq_n_s64(a, b, 3); } -// CHECK-LABEL: @test_vrsra_n_u8( -// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -3)) -// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]] -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrsra_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -3)) +// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VRSHR_N]] +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vrsra_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <4 x i16> [[TMP3]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrsra_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <4 x i16> [[TMP3]] +// uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vrsra_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 
x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <2 x i32> [[TMP3]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrsra_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <2 x i32> [[TMP3]] +// uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_u8( -// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -3)) -// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]] -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrsraq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -3)) +// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VRSHR_N]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vrsraq_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <8 x i16> [[TMP3]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrsraq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vrsraq_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <4 x i32> [[TMP3]] 
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsraq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vrsraq_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vrsraq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <2 x i64> [[TMP3]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vrsraq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vrsraq_n_u64(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_s8( -// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) -// CHECK: ret <8 x i8> [[VSRI_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <8 x i8> [[VSRI_N]] +// int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3) -// CHECK: ret <4 x i16> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: 
[[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]] +// int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { return vsri_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3) -// CHECK: ret <2 x i32> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsri_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSRI_N2]] +// int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { return vsri_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_s8( -// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) -// CHECK: ret <16 x i8> [[VSRI_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VSRI_N]] +// int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { return vsriq_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3) -// CHECK: ret <8 x i16> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]] +// int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { return vsriq_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> 
[[TMP1]] to <4 x i32> -// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3) -// CHECK: ret <4 x i32> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsriq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSRI_N2]] +// int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { return vsriq_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3) -// CHECK: ret <2 x i64> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]] +// int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { return vsriq_n_s64(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_u8( -// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) -// CHECK: ret <8 x i8> [[VSRI_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <8 x i8> [[VSRI_N]] +// uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) { return vsri_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3) -// CHECK: ret <4 x i16> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSRI_N1:%.*]] = 
bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]] +// uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) { return vsri_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3) -// CHECK: ret <2 x i32> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsri_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSRI_N2]] +// uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) { return vsri_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_u8( -// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) -// CHECK: ret <16 x i8> [[VSRI_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VSRI_N]] +// uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) { return vsriq_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3) -// CHECK: ret <8 x i16> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]] +// uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) { return vsriq_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to 
<4 x i32> -// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3) -// CHECK: ret <4 x i32> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsriq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSRI_N2]] +// uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) { return vsriq_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3) -// CHECK: ret <2 x i64> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]] +// uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) { return vsriq_n_u64(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_p8( -// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) -// CHECK: ret <8 x i8> [[VSRI_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <8 x i8> [[VSRI_N]] +// poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { return vsri_n_p8(a, b, 3); } -// CHECK-LABEL: @test_vsri_n_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 15) -// CHECK: ret <4 x i16> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast 
<8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 15) +// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]] +// poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { return vsri_n_p16(a, b, 15); } -// CHECK-LABEL: @test_vsriq_n_p8( -// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) -// CHECK: ret <16 x i8> [[VSRI_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VSRI_N]] +// poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { return vsriq_n_p8(a, b, 3); } -// CHECK-LABEL: @test_vsriq_n_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 15) -// CHECK: ret <8 x i16> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 15) +// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]] +// poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { return vsriq_n_p16(a, b, 15); } -// CHECK-LABEL: @test_vsli_n_s8( -// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) -// CHECK: ret <8 x i8> [[VSLI_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <8 x i8> [[VSLI_N]] +// int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { return vsli_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vsli_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3) -// CHECK: ret <4 x i16> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = 
bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]] +// int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { return vsli_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vsli_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3) -// CHECK: ret <2 x i32> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsli_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]] +// int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { return vsli_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_s8( -// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) -// CHECK: ret <16 x i8> [[VSLI_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VSLI_N]] +// int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { return vsliq_n_s8(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3) -// CHECK: ret <8 x i16> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]] +// int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { return vsliq_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// 
CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3) -// CHECK: ret <4 x i32> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsliq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]] +// int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { return vsliq_n_s32(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3) -// CHECK: ret <2 x i64> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]] +// int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { return vsliq_n_s64(a, b, 3); } -// CHECK-LABEL: @test_vsli_n_u8( -// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) -// CHECK: ret <8 x i8> [[VSLI_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <8 x i8> [[VSLI_N]] +// uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { return vsli_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vsli_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3) -// CHECK: ret <4 x i16> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to 
<4 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]] +// uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { return vsli_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vsli_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3) -// CHECK: ret <2 x i32> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsli_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]] +// uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { return vsli_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_u8( -// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) -// CHECK: ret <16 x i8> [[VSLI_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VSLI_N]] +// uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { return vsliq_n_u8(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3) -// CHECK: ret <8 x i16> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]] +// uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { return vsliq_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: 
[[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3) -// CHECK: ret <4 x i32> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsliq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]] +// uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { return vsliq_n_u32(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3) -// CHECK: ret <2 x i64> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3) +// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]] +// uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { return vsliq_n_u64(a, b, 3); } -// CHECK-LABEL: @test_vsli_n_p8( -// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3) -// CHECK: ret <8 x i8> [[VSLI_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <8 x i8> [[VSLI_N]] +// poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { return vsli_n_p8(a, b, 3); } -// CHECK-LABEL: @test_vsli_n_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 15) -// CHECK: ret <4 x i16> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to 
<4 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 15) +// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]] +// poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { return vsli_n_p16(a, b, 15); } -// CHECK-LABEL: @test_vsliq_n_p8( -// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3) -// CHECK: ret <16 x i8> [[VSLI_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VSLI_N]] +// poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { return vsliq_n_p8(a, b, 3); } -// CHECK-LABEL: @test_vsliq_n_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 15) -// CHECK: ret <8 x i16> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 15) +// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]] +// poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { return vsliq_n_p16(a, b, 15); } -// CHECK-LABEL: @test_vqshlu_n_s8( -// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 3)) -// CHECK: ret <8 x i8> [[VQSHLU_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqshlu_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 3)) +// CHECK-NEXT: ret <8 x i8> [[VQSHLU_N]] +// uint8x8_t test_vqshlu_n_s8(int8x8_t a) { return vqshlu_n_s8(a, 3); } -// CHECK-LABEL: @test_vqshlu_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 3)) -// CHECK: ret <4 x i16> [[VQSHLU_N1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqshlu_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 3)) +// CHECK-NEXT: ret <4 x i16> [[VQSHLU_N1]] +// uint16x4_t test_vqshlu_n_s16(int16x4_t a) { return vqshlu_n_s16(a, 3); } -// CHECK-LABEL: @test_vqshlu_n_s32( 
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 3))
-// CHECK: ret <2 x i32> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshlu_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 3))
+// CHECK-NEXT: ret <2 x i32> [[VQSHLU_N1]]
+//
 uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
   return vqshlu_n_s32(a, 3);
 }

-// CHECK-LABEL: @test_vqshluq_n_s8(
-// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 3))
-// CHECK: ret <16 x i8> [[VQSHLU_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshluq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 3))
+// CHECK-NEXT: ret <16 x i8> [[VQSHLU_N]]
+//
 uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
   return vqshluq_n_s8(a, 3);
 }

-// CHECK-LABEL: @test_vqshluq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 3))
-// CHECK: ret <8 x i16> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshluq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 3))
+// CHECK-NEXT: ret <8 x i16> [[VQSHLU_N1]]
+//
 uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
   return vqshluq_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqshluq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 3))
-// CHECK: ret <4 x i32> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshluq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 3))
+// CHECK-NEXT: ret <4 x i32> [[VQSHLU_N1]]
+//
 uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
   return vqshluq_n_s32(a, 3);
 }

-// CHECK-LABEL: @test_vqshluq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 3))
-// CHECK: ret <2 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshluq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 3))
+// CHECK-NEXT: ret <2 x i64> [[VQSHLU_N1]]
+//
 uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
   return vqshluq_n_s64(a, 3);
 }

-// CHECK-LABEL: @test_vshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
 int8x8_t test_vshrn_n_s16(int16x8_t a) {
   return vshrn_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
 int16x4_t test_vshrn_n_s32(int32x4_t a) {
   return vshrn_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
 int32x2_t test_vshrn_n_s64(int64x2_t a) {
   return vshrn_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
 uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
   return vshrn_n_u16(a, 3);
 }

-// CHECK-LABEL: @test_vshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
 uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
   return vshrn_n_u32(a, 9);
 }

-// CHECK-LABEL: @test_vshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
 uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
   return vshrn_n_u64(a, 19);
 }

-// CHECK-LABEL: @test_vshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VSHRN_N]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   return vshrn_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VSHRN_N]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   return vshrn_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VSHRN_N]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   return vshrn_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VSHRN_N]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   return vshrn_high_n_u16(a, b, 3);
 }

-// CHECK-LABEL: @test_vshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VSHRN_N]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   return vshrn_high_n_u32(a, b, 9);
 }

-// CHECK-LABEL: @test_vshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VSHRN_N]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vshrn_high_n_u64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRUN_N1]]
+//
 uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
   return vqshrun_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRUN_N1]]
+//
 uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
   return vqshrun_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRUN_N1]]
+//
 uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
   return vqshrun_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqshrun_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrun_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRUN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRUN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqshrun_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqshrun_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrun_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRUN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRUN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) {
   return vqshrun_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqshrun_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrun_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRUN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRUN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) {
   return vqshrun_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 int8x8_t test_vrshrn_n_s16(int16x8_t a) {
   return vrshrn_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 int16x4_t test_vrshrn_n_s32(int32x4_t a) {
   return vrshrn_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 int32x2_t test_vrshrn_n_s64(int64x2_t a) {
   return vrshrn_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
   return vrshrn_n_u16(a, 3);
 }

-// CHECK-LABEL: @test_vrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
   return vrshrn_n_u32(a, 9);
 }

-// CHECK-LABEL: @test_vrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
   return vrshrn_n_u64(a, 19);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VRSHRN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   return vrshrn_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VRSHRN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   return vrshrn_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VRSHRN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   return vrshrn_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VRSHRN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   return vrshrn_high_n_u16(a, b, 3);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VRSHRN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   return vrshrn_high_n_u32(a, b, 9);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VRSHRN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vrshrn_high_n_u64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqrshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRUN_N1]]
+//
 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
   return vqrshrun_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqrshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRUN_N1]]
+//
 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
   return vqrshrun_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqrshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRUN_N1]]
+//
 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
   return vqrshrun_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqrshrun_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrun_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRUN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRUN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqrshrun_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqrshrun_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrun_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRUN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRUN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) {
   return vqrshrun_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqrshrun_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrun_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRUN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRUN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) {
   return vqrshrun_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
   return vqshrn_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
   return vqshrn_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
   return vqshrn_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
   return vqshrn_n_u16(a, 3);
 }

-// CHECK-LABEL: @test_vqshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
   return vqshrn_n_u32(a, 9);
 }

-// CHECK-LABEL: @test_vqshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
   return vqshrn_n_u64(a, 19);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqshrn_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   return vqshrn_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   return vqshrn_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   return vqshrn_high_n_u16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   return vqshrn_high_n_u32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vqshrn_high_n_u64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
   return vqrshrn_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
   return vqrshrn_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
   return vqrshrn_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
   return vqrshrn_n_u16(a, 3);
 }

-// CHECK-LABEL: @test_vqrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
   return vqrshrn_n_u32(a, 9);
 }

-// CHECK-LABEL: @test_vqrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
   return vqrshrn_n_u64(a, 19);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqrshrn_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   return vqrshrn_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   return vqrshrn_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRSHRN_N3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRN_N3]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   return vqrshrn_high_n_u16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRSHRN_N3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRN_N3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   return vqrshrn_high_n_u32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQRSHRN_N3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRN_N3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vqrshrn_high_n_u64(a, b, 19);
 }

-// CHECK-LABEL: @test_vshll_n_s8(
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 int16x8_t test_vshll_n_s8(int8x8_t a) {
   return vshll_n_s8(a, 3);
 }

-//
CHECK-LABEL: @test_vshll_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 9); } -// CHECK-LABEL: @test_vshll_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 19); } -// CHECK-LABEL: @test_vshll_n_u8( -// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 3); } -// CHECK-LABEL: @test_vshll_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 9); } -// CHECK-LABEL: @test_vshll_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: 
define dso_local <2 x i64> @test_vshll_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 19); } -// CHECK-LABEL: @test_vshll_high_n_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// int16x8_t test_vshll_high_n_s8(int8x16_t a) { return vshll_high_n_s8(a, 3); } -// CHECK-LABEL: @test_vshll_high_n_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// int32x4_t test_vshll_high_n_s16(int16x8_t a) { return vshll_high_n_s16(a, 9); } -// CHECK-LABEL: @test_vshll_high_n_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// int64x2_t test_vshll_high_n_s32(int32x4_t a) { return vshll_high_n_s32(a, 19); } -// CHECK-LABEL: 
@test_vshll_high_n_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { return vshll_high_n_u8(a, 3); } -// CHECK-LABEL: @test_vshll_high_n_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// uint32x4_t test_vshll_high_n_u16(uint16x8_t a) { return vshll_high_n_u16(a, 9); } -// CHECK-LABEL: @test_vshll_high_n_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { return vshll_high_n_u32(a, 19); } -// CHECK-LABEL: @test_vmovl_s8( -// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] +// int16x8_t test_vmovl_s8(int8x8_t a) { return vmovl_s8(a); } -// CHECK-LABEL: @test_vmovl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = sext 
<4 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] +// int32x4_t test_vmovl_s16(int16x4_t a) { return vmovl_s16(a); } -// CHECK-LABEL: @test_vmovl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] +// int64x2_t test_vmovl_s32(int32x2_t a) { return vmovl_s32(a); } -// CHECK-LABEL: @test_vmovl_u8( -// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] +// uint16x8_t test_vmovl_u8(uint8x8_t a) { return vmovl_u8(a); } -// CHECK-LABEL: @test_vmovl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] +// uint32x4_t test_vmovl_u16(uint16x4_t a) { return vmovl_u16(a); } -// CHECK-LABEL: @test_vmovl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] +// uint64x2_t test_vmovl_u32(uint32x2_t a) { return vmovl_u32(a); } -// CHECK-LABEL: @test_vmovl_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_high_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t 
test_vmovl_high_s8(int8x16_t a) { return vmovl_high_s8(a); } -// CHECK-LABEL: @test_vmovl_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_high_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vmovl_high_s16(int16x8_t a) { return vmovl_high_s16(a); } -// CHECK-LABEL: @test_vmovl_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_high_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vmovl_high_s32(int32x4_t a) { return vmovl_high_s32(a); } -// CHECK-LABEL: @test_vmovl_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_high_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vmovl_high_u8(uint8x16_t a) { return vmovl_high_u8(a); } -// CHECK-LABEL: @test_vmovl_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_high_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vmovl_high_u16(uint16x8_t a) { return vmovl_high_u16(a); } -// CHECK-LABEL: @test_vmovl_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] 
to <8 x i8> -// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_high_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// uint64x2_t test_vmovl_high_u32(uint32x4_t a) { return vmovl_high_u32(a); } -// CHECK-LABEL: @test_vcvt_n_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) -// CHECK: ret <2 x float> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_n_f32_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <2 x float> [[VCVT_N1]] +// float32x2_t test_vcvt_n_f32_s32(int32x2_t a) { return vcvt_n_f32_s32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) -// CHECK: ret <4 x float> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_n_f32_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <4 x float> [[VCVT_N1]] +// float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { return vcvtq_n_f32_s32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_f64_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) -// CHECK: ret <2 x double> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_n_f64_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) +// CHECK-NEXT: ret <2 x double> [[VCVT_N1]] +// float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) { return vcvtq_n_f64_s64(a, 50); } -// CHECK-LABEL: @test_vcvt_n_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <2 x float> 
@llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) -// CHECK: ret <2 x float> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_n_f32_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <2 x float> [[VCVT_N1]] +// float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { return vcvt_n_f32_u32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) -// CHECK: ret <4 x float> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_n_f32_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <4 x float> [[VCVT_N1]] +// float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { return vcvtq_n_f32_u32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_f64_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) -// CHECK: ret <2 x double> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_n_f64_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50) +// CHECK-NEXT: ret <2 x double> [[VCVT_N1]] +// float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) { return vcvtq_n_f64_u64(a, 50); } -// CHECK-LABEL: @test_vcvt_n_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) -// CHECK: ret <2 x i32> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_n_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]] +// int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { return vcvt_n_s32_f32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVT_N1:%.*]] 
= call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) -// CHECK: ret <4 x i32> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_n_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]] +// int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { return vcvtq_n_s32_f32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) -// CHECK: ret <2 x i64> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_n_s64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) +// CHECK-NEXT: ret <2 x i64> [[VCVT_N1]] +// int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) { return vcvtq_n_s64_f64(a, 50); } -// CHECK-LABEL: @test_vcvt_n_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) -// CHECK: ret <2 x i32> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_n_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]] +// uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { return vcvt_n_u32_f32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) -// CHECK: ret <4 x i32> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_n_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31) +// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]] +// uint32x4_t 
test_vcvtq_n_u32_f32(float32x4_t a) { return vcvtq_n_u32_f32(a, 31); } -// CHECK-LABEL: @test_vcvtq_n_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) -// CHECK: ret <2 x i64> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_n_u64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50) +// CHECK-NEXT: ret <2 x i64> [[VCVT_N1]] +// uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) { return vcvtq_n_u64_f64(a, 50); } -// CHECK-LABEL: @test_vaddl_s8( -// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); } -// CHECK-LABEL: @test_vaddl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); } -// CHECK-LABEL: @test_vaddl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); } -// CHECK-LABEL: @test_vaddl_u8( -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); } -// CHECK-LABEL: @test_vaddl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); } -// CHECK-LABEL: @test_vaddl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x 
i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); } -// CHECK-LABEL: @test_vaddl_high_s8( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> -// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_high_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I12_I]] to <8 x i16> +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) { return vaddl_high_s8(a, b); } -// CHECK-LABEL: @test_vaddl_high_s16( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> -// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_high_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I12_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) { return vaddl_high_s16(a, b); } -// CHECK-LABEL: @test_vaddl_high_s32( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> -// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add 
<2 x i64> [[TMP1]], [[TMP3]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_high_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I12_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) { return vaddl_high_s32(a, b); } -// CHECK-LABEL: @test_vaddl_high_u8( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> -// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_high_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I12_I]] to <8 x i16> +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) { return vaddl_high_u8(a, b); } -// CHECK-LABEL: @test_vaddl_high_u16( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> -// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_high_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I12_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// 
CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) { return vaddl_high_u16(a, b); } -// CHECK-LABEL: @test_vaddl_high_u32( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> -// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_high_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I12_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) { return vaddl_high_u32(a, b); } -// CHECK-LABEL: @test_vaddw_s8( -// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { return vaddw_s8(a, b); } -// CHECK-LABEL: @test_vaddw_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { return vaddw_s16(a, b); } -// CHECK-LABEL: @test_vaddw_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 
x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { return vaddw_s32(a, b); } -// CHECK-LABEL: @test_vaddw_u8( -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { return vaddw_u8(a, b); } -// CHECK-LABEL: @test_vaddw_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { return vaddw_u16(a, b); } -// CHECK-LABEL: @test_vaddw_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { return vaddw_u32(a, b); } -// CHECK-LABEL: @test_vaddw_high_s8( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_high_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: 
[[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) { return vaddw_high_s8(a, b); } -// CHECK-LABEL: @test_vaddw_high_s16( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) { return vaddw_high_s16(a, b); } -// CHECK-LABEL: @test_vaddw_high_s32( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[TMP2]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) { return vaddw_high_s32(a, b); } -// CHECK-LABEL: @test_vaddw_high_u8( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_high_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) { return vaddw_high_u8(a, b); } -// CHECK-LABEL: @test_vaddw_high_u16( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]] -// CHECK: ret <4 x i32> 
[[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP2]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) {
   return vaddw_high_u16(a, b);
 }
-// CHECK-LABEL: @test_vaddw_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[TMP2]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
   return vaddw_high_u32(a, b);
 }
-// CHECK-LABEL: @test_vsubl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
   return vsubl_s8(a, b);
 }
-// CHECK-LABEL: @test_vsubl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
   return vsubl_s16(a, b);
 }
-// CHECK-LABEL: @test_vsubl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
   return vsubl_s32(a, b);
 }
-// CHECK-LABEL: @test_vsubl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
   return vsubl_u8(a, b);
 }
-// CHECK-LABEL: @test_vsubl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
   return vsubl_u16(a, b);
 }
-// CHECK-LABEL: @test_vsubl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
   return vsubl_u32(a, b);
 }
-// CHECK-LABEL: @test_vsubl_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I12_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) {
   return vsubl_high_s8(a, b);
 }
-// CHECK-LABEL: @test_vsubl_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I12_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) {
   return vsubl_high_s16(a, b);
 }
-// CHECK-LABEL: @test_vsubl_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I12_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) {
   return vsubl_high_s32(a, b);
 }
-// CHECK-LABEL: @test_vsubl_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I12_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) {
   return vsubl_high_u8(a, b);
 }
-// CHECK-LABEL: @test_vsubl_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I12_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) {
   return vsubl_high_u16(a, b);
 }
-// CHECK-LABEL: @test_vsubl_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I12_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I12_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) {
   return vsubl_high_u32(a, b);
 }
-// CHECK-LABEL: @test_vsubw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
   return vsubw_s8(a, b);
 }
-// CHECK-LABEL: @test_vsubw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) { return vsubw_s16(a, b); } -// CHECK-LABEL: @test_vsubw_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { return vsubw_s32(a, b); } -// CHECK-LABEL: @test_vsubw_u8( -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { return vsubw_u8(a, b); } -// CHECK-LABEL: @test_vsubw_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { return vsubw_u16(a, b); } -// CHECK-LABEL: @test_vsubw_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], 
[[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { return vsubw_u32(a, b); } -// CHECK-LABEL: @test_vsubw_high_s8( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_high_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) { return vsubw_high_s8(a, b); } -// CHECK-LABEL: @test_vsubw_high_s16( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) { return vsubw_high_s16(a, b); } -// CHECK-LABEL: @test_vsubw_high_s32( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[TMP2]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) { return vsubw_high_s32(a, b); } -// CHECK-LABEL: @test_vsubw_high_u8( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_high_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) { return vsubw_high_u8(a, b); } -// CHECK-LABEL: @test_vsubw_high_u16( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_high_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) { return vsubw_high_u16(a, b); } -// CHECK-LABEL: @test_vsubw_high_u32( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_high_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[TMP2]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) { return vsubw_high_u32(a, b); } -// CHECK-LABEL: @test_vaddhn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b -// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8) -// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[VADDHN2_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8) +// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> 
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]] +// int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) { return vaddhn_s16(a, b); } -// CHECK-LABEL: @test_vaddhn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b -// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16) -// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[VADDHN2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16) +// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]] +// int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) { return vaddhn_s32(a, b); } -// CHECK-LABEL: @test_vaddhn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b -// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32) -// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[VADDHN2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32) +// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]] +// int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) { return vaddhn_s64(a, b); } -// CHECK-LABEL: @test_vaddhn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b -// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8) -// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[VADDHN2_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8) +// CHECK-NEXT: 
[[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]] +// uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) { return vaddhn_u16(a, b); } -// CHECK-LABEL: @test_vaddhn_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b -// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16) -// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[VADDHN2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16) +// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]] +// uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) { return vaddhn_u32(a, b); } -// CHECK-LABEL: @test_vaddhn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b -// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32) -// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[VADDHN2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32) +// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]] +// uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) { return vaddhn_u64(a, b); } -// CHECK-LABEL: @test_vaddhn_high_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b -// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8) -// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_s16( +// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8) +// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VADDHN2_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]] +// int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vaddhn_high_s16(r, a, b); } -// CHECK-LABEL: @test_vaddhn_high_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b -// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16) -// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_s32( +// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16) +// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VADDHN2_I_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]] +// int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vaddhn_high_s32(r, a, b); } -// CHECK-LABEL: @test_vaddhn_high_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b -// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32) -// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vaddhn_high_s64( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32) +// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VADDHN2_I_I]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]] +// int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vaddhn_high_s64(r, a, b); } -// 
CHECK-LABEL: @test_vaddhn_high_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b -// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8) -// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_u16( +// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8) +// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VADDHN2_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]] +// uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vaddhn_high_u16(r, a, b); } -// CHECK-LABEL: @test_vaddhn_high_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b -// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16) -// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_u32( +// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16) +// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VADDHN2_I_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]] +// uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vaddhn_high_u32(r, a, b); } -// CHECK-LABEL: @test_vaddhn_high_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b -// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32) -// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> 
@test_vaddhn_high_u64( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32) +// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VADDHN2_I_I]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]] +// uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vaddhn_high_u64(r, a, b); } -// CHECK-LABEL: @test_vraddhn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) +// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]] +// int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); } -// CHECK-LABEL: @test_vraddhn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRADDHN_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); } -// CHECK-LABEL: @test_vraddhn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) 
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRADDHN_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); } -// CHECK-LABEL: @test_vraddhn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) +// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]] +// uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); } -// CHECK-LABEL: @test_vraddhn_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRADDHN_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); } -// CHECK-LABEL: @test_vraddhn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: 
[[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRADDHN_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); } -// CHECK-LABEL: @test_vraddhn_high_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_s16( +// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]] +// int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vraddhn_high_s16(r, a, b); } -// CHECK-LABEL: @test_vraddhn_high_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_s32( +// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[TMP2]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]] +// int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vraddhn_high_s32(r, a, b); } -// CHECK-LABEL: @test_vraddhn_high_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_s64( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[TMP2]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]] +// int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vraddhn_high_s64(r, a, b); } -// CHECK-LABEL: @test_vraddhn_high_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_u16( +// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]] +// uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vraddhn_high_u16(r, a, b); } -// CHECK-LABEL: @test_vraddhn_high_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: 
[[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_u32( +// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[TMP2]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]] +// uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vraddhn_high_u32(r, a, b); } -// CHECK-LABEL: @test_vraddhn_high_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_u64( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) +// CHECK-NEXT: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[TMP2]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]] +// uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vraddhn_high_u64(r, a, b); } -// CHECK-LABEL: @test_vsubhn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[VSUBHN2_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsubhn_s16( +// CHECK-SAME: <8 x i16> noundef 
[[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]] +// int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { return vsubhn_s16(a, b); } -// CHECK-LABEL: @test_vsubhn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[VSUBHN2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsubhn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]] +// int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { return vsubhn_s32(a, b); } -// CHECK-LABEL: @test_vsubhn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[VSUBHN2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsubhn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]] +// int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { return vsubhn_s64(a, b); } -// CHECK-LABEL: @test_vsubhn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[VSUBHN2_I]] +// CHECK-LABEL: define dso_local <8 
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { return vsubhn_u16(a, b); }
-// CHECK-LABEL: @test_vsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { return vsubhn_u32(a, b); }
-// CHECK-LABEL: @test_vsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) { return vsubhn_u64(a, b); }
-// CHECK-LABEL: @test_vsubhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vsubhn_high_s16(r, a, b); }
-// CHECK-LABEL: @test_vsubhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vsubhn_high_s32(r, a, b); }
-// CHECK-LABEL: @test_vsubhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vsubhn_high_s64(r, a, b); }
-// CHECK-LABEL: @test_vsubhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vsubhn_high_u16(r, a, b); }
-// CHECK-LABEL: @test_vsubhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vsubhn_high_u32(r, a, b); }
-// CHECK-LABEL: @test_vsubhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vsubhn_high_u64(r, a, b); }
-// CHECK-LABEL: @test_vrsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { return vrsubhn_s16(a, b); }
-// CHECK-LABEL: @test_vrsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { return vrsubhn_s32(a, b); }
-// CHECK-LABEL: @test_vrsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { return vrsubhn_s64(a, b); }
-// CHECK-LABEL: @test_vrsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { return vrsubhn_u16(a, b); }
-// CHECK-LABEL: @test_vrsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { return vrsubhn_u32(a, b); }
-// CHECK-LABEL: @test_vrsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { return vrsubhn_u64(a, b); }
-// CHECK-LABEL: @test_vrsubhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsubhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) { return vrsubhn_high_s16(r, a, b); }
-// CHECK-LABEL: @test_vrsubhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsubhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[TMP2]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) { return vrsubhn_high_s32(r, a, b); }
-// CHECK-LABEL: @test_vrsubhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsubhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[TMP2]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) { return vrsubhn_high_s64(r, a, b); }
-// CHECK-LABEL: @test_vrsubhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsubhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) { return vrsubhn_high_u16(r, a, b); }
-// CHECK-LABEL: @test_vrsubhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsubhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[TMP2]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) { return vrsubhn_high_u32(r, a, b); }
-// CHECK-LABEL: @test_vrsubhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsubhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]])
+// CHECK-NEXT: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[TMP2]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) { return vrsubhn_high_u64(r, a, b); }
-// CHECK-LABEL: @test_vabdl_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]]
+//
int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); }
-// CHECK-LABEL: @test_vabdl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]]
+//
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { return vabdl_s16(a, b); }
-// CHECK-LABEL: @test_vabdl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]]
+//
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { return vabdl_s32(a, b); }
-// CHECK-LABEL: @test_vabdl_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]]
+//
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { return vabdl_u8(a, b); }
-// CHECK-LABEL: @test_vabdl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]]
+//
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { return vabdl_u16(a, b); }
-// CHECK-LABEL: @test_vabdl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]]
+//
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { return vabdl_u32(a, b); }
-// CHECK-LABEL: @test_vabal_s8(
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vabal_s8(a, b, c); }
-// CHECK-LABEL: @test_vabal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vabal_s16(a, b, c); }
-// CHECK-LABEL: @test_vabal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vabal_s32(a, b, c); }
-// CHECK-LABEL: @test_vabal_u8(
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vabal_u8(a, b, c); }
-// CHECK-LABEL: @test_vabal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vabal_u16(a, b, c); }
-// CHECK-LABEL: @test_vabal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vabal_u32(a, b, c); }
-// CHECK-LABEL: @test_vabdl_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]]
+//
int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) { return vabdl_high_s8(a, b); }
-// CHECK-LABEL: @test_vabdl_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]]
+//
int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) { return vabdl_high_s16(a, b); }
-// CHECK-LABEL: @test_vabdl_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]]
+//
int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) { return vabdl_high_s32(a, b); }
-// CHECK-LABEL: @test_vabdl_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]]
+//
uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) { return vabdl_high_u8(a, b); }
-// CHECK-LABEL: @test_vabdl_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]]
+//
uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) { return vabdl_high_u16(a, b); }
-// CHECK-LABEL: @test_vabdl_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]]
+//
uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) { return vabdl_high_u32(a, b); }
-// CHECK-LABEL: @test_vabal_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32>
-// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_high_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32>
+// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]]
+//
int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return vabal_high_s8(a, b, c); }
-// CHECK-LABEL: @test_vabal_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]]
+//
int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vabal_high_s16(a, b, c); }
-// CHECK-LABEL: @test_vabal_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]]
+//
int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vabal_high_s32(a, b, c); }
-// CHECK-LABEL: @test_vabal_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32>
-// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32>
+// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]]
+//
uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vabal_high_u8(a, b, c); }
-// CHECK-LABEL: @test_vabal_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
[[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_high_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] +// uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vabal_high_u16(a, b, c); } -// CHECK-LABEL: @test_vabal_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_high_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] +// uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vabal_high_u32(a, b, c); } -// CHECK-LABEL: @test_vmull_s8( -// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i16> [[VMULL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I]] +// int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { return vmull_s8(a, b); } -// CHECK-LABEL: @test_vmull_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i32> [[VMULL2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] +// int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { return vmull_s16(a, b); } -// CHECK-LABEL: @test_vmull_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i64> [[VMULL2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] +// int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { return vmull_s32(a, b); } -// CHECK-LABEL: @test_vmull_u8( -// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i16> [[VMULL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I]] +// uint16x8_t 
test_vmull_u8(uint8x8_t a, uint8x8_t b) { return vmull_u8(a, b); } -// CHECK-LABEL: @test_vmull_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i32> [[VMULL2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] +// uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { return vmull_u16(a, b); } -// CHECK-LABEL: @test_vmull_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i64> [[VMULL2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] +// uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { return vmull_u32(a, b); } -// CHECK-LABEL: @test_vmull_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: ret <8 x i16> [[VMULL_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]] +// int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) { return vmull_high_s8(a, b); } -// CHECK-LABEL: @test_vmull_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> 
[[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: ret <4 x i32> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_high_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] +// int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) { return vmull_high_s16(a, b); } -// CHECK-LABEL: @test_vmull_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: ret <2 x i64> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_high_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] +// int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) { return vmull_high_s32(a, b); } -// CHECK-LABEL: @test_vmull_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: ret <8 x i16> [[VMULL_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]] +// uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) { return vmull_high_u8(a, b); } -// 
CHECK-LABEL: @test_vmull_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: ret <4 x i32> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_high_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] +// uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) { return vmull_high_u16(a, b); } -// CHECK-LABEL: @test_vmull_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: ret <2 x i64> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_high_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] +// uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) { return vmull_high_u32(a, b); } -// CHECK-LABEL: @test_vmlal_s8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x 
i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlal_s8(a, b, c); } -// CHECK-LABEL: @test_vmlal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); } -// CHECK-LABEL: @test_vmlal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); } -// CHECK-LABEL: @test_vmlal_u8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); } -// CHECK-LABEL: @test_vmlal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) -// 
CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); } -// CHECK-LABEL: @test_vmlal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_high_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] +// int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return vmlal_high_s8(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] 
= bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] +// int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vmlal_high_s16(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] +// int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vmlal_high_s32(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I_I]] +// 
CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_high_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] +// uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vmlal_high_u8(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_high_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] +// uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vmlal_high_u16(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_high_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x 
i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] +// uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vmlal_high_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlsl_s8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: 
ret <2 x i64> [[SUB_I]] +// int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlsl_u8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_u16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> 
[[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]] -// CHECK: ret <8 x i16> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_high_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I_I]] +// int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return vmlsl_high_s8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] -// CHECK: ret <4 x i32> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]] +// int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vmlsl_high_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] -// CHECK: ret <2 x i64> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> 
[[C]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]] +// int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vmlsl_high_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]] -// CHECK: ret <8 x i16> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_high_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> [[C]], <8 x i32> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I_I]] +// uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vmlsl_high_u8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] -// CHECK: ret <4 x i32> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_high_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]] +// uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return 
vmlsl_high_u16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] -// CHECK: ret <2 x i64> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_high_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]] +// uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vmlsl_high_u32(a, b, c); } -// CHECK-LABEL: @test_vqdmull_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQDMULL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmull_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) +// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) { return vqdmull_s16(a, b); } -// CHECK-LABEL: @test_vqdmull_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQDMULL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmull_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) +// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { return vqdmull_s32(a, b); } -// CHECK-LABEL: @test_vqdmlal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) -// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] +// int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) -// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// 
CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] +// int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmlsl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) -// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlsl_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] +// int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlsl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) -// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlsl_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] +// int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmull_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector 
<8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQDMULL_V2_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmull_high_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]]) +// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) { return vqdmull_high_s16(a, b); } -// CHECK-LABEL: @test_vqdmull_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQDMULL_V2_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmull_high_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]]) +// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) { return vqdmull_high_s32(a, b); } -// CHECK-LABEL: @test_vqdmlal_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x 
i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]]) -// CHECK: ret <4 x i32> [[VQDMLAL_V3_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlal_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]] +// int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlal_high_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlal_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]]) -// CHECK: ret <2 x i64> [[VQDMLAL_V3_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlal_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast 
<16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]] +// int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlal_high_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmlsl_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]]) -// CHECK: ret <4 x i32> [[VQDMLSL_V3_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlsl_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> [[C]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]] +// int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlsl_high_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlsl_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]]) -// CHECK: ret <2 x i64> [[VQDMLSL_V3_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlsl_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> [[C]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> 
[[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I5_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]] +// int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlsl_high_s32(a, b, c); } -// CHECK-LABEL: @test_vmull_p8( -// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i16> [[VMULL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I]] +// poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); } -// CHECK-LABEL: @test_vmull_high_p8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: ret <8 x i16> [[VMULL_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I5]], <8 x i8> [[SHUFFLE_I]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]] +// poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) { return vmull_high_p8(a, b); } -// CHECK-LABEL: @test_vaddd_s64( -// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b -// CHECK: ret i64 [[VADDD_I]] +// CHECK-LABEL: define dso_local i64 @test_vaddd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDD_I:%.*]] = add i64 [[A]], [[B]] +// CHECK-NEXT: ret i64 [[VADDD_I]] +// int64_t test_vaddd_s64(int64_t a, int64_t b) { return vaddd_s64(a, b); } -// CHECK-LABEL: @test_vaddd_u64( -// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b -// CHECK: ret i64 [[VADDD_I]] +// CHECK-LABEL: define dso_local i64 @test_vaddd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDD_I:%.*]] = add i64 [[A]], [[B]] +// CHECK-NEXT: ret i64 [[VADDD_I]] +// uint64_t test_vaddd_u64(uint64_t a, uint64_t b) { return vaddd_u64(a, b); } -// CHECK-LABEL: @test_vsubd_s64( -// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b -// CHECK: ret i64 [[VSUBD_I]] +// CHECK-LABEL: define dso_local i64 @test_vsubd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef 
[[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSUBD_I:%.*]] = sub i64 [[A]], [[B]] +// CHECK-NEXT: ret i64 [[VSUBD_I]] +// int64_t test_vsubd_s64(int64_t a, int64_t b) { return vsubd_s64(a, b); } -// CHECK-LABEL: @test_vsubd_u64( -// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b -// CHECK: ret i64 [[VSUBD_I]] +// CHECK-LABEL: define dso_local i64 @test_vsubd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSUBD_I:%.*]] = sub i64 [[A]], [[B]] +// CHECK-NEXT: ret i64 [[VSUBD_I]] +// uint64_t test_vsubd_u64(uint64_t a, uint64_t b) { return vsubd_u64(a, b); } -// CHECK-LABEL: @test_vqaddb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vqaddb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// int8_t test_vqaddb_s8(int8_t a, int8_t b) { return vqaddb_s8(a, b); } -// CHECK-LABEL: @test_vqaddh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqaddh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vqaddh_s16(int16_t a, int16_t b) { return vqaddh_s16(a, b); } -// CHECK-LABEL: @test_vqadds_s32( -// CHECK: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQADDS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqadds_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQADDS_S32_I]] +// int32_t test_vqadds_s32(int32_t a, int32_t b) { return vqadds_s32(a, b); } -// CHECK-LABEL: @test_vqaddd_s64( -// CHECK: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQADDD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqaddd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQADDD_S64_I]] +// int64_t test_vqaddd_s64(int64_t a, int64_t b) { return vqaddd_s64(a, b); } -// CHECK-LABEL: @test_vqaddb_u8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vqaddb_u8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) { return vqaddb_u8(a, b); } -// CHECK-LABEL: @test_vqaddh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqaddh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) { return vqaddh_u16(a, b); } -// CHECK-LABEL: @test_vqadds_u32( -// CHECK: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQADDS_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqadds_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQADDS_U32_I]] +// uint32_t test_vqadds_u32(uint32_t a, uint32_t b) { return vqadds_u32(a, b); } -// CHECK-LABEL: @test_vqaddd_u64( -// CHECK: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQADDD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqaddd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQADDD_U64_I]] +// uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) { return vqaddd_u64(a, b); } -// CHECK-LABEL: @test_vqsubb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: 
[[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vqsubb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// int8_t test_vqsubb_s8(int8_t a, int8_t b) { return vqsubb_s8(a, b); } -// CHECK-LABEL: @test_vqsubh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqsubh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vqsubh_s16(int16_t a, int16_t b) { return vqsubh_s16(a, b); } -// CHECK-LABEL: @test_vqsubs_s32( -// CHECK: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQSUBS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqsubs_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]] +// int32_t test_vqsubs_s32(int32_t a, int32_t b) { return vqsubs_s32(a, b); } -// CHECK-LABEL: @test_vqsubd_s64( -// CHECK: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQSUBD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqsubd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQSUBD_S64_I]] +// int64_t test_vqsubd_s64(int64_t a, int64_t b) { return vqsubd_s64(a, b); } -// CHECK-LABEL: @test_vqsubb_u8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vqsubb_u8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 
[[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) { return vqsubb_u8(a, b); } -// CHECK-LABEL: @test_vqsubh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqsubh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) { return vqsubh_u16(a, b); } -// CHECK-LABEL: @test_vqsubs_u32( -// CHECK: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQSUBS_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqsubs_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQSUBS_U32_I]] +// uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) { return vqsubs_u32(a, b); } -// CHECK-LABEL: @test_vqsubd_u64( -// CHECK: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQSUBD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqsubd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQSUBD_U64_I]] +// uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) { return vqsubd_u64(a, b); } -// CHECK-LABEL: @test_vshld_s64( -// CHECK: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VSHLD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vshld_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VSHLD_S64_I]] +// int64_t test_vshld_s64(int64_t a, int64_t b) { return vshld_s64(a, b); } -// CHECK-LABEL: @test_vshld_u64( -// CHECK: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VSHLD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vshld_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VSHLD_U64_I]] +// uint64_t test_vshld_u64(uint64_t a, int64_t 
b) { return vshld_u64(a, b); } -// CHECK-LABEL: @test_vqshlb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vqshlb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// int8_t test_vqshlb_s8(int8_t a, int8_t b) { return vqshlb_s8(a, b); } -// CHECK-LABEL: @test_vqshlh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqshlh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vqshlh_s16(int16_t a, int16_t b) { return vqshlh_s16(a, b); } -// CHECK-LABEL: @test_vqshls_s32( -// CHECK: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQSHLS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqshls_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQSHLS_S32_I]] +// int32_t test_vqshls_s32(int32_t a, int32_t b) { return vqshls_s32(a, b); } -// CHECK-LABEL: @test_vqshld_s64( -// CHECK: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQSHLD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqshld_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQSHLD_S64_I]] +// int64_t test_vqshld_s64(int64_t a, int64_t b) { return vqshld_s64(a, b); } -// CHECK-LABEL: @test_vqshlb_u8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// 
CHECK-LABEL: define dso_local i8 @test_vqshlb_u8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// uint8_t test_vqshlb_u8(uint8_t a, int8_t b) { return vqshlb_u8(a, b); } -// CHECK-LABEL: @test_vqshlh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqshlh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// uint16_t test_vqshlh_u16(uint16_t a, int16_t b) { return vqshlh_u16(a, b); } -// CHECK-LABEL: @test_vqshls_u32( -// CHECK: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQSHLS_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqshls_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQSHLS_U32_I]] +// uint32_t test_vqshls_u32(uint32_t a, int32_t b) { return vqshls_u32(a, b); } -// CHECK-LABEL: @test_vqshld_u64( -// CHECK: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQSHLD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqshld_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQSHLD_U64_I]] +// uint64_t test_vqshld_u64(uint64_t a, int64_t b) { return vqshld_u64(a, b); } -// CHECK-LABEL: @test_vrshld_s64( -// CHECK: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VRSHLD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vrshld_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VRSHLD_S64_I]] +// int64_t test_vrshld_s64(int64_t a, int64_t b) { return vrshld_s64(a, b); } -// CHECK-LABEL: @test_vrshld_u64( -// CHECK: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VRSHLD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vrshld_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VRSHLD_U64_I]] +// uint64_t test_vrshld_u64(uint64_t a, int64_t b) { return vrshld_u64(a, b); } -// CHECK-LABEL: @test_vqrshlb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vqrshlb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// int8_t test_vqrshlb_s8(int8_t a, int8_t b) { return vqrshlb_s8(a, b); } -// CHECK-LABEL: @test_vqrshlh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqrshlh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vqrshlh_s16(int16_t a, int16_t b) { return vqrshlh_s16(a, b); } -// CHECK-LABEL: @test_vqrshls_s32( -// CHECK: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQRSHLS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqrshls_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQRSHLS_S32_I]] +// int32_t test_vqrshls_s32(int32_t a, int32_t b) { return vqrshls_s32(a, b); } -// CHECK-LABEL: @test_vqrshld_s64( -// CHECK: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQRSHLD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqrshld_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQRSHLD_S64_I]] +// int64_t test_vqrshld_s64(int64_t a, int64_t b) { return vqrshld_s64(a, b); } -// CHECK-LABEL: @test_vqrshlb_u8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: 
[[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vqrshlb_u8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// uint8_t test_vqrshlb_u8(uint8_t a, int8_t b) { return vqrshlb_u8(a, b); } -// CHECK-LABEL: @test_vqrshlh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqrshlh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// uint16_t test_vqrshlh_u16(uint16_t a, int16_t b) { return vqrshlh_u16(a, b); } -// CHECK-LABEL: @test_vqrshls_u32( -// CHECK: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQRSHLS_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqrshls_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQRSHLS_U32_I]] +// uint32_t test_vqrshls_u32(uint32_t a, int32_t b) { return vqrshls_u32(a, b); } -// CHECK-LABEL: @test_vqrshld_u64( -// CHECK: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VQRSHLD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqrshld_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VQRSHLD_U64_I]] +// uint64_t test_vqrshld_u64(uint64_t a, int64_t b) { return vqrshld_u64(a, b); } -// CHECK-LABEL: @test_vpaddd_s64( -// CHECK: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) -// CHECK: ret i64 [[VPADDD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vpaddd_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: ret i64 [[VPADDD_S64_I]] +// int64_t test_vpaddd_s64(int64x2_t a) { return 
vpaddd_s64(a); } -// CHECK-LABEL: @test_vpadds_f32( -// CHECK: [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0 -// CHECK: [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1 -// CHECK: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]] -// CHECK: ret float [[VPADDD_I]] +// CHECK-LABEL: define dso_local float @test_vpadds_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0 +// CHECK-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1 +// CHECK-NEXT: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]] +// CHECK-NEXT: ret float [[VPADDD_I]] +// float32_t test_vpadds_f32(float32x2_t a) { return vpadds_f32(a); } -// CHECK-LABEL: @test_vpaddd_f64( -// CHECK: [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0 -// CHECK: [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1 -// CHECK: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]] -// CHECK: ret double [[VPADDD_I]] +// CHECK-LABEL: define dso_local double @test_vpaddd_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0 +// CHECK-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1 +// CHECK-NEXT: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]] +// CHECK-NEXT: ret double [[VPADDD_I]] +// float64_t test_vpaddd_f64(float64x2_t a) { return vpaddd_f64(a); } -// CHECK-LABEL: @test_vpmaxnms_f32( -// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VPMAXNMS_F32_I]] +// CHECK-LABEL: define dso_local float @test_vpmaxnms_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VPMAXNMS_F32_I]] +// float32_t test_vpmaxnms_f32(float32x2_t a) { return vpmaxnms_f32(a); } -// CHECK-LABEL: @test_vpmaxnmqd_f64( -// CHECK: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VPMAXNMQD_F64_I]] +// CHECK-LABEL: define dso_local double @test_vpmaxnmqd_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VPMAXNMQD_F64_I]] +// float64_t test_vpmaxnmqd_f64(float64x2_t a) { return vpmaxnmqd_f64(a); } -// CHECK-LABEL: @test_vpmaxs_f32( -// CHECK: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VPMAXS_F32_I]] +// CHECK-LABEL: define dso_local float @test_vpmaxs_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VPMAXS_F32_I]] +// float32_t test_vpmaxs_f32(float32x2_t a) { return vpmaxs_f32(a); } -// CHECK-LABEL: @test_vpmaxqd_f64( -// CHECK: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VPMAXQD_F64_I]] +// CHECK-LABEL: define dso_local double @test_vpmaxqd_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAXQD_F64_I:%.*]] = call double 
@llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VPMAXQD_F64_I]] +// float64_t test_vpmaxqd_f64(float64x2_t a) { return vpmaxqd_f64(a); } -// CHECK-LABEL: @test_vpminnms_f32( -// CHECK: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VPMINNMS_F32_I]] +// CHECK-LABEL: define dso_local float @test_vpminnms_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VPMINNMS_F32_I]] +// float32_t test_vpminnms_f32(float32x2_t a) { return vpminnms_f32(a); } -// CHECK-LABEL: @test_vpminnmqd_f64( -// CHECK: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VPMINNMQD_F64_I]] +// CHECK-LABEL: define dso_local double @test_vpminnmqd_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VPMINNMQD_F64_I]] +// float64_t test_vpminnmqd_f64(float64x2_t a) { return vpminnmqd_f64(a); } -// CHECK-LABEL: @test_vpmins_f32( -// CHECK: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VPMINS_F32_I]] +// CHECK-LABEL: define dso_local float @test_vpmins_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VPMINS_F32_I]] +// float32_t test_vpmins_f32(float32x2_t a) { return vpmins_f32(a); } -// CHECK-LABEL: @test_vpminqd_f64( -// CHECK: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VPMINQD_F64_I]] +// CHECK-LABEL: define dso_local double @test_vpminqd_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VPMINQD_F64_I]] +// float64_t test_vpminqd_f64(float64x2_t a) { return vpminqd_f64(a); } -// CHECK-LABEL: @test_vqdmulhh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vqdmulhh_s16(int16_t a, int16_t b) { return vqdmulhh_s16(a, b); } -// CHECK-LABEL: @test_vqdmulhs_s32( -// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 
@llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQDMULHS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]] +// int32_t test_vqdmulhs_s32(int32_t a, int32_t b) { return vqdmulhs_s32(a, b); } -// CHECK-LABEL: @test_vqrdmulhh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) { return vqrdmulhh_s16(a, b); } -// CHECK-LABEL: @test_vqrdmulhs_s32( -// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VQRDMULHS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]] +// int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) { return vqrdmulhs_s32(a, b); } -// CHECK-LABEL: @test_vmulxs_f32( -// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) -// CHECK: ret float [[VMULXS_F32_I]] +// CHECK-LABEL: define dso_local float @test_vmulxs_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[B]]) +// CHECK-NEXT: ret float [[VMULXS_F32_I]] +// float32_t test_vmulxs_f32(float32_t a, float32_t b) { return vmulxs_f32(a, b); } -// CHECK-LABEL: @test_vmulxd_f64( -// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) -// CHECK: ret double [[VMULXD_F64_I]] +// CHECK-LABEL: define dso_local double @test_vmulxd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[B]]) +// CHECK-NEXT: ret double [[VMULXD_F64_I]] +// float64_t test_vmulxd_f64(float64_t a, float64_t b) { return vmulxd_f64(a, b); } -// CHECK-LABEL: @test_vmulx_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x 
double> [[VMULX2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[VMULX_I]], <1 x double> [[VMULX1_I]]) +// CHECK-NEXT: ret <1 x double> [[VMULX2_I]] +// float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) { return vmulx_f64(a, b); } -// CHECK-LABEL: @test_vrecpss_f32( -// CHECK: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b) -// CHECK: ret float [[VRECPS_I]] +// CHECK-LABEL: define dso_local float @test_vrecpss_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float [[A]], float [[B]]) +// CHECK-NEXT: ret float [[VRECPS_I]] +// float32_t test_vrecpss_f32(float32_t a, float32_t b) { return vrecpss_f32(a, b); } -// CHECK-LABEL: @test_vrecpsd_f64( -// CHECK: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b) -// CHECK: ret double [[VRECPS_I]] +// CHECK-LABEL: define dso_local double @test_vrecpsd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double [[A]], double [[B]]) +// CHECK-NEXT: ret double [[VRECPS_I]] +// float64_t test_vrecpsd_f64(float64_t a, float64_t b) { return vrecpsd_f64(a, b); } -// CHECK-LABEL: @test_vrsqrtss_f32( -// CHECK: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) -// CHECK: ret float [[VRSQRTSS_F32_I]] +// CHECK-LABEL: define dso_local float @test_vrsqrtss_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float [[A]], float [[B]]) +// CHECK-NEXT: ret float [[VRSQRTSS_F32_I]] +// float32_t test_vrsqrtss_f32(float32_t a, float32_t b) { return vrsqrtss_f32(a, b); } -// CHECK-LABEL: @test_vrsqrtsd_f64( -// CHECK: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) -// CHECK: ret double [[VRSQRTSD_F64_I]] +// CHECK-LABEL: define dso_local double @test_vrsqrtsd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double [[A]], double [[B]]) +// CHECK-NEXT: ret double [[VRSQRTSD_F64_I]] +// float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) { return vrsqrtsd_f64(a, b); } -// CHECK-LABEL: @test_vcvts_f32_s32( -// CHECK: [[TMP0:%.*]] = 
sitofp i32 %a to float
-// CHECK: ret float [[TMP0]]
+// CHECK-LABEL: define dso_local float @test_vcvts_f32_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sitofp i32 [[A]] to float
+// CHECK-NEXT: ret float [[TMP0]]
+//
 float32_t test_vcvts_f32_s32(int32_t a) { return vcvts_f32_s32(a); }
-// CHECK-LABEL: @test_vcvtd_f64_s64(
-// CHECK: [[TMP0:%.*]] = sitofp i64 %a to double
-// CHECK: ret double [[TMP0]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sitofp i64 [[A]] to double
+// CHECK-NEXT: ret double [[TMP0]]
+//
 float64_t test_vcvtd_f64_s64(int64_t a) { return vcvtd_f64_s64(a); }
-// CHECK-LABEL: @test_vcvts_f32_u32(
-// CHECK: [[TMP0:%.*]] = uitofp i32 %a to float
-// CHECK: ret float [[TMP0]]
+// CHECK-LABEL: define dso_local float @test_vcvts_f32_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = uitofp i32 [[A]] to float
+// CHECK-NEXT: ret float [[TMP0]]
+//
 float32_t test_vcvts_f32_u32(uint32_t a) { return vcvts_f32_u32(a); }
-// CHECK-LABEL: @test_vcvtd_f64_u64(
-// CHECK: [[TMP0:%.*]] = uitofp i64 %a to double
-// CHECK: ret double [[TMP0]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = uitofp i64 [[A]] to double
+// CHECK-NEXT: ret double [[TMP0]]
+//
 float64_t test_vcvtd_f64_u64(uint64_t a) { return vcvtd_f64_u64(a); }
-// CHECK-LABEL: @test_vrecpes_f32(
-// CHECK: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a)
-// CHECK: ret float [[VRECPES_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpes_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRECPES_F32_I]]
+//
 float32_t test_vrecpes_f32(float32_t a) { return vrecpes_f32(a); }
-// CHECK-LABEL: @test_vrecped_f64(
-// CHECK: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a)
-// CHECK: ret double [[VRECPED_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrecped_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRECPED_F64_I]]
+//
 float64_t test_vrecped_f64(float64_t a) { return vrecped_f64(a); }
-// CHECK-LABEL: @test_vrecpxs_f32(
-// CHECK: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a)
-// CHECK: ret float [[VRECPXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpxs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRECPXS_F32_I]]
+//
 float32_t test_vrecpxs_f32(float32_t a) { return vrecpxs_f32(a); }
-// CHECK-LABEL: @test_vrecpxd_f64(
-// CHECK: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a)
-// CHECK: ret double [[VRECPXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrecpxd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRECPXD_F64_I]]
+//
 float64_t test_vrecpxd_f64(float64_t a) { return vrecpxd_f64(a); }
-// CHECK-LABEL: @test_vrsqrte_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsqrte_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]])
+// CHECK-NEXT: ret <2 x i32> [[VRSQRTE_V1_I]]
+//
 uint32x2_t test_vrsqrte_u32(uint32x2_t a) { return vrsqrte_u32(a); }
-// CHECK-LABEL: @test_vrsqrteq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsqrteq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]])
+// CHECK-NEXT: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+//
 uint32x4_t test_vrsqrteq_u32(uint32x4_t a) { return vrsqrteq_u32(a); }
-// CHECK-LABEL: @test_vrsqrtes_f32(
-// CHECK: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a)
-// CHECK: ret float [[VRSQRTES_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrsqrtes_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRSQRTES_F32_I]]
+//
 float32_t test_vrsqrtes_f32(float32_t a) { return vrsqrtes_f32(a); }
-// CHECK-LABEL: @test_vrsqrted_f64(
-// CHECK: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a)
-// CHECK: ret double [[VRSQRTED_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrsqrted_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRSQRTED_F64_I]]
+//
 float64_t test_vrsqrted_f64(float64_t a) { return vrsqrted_f64(a); }
-// CHECK-LABEL: @test_vld1q_u8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vld1q_u8(uint8_t const *a) { return vld1q_u8(a); }
-// CHECK-LABEL: @test_vld1q_u16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vld1q_u16(uint16_t const *a) { return vld1q_u16(a); }
-// CHECK-LABEL: @test_vld1q_u32(
-// CHECK: [[TMP2:%.*]] = load <4 x i32>, ptr %a, align 4
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vld1q_u32(uint32_t const *a) { return vld1q_u32(a); }
-// CHECK-LABEL: @test_vld1q_u64(
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %a, align 8
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vld1q_u64(uint64_t const *a) { return vld1q_u64(a); }
-// CHECK-LABEL: @test_vld1q_s8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vld1q_s8(int8_t const *a) { return vld1q_s8(a); }
-// CHECK-LABEL: @test_vld1q_s16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vld1q_s16(int16_t const *a) { return vld1q_s16(a); }
-// CHECK-LABEL: @test_vld1q_s32(
-// CHECK: [[TMP2:%.*]] = load <4 x i32>, ptr %a, align 4
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vld1q_s32(int32_t const *a) { return vld1q_s32(a); }
-// CHECK-LABEL: @test_vld1q_s64(
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %a, align 8
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vld1q_s64(int64_t const *a) { return vld1q_s64(a); }
-// CHECK-LABEL: @test_vld1q_f16(
-// CHECK: [[TMP2:%.*]] = load <8 x half>, ptr %a, align 2
-// CHECK: ret <8 x half> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vld1q_f16(float16_t const *a) { return vld1q_f16(a); }
-// CHECK-LABEL: @test_vld1q_f32(
-// CHECK: [[TMP2:%.*]] = load <4 x float>, ptr %a, align 4
-// CHECK: ret <4 x float> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vld1q_f32(float32_t const *a) { return vld1q_f32(a); }
-// CHECK-LABEL: @test_vld1q_f64(
-// CHECK: [[TMP2:%.*]] = load <2 x double>, ptr %a, align 8
-// CHECK: ret <2 x double> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vld1q_f64(float64_t const *a) { return vld1q_f64(a); }
-// CHECK-LABEL: @test_vld1q_p8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vld1q_p8(poly8_t const *a) { return vld1q_p8(a); }
-// CHECK-LABEL: @test_vld1q_p16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vld1q_p16(poly16_t const *a) { return vld1q_p16(a); }
-// CHECK-LABEL: @test_vld1_u8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vld1_u8(uint8_t const *a) { return vld1_u8(a); }
-// CHECK-LABEL: @test_vld1_u16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vld1_u16(uint16_t const *a) { return vld1_u16(a); }
-// CHECK-LABEL: @test_vld1_u32(
-// CHECK: [[TMP2:%.*]] = load <2 x i32>, ptr %a, align 4
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vld1_u32(uint32_t const *a) { return vld1_u32(a); }
-// CHECK-LABEL: @test_vld1_u64(
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %a, align 8
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vld1_u64(uint64_t const *a) { return vld1_u64(a); }
-// CHECK-LABEL: @test_vld1_s8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vld1_s8(int8_t const *a) { return vld1_s8(a); }
-// CHECK-LABEL: @test_vld1_s16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vld1_s16(int16_t const *a) { return vld1_s16(a); }
-// CHECK-LABEL: @test_vld1_s32(
-// CHECK: [[TMP2:%.*]] = load <2 x i32>, ptr %a, align 4
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vld1_s32(int32_t const *a) { return vld1_s32(a); }
-// CHECK-LABEL: @test_vld1_s64(
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %a, align 8
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vld1_s64(int64_t const *a) { return vld1_s64(a); }
-// CHECK-LABEL: @test_vld1_f16(
-// CHECK: [[TMP2:%.*]] = load <4 x half>, ptr %a, align 2
-// CHECK: ret <4 x half> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vld1_f16(float16_t const *a) { return vld1_f16(a); }
-// CHECK-LABEL: @test_vld1_f32(
-// CHECK: [[TMP2:%.*]] = load <2 x float>, ptr %a, align 4
-// CHECK: ret <2 x float> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vld1_f32(float32_t const *a) { return vld1_f32(a); }
-// CHECK-LABEL: @test_vld1_f64(
-// CHECK: [[TMP2:%.*]] = load <1 x double>, ptr %a, align 8
-// CHECK: ret <1 x double> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
 float64x1_t test_vld1_f64(float64_t const *a) { return vld1_f64(a); }
-// CHECK-LABEL: @test_vld1_p8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vld1_p8(poly8_t const *a) { return vld1_p8(a); }
-// CHECK-LABEL: @test_vld1_p16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vld1_p16(poly16_t const *a) { return vld1_p16(a); }
-// CHECK-LABEL: @test_vld1_u8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_u8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vld1_u8_void(void *a) { return vld1_u8(a); }
-// CHECK-LABEL: @test_vld1_u16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_u16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vld1_u16_void(void *a) { return vld1_u16(a); }
-// CHECK-LABEL: @test_vld1_u32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x i32>, ptr %a, align 1
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_u32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vld1_u32_void(void *a) { return vld1_u32(a); }
-// CHECK-LABEL: @test_vld1_u64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x i64>, ptr %a, align 1
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_u64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vld1_u64_void(void *a) { return vld1_u64(a); }
-// CHECK-LABEL: @test_vld1_s8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_s8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vld1_s8_void(void *a) { return vld1_s8(a); }
-// CHECK-LABEL: @test_vld1_s16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_s16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vld1_s16_void(void *a) { return vld1_s16(a); }
-// CHECK-LABEL: @test_vld1_s32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x i32>, ptr %a, align 1
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_s32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vld1_s32_void(void *a) { return vld1_s32(a); }
-// CHECK-LABEL: @test_vld1_s64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x i64>, ptr %a, align 1
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_s64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vld1_s64_void(void *a) { return vld1_s64(a); }
-// CHECK-LABEL: @test_vld1_f16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x half>, ptr %a, align 1
-// CHECK: ret <4 x half> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_f16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vld1_f16_void(void *a) { return vld1_f16(a); }
-// CHECK-LABEL: @test_vld1_f32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x float>, ptr %a, align 1
-// CHECK: ret <2 x float> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_f32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vld1_f32_void(void *a) { return vld1_f32(a); }
-// CHECK-LABEL: @test_vld1_f64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x double>, ptr %a, align 1
-// CHECK: ret <1 x double> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_f64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
 float64x1_t test_vld1_f64_void(void *a) { return vld1_f64(a); }
-// CHECK-LABEL: @test_vld1_p8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_p8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vld1_p8_void(void *a) { return vld1_p8(a); }
-// CHECK-LABEL: @test_vld1_p16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_p16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vld1_p16_void(void *a) { return vld1_p16(a); }
-// CHECK-LABEL: @test_vld2q_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, <16 x i8> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint8x16x2_t test_vld2q_u8(uint8_t const *a) { return vld2q_u8(a); }
-// CHECK-LABEL: @test_vld2q_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, <8 x i16> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint16x8x2_t test_vld2q_u16(uint16_t const *a) { return vld2q_u16(a); }
-// CHECK-LABEL: @test_vld2q_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, <4 x i32> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint32x4x2_t test_vld2q_u32(uint32_t const *a) { return vld2q_u32(a); }
-// CHECK-LABEL: @test_vld2q_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint64x2x2_t test_vld2q_u64(uint64_t const *a) { return vld2q_u64(a); }
-// CHECK-LABEL: @test_vld2q_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, <16 x i8> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int8x16x2_t test_vld2q_s8(int8_t const *a) { return vld2q_s8(a); }
-// CHECK-LABEL: @test_vld2q_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, <8 x i16> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int16x8x2_t test_vld2q_s16(int16_t const *a) { return vld2q_s16(a); }
-// CHECK-LABEL: @test_vld2q_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, <4 x i32> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x4x2_t test_vld2q_s32(int32_t const *a) { return vld2q_s32(a); }
-// CHECK-LABEL: @test_vld2q_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int64x2x2_t test_vld2q_s64(int64_t const *a) { return vld2q_s64(a); }
-// CHECK-LABEL: @test_vld2q_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr %a)
-// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, <8 x half> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float16x8x2_t test_vld2q_f16(float16_t const *a) { return vld2q_f16(a); }
-// CHECK-LABEL: @test_vld2q_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %a)
-// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float32x4x2_t test_vld2q_f32(float32_t const *a) { return vld2q_f32(a); }
-// CHECK-LABEL: @test_vld2q_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T:%.*]] poison, <2 x double> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float64x2x2_t test_vld2q_f64(float64_t const *a) { return vld2q_f64(a); }
-// CHECK-LABEL: @test_vld2q_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, <16 x i8> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly8x16x2_t test_vld2q_p8(poly8_t const *a) { return vld2q_p8(a); }
-// CHECK-LABEL: @test_vld2q_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, <8 x i16> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x8x2_t test_vld2q_p16(poly16_t const *a) { return vld2q_p16(a); }
-// CHECK-LABEL: @test_vld2_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, <8 x i8> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint8x8x2_t test_vld2_u8(uint8_t const *a) { return vld2_u8(a); }
-// CHECK-LABEL: @test_vld2_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, <4 x i16> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint16x4x2_t test_vld2_u16(uint16_t const *a) { return vld2_u16(a); }
-// CHECK-LABEL: @test_vld2_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, <2 x i32> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint32x2x2_t test_vld2_u32(uint32_t const *a) { return vld2_u32(a); }
-// CHECK-LABEL: @test_vld2_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T:%.*]] poison, <1 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint64x1x2_t test_vld2_u64(uint64_t const *a) { return vld2_u64(a); }
-// CHECK-LABEL: @test_vld2_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, <8 x i8> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int8x8x2_t test_vld2_s8(int8_t const *a) { return vld2_s8(a); }
-// CHECK-LABEL: @test_vld2_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, <4 x i16> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int16x4x2_t test_vld2_s16(int16_t const *a) { return vld2_s16(a); }
-// CHECK-LABEL: @test_vld2_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, <2 x i32> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x2x2_t test_vld2_s32(int32_t const *a) { return vld2_s32(a); }
-// CHECK-LABEL: @test_vld2_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X2_T:%.*]] poison, <1 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT64X1X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int64x1x2_t test_vld2_s64(int64_t const *a) { return vld2_s64(a); }
-// CHECK-LABEL: @test_vld2_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr %a)
-// CHECK: store { <4 x half>, <4 x half> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, <4 x half> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float16x4x2_t test_vld2_f16(float16_t const *a) { return vld2_f16(a); }
-// CHECK-LABEL: @test_vld2_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr %a)
-// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float32x2x2_t test_vld2_f32(float32_t const *a) { return vld2_f32(a); }
-// CHECK-LABEL: @test_vld2_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float64x1x2_t test_vld2_f64(float64_t const *a) { return vld2_f64(a); }
-// CHECK-LABEL: @test_vld2_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, <8 x i8> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly8x8x2_t test_vld2_p8(poly8_t const *a) { return vld2_p8(a); }
-// CHECK-LABEL: @test_vld2_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0
+// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, <4 x i16> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x4x2_t test_vld2_p16(poly16_t const *a) { return vld2_p16(a); }
-// CHECK-LABEL: @test_vld3q_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T:%.*]] poison, <16 x i8> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint8x16x3_t test_vld3q_u8(uint8_t const *a) { return vld3q_u8(a); }
-// CHECK-LABEL: @test_vld3q_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T:%.*]] poison, <8 x i16> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint16x8x3_t test_vld3q_u16(uint16_t const *a) { return vld3q_u16(a); }
-// CHECK-LABEL: @test_vld3q_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T:%.*]] poison, <4 x i32> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint32x4x3_t test_vld3q_u32(uint32_t const *a) { return vld3q_u32(a); }
-// CHECK-LABEL: @test_vld3q_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint64x2x3_t test_vld3q_u64(uint64_t const *a) { return vld3q_u64(a); }
-// CHECK-LABEL: @test_vld3q_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X3_T:%.*]] poison, <16 x i8> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT8X16X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 int8x16x3_t test_vld3q_s8(int8_t const *a) { return vld3q_s8(a); }
-// CHECK-LABEL: @test_vld3q_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X3_T:%.*]] poison, <8 x i16> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 int16x8x3_t test_vld3q_s16(int16_t const *a) { return vld3q_s16(a); }
-// CHECK-LABEL: @test_vld3q_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X3_T:%.*]] poison, <4 x i32> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT32X4X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 int32x4x3_t test_vld3q_s32(int32_t const *a) { return vld3q_s32(a); }
-// CHECK-LABEL: @test_vld3q_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 int64x2x3_t test_vld3q_s64(int64_t const *a) { return vld3q_s64(a); }
-// CHECK-LABEL: @test_vld3q_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
-//
CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr %a) -// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float16x8x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T:%.*]] poison, <8 x half> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x half> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]] +// float16x8x3_t test_vld3q_f16(float16_t const *a) { return vld3q_f16(a); } -// CHECK-LABEL: @test_vld3q_f32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr %a) -// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float32x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T:%.*]] poison, <4 x float> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x float> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X3_T]] [[DOTFCA_0_2_INSERT]] +// float32x4x3_t test_vld3q_f32(float32_t const *a) { return vld3q_f32(a); } -// CHECK-LABEL: @test_vld3q_f64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: 
[[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T:%.*]] poison, <2 x double> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_2_INSERT]] +// float64x2x3_t test_vld3q_f64(float64_t const *a) { return vld3q_f64(a); } -// CHECK-LABEL: @test_vld3q_p8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly8x16x3_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T:%.*]] poison, <16 x i8> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY8X16X3_T]] [[DOTFCA_0_2_INSERT]] +// poly8x16x3_t test_vld3q_p8(poly8_t const *a) { return vld3q_p8(a); } -// CHECK-LABEL: @test_vld3q_p16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca 
%struct.poly16x8x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly16x8x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T:%.*]] poison, <8 x i16> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY16X8X3_T]] [[DOTFCA_0_2_INSERT]] +// poly16x8x3_t test_vld3q_p16(poly16_t const *a) { return vld3q_p16(a); } -// CHECK-LABEL: @test_vld3_u8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x3_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T:%.*]] poison, <8 x i8> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X3_T]] [[DOTFCA_0_2_INSERT]] +// uint8x8x3_t test_vld3_u8(uint8_t const *a) { return vld3_u8(a); } -// CHECK-LABEL: @test_vld3_u16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a) -// CHECK: 
store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T:%.*]] poison, <4 x i16> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X3_T]] [[DOTFCA_0_2_INSERT]] +// uint16x4x3_t test_vld3_u16(uint16_t const *a) { return vld3_u16(a); } -// CHECK-LABEL: @test_vld3_u32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr %a) -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T:%.*]] poison, <2 x i32> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_UINT32X2X3_T]] [[DOTFCA_0_2_INSERT]] +// uint32x2x3_t test_vld3_u32(uint32_t const *a) { return vld3_u32(a); } -// CHECK-LABEL: @test_vld3_u64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr 
align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T:%.*]] poison, <1 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_UINT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// uint64x1x3_t test_vld3_u64(uint64_t const *a) { return vld3_u64(a); } -// CHECK-LABEL: @test_vld3_s8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x3_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X3_T:%.*]] poison, <8 x i8> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT8X8X3_T]] [[DOTFCA_0_2_INSERT]] +// int8x8x3_t test_vld3_s8(int8_t const *a) { return vld3_s8(a); } -// CHECK-LABEL: @test_vld3_s16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret 
%struct.int16x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X3_T:%.*]] poison, <4 x i16> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT16X4X3_T]] [[DOTFCA_0_2_INSERT]] +// int16x4x3_t test_vld3_s16(int16_t const *a) { return vld3_s16(a); } -// CHECK-LABEL: @test_vld3_s32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr %a) -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X3_T:%.*]] poison, <2 x i32> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT32X2X3_T]] [[DOTFCA_0_2_INSERT]] +// int32x2x3_t test_vld3_s32(int32_t const *a) { return vld3_s32(a); } -// CHECK-LABEL: @test_vld3_s64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X3_T:%.*]] poison, <1 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// int64x1x3_t test_vld3_s64(int64_t const *a) { return vld3_s64(a); } -// CHECK-LABEL: @test_vld3_f16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr %a) -// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float16x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T:%.*]] poison, <4 x half> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x half> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] +// float16x4x3_t test_vld3_f16(float16_t const *a) { return vld3_f16(a); } -// CHECK-LABEL: @test_vld3_f32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr %a) -// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float32x2x3_t @test_vld3_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = 
call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T:%.*]] poison, <2 x float> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x float> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X3_T]] [[DOTFCA_0_2_INSERT]] +// float32x2x3_t test_vld3_f32(float32_t const *a) { return vld3_f32(a); } -// CHECK-LABEL: @test_vld3_f64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T:%.*]] poison, <1 x double> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// float64x1x3_t test_vld3_f64(float64_t const *a) { return vld3_f64(a); } -// CHECK-LABEL: @test_vld3_p8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x3_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x 
i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T:%.*]] poison, <8 x i8> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X3_T]] [[DOTFCA_0_2_INSERT]] +// poly8x8x3_t test_vld3_p8(poly8_t const *a) { return vld3_p8(a); } -// CHECK-LABEL: @test_vld3_p16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T:%.*]] poison, <4 x i16> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X3_T]] [[DOTFCA_0_2_INSERT]] +// poly16x4x3_t test_vld3_p16(poly16_t const *a) { return vld3_p16(a); } -// CHECK-LABEL: @test_vld4q_u8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint8x16x4_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] 
= extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T:%.*]] poison, <16 x i8> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_3_INSERT]] +// uint8x16x4_t test_vld4q_u8(uint8_t const *a) { return vld4q_u8(a); } -// CHECK-LABEL: @test_vld4q_u16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint16x8x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T:%.*]] poison, <8 x i16> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i16> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_3_INSERT]] +// uint16x8x4_t test_vld4q_u16(uint16_t const *a) { return vld4q_u16(a); } -// CHECK-LABEL: @test_vld4q_u32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %a) -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]] -// CHECK: 
call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint32x4x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T:%.*]] poison, <4 x i32> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i32> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_3_INSERT]] +// uint32x4x4_t test_vld4q_u32(uint32_t const *a) { return vld4q_u32(a); } -// CHECK-LABEL: @test_vld4q_u64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> 
[[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// uint64x2x4_t test_vld4q_u64(uint64_t const *a) { return vld4q_u64(a); } -// CHECK-LABEL: @test_vld4q_s8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int8x16x4_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T:%.*]] poison, <16 x i8> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_3_INSERT]] +// int8x16x4_t test_vld4q_s8(int8_t const *a) { return vld4q_s8(a); } -// CHECK-LABEL: @test_vld4q_s16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3 
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T:%.*]] poison, <8 x i16> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i16> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_3_INSERT]] +// int16x8x4_t test_vld4q_s16(int16_t const *a) { return vld4q_s16(a); } -// CHECK-LABEL: @test_vld4q_s32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %a) -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T:%.*]] poison, <4 x i32> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i32> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_3_INSERT]] +// int32x4x4_t test_vld4q_s32(int32_t const *a) { return vld4q_s32(a); } -// CHECK-LABEL: @test_vld4q_s64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x 
i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// int64x2x4_t test_vld4q_s64(int64_t const *a) { return vld4q_s64(a); } -// CHECK-LABEL: @test_vld4q_f16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr %a) -// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float16x8x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T:%.*]] poison, <8 x half> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x half> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x half> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]] +// float16x8x4_t test_vld4q_f16(float16_t const *a) { return vld4q_f16(a); } -// CHECK-LABEL: @test_vld4q_f32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x 
float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T:%.*]] poison, <4 x float> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x float> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x float> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float32x4x4_t test_vld4q_f32(float32_t const *a) { return vld4q_f32(a); }
-// CHECK-LABEL: @test_vld4q_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T:%.*]] poison, <2 x double> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x double> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float64x2x4_t test_vld4q_f64(float64_t const *a) { return vld4q_f64(a); }
-// CHECK-LABEL: @test_vld4q_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T:%.*]] poison, <16 x i8> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly8x16x4_t test_vld4q_p8(poly8_t const *a) { return vld4q_p8(a); }
-// CHECK-LABEL: @test_vld4q_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T:%.*]] poison, <8 x i16> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i16> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly16x8x4_t test_vld4q_p16(poly16_t const *a) { return vld4q_p16(a); }
-// CHECK-LABEL: @test_vld4_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T:%.*]] poison, <8 x i8> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i8> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 uint8x8x4_t test_vld4_u8(uint8_t const *a) { return vld4_u8(a); }
-// CHECK-LABEL: @test_vld4_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x4_t @test_vld4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T:%.*]] poison, <4 x i16> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i16> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 uint16x4x4_t test_vld4_u16(uint16_t const *a) { return vld4_u16(a); }
-// CHECK-LABEL: @test_vld4_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T:%.*]] poison, <2 x i32> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i32> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 uint32x2x4_t test_vld4_u32(uint32_t const *a) { return vld4_u32(a); }
-// CHECK-LABEL: @test_vld4_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T:%.*]] poison, <1 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 uint64x1x4_t test_vld4_u64(uint64_t const *a) { return vld4_u64(a); }
-// CHECK-LABEL: @test_vld4_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T:%.*]] poison, <8 x i8> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i8> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 int8x8x4_t test_vld4_s8(int8_t const *a) { return vld4_s8(a); }
-// CHECK-LABEL: @test_vld4_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T:%.*]] poison, <4 x i16> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i16> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 int16x4x4_t test_vld4_s16(int16_t const *a) { return vld4_s16(a); }
-// CHECK-LABEL: @test_vld4_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X4_T:%.*]] poison, <2 x i32> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i32> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 int32x2x4_t test_vld4_s32(int32_t const *a) { return vld4_s32(a); }
-// CHECK-LABEL: @test_vld4_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T:%.*]] poison, <1 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 int64x1x4_t test_vld4_s64(int64_t const *a) { return vld4_s64(a); }
-// CHECK-LABEL: @test_vld4_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T:%.*]] poison, <4 x half> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x half> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x half> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float16x4x4_t test_vld4_f16(float16_t const *a) { return vld4_f16(a); }
-// CHECK-LABEL: @test_vld4_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T:%.*]] poison, <2 x float> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x float> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x float> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float32x2x4_t test_vld4_f32(float32_t const *a) { return vld4_f32(a); }
-// CHECK-LABEL: @test_vld4_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x double> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float64x1x4_t test_vld4_f64(float64_t const *a) { return vld4_f64(a); }
-// CHECK-LABEL: @test_vld4_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T:%.*]] poison, <8 x i8> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i8> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly8x8x4_t test_vld4_p8(poly8_t const *a) { return vld4_p8(a); }
-// CHECK-LABEL: @test_vld4_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T:%.*]] poison, <4 x i16> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i16> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly16x4x4_t test_vld4_p16(poly16_t const *a) { return vld4_p16(a); }
-// CHECK-LABEL: @test_vst1q_u8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1q_u8(uint8_t *a, uint8x16_t b) { vst1q_u8(a, b); }
-// CHECK-LABEL: @test_vst1q_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_u16(uint16_t *a, uint16x8_t b) { vst1q_u16(a, b); }
-// CHECK-LABEL: @test_vst1q_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: store <4 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1q_u32(uint32_t *a, uint32x4_t b) { vst1q_u32(a, b); }
-// CHECK-LABEL: @test_vst1q_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_u64(uint64_t *a, uint64x2_t b) { vst1q_u64(a, b); }
-// CHECK-LABEL: @test_vst1q_s8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1q_s8(int8_t *a, int8x16_t b) { vst1q_s8(a, b); }
-// CHECK-LABEL: @test_vst1q_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_s16(int16_t *a, int16x8_t b) { vst1q_s16(a, b); }
-// CHECK-LABEL: @test_vst1q_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: store <4 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1q_s32(int32_t *a, int32x4_t b) { vst1q_s32(a, b); }
-// CHECK-LABEL: @test_vst1q_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_s64(int64_t *a, int64x2_t b) { vst1q_s64(a, b); }
-// CHECK-LABEL: @test_vst1q_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: store <8 x half> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: store <8 x half> [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_f16(float16_t *a, float16x8_t b) { vst1q_f16(a, b); }
-// CHECK-LABEL: @test_vst1q_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: store <4 x float> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1q_f32(float32_t *a, float32x4_t b) { vst1q_f32(a, b); }
-// CHECK-LABEL: @test_vst1q_f64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: store <2 x double> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_f64(float64_t *a, float64x2_t b) { vst1q_f64(a, b); }
-// CHECK-LABEL: @test_vst1q_p8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1q_p8(poly8_t *a, poly8x16_t b) { vst1q_p8(a, b); }
-// CHECK-LABEL: @test_vst1q_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_p16(poly16_t *a, poly16x8_t b) { vst1q_p16(a, b); }
-// CHECK-LABEL: @test_vst1_u8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1_u8(uint8_t *a, uint8x8_t b) { vst1_u8(a, b); }
-// CHECK-LABEL: @test_vst1_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_u16(uint16_t *a, uint16x4_t b) { vst1_u16(a, b); }
-// CHECK-LABEL: @test_vst1_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: store <2 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1_u32(uint32_t *a, uint32x2_t b) { vst1_u32(a, b); }
-// CHECK-LABEL: @test_vst1_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: store <1 x i64> [[TMP1]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_u64(uint64_t *a, uint64x1_t b) { vst1_u64(a, b); }
-// CHECK-LABEL: @test_vst1_s8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1_s8(int8_t *a, int8x8_t b) { vst1_s8(a, b); }
-// CHECK-LABEL: @test_vst1_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_s16(int16_t *a, int16x4_t b) { vst1_s16(a, b); }
-// CHECK-LABEL: @test_vst1_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: store <2 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1_s32(int32_t *a, int32x2_t b) { vst1_s32(a, b); }
-// CHECK-LABEL: @test_vst1_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: store <1 x i64> [[TMP1]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_s64(int64_t *a, int64x1_t b) { vst1_s64(a, b); }
-// CHECK-LABEL: @test_vst1_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: store <4 x half> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: store <4 x half> [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_f16(float16_t *a, float16x4_t b) { vst1_f16(a, b); }
-// CHECK-LABEL: @test_vst1_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: store <2 x float> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1_f32(float32_t *a, float32x2_t b) { vst1_f32(a, b); }
-// CHECK-LABEL: @test_vst1_f64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: store <1 x double> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: store <1 x double> [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_f64(float64_t *a, float64x1_t b) { vst1_f64(a, b); }
-// CHECK-LABEL: @test_vst1_p8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1_p8(poly8_t *a, poly8x8_t b) { vst1_p8(a, b); }
-// CHECK-LABEL: @test_vst1_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_p16(poly16_t *a, poly16x4_t b) { vst1_p16(a, b); }
-// CHECK-LABEL: @test_vst2q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) { vst2q_u8(a, b); }
-// CHECK-LABEL: @test_vst2q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) { vst2q_u16(a, b); }
-// CHECK-LABEL: @test_vst2q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) { vst2q_u32(a, b); }
-// CHECK-LABEL: @test_vst2q_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) { vst2q_u64(a, b); }
-// CHECK-LABEL: @test_vst2q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_s8(int8_t *a, int8x16x2_t b) { vst2q_s8(a, b); }
-// CHECK-LABEL: @test_vst2q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_s16(int16_t *a, int16x8x2_t b) { vst2q_s16(a, b); }
-// CHECK-LABEL: @test_vst2q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_s32(int32_t *a, int32x4x2_t b) { vst2q_s32(a, b); }
-// CHECK-LABEL: @test_vst2q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_s64(int64_t *a, int64x2x2_t b) { vst2q_s64(a, b); }
-// CHECK-LABEL: @test_vst2q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP4]], <8 x half> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_f16(float16_t *a, float16x8x2_t b) { vst2q_f16(a, b); }
-// CHECK-LABEL: @test_vst2q_f32(
-//
CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP4]], <4 x float> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_f32(float32_t *a, float32x4x2_t b) { vst2q_f32(a, b); } -// CHECK-LABEL: @test_vst2q_f64( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to 
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP4]], <2 x double> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_f64(float64_t *a, float64x2x2_t b) { vst2q_f64(a, b); }
-// CHECK-LABEL: @test_vst2q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) { vst2q_p8(a, b); }
-// CHECK-LABEL: @test_vst2q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) { vst2q_p16(a, b); }
-// CHECK-LABEL: @test_vst2_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_u8(uint8_t *a, uint8x8x2_t b) { vst2_u8(a, b); }
-// CHECK-LABEL: @test_vst2_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_u16(uint16_t *a, uint16x4x2_t b) { vst2_u16(a, b); }
-// CHECK-LABEL: @test_vst2_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_u32(uint32_t *a, uint32x2x2_t b) { vst2_u32(a, b); }
-// CHECK-LABEL: @test_vst2_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_u64(uint64_t *a, uint64x1x2_t b) { vst2_u64(a, b); }
-// CHECK-LABEL: @test_vst2_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_s8(int8_t *a, int8x8x2_t b) { vst2_s8(a, b); }
-// CHECK-LABEL: @test_vst2_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_s16(int16_t *a, int16x4x2_t b) { vst2_s16(a, b); }
-// CHECK-LABEL: @test_vst2_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_s32(int32_t *a, int32x2x2_t b) { vst2_s32(a, b); }
-// CHECK-LABEL: @test_vst2_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_s64(int64_t *a, int64x1x2_t b) { vst2_s64(a, b); }
-// CHECK-LABEL: @test_vst2_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP4]], <4 x half> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_f16(float16_t *a, float16x4x2_t b) { vst2_f16(a, b); }
-// CHECK-LABEL: @test_vst2_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP4]], <2 x float> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_f32(float32_t *a, float32x2x2_t b) { vst2_f32(a, b); }
-// CHECK-LABEL: @test_vst2_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP4]], <1 x double> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_f64(float64_t *a, float64x1x2_t b) { vst2_f64(a, b); }
-// CHECK-LABEL: @test_vst2_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_p8(poly8_t *a, poly8x8x2_t b) { vst2_p8(a, b); }
-// CHECK-LABEL: @test_vst2_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_p16(poly16_t *a, poly16x4x2_t b) { vst2_p16(a, b); }
-// CHECK-LABEL: @test_vst3q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) { vst3q_u8(a, b); }
-// CHECK-LABEL: @test_vst3q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) { vst3q_u16(a, b); }
-// CHECK-LABEL: @test_vst3q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) { vst3q_u32(a, b); }
-// CHECK-LABEL: @test_vst3q_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) { vst3q_u64(a, b); }
-// CHECK-LABEL: @test_vst3q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_s8(int8_t *a, int8x16x3_t b) { vst3q_s8(a, b); }
-// CHECK-LABEL: @test_vst3q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_s16(int16_t *a, int16x8x3_t b) { vst3q_s16(a, b); }
-// CHECK-LABEL: @test_vst3q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_s32(int32_t *a, int32x4x3_t b) { vst3q_s32(a, b); }
-// CHECK-LABEL: @test_vst3q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_s64(int64_t *a, int64x2x3_t b) { vst3q_s64(a, b); }
-// CHECK-LABEL: @test_vst3q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr
[[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_f16(float16_t *a, float16x8x3_t b) { vst3q_f16(a, b); } -// CHECK-LABEL: @test_vst3q_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load 
<4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> -// CHECK: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_f32(float32_t *a, float32x4x3_t b) { vst3q_f32(a, b); } -// CHECK-LABEL: @test_vst3q_f64( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> 
[[TMP8]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_f64(float64_t *a, float64x2x3_t b) { vst3q_f64(a, b); } -// CHECK-LABEL: @test_vst3q_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> 
[[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) { vst3q_p8(a, b); } -// CHECK-LABEL: @test_vst3q_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) { vst3q_p16(a, b); } -// CHECK-LABEL: @test_vst3_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void 
@llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_u8(uint8_t *a, uint8x8x3_t b) { vst3_u8(a, b); } -// CHECK-LABEL: @test_vst3_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define 
dso_local void @test_vst3_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_u16(uint16_t *a, uint16x4x3_t b) { vst3_u16(a, b); } -// CHECK-LABEL: @test_vst3_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] 
to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_u32(uint32_t *a, uint32x2x3_t b) { vst3_u32(a, b); } -// CHECK-LABEL: @test_vst3_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_u64(uint64_t *a, uint64x1x3_t b) { vst3_u64(a, b); } -// CHECK-LABEL: @test_vst3_s8( -// CHECK: [[B:%.*]] = alloca 
%struct.int8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_s8(int8_t *a, int8x8x3_t b) { vst3_s8(a, b); } -// CHECK-LABEL: @test_vst3_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP10:%.*]] 
= bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_s16(int16_t *a, int16x4x3_t b) { vst3_s16(a, b); } -// CHECK-LABEL: @test_vst3_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] 
[[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_s32(int32_t *a, int32x2x3_t b) { vst3_s32(a, b); } -// CHECK-LABEL: @test_vst3_s64( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: call void 
@llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_s64(int64_t *a, int64x1x3_t b) { vst3_s64(a, b); } -// CHECK-LABEL: @test_vst3_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_f16(float16_t *a, float16x4x3_t b) { vst3_f16(a, b); } -// CHECK-LABEL: @test_vst3_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_f32(float32_t *a, float32x2x3_t b) { vst3_f32(a, b); } -// CHECK-LABEL: @test_vst3_f64( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr 
inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP6]], <1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_f64(float64_t *a, float64x1x3_t b) { vst3_f64(a, b); } -// CHECK-LABEL: @test_vst3_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: 
[[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_p8(poly8_t *a, poly8x8x3_t b) { vst3_p8(a, b); }
-// CHECK-LABEL: @test_vst3_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_p16(poly16_t *a, poly16x4x3_t b) { vst3_p16(a, b); }
-// CHECK-LABEL: @test_vst4q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) { vst4q_u8(a, b); }
-// CHECK-LABEL: @test_vst4q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) { vst4q_u16(a, b); }
-// CHECK-LABEL: @test_vst4q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) { vst4q_u32(a, b); }
-// CHECK-LABEL: @test_vst4q_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) { vst4q_u64(a, b); }
-// CHECK-LABEL: @test_vst4q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_s8(int8_t *a, int8x16x4_t b) { vst4q_s8(a, b); }
-// CHECK-LABEL: @test_vst4q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_s16(int16_t *a, int16x8x4_t b) { vst4q_s16(a, b); }
-// CHECK-LABEL: @test_vst4q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_s32(int32_t *a, int32x4x4_t b) { vst4q_s32(a, b); }
-// CHECK-LABEL: @test_vst4q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_s64(int64_t *a, int64x2x4_t b) { vst4q_s64(a, b); }
-// CHECK-LABEL: @test_vst4q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_f16(float16_t *a, float16x8x4_t b) { vst4q_f16(a, b); }
-// CHECK-LABEL: @test_vst4q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_f32(float32_t *a, float32x4x4_t b) { vst4q_f32(a, b); }
-// CHECK-LABEL: @test_vst4q_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_f64(float64_t *a, float64x2x4_t b) { vst4q_f64(a, b); }
-// CHECK-LABEL: @test_vst4q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) { vst4q_p8(a, b); }
-// CHECK-LABEL: @test_vst4q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) { vst4q_p16(a, b); }
-// CHECK-LABEL: @test_vst4_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u8(uint8_t *a, uint8x8x4_t b) { vst4_u8(a, b); }
-// CHECK-LABEL: @test_vst4_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u16(uint16_t *a, uint16x4x4_t b) { vst4_u16(a, b); }
-// CHECK-LABEL: @test_vst4_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u32(uint32_t *a, uint32x2x4_t b) { vst4_u32(a, b); }
-// CHECK-LABEL: @test_vst4_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u64(uint64_t *a, uint64x1x4_t b) { vst4_u64(a, b); }
-// CHECK-LABEL: @test_vst4_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_s8(int8_t *a, int8x8x4_t b) { vst4_s8(a, b); }
-// CHECK-LABEL: @test_vst4_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_s16(int16_t *a, int16x4x4_t b) { vst4_s16(a, b); }
-// CHECK-LABEL: @test_vst4_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_s32(int32_t *a, int32x2x4_t b) { vst4_s32(a, b); }
-// CHECK-LABEL: @test_vst4_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]]
= getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_s64(int64_t *a, int64x1x4_t b) { vst4_s64(a, b); } -// CHECK-LABEL: @test_vst4_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = 
getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_f16(float16_t *a, float16x4x4_t b) { vst4_f16(a, b); } -// CHECK-LABEL: @test_vst4_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// 
CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> -// CHECK: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_f32(float32_t *a, float32x2x4_t b) { vst4_f32(a, b); } -// CHECK-LABEL: @test_vst4_f64( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, 
i32 0 -// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_3_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_6_24_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP3]], i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x 
i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[B_SROA_6_24_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_f64(float64_t *a, float64x1x4_t b) { vst4_f64(a, b); } -// CHECK-LABEL: @test_vst4_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_p8(poly8_t *a, poly8x8x4_t b) { vst4_p8(a, b); } -// CHECK-LABEL: @test_vst4_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: 
call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_p16(poly16_t *a, poly16x4x4_t b) { vst4_p16(a, b); } -// CHECK-LABEL: @test_vld1q_f64_x2( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } 
@llvm.aarch64.neon.ld1x2.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld1q_f64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T:%.*]] poison, <2 x double> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_1_INSERT]] +// float64x2x2_t test_vld1q_f64_x2(float64_t const *a) { return vld1q_f64_x2(a); } -// CHECK-LABEL: @test_vld1q_p64_x2( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld1q_p64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T:%.*]] poison, <2 x i64> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_1_INSERT]] +// poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) { return vld1q_p64_x2(a); } -// CHECK-LABEL: @test_vld1_f64_x2( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld1_f64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } 
@llvm.aarch64.neon.ld1x2.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// float64x1x2_t test_vld1_f64_x2(float64_t const *a) { return vld1_f64_x2(a); } -// CHECK-LABEL: @test_vld1_p64_x2( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld1_p64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T:%.*]] poison, <1 x i64> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) { return vld1_p64_x2(a); } -// CHECK-LABEL: @test_vld1q_f64_x3( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld1q_f64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T:%.*]] poison, <2 x double> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: 
[[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_2_INSERT]] +// float64x2x3_t test_vld1q_f64_x3(float64_t const *a) { return vld1q_f64_x3(a); } -// CHECK-LABEL: @test_vld1q_p64_x3( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld1q_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T:%.*]] poison, <2 x i64> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_2_INSERT]] +// poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) { return vld1q_p64_x3(a); } -// CHECK-LABEL: @test_vld1_f64_x3( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld1_f64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T:%.*]] poison, <1 x double> 
[[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// float64x1x3_t test_vld1_f64_x3(float64_t const *a) { return vld1_f64_x3(a); } -// CHECK-LABEL: @test_vld1_p64_x3( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld1_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T:%.*]] poison, <1 x i64> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) { return vld1_p64_x3(a); } -// CHECK-LABEL: @test_vld1q_f64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld1q_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x 
double> } [[VLD1XN]], 2 +// CHECK-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T:%.*]] poison, <2 x double> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x double> [[VLD1XN_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// float64x2x4_t test_vld1q_f64_x4(float64_t const *a) { return vld1q_f64_x4(a); } -// CHECK-LABEL: @test_vld1q_p64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld1q_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T:%.*]] poison, <2 x i64> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD1XN_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) { return vld1q_p64_x4(a); } -// CHECK-LABEL: @test_vld1_f64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, 
ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld1_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 2 +// CHECK-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x double> [[VLD1XN_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_3_INSERT]] +// float64x1x4_t test_vld1_f64_x4(float64_t const *a) { return vld1_f64_x4(a); } -// CHECK-LABEL: @test_vld1_p64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld1_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T:%.*]] poison, <1 x i64> [[VLD1XN_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD1XN_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD1XN_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> 
[[VLD1XN_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_3_INSERT]] +// poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) { return vld1_p64_x4(a); } -// CHECK-LABEL: @test_vst1q_f64_x2( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_f64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP4]], <2 x double> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) { vst1q_f64_x2(a, b); } -// CHECK-LABEL: @test_vst1q_p64_x2( -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 
0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_p64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) { vst1q_p64_x2(a, b); } -// CHECK-LABEL: @test_vst1_f64_x2( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_f64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], 
i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP4]], <1 x double> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) { vst1_f64_x2(a, b); } -// CHECK-LABEL: @test_vst1_p64_x2( -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_p64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) { vst1_p64_x2(a, b); } -// CHECK-LABEL: @test_vst1q_f64_x3( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: 
[[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_f64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) { vst1q_f64_x3(a, b); } -// CHECK-LABEL: @test_vst1q_p64_x3( -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: 
[[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) { vst1q_p64_x3(a, b); } -// CHECK-LABEL: @test_vst1_f64_x3( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: 
[[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_f64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP6]], <1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) { vst1_f64_x3(a, b); } -// CHECK-LABEL: @test_vst1_p64_x3( -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> 
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) { vst1_p64_x3(a, b); } -// CHECK-LABEL: @test_vst1q_f64_x4( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> 
[[TMP10]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) { vst1q_f64_x4(a, b); } -// CHECK-LABEL: @test_vst1q_p64_x4( -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 
0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) { vst1q_p64_x4(a, b); } -// CHECK-LABEL: @test_vst1_f64_x4( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_3_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_6_24_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP3]], i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[B_SROA_6_24_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) { vst1_f64_x4(a, b); } -// CHECK-LABEL: @test_vst1_p64_x4( -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// 
CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) { vst1_p64_x4(a, b); } -// CHECK-LABEL: @test_vceqd_s64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[TMP0:%.*]] = icmp eq i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vceqd_s64(int64_t a, int64_t b) { return (uint64_t)vceqd_s64(a, b); } -// CHECK-LABEL: @test_vceqd_u64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vceqd_u64(uint64_t a, uint64_t b) { return (int64_t)vceqd_u64(a, b); } -// CHECK-LABEL: @test_vceqzd_s64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 -// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqzd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], 0 +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQZ_I]] +// uint64_t test_vceqzd_s64(int64_t a) { return (uint64_t)vceqzd_s64(a); } -// CHECK-LABEL: @test_vceqzd_u64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 -// CHECK: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQZD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqzd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], 0 +// CHECK-NEXT: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQZD_I]] +// int64_t test_vceqzd_u64(int64_t a) { return (int64_t)vceqzd_u64(a); } -// CHECK-LABEL: @test_vcged_s64( -// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcged_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sge i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcged_s64(int64_t a, int64_t b) { return (uint64_t)vcged_s64(a, b); } -// CHECK-LABEL: @test_vcged_u64( -// CHECK: [[TMP0:%.*]] = icmp uge i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcged_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp uge i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcged_u64(uint64_t a, uint64_t b) { return (uint64_t)vcged_u64(a, b); } -// CHECK-LABEL: @test_vcgezd_s64( -// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, 0 -// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGEZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgezd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sge i64 [[A]], 0 +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGEZ_I]] +// uint64_t test_vcgezd_s64(int64_t a) { return (uint64_t)vcgezd_s64(a); } -// CHECK-LABEL: @test_vcgtd_s64( -// CHECK: [[TMP0:%.*]] = icmp sgt 
i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcgtd_s64(int64_t a, int64_t b) { return (uint64_t)vcgtd_s64(a, b); } -// CHECK-LABEL: @test_vcgtd_u64( -// CHECK: [[TMP0:%.*]] = icmp ugt i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) { return (uint64_t)vcgtd_u64(a, b); } -// CHECK-LABEL: @test_vcgtzd_s64( -// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, 0 -// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGTZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtzd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], 0 +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGTZ_I]] +// uint64_t test_vcgtzd_s64(int64_t a) { return (uint64_t)vcgtzd_s64(a); } -// CHECK-LABEL: @test_vcled_s64( -// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcled_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sle i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcled_s64(int64_t a, int64_t b) { return (uint64_t)vcled_s64(a, b); } -// CHECK-LABEL: @test_vcled_u64( -// CHECK: [[TMP0:%.*]] = icmp ule i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcled_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcled_u64(uint64_t a, uint64_t b) { return (uint64_t)vcled_u64(a, b); } -// CHECK-LABEL: @test_vclezd_s64( -// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, 0 -// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCLEZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vclezd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sle i64 [[A]], 0 +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCLEZ_I]] +// uint64_t test_vclezd_s64(int64_t a) { return (uint64_t)vclezd_s64(a); } -// CHECK-LABEL: @test_vcltd_s64( -// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcltd_s64(int64_t a, int64_t b) { return (uint64_t)vcltd_s64(a, b); } -// CHECK-LABEL: @test_vcltd_u64( -// CHECK: [[TMP0:%.*]] = icmp ult i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcltd_u64(uint64_t a, uint64_t b) { return (uint64_t)vcltd_u64(a, b); } -// CHECK-LABEL: @test_vcltzd_s64( -// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, 0 -// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCLTZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltzd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 [[A]], 0 +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCLTZ_I]] +// uint64_t test_vcltzd_s64(int64_t a) { return (uint64_t)vcltzd_s64(a); } -// CHECK-LABEL: @test_vtstd_s64( -// CHECK: [[TMP0:%.*]] = and i64 %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 -// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 -// CHECK: ret i64 [[VTSTD_I]] +// CHECK-LABEL: define dso_local i64 @test_vtstd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and i64 [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK-NEXT: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK-NEXT: ret i64 [[VTSTD_I]] +// uint64_t test_vtstd_s64(int64_t a, int64_t b) { return (uint64_t)vtstd_s64(a, b); } -// CHECK-LABEL: @test_vtstd_u64( -// CHECK: [[TMP0:%.*]] = and i64 %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 -// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 -// CHECK: ret i64 [[VTSTD_I]] +// CHECK-LABEL: define dso_local i64 @test_vtstd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and i64 [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK-NEXT: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK-NEXT: ret i64 [[VTSTD_I]] +// uint64_t test_vtstd_u64(uint64_t a, uint64_t b) { return (uint64_t)vtstd_u64(a, b); } -// CHECK-LABEL: @test_vabsd_s64( -// CHECK: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) -// CHECK: ret i64 [[VABSD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vabsd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 [[A]]) +// CHECK-NEXT: ret i64 [[VABSD_S64_I]] +// int64_t test_vabsd_s64(int64_t a) { return (int64_t)vabsd_s64(a); } -// CHECK-LABEL: @test_vqabsb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqabsb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqabsb_s8(int8_t a) { return (int8_t)vqabsb_s8(a); } -// CHECK-LABEL: @test_vqabsh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqabsh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqabsh_s16(int16_t a) { return (int16_t)vqabsh_s16(a); } -// CHECK-LABEL: @test_vqabss_s32( -// CHECK: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) -// CHECK: ret i32 [[VQABSS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqabss_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 [[A]]) +// CHECK-NEXT: ret i32 [[VQABSS_S32_I]] +// int32_t test_vqabss_s32(int32_t a) { return (int32_t)vqabss_s32(a); } -// CHECK-LABEL: @test_vqabsd_s64( -// CHECK: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) -// CHECK: ret i64 [[VQABSD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqabsd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 [[A]]) +// CHECK-NEXT: ret i64 [[VQABSD_S64_I]] +// int64_t test_vqabsd_s64(int64_t a) { return (int64_t)vqabsd_s64(a); } -// CHECK-LABEL: @test_vnegd_s64( -// CHECK: [[VNEGD_I:%.*]] = sub i64 0, %a -// CHECK: ret i64 [[VNEGD_I]] +// CHECK-LABEL: define dso_local i64 @test_vnegd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VNEGD_I:%.*]] = sub i64 0, [[A]] +// CHECK-NEXT: ret i64 [[VNEGD_I]] +// int64_t test_vnegd_s64(int64_t a) { return (int64_t)vnegd_s64(a); } -// CHECK-LABEL: @test_vqnegb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqnegb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqnegb_s8(int8_t a) { return (int8_t)vqnegb_s8(a); } -// CHECK-LABEL: @test_vqnegh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQNEGH_S16_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqnegh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqnegh_s16(int16_t a) { return (int16_t)vqnegh_s16(a); } -// CHECK-LABEL: @test_vqnegs_s32( -// CHECK: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) -// CHECK: ret i32 [[VQNEGS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqnegs_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 [[A]]) +// CHECK-NEXT: ret i32 [[VQNEGS_S32_I]] +// int32_t test_vqnegs_s32(int32_t a) { return (int32_t)vqnegs_s32(a); } -// CHECK-LABEL: @test_vqnegd_s64( -// CHECK: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) -// CHECK: ret i64 [[VQNEGD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqnegd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 [[A]]) +// CHECK-NEXT: ret i64 [[VQNEGD_S64_I]] +// int64_t test_vqnegd_s64(int64_t a) { return (int64_t)vqnegd_s64(a); } -// CHECK-LABEL: @test_vuqaddb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vuqaddb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// int8_t test_vuqaddb_s8(int8_t a, uint8_t b) { return (int8_t)vuqaddb_s8(a, b); } -// CHECK-LABEL: @test_vuqaddh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vuqaddh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: 
[[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vuqaddh_s16(int16_t a, uint16_t b) { return (int16_t)vuqaddh_s16(a, b); } -// CHECK-LABEL: @test_vuqadds_s32( -// CHECK: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VUQADDS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vuqadds_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VUQADDS_S32_I]] +// int32_t test_vuqadds_s32(int32_t a, uint32_t b) { return (int32_t)vuqadds_s32(a, b); } -// CHECK-LABEL: @test_vuqaddd_s64( -// CHECK: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VUQADDD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vuqaddd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VUQADDD_S64_I]] +// int64_t test_vuqaddd_s64(int64_t a, uint64_t b) { return (int64_t)vuqaddd_s64(a, b); } -// CHECK-LABEL: @test_vsqaddb_u8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vsqaddb_u8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// uint8_t test_vsqaddb_u8(uint8_t a, int8_t b) { return (uint8_t)vsqaddb_u8(a, b); } -// CHECK-LABEL: @test_vsqaddh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vsqaddh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// uint16_t test_vsqaddh_u16(uint16_t a, int16_t b) { return (uint16_t)vsqaddh_u16(a, b); } -// CHECK-LABEL: @test_vsqadds_u32( -// CHECK: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) -// CHECK: ret i32 
[[VSQADDS_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vsqadds_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VSQADDS_U32_I]] +// uint32_t test_vsqadds_u32(uint32_t a, int32_t b) { return (uint32_t)vsqadds_u32(a, b); } -// CHECK-LABEL: @test_vsqaddd_u64( -// CHECK: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VSQADDD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vsqaddd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VSQADDD_U64_I]] +// uint64_t test_vsqaddd_u64(uint64_t a, int64_t b) { return (uint64_t)vsqaddd_u64(a, b); } -// CHECK-LABEL: @test_vqdmlalh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %c, i64 0 -// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 -// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) -// CHECK: ret i32 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_s16( +// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0_I]]) +// CHECK-NEXT: ret i32 [[VQDMLXL1_I]] +// int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) { return (int32_t)vqdmlalh_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlals_s32( -// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) -// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) -// CHECK: ret i64 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i64 @test_vqdmlals_s32( +// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[C]]) +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL_I]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1_I]] +// int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) { return (int64_t)vqdmlals_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmlslh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %c, i64 0 -// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 -// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) -// 
CHECK: ret i32 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_s16( +// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0_I]]) +// CHECK-NEXT: ret i32 [[VQDMLXL1_I]] +// int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) { return (int32_t)vqdmlslh_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlsls_s32( -// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) -// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) -// CHECK: ret i64 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_s32( +// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[C]]) +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL_I]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1_I]] +// int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) { return (int64_t)vqdmlsls_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmullh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 -// CHECK: ret i32 [[TMP2]] +// CHECK-LABEL: define dso_local i32 @test_vqdmullh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK-NEXT: ret i32 [[TMP2]] +// int32_t test_vqdmullh_s16(int16_t a, int16_t b) { return (int32_t)vqdmullh_s16(a, b); } -// CHECK-LABEL: @test_vqdmulls_s32( -// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) -// CHECK: ret i64 [[VQDMULLS_S32_I]] +// CHECK-LABEL: define dso_local i64 @test_vqdmulls_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]] +// int64_t test_vqdmulls_s32(int32_t a, int32_t b) { return (int64_t)vqdmulls_s32(a, b); } -// CHECK-LABEL: @test_vqmovunh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 
0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqmovunh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// uint8_t test_vqmovunh_s16(int16_t a) { return (uint8_t)vqmovunh_s16(a); } -// CHECK-LABEL: @test_vqmovuns_s32( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 -// CHECK: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqmovuns_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// uint16_t test_vqmovuns_s32(int32_t a) { return (uint16_t)vqmovuns_s32(a); } -// CHECK-LABEL: @test_vqmovund_s64( -// CHECK: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) -// CHECK: ret i32 [[VQMOVUND_S64_I]] +// CHECK-LABEL: define dso_local i32 @test_vqmovund_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 [[A]]) +// CHECK-NEXT: ret i32 [[VQMOVUND_S64_I]] +// uint32_t test_vqmovund_s64(int64_t a) { return (uint32_t)vqmovund_s64(a); } -// CHECK-LABEL: @test_vqmovnh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqmovnh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqmovnh_s16(int16_t a) { return (int8_t)vqmovnh_s16(a); } -// CHECK-LABEL: @test_vqmovns_s32( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 -// CHECK: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqmovns_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqmovns_s32(int32_t a) { return 
(int16_t)vqmovns_s32(a); } -// CHECK-LABEL: @test_vqmovnd_s64( -// CHECK: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) -// CHECK: ret i32 [[VQMOVND_S64_I]] +// CHECK-LABEL: define dso_local i32 @test_vqmovnd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 [[A]]) +// CHECK-NEXT: ret i32 [[VQMOVND_S64_I]] +// int32_t test_vqmovnd_s64(int64_t a) { return (int32_t)vqmovnd_s64(a); } -// CHECK-LABEL: @test_vqmovnh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqmovnh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqmovnh_u16(int16_t a) { return (int8_t)vqmovnh_u16(a); } -// CHECK-LABEL: @test_vqmovns_u32( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 -// CHECK: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqmovns_u32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqmovns_u32(int32_t a) { return (int16_t)vqmovns_u32(a); } -// CHECK-LABEL: @test_vqmovnd_u64( -// CHECK: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) -// CHECK: ret i32 [[VQMOVND_U64_I]] +// CHECK-LABEL: define dso_local i32 @test_vqmovnd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 [[A]]) +// CHECK-NEXT: ret i32 [[VQMOVND_U64_I]] +// int32_t test_vqmovnd_u64(int64_t a) { return (int32_t)vqmovnd_u64(a); } -// CHECK-LABEL: @test_vceqs_f32( -// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vceqs_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vceqs_f32(float32_t a, float32_t b) { return (uint32_t)vceqs_f32(a, b); } -// CHECK-LABEL: @test_vceqd_f64( -// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqd_f64( +// CHECK-SAME: double noundef [[A:%.*]], 
double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vceqd_f64(float64_t a, float64_t b) { return (uint64_t)vceqd_f64(a, b); } -// CHECK-LABEL: @test_vceqzs_f32( -// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00 -// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCEQZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vceqzs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCEQZ_I]] +// uint32_t test_vceqzs_f32(float32_t a) { return (uint32_t)vceqzs_f32(a); } -// CHECK-LABEL: @test_vceqzd_f64( -// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00 -// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqzd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQZ_I]] +// uint64_t test_vceqzd_f64(float64_t a) { return (uint64_t)vceqzd_f64(a); } -// CHECK-LABEL: @test_vcges_f32( -// CHECK: [[TMP0:%.*]] = fcmp oge float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vcges_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcges_f32(float32_t a, float32_t b) { return (uint32_t)vcges_f32(a, b); } -// CHECK-LABEL: @test_vcged_f64( -// CHECK: [[TMP0:%.*]] = fcmp oge double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcged_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcged_f64(float64_t a, float64_t b) { return (uint64_t)vcged_f64(a, b); } -// CHECK-LABEL: @test_vcgezs_f32( -// CHECK: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00 -// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCGEZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vcgezs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCGEZ_I]] +// uint32_t test_vcgezs_f32(float32_t a) { return (uint32_t)vcgezs_f32(a); } -// CHECK-LABEL: @test_vcgezd_f64( -// CHECK: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00 -// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGEZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgezd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGEZ_I:%.*]] 
= sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGEZ_I]] +// uint64_t test_vcgezd_f64(float64_t a) { return (uint64_t)vcgezd_f64(a); } -// CHECK-LABEL: @test_vcgts_f32( -// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vcgts_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcgts_f32(float32_t a, float32_t b) { return (uint32_t)vcgts_f32(a, b); } -// CHECK-LABEL: @test_vcgtd_f64( -// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcgtd_f64(float64_t a, float64_t b) { return (uint64_t)vcgtd_f64(a, b); } -// CHECK-LABEL: @test_vcgtzs_f32( -// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00 -// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCGTZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vcgtzs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCGTZ_I]] +// uint32_t test_vcgtzs_f32(float32_t a) { return (uint32_t)vcgtzs_f32(a); } -// CHECK-LABEL: @test_vcgtzd_f64( -// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00 -// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGTZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtzd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGTZ_I]] +// uint64_t test_vcgtzd_f64(float64_t a) { return (uint64_t)vcgtzd_f64(a); } -// CHECK-LABEL: @test_vcles_f32( -// CHECK: [[TMP0:%.*]] = fcmp ole float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vcles_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcles_f32(float32_t a, float32_t b) { return (uint32_t)vcles_f32(a, b); } -// CHECK-LABEL: @test_vcled_f64( -// CHECK: [[TMP0:%.*]] = fcmp ole double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcled_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcled_f64(float64_t a, float64_t b) { return (uint64_t)vcled_f64(a, 
b); } -// CHECK-LABEL: @test_vclezs_f32( -// CHECK: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00 -// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCLEZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vclezs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCLEZ_I]] +// uint32_t test_vclezs_f32(float32_t a) { return (uint32_t)vclezs_f32(a); } -// CHECK-LABEL: @test_vclezd_f64( -// CHECK: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00 -// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCLEZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vclezd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCLEZ_I]] +// uint64_t test_vclezd_f64(float64_t a) { return (uint64_t)vclezd_f64(a); } -// CHECK-LABEL: @test_vclts_f32( -// CHECK: [[TMP0:%.*]] = fcmp olt float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vclts_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vclts_f32(float32_t a, float32_t b) { return (uint32_t)vclts_f32(a, b); } -// CHECK-LABEL: @test_vcltd_f64( -// CHECK: [[TMP0:%.*]] = fcmp olt double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcltd_f64(float64_t a, float64_t b) { return (uint64_t)vcltd_f64(a, b); } -// CHECK-LABEL: @test_vcltzs_f32( -// CHECK: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00 -// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCLTZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vcltzs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCLTZ_I]] +// uint32_t test_vcltzs_f32(float32_t a) { return (uint32_t)vcltzs_f32(a); } -// CHECK-LABEL: @test_vcltzd_f64( -// CHECK: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00 -// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCLTZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltzd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCLTZ_I]] +// uint64_t test_vcltzd_f64(float64_t a) { return (uint64_t)vcltzd_f64(a); } -// CHECK-LABEL: @test_vcages_f32( -// CHECK: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b) -// CHECK: ret i32 [[VCAGES_F32_I]] 
+// CHECK-LABEL: define dso_local i32 @test_vcages_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float [[A]], float [[B]]) +// CHECK-NEXT: ret i32 [[VCAGES_F32_I]] +// uint32_t test_vcages_f32(float32_t a, float32_t b) { return (uint32_t)vcages_f32(a, b); } -// CHECK-LABEL: @test_vcaged_f64( -// CHECK: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b) -// CHECK: ret i64 [[VCAGED_F64_I]] +// CHECK-LABEL: define dso_local i64 @test_vcaged_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double [[A]], double [[B]]) +// CHECK-NEXT: ret i64 [[VCAGED_F64_I]] +// uint64_t test_vcaged_f64(float64_t a, float64_t b) { return (uint64_t)vcaged_f64(a, b); } -// CHECK-LABEL: @test_vcagts_f32( -// CHECK: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b) -// CHECK: ret i32 [[VCAGTS_F32_I]] +// CHECK-LABEL: define dso_local i32 @test_vcagts_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float [[A]], float [[B]]) +// CHECK-NEXT: ret i32 [[VCAGTS_F32_I]] +// uint32_t test_vcagts_f32(float32_t a, float32_t b) { return (uint32_t)vcagts_f32(a, b); } -// CHECK-LABEL: @test_vcagtd_f64( -// CHECK: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b) -// CHECK: ret i64 [[VCAGTD_F64_I]] +// CHECK-LABEL: define dso_local i64 @test_vcagtd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double [[A]], double [[B]]) +// CHECK-NEXT: ret i64 [[VCAGTD_F64_I]] +// uint64_t test_vcagtd_f64(float64_t a, float64_t b) { return (uint64_t)vcagtd_f64(a, b); } -// CHECK-LABEL: @test_vcales_f32( -// CHECK: [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %b, float %a) -// CHECK: ret i32 [[VCALES_F32_I]] +// CHECK-LABEL: define dso_local i32 @test_vcales_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float [[B]], float [[A]]) +// CHECK-NEXT: ret i32 [[VCALES_F32_I]] +// uint32_t test_vcales_f32(float32_t a, float32_t b) { return (uint32_t)vcales_f32(a, b); } -// CHECK-LABEL: @test_vcaled_f64( -// CHECK: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a) -// CHECK: ret i64 [[VCALED_F64_I]] +// CHECK-LABEL: define dso_local i64 @test_vcaled_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double [[B]], double [[A]]) +// CHECK-NEXT: ret i64 [[VCALED_F64_I]] +// uint64_t test_vcaled_f64(float64_t a, float64_t b) { return (uint64_t)vcaled_f64(a, b); } -// CHECK-LABEL: @test_vcalts_f32( -// CHECK: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a) -// CHECK: ret i32 [[VCALTS_F32_I]] +// CHECK-LABEL: define dso_local i32 @test_vcalts_f32( +// CHECK-SAME: float noundef 
[[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float [[B]], float [[A]]) +// CHECK-NEXT: ret i32 [[VCALTS_F32_I]] +// uint32_t test_vcalts_f32(float32_t a, float32_t b) { return (uint32_t)vcalts_f32(a, b); } -// CHECK-LABEL: @test_vcaltd_f64( -// CHECK: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a) -// CHECK: ret i64 [[VCALTD_F64_I]] +// CHECK-LABEL: define dso_local i64 @test_vcaltd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double [[B]], double [[A]]) +// CHECK-NEXT: ret i64 [[VCALTD_F64_I]] +// uint64_t test_vcaltd_f64(float64_t a, float64_t b) { return (uint64_t)vcaltd_f64(a, b); } -// CHECK-LABEL: @test_vshrd_n_s64( -// CHECK: [[SHRD_N:%.*]] = ashr i64 %a, 1 -// CHECK: ret i64 [[SHRD_N]] +// CHECK-LABEL: define dso_local i64 @test_vshrd_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHRD_N:%.*]] = ashr i64 [[A]], 1 +// CHECK-NEXT: ret i64 [[SHRD_N]] +// int64_t test_vshrd_n_s64(int64_t a) { return (int64_t)vshrd_n_s64(a, 1); } -// CHECK-LABEL: @test_vshr_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1) -// CHECK: ret <1 x i64> [[VSHR_N]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vshr_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1) +// CHECK-NEXT: ret <1 x i64> [[VSHR_N]] +// int64x1_t test_vshr_n_s64(int64x1_t a) { return vshr_n_s64(a, 1); } -// CHECK-LABEL: @test_vshrd_n_u64( -// CHECK: ret i64 0 +// CHECK-LABEL: define dso_local i64 @test_vshrd_n_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i64 0 +// uint64_t test_vshrd_n_u64(uint64_t a) { return (uint64_t)vshrd_n_u64(a, 64); } -// CHECK-LABEL: @test_vshrd_n_u64_2( -// CHECK: ret i64 0 +// CHECK-LABEL: define dso_local i64 @test_vshrd_n_u64_2( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i64 0 +// uint64_t test_vshrd_n_u64_2() { uint64_t a = UINT64_C(0xf000000000000000); return vshrd_n_u64(a, 64); } -// CHECK-LABEL: @test_vshr_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1) -// CHECK: ret <1 x i64> [[VSHR_N]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vshr_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1) +// CHECK-NEXT: ret <1 x i64> [[VSHR_N]] +// uint64x1_t test_vshr_n_u64(uint64x1_t a) { return vshr_n_u64(a, 1); } -// CHECK-LABEL: @test_vrshrd_n_s64( -// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63) -// CHECK: ret i64 [[VRSHR_N]] 
+// CHECK-LABEL: define dso_local i64 @test_vrshrd_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[A]], i64 -63) +// CHECK-NEXT: ret i64 [[VRSHR_N]] +// int64_t test_vrshrd_n_s64(int64_t a) { return (int64_t)vrshrd_n_s64(a, 63); } -// CHECK-LABEL: @test_vrshr_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) -// CHECK: ret <1 x i64> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vrshr_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]] +// int64x1_t test_vrshr_n_s64(int64x1_t a) { return vrshr_n_s64(a, 1); } -// CHECK-LABEL: @test_vrshrd_n_u64( -// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63) -// CHECK: ret i64 [[VRSHR_N]] +// CHECK-LABEL: define dso_local i64 @test_vrshrd_n_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[A]], i64 -63) +// CHECK-NEXT: ret i64 [[VRSHR_N]] +// uint64_t test_vrshrd_n_u64(uint64_t a) { return (uint64_t)vrshrd_n_u64(a, 63); } -// CHECK-LABEL: @test_vrshr_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) -// CHECK: ret <1 x i64> [[VRSHR_N1]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vrshr_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]] +// uint64x1_t test_vrshr_n_u64(uint64x1_t a) { return vrshr_n_u64(a, 1); } -// CHECK-LABEL: @test_vsrad_n_s64( -// CHECK: [[SHRD_N:%.*]] = ashr i64 %b, 63 -// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]] -// CHECK: ret i64 [[TMP0]] +// CHECK-LABEL: define dso_local i64 @test_vsrad_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHRD_N:%.*]] = ashr i64 [[B]], 63 +// CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A]], [[SHRD_N]] +// CHECK-NEXT: ret i64 [[TMP0]] +// int64_t test_vsrad_n_s64(int64_t a, int64_t b) { return (int64_t)vsrad_n_s64(a, b, 63); } -// CHECK-LABEL: @test_vsra_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1) -// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] -// 
CHECK: ret <1 x i64> [[TMP4]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsra_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <1 x i64> [[TMP4]] +// int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) { return vsra_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vsrad_n_u64( -// CHECK: [[SHRD_N:%.*]] = lshr i64 %b, 63 -// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]] -// CHECK: ret i64 [[TMP0]] +// CHECK-LABEL: define dso_local i64 @test_vsrad_n_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHRD_N:%.*]] = lshr i64 [[B]], 63 +// CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A]], [[SHRD_N]] +// CHECK-NEXT: ret i64 [[TMP0]] +// uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) { return (uint64_t)vsrad_n_u64(a, b, 63); } -// CHECK-LABEL: @test_vsrad_n_u64_2( -// CHECK: ret i64 %a +// CHECK-LABEL: define dso_local i64 @test_vsrad_n_u64_2( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret i64 [[A]] +// uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) { return (uint64_t)vsrad_n_u64(a, b, 64); } -// CHECK-LABEL: @test_vsra_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1) -// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] -// CHECK: ret <1 x i64> [[TMP4]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsra_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <1 x i64> [[TMP4]] +// uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) { return vsra_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vrsrad_n_s64( -// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63) -// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]] -// CHECK: ret i64 [[TMP1]] +// CHECK-LABEL: define dso_local i64 @test_vrsrad_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[B]], i64 -63) +// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A]], [[TMP0]] +// CHECK-NEXT: ret i64 [[TMP1]] +// int64_t test_vrsrad_n_s64(int64_t a, int64_t b) { return (int64_t)vrsrad_n_s64(a, b, 63); } -// CHECK-LABEL: @test_vrsra_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// 
CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <1 x i64> [[TMP3]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vrsra_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <1 x i64> [[TMP3]] +// int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) { return vrsra_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vrsrad_n_u64( -// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63) -// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]] -// CHECK: ret i64 [[TMP1]] +// CHECK-LABEL: define dso_local i64 @test_vrsrad_n_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[B]], i64 -63) +// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A]], [[TMP0]] +// CHECK-NEXT: ret i64 [[TMP1]] +// uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) { return (uint64_t)vrsrad_n_u64(a, b, 63); } -// CHECK-LABEL: @test_vrsra_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]] -// CHECK: ret <1 x i64> [[TMP3]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vrsra_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]] +// CHECK-NEXT: ret <1 x i64> [[TMP3]] +// uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) { return vrsra_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vshld_n_s64( -// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 1 -// CHECK: ret i64 [[SHLD_N]] +// CHECK-LABEL: define dso_local i64 @test_vshld_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 1 +// CHECK-NEXT: ret i64 [[SHLD_N]] +// int64_t test_vshld_n_s64(int64_t a) { return (int64_t)vshld_n_s64(a, 1); } -// CHECK-LABEL: 
@test_vshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
 int64x1_t test_vshl_n_s64(int64x1_t a) { return vshl_n_s64(a, 1); }
-// CHECK-LABEL: @test_vshld_n_u64(
-// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 63
-// CHECK: ret i64 [[SHLD_N]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 63
+// CHECK-NEXT: ret i64 [[SHLD_N]]
+//
 uint64_t test_vshld_n_u64(uint64_t a) { return (uint64_t)vshld_n_u64(a, 63); }
-// CHECK-LABEL: @test_vshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
 uint64x1_t test_vshl_n_u64(uint64x1_t a) { return vshl_n_u64(a, 1); }
-// CHECK-LABEL: @test_vqshlb_n_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> splat (i8 7))
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_n_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> splat (i8 7))
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshlb_n_s8(int8_t a) { return (int8_t)vqshlb_n_s8(a, 7); }
-// CHECK-LABEL: @test_vqshlh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> splat (i16 15))
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> splat (i16 15))
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshlh_n_s16(int16_t a) { return (int16_t)vqshlh_n_s16(a, 15); }
-//
CHECK-LABEL: @test_vqshls_n_s32( -// CHECK: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31) -// CHECK: ret i32 [[VQSHLS_N_S32]] +// CHECK-LABEL: define dso_local i32 @test_vqshls_n_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 [[A]], i32 31) +// CHECK-NEXT: ret i32 [[VQSHLS_N_S32]] +// int32_t test_vqshls_n_s32(int32_t a) { return (int32_t)vqshls_n_s32(a, 31); } -// CHECK-LABEL: @test_vqshld_n_s64( -// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63) -// CHECK: ret i64 [[VQSHL_N]] +// CHECK-LABEL: define dso_local i64 @test_vqshld_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[A]], i64 63) +// CHECK-NEXT: ret i64 [[VQSHL_N]] +// int64_t test_vqshld_n_s64(int64_t a) { return (int64_t)vqshld_n_s64(a, 63); } -// CHECK-LABEL: @test_vqshl_n_s8( -// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) -// CHECK: ret <8 x i8> [[VQSHL_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[A]], <8 x i8> zeroinitializer) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]] +// int8x8_t test_vqshl_n_s8(int8x8_t a) { return vqshl_n_s8(a, 0); } -// CHECK-LABEL: @test_vqshlq_n_s8( -// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) -// CHECK: ret <16 x i8> [[VQSHL_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[A]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]] +// int8x16_t test_vqshlq_n_s8(int8x16_t a) { return vqshlq_n_s8(a, 0); } -// CHECK-LABEL: @test_vqshl_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer) -// CHECK: ret <4 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer) +// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]] +// int16x4_t test_vqshl_n_s16(int16x4_t a) { return vqshl_n_s16(a, 0); } -// CHECK-LABEL: @test_vqshlq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer) -// CHECK: ret <8 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> 
[[A]] to <16 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]] +// int16x8_t test_vqshlq_n_s16(int16x8_t a) { return vqshlq_n_s16(a, 0); } -// CHECK-LABEL: @test_vqshl_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer) -// CHECK: ret <2 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer) +// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]] +// int32x2_t test_vqshl_n_s32(int32x2_t a) { return vqshl_n_s32(a, 0); } -// CHECK-LABEL: @test_vqshlq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer) -// CHECK: ret <4 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]] +// int32x4_t test_vqshlq_n_s32(int32x4_t a) { return vqshlq_n_s32(a, 0); } -// CHECK-LABEL: @test_vqshlq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer) -// CHECK: ret <2 x i64> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer) +// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]] +// int64x2_t test_vqshlq_n_s64(int64x2_t a) { return vqshlq_n_s64(a, 0); } -// CHECK-LABEL: @test_vqshl_n_u8( -// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer) -// CHECK: ret <8 x i8> [[VQSHL_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[A]], <8 x i8> zeroinitializer) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]] +// uint8x8_t test_vqshl_n_u8(uint8x8_t a) { return vqshl_n_u8(a, 0); } -// CHECK-LABEL: 
@test_vqshlq_n_u8( -// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer) -// CHECK: ret <16 x i8> [[VQSHL_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[A]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]] +// uint8x16_t test_vqshlq_n_u8(uint8x16_t a) { return vqshlq_n_u8(a, 0); } -// CHECK-LABEL: @test_vqshl_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer) -// CHECK: ret <4 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer) +// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]] +// uint16x4_t test_vqshl_n_u16(uint16x4_t a) { return vqshl_n_u16(a, 0); } -// CHECK-LABEL: @test_vqshlq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer) -// CHECK: ret <8 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]] +// uint16x8_t test_vqshlq_n_u16(uint16x8_t a) { return vqshlq_n_u16(a, 0); } -// CHECK-LABEL: @test_vqshl_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer) -// CHECK: ret <2 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer) +// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]] +// uint32x2_t test_vqshl_n_u32(uint32x2_t a) { return vqshl_n_u32(a, 0); } -// CHECK-LABEL: @test_vqshlq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer) -// CHECK: ret <4 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define dso_local <4 x i32> 
@test_vqshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) { return vqshlq_n_u32(a, 0); }
-// CHECK-LABEL: @test_vqshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) { return vqshlq_n_u64(a, 0); }
-// CHECK-LABEL: @test_vqshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
 int64x1_t test_vqshl_n_s64(int64x1_t a) { return vqshl_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqshlb_n_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> splat (i8 7))
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_n_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> splat (i8 7))
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqshlb_n_u8(uint8_t a) { return (uint8_t)vqshlb_n_u8(a, 7); }
-// CHECK-LABEL: @test_vqshlh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> splat (i16 15))
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> splat (i16 15))
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqshlh_n_u16(uint16_t a) { return (uint16_t)vqshlh_n_u16(a, 15); }
-// CHECK-LABEL: @test_vqshls_n_u32(
-// CHECK: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLS_N_U32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLS_N_U32]]
+//
 uint32_t test_vqshls_n_u32(uint32_t a) { return (uint32_t)vqshls_n_u32(a, 31); }
-// CHECK-LABEL: @test_vqshld_n_u64(
-// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHL_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHL_N]]
+//
 uint64_t test_vqshld_n_u64(uint64_t a) { return (uint64_t)vqshld_n_u64(a, 63); }
-// CHECK-LABEL: @test_vqshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) { return vqshl_n_u64(a, 1); }
-// CHECK-LABEL: @test_vqshlub_n_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> splat (i8 7))
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlub_n_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> splat (i8 7))
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshlub_n_s8(int8_t a) { return (int8_t)vqshlub_n_s8(a, 7); }
-// CHECK-LABEL: @test_vqshluh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> splat (i16 15))
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshluh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> splat (i16 15))
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshluh_n_s16(int16_t a) { return (int16_t)vqshluh_n_s16(a, 15); }
-// CHECK-LABEL: @test_vqshlus_n_s32(
-// CHECK: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLUS_N_S32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshlus_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLUS_N_S32]]
+//
 int32_t test_vqshlus_n_s32(int32_t a) { return (int32_t)vqshlus_n_s32(a, 31); }
-// CHECK-LABEL: @test_vqshlud_n_s64(
-// CHECK: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHLU_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshlud_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHLU_N]]
+//
 int64_t test_vqshlud_n_s64(int64_t a) { return (int64_t)vqshlud_n_s64(a, 63); }
-// CHECK-LABEL: @test_vqshlu_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshlu_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHLU_N1]]
+//
 uint64x1_t test_vqshlu_n_s64(int64x1_t a) { return vqshlu_n_s64(a, 1); }
-// CHECK-LABEL: @test_vsrid_n_s64(
-// CHECK: [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
-// CHECK: [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
-// CHECK: ret i64 [[VSRID_N_S643]]
+// CHECK-LABEL: define dso_local i64 @test_vsrid_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRID_N_S64:%.*]] = bitcast i64 [[A]] to <1 x i64>
+// CHECK-NEXT: [[VSRID_N_S641:%.*]] = bitcast i64 [[B]] to <1 x i64>
+// CHECK-NEXT: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
+// CHECK-NEXT: [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
+// CHECK-NEXT: ret i64 [[VSRID_N_S643]]
+//
 int64_t test_vsrid_n_s64(int64_t a, int64_t b) { return (int64_t)vsrid_n_s64(a, b, 63); }
-// CHECK-LABEL: @test_vsri_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64>
%b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) -// CHECK: ret <1 x i64> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) +// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]] +// int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { return vsri_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vsrid_n_u64( -// CHECK: [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64> -// CHECK: [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64> -// CHECK: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63) -// CHECK: [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U642]] to i64 -// CHECK: ret i64 [[VSRID_N_U643]] +// CHECK-LABEL: define dso_local i64 @test_vsrid_n_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRID_N_U64:%.*]] = bitcast i64 [[A]] to <1 x i64> +// CHECK-NEXT: [[VSRID_N_U641:%.*]] = bitcast i64 [[B]] to <1 x i64> +// CHECK-NEXT: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63) +// CHECK-NEXT: [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U642]] to i64 +// CHECK-NEXT: ret i64 [[VSRID_N_U643]] +// uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) { return (uint64_t)vsrid_n_u64(a, b, 63); } -// CHECK-LABEL: @test_vsri_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) -// CHECK: ret <1 x i64> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1) +// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]] +// uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { return vsri_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vslid_n_s64( -// CHECK: [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64> -// CHECK: [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64> -// CHECK: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 
-// CHECK: [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S642]] to i64
-// CHECK: ret i64 [[VSLID_N_S643]]
+// CHECK-LABEL: define dso_local i64 @test_vslid_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLID_N_S64:%.*]] = bitcast i64 [[A]] to <1 x i64>
+// CHECK-NEXT: [[VSLID_N_S641:%.*]] = bitcast i64 [[B]] to <1 x i64>
+// CHECK-NEXT: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
+// CHECK-NEXT: [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S642]] to i64
+// CHECK-NEXT: ret i64 [[VSLID_N_S643]]
+//
 int64_t test_vslid_n_s64(int64_t a, int64_t b) { return (int64_t)vslid_n_s64(a, b, 63); }
-// CHECK-LABEL: @test_vsli_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { return vsli_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vslid_n_u64(
-// CHECK: [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
-// CHECK: [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U642]] to i64
-// CHECK: ret i64 [[VSLID_N_U643]]
+// CHECK-LABEL: define dso_local i64 @test_vslid_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLID_N_U64:%.*]] = bitcast i64 [[A]] to <1 x i64>
+// CHECK-NEXT: [[VSLID_N_U641:%.*]] = bitcast i64 [[B]] to <1 x i64>
+// CHECK-NEXT: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
+// CHECK-NEXT: [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U642]] to i64
+// CHECK-NEXT: ret i64 [[VSLID_N_U643]]
+//
 uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) { return (uint64_t)vslid_n_u64(a, b, 63); }
-// CHECK-LABEL: @test_vsli_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { return vsli_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vqshrnh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrnh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshrnh_n_s16(int16_t a) { return (int8_t)vqshrnh_n_s16(a, 8); }
-// CHECK-LABEL: @test_vqshrns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshrns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshrns_n_s32(int32_t a) { return (int16_t)vqshrns_n_s32(a, 16); }
-// CHECK-LABEL: @test_vqshrnd_n_s64(
-// CHECK: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrnd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRND_N_S64]]
+//
 int32_t test_vqshrnd_n_s64(int64_t a) { return (int32_t)vqshrnd_n_s64(a, 32); }
-// CHECK-LABEL: @test_vqshrnh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrnh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqshrnh_n_u16(uint16_t a) { return (uint8_t)vqshrnh_n_u16(a, 8); }
-// CHECK-LABEL: @test_vqshrns_n_u32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshrns_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqshrns_n_u32(uint32_t a) { return (uint16_t)vqshrns_n_u32(a, 16); }
-// CHECK-LABEL: @test_vqshrnd_n_u64(
-// CHECK: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRND_N_U64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrnd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRND_N_U64]]
+//
 uint32_t test_vqshrnd_n_u64(uint64_t a) { return (uint32_t)vqshrnd_n_u64(a, 32); }
-// CHECK-LABEL: @test_vqrshrnh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrnh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqrshrnh_n_s16(int16_t a) { return (int8_t)vqrshrnh_n_s16(a, 8); }
-// CHECK-LABEL: @test_vqrshrns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshrns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqrshrns_n_s32(int32_t a) { return (int16_t)vqrshrns_n_s32(a, 16); }
-// CHECK-LABEL: @test_vqrshrnd_n_s64(
-// CHECK: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRND_N_S64]]
+//
 int32_t test_vqrshrnd_n_s64(int64_t a) { return (int32_t)vqrshrnd_n_s64(a, 32); }
-// CHECK-LABEL: @test_vqrshrnh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrnh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqrshrnh_n_u16(uint16_t a) { return (uint8_t)vqrshrnh_n_u16(a, 8); }
-// CHECK-LABEL: @test_vqrshrns_n_u32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshrns_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqrshrns_n_u32(uint32_t a) { return (uint16_t)vqrshrns_n_u32(a, 16); }
-// CHECK-LABEL: @test_vqrshrnd_n_u64(
-// CHECK: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRND_N_U64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRND_N_U64]]
+//
 uint32_t test_vqrshrnd_n_u64(uint64_t a) { return (uint32_t)vqrshrnd_n_u64(a, 32); }
-// CHECK-LABEL: @test_vqshrunh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrunh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshrunh_n_s16(int16_t a) { return (int8_t)vqshrunh_n_s16(a, 8); }
-// CHECK-LABEL: @test_vqshruns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshruns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshruns_n_s32(int32_t a) { return (int16_t)vqshruns_n_s32(a, 16); }
-// CHECK-LABEL: @test_vqshrund_n_s64(
-// CHECK: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRUND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrund_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRUND_N_S64]]
+//
 int32_t test_vqshrund_n_s64(int64_t a) { return (int32_t)vqshrund_n_s64(a, 32); }
-// CHECK-LABEL: @test_vqrshrunh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrunh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqrshrunh_n_s16(int16_t a) { return (uint8_t)vqrshrunh_n_s16(a, 8); }
-// CHECK-LABEL: @test_vqrshruns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshruns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqrshruns_n_s32(int32_t a) { return (uint16_t)vqrshruns_n_s32(a, 16); }
-// CHECK-LABEL: @test_vqrshrund_n_s64(
-// CHECK: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRUND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrund_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRUND_N_S64]]
+//
 uint32_t test_vqrshrund_n_s64(int64_t a) { return (uint32_t)vqrshrund_n_s64(a, 32); }
-// CHECK-LABEL: @test_vcvts_n_f32_s32(
-// CHECK: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1)
-// CHECK: ret float [[VCVTS_N_F32_S32]]
+// CHECK-LABEL: define dso_local float @test_vcvts_n_f32_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 [[A]], i32 1)
+// CHECK-NEXT: ret float [[VCVTS_N_F32_S32]]
+//
 float32_t test_vcvts_n_f32_s32(int32_t a) { return vcvts_n_f32_s32(a, 1); }
-// CHECK-LABEL: @test_vcvtd_n_f64_s64(
-// CHECK: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1)
-// CHECK: ret double [[VCVTD_N_F64_S64]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_n_f64_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 [[A]], i32 1)
+// CHECK-NEXT: ret double [[VCVTD_N_F64_S64]]
+//
 float64_t test_vcvtd_n_f64_s64(int64_t a) { return vcvtd_n_f64_s64(a, 1); }
-// CHECK-LABEL: @test_vcvts_n_f32_u32(
-// CHECK: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32)
-// CHECK: ret float [[VCVTS_N_F32_U32]]
+// CHECK-LABEL: define dso_local float @test_vcvts_n_f32_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 [[A]], i32 32)
+// CHECK-NEXT: ret float [[VCVTS_N_F32_U32]]
+//
 float32_t test_vcvts_n_f32_u32(uint32_t a) { return vcvts_n_f32_u32(a, 32); }
-// CHECK-LABEL: @test_vcvtd_n_f64_u64(
-// CHECK: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64)
-// CHECK: ret double [[VCVTD_N_F64_U64]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_n_f64_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 [[A]], i32 64)
+// CHECK-NEXT: ret double [[VCVTD_N_F64_U64]]
+//
 float64_t test_vcvtd_n_f64_u64(uint64_t a) { return vcvtd_n_f64_u64(a, 64); }
-// CHECK-LABEL: @test_vcvts_n_s32_f32(
-// CHECK: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1)
-// CHECK: ret i32 [[VCVTS_N_S32_F32]]
+// CHECK-LABEL: define dso_local i32 @test_vcvts_n_s32_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float [[A]], i32 1)
+// CHECK-NEXT: ret i32 [[VCVTS_N_S32_F32]]
+//
 int32_t test_vcvts_n_s32_f32(float32_t a) { return (int32_t)vcvts_n_s32_f32(a, 1); }
-// CHECK-LABEL: @test_vcvtd_n_s64_f64(
-// CHECK: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1)
-// CHECK: ret i64 [[VCVTD_N_S64_F64]]
+// CHECK-LABEL: define dso_local i64 @test_vcvtd_n_s64_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double [[A]], i32 1)
+// CHECK-NEXT: ret i64 [[VCVTD_N_S64_F64]]
+//
 int64_t test_vcvtd_n_s64_f64(float64_t a) { return (int64_t)vcvtd_n_s64_f64(a, 1); }
-// CHECK-LABEL: @test_vcvts_n_u32_f32(
-// CHECK: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32)
-// CHECK: ret i32 [[VCVTS_N_U32_F32]]
+// CHECK-LABEL: define dso_local i32 @test_vcvts_n_u32_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VCVTS_N_U32_F32]]
+//
 uint32_t test_vcvts_n_u32_f32(float32_t a) { return (uint32_t)vcvts_n_u32_f32(a, 32); }
-// CHECK-LABEL: @test_vcvtd_n_u64_f64(
-// CHECK: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64)
-// CHECK: ret i64 [[VCVTD_N_U64_F64]]
+// CHECK-LABEL: define dso_local i64 @test_vcvtd_n_u64_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double [[A]], i32 64)
+// CHECK-NEXT: ret i64 [[VCVTD_N_U64_F64]]
+//
 uint64_t test_vcvtd_n_u64_f64(float64_t a) { return (uint64_t)vcvtd_n_u64_f64(a, 64); }
-// CHECK-LABEL: @test_vreinterpret_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 int8x8_t test_vreinterpret_s8_f64(float64x1_t a) { return vreinterpret_s8_f64(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) { return vreinterpret_s8_p64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vreinterpret_s16_f64(float64x1_t a) { return vreinterpret_s16_f64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) { return vreinterpret_s16_p64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vreinterpret_s32_f64(float64x1_t a) { return vreinterpret_s32_f64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) { return vreinterpret_s32_p64(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_f64(float64x1_t a) { return vreinterpret_s64_f64(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_p64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) { return vreinterpret_s64_p64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) { return vreinterpret_u8_f64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) { return vreinterpret_u8_p64(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) { return vreinterpret_u16_f64(a); } -// CHECK-LABEL: @test_vreinterpret_u16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); } -// CHECK-LABEL: @test_vreinterpret_u16_p16( -// CHECK: ret <4 x i16> %a +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> [[A]] +// uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); } -// CHECK-LABEL: @test_vreinterpret_u16_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) { return vreinterpret_u16_p64(a); } -// CHECK-LABEL: @test_vreinterpret_u32_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); } -// CHECK-LABEL: @test_vreinterpret_u32_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); } -// CHECK-LABEL: @test_vreinterpret_u32_s32( -// CHECK: ret <2 x i32> %a +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i32> [[A]] +// uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); } -// CHECK-LABEL: @test_vreinterpret_u32_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// 
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); } -// CHECK-LABEL: @test_vreinterpret_u32_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); } -// CHECK-LABEL: @test_vreinterpret_u32_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); } -// CHECK-LABEL: @test_vreinterpret_u32_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); } -// CHECK-LABEL: @test_vreinterpret_u32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); } -// CHECK-LABEL: @test_vreinterpret_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); } -// CHECK-LABEL: @test_vreinterpret_u32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) { return vreinterpret_u32_f64(a); } -// CHECK-LABEL: @test_vreinterpret_u32_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); } -// CHECK-LABEL: @test_vreinterpret_u32_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); } -// CHECK-LABEL: @test_vreinterpret_u32_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) { return vreinterpret_u32_p64(a); } -// CHECK-LABEL: @test_vreinterpret_u64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); } -// CHECK-LABEL: @test_vreinterpret_u64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); } -// CHECK-LABEL: @test_vreinterpret_u64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); } -// CHECK-LABEL: @test_vreinterpret_u64_s64( -// CHECK: ret <1 x i64> %a +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); } -// CHECK-LABEL: @test_vreinterpret_u64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// 
CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); } -// CHECK-LABEL: @test_vreinterpret_u64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); } -// CHECK-LABEL: @test_vreinterpret_u64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); } -// CHECK-LABEL: @test_vreinterpret_u64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); } -// CHECK-LABEL: @test_vreinterpret_u64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); } -// CHECK-LABEL: @test_vreinterpret_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) { return vreinterpret_u64_f64(a); } -// CHECK-LABEL: @test_vreinterpret_u64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); } -// CHECK-LABEL: @test_vreinterpret_u64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); } -// CHECK-LABEL: @test_vreinterpret_u64_p64( -// CHECK: ret <1 x i64> %a +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) { return vreinterpret_u64_p64(a); } -// CHECK-LABEL: @test_vreinterpret_f16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP1]] +// float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); } -// CHECK-LABEL: @test_vreinterpret_f16_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); } -// CHECK-LABEL: @test_vreinterpret_f16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); } -// CHECK-LABEL: @test_vreinterpret_f16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s64( +// CHECK-SAME: <1 x i64> noundef 
[[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); } -// CHECK-LABEL: @test_vreinterpret_f16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP1]] +// float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); } -// CHECK-LABEL: @test_vreinterpret_f16_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); } -// CHECK-LABEL: @test_vreinterpret_f16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); } -// CHECK-LABEL: @test_vreinterpret_f16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); } -// CHECK-LABEL: @test_vreinterpret_f16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP1]] +// float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); } -// CHECK-LABEL: @test_vreinterpret_f16_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP1]] +// float16x4_t 
test_vreinterpret_f16_f64(float64x1_t a) { return vreinterpret_f16_f64(a); } -// CHECK-LABEL: @test_vreinterpret_f16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP1]] +// float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); } -// CHECK-LABEL: @test_vreinterpret_f16_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); } -// CHECK-LABEL: @test_vreinterpret_f16_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) { return vreinterpret_f16_p64(a); } -// CHECK-LABEL: @test_vreinterpret_f32_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); } -// CHECK-LABEL: @test_vreinterpret_f32_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); } -// CHECK-LABEL: @test_vreinterpret_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); } -// CHECK-LABEL: @test_vreinterpret_f32_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s64( +// CHECK-SAME: <1 x i64> noundef 
[[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); } -// CHECK-LABEL: @test_vreinterpret_f32_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); } -// CHECK-LABEL: @test_vreinterpret_f32_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); } -// CHECK-LABEL: @test_vreinterpret_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); } -// CHECK-LABEL: @test_vreinterpret_f32_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); } -// CHECK-LABEL: @test_vreinterpret_f32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); } -// CHECK-LABEL: @test_vreinterpret_f32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> 
[[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_f64(float64x1_t a) { return vreinterpret_f32_f64(a); } -// CHECK-LABEL: @test_vreinterpret_f32_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); } -// CHECK-LABEL: @test_vreinterpret_f32_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP1]] +// float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); } -// CHECK-LABEL: @test_vreinterpret_f32_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) { return vreinterpret_f32_p64(a); } -// CHECK-LABEL: @test_vreinterpret_f64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_s8(int8x8_t a) { return vreinterpret_f64_s8(a); } -// CHECK-LABEL: @test_vreinterpret_f64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_s16(int16x4_t a) { return vreinterpret_f64_s16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_s32(int32x2_t a) { return vreinterpret_f64_s32(a); } -// CHECK-LABEL: @test_vreinterpret_f64_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_s64(int64x1_t a) { return vreinterpret_f64_s64(a); } -// CHECK-LABEL: @test_vreinterpret_f64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) { return vreinterpret_f64_u8(a); } -// CHECK-LABEL: @test_vreinterpret_f64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) { return vreinterpret_f64_u16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) { return vreinterpret_f64_u32(a); } -// CHECK-LABEL: @test_vreinterpret_f64_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) { return 
vreinterpret_f64_u64(a); } -// CHECK-LABEL: @test_vreinterpret_f64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_f16(float16x4_t a) { return vreinterpret_f64_f16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_f32(float32x2_t a) { return vreinterpret_f64_f32(a); } -// CHECK-LABEL: @test_vreinterpret_f64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) { return vreinterpret_f64_p8(a); } -// CHECK-LABEL: @test_vreinterpret_f64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) { return vreinterpret_f64_p16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) { return vreinterpret_f64_p64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s8( -// CHECK: ret <8 x i8> %a +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s8( +// CHECK-SAME: 
<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[A]] +// poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u8( -// CHECK: ret <8 x i8> %a +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[A]] +// poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x 
i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] +// poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] +// poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); } -// CHECK-LABEL: @test_vreinterpret_p8_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP1]] +// poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) { return vreinterpret_p8_f64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) { return vreinterpret_p8_p64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); } -// CHECK-LABEL: @test_vreinterpret_p16_s16( -// CHECK: ret <4 x i16> %a +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> [[A]] +// poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); } -// 
CHECK-LABEL: @test_vreinterpret_p16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); } -// CHECK-LABEL: @test_vreinterpret_p16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u16( -// CHECK: ret <4 x i16> %a +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> [[A]] +// poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { return vreinterpret_p16_u32(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); } -// CHECK-LABEL: @test_vreinterpret_p16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f32( +// 
CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); } -// CHECK-LABEL: @test_vreinterpret_p16_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) { return vreinterpret_p16_f64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); } -// CHECK-LABEL: @test_vreinterpret_p16_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) { return vreinterpret_p16_p64(a); } -// CHECK-LABEL: @test_vreinterpret_p64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) { return vreinterpret_p64_s8(a); } -// CHECK-LABEL: @test_vreinterpret_p64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] +// poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) { return vreinterpret_p64_s16(a); } -// CHECK-LABEL: @test_vreinterpret_p64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) 
#[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) { return vreinterpret_p64_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_s64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) { return vreinterpret_p64_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) { return vreinterpret_p64_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) { return vreinterpret_p64_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) { return vreinterpret_p64_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_u64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) { return vreinterpret_p64_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) { return vreinterpret_p64_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) { return vreinterpret_p64_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) { return vreinterpret_p64_f64(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) { return vreinterpret_p64_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_p64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) { return vreinterpret_p64_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) { return vreinterpretq_s8_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) { return vreinterpretq_s8_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) { return vreinterpretq_s16_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) { return vreinterpretq_s16_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) { return vreinterpretq_s32_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) { return vreinterpretq_s32_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_u64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) { return vreinterpretq_s64_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_p64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) { return vreinterpretq_s64_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_s8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) { return vreinterpretq_u8_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) { return vreinterpretq_u8_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) { return vreinterpretq_u16_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) { return vreinterpretq_u16_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) { return vreinterpretq_u32_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) { return vreinterpretq_u32_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) { return vreinterpretq_u64_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_p64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) { return vreinterpretq_u64_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP1]]
+//
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP1]]
+//
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP1]]
+//
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP1]]
+//
 float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) { return vreinterpretq_f16_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP1]]
+//
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) { return vreinterpretq_f16_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) { return vreinterpretq_f32_f64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP1]]
+//
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) { return vreinterpretq_f32_p64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP1]]
+//
 float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) { return vreinterpretq_f64_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP1]]
+//
 float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) { return vreinterpretq_f64_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP1]]
+//
 float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) { return vreinterpretq_f64_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) { return vreinterpretq_f64_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP1]]
+//
 float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) { return vreinterpretq_f64_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x
double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) { return vreinterpretq_f64_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double> -// CHECK: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) { return vreinterpretq_f64_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> -// CHECK: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP0]] +// float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) { return vreinterpretq_f64_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double> -// CHECK: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) { return vreinterpretq_f64_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double> -// CHECK: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) { return vreinterpretq_f64_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double> -// CHECK: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) { return vreinterpretq_f64_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double> -// CHECK: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// float64x2_t 
test_vreinterpretq_f64_p16(poly16x8_t a) { return vreinterpretq_f64_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double> -// CHECK: ret <2 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP0]] +// float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) { return vreinterpretq_f64_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_s8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) { return vreinterpretq_p8_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) { return vreinterpretq_p8_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 
x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t 
test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) { return vreinterpretq_p16_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) { return vreinterpretq_p16_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) { return vreinterpretq_p64_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) { return vreinterpretq_p64_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) { return vreinterpretq_p64_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_s64( -// CHECK: ret <2 x i64> %a +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i64> [[A]] +// poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) { return vreinterpretq_p64_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) { return vreinterpretq_p64_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) { return vreinterpretq_p64_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) { return vreinterpretq_p64_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_u64( -// CHECK: ret <2 x i64> %a +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i64> [[A]] +// poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) { return vreinterpretq_p64_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) { return vreinterpretq_p64_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> -// CHECK: ret <2 x i64> 
[[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) { return vreinterpretq_p64_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) { return vreinterpretq_p64_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) { return vreinterpretq_p64_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) { return vreinterpretq_p64_p16(a); } -// CHECK-LABEL: @test_vabds_f32( -// CHECK: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) -// CHECK: ret float [[VABDS_F32_I]] +// CHECK-LABEL: define dso_local float @test_vabds_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float [[A]], float [[B]]) +// CHECK-NEXT: ret float [[VABDS_F32_I]] +// float32_t test_vabds_f32(float32_t a, float32_t b) { return vabds_f32(a, b); } -// CHECK-LABEL: @test_vabdd_f64( -// CHECK: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) -// CHECK: ret double [[VABDD_F64_I]] +// CHECK-LABEL: define dso_local double @test_vabdd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double [[A]], double [[B]]) +// CHECK-NEXT: ret double [[VABDD_F64_I]] +// float64_t test_vabdd_f64(float64_t a, float64_t b) { return vabdd_f64(a, b); } -// CHECK-LABEL: @test_vuqaddq_s8( -// CHECK: entry: -// CHECK-NEXT: [[V:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK-NEXT: ret <16 x i8> [[V]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuqaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VUQADD_I]] +// int8x16_t 
test_vuqaddq_s8(int8x16_t a, uint8x16_t b) { return vuqaddq_s8(a, b); } -// CHECK-LABEL: @test_vuqaddq_s32( -// CHECK: [[V:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK-NEXT: ret <4 x i32> [[V]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vuqaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[VUQADD_I]], <4 x i32> [[VUQADD1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VUQADD2_I]] +// int32x4_t test_vuqaddq_s32(int32x4_t a, uint32x4_t b) { return vuqaddq_s32(a, b); } -// CHECK-LABEL: @test_vuqaddq_s64( -// CHECK: [[V:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK-NEXT: ret <2 x i64> [[V]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuqaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[VUQADD_I]], <2 x i64> [[VUQADD1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VUQADD2_I]] +// int64x2_t test_vuqaddq_s64(int64x2_t a, uint64x2_t b) { return vuqaddq_s64(a, b); } -// CHECK-LABEL: @test_vuqaddq_s16( -// CHECK: [[V:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK-NEXT: ret <8 x i16> [[V]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuqaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[VUQADD_I]], <8 x i16> [[VUQADD1_I]]) +// CHECK-NEXT: ret <8 x i16> [[VUQADD2_I]] +// int16x8_t test_vuqaddq_s16(int16x8_t a, uint16x8_t b) { return vuqaddq_s16(a, b); } -// CHECK-LABEL: @test_vuqadd_s8( -// CHECK: entry: -// CHECK-NEXT: [[V:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK-NEXT: ret <8 x i8> [[V]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vuqadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VUQADD_I]] +// int8x8_t test_vuqadd_s8(int8x8_t a, uint8x8_t b) { return vuqadd_s8(a, b); } -// CHECK-LABEL: @test_vuqadd_s32( -// CHECK: [[V:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK-NEXT: ret <2 x i32> [[V]] +// 
CHECK-LABEL: define dso_local <2 x i32> @test_vuqadd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[VUQADD_I]], <2 x i32> [[VUQADD1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VUQADD2_I]] +// int32x2_t test_vuqadd_s32(int32x2_t a, uint32x2_t b) { return vuqadd_s32(a, b); } -// CHECK-LABEL: @test_vuqadd_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: ret <1 x i64> [[VUQADD2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vuqadd_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[VUQADD_I]], <1 x i64> [[VUQADD1_I]]) +// CHECK-NEXT: ret <1 x i64> [[VUQADD2_I]] +// int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) { return vuqadd_s64(a, b); } -// CHECK-LABEL: @test_vuqadd_s16( -// CHECK: [[V:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK-NEXT: ret <4 x i16> [[V]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vuqadd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[VUQADD_I]], <4 x i16> [[VUQADD1_I]]) +// CHECK-NEXT: ret <4 x i16> [[VUQADD2_I]] +// int16x4_t test_vuqadd_s16(int16x4_t a, uint16x4_t b) { return vuqadd_s16(a, b); } -// CHECK-LABEL: @test_vsqadd_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: ret <1 x i64> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsqadd_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[VSQADD_I]], <1 x i64> [[VSQADD1_I]]) +// CHECK-NEXT: ret <1 x 
i64> [[VSQADD2_I]] +// uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) { return vsqadd_u64(a, b); } -// CHECK-LABEL: @test_vsqadd_u8( -// CHECK: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VSQADD_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vsqadd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VSQADD_I]] +// uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) { return vsqadd_u8(a, b); } -// CHECK-LABEL: @test_vsqaddq_u8( -// CHECK: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VSQADD_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsqaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VSQADD_I]] +// uint8x16_t test_vsqaddq_u8(uint8x16_t a, int8x16_t b) { return vsqaddq_u8(a, b); } -// CHECK-LABEL: @test_vsqadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsqadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[VSQADD_I]], <4 x i16> [[VSQADD1_I]]) +// CHECK-NEXT: ret <4 x i16> [[VSQADD2_I]] +// uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) { return vsqadd_u16(a, b); } -// CHECK-LABEL: @test_vsqaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsqaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[VSQADD_I]], <8 x i16> [[VSQADD1_I]]) +// CHECK-NEXT: ret <8 x i16> [[VSQADD2_I]] +// uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) { return vsqaddq_u16(a, b); } -// CHECK-LABEL: @test_vsqadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsqadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[VSQADD_I]], <2 x i32> [[VSQADD1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VSQADD2_I]] +// uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) { return vsqadd_u32(a, b); } -// CHECK-LABEL: @test_vsqaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsqaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[VSQADD_I]], <4 x i32> [[VSQADD1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VSQADD2_I]] +// uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) { return vsqaddq_u32(a, b); } -// CHECK-LABEL: @test_vsqaddq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: ret <2 x i64> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsqaddq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[VSQADD_I]], <2 x i64> [[VSQADD1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VSQADD2_I]] +// uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) { return vsqaddq_u64(a, b); } -// CHECK-LABEL: @test_vabs_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %a) -// CHECK: ret <1 x i64> [[VABS1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vabs_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[VABS_I]]) +// CHECK-NEXT: ret <1 x i64> [[VABS1_I]] +// int64x1_t 
test_vabs_s64(int64x1_t a) { return vabs_s64(a); } -// CHECK-LABEL: @test_vqabs_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQABS_V1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqabs_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[VQABS_V_I]]) +// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqabs_s64(int64x1_t a) { return vqabs_s64(a); } -// CHECK-LABEL: @test_vqneg_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> %a) -// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQNEG_V1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqneg_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[VQNEG_V_I]]) +// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqneg_s64(int64x1_t a) { return vqneg_s64(a); } -// CHECK-LABEL: @test_vneg_s64( -// CHECK: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a -// CHECK: ret <1 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vneg_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, [[A]] +// CHECK-NEXT: ret <1 x i64> [[SUB_I]] +// int64x1_t test_vneg_s64(int64x1_t a) { return vneg_s64(a); } -// CHECK-LABEL: @test_vaddv_f32( -// CHECK: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VADDV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vaddv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VADDV_F32_I]] +// float32_t test_vaddv_f32(float32x2_t a) { return vaddv_f32(a); } -// CHECK-LABEL: @test_vaddvq_f32( -// CHECK: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a) -// CHECK: ret float [[VADDVQ_F32_I]] +// CHECK-LABEL: define dso_local float @test_vaddvq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret float [[VADDVQ_F32_I]] +// float32_t test_vaddvq_f32(float32x4_t a) { return vaddvq_f32(a); } -// CHECK-LABEL: @test_vaddvq_f64( -// CHECK: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VADDVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vaddvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VADDVQ_F64_I]] +// float64_t test_vaddvq_f64(float64x2_t a) { return vaddvq_f64(a); } -// CHECK-LABEL: @test_vmaxv_f32( -// CHECK: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VMAXV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vmaxv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VMAXV_F32_I]] +// float32_t test_vmaxv_f32(float32x2_t a) { return vmaxv_f32(a); } -// CHECK-LABEL: @test_vmaxvq_f64( -// CHECK: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VMAXVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vmaxvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VMAXVQ_F64_I]] +// float64_t test_vmaxvq_f64(float64x2_t a) { return vmaxvq_f64(a); } -// CHECK-LABEL: @test_vminv_f32( -// CHECK: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VMINV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vminv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VMINV_F32_I]] +// float32_t test_vminv_f32(float32x2_t a) { return vminv_f32(a); } -// CHECK-LABEL: @test_vminvq_f64( -// CHECK: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VMINVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vminvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VMINVQ_F64_I]] +// float64_t test_vminvq_f64(float64x2_t a) { return vminvq_f64(a); } -// CHECK-LABEL: @test_vmaxnmvq_f64( -// CHECK: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VMAXNMVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vmaxnmvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VMAXNMVQ_F64_I]] +// float64_t test_vmaxnmvq_f64(float64x2_t a) { return vmaxnmvq_f64(a); } -// CHECK-LABEL: @test_vmaxnmv_f32( -// CHECK: [[VMAXNMV_F32_I:%.*]] = call float 
@llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VMAXNMV_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmaxnmv_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VMAXNMV_F32_I]]
+//
 float32_t test_vmaxnmv_f32(float32x2_t a) {
   return vmaxnmv_f32(a);
 }

-// CHECK-LABEL: @test_vminnmvq_f64(
-// CHECK: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VMINNMVQ_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vminnmvq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VMINNMVQ_F64_I]]
+//
 float64_t test_vminnmvq_f64(float64x2_t a) {
   return vminnmvq_f64(a);
 }

-// CHECK-LABEL: @test_vminnmv_f32(
-// CHECK: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VMINNMV_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vminnmv_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VMINNMV_F32_I]]
+//
 float32_t test_vminnmv_f32(float32x2_t a) {
   return vminnmv_f32(a);
 }

-// CHECK-LABEL: @test_vpaddq_s64(
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]])
+// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) {
   return vpaddq_s64(a, b);
 }

-// CHECK-LABEL: @test_vpaddq_u64(
-// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VPADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]])
+// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vpaddq_u64(a, b);
 }

-// CHECK-LABEL: @test_vpaddd_u64(
-// CHECK: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VPADDD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vpaddd_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VPADDD_U64_I]]
+//
 uint64_t test_vpaddd_u64(uint64x2_t a) {
   return vpaddd_u64(a);
 }

-// CHECK-LABEL: @test_vaddvq_s64(
-// CHECK: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VADDVQ_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddvq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VADDVQ_S64_I]]
+//
 int64_t test_vaddvq_s64(int64x2_t a) {
   return vaddvq_s64(a);
 }

-// CHECK-LABEL: @test_vaddvq_u64(
-// CHECK: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VADDVQ_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddvq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VADDVQ_U64_I]]
+//
 uint64_t test_vaddvq_u64(uint64x2_t a) {
   return vaddvq_u64(a);
 }

-// CHECK-LABEL: @test_vadd_f64(
-// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, %b
-// CHECK: ret <1 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[ADD_I]]
+//
 float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
   return vadd_f64(a, b);
 }

-// CHECK-LABEL: @test_vmul_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %a, %b
-// CHECK: ret <1 x double> [[MUL_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[MUL_I]]
+//
 float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
   return vmul_f64(a, b);
 }

-// CHECK-LABEL: @test_vdiv_f64(
-// CHECK: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
-// CHECK: ret <1 x double> [[DIV_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[DIV_I]]
+//
 float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
   return vdiv_f64(a, b);
 }

-// CHECK-LABEL: @test_vmla_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
-// CHECK: ret <1 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <1 x double> [[ADD_I]]
+//
 float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vmla_f64(a, b, c);
 }

-// CHECK-LABEL: @test_vmls_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
-// CHECK: ret <1 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <1 x double> [[SUB_I]]
+//
 float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vmls_f64(a, b, c);
 }

-// CHECK-LABEL: @test_vfma_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a)
-// CHECK: ret <1 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
+// CHECK-NEXT: [[__P2_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]])
+// CHECK-NEXT: ret <1 x double> [[TMP9]]
+//
 float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vfma_f64(a, b, c);
 }

-// CHECK-LABEL: @test_vfms_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a)
-// CHECK: ret <1 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[C]] to i64
+// CHECK-NEXT: [[__P2_ADDR_I_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__P2_ADDR_I_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], <1 x double> [[TMP6]])
+// CHECK-NEXT: ret <1 x double> [[TMP9]]
+//
 float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
   return vfms_f64(a, b, c);
 }

-// CHECK-LABEL: @test_vsub_f64(
-// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, %b
-// CHECK: ret <1 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x double> [[SUB_I]]
+//
 float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
   return vsub_f64(a, b);
 }

-// CHECK-LABEL: @test_vabd_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vabd_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[VABD_I]], <1 x double> [[VABD1_I]])
+// CHECK-NEXT: ret <1 x double> [[VABD2_I]]
+//
 float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) {
   return vabd_f64(a, b);
 }

-// CHECK-LABEL: @test_vmax_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMAX2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmax_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[VMAX_I]], <1 x double> [[VMAX1_I]])
+// CHECK-NEXT: ret <1 x double> [[VMAX2_I]]
+//
 float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) {
   return vmax_f64(a, b);
 }

-// CHECK-LABEL: @test_vmin_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMIN2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmin_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[VMIN_I]], <1 x double> [[VMIN1_I]])
+// CHECK-NEXT: ret <1 x double> [[VMIN2_I]]
+//
 float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) {
   return vmin_f64(a, b);
 }

-// CHECK-LABEL: @test_vmaxnm_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMAXNM2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmaxnm_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[VMAXNM_I]], <1 x double> [[VMAXNM1_I]])
+// CHECK-NEXT: ret <1 x double> [[VMAXNM2_I]]
+//
 float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) {
   return vmaxnm_f64(a, b);
 }

-// CHECK-LABEL: @test_vminnm_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMINNM2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vminnm_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[VMINNM_I]], <1 x double> [[VMINNM1_I]])
+// CHECK-NEXT: ret <1 x double> [[VMINNM2_I]]
+//
 float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) {
   return vminnm_f64(a, b);
 }

-// CHECK-LABEL: @test_vabs_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vabs_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> [[VABS_I]])
+// CHECK-NEXT: ret <1 x double> [[VABS1_I]]
+//
 float64x1_t test_vabs_f64(float64x1_t a) {
   return vabs_f64(a);
 }

-// CHECK-LABEL: @test_vneg_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <1 x double> %a
-// CHECK: ret <1 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vneg_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[A]]
+// CHECK-NEXT: ret <1 x double> [[FNEG_I]]
+//
 float64x1_t test_vneg_f64(float64x1_t a) {
   return vneg_f64(a);
 }

-// CHECK-LABEL: @test_vcvt_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
 int64x1_t test_vcvt_s64_f64(float64x1_t a) {
   return vcvt_s64_f64(a);
 }

-// CHECK-LABEL: @test_vcvt_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[VCVTZ_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
 uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
   return vcvt_u64_f64(a);
 }

-// CHECK-LABEL: @test_vcvtn_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtn_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[VCVTN_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTN1_I]]
+//
 int64x1_t test_vcvtn_s64_f64(float64x1_t a) {
   return vcvtn_s64_f64(a);
 }

-// CHECK-LABEL: @test_vcvtn_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtn_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[VCVTN_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTN1_I]]
+//
 uint64x1_t test_vcvtn_u64_f64(float64x1_t a) {
   return vcvtn_u64_f64(a);
 }

-// CHECK-LABEL: @test_vcvtp_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtp_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> [[VCVTP_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTP1_I]]
+//
 int64x1_t test_vcvtp_s64_f64(float64x1_t a) {
   return vcvtp_s64_f64(a);
 }

-// CHECK-LABEL: @test_vcvtp_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtp_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[VCVTP_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTP1_I]]
+//
 uint64x1_t test_vcvtp_u64_f64(float64x1_t a) {
   return vcvtp_u64_f64(a);
 }

-// CHECK-LABEL: @test_vcvtm_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtm_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[VCVTM_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTM1_I]]
+//
 int64x1_t test_vcvtm_s64_f64(float64x1_t a) {
   return vcvtm_s64_f64(a);
 }

-// CHECK-LABEL: @test_vcvtm_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtm_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[VCVTM_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTM1_I]]
+//
 uint64x1_t test_vcvtm_u64_f64(float64x1_t a) {
   return vcvtm_u64_f64(a);
 }

-// CHECK-LABEL: @test_vcvta_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvta_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[VCVTA_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTA1_I]]
+//
 int64x1_t test_vcvta_s64_f64(float64x1_t a) {
   return vcvta_s64_f64(a);
 }

-// CHECK-LABEL: @test_vcvta_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a)
-// CHECK: ret <1 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvta_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[VCVTA_I]])
+// CHECK-NEXT: ret <1 x i64> [[VCVTA1_I]]
+//
 uint64x1_t test_vcvta_u64_f64(float64x1_t a) {
   return vcvta_u64_f64(a);
 }

-// CHECK-LABEL: @test_vcvt_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <1 x i64> [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[VCVT_I]]
+//
 float64x1_t test_vcvt_f64_s64(int64x1_t a) {
   return vcvt_f64_s64(a);
 }

-// CHECK-LABEL: @test_vcvt_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <1 x i64> [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[VCVT_I]]
+//
 float64x1_t test_vcvt_f64_u64(uint64x1_t a) {
   return vcvt_f64_u64(a);
 }

-// CHECK-LABEL: @test_vcvt_n_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_n_s64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
+// CHECK-NEXT: ret <1 x i64> [[VCVT_N1]]
+//
 int64x1_t test_vcvt_n_s64_f64(float64x1_t a) {
   return vcvt_n_s64_f64(a, 64);
 }

-// CHECK-LABEL: @test_vcvt_n_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_n_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
+// CHECK-NEXT: ret <1 x i64> [[VCVT_N1]]
+//
 uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) {
   return vcvt_n_u64_f64(a, 64);
 }

-// CHECK-LABEL: @test_vcvt_n_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_n_f64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
+// CHECK-NEXT: ret <1 x double> [[VCVT_N1]]
+//
 float64x1_t test_vcvt_n_f64_s64(int64x1_t a) {
   return vcvt_n_f64_s64(a, 64);
 }

-// CHECK-LABEL: @test_vcvt_n_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
-// CHECK: ret <1 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_n_f64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
+// CHECK-NEXT: ret <1 x double> [[VCVT_N1]]
+//
 float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) {
   return vcvt_n_f64_u64(a, 64);
 }

-// CHECK-LABEL: @test_vrndn_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDN1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndn_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.roundeven.v1f64(<1 x double> [[VRNDN_I]])
+// CHECK-NEXT: ret <1 x double> [[VRNDN1_I]]
+//
 float64x1_t test_vrndn_f64(float64x1_t a) {
   return vrndn_f64(a);
 }

-// CHECK-LABEL: @test_vrnda_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDA1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnda_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[VRNDA_I]])
+// CHECK-NEXT: ret <1 x double> [[VRNDA1_I]]
+//
 float64x1_t test_vrnda_f64(float64x1_t a) {
   return vrnda_f64(a);
 }

-// CHECK-LABEL: @test_vrndp_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDP1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndp_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[VRNDP_I]])
+// CHECK-NEXT: ret <1 x double> [[VRNDP1_I]]
+//
 float64x1_t test_vrndp_f64(float64x1_t a) {
   return vrndp_f64(a);
 }

-// CHECK-LABEL: @test_vrndm_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDM1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndm_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[VRNDM_I]])
+// CHECK-NEXT: ret <1 x double> [[VRNDM1_I]]
+//
 float64x1_t test_vrndm_f64(float64x1_t a) {
   return vrndm_f64(a);
 }

-// CHECK-LABEL: @test_vrndx_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDX1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndx_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[VRNDX_I]])
+// CHECK-NEXT: ret <1 x double> [[VRNDX1_I]]
+//
 float64x1_t test_vrndx_f64(float64x1_t a) {
   return vrndx_f64(a);
 }

-// CHECK-LABEL: @test_vrnd_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDZ1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrnd_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[VRNDZ_I]])
+// CHECK-NEXT: ret <1 x double> [[VRNDZ1_I]]
+//
 float64x1_t test_vrnd_f64(float64x1_t a) {
   return vrnd_f64(a);
 }

-// CHECK-LABEL: @test_vrndi_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrndi_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRNDI_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[VRNDI_V_I]])
+// CHECK-NEXT: ret <1 x double> [[VRNDI_V1_I]]
+//
 float64x1_t test_vrndi_f64(float64x1_t a) {
   return vrndi_f64(a);
 }

-// CHECK-LABEL: @test_vrsqrte_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrsqrte_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[VRSQRTE_V_I]])
+// CHECK-NEXT: ret <1 x double> [[VRSQRTE_V1_I]]
+//
 float64x1_t test_vrsqrte_f64(float64x1_t a) {
   return vrsqrte_f64(a);
 }

-// CHECK-LABEL: @test_vrecpe_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VRECPE_V1_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrecpe_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[VRECPE_V_I]])
+// CHECK-NEXT: ret <1 x double> [[VRECPE_V1_I]]
+//
 float64x1_t test_vrecpe_f64(float64x1_t a) {
   return vrecpe_f64(a);
 }

-// CHECK-LABEL: @test_vsqrt_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
-// CHECK: ret <1 x double> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vsqrt_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[TMP2]])
+// CHECK-NEXT: ret <1 x double> [[VSQRT_I]]
+//
 float64x1_t test_vsqrt_f64(float64x1_t a) {
   return vsqrt_f64(a);
 }

-// CHECK-LABEL: @test_vrecps_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VRECPS_V2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrecps_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[VRECPS_V_I]], <1 x double> [[VRECPS_V1_I]])
+// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <1 x double> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP4]], i32 0
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP5]]
+//
 float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) {
   return vrecps_f64(a, b);
 }

-// CHECK-LABEL: @test_vrsqrts_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8>
-// CHECK: ret <1 x double> [[VRSQRTS_V2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vrsqrts_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__P1_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__P1_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[VRSQRTS_V_I]], <1 x double> [[VRSQRTS_V1_I]])
+// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP4]], i32 0
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP5]]
+//
 float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
   return vrsqrts_f64(a, b);
 }

-// CHECK-LABEL: @test_vminv_s32(
-// CHECK: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMINV_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vminv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMINV_S32_I]]
+//
 int32_t test_vminv_s32(int32x2_t a) {
   return vminv_s32(a);
 }

-// CHECK-LABEL: @test_vminv_u32(
-// CHECK: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMINV_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vminv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMINV_U32_I]]
+//
 uint32_t test_vminv_u32(uint32x2_t a) {
   return vminv_u32(a);
 }

-// CHECK-LABEL: @test_vmaxv_s32(
-// CHECK: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMAXV_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vmaxv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMAXV_S32_I]]
+//
 int32_t test_vmaxv_s32(int32x2_t a) {
   return vmaxv_s32(a);
 }

-// CHECK-LABEL: @test_vmaxv_u32(
-// CHECK: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VMAXV_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vmaxv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VMAXV_U32_I]]
+//
 uint32_t test_vmaxv_u32(uint32x2_t a) {
   return vmaxv_u32(a);
 }

-// CHECK-LABEL: @test_vaddv_s32(
-// CHECK: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VADDV_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vaddv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VADDV_S32_I]]
+//
 int32_t test_vaddv_s32(int32x2_t a) {
   return vaddv_s32(a);
 }

-// CHECK-LABEL: @test_vaddv_u32(
-// CHECK: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a)
-// CHECK: ret i32 [[VADDV_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vaddv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i32 [[VADDV_U32_I]]
+//
 uint32_t test_vaddv_u32(uint32x2_t a) {
   return vaddv_u32(a);
 }

-// CHECK-LABEL: @test_vaddlv_s32(
-// CHECK: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a)
-// CHECK: ret i64 [[VADDLV_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddlv_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i64 [[VADDLV_S32_I]]
+//
 int64_t test_vaddlv_s32(int32x2_t a) {
   return vaddlv_s32(a);
 }

-// CHECK-LABEL: @test_vaddlv_u32(
-// CHECK: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a)
-// CHECK: ret i64 [[VADDLV_U32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddlv_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret i64 [[VADDLV_U32_I]]
+//
 uint64_t test_vaddlv_u32(uint32x2_t a) {
   return vaddlv_u32(a);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c b/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c
index 40c5a0a598d68..29bfaabbe83cd 100644
--- a/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c
+++ b/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple aarch64 -target-feature +neon \
 // RUN: -target-feature +rcpc3 -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
 
@@ -34,10 +34,11 @@ int64x2_t test_vldap1q_lane_s64(int64_t *a, int64x2_t b) {
 
 // CHECK-LABEL: @test_vldap1q_lane_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP2]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP3]], i32 1
 // CHECK-NEXT: ret <2 x double> [[VLDAP1_LANE]]
 //
 float64x2_t test_vldap1q_lane_f64(float64_t *a, float64x2_t b) {
@@ -82,10 +83,12 @@ int64x1_t test_vldap1_lane_s64(int64_t *a, int64x1_t b) {
 
 // CHECK-LABEL: @test_vldap1_lane_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
-// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x double> [[TMP1]], double [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8
+// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP3]], i32 0
 // CHECK-NEXT: ret <1 x double> [[VLDAP1_LANE]]
 //
 float64x1_t test_vldap1_lane_f64(float64_t *a, float64x1_t b) {
@@ -130,10 +133,11 @@ void test_vstl1q_lane_s64(int64_t *a, int64x2_t b) {
 
 // CHECK-LABEL: @test_vstl1q_lane_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
-// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK-NEXT: store atomic double [[TMP3]], ptr [[A:%.*]] release, align 8
 // CHECK-NEXT: ret void
 //
 void test_vstl1q_lane_f64(float64_t *a, float64x2_t b) {
@@ -178,10 +182,12 @@ void test_vstl1_lane_s64(int64_t *a, int64x1_t b) {
 
 // CHECK-LABEL: @test_vstl1_lane_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK-NEXT: store atomic double [[TMP3]], ptr [[A:%.*]] release, align 8
 // CHECK-NEXT: ret void
 //
 void test_vstl1_lane_f64(float64_t *a, float64x1_t b) {
diff --git a/clang/test/CodeGen/AArch64/neon-ldst-one.c b/clang/test/CodeGen/AArch64/neon-ldst-one.c
index b57df40d8e5c9..2cff007826ba6 100644
--- a/clang/test/CodeGen/AArch64/neon-ldst-one.c
+++ b/clang/test/CodeGen/AArch64/neon-ldst-one.c
@@ -1,5757 +1,4962 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_u8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
 uint8x16_t test_vld1q_dup_u8(uint8_t *a) {
   return vld1q_dup_u8(a);
 }

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_u16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
 uint16x8_t test_vld1q_dup_u16(uint16_t *a) {
   return vld1q_dup_u16(a);
 }

-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_u32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_dup_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
 uint32x4_t test_vld1q_dup_u32(uint32_t *a) {
   return vld1q_dup_u32(a);
 }

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_u64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
 uint64x2_t test_vld1q_dup_u64(uint64_t *a) {
   return vld1q_dup_u64(a);
 }

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_s8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
 int8x16_t test_vld1q_dup_s8(int8_t *a) {
   return vld1q_dup_s8(a);
 }

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_s16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
 int16x8_t test_vld1q_dup_s16(int16_t *a) {
   return vld1q_dup_s16(a);
 }

-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_s32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i32, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x i32> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_dup_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[LANE]]
+//
 int32x4_t test_vld1q_dup_s32(int32_t *a) {
   return vld1q_dup_s32(a);
 }

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_s64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
 int64x2_t test_vld1q_dup_s64(int64_t *a) {
   return vld1q_dup_s64(a);
 }

-// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_dup_f16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load half, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x half> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_dup_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x half> [[LANE]]
+//
 float16x8_t test_vld1q_dup_f16(float16_t *a) {
   return vld1q_dup_f16(a);
 }

-// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_dup_f32(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load float, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK: ret <4 x float> [[LANE]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_dup_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x float> [[LANE]]
+//
 float32x4_t test_vld1q_dup_f32(float32_t *a) {
   return vld1q_dup_f32(a);
 }

-// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load double, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x double> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x double> [[LANE]]
+//
 float64x2_t test_vld1q_dup_f64(float64_t *a) {
   return vld1q_dup_f64(a);
 }

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_p8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[LANE]]
+//
 poly8x16_t test_vld1q_dup_p8(poly8_t *a) {
   return vld1q_dup_p8(a);
 }

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_p16(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i16, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK: ret <8 x i16> [[LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[LANE]]
+//
 poly16x8_t test_vld1q_dup_p16(poly16_t *a) {
   return vld1q_dup_p16(a);
 }

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[TMP2:%.*]] = load i64, ptr %a
-// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+// poly64x2_t test_vld1q_dup_p64(poly64_t *a) { return vld1q_dup_p64(a); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_u8(ptr noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// uint8x8_t test_vld1_dup_u8(uint8_t *a) { return vld1_dup_u8(a); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_u16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// uint16x4_t test_vld1_dup_u16(uint16_t *a) { return vld1_dup_u16(a); } -// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_u32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i32, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_dup_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// uint32x2_t test_vld1_dup_u32(uint32_t *a) { return vld1_dup_u32(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_u64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// uint64x1_t test_vld1_dup_u64(uint64_t *a) { return vld1_dup_u64(a); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_s8(ptr noundef %a) #0 { -// 
CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// int8x8_t test_vld1_dup_s8(int8_t *a) { return vld1_dup_s8(a); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_s16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// int16x4_t test_vld1_dup_s16(int16_t *a) { return vld1_dup_s16(a); } -// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_s32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i32, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_dup_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// int32x2_t test_vld1_dup_s32(int32_t *a) { return vld1_dup_s32(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_s64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// int64x1_t test_vld1_dup_s64(int64_t *a) { return vld1_dup_s64(a); } -// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_dup_f16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load half, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 
x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x half> [[LANE]] +// CHECK-LABEL: define dso_local <4 x half> @test_vld1_dup_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> [[LANE]] +// float16x4_t test_vld1_dup_f16(float16_t *a) { return vld1_dup_f16(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_dup_f32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load float, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x float> [[LANE]] +// CHECK-LABEL: define dso_local <2 x float> @test_vld1_dup_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x float> [[LANE]] +// float32x2_t test_vld1_dup_f32(float32_t *a) { return vld1_dup_f32(a); } -// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_dup_f64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load double, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x double> [[LANE]] +// CHECK-LABEL: define dso_local <1 x double> @test_vld1_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x double> [[LANE]] +// float64x1_t test_vld1_dup_f64(float64_t *a) { return vld1_dup_f64(a); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_p8(ptr noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// poly8x8_t test_vld1_dup_p8(poly8_t *a) { return vld1_dup_p8(a); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_p16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// 
CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// poly16x4_t test_vld1_dup_p16(poly16_t *a) { return vld1_dup_p16(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_p64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// poly64x1_t test_vld1_dup_p64(poly64_t *a) { return vld1_dup_p64(a); } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_dup_u64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[DOTFCA_0_1_INSERT]] +// uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) { return vld2q_dup_u64(a); } -// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_dup_s64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local 
%struct.int64x2x2_t @test_vld2q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[DOTFCA_0_1_INSERT]] +// int64x2x2_t test_vld2q_dup_s64(int64_t *a) { return vld2q_dup_s64(a); } -// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T:%.*]] poison, <2 x double> [[VLD2_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD2_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_1_INSERT]] +// float64x2x2_t test_vld2q_dup_f64(float64_t *a) { return vld2q_dup_f64(a); } -// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T:%.*]] poison, <2 x i64> 
[[VLD2_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_1_INSERT]] +// poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) { return vld2q_dup_p64(a); } -// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[VLD2_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD2_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// float64x1x2_t test_vld2_dup_f64(float64_t *a) { return vld2_dup_f64(a); } -// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T:%.*]] poison, <1 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// poly64x1x2_t test_vld2_dup_p64(poly64_t *a) { return vld2_dup_p64(a); } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_dup_u64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// 
CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_2_INSERT]] +// uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) { return vld3q_dup_u64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_dup_s64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_2_INSERT]] +// int64x2x3_t test_vld3q_dup_s64(int64_t *a) { return vld3q_dup_s64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 -// 
CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T:%.*]] poison, <2 x double> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_2_INSERT]] +// float64x2x3_t test_vld3q_dup_f64(float64_t *a) { return vld3q_dup_f64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_2_INSERT]] +// poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) { return vld3q_dup_p64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: 
define{{.*}} %struct.float64x1x3_t @test_vld3_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T:%.*]] poison, <1 x double> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// float64x1x3_t test_vld3_dup_f64(float64_t *a) { return vld3_dup_f64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0 +// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1 +// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T:%.*]] poison, <1 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] 
[[DOTFCA_0_2_INSERT]] +// poly64x1x3_t test_vld3_dup_p64(poly64_t *a) { return vld3_dup_p64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_dup_u64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) { return vld4q_dup_u64(a); } -// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_dup_s64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2 +// CHECK-NEXT: 
[[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// int64x2x4_t test_vld4q_dup_s64(int64_t *a) { return vld4q_dup_s64(a); } -// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T:%.*]] poison, <2 x double> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x double> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// float64x2x4_t test_vld4q_dup_f64(float64_t *a) { return vld4q_dup_f64(a); } -// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, 
i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_3_INSERT]] +// poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) { return vld4q_dup_p64(a); } -// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: 
[[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x double> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_3_INSERT]] +// float64x1x4_t test_vld4_dup_f64(float64_t *a) { return vld4_dup_f64(a); } -// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0 +// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1 +// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2 +// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T:%.*]] poison, <1 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_3_INSERT]] +// poly64x1x4_t test_vld4_dup_p64(poly64_t *a) { return vld4_dup_p64(a); } -// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 -// CHECK: ret <16 x i8> [[VLD1_LANE]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]] +// uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) { return vld1q_lane_u8(a, b, 15); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = load i16, ptr %a -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 -// CHECK: ret <8 x i16> [[VLD1_LANE]] +// CHECK-LABEL: define dso_local <8 x i16> 
@test_vld1q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]] +// uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) { return vld1q_lane_u16(a, b, 7); } -// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP4:%.*]] = load i32, ptr %a -// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 -// CHECK: ret <4 x i32> [[VLD1_LANE]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3 +// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]] +// uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) { return vld1q_lane_u32(a, b, 3); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP4:%.*]] = load i64, ptr %a -// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1 -// CHECK: ret <2 x i64> [[VLD1_LANE]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]] +// uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) { return vld1q_lane_u64(a, b, 1); } -// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 -// CHECK: ret <16 x i8> [[VLD1_LANE]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]] +// int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) { return vld1q_lane_s8(a, b, 15); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = 
load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
 int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) {
   return vld1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
 int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) {
   return vld1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
 int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) {
   return vld1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK: ret <8 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP3]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VLD1_LANE]]
+//
 float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) {
   return vld1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
-// CHECK: ret <4 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 3
+// CHECK-NEXT: ret <4 x float> [[VLD1_LANE]]
+//
 float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) {
   return vld1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[TMP4:%.*]] = load double, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
-// CHECK: ret <2 x double> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP3]], i32 1
+// CHECK-NEXT: ret <2 x double> [[VLD1_LANE]]
+//
 float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) {
   return vld1q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
 poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) {
   return vld1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
 poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) {
   return vld1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
 poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) {
   return vld1q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
 uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) {
   return vld1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
 uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) {
   return vld1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
 uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) {
   return vld1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
 uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) {
   return vld1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
 int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) {
   return vld1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
 int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) {
   return vld1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
 int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) {
   return vld1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
 int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) {
   return vld1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
-// CHECK: ret <4 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP3]], i32 3
+// CHECK-NEXT: ret <4 x half> [[VLD1_LANE]]
+//
 float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) {
   return vld1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
-// CHECK: ret <2 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP3]], i32 1
+// CHECK-NEXT: ret <2 x float> [[VLD1_LANE]]
+//
 float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) {
   return vld1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[TMP4:%.*]] = load double, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
-// CHECK: ret <1 x double> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP3]], i32 0
+// CHECK-NEXT: ret <1 x double> [[VLD1_LANE]]
+//
 float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) {
   return vld1_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
 poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) {
   return vld1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
 poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) {
   return vld1_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
 poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
   return vld1_lane_p64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x16x2_t @test_vld2q_lane_s8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_lane_s8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[SRC_COERCE_FCA_0_EXTRACT]], <16 x i8> [[SRC_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, <16 x i8> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
   return vld2q_lane_s8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x2_t @test_vld2q_lane_u8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_lane_u8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[SRC_COERCE_FCA_0_EXTRACT]], <16 x i8> [[SRC_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, <16 x i8> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
   return vld2q_lane_u8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x2_t @test_vld2q_lane_p8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_lane_p8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[SRC_COERCE_FCA_0_EXTRACT]], <16 x i8> [[SRC_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, <16 x i8> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
   return vld2q_lane_p8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x16x3_t @test_vld3q_lane_s8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_lane_s8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 2
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[SRC_COERCE_FCA_0_EXTRACT]], <16 x i8> [[SRC_COERCE_FCA_1_EXTRACT]], <16 x i8> [[SRC_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X3_T:%.*]] poison, <16 x i8> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT8X16X3_T]] [[DOTFCA_0_2_INSERT]]
++//
 int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
   return vld3q_lane_s8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x3_t @test_vld3q_lane_u8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_lane_u8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 2
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[SRC_COERCE_FCA_0_EXTRACT]], <16 x i8> [[SRC_COERCE_FCA_1_EXTRACT]], <16 x i8> [[SRC_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T:%.*]] poison, <16 x i8> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
   return vld3q_lane_u8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x2_t @test_vld2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, <8 x i16> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
   return vld2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x2_t @test_vld2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, <4 x i32> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
   return vld2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
   return vld2q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int16x8x2_t @test_vld2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, <8 x i16> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) {
   return vld2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int32x4x2_t @test_vld2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, <4 x i32> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) {
   return vld2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) {
   return vld2q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float16x8x2_t @test_vld2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP4]], <8 x half> [[TMP5]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, <8 x half> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) {
   return vld2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float32x4x2_t @test_vld2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP4]], <4 x float> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) {
   return vld2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP4]], <2 x double> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T:%.*]] poison, <2 x double> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) {
   return vld2q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x2_t @test_vld2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, <8 x i16> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
   return vld2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr
[[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_1_INSERT]] +// poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) { return vld2q_lane_p64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.uint8x8x2_t @test_vld2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 
8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, <8 x i8> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) { return vld2_lane_u8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.uint16x4x2_t @test_vld2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr 
[[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, <4 x i16> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) { return vld2_lane_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.uint32x2x2_t @test_vld2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: 
[[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a) -// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, <2 x i32> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) { return vld2_lane_u32(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.uint64x1x2_t @test_vld2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] 
to <1 x i64> -// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a) -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint64x1x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T:%.*]] poison, <1 x i64> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) { return vld2_lane_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.int8x8x2_t @test_vld2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, 
ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, <8 x i8> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) { return vld2_lane_s8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.int16x4x2_t @test_vld2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = 
extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, <4 x i16> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) { return vld2_lane_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.int32x2x2_t @test_vld2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a) -// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] 
to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, <2 x i32> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) { return vld2_lane_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.int64x1x2_t @test_vld2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a) -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int64x1x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X2_T:%.*]] poison, <1 x i64> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) { return vld2_lane_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.float16x4x2_t @test_vld2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, ptr %a) -// CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float16x4x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP4]], <4 x half> [[TMP5]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, <4 x half> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) { return vld2_lane_f16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.float32x2x2_t @test_vld2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> -// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, ptr %a) -// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> 
[[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP4]], <2 x float> [[TMP5]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) { return vld2_lane_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> -// CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, ptr %a) -// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// 
CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP4]], <1 x double> [[TMP5]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T:%.*]] poison, <1 x double> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) { return vld2_lane_f64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.poly8x8x2_t @test_vld2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x 
i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0 +// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, <8 x i8> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) { return vld2_lane_p8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.poly16x4x2_t @test_vld2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } 
@llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, <4 x i16> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
   return vld2_lane_p16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T:%.*]] poison, <1 x i64> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) {
   return vld2_lane_p64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x3_t @test_vld3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T:%.*]] poison, <8 x i16> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
   return vld3q_lane_u16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x3_t @test_vld3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T:%.*]] poison, <4 x i32> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
   return vld3q_lane_u32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
   return vld3q_lane_u64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.int16x8x3_t @test_vld3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X3_T:%.*]] poison, <8 x i16> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) {
   return vld3q_lane_s16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.int32x4x3_t @test_vld3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X3_T:%.*]] poison, <4 x i32> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT32X4X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) {
   return vld3q_lane_s32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) {
   return vld3q_lane_s64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.float16x8x3_t @test_vld3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T:%.*]] poison, <8 x half> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x half> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) {
   return vld3q_lane_f16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.float32x4x3_t @test_vld3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T:%.*]] poison, <4 x float> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x float> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
   return vld3q_lane_f32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T:%.*]] poison, <2 x double> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[DOTFCA_0_2_INSERT]]
++//
 float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
   return vld3q_lane_f64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x3_t @test_vld3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T:%.*]] poison, <16 x i8> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
   return vld3q_lane_p8(a, b, 15);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x3_t @test_vld3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T:%.*]] poison, <8 x i16> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
   return vld3q_lane_p16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
   return vld3q_lane_p64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x3_t @test_vld3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T:%.*]] poison, <8 x i8> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
   return vld3_lane_u8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x3_t @test_vld3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T:%.*]] poison, <4 x i16> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
++//
 uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
   return vld3_lane_u16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x3_t @test_vld3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T:%.*]] poison, <2 x i32> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
   return vld3_lane_u32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x3_t @test_vld3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1
+//
CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T:%.*]] poison, <1 x i64> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_UINT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) { return vld3_lane_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.int8x8x3_t @test_vld3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x3_t [[TMP9]] +// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, 
<8 x i8> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X3_T:%.*]] poison, <8 x i8> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT8X8X3_T]] [[DOTFCA_0_2_INSERT]] +// int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) { return vld3_lane_s8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.int16x4x3_t @test_vld3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] 
[[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X3_T:%.*]] poison, <4 x i16> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT16X4X3_T]] [[DOTFCA_0_2_INSERT]] +// int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) { return vld3_lane_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.int32x2x3_t @test_vld3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } 
@llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a) -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X3_T:%.*]] poison, <2 x i32> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT32X2X3_T]] [[DOTFCA_0_2_INSERT]] +// int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) { return vld3_lane_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.int64x1x3_t @test_vld3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr 
inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int64x1x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X3_T:%.*]] poison, <1 x i64> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_INT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) { return vld3_lane_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.float16x4x3_t @test_vld3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 { -// 
CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, ptr %a) -// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float16x4x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } 
@llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T:%.*]] poison, <4 x half> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x half> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]] +// float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) { return vld3_lane_f16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.float32x2x3_t @test_vld3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> -// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, ptr %a) -// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x3_t [[TMP16]] +// CHECK-LABEL: define dso_local 
%struct.float32x2x3_t @test_vld3_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T:%.*]] poison, <2 x float> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x float> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X3_T]] [[DOTFCA_0_2_INSERT]] +// float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) { return vld3_lane_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], 
i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double> -// CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP6]], <1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T:%.*]] poison, <1 x double> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue 
[[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) { return vld3_lane_f64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.poly8x8x3_t @test_vld3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x3_t [[TMP9]] +// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T:%.*]] poison, <8 x i8> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// 
CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X3_T]] [[DOTFCA_0_2_INSERT]] +// poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) { return vld3_lane_p8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.poly16x4x3_t @test_vld3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] 
to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T:%.*]] poison, <4 x i16> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X3_T]] [[DOTFCA_0_2_INSERT]] +// poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) { return vld3_lane_p16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 
[[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0 +// CHECK-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1 +// CHECK-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T:%.*]] poison, <1 x i64> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_2_INSERT]] +// poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) { return vld3_lane_p64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.uint8x16x4_t @test_vld4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw 
%struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint8x16x4_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T:%.*]] poison, <16 x i8> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] [[DOTFCA_0_3_INSERT]] +// uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) { return vld4q_lane_u8(a, b, 15); } -// CHECK-LABEL: define{{.*}} %struct.uint16x8x4_t @test_vld4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// 
CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T:%.*]] poison, <8 x i16> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i16> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
   return vld4q_lane_u16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x4_t @test_vld4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T:%.*]] poison, <4 x i32> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i32> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) {
   return vld4q_lane_u32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) {
   return vld4q_lane_u64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.int8x16x4_t @test_vld4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T:%.*]] poison, <16 x i8> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT8X16X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) {
   return vld4q_lane_s8(a, b, 15);
 }
-// CHECK-LABEL: define{{.*}} %struct.int16x8x4_t @test_vld4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T:%.*]] poison, <8 x i16> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i16> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) {
   return vld4q_lane_s16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.int32x4x4_t @test_vld4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T:%.*]] poison, <4 x i32> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i32> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i32> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT32X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) {
   return vld4q_lane_s32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[DOTFCA_0_3_INSERT]]
++//
 int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) {
   return vld4q_lane_s64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.float16x8x4_t @test_vld4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T:%.*]] poison, <8 x half> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x half> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x half> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) {
   return vld4q_lane_f16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.float32x4x4_t @test_vld4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T:%.*]] poison, <4 x float> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x float> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x float> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) {
   return vld4q_lane_f32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T:%.*]] poison, <2 x double> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x double> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x double> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x double> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) {
   return vld4q_lane_f64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x4_t @test_vld4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T:%.*]] poison, <16 x i8> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_1_INSERT]], <16 x i8> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_2_INSERT]], <16 x i8> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
   return vld4q_lane_p8(a, b, 15);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x4_t @test_vld4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T:%.*]] poison, <8 x i16> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i16> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i16> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
   return vld4q_lane_p16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
   return vld4q_lane_p64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x4_t @test_vld4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca
%struct.uint8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x4_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T:%.*]] poison, <8 x i8> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: 
[[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i8> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X4_T]] [[DOTFCA_0_3_INSERT]] +// uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) { return vld4_lane_u8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.uint16x4x4_t @test_vld4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.uint16x4x4_t @test_vld4_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0 +// 
CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T:%.*]] poison, <4 x i16> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i16> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X4_T]] [[DOTFCA_0_3_INSERT]] +// uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) { return vld4_lane_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.uint32x2x4_t @test_vld4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x 
i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> -// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a) -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 
x i32>, <2 x i32> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T:%.*]] poison, <2 x i32> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i32> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT32X2X4_T]] [[DOTFCA_0_3_INSERT]] +// uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) { return vld4_lane_u32(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.uint64x1x4_t @test_vld4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8 -// 
CHECK: ret %struct.uint64x1x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T:%.*]] poison, <1 x i64> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_UINT64X1X4_T]] [[DOTFCA_0_3_INSERT]] +// uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) { return vld4_lane_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.int8x8x4_t @test_vld4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], 
ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x4_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T:%.*]] poison, <8 x i8> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i8> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT8X8X4_T]] [[DOTFCA_0_3_INSERT]] +// int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) { return vld4_lane_s8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.int16x4x4_t 
@test_vld4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x 
i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T:%.*]] poison, <4 x i16> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i16> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT16X4X4_T]] [[DOTFCA_0_3_INSERT]] +// int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) { return vld4_lane_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.int32x2x4_t @test_vld4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: 
[[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> -// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a) -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X4_T:%.*]] poison, <2 x i32> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i32> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = 
insertvalue [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i32> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT32X2X4_T]] [[DOTFCA_0_3_INSERT]] +// int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) { return vld4_lane_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.int64x1x4_t @test_vld4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int64x1x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2 +// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T:%.*]] poison, <1 x i64> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2 +// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3 +// CHECK-NEXT: ret [[STRUCT_INT64X1X4_T]] [[DOTFCA_0_3_INSERT]] +// int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) { return vld4_lane_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.float16x4x4_t @test_vld4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: 
[[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, ptr %a) -// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float16x4x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 0 +// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 1 +// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = 
extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T:%.*]] poison, <4 x half> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x half> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x half> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
   return vld4_lane_f16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} %struct.float32x2x4_t @test_vld4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T:%.*]] poison, <2 x float> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x float> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x float> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
   return vld4_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_3_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_6_24_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP3]], i32 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[B_SROA_6_24_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T:%.*]] poison, <1 x double> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x double> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x double> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x double> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
   return vld4_lane_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} %struct.poly8x8x4_t @test_vld4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T:%.*]] poison, <8 x i8> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x i8> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x i8> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly8x8x4_t test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) {
   return vld4_lane_p8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} %struct.poly16x4x4_t @test_vld4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T:%.*]] poison, <4 x i16> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x i16> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x i16> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) {
   return vld4_lane_p16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T:%.*]] poison, <1 x i64> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) {
   return vld4_lane_p64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) {
   vst1q_lane_u8(a, b, 15);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) {
   vst1q_lane_u16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) {
   vst1q_lane_u32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) {
   vst1q_lane_u64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_s8(int8_t *a, int8x16_t b) {
   vst1q_lane_s8(a, b, 15);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_s16(int16_t *a, int16x8_t b) {
   vst1q_lane_s16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_s32(int32_t *a, int32x4_t b) {
   vst1q_lane_s32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_s64(int64_t *a, int64x2_t b) {
   vst1q_lane_s64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
-// CHECK: store half [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
+// CHECK-NEXT: store half [[TMP3]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_f16(float16_t *a, float16x8_t b) {
   vst1q_lane_f16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-// CHECK: store float [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK-NEXT: store float [[TMP3]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_f32(float32_t *a, float32x4_t b) {
   vst1q_lane_f32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-// CHECK: store double [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK-NEXT: store double [[TMP3]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_f64(float64_t *a, float64x2_t b) {
   vst1q_lane_f64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) {
   vst1q_lane_p8(a, b, 15);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) {
   vst1q_lane_p16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) {
   vst1q_lane_p64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) {
   vst1_lane_u8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) {
   vst1_lane_u16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) {
   vst1_lane_u32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) {
   vst1_lane_u64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_s8(int8_t *a, int8x8_t b) {
   vst1_lane_s8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_s16(int16_t *a, int16x4_t b) {
   vst1_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_s32(int32_t *a, int32x2_t b) {
   vst1_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_s64(int64_t *a, int64x1_t b) {
   vst1_lane_s64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
-// CHECK: store half [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
+// CHECK-NEXT: store half [[TMP3]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_f16(float16_t *a, float16x4_t b) {
   vst1_lane_f16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-// CHECK: store float [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK-NEXT: store float [[TMP3]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_f32(float32_t *a, float32x2_t b) {
   vst1_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
-// CHECK: store double [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK-NEXT: store double [[TMP3]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_f64(float64_t *a, float64x1_t b) {
   vst1_lane_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) {
   vst1_lane_p8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) {
   vst1_lane_p16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK: store i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) {
   vst1_lane_p64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) {
   vst2q_lane_u8(a, b, 15);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) {
   vst2q_lane_u16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) {
   vst2q_lane_u32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) {
   vst2q_lane_u64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) {
   vst2q_lane_s8(a, b, 15);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) {
   vst2q_lane_s16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) {
   vst2q_lane_s32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) {
   vst2q_lane_s64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca
%struct.float16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP4]], <8 x half> [[TMP5]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) { vst2q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> 
[[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP4]], <4 x float> [[TMP5]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) { vst2q_lane_f32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP4]], <2 x double> [[TMP5]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) { vst2q_lane_f64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 15, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) { vst2q_lane_p8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> 
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) { vst2q_lane_p16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) { vst2q_lane_p64(a, b, 1); } -// CHECK-LABEL: 
define{{.*}} void @test_vst2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) { vst2_lane_u8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: 
[[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) { vst2_lane_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) { vst2_lane_u32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = 
getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) { vst2_lane_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// 
void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) { vst2_lane_s8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) { vst2_lane_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], 
align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) { vst2_lane_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void 
test_vst2_lane_s64(int64_t *a, int64x1x2_t b) { vst2_lane_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP4]], <4 x half> [[TMP5]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) { vst2_lane_f16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw 
%struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP4]], <2 x float> [[TMP5]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) { vst2_lane_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64 +// CHECK-NEXT: 
[[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64 +// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP4]], <1 x double> [[TMP5]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) { vst2_lane_f64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) { vst2_lane_p8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = 
bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) { vst2_lane_p16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to 
<8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) { vst2_lane_p64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) { vst3q_lane_u8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr 
inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
   vst3q_lane_u16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
   vst3q_lane_u32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
   vst3q_lane_u64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
   vst3q_lane_s8(a, b, 15);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
   vst3q_lane_s16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
   vst3q_lane_s32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
   vst3q_lane_s64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
   vst3q_lane_f16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
   vst3q_lane_f32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
   vst3q_lane_f64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
   vst3q_lane_p8(a, b, 15);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
   vst3q_lane_p16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
   vst3q_lane_p64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
   vst3_lane_u8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
   vst3_lane_u16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
   vst3_lane_u32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
   vst3_lane_u64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) {
   vst3_lane_s8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) {
   vst3_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) {
   vst3_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) {
   vst3_lane_s64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) {
   vst3_lane_f16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) {
   vst3_lane_f32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP6]], <1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) {
   vst3_lane_f64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) {
   vst3_lane_p8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr
[[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) { vst3_lane_p16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load 
<1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) { vst3_lane_p64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load 
<16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) { vst4q_lane_u8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x 
i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) { vst4q_lane_u16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: call void 
@llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) { vst4q_lane_u32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> 
[[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) { vst4q_lane_u64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> 
[[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) { vst4q_lane_s8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) { vst4q_lane_s16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, 
ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) { vst4q_lane_s32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> 
[[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) { vst4q_lane_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr 
[[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) { vst4q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> 
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) {
   vst4q_lane_f32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) {
   vst4q_lane_f64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) {
   vst4q_lane_p8(a, b, 15);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) {
   vst4q_lane_p16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) {
   vst4q_lane_p64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) {
   vst4_lane_u8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) {
   vst4_lane_u16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) {
   vst4_lane_u32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) {
   vst4_lane_u64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) {
   vst4_lane_s8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) {
   vst4_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) {
   vst4_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) {
   vst4_lane_s64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) {
   vst4_lane_f16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) {
   vst4_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_0_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_1_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_2_8_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_2_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_4_16_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x double> [[B_COERCE_FCA_3_EXTRACT]] to i64
+// CHECK-NEXT: [[B_SROA_6_24_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP3]], i32 0
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[B_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[B_SROA_2_8_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[B_SROA_4_16_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[B_SROA_6_24_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) {
   vst4_lane_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) {
   vst4_lane_p8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) {
   vst4_lane_p16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) {
   vst4_lane_p64(a, b, 0);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-misc-constrained.c b/clang/test/CodeGen/AArch64/neon-misc-constrained.c
index e24e129d2bc7d..06ecfd91252a1 100644
--- a/clang/test/CodeGen/AArch64/neon-misc-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-misc-constrained.c
@@ -1,17 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN:   -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: | opt -S -passes=mem2reg,sroa | FileCheck --check-prefix=UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN:   -ffp-exception-behavior=strict \
 // RUN:   -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -disable-O0-optnone -S -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-exception-behavior=strict \
-// RUN:   -disable-O0-optnone -S -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
+// RUN: | opt -S -passes=mem2reg,sroa | FileCheck --check-prefix=CONSTRAINED %s
 
 // REQUIRES: aarch64-registered-target
@@ -19,42 +13,93 @@
 #include <arm_neon.h>
 
-// COMMON-LABEL: test_vrndaq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> %a, metadata !"fpexcept.strict")
-// CHECK-ASM: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VRNDA1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CONSTRAINED-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> [[VRNDA_I]], metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
 float64x2_t test_vrndaq_f64(float64x2_t a) {
   return vrndaq_f64(a);
 }

-// COMMON-LABEL: test_vrndpq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> %a, metadata !"fpexcept.strict")
-// CHECK-ASM: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VRNDP1_I]]
+
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CONSTRAINED-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> [[VRNDP_I]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
 float64x2_t test_vrndpq_f64(float64x2_t a) {
   return vrndpq_f64(a);
 }

-// COMMON-LABEL: test_vsqrtq_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-// COMMONIR: ret <4 x float> [[VSQRT_I]]
+
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+// UNCONSTRAINED-NEXT: ret <4 x float> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> [[TMP2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x float> [[VSQRT_I]]
+//
 float32x4_t test_vsqrtq_f32(float32x4_t a) {
   return vsqrtq_f32(a);
 }

-// COMMON-LABEL: test_vsqrtq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VSQRT_I]]
+
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> [[TMP2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VSQRT_I]]
+//
 float64x2_t test_vsqrtq_f64(float64x2_t a) {
   return vsqrtq_f64(a);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-misc.c b/clang/test/CodeGen/AArch64/neon-misc.c
index 165f33a9f399f..6eadaaf27a210 100644
--- a/clang/test/CodeGen/AArch64/neon-misc.c
+++ b/clang/test/CodeGen/AArch64/neon-misc.c
@@ -1,2718 +1,4056 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN:   -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: @test_vceqz_s8(
-// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]]
+//
 uint8x8_t test_vceqz_s8(int8x8_t a) {
return vceqz_s8(a); } -// CHECK-LABEL: @test_vceqz_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]] +// uint16x4_t test_vceqz_s16(int16x4_t a) { return vceqz_s16(a); } -// CHECK-LABEL: @test_vceqz_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]] +// uint32x2_t test_vceqz_s32(int32x2_t a) { return vceqz_s32(a); } -// CHECK-LABEL: @test_vceqz_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]] +// uint64x1_t test_vceqz_s64(int64x1_t a) { return vceqz_s64(a); } -// CHECK-LABEL: @test_vceqz_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]] +// uint64x1_t test_vceqz_u64(uint64x1_t a) { return vceqz_u64(a); } -// CHECK-LABEL: @test_vceqz_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_p64( +// 
CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]] +// uint64x1_t test_vceqz_p64(poly64x1_t a) { return vceqz_p64(a); } -// CHECK-LABEL: @test_vceqzq_s8( -// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]] +// uint8x16_t test_vceqzq_s8(int8x16_t a) { return vceqzq_s8(a); } -// CHECK-LABEL: @test_vceqzq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]] +// uint16x8_t test_vceqzq_s16(int16x8_t a) { return vceqzq_s16(a); } -// CHECK-LABEL: @test_vceqzq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]] +// uint32x4_t test_vceqzq_s32(int32x4_t a) { return vceqzq_s32(a); } -// CHECK-LABEL: @test_vceqzq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]] +// uint64x2_t test_vceqzq_s64(int64x2_t a) { return vceqzq_s64(a); } -// CHECK-LABEL: @test_vceqz_u8( -// CHECK: 
[[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]] +// uint8x8_t test_vceqz_u8(uint8x8_t a) { return vceqz_u8(a); } -// CHECK-LABEL: @test_vceqz_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]] +// uint16x4_t test_vceqz_u16(uint16x4_t a) { return vceqz_u16(a); } -// CHECK-LABEL: @test_vceqz_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]] +// uint32x2_t test_vceqz_u32(uint32x2_t a) { return vceqz_u32(a); } -// CHECK-LABEL: @test_vceqzq_u8( -// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]] +// uint8x16_t test_vceqzq_u8(uint8x16_t a) { return vceqzq_u8(a); } -// CHECK-LABEL: @test_vceqzq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]] +// uint16x8_t test_vceqzq_u16(uint16x8_t a) { return 
vceqzq_u16(a); } -// CHECK-LABEL: @test_vceqzq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]] +// uint32x4_t test_vceqzq_u32(uint32x4_t a) { return vceqzq_u32(a); } -// CHECK-LABEL: @test_vceqzq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]] +// uint64x2_t test_vceqzq_u64(uint64x2_t a) { return vceqzq_u64(a); } -// CHECK-LABEL: @test_vceqz_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oeq <2 x float> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]] +// uint32x2_t test_vceqz_f32(float32x2_t a) { return vceqz_f32(a); } -// CHECK-LABEL: @test_vceqz_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oeq <1 x double> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <1 x double> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]] +// uint64x1_t test_vceqz_f64(float64x1_t a) { return vceqz_f64(a); } -// 
CHECK-LABEL: @test_vceqzq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x float> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <4 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]] +// uint32x4_t test_vceqzq_f32(float32x4_t a) { return vceqzq_f32(a); } -// CHECK-LABEL: @test_vceqz_p8( -// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]] +// uint8x8_t test_vceqz_p8(poly8x8_t a) { return vceqz_p8(a); } -// CHECK-LABEL: @test_vceqzq_p8( -// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]] +// uint8x16_t test_vceqzq_p8(poly8x16_t a) { return vceqzq_p8(a); } -// CHECK-LABEL: @test_vceqzq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oeq <2 x double> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x double> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]] +// uint64x2_t test_vceqzq_f64(float64x2_t a) { return vceqzq_f64(a); } -// CHECK-LABEL: @test_vceqzq_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: 
[[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]] +// uint64x2_t test_vceqzq_p64(poly64x2_t a) { return vceqzq_p64(a); } -// CHECK-LABEL: @test_vcgez_s8( -// CHECK: [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgez_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sge <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCGEZ_I]] +// uint8x8_t test_vcgez_s8(int8x8_t a) { return vcgez_s8(a); } -// CHECK-LABEL: @test_vcgez_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <4 x i16> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgez_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sge <4 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]] +// uint16x4_t test_vcgez_s16(int16x4_t a) { return vcgez_s16(a); } -// CHECK-LABEL: @test_vcgez_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <2 x i32> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgez_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sge <2 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGEZ_I]] +// uint32x2_t test_vcgez_s32(int32x2_t a) { return vcgez_s32(a); } -// CHECK-LABEL: @test_vcgez_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <1 x i64> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgez_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sge <1 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGEZ_I]] +// uint64x1_t test_vcgez_s64(int64x1_t a) { return vcgez_s64(a); } -// CHECK-LABEL: @test_vcgezq_s8( -// CHECK: [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgezq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sge <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCGEZ_I]] +// uint8x16_t test_vcgezq_s8(int8x16_t a) { return vcgezq_s8(a); } -// CHECK-LABEL: @test_vcgezq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <8 x i16> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgezq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sge <8 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]] +// uint16x8_t test_vcgezq_s16(int16x8_t a) { return vcgezq_s16(a); } -// CHECK-LABEL: @test_vcgezq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <4 x i32> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgezq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sge <4 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCGEZ_I]] +// uint32x4_t test_vcgezq_s32(int32x4_t a) { return vcgezq_s32(a); } -// CHECK-LABEL: @test_vcgezq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <2 x i64> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgezq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sge <2 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGEZ_I]] +// uint64x2_t test_vcgezq_s64(int64x2_t a) { return vcgezq_s64(a); } -// CHECK-LABEL: @test_vcgez_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <2 x float> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgez_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <2 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGEZ_I]] +// 
uint32x2_t test_vcgez_f32(float32x2_t a) { return vcgez_f32(a); } -// CHECK-LABEL: @test_vcgez_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <1 x double> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgez_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <1 x double> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGEZ_I]] +// uint64x1_t test_vcgez_f64(float64x1_t a) { return vcgez_f64(a); } -// CHECK-LABEL: @test_vcgezq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x float> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgezq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <4 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCGEZ_I]] +// uint32x4_t test_vcgezq_f32(float32x4_t a) { return vcgezq_f32(a); } -// CHECK-LABEL: @test_vcgezq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <2 x double> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgezq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <2 x double> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGEZ_I]] +// uint64x2_t test_vcgezq_f64(float64x2_t a) { return vcgezq_f64(a); } -// CHECK-LABEL: @test_vclez_s8( -// CHECK: [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclez_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sle <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCLEZ_I]] +// uint8x8_t test_vclez_s8(int8x8_t a) { return vclez_s8(a); } -// CHECK-LABEL: @test_vclez_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a 
to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <4 x i16> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclez_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sle <4 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]] +// uint16x4_t test_vclez_s16(int16x4_t a) { return vclez_s16(a); } -// CHECK-LABEL: @test_vclez_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <2 x i32> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclez_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sle <2 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCLEZ_I]] +// uint32x2_t test_vclez_s32(int32x2_t a) { return vclez_s32(a); } -// CHECK-LABEL: @test_vclez_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <1 x i64> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclez_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sle <1 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCLEZ_I]] +// uint64x1_t test_vclez_s64(int64x1_t a) { return vclez_s64(a); } -// CHECK-LABEL: @test_vclezq_s8( -// CHECK: [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclezq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sle <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCLEZ_I]] +// uint8x16_t test_vclezq_s8(int8x16_t a) { return vclezq_s8(a); } -// CHECK-LABEL: @test_vclezq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <8 x i16> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclezq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sle <8 x i16> [[TMP1]], 
zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]] +// uint16x8_t test_vclezq_s16(int16x8_t a) { return vclezq_s16(a); } -// CHECK-LABEL: @test_vclezq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <4 x i32> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclezq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sle <4 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCLEZ_I]] +// uint32x4_t test_vclezq_s32(int32x4_t a) { return vclezq_s32(a); } -// CHECK-LABEL: @test_vclezq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <2 x i64> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vclezq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sle <2 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCLEZ_I]] +// uint64x2_t test_vclezq_s64(int64x2_t a) { return vclezq_s64(a); } -// CHECK-LABEL: @test_vclez_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <2 x float> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclez_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ole <2 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCLEZ_I]] +// uint32x2_t test_vclez_f32(float32x2_t a) { return vclez_f32(a); } -// CHECK-LABEL: @test_vclez_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <1 x double> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclez_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ole <1 x double> [[TMP2]], zeroinitializer +// 
CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCLEZ_I]] +// uint64x1_t test_vclez_f64(float64x1_t a) { return vclez_f64(a); } -// CHECK-LABEL: @test_vclezq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x float> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclezq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ole <4 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCLEZ_I]] +// uint32x4_t test_vclezq_f32(float32x4_t a) { return vclezq_f32(a); } -// CHECK-LABEL: @test_vclezq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <2 x double> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vclezq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ole <2 x double> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCLEZ_I]] +// uint64x2_t test_vclezq_f64(float64x2_t a) { return vclezq_f64(a); } -// CHECK-LABEL: @test_vcgtz_s8( -// CHECK: [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgtz_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCGTZ_I]] +// uint8x8_t test_vcgtz_s8(int8x8_t a) { return vcgtz_s8(a); } -// CHECK-LABEL: @test_vcgtz_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i16> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgtz_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]] +// uint16x4_t test_vcgtz_s16(int16x4_t a) { return vcgtz_s16(a); } -// CHECK-LABEL: @test_vcgtz_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i32> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext 
<2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgtz_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGTZ_I]] +// uint32x2_t test_vcgtz_s32(int32x2_t a) { return vcgtz_s32(a); } -// CHECK-LABEL: @test_vcgtz_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <1 x i64> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgtz_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <1 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGTZ_I]] +// uint64x1_t test_vcgtz_s64(int64x1_t a) { return vcgtz_s64(a); } -// CHECK-LABEL: @test_vcgtzq_s8( -// CHECK: [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtzq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCGTZ_I]] +// uint8x16_t test_vcgtzq_s8(int8x16_t a) { return vcgtzq_s8(a); } -// CHECK-LABEL: @test_vcgtzq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <8 x i16> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtzq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]] +// uint16x8_t test_vcgtzq_s16(int16x8_t a) { return vcgtzq_s16(a); } -// CHECK-LABEL: @test_vcgtzq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i32> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtzq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> 
[[VCGTZ_I]] +// uint32x4_t test_vcgtzq_s32(int32x4_t a) { return vcgtzq_s32(a); } -// CHECK-LABEL: @test_vcgtzq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i64> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtzq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGTZ_I]] +// uint64x2_t test_vcgtzq_s64(int64x2_t a) { return vcgtzq_s64(a); } -// CHECK-LABEL: @test_vcgtz_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x float> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgtz_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGTZ_I]] +// uint32x2_t test_vcgtz_f32(float32x2_t a) { return vcgtz_f32(a); } -// CHECK-LABEL: @test_vcgtz_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <1 x double> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgtz_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <1 x double> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGTZ_I]] +// uint64x1_t test_vcgtz_f64(float64x1_t a) { return vcgtz_f64(a); } -// CHECK-LABEL: @test_vcgtzq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x float> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtzq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x 
i1> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCGTZ_I]] +// uint32x4_t test_vcgtzq_f32(float32x4_t a) { return vcgtzq_f32(a); } -// CHECK-LABEL: @test_vcgtzq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x double> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtzq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x double> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGTZ_I]] +// uint64x2_t test_vcgtzq_f64(float64x2_t a) { return vcgtzq_f64(a); } -// CHECK-LABEL: @test_vcltz_s8( -// CHECK: [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcltz_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCLTZ_I]] +// uint8x8_t test_vcltz_s8(int8x8_t a) { return vcltz_s8(a); } -// CHECK-LABEL: @test_vcltz_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <4 x i16> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcltz_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i16> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCLTZ_I]] +// uint16x4_t test_vcltz_s16(int16x4_t a) { return vcltz_s16(a); } -// CHECK-LABEL: @test_vcltz_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <2 x i32> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcltz_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCLTZ_I]] +// uint32x2_t test_vcltz_s32(int32x2_t a) { return vcltz_s32(a); } -// CHECK-LABEL: @test_vcltz_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <1 x i64> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcltz_s64( 
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <1 x i64> [[TMP1]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCLTZ_I]]
+//
 uint64x1_t test_vcltz_s64(int64x1_t a) { return vcltz_s64(a); }
-// CHECK-LABEL: @test_vcltzq_s8(
-// CHECK: [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcltzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCLTZ_I]]
+//
 uint8x16_t test_vcltzq_s8(int8x16_t a) { return vcltzq_s8(a); }
-// CHECK-LABEL: @test_vcltzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <8 x i16> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vcltzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP1]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCLTZ_I]]
+//
 uint16x8_t test_vcltzq_s16(int16x8_t a) { return vcltzq_s16(a); }
-// CHECK-LABEL: @test_vcltzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <4 x i32> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcltzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCLTZ_I]]
+//
 uint32x4_t test_vcltzq_s32(int32x4_t a) { return vcltzq_s32(a); }
-// CHECK-LABEL: @test_vcltzq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp slt <2 x i64> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcltzq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCLTZ_I]]
+//
 uint64x2_t test_vcltzq_s64(int64x2_t a) { return vcltzq_s64(a); }
-// CHECK-LABEL: @test_vcltz_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <2 x float> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcltz_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <2 x float> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCLTZ_I]]
+//
 uint32x2_t test_vcltz_f32(float32x2_t a) { return vcltz_f32(a); }
-// CHECK-LABEL: @test_vcltz_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <1 x double> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcltz_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <1 x double> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCLTZ_I]]
+//
 uint64x1_t test_vcltz_f64(float64x1_t a) { return vcltz_f64(a); }
-// CHECK-LABEL: @test_vcltzq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <4 x float> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcltzq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x float> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCLTZ_I]]
+//
 uint32x4_t test_vcltzq_f32(float32x4_t a) { return vcltzq_f32(a); }
-// CHECK-LABEL: @test_vcltzq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp olt <2 x double> %a, zeroinitializer
-// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcltzq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <2 x double> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCLTZ_I]]
+//
 uint64x2_t test_vcltzq_f64(float64x2_t a) { return vcltzq_f64(a); }
-// CHECK-LABEL: @test_vrev16_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); }
-// CHECK-LABEL: @test_vrev16_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); }
-// CHECK-LABEL: @test_vrev16_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); }
-// CHECK-LABEL: @test_vrev16q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); }
-// CHECK-LABEL: @test_vrev16q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); }
-// CHECK-LABEL: @test_vrev16q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); }
-// CHECK-LABEL: @test_vrev32_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); }
-// CHECK-LABEL: @test_vrev32_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); }
-// CHECK-LABEL: @test_vrev32_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); }
-// CHECK-LABEL: @test_vrev32_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); }
-// CHECK-LABEL: @test_vrev32_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); }
-// CHECK-LABEL: @test_vrev32_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); }
-// CHECK-LABEL: @test_vrev32q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); }
-// CHECK-LABEL: @test_vrev32q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); }
-// CHECK-LABEL: @test_vrev32q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); }
-// CHECK-LABEL: @test_vrev32q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); }
-// CHECK-LABEL: @test_vrev32q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); }
-// CHECK-LABEL: @test_vrev32q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); }
-// CHECK-LABEL: @test_vrev64_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); }
-// CHECK-LABEL: @test_vrev64_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); }
-// CHECK-LABEL: @test_vrev64_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrev64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); }
-// CHECK-LABEL: @test_vrev64_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); }
-// CHECK-LABEL: @test_vrev64_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); }
-// CHECK-LABEL: @test_vrev64_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrev64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); }
-// CHECK-LABEL: @test_vrev64_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); }
-// CHECK-LABEL: @test_vrev64_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); }
-// CHECK-LABEL: @test_vrev64_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrev64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[A]], <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); }
-// CHECK-LABEL: @test_vrev64q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); }
-// CHECK-LABEL: @test_vrev64q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); }
-// CHECK-LABEL: @test_vrev64q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrev64q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); }
-// CHECK-LABEL: @test_vrev64q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); }
-// CHECK-LABEL: @test_vrev64q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); }
-// CHECK-LABEL: @test_vrev64q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrev64q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); }
-// CHECK-LABEL: @test_vrev64q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); }
-// CHECK-LABEL: @test_vrev64q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); }
-// CHECK-LABEL: @test_vrev64q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrev64q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); }
-// CHECK-LABEL: @test_vpaddl_s8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
 int16x4_t test_vpaddl_s8(int8x8_t a) { return vpaddl_s8(a); }
-// CHECK-LABEL: @test_vpaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
 int32x2_t test_vpaddl_s16(int16x4_t a) { return vpaddl_s16(a); }
-// CHECK-LABEL: @test_vpaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %a)
-// CHECK: ret <1 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]])
+// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
+//
 int64x1_t test_vpaddl_s32(int32x2_t a) { return vpaddl_s32(a); }
-// CHECK-LABEL: @test_vpaddl_u8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
 uint16x4_t test_vpaddl_u8(uint8x8_t a) { return vpaddl_u8(a); }
-// CHECK-LABEL: @test_vpaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
 uint32x2_t test_vpaddl_u16(uint16x4_t a) { return vpaddl_u16(a); }
-// CHECK-LABEL: @test_vpaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %a)
-// CHECK: ret <1 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]])
+// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
+//
 uint64x1_t test_vpaddl_u32(uint32x2_t a) { return vpaddl_u32(a); }
-// CHECK-LABEL: @test_vpaddlq_s8(
-// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a)
-// CHECK: ret <8 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
+//
 int16x8_t test_vpaddlq_s8(int8x16_t a) { return vpaddlq_s8(a); }
-// CHECK-LABEL: @test_vpaddlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %a)
-// CHECK: ret <4 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
+//
 int32x4_t test_vpaddlq_s16(int16x8_t a) { return vpaddlq_s16(a); }
-// CHECK-LABEL: @test_vpaddlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %a)
-// CHECK: ret <2 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
+//
 int64x2_t test_vpaddlq_s32(int32x4_t a) { return vpaddlq_s32(a); }
-// CHECK-LABEL: @test_vpaddlq_u8(
-// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a)
-// CHECK: ret <8 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]]
+//
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) { return vpaddlq_u8(a); }
-// CHECK-LABEL: @test_vpaddlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %a)
-// CHECK: ret <4 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]])
+// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]]
+//
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) { return vpaddlq_u16(a); }
-// CHECK-LABEL: @test_vpaddlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %a)
-// CHECK: ret <2 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]])
+// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]]
+//
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) { return vpaddlq_u32(a); }
-// CHECK-LABEL: @test_vpadal_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_s8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[B]])
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) { return vpadal_s8(a, b); }
-// CHECK-LABEL: @test_vpadal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadal_s16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) { return vpadal_s16(a, b); }
-// CHECK-LABEL: @test_vpadal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_s32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+//
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) { return vpadal_s32(a, b); }
-// CHECK-LABEL: @test_vpadal_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_u8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[B]])
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) { return vpadal_u8(a, b); }
-// CHECK-LABEL: @test_vpadal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vpadal_u16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) { return vpadal_u16(a, b); }
-// CHECK-LABEL: @test_vpadal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_u32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+//
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) { return vpadal_u32(a, b); }
-// CHECK-LABEL: @test_vpadalq_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[B]])
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) { return vpadalq_s8(a, b); }
-// CHECK-LABEL: @test_vpadalq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) { return vpadalq_s16(a, b); }
-// CHECK-LABEL: @test_vpadalq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+//
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) { return vpadalq_s32(a, b); }
-// CHECK-LABEL: @test_vpadalq_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b)
-// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a
-// CHECK: ret <8 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[B]])
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) { return vpadalq_u8(a, b); }
-// CHECK-LABEL: @test_vpadalq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %b)
-// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) { return vpadalq_u16(a, b); }
-// CHECK-LABEL: @test_vpadalq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %b)
-// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+//
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) { return vpadalq_u32(a, b); }
-// CHECK-LABEL: @test_vqabs_s8(
-// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQABS_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQABS_V_I]]
+//
 int8x8_t test_vqabs_s8(int8x8_t a) { return vqabs_s8(a); }
-// CHECK-LABEL: @test_vqabsq_s8(
-// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQABSQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQABSQ_V_I]]
+//
 int8x16_t test_vqabsq_s8(int8x16_t a) { return vqabsq_s8(a); }
-// CHECK-LABEL: @test_vqabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %a)
-// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQABS_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]])
+// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vqabs_s16(int16x4_t a) { return vqabs_s16(a); }
-// CHECK-LABEL: @test_vqabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]])
+// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vqabsq_s16(int16x8_t a) { return vqabsq_s16(a); }
-// CHECK-LABEL: @test_vqabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %a)
-// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQABS_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]])
+// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vqabs_s32(int32x2_t a) { return vqabs_s32(a); }
-// CHECK-LABEL: @test_vqabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]])
+// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 int32x4_t test_vqabsq_s32(int32x4_t a) { return vqabsq_s32(a); }
-// CHECK-LABEL: @test_vqabsq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> %a)
-// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQABSQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqabsq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[VQABSQ_V_I]])
+// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+//
 int64x2_t test_vqabsq_s64(int64x2_t a) { return vqabsq_s64(a); }
-// CHECK-LABEL: @test_vqneg_s8(
-// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQNEG_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQNEG_V_I]]
+//
 int8x8_t test_vqneg_s8(int8x8_t a) { return vqneg_s8(a); }
-// CHECK-LABEL: @test_vqnegq_s8(
-// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQNEGQ_V_I]]
+//
 int8x16_t test_vqnegq_s8(int8x16_t a) { return vqnegq_s8(a); }
-// CHECK-LABEL: @test_vqneg_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]])
+// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vqneg_s16(int16x4_t a) { return vqneg_s16(a); }
-// CHECK-LABEL: @test_vqnegq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]])
+// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vqnegq_s16(int16x8_t a) { return vqnegq_s16(a); }
-// CHECK-LABEL: @test_vqneg_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]])
+// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vqneg_s32(int32x2_t a) { return vqneg_s32(a); }
-// CHECK-LABEL: @test_vqnegq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]])
+// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 int32x4_t test_vqnegq_s32(int32x4_t a) { return vqnegq_s32(a); }
-// CHECK-LABEL: @test_vqnegq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqnegq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[VQNEGQ_V_I]])
+// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP1]]
+//
 int64x2_t test_vqnegq_s64(int64x2_t a) { return vqnegq_s64(a); }
-// CHECK-LABEL: @test_vneg_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
 int8x8_t test_vneg_s8(int8x8_t a) { return vneg_s8(a); }
-// CHECK-LABEL: @test_vnegq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
 int8x16_t test_vnegq_s8(int8x16_t a) { return vnegq_s8(a); }
-// CHECK-LABEL: @test_vneg_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
 int16x4_t test_vneg_s16(int16x4_t a) { return vneg_s16(a); }
-// CHECK-LABEL: @test_vnegq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vnegq_s16(int16x8_t a) { return vnegq_s16(a); }
-// CHECK-LABEL: @test_vneg_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
 int32x2_t test_vneg_s32(int32x2_t a) { return vneg_s32(a); }
-// CHECK-LABEL: @test_vnegq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vnegq_s32(int32x4_t a) { return vnegq_s32(a); }
-// CHECK-LABEL: @test_vnegq_s64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vnegq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vnegq_s64(int64x2_t a) { return vnegq_s64(a); }
-// CHECK-LABEL: @test_vneg_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %a
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vneg_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[A]]
+// CHECK-NEXT: ret <2 x float> [[FNEG_I]]
+//
 float32x2_t test_vneg_f32(float32x2_t a) { return vneg_f32(a); }
-// CHECK-LABEL: @test_vnegq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %a
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vnegq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[A]]
+// CHECK-NEXT: ret <4 x float> [[FNEG_I]]
+//
 float32x4_t test_vnegq_f32(float32x4_t a) { return vnegq_f32(a); }
-// CHECK-LABEL: @test_vnegq_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x double> %a
-// CHECK: ret <2 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vnegq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[A]]
+// CHECK-NEXT: ret <2 x double> [[FNEG_I]]
+//
 float64x2_t test_vnegq_f64(float64x2_t a) { return vnegq_f64(a); }
-// CHECK-LABEL: @test_vabs_s8(
-// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VABS_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VABS_I]]
+//
 int8x8_t test_vabs_s8(int8x8_t a) { return vabs_s8(a); }
-// CHECK-LABEL: @test_vabsq_s8(
-// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VABS_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VABS_I]]
+//
 int8x16_t test_vabsq_s8(int8x16_t a) { return vabsq_s8(a); }
-// CHECK-LABEL: @test_vabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %a)
-// CHECK: ret <4 x i16> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]])
+// CHECK-NEXT: ret <4 x i16> [[VABS1_I]]
+//
 int16x4_t test_vabs_s16(int16x4_t a) { return vabs_s16(a); }
-// CHECK-LABEL: @test_vabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %a)
-// CHECK: ret <8 x i16> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]])
+// CHECK-NEXT: ret <8 x i16> [[VABS1_I]]
+//
 int16x8_t test_vabsq_s16(int16x8_t a) { return vabsq_s16(a); }
-// CHECK-LABEL: @test_vabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]])
+// CHECK-NEXT: ret <2 x i32> [[VABS1_I]]
+//
 int32x2_t test_vabs_s32(int32x2_t a) { return vabs_s32(a); }
-// CHECK-LABEL: @test_vabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]])
+// CHECK-NEXT: ret <4 x i32> [[VABS1_I]]
+//
 int32x4_t test_vabsq_s32(int32x4_t a) { return vabsq_s32(a); }
-// CHECK-LABEL: @test_vabsq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> %a)
-// CHECK: ret <2 x i64> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabsq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[VABS_I]])
+// CHECK-NEXT: ret <2 x i64> [[VABS1_I]]
+//
 int64x2_t test_vabsq_s64(int64x2_t a) { return vabsq_s64(a); }
-// CHECK-LABEL: @test_vabs_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vabs_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]])
+// CHECK-NEXT: ret <2 x float> [[VABS1_I]]
+//
 float32x2_t test_vabs_f32(float32x2_t a) { return vabs_f32(a); }
-// CHECK-LABEL: @test_vabsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vabsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]])
+// CHECK-NEXT: ret <4 x float> [[VABS1_I]]
+//
 float32x4_t test_vabsq_f32(float32x4_t a) { return vabsq_f32(a); }
-// CHECK-LABEL: @test_vabsq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VABS1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vabsq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[VABS_I]])
+// CHECK-NEXT: ret <2 x double> [[VABS1_I]]
+//
 float64x2_t test_vabsq_f64(float64x2_t a) { return vabsq_f64(a); }
-// CHECK-LABEL: @test_vuqadd_s8(
-// CHECK: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VUQADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuqadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VUQADD_I]]
+//
 int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) { return vuqadd_s8(a, b); }
-// CHECK-LABEL: @test_vuqaddq_s8(
-// CHECK: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VUQADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuqaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VUQADD_I]]
+//
 int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) { return vuqaddq_s8(a, b); }
-// CHECK-LABEL: @test_vuqadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i16> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuqadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[VUQADD_I]], <4 x i16> [[VUQADD1_I]])
+// CHECK-NEXT: ret <4 x i16> [[VUQADD2_I]]
+//
 int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) { return vuqadd_s16(a, b); }
-// CHECK-LABEL: @test_vuqaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i16> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuqaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[VUQADD_I]], <8 x i16> [[VUQADD1_I]])
+// CHECK-NEXT: ret <8 x i16> [[VUQADD2_I]]
+//
 int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) { return vuqaddq_s16(a, b); }
-// CHECK-LABEL: @test_vuqadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i32> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuqadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[VUQADD_I]], <2 x i32> [[VUQADD1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VUQADD2_I]]
+//
 int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) { return vuqadd_s32(a, b); }
-// CHECK-LABEL: @test_vuqaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: ret <4 x i32> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuqaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]]
to <4 x i32> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[VUQADD_I]], <4 x i32> [[VUQADD1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VUQADD2_I]] +// int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) { return vuqaddq_s32(a, b); } -// CHECK-LABEL: @test_vuqaddq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: ret <2 x i64> [[VUQADD2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuqaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[VUQADD_I]], <2 x i64> [[VUQADD1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VUQADD2_I]] +// int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) { return vuqaddq_s64(a, b); } -// CHECK-LABEL: @test_vcls_s8( -// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCLS_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcls_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]] +// int8x8_t test_vcls_s8(int8x8_t a) { return vcls_s8(a); } -// CHECK-LABEL: @test_vcls_u8( -// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCLS_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcls_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]] +// int8x8_t test_vcls_u8(uint8x8_t a) { return vcls_u8(a); } -// CHECK-LABEL: @test_vclsq_s8( -// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCLSQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclsq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]] +// int8x16_t test_vclsq_s8(int8x16_t a) { return vclsq_s8(a); } -// CHECK-LABEL: @test_vclsq_u8( -// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCLSQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclsq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]] +// int8x16_t test_vclsq_u8(uint8x16_t a) { return vclsq_u8(a); } -// CHECK-LABEL: @test_vcls_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.cls.v4i16(<4 x i16> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcls_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) +// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// int16x4_t test_vcls_s16(int16x4_t a) { return vcls_s16(a); } -// CHECK-LABEL: @test_vcls_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcls_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) +// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// int16x4_t test_vcls_u16(uint16x4_t a) { return vcls_u16(a); } -// CHECK-LABEL: @test_vclsq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclsq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) +// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// int16x8_t test_vclsq_s16(int16x8_t a) { return vclsq_s16(a); } -// CHECK-LABEL: @test_vclsq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclsq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) +// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x 
i8> [[VCLSQ_V2_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// int16x8_t test_vclsq_u16(uint16x8_t a) { return vclsq_u16(a); } -// CHECK-LABEL: @test_vcls_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcls_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) +// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// int32x2_t test_vcls_s32(int32x2_t a) { return vcls_s32(a); } -// CHECK-LABEL: @test_vcls_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcls_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) +// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// int32x2_t test_vcls_u32(uint32x2_t a) { return vcls_u32(a); } -// CHECK-LABEL: @test_vclsq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclsq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) +// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// int32x4_t test_vclsq_s32(int32x4_t a) { return vclsq_s32(a); } -// CHECK-LABEL: @test_vclsq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclsq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] 
= bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) +// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// int32x4_t test_vclsq_u32(uint32x4_t a) { return vclsq_u32(a); } -// CHECK-LABEL: @test_vclz_s8( -// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) -// CHECK: ret <8 x i8> [[VCLZ_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclz_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]] +// int8x8_t test_vclz_s8(int8x8_t a) { return vclz_s8(a); } -// CHECK-LABEL: @test_vclzq_s8( -// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) -// CHECK: ret <16 x i8> [[VCLZQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclzq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]] +// int8x16_t test_vclzq_s8(int8x16_t a) { return vclzq_s8(a); } -// CHECK-LABEL: @test_vclz_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclz_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// int16x4_t test_vclz_s16(int16x4_t a) { return vclz_s16(a); } -// CHECK-LABEL: @test_vclzq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclzq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// int16x8_t test_vclzq_s16(int16x8_t a) { return vclzq_s16(a); } -// CHECK-LABEL: @test_vclz_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> 
[[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclz_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// int32x2_t test_vclz_s32(int32x2_t a) { return vclz_s32(a); } -// CHECK-LABEL: @test_vclzq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclzq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// int32x4_t test_vclzq_s32(int32x4_t a) { return vclzq_s32(a); } -// CHECK-LABEL: @test_vclz_u8( -// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) -// CHECK: ret <8 x i8> [[VCLZ_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclz_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]] +// uint8x8_t test_vclz_u8(uint8x8_t a) { return vclz_u8(a); } -// CHECK-LABEL: @test_vclzq_u8( -// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) -// CHECK: ret <16 x i8> [[VCLZQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclzq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]] +// uint8x16_t test_vclzq_u8(uint8x16_t a) { return vclzq_u8(a); } -// CHECK-LABEL: @test_vclz_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclz_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> 
[[TMP1]] +// uint16x4_t test_vclz_u16(uint16x4_t a) { return vclz_u16(a); } -// CHECK-LABEL: @test_vclzq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclzq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// uint16x8_t test_vclzq_u16(uint16x8_t a) { return vclzq_u16(a); } -// CHECK-LABEL: @test_vclz_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclz_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// uint32x2_t test_vclz_u32(uint32x2_t a) { return vclz_u32(a); } -// CHECK-LABEL: @test_vclzq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclzq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) +// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// uint32x4_t test_vclzq_u32(uint32x4_t a) { return vclzq_u32(a); } -// CHECK-LABEL: @test_vcnt_s8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// int8x8_t test_vcnt_s8(int8x8_t a) { return vcnt_s8(a); } -// CHECK-LABEL: @test_vcntq_s8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 
x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// int8x16_t test_vcntq_s8(int8x16_t a) { return vcntq_s8(a); } -// CHECK-LABEL: @test_vcnt_u8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// uint8x8_t test_vcnt_u8(uint8x8_t a) { return vcnt_u8(a); } -// CHECK-LABEL: @test_vcntq_u8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// uint8x16_t test_vcntq_u8(uint8x16_t a) { return vcntq_u8(a); } -// CHECK-LABEL: @test_vcnt_p8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// poly8x8_t test_vcnt_p8(poly8x8_t a) { return vcnt_p8(a); } -// CHECK-LABEL: @test_vcntq_p8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// poly8x16_t test_vcntq_p8(poly8x16_t a) { return vcntq_p8(a); } -// CHECK-LABEL: @test_vmvn_s8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// int8x8_t test_vmvn_s8(int8x8_t a) { return vmvn_s8(a); } -// CHECK-LABEL: @test_vmvnq_s8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// int8x16_t test_vmvnq_s8(int8x16_t a) { return vmvnq_s8(a); } -// CHECK-LABEL: @test_vmvn_s16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1) -// CHECK: ret <4 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmvn_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <4 x 
i16> [[NOT_I]] +// int16x4_t test_vmvn_s16(int16x4_t a) { return vmvn_s16(a); } -// CHECK-LABEL: @test_vmvnq_s16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1) -// CHECK: ret <8 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmvnq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <8 x i16> [[NOT_I]] +// int16x8_t test_vmvnq_s16(int16x8_t a) { return vmvnq_s16(a); } -// CHECK-LABEL: @test_vmvn_s32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1) -// CHECK: ret <2 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmvn_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <2 x i32> [[NOT_I]] +// int32x2_t test_vmvn_s32(int32x2_t a) { return vmvn_s32(a); } -// CHECK-LABEL: @test_vmvnq_s32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1) -// CHECK: ret <4 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmvnq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <4 x i32> [[NOT_I]] +// int32x4_t test_vmvnq_s32(int32x4_t a) { return vmvnq_s32(a); } -// CHECK-LABEL: @test_vmvn_u8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// uint8x8_t test_vmvn_u8(uint8x8_t a) { return vmvn_u8(a); } -// CHECK-LABEL: @test_vmvnq_u8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// uint8x16_t test_vmvnq_u8(uint8x16_t a) { return vmvnq_u8(a); } -// CHECK-LABEL: @test_vmvn_u16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1) -// CHECK: ret <4 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmvn_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <4 x i16> [[NOT_I]] +// uint16x4_t test_vmvn_u16(uint16x4_t a) { return vmvn_u16(a); } -// CHECK-LABEL: @test_vmvnq_u16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1) -// CHECK: ret <8 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmvnq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <8 x i16> [[NOT_I]] +// uint16x8_t test_vmvnq_u16(uint16x8_t a) { return vmvnq_u16(a); } -// CHECK-LABEL: @test_vmvn_u32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1) -// CHECK: ret <2 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmvn_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> 
[[A]], splat (i32 -1) +// CHECK-NEXT: ret <2 x i32> [[NOT_I]] +// uint32x2_t test_vmvn_u32(uint32x2_t a) { return vmvn_u32(a); } -// CHECK-LABEL: @test_vmvnq_u32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1) -// CHECK: ret <4 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmvnq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <4 x i32> [[NOT_I]] +// uint32x4_t test_vmvnq_u32(uint32x4_t a) { return vmvnq_u32(a); } -// CHECK-LABEL: @test_vmvn_p8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// poly8x8_t test_vmvn_p8(poly8x8_t a) { return vmvn_p8(a); } -// CHECK-LABEL: @test_vmvnq_p8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// poly8x16_t test_vmvnq_p8(poly8x16_t a) { return vmvnq_p8(a); } -// CHECK-LABEL: @test_vrbit_s8( -// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VRBIT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]] +// int8x8_t test_vrbit_s8(int8x8_t a) { return vrbit_s8(a); } -// CHECK-LABEL: @test_vrbitq_s8( -// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VRBIT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]] +// int8x16_t test_vrbitq_s8(int8x16_t a) { return vrbitq_s8(a); } -// CHECK-LABEL: @test_vrbit_u8( -// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VRBIT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]] +// uint8x8_t test_vrbit_u8(uint8x8_t a) { return vrbit_u8(a); } -// CHECK-LABEL: @test_vrbitq_u8( -// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VRBIT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]] +// uint8x16_t test_vrbitq_u8(uint8x16_t a) { return vrbitq_u8(a); } -// CHECK-LABEL: @test_vrbit_p8( -// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a) -// 
CHECK: ret <8 x i8> [[VRBIT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]] +// poly8x8_t test_vrbit_p8(poly8x8_t a) { return vrbit_p8(a); } -// CHECK-LABEL: @test_vrbitq_p8( -// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VRBIT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]] +// poly8x16_t test_vrbitq_p8(poly8x16_t a) { return vrbitq_p8(a); } -// CHECK-LABEL: @test_vmovn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[VMOVN_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmovn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]] +// int8x8_t test_vmovn_s16(int16x8_t a) { return vmovn_s16(a); } -// CHECK-LABEL: @test_vmovn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VMOVN_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmovn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]] +// int16x4_t test_vmovn_s32(int32x4_t a) { return vmovn_s32(a); } -// CHECK-LABEL: @test_vmovn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[VMOVN_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmovn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]] +// int32x2_t test_vmovn_s64(int64x2_t a) { return vmovn_s64(a); } -// CHECK-LABEL: @test_vmovn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[VMOVN_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmovn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]] +// uint8x8_t test_vmovn_u16(uint16x8_t a) { return vmovn_u16(a); } -// CHECK-LABEL: @test_vmovn_u32( -// CHECK: 
[[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VMOVN_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmovn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]] +// uint16x4_t test_vmovn_u32(uint32x4_t a) { return vmovn_u32(a); } -// CHECK-LABEL: @test_vmovn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[VMOVN_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmovn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]] +// uint32x2_t test_vmovn_u64(uint64x2_t a) { return vmovn_u64(a); } -// CHECK-LABEL: @test_vmovn_high_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmovn_high_s16( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VMOVN_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) { return vmovn_high_s16(a, b); } -// CHECK-LABEL: @test_vmovn_high_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmovn_high_s32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VMOVN_I_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) { return vmovn_high_s32(a, b); } -// CHECK-LABEL: @test_vmovn_high_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmovn_high_s64( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], 
<2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VMOVN_I_I]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) { return vmovn_high_s64(a, b); } -// CHECK-LABEL: @test_vmovn_high_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmovn_high_u16( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VMOVN_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) { return vmovn_high_u16(a, b); } -// CHECK-LABEL: @test_vmovn_high_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmovn_high_u32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VMOVN_I_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) { return vmovn_high_u32(a, b); } -// CHECK-LABEL: @test_vmovn_high_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmovn_high_u64( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VMOVN_I_I]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) { return vmovn_high_u64(a, b); } -// CHECK-LABEL: @test_vqmovun_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %a) -// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]] +// 
CHECK-LABEL: define dso_local <8 x i8> @test_vqmovun_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) +// CHECK-NEXT: ret <8 x i8> [[VQMOVUN_V1_I]] +// int8x8_t test_vqmovun_s16(int16x8_t a) { return vqmovun_s16(a); } -// CHECK-LABEL: @test_vqmovun_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a) -// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovun_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) +// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// int16x4_t test_vqmovun_s32(int32x4_t a) { return vqmovun_s32(a); } -// CHECK-LABEL: @test_vqmovun_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %a) -// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovun_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) +// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// int32x2_t test_vqmovun_s64(int64x2_t a) { return vqmovun_s64(a); } -// CHECK-LABEL: @test_vqmovun_high_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %b) -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovun_high_s16( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I_I]]) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vqmovun_high_s16(uint8x8_t a, int16x8_t b) { return vqmovun_high_s16(a, b); } -// 
CHECK-LABEL: @test_vqmovun_high_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %b) -// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVUN_V1_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovun_high_s32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I_I]]) +// CHECK-NEXT: [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vqmovun_high_s32(uint16x4_t a, int32x4_t b) { return vqmovun_high_s32(a, b); } -// CHECK-LABEL: @test_vqmovun_high_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %b) -// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVUN_V1_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovun_high_s64( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I_I]]) +// CHECK-NEXT: [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vqmovun_high_s64(uint32x2_t a, int64x2_t b) { return vqmovun_high_s64(a, b); } -// CHECK-LABEL: @test_vqmovn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %a) -// CHECK: ret <8 x i8> [[VQMOVN_V1_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) +// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]] +// int8x8_t test_vqmovn_s16(int16x8_t a) { return vqmovn_s16(a); } -// CHECK-LABEL: @test_vqmovn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a) -// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> 
[[VQMOVN_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQMOVN_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// int16x4_t test_vqmovn_s32(int32x4_t a) { return vqmovn_s32(a); } -// CHECK-LABEL: @test_vqmovn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %a) -// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQMOVN_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// int32x2_t test_vqmovn_s64(int64x2_t a) { return vqmovn_s64(a); } -// CHECK-LABEL: @test_vqmovn_high_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %b) -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovn_high_s16( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) { return vqmovn_high_s16(a, b); } -// CHECK-LABEL: @test_vqmovn_high_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %b) -// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovn_high_s32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) { return vqmovn_high_s32(a, b); } -// CHECK-LABEL: @test_vqmovn_high_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %b) -// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovn_high_s64( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) { return vqmovn_high_s64(a, b); } -// CHECK-LABEL: @test_vqmovn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a) -// CHECK: ret <8 x i8> [[VQMOVN_V1_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) +// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]] +// uint8x8_t test_vqmovn_u16(uint16x8_t a) { return vqmovn_u16(a); } -// CHECK-LABEL: @test_vqmovn_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %a) -// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQMOVN_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// uint16x4_t test_vqmovn_u32(uint32x4_t a) { return vqmovn_u32(a); } -// CHECK-LABEL: @test_vqmovn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// 
CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %a) -// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQMOVN_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// uint32x2_t test_vqmovn_u64(uint64x2_t a) { return vqmovn_u64(a); } -// CHECK-LABEL: @test_vqmovn_high_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %b) -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovn_high_u16( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) { return vqmovn_high_u16(a, b); } -// CHECK-LABEL: @test_vqmovn_high_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %b) -// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovn_high_u32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) { return vqmovn_high_u32(a, b); } -// CHECK-LABEL: @test_vqmovn_high_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %b) -// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32> 
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovn_high_u64( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) +// CHECK-NEXT: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) { return vqmovn_high_u64(a, b); } -// CHECK-LABEL: @test_vshll_n_s8( -// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 8); } -// CHECK-LABEL: @test_vshll_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 16); } -// CHECK-LABEL: @test_vshll_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 32); } -// CHECK-LABEL: @test_vshll_n_u8( -// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 8); } -// CHECK-LABEL: @test_vshll_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 16); } -// CHECK-LABEL: @test_vshll_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 32); } -// CHECK-LABEL: @test_vshll_high_n_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// int16x8_t test_vshll_high_n_s8(int8x16_t a) { return vshll_high_n_s8(a, 8); } -// CHECK-LABEL: @test_vshll_high_n_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// int32x4_t test_vshll_high_n_s16(int16x8_t a) { return vshll_high_n_s16(a, 16); } -// CHECK-LABEL: @test_vshll_high_n_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// int64x2_t test_vshll_high_n_s32(int32x4_t a) { return vshll_high_n_s32(a, 32); } -// CHECK-LABEL: @test_vshll_high_n_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { return vshll_high_n_u8(a, 8); } -// CHECK-LABEL: @test_vshll_high_n_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// uint32x4_t test_vshll_high_n_u16(uint16x8_t a) { return vshll_high_n_u16(a, 16); } -// CHECK-LABEL: @test_vshll_high_n_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> 
[[SHUFFLE_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { return vshll_high_n_u32(a, 32); } -// CHECK-LABEL: @test_vcvt_f16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) -// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half> -// CHECK: ret <4 x half> [[TMP1]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) +// CHECK-NEXT: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vcvt_f16_f32(float32x4_t a) { return vcvt_f16_f32(a); } -// CHECK-LABEL: @test_vcvt_high_f16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %b) -// CHECK: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> -// CHECK: ret <8 x half> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvt_high_f16_f32( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_F16_F32_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I_I]]) +// CHECK-NEXT: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> 
[[TMP3]], <8 x i32> +// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]] +// float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) { return vcvt_high_f16_f32(a, b); } -// CHECK-LABEL: @test_vcvt_f32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = fptrunc <2 x double> %a to <2 x float> -// CHECK: ret <2 x float> [[VCVT_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVT_I:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[VCVT_I]] +// float32x2_t test_vcvt_f32_f64(float64x2_t a) { return vcvt_f32_f64(a); } -// CHECK-LABEL: @test_vcvt_high_f32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VCVT_I_I:%.*]] = fptrunc <2 x double> %b to <2 x float> -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f64( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[VCVT_I_I]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) { return vcvt_high_f32_f64(a, b); } -// CHECK-LABEL: @test_vcvtx_f32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) -// CHECK: ret <2 x float> [[VCVTX_F32_V1_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vcvtx_f32_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTX_F32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I]]) +// CHECK-NEXT: ret <2 x float> [[VCVTX_F32_V1_I]] +// float32x2_t test_vcvtx_f32_f64(float64x2_t a) { return vcvtx_f32_f64(a); } -// CHECK-LABEL: @test_vcvtx_high_f32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvtx_high_f32_f64( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B]] to <2 x i64> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTX_F32_V_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I_I]]) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { return vcvtx_high_f32_f64(a, b); } -// CHECK-LABEL: @test_vcvt_f32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) -// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VCVT_F32_F161_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_f32_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) +// CHECK-NEXT: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// float32x4_t test_vcvt_f32_f16(float16x4_t a) { return vcvt_f32_f16(a); } -// CHECK-LABEL: @test_vcvt_high_f32_f16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) -// CHECK: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VCVT_F32_F161_I_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) +// CHECK-NEXT: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// float32x4_t test_vcvt_high_f32_f16(float16x8_t a) { return vcvt_high_f32_f16(a); } -// CHECK-LABEL: @test_vcvt_f64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = fpext <2 x float> %a to <2 x double> -// CHECK: ret <2 x double> [[VCVT_I]] 
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvt_f64_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVT_I:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[VCVT_I]] +// float64x2_t test_vcvt_f64_f32(float32x2_t a) { return vcvt_f64_f32(a); } -// CHECK-LABEL: @test_vcvt_high_f64_f32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[VCVT_I_I:%.*]] = fpext <2 x float> [[SHUFFLE_I_I]] to <2 x double> -// CHECK: ret <2 x double> [[VCVT_I_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vcvt_high_f64_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVT_I_I:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[VCVT_I_I]] +// float64x2_t test_vcvt_high_f64_f32(float32x4_t a) { return vcvt_high_f64_f32(a); } -// CHECK-LABEL: @test_vrndnq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRNDN1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrndnq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> [[VRNDN_I]]) +// CHECK-NEXT: ret <2 x double> [[VRNDN1_I]] +// float64x2_t test_vrndnq_f64(float64x2_t a) { return vrndnq_f64(a); } -// CHECK-LABEL: @test_vrndaq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRNDA1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrndaq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]]) +// CHECK-NEXT: ret <2 x double> [[VRNDA1_I]] +// float64x2_t test_vrndaq_f64(float64x2_t a) { return vrndaq_f64(a); } -// CHECK-LABEL: @test_vrndpq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRNDP1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrndpq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]]) +// CHECK-NEXT: ret <2 x double> [[VRNDP1_I]] +// float64x2_t test_vrndpq_f64(float64x2_t a) { return vrndpq_f64(a); } -// CHECK-LABEL: @test_vrndmq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRNDM1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrndmq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]]) +// CHECK-NEXT: ret <2 x double> [[VRNDM1_I]] +// float64x2_t test_vrndmq_f64(float64x2_t a) { return vrndmq_f64(a); } -// CHECK-LABEL: @test_vrndxq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRNDX1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrndxq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]]) +// CHECK-NEXT: ret <2 x double> [[VRNDX1_I]] +// float64x2_t test_vrndxq_f64(float64x2_t a) { return vrndxq_f64(a); } -// CHECK-LABEL: @test_vrndq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRNDZ1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrndq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]]) +// CHECK-NEXT: ret <2 x double> [[VRNDZ1_I]] +// float64x2_t test_vrndq_f64(float64x2_t a) { return vrndq_f64(a); } -// CHECK-LABEL: @test_vrndiq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRNDI1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrndiq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDIQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = 
call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[VRNDIQ_V_I]]) +// CHECK-NEXT: ret <2 x double> [[VRNDIQ_V1_I]] +// float64x2_t test_vrndiq_f64(float64x2_t a) { return vrndiq_f64(a); } -// CHECK-LABEL: @test_vcvt_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> [[VCVTZ_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]] +// int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); } -// CHECK-LABEL: @test_vcvtq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[VCVTZ_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]] +// int32x4_t test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); } -// CHECK-LABEL: @test_vcvtq_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[TMP1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_s64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> [[VCVTZ_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]] +// int64x2_t test_vcvtq_s64_f64(float64x2_t a) { return vcvtq_s64_f64(a); } -// CHECK-LABEL: @test_vcvt_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[TMP1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> [[VCVTZ_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]] +// uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); } -// CHECK-LABEL: 
@test_vcvtq_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[TMP1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[VCVTZ_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]] +// uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); } -// CHECK-LABEL: @test_vcvtq_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[TMP1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_u64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> [[VCVTZ_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]] +// uint64x2_t test_vcvtq_u64_f64(float64x2_t a) { return vcvtq_u64_f64(a); } -// CHECK-LABEL: @test_vcvtn_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtn_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[VCVTN_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTN1_I]] +// int32x2_t test_vcvtn_s32_f32(float32x2_t a) { return vcvtn_s32_f32(a); } -// CHECK-LABEL: @test_vcvtnq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtnq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[VCVTN_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTN1_I]] +// int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { return vcvtnq_s32_f32(a); } -// CHECK-LABEL: @test_vcvtnq_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) -// 
CHECK: ret <2 x i64> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtnq_s64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[VCVTN_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTN1_I]] +// int64x2_t test_vcvtnq_s64_f64(float64x2_t a) { return vcvtnq_s64_f64(a); } -// CHECK-LABEL: @test_vcvtn_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtn_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTN1_I]] +// uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { return vcvtn_u32_f32(a); } -// CHECK-LABEL: @test_vcvtnq_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtnq_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[VCVTN_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTN1_I]] +// uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { return vcvtnq_u32_f32(a); } -// CHECK-LABEL: @test_vcvtnq_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtnq_u64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[VCVTN_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTN1_I]] +// uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) { return vcvtnq_u64_f64(a); } -// CHECK-LABEL: @test_vcvtp_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtp_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[VCVTP_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTP1_I]] +// int32x2_t test_vcvtp_s32_f32(float32x2_t a) { return vcvtp_s32_f32(a); } -// CHECK-LABEL: @test_vcvtpq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtpq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[VCVTP_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTP1_I]] +// int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { return vcvtpq_s32_f32(a); } -// CHECK-LABEL: @test_vcvtpq_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtpq_s64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[VCVTP_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTP1_I]] +// int64x2_t test_vcvtpq_s64_f64(float64x2_t a) { return vcvtpq_s64_f64(a); } -// CHECK-LABEL: @test_vcvtp_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtp_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTP1_I]] +// uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { return vcvtp_u32_f32(a); } -// CHECK-LABEL: @test_vcvtpq_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtpq_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// 
CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[VCVTP_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTP1_I]] +// uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { return vcvtpq_u32_f32(a); } -// CHECK-LABEL: @test_vcvtpq_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtpq_u64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[VCVTP_I]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTP1_I]] +// uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) { return vcvtpq_u64_f64(a); } -// CHECK-LABEL: @test_vcvtm_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtm_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[VCVTM_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTM1_I]] +// int32x2_t test_vcvtm_s32_f32(float32x2_t a) { return vcvtm_s32_f32(a); } -// CHECK-LABEL: @test_vcvtmq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtmq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[VCVTM_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTM1_I]] +// int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { return vcvtmq_s32_f32(a); } -// CHECK-LABEL: @test_vcvtmq_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtmq_s64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x 
double> [[VCVTM_I]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTM1_I]]
+//
 int64x2_t test_vcvtmq_s64_f64(float64x2_t a) {
   return vcvtmq_s64_f64(a);
 }
 
-// CHECK-LABEL: @test_vcvtm_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtm_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_I]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTM1_I]]
+//
 uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
   return vcvtm_u32_f32(a);
 }
 
-// CHECK-LABEL: @test_vcvtmq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtmq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[VCVTM_I]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTM1_I]]
+//
 uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
   return vcvtmq_u32_f32(a);
 }
 
-// CHECK-LABEL: @test_vcvtmq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtmq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[VCVTM_I]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTM1_I]]
+//
 uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) {
   return vcvtmq_u64_f64(a);
 }
 
-// CHECK-LABEL: @test_vcvta_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvta_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[VCVTA_I]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTA1_I]]
+//
 int32x2_t test_vcvta_s32_f32(float32x2_t a) {
   return vcvta_s32_f32(a);
 }
 
-// CHECK-LABEL: @test_vcvtaq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtaq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[VCVTA_I]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTA1_I]]
+//
 int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
   return vcvtaq_s32_f32(a);
 }
 
-// CHECK-LABEL: @test_vcvtaq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtaq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[VCVTA_I]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTA1_I]]
+//
 int64x2_t test_vcvtaq_s64_f64(float64x2_t a) {
   return vcvtaq_s64_f64(a);
 }
 
-// CHECK-LABEL: @test_vcvta_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvta_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[VCVTA_I]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTA1_I]]
+//
 uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
   return vcvta_u32_f32(a);
 }
 
-// CHECK-LABEL: @test_vcvtaq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtaq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[VCVTA_I]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTA1_I]]
+//
 uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
   return vcvtaq_u32_f32(a);
 }
 
-// CHECK-LABEL: @test_vcvtaq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTA1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtaq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[VCVTA_I]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTA1_I]]
+//
 uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) {
   return vcvtaq_u64_f64(a);
 }
 
-// CHECK-LABEL: @test_vrsqrte_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrsqrte_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]])
+// CHECK-NEXT: ret <2 x float> [[VRSQRTE_V1_I]]
+//
 float32x2_t test_vrsqrte_f32(float32x2_t a) {
   return vrsqrte_f32(a);
 }
 
-// CHECK-LABEL: @test_vrsqrteq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrsqrteq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]])
+// CHECK-NEXT: ret <4 x float> [[VRSQRTEQ_V1_I]]
+//
 float32x4_t test_vrsqrteq_f32(float32x4_t a) {
   return vrsqrteq_f32(a);
 }
 
-// CHECK-LABEL: @test_vrsqrteq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrsqrteq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[VRSQRTEQ_V_I]])
+// CHECK-NEXT: ret <2 x double> [[VRSQRTEQ_V1_I]]
+//
 float64x2_t test_vrsqrteq_f64(float64x2_t a) {
   return vrsqrteq_f64(a);
 }
 
-// CHECK-LABEL: @test_vrecpe_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRECPE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrecpe_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]])
+// CHECK-NEXT: ret <2 x float> [[VRECPE_V1_I]]
+//
 float32x2_t test_vrecpe_f32(float32x2_t a) {
   return vrecpe_f32(a);
 }
 
-// CHECK-LABEL: @test_vrecpeq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrecpeq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]])
+// CHECK-NEXT: ret <4 x float> [[VRECPEQ_V1_I]]
+//
 float32x4_t test_vrecpeq_f32(float32x4_t a) {
   return vrecpeq_f32(a);
 }
 
-// CHECK-LABEL: @test_vrecpeq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrecpeq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[VRECPEQ_V_I]])
+// CHECK-NEXT: ret <2 x double> [[VRECPEQ_V1_I]]
+//
 float64x2_t test_vrecpeq_f64(float64x2_t a) {
   return vrecpeq_f64(a);
 }
 
-// CHECK-LABEL: @test_vrecpe_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrecpe_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]])
+// CHECK-NEXT: ret <2 x i32> [[VRECPE_V1_I]]
+//
 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
   return vrecpe_u32(a);
 }
 
-// CHECK-LABEL: @test_vrecpeq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrecpeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]])
+// CHECK-NEXT: ret <4 x i32> [[VRECPEQ_V1_I]]
+//
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
   return vrecpeq_u32(a);
 }
 
-// CHECK-LABEL: @test_vsqrt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vsqrt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[TMP2]])
+// CHECK-NEXT: ret <2 x float> [[VSQRT_I]]
+//
 float32x2_t test_vsqrt_f32(float32x2_t a) {
   return vsqrt_f32(a);
 }
 
-// CHECK-LABEL: @test_vsqrtq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]])
+// CHECK-NEXT: ret <4 x float> [[VSQRT_I]]
+//
 float32x4_t test_vsqrtq_f32(float32x4_t a) {
   return vsqrtq_f32(a);
 }
 
-// CHECK-LABEL: @test_vsqrtq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VSQRT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]])
+// CHECK-NEXT: ret <2 x double> [[VSQRT_I]]
+//
 float64x2_t test_vsqrtq_f64(float64x2_t a) {
   return vsqrtq_f64(a);
 }
 
-// CHECK-LABEL: @test_vcvt_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
 float32x2_t test_vcvt_f32_s32(int32x2_t a) {
   return vcvt_f32_s32(a);
 }
 
-// CHECK-LABEL: @test_vcvt_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
   return vcvt_f32_u32(a);
 }
 
-// CHECK-LABEL: @test_vcvtq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
   return vcvtq_f32_s32(a);
 }
 
-// CHECK-LABEL: @test_vcvtq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
   return vcvtq_f32_u32(a);
 }
 
-// CHECK-LABEL: @test_vcvtq_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_f64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I]]
+//
 float64x2_t test_vcvtq_f64_s64(int64x2_t a) {
   return vcvtq_f64_s64(a);
 }
 
-// CHECK-LABEL: @test_vcvtq_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_f64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I]]
+//
 float64x2_t test_vcvtq_f64_u64(uint64x2_t a) {
   return vcvtq_f64_u64(a);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-perm.c b/clang/test/CodeGen/AArch64/neon-perm.c
index 1ffbbd5d9bc42..61b24c55a39a3 100644
--- a/clang/test/CodeGen/AArch64/neon-perm.c
+++ b/clang/test/CodeGen/AArch64/neon-perm.c
@@ -1,1932 +1,2395 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: @test_vuzp1_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) {
   return vuzp1_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) {
   return vuzp1q_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) {
   return vuzp1_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) {
   return vuzp1q_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) {
   return vuzp1_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
   return vuzp1q_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) {
   return vuzp1q_s64(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) {
   return vuzp1_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) {
   return vuzp1q_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) {
   return vuzp1_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) {
   return vuzp1q_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) {
   return vuzp1_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
   return vuzp1q_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) {
   return vuzp1q_u64(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vuzp1_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) {
   return vuzp1_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vuzp1q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
   return vuzp1q_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vuzp1q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
 float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) {
   return vuzp1q_f64(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) {
   return vuzp1_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) {
   return vuzp1q_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) {
   return vuzp1_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) {
   return vuzp1q_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) {
   return vuzp2_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) {
   return vuzp2q_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) {
   return vuzp2_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) {
   return vuzp2q_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) {
   return vuzp2_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
   return vuzp2q_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) {
   return vuzp2q_s64(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) {
   return vuzp2_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) {
   return vuzp2q_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) {
   return vuzp2_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) {
   return vuzp2q_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) {
   return vuzp2_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
   return vuzp2q_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) {
   return vuzp2q_u64(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vuzp2_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) {
   return vuzp2_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vuzp2q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
   return vuzp2q_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vuzp2q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
 float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) {
   return vuzp2q_f64(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) {
   return vuzp2_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) {
   return vuzp2q_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) {
   return vuzp2_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) {
   return vuzp2q_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) {
   return vzip1_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) {
   return vzip1q_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) {
   return vzip1_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) {
   return vzip1q_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip1_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) {
   return vzip1_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip1q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) {
   return vzip1q_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) {
   return vzip1q_s64(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) {
   return vzip1_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) {
   return vzip1q_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) {
   return vzip1_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) {
   return vzip1q_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip1_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) {
   return vzip1_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip1q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) {
   return vzip1q_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) {
   return vzip1q_u64(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vzip1_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) {
   return vzip1_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vzip1q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) {
   return vzip1q_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vzip1q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
 float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) {
   return vzip1q_f64(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) {
   return vzip1_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) {
   return vzip1q_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) {
   return vzip1_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) {
   return vzip1q_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) {
   return vzip2_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) {
   return vzip2q_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) {
   return vzip2_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) {
   return vzip2q_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip2_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) {
   return vzip2_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip2q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) {
   return vzip2q_s32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) {
   return vzip2q_s64(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) {
   return vzip2_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) {
   return vzip2q_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) {
   return vzip2_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) {
   return vzip2q_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vzip2_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) {
   return vzip2_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vzip2q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) {
   return vzip2q_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) {
   return vzip2q_u64(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vzip2_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) {
   return vzip2_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vzip2q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) {
   return vzip2q_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vzip2q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
 float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) {
   return vzip2q_f64(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) {
   return vzip2_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) {
   return vzip2q_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) {
   return vzip2_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) {
   return vzip2q_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn1_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) {
   return vtrn1_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn1q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) {
   return vtrn1q_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn1_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) {
   return vtrn1_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn1q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) {
   return vtrn1q_s16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn1_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32>
%a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { return vtrn1_s32(a, b); } -// CHECK-LABEL: @test_vtrn1q_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { return vtrn1q_s32(a, b); } -// CHECK-LABEL: @test_vtrn1q_s64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { return vtrn1q_s64(a, b); } -// CHECK-LABEL: @test_vtrn1_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { return vtrn1_u8(a, b); } -// CHECK-LABEL: @test_vtrn1q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { return vtrn1q_u8(a, b); } -// CHECK-LABEL: @test_vtrn1_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { return vtrn1_u16(a, b); } -// CHECK-LABEL: @test_vtrn1q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { return vtrn1q_u16(a, b); } -// CHECK-LABEL: @test_vtrn1_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { return vtrn1_u32(a, b); } -// CHECK-LABEL: @test_vtrn1q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { return vtrn1q_u32(a, b); } -// CHECK-LABEL: @test_vtrn1q_u64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { return vtrn1q_u64(a, b); } -// CHECK-LABEL: @test_vtrn1_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vtrn1_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] +// float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { return vtrn1_f32(a, b); } -// CHECK-LABEL: @test_vtrn1q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vtrn1q_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { return vtrn1q_f32(a, b); } -// CHECK-LABEL: @test_vtrn1q_f64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> -// CHECK: ret <2 x double> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vtrn1q_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// 
CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] +// float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { return vtrn1q_f64(a, b); } -// CHECK-LABEL: @test_vtrn1_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { return vtrn1_p8(a, b); } -// CHECK-LABEL: @test_vtrn1q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { return vtrn1q_p8(a, b); } -// CHECK-LABEL: @test_vtrn1_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { return vtrn1_p16(a, b); } -// CHECK-LABEL: @test_vtrn1q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { return vtrn1q_p16(a, b); } -// CHECK-LABEL: @test_vtrn2_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { return vtrn2_s8(a, b); } -// CHECK-LABEL: @test_vtrn2q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { return vtrn2q_s8(a, b); } -// CHECK-LABEL: @test_vtrn2_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> 
%b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { return vtrn2_s16(a, b); } -// CHECK-LABEL: @test_vtrn2q_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { return vtrn2q_s16(a, b); } -// CHECK-LABEL: @test_vtrn2_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { return vtrn2_s32(a, b); } -// CHECK-LABEL: @test_vtrn2q_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { return vtrn2q_s32(a, b); } -// CHECK-LABEL: @test_vtrn2q_s64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { return vtrn2q_s64(a, b); } -// CHECK-LABEL: @test_vtrn2_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { return vtrn2_u8(a, b); } -// CHECK-LABEL: @test_vtrn2q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { return vtrn2q_u8(a, b); } -// CHECK-LABEL: @test_vtrn2_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { return vtrn2_u16(a, b); } -// CHECK-LABEL: @test_vtrn2q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { return vtrn2q_u16(a, b); } -// CHECK-LABEL: @test_vtrn2_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { return vtrn2_u32(a, b); } -// CHECK-LABEL: @test_vtrn2q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { return vtrn2q_u32(a, b); } -// CHECK-LABEL: @test_vtrn2q_u64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { return vtrn2q_u64(a, b); } -// CHECK-LABEL: @test_vtrn2_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vtrn2_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] +// float32x2_t 
test_vtrn2_f32(float32x2_t a, float32x2_t b) { return vtrn2_f32(a, b); } -// CHECK-LABEL: @test_vtrn2q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vtrn2q_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { return vtrn2q_f32(a, b); } -// CHECK-LABEL: @test_vtrn2q_f64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> -// CHECK: ret <2 x double> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vtrn2q_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] +// float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { return vtrn2q_f64(a, b); } -// CHECK-LABEL: @test_vtrn2_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { return vtrn2_p8(a, b); } -// CHECK-LABEL: @test_vtrn2q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { return vtrn2q_p8(a, b); } -// CHECK-LABEL: @test_vtrn2_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { return vtrn2_p16(a, b); } -// CHECK-LABEL: @test_vtrn2q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { return vtrn2q_p16(a, b); } -// CHECK-LABEL: @test_vuzp_s8( -// CHECK: [[RETVAL_I:%.*]] = alloca 
%struct.int8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vuzp_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { return vuzp_s8(a, b); } -// CHECK-LABEL: @test_vuzp_s16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vuzp_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, <4 x i16> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { return vuzp_s16(a, b); } -// CHECK-LABEL: @test_vuzp_s32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vuzp_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, <2 x i32> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue 
[[STRUCT_INT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { return vuzp_s32(a, b); } -// CHECK-LABEL: @test_vuzp_u8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vuzp_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { return vuzp_u8(a, b); } -// CHECK-LABEL: @test_vuzp_u16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: 
[[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vuzp_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, <4 x i16> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { return vuzp_u16(a, b); } -// CHECK-LABEL: @test_vuzp_u32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vuzp_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, <2 x i32> [[VUZP_I]], 
0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { return vuzp_u32(a, b); } -// CHECK-LABEL: @test_vuzp_f32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vuzp_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT3]], <2 x float> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT4]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, <2 x float> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[DOTFCA_1_EXTRACT]], 0, 1 +// 
CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { return vuzp_f32(a, b); } -// CHECK-LABEL: @test_vuzp_p8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vuzp_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { return vuzp_p8(a, b); } -// CHECK-LABEL: @test_vuzp_p16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vuzp_p16( +// CHECK-SAME: <4 x i16> 
noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, <4 x i16> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { return vuzp_p16(a, b); } -// CHECK-LABEL: @test_vuzpq_s8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 -// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16 -// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int8x16x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vuzpq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] 
[[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { return vuzpq_s8(a, b); } -// CHECK-LABEL: @test_vuzpq_s16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vuzpq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, <8 x i16> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT]] +// int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { return vuzpq_s16(a, b); } -// CHECK-LABEL: @test_vuzpq_s32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VUZP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load 
%struct.int32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vuzpq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, <4 x i32> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { return vuzpq_s32(a, b); }
-// CHECK-LABEL: @test_vuzpq_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vuzpq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { return vuzpq_u8(a, b); }
-// CHECK-LABEL: @test_vuzpq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vuzpq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, <8 x i16> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { return vuzpq_u16(a, b); }
-// CHECK-LABEL: @test_vuzpq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vuzpq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, <4 x i32> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { return vuzpq_u32(a, b); }
-// CHECK-LABEL: @test_vuzpq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vuzpq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT3]], <4 x float> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, <4 x float> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { return vuzpq_f32(a, b); }
-// CHECK-LABEL: @test_vuzpq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vuzpq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { return vuzpq_p8(a, b); }
-// CHECK-LABEL: @test_vuzpq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vuzpq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, <8 x i16> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { return vuzpq_p16(a, b); }
-// CHECK-LABEL: @test_vzip_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vzip_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, <8 x i8> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { return vzip_s8(a, b); }
-// CHECK-LABEL: @test_vzip_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vzip_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, <4 x i16> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { return vzip_s16(a, b); }
-// CHECK-LABEL: @test_vzip_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vzip_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, <2 x i32> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { return vzip_s32(a, b); }
-// CHECK-LABEL: @test_vzip_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vzip_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, <8 x i8> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { return vzip_u8(a, b); }
-// CHECK-LABEL: @test_vzip_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vzip_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, <4 x i16> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); }
-// CHECK-LABEL: @test_vzip_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vzip_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, <2 x i32> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT]]
++//
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); }
-// CHECK-LABEL: @test_vzip_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32>
-// CHECK: store <2 x float> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32>
-// CHECK: store <2 x float> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vzip_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT3]], <2 x float> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, <2 x float> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); }
-// CHECK-LABEL: @test_vzip_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vzip_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, <8 x i8> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); }
-// CHECK-LABEL: @test_vzip_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vzip_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, <4 x i16> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); }
-// CHECK-LABEL: @test_vzipq_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vzipq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, <16 x i8> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); }
-// CHECK-LABEL: @test_vzipq_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vzipq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, <8 x i16> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); }
-// CHECK-LABEL: @test_vzipq_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vzipq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, <4 x i32> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); }
-// CHECK-LABEL: @test_vzipq_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vzipq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, <16 x i8> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); }
-// CHECK-LABEL: @test_vzipq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vzipq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, <8 x i16> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); }
-// CHECK-LABEL: @test_vzipq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vzipq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, <4 x i32> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); }
-// CHECK-LABEL: @test_vzipq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vzipq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT3]], <4 x float> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, <4 x float> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); }
-// CHECK-LABEL: @test_vzipq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vzipq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, <16 x i8> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); }
-// CHECK-LABEL: @test_vzipq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vzipq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, <8 x i16> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); }
-// CHECK-LABEL: @test_vtrn_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vtrn_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, <8 x i8> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { return vtrn_s8(a, b); }
-// CHECK-LABEL: @test_vtrn_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vtrn_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, <4 x i16> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { return vtrn_s16(a, b); }
-// CHECK-LABEL: @test_vtrn_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vtrn_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, <2 x i32> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { return vtrn_s32(a, b); }
-// CHECK-LABEL: @test_vtrn_u8(
-// CHECK: [[RETVAL_I:%.*]] =
alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vtrn_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, <8 x i8> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { return vtrn_u8(a, b); } -// CHECK-LABEL: @test_vtrn_u16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vtrn_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to 
<8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, <4 x i16> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { return vtrn_u16(a, b); } -// CHECK-LABEL: @test_vtrn_u32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vtrn_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, <2 x i32> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT1]], <2 x i32> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[TMP4]], 1 +// CHECK-NEXT: 
[[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, <2 x i32> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i32> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); } -// CHECK-LABEL: @test_vtrn_f32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vtrn_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, <2 x float> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT3]], <2 x float> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT4]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[TMP6]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, <2 x float> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x float> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[DOTFCA_0_1_INSERT]] +// float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK-LABEL: @test_vtrn_p8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> 
[[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vtrn_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, <8 x i8> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK-LABEL: @test_vtrn_p16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vtrn_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x 
i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, <4 x i16> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i16> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[TMP4]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, <4 x i16> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i16> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[DOTFCA_0_1_INSERT]] +// poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK-LABEL: @test_vtrnq_s8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 -// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16 -// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int8x16x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vtrnq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, <16 x i8> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK-LABEL: @test_vtrnq_s16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x 
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vtrnq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, <8 x i16> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { return vtrnq_s16(a, b); }
-// CHECK-LABEL: @test_vtrnq_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vtrnq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, <4 x i32> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { return vtrnq_s32(a, b); }
-// CHECK-LABEL: @test_vtrnq_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vtrnq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, <16 x i8> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { return vtrnq_u8(a, b); }
-// CHECK-LABEL: @test_vtrnq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vtrnq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, <8 x i16> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { return vtrnq_u16(a, b); }
-// CHECK-LABEL: @test_vtrnq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vtrnq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, <4 x i32> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT1]], <4 x i32> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, <4 x i32> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x i32> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { return vtrnq_u32(a, b); }
-// CHECK-LABEL: @test_vtrnq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x float> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x float> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vtrnq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, <4 x float> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT3]], <4 x float> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, <4 x float> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x float> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { return vtrnq_f32(a, b); }
-// CHECK-LABEL: @test_vtrnq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vtrnq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, <16 x i8> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { return vtrnq_p8(a, b); }
-// CHECK-LABEL: @test_vtrnq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vtrnq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, <8 x i16> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i16> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[TMP4]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, <8 x i16> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i16> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) { return vtrnq_p16(a, b); }
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
index 1d0db697e4fdd..d56dc193d7f1e 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
@@ -1,17 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
--check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \ +// RUN: | FileCheck --check-prefix=UNCONSTRAINED %s // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \ // RUN: -ffp-exception-behavior=strict \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \ -// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED %s -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | llc -o=- - \ -// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \ -// RUN: -ffp-exception-behavior=strict \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | llc -o=- - \ -// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \ +// RUN: | FileCheck --check-prefix=CONSTRAINED %s // REQUIRES: aarch64-registered-target @@ -19,112 +13,248 @@ #include -// COMMON-LABEL: test_vfmas_lane_f32 -// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1 -// UNCONSTRAINED: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a) -// CONSTRAINED: [[TMP2:%.*]] = call float @llvm.experimental.constrained.fma.f32(float %b, float [[EXTRACT]], float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}.s[{{[0-9]+}}] -// COMMONIR: ret float [[TMP2]] +// UNCONSTRAINED-LABEL: define dso_local float @test_vfmas_lane_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float [[EXTRACT]], float [[A]]) +// UNCONSTRAINED-NEXT: ret float [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local float @test_vfmas_lane_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[B]], float [[EXTRACT]], float [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]] +// CONSTRAINED-NEXT: ret float [[TMP0]] +// float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) { return vfmas_lane_f32(a, b, c, 1); } -// COMMON-LABEL: test_vfmad_lane_f64 -// COMMONIR: [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0 -// UNCONSTRAINED: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a) -// CONSTRAINED: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double %b, double [[EXTRACT]], double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// COMMONIR: ret double [[TMP2]] +// UNCONSTRAINED-LABEL: define dso_local double @test_vfmad_lane_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) 
#[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]]) +// UNCONSTRAINED-NEXT: ret double [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local double @test_vfmad_lane_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i32 0 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[B]], double [[EXTRACT]], double [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret double [[TMP0]] +// float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) { return vfmad_lane_f64(a, b, c, 0); } -// COMMON-LABEL: test_vfmad_laneq_f64 -// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1 -// UNCONSTRAINED: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a) -// CONSTRAINED: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double %b, double [[EXTRACT]], double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}] -// COMMONIR: ret double [[TMP2]] +// UNCONSTRAINED-LABEL: define dso_local double @test_vfmad_laneq_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]]) +// UNCONSTRAINED-NEXT: ret double [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local double @test_vfmad_laneq_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[B]], double [[EXTRACT]], double [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret double [[TMP0]] +// float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) { return vfmad_laneq_f64(a, b, c, 1); } -// COMMON-LABEL: test_vfmss_lane_f32 -// COMMONIR: [[SUB:%.*]] = fneg float %b -// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1 -// UNCONSTRAINED: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a) -// CONSTRAINED: [[TMP2:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[SUB]], float [[EXTRACT]], float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}.s[{{[0-9]+}}] -// COMMONIR: ret float [[TMP2]] +// UNCONSTRAINED-LABEL: define dso_local float @test_vfmss_lane_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[B]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call 
float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]]) +// UNCONSTRAINED-NEXT: ret float [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local float @test_vfmss_lane_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[B]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret float [[TMP0]] +// float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) { return vfmss_lane_f32(a, b, c, 1); } -// COMMON-LABEL: test_vfma_lane_f64 -// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double> -// COMMONIR: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer -// COMMONIR: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// COMMONIR: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// UNCONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) -// CONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} -// COMMONIR: ret <1 x double> [[FMLA2]] +// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_lane_f64( +// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// UNCONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64 +// UNCONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64 +// UNCONSTRAINED-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double> +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer +// UNCONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> +// UNCONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double> +// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]]) +// UNCONSTRAINED-NEXT: ret <1 x double> [[FMLA2]] +// +// CONSTRAINED-LABEL: define 
dso_local <1 x double> @test_vfma_lane_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CONSTRAINED-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfma_lane_f64(a, b, v, 0);
 }

-// COMMON-LABEL: test_vfms_lane_f64
-// COMMONIR: [[SUB:%.*]] = fneg <1 x double> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// COMMONIR: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// COMMONIR: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// COMMONIR: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// UNCONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: ret <1 x double> [[FMLA2]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// UNCONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// UNCONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// UNCONSTRAINED-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// UNCONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CONSTRAINED-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }

-// COMMON-LABEL: test_vfma_laneq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// COMMONIR: ret <1 x double> [[TMP7]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// UNCONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// UNCONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// UNCONSTRAINED-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP10]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP10]]
+//
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
 }

-// COMMON-LABEL: test_vfms_laneq_f64
-// COMMONIR: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK-ASM: fneg d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// COMMONIR: ret <1 x double> [[TMP7]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// UNCONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// UNCONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// UNCONSTRAINED-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP10]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CONSTRAINED-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CONSTRAINED-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP10]]
+//
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 8b7b976ab5e5a..9b98126500444 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -1,419 +1,565 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target

 #include <arm_neon.h>

-// CHECK-LABEL: define{{.*}} float @test_vmuls_lane_f32(float noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
-// CHECK: ret float [[MUL]]
+// CHECK-LABEL: define dso_local float @test_vmuls_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[B]], i32 1
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[A]], [[VGET_LANE]]
+// CHECK-NEXT: ret float [[MUL]]
+//
 float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
   return vmuls_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} double @test_vmuld_lane_f64(double noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
-// CHECK: ret double [[MUL]]
+// CHECK-LABEL: define dso_local double @test_vmuld_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[B]], i32 0
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[VGET_LANE]]
+// CHECK-NEXT: ret double [[MUL]]
+//
 float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
   return vmuld_lane_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} float @test_vmuls_laneq_f32(float noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
-// CHECK: ret float [[MUL]]
+// CHECK-LABEL: define dso_local float @test_vmuls_laneq_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B]], i32 3
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[A]], [[VGETQ_LANE]]
+// CHECK-NEXT: ret float [[MUL]]
+//
 float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
   return vmuls_laneq_f32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} double @test_vmuld_laneq_f64(double noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
-// CHECK: ret double [[MUL]]
+// CHECK-LABEL: define dso_local double @test_vmuld_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i32 1
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[VGETQ_LANE]]
+// CHECK-NEXT: ret double [[MUL]]
+//
 float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
   return vmuld_laneq_f64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmul_n_f64(<1 x double> noundef %a, double noundef %b) #0 {
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %a to double
-// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b
-// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmul_n_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to double
+// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[B]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
+//
 float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
   return vmul_n_f64(a, b);
 }

-// CHECK-LABEL: define{{.*}} float @test_vmulxs_lane_f32(float noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]])
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[B]], i32 1
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[VGET_LANE]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
 float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
   return vmulxs_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} float @test_vmulxs_laneq_f32(float noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]])
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_laneq_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B]], i32 3
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[VGETQ_LANE]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
 float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
   return vmulxs_laneq_f32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} double @test_vmulxd_lane_f64(double noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]])
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[B]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[VGET_LANE]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
 float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
   return vmulxd_lane_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} double @test_vmulxd_laneq_f64(double noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]])
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i32 1
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[VGETQ_LANE]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
 float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
   return vmulxd_laneq_f64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x double> [[B]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE4]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
   return vmulx_lane_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_0(<1 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_0(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_1(<1 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_1(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i32 0
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i32 1
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} float @test_vfmas_lane_f32(float noundef %a, float noundef %b, <2 x float> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
-// CHECK: ret float [[TMP2]]
+// CHECK-LABEL: define dso_local float @test_vfmas_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float [[EXTRACT]], float [[A]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
 float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmas_lane_f32(a, b, c, 1);
 }

-// CHECK-LABEL: define{{.*}} double @test_vfmad_lane_f64(double noundef %a, double noundef %b, <1 x double> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0
-// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK: ret double [[TMP2]]
+// CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
 float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
   return vfmad_lane_f64(a, b, c, 0);
 }

-// CHECK-LABEL: define{{.*}} double @test_vfmad_laneq_f64(double noundef %a, double noundef %b, <2 x double> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK: ret double [[TMP2]]
+// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
   return vfmad_laneq_f64(a, b, c, 1);
 }

-// CHECK-LABEL: define{{.*}} float @test_vfmss_lane_f32(float noundef %a, float noundef %b, <2 x float> noundef %c) #0 {
-// CHECK: [[SUB:%.*]] = fneg float %b
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
-// CHECK: ret float [[TMP2]]
+// CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[B]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b, <1 x double> noundef %v) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK: ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfma_lane_f64(a, b, v, 0);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b, <1 x double> noundef %v) #0 {
-// CHECK: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK: ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V]] to i64
+// CHECK-NEXT: [[__S2_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[__S2_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP6]], <1 x double> [[TMP6]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x double>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_laneq_f64(<1 x double> noundef %a, <1 x double> noundef %b, <2 x double> noundef %v) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP7]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP10]]
+//
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_laneq_f64(<1 x double> noundef %a, <1 x double> noundef %b, <2 x double> noundef %v) #0 {
-// CHECK: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP7]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64
+// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to i64
+// CHECK-NEXT: [[__S1_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP1]], i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[__S1_SROA_0_0_VEC_INSERT]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to double
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to double
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
+// CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[EXTRACT]], double [[TMP6]])
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast double [[TMP9]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP10]]
+//
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK: ret i32 [[TMP4]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmullh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
 int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmullh_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmulls_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i32 1
+// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]]
+//
 int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulls_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK: ret i32 [[TMP4]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmullh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
 int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmullh_laneq_s16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmulls_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i32 3
+// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]]
+//
 int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulls_laneq_s32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmulhh_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i32 1
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
 int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulhs_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmulhh_laneq_s16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i32 3
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
 int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulhs_laneq_s32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqrdmulhh_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i32 1
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
 int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqrdmulhs_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqrdmulhh_laneq_s16(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i32 3
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
 int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqrdmulhs_laneq_s32(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_lane_s16(i32 noundef %a, i16 noundef %b, <4 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_lane_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i16> [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlalh_lane_s16(a, b, c, 3);
 }

-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_lane_s32(i64 noundef %a, i32 noundef %b, <2 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlals_lane_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <2 x i32> [[C]], i32 1
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlals_lane_s32(a, b, c, 1);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_laneq_s16(i32 noundef %a, i16 noundef %b, <8 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_laneq_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <8 x i16> [[C]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlalh_laneq_s16(a, b, c, 7);
 }

-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_laneq_s32(i64 noundef %a, i32 noundef %b, <4 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlals_laneq_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i32> [[C]], i32 3
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlals_laneq_s32(a, b, c, 3);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_lane_s16(i32 noundef %a, i16 noundef %b, <4 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_lane_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i16> [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlslh_lane_s16(a, b, c, 3);
 }

-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_lane_s32(i64 noundef %a, i32 noundef %b, <2 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_lane_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <2 x i32> [[C]], i32 1
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlsls_lane_s32(a, b, c, 1);
 }

-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_laneq_s16(i32 noundef %a, i16 noundef %b, <8 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_laneq_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <8 x i16> [[C]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlslh_laneq_s16(a, b, c, 7);
 }

-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_laneq_s32(i64 noundef %a, i32 noundef %b, <4 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_laneq_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i32> [[C]], i32 3
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]])
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL]])
+// CHECK-NEXT: ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlsls_laneq_s32(a, b, c, 3);
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64_0() #0 {
-// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
+// CHECK-NEXT: [[VGET_LANE9:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE9]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_lane_f64_0() {
   float64x1_t arg1;
   float64x1_t arg2;
@@ -425,15 +571,18 @@ float64x1_t test_vmulx_lane_f64_0() {
   return result;
 }

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_2() #0 {
-// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_2() {
   float64x1_t arg1;
   float64x1_t arg2;
diff --git a/clang/test/CodeGen/AArch64/neon-vcmla.c b/clang/test/CodeGen/AArch64/neon-vcmla.c
index d860411fe45c5..72dad95939ecd 100644
--- a/clang/test/CodeGen/AArch64/neon-vcmla.c
+++ b/clang/test/CodeGen/AArch64/neon-vcmla.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple arm64 -target-feature +neon \
 // RUN: -target-feature +v8.3a \
 // RUN: -target-feature +fullfp16 \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes="mem2reg,instsimplify" | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes="mem2reg,instsimplify,sroa" | FileCheck %s

 // REQUIRES: aarch64-registered-target

@@ -11,8 +11,20 @@
 // CHECK-LABEL: define dso_local <4 x half> @test_vcmla_f16(
 // CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]])
-// CHECK-NEXT: ret <4 x half> [[VCMLA_F163_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[RHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_F16_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_F161_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_F162_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[VCMLA_F16_I]], <4 x half> [[VCMLA_F161_I]], <4 x half> [[VCMLA_F162_I]])
+// CHECK-NEXT: [[VCMLA_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_F163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_F164_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP7]]
 //
 float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
   return vcmla_f16(acc, lhs, rhs);
@@ -21,8 +33,20 @@ float16x4_t test_vcmla_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
 // CHECK-LABEL: define dso_local <2 x float> @test_vcmla_f32(
 // CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]])
-// CHECK-NEXT: ret <2 x float> [[VCMLA_F323_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_F32_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_F321_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_F322_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[VCMLA_F32_I]], <2 x float> [[VCMLA_F321_I]], <2 x float> [[VCMLA_F322_I]])
+// CHECK-NEXT: [[VCMLA_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_F323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_F324_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP7]]
 //
 float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
   return vcmla_f32(acc, lhs, rhs);
@@ -31,8 +55,20 @@ float32x2_t test_vcmla_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
 // CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_f16(
 // CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]])
-// CHECK-NEXT: ret <8 x half> [[VCMLAQ_F163_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[RHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_F16_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_F161_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_F162_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[VCMLAQ_F16_I]], <8 x half> [[VCMLAQ_F161_I]], <8 x half> [[VCMLAQ_F162_I]])
+// CHECK-NEXT: [[VCMLAQ_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_F163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_F164_I]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP7]]
 //
 float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
   return vcmlaq_f16(acc, lhs, rhs);
@@ -41,8 +77,20 @@ float16x8_t test_vcmlaq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
 // CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_f32(
 // CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]])
-// CHECK-NEXT: ret <4 x float> [[VCMLAQ_F323_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[VCMLAQ_F32_I]], <4 x float> [[VCMLAQ_F321_I]], <4 x float> [[VCMLAQ_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP7]]
 //
 float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_f32(acc, lhs, rhs);
@@ -51,8 +99,20 @@ float32x4_t test_vcmlaq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
 // CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_f64(
 // CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VCMLAQ_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]])
-// CHECK-NEXT: ret <2 x double> [[VCMLAQ_F643_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[ACC]] to <2 x i64>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[LHS]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[RHS]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast
<2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_F64_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_F641_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_F642_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot0.v2f64(<2 x double> [[VCMLAQ_F64_I]], <2 x double> [[VCMLAQ_F641_I]], <2 x double> [[VCMLAQ_F642_I]]) +// CHECK-NEXT: [[VCMLAQ_F644_I:%.*]] = bitcast <2 x double> [[VCMLAQ_F643_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_F644_I]] to <2 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP7]] // float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) { return vcmlaq_f64(acc, lhs, rhs); @@ -61,8 +121,20 @@ float64x2_t test_vcmlaq_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) { // CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_f16( // CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]]) -// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT90_F163_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[RHS]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT90_F16_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT90_F161_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT90_F162_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[VCMLA_ROT90_F16_I]], <4 x half> [[VCMLA_ROT90_F161_I]], <4 x half> [[VCMLA_ROT90_F162_I]]) +// CHECK-NEXT: [[VCMLA_ROT90_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT90_F163_I]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_ROT90_F164_I]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP7]] // float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) { return vcmla_rot90_f16(acc, lhs, rhs); @@ -71,8 +143,20 @@ float16x4_t test_vcmla_rot90_f16(float16x4_t acc, float16x4_t lhs, float16x4_t r // CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_f32( // CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]]) -// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT90_F323_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT90_F32_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT90_F321_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT90_F322_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[VCMLA_ROT90_F32_I]], <2 x float> [[VCMLA_ROT90_F321_I]], <2 x float> [[VCMLA_ROT90_F322_I]]) +// CHECK-NEXT: [[VCMLA_ROT90_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT90_F323_I]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_ROT90_F324_I]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP7]] // float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) { return vcmla_rot90_f32(acc, lhs, rhs); @@ -81,8 +165,20 @@ float32x2_t test_vcmla_rot90_f32(float32x2_t acc, float32x2_t lhs, float32x2_t r // CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_f16( // CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]]) -// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT90_F163_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[RHS]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT90_F16_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT90_F161_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT90_F162_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[VCMLAQ_ROT90_F16_I]], <8 x half> [[VCMLAQ_ROT90_F161_I]], <8 x half> [[VCMLAQ_ROT90_F162_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT90_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT90_F163_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT90_F164_I]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP7]] // float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) { return vcmlaq_rot90_f16(acc, lhs, rhs); @@ -91,8 +187,20 @@ float16x8_t test_vcmlaq_rot90_f16(float16x8_t acc, float16x8_t lhs, float16x8_t // CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_f32( // CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]]) -// CHECK-NEXT: ret <4 x float> 
[[VCMLAQ_ROT90_F323_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT90_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT90_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT90_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[VCMLAQ_ROT90_F32_I]], <4 x float> [[VCMLAQ_ROT90_F321_I]], <4 x float> [[VCMLAQ_ROT90_F322_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT90_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT90_F323_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT90_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) { return vcmlaq_rot90_f32(acc, lhs, rhs); @@ -101,8 +209,20 @@ float32x4_t test_vcmlaq_rot90_f32(float32x4_t acc, float32x4_t lhs, float32x4_t // CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot90_f64( // CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT90_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]]) -// CHECK-NEXT: ret <2 x double> [[VCMLAQ_ROT90_F643_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[ACC]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[LHS]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[RHS]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT90_F64_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT90_F641_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT90_F642_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT90_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot90.v2f64(<2 x double> [[VCMLAQ_ROT90_F64_I]], <2 x double> [[VCMLAQ_ROT90_F641_I]], <2 x double> [[VCMLAQ_ROT90_F642_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT90_F644_I:%.*]] = bitcast <2 x double> [[VCMLAQ_ROT90_F643_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT90_F644_I]] to <2 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP7]] // float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) { return vcmlaq_rot90_f64(acc, lhs, rhs); @@ -111,8 +231,20 @@ float64x2_t test_vcmlaq_rot90_f64(float64x2_t acc, float64x2_t lhs, float64x2_t // CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_f16( // CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { 
// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]]) -// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT180_F163_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[RHS]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT180_F16_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT180_F161_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT180_F162_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[VCMLA_ROT180_F16_I]], <4 x half> [[VCMLA_ROT180_F161_I]], <4 x half> [[VCMLA_ROT180_F162_I]]) +// CHECK-NEXT: [[VCMLA_ROT180_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT180_F163_I]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_ROT180_F164_I]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP7]] // float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) { return vcmla_rot180_f16(acc, lhs, rhs); @@ -121,8 +253,20 @@ float16x4_t test_vcmla_rot180_f16(float16x4_t acc, float16x4_t lhs, float16x4_t // CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_f32( // CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]]) -// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT180_F323_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT180_F32_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT180_F321_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT180_F322_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[VCMLA_ROT180_F32_I]], <2 x float> [[VCMLA_ROT180_F321_I]], <2 x float> [[VCMLA_ROT180_F322_I]]) +// CHECK-NEXT: [[VCMLA_ROT180_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT180_F323_I]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_ROT180_F324_I]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP7]] // float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) { return vcmla_rot180_f32(acc, lhs, rhs); @@ -131,8 +275,20 @@ float32x2_t test_vcmla_rot180_f32(float32x2_t acc, float32x2_t lhs, float32x2_t // CHECK-LABEL: 
define dso_local <8 x half> @test_vcmlaq_rot180_f16( // CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]]) -// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT180_F163_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[RHS]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT180_F16_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT180_F161_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT180_F162_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[VCMLAQ_ROT180_F16_I]], <8 x half> [[VCMLAQ_ROT180_F161_I]], <8 x half> [[VCMLAQ_ROT180_F162_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT180_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT180_F163_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT180_F164_I]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP7]] // float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) { return vcmlaq_rot180_f16(acc, lhs, rhs); @@ -141,8 +297,20 @@ float16x8_t test_vcmlaq_rot180_f16(float16x8_t acc, float16x8_t lhs, float16x8_t // CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_f32( // CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]]) -// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT180_F323_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT180_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT180_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT180_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[VCMLAQ_ROT180_F32_I]], <4 x float> [[VCMLAQ_ROT180_F321_I]], <4 x float> [[VCMLAQ_ROT180_F322_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT180_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT180_F323_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT180_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t 
test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) { return vcmlaq_rot180_f32(acc, lhs, rhs); @@ -151,8 +319,20 @@ float32x4_t test_vcmlaq_rot180_f32(float32x4_t acc, float32x4_t lhs, float32x4_t // CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot180_f64( // CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT180_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]]) -// CHECK-NEXT: ret <2 x double> [[VCMLAQ_ROT180_F643_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[ACC]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[LHS]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[RHS]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT180_F64_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT180_F641_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT180_F642_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT180_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot180.v2f64(<2 x double> [[VCMLAQ_ROT180_F64_I]], <2 x double> [[VCMLAQ_ROT180_F641_I]], <2 x double> [[VCMLAQ_ROT180_F642_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT180_F644_I:%.*]] = bitcast <2 x double> [[VCMLAQ_ROT180_F643_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT180_F644_I]] to <2 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP7]] // float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) { return vcmlaq_rot180_f64(acc, lhs, rhs); @@ -161,8 +341,20 @@ float64x2_t test_vcmlaq_rot180_f64(float64x2_t acc, float64x2_t lhs, float64x2_t // CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_f16( // CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[RHS]]) -// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT270_F163_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[RHS]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT270_F16_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT270_F161_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT270_F162_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[VCMLA_ROT270_F16_I]], <4 x half> [[VCMLA_ROT270_F161_I]], <4 x half> [[VCMLA_ROT270_F162_I]]) +// CHECK-NEXT: [[VCMLA_ROT270_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT270_F163_I]] to 
<8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_ROT270_F164_I]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP7]] // float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) { return vcmla_rot270_f16(acc, lhs, rhs); @@ -171,8 +363,20 @@ float16x4_t test_vcmla_rot270_f16(float16x4_t acc, float16x4_t lhs, float16x4_t // CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_f32( // CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[RHS]]) -// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT270_F323_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT270_F32_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F321_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F322_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[VCMLA_ROT270_F32_I]], <2 x float> [[VCMLA_ROT270_F321_I]], <2 x float> [[VCMLA_ROT270_F322_I]]) +// CHECK-NEXT: [[VCMLA_ROT270_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT270_F323_I]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[VCMLA_ROT270_F324_I]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP7]] // float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) { return vcmla_rot270_f32(acc, lhs, rhs); @@ -181,8 +385,20 @@ float32x2_t test_vcmla_rot270_f32(float32x2_t acc, float32x2_t lhs, float32x2_t // CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_f16( // CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[RHS]]) -// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT270_F163_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[RHS]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT270_F16_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT270_F161_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT270_F162_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> 
[[VCMLAQ_ROT270_F16_I]], <8 x half> [[VCMLAQ_ROT270_F161_I]], <8 x half> [[VCMLAQ_ROT270_F162_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT270_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT270_F163_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT270_F164_I]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP7]] // float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) { return vcmlaq_rot270_f16(acc, lhs, rhs); @@ -191,8 +407,20 @@ float16x8_t test_vcmlaq_rot270_f16(float16x8_t acc, float16x8_t lhs, float16x8_t // CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_f32( // CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[RHS]]) -// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT270_F323_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT270_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT270_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT270_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[VCMLAQ_ROT270_F32_I]], <4 x float> [[VCMLAQ_ROT270_F321_I]], <4 x float> [[VCMLAQ_ROT270_F322_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT270_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT270_F323_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT270_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) { return vcmlaq_rot270_f32(acc, lhs, rhs); @@ -201,8 +429,20 @@ float32x4_t test_vcmlaq_rot270_f32(float32x4_t acc, float32x4_t lhs, float32x4_t // CHECK-LABEL: define dso_local <2 x double> @test_vcmlaq_rot270_f64( // CHECK-SAME: <2 x double> noundef [[ACC:%.*]], <2 x double> noundef [[LHS:%.*]], <2 x double> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VCMLAQ_ROT270_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> [[ACC]], <2 x double> [[LHS]], <2 x double> [[RHS]]) -// CHECK-NEXT: ret <2 x double> [[VCMLAQ_ROT270_F643_I]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[ACC]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[LHS]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[RHS]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT270_F64_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x double> +// CHECK-NEXT: 
[[VCMLAQ_ROT270_F641_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT270_F642_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> +// CHECK-NEXT: [[VCMLAQ_ROT270_F643_I:%.*]] = call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> [[VCMLAQ_ROT270_F64_I]], <2 x double> [[VCMLAQ_ROT270_F641_I]], <2 x double> [[VCMLAQ_ROT270_F642_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT270_F644_I:%.*]] = bitcast <2 x double> [[VCMLAQ_ROT270_F643_I]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT270_F644_I]] to <2 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP7]] // float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t rhs) { return vcmlaq_rot270_f64(acc, lhs, rhs); @@ -211,19 +451,26 @@ float64x2_t test_vcmlaq_rot270_f64(float64x2_t acc, float64x2_t lhs, float64x2_t // CHECK-LABEL: define dso_local <4 x half> @test_vcmla_lane_f16( // CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_150:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_150:%.*]] = alloca <2 x i32>, align 8 -// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_150]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_150]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_150]], align 8 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1 -// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_150]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_150]], align 8 -// CHECK-NEXT: [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]]) -// CHECK-NEXT: ret <4 x half> [[VCMLA_F163_I]] +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VCMLA_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[VCMLA_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> +// CHECK-NEXT: [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[VCMLA_F16_I]], <4 x half> [[VCMLA_F161_I]], <4 x half> [[VCMLA_F162_I]]) +// CHECK-NEXT: [[VCMLA_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_F163_I]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_F164_I]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = 
bitcast <4 x i16> [[TMP8]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) { return vcmla_lane_f16(acc, lhs, rhs, 1); @@ -233,19 +480,26 @@ float16x4_t test_vcmla_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rh // CHECK-LABEL: define dso_local <4 x half> @test_vcmla_laneq_f16( // CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_154:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_154:%.*]] = alloca <2 x i32>, align 8 -// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_154]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_154]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_154]], align 16 -// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1 -// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_154]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_154]], align 8 -// CHECK-NEXT: [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]]) -// CHECK-NEXT: ret <4 x half> [[VCMLA_F163_I]] +// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VCMLA_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[VCMLA_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> +// CHECK-NEXT: [[VCMLA_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot0.v4f16(<4 x half> [[VCMLA_F16_I]], <4 x half> [[VCMLA_F161_I]], <4 x half> [[VCMLA_F162_I]]) +// CHECK-NEXT: [[VCMLA_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_F163_I]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_F164_I]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) { return vcmla_laneq_f16(acc, lhs, rhs, 3); @@ -254,25 +508,30 @@ float16x4_t test_vcmla_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t r // CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_lane_f16( // CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_152:%.*]] = alloca <4 x half>, align 8 -// 
CHECK-NEXT: [[__REINT1_152:%.*]] = alloca <4 x i32>, align 16 -// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_152]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_152]], align 8 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3 -// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_152]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_152]], align 16 -// CHECK-NEXT: [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]]) -// CHECK-NEXT: ret <8 x half> [[VCMLAQ_F163_I]] +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGET_LANE10]], i32 2 +// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGET_LANE16]], i32 3 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[VCMLAQ_F16_I]], <8 x half> [[VCMLAQ_F161_I]], <8 x half> [[VCMLAQ_F162_I]]) +// CHECK-NEXT: [[VCMLAQ_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_F163_I]] to <16 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_F164_I]] to <8 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) { return vcmlaq_lane_f16(acc, lhs, rhs, 1); @@ -281,25 +540,30 @@ float16x8_t test_vcmlaq_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t r // 
CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_laneq_f16( // CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_156:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_156:%.*]] = alloca <4 x i32>, align 16 -// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_156]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16 -// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2 -// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_156]], align 16 -// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3 -// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_156]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_156]], align 16 -// CHECK-NEXT: [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]]) -// CHECK-NEXT: ret <8 x half> [[VCMLAQ_F163_I]] +// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1 +// CHECK-NEXT: [[VGETQ_LANE10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGETQ_LANE10]], i32 2 +// CHECK-NEXT: [[VGETQ_LANE16:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGETQ_LANE16]], i32 3 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> +// CHECK-NEXT: [[VCMLAQ_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot0.v8f16(<8 x half> [[VCMLAQ_F16_I]], <8 x half> [[VCMLAQ_F161_I]], <8 x half> [[VCMLAQ_F162_I]]) +// CHECK-NEXT: [[VCMLAQ_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_F163_I]] to <16 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_F164_I]] to <8 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x 
i16> [[TMP8]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) { return vcmlaq_laneq_f16(acc, lhs, rhs, 3); @@ -308,16 +572,25 @@ float16x8_t test_vcmlaq_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t // CHECK-LABEL: define dso_local <2 x float> @test_vcmla_lane_f32( // CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_182:%.*]] = alloca <2 x float>, align 8 -// CHECK-NEXT: [[__REINT1_182:%.*]] = alloca <1 x i64>, align 8 -// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_182]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_182]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64 +// CHECK-NEXT: [[__S2_182_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_182_SROA_0_0_VEC_INSERT]], i32 0 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0 -// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_182]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_182]], align 8 -// CHECK-NEXT: [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]]) -// CHECK-NEXT: ret <2 x float> [[VCMLA_F323_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VCMLA_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[VCMLA_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK-NEXT: [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[VCMLA_F32_I]], <2 x float> [[VCMLA_F321_I]], <2 x float> [[VCMLA_F322_I]]) +// CHECK-NEXT: [[VCMLA_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_F323_I]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_F324_I]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) { return vcmla_lane_f32(acc, lhs, rhs, 0); @@ -327,16 +600,24 @@ float32x2_t test_vcmla_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rh // CHECK-LABEL: define dso_local <2 x float> @test_vcmla_laneq_f32( // CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_186:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: [[__REINT1_186:%.*]] = alloca <1 x i64>, align 8 -// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_186]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_186]], align 16 +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_186]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_186]], align 8
-// CHECK-NEXT: [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
-// CHECK-NEXT: ret <2 x float> [[VCMLA_F323_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot0.v2f32(<2 x float> [[VCMLA_F32_I]], <2 x float> [[VCMLA_F321_I]], <2 x float> [[VCMLA_F322_I]])
+// CHECK-NEXT: [[VCMLA_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_F323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_F324_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
//
float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
return vcmla_laneq_f32(acc, lhs, rhs, 1);
@@ -345,19 +626,27 @@ float32x2_t test_vcmla_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t r
// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_lane_f32(
// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_184:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: [[__REINT1_184:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_184]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_184]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT: [[__S2_184_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_184_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_184]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_184]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_184]], align 16
-// CHECK-NEXT: [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
-// CHECK-NEXT: ret <4 x float> [[VCMLAQ_F323_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x i64> [[__S2_184_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[VCMLAQ_F32_I]], <4 x float> [[VCMLAQ_F321_I]], <4 x float> [[VCMLAQ_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_lane_f32(acc, lhs, rhs, 0);
@@ -366,19 +655,26 @@ float32x4_t test_vcmlaq_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t r
// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_laneq_f32(
// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_188:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: [[__REINT1_188:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_188]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_188]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_188]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_188]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_188]], align 16
-// CHECK-NEXT: [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
-// CHECK-NEXT: ret <4 x float> [[VCMLAQ_F323_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> [[VCMLAQ_F32_I]], <4 x float> [[VCMLAQ_F321_I]], <4 x float> [[VCMLAQ_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_laneq_f32(acc, lhs, rhs, 1);
@@ -387,19 +683,26 @@ float32x4_t test_vcmlaq_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t
// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_lane_f16(
// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_174:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_174:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_174]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_174]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_174]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_174]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_174]], align 8
-// CHECK-NEXT: [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
-// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT90_F163_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT90_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT90_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT90_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[VCMLA_ROT90_F16_I]], <4 x half> [[VCMLA_ROT90_F161_I]], <4 x half> [[VCMLA_ROT90_F162_I]])
+// CHECK-NEXT: [[VCMLA_ROT90_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT90_F163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT90_F164_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP9]]
//
float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot90_lane_f16(acc, lhs, rhs, 1);
@@ -409,19 +712,26 @@ float16x4_t test_vcmla_rot90_lane_f16(float16x4_t acc, float16x4_t lhs, float16x
// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot90_laneq_f16(
// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_178:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_178:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_178]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_178]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_178]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_178]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_178]], align 8
-// CHECK-NEXT: [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
-// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT90_F163_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT90_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT90_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT90_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT90_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot90.v4f16(<4 x half> [[VCMLA_ROT90_F16_I]], <4 x half> [[VCMLA_ROT90_F161_I]], <4 x half> [[VCMLA_ROT90_F162_I]])
+// CHECK-NEXT: [[VCMLA_ROT90_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT90_F163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT90_F164_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP9]]
//
float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
return vcmla_rot90_laneq_f16(acc, lhs, rhs, 3);
@@ -430,25 +740,30 @@ float16x4_t test_vcmla_rot90_laneq_f16(float16x4_t acc, float16x4_t lhs, float16
// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_lane_f16(
// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_176:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_176:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_176]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_176]], align 8
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3
-// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_176]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_176]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
-// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGET_LANE10]], i32 2
+// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGET_LANE16]], i32 3
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[VCMLAQ_ROT90_F16_I]], <8 x half> [[VCMLAQ_ROT90_F161_I]], <8 x half> [[VCMLAQ_ROT90_F162_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT90_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT90_F163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT90_F164_I]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP9]]
//
float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
return vcmlaq_rot90_lane_f16(acc, lhs, rhs, 1);
@@ -457,25 +772,30 @@ float16x8_t test_vcmlaq_rot90_lane_f16(float16x8_t acc, float16x8_t lhs, float16
// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot90_laneq_f16(
// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_180:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_180:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_180]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_180]], align 16
-// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3
-// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_180]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_180]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
-// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT90_F163_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[VGETQ_LANE10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGETQ_LANE10]], i32 2
+// CHECK-NEXT: [[VGETQ_LANE16:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGETQ_LANE16]], i32 3
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot90.v8f16(<8 x half> [[VCMLAQ_ROT90_F16_I]], <8 x half> [[VCMLAQ_ROT90_F161_I]], <8 x half> [[VCMLAQ_ROT90_F162_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT90_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT90_F163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT90_F164_I]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP9]]
//
float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
return vcmlaq_rot90_laneq_f16(acc, lhs, rhs, 3);
@@ -484,16 +804,25 @@ float16x8_t test_vcmlaq_rot90_laneq_f16(float16x8_t acc, float16x8_t lhs, float1
// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_lane_f32(
// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_206:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: [[__REINT1_206:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_206]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_206]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT: [[__S2_206_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_206_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0
-// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_206]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_206]], align 8
-// CHECK-NEXT: [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
-// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT90_F323_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT90_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT90_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT90_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[VCMLA_ROT90_F32_I]], <2 x float> [[VCMLA_ROT90_F321_I]], <2 x float> [[VCMLA_ROT90_F322_I]])
+// CHECK-NEXT: [[VCMLA_ROT90_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT90_F323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT90_F324_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
//
float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
return vcmla_rot90_lane_f32(acc, lhs, rhs, 0);
@@ -503,16 +832,24 @@ float32x2_t test_vcmla_rot90_lane_f32(float32x2_t acc, float32x2_t lhs, float32x
// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot90_laneq_f32(
// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_210:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: [[__REINT1_210:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_210]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_210]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_210]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_210]], align 8
-// CHECK-NEXT: [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
-// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT90_F323_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT90_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT90_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT90_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT90_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot90.v2f32(<2 x float> [[VCMLA_ROT90_F32_I]], <2 x float> [[VCMLA_ROT90_F321_I]], <2 x float> [[VCMLA_ROT90_F322_I]])
+// CHECK-NEXT: [[VCMLA_ROT90_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT90_F323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT90_F324_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
//
float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
return vcmla_rot90_laneq_f32(acc, lhs, rhs, 1);
@@ -521,19 +858,27 @@ float32x2_t test_vcmla_rot90_laneq_f32(float32x2_t acc, float32x2_t lhs, float32
// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_lane_f32(
// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_208:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: [[__REINT1_208:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_208]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_208]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT: [[__S2_208_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_208_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_208]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_208]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_208]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
-// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x i64> [[__S2_208_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[VCMLAQ_ROT90_F32_I]], <4 x float> [[VCMLAQ_ROT90_F321_I]], <4 x float> [[VCMLAQ_ROT90_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT90_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT90_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT90_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot90_lane_f32(acc, lhs, rhs, 0);
@@ -542,19 +887,26 @@ float32x4_t test_vcmlaq_rot90_lane_f32(float32x4_t acc, float32x4_t lhs, float32
// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot90_laneq_f32(
// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_212:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: [[__REINT1_212:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_212]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_212]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_212]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_212]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_212]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
-// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT90_F323_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT90_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> [[VCMLAQ_ROT90_F32_I]], <4 x float> [[VCMLAQ_ROT90_F321_I]], <4 x float> [[VCMLAQ_ROT90_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT90_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT90_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT90_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot90_laneq_f32(acc, lhs, rhs, 1);
@@ -563,19 +915,26 @@ float32x4_t test_vcmlaq_rot90_laneq_f32(float32x4_t acc, float32x4_t lhs, float3
// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_lane_f16(
// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_158:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_158:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_158]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_158]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_158]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_158]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_158]], align 8
-// CHECK-NEXT: [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
-// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT180_F163_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT180_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT180_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT180_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[VCMLA_ROT180_F16_I]], <4 x half> [[VCMLA_ROT180_F161_I]], <4 x half> [[VCMLA_ROT180_F162_I]])
+// CHECK-NEXT: [[VCMLA_ROT180_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT180_F163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT180_F164_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP9]]
//
float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot180_lane_f16(acc, lhs, rhs, 1);
@@ -585,19 +944,26 @@ float16x4_t test_vcmla_rot180_lane_f16(float16x4_t acc, float16x4_t lhs, float16
// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot180_laneq_f16(
// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_162:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_162:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_162]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_162]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_162]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_162]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_162]], align 8
-// CHECK-NEXT: [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
-// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT180_F163_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT180_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT180_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT180_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT180_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> [[VCMLA_ROT180_F16_I]], <4 x half> [[VCMLA_ROT180_F161_I]], <4 x half> [[VCMLA_ROT180_F162_I]])
+// CHECK-NEXT: [[VCMLA_ROT180_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT180_F163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT180_F164_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP9]]
//
float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
return vcmla_rot180_laneq_f16(acc, lhs, rhs, 3);
@@ -606,25 +972,30 @@ float16x4_t test_vcmla_rot180_laneq_f16(float16x4_t acc, float16x4_t lhs, float1
// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_lane_f16(
// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_160:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_160:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_160]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_160]], align 8
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3
-// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_160]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_160]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
-// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGET_LANE10]], i32 2
+// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGET_LANE16]], i32 3
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[VCMLAQ_ROT180_F16_I]], <8 x half> [[VCMLAQ_ROT180_F161_I]], <8 x half> [[VCMLAQ_ROT180_F162_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT180_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT180_F163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT180_F164_I]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP9]]
//
float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
return vcmlaq_rot180_lane_f16(acc, lhs, rhs, 1);
@@ -633,25 +1004,30 @@ float16x8_t test_vcmlaq_rot180_lane_f16(float16x8_t acc, float16x8_t lhs, float1
// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot180_laneq_f16(
// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_164:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_164:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_164]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_164]], align 16
-// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3
-// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_164]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_164]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
-// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT180_F163_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[VGETQ_LANE10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGETQ_LANE10]], i32 2
+// CHECK-NEXT: [[VGETQ_LANE16:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGETQ_LANE16]], i32 3
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot180.v8f16(<8 x half> [[VCMLAQ_ROT180_F16_I]], <8 x half> [[VCMLAQ_ROT180_F161_I]], <8 x half> [[VCMLAQ_ROT180_F162_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT180_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT180_F163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT180_F164_I]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP9]]
//
float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
return vcmlaq_rot180_laneq_f16(acc, lhs, rhs, 3);
@@ -660,16 +1036,25 @@ float16x8_t test_vcmlaq_rot180_laneq_f16(float16x8_t acc, float16x8_t lhs, float
// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_lane_f32(
// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_190:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: [[__REINT1_190:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_190]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_190]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT: [[__S2_190_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_190_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0
-// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_190]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_190]], align 8
-// CHECK-NEXT: [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
-// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT180_F323_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT180_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT180_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT180_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[VCMLA_ROT180_F32_I]], <2 x float> [[VCMLA_ROT180_F321_I]], <2 x float> [[VCMLA_ROT180_F322_I]])
+// CHECK-NEXT: [[VCMLA_ROT180_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT180_F323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT180_F324_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
//
float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) {
return vcmla_rot180_lane_f32(acc, lhs, rhs, 0);
@@ -679,16 +1064,24 @@ float32x2_t test_vcmla_rot180_lane_f32(float32x2_t acc, float32x2_t lhs, float32
// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot180_laneq_f32(
// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_194:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: [[__REINT1_194:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_194]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_194]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_194]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_194]], align 8
-// CHECK-NEXT: [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]])
-// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT180_F323_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT180_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT180_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT180_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT: [[VCMLA_ROT180_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot180.v2f32(<2 x float> [[VCMLA_ROT180_F32_I]], <2 x float> [[VCMLA_ROT180_F321_I]], <2 x float> [[VCMLA_ROT180_F322_I]])
+// CHECK-NEXT: [[VCMLA_ROT180_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT180_F323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT180_F324_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
//
float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) {
return vcmla_rot180_laneq_f32(acc, lhs, rhs, 1);
@@ -697,19 +1090,27 @@ float32x2_t test_vcmla_rot180_laneq_f32(float32x2_t acc, float32x2_t lhs, float3
// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_lane_f32(
// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_192:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: [[__REINT1_192:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_192]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_192]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64
+// CHECK-NEXT: [[__S2_192_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_192_SROA_0_0_VEC_INSERT]], i32 0
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_192]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_192]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_192]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
-// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x i64> [[__S2_192_SROA_0_0_VEC_INSERT]], i32 0
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[VCMLAQ_ROT180_F32_I]], <4 x float> [[VCMLAQ_ROT180_F321_I]], <4 x float> [[VCMLAQ_ROT180_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT180_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT180_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT180_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) {
return vcmlaq_rot180_lane_f32(acc, lhs, rhs, 0);
@@ -718,19 +1119,26 @@ float32x4_t test_vcmlaq_rot180_lane_f32(float32x4_t acc, float32x4_t lhs, float3
// CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot180_laneq_f32(
// CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_196:%.*]] = alloca <4 x float>, align 16
-// CHECK-NEXT: [[__REINT1_196:%.*]] = alloca <2 x i64>, align 16
-// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_196]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_196]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_196]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_196]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_196]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]])
-// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT180_F323_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT180_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot180.v4f32(<4 x float> [[VCMLAQ_ROT180_F32_I]], <4 x float> [[VCMLAQ_ROT180_F321_I]], <4 x float> [[VCMLAQ_ROT180_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT180_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT180_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT180_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
//
float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
return vcmlaq_rot180_laneq_f32(acc, lhs, rhs, 1);
@@ -739,19 +1147,26 @@ float32x4_t test_vcmlaq_rot180_laneq_f32(float32x4_t acc, float32x4_t lhs, float
// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_lane_f16(
// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_166:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_166:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_166]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_166]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_166]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_166]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_166]], align 8
-// CHECK-NEXT: [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
-// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT270_F163_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT270_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT270_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT270_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[VCMLA_ROT270_F16_I]], <4 x half> [[VCMLA_ROT270_F161_I]], <4 x half> [[VCMLA_ROT270_F162_I]])
+// CHECK-NEXT: [[VCMLA_ROT270_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT270_F163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT270_F164_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP9]]
//
float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16x4_t rhs) {
return vcmla_rot270_lane_f16(acc, lhs, rhs, 1);
@@ -761,19 +1176,26 @@ float16x4_t test_vcmla_rot270_lane_f16(float16x4_t acc, float16x4_t lhs, float16
// CHECK-LABEL: define dso_local <4 x half> @test_vcmla_rot270_laneq_f16(
// CHECK-SAME: <4 x half> noundef [[ACC:%.*]], <4 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_170:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_170:%.*]] = alloca <2 x i32>, align 8
-// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_170]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_170]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i32> poison, i32 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_170]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: store <2 x i32> [[VECINIT5]], ptr [[__REINT1_170]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[__REINT1_170]], align 8
-// CHECK-NEXT: [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[ACC]], <4 x half> [[LHS]], <4 x half> [[TMP2]])
-// CHECK-NEXT: ret <4 x half> [[VCMLA_ROT270_F163_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT6]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[ACC]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LHS]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT: [[VCMLA_ROT270_F16_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT270_F161_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT270_F162_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT: [[VCMLA_ROT270_F163_I:%.*]] = call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> [[VCMLA_ROT270_F16_I]], <4 x half> [[VCMLA_ROT270_F161_I]], <4 x half> [[VCMLA_ROT270_F162_I]])
+// CHECK-NEXT: [[VCMLA_ROT270_F164_I:%.*]] = bitcast <4 x half> [[VCMLA_ROT270_F163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT270_F164_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP9]]
//
float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float16x8_t rhs) {
return vcmla_rot270_laneq_f16(acc, lhs, rhs, 3);
@@ -782,25 +1204,30 @@ float16x4_t test_vcmla_rot270_laneq_f16(float16x4_t acc, float16x4_t lhs, float1
// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_lane_f16(
// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <4 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_168:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_168:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT: store <4 x half> [[RHS]], ptr [[__REINT_168]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[RHS]] to <2 x i32>
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGET_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
-// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE3]], i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGET_LANE8]], i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[__REINT_168]], align 8
-// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGET_LANE13]], i32 3
-// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_168]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_168]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
-// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGET_LANE4]], i32 1
+// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGET_LANE10]], i32 2
+// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGET_LANE16]], i32 3
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[VCMLAQ_ROT270_F16_I]], <8 x half> [[VCMLAQ_ROT270_F161_I]], <8 x half> [[VCMLAQ_ROT270_F162_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT270_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT270_F163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT270_F164_I]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP9]]
//
float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float16x4_t rhs) {
return vcmlaq_rot270_lane_f16(acc, lhs, rhs, 1);
@@ -809,25 +1236,30 @@ float16x8_t test_vcmlaq_rot270_lane_f16(float16x8_t acc, float16x8_t lhs, float1
// CHECK-LABEL: define dso_local <8 x half> @test_vcmlaq_rot270_laneq_f16(
// CHECK-SAME: <8 x half> noundef [[ACC:%.*]], <8 x half> noundef [[LHS:%.*]], <8 x half> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_172:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_172:%.*]] = alloca <4 x i32>, align 16
-// CHECK-NEXT: store <8 x half> [[RHS]], ptr [[__REINT_172]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[RHS]] to <4 x i32>
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VGETQ_LANE]], i32 0
-// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
-// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE3]], i32 1
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <4 x i32> [[VECINIT5]], i32 [[VGETQ_LANE8]], i32 2
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[__REINT_172]], align 16
-// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <4 x i32> [[VECINIT10]], i32 [[VGETQ_LANE13]], i32 3
-// CHECK-NEXT: store <4 x i32> [[VECINIT15]], ptr [[__REINT1_172]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x half>, ptr [[__REINT1_172]], align 16
-// CHECK-NEXT: [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[ACC]], <8 x half> [[LHS]], <8 x half> [[TMP4]])
-// CHECK-NEXT: ret <8 x half> [[VCMLAQ_ROT270_F163_I]]
+// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VGETQ_LANE4]], i32 1
+// CHECK-NEXT: [[VGETQ_LANE10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[VGETQ_LANE10]], i32 2
+// CHECK-NEXT: [[VGETQ_LANE16:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <4 x i32> [[VECINIT12]], i32 [[VGETQ_LANE16]], i32 3
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT18]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[ACC]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LHS]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F16_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F161_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F162_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F163_I:%.*]] = call <8 x half> @llvm.aarch64.neon.vcmla.rot270.v8f16(<8 x half> [[VCMLAQ_ROT270_F16_I]], <8 x half> [[VCMLAQ_ROT270_F161_I]], <8 x half> [[VCMLAQ_ROT270_F162_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT270_F164_I:%.*]] = bitcast <8 x half> [[VCMLAQ_ROT270_F163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT270_F164_I]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP9]]
//
float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float16x8_t rhs) {
return vcmlaq_rot270_laneq_f16(acc, lhs, rhs, 3);
@@ -836,16 +1268,25 @@ float16x8_t test_vcmlaq_rot270_laneq_f16(float16x8_t acc, float16x8_t lhs, float
// CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_lane_f32(
// CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[__REINT_198:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: [[__REINT1_198:%.*]] = alloca <1 x i64>, align 8
-// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_198]],
align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_198]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64 +// CHECK-NEXT: [[__S2_198_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_198_SROA_0_0_VEC_INSERT]], i32 0 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGET_LANE]], i32 0 -// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_198]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_198]], align 8 -// CHECK-NEXT: [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]]) -// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT270_F323_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT270_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[VCMLA_ROT270_F32_I]], <2 x float> [[VCMLA_ROT270_F321_I]], <2 x float> [[VCMLA_ROT270_F322_I]]) +// CHECK-NEXT: [[VCMLA_ROT270_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT270_F323_I]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT270_F324_I]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32x2_t rhs) { return vcmla_rot270_lane_f32(acc, lhs, rhs, 0); @@ -855,16 +1296,24 @@ float32x2_t test_vcmla_rot270_lane_f32(float32x2_t acc, float32x2_t lhs, float32 // CHECK-LABEL: define dso_local <2 x float> @test_vcmla_rot270_laneq_f32( // CHECK-SAME: <2 x float> noundef [[ACC:%.*]], <2 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_202:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: [[__REINT1_202:%.*]] = alloca <1 x i64>, align 8 -// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_202]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_202]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <1 x i64> poison, i64 [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: store <1 x i64> [[VECINIT]], ptr [[__REINT1_202]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__REINT1_202]], align 8 -// CHECK-NEXT: [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[ACC]], <2 x float> [[LHS]], <2 x float> [[TMP1]]) -// CHECK-NEXT: ret <2 x float> [[VCMLA_ROT270_F323_I]] +// 
CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VECINIT]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[ACC]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VCMLA_ROT270_F32_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F321_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F322_I:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK-NEXT: [[VCMLA_ROT270_F323_I:%.*]] = call <2 x float> @llvm.aarch64.neon.vcmla.rot270.v2f32(<2 x float> [[VCMLA_ROT270_F32_I]], <2 x float> [[VCMLA_ROT270_F321_I]], <2 x float> [[VCMLA_ROT270_F322_I]]) +// CHECK-NEXT: [[VCMLA_ROT270_F324_I:%.*]] = bitcast <2 x float> [[VCMLA_ROT270_F323_I]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[VCMLA_ROT270_F324_I]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float32x4_t rhs) { return vcmla_rot270_laneq_f32(acc, lhs, rhs, 1); @@ -873,19 +1322,27 @@ float32x2_t test_vcmla_rot270_laneq_f32(float32x2_t acc, float32x2_t lhs, float3 // CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_lane_f32( // CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_200:%.*]] = alloca <2 x float>, align 8 -// CHECK-NEXT: [[__REINT1_200:%.*]] = alloca <2 x i64>, align 16 -// CHECK-NEXT: store <2 x float> [[RHS]], ptr [[__REINT_200]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__REINT_200]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[RHS]] to i64 +// CHECK-NEXT: [[__S2_200_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[__S2_200_SROA_0_0_VEC_INSERT]], i32 0 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[__REINT_200]], align 8 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE3]], i32 1 -// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_200]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_200]], align 16 -// CHECK-NEXT: [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]]) -// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT270_F323_I]] +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <1 x i64> [[__S2_200_SROA_0_0_VEC_INSERT]], i32 0 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = 
bitcast <4 x float> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT270_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT270_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT270_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[VCMLAQ_ROT270_F32_I]], <4 x float> [[VCMLAQ_ROT270_F321_I]], <4 x float> [[VCMLAQ_ROT270_F322_I]]) +// CHECK-NEXT: [[VCMLAQ_ROT270_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT270_F323_I]] to <16 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT270_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP9]] // float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float32x2_t rhs) { return vcmlaq_rot270_lane_f32(acc, lhs, rhs, 0); @@ -894,19 +1351,26 @@ float32x4_t test_vcmlaq_rot270_lane_f32(float32x4_t acc, float32x4_t lhs, float3 // CHECK-LABEL: define dso_local <4 x float> @test_vcmlaq_rot270_laneq_f32( // CHECK-SAME: <4 x float> noundef [[ACC:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[__REINT_204:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: [[__REINT1_204:%.*]] = alloca <2 x i64>, align 16 -// CHECK-NEXT: store <4 x float> [[RHS]], ptr [[__REINT_204]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__REINT_204]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[RHS]] to <2 x i64> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <2 x i64> poison, i64 [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[__REINT_204]], align 16 -// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE3]], i32 1 -// CHECK-NEXT: store <2 x i64> [[VECINIT5]], ptr [[__REINT1_204]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[__REINT1_204]], align 16 -// CHECK-NEXT: [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[ACC]], <4 x float> [[LHS]], <4 x float> [[TMP2]]) -// CHECK-NEXT: ret <4 x float> [[VCMLAQ_ROT270_F323_I]] +// CHECK-NEXT: [[VGETQ_LANE4:%.*]] = extractelement <2 x i64> [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <2 x i64> [[VECINIT]], i64 [[VGETQ_LANE4]], i32 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VECINIT6]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[ACC]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VCMLAQ_ROT270_F32_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[VCMLAQ_ROT270_F321_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> +// 
CHECK-NEXT: [[VCMLAQ_ROT270_F322_I:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT: [[VCMLAQ_ROT270_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcmla.rot270.v4f32(<4 x float> [[VCMLAQ_ROT270_F32_I]], <4 x float> [[VCMLAQ_ROT270_F321_I]], <4 x float> [[VCMLAQ_ROT270_F322_I]])
+// CHECK-NEXT: [[VCMLAQ_ROT270_F324_I:%.*]] = bitcast <4 x float> [[VCMLAQ_ROT270_F323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[VCMLAQ_ROT270_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
 //
 float32x4_t test_vcmlaq_rot270_laneq_f32(float32x4_t acc, float32x4_t lhs, float32x4_t rhs) {
   return vcmlaq_rot270_laneq_f32(acc, lhs, rhs, 1);
diff --git a/clang/test/CodeGen/AArch64/poly-add.c b/clang/test/CodeGen/AArch64/poly-add.c
index 0795aecac433f..069df72f87deb 100644
--- a/clang/test/CodeGen/AArch64/poly-add.c
+++ b/clang/test/CodeGen/AArch64/poly-add.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
diff --git a/clang/test/CodeGen/AArch64/poly128.c b/clang/test/CodeGen/AArch64/poly128.c
index f188632468fc8..a9df831c07cb6 100644
--- a/clang/test/CodeGen/AArch64/poly128.c
+++ b/clang/test/CodeGen/AArch64/poly128.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
@@ -61,7 +61,7 @@ __attribute__((target("aes"))) poly128_t test_vmull_p64(poly64_t a, poly64_t b)
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vmull_high_p64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[SHUFFLE_I5:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> <i32 1>
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I5]] to i64
@@ -76,7 +76,7 @@ __attribute__((target("aes"))) poly128_t test_vmull_high_p64(poly64x2_t a, poly6
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -86,7 +86,7 @@ poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -96,7 +96,7 @@ poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -106,7 +106,7 @@ poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -116,7 +116,7 @@ poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -126,7 +126,7 @@ poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -136,7 +136,7 @@ poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -146,7 +146,7 @@ poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -156,7 +156,7 @@ poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -166,7 +166,7 @@ poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f64
-// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -176,7 +176,7 @@ poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -186,7 +186,7 @@ poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -196,7 +196,7 @@ poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -206,7 +206,7 @@ poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
 // CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -216,7 +216,7 @@ int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -226,7 +226,7 @@ int16x8_t test_vreinterpretq_s16_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
 // CHECK-NEXT: ret <4 x i32> [[TMP0]]
@@ -236,7 +236,7 @@ int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
 // CHECK-NEXT: ret <2 x i64> [[TMP0]]
@@ -246,7 +246,7 @@ int64x2_t test_vreinterpretq_s64_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
 // CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -256,7 +256,7 @@ uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -266,7 +266,7 @@ uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
 // CHECK-NEXT: ret <4 x i32> [[TMP0]]
@@ -276,7 +276,7 @@ uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
 // CHECK-NEXT: ret <2 x i64> [[TMP0]]
@@ -286,7 +286,7 @@ uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x float>
 // CHECK-NEXT: ret <4 x float> [[TMP0]]
@@ -296,7 +296,7 @@ float32x4_t test_vreinterpretq_f32_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x double>
 // CHECK-NEXT: ret <2 x double> [[TMP0]]
@@ -306,7 +306,7 @@ float64x2_t test_vreinterpretq_f64_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
 // CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -316,7 +316,7 @@ poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -326,7 +326,7 @@ poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) {
 }
 
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
 // CHECK-NEXT: ret <2 x i64> [[TMP0]]
diff --git a/clang/test/CodeGen/AArch64/poly64.c b/clang/test/CodeGen/AArch64/poly64.c
index f3c057ecf48c1..578dd2054dc66 100644
--- a/clang/test/CodeGen/AArch64/poly64.c
+++ b/clang/test/CodeGen/AArch64/poly64.c
@@ -1,537 +1,642 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -ffp-contract=fast -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -ffp-contract=fast -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vceq_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
   return vceq_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vceqq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x 
i64> [[SEXT_I]] +// uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) { return vceqq_p64(a, b); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vtst_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP4:%.*]] = and <1 x i64> %a, %b -// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> -// CHECK: ret <1 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VTST_I]] +// uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) { return vtst_p64(a, b); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtstq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP4:%.*]] = and <2 x i64> %a, %b -// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> -// CHECK: ret <2 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VTST_I]] +// uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) { return vtstq_p64(a, b); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vbsl_p64(<1 x i64> noundef %a, <1 x i64> noundef %b, <1 x i64> noundef %c) #0 { -// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %a, %b -// CHECK: [[TMP3:%.*]] = xor <1 x i64> %a, splat (i64 -1) -// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %c -// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <1 x i64> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[C]] to <8 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], splat (i64 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]] 
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]] +// poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) { return vbsl_p64(a, b, c); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vbslq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b, <2 x i64> noundef %c) #0 { -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %a, %b -// CHECK: [[TMP3:%.*]] = xor <2 x i64> %a, splat (i64 -1) -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %c -// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <2 x i64> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], splat (i64 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]] +// poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) { return vbslq_p64(a, b, c); } -// CHECK-LABEL: define{{.*}} i64 @test_vget_lane_p64(<1 x i64> noundef %v) #0 { -// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %v, i32 0 -// CHECK: ret i64 [[VGET_LANE]] +// CHECK-LABEL: define dso_local i64 @test_vget_lane_p64( +// CHECK-SAME: <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[V]], i32 0 +// CHECK-NEXT: ret i64 [[VGET_LANE]] +// poly64_t test_vget_lane_p64(poly64x1_t v) { return vget_lane_p64(v, 0); } -// CHECK-LABEL: define{{.*}} i64 @test_vgetq_lane_p64(<2 x i64> noundef %v) #0 { -// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> %v, i32 1 -// CHECK: ret i64 [[VGETQ_LANE]] +// CHECK-LABEL: define dso_local i64 @test_vgetq_lane_p64( +// CHECK-SAME: <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[V]], i32 1 +// CHECK-NEXT: ret i64 [[VGETQ_LANE]] +// poly64_t test_vgetq_lane_p64(poly64x2_t v) { return vgetq_lane_p64(v, 1); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vset_lane_p64(i64 noundef %a, <1 x i64> noundef %v) #0 { -// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %v, i64 %a, i32 0 -// CHECK: ret <1 x i64> [[VSET_LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vset_lane_p64( +// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[V]], i64 [[A]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]] +// poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) { return vset_lane_p64(a, v, 0); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vsetq_lane_p64(i64 noundef %a, <2 x i64> noundef %v) #0 { -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %v, i64 %a, i32 1 -// CHECK: ret <2 x i64> [[VSET_LANE]] +// 
CHECK-LABEL: define dso_local <2 x i64> @test_vsetq_lane_p64( +// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[V]], i64 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]] +// poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) { return vsetq_lane_p64(a, v, 1); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vcopy_lane_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %b, i32 0 -// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %a, i64 [[VGET_LANE]], i32 0 -// CHECK: ret <1 x i64> [[VSET_LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcopy_lane_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[B]], i32 0 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[A]], i64 [[VGET_LANE]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]] +// poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) { return vcopy_lane_p64(a, 0, b, 0); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcopyq_lane_p64(<2 x i64> noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %b, i32 0 -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %a, i64 [[VGET_LANE]], i32 1 -// CHECK: ret <2 x i64> [[VSET_LANE]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcopyq_lane_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[B]], i32 0 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[A]], i64 [[VGET_LANE]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]] +// poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) { return vcopyq_lane_p64(a, 1, b, 0); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> %b, i32 1 -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %a, i64 [[VGETQ_LANE]], i32 1 -// CHECK: ret <2 x i64> [[VSET_LANE]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcopyq_laneq_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[B]], i32 1 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[A]], i64 [[VGETQ_LANE]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]] +// poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) { return vcopyq_laneq_p64(a, 1, b, 1); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vcreate_p64(i64 noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcreate_p64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vcreate_p64(uint64_t a) { return vcreate_p64(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vdup_n_p64(i64 noundef %a) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0 -// CHECK: ret <1 x i64> [[VECINIT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vdup_n_p64( +// CHECK-SAME: 
i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VECINIT_I]] +// poly64x1_t test_vdup_n_p64(poly64_t a) { return vdup_n_p64(a); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_n_p64(i64 noundef %a) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 -// CHECK: ret <2 x i64> [[VECINIT1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_n_p64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]] +// poly64x2_t test_vdupq_n_p64(poly64_t a) { return vdupq_n_p64(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vmov_n_p64(i64 noundef %a) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0 -// CHECK: ret <1 x i64> [[VECINIT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vmov_n_p64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VECINIT_I]] +// poly64x1_t test_vmov_n_p64(poly64_t a) { return vmov_n_p64(a); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vmovq_n_p64(i64 noundef %a) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 -// CHECK: ret <2 x i64> [[VECINIT1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmovq_n_p64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]] +// poly64x2_t test_vmovq_n_p64(poly64_t a) { return vmovq_n_p64(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vdup_lane_p64(<1 x i64> noundef %vec) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vdup_lane_p64( +// CHECK-SAME: <1 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// poly64x1_t test_vdup_lane_p64(poly64x1_t vec) { return vdup_lane_p64(vec, 0); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_lane_p64(<1 x i64> noundef %vec) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i64> 
@test_vdupq_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
 poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
   return vdupq_lane_p64(vec, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_laneq_p64(<2 x i64> noundef %vec) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_laneq_p64(
+// CHECK-SAME: <2 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
 poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
   return vdupq_laneq_p64(vec, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcombine_p64(<1 x i64> noundef %low, <1 x i64> noundef %high) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcombine_p64(
+// CHECK-SAME: <1 x i64> noundef [[LOW:%.*]], <1 x i64> noundef [[HIGH:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[LOW]], <1 x i64> [[HIGH]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
   return vcombine_p64(low, high);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_p64(ptr noundef %ptr) #0 {
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %ptr
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[PTR]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 poly64x1_t test_vld1_p64(poly64_t const * ptr) {
   return vld1_p64(ptr);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %ptr
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
   return vld1q_p64(ptr);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_p64(ptr noundef %ptr, <1 x i64> noundef %val) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %ptr
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <1 x i64> noundef [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[VAL]] to <8 x i8>
+// 
CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: store <1 x i64> [[TMP1]], ptr [[PTR]], align 8 +// CHECK-NEXT: ret void +// void test_vst1_p64(poly64_t * ptr, poly64x1_t val) { return vst1_p64(ptr, val); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_p64(ptr noundef %ptr, <2 x i64> noundef %val) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: store <2 x i64> [[TMP3]], ptr %ptr -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_p64( +// CHECK-SAME: ptr noundef [[PTR:%.*]], <2 x i64> noundef [[VAL:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VAL]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[PTR]], align 8 +// CHECK-NEXT: ret void +// void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) { return vst1q_p64(ptr, val); } -// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_p64(ptr noundef %ptr) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %ptr) -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_p64( +// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[PTR]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T:%.*]] poison, <1 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[DOTFCA_0_1_INSERT]] +// poly64x1x2_t test_vld2_p64(poly64_t const * ptr) { return vld2_p64(ptr); } -// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_p64(ptr noundef %ptr) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %ptr) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_p64( +// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]]) +// CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0 +// CHECK-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x 
i64> } [[VLD2]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T:%.*]] poison, <2 x i64> [[VLD2_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD2_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[DOTFCA_0_1_INSERT]]
+//
 poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
   return vld2q_p64(ptr);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T:%.*]] poison, <1 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
   return vld3_p64(ptr);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T:%.*]] poison, <2 x i64> [[VLD3_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD3_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD3_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[DOTFCA_0_2_INSERT]]
+//
 poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
   return vld3q_p64(ptr);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T:%.*]] poison, <1 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_0_INSERT]], <1 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_1_INSERT]], <1 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_2_INSERT]], <1 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
   return vld4_p64(ptr);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T:%.*]] poison, <2 x i64> [[VLD4_FCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_0_INSERT]], <2 x i64> [[VLD4_FCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_1_INSERT]], <2 x i64> [[VLD4_FCA_2_EXTRACT]], 0, 2
+// CHECK-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_2_INSERT]], <2 x i64> [[VLD4_FCA_3_EXTRACT]], 0, 3
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[DOTFCA_0_3_INSERT]]
+//
 poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
   return vld4q_p64(ptr);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_p64(ptr noundef %ptr, [2 x <1 x i64>] alignstack(8) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 16, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
   return vst2_p64(ptr, val);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_p64(ptr noundef %ptr, [2 x <2 x i64>] alignstack(16) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 32, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
   return vst2q_p64(ptr, val);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_p64(ptr noundef %ptr, [3 x <1 x i64>] alignstack(8) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 24, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
   return vst3_p64(ptr, val);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_p64(ptr noundef %ptr, [3 x <2 x i64>] alignstack(16) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 48, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
   return vst3q_p64(ptr, val);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_p64(ptr noundef %ptr, [4 x <1 x i64>] alignstack(8) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 32, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL6:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL6]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX7]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
   return vst4_p64(ptr, val);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_p64(ptr noundef %ptr, [4 x <2 x i64>] alignstack(16) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 64, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL4]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL6:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL6]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX7]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 2
+// CHECK-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[VAL_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) {
   return vst4q_p64(ptr, val);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vext_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[VEXT]]
+//
 poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) {
   return vext_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
 poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) {
   return vextq_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vzip1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) {
   return vzip1q_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vzip2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) {
   return vzip2q_u64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vuzp1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) {
   return vuzp1q_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vuzp2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) {
   return vuzp2q_u64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtrn1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) {
   return vtrn1q_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtrn2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) {
   return vtrn2q_u64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vsri_n_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
-// CHECK: ret <1 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
+// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]]
+//
 poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) {
   return vsri_n_p64(a, b, 33);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vsriq_n_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
-// CHECK: ret <2 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
+// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]]
+//
 poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
   return vsriq_n_p64(a, b, 64);
 }
diff --git a/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c
index bc985efa6bc99..3317db1bf5af6 100644
--- a/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN: -target-feature +v8.1a -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,dce -S | FileCheck %s
+// RUN: -target-feature +v8.1a -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,sroa,dce -S | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
@@ -11,8 +11,16 @@
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMLAH_S16_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMLAH_S161_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMLAH_S162_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[VQRDMLAH_S16_I]], <4 x i16> [[VQRDMLAH_S161_I]], <4 x i16> [[VQRDMLAH_S162_I]])
+// CHECK-NEXT: [[VQRDMLAH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLAH_S163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S164_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP5]]
 //
 int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
   return vqrdmlah_laneq_s16(a, b, v, 7);
@@ -23,8 +31,16 @@ int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMLAH_S32_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMLAH_S321_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMLAH_S322_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[VQRDMLAH_S32_I]], <2 x i32> [[VQRDMLAH_S321_I]], <2 x i32> [[VQRDMLAH_S322_I]])
+// CHECK-NEXT: [[VQRDMLAH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLAH_S323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S324_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP5]]
 //
 int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
   return vqrdmlah_laneq_s32(a, b, v, 3);
@@ -35,8 +51,16 @@ int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
-// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMLAHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMLAHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMLAHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[VQRDMLAHQ_S16_I]], <8 x i16> [[VQRDMLAHQ_S161_I]], <8 x i16> [[VQRDMLAHQ_S162_I]])
+// CHECK-NEXT: [[VQRDMLAHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLAHQ_S163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S164_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP5]]
 //
 int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
   return vqrdmlahq_laneq_s16(a, b, v, 7);
@@ -47,8 +71,16 @@ int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMLAHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMLAHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMLAHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[VQRDMLAHQ_S32_I]], <4 x i32> [[VQRDMLAHQ_S321_I]], <4 x i32> [[VQRDMLAHQ_S322_I]])
+// CHECK-NEXT: [[VQRDMLAHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLAHQ_S323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S324_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP5]]
 //
 int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
   return vqrdmlahq_laneq_s32(a, b, v, 3);
@@ -129,8 +161,16 @@ int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMLSH_S16_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMLSH_S161_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMLSH_S162_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[VQRDMLSH_S16_I]], <4 x i16> [[VQRDMLSH_S161_I]], <4 x i16> [[VQRDMLSH_S162_I]])
+// CHECK-NEXT: [[VQRDMLSH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLSH_S163_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S164_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP5]]
 //
 int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
   return vqrdmlsh_laneq_s16(a, b, v, 7);
@@ -141,8 +181,16 @@ int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMLSH_S32_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMLSH_S321_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMLSH_S322_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[VQRDMLSH_S32_I]], <2 x i32> [[VQRDMLSH_S321_I]], <2 x i32> [[VQRDMLSH_S322_I]])
+// CHECK-NEXT: [[VQRDMLSH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLSH_S323_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S324_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP5]]
 //
 int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
   return vqrdmlsh_laneq_s32(a, b, v, 3);
@@ -153,8 +201,16 @@ int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMLSHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMLSHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMLSHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[VQRDMLSHQ_S16_I]], <8 x i16> [[VQRDMLSHQ_S161_I]], <8 x i16> [[VQRDMLSHQ_S162_I]])
+// CHECK-NEXT: [[VQRDMLSHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLSHQ_S163_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S164_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP5]]
 //
 int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
   return vqrdmlshq_laneq_s16(a, b, v, 7);
@@ -165,8 +221,16 @@ int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
 // CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMLSHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMLSHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMLSHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[VQRDMLSHQ_S32_I]], <4 x i32> [[VQRDMLSHQ_S321_I]], <4 x i32> [[VQRDMLSHQ_S322_I]])
+// CHECK-NEXT: [[VQRDMLSHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLSHQ_S323_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S324_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP5]]
 //
 int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
   return vqrdmlshq_laneq_s32(a, b, v, 3);
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
index b51e6f7e6e1ac..02ddbf2950829 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
@@ -1,21 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
 // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck --check-prefix=UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
 // RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
 // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED --implicit-check-not=fpexcept.maytrap %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
-// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM --implicit-check-not=fpexcept.maytrap %s
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck --check-prefix=CONSTRAINED --implicit-check-not=fpexcept.maytrap %s
 
 // REQUIRES: aarch64-registered-target
@@ -29,310 +21,754 @@
 
 #include <arm_neon.h>
 
-// COMMON-LABEL: test_vsqrt_f16
-// UNCONSTRAINED: [[SQR:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
-// CONSTRAINED: [[SQR:%.*]] = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[SQR]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vsqrt_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[TMP2]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vsqrt_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> [[TMP2]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret <4 x half> [[VSQRT_I]]
+//
 float16x4_t test_vsqrt_f16(float16x4_t a) {
   return vsqrt_f16(a);
 }
 
-// COMMON-LABEL: test_vsqrtq_f16
-// UNCONSTRAINED: [[SQR:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
-// CONSTRAINED: [[SQR:%.*]] = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[SQR]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vsqrtq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[TMP2]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vsqrtq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> [[TMP2]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[VSQRT_I]]
+//
 float16x8_t test_vsqrtq_f16(float16x8_t a) {
   return vsqrtq_f16(a);
 }
 
-// COMMON-LABEL: test_vfma_f16
-// UNCONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP9]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP9]]
+//
 float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfma_f16(a, b, c);
 }
 
-// COMMON-LABEL: test_vfmaq_f16
-// UNCONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP9]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP9]]
+//
 float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmaq_f16(a, b, c);
 }
 
-// COMMON-LABEL: test_vfms_f16
-// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// UNCONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP9]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP9]]
+//
 float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfms_f16(a, b, c);
 }
 
-// COMMON-LABEL: test_vfmsq_f16
-// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// UNCONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP9]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <8 x i16>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP9]]
+//
 float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmsq_f16(a, b, c);
 }
 
-// COMMON-LABEL: test_vfma_lane_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32>
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
-// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_lane_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <4 x i32>
+// UNCONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_lane_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <4 x i32>
+// CONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
 float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfma_lane_f16(a, b, c, 3);
 }
 
-// COMMON-LABEL: test_vfmaq_lane_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32>
-// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
-// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_lane_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <8 x
i32> +// UNCONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// UNCONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[FMLA2]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_lane_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <8 x i32> +// CONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[FMLA2]] +// float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { return vfmaq_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfma_laneq_f16 -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> -// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]]) -// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_laneq_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// 
UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_laneq_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <4 x i32> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { return vfma_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfmaq_laneq_f16 -// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> -// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) -// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_laneq_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to 
<16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_laneq_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfma_n_f16 -// COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3 -// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_n_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0 +// UNCONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1 +// UNCONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2 +// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = 
bitcast <4 x half> [[VECINIT3]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_n_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0 +// CONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1 +// CONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2 +// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfma_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmaq_n_f16 -// COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3 -// COMMONIR: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4 -// COMMONIR: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5 -// COMMONIR: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6 -// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_n_f16( +// 
UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0 +// UNCONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1 +// UNCONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2 +// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3 +// UNCONSTRAINED-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4 +// UNCONSTRAINED-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5 +// UNCONSTRAINED-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6 +// UNCONSTRAINED-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_n_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0 +// CONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1 +// CONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2 +// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3 +// CONSTRAINED-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4 +// CONSTRAINED-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5 +// CONSTRAINED-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6 +// CONSTRAINED-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// 
CONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmaq_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmah_lane_f16 -// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3 -// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret half [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local half @test_vfmah_lane_f16( +// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]]) +// UNCONSTRAINED-NEXT: ret half [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local half @test_vfmah_lane_f16( +// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[B]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret half [[TMP0]] +// float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) { return vfmah_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfmah_laneq_f16 -// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret half [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local half @test_vfmah_laneq_f16( +// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]]) +// UNCONSTRAINED-NEXT: ret half [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local half @test_vfmah_laneq_f16( +// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[B]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret half [[TMP0]] +// float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) { return vfmah_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: 
test_vfms_lane_f16 -// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> -// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]]) -// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_lane_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <4 x i32> +// UNCONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// UNCONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[FMLA2]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_lane_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <4 x i32> +// CONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> 
[[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[FMLA2]] +// float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfms_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfmsq_lane_f16 -// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> -// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]]) -// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_lane_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <8 x i32> +// UNCONSTRAINED-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// UNCONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[FMLA2]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_lane_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <8 x i32> +// CONSTRAINED-NEXT: 
[[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CONSTRAINED-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[FMLA2]] +// float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { return vfmsq_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfms_laneq_f16 -// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b -// CHECK-ASM-NOT: fneg -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> -// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]]) -// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_laneq_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_laneq_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] 
to <8 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <4 x i32> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { return vfms_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfmsq_laneq_f16 -// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// CHECK-ASM-NOT: fneg -// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> -// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) -// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_laneq_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_laneq_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> 
[[B]] +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfms_n_f16 -// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b -// COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3 -// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_n_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0 +// UNCONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1 +// UNCONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2 +// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <4 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// +// 
CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_n_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0 +// CONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1 +// CONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2 +// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <4 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP9]] +// float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfms_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmsq_n_f16 -// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3 -// COMMONIR: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4 -// COMMONIR: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5 -// COMMONIR: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6 -// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_n_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0 +// UNCONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1 +// UNCONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2 +// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], 
half [[C]], i32 3 +// UNCONSTRAINED-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4 +// UNCONSTRAINED-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5 +// UNCONSTRAINED-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6 +// UNCONSTRAINED-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <8 x i16> +// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// UNCONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// UNCONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_n_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0 +// CONSTRAINED-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1 +// CONSTRAINED-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2 +// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3 +// CONSTRAINED-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4 +// CONSTRAINED-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5 +// CONSTRAINED-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6 +// CONSTRAINED-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <8 x i16> +// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CONSTRAINED-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CONSTRAINED-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP9]] +// float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmsq_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmsh_lane_f16 -// UNCONSTRAINED: [[TMP0:%.*]] = fpext half %b to 
float -// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") -// CHECK-ASM: fcvt s{{[0-9]+}}, h{{[0-9]+}} -// COMMONIR: [[TMP1:%.*]] = fneg float [[TMP0]] -// CHECK-ASM: fneg s{{[0-9]+}}, s{{[0-9]+}} -// UNCONSTRAINED: [[SUB:%.*]] = fptrunc float [[TMP1]] to half -// CONSTRAINED: [[SUB:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fcvt h{{[0-9]+}}, s{{[0-9]+}} -// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3 -// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret half [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local half @test_vfmsh_lane_f16( +// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[CONV:%.*]] = fpext half [[B]] to float +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]]) +// UNCONSTRAINED-NEXT: ret half [[TMP1]] +// +// CONSTRAINED-LABEL: define dso_local half @test_vfmsh_lane_f16( +// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CONV:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half [[B]], metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[FNEG]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]] +// CONSTRAINED-NEXT: ret half [[TMP1]] +// float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) { return vfmsh_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfmsh_laneq_f16 -// UNCONSTRAINED: [[TMP0:%.*]] = fpext half %b to float -// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") -// CHECK-ASM: fcvt s{{[0-9]+}}, h{{[0-9]+}} -// COMMONIR: [[TMP1:%.*]] = fneg float [[TMP0]] -// CHECK-ASM: fneg s{{[0-9]+}}, s{{[0-9]+}} -// UNCONSTRAINED: [[SUB:%.*]] = fptrunc float [[TMP1]] to half -// CONSTRAINED: [[SUB:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fcvt h{{[0-9]+}}, s{{[0-9]+}} -// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half 
[[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret half [[FMA]]
+// UNCONSTRAINED-LABEL: define dso_local half @test_vfmsh_laneq_f16(
+// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CONV:%.*]] = fpext half [[B]] to float
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]])
+// UNCONSTRAINED-NEXT: ret half [[TMP1]]
+//
+// CONSTRAINED-LABEL: define dso_local half @test_vfmsh_laneq_f16(
+// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CONV:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half [[B]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[FNEG]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.maytrap") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret half [[TMP1]]
+//
 float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
   return vfmsh_laneq_f16(a, b, c, 7);
 }
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
index 4d2ef318005bd..8c719178d7241 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c
@@ -1,11 +1,11 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature -fullfp16 -target-feature +v8a\
 // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
 // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
@@ -15,17 +15,20 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vbsl_f16
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[VBSL1_I]]
-// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[C]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i16> [[VBSL_I]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP5]], [[VBSL2_I]]
 // CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
-// CHECK-NEXT: ret <4 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP6]]
 //
 float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
   return vbsl_f16(a, b, c);
@@ -34,17 +37,20 @@ float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
 // CHECK-LABEL: define {{[^@]+}}@test_vbslq_f16
 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[VBSL1_I]]
-// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
-// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[C]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i16> [[VBSL_I]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP5]], [[VBSL2_I]]
 // CHECK-NEXT: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
-// CHECK-NEXT: ret <8 x half> [[TMP4]]
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP6]]
 //
 float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
   return vbslq_f16(a, b, c);
@@ -53,21 +59,22 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
 // CHECK-LABEL: define {{[^@]+}}@test_vzip_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK-NEXT: store <4 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, <4 x half> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT3]], <4 x half> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] poison, <4 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
 //
 float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
   return vzip_f16(a, b);
@@ -76,21 +83,22 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vzipq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK-NEXT: store <8 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, <8 x half> [[VZIP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT3]], <8 x half> [[VZIP1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, <8 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
 //
 float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
   return vzipq_f16(a, b);
@@ -99,21 +107,22 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vuzp_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK-NEXT: store <4 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, <4 x half> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT3]], <4 x half> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] poison, <4 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
 //
 float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
   return vuzp_f16(a, b);
@@ -122,21 +131,22 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vuzpq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK-NEXT: store <8 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, <8 x half> [[VUZP_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT3]], <8 x half> [[VUZP1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, <8 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
 //
 float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
   return vuzpq_f16(a, b);
@@ -145,21 +155,22 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vtrn_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK-NEXT: store <4 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, <4 x half> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT3]], <4 x half> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] poison, <4 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
 //
 float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
   return vtrn_f16(a, b);
@@ -168,21 +179,22 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vtrnq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
-// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK-NEXT: store <8 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1
-// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 16
-// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0
-// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0
-// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16
-// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT3:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, <8 x half> [[VTRN_I]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT4:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT3]], <8 x half> [[VTRN1_I]], 0, 1
+// CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT4]], 0
+// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP6]], 0
+// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[TMP6]], 1
+// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, <8 x half> [[DOTFCA_0_EXTRACT]], 0, 0
+// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x half> [[DOTFCA_1_EXTRACT]], 0, 1
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
 //
 float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
   return vtrnq_f16(a, b);
@@ -251,9 +263,10 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vdup_lane_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: ret <4 x half> [[LANE]]
 //
 float16x4_t test_vdup_lane_f16(float16x4_t a) {
@@ -263,9 +276,10 @@ float16x4_t test_vdup_lane_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vdupq_lane_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
 // CHECK-NEXT: ret <8 x half> [[LANE]]
 //
 float16x8_t test_vdupq_lane_f16(float16x4_t a) {
@@ -275,9 +289,10 @@ float16x8_t test_vdupq_lane_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vdup_laneq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT: ret <4 x half> [[LANE]]
 //
 float16x4_t test_vdup_laneq_f16(float16x8_t a) {
@@ -287,9 +302,10 @@ float16x4_t test_vdup_laneq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vdupq_laneq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 // CHECK-NEXT: ret <8 x half> [[LANE]]
 //
 float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
@@ -299,11 +315,13 @@ float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vext_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
 // CHECK-NEXT: ret <4 x half> [[VEXT]]
 //
 float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
@@ -313,11 +331,13 @@ float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vextq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
 // CHECK-NEXT: ret <8 x half> [[VEXT]]
 //
 float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 1cce977b60e6b..9c408e8c702fd 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1,7 +1,7 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
 // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,sroa \
 // RUN: | FileCheck %s
 
 // REQUIRES: aarch64-registered-target
@@ -11,8 +11,10 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vabs_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[VABS_I]])
 // CHECK-NEXT: ret <4 x half> [[VABS1_I]]
 //
 float16x4_t test_vabs_f16(float16x4_t a) {
@@ -22,8 +24,10 @@ float16x4_t test_vabs_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vabsq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[VABS_I]])
 // CHECK-NEXT: ret <8 x half> [[VABS1_I]]
 //
 float16x8_t test_vabsq_f16(float16x8_t a) {
@@ -33,9 +37,11 @@ float16x8_t test_vabsq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vceqz_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <4 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
 // CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
 //
 uint16x4_t test_vceqz_f16(float16x4_t a) {
@@ -45,9 +51,11 @@ uint16x4_t test_vceqz_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vceqzq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <8 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
 //
 uint16x8_t test_vceqzq_f16(float16x8_t a) {
@@ -57,9 +65,11 @@ uint16x8_t test_vceqzq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcgez_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <4 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
 // CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]]
 //
 uint16x4_t test_vcgez_f16(float16x4_t a) {
@@ -69,9 +79,11 @@ uint16x4_t test_vcgez_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcgezq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <8 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]]
 //
 uint16x8_t test_vcgezq_f16(float16x8_t a) {
@@ -81,9 +93,11 @@ uint16x8_t test_vcgezq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcgtz_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
 // CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]]
 //
 uint16x4_t test_vcgtz_f16(float16x4_t a) {
@@ -93,9 +107,11 @@ uint16x4_t test_vcgtz_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcgtzq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <8 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]]
 //
 uint16x8_t test_vcgtzq_f16(float16x8_t a) {
@@ -105,9 +121,11 @@ uint16x8_t test_vcgtzq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vclez_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp ole <4 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
 // CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]]
 //
 uint16x4_t test_vclez_f16(float16x4_t a) {
@@ -117,9 +135,11 @@ uint16x4_t test_vclez_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vclezq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp ole <8 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]]
 //
 uint16x8_t test_vclezq_f16(float16x8_t a) {
@@ -129,9 +149,11 @@ uint16x8_t test_vclezq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcltz_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
 // CHECK-NEXT: ret <4 x i16> [[VCLTZ_I]]
 //
 uint16x4_t test_vcltz_f16(float16x4_t a) {
@@ -141,9 +163,11 @@ uint16x4_t test_vcltz_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcltzq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <8 x half> [[TMP2]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[VCLTZ_I]]
 //
 uint16x8_t test_vcltzq_f16(float16x8_t a) {
@@ -154,7 +178,8 @@ uint16x8_t test_vcltzq_f16(float16x8_t a) {
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x half>
 // CHECK-NEXT: ret <4 x half> [[VCVT_I]]
 //
 float16x4_t test_vcvt_f16_s16 (int16x4_t a) {
@@ -165,7 +190,8 @@ float16x4_t test_vcvt_f16_s16 (int16x4_t a) {
 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x half>
 // CHECK-NEXT: ret <8 x half> [[VCVT_I]]
 //
 float16x8_t test_vcvtq_f16_s16 (int16x8_t a) {
@@ -176,7 +202,8 @@ float16x8_t test_vcvtq_f16_s16 (int16x8_t a) {
 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x half>
 // CHECK-NEXT: ret <4 x half> [[VCVT_I]]
 //
 float16x4_t test_vcvt_f16_u16 (uint16x4_t a) {
@@ -187,7 +214,8 @@ float16x4_t test_vcvt_f16_u16 (uint16x4_t a) {
 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x half>
 // CHECK-NEXT: ret <8 x half> [[VCVT_I]]
 //
 float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
@@ -197,8 +225,10 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvt_s16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[VCVTZ_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
 //
 int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
@@ -208,8 +238,10 @@ int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtq_s16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[VCVTZ_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
 //
 int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
@@ -219,8 +251,10 @@ int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvt_u16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[VCVTZ_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
 //
 uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
@@ -230,8 +264,10 @@ uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtq_u16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[VCVTZ_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
 //
 uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
@@ -241,8 +277,10 @@ uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvta_s16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[VCVTA_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
 //
 int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
@@ -252,8 +290,10 @@ int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvta_u16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[VCVTA_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
 //
 uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
@@ -263,8 +303,10 @@ uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[VCVTA_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTA1_I]]
 //
 int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
@@ -274,8 +316,10 @@ int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[VCVTM_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
 //
 int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
@@ -285,8 +329,10 @@ int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[VCVTM_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
 //
 int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
@@ -296,8 +342,10 @@ int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[VCVTM_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
 //
 uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
@@ -307,8 +355,10 @@ uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[VCVTM_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
 //
 uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
@@ -318,8 +368,10 @@ uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[VCVTN_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
 //
 int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
@@ -329,8 +381,10 @@ int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[VCVTN_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
 //
 int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
@@ -340,8 +394,10 @@ int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[VCVTN_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
 //
 uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
@@ -351,8 +407,10 @@ uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[VCVTN_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
 //
 uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
@@ -362,8 +420,10 @@ uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[VCVTP_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
 //
 int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
@@ -373,8 +433,10 @@ int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[VCVTP_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
 //
 int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
@@ -384,8 +446,10 @@ int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u16_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[VCVTP_I]])
 // CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
 //
 uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
@@ -395,8 +459,10 @@ uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u16_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[VCVTP_I]])
 // CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
 //
 uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) {
@@ -427,8 +493,10 @@ float16x8_t test_vnegq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrecpe_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[VRECPE_V_I]])
 // CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]]
 //
 float16x4_t test_vrecpe_f16(float16x4_t a) {
@@ -438,8 +506,10 @@ float16x4_t test_vrecpe_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrecpeq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[VRECPEQ_V_I]])
 // CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]]
 //
 float16x8_t test_vrecpeq_f16(float16x8_t a) {
@@ -449,8 +519,10 @@ float16x8_t test_vrecpeq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrnd_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[VRNDZ_I]])
 // CHECK-NEXT: ret <4 x half> [[VRNDZ1_I]]
 //
 float16x4_t test_vrnd_f16(float16x4_t a) {
@@ -460,8 +532,10 @@ float16x4_t test_vrnd_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[VRNDZ_I]])
 // CHECK-NEXT: ret <8 x half> [[VRNDZ1_I]]
 //
 float16x8_t test_vrndq_f16(float16x8_t a) {
@@ -471,8 +545,10 @@ float16x8_t test_vrndq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrnda_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[VRNDA_I]])
 // CHECK-NEXT: ret <4 x half> [[VRNDA1_I]]
 //
 float16x4_t test_vrnda_f16(float16x4_t a) {
@@ -482,8 +558,10 @@ float16x4_t test_vrnda_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndaq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[VRNDA_I]])
 // CHECK-NEXT: ret <8 x half> [[VRNDA1_I]]
 //
 float16x8_t test_vrndaq_f16(float16x8_t a) {
@@ -493,8 +571,10 @@ float16x8_t test_vrndaq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndi_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRNDI_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[VRNDI_V_I]])
 // CHECK-NEXT: ret <4 x half> [[VRNDI_V1_I]]
 //
 float16x4_t test_vrndi_f16(float16x4_t a) {
@@ -504,8 +584,10 @@ float16x4_t test_vrndi_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndiq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRNDIQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[VRNDIQ_V_I]])
 // CHECK-NEXT: ret <8 x half> [[VRNDIQ_V1_I]]
 //
 float16x8_t test_vrndiq_f16(float16x8_t a) {
@@ -515,8 +597,10 @@ float16x8_t test_vrndiq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndm_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[VRNDM_I]])
 // CHECK-NEXT: ret <4 x half> [[VRNDM1_I]]
 //
 float16x4_t test_vrndm_f16(float16x4_t a) {
@@ -526,8 +610,10 @@ float16x4_t test_vrndm_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndmq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[VRNDM_I]])
 // CHECK-NEXT: ret <8 x half> [[VRNDM1_I]]
 //
 float16x8_t test_vrndmq_f16(float16x8_t a) {
@@ -537,8 +623,10 @@ float16x8_t test_vrndmq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndn_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[VRNDN_I]])
 // CHECK-NEXT: ret <4 x half> [[VRNDN1_I]]
 //
 float16x4_t test_vrndn_f16(float16x4_t a) {
@@ -548,8 +636,10 @@ float16x4_t test_vrndn_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndnq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[VRNDN_I]])
 // CHECK-NEXT: ret <8 x half> [[VRNDN1_I]]
 //
 float16x8_t test_vrndnq_f16(float16x8_t a) {
@@ -559,8 +649,10 @@ float16x8_t test_vrndnq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndp_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[VRNDP_I]])
 // CHECK-NEXT: ret <4 x half> [[VRNDP1_I]]
 //
 float16x4_t test_vrndp_f16(float16x4_t a) {
@@ -570,8 +662,10 @@ float16x4_t test_vrndp_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndpq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[VRNDP_I]])
 // CHECK-NEXT: ret <8 x half> [[VRNDP1_I]]
 //
 float16x8_t test_vrndpq_f16(float16x8_t a) {
@@ -581,8 +675,10 @@ float16x8_t test_vrndpq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndx_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[VRNDX_I]])
 // CHECK-NEXT: ret <4 x half> [[VRNDX1_I]]
 //
 float16x4_t test_vrndx_f16(float16x4_t a) {
@@ -592,8 +688,10 @@ float16x4_t test_vrndx_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrndxq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[VRNDX_I]])
 // CHECK-NEXT: ret <8 x half> [[VRNDX1_I]]
 //
 float16x8_t test_vrndxq_f16(float16x8_t a) {
@@ -603,8 +701,10 @@ float16x8_t test_vrndxq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrsqrte_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[VRSQRTE_V_I]])
 // CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]]
 //
 float16x4_t test_vrsqrte_f16(float16x4_t a) {
@@ -614,8 +714,10 @@ float16x4_t test_vrsqrte_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vrsqrteq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[VRSQRTEQ_V_I]])
 // CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]]
 //
 float16x8_t test_vrsqrteq_f16(float16x8_t a) {
@@ -625,8 +727,10 @@ float16x8_t test_vrsqrteq_f16(float16x8_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vsqrt_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[TMP2]])
 // CHECK-NEXT: ret <4 x half> [[VSQRT_I]]
 //
 float16x4_t test_vsqrt_f16(float16x4_t a) {
@@ -636,8 +740,10 @@ float16x4_t test_vsqrt_f16(float16x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vsqrtq_f16
 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[TMP2]])
 // CHECK-NEXT: ret <8 x half> [[VSQRT_I]]
 //
 float16x8_t test_vsqrtq_f16(float16x8_t a) {
@@ -667,9 +773,13 @@ float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vabd_f16
 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] =
bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[VABD_I]], <4 x half> [[VABD1_I]]) // CHECK-NEXT: ret <4 x half> [[VABD2_I]] // float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) { @@ -679,9 +789,13 @@ float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vabdq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[VABD_I]], <8 x half> [[VABD1_I]]) // CHECK-NEXT: ret <8 x half> [[VABD2_I]] // float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) { @@ -691,9 +805,13 @@ float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcage_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[VCAGE_V_I]], <4 x half> [[VCAGE_V1_I]]) // CHECK-NEXT: ret <4 x i16> [[VCAGE_V2_I]] // uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) { @@ -703,9 +821,13 @@ uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcageq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[VCAGEQ_V_I]], <8 x half> [[VCAGEQ_V1_I]]) // CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]] // uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) { @@ -715,9 +837,13 @@ uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcagt_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[VCAGT_V_I]], <4 x half> [[VCAGT_V1_I]]) // CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]] // uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) { @@ -727,9 +853,13 @@ uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcagtq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[VCAGTQ_V_I]], <8 x half> [[VCAGTQ_V1_I]]) // CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]] // uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) { @@ -739,9 +869,13 @@ uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcale_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[VCALE_V_I]], <4 x half> [[VCALE_V1_I]]) // CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]] // uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) { @@ -751,9 +885,13 @@ uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcaleq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[VCALEQ_V_I]], <8 x half> [[VCALEQ_V1_I]]) // CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]] // uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) { @@ -763,9 +901,13 @@ uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcalt_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[VCALT_V_I]], <4 x half> [[VCALT_V1_I]]) // CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]] // uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) { @@ -775,9 +917,13 @@ uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vcaltq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[VCALTQ_V_I]], <8 x half> [[VCALTQ_V1_I]]) // CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]] // uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) { @@ -945,8 +1091,9 @@ float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_s16_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> // CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2) // CHECK-NEXT: ret <4 x i16> [[VCVT_N1]] // @@ -957,8 +1104,9 @@ int16x4_t test_vcvt_n_s16_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_s16_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2) // CHECK-NEXT: ret <8 x i16> [[VCVT_N1]] // @@ -969,8 +1117,9 @@ int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_u16_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> // CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2) // CHECK-NEXT: ret <4 x i16> [[VCVT_N1]] // @@ -981,8 +1130,9 @@ uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_u16_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2) // CHECK-NEXT: ret <8 x i16> [[VCVT_N1]] // @@ -1013,9 +1163,13 @@ float16x8_t test_vdivq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmax_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> 
noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[VMAX_I]], <4 x half> [[VMAX1_I]]) // CHECK-NEXT: ret <4 x half> [[VMAX2_I]] // float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) { @@ -1025,9 +1179,13 @@ float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[VMAX_I]], <8 x half> [[VMAX1_I]]) // CHECK-NEXT: ret <8 x half> [[VMAX2_I]] // float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) { @@ -1037,9 +1195,13 @@ float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[VMAXNM_I]], <4 x half> [[VMAXNM1_I]]) // CHECK-NEXT: ret <4 x half> [[VMAXNM2_I]] // float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) { @@ -1049,9 +1211,13 @@ float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[VMAXNM_I]], <8 x half> [[VMAXNM1_I]]) // CHECK-NEXT: ret <8 x half> [[VMAXNM2_I]] // float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) { @@ -1061,9 +1227,13 @@ float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmin_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[VMIN_I]], <4 x half> [[VMIN1_I]]) // CHECK-NEXT: ret <4 x half> [[VMIN2_I]] // float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) { @@ -1073,9 +1243,13 @@ float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vminq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[VMIN_I]], <8 x half> [[VMIN1_I]]) // CHECK-NEXT: ret <8 x half> [[VMIN2_I]] // float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) { @@ -1085,9 +1259,13 @@ float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vminnm_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[VMINNM_I]], <4 x half> [[VMINNM1_I]]) // CHECK-NEXT: ret <4 x half> [[VMINNM2_I]] // float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) { @@ -1097,9 +1275,13 @@ float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[VMINNM_I]], <8 x half> [[VMINNM1_I]]) // CHECK-NEXT: ret <8 x half> [[VMINNM2_I]] // float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) { @@ -1129,9 +1311,13 @@ float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulx_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[VMULX_I]], <4 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <4 x half> [[VMULX2_I]] // float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) { @@ -1141,9 +1327,13 @@ float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x 
i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[VMULX_I]], <8 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <8 x half> [[VMULX2_I]] // float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) { @@ -1153,11 +1343,17 @@ float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpadd_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[VPADD_V_I]], <4 x half> [[VPADD_V1_I]]) // CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x half> [[VPADD_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <4 x half> [[VPADD_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] // float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) { return vpadd_f16(a, b); @@ -1166,11 +1362,17 @@ float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpaddq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[VPADDQ_V_I]], <8 x half> [[VPADDQ_V1_I]]) // CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <8 x half> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <8 x half> [[VPADDQ_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16> +// 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] // float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) { return vpaddq_f16(a, b); @@ -1179,9 +1381,13 @@ float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpmax_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[VPMAX_I]], <4 x half> [[VPMAX1_I]]) // CHECK-NEXT: ret <4 x half> [[VPMAX2_I]] // float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) { @@ -1191,9 +1397,13 @@ float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpmaxq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[VPMAX_I]], <8 x half> [[VPMAX1_I]]) // CHECK-NEXT: ret <8 x half> [[VPMAX2_I]] // float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) { @@ -1203,9 +1413,13 @@ float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpmaxnm_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> 
[[VPMAXNM_I]], <4 x half> [[VPMAXNM1_I]]) // CHECK-NEXT: ret <4 x half> [[VPMAXNM2_I]] // float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) { @@ -1215,9 +1429,13 @@ float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpmaxnmq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[VPMAXNM_I]], <8 x half> [[VPMAXNM1_I]]) // CHECK-NEXT: ret <8 x half> [[VPMAXNM2_I]] // float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) { @@ -1227,9 +1445,13 @@ float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpmin_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[VPMIN_I]], <4 x half> [[VPMIN1_I]]) // CHECK-NEXT: ret <4 x half> [[VPMIN2_I]] // float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) { @@ -1239,9 +1461,13 @@ float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpminq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[VPMIN_I]], <8 x half> 
[[VPMIN1_I]]) // CHECK-NEXT: ret <8 x half> [[VPMIN2_I]] // float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) { @@ -1251,9 +1477,13 @@ float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpminnm_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMINNM_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[VPMINNM_I]], <4 x half> [[VPMINNM1_I]]) // CHECK-NEXT: ret <4 x half> [[VPMINNM2_I]] // float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) { @@ -1263,9 +1493,13 @@ float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vpminnmq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[VPMINNM_I]], <8 x half> [[VPMINNM1_I]]) // CHECK-NEXT: ret <8 x half> [[VPMINNM2_I]] // float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) { @@ -1275,11 +1509,17 @@ float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vrecps_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[VRECPS_V_I]], <4 x half> 
[[VRECPS_V1_I]]) // CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <4 x half> [[VRECPS_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <4 x half> [[VRECPS_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] // float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) { return vrecps_f16(a, b); @@ -1288,11 +1528,17 @@ float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vrecpsq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[VRECPSQ_V_I]], <8 x half> [[VRECPSQ_V1_I]]) // CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <8 x half> [[VRECPSQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <8 x half> [[VRECPSQ_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] // float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) { return vrecpsq_f16(a, b); @@ -1301,11 +1547,17 @@ float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vrsqrts_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[VRSQRTS_V_I]], <4 x half> [[VRSQRTS_V1_I]]) // CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <4 x half> [[VRSQRTS_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <4 x half> [[VRSQRTS_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] // float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) { return vrsqrts_f16(a, b); @@ -1314,11 +1566,17 @@ float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define 
{{[^@]+}}@test_vrsqrtsq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[VRSQRTSQ_V_I]], <8 x half> [[VRSQRTSQ_V1_I]]) // CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <8 x half> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <8 x half> [[VRSQRTSQ_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] // float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) { return vrsqrtsq_f16(a, b); @@ -1347,11 +1605,17 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vfma_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]]) -// CHECK-NEXT: ret <4 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_f16(a, b, c); @@ -1360,11 +1624,17 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]]) 
-// CHECK-NEXT: ret <8 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_f16(a, b, c); @@ -1374,11 +1644,17 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]]) -// CHECK-NEXT: ret <4 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfms_f16(a, b, c); @@ -1388,11 +1664,17 @@ float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) { // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]]) -// CHECK-NEXT: ret <8 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_f16(a, b, c); @@ -1401,13 +1683,16 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <4 x i32> +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]]) // CHECK-NEXT: ret <4 x half> [[FMLA2]] // @@ -1418,13 +1703,16 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <8 x i32> +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> // CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]]) // CHECK-NEXT: ret <8 x half> [[FMLA2]] // @@ -1435,15 +1723,18 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> -// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]]) -// CHECK-NEXT: ret <4 x half> [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <4 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { return vfma_laneq_f16(a, b, c, 7); @@ -1452,15 +1743,18 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> -// CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) -// CHECK-NEXT: ret <8 x half> [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]]) +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_laneq_f16(a, b, c, 7); @@ -1473,11 +1767,17 @@ float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { // CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1 // CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2 // CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) -// CHECK-NEXT: ret <4 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfma_n_f16(a, b, c); @@ -1494,11 +1794,17 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) { // CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5 // CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6 // CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) -// CHECK-NEXT: ret <8 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// 
CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmaq_n_f16(a, b, c); @@ -1529,14 +1835,17 @@ float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfms_lane_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <4 x i32> +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> // CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]]) // CHECK-NEXT: ret <4 x half> [[FMLA2]] // @@ -1547,14 +1856,17 @@ float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfmsq_lane_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> // CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> -// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP6]], <4 x half> [[TMP6]], <8 x i32> +// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> // CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]]) // CHECK-NEXT: ret <8 x half> [[FMLA2]] // @@ -1565,16 +1877,19 @@ float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfms_laneq_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> -// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]]) -// CHECK-NEXT: ret <4 x half> [[TMP6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <4 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP7]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { return vfms_laneq_f16(a, b, c, 7); @@ -1583,16 +1898,19 @@ float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfmsq_laneq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> // CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x 
half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> -// CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) -// CHECK-NEXT: ret <8 x half> [[TMP6]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP8]], <8 x half> [[TMP8]], <8 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP7]], <8 x half> [[TMP6]]) +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_laneq_f16(a, b, c, 7); @@ -1606,11 +1924,17 @@ float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { // CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1 // CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2 // CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) -// CHECK-NEXT: ret <4 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] // float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfms_n_f16(a, b, c); @@ -1628,11 +1952,17 @@ float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { // CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5 // CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6 // CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) -// 
CHECK-NEXT: ret <8 x half> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) +// CHECK-NEXT: ret <8 x half> [[TMP9]] // float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmsq_n_f16(a, b, c); @@ -1669,9 +1999,10 @@ float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vmul_lane_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <4 x half> [[MUL]] // @@ -1682,9 +2013,10 @@ float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulq_lane_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <8 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <8 x half> [[MUL]] // @@ -1695,9 +2027,10 @@ float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmul_laneq_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <4 x i32> // 
CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <4 x half> [[MUL]] // @@ -1708,9 +2041,10 @@ float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulq_laneq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <8 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <8 x half> [[MUL]] // @@ -1754,16 +2088,12 @@ float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulh_lane_f16 // CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 // CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float -// CHECK-NEXT: store <4 x half> [[B]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP1]] to float -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half +// CHECK-NEXT: [[CONV3:%.*]] = fpext half [[TMP1]] to float +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV3]] // CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half // CHECK-NEXT: ret half [[TMP2]] // @@ -1774,16 +2104,12 @@ float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulh_laneq_f16 // CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 // CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float -// CHECK-NEXT: store <8 x half> [[B]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> // CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP1]] to float -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGETQ_LANE]] to half +// CHECK-NEXT: [[CONV3:%.*]] = fpext half [[TMP1]] to float +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV3]] // CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half // CHECK-NEXT: ret half [[TMP2]] // @@ -1794,12 +2120,17 @@ 
float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulx_lane_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[LANE]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[VMULX_I]], <4 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <4 x half> [[VMULX2_I]] // float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) { @@ -1809,12 +2140,17 @@ float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxq_lane_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[LANE]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[VMULX_I]], <8 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <8 x half> [[VMULX2_I]] // float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) { @@ -1824,12 +2160,17 @@ float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) { // CHECK-LABEL: define 
{{[^@]+}}@test_vmulx_laneq_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[LANE]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[VMULX_I]], <4 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <4 x half> [[VMULX2_I]] // float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) { @@ -1839,12 +2180,17 @@ float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxq_laneq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <8 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[LANE]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[VMULX_I]], <8 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <8 x half> [[VMULX2_I]] // float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) { @@ -1858,9 +2204,13 @@ float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) { // CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1 // 
CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2 // CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[VMULX_I]], <4 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <4 x half> [[VMULX2_I]] // float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) { @@ -1878,9 +2228,13 @@ float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) { // CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5 // CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6 // CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[VMULX_I]], <8 x half> [[VMULX1_I]]) // CHECK-NEXT: ret <8 x half> [[VMULX2_I]] // float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) { @@ -1912,8 +2266,9 @@ float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> // CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[VMAXV]]) // CHECK-NEXT: ret half [[VMAXV1]] // @@ -1924,8 +2279,9 @@ float16_t test_vmaxv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[VMAXV]]) // CHECK-NEXT: ret half [[VMAXV1]] // @@ -1936,8 +2292,9 @@ float16_t test_vmaxvq_f16(float16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMINV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VMINV:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> // CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[VMINV]]) // CHECK-NEXT: ret half [[VMINV1]] // @@ -1948,8 +2305,9 @@ float16_t test_vminv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMINV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VMINV:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[VMINV]]) // CHECK-NEXT: ret half [[VMINV1]] // @@ -1960,8 +2318,9 @@ float16_t test_vminvq_f16(float16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> // CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[VMAXNMV]]) // CHECK-NEXT: ret half [[VMAXNMV1]] // @@ -1972,8 +2331,9 @@ float16_t test_vmaxnmv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[VMAXNMV]]) // CHECK-NEXT: ret half [[VMAXNMV1]] // @@ -1984,8 +2344,9 @@ float16_t test_vmaxnmvq_f16(float16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminnmv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <8 x i8> [[TMP1]] 
to <4 x half> // CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[VMINNMV]]) // CHECK-NEXT: ret half [[VMINNMV1]] // @@ -1996,8 +2357,9 @@ float16_t test_vminnmv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> // CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[VMINNMV]]) // CHECK-NEXT: ret half [[VMINNMV1]] // diff --git a/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c b/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c index c44dd333c9754..0138fad1a7792 100644 --- a/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c +++ b/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c @@ -1,120 +1,273 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v8.5a\ // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck %s // REQUIRES: aarch64-registered-target #include -// CHECK-LABEL: test_vrnd32x_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32x.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrnd32x_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRND32X_F32_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VRND32X_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32x.v2f32(<2 x float> [[VRND32X_F32_I]]) +// CHECK-NEXT: [[VRND32X_F322_I:%.*]] = bitcast <2 x float> [[VRND32X_F321_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND32X_F322_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP3]] +// float32x2_t test_vrnd32x_f32(float32x2_t a) { return vrnd32x_f32(a); } -// CHECK-LABEL: test_vrnd32xq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32x.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd32xq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND32XQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VRND32XQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32x.v4f32(<4 x float> [[VRND32XQ_F32_I]]) +// CHECK-NEXT: [[VRND32XQ_F322_I:%.*]] = bitcast <4 x float> [[VRND32XQ_F321_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND32XQ_F322_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x 
float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// float32x4_t test_vrnd32xq_f32(float32x4_t a) { return vrnd32xq_f32(a); } -// CHECK-LABEL: test_vrnd32z_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32z.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrnd32z_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRND32Z_F32_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VRND32Z_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32z.v2f32(<2 x float> [[VRND32Z_F32_I]]) +// CHECK-NEXT: [[VRND32Z_F322_I:%.*]] = bitcast <2 x float> [[VRND32Z_F321_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND32Z_F322_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP3]] +// float32x2_t test_vrnd32z_f32(float32x2_t a) { return vrnd32z_f32(a); } -// CHECK-LABEL: test_vrnd32zq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32z.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd32zq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND32ZQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VRND32ZQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32z.v4f32(<4 x float> [[VRND32ZQ_F32_I]]) +// CHECK-NEXT: [[VRND32ZQ_F322_I:%.*]] = bitcast <4 x float> [[VRND32ZQ_F321_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND32ZQ_F322_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// float32x4_t test_vrnd32zq_f32(float32x4_t a) { return vrnd32zq_f32(a); } -// CHECK-LABEL: test_vrnd64x_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64x.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrnd64x_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRND64X_F32_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VRND64X_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64x.v2f32(<2 x float> [[VRND64X_F32_I]]) +// CHECK-NEXT: [[VRND64X_F322_I:%.*]] = bitcast <2 x float> [[VRND64X_F321_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND64X_F322_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP3]] +// float32x2_t test_vrnd64x_f32(float32x2_t a) { return vrnd64x_f32(a); } -// CHECK-LABEL: test_vrnd64xq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64x.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd64xq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND64XQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VRND64XQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64x.v4f32(<4 x float> [[VRND64XQ_F32_I]]) +// CHECK-NEXT: [[VRND64XQ_F322_I:%.*]] = bitcast <4 x float> [[VRND64XQ_F321_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND64XQ_F322_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// float32x4_t test_vrnd64xq_f32(float32x4_t a) { return vrnd64xq_f32(a); } -// CHECK-LABEL: test_vrnd64z_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64z.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrnd64z_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRND64Z_F32_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VRND64Z_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64z.v2f32(<2 x float> [[VRND64Z_F32_I]]) +// CHECK-NEXT: [[VRND64Z_F322_I:%.*]] = bitcast <2 x float> [[VRND64Z_F321_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND64Z_F322_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP3]] +// float32x2_t test_vrnd64z_f32(float32x2_t a) { return vrnd64z_f32(a); } -// CHECK-LABEL: test_vrnd64zq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64z.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd64zq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND64ZQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VRND64ZQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64z.v4f32(<4 x float> [[VRND64ZQ_F32_I]]) +// CHECK-NEXT: [[VRND64ZQ_F322_I:%.*]] = bitcast <4 x float> [[VRND64ZQ_F321_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND64ZQ_F322_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// float32x4_t test_vrnd64zq_f32(float32x4_t a) { return vrnd64zq_f32(a); } -// CHECK-LABEL: test_vrnd32x_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32x.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd32x_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VRND32X_F64_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[VRND32X_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32x.v1f64(<1 x double> [[VRND32X_F64_I]]) +// 
CHECK-NEXT: [[VRND32X_F642_I:%.*]] = bitcast <1 x double> [[VRND32X_F641_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND32X_F642_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP3]] +// float64x1_t test_vrnd32x_f64(float64x1_t a) { return vrnd32x_f64(a); } -// CHECK-LABEL: test_vrnd32xq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32x.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd32xq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND32XQ_F64_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRND32XQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32x.v2f64(<2 x double> [[VRND32XQ_F64_I]]) +// CHECK-NEXT: [[VRND32XQ_F642_I:%.*]] = bitcast <2 x double> [[VRND32XQ_F641_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND32XQ_F642_I]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP3]] +// float64x2_t test_vrnd32xq_f64(float64x2_t a) { return vrnd32xq_f64(a); } -// CHECK-LABEL: test_vrnd32z_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32z.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd32z_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VRND32Z_F64_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[VRND32Z_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32z.v1f64(<1 x double> [[VRND32Z_F64_I]]) +// CHECK-NEXT: [[VRND32Z_F642_I:%.*]] = bitcast <1 x double> [[VRND32Z_F641_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND32Z_F642_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP3]] +// float64x1_t test_vrnd32z_f64(float64x1_t a) { return vrnd32z_f64(a); } -// CHECK-LABEL: test_vrnd32zq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32z.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd32zq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND32ZQ_F64_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRND32ZQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32z.v2f64(<2 x double> [[VRND32ZQ_F64_I]]) +// CHECK-NEXT: [[VRND32ZQ_F642_I:%.*]] = bitcast <2 x 
double> [[VRND32ZQ_F641_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND32ZQ_F642_I]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP3]] +// float64x2_t test_vrnd32zq_f64(float64x2_t a) { return vrnd32zq_f64(a); } -// CHECK-LABEL: test_vrnd64x_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64x.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd64x_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VRND64X_F64_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[VRND64X_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64x.v1f64(<1 x double> [[VRND64X_F64_I]]) +// CHECK-NEXT: [[VRND64X_F642_I:%.*]] = bitcast <1 x double> [[VRND64X_F641_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND64X_F642_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP3]] +// float64x1_t test_vrnd64x_f64(float64x1_t a) { return vrnd64x_f64(a); } -// CHECK-LABEL: test_vrnd64xq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64x.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd64xq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND64XQ_F64_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRND64XQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64x.v2f64(<2 x double> [[VRND64XQ_F64_I]]) +// CHECK-NEXT: [[VRND64XQ_F642_I:%.*]] = bitcast <2 x double> [[VRND64XQ_F641_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND64XQ_F642_I]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP3]] +// float64x2_t test_vrnd64xq_f64(float64x2_t a) { return vrnd64xq_f64(a); } -// CHECK-LABEL: test_vrnd64z_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64z.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd64z_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to i64 +// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[VRND64Z_F64_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[VRND64Z_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64z.v1f64(<1 x double> [[VRND64Z_F64_I]]) +// CHECK-NEXT: [[VRND64Z_F642_I:%.*]] = bitcast <1 x double> [[VRND64Z_F641_I]] to <8 x i8> +// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND64Z_F642_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP3]] +// float64x1_t test_vrnd64z_f64(float64x1_t a) { return vrnd64z_f64(a); } -// CHECK-LABEL: test_vrnd64zq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64z.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd64zq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRND64ZQ_F64_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> +// CHECK-NEXT: [[VRND64ZQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64z.v2f64(<2 x double> [[VRND64ZQ_F64_I]]) +// CHECK-NEXT: [[VRND64ZQ_F642_I:%.*]] = bitcast <2 x double> [[VRND64ZQ_F641_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRND64ZQ_F642_I]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP3]] +// float64x2_t test_vrnd64zq_f64(float64x2_t a) { return vrnd64zq_f64(a); } diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c index 7bfeb7939edb2..6fffcb6c6b391 100644 --- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,sroa \ @@ -7,141 +8,198 @@ #include -// CHECK-LABEL: test_vmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> [[VMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]] +// int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) { return vmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vmmlaq_u32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_u32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> 
[[VMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]] +// uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) { return vmmlaq_u32(r, a, b); } -// CHECK-LABEL: test_vusmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VUSMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> [[VUSMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VUSMMLA1_I]] +// int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_s32 -// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) -// CHECK: ret <2 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], <8 x i8> [[TMP3]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudot_lane_s32 -// CHECK: 
[[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %0 to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> %1 to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[TMP3]], <8 x i8> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { return vsudot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vusdot_laneq_s32 -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_laneq_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], <8 x i8> [[TMP3]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) { return vusdot_laneq_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudot_laneq_s32 -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x 
i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_laneq_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[TMP3]], <8 x i8> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) { return vsudot_laneq_s32(r, a, b, 0); } -// CHECK-LABEL: test_vusdotq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusdotq_s32(r, a, b); } -// CHECK-LABEL: test_vusdotq_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 
x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[A]], <16 x i8> [[TMP3]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { return vusdotq_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudotq_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[TMP3]], <16 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { return vsudotq_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vusdotq_laneq_s32 -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_laneq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[A]], <16 x i8> [[TMP3]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// 
int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusdotq_laneq_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudotq_laneq_s32 -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_laneq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[TMP3]], <16 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) { return vsudotq_laneq_s32(r, a, b, 0); } diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c index c4f0b78fc6a57..874b1c4f36867 100644 --- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c @@ -2,11 +2,11 @@ // RUN: %clang_cc1 -triple armv8-arm-none-eabi \ // RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi soft \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg | FileCheck %s +// RUN: | opt -S -passes=mem2reg,sroa | FileCheck %s // RUN: %clang_cc1 -triple armv8-arm-none-eabi \ // RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi hard \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg | FileCheck %s +// RUN: | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -14,10 +14,16 @@ // CHECK-LABEL: @test_vbfdot_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] 
to <8 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[VBFDOT_I]], <4 x bfloat> [[VBFDOT1_I]], <4 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { @@ -26,10 +32,16 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { // CHECK-LABEL: @test_vbfdotq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[VBFDOT_I]], <8 x bfloat> [[VBFDOT1_I]], <8 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ @@ -38,19 +50,24 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ // CHECK-LABEL: @test_vbfdot_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_128:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-NEXT: [[__REINT1_128:%.*]] = alloca <2 x float>, align 8 -// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_128]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_128]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer -// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_128]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_128]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// 
CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[R:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[VBFDOT_I]], <4 x bfloat> [[VBFDOT1_I]], <4 x bfloat> [[VBFDOT2_I]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -59,19 +76,24 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-LABEL: @test_vbfdotq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_130:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_130:%.*]] = alloca <4 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_130]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_130]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_130]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_130]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP4]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x bfloat>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> 
[[TMP9]] to <16 x i8>
+// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[VBFDOT_I]], <8 x bfloat> [[VBFDOT1_I]], <8 x bfloat> [[VBFDOT2_I]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -80,19 +102,24 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-LABEL: @test_vbfdot_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_132:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_132:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_132]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_132]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP4]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[LANE]] to <2 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[R:%.*]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[VBFDOT_I]], <4 x bfloat> [[VBFDOT1_I]], <4 x bfloat> [[VBFDOT2_I]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -101,19 +128,24 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-LABEL: @test_vbfdotq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: 
[[__REINT_126:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-NEXT: [[__REINT1_126:%.*]] = alloca <4 x float>, align 8 -// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_126]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_126]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer -// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_126]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_126]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x float> [[LANE]] to <4 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> +// CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> +// CHECK-NEXT: [[VBFDOT_I:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float> +// CHECK-NEXT: [[VBFDOT1_I:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT2_I:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[VBFDOT_I]], <8 x bfloat> [[VBFDOT1_I]], <8 x bfloat> [[VBFDOT2_I]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { @@ -122,12 +154,20 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) // CHECK-LABEL: @test_vbfmmlaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMMLAQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMMLAQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMMLAQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[VBFMMLAQ_F32_I]], <8 x bfloat> [[VBFMMLAQ_F321_I]], <8 x bfloat> [[VBFMMLAQ_F322_I]]) // CHECK-NEXT: [[VBFMMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMMLAQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmmlaq_f32(r, a, b); @@ -135,12 +175,20 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALBQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[VBFMLALBQ_F32_I]], <8 x bfloat> [[VBFMLALBQ_F321_I]], <8 x bfloat> [[VBFMLALBQ_F322_I]]) // CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALBQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_f32(r, a, b); @@ -148,12 +196,20 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlaltq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> 
[[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALTQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALTQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[VBFMLALTQ_F32_I]], <8 x bfloat> [[VBFMLALTQ_F321_I]], <8 x bfloat> [[VBFMLALTQ_F322_I]]) // CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALTQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlaltq_f32(r, a, b); @@ -163,26 +219,34 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGET_LANE10]], i32 2 +// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGET_LANE16]], i32 3 +// CHECK-NEXT: [[VGET_LANE22:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> [[VECINIT18]], bfloat [[VGET_LANE22]], i32 4 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> 
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGET_LANE28]], i32 5 +// CHECK-NEXT: [[VGET_LANE34:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT36:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE34]], i32 6 +// CHECK-NEXT: [[VGET_LANE40:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGET_LANE40]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALBQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[VBFMLALBQ_F32_I]], <8 x bfloat> [[VBFMLALBQ_F321_I]], <8 x bfloat> [[VBFMLALBQ_F322_I]]) // CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALBQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlalbq_lane_f32(r, a, b, 0); @@ -192,26 +256,34 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE4]], i32 1 +// 
CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGET_LANE10]], i32 2 +// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGET_LANE16]], i32 3 +// CHECK-NEXT: [[VGET_LANE22:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> [[VECINIT18]], bfloat [[VGET_LANE22]], i32 4 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGET_LANE28]], i32 5 +// CHECK-NEXT: [[VGET_LANE34:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT36:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE34]], i32 6 +// CHECK-NEXT: [[VGET_LANE40:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGET_LANE40]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALBQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[VBFMLALBQ_F32_I]], <8 x bfloat> [[VBFMLALBQ_F321_I]], <8 x bfloat> [[VBFMLALBQ_F322_I]]) // CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALBQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { return vbfmlalbq_laneq_f32(r, a, b, 3); @@ -221,26 +293,34 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: 
[[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGET_LANE10]], i32 2 +// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGET_LANE16]], i32 3 +// CHECK-NEXT: [[VGET_LANE22:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> [[VECINIT18]], bfloat [[VGET_LANE22]], i32 4 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGET_LANE28]], i32 5 +// CHECK-NEXT: [[VGET_LANE34:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT36:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE34]], i32 6 +// CHECK-NEXT: [[VGET_LANE40:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 +// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGET_LANE40]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALTQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VBFMLALTQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call 
<4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[VBFMLALTQ_F32_I]], <8 x bfloat> [[VBFMLALTQ_F321_I]], <8 x bfloat> [[VBFMLALTQ_F322_I]]) // CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALTQ_F324_I]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { return vbfmlaltq_lane_f32(r, a, b, 0); @@ -250,26 +330,34 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-NEXT: entry: // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 +// CHECK-NEXT: [[VGET_LANE4:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE4]], i32 1 +// CHECK-NEXT: [[VGET_LANE10:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT12:%.*]] = insertelement <8 x bfloat> [[VECINIT6]], bfloat [[VGET_LANE10]], i32 2 +// CHECK-NEXT: [[VGET_LANE16:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT18:%.*]] = insertelement <8 x bfloat> [[VECINIT12]], bfloat [[VGET_LANE16]], i32 3 +// CHECK-NEXT: [[VGET_LANE22:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT24:%.*]] = insertelement <8 x bfloat> [[VECINIT18]], bfloat [[VGET_LANE22]], i32 4 // CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) +// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT24]], bfloat [[VGET_LANE28]], i32 5 +// CHECK-NEXT: [[VGET_LANE34:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 +// CHECK-NEXT: [[VECINIT36:%.*]] = 
insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE34]], i32 6
+// CHECK-NEXT: [[VGET_LANE40:%.*]] = extractelement <8 x bfloat> [[B]], i32 3
+// CHECK-NEXT: [[VECINIT42:%.*]] = insertelement <8 x bfloat> [[VECINIT36]], bfloat [[VGET_LANE40]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT42]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[VBFMLALTQ_F32_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VBFMLALTQ_F321_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFMLALTQ_F322_I:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[VBFMLALTQ_F32_I]], <8 x bfloat> [[VBFMLALTQ_F321_I]], <8 x bfloat> [[VBFMLALTQ_F322_I]])
// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8>
-// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]]
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[VBFMLALTQ_F324_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP7]]
//
float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
return vbfmlaltq_laneq_f32(r, a, b, 3);
diff --git a/clang/test/CodeGen/arm-bf16-getset-intrinsics.c b/clang/test/CodeGen/arm-bf16-getset-intrinsics.c
index b87d0e8eb68bb..97d51839e2eb6 100644
--- a/clang/test/CodeGen/arm-bf16-getset-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-getset-intrinsics.c
@@ -1,8 +1,8 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
-// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,sroa | FileCheck %s
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi soft \
-// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,sroa | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -47,9 +47,10 @@ bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) {
// CHECK-LABEL: @test_vdup_lane_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x bfloat>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP2]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
@@ -58,9 +59,10 @@ bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vdupq_lane_bf16(
// CHECK-NEXT: entry:
-// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x bfloat>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP2]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
@@ -69,9 +71,10 @@ bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) {
// CHECK-LABEL: @test_vdup_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x bfloat>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <4 x bfloat> [[LANE]]
//
bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
@@ -80,9 +83,10 @@ bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) {
// CHECK-LABEL: @test_vdupq_laneq_bf16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x bfloat>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP2]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: ret <8 x bfloat> [[LANE]]
//
bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) {
diff --git a/clang/test/CodeGen/arm-neon-directed-rounding.c b/clang/test/CodeGen/arm-neon-directed-rounding.c
index 63ec016b49a0c..be587ea8e697a 100644
--- a/clang/test/CodeGen/arm-neon-directed-rounding.c
+++ b/clang/test/CodeGen/arm-neon-directed-rounding.c
@@ -1,130 +1,353 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
+// RUN: opt -S -passes=mem2reg,sroa | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A64 %s
+// RUN: opt -S -passes=mem2reg,sroa | FileCheck -check-prefixes=CHECK,CHECK-A64 %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrnda_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDA_V1_I:%.*]] = call <2 x float> 
@llvm.round.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRNDA_V1_I]] +// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrnda_f32( +// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[VRNDA_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A32-NEXT: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[VRNDA_V_I]]) +// CHECK-A32-NEXT: [[VRNDA_V2_I:%.*]] = bitcast <2 x float> [[VRNDA_V1_I]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDA_V2_I]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-A32-NEXT: ret <2 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrnda_f32( +// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A64-NEXT: [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A64-NEXT: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[VRNDA_I]]) +// CHECK-A64-NEXT: ret <2 x float> [[VRNDA1_I]] +// float32x2_t test_vrnda_f32(float32x2_t a) { return vrnda_f32(a); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndaq_f32(<4 x float> noundef %a) -// CHECK-A32: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a) -// CHECK-A64: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRNDAQ_V1_I]] +// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndaq_f32( +// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-NEXT: [[VRNDAQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-NEXT: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[VRNDAQ_V_I]]) +// CHECK-A32-NEXT: [[VRNDAQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDAQ_V1_I]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDAQ_V2_I]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-A32-NEXT: ret <4 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndaq_f32( +// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[VRNDA_I]]) +// CHECK-A64-NEXT: ret <4 x float> [[VRNDA1_I]] +// float32x4_t test_vrndaq_f32(float32x4_t a) { return vrndaq_f32(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndm_f32(<2 x float> noundef %a) -// CHECK-A32: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a) -// CHECK-A64: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRNDM_V1_I]] +// 
CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndm_f32( +// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[VRNDM_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A32-NEXT: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[VRNDM_V_I]]) +// CHECK-A32-NEXT: [[VRNDM_V2_I:%.*]] = bitcast <2 x float> [[VRNDM_V1_I]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDM_V2_I]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-A32-NEXT: ret <2 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndm_f32( +// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A64-NEXT: [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A64-NEXT: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[VRNDM_I]]) +// CHECK-A64-NEXT: ret <2 x float> [[VRNDM1_I]] +// float32x2_t test_vrndm_f32(float32x2_t a) { return vrndm_f32(a); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndmq_f32(<4 x float> noundef %a) -// CHECK-A32: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a) -// CHECK-A64: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRNDMQ_V1_I]] +// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndmq_f32( +// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-NEXT: [[VRNDMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-NEXT: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[VRNDMQ_V_I]]) +// CHECK-A32-NEXT: [[VRNDMQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDMQ_V1_I]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDMQ_V2_I]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-A32-NEXT: ret <4 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndmq_f32( +// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[VRNDM_I]]) +// CHECK-A64-NEXT: ret <4 x float> [[VRNDM1_I]] +// float32x4_t test_vrndmq_f32(float32x4_t a) { return vrndmq_f32(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndn_f32(<2 x float> noundef %a) -// CHECK-A32: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a) -// CHECK-A64: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRNDN_V1_I]] +// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndn_f32( +// CHECK-A32-SAME: <2 x 
float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[VRNDN_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A32-NEXT: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[VRNDN_V_I]]) +// CHECK-A32-NEXT: [[VRNDN_V2_I:%.*]] = bitcast <2 x float> [[VRNDN_V1_I]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDN_V2_I]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-A32-NEXT: ret <2 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndn_f32( +// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A64-NEXT: [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A64-NEXT: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.roundeven.v2f32(<2 x float> [[VRNDN_I]]) +// CHECK-A64-NEXT: ret <2 x float> [[VRNDN1_I]] +// float32x2_t test_vrndn_f32(float32x2_t a) { return vrndn_f32(a); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndnq_f32(<4 x float> noundef %a) -// CHECK-A32: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a) -// CHECK-A64: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRNDNQ_V1_I]] +// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndnq_f32( +// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-NEXT: [[VRNDNQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-NEXT: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[VRNDNQ_V_I]]) +// CHECK-A32-NEXT: [[VRNDNQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDNQ_V1_I]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDNQ_V2_I]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-A32-NEXT: ret <4 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndnq_f32( +// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[VRNDN_I]]) +// CHECK-A64-NEXT: ret <4 x float> [[VRNDN1_I]] +// float32x4_t test_vrndnq_f32(float32x4_t a) { return vrndnq_f32(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndp_f32(<2 x float> noundef %a) -// CHECK-A32: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a) -// CHECK-A64: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRNDP_V1_I]] +// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndp_f32( +// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// 
CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[VRNDP_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A32-NEXT: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[VRNDP_V_I]]) +// CHECK-A32-NEXT: [[VRNDP_V2_I:%.*]] = bitcast <2 x float> [[VRNDP_V1_I]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDP_V2_I]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-A32-NEXT: ret <2 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndp_f32( +// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A64-NEXT: [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A64-NEXT: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[VRNDP_I]]) +// CHECK-A64-NEXT: ret <2 x float> [[VRNDP1_I]] +// float32x2_t test_vrndp_f32(float32x2_t a) { return vrndp_f32(a); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndpq_f32(<4 x float> noundef %a) -// CHECK-A32: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a) -// CHECK-A64: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRNDPQ_V1_I]] +// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndpq_f32( +// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-NEXT: [[VRNDPQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-NEXT: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[VRNDPQ_V_I]]) +// CHECK-A32-NEXT: [[VRNDPQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDPQ_V1_I]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDPQ_V2_I]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-A32-NEXT: ret <4 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndpq_f32( +// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VRNDP_I]]) +// CHECK-A64-NEXT: ret <4 x float> [[VRNDP1_I]] +// float32x4_t test_vrndpq_f32(float32x4_t a) { return vrndpq_f32(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndx_f32(<2 x float> noundef %a) -// CHECK-A32: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a) -// CHECK-A64: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRNDX_V1_I]] +// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndx_f32( +// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A32-NEXT: 
[[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[VRNDX_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A32-NEXT: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[VRNDX_V_I]]) +// CHECK-A32-NEXT: [[VRNDX_V2_I:%.*]] = bitcast <2 x float> [[VRNDX_V1_I]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDX_V2_I]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-A32-NEXT: ret <2 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndx_f32( +// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A64-NEXT: [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A64-NEXT: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[VRNDX_I]]) +// CHECK-A64-NEXT: ret <2 x float> [[VRNDX1_I]] +// float32x2_t test_vrndx_f32(float32x2_t a) { return vrndx_f32(a); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndxq_f32(<4 x float> noundef %a) -// CHECK-A32: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a) -// CHECK-A64: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRNDXQ_V1_I]] +// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndxq_f32( +// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-NEXT: [[VRNDXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-NEXT: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[VRNDXQ_V_I]]) +// CHECK-A32-NEXT: [[VRNDXQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDXQ_V1_I]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDXQ_V2_I]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-A32-NEXT: ret <4 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndxq_f32( +// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[VRNDX_I]]) +// CHECK-A64-NEXT: ret <4 x float> [[VRNDX1_I]] +// float32x4_t test_vrndxq_f32(float32x4_t a) { return vrndxq_f32(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vrnd_f32(<2 x float> noundef %a) -// CHECK-A32: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a) -// CHECK-A64: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRND_V1_I]] +// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrnd_f32( +// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[VRND_V_I:%.*]] = bitcast 
<8 x i8> [[TMP1]] to <2 x float> +// CHECK-A32-NEXT: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[VRND_V_I]]) +// CHECK-A32-NEXT: [[VRND_V2_I:%.*]] = bitcast <2 x float> [[VRND_V1_I]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND_V2_I]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <2 x float> +// CHECK-A32-NEXT: ret <2 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrnd_f32( +// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A64-NEXT: [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-A64-NEXT: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[VRNDZ_I]]) +// CHECK-A64-NEXT: ret <2 x float> [[VRNDZ1_I]] +// float32x2_t test_vrnd_f32(float32x2_t a) { return vrnd_f32(a); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndq_f32(<4 x float> noundef %a) -// CHECK-A32: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a) -// CHECK-A64: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRNDQ_V1_I]] +// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndq_f32( +// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-NEXT: [[VRNDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-NEXT: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[VRNDQ_V_I]]) +// CHECK-A32-NEXT: [[VRNDQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDQ_V1_I]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDQ_V2_I]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-A32-NEXT: ret <4 x float> [[TMP3]] +// +// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndq_f32( +// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VRNDZ_I]]) +// CHECK-A64-NEXT: ret <4 x float> [[VRNDZ1_I]] +// float32x4_t test_vrndq_f32(float32x4_t a) { return vrndq_f32(a); } -// CHECK-LABEL: define{{.*}} float @test_vrndns_f32(float noundef %a) -// CHECK-A32: [[VRNDN_I:%.*]] = call float @llvm.arm.neon.vrintn.f32(float %a) -// CHECK-A64: [[VRNDN_I:%.*]] = call float @llvm.roundeven.f32(float %a) -// CHECK: ret float [[VRNDN_I]] +// CHECK-A32-LABEL: define dso_local float @test_vrndns_f32( +// CHECK-A32-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[VRNDN_I:%.*]] = call float @llvm.arm.neon.vrintn.f32(float [[A]]) +// CHECK-A32-NEXT: ret float [[VRNDN_I]] +// +// CHECK-A64-LABEL: define dso_local float @test_vrndns_f32( +// CHECK-A64-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[VRNDN_I:%.*]] = call float @llvm.roundeven.f32(float [[A]]) +// 
CHECK-A64-NEXT: ret float [[VRNDN_I]] +// float32_t test_vrndns_f32(float32_t a) { return vrndns_f32(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndi_f32(<2 x float> noundef %a) -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRNDI1_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrndi_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRNDI_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[VRNDI_V_I]]) +// CHECK-NEXT: ret <2 x float> [[VRNDI_V1_I]] +// float32x2_t test_vrndi_f32(float32x2_t a) { return vrndi_f32(a); } -// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndiq_f32(<4 x float> noundef %a) -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRNDI1_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrndiq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDIQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VRNDIQ_V_I]]) +// CHECK-NEXT: ret <4 x float> [[VRNDIQ_V1_I]] +// float32x4_t test_vrndiq_f32(float32x4_t a) { return vrndiq_f32(a); } diff --git a/clang/test/CodeGen/arm-neon-fma.c b/clang/test/CodeGen/arm-neon-fma.c index 682eda9750c81..663347010eae5 100644 --- a/clang/test/CodeGen/arm-neon-fma.c +++ b/clang/test/CodeGen/arm-neon-fma.c @@ -4,7 +4,7 @@ // RUN: -target-cpu cortex-a7 \ // RUN: -mfloat-abi hard \ // RUN: -ffreestanding \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -13,11 +13,17 @@ // CHECK-LABEL: define {{[^@]+}}@test_fma_order // CHECK-SAME: (<2 x float> noundef [[ACCUM:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACCUM]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]]) -// CHECK-NEXT: ret <2 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACCUM]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> 
[[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]]) +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) { return vfma_f32(accum, lhs, rhs); @@ -26,11 +32,17 @@ float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) // CHECK-LABEL: define {{[^@]+}}@test_fmaq_order // CHECK-SAME: (<4 x float> noundef [[ACCUM:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACCUM]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]]) -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACCUM]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) +// CHECK-NEXT: ret <4 x float> [[TMP9]] // float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) { return vfmaq_f32(accum, lhs, rhs); @@ -41,11 +53,17 @@ float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) // CHECK-NEXT: entry: // CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N]], i32 0 // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) -// CHECK-NEXT: ret <2 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]]) +// CHECK-NEXT: ret <2 x float> [[TMP9]] // float32x2_t 
test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { return vfma_n_f32(a, b, n); @@ -58,11 +76,17 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) { // CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1 // CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2 // CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) -// CHECK-NEXT: ret <4 x float> [[TMP3]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]]) +// CHECK-NEXT: ret <4 x float> [[TMP9]] // float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) { return vfmaq_n_f32(a, b, n); diff --git a/clang/test/CodeGen/arm-neon-numeric-maxmin.c b/clang/test/CodeGen/arm-neon-numeric-maxmin.c index d2d4fee10f079..0b76f3022466f 100644 --- a/clang/test/CodeGen/arm-neon-numeric-maxmin.c +++ b/clang/test/CodeGen/arm-neon-numeric-maxmin.c @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -8,24 +8,36 @@ // CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8> -// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMAXNM_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VMAXNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> 
[[VMAXNM_V_I]], <2 x float> [[VMAXNM_V1_I]]) // CHECK-NEXT: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <2 x float> [[VMAXNM_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMAXNM_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] // float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { return vmaxnm_f32(a, b); } // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8> -// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMAXNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VMAXNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[VMAXNMQ_V_I]], <4 x float> [[VMAXNMQ_V1_I]]) // CHECK-NEXT: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VMAXNMQ_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMAXNMQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] // float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { return vmaxnmq_f32(a, b); @@ -34,24 +46,36 @@ float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vminnm_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8> -// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMINNM_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VMINNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[VMINNM_V_I]], <2 x float> [[VMINNM_V1_I]]) // CHECK-NEXT: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8> -// CHECK-NEXT: ret <2 x float> [[VMINNM_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMINNM_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] // float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { return vminnm_f32(a, b); } // CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f32 -// CHECK-SAME: (<4 x 
float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8> -// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMINNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VMINNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[VMINNMQ_V_I]], <4 x float> [[VMINNMQ_V1_I]]) // CHECK-NEXT: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8> -// CHECK-NEXT: ret <4 x float> [[VMINNMQ_V2_I]] +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMINNMQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] // float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { return vminnmq_f32(a, b); diff --git a/clang/test/CodeGen/arm-neon-vcvtX.c b/clang/test/CodeGen/arm-neon-vcvtX.c index c087b92102c5b..17dd97467308b 100644 --- a/clang/test/CodeGen/arm-neon-vcvtX.c +++ b/clang/test/CodeGen/arm-neon-vcvtX.c @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature -// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -8,8 +8,10 @@ // CHECK-LABEL: define {{[^@]+}}@test_vcvta_s32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTA_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[VCVTA_S32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTA_S32_V1_I]] // int32x2_t test_vcvta_s32_f32(float32x2_t a) { @@ -19,8 +21,10 @@ int32x2_t test_vcvta_s32_f32(float32x2_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvta_u32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTA_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] 
to <2 x float> +// CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[VCVTA_U32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTA_U32_V1_I]] // uint32x2_t test_vcvta_u32_f32(float32x2_t a) { @@ -28,10 +32,12 @@ uint32x2_t test_vcvta_u32_f32(float32x2_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s32_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTAQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[VCVTAQ_S32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTAQ_S32_V1_I]] // int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { @@ -39,10 +45,12 @@ int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_u32_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTAQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[VCVTAQ_U32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTAQ_U32_V1_I]] // uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { @@ -52,8 +60,10 @@ uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTN_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[VCVTN_S32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTN_S32_V1_I]] // int32x2_t test_vcvtn_s32_f32(float32x2_t a) { @@ -63,8 +73,10 @@ int32x2_t test_vcvtn_s32_f32(float32x2_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTN_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> 
[[VCVTN_U32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTN_U32_V1_I]] // uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { @@ -72,10 +84,12 @@ uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s32_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTNQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[VCVTNQ_S32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTNQ_S32_V1_I]] // int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { @@ -83,10 +97,12 @@ int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u32_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTNQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[VCVTNQ_U32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTNQ_U32_V1_I]] // uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { @@ -96,8 +112,10 @@ uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTP_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[VCVTP_S32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTP_S32_V1_I]] // int32x2_t test_vcvtp_s32_f32(float32x2_t a) { @@ -107,8 +125,10 @@ int32x2_t test_vcvtp_s32_f32(float32x2_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTP_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_U32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTP_U32_V1_I]] // uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { @@ -116,10 
+136,12 @@ uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s32_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTPQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[VCVTPQ_S32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTPQ_S32_V1_I]] // int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { @@ -127,10 +149,12 @@ int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u32_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTPQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[VCVTPQ_U32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTPQ_U32_V1_I]] // uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { @@ -140,8 +164,10 @@ uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTM_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[VCVTM_S32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTM_S32_V1_I]] // int32x2_t test_vcvtm_s32_f32(float32x2_t a) { @@ -151,8 +177,10 @@ int32x2_t test_vcvtm_s32_f32(float32x2_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u32_f32 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> -// CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTM_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_U32_V_I]]) // CHECK-NEXT: ret <2 x i32> [[VCVTM_U32_V1_I]] // uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { @@ -160,10 +188,12 @@ uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s32_f32 -// 
CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTMQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[VCVTMQ_S32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTMQ_S32_V1_I]] // int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { @@ -171,10 +201,12 @@ int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { } // CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u32_f32 -// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] { +// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> -// CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTMQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[VCVTMQ_U32_V_I]]) // CHECK-NEXT: ret <4 x i32> [[VCVTMQ_U32_V1_I]] // uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) { diff --git a/clang/test/CodeGen/arm-neon-vst.c b/clang/test/CodeGen/arm-neon-vst.c index 6e135ab07af49..eeff0c00221b2 100644 --- a/clang/test/CodeGen/arm-neon-vst.c +++ b/clang/test/CodeGen/arm-neon-vst.c @@ -1,1990 +1,2738 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | \ -// RUN: FileCheck -check-prefixes=CHECK,CHECK-A64 %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | \ +// RUN: FileCheck -check-prefixes=CHECK-A64 %s // RUN: %clang_cc1 -triple armv8-none-linux-gnueabi -target-feature +neon \ // RUN: -target-feature +fp16 -disable-O0-optnone -emit-llvm -o - %s | \ -// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s +// RUN: opt -S -passes=mem2reg,sroa | FileCheck -check-prefixes=CHECK-A32 %s // REQUIRES: aarch64-registered-target || arm-registered-target #include -// CHECK-LABEL: @test_vst1_f16_x2( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast 
<4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF:(half|i16)]]>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_f16_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v4f16.p0(<4 x half> [[TMP4]], <4 x half> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_f16_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) { vst1_f16_x2(a, b); }
-// CHECK-LABEL: @test_vst1_f16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_f16_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v4f16.p0(<4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_f16_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) { vst1_f16_x3(a, b); }
-// CHECK-LABEL: @test_vst1_f16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x [[HALF]]>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x [[HALF]]>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x [[HALF]]>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x [[HALF]]>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_f16_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-A64-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-A64-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-A64-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_f16_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) { vst1_f16_x4(a, b); }
-// CHECK-LABEL: @test_vst1_f32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2f32(ptr %a, <2 x float> [[TMP7]], <2 x float> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_f32_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v2f32.p0(<2 x float> [[TMP4]], <2 x float> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_f32_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v2f32(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) { vst1_f32_x2(a, b); }
-// CHECK-LABEL: @test_vst1_f32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2f32(ptr %a, <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_f32_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v2f32.p0(<2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_f32_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v2f32(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) { vst1_f32_x3(a, b); }
-// CHECK-LABEL: @test_vst1_f32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2f32(ptr %a, <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_f32_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-A64-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-A64-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-A64-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_f32_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v2f32(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) { vst1_f32_x4(a, b); }
-// CHECK-LABEL: @test_vst1_p16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_p16_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_p16_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) { vst1_p16_x2(a, b); }
-// CHECK-LABEL: @test_vst1_p16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_p16_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_p16_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) { vst1_p16_x3(a, b); }
-// CHECK-LABEL: @test_vst1_p16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_p16_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_p16_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) { vst1_p16_x4(a, b); }
-// CHECK-LABEL: @test_vst1_p8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_p8_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_p8_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) { vst1_p8_x2(a, b); }
-// CHECK-LABEL: @test_vst1_p8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_p8_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_p8_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) { vst1_p8_x3(a, b); }
-// CHECK-LABEL: @test_vst1_p8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_p8_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_p8_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) { vst1_p8_x4(a, b); }
-// CHECK-LABEL: @test_vst1_s16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_s16_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_s16_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) { vst1_s16_x2(a, b); }
-// CHECK-LABEL: @test_vst1_s16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_s16_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_s16_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) { vst1_s16_x3(a, b); }
-// CHECK-LABEL: @test_vst1_s16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_s16_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_s16_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) { vst1_s16_x4(a, b); }
-// CHECK-LABEL: @test_vst1_s32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i32(ptr %a, <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_s32_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_s32_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) { vst1_s32_x2(a, b); }
-// CHECK-LABEL: @test_vst1_s32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i32(ptr %a, <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1_s32_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1_s32_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) { vst1_s32_x3(a, b); }
-// CHECK-LABEL: @test_vst1_s32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca
%struct.int32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i32(ptr %a, <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_s32_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> 
[[TMP2]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_s32_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) { vst1_s32_x4(a, b); } -// CHECK-LABEL: @test_vst1_s64_x2( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a) -// CHECK-A32: call void 
@llvm.arm.neon.vst1x2.p0.v1i64(ptr %a, <1 x i64> [[TMP7]], <1 x i64> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_s64_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_s64_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) { vst1_s64_x2(a, b); } -// CHECK-LABEL: @test_vst1_s64_x3( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: 
[[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v1i64(ptr %a, <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_s64_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_s64_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) { vst1_s64_x3(a, b); } -// CHECK-LABEL: @test_vst1_s64_x4( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void 
@llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v1i64(ptr %a, <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_s64_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// 
CHECK-A32-LABEL: define dso_local void @test_vst1_s64_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) { vst1_s64_x4(a, b); } -// CHECK-LABEL: @test_vst1_s8_x2( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_s8_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> 
[[B_COERCE_FCA_1_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_s8_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) { vst1_s8_x2(a, b); } -// CHECK-LABEL: @test_vst1_s8_x3( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_s8_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_s8_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: 
[[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) { vst1_s8_x3(a, b); } -// CHECK-LABEL: @test_vst1_s8_x4( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_s8_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_s8_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: 
[[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) { vst1_s8_x4(a, b); } -// CHECK-LABEL: @test_vst1_u16_x2( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr %a, <4 x i16> [[TMP7]], <4 x i16> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u16_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v4i16.p0(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u16_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) 
#[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) { vst1_u16_x2(a, b); } -// CHECK-LABEL: @test_vst1_u16_x3( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr %a, <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u16_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> 
[[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v4i16.p0(<4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u16_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) { vst1_u16_x3(a, b); } -// CHECK-LABEL: @test_vst1_u16_x4( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = 
bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr %a, <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u16_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v4i16.p0(<4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u16_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> 
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) { vst1_u16_x4(a, b); } -// CHECK-LABEL: @test_vst1_u32_x2( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i32(ptr %a, <2 x i32> [[TMP7]], <2 x i32> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u32_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v2i32.p0(<2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u32_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 
1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) { vst1_u32_x2(a, b); } -// CHECK-LABEL: @test_vst1_u32_x3( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i32(ptr %a, <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u32_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast 
<8 x i8> [[TMP1]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v2i32.p0(<2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u32_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) { vst1_u32_x3(a, b); } -// CHECK-LABEL: @test_vst1_u32_x4( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: 
[[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i32(ptr %a, <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u32_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v2i32.p0(<2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u32_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK-A32-NEXT: [[TMP11:%.*]] = 
bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) { vst1_u32_x4(a, b); } -// CHECK-LABEL: @test_vst1_u64_x2( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v1i64(ptr %a, <1 x i64> [[TMP7]], <1 x i64> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u64_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u64_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-A32-NEXT: 
[[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) { vst1_u64_x2(a, b); } -// CHECK-LABEL: @test_vst1_u64_x3( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v1i64(ptr %a, <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u64_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void 
@test_vst1_u64_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) { vst1_u64_x3(a, b); } -// CHECK-LABEL: @test_vst1_u64_x4( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] 
to <1 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v1i64(ptr %a, <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u64_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP4]], <1 x i64> [[TMP5]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u64_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK-A32-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK-A32-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) { vst1_u64_x4(a, b); } -// CHECK-LABEL: 
@test_vst1_u8_x2( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [2 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u8_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u8_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) { vst1_u8_x2(a, b); } -// CHECK-LABEL: @test_vst1_u8_x3( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [3 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = 
getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u8_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u8_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) { vst1_u8_x3(a, b); } -// CHECK-LABEL: @test_vst1_u8_x4( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align 8 [[__S1]], ptr align 8 [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr %a, <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1_u8_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v8i8.p0(<8 x i8> [[B_COERCE_FCA_0_EXTRACT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1_u8_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) +// CHECK-A32-NEXT: ret void +// void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) { vst1_u8_x4(a, b); } -// CHECK-LABEL: @test_vst1q_f16_x2( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN:(16|8)]] -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] 
= load <8 x half>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]> -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_f16_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v8f16.p0(<8 x half> [[TMP4]], <8 x half> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_f16_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) { vst1q_f16_x2(a, b); } -// 
CHECK-LABEL: @test_vst1q_f16_x3( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_f16_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v8f16.p0(<8 x half> [[TMP6]], <8 x half> [[TMP7]], <8 x half> [[TMP8]], ptr [[A]]) +// 
CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_f16_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) { vst1q_f16_x3(a, b); } -// CHECK-LABEL: @test_vst1q_f16_x4( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] 
= bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align [[QALIGN]] -// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x [[HALF]]> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x [[HALF]]> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x [[HALF]]> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x [[HALF]]> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_f16_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_0_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_1_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_2_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[B_COERCE_FCA_3_EXTRACT]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-A64-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-A64-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> +// CHECK-A64-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_f16_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 
[[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) { vst1q_f16_x4(a, b); } -// CHECK-LABEL: @test_vst1q_f32_x2( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw 
%struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4f32(ptr %a, <4 x float> [[TMP7]], <4 x float> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_f32_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32> +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v4f32.p0(<4 x float> [[TMP4]], <4 x float> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_f32_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4f32(ptr [[A]], <4 x float> [[TMP2]], <4 x float> [[TMP3]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) { vst1q_f32_x2(a, b); } -// CHECK-LABEL: @test_vst1q_f32_x3( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, 
align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4f32(ptr %a, <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_f32_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32> +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32> +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v4f32.p0(<4 x float> [[TMP6]], <4 x float> [[TMP7]], <4 x float> [[TMP8]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_f32_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] 
[[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v4f32(ptr [[A]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
   vst1q_f32_x3(a, b);
 }

-// CHECK-LABEL: @test_vst1q_f32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4f32(ptr %a, <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_f32_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_0_EXTRACT]] to <4 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_1_EXTRACT]] to <4 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_2_EXTRACT]] to <4 x i32>
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[B_COERCE_FCA_3_EXTRACT]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-A64-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-A64-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-A64-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_f32_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v4f32(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
   vst1q_f32_x4(a, b);
 }

-// CHECK-LABEL: @test_vst1q_p16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_p16_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_p16_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
   vst1q_p16_x2(a, b);
 }

-// CHECK-LABEL: @test_vst1q_p16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_p16_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_p16_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
   vst1q_p16_x3(a, b);
 }

-// CHECK-LABEL: @test_vst1q_p16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_p16_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_p16_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
   vst1q_p16_x4(a, b);
 }

-// CHECK-LABEL: @test_vst1q_p8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_p8_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_p8_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
   vst1q_p8_x2(a, b);
 }

-// CHECK-LABEL: @test_vst1q_p8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_p8_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_p8_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
   vst1q_p8_x3(a, b);
 }

-// CHECK-LABEL: @test_vst1q_p8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_p8_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_p8_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
   vst1q_p8_x4(a, b);
 }

-// CHECK-LABEL: @test_vst1q_s16_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_s16_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_s16_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
   vst1q_s16_x2(a, b);
 }

-// CHECK-LABEL: @test_vst1q_s16_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_s16_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_s16_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
   vst1q_s16_x3(a, b);
 }

-// CHECK-LABEL: @test_vst1q_s16_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_s16_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_s16_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
   vst1q_s16_x4(a, b);
 }

-// CHECK-LABEL: @test_vst1q_s32_x2(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i32(ptr %a, <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_s32_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_s32_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
   vst1q_s32_x2(a, b);
 }

-// CHECK-LABEL: @test_vst1q_s32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i32(ptr %a, <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_s32_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_s32_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
   vst1q_s32_x3(a, b);
 }

-// CHECK-LABEL: @test_vst1q_s32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i32(ptr %a, <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_s32_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_s32_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef,
i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) { vst1q_s32_x4(a, b); } -// CHECK-LABEL: @test_vst1q_s64_x2( -// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK-A64: call void 
@llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i64(ptr %a, <2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_s64_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_s64_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v2i64(ptr [[A]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) { vst1q_s64_x2(a, b); } -// CHECK-LABEL: @test_vst1q_s64_x3( -// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr 
[[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i64(ptr %a, <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_s64_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_s64_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// 
CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v2i64(ptr [[A]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) { vst1q_s64_x3(a, b); } -// CHECK-LABEL: @test_vst1q_s64_x4( -// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align [[QALIGN]] -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP11]], <2 
x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i64(ptr %a, <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_s64_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_s64_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-A32-NEXT: 
[[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v2i64(ptr [[A]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) { vst1q_s64_x4(a, b); } -// CHECK-LABEL: @test_vst1q_s8_x2( -// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_s8_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_s8_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = 
insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) { vst1q_s8_x2(a, b); } -// CHECK-LABEL: @test_vst1q_s8_x3( -// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_s8_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_s8_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// 
CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) { vst1q_s8_x3(a, b); } -// CHECK-LABEL: @test_vst1q_s8_x4( -// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, 
ptr [[ARRAYIDX6]], align [[QALIGN]] -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_s8_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_s8_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// 
CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) { vst1q_s8_x4(a, b); } -// CHECK-LABEL: @test_vst1q_u16_x2( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr %a, <8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_u16_x2( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v8i16.p0(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_u16_x2( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: 
[[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) { vst1q_u16_x2(a, b); } -// CHECK-LABEL: @test_vst1q_u16_x3( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr %a, <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_u16_x3( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[TMP0:%.*]] 
= bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v8i16.p0(<8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_u16_x3( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) { vst1q_u16_x3(a, b); } -// CHECK-LABEL: @test_vst1q_u16_x4( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, 
ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align [[QALIGN]] -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align [[QALIGN]] -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align [[QALIGN]] -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align [[QALIGN]] -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a) -// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr %a, <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]]) -// CHECK: ret void +// CHECK-A64-LABEL: define dso_local void @test_vst1q_u16_x4( +// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-A64-NEXT: [[ENTRY:.*:]] +// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v8i16.p0(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], ptr [[A]]) +// CHECK-A64-NEXT: ret void +// +// CHECK-A32-LABEL: define dso_local void @test_vst1q_u16_x4( +// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) 
#[[ATTR0]] { +// CHECK-A32-NEXT: [[ENTRY:.*:]] +// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]]) +// CHECK-A32-NEXT: ret void +// void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) { vst1q_u16_x4(a, b); } -// CHECK-LABEL: @test_vst1q_u32_x2( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]] -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align [[QALIGN]] -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK-A64: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8 -// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0 
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v4i32(ptr %a, <4 x i32> [[TMP7]], <4 x i32> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u32_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u32_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) { vst1q_u32_x2(a, b); }
-// CHECK-LABEL: @test_vst1q_u32_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v4i32(ptr %a, <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u32_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v4i32.p0(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u32_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) { vst1q_u32_x3(a, b); }
-// CHECK-LABEL: @test_vst1q_u32_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v4i32(ptr %a, <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u32_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v4i32.p0(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u32_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) { vst1q_u32_x4(a, b); }
-// CHECK-LABEL: @test_vst1q_u64_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK-DAG: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK-DAG: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v2i64(ptr %a, <2 x i64> [[TMP7]], <2 x i64> [[TMP8]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u64_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u64_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v2i64(ptr [[A]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) { vst1q_u64_x2(a, b); }
-// CHECK-LABEL: @test_vst1q_u64_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK-DAG: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK-DAG: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v2i64(ptr %a, <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u64_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u64_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v2i64(ptr [[A]], <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) { vst1q_u64_x3(a, b); }
-// CHECK-LABEL: @test_vst1q_u64_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK-DAG: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK-DAG: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK-DAG: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK-DAG: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v2i64(ptr %a, <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u64_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_0_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_1_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_2_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_COERCE_FCA_3_EXTRACT]] to <16 x i8>
+// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-A64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-A64-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u64_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-A32-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-A32-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-A32-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <2 x i64>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v2i64(ptr [[A]], <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP7]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) { vst1q_u64_x4(a, b); }
-// CHECK-LABEL: @test_vst1q_u8_x2(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [4 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK-A64: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u8_x2(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x2.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u8_x2(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) { vst1q_u8_x2(a, b); }
-// CHECK-LABEL: @test_vst1q_u8_x3(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [6 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK-A64: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u8_x3(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x3.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u8_x3(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) { vst1q_u8_x3(a, b); }
-// CHECK-LABEL: @test_vst1q_u8_x4(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align [[QALIGN]]
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK-A64: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK-A32: store [8 x i64] %b.coerce, ptr %coerce.dive, align 8
-// CHECK: call void @llvm.memcpy.p0.p0.{{i64|i32}}(ptr align [[QALIGN]] [[__S1]], ptr align [[QALIGN]] [[B]], {{i64|i32}} 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], {{i64|i32}} 0, {{i64|i32}} 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align [[QALIGN]]
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], {{i64|i32}} 0, {{i64|i32}} 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align [[QALIGN]]
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], {{i64|i32}} 0, {{i64|i32}} 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align [[QALIGN]]
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], {{i64|i32}} 0, {{i64|i32}} 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align [[QALIGN]]
-// CHECK-A64: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK-A32: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr %a, <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]])
-// CHECK: ret void
+// CHECK-A64-LABEL: define dso_local void @test_vst1q_u8_x4(
+// CHECK-A64-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-A64-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-A64-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-A64-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-A64-NEXT: call void @llvm.aarch64.neon.st1x4.v16i8.p0(<16 x i8> [[B_COERCE_FCA_0_EXTRACT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], ptr [[A]])
+// CHECK-A64-NEXT: ret void
+//
+// CHECK-A32-LABEL: define dso_local void @test_vst1q_u8_x4(
+// CHECK-A32-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-A32-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-A32-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-A32-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-A32-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-A32-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-A32-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-A32-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-A32-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-A32-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-A32-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-A32-NEXT: call void @llvm.arm.neon.vst1x4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+// CHECK-A32-NEXT: ret void
+//
 void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) { vst1q_u8_x4(a, b); }
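The hunks above replace hand-maintained CHECK lines for the multi-vector store tests with autogenerated per-target blocks: CHECK-A64 pins the AArch64 lowering to @llvm.aarch64.neon.st1x2/st1x3/st1x4, and CHECK-A32 pins the 32-bit ARM lowering to @llvm.arm.neon.vst1x2/vst1x3/vst1x4. For readers unfamiliar with these intrinsics, the following is a minimal illustrative sketch of what the tests exercise at the C source level; it is not part of the patch, and the function name and the assumption that both pointers cover at least eight uint32_t elements are mine.

#include <arm_neon.h>

// Copy eight uint32 lanes with one paired load and one paired store. On
// AArch64 the store lowers to the @llvm.aarch64.neon.st1x2.v4i32.p0
// intrinsic matched by the CHECK-A64 lines above.
void copy_eight_lanes(uint32_t *dst, const uint32_t *src) {
  uint32x4x2_t v = vld1q_u32_x2(src); // loads two q-registers (8 lanes)
  vst1q_u32_x2(dst, v);               // stores them back in one call
}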
diff --git a/clang/test/CodeGen/arm64-vrnd-constrained.c b/clang/test/CodeGen/arm64-vrnd-constrained.c
index ccf729a6a25ef..8e61f1ea6a3d0 100644
--- a/clang/test/CodeGen/arm64-vrnd-constrained.c
+++ b/clang/test/CodeGen/arm64-vrnd-constrained.c
@@ -1,43 +1,210 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -flax-vector-conversions=none -emit-llvm -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=UNCONSTRAINED %s
+// RUN: | FileCheck --check-prefix=UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -flax-vector-conversions=none -ffp-exception-behavior=strict -emit-llvm -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CONSTRAINED %s
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -flax-vector-conversions=none -emit-llvm -o - %s | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -flax-vector-conversions=none -ffp-exception-behavior=strict -emit-llvm -o - %s | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
+// RUN: | FileCheck --check-prefix=CONSTRAINED %s
 // REQUIRES: aarch64-registered-target
 #include <arm_neon.h>
+// UNCONSTRAINED-LABEL: define <2 x double> @rnd5(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// UNCONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]])
+// UNCONSTRAINED-NEXT: store <2 x double> [[VRNDZ1_I]], ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
+// CONSTRAINED-LABEL: define <2 x double> @rnd5(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double> [[VRNDZ_I]], metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: store <2 x double> [[VRNDZ1_I]], ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd5(float64x2_t a) { return vrndq_f64(a); }
-// COMMON-LABEL: rnd5
-// UNCONSTRAINED: call <2 x double> @llvm.trunc.v2f64(<2 x double>
-// CONSTRAINED: call <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>
-// CHECK-ASM: frintz.2d v{{[0-9]+}}, v{{[0-9]+}}
+// UNCONSTRAINED-LABEL: define <2 x double> @rnd13(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// UNCONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]])
+// UNCONSTRAINED-NEXT: store <2 x double> [[VRNDM1_I]], ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
+// CONSTRAINED-LABEL: define <2 x double> @rnd13(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double> [[VRNDM_I]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: store <2 x double> [[VRNDM1_I]], ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd13(float64x2_t a) { return vrndmq_f64(a); }
-// COMMON-LABEL: rnd13
-// UNCONSTRAINED: call <2 x double> @llvm.floor.v2f64(<2 x double>
-// CONSTRAINED: call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>
-// CHECK-ASM: frintm.2d v{{[0-9]+}}, v{{[0-9]+}}
+// UNCONSTRAINED-LABEL: define <2 x double> @rnd18(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// UNCONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]])
+// UNCONSTRAINED-NEXT: store <2 x double> [[VRNDP1_I]], ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
+// CONSTRAINED-LABEL: define <2 x double> @rnd18(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> [[VRNDP_I]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: store <2 x double> [[VRNDP1_I]], ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd18(float64x2_t a) { return vrndpq_f64(a); }
-// COMMON-LABEL: rnd18
-// UNCONSTRAINED: call <2 x double> @llvm.ceil.v2f64(<2 x double>
-// CONSTRAINED: call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>
-// CHECK-ASM: frintp.2d v{{[0-9]+}}, v{{[0-9]+}}
+// UNCONSTRAINED-LABEL: define <2 x double> @rnd22(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// UNCONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]])
+// UNCONSTRAINED-NEXT: store <2 x double> [[VRNDA1_I]], ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
+// CONSTRAINED-LABEL: define <2 x double> @rnd22(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> [[VRNDA_I]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: store <2 x double> [[VRNDA1_I]], ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd22(float64x2_t a) { return vrndaq_f64(a); }
-// COMMON-LABEL: rnd22
-// UNCONSTRAINED: call <2 x double> @llvm.round.v2f64(<2 x double>
-// CONSTRAINED: call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>
-// CHECK-ASM: frinta.2d v{{[0-9]+}}, v{{[0-9]+}}
+// UNCONSTRAINED-LABEL: define <2 x double> @rnd25(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// UNCONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// UNCONSTRAINED-NEXT: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// UNCONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]])
+// UNCONSTRAINED-NEXT: store <2 x double> [[VRNDX1_I]], ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// UNCONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
+// CONSTRAINED-LABEL: define <2 x double> @rnd25(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CONSTRAINED-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CONSTRAINED-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CONSTRAINED-NEXT: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double> [[VRNDX_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: store <2 x double> [[VRNDX1_I]], ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CONSTRAINED-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd25(float64x2_t a) { return vrndxq_f64(a); }
-// COMMON-LABEL: rnd25
-// UNCONSTRAINED: call <2 x double> @llvm.rint.v2f64(<2 x double>
-// CONSTRAINED: call <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>
-// CHECK-ASM: frintx.2d v{{[0-9]+}}, v{{[0-9]+}}
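The RUN-line changes above drop the shared COMMON prefix and the two llc-based CHECK-ASM invocations, leaving one FileCheck prefix per FP mode: the default invocation matches the plain rounding intrinsics, while -ffp-exception-behavior=strict matches the @llvm.experimental.constrained.* counterparts, whose extra metadata operand (fpexcept.strict) tells the optimizer it must preserve floating-point exception behavior. A hedged sketch of reproducing both modes outside the lit harness follows; the file name is an assumption, and the driver flags only approximate the cc1 lines above.

// round_down.c -- illustrative only, not part of the patch
//   clang --target=arm64-apple-ios7 -S -emit-llvm round_down.c
//   clang --target=arm64-apple-ios7 -ffp-exception-behavior=strict -S -emit-llvm round_down.c
#include <arm_neon.h>

float64x2_t truncate_lanes(float64x2_t v) {
  // Default build: lowers to @llvm.trunc.v2f64 (frintz.2d in the old
  // CHECK-ASM lines). Strict build: lowers to
  // @llvm.experimental.constrained.trunc.v2f64(..., !"fpexcept.strict").
  return vrndq_f64(v);
}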
diff --git a/clang/test/CodeGen/arm64-vrnd.c b/clang/test/CodeGen/arm64-vrnd.c
index 0059dc43abe0b..2ca4abafc4813 100644
--- a/clang/test/CodeGen/arm64-vrnd.c
+++ b/clang/test/CodeGen/arm64-vrnd.c
@@ -1,24 +1,133 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -flax-vector-conversions=none -emit-llvm -o - %s | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
 #include <arm_neon.h>
+// CHECK-LABEL: define <2 x double> @rnd5(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]])
+// CHECK-NEXT: store <2 x double> [[VRNDZ1_I]], ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd5(float64x2_t a) { return vrndq_f64(a); }
-// CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double>
+// CHECK-LABEL: define <2 x double> @rnd9(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> [[VRNDN_I]])
+// CHECK-NEXT: store <2 x double> [[VRNDN1_I]], ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd9(float64x2_t a) { return vrndnq_f64(a); }
-// CHECK: call <2 x double> @llvm.roundeven.v2f64(<2 x double>
+// CHECK-LABEL: define <2 x double> @rnd13(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]])
+// CHECK-NEXT: store <2 x double> [[VRNDM1_I]], ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd13(float64x2_t a) { return vrndmq_f64(a); }
-// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double>
+// CHECK-LABEL: define <2 x double> @rnd18(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]])
+// CHECK-NEXT: store <2 x double> [[VRNDP1_I]], ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd18(float64x2_t a) { return vrndpq_f64(a); }
-// CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double>
+// CHECK-LABEL: define <2 x double> @rnd22(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]])
+// CHECK-NEXT: store <2 x double> [[VRNDA1_I]], ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd22(float64x2_t a) { return vrndaq_f64(a); }
-// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double>
+// CHECK-LABEL: define <2 x double> @rnd25(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[__P0_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[__RET_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: [[REF_TMP_I:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT: store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP0]], ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16
+// CHECK-NEXT: [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]])
+// CHECK-NEXT: store <2 x double> [[VRNDX1_I]], ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[REF_TMP_I]], align 16
+// CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[__RET_I]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16
+// CHECK-NEXT: ret <2 x double> [[TMP3]]
+//
 float64x2_t rnd25(float64x2_t a) { return vrndxq_f64(a); }
-// CHECK: call <2 x double> @llvm.rint.v2f64(<2 x double>
[[TMP3:%.*]] = load <2 x double>, ptr [[__RET_I]], align 16 +// CHECK-NEXT: ret <2 x double> [[TMP3]] +// float64x2_t rnd25(float64x2_t a) { return vrndxq_f64(a); } -// CHECK: call <2 x double> @llvm.rint.v2f64(<2 x double> diff --git a/clang/test/CodeGen/arm64_vcreate.c b/clang/test/CodeGen/arm64_vcreate.c index 2b6e8e4439167..3641ebc4ec6a7 100644 --- a/clang/test/CodeGen/arm64_vcreate.c +++ b/clang/test/CodeGen/arm64_vcreate.c @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s // Test ARM64 SIMD vcreate intrinsics @@ -5,9 +6,23 @@ #include +// CHECK-LABEL: define <2 x float> @test_vcreate_f32( +// CHECK-SAME: i64 noundef [[A1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A1_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <2 x float>, align 8 +// CHECK-NEXT: [[__PROMOTE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca <2 x float>, align 8 +// CHECK-NEXT: store i64 [[A1]], ptr [[A1_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A1_ADDR]], align 8 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[__PROMOTE]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__PROMOTE]], align 8 +// CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[TMP]], align 8 +// CHECK-NEXT: ret <2 x float> [[TMP3]] +// float32x2_t test_vcreate_f32(uint64_t a1) { - // CHECK: test_vcreate_f32 return vcreate_f32(a1); - // CHECK: bitcast {{.*}} to <2 x float> - // CHECK-NEXT: ret } diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c index 2da2d3bc8d075..5159382ba0820 100644 --- a/clang/test/CodeGen/arm64_vdupq_n_f64.c +++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c @@ -1,57 +1,74 @@ -// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target #include // vdupq_n_f64 -> dup.2d v0, v0[0] +// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64( +// CHECK-SAME: double noundef [[W:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[W]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[W]], i32 1 +// CHECK-NEXT: ret <2 x double> [[VECINIT1_I]] // -// CHECK-LABEL: define{{.*}} <2 x double> @test_vdupq_n_f64(double noundef %w) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double %w, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 -// CHECK: ret <2 x double> [[VECINIT1_I]] float64x2_t test_vdupq_n_f64(float64_t w) { return vdupq_n_f64(w); } // might as well test this while we're here // vdupq_n_f32 -> dup.4s v0, v0[0] -// CHECK-LABEL: define{{.*}} <4 x float> 
@test_vdupq_n_f32(float noundef %w) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %w, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3 -// CHECK: ret <4 x float> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32( +// CHECK-SAME: float noundef [[W:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[W]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[W]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[W]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[W]], i32 3 +// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]] +// float32x4_t test_vdupq_n_f32(float32_t w) { return vdupq_n_f32(w); } // vdupq_lane_f64 -> dup.2d v0, v0[0] -// CHECK-LABEL: define{{.*}} <2 x double> @test_vdupq_lane_f64(<1 x double> noundef %V) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer -// CHECK: ret <2 x double> [[SHUFFLE]] +// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64( +// CHECK-SAME: <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V]] to i64 +// CHECK-NEXT: [[__S0_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[__S0_SROA_0_0_VEC_INSERT]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x double> [[LANE]] +// float64x2_t test_vdupq_lane_f64(float64x1_t V) { return vdupq_lane_f64(V, 0); } // vmovq_n_f64 -> dup Vd.2d,X0 -// CHECK-LABEL: define{{.*}} <2 x double> @test_vmovq_n_f64(double noundef %w) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double %w, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 -// CHECK: ret <2 x double> [[VECINIT1_I]] +// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64( +// CHECK-SAME: double noundef [[W:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[W]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[W]], i32 1 +// CHECK-NEXT: ret <2 x double> [[VECINIT1_I]] +// float64x2_t test_vmovq_n_f64(float64_t w) { return vmovq_n_f64(w); } -// CHECK-LABEL: define{{.*}} <4 x half> @test_vmov_n_f16(ptr noundef %a1) #0 { -// CHECK: [[TMP0:%.*]] = load half, ptr %a1, align 2 -// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0 -// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 -// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 -// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3 -// CHECK: ret <4 x half> [[VECINIT3]] +// CHECK-LABEL: define <4 x 
half> @test_vmov_n_f16( +// CHECK-SAME: ptr noundef [[A1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A1]], align 2 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0 +// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK-NEXT: ret <4 x half> [[VECINIT3]] +// float16x4_t test_vmov_n_f16(float16_t *a1) { return vmov_n_f16(*a1); } @@ -62,17 +79,20 @@ float64x1_t test_vmov_n_f64(float64_t a1) { } */ -// CHECK-LABEL: define{{.*}} <8 x half> @test_vmovq_n_f16(ptr noundef %a1) #0 { -// CHECK: [[TMP0:%.*]] = load half, ptr %a1, align 2 -// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0 -// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 -// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 -// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 -// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 -// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 -// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 -// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 -// CHECK: ret <8 x half> [[VECINIT7]] +// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16( +// CHECK-SAME: ptr noundef [[A1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A1]], align 2 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0 +// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 +// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 +// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 +// CHECK-NEXT: ret <8 x half> [[VECINIT7]] +// float16x8_t test_vmovq_n_f16(float16_t *a1) { return vmovq_n_f16(*a1); } diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c index 9f43dd2be5af5..eb9fe126ff2a0 100644 --- a/clang/test/CodeGen/arm_neon_intrinsics.c +++ b/clang/test/CodeGen/arm_neon_intrinsics.c @@ -1,20011 +1,27310 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\ // RUN: -target-cpu swift \ // RUN: -target-feature +fullfp16 -ffreestanding \ // RUN: -flax-vector-conversions=none \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg | FileCheck %s +// RUN: | opt -S -passes=mem2reg,sroa | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target #include -// CHECK-LABEL: @test_vaba_s8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> 
@llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vaba_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[VABD_V_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vaba_s8(a, b, c); } -// CHECK-LABEL: @test_vaba_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vaba_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[TMP2]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vaba_s16(a, b, c); } -// CHECK-LABEL: @test_vaba_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vaba_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[TMP2]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vaba_s32(a, b, c); } -// CHECK-LABEL: @test_vaba_u8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) 
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vaba_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[VABD_V_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vaba_u8(a, b, c); } -// CHECK-LABEL: @test_vaba_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vaba_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[TMP2]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vaba_u16(a, b, c); } -// CHECK-LABEL: @test_vaba_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vaba_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[TMP2]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vaba_u32(a, b, c); } -// CHECK-LABEL: @test_vabaq_s8( -// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <16 x i8> 
%a, [[VABDQ_V_I_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vabaq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[VABDQ_V_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vabaq_s8(a, b, c); } -// CHECK-LABEL: @test_vabaq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabaq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP2]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vabaq_s16(a, b, c); } -// CHECK-LABEL: @test_vabaq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabaq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vabaq_s32(a, b, c); } -// CHECK-LABEL: @test_vabaq_u8( -// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) -// CHECK: 
[[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vabaq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[VABDQ_V_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vabaq_u8(a, b, c); } -// CHECK-LABEL: @test_vabaq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabaq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP2]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vabaq_u16(a, b, c); } -// CHECK-LABEL: @test_vabaq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabaq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP2]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vabaq_u32(a, b, c); } -// CHECK-LABEL: @test_vabal_s8( -// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> 
%b, <8 x i8> %c) -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabal_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vabal_s8(a, b, c); } -// CHECK-LABEL: @test_vabal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) +// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vabal_s16(a, b, c); } -// CHECK-LABEL: @test_vabal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vabal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) +// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vabal_s32(a, b, c); } -// CHECK-LABEL: @test_vabal_u8( -// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabal_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vabal_u8(a, b, c); } -// CHECK-LABEL: @test_vabal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabal_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) +// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vabal_u16(a, b, c); } -// CHECK-LABEL: @test_vabal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// 
CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vabal_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) +// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vabal_u32(a, b, c); } -// CHECK-LABEL: @test_vabd_s8( -// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VABD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vabd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VABD_V_I]] +// int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) { return vabd_s8(a, b); } -// CHECK-LABEL: @test_vabd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VABD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vabd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) { return vabd_s16(a, b); } -// CHECK-LABEL: @test_vabd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> 
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VABD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vabd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) { return vabd_s32(a, b); } -// CHECK-LABEL: @test_vabd_u8( -// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VABD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vabd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VABD_V_I]] +// uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) { return vabd_u8(a, b); } -// CHECK-LABEL: @test_vabd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VABD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vabd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) { return vabd_u16(a, b); } -// CHECK-LABEL: @test_vabd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VABD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vabd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> 
[[B]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) { return vabd_u32(a, b); } -// CHECK-LABEL: @test_vabd_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VABD_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vabd_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[VABD_V_I]], <2 x float> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) { return vabd_f32(a, b); } -// CHECK-LABEL: @test_vabdq_s8( -// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VABDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vabdq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VABDQ_V_I]] +// int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) { return vabdq_s8(a, b); } -// CHECK-LABEL: @test_vabdq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VABDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> 
[[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) { return vabdq_s16(a, b); } -// CHECK-LABEL: @test_vabdq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VABDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vabdq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) { return vabdq_s32(a, b); } -// CHECK-LABEL: @test_vabdq_u8( -// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VABDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vabdq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VABDQ_V_I]] +// uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) { return vabdq_u8(a, b); } -// CHECK-LABEL: @test_vabdq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VABDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) { return vabdq_u16(a, b); } -// CHECK-LABEL: @test_vabdq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// 
CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VABDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vabdq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) { return vabdq_u32(a, b); } -// CHECK-LABEL: @test_vabdq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VABDQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vabdq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[VABDQ_V_I]], <4 x float> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) { return vabdq_f32(a, b); } -// CHECK-LABEL: @test_vabdl_s8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] +// int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); } -// CHECK-LABEL: @test_vabdl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> 
%b) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I_I]] +// CHECK-LABEL: define <4 x i32> @test_vabdl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) +// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] +// int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { return vabdl_s16(a, b); } -// CHECK-LABEL: @test_vabdl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I_I]] +// CHECK-LABEL: define <2 x i64> @test_vabdl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) +// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] +// int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { return vabdl_s32(a, b); } -// CHECK-LABEL: @test_vabdl_u8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> 
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { return vabdl_u8(a, b); }
-// CHECK-LABEL: @test_vabdl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) { return vabdl_u16(a, b); }
-// CHECK-LABEL: @test_vabdl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]])
+// CHECK-NEXT: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) { return vabdl_u32(a, b); }
-// CHECK-LABEL: @test_vabs_s8(
-// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VABS_I]]
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VABS_I]]
+//
int8x8_t test_vabs_s8(int8x8_t a) { return vabs_s8(a); }
-// CHECK-LABEL: @test_vabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a)
-// CHECK: ret <4 x i16> [[VABS1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[VABS_I]])
+// CHECK-NEXT: ret <4 x i16> [[VABS1_I]]
+//
int16x4_t test_vabs_s16(int16x4_t a) { return vabs_s16(a); }
-// CHECK-LABEL: @test_vabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VABS1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[VABS_I]])
+// CHECK-NEXT: ret <2 x i32> [[VABS1_I]]
+//
int32x2_t test_vabs_s32(int32x2_t a) { return vabs_s32(a); }
-// CHECK-LABEL: @test_vabs_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VABS1_I]]
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]])
+// CHECK-NEXT: ret <2 x float> [[VABS1_I]]
+//
float32x2_t test_vabs_f32(float32x2_t a) { return vabs_f32(a); }
-// CHECK-LABEL: @test_vabsq_s8(
-// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VABS_I]]
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VABS_I]]
+//
int8x16_t test_vabsq_s8(int8x16_t a) { return vabsq_s8(a); }
-// CHECK-LABEL: @test_vabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a)
-// CHECK: ret <8 x i16> [[VABS1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[VABS_I]])
+// CHECK-NEXT: ret <8 x i16> [[VABS1_I]]
+//
int16x8_t test_vabsq_s16(int16x8_t a) { return vabsq_s16(a); }
-// CHECK-LABEL: @test_vabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VABS1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[VABS_I]])
+// CHECK-NEXT: ret <4 x i32> [[VABS1_I]]
+//
int32x4_t test_vabsq_s32(int32x4_t a) { return vabsq_s32(a); }
-// CHECK-LABEL: @test_vabsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VABS1_I]]
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]])
+// CHECK-NEXT: ret <4 x float> [[VABS1_I]]
+//
float32x4_t test_vabsq_f32(float32x4_t a) { return vabsq_f32(a); }
-// CHECK-LABEL: @test_vadd_s8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) { return vadd_s8(a, b); }
-// CHECK-LABEL: @test_vadd_s16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) { return vadd_s16(a, b); }
-// CHECK-LABEL: @test_vadd_s32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) { return vadd_s32(a, b); }
-// CHECK-LABEL: @test_vadd_s64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) { return vadd_s64(a, b); }
-// CHECK-LABEL: @test_vadd_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define <2 x float> @test_vadd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) { return vadd_f32(a, b); }
-// CHECK-LABEL: @test_vadd_u8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) { return vadd_u8(a, b); }
-// CHECK-LABEL: @test_vadd_u16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) { return vadd_u16(a, b); }
-// CHECK-LABEL: @test_vadd_u32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) { return vadd_u32(a, b); }
-// CHECK-LABEL: @test_vadd_u64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) { return vadd_u64(a, b); }
-// CHECK-LABEL: @test_vaddq_s8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) { return vaddq_s8(a, b); }
-// CHECK-LABEL: @test_vaddq_s16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) { return vaddq_s16(a, b); }
-// CHECK-LABEL: @test_vaddq_s32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) { return vaddq_s32(a, b); }
-// CHECK-LABEL: @test_vaddq_s64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) { return vaddq_s64(a, b); }
-// CHECK-LABEL: @test_vaddq_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define <4 x float> @test_vaddq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); }
-// CHECK-LABEL: @test_vaddq_u8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) { return vaddq_u8(a, b); }
-// CHECK-LABEL: @test_vaddq_u16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) { return vaddq_u16(a, b); }
-// CHECK-LABEL: @test_vaddq_u32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) { return vaddq_u32(a, b); }
-// CHECK-LABEL: @test_vaddq_u64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) { return vaddq_u64(a, b); }
-// CHECK-LABEL: @test_vaddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) { return vaddhn_s16(a, b); }
-// CHECK-LABEL: @test_vaddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) { return vaddhn_s32(a, b); }
-// CHECK-LABEL: @test_vaddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) { return vaddhn_s64(a, b); }
-// CHECK-LABEL: @test_vaddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) { return vaddhn_u16(a, b); }
-// CHECK-LABEL: @test_vaddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) { return vaddhn_u32(a, b); }
-// CHECK-LABEL: @test_vaddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) { return vaddhn_u64(a, b); }
-// CHECK-LABEL: @test_vaddl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I6:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I6]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) { return vaddl_s8(a, b); }
-// CHECK-LABEL: @test_vaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I6:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I6]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) { return vaddl_s16(a, b); }
-// CHECK-LABEL: @test_vaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I6:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I6]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) { return vaddl_s32(a, b); }
-// CHECK-LABEL: @test_vaddl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I6:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I6]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) { return vaddl_u8(a, b); }
-// CHECK-LABEL: @test_vaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I6:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I6]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) { return vaddl_u16(a, b); }
-// CHECK-LABEL: @test_vaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I6:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I6]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) { return vaddl_u32(a, b); }
-// CHECK-LABEL: @test_vaddw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) { return vaddw_s8(a, b); }
-// CHECK-LABEL: @test_vaddw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) { return vaddw_s16(a, b); }
-// CHECK-LABEL: @test_vaddw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) { return vaddw_s32(a, b); }
-// CHECK-LABEL: @test_vaddw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) { return vaddw_u8(a, b); }
-// CHECK-LABEL: @test_vaddw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) { return vaddw_u16(a, b); }
-// CHECK-LABEL: @test_vaddw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) { return vaddw_u32(a, b); }
-// CHECK-LABEL: @test_vand_s8(
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) { return vand_s8(a, b); }
-// CHECK-LABEL: @test_vand_s16(
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) { return vand_s16(a, b); }
-// CHECK-LABEL: @test_vand_s32(
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) { return vand_s32(a, b); }
-// CHECK-LABEL: @test_vand_s64(
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) { return vand_s64(a, b); }
-// CHECK-LABEL: @test_vand_u8(
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) { return vand_u8(a, b); }
-// CHECK-LABEL: @test_vand_u16(
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) { return vand_u16(a, b); }
-// CHECK-LABEL: @test_vand_u32(
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) { return vand_u32(a, b); }
-// CHECK-LABEL: @test_vand_u64(
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) { return vand_u64(a, b); }
-// CHECK-LABEL: @test_vandq_s8(
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) { return vandq_s8(a, b); }
-// CHECK-LABEL: @test_vandq_s16(
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) { return vandq_s16(a, b); }
-// CHECK-LABEL: @test_vandq_s32(
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) { return vandq_s32(a, b); }
-// CHECK-LABEL: @test_vandq_s64(
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) { return vandq_s64(a, b); }
-// CHECK-LABEL: @test_vandq_u8(
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) { return vandq_u8(a, b); }
-// CHECK-LABEL: @test_vandq_u16(
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) { return vandq_u16(a, b); }
-// CHECK-LABEL: @test_vandq_u32(
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) { return vandq_u32(a, b); }
-// CHECK-LABEL: @test_vandq_u64(
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) { return vandq_u64(a, b); }
-// CHECK-LABEL: @test_vbic_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) { return vbic_s8(a, b); }
-// CHECK-LABEL: @test_vbic_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) { return vbic_s16(a, b); }
-// CHECK-LABEL: @test_vbic_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) { return vbic_s32(a, b); }
-// CHECK-LABEL: @test_vbic_s64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) { return vbic_s64(a, b); }
-// CHECK-LABEL: @test_vbic_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) { return vbic_u8(a, b); }
-// CHECK-LABEL: @test_vbic_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) { return vbic_u16(a, b); }
-// CHECK-LABEL: @test_vbic_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) { return vbic_u32(a, b); }
-// CHECK-LABEL: @test_vbic_u64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) { return vbic_u64(a, b); }
-// CHECK-LABEL: @test_vbicq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) { return vbicq_s8(a, b); }
-// CHECK-LABEL: @test_vbicq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) { return vbicq_s16(a, b); }
-// CHECK-LABEL: @test_vbicq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) { return vbicq_s32(a, b); }
-// CHECK-LABEL: @test_vbicq_s64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) { return vbicq_s64(a, b); }
-// CHECK-LABEL: @test_vbicq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) { return vbicq_u8(a, b); }
-// CHECK-LABEL: @test_vbicq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) { return vbicq_u16(a, b); }
-// CHECK-LABEL: @test_vbicq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) { return vbicq_u32(a, b); }
-// CHECK-LABEL: @test_vbicq_u64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) { return vbicq_u64(a, b); }
-// CHECK-LABEL: @test_vbsl_s8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) { return vbsl_s8(a, b, c); }
-// CHECK-LABEL: @test_vbsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) { return vbsl_s16(a, b, c); }
-// CHECK-LABEL: @test_vbsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) { return vbsl_s32(a, b, c); }
-// CHECK-LABEL: @test_vbsl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP3]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) { return vbsl_s64(a, b, c); }
-// CHECK-LABEL: @test_vbsl_u8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vbsl_u8(a, b, c); }
-// CHECK-LABEL: @test_vbsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vbsl_u16(a, b, c); }
-// CHECK-LABEL: @test_vbsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vbsl_u32(a, b, c); }
-// CHECK-LABEL: @test_vbsl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP3]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) { return vbsl_u64(a, b, c); }
-// CHECK-LABEL: @test_vbsl_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vbsl_f32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[C]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]])
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP6]]
+//
float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) { return vbsl_f32(a, b, c); }
-// CHECK-LABEL: @test_vbsl_p8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) { return vbsl_p8(a, b, c); }
-// CHECK-LABEL: @test_vbsl_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) { return vbsl_p16(a, b, c); }
-// CHECK-LABEL: @test_vbslq_s8(
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
-// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbslq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]] +// int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) { return vbslq_s8(a, b, c); } -// CHECK-LABEL: @test_vbslq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP3]] +// CHECK-LABEL: define <8 x i16> @test_vbslq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return vbslq_s16(a, b, c); } -// CHECK-LABEL: @test_vbslq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define <4 x i32> @test_vbslq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return vbslq_s32(a, b, c); } -// CHECK-LABEL: @test_vbslq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP3]] +// CHECK-LABEL: define <2 x i64> @test_vbslq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return vbslq_s64(a, b, c); } -// CHECK-LABEL: @test_vbslq_u8( -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) -// CHECK: ret <16 x i8> [[VBSLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vbslq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]] +// uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vbslq_u8(a, b, c); } -// CHECK-LABEL: @test_vbslq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP3]] +// CHECK-LABEL: define <8 x i16> @test_vbslq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vbslq_u16(a, b, c); } -// CHECK-LABEL: @test_vbslq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[TMP3]] +// CHECK-LABEL: define <4 x i32> @test_vbslq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, 
uint32x4_t c) { return vbslq_u32(a, b, c); } -// CHECK-LABEL: @test_vbslq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[TMP3]] +// CHECK-LABEL: define <2 x i64> @test_vbslq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return vbslq_u64(a, b, c); } -// CHECK-LABEL: @test_vbslq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP3]] +// CHECK-LABEL: define <4 x float> @test_vbslq_f32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[C]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP6]] +// float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return vbslq_f32(a, b, c); } -// CHECK-LABEL: @test_vbslq_p8( -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) -// CHECK: ret <16 x i8> [[VBSLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vbslq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]] +// poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) { return vbslq_p8(a, b, c); } -// CHECK-LABEL: @test_vbslq_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c 
to <16 x i8> -// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP3]] +// CHECK-LABEL: define <8 x i16> @test_vbslq_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8> +// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) { return vbslq_p16(a, b, c); } -// CHECK-LABEL: @test_vcage_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x i32> [[VCAGE_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vcage_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCAGE_V2_I]] +// uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) { return vcage_f32(a, b); } -// CHECK-LABEL: @test_vcageq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vcageq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCAGEQ_V2_I]] +// uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) { return vcageq_f32(a, b); } -// CHECK-LABEL: @test_vcagt_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x 
i8> -// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x i32> [[VCAGT_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vcagt_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCAGT_V2_I]] +// uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) { return vcagt_f32(a, b); } -// CHECK-LABEL: @test_vcagtq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCAGTQ_V2_I]] +// uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) { return vcagtq_f32(a, b); } -// CHECK-LABEL: @test_vcale_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a) -// CHECK: ret <2 x i32> [[VCALE_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vcale_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCALE_V2_I]] +// uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) { return vcale_f32(a, b); } -// CHECK-LABEL: @test_vcaleq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = 
bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a) -// CHECK: ret <4 x i32> [[VCALEQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCALEQ_V2_I]] +// uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) { return vcaleq_f32(a, b); } -// CHECK-LABEL: @test_vcalt_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a) -// CHECK: ret <2 x i32> [[VCALT_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vcalt_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VCALT_V2_I]] +// uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) { return vcalt_f32(a, b); } -// CHECK-LABEL: @test_vcaltq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a) -// CHECK: ret <4 x i32> [[VCALTQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VCALTQ_V2_I]] +// uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) { return vcaltq_f32(a, b); } -// CHECK-LABEL: @test_vceq_s8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> 
%a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vceq_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) { return vceq_s8(a, b); } -// CHECK-LABEL: @test_vceq_s16( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vceq_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) { return vceq_s16(a, b); } -// CHECK-LABEL: @test_vceq_s32( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vceq_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) { return vceq_s32(a, b); } -// CHECK-LABEL: @test_vceq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vceq_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) { return vceq_f32(a, b); } -// CHECK-LABEL: @test_vceq_u8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vceq_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) { return vceq_u8(a, b); } -// CHECK-LABEL: @test_vceq_u16( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vceq_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) { return 
vceq_u16(a, b); } -// CHECK-LABEL: @test_vceq_u32( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vceq_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) { return vceq_u32(a, b); } -// CHECK-LABEL: @test_vceq_p8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vceq_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) { return vceq_p8(a, b); } -// CHECK-LABEL: @test_vceqq_s8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vceqq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) { return vceqq_s8(a, b); } -// CHECK-LABEL: @test_vceqq_s16( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define <8 x i16> @test_vceqq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) { return vceqq_s16(a, b); } -// CHECK-LABEL: @test_vceqq_s32( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vceqq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) { return vceqq_s32(a, b); } -// CHECK-LABEL: @test_vceqq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vceqq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x 
i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) { return vceqq_f32(a, b); } -// CHECK-LABEL: @test_vceqq_u8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vceqq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) { return vceqq_u8(a, b); } -// CHECK-LABEL: @test_vceqq_u16( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define <8 x i16> @test_vceqq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) { return vceqq_u16(a, b); } -// CHECK-LABEL: @test_vceqq_u32( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vceqq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) { return vceqq_u32(a, b); } -// CHECK-LABEL: @test_vceqq_p8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vceqq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) { return vceqq_p8(a, b); } -// CHECK-LABEL: @test_vcge_s8( -// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vcge_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) { return vcge_s8(a, b); } -// CHECK-LABEL: @test_vcge_s16( -// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vcge_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[CMP_I:%.*]] = icmp sge <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) { return vcge_s16(a, b); } -// CHECK-LABEL: @test_vcge_s32( -// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcge_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) { return vcge_s32(a, b); } -// CHECK-LABEL: @test_vcge_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcge_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) { return vcge_f32(a, b); } -// CHECK-LABEL: @test_vcge_u8( -// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vcge_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) { return vcge_u8(a, b); } -// CHECK-LABEL: @test_vcge_u16( -// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vcge_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) { return vcge_u16(a, b); } -// CHECK-LABEL: @test_vcge_u32( -// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcge_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) { return vcge_u32(a, b); } -// CHECK-LABEL: @test_vcgeq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8( +// CHECK-SAME: 
<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) { return vcgeq_s8(a, b); } -// CHECK-LABEL: @test_vcgeq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) { return vcgeq_s16(a, b); } -// CHECK-LABEL: @test_vcgeq_s32( -// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) { return vcgeq_s32(a, b); } -// CHECK-LABEL: @test_vcgeq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) { return vcgeq_f32(a, b); } -// CHECK-LABEL: @test_vcgeq_u8( -// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) { return vcgeq_u8(a, b); } -// CHECK-LABEL: @test_vcgeq_u16( -// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) { return vcgeq_u16(a, b); } -// CHECK-LABEL: @test_vcgeq_u32( -// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b -// CHECK: 
[[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) { return vcgeq_u32(a, b); } -// CHECK-LABEL: @test_vcgt_s8( -// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vcgt_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) { return vcgt_s8(a, b); } -// CHECK-LABEL: @test_vcgt_s16( -// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vcgt_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) { return vcgt_s16(a, b); } -// CHECK-LABEL: @test_vcgt_s32( -// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcgt_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) { return vcgt_s32(a, b); } -// CHECK-LABEL: @test_vcgt_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcgt_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) { return vcgt_f32(a, b); } -// CHECK-LABEL: @test_vcgt_u8( -// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vcgt_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) { return vcgt_u8(a, b); 
} -// CHECK-LABEL: @test_vcgt_u16( -// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vcgt_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) { return vcgt_u16(a, b); } -// CHECK-LABEL: @test_vcgt_u32( -// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcgt_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) { return vcgt_u32(a, b); } -// CHECK-LABEL: @test_vcgtq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) { return vcgtq_s8(a, b); } -// CHECK-LABEL: @test_vcgtq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) { return vcgtq_s16(a, b); } -// CHECK-LABEL: @test_vcgtq_s32( -// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) { return vcgtq_s32(a, b); } -// CHECK-LABEL: @test_vcgtq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> 
[[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) { return vcgtq_f32(a, b); } -// CHECK-LABEL: @test_vcgtq_u8( -// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) { return vcgtq_u8(a, b); } -// CHECK-LABEL: @test_vcgtq_u16( -// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) { return vcgtq_u16(a, b); } -// CHECK-LABEL: @test_vcgtq_u32( -// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) { return vcgtq_u32(a, b); } -// CHECK-LABEL: @test_vcle_s8( -// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vcle_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) { return vcle_s8(a, b); } -// CHECK-LABEL: @test_vcle_s16( -// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vcle_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) { return vcle_s16(a, b); } -// CHECK-LABEL: @test_vcle_s32( -// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcle_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) { return vcle_s32(a, b); } -// CHECK-LABEL: @test_vcle_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcle_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) { return vcle_f32(a, b); } -// CHECK-LABEL: @test_vcle_u8( -// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define <8 x i8> @test_vcle_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) { return vcle_u8(a, b); } -// CHECK-LABEL: @test_vcle_u16( -// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define <4 x i16> @test_vcle_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) { return vcle_u16(a, b); } -// CHECK-LABEL: @test_vcle_u32( -// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcle_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) { return vcle_u32(a, b); } -// CHECK-LABEL: @test_vcleq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define <16 x i8> @test_vcleq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) { return vcleq_s8(a, b); } -// CHECK-LABEL: @test_vcleq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define <8 x i16> 
@test_vcleq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) { return vcleq_s16(a, b); }
-// CHECK-LABEL: @test_vcleq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) { return vcleq_s32(a, b); }
-// CHECK-LABEL: @test_vcleq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) { return vcleq_f32(a, b); }
-// CHECK-LABEL: @test_vcleq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
 uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) { return vcleq_u8(a, b); }
-// CHECK-LABEL: @test_vcleq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) { return vcleq_u16(a, b); }
-// CHECK-LABEL: @test_vcleq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) { return vcleq_u32(a, b); }
-// CHECK-LABEL: @test_vcls_s8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
 int8x8_t test_vcls_s8(int8x8_t a) { return vcls_s8(a); }
-// CHECK-LABEL: @test_vcls_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[VCLS_V_I]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vcls_s16(int16x4_t a) { return vcls_s16(a); }
-// CHECK-LABEL: @test_vcls_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[VCLS_V_I]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vcls_s32(int32x2_t a) { return vcls_s32(a); }
-// CHECK-LABEL: @test_vcls_u8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcls_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
 int8x8_t test_vcls_u8(uint8x8_t a) { return vcls_u8(a); }
-// CHECK-LABEL: @test_vcls_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcls_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[VCLS_V_I]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vcls_u16(uint16x4_t a) { return vcls_u16(a); }
-// CHECK-LABEL: @test_vcls_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcls_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[VCLS_V_I]])
+// CHECK-NEXT: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vcls_u32(uint32x2_t a) { return vcls_u32(a); }
-// CHECK-LABEL: @test_vclsq_s8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
 int8x16_t test_vclsq_s8(int8x16_t a) { return vclsq_s8(a); }
-// CHECK-LABEL: @test_vclsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[VCLSQ_V_I]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vclsq_s16(int16x8_t a) { return vclsq_s16(a); }
-// CHECK-LABEL: @test_vclsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[VCLSQ_V_I]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 int32x4_t test_vclsq_s32(int32x4_t a) { return vclsq_s32(a); }
-// CHECK-LABEL: @test_vclsq_u8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
 int8x16_t test_vclsq_u8(uint8x16_t a) { return vclsq_u8(a); }
-// CHECK-LABEL: @test_vclsq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[VCLSQ_V_I]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vclsq_u16(uint16x8_t a) { return vclsq_u16(a); }
-// CHECK-LABEL: @test_vclsq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[VCLSQ_V_I]])
+// CHECK-NEXT: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 int32x4_t test_vclsq_u32(uint32x4_t a) { return vclsq_u32(a); }
-// CHECK-LABEL: @test_vclt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
 uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) { return vclt_s8(a, b); }
-// CHECK-LABEL: @test_vclt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclt_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
 uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) { return vclt_s16(a, b); }
-// CHECK-LABEL: @test_vclt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) { return vclt_s32(a, b); }
-// CHECK-LABEL: @test_vclt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) { return vclt_f32(a, b); }
-// CHECK-LABEL: @test_vclt_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
 uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) { return vclt_u8(a, b); }
-// CHECK-LABEL: @test_vclt_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclt_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
 uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) { return vclt_u16(a, b); }
-// CHECK-LABEL: @test_vclt_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) { return vclt_u32(a, b); }
-// CHECK-LABEL: @test_vcltq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
 uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) { return vcltq_s8(a, b); }
-// CHECK-LABEL: @test_vcltq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) { return vcltq_s16(a, b); }
-// CHECK-LABEL: @test_vcltq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) { return vcltq_s32(a, b); }
-// CHECK-LABEL: @test_vcltq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) { return vcltq_f32(a, b); }
-// CHECK-LABEL: @test_vcltq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
 uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) { return vcltq_u8(a, b); }
-// CHECK-LABEL: @test_vcltq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) { return vcltq_u16(a, b); }
-// CHECK-LABEL: @test_vcltq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) { return vcltq_u32(a, b); }
-// CHECK-LABEL: @test_vclz_s8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
 int8x8_t test_vclz_s8(int8x8_t a) { return vclz_s8(a); }
-// CHECK-LABEL: @test_vclz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vclz_s16(int16x4_t a) { return vclz_s16(a); }
-// CHECK-LABEL: @test_vclz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vclz_s32(int32x2_t a) { return vclz_s32(a); }
-// CHECK-LABEL: @test_vclz_u8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
 uint8x8_t test_vclz_u8(uint8x8_t a) { return vclz_u8(a); }
-// CHECK-LABEL: @test_vclz_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 uint16x4_t test_vclz_u16(uint16x4_t a) { return vclz_u16(a); }
-// CHECK-LABEL: @test_vclz_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 uint32x2_t test_vclz_u32(uint32x2_t a) { return vclz_u32(a); }
-// CHECK-LABEL: @test_vclzq_s8(
-// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
+//
 int8x16_t test_vclzq_s8(int8x16_t a) { return vclzq_s8(a); }
-// CHECK-LABEL: @test_vclzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vclzq_s16(int16x8_t a) { return vclzq_s16(a); }
-// CHECK-LABEL: @test_vclzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 int32x4_t test_vclzq_s32(int32x4_t a) { return vclzq_s32(a); }
-// CHECK-LABEL: @test_vclzq_u8(
-// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
+//
 uint8x16_t test_vclzq_u8(uint8x16_t a) { return vclzq_u8(a); }
-// CHECK-LABEL: @test_vclzq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 uint16x8_t test_vclzq_u16(uint16x8_t a) { return vclzq_u16(a); }
-// CHECK-LABEL: @test_vclzq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 uint32x4_t test_vclzq_u32(uint32x4_t a) { return vclzq_u32(a); }
-// CHECK-LABEL: @test_vcnt_u8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
 uint8x8_t test_vcnt_u8(uint8x8_t a) { return vcnt_u8(a); }
-// CHECK-LABEL: @test_vcnt_s8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
 int8x8_t test_vcnt_s8(int8x8_t a) { return vcnt_s8(a); }
-// CHECK-LABEL: @test_vcnt_p8(
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
 poly8x8_t test_vcnt_p8(poly8x8_t a) { return vcnt_p8(a); }
-// CHECK-LABEL: @test_vcntq_u8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
 uint8x16_t test_vcntq_u8(uint8x16_t a) { return vcntq_u8(a); }
-// CHECK-LABEL: @test_vcntq_s8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
 int8x16_t test_vcntq_s8(int8x16_t a) { return vcntq_s8(a); }
-// CHECK-LABEL: @test_vcntq_p8(
-// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCNTQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]]
+//
 poly8x16_t test_vcntq_p8(poly8x16_t a) { return vcntq_p8(a); }
-// CHECK-LABEL: @test_vcombine_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) { return vcombine_s8(a, b); }
-// CHECK-LABEL: @test_vcombine_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) { return vcombine_s16(a, b); }
-// CHECK-LABEL: @test_vcombine_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) { return vcombine_s32(a, b); }
-// CHECK-LABEL: @test_vcombine_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) { return vcombine_s64(a, b); }
-// CHECK-LABEL: @test_vcombine_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32>
-// CHECK: ret <8 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x half> @test_vcombine_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); }
-// CHECK-LABEL: @test_vcombine_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x float> @test_vcombine_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) { return vcombine_f32(a, b); }
-// CHECK-LABEL: @test_vcombine_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) { return vcombine_u8(a, b); }
-// CHECK-LABEL: @test_vcombine_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) { return vcombine_u16(a, b); }
-// CHECK-LABEL: @test_vcombine_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) { return vcombine_u32(a, b); }
-// CHECK-LABEL: @test_vcombine_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) { return vcombine_u64(a, b); }
-// CHECK-LABEL: @test_vcombine_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) { return vcombine_p8(a, b); }
-// CHECK-LABEL: @test_vcombine_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) { return vcombine_p16(a, b); }
-// CHECK-LABEL: @test_vcreate_s8(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
 int8x8_t test_vcreate_s8(uint64_t a) { return vclz_s8(vcreate_s8(a)); }
-// CHECK-LABEL: @test_vcreate_imm
-// CHECK: [[RES:%.*]] = bitcast i64 0 to <4 x i16>
-// CHECK: ret <4 x i16> [[RES]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_imm(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vcreate_imm(void) { return vcreate_s16(0); }
-// CHECK-LABEL: @test_vcreate_s16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vcreate_s16(uint64_t a) { return vclz_s16(vcreate_s16(a)); }
-// CHECK-LABEL: @test_vcreate_s32(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vcreate_s32(uint64_t a) { return vclz_s32(vcreate_s32(a)); }
-// CHECK-LABEL: @test_vcreate_f16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vcreate_f16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vcreate_f16(uint64_t a) { return vcreate_f16(a); }
-// CHECK-LABEL: @test_vcreate_f32(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vcreate_f32(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vcreate_f32(uint64_t a) { return vcreate_f32(a); }
-// CHECK-LABEL: @test_vcreate_u8(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
 int8x8_t test_vcreate_u8(uint64_t a) { return vclz_s8((int8x8_t)vcreate_u8(a)); }
-// CHECK-LABEL: @test_vcreate_u16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vcreate_u16(uint64_t a) { return vclz_s16((int16x4_t)vcreate_u16(a)); }
-// CHECK-LABEL: @test_vcreate_u32(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false)
+// CHECK-NEXT: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vcreate_u32(uint64_t a) { return vclz_s32((int32x2_t)vcreate_u32(a)); }
-// CHECK-LABEL: @test_vcreate_u64(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <1 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 uint64x1_t test_vcreate_u64(uint64_t a) { uint64x1_t tmp = vcreate_u64(a); return vadd_u64(tmp, tmp); }
-// CHECK-LABEL: @test_vcreate_p8(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
-// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
-// CHECK: ret <8 x i8> [[VCNT_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]]
+//
 poly8x8_t test_vcreate_p8(uint64_t a) { return vcnt_p8(vcreate_p8(a)); }
-// CHECK-LABEL: @test_vcreate_p16(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+//
 poly16x4_t test_vcreate_p16(uint64_t a) { poly16x4_t tmp = vcreate_p16(a); return vbsl_p16((uint16x4_t)tmp, tmp, tmp); }
-// CHECK-LABEL: @test_vcreate_s64(
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <1 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 int64x1_t test_vcreate_s64(uint64_t a) { int64x1_t tmp = vcreate_s64(a); return vadd_s64(tmp, tmp); }
-// CHECK-LABEL: @test_vcvt_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a)
-// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
-// CHECK: ret <4 x half> [[TMP1]]
+// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]])
+// CHECK-NEXT: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP3]]
+//
 float16x4_t test_vcvt_f16_f32(float32x4_t a) { return vcvt_f16_f32(a); }
-// CHECK-LABEL: @test_vcvt_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
 float32x2_t test_vcvt_f32_s32(int32x2_t a) { return vcvt_f32_s32(a); }
-// CHECK-LABEL: @test_vcvt_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) { return vcvt_f32_u32(a); }
-// CHECK-LABEL: @test_vcvtq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) { return vcvtq_f32_s32(a); }
-// CHECK-LABEL: @test_vcvtq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[VCVT_I]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[VCVT_I]]
+//
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { return vcvtq_f32_u32(a); }
-// CHECK-LABEL: @test_vcvt_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
-// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
+// CHECK-NEXT: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
 float32x4_t test_vcvt_f32_f16(float16x4_t a) { return vcvt_f32_f16(a); }
-// CHECK-LABEL: @test_vcvt_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) { return vcvt_n_f32_s32(a, 1); }
-// CHECK-LABEL: @test_vcvt_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { return vcvt_n_f32_u32(a, 1); }
-// CHECK-LABEL: @test_vcvtq_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { return vcvtq_n_f32_s32(a, 3); }
-// CHECK-LABEL: @test_vcvtq_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { return vcvtq_n_f32_u32(a, 3); }
-// CHECK-LABEL: @test_vcvt_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { return vcvt_n_s32_f32(a, 1); }
-// CHECK-LABEL: @test_vcvtq_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { return vcvtq_n_s32_f32(a, 3); }
-// CHECK-LABEL: @test_vcvt_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { return vcvt_n_u32_f32(a, 1); }
-// CHECK-LABEL: @test_vcvtq_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) { return vcvtq_n_u32_f32(a, 3); }
-// CHECK-LABEL: @test_vcvt_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptosi <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <2 x float> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCVT_I]]
+//
 int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptosi <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCVT_I]]
+//
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); }
-// CHECK-LABEL: @test_vcvt_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptoui <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <2 x float> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCVT_I]]
+//
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); }
-// CHECK-LABEL: @test_vcvtq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptoui <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VCVT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCVT_I]]
+//
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); }
-// CHECK-LABEL: @test_vdup_lane_u8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
 uint8x8_t test_vdup_lane_u8(uint8x8_t a) { return vdup_lane_u8(a, 7); }
-// CHECK-LABEL: @test_vdup_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
 uint16x4_t test_vdup_lane_u16(uint16x4_t a) { return vdup_lane_u16(a, 3); }
-// CHECK-LABEL: @test_vdup_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
 uint32x2_t test_vdup_lane_u32(uint32x2_t a) { return vdup_lane_u32(a, 1); }
-// CHECK-LABEL: @test_vdup_lane_s8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
 int8x8_t test_vdup_lane_s8(int8x8_t a) { return vdup_lane_s8(a, 7); }
-// CHECK-LABEL: @test_vdup_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
 int16x4_t test_vdup_lane_s16(int16x4_t a) { return vdup_lane_s16(a, 3); }
-// CHECK-LABEL: @test_vdup_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK: ret <2 x i32> [[LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[LANE]]
+//
 int32x2_t test_vdup_lane_s32(int32x2_t a) { return vdup_lane_s32(a, 1); }
-// CHECK-LABEL: @test_vdup_lane_p8(
-// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[LANE]]
+//
 poly8x8_t test_vdup_lane_p8(poly8x8_t a) { return vdup_lane_p8(a, 7); }
-// CHECK-LABEL: @test_vdup_lane_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK: ret <4 x i16> [[LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] =
bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// poly16x4_t test_vdup_lane_p16(poly16x4_t a) { return vdup_lane_p16(a, 3); } -// CHECK-LABEL: @test_vdup_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> -// CHECK: ret <2 x float> [[LANE]] +// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> +// CHECK-NEXT: ret <2 x float> [[LANE]] +// float32x2_t test_vdup_lane_f32(float32x2_t a) { return vdup_lane_f32(a, 1); } -// CHECK-LABEL: @test_vdupq_lane_u8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// uint8x16_t test_vdupq_lane_u8(uint8x8_t a) { return vdupq_lane_u8(a, 7); } -// CHECK-LABEL: @test_vdupq_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } -// CHECK-LABEL: @test_vdupq_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// uint32x4_t test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); } -// CHECK-LABEL: @test_vdupq_lane_s8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = 
shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// int8x16_t test_vdupq_lane_s8(int8x8_t a) { return vdupq_lane_s8(a, 7); } -// CHECK-LABEL: @test_vdupq_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } -// CHECK-LABEL: @test_vdupq_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); } -// CHECK-LABEL: @test_vdupq_lane_p8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// poly8x16_t test_vdupq_lane_p8(poly8x8_t a) { return vdupq_lane_p8(a, 7); } -// CHECK-LABEL: @test_vdupq_lane_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } -// CHECK-LABEL: @test_vdupq_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> -// CHECK: ret <4 x float> [[LANE]] +// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] 
to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[LANE]] +// float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } -// CHECK-LABEL: @test_vdup_lane_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } -// CHECK-LABEL: @test_vdup_lane_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } -// CHECK-LABEL: @test_vdupq_lane_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } -// CHECK-LABEL: @test_vdupq_lane_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: 
ret <2 x i64> [[LANE]] +// uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } -// CHECK-LABEL: @test_vdup_n_u8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8( +// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// uint8x8_t test_vdup_n_u8(uint8_t a) { return vdup_n_u8(a); } -// CHECK-LABEL: @test_vdup_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16( +// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]] +// uint16x4_t test_vdup_n_u16(uint16_t a) { return vdup_n_u16(a); } -// CHECK-LABEL: @test_vdup_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 -// CHECK: ret <2 x i32> [[VECINIT1_I]] +// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]] +// uint32x2_t test_vdup_n_u32(uint32_t a) { return vdup_n_u32(a); } -// CHECK-LABEL: @test_vdup_n_s8( -// CHECK: 
[[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8( +// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// int8x8_t test_vdup_n_s8(int8_t a) { return vdup_n_s8(a); } -// CHECK-LABEL: @test_vdup_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16( +// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]] +// int16x4_t test_vdup_n_s16(int16_t a) { return vdup_n_s16(a); } -// CHECK-LABEL: @test_vdup_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 -// CHECK: ret <2 x i32> [[VECINIT1_I]] +// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]] +// int32x2_t test_vdup_n_s32(int32_t a) { return vdup_n_s32(a); } -// CHECK-LABEL: @test_vdup_n_p8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: 
[[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8( +// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// poly8x8_t test_vdup_n_p8(poly8_t a) { return vdup_n_p8(a); } -// CHECK-LABEL: @test_vdup_n_p16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16( +// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]] +// poly16x4_t test_vdup_n_p16(poly16_t a) { return vdup_n_p16(a); } -// CHECK-LABEL: @test_vdup_n_f16( -// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2 -// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0 -// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 -// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 -// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3 -// CHECK: ret <4 x half> [[VECINIT3]] +// CHECK-LABEL: define <4 x half> @test_vdup_n_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0 +// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> 
[[VECINIT2]], half [[TMP0]], i32 3 +// CHECK-NEXT: ret <4 x half> [[VECINIT3]] +// float16x4_t test_vdup_n_f16(float16_t *a) { return vdup_n_f16(*a); } -// CHECK-LABEL: @test_vdup_n_f32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1 -// CHECK: ret <2 x float> [[VECINIT1_I]] +// CHECK-LABEL: define <2 x float> @test_vdup_n_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[A]], i32 1 +// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]] +// float32x2_t test_vdup_n_f32(float32_t a) { return vdup_n_f32(a); } -// CHECK-LABEL: @test_vdupq_n_u8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 -// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 -// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 -// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 -// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 -// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 -// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 -// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 -// CHECK: ret <16 x i8> [[VECINIT15_I]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8( +// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8 +// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9 +// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10 +// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11 +// CHECK-NEXT: 
[[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12 +// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13 +// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14 +// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]] +// uint8x16_t test_vdupq_n_u8(uint8_t a) { return vdupq_n_u8(a); } -// CHECK-LABEL: @test_vdupq_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 -// CHECK: ret <8 x i16> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16( +// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]] +// uint16x8_t test_vdupq_n_u16(uint16_t a) { return vdupq_n_u16(a); } -// CHECK-LABEL: @test_vdupq_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 -// CHECK: ret <4 x i32> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]] +// uint32x4_t test_vdupq_n_u32(uint32_t a) { return vdupq_n_u32(a); } -// CHECK-LABEL: @test_vdupq_n_s8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: 
[[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 -// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 -// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 -// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 -// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 -// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 -// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 -// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 -// CHECK: ret <16 x i8> [[VECINIT15_I]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8( +// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8 +// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9 +// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10 +// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11 +// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12 +// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13 +// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14 +// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]] +// int8x16_t test_vdupq_n_s8(int8_t a) { return vdupq_n_s8(a); } -// CHECK-LABEL: @test_vdupq_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 -// CHECK: 
[[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 -// CHECK: ret <8 x i16> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16( +// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]] +// int16x8_t test_vdupq_n_s16(int16_t a) { return vdupq_n_s16(a); } -// CHECK-LABEL: @test_vdupq_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3 -// CHECK: ret <4 x i32> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]] +// int32x4_t test_vdupq_n_s32(int32_t a) { return vdupq_n_s32(a); } -// CHECK-LABEL: @test_vdupq_n_p8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8 -// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9 -// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10 -// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11 -// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12 -// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13 -// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14 -// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15 -// CHECK: 
ret <16 x i8> [[VECINIT15_I]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8( +// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8 +// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9 +// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10 +// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11 +// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12 +// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13 +// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14 +// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]] +// poly8x16_t test_vdupq_n_p8(poly8_t a) { return vdupq_n_p8(a); } -// CHECK-LABEL: @test_vdupq_n_p16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7 -// CHECK: ret <8 x i16> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16( +// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]] +// poly16x8_t test_vdupq_n_p16(poly16_t a) { return vdupq_n_p16(a); } -// CHECK-LABEL: 
@test_vdupq_n_f16( -// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2 -// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0 -// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 -// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 -// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 -// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 -// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 -// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 -// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 -// CHECK: ret <8 x half> [[VECINIT7]] +// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0 +// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 +// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 +// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 +// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 +// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 +// CHECK-NEXT: ret <8 x half> [[VECINIT7]] +// float16x8_t test_vdupq_n_f16(float16_t *a) { return vdupq_n_f16(*a); } -// CHECK-LABEL: @test_vdupq_n_f32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3 -// CHECK: ret <4 x float> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[A]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[A]], i32 3 +// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]] +// float32x4_t test_vdupq_n_f32(float32_t a) { return vdupq_n_f32(a); } -// CHECK-LABEL: @test_vdup_n_s64( -// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0 -// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] -// CHECK: ret <1 x i64> [[ADD_I]] +// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK-NEXT: ret <1 x i64> [[ADD_I]] +// int64x1_t test_vdup_n_s64(int64_t a) { 
int64x1_t tmp = vdup_n_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: @test_vdup_n_u64( -// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0 -// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] -// CHECK: ret <1 x i64> [[ADD_I]] +// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]] +// CHECK-NEXT: ret <1 x i64> [[ADD_I]] +// int64x1_t test_vdup_n_u64(uint64_t a) { int64x1_t tmp = (int64x1_t)vdup_n_u64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: @test_vdupq_n_s64( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 -// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1 +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vdupq_n_s64(int64_t a) { int64x2_t tmp = vdupq_n_s64(a); return vaddq_s64(tmp, tmp); } -// CHECK-LABEL: @test_vdupq_n_u64( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1 -// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1 +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vdupq_n_u64(uint64_t a) { uint64x2_t tmp = vdupq_n_u64(a); return vaddq_u64(tmp, tmp); } -// CHECK-LABEL: @test_veor_s8( -// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b -// CHECK: ret <8 x i8> [[XOR_I]] +// CHECK-LABEL: define <8 x i8> @test_veor_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i8> [[XOR_I]] +// int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) { return veor_s8(a, b); } -// CHECK-LABEL: @test_veor_s16( -// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b -// CHECK: ret <4 x i16> [[XOR_I]] +// CHECK-LABEL: define <4 x i16> @test_veor_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i16> [[XOR_I]] +// int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) { return veor_s16(a, b); } -// CHECK-LABEL: @test_veor_s32( -// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b -// CHECK: ret <2 x i32> [[XOR_I]] +// CHECK-LABEL: define <2 x i32> @test_veor_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef 
[[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i32> [[XOR_I]] +// int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) { return veor_s32(a, b); } -// CHECK-LABEL: @test_veor_s64( -// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b -// CHECK: ret <1 x i64> [[XOR_I]] +// CHECK-LABEL: define <1 x i64> @test_veor_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <1 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <1 x i64> [[XOR_I]] +// int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) { return veor_s64(a, b); } -// CHECK-LABEL: @test_veor_u8( -// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b -// CHECK: ret <8 x i8> [[XOR_I]] +// CHECK-LABEL: define <8 x i8> @test_veor_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i8> [[XOR_I]] +// uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) { return veor_u8(a, b); } -// CHECK-LABEL: @test_veor_u16( -// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b -// CHECK: ret <4 x i16> [[XOR_I]] +// CHECK-LABEL: define <4 x i16> @test_veor_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i16> [[XOR_I]] +// uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) { return veor_u16(a, b); } -// CHECK-LABEL: @test_veor_u32( -// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b -// CHECK: ret <2 x i32> [[XOR_I]] +// CHECK-LABEL: define <2 x i32> @test_veor_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i32> [[XOR_I]] +// uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) { return veor_u32(a, b); } -// CHECK-LABEL: @test_veor_u64( -// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b -// CHECK: ret <1 x i64> [[XOR_I]] +// CHECK-LABEL: define <1 x i64> @test_veor_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <1 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <1 x i64> [[XOR_I]] +// uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) { return veor_u64(a, b); } -// CHECK-LABEL: @test_veorq_s8( -// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b -// CHECK: ret <16 x i8> [[XOR_I]] +// CHECK-LABEL: define <16 x i8> @test_veorq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <16 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <16 x i8> [[XOR_I]] +// int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) { return veorq_s8(a, b); } -// CHECK-LABEL: @test_veorq_s16( -// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b -// CHECK: ret <8 x i16> [[XOR_I]] +// CHECK-LABEL: define <8 x i16> @test_veorq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i16> [[XOR_I]] +// int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) { return veorq_s16(a, b); } -// CHECK-LABEL: @test_veorq_s32( -// CHECK: [[XOR_I:%.*]] = xor <4 x i32> 
%a, %b -// CHECK: ret <4 x i32> [[XOR_I]] +// CHECK-LABEL: define <4 x i32> @test_veorq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i32> [[XOR_I]] +// int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) { return veorq_s32(a, b); } -// CHECK-LABEL: @test_veorq_s64( -// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b -// CHECK: ret <2 x i64> [[XOR_I]] +// CHECK-LABEL: define <2 x i64> @test_veorq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i64> [[XOR_I]] +// int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) { return veorq_s64(a, b); } -// CHECK-LABEL: @test_veorq_u8( -// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b -// CHECK: ret <16 x i8> [[XOR_I]] +// CHECK-LABEL: define <16 x i8> @test_veorq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <16 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <16 x i8> [[XOR_I]] +// uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) { return veorq_u8(a, b); } -// CHECK-LABEL: @test_veorq_u16( -// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b -// CHECK: ret <8 x i16> [[XOR_I]] +// CHECK-LABEL: define <8 x i16> @test_veorq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i16> [[XOR_I]] +// uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) { return veorq_u16(a, b); } -// CHECK-LABEL: @test_veorq_u32( -// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b -// CHECK: ret <4 x i32> [[XOR_I]] +// CHECK-LABEL: define <4 x i32> @test_veorq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i32> [[XOR_I]] +// uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) { return veorq_u32(a, b); } -// CHECK-LABEL: @test_veorq_u64( -// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b -// CHECK: ret <2 x i64> [[XOR_I]] +// CHECK-LABEL: define <2 x i64> @test_veorq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i64> [[XOR_I]] +// uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) { return veorq_u64(a, b); } -// CHECK-LABEL: @test_vext_s8( -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[VEXT]] +// CHECK-LABEL: define <8 x i8> @test_vext_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[VEXT]] +// int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) { return vext_s8(a, b, 7); } -// CHECK-LABEL: @test_vext_u8( -// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[VEXT]] +// CHECK-LABEL: define <8 x i8> @test_vext_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
   return vext_u8(a, b, 7);
 }
-// CHECK-LABEL: @test_vext_p8(
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
   return vext_p8(a, b, 7);
 }
-// CHECK-LABEL: @test_vext_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
   return vext_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vext_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
   return vext_u16(a, b, 3);
 }
-// CHECK-LABEL: @test_vext_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
   return vext_p16(a, b, 3);
 }
-// CHECK-LABEL: @test_vext_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
   return vext_s32(a, b, 1);
 }
-// CHECK-LABEL: @test_vext_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
   return vext_u32(a, b, 1);
 }
-// CHECK-LABEL: @test_vext_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[VEXT]]
+//
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
   return vext_s64(a, b, 0);
 }
-// CHECK-LABEL: @test_vext_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[VEXT]]
+//
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
   return vext_u64(a, b, 0);
 }
-// CHECK-LABEL: @test_vext_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x float> [[VEXT]]
+// CHECK-LABEL: define <2 x float> @test_vext_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[VEXT]]
+//
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
   return vext_f32(a, b, 1);
 }
-// CHECK-LABEL: @test_vextq_s8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
   return vextq_s8(a, b, 15);
 }
-// CHECK-LABEL: @test_vextq_u8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
   return vextq_u8(a, b, 15);
 }
-// CHECK-LABEL: @test_vextq_p8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
   return vextq_p8(a, b, 15);
 }
-// CHECK-LABEL: @test_vextq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
   return vextq_s16(a, b, 7);
 }
-// CHECK-LABEL: @test_vextq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
   return vextq_u16(a, b, 7);
 }
-// CHECK-LABEL: @test_vextq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
   return vextq_p16(a, b, 7);
 }
-// CHECK-LABEL: @test_vextq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
   return vextq_s32(a, b, 3);
 }
-// CHECK-LABEL: @test_vextq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
   return vextq_u32(a, b, 3);
 }
-// CHECK-LABEL: @test_vextq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
   return vextq_s64(a, b, 1);
 }
-// CHECK-LABEL: @test_vextq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
   return vextq_u64(a, b, 1);
 }
-// CHECK-LABEL: @test_vextq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x float> [[VEXT]]
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32>
+// CHECK-NEXT: ret <4 x float> [[VEXT]]
+//
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
   return vextq_f32(a, b, 3);
 }
-// CHECK-LABEL: @test_vfma_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %c, <2 x float> %a)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vfma_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]])
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
+//
 float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vfma_f32(a, b, c);
 }
-// CHECK-LABEL: @test_vfmaq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]])
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
+//
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vfmaq_f32(a, b, c);
 }
-// CHECK-LABEL: @test_vfms_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %b
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %c, <2 x float> %a)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vfms_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], <2 x float> [[TMP6]])
+// CHECK-NEXT: ret <2 x float> [[TMP9]]
+//
 float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vfms_f32(a, b, c);
 }
-// CHECK-LABEL: @test_vfmsq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %b
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %c, <4 x float> %a)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x float> [[TMP6]])
+// CHECK-NEXT: ret <4 x float> [[TMP9]]
+//
 float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vfmsq_f32(a, b, c);
 }
-// CHECK-LABEL: @test_vget_high_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vget_high_s8(int8x16_t a) {
   return vget_high_s8(a);
 }
-// CHECK-LABEL: @test_vget_high_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vget_high_s16(int16x8_t a) {
   return vget_high_s16(a);
 }
-// CHECK-LABEL: @test_vget_high_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vget_high_s32(int32x4_t a) {
   return vget_high_s32(a);
 }
-// CHECK-LABEL: @test_vget_high_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32>
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32>
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 int64x1_t test_vget_high_s64(int64x2_t a) {
   return vget_high_s64(a);
 }
-// CHECK-LABEL: @test_vget_high_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x half> @test_vget_high_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vget_high_f16(float16x8_t a) {
   return vget_high_f16(a);
 }
-// CHECK-LABEL: @test_vget_high_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vget_high_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vget_high_f32(float32x4_t a) {
   return vget_high_f32(a);
 }
-// CHECK-LABEL: @test_vget_high_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vget_high_u8(uint8x16_t a) {
   return vget_high_u8(a);
 }
-// CHECK-LABEL: @test_vget_high_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vget_high_u16(uint16x8_t a) {
   return vget_high_u16(a);
 }
-// CHECK-LABEL: @test_vget_high_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vget_high_u32(uint32x4_t a) {
   return vget_high_u32(a);
 }
-// CHECK-LABEL: @test_vget_high_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32>
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32>
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 uint64x1_t test_vget_high_u64(uint64x2_t a) {
   return vget_high_u64(a);
 }
-// CHECK-LABEL: @test_vget_high_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vget_high_p8(poly8x16_t a) {
   return vget_high_p8(a);
 }
-// CHECK-LABEL: @test_vget_high_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vget_high_p16(poly16x8_t a) {
   return vget_high_p16(a);
 }
-// CHECK-LABEL: @test_vget_lane_u8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 uint8_t test_vget_lane_u8(uint8x8_t a) {
   return vget_lane_u8(a, 7);
 }
-// CHECK-LABEL: @test_vget_lane_u16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 uint16_t test_vget_lane_u16(uint16x4_t a) {
   return vget_lane_u16(a, 3);
 }
-// CHECK-LABEL: @test_vget_lane_u32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vget_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i32 1
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 uint32_t test_vget_lane_u32(uint32x2_t a) {
   return vget_lane_u32(a, 1);
 }
-// CHECK-LABEL: @test_vget_lane_s8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vget_lane_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 int8_t test_vget_lane_s8(int8x8_t a) {
   return vget_lane_s8(a, 7);
 }
-// CHECK-LABEL: @test_vget_lane_s16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vget_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 int16_t test_vget_lane_s16(int16x4_t a) {
   return vget_lane_s16(a, 3);
 }
-// CHECK-LABEL: @test_vget_lane_s32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vget_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i32 1
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 int32_t test_vget_lane_s32(int32x2_t a) {
   return vget_lane_s32(a, 1);
 }
-// CHECK-LABEL: @test_vget_lane_p8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vget_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 poly8_t test_vget_lane_p8(poly8x8_t a) {
   return vget_lane_p8(a, 7);
 }
-// CHECK-LABEL: @test_vget_lane_p16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vget_lane_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i32 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 poly16_t test_vget_lane_p16(poly16x4_t a) {
   return vget_lane_p16(a, 3);
 }
-// CHECK-LABEL: @test_vget_lane_f32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %a, i32 1
-// CHECK: ret float [[VGET_LANE]]
+// CHECK-LABEL: define float @test_vget_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[A]], i32 1
+// CHECK-NEXT: ret float [[VGET_LANE]]
+//
 float32_t test_vget_lane_f32(float32x2_t a) {
   return vget_lane_f32(a, 1);
 }
-// CHECK-LABEL: @test_vget_lane_f16(
-// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8
-// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2
-// CHECK: store <4 x half> %a, ptr [[__REINT_242]], align 8
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_242]], align 8
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
-// CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_242]], align 2
-// CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_242]], align 2
-// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
-// CHECK: ret float [[CONV]]
+// CHECK-LABEL: define float @test_vget_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP1]] to float
+// CHECK-NEXT: ret float [[CONV]]
+//
 float32_t test_vget_lane_f16(float16x4_t a) {
   return vget_lane_f16(a, 1);
 }
-// CHECK-LABEL: @test_vgetq_lane_u8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 uint8_t test_vgetq_lane_u8(uint8x16_t a) {
   return vgetq_lane_u8(a, 15);
 }
-// CHECK-LABEL: @test_vgetq_lane_u16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 uint16_t test_vgetq_lane_u16(uint16x8_t a) {
   return vgetq_lane_u16(a, 7);
 }
-// CHECK-LABEL: @test_vgetq_lane_u32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vgetq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i32 3
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 uint32_t test_vgetq_lane_u32(uint32x4_t a) {
   return vgetq_lane_u32(a, 3);
 }
-// CHECK-LABEL: @test_vgetq_lane_s8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 int8_t test_vgetq_lane_s8(int8x16_t a) {
   return vgetq_lane_s8(a, 15);
 }
-// CHECK-LABEL: @test_vgetq_lane_s16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 int16_t test_vgetq_lane_s16(int16x8_t a) {
   return vgetq_lane_s16(a, 7);
 }
-// CHECK-LABEL: @test_vgetq_lane_s32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vgetq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i32 3
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 int32_t test_vgetq_lane_s32(int32x4_t a) {
   return vgetq_lane_s32(a, 3);
 }
-// CHECK-LABEL: @test_vgetq_lane_p8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 poly8_t test_vgetq_lane_p8(poly8x16_t a) {
   return vgetq_lane_p8(a, 15);
 }
-// CHECK-LABEL: @test_vgetq_lane_p16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i32 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 poly16_t test_vgetq_lane_p16(poly16x8_t a) {
   return vgetq_lane_p16(a, 7);
 }
-// CHECK-LABEL: @test_vgetq_lane_f32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x float> %a, i32 3
-// CHECK: ret float [[VGET_LANE]]
+// CHECK-LABEL: define float @test_vgetq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x float> [[A]], i32 3
+// CHECK-NEXT: ret float [[VGET_LANE]]
+//
 float32_t test_vgetq_lane_f32(float32x4_t a) {
   return vgetq_lane_f32(a, 3);
 }
-// CHECK-LABEL: @test_vgetq_lane_f16(
-// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16
-// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2
-// CHECK: store <8 x half> %a, ptr [[__REINT_244]], align 16
-// CHECK: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_244]], align 16
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
-// CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_244]], align 2
-// CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_244]], align 2
-// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
-// CHECK: ret float [[CONV]]
+// CHECK-LABEL: define float @test_vgetq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[VGET_LANE]] to half
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP1]] to float
+// CHECK-NEXT: ret float [[CONV]]
+//
 float32_t test_vgetq_lane_f16(float16x8_t a) {
   return vgetq_lane_f16(a, 3);
 }
-// CHECK-LABEL: @test_vget_lane_s64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vget_lane_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i32 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 int64_t test_vget_lane_s64(int64x1_t a) {
   return vget_lane_s64(a, 0);
 }
-// CHECK-LABEL: @test_vget_lane_u64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vget_lane_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i32 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 uint64_t test_vget_lane_u64(uint64x1_t a) {
   return vget_lane_u64(a, 0);
 }
-// CHECK-LABEL: @test_vgetq_lane_s64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vgetq_lane_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i32 1
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 int64_t test_vgetq_lane_s64(int64x2_t a) {
   return vgetq_lane_s64(a, 1);
 }
-// CHECK-LABEL: @test_vgetq_lane_u64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vgetq_lane_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i32 1
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 uint64_t test_vgetq_lane_u64(uint64x2_t a) {
   return vgetq_lane_u64(a, 1);
 }
-// CHECK-LABEL: @test_vget_low_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vget_low_s8(int8x16_t a) {
   return vget_low_s8(a);
 }
-// CHECK-LABEL: @test_vget_low_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vget_low_s16(int16x8_t a) {
   return vget_low_s16(a);
 }
-// CHECK-LABEL: @test_vget_low_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vget_low_s32(int32x4_t a) {
   return vget_low_s32(a);
 }
-// CHECK-LABEL: @test_vget_low_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 int64x1_t test_vget_low_s64(int64x2_t a) {
   return vget_low_s64(a);
 }
-// CHECK-LABEL: @test_vget_low_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x half> @test_vget_low_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vget_low_f16(float16x8_t a) {
   return vget_low_f16(a);
 }
-// CHECK-LABEL: @test_vget_low_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vget_low_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vget_low_f32(float32x4_t a) {
   return vget_low_f32(a);
 }
-// CHECK-LABEL: @test_vget_low_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vget_low_u8(uint8x16_t a) {
   return vget_low_u8(a);
 }
-// CHECK-LABEL: @test_vget_low_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vget_low_u16(uint16x8_t a) {
   return vget_low_u16(a);
 }
-// CHECK-LABEL: @test_vget_low_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vget_low_u32(uint32x4_t a) {
   return vget_low_u32(a);
 }
-// CHECK-LABEL: @test_vget_low_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 uint64x1_t test_vget_low_u64(uint64x2_t a) {
   return vget_low_u64(a);
 }
-// CHECK-LABEL: @test_vget_low_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vget_low_p8(poly8x16_t a) {
   return vget_low_p8(a);
 }
-// CHECK-LABEL: @test_vget_low_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vget_low_p16(poly16x8_t a) {
   return vget_low_p16(a);
 }
-// CHECK-LABEL: @test_vhadd_s8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
 int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
   return vhadd_s8(a, b);
 }
-// CHECK-LABEL: @test_vhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
   return vhadd_s16(a, b);
 }
-// CHECK-LABEL: @test_vhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
   return vhadd_s32(a, b);
 }
-// CHECK-LABEL: @test_vhadd_u8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
 uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vhadd_u8(a, b);
 }
-// CHECK-LABEL: @test_vhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vhadd_u16(a, b);
 }
-// CHECK-LABEL: @test_vhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]])
+// CHECK-NEXT: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vhadd_u32(a, b);
 }
-// CHECK-LABEL: @test_vhaddq_s8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
 int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
   return vhaddq_s8(a, b);
 }
-// CHECK-LABEL: @test_vhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
   return vhaddq_s16(a, b);
 }
-// CHECK-LABEL: @test_vhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
   return vhaddq_s32(a, b);
 }
-// CHECK-LABEL: @test_vhaddq_u8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
 uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vhaddq_u8(a, b);
 }
-// CHECK-LABEL: @test_vhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vhaddq_u16(a, b);
 }
-// CHECK-LABEL: @test_vhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]])
+// CHECK-NEXT: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vhaddq_u32(a, b);
 }
-// CHECK-LABEL: @test_vhsub_s8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
 int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
   return vhsub_s8(a, b);
 }
-// CHECK-LABEL: @test_vhsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]])
+// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
   return vhsub_s16(a, b);
 }
-// CHECK-LABEL: @test_vhsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]])
+// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
   return vhsub_s32(a, b);
 }
-// CHECK-LABEL: @test_vhsub_u8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
 uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
   return vhsub_u8(a, b);
 }
-// CHECK-LABEL: @test_vhsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x
i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) +// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) { return vhsub_u16(a, b); } -// CHECK-LABEL: @test_vhsub_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHSUB_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vhsub_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) +// CHECK-NEXT: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) { return vhsub_u32(a, b); } -// CHECK-LABEL: @test_vhsubq_s8( -// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]] +// int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) { return vhsubq_s8(a, b); } -// CHECK-LABEL: @test_vhsubq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) { return vhsubq_s16(a, b); } -// CHECK-LABEL: @test_vhsubq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) { return vhsubq_s32(a, b); } -// CHECK-LABEL: @test_vhsubq_u8( -// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]] +// uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) { return vhsubq_u8(a, b); } -// CHECK-LABEL: @test_vhsubq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) { return vhsubq_u16(a, b); } -// CHECK-LABEL: @test_vhsubq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> 
%a, <4 x i32> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) +// CHECK-NEXT: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) { return vhsubq_u32(a, b); } -// CHECK-LABEL: @test_vld1q_u8( -// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1) -// CHECK: ret <16 x i8> [[VLD1]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLD1]] +// uint8x16_t test_vld1q_u8(uint8_t const * a) { return vld1q_u8(a); } -// CHECK-LABEL: @test_vld1q_u16( -// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2) -// CHECK: ret <8 x i16> [[VLD1]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <8 x i16> [[VLD1]] +// uint16x8_t test_vld1q_u16(uint16_t const * a) { return vld1q_u16(a); } -// CHECK-LABEL: @test_vld1q_u32( -// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4) -// CHECK: ret <4 x i32> [[VLD1]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <4 x i32> [[VLD1]] +// uint32x4_t test_vld1q_u32(uint32_t const * a) { return vld1q_u32(a); } -// CHECK-LABEL: @test_vld1q_u64( -// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4) -// CHECK: ret <2 x i64> [[VLD1]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <2 x i64> [[VLD1]] +// uint64x2_t test_vld1q_u64(uint64_t const * a) { return vld1q_u64(a); } -// CHECK-LABEL: @test_vld1q_s8( -// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1) -// CHECK: ret <16 x i8> [[VLD1]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLD1]] +// int8x16_t test_vld1q_s8(int8_t const * a) { return vld1q_s8(a); } -// CHECK-LABEL: @test_vld1q_s16( -// CHECK: [[VLD1:%.*]] = call <8 x i16> 
@llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2) -// CHECK: ret <8 x i16> [[VLD1]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <8 x i16> [[VLD1]] +// int16x8_t test_vld1q_s16(int16_t const * a) { return vld1q_s16(a); } -// CHECK-LABEL: @test_vld1q_s32( -// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4) -// CHECK: ret <4 x i32> [[VLD1]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <4 x i32> [[VLD1]] +// int32x4_t test_vld1q_s32(int32_t const * a) { return vld1q_s32(a); } -// CHECK-LABEL: @test_vld1q_s64( -// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4) -// CHECK: ret <2 x i64> [[VLD1]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <2 x i64> [[VLD1]] +// int64x2_t test_vld1q_s64(int64_t const * a) { return vld1q_s64(a); } -// CHECK-LABEL: @test_vld1q_f16( -// CHECK: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0(ptr %a, i32 2) -// CHECK: ret <8 x half> [[VLD1]] +// CHECK-LABEL: define <8 x half> @test_vld1q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <8 x half> [[VLD1]] +// float16x8_t test_vld1q_f16(float16_t const * a) { return vld1q_f16(a); } -// CHECK-LABEL: @test_vld1q_f32( -// CHECK: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr %a, i32 4) -// CHECK: ret <4 x float> [[VLD1]] +// CHECK-LABEL: define <4 x float> @test_vld1q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <4 x float> [[VLD1]] +// float32x4_t test_vld1q_f32(float32_t const * a) { return vld1q_f32(a); } -// CHECK-LABEL: @test_vld1q_p8( -// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1) -// CHECK: ret <16 x i8> [[VLD1]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLD1]] +// poly8x16_t test_vld1q_p8(poly8_t const * a) { return vld1q_p8(a); } -// CHECK-LABEL: @test_vld1q_p16( -// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2) -// CHECK: ret <8 x i16> [[VLD1]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <8 x i16> [[VLD1]] +// poly16x8_t test_vld1q_p16(poly16_t const * a) { return vld1q_p16(a); } -// CHECK-LABEL: @test_vld1_u8( -// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1) -// CHECK: ret <8 x i8> [[VLD1]] +// CHECK-LABEL: 
define <8 x i8> @test_vld1_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: ret <8 x i8> [[VLD1]] +// uint8x8_t test_vld1_u8(uint8_t const * a) { return vld1_u8(a); } -// CHECK-LABEL: @test_vld1_u16( -// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2) -// CHECK: ret <4 x i16> [[VLD1]] +// CHECK-LABEL: define <4 x i16> @test_vld1_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <4 x i16> [[VLD1]] +// uint16x4_t test_vld1_u16(uint16_t const * a) { return vld1_u16(a); } -// CHECK-LABEL: @test_vld1_u32( -// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4) -// CHECK: ret <2 x i32> [[VLD1]] +// CHECK-LABEL: define <2 x i32> @test_vld1_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <2 x i32> [[VLD1]] +// uint32x2_t test_vld1_u32(uint32_t const * a) { return vld1_u32(a); } -// CHECK-LABEL: @test_vld1_u64( -// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4) -// CHECK: ret <1 x i64> [[VLD1]] +// CHECK-LABEL: define <1 x i64> @test_vld1_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <1 x i64> [[VLD1]] +// uint64x1_t test_vld1_u64(uint64_t const * a) { return vld1_u64(a); } -// CHECK-LABEL: @test_vld1_s8( -// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1) -// CHECK: ret <8 x i8> [[VLD1]] +// CHECK-LABEL: define <8 x i8> @test_vld1_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: ret <8 x i8> [[VLD1]] +// int8x8_t test_vld1_s8(int8_t const * a) { return vld1_s8(a); } -// CHECK-LABEL: @test_vld1_s16( -// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2) -// CHECK: ret <4 x i16> [[VLD1]] +// CHECK-LABEL: define <4 x i16> @test_vld1_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <4 x i16> [[VLD1]] +// int16x4_t test_vld1_s16(int16_t const * a) { return vld1_s16(a); } -// CHECK-LABEL: @test_vld1_s32( -// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4) -// CHECK: ret <2 x i32> [[VLD1]] +// CHECK-LABEL: define <2 x i32> @test_vld1_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <2 x i32> [[VLD1]] +// int32x2_t test_vld1_s32(int32_t const * a) { return vld1_s32(a); } -// CHECK-LABEL: @test_vld1_s64( -// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4) -// CHECK: ret <1 x i64> [[VLD1]] +// CHECK-LABEL: define <1 x i64> @test_vld1_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <1 x 
i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <1 x i64> [[VLD1]] +// int64x1_t test_vld1_s64(int64_t const * a) { return vld1_s64(a); } -// CHECK-LABEL: @test_vld1_f16( -// CHECK: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0(ptr %a, i32 2) -// CHECK: ret <4 x half> [[VLD1]] +// CHECK-LABEL: define <4 x half> @test_vld1_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <4 x half> [[VLD1]] +// float16x4_t test_vld1_f16(float16_t const * a) { return vld1_f16(a); } -// CHECK-LABEL: @test_vld1_f32( -// CHECK: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0(ptr %a, i32 4) -// CHECK: ret <2 x float> [[VLD1]] +// CHECK-LABEL: define <2 x float> @test_vld1_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: ret <2 x float> [[VLD1]] +// float32x2_t test_vld1_f32(float32_t const * a) { return vld1_f32(a); } -// CHECK-LABEL: @test_vld1_p8( -// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1) -// CHECK: ret <8 x i8> [[VLD1]] +// CHECK-LABEL: define <8 x i8> @test_vld1_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: ret <8 x i8> [[VLD1]] +// poly8x8_t test_vld1_p8(poly8_t const * a) { return vld1_p8(a); } -// CHECK-LABEL: @test_vld1_p16( -// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2) -// CHECK: ret <4 x i16> [[VLD1]] +// CHECK-LABEL: define <4 x i16> @test_vld1_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: ret <4 x i16> [[VLD1]] +// poly16x4_t test_vld1_p16(poly16_t const * a) { return vld1_p16(a); } -// CHECK-LABEL: @test_vld1q_dup_u8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// uint8x16_t test_vld1q_dup_u8(uint8_t const * a) { return vld1q_dup_u8(a); } -// CHECK-LABEL: @test_vld1q_dup_u16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] 
= shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// uint16x8_t test_vld1q_dup_u16(uint16_t const * a) { return vld1q_dup_u16(a); } -// CHECK-LABEL: @test_vld1q_dup_u32( -// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// uint32x4_t test_vld1q_dup_u32(uint32_t const * a) { return vld1q_dup_u32(a); } -// CHECK-LABEL: @test_vld1q_dup_u64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// uint64x2_t test_vld1q_dup_u64(uint64_t const * a) { return vld1q_dup_u64(a); } -// CHECK-LABEL: @test_vld1q_dup_s8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// int8x16_t test_vld1q_dup_s8(int8_t const * a) { return vld1q_dup_s8(a); } -// CHECK-LABEL: @test_vld1q_dup_s16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// int16x8_t test_vld1q_dup_s16(int16_t const * a) { return vld1q_dup_s16(a); } -// CHECK-LABEL: @test_vld1q_dup_s32( -// CHECK: [[TMP2:%.*]] = load i32, ptr 
%a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// int32x4_t test_vld1q_dup_s32(int32_t const * a) { return vld1q_dup_s32(a); } -// CHECK-LABEL: @test_vld1q_dup_s64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// int64x2_t test_vld1q_dup_s64(int64_t const * a) { return vld1q_dup_s64(a); } -// CHECK-LABEL: @test_vld1q_dup_f16( -// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x half> [[LANE]] +// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x half> [[LANE]] +// float16x8_t test_vld1q_dup_f16(float16_t const * a) { return vld1q_dup_f16(a); } -// CHECK-LABEL: @test_vld1q_dup_f32( -// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x float> [[LANE]] +// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x float> [[LANE]] +// float32x4_t test_vld1q_dup_f32(float32_t const * a) { return vld1q_dup_f32(a); } -// CHECK-LABEL: @test_vld1q_dup_p8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// 
CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// poly8x16_t test_vld1q_dup_p8(poly8_t const * a) { return vld1q_dup_p8(a); } -// CHECK-LABEL: @test_vld1q_dup_p16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// poly16x8_t test_vld1q_dup_p16(poly16_t const * a) { return vld1q_dup_p16(a); } -// CHECK-LABEL: @test_vld1_dup_u8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// uint8x8_t test_vld1_dup_u8(uint8_t const * a) { return vld1_dup_u8(a); } -// CHECK-LABEL: @test_vld1_dup_u16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// uint16x4_t test_vld1_dup_u16(uint16_t const * a) { return vld1_dup_u16(a); } -// CHECK-LABEL: @test_vld1_dup_u32( -// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] 
= shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// uint32x2_t test_vld1_dup_u32(uint32_t const * a) { return vld1_dup_u32(a); } -// CHECK-LABEL: @test_vld1_dup_u64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// uint64x1_t test_vld1_dup_u64(uint64_t const * a) { return vld1_dup_u64(a); } -// CHECK-LABEL: @test_vld1_dup_s8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// int8x8_t test_vld1_dup_s8(int8_t const * a) { return vld1_dup_s8(a); } -// CHECK-LABEL: @test_vld1_dup_s16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// int16x4_t test_vld1_dup_s16(int16_t const * a) { return vld1_dup_s16(a); } -// CHECK-LABEL: @test_vld1_dup_s32( -// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// int32x2_t test_vld1_dup_s32(int32_t const * a) { return vld1_dup_s32(a); } -// CHECK-LABEL: @test_vld1_dup_s64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: 
[[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK-NEXT: ret <1 x i64> [[LANE]] +// int64x1_t test_vld1_dup_s64(int64_t const * a) { return vld1_dup_s64(a); } -// CHECK-LABEL: @test_vld1_dup_f16( -// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x half> [[LANE]] +// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> [[LANE]] +// float16x4_t test_vld1_dup_f16(float16_t const * a) { return vld1_dup_f16(a); } -// CHECK-LABEL: @test_vld1_dup_f32( -// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x float> [[LANE]] +// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x float> [[LANE]] +// float32x2_t test_vld1_dup_f32(float32_t const * a) { return vld1_dup_f32(a); } -// CHECK-LABEL: @test_vld1_dup_p8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// poly8x8_t test_vld1_dup_p8(poly8_t const * a) { return vld1_dup_p8(a); } -// CHECK-LABEL: @test_vld1_dup_p16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16( +// 
CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i32 0
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[LANE]]
+//
 poly16x4_t test_vld1_dup_p16(poly16_t const * a) { return vld1_dup_p16(a); }
-// CHECK-LABEL: @test_vld1q_lane_u8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
 uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) { return vld1q_lane_u8(a, b, 15); }
-// CHECK-LABEL: @test_vld1q_lane_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
 uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) { return vld1q_lane_u16(a, b, 7); }
-// CHECK-LABEL: @test_vld1q_lane_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
 uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) { return vld1q_lane_u32(a, b, 3); }
-// CHECK-LABEL: @test_vld1q_lane_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
-// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]]
+//
 uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) { return vld1q_lane_u64(a, b, 1); }
-// CHECK-LABEL: @test_vld1q_lane_s8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
 int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) { return vld1q_lane_s8(a, b, 15); }
-// CHECK-LABEL: @test_vld1q_lane_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
 int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) { return vld1q_lane_s16(a, b, 7); }
-// CHECK-LABEL: @test_vld1q_lane_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
 int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) { return vld1q_lane_s32(a, b, 3); }
-// CHECK-LABEL: @test_vld1q_lane_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
-// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
-// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]]
+//
 int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) { return vld1q_lane_s64(a, b, 1); }
-// CHECK-LABEL: @test_vld1q_lane_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK: ret <8 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP3]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VLD1_LANE]]
+//
 float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) { return vld1q_lane_f16(a, b, 7); }
-// CHECK-LABEL: @test_vld1q_lane_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
-// CHECK: ret <4 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 3
+// CHECK-NEXT: ret <4 x float> [[VLD1_LANE]]
+//
 float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) { return vld1q_lane_f32(a, b, 3); }
-// CHECK-LABEL: @test_vld1q_lane_p8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8>
[[B]], i8 [[TMP0]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]] +// poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) { return vld1q_lane_p8(a, b, 15); } -// CHECK-LABEL: @test_vld1q_lane_p16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 -// CHECK: ret <8 x i16> [[VLD1_LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]] +// poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) { return vld1q_lane_p16(a, b, 7); } -// CHECK-LABEL: @test_vld1_lane_u8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 -// CHECK: ret <8 x i8> [[VLD1_LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]] +// uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) { return vld1_lane_u8(a, b, 7); } -// CHECK-LABEL: @test_vld1_lane_u16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 -// CHECK: ret <4 x i16> [[VLD1_LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]] +// uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) { return vld1_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vld1_lane_u32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 -// CHECK: ret <2 x i32> [[VLD1_LANE]] +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1 +// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]] +// 
uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) { return vld1_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vld1_lane_u64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 -// CHECK: ret <1 x i64> [[VLD1_LANE]] +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]] +// uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) { return vld1_lane_u64(a, b, 0); } -// CHECK-LABEL: @test_vld1_lane_s8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 -// CHECK: ret <8 x i8> [[VLD1_LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]] +// int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) { return vld1_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vld1_lane_s16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 -// CHECK: ret <4 x i16> [[VLD1_LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]] +// int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) { return vld1_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vld1_lane_s32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1 -// CHECK: ret <2 x i32> [[VLD1_LANE]] +// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i32 1 +// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]] +// int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) { return vld1_lane_s32(a, b, 1); } 
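The regenerated assertions in these hunks follow the output format of LLVM's update_cc_test_checks.py: a "CHECK-LABEL: define" line, a "CHECK-SAME:" line that pins the full function signature, and a strict "CHECK-NEXT:" chain for the body, replacing the looser hand-written "CHECK:" matches being deleted. A minimal sketch of the regeneration step, assuming an in-tree build under build/ and letting the test's own RUN lines supply the clang invocation (the exact path of this test file is not shown in this hunk, so the filename below is a placeholder):

  llvm/utils/update_cc_test_checks.py --llvm-bin build/bin path/to/this-neon-test.c

Re-running that one command after a codegen change rewrites every CHECK block mechanically, which is why the new lines also pin intermediate bitcasts that the old hand-written checks skipped.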
-// CHECK-LABEL: @test_vld1_lane_s64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0 -// CHECK: ret <1 x i64> [[VLD1_LANE]] +// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]] +// int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) { return vld1_lane_s64(a, b, 0); } -// CHECK-LABEL: @test_vld1_lane_f16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3 -// CHECK: ret <4 x half> [[VLD1_LANE]] +// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP3]], i32 3 +// CHECK-NEXT: ret <4 x half> [[VLD1_LANE]] +// float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) { return vld1_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vld1_lane_f32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1 -// CHECK: ret <2 x float> [[VLD1_LANE]] +// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP3]], i32 1 +// CHECK-NEXT: ret <2 x float> [[VLD1_LANE]] +// float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) { return vld1_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vld1_lane_p8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7 -// CHECK: ret <8 x i8> [[VLD1_LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]] 
+// poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) { return vld1_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vld1_lane_p16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3 -// CHECK: ret <4 x i16> [[VLD1_LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]] +// poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) { return vld1_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vld2q_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld2q_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint8x16x2_t test_vld2q_u8(uint8_t const * a) { return vld2q_u8(a); } -// CHECK-LABEL: @test_vld2q_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld2q_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint16x8x2_t test_vld2q_u16(uint16_t const * a) { return vld2q_u16(a); } -// CHECK-LABEL: @test_vld2q_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld2q_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 
[[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint32x4x2_t test_vld2q_u32(uint32_t const * a) { return vld2q_u32(a); } -// CHECK-LABEL: @test_vld2q_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld2q_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int8x16x2_t test_vld2q_s8(int8_t const * a) { return vld2q_s8(a); } -// CHECK-LABEL: @test_vld2q_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld2q_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int16x8x2_t test_vld2q_s16(int16_t const * a) { return vld2q_s16(a); } -// CHECK-LABEL: @test_vld2q_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld2q_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: 
[[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int32x4x2_t test_vld2q_s32(int32_t const * a) { return vld2q_s32(a); } -// CHECK-LABEL: @test_vld2q_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half> +// CHECK-LABEL: define void @test_vld2q_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2.v8f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <8 x half> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x half> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float16x8x2_t test_vld2q_f16(float16_t const * a) { return vld2q_f16(a); } -// CHECK-LABEL: @test_vld2q_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> +// CHECK-LABEL: define void @test_vld2q_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <4 x float> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x float> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float32x4x2_t test_vld2q_f32(float32_t const * a) { return vld2q_f32(a); } -// CHECK-LABEL: @test_vld2q_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld2q_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } 
[[VLD2Q_V]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly8x16x2_t test_vld2q_p8(poly8_t const * a) { return vld2q_p8(a); } -// CHECK-LABEL: @test_vld2q_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld2q_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0 +// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly16x8x2_t test_vld2q_p16(poly16_t const * a) { return vld2q_p16(a); } -// CHECK-LABEL: @test_vld2_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint8x8x2_t test_vld2_u8(uint8_t const * a) { return vld2_u8(a); } -// CHECK-LABEL: @test_vld2_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> 
[[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x2_t test_vld2_u16(uint16_t const * a) { return vld2_u16(a); } -// CHECK-LABEL: @test_vld2_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld2_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x2_t test_vld2_u32(uint32_t const * a) { return vld2_u32(a); } -// CHECK-LABEL: @test_vld2_u64( -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld2_u64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint64x1x2_t test_vld2_u64(uint64_t const * a) { return vld2_u64(a); } -// CHECK-LABEL: @test_vld2_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x2_t test_vld2_s8(int8_t const * a) { return vld2_s8(a); } -// CHECK-LABEL: @test_vld2_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: 
[[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x2_t test_vld2_s16(int16_t const * a) { return vld2_s16(a); } -// CHECK-LABEL: @test_vld2_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld2_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int32x2x2_t test_vld2_s32(int32_t const * a) { return vld2_s32(a); } -// CHECK-LABEL: @test_vld2_s64( -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld2_s64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int64x1x2_t test_vld2_s64(int64_t const * a) { return vld2_s64(a); } -// CHECK-LABEL: @test_vld2_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half> +// CHECK-LABEL: define void @test_vld2_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2.v4f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_V]], 1 +// CHECK-NEXT: store <4 x half> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x2_t test_vld2_f16(float16_t const * a) { return vld2_f16(a); } -// CHECK-LABEL: @test_vld2_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld2_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_V]], 1 +// CHECK-NEXT: store <2 x float> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x2_t test_vld2_f32(float32_t const * a) { return vld2_f32(a); } -// CHECK-LABEL: @test_vld2_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x2_t test_vld2_p8(poly8_t const * a) { return vld2_p8(a); } -// CHECK-LABEL: @test_vld2_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0 +// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = 
extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x2_t test_vld2_p16(poly16_t const * a) { return vld2_p16(a); } -// CHECK-LABEL: @test_vld2q_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld2q_lane_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2) +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 
0 +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) { return vld2q_lane_u16(a, b, 7); } -// CHECK-LABEL: @test_vld2q_lane_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld2q_lane_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4) 
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) { return vld2q_lane_u32(a, b, 3); } -// CHECK-LABEL: @test_vld2q_lane_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld2q_lane_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x 
i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2) +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) { return vld2q_lane_s16(a, b, 7); } -// CHECK-LABEL: @test_vld2q_lane_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld2q_lane_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4) +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) { return vld2q_lane_s32(a, b, 3); } -// CHECK-LABEL: @test_vld2q_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half> +// CHECK-LABEL: define void @test_vld2q_lane_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> 
[[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 7, i32 2) +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], 1 +// CHECK-NEXT: store <8 x half> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x half> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) { return vld2q_lane_f16(a, b, 7); } -// CHECK-LABEL: @test_vld2q_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> -// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> +// CHECK-LABEL: define void @test_vld2q_lane_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 
[[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 3, i32 4) +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], 1 +// CHECK-NEXT: store <4 x float> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x float> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) { return vld2q_lane_f32(a, b, 3); } -// CHECK-LABEL: @test_vld2q_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld2q_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = 
extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2) +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) { return vld2q_lane_p16(a, b, 7); } -// CHECK-LABEL: @test_vld2_lane_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_lane_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = 
getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) { return vld2_lane_u8(a, b, 7); } -// CHECK-LABEL: @test_vld2_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_lane_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) { return vld2_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vld2_lane_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: 
[[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld2_lane_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) { return vld2_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vld2_lane_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x 
<8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_lane_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) { return vld2_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vld2_lane_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_lane_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) { return vld2_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vld2_lane_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld2_lane_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast 
<8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) { return vld2_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vld2_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half> +// CHECK-LABEL: define void @test_vld2_lane_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x half> 
[[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) { return vld2_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vld2_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld2_lane_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 1, i32 4) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <2 x float> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) { return vld2_lane_f32(a, 
b, 1); } -// CHECK-LABEL: @test_vld2_lane_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_lane_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) { return vld2_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vld2_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// 
CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) { return vld2_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vld3q_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld3q_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint8x16x3_t test_vld3q_u8(uint8_t const * a) { return vld3q_u8(a); } -// CHECK-LABEL: 
@test_vld3q_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint16x8x3_t test_vld3q_u16(uint16_t const * a) { return vld3q_u16(a); } -// CHECK-LABEL: @test_vld3q_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld3q_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint32x4x3_t test_vld3q_u32(uint32_t const * a) { return vld3q_u32(a); } -// CHECK-LABEL: @test_vld3q_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld3q_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 
x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int8x16x3_t test_vld3q_s8(int8_t const * a) { return vld3q_s8(a); } -// CHECK-LABEL: @test_vld3q_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int16x8x3_t test_vld3q_s16(int16_t const * a) { return vld3q_s16(a); } -// CHECK-LABEL: @test_vld3q_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld3q_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_1_EXTRACT]], ptr 
[[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int32x4x3_t test_vld3q_s32(int32_t const * a) { return vld3q_s32(a); } -// CHECK-LABEL: @test_vld3q_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> +// CHECK-LABEL: define void @test_vld3q_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3.v8f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float16x8x3_t test_vld3q_f16(float16_t const * a) { return vld3q_f16(a); } -// CHECK-LABEL: @test_vld3q_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> +// CHECK-LABEL: define void @test_vld3q_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float32x4x3_t test_vld3q_f32(float32_t const * a) { return vld3q_f32(a); } -// CHECK-LABEL: @test_vld3q_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] 
= call { <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld3q_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly8x16x3_t test_vld3q_p8(poly8_t const * a) { return vld3q_p8(a); } -// CHECK-LABEL: @test_vld3q_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly16x8x3_t test_vld3q_p16(poly16_t const * a) { return vld3q_p16(a); } -// CHECK-LABEL: @test_vld3_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x 
i8> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint8x8x3_t test_vld3_u8(uint8_t const * a) { return vld3_u8(a); } -// CHECK-LABEL: @test_vld3_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x3_t test_vld3_u16(uint16_t const * a) { return vld3_u16(a); } -// CHECK-LABEL: @test_vld3_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld3_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_2_EXTRACT]], 
ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x3_t test_vld3_u32(uint32_t const * a) { return vld3_u32(a); } -// CHECK-LABEL: @test_vld3_u64( -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld3_u64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint64x1x3_t test_vld3_u64(uint64_t const * a) { return vld3_u64(a); } -// CHECK-LABEL: @test_vld3_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x3_t test_vld3_s8(int8_t const * a) { return vld3_s8(a); } -// CHECK-LABEL: @test_vld3_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } 
@llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x3_t test_vld3_s16(int16_t const * a) { return vld3_s16(a); } -// CHECK-LABEL: @test_vld3_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld3_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int32x2x3_t test_vld3_s32(int32_t const * a) { return vld3_s32(a); } -// CHECK-LABEL: @test_vld3_s64( -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld3_s64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> 
[[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int64x1x3_t test_vld3_s64(int64_t const * a) { return vld3_s64(a); } -// CHECK-LABEL: @test_vld3_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> +// CHECK-LABEL: define void @test_vld3_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3.v4f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 2 +// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x3_t test_vld3_f16(float16_t const * a) { return vld3_f16(a); } -// CHECK-LABEL: @test_vld3_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld3_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 2 +// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x3_t test_vld3_f32(float32_t const * a) { return vld3_f32(a); } -// CHECK-LABEL: @test_vld3_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, 
<8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x3_t test_vld3_p8(poly8_t const * a) { return vld3_p8(a); } -// CHECK-LABEL: @test_vld3_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x3_t test_vld3_p16(poly16_t const * a) { return vld3_p16(a); } -// CHECK-LABEL: @test_vld3q_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], 
align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_lane_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2) +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = 
extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) { return vld3q_lane_u16(a, b, 7); } -// CHECK-LABEL: @test_vld3q_lane_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld3q_lane_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: 
[[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4) +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 1 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) { return vld3q_lane_u32(a, b, 3); } -// CHECK-LABEL: @test_vld3q_lane_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 
0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_lane_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2) +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 
32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) { return vld3q_lane_s16(a, b, 7); } -// CHECK-LABEL: @test_vld3q_lane_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld3q_lane_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], 
i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4) +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 1 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) { return vld3q_lane_s32(a, b, 3); } -// CHECK-LABEL: @test_vld3q_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x 
half>, <8 x half>, <8 x half> +// CHECK-LABEL: define void @test_vld3q_lane_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 7, i32 2) +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 1 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 2 +// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) { return vld3q_lane_f16(a, b, 7); } -// CHECK-LABEL: @test_vld3q_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca 
%struct.float32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> +// CHECK-LABEL: define void @test_vld3q_lane_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP4:%.*]] = 
bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 3, i32 4) +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 1 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 2 +// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) { return vld3q_lane_f32(a, b, 3); } -// CHECK-LABEL: @test_vld3q_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// 
CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2) +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1 +// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) { return vld3q_lane_p16(a, b, 7); } -// CHECK-LABEL: @test_vld3_lane_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: 
[[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_lane_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) { return vld3_lane_u8(a, b, 7); } -// CHECK-LABEL: @test_vld3_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] 
to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_lane_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) { return vld3_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vld3_lane_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: 
[[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld3_lane_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) { return vld3_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vld3_lane_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_lane_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr 
[[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) { return vld3_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vld3_lane_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_lane_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: 
[[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) { return vld3_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vld3_lane_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld3_lane_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x 
i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) { return vld3_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vld3_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> +// CHECK-LABEL: define void @test_vld3_lane_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 3, i32 2) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) { return vld3_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vld3_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld3_lane_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 1, i32 4) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) { return vld3_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vld3_lane_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw 
%struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_lane_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) { return vld3_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vld3_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], 
align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) { return vld3_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vld4q_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: 
[[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld4q_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint8x16x4_t test_vld4q_u8(uint8_t const * a) { return vld4q_u8(a); } -// CHECK-LABEL: @test_vld4q_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld4q_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr 
[[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint16x8x4_t test_vld4q_u16(uint16_t const * a) { return vld4q_u16(a); } -// CHECK-LABEL: @test_vld4q_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld4q_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint32x4x4_t test_vld4q_u32(uint32_t const * a) { return vld4q_u32(a); } -// CHECK-LABEL: @test_vld4q_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld4q_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> 
[[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int8x16x4_t test_vld4q_s8(int8_t const * a) { return vld4q_s8(a); } -// CHECK-LABEL: @test_vld4q_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld4q_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int16x8x4_t test_vld4q_s16(int16_t const * a) { return vld4q_s16(a); } -// CHECK-LABEL: @test_vld4q_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld4q_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store 
<4 x i32> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int32x4x4_t test_vld4q_s32(int32_t const * a) { return vld4q_s32(a); } -// CHECK-LABEL: @test_vld4q_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> +// CHECK-LABEL: define void @test_vld4q_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4.v8f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float16x8x4_t test_vld4q_f16(float16_t const * a) { return vld4q_f16(a); } -// CHECK-LABEL: @test_vld4q_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> +// CHECK-LABEL: define void @test_vld4q_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 2 +// CHECK-NEXT: 
[[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float32x4x4_t test_vld4q_f32(float32_t const * a) { return vld4q_f32(a); } -// CHECK-LABEL: @test_vld4q_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld4q_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly8x16x4_t test_vld4q_p8(poly8_t const * a) { return vld4q_p8(a); } -// CHECK-LABEL: @test_vld4q_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld4q_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0 
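// Editorial aside, not part of the patch: the regenerated CHECK lines in these
// hunks (apparently produced in the style of llvm/utils/update_cc_test_checks.py)
// match the complete lowered IR of each test -- the vldN intrinsic call, the
// per-field extractvalue of its aggregate result, and the stores into the sret
// return slot -- instead of spot-checking a few alloca/memcpy/getelementptr
// lines as the removed checks did. As a reminder of what the intrinsic family
// under test does, here is a minimal, self-contained sketch of a vld4_u8 use;
// split_rgba and its parameter names are hypothetical, introduced only for
// illustration.
#include <arm_neon.h>

// vld4_u8 issues one deinterleaving load (VLD4.8 on ARM): val[0] receives
// bytes 0,4,8,..., val[1] bytes 1,5,9,..., and so on -- which is exactly what
// splitting eight packed RGBA pixels into per-channel vectors needs.
void split_rgba(const uint8_t *rgba, uint8_t *r, uint8_t *g,
                uint8_t *b, uint8_t *a) {
  uint8x8x4_t px = vld4_u8(rgba); // one 32-byte interleaved load
  vst1_u8(r, px.val[0]);          // store each deinterleaved channel
  vst1_u8(g, px.val[1]);
  vst1_u8(b, px.val[2]);
  vst1_u8(a, px.val[3]);
}
// End of editorial aside; the patch hunk continues below.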
+// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly16x8x4_t test_vld4q_p16(poly16_t const * a) { return vld4q_p16(a); } -// CHECK-LABEL: @test_vld4_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint8x8x4_t test_vld4_u8(uint8_t const * a) { return vld4_u8(a); } -// CHECK-LABEL: @test_vld4_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 
x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x4_t test_vld4_u16(uint16_t const * a) { return vld4_u16(a); } -// CHECK-LABEL: @test_vld4_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld4_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 3 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x4_t test_vld4_u32(uint32_t const * a) { return vld4_u32(a); } -// CHECK-LABEL: @test_vld4_u64( -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld4_u64( +// CHECK-SAME: ptr dead_on_unwind noalias writable 
sret([[STRUCT_UINT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint64x1x4_t test_vld4_u64(uint64_t const * a) { return vld4_u64(a); } -// CHECK-LABEL: @test_vld4_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x4_t test_vld4_s8(int8_t const * a) { return vld4_s8(a); } -// CHECK-LABEL: @test_vld4_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: 
[[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x4_t test_vld4_s16(int16_t const * a) { return vld4_s16(a); } -// CHECK-LABEL: @test_vld4_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld4_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 3 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: 
ret void +// int32x2x4_t test_vld4_s32(int32_t const * a) { return vld4_s32(a); } -// CHECK-LABEL: @test_vld4_s64( -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld4_s64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int64x1x4_t test_vld4_s64(int64_t const * a) { return vld4_s64(a); } -// CHECK-LABEL: @test_vld4_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> +// CHECK-LABEL: define void @test_vld4_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4.v4f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: 
[[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x4_t test_vld4_f16(float16_t const * a) { return vld4_f16(a); } -// CHECK-LABEL: @test_vld4_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld4_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 3 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x4_t test_vld4_f32(float32_t const * a) { return vld4_f32(a); } -// CHECK-LABEL: @test_vld4_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// 
CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x4_t test_vld4_p8(poly8_t const * a) { return vld4_p8(a); } -// CHECK-LABEL: @test_vld4_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x4_t test_vld4_p16(poly16_t const * a) { return vld4_p16(a); } -// CHECK-LABEL: @test_vld4q_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP8:%.*]] = 
bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) { return vld4q_lane_u16(a, b, 7); }
-// CHECK-LABEL: @test_vld4q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) { return vld4q_lane_u32(a, b, 3); }
-// CHECK-LABEL: @test_vld4q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) { return vld4q_lane_s16(a, b, 7); }
-// CHECK-LABEL: @test_vld4q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) { return vld4q_lane_s32(a, b, 3); }
-// CHECK-LABEL: @test_vld4q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x half>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld4q_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) { return vld4q_lane_f16(a, b, 7); }
-// CHECK-LABEL: @test_vld4q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld4q_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) { return vld4q_lane_f32(a, b, 3); }
-// CHECK-LABEL: @test_vld4q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) { return vld4q_lane_p16(a, b, 7); }
-// CHECK-LABEL: @test_vld4_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_lane_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) { return vld4_lane_u8(a, b, 7); }
-// CHECK-LABEL: @test_vld4_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) { return vld4_lane_u16(a, b, 3); }
-// CHECK-LABEL: @test_vld4_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) { return vld4_lane_u32(a, b, 1); }
-// CHECK-LABEL: @test_vld4_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_lane_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) { return vld4_lane_s8(a, b, 7); }
-// CHECK-LABEL: @test_vld4_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
++//
 int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) { return vld4_lane_s16(a, b, 3); }
-// CHECK-LABEL: @test_vld4_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) { return vld4_lane_s32(a, b, 1); }
-// CHECK-LABEL:
@test_vld4_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP11:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half> -// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half> -// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> +// CHECK-LABEL: define void @test_vld4_lane_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> 
[[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> +// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 3, i32 2) +// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 0 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 1 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 2 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 3 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) { return vld4_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vld4_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x 
<2 x float>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP11:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> -// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float> -// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld4_lane_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 1, i32 4) +// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 0 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 1 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 2 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 3 +// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// 
CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) { return vld4_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vld4_lane_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_lane_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1) +// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3 +// 
CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) { return vld4_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vld4_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16> -// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: 
[[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2) +// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) { return vld4_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vmax_s8( -// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmax_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMAX_V_I]] +// int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) { return vmax_s8(a, b); } -// CHECK-LABEL: @test_vmax_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: 
[[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmax_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) +// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) { return vmax_s16(a, b); } -// CHECK-LABEL: @test_vmax_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmax_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) +// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) { return vmax_s32(a, b); } -// CHECK-LABEL: @test_vmax_u8( -// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmax_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMAX_V_I]] +// uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) { return vmax_u8(a, b); } -// CHECK-LABEL: @test_vmax_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmax_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// 
CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) +// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { return vmax_u16(a, b); } -// CHECK-LABEL: @test_vmax_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmax_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) +// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { return vmax_u32(a, b); } -// CHECK-LABEL: @test_vmax_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VMAX_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vmax_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[VMAX_V_I]], <2 x float> [[VMAX_V1_I]]) +// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { return vmax_f32(a, b); } -// CHECK-LABEL: @test_vmaxq_s8( -// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMAXQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// 
CHECK-NEXT: ret <16 x i8> [[VMAXQ_V_I]] +// int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { return vmaxq_s8(a, b); } -// CHECK-LABEL: @test_vmaxq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) +// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { return vmaxq_s16(a, b); } -// CHECK-LABEL: @test_vmaxq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) +// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { return vmaxq_s32(a, b); } -// CHECK-LABEL: @test_vmaxq_u8( -// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMAXQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMAXQ_V_I]] +// uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { return vmaxq_u8(a, b); } -// CHECK-LABEL: @test_vmaxq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> 
[[VMAXQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) +// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { return vmaxq_u16(a, b); } -// CHECK-LABEL: @test_vmaxq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) +// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { return vmaxq_u32(a, b); } -// CHECK-LABEL: @test_vmaxq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vmaxq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[VMAXQ_V_I]], <4 x float> [[VMAXQ_V1_I]]) +// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// 
float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } -// CHECK-LABEL: @test_vmin_s8( -// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmin_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMIN_V_I]] +// int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { return vmin_s8(a, b); } -// CHECK-LABEL: @test_vmin_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmin_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) +// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { return vmin_s16(a, b); } -// CHECK-LABEL: @test_vmin_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmin_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) +// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { return vmin_s32(a, b); } -// CHECK-LABEL: @test_vmin_u8( -// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmin_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMIN_V_I]] +// 
uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { return vmin_u8(a, b); } -// CHECK-LABEL: @test_vmin_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmin_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) +// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); } -// CHECK-LABEL: @test_vmin_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmin_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) +// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); } -// CHECK-LABEL: @test_vmin_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VMIN_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vmin_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x 
float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[VMIN_V_I]], <2 x float> [[VMIN_V1_I]]) +// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); } -// CHECK-LABEL: @test_vminq_s8( -// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMINQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vminq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMINQ_V_I]] +// int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); } -// CHECK-LABEL: @test_vminq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VMINQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vminq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) +// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); } -// CHECK-LABEL: @test_vminq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMINQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vminq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) +// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); } -// CHECK-LABEL: @test_vminq_u8( -// CHECK: 
[[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMINQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vminq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMINQ_V_I]] +// uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); } -// CHECK-LABEL: @test_vminq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VMINQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vminq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) +// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); } -// CHECK-LABEL: @test_vminq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMINQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vminq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) +// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); } -// CHECK-LABEL: @test_vminq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VMINQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vminq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[VMINQ_V_I]], <4 x float> [[VMINQ_V1_I]]) +// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } -// CHECK-LABEL: @test_vmla_s8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vmla_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmla_s8(a, b, c); } -// CHECK-LABEL: @test_vmla_s16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vmla_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_s16(a, b, c); } -// CHECK-LABEL: @test_vmla_s32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vmla_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_s32(a, b, c); } -// CHECK-LABEL: @test_vmla_f32( -// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c -// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] -// CHECK: ret <2 x float> [[ADD_I]] +// CHECK-LABEL: define <2 x float> @test_vmla_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_f32(a, b, c); } -// CHECK-LABEL: @test_vmla_u8( -// CHECK: 
[[MUL_I:%.*]] = mul <8 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vmla_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmla_u8(a, b, c); } -// CHECK-LABEL: @test_vmla_u16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vmla_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_u16(a, b, c); } -// CHECK-LABEL: @test_vmla_u32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vmla_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_u32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_s8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlaq_s8(a, b, c); } -// CHECK-LABEL: @test_vmlaq_s16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlaq_s16(a, b, c); } -// CHECK-LABEL: @test_vmlaq_s32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x 
i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlaq_s32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_f32( -// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c -// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] -// CHECK: ret <4 x float> [[ADD_I]] +// CHECK-LABEL: define <4 x float> @test_vmlaq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x float> [[ADD_I]] +// float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlaq_f32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_u8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlaq_u8(a, b, c); } -// CHECK-LABEL: @test_vmlaq_u16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlaq_u16(a, b, c); } -// CHECK-LABEL: @test_vmlaq_u32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlaq_u32(a, b, c); } -// CHECK-LABEL: @test_vmlal_s8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlal_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlal_s8(a, b, c); } -// CHECK-LABEL: @test_vmlal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: 
[[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); } -// CHECK-LABEL: @test_vmlal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); } -// CHECK-LABEL: @test_vmlal_u8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlal_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); } -// CHECK-LABEL: @test_vmlal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 
x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); } -// CHECK-LABEL: @test_vmlal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); } -// CHECK-LABEL: @test_vmlal_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <4 x i32> [[ADD]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD]] +// int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlal_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: 
[[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <2 x i64> [[ADD]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD]] +// int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlal_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <4 x i32> [[ADD]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD]] +// uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlal_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = 
call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <2 x i64> [[ADD]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD]] +// uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_lane_u32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlal_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlal_n_s16(a, b, c); } -// CHECK-LABEL: @test_vmlal_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 
x i32> %b, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlal_n_s32(a, b, c); } -// CHECK-LABEL: @test_vmlal_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlal_n_u16(a, b, c); } -// CHECK-LABEL: @test_vmlal_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] 
-// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlal_n_u32(a, b, c); } -// CHECK-LABEL: @test_vmla_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i16> [[ADD]] +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i16> [[ADD]] +// int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vmla_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <2 x i32> [[ADD]] +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <2 x i32> [[ADD]] +// int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vmla_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x 
i16> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i16> [[ADD]] +// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i16> [[ADD]] +// uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_lane_u16(a, b, c, 3); } -// CHECK-LABEL: @test_vmla_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <2 x i32> [[ADD]] +// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <2 x i32> [[ADD]] +// uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_lane_u32(a, b, c, 1); } -// CHECK-LABEL: @test_vmla_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]] +// CHECK-NEXT: ret <2 x float> [[ADD]] +// float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_lane_f32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlaq_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret 
<8 x i16> [[ADD]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <8 x i16> [[ADD]] +// int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlaq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlaq_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i32> [[ADD]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i32> [[ADD]] +// int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlaq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlaq_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret <8 x i16> [[ADD]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <8 x i16> [[ADD]] +// uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlaq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlaq_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i32> [[ADD]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i32> [[ADD]] +// uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlaq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlaq_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] +// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x float> [[ADD]] +// float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlaq_lane_f32(a, b, c, 1); } -// CHECK-LABEL: @test_vmla_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmla_n_s16(a, b, c); } -// CHECK-LABEL: @test_vmla_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x 
i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmla_n_s32(a, b, c); } -// CHECK-LABEL: @test_vmla_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmla_n_u16(a, b, c); } -// CHECK-LABEL: @test_vmla_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmla_n_u32(a, b, c); } -// CHECK-LABEL: @test_vmla_n_f32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 -// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] -// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] -// CHECK: ret <2 x float> [[ADD_I]] +// CHECK-LABEL: define <2 x float> @test_vmla_n_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0 +// 
CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1 +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmla_n_f32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlaq_n_s16(a, b, c); } -// CHECK-LABEL: @test_vmlaq_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3 +// CHECK-NEXT: 
[[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlaq_n_s32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlaq_n_u16(a, b, c); } -// CHECK-LABEL: @test_vmlaq_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]] +// 
CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlaq_n_u32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_n_f32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 -// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] -// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] -// CHECK: ret <4 x float> [[ADD_I]] +// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x float> [[ADD_I]] +// float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return vmlaq_n_f32(a, b, c); } -// CHECK-LABEL: @test_vmls_s8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define <8 x i8> @test_vmls_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmls_s8(a, b, c); } -// CHECK-LABEL: @test_vmls_s16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define <4 x i16> @test_vmls_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_s16(a, b, c); } -// CHECK-LABEL: @test_vmls_s32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define <2 x i32> @test_vmls_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_s32(a, b, c); } -// CHECK-LABEL: @test_vmls_f32( -// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c -// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, 
[[MUL_I]] -// CHECK: ret <2 x float> [[SUB_I]] +// CHECK-LABEL: define <2 x float> @test_vmls_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_f32(a, b, c); } -// CHECK-LABEL: @test_vmls_u8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]] -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define <8 x i8> @test_vmls_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmls_u8(a, b, c); } -// CHECK-LABEL: @test_vmls_u16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define <4 x i16> @test_vmls_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_u16(a, b, c); } -// CHECK-LABEL: @test_vmls_u32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define <2 x i32> @test_vmls_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsq_s8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlsq_s8(a, b, c); } -// CHECK-LABEL: @test_vmlsq_s16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] 
+// int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlsq_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsq_s32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlsq_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsq_f32( -// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c -// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] -// CHECK: ret <4 x float> [[SUB_I]] +// CHECK-LABEL: define <4 x float> @test_vmlsq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlsq_f32(a, b, c); } -// CHECK-LABEL: @test_vmlsq_u8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]] -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlsq_u8(a, b, c); } -// CHECK-LABEL: @test_vmlsq_u16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlsq_u16(a, b, c); } -// CHECK-LABEL: @test_vmlsq_u32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlsq_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> 
@test_vmlsl_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlsl_s8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlsl_u8(a, b, c); } -// 
CHECK-LABEL: @test_vmlsl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_u16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <4 x i32> [[SUB]] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast 
<8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB]] +// int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlsl_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <2 x i64> [[SUB]] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB]] +// int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlsl_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <4 x i32> [[SUB]] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) +// CHECK-NEXT: 
[[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB]] +// uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_lane_u16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlsl_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <2 x i64> [[SUB]] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) +// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB]] +// uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_lane_u32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlsl_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> 
[[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vmlsl_n_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vmlsl_n_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: 
ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { return vmlsl_n_u16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { return vmlsl_n_u32(a, b, c); } -// CHECK-LABEL: @test_vmls_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i16> [[SUB]] +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i16> [[SUB]] +// int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vmls_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <2 x i32> [[SUB]] +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: 
[[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <2 x i32> [[SUB]] +// int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vmls_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i16> [[SUB]] +// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i16> [[SUB]] +// uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_lane_u16(a, b, c, 3); } -// CHECK-LABEL: @test_vmls_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <2 x i32> [[SUB]] +// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <2 x i32> [[SUB]] +// uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_lane_u32(a, b, c, 1); } -// CHECK-LABEL: @test_vmls_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 
x float> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]] +// CHECK-NEXT: ret <2 x float> [[SUB]] +// float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_lane_f32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlsq_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret <8 x i16> [[SUB]] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <8 x i16> [[SUB]] +// int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlsq_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlsq_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i32> [[SUB]] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i32> [[SUB]] +// int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlsq_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlsq_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] -// CHECK: ret <8 x i16> [[SUB]] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]] +// CHECK-NEXT: ret <8 x i16> [[SUB]] +// uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { 
return vmlsq_lane_u16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlsq_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x i32> [[SUB]] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x i32> [[SUB]] +// uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlsq_lane_u32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlsq_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[C]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] +// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]] +// CHECK-NEXT: ret <4 x float> [[SUB]] +// float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlsq_lane_f32(a, b, c, 1); } -// CHECK-LABEL: @test_vmls_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: 
[[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { return vmls_n_s16(a, b, c); } -// CHECK-LABEL: @test_vmls_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { return vmls_n_s32(a, b, c); } -// CHECK-LABEL: @test_vmls_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]] -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { return vmls_n_u16(a, b, c); } -// CHECK-LABEL: @test_vmls_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]] -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { return vmls_n_u32(a, b, c); } -// CHECK-LABEL: 
@test_vmls_n_f32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1 -// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]] -// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]] -// CHECK: ret <2 x float> [[SUB_I]] +// CHECK-LABEL: define <2 x float> @test_vmls_n_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1 +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) { return vmls_n_f32(a, b, c); } -// CHECK-LABEL: @test_vmlsq_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { return vmlsq_n_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsq_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] -// CHECK: [[SUB_I:%.*]] = 
sub <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { return vmlsq_n_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsq_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7 -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]] -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { return vmlsq_n_u16(a, b, c); } -// CHECK-LABEL: @test_vmlsq_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]] -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32( +// 
CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { return vmlsq_n_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsq_n_f32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3 -// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]] -// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]] -// CHECK: ret <4 x float> [[SUB_I]] +// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) { return vmlsq_n_f32(a, b, c); } -// CHECK-LABEL: @test_vmovl_s8( -// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I]] +// CHECK-LABEL: define <8 x i16> @test_vmovl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] +// int16x8_t test_vmovl_s8(int8x8_t a) { return vmovl_s8(a); } -// CHECK-LABEL: @test_vmovl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I]] +// CHECK-LABEL: define <4 x i32> @test_vmovl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] +// int32x4_t test_vmovl_s16(int16x4_t a) { return vmovl_s16(a); } -// CHECK-LABEL: @test_vmovl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I]] +// CHECK-LABEL: define <2 x i64> @test_vmovl_s32( +// CHECK-SAME: <2 x i32> noundef 
[[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] +// int64x2_t test_vmovl_s32(int32x2_t a) { return vmovl_s32(a); } -// CHECK-LABEL: @test_vmovl_u8( -// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I]] +// CHECK-LABEL: define <8 x i16> @test_vmovl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] +// uint16x8_t test_vmovl_u8(uint8x8_t a) { return vmovl_u8(a); } -// CHECK-LABEL: @test_vmovl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I]] +// CHECK-LABEL: define <4 x i32> @test_vmovl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] +// uint32x4_t test_vmovl_u16(uint16x4_t a) { return vmovl_u16(a); } -// CHECK-LABEL: @test_vmovl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I]] +// CHECK-LABEL: define <2 x i64> @test_vmovl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] +// uint64x2_t test_vmovl_u32(uint32x2_t a) { return vmovl_u32(a); } -// CHECK-LABEL: @test_vmovn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[VMOVN_I]] +// CHECK-LABEL: define <8 x i8> @test_vmovn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]] +// int8x8_t test_vmovn_s16(int16x8_t a) { return vmovn_s16(a); } -// CHECK-LABEL: @test_vmovn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VMOVN_I]] +// CHECK-LABEL: define <4 x i16> @test_vmovn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]] +// int16x4_t test_vmovn_s32(int32x4_t a) { return vmovn_s32(a); } -// CHECK-LABEL: @test_vmovn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x 
i32> -// CHECK: ret <2 x i32> [[VMOVN_I]] +// CHECK-LABEL: define <2 x i32> @test_vmovn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]] +// int32x2_t test_vmovn_s64(int64x2_t a) { return vmovn_s64(a); } -// CHECK-LABEL: @test_vmovn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[VMOVN_I]] +// CHECK-LABEL: define <8 x i8> @test_vmovn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]] +// uint8x8_t test_vmovn_u16(uint16x8_t a) { return vmovn_u16(a); } -// CHECK-LABEL: @test_vmovn_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VMOVN_I]] +// CHECK-LABEL: define <4 x i16> @test_vmovn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]] +// uint16x4_t test_vmovn_u32(uint32x4_t a) { return vmovn_u32(a); } -// CHECK-LABEL: @test_vmovn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[VMOVN_I]] +// CHECK-LABEL: define <2 x i32> @test_vmovn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]] +// uint32x2_t test_vmovn_u64(uint64x2_t a) { return vmovn_u64(a); } -// CHECK-LABEL: @test_vmov_n_u8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8( +// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement 
<8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
 uint8x8_t test_vmov_n_u8(uint8_t a) { return vmov_n_u8(a); }

-// CHECK-LABEL: @test_vmov_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 uint16x4_t test_vmov_n_u16(uint16_t a) { return vmov_n_u16(a); }

-// CHECK-LABEL: @test_vmov_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
 uint32x2_t test_vmov_n_u32(uint32_t a) { return vmov_n_u32(a); }

-// CHECK-LABEL: @test_vmov_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
 int8x8_t test_vmov_n_s8(int8_t a) { return vmov_n_s8(a); }

-// CHECK-LABEL: @test_vmov_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 int16x4_t test_vmov_n_s16(int16_t a) { return vmov_n_s16(a); }

-// CHECK-LABEL: @test_vmov_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
 int32x2_t test_vmov_n_s32(int32_t a) { return vmov_n_s32(a); }

-// CHECK-LABEL: @test_vmov_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
 poly8x8_t test_vmov_n_p8(poly8_t a) { return vmov_n_p8(a); }

-// CHECK-LABEL: @test_vmov_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 poly16x4_t test_vmov_n_p16(poly16_t a) { return vmov_n_p16(a); }

-// CHECK-LABEL: @test_vmov_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: ret <4 x half> [[VECINIT3]]
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
+//
 float16x4_t test_vmov_n_f16(float16_t *a) { return vmov_n_f16(*a); }

-// CHECK-LABEL: @test_vmov_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: ret <2 x float> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]]
+//
 float32x2_t test_vmov_n_f32(float32_t a) { return vmov_n_f32(a); }

-// CHECK-LABEL: @test_vmovq_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 uint8x16_t test_vmovq_n_u8(uint8_t a) { return vmovq_n_u8(a); }

-// CHECK-LABEL: @test_vmovq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 uint16x8_t test_vmovq_n_u16(uint16_t a) { return vmovq_n_u16(a); }

-// CHECK-LABEL: @test_vmovq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
 uint32x4_t test_vmovq_n_u32(uint32_t a) { return vmovq_n_u32(a); }

-// CHECK-LABEL: @test_vmovq_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 int8x16_t test_vmovq_n_s8(int8_t a) { return vmovq_n_s8(a); }

-// CHECK-LABEL: @test_vmovq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 int16x8_t test_vmovq_n_s16(int16_t a) { return vmovq_n_s16(a); }

-// CHECK-LABEL: @test_vmovq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[A]], i32 3
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
 int32x4_t test_vmovq_n_s32(int32_t a) { return vmovq_n_s32(a); }

-// CHECK-LABEL: @test_vmovq_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[A]], i32 7
+// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[A]], i32 8
+// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[A]], i32 9
+// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[A]], i32 10
+// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[A]], i32 11
+// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[A]], i32 12
+// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[A]], i32 13
+// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[A]], i32 14
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[A]], i32 15
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 poly8x16_t test_vmovq_n_p8(poly8_t a) { return vmovq_n_p8(a); }

-// CHECK-LABEL: @test_vmovq_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[A]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[A]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[A]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[A]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[A]], i32 7
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 poly16x8_t test_vmovq_n_p16(poly16_t a) { return vmovq_n_p16(a); }

-// CHECK-LABEL: @test_vmovq_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
-// CHECK: ret <8 x half> [[VECINIT7]]
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
+// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
+//
 float16x8_t test_vmovq_n_f16(float16_t *a) { return vmovq_n_f16(*a); }

-// CHECK-LABEL: @test_vmovq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
-// CHECK: ret <4 x float> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[A]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[A]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[A]], i32 3
+// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
+//
 float32x4_t test_vmovq_n_f32(float32_t a) { return vmovq_n_f32(a); }

-// CHECK-LABEL: @test_vmov_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 int64x1_t test_vmov_n_s64(int64_t a) { int64x1_t tmp = vmov_n_s64(a); return vadd_s64(tmp, tmp); }

-// CHECK-LABEL: @test_vmov_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 uint64x1_t test_vmov_n_u64(uint64_t a) { uint64x1_t tmp = vmov_n_u64(a); return vadd_u64(tmp, tmp); }

-// CHECK-LABEL: @test_vmovq_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
 int64x2_t test_vmovq_n_s64(int64_t a) { return vmovq_n_s64(a); }

-// CHECK-LABEL: @test_vmovq_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[A]], i32 1
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
 uint64x2_t test_vmovq_n_u64(uint64_t a) { return vmovq_n_u64(a); }

-// CHECK-LABEL: @test_vmul_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
 int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) { return vmul_s8(a, b); }

-// CHECK-LABEL: @test_vmul_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
 int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) { return vmul_s16(a, b); }

-// CHECK-LABEL: @test_vmul_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
 int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) { return vmul_s32(a, b); }

-// CHECK-LABEL: @test_vmul_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define <2 x float> @test_vmul_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
 float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) { return vmul_f32(a, b); }

-// CHECK-LABEL: @test_vmul_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
 uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) { return vmul_u8(a, b); }

-// CHECK-LABEL: @test_vmul_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
 uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) { return vmul_u16(a, b); }

-// CHECK-LABEL: @test_vmul_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
 uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) { return vmul_u32(a, b); }

-// CHECK-LABEL: @test_vmulq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
 int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) { return vmulq_s8(a, b); }

-// CHECK-LABEL: @test_vmulq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
 int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) { return vmulq_s16(a, b); }

-// CHECK-LABEL: @test_vmulq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
 int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) { return vmulq_s32(a, b); }

-// CHECK-LABEL: @test_vmulq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b
-// CHECK: ret <4 x float> [[MUL_I]]
+// CHECK-LABEL: define <4 x float> @test_vmulq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[MUL_I]]
+//
 float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) { return vmulq_f32(a, b); }

-// CHECK-LABEL: @test_vmulq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
 uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) { return vmulq_u8(a, b); }

-// CHECK-LABEL: @test_vmulq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
 uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) { return vmulq_u16(a, b); }

-// CHECK-LABEL: @test_vmulq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
 uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) { return vmulq_u32(a, b); }

-// CHECK-LABEL: @test_vmull_s8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
 int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { return vmull_s8(a, b); }

-// CHECK-LABEL: @test_vmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { return vmull_s16(a, b); }

-// CHECK-LABEL: @test_vmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { return vmull_s32(a, b); }

-// CHECK-LABEL: @test_vmull_u8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
 uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) { return vmull_u8(a, b); }

-// CHECK-LABEL: @test_vmull_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { return vmull_u16(a, b); }

-// CHECK-LABEL: @test_vmull_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { return vmull_u32(a, b); }

-// CHECK-LABEL: @test_vmull_p8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
 poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); }

-// CHECK-LABEL: @test_vmull_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) { return vmull_lane_s16(a, b, 3); }

-// CHECK-LABEL: @test_vmull_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) { return vmull_lane_s32(a, b, 1); }

-// CHECK-LABEL: @test_vmull_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) { return vmull_lane_u16(a, b, 3); }

-// CHECK-LABEL: @test_vmull_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) { return vmull_lane_u32(a, b, 1); }

-// CHECK-LABEL: @test_vmull_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: ret <4 x i32> [[VMULL5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) { return vmull_n_s16(a, b); }

-// CHECK-LABEL: @test_vmull_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: ret <2 x i64> [[VMULL3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) { return vmull_n_s32(a, b); }

-// CHECK-LABEL: @test_vmull_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: ret <4 x i32> [[VMULL5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) { return vmull_n_u16(a, b); }

-// CHECK-LABEL: @test_vmull_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: ret <2 x i64> [[VMULL3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) { return vmull_n_u32(a, b); }

-// CHECK-LABEL: @test_vmul_p8(
-// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMUL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMUL_V_I]]
+//
 poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) { return vmul_p8(a, b); }

-// CHECK-LABEL: @test_vmulq_p8(
-// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMULQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMULQ_V_I]]
+//
 poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) { return vmulq_p8(a, b); }

-// CHECK-LABEL: @test_vmul_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i16> [[MUL]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i16> [[MUL]]
+//
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) { return vmul_lane_s16(a, b, 3); }

-// CHECK-LABEL: @test_vmul_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <2 x i32> [[MUL]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <2 x i32> [[MUL]]
+//
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); }

-// CHECK-LABEL: @test_vmul_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
-// CHECK: ret <2 x float> [[MUL]]
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A]], [[LANE]]
+// CHECK-NEXT: ret <2 x float> [[MUL]]
+//
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); }

-// CHECK-LABEL: @test_vmul_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i16> [[MUL]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i16> [[MUL]]
+//
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); }

-// CHECK-LABEL: @test_vmul_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <2 x i32> [[MUL]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <2 x i32> [[MUL]]
+//
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); }

-// CHECK-LABEL: @test_vmulq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <8 x i16> [[MUL]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <8 x i16> [[MUL]]
+//
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); }

-// CHECK-LABEL: @test_vmulq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i32> [[MUL]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i32> [[MUL]]
+//
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); }

-// CHECK-LABEL: @test_vmulq_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x float> [[MUL]]
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x float> [[MUL]]
+//
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return vmulq_lane_f32(a, b, 1); }

-// CHECK-LABEL: @test_vmulq_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <8 x i16> [[MUL]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <8 x i16> [[MUL]]
+//
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); }

-// CHECK-LABEL: @test_vmulq_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i32> [[MUL]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i32> [[MUL]]
+//
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); }

-// CHECK-LABEL: @test_vmul_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) { return vmul_n_s16(a, b); }

-// CHECK-LABEL: @test_vmul_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) { return vmul_n_s32(a, b); }

-// CHECK-LABEL: @test_vmul_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) { return vmul_n_f32(a, b); }

-// CHECK-LABEL: @test_vmul_n_u16(
-// CHECK: [[VECINIT_I:%.*]] =
insertelement <4 x i16> poison, i16 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]] -// CHECK: ret <4 x i16> [[MUL_I]] +// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]] +// CHECK-NEXT: ret <4 x i16> [[MUL_I]] +// uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) { return vmul_n_u16(a, b); } -// CHECK-LABEL: @test_vmul_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1 -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]] -// CHECK: ret <2 x i32> [[MUL_I]] +// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]] +// CHECK-NEXT: ret <2 x i32> [[MUL_I]] +// uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) { return vmul_n_u32(a, b); } -// CHECK-LABEL: @test_vmulq_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] -// CHECK: ret <8 x i16> [[MUL_I]] +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> 
[[VECINIT5_I]], i16 [[B]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]] +// CHECK-NEXT: ret <8 x i16> [[MUL_I]] +// int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) { return vmulq_n_s16(a, b); } -// CHECK-LABEL: @test_vmulq_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] -// CHECK: ret <4 x i32> [[MUL_I]] +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]] +// CHECK-NEXT: ret <4 x i32> [[MUL_I]] +// int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) { return vmulq_n_s32(a, b); } -// CHECK-LABEL: @test_vmulq_n_f32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3 -// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]] -// CHECK: ret <4 x float> [[MUL_I]] +// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A]], [[VECINIT3_I]] +// CHECK-NEXT: ret <4 x float> [[MUL_I]] +// float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) { return vmulq_n_f32(a, b); } -// CHECK-LABEL: @test_vmulq_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]] -// CHECK: ret <8 x i16> 
[[MUL_I]] +// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]] +// CHECK-NEXT: ret <8 x i16> [[MUL_I]] +// uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) { return vmulq_n_u16(a, b); } -// CHECK-LABEL: @test_vmulq_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]] -// CHECK: ret <4 x i32> [[MUL_I]] +// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3 +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]] +// CHECK-NEXT: ret <4 x i32> [[MUL_I]] +// uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) { return vmulq_n_u32(a, b); } -// CHECK-LABEL: @test_vmvn_s8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define <8 x i8> @test_vmvn_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// int8x8_t test_vmvn_s8(int8x8_t a) { return vmvn_s8(a); } -// CHECK-LABEL: @test_vmvn_s16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1) -// CHECK: ret <4 x i16> [[NEG_I]] +// CHECK-LABEL: define <4 x i16> @test_vmvn_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <4 x i16> [[NOT_I]] +// int16x4_t test_vmvn_s16(int16x4_t a) { return vmvn_s16(a); } -// CHECK-LABEL: @test_vmvn_s32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1) -// CHECK: ret <2 x i32> [[NEG_I]] +// CHECK-LABEL: define <2 x i32> @test_vmvn_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <2 x i32> [[NOT_I]] +// 
int32x2_t test_vmvn_s32(int32x2_t a) { return vmvn_s32(a); } -// CHECK-LABEL: @test_vmvn_u8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define <8 x i8> @test_vmvn_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// uint8x8_t test_vmvn_u8(uint8x8_t a) { return vmvn_u8(a); } -// CHECK-LABEL: @test_vmvn_u16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1) -// CHECK: ret <4 x i16> [[NEG_I]] +// CHECK-LABEL: define <4 x i16> @test_vmvn_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <4 x i16> [[NOT_I]] +// uint16x4_t test_vmvn_u16(uint16x4_t a) { return vmvn_u16(a); } -// CHECK-LABEL: @test_vmvn_u32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1) -// CHECK: ret <2 x i32> [[NEG_I]] +// CHECK-LABEL: define <2 x i32> @test_vmvn_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <2 x i32> [[NOT_I]] +// uint32x2_t test_vmvn_u32(uint32x2_t a) { return vmvn_u32(a); } -// CHECK-LABEL: @test_vmvn_p8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define <8 x i8> @test_vmvn_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// poly8x8_t test_vmvn_p8(poly8x8_t a) { return vmvn_p8(a); } -// CHECK-LABEL: @test_vmvnq_s8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// int8x16_t test_vmvnq_s8(int8x16_t a) { return vmvnq_s8(a); } -// CHECK-LABEL: @test_vmvnq_s16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1) -// CHECK: ret <8 x i16> [[NEG_I]] +// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <8 x i16> [[NOT_I]] +// int16x8_t test_vmvnq_s16(int16x8_t a) { return vmvnq_s16(a); } -// CHECK-LABEL: @test_vmvnq_s32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1) -// CHECK: ret <4 x i32> [[NEG_I]] +// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <4 x i32> [[NOT_I]] +// int32x4_t test_vmvnq_s32(int32x4_t a) { return vmvnq_s32(a); } -// CHECK-LABEL: @test_vmvnq_u8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// uint8x16_t test_vmvnq_u8(uint8x16_t a) { return 
vmvnq_u8(a); } -// CHECK-LABEL: @test_vmvnq_u16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1) -// CHECK: ret <8 x i16> [[NEG_I]] +// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <8 x i16> [[NOT_I]] +// uint16x8_t test_vmvnq_u16(uint16x8_t a) { return vmvnq_u16(a); } -// CHECK-LABEL: @test_vmvnq_u32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1) -// CHECK: ret <4 x i32> [[NEG_I]] +// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <4 x i32> [[NOT_I]] +// uint32x4_t test_vmvnq_u32(uint32x4_t a) { return vmvnq_u32(a); } -// CHECK-LABEL: @test_vmvnq_p8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// poly8x16_t test_vmvnq_p8(poly8x16_t a) { return vmvnq_p8(a); } -// CHECK-LABEL: @test_vneg_s8( -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define <8 x i8> @test_vneg_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[A]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// int8x8_t test_vneg_s8(int8x8_t a) { return vneg_s8(a); } -// CHECK-LABEL: @test_vneg_s16( -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define <4 x i16> @test_vneg_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[A]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// int16x4_t test_vneg_s16(int16x4_t a) { return vneg_s16(a); } -// CHECK-LABEL: @test_vneg_s32( -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define <2 x i32> @test_vneg_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[A]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// int32x2_t test_vneg_s32(int32x2_t a) { return vneg_s32(a); } -// CHECK-LABEL: @test_vneg_f32( -// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %a -// CHECK: ret <2 x float> [[SUB_I]] +// CHECK-LABEL: define <2 x float> @test_vneg_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[A]] +// CHECK-NEXT: ret <2 x float> [[FNEG_I]] +// float32x2_t test_vneg_f32(float32x2_t a) { return vneg_f32(a); } -// CHECK-LABEL: @test_vnegq_s8( -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define <16 x i8> @test_vnegq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, [[A]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// int8x16_t test_vnegq_s8(int8x16_t a) { return vnegq_s8(a); } -// CHECK-LABEL: 
@test_vnegq_s16( -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vnegq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, [[A]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vnegq_s16(int16x8_t a) { return vnegq_s16(a); } -// CHECK-LABEL: @test_vnegq_s32( -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vnegq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vnegq_s32(int32x4_t a) { return vnegq_s32(a); } -// CHECK-LABEL: @test_vnegq_f32( -// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %a -// CHECK: ret <4 x float> [[SUB_I]] +// CHECK-LABEL: define <4 x float> @test_vnegq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[A]] +// CHECK-NEXT: ret <4 x float> [[FNEG_I]] +// float32x4_t test_vnegq_f32(float32x4_t a) { return vnegq_f32(a); } -// CHECK-LABEL: @test_vorn_s8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1) -// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] -// CHECK: ret <8 x i8> [[OR_I]] +// CHECK-LABEL: define <8 x i8> @test_vorn_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <8 x i8> [[OR_I]] +// int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) { return vorn_s8(a, b); } -// CHECK-LABEL: @test_vorn_s16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1) -// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] -// CHECK: ret <4 x i16> [[OR_I]] +// CHECK-LABEL: define <4 x i16> @test_vorn_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <4 x i16> [[OR_I]] +// int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) { return vorn_s16(a, b); } -// CHECK-LABEL: @test_vorn_s32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1) -// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] -// CHECK: ret <2 x i32> [[OR_I]] +// CHECK-LABEL: define <2 x i32> @test_vorn_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <2 x i32> [[OR_I]] +// int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) { return vorn_s32(a, b); } -// CHECK-LABEL: @test_vorn_s64( -// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1) -// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] -// CHECK: ret <1 x i64> [[OR_I]] +// CHECK-LABEL: define <1 x i64> @test_vorn_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[NOT_I]] +// 
CHECK-NEXT: ret <1 x i64> [[OR_I]] +// int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) { return vorn_s64(a, b); } -// CHECK-LABEL: @test_vorn_u8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1) -// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]] -// CHECK: ret <8 x i8> [[OR_I]] +// CHECK-LABEL: define <8 x i8> @test_vorn_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <8 x i8> [[OR_I]] +// uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) { return vorn_u8(a, b); } -// CHECK-LABEL: @test_vorn_u16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1) -// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]] -// CHECK: ret <4 x i16> [[OR_I]] +// CHECK-LABEL: define <4 x i16> @test_vorn_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <4 x i16> [[OR_I]] +// uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) { return vorn_u16(a, b); } -// CHECK-LABEL: @test_vorn_u32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1) -// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]] -// CHECK: ret <2 x i32> [[OR_I]] +// CHECK-LABEL: define <2 x i32> @test_vorn_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <2 x i32> [[OR_I]] +// uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) { return vorn_u32(a, b); } -// CHECK-LABEL: @test_vorn_u64( -// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1) -// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]] -// CHECK: ret <1 x i64> [[OR_I]] +// CHECK-LABEL: define <1 x i64> @test_vorn_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <1 x i64> [[OR_I]] +// uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) { return vorn_u64(a, b); } -// CHECK-LABEL: @test_vornq_s8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1) -// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] -// CHECK: ret <16 x i8> [[OR_I]] +// CHECK-LABEL: define <16 x i8> @test_vornq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <16 x i8> [[OR_I]] +// int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) { return vornq_s8(a, b); } -// CHECK-LABEL: @test_vornq_s16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1) -// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] -// CHECK: ret <8 x i16> [[OR_I]] +// CHECK-LABEL: define <8 x i16> @test_vornq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> 
[[A]], [[NOT_I]] +// CHECK-NEXT: ret <8 x i16> [[OR_I]] +// int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) { return vornq_s16(a, b); } -// CHECK-LABEL: @test_vornq_s32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1) -// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] -// CHECK: ret <4 x i32> [[OR_I]] +// CHECK-LABEL: define <4 x i32> @test_vornq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <4 x i32> [[OR_I]] +// int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) { return vornq_s32(a, b); } -// CHECK-LABEL: @test_vornq_s64( -// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1) -// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] -// CHECK: ret <2 x i64> [[OR_I]] +// CHECK-LABEL: define <2 x i64> @test_vornq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <2 x i64> [[OR_I]] +// int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) { return vornq_s64(a, b); } -// CHECK-LABEL: @test_vornq_u8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1) -// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]] -// CHECK: ret <16 x i8> [[OR_I]] +// CHECK-LABEL: define <16 x i8> @test_vornq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <16 x i8> [[OR_I]] +// uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) { return vornq_u8(a, b); } -// CHECK-LABEL: @test_vornq_u16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1) -// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]] -// CHECK: ret <8 x i16> [[OR_I]] +// CHECK-LABEL: define <8 x i16> @test_vornq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <8 x i16> [[OR_I]] +// uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) { return vornq_u16(a, b); } -// CHECK-LABEL: @test_vornq_u32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1) -// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]] -// CHECK: ret <4 x i32> [[OR_I]] +// CHECK-LABEL: define <4 x i32> @test_vornq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1) +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <4 x i32> [[OR_I]] +// uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) { return vornq_u32(a, b); } -// CHECK-LABEL: @test_vornq_u64( -// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1) -// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]] -// CHECK: ret <2 x i64> [[OR_I]] +// CHECK-LABEL: define <2 x i64> @test_vornq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 
-1) +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[NOT_I]] +// CHECK-NEXT: ret <2 x i64> [[OR_I]] +// uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) { return vornq_u64(a, b); } -// CHECK-LABEL: @test_vorr_s8( -// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b -// CHECK: ret <8 x i8> [[OR_I]] +// CHECK-LABEL: define <8 x i8> @test_vorr_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i8> [[OR_I]] +// int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) { return vorr_s8(a, b); } -// CHECK-LABEL: @test_vorr_s16( -// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b -// CHECK: ret <4 x i16> [[OR_I]] +// CHECK-LABEL: define <4 x i16> @test_vorr_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i16> [[OR_I]] +// int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) { return vorr_s16(a, b); } -// CHECK-LABEL: @test_vorr_s32( -// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b -// CHECK: ret <2 x i32> [[OR_I]] +// CHECK-LABEL: define <2 x i32> @test_vorr_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i32> [[OR_I]] +// int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) { return vorr_s32(a, b); } -// CHECK-LABEL: @test_vorr_s64( -// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b -// CHECK: ret <1 x i64> [[OR_I]] +// CHECK-LABEL: define <1 x i64> @test_vorr_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <1 x i64> [[OR_I]] +// int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) { return vorr_s64(a, b); } -// CHECK-LABEL: @test_vorr_u8( -// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b -// CHECK: ret <8 x i8> [[OR_I]] +// CHECK-LABEL: define <8 x i8> @test_vorr_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i8> [[OR_I]] +// uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) { return vorr_u8(a, b); } -// CHECK-LABEL: @test_vorr_u16( -// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b -// CHECK: ret <4 x i16> [[OR_I]] +// CHECK-LABEL: define <4 x i16> @test_vorr_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i16> [[OR_I]] +// uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) { return vorr_u16(a, b); } -// CHECK-LABEL: @test_vorr_u32( -// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b -// CHECK: ret <2 x i32> [[OR_I]] +// CHECK-LABEL: define <2 x i32> @test_vorr_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i32> [[OR_I]] +// uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) { return vorr_u32(a, b); } -// CHECK-LABEL: @test_vorr_u64( -// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b -// CHECK: ret <1 x i64> [[OR_I]] +// CHECK-LABEL: define <1 x i64> @test_vorr_u64( +// CHECK-SAME: <1 x 
i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <1 x i64> [[OR_I]] +// uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) { return vorr_u64(a, b); } -// CHECK-LABEL: @test_vorrq_s8( -// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b -// CHECK: ret <16 x i8> [[OR_I]] +// CHECK-LABEL: define <16 x i8> @test_vorrq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <16 x i8> [[OR_I]] +// int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) { return vorrq_s8(a, b); } -// CHECK-LABEL: @test_vorrq_s16( -// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b -// CHECK: ret <8 x i16> [[OR_I]] +// CHECK-LABEL: define <8 x i16> @test_vorrq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i16> [[OR_I]] +// int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) { return vorrq_s16(a, b); } -// CHECK-LABEL: @test_vorrq_s32( -// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b -// CHECK: ret <4 x i32> [[OR_I]] +// CHECK-LABEL: define <4 x i32> @test_vorrq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i32> [[OR_I]] +// int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) { return vorrq_s32(a, b); } -// CHECK-LABEL: @test_vorrq_s64( -// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b -// CHECK: ret <2 x i64> [[OR_I]] +// CHECK-LABEL: define <2 x i64> @test_vorrq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i64> [[OR_I]] +// int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) { return vorrq_s64(a, b); } -// CHECK-LABEL: @test_vorrq_u8( -// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b -// CHECK: ret <16 x i8> [[OR_I]] +// CHECK-LABEL: define <16 x i8> @test_vorrq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <16 x i8> [[OR_I]] +// uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) { return vorrq_u8(a, b); } -// CHECK-LABEL: @test_vorrq_u16( -// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b -// CHECK: ret <8 x i16> [[OR_I]] +// CHECK-LABEL: define <8 x i16> @test_vorrq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i16> [[OR_I]] +// uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) { return vorrq_u16(a, b); } -// CHECK-LABEL: @test_vorrq_u32( -// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b -// CHECK: ret <4 x i32> [[OR_I]] +// CHECK-LABEL: define <4 x i32> @test_vorrq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i32> [[OR_I]] +// uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) { return vorrq_u32(a, b); } -// CHECK-LABEL: @test_vorrq_u64( -// CHECK: 
[[OR_I:%.*]] = or <2 x i64> %a, %b -// CHECK: ret <2 x i64> [[OR_I]] +// CHECK-LABEL: define <2 x i64> @test_vorrq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i64> [[OR_I]] +// uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) { return vorrq_u64(a, b); } -// CHECK-LABEL: @test_vpadal_s8( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) -// CHECK: ret <4 x i16> [[VPADAL_V1_I]] +// CHECK-LABEL: define <4 x i16> @test_vpadal_s8( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]] +// int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) { return vpadal_s8(a, b); } -// CHECK-LABEL: @test_vpadal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) -// CHECK: ret <2 x i32> [[VPADAL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpadal_s16( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]] +// int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) { return vpadal_s16(a, b); } -// CHECK-LABEL: @test_vpadal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) -// CHECK: ret <1 x i64> [[VPADAL_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vpadal_s32( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) +// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]] +// int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) { return vpadal_s32(a, b); } -// CHECK-LABEL: @test_vpadal_u8( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) -// CHECK: ret <4 x i16> [[VPADAL_V1_I]] +// CHECK-LABEL: 
define <4 x i16> @test_vpadal_u8( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]] +// uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) { return vpadal_u8(a, b); } -// CHECK-LABEL: @test_vpadal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) -// CHECK: ret <2 x i32> [[VPADAL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpadal_u16( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) +// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]] +// uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) { return vpadal_u16(a, b); } -// CHECK-LABEL: @test_vpadal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) -// CHECK: ret <1 x i64> [[VPADAL_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vpadal_u32( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) +// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]] +// uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) { return vpadal_u32(a, b); } -// CHECK-LABEL: @test_vpadalq_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) -// CHECK: ret <8 x i16> [[VPADALQ_V1_I]] +// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]] +// int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) { return vpadalq_s8(a, b); } -// CHECK-LABEL: @test_vpadalq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// 
CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) -// CHECK: ret <4 x i32> [[VPADALQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]] +// int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) { return vpadalq_s16(a, b); } -// CHECK-LABEL: @test_vpadalq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) -// CHECK: ret <2 x i64> [[VPADALQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]] +// int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) { return vpadalq_s32(a, b); } -// CHECK-LABEL: @test_vpadalq_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) -// CHECK: ret <8 x i16> [[VPADALQ_V1_I]] +// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]] +// uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) { return vpadalq_u8(a, b); } -// CHECK-LABEL: @test_vpadalq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) -// CHECK: ret <4 x i32> [[VPADALQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: 
[[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) +// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]] +// uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) { return vpadalq_u16(a, b); } -// CHECK-LABEL: @test_vpadalq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) -// CHECK: ret <2 x i64> [[VPADALQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) +// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]] +// uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) { return vpadalq_u32(a, b); } -// CHECK-LABEL: @test_vpadd_s8( -// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]] +// int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { return vpadd_s8(a, b); } -// CHECK-LABEL: @test_vpadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPADD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpadd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) +// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { return vpadd_s16(a, b); } -// CHECK-LABEL: @test_vpadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> 
[[VPADD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpadd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) +// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { return vpadd_s32(a, b); } -// CHECK-LABEL: @test_vpadd_u8( -// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpadd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]] +// uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { return vpadd_u8(a, b); } -// CHECK-LABEL: @test_vpadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPADD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) +// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { return vpadd_u16(a, b); } -// CHECK-LABEL: @test_vpadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPADD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> 
@llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) +// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { return vpadd_u32(a, b); } -// CHECK-LABEL: @test_vpadd_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VPADD_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vpadd_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) +// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { return vpadd_f32(a, b); } -// CHECK-LABEL: @test_vpaddl_s8( -// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) -// CHECK: ret <4 x i16> [[VPADDL_I]] +// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]] +// int16x4_t test_vpaddl_s8(int8x8_t a) { return vpaddl_s8(a); } -// CHECK-LABEL: @test_vpaddl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) -// CHECK: ret <2 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) +// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]] +// int32x2_t test_vpaddl_s16(int16x4_t a) { return vpaddl_s16(a); } -// CHECK-LABEL: @test_vpaddl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) -// CHECK: ret <1 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: 
[[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) +// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]] +// int64x1_t test_vpaddl_s32(int32x2_t a) { return vpaddl_s32(a); } -// CHECK-LABEL: @test_vpaddl_u8( -// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) -// CHECK: ret <4 x i16> [[VPADDL_I]] +// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]] +// uint16x4_t test_vpaddl_u8(uint8x8_t a) { return vpaddl_u8(a); } -// CHECK-LABEL: @test_vpaddl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) -// CHECK: ret <2 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) +// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]] +// uint32x2_t test_vpaddl_u16(uint16x4_t a) { return vpaddl_u16(a); } -// CHECK-LABEL: @test_vpaddl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) -// CHECK: ret <1 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) +// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]] +// uint64x1_t test_vpaddl_u32(uint32x2_t a) { return vpaddl_u32(a); } -// CHECK-LABEL: @test_vpaddlq_s8( -// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) -// CHECK: ret <8 x i16> [[VPADDL_I]] +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]] +// int16x8_t test_vpaddlq_s8(int8x16_t a) { return vpaddlq_s8(a); } -// CHECK-LABEL: @test_vpaddlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) -// CHECK: ret <4 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]] +// int32x4_t test_vpaddlq_s16(int16x8_t a) { return vpaddlq_s16(a); } -// CHECK-LABEL: 
@test_vpaddlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) -// CHECK: ret <2 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]] +// int64x2_t test_vpaddlq_s32(int32x4_t a) { return vpaddlq_s32(a); } -// CHECK-LABEL: @test_vpaddlq_u8( -// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) -// CHECK: ret <8 x i16> [[VPADDL_I]] +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]] +// uint16x8_t test_vpaddlq_u8(uint8x16_t a) { return vpaddlq_u8(a); } -// CHECK-LABEL: @test_vpaddlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) -// CHECK: ret <4 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]] +// uint32x4_t test_vpaddlq_u16(uint16x8_t a) { return vpaddlq_u16(a); } -// CHECK-LABEL: @test_vpaddlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) -// CHECK: ret <2 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]] +// uint64x2_t test_vpaddlq_u32(uint32x4_t a) { return vpaddlq_u32(a); } -// CHECK-LABEL: @test_vpmax_s8( -// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmax_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMAX_V_I]] +// int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); } -// CHECK-LABEL: @test_vpmax_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) -// 
CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpmax_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) +// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); } -// CHECK-LABEL: @test_vpmax_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmax_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) +// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); } -// CHECK-LABEL: @test_vpmax_u8( -// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmax_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMAX_V_I]] +// uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); } -// CHECK-LABEL: @test_vpmax_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpmax_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: 
[[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) +// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); } -// CHECK-LABEL: @test_vpmax_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmax_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) +// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); } -// CHECK-LABEL: @test_vpmax_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VPMAX_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vpmax_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[VPMAX_V_I]], <2 x float> [[VPMAX_V1_I]]) +// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); } -// CHECK-LABEL: @test_vpmin_s8( -// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmin_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMIN_V_I]] +// int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); } -// CHECK-LABEL: @test_vpmin_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpmin_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) +// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); } -// CHECK-LABEL: @test_vpmin_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmin_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) +// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); } -// CHECK-LABEL: @test_vpmin_u8( -// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmin_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMIN_V_I]] +// uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); } -// CHECK-LABEL: @test_vpmin_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = 
bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpmin_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) +// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); } -// CHECK-LABEL: @test_vpmin_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmin_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) +// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); } -// CHECK-LABEL: @test_vpmin_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VPMIN_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vpmin_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[VPMIN_V_I]], <2 x float> [[VPMIN_V1_I]]) +// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x 
float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); } -// CHECK-LABEL: @test_vqabs_s8( -// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VQABS_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqabs_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VQABS_V_I]] +// int8x8_t test_vqabs_s8(int8x8_t a) { return vqabs_s8(a); } -// CHECK-LABEL: @test_vqabs_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQABS_V1_I]] +// CHECK-LABEL: define <4 x i16> @test_vqabs_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[VQABS_V_I]]) +// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP1]] +// int16x4_t test_vqabs_s16(int16x4_t a) { return vqabs_s16(a); } -// CHECK-LABEL: @test_vqabs_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQABS_V1_I]] +// CHECK-LABEL: define <2 x i32> @test_vqabs_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[VQABS_V_I]]) +// CHECK-NEXT: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP1]] +// int32x2_t test_vqabs_s32(int32x2_t a) { return vqabs_s32(a); } -// CHECK-LABEL: @test_vqabsq_s8( -// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VQABSQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VQABSQ_V_I]] +// int8x16_t test_vqabsq_s8(int8x16_t a) { return vqabsq_s8(a); } -// CHECK-LABEL: @test_vqabsq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) -// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQABSQ_V1_I]] +// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// 
CHECK-NEXT: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) +// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// int16x8_t test_vqabsq_s16(int16x8_t a) { return vqabsq_s16(a); } -// CHECK-LABEL: @test_vqabsq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) -// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQABSQ_V1_I]] +// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) +// CHECK-NEXT: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// int32x4_t test_vqabsq_s32(int32x4_t a) { return vqabsq_s32(a); } -// CHECK-LABEL: @test_vqadd_s8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]] +// int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { return vqadd_s8(a, b); } -// CHECK-LABEL: @test_vqadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqadd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { return vqadd_s16(a, b); } -// CHECK-LABEL: @test_vqadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQADD_V2_I]] 
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { return vqadd_s32(a, b); } -// CHECK-LABEL: @test_vqadd_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQADD_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqadd_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { return vqadd_s64(a, b); } -// CHECK-LABEL: @test_vqadd_u8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqadd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]] +// uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { return vqadd_u8(a, b); } -// CHECK-LABEL: @test_vqadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> 
[[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { return vqadd_u16(a, b); } -// CHECK-LABEL: @test_vqadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { return vqadd_u32(a, b); } -// CHECK-LABEL: @test_vqadd_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQADD_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqadd_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) +// CHECK-NEXT: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { return vqadd_u64(a, b); } -// CHECK-LABEL: @test_vqaddq_s8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQADDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]] +// int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) { 
return vqaddq_s8(a, b); } -// CHECK-LABEL: @test_vqaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) { return vqaddq_s16(a, b); } -// CHECK-LABEL: @test_vqaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) { return vqaddq_s32(a, b); } -// CHECK-LABEL: @test_vqaddq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to 
<16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) { return vqaddq_s64(a, b); } -// CHECK-LABEL: @test_vqaddq_u8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQADDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]] +// uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) { return vqaddq_u8(a, b); } -// CHECK-LABEL: @test_vqaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) { return vqaddq_u16(a, b); } -// CHECK-LABEL: @test_vqaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) { return vqaddq_u32(a, b); } -// CHECK-LABEL: @test_vqaddq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: 
[[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) +// CHECK-NEXT: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) { return vqaddq_u64(a, b); } -// CHECK-LABEL: @test_vqdmlal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) -// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] +// int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) -// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] 
= call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] +// int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmlal_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) -// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] +// int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vqdmlal_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) -// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] +// int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vqdmlal_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3 -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) -// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) -// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]] +// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]] +// int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlal_n_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlal_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1 -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: 
[[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) -// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]] +// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) +// CHECK-NEXT: [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]] +// int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlal_n_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmlsl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) -// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] +// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) +// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] +// int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlsl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) -// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_s32(a, b, c);
 }
-// CHECK-LABEL: @test_vqdmlsl_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
+//
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_lane_s16(a, b, c, 3);
 }
-// CHECK-LABEL: @test_vqdmlsl_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_lane_s32(a, b, c, 1);
 }
-// CHECK-LABEL: @test_vqdmlsl_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
+//
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlsl_n_s16(a, b, c);
 }
-// CHECK-LABEL: @test_vqdmlsl_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
+//
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlsl_n_s32(a, b, c);
 }
-// CHECK-LABEL: @test_vqdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_s16(a, b);
 }
-// CHECK-LABEL: @test_vqdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_s32(a, b);
 }
-// CHECK-LABEL: @test_vqdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqdmulhq_s16(a, b);
 }
-// CHECK-LABEL: @test_vqdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqdmulhq_s32(a, b);
 }
-// CHECK-LABEL: @test_vqdmulh_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+//
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vqdmulh_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP4]]
+//
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: @test_vqdmulhq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+//
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqdmulhq_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vqdmulhq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+//
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqdmulhq_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: @test_vqdmulh_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I_I]], <4 x i16> [[VQDMULH_V1_I_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqdmulh_n_s16(a, b);
 }
-// CHECK-LABEL: @test_vqdmulh_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULH_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I_I]], <2 x i32> [[VQDMULH_V1_I_I]])
+// CHECK-NEXT: [[VQDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqdmulh_n_s32(a, b);
 }
-// CHECK-LABEL: @test_vqdmulhq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
-// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
+// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
+// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I_I]], <8 x i16> [[VQDMULHQ_V1_I_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqdmulhq_n_s16(a, b);
 }
-// CHECK-LABEL: @test_vqdmulhq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
-// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK-NEXT: [[VQDMULHQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I_I]], <4 x i32> [[VQDMULHQ_V1_I_I]])
+// CHECK-NEXT: [[VQDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqdmulhq_n_s32(a, b);
 }
-// CHECK-LABEL: @test_vqdmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_s16(a, b);
 }
-// CHECK-LABEL: @test_vqdmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_s32(a, b);
 }
-// CHECK-LABEL: @test_vqdmull_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
++//
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vqdmull_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP4]]
+//
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: @test_vqdmull_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
   return vqdmull_n_s16(a, b);
 }
-// CHECK-LABEL: @test_vqdmull_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT: [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]])
+// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
   return vqdmull_n_s32(a, b);
 }
-// CHECK-LABEL: @test_vqmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[VQMOVN_V_I]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
 int8x8_t test_vqmovn_s16(int16x8_t a) {
   return vqmovn_s16(a);
 }
-// CHECK-LABEL: @test_vqmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[VQMOVN_V_I]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vqmovn_s32(int32x4_t a) {
   return vqmovn_s32(a);
 }
-// CHECK-LABEL: @test_vqmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[VQMOVN_V_I]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vqmovn_s64(int64x2_t a) {
   return vqmovn_s64(a);
 }
-// CHECK-LABEL: @test_vqmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[VQMOVN_V_I]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
   return vqmovn_u16(a);
 }
-// CHECK-LABEL: @test_vqmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[VQMOVN_V_I]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
   return vqmovn_u32(a);
 }
-// CHECK-LABEL: @test_vqmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[VQMOVN_V_I]])
+// CHECK-NEXT: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
   return vqmovn_u64(a);
 }
-// CHECK-LABEL: @test_vqmovun_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[VQMOVUN_V_I]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVUN_V1_I]]
+//
 uint8x8_t test_vqmovun_s16(int16x8_t a) {
   return vqmovun_s16(a);
 }
-// CHECK-LABEL: @test_vqmovun_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[VQMOVUN_V_I]])
+// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 uint16x4_t test_vqmovun_s32(int32x4_t a) {
   return vqmovun_s32(a);
 }
-// CHECK-LABEL: @test_vqmovun_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[VQMOVUN_V_I]])
+// CHECK-NEXT: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 uint32x2_t test_vqmovun_s64(int64x2_t a) {
   return vqmovun_s64(a);
 }
-// CHECK-LABEL: @test_vqneg_s8(
-// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQNEG_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQNEG_V_I]]
+//
 int8x8_t test_vqneg_s8(int8x8_t a) {
   return vqneg_s8(a);
 }
-// CHECK-LABEL: @test_vqneg_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[VQNEG_V_I]])
+// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vqneg_s16(int16x4_t a) {
   return vqneg_s16(a);
 }
-// CHECK-LABEL: @test_vqneg_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[VQNEG_V_I]])
+// CHECK-NEXT: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP1]]
+//
 int32x2_t test_vqneg_s32(int32x2_t a) {
   return vqneg_s32(a);
 }
-// CHECK-LABEL: @test_vqnegq_s8(
-// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQNEGQ_V_I]]
+//
 int8x16_t test_vqnegq_s8(int8x16_t a) {
   return vqnegq_s8(a);
 }
-// CHECK-LABEL: @test_vqnegq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]])
+// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vqnegq_s16(int16x8_t a) {
   return vqnegq_s16(a);
 }
-// CHECK-LABEL: @test_vqnegq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]])
+// CHECK-NEXT: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
 int32x4_t test_vqnegq_s32(int32x4_t a) {
   return vqnegq_s32(a);
 }
-// CHECK-LABEL: @test_vqrdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_s16(a, b);
 }
-// CHECK-LABEL: @test_vqrdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_s32(a, b);
 }
-// CHECK-LABEL: @test_vqrdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqrdmulhq_s16(a, b);
 }
-// CHECK-LABEL: @test_vqrdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqrdmulhq_s32(a, b);
 }
-// CHECK-LABEL: @test_vqrdmulh_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP4]]
+//
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vqrdmulh_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]])
+// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP4]]
+//
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP4]]
+//
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqrdmulhq_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]])
+// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP4]]
+//
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqrdmulhq_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: @test_vqrdmulh_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i32 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
+// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT: [[VQRDMULH_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I_I]], <4 x i16> [[VQRDMULH_V1_I_I]])
+// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqrdmulh_n_s16(a, b);
 }
-// CHECK-LABEL: @test_vqrdmulh_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V3_I:%.*]]
= call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]] +// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> +// CHECK-NEXT: [[VQRDMULH_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQRDMULH_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I_I]], <2 x i32> [[VQRDMULH_V1_I_I]]) +// CHECK-NEXT: [[VQRDMULH_V3_I_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); } -// CHECK-LABEL: @test_vqrdmulhq_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]]) -// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]] +// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8> +// CHECK-NEXT: [[VQRDMULHQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x 
i16> +// CHECK-NEXT: [[VQRDMULHQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I_I]], <8 x i16> [[VQRDMULHQ_V1_I_I]]) +// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); } -// CHECK-LABEL: @test_vqrdmulhq_n_s32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3 -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]]) -// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]] +// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8> +// CHECK-NEXT: [[VQRDMULHQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRDMULHQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I_I]], <4 x i32> [[VQRDMULHQ_V1_I_I]]) +// CHECK-NEXT: [[VQRDMULHQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); } -// CHECK-LABEL: @test_vqrshl_s8( -// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQRSHL_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]] +// int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { return vqrshl_s8(a, b); } -// CHECK-LABEL: @test_vqrshl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> 
[[VQRSHL_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) +// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { return vqrshl_s16(a, b); } -// CHECK-LABEL: @test_vqrshl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQRSHL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) +// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { return vqrshl_s32(a, b); } -// CHECK-LABEL: @test_vqrshl_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQRSHL_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) +// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { return vqrshl_s64(a, b); } -// CHECK-LABEL: 
@test_vqrshl_u8( -// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQRSHL_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]] +// uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { return vqrshl_u8(a, b); } -// CHECK-LABEL: @test_vqrshl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQRSHL_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) +// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { return vqrshl_u16(a, b); } -// CHECK-LABEL: @test_vqrshl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQRSHL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) +// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { return vqrshl_u32(a, b); } -// CHECK-LABEL: @test_vqrshl_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQRSHL_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64( +// CHECK-SAME: <1 x i64> noundef 
[[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) +// CHECK-NEXT: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); } -// CHECK-LABEL: @test_vqrshlq_s8( -// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]] +// int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { return vqrshlq_s8(a, b); } -// CHECK-LABEL: @test_vqrshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) +// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { return vqrshlq_s16(a, b); } -// CHECK-LABEL: @test_vqrshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) +// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { return vqrshlq_s32(a, b); } -// CHECK-LABEL: @test_vqrshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) +// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); } -// CHECK-LABEL: @test_vqrshlq_u8( -// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]] +// uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); } -// CHECK-LABEL: @test_vqrshlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) +// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> 
[[VQRSHLQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { return vqrshlq_u16(a, b); } -// CHECK-LABEL: @test_vqrshlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) +// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { return vqrshlq_u32(a, b); } -// CHECK-LABEL: @test_vqrshlq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) +// CHECK-NEXT: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); } -// CHECK-LABEL: @test_vqrshrn_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i8> [[VQRSHRN_N1]] +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]] +// int8x8_t 
test_vqrshrn_n_s16(int16x8_t a) { return vqrshrn_n_s16(a, 1); } -// CHECK-LABEL: @test_vqrshrn_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i16> [[VQRSHRN_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]] +// int16x4_t test_vqrshrn_n_s32(int32x4_t a) { return vqrshrn_n_s32(a, 1); } -// CHECK-LABEL: @test_vqrshrn_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i32> [[VQRSHRN_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]] +// int32x2_t test_vqrshrn_n_s64(int64x2_t a) { return vqrshrn_n_s64(a, 1); } -// CHECK-LABEL: @test_vqrshrn_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i8> [[VQRSHRN_N1]] +// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]] +// uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) { return vqrshrn_n_u16(a, 1); } -// CHECK-LABEL: @test_vqrshrn_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i16> [[VQRSHRN_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]] +// uint16x4_t 
test_vqrshrn_n_u32(uint32x4_t a) { return vqrshrn_n_u32(a, 1); } -// CHECK-LABEL: @test_vqrshrn_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i32> [[VQRSHRN_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]] +// uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) { return vqrshrn_n_u64(a, 1); } -// CHECK-LABEL: @test_vqrshrun_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i8> [[VQRSHRUN_N1]] +// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i8> [[VQRSHRUN_N1]] +// uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { return vqrshrun_n_s16(a, 1); } -// CHECK-LABEL: @test_vqrshrun_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i16> [[VQRSHRUN_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i16> [[VQRSHRUN_N1]] +// uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { return vqrshrun_n_s32(a, 1); } -// CHECK-LABEL: @test_vqrshrun_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i32> [[VQRSHRUN_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i32> 
[[VQRSHRUN_N1]] +// uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { return vqrshrun_n_s64(a, 1); } -// CHECK-LABEL: @test_vqshl_s8( -// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQSHL_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqshl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]] +// int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) { return vqshl_s8(a, b); } -// CHECK-LABEL: @test_vqshl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSHL_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqshl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) { return vqshl_s16(a, b); } -// CHECK-LABEL: @test_vqshl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSHL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqshl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) { return vqshl_s32(a, b); } -// CHECK-LABEL: @test_vqshl_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSHL_V2_I]] +// CHECK-LABEL: define <1 x i64> 
@test_vqshl_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) { return vqshl_s64(a, b); } -// CHECK-LABEL: @test_vqshl_u8( -// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQSHL_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqshl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]] +// uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) { return vqshl_u8(a, b); } -// CHECK-LABEL: @test_vqshl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSHL_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqshl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) { return vqshl_u16(a, b); } -// CHECK-LABEL: @test_vqshl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSHL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqshl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] 
= bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) { return vqshl_u32(a, b); } -// CHECK-LABEL: @test_vqshl_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSHL_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqshl_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) +// CHECK-NEXT: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) { return vqshl_u64(a, b); } -// CHECK-LABEL: @test_vqshlq_s8( -// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]] +// int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) { return vqshlq_s8(a, b); } -// CHECK-LABEL: @test_vqshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x 
i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) { return vqshlq_s16(a, b); } -// CHECK-LABEL: @test_vqshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) { return vqshlq_s32(a, b); } -// CHECK-LABEL: @test_vqshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) { return vqshlq_s64(a, b); } -// CHECK-LABEL: @test_vqshlq_u8( -// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]] +// uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) { return vqshlq_u8(a, b); } -// CHECK-LABEL: @test_vqshlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast 
<8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) { return vqshlq_u16(a, b); } -// CHECK-LABEL: @test_vqshlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) { return vqshlq_u32(a, b); } -// CHECK-LABEL: @test_vqshlq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) +// CHECK-NEXT: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) { return vqshlq_u64(a, b); } -// CHECK-LABEL: @test_vqshlu_n_s8( -// CHECK: 
[[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1) -// CHECK: ret <8 x i8> [[VQSHLU_N]] +// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1)) +// CHECK-NEXT: ret <8 x i8> [[VQSHLU_N]] +// uint8x8_t test_vqshlu_n_s8(int8x8_t a) { return vqshlu_n_s8(a, 1); } -// CHECK-LABEL: @test_vqshlu_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 1) -// CHECK: ret <4 x i16> [[VQSHLU_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 1)) +// CHECK-NEXT: ret <4 x i16> [[VQSHLU_N1]] +// uint16x4_t test_vqshlu_n_s16(int16x4_t a) { return vqshlu_n_s16(a, 1); } -// CHECK-LABEL: @test_vqshlu_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 1)) -// CHECK: ret <2 x i32> [[VQSHLU_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 1)) +// CHECK-NEXT: ret <2 x i32> [[VQSHLU_N1]] +// uint32x2_t test_vqshlu_n_s32(int32x2_t a) { return vqshlu_n_s32(a, 1); } -// CHECK-LABEL: @test_vqshlu_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1)) -// CHECK: ret <1 x i64> [[VQSHLU_N1]] +// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1)) +// CHECK-NEXT: ret <1 x i64> [[VQSHLU_N1]] +// uint64x1_t test_vqshlu_n_s64(int64x1_t a) { return vqshlu_n_s64(a, 1); } -// CHECK-LABEL: @test_vqshluq_n_s8( -// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1)) -// CHECK: ret <16 x i8> [[VQSHLU_N]] +// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1)) 
+// CHECK-NEXT: ret <16 x i8> [[VQSHLU_N]]
+//
 uint8x16_t test_vqshluq_n_s8(int8x16_t a) { return vqshluq_n_s8(a, 1); }
-// CHECK-LABEL: @test_vqshluq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VQSHLU_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VQSHLU_N1]]
+//
 uint16x8_t test_vqshluq_n_s16(int16x8_t a) { return vqshluq_n_s16(a, 1); }
-// CHECK-LABEL: @test_vqshluq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VQSHLU_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VQSHLU_N1]]
+//
 uint32x4_t test_vqshluq_n_s32(int32x4_t a) { return vqshluq_n_s32(a, 1); }
-// CHECK-LABEL: @test_vqshluq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VQSHLU_N1]]
+//
 uint64x2_t test_vqshluq_n_s64(int64x2_t a) { return vqshluq_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
 int8x8_t test_vqshl_n_s8(int8x8_t a) { return vqshl_n_s8(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
 int16x4_t test_vqshl_n_s16(int16x4_t a) { return vqshl_n_s16(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
 int32x2_t test_vqshl_n_s32(int32x2_t a) { return vqshl_n_s32(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
 int64x1_t test_vqshl_n_s64(int64x1_t a) { return vqshl_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
 uint8x8_t test_vqshl_n_u8(uint8x8_t a) { return vqshl_n_u8(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
 uint16x4_t test_vqshl_n_u16(uint16x4_t a) { return vqshl_n_u16(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
 uint32x2_t test_vqshl_n_u32(uint32x2_t a) { return vqshl_n_u32(a, 1); }
-// CHECK-LABEL: @test_vqshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) { return vqshl_n_u64(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
 int8x16_t test_vqshlq_n_s8(int8x16_t a) { return vqshlq_n_s8(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
 int16x8_t test_vqshlq_n_s16(int16x8_t a) { return vqshlq_n_s16(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
 int32x4_t test_vqshlq_n_s32(int32x4_t a) { return vqshlq_n_s32(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
 int64x2_t test_vqshlq_n_s64(int64x2_t a) { return vqshlq_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) { return vqshlq_n_u8(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) { return vqshlq_n_u16(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) { return vqshlq_n_u32(a, 1); }
-// CHECK-LABEL: @test_vqshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) { return vqshlq_n_u64(a, 1); }
-// CHECK-LABEL: @test_vqshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
 int8x8_t test_vqshrn_n_s16(int16x8_t a) { return vqshrn_n_s16(a, 1); }
-// CHECK-LABEL: @test_vqshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
 int16x4_t test_vqshrn_n_s32(int32x4_t a) { return vqshrn_n_s32(a, 1); }
-// CHECK-LABEL: @test_vqshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
 int32x2_t test_vqshrn_n_s64(int64x2_t a) { return vqshrn_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) { return vqshrn_n_u16(a, 1); }
-// CHECK-LABEL: @test_vqshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) { return vqshrn_n_u32(a, 1); }
-// CHECK-LABEL: @test_vqshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) { return vqshrn_n_u64(a, 1); }
-// CHECK-LABEL: @test_vqshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQSHRUN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQSHRUN_N1]]
+//
 uint8x8_t test_vqshrun_n_s16(int16x8_t a) { return vqshrun_n_s16(a, 1); }
-// CHECK-LABEL: @test_vqshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQSHRUN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQSHRUN_N1]]
+//
 uint16x4_t test_vqshrun_n_s32(int32x4_t a) { return vqshrun_n_s32(a, 1); }
-// CHECK-LABEL: @test_vqshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQSHRUN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQSHRUN_N1]]
+//
 uint32x2_t test_vqshrun_n_s64(int64x2_t a) { return vqshrun_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqsub_s8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); }
-// CHECK-LABEL: @test_vqsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { return vqsub_s16(a, b); }
-// CHECK-LABEL: @test_vqsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); }
-// CHECK-LABEL: @test_vqsub_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); }
-// CHECK-LABEL: @test_vqsub_u8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); }
-// CHECK-LABEL: @test_vqsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); }
-// CHECK-LABEL: @test_vqsub_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); }
-// CHECK-LABEL: @test_vqsub_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]])
+// CHECK-NEXT: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) { return vqsub_u64(a, b); }
-// CHECK-LABEL: @test_vqsubq_s8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); }
-// CHECK-LABEL: @test_vqsubq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { return vqsubq_s16(a, b); }
-// CHECK-LABEL: @test_vqsubq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { return vqsubq_s32(a, b); }
-// CHECK-LABEL: @test_vqsubq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); }
-// CHECK-LABEL: @test_vqsubq_u8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); }
-// CHECK-LABEL: @test_vqsubq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { return vqsubq_u16(a, b); }
-// CHECK-LABEL: @test_vqsubq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { return vqsubq_u32(a, b); }
-// CHECK-LABEL: @test_vqsubq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]])
+// CHECK-NEXT: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); }
-// CHECK-LABEL: @test_vraddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); }
-// CHECK-LABEL: @test_vraddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); }
-// CHECK-LABEL: @test_vraddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); }
-// CHECK-LABEL: @test_vraddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); }
-// CHECK-LABEL: @test_vraddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); }
-// CHECK-LABEL: @test_vraddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]])
+// CHECK-NEXT: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); }
-// CHECK-LABEL: @test_vrecpe_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRECPE_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[VRECPE_V_I]])
+// CHECK-NEXT: ret <2 x float> [[VRECPE_V1_I]]
+//
 float32x2_t test_vrecpe_f32(float32x2_t a) { return vrecpe_f32(a); }
-// CHECK-LABEL: @test_vrecpe_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRECPE_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[VRECPE_V_I]])
+// CHECK-NEXT: ret <2 x i32> [[VRECPE_V1_I]]
+//
 uint32x2_t test_vrecpe_u32(uint32x2_t a) { return vrecpe_u32(a); }
-// CHECK-LABEL: @test_vrecpeq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[VRECPEQ_V_I]])
+// CHECK-NEXT: ret <4 x float> [[VRECPEQ_V1_I]]
+//
 float32x4_t test_vrecpeq_f32(float32x4_t a) { return vrecpeq_f32(a); }
-// CHECK-LABEL: @test_vrecpeq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]])
+// CHECK-NEXT: ret <4 x i32> [[VRECPEQ_V1_I]]
+//
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) { return vrecpeq_u32(a); }
-// CHECK-LABEL: @test_vrecps_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VRECPS_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vrecps_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]])
+// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP5]]
+//
 float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) { return vrecps_f32(a, b); }
-// CHECK-LABEL: @test_vrecpsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VRECPSQ_V2_I]]
+// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]])
+// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP5]]
+//
 float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) { return vrecpsq_f32(a, b); }
-// CHECK-LABEL: @test_vreinterpret_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
 uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to i64
+// CHECK-NEXT: [[__P0_ADDR_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP0]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[__P0_ADDR_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP1]]
+//
 float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP1]]
+//
 float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP1]]
+//
 float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP1]]
+//
 float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
 float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
 float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
 float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
 float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
 float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
 float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
 float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
 poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { return vreinterpret_p16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP1]]
+//
 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP1]]
+//
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP1]]
+//
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return
vreinterpretq_s32_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u32( -// CHECK: ret <4 x i32> %a +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i32> [[A]] +// int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16( +// CHECK-SAME: <8 x i16> 
noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u64( -// CHECK: ret <2 x i64> %a +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i64> [[A]] +// int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// 
CHECK-LABEL: @test_vreinterpretq_s64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_p8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// 
uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) { return vreinterpretq_u16_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> 
@test_vreinterpretq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_p16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_s32( -// CHECK: ret <4 x i32> %a +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i32> [[A]] +// uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); } -// 
CHECK-LABEL: @test_vreinterpretq_u32_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: 
ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_s64( -// CHECK: ret <2 x i64> %a +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i64> [[A]] +// uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64> +// CHECK-NEXT: ret 
<2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// 
float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_f16_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[TMP0]] +// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); } -// CHECK-LABEL: 
@test_vreinterpretq_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float> -// CHECK: ret <4 x float> [[TMP0]] +// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_s8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); } -// CHECK-LABEL: 
@test_vreinterpretq_p8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32( +// CHECK-SAME: <4 x float> 
noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u32( -// CHECK: [[TMP0:%.*]] = 
bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); } -// CHECK-LABEL: @test_vrev16_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x i8> @test_vrev16_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); } -// CHECK-LABEL: @test_vrev16_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x i8> @test_vrev16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); } -// CHECK-LABEL: @test_vrev16_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x 
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); }
-// CHECK-LABEL: @test_vrev16q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); }
-// CHECK-LABEL: @test_vrev16q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); }
-// CHECK-LABEL: @test_vrev16q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); }
-// CHECK-LABEL: @test_vrev32_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); }
-// CHECK-LABEL: @test_vrev32_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); }
-// CHECK-LABEL: @test_vrev32_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); }
-// CHECK-LABEL: @test_vrev32_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); }
-// CHECK-LABEL: @test_vrev32_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); }
-// CHECK-LABEL: @test_vrev32_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); }
-// CHECK-LABEL: @test_vrev32q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); }
-// CHECK-LABEL: @test_vrev32q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); }
-// CHECK-LABEL: @test_vrev32q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); }
-// CHECK-LABEL: @test_vrev32q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); }
-// CHECK-LABEL: @test_vrev32q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); }
-// CHECK-LABEL: @test_vrev32q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); }
-// CHECK-LABEL: @test_vrev64_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); }
-// CHECK-LABEL: @test_vrev64_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); }
-// CHECK-LABEL: @test_vrev64_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); }
-// CHECK-LABEL: @test_vrev64_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); }
-// CHECK-LABEL: @test_vrev64_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); }
-// CHECK-LABEL: @test_vrev64_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); }
-// CHECK-LABEL: @test_vrev64_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); }
-// CHECK-LABEL: @test_vrev64_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); }
-// CHECK-LABEL: @test_vrev64_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vrev64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[A]], <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); }
-// CHECK-LABEL: @test_vrev64q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); }
-// CHECK-LABEL: @test_vrev64q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); }
-// CHECK-LABEL: @test_vrev64q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); }
-// CHECK-LABEL: @test_vrev64q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); }
-// CHECK-LABEL: @test_vrev64q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); }
-// CHECK-LABEL: @test_vrev64q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); }
-// CHECK-LABEL: @test_vrev64q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); }
-// CHECK-LABEL: @test_vrev64q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[A]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); }
-// CHECK-LABEL: @test_vrev64q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[A]], <4 x i32>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); }
-// CHECK-LABEL: @test_vrhadd_s8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
 int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) { return vrhadd_s8(a, b); }
-// CHECK-LABEL: @test_vrhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) { return vrhadd_s16(a, b); }
-// CHECK-LABEL: @test_vrhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) { return vrhadd_s32(a, b); }
-// CHECK-LABEL: @test_vrhadd_u8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
 uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) { return vrhadd_u8(a, b); }
-// CHECK-LABEL: @test_vrhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) { return vrhadd_u16(a, b); }
-// CHECK-LABEL: @test_vrhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]])
+// CHECK-NEXT: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) { return vrhadd_u32(a, b); }
-// CHECK-LABEL: @test_vrhaddq_s8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
 int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) { return vrhaddq_s8(a, b); }
-// CHECK-LABEL: @test_vrhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) { return vrhaddq_s16(a, b); }
-// CHECK-LABEL: @test_vrhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) { return vrhaddq_s32(a, b); }
-// CHECK-LABEL: @test_vrhaddq_u8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
 uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) { return vrhaddq_u8(a, b); }
-// CHECK-LABEL: @test_vrhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) { return vrhaddq_u16(a, b); }
-// CHECK-LABEL: @test_vrhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]])
+// CHECK-NEXT: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) { return vrhaddq_u32(a, b); }
-// CHECK-LABEL: @test_vrshl_s8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) { return vrshl_s8(a, b); }
-// CHECK-LABEL: @test_vrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) { return vrshl_s16(a, b); }
-// CHECK-LABEL: @test_vrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) { return vrshl_s32(a, b); }
-// CHECK-LABEL: @test_vrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) { return vrshl_s64(a, b); }
-// CHECK-LABEL: @test_vrshl_u8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
 uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) { return vrshl_u8(a, b); }
-// CHECK-LABEL: @test_vrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP2]]
+//
 uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) { return vrshl_u16(a, b); }
-// CHECK-LABEL: @test_vrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP2]]
+//
 uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) { return vrshl_u32(a, b); }
-// CHECK-LABEL: @test_vrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]])
+// CHECK-NEXT: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to i64
+// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]]
+//
 uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) { return vrshl_u64(a, b); }
-// CHECK-LABEL: @test_vrshlq_s8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
 int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) { return vrshlq_s8(a, b); }
-// CHECK-LABEL: @test_vrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) { return vrshlq_s16(a, b); }
-// CHECK-LABEL: @test_vrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) { return vrshlq_s32(a, b); }
-// CHECK-LABEL: @test_vrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) { return vrshlq_s64(a, b); }
-// CHECK-LABEL: @test_vrshlq_u8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
 uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) { return vrshlq_u8(a, b); }
-// CHECK-LABEL: @test_vrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
 uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) { return vrshlq_u16(a, b); }
-// CHECK-LABEL: @test_vrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
 uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) { return vrshlq_u32(a, b); }
-// CHECK-LABEL: @test_vrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]])
+// CHECK-NEXT: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
 uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) { return vrshlq_u64(a, b); }
-// CHECK-LABEL: @test_vrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 int8x8_t test_vrshrn_n_s16(int16x8_t a) { return vrshrn_n_s16(a, 1); }
-// CHECK-LABEL: @test_vrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 int16x4_t test_vrshrn_n_s32(int32x4_t a) { return vrshrn_n_s32(a, 1); }
-// CHECK-LABEL: @test_vrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 int32x2_t test_vrshrn_n_s64(int64x2_t a) { return vrshrn_n_s64(a, 1); }
-// CHECK-LABEL: @test_vrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) { return vrshrn_n_u16(a, 1); }
-// CHECK-LABEL: @test_vrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) { return vrshrn_n_u32(a, 1); }
-// CHECK-LABEL: @test_vrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) { return vrshrn_n_u64(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
 int8x8_t test_vrshr_n_s8(int8x8_t a) { return vrshr_n_s8(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
 int16x4_t test_vrshr_n_s16(int16x4_t a) { return vrshr_n_s16(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
 int32x2_t test_vrshr_n_s32(int32x2_t a) { return vrshr_n_s32(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
 int64x1_t test_vrshr_n_s64(int64x1_t a) { return vrshr_n_s64(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
 uint8x8_t test_vrshr_n_u8(uint8x8_t a) { return vrshr_n_u8(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
 uint16x4_t test_vrshr_n_u16(uint16x4_t a) { return vrshr_n_u16(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
 uint32x2_t test_vrshr_n_u32(uint32x2_t a) { return vrshr_n_u32(a, 1); }
-// CHECK-LABEL: @test_vrshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
 uint64x1_t test_vrshr_n_u64(uint64x1_t a) { return vrshr_n_u64(a, 1); }
-// CHECK-LABEL: @test_vrshrq_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
 int8x16_t test_vrshrq_n_s8(int8x16_t a) { return vrshrq_n_s8(a, 1); }
-// CHECK-LABEL: @test_vrshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
 int16x8_t test_vrshrq_n_s16(int16x8_t a) { return vrshrq_n_s16(a, 1); }
-// CHECK-LABEL: @test_vrshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
@test_vrshrq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]] +// int32x4_t test_vrshrq_n_s32(int32x4_t a) { return vrshrq_n_s32(a, 1); } -// CHECK-LABEL: @test_vrshrq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i64> [[VRSHR_N1]] +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]] +// int64x2_t test_vrshrq_n_s64(int64x2_t a) { return vrshrq_n_s64(a, 1); } -// CHECK-LABEL: @test_vrshrq_n_u8( -// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -1)) -// CHECK: ret <16 x i8> [[VRSHR_N]] +// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -1)) +// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]] +// uint8x16_t test_vrshrq_n_u8(uint8x16_t a) { return vrshrq_n_u8(a, 1); } -// CHECK-LABEL: @test_vrshrq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i16> [[VRSHR_N1]] +// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]] +// uint16x8_t test_vrshrq_n_u16(uint16x8_t a) { return vrshrq_n_u16(a, 1); } -// CHECK-LABEL: @test_vrshrq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i32> [[VRSHR_N1]] +// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x 
i32> [[VRSHR_N1]] +// uint32x4_t test_vrshrq_n_u32(uint32x4_t a) { return vrshrq_n_u32(a, 1); } -// CHECK-LABEL: @test_vrshrq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i64> [[VRSHR_N1]] +// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]] +// uint64x2_t test_vrshrq_n_u64(uint64x2_t a) { return vrshrq_n_u64(a, 1); } -// CHECK-LABEL: @test_vrsqrte_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRSQRTE_V1_I]] +// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) +// CHECK-NEXT: ret <2 x float> [[VRSQRTE_V1_I]] +// float32x2_t test_vrsqrte_f32(float32x2_t a) { return vrsqrte_f32(a); } -// CHECK-LABEL: @test_vrsqrte_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a) -// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]] +// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) +// CHECK-NEXT: ret <2 x i32> [[VRSQRTE_V1_I]] +// uint32x2_t test_vrsqrte_u32(uint32x2_t a) { return vrsqrte_u32(a); } -// CHECK-LABEL: @test_vrsqrteq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]] +// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) +// CHECK-NEXT: ret <4 x float> [[VRSQRTEQ_V1_I]] +// float32x4_t test_vrsqrteq_f32(float32x4_t a) { return vrsqrteq_f32(a); } -// CHECK-LABEL: @test_vrsqrteq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> 
@llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a) -// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]] +// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) +// CHECK-NEXT: ret <4 x i32> [[VRSQRTEQ_V1_I]] +// uint32x4_t test_vrsqrteq_u32(uint32x4_t a) { return vrsqrteq_u32(a); } -// CHECK-LABEL: @test_vrsqrts_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VRSQRTS_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) +// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP5]] +// float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) { return vrsqrts_f32(a, b); } -// CHECK-LABEL: @test_vrsqrtsq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) +// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x 
float> +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) { return vrsqrtsq_f32(a, b); } -// CHECK-LABEL: @test_vrsra_n_s8( -// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]] -// CHECK: ret <8 x i8> [[VRSRA_N]] +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i8> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i8> [[VRSRA_N]] +// int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { return vrsra_n_s8(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]] -// CHECK: ret <4 x i16> [[VRSRA_N]] +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]] +// int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { return vrsra_n_s16(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -// CHECK: ret <2 x i32> [[VRSRA_N]] +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]] +// int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { return vrsra_n_s32(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]] -// CHECK: ret <1 x i64> [[VRSRA_N]] +// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]] +// int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) { return vrsra_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u8( -// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]] -// CHECK: ret <8 x i8> [[VRSRA_N]] +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i8> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i8> [[VRSRA_N]] +// uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]] -// CHECK: ret <4 x i16> [[VRSRA_N]] +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]] +// uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -// CHECK: ret <2 x i32> 
[[VRSRA_N]] +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]] +// uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]] -// CHECK: ret <1 x i64> [[VRSRA_N]] +// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]] +// uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) { return vrsra_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s8( -// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]] -// CHECK: ret <16 x i8> [[VRSRA_N]] +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <16 x i8> [[A]], [[TMP0]] +// CHECK-NEXT: ret <16 x i8> [[VRSRA_N]] +// int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -// CHECK: ret <8 x i16> [[VRSRA_N]] +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
<8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]] +// int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -// CHECK: ret <4 x i32> [[VRSRA_N]] +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]] +// int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -// CHECK: ret <2 x i64> [[VRSRA_N]] +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]] +// int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { return vrsraq_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_u8( -// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]] -// CHECK: ret <16 x i8> [[VRSRA_N]] +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[B]], <16 
x i8> splat (i8 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <16 x i8> [[A]], [[TMP0]] +// CHECK-NEXT: ret <16 x i8> [[VRSRA_N]] +// uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vrsraq_n_u8(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -// CHECK: ret <8 x i16> [[VRSRA_N]] +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]] +// uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vrsraq_n_u16(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -// CHECK: ret <4 x i32> [[VRSRA_N]] +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]] +// uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vrsraq_n_u32(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] -// CHECK: ret <2 x i64> [[VRSRA_N]] +// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]] +// uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vrsraq_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vrsubhn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) +// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]] +// int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) { return vrsubhn_s16(a, b); } -// CHECK-LABEL: @test_vrsubhn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) +// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) { return vrsubhn_s32(a, b); } -// CHECK-LABEL: @test_vrsubhn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x 
i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) +// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) { return vrsubhn_s64(a, b); } -// CHECK-LABEL: @test_vrsubhn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]] +// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) +// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]] +// uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) { return vrsubhn_u16(a, b); } -// CHECK-LABEL: @test_vrsubhn_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) +// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) { return vrsubhn_u32(a, b); } -// CHECK-LABEL: @test_vrsubhn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: 
[[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) +// CHECK-NEXT: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) { return vrsubhn_u64(a, b); } -// CHECK-LABEL: @test_vset_lane_u8( -// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VSET_LANE]] +// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8( +// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]] +// uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) { return vset_lane_u8(a, b, 7); } -// CHECK-LABEL: @test_vset_lane_u16( -// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VSET_LANE]] +// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16( +// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]] +// uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) { return vset_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vset_lane_u32( -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1 -// CHECK: ret <2 x i32> [[VSET_LANE]] +// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]] +// uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) { return vset_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vset_lane_s8( -// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VSET_LANE]] +// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8( +// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]] +// int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) { return vset_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vset_lane_s16( -// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VSET_LANE]] +// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16( +// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]] +// int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) { return vset_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vset_lane_s32( -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1 -// CHECK: ret <2 x i32> [[VSET_LANE]] +// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]] +// int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) { return vset_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vset_lane_p8( -// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VSET_LANE]] +// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8( +// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]] +// poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) { return vset_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vset_lane_p16( -// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VSET_LANE]] +// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16( +// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]] +// poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) { return vset_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vset_lane_f32( -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> %b, float %a, i32 1 -// CHECK: ret <2 x float> [[VSET_LANE]] +// CHECK-LABEL: define <2 x float> @test_vset_lane_f32( +// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x float> [[B]], float [[A]], i32 1 +// CHECK-NEXT: ret <2 x float> [[VSET_LANE]] +// float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) { return vset_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vset_lane_f16( -// CHECK: [[__REINT_246:%.*]] = alloca half, align 2 -// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8 -// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8 -// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2 -// CHECK: store half [[TMP0]], ptr [[__REINT_246]], align 2 -// CHECK: store <4 x half> %b, ptr [[__REINT1_246]], align 8 -// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_246]], align 2 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT1_246]], align 8 -// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[TMP2]], i32 1 -// CHECK: store <4 x i16> [[VSET_LANE]], ptr [[__REINT2_246]], align 8 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[__REINT2_246]], align 8 -// CHECK: ret <4 x half> [[TMP8]] +// CHECK-LABEL: define <4 x half> @test_vset_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast half [[TMP0]] to i16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP2]], i32 1 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[VSET_LANE]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) { return vset_lane_f16(*a, b, 1); } -// CHECK-LABEL: @test_vsetq_lane_u8( -// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 -// CHECK: ret <16 x i8> [[VSET_LANE]] +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8( +// 
CHECK-SAME: i8 noundef zeroext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]] +// uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) { return vsetq_lane_u8(a, b, 15); } -// CHECK-LABEL: @test_vsetq_lane_u16( -// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7 -// CHECK: ret <8 x i16> [[VSET_LANE]] +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16( +// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]] +// uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) { return vsetq_lane_u16(a, b, 7); } -// CHECK-LABEL: @test_vsetq_lane_u32( -// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3 -// CHECK: ret <4 x i32> [[VSET_LANE]] +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]] +// uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) { return vsetq_lane_u32(a, b, 3); } -// CHECK-LABEL: @test_vsetq_lane_s8( -// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 -// CHECK: ret <16 x i8> [[VSET_LANE]] +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8( +// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]] +// int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) { return vsetq_lane_s8(a, b, 15); } -// CHECK-LABEL: @test_vsetq_lane_s16( -// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7 -// CHECK: ret <8 x i16> [[VSET_LANE]] +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16( +// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]] +// int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) { return vsetq_lane_s16(a, b, 7); } -// CHECK-LABEL: @test_vsetq_lane_s32( -// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3 -// CHECK: ret <4 x i32> [[VSET_LANE]] +// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i32 3 +// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]] +// int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) { return vsetq_lane_s32(a, b, 3); } -// CHECK-LABEL: @test_vsetq_lane_p8( -// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15 -// CHECK: ret <16 x i8> [[VSET_LANE]] +// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8( +// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]] +// poly8x16_t test_vsetq_lane_p8(poly8_t 
a, poly8x16_t b) { return vsetq_lane_p8(a, b, 15); } -// CHECK-LABEL: @test_vsetq_lane_p16( -// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7 -// CHECK: ret <8 x i16> [[VSET_LANE]] +// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16( +// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i32 7 +// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]] +// poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) { return vsetq_lane_p16(a, b, 7); } -// CHECK-LABEL: @test_vsetq_lane_f32( -// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> %b, float %a, i32 3 -// CHECK: ret <4 x float> [[VSET_LANE]] +// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32( +// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x float> [[B]], float [[A]], i32 3 +// CHECK-NEXT: ret <4 x float> [[VSET_LANE]] +// float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) { return vsetq_lane_f32(a, b, 3); } -// CHECK-LABEL: @test_vsetq_lane_f16( -// CHECK: [[__REINT_248:%.*]] = alloca half, align 2 -// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16 -// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16 -// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2 -// CHECK: store half [[TMP0]], ptr [[__REINT_248]], align 2 -// CHECK: store <8 x half> %b, ptr [[__REINT1_248]], align 16 -// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_248]], align 2 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT1_248]], align 16 -// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[TMP2]], i32 3 -// CHECK: store <8 x i16> [[VSET_LANE]], ptr [[__REINT2_248]], align 16 -// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[__REINT2_248]], align 16 -// CHECK: ret <8 x half> [[TMP8]] +// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast half [[TMP0]] to i16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP2]], i32 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[VSET_LANE]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) { return vsetq_lane_f16(*a, b, 3); } -// CHECK-LABEL: @test_vset_lane_s64( -// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0 -// CHECK: ret <1 x i64> [[VSET_LANE]] +// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[B]], i64 [[A]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]] +// int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) { return vset_lane_s64(a, b, 0); } -// CHECK-LABEL: @test_vset_lane_u64( -// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0 -// CHECK: ret <1 x i64> [[VSET_LANE]] +// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[B]], 
i64 [[A]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]] +// uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) { return vset_lane_u64(a, b, 0); } -// CHECK-LABEL: @test_vsetq_lane_s64( -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1 -// CHECK: ret <2 x i64> [[VSET_LANE]] +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]] +// int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) { return vsetq_lane_s64(a, b, 1); } -// CHECK-LABEL: @test_vsetq_lane_u64( -// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1 -// CHECK: ret <2 x i64> [[VSET_LANE]] +// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i32 1 +// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]] +// uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) { return vsetq_lane_u64(a, b, 1); } -// CHECK-LABEL: @test_vshl_s8( -// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VSHL_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vshl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]] +// int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) { return vshl_s8(a, b); } -// CHECK-LABEL: @test_vshl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VSHL_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vshl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) { return vshl_s16(a, b); } -// CHECK-LABEL: @test_vshl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VSHL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vshl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] 
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) { return vshl_s32(a, b); } -// CHECK-LABEL: @test_vshl_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VSHL_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vshl_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) { return vshl_s64(a, b); } -// CHECK-LABEL: @test_vshl_u8( -// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VSHL_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vshl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]] +// uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) { return vshl_u8(a, b); } -// CHECK-LABEL: @test_vshl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VSHL_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vshl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) +// 
CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP2]] +// uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) { return vshl_u16(a, b); } -// CHECK-LABEL: @test_vshl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VSHL_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vshl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP2]] +// uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) { return vshl_u32(a, b); } -// CHECK-LABEL: @test_vshl_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VSHL_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vshl_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) +// CHECK-NEXT: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to i64 +// CHECK-NEXT: [[REF_TMP_I_SROA_0_0_VEC_INSERT:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0 +// CHECK-NEXT: ret <1 x i64> [[REF_TMP_I_SROA_0_0_VEC_INSERT]] +// uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) { return vshl_u64(a, b); } -// CHECK-LABEL: @test_vshlq_s8( -// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VSHLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vshlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]] +// int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) { return vshlq_s8(a, b); } -// CHECK-LABEL: @test_vshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = 
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) { return vshlq_s16(a, b); }
-// CHECK-LABEL: @test_vshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) { return vshlq_s32(a, b); }
-// CHECK-LABEL: @test_vshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) { return vshlq_s64(a, b); }
-// CHECK-LABEL: @test_vshlq_u8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) { return vshlq_u8(a, b); }
-// CHECK-LABEL: @test_vshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) { return vshlq_u16(a, b); }
-// CHECK-LABEL: @test_vshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) { return vshlq_u32(a, b); }
-// CHECK-LABEL: @test_vshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]])
+// CHECK-NEXT: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP2]]
+//
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) { return vshlq_u64(a, b); }
-// CHECK-LABEL: @test_vshll_n_s8(
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 1); }
-// CHECK-LABEL: @test_vshll_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 1); }
-// CHECK-LABEL: @test_vshll_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 1); }
-// CHECK-LABEL: @test_vshll_n_u8(
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 1); }
-// CHECK-LABEL: @test_vshll_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 1); }
-// CHECK-LABEL: @test_vshll_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 1); }
-// CHECK-LABEL: @test_vshl_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
int8x8_t test_vshl_n_s8(int8x8_t a) { return vshl_n_s8(a, 1); }
-// CHECK-LABEL: @test_vshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
int16x4_t test_vshl_n_s16(int16x4_t a) { return vshl_n_s16(a, 1); }
-// CHECK-LABEL: @test_vshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
int32x2_t test_vshl_n_s32(int32x2_t a) { return vshl_n_s32(a, 1); }
-// CHECK-LABEL: @test_vshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
int64x1_t test_vshl_n_s64(int64x1_t a) { return vshl_n_s64(a, 1); }
-// CHECK-LABEL: @test_vshl_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
uint8x8_t test_vshl_n_u8(uint8x8_t a) { return vshl_n_u8(a, 1); }
-// CHECK-LABEL: @test_vshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
uint16x4_t test_vshl_n_u16(uint16x4_t a) { return vshl_n_u16(a, 1); }
-// CHECK-LABEL: @test_vshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
uint32x2_t test_vshl_n_u32(uint32x2_t a) { return vshl_n_u32(a, 1); }
-// CHECK-LABEL: @test_vshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
uint64x1_t test_vshl_n_u64(uint64x1_t a) { return vshl_n_u64(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
int8x16_t test_vshlq_n_s8(int8x16_t a) { return vshlq_n_s8(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
int16x8_t test_vshlq_n_s16(int16x8_t a) { return vshlq_n_s16(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
int32x4_t test_vshlq_n_s32(int32x4_t a) { return vshlq_n_s32(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
int64x2_t test_vshlq_n_s64(int64x2_t a) { return vshlq_n_s64(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
uint8x16_t test_vshlq_n_u8(uint8x16_t a) { return vshlq_n_u8(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
uint16x8_t test_vshlq_n_u16(uint16x8_t a) { return vshlq_n_u16(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
uint32x4_t test_vshlq_n_u32(uint32x4_t a) { return vshlq_n_u32(a, 1); }
-// CHECK-LABEL: @test_vshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
uint64x2_t test_vshlq_n_u64(uint64x2_t a) { return vshlq_n_u64(a, 1); }
-// CHECK-LABEL: @test_vshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
int8x8_t test_vshrn_n_s16(int16x8_t a) { return vshrn_n_s16(a, 1); }
-// CHECK-LABEL: @test_vshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
int16x4_t test_vshrn_n_s32(int32x4_t a) { return vshrn_n_s32(a, 1); }
-// CHECK-LABEL: @test_vshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
int32x2_t test_vshrn_n_s64(int64x2_t a) { return vshrn_n_s64(a, 1); }
-// CHECK-LABEL: @test_vshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
uint8x8_t test_vshrn_n_u16(uint16x8_t a) { return vshrn_n_u16(a, 1); }
-// CHECK-LABEL: @test_vshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
uint16x4_t test_vshrn_n_u32(uint32x4_t a) { return vshrn_n_u32(a, 1); }
-// CHECK-LABEL: @test_vshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
uint32x2_t test_vshrn_n_u64(uint64x2_t a) { return vshrn_n_u64(a, 1); }
-// CHECK-LABEL: @test_vshr_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
int8x8_t test_vshr_n_s8(int8x8_t a) { return vshr_n_s8(a, 1); }
-// CHECK-LABEL: @test_vshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
int16x4_t test_vshr_n_s16(int16x4_t a) { return vshr_n_s16(a, 1); }
-// CHECK-LABEL: @test_vshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
int32x2_t test_vshr_n_s32(int32x2_t a) { return vshr_n_s32(a, 1); }
-// CHECK-LABEL: @test_vshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
int64x1_t test_vshr_n_s64(int64x1_t a) { return vshr_n_s64(a, 1); }
-// CHECK-LABEL: @test_vshr_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
uint8x8_t test_vshr_n_u8(uint8x8_t a) { return vshr_n_u8(a, 1); }
-// CHECK-LABEL: @test_vshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
uint16x4_t test_vshr_n_u16(uint16x4_t a) { return vshr_n_u16(a, 1); }
-// CHECK-LABEL: @test_vshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
uint32x2_t test_vshr_n_u32(uint32x2_t a) { return vshr_n_u32(a, 1); }
-// CHECK-LABEL: @test_vshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
uint64x1_t test_vshr_n_u64(uint64x1_t a) { return vshr_n_u64(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
int8x16_t test_vshrq_n_s8(int8x16_t a) { return vshrq_n_s8(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
int16x8_t test_vshrq_n_s16(int16x8_t a) { return vshrq_n_s16(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
int32x4_t test_vshrq_n_s32(int32x4_t a) { return vshrq_n_s32(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
int64x2_t test_vshrq_n_s64(int64x2_t a) { return vshrq_n_s64(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
uint8x16_t test_vshrq_n_u8(uint8x16_t a) { return vshrq_n_u8(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
uint16x8_t test_vshrq_n_u16(uint16x8_t a) { return vshrq_n_u16(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
uint32x4_t test_vshrq_n_u32(uint32x4_t a) { return vshrq_n_u32(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
uint64x2_t test_vshrq_n_u64(uint64x2_t a) { return vshrq_n_u64(a, 1); }
-// CHECK-LABEL: @test_vsli_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { return vsli_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { return vsli_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { return vsli_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { return vsli_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { return vsli_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { return vsli_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { return vsli_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { return vsli_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { return vsli_n_p8(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { return vsli_n_p16(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { return vsliq_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { return vsliq_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { return vsliq_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { return vsliq_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { return vsliq_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { return vsliq_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { return vsliq_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { return vsliq_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { return vsliq_n_p8(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { return vsliq_n_p16(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
[[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i16> [[TMP4]] +// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i16> [[TMP4]] +// int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { return vsra_n_s16(a, b, 1); } -// CHECK-LABEL: @test_vsra_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 1) -// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i32> [[TMP4]] +// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i32> [[TMP4]] +// int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { return vsra_n_s32(a, b, 1); } -// CHECK-LABEL: @test_vsra_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1) -// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] -// CHECK: ret <1 x i64> [[TMP4]] +// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <1 x i64> [[TMP4]] +// int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) { return vsra_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vsra_n_u8( -// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, splat (i8 1) -// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]] -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i8> [[B]], splat (i8 1) +// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> 
[[A]], [[VSRA_N]] +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) { return vsra_n_u8(a, b, 1); } -// CHECK-LABEL: @test_vsra_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1) -// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i16> [[TMP4]] +// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i16> [[TMP4]] +// uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) { return vsra_n_u16(a, b, 1); } -// CHECK-LABEL: @test_vsra_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 1) -// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i32> [[TMP4]] +// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i32> [[TMP4]] +// uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) { return vsra_n_u32(a, b, 1); } -// CHECK-LABEL: @test_vsra_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1) -// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] -// CHECK: ret <1 x i64> [[TMP4]] +// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <1 x i64> [[TMP4]] +// uint64x1_t 
test_vsra_n_u64(uint64x1_t a, uint64x1_t b) { return vsra_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_s8( -// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, splat (i8 1) -// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <16 x i8> [[B]], splat (i8 1) +// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) { return vsraq_n_s8(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 1) -// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <8 x i16> [[TMP4]] +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { return vsraq_n_s16(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 1) -// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i32> [[TMP4]] +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { return vsraq_n_s32(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 1) -// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i64> [[TMP4]] +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64( +// CHECK-SAME: <2 x i64> noundef 
[[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i64> [[TMP4]] +// int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) { return vsraq_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_u8( -// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, splat (i8 1) -// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]] -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <16 x i8> [[B]], splat (i8 1) +// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vsraq_n_u8(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 1) -// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] -// CHECK: ret <8 x i16> [[TMP4]] +// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vsraq_n_u16(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 1) -// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] -// CHECK: ret <4 x i32> [[TMP4]] +// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) { return 
vsraq_n_u32(a, b, 1); } -// CHECK-LABEL: @test_vsraq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 1) -// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] -// CHECK: ret <2 x i64> [[TMP4]] +// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 1) +// CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]] +// CHECK-NEXT: ret <2 x i64> [[TMP4]] +// uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vsraq_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_s8( -// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1)) -// CHECK: ret <8 x i8> [[VSLI_N]] +// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1)) +// CHECK-NEXT: ret <8 x i8> [[VSLI_N]] +// int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1)) -// CHECK: ret <4 x i16> [[VSLI_N2]] +// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]] +// int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { return vsri_n_s16(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1)) -// CHECK: ret <2 x i32> [[VSLI_N2]] +// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32( +// CHECK-SAME: <2 x i32> 
noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]] +// int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { return vsri_n_s32(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1)) -// CHECK: ret <1 x i64> [[VSLI_N2]] +// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]] +// int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { return vsri_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_u8( -// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1)) -// CHECK: ret <8 x i8> [[VSLI_N]] +// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1)) +// CHECK-NEXT: ret <8 x i8> [[VSLI_N]] +// uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) { return vsri_n_u8(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1)) -// CHECK: ret <4 x i16> [[VSLI_N2]] +// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1)) +// CHECK-NEXT: 
ret <4 x i16> [[VSLI_N2]] +// uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) { return vsri_n_u16(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1)) -// CHECK: ret <2 x i32> [[VSLI_N2]] +// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]] +// uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) { return vsri_n_u32(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1)) -// CHECK: ret <1 x i64> [[VSLI_N2]] +// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]] +// uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { return vsri_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_p8( -// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1)) -// CHECK: ret <8 x i8> [[VSLI_N]] +// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1)) +// CHECK-NEXT: ret <8 x i8> [[VSLI_N]] +// poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { return vsri_n_p8(a, b, 1); } -// CHECK-LABEL: @test_vsri_n_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x 
i16> splat (i16 -1)) -// CHECK: ret <4 x i16> [[VSLI_N2]] +// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]] +// poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { return vsri_n_p16(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_s8( -// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1)) -// CHECK: ret <16 x i8> [[VSLI_N]] +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1)) +// CHECK-NEXT: ret <16 x i8> [[VSLI_N]] +// int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { return vsriq_n_s8(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i16> [[VSLI_N2]] +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]] +// int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { return vsriq_n_s16(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i32> [[VSLI_N2]] +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to 
<4 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]] +// int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { return vsriq_n_s32(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i64> [[VSLI_N2]] +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]] +// int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { return vsriq_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_u8( -// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1)) -// CHECK: ret <16 x i8> [[VSLI_N]] +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1)) +// CHECK-NEXT: ret <16 x i8> [[VSLI_N]] +// uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) { return vsriq_n_u8(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i16> [[VSLI_N2]] +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]] +// uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) { return vsriq_n_u16(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x 
i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i32> [[VSLI_N2]] +// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]] +// uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) { return vsriq_n_u32(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i64> [[VSLI_N2]] +// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]] +// uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) { return vsriq_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_p8( -// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1)) -// CHECK: ret <16 x i8> [[VSLI_N]] +// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1)) +// CHECK-NEXT: ret <16 x i8> [[VSLI_N]] +// poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { return vsriq_n_p8(a, b, 1); } -// CHECK-LABEL: @test_vsriq_n_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i16> [[VSLI_N2]] +// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
<8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]] +// poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { return vsriq_n_p16(a, b, 1); } -// CHECK-LABEL: @test_vst1q_u8( -// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1) +// CHECK-NEXT: ret void +// void test_vst1q_u8(uint8_t * a, uint8x16_t b) { vst1q_u8(a, b); } -// CHECK-LABEL: @test_vst1q_u16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1q_u16(uint16_t * a, uint16x8_t b) { vst1q_u16(a, b); } -// CHECK-LABEL: @test_vst1q_u32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_u32(uint32_t * a, uint32x4_t b) { vst1q_u32(a, b); } -// CHECK-LABEL: @test_vst1q_u64( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_u64(uint64_t * a, uint64x2_t b) { vst1q_u64(a, b); } -// CHECK-LABEL: @test_vst1q_s8( -// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1) +// CHECK-NEXT: ret void +// void test_vst1q_s8(int8_t * a, int8x16_t b) { vst1q_s8(a, b); } -// CHECK-LABEL: @test_vst1q_s16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1q_s16(int16_t * a, int16x8_t b) { vst1q_s16(a, b); } -// CHECK-LABEL: @test_vst1q_s32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_s32(int32_t * a, int32x4_t b) { vst1q_s32(a, b); } -// CHECK-LABEL: @test_vst1q_s64( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_s64(int64_t * a, int64x2_t b) { vst1q_s64(a, b); } -// CHECK-LABEL: @test_vst1q_f16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: call void @llvm.arm.neon.vst1.p0.v8f16(ptr %a, <8 x half> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8f16(ptr [[A]], <8 x half> [[TMP2]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1q_f16(float16_t * a, float16x8_t b) { vst1q_f16(a, b); } -// CHECK-LABEL: @test_vst1q_f32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: call void @llvm.arm.neon.vst1.p0.v4f32(ptr %a, <4 x float> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: 
define void @test_vst1q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f32(ptr [[A]], <4 x float> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_f32(float32_t * a, float32x4_t b) { vst1q_f32(a, b); } -// CHECK-LABEL: @test_vst1q_p8( -// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1) +// CHECK-NEXT: ret void +// void test_vst1q_p8(poly8_t * a, poly8x16_t b) { vst1q_p8(a, b); } -// CHECK-LABEL: @test_vst1q_p16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1q_p16(poly16_t * a, poly16x8_t b) { vst1q_p16(a, b); } -// CHECK-LABEL: @test_vst1_u8( -// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1) +// CHECK-NEXT: ret void +// void test_vst1_u8(uint8_t * a, uint8x8_t b) { vst1_u8(a, b); } -// CHECK-LABEL: @test_vst1_u16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1_u16(uint16_t * a, uint16x4_t b) { vst1_u16(a, b); } -// CHECK-LABEL: @test_vst1_u32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: 
call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1_u32(uint32_t * a, uint32x2_t b) { vst1_u32(a, b); } -// CHECK-LABEL: @test_vst1_u64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1_u64(uint64_t * a, uint64x1_t b) { vst1_u64(a, b); } -// CHECK-LABEL: @test_vst1_s8( -// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1) +// CHECK-NEXT: ret void +// void test_vst1_s8(int8_t * a, int8x8_t b) { vst1_s8(a, b); } -// CHECK-LABEL: @test_vst1_s16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1_s16(int16_t * a, int16x4_t b) { vst1_s16(a, b); } -// CHECK-LABEL: @test_vst1_s32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1_s32(int32_t * a, int32x2_t b) { vst1_s32(a, b); } -// CHECK-LABEL: @test_vst1_s64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void 
test_vst1_s64(int64_t * a, int64x1_t b) { vst1_s64(a, b); } -// CHECK-LABEL: @test_vst1_f16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: call void @llvm.arm.neon.vst1.p0.v4f16(ptr %a, <4 x half> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f16(ptr [[A]], <4 x half> [[TMP2]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1_f16(float16_t * a, float16x4_t b) { vst1_f16(a, b); } -// CHECK-LABEL: @test_vst1_f32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: call void @llvm.arm.neon.vst1.p0.v2f32(ptr %a, <2 x float> [[TMP2]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2f32(ptr [[A]], <2 x float> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1_f32(float32_t * a, float32x2_t b) { vst1_f32(a, b); } -// CHECK-LABEL: @test_vst1_p8( -// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1) +// CHECK-NEXT: ret void +// void test_vst1_p8(poly8_t * a, poly8x8_t b) { vst1_p8(a, b); } -// CHECK-LABEL: @test_vst1_p16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst1_p16(poly16_t * a, poly16x4_t b) { vst1_p16(a, b); } -// CHECK-LABEL: @test_vst1q_lane_u8( -// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) { vst1q_lane_u8(a, b, 15); } -// CHECK-LABEL: @test_vst1q_lane_u16( -// CHECK: [[TMP1:%.*]] = bitcast 
<8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) { vst1q_lane_u16(a, b, 7); } -// CHECK-LABEL: @test_vst1q_lane_u32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -// CHECK: store i32 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) { vst1q_lane_u32(a, b, 3); } -// CHECK-LABEL: @test_vst1q_lane_u64( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1> -// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> <i32 1> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) { vst1q_lane_u64(a, b, 1); } -// CHECK-LABEL: @test_vst1q_lane_s8( -// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s8(int8_t * a, int8x16_t b) { vst1q_lane_s8(a, b, 15); } -// CHECK-LABEL: @test_vst1q_lane_s16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s16(int16_t * a, int16x8_t b) { vst1q_lane_s16(a, b, 7); } -// CHECK-LABEL: @test_vst1q_lane_s32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -// CHECK: store i32 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s32(int32_t * a, int32x4_t b) { vst1q_lane_s32(a, b, 3); } -// CHECK-LABEL: @test_vst1q_lane_s64( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1> -// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <1 x i32> <i32 1> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_lane_s64(int64_t * a, int64x2_t b) { vst1q_lane_s64(a, b, 1); } -// CHECK-LABEL: @test_vst1q_lane_f16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7 -// CHECK: store half [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7 +// CHECK-NEXT: store half [[TMP3]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_f16(float16_t * a, float16x8_t b) { vst1q_lane_f16(a, b, 7); } -// CHECK-LABEL: @test_vst1q_lane_f32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -// CHECK: store float [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]])
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +// CHECK-NEXT: store float [[TMP3]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1q_lane_f32(float32_t * a, float32x4_t b) { vst1q_lane_f32(a, b, 3); } -// CHECK-LABEL: @test_vst1q_lane_p8( -// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i32 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) { vst1q_lane_p8(a, b, 15); } -// CHECK-LABEL: @test_vst1q_lane_p16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) { vst1q_lane_p16(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_u8( -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) { vst1_lane_u8(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_u16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) { vst1_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vst1_lane_u32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -// CHECK: store i32 [[TMP3]], ptr %a, align 4 
-// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) { vst1_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vst1_lane_u64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 -// CHECK: store i64 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) { vst1_lane_u64(a, b, 0); } -// CHECK-LABEL: @test_vst1_lane_s8( -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_s8(int8_t * a, int8x8_t b) { vst1_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_s16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_s16(int16_t * a, int16x4_t b) { vst1_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vst1_lane_s32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -// CHECK: store i32 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +// CHECK-NEXT: store i32 [[TMP2]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_s32(int32_t * a, 
int32x2_t b) { vst1_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vst1_lane_s64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 -// CHECK: store i64 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 +// CHECK-NEXT: store i64 [[TMP2]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_s64(int64_t * a, int64x1_t b) { vst1_lane_s64(a, b, 0); } -// CHECK-LABEL: @test_vst1_lane_f16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3 -// CHECK: store half [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3 +// CHECK-NEXT: store half [[TMP3]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_f16(float16_t * a, float16x4_t b) { vst1_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vst1_lane_f32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -// CHECK: store float [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +// CHECK-NEXT: store float [[TMP3]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_f32(float32_t * a, float32x2_t b) { vst1_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vst1_lane_p8( -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i32 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) { vst1_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_p16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// 
CHECK-LABEL: define void @test_vst1_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) { vst1_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vst2q_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) { vst2q_u8(a, b); } -// CHECK-LABEL: @test_vst2q_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr 
[[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) { vst2q_u16(a, b); } -// CHECK-LABEL: @test_vst2q_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> 
[[TMP5]] to <4 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) { vst2q_u32(a, b); } -// CHECK-LABEL: @test_vst2q_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: 
[[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2q_s8(int8_t * a, int8x16x2_t b) { vst2q_s8(a, b); } -// CHECK-LABEL: @test_vst2q_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// 
void test_vst2q_s16(int16_t * a, int16x8x2_t b) { vst2q_s16(a, b); } -// CHECK-LABEL: @test_vst2q_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2q_s32(int32_t * a, int32x4x2_t b) { vst2q_s32(a, b); } -// CHECK-LABEL: @test_vst2q_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = 
getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8f16(ptr [[A]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2q_f16(float16_t * a, float16x8x2_t b) { vst2q_f16(a, b); } -// CHECK-LABEL: @test_vst2q_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x 
float> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f32(ptr [[A]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2q_f32(float32_t * a, float32x4x2_t b) { vst2q_f32(a, b); } -// CHECK-LABEL: @test_vst2q_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: 
[[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) { vst2q_p8(a, b); } -// CHECK-LABEL: @test_vst2q_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2) +// CHECK-NEXT: ret 
void +// void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) { vst2q_p16(a, b); } -// CHECK-LABEL: @test_vst2_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2_u8(uint8_t * a, uint8x8x2_t b) { vst2_u8(a, b); } -// CHECK-LABEL: @test_vst2_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: 
[[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2_u16(uint16_t * a, uint16x4x2_t b) { vst2_u16(a, b); } -// CHECK-LABEL: @test_vst2_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2_u32(uint32_t * a, uint32x2x2_t b) { vst2_u32(a, b); } -// CHECK-LABEL: @test_vst2_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = 
getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2_u64(uint64_t * a, uint64x1x2_t b) { vst2_u64(a, b); } -// CHECK-LABEL: @test_vst2_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2_s8(int8_t * a, int8x8x2_t b) { vst2_s8(a, b); } -// 
CHECK-LABEL: @test_vst2_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2_s16(int16_t * a, int16x4x2_t b) { vst2_s16(a, b); } -// CHECK-LABEL: @test_vst2_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> 
-// CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2_s32(int32_t * a, int32x2x2_t b) { vst2_s32(a, b); }
-// CHECK-LABEL: @test_vst2_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP4]], <1 x i64> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2_s64(int64_t * a, int64x1x2_t b) { vst2_s64(a, b); }
-// CHECK-LABEL: @test_vst2_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f16(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2_f16(float16_t * a, float16x4x2_t b) { vst2_f16(a, b); }
-// CHECK-LABEL: @test_vst2_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2f32(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2_f32(float32_t * a, float32x2x2_t b) { vst2_f32(a, b); }
-// CHECK-LABEL: @test_vst2_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst2_p8(poly8_t * a, poly8x8x2_t b) { vst2_p8(a, b); }
-// CHECK-LABEL: @test_vst2_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2_p16(poly16_t * a, poly16x4x2_t b) { vst2_p16(a, b); }
-// CHECK-LABEL: @test_vst2q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) { vst2q_lane_u16(a, b, 7); }
-// CHECK-LABEL: @test_vst2q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) { vst2q_lane_u32(a, b, 3); }
-// CHECK-LABEL: @test_vst2q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) { vst2q_lane_s16(a, b, 7); }
-// CHECK-LABEL: @test_vst2q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) { vst2q_lane_s32(a, b, 3); }
-// CHECK-LABEL: @test_vst2q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) { vst2q_lane_f16(a, b, 7); }
-// CHECK-LABEL: @test_vst2q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) { vst2q_lane_f32(a, b, 3); }
-// CHECK-LABEL: @test_vst2q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) { vst2q_lane_p16(a, b, 7); }
-// CHECK-LABEL: @test_vst2_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) { vst2_lane_u8(a, b, 7); }
-// CHECK-LABEL: @test_vst2_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) { vst2_lane_u16(a, b, 3); }
-// CHECK-LABEL: @test_vst2_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) { vst2_lane_u32(a, b, 1); }
-// CHECK-LABEL: @test_vst2_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) { vst2_lane_s8(a, b, 7); }
-// CHECK-LABEL: @test_vst2_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) { vst2_lane_s16(a, b, 3); }
-// CHECK-LABEL: @test_vst2_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) { vst2_lane_s32(a, b, 1); }
-// CHECK-LABEL: @test_vst2_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP4]], <4 x half> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) { vst2_lane_f16(a, b, 3); }
-// CHECK-LABEL: @test_vst2_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP4]], <2 x float> [[TMP5]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) { vst2_lane_f32(a, b, 1); }
-// CHECK-LABEL: @test_vst2_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) { vst2_lane_p8(a, b, 7); }
-// CHECK-LABEL: @test_vst2_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) { vst2_lane_p16(a, b, 3); }
-// CHECK-LABEL: @test_vst3q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) { vst3q_u8(a, b); }
-// CHECK-LABEL: @test_vst3q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) { vst3q_u16(a, b); }
-// CHECK-LABEL: @test_vst3q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) { vst3q_u32(a, b); }
-// CHECK-LABEL: @test_vst3q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_s8(int8_t * a, int8x16x3_t b) { vst3q_s8(a, b); }
-// CHECK-LABEL: @test_vst3q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast 
<8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3q_s16(int16_t * a, int16x8x3_t b) { vst3q_s16(a, b); } -// CHECK-LABEL: @test_vst3q_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, 
ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3q_s32(int32_t * a, int32x4x3_t b) { vst3q_s32(a, b); } -// CHECK-LABEL: @test_vst3q_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: 
store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: call void @llvm.arm.neon.vst3.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8f16(ptr [[A]], <8 x half> 
[[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3q_f16(float16_t * a, float16x8x3_t b) { vst3q_f16(a, b); } -// CHECK-LABEL: @test_vst3q_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4f32(ptr %a, <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f32(ptr [[A]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3q_f32(float32_t * a, float32x4x3_t b) { vst3q_f32(a, b); } -// CHECK-LABEL: @test_vst3q_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> 
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) { vst3q_p8(a, b); } -// CHECK-LABEL: @test_vst3q_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: 
[[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) { vst3q_p16(a, b); } -// CHECK-LABEL: @test_vst3_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// void test_vst3_u8(uint8_t * a, uint8x8x3_t b) { vst3_u8(a, b); } -// CHECK-LABEL: @test_vst3_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = 
getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3_u16(uint16_t * a, uint16x4x3_t b) { vst3_u16(a, b); } -// CHECK-LABEL: @test_vst3_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr 
inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_u32(uint32_t * a, uint32x2x3_t b) { vst3_u32(a, b); } -// CHECK-LABEL: @test_vst3_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_u64(uint64_t * a, uint64x1x3_t b) { vst3_u64(a, b); } -// CHECK-LABEL: @test_vst3_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 
[[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// void test_vst3_s8(int8_t * a, int8x8x3_t b) { vst3_s8(a, b); } -// CHECK-LABEL: @test_vst3_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3_s16(int16_t * a, int16x4x3_t b) { vst3_s16(a, b); } -// CHECK-LABEL: @test_vst3_s32( 
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_s32(int32_t * a, int32x2x3_t b) { vst3_s32(a, b); } -// CHECK-LABEL: @test_vst3_s64( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw 
%struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <1 x i64> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP6]], <1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_s64(int64_t * a, int64x1x3_t b) { vst3_s64(a, b); } -// CHECK-LABEL: @test_vst3_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], 
ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f16(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3_f16(float16_t * a, float16x4x3_t b) { vst3_f16(a, b); } -// CHECK-LABEL: @test_vst3_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr 
[[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2f32(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3_f32(float32_t * a, float32x2x3_t b) { vst3_f32(a, b); }
-// CHECK-LABEL: @test_vst3_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3_p8(poly8_t * a, poly8x8x3_t b) { vst3_p8(a, b); }
-// CHECK-LABEL: @test_vst3_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_p16(poly16_t * a, poly16x4x3_t b) { vst3_p16(a, b); }
-// CHECK-LABEL: @test_vst3q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) { vst3q_lane_u16(a, b, 7); }
-// CHECK-LABEL: @test_vst3q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) { vst3q_lane_u32(a, b, 3); }
-// CHECK-LABEL: @test_vst3q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) { vst3q_lane_s16(a, b, 7); }
-// CHECK-LABEL: @test_vst3q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) { vst3q_lane_s32(a, b, 3); }
-// CHECK-LABEL: @test_vst3q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) { vst3q_lane_f16(a, b, 7); }
-// CHECK-LABEL: @test_vst3q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr %a, <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) { vst3q_lane_f32(a, b, 3); }
-// CHECK-LABEL: @test_vst3q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP3]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) { vst3q_lane_p16(a, b, 7); }
-// CHECK-LABEL: @test_vst3_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) { vst3_lane_u8(a, b, 7); }
-// CHECK-LABEL: @test_vst3_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) { vst3_lane_u16(a, b, 3); }
-// CHECK-LABEL: @test_vst3_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) { vst3_lane_u32(a, b, 1); }
-// CHECK-LABEL: @test_vst3_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) { vst3_lane_s8(a, b, 7); }
-// CHECK-LABEL: @test_vst3_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) { vst3_lane_s16(a, b, 3); }
-// CHECK-LABEL: @test_vst3_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP6]], <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) { vst3_lane_s32(a, b, 1); }
-// CHECK-LABEL: @test_vst3_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 3, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP6]], <4 x half> [[TMP7]], <4 x half> [[TMP8]], i32 3, i32 2) +// CHECK-NEXT: ret void +// void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) { vst3_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vst3_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = 
getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> -// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP6]], <2 x float> [[TMP7]], <2 x float> [[TMP8]], i32 1, i32 4) +// CHECK-NEXT: ret void +// void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) { vst3_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vst3_lane_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue 
[3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) { vst3_lane_p8(a, b, 7); }
-// CHECK-LABEL: @test_vst3_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) { vst3_lane_p16(a, b, 3); }
-// CHECK-LABEL: @test_vst4q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) { vst4q_u8(a, b); }
-// CHECK-LABEL: @test_vst4q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) { vst4q_u16(a, b); }
-// CHECK-LABEL: @test_vst4q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) { vst4q_u32(a, b); }
-// CHECK-LABEL: @test_vst4q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4q_s8(int8_t * a, int8x16x4_t b) { vst4q_s8(a, b); }
-// CHECK-LABEL: @test_vst4q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_s16(int16_t * a, int16x8x4_t b) { vst4q_s16(a, b); }
-// CHECK-LABEL: @test_vst4q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_s32(int32_t * a, int32x4x4_t b) { vst4q_s32(a, b); }
-// CHECK-LABEL: @test_vst4q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8f16(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_f16(float16_t * a, float16x8x4_t b) { vst4q_f16(a, b); }
-// CHECK-LABEL: @test_vst4q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f32(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4q_f32(float32_t * a, float32x4x4_t b) { vst4q_f32(a, b); }
-// CHECK-LABEL: @test_vst4q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) { vst4q_p8(a, b); }
-// CHECK-LABEL: @test_vst4q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) { vst4q_p16(a, b); }
-// CHECK-LABEL: @test_vst4_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_u8(uint8_t * a, uint8x8x4_t b) { vst4_u8(a, b); }
-// CHECK-LABEL: @test_vst4_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_u16(uint16_t * a, uint16x4x4_t b) { vst4_u16(a, b); }
-// CHECK-LABEL: @test_vst4_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_u32(uint32_t * a, uint32x2x4_t b) { vst4_u32(a, b); }
-// CHECK-LABEL: @test_vst4_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <1 x i64>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst4_u64(uint64_t * a, uint64x1x4_t b) { vst4_u64(a, b); }
-// CHECK-LABEL: @test_vst4_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst4_s8(int8_t * a, int8x8x4_t b) { vst4_s8(a, b); }
-// CHECK-LABEL: @test_vst4_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst4_s16(int16_t * a, int16x4x4_t b) { vst4_s16(a, b); }
-// CHECK-LABEL: @test_vst4_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4_s32(int32_t * a, int32x2x4_t b) { vst4_s32(a, b); } -// CHECK-LABEL: @test_vst4_s64( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] 
to <1 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <1 x i64> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4_s64(int64_t * a, int64x1x4_t b) { vst4_s64(a, b); } -// CHECK-LABEL: @test_vst4_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3 -// 
CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half> -// CHECK: call void @llvm.arm.neon.vst4.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f16(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 2) +// CHECK-NEXT: ret void +// void test_vst4_f16(float16_t * a, float16x4x4_t b) { vst4_f16(a, b); } -// CHECK-LABEL: @test_vst4_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], 
i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> -// CHECK: call void @llvm.arm.neon.vst4.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2f32(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4_f32(float32_t * a, float32x2x4_t b) { vst4_f32(a, b); } -// CHECK-LABEL: @test_vst4_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load 
<8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1) +// CHECK-NEXT: ret void +// void test_vst4_p8(poly8_t * a, poly8x8x4_t b) { vst4_p8(a, b); } -// CHECK-LABEL: @test_vst4_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> 
[[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 2) +// CHECK-NEXT: ret void +// void test_vst4_p16(poly16_t * a, poly16x4x4_t b) { vst4_p16(a, b); } -// CHECK-LABEL: @test_vst4q_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 
x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 7, i32 2) +// CHECK-NEXT: ret void +// void test_vst4q_lane_u16(uint16_t * a, 
uint16x8x4_t b) { vst4q_lane_u16(a, b, 7); } -// CHECK-LABEL: @test_vst4q_lane_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: 
[[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 3, i32 4) +// CHECK-NEXT: ret void +// void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) { vst4q_lane_u32(a, b, 3); } -// CHECK-LABEL: @test_vst4q_lane_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: 
[[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 7, i32 2) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) { vst4q_lane_s16(a, b, 7); } -// CHECK-LABEL: @test_vst4q_lane_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw 
%struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x 
i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], i32 3, i32 4) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) { vst4q_lane_s32(a, b, 3); } -// CHECK-LABEL: @test_vst4q_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 7, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = 
insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], <8 x half> [[TMP7]], i32 7, i32 2) +// CHECK-NEXT: ret void +// void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) { vst4q_lane_f16(a, b, 7); } -// CHECK-LABEL: @test_vst4q_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, 
ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] 
to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]], i32 3, i32 4) +// CHECK-NEXT: ret void +// void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) { vst4q_lane_f32(a, b, 3); } -// CHECK-LABEL: @test_vst4q_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_0_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_2_EXTRACT]], i32 
0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_4_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i32 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[B_COERCE_FCA_6_EXTRACT]], i32 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], i32 7, i32 2) +// CHECK-NEXT: ret void +// void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) { vst4q_lane_p16(a, b, 7); } -// CHECK-LABEL: @test_vst4_lane_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: call void 
@llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1) +// CHECK-NEXT: ret void +// void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) { vst4_lane_u8(a, b, 7); } -// CHECK-LABEL: @test_vst4_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_u16( +// CHECK-SAME: ptr noundef 
[[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2) +// CHECK-NEXT: ret void +// void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) { vst4_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vst4_lane_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <8 x 
i8> [[TMP9]] to <2 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4) +// CHECK-NEXT: ret void +// void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) { vst4_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vst4_lane_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> 
[[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1) +// CHECK-NEXT: ret void +// void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) { vst4_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vst4_lane_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2) +// CHECK-NEXT: ret void +// void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) { vst4_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vst4_lane_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> -// CHECK: call 
void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32 1, i32 4) +// CHECK-NEXT: ret void +// void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) { vst4_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vst4_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr 
[[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 3, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i32 3, i32 2) +// CHECK-NEXT: ret void +// void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) { vst4_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vst4_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr 
[[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x float> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i32 1, i32 4) +// CHECK-NEXT: ret void +// void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) { vst4_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vst4_lane_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], 
ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1) +// CHECK-NEXT: ret void +// void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) { vst4_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vst4_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: 
[[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> +// CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i32 3, i32 2) +// CHECK-NEXT: ret void +// void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) { vst4_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vsub_s8( -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define <8 x i8> @test_vsub_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) { return vsub_s8(a, b); } -// CHECK-LABEL: @test_vsub_s16( -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define <4 x i16> @test_vsub_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) { return vsub_s16(a, b); } -// CHECK-LABEL: @test_vsub_s32( -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define <2 x i32> @test_vsub_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// 
int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) { return vsub_s32(a, b); } -// CHECK-LABEL: @test_vsub_s64( -// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b -// CHECK: ret <1 x i64> [[SUB_I]] +// CHECK-LABEL: define <1 x i64> @test_vsub_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <1 x i64> [[SUB_I]] +// int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) { return vsub_s64(a, b); } -// CHECK-LABEL: @test_vsub_f32( -// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b -// CHECK: ret <2 x float> [[SUB_I]] +// CHECK-LABEL: define <2 x float> @test_vsub_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[B]] +// CHECK-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) { return vsub_f32(a, b); } -// CHECK-LABEL: @test_vsub_u8( -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define <8 x i8> @test_vsub_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) { return vsub_u8(a, b); } -// CHECK-LABEL: @test_vsub_u16( -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define <4 x i16> @test_vsub_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) { return vsub_u16(a, b); } -// CHECK-LABEL: @test_vsub_u32( -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define <2 x i32> @test_vsub_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) { return vsub_u32(a, b); } -// CHECK-LABEL: @test_vsub_u64( -// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b -// CHECK: ret <1 x i64> [[SUB_I]] +// CHECK-LABEL: define <1 x i64> @test_vsub_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <1 x i64> [[SUB_I]] +// uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) { return vsub_u64(a, b); } -// CHECK-LABEL: @test_vsubq_s8( -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define <16 x i8> @test_vsubq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) { return vsubq_s8(a, b); } -// CHECK-LABEL: @test_vsubq_s16( -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vsubq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 
x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) { return vsubq_s16(a, b); } -// CHECK-LABEL: @test_vsubq_s32( -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vsubq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) { return vsubq_s32(a, b); } -// CHECK-LABEL: @test_vsubq_s64( -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define <2 x i64> @test_vsubq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) { return vsubq_s64(a, b); } -// CHECK-LABEL: @test_vsubq_f32( -// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b -// CHECK: ret <4 x float> [[SUB_I]] +// CHECK-LABEL: define <4 x float> @test_vsubq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[B]] +// CHECK-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) { return vsubq_f32(a, b); } -// CHECK-LABEL: @test_vsubq_u8( -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define <16 x i8> @test_vsubq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[B]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) { return vsubq_u8(a, b); } -// CHECK-LABEL: @test_vsubq_u16( -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define <8 x i16> @test_vsubq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[B]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) { return vsubq_u16(a, b); } -// CHECK-LABEL: @test_vsubq_u32( -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define <4 x i32> @test_vsubq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[B]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) { return vsubq_u32(a, b); } -// CHECK-LABEL: @test_vsubq_u64( -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define <2 x i64> @test_vsubq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[B]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) { return vsubq_u64(a, b); } -// 
CHECK-LABEL: @test_vsubhn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[VSUBHN2_I]] +// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]] +// int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { return vsubhn_s16(a, b); } -// CHECK-LABEL: @test_vsubhn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[VSUBHN2_I]] +// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]] +// int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { return vsubhn_s32(a, b); } -// CHECK-LABEL: @test_vsubhn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[VSUBHN2_I]] +// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]] +// int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { 
return vsubhn_s64(a, b); } -// CHECK-LABEL: @test_vsubhn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[VSUBHN2_I]] +// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]] +// uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { return vsubhn_u16(a, b); } -// CHECK-LABEL: @test_vsubhn_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[VSUBHN2_I]] +// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]] +// uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { return vsubhn_u32(a, b); } -// CHECK-LABEL: @test_vsubhn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b -// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32) -// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[VSUBHN2_I]] +// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> +// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]] +// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32) +// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]] +// uint32x2_t 
test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
   return vsubhn_u64(a, b);
 }
-// CHECK-LABEL: @test_vsubl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
   return vsubl_s8(a, b);
 }
-// CHECK-LABEL: @test_vsubl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
   return vsubl_s16(a, b);
 }
-// CHECK-LABEL: @test_vsubl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
   return vsubl_s32(a, b);
 }
-// CHECK-LABEL: @test_vsubl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
   return vsubl_u8(a, b);
 }
-// CHECK-LABEL: @test_vsubl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
   return vsubl_u16(a, b);
 }
-// CHECK-LABEL: @test_vsubl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I5_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I5_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
   return vsubl_u32(a, b);
 }
-// CHECK-LABEL: @test_vsubw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
   return vsubw_s8(a, b);
 }
-// CHECK-LABEL: @test_vsubw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
   return vsubw_s16(a, b);
 }
-// CHECK-LABEL: @test_vsubw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
   return vsubw_s32(a, b);
 }
-// CHECK-LABEL: @test_vsubw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
   return vsubw_u8(a, b);
 }
-// CHECK-LABEL: @test_vsubw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
   return vsubw_u16(a, b);
 }
-// CHECK-LABEL: @test_vsubw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
   return vsubw_u32(a, b);
 }
-// CHECK-LABEL: @test_vtbl1_u8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
   return vtbl1_u8(a, b);
 }
-// CHECK-LABEL: @test_vtbl1_s8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
   return vtbl1_s8(a, b);
 }
-// CHECK-LABEL: @test_vtbl1_p8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
   return vtbl1_p8(a, b);
 }
-// CHECK-LABEL: @test_vtbl2_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
   return vtbl2_u8(a, b);
 }
-// CHECK-LABEL: @test_vtbl2_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
   return vtbl2_s8(a, b);
 }
-// CHECK-LABEL: @test_vtbl2_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
   return vtbl2_p8(a, b);
 }
-// CHECK-LABEL: @test_vtbl3_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_1_INSERT]], i64 [[A_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
   return vtbl3_u8(a, b);
 }
-// CHECK-LABEL: @test_vtbl3_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_1_INSERT]], i64 [[A_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
   return vtbl3_s8(a, b);
 }
-// CHECK-LABEL: @test_vtbl3_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_1_INSERT]], i64 [[A_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
   return vtbl3_p8(a, b);
 }
-// CHECK-LABEL: @test_vtbl4_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_1_INSERT]], i64 [[A_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_2_INSERT]], i64 [[A_COERCE_FCA_3_EXTRACT]], 3
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
   return vtbl4_u8(a, b);
 }
-// CHECK-LABEL: @test_vtbl4_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_1_INSERT]], i64 [[A_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_2_INSERT]], i64 [[A_COERCE_FCA_3_EXTRACT]], 3
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
   return vtbl4_s8(a, b);
 }
-// CHECK-LABEL: @test_vtbl4_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i64] poison, i64 [[A_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_0_INSERT]], i64 [[A_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_1_INSERT]], i64 [[A_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_2_INSERT]], i64 [[A_COERCE_FCA_3_EXTRACT]], 3
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
   return vtbl4_p8(a, b);
 }
-// CHECK-LABEL: @test_vtbx1_u8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
 uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vtbx1_u8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx1_s8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
 int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vtbx1_s8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx1_p8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
 poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
   return vtbx1_p8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx2_u8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
 uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
   return vtbx2_u8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx2_s8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
 int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
   return vtbx2_s8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx2_p8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[DOTFCA_1_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
 poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
   return vtbx2_p8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx3_u8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_1_INSERT]], i64 [[B_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
+//
 uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
   return vtbx3_u8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx3_s8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_1_INSERT]], i64 [[B_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
+//
 int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
   return vtbx3_s8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx3_p8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x i64] [[DOTFCA_1_INSERT]], i64 [[B_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[DOTFCA_2_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
+//
 poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
   return vtbx3_p8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx4_u8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_1_INSERT]], i64 [[B_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_2_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], 3
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
+//
 uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
   return vtbx4_u8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx4_s8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_1_INSERT]], i64 [[B_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_2_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], 3
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
+//
 int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
   return vtbx4_s8(a, b, c);
 }
-// CHECK-LABEL: @test_vtbx4_p8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x i64] poison, i64 [[B_COERCE_FCA_0_EXTRACT]], 0
+// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_0_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 1
+// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_1_INSERT]], i64 [[B_COERCE_FCA_2_EXTRACT]], 2
+// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x i64] [[DOTFCA_2_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], 3
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[DOTFCA_3_INSERT]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
+//
 poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
   return vtbx4_p8(a, b, c);
 }
-// CHECK: @test_vtrn_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META3]]
+// CHECK-NEXT: ret void
+//
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
   return vtrn_s8(a, b);
 }
-// CHECK: @test_vtrn_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META6]]
+// CHECK-NEXT: ret void
+//
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
   return vtrn_s16(a, b);
 }
-// CHECK: @test_vtrn_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META9]]
+// CHECK-NEXT: ret void
+//
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
   return vtrn_s32(a, b);
 }
-// CHECK: @test_vtrn_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META12]]
+// CHECK-NEXT: ret void
+//
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
   return vtrn_u8(a, b);
 }
-// CHECK: @test_vtrn_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META15]]
+// CHECK-NEXT: ret void
+//
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
   return vtrn_u16(a, b);
 }
-// CHECK: @test_vtrn_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vtrn_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr 
[[TMP4]], align 4, !alias.scope [[META18]] +// CHECK-NEXT: ret void +// uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); } -// CHECK: @test_vtrn_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META21]] +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: store <2 x float> [[VTRN1_I]], ptr [[TMP6]], align 4, !alias.scope [[META21]] +// CHECK-NEXT: ret void +// float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK: @test_vtrn_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]]) +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META24]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META24]] +// CHECK-NEXT: ret void +// poly8x8x2_t test_vtrn_p8(poly8x8_t 
a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK: @test_vtrn_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META27]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META27]] +// CHECK-NEXT: ret void +// poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK: @test_vtrnq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]]) +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META30]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META30]] +// CHECK-NEXT: ret void +// int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK: @test_vtrnq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = 
shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META33:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META33]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META33]] +// CHECK-NEXT: ret void +// int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { return vtrnq_s16(a, b); } -// CHECK: @test_vtrnq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META36:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META36]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META36]] +// CHECK-NEXT: ret void +// int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { return vtrnq_s32(a, b); } -// CHECK: @test_vtrnq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// 
CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META39:![0-9]+]]) +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META39]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META39]] +// CHECK-NEXT: ret void +// uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { return vtrnq_u8(a, b); } -// CHECK: @test_vtrnq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META42:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META42]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META42]] +// CHECK-NEXT: ret void +// uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { return vtrnq_u16(a, b); } -// CHECK: @test_vtrnq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], 
align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META45]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META45]] +// CHECK-NEXT: ret void +// uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { return vtrnq_u32(a, b); } -// CHECK: @test_vtrnq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META48:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META48]] +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VTRN1_I]], ptr [[TMP6]], align 4, !alias.scope [[META48]] +// CHECK-NEXT: ret void +// float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { return vtrnq_f32(a, b); } -// 
CHECK: @test_vtrnq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META51:![0-9]+]]) +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META51]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META51]] +// CHECK-NEXT: ret void +// poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { return vtrnq_p8(a, b); } -// CHECK: @test_vtrnq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META54:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META54]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope [[META54]] +// CHECK-NEXT: ret void +// poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) { return vtrnq_p16(a, b); } -// CHECK-LABEL: @test_vtst_s8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: 
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
 uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
   return vtst_s8(a, b);
 }
-// CHECK-LABEL: @test_vtst_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define <4 x i16> @test_vtst_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
 uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
   return vtst_s16(a, b);
 }
-// CHECK-LABEL: @test_vtst_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VTST_I]]
+// CHECK-LABEL: define <2 x i32> @test_vtst_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
+//
 uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
   return vtst_s32(a, b);
 }
-// CHECK-LABEL: @test_vtst_u8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtst_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
 uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
   return vtst_u8(a, b);
 }
-// CHECK-LABEL: @test_vtst_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define <4 x i16> @test_vtst_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
 uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
   return vtst_u16(a, b);
 }
-// CHECK-LABEL: @test_vtst_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VTST_I]]
+// CHECK-LABEL: define <2 x i32> @test_vtst_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VTST_I]]
+//
 uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
   return vtst_u32(a, b);
 }
-// CHECK-LABEL: @test_vtst_p8(
-// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VTST_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtst_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VTST_I]]
+//
 uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
   return vtst_p8(a, b);
 }
-// CHECK-LABEL: @test_vtst_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VTST_I]]
+// CHECK-LABEL: define <4 x i16> @test_vtst_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VTST_I]]
+//
 uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
   return vtst_p16(a, b);
 }
-// CHECK-LABEL: @test_vtstq_s8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
 uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
   return vtstq_s8(a, b);
 }
-// CHECK-LABEL: @test_vtstq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
 uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
   return vtstq_s16(a, b);
 }
-// CHECK-LABEL: @test_vtstq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VTST_I]]
+// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
+//
 uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
   return vtstq_s32(a, b);
 }
-// CHECK-LABEL: @test_vtstq_u8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
 uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
   return vtstq_u8(a, b);
 }
-// CHECK-LABEL: @test_vtstq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
 uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
   return vtstq_u16(a, b);
 }
-// CHECK-LABEL: @test_vtstq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VTST_I]]
+// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VTST_I]]
+//
 uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
   return vtstq_u32(a, b);
 }
-// CHECK-LABEL: @test_vtstq_p8(
-// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b
-// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VTST_I]]
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VTST_I]]
+//
 uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
   return vtstq_p8(a, b);
 }
-// CHECK-LABEL: @test_vtstq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b
-// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VTST_I]]
+// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VTST_I]]
+//
 uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
   return vtstq_p16(a, b);
 }
-// CHECK: @test_vuzp_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META57:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META57]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META57]]
+// CHECK-NEXT: ret void
+//
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
   return vuzp_s8(a, b);
 }
-// CHECK: @test_vuzp_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META60]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META60]]
+// CHECK-NEXT: ret void
+//
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
   return vuzp_s16(a, b);
 }
-// CHECK: @test_vuzp_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META63:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META63]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META63]]
+// CHECK-NEXT: ret void
+//
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
   return vuzp_s32(a, b);
 }
-// CHECK: @test_vuzp_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META66:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META66]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META66]]
+// CHECK-NEXT: ret void
+//
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
   return vuzp_u8(a, b);
 }
-// CHECK: @test_vuzp_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META69:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META69]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META69]]
+// CHECK-NEXT: ret void
+//
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
   return vuzp_u16(a, b);
 }
-// CHECK: @test_vuzp_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META72:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META72]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META72]]
+// CHECK-NEXT: ret void
+//
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
   return vuzp_u32(a, b);
 }
-// CHECK: @test_vuzp_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META75:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8>
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META75]]
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: store <2 x float> [[VUZP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META75]]
+// CHECK-NEXT: ret void
+//
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
   return vuzp_f32(a, b);
 }
-// CHECK: @test_vuzp_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META78:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META78]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META78]]
+// CHECK-NEXT: ret void
+//
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
   return vuzp_p8(a, b);
 }
-// CHECK: @test_vuzp_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META81:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META81]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META81]]
+// CHECK-NEXT: ret void
+//
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
   return vuzp_p16(a, b);
 }
-// CHECK: @test_vuzpq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META84:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META84]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META84]]
+// CHECK-NEXT: ret void
+//
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
   return vuzpq_s8(a, b);
 }
-// CHECK: @test_vuzpq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META87:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META87]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META87]]
+// CHECK-NEXT: ret void
+//
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
   return vuzpq_s16(a, b);
 }
-// CHECK: @test_vuzpq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META90:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META90]]
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META90]]
+// CHECK-NEXT: ret void
+//
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
   return vuzpq_s32(a, b);
 }
-// CHECK: @test_vuzpq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META93:![0-9]+]])
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META93]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META93]]
+// CHECK-NEXT: ret void
++//
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
   return vuzpq_u8(a, b);
 }
-// CHECK: @test_vuzpq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META96:![0-9]+]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: store <8 x
i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META96]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META96]] +// CHECK-NEXT: ret void +// uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { return vuzpq_u16(a, b); } -// CHECK: @test_vuzpq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vuzpq_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META99:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META99]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META99]] +// CHECK-NEXT: ret void +// uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { return vuzpq_u32(a, b); } -// CHECK: @test_vuzpq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vuzpq_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META102:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META102]] +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VUZP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META102]] +// CHECK-NEXT: ret void +// float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { return vuzpq_f32(a, b); } -// CHECK: @test_vuzpq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vuzpq_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META105:![0-9]+]]) +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META105]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META105]] +// CHECK-NEXT: ret void +// poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { return vuzpq_p8(a, b); } -// CHECK: @test_vuzpq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vuzpq_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META108:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VUZP_I:%.*]] = 
shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META108]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META108]] +// CHECK-NEXT: ret void +// poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { return vuzpq_p16(a, b); } -// CHECK: @test_vzip_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META111:![0-9]+]]) +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META111]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META111]] +// CHECK-NEXT: ret void +// int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { return vzip_s8(a, b); } -// CHECK: @test_vzip_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META114:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META114]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = 
shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META114]] +// CHECK-NEXT: ret void +// int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { return vzip_s16(a, b); } -// CHECK: @test_vzip_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META117:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META117]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META117]] +// CHECK-NEXT: ret void +// int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { return vzip_s32(a, b); } -// CHECK: @test_vzip_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META120:![0-9]+]]) +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META120]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META120]] +// CHECK-NEXT: ret void +// uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { return vzip_u8(a, b); } -// CHECK: @test_vzip_u16({{.*}} 
sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META123:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META123]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META123]] +// CHECK-NEXT: ret void +// uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); } -// CHECK: @test_vzip_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META126:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META126]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META126]] +// 
CHECK-NEXT: ret void +// uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); } -// CHECK: @test_vzip_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META129:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x float> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META129]] +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +// CHECK-NEXT: store <2 x float> [[VZIP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META129]] +// CHECK-NEXT: ret void +// float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); } -// CHECK: @test_vzip_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META132:![0-9]+]]) +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META132]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META132]] +// CHECK-NEXT: ret void +// poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); } 
-// CHECK: @test_vzip_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzip_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META135:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META135]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META135]] +// CHECK-NEXT: ret void +// poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); } -// CHECK: @test_vzipq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META138:![0-9]+]]) +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META138]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META138]] +// CHECK-NEXT: ret void +// int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); } -// CHECK: @test_vzipq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x 
i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META141:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META141]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META141]] +// CHECK-NEXT: ret void +// int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); } -// CHECK: @test_vzipq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META144:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META144]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META144]] +// CHECK-NEXT: ret void +// int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); } -// CHECK: @test_vzipq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VZIP_I:%.*]] = 
shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META147:![0-9]+]]) +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META147]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META147]] +// CHECK-NEXT: ret void +// uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); } -// CHECK: @test_vzipq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META150:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META150]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META150]] +// CHECK-NEXT: ret void +// uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); } -// CHECK: @test_vzipq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope 
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META153:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META153]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META153]] +// CHECK-NEXT: ret void +// uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); } -// CHECK: @test_vzipq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META156:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META156]] +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VZIP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META156]] +// CHECK-NEXT: ret void +// float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); } -// CHECK: 
@test_vzipq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META159:![0-9]+]]) +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META159]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META159]] +// CHECK-NEXT: ret void +// poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); } -// CHECK: @test_vzipq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vzipq_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META162:![0-9]+]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META162]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope [[META162]] +// CHECK-NEXT: ret void +// poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); } +//. 
+// CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+// CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"vtrn_s8: %agg.result"}
+// CHECK: [[META5]] = distinct !{[[META5]], !"vtrn_s8"}
+// CHECK: [[META6]] = !{[[META7:![0-9]+]]}
+// CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"vtrn_s16: %agg.result"}
+// CHECK: [[META8]] = distinct !{[[META8]], !"vtrn_s16"}
+// CHECK: [[META9]] = !{[[META10:![0-9]+]]}
+// CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]], !"vtrn_s32: %agg.result"}
+// CHECK: [[META11]] = distinct !{[[META11]], !"vtrn_s32"}
+// CHECK: [[META12]] = !{[[META13:![0-9]+]]}
+// CHECK: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]], !"vtrn_u8: %agg.result"}
+// CHECK: [[META14]] = distinct !{[[META14]], !"vtrn_u8"}
+// CHECK: [[META15]] = !{[[META16:![0-9]+]]}
+// CHECK: [[META16]] = distinct !{[[META16]], [[META17:![0-9]+]], !"vtrn_u16: %agg.result"}
+// CHECK: [[META17]] = distinct !{[[META17]], !"vtrn_u16"}
+// CHECK: [[META18]] = !{[[META19:![0-9]+]]}
+// CHECK: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]], !"vtrn_u32: %agg.result"}
+// CHECK: [[META20]] = distinct !{[[META20]], !"vtrn_u32"}
+// CHECK: [[META21]] = !{[[META22:![0-9]+]]}
+// CHECK: [[META22]] = distinct !{[[META22]], [[META23:![0-9]+]], !"vtrn_f32: %agg.result"}
+// CHECK: [[META23]] = distinct !{[[META23]], !"vtrn_f32"}
+// CHECK: [[META24]] = !{[[META25:![0-9]+]]}
+// CHECK: [[META25]] = distinct !{[[META25]], [[META26:![0-9]+]], !"vtrn_p8: %agg.result"}
+// CHECK: [[META26]] = distinct !{[[META26]], !"vtrn_p8"}
+// CHECK: [[META27]] = !{[[META28:![0-9]+]]}
+// CHECK: [[META28]] = distinct !{[[META28]], [[META29:![0-9]+]], !"vtrn_p16: %agg.result"}
+// CHECK: [[META29]] = distinct !{[[META29]], !"vtrn_p16"}
+// CHECK: [[META30]] = !{[[META31:![0-9]+]]}
+// CHECK: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]], !"vtrnq_s8: %agg.result"}
+// CHECK: [[META32]] = distinct !{[[META32]], !"vtrnq_s8"}
+// CHECK: [[META33]] = !{[[META34:![0-9]+]]}
+// CHECK: [[META34]] = distinct !{[[META34]], [[META35:![0-9]+]], !"vtrnq_s16: %agg.result"}
+// CHECK: [[META35]] = distinct !{[[META35]], !"vtrnq_s16"}
+// CHECK: [[META36]] = !{[[META37:![0-9]+]]}
+// CHECK: [[META37]] = distinct !{[[META37]], [[META38:![0-9]+]], !"vtrnq_s32: %agg.result"}
+// CHECK: [[META38]] = distinct !{[[META38]], !"vtrnq_s32"}
+// CHECK: [[META39]] = !{[[META40:![0-9]+]]}
+// CHECK: [[META40]] = distinct !{[[META40]], [[META41:![0-9]+]], !"vtrnq_u8: %agg.result"}
+// CHECK: [[META41]] = distinct !{[[META41]], !"vtrnq_u8"}
+// CHECK: [[META42]] = !{[[META43:![0-9]+]]}
+// CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"vtrnq_u16: %agg.result"}
+// CHECK: [[META44]] = distinct !{[[META44]], !"vtrnq_u16"}
+// CHECK: [[META45]] = !{[[META46:![0-9]+]]}
+// CHECK: [[META46]] = distinct !{[[META46]], [[META47:![0-9]+]], !"vtrnq_u32: %agg.result"}
+// CHECK: [[META47]] = distinct !{[[META47]], !"vtrnq_u32"}
+// CHECK: [[META48]] = !{[[META49:![0-9]+]]}
+// CHECK: [[META49]] = distinct !{[[META49]], [[META50:![0-9]+]], !"vtrnq_f32: %agg.result"}
+// CHECK: [[META50]] = distinct !{[[META50]], !"vtrnq_f32"}
+// CHECK: [[META51]] = !{[[META52:![0-9]+]]}
+// CHECK: [[META52]] = distinct !{[[META52]], [[META53:![0-9]+]], !"vtrnq_p8: %agg.result"}
+// CHECK: [[META53]] = distinct !{[[META53]], !"vtrnq_p8"}
+// CHECK: [[META54]] = !{[[META55:![0-9]+]]}
+// CHECK: [[META55]] = distinct !{[[META55]], [[META56:![0-9]+]], !"vtrnq_p16: %agg.result"}
+// CHECK: [[META56]] = distinct !{[[META56]], !"vtrnq_p16"}
+// CHECK: [[META57]] = !{[[META58:![0-9]+]]}
+// CHECK: [[META58]] = distinct !{[[META58]], [[META59:![0-9]+]], !"vuzp_s8: %agg.result"}
+// CHECK: [[META59]] = distinct !{[[META59]], !"vuzp_s8"}
+// CHECK: [[META60]] = !{[[META61:![0-9]+]]}
+// CHECK: [[META61]] = distinct !{[[META61]], [[META62:![0-9]+]], !"vuzp_s16: %agg.result"}
+// CHECK: [[META62]] = distinct !{[[META62]], !"vuzp_s16"}
+// CHECK: [[META63]] = !{[[META64:![0-9]+]]}
+// CHECK: [[META64]] = distinct !{[[META64]], [[META65:![0-9]+]], !"vuzp_s32: %agg.result"}
+// CHECK: [[META65]] = distinct !{[[META65]], !"vuzp_s32"}
+// CHECK: [[META66]] = !{[[META67:![0-9]+]]}
+// CHECK: [[META67]] = distinct !{[[META67]], [[META68:![0-9]+]], !"vuzp_u8: %agg.result"}
+// CHECK: [[META68]] = distinct !{[[META68]], !"vuzp_u8"}
+// CHECK: [[META69]] = !{[[META70:![0-9]+]]}
+// CHECK: [[META70]] = distinct !{[[META70]], [[META71:![0-9]+]], !"vuzp_u16: %agg.result"}
+// CHECK: [[META71]] = distinct !{[[META71]], !"vuzp_u16"}
+// CHECK: [[META72]] = !{[[META73:![0-9]+]]}
+// CHECK: [[META73]] = distinct !{[[META73]], [[META74:![0-9]+]], !"vuzp_u32: %agg.result"}
+// CHECK: [[META74]] = distinct !{[[META74]], !"vuzp_u32"}
+// CHECK: [[META75]] = !{[[META76:![0-9]+]]}
+// CHECK: [[META76]] = distinct !{[[META76]], [[META77:![0-9]+]], !"vuzp_f32: %agg.result"}
+// CHECK: [[META77]] = distinct !{[[META77]], !"vuzp_f32"}
+// CHECK: [[META78]] = !{[[META79:![0-9]+]]}
+// CHECK: [[META79]] = distinct !{[[META79]], [[META80:![0-9]+]], !"vuzp_p8: %agg.result"}
+// CHECK: [[META80]] = distinct !{[[META80]], !"vuzp_p8"}
+// CHECK: [[META81]] = !{[[META82:![0-9]+]]}
+// CHECK: [[META82]] = distinct !{[[META82]], [[META83:![0-9]+]], !"vuzp_p16: %agg.result"}
+// CHECK: [[META83]] = distinct !{[[META83]], !"vuzp_p16"}
+// CHECK: [[META84]] = !{[[META85:![0-9]+]]}
+// CHECK: [[META85]] = distinct !{[[META85]], [[META86:![0-9]+]], !"vuzpq_s8: %agg.result"}
+// CHECK: [[META86]] = distinct !{[[META86]], !"vuzpq_s8"}
+// CHECK: [[META87]] = !{[[META88:![0-9]+]]}
+// CHECK: [[META88]] = distinct !{[[META88]], [[META89:![0-9]+]], !"vuzpq_s16: %agg.result"}
+// CHECK: [[META89]] = distinct !{[[META89]], !"vuzpq_s16"}
+// CHECK: [[META90]] = !{[[META91:![0-9]+]]}
+// CHECK: [[META91]] = distinct !{[[META91]], [[META92:![0-9]+]], !"vuzpq_s32: %agg.result"}
+// CHECK: [[META92]] = distinct !{[[META92]], !"vuzpq_s32"}
+// CHECK: [[META93]] = !{[[META94:![0-9]+]]}
+// CHECK: [[META94]] = distinct !{[[META94]], [[META95:![0-9]+]], !"vuzpq_u8: %agg.result"}
+// CHECK: [[META95]] = distinct !{[[META95]], !"vuzpq_u8"}
+// CHECK: [[META96]] = !{[[META97:![0-9]+]]}
+// CHECK: [[META97]] = distinct !{[[META97]], [[META98:![0-9]+]], !"vuzpq_u16: %agg.result"}
+// CHECK: [[META98]] = distinct !{[[META98]], !"vuzpq_u16"}
+// CHECK: [[META99]] = !{[[META100:![0-9]+]]}
+// CHECK: [[META100]] = distinct !{[[META100]], [[META101:![0-9]+]], !"vuzpq_u32: %agg.result"}
+// CHECK: [[META101]] = distinct !{[[META101]], !"vuzpq_u32"}
+// CHECK: [[META102]] = !{[[META103:![0-9]+]]}
+// CHECK: [[META103]] = distinct !{[[META103]], [[META104:![0-9]+]], !"vuzpq_f32: %agg.result"}
+// CHECK: [[META104]] = distinct !{[[META104]], !"vuzpq_f32"}
+// CHECK: [[META105]] = !{[[META106:![0-9]+]]}
+// CHECK: [[META106]] = distinct !{[[META106]], [[META107:![0-9]+]], !"vuzpq_p8: %agg.result"}
+// CHECK: [[META107]] = distinct !{[[META107]], !"vuzpq_p8"}
+// CHECK: [[META108]] = !{[[META109:![0-9]+]]}
+// CHECK: [[META109]] = distinct !{[[META109]], [[META110:![0-9]+]], !"vuzpq_p16: %agg.result"}
+// CHECK: [[META110]] = distinct !{[[META110]], !"vuzpq_p16"}
+// CHECK: [[META111]] = !{[[META112:![0-9]+]]}
+// CHECK: [[META112]] = distinct !{[[META112]], [[META113:![0-9]+]], !"vzip_s8: %agg.result"}
+// CHECK: [[META113]] = distinct !{[[META113]], !"vzip_s8"}
+// CHECK: [[META114]] = !{[[META115:![0-9]+]]}
+// CHECK: [[META115]] = distinct !{[[META115]], [[META116:![0-9]+]], !"vzip_s16: %agg.result"}
+// CHECK: [[META116]] = distinct !{[[META116]], !"vzip_s16"}
+// CHECK: [[META117]] = !{[[META118:![0-9]+]]}
+// CHECK: [[META118]] = distinct !{[[META118]], [[META119:![0-9]+]], !"vzip_s32: %agg.result"}
+// CHECK: [[META119]] = distinct !{[[META119]], !"vzip_s32"}
+// CHECK: [[META120]] = !{[[META121:![0-9]+]]}
+// CHECK: [[META121]] = distinct !{[[META121]], [[META122:![0-9]+]], !"vzip_u8: %agg.result"}
+// CHECK: [[META122]] = distinct !{[[META122]], !"vzip_u8"}
+// CHECK: [[META123]] = !{[[META124:![0-9]+]]}
+// CHECK: [[META124]] = distinct !{[[META124]], [[META125:![0-9]+]], !"vzip_u16: %agg.result"}
+// CHECK: [[META125]] = distinct !{[[META125]], !"vzip_u16"}
+// CHECK: [[META126]] = !{[[META127:![0-9]+]]}
+// CHECK: [[META127]] = distinct !{[[META127]], [[META128:![0-9]+]], !"vzip_u32: %agg.result"}
+// CHECK: [[META128]] = distinct !{[[META128]], !"vzip_u32"}
+// CHECK: [[META129]] = !{[[META130:![0-9]+]]}
+// CHECK: [[META130]] = distinct !{[[META130]], [[META131:![0-9]+]], !"vzip_f32: %agg.result"}
+// CHECK: [[META131]] = distinct !{[[META131]], !"vzip_f32"}
+// CHECK: [[META132]] = !{[[META133:![0-9]+]]}
+// CHECK: [[META133]] = distinct !{[[META133]], [[META134:![0-9]+]], !"vzip_p8: %agg.result"}
+// CHECK: [[META134]] = distinct !{[[META134]], !"vzip_p8"}
+// CHECK: [[META135]] = !{[[META136:![0-9]+]]}
+// CHECK: [[META136]] = distinct !{[[META136]], [[META137:![0-9]+]], !"vzip_p16: %agg.result"}
+// CHECK: [[META137]] = distinct !{[[META137]], !"vzip_p16"}
+// CHECK: [[META138]] = !{[[META139:![0-9]+]]}
+// CHECK: [[META139]] = distinct !{[[META139]], [[META140:![0-9]+]], !"vzipq_s8: %agg.result"}
+// CHECK: [[META140]] = distinct !{[[META140]], !"vzipq_s8"}
+// CHECK: [[META141]] = !{[[META142:![0-9]+]]}
+// CHECK: [[META142]] = distinct !{[[META142]], [[META143:![0-9]+]], !"vzipq_s16: %agg.result"}
+// CHECK: [[META143]] = distinct !{[[META143]], !"vzipq_s16"}
+// CHECK: [[META144]] = !{[[META145:![0-9]+]]}
+// CHECK: [[META145]] = distinct !{[[META145]], [[META146:![0-9]+]], !"vzipq_s32: %agg.result"}
+// CHECK: [[META146]] = distinct !{[[META146]], !"vzipq_s32"}
+// CHECK: [[META147]] = !{[[META148:![0-9]+]]}
+// CHECK: [[META148]] = distinct !{[[META148]], [[META149:![0-9]+]], !"vzipq_u8: %agg.result"}
+// CHECK: [[META149]] = distinct !{[[META149]], !"vzipq_u8"}
+// CHECK: [[META150]] = !{[[META151:![0-9]+]]}
+// CHECK: [[META151]] = distinct !{[[META151]], [[META152:![0-9]+]], !"vzipq_u16: %agg.result"}
+// CHECK: [[META152]] = distinct !{[[META152]], !"vzipq_u16"}
+// CHECK: [[META153]] = !{[[META154:![0-9]+]]}
+// CHECK: [[META154]] = distinct !{[[META154]], [[META155:![0-9]+]], !"vzipq_u32: %agg.result"}
+// CHECK: [[META155]] = distinct !{[[META155]], !"vzipq_u32"}
+// CHECK: [[META156]] = !{[[META157:![0-9]+]]}
+// CHECK: [[META157]] = distinct !{[[META157]], [[META158:![0-9]+]], !"vzipq_f32: %agg.result"}
+// CHECK: [[META158]] = distinct !{[[META158]], !"vzipq_f32"}
+// CHECK: [[META159]] = !{[[META160:![0-9]+]]}
+// CHECK: [[META160]] = distinct !{[[META160]], [[META161:![0-9]+]], !"vzipq_p8: %agg.result"}
+// CHECK: [[META161]] = distinct !{[[META161]], !"vzipq_p8"}
+// CHECK: [[META162]] = !{[[META163:![0-9]+]]}
+// CHECK: [[META163]] = distinct !{[[META163]], [[META164:![0-9]+]], !"vzipq_p16: %agg.result"}
+// CHECK: [[META164]] = distinct !{[[META164]], !"vzipq_p16"}
+//.
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 51eb02be58692..51bfe212464cf 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1367,7 +1367,7 @@ void Intrinsic::emitBodyAsBuiltinCall() {
     LocalCK = ClassB;
 
   if (!getReturnType().isVoid() && !SRet)
-    S += "(" + RetVar.getType().str() + ") ";
+    S += "__builtin_bit_cast(" + RetVar.getType().str() + ", ";
 
   S += "__builtin_neon_" + mangleName(std::string(N), LocalCK) + "(";
 
@@ -1387,11 +1387,12 @@
         Type T2 = T;
         T2.makeOneVector();
         T2.makeInteger(8, /*Sign=*/true);
-        Cast = "(" + T2.str() + ")";
+        Cast = "__builtin_bit_cast(" + T2.str() + ", ";
       }
       for (unsigned J = 0; J < T.getNumVectors(); ++J)
-        S += Cast + V.getName() + ".val[" + utostr(J) + "], ";
+        S += Cast + V.getName() + ".val[" + utostr(J) + "]" +
+             (Cast.empty() ? ", " : "), ");
      continue;
    }
 
@@ -1399,14 +1400,16 @@
       Type CastToType = T;
 
       // Check if an explicit cast is needed.
-      if (CastToType.isVector() &&
-          (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling()))) {
-        CastToType.makeInteger(8, true);
-        Arg = "(" + CastToType.str() + ")" + Arg;
-      } else if (CastToType.isVector() && LocalCK == ClassI) {
-        if (CastToType.isInteger())
-          CastToType.makeSigned();
-        Arg = "(" + CastToType.str() + ")" + Arg;
+      if (CastToType.isVector()) {
+        if (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling())) {
+          CastToType.makeInteger(8, true);
+          Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+        } else if (LocalCK == ClassI) {
+          if (CastToType.isInteger()) {
+            CastToType.makeSigned();
+            Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+          }
+        }
       }
 
       S += Arg + ", ";
@@ -1420,6 +1423,9 @@
     S.pop_back();
     S.pop_back();
   }
+
+  if (!getReturnType().isVoid() && !SRet)
+    S += ")";
   S += ");";
 
   std::string RetExpr;
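The NeonEmitter.cpp change above swaps the C-style vector casts in the emitted arm_neon.h intrinsic bodies for __builtin_bit_cast, balancing the extra opening parenthesis by closing the bit_cast just before the final ");". As a rough sketch of the effect on generated code (vfoo_s16 is a made-up wrapper for illustration, not an intrinsic this patch touches; the real bodies are generated from the .td definitions), a ClassB-style body goes from

    // Before: C-style casts reinterpret the argument and result vectors.
    __ai int16x4_t vfoo_s16(int16x4_t __p0) {
      return (int16x4_t) __builtin_neon_vfoo_v((int8x8_t)__p0, 1);
    }

to

    // After: the same reinterpretation expressed with __builtin_bit_cast,
    // which operates on the object representation of same-sized types.
    __ai int16x4_t vfoo_s16(int16x4_t __p0) {
      return __builtin_bit_cast(
          int16x4_t,
          __builtin_neon_vfoo_v(__builtin_bit_cast(int8x8_t, __p0), 1));
    }

Those bit_cast round-trips through the byte-vector type are what surface as the extra bitcast instructions in the regenerated CHECK lines above.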
// %bb.0: // %entry +; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h +; CHECK-NEXT: ret +entry: + %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %0 +} + +define <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfmaq_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h +; CHECK-NEXT: ret +entry: + %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %0 +} + +define <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 { +; CHECK-LABEL: test_vfms_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h +; CHECK-NEXT: ret +entry: + %fneg.i = fneg <4 x half> %b + %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg.i, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %0 +} + +define <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfmsq_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h +; CHECK-NEXT: ret +entry: + %fneg.i = fneg <8 x half> %b + %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg.i, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %0 +} + +define <4 x half> @test_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 { +; CHECK-LABEL: test_vfma_lane_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[3] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> + %fmla2 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %lane, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %fmla2 +} + +define <8 x half> @test_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 { +; CHECK-LABEL: test_vfmaq_lane_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[3] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> + %fmla2 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %lane, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %fmla2 +} + +define <4 x half> @test_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfma_laneq_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> + %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %lane, <4 x half> %b, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %0 +} + +define <8 x half> @test_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfmaq_laneq_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[7] +; CHECK-NEXT: ret +entry: + %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> + %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %lane, <8 x 
half> %b, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %0 +} + +define <4 x half> @test_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 { +; CHECK-LABEL: test_vfma_n_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0] +; CHECK-NEXT: ret +entry: + %vecinit = insertelement <4 x half> poison, half %c, i64 0 + %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer + %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %0 +} + +define <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 { +; CHECK-LABEL: test_vfmaq_n_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0] +; CHECK-NEXT: ret +entry: + %vecinit = insertelement <8 x half> poison, half %c, i64 0 + %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer + %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %0 +} + +define half @test_vfmah_lane_f16(half %a, half %b, <4 x half> %c) #0 { +; CHECK-LABEL: test_vfmah_lane_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla h0, h1, v2.h[3] +; CHECK-NEXT: ret +entry: + %extract = extractelement <4 x half> %c, i64 3 + %0 = call half @llvm.experimental.constrained.fma.f16(half %b, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %0 +} + +define half @test_vfmah_laneq_f16(half %a, half %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfmah_laneq_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla h0, h1, v2.h[7] +; CHECK-NEXT: ret +entry: + %extract = extractelement <8 x half> %c, i64 7 + %0 = call half @llvm.experimental.constrained.fma.f16(half %b, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %0 +} + +define <4 x half> @test_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 { +; CHECK-LABEL: test_vfms_lane_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[3] +; CHECK-NEXT: ret +entry: + %fneg = fneg <4 x half> %b + %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> + %fmla2 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg, <4 x half> %lane, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %fmla2 +} + +define <8 x half> @test_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 { +; CHECK-LABEL: test_vfmsq_lane_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[3] +; CHECK-NEXT: ret +entry: + %fneg = fneg <8 x half> %b + %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> + %fmla2 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg, <8 x half> %lane, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %fmla2 +} + +define <4 x half> @test_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfms_laneq_f16: +; CHECK: // 
%bb.0: // %entry +; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret +entry: + %fneg = fneg <4 x half> %b + %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> + %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %lane, <4 x half> %fneg, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %0 +} + +define <8 x half> @test_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfmsq_laneq_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[7] +; CHECK-NEXT: ret +entry: + %fneg = fneg <8 x half> %b + %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> + %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %lane, <8 x half> %fneg, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %0 +} + +define <4 x half> @test_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 { +; CHECK-LABEL: test_vfms_n_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0] +; CHECK-NEXT: ret +entry: + %fneg = fneg <4 x half> %b + %vecinit = insertelement <4 x half> poison, half %c, i64 0 + %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer + %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg, <4 x half> %vecinit3, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <4 x half> %0 +} + +define <8 x half> @test_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 { +; CHECK-LABEL: test_vfmsq_n_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2 +; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0] +; CHECK-NEXT: ret +entry: + %fneg = fneg <8 x half> %b + %vecinit = insertelement <8 x half> poison, half %c, i64 0 + %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer + %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg, <8 x half> %vecinit7, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret <8 x half> %0 +} + +define half @test_vfmsh_lane_f16(half %a, half %b, <4 x half> %c) #0 { +; CHECK-LABEL: test_vfmsh_lane_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fneg s1, s1 +; CHECK-NEXT: fcvt h1, s1 +; CHECK-NEXT: fmla h0, h1, v2.h[3] +; CHECK-NEXT: ret +entry: + %conv = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") #1 + %fneg = fneg float %conv + %0 = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %fneg, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + %extract = extractelement <4 x half> %c, i64 3 + %1 = call half @llvm.experimental.constrained.fma.f16(half %0, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %1 +} + +define half @test_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c) #0 { +; CHECK-LABEL: test_vfmsh_laneq_f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvt s1, h1 +; CHECK-NEXT: fneg s1, s1 +; CHECK-NEXT: fcvt h1, s1 +; CHECK-NEXT: fmla h0, h1, v2.h[7] +; CHECK-NEXT: ret +entry: + %conv = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") #1 + %fneg = fneg float %conv + %0 = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %fneg, metadata 
!"round.tonearest", metadata !"fpexcept.strict") #1 + %extract = extractelement <8 x half> %c, i64 7 + %1 = call half @llvm.experimental.constrained.fma.f16(half %0, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %1 +} + +attributes #0 = { noinline nounwind strictfp "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" } +attributes #1 = { strictfp } From e47d3a30881794c99a6de3aa9adc806bf3dc79f6 Mon Sep 17 00:00:00 2001 From: Peter Smith Date: Tue, 1 Apr 2025 09:49:27 +0100 Subject: [PATCH 0212/1029] [LLD][AArch64] Increase alignment of AArch64AbsLongThunk to 8 (#133738) This permits an AArch64AbsLongThunk to be used in an environment where unaligned accesses are disabled. The AArch64AbsLongThunk does a load of an 8-byte address. When unaligned accesses are disabled this address must be 8-byte aligned. The vast majority of AArch64 systems will have unaligned accesses enabled in userspace. However, after a reset, before the MMU has been enabled, all memory accesses are to "device" memory, which requires aligned accesses. In systems with multi-stage boot loaders a thunk may be required to a later stage before the MMU has been enabled. As we only want to increase the alignment when the ldr is used we delay the increase in thunk alignment until we know we are going to write an ldr. We also need to account for the ThunkSection alignment increase when this happens. In some of the test updates, particularly those with shared CHECK lines with position independent thunks it was easier to ensure that the thunks started at an 8-byte aligned address in all cases. --- lld/ELF/SyntheticSections.cpp | 8 +- lld/ELF/Thunks.cpp | 3 + lld/test/ELF/aarch64-call26-thunk.s | 16 +- .../ELF/aarch64-cortex-a53-843419-thunk.s | 2 +- lld/test/ELF/aarch64-jump26-thunk.s | 15 +- .../ELF/aarch64-range-thunk-extension-plt32.s | 10 +- lld/test/ELF/aarch64-thunk-align.s | 42 +++++ lld/test/ELF/aarch64-thunk-bti-multipass.s | 4 +- lld/test/ELF/aarch64-thunk-bti.s | 150 +++++++++--------- 9 files changed, 151 insertions(+), 99 deletions(-) create mode 100644 lld/test/ELF/aarch64-thunk-align.s diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index b03c4282ab1aa..3bb9815336a7c 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -4317,14 +4317,20 @@ InputSection *ThunkSection::getTargetInputSection() const { bool ThunkSection::assignOffsets() { uint64_t off = 0; + bool changed = false; for (Thunk *t : thunks) { + if (t->alignment > addralign) { + addralign = t->alignment; + changed = true; + } off = alignToPowerOf2(off, t->alignment); t->setOffset(off); uint32_t size = t->size(); t->getThunkTargetSym()->size = size; off += size; } - bool changed = off != size; + if (off != size) + changed = true; size = off; return changed; } diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index 0008ee3a0de67..bad1b4b85735a 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -674,6 +674,9 @@ void AArch64ABSLongThunk::addSymbols(ThunkSection &isec) { void AArch64ABSLongThunk::addLongMapSyms() { addSymbol("$d", STT_NOTYPE, 8, *tsec); + // The ldr in the long Thunk requires 8-byte alignment when + // unaligned accesses are disabled. 
+ alignment = 8; } void AArch64ABSXOLongThunk::writeLong(uint8_t *buf) { diff --git a/lld/test/ELF/aarch64-call26-thunk.s b/lld/test/ELF/aarch64-call26-thunk.s index 8c3034f568b39..4200bb09eaef7 100644 --- a/lld/test/ELF/aarch64-call26-thunk.s +++ b/lld/test/ELF/aarch64-call26-thunk.s @@ -9,13 +9,13 @@ _start: bl big -// CHECK: Disassembly of section .text: +// CHECK-LABEL: <_start>: +// CHECK-NEXT: 210120: bl 0x210128 <__AArch64AbsLongThunk_big> +// CHECK-NEXT: udf #0x0 // CHECK-EMPTY: -// CHECK-NEXT: <_start>: -// CHECK-NEXT: 210120: bl 0x210124 -// CHECK: <__AArch64AbsLongThunk_big>: -// CHECK-NEXT: 210124: ldr x16, 0x21012c -// CHECK-NEXT: 210128: br x16 -// CHECK-NEXT: 21012c: 00 00 00 00 .word 0x00000000 -// CHECK-NEXT: 210130: 10 00 00 00 .word 0x00000010 +// CHECK-LABEL: <__AArch64AbsLongThunk_big>: +// CHECK-NEXT: 210128: ldr x16, 0x210130 <__AArch64AbsLongThunk_big+0x8> +// CHECK-NEXT: br x16 +// CHECK-NEXT: 00 00 00 00 .word 0x00000000 +// CHECK-NEXT: 10 00 00 00 .word 0x00000010 diff --git a/lld/test/ELF/aarch64-cortex-a53-843419-thunk.s b/lld/test/ELF/aarch64-cortex-a53-843419-thunk.s index 484de11ac5541..4fca2113c39a6 100644 --- a/lld/test/ELF/aarch64-cortex-a53-843419-thunk.s +++ b/lld/test/ELF/aarch64-cortex-a53-843419-thunk.s @@ -24,7 +24,7 @@ _start: /// Thunk to far_away, size 16-bytes goes here. .section .text.02, "ax", %progbits - .space 4096 - 28 + .space 4096 - 32 /// Erratum sequence will only line up at address 0 modulo 0xffc when /// Thunk is inserted. diff --git a/lld/test/ELF/aarch64-jump26-thunk.s b/lld/test/ELF/aarch64-jump26-thunk.s index 7fea5eebc3668..3c9ce69920efc 100644 --- a/lld/test/ELF/aarch64-jump26-thunk.s +++ b/lld/test/ELF/aarch64-jump26-thunk.s @@ -11,10 +11,11 @@ _start: // CHECK: Disassembly of section .text: // CHECK-EMPTY: -// CHECK-NEXT: <_start>: -// CHECK-NEXT: 210120: b 0x210124 -// CHECK: <__AArch64AbsLongThunk_big>: -// CHECK-NEXT: 210124: ldr x16, 0x21012c -// CHECK-NEXT: 210128: br x16 -// CHECK-NEXT: 21012c: 00 00 00 00 .word 0x00000000 -// CHECK-NEXT: 210130: 10 00 00 00 .word 0x00000010 +// CHECK-LABEL: <_start>: +// CHECK-NEXT: 210120: b 0x210128 +// CHECK-NEXT: udf #0x0 +// CHECK-LABEL: <__AArch64AbsLongThunk_big>: +// CHECK-NEXT: 210128: ldr x16, 0x210130 +// CHECK-NEXT: br x16 +// CHECK-NEXT: 00 00 00 00 .word 0x00000000 +// CHECK-NEXT: 10 00 00 00 .word 0x00000010 diff --git a/lld/test/ELF/aarch64-range-thunk-extension-plt32.s b/lld/test/ELF/aarch64-range-thunk-extension-plt32.s index 1d09012a2295c..9ebf7f5c69526 100644 --- a/lld/test/ELF/aarch64-range-thunk-extension-plt32.s +++ b/lld/test/ELF/aarch64-range-thunk-extension-plt32.s @@ -9,14 +9,14 @@ // The word should be an offset to the range extension thunk. // CHECK-LABEL: <_start>: -// CHECK-NEXT: 10000: 04 00 00 00 .word 0x00000004 +// CHECK-NEXT: 10000: 08 00 00 00 .word 0x00000008 // The thunk redirects to the address of callee. 
 // CHECK-LABEL: <__AArch64AbsLongThunk_callee>:
-// CHECK-NEXT: 10004: ldr x16, 0x1000c <__AArch64AbsLongThunk_callee+0x8>
-// CHECK-NEXT: 10008: br x16
-// CHECK-NEXT: 1000c: 00 00 00 00 .word 0x00000000
-// CHECK-NEXT: 10010: 02 00 00 00 .word 0x00000002
+// CHECK-NEXT: 10008: ldr x16, 0x10010 <__AArch64AbsLongThunk_callee+0x8>
+// CHECK-NEXT: br x16
+// CHECK-NEXT: 00 00 00 00 .word 0x00000000
+// CHECK-NEXT: 02 00 00 00 .word 0x00000002
 
 // CHECK-LABEL: <callee>:
 // CHECK-NEXT: 200000000: ret
diff --git a/lld/test/ELF/aarch64-thunk-align.s b/lld/test/ELF/aarch64-thunk-align.s
new file mode 100644
index 0000000000000..425a00c7564a4
--- /dev/null
+++ b/lld/test/ELF/aarch64-thunk-align.s
@@ -0,0 +1,42 @@
+// REQUIRES: aarch64
+// RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t
+// RUN: ld.lld -Ttext=0x12000 -defsym long=0x10000000 -defsym short=0x8012004 -defsym short2=0x8012008 -defsym short3=0x801200c %t -o %t.exe
+// RUN: llvm-objdump -d --no-show-raw-insn %t.exe | FileCheck %s
+
+/// The AArch64AbsLongThunk requires 8-byte alignment just in case unaligned
+/// accesses are disabled. This increases the thunk section alignment to 8,
+/// and the alignment of the AArch64AbsLongThunk to 8. The short thunk form
+/// can still use 4-byte alignment.
+.text
+.type _start, %function
+.globl _start
+_start:
+ b short
+ b short2
+ b short3
+ b long
+ nop
+
+// CHECK-LABEL: <_start>:
+// CHECK-NEXT: 12000: b 0x12018 <__AArch64AbsLongThunk_short>
+// CHECK-NEXT: b 0x1201c <__AArch64AbsLongThunk_short2>
+// CHECK-NEXT: b 0x12020 <__AArch64AbsLongThunk_short3>
+// CHECK-NEXT: b 0x12028 <__AArch64AbsLongThunk_long>
+// CHECK-NEXT: nop
+// CHECK-NEXT: udf #0x0
+// CHECK-EMPTY:
+// CHECK-LABEL: <__AArch64AbsLongThunk_short>:
+// CHECK-NEXT: 12018: b 0x8012004 <__AArch64AbsLongThunk_long+0x7ffffdc>
+// CHECK-EMPTY:
+// CHECK-LABEL: <__AArch64AbsLongThunk_short2>:
+// CHECK-NEXT: 1201c: b 0x8012008 <__AArch64AbsLongThunk_long+0x7ffffe0>
+// CHECK-EMPTY:
+// CHECK-LABEL: <__AArch64AbsLongThunk_short3>:
+// CHECK-NEXT: 12020: b 0x801200c <__AArch64AbsLongThunk_long+0x7ffffe4>
+// CHECK-NEXT: udf #0x0
+// CHECK-EMPTY:
+// CHECK-LABEL: <__AArch64AbsLongThunk_long>:
+// CHECK-NEXT: 12028: ldr x16, 0x12030 <__AArch64AbsLongThunk_long+0x8>
+// CHECK-NEXT: br x16
+// CHECK-NEXT: 00 00 00 10 .word 0x10000000
+// CHECK-NEXT: 00 00 00 00 .word 0x00000000
diff --git a/lld/test/ELF/aarch64-thunk-bti-multipass.s b/lld/test/ELF/aarch64-thunk-bti-multipass.s
index 4f0d7343cc6bf..f2ff914fb850d 100644
--- a/lld/test/ELF/aarch64-thunk-bti-multipass.s
+++ b/lld/test/ELF/aarch64-thunk-bti-multipass.s
@@ -38,10 +38,10 @@ _start:
 /// and will need a long branch thunk, which in turn needs a BTI landing pad.
 
 // CHECK-LABEL: <_start>:
-// CHECK-NEXT: 10001000: bl 0x10002004 <__AArch64AbsLongThunk_fn1>
+// CHECK-NEXT: 10001000: bl 0x10002008 <__AArch64AbsLongThunk_fn1>
 
 // CHECK-LABEL: <__AArch64AbsLongThunk_fn1>:
-// CHECK-NEXT: 10002004: ldr x16, 0x1000200c <__AArch64AbsLongThunk_fn1+0x8>
+// CHECK-NEXT: 10002008: ldr x16, 0x10002010 <__AArch64AbsLongThunk_fn1+0x8>
 // CHECK-NEXT: br x16
 // CHECK-NEXT: 00 30 00 18 .word 0x18003000
 // CHECK-NEXT: 00 00 00 00 .word 0x00000000
diff --git a/lld/test/ELF/aarch64-thunk-bti.s b/lld/test/ELF/aarch64-thunk-bti.s
index a447fe4ee9274..0672ad75d8094 100644
--- a/lld/test/ELF/aarch64-thunk-bti.s
+++ b/lld/test/ELF/aarch64-thunk-bti.s
@@ -52,19 +52,17 @@ _start:
 bl via_plt
 /// We cannot add landing pads for absolute symbols.
bl absolute - /// padding so that we require thunks that can be placed after this section. /// The thunks are close enough to the target to be short. + .balign 8 .space 0x1000 // CHECK-PADS-LABEL: <_start>: -// CHECK-PADS-NEXT: 10001000: bl 0x1000203c -// CHECK-PADS-NEXT: bl 0x10002040 +// CHECK-PADS-NEXT: 10001000: bl 0x10002040 // CHECK-PADS-NEXT: bl 0x10002044 // CHECK-PADS-NEXT: bl 0x10002048 // CHECK-PADS-NEXT: bl 0x1000204c // CHECK-PADS-NEXT: bl 0x10002050 -// CHECK-PADS-NEXT: b 0x10002050 // CHECK-PADS-NEXT: bl 0x10002054 // CHECK-PADS-NEXT: b 0x10002054 // CHECK-PADS-NEXT: bl 0x10002058 @@ -72,73 +70,75 @@ _start: // CHECK-PADS-NEXT: bl 0x1000205c // CHECK-PADS-NEXT: b 0x1000205c // CHECK-PADS-NEXT: bl 0x10002060 +// CHECK-PADS-NEXT: b 0x10002060 // CHECK-PADS-NEXT: bl 0x10002064 +// CHECK-PADS-NEXT: bl 0x10002068 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 1000203c: b 0x18001000 +// CHECK-NEXT: 10002040: b 0x18001000 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 10002040: b 0x18001008 +// CHECK-NEXT: 10002044: b 0x18001008 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 10002044: b 0x18001010 +// CHECK-NEXT: 10002048: b 0x18001010 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 10002048: b 0x18001018 +// CHECK-NEXT: 1000204c: b 0x18001018 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 1000204c: b 0x18001020 +// CHECK-NEXT: 10002050: b 0x18001020 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 10002050: b 0x18001038 +// CHECK-NEXT: 10002054: b 0x18001038 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 10002054: b 0x18001034 +// CHECK-NEXT: 10002058: b 0x18001034 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 10002058: b 0x18001040 +// CHECK-NEXT: 1000205c: b 0x18001040 // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 1000205c: b 0x18001050 +// CHECK-NEXT: 10002060: b 0x18001050 // CHECK-LABEL: <__AArch64ADRPThunk_via_plt>: -// CHECK-NEXT: 10002060: b 0x18001080 +// CHECK-NEXT: 10002064: b 0x18001080 // CHECK-LABEL: <__AArch64ADRPThunk_absolute>: -// CHECK-NEXT: 10002064: b 0x18001098 +// CHECK-NEXT: 10002068: b 0x18001098 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 1000203c: b 0x18001000 +// CHECK-EXE-NEXT: 10002040: b 0x18001000 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 10002040: b 0x18001008 +// CHECK-EXE-NEXT: 10002044: b 0x18001008 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 10002044: b 0x18001010 +// CHECK-EXE-NEXT: 10002048: b 0x18001010 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 10002048: b 0x18001018 +// CHECK-EXE-NEXT: 1000204c: b 0x18001018 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 1000204c: b 0x18001020 +// CHECK-EXE-NEXT: 10002050: b 0x18001020 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 10002050: b 0x18001038 +// CHECK-EXE-NEXT: 10002054: b 0x18001038 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 10002054: b 0x18001034 +// CHECK-EXE-NEXT: 10002058: b 0x18001034 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 10002058: b 0x18001040 +// CHECK-EXE-NEXT: 1000205c: b 0x18001040 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 1000205c: b 0x18001050 +// CHECK-EXE-NEXT: 10002060: b 0x18001050 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_via_plt>: -// CHECK-EXE-NEXT: 10002060: b 0x18001080 +// CHECK-EXE-NEXT: 10002064: b 0x18001080 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_absolute>: -// CHECK-EXE-NEXT: 10002064: ldr 
x16, 0x1000206c <__AArch64AbsLongThunk_absolute+0x8> +// CHECK-EXE-NEXT: 10002068: ldr x16, 0x10002070 <__AArch64AbsLongThunk_absolute+0x8> // CHECK-EXE-NEXT: br x16 // CHECK-EXE-NEXT: 00 00 00 f0 .word 0xf0000000 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000 @@ -278,21 +278,21 @@ long_calls: bl via_plt /// We cannot add landing pads for absolute symbols. bl absolute - + .balign 8 /// PLT entries have BTI at start. // CHECK-LABEL: : // CHECK-NEXT: bti c // CHECK-NEXT: adrp x16, 0x30000000 -// CHECK-NEXT: ldr x17, [x16, #0x198] -// CHECK-NEXT: add x16, x16, #0x198 +// CHECK-NEXT: ldr x17, [x16, #0x1a0] +// CHECK-NEXT: add x16, x16, #0x1a0 // CHECK-NEXT: br x17 // CHECK-NEXT: nop // CHECK: : // CHECK-NEXT: bti c // CHECK-NEXT: adrp x16, 0x30000000 -// CHECK-NEXT: ldr x17, [x16, #0x1a0] -// CHECK-NEXT: add x16, x16, #0x1a0 +// CHECK-NEXT: ldr x17, [x16, #0x1a8] +// CHECK-NEXT: add x16, x16, #0x1a8 // CHECK-NEXT: br x17 // CHECK-NEXT: nop @@ -305,25 +305,25 @@ long_calls: // CHECK-EXE-NEXT: nop // CHECK-LABEL: : -// CHECK-NEXT: 30000000: bl 0x3000003c <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x30000048 <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x30000054 <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x30000060 <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x3000006c <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x30000078 <__AArch64ADRPThunk_> -// CHECK-NEXT: b 0x30000078 <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x30000084 <__AArch64ADRPThunk_> -// CHECK-NEXT: b 0x30000084 <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x30000090 <__AArch64ADRPThunk_> -// CHECK-NEXT: b 0x30000090 <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x3000009c <__AArch64ADRPThunk_> -// CHECK-NEXT: b 0x3000009c <__AArch64ADRPThunk_> -// CHECK-NEXT: bl 0x300000a8 <__AArch64ADRPThunk_via_plt> -// CHECK-NEXT: bl 0x300000b4 <__AArch64ADRPThunk_absolute> +// CHECK-NEXT: 30000000: bl 0x30000040 <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x3000004c <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x30000058 <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x30000064 <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x30000070 <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x3000007c <__AArch64ADRPThunk_> +// CHECK-NEXT: b 0x3000007c <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x30000088 <__AArch64ADRPThunk_> +// CHECK-NEXT: b 0x30000088 <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x30000094 <__AArch64ADRPThunk_> +// CHECK-NEXT: b 0x30000094 <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x300000a0 <__AArch64ADRPThunk_> +// CHECK-NEXT: b 0x300000a0 <__AArch64ADRPThunk_> +// CHECK-NEXT: bl 0x300000ac <__AArch64ADRPThunk_via_plt> +// CHECK-NEXT: bl 0x300000b8 <__AArch64ADRPThunk_absolute> /// bti_c_target. // CHECK-LABEL: <__AArch64ADRPThunk_>: -// CHECK-NEXT: 3000003c: adrp x16, 0x18001000 +// CHECK-NEXT: 30000040: adrp x16, 0x18001000 // CHECK-NEXT: add x16, x16, #0x0 // CHECK-NEXT: br x16 /// bti_j_target. 
@@ -378,84 +378,84 @@ long_calls: // CHECK-NEXT: br x16 // CHECK-EXE-LABEL: : -// CHECK-EXE-NEXT: 30000000: bl 0x3000003c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x3000004c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x3000005c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x3000006c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x3000007c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x3000008c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: b 0x3000008c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x3000009c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: b 0x3000009c <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x300000ac <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: b 0x300000ac <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x300000bc <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: b 0x300000bc <__AArch64AbsLongThunk_> -// CHECK-EXE-NEXT: bl 0x300000cc <__AArch64AbsLongThunk_via_plt> -// CHECK-EXE-NEXT: bl 0x300000dc <__AArch64AbsLongThunk_absolute> - -// CHECK-EXE-LABEL: 000000003000003c <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 3000003c: ldr x16, 0x30000044 <__AArch64AbsLongThunk_+0x8> +// CHECK-EXE-NEXT: 30000000: bl 0x30000040 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x30000050 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x30000060 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x30000070 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x30000080 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x30000090 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: b 0x30000090 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x300000a0 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: b 0x300000a0 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x300000b0 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: b 0x300000b0 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x300000c0 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: b 0x300000c0 <__AArch64AbsLongThunk_> +// CHECK-EXE-NEXT: bl 0x300000d0 <__AArch64AbsLongThunk_via_plt> +// CHECK-EXE-NEXT: bl 0x300000e0 <__AArch64AbsLongThunk_absolute> + +// CHECK-EXE-LABEL: 0000000030000040 <__AArch64AbsLongThunk_>: +// CHECK-EXE-NEXT: 30000040: ldr x16, 0x30000048 <__AArch64AbsLongThunk_+0x8> // CHECK-EXE-NEXT: br x16 // CHECK-EXE-NEXT: 00 10 00 18 .word 0x18001000 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 3000004c: ldr x16, 0x30000054 <__AArch64AbsLongThunk_+0x8> +// CHECK-EXE-NEXT: 30000050: ldr x16, 0x30000058 <__AArch64AbsLongThunk_+0x8> // CHECK-EXE-NEXT: br x16 // CHECK-EXE-NEXT: 08 10 00 18 .word 0x18001008 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 3000005c: ldr x16, 0x30000064 <__AArch64AbsLongThunk_+0x8> +// CHECK-EXE-NEXT: 30000060: ldr x16, 0x30000068 <__AArch64AbsLongThunk_+0x8> // CHECK-EXE-NEXT: br x16 // CHECK-EXE-NEXT: 10 10 00 18 .word 0x18001010 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 3000006c: ldr x16, 0x30000074 <__AArch64AbsLongThunk_+0x8> +// CHECK-EXE-NEXT: 30000070: ldr x16, 0x30000078 <__AArch64AbsLongThunk_+0x8> // CHECK-EXE-NEXT: br x16 // CHECK-EXE-NEXT: 18 10 00 18 .word 0x18001018 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>: -// CHECK-EXE-NEXT: 3000007c: ldr x16, 0x30000084 <__AArch64AbsLongThunk_+0x8> +// CHECK-EXE-NEXT: 30000080: ldr x16, 0x30000088 <__AArch64AbsLongThunk_+0x8> // CHECK-EXE-NEXT: br x16 // CHECK-EXE-NEXT: 20 10 00 18 .word 
0x18001020
 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000
 
 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>:
-// CHECK-EXE-NEXT: 3000008c: ldr x16, 0x30000094 <__AArch64AbsLongThunk_+0x8>
+// CHECK-EXE-NEXT: 30000090: ldr x16, 0x30000098 <__AArch64AbsLongThunk_+0x8>
 // CHECK-EXE-NEXT: br x16
 // CHECK-EXE-NEXT: 28 10 00 18 .word 0x18001028
 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000
 
 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>:
-// CHECK-EXE-NEXT: 3000009c: ldr x16, 0x300000a4 <__AArch64AbsLongThunk_+0x8>
+// CHECK-EXE-NEXT: 300000a0: ldr x16, 0x300000a8 <__AArch64AbsLongThunk_+0x8>
 // CHECK-EXE-NEXT: br x16
 // CHECK-EXE-NEXT: 30 10 00 18 .word 0x18001030
 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000
 
 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>:
-// CHECK-EXE-NEXT: 300000ac: ldr x16, 0x300000b4 <__AArch64AbsLongThunk_+0x8>
+// CHECK-EXE-NEXT: 300000b0: ldr x16, 0x300000b8 <__AArch64AbsLongThunk_+0x8>
 // CHECK-EXE-NEXT: br x16
 // CHECK-EXE-NEXT: 3c 10 00 18 .word 0x1800103c
 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000
 
 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_>:
-// CHECK-EXE-NEXT: 300000bc: ldr x16, 0x300000c4 <__AArch64AbsLongThunk_+0x8>
+// CHECK-EXE-NEXT: 300000c0: ldr x16, 0x300000c8 <__AArch64AbsLongThunk_+0x8>
 // CHECK-EXE-NEXT: br x16
 // CHECK-EXE-NEXT: 44 10 00 18 .word 0x18001044
 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000
 
 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_via_plt>:
-// CHECK-EXE-NEXT: 300000cc: ldr x16, 0x300000d4 <__AArch64AbsLongThunk_via_plt+0x8>
+// CHECK-EXE-NEXT: 300000d0: ldr x16, 0x300000d8 <__AArch64AbsLongThunk_via_plt+0x8>
 // CHECK-EXE-NEXT: br x16
 // CHECK-EXE-NEXT: 80 10 00 18 .word 0x18001080
 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000
 
 // CHECK-EXE-LABEL: <__AArch64AbsLongThunk_absolute>:
-// CHECK-EXE-NEXT: 300000dc: ldr x16, 0x300000e4 <__AArch64AbsLongThunk_absolute+0x8>
+// CHECK-EXE-NEXT: 300000e0: ldr x16, 0x300000e8 <__AArch64AbsLongThunk_absolute+0x8>
 // CHECK-EXE-NEXT: br x16
 // CHECK-EXE-NEXT: 00 00 00 f0 .word 0xf0000000
 // CHECK-EXE-NEXT: 00 00 00 00 .word 0x00000000

From 197ead75fb7a3c57f7a41cdbcff48ef3841e3832 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 1 Apr 2025 09:57:39 +0100
Subject: [PATCH 0213/1029] [X86] splitVector - split concat_vectors(a,b,c,d) -> concat_vectors(a,b) + concat_vectors(c,d) (#133753)

Similar to what we already do for build_vectors during subvector
extraction, when splitting concat_vectors nodes, attempt to create a
pair of half-size concat_vectors nodes to see if these can fold.
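To make the transform concrete: concatenating four equal-size parts in
one step produces exactly the same element order as first concatenating
each half. A minimal standalone C++ sketch of that identity follows;
the concat() helper is a hypothetical stand-in for building
ISD::CONCAT_VECTORS from equal-size operands (plain std::vector here,
not LLVM code).

    #include <cassert>
    #include <vector>

    // Hypothetical stand-in for ISD::CONCAT_VECTORS: append equal-length
    // parts in operand order.
    static std::vector<int> concat(const std::vector<std::vector<int>> &parts) {
      std::vector<int> out;
      for (const auto &p : parts)
        out.insert(out.end(), p.begin(), p.end());
      return out;
    }

    int main() {
      std::vector<int> a{0, 1}, b{2, 3}, c{4, 5}, d{6, 7};
      // concat_vectors(a,b,c,d) as a single node...
      std::vector<int> whole = concat({a, b, c, d});
      // ...equals the Lo/Hi pair of half-size concat_vectors nodes that
      // splitVector now forms: concat(concat(a,b), concat(c,d)).
      std::vector<int> lo = concat({a, b});
      std::vector<int> hi = concat({c, d});
      assert(whole == concat({lo, hi}));
      return 0;
    }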
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 12 + .../vector-interleaved-load-i8-stride-3.ll | 40 +- .../vector-interleaved-store-i16-stride-3.ll | 64 +- .../vector-interleaved-store-i16-stride-4.ll | 8 +- .../vector-interleaved-store-i16-stride-5.ll | 164 ++-- .../vector-interleaved-store-i16-stride-6.ll | 90 ++- .../vector-interleaved-store-i16-stride-7.ll | 168 ++--- .../vector-interleaved-store-i16-stride-8.ll | 352 ++++----- .../vector-interleaved-store-i8-stride-4.ll | 36 +- .../vector-interleaved-store-i8-stride-5.ll | 16 +- .../vector-interleaved-store-i8-stride-6.ll | 34 +- .../vector-interleaved-store-i8-stride-7.ll | 708 +++++++++--------- .../vector-interleaved-store-i8-stride-8.ll | 494 ++++++------ 13 files changed, 1088 insertions(+), 1098 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5fff78f7a173a..b1745e5a30d7b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4326,6 +4326,18 @@ static std::pair splitVector(SDValue Op, SelectionDAG &DAG, assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 && "Can't split odd sized vector"); + if (Op.getOpcode() == ISD::CONCAT_VECTORS) { + assert((Op.getNumOperands() % 2) == 0 && + "Can't split odd sized vector concat"); + unsigned HalfOps = Op.getNumOperands() / 2; + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + SmallVector LoOps(Op->op_begin(), Op->op_begin() + HalfOps); + SmallVector HiOps(Op->op_begin() + HalfOps, Op->op_end()); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, LoOps); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, HiOps); + return std::make_pair(Lo, Hi); + } + // If this is a splat value (with no-undefs) then use the lower subvector, // which should be a free extraction. 
SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2); diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index e4ddf5bc3a8af..d1d7cb0a34332 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -2410,19 +2410,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 -; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 -; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] @@ -2457,19 +2457,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 -; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 -; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = 
ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] @@ -2504,19 +2504,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 -; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 -; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] @@ -2551,19 +2551,19 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13] ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26] ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 
741f4b80a5ecb..6d1ba933b9082 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -552,23 +552,21 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-LABEL: store_i16_stride3_vf8: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,1,1,0,2] -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6] -; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ~mem) -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10] +; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -599,23 +597,21 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-LABEL: store_i16_stride3_vf8: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = 
[0,0,0,0,1,1,0,2] -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5],zero,zero,ymm0[2,3,6,7],zero,zero,ymm0[8,9,12,13],zero,zero,ymm0[18,19,22,23],zero,zero,ymm0[24,25,28,29],zero,zero,ymm0[26,27] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ~mem) -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,8,1,9,1,9,2,10] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,4,5],zero,zero,ymm3[2,3,6,7],zero,zero,ymm3[8,9,12,13],zero,zero,ymm3[18,19,22,23],zero,zero,ymm3[24,25,28,29],zero,zero,ymm3[26,27] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,1,0,2] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ~mem) | ymm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index 2f6452467a420..fc4377a08d560 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -513,11 +513,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm4 @@ -536,11 +536,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; 
AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 @@ -559,11 +559,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm4 @@ -582,11 +582,11 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [151519488,185205506,218891524,252577542] ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index e4fa594f3dd72..322d606538c54 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -906,28 +906,28 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm4 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,1,2,3,5,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7],ymm8[8,9],ymm7[10,11],ymm8[12,13,14],ymm7[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpandn %ymm7, %ymm8, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u] +; 
AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[1,1,2,3,5,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6],ymm6[7],ymm7[8,9],ymm6[10,11],ymm7[12,13,14],ymm6[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vpandn %ymm6, %ymm7, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,u,u],zero,zero,zero,zero,ymm5[2,3,18,19,u,u],zero,zero,zero,zero,ymm5[28,29,20,21,u,u],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,2,4,6,7,6] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u,28,29,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] ; AVX512-NEXT: vpand %ymm7, %ymm8, %ymm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,ymm5[u,u,2,3,10,11],zero,zero,zero,zero,ymm5[u,u,20,21,28,29],zero,zero,zero,zero,ymm5[u,u,22,23] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512-NEXT: vporq %zmm6, %zmm5, %zmm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,ymm6[u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,22,23] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 @@ -947,35 +947,35 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-FCP-LABEL: store_i16_stride5_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u],zero,zero,zero,zero,ymm7[2,3,18,19,u,u],zero,zero,zero,zero,ymm7[28,29,20,21,u,u],zero,zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,0,0] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,u,u],zero,zero,zero,zero,ymm6[8,9,12,13,u,u],zero,zero,zero,zero,ymm6[18,19,22,23,u,u],zero,zero,zero,zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,2,10,2,10,0,0] +; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm0, 
%ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[2,3,6,7,u,u],zero,zero,zero,zero,ymm5[8,9,12,13,u,u],zero,zero,zero,zero,ymm5[18,19,22,23,u,u],zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,ymm5[u,u,4,5,8,9],zero,zero,zero,zero,ymm5[u,u,18,19,22,23],zero,zero,zero,zero,ymm5[u,u,24,25,28,29] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512-FCP-NEXT: vporq %zmm6, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3],zero,zero,zero,zero,ymm6[u,u,4,5,8,9],zero,zero,zero,zero,ymm6[u,u,18,19,22,23],zero,zero,zero,zero,ymm6[u,u,24,25,28,29] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) -; AVX512-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) @@ -990,28 +990,28 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[1,1,2,3,5,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7],ymm8[8,9],ymm7[10,11],ymm8[12,13,14],ymm7[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = 
[65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpandn %ymm7, %ymm8, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,2,4,6,7,6] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[1,1,2,3,5,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6],ymm6[7],ymm7[8,9],ymm6[10,11],ymm7[12,13,14],ymm6[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vpandn %ymm6, %ymm7, %ymm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,u,u],zero,zero,zero,zero,ymm5[2,3,18,19,u,u],zero,zero,zero,zero,ymm5[28,29,20,21,u,u],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,2,4,6,7,6] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[6,7,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,26,27,u,u,u,u,u,u,u,u,28,29,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] ; AVX512DQ-NEXT: vpand %ymm7, %ymm8, %ymm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,8,9],zero,zero,zero,zero,ymm5[u,u,2,3,10,11],zero,zero,zero,zero,ymm5[u,u,20,21,28,29],zero,zero,zero,zero,ymm5[u,u,22,23] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-NEXT: vporq %zmm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,ymm6[u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,22,23] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 @@ -1031,35 +1031,35 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 
(%rcx), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u],zero,zero,zero,zero,ymm7[2,3,18,19,u,u],zero,zero,zero,zero,ymm7[28,29,20,21,u,u],zero,zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,u,u],zero,zero,zero,zero,ymm6[8,9,12,13,u,u],zero,zero,zero,zero,ymm6[18,19,22,23,u,u],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,2,10,2,10,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[2,3,6,7,u,u],zero,zero,zero,zero,ymm5[8,9,12,13,u,u],zero,zero,zero,zero,ymm5[18,19,22,23,u,u],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,ymm5[u,u,4,5,8,9],zero,zero,zero,zero,ymm5[u,u,18,19,22,23],zero,zero,zero,zero,ymm5[u,u,24,25,28,29] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3],zero,zero,zero,zero,ymm6[u,u,4,5,8,9],zero,zero,zero,zero,ymm6[u,u,18,19,22,23],zero,zero,zero,zero,ymm6[u,u,24,25,28,29] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: 
vpbroadcastd 12(%r8), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 33c57f2edf06e..25bad7578c111 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -491,25 +491,24 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] +; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 +; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX512-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-NEXT: vzeroupper @@ -527,14 +526,14 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8,10,1,3,8,10,1,3] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm7, 
%ymm5, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8,10,1,3,8,10,1,3] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] ; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 @@ -560,25 +559,24 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3,4,5],ymm3[6],ymm5[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastq %xmm4, %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rax) ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-NEXT: vzeroupper @@ -596,14 +594,14 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-FCP-NEXT: vpunpcklqdq 
{{.*#+}} xmm4 = xmm3[0],xmm2[0] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8,10,1,3,8,10,1,3] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5],ymm6[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8,10,1,3,8,10,1,3] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3 @@ -991,11 +989,11 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -1033,13 +1031,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -1083,11 +1081,11 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,1,3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,6,7,14,15,2,3,10,11,16,17,24,25,16,17,24,25,24,25,26,27,18,19,26,27] ; 
AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] @@ -1125,13 +1123,13 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5] ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,6,7,14,15,10,11,14,15,2,3,10,11,16,17,24,25,16,17,20,21,24,25,26,27,18,19,26,27] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index e4e013446f7a5..5aa7c055d408e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -1353,25 +1353,25 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-NEXT: vmovdqa (%r9), %xmm6 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11,12,13],ymm11[14],ymm10[15] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,2,0] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,8,9],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,28,29,20,21] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512-NEXT: vporq %zmm9, %zmm10, %zmm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512-NEXT: vinserti128 $1, %xmm1, 
%ymm0, %ymm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11,12,13],ymm10[14],ymm9[15] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,2,0] +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512-NEXT: vporq %zmm7, %zmm9, %zmm7 +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 @@ -1379,13 +1379,13 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] ; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 @@ -1416,22 +1416,22 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpermq 
{{.*#+}} ymm10 = ymm6[0,2,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,2,0] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,28,29,20,21] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,5,0,0,5,2,6,0] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512-FCP-NEXT: vporq %zmm7, %zmm9, %zmm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 @@ -1439,13 +1439,13 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u],zero,zero,zero,zero,ymm7[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[20,21,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,u,u,u],zero,zero,zero,zero,ymm6[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[20,21,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 @@ -1476,25 +1476,25 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm4 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11,12,13],ymm11[14],ymm10[15] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,2,0] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,8,9],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vporq %zmm9, %zmm10, %zmm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm9[0],ymm10[1],ymm9[2,3,4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11,12,13],ymm10[14],ymm9[15] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,2,0] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512DQ-NEXT: vporq %zmm7, %zmm9, %zmm7 +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 @@ -1502,13 +1502,13 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] ; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 @@ -1539,22 +1539,22 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} 
ymm9 = ymm8[0,2,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[0,2,2,0] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm9[u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,5,0,0,5,2,6,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm6[0,2,2,0] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,5,0,0,5,2,6,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm11[20,21,24,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vporq %zmm9, %zmm10, %zmm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm9 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 @@ -1562,13 +1562,13 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u],zero,zero,zero,zero,ymm7[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[20,21,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,3,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u],zero,zero,zero,zero,ymm6[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[20,21,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (mem & ~ymm8) | ymm7 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index e03bd7d8e5378..0eaf71552bcbc 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -588,31 +588,31 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm5 +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[3,1,2,3,7,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,1,2,3,7,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[3,1,2,3,7,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,4] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] +; 
AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) @@ -675,31 +675,31 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,4,6,8,9,10,11,13,15,12,14] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[3,1,2,3,7,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[3,1,2,3,7,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[3,1,2,3,7,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3],ymm1[4],ymm6[5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[2,0,3,1,4,5,6,7,10,8,11,9,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,4,8,9,10,11,15,13,14,12] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,1,2,0,4,5,6,4] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,7,5,8,9,10,11,14,12,15,13] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,0,2,4,5,6,7,9,11,8,10,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) @@ -1209,62 +1209,62 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-NEXT: vmovdqa (%r11), %xmm3 -; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] -; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-NEXT: vmovdqa (%r11), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] +; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm6 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542] ; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0] ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] -; 
AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0] ; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] +; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm6 -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] +; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm8 +; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] +; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7] +; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1273,62 +1273,62 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
-; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6
+; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542]
 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
+; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2]
 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0]
 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506]
 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0]
 ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7]
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm6
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm8
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -1337,62 +1337,62 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3
-; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
-; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15]
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm6
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT: vmovdqa (%r11), %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15]
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm6
+; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542]
 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
+; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2]
 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0]
 ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506]
 ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9
 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0]
 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3]
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm5
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7]
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm2, %ymm6
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm6
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm8
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -1401,62 +1401,62 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,218891524,0,252577542]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2]
 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218891524,0,252577542,0]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [0,151519488,0,185205506]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm15 = [151519488,0,185205506,0]
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7]
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm6
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm8
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll
index a32caab520ca5..e74521d5463a4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll
@@ -669,11 +669,10 @@ define void @store_i8_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512-NEXT: vmovdqa %ymm0, 32(%r8)
-; AVX512-NEXT: vmovdqa %ymm1, (%r8)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovdqa %xmm4, 32(%r8)
+; AVX512-NEXT: vmovdqa %xmm0, 48(%r8)
+; AVX512-NEXT: vmovdqa %xmm1, 16(%r8)
+; AVX512-NEXT: vmovdqa %xmm3, (%r8)
 ; AVX512-NEXT: retq
 ;
 ; AVX512-FCP-LABEL: store_i8_stride4_vf16:
@@ -690,11 +689,10 @@ define void @store_i8_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm0, 32(%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r8)
-; AVX512-FCP-NEXT: vzeroupper
+; AVX512-FCP-NEXT: vmovdqa %xmm4, 32(%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm0, 48(%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 16(%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r8)
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride4_vf16:
@@ -711,11 +709,10 @@ define void @store_i8_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%r8)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%r8)
-; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: vmovdqa %xmm4, 32(%r8)
+; AVX512DQ-NEXT: vmovdqa %xmm0, 48(%r8)
+; AVX512DQ-NEXT: vmovdqa %xmm1, 16(%r8)
+; AVX512DQ-NEXT: vmovdqa %xmm3, (%r8)
 ; AVX512DQ-NEXT: retq
 ;
 ; AVX512DQ-FCP-LABEL: store_i8_stride4_vf16:
@@ -732,11 +729,10 @@ define void @store_i8_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 32(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r8)
-; AVX512DQ-FCP-NEXT: vzeroupper
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 32(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 48(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 16(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r8)
 ; AVX512DQ-FCP-NEXT: retq
 ;
 ; AVX512BW-LABEL: store_i8_stride4_vf16:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 39f8a93a7b77a..d25f8cf6b0bca 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -1306,13 +1306,13 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-NEXT: vmovdqa (%r8), %xmm0
-; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u]
 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u]
 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3)
+; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1345,10 +1345,10 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero
@@ -1381,13 +1381,13 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,7],zero,ymm1[u,u,u,8],zero,ymm1[u,u,u,9],zero,ymm1[u,u,u],zero,ymm1[26,u,u,u],zero,ymm1[27,u,u,u],zero,ymm1[28,u,u]
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u],zero,ymm4[7,u,u,u],zero,ymm4[8,u,u,u],zero,ymm4[9,u,u,u,26],zero,ymm4[u,u,u,27],zero,ymm4[u,u,u,28],zero,ymm4[u,u]
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ~ymm5 & (ymm4 | ymm3)
+; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1420,10 +1420,10 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,0,2]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u],zero,zero,ymm3[1,9,u],zero,zero,ymm3[2,10,u],zero,zero,ymm3[19,27,u],zero,zero,ymm3[20,28,u],zero,zero,ymm3[21,29,u],zero,zero
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,2,6,2,6,3,7]
 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm4
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,3,7],zero,zero,ymm4[u,8,12],zero,zero,ymm4[u,9,13],zero,zero,ymm4[u,18,22],zero,zero,ymm4[u,19,23],zero,zero,ymm4[u,24,28],zero,zero
@@ -1456,12 +1456,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
-; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero
 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
 ; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
@@ -1496,8 +1496,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,4,5,0,1,17,21,18,22,22,18,19,23]
 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
@@ -1526,12 +1526,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm0[7],zero,zero,zero,zero,ymm0[8],zero,zero,zero,zero,ymm0[9],zero,zero,zero,zero,zero,ymm0[26],zero,zero,zero,zero,ymm0[27],zero,zero,zero,zero,ymm0[28],zero,zero
 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero
 ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28]
 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1]
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero
@@ -1566,8 +1566,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,4,5,0,1,17,21,18,22,22,18,19,23]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zmm3[1,9],zero,zero,zero,zmm3[2,10],zero,zero,zero,zmm3[3,19],zero,zero,zero,zmm3[28,20],zero,zero,zero,zmm3[29,21],zero,zero,zero,zmm3[30,22,34,38],zero,zero,zero,zmm3[35,39],zero,zero,zero,zmm3[40,44],zero,zero,zero,zmm3[41,49],zero,zero,zero,zmm3[54,50],zero,zero,zero,zmm3[55,51],zero,zero,zero,zmm3[56,60]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 6f48e3223bd5a..6205be83f5123 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -1387,10 +1387,10 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512-NEXT: vmovdqa (%r8), %xmm2
-; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero
 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1426,10 +1426,10 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1465,10 +1465,10 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1504,10 +1504,10 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,0,2]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[0,8,u,u],zero,zero,ymm3[1,9,u,u],zero,zero,ymm3[2,10,u,u],zero,zero,ymm3[19,27,u,u],zero,zero,ymm3[20,28,u,u],zero,zero
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,1,3]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[u,u,6,14],zero,zero,ymm4[u,u,7,15],zero,zero,ymm4[u,u,16,24],zero,zero,ymm4[u,u,17,25],zero,zero,ymm4[u,u,18,26],zero,zero
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
@@ -1543,9 +1543,9 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
-; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2]
+; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3]
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zmm3[21,29,37,45],zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58]
@@ -1585,15 +1585,14 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,4,6,5,7]
-; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,0,2,8,10,9,11]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zmm4[21,29,37,45],zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [4,6,4,6,0,2,1,3]
-; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58],zero,zero
 ; AVX512BW-FCP-NEXT: vporq %zmm4, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3]
 ; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm4
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u]
@@ -1621,9 +1620,9 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2]
+; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,1,3]
 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zmm3[21,29,37,45],zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58]
@@ -1663,15 +1662,14 @@ define void @store_i8_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,4,6,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,0,2,8,10,9,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8],zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zmm4[2,10],zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[20,28],zero,zero,zero,zero,zmm4[21,29,37,45],zero,zero,zero,zero,zmm4[38,46],zero,zero,zero,zero,zmm4[39,47],zero,zero,zero,zero,zmm4[48,56],zero,zero,zero,zero,zmm4[49,57],zero,zero,zero,zero,zmm4[50,58]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [4,6,4,6,0,2,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zmm3[0,8],zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[38,46],zero,zero,zero,zero,zmm3[39,47],zero,zero,zero,zero,zmm3[48,56],zero,zero,zero,zero,zmm3[49,57],zero,zero,zero,zero,zmm3[50,58],zero,zero
 ; AVX512DQ-BW-FCP-NEXT: vporq %zmm4, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,0,2,0,2,1,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u,u,u,37,45,u,u,u,u,38,46,u,u,u,u,39,47,u,u,u,u,48,56,u,u,u,u,49,57,u,u,u,u]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index b82e663528398..6a5dbbc56d9bc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -2038,35 +2038,35 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %xmm3
-; AVX512-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512-NEXT: vmovdqa (%rdx), %xmm3
 ; AVX512-NEXT: vmovdqa (%r8), %xmm1
 ; AVX512-NEXT: vmovdqa (%r9), %xmm2
 ; AVX512-NEXT: vmovdqa (%r10), %xmm0
-; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
-; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6)
+; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u],zero,ymm6[5,u,u,u,u,u],zero,ymm6[6,u,u,u,u,u,23],zero,ymm6[u,u,u,u,u,24],zero,ymm6[u,u,u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ~ymm7 & (ymm6 | ymm5)
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28]
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm4, %ymm4
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u,u],zero
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u,u,u,u,25]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 & (ymm8 | ymm6)
 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7)
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
-; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512-NEXT: vporq %zmm6, %zmm7, %zmm6
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero
+; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,ymm6[u,u,u,u,u,5],zero,ymm6[u,u,u,u,u,6],zero,ymm6[u,u,u,u,u],zero,ymm6[23,u,u,u,u,u],zero,ymm6[24,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u]
 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm7)
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
@@ -2075,30 +2075,30 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
 ; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,0,4,4,5,4]
 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm7
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm6))
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[1,3,3,1]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm5))
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[3,1,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,3,3,1]
 ; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9,u,u,u],zero,zero,ymm7[2,10,u,u,u],zero,zero,ymm7[3,19,u,u,u],zero,zero,ymm7[28,20,u,u,u],zero,zero,ymm7[29,21,u]
-; AVX512-NEXT: vpor %ymm6, %ymm7, %ymm6
+; AVX512-NEXT: vpor %ymm5, %ymm7, %ymm5
 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm7 & ~mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9],zero,ymm6[u,u,u,u,2,10],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u,20,28],zero,ymm6[u,u,u,u,21]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5))
 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
-; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
 ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512-NEXT: vmovdqa %ymm5, 64(%rax)
+; AVX512-NEXT: vmovdqa %ymm6, 64(%rax)
 ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax)
 ; AVX512-NEXT: vmovdqa64 %zmm8, (%rax)
 ; AVX512-NEXT: vzeroupper
@@ -2113,46 +2113,48 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1
 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2
 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
 ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6]
-; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1]
-; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm10
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8],zero,ymm8[u,u,u,u,1,9],zero,ymm8[u,u,u,u,18,26],zero,ymm8[u,u,u,u,19,27],zero,ymm8[u,u,u,u]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm10 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,5,2,6,1,5,2,6]
+; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,zero,ymm7[1,5,u,u,u],zero,zero,ymm7[2,6,u,u,u],zero,zero,ymm7[19,23,u,u,u],zero,zero,ymm7[24,28,u,u,u],zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
+; AVX512-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1]
+; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10]
+; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm8 & mem)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[3,1,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u]
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
-; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5
 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1]
 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5))
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
@@ -2163,9 +2165,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
 ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm5, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm7, 64(%rax)
 ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
@@ -2173,35 +2175,35 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3
 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1
 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2
 ; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
-; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u],zero,ymm7[5,u,u,u,u,u],zero,ymm7[6,u,u,u,u,u,23],zero,ymm7[u,u,u,u,u,24],zero,ymm7[u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ~ymm8 & (ymm7 | ymm6)
+; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm3, %ymm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u],zero,ymm6[5,u,u,u,u,u],zero,ymm6[6,u,u,u,u,u,23],zero,ymm6[u,u,u,u,u,24],zero,ymm6[u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ~ymm7 & (ymm6 | ymm5)
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm4, %ymm4
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,5],zero,ymm4[u,u,u,u,u,6],zero,ymm4[u,u,u,u,u],zero,ymm4[23,u,u,u,u,u],zero,ymm4[24,u,u,u,u,u],zero
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u,u,u,u,25]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 & (ymm8 | ymm6)
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,5],zero,ymm3[u,u,u,u,u,6],zero,ymm3[u,u,u,u,u],zero,ymm3[23,u,u,u,u,u],zero,ymm3[24,u,u,u,u,u],zero
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u,u,u,u,25]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 & (ymm9 | ymm7)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512DQ-NEXT: vporq %zmm6, %zmm7, %zmm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512DQ-NEXT: vporq %zmm5, %zmm6, %zmm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,ymm6[u,u,u,u,u,5],zero,ymm6[u,u,u,u,u,6],zero,ymm6[u,u,u,u,u],zero,ymm6[23,u,u,u,u,u],zero,ymm6[24,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u]
 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm7)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
@@ -2210,30 +2212,30 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,0,4,4,5,4]
 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm7
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm6))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[1,3,3,1]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm5))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[3,1,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,3,3,1]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9,u,u,u],zero,zero,ymm7[2,10,u,u,u],zero,zero,ymm7[3,19,u,u,u],zero,zero,ymm7[28,20,u,u,u],zero,zero,ymm7[29,21,u]
-; AVX512DQ-NEXT: vpor %ymm6, %ymm7, %ymm6
+; AVX512DQ-NEXT: vpor %ymm5, %ymm7, %ymm5
 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm7 & ~mem)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9],zero,ymm6[u,u,u,u,2,10],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u,20,28],zero,ymm6[u,u,u,u,21]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5))
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
-; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,xmm1[u,u,u,u,13,12],zero,xmm1[u,u,u,u,15,14],zero
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
 ; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512DQ-NEXT: vmovdqa %ymm5, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm6, 64(%rax)
 ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax)
 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax)
 ; AVX512DQ-NEXT: vzeroupper
@@ -2248,46 +2250,48 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u,u,u],zero,zero,ymm6[1,9,u,u,u],zero,zero,ymm6[18,26,u,u,u],zero,zero,ymm6[19,27,u,u,u],zero,zero,ymm6[20,28]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,5,2,6,1,5,2,6]
-; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u],zero,zero,ymm8[1,5,u,u,u],zero,zero,ymm8[2,6,u,u,u],zero,zero,ymm8[19,23,u,u,u],zero,zero,ymm8[24,28,u,u,u],zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,ymm8[u,u,u,1,9],zero,zero,ymm8[u,u,u,2,10],zero,zero,ymm8[u,u,u,19,27],zero,zero,ymm8[u,u,u,20,28],zero,zero
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,1,5],zero,zero,ymm9[u,u,u,2,6],zero,zero,ymm9[u,u,u,19,23],zero,zero,ymm9[u,u,u,24,28],zero,zero,ymm9[u,u,u,25]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
-; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm10
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8],zero,ymm8[u,u,u,u,1,9],zero,ymm8[u,u,u,u,18,26],zero,ymm8[u,u,u,u,19,27],zero,ymm8[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,4],zero,ymm7[u,u,u,u,1,5],zero,ymm7[u,u,u,u,2,6],zero,ymm7[u,u,u,u,19,23],zero,ymm7[u,u,u,u,24,28],zero,ymm7[u]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm10 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm6))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[3,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8,u,u,u],zero,zero,ymm5[1,9,u,u,u],zero,zero,ymm5[18,26,u,u,u],zero,zero,ymm5[19,27,u,u,u],zero,zero,ymm5[20,28]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,5,2,6,1,5,2,6]
+; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u],zero,zero,ymm7[1,5,u,u,u],zero,zero,ymm7[2,6,u,u,u],zero,zero,ymm7[19,23,u,u,u],zero,zero,ymm7[24,28,u,u,u],zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,ymm7[u,u,u,1,9],zero,zero,ymm7[u,u,u,2,10],zero,zero,ymm7[u,u,u,19,27],zero,zero,ymm7[u,u,u,20,28],zero,zero
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10]
+; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm8 & mem)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[3,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u]
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5
 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6]
 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1]
 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm8 & ~mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm6))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5))
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
@@ -2298,9 +2302,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15]
 ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, 64(%rax)
 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -2308,81 +2312,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
-; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 =
zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] -; AVX512BW-NEXT: vpor %ymm5, %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] +; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6 ; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero +; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28] -; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} 
ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-NEXT: vpermw %zmm2, %zmm8, %zmm8 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k1} ; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,7,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,7,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] ; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} +; AVX512BW-NEXT: vpermq 
{{.*#+}} ymm4 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,2,3] ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-NEXT: vpor %xmm4, %xmm7, %xmm4 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm6 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm4 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %ymm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[1,3,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[3,19],zero,zero,zero,zero,zero,ymm0[28,20],zero,zero,zero,zero,zero,ymm0[29,21],zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1],zero,zero,zero,zero,zero,ymm1[10,2],zero,zero,zero,zero,zero,ymm1[11,3],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero,zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} +; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm0 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa %xmm6, 96(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2393,61 +2396,62 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm4 +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] +; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; 
AVX512BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rdx # imm = 0x4081020408102040 +; AVX512BW-FCP-NEXT: kmovq %rdx, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,zero,zero,zero,zero,xmm5[13,12],zero,zero,zero,zero,zero,xmm5[15,14],zero -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm6 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[13],zero,zero,zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,zero,zero,xmm6[15] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm5, %xmm4 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6] -; AVX512BW-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] -; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] -; AVX512BW-FCP-NEXT: vpermd %zmm3, %zmm5, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = 
zero,zero,zmm5[0,8],zero,zero,zero,zero,zero,zmm5[1,9],zero,zero,zero,zero,zero,zmm5[18,26],zero,zero,zero,zero,zero,zmm5[19,27],zero,zero,zero,zero,zero,zmm5[20,28],zero,zero,zero,zero,zero,zmm5[33,37],zero,zero,zero,zero,zero,zmm5[34,38],zero,zero,zero,zero,zero,zmm5[51,55],zero,zero,zero,zero,zero,zmm5[56,60],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] +; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[0,8],zero,zero,zero,zero,zero,zmm8[1,9],zero,zero,zero,zero,zero,zmm8[2,10],zero,zero,zero,zero,zero,zmm8[19,27],zero,zero,zero,zero,zero,zmm8[20,28],zero,zero,zero,zero,zero,zmm8[33,37],zero,zero,zero,zero,zero,zmm8[34,38],zero,zero,zero,zero,zero,zmm8[51,55],zero,zero,zero,zero,zero,zmm8[56,60],zero,zero,zero,zero,zero,zmm8[57] +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zero,zero,zmm7[0,8],zero,zero,zero,zero,zero,zmm7[1,9],zero,zero,zero,zero,zero,zmm7[18,26],zero,zero,zero,zero,zero,zmm7[19,27],zero,zero,zero,zero,zero,zmm7[20,28],zero,zero,zero,zero,zero,zmm7[33,37],zero,zero,zero,zero,zero,zmm7[34,38],zero,zero,zero,zero,zero,zmm7[51,55],zero,zero,zero,zero,zero,zmm7[56,60],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm8, %zmm7, %zmm7 ; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm5 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm2 {%k1} -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[3,1,1,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[6,14],zero,zero,zero,zero,zero,xmm0[7,15],zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6,14],zero,zero,zero,zero,zero,xmm1[7,15],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] +; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2455,81 +2459,80 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[5],zero,zero,zero,zero,zero,zero,ymm4[6],zero,zero,zero,zero,zero,ymm4[23],zero,zero,zero,zero,zero,zero,ymm4[24],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero,zero,zero,zero,ymm5[25] -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 +; 
AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] +; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6 ; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,zero,zero,ymm3[1,9],zero,zero,zero,zero,zero,ymm3[2,10],zero,zero,zero,zero,zero,ymm3[19,27],zero,zero,zero,zero,zero,ymm3[20,28],zero,zero +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28] -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[4],zero,zero,zero,zero,zero,zero,ymm2[5],zero,zero,zero,zero,zero,zero,ymm2[6],zero,zero,zero,zero,zero,zero,zero,ymm2[23],zero,zero,zero,zero,zero,zero,ymm2[24],zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[4],zero,zero,zero,zero,zero,zero,ymm5[5],zero,zero,zero,zero,zero,zero,ymm5[6],zero,zero,zero,zero,zero,ymm5[23],zero,zero,zero,zero,zero,zero,ymm5[24],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm2, %ymm5 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = 
zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm8, %zmm8 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,7,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,3,2] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,7,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] ; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[1,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[6,14],zero,zero,zero,zero,zero,xmm6[7,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[1,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,2,3] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; 
AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm6 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm4 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm3, %ymm2 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[1,3,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm2 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[3,19],zero,zero,zero,zero,zero,ymm0[28,20],zero,zero,zero,zero,zero,ymm0[29,21],zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1],zero,zero,zero,zero,zero,ymm1[10,2],zero,zero,zero,zero,zero,ymm1[11,3],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero,zero,zero,zero,ymm1[21,29],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa %xmm6, 96(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2540,61 +2543,62 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] +; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm3, %ymm2, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = 
zmm5[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rdx # imm = 0x4081020408102040 +; AVX512DQ-BW-FCP-NEXT: kmovq %rdx, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm5[6,14],zero,zero,zero,zero,zero,xmm5[7,15],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10],zero,zero,zero,zero,zero,xmm5[13,12],zero,zero,zero,zero,zero,xmm5[15,14],zero -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[13],zero,zero,zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,zero,zero,xmm6[15] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm5, %xmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,4,0,1,5,0,0,1,5,2,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u,32,36,u,u,u,u,u,33,37,u,u,u,u,u,34,38,u,u,u,u,u,51,55,u,u,u,u,u,56,60,u,u] -; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm3, %zmm5, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zmm3[2,10],zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zero,zmm3[20,28],zero,zero,zero,zero,zero,zmm3[33,37],zero,zero,zero,zero,zero,zmm3[34,38],zero,zero,zero,zero,zero,zmm3[51,55],zero,zero,zero,zero,zero,zmm3[56,60],zero,zero,zero,zero,zero,zmm3[57] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = 
zero,zero,zmm5[0,8],zero,zero,zero,zero,zero,zmm5[1,9],zero,zero,zero,zero,zero,zmm5[18,26],zero,zero,zero,zero,zero,zmm5[19,27],zero,zero,zero,zero,zero,zmm5[20,28],zero,zero,zero,zero,zero,zmm5[33,37],zero,zero,zero,zero,zero,zmm5[34,38],zero,zero,zero,zero,zero,zmm5[51,55],zero,zero,zero,zero,zero,zmm5[56,60],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,1,4,5,1,5,0,0,1,5,2,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[0,8],zero,zero,zero,zero,zero,zmm8[1,9],zero,zero,zero,zero,zero,zmm8[2,10],zero,zero,zero,zero,zero,zmm8[19,27],zero,zero,zero,zero,zero,zmm8[20,28],zero,zero,zero,zero,zero,zmm8[33,37],zero,zero,zero,zero,zero,zmm8[34,38],zero,zero,zero,zero,zero,zmm8[51,55],zero,zero,zero,zero,zero,zmm8[56,60],zero,zero,zero,zero,zero,zmm8[57] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zero,zero,zmm7[0,8],zero,zero,zero,zero,zero,zmm7[1,9],zero,zero,zero,zero,zero,zmm7[18,26],zero,zero,zero,zero,zero,zmm7[19,27],zero,zero,zero,zero,zero,zmm7[20,28],zero,zero,zero,zero,zero,zmm7[33,37],zero,zero,zero,zero,zero,zmm7[34,38],zero,zero,zero,zero,zero,zmm7[51,55],zero,zero,zero,zero,zero,zmm7[56,60],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm8, %zmm7, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] ; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-FCP-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,3,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm1[1,9],zero,zero,zero,zero,zero,ymm1[2,10],zero,zero,zero,zero,zero,ymm1[3,19],zero,zero,zero,zero,zero,ymm1[28,20],zero,zero,zero,zero,zero,ymm1[29,21],zero -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,1,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,ymm0[10,2],zero,zero,zero,zero,zero,ymm0[11,3],zero,zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,zero,ymm0[21,29],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[1,3,3,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[3,19],zero,zero,zero,zero,zero,ymm5[28,20],zero,zero,zero,zero,zero,ymm5[29,21],zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[3,1,1,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[6,14],zero,zero,zero,zero,zero,xmm0[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[6,14],zero,zero,zero,zero,zero,xmm1[7,15],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index c1f8c660ccb88..a03b03e120e88 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -989,29 +989,29 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] -; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm6 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] +; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm7 +; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] +; AVX512-NEXT: vpshufb %ymm6, 
%ymm4, %ymm4 +; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] +; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4],ymm6[5],ymm1[6],ymm6[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] ; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] -; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm2 -; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] -; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm3 -; AVX512-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpord %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vpord %zmm4, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1072,29 +1072,29 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm6 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,4,12,128,128,4,12,128,128,5,13,128,128,5,13,22,30,128,128,22,30,128,128,23,31,128,128,23,31,128,128] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm6 +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm7 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4 +; 
AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm6 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2],ymm6[3],ymm1[4],ymm6[5],ymm1[6],ymm6[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,0,8,128,128,0,8,128,128,1,9,128,128,1,9,18,26,128,128,18,26,128,128,19,27,128,128,19,27,128,128] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,12,128,128,4,12,128,128,5,13,128,128,5,13,128,128,128,128,22,30,128,128,22,30,128,128,23,31,128,128,23,31] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,8,128,128,0,8,128,128,1,9,128,128,1,9,128,128,128,128,18,26,128,128,18,26,128,128,19,27,128,128,19,27] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpord %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpord %zmm4, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1155,17 +1155,18 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18,26],zero,zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[36,44],zero,zero,zero,zero,zero,zero,zmm3[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[54,62],zero,zero,zero,zero,zero,zero,zmm3[55,63],zero,zero,zero,zero -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = 
zmm4[0,8],zero,zero,zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[18,26],zero,zero,zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[36,44],zero,zero,zero,zero,zero,zero,zmm4[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[54,62],zero,zero,zero,zero,zero,zero,zmm4[55,63],zero,zero,zero,zero +; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[36,44],zero,zero,zero,zero,zero,zero,zmm0[37,45],zero,zero,zero,zero,zmm0[54,62],zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vporq %zmm4, %zmm0, %zmm0 +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[54,62],zero,zero,zero,zero,zero,zero,zmm1[55,63] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zmm2[54,62],zero,zero,zero,zero,zero,zero,zmm2[55,63],zero,zero ; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1216,17 +1217,18 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8],zero,zero,zero,zero,zero,zero,zmm3[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[18,26],zero,zero,zero,zero,zero,zero,zmm3[19,27],zero,zero,zero,zero,zmm3[36,44],zero,zero,zero,zero,zero,zero,zmm3[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm3[54,62],zero,zero,zero,zero,zero,zero,zmm3[55,63],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8],zero,zero,zero,zero,zero,zero,zmm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[18,26],zero,zero,zero,zero,zero,zero,zmm4[19,27],zero,zero,zero,zero,zmm4[36,44],zero,zero,zero,zero,zero,zero,zmm4[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm4[54,62],zero,zero,zero,zero,zero,zero,zmm4[55,63],zero,zero,zero,zero +; 
AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[0,8],zero,zero,zero,zero,zero,zero,zmm0[1,9],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[36,44],zero,zero,zero,zero,zero,zero,zmm0[37,45],zero,zero,zero,zero,zmm0[54,62],zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm3, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vporq %zmm4, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,26],zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zero,zero,zero,zmm1[54,62],zero,zero,zero,zero,zero,zero,zmm1[55,63] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zmm2[18,26],zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zmm2[54,62],zero,zero,zero,zero,zero,zero,zmm2[55,63],zero,zero ; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1788,62 +1790,62 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-NEXT: vmovdqa (%r11), %xmm3 -; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-NEXT: vmovdqa (%r11), %xmm0 +; AVX512-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] +; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm6 +; AVX512-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] ; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] +; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-NEXT: vpermq 
{{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] +; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] ; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] -; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] ; AVX512-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 ; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] +; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm6 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm8 +; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm8 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; 
AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] +; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7],ymm3[8,9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1852,62 +1854,62 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%r11), %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6 +; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] +; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] +; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = 
[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] ; AVX512-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7],ymm3[8,9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1916,62 +1918,62 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%r11), %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%r11), %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm6 +; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] +; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512DQ-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] +; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] ; AVX512DQ-NEXT: vpmovsxdq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm2, %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm8 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7],ymm3[8,9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1980,62 +1982,62 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %r11 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r11), %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,2,0,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [0,3076,0,3333,0,3590,0,3847] ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm11 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] +; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm13 = [3076,3333,3590,3847] ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} ymm14 = [0,2048,0,2305,0,2562,0,2819] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] ; AVX512DQ-FCP-NEXT: vpmovsxdq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm15 = [2048,2305,2562,2819] ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, 
%ymm6, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6],ymm0[7],ymm6[8,9,10],ymm0[11],ymm6[12,13,14],ymm0[15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7],ymm6[8,9,10],ymm5[11],ymm6[12,13,14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7],ymm2[8,9,10],ymm3[11],ymm2[12,13,14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4],ymm0[5],ymm6[6],ymm0[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6],ymm4[7],ymm3[8,9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2048,26 +2050,24 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-NEXT: vmovdqa (%r11), %xmm3 -; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; 
AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm4 = zmm0[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] ; AVX512BW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] ; AVX512BW-NEXT: vpshufb %zmm7, %zmm6, %zmm6 ; AVX512BW-NEXT: vporq %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm2[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] ; AVX512BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 @@ -2098,46 +2098,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512BW-FCP-NEXT: vmovdqa (%r11), %xmm3 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,2,0,2,0,2,0,2] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,6,4,6,4,6,4,6] -; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm7, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = 
[128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm8, %zmm8 -; AVX512BW-FCP-NEXT: vporq %zmm5, %zmm8, %zmm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vporq %zmm5, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] +; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] -; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm7, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] -; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vpord %zmm4, %zmm7, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,7,5,7,5,7,5,7] -; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpord %zmm1, %zmm3, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ 
-2152,26 +2146,24 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa (%r11), %xmm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm4 = zmm0[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] ; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm1[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] ; AVX512DQ-BW-NEXT: vpshufb %zmm7, %zmm6, %zmm6 ; AVX512DQ-BW-NEXT: vporq %zmm4, %zmm6, %zmm4 +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm2[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] ; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 @@ -2202,46 +2194,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r11), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [0,2,0,2,0,2,0,2] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, 
%zmm5 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128,128,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [4,6,4,6,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm7, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm8, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vporq %zmm5, %zmm8, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vporq %zmm5, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm7, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vpord %zmm4, %zmm7, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,7,5,7,5,7,5,7] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpord %zmm1, %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper From da5fb4213ff210f0d49c4ec837b72997cb9e69f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Tue, 1 Apr 2025 11:03:30 +0200 Subject: [PATCH 0214/1029] [Clang][SPIR-V] Fix convergence tokens for dtor (#133469) Destructor calls were emitted without convergence intrinsics when building for SPIR-V, which means invalid IR since we mixed controlled and non-controlled convergence. --- clang/lib/CodeGen/CGDeclCXX.cpp | 5 ++- clang/test/CodeGenHLSL/GlobalDestructors.hlsl | 40 +++++++++++++------ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index a01fa157c2b26..33c048b48795c 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -1150,7 +1150,7 @@ void CodeGenFunction::GenerateCXXGlobalCleanUpFunc( llvm::Constant *Arg; std::tie(CalleeTy, Callee, Arg) = DtorsOrStermFinalizers[e - i - 1]; - llvm::CallInst *CI = nullptr; + llvm::CallBase *CI = nullptr; if (Arg == nullptr) { assert( CGM.getCXXABI().useSinitAndSterm() && @@ -1162,6 +1162,9 @@ void CodeGenFunction::GenerateCXXGlobalCleanUpFunc( // Make sure the call and the callee agree on calling convention. 
if (llvm::Function *F = dyn_cast(Callee)) CI->setCallingConv(F->getCallingConv()); + + if (CGM.shouldEmitConvergenceTokens() && CI->isConvergent()) + CI = addConvergenceControlToken(CI); } } diff --git a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl index f98318601134b..9f90971bafd05 100644 --- a/clang/test/CodeGenHLSL/GlobalDestructors.hlsl +++ b/clang/test/CodeGenHLSL/GlobalDestructors.hlsl @@ -1,5 +1,6 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,NOINLINE,CHECK -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,NOINLINE,CHECK +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-compute -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,NOINLINE-SPIRV,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=CS,NOINLINE-DXIL,CHECK +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes %s -o - | FileCheck %s --check-prefixes=LIB,NOINLINE-DXIL,CHECK // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -emit-llvm -O0 %s -o - | FileCheck %s --check-prefixes=INLINE,CHECK @@ -57,11 +58,19 @@ void main(unsigned GI : SV_GroupIndex) { // CHECK: define void @main() // CHECK-NEXT: entry: // Verify destructor is emitted -// NOINLINE-NEXT: call void @_GLOBAL__sub_I_GlobalDestructors.hlsl() -// NOINLINE-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() -// NOINLINE-NEXT: call void @_Z4mainj(i32 %0) -// NOINLINE-NEXT: call void @_GLOBAL__D_a() -// NOINLINE-NEXT: ret void +// NOINLINE-DXIL-NEXT: call void @_GLOBAL__sub_I_GlobalDestructors.hlsl() +// NOINLINE-DXIL-NEXT: %0 = call i32 @llvm.dx.flattened.thread.id.in.group() +// NOINLINE-DXIL-NEXT: call void @_Z4mainj(i32 %0) +// NOINLINE-DXIL-NEXT: call void @_GLOBAL__D_a() +// NOINLINE-DXIL-NEXT: ret void + +// NOINLINE-SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() +// NOINLINE-SPIRV-NEXT: call spir_func void @_GLOBAL__sub_I_GlobalDestructors.hlsl() [ "convergencectrl"(token %0) ] +// NOINLINE-SPIRV-NEXT: %1 = call i32 @llvm.spv.flattened.thread.id.in.group() +// NOINLINE-SPIRV-NEXT: call spir_func void @_Z4mainj(i32 %1) [ "convergencectrl"(token %0) ] +// NOINLINE-SPIRV-NEXT: call spir_func void @_GLOBAL__D_a() [ "convergencectrl"(token %0) ] +// NOINLINE-SPIRV-NEXT: ret void + // Verify inlining leaves only calls to "llvm." intrinsics // INLINE-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}} // INLINE: ret void @@ -69,10 +78,17 @@ void main(unsigned GI : SV_GroupIndex) { // This is really just a sanity check I needed for myself to verify that // function scope static variables also get destroyed properly. 
-// NOINLINE: define internal void @_GLOBAL__D_a() [[IntAttr:\#[0-9]+]] -// NOINLINE-NEXT: entry: -// NOINLINE-NEXT: call void @_ZN4TailD1Ev(ptr @_ZZ3WagvE1T) -// NOINLINE-NEXT: call void @_ZN6PupperD1Ev(ptr @GlobalPup) -// NOINLINE-NEXT: ret void +// NOINLINE-DXIL: define internal void @_GLOBAL__D_a() [[IntAttr:\#[0-9]+]] +// NOINLINE-DXIL-NEXT: entry: +// NOINLINE-DXIL-NEXT: call void @_ZN4TailD1Ev(ptr @_ZZ3WagvE1T) +// NOINLINE-DXIL-NEXT: call void @_ZN6PupperD1Ev(ptr @GlobalPup) +// NOINLINE-DXIL-NEXT: ret void + +// NOINLINE-SPIRV: define internal spir_func void @_GLOBAL__D_a() [[IntAttr:\#[0-9]+]] +// NOINLINE-SPIRV-NEXT: entry: +// NOINLINE-SPIRV-NEXT: %0 = call token @llvm.experimental.convergence.entry() +// NOINLINE-SPIRV-NEXT: call spir_func void @_ZN4TailD1Ev(ptr @_ZZ3WagvE1T) [ "convergencectrl"(token %0) ] +// NOINLINE-SPIRV-NEXT: call spir_func void @_ZN6PupperD1Ev(ptr @GlobalPup) [ "convergencectrl"(token %0) ] +// NOINLINE-SPIRV-NEXT: ret void // NOINLINE: attributes [[IntAttr]] = {{.*}} alwaysinline From 7581cb68f9fbff7a4628da594580d81a803129ee Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 1 Apr 2025 10:05:11 +0100 Subject: [PATCH 0215/1029] [lldb] Remove lldb-server min and max gdbserver port options (#133275) Since lldb 20, these have had no effect: https://releases.llvm.org/20.1.0/docs/ReleaseNotes.html#changes-to-lldb > lldb-server now listens to a single port for gdbserver connections and > provides that port to the connection handler processes. This means that > only 2 ports need to be opened in the firewall (one for the lldb-server > platform, one for gdbserver connections). In addition, due to this work, lldb-server now works on Windows in the server mode. Remove them. --- lldb/tools/lldb-server/lldb-platform.cpp | 2 -- llvm/docs/ReleaseNotes.md | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp index 51174a0f443c3..23d36ffb4cb66 100644 --- a/lldb/tools/lldb-server/lldb-platform.cpp +++ b/lldb/tools/lldb-server/lldb-platform.cpp @@ -67,8 +67,6 @@ static struct option g_long_options[] = { {"log-channels", required_argument, nullptr, 'c'}, {"listen", required_argument, nullptr, 'L'}, {"gdbserver-port", required_argument, nullptr, 'P'}, - {"min-gdbserver-port", required_argument, nullptr, 'm'}, - {"max-gdbserver-port", required_argument, nullptr, 'M'}, {"socket-file", required_argument, nullptr, 'f'}, {"server", no_argument, &g_server, 1}, {"child-platform-fd", required_argument, nullptr, 2}, diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index b989f477be6b5..58cf71b947083 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -220,6 +220,9 @@ Changes to LLDB information about the current state of the debugger at the bottom of the terminal. This is on by default and can be configured using the `show-statusline` and `statusline-format` settings. +* The `min-gdbserver-port` and `max-gdbserver-port` options have been removed + from `lldb-server`'s platform mode. Since the changes to `lldb-server`'s port + handling in LLDB 20, these options have had no effect. ### Changes to lldb-dap From dca7e0370e9684c00d95fb810c4efd31af0a3a9f Mon Sep 17 00:00:00 2001 From: dlav-sc Date: Tue, 1 Apr 2025 12:07:44 +0300 Subject: [PATCH 0216/1029] [lldb] add --platform-available-ports option to the dotest.py (#112555) This patch adds --platform-available-ports option to the dotest.py script to avoid hardcoded gdb ports in lldb testsuite. 
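For illustration, a minimal sketch of the selection behaviour this option enables (the ports below are arbitrary example values, and available_ports stands in for the list normally returned by getPlatformAvailablePorts()):

    import random

    # Hypothetical stand-in for the value parsed from a command line such as:
    #   dotest.py --platform-available-ports 16000 16001 16002
    available_ports = [16000, 16001, 16002]

    def get_next_port():
        # Prefer a user-supplied port when the option was given; otherwise
        # fall back to the old hardcoded randomized range (12000-19999).
        if available_ports:
            return random.choice(available_ports)
        return 12000 + random.randint(0, 7999)

    print(get_next_port())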
Currently, this option could be helpful in GdbRemoteTestCases (e.g. TestLldbGdbServer, TestNonStop, TestGdbRemoteThreadsInStopReply, TestGdbRemotePlatformFile, TestGdbRemote_vCont) --- lldb/packages/Python/lldbsuite/test/configuration.py | 1 + lldb/packages/Python/lldbsuite/test/dotest.py | 2 ++ lldb/packages/Python/lldbsuite/test/dotest_args.py | 8 ++++++++ lldb/packages/Python/lldbsuite/test/lldbtest.py | 4 ++++ .../test/tools/lldb-server/gdbremote_testcase.py | 2 ++ 5 files changed, 17 insertions(+) diff --git a/lldb/packages/Python/lldbsuite/test/configuration.py b/lldb/packages/Python/lldbsuite/test/configuration.py index bcc179346836d..18c1566176331 100644 --- a/lldb/packages/Python/lldbsuite/test/configuration.py +++ b/lldb/packages/Python/lldbsuite/test/configuration.py @@ -103,6 +103,7 @@ lldb_platform_name = None lldb_platform_url = None lldb_platform_working_dir = None +lldb_platform_available_ports = None # Apple SDK apple_sdk = None diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py index 681ea1638f2d6..7cc8f2985043e 100644 --- a/lldb/packages/Python/lldbsuite/test/dotest.py +++ b/lldb/packages/Python/lldbsuite/test/dotest.py @@ -419,6 +419,8 @@ def parseOptionsAndInitTestdirs(): configuration.lldb_platform_url = args.lldb_platform_url if args.lldb_platform_working_dir: configuration.lldb_platform_working_dir = args.lldb_platform_working_dir + if args.lldb_platform_available_ports: + configuration.lldb_platform_available_ports = args.lldb_platform_available_ports if platform_system == "Darwin" and args.apple_sdk: configuration.apple_sdk = args.apple_sdk if args.test_build_dir: diff --git a/lldb/packages/Python/lldbsuite/test/dotest_args.py b/lldb/packages/Python/lldbsuite/test/dotest_args.py index a80428ebec589..98210b7102e1b 100644 --- a/lldb/packages/Python/lldbsuite/test/dotest_args.py +++ b/lldb/packages/Python/lldbsuite/test/dotest_args.py @@ -292,6 +292,14 @@ def create_parser(): metavar="platform-working-dir", help="The directory to use on the remote platform.", ) + group.add_argument( + "--platform-available-ports", + dest="lldb_platform_available_ports", + nargs="*", + type=int, + metavar="platform-available-ports", + help="Ports available for connection to a lldb server on the remote platform", + ) # Test-suite behaviour group = parser.add_argument_group("Runtime behaviour options") diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index a055314673d18..db15a1d851677 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -749,6 +749,10 @@ def getSourcePath(self, name): """Return absolute path to a file in the test's source directory.""" return os.path.join(self.getSourceDir(), name) + def getPlatformAvailablePorts(self): + """Return ports available for connection to a lldb server on the remote platform.""" + return configuration.lldb_platform_available_ports + @classmethod def setUpCommands(cls): commands = [ diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py index 3d3ecb9aa8f95..19c766996292e 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/gdbremote_testcase.py @@ -185,6 +185,8 @@ def setUpServerLogging(self, is_llgs): ] def get_next_port(self): + if available_ports := 
self.getPlatformAvailablePorts(): + return random.choice(available_ports) return 12000 + random.randint(0, 7999) def reset_test_sequence(self): From e17d864f55133d46e12614280951ddb2dc43cc74 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 1 Apr 2025 10:26:14 +0100 Subject: [PATCH 0217/1029] [flang][OpenMP][Lower] lower array subscripts for task depend (#132994) The OpenMP standard says that all dependencies in the same set of inter-dependent tasks must be non-overlapping. This simplification means that the OpenMP only needs to keep track of the base addresses of dependency variables. This can be seen in kmp_taskdeps.cpp, which stores task dependency information in a hash table, using the base address as a key. This patch generates a rebox operation to slice boxed arrays, but only the box data address is used for the task dependency. The extra box is optimized away by LLVM at O3. Vector subscripts are TODO (I will address in my next patch). This also fixes a bug for ordinary subscripts when the symbol was mapped to a box: Fixes #132647 --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 56 +++++++++++++++++-- flang/lib/Lower/OpenMP/ClauseProcessor.h | 3 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 27 +++++---- ...-clause-vector-subscript-array-section.f90 | 11 ++++ flang/test/Lower/OpenMP/target.f90 | 18 ++++++ .../OpenMP/task-depend-array-section.f90 | 51 +++++++++++++++++ flang/test/Lower/OpenMP/task.f90 | 19 ++++++- 7 files changed, 165 insertions(+), 20 deletions(-) create mode 100644 flang/test/Lower/OpenMP/Todo/depend-clause-vector-subscript-array-section.f90 create mode 100644 flang/test/Lower/OpenMP/task-depend-array-section.f90 diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index c66fd46767b86..12ac6b3285575 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -14,6 +14,7 @@ #include "Clauses.h" #include "Utils.h" +#include "flang/Lower/ConvertExprToHLFIR.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Parser/tools.h" #include "flang/Semantics/tools.h" @@ -808,7 +809,21 @@ bool ClauseProcessor::processCopyprivate( return hasCopyPrivate; } -bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const { +template +static bool isVectorSubscript(const evaluate::Expr &expr) { + if (std::optional dataRef{evaluate::ExtractDataRef(expr)}) + if (const auto *arrayRef = std::get_if(&dataRef->u)) + for (const evaluate::Subscript &subscript : arrayRef->subscript()) + if (std::holds_alternative( + subscript.u)) + if (subscript.Rank() > 0) + return true; + return false; +} + +bool ClauseProcessor::processDepend(lower::SymMap &symMap, + lower::StatementContext &stmtCtx, + mlir::omp::DependClauseOps &result) const { auto process = [&](const omp::clause::Depend &clause, const parser::CharBlock &) { using Depend = omp::clause::Depend; @@ -819,6 +834,7 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const { auto &taskDep = std::get(clause.u); auto depType = std::get(taskDep.t); auto &objects = std::get(taskDep.t); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); if (std::get>(taskDep.t)) { TODO(converter.getCurrentLocation(), @@ -830,18 +846,46 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const { for (const omp::Object &object : objects) { assert(object.ref() && "Expecting designator"); + mlir::Value dependVar; if (evaluate::ExtractSubstring(*object.ref())) { TODO(converter.getCurrentLocation(), "substring 
not supported for task depend"); } else if (evaluate::IsArrayElement(*object.ref())) { - TODO(converter.getCurrentLocation(), - "array sections not supported for task depend"); + // Array Section + SomeExpr expr = *object.ref(); + if (isVectorSubscript(expr)) + TODO(converter.getCurrentLocation(), + "Vector subscripted array section for task dependency"); + + hlfir::EntityWithAttributes entity = convertExprToHLFIR( + converter.getCurrentLocation(), converter, expr, symMap, stmtCtx); + dependVar = entity.getBase(); + } else { + semantics::Symbol *sym = object.sym(); + dependVar = converter.getSymbolAddress(*sym); } - semantics::Symbol *sym = object.sym(); - const mlir::Value variable = converter.getSymbolAddress(*sym); - result.dependVars.push_back(variable); + // If we pass a mutable box e.g. !fir.ref>> then + // the runtime will use the address of the box not the address of the + // data. Flang generates a lot of memcpys between different box + // allocations so this is not a reliable way to identify the dependency. + if (auto ref = mlir::dyn_cast(dependVar.getType())) + if (fir::isa_box_type(ref.getElementType())) + dependVar = builder.create( + converter.getCurrentLocation(), dependVar); + + // The openmp dialect doesn't know what to do with boxes (and it would + // break layering to teach it about them). The dependency variable can be + // a box because it was an array section or because the original symbol + // was mapped to a box. + // Getting the address of the box data is okay because all the runtime + // ultimately cares about is the base address of the array. + if (fir::isa_box_type(dependVar.getType())) + dependVar = builder.create( + converter.getCurrentLocation(), dependVar); + + result.dependVars.push_back(dependVar); } }; diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index aa203689ab560..6b1f7a31c7aac 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -104,7 +104,8 @@ class ClauseProcessor { bool processCopyin() const; bool processCopyprivate(mlir::Location currentLocation, mlir::omp::CopyprivateClauseOps &result) const; - bool processDepend(mlir::omp::DependClauseOps &result) const; + bool processDepend(lower::SymMap &symMap, lower::StatementContext &stmtCtx, + mlir::omp::DependClauseOps &result) const; bool processEnter(llvm::SmallVectorImpl &result) const; bool processIf(omp::clause::If::DirectiveNameModifier directiveName, diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 1304f06ed28cf..ab90b4609e855 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1672,15 +1672,15 @@ static void genSingleClauses(lower::AbstractConverter &converter, static void genTargetClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, lower::pft::Evaluation &eval, - const List &clauses, mlir::Location loc, - mlir::omp::TargetOperands &clauseOps, + lower::SymMap &symTable, lower::StatementContext &stmtCtx, + lower::pft::Evaluation &eval, const List &clauses, + mlir::Location loc, mlir::omp::TargetOperands &clauseOps, llvm::SmallVectorImpl &hasDeviceAddrSyms, llvm::SmallVectorImpl &isDevicePtrSyms, llvm::SmallVectorImpl &mapSyms) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processBare(clauseOps); - cp.processDepend(clauseOps); + cp.processDepend(symTable, stmtCtx, clauseOps); cp.processDevice(stmtCtx, clauseOps); cp.processHasDeviceAddr(stmtCtx, 
clauseOps, hasDeviceAddrSyms); if (!hostEvalInfo.empty()) { @@ -1731,11 +1731,12 @@ static void genTargetDataClauses( static void genTargetEnterExitUpdateDataClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, - lower::StatementContext &stmtCtx, const List &clauses, - mlir::Location loc, llvm::omp::Directive directive, + lower::SymMap &symTable, lower::StatementContext &stmtCtx, + const List &clauses, mlir::Location loc, + llvm::omp::Directive directive, mlir::omp::TargetEnterExitUpdateDataOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); - cp.processDepend(clauseOps); + cp.processDepend(symTable, stmtCtx, clauseOps); cp.processDevice(stmtCtx, clauseOps); cp.processIf(directive, clauseOps); @@ -1749,12 +1750,13 @@ static void genTargetEnterExitUpdateDataClauses( static void genTaskClauses(lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::SymMap &symTable, lower::StatementContext &stmtCtx, const List &clauses, mlir::Location loc, mlir::omp::TaskOperands &clauseOps) { ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); - cp.processDepend(clauseOps); + cp.processDepend(symTable, stmtCtx, clauseOps); cp.processFinal(stmtCtx, clauseOps); cp.processIf(llvm::omp::Directive::OMPD_task, clauseOps); cp.processMergeable(clauseOps); @@ -2197,8 +2199,8 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::omp::TargetOperands clauseOps; llvm::SmallVector mapSyms, isDevicePtrSyms, hasDeviceAddrSyms; - genTargetClauses(converter, semaCtx, stmtCtx, eval, item->clauses, loc, - clauseOps, hasDeviceAddrSyms, isDevicePtrSyms, mapSyms); + genTargetClauses(converter, semaCtx, symTable, stmtCtx, eval, item->clauses, + loc, clauseOps, hasDeviceAddrSyms, isDevicePtrSyms, mapSyms); DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/ @@ -2418,7 +2420,7 @@ static OpTy genTargetEnterExitUpdateDataOp( } mlir::omp::TargetEnterExitUpdateDataOperands clauseOps; - genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, + genTargetEnterExitUpdateDataClauses(converter, semaCtx, symTable, stmtCtx, item->clauses, loc, directive, clauseOps); return firOpBuilder.create(loc, clauseOps); @@ -2431,7 +2433,8 @@ genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable, ConstructQueue::const_iterator item) { lower::StatementContext stmtCtx; mlir::omp::TaskOperands clauseOps; - genTaskClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); + genTaskClauses(converter, semaCtx, symTable, stmtCtx, item->clauses, loc, + clauseOps); if (!enableDelayedPrivatization) return genOpWithBody( diff --git a/flang/test/Lower/OpenMP/Todo/depend-clause-vector-subscript-array-section.f90 b/flang/test/Lower/OpenMP/Todo/depend-clause-vector-subscript-array-section.f90 new file mode 100644 index 0000000000000..f3bd58c8c559a --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/depend-clause-vector-subscript-array-section.f90 @@ -0,0 +1,11 @@ +! RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +! 
CHECK: Vector subscripted array section for task dependency +subroutine vectorSubscriptArraySection(array, indices) + integer :: array(:) + integer :: indices(:) + + !$omp task depend (in: array(indices)) + !$omp end task +end subroutine diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 868116a04dc53..36877210c136d 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -35,6 +35,24 @@ subroutine omp_target_enter_depend return end subroutine omp_target_enter_depend +!CHECK-LABEL: func.func @_QPomp_target_enter_depend_section() { +subroutine omp_target_enter_depend_section + !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_enter_depend_sectionEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + integer :: a(1024) + + !CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[A]]#0 ({{.*}}) shape %{{.*}} : (!fir.ref>, index, index, index, !fir.shape<1>) -> !fir.ref> + !CHECK: omp.task depend(taskdependout -> %[[DESIGNATE]] : !fir.ref>) private({{.*}}) { + !$omp task depend(out: a(1:512)) + call foo(a) + !$omp end task + !CHECK: %[[DESIGNATE2:.*]] = hlfir.designate %[[A]]#0 ({{.*}}) shape %{{.*}} : (!fir.ref>, index, index, index, !fir.shape<1>) -> !fir.ref> + !CHECK: %[[BOUNDS:.*]] = omp.map.bounds lower_bound({{.*}}) upper_bound({{.*}}) extent({{.*}}) stride({{.*}}) start_idx({{.*}}) + !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(to) capture(ByRef) bounds(%[[BOUNDS]]) -> !fir.ref> {name = "a"} + !CHECK: omp.target_enter_data depend(taskdependin -> %[[DESIGNATE2]] : !fir.ref>) map_entries(%[[MAP]] : !fir.ref>) + !$omp target enter data map(to: a) depend(in: a(1:512)) + return +end subroutine omp_target_enter_depend_section + !=============================================================================== ! Target_Enter Map types !=============================================================================== diff --git a/flang/test/Lower/OpenMP/task-depend-array-section.f90 b/flang/test/Lower/OpenMP/task-depend-array-section.f90 new file mode 100644 index 0000000000000..b364a5e06a29c --- /dev/null +++ b/flang/test/Lower/OpenMP/task-depend-array-section.f90 @@ -0,0 +1,51 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +subroutine knownShape(array) + integer :: array(10) + + !$omp task depend(in: array(2:8)) + !$omp end task +end subroutine + +! CHECK-LABEL: func.func @_QPknownshape( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref> {fir.bindc_name = "array"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index +! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_3]]) dummy_scope %[[VAL_1]] {uniq_name = "_QFknownshapeEarray"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[VAL_5:.*]] = arith.constant 2 : index +! CHECK: %[[VAL_6:.*]] = arith.constant 8 : index +! CHECK: %[[VAL_7:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_8:.*]] = arith.constant 7 : index +! CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_8]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_10:.*]] = hlfir.designate %[[VAL_4]]#0 (%[[VAL_5]]:%[[VAL_6]]:%[[VAL_7]]) shape %[[VAL_9]] : (!fir.ref>, index, index, index, !fir.shape<1>) -> !fir.ref> +! CHECK: omp.task depend(taskdependin -> %[[VAL_10]] : !fir.ref>) { +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! 
CHECK: } + + +subroutine assumedShape(array) + integer :: array(:) + + !$omp task depend(in: array(2:8:2)) + !$omp end task +end subroutine + +! CHECK-LABEL: func.func @_QPassumedshape( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box> {fir.bindc_name = "array"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFassumedshapeEarray"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +! CHECK: %[[VAL_3:.*]] = arith.constant 2 : index +! CHECK: %[[VAL_4:.*]] = arith.constant 8 : index +! CHECK: %[[VAL_5:.*]] = arith.constant 2 : index +! CHECK: %[[VAL_6:.*]] = arith.constant 4 : index +! CHECK: %[[VAL_7:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> +! CHECK: %[[VAL_8:.*]] = hlfir.designate %[[VAL_2]]#0 (%[[VAL_3]]:%[[VAL_4]]:%[[VAL_5]]) shape %[[VAL_7]] : (!fir.box>, index, index, index, !fir.shape<1>) -> !fir.box> +! CHECK: %[[VAL_9:.*]] = fir.box_addr %[[VAL_8]] : (!fir.box>) -> !fir.ref> +! CHECK: omp.task depend(taskdependin -> %[[VAL_9]] : !fir.ref>) { +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90 index 393801997aebc..67194fa5b19a3 100644 --- a/flang/test/Lower/OpenMP/task.f90 +++ b/flang/test/Lower/OpenMP/task.f90 @@ -93,7 +93,7 @@ subroutine task_depend_non_int() character(len = 15) :: x integer, allocatable :: y complex :: z - !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref>, taskdependin -> %{{.+}} : !fir.ref>>, taskdependin -> %{{.+}} : !fir.ref>) { + !CHECK: omp.task depend(taskdependin -> %{{.+}} : !fir.ref>, taskdependin -> %{{.+}} : !fir.heap, taskdependin -> %{{.+}} : !fir.ref>) { !$omp task depend(in : x, y, z) !CHECK: omp.terminator !$omp end task @@ -158,6 +158,23 @@ subroutine task_depend_multi_task() !$omp end task end subroutine task_depend_multi_task +subroutine task_depend_box(array) + integer :: array(:) + !CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %{{.*}} : (!fir.box>) -> !fir.ref> + !CHECK: omp.task depend(taskdependin -> %[[BOX_ADDR]] : !fir.ref>) + !$omp task depend(in: array) + !$omp end task +end subroutine + +subroutine task_depend_mutable_box(alloc) + integer, allocatable :: alloc + !CHECK: %[[LOAD:.*]] = fir.load %{{.*}} : !fir.ref>> + !CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[LOAD]] : (!fir.box>) -> !fir.heap + !CHECK: omp.task depend(taskdependin -> %[[BOX_ADDR]] : !fir.heap) + !$omp task depend(in: alloc) + !$omp end task +end subroutine + !=============================================================================== ! `private` clause !=============================================================================== From 66fca0674d83254c70af4a6289496b8acc4377df Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 1 Apr 2025 10:29:08 +0100 Subject: [PATCH 0218/1029] [OpenMP] Fix num_iters in __kmpc_*_loop DeviceRTL functions (#133435) This patch removes the addition of 1 to the number of iterations when calling the following DeviceRTL functions: - `__kmpc_distribute_for_static_loop*` - `__kmpc_distribute_static_loop*` - `__kmpc_for_static_loop*` Calls to these functions are currently only produced by the OMPIRBuilder from flang, which already passes the correct number of iterations to these functions. By adding 1 to the received `num_iters` variable, worksharing can produce incorrect results. This impacts flang OpenMP offloading of `do`, `distribute` and `distribute parallel do` constructs. 
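For example (hypothetical trip count): a `do i = 1, 4` loop has 4 iterations and flang's OMPIRBuilder already passes `num_iters = 4`, so the runtime's extra `+ 1` made the chunker hand out 5 iterations and execute one out-of-bounds trip.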
Expecting the application to pass `tripcount - 1` as the argument seems unexpected as well, so rather than updating flang I think it makes more sense to update the runtime. --- offload/DeviceRTL/src/Workshare.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index 861b9ca371ccd..a8759307b42bd 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -911,19 +911,19 @@ template class StaticLoopChunker { IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ TY num_threads, TY block_chunk, TY thread_chunk) { \ ompx::StaticLoopChunker::DistributeFor( \ - loc, fn, arg, num_iters + 1, num_threads, block_chunk, thread_chunk); \ + loc, fn, arg, num_iters, num_threads, block_chunk, thread_chunk); \ } \ [[gnu::flatten, clang::always_inline]] void \ __kmpc_distribute_static_loop##BW(IdentTy *loc, void (*fn)(TY, void *), \ void *arg, TY num_iters, \ TY block_chunk) { \ - ompx::StaticLoopChunker::Distribute(loc, fn, arg, num_iters + 1, \ + ompx::StaticLoopChunker::Distribute(loc, fn, arg, num_iters, \ block_chunk); \ } \ [[gnu::flatten, clang::always_inline]] void __kmpc_for_static_loop##BW( \ IdentTy *loc, void (*fn)(TY, void *), void *arg, TY num_iters, \ TY num_threads, TY thread_chunk) { \ - ompx::StaticLoopChunker::For(loc, fn, arg, num_iters + 1, num_threads, \ + ompx::StaticLoopChunker::For(loc, fn, arg, num_iters, num_threads, \ thread_chunk); \ } From 7f14b2a9eb4792155ed31da7bc16cc58cbb1b0fc Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Tue, 1 Apr 2025 17:37:02 +0800 Subject: [PATCH 0219/1029] Revert "[AMDGPU][CodeGenPrepare] Narrow 64 bit math to 32 bit if profitable" (#133880) Reverts llvm/llvm-project#130577 --- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 84 ------- .../AMDGPU/amdgpu-codegenprepare-mul24.ll | 5 +- .../atomic_optimizations_global_pointer.ll | 52 ++-- .../CodeGen/AMDGPU/narrow_math_for_and.ll | 231 ------------------ llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 9 +- 5 files changed, 34 insertions(+), 347 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index eb5c160670992..9c482aeb3ea5c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -1561,87 +1561,6 @@ void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const { llvm_unreachable("not a division"); } -Type *findSmallestLegalBits(Instruction *I, int OrigBit, int MaxBitsNeeded, - const TargetLowering *TLI, const DataLayout &DL) { - if (MaxBitsNeeded >= OrigBit) - return nullptr; - - Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded); - while (OrigBit > MaxBitsNeeded) { - if (TLI->isOperationLegalOrCustom( - TLI->InstructionOpcodeToISD(I->getOpcode()), - TLI->getValueType(DL, NewType, true))) - return NewType; - - MaxBitsNeeded *= 2; - NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded); - } - return nullptr; -} - -static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI, - const TargetTransformInfo &TTI, - const DataLayout &DL) { - unsigned Opc = I->getOpcode(); - Type *OldType = I->getType(); - - if (Opc != Instruction::Add && Opc != Instruction::Mul) - return false; - - unsigned OrigBit = OldType->getScalarSizeInBits(); - unsigned MaxBitsNeeded = OrigBit; - - switch (Opc) { - case Instruction::Add: 
- MaxBitsNeeded = KnownBits::add(computeKnownBits(I->getOperand(0), DL), - computeKnownBits(I->getOperand(1), DL)) - .countMaxActiveBits(); - break; - case Instruction::Mul: - MaxBitsNeeded = KnownBits::mul(computeKnownBits(I->getOperand(0), DL), - computeKnownBits(I->getOperand(1), DL)) - .countMaxActiveBits(); - break; - default: - llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and " - "Instruction::Mul."); - } - - MaxBitsNeeded = std::max(bit_ceil(MaxBitsNeeded), 8); - Type *NewType = findSmallestLegalBits(I, OrigBit, MaxBitsNeeded, TLI, DL); - - if (!NewType) - return false; - - // Old cost - InstructionCost OldCost = - TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput); - // New cost of new op - InstructionCost NewCost = - TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput); - // New cost of narrowing 2 operands (use trunc) - NewCost += 2 * TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType, - TTI.getCastContextHint(I), - TTI::TCK_RecipThroughput); - // New cost of zext narrowed result to original type - NewCost += - TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType, - TTI.getCastContextHint(I), TTI::TCK_RecipThroughput); - if (NewCost >= OldCost) - return false; - - IRBuilder<> Builder(I); - Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType); - Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType); - Value *Arith = - Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1); - - Value *Zext = Builder.CreateZExt(Arith, OldType); - I->replaceAllUsesWith(Zext); - I->eraseFromParent(); - return true; -} - bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; @@ -1726,9 +1645,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { } } - Changed = tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), - TM.getTargetTransformInfo(F), DL); - return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll index d7c35a8b007c6..296b817bc8f75 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll @@ -414,10 +414,7 @@ define i64 @umul24_i64_2(i64 %lhs, i64 %rhs) { ; DISABLED-LABEL: @umul24_i64_2( ; DISABLED-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 65535 ; DISABLED-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 65535 -; DISABLED-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32 -; DISABLED-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32 -; DISABLED-NEXT: [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]] -; DISABLED-NEXT: [[MUL:%.*]] = zext i32 [[TMP3]] to i64 +; DISABLED-NEXT: [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]] ; DISABLED-NEXT: ret i64 [[MUL]] ; %lhs24 = and i64 %lhs, 65535 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index e2dfcf55b7856..62083b3e67ab6 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1823,22 +1823,22 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; 
GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB3_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: s_wait_alu 0xfffe -; GFX1264-NEXT: s_mul_i32 s6, s6, 5 +; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 +; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 -; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 @@ -1860,19 +1860,20 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-LABEL: add_i64_constant: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_mov_b32 s7, exec_lo +; GFX1232-NEXT: s_mov_b32 s5, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo -; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB3_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 +; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mul_i32 s5, s5, 5 +; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 @@ -1880,7 +1881,8 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB3_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 @@ -5370,22 +5372,22 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264: ; %bb.0: ; %entry ; GFX1264-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1264-NEXT: s_mov_b64 s[6:7], exec -; GFX1264-NEXT: s_mov_b64 s[4:5], exec +; GFX1264-NEXT: s_mov_b32 s9, 0 ; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX1264-NEXT: s_mov_b64 s[4:5], exec ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 ; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1264-NEXT: s_cbranch_execz .LBB9_2 ; GFX1264-NEXT: ; %bb.1: -; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1264-NEXT: v_mov_b32_e32 v1, 0 -; GFX1264-NEXT: s_wait_alu 0xfffe -; GFX1264-NEXT: s_mul_i32 s6, s6, 5 +; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7] ; GFX1264-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5 +; GFX1264-NEXT: s_mov_b32 s10, -1 ; GFX1264-NEXT: s_wait_alu 0xfffe ; GFX1264-NEXT: v_mov_b32_e32 v0, s6 -; GFX1264-NEXT: s_mov_b32 s10, -1 +; GFX1264-NEXT: v_mov_b32_e32 v1, s7 ; 
GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: s_mov_b32 s8, s2 ; GFX1264-NEXT: s_mov_b32 s9, s3 @@ -5410,19 +5412,20 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-LABEL: sub_i64_constant: ; GFX1232: ; %bb.0: ; %entry ; GFX1232-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1232-NEXT: s_mov_b32 s7, exec_lo +; GFX1232-NEXT: s_mov_b32 s5, 0 +; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s7, 0 ; GFX1232-NEXT: s_mov_b32 s6, exec_lo -; GFX1232-NEXT: s_mov_b32 s4, exec_lo -; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s6, 0 ; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1232-NEXT: s_cbranch_execz .LBB9_2 ; GFX1232-NEXT: ; %bb.1: -; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s6 +; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s7 ; GFX1232-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-NEXT: s_mul_i32 s5, s5, 5 +; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5 ; GFX1232-NEXT: s_mov_b32 s10, -1 -; GFX1232-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: s_mov_b32 s8, s2 ; GFX1232-NEXT: s_mov_b32 s9, s3 @@ -5430,7 +5433,8 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_loadcnt 0x0 ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB9_2: -; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1232-NEXT: s_wait_alu 0xfffe +; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2 diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll deleted file mode 100644 index 3f49b1e550595..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll +++ /dev/null @@ -1,231 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 - -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s - -define i64 @narrow_add(i64 %a, i64 %b) { -; CHECK-LABEL: narrow_add: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and i64 %a, 2147483647 - %zext1 = and i64 %b, 2147483647 - %add = add i64 %zext0, %zext1 - ret i64 %add -} - -define i64 @narrow_add_1(i64 %a, i64 %b) { -; CHECK-LABEL: narrow_add_1: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and i64 %a, 2147483647 - %zext1 = and i64 %b, 2147483648 - %add = add i64 %zext0, %zext1 - ret i64 %add -} - -define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 { -; CHECK-LABEL: narrow_add_vec: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v4 -; CHECK-NEXT: v_and_b32_e32 v2, 30, v2 -; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0 -; CHECK-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_add_co_u32 v2, s0, v2, v3 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and <2 x i64> %a, - %zext1 = and <2 x i64> %b, - %add = add <2 x i64> %zext0, %zext1 - ret <2 x i64> %add -} - -define <2 x i32> @narrow_add_vec_1(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK-LABEL: narrow_add_vec_1: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0x3fff, v1 -; CHECK-NEXT: v_and_b32_e32 v0, 0x4000, v0 -; CHECK-NEXT: v_and_b32_e32 v3, 0x4001, v3 -; CHECK-NEXT: v_and_b32_e32 v2, 0x4000, v2 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; CHECK-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; CHECK-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_pk_add_u16 v1, v0, v1 -; CHECK-NEXT: v_and_b32_e32 v0, 0xc000, v1 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and <2 x i32> %a, - %zext1 = and <2 x i32> %b, - %add = add <2 x i32> %zext0, %zext1 - ret <2 x i32> %add -} - -define i64 @narrow_mul(i64 %a, i64 %b) { -; CHECK-LABEL: narrow_mul: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 2, v2 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and i64 %a, 2147483647 - %zext1 = and i64 %b, 2 - %mul = mul i64 %zext0, %zext1 - ret i64 %mul -} - -define i64 @narrow_mul_1(i64 %a, i64 %b) { -; CHECK-LABEL: narrow_mul_1: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0xf73594, v0 -; CHECK-NEXT: v_and_b32_e32 v2, 0x100, v2 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_mul_u32_u24_e32 v0, v1, v2 -; CHECK-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v2 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and i64 %a, 16201108 - %zext1 = and i64 %b, 256 - %mul = mul i64 %zext0, %zext1 - ret i64 %mul -} - -define <2 x i64> @narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 { -; CHECK-LABEL: narrow_mul_vec: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v0, 0x2d48aff, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 0x50, v4 -; CHECK-NEXT: v_and_b32_e32 v3, 50, v2 -; CHECK-NEXT: v_and_b32_e32 v4, 20, v6 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; CHECK-NEXT: v_mul_lo_u32 v0, v0, v1 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mul_u32_u24_e32 v2, v3, v4 -; CHECK-NEXT: v_mul_hi_u32_u24_e32 v3, v3, v4 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and <2 x i64> %a, - %zext1 = and <2 x i64> %b, - %mul = mul <2 x i64> %zext0, %zext1 - ret <2 x i64> %mul -} - -define <2 x i32> @narrow_add_mul_1(<2 x i32> %a, <2 x i32> %b) #0 { -; CHECK-LABEL: narrow_add_mul_1: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0x4000, v1 -; CHECK-NEXT: v_and_b32_e32 v0, 0x4000, v0 -; CHECK-NEXT: v_and_b32_e32 v2, 3, v2 -; CHECK-NEXT: v_and_b32_e32 v3, 2, v3 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; CHECK-NEXT: v_mul_u32_u24_e32 v0, v0, v2 -; CHECK-NEXT: v_mul_u32_u24_e32 v1, v1, v3 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and <2 x i32> %a, - %zext1 = and 
<2 x i32> %b, - %mul = mul <2 x i32> %zext0, %zext1 - ret <2 x i32> %mul -} - -define i64 @no_narrow_add(i64 %a, i64 %b) { -; CHECK-LABEL: no_narrow_add: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 0x80000000, v2 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and i64 %a, 2147483648 - %zext1 = and i64 %b, 2147483648 - %add = add i64 %zext0, %zext1 - ret i64 %add -} - -define i64 @no_narrow_add_1(i64 %a, i64 %b) { -; CHECK-LABEL: no_narrow_add_1: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 1, v2 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and i64 %a, 4294967295 - %zext1 = and i64 %b, 1 - %add = add i64 %zext0, %zext1 - ret i64 %add -} - -define <2 x i64> @no_narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 { -; CHECK-LABEL: no_narrow_add_vec: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 0x80000000, v4 -; CHECK-NEXT: v_and_b32_e32 v2, 30, v2 -; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_add_co_u32 v0, s0, v0, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0, s0 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; CHECK-NEXT: v_add_co_u32 v2, s0, v2, v3 -; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and <2 x i64> %a, - %zext1 = and <2 x i64> %b, - %add = add <2 x i64> %zext0, %zext1 - ret <2 x i64> %add -} - -define i64 @no_narrow_mul(i64 %a, i64 %b) { -; CHECK-LABEL: no_narrow_mul: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v0, 0x80000000, v0 -; CHECK-NEXT: v_and_b32_e32 v1, 2, v2 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) -; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and i64 %a, 2147483648 - %zext1 = and i64 %b, 2 - %mul = mul i64 %zext0, %zext1 - ret i64 %mul -} - -define <2 x i64> @no_narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 { -; CHECK-LABEL: no_narrow_mul_vec: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 0x8000, v0 -; CHECK-NEXT: v_and_b32_e32 v3, 0x20000, v4 -; CHECK-NEXT: v_and_b32_e32 v4, 50, v2 -; CHECK-NEXT: v_and_b32_e32 v5, 20, v6 -; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; CHECK-NEXT: v_mul_u32_u24_e32 v0, v1, v3 -; CHECK-NEXT: v_mul_hi_u32_u24_e32 v1, v1, v3 -; CHECK-NEXT: v_mul_u32_u24_e32 v2, v4, v5 -; CHECK-NEXT: v_mul_hi_u32_u24_e32 v3, v4, v5 -; CHECK-NEXT: s_setpc_b64 s[30:31] - %zext0 = and <2 x i64> %a, - %zext1 = and <2 x i64> %b, - %mul = mul <2 x i64> %zext0, %zext1 - ret <2 x i64> %mul -} diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 4eb7761bfbddd..4290590e99711 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ 
b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -508,16 +508,17 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; SI-LABEL: widen_i1_zext_to_i64_constant_load: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xff -; SI-NEXT: s_addk_i32 s2, 0x3e7 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_and_b32 s2, s2, 1 +; SI-NEXT: s_add_u32 s4, s2, 0x3e7 +; SI-NEXT: s_addc_u32 s5, 0, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; From 337bad3921356fba89409e03793f7d2df846c0e9 Mon Sep 17 00:00:00 2001 From: Afanasyev Ivan Date: Tue, 1 Apr 2025 17:06:30 +0700 Subject: [PATCH 0220/1029] [EarlyIfConverter] Fix reg killed twice after early-if-predicator and ifcvt (#133554) Bug relates to `early-if-predicator` and `early-ifcvt` passes. If virtual register has "killed" flag in both basic blocks to be merged into head, both instructions in head basic block will have "killed" flag for this register. It makes MIR incorrect. Example: ``` bb.0: ; if ... %0:intregs = COPY $r0 J2_jumpf %2, %bb.2, implicit-def dead $pc J2_jump %bb.1, implicit-def dead $pc bb.1: ; if.then ... S4_storeiri_io killed %0, 0, 1 J2_jump %bb.3, implicit-def dead $pc bb.2: ; if.else ... S4_storeiri_io killed %0, 0, 1 J2_jump %bb.3, implicit-def dead $pc ``` After early-if-predicator will become: ``` bb.0: %0:intregs = COPY $r0 S4_storeirif_io %1, killed %0, 0, 1 S4_storeirit_io %1, killed %0, 0, 1 ``` Having `killed` flag set twice in bb.0 for `%0` is an incorrect MIR. --- llvm/lib/CodeGen/EarlyIfConversion.cpp | 39 ++++++++++++++ .../early-ifcvt-on-double-killed-reg.mir | 54 +++++++++++++++++++ ...ly-if-predicator-reg-killed-everywhere.mir | 52 ++++++++++++++++++ 3 files changed, 145 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/early-ifcvt-on-double-killed-reg.mir create mode 100644 llvm/test/CodeGen/Hexagon/early-if-predicator-reg-killed-everywhere.mir diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 24c6dafc60459..da0987c3b50bb 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/EarlyIfConversion.h" #include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SparseSet.h" @@ -31,6 +32,7 @@ #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -163,6 +165,11 @@ class SSAIfConv { /// Insert selects and rewrite PHI operands to use them. void rewritePHIOperands(); + /// If virtual register has "killed" flag in TBB and FBB basic blocks, remove + /// the flag in TBB instruction. + void clearRepeatedKillFlagsFromTBB(MachineBasicBlock *TBB, + MachineBasicBlock *FBB); + public: /// init - Initialize per-function data structures. 
void init(MachineFunction &MF) { @@ -675,6 +682,31 @@ void SSAIfConv::rewritePHIOperands() { } } +void SSAIfConv::clearRepeatedKillFlagsFromTBB(MachineBasicBlock *TBB, + MachineBasicBlock *FBB) { + assert(TBB != FBB); + + // Collect virtual registers killed in FBB. + SmallDenseSet FBBKilledRegs; + for (MachineInstr &MI : FBB->instrs()) { + for (MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isKill() && MO.getReg().isVirtual()) + FBBKilledRegs.insert(MO.getReg()); + } + } + + if (FBBKilledRegs.empty()) + return; + + // Find the same killed registers in TBB and clear kill flags for them. + for (MachineInstr &MI : TBB->instrs()) { + for (MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isKill() && FBBKilledRegs.contains(MO.getReg())) + MO.setIsKill(false); + } + } +} + /// convertIf - Execute the if conversion after canConvertIf has determined the /// feasibility. /// @@ -690,6 +722,13 @@ void SSAIfConv::convertIf(SmallVectorImpl &RemoveBlocks, else ++NumDiamondsConv; + // If both blocks are going to be merged into Head, remove "killed" flag in + // TBB for registers, which are killed in TBB and FBB. Otherwise, register + // will be killed twice in Head after splice. Register killed twice is an + // incorrect MIR. + if (TBB != Tail && FBB != Tail) + clearRepeatedKillFlagsFromTBB(TBB, FBB); + // Move all instructions into Head, except for the terminators. if (TBB != Tail) { if (Predicate) diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-on-double-killed-reg.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-on-double-killed-reg.mir new file mode 100644 index 0000000000000..27222e46b9c10 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/early-ifcvt-on-double-killed-reg.mir @@ -0,0 +1,54 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64-- -run-pass=early-ifcvt -stress-early-ifcvt %s -o - -verify-machineinstrs | FileCheck %s + +# Test that "killed" flag on the same virtual register in merged blocks is +# removed for the first spliced block and is saved for the second one. +# Otherwise, register will be killed twice in a single block in the resulting +# MIR, which is incorrect. + +--- +name: my_func +tracksRegLiveness: true +liveins: + - { reg: '$w0', virtual-reg: '%0' } +body: | + ; CHECK-LABEL: name: my_func + ; CHECK: bb.0.entry: + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32common = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY]], 1, 0, implicit-def $nzcv + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32common = ADDWri [[COPY]], 3, 0 + ; CHECK-NEXT: [[SUBWri:%[0-9]+]]:gpr32common = SUBWri killed [[COPY]], 2, 0 + ; CHECK-NEXT: [[CSELWr:%[0-9]+]]:gpr32common = CSELWr [[ADDWri]], [[SUBWri]], 1, implicit $nzcv + ; CHECK-NEXT: $x2 = COPY [[COPY1]] + ; CHECK-NEXT: RET_ReallyLR implicit $x2 + bb.0.entry: + successors: %bb.1, %bb.2 + liveins: $w0 + + %0:gpr32common = COPY $w0 + %1:gpr64common = COPY $x0 + %2:gpr32 = SUBSWri %0, 1, 0, implicit-def $nzcv + Bcc 1, %bb.2, implicit $nzcv + B %bb.1 + + bb.1: + successors: %bb.3 + + %3:gpr32common = SUBWri killed %0, 2, 0 + B %bb.3 + + bb.2: + successors: %bb.3 + + %4:gpr32common = ADDWri killed %0, 3, 0 + B %bb.3 + + bb.3: + %5:gpr32common = PHI %3, %bb.1, %4, %bb.2 + $x2 = COPY %1 + RET_ReallyLR implicit $x2 + +... 
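For illustration (hypothetical helper, not part of this change): well-formed MIR kills a virtual register at most once along any path through a block, so after both sides are spliced into Head only one of the duplicated uses may keep its kill flag. A checker over the merged block that would flag the pre-fix output could look like this, assuming standard CodeGen headers:

```cpp
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/Register.h"

// Count how many operands in MBB carry a kill flag for Reg; any result
// above one is exactly the malformed MIR this patch prevents.
static unsigned countKillFlags(const llvm::MachineBasicBlock &MBB,
                               llvm::Register Reg) {
  unsigned NumKills = 0;
  for (const llvm::MachineInstr &MI : MBB)
    for (const llvm::MachineOperand &MO : MI.operands())
      if (MO.isReg() && MO.isKill() && MO.getReg() == Reg)
        ++NumKills;
  return NumKills; // Must be <= 1 for verifier-clean MIR.
}
```

The Hexagon test below checks the fixed behavior directly: the first predicated store uses the register without a kill flag and only the second keeps `killed`.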
diff --git a/llvm/test/CodeGen/Hexagon/early-if-predicator-reg-killed-everywhere.mir b/llvm/test/CodeGen/Hexagon/early-if-predicator-reg-killed-everywhere.mir new file mode 100644 index 0000000000000..f189e89432dec --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/early-if-predicator-reg-killed-everywhere.mir @@ -0,0 +1,52 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=hexagon -run-pass early-if-predicator %s -o - -verify-machineinstrs | FileCheck %s + +# Test that "killed" flag on the same virtual register in merged blocks is +# removed for the first spliced block and is saved for the second one. +# Otherwise, register will be killed twice in a single block in the resulting +# MIR, which is incorrect. + +--- +name: my_func +alignment: 16 +tracksRegLiveness: true +liveins: + - { reg: '$r0', virtual-reg: '%0' } + - { reg: '$r1', virtual-reg: '%1' } +body: | + ; CHECK-LABEL: name: my_func + ; CHECK: bb.0: + ; CHECK-NEXT: liveins: $r0, $r1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:intregs = COPY $r0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:intregs = COPY $r1 + ; CHECK-NEXT: [[S2_tstbit_i:%[0-9]+]]:predregs = S2_tstbit_i [[COPY1]], 0 + ; CHECK-NEXT: S4_storeirif_io [[S2_tstbit_i]], [[COPY]], 0, 2 + ; CHECK-NEXT: S4_storeirit_io [[S2_tstbit_i]], killed [[COPY]], 0, 1 + ; CHECK-NEXT: PS_jmpret $r31, implicit-def dead $pc + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $r0, $r1 + + %0:intregs = COPY $r0 + %1:intregs = COPY $r1 + %2:predregs = S2_tstbit_i %1, 0 + J2_jumpf %2, %bb.2, implicit-def dead $pc + J2_jump %bb.1, implicit-def dead $pc + + bb.1: + successors: %bb.3(0x80000000) + + S4_storeiri_io killed %0, 0, 1 + J2_jump %bb.3, implicit-def dead $pc + + bb.2: + successors: %bb.3(0x80000000) + + S4_storeiri_io killed %0, 0, 2 + J2_jump %bb.3, implicit-def dead $pc + + bb.3: + PS_jmpret $r31, implicit-def dead $pc + +... From 4a687024559d5ef10abe6ed10555c5f5c2cfcb40 Mon Sep 17 00:00:00 2001 From: Akshat Oke Date: Tue, 1 Apr 2025 15:38:49 +0530 Subject: [PATCH 0221/1029] [CodeGen][NPM] Port XRayInstrumentation to NPM (#129865) --- .../llvm/CodeGen/XRayInstrumentation.h | 25 ++++++ llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/XRayInstrumentation.cpp | 83 +++++++++++++++---- llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/test/CodeGen/X86/xray-empty-firstmbb.mir | 1 + .../X86/xray-multiplerets-in-blocks.mir | 1 + 9 files changed, 100 insertions(+), 18 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/XRayInstrumentation.h diff --git a/llvm/include/llvm/CodeGen/XRayInstrumentation.h b/llvm/include/llvm/CodeGen/XRayInstrumentation.h new file mode 100644 index 0000000000000..cc65d61627fc0 --- /dev/null +++ b/llvm/include/llvm/CodeGen/XRayInstrumentation.h @@ -0,0 +1,25 @@ +//===- llvm/CodeGen/XRayInstrumentation.h -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_XRAYINSTRUMENTATION_H +#define LLVM_CODEGEN_XRAYINSTRUMENTATION_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class XRayInstrumentationPass : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_XRAYINSTRUMENTATION_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index cc7bf245c37f5..fb27867176788 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -322,7 +322,7 @@ void initializeVirtRegRewriterPass(PassRegistry &); void initializeWasmEHPreparePass(PassRegistry &); void initializeWinEHPreparePass(PassRegistry &); void initializeWriteBitcodePassPass(PassRegistry &); -void initializeXRayInstrumentationPass(PassRegistry &); +void initializeXRayInstrumentationLegacyPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index bdb81cf77cfd1..25ca982916ff8 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -90,6 +90,7 @@ #include "llvm/CodeGen/UnreachableBlockElim.h" #include "llvm/CodeGen/WasmEHPrepare.h" #include "llvm/CodeGen/WinEHPrepare.h" +#include "llvm/CodeGen/XRayInstrumentation.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/IRPrinter/IRPrintingPasses.h" diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 73c4d34faa5a3..3e9e788662900 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -190,6 +190,7 @@ MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) MACHINE_FUNCTION_PASS("two-address-instruction", TwoAddressInstructionPass()) MACHINE_FUNCTION_PASS("verify", MachineVerifierPass()) MACHINE_FUNCTION_PASS("verify", MachineTraceMetricsVerifierPass()) +MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass()) #undef MACHINE_FUNCTION_PASS #ifndef MACHINE_FUNCTION_PASS_WITH_PARAMS @@ -315,5 +316,4 @@ DUMMY_MACHINE_FUNCTION_PASS("stack-frame-layout", StackFrameLayoutAnalysisPass) DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass) DUMMY_MACHINE_FUNCTION_PASS("unpack-mi-bundles", UnpackMachineBundlesPass) DUMMY_MACHINE_FUNCTION_PASS("virtregrewriter", VirtRegRewriterPass) -DUMMY_MACHINE_FUNCTION_PASS("xray-instrumentation", XRayInstrumentationPass) #undef DUMMY_MACHINE_FUNCTION_PASS diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 771e45ce71595..b1c26307b80fc 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -145,5 +145,5 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeVirtRegRewriterPass(Registry); initializeWasmEHPreparePass(Registry); initializeWinEHPreparePass(Registry); - initializeXRayInstrumentationPass(Registry); + initializeXRayInstrumentationLegacyPass(Registry); } diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp index 0873d9956356e..dbdb81d1e6b33 100644 --- a/llvm/lib/CodeGen/XRayInstrumentation.cpp +++ 
b/llvm/lib/CodeGen/XRayInstrumentation.cpp @@ -13,14 +13,17 @@ // //===---------------------------------------------------------------------===// +#include "llvm/CodeGen/XRayInstrumentation.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" @@ -44,11 +47,11 @@ struct InstrumentationOptions { bool HandleAllReturns; }; -struct XRayInstrumentation : public MachineFunctionPass { +struct XRayInstrumentationLegacy : public MachineFunctionPass { static char ID; - XRayInstrumentation() : MachineFunctionPass(ID) { - initializeXRayInstrumentationPass(*PassRegistry::getPassRegistry()); + XRayInstrumentationLegacy() : MachineFunctionPass(ID) { + initializeXRayInstrumentationLegacyPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -59,6 +62,27 @@ struct XRayInstrumentation : public MachineFunctionPass { } bool runOnMachineFunction(MachineFunction &MF) override; +}; + +struct XRayInstrumentation { + XRayInstrumentation(MachineDominatorTree *MDT, MachineLoopInfo *MLI) + : MDT(MDT), MLI(MLI) {} + + bool run(MachineFunction &MF); + + // Methods for use in the NPM and legacy passes, can be removed once migration + // is complete. + static bool alwaysInstrument(Function &F) { + auto InstrAttr = F.getFnAttribute("function-instrument"); + return InstrAttr.isStringAttribute() && + InstrAttr.getValueAsString() == "xray-always"; + } + + static bool needMDTAndMLIAnalyses(Function &F) { + auto IgnoreLoopsAttr = F.getFnAttribute("xray-ignore-loops"); + auto AlwaysInstrument = XRayInstrumentation::alwaysInstrument(F); + return !AlwaysInstrument && !IgnoreLoopsAttr.isValid(); + } private: // Replace the original RET instruction with the exit sled code ("patchable @@ -82,6 +106,9 @@ struct XRayInstrumentation : public MachineFunctionPass { void prependRetWithPatchableExit(MachineFunction &MF, const TargetInstrInfo *TII, InstrumentationOptions); + + MachineDominatorTree *MDT; + MachineLoopInfo *MLI; }; } // end anonymous namespace @@ -143,11 +170,42 @@ void XRayInstrumentation::prependRetWithPatchableExit( } } -bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { +PreservedAnalyses +XRayInstrumentationPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MachineDominatorTree *MDT = nullptr; + MachineLoopInfo *MLI = nullptr; + + if (XRayInstrumentation::needMDTAndMLIAnalyses(MF.getFunction())) { + MDT = MFAM.getCachedResult(MF); + MLI = MFAM.getCachedResult(MF); + } + + if (!XRayInstrumentation(MDT, MLI).run(MF)) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet(); + return PA; +} + +bool XRayInstrumentationLegacy::runOnMachineFunction(MachineFunction &MF) { + MachineDominatorTree *MDT = nullptr; + MachineLoopInfo *MLI = nullptr; + if (XRayInstrumentation::needMDTAndMLIAnalyses(MF.getFunction())) { + auto *MDTWrapper = + getAnalysisIfAvailable(); + MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; + auto *MLIWrapper = getAnalysisIfAvailable(); + MLI = MLIWrapper ? 
&MLIWrapper->getLI() : nullptr; + } + return XRayInstrumentation(MDT, MLI).run(MF); +} + +bool XRayInstrumentation::run(MachineFunction &MF) { auto &F = MF.getFunction(); auto InstrAttr = F.getFnAttribute("function-instrument"); - bool AlwaysInstrument = InstrAttr.isStringAttribute() && - InstrAttr.getValueAsString() == "xray-always"; + bool AlwaysInstrument = alwaysInstrument(F); bool NeverInstrument = InstrAttr.isStringAttribute() && InstrAttr.getValueAsString() == "xray-never"; if (NeverInstrument && !AlwaysInstrument) @@ -171,9 +229,6 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { if (!IgnoreLoops) { // Get MachineDominatorTree or compute it on the fly if it's unavailable - auto *MDTWrapper = - getAnalysisIfAvailable(); - auto *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr; MachineDominatorTree ComputedMDT; if (!MDT) { ComputedMDT.recalculate(MF); @@ -181,8 +236,6 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { } // Get MachineLoopInfo or compute it on the fly if it's unavailable - auto *MLIWrapper = getAnalysisIfAvailable(); - auto *MLI = MLIWrapper ? &MLIWrapper->getLI() : nullptr; MachineLoopInfo ComputedMLI; if (!MLI) { ComputedMLI.analyze(*MDT); @@ -272,10 +325,10 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) { return true; } -char XRayInstrumentation::ID = 0; -char &llvm::XRayInstrumentationID = XRayInstrumentation::ID; -INITIALIZE_PASS_BEGIN(XRayInstrumentation, "xray-instrumentation", +char XRayInstrumentationLegacy::ID = 0; +char &llvm::XRayInstrumentationID = XRayInstrumentationLegacy::ID; +INITIALIZE_PASS_BEGIN(XRayInstrumentationLegacy, "xray-instrumentation", "Insert XRay ops", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) -INITIALIZE_PASS_END(XRayInstrumentation, "xray-instrumentation", +INITIALIZE_PASS_END(XRayInstrumentationLegacy, "xray-instrumentation", "Insert XRay ops", false, false) diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 8646c1f49ac35..536a7fcb67b5e 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -165,6 +165,7 @@ #include "llvm/CodeGen/VirtRegMap.h" #include "llvm/CodeGen/WasmEHPrepare.h" #include "llvm/CodeGen/WinEHPrepare.h" +#include "llvm/CodeGen/XRayInstrumentation.h" #include "llvm/IR/DebugInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/PassManager.h" diff --git a/llvm/test/CodeGen/X86/xray-empty-firstmbb.mir b/llvm/test/CodeGen/X86/xray-empty-firstmbb.mir index df5dc7b28ec1a..cd8b04b96ba2b 100644 --- a/llvm/test/CodeGen/X86/xray-empty-firstmbb.mir +++ b/llvm/test/CodeGen/X86/xray-empty-firstmbb.mir @@ -1,4 +1,5 @@ # RUN: llc -run-pass=xray-instrumentation -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s +# RUN: llc -passes=xray-instrumentation -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s # # Make sure we can handle empty first basic blocks. diff --git a/llvm/test/CodeGen/X86/xray-multiplerets-in-blocks.mir b/llvm/test/CodeGen/X86/xray-multiplerets-in-blocks.mir index 60a33b95f1412..0ddd5037e4265 100644 --- a/llvm/test/CodeGen/X86/xray-multiplerets-in-blocks.mir +++ b/llvm/test/CodeGen/X86/xray-multiplerets-in-blocks.mir @@ -1,4 +1,5 @@ # RUN: llc -run-pass=xray-instrumentation -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s +# RUN: llc -passes=xray-instrumentation -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s # # Make sure we can handle multiple ret instructions in a single basic block for # XRay. 
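A note on the pattern used above: ports of legacy machine passes to the new pass manager keep one shared implementation and wrap it once per pass manager. A minimal sketch with hypothetical names (`Foo*` is illustrative, not the committed code), assuming only the two headers below:

```cpp
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePassManager.h"

using namespace llvm;

// Shared worker; both pass-manager front ends delegate here.
struct FooImpl {
  bool run(MachineFunction &MF) { return false; /* real work goes here */ }
};

// New pass manager wrapper.
class FooPass : public PassInfoMixin<FooPass> {
public:
  PreservedAnalyses run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
    if (!FooImpl().run(MF))
      return PreservedAnalyses::all();
    return getMachineFunctionPassPreservedAnalyses();
  }
};

// Legacy pass manager wrapper.
struct FooLegacy : public MachineFunctionPass {
  static char ID;
  FooLegacy() : MachineFunctionPass(ID) {}
  bool runOnMachineFunction(MachineFunction &MF) override {
    return FooImpl().run(MF);
  }
};
char FooLegacy::ID = 0;
```

Analyses are requested differently on each side, as the XRay port does for the dominator tree and loop info: `getCachedResult` through the MachineFunctionAnalysisManager on the NPM path, `getAnalysisIfAvailable` on the legacy path.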
From 518102f2592e8b1d4f74510b97cb2d0e2bb1d66a Mon Sep 17 00:00:00 2001 From: Lukacma Date: Tue, 1 Apr 2025 11:42:22 +0100 Subject: [PATCH 0222/1029] Fix test failures caused by #127043 (#133895) --- .../CodeGen/arm-bf16-convert-intrinsics.c | 471 ++--- .../arm-neon-directed-rounding-constrained.c | 26 +- clang/test/CodeGen/arm-poly-add.c | 2 +- .../test/CodeGen/arm-v8.1a-neon-intrinsics.c | 388 ++++- .../arm-v8.2a-neon-intrinsics-generic.c | 376 ++-- .../test/CodeGen/arm-v8.2a-neon-intrinsics.c | 1517 +++++++++++++---- .../test/CodeGen/arm-v8.6a-neon-intrinsics.c | 132 +- 7 files changed, 1960 insertions(+), 952 deletions(-) diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c index 93f54c70c340d..65a23dc0325c8 100644 --- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c @@ -2,19 +2,19 @@ // RUN: %clang_cc1 \ // RUN: -triple aarch64 -target-feature +neon -target-feature +bf16 \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A64 %s // RUN: %clang_cc1 \ // RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \ // RUN: -target-feature +bf16 -mfloat-abi hard \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-HARDFP %s // RUN: %clang_cc1 \ // RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \ // RUN: -target-feature +bf16 -mfloat-abi softfp \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-SOFTFP %s // REQUIRES: arm-registered-target @@ -24,51 +24,36 @@ // CHECK-A64-LABEL: @test_vcvt_f32_bf16( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A64-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 16 -// CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8 -// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 16 -// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 16 -// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8 -// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8 -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8 
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8 -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8 -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <4 x bfloat>, ptr [[A]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[A1]], ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP0]], ptr [[__P0_808_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I]], ptr [[__REINT_808_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP4]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE:%.*]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[TMP0]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP6]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP7]] // float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { return vcvt_f32_bf16(a); @@ -76,72 +61,45 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) { // CHECK-A64-LABEL: 
@test_vcvtq_low_f32_bf16( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16 // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> -// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8 -// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16 -// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16 -// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> -// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8 -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8 +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] 
= alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE:%.*]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> [[TMP4]], <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16> +// 
CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]] // float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { return vcvtq_low_f32_bf16(a); @@ -149,72 +107,45 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) { // CHECK-A64-LABEL: @test_vcvtq_high_f32_bf16( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16 // CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> -// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8 -// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8 +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16 -// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16 -// CHECK-A64-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A64-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A64-NEXT: ret <4 x float> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> -// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8 -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8 +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16> // CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32> -// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16) -// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]] +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16) +// CHECK-A32-HARDFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP4]] // // 
CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> -// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16) -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE:%.*]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> 
[[TMP2]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> [[TMP4]], <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x bfloat> [[TMP8]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP11]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP13]] // float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { return vcvtq_high_f32_bf16(a); @@ -222,37 +153,33 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { // CHECK-A64-LABEL: @test_vcvt_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> -// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat> +// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP3]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]]) // CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr 
[[RETVAL_I1]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], ptr [[RETVAL]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8 -// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP6]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[TMP2]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[TMP4]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP6]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP7]] // bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { return vcvt_bf16_f32(a); @@ -260,58 +187,44 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> -// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> -// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[TMP2]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP3]], <4 x bfloat> zeroinitializer, <8 x i32> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) -// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat> +// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> 
[[VCVTFP2BF_I]]) +// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP0]], <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I4:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE1_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> zeroinitializer, ptr [[COERCE1_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE1_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <4 x bfloat>, ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP5]], ptr [[__P1_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[__P01_I]], <4 x bfloat> [[__P12_I]], <8 x i32> -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I4]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[RETVAL_I4]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <8 x bfloat>, ptr [[COERCE4_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP7]], ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP8]], ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <8 x bfloat>, ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP9]], ptr [[RETVAL]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8 -// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP10]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast i64 0 to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> 
[[A:%.*]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP0]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP7]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[TMP8]], <4 x bfloat> [[TMP9]], <8 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP11]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP12]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP14]] // bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { return vcvtq_low_bf16_f32(a); @@ -319,83 +232,55 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> -// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> -// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> -// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <8 x i16> +// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK-A64-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x bfloat> +// CHECK-A64-NEXT: [[TMP5:%.*]] = shufflevector <8 x bfloat> [[TMP4]], <8 x bfloat> poison, <4 x i32> +// CHECK-A64-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x float> +// CHECK-A64-NEXT: [[TMP7:%.*]] = fptrunc <4 x float> [[TMP6]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP8:%.*]] = shufflevector <4 x bfloat> [[TMP5]], <4 x bfloat> [[TMP7]], <8 x i32> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP8]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: -// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) +// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// 
CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[VCVTFP2BF_I]]) // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]] // // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A32-SOFTFP-NEXT: entry: -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I11:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I12:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I8:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I3:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I4:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE5_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE6_I:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE8_I:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[INACTIVE:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: [[COERCE2:%.*]] = alloca <8 x bfloat>, align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[INACTIVE_COERCE:%.*]], ptr [[INACTIVE]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[INACTIVE1:%.*]] = load <8 x bfloat>, ptr [[INACTIVE]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[INACTIVE1]], ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) -// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP2]], ptr [[RETVAL_I8]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I8]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE2_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP5]], ptr [[__P0_I4]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I7:%.*]] = load <8 x bfloat>, ptr [[__P0_I4]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I7]], <8 x 
bfloat> [[__P01_I7]], <4 x i32> -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I3]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL_I3]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr [[COERCE4_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP4]], ptr [[COERCE5_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[COERCE5_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP7]], ptr [[COERCE6_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[COERCE6_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP8]], ptr [[__P0_I12]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P01_I16:%.*]] = load <4 x bfloat>, ptr [[__P0_I12]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP9]], ptr [[__P1_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[__P01_I16]], <4 x bfloat> [[__P12_I]], <8 x i32> -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I17]], ptr [[RETVAL_I11]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL_I11]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP10]], ptr [[COERCE8_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = load <8 x bfloat>, ptr [[COERCE8_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP11]], ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP12]], ptr [[COERCE2]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = load <8 x bfloat>, ptr [[COERCE2]], align 8 -// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP13]], ptr [[RETVAL]], align 8 -// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8 -// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP14]] +// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> +// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[VCVTFP2BF_I]]) +// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP2]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[TMP9]], <8 x bfloat> [[TMP9]], <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = bitcast <4 x bfloat> [[TMP7]] to <2 x i32> +// 
CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = bitcast <4 x bfloat> [[TMP11]] to <2 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP12]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP15:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[TMP14]], <4 x bfloat> [[TMP15]], <8 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP16]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP18:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP18]] to <8 x bfloat> +// CHECK-A32-SOFTFP-NEXT: [[TMP20:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <4 x i32> +// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP20]] // bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) { return vcvtq_high_bf16_f32(inactive, a); @@ -422,14 +307,10 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) { // CHECK-LABEL: @test_vcvtah_f32_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_I:%.*]] = alloca bfloat, align 2 -// CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast bfloat [[A:%.*]] to i16 // CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32 // CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16 -// CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4 -// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[SHL_I]] to float // CHECK-NEXT: ret float [[TMP1]] // float32_t test_vcvtah_f32_bf16(bfloat16_t a) { diff --git a/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c b/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c index 11de8ba1dab7a..836b41b9c4e55 100644 --- a/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c +++ b/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c @@ -1,45 +1,44 @@ // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s +// RUN: opt -S -passes=mem2reg,sroa | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s // RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s +// RUN: opt -S -passes=mem2reg,sroa | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \ // RUN: -ffp-exception-behavior=strict \ // RUN: -fexperimental-strict-floating-point \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,CONSTRAINED %s +// RUN: opt -S -passes=mem2reg,sroa | FileCheck -check-prefixes=COMMON,COMMONIR,CONSTRAINED %s // RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \ // RUN: -ffp-exception-behavior=strict \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,CONSTRAINED %s +// RUN: opt -S -passes=mem2reg,sroa | FileCheck 
-check-prefixes=COMMON,COMMONIR,CONSTRAINED %s // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s +// RUN: opt -S -passes=mem2reg,sroa | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s // RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s +// RUN: opt -S -passes=mem2reg,sroa | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \ // RUN: -ffp-exception-behavior=strict \ // RUN: -fexperimental-strict-floating-point \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s +// RUN: opt -S -passes=mem2reg,sroa | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s // RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \ // RUN: -ffp-exception-behavior=strict \ // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \ -// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s +// RUN: opt -S -passes=mem2reg,sroa | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s // REQUIRES: arm-registered-target,aarch64-registered-target #include // COMMON-LABEL: test_vrndi_f32 -// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) -// CONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") +// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[VRNDI_I:%.*]]) +// CONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> [[VRNDI_I:%.*]], metadata !"round.tonearest", metadata !"fpexcept.strict") // CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}} // CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}} // CHECK-ASM64: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s @@ -49,9 +48,8 @@ float32x2_t test_vrndi_f32(float32x2_t a) { } // COMMON-LABEL: test_vrndiq_f32 -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) -// CONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") +// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VRNDI_I:%.*]]) +// CONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> [[VRNDI_I:%.*]], metadata !"round.tonearest", metadata !"fpexcept.strict") // CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}} // CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}} // CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}} diff --git a/clang/test/CodeGen/arm-poly-add.c b/clang/test/CodeGen/arm-poly-add.c index 201a03a5bc8b6..d327efa0f5cb6 100644 --- a/clang/test/CodeGen/arm-poly-add.c +++ b/clang/test/CodeGen/arm-poly-add.c @@ -2,7 +2,7 @@ // RUN: %clang_cc1 -triple armv8.2a-arm-none-eabi \ 
// RUN: -target-feature +neon \ // RUN: -mfloat-abi hard \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \ +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck %s #include diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c index 555f8ccba7c3c..b053e5766b580 100644 --- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple armv8.1a-linux-gnu -target-abi apcs-gnu -target-feature +neon \ -// RUN: -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \ +// RUN: -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,sroa,dce -S \ // RUN: | FileCheck %s --check-prefix=CHECK-ARM // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \ -// RUN: -target-feature +v8.1a -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \ +// RUN: -target-feature +v8.1a -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,sroa,dce -S \ // RUN: | FileCheck %s --check-prefix=CHECK-AARCH64 // REQUIRES: arm-registered-target,aarch64-registered-target @@ -13,13 +13,29 @@ // CHECK-ARM-LABEL: @test_vqrdmlah_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLAH_S16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAH_S161_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAH_S162_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[VQRDMLAH_S16_I]], <4 x i16> [[VQRDMLAH_S161_I]], <4 x i16> [[VQRDMLAH_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLAH_S163_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S164_I]] to <4 x i16> +// CHECK-ARM-NEXT: ret <4 x i16> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S161_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S162_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[VQRDMLAH_S16_I]], <4 x i16> [[VQRDMLAH_S161_I]], <4 x i16> [[VQRDMLAH_S162_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLAH_S163_I]] to <8 x i8> 
+// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S164_I]] to <4 x i16> +// CHECK-AARCH64-NEXT: ret <4 x i16> [[TMP3]] // int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -28,13 +44,29 @@ int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM-LABEL: @test_vqrdmlah_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLAH_S32_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAH_S321_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAH_S322_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[VQRDMLAH_S32_I]], <2 x i32> [[VQRDMLAH_S321_I]], <2 x i32> [[VQRDMLAH_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLAH_S323_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S324_I]] to <2 x i32> +// CHECK-ARM-NEXT: ret <2 x i32> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S32_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S321_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S322_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[VQRDMLAH_S32_I]], <2 x i32> [[VQRDMLAH_S321_I]], <2 x i32> [[VQRDMLAH_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLAH_S323_I]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S324_I]] to <2 x i32> +// CHECK-AARCH64-NEXT: ret <2 x i32> [[TMP3]] // int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -43,13 +75,29 @@ int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM-LABEL: @test_vqrdmlahq_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-ARM-NEXT: 
[[VQRDMLAHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[VQRDMLAHQ_S16_I]], <8 x i16> [[VQRDMLAHQ_S161_I]], <8 x i16> [[VQRDMLAHQ_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLAHQ_S163_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S164_I]] to <8 x i16> +// CHECK-ARM-NEXT: ret <8 x i16> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[VQRDMLAHQ_S16_I]], <8 x i16> [[VQRDMLAHQ_S161_I]], <8 x i16> [[VQRDMLAHQ_S162_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLAHQ_S163_I]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S164_I]] to <8 x i16> +// CHECK-AARCH64-NEXT: ret <8 x i16> [[TMP3]] // int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { @@ -58,13 +106,29 @@ int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { // CHECK-ARM-LABEL: @test_vqrdmlahq_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[VQRDMLAHQ_S32_I]], <4 x i32> [[VQRDMLAHQ_S321_I]], <4 x i32> [[VQRDMLAHQ_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLAHQ_S323_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S324_I]] to <4 x i32> +// CHECK-ARM-NEXT: ret <4 x i32> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: 
[[TMP2:%.*]] = bitcast <4 x i32> [[C:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[VQRDMLAHQ_S32_I]], <4 x i32> [[VQRDMLAHQ_S321_I]], <4 x i32> [[VQRDMLAHQ_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLAHQ_S323_I]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S324_I]] to <4 x i32> +// CHECK-AARCH64-NEXT: ret <4 x i32> [[TMP3]] // int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { @@ -76,16 +140,32 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) -// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLAH_S16_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAH_S161_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAH_S162_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[VQRDMLAH_S16_I]], <4 x i16> [[VQRDMLAH_S161_I]], <4 x i16> [[VQRDMLAH_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLAH_S163_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S164_I]] to <4 x i16> +// CHECK-ARM-NEXT: ret <4 x i16> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S16_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S161_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S162_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[VQRDMLAH_S16_I]], <4 x i16> [[VQRDMLAH_S161_I]], <4 x i16> [[VQRDMLAH_S162_I]]) +// CHECK-AARCH64-NEXT: 
[[VQRDMLAH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLAH_S163_I]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S164_I]] to <4 x i16> +// CHECK-AARCH64-NEXT: ret <4 x i16> [[TMP5]] // int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -97,16 +177,32 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) -// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLAH_S32_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAH_S321_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAH_S322_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[VQRDMLAH_S32_I]], <2 x i32> [[VQRDMLAH_S321_I]], <2 x i32> [[VQRDMLAH_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLAH_S323_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S324_I]] to <2 x i32> +// CHECK-ARM-NEXT: ret <2 x i32> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S32_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S321_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S322_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[VQRDMLAH_S32_I]], <2 x i32> [[VQRDMLAH_S321_I]], <2 x i32> [[VQRDMLAH_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLAH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLAH_S323_I]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLAH_S324_I]] to <2 x i32> +// CHECK-AARCH64-NEXT: ret <2 x i32> [[TMP5]] // int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -118,16 +214,32 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = 
shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) -// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[VQRDMLAHQ_S16_I]], <8 x i16> [[VQRDMLAHQ_S161_I]], <8 x i16> [[VQRDMLAHQ_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLAHQ_S163_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S164_I]] to <8 x i16> +// CHECK-ARM-NEXT: ret <8 x i16> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[VQRDMLAHQ_S16_I]], <8 x i16> [[VQRDMLAHQ_S161_I]], <8 x i16> [[VQRDMLAHQ_S162_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLAHQ_S163_I]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S164_I]] to <8 x i16> +// CHECK-AARCH64-NEXT: ret <8 x i16> [[TMP5]] // int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { @@ -139,16 +251,32 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) -// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-ARM-NEXT: 
[[VQRDMLAHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[VQRDMLAHQ_S32_I]], <4 x i32> [[VQRDMLAHQ_S321_I]], <4 x i32> [[VQRDMLAHQ_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLAHQ_S323_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S324_I]] to <4 x i32> +// CHECK-ARM-NEXT: ret <4 x i32> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[VQRDMLAHQ_S32_I]], <4 x i32> [[VQRDMLAHQ_S321_I]], <4 x i32> [[VQRDMLAHQ_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLAHQ_S323_I]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLAHQ_S324_I]] to <4 x i32> +// CHECK-AARCH64-NEXT: ret <4 x i32> [[TMP5]] // int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { @@ -157,13 +285,29 @@ int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { // CHECK-ARM-LABEL: @test_vqrdmlsh_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSH_S16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSH_S161_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSH_S162_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[VQRDMLSH_S16_I]], <4 x i16> [[VQRDMLSH_S161_I]], <4 x i16> [[VQRDMLSH_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLSH_S163_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S164_I]] to <4 x i16> +// CHECK-ARM-NEXT: ret <4 x i16> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16( // 
CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S161_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S162_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[VQRDMLSH_S16_I]], <4 x i16> [[VQRDMLSH_S161_I]], <4 x i16> [[VQRDMLSH_S162_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLSH_S163_I]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S164_I]] to <4 x i16> +// CHECK-AARCH64-NEXT: ret <4 x i16> [[TMP3]] // int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -172,13 +316,29 @@ int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM-LABEL: @test_vqrdmlsh_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSH_S32_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSH_S321_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSH_S322_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[VQRDMLSH_S32_I]], <2 x i32> [[VQRDMLSH_S321_I]], <2 x i32> [[VQRDMLSH_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLSH_S323_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S324_I]] to <2 x i32> +// CHECK-ARM-NEXT: ret <2 x i32> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S32_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S321_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S322_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[VQRDMLSH_S32_I]], <2 x i32> [[VQRDMLSH_S321_I]], <2 x i32> [[VQRDMLSH_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S324_I:%.*]] = bitcast <2 x i32> 
[[VQRDMLSH_S323_I]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S324_I]] to <2 x i32> +// CHECK-AARCH64-NEXT: ret <2 x i32> [[TMP3]] // int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -187,13 +347,29 @@ int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM-LABEL: @test_vqrdmlshq_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[VQRDMLSHQ_S16_I]], <8 x i16> [[VQRDMLSHQ_S161_I]], <8 x i16> [[VQRDMLSHQ_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLSHQ_S163_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S164_I]] to <8 x i16> +// CHECK-ARM-NEXT: ret <8 x i16> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[VQRDMLSHQ_S16_I]], <8 x i16> [[VQRDMLSHQ_S161_I]], <8 x i16> [[VQRDMLSHQ_S162_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLSHQ_S163_I]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S164_I]] to <8 x i16> +// CHECK-AARCH64-NEXT: ret <8 x i16> [[TMP3]] // int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { @@ -202,13 +378,29 @@ int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { // CHECK-ARM-LABEL: @test_vqrdmlshq_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S321_I:%.*]] = 
bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[VQRDMLSHQ_S32_I]], <4 x i32> [[VQRDMLSHQ_S321_I]], <4 x i32> [[VQRDMLSHQ_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLSHQ_S323_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S324_I]] to <4 x i32> +// CHECK-ARM-NEXT: ret <4 x i32> [[TMP3]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[VQRDMLSHQ_S32_I]], <4 x i32> [[VQRDMLSHQ_S321_I]], <4 x i32> [[VQRDMLSHQ_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLSHQ_S323_I]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S324_I]] to <4 x i32> +// CHECK-AARCH64-NEXT: ret <4 x i32> [[TMP3]] // int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { @@ -220,16 +412,32 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) -// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSH_S16_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSH_S161_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSH_S162_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[VQRDMLSH_S16_I]], <4 x i16> [[VQRDMLSH_S161_I]], <4 x i16> [[VQRDMLSH_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLSH_S163_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S164_I]] to <4 x i16> +// CHECK-ARM-NEXT: ret <4 x i16> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: 
[[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S16_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S161_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S162_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[VQRDMLSH_S16_I]], <4 x i16> [[VQRDMLSH_S161_I]], <4 x i16> [[VQRDMLSH_S162_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S164_I:%.*]] = bitcast <4 x i16> [[VQRDMLSH_S163_I]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S164_I]] to <4 x i16> +// CHECK-AARCH64-NEXT: ret <4 x i16> [[TMP5]] // int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -241,16 +449,32 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) -// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSH_S32_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSH_S321_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSH_S322_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[VQRDMLSH_S32_I]], <2 x i32> [[VQRDMLSH_S321_I]], <2 x i32> [[VQRDMLSH_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLSH_S323_I]] to <8 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S324_I]] to <2 x i32> +// CHECK-ARM-NEXT: ret <2 x i32> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-AARCH64-NEXT: 
[[VQRDMLSH_S32_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S321_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S322_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[VQRDMLSH_S32_I]], <2 x i32> [[VQRDMLSH_S321_I]], <2 x i32> [[VQRDMLSH_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S324_I:%.*]] = bitcast <2 x i32> [[VQRDMLSH_S323_I]] to <8 x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VQRDMLSH_S324_I]] to <2 x i32> +// CHECK-AARCH64-NEXT: ret <2 x i32> [[TMP5]] // int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -262,16 +486,32 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) -// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[VQRDMLSHQ_S16_I]], <8 x i16> [[VQRDMLSHQ_S161_I]], <8 x i16> [[VQRDMLSHQ_S162_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLSHQ_S163_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S164_I]] to <8 x i16> +// CHECK-ARM-NEXT: ret <8 x i16> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S16_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S161_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S162_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[VQRDMLSHQ_S16_I]], <8 x i16> [[VQRDMLSHQ_S161_I]], <8 x i16> [[VQRDMLSHQ_S162_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S164_I:%.*]] = bitcast <8 x i16> [[VQRDMLSHQ_S163_I]] to <16 
x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S164_I]] to <8 x i16> +// CHECK-AARCH64-NEXT: ret <8 x i16> [[TMP5]] // int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { @@ -283,16 +523,32 @@ int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { // CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) -// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] +// CHECK-ARM-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[VQRDMLSHQ_S32_I]], <4 x i32> [[VQRDMLSHQ_S321_I]], <4 x i32> [[VQRDMLSHQ_S322_I]]) +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLSHQ_S323_I]] to <16 x i8> +// CHECK-ARM-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S324_I]] to <4 x i32> +// CHECK-ARM-NEXT: ret <4 x i32> [[TMP5]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32( // CHECK-AARCH64-NEXT: entry: // CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> // CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> // CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S32_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S321_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S322_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[VQRDMLSHQ_S32_I]], <4 x i32> [[VQRDMLSHQ_S321_I]], <4 x i32> [[VQRDMLSHQ_S322_I]]) +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S324_I:%.*]] = bitcast <4 x i32> [[VQRDMLSHQ_S323_I]] to <16 x i8> +// CHECK-AARCH64-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VQRDMLSHQ_S324_I]] to <4 x i32> +// CHECK-AARCH64-NEXT: ret <4 x i32> [[TMP5]] // int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c index 5f1cb34e6603d..7f12598f71ff8 100644 --- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c +++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c @@ -21,25 
+21,31 @@ // CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32> // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> [[TMP8]]) -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <2 x i32> -// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <2 x i32> -// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP12]] +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <8 x i8> [[TMP10]]) +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <4 x half> +// CHECK-NOFP16-NEXT: [[TMP13:%.*]] = bitcast <4 x half> [[TMP12]] to <2 x i32> +// CHECK-NOFP16-NEXT: [[TMP14:%.*]] = bitcast <2 x i32> [[TMP13]] to <4 x half> +// CHECK-NOFP16-NEXT: [[TMP15:%.*]] = bitcast <4 x half> [[TMP14]] to <2 x i32> +// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP15]] // // CHECK-FP16-LABEL: define dso_local <4 x half> @test_vbsl_f16( // CHECK-FP16-SAME: <4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-FP16-NEXT: entry: -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-FP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) -// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x half> -// CHECK-FP16-NEXT: ret <4 x half> [[TMP3]] +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-FP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]]) +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <4 x half> +// CHECK-FP16-NEXT: ret <4 x half> [[TMP6]] // float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) { return vbsl_f16(a, b, c); @@ -54,25 +60,31 @@ float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, 
float16x4_t c) { // CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32> // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <4 x i32> -// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <4 x i32> -// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP12]] +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i8> [[TMP10]]) +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <8 x half> +// CHECK-NOFP16-NEXT: [[TMP13:%.*]] = bitcast <8 x half> [[TMP12]] to <4 x i32> +// CHECK-NOFP16-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <8 x half> +// CHECK-NOFP16-NEXT: [[TMP15:%.*]] = bitcast <8 x half> [[TMP14]] to <4 x i32> +// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP15]] // // CHECK-FP16-LABEL: define dso_local <8 x half> @test_vbslq_f16( // CHECK-FP16-SAME: <8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-FP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) -// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x half> -// CHECK-FP16-NEXT: ret <8 x half> [[TMP3]] +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-FP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <8 x half> +// CHECK-FP16-NEXT: ret <8 x half> [[TMP6]] // float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) { return vbslq_f16(a, b, c); @@ -88,28 +100,34 @@ float16x8_t 
test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) { // CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i32> // CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]] -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> -// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META3]] +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i32> +// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP3_I]], ptr [[TMP12]], align 4, !alias.scope [[META3]] // CHECK-NOFP16-NEXT: ret void // // CHECK-FP16-LABEL: define dso_local void @test_vzip_f16( // CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: // CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> // CHECK-FP16-NEXT: store <4 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]] -// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1 -// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-FP16-NEXT: store <4 x half> 
[[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META3]] +// CHECK-FP16-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1 +// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> +// CHECK-FP16-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META3]] // CHECK-FP16-NEXT: ret void // float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) { @@ -126,28 +144,34 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) { // CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i32> // CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]] -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> -// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META6]] +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i32> +// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP3_I]], ptr [[TMP12]], align 4, !alias.scope [[META6]] // CHECK-NOFP16-NEXT: ret void // // CHECK-FP16-LABEL: define dso_local void @test_vzipq_f16( // CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: // CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-FP16-NEXT: 
[[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> // CHECK-FP16-NEXT: store <8 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]] -// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1 -// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-FP16-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]] +// CHECK-FP16-NEXT: [[TMP6:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1 +// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> +// CHECK-FP16-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META6]] // CHECK-FP16-NEXT: ret void // float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) { @@ -164,28 +188,34 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) { // CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]) // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i32> // CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]] -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> -// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META9]] +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i32> +// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP3_I]], ptr [[TMP12]], align 4, !alias.scope [[META9]] // CHECK-NOFP16-NEXT: ret void // // CHECK-FP16-LABEL: define dso_local void @test_vuzp_f16( // CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: // CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]) -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// 
CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> // CHECK-FP16-NEXT: store <4 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]] -// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1 -// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-FP16-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]] +// CHECK-FP16-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1 +// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> +// CHECK-FP16-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META9]] // CHECK-FP16-NEXT: ret void // float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) { @@ -202,28 +232,34 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) { // CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i32> // CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]] -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> -// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META12]] +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i32> +// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP3_I]], ptr [[TMP12]], align 4, !alias.scope [[META12]] // CHECK-NOFP16-NEXT: ret void // // CHECK-FP16-LABEL: define dso_local 
void @test_vuzpq_f16( // CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: // CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> // CHECK-FP16-NEXT: store <8 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]] -// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1 -// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-FP16-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META12]] +// CHECK-FP16-NEXT: [[TMP6:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1 +// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> +// CHECK-FP16-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP6]], align 4, !alias.scope [[META12]] // CHECK-FP16-NEXT: ret void // float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) { @@ -240,28 +276,34 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) { // CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]]) // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i32> // CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]] -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], 
<4 x i32> -// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META15]] +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i32> +// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN3_I]], ptr [[TMP12]], align 4, !alias.scope [[META15]] // CHECK-NOFP16-NEXT: ret void // // CHECK-FP16-LABEL: define dso_local void @test_vtrn_f16( // CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: // CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]]) -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> // CHECK-FP16-NEXT: store <4 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]] -// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1 -// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-FP16-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]] +// CHECK-FP16-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1 +// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> +// CHECK-FP16-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP6]], align 4, !alias.scope [[META15]] // CHECK-FP16-NEXT: ret void // float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) { @@ -278,28 +320,34 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) { // CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]]) // CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half> // CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x 
i16> +// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i32> // CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]] -// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32> -// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META18]] +// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 +// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i32> +// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN3_I]], ptr [[TMP12]], align 4, !alias.scope [[META18]] // CHECK-NOFP16-NEXT: ret void // // CHECK-FP16-LABEL: define dso_local void @test_vtrnq_f16( // CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: // CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]]) -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> // CHECK-FP16-NEXT: store <8 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]] -// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1 -// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-FP16-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]] +// CHECK-FP16-NEXT: [[TMP6:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1 +// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> +// CHECK-FP16-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP6]], align 4, !alias.scope [[META18]] // CHECK-FP16-NEXT: ret void // float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) { @@ -418,19 +466,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) { // CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] { // CHECK-NOFP16-NEXT: entry: // CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> -// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x half> 
[[TMP3]] to <2 x i32> -// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP4]] +// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> +// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half> +// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <2 x i32> +// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP5]] // // CHECK-FP16-LABEL: define dso_local <4 x half> @test_vdup_lane_f16( // CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32> // CHECK-FP16-NEXT: ret <4 x half> [[LANE]] // float16x4_t test_vdup_lane_f16(float16x4_t a) { @@ -441,19 +491,21 @@ float16x4_t test_vdup_lane_f16(float16x4_t a) { // CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] { // CHECK-NOFP16-NEXT: entry: // CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <8 x i32> -// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x i32> -// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP4]] +// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <8 x i32> +// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half> +// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x i32> +// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP5]] // // CHECK-FP16-LABEL: define dso_local <8 x half> @test_vdupq_lane_f16( // CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <8 x i32> // CHECK-FP16-NEXT: ret <8 x half> [[LANE]] // float16x8_t test_vdupq_lane_f16(float16x4_t a) { @@ -465,23 +517,27 @@ 
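// The vdup_lane and vext hunks in this file all collapse to a single
// shufflevector once the half<->i16 bitcast round-trips are peeled away,
// and that is the shape the regenerated CHECK lines pin down. As a rough
// sketch of the lane-broadcast semantics being verified (illustrative
// only: the helper name and the lane index 3 are assumptions, not taken
// from the test source, and +fullfp16 is assumed as in the RUN lines):

#include <arm_neon.h>

// Hand-rolled equivalent of vdup_lane_f16(a, 3): extract one lane, then
// splat it. Clang lowers the real intrinsic to one shufflevector with a
// splat mask, which is what the CHECK-FP16 lines nearby assert (modulo
// the bitcasts inserted around <4 x half> values).
static inline float16x4_t dup_lane3_sketch(float16x4_t a) {
  float16_t lane = vget_lane_f16(a, 3); // read lane 3
  return vdup_n_f16(lane);              // broadcast it to all four lanes
}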
float16x8_t test_vdupq_lane_f16(float16x4_t a) { // CHECK-NOFP16-NEXT: entry: // CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half> // CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8> -// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16> -// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[VEXT]] to <4 x half> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <2 x i32> -// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP7]] +// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> +// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i32> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x i16> [[VEXT]] to <4 x half> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <2 x i32> +// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP9]] // // CHECK-FP16-LABEL: define dso_local <4 x half> @test_vext_f16( // CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP4]], <4 x half> [[TMP5]], <4 x i32> // CHECK-FP16-NEXT: ret <4 x half> [[VEXT]] // float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) { @@ -493,23 +549,27 @@ float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) { // CHECK-NOFP16-NEXT: entry: // CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half> // CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <16 x i8> -// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16> -// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], 
<8 x i32> -// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[VEXT]] to <8 x half> -// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x i32> -// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP7]] +// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> +// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> +// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> [[TMP7]], <8 x i32> +// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i16> [[VEXT]] to <8 x half> +// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x i32> +// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP9]] // // CHECK-FP16-LABEL: define dso_local <8 x half> @test_vextq_f16( // CHECK-FP16-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-FP16-NEXT: entry: -// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> +// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-FP16-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-FP16-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x i32> // CHECK-FP16-NEXT: ret <8 x half> [[VEXT]] // float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) { diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c index 59f56b988d2ab..f85deeeca757f 100644 --- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c @@ -1,819 +1,1594 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,sroa \ // RUN: | FileCheck %s // REQUIRES: arm-registered-target #include -// CHECK-LABEL: test_vabs_f16 -// CHECK: [[ABS:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[ABS]] +// CHECK-LABEL: define dso_local <4 x half> @test_vabs_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[VABS_I]]) +// CHECK-NEXT: ret <4 
x half> [[VABS1_I]] +// float16x4_t test_vabs_f16(float16x4_t a) { return vabs_f16(a); } -// CHECK-LABEL: test_vabsq_f16 -// CHECK: [[ABS:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[ABS]] +// CHECK-LABEL: define dso_local <8 x half> @test_vabsq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[VABS_I]]) +// CHECK-NEXT: ret <8 x half> [[VABS1_I]] +// float16x8_t test_vabsq_f16(float16x8_t a) { return vabsq_f16(a); } -// CHECK-LABEL: test_vceqz_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <4 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]] +// uint16x4_t test_vceqz_f16(float16x4_t a) { return vceqz_f16(a); } -// CHECK-LABEL: test_vceqzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <8 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]] +// uint16x8_t test_vceqzq_f16(float16x8_t a) { return vceqzq_f16(a); } -// CHECK-LABEL: test_vcgez_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgez_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <4 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]] +// uint16x4_t test_vcgez_f16(float16x4_t a) { return vcgez_f16(a); } -// CHECK-LABEL: test_vcgezq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgezq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp oge <8 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]] +// uint16x8_t test_vcgezq_f16(float16x8_t a) { return vcgezq_f16(a); } -// CHECK-LABEL: test_vcgtz_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgtz_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]] +// uint16x4_t test_vcgtz_f16(float16x4_t a) { return vcgtz_f16(a); } -// CHECK-LABEL: test_vcgtzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtzq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <8 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]] +// uint16x8_t test_vcgtzq_f16(float16x8_t a) { return vcgtzq_f16(a); } -// CHECK-LABEL: test_vclez_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclez_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp ole <4 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]] +// uint16x4_t test_vclez_f16(float16x4_t a) { return vclez_f16(a); } -// CHECK-LABEL: test_vclezq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclezq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: 
[[TMP3:%.*]] = fcmp ole <8 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]] +// uint16x8_t test_vclezq_f16(float16x8_t a) { return vclezq_f16(a); } -// CHECK-LABEL: test_vcltz_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcltz_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <4 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCLTZ_I]] +// uint16x4_t test_vcltz_f16(float16x4_t a) { return vcltz_f16(a); } -// CHECK-LABEL: test_vcltzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltzq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <8 x half> [[TMP2]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCLTZ_I]] +// uint16x8_t test_vcltzq_f16(float16x8_t a) { return vcltzq_f16(a); } -// CHECK-LABEL: test_vcvt_f16_s16 -// CHECK: [[VCVT:%.*]] = sitofp <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[VCVT_I]] +// float16x4_t test_vcvt_f16_s16 (int16x4_t a) { return vcvt_f16_s16(a); } -// CHECK-LABEL: test_vcvtq_f16_s16 -// CHECK: [[VCVT:%.*]] = sitofp <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_f16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[VCVT_I]] +// float16x8_t test_vcvtq_f16_s16 (int16x8_t a) { return vcvtq_f16_s16(a); } -// CHECK-LABEL: test_vcvt_f16_u16 -// CHECK: [[VCVT:%.*]] = uitofp <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: 
[[VCVT_I:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[VCVT_I]] +// float16x4_t test_vcvt_f16_u16 (uint16x4_t a) { return vcvt_f16_u16(a); } -// CHECK-LABEL: test_vcvtq_f16_u16 -// CHECK: [[VCVT:%.*]] = uitofp <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_f16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[VCVT_I]] +// float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) { return vcvtq_f16_u16(a); } -// CHECK-LABEL: test_vcvt_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <4 x half> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCVT_I]] +// int16x4_t test_vcvt_s16_f16 (float16x4_t a) { return vcvt_s16_f16(a); } -// CHECK-LABEL: test_vcvtq_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <8 x half> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCVT_I]] +// int16x8_t test_vcvtq_s16_f16 (float16x8_t a) { return vcvtq_s16_f16(a); } -// CHECK-LABEL: test_vcvt_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <4 x half> [[TMP2]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCVT_I]] +// int16x4_t test_vcvt_u16_f16 (float16x4_t a) { return vcvt_u16_f16(a); } -// CHECK-LABEL: test_vcvtq_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <8 x half> [[TMP2]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCVT_I]] +// int16x8_t test_vcvtq_u16_f16 (float16x8_t a) { return vcvtq_u16_f16(a); } -// CHECK-LABEL: 
test_vcvta_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtas.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvta_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTA_S16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTA_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtas.v4i16.v4f16(<4 x half> [[VCVTA_S16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTA_S16_F161_I]] +// int16x4_t test_vcvta_s16_f16 (float16x4_t a) { return vcvta_s16_f16(a); } -// CHECK-LABEL: test_vcvta_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtau.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvta_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTA_U16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTA_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtau.v4i16.v4f16(<4 x half> [[VCVTA_U16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTA_U16_F161_I]] +// int16x4_t test_vcvta_u16_f16 (float16x4_t a) { return vcvta_u16_f16(a); } -// CHECK-LABEL: test_vcvtaq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtas.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtaq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTAQ_S16_F16_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTAQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtas.v8i16.v8f16(<8 x half> [[VCVTAQ_S16_F16_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTAQ_S16_F161_I]] +// int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) { return vcvtaq_s16_f16(a); } -// CHECK-LABEL: test_vcvtm_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtms.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtm_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTM_S16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTM_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtms.v4i16.v4f16(<4 x half> [[VCVTM_S16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTM_S16_F161_I]] +// int16x4_t test_vcvtm_s16_f16 (float16x4_t a) { return vcvtm_s16_f16(a); } -// CHECK-LABEL: test_vcvtmq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtms.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtmq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTMQ_S16_F16_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTMQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtms.v8i16.v8f16(<8 x half> [[VCVTMQ_S16_F16_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTMQ_S16_F161_I]] +// int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) { return vcvtmq_s16_f16(a); } -// CHECK-LABEL: test_vcvtm_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtmu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtm_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTM_U16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTM_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtmu.v4i16.v4f16(<4 x half> [[VCVTM_U16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTM_U16_F161_I]] +// uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) { return vcvtm_u16_f16(a); } -// CHECK-LABEL: test_vcvtmq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtmu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtmq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTMQ_U16_F16_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTMQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtmu.v8i16.v8f16(<8 x half> [[VCVTMQ_U16_F16_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTMQ_U16_F161_I]] +// uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) { return vcvtmq_u16_f16(a); } -// CHECK-LABEL: test_vcvtn_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtns.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtn_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTN_S16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTN_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtns.v4i16.v4f16(<4 x half> [[VCVTN_S16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTN_S16_F161_I]] +// int16x4_t test_vcvtn_s16_f16 (float16x4_t a) { return vcvtn_s16_f16(a); } -// CHECK-LABEL: test_vcvtnq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtns.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtnq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTNQ_S16_F16_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTNQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtns.v8i16.v8f16(<8 x half> [[VCVTNQ_S16_F16_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTNQ_S16_F161_I]] +// int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) { return 
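// Aside on the vcvt{a,n,m,p} conversions above and below: the letter after
// vcvt encodes the rounding mode. vcvtn* rounds to nearest with ties to
// even, vcvta* to nearest with ties away from zero, vcvtm* toward minus
// infinity, and vcvtp* toward plus infinity; the s/u suffix selects the
// signed or unsigned llvm.arm.neon.vcvt{n,a,m,p}{s,u} intrinsic. Only the
// rounding mode differs between these otherwise identical tests.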
vcvtnq_s16_f16(a); } -// CHECK-LABEL: test_vcvtn_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtnu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtn_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTN_U16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTN_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtnu.v4i16.v4f16(<4 x half> [[VCVTN_U16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTN_U16_F161_I]] +// uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) { return vcvtn_u16_f16(a); } -// CHECK-LABEL: test_vcvtnq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtnu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtnq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTNQ_U16_F16_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTNQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtnu.v8i16.v8f16(<8 x half> [[VCVTNQ_U16_F16_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTNQ_U16_F161_I]] +// uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) { return vcvtnq_u16_f16(a); } -// CHECK-LABEL: test_vcvtp_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtps.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtp_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTP_S16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTP_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtps.v4i16.v4f16(<4 x half> [[VCVTP_S16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTP_S16_F161_I]] +// int16x4_t test_vcvtp_s16_f16 (float16x4_t a) { return vcvtp_s16_f16(a); } -// CHECK-LABEL: test_vcvtpq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtps.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtpq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTPQ_S16_F16_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTPQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtps.v8i16.v8f16(<8 x half> [[VCVTPQ_S16_F16_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTPQ_S16_F161_I]] +// int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) { return vcvtpq_s16_f16(a); } -// CHECK-LABEL: test_vcvtp_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtpu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtp_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x 
half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVTP_U16_F16_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVTP_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtpu.v4i16.v4f16(<4 x half> [[VCVTP_U16_F16_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTP_U16_F161_I]] +// uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) { return vcvtp_u16_f16(a); } -// CHECK-LABEL: test_vcvtpq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtpu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtpq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVTPQ_U16_F16_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVTPQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtpu.v8i16.v8f16(<8 x half> [[VCVTPQ_U16_F16_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTPQ_U16_F161_I]] +// uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) { return vcvtpq_u16_f16(a); } // FIXME: Fix the zero constant when fp16 non-storage-only type becomes available. -// CHECK-LABEL: test_vneg_f16 -// CHECK: [[NEG:%.*]] = fneg <4 x half> %a -// CHECK: ret <4 x half> [[NEG]] +// CHECK-LABEL: define dso_local <4 x half> @test_vneg_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[A]] +// CHECK-NEXT: ret <4 x half> [[FNEG_I]] +// float16x4_t test_vneg_f16(float16x4_t a) { return vneg_f16(a); } -// CHECK-LABEL: test_vnegq_f16 -// CHECK: [[NEG:%.*]] = fneg <8 x half> %a -// CHECK: ret <8 x half> [[NEG]] +// CHECK-LABEL: define dso_local <8 x half> @test_vnegq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[A]] +// CHECK-NEXT: ret <8 x half> [[FNEG_I]] +// float16x8_t test_vnegq_f16(float16x8_t a) { return vnegq_f16(a); } -// CHECK-LABEL: test_vrecpe_f16 -// CHECK: [[RCP:%.*]] = call <4 x half> @llvm.arm.neon.vrecpe.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RCP]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrecpe_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrecpe.v4f16(<4 x half> [[VRECPE_V_I]]) +// CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]] +// float16x4_t test_vrecpe_f16(float16x4_t a) { return vrecpe_f16(a); } -// CHECK-LABEL: test_vrecpeq_f16 -// CHECK: [[RCP:%.*]] = call <8 x half> @llvm.arm.neon.vrecpe.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RCP]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrecpeq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrecpe.v8f16(<8 x half> [[VRECPEQ_V_I]]) +// 
CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]] +// float16x8_t test_vrecpeq_f16(float16x8_t a) { return vrecpeq_f16(a); } -// CHECK-LABEL: test_vrnd_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintz.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrnd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRND_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRND_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintz.v4f16(<4 x half> [[VRND_V_I]]) +// CHECK-NEXT: [[VRND_V2_I:%.*]] = bitcast <4 x half> [[VRND_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRND_V2_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vrnd_f16(float16x4_t a) { return vrnd_f16(a); } -// CHECK-LABEL: test_vrndq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintz.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VRNDQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintz.v8f16(<8 x half> [[VRNDQ_V_I]]) +// CHECK-NEXT: [[VRNDQ_V2_I:%.*]] = bitcast <8 x half> [[VRNDQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDQ_V2_I]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// float16x8_t test_vrndq_f16(float16x8_t a) { return vrndq_f16(a); } -// CHECK-LABEL: test_vrnda_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrinta.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrnda_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRNDA_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRNDA_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrinta.v4f16(<4 x half> [[VRNDA_V_I]]) +// CHECK-NEXT: [[VRNDA_V2_I:%.*]] = bitcast <4 x half> [[VRNDA_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDA_V2_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vrnda_f16(float16x4_t a) { return vrnda_f16(a); } -// CHECK-LABEL: test_vrndaq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrinta.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndaq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDAQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: 
[[VRNDAQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrinta.v8f16(<8 x half> [[VRNDAQ_V_I]]) +// CHECK-NEXT: [[VRNDAQ_V2_I:%.*]] = bitcast <8 x half> [[VRNDAQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDAQ_V2_I]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// float16x8_t test_vrndaq_f16(float16x8_t a) { return vrndaq_f16(a); } -// CHECK-LABEL: test_vrndm_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintm.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndm_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRNDM_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRNDM_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintm.v4f16(<4 x half> [[VRNDM_V_I]]) +// CHECK-NEXT: [[VRNDM_V2_I:%.*]] = bitcast <4 x half> [[VRNDM_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDM_V2_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vrndm_f16(float16x4_t a) { return vrndm_f16(a); } -// CHECK-LABEL: test_vrndmq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintm.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndmq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VRNDMQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintm.v8f16(<8 x half> [[VRNDMQ_V_I]]) +// CHECK-NEXT: [[VRNDMQ_V2_I:%.*]] = bitcast <8 x half> [[VRNDMQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDMQ_V2_I]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// float16x8_t test_vrndmq_f16(float16x8_t a) { return vrndmq_f16(a); } -// CHECK-LABEL: test_vrndn_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintn.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndn_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRNDN_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRNDN_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintn.v4f16(<4 x half> [[VRNDN_V_I]]) +// CHECK-NEXT: [[VRNDN_V2_I:%.*]] = bitcast <4 x half> [[VRNDN_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDN_V2_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vrndn_f16(float16x4_t a) { return vrndn_f16(a); } -// CHECK-LABEL: test_vrndnq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintn.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> 
@test_vrndnq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDNQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VRNDNQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintn.v8f16(<8 x half> [[VRNDNQ_V_I]]) +// CHECK-NEXT: [[VRNDNQ_V2_I:%.*]] = bitcast <8 x half> [[VRNDNQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDNQ_V2_I]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// float16x8_t test_vrndnq_f16(float16x8_t a) { return vrndnq_f16(a); } -// CHECK-LABEL: test_vrndp_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintp.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndp_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRNDP_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRNDP_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintp.v4f16(<4 x half> [[VRNDP_V_I]]) +// CHECK-NEXT: [[VRNDP_V2_I:%.*]] = bitcast <4 x half> [[VRNDP_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDP_V2_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vrndp_f16(float16x4_t a) { return vrndp_f16(a); } -// CHECK-LABEL: test_vrndpq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintp.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndpq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDPQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VRNDPQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintp.v8f16(<8 x half> [[VRNDPQ_V_I]]) +// CHECK-NEXT: [[VRNDPQ_V2_I:%.*]] = bitcast <8 x half> [[VRNDPQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDPQ_V2_I]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// float16x8_t test_vrndpq_f16(float16x8_t a) { return vrndpq_f16(a); } -// CHECK-LABEL: test_vrndx_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintx.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndx_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRNDX_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRNDX_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintx.v4f16(<4 x half> [[VRNDX_V_I]]) +// CHECK-NEXT: [[VRNDX_V2_I:%.*]] = bitcast <4 x half> [[VRNDX_V1_I]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[VRNDX_V2_I]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x 
i16> [[TMP2]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP3]] +// float16x4_t test_vrndx_f16(float16x4_t a) { return vrndx_f16(a); } -// CHECK-LABEL: test_vrndxq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintx.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndxq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRNDXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VRNDXQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintx.v8f16(<8 x half> [[VRNDXQ_V_I]]) +// CHECK-NEXT: [[VRNDXQ_V2_I:%.*]] = bitcast <8 x half> [[VRNDXQ_V1_I]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[VRNDXQ_V2_I]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// float16x8_t test_vrndxq_f16(float16x8_t a) { return vrndxq_f16(a); } -// CHECK-LABEL: test_vrsqrte_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrte.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrsqrte_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrte.v4f16(<4 x half> [[VRSQRTE_V_I]]) +// CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]] +// float16x4_t test_vrsqrte_f16(float16x4_t a) { return vrsqrte_f16(a); } -// CHECK-LABEL: test_vrsqrteq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrte.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrsqrteq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrte.v8f16(<8 x half> [[VRSQRTEQ_V_I]]) +// CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]] +// float16x8_t test_vrsqrteq_f16(float16x8_t a) { return vrsqrteq_f16(a); } -// CHECK-LABEL: test_vadd_f16 -// CHECK: [[ADD:%.*]] = fadd <4 x half> %a, %b -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vadd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x half> [[A]], [[B]] +// CHECK-NEXT: ret <4 x half> [[ADD_I]] +// float16x4_t test_vadd_f16(float16x4_t a, float16x4_t b) { return vadd_f16(a, b); } -// CHECK-LABEL: test_vaddq_f16 -// CHECK: [[ADD:%.*]] = fadd <8 x half> %a, %b -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vaddq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <8 x half> [[A]], [[B]] +// CHECK-NEXT: ret <8 x half> [[ADD_I]] +// float16x8_t 
test_vaddq_f16(float16x8_t a, float16x8_t b) { return vaddq_f16(a, b); } -// CHECK-LABEL: test_vabd_f16 -// CHECK: [[ABD:%.*]] = call <4 x half> @llvm.arm.neon.vabds.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[ABD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vabd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vabds.v4f16(<4 x half> [[VABD_V_I]], <4 x half> [[VABD_V1_I]]) +// CHECK-NEXT: [[VABD_V3_I:%.*]] = bitcast <4 x half> [[VABD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) { return vabd_f16(a, b); } -// CHECK-LABEL: test_vabdq_f16 -// CHECK: [[ABD:%.*]] = call <8 x half> @llvm.arm.neon.vabds.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[ABD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vabdq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vabds.v8f16(<8 x half> [[VABDQ_V_I]], <8 x half> [[VABDQ_V1_I]]) +// CHECK-NEXT: [[VABDQ_V3_I:%.*]] = bitcast <8 x half> [[VABDQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) { return vabdq_f16(a, b); } -// CHECK-LABEL: test_vcage_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcage_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> [[VCAGE_V_I]], <4 x half> [[VCAGE_V1_I]]) +// CHECK-NEXT: ret <4 x i16> 
[[VCAGE_V2_I]] +// uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) { return vcage_f16(a, b); } -// CHECK-LABEL: test_vcageq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcageq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> [[VCAGEQ_V_I]], <8 x half> [[VCAGEQ_V1_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]] +// uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) { return vcageq_f16(a, b); } -// CHECK-LABEL: test_vcagt_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcagt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> [[VCAGT_V_I]], <4 x half> [[VCAGT_V1_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]] +// uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) { return vcagt_f16(a, b); } -// CHECK-LABEL: test_vcagtq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcagtq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> [[VCAGTQ_V_I]], <8 x half> [[VCAGTQ_V1_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]] +// uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) { return vcagtq_f16(a, b); } -// CHECK-LABEL: test_vcale_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> %b, <4 x half> %a) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcale_f16( +// CHECK-SAME: <4 x half> noundef 
[[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> [[VCALE_V_I]], <4 x half> [[VCALE_V1_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]] +// uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) { return vcale_f16(a, b); } -// CHECK-LABEL: test_vcaleq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %b, <8 x half> %a) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcaleq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> [[VCALEQ_V_I]], <8 x half> [[VCALEQ_V1_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]] +// uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) { return vcaleq_f16(a, b); } -// CHECK-LABEL: test_vcalt_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %b, <4 x half> %a) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcalt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> [[VCALT_V_I]], <4 x half> [[VCALT_V1_I]]) +// CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]] +// uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) { return vcalt_f16(a, b); } -// CHECK-LABEL: test_vcaltq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %b, <8 x half> %a) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcaltq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: 
[[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> [[VCALTQ_V_I]], <8 x half> [[VCALTQ_V1_I]]) +// CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]] +// uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) { return vcaltq_f16(a, b); } -// CHECK-LABEL: test_vceq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_f16(float16x4_t a, float16x4_t b) { return vceq_f16(a, b); } -// CHECK-LABEL: test_vceqq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_f16(float16x8_t a, float16x8_t b) { return vceqq_f16(a, b); } -// CHECK-LABEL: test_vcge_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcge_f16(float16x4_t a, float16x4_t b) { return vcge_f16(a, b); } -// CHECK-LABEL: test_vcgeq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgeq_f16(float16x8_t a, float16x8_t b) { return vcgeq_f16(a, b); } -// CHECK-LABEL: test_vcgt_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_f16(float16x4_t a, float16x4_t b) { return vcgt_f16(a, b); } -// 
CHECK-LABEL: test_vcgtq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_f16(float16x8_t a, float16x8_t b) { return vcgtq_f16(a, b); } -// CHECK-LABEL: test_vcle_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcle_f16(float16x4_t a, float16x4_t b) { return vcle_f16(a, b); } -// CHECK-LABEL: test_vcleq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcleq_f16(float16x8_t a, float16x8_t b) { return vcleq_f16(a, b); } -// CHECK-LABEL: test_vclt_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vclt_f16(float16x4_t a, float16x4_t b) { return vclt_f16(a, b); } -// CHECK-LABEL: test_vcltq_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) { return vcltq_f16(a, b); } -// CHECK-LABEL: test_vcvt_n_f16_s16 -// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2) -// CHECK: ret <4 x half> [[CVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_n_f16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] 
= bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <4 x half> [[VCVT_N1]] +// float16x4_t test_vcvt_n_f16_s16(int16x4_t a) { return vcvt_n_f16_s16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_f16_s16 -// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2) -// CHECK: ret <8 x half> [[CVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_n_f16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <8 x half> [[VCVT_N1]] +// float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) { return vcvtq_n_f16_s16(a, 2); } -// CHECK-LABEL: test_vcvt_n_f16_u16 -// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2) -// CHECK: ret <4 x half> [[CVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_n_f16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <4 x half> [[VCVT_N1]] +// float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) { return vcvt_n_f16_u16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_f16_u16 -// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2) -// CHECK: ret <8 x half> [[CVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_n_f16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <8 x half> [[VCVT_N1]] +// float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) { return vcvtq_n_f16_u16(a, 2); } -// CHECK-LABEL: test_vcvt_n_s16_f16 -// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> %vcvt_n, i32 2) -// CHECK: ret <4 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_n_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]] +// int16x4_t test_vcvt_n_s16_f16(float16x4_t a) { return vcvt_n_s16_f16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_s16_f16 -// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> %vcvt_n, i32 2) -// CHECK: ret <8 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_n_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]] +// int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) { return vcvtq_n_s16_f16(a, 2); } -// CHECK-LABEL: test_vcvt_n_u16_f16 -// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> %vcvt_n, i32 2) -// CHECK: ret <4 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_n_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]] +// uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) { return vcvt_n_u16_f16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_u16_f16 -// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> %vcvt_n, i32 2) -// CHECK: ret <8 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_n_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2) +// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]] +// uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) { return vcvtq_n_u16_f16(a, 2); } -// CHECK-LABEL: test_vmax_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxs.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmax_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmaxs.v4f16(<4 x half> [[VMAX_V_I]], <4 x half> [[VMAX_V1_I]]) +// CHECK-NEXT: [[VMAX_V3_I:%.*]] = bitcast <4 x half> [[VMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) { return vmax_f16(a, b); } -// CHECK-LABEL: test_vmaxq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxs.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmaxq_f16( +// CHECK-SAME: <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmaxs.v8f16(<8 x half> [[VMAXQ_V_I]], <8 x half> [[VMAXQ_V1_I]]) +// CHECK-NEXT: [[VMAXQ_V3_I:%.*]] = bitcast <8 x half> [[VMAXQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) { return vmaxq_f16(a, b); } -// CHECK-LABEL: test_vmaxnm_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxnm.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmaxnm_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMAXNM_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMAXNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmaxnm.v4f16(<4 x half> [[VMAXNM_V_I]], <4 x half> [[VMAXNM_V1_I]]) +// CHECK-NEXT: [[VMAXNM_V3_I:%.*]] = bitcast <4 x half> [[VMAXNM_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMAXNM_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) { return vmaxnm_f16(a, b); } -// CHECK-LABEL: test_vmaxnmq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxnm.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmaxnmq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMAXNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMAXNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmaxnm.v8f16(<8 x half> [[VMAXNMQ_V_I]], <8 x half> [[VMAXNMQ_V1_I]]) +// CHECK-NEXT: [[VMAXNMQ_V3_I:%.*]] = bitcast <8 x half> [[VMAXNMQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMAXNMQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// float16x8_t test_vmaxnmq_f16(float16x8_t a, 
float16x8_t b) { return vmaxnmq_f16(a, b); } -// CHECK-LABEL: test_vmin_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vmins.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmin_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmins.v4f16(<4 x half> [[VMIN_V_I]], <4 x half> [[VMIN_V1_I]]) +// CHECK-NEXT: [[VMIN_V3_I:%.*]] = bitcast <4 x half> [[VMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) { return vmin_f16(a, b); } -// CHECK-LABEL: test_vminq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vmins.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vminq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmins.v8f16(<8 x half> [[VMINQ_V_I]], <8 x half> [[VMINQ_V1_I]]) +// CHECK-NEXT: [[VMINQ_V3_I:%.*]] = bitcast <8 x half> [[VMINQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) { return vminq_f16(a, b); } -// CHECK-LABEL: test_vminnm_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vminnm.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vminnm_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VMINNM_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VMINNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vminnm.v4f16(<4 x half> [[VMINNM_V_I]], <4 x half> [[VMINNM_V1_I]]) +// CHECK-NEXT: [[VMINNM_V3_I:%.*]] = bitcast <4 x half> 
[[VMINNM_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VMINNM_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) { return vminnm_f16(a, b); } -// CHECK-LABEL: test_vminnmq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vminnmq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VMINNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VMINNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> [[VMINNMQ_V_I]], <8 x half> [[VMINNMQ_V1_I]]) +// CHECK-NEXT: [[VMINNMQ_V3_I:%.*]] = bitcast <8 x half> [[VMINNMQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VMINNMQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) { return vminnmq_f16(a, b); } -// CHECK-LABEL: test_vmul_f16 -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, %b -// CHECK: ret <4 x half> [[MUL]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmul_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x half> [[A]], [[B]] +// CHECK-NEXT: ret <4 x half> [[MUL_I]] +// float16x4_t test_vmul_f16(float16x4_t a, float16x4_t b) { return vmul_f16(a, b); } -// CHECK-LABEL: test_vmulq_f16 -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, %b -// CHECK: ret <8 x half> [[MUL]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <8 x half> [[A]], [[B]] +// CHECK-NEXT: ret <8 x half> [[MUL_I]] +// float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) { return vmulq_f16(a, b); } -// CHECK-LABEL: test_vpadd_f16 -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vpadd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> [[VPADD_V_I]], <4 x half> [[VPADD_V1_I]]) +// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x half> 
[[VPADD_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) { return vpadd_f16(a, b); } -// CHECK-LABEL: test_vpmax_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vpmaxs.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] +// CHECK-LABEL: define dso_local <4 x half> @test_vpmax_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpmaxs.v4f16(<4 x half> [[VPMAX_V_I]], <4 x half> [[VPMAX_V1_I]]) +// CHECK-NEXT: [[VPMAX_V3_I:%.*]] = bitcast <4 x half> [[VPMAX_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) { return vpmax_f16(a, b); } -// CHECK-LABEL: test_vpmin_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vpmins.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vpmin_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpmins.v4f16(<4 x half> [[VPMIN_V_I]], <4 x half> [[VPMIN_V1_I]]) +// CHECK-NEXT: [[VPMIN_V3_I:%.*]] = bitcast <4 x half> [[VPMIN_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) { return vpmin_f16(a, b); } -// CHECK-LABEL: test_vrecps_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrecps.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrecps_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// 
CHECK-NEXT: [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vrecps.v4f16(<4 x half> [[VRECPS_V_I]], <4 x half> [[VRECPS_V1_I]]) +// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <4 x half> [[VRECPS_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) { return vrecps_f16(a, b); } -// CHECK-LABEL: test_vrecpsq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrecps.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrecpsq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vrecps.v8f16(<8 x half> [[VRECPSQ_V_I]], <8 x half> [[VRECPSQ_V1_I]]) +// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <8 x half> [[VRECPSQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) { return vrecpsq_f16(a, b); } -// CHECK-LABEL: test_vrsqrts_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrts.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrsqrts_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> +// CHECK-NEXT: [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrts.v4f16(<4 x half> [[VRSQRTS_V_I]], <4 x half> [[VRSQRTS_V1_I]]) +// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <4 x half> [[VRSQRTS_V2_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <4 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP5]] +// float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) { return vrsqrts_f16(a, b); } -// CHECK-LABEL: test_vrsqrtsq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrts.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrsqrtsq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> +// CHECK-NEXT: [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrts.v8f16(<8 x half> [[VRSQRTSQ_V_I]], <8 x half> [[VRSQRTSQ_V1_I]]) +// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <8 x half> [[VRSQRTSQ_V2_I]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) { return vrsqrtsq_f16(a, b); } -// CHECK-LABEL: test_vsub_f16 -// CHECK: [[ADD:%.*]] = fsub <4 x half> %a, %b -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vsub_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x half> [[A]], [[B]] +// CHECK-NEXT: ret <4 x half> [[SUB_I]] +// float16x4_t test_vsub_f16(float16x4_t a, float16x4_t b) { return vsub_f16(a, b); } -// CHECK-LABEL: test_vsubq_f16 -// CHECK: [[ADD:%.*]] = fsub <8 x half> %a, %b -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vsubq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <8 x half> [[A]], [[B]] +// CHECK-NEXT: ret <8 x half> [[SUB_I]] +// float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) { return vsubq_f16(a, b); } -// CHECK-LABEL: test_vfma_f16 -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a) -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vfma_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] +// float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_f16(a, b, c); } -// CHECK-LABEL: test_vfmaq_f16 -// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a) -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vfmaq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 x half> [[TMP6]]) +// CHECK-NEXT: ret <8 x half> [[TMP9]] +// float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_f16(a, b, c); } -// CHECK-LABEL: test_vfms_f16 -// CHECK: [[SUB:%.*]] = fneg <4 x half> %b -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a) -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vfms_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP7]], <4 x half> [[TMP8]], <4 x half> [[TMP6]]) +// CHECK-NEXT: ret <4 x half> [[TMP9]] +// float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfms_f16(a, b, c); } -// CHECK-LABEL: test_vfmsq_f16 -// CHECK: [[SUB:%.*]] = fneg <8 x half> %b -// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a) -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vfmsq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8> +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to <16 x i8> +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x half> +// CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> +// CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> +// CHECK-NEXT: [[TMP9:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP7]], <8 x half> [[TMP8]], <8 
x half> [[TMP6]]) +// CHECK-NEXT:    ret <8 x half> [[TMP9]] +// float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_f16(a, b, c); } -// CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] -// CHECK: ret <4 x half> [[MUL]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmul_lane_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT:  [[ENTRY:.*:]] +// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]] +// CHECK-NEXT:    ret <4 x half> [[MUL]] +// float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] -// CHECK: ret <8 x half> [[MUL]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_lane_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT:  [[ENTRY:.*:]] +// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> +// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT:    [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]] +// CHECK-NEXT:    ret <8 x half> [[MUL]] +// float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { return vmulq_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vmul_n_f16 -// CHECK: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[b:%.*]], i32 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[b]], i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[b]], i32 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[b]], i32 3 -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP3]] -// CHECK: ret <4 x half> [[MUL]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmul_n_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT:  [[ENTRY:.*:]] +// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i32 0 +// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1 +// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2 +// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3 +// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x half> [[A]], [[VECINIT3]] +// CHECK-NEXT:    ret <4 x half> [[MUL]] +// float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) { return vmul_n_f16(a, b); } -// CHECK-LABEL: test_vmulq_n_f16 -// CHECK: [[TMP0:%.*]] = insertelement <8 x half> poison, half [[b:%.*]], i32 0 -//
CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[b]], i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[b]], i32 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[b]], i32 3 -// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[b]], i32 4 -// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[b]], i32 5 -// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[b]], i32 6 -// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[b]], i32 7 -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP7]] -// CHECK: ret <8 x half> [[MUL]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_n_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT:  [[ENTRY:.*:]] +// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i32 0 +// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1 +// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2 +// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3 +// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4 +// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5 +// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6 +// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7 +// CHECK-NEXT:    [[MUL:%.*]] = fmul <8 x half> [[A]], [[VECINIT7]] +// CHECK-NEXT:    ret <8 x half> [[MUL]] +// float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) { return vmulq_n_f16(a, b); } diff --git a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c index 947f42cdd0de9..2c083d56238f5 100644 --- a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +fullfp16 -target-feature +i8mm \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ // RUN: | opt -S -passes=mem2reg,sroa \ @@ -7,81 +8,118 @@ #include <arm_neon.h> -// CHECK-LABEL: test_vmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT:  [[ENTRY:.*:]] +// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT:    [[VMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT:    [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> [[VMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT:    ret <4 x i32> [[VMMLA1_I]] +// int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) { return vmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vmmlaq_u32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_u32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT:
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> [[VMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]] +// uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) { return vmmlaq_u32(r, a, b); } -// CHECK-LABEL: test_vusmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSMMLA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[VUSMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> [[VUSMMLA_I]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VUSMMLA1_I]] +// int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_s32 -// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) -// CHECK: ret <2 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP3]]) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[A]], 
<8 x i8> [[TMP3]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudot_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP3]], <8 x i8> %a) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[R]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[VUSDOT_I]], <8 x i8> [[TMP3]], <8 x i8> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { return vsudot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vusdotq_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[A]], <16 x i8> [[TMP3]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { return vusdotq_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudotq_lane_s32 -// CHECK: [[TMP0:%.*]] = 
bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %3, <16 x i8> %a) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[R]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT_I:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> [[VUSDOT_I]], <16 x i8> [[TMP3]], <16 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { return vsudotq_lane_s32(r, a, b, 0); } From b0a79065178db615b9aaff50337185ad8ee78054 Mon Sep 17 00:00:00 2001 From: offsetof Date: Tue, 1 Apr 2025 10:44:10 +0000 Subject: [PATCH 0223/1029] [clang] Fix crash on invalid `std::initializer_list` template-id (#132284) In `Sema::BuildStdInitializerList`, check that the synthesized template-id `std::initializer_list` is valid (which might not be the case if the template has associated constraints or subsequent parameters with default arguments) before forming the type. Fixes #132256 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaDeclCXX.cpp | 8 ++++++-- .../SemaCXX/invalid-std-initializer-list.cpp | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 clang/test/SemaCXX/invalid-std-initializer-list.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index daad01919ecd4..c034b925cddc6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -371,6 +371,7 @@ Bug Fixes to C++ Support - Fixed a Clang regression in C++20 mode where unresolved dependent call expressions were created inside non-dependent contexts (#GH122892) - Clang now emits the ``-Wunused-variable`` warning when some structured bindings are unused and the ``[[maybe_unused]]`` attribute is not applied. (#GH125810) +- Fixed a crash caused by invalid declarations of ``std::initializer_list``. (#GH132256) - Clang no longer crashes when establishing subsumption between some constraint expressions. (#GH122581) - Clang now issues an error when placement new is used to modify a const-qualified variable in a ``constexpr`` function. 
(#GH131432) diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 676d53a1f4b45..d724e183b69bd 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -12182,10 +12182,14 @@ QualType Sema::BuildStdInitializerList(QualType Element, SourceLocation Loc) { Args.addArgument(TemplateArgumentLoc(TemplateArgument(Element), Context.getTrivialTypeSourceInfo(Element, Loc))); + + QualType T = CheckTemplateIdType(TemplateName(StdInitializerList), Loc, Args); + if (T.isNull()) + return QualType(); + return Context.getElaboratedType( ElaboratedTypeKeyword::None, - NestedNameSpecifier::Create(Context, nullptr, getStdNamespace()), - CheckTemplateIdType(TemplateName(StdInitializerList), Loc, Args)); + NestedNameSpecifier::Create(Context, nullptr, getStdNamespace()), T); } bool Sema::isInitListConstructor(const FunctionDecl *Ctor) { diff --git a/clang/test/SemaCXX/invalid-std-initializer-list.cpp b/clang/test/SemaCXX/invalid-std-initializer-list.cpp new file mode 100644 index 0000000000000..93246b5f03fd4 --- /dev/null +++ b/clang/test/SemaCXX/invalid-std-initializer-list.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 %s -verify -std=c++20 + +namespace std { + +template // expected-error 2 {{type 'int' cannot be used prior to '::' because it has no members}} +class initializer_list; + +} + +namespace gh132256 { + +auto x = {1}; // expected-note {{in instantiation of default argument for 'initializer_list' required here}} + +void f() { + for(int x : {1, 2}); // expected-note {{in instantiation of default argument for 'initializer_list' required here}} +} + +} From 19a319667b567a26a20f9829a0ae7e6a5c259cba Mon Sep 17 00:00:00 2001 From: Alexey Moksyakov Date: Tue, 1 Apr 2025 13:49:09 +0300 Subject: [PATCH 0224/1029] [bolt][aarch64] Adding test with unsupported indirect branches (#127655) This test contains a set of common indirect branch patterns. Support will be added step by step. --- bolt/lib/Core/BinaryFunction.cpp | 5 + bolt/test/AArch64/jmp-table-unsupported.s | 286 ++++++++++++++++++++++ 2 files changed, 291 insertions(+) create mode 100644 bolt/test/AArch64/jmp-table-unsupported.s diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 09006249887f6..d1b293ada5fdc 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1219,6 +1219,11 @@ void BinaryFunction::handleIndirectBranch(MCInst &Instruction, uint64_t Size, case IndirectBranchType::UNKNOWN: // Keep processing. We'll do more checks and fixes in // postProcessIndirectBranches().
+ if (opts::Verbosity > 2) { + outs() << "BOLT-WARNING: failed to match indirect branch, " + << getPrintName() << " at 0x" << Twine::utohexstr(Offset) + << " offset\n"; + } UnknownIndirectBranchOffsets.emplace(Offset); break; } diff --git a/bolt/test/AArch64/jmp-table-unsupported.s b/bolt/test/AArch64/jmp-table-unsupported.s new file mode 100644 index 0000000000000..1228149430449 --- /dev/null +++ b/bolt/test/AArch64/jmp-table-unsupported.s @@ -0,0 +1,286 @@ +## This test checks that the disassembly stage works properly +## JT with indirect branch +## 1) nop + adr pair instructions +## 2) sub + ldr pair instructions +## 3) adrp + ldr pair instructions +## 4) pic jt with relative offsets packed to 1-byte entry size +## 5) fixed indirect branch +## 6) normal jt + +# REQUIRES: system-linux + +# RUN: rm -rf %t && split-file %s %t + +## Prepare binary (1) +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %t/jt_nop_adr.s \ +# RUN: -o %t/jt_nop_adr.o +# RUN: %clang %cflags --target=aarch64-unknown-linux %t/jt_nop_adr.o \ +# RUN: -Wl,-q -Wl,-z,now, -Wl,-T,%t/within-adr-range.t -o %t/jt_nop_adr.exe +# RUN: llvm-objdump --no-show-raw-insn -d %t/jt_nop_adr.exe | FileCheck \ +# RUN: --check-prefix=JT-RELAXED %s + +# JT-RELAXED: <_start>: +# JT-RELAXED-NEXT: nop +# JT-RELAXED-NEXT: adr {{.*}}x3 + +# RUN: llvm-bolt %t/jt_nop_adr.exe -o %t/jt_nop_adr.bolt -v 3 2>&1 | FileCheck \ +# RUN: --check-prefix=JT-BOLT-RELAXED %s + +# JT-BOLT-RELAXED: failed to match indirect branch + +## This linker script ensures that .rodata and .text are sufficiently (<1M) +## close to each other so that the adrp + ldr pair can be relaxed to nop + adr. +#--- within-adr-range.t +SECTIONS { + .rodata 0x1000: { *(.rodata) } + .text 0x2000: { *(.text) } + .rela.rodata : { *(.rela.rodata) } +} + +## Prepare binary (2) +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %t/jt_sub_ldr.s \ +# RUN: -o %t/jt_sub_ldr.o +# RUN: %clang %cflags --target=aarch64-unknown-linux %t/jt_sub_ldr.o \ +# RUN: -Wl,-q -Wl,-z,now -o %t/jt_sub_ldr.exe +# RUN: llvm-objdump --no-show-raw-insn -d %t/jt_sub_ldr.exe | FileCheck \ +# RUN: --check-prefix=JT-SUB-LDR %s + +# JT-SUB-LDR: <_start>: +# JT-SUB-LDR-NEXT: sub +# JT-SUB-LDR-NEXT: ldr + +# RUN: llvm-bolt %t/jt_sub_ldr.exe -o %t/jt_sub_ldr.bolt -v 3 2>&1 | FileCheck \ +# RUN: --check-prefix=JT-BOLT-SUBLDR %s +# JT-BOLT-SUBLDR: failed to match indirect branch + +## Prepare binary (3) +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %t/jt_adrp_ldr.s \ +# RUN: -o %t/jt_adrp_ldr.o +# RUN: %clang %cflags --target=aarch64-unknown-linux %t/jt_adrp_ldr.o \ +# RUN: -Wl,-q -Wl,-z,now -Wl,--no-relax -o %t/jt_adrp_ldr.exe +# RUN: llvm-objdump --no-show-raw-insn -d %t/jt_adrp_ldr.exe | FileCheck \ +# RUN: --check-prefix=JT-ADRP-LDR %s + +# JT-ADRP-LDR: <_start>: +# JT-ADRP-LDR-NEXT: adrp +# JT-ADRP-LDR-NEXT: ldr + +# RUN: llvm-bolt %t/jt_adrp_ldr.exe -o %t/jt_adrp_ldr.bolt -v 3 2>&1 | FileCheck \ +# RUN: --check-prefix=JT-BOLT-ADRP-LDR %s +# JT-BOLT-ADRP-LDR: failed to match indirect branch + +## Prepare binary (4) +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --position-independent %t/jt_pic_with_relative_offset.s \ +# RUN: -o %t/jt_pic_with_relative_offset.o +# RUN: %clang %cflags -fPIC -O0 %t/jt_pic_with_relative_offset.o \ +# RUN: -o %t/jt_pic_with_relative_offset.exe -Wl,-q -Wl,--no-relax +# RUN: llvm-bolt %t/jt_pic_with_relative_offset.exe \ +# RUN: -o %t/jt_pic_with_relative_offset.bolt -v 3 2>&1 | FileCheck \ +# RUN: --check-prefix=JT-BOLT-JT-PIC-OFFSETS
%s + +# JT-BOLT-JT-PIC-OFFSETS: failed to match indirect branch + +## Prepare binary (5) +# RUN: %clang %cflags %t/jt_fixed_branch.s -Wl,-q -Wl,--no-relax \ +# RUN: -o %t/jt_fixed_branch.exe + +# RUN: llvm-bolt %t/jt_fixed_branch.exe \ +# RUN: -o %t/jt_fixed_branch.bolt -v 3 2>&1 | FileCheck \ +# RUN: --check-prefix=JT-BOLT-FIXED-BR %s + +# JT-BOLT-FIXED-BR: failed to match indirect branch + +## Prepare binary (6) +# RUN: %clang %cflags -no-pie %t/jt_type_normal.c \ +# RUN: -Wl,-q -Wl,-z,now -Wl,--no-relax \ +# RUN: -o %t/jt_type_normal.exe +# RUN: llvm-objdump --no-show-raw-insn -d %t/jt_type_normal.exe | FileCheck \ +# RUN: --check-prefix=JT-OBJDUMP-NORMAL %s + +# JT-OBJDUMP-NORMAL: : +# JT-OBJDUMP-NORMAL: adrp +# JT-OBJDUMP-NORMAL-NEXT: add +# JT-OBJDUMP-NORMAL-NEXT: ldr +# JT-OBJDUMP-NORMAL-NEXT: blr + +# RUN: llvm-bolt %t/jt_type_normal.exe --dyno-stats \ +# RUN: -o %t/jt_type_normal.bolt -v 3 2>&1 | FileCheck \ +# RUN: --check-prefix=JT-BOLT-NORMAL %s + +# JT-BOLT-NORMAL: 0{{.*}}: indirect calls + +#--- jt_nop_adr.s + .globl _start + .type _start, %function +_start: + adrp x3, :got:jump_table + ldr x3, [x3, #:got_lo12:jump_table] + ldrh w3, [x3, x1, lsl #1] + adr x1, test2_0 + add x3, x1, w3, sxth #2 + br x3 +test2_0: + ret +test2_1: + ret + + .section .rodata,"a",@progbits +jump_table: + .hword (test2_0-test2_0)>>2 + .hword (test2_1-test2_0)>>2 + + +#--- jt_sub_ldr.s + .globl _start + .type _start, %function +_start: + sub x1, x29, #0x4, lsl #12 + ldr x1, [x1, #14352] + ldrh w1, [x1, w3, uxtw #1] + adr x3, test2_0 + add x1, x3, w1, sxth #2 + br x1 +test2_0: + ret +test2_1: + ret + + .section .rodata,"a",@progbits +jump_table: + .hword (test2_0-test2_0)>>2 + .hword (test2_1-test2_0)>>2 + + +#--- jt_adrp_ldr.s + .globl _start + .type _start, %function +_start: + adrp x3, :got:jump_table + ldr x3, [x3, #:got_lo12:jump_table] + ldrh w3, [x3, x1, lsl #1] + adr x1, test2_0 + add x3, x1, w3, sxth #2 + br x3 +test2_0: + ret +test2_1: + ret + + .section .rodata,"a",@progbits +jump_table: + .hword (test2_0-test2_0)>>2 + .hword (test2_1-test2_0)>>2 + + +#--- jt_pic_with_relative_offset.s +.text +.global _start +_start: + mov x4, 3 // index into the jump table; offsets are relative to the adr instruction + adrp x0, funcTableSym + add x0, x0, #:lo12:funcTableSym + ldrb w0, [x0, w4, uxtw #0] + adr x2, .LBB1 + add x0, x2, w0, sxth #2 + br x0 + +.LBB1: + bl funcA + b .test_exit + +.LBB2: + bl funcB + b .test_exit + +.LBB3: + bl funcC + b .test_exit + +.LBB4: + bl funcD + b .test_exit + +.test_exit: + mov x8, #93 + mov x0, #0 + svc #0 + +.global funcA +funcA: + ret + +.global funcB +funcB: + ret + +.global funcC +funcC: + ret + +.global funcD +funcD: + ret + +.section .rodata,"a",@progbits +.align 2 +funcTableSym: + .byte 0x00,0x02,0x04,0x06 // 1 - .LBB1, 3 - .LBB2 + +#--- jt_fixed_branch.s + +.text +.global _start +_start: + mov x0, x13 + mov x1, x4 + mov x0, x2 + movk x1, #0x0, lsl #48 + movk x1, #0x0, lsl #32 + movk x1, #0x0, lsl #16 + movk x1, #0x12 + stp x0, x1, [sp, #-16]!
+ adrp x0, foo + add x0, x0, #:lo12:foo + br x0 + mov x8, #93 + mov x0, #0 + svc #0 + +.global foo +.type foo,%function +foo: + mov x8, #9 + ret +.size foo,.-foo + +#--- jt_type_normal.c + +void __attribute__ ((noinline)) option0() { +} + +void __attribute__ ((noinline)) option1() { +} + +void __attribute__ ((noinline)) option2() { +} + +void __attribute__ ((noinline)) option3() { +} + +void __attribute__ ((noinline)) option4() { +} + +void __attribute__ ((noinline)) option5() { +} + +void (*jumpTable[6])() = { option0, option1, option2, option3, option4, option5 }; + +void __attribute__ ((noinline)) handleOptionJumpTable(int option) { + jumpTable[option](); +} + +int main(int argc, char *argv[]) { + handleOptionJumpTable(argc); + return 0; +} From 1d9ad99305753a11a28e61edb130d9c63859f42e Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Tue, 1 Apr 2025 18:58:50 +0800 Subject: [PATCH 0225/1029] [mlir] Use llvm::hasSingleElement (NFC) (#133881) --- mlir/lib/IR/Block.cpp | 2 +- mlir/lib/Transforms/Utils/InliningUtils.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp index 4b1568219fb37..959a60c044502 100644 --- a/mlir/lib/IR/Block.cpp +++ b/mlir/lib/IR/Block.cpp @@ -119,7 +119,7 @@ bool Block::verifyOpOrder() { if (!isOpOrderValid()) return false; // The order is valid if there are less than 2 operations. - if (operations.empty() || std::next(operations.begin()) == operations.end()) + if (operations.empty() || llvm::hasSingleElement(operations)) return false; Operation *prev = nullptr; diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp index a8bf34cdb3e02..e113389b26ae7 100644 --- a/mlir/lib/Transforms/Utils/InliningUtils.cpp +++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp @@ -330,7 +330,7 @@ inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock, bool singleBlockFastPath = interface.allowSingleBlockOptimization(newBlocks); // Handle the case where only a single block was inlined. - if (singleBlockFastPath && std::next(newBlocks.begin()) == newBlocks.end()) { + if (singleBlockFastPath && llvm::hasSingleElement(newBlocks)) { // Run the result attribute handler on the terminator operands. Operation *firstBlockTerminator = firstNewBlock->getTerminator(); builder.setInsertionPoint(firstBlockTerminator); From 13a313fe582a3c41fb5c50ca2325c0987c0af6d7 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 1 Apr 2025 12:03:21 +0100 Subject: [PATCH 0226/1029] [libclc] Move sinpi/cospi/tanpi to the CLC library (#133889) Additionally, these builtins are now vectorized. This also moves the native_recip and native_divide builtins as they are used by the tanpi builtin. 
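As background on why the move vectorizes these builtins (a minimal sketch of the libclc gentype convention, not part of this commit; the names clc_example.inc/clc_example.cl are illustrative and the expander path clc/math/gentype.inc is an assumption): each CLC builtin is written once against the generic __CLC_GENTYPE, and the expansion header then re-includes that body with __CLC_GENTYPE bound to every scalar and vector type, so one scalar algorithm yields the whole overload set.

/* clc_example.inc - one generic body, instantiated per type */
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_example(__CLC_GENTYPE x) {
  /* a real builtin computes its result here; identity is a placeholder */
  return x;
}

/* clc_example.cl - driver that expands the body for all gentypes */
#define __CLC_BODY <clc_example.inc>
#include <clc/math/gentype.inc> /* expander; exact path is an assumption */

With the sinpi/cospi/tanpi bodies expressed this way in the clc_sinpi.inc, clc_cospi.inc and clc_tanpi.inc files below, the float, double and half versions (and their vector widths) all come from the same source.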
--- libclc/CMakeLists.txt | 2 + libclc/clc/include/clc/math/clc_cospi.h | 20 ++ .../clc/include/clc/math/clc_native_divide.h | 22 +++ .../clc/include/clc/math/clc_native_recip.h | 22 +++ .../include/clc/math/clc_sincos_helpers.inc | 2 + .../clc/include/clc/math/clc_sincos_piby4.h | 14 ++ .../clc/include/clc/math/clc_sincos_piby4.inc | 174 ++++++++++++++++++ libclc/clc/include/clc/math/clc_sinpi.h | 20 ++ libclc/clc/include/clc/math/clc_tanpi.h | 20 ++ libclc/clc/lib/generic/SOURCES | 5 + libclc/clc/lib/generic/math/clc_cospi.cl | 18 ++ libclc/clc/lib/generic/math/clc_cospi.inc | 116 ++++++++++++ .../clc/lib/generic/math/clc_native_divide.cl | 14 ++ .../lib/generic/math/clc_native_divide.inc | 12 ++ .../lib/generic/math/clc_native_recip.cl} | 9 +- .../lib/generic/math/clc_native_recip.inc} | 2 +- .../lib/generic/math/clc_sincos_helpers.cl | 1 + .../lib/generic/math/clc_sincos_helpers.inc | 19 ++ libclc/clc/lib/generic/math/clc_sinpi.cl | 18 ++ libclc/clc/lib/generic/math/clc_sinpi.inc | 114 ++++++++++++ libclc/clc/lib/generic/math/clc_tanpi.cl | 19 ++ libclc/clc/lib/generic/math/clc_tanpi.inc | 132 +++++++++++++ libclc/clspv/lib/SOURCES | 1 - libclc/generic/lib/SOURCES | 1 - libclc/generic/lib/math/clc_tan.cl | 7 +- libclc/generic/lib/math/clc_tanpi.cl | 132 ------------- libclc/generic/lib/math/cospi.cl | 123 +------------ libclc/generic/lib/math/native_divide.cl | 5 +- libclc/generic/lib/math/native_recip.cl | 5 +- libclc/generic/lib/math/sincosD_piby4.h | 119 ------------ libclc/generic/lib/math/sincos_helpers.cl | 24 +-- libclc/generic/lib/math/sincos_helpers.h | 2 - libclc/generic/lib/math/sincospiF_piby4.h | 46 ----- libclc/generic/lib/math/sinpi.cl | 118 +----------- libclc/generic/lib/math/tanpi.cl | 6 +- libclc/spirv/lib/SOURCES | 1 - 36 files changed, 797 insertions(+), 568 deletions(-) create mode 100644 libclc/clc/include/clc/math/clc_cospi.h create mode 100644 libclc/clc/include/clc/math/clc_native_divide.h create mode 100644 libclc/clc/include/clc/math/clc_native_recip.h create mode 100644 libclc/clc/include/clc/math/clc_sincos_piby4.h create mode 100644 libclc/clc/include/clc/math/clc_sincos_piby4.inc create mode 100644 libclc/clc/include/clc/math/clc_sinpi.h create mode 100644 libclc/clc/include/clc/math/clc_tanpi.h create mode 100644 libclc/clc/lib/generic/math/clc_cospi.cl create mode 100644 libclc/clc/lib/generic/math/clc_cospi.inc create mode 100644 libclc/clc/lib/generic/math/clc_native_divide.cl create mode 100644 libclc/clc/lib/generic/math/clc_native_divide.inc rename libclc/{generic/lib/math/native_divide.inc => clc/lib/generic/math/clc_native_recip.cl} (74%) rename libclc/{generic/lib/math/native_recip.inc => clc/lib/generic/math/clc_native_recip.inc} (83%) create mode 100644 libclc/clc/lib/generic/math/clc_sinpi.cl create mode 100644 libclc/clc/lib/generic/math/clc_sinpi.inc create mode 100644 libclc/clc/lib/generic/math/clc_tanpi.cl create mode 100644 libclc/clc/lib/generic/math/clc_tanpi.inc delete mode 100644 libclc/generic/lib/math/clc_tanpi.cl delete mode 100644 libclc/generic/lib/math/sincosD_piby4.h delete mode 100644 libclc/generic/lib/math/sincospiF_piby4.h diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index efe7f5804e8fb..d4753b22ed01c 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -265,11 +265,13 @@ endif() set_source_files_properties( # CLC builtins ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_cos.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_divide.cl 
${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_exp2.cl ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_exp.cl ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log10.cl ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log2.cl ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_log.cl + ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_recip.cl ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_rsqrt.cl ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_sin.cl ${CMAKE_CURRENT_SOURCE_DIR}/clc/lib/generic/math/clc_native_sqrt.cl diff --git a/libclc/clc/include/clc/math/clc_cospi.h b/libclc/clc/include/clc/math/clc_cospi.h new file mode 100644 index 0000000000000..07565c23a2f07 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_cospi.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_COSPI_H__ +#define __CLC_MATH_CLC_COSPI_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_cospi + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_COSPI_H__ diff --git a/libclc/clc/include/clc/math/clc_native_divide.h b/libclc/clc/include/clc/math/clc_native_divide.h new file mode 100644 index 0000000000000..b48c3e5d03b36 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_divide.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_DIVIDE_H__ +#define __CLC_MATH_CLC_NATIVE_DIVIDE_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_divide +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_DIVIDE_H__ diff --git a/libclc/clc/include/clc/math/clc_native_recip.h b/libclc/clc/include/clc/math/clc_native_recip.h new file mode 100644 index 0000000000000..9af36b0c7ce85 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_native_recip.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_NATIVE_RECIP_H__ +#define __CLC_MATH_CLC_NATIVE_RECIP_H__ + +#define __FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_recip +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION +#undef __FLOAT_ONLY + +#endif // __CLC_MATH_CLC_NATIVE_RECIP_H__ diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers.inc b/libclc/clc/include/clc/math/clc_sincos_helpers.inc index c891dd91dfd2b..4daff92955cd7 100644 --- a/libclc/clc/include/clc/math/clc_sincos_helpers.inc +++ b/libclc/clc/include/clc/math/clc_sincos_helpers.inc @@ -10,6 +10,8 @@ _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x, __CLC_FLOATN y); _CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x, __CLC_FLOATN y); +_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x, + __CLC_INTN regn); _CLC_DECL _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_FLOATN *r, private __CLC_FLOATN *rr, diff --git a/libclc/clc/include/clc/math/clc_sincos_piby4.h b/libclc/clc/include/clc/math/clc_sincos_piby4.h new file mode 100644 index 0000000000000..50608ae24e947 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_sincos_piby4.h @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/include/clc/math/clc_sincos_piby4.inc b/libclc/clc/include/clc/math/clc_sincos_piby4.inc new file mode 100644 index 0000000000000..91ec518b70e97 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_sincos_piby4.inc @@ -0,0 +1,174 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +// Evaluate single precision sin and cos of a value in the interval [-pi/4, pi/4] +_CLC_INLINE _CLC_OVERLOAD void +__clc_sincos_piby4(__CLC_GENTYPE x, private __CLC_GENTYPE *sinval, + private __CLC_GENTYPE *cosval) { + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x.
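+  // With w = x*x, both polynomials below are evaluated in Horner form via
+  // __clc_mad: sin(x) ~= x + x*w*(sc1 + w*(sc2 + w*(sc3 + w*sc4))) and
+  // cos(x) ~= (1 - w/2) + w*w*(cc1 + w*(cc2 + w*(cc3 + w*cc4))).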
+ + const __CLC_GENTYPE sc1 = -0.166666666638608441788607926e0F; + const __CLC_GENTYPE sc2 = 0.833333187633086262120839299e-2F; + const __CLC_GENTYPE sc3 = -0.198400874359527693921333720e-3F; + const __CLC_GENTYPE sc4 = 0.272500015145584081596826911e-5F; + + const __CLC_GENTYPE cc1 = 0.41666666664325175238031e-1F; + const __CLC_GENTYPE cc2 = -0.13888887673175665567647e-2F; + const __CLC_GENTYPE cc3 = 0.24800600878112441958053e-4F; + const __CLC_GENTYPE cc4 = -0.27301013343179832472841e-6F; + + __CLC_GENTYPE x2 = x * x; + + *sinval = __clc_mad( + x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1), + x); + *cosval = __clc_mad( + x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1), + __clc_mad(x2, -0.5f, 1.0f)); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_INLINE _CLC_OVERLOAD void +__clc_sincos_piby4(__CLC_GENTYPE x, __CLC_GENTYPE xx, + private __CLC_GENTYPE *sinval, + private __CLC_GENTYPE *cosval) { + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we add a correction + // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) + // is an approximation to cos(x)*sin(xx) valid because + // xx is tiny relative to x. + + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we subtract a correction + // term g(x,xx) = x*xx to the result, where g(x,xx) + // is an approximation to sin(x)*sin(xx) valid because + // xx is tiny relative to x. 
+ + const __CLC_GENTYPE sc1 = -0.166666666666666646259241729; + const __CLC_GENTYPE sc2 = 0.833333333333095043065222816e-2; + const __CLC_GENTYPE sc3 = -0.19841269836761125688538679e-3; + const __CLC_GENTYPE sc4 = 0.275573161037288022676895908448e-5; + const __CLC_GENTYPE sc5 = -0.25051132068021699772257377197e-7; + const __CLC_GENTYPE sc6 = 0.159181443044859136852668200e-9; + + const __CLC_GENTYPE cc1 = 0.41666666666666665390037e-1; + const __CLC_GENTYPE cc2 = -0.13888888888887398280412e-2; + const __CLC_GENTYPE cc3 = 0.248015872987670414957399e-4; + const __CLC_GENTYPE cc4 = -0.275573172723441909470836e-6; + const __CLC_GENTYPE cc5 = 0.208761463822329611076335e-8; + const __CLC_GENTYPE cc6 = -0.113826398067944859590880e-10; + + __CLC_GENTYPE x2 = x * x; + __CLC_GENTYPE x3 = x2 * x; + __CLC_GENTYPE r = (__CLC_GENTYPE)0.5 * x2; + __CLC_GENTYPE t = (__CLC_GENTYPE)1.0 - r; + + __CLC_GENTYPE sp = __clc_fma( + __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); + + __CLC_GENTYPE cp = + t + + __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5), + x2, cc4), + x2, cc3), + x2, cc2), + x2, cc1), + x2 * x2, __clc_fma(x, xx, (1.0 - t) - r)); + + *sinval = + x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx)); + *cosval = cp; +} + +_CLC_INLINE _CLC_OVERLOAD void __clc_tan_piby4(__CLC_GENTYPE x, + __CLC_GENTYPE xx, + private __CLC_GENTYPE *leadval, + private __CLC_GENTYPE *tailval) { + // 0x3fe921fb54442d18 + const __CLC_GENTYPE piby4_lead = 7.85398163397448278999e-01; + // 0x3c81a62633145c06 + const __CLC_GENTYPE piby4_tail = 3.06161699786838240164e-17; + + // In order to maintain relative precision transform using the identity: + // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4. + // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. + + __CLC_LONGN ca = x > 0.68; + __CLC_LONGN cb = x < -0.68; + __CLC_GENTYPE transform = ca ? 1.0 : 0.0; + transform = cb ? -1.0 : transform; + + __CLC_GENTYPE tx = __clc_fma(-transform, x, piby4_lead) + + __clc_fma(-transform, xx, piby4_tail); + __CLC_LONGN c = ca | cb; + x = c ? tx : x; + xx = c ? 0.0 : xx; + + // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. + __CLC_GENTYPE t1 = x; + __CLC_GENTYPE r = __clc_fma(2.0, x * xx, x * x); + + __CLC_GENTYPE a = __clc_fma(r, + __clc_fma(r, 0.224044448537022097264602535574e-3, + -0.229345080057565662883358588111e-1), + 0.372379159759792203640806338901e0); + + __CLC_GENTYPE b = + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, -0.232371494088563558304549252913e-3, + 0.260656620398645407524064091208e-1), + -0.515658515729031149329237816945e0), + 0.111713747927937668539901657944e1); + + __CLC_GENTYPE t2 = __clc_fma(MATH_DIVIDE(a, b), x * r, xx); + + __CLC_GENTYPE tp = t1 + t2; + + // Compute -1.0/(t1 + t2) accurately + __CLC_GENTYPE z1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(tp) & 0xffffffff00000000L); + __CLC_GENTYPE z2 = t2 - (z1 - t1); + __CLC_GENTYPE trec = -MATH_RECIP(tp); + __CLC_GENTYPE trec_top = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(trec) & 0xffffffff00000000L); + + __CLC_GENTYPE tpr = __clc_fma( + __clc_fma(trec_top, z2, __clc_fma(trec_top, z1, 1.0)), trec, trec_top); + + __CLC_GENTYPE tpt = transform * (1.0 - MATH_DIVIDE(2.0 * tp, 1.0 + tp)); + __CLC_GENTYPE tptr = transform * (MATH_DIVIDE(2.0 * tp, tp - 1.0) - 1.0); + + *leadval = c ? tpt : tp; + *tailval = c ? 
tptr : tpr; +} + +#endif diff --git a/libclc/clc/include/clc/math/clc_sinpi.h b/libclc/clc/include/clc/math/clc_sinpi.h new file mode 100644 index 0000000000000..46fec465ceb03 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_sinpi.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_SINPI_H__ +#define __CLC_MATH_CLC_SINPI_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_sinpi + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_SINPI_H__ diff --git a/libclc/clc/include/clc/math/clc_tanpi.h b/libclc/clc/include/clc/math/clc_tanpi.h new file mode 100644 index 0000000000000..0b8efce27dee8 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_tanpi.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_TANPI_H__ +#define __CLC_MATH_CLC_TANPI_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_tanpi + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_TANPI_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index c31963c59e950..474b11d745a44 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -30,6 +30,7 @@ math/clc_atanh.cl math/clc_atanpi.cl math/clc_ceil.cl math/clc_copysign.cl +math/clc_cospi.cl math/clc_ep_log.cl math/clc_fabs.cl math/clc_fma.cl @@ -46,12 +47,14 @@ math/clc_mad.cl math/clc_modf.cl math/clc_nan.cl math/clc_native_cos.cl +math/clc_native_divide.cl math/clc_native_exp.cl math/clc_native_exp2.cl math/clc_native_log.cl math/clc_native_log10.cl math/clc_native_log2.cl math/clc_native_rsqrt.cl +math/clc_native_recip.cl math/clc_native_sin.cl math/clc_native_sqrt.cl math/clc_nextafter.cl @@ -65,9 +68,11 @@ math/clc_rootn.cl math/clc_round.cl math/clc_rsqrt.cl math/clc_sincos_helpers.cl +math/clc_sinpi.cl math/clc_sqrt.cl math/clc_sw_fma.cl math/clc_tables.cl +math/clc_tanpi.cl math/clc_trunc.cl relational/clc_all.cl relational/clc_any.cl diff --git a/libclc/clc/lib/generic/math/clc_cospi.cl b/libclc/clc/lib/generic/math/clc_cospi.cl new file mode 100644 index 0000000000000..07e1b49cc9e02 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_cospi.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_cospi.inc b/libclc/clc/lib/generic/math/clc_cospi.inc new file mode 100644 index 0000000000000..b037f82872dde --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_cospi.inc @@ -0,0 +1,116 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) { + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_INTN ix = __CLC_AS_INTN(absx); + __CLC_INTN iax = __CLC_CONVERT_INTN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_INTN xodd = (iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0; + + // Initialize with return for +-Inf and NaN + __CLC_INTN ir = QNANBITPATT_SP32; + + // 2^24 <= |x| < Inf, the result is always even integer + ir = ix < PINFBITPATT_SP32 ? 0x3f800000 : ir; + + // 2^23 <= |x| < 2^24, the result is always integer + ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir; + + // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0f - r; + __CLC_INTN e = 1; + __CLC_INTN s = xodd ^ (__CLC_INTN)0x80000000; + + // r <= 0.75 + __CLC_INTN c = r <= 0.75f; + a = c ? r - 0.5f : a; + e = c ? 0 : e; + + // r < 0.5 + c = r < 0.5f; + a = c ? 0.5f - r : a; + s = c ? xodd : s; + + // r <= 0.25 + c = r <= 0.25f; + a = c ? r : a; + e = c ? 1 : e; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval); + __CLC_INTN jr = s ^ __CLC_AS_INTN(e != 0 ? cosval : sinval); + + ir = ix < 0x4b000000 ? jr : ir; + + return __CLC_AS_GENTYPE(ir); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) { + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_LONGN ix = __CLC_AS_LONGN(absx); + __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_LONGN xodd = + (iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L; + + // Initialize with return for +-Inf and NaN + __CLC_LONGN ir = QNANBITPATT_DP64; + + // 2^53 <= |x| < Inf, the result is always even integer + ir = ix < PINFBITPATT_DP64 ? 0x3ff0000000000000L : ir; + + // 2^52 <= |x| < 2^53, the result is always integer + ir = absx < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir; + + // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0 - r; + __CLC_LONGN e = 1; + __CLC_LONGN s = xodd ^ (__CLC_LONGN)0x8000000000000000L; + + // r <= 0.75 + __CLC_LONGN c = r <= 0.75; + __CLC_GENTYPE t = r - 0.5; + a = c ? t : a; + e = c ? 0 : e; + + // r < 0.5 + c = r < 0.5; + t = 0.5 - r; + a = c ? t : a; + s = c ? xodd : s; + + // r <= 0.25 + c = r <= 0.25; + a = c ? r : a; + e = c ? 1 : e; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(a * M_PI, 0.0, &sinval, &cosval); + __CLC_LONGN jr = s ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval); + + ir = absx < 0x1.0p+52 ? 
jr : ir; + + return __CLC_AS_GENTYPE(ir); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cospi(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_cospi(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_native_divide.cl b/libclc/clc/lib/generic/math/clc_native_divide.cl new file mode 100644 index 0000000000000..005089b1ba15d --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_divide.cl @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __FLOAT_ONLY +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_native_divide.inc b/libclc/clc/lib/generic/math/clc_native_divide.inc new file mode 100644 index 0000000000000..fdf1794812c5a --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_native_divide.inc @@ -0,0 +1,12 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_divide(__CLC_GENTYPE x, + __CLC_GENTYPE y) { + return x / y; +} diff --git a/libclc/generic/lib/math/native_divide.inc b/libclc/clc/lib/generic/math/clc_native_recip.cl similarity index 74% rename from libclc/generic/lib/math/native_divide.inc rename to libclc/clc/lib/generic/math/clc_native_recip.cl index b0c83d503b965..4377f10b1543f 100644 --- a/libclc/generic/lib/math/native_divide.inc +++ b/libclc/clc/lib/generic/math/clc_native_recip.cl @@ -6,6 +6,9 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_divide(__CLC_GENTYPE x, __CLC_GENTYPE y) { - return x / y; -} +#include + +#define __FLOAT_ONLY +#define __CLC_BODY + +#include diff --git a/libclc/generic/lib/math/native_recip.inc b/libclc/clc/lib/generic/math/clc_native_recip.inc similarity index 83% rename from libclc/generic/lib/math/native_recip.inc rename to libclc/clc/lib/generic/math/clc_native_recip.inc index d6652fc2d2c69..57eb35a9522f8 100644 --- a/libclc/generic/lib/math/native_recip.inc +++ b/libclc/clc/lib/generic/math/clc_native_recip.inc @@ -6,6 +6,6 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE native_recip(__CLC_GENTYPE val) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_recip(__CLC_GENTYPE val) { return 1.0f / val; } diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl index d8a7b10d8e868..24676d3c7711c 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc index d9d2b81226b72..516a40c4672a9 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc +++ 
b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc @@ -74,6 +74,25 @@ _CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_cosf_piby4(__CLC_FLOATN x, return ret; } +_CLC_DECL _CLC_OVERLOAD __CLC_FLOATN __clc_tanf_piby4(__CLC_FLOATN x, + __CLC_INTN regn) { + // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]. + __CLC_FLOATN r = x * x; + + __CLC_FLOATN a = + __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f); + + __CLC_FLOATN b = __clc_mad( + r, + __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f), + 1.15588821434688393452299f); + + __CLC_FLOATN t = __clc_mad(x * r, __clc_native_divide(a, b), x); + __CLC_FLOATN tr = -MATH_RECIP(t); + + return regn & 1 ? tr : t; +} + _CLC_DEF _CLC_OVERLOAD void __clc_fullMulS(private __CLC_FLOATN *hi, private __CLC_FLOATN *lo, __CLC_FLOATN a, __CLC_FLOATN b, diff --git a/libclc/clc/lib/generic/math/clc_sinpi.cl b/libclc/clc/lib/generic/math/clc_sinpi.cl new file mode 100644 index 0000000000000..6cff247707845 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sinpi.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_sinpi.inc b/libclc/clc/lib/generic/math/clc_sinpi.inc new file mode 100644 index 0000000000000..264609aeaca45 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sinpi.inc @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) { + __CLC_INTN ix = __CLC_AS_INTN(x); + __CLC_INTN xsgn = ix & (__CLC_INTN)0x80000000; + ix ^= xsgn; + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_INTN iax = __CLC_CONVERT_INTN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_INTN xodd = + xsgn ^ ((iax & 0x1) != 0 ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0); + + // Initialize with return for +-Inf and NaN + __CLC_INTN ir = QNANBITPATT_SP32; + + // 2^23 <= |x| < Inf, the result is always integer + ir = ix < PINFBITPATT_SP32 ? xsgn : ir; + + // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0f - r; + __CLC_INTN e = 0; + + // r <= 0.75 + __CLC_INTN c = r <= 0.75f; + a = c ? r - 0.5f : a; + e = c ? 1 : e; + + // r < 0.5 + c = r < 0.5f; + a = c ? 0.5f - r : a; + + // 0 < r <= 0.25 + c = r <= 0.25f; + a = c ? r : a; + e = c ? 0 : e; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(a * M_PI_F, &sinval, &cosval); + __CLC_INTN jr = xodd ^ __CLC_AS_INTN(e != 0 ? cosval : sinval); + + ir = ix < 0x4b000000 ? 
jr : ir; + + return __CLC_AS_GENTYPE(ir); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) { + __CLC_LONGN ix = __CLC_AS_LONGN(x); + __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L; + ix ^= xsgn; + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_LONGN xodd = + xsgn ^ + ((iax & 0x1L) != 0 ? (__CLC_LONGN)0x8000000000000000L : (__CLC_LONGN)0L); + + // Initialize with return for +-Inf and NaN + __CLC_LONGN ir = QNANBITPATT_DP64; + + // 2^52 <= |x| < Inf, the result is always integer + ir = ix < PINFBITPATT_DP64 ? xsgn : ir; + + // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0 - r; + __CLC_LONGN e = 0; + + // r <= 0.75 + __CLC_LONGN c = r <= 0.75; + __CLC_GENTYPE t = r - 0.5; + a = c ? t : a; + e = c ? 1 : e; + + // r < 0.5 + c = r < 0.5; + t = 0.5 - r; + a = c ? t : a; + + // r <= 0.25 + c = r <= 0.25; + a = c ? r : a; + e = c ? 0 : e; + + __CLC_GENTYPE api = a * M_PI; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(api, 0.0, &sinval, &cosval); + __CLC_LONGN jr = xodd ^ __CLC_AS_LONGN(e != 0 ? cosval : sinval); + + ir = absx < 0x1.0p+52 ? jr : ir; + + return __CLC_AS_GENTYPE(ir); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinpi(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_sinpi(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_tanpi.cl b/libclc/clc/lib/generic/math/clc_tanpi.cl new file mode 100644 index 0000000000000..f1265892d107b --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_tanpi.cl @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_tanpi.inc b/libclc/clc/lib/generic/math/clc_tanpi.inc new file mode 100644 index 0000000000000..3a2f5dcf7b1ee --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_tanpi.inc @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) { + __CLC_INTN ix = __CLC_AS_INTN(x); + __CLC_INTN xsgn = ix & (__CLC_INTN)SIGNBIT_SP32; + __CLC_INTN xnsgn = xsgn ^ (__CLC_INTN)SIGNBIT_SP32; + ix ^= xsgn; + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_INTN iax = __CLC_CONVERT_INTN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_INTN xodd = xsgn ^ __CLC_AS_INTN((iax & 0x1) != 0 ? SIGNBIT_SP32 : 0); + + // Initialize with return for +-Inf and NaN + __CLC_INTN ir = QNANBITPATT_SP32; + + // 2^24 <= |x| < Inf, the result is always even integer + ir = ix < PINFBITPATT_SP32 ?
xsgn : ir; + + // 2^23 <= |x| < 2^24, the result is always integer + ir = ix < 0x4b800000 ? xodd : ir; + + // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0f - r; + __CLC_INTN e = 0; + __CLC_INTN s = xnsgn; + + // r <= 0.75 + __CLC_INTN c = r <= 0.75f; + a = c ? r - 0.5f : a; + e = c ? 1 : e; + s = c ? xsgn : s; + + // r < 0.5 + c = r < 0.5f; + a = c ? 0.5f - r : a; + s = c ? xnsgn : s; + + // 0 < r <= 0.25 + c = r <= 0.25f; + a = c ? r : a; + e = c ? 0 : e; + s = c ? xsgn : s; + + __CLC_GENTYPE t = __clc_tanf_piby4(a * M_PI_F, 0); + __CLC_GENTYPE tr = -__clc_native_recip(t); + __CLC_INTN jr = s ^ __CLC_AS_INTN(e != 0 ? tr : t); + + jr = r == 0.5f ? xodd | 0x7f800000 : jr; + + ir = ix < 0x4b000000 ? jr : ir; + + return __CLC_AS_GENTYPE(ir); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) { + __CLC_LONGN ix = __CLC_AS_LONGN(x); + __CLC_LONGN xsgn = ix & (__CLC_LONGN)0x8000000000000000L; + __CLC_LONGN xnsgn = xsgn ^ (__CLC_LONGN)0x8000000000000000L; + ix ^= xsgn; + __CLC_GENTYPE absx = __clc_fabs(x); + __CLC_LONGN iax = __CLC_CONVERT_LONGN(absx); + __CLC_GENTYPE r = absx - __CLC_CONVERT_GENTYPE(iax); + __CLC_LONGN xodd = + xsgn ^ __CLC_AS_LONGN((iax & 0x1) != 0 ? 0x8000000000000000L : 0L); + + // Initialize with return for +-Inf and NaN + __CLC_LONGN ir = QNANBITPATT_DP64; + + // 2^53 <= |x| < Inf, the result is always even integer + ir = ix < PINFBITPATT_DP64 ? xsgn : ir; + + // 2^52 <= |x| < 2^53, the result is always integer + ir = ix < 0x4340000000000000L ? xodd : ir; + + // 0x1.0p-14 <= |x| < 2^53, result depends on which 0.25 interval + + // r < 1.0 + __CLC_GENTYPE a = 1.0 - r; + __CLC_LONGN e = 0; + __CLC_LONGN s = xnsgn; + + // r <= 0.75 + __CLC_LONGN c = r <= 0.75; + __CLC_GENTYPE t = r - 0.5; + a = c ? t : a; + e = c ? 1 : e; + s = c ? xsgn : s; + + // r < 0.5 + c = r < 0.5; + t = 0.5 - r; + a = c ? t : a; + s = c ? xnsgn : s; + + // r <= 0.25 + c = r <= 0.25; + a = c ? r : a; + e = c ? 0 : e; + s = c ? xsgn : s; + + __CLC_GENTYPE api = a * M_PI; + __CLC_GENTYPE lo, hi; + __clc_tan_piby4(api, 0.0, &lo, &hi); + __CLC_LONGN jr = s ^ __CLC_AS_LONGN(e != 0 ? hi : lo); + + __CLC_LONGN si = xodd | 0x7ff0000000000000L; + jr = r == 0.5 ? si : jr; + + ir = ix < 0x4330000000000000L ? 
jr : ir; + + return __CLC_AS_GENTYPE(ir); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_tanpi(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_tanpi(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clspv/lib/SOURCES b/libclc/clspv/lib/SOURCES index 1c86fb0fbc8e8..0ef09d275243b 100644 --- a/libclc/clspv/lib/SOURCES +++ b/libclc/clspv/lib/SOURCES @@ -18,7 +18,6 @@ subnormal_config.cl ../../generic/lib/math/cbrt.cl ../../generic/lib/math/clc_exp10.cl ../../generic/lib/math/clc_tan.cl -../../generic/lib/math/clc_tanpi.cl ../../generic/lib/math/cos.cl ../../generic/lib/math/cosh.cl ../../generic/lib/math/cospi.cl diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index a93444af0c954..9b5bbc5d9b53c 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -174,7 +174,6 @@ math/sqrt.cl math/clc_tan.cl math/tan.cl math/tanh.cl -math/clc_tanpi.cl math/tanpi.cl math/tgamma.cl math/trunc.cl diff --git a/libclc/generic/lib/math/clc_tan.cl b/libclc/generic/lib/math/clc_tan.cl index eb02879339307..7e28e9ffed3b6 100644 --- a/libclc/generic/lib/math/clc_tan.cl +++ b/libclc/generic/lib/math/clc_tan.cl @@ -35,7 +35,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_tan(float x) { _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_tan, float); #ifdef cl_khr_fp64 -#include "sincosD_piby4.h" +#include _CLC_DEF _CLC_OVERLOAD double __clc_tan(double x) { double y = __clc_fabs(x); @@ -48,9 +48,10 @@ _CLC_DEF _CLC_OVERLOAD double __clc_tan(double x) { else __clc_remainder_piby2_large(y, &r, &rr, ®n); - double2 tt = __clc_tan_piby4(r, rr); + double lead, tail; + __clc_tan_piby4(r, rr, &lead, &tail); - int2 t = as_int2(regn & 1 ? tt.y : tt.x); + int2 t = as_int2(regn & 1 ? tail : lead); t.hi ^= (x < 0.0) << 31; return __clc_isnan(x) || __clc_isinf(x) ? as_double(QNANBITPATT_DP64) diff --git a/libclc/generic/lib/math/clc_tanpi.cl b/libclc/generic/lib/math/clc_tanpi.cl deleted file mode 100644 index 533db5e4d1877..0000000000000 --- a/libclc/generic/lib/math/clc_tanpi.cl +++ /dev/null @@ -1,132 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "sincos_helpers.h" -#include -#include -#include -#include - -_CLC_DEF _CLC_OVERLOAD float __clc_tanpi(float x) -{ - int ix = as_int(x); - int xsgn = ix & 0x80000000; - int xnsgn = xsgn ^ 0x80000000; - ix ^= xsgn; - float ax = as_float(ix); - int iax = (int)ax; - float r = ax - iax; - int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0); - - // Initialize with return for +-Inf and NaN - int ir = 0x7fc00000; - - // 2^24 <= |x| < Inf, the result is always even integer - ir = ix < 0x7f800000 ? xsgn : ir; - - // 2^23 <= |x| < 2^24, the result is always integer - ir = ix < 0x4b800000 ? xodd : ir; - - // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval - - // r < 1.0 - float a = 1.0f - r; - int e = 0; - int s = xnsgn; - - // r <= 0.75 - int c = r <= 0.75f; - a = c ? r - 0.5f : a; - e = c ? 1 : e; - s = c ? xsgn : s; - - // r < 0.5 - c = r < 0.5f; - a = c ? 0.5f - r : a; - s = c ? xnsgn : s; - - // 0 < r <= 0.25 - c = r <= 0.25f; - a = c ? r : a; - e = c ? 0 : e; - s = c ? 
xsgn : s; - - float t = __clc_tanf_piby4(a * M_PI_F, 0); - float tr = -native_recip(t); - int jr = s ^ as_int(e ? tr : t); - - jr = r == 0.5f ? xodd | 0x7f800000 : jr; - - ir = ix < 0x4b000000 ? jr : ir; - - return as_float(ir); -} -_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_tanpi, float); - -#ifdef cl_khr_fp64 -#include "sincosD_piby4.h" - -_CLC_DEF _CLC_OVERLOAD double __clc_tanpi(double x) -{ - long ix = as_long(x); - long xsgn = ix & 0x8000000000000000L; - long xnsgn = xsgn ^ 0x8000000000000000L; - ix ^= xsgn; - double ax = as_double(ix); - long iax = (long)ax; - double r = ax - iax; - long xodd = xsgn ^ (iax & 0x1 ? 0x8000000000000000L : 0L); - - // Initialize with return for +-Inf and NaN - long ir = 0x7ff8000000000000L; - - // 2^53 <= |x| < Inf, the result is always even integer - ir = ix < 0x7ff0000000000000L ? xsgn : ir; - - // 2^52 <= |x| < 2^53, the result is always integer - ir = ix < 0x4340000000000000L ? xodd : ir; - - // 0x1.0p-14 <= |x| < 2^53, result depends on which 0.25 interval - - // r < 1.0 - double a = 1.0 - r; - int e = 0; - long s = xnsgn; - - // r <= 0.75 - int c = r <= 0.75; - double t = r - 0.5; - a = c ? t : a; - e = c ? 1 : e; - s = c ? xsgn : s; - - // r < 0.5 - c = r < 0.5; - t = 0.5 - r; - a = c ? t : a; - s = c ? xnsgn : s; - - // r <= 0.25 - c = r <= 0.25; - a = c ? r : a; - e = c ? 0 : e; - s = c ? xsgn : s; - - double api = a * M_PI; - double2 tt = __clc_tan_piby4(api, 0.0); - long jr = s ^ as_long(e ? tt.hi : tt.lo); - - long si = xodd | 0x7ff0000000000000L; - jr = r == 0.5 ? si : jr; - - ir = ix < 0x4330000000000000L ? jr : ir; - - return as_double(ir); -} -_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_tanpi, double); -#endif diff --git a/libclc/generic/lib/math/cospi.cl b/libclc/generic/lib/math/cospi.cl index c8d4148de700f..f78935664c65b 100644 --- a/libclc/generic/lib/math/cospi.cl +++ b/libclc/generic/lib/math/cospi.cl @@ -7,124 +7,9 @@ //===----------------------------------------------------------------------===// #include -#include -#include +#include -#include "sincos_helpers.h" -#include "sincospiF_piby4.h" -#ifdef cl_khr_fp64 -#include "sincosD_piby4.h" -#endif +#define FUNCTION cospi +#define __CLC_BODY -_CLC_OVERLOAD _CLC_DEF float cospi(float x) -{ - int ix = as_int(x) & 0x7fffffff; - float ax = as_float(ix); - int iax = (int)ax; - float r = ax - iax; - int xodd = iax & 0x1 ? 0x80000000 : 0; - - // Initialize with return for +-Inf and NaN - int ir = 0x7fc00000; - - // 2^24 <= |x| < Inf, the result is always even integer - ir = ix < 0x7f800000 ? 0x3f800000 : ir; - - // 2^23 <= |x| < 2^24, the result is always integer - ir = ix < 0x4b800000 ? xodd | 0x3f800000 : ir; - - // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval - - // r < 1.0 - float a = 1.0f - r; - int e = 1; - int s = xodd ^ 0x80000000; - - // r <= 0.75 - int c = r <= 0.75f; - a = c ? r - 0.5f : a; - e = c ? 0 : e; - - // r < 0.5 - c = r < 0.5f; - a = c ? 0.5f - r : a; - s = c ? xodd : s; - - // r <= 0.25 - c = r <= 0.25f; - a = c ? r : a; - e = c ? 1 : e; - - float2 t = __libclc__sincosf_piby4(a * M_PI_F); - int jr = s ^ as_int(e ? t.hi : t.lo); - - ir = ix < 0x4b000000 ? 
jr : ir; - - return as_float(ir); -} - - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cospi, float); - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double cospi(double x) { - - long ix = as_long(x) & 0x7fffffffffffffffL; - double ax = as_double(ix); - long iax = (long)ax; - double r = ax - (double)iax; - long xodd = iax & 0x1L ? 0x8000000000000000L : 0L; - - // Initialize with return for +-Inf and NaN - long ir = 0x7ff8000000000000L; - - // 2^53 <= |x| < Inf, the result is always even integer - ir = ix < 0x7ff0000000000000 ? 0x3ff0000000000000L : ir; - - // 2^52 <= |x| < 2^53, the result is always integer - ir = ax < 0x1.0p+53 ? xodd | 0x3ff0000000000000L : ir; - - // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval - - // r < 1.0 - double a = 1.0 - r; - int e = 1; - long s = xodd ^ 0x8000000000000000L; - - // r <= 0.75 - int c = r <= 0.75; - double t = r - 0.5; - a = c ? t : a; - e = c ? 0 : e; - - // r < 0.5 - c = r < 0.5; - t = 0.5 - r; - a = c ? t : a; - s = c ? xodd : s; - - // r <= 0.25 - c = r <= 0.25; - a = c ? r : a; - e = c ? 1 : e; - - double2 sc = __libclc__sincos_piby4(a * M_PI, 0.0); - long jr = s ^ as_long(e ? sc.hi : sc.lo); - - ir = ax < 0x1.0p+52 ? jr : ir; - - return as_double(ir); -} -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cospi, double); -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN_FP16(cospi) - -#endif +#include diff --git a/libclc/generic/lib/math/native_divide.cl b/libclc/generic/lib/math/native_divide.cl index a1e9008a90c94..a4d9b830b5519 100644 --- a/libclc/generic/lib/math/native_divide.cl +++ b/libclc/generic/lib/math/native_divide.cl @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_divide +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/native_recip.cl b/libclc/generic/lib/math/native_recip.cl index b43248e6aeae6..3c844495046f1 100644 --- a/libclc/generic/lib/math/native_recip.cl +++ b/libclc/generic/lib/math/native_recip.cl @@ -7,7 +7,10 @@ //===----------------------------------------------------------------------===// #include +#include -#define __CLC_BODY #define __FLOAT_ONLY +#define FUNCTION native_recip +#define __CLC_BODY + #include diff --git a/libclc/generic/lib/math/sincosD_piby4.h b/libclc/generic/lib/math/sincosD_piby4.h deleted file mode 100644 index cce3d1554583f..0000000000000 --- a/libclc/generic/lib/math/sincosD_piby4.h +++ /dev/null @@ -1,119 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_INLINE double2 -__libclc__sincos_piby4(double x, double xx) -{ - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... - // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. 
- // If xx (the tail of x) is non-zero, we add a correction - // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) - // is an approximation to cos(x)*sin(xx) valid because - // xx is tiny relative to x. - - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. - // If xx (the tail of x) is non-zero, we subtract a correction - // term g(x,xx) = x*xx to the result, where g(x,xx) - // is an approximation to sin(x)*sin(xx) valid because - // xx is tiny relative to x. - - const double sc1 = -0.166666666666666646259241729; - const double sc2 = 0.833333333333095043065222816e-2; - const double sc3 = -0.19841269836761125688538679e-3; - const double sc4 = 0.275573161037288022676895908448e-5; - const double sc5 = -0.25051132068021699772257377197e-7; - const double sc6 = 0.159181443044859136852668200e-9; - - const double cc1 = 0.41666666666666665390037e-1; - const double cc2 = -0.13888888888887398280412e-2; - const double cc3 = 0.248015872987670414957399e-4; - const double cc4 = -0.275573172723441909470836e-6; - const double cc5 = 0.208761463822329611076335e-8; - const double cc6 = -0.113826398067944859590880e-10; - - double x2 = x * x; - double x3 = x2 * x; - double r = 0.5 * x2; - double t = 1.0 - r; - - double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); - - double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1), - x2*x2, fma(x, xx, (1.0 - t) - r)); - - double2 ret; - ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx)); - ret.hi = cp; - - return ret; -} - -_CLC_INLINE double2 -__clc_tan_piby4(double x, double xx) -{ - const double piby4_lead = 7.85398163397448278999e-01; // 0x3fe921fb54442d18 - const double piby4_tail = 3.06161699786838240164e-17; // 0x3c81a62633145c06 - - // In order to maintain relative precision transform using the identity: - // tan(pi/4-x) = (1-tan(x))/(1+tan(x)) for arguments close to pi/4. - // Similarly use tan(x-pi/4) = (tan(x)-1)/(tan(x)+1) close to -pi/4. - - int ca = x > 0.68; - int cb = x < -0.68; - double transform = ca ? 1.0 : 0.0; - transform = cb ? -1.0 : transform; - - double tx = fma(-transform, x, piby4_lead) + fma(-transform, xx, piby4_tail); - int c = ca | cb; - x = c ? tx : x; - xx = c ? 0.0 : xx; - - // Core Remez [2,3] approximation to tan(x+xx) on the interval [0,0.68]. - double t1 = x; - double r = fma(2.0, x*xx, x*x); - - double a = fma(r, - fma(r, 0.224044448537022097264602535574e-3, -0.229345080057565662883358588111e-1), - 0.372379159759792203640806338901e0); - - double b = fma(r, - fma(r, - fma(r, -0.232371494088563558304549252913e-3, 0.260656620398645407524064091208e-1), - -0.515658515729031149329237816945e0), - 0.111713747927937668539901657944e1); - - double t2 = fma(MATH_DIVIDE(a, b), x*r, xx); - - double tp = t1 + t2; - - // Compute -1.0/(t1 + t2) accurately - double z1 = as_double(as_long(tp) & 0xffffffff00000000L); - double z2 = t2 - (z1 - t1); - double trec = -MATH_RECIP(tp); - double trec_top = as_double(as_long(trec) & 0xffffffff00000000L); - - double tpr = fma(fma(trec_top, z2, fma(trec_top, z1, 1.0)), trec, trec_top); - - double tpt = transform * (1.0 - MATH_DIVIDE(2.0*tp, 1.0 + tp)); - double tptr = transform * (MATH_DIVIDE(2.0*tp, tp - 1.0) - 1.0); - - double2 ret; - ret.lo = c ? tpt : tp; - ret.hi = c ? 
tptr : tpr; - return ret; -} diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl index 32ab5af4ca90c..651cd11ccf016 100644 --- a/libclc/generic/lib/math/sincos_helpers.cl +++ b/libclc/generic/lib/math/sincos_helpers.cl @@ -17,31 +17,13 @@ #include #include -#define bytealign(src0, src1, src2) \ - ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8))) - -_CLC_DEF float __clc_tanf_piby4(float x, int regn) { - // Core Remez [1,2] approximation to tan(x) on the interval [0,pi/4]. - float r = x * x; - - float a = - __clc_mad(r, -0.0172032480471481694693109f, 0.385296071263995406715129f); - - float b = __clc_mad( - r, - __clc_mad(r, 0.01844239256901656082986661f, -0.51396505478854532132342f), - 1.15588821434688393452299f); - - float t = __clc_mad(x * r, native_divide(a, b), x); - float tr = -MATH_RECIP(t); - - return regn & 1 ? tr : t; -} - #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable +#define bytealign(src0, src1, src2) \ + ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8))) + // Reduction for medium sized arguments _CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r, private double *rr, diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h index c94784081cd64..11cb93f34850d 100644 --- a/libclc/generic/lib/math/sincos_helpers.h +++ b/libclc/generic/lib/math/sincos_helpers.h @@ -9,8 +9,6 @@ #include #include -_CLC_DECL float __clc_tanf_piby4(float x, int y); - #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable diff --git a/libclc/generic/lib/math/sincospiF_piby4.h b/libclc/generic/lib/math/sincospiF_piby4.h deleted file mode 100644 index 66596395fdd1e..0000000000000 --- a/libclc/generic/lib/math/sincospiF_piby4.h +++ /dev/null @@ -1,46 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4] -_CLC_INLINE float2 __libclc__sincosf_piby4(float x) { - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... - // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. - - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. 
- - const float sc1 = -0.166666666638608441788607926e0F; - const float sc2 = 0.833333187633086262120839299e-2F; - const float sc3 = -0.198400874359527693921333720e-3F; - const float sc4 = 0.272500015145584081596826911e-5F; - - const float cc1 = 0.41666666664325175238031e-1F; - const float cc2 = -0.13888887673175665567647e-2F; - const float cc3 = 0.24800600878112441958053e-4F; - const float cc4 = -0.27301013343179832472841e-6F; - - float x2 = x * x; - - float2 ret; - ret.x = __clc_mad( - x * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, sc4, sc3), sc2), sc1), - x); - ret.y = __clc_mad( - x2 * x2, __clc_mad(x2, __clc_mad(x2, __clc_mad(x2, cc4, cc3), cc2), cc1), - __clc_mad(x2, -0.5f, 1.0f)); - return ret; -} diff --git a/libclc/generic/lib/math/sinpi.cl b/libclc/generic/lib/math/sinpi.cl index 1261b7f9c5277..223e7db142117 100644 --- a/libclc/generic/lib/math/sinpi.cl +++ b/libclc/generic/lib/math/sinpi.cl @@ -7,119 +7,9 @@ //===----------------------------------------------------------------------===// #include -#include -#include +#include -#include "sincospiF_piby4.h" -#ifdef cl_khr_fp64 -#include "sincosD_piby4.h" -#endif +#define FUNCTION sinpi +#define __CLC_BODY -_CLC_OVERLOAD _CLC_DEF float sinpi(float x) -{ - int ix = as_int(x); - int xsgn = ix & 0x80000000; - ix ^= xsgn; - float ax = as_float(ix); - int iax = (int)ax; - float r = ax - iax; - int xodd = xsgn ^ (iax & 0x1 ? 0x80000000 : 0); - - // Initialize with return for +-Inf and NaN - int ir = 0x7fc00000; - - // 2^23 <= |x| < Inf, the result is always integer - ir = ix < 0x7f800000 ? xsgn : ir; - - // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval - - // r < 1.0 - float a = 1.0f - r; - int e = 0; - - // r <= 0.75 - int c = r <= 0.75f; - a = c ? r - 0.5f : a; - e = c ? 1 : e; - - // r < 0.5 - c = r < 0.5f; - a = c ? 0.5f - r : a; - - // 0 < r <= 0.25 - c = r <= 0.25f; - a = c ? r : a; - e = c ? 0 : e; - - float2 t = __libclc__sincosf_piby4(a * M_PI_F); - int jr = xodd ^ as_int(e ? t.hi : t.lo); - - ir = ix < 0x4b000000 ? jr : ir; - - return as_float(ir); -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sinpi, float); - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double sinpi(double x) -{ - long ix = as_long(x); - long xsgn = ix & 0x8000000000000000L; - ix ^= xsgn; - double ax = as_double(ix); - long iax = (long)ax; - double r = ax - (double)iax; - long xodd = xsgn ^ (iax & 0x1L ? 0x8000000000000000L : 0L); - - // Initialize with return for +-Inf and NaN - long ir = 0x7ff8000000000000L; - - // 2^23 <= |x| < Inf, the result is always integer - ir = ix < 0x7ff0000000000000 ? xsgn : ir; - - // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval - - // r < 1.0 - double a = 1.0 - r; - int e = 0; - - // r <= 0.75 - int c = r <= 0.75; - double t = r - 0.5; - a = c ? t : a; - e = c ? 1 : e; - - // r < 0.5 - c = r < 0.5; - t = 0.5 - r; - a = c ? t : a; - - // r <= 0.25 - c = r <= 0.25; - a = c ? r : a; - e = c ? 0 : e; - - double api = a * M_PI; - double2 sc = __libclc__sincos_piby4(api, 0.0); - long jr = xodd ^ as_long(e ? sc.hi : sc.lo); - - ir = ax < 0x1.0p+52 ? 
jr : ir; - - return as_double(ir); -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinpi, double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN_FP16(sinpi) - -#endif +#include diff --git a/libclc/generic/lib/math/tanpi.cl b/libclc/generic/lib/math/tanpi.cl index 0f0461b1742c1..8015d32adb38d 100644 --- a/libclc/generic/lib/math/tanpi.cl +++ b/libclc/generic/lib/math/tanpi.cl @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// #include +#include -#include +#define FUNCTION tanpi +#define __CLC_BODY -#define __CLC_FUNC tanpi -#define __CLC_BODY #include diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES index ad9f44a6149ae..5358577ea1805 100644 --- a/libclc/spirv/lib/SOURCES +++ b/libclc/spirv/lib/SOURCES @@ -72,7 +72,6 @@ math/fma.cl ../../generic/lib/math/clc_tan.cl ../../generic/lib/math/tan.cl ../../generic/lib/math/tanh.cl -../../generic/lib/math/clc_tanpi.cl ../../generic/lib/math/tanpi.cl ../../generic/lib/math/tgamma.cl ../../generic/lib/shared/vload.cl From 191e0622e84a215c1d632412561c2fe6ccd86170 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 1 Apr 2025 21:56:00 +1100 Subject: [PATCH 0227/1029] [JITLink][MachO][arm64] Don't lower ptrauth edges in noalloc-lifetime sections. Ptrauth relocations can only be fixed up in the executing process, but noalloc sections do not have any memory in the executing process. Failure to skip ptrauth edges results in signing instructions that operate on invalid addresses, leading to segfaults or data corruption. Ignoring noalloc sections for ptrauth lowering purposes allows the ptrauth edges to persist until they reach the applyFixup method, at which point they raise a useful error and cleanly terminate linking. --- llvm/lib/ExecutionEngine/JITLink/aarch64.cpp | 127 +++++++++------- .../MachO_ptrauth_noolloc_sections.yaml | 136 ++++++++++++++++++ 2 files changed, 208 insertions(+), 55 deletions(-) create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ptrauth_noolloc_sections.yaml diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp index 8ce7e74d67cde..fbeb920f4e189 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch64.cpp @@ -240,10 +240,20 @@ Error createEmptyPointerSigningFunction(LinkGraph &G) { // info encoded in the addend -- the only actually unknown quantity is the // fixup location, and we can probably put constraints even on that. size_t NumPtrAuthFixupLocations = 0; - for (auto *B : G.blocks()) - for (auto &E : B->edges()) - NumPtrAuthFixupLocations += - E.getKind() == aarch64::Pointer64Authenticated; + for (auto &Sec : G.sections()) { + + // No-alloc sections can't have ptrauth edges. We don't need to error out + // here: applyFixup will catch these edges if any make it to the fixup + // stage. + if (Sec.getMemLifetime() == orc::MemLifetime::NoAlloc) + continue; + + for (auto *B : Sec.blocks()) { + for (auto &E : B->edges()) + NumPtrAuthFixupLocations += + E.getKind() == aarch64::Pointer64Authenticated; + } + } constexpr size_t MaxPtrSignSeqLength = 4 + // To materialize the value to sign. @@ -316,58 +326,65 @@ Error lowerPointer64AuthEdgesToSigningFunction(LinkGraph &G) { return InstrWriter.writeInteger(Instr); }; - for (auto *B : G.blocks()) { - for (auto &E : B->edges()) { - // We're only concerned with Pointer64Authenticated edges here. 
- if (E.getKind() != aarch64::Pointer64Authenticated) - continue; - - uint64_t EncodedInfo = E.getAddend(); - int32_t RealAddend = (uint32_t)(EncodedInfo & 0xffffffff); - auto ValueToSign = E.getTarget().getAddress() + RealAddend; - if (!ValueToSign) { - LLVM_DEBUG(dbgs() << " " << B->getFixupAddress(E) << " <- null\n"); - E.setAddend(RealAddend); - E.setKind(aarch64::Pointer64); - continue; - } + for (auto &Sec : G.sections()) { + + if (Sec.getMemLifetime() == orc::MemLifetime::NoAlloc) + continue; - uint32_t InitialDiscriminator = (EncodedInfo >> 32) & 0xffff; - bool AddressDiversify = (EncodedInfo >> 48) & 0x1; - uint32_t Key = (EncodedInfo >> 49) & 0x3; - uint32_t HighBits = EncodedInfo >> 51; - - if (HighBits != 0x1000) - return make_error( - "Pointer64Auth edge at " + - formatv("{0:x}", B->getFixupAddress(E).getValue()) + - " has invalid encoded addend " + formatv("{0:x}", EncodedInfo)); - - LLVM_DEBUG({ - const char *const KeyNames[] = {"IA", "IB", "DA", "DB"}; - dbgs() << " " << B->getFixupAddress(E) << " <- " << ValueToSign - << " : key = " << KeyNames[Key] << ", discriminator = " - << formatv("{0:x4}", InitialDiscriminator) - << ", address diversified = " - << (AddressDiversify ? "yes" : "no") << "\n"; - }); - - // Materialize pointer value. - cantFail(writeMovRegImm64Seq(AppendInstr, Reg1, ValueToSign.getValue())); - - // Materialize fixup pointer. - cantFail(writeMovRegImm64Seq(AppendInstr, Reg2, - B->getFixupAddress(E).getValue())); - - // Write signing instruction(s). - cantFail(writePACSignSeq(AppendInstr, Reg1, ValueToSign, Reg2, Reg3, Key, - InitialDiscriminator, AddressDiversify)); - - // Store signed pointer. - cantFail(writeStoreRegSeq(AppendInstr, Reg2, Reg1)); - - // Replace edge with a keep-alive to preserve dependence info. - E.setKind(Edge::KeepAlive); + for (auto *B : Sec.blocks()) { + for (auto &E : B->edges()) { + // We're only concerned with Pointer64Authenticated edges here. + if (E.getKind() != aarch64::Pointer64Authenticated) + continue; + + uint64_t EncodedInfo = E.getAddend(); + int32_t RealAddend = (uint32_t)(EncodedInfo & 0xffffffff); + auto ValueToSign = E.getTarget().getAddress() + RealAddend; + if (!ValueToSign) { + LLVM_DEBUG(dbgs() << " " << B->getFixupAddress(E) << " <- null\n"); + E.setAddend(RealAddend); + E.setKind(aarch64::Pointer64); + continue; + } + + uint32_t InitialDiscriminator = (EncodedInfo >> 32) & 0xffff; + bool AddressDiversify = (EncodedInfo >> 48) & 0x1; + uint32_t Key = (EncodedInfo >> 49) & 0x3; + uint32_t HighBits = EncodedInfo >> 51; + + if (HighBits != 0x1000) + return make_error( + "Pointer64Auth edge at " + + formatv("{0:x}", B->getFixupAddress(E).getValue()) + + " has invalid encoded addend " + formatv("{0:x}", EncodedInfo)); + + LLVM_DEBUG({ + const char *const KeyNames[] = {"IA", "IB", "DA", "DB"}; + dbgs() << " " << B->getFixupAddress(E) << " <- " << ValueToSign + << " : key = " << KeyNames[Key] << ", discriminator = " + << formatv("{0:x4}", InitialDiscriminator) + << ", address diversified = " + << (AddressDiversify ? "yes" : "no") << "\n"; + }); + + // Materialize pointer value. + cantFail( + writeMovRegImm64Seq(AppendInstr, Reg1, ValueToSign.getValue())); + + // Materialize fixup pointer. + cantFail(writeMovRegImm64Seq(AppendInstr, Reg2, + B->getFixupAddress(E).getValue())); + + // Write signing instruction(s). + cantFail(writePACSignSeq(AppendInstr, Reg1, ValueToSign, Reg2, Reg3, + Key, InitialDiscriminator, AddressDiversify)); + + // Store signed pointer. 
+ cantFail(writeStoreRegSeq(AppendInstr, Reg2, Reg1)); + + // Replace edge with a keep-alive to preserve dependence info. + E.setKind(Edge::KeepAlive); + } } } diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ptrauth_noolloc_sections.yaml b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ptrauth_noolloc_sections.yaml new file mode 100644 index 0000000000000..a2396aa776e10 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_ptrauth_noolloc_sections.yaml @@ -0,0 +1,136 @@ +# RUN: yaml2obj %s -o %t +# RUN: not llvm-jitlink -noexec %t 2>&1 | FileCheck %s +# +# Check that ptrauth edges are _not_ lowered for noalloc sections. +# +# Ptrauth edges are lowered to signing function instructions, so any ptrauth +# edge in a noalloc section will introduce signing instructions that operate +# illegally on linker working memory, rather than executor memory (this will +# usually lead to a crash, but may silently corrupt memory in in-process JITs). +# +# By ignoring these edges during ptrauth lowering we prevent illegal signing +# instructions from being generated, and the ptrauth edges error out in +# applyFixup instead. +# + +# CHECK: llvm-jitlink error: {{.*}} unsupported edge kind Pointer64Authenticated + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x80000002 + filetype: 0x1 + ncmds: 4 + sizeofcmds: 440 + flags: 0x2000 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 312 + segname: '' + vmaddr: 0 + vmsize: 16 + fileoff: 472 + filesize: 16 + maxprot: 7 + initprot: 7 + nsects: 2 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 8 + offset: 0x1D8 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00008052C0035FD6 + - sectname: __debug_stuff + segname: __DWARF + addr: 0x8 + size: 8 + offset: 0x1E0 + align: 3 + reloff: 0x1E8 + nreloc: 1 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000080' + relocations: + - address: 0x0 + symbolnum: 3 + pcrel: false + length: 3 + extern: true + type: 11 + scattered: false + value: 0 + - cmd: LC_BUILD_VERSION + cmdsize: 24 + platform: 1 + minos: 983040 + sdk: 0 + ntools: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 496 + nsyms: 4 + stroff: 560 + strsize: 24 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 2 + iextdefsym: 2 + nextdefsym: 2 + iundefsym: 4 + nundefsym: 0 + tocoff: 0 + ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 18 + n_type: 0xE + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 12 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 8 + - n_strx: 1 + n_type: 0xF + n_sect: 2 + n_desc: 0 + n_value: 8 + - n_strx: 6 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 0 + StringTable: + - '' + - _foo + - _main + - ltmp1 + - ltmp0 +... 
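
The essence of the fix above is a single section-level filter, applied both
when sizing the signing function and when lowering the edges: noalloc-lifetime
sections are skipped, and any ptrauth edge they contain is left in place so
that applyFixup can reject it with a clear error. Below is a minimal sketch of
that filter, assuming only APIs that already appear in the patch
(LinkGraph::sections(), Section::getMemLifetime(), orc::MemLifetime::NoAlloc,
aarch64::Pointer64Authenticated); the helper name countLowerablePtrAuthEdges
is hypothetical and used purely for illustration.

#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/ExecutionEngine/JITLink/aarch64.h"

using namespace llvm;
using namespace llvm::jitlink;

// Illustrative sketch only; not part of the patch above.
// Count the ptrauth edges that the pointer-signing function must handle,
// skipping noalloc-lifetime sections: their blocks have no memory in the
// executing process, so emitting signing instructions for them would operate
// on invalid addresses. Edges skipped here survive until applyFixup, which
// reports them as errors and cleanly terminates linking.
static size_t countLowerablePtrAuthEdges(LinkGraph &G) {
  size_t NumPtrAuthEdges = 0;
  for (auto &Sec : G.sections()) {
    if (Sec.getMemLifetime() == orc::MemLifetime::NoAlloc)
      continue; // No executor memory: do not lower (or count) these edges.
    for (auto *B : Sec.blocks())
      for (auto &E : B->edges())
        NumPtrAuthEdges += E.getKind() == aarch64::Pointer64Authenticated;
  }
  return NumPtrAuthEdges;
}
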
From 290d7b82cb5d3fd9fb433ede0eef7f3a524d89cd Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 18:23:27 +0700 Subject: [PATCH 0228/1029] llvm-reduce: Prune some unneeded includes and forward declares (#133883) --- .../llvm-reduce/deltas/ReduceAttributes.cpp | 16 ---------------- .../llvm-reduce/deltas/ReduceOperandBundles.cpp | 1 - .../llvm-reduce/deltas/ReduceVirtualRegisters.h | 2 -- 3 files changed, 19 deletions(-) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp index 63d7abe61bda7..b1125de1b345e 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceAttributes.cpp @@ -12,27 +12,11 @@ //===----------------------------------------------------------------------===// #include "ReduceAttributes.h" -#include "TestRunner.h" #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/InstVisitor.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include -#include - -namespace llvm { -class LLVMContext; -} // namespace llvm using namespace llvm; diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp index e5d7b187c8107..a3e24f33dc77c 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "ReduceOperandBundles.h" -#include "TestRunner.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" diff --git a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h index ff8ba4a004f34..84a741de92b2e 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h +++ b/llvm/tools/llvm-reduce/deltas/ReduceVirtualRegisters.h @@ -17,8 +17,6 @@ #include "Delta.h" namespace llvm { -class TestRunner; - /// Remove register allocation hints from virtual registes. void reduceVirtualRegisterHintsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); From 41b83b48e37aa0c7f9e0458638567f37d6dbc924 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 1 Apr 2025 07:56:36 -0400 Subject: [PATCH 0229/1029] No longer assert on incorrect attribute argument index (#133766) Fixes an assertion when referencing an out-of-bounds parameter via a function attribute whose argument list refers to parameters by index and the function is variadic. e.g., __attribute__ ((__format_arg__(2))) void test (int i, ...) { } Fixes #61635 --- clang/docs/ReleaseNotes.rst | 9 +++++++++ clang/include/clang/Sema/Sema.h | 11 ++++++----- clang/lib/Sema/SemaDeclAttr.cpp | 17 ++++++++++++----- clang/test/Sema/attr-args.c | 5 +++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c034b925cddc6..75a173a48e67e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -298,6 +298,15 @@ Improvements to Clang's diagnostics - Improve the ``-Wundefined-func-template`` warning when a function template is not instantiated due to being unreachable in modules. 
+- Fixed an assertion when referencing an out-of-bounds parameter via a function + attribute whose argument list refers to parameters by index and the function + is variadic. e.g., + .. code-block:: c + + __attribute__ ((__format_arg__(2))) void test (int i, ...) { } + + Fixes #GH61635 + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index c74e709ce06d2..09168218a9e36 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4796,10 +4796,10 @@ class Sema final : public SemaBase { /// /// \returns true if IdxExpr is a valid index. template - bool checkFunctionOrMethodParameterIndex(const Decl *D, const AttrInfo &AI, - unsigned AttrArgNum, - const Expr *IdxExpr, ParamIdx &Idx, - bool CanIndexImplicitThis = false) { + bool checkFunctionOrMethodParameterIndex( + const Decl *D, const AttrInfo &AI, unsigned AttrArgNum, + const Expr *IdxExpr, ParamIdx &Idx, bool CanIndexImplicitThis = false, + bool CanIndexVariadicArguments = false) { assert(isFunctionOrMethodOrBlockForAttrSubject(D)); // In C++ the implicit 'this' function parameter also counts. @@ -4820,7 +4820,8 @@ class Sema final : public SemaBase { } unsigned IdxSource = IdxInt->getLimitedValue(UINT_MAX); - if (IdxSource < 1 || (!IV && IdxSource > NumParams)) { + if (IdxSource < 1 || + ((!IV || !CanIndexVariadicArguments) && IdxSource > NumParams)) { Diag(getAttrLoc(AI), diag::err_attribute_argument_out_of_bounds) << &AI << AttrArgNum << IdxExpr->getSourceRange(); return false; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 0a8a3e1c49414..6cb6f6d105a32 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -1315,7 +1315,10 @@ static void handleNonNullAttr(Sema &S, Decl *D, const ParsedAttr &AL) { for (unsigned I = 0; I < AL.getNumArgs(); ++I) { Expr *Ex = AL.getArgAsExpr(I); ParamIdx Idx; - if (!S.checkFunctionOrMethodParameterIndex(D, AL, I + 1, Ex, Idx)) + if (!S.checkFunctionOrMethodParameterIndex( + D, AL, I + 1, Ex, Idx, + /*CanIndexImplicitThis=*/false, + /*CanIndexVariadicArguments=*/true)) return; // Is the function argument a pointer type? 
@@ -5756,13 +5759,17 @@ static void handleArgumentWithTypeTagAttr(Sema &S, Decl *D, } ParamIdx ArgumentIdx; - if (!S.checkFunctionOrMethodParameterIndex(D, AL, 2, AL.getArgAsExpr(1), - ArgumentIdx)) + if (!S.checkFunctionOrMethodParameterIndex( + D, AL, 2, AL.getArgAsExpr(1), ArgumentIdx, + /*CanIndexImplicitThis=*/false, + /*CanIndexVariadicArguments=*/true)) return; ParamIdx TypeTagIdx; - if (!S.checkFunctionOrMethodParameterIndex(D, AL, 3, AL.getArgAsExpr(2), - TypeTagIdx)) + if (!S.checkFunctionOrMethodParameterIndex( + D, AL, 3, AL.getArgAsExpr(2), TypeTagIdx, + /*CanIndexImplicitThis=*/false, + /*CanIndexVariadicArguments=*/true)) return; bool IsPointer = AL.getAttrName()->getName() == "pointer_with_type_tag"; diff --git a/clang/test/Sema/attr-args.c b/clang/test/Sema/attr-args.c index db69a99bdee3b..23815f3a4e675 100644 --- a/clang/test/Sema/attr-args.c +++ b/clang/test/Sema/attr-args.c @@ -24,3 +24,8 @@ inline __attribute__((stdcall(a))) void *f8(void); // expected-error {{'stdcall inline __attribute__((used(a))) void *f9(void); // expected-error {{'used' attribute takes no arguments}} inline __attribute__((unused(a))) void *f10(void); // expected-error {{'unused' attribute takes no arguments}} inline __attribute__((weak(a))) void *f11(void); // expected-error {{'weak' attribute takes no arguments}} + +__attribute__ ((__format_arg__(2))) // expected-error {{'__format_arg__' attribute parameter 1 is out of bounds}} +void test (int, ...); + +void __attribute__ ((alloc_size (2, 3))) *test2(int, ...); // expected-error {{'alloc_size' attribute parameter 1 is out of bounds}} From ba7feaab92ca807419de6f2b80dda2a1d1759d97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Tue, 1 Apr 2025 13:59:42 +0200 Subject: [PATCH 0230/1029] [AMDGPU][Docs] Fix and update AMDGPUUsage.rst (#133894) - Fix notes about SALU float and src1 SGPRs for dpp instructions - Add split between gfx11 and gfx12 sections, update references. --- llvm/docs/AMDGPUUsage.rst | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index ab507e3714ebb..d1535960a0257 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -515,6 +515,8 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following work-item Add product IDs names. + **GCN GFX12 (RDNA 4)** [AMD-GCN-GFX12-RDNA4]_ + ----------------------------------------------------------------------------------------------------------------------- ``gfx1200`` ``amdgcn`` dGPU - cumode - Architected *TBA* - wavefrontsize64 flat scratch .. TODO:: @@ -619,18 +621,18 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor SALU floating point instructions are not available on: - - ``gfx1150`` - - ``gfx1151`` - - ``gfx1152`` - - ``gfx1153`` + - ``gfx1100`` + - ``gfx1101`` + - ``gfx1102`` + - ``gfx1103`` SGPRs are not supported for src1 in dpp instructions for: - - ``gfx1150`` - - ``gfx1151`` - - ``gfx1152`` - - ``gfx1153`` + - ``gfx1100`` + - ``gfx1101`` + - ``gfx1102`` + - ``gfx1103`` ``gfx12-generic`` ``amdgcn`` - ``gfx1200`` - wavefrontsize64 - Architected No restrictions. @@ -17618,7 +17620,7 @@ combinations of operands, refer to one of instruction set architecture manuals [AMD-GCN-GFX900-GFX904-VEGA]_, [AMD-GCN-GFX906-VEGA7NM]_, [AMD-GCN-GFX908-CDNA1]_, [AMD-GCN-GFX90A-CDNA2]_, [AMD-GCN-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_, -[AMD-GCN-GFX11-RDNA3]_ and [AMD-GCN-GFX11-RDNA3.5]_. 
+[AMD-GCN-GFX11-RDNA3]_, [AMD-GCN-GFX11-RDNA3.5]_ and [AMD-GCN-GFX12-RDNA4]_.
 
 Operands
 ~~~~~~~~
@@ -18420,6 +18422,7 @@ Additional Documentation
 .. [AMD-GCN-GFX10-RDNA2] `AMD RDNA 2 Instruction Set Architecture `__
 .. [AMD-GCN-GFX11-RDNA3] `AMD RDNA 3 Instruction Set Architecture `__
 .. [AMD-GCN-GFX11-RDNA3.5] `AMD RDNA 3.5 Instruction Set Architecture `__
+.. [AMD-GCN-GFX12-RDNA4] `AMD RDNA 4 Instruction Set Architecture `__
 .. [AMD-RADEON-HD-2000-3000] `AMD R6xx shader ISA `__
 .. [AMD-RADEON-HD-4000] `AMD R7xx shader ISA `__
 .. [AMD-RADEON-HD-5000] `AMD Evergreen shader ISA `__

From c192737009584377d99b18bfbc8298c8e58bcd02 Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Tue, 1 Apr 2025 13:27:46 +0100
Subject: [PATCH 0231/1029] [LLVM][InstCombine][AArch64] Refactor common SVE
 intrinsic combines. (#126928)

Introduce SVEIntrinsicInfo to store properties common across SVE
intrinsics. This allows a separation between intrinsic IDs and the
transformations that can be applied to them, which reduces the layering
problems we hit when adding new combines.

This PR is mostly refactoring to bring in the concept and port the most
common combines (e.g. dead code when all false). This will be followed
up with new combines where I plan to reuse much of the existing
instruction simplification logic to significantly improve our ability to
constant fold SVE intrinsics.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    | 881 ++++++++++--------
 1 file changed, 517 insertions(+), 364 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 86c2fcf8ae2c1..823d77251a796 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1010,6 +1010,514 @@ static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
   return IC.replaceInstUsesWith(II, NPN);
 }
 
+// A collection of properties common to SVE intrinsics that allow for combines
+// to be written without needing to know the specific intrinsic.
+struct SVEIntrinsicInfo {
+  //
+  // Helper routines for common intrinsic definitions.
+  //
+
+  // e.g. llvm.aarch64.sve.add pg, op1, op2
+  //      with IID ==> llvm.aarch64.sve.add_u
+  static SVEIntrinsicInfo
+  defaultMergingOp(Intrinsic::ID IID = Intrinsic::not_intrinsic) {
+    return SVEIntrinsicInfo()
+        .setGoverningPredicateOperandIdx(0)
+        .setOperandIdxInactiveLanesTakenFrom(1)
+        .setMatchingUndefIntrinsic(IID);
+  }
+
+  // e.g. llvm.aarch64.sve.neg inactive, pg, op
+  static SVEIntrinsicInfo defaultMergingUnaryOp() {
+    return SVEIntrinsicInfo()
+        .setGoverningPredicateOperandIdx(1)
+        .setOperandIdxInactiveLanesTakenFrom(0)
+        .setOperandIdxWithNoActiveLanes(0);
+  }
+
+  // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
+  static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp() {
+    return SVEIntrinsicInfo()
+        .setGoverningPredicateOperandIdx(1)
+        .setOperandIdxInactiveLanesTakenFrom(0);
+  }
+
+  // e.g. llvm.aarch64.sve.add_u pg, op1, op2
+  static SVEIntrinsicInfo defaultUndefOp() {
+    return SVEIntrinsicInfo()
+        .setGoverningPredicateOperandIdx(0)
+        .setInactiveLanesAreNotDefined();
+  }
+
+  // e.g. llvm.aarch64.sve.prf pg, ptr        (GPIndex = 0)
+  //      llvm.aarch64.sve.st1 data, pg, ptr  (GPIndex = 1)
+  static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
+    return SVEIntrinsicInfo()
+        .setGoverningPredicateOperandIdx(GPIndex)
+        .setInactiveLanesAreUnused();
+  }
+
+  // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
+  //      llvm.aarch64.sve.ld1 pg, ptr
+  static SVEIntrinsicInfo defaultZeroingOp() {
+    return SVEIntrinsicInfo()
+        .setGoverningPredicateOperandIdx(0)
+        .setInactiveLanesAreUnused()
+        .setResultIsZeroInitialized();
+  }
+
+  // All properties relate to predication and thus having a general predicate
+  // is the minimum requirement to say there is intrinsic info to act on.
+  explicit operator bool() const { return hasGoverningPredicate(); }
+
+  //
+  // Properties relating to the governing predicate.
+  //
+
+  bool hasGoverningPredicate() const {
+    return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
+  }
+
+  unsigned getGoverningPredicateOperandIdx() const {
+    assert(hasGoverningPredicate() && "Property not set!");
+    return GoverningPredicateIdx;
+  }
+
+  SVEIntrinsicInfo &setGoverningPredicateOperandIdx(unsigned Index) {
+    assert(!hasGoverningPredicate() && "Cannot set property twice!");
+    GoverningPredicateIdx = Index;
+    return *this;
+  }
+
+  //
+  // Properties relating to operations the intrinsic could be transformed into.
+  // NOTE: This does not mean such a transformation is always possible, but the
+  // knowledge makes it possible to reuse existing optimisations without needing
+  // to embed specific handling for each intrinsic. For example, instruction
+  // simplification can be used to optimise an intrinsic's active lanes.
+  //
+
+  bool hasMatchingUndefIntrinsic() const {
+    return UndefIntrinsic != Intrinsic::not_intrinsic;
+  }
+
+  Intrinsic::ID getMatchingUndefIntrinsic() const {
+    assert(hasMatchingUndefIntrinsic() && "Property not set!");
+    return UndefIntrinsic;
+  }
+
+  SVEIntrinsicInfo &setMatchingUndefIntrinsic(Intrinsic::ID IID) {
+    assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
+    UndefIntrinsic = IID;
+    return *this;
+  }
+
+  //
+  // Properties relating to the result of inactive lanes.
+  //
+
+  bool inactiveLanesTakenFromOperand() const {
+    return ResultLanes == InactiveLanesTakenFromOperand;
+  }
+
+  unsigned getOperandIdxInactiveLanesTakenFrom() const {
+    assert(inactiveLanesTakenFromOperand() && "Property not set!");
+    return OperandIdxForInactiveLanes;
+  }
+
+  SVEIntrinsicInfo &setOperandIdxInactiveLanesTakenFrom(unsigned Index) {
+    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
+    ResultLanes = InactiveLanesTakenFromOperand;
+    OperandIdxForInactiveLanes = Index;
+    return *this;
+  }
+
+  bool inactiveLanesAreNotDefined() const {
+    return ResultLanes == InactiveLanesAreNotDefined;
+  }
+
+  SVEIntrinsicInfo &setInactiveLanesAreNotDefined() {
+    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
+    ResultLanes = InactiveLanesAreNotDefined;
+    return *this;
+  }
+
+  bool inactiveLanesAreUnused() const {
+    return ResultLanes == InactiveLanesAreUnused;
+  }
+
+  SVEIntrinsicInfo &setInactiveLanesAreUnused() {
+    assert(ResultLanes == Uninitialized && "Cannot set property twice!");
+    ResultLanes = InactiveLanesAreUnused;
+    return *this;
+  }
+
+  // NOTE: Whilst not limited to only inactive lanes, the common use case is:
+  //   inactiveLanesAreZeroed =
+  //       resultIsZeroInitialized() && inactiveLanesAreUnused()
+  bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
+
+  SVEIntrinsicInfo &setResultIsZeroInitialized() {
+    ResultIsZeroInitialized = true;
+    return *this;
+  }
+
+  //
+  // The first operand of unary merging operations is typically only used to
+  // set the result for inactive lanes. Knowing this allows us to deadcode the
+  // operand when we can prove there are no inactive lanes.
+  //
+
+  bool hasOperandWithNoActiveLanes() const {
+    return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
+  }
+
+  unsigned getOperandIdxWithNoActiveLanes() const {
+    assert(hasOperandWithNoActiveLanes() && "Property not set!");
+    return OperandIdxWithNoActiveLanes;
+  }
+
+  SVEIntrinsicInfo &setOperandIdxWithNoActiveLanes(unsigned Index) {
+    assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
+    OperandIdxWithNoActiveLanes = Index;
+    return *this;
+  }
+
+private:
+  unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
+
+  Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
+
+  enum PredicationStyle {
+    Uninitialized,
+    InactiveLanesTakenFromOperand,
+    InactiveLanesAreNotDefined,
+    InactiveLanesAreUnused
+  } ResultLanes = Uninitialized;
+
+  bool ResultIsZeroInitialized = false;
+  unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
+  unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
+};
+
+static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) {
+  // Some SVE intrinsics do not use scalable vector types, but since they are
+  // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
+  if (!isa<ScalableVectorType>(II.getType()) &&
+      all_of(II.args(), [&](const Value *V) {
+        return !isa<ScalableVectorType>(V->getType());
+      }))
+    return SVEIntrinsicInfo();
+
+  Intrinsic::ID IID = II.getIntrinsicID();
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
+  case Intrinsic::aarch64_sve_fcvt_f16f32:
+  case Intrinsic::aarch64_sve_fcvt_f16f64:
+  case Intrinsic::aarch64_sve_fcvt_f32f16:
+  case Intrinsic::aarch64_sve_fcvt_f32f64:
+  case Intrinsic::aarch64_sve_fcvt_f64f16:
+  case Intrinsic::aarch64_sve_fcvt_f64f32:
+  case Intrinsic::aarch64_sve_fcvtlt_f32f16:
+  case Intrinsic::aarch64_sve_fcvtlt_f64f32:
+  case Intrinsic::aarch64_sve_fcvtx_f32f64:
+  case Intrinsic::aarch64_sve_fcvtzs:
+  case Intrinsic::aarch64_sve_fcvtzs_i32f16:
+  case Intrinsic::aarch64_sve_fcvtzs_i32f64:
+  case Intrinsic::aarch64_sve_fcvtzs_i64f16:
+  case Intrinsic::aarch64_sve_fcvtzs_i64f32:
+  case Intrinsic::aarch64_sve_fcvtzu:
+  case Intrinsic::aarch64_sve_fcvtzu_i32f16:
+  case Intrinsic::aarch64_sve_fcvtzu_i32f64:
+  case Intrinsic::aarch64_sve_fcvtzu_i64f16:
+  case Intrinsic::aarch64_sve_fcvtzu_i64f32:
+  case Intrinsic::aarch64_sve_scvtf:
+  case Intrinsic::aarch64_sve_scvtf_f16i32:
+  case Intrinsic::aarch64_sve_scvtf_f16i64:
+  case Intrinsic::aarch64_sve_scvtf_f32i64:
+  case Intrinsic::aarch64_sve_scvtf_f64i32:
+  case Intrinsic::aarch64_sve_ucvtf:
+  case Intrinsic::aarch64_sve_ucvtf_f16i32:
+  case Intrinsic::aarch64_sve_ucvtf_f16i64:
+  case Intrinsic::aarch64_sve_ucvtf_f32i64:
+  case Intrinsic::aarch64_sve_ucvtf_f64i32:
+    return SVEIntrinsicInfo::defaultMergingUnaryOp();
+
+  case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
+  case Intrinsic::aarch64_sve_fcvtnt_f16f32:
+  case Intrinsic::aarch64_sve_fcvtnt_f32f64:
+  case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
+    return SVEIntrinsicInfo::defaultMergingUnaryNarrowingTopOp();
+
+  case Intrinsic::aarch64_sve_fabd:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
+  case Intrinsic::aarch64_sve_fadd:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u);
+  case Intrinsic::aarch64_sve_fdiv:
+    return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u);
+  case Intrinsic::aarch64_sve_fmax:
+    return 
SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u); + case Intrinsic::aarch64_sve_fmaxnm: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u); + case Intrinsic::aarch64_sve_fmin: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u); + case Intrinsic::aarch64_sve_fminnm: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u); + case Intrinsic::aarch64_sve_fmla: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u); + case Intrinsic::aarch64_sve_fmls: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u); + case Intrinsic::aarch64_sve_fmul: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u); + case Intrinsic::aarch64_sve_fmulx: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u); + case Intrinsic::aarch64_sve_fnmla: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u); + case Intrinsic::aarch64_sve_fnmls: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u); + case Intrinsic::aarch64_sve_fsub: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u); + case Intrinsic::aarch64_sve_add: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u); + case Intrinsic::aarch64_sve_mla: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u); + case Intrinsic::aarch64_sve_mls: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u); + case Intrinsic::aarch64_sve_mul: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u); + case Intrinsic::aarch64_sve_sabd: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u); + case Intrinsic::aarch64_sve_smax: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u); + case Intrinsic::aarch64_sve_smin: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u); + case Intrinsic::aarch64_sve_smulh: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u); + case Intrinsic::aarch64_sve_sub: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u); + case Intrinsic::aarch64_sve_uabd: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u); + case Intrinsic::aarch64_sve_umax: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u); + case Intrinsic::aarch64_sve_umin: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u); + case Intrinsic::aarch64_sve_umulh: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u); + case Intrinsic::aarch64_sve_asr: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u); + case Intrinsic::aarch64_sve_lsl: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u); + case Intrinsic::aarch64_sve_lsr: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u); + case Intrinsic::aarch64_sve_and: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u); + case Intrinsic::aarch64_sve_bic: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u); + case Intrinsic::aarch64_sve_eor: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u); + case Intrinsic::aarch64_sve_orr: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u); + case Intrinsic::aarch64_sve_sqsub: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u); + case 
Intrinsic::aarch64_sve_uqsub: + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u); + + case Intrinsic::aarch64_sve_addqv: + case Intrinsic::aarch64_sve_and_z: + case Intrinsic::aarch64_sve_bic_z: + case Intrinsic::aarch64_sve_brka_z: + case Intrinsic::aarch64_sve_brkb_z: + case Intrinsic::aarch64_sve_brkn_z: + case Intrinsic::aarch64_sve_brkpa_z: + case Intrinsic::aarch64_sve_brkpb_z: + case Intrinsic::aarch64_sve_cntp: + case Intrinsic::aarch64_sve_compact: + case Intrinsic::aarch64_sve_eor_z: + case Intrinsic::aarch64_sve_eorv: + case Intrinsic::aarch64_sve_eorqv: + case Intrinsic::aarch64_sve_nand_z: + case Intrinsic::aarch64_sve_nor_z: + case Intrinsic::aarch64_sve_orn_z: + case Intrinsic::aarch64_sve_orr_z: + case Intrinsic::aarch64_sve_orv: + case Intrinsic::aarch64_sve_orqv: + case Intrinsic::aarch64_sve_pnext: + case Intrinsic::aarch64_sve_rdffr_z: + case Intrinsic::aarch64_sve_saddv: + case Intrinsic::aarch64_sve_uaddv: + case Intrinsic::aarch64_sve_umaxv: + case Intrinsic::aarch64_sve_umaxqv: + case Intrinsic::aarch64_sve_cmpeq: + case Intrinsic::aarch64_sve_cmpeq_wide: + case Intrinsic::aarch64_sve_cmpge: + case Intrinsic::aarch64_sve_cmpge_wide: + case Intrinsic::aarch64_sve_cmpgt: + case Intrinsic::aarch64_sve_cmpgt_wide: + case Intrinsic::aarch64_sve_cmphi: + case Intrinsic::aarch64_sve_cmphi_wide: + case Intrinsic::aarch64_sve_cmphs: + case Intrinsic::aarch64_sve_cmphs_wide: + case Intrinsic::aarch64_sve_cmple_wide: + case Intrinsic::aarch64_sve_cmplo_wide: + case Intrinsic::aarch64_sve_cmpls_wide: + case Intrinsic::aarch64_sve_cmplt_wide: + case Intrinsic::aarch64_sve_cmpne: + case Intrinsic::aarch64_sve_cmpne_wide: + case Intrinsic::aarch64_sve_facge: + case Intrinsic::aarch64_sve_facgt: + case Intrinsic::aarch64_sve_fcmpeq: + case Intrinsic::aarch64_sve_fcmpge: + case Intrinsic::aarch64_sve_fcmpgt: + case Intrinsic::aarch64_sve_fcmpne: + case Intrinsic::aarch64_sve_fcmpuo: + case Intrinsic::aarch64_sve_ld1: + case Intrinsic::aarch64_sve_ld1_gather: + case Intrinsic::aarch64_sve_ld1_gather_index: + case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: + case Intrinsic::aarch64_sve_ld1_gather_sxtw: + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ld1_gather_uxtw: + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ld1q_gather_index: + case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: + case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: + case Intrinsic::aarch64_sve_ld1ro: + case Intrinsic::aarch64_sve_ld1rq: + case Intrinsic::aarch64_sve_ld1udq: + case Intrinsic::aarch64_sve_ld1uwq: + case Intrinsic::aarch64_sve_ld2_sret: + case Intrinsic::aarch64_sve_ld2q_sret: + case Intrinsic::aarch64_sve_ld3_sret: + case Intrinsic::aarch64_sve_ld3q_sret: + case Intrinsic::aarch64_sve_ld4_sret: + case Intrinsic::aarch64_sve_ld4q_sret: + case Intrinsic::aarch64_sve_ldff1: + case Intrinsic::aarch64_sve_ldff1_gather: + case Intrinsic::aarch64_sve_ldff1_gather_index: + case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw: + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw: + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + case Intrinsic::aarch64_sve_ldnf1: + case Intrinsic::aarch64_sve_ldnt1: + case Intrinsic::aarch64_sve_ldnt1_gather: + case Intrinsic::aarch64_sve_ldnt1_gather_index: + case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: + case 
Intrinsic::aarch64_sve_ldnt1_gather_uxtw: + return SVEIntrinsicInfo::defaultZeroingOp(); + + case Intrinsic::aarch64_sve_prf: + case Intrinsic::aarch64_sve_prfb_gather_index: + case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: + case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_index: + case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: + case Intrinsic::aarch64_sve_prfd_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_index: + case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: + case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_index: + case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: + case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: + case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: + return SVEIntrinsicInfo::defaultVoidOp(0); + + case Intrinsic::aarch64_sve_st1_scatter: + case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: + case Intrinsic::aarch64_sve_st1_scatter_sxtw: + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + case Intrinsic::aarch64_sve_st1_scatter_uxtw: + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + case Intrinsic::aarch64_sve_st1dq: + case Intrinsic::aarch64_sve_st1q_scatter_index: + case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: + case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: + case Intrinsic::aarch64_sve_st1wq: + case Intrinsic::aarch64_sve_stnt1: + case Intrinsic::aarch64_sve_stnt1_scatter: + case Intrinsic::aarch64_sve_stnt1_scatter_index: + case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: + case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: + return SVEIntrinsicInfo::defaultVoidOp(1); + case Intrinsic::aarch64_sve_st2: + case Intrinsic::aarch64_sve_st2q: + return SVEIntrinsicInfo::defaultVoidOp(2); + case Intrinsic::aarch64_sve_st3: + case Intrinsic::aarch64_sve_st3q: + return SVEIntrinsicInfo::defaultVoidOp(3); + case Intrinsic::aarch64_sve_st4: + case Intrinsic::aarch64_sve_st4q: + return SVEIntrinsicInfo::defaultVoidOp(4); + } + + return SVEIntrinsicInfo(); +} + +static bool isAllActivePredicate(Value *Pred) { + // Look through convert.from.svbool(convert.to.svbool(...) chain. + Value *UncastedPred; + if (match(Pred, m_Intrinsic( + m_Intrinsic( + m_Value(UncastedPred))))) + // If the predicate has the same or less lanes than the uncasted + // predicate then we know the casting has no effect. + if (cast(Pred->getType())->getMinNumElements() <= + cast(UncastedPred->getType())->getMinNumElements()) + Pred = UncastedPred; + + return match(Pred, m_Intrinsic( + m_ConstantInt())); +} + +// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise +// to operations with less strict inactive lane requirements. +static std::optional +simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, + const SVEIntrinsicInfo &IInfo) { + if (!IInfo.hasGoverningPredicate()) + return std::nullopt; + + auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx()); + + // If there are no active lanes. 
+ if (match(OpPredicate, m_ZeroInt())) { + if (IInfo.inactiveLanesTakenFromOperand()) + return IC.replaceInstUsesWith( + II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom())); + + if (IInfo.inactiveLanesAreUnused()) { + if (IInfo.resultIsZeroInitialized()) + IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType())); + + return IC.eraseInstFromFunction(II); + } + } + + // If there are no inactive lanes. + if (isAllActivePredicate(OpPredicate)) { + if (IInfo.hasOperandWithNoActiveLanes()) { + unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes(); + if (!isa(II.getOperand(OpIdx))) + return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType())); + } + + if (IInfo.hasMatchingUndefIntrinsic()) { + auto *NewDecl = Intrinsic::getOrInsertDeclaration( + II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()}); + II.setCalledFunction(NewDecl); + return &II; + } + } + + return std::nullopt; +} + // (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))) // => (binop (pred) (from_svbool _) (from_svbool _)) // @@ -1121,85 +1629,6 @@ instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { return IC.replaceInstUsesWith(II, EarliestReplacement); } -static bool isAllActivePredicate(Value *Pred) { - // Look through convert.from.svbool(convert.to.svbool(...) chain. - Value *UncastedPred; - if (match(Pred, m_Intrinsic( - m_Intrinsic( - m_Value(UncastedPred))))) - // If the predicate has the same or less lanes than the uncasted - // predicate then we know the casting has no effect. - if (cast(Pred->getType())->getMinNumElements() <= - cast(UncastedPred->getType())->getMinNumElements()) - Pred = UncastedPred; - - return match(Pred, m_Intrinsic( - m_ConstantInt())); -} - -// Simplify unary operation where predicate has all inactive lanes by replacing -// instruction with its operand -static std::optional -instCombineSVENoActiveReplace(InstCombiner &IC, IntrinsicInst &II, - bool hasInactiveVector) { - int PredOperand = hasInactiveVector ? 1 : 0; - int ReplaceOperand = hasInactiveVector ? 
0 : 1; - if (match(II.getOperand(PredOperand), m_ZeroInt())) { - IC.replaceInstUsesWith(II, II.getOperand(ReplaceOperand)); - return IC.eraseInstFromFunction(II); - } - return std::nullopt; -} - -// Simplify unary operation where predicate has all inactive lanes or -// replace unused first operand with undef when all lanes are active -static std::optional -instCombineSVEAllOrNoActiveUnary(InstCombiner &IC, IntrinsicInst &II) { - if (isAllActivePredicate(II.getOperand(1)) && - !isa(II.getOperand(0)) && - !isa(II.getOperand(0))) { - Value *Undef = llvm::UndefValue::get(II.getType()); - return IC.replaceOperand(II, 0, Undef); - } - return instCombineSVENoActiveReplace(IC, II, true); -} - -// Erase unary operation where predicate has all inactive lanes -static std::optional -instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, - int PredPos) { - if (match(II.getOperand(PredPos), m_ZeroInt())) { - return IC.eraseInstFromFunction(II); - } - return std::nullopt; -} - -// Simplify operation where predicate has all inactive lanes by replacing -// instruction with zeroed object -static std::optional -instCombineSVENoActiveZero(InstCombiner &IC, IntrinsicInst &II) { - if (match(II.getOperand(0), m_ZeroInt())) { - Constant *Node; - Type *RetTy = II.getType(); - if (RetTy->isStructTy()) { - auto StructT = cast(RetTy); - auto VecT = StructT->getElementType(0); - SmallVector ZerVec; - for (unsigned i = 0; i < StructT->getNumElements(); i++) { - ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0) - : ConstantInt::get(VecT, 0)); - } - Node = ConstantStruct::get(StructT, ZerVec); - } else - Node = RetTy->isFPOrFPVectorTy() ? ConstantFP::get(RetTy, 0.0) - : ConstantInt::get(II.getType(), 0); - - IC.replaceInstUsesWith(II, Node); - return IC.eraseInstFromFunction(II); - } - return std::nullopt; -} - static std::optional instCombineSVESel(InstCombiner &IC, IntrinsicInst &II) { // svsel(ptrue, x, y) => x @@ -1250,10 +1679,6 @@ static std::optional instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II) { LLVMContext &Ctx = II.getContext(); - // Replace by zero constant when all lanes are inactive - if (auto II_NA = instCombineSVENoActiveZero(IC, II)) - return II_NA; - // Check that the predicate is all active auto *Pg = dyn_cast(II.getArgOperand(0)); if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) @@ -1618,10 +2043,6 @@ instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { Value *PtrOp = II.getOperand(1); Type *VecTy = II.getType(); - // Replace by zero constant when all lanes are inactive - if (auto II_NA = instCombineSVENoActiveZero(IC, II)) - return II_NA; - if (isAllActivePredicate(Pred)) { LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp); Load->copyMetadata(II); @@ -1683,40 +2104,8 @@ instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { return IC.replaceInstUsesWith(II, BinOp); } -// Canonicalise operations that take an all active predicate (e.g. sve.add -> -// sve.add_u). 
-static std::optional instCombineSVEAllActive(IntrinsicInst &II, - Intrinsic::ID IID) { - auto *OpPredicate = II.getOperand(0); - if (!match(OpPredicate, m_Intrinsic( - m_ConstantInt()))) - return std::nullopt; - - auto *Mod = II.getModule(); - auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()}); - II.setCalledFunction(NewDecl); - - return &II; -} - -// Simplify operations where predicate has all inactive lanes or try to replace -// with _u form when all lanes are active -static std::optional -instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, - Intrinsic::ID IID) { - if (match(II.getOperand(0), m_ZeroInt())) { - // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are - // inactive for sv[func]_m - return IC.replaceInstUsesWith(II, II.getOperand(1)); - } - return instCombineSVEAllActive(II, IID); -} - static std::optional instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II) { - if (auto II_U = - instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u)) - return II_U; if (auto MLA = instCombineSVEVectorFuseMulAddSub( IC, II, true)) @@ -1730,9 +2119,6 @@ static std::optional instCombineSVEVectorAdd(InstCombiner &IC, static std::optional instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { - if (auto II_U = - instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u)) - return II_U; if (auto FMLA = instCombineSVEVectorFuseMulAddSub(IC, II, @@ -1773,9 +2159,6 @@ instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { static std::optional instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { - if (auto II_U = - instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u)) - return II_U; if (auto FMLS = instCombineSVEVectorFuseMulAddSub(IC, II, @@ -1816,9 +2199,6 @@ instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { static std::optional instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II) { - if (auto II_U = - instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u)) - return II_U; if (auto MLS = instCombineSVEVectorFuseMulAddSub( IC, II, true)) @@ -1827,8 +2207,7 @@ static std::optional instCombineSVEVectorSub(InstCombiner &IC, } static std::optional instCombineSVEVectorMul(InstCombiner &IC, - IntrinsicInst &II, - Intrinsic::ID IID) { + IntrinsicInst &II) { auto *OpPredicate = II.getOperand(0); auto *OpMultiplicand = II.getOperand(1); auto *OpMultiplier = II.getOperand(2); @@ -1967,10 +2346,6 @@ instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { Type *Ty = II.getType(); Value *PassThru = ConstantAggregateZero::get(Ty); - // Replace by zero constant when all lanes are inactive - if (auto II_NA = instCombineSVENoActiveZero(IC, II)) - return II_NA; - // Contiguous gather => masked load. 
// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) @@ -2229,172 +2604,16 @@ static std::optional instCombineDMB(InstCombiner &IC, std::optional AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { + const SVEIntrinsicInfo &IInfo = constructSVEIntrinsicInfo(II); + if (std::optional I = simplifySVEIntrinsic(IC, II, IInfo)) + return I; + Intrinsic::ID IID = II.getIntrinsicID(); switch (IID) { default: break; case Intrinsic::aarch64_dmb: return instCombineDMB(IC, II); - case Intrinsic::aarch64_sve_fcvt_bf16f32_v2: - case Intrinsic::aarch64_sve_fcvt_f16f32: - case Intrinsic::aarch64_sve_fcvt_f16f64: - case Intrinsic::aarch64_sve_fcvt_f32f16: - case Intrinsic::aarch64_sve_fcvt_f32f64: - case Intrinsic::aarch64_sve_fcvt_f64f16: - case Intrinsic::aarch64_sve_fcvt_f64f32: - case Intrinsic::aarch64_sve_fcvtlt_f32f16: - case Intrinsic::aarch64_sve_fcvtlt_f64f32: - case Intrinsic::aarch64_sve_fcvtx_f32f64: - case Intrinsic::aarch64_sve_fcvtzs: - case Intrinsic::aarch64_sve_fcvtzs_i32f16: - case Intrinsic::aarch64_sve_fcvtzs_i32f64: - case Intrinsic::aarch64_sve_fcvtzs_i64f16: - case Intrinsic::aarch64_sve_fcvtzs_i64f32: - case Intrinsic::aarch64_sve_fcvtzu: - case Intrinsic::aarch64_sve_fcvtzu_i32f16: - case Intrinsic::aarch64_sve_fcvtzu_i32f64: - case Intrinsic::aarch64_sve_fcvtzu_i64f16: - case Intrinsic::aarch64_sve_fcvtzu_i64f32: - case Intrinsic::aarch64_sve_scvtf: - case Intrinsic::aarch64_sve_scvtf_f16i32: - case Intrinsic::aarch64_sve_scvtf_f16i64: - case Intrinsic::aarch64_sve_scvtf_f32i64: - case Intrinsic::aarch64_sve_scvtf_f64i32: - case Intrinsic::aarch64_sve_ucvtf: - case Intrinsic::aarch64_sve_ucvtf_f16i32: - case Intrinsic::aarch64_sve_ucvtf_f16i64: - case Intrinsic::aarch64_sve_ucvtf_f32i64: - case Intrinsic::aarch64_sve_ucvtf_f64i32: - return instCombineSVEAllOrNoActiveUnary(IC, II); - case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2: - case Intrinsic::aarch64_sve_fcvtnt_f16f32: - case Intrinsic::aarch64_sve_fcvtnt_f32f64: - case Intrinsic::aarch64_sve_fcvtxnt_f32f64: - return instCombineSVENoActiveReplace(IC, II, true); - case Intrinsic::aarch64_sve_st1_scatter: - case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: - case Intrinsic::aarch64_sve_st1_scatter_sxtw: - case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - case Intrinsic::aarch64_sve_st1_scatter_uxtw: - case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - case Intrinsic::aarch64_sve_st1dq: - case Intrinsic::aarch64_sve_st1q_scatter_index: - case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset: - case Intrinsic::aarch64_sve_st1q_scatter_vector_offset: - case Intrinsic::aarch64_sve_st1wq: - case Intrinsic::aarch64_sve_stnt1: - case Intrinsic::aarch64_sve_stnt1_scatter: - case Intrinsic::aarch64_sve_stnt1_scatter_index: - case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: - case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: - return instCombineSVENoActiveUnaryErase(IC, II, 1); - case Intrinsic::aarch64_sve_st2: - case Intrinsic::aarch64_sve_st2q: - return instCombineSVENoActiveUnaryErase(IC, II, 2); - case Intrinsic::aarch64_sve_st3: - case Intrinsic::aarch64_sve_st3q: - return instCombineSVENoActiveUnaryErase(IC, II, 3); - case Intrinsic::aarch64_sve_st4: - case Intrinsic::aarch64_sve_st4q: - return instCombineSVENoActiveUnaryErase(IC, II, 4); - case Intrinsic::aarch64_sve_addqv: - case Intrinsic::aarch64_sve_and_z: - case Intrinsic::aarch64_sve_bic_z: - case Intrinsic::aarch64_sve_brka_z: - case 
Intrinsic::aarch64_sve_brkb_z: - case Intrinsic::aarch64_sve_brkn_z: - case Intrinsic::aarch64_sve_brkpa_z: - case Intrinsic::aarch64_sve_brkpb_z: - case Intrinsic::aarch64_sve_cntp: - case Intrinsic::aarch64_sve_compact: - case Intrinsic::aarch64_sve_eor_z: - case Intrinsic::aarch64_sve_eorv: - case Intrinsic::aarch64_sve_eorqv: - case Intrinsic::aarch64_sve_nand_z: - case Intrinsic::aarch64_sve_nor_z: - case Intrinsic::aarch64_sve_orn_z: - case Intrinsic::aarch64_sve_orr_z: - case Intrinsic::aarch64_sve_orv: - case Intrinsic::aarch64_sve_orqv: - case Intrinsic::aarch64_sve_pnext: - case Intrinsic::aarch64_sve_rdffr_z: - case Intrinsic::aarch64_sve_saddv: - case Intrinsic::aarch64_sve_uaddv: - case Intrinsic::aarch64_sve_umaxv: - case Intrinsic::aarch64_sve_umaxqv: - case Intrinsic::aarch64_sve_cmpeq: - case Intrinsic::aarch64_sve_cmpeq_wide: - case Intrinsic::aarch64_sve_cmpge: - case Intrinsic::aarch64_sve_cmpge_wide: - case Intrinsic::aarch64_sve_cmpgt: - case Intrinsic::aarch64_sve_cmpgt_wide: - case Intrinsic::aarch64_sve_cmphi: - case Intrinsic::aarch64_sve_cmphi_wide: - case Intrinsic::aarch64_sve_cmphs: - case Intrinsic::aarch64_sve_cmphs_wide: - case Intrinsic::aarch64_sve_cmple_wide: - case Intrinsic::aarch64_sve_cmplo_wide: - case Intrinsic::aarch64_sve_cmpls_wide: - case Intrinsic::aarch64_sve_cmplt_wide: - case Intrinsic::aarch64_sve_facge: - case Intrinsic::aarch64_sve_facgt: - case Intrinsic::aarch64_sve_fcmpeq: - case Intrinsic::aarch64_sve_fcmpge: - case Intrinsic::aarch64_sve_fcmpgt: - case Intrinsic::aarch64_sve_fcmpne: - case Intrinsic::aarch64_sve_fcmpuo: - case Intrinsic::aarch64_sve_ld1_gather: - case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: - case Intrinsic::aarch64_sve_ld1_gather_sxtw: - case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ld1_gather_uxtw: - case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ld1q_gather_index: - case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset: - case Intrinsic::aarch64_sve_ld1q_gather_vector_offset: - case Intrinsic::aarch64_sve_ld1ro: - case Intrinsic::aarch64_sve_ld1rq: - case Intrinsic::aarch64_sve_ld1udq: - case Intrinsic::aarch64_sve_ld1uwq: - case Intrinsic::aarch64_sve_ld2_sret: - case Intrinsic::aarch64_sve_ld2q_sret: - case Intrinsic::aarch64_sve_ld3_sret: - case Intrinsic::aarch64_sve_ld3q_sret: - case Intrinsic::aarch64_sve_ld4_sret: - case Intrinsic::aarch64_sve_ld4q_sret: - case Intrinsic::aarch64_sve_ldff1: - case Intrinsic::aarch64_sve_ldff1_gather: - case Intrinsic::aarch64_sve_ldff1_gather_index: - case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: - case Intrinsic::aarch64_sve_ldff1_gather_sxtw: - case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: - case Intrinsic::aarch64_sve_ldff1_gather_uxtw: - case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: - case Intrinsic::aarch64_sve_ldnf1: - case Intrinsic::aarch64_sve_ldnt1: - case Intrinsic::aarch64_sve_ldnt1_gather: - case Intrinsic::aarch64_sve_ldnt1_gather_index: - case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: - case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: - return instCombineSVENoActiveZero(IC, II); - case Intrinsic::aarch64_sve_prf: - case Intrinsic::aarch64_sve_prfb_gather_index: - case Intrinsic::aarch64_sve_prfb_gather_scalar_offset: - case Intrinsic::aarch64_sve_prfb_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfb_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfd_gather_index: - case Intrinsic::aarch64_sve_prfd_gather_scalar_offset: - case 
Intrinsic::aarch64_sve_prfd_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfd_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfh_gather_index: - case Intrinsic::aarch64_sve_prfh_gather_scalar_offset: - case Intrinsic::aarch64_sve_prfh_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfh_gather_uxtw_index: - case Intrinsic::aarch64_sve_prfw_gather_index: - case Intrinsic::aarch64_sve_prfw_gather_scalar_offset: - case Intrinsic::aarch64_sve_prfw_gather_sxtw_index: - case Intrinsic::aarch64_sve_prfw_gather_uxtw_index: - return instCombineSVENoActiveUnaryErase(IC, II, 0); case Intrinsic::aarch64_neon_fmaxnm: case Intrinsic::aarch64_neon_fminnm: return instCombineMaxMinNM(IC, II); @@ -2427,39 +2646,14 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, case Intrinsic::aarch64_sve_ptest_first: case Intrinsic::aarch64_sve_ptest_last: return instCombineSVEPTest(IC, II); - case Intrinsic::aarch64_sve_fabd: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u); case Intrinsic::aarch64_sve_fadd: return instCombineSVEVectorFAdd(IC, II); case Intrinsic::aarch64_sve_fadd_u: return instCombineSVEVectorFAddU(IC, II); - case Intrinsic::aarch64_sve_fdiv: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u); - case Intrinsic::aarch64_sve_fmax: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u); - case Intrinsic::aarch64_sve_fmaxnm: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u); - case Intrinsic::aarch64_sve_fmin: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u); - case Intrinsic::aarch64_sve_fminnm: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u); - case Intrinsic::aarch64_sve_fmla: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u); - case Intrinsic::aarch64_sve_fmls: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u); case Intrinsic::aarch64_sve_fmul: - if (auto II_U = - instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u)) - return II_U; - return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); + return instCombineSVEVectorMul(IC, II); case Intrinsic::aarch64_sve_fmul_u: - return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); - case Intrinsic::aarch64_sve_fmulx: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u); - case Intrinsic::aarch64_sve_fnmla: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u); - case Intrinsic::aarch64_sve_fnmls: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u); + return instCombineSVEVectorMul(IC, II); case Intrinsic::aarch64_sve_fsub: return instCombineSVEVectorFSub(IC, II); case Intrinsic::aarch64_sve_fsub_u: @@ -2470,57 +2664,16 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVEVectorFuseMulAddSub( IC, II, true); - case Intrinsic::aarch64_sve_mla: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u); - case Intrinsic::aarch64_sve_mls: - return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u); case Intrinsic::aarch64_sve_mul: - if (auto II_U = - instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u)) - return II_U; - return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); + return instCombineSVEVectorMul(IC, II); case Intrinsic::aarch64_sve_mul_u: - return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); - case 
Intrinsic::aarch64_sve_sabd:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
-  case Intrinsic::aarch64_sve_smax:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
-  case Intrinsic::aarch64_sve_smin:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
-  case Intrinsic::aarch64_sve_smulh:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
+    return instCombineSVEVectorMul(IC, II);
   case Intrinsic::aarch64_sve_sub:
     return instCombineSVEVectorSub(IC, II);
   case Intrinsic::aarch64_sve_sub_u:
     return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
                                              Intrinsic::aarch64_sve_mls_u>(
         IC, II, true);
-  case Intrinsic::aarch64_sve_uabd:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
-  case Intrinsic::aarch64_sve_umax:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
-  case Intrinsic::aarch64_sve_umin:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
-  case Intrinsic::aarch64_sve_umulh:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
-  case Intrinsic::aarch64_sve_asr:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
-  case Intrinsic::aarch64_sve_lsl:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
-  case Intrinsic::aarch64_sve_lsr:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
-  case Intrinsic::aarch64_sve_and:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
-  case Intrinsic::aarch64_sve_bic:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
-  case Intrinsic::aarch64_sve_eor:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
-  case Intrinsic::aarch64_sve_orr:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
-  case Intrinsic::aarch64_sve_sqsub:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
-  case Intrinsic::aarch64_sve_uqsub:
-    return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
   case Intrinsic::aarch64_sve_tbl:
     return instCombineSVETBL(IC, II);
   case Intrinsic::aarch64_sve_uunpkhi:

From 8f56394487a4d454be0637667267ad37bd636d0f Mon Sep 17 00:00:00 2001
From: Anutosh Bhat
Date: Tue, 1 Apr 2025 18:03:45 +0530
Subject: [PATCH 0232/1029] [clang-repl] Implement LoadDynamicLibrary for
 clang-repl wasm use cases (#133037)

**Currently we don't make use of the JIT for the wasm use cases, so the
approach using the execution engine won't work in these cases.** Instead, we
can use dlopen.
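In outline: under Emscripten's dynamic linking model (assuming the usual setup
of a MAIN_MODULE interpreter with libraries built as side modules), dlopen with
RTLD_GLOBAL pulls the library's exports into the global symbol table, where a
later process-wide dlsym lookup can find them. A minimal sketch of that
interplay, not the patch itself (the authoritative changes are the
Interpreter.cpp and Wasm.cpp hunks below; loadAndResolve is an illustrative
name):

```
#include <dlfcn.h>

// Load a side module, then check that one of its exports became visible.
// RTLD_GLOBAL is what makes the later RTLD_DEFAULT lookup succeed.
static bool loadAndResolve(const char *lib, const char *sym) {
  void *handle = dlopen(lib, RTLD_NOW | RTLD_GLOBAL);
  if (!handle)
    return false; // dlerror() carries the details
  return dlsym(RTLD_DEFAULT, sym) != nullptr;
}
```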
We should be able to do the following (demonstrating through a toy project):

1) Make use of LoadDynamicLibrary through the given implementation

```
extern "C" EMSCRIPTEN_KEEPALIVE int load_library(const char *name) {
  auto Err = Interp->LoadDynamicLibrary(name);
  if (Err) {
    llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(),
                                "load_library error: ");
    return -1;
  }
  return 0;
}
```

2) Add a button to call load_library once the library has been added in our
MEMFS (currently we have symengine built as a SIDE MODULE and we are
loading it)
---
 clang/lib/Interpreter/IncrementalExecutor.h |  2 +-
 clang/lib/Interpreter/Interpreter.cpp       | 10 ++++++++++
 clang/lib/Interpreter/Wasm.cpp              | 13 +++++++++++++
 clang/lib/Interpreter/Wasm.h                |  3 +++
 4 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Interpreter/IncrementalExecutor.h b/clang/lib/Interpreter/IncrementalExecutor.h
index dbd61f0b8b1eb..71d71bc3883e2 100644
--- a/clang/lib/Interpreter/IncrementalExecutor.h
+++ b/clang/lib/Interpreter/IncrementalExecutor.h
@@ -57,7 +57,7 @@ class IncrementalExecutor {
   virtual llvm::Error removeModule(PartialTranslationUnit &PTU);
   virtual llvm::Error runCtors() const;
   virtual llvm::Error cleanUp();
-  llvm::Expected<llvm::orc::ExecutorAddr>
+  virtual llvm::Expected<llvm::orc::ExecutorAddr>
   getSymbolAddress(llvm::StringRef Name, SymbolNameKind NameKind) const;

   llvm::orc::LLJIT &GetExecutionEngine() { return *Jit; }

diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index fa4c1439c9261..f8c8d0a425659 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Support/VirtualFileSystem.h"
 #ifdef __EMSCRIPTEN__
 #include "Wasm.h"
+#include <dlfcn.h>
 #endif // __EMSCRIPTEN__

 #include "clang/AST/ASTConsumer.h"
@@ -711,6 +712,14 @@ llvm::Error Interpreter::Undo(unsigned N) {
 }

 llvm::Error Interpreter::LoadDynamicLibrary(const char *name) {
+#ifdef __EMSCRIPTEN__
+  void *handle = dlopen(name, RTLD_NOW | RTLD_GLOBAL);
+  if (!handle) {
+    llvm::errs() << dlerror() << '\n';
+    return llvm::make_error<llvm::StringError>("Failed to load dynamic library",
+                                               llvm::inconvertibleErrorCode());
+  }
+#else
   auto EE = getExecutionEngine();
   if (!EE)
     return EE.takeError();
@@ -722,6 +731,7 @@ llvm::Error Interpreter::LoadDynamicLibrary(const char *name) {
     EE->getMainJITDylib().addGenerator(std::move(*DLSG));
   else
     return DLSG.takeError();
+#endif

   return llvm::Error::success();
 }

diff --git a/clang/lib/Interpreter/Wasm.cpp b/clang/lib/Interpreter/Wasm.cpp
index f7cb7598c77f8..0543a3504c9a2 100644
--- a/clang/lib/Interpreter/Wasm.cpp
+++ b/clang/lib/Interpreter/Wasm.cpp
@@ -144,6 +144,19 @@ llvm::Error WasmIncrementalExecutor::cleanUp() {
   return llvm::Error::success();
 }

+llvm::Expected<llvm::orc::ExecutorAddr>
+WasmIncrementalExecutor::getSymbolAddress(llvm::StringRef Name,
+                                          SymbolNameKind NameKind) const {
+  void *Sym = dlsym(RTLD_DEFAULT, Name.str().c_str());
+  if (!Sym) {
+    return llvm::make_error<llvm::StringError>("dlsym failed for symbol: " +
+                                                   Name.str(),
+                                               llvm::inconvertibleErrorCode());
+  }
+
+  return llvm::orc::ExecutorAddr::fromPtr(Sym);
+}
+
 WasmIncrementalExecutor::~WasmIncrementalExecutor() = default;

 } // namespace clang

diff --git a/clang/lib/Interpreter/Wasm.h b/clang/lib/Interpreter/Wasm.h
index 4632613326d39..9a752934e3185 100644
--- a/clang/lib/Interpreter/Wasm.h
+++ b/clang/lib/Interpreter/Wasm.h
@@ -29,6 +29,9 @@ class WasmIncrementalExecutor : public IncrementalExecutor {
   llvm::Error removeModule(PartialTranslationUnit &PTU) override;
   llvm::Error runCtors() const override;
   llvm::Error cleanUp() override;
+
llvm::Expected + getSymbolAddress(llvm::StringRef Name, + SymbolNameKind NameKind) const override; ~WasmIncrementalExecutor() override; }; From 6892d5428600113dade7b4ecf6b70bbab3198c90 Mon Sep 17 00:00:00 2001 From: Virginia Cangelosi Date: Tue, 1 Apr 2025 13:35:09 +0100 Subject: [PATCH 0233/1029] [Clang][LLVM] Implement single-single vectors MOP4{A/S} (#127797) Implement all single-single {BF/F/S/U/SU/US}MOP4{A/S} instructions in clang and llvm following the acle in https://github.com/ARM-software/acle/pull/381/files --- clang/include/clang/Basic/arm_sme.td | 81 +++ .../sme2-intrinsics/acle_sme2_mop4_1x1.c | 465 ++++++++++++++++++ .../acle_sme2p2_imm.cpp | 84 ++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 24 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 68 +-- llvm/lib/Target/AArch64/SMEInstrFormats.td | 87 +++- .../AArch64/sme2-intrinsics-mop4a_1x1.ll | 419 ++++++++++++++++ 7 files changed, 1175 insertions(+), 53 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c create mode 100644 clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 288a8c04c217f..5012874a08790 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -289,6 +289,87 @@ multiclass ZAFPOuterProd { defm SVMOPA : ZAFPOuterProd<"mopa">; defm SVMOPS : ZAFPOuterProd<"mops">; +//////////////////////////////////////////////////////////////////////////////// +// SME2 - FMOP4A, FMOP4S, BFMOP4A, BFMOP4S + +multiclass MOP4 checks> { + def _1x1 : Inst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{d}]", "vidd", t, MergeNone, i # "_1x1", [IsInOutZA, IsStreaming], checks>; +} + +let SMETargetGuard = "sme2,sme-mop4" in { + defm SVFMOP4A_HtoS : MOP4<"a", "_za32", "hb", "aarch64_sme_mop4a_wide", [ImmCheck<0, ImmCheck0_3>]>; + defm SVFMOP4S_HtoS : MOP4<"s", "_za32", "hb", "aarch64_sme_mop4s_wide", [ImmCheck<0, ImmCheck0_3>]>; + defm SVFMOP4A_S : MOP4<"a", "_za32", "f", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_3>]>; + defm SVFMOP4S_S : MOP4<"s", "_za32", "f", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_3>]>; +} + +let SMETargetGuard = "sme2,sme-mop4,sme-f64f64" in { + defm SVFMOP4A_D : MOP4<"a", "_za64", "d", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_7>]>; + defm SVFMOP4S_D : MOP4<"s", "_za64", "d", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_7>]>; +} + +let SMETargetGuard = "sme2,sme-mop4,sme-f16f16" in { + defm SVFMOP4A_H : MOP4<"a", "_za16", "h", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_1>]>; + defm SVFMOP4S_H : MOP4<"s", "_za16", "h", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_1>]>; +} + +let SMETargetGuard = "sme2,sme-mop4,sme-b16b16" in { + defm SVBMOP4A_H : MOP4<"a", "_za16", "b", "aarch64_sme_mop4a", [ImmCheck<0, ImmCheck0_1>]>; + defm SVBMOP4S_H : MOP4<"s", "_za16", "b", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_1>]>; +} + +//////////////////////////////////////////////////////////////////////////////// +// SME2 - SMOP4A, SMOP4S, UMOP4A, UMOP4S + +let SMETargetGuard = "sme2,sme-mop4" in { + defm SVSMOP4A_H : MOP4<"a", "_za32", "cs", "aarch64_sme_smop4a_wide", [ImmCheck<0, ImmCheck0_3>]>; + defm SVSMOP4S_H : MOP4<"s", "_za32", "cs", "aarch64_sme_smop4s_wide", [ImmCheck<0, ImmCheck0_3>]>; + + defm SVUMOP4A_H : MOP4<"a", "_za32", "UcUs", "aarch64_sme_umop4a_wide", [ImmCheck<0, ImmCheck0_3>]>; + defm SVUMOP4S_H : MOP4<"s", "_za32", "UcUs", "aarch64_sme_umop4s_wide", 
[ImmCheck<0, ImmCheck0_3>]>; +} + +let SMETargetGuard = "sme2,sme-mop4,sme-i16i64" in { + defm SVSMOP4A_HtoD : MOP4<"a", "_za64", "s", "aarch64_sme_smop4a_za64_wide", [ImmCheck<0, ImmCheck0_7>]>; + defm SVSMOP4S_HtoD : MOP4<"s", "_za64", "s", "aarch64_sme_smop4s_za64_wide", [ImmCheck<0, ImmCheck0_7>]>; + + defm SVUMOP4A_HtoD : MOP4<"a", "_za64", "Us", "aarch64_sme_umop4a_za64_wide", [ImmCheck<0, ImmCheck0_7>]>; + defm SVUMOP4S_HtoD : MOP4<"s", "_za64", "Us", "aarch64_sme_umop4s_za64_wide", [ImmCheck<0, ImmCheck0_7>]>; +} + +//////////////////////////////////////////////////////////////////////////////// +// SME2 - SUMOP4A, SUMOP4S, USMOP4A, USMOP4S + +multiclass SUMOP4 checks> { + def _1x1 : SInst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{3}]", + "vidu", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_1x1", + [IsStreaming, IsInOutZA], + checks>; +} + +multiclass USMOP4 checks> { + def _1x1 : SInst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{3}]", + "vidx", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_1x1", + [IsStreaming, IsInOutZA], + checks>; +} + +let SMETargetGuard = "sme2,sme-mop4" in { + defm SVSUMOP4A_S : SUMOP4<"a", "_za32", "c", "", [ImmCheck<0, ImmCheck0_3>]>; + defm SVSUMOP4S_S : SUMOP4<"s", "_za32", "c", "", [ImmCheck<0, ImmCheck0_3>]>; + + defm SVUSMOP4A_S : USMOP4<"a", "_za32", "Uc", "", [ImmCheck<0, ImmCheck0_3>]>; + defm SVUSMOP4S_S : USMOP4<"s", "_za32", "Uc", "", [ImmCheck<0, ImmCheck0_3>]>; +} + +let SMETargetGuard = "sme2,sme-mop4,sme-i16i64" in { + defm SVSUMOP4A_D : SUMOP4<"a", "_za64", "s", "_za64", [ImmCheck<0, ImmCheck0_7>]>; + defm SVSUMOP4S_D : SUMOP4<"s", "_za64", "s", "_za64", [ImmCheck<0, ImmCheck0_7>]>; + + defm SVUSMOP4A_D : USMOP4<"a", "_za64", "Us", "_za64", [ImmCheck<0, ImmCheck0_7>]>; + defm SVUSMOP4S_D : USMOP4<"s", "_za64", "Us", "_za64", [ImmCheck<0, ImmCheck0_7>]>; +} + //////////////////////////////////////////////////////////////////////////////// // SME2 - ADD, SUB diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c new file mode 100644 index 0000000000000..94a839d053479 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c @@ -0,0 +1,465 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature 
+sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SME_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3, A4_UNUSED) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_s8u10__SVInt8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_s8u10__SVInt8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_u8u11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svmop4a_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_s8_u8u10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x1_za32_u8_s8u11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_u8_s8(svuint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_s16_s16u11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_s16_s16u11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_u16_u16u12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_u16_u16u12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f16_f16u13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f16_f16u13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za32_bf16_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za32_bf16_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: 
ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_s16u11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_s16u11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za64_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_u16u12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_u16u12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za64_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_s16_u16u11__SVInt16_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za64_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za64,_s16_u16)(1, zn, zm); +} + +// 
CHECK-LABEL: @test_svmop4a_1x1_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za64,_u16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_u16_s16u12__SVUint16_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za64_u16_s16(svuint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za64,_u16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za16_f16_f16u13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za16_f16_f16u13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za16_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za32_f32_f32u13__SVFloat32_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za32_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za32_f32_f32u13__SVFloat32_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za32_f32_f32(svfloat32_t 
zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x1_za64_f64_f64u13__SVFloat64_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x1_za64_f64_f64u13__SVFloat64_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za64_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x1_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x1_za16_bf16_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x1,_za16,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x1_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x1_za16_bf16_bf16u14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x1_za16_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x1,_za16,_bf16_bf16)(1, zn, zm); +} diff --git a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp new file mode 100644 index 0000000000000..556cb1742dbbd --- /dev/null +++ b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp @@ -0,0 +1,84 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \ +// RUN: -target-feature +sme -target-feature +sme2p2 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +#include + +void tests_mop4_imm_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_s8_s8(-1, zn, zm); // expected-error 
{{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4a_1x1_za32_u8_s8(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_u8_s8(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x1_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x1_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4a_1x1_za64_u16_s16(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x1_za64_u16_s16(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x1_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_1x1_za16_f16_f16(-1, zn, zm); // expected-error 
{{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_1x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; + +} + +void tests_mop4_imm_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6dfc3c8f2a393..fe8769154b1da 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3064,6 +3064,28 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_usmopa_wide : SME_OuterProduct_Intrinsic; def int_aarch64_sme_usmops_wide : SME_OuterProduct_Intrinsic; + class SME_OuterProduct_QuarterTile_Single_Single + : DefaultAttrsIntrinsic<[], + [llvm_i32_ty, + llvm_anyvector_ty, + LLVMMatchType<0>], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + + // 2-way and 4-way multi-vector signed/unsigned Quarter Tile Quarter Product A/S + foreach mode = ["s", "a"] in { + foreach za = ["", "_za64"] in { + foreach ty = ["s", "u", "su", "us"] in { + def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x1" : SME_OuterProduct_QuarterTile_Single_Single; + } + } + } + + // 2-way and 4-way multi-vector floating point Quarter Tile Quarter Product A/S + foreach mode = ["s", "a"] in { + foreach wide = ["", "_wide"] in { + def int_aarch64_sme_mop4 # mode # wide # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single; + } + } + class SME_AddVectorToTile_Intrinsic : DefaultAttrsIntrinsic<[], [llvm_i32_ty, @@ -3835,7 +3857,7 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_luti4_lane_zt_x2 : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty], [ImmArg>, ImmArg>, IntrReadMem]>; - + // // Lookup table expand four registers // diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index d2aa86f388db2..f992f73171e0e 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -148,30 +148,30 @@ defm USMOPS_MPPZZ_D : sme_int_outer_product_i64<0b101, "usmops", int_aarch64_sme } let Predicates = [HasSME_MOP4] in { - defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a">; - defm SMOP4S : 
sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s">; - defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a">; - defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s">; - defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a">; - defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s">; - defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a">; - defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s">; - - defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a">; - defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s">; - defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a">; - defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s">; + defm SMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">; + defm SMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">; + defm SUMOP4A : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_wide">; + defm SUMOP4S : sme_quarter_outer_product_i8_i32<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_wide">; + defm USMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_wide">; + defm USMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b0, 0b1, "usmop4s", "int_aarch64_sme_usmop4s_wide">; + defm UMOP4A : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">; + defm UMOP4S : sme_quarter_outer_product_i8_i32<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">; + + defm SMOP4A : sme_quarter_outer_product_i16_i32<0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_wide">; + defm SMOP4S : sme_quarter_outer_product_i16_i32<0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_wide">; + defm UMOP4A : sme_quarter_outer_product_i16_i32<0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_wide">; + defm UMOP4S : sme_quarter_outer_product_i16_i32<0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_wide">; } let Predicates = [HasSME_MOP4, HasSMEI16I64] in { - defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a">; - defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s">; - defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a">; - defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s">; - defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a">; - defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s">; - defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a">; - defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s">; + defm SMOP4A : sme_quarter_outer_product_i64<0b0, 0b0, 0b0, "smop4a", "int_aarch64_sme_smop4a_za64_wide">; + defm SMOP4S : sme_quarter_outer_product_i64<0b0, 0b0, 0b1, "smop4s", "int_aarch64_sme_smop4s_za64_wide">; + defm SUMOP4A : sme_quarter_outer_product_i64<0b0, 0b1, 0b0, "sumop4a", "int_aarch64_sme_sumop4a_za64_wide">; + defm SUMOP4S : sme_quarter_outer_product_i64<0b0, 0b1, 0b1, "sumop4s", "int_aarch64_sme_sumop4s_za64_wide">; + defm UMOP4A : sme_quarter_outer_product_i64<0b1, 0b1, 0b0, "umop4a", "int_aarch64_sme_umop4a_za64_wide">; + defm UMOP4S : sme_quarter_outer_product_i64<0b1, 0b1, 0b1, "umop4s", "int_aarch64_sme_umop4s_za64_wide">; + defm USMOP4A : sme_quarter_outer_product_i64<0b1, 0b0, 0b0, "usmop4a", "int_aarch64_sme_usmop4a_za64_wide">; + defm USMOP4S : sme_quarter_outer_product_i64<0b1, 0b0, 0b1, "usmop4s", 
"int_aarch64_sme_usmop4s_za64_wide">; } let Predicates = [HasSME_TMOP] in { @@ -1054,14 +1054,14 @@ let Predicates = [HasSME2, HasSVEBFSCALE] in { } let Predicates = [HasSME_MOP4] in { - defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a">; - defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s">; + defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a", "int_aarch64_sme_mop4a_wide">; + defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s", "int_aarch64_sme_mop4s_wide">; - defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a">; - defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s">; + defm FMOP4A : sme2_fmop4as_fp16_fp32_widening<0, "fmop4a", "int_aarch64_sme_mop4a_wide">; + defm FMOP4S : sme2_fmop4as_fp16_fp32_widening<1, "fmop4s", "int_aarch64_sme_mop4s_wide">; - defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a">; - defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s">; + defm FMOP4A : sme2_fmop4as_fp32_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">; + defm FMOP4S : sme2_fmop4as_fp32_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">; } let Predicates = [HasSME_TMOP] in { @@ -1084,7 +1084,7 @@ let Predicates = [HasSME_TMOP, HasSMEB16B16] in { let Predicates = [HasSME_TMOP, HasSMEF8F32], Uses = [FPMR, FPCR] in { def FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, "ftmopa">; -} +} let Predicates = [HasSME_TMOP, HasSMEF8F16], Uses = [FPMR, FPCR] in { def FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, "ftmopa">; @@ -1099,8 +1099,8 @@ let Predicates = [HasSME_TMOP, HasSMEF16F16] in { } let Predicates = [HasSME_MOP4, HasSMEF16F16] in { - defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a">; - defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s">; + defm FMOP4A : sme2_fmop4as_fp16_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">; + defm FMOP4S : sme2_fmop4as_fp16_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">; } let Predicates = [HasSME2, HasSVEBFSCALE] in { @@ -1115,11 +1115,11 @@ let Predicates = [HasSME_MOP4, HasSMEF8F32] in { } let Predicates = [HasSME_MOP4, HasSMEB16B16] in { - defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a">; - defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s">; + defm BFMOP4A : sme2_bfmop4as_non_widening<0, "bfmop4a", "int_aarch64_sme_mop4a">; + defm BFMOP4S : sme2_bfmop4as_non_widening<1, "bfmop4s", "int_aarch64_sme_mop4s">; } let Predicates = [HasSME_MOP4, HasSMEF64F64] in { - defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a">; - defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s">; + defm FMOP4A : sme2_fmop4as_fp64_non_widening<0, "fmop4a", "int_aarch64_sme_mop4a">; + defm FMOP4S : sme2_fmop4as_fp64_non_widening<1, "fmop4s", "int_aarch64_sme_mop4s">; } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 4f6a413ba5e5c..54c63ead059ae 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -104,6 +104,15 @@ class sme_outer_product_pseudo let usesCustomInserter = 1; } +class sme2_quarter_tile_outer_product_pseudo + : Pseudo<(outs), (ins i32imm:$tile, + zn_ty:$zn, zm_ty:$zm), []>, + Sched<[]> { + // Translated to the actual instructions in AArch64ISelLowering.cpp + let SMEMatrixType = za_flag; + let usesCustomInserter = 1; +} + class sme2_za_array_2op_multi_single_pseudo : SMEPseudo2Instr, @@ -257,6 +266,9 @@ class SME2_Tile_Movaz_Pat(name # _PSEUDO) $tile, $base, $offset)>; +class SME2_ZA_Tile_Vec_Single_Single_Pat + : Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm), + (!cast(name # 
_PSEUDO) $tile, $Zn, $Zm)>; //===----------------------------------------------------------------------===// // SME pattern match helpers. @@ -600,9 +612,14 @@ class sme_quarter_outer_product_i16_i32{ +multiclass sme_quarter_outer_product_i8_i32{ def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr, - ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>; + ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _MZZ_BToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv16i8>; + def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr, ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>; def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr, @@ -611,9 +628,14 @@ multiclass sme_quarter_outer_product_i8_i32; } -multiclass sme_quarter_outer_product_i16_i32{ +multiclass sme_quarter_outer_product_i16_i32{ def _MZZ_HToS : sme_quarter_outer_product_i16_i32; + ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _MZZ_HToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv8i16>; + def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32; def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32; } -multiclass sme_quarter_outer_product_i64{ +multiclass sme_quarter_outer_product_i64{ def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr, - ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>; + ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _MZZ_HtoD_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_7, nxv8i16>; + def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>; def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr, @@ -5470,9 +5497,13 @@ class sme2_bf16_fp32_quarter_tile_outer_product { +multiclass sme2_bfmop4as_widening { // Single vectors - def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>; + def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _MZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv8bf16>; // Multiple and single vectors def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; @@ -5617,9 +5648,13 @@ class sme2_fp16_quarter_tile_outer_product { +multiclass sme2_fmop4as_fp16_non_widening { // Single vectors - def _MZZ_H : sme2_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>; + def _MZZ_H : sme2_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _MZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_1, nxv8f16>; // Multiple and single vectors def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; @@ -5689,9 +5724,13 @@ class sme2_bf16_fp16_quarter_tile_outer_product { +multiclass sme2_bfmop4as_non_widening { // Single vectors - def _MZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>; + def _MZZ_H : 
sme2_bf16_fp16_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _MZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_1, nxv8bf16>; // Multiple and single vectors def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; @@ -5726,9 +5765,13 @@ class sme2_fp32_quarter_tile_outer_product { +multiclass sme2_fmop4as_fp32_non_widening { // Single vectors - def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>; + def _MZZ_S : sme2_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR32Mul2_Lo, ZPR32Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _MZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv4f32>; // Multiple and single vectors def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>; @@ -5763,9 +5806,13 @@ class sme2_fp64_quarter_tile_outer_product { +multiclass sme2_fmop4as_fp64_non_widening { // Single vectors - def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>; + def _MZZ_D : sme2_fp64_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR64Mul2_Lo, ZPR64Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _MZZ_D_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_7, nxv2f64>; // Multiple and single vectors def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>; @@ -5800,9 +5847,13 @@ class sme2_fp16_fp32_quarter_tile_outer_product { +multiclass sme2_fmop4as_fp16_fp32_widening { // Single vectors - def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>; + def _MZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _MZZ_HtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv8f16>; // Multiple and single vectors def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll new file mode 100644 index 0000000000000..ec899fab7cf21 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x1.ll @@ -0,0 +1,419 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +; Widening +define void @mop4a_za32_s8( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4a za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_s8( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_u8( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, 
z1.d +; CHECK-NEXT: umop4a za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_u8( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4s za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_s8_u8( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4a za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_s8_u8( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4s za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_u8_s8( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4a za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_u8_s8( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4s za0.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.wide.1x1.nxv16i8(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_s16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4a za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_s16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_u16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4a za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_u16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4s za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_f16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8f16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_f16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_bf16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4a za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.1x1.nxv8bf16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_bf16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_bf16: 
+; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4s za0.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za64_s16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4a za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za64_s16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za64_u16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4a za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za64_u16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4s za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za64_s16_u16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4a za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za64_s16_u16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4s za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za64_u16_s16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4a za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za64_u16_s16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4s za0.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.za64.wide.1x1.nxv8i16(i32 0, %zn, %zm) + ret void +} + +; Non-widening +define void @mop4a_za16_f16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.h, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x1.nxv8f16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za16_f16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.h, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za32_f32( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.s, z0.s, z24.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x1.nxv4f32(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za32_f32( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.s, z0.s, z24.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 0, %zn, %zm) 
+ ret void +} + +define void @mop4a_za64_f64( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.d, z0.d, z24.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x1.nxv2f64(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za64_f64( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.d, z0.d, z24.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 0, %zn, %zm) + ret void +} + +define void @mop4a_za16_bf16( %zn, %zm) #0 { +; CHECK-LABEL: mop4a_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4a za0.h, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x1.nxv8bf16(i32 0, %zn, %zm) + ret void +} + +define void @mop4s_za16_bf16( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4s za0.h, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 0, %zn, %zm) + ret void +} + +; Tile limits +define void @mop4s_za32_s8_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s8_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za3.s, z0.b, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv16i8(i32 3, %zn, %zm) + ret void +} + +define void @mop4s_za32_s16_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za3.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x1.nxv8i16(i32 3, %zn, %zm) + ret void +} + +define void @mop4s_za32_f16_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_f16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za3.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8f16(i32 3, %zn, %zm) + ret void +} + +define void @mop4s_za32_bf16_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_bf16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4s za3.s, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x1.nxv8bf16(i32 3, %zn, %zm) + ret void +} + +define void @mop4s_za64_s16_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za64_s16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za7.d, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.1x1.nxv8i16(i32 7, %zn, %zm) + ret void +} + +define void @mop4s_za64_f64_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za64_f64_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za7.d, z0.d, z24.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x1.nxv2f64(i32 7, %zn, %zm) + ret void +} + +define void @mop4s_za32_f32_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za32_f32_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za3.s, z0.s, z24.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x1.nxv4f32(i32 3, %zn, %zm) + ret void +} + +define void @mop4s_za16_f16_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za16_f16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za1.h, z0.h, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x1.nxv8f16(i32 1, %zn, %zm) + ret void +} + +define void @mop4s_za16_bf16_limit( %zn, %zm) #0 { +; CHECK-LABEL: mop4s_za16_bf16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: 
bfmop4s za1.h, z0.h, z24.h
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.mop4s.1x1.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
+  ret void
+}
+
+attributes #0 = {nounwind "target-features" = "+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" }

From a338f80ddcb97edd275c8bf949b1fab0c7d1049e Mon Sep 17 00:00:00 2001
From: Pablo Antonio Martinez
Date: Tue, 1 Apr 2025 13:39:33 +0100
Subject: [PATCH 0234/1029] [mlir][Linalg] Add transform to convert
 linalg.copy into memref.copy (#132422)

Targeted rewrite of a linalg.copy on memrefs to a memref.copy.
This is useful when bufferizing copies to a linalg.copy, applying some
transformations, and then rewriting the copy into a memref.copy.
If the element types of the source and destination differ, or if the
source is a scalar, the transform produces a silenceable failure.
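A minimal end-to-end sketch of the intended usage, mirroring the new test
added below (the function name and SSA handle names are illustrative only):

  func.func @copy(%input : memref<128x64xf32>, %output : memref<128x64xf32>) {
    linalg.copy ins(%input : memref<128x64xf32>) outs(%output : memref<128x64xf32>)
    return
  }

  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
      %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
      %1 = transform.structured.linalg_copy_to_memref %0 : (!transform.any_op) -> !transform.any_op
      transform.yield
    }
  }

Applying the transform rewrites the linalg.copy above into
memref.copy %input, %output : memref<128x64xf32> to memref<128x64xf32>.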
---
 .../Linalg/TransformOps/LinalgTransformOps.td |  34 +++++++
 .../TransformOps/LinalgTransformOps.cpp       |  65 +++++++++++++
 .../transform-op-linalg-copy-to-memref.mlir   |  94 +++++++++++++++++++
 3 files changed, 193 insertions(+)
 create mode 100644 mlir/test/Dialect/Linalg/transform-op-linalg-copy-to-memref.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 12080cee85c9d..15ea5e7bf7159 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -559,6 +559,40 @@ def InterchangeOp : Op<Transform_Dialect, "structured.interchange",
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// LinalgCopyToMemrefOp
+//===----------------------------------------------------------------------===//
+
+def LinalgCopyToMemrefOp : Op<Transform_Dialect, "structured.linalg_copy_to_memref",
+    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
+     TransformOpInterface, TransformEachOpTrait,
+     ReportTrackingListenerFailuresOpTrait]> {
+  let description = [{
+    Targeted rewrite of a linalg.copy on memrefs to a memref.copy.
+    This is useful when bufferizing copies to a linalg.copy, later applying some
+    transformations, and then rewriting the copy into a memref.copy.
+    If the element types of the source and destination differ, or if the source
+    is a scalar, the transform produces a silenceable failure.
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target);
+  let results = (outs TransformHandleTypeInterface:$transformed);
+
+  let assemblyFormat = "$target attr-dict `:` "
+                       "functional-type(operands, results) ";
+
+  let builders = [
+    OpBuilder<(ins "Value":$target)>,
+  ];
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::transform::TransformRewriter &rewriter,
+        ::mlir::Operation *target,
+        ::mlir::transform::ApplyToEachResultList &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // LowerPackOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 5d65e7e4666c3..c90ebe4487ca4 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1176,6 +1176,71 @@ LogicalResult transform::InterchangeOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// LinalgCopyToMemrefOp
+//===----------------------------------------------------------------------===//
+
+DiagnosedSilenceableFailure transform::LinalgCopyToMemrefOp::applyToOne(
+    transform::TransformRewriter &rewriter, Operation *targetOp,
+    transform::ApplyToEachResultList &results,
+    transform::TransformState &state) {
+
+  // Check if the target can be converted.
+  if (!isa<linalg::CopyOp>(targetOp)) {
+    DiagnosedSilenceableFailure diag =
+        emitSilenceableError() << "only linalg.copy target ops are supported";
+    diag.attachNote(targetOp->getLoc()) << "target op";
+    return diag;
+  }
+
+  auto copyOp = dyn_cast<linalg::CopyOp>(targetOp);
+  if (!copyOp.hasPureBufferSemantics()) {
+    DiagnosedSilenceableFailure diag =
+        emitSilenceableError()
+        << "cannot transform a linalg.copy on tensors into a memref.copy";
+    diag.attachNote(targetOp->getLoc()) << "target op";
+    return diag;
+  }
+
+  SmallVector<Value> inputs = copyOp.getInputs();
+  SmallVector<Value> outputs = copyOp.getOutputs();
+  assert(inputs.size() == 1 && "expected linalg copy op with one input");
+  assert(outputs.size() == 1 && "expected memref copy op with one output");
+  Value input = inputs.front();
+  Value output = outputs.front();
+
+  // linalg.copy supports different element types on source/dest whereas
+  // memref.copy does not, so we must check that the source and dest types can
+  // be handled by memref.copy and otherwise reject the transformation.
+  if (!dyn_cast<ShapedType>(input.getType())) {
+    DiagnosedSilenceableFailure diag =
+        emitSilenceableError()
+        << "cannot transform a linalg.copy which input has no shape";
+    diag.attachNote(targetOp->getLoc()) << "target op";
+    return diag;
+  }
+
+  // linalg.copy destination must be a shaped type.
+  assert(dyn_cast<ShapedType>(output.getType()));
+
+  if (cast<ShapedType>(input.getType()).getElementType() !=
+      cast<ShapedType>(output.getType()).getElementType()) {
+    DiagnosedSilenceableFailure diag =
+        emitSilenceableError()
+        << "cannot transform a linalg.copy with different source and "
+           "destination element types ";
+    diag.attachNote(targetOp->getLoc()) << "target op";
+    return diag;
+  }
+
+  // Target can be converted, do it.
+  auto memrefCopyOp =
+      rewriter.replaceOpWithNewOp<memref::CopyOp>(targetOp, input, output);
+
+  results.push_back(memrefCopyOp);
+  return DiagnosedSilenceableFailure::success();
+}
+
 //===----------------------------------------------------------------------===//
 // LowerPackOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Linalg/transform-op-linalg-copy-to-memref.mlir b/mlir/test/Dialect/Linalg/transform-op-linalg-copy-to-memref.mlir
new file mode 100644
index 0000000000000..7280ccbea2563
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-op-linalg-copy-to-memref.mlir
@@ -0,0 +1,94 @@
+// RUN: mlir-opt -transform-interpreter %s --split-input-file --allow-unregistered-dialect -verify-diagnostics | FileCheck %s
+
+// CHECK: func.func @linalg_copy_to_memref_copy(%[[INPUT:.*]]: memref<128x64xf32>, %[[OUTPUT:.*]]: memref<128x64xf32>) {
+// CHECK:   memref.copy %[[INPUT]], %[[OUTPUT]] : memref<128x64xf32> to memref<128x64xf32>
+// CHECK:   return
+// CHECK: }
+
+func.func @linalg_copy_to_memref_copy(%input : memref<128x64xf32>, %output : memref<128x64xf32>) {
+  linalg.copy ins(%input : memref<128x64xf32>) outs(%output : memref<128x64xf32>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.linalg_copy_to_memref %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK: func.func @linalg_copy_to_memref_copy_strides(%[[INPUT:.*]]: memref<128x32xf32>, %[[OUTPUT:.*]]: memref<128x64xf32>) {
+// CHECK:   %[[ALLOC:.*]] = memref.alloc() {alignment = 64 : i64} : memref<128x64xf32>
+// CHECK:   %[[SUBVIEW:.*]] = memref.subview %[[ALLOC]][0, 32] [128, 32] [1, 1] : memref<128x64xf32> to memref<128x32xf32, strided<[64, 1], offset: 32>>
+// CHECK:   memref.copy %[[INPUT]], %[[SUBVIEW]] : memref<128x32xf32> to memref<128x32xf32, strided<[64, 1], offset: 32>>
+// CHECK:   return
+// CHECK: }
+
+func.func @linalg_copy_to_memref_copy_strides(%input : memref<128x32xf32>, %output : memref<128x64xf32>) {
+  %alloc = memref.alloc() {alignment = 64 : i64} : memref<128x64xf32>
+  %subview = memref.subview %alloc[0, 32] [128, 32] [1, 1] : memref<128x64xf32> to memref<128x32xf32, strided<[64, 1], offset: 32>>
+  linalg.copy ins(%input : memref<128x32xf32>) outs(%subview : memref<128x32xf32, strided<[64, 1], offset: 32>>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.linalg_copy_to_memref %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_copy_to_memref_copy_tensors(%input : tensor<128x64xf32>, %output : tensor<128x64xf32>) -> tensor<128x64xf32> {
+  // expected-note @below {{target op}}
+  %0 = linalg.copy ins(%input : tensor<128x64xf32>) outs(%output : tensor<128x64xf32>) -> tensor<128x64xf32>
+  return %0 : tensor<128x64xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{cannot transform a linalg.copy on tensors into a memref.copy}}
+    %1 = transform.structured.linalg_copy_to_memref %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_copy_to_memref_copy_different_element(%input : memref<128x64xf32>, %output : memref<128x64xf64>) {
+  // expected-note @below {{target op}}
+  linalg.copy ins(%input : memref<128x64xf32>) outs(%output : memref<128x64xf64>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{cannot transform a linalg.copy with different source and destination element types}}
+    %1 = transform.structured.linalg_copy_to_memref %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+func.func @linalg_copy_to_memref_copy_scalar(%input : f64, %output : memref<128x64xf64>) {
+  // expected-note @below {{target op}}
+  linalg.copy ins(%input : f64) outs(%output : memref<128x64xf64>)
+  return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{cannot transform a linalg.copy which input has no shape}}
+    %1 = transform.structured.linalg_copy_to_memref %0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}

From 2c0b888359c6c5976054bb423ba1d7b37bae9f1a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 1 Apr 2025 13:47:52 +0100
Subject: [PATCH 0235/1029] [X86] combineX86ShuffleChain - prefer combining to
 X86ISD::SHUF128 if PERMQ operands are splittable (#133900)

If the 512-bit unary shuffle is a concatenation of 128/256-bit
subvectors then we're better off using an X86ISD::SHUF128 node so we
can fold the concatenation into the shuffle as well.
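For example, in the updated interleaved-store test below (the CHECK lines
are quoted verbatim from the diff), a subvector concatenation followed by a
cross-lane permute:

  vinserti32x4 $2, %xmm8, %zmm11, %zmm8
  vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]

is now emitted as a single lane shuffle that folds the concatenation:

  vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1]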
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |    3 +-
 .../vector-interleaved-store-i8-stride-7.ll   | 3304 ++++++++---------
 2 files changed, 1620 insertions(+), 1687 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b1745e5a30d7b..78762774f65e8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39797,7 +39797,8 @@ static SDValue combineX86ShuffleChain(
     // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
     // doesn't work because our mask is for 128 bits and we don't have an MVT
    // to match that.
-    bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
+    bool PreferPERMQ = UnaryShuffle && !isFreeToSplitVector(V1, DAG) &&
+                       isUndefOrInRange(ScaledMask[0], 0, 2) &&
                        isUndefOrInRange(ScaledMask[1], 0, 2) &&
                        isUndefOrInRange(ScaledMask[2], 2, 4) &&
                        isUndefOrInRange(ScaledMask[3], 2, 4) &&
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 6a5dbbc56d9bc..02ec9fc66feab 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -4145,8 +4145,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpor %xmm8, %xmm11, %xmm8
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-NEXT:    vinserti32x4 $2, %xmm8, %zmm11, %zmm8
-; AVX512-NEXT:    vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5]
+; AVX512-NEXT:    vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1]
 ; AVX512-NEXT:    vmovdqa (%rcx), %xmm14
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm11 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u]
 ; AVX512-NEXT:    vmovdqa (%rdx), %xmm15
@@ -4154,8 +4153,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT:    vpor %xmm11, %xmm12, %xmm11
 ; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
 ; AVX512-NEXT:    vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; 
AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512-NEXT: vpermq {{.*#+}} zmm17 = zmm8[0,1,0,1,4,5,4,5] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm13[0,1,0,1],zmm8[0,1,0,1] ; AVX512-NEXT: vmovdqa (%r10), %xmm13 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[1,1,0,0,4,5,6,7] @@ -4281,8 +4278,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm11, %zmm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1] ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm14 @@ -4290,8 +4286,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm11[0,1,0,1,4,5,4,5] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,0,1],zmm11[0,1,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm11 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[1,1,0,0,4,5,6,7] @@ -4305,8 +4300,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm0[0,1,0,1,4,5,4,5] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm0[0,1,0,1] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm17 ^ (mem & (zmm8 ^ zmm17)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] @@ -4417,8 +4411,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm11, %zmm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = 
zmm11[0,1,0,1],zmm8[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm14 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 @@ -4426,8 +4419,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm12, %zmm11 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm16 = zmm11[0,1,0,1,4,5,4,5] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm11[0,1,0,1] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] @@ -4436,8 +4428,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpor %xmm8, %xmm13, %xmm8 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm13, %zmm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm17 = zmm8[0,1,0,1,4,5,4,5] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm13[0,1,0,1],zmm8[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa (%r10), %xmm13 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[1,1,0,0,4,5,6,7] @@ -4553,8 +4544,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm11, %zmm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm14 @@ -4562,8 +4552,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm11[0,1,0,1,4,5,4,5] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,0,1],zmm11[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm11 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[1,1,0,0,4,5,6,7] 
@@ -4577,8 +4566,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm0[0,1,0,1,4,5,4,5] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm0[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm17 ^ (mem & (zmm8 ^ zmm17)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] @@ -4642,74 +4630,73 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] -; AVX512BW-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[0,1,14],zero,ymm13[12,13,0,1,14,15],zero,ymm13[3,12,13,2,3,16],zero,ymm13[30,31,28,29,16,17],zero,ymm13[31,18,19,28,29,18],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] +; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm14 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm11 
-; AVX512BW-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm8 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 +; AVX512BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 ; AVX512BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k1} ; AVX512BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k1} -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm8 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm8[18,19,20,21],zero,zmm8[19],zero,zmm8[25,26,27,22],zero,zmm8[20],zero,zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm8[55],zero,zero,zero,zero,zmm8[58],zero,zmm8[56],zero,zero,zero,zero,zmm8[59],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57] -; AVX512BW-NEXT: vporq %zmm8, %zmm15, %zmm8 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,1,1,4,4,5,5] ; AVX512BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512BW-NEXT: vporq %ymm16, %ymm17, %ymm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero +; AVX512BW-NEXT: vporq %ymm15, %ymm16, %ymm15 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = 
zmm14[2,3,2,3],zmm15[2,3,2,3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm14[18,19,20,21],zero,zmm14[19],zero,zmm14[25,26,27,22],zero,zmm14[20],zero,zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] +; AVX512BW-NEXT: vporq %zmm14, %zmm16, %zmm14 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,3,2,3,6,7,6,7] ; AVX512BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-NEXT: kmovq %rcx, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k2} +; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k2} ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero +; AVX512BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512BW-NEXT: vporq %zmm16, %zmm17, %zmm16 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] ; AVX512BW-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 @@ -4717,69 +4704,66 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} ; AVX512BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512BW-NEXT: kmovq %rcx, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm9, %zmm9 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] -; AVX512BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C -; AVX512BW-NEXT: kmovq %rcx, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm12, %zmm9 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-NEXT: vpermw %zmm7, %zmm11, %zmm11 -; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512BW-NEXT: kmovq %rcx, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} -; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 -; AVX512BW-NEXT: kmovq %rcx, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm14 {%k2} +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,3,3,6,6,7,7] ; AVX512BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7] -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; 
AVX512BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm9 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,2,3] ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm7, %ymm2, %ymm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] -; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,0,1],zmm7[0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6,u,u,u],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] +; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm9[0,1,0,1] +; AVX512BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX512BW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,0,1],zmm4[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} +; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm6 {%k1} +; AVX512BW-NEXT: vmovdqa %ymm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -4865,15 +4849,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm15[0,1,0,1] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm14[0,1,0,1] ; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} @@ -4882,8 +4864,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,0,1],zmm9[0,1,0,1] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 @@ -4913,11 +4894,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FCP-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -4927,74 +4908,73 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm3 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[0,1,14],zero,ymm4[12,13,0,1,14,15],zero,ymm4[3,12,13,2,3,16],zero,ymm4[30,31,28,29,16,17],zero,ymm4[31,18,19,28,29,18],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero,zero,ymm2[18] -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = 
zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[0,1,14],zero,ymm13[12,13,0,1,14,15],zero,ymm13[3,12,13,2,3,16],zero,ymm13[30,31,28,29,16,17],zero,ymm13[31,18,19,28,29,18],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] +; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 ; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k1} ; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm8 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm8[18,19,20,21],zero,zmm8[19],zero,zmm8[25,26,27,22],zero,zmm8[20],zero,zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm8[55],zero,zero,zero,zero,zmm8[58],zero,zmm8[56],zero,zero,zero,zero,zmm8[59],zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[18],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero,zmm15[59],zero,zmm15[57] -; AVX512DQ-BW-NEXT: vporq %zmm8, %zmm15, %zmm8 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,1,1,4,4,5,5] ; AVX512DQ-BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm1[23],zero,ymm1[21,22,23,26],zero,ymm1[24],zero,ymm1[28,29,26,27] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm17 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %ymm16, %ymm17, %ymm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %ymm15, %ymm16, %ymm15 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm14[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm14[18,19,20,21],zero,zmm14[19],zero,zmm14[25,26,27,22],zero,zmm14[20],zero,zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] +; AVX512DQ-BW-NEXT: vporq %zmm14, %zmm16, %zmm14 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm16, %zmm17, %zmm16 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 @@ -5002,69 +4982,66 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm8 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = 
xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6,u,u,u],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] -; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C -; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm12, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,1,0,1,4,5,4,5] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512DQ-BW-NEXT: vpermw %zmm7, %zmm11, %zmm11 -; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,3,3,6,6,7,7] ; AVX512DQ-BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 {%k2} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX512DQ-BW-NEXT: vpshufhw 
{{.*#+}} ymm1 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,3,3,4,6,7,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm9 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,2,3] ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,0,1],zmm7[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6,u,u,u],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] +; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm9[0,1,0,1] +; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero +; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = 
xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,0,1],zmm4[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-BW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa %ymm6, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -5150,15 +5127,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,1,0,1,4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm15[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm8, %zmm8 -; 
AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,1,0,1,4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm14[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} @@ -5167,8 +5142,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm11, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,1,0,1,4,5,4,5] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,0,1],zmm9[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 @@ -5198,11 +5172,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-BW-FCP-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper @@ -8425,1577 +8399,1535 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] -; AVX512-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512-NEXT: vmovdqa (%rcx), %ymm8 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24 -; 
AVX512-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %ymm8 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512-NEXT: vpshufb %ymm5, %ymm11, %ymm2 -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm14, %ymm3, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm15 = 
[128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm15, %ymm7, %ymm1 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] ; AVX512-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm7, %ymm13, %ymm3 -; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm3 +; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm6 -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512-NEXT: vpor %ymm0, %ymm6, %ymm0 +; AVX512-NEXT: vpshufb %ymm3, %ymm6, %ymm1 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] +; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm13 +; AVX512-NEXT: vpor %ymm1, %ymm13, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512-NEXT: vpshufb %ymm14, %ymm13, %ymm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512-NEXT: vpshufb %ymm1, %ymm10, %ymm6 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-NEXT: vpshufb %ymm5, %ymm1, %ymm5 -; AVX512-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm8, %ymm10, %ymm6 -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm25 -; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX512-NEXT: vpshufb %ymm7, %ymm6, %ymm7 -; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero,ymm14[27],zero,ymm14[25] +; AVX512-NEXT: 
vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23,u,u,u],zero,ymm13[26],zero,ymm13[24,u,u,u],zero,ymm13[27],zero +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512-NEXT: vpshufb %ymm15, %ymm1, %ymm15 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512-NEXT: vpor %ymm2, %ymm15, %ymm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] -; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm15 +; AVX512-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512-NEXT: vpor %ymm2, %ymm15, %ymm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm7 -; AVX512-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512-NEXT: vpor %ymm7, %ymm4, %ymm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] -; AVX512-NEXT: vpor %ymm4, %ymm7, %ymm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX512-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31] +; AVX512-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa 32(%rax), %ymm7 -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512-NEXT: vmovdqu64 
%zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX512-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512-NEXT: vmovdqa64 %xmm14, %xmm23 -; AVX512-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512-NEXT: vpshufb %xmm14, %xmm9, %xmm8 -; AVX512-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX512-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpermi2d %zmm4, %zmm8, %zmm28 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512-NEXT: vpshufb %xmm12, %xmm4, %xmm8 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm9 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm8 -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm9 -; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm4, %ymm9, %ymm12 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm12[2,3,2,3],zmm3[2,3,2,3] +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512-NEXT: vpshufb %ymm11, %ymm1, %ymm11 +; AVX512-NEXT: vpor %ymm3, %ymm11, %ymm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] ; AVX512-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512-NEXT: vpor %ymm6, %ymm5, %ymm0 +; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm6 -; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm5 -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512-NEXT: vpshufb %ymm13, %ymm14, %ymm1 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX512-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm5 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm6 -; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-NEXT: vpshufb 
%xmm14, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512-NEXT: vpshufb %xmm15, %xmm3, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm14 -; AVX512-NEXT: vpshufb %xmm12, %xmm14, %xmm2 -; AVX512-NEXT: vmovdqa (%r8), %xmm12 -; AVX512-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm29, %ymm2 -; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512-NEXT: vporq %ymm2, %ymm0, %ymm29 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] -; AVX512-NEXT: vporq %ymm0, %ymm2, %ymm25 -; AVX512-NEXT: vpshufb %ymm8, %ymm13, %ymm0 -; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX512-NEXT: vporq %ymm0, %ymm2, %ymm22 -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = 
[u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512-NEXT: vpshufb %xmm14, %xmm2, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rax), %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm13 -; AVX512-NEXT: vmovdqa (%rax), %xmm11 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4],zero,xmm3[u,u,u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25] +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512-NEXT: vpshufb %ymm13, %ymm10, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero +; AVX512-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; 
AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512-NEXT: vpshufb %ymm9, %ymm7, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rax), %ymm4 +; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm18 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512-NEXT: vpshufb %xmm11, %xmm13, %xmm2 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512-NEXT: vporq %xmm2, %xmm0, %xmm26 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512-NEXT: vpshufb %xmm12, %xmm11, %xmm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm2 +; AVX512-NEXT: vporq %xmm0, %xmm2, %xmm24 +; AVX512-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm15[4,u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6] +; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[4],zero,xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero +; AVX512-NEXT: vporq %xmm0, %xmm3, %xmm22 +; AVX512-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm26 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm6[0,1,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512-NEXT: vmovdqa64 %xmm19, %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,0,1],zmm1[0,1,0,1] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm7 -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm6[0,1,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,0,1],zmm1[0,1,0,1] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm4 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm0[0,1,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpshufb %ymm10, %ymm0, %ymm1 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm9 ^ (zmm1 & (zmm18 ^ zmm9)) -; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512-NEXT: # ymm9 = mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512-NEXT: # ymm27 = mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm9 ^ (zmm1 & (zmm27 ^ zmm9)) -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512-NEXT: # zmm9 = zmm9[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512-NEXT: 
vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload -; AVX512-NEXT: # zmm15 = zmm15[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm14)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512-NEXT: # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm23 & (zmm14 ^ zmm15)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm31 ^ (zmm23 & (zmm21 ^ zmm31)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm19[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm17[2,3,2,3] -; AVX512-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512-NEXT: # xmm15 = mem[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqa64 %xmm28, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload +; AVX512-NEXT: # zmm28 = zmm4[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm14 & (zmm7 ^ zmm6)) +; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512-NEXT: # ymm6 = mem[2,3,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm31, %zmm6 +; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512-NEXT: # ymm31 = mem[2,3,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm6 ^ (zmm14 & (zmm31 ^ zmm6)) +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload +; AVX512-NEXT: # zmm6 = zmm12[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm14 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm23 # 64-byte Folded Reload +; AVX512-NEXT: # zmm23 = zmm14[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm26[0,1,0,1],zmm10[0,1,0,1] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[0,1,0,1],zmm8[0,1,0,1] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm11 +; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,0,1],zmm2[0,1,0,1] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: 
vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm5)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm17[2,3,2,3] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm15 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,0,0,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload -; AVX512-NEXT: # zmm20 = zmm29[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm22[2,3,2,3],zmm25[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm20 ^ (zmm23 & (zmm17 ^ zmm20)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm6)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm5 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm11)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,1,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm30)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm18)) -; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14)) -; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm4[0,1,0,1,4,5,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm28 ^ (mem & (zmm0 ^ zmm28)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm27)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) -; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm12[0,1,0,1,4,5,4,5] -; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm7[0,0,1,0,4,4,5,4] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ 
zmm1)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm9)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm17)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = zmm17[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm12 ^ (zmm13 & (zmm17 ^ zmm12)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm13 & (zmm8 ^ zmm10)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm4)) +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,1,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm5)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm4 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm28)) +; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm23)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm20 ^ (mem & (zmm2 ^ zmm20)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-NEXT: addq $1496, %rsp # imm = 0x5D8 ; 
AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1640, %rsp # imm = 0x668 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[0,1,14],zero,ymm14[12,13,0,1,14,15],zero,ymm14[3,12,13,2,3,16],zero,ymm14[30,31,28,29,16,17],zero,ymm14[31,18,19,28,29,18],zero +; AVX512-FCP-NEXT: subq $1416, %rsp # imm = 0x588 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31] -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,0,1,14],zero,ymm4[14,15,0,1,14,15],zero,ymm4[13,14,15,16,17,16],zero,ymm4[30,31,30,31,16,17],zero,ymm4[31,28,29,30,31] +; AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = 
ymm11[13,u,u,u,u,u],zero,ymm11[14,u,u,u,u,u],zero,ymm11[15,u,u,u,u,u],zero,ymm11[16,u,u,u,u,u],zero,ymm11[17,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] +; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm11[30],zero,ymm11[28,u,u,u],zero,ymm11[31],zero,ymm11[29,u] -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 +; 
AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm7 +; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm7 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm9[19],zero,ymm9[21,20,21,22],zero,ymm9[20],zero,ymm9[22,23] +; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm7 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: movq 
{{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[20],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm8[18],zero,ymm8[20,21,20,21],zero,ymm8[19],zero,ymm8[19,20,21,22],zero +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm5 +; AVX512-FCP-NEXT: vporq %ymm0, %ymm5, %ymm30 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm2 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm8 -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm2 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm0, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero,xmm5[u,u] +; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm21 +; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm16 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm25 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm13 -; AVX512-FCP-NEXT: vpor %xmm12, %xmm13, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512-FCP-NEXT: vporq %xmm2, %xmm6, %xmm31 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31] +; AVX512-FCP-NEXT: vpor %ymm2, %ymm6, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; 
AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512-FCP-NEXT: vpor %xmm5, %xmm10, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[0,1,14],zero,ymm15[12,13,0,1,14,15],zero,ymm15[3,12,13,2,3,16],zero,ymm15[30,31,28,29,16,17],zero,ymm15[31,18,19,28,29,18],zero +; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm17 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm4 -; AVX512-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512-FCP-NEXT: vpor %xmm4, %xmm8, %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm4 -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm4, %ymm8, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm4 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm4, %ymm8, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm4, %ymm8, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm15[19],zero,ymm15[21,20,21,22],zero,ymm15[20],zero,ymm15[22,23] +; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero +; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512-FCP-NEXT: vpor %ymm4, %ymm8, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] -; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; AVX512-FCP-NEXT: vporq %ymm9, %ymm14, %ymm26 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero,ymm2[29,u] -; AVX512-FCP-NEXT: vporq %ymm9, %ymm14, %ymm24 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27,u,u,u],zero,ymm1[30],zero,ymm1[28,u,u,u],zero,ymm1[31],zero -; AVX512-FCP-NEXT: vporq %ymm9, %ymm14, %ymm23 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512-FCP-NEXT: vpor %ymm9, %ymm14, %ymm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero -; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm28 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm27 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm0[13,u,u,u,u,u],zero,ymm0[14,u,u,u,u,u],zero,ymm0[15,u,u,u,u,u],zero,ymm0[16,u,u,u,u,u],zero,ymm0[17,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,0,1],zmm2[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,0,1],zmm2[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = 
xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm10, %zmm29 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm21 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm11 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 16-byte Folded Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23] -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm26[2,3,2,3],zmm10[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm24[2,3,2,3],zmm14[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm23[2,3,2,3],zmm12[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm4 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm9 = zmm1[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm15 = zmm1[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm9 ^ (zmm1 & (zmm15 ^ zmm9)) +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX512-FCP-NEXT: vporq %xmm1, %xmm0, %xmm20 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u] +; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm19 +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512-FCP-NEXT: vporq %xmm3, %xmm15, %xmm18 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm30[2,3,2,3],zmm4[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm15 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[2,3,2,3],zmm4[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm2[2,3,2,3],zmm4[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm2[0,1,0,1],zmm1[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm2[0,1,0,1],zmm1[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm31[0,1,0,1],zmm1[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm17, %ymm2 +; AVX512-FCP-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm20[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm18[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm10 & (zmm9 ^ zmm7)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7)) ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm9)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm15)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,1,0,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm19[0,1,0,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm17[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm12[0,1,0,1] +; AVX512-FCP-NEXT: vpermq 
{{.*#+}} ymm12 = ymm15[0,1,0,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0)) ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm9)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm20 ^ (zmm1 & (zmm22 ^ zmm20)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm10 ^ (zmm8 & (zmm14 ^ zmm10)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm30[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm27[2,3,2,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm8 & (zmm10 ^ zmm9)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm2 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm7)) -; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm29[0,1,0,1,4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm21)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm31[0,1,0,1,4,5,4,5] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm11)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm25 ^ (mem & (zmm12 ^ zmm25)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm14)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = zmm1[0,1,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512-FCP-NEXT: addq 
$1640, %rsp # imm = 0x668 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm10 & (zmm0 ^ zmm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm24 ^ (zmm10 & (zmm21 ^ zmm24)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm30 ^ (zmm7 & (zmm23 ^ zmm30)) +; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm10 = mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 +; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm13 = mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm7 & (zmm13 ^ zmm10)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm12, %zmm9 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm7)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm9)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm29 ^ (mem & (zmm4 ^ zmm29)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm23)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm25 ^ (mem & (zmm5 ^ zmm25)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm21)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm13)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm16 ^ (mem & (zmm6 ^ zmm16)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512-FCP-NEXT: addq $1416, %rsp # imm = 0x588 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 +; 
AVX512DQ-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm10 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm8 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm8 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm8, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm19 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-NEXT: 
vmovdqa {{.*#+}} ymm5 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm11, %ymm2 -; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm3, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512DQ-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm7, %ymm1 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm13, %ymm3 -; AVX512DQ-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm8, %ymm3 +; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm6 -; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm17 -; AVX512DQ-NEXT: vpor %ymm0, %ymm6, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm1 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] +; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm13 +; AVX512DQ-NEXT: vpor %ymm1, %ymm13, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm13, %ymm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm6 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm5, 
%ymm1, %ymm5 -; AVX512DQ-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm5 -; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm10, %ymm6 -; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm25 -; AVX512DQ-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm6 -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm7 -; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero,ymm14[27],zero,ymm14[25] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23,u,u,u],zero,ymm13[26],zero,ymm13[24,u,u,u],zero,ymm13[27],zero +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm1, %ymm15 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpor %ymm2, %ymm15, %ymm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] -; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm15 +; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm22 +; AVX512DQ-NEXT: vpor %ymm2, %ymm15, %ymm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm7 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512DQ-NEXT: vpor %ymm7, %ymm4, %ymm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] -; AVX512DQ-NEXT: vpor %ymm4, %ymm7, %ymm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%r8), 
%ymm15 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm9 +; AVX512DQ-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm21 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31] +; AVX512DQ-NEXT: vpor %ymm3, %ymm9, %ymm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm7, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm23 -; AVX512DQ-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm0, %xmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm9, %xmm8 -; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm31 -; AVX512DQ-NEXT: vpor %xmm4, %xmm8, %xmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX512DQ-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm8, %zmm28 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm4 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm8 -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm30 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm9 -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm18 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm0 -; 
AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm8 -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm9 -; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm6 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm9 +; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm9, %ymm12 +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm20 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm12[2,3,2,3],zmm3[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm1, %ymm11 +; AVX512DQ-NEXT: vpor %ymm3, %ymm11, %ymm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] ; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpor %ymm6, %ymm5, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm10[14],zero,zero,zero,zero,zero,zero,ymm10[15],zero,zero,zero,zero,zero,zero,ymm10[16],zero,zero,zero,zero,zero,zero,ymm10[17],zero,zero,zero,zero,zero,zero,ymm10[18] -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm6 -; AVX512DQ-NEXT: vpor %ymm5, %ymm6, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm5 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm14, %ymm1 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm5 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm2, %ymm6 -; AVX512DQ-NEXT: vpor %ymm5, %ymm6, %ymm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm4, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm14 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm14, %xmm2 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512DQ-NEXT: vporq %ymm2, %ymm0, %ymm29 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] -; AVX512DQ-NEXT: vporq %ymm0, %ymm2, %ymm25 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm13, %ymm0 -; 
AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX512DQ-NEXT: vporq %ymm0, %ymm2, %ymm22 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm13 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm11 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm31 +; AVX512DQ-NEXT: vmovdqa 
{{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm30 +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4],zero,xmm3[u,u,u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25] +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm10, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm7, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm18 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm13 +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm13, %xmm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm10 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-NEXT: vporq %xmm2, %xmm0, %xmm26 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm11, %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), 
%xmm8 +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm2 +; AVX512DQ-NEXT: vporq %xmm0, %xmm2, %xmm24 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm15[4,u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6] +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[4],zero,xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero +; AVX512DQ-NEXT: vporq %xmm0, %xmm3, %xmm22 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm6[0,1,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512DQ-NEXT: vmovdqa64 
%xmm19, %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,0,1],zmm1[0,1,0,1] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm7 -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm6[0,1,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,0,1],zmm1[0,1,0,1] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm0[0,1,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, 
%ymm10, %zmm1, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm9 ^ (zmm1 & (zmm18 ^ zmm9)) -; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm9 = mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm27 = mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm9 ^ (zmm1 & (zmm27 ^ zmm9)) -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm9 -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm9 = zmm9[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm15 = zmm15[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm14)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm23 & (zmm14 ^ 
zmm15)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm31 ^ (zmm23 & (zmm21 ^ zmm31)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm19[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm17[2,3,2,3] -; AVX512DQ-NEXT: vpshuflw $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm15 = mem[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm9 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm28 = zmm4[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm14 & (zmm7 ^ zmm6)) +; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm6 = mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm31, %zmm6 +; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm31 = mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm6 ^ (zmm14 & (zmm31 ^ zmm6)) +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm6 = zmm12[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm14 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm14 = 
xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm23 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm23 = zmm14[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm14 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm26[0,1,0,1],zmm10[0,1,0,1] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm8, %xmm8 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[0,1,0,1],zmm8[0,1,0,1] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm12, %xmm11 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,0,1],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm17[2,3,2,3] +; 
AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm15 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,0,0,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0] -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm20 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm20 = zmm29[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm22[2,3,2,3],zmm25[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm20 ^ (zmm23 & (zmm17 ^ zmm20)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm6)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm5 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm11)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,1,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm30)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm18)) -; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm4[0,1,0,1,4,5,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm28 ^ (mem & (zmm0 ^ zmm28)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm27)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm12[0,1,0,1,4,5,4,5] -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm7[0,0,1,0,4,4,5,4] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm9)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm17)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm17 = zmm17[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm12 ^ (zmm13 & (zmm17 ^ zmm12)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm13 & (zmm8 ^ zmm10)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm4)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,1,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm4 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm28)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm23)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm20 ^ (mem & (zmm2 ^ zmm20)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512DQ-NEXT: addq $1560, %rsp # imm = 0x618 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1640, %rsp # imm = 0x668 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm14[0,1,14],zero,ymm14[12,13,0,1,14,15],zero,ymm14[3,12,13,2,3,16],zero,ymm14[30,31,28,29,16,17],zero,ymm14[31,18,19,28,29,18],zero +; AVX512DQ-FCP-NEXT: subq $1416, %rsp # imm = 0x588 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,0,1,14],zero,ymm4[14,15,0,1,14,15],zero,ymm4[13,14,15,16,17,16],zero,ymm4[30,31,30,31,16,17],zero,ymm4[31,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[13,u,u,u,u,u],zero,ymm11[14,u,u,u,u,u],zero,ymm11[15,u,u,u,u,u],zero,ymm11[16,u,u,u,u,u],zero,ymm11[17,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512DQ-FCP-NEXT: # ymm1 
= mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm11[30],zero,ymm11[28,u,u,u],zero,ymm11[31],zero,ymm11[29,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512DQ-FCP-NEXT: 
# ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm7 +; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm27 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm3[27,u,u,u],zero,ymm3[30],zero,ymm3[28,u,u,u],zero,ymm3[31],zero -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = 
[28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm7 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm9[19],zero,ymm9[21,20,21,22],zero,ymm9[20],zero,ymm9[22,23] +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[20],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, 
%zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm8[18],zero,ymm8[20,21,20,21],zero,ymm8[19],zero,ymm8[19,20,21,22],zero +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm6, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm5 +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm5, %ymm30 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm2 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm8 -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm2 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm0, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero,xmm5[u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm21 +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm16 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm13 -; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm13, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm17 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm6, %xmm31 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm6, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm10, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[0,1,14],zero,ymm15[12,13,0,1,14,15],zero,ymm15[3,12,13,2,3,16],zero,ymm15[30,31,28,29,16,17],zero,ymm15[31,18,19,28,29,18],zero +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm17 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm4 -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm8, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm8, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm4 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm8, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm15[19],zero,ymm15[21,20,21,22],zero,ymm15[20],zero,ymm15[22,23] +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm8, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] -; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm19 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm0[23],zero,ymm0[23,24,25,26],zero,ymm0[24],zero,ymm0[30,31] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm14, %ymm26 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero,ymm2[29,u] -; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm14, %ymm24 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,zero,zero,ymm0[30],zero,ymm0[28],zero,zero,zero,zero,ymm0[31],zero,ymm0[29] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27,u,u,u],zero,ymm1[30],zero,ymm1[28,u,u,u],zero,ymm1[31],zero -; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm14, %ymm23 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[0,1,14],zero,ymm1[12,13,0,1,14,15],zero,ymm1[3,12,13,2,3,16],zero,ymm1[30,31,28,29,16,17],zero,ymm1[31,18,19,28,29,18],zero -; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm28 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[13,u,u,u,u,u],zero,ymm0[14,u,u,u,u,u],zero,ymm0[15,u,u,u,u,u],zero,ymm0[16,u,u,u,u,u],zero,ymm0[17,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,3,3,2,2,3,3] -; 
AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm0[0,1,0,1],zmm2[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,0,1],zmm2[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm10, %zmm29 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm21 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm21 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm11 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm26[2,3,2,3],zmm10[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm24[2,3,2,3],zmm14[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm12 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm23[2,3,2,3],zmm12[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm15[13],zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm4 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm9 = zmm1[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: 
vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm15 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm15 = zmm1[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm9 ^ (zmm1 & (zmm15 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm0, %xmm20 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u] +; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm19 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm15 +; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm15, %xmm18 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm28 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm30[2,3,2,3],zmm4[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm15 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = 
[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[2,3,2,3],zmm4[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm2[2,3,2,3],zmm4[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm2[0,1,0,1],zmm1[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm2[0,1,0,1],zmm1[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm31[0,1,0,1],zmm1[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm17, %ymm2 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; 
AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm20[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm18[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm10 & (zmm9 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7)) ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm15)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm18[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm19[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm17[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm12[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = 
ymm15[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm20 ^ (zmm1 & (zmm22 ^ zmm20)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm10 ^ (zmm8 & (zmm14 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm30[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm27[2,3,2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm28, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm9 ^ (zmm8 & (zmm10 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm2 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 = mem[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm29[0,1,0,1,4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm31[0,1,0,1,4,5,4,5] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm25 ^ (mem & (zmm12 ^ zmm25)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm5 = zmm1[0,1,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 
384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $1640, %rsp # imm = 0x668 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm10 & (zmm0 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm24 ^ (zmm10 & (zmm21 ^ zmm24)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm30 ^ (zmm7 & (zmm23 ^ zmm30)) +; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm10 = mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10 +; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm13 = mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm7 & (zmm13 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm12, %zmm9 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm29 ^ (mem & (zmm4 ^ zmm29)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm23)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm25 ^ (mem & (zmm5 ^ zmm25)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm21)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm16 ^ (mem & (zmm6 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $1416, %rsp # imm = 0x588 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; From 3a66760d9b0aa9ec31df591e87dbf0dedb4c466d Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 1 Apr 2025 14:11:20 +0100 Subject: [PATCH 0236/1029] [LV] Improve a test, regen with UTC (#130092) --- .../LoopVectorize/X86/reduction-crash.ll | 
107 ++++++++++++++---- 1 file changed, 82 insertions(+), 25 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll index bd11562c07ff3..945d50058a1a2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-crash.ll @@ -1,35 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 ; RUN: opt -S -aa-pipeline= -passes=loop-vectorize -mcpu=prescott < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" target triple = "i386-apple-darwin" -; PR15344 -define void @test1(ptr nocapture %arg, i32 %arg1, i1 %arg2) nounwind { -; CHECK-LABEL: @test1( -; CHECK: preheader -; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0 -; CHECK: vector.memcheck +define void @pr15344(ptr noalias %ar, ptr noalias %ar2, i32 %exit.limit, i1 %cond) { +; CHECK-LABEL: define void @pr15344( +; CHECK-SAME: ptr noalias [[AR:%.*]], ptr noalias [[AR2:%.*]], i32 [[EXIT_LIMIT:%.*]], i1 [[COND:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[PH:.*]] +; CHECK: [[PH]]: +; CHECK-NEXT: br i1 [[COND]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_PREHEADER]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[EXIT_LIMIT]], 10 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[EXIT_LIMIT]], 2 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[AR2]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[EXIT_LIMIT]], 3 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[AR]], i32 [[TMP1]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[AR2]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[AR]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[EXIT_LIMIT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[EXIT_LIMIT]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2]] = fadd fast <2 x double> [[VEC_PHI]], splat (double 1.000000e+00) +; CHECK-NEXT: [[TMP3]] = fadd fast <2 x double> [[VEC_PHI2]], splat (double 1.000000e+00) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[AR2]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i32 2 +; CHECK-NEXT: store <2 x float> splat (float 2.000000e+00), ptr [[TMP5]], align 4, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: store <2 x float> splat (float 2.000000e+00), ptr [[TMP6]], align 4, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: 
[[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[EXIT_LIMIT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ], [ 0.000000e+00, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[RDX:%.*]] = phi double [ [[FADD:%.*]], %[[LOOP]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_AR:%.*]] = getelementptr inbounds double, ptr [[AR]], i32 [[IV]] +; CHECK-NEXT: [[LD_AR:%.*]] = load double, ptr [[GEP_AR]], align 4 +; CHECK-NEXT: [[FADD]] = fadd fast double [[RDX]], 1.000000e+00 +; CHECK-NEXT: [[GEP_AR2:%.*]] = getelementptr inbounds float, ptr [[AR2]], i32 [[IV]] +; CHECK-NEXT: store float 2.000000e+00, ptr [[GEP_AR2]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[EXIT_LIMIT]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[FADD_LCSSA:%.*]] = phi double [ [[FADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RET:%.*]] = phi double [ 0.000000e+00, %[[PH]] ], [ [[FADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret void +; +entry: + br label %ph -bb: - br label %bb2 +ph: + br i1 %cond, label %loop, label %exit -bb2: ; preds = %bb - %tmp = load double, ptr null, align 8 - br i1 %arg2, label %bb3, label %bb12 +loop: + %rdx = phi double [ 0.0, %ph ], [ %fadd, %loop ] + %iv = phi i32 [ 0, %ph ], [ %iv.next, %loop ] + %gep.ar = getelementptr inbounds double, ptr %ar, i32 %iv + %ld.ar = load double, ptr %gep.ar, align 4 + %fadd = fadd fast double %rdx, 1.0 + %gep.ar2 = getelementptr inbounds float, ptr %ar2, i32 %iv + store float 2.0, ptr %gep.ar2, align 4 + %iv.next = add nsw i32 %iv, 1 + %exit.cond = icmp eq i32 %iv.next, %exit.limit + br i1 %exit.cond, label %exit, label %loop -bb3: ; preds = %bb3, %bb2 - %tmp4 = phi double [ %tmp9, %bb3 ], [ %tmp, %bb2 ] - %tmp5 = phi i32 [ %tmp8, %bb3 ], [ 0, %bb2 ] - %tmp6 = getelementptr inbounds [16 x double], ptr undef, i32 0, i32 %tmp5 - %tmp7 = load double, ptr %tmp6, align 4 - %tmp8 = add nsw i32 %tmp5, 1 - %tmp9 = fadd fast double %tmp4, undef - %tmp10 = getelementptr inbounds float, ptr %arg, i32 %tmp5 - store float undef, ptr %tmp10, align 4 - %tmp11 = icmp eq i32 %tmp8, %arg1 - br i1 %tmp11, label %bb12, label %bb3 - -bb12: ; preds = %bb3, %bb2 - %tmp13 = phi double [ %tmp, %bb2 ], [ %tmp9, %bb3 ] +exit: + %ret = phi double [ 0.0, %ph ], [ %fadd, %loop ] ret void } From a1e041b64648789897c96eca5d6270e253773d16 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 1 Apr 2025 13:39:49 +0100 Subject: [PATCH 0237/1029] [NFC][AArch64] Pre-commit 
high register pressure dot product test --- .../AArch64/partial-reduce-dot-product.ll | 469 ++++++++++++++++++ 1 file changed, 469 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 8e655a9370082..bcdbb4d4dfbf7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -3116,6 +3116,475 @@ for.exit: ; preds = %for.body ret i32 %add } +define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32 %n) #1 { +; CHECK-INTERLEAVE1-LABEL: define dso_local void @not_dotp_high_register_pressure( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR1]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP100]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-INTERLEAVE1: for.body.lr.ph: +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 4 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 8 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 12 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 16 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 20 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 24 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 28 +; CHECK-INTERLEAVE1-NEXT: [[SUM_PROMOTED:%.*]] = load i32, ptr [[SUM]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX13_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B_12_PROMOTED:%.*]] = load i32, ptr [[GEP_B_12]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX31_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX31]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX40_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX40]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX49_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX49]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX67]], align 4 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX49_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX40_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> zeroinitializer, 
i32 [[ARRAYIDX31_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[GEP_B_12_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX13_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[SUM_PROMOTED]], i32 0 +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP15]] = add <4 x i32> [[TMP14]], [[VEC_PHI7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[STRIDED_VEC8]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP10]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP18]] = add <4 x i32> [[TMP17]], [[VEC_PHI6]] +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC9]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[VEC_PHI5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[STRIDED_VEC10]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP22]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP24]] = add <4 x i32> [[TMP23]], [[VEC_PHI4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC11]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27]] = add <4 x i32> [[TMP26]], [[VEC_PHI3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = sext <4 x i8> [[STRIDED_VEC12]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP30]] = add <4 x i32> [[TMP29]], [[VEC_PHI2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC13]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP33]] = add <4 x i32> [[TMP32]], [[VEC_PHI1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = sext <4 x i8> [[STRIDED_VEC14]] to <4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP10]] +; CHECK-INTERLEAVE1-NEXT: [[TMP36]] = add <4 x i32> [[TMP35]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; +; CHECK-INTERLEAVED-LABEL: define dso_local void @not_dotp_high_register_pressure( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR1]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[CMP100:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP100]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-INTERLEAVED: for.body.lr.ph: +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 4 +; CHECK-INTERLEAVED-NEXT: [[GEP_B_12:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 8 +; 
CHECK-INTERLEAVED-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 12 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 20 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 24 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 28 +; CHECK-INTERLEAVED-NEXT: [[SUM_PROMOTED:%.*]] = load i32, ptr [[SUM]], align 4 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX13_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-INTERLEAVED-NEXT: [[GEP_B_12_PROMOTED:%.*]] = load i32, ptr [[GEP_B_12]], align 4 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX31_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX31]], align 4 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX40_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX40]], align 4 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX49_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX49]], align 4 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 +; CHECK-INTERLEAVED-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX67]], align 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX49_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX40_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX31_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[GEP_B_12_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX13_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[SUM_PROMOTED]], i32 0 +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI8:%.*]] = phi <4 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI11:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI14:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI15:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP9]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[WIDE_LOAD16]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nsw i64 [[INDEX]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = shl nsw i64 [[TMP8]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[WIDE_VEC24:%.*]] = load <32 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], 
<32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC27:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC28:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC29:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC30:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC31:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[STRIDED_VEC32:%.*]] = shufflevector <32 x i8> [[WIDE_VEC24]], <32 x i8> poison, <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC25]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add <4 x i32> [[TMP20]], [[VEC_PHI14]] +; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <4 x i32> [[TMP21]], [[VEC_PHI15]] +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <4 x i8> [[STRIDED_VEC17]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC26]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP24]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP28]] = add <4 x i32> [[TMP26]], [[VEC_PHI12]] +; CHECK-INTERLEAVED-NEXT: [[TMP29]] = add <4 x i32> [[TMP27]], [[VEC_PHI13]] +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <4 x i8> [[STRIDED_VEC18]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC27]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP34]] = add <4 x i32> [[TMP32]], [[VEC_PHI10]] +; CHECK-INTERLEAVED-NEXT: [[TMP35]] = add <4 x i32> [[TMP33]], [[VEC_PHI11]] +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext <4 x i8> [[STRIDED_VEC19]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <4 x i8> [[STRIDED_VEC28]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = mul nsw <4 x i32> [[TMP36]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP40]] = add <4 x i32> [[TMP38]], [[VEC_PHI8]] +; CHECK-INTERLEAVED-NEXT: [[TMP41]] = add <4 x i32> [[TMP39]], [[VEC_PHI9]] +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = sext <4 x i8> [[STRIDED_VEC20]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = sext <4 x i8> [[STRIDED_VEC29]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP46]] = add <4 x i32> [[TMP44]], [[VEC_PHI6]] +; CHECK-INTERLEAVED-NEXT: [[TMP47]] = add <4 x i32> [[TMP45]], [[VEC_PHI7]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <4 x i8> [[STRIDED_VEC21]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext <4 x i8> [[STRIDED_VEC30]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = 
mul nsw <4 x i32> [[TMP48]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = mul nsw <4 x i32> [[TMP49]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP52]] = add <4 x i32> [[TMP50]], [[VEC_PHI4]] +; CHECK-INTERLEAVED-NEXT: [[TMP53]] = add <4 x i32> [[TMP51]], [[VEC_PHI5]] +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = sext <4 x i8> [[STRIDED_VEC22]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = sext <4 x i8> [[STRIDED_VEC31]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = mul nsw <4 x i32> [[TMP54]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP58]] = add <4 x i32> [[TMP56]], [[VEC_PHI2]] +; CHECK-INTERLEAVED-NEXT: [[TMP59]] = add <4 x i32> [[TMP57]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext <4 x i8> [[STRIDED_VEC23]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext <4 x i8> [[STRIDED_VEC32]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP64]] = add <4 x i32> [[TMP62]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP65]] = add <4 x i32> [[TMP63]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP65]], [[TMP64]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX33:%.*]] = add <4 x i32> [[TMP59]], [[TMP58]] +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX33]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX34:%.*]] = add <4 x i32> [[TMP53]], [[TMP52]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX34]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX35:%.*]] = add <4 x i32> [[TMP47]], [[TMP46]] +; CHECK-INTERLEAVED-NEXT: [[TMP70:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX35]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX36:%.*]] = add <4 x i32> [[TMP41]], [[TMP40]] +; CHECK-INTERLEAVED-NEXT: [[TMP71:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX36]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP35]], [[TMP34]] +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX37]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP29]], [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX39:%.*]] = add <4 x i32> [[TMP23]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[TMP74:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX39]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; +; CHECK-MAXBW-LABEL: define dso_local void @not_dotp_high_register_pressure( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[SUM:%.*]], i32 [[N:%.*]]) #[[ATTR1]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: 
[[CMP100:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP100]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-MAXBW: for.body.lr.ph: +; CHECK-MAXBW-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 4 +; CHECK-MAXBW-NEXT: [[GEP_B_12:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 8 +; CHECK-MAXBW-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 12 +; CHECK-MAXBW-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 16 +; CHECK-MAXBW-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 20 +; CHECK-MAXBW-NEXT: [[ARRAYIDX58:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 24 +; CHECK-MAXBW-NEXT: [[ARRAYIDX67:%.*]] = getelementptr inbounds nuw i8, ptr [[SUM]], i64 28 +; CHECK-MAXBW-NEXT: [[SUM_PROMOTED:%.*]] = load i32, ptr [[SUM]], align 4 +; CHECK-MAXBW-NEXT: [[ARRAYIDX13_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-MAXBW-NEXT: [[GEP_B_12_PROMOTED:%.*]] = load i32, ptr [[GEP_B_12]], align 4 +; CHECK-MAXBW-NEXT: [[ARRAYIDX31_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX31]], align 4 +; CHECK-MAXBW-NEXT: [[ARRAYIDX40_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX40]], align 4 +; CHECK-MAXBW-NEXT: [[ARRAYIDX49_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX49]], align 4 +; CHECK-MAXBW-NEXT: [[ARRAYIDX58_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX58]], align 4 +; CHECK-MAXBW-NEXT: [[ARRAYIDX67_PROMOTED:%.*]] = load i32, ptr [[ARRAYIDX67]], align 4 +; CHECK-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX67_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX58_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX49_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX40_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX31_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[GEP_B_12_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[ARRAYIDX13_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[SUM_PROMOTED]], i32 0 +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ [[TMP1]], [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP4]], 
[[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i32 0 +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP9]], align 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = shl nsw i64 [[INDEX]], 3 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP11]] +; CHECK-MAXBW-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <4 x i8> [[STRIDED_VEC]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP15]] = add <4 x i32> [[TMP14]], [[VEC_PHI7]] +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[STRIDED_VEC8]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP18]] = add <4 x i32> [[TMP17]], [[VEC_PHI6]] +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = sext <4 x i8> [[STRIDED_VEC9]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP21]] = add <4 x i32> [[TMP20]], [[VEC_PHI5]] +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sext <4 x i8> [[STRIDED_VEC10]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP22]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP24]] = add <4 x i32> [[TMP23]], [[VEC_PHI4]] +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = sext <4 x i8> [[STRIDED_VEC11]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP27]] = add <4 x i32> [[TMP26]], [[VEC_PHI3]] +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = sext <4 x i8> [[STRIDED_VEC12]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP30]] = add <4 x i32> [[TMP29]], [[VEC_PHI2]] +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sext <4 x i8> [[STRIDED_VEC13]] to <4 x i32> +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP33]] = add <4 x i32> [[TMP32]], [[VEC_PHI1]] +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = sext <4 x i8> [[STRIDED_VEC14]] to <4 x i32> +; 
CHECK-MAXBW-NEXT: [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP10]] +; CHECK-MAXBW-NEXT: [[TMP36]] = add <4 x i32> [[TMP35]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP36]]) +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP33]]) +; CHECK-MAXBW-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]]) +; CHECK-MAXBW-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) +; CHECK-MAXBW-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) +; CHECK-MAXBW-NEXT: [[TMP43:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP21]]) +; CHECK-MAXBW-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]]) +; CHECK-MAXBW-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; +entry: + %cmp100 = icmp sgt i32 %n, 0 + br i1 %cmp100, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %arrayidx13 = getelementptr inbounds nuw i8, ptr %sum, i64 4 + %gep.b.12 = getelementptr inbounds nuw i8, ptr %sum, i64 8 + %arrayidx31 = getelementptr inbounds nuw i8, ptr %sum, i64 12 + %arrayidx40 = getelementptr inbounds nuw i8, ptr %sum, i64 16 + %arrayidx49 = getelementptr inbounds nuw i8, ptr %sum, i64 20 + %arrayidx58 = getelementptr inbounds nuw i8, ptr %sum, i64 24 + %arrayidx67 = getelementptr inbounds nuw i8, ptr %sum, i64 28 + %sum.promoted = load i32, ptr %sum, align 4 + %arrayidx13.promoted = load i32, ptr %arrayidx13, align 4 + %gep.b.12.promoted = load i32, ptr %gep.b.12, align 4 + %arrayidx31.promoted = load i32, ptr %arrayidx31, align 4 + %arrayidx40.promoted = load i32, ptr %arrayidx40, align 4 + %arrayidx49.promoted = load i32, ptr %arrayidx49, align 4 + %arrayidx58.promoted = load i32, ptr %arrayidx58, align 4 + %arrayidx67.promoted = load i32, ptr %arrayidx67, align 4 + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body + %add.lcssa = phi i32 [ %add.1, %for.body ] + %add.2.lcssa = phi i32 [ %add.2, %for.body ] + %add.3.lcssa = phi i32 [ %add.3, %for.body ] + %add.4.lcssa = phi i32 [ %add.4, %for.body ] + %add.5.lcssa = phi i32 [ %add.5, %for.body ] + %add.6.lcssa = phi i32 [ %add.6, %for.body ] + %add.7.lcssa = phi i32 [ %add.7, %for.body ] + %add.8.lcssa = phi i32 [ %add.8, %for.body ] + store i32 %add.lcssa, ptr %sum, align 4 + store i32 %add.2.lcssa, ptr %arrayidx13, align 4 + store i32 %add.3.lcssa, ptr %gep.b.12, align 4 + store i32 %add.4.lcssa, ptr %arrayidx31, align 4 + store i32 %add.5.lcssa, ptr %arrayidx40, align 4 + store i32 %add.6.lcssa, ptr %arrayidx49, align 4 + store i32 %add.7.lcssa, ptr %arrayidx58, align 4 + store i32 %add.8.lcssa, ptr %arrayidx67, align 4 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %indvars.iv = phi 
i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %0 = phi i32 [ %arrayidx67.promoted, %for.body.lr.ph ], [ %add.8, %for.body ] + %1 = phi i32 [ %arrayidx58.promoted, %for.body.lr.ph ], [ %add.7, %for.body ] + %2 = phi i32 [ %arrayidx49.promoted, %for.body.lr.ph ], [ %add.6, %for.body ] + %3 = phi i32 [ %arrayidx40.promoted, %for.body.lr.ph ], [ %add.5, %for.body ] + %4 = phi i32 [ %arrayidx31.promoted, %for.body.lr.ph ], [ %add.4, %for.body ] + %5 = phi i32 [ %gep.b.12.promoted, %for.body.lr.ph ], [ %add.3, %for.body ] + %6 = phi i32 [ %arrayidx13.promoted, %for.body.lr.ph ], [ %add.2, %for.body ] + %7 = phi i32 [ %sum.promoted, %for.body.lr.ph ], [ %add.1, %for.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv + %load.a = load i8, ptr %arrayidx, align 1 + %ext.a = zext i8 %load.a to i32 + %9 = shl nsw i64 %indvars.iv, 3 + %gep.b.1 = getelementptr inbounds nuw i8, ptr %b, i64 %9 + %load.b.1 = load i8, ptr %gep.b.1, align 1 + %ext.b.1 = sext i8 %load.b.1 to i32 + %mul.1 = mul nsw i32 %ext.b.1, %ext.a + %add.1 = add nsw i32 %mul.1, %7 + %11 = or disjoint i64 %9, 1 + %gep.b.2 = getelementptr inbounds nuw i8, ptr %b, i64 %11 + %load.b.2 = load i8, ptr %gep.b.2, align 1 + %ext.b.2 = sext i8 %load.b.2 to i32 + %mul.2 = mul nsw i32 %ext.b.2, %ext.a + %add.2 = add nsw i32 %mul.2, %6 + %13 = or disjoint i64 %9, 2 + %gep.b.3 = getelementptr inbounds nuw i8, ptr %b, i64 %13 + %load.b.3 = load i8, ptr %gep.b.3, align 1 + %ext.b.3 = sext i8 %load.b.3 to i32 + %mul.3 = mul nsw i32 %ext.b.3, %ext.a + %add.3 = add nsw i32 %mul.3, %5 + %15 = or disjoint i64 %9, 3 + %gep.b.4 = getelementptr inbounds nuw i8, ptr %b, i64 %15 + %load.b.4 = load i8, ptr %gep.b.4, align 1 + %ext.b.4 = sext i8 %load.b.4 to i32 + %mul.4 = mul nsw i32 %ext.b.4, %ext.a + %add.4 = add nsw i32 %mul.4, %4 + %17 = or disjoint i64 %9, 4 + %gep.b.5 = getelementptr inbounds nuw i8, ptr %b, i64 %17 + %load.b.5 = load i8, ptr %gep.b.5, align 1 + %ext.b.5 = sext i8 %load.b.5 to i32 + %mul.5 = mul nsw i32 %ext.b.5, %ext.a + %add.5 = add nsw i32 %mul.5, %3 + %19 = or disjoint i64 %9, 5 + %gep.b.6 = getelementptr inbounds nuw i8, ptr %b, i64 %19 + %load.b.6 = load i8, ptr %gep.b.6, align 1 + %ext.b.6 = sext i8 %load.b.6 to i32 + %mul.6 = mul nsw i32 %ext.b.6, %ext.a + %add.6 = add nsw i32 %mul.6, %2 + %21 = or disjoint i64 %9, 6 + %gep.b.7 = getelementptr inbounds nuw i8, ptr %b, i64 %21 + %load.b.7 = load i8, ptr %gep.b.7, align 1 + %ext.b.7 = sext i8 %load.b.7 to i32 + %mul.7 = mul nsw i32 %ext.b.7, %ext.a + %add.7 = add nsw i32 %mul.7, %1 + %23 = or disjoint i64 %9, 7 + %gep.b.8 = getelementptr inbounds nuw i8, ptr %b, i64 %23 + %load.b.8 = load i8, ptr %gep.b.8, align 1 + %ext.b.8 = sext i8 %load.b.8 to i32 + %mul.8 = mul nsw i32 %ext.b.8, %ext.a + %add.8 = add nsw i32 %mul.8, %0 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.for.cond.cleanup_crit_edge, label %for.body, !llvm.loop !8 +} !7 = distinct !{!7, !8, !9, !10} !8 = !{!"llvm.loop.mustprogress"} From 1ebc308bba0e1403b9fb3ba0fbadc66e182138e0 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Tue, 1 Apr 2025 14:27:11 +0100 Subject: [PATCH 0238/1029] [DebugInfo][RemoveDIs] Remove debug-intrinsic printing cmdline options (#131855) During the transition from debug intrinsics to debug records, we used several different command line options to customise handling: the printing of debug records to bitcode and textual could be independent of how the 
debug-info was represented inside a module, and whether the autoupgrader ran
could be customised separately. This was all valuable during development, but
now that the complete removal of debug intrinsics is coming up, this patch
removes those options in favour of a single flag
(experimental-debuginfo-iterators), which enables autoupgrade, in-memory debug
records, and debug-record printing to bitcode and textual IR. We need to do
this ahead of removing the experimental-debuginfo-iterators flag itself, to
reduce the amount of test-juggling that happens at that time.

There are quite a number of weird test behaviours related to this, some of
which I simply delete in this commit. Take print-non-instruction-debug-info.ll:
the test suite now checks for debug records in all tests, so we no longer want
to check that we can print as intrinsics. Likewise the update_test_checks
tests: these are duplicated with write-experimental-debuginfo=false to ensure
that file writing for intrinsics is correct, but that is something we are
imminently going to delete.

A short survey of the more curious test changes:

* free-intrinsics.ll: we don't need to test that debug-info is a zero-cost
  intrinsic, because we won't be using intrinsics in the future.
* undef-dbg-val.ll: apparently we pinned this to non-RemoveDIs in-memory mode
  while we sorted something out; it works now either way.
* salvage-cast-debug-info.ll: was testing that intrinsics in memory get
  salvaged, which isn't necessary now.
* localize-constexpr-debuginfo.ll: was producing "dead metadata" intrinsics
  for optimised-out variable values; debug records take the (correct)
  representation of poison/undef as an operand. It looks like we didn't update
  this in the past, to avoid spurious test differences.
* Transforms/Scalarizer/dbginfo.ll: this test was explicitly testing that
  debug-info affected codegen, and we deferred updating it until now. This is
  just one of those silent no-change issues that get fixed by RemoveDIs.

Finally, I've added a bitcode test, dbg-intrinsics-autoupgrade.ll.bc, that
checks we can autoupgrade debug intrinsics that are in bitcode into the new
debug records.
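For illustration only, a quick sketch of the two modes after this change (the
llvm-dis invocation is an assumption based on the flag being an ordinary LLVM
cl::opt available to any tool linking LLVM core; the flang line is one of the
RUN lines updated below):

  # Old bitcode containing debug intrinsics is autoupgraded on load and
  # printed back out as debug records.
  llvm-dis --experimental-debuginfo-iterators=true dbg-intrinsics-autoupgrade.ll.bc -o -
  # Intrinsic mode, kept only until the flag itself is removed.
  %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -mllvm --experimental-debuginfo-iterators=false -o -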
--- flang/test/Integration/debug-local-var-2.f90 | 8 +- llvm/docs/LangRef.rst | 2 +- llvm/lib/AsmParser/LLParser.cpp | 4 - llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 23 +- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 3 +- llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 8 +- llvm/lib/CodeGen/MIRPrinter.cpp | 6 +- llvm/lib/IR/BasicBlock.cpp | 5 - llvm/lib/IR/IRPrintingPasses.cpp | 16 +- llvm/lib/IRPrinter/IRPrintingPasses.cpp | 12 +- .../Transforms/IPO/ThinLTOBitcodeWriter.cpp | 5 +- .../Analysis/CostModel/X86/free-intrinsics.ll | 21 +- .../CostModel/free-intrinsics-datalayout.ll | 21 +- .../CostModel/free-intrinsics-no_info.ll | 21 +- llvm/test/Bitcode/DIExpression-aggresult.ll | 1 - .../dbg-intrinsics-autoupgrade.ll} | 66 ++-- .../Bitcode/dbg-intrinsics-autoupgrade.ll.bc | Bin 0 -> 2256 bytes llvm/test/Bitcode/dbg-label-record-bc.ll | 15 +- llvm/test/Bitcode/dbg-record-roundtrip.ll | 65 +--- llvm/test/Bitcode/upgrade-dbg-addr.ll | 8 +- .../MIRDebugify/locations-and-values.mir | 4 - llvm/test/DebugInfo/X86/undef-dbg-val.ll | 2 +- .../roundtrip-non-instruction-debug-info.ll | 102 ------ .../test/DebugInfo/salvage-cast-debug-info.ll | 1 - .../GlobalOpt/localize-constexpr-debuginfo.ll | 9 +- llvm/test/Transforms/Scalarizer/dbginfo.ll | 16 +- .../Inputs/various_ir_values.ll | 175 ---------- .../Inputs/various_ir_values.ll.expected | 245 -------------- .../various_ir_values.ll.funcsig.expected | 247 -------------- ...ious_ir_values.ll.funcsig.globals.expected | 317 ------------------ ...us_ir_values.ll.funcsig.noglobals.expected | 245 -------------- ...lues.ll.funcsig.transitiveglobals.expected | 306 ----------------- .../Inputs/various_ir_values_dbgrecords.ll | 2 +- .../various_ir_values_dbgrecords.ll.expected | 2 +- ...s_ir_values_dbgrecords.ll.funcsig.expected | 2 +- ...ues_dbgrecords.ll.funcsig.globals.expected | 2 +- ...s_dbgrecords.ll.funcsig.noglobals.expected | 2 +- ...ords.ll.funcsig.transitiveglobals.expected | 2 +- .../update_test_checks/various_ir_values.test | 24 -- .../tools/llvm-reduce/remove-dp-values.ll | 7 - llvm/tools/llvm-as/llvm-as.cpp | 4 +- llvm/tools/llvm-dis/llvm-dis.cpp | 12 +- llvm/tools/llvm-link/llvm-link.cpp | 12 +- llvm/tools/llvm-lto/llvm-lto.cpp | 4 - llvm/tools/llvm-lto2/llvm-lto2.cpp | 4 - .../Analysis/IRSimilarityIdentifierTest.cpp | 3 +- llvm/unittests/Transforms/Utils/LocalTest.cpp | 8 +- mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp | 6 +- mlir/test/Target/LLVMIR/llvmir-debug.mlir | 13 +- .../Dialect/Test/TestToLLVMIRTranslation.cpp | 10 +- 50 files changed, 111 insertions(+), 1987 deletions(-) rename llvm/test/{DebugInfo/print-non-instruction-debug-info.ll => Bitcode/dbg-intrinsics-autoupgrade.ll} (52%) create mode 100644 llvm/test/Bitcode/dbg-intrinsics-autoupgrade.ll.bc delete mode 100644 llvm/test/DebugInfo/roundtrip-non-instruction-debug-info.ll delete mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values.ll delete mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values.ll.expected delete mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values.ll.funcsig.expected delete mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values.ll.funcsig.globals.expected delete mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values.ll.funcsig.noglobals.expected delete mode 100644 llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values.ll.funcsig.transitiveglobals.expected delete mode 
100644 llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values.test diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90 index 20f9a9c622890..7e6398f284ba0 100644 --- a/flang/test/Integration/debug-local-var-2.f90 +++ b/flang/test/Integration/debug-local-var-2.f90 @@ -1,7 +1,7 @@ -! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -mllvm --write-experimental-debuginfo=false -o - | FileCheck %s --check-prefixes=BOTH,INTRINSICS -! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -mllvm --write-experimental-debuginfo=true -o - | FileCheck %s --check-prefixes=BOTH,RECORDS -! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -mllvm --write-experimental-debuginfo=false -o - | FileCheck --check-prefix=LINEONLY %s -! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -mllvm --write-experimental-debuginfo=true -o - | FileCheck --check-prefix=LINEONLY %s +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -mllvm --experimental-debuginfo-iterators=false -o - | FileCheck %s --check-prefixes=BOTH,INTRINSICS +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -mllvm --experimental-debuginfo-iterators=true -o - | FileCheck %s --check-prefixes=BOTH,RECORDS +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -mllvm --experimental-debuginfo-iterators=false -o - | FileCheck --check-prefix=LINEONLY %s +! RUN: %flang_fc1 -emit-llvm -debug-info-kind=line-tables-only %s -mllvm --experimental-debuginfo-iterators=true -o - | FileCheck --check-prefix=LINEONLY %s ! This tests checks the debug information for local variables in llvm IR. diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 34a6bb8f13d6b..e1636e154d43b 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -13447,7 +13447,7 @@ an extra level of indentation. As an example: %inst2 = op2 %inst1, %c These debug records replace the prior :ref:`debug intrinsics`. -Debug records will be disabled if ``--write-experimental-debuginfo=false`` is +Debug records will be disabled if ``--experimental-debuginfo-iterators=false`` is passed to LLVM; it is an error for both records and intrinsics to appear in the same module. 
More information about debug records can be found in the `LLVM Source Level Debugging `_ diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index b7ebffbeb7187..ff2cdea6e8ee6 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -65,8 +65,6 @@ static cl::opt AllowIncompleteIR( extern llvm::cl::opt UseNewDbgInfoFormat; extern cl::opt PreserveInputDbgFormat; -extern bool WriteNewDbgInfoFormatToBitcode; -extern cl::opt WriteNewDbgInfoFormat; static std::string getTypeString(Type *T) { std::string Result; @@ -213,8 +211,6 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { "Mixed debug intrinsics/records seen without a parsing error?"); if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) { UseNewDbgInfoFormat = SeenNewDbgInfoFormat; - WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat; - WriteNewDbgInfoFormat = SeenNewDbgInfoFormat; M->setNewDbgInfoFormatFlag(SeenNewDbgInfoFormat); } diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index b0d9bcc384101..4de3c4f085ca7 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -102,18 +102,8 @@ static cl::opt ExpandConstantExprs( cl::desc( "Expand constant expressions to instructions for testing purposes")); -/// Load bitcode directly into RemoveDIs format (use debug records instead -/// of debug intrinsics). UNSET is treated as FALSE, so the default action -/// is to do nothing. Individual tools can override this to incrementally add -/// support for the RemoveDIs format. -cl::opt LoadBitcodeIntoNewDbgInfoFormat( - "load-bitcode-into-experimental-debuginfo-iterators", cl::Hidden, - cl::desc("Load bitcode directly into the new debug info format (regardless " - "of input format)")); extern cl::opt UseNewDbgInfoFormat; extern cl::opt PreserveInputDbgFormat; -extern bool WriteNewDbgInfoFormatToBitcode; -extern cl::opt WriteNewDbgInfoFormat; namespace { @@ -4492,14 +4482,9 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord( Error BitcodeReader::parseModule(uint64_t ResumeBit, bool ShouldLazyLoadMetadata, ParserCallbacks Callbacks) { - // Load directly into RemoveDIs format if LoadBitcodeIntoNewDbgInfoFormat - // has been set to true and we aren't attempting to preserve the existing - // format in the bitcode (default action: load into the old debug format). - if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE) { - TheModule->IsNewDbgInfoFormat = - UseNewDbgInfoFormat && - LoadBitcodeIntoNewDbgInfoFormat != cl::boolOrDefault::BOU_FALSE; - } + // In preparation for the deletion of debug-intrinsics, don't allow module + // loading to escape intrinsics being autoupgraded to debug records. + TheModule->IsNewDbgInfoFormat = UseNewDbgInfoFormat; this->ValueTypeCallback = std::move(Callbacks.ValueType); if (ResumeBit) { @@ -7026,8 +7011,6 @@ Error BitcodeReader::materialize(GlobalValue *GV) { SeenAnyDebugInfo ? 
SeenDebugRecord : F->getParent()->IsNewDbgInfoFormat; if (SeenAnyDebugInfo) { UseNewDbgInfoFormat = SeenDebugRecord; - WriteNewDbgInfoFormatToBitcode = SeenDebugRecord; - WriteNewDbgInfoFormat = SeenDebugRecord; } // If the module's debug info format doesn't match the observed input // format, then set its format now; we don't need to call the conversion diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 4a0db9d76f44a..ad15f13902e63 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -103,7 +103,6 @@ namespace llvm { extern FunctionSummary::ForceSummaryHotnessType ForceSummaryEdgesCold; } -extern bool WriteNewDbgInfoFormatToBitcode; extern llvm::cl::opt UseNewDbgInfoFormat; namespace { @@ -3710,7 +3709,7 @@ void ModuleBitcodeWriter::writeFunction( // they come after the instruction so that it's easy to attach them again // when reading the bitcode, even though conceptually the debug locations // start "before" the instruction. - if (I.hasDbgRecords() && WriteNewDbgInfoFormatToBitcode) { + if (I.hasDbgRecords()) { /// Try to push the value only (unwrapped), otherwise push the /// metadata wrapped value. Returns true if the value was pushed /// without the ValueAsMetadata wrapper. diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp index 5f66e1ea0a835..fb393d33df3b2 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp @@ -18,11 +18,8 @@ #include "llvm/Pass.h" using namespace llvm; -extern bool WriteNewDbgInfoFormatToBitcode; - PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { - ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat && - WriteNewDbgInfoFormatToBitcode); + ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat); if (M.IsNewDbgInfoFormat) M.removeDebugIntrinsicDeclarations(); @@ -54,8 +51,7 @@ namespace { StringRef getPassName() const override { return "Bitcode Writer"; } bool runOnModule(Module &M) override { - ScopedDbgInfoFormatSetter FormatSetter( - M, M.IsNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode); + ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat); if (M.IsNewDbgInfoFormat) M.removeDebugIntrinsicDeclarations(); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index e9bd60e4e2597..2f08fcda1fbd0 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -69,7 +69,7 @@ static cl::opt SimplifyMIR( static cl::opt PrintLocations("mir-debug-loc", cl::Hidden, cl::init(true), cl::desc("Print MIR debug-locations")); -extern cl::opt WriteNewDbgInfoFormat; +extern cl::opt UseNewDbgInfoFormat; namespace { @@ -1050,7 +1050,7 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, void llvm::printMIR(raw_ostream &OS, const Module &M) { ScopedDbgInfoFormatSetter FormatSetter(const_cast(M), - WriteNewDbgInfoFormat); + UseNewDbgInfoFormat); yaml::Output Out(OS); Out << const_cast(M); @@ -1061,7 +1061,7 @@ void llvm::printMIR(raw_ostream &OS, const MachineModuleInfo &MMI, // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info // in dbg.value format. 
ScopedDbgInfoFormatSetter FormatSetter( - const_cast(MF.getFunction()), WriteNewDbgInfoFormat); + const_cast(MF.getFunction()), UseNewDbgInfoFormat); MIRPrinter Printer(OS, MMI); Printer.print(MF); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index dca42a57fa9e3..d9ff0687480a4 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -44,11 +44,6 @@ cl::opt PreserveInputDbgFormat( "contain debug records or intrinsics. Ignored in llvm-link, " "llvm-lto, and llvm-lto2.")); -bool WriteNewDbgInfoFormatToBitcode /*set default value in cl::init() below*/; -cl::opt WriteNewDbgInfoFormatToBitcode2( - "write-experimental-debuginfo-iterators-to-bitcode", cl::Hidden, - cl::location(WriteNewDbgInfoFormatToBitcode), cl::init(true)); - DbgMarker *BasicBlock::createMarker(Instruction *I) { assert(IsNewDbgInfoFormat && "Tried to create a marker in a non new debug-info block!"); diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index 0dab0c9381635..96396d06ebba3 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -23,11 +23,7 @@ using namespace llvm; -cl::opt WriteNewDbgInfoFormat( - "write-experimental-debuginfo", - cl::desc("Write debug info in the new non-intrinsic format. Has no effect " - "if --preserve-input-debuginfo-format=true."), - cl::init(true)); +extern cl::opt UseNewDbgInfoFormat; namespace { @@ -45,13 +41,11 @@ class PrintModulePassWrapper : public ModulePass { ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} bool runOnModule(Module &M) override { - // RemoveDIs: Regardless of the format we've processed this module in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat); // Remove intrinsic declarations when printing in the new format. // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to // update test output. - if (WriteNewDbgInfoFormat) + if (UseNewDbgInfoFormat) M.removeDebugIntrinsicDeclarations(); if (llvm::isFunctionInPrintList("*")) { @@ -93,9 +87,7 @@ class PrintFunctionPassWrapper : public FunctionPass { // This pass just prints a banner followed by the function as it's processed. bool runOnFunction(Function &F) override { - // RemoveDIs: Regardless of the format we've processed this function in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. 
- ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); + ScopedDbgInfoFormatSetter FormatSetter(F, UseNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp index 026fa4d746d8b..b1750a5ca3091 100644 --- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp +++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp @@ -22,7 +22,7 @@ using namespace llvm; -extern cl::opt WriteNewDbgInfoFormat; +extern cl::opt UseNewDbgInfoFormat; PrintModulePass::PrintModulePass() : OS(dbgs()) {} PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, @@ -33,13 +33,11 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, EmitSummaryIndex(EmitSummaryIndex) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { - // RemoveDIs: Regardless of the format we've processed this module in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(M, WriteNewDbgInfoFormat); + ScopedDbgInfoFormatSetter FormatSetter(M, UseNewDbgInfoFormat); // Remove intrinsic declarations when printing in the new format. // TODO: Move this into Module::setIsNewDbgInfoFormat when we're ready to // update test output. - if (WriteNewDbgInfoFormat) + if (UseNewDbgInfoFormat) M.removeDebugIntrinsicDeclarations(); if (llvm::isFunctionInPrintList("*")) { @@ -77,9 +75,7 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) PreservedAnalyses PrintFunctionPass::run(Function &F, FunctionAnalysisManager &) { - // RemoveDIs: Regardless of the format we've processed this function in, use - // `WriteNewDbgInfoFormat` to determine which format we use to write it. - ScopedDbgInfoFormatSetter FormatSetter(F, WriteNewDbgInfoFormat); + ScopedDbgInfoFormatSetter FormatSetter(F, UseNewDbgInfoFormat); if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index cd0e412bdf353..88abc6e560580 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -575,14 +575,13 @@ bool writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS, } } // anonymous namespace -extern bool WriteNewDbgInfoFormatToBitcode; + PreservedAnalyses llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { FunctionAnalysisManager &FAM = AM.getResult(M).getManager(); - ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat && - WriteNewDbgInfoFormatToBitcode); + ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat); if (M.IsNewDbgInfoFormat) M.removeDebugIntrinsicDeclarations(); diff --git a/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll index 1e9fd0df78922..a8c5c43c3a9f8 100644 --- a/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/X86/free-intrinsics.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -;; Pin this test to not use "RemoveDIs" non-intrinsic debug-info. We get the -;; correct output in that mode, but it generates spurious test changes, so -;; avoid that for the moment. 
-; RUN: opt -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size %s -S -o - --experimental-debuginfo-iterators=false | FileCheck %s --check-prefix=CHECK-SIZE -; RUN: opt -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=throughput %s -S -o - --experimental-debuginfo-iterators=false | FileCheck %s --check-prefix=CHECK-THROUGHPUT +; RUN: opt -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=code-size %s -S -o - | FileCheck %s --check-prefix=CHECK-SIZE +; RUN: opt -mtriple=x86_64-- -passes="print" 2>&1 -disable-output -cost-kind=throughput %s -S -o - | FileCheck %s --check-prefix=CHECK-THROUGHPUT define i32 @trivially_free() { ; CHECK-SIZE-LABEL: 'trivially_free' @@ -11,9 +8,6 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect() -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.assign(metadata ptr undef, metadata !6, metadata !DIExpression(), metadata !8, metadata ptr undef, metadata !DIExpression()), !dbg !9 -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.declare(metadata ptr undef, metadata !6, metadata !DIExpression()), !dbg !9 -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.label(metadata !10), !dbg !9 ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -31,9 +25,6 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect() -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.assign(metadata ptr undef, metadata !6, metadata !DIExpression(), metadata !8, metadata ptr undef, metadata !DIExpression()), !dbg !9 -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.declare(metadata ptr undef, metadata !6, metadata !DIExpression()), !dbg !9 -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.label(metadata !10), !dbg !9 ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -50,10 +41,6 @@ define i32 @trivially_free() { call void @llvm.assume(i1 undef) 
call void @llvm.experimental.noalias.scope.decl(metadata !4) call void @llvm.sideeffect() - call void @llvm.dbg.assign(metadata ptr undef, metadata !0, metadata !DIExpression(), metadata !10, metadata ptr undef, metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.declare(metadata ptr undef, metadata !0, metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.value(metadata i64 undef, i64 undef, metadata !DIExpression(), metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.label(metadata !2), !dbg !8 %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -71,10 +58,6 @@ declare i32 @llvm.annotation.i32(i32, ptr, ptr, i32) declare void @llvm.assume(i1) declare void @llvm.experimental.noalias.scope.decl(metadata) declare void @llvm.sideeffect() -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) -declare void @llvm.dbg.declare(metadata, metadata, metadata) -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) -declare void @llvm.dbg.label(metadata) declare ptr @llvm.invariant.start.p0(i64, ptr) declare void @llvm.invariant.end.p0(ptr, i64, ptr) declare ptr @llvm.launder.invariant.group.p0(ptr) diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll index 09fbd68d95e92..560af3d2b48fc 100644 --- a/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll +++ b/llvm/test/Analysis/CostModel/free-intrinsics-datalayout.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -;; Pin this test to not use "RemoveDIs" non-intrinsic debug-info. We get the -;; correct output in that mode, but it generates spurious test changes, so -;; avoid that for the moment. 
-; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=code-size %s -S -o - --experimental-debuginfo-iterators=false | FileCheck %s --check-prefix=CHECK-SIZE -; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=throughput %s -S -o - --experimental-debuginfo-iterators=false | FileCheck %s --check-prefix=CHECK-THROUGHPUT +; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=code-size %s -S -o - | FileCheck %s --check-prefix=CHECK-SIZE +; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=throughput %s -S -o - | FileCheck %s --check-prefix=CHECK-THROUGHPUT target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -13,9 +10,6 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect() -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.assign(metadata ptr undef, metadata !6, metadata !DIExpression(), metadata !8, metadata ptr undef, metadata !DIExpression()), !dbg !9 -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.declare(metadata ptr undef, metadata !6, metadata !DIExpression()), !dbg !9 -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.label(metadata !10), !dbg !9 ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -35,9 +29,6 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect() -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.assign(metadata ptr undef, metadata !6, metadata !DIExpression(), metadata !8, metadata ptr undef, metadata !DIExpression()), !dbg !9 -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.declare(metadata ptr undef, metadata !6, metadata !DIExpression()), !dbg !9 -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.label(metadata !10), !dbg !9 ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -56,10 +47,6 @@ define i32 @trivially_free() { call void @llvm.assume(i1 undef) call void @llvm.experimental.noalias.scope.decl(metadata !4) call 
void @llvm.sideeffect() - call void @llvm.dbg.assign(metadata ptr undef, metadata !0, metadata !DIExpression(), metadata !10, metadata ptr undef, metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.declare(metadata ptr undef, metadata !0, metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.value(metadata i64 undef, i64 undef, metadata !DIExpression(), metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.label(metadata !2), !dbg !8 %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -79,10 +66,6 @@ declare i32 @llvm.annotation.i32(i32, ptr, ptr, i32) declare void @llvm.assume(i1) declare void @llvm.experimental.noalias.scope.decl(metadata) declare void @llvm.sideeffect() -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) -declare void @llvm.dbg.declare(metadata, metadata, metadata) -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) -declare void @llvm.dbg.label(metadata) declare ptr @llvm.invariant.start.p0(i64, ptr) declare void @llvm.invariant.end.p0(ptr, i64, ptr) declare ptr @llvm.launder.invariant.group.p0(ptr) diff --git a/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll index 3e78c62a6fe25..53828f2f07277 100644 --- a/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll +++ b/llvm/test/Analysis/CostModel/free-intrinsics-no_info.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -;; Pin this test to not use "RemoveDIs" non-intrinsic debug-info. We get the -;; correct output in that mode, but it generates spurious test changes, so -;; avoid that for the moment. 
-; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=code-size %s -S -o - --experimental-debuginfo-iterators=false | FileCheck %s --check-prefix=CHECK-SIZE -; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=throughput %s -S -o - --experimental-debuginfo-iterators=false | FileCheck %s --check-prefix=CHECK-THROUGHPUT +; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=code-size %s -S -o - | FileCheck %s --check-prefix=CHECK-SIZE +; RUN: opt -passes="print" 2>&1 -disable-output -cost-kind=throughput %s -S -o - | FileCheck %s --check-prefix=CHECK-THROUGHPUT define i32 @trivially_free() { ; CHECK-SIZE-LABEL: 'trivially_free' @@ -11,9 +8,6 @@ define i32 @trivially_free() { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect() -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.assign(metadata ptr undef, metadata !6, metadata !DIExpression(), metadata !8, metadata ptr undef, metadata !DIExpression()), !dbg !9 -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.declare(metadata ptr undef, metadata !6, metadata !DIExpression()), !dbg !9 -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.label(metadata !10), !dbg !9 ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -33,9 +27,6 @@ define i32 @trivially_free() { ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.assume(i1 undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.experimental.noalias.scope.decl(metadata !3) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.sideeffect() -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.assign(metadata ptr undef, metadata !6, metadata !DIExpression(), metadata !8, metadata ptr undef, metadata !DIExpression()), !dbg !9 -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.declare(metadata ptr undef, metadata !6, metadata !DIExpression()), !dbg !9 -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.dbg.label(metadata !10), !dbg !9 ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -54,10 +45,6 @@ define i32 @trivially_free() { call void @llvm.assume(i1 undef) call void @llvm.experimental.noalias.scope.decl(metadata !4) call void 
@llvm.sideeffect() - call void @llvm.dbg.assign(metadata ptr undef, metadata !0, metadata !DIExpression(), metadata !10, metadata ptr undef, metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.declare(metadata ptr undef, metadata !0, metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.value(metadata i64 undef, i64 undef, metadata !DIExpression(), metadata !DIExpression()), !dbg !8 - call void @llvm.dbg.label(metadata !2), !dbg !8 %a1 = call ptr @llvm.invariant.start.p0(i64 1, ptr undef) call void @llvm.invariant.end.p0(ptr undef, i64 1, ptr undef) %a2 = call ptr @llvm.launder.invariant.group.p0(ptr undef) @@ -77,10 +64,6 @@ declare i32 @llvm.annotation.i32(i32, ptr, ptr, i32) declare void @llvm.assume(i1) declare void @llvm.experimental.noalias.scope.decl(metadata) declare void @llvm.sideeffect() -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) -declare void @llvm.dbg.declare(metadata, metadata, metadata) -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) -declare void @llvm.dbg.label(metadata) declare ptr @llvm.invariant.start.p0(i64, ptr) declare void @llvm.invariant.end.p0(ptr, i64, ptr) declare ptr @llvm.launder.invariant.group.p0(ptr) diff --git a/llvm/test/Bitcode/DIExpression-aggresult.ll b/llvm/test/Bitcode/DIExpression-aggresult.ll index 309ca1f1d47b1..c7f91492f8e2e 100644 --- a/llvm/test/Bitcode/DIExpression-aggresult.ll +++ b/llvm/test/Bitcode/DIExpression-aggresult.ll @@ -1,5 +1,4 @@ ; RUN: llvm-dis -o - %s.bc | FileCheck %s -; RUN: llvm-dis -o - %s.bc --load-bitcode-into-experimental-debuginfo-iterators=true | FileCheck %s %class.A = type { i32, i32, i32, i32 } define void @_Z3fooi(%class.A* sret(%class.A) %agg.result) #0 !dbg !3 { diff --git a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll b/llvm/test/Bitcode/dbg-intrinsics-autoupgrade.ll similarity index 52% rename from llvm/test/DebugInfo/print-non-instruction-debug-info.ll rename to llvm/test/Bitcode/dbg-intrinsics-autoupgrade.ll index 490f24ff76ff5..d4d66b7c4fb02 100644 --- a/llvm/test/DebugInfo/print-non-instruction-debug-info.ll +++ b/llvm/test/Bitcode/dbg-intrinsics-autoupgrade.ll @@ -1,39 +1,29 @@ -;; Test that we can write in the new debug info format. -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=false < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=false --write-experimental-debuginfo=true < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg +;; Test that we can read old debug intrinsics from bitcode, and autoupgrade +;; them to the new debug-records format. +; RUN: opt --passes=verify %s.bc -o - -S \ +; RUN: | FileCheck %s --implicit-check-not=llvm.dbg -;; Test also that the new flag is independent of the flag that enables use of -;; these non-instruction debug info during LLVM passes. -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=false < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,OLDDBG --implicit-check-not=llvm.dbg -; RUN: opt --passes=verify -S --experimental-debuginfo-iterators=true --write-experimental-debuginfo=true < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,NEWDBG --implicit-check-not=llvm.dbg +;; While we're at it, test the textual IR autoupgrade path too. 
+; RUN: opt --passes=verify %s -o - -S \ +; RUN: | FileCheck %s --implicit-check-not=llvm.dbg + +;; Bitcode file was assembled with llvm-as ./brains.ll -o out.bc +;; --write-experimental-debuginfo-iterators-to-bitcode=false +;; immediately before the latter flag was deleted. ; CHECK: @f(i32 %[[VAL_A:[0-9a-zA-Z]+]]) ; CHECK-NEXT: entry: -; OLDDBG-NEXT: call void @llvm.dbg.value(metadata i32 %[[VAL_A]], metadata ![[VAR_A:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_1:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]]) +; CHECK-NEXT: {{^}} #dbg_value(i32 %[[VAL_A]], ![[VAR_A:[0-9]+]], !DIExpression(), ![[LOC_1:[0-9]+]]) ; CHECK-NEXT: {{^}} %[[VAL_B:[0-9a-zA-Z]+]] = alloca -; OLDDBG-NEXT: call void @llvm.dbg.declare(metadata ptr %[[VAL_B]], metadata ![[VAR_B:[0-9]+]], metadata !DIExpression()), !dbg ![[LOC_2:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]]) +; CHECK-NEXT: {{^}} #dbg_declare(ptr %[[VAL_B]], ![[VAR_B:[0-9]+]], !DIExpression(), ![[LOC_2:[0-9]+]]) ; CHECK-NEXT: {{^}} %[[VAL_ADD:[0-9a-zA-Z]+]] = add i32 %[[VAL_A]], 5 -; OLDDBG-NEXT: call void @llvm.dbg.value(metadata !DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), metadata ![[VAR_A]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus)), !dbg ![[LOC_3:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), ![[LOC_3:[0-9]+]]) -; OLDDBG-NEXT: call void @llvm.dbg.label(metadata ![[LABEL_ID:[0-9]+]]), !dbg ![[LOC_3]] -; NEWDBG-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]], ![[LOC_3]]) +; CHECK-NEXT: {{^}} #dbg_value(!DIArgList(i32 %[[VAL_A]], i32 %[[VAL_ADD]]), ![[VAR_A]], !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), ![[LOC_3:[0-9]+]]) +; CHECK-NEXT: {{^}} #dbg_label(![[LABEL_ID:[0-9]+]], ![[LOC_3]]) ; CHECK-NEXT: {{^}} store i32 %[[VAL_ADD]]{{.+}}, !DIAssignID ![[ASSIGNID:[0-9]+]] -; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata i32 %[[VAL_ADD]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ptr %[[VAL_B]], metadata !DIExpression()), !dbg ![[LOC_4:[0-9]+]] -; NEWDBG-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]]) -; OLDDBG-NEXT: call void @llvm.dbg.assign(metadata ![[EMPTY:[0-9]+]], metadata ![[VAR_B]], metadata !DIExpression(), metadata ![[ASSIGNID]], metadata ![[EMPTY]], metadata !DIExpression()), !dbg ![[LOC_4]] -; NEWDBG-NEXT: {{^}} #dbg_assign(![[EMPTY:[0-9]+]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ![[EMPTY]], !DIExpression(), ![[LOC_4]]) +; CHECK-NEXT: {{^}} #dbg_assign(i32 %[[VAL_ADD]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ptr %[[VAL_B]], !DIExpression(), ![[LOC_4:[0-9]+]]) +; CHECK-NEXT: {{^}} #dbg_assign(![[EMPTY:[0-9]+]], ![[VAR_B]], !DIExpression(), ![[ASSIGNID]], ![[EMPTY]], !DIExpression(), ![[LOC_4]]) ; CHECK-NEXT: {{^}} ret i32 -; OLDDBG-DAG: declare void @llvm.dbg.value -; OLDDBG-DAG: declare void @llvm.dbg.declare -; OLDDBG-DAG: declare void @llvm.dbg.assign - ; CHECK-DAG: llvm.dbg.cu ; CHECK-DAG: ![[VAR_A]] = !DILocalVariable(name: "a" ; CHECK-DAG: ![[VAR_B]] = !DILocalVariable(name: "b" @@ -44,6 +34,28 @@ ; CHECK-DAG: ![[LABEL_ID]] = !DILabel( ; CHECK-DAG: ![[EMPTY]] = !{} +;; Also test that the bitcode file itself doesn't contain any debug records, +;; and instead has function calls, the storage for 
intrinsics. This is to +;; ensure we're actually testing the autoupgrade path from a bitcode file that +;; definitely contains intrinsics. + +; RUN: llvm-bcanalyzer %s.bc --dump --disable-histogram | FileCheck %s --check-prefix=BITCODE --implicit-check-not=FUNCTION_BLOCK --implicit-check-not=DEBUG_RECORD + +; BITCODE-LABEL: +;; Summary text, +; BITCODE: Block ID #12 (FUNCTION_BLOCK): + define dso_local i32 @f(i32 %a) !dbg !7 { entry: call void @llvm.dbg.value(metadata i32 %a, metadata !20, metadata !DIExpression()), !dbg !30 diff --git a/llvm/test/Bitcode/dbg-intrinsics-autoupgrade.ll.bc b/llvm/test/Bitcode/dbg-intrinsics-autoupgrade.ll.bc new file mode 100644 index 0000000000000000000000000000000000000000..99f5452e8144581c37c31435ef227f45b9109468 GIT binary patch literal 2256 zcmY*aeN0=|6~E@$eg@q4UK-Qb<=K0NkXc(t9(EvhY%AE$Tbz|@PMS?j+Y~<=8)ykJ zHsK@LCV#b$Zr0{RNMmg#L4&qT`G>S?dV^wLNtuedDypefh zqvuH>L%A+gbrn=sH4OK)G3F04vpd~MWOp@qvQx>|t8^lMvrgyEWSS`bp}_(|uP_Kb z0dF;g<^O|$Is!Ec4 zHmVr(@`=RrNJf3WgXFpFgG3EeiA7~IjSgUI(OVmnS))NaFEUVm$f4zr5Nu~dB#a~3#f26GliGJ_HPoC+WH3XuqYHi8X} z2oXlm6?YLES)$C6{VchZ_X?3t zAsR^y_TVFaVNfd!P71nP&yx=k`41nlmL*o($udK1fESEOznnMBxeV_Z>E(oBxkRs( z^$%o*u=bdBhC6gIWiR^ne&&c@7#_h#CXRJ%XGtjnh8GMsc>11EzmnJgP-c(}HtM%{ z<%yS4)}m;i;hYeVM{|xf&OQdLn0dPfJQaq)9uw-7Q^YDu0Q*5ndA~%g`G^nM#=mdy^jt!}RNk{;)Za1c=Z$(m@SgDt zVdfY1>zwl%W*bl0ik;U1eQ@?H*y$C9BZBVcU1F0Z%O&D|J9&Q+UJ+$j>@p6%oScAE zks>^}2v8={E?Y^2&^ZCAV*C80{>GHPG0klOOz#iNl>!Ra>mpdEgtgtroB6kSVsR$8MdyRE&-T?H@XN85V6FYN`cTARs%vZeZQIWdykS20*0H1t{Th12S3r4fP1FRnho9|? zOaR^zRJKJid52z@dg7i-Y7G$Uot%E^z^b3P>w{7u*QA0W@A`=CnRo&pHB~BkM1U-m z^0+f{QMAs=?9(adxLEZfGzEIOOfNI^8ZXg%3Bw#uFHX^go#f;5GIARL1N`Dd?buEd zDE?#x$d{n7MMyzAWIXhv0%n&2avMS;^=7>2m`>R*LcwFs$&>?%KPyU|muC>_f_1mt z<$$jT8DKUXDP)g5HIL;fxgrd(@qs=*5b8r$E?@e2JX}&DtUWD@lL+}wQ}T0ETy0`e z4O(J)-rMI!ZT^c5{%=}(K>4ah_U5ZDNSP&OaOJIBXScS=i<-Rh0DHiMHePtS+CAAp z#Y0JojbScz0W~UdD(Ux`?!zZ?_`39!8CGIPytWo|4lOEX%{*FORo zxe4U(ERavxpiiw)qAzM$9%Zz2yDI6*Pf7wxBHYJ7NQ?6AERWcL4;fX;BXbq)X&Kws zuF9hhEf`P(2KaXj$Vt~q(h=@I(%lC@4qpROdC8!Bd>TEH#1AF$yOJ2~9O6*~EJ@+H^>+6^7FPf@L*X^E z-MxcMFc1#5u&owArZo_5HHR#gaG %t.intrinsics.ll -; RUN: llvm-reduce --abort-on-invalid-reduction --test FileCheck --test-arg --check-prefixes=INTRINSIC-INTERESTINGNESS --test-arg %s --test-arg --input-file %t.intrinsics.ll -o %t -; RUN: FileCheck --check-prefixes=INTRINSIC-FINAL --input-file=%t %s --implicit-check-not=#dbg_value - ; Test that we can, in RemoveDIs mode / DbgVariableRecords mode (where variable location ; information isn't an instruction), remove one variable location assignment ; but not another. ; CHECK-INTERESTINGNESS: #dbg_value(i32 %added, -; INTRINSIC-INTERESTINGNESS: llvm.dbg.value(metadata i32 %added, ; CHECK-FINAL: %added = add ; CHECK-FINAL-NEXT: #dbg_value(i32 %added, -; INTRINSIC-FINAL: %added = add -; INTRINSIC-FINAL-NEXT: llvm.dbg.value(metadata i32 %added, define i32 @main() !dbg !7 { entry: diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp index 1c29a7b671c46..081bcb6faa719 100644 --- a/llvm/tools/llvm-as/llvm-as.cpp +++ b/llvm/tools/llvm-as/llvm-as.cpp @@ -67,7 +67,6 @@ static cl::opt ClDataLayout("data-layout", cl::value_desc("layout-string"), cl::init(""), cl::cat(AsCat)); extern cl::opt UseNewDbgInfoFormat; -extern bool WriteNewDbgInfoFormatToBitcode; static void WriteOutputFile(const Module *M, const ModuleSummaryIndex *Index) { // Infer the output filename if needed. @@ -142,8 +141,7 @@ int main(int argc, char **argv) { } // Convert to new debug format if requested. 
- M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat && - WriteNewDbgInfoFormatToBitcode); + M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat); if (M->IsNewDbgInfoFormat) M->removeDebugIntrinsicDeclarations(); diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index 49acc9cd456ff..8c1aaf6fd604b 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -96,9 +96,7 @@ static cl::opt PrintThinLTOIndexOnly( cl::desc("Only read thinlto index and print the index as LLVM assembly."), cl::init(false), cl::Hidden, cl::cat(DisCategory)); -extern cl::opt WriteNewDbgInfoFormat; - -extern cl::opt LoadBitcodeIntoNewDbgInfoFormat; +extern cl::opt UseNewDbgInfoFormat; namespace { @@ -187,10 +185,6 @@ int main(int argc, char **argv) { cl::HideUnrelatedOptions({&DisCategory, &getColorCategory()}); cl::ParseCommandLineOptions(argc, argv, "llvm .bc -> .ll disassembler\n"); - // Load bitcode into the new debug info format by default. - if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET) - LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE; - if (InputFilenames.size() < 1) { InputFilenames.push_back("-"); } else if (InputFilenames.size() > 1 && !OutputFilename.empty()) { @@ -276,8 +270,8 @@ int main(int argc, char **argv) { // All that llvm-dis does is write the assembly to a file. if (!DontPrint) { if (M) { - M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat); - if (WriteNewDbgInfoFormat) + M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat); + if (UseNewDbgInfoFormat) M->removeDebugIntrinsicDeclarations(); M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder); } diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index f0bf9d69bd0d3..ac90c8dc23284 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -131,10 +131,6 @@ static cl::opt IgnoreNonBitcode( extern cl::opt UseNewDbgInfoFormat; extern cl::opt PreserveInputDbgFormat; -extern cl::opt WriteNewDbgInfoFormat; -extern bool WriteNewDbgInfoFormatToBitcode; - -extern cl::opt LoadBitcodeIntoNewDbgInfoFormat; static ExitOnError ExitOnErr; @@ -483,10 +479,6 @@ int main(int argc, char **argv) { cl::HideUnrelatedOptions({&LinkCategory, &getColorCategory()}); cl::ParseCommandLineOptions(argc, argv, "llvm linker\n"); - // Load bitcode into the new debug info format by default. - if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET) - LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE; - // Since llvm-link collects multiple IR modules together, for simplicity's // sake we disable the "PreserveInputDbgFormat" flag to enforce a single // debug info format. 
@@ -545,10 +537,10 @@ int main(int argc, char **argv) { Composite->removeDebugIntrinsicDeclarations(); }; if (OutputAssembly) { - SetFormat(WriteNewDbgInfoFormat); + SetFormat(UseNewDbgInfoFormat); Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder); } else if (Force || !CheckBitcodeOutputToConsole(Out.os())) { - SetFormat(UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode); + SetFormat(UseNewDbgInfoFormat); WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder); } diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index bf2b5be977eb4..25a3b442f3ccd 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -264,7 +264,6 @@ static cl::opt LTOSaveBeforeOpt("lto-save-before-opt", cl::init(false), cl::desc("Save the IR before running optimizations")); -extern cl::opt LoadBitcodeIntoNewDbgInfoFormat; extern cl::opt PreserveInputDbgFormat; namespace { @@ -1001,9 +1000,6 @@ int main(int argc, char **argv) { InitLLVM X(argc, argv); cl::HideUnrelatedOptions({<OCategory, &getColorCategory()}); cl::ParseCommandLineOptions(argc, argv, "llvm LTO linker\n"); - // Load bitcode into the new debug info format by default. - if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET) - LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE; // Since llvm-lto collects multiple IR modules together, for simplicity's sake // we disable the "PreserveInputDbgFormat" flag to enforce a single debug info diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp index 76ff11b8d6412..c41e37bd5548a 100644 --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -199,7 +199,6 @@ static cl::opt AllVtablesHaveTypeInfos("all-vtables-have-type-infos", cl::Hidden, cl::desc("All vtables have type infos")); -extern cl::opt LoadBitcodeIntoNewDbgInfoFormat; extern cl::opt PreserveInputDbgFormat; static void check(Error E, std::string Msg) { @@ -236,9 +235,6 @@ static int usage() { static int run(int argc, char **argv) { cl::ParseCommandLineOptions(argc, argv, "Resolution-based LTO test harness"); - // Load bitcode into the new debug info format by default. 
- if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET) - LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE; // Since llvm-lto2 collects multiple IR modules together, for simplicity's // sake we disable the "PreserveInputDbgFormat" flag to enforce a single debug diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp index 24f4f11db9a8b..632fd7b0e1a32 100644 --- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp +++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -25,8 +25,7 @@ using namespace IRSimilarity; extern llvm::cl::opt UseNewDbgInfoFormat; extern cl::opt PreserveInputDbgFormat; -extern bool WriteNewDbgInfoFormatToBitcode; -extern cl::opt WriteNewDbgInfoFormat; +extern cl::opt UseNewDbgInfoFormat; static std::unique_ptr makeLLVMModule(LLVMContext &Context, StringRef ModuleStr) { diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 8d8f991e9ea49..cc9118034d5b5 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -30,8 +30,6 @@ using namespace llvm; extern llvm::cl::opt UseNewDbgInfoFormat; extern cl::opt PreserveInputDbgFormat; -extern bool WriteNewDbgInfoFormatToBitcode; -extern cl::opt WriteNewDbgInfoFormat; // Backup all of the existing settings that may be modified when // PreserveInputDbgFormat=true, so that when the test is finished we return them @@ -39,13 +37,9 @@ extern cl::opt WriteNewDbgInfoFormat; static auto SaveDbgInfoFormat() { return make_scope_exit( [OldPreserveInputDbgFormat = PreserveInputDbgFormat.getValue(), - OldUseNewDbgInfoFormat = UseNewDbgInfoFormat.getValue(), - OldWriteNewDbgInfoFormatToBitcode = WriteNewDbgInfoFormatToBitcode, - OldWriteNewDbgInfoFormat = WriteNewDbgInfoFormat.getValue()] { + OldUseNewDbgInfoFormat = UseNewDbgInfoFormat.getValue()] { PreserveInputDbgFormat = OldPreserveInputDbgFormat; UseNewDbgInfoFormat = OldUseNewDbgInfoFormat; - WriteNewDbgInfoFormatToBitcode = OldWriteNewDbgInfoFormatToBitcode; - WriteNewDbgInfoFormat = OldWriteNewDbgInfoFormat; }); } diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp index 92e786b130abb..e3fa7c883c524 100644 --- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp @@ -20,7 +20,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -extern llvm::cl::opt WriteNewDbgInfoFormat; +extern llvm::cl::opt UseNewDbgInfoFormat; using namespace mlir; @@ -38,8 +38,8 @@ void registerToLLVMIRTranslation() { // format that LLVM expects us to print. 
// See https://llvm.org/docs/RemoveDIsDebugInfo.html llvm::ScopedDbgInfoFormatSetter formatSetter(*llvmModule, - WriteNewDbgInfoFormat); - if (WriteNewDbgInfoFormat) + UseNewDbgInfoFormat); + if (UseNewDbgInfoFormat) llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(output, nullptr); return success(); diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir index ab39a29515cc2..274d64af78283 100644 --- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir @@ -1,5 +1,4 @@ -// RUN: mlir-translate -mlir-to-llvmir --write-experimental-debuginfo=false --split-input-file %s | FileCheck %s --check-prefixes=CHECK,INTRINSICS -// RUN: mlir-translate -mlir-to-llvmir --write-experimental-debuginfo=true --split-input-file %s | FileCheck %s --check-prefixes=CHECK,RECORDS +// RUN: mlir-translate -mlir-to-llvmir --split-input-file %s | FileCheck %s --check-prefixes=CHECK,RECORDS // CHECK-LABEL: define void @func_with_empty_named_info() // Check that translation doens't crash in the presence of an inlineble call @@ -100,15 +99,12 @@ llvm.func @func_with_debug(%arg: i64) { %allocCount = llvm.mlir.constant(1 : i32) : i32 %alloc = llvm.alloca %allocCount x i64 : (i32) -> !llvm.ptr - // INTRINSICS: call void @llvm.dbg.value(metadata i64 %[[ARG]], metadata ![[VAR_LOC:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 1)) // RECORDS: #dbg_value(i64 %[[ARG]], ![[VAR_LOC:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 1), !{{.*}}) llvm.intr.dbg.value #variable #llvm.di_expression<[DW_OP_LLVM_fragment(0, 1)]> = %arg : i64 - // INTRINSICS: call void @llvm.dbg.declare(metadata ptr %[[ALLOC]], metadata ![[ADDR_LOC:[0-9]+]], metadata !DIExpression(DW_OP_deref, DW_OP_LLVM_convert, 4, DW_ATE_signed)) // RECORDS: #dbg_declare(ptr %[[ALLOC]], ![[ADDR_LOC:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_LLVM_convert, 4, DW_ATE_signed), !{{.*}}) llvm.intr.dbg.declare #variableAddr #llvm.di_expression<[DW_OP_deref, DW_OP_LLVM_convert(4, DW_ATE_signed)]> = %alloc : !llvm.ptr - // INTRINSICS: call void @llvm.dbg.value(metadata i64 %[[ARG]], metadata ![[NO_NAME_VAR:[0-9]+]], metadata !DIExpression()) // RECORDS: #dbg_value(i64 %[[ARG]], ![[NO_NAME_VAR:[0-9]+]], !DIExpression(), !{{.*}}) llvm.intr.dbg.value #noNameVariable = %arg : i64 @@ -230,13 +226,10 @@ llvm.func @func_decl_with_subprogram() -> (i32) loc(fused<#di_subprogram>["foo.m // CHECK-LABEL: define i32 @func_with_inlined_dbg_value( // CHECK-SAME: i32 %[[ARG:.*]]) !dbg ![[OUTER_FUNC:[0-9]+]] llvm.func @func_with_inlined_dbg_value(%arg0: i32) -> (i32) { - // INTRINSICS: call void @llvm.dbg.value(metadata i32 %[[ARG]], metadata ![[VAR_LOC0:[0-9]+]], metadata !DIExpression()), !dbg ![[DBG_LOC0:.*]] // RECORDS: #dbg_value(i32 %[[ARG]], ![[VAR_LOC0:[0-9]+]], !DIExpression(), ![[DBG_LOC0:.*]]) llvm.intr.dbg.value #di_local_variable0 = %arg0 : i32 loc(fused<#di_subprogram>[#loc0]) - // INTRINSICS: call void @llvm.dbg.value(metadata i32 %[[ARG]], metadata ![[VAR_LOC1:[0-9]+]], metadata !DIExpression()), !dbg ![[DBG_LOC1:.*]] // RECORDS: #dbg_value(i32 %[[ARG]], ![[VAR_LOC1:[0-9]+]], !DIExpression(), ![[DBG_LOC1:.*]]) llvm.intr.dbg.value #di_local_variable1 = %arg0 : i32 loc(#loc1) - // INTRINSICS: call void @llvm.dbg.label(metadata ![[LABEL:[0-9]+]]), !dbg ![[DBG_LOC1:.*]] // RECORDS: #dbg_label(![[LABEL:[0-9]+]], ![[DBG_LOC1:.*]]) llvm.intr.dbg.label #di_label loc(#loc1) llvm.return %arg0 : i32 @@ -268,7 +261,6 @@ llvm.func @func_with_inlined_dbg_value(%arg0: i32) -> (i32) { // 
CHECK-LABEL: define void @func_without_subprogram( // CHECK-SAME: i32 %[[ARG:.*]]) llvm.func @func_without_subprogram(%0 : i32) { - // INTRINSICS: call void @llvm.dbg.value(metadata i32 %[[ARG]], metadata ![[VAR_LOC:[0-9]+]], metadata !DIExpression()), !dbg ![[DBG_LOC0:.*]] // RECORDS: #dbg_value(i32 %[[ARG]], ![[VAR_LOC:[0-9]+]], !DIExpression(), ![[DBG_LOC0:.*]]) llvm.intr.dbg.value #di_local_variable = %0 : i32 loc(fused<#di_subprogram>[#loc]) llvm.return @@ -300,13 +292,10 @@ llvm.func @func_without_subprogram(%0 : i32) { llvm.func @dbg_intrinsics_with_no_location(%arg0: i32) -> (i32) { %allocCount = llvm.mlir.constant(1 : i32) : i32 %alloc = llvm.alloca %allocCount x i64 : (i32) -> !llvm.ptr - // INTRINSICS-NOT: @llvm.dbg.value // RECORDS-NOT: #dbg_value llvm.intr.dbg.value #di_local_variable = %arg0 : i32 - // INTRINSICS-NOT: @llvm.dbg.declare // RECORDS-NOT: #dbg_declare llvm.intr.dbg.declare #declared_var = %alloc : !llvm.ptr - // INTRINSICS-NOT: @llvm.dbg.label // RECORDS-NOT: #dbg_label llvm.intr.dbg.label #di_label llvm.return %arg0 : i32 diff --git a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp index 157c6265be834..103817df41d34 100644 --- a/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp +++ b/mlir/test/lib/Dialect/Test/TestToLLVMIRTranslation.cpp @@ -24,8 +24,6 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/IR/DebugProgramInstruction.h" -extern llvm::cl::opt WriteNewDbgInfoFormat; - using namespace mlir; namespace { @@ -125,13 +123,7 @@ void registerTestToLLVMIR() { if (!llvmModule) return failure(); - // When printing LLVM IR, we should convert the module to the debug info - // format that LLVM expects us to print. - // See https://llvm.org/docs/RemoveDIsDebugInfo.html - llvm::ScopedDbgInfoFormatSetter formatSetter(*llvmModule, - WriteNewDbgInfoFormat); - if (WriteNewDbgInfoFormat) - llvmModule->removeDebugIntrinsicDeclarations(); + llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(output, nullptr); return success(); }, From bcf0f8d8aa27910545762e2cc7733a11a00954d0 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 1 Apr 2025 14:39:17 +0100 Subject: [PATCH 0239/1029] [libclc] Move exp10 to the CLC library (#133899) The builtin was already nominally in the CLC library; this commit just moves it over. It also vectorizes the builtin on its way. 
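For context, a sketch of the range reduction the moved implementation uses,
summarising the algorithm comment in clc_exp10.inc below: with n = 64*m + j
rounded from x * 64*ln(10)/ln(2),

  10^x = e^(x*ln(10)) = 2^m * 2^(j/64) * e^r,   r = x*ln(10) - n*(ln(2)/64)

where the 2^(j/64) values come from the EXP_TBL table added to clc_tables.cl
and e^r is evaluated with a short truncated Taylor series.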
--- libclc/clc/include/clc/math/clc_exp10.h | 20 +++ libclc/clc/include/clc/math/tables.h | 2 +- libclc/clc/lib/generic/SOURCES | 1 + libclc/clc/lib/generic/math/clc_exp10.cl | 21 +++ libclc/clc/lib/generic/math/clc_exp10.inc | 155 ++++++++++++++++++++++ libclc/clc/lib/generic/math/clc_tables.cl | 22 +++ libclc/clspv/lib/SOURCES | 1 - libclc/generic/lib/SOURCES | 1 - libclc/generic/lib/math/clc_exp10.cl | 152 --------------------- libclc/generic/lib/math/exp10.cl | 6 +- libclc/generic/lib/math/tables.cl | 69 ---------- libclc/spirv/lib/SOURCES | 1 - 12 files changed, 223 insertions(+), 228 deletions(-) create mode 100644 libclc/clc/include/clc/math/clc_exp10.h create mode 100644 libclc/clc/lib/generic/math/clc_exp10.cl create mode 100644 libclc/clc/lib/generic/math/clc_exp10.inc delete mode 100644 libclc/generic/lib/math/clc_exp10.cl diff --git a/libclc/clc/include/clc/math/clc_exp10.h b/libclc/clc/include/clc/math/clc_exp10.h new file mode 100644 index 0000000000000..4f98bf7de6a74 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_exp10.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_EXP10_H__ +#define __CLC_MATH_CLC_EXP10_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_exp10 + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_EXP10_H__ diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h index 3120a18cc996e..44679df8d2cdc 100644 --- a/libclc/clc/include/clc/math/tables.h +++ b/libclc/clc/include/clc/math/tables.h @@ -64,13 +64,13 @@ TABLE_FUNCTION_DECL(float2, log10_tbl); TABLE_FUNCTION_DECL(uint4, pibits_tbl); TABLE_FUNCTION_DECL(float2, sinhcosh_tbl); TABLE_FUNCTION_DECL(float2, cbrt_tbl); -TABLE_FUNCTION_DECL(float, exp_tbl); CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_head); CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_tail); CLC_TABLE_FUNCTION_DECL(float, loge_tbl_lo); CLC_TABLE_FUNCTION_DECL(float, loge_tbl_hi); CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl); +CLC_TABLE_FUNCTION_DECL(float, exp_tbl); CLC_TABLE_FUNCTION_DECL(float, exp_tbl_ep_head); CLC_TABLE_FUNCTION_DECL(float, exp_tbl_ep_tail); diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index 474b11d745a44..becfa3ff6dbed 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -32,6 +32,7 @@ math/clc_ceil.cl math/clc_copysign.cl math/clc_cospi.cl math/clc_ep_log.cl +math/clc_exp10.cl math/clc_fabs.cl math/clc_fma.cl math/clc_fmod.cl diff --git a/libclc/clc/lib/generic/math/clc_exp10.cl b/libclc/clc/lib/generic/math/clc_exp10.cl new file mode 100644 index 0000000000000..04e912ed98885 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp10.cl @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_exp10.inc b/libclc/clc/lib/generic/math/clc_exp10.inc new file mode 100644 index 0000000000000..96bc5331fef17 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp10.inc @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Algorithm: +// +// e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) +// +// x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer +// n = 64*m + j, 0 <= j < 64 +// +// e^x = 2^((64*m + j + f)/64) +// = (2^m) * (2^(j/64)) * 2^(f/64) +// = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) +// +// f = x*(64/ln(2)) - n +// r = f*(ln(2)/64) = x - n*(ln(2)/64) +// +// e^x = (2^m) * (2^(j/64)) * e^r +// +// (2^(j/64)) is precomputed +// +// e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! +// e^r = 1 + q +// +// q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! +// +// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_exp10(__CLC_GENTYPE x) { + // 128*log2/log10 : 38.53183944498959 + const __CLC_GENTYPE X_MAX = 0x1.344134p+5f; + // -149*log2/log10 : -44.8534693539332 + const __CLC_GENTYPE X_MIN = -0x1.66d3e8p+5f; + // 64*log10/log2 : 212.6033980727912 + const __CLC_GENTYPE R_64_BY_LOG10_2 = 0x1.a934f0p+7f; + // log2/(64 * log10) lead : 0.004699707 + const __CLC_GENTYPE R_LOG10_2_BY_64_LD = 0x1.340000p-8f; + // log2/(64 * log10) tail : 0.00000388665057 + const __CLC_GENTYPE R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; + const __CLC_GENTYPE R_LN10 = 0x1.26bb1cp+1f; + + __CLC_INTN return_nan = __clc_isnan(x); + __CLC_INTN return_inf = x > X_MAX; + __CLC_INTN return_zero = x < X_MIN; + + __CLC_INTN n = __CLC_CONVERT_INTN(x * R_64_BY_LOG10_2); + + __CLC_GENTYPE fn = __CLC_CONVERT_GENTYPE(n); + __CLC_INTN j = n & 0x3f; + __CLC_INTN m = n >> 6; + __CLC_INTN m2 = m << EXPSHIFTBITS_SP32; + __CLC_GENTYPE r; + + r = R_LN10 * + __clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x)); + + // Truncated Taylor series for e^r + __CLC_GENTYPE z2 = + __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r, + 0x1.000000p-1f), + r * r, r); + + __CLC_GENTYPE two_to_jby64 = USE_TABLE(exp_tbl, j); + z2 = __clc_mad(two_to_jby64, z2, two_to_jby64); + + __CLC_GENTYPE z2s = z2 * __CLC_AS_GENTYPE((__CLC_UINTN)0x1 << (m + 149)); + __CLC_GENTYPE z2n = __CLC_AS_GENTYPE(__CLC_AS_INTN(z2) + m2); + z2 = m <= -126 ? z2s : z2n; + + z2 = return_inf ? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32) : z2; + z2 = return_zero ? 0.0f : z2; + z2 = return_nan ? 
x : z2; + return z2; +} + +#elif __CLC_FPSIZE == 64 + +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_exp10(__CLC_GENTYPE x) { + // 1024*ln(2)/ln(10) + const __CLC_GENTYPE X_MAX = 0x1.34413509f79ffp+8; + // -1074*ln(2)/ln(10) + const __CLC_GENTYPE X_MIN = -0x1.434e6420f4374p+8; + // 64*ln(10)/ln(2) + const __CLC_GENTYPE R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; + // head ln(2)/(64*ln(10)) + const __CLC_GENTYPE R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; + // tail ln(2)/(64*ln(10)) + const __CLC_GENTYPE R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; + // ln(10) + const __CLC_GENTYPE R_LN10 = 0x1.26bb1bbb55516p+1; + + __CLC_INTN n = __CLC_CONVERT_INTN(x * R_64_BY_LOG10_2); + + __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n); + + __CLC_INTN j = n & 0x3f; + __CLC_INTN m = n >> 6; + + __CLC_GENTYPE r = R_LN10 * __clc_fma(-R_LOG10_2_BY_64_TL, dn, + __clc_fma(-R_LOG10_2_BY_64_LD, dn, x)); + + // 6 term tail of Taylor expansion of e^r + __CLC_GENTYPE z2 = + r * __clc_fma( + r, + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, 0x1.6c16c16c16c17p-10, + 0x1.1111111111111p-7), + 0x1.5555555555555p-5), + 0x1.5555555555555p-3), + 0x1.0000000000000p-1), + 1.0); + + __CLC_GENTYPE tv0 = USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE tv1 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + z2 = __clc_fma(tv0 + tv1, z2, tv1) + tv0; + + __CLC_INTN small_value = + (m < -1022) || ((m == -1022) && __CLC_CONVERT_INTN(z2 < 1.0)); + + __CLC_INTN n1 = m >> 2; + __CLC_INTN n2 = m - n1; + __CLC_GENTYPE z3 = + z2 * __CLC_AS_GENTYPE((__CLC_CONVERT_LONGN(n1) + 1023) << 52); + z3 *= __CLC_AS_GENTYPE((__CLC_CONVERT_LONGN(n2) + 1023) << 52); + + z2 = __clc_ldexp(z2, m); + z2 = __CLC_CONVERT_LONGN(small_value) ? z3 : z2; + + z2 = __clc_isnan(x) ? x : z2; + + z2 = x > X_MAX ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : z2; + z2 = x < X_MIN ? 
0.0 : z2; + + return z2; +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_exp10(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_exp10(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl index 040889e59c10e..981004c8c6cd9 100644 --- a/libclc/clc/lib/generic/math/clc_tables.cl +++ b/libclc/clc/lib/generic/math/clc_tables.cl @@ -197,6 +197,28 @@ DECLARE_TABLE(float, LOG_INV_TBL, 129) = { CLC_TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl); +DECLARE_TABLE(float, EXP_TBL, 65) = { + 0x1.000000p+0f, 0x1.02c9a4p+0f, 0x1.059b0ep+0f, 0x1.087452p+0f, + 0x1.0b5586p+0f, 0x1.0e3ec4p+0f, 0x1.11301ep+0f, 0x1.1429aap+0f, + 0x1.172b84p+0f, 0x1.1a35bep+0f, 0x1.1d4874p+0f, 0x1.2063b8p+0f, + 0x1.2387a6p+0f, 0x1.26b456p+0f, 0x1.29e9e0p+0f, 0x1.2d285ap+0f, + 0x1.306fe0p+0f, 0x1.33c08cp+0f, 0x1.371a74p+0f, 0x1.3a7db4p+0f, + 0x1.3dea64p+0f, 0x1.4160a2p+0f, 0x1.44e086p+0f, 0x1.486a2cp+0f, + 0x1.4bfdaep+0f, 0x1.4f9b28p+0f, 0x1.5342b6p+0f, 0x1.56f474p+0f, + 0x1.5ab07ep+0f, 0x1.5e76f2p+0f, 0x1.6247ecp+0f, 0x1.662388p+0f, + 0x1.6a09e6p+0f, 0x1.6dfb24p+0f, 0x1.71f75ep+0f, 0x1.75feb6p+0f, + 0x1.7a1148p+0f, 0x1.7e2f34p+0f, 0x1.82589ap+0f, 0x1.868d9ap+0f, + 0x1.8ace54p+0f, 0x1.8f1aeap+0f, 0x1.93737cp+0f, 0x1.97d82ap+0f, + 0x1.9c4918p+0f, 0x1.a0c668p+0f, 0x1.a5503cp+0f, 0x1.a9e6b6p+0f, + 0x1.ae89fap+0f, 0x1.b33a2cp+0f, 0x1.b7f770p+0f, 0x1.bcc1eap+0f, + 0x1.c199bep+0f, 0x1.c67f12p+0f, 0x1.cb720ep+0f, 0x1.d072d4p+0f, + 0x1.d5818ep+0f, 0x1.da9e60p+0f, 0x1.dfc974p+0f, 0x1.e502eep+0f, + 0x1.ea4afap+0f, 0x1.efa1bep+0f, 0x1.f50766p+0f, 0x1.fa7c18p+0f, + 0x1.000000p+1f, +}; + +CLC_TABLE_FUNCTION(float, EXP_TBL, exp_tbl); + DECLARE_TABLE(float, EXP_TBL_EP_HEAD, 65) = { 0x1.000000p+0f, 0x1.02c000p+0f, 0x1.058000p+0f, 0x1.084000p+0f, 0x1.0b4000p+0f, 0x1.0e0000p+0f, 0x1.110000p+0f, 0x1.140000p+0f, diff --git a/libclc/clspv/lib/SOURCES b/libclc/clspv/lib/SOURCES index 0ef09d275243b..64122395438aa 100644 --- a/libclc/clspv/lib/SOURCES +++ b/libclc/clspv/lib/SOURCES @@ -16,7 +16,6 @@ subnormal_config.cl ../../generic/lib/math/atanh.cl ../../generic/lib/math/atanpi.cl ../../generic/lib/math/cbrt.cl -../../generic/lib/math/clc_exp10.cl ../../generic/lib/math/clc_tan.cl ../../generic/lib/math/cos.cl ../../generic/lib/math/cosh.cl diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index 9b5bbc5d9b53c..a9dc2304c0d0e 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -100,7 +100,6 @@ math/exp.cl math/exp_helper.cl math/expm1.cl math/exp2.cl -math/clc_exp10.cl math/exp10.cl math/fabs.cl math/fdim.cl diff --git a/libclc/generic/lib/math/clc_exp10.cl b/libclc/generic/lib/math/clc_exp10.cl deleted file mode 100644 index 0b6f2f9f26db1..0000000000000 --- a/libclc/generic/lib/math/clc_exp10.cl +++ /dev/null @@ -1,152 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Algorithm: -// -// e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) -// -// x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer -// n = 64*m + j, 0 <= j < 64 -// -// e^x = 2^((64*m + j + f)/64) -// = (2^m) * (2^(j/64)) * 2^(f/64) -// = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) -// -// f = x*(64/ln(2)) - n -// r = f*(ln(2)/64) = x - n*(ln(2)/64) -// -// e^x = (2^m) * (2^(j/64)) * e^r -// -// (2^(j/64)) is precomputed -// -// e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! -// e^r = 1 + q -// -// q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! -// -// e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) - -_CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) { - // 128*log2/log10 : 38.53183944498959 - const float X_MAX = 0x1.344134p+5f; - // -149*log2/log10 : -44.8534693539332 - const float X_MIN = -0x1.66d3e8p+5f; - // 64*log10/log2 : 212.6033980727912 - const float R_64_BY_LOG10_2 = 0x1.a934f0p+7f; - // log2/(64 * log10) lead : 0.004699707 - const float R_LOG10_2_BY_64_LD = 0x1.340000p-8f; - // log2/(64 * log10) tail : 0.00000388665057 - const float R_LOG10_2_BY_64_TL = 0x1.04d426p-18f; - const float R_LN10 = 0x1.26bb1cp+1f; - - int return_nan = __clc_isnan(x); - int return_inf = x > X_MAX; - int return_zero = x < X_MIN; - - int n = __clc_convert_int(x * R_64_BY_LOG10_2); - - float fn = (float)n; - int j = n & 0x3f; - int m = n >> 6; - int m2 = m << EXPSHIFTBITS_SP32; - float r; - - r = R_LN10 * - __clc_mad(fn, -R_LOG10_2_BY_64_TL, __clc_mad(fn, -R_LOG10_2_BY_64_LD, x)); - - // Truncated Taylor series for e^r - float z2 = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), - r, 0x1.000000p-1f), - r * r, r); - - float two_to_jby64 = USE_TABLE(exp_tbl, j); - z2 = __clc_mad(two_to_jby64, z2, two_to_jby64); - - float z2s = z2 * __clc_as_float(0x1 << (m + 149)); - float z2n = __clc_as_float(__clc_as_int(z2) + m2); - z2 = m <= -126 ? z2s : z2n; - - z2 = return_inf ? __clc_as_float(PINFBITPATT_SP32) : z2; - z2 = return_zero ? 0.0f : z2; - z2 = return_nan ? 
x : z2; - return z2; -} -_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_exp10, float) - -#ifdef cl_khr_fp64 -_CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) { - // 1024*ln(2)/ln(10) - const double X_MAX = 0x1.34413509f79ffp+8; - // -1074*ln(2)/ln(10) - const double X_MIN = -0x1.434e6420f4374p+8; - // 64*ln(10)/ln(2) - const double R_64_BY_LOG10_2 = 0x1.a934f0979a371p+7; - // head ln(2)/(64*ln(10)) - const double R_LOG10_2_BY_64_LD = 0x1.3441350000000p-8; - // tail ln(2)/(64*ln(10)) - const double R_LOG10_2_BY_64_TL = 0x1.3ef3fde623e25p-37; - // ln(10) - const double R_LN10 = 0x1.26bb1bbb55516p+1; - - int n = __clc_convert_int(x * R_64_BY_LOG10_2); - - double dn = (double)n; - - int j = n & 0x3f; - int m = n >> 6; - - double r = R_LN10 * __clc_fma(-R_LOG10_2_BY_64_TL, dn, - __clc_fma(-R_LOG10_2_BY_64_LD, dn, x)); - - // 6 term tail of Taylor expansion of e^r - double z2 = - r * __clc_fma( - r, - __clc_fma(r, - __clc_fma(r, - __clc_fma(r, - __clc_fma(r, 0x1.6c16c16c16c17p-10, - 0x1.1111111111111p-7), - 0x1.5555555555555p-5), - 0x1.5555555555555p-3), - 0x1.0000000000000p-1), - 1.0); - - double tv0 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - double tv1 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); - z2 = __clc_fma(tv0 + tv1, z2, tv1) + tv0; - - int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0)); - - int n1 = m >> 2; - int n2 = m - n1; - double z3 = z2 * __clc_as_double(((long)n1 + 1023) << 52); - z3 *= __clc_as_double(((long)n2 + 1023) << 52); - - z2 = ldexp(z2, m); - z2 = small_value ? z3 : z2; - - z2 = __clc_isnan(x) ? x : z2; - - z2 = x > X_MAX ? __clc_as_double(PINFBITPATT_DP64) : z2; - z2 = x < X_MIN ? 0.0 : z2; - - return z2; -} -_CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double) -#endif diff --git a/libclc/generic/lib/math/exp10.cl b/libclc/generic/lib/math/exp10.cl index a9bba1f85d523..79544b26fa539 100644 --- a/libclc/generic/lib/math/exp10.cl +++ b/libclc/generic/lib/math/exp10.cl @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include -#include +#include -#define __CLC_FUNC exp10 -#define __CLC_BODY +#define FUNCTION exp10 +#define __CLC_BODY #include diff --git a/libclc/generic/lib/math/tables.cl b/libclc/generic/lib/math/tables.cl index 5534ae36d4e5b..b2f2626519a6e 100644 --- a/libclc/generic/lib/math/tables.cl +++ b/libclc/generic/lib/math/tables.cl @@ -462,74 +462,6 @@ DECLARE_TABLE(float2, CBRT_TBL, 129) = { (float2)(0x1.428000p+0f, 0x1.45f31ap-13f) }; -DECLARE_TABLE(float, EXP_TBL, 65) = { - 0x1.000000p+0f, - 0x1.02c9a4p+0f, - 0x1.059b0ep+0f, - 0x1.087452p+0f, - 0x1.0b5586p+0f, - 0x1.0e3ec4p+0f, - 0x1.11301ep+0f, - 0x1.1429aap+0f, - 0x1.172b84p+0f, - 0x1.1a35bep+0f, - 0x1.1d4874p+0f, - 0x1.2063b8p+0f, - 0x1.2387a6p+0f, - 0x1.26b456p+0f, - 0x1.29e9e0p+0f, - 0x1.2d285ap+0f, - 0x1.306fe0p+0f, - 0x1.33c08cp+0f, - 0x1.371a74p+0f, - 0x1.3a7db4p+0f, - 0x1.3dea64p+0f, - 0x1.4160a2p+0f, - 0x1.44e086p+0f, - 0x1.486a2cp+0f, - 0x1.4bfdaep+0f, - 0x1.4f9b28p+0f, - 0x1.5342b6p+0f, - 0x1.56f474p+0f, - 0x1.5ab07ep+0f, - 0x1.5e76f2p+0f, - 0x1.6247ecp+0f, - 0x1.662388p+0f, - 0x1.6a09e6p+0f, - 0x1.6dfb24p+0f, - 0x1.71f75ep+0f, - 0x1.75feb6p+0f, - 0x1.7a1148p+0f, - 0x1.7e2f34p+0f, - 0x1.82589ap+0f, - 0x1.868d9ap+0f, - 0x1.8ace54p+0f, - 0x1.8f1aeap+0f, - 0x1.93737cp+0f, - 0x1.97d82ap+0f, - 0x1.9c4918p+0f, - 0x1.a0c668p+0f, - 0x1.a5503cp+0f, - 0x1.a9e6b6p+0f, - 0x1.ae89fap+0f, - 0x1.b33a2cp+0f, - 0x1.b7f770p+0f, - 0x1.bcc1eap+0f, - 0x1.c199bep+0f, - 0x1.c67f12p+0f, - 0x1.cb720ep+0f, - 0x1.d072d4p+0f, - 
0x1.d5818ep+0f,
-    0x1.da9e60p+0f,
-    0x1.dfc974p+0f,
-    0x1.e502eep+0f,
-    0x1.ea4afap+0f,
-    0x1.efa1bep+0f,
-    0x1.f50766p+0f,
-    0x1.fa7c18p+0f,
-    0x1.000000p+1f,
-};
-
 TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl);
 TABLE_FUNCTION(float2, LOG10_TBL, log10_tbl);
@@ -539,7 +471,6 @@ uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) {
 TABLE_FUNCTION(float2, SINHCOSH_TBL, sinhcosh_tbl);
 TABLE_FUNCTION(float2, CBRT_TBL, cbrt_tbl);
-TABLE_FUNCTION(float, EXP_TBL, exp_tbl);
 
 #ifdef cl_khr_fp64
diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES
index 5358577ea1805..5c6051398c58f 100644
--- a/libclc/spirv/lib/SOURCES
+++ b/libclc/spirv/lib/SOURCES
@@ -38,7 +38,6 @@ subnormal_config.cl
 ../../generic/lib/math/exp_helper.cl
 ../../generic/lib/math/expm1.cl
 ../../generic/lib/math/exp2.cl
-../../generic/lib/math/clc_exp10.cl
 ../../generic/lib/math/exp10.cl
 math/fma.cl
 ../../generic/lib/math/fmod.cl

From 513a91a5f155746c7323a555a6e5ad0505627ca7 Mon Sep 17 00:00:00 2001
From: Jean-Didier PAILLEUX
Date: Tue, 1 Apr 2025 15:47:54 +0200
Subject: [PATCH 0240/1029] [flang/flang-rt] Implement PERROR intrinsic from
 GNU Extension (#132406)

Add the implementation of the `PERROR(STRING)` intrinsic from the GNU
Extension, which prints on stderr a newline-terminated error message
corresponding to the last system error, prefixed by `STRING`.
(https://gcc.gnu.org/onlinedocs/gfortran/PERROR.html)
---
 flang-rt/lib/runtime/extensions.cpp           |  4 ++
 flang/docs/Intrinsics.md                      | 12 +++++
 .../flang/Optimizer/Builder/IntrinsicCall.h   |  1 +
 .../flang/Optimizer/Builder/Runtime/Command.h |  5 ++
 flang/include/flang/Runtime/extensions.h      |  3 ++
 flang/lib/Evaluate/intrinsics.cpp             |  2 +
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 15 ++++++
 .../lib/Optimizer/Builder/Runtime/Command.cpp | 11 ++++
 flang/test/Lower/Intrinsics/perror.f90        | 52 +++++++++++++++++++
 9 files changed, 105 insertions(+)
 create mode 100644 flang/test/Lower/Intrinsics/perror.f90

diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp
index 7e9e512778a75..618e184e28519 100644
--- a/flang-rt/lib/runtime/extensions.cpp
+++ b/flang-rt/lib/runtime/extensions.cpp
@@ -17,6 +17,7 @@
 #include "flang/Runtime/entry-names.h"
 #include "flang/Runtime/io-api.h"
 #include
+#include <cstdio>
 #include
 #include
 #include
@@ -268,5 +269,8 @@ void FORTRAN_PROCEDURE_NAME(qsort)(int *array, int *len, int *isize,
   qsort(array, *len, *isize, compar);
 }
 
+// PERROR(STRING)
+void RTNAME(Perror)(const char *str) { perror(str); }
+
 } // namespace Fortran::runtime
 } // extern "C"
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index b09de8ee77645..ddb053d7a3d0b 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -1175,3 +1175,15 @@ by `ISIZE`.
 - **Standard:** lib3f (section 3f of old man pages).
 - **Class:** subroutine
 - **Syntax:** `CALL QSORT(ARRAY, LEN, ISIZE, COMPAR)`
+
+### Non-Standard Intrinsics: PERROR
+
+#### Description
+`PERROR(STRING)` prints (on the C stderr stream) a newline-terminated error message corresponding to the last system error.
+This is prefixed by `STRING`, a colon and a space.
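+
+A minimal usage sketch (a hypothetical example, not part of the GNU
+documentation; it assumes the failed `OPEN` leaves `errno` set by the
+underlying system call):
+```fortran
+program perror_demo
+  implicit none
+  integer :: ios
+  ! Provoke a system error (e.g. ENOENT) for PERROR to report.
+  open (unit=10, file="/no/such/file", status="old", iostat=ios)
+  ! Prints something like "perror_demo: No such file or directory" to stderr.
+  if (ios /= 0) call perror("perror_demo")
+end program perror_demo
+```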
+ +#### Usage and Info + +- **Standard:** GNU extension +- **Class:** subroutine +- **Syntax:** `CALL PERROR(STRING)` diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index cdbb78224e3b4..83f08bb88f7f3 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -373,6 +373,7 @@ struct IntrinsicLibrary { fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef); + void genPerror(llvm::ArrayRef); mlir::Value genPopcnt(mlir::Type, llvm::ArrayRef); mlir::Value genPoppar(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genPresent(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Command.h b/flang/include/flang/Optimizer/Builder/Runtime/Command.h index d896393ce02f7..ba0d3b094f40c 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Command.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Command.h @@ -63,5 +63,10 @@ mlir::Value genGetCwd(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value genHostnm(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value res); +/// Generate a call to the Perror runtime function which implements +/// the PERROR GNU intrinsic. +void genPerror(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value string); + } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COMMAND_H diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index 4e96f253a6c2c..57de3f8f05948 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -78,5 +78,8 @@ int RTNAME(Chdir)(const char *name); // GNU extension function IERRNO() int FORTRAN_PROCEDURE_NAME(ierrno)(); +// GNU extension subroutine PERROR(STRING) +void RTNAME(Perror)(const char *str); + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 2f34b12ca80bf..0c15ec5473965 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -1573,6 +1573,8 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"errmsg", DefaultChar, Rank::scalar, Optionality::optional, common::Intent::InOut}}, {}, Rank::elemental, IntrinsicClass::pureSubroutine}, + {"perror", {{"string", DefaultChar, Rank::scalar}}, {}, Rank::elemental, + IntrinsicClass::impureSubroutine}, {"mvbits", {{"from", SameIntOrUnsigned}, {"frompos", AnyInt}, {"len", AnyInt}, {"to", SameIntOrUnsigned, Rank::elemental, Optionality::required, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index e1e2fa875bff3..0948396ac3fb8 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -753,6 +753,10 @@ static constexpr IntrinsicHandler handlers[]{ &I::genParity, {{{"mask", asBox}, {"dim", asValue}}}, /*isElemental=*/false}, + {"perror", + &I::genPerror, + {{{"string", asBox}}}, + /*isElemental*/ false}, {"popcnt", &I::genPopcnt}, {"poppar", &I::genPoppar}, {"present", @@ -7158,6 +7162,17 @@ IntrinsicLibrary::genParity(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "PARITY"); } +// PERROR +void IntrinsicLibrary::genPerror(llvm::ArrayRef args) { + assert(args.size() == 1); + + fir::ExtendedValue str = 
args[0]; + const auto *box = str.getBoxOf(); + mlir::Value addr = + builder.create(loc, box->getMemTy(), fir::getBase(*box)); + fir::runtime::genPerror(builder, loc, addr); +} + // POPCNT mlir::Value IntrinsicLibrary::genPopcnt(mlir::Type resultType, llvm::ArrayRef args) { diff --git a/flang/lib/Optimizer/Builder/Runtime/Command.cpp b/flang/lib/Optimizer/Builder/Runtime/Command.cpp index 612599551528f..9b814c3395aa1 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Command.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Command.cpp @@ -10,6 +10,7 @@ #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Runtime/command.h" +#include "flang/Runtime/extensions.h" using namespace Fortran::runtime; @@ -114,3 +115,13 @@ mlir::Value fir::runtime::genHostnm(fir::FirOpBuilder &builder, builder, loc, runtimeFuncTy, res, sourceFile, sourceLine); return builder.create(loc, func, args).getResult(0); } + +void fir::runtime::genPerror(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value string) { + auto runtimeFunc = + fir::runtime::getRuntimeFunc(loc, builder); + mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType(); + llvm::SmallVector args = + fir::runtime::createArguments(builder, loc, runtimeFuncTy, string); + builder.create(loc, runtimeFunc, args); +} diff --git a/flang/test/Lower/Intrinsics/perror.f90 b/flang/test/Lower/Intrinsics/perror.f90 new file mode 100644 index 0000000000000..acecf0b996949 --- /dev/null +++ b/flang/test/Lower/Intrinsics/perror.f90 @@ -0,0 +1,52 @@ +! RUN: bbc -emit-hlfir %s -o - | FileCheck --check-prefixes=CHECK %s +! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck --check-prefixes=CHECK %s + +! CHECK-LABEL: func @_QPtest_perror( +subroutine test_perror() + character(len=10) :: string + character(len=1) :: one + ! CHECK: %[[C1:.*]] = arith.constant 1 : index + ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.char<1> {bindc_name = "one", uniq_name = "_QFtest_perrorEone"} + ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[C1]] {uniq_name = "_QFtest_perrorEone"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + ! CHECK: %[[C10:.*]] = arith.constant 10 : index + ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.char<1,10> {bindc_name = "string", uniq_name = "_QFtest_perrorEstring"} + ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] typeparams %[[C10]] {uniq_name = "_QFtest_perrorEstring"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + + call perror(string) + ! CHECK: %[[VAL_4:.*]] = fir.embox %[[VAL_3]]#0 : (!fir.ref>) -> !fir.box> + ! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box>) -> !fir.ref> + ! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>) -> !fir.ref + ! CHECK: fir.call @_FortranAPerror(%[[VAL_6]]) fastmath : (!fir.ref) -> () + + call perror("prefix") + ! CHECK: %[[VAL_7:.*]] = fir.address_of(@{{.*}}) : !fir.ref> + ! CHECK: %[[C6:.*]] = arith.constant 6 : index + ! CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[C6]] {fortran_attrs = #fir.var_attrs, uniq_name = {{.*}}} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + ! CHECK: %[[VAL_9:.*]] = fir.embox %[[VAL_8]]#0 : (!fir.ref>) -> !fir.box> + ! CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_9]] : (!fir.box>) -> !fir.ref> + ! CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref + ! CHECK: fir.call @_FortranAPerror(%[[VAL_11]]) fastmath : (!fir.ref) -> () + + call perror(one) + ! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_1]]#0 : (!fir.ref>) -> !fir.box> + ! 
CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_12]] : (!fir.box>) -> !fir.ref>
+ ! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (!fir.ref>) -> !fir.ref
+ ! CHECK: fir.call @_FortranAPerror(%[[VAL_14]]) fastmath : (!fir.ref) -> ()
+end subroutine test_perror
+
+! CHECK-LABEL: func @_QPtest_perror_unknown_length(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "str"}
+subroutine test_perror_unknown_length(str)
+  implicit none
+  character(len=*), intent(in) :: str
+
+  call perror(str)
+  ! CHECK: %[[VAL_0:.*]] = fir.dummy_scope : !fir.dscope
+  ! CHECK: %[[VAL_1:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref>, index)
+  ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]]#0 typeparams %[[VAL_1]]#1 dummy_scope %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest_perror_unknown_lengthEstr"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>)
+  ! CHECK: %[[VAL_3:.*]] = fir.embox %[[VAL_2]]#1 typeparams %[[VAL_1]]#1 : (!fir.ref>, index) -> !fir.box>
+  ! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box>) -> !fir.ref>
+  ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> !fir.ref
+  ! CHECK: fir.call @_FortranAPerror(%[[VAL_5]]) fastmath : (!fir.ref) -> ()
+  ! CHECK: return
+end subroutine test_perror_unknown_length

From 15cfe4a77495a6053cc9818247fe520da2bb3e5f Mon Sep 17 00:00:00 2001
From: Jean-Didier PAILLEUX
Date: Tue, 1 Apr 2025 15:48:25 +0200
Subject: [PATCH 0241/1029] [MLIR] Adding 'no_inline' and 'always_inline'
 attributes on LLVM::CallOp (#133726)

Add `no_inline` and `always_inline` attributes to `CallOp` in MLIR, so
that a call can be inlined or kept out-of-line on a per-call-site
basis, without requiring the attribute on the callee's `FuncOp`. These
attributes will be used in a future Flang PR to implement the
`[NO]INLINE` directive.
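
For illustration, a minimal sketch of the new per-call-site attributes
in LLVM-dialect assembly (adapted from the tests added below; `@callee`
and `@caller` are placeholder names, not part of the patch):

```mlir
llvm.func @callee()

llvm.func @caller() {
  // Request that this call site never be inlined.
  llvm.call @callee() {no_inline} : () -> ()
  // Request that this call site always be inlined.
  llvm.call @callee() {always_inline} : () -> ()
  llvm.return
}
```

During translation to LLVM IR, these become the `noinline` and
`alwaysinline` attributes on the corresponding call instructions.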
--- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 34 +++++++++---------- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 12 ++++--- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 4 +++ mlir/lib/Target/LLVMIR/ModuleImport.cpp | 3 ++ .../Target/LLVMIR/Import/call-attributes.ll | 25 ++++++++++++++ mlir/test/Target/LLVMIR/llvmir.mlir | 29 ++++++++++++++++ 6 files changed, 86 insertions(+), 21 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/Import/call-attributes.ll diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 707e23194061b..423cf948b03e1 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -771,23 +771,23 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call", }]; dag args = (ins OptionalAttr>:$var_callee_type, - OptionalAttr:$callee, - Variadic:$callee_operands, - DefaultValuedAttr:$fastmathFlags, - OptionalAttr:$branch_weights, - DefaultValuedAttr:$CConv, - DefaultValuedAttr:$TailCallKind, - OptionalAttr:$memory_effects, - OptionalAttr:$convergent, - OptionalAttr:$no_unwind, - OptionalAttr:$will_return, - VariadicOfVariadic:$op_bundle_operands, - DenseI32ArrayAttr:$op_bundle_sizes, - OptionalAttr:$op_bundle_tags, - OptionalAttr:$arg_attrs, - OptionalAttr:$res_attrs); + OptionalAttr:$callee, + Variadic:$callee_operands, + DefaultValuedAttr:$fastmathFlags, + OptionalAttr:$branch_weights, + DefaultValuedAttr:$CConv, + DefaultValuedAttr:$TailCallKind, + OptionalAttr:$memory_effects, + OptionalAttr:$convergent, + OptionalAttr:$no_unwind, + OptionalAttr:$will_return, + VariadicOfVariadic:$op_bundle_operands, + DenseI32ArrayAttr:$op_bundle_sizes, + OptionalAttr:$op_bundle_tags, + OptionalAttr:$arg_attrs, + OptionalAttr:$res_attrs, + OptionalAttr:$no_inline, + OptionalAttr:$always_inline); // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. 
let arguments = !con(args, aliasAttrs); let results = (outs Optional:$result); diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 1c43173f31345..252bdd1425d5e 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -1037,7 +1037,8 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, - /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); + /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr, + /*no_inline=*/nullptr, /*always_inline=*/nullptr); } void CallOp::build(OpBuilder &builder, OperationState &state, @@ -1065,7 +1066,8 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, /*access_groups=*/nullptr, - /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); + /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr, + /*no_inline=*/nullptr, /*always_inline=*/nullptr); } void CallOp::build(OpBuilder &builder, OperationState &state, @@ -1079,7 +1081,8 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, - /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); + /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr, + /*no_inline=*/nullptr, /*always_inline=*/nullptr); } void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, @@ -1093,7 +1096,8 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*arg_attrs=*/nullptr, /*res_attrs=*/nullptr, - /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); + /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr, + /*no_inline=*/nullptr, /*always_inline=*/nullptr); } CallInterfaceCallable CallOp::getCallableForCallee() { diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index f8fa22253bea8..10b68a333bcbd 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -319,6 +319,10 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, call->addFnAttr(llvm::Attribute::NoUnwind); if (callOp.getWillReturnAttr()) call->addFnAttr(llvm::Attribute::WillReturn); + if (callOp.getNoInlineAttr()) + call->addFnAttr(llvm::Attribute::NoInline); + if (callOp.getAlwaysInlineAttr()) + call->addFnAttr(llvm::Attribute::AlwaysInline); if (failed(convertParameterAndResultAttrs(callOp, call, moduleTranslation))) return failure(); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index c0711f7dded71..ea141d8b07284 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -2331,6 +2331,9 @@ LogicalResult ModuleImport::convertCallAttributes(llvm::CallInst *inst, op.setConvergent(callAttrs.getFnAttr(llvm::Attribute::Convergent).isValid()); op.setNoUnwind(callAttrs.getFnAttr(llvm::Attribute::NoUnwind).isValid()); op.setWillReturn(callAttrs.getFnAttr(llvm::Attribute::WillReturn).isValid()); + 
op.setNoInline(callAttrs.getFnAttr(llvm::Attribute::NoInline).isValid()); + op.setAlwaysInline( + callAttrs.getFnAttr(llvm::Attribute::AlwaysInline).isValid()); llvm::MemoryEffects memEffects = inst->getMemoryEffects(); ModRefInfo othermem = convertModRefInfoFromLLVM( diff --git a/mlir/test/Target/LLVMIR/Import/call-attributes.ll b/mlir/test/Target/LLVMIR/Import/call-attributes.ll new file mode 100644 index 0000000000000..96c61e6e31da8 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/call-attributes.ll @@ -0,0 +1,25 @@ +; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s + +declare void @f() + +; CHECK-LABEL: @test_call_noinline +; CHECK: llvm.call @f() {no_inline} : () -> () +define void @test_call_noinline() { + call void @f() #0 + ret void +} + +attributes #0 = { noinline } + +// ----- + +declare void @f() + +; CHECK-LABEL: @test_call_alwaysinline +; CHECK: llvm.call @f() {always_inline} : () -> () +define void @test_call_alwaysinline() { + call void @f() #0 + ret void +} + +attributes #0 = { alwaysinline } diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 0238c95835a0f..cbd41efdc3015 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2621,6 +2621,35 @@ llvm.func @willreturn_call() { // CHECK: #[[ATTRS]] // CHECK-SAME: willreturn +// ----- + +llvm.func @f() + +// CHECK-LABEL: @no_inline_call +// CHECK: call void @f() #[[ATTRS:[0-9]+]] +llvm.func @no_inline_call() { + llvm.call @f() {no_inline} : () -> () + llvm.return +} + +// CHECK: #[[ATTRS]] +// CHECK-SAME: noinline + +// ----- + +llvm.func @f() + +// CHECK-LABEL: @always_inline_call +// CHECK: call void @f() #[[ATTRS:[0-9]+]] +llvm.func @always_inline_call() { + llvm.call @f() {always_inline} : () -> () + llvm.return +} + +// CHECK: #[[ATTRS]] +// CHECK-SAME: alwaysinline + + // ----- llvm.func @fa() From c1efd8b663b7db3c717fae8a1991dcc4b8304c8f Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 1 Apr 2025 14:54:40 +0100 Subject: [PATCH 0242/1029] [libclc][NFC] Delete two unused headers These should have been deleted when the respective builtins were moved to the CLC library. --- libclc/generic/include/math/clc_exp10.h | 12 ------------ libclc/generic/include/math/clc_tanpi.h | 13 ------------- 2 files changed, 25 deletions(-) delete mode 100644 libclc/generic/include/math/clc_exp10.h delete mode 100644 libclc/generic/include/math/clc_tanpi.h diff --git a/libclc/generic/include/math/clc_exp10.h b/libclc/generic/include/math/clc_exp10.h deleted file mode 100644 index b789410d6f2ac..0000000000000 --- a/libclc/generic/include/math/clc_exp10.h +++ /dev/null @@ -1,12 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define __CLC_FUNCTION __clc_exp10 -#define __CLC_BODY -#include -#undef __CLC_FUNCTION diff --git a/libclc/generic/include/math/clc_tanpi.h b/libclc/generic/include/math/clc_tanpi.h deleted file mode 100644 index 0f7d825dd1483..0000000000000 --- a/libclc/generic/include/math/clc_tanpi.h +++ /dev/null @@ -1,13 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define __CLC_FUNCTION __clc_tanpi -#define __CLC_BODY -#include -#undef __CLC_BODY -#undef __CLC_FUNCTION From e92ff64bad38835a497f9eb928967bca6a295b28 Mon Sep 17 00:00:00 2001 From: Virginia Cangelosi Date: Tue, 1 Apr 2025 15:05:30 +0100 Subject: [PATCH 0243/1029] [Clang][LLVM] Implement single-multi vectors MOP4{A/S} (#128854) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement all single-multi {BF/F/S/U/SU/US}MOP4{A/S} instructions in clang and llvm following the acle in https://github.com/ARM-software/acle/pull/381/files. This PR depends on https://github.com/llvm/llvm-project/pull/127797 This patch updates the semantics of template arguments in intrinsic names for clarity and ease of use. Previously, template argument numbers indicated which character in the prototype string determined the final type suffix, which was confusing—especially for intrinsics using multiple prototype modifiers per operand (e.g., intrinsics operating on arrays of vectors). The number had to reference the correct character in the prototype (e.g., the ‘u’ in “2.u”), making the system cumbersome and error-prone. With this patch, template argument numbers now refer to the operand number that determines the final type suffix, providing a more intuitive and consistent approach. --- clang/include/clang/Basic/arm_sme.td | 9 + clang/include/clang/Basic/arm_sve.td | 4 +- .../sme2-intrinsics/acle_sme2_mop4_1x2.c | 466 ++++++++++++++++++ .../acle_sme2p2_imm.cpp | 104 +++- clang/utils/TableGen/SveEmitter.cpp | 5 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 11 +- llvm/lib/Target/AArch64/SMEInstrFormats.td | 61 ++- .../AArch64/sme2-intrinsics-mop4a_1x2.ll | 462 +++++++++++++++++ 8 files changed, 1099 insertions(+), 23 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 5012874a08790..6312223f5d112 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -294,6 +294,7 @@ defm SVMOPS : ZAFPOuterProd<"mops">; multiclass MOP4 checks> { def _1x1 : Inst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{d}]", "vidd", t, MergeNone, i # "_1x1", [IsInOutZA, IsStreaming], checks>; + def _1x2 : Inst<"svmop4" # mode # "[_1x2]" # za # "[_{d}_{d}]", "vid2", t, MergeNone, i # "_1x2", [IsInOutZA, IsStreaming], checks>; } let SMETargetGuard = "sme2,sme-mop4" in { @@ -345,6 +346,10 @@ multiclass SUMOP4 che "vidu", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_1x1", [IsStreaming, IsInOutZA], checks>; + def _1x2 : SInst<"svmop4" # mode # "[_1x2]" # za # "[_{d}_{3}]", + "vid2.u", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_1x2", + [IsStreaming, IsInOutZA], + checks>; } multiclass USMOP4 checks> { @@ -352,6 +357,10 @@ multiclass USMOP4 che "vidx", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_1x1", [IsStreaming, IsInOutZA], checks>; + def _1x2 : SInst<"svmop4" # mode # "[_1x2]" # za # "[_{d}_{3}]", + "vid2.x", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_1x2", + [IsStreaming, IsInOutZA], + checks>; } let SMETargetGuard = "sme2,sme-mop4" in { diff --git a/clang/include/clang/Basic/arm_sve.td 
b/clang/include/clang/Basic/arm_sve.td index b51106fa56759..35263541b67ae 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2420,8 +2420,8 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in { def SVSUNPK_X2 : SInst<"svunpk_{d}[_{1}_x2]", "2h", "sil", MergeNone, "aarch64_sve_sunpk_x2", [IsStreaming], []>; def SVUUNPK_X2 : SInst<"svunpk_{d}[_{1}_x2]", "2h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x2", [IsStreaming], []>; - def SVSUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "sil", MergeNone, "aarch64_sve_sunpk_x4", [IsStreaming], []>; - def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; + def SVSUNPK_X4 : SInst<"svunpk_{d}[_{1}_x4]", "42.h", "sil", MergeNone, "aarch64_sve_sunpk_x4", [IsStreaming], []>; + def SVUUNPK_X4 : SInst<"svunpk_{d}[_{1}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>; } let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in { diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c new file mode 100644 index 0000000000000..3c8bd372aa547 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c @@ -0,0 +1,466 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SME_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmop4a_1x2_za32_s8_s8( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_s8_s8u10__SVInt8_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_s8_s8u10__SVInt8_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_u8_u8u11__SVUint8_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_u8_u8u11__SVUint8_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_s8_u8u10__SVInt8_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z27test_svmop4s_1x2_za32_s8_u8u10__SVInt8_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_1x2_za32_u8_s8u11__SVUint8_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_1x2_za32_u8_s8u11__SVUint8_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_s16_s16u11__SVInt16_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_s16_s16u11__SVInt16_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_u16_u16u12__SVUint16_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], 
[[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_u16_u16u12__SVUint16_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_f16_f16u13__SVFloat16_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_f16_f16u13__SVFloat16_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x2_za32_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4s_1x2_za32_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svmop4s_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_s16_s16u11__SVInt16_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_s16_s16u11__SVInt16_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_u16_u16u12__SVUint16_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_u16_u16u12__SVUint16_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_s16_u16u11__SVInt16_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + 
SME_ACLE_FUNC(svmop4a,_1x2_,za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_s16_u16u11__SVInt16_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_u16_s16u12__SVUint16_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za64,_u16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_u16_s16u12__SVUint16_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za64,_u16_s16)(1, zn, zm); +} + + +// CHECK-LABEL: @test_svmop4a_1x2_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za16_f16_f16u13__SVFloat16_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za16_f16_f16u13__SVFloat16_t13svfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za32_f32_f32( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za32_f32_f32u13__SVFloat32_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za32_f32_f32u13__SVFloat32_t13svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_1x2_za64_f64_f64u13__SVFloat64_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_1x2_za64_f64_f64u13__SVFloat64_t13svfloat64x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_1x2_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4a_1x2_za16_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_1x2_,za16,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_1x2_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// 
CPP-CHECK-LABEL: @_Z31test_svmop4s_1x2_za16_bf16_bf16u14__SVBfloat16_t14svbfloat16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 1, [[ZN:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_1x2_,za16,_bf16_bf16)(1, zn, zm); +} diff --git a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp index 556cb1742dbbd..47ce2a0f5f80f 100644 --- a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp @@ -6,19 +6,19 @@ #include -void tests_mop4_imm_s8_s8(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_s8_s8_1x1(svint8_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} return; } -void tests_mop4_imm_u8_u8(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_u8_u8_1x1(svuint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} return; } -void tests_mop4_imm_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_s8_u8_1x1(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4a_1x1_za32_u8_s8(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} @@ -26,7 +26,7 @@ void tests_mop4_imm_s8_u8(svint8_t zn, svuint8_t zm) __arm_streaming __arm_inout return; } -void tests_mop4_imm_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_s16_s16_1x1(svint16_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} @@ -35,7 +35,7 @@ void tests_mop4_imm_s16_s16(svint16_t zn, svint16_t zm) __arm_streaming __arm_in return; } -void tests_mop4_imm_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_u16_u16_1x1(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} @@ -44,7 +44,7 @@ void tests_mop4_imm_u16_u16(svuint16_t zn, svuint16_t zm) __arm_streaming __arm_ return; } -void tests_mop4_imm_s16_u16(svint16_t zn, svuint16_t 
zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_s16_u16_1x1(svint16_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} svmop4s_1x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} svmop4a_1x1_za64_u16_s16(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} @@ -52,7 +52,7 @@ void tests_mop4_imm_s16_u16(svint16_t zn, svuint16_t zm) __arm_streaming __arm_i return; } -void tests_mop4_imm_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_f16_f16_1x1(svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} @@ -61,7 +61,7 @@ void tests_mop4_imm_f16_f16(svfloat16_t zn, svfloat16_t zm) __arm_streaming __ar return; } -void tests_mop4_imm_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_bf16_bf16_1x1(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} @@ -71,14 +71,98 @@ void tests_mop4_imm_bf16_bf16(svbfloat16_t zn, svbfloat16_t zm) __arm_streaming } -void tests_mop4_imm_f32_f32(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_f32_f32_1x1(svfloat32_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} svmop4s_1x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} return; } -void tests_mop4_imm_f64_f64(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { +void tests_mop4_imm_f64_f64_1x1(svfloat64_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { svmop4a_1x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} svmop4s_1x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} return; } + +void tests_mop4_imm_s8_s8_1x2(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_u8_1x2(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s8_u8_1x2(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + 
svmop4a_1x2_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_s8_1x2(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_u8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_u8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s16_s16_1x2(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x2_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x2_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_u16_1x2(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x2_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x2_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_s16_u16_1x2(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x2_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_s16_1x2(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za64_u16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x2_za64_u16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_f16_f16_1x2(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x2_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_1x2_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_bf16_bf16_1x2(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is 
outside the valid range [0, 3]}} + svmop4s_1x2_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_1x2_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_1x2_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_f32_f32_1x2(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_1x2_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_f64_f64_1x2(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_1x2_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_1x2_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index e226987b4844b..200f57960fff8 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1043,7 +1043,10 @@ std::string Intrinsic::replaceTemplatedArgs(std::string Name, TypeSpec TS, case '1': case '2': case '3': - T = SVEType(TS, Proto[C - '0']); + // Extract the modifier before passing to SVEType to handle numeric + // modifiers + auto [Mod, NumVectors] = getProtoModifier(Proto, (C - '0')); + T = SVEType(TS, Mod); break; } diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index fe8769154b1da..f08bdf78b5f96 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3070,11 +3070,19 @@ let TargetPrefix = "aarch64" in { llvm_anyvector_ty, LLVMMatchType<0>], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + class SME_OuterProduct_QuarterTile_Single_Multi + : DefaultAttrsIntrinsic<[], + [llvm_i32_ty, + llvm_anyvector_ty, + LLVMMatchType<0>, + LLVMMatchType<0>], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + // 2-way and 4-way multi-vector signed/unsigned Quarter Tile Quarter Product A/S foreach mode = ["s", "a"] in { foreach za = ["", "_za64"] in { foreach ty = ["s", "u", "su", "us"] in { def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x1" : SME_OuterProduct_QuarterTile_Single_Single; + def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x2" : SME_OuterProduct_QuarterTile_Single_Multi; } } } @@ -3083,9 +3091,10 @@ let TargetPrefix = "aarch64" in { foreach mode = ["s", "a"] in { foreach wide = ["", "_wide"] in { def int_aarch64_sme_mop4 # mode # wide # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single; + def int_aarch64_sme_mop4 # mode # wide # "_1x2" : SME_OuterProduct_QuarterTile_Single_Multi; } } - + class SME_AddVectorToTile_Intrinsic : DefaultAttrsIntrinsic<[], [llvm_i32_ty, diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 54c63ead059ae..87a8f068083d5 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -270,6 +270,9 @@ class SME2_ZA_Tile_Vec_Single_Single_Pat(name # _PSEUDO) $tile, $Zn, $Zm)>; +class SME2_ZA_Tile_Vec_Multi_Pat + : Pat<(intrinsic 
imm_ty:$tile, vt:$Zn, vt:$Zm1, vt:$Zm2), + (!cast(name # _PSEUDO) $tile, $Zn, (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>; //===----------------------------------------------------------------------===// // SME pattern match helpers. //===----------------------------------------------------------------------===// @@ -623,7 +626,12 @@ multiclass sme_quarter_outer_product_i8_i32; def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr, - ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>; + ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _MZ2Z_BToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv16i8>; + def _M2Z2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 1}, subtr, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>; } @@ -639,7 +647,12 @@ multiclass sme_quarter_outer_product_i16_i32; def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32; + ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _MZ2Z_HToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv8i16>; + def _M2Z2Z_HToS : sme_quarter_outer_product_i16_i32; } @@ -655,7 +668,12 @@ multiclass sme_quarter_outer_product_i64; def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr, - ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>; + ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _MZ2Z_HtoD_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_7, nxv8i16>; + def _M2Z2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 1}, subtr, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>; } @@ -5509,7 +5527,12 @@ multiclass sme2_bfmop4as_widening { def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; // Single and multiple vectors - def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>; + def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _MZ2Z_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv8bf16>; + // Multiple vectors def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; @@ -5660,7 +5683,11 @@ multiclass sme2_fmop4as_fp16_non_widening { def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; // Single and multiple vectors - def _MZ2Z_H : sme2_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>; + def _MZ2Z_H : sme2_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _MZ2Z_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_1, nxv8f16>; // Multiple vectors def _M2Z2Z_H : sme2_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; @@ -5736,7 +5763,11 @@ multiclass sme2_bfmop4as_non_widening { def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; // Single and multiple vectors - def _MZ2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>; + def _MZ2Z_H : 
sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _MZ2Z_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_1, nxv8bf16>; // Multiple vectors def _M2Z2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; @@ -5777,7 +5808,11 @@ multiclass sme2_fmop4as_fp32_non_widening { def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>; // Single and multiple vectors - def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>; + def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _MZ2Z_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv4f32>; // Multiple vectors def _M2Z2Z_S : sme2_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZZ_s_mul_r_Hi>; @@ -5818,7 +5853,11 @@ multiclass sme2_fmop4as_fp64_non_widening { def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>; // Single and multiple vectors - def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>; + def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _MZ2Z_D_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_7, nxv2f64>; // Multiple vectors def _M2Z2Z_D : sme2_fp64_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZZ_d_mul_r_Hi>; @@ -5859,7 +5898,11 @@ multiclass sme2_fmop4as_fp16_fp32_widening { def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; // Single and multiple vectors - def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>; + def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _MZ2Z_HtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv8f16>; // Multiple vectors def _M2Z2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll new file mode 100644 index 0000000000000..f3540458dcaa6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_1x2.ll @@ -0,0 +1,462 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +; Widening +define void @mop4a_za32_s8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4a za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s 
za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_u8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4a za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_u8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4s za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_s8_u8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4a za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s8_u8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4s za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_u8_s8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4a za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_u8_s8( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4s za0.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.wide.1x2.nxv16i8(i32 0, %zn, %zm1, %zm2) + ret void +} + + +define void @mop4a_za32_s16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4a za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_u16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4a za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_u16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4s za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_f16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: 
mop4a_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8f16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_bf16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4a za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.1x2.nxv8bf16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_bf16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4s za0.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_s16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4a za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_s16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_u16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4a za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_u16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: umop4s za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_s16_u16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4a za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_s16_u16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: sumop4s za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_u16_s16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4a za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void 
@llvm.aarch64.sme.usmop4a.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_u16_s16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: usmop4s za0.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.za64.wide.1x2.nxv8i16(i32 0, %zn, %zm1, %zm2) + ret void +} + +; Non-widening +define void @mop4a_za16_f16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.h, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x2.nxv8f16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za16_f16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.h, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_f32( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.s, z0.s, { z24.s, z25.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x2.nxv4f32(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f32( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.s, z0.s, { z24.s, z25.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_f64( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4a za0.d, z0.d, { z24.d, z25.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x2.nxv2f64(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_f64( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za0.d, z0.d, { z24.d, z25.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4a_za16_bf16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4a za0.h, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.1x2.nxv8bf16(i32 0, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za16_bf16( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4s za0.h, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 0, %zn, %zm1, %zm2) + ret void +} + +; Tile limits + +define void @mop4s_za32_s8_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s8_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za3.s, z0.b, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv16i8(i32 3, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s16_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov 
z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za3.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.1x2.nxv8i16(i32 3, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f16_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za3.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8f16(i32 3, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_bf16_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_bf16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4s za3.s, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.1x2.nxv8bf16(i32 3, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_s16_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_s16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: smop4s za7.d, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.1x2.nxv8i16(i32 7, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_f64_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_f64_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za7.d, z0.d, { z24.d, z25.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv2f64(i32 7, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f32_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f32_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za3.s, z0.s, { z24.s, z25.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv4f32(i32 3, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za16_f16_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za16_f16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: fmop4s za1.h, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv8f16(i32 1, %zn, %zm1, %zm2) + ret void +} + +define void @mop4s_za16_bf16_limit( %zn, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za16_bf16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z2.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: bfmop4s za1.h, z0.h, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.1x2.nxv8bf16(i32 1, %zn, %zm1, %zm2) + ret void +} + +attributes #0 = {nounwind "target-features" = "+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" } From 173eb32b756ad62c4243e6631198cd22fc079cef Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 07:27:23 -0700 Subject: [PATCH 0244/1029] [X86] Construct SmallVector with ArrayRef (NFC) (#133860) --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 78762774f65e8..8e6a891444bf1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9342,7 +9342,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // See if we can use a vector load to get all of the elements. 
{ - SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElems); + SmallVector Ops(Op->ops().take_front(NumElems)); if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) return LD; @@ -23219,7 +23219,7 @@ static SDValue EmitTest(SDValue Op, X86::CondCode X86CC, const SDLoc &dl, DAG.getConstant(0, dl, Op.getValueType())); } SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - SmallVector Ops(Op->op_begin(), Op->op_begin() + NumOperands); + SmallVector Ops(Op->ops().take_front(NumOperands)); SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New); @@ -30377,7 +30377,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // element to the other. // This optimized lowering is only valid if the elements in a pair can // be treated identically. - SmallVector AmtWideElts(Amt->op_begin(), Amt->op_end()); + SmallVector AmtWideElts(Amt->ops()); SmallVector TmpAmtWideElts; int WideEltSizeInBits = EltSizeInBits; while (WideEltSizeInBits < 32) { @@ -32908,7 +32908,7 @@ static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, EVT EltVT = InOp.getOperand(0).getValueType(); SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : DAG.getUNDEF(EltVT); - SmallVector Ops(InOp->op_begin(), InOp->op_end()); + SmallVector Ops(InOp->ops()); Ops.append(WidenNumElts - InNumElts, FillVal); return DAG.getBuildVector(NVT, dl, Ops); } From c30776ab9a1404adff6022c65b92fb8cd3cfc097 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 07:28:18 -0700 Subject: [PATCH 0245/1029] [AArch64] Use ArrayRef::slice (NFC) (#133862) --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 22083460b400a..40944e3d43d6b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -1476,8 +1476,7 @@ void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, // Form a REG_SEQUENCE to force register allocation. unsigned Vec0Off = ExtOff + 1; - SmallVector Regs(N->op_begin() + Vec0Off, - N->op_begin() + Vec0Off + NumVecs); + SmallVector Regs(N->ops().slice(Vec0Off, NumVecs)); SDValue RegSeq = createQTuple(Regs); SmallVector Ops; @@ -1863,7 +1862,7 @@ void AArch64DAGToDAGISel::SelectWhilePair(SDNode *N, unsigned Opc) { void AArch64DAGToDAGISel::SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode) { EVT VT = N->getValueType(0); - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SmallVector Regs(N->ops().slice(1, NumVecs)); SDValue Ops = createZTuple(Regs); SDLoc DL(N); SDNode *Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Ops); @@ -2072,7 +2071,7 @@ void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs, SDLoc DL(N); EVT VT = N->getValueType(0); - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SmallVector Regs(N->ops().slice(1, NumVecs)); SDValue Zd = createZMulTuple(Regs); SDValue Zn = N->getOperand(1 + NumVecs); SDValue Zm = N->getOperand(2 + NumVecs); @@ -2242,7 +2241,7 @@ void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs, SDLoc dl(N); // Form a REG_SEQUENCE to force register allocation. 
- SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SmallVector Regs(N->ops().slice(2, NumVecs)); SDValue RegSeq = createZTuple(Regs); // Optimize addressing mode. @@ -2287,7 +2286,7 @@ void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, // Form a REG_SEQUENCE to force register allocation. bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SmallVector Regs(N->ops().slice(1, NumVecs)); SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); SDValue Ops[] = {RegSeq, @@ -2341,7 +2340,7 @@ void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, bool Narrow = VT.getSizeInBits() == 64; // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SmallVector Regs(N->ops().slice(2, NumVecs)); if (Narrow) transform(Regs, Regs.begin(), @@ -2379,7 +2378,7 @@ void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, bool Narrow = VT.getSizeInBits() == 64; // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SmallVector Regs(N->ops().slice(1, NumVecs)); if (Narrow) transform(Regs, Regs.begin(), @@ -2433,7 +2432,7 @@ void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, bool Narrow = VT.getSizeInBits() == 64; // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SmallVector Regs(N->ops().slice(2, NumVecs)); if (Narrow) transform(Regs, Regs.begin(), From 664745cf381b4a153b16ef9e38d3e01dbb1e7d4f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 1 Apr 2025 15:50:00 +0100 Subject: [PATCH 0246/1029] [X86] avx512-vselect.ll - regenerate VPTERNLOG comments --- llvm/test/CodeGen/X86/avx512-vselect.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-vselect.ll b/llvm/test/CodeGen/X86/avx512-vselect.ll index c402e8d7b7714..aacd5d1ab0298 100644 --- a/llvm/test/CodeGen/X86/avx512-vselect.ll +++ b/llvm/test/CodeGen/X86/avx512-vselect.ll @@ -181,12 +181,12 @@ define <64 x i16> @test8(<64 x i8> %x, <64 x i16> %a, <64 x i16> %b) { ; CHECK-KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; CHECK-KNL-NEXT: vpmovsxbw %xmm0, %ymm0 ; CHECK-KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 -; CHECK-KNL-NEXT: vpternlogq $202, %zmm3, %zmm1, %zmm0 +; CHECK-KNL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm0 & (zmm1 ^ zmm3)) ; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm1 ; CHECK-KNL-NEXT: vextracti128 $1, %ymm5, %xmm3 ; CHECK-KNL-NEXT: vpmovsxbw %xmm3, %ymm3 ; CHECK-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; CHECK-KNL-NEXT: vpternlogq $202, %zmm4, %zmm2, %zmm1 +; CHECK-KNL-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm1 & (zmm2 ^ zmm4)) ; CHECK-KNL-NEXT: retq %c = icmp eq <64 x i8> %x, zeroinitializer %ret = select <64 x i1> %c, <64 x i16> %a, <64 x i16> %b From 8741412bdfc0a60719f116add7d828694ef48c02 Mon Sep 17 00:00:00 2001 From: lntue Date: Tue, 1 Apr 2025 10:57:32 -0400 Subject: [PATCH 0247/1029] [libc][math] Implement a fast pass for atan2f128 with 1ULP error using DyadicFloat<128>. (#133150) Part of https://github.com/llvm/llvm-project/issues/131642. 
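
A quick worked example of the reduction described in the comments below
(the concrete values here are chosen for this note and are not part of
the implementation): for y = 3, x = -4 we get n = min(|x|, |y|) = 3,
d = max(|x|, |y|) = 4, and idx = round(64 * n/d) = 48. Since n/d = 48/64
exactly, the correction term atan((n - d * idx/64) / (d + n * idx/64)) =
atan(0) vanishes, and the second-quadrant adjustment gives

    atan2(3, -4) = pi - atan(48/64) = pi - atan(3/4).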
--- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/headers/math/index.rst | 2 +- libc/include/math.yaml | 8 + libc/src/__support/FPUtil/dyadic_float.h | 2 +- libc/src/math/CMakeLists.txt | 1 + libc/src/math/atan2f128.h | 21 ++ libc/src/math/generic/CMakeLists.txt | 21 ++ libc/src/math/generic/atan2f128.cpp | 203 ++++++++++++++++++++ libc/src/math/generic/atan_utils.h | 108 ++++++++++- libc/test/src/math/CMakeLists.txt | 12 ++ libc/test/src/math/atan2f128_test.cpp | 99 ++++++++++ libc/test/src/math/smoke/CMakeLists.txt | 10 + libc/test/src/math/smoke/atan2f128_test.cpp | 28 +++ 15 files changed, 514 insertions(+), 4 deletions(-) create mode 100644 libc/src/math/atan2f128.h create mode 100644 libc/src/math/generic/atan2f128.cpp create mode 100644 libc/test/src/math/atan2f128_test.cpp create mode 100644 libc/test/src/math/smoke/atan2f128_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 27e4c7035165a..5f293dc1c3c73 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -759,6 +759,7 @@ endif() if(LIBC_TYPES_HAS_FLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints + libc.src.math.atan2f128 libc.src.math.canonicalizef128 libc.src.math.ceilf128 libc.src.math.copysignf128 diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index b2329d30efb54..e3c4fe5170104 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -640,6 +640,7 @@ endif() if(LIBC_TYPES_HAS_FLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints + libc.src.math.atan2f128 libc.src.math.canonicalizef128 libc.src.math.ceilf128 libc.src.math.copysignf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 09c8c18c04a59..eccd222fa123e 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -776,6 +776,7 @@ endif() if(LIBC_TYPES_HAS_FLOAT128) list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float128 entrypoints + libc.src.math.atan2f128 libc.src.math.canonicalizef128 libc.src.math.ceilf128 libc.src.math.copysignf128 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index df2650065f882..947bd4b60b391 100644 --- a/libc/docs/headers/math/index.rst +++ b/libc/docs/headers/math/index.rst @@ -263,7 +263,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | atan | |check| | 1 ULP | | | | 7.12.4.3 | F.10.1.3 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| atan2 | |check| | 1 ULP | | | | 7.12.4.4 | F.10.1.4 | +| atan2 | |check| | 1 ULP | | | 1 ULP | 7.12.4.4 | F.10.1.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | atan2pi | | | | | | 7.12.4.11 | F.10.1.11 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git 
a/libc/include/math.yaml b/libc/include/math.yaml index 133f9a6c034ec..fef829422244d 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -93,6 +93,14 @@ functions: arguments: - type: long double - type: long double + - name: atan2f128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: float128 + guard: LIBC_TYPES_HAS_FLOAT128 - name: atanf standards: - stdc diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 2d181134bc2ae..f18ace7419940 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -104,7 +104,7 @@ template struct DyadicFloat { normalize(); } - LIBC_INLINE constexpr DyadicFloat(Sign s, int e, MantissaType m) + LIBC_INLINE constexpr DyadicFloat(Sign s, int e, const MantissaType &m) : sign(s), exponent(e), mantissa(m) { normalize(); } diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 88fb73f856e82..d177ff79141c0 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -62,6 +62,7 @@ add_math_entrypoint_object(atanf) add_math_entrypoint_object(atan2) add_math_entrypoint_object(atan2f) add_math_entrypoint_object(atan2l) +add_math_entrypoint_object(atan2f128) add_math_entrypoint_object(atanh) add_math_entrypoint_object(atanhf) diff --git a/libc/src/math/atan2f128.h b/libc/src/math/atan2f128.h new file mode 100644 index 0000000000000..26f7ec624940c --- /dev/null +++ b/libc/src/math/atan2f128.h @@ -0,0 +1,21 @@ +//===-- Implementation header for atan2f128 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_ATAN2F128_H +#define LLVM_LIBC_SRC_MATH_ATAN2F128_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float128 atan2f128(float128 x, float128 y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_ATAN2F128_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index de74729465ee7..f7c36aab77b7d 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4117,8 +4117,11 @@ add_header_library( HDRS atan_utils.h DEPENDS + libc.src.__support.integer_literals libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization ) @@ -4200,6 +4203,24 @@ add_entrypoint_object( .atan2 ) +add_entrypoint_object( + atan2f128 + SRCS + atan2f128.cpp + HDRS + ../atan2f128.h + DEPENDS + .atan_utils + libc.src.__support.integer_literals + libc.src.__support.uint128 + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types +) + add_entrypoint_object( scalbln SRCS diff --git a/libc/src/math/generic/atan2f128.cpp b/libc/src/math/generic/atan2f128.cpp new file mode 100644 index 0000000000000..a3aba0bc7fa2a --- /dev/null +++ b/libc/src/math/generic/atan2f128.cpp @@ -0,0 +1,203 @@ +//===-- Quad-precision atan2 function 
-------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2f128.h"
+#include "atan_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/integer_literals.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/types.h"
+#include "src/__support/uint128.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+using Float128 = fputil::DyadicFloat<128>;
+
+static constexpr Float128 ZERO = {Sign::POS, 0, 0_u128};
+static constexpr Float128 MZERO = {Sign::NEG, 0, 0_u128};
+static constexpr Float128 PI = {Sign::POS, -126,
+                                0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128};
+static constexpr Float128 MPI = {Sign::NEG, -126,
+                                 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128};
+static constexpr Float128 PI_OVER_2 = {
+    Sign::POS, -127, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128};
+static constexpr Float128 MPI_OVER_2 = {
+    Sign::NEG, -127, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128};
+static constexpr Float128 PI_OVER_4 = {
+    Sign::POS, -128, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128};
+static constexpr Float128 THREE_PI_OVER_4 = {
+    Sign::POS, -128, 0x96cbe3f9'990e91a7'9394c9e8'a0a5159d_u128};
+
+// Adjustment for the constant term:
+//   CONST_ADJ[x_sign][y_sign][recip]
+static constexpr Float128 CONST_ADJ[2][2][2] = {
+    {{ZERO, MPI_OVER_2}, {MZERO, MPI_OVER_2}},
+    {{MPI, PI_OVER_2}, {MPI, PI_OVER_2}}};
+
+} // anonymous namespace
+
+// There are several range reduction steps we can take for atan2(y, x) as
+// follows:
+
+// * Range reduction 1: signedness
+// atan2(y, x) will return a number between -PI and PI representing the angle
+// formed by the 0x axis and the vector (x, y) on the 0xy-plane.
+// In particular, we have that:
+//   atan2(y, x) = atan( y/x )        if x >= 0 and y >= 0 (I-quadrant)
+//               = pi + atan( y/x )   if x < 0 and y >= 0  (II-quadrant)
+//               = -pi + atan( y/x )  if x < 0 and y < 0   (III-quadrant)
+//               = atan( y/x )        if x >= 0 and y < 0  (IV-quadrant)
+// Since the atan function is odd, we can use the formula:
+//   atan(-u) = -atan(u)
+// to adjust the above conditions a bit further:
+//   atan2(y, x) = atan( |y|/|x| )        if x >= 0 and y >= 0 (I-quadrant)
+//               = pi - atan( |y|/|x| )   if x < 0 and y >= 0  (II-quadrant)
+//               = -pi + atan( |y|/|x| )  if x < 0 and y < 0   (III-quadrant)
+//               = -atan( |y|/|x| )       if x >= 0 and y < 0  (IV-quadrant)
+// This can be simplified to:
+//   atan2(y, x) = sign(y) * atan( |y|/|x| )        if x >= 0
+//               = sign(y) * (pi - atan( |y|/|x| )) if x < 0
+
+// * Range reduction 2: reciprocal
+// Now that the argument inside atan is positive, we can use the formula:
+//   atan(1/x) = pi/2 - atan(x)
+// to make the argument inside atan <= 1 as follows:
+//   atan2(y, x) = sign(y) * atan( |y|/|x| )          if 0 <= |y| <= x
+//               = sign(y) * (pi/2 - atan( |x|/|y| )) if 0 <= x < |y|
+//               = sign(y) * (pi - atan( |y|/|x| ))   if 0 <= |y| <= -x
+//               = sign(y) * (pi/2 + atan( |x|/|y| )) if 0 <= -x < |y|
+
+// * Range reduction 3: lookup table.
+// After the previous two range reduction steps, we reduce the problem to
+// computing atan(u) with 0 <= u <= 1, or to be precise:
+//   atan( n / d ) where n = min(|x|, |y|) and d = max(|x|, |y|).
+// An accurate polynomial approximation for the whole [0, 1] input range would
+// require a very large degree. To make it more efficient, we reduce the input
+// range further by finding an integer idx such that:
+//   | n/d - idx/64 | <= 1/128.
+// In particular,
+//   idx := round(2^6 * n/d)
+// Then for the fast pass, we find a polynomial approximation for:
+//   atan( n/d ) ~ atan( idx/64 ) + (n/d - idx/64) * Q(n/d - idx/64)
+// For the accurate pass, we use the addition formula:
+//   atan( n/d ) - atan( idx/64 ) = atan( (n/d - idx/64)/(1 + (n*idx)/(64*d)) )
+//                                = atan( (n - d*(idx/64))/(d + n*(idx/64)) )
+// And for the fast pass, we use a degree-13 minimax polynomial to compute the
+// RHS:
+//   atan(u) ~ P(u) = u - c_3 * u^3 + c_5 * u^5 - c_7 * u^7 + c_9 * u^9 -
+//                    c_11 * u^11 + c_13 * u^13
+// with absolute errors bounded by:
+//   |atan(u) - P(u)| < 2^-121
+// and relative errors bounded by:
+//   |(atan(u) - P(u)) / P(u)| < 2^-114.
+
+LLVM_LIBC_FUNCTION(float128, atan2f128, (float128 y, float128 x)) {
+  using FPBits = fputil::FPBits<float128>;
+  using Float128 = fputil::DyadicFloat<128>;
+
+  FPBits x_bits(x), y_bits(y);
+  bool x_sign = x_bits.sign().is_neg();
+  bool y_sign = y_bits.sign().is_neg();
+  x_bits = x_bits.abs();
+  y_bits = y_bits.abs();
+  UInt128 x_abs = x_bits.uintval();
+  UInt128 y_abs = y_bits.uintval();
+  bool recip = x_abs < y_abs;
+  UInt128 min_abs = recip ? x_abs : y_abs;
+  UInt128 max_abs = !recip ? x_abs : y_abs;
+  unsigned min_exp = static_cast<unsigned>(min_abs >> FPBits::FRACTION_LEN);
+  unsigned max_exp = static_cast<unsigned>(max_abs >> FPBits::FRACTION_LEN);
+
+  Float128 num(FPBits(min_abs).get_val());
+  Float128 den(FPBits(max_abs).get_val());
+
+  // Check for exceptional cases: inputs that are 0, inf, or nan, or close to
+  // overflow or underflow.
+  if (LIBC_UNLIKELY(max_exp >= 0x7fffU || min_exp == 0U)) {
+    if (x_bits.is_nan() || y_bits.is_nan())
+      return FPBits::quiet_nan().get_val();
+    unsigned x_except = x == 0 ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1);
+    unsigned y_except = y == 0 ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1);
+
+    // Exceptional cases:
+    //   EXCEPTS[y_except][x_except][x_is_neg]
+    // with x_except & y_except:
+    //   0: zero
+    //   1: finite, non-zero
+    //   2: infinity
+    constexpr Float128 EXCEPTS[3][3][2] = {
+        {{ZERO, PI}, {ZERO, PI}, {ZERO, PI}},
+        {{PI_OVER_2, PI_OVER_2}, {ZERO, ZERO}, {ZERO, PI}},
+        {{PI_OVER_2, PI_OVER_2},
+         {PI_OVER_2, PI_OVER_2},
+         {PI_OVER_4, THREE_PI_OVER_4}},
+    };
+
+    if ((x_except != 1) || (y_except != 1)) {
+      Float128 r = EXCEPTS[y_except][x_except][x_sign];
+      if (y_sign)
+        r.sign = r.sign.negate();
+      return static_cast<float128>(r);
+    }
+  }
+
+  bool final_sign = ((x_sign != y_sign) != recip);
+  Float128 const_term = CONST_ADJ[x_sign][y_sign][recip];
+  int exp_diff = den.exponent - num.exponent;
+  // We have the following bound for normalized n and d:
+  //   2^(-exp_diff - 1) < n/d < 2^(-exp_diff + 1).
+  if (LIBC_UNLIKELY(exp_diff > FPBits::FRACTION_LEN + 2)) {
+    if (final_sign)
+      const_term.sign = const_term.sign.negate();
+    return static_cast<float128>(const_term);
+  }
+
+  // Take the 24 leading bits of num and den to convert to float for fast
+  // division. We also multiply the numerator by 64 using integer addition
+  // directly on the exponent field.
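+  // (A quick sanity check of this trick, assuming the standard binary32
+  // layout: adding 6 to the biased exponent field of a positive float
+  // multiplies its value by 2^6 = 64, so num_f / den_f below approximates
+  // 64 * n/d, and idx = round(2^6 * n/d) as described above.)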
+  float num_f =
+      cpp::bit_cast<float>(static_cast<uint32_t>(num.mantissa >> 104) +
+                           (6U << fputil::FPBits<float>::FRACTION_LEN));
+  float den_f = cpp::bit_cast<float>(
+      static_cast<uint32_t>(den.mantissa >> 104) +
+      (static_cast<uint32_t>(exp_diff) << fputil::FPBits<float>::FRACTION_LEN));
+
+  float k = fputil::nearest_integer(num_f / den_f);
+  unsigned idx = static_cast<unsigned>(k);
+
+  // k_f128 = idx / 64
+  Float128 k_f128(Sign::POS, -6, Float128::MantissaType(idx));
+
+  // Range reduction:
+  //   atan(n/d) - atan(k) = atan((n/d - k/64) / (1 + (n/d) * (k/64)))
+  //                       = atan((n - d * k/64) / (d + n * k/64))
+  // num_f128 = n - d * k/64
+  Float128 num_f128 = fputil::multiply_add(den, -k_f128, num);
+  // den_f128 = d + n * k/64
+  Float128 den_f128 = fputil::multiply_add(num, k_f128, den);
+
+  // q = (n - d * k/64) / (d + n * k/64)
+  Float128 q = fputil::quick_mul(num_f128, fputil::approx_reciprocal(den_f128));
+  // p ~ atan(q)
+  Float128 p = atan_eval(q);
+
+  Float128 r =
+      fputil::quick_add(const_term, fputil::quick_add(ATAN_I_F128[idx], p));
+  if (final_sign)
+    r.sign = r.sign.negate();
+
+  return static_cast<float128>(r);
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/atan_utils.h b/libc/src/math/generic/atan_utils.h
index 3331843900d4b..24c7271b7e4ec 100644
--- a/libc/src/math/generic/atan_utils.h
+++ b/libc/src/math/generic/atan_utils.h
@@ -9,8 +9,11 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_ATAN_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_ATAN_UTILS_H

+#include "src/__support/FPUtil/PolyEval.h"
 #include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/dyadic_float.h"
 #include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/integer_literals.h"
 #include "src/__support/macros/config.h"

 namespace LIBC_NAMESPACE_DECL {
@@ -18,6 +21,7 @@ namespace LIBC_NAMESPACE_DECL {
 namespace {

 using DoubleDouble = fputil::DoubleDouble;
+using Float128 = fputil::DyadicFloat<128>;

 // atan(i/64) with i = 0..64, generated by Sollya with:
 // > for i from 0 to 64 do {
@@ -25,7 +29,7 @@ using DoubleDouble = fputil::DoubleDouble;
 //     b = round(atan(i/64) - a, D, RN);
 //     print("{", b, ",", a, "},");
 //   };
-constexpr fputil::DoubleDouble ATAN_I[65] = {
+constexpr DoubleDouble ATAN_I[65] = {
     {0.0, 0.0},
     {-0x1.220c39d4dff5p-61, 0x1.fff555bbb729bp-7},
     {-0x1.5ec431444912cp-60, 0x1.ffd55bba97625p-6},
@@ -106,7 +110,7 @@ constexpr fputil::DoubleDouble ATAN_I[65] = {
 //     + x_lo * (1 - x_hi^2 + x_hi^4)
 // Since p.lo is ~ x^3/3, the relative error from rounding is bounded by:
 //   |(atan(x) - P(x))/atan(x)| < ulp(x^2) <= 2^(-14-52) = 2^-66.
-DoubleDouble atan_eval(const DoubleDouble &x) {
+[[maybe_unused]] DoubleDouble atan_eval(const DoubleDouble &x) {
   DoubleDouble p;
   p.hi = x.hi;
   double x_hi_sq = x.hi * x.hi;
@@ -130,6 +134,106 @@ DoubleDouble atan_eval(const DoubleDouble &x) {
   return p;
 }

+// Float128 versions.
+// atan(i/64) with i = 0..64, generated by Sollya with: +// > for i from 1 to 64 do { +// a = round(atan(i/64), 128, RN); +// ll = ceil(log2(a)); +// b = 2^ll + a; +// print("{Sign::POS, ", 2^(ll - 128), ",", b, "},"); +// }; +constexpr Float128 ATAN_I_F128[65] = { + {Sign::POS, 0, 0_u128}, + {Sign::POS, -134, 0xfffaaadd'db94d5bb'e78c5640'15f76048_u128}, + {Sign::POS, -133, 0xffeaaddd'4bb12542'779d776d'da8c6214_u128}, + {Sign::POS, -132, 0xbfdc0c21'86d14fcf'220e10d6'1df56ec7_u128}, + {Sign::POS, -132, 0xffaaddb9'67ef4e36'cb2792dc'0e2e0d51_u128}, + {Sign::POS, -131, 0x9facf873'e2aceb58'99c50bbf'08e6cdf6_u128}, + {Sign::POS, -131, 0xbf70c130'17887460'93567e78'4cf83676_u128}, + {Sign::POS, -131, 0xdf1cf5f3'783e1bef'71e5340b'30e5d9ef_u128}, + {Sign::POS, -131, 0xfeadd4d5'617b6e32'c897989f'3e888ef8_u128}, + {Sign::POS, -130, 0x8f0fd7d8'21b93725'bd375929'83a0af9a_u128}, + {Sign::POS, -130, 0x9eb77746'331362c3'47619d25'0360fe85_u128}, + {Sign::POS, -130, 0xae4c08f1'f6134efa'b54d3fef'0c2de994_u128}, + {Sign::POS, -130, 0xbdcbda5e'72d81134'7b0b4f88'1c9c7488_u128}, + {Sign::POS, -130, 0xcd35474b'643130e7'b00f3da1'a46eeb3b_u128}, + {Sign::POS, -130, 0xdc86ba94'93051022'f621a5c1'cb552f03_u128}, + {Sign::POS, -130, 0xebbeaef9'02b9b38c'91a2a68b'2fbd78e8_u128}, + {Sign::POS, -130, 0xfadbafc9'6406eb15'6dc79ef5'f7a217e6_u128}, + {Sign::POS, -129, 0x84ee2cbe'c31b12c5'c8e72197'0cabd3a3_u128}, + {Sign::POS, -129, 0x8c5fad18'5f8bc130'ca4748b1'bf88298d_u128}, + {Sign::POS, -129, 0x93c1b902'bf7a2df1'06459240'6fe1447a_u128}, + {Sign::POS, -129, 0x9b13b9b8'3f5e5e69'c5abb498'd27af328_u128}, + {Sign::POS, -129, 0xa25521b6'15784d45'43787549'88b8d9e3_u128}, + {Sign::POS, -129, 0xa9856cca'8e6a4eda'99b7f77b'f7d9e8c1_u128}, + {Sign::POS, -129, 0xb0a42018'4e7f0cb1'b51d51dc'200a0fc3_u128}, + {Sign::POS, -129, 0xb7b0ca0f'26f78473'8aa32122'dcfe4483_u128}, + {Sign::POS, -129, 0xbeab025b'1d9fbad3'910b8564'93411026_u128}, + {Sign::POS, -129, 0xc59269ca'50d92b6d'a1746e91'f50a28de_u128}, + {Sign::POS, -129, 0xcc66aa2a'6b58c33c'd9311fa1'4ed9b7c4_u128}, + {Sign::POS, -129, 0xd327761e'611fe5b6'427c95e9'001e7136_u128}, + {Sign::POS, -129, 0xd9d488ed'32e3635c'30f6394a'0806345d_u128}, + {Sign::POS, -129, 0xe06da64a'764f7c67'c631ed96'798cb804_u128}, + {Sign::POS, -129, 0xe6f29a19'609a84ba'60b77ce1'ca6dc2c8_u128}, + {Sign::POS, -129, 0xed63382b'0dda7b45'6fe445ec'bc3a8d03_u128}, + {Sign::POS, -129, 0xf3bf5bf8'bad1a21c'a7b837e6'86adf3fa_u128}, + {Sign::POS, -129, 0xfa06e85a'a0a0be5c'66d23c7d'5dc8ecc2_u128}, + {Sign::POS, -128, 0x801ce39e'0d205c99'a6d6c6c5'4d938596_u128}, + {Sign::POS, -128, 0x832bf4a6'd9867e2a'4b6a09cb'61a515c1_u128}, + {Sign::POS, -128, 0x8630a2da'da1ed065'd3e84ed5'013ca37e_u128}, + {Sign::POS, -128, 0x892aecdf'de9547b5'094478fc'472b4afc_u128}, + {Sign::POS, -128, 0x8c1ad445'f3e09b8c'439d8018'60205921_u128}, + {Sign::POS, -128, 0x8f005d5e'f7f59f9b'5c835e16'65c43748_u128}, + {Sign::POS, -128, 0x91db8f16'64f350e2'10e4f9c1'126e0220_u128}, + {Sign::POS, -128, 0x94ac72c9'847186f6'18c4f393'f78a32f9_u128}, + {Sign::POS, -128, 0x97731420'365e538b'abd3fe19'f1aeb6b3_u128}, + {Sign::POS, -128, 0x9a2f80e6'71bdda20'4226f8e2'204ff3bd_u128}, + {Sign::POS, -128, 0x9ce1c8e6'a0b8cdb9'f799c4e8'174cf11c_u128}, + {Sign::POS, -128, 0x9f89fdc4'f4b7a1ec'f8b49264'4f0701e0_u128}, + {Sign::POS, -128, 0xa22832db'cadaae08'92fe9c08'637af0e6_u128}, + {Sign::POS, -128, 0xa4bc7d19'34f70924'19a87f2a'457dac9f_u128}, + {Sign::POS, -128, 0xa746f2dd'b7602294'67b7d66f'2d74e019_u128}, + {Sign::POS, -128, 0xa9c7abdc'4830f5c8'916a84b5'be7933f6_u128}, + 
{Sign::POS, -128, 0xac3ec0fb'997dd6a1'a36273a5'6afa8ef4_u128}, + {Sign::POS, -128, 0xaeac4c38'b4d8c080'14725e2f'3e52070a_u128}, + {Sign::POS, -128, 0xb110688a'ebdc6f6a'43d65788'b9f6a7b5_u128}, + {Sign::POS, -128, 0xb36b31c9'1f043691'59014174'4462f93a_u128}, + {Sign::POS, -128, 0xb5bcc490'59ecc4af'f8f3cee7'5e3907d5_u128}, + {Sign::POS, -128, 0xb8053e2b'c2319e73'cb2da552'10a4443d_u128}, + {Sign::POS, -128, 0xba44bc7d'd470782f'654c2cb1'0942e386_u128}, + {Sign::POS, -128, 0xbc7b5dea'e98af280'd4113006'e80fb290_u128}, + {Sign::POS, -128, 0xbea94144'fd049aac'1043c5e7'55282e7d_u128}, + {Sign::POS, -128, 0xc0ce85b8'ac526640'89dd62c4'6e92fa25_u128}, + {Sign::POS, -128, 0xc2eb4abb'661628b5'b373fe45'c61bb9fb_u128}, + {Sign::POS, -128, 0xc4ffaffa'bf8fbd54'8cb43d10'bc9e0221_u128}, + {Sign::POS, -128, 0xc70bd54c'e602ee13'e7d54fbd'09f2be38_u128}, + {Sign::POS, -128, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128}, +}; + +// Degree-13 minimax polynomial generated by Sollya with: +// > P = fpminimax(atan(x), [|1, 3, 5, 7, 9, 11, 13|], [|1, 128...|], +// [0, 2^-7]); +// > dirtyinfnorm(atan(x) - P, [0, 2^-7]); +// 0x1.26016ad97f323875760f869684c0898d7b7bb8bep-122 +constexpr Float128 ATAN_POLY_F128[] = { + {Sign::NEG, -129, 0xaaaaaaaa'aaaaaaaa'aaaaaaa6'003c5d1d_u128}, + {Sign::POS, -130, 0xcccccccc'cccccccc'cca00232'8776b063_u128}, + {Sign::NEG, -130, 0x92492492'49249201'27f5268a'cb24aec0_u128}, + {Sign::POS, -131, 0xe38e38e3'8dce3d96'626a1643'f8eb68f3_u128}, + {Sign::NEG, -131, 0xba2e8b7a'ea4ad00f'005a35c7'6ef609b1_u128}, + {Sign::POS, -131, 0x9d82765e'd22a7d92'ac09c405'c0a69214_u128}, +}; + +// Approximate atan for |x| <= 2^-7. +[[maybe_unused]] Float128 atan_eval(const Float128 &x) { + Float128 x_sq = fputil::quick_mul(x, x); + Float128 x3 = fputil::quick_mul(x, x_sq); + Float128 p = fputil::polyeval(x_sq, ATAN_POLY_F128[0], ATAN_POLY_F128[1], + ATAN_POLY_F128[2], ATAN_POLY_F128[3], + ATAN_POLY_F128[4], ATAN_POLY_F128[5]); + return fputil::multiply_add(x3, p, x); +} + } // anonymous namespace } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 514c01834c1a4..7ee8b86135557 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -2413,6 +2413,18 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + atan2f128_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + atan2f128_test.cpp + DEPENDS + libc.src.math.atan2f128 + libc.src.__support.FPUtil.fp_bits +) + add_fp_unittest( f16add_test NEED_MPFR diff --git a/libc/test/src/math/atan2f128_test.cpp b/libc/test/src/math/atan2f128_test.cpp new file mode 100644 index 0000000000000..c03d8703d7c7b --- /dev/null +++ b/libc/test/src/math/atan2f128_test.cpp @@ -0,0 +1,99 @@ +//===-- Unittests for atan2f128 -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2f128.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcAtan2f128Test = LIBC_NAMESPACE::testing::FPTest<float128>;
+using LIBC_NAMESPACE::testing::tlog;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+TEST_F(LlvmLibcAtan2f128Test, InQuadRange) {
+  constexpr StorageType X_COUNT = 123;
+  constexpr StorageType X_START = FPBits(0.25q).uintval();
+  constexpr StorageType X_STOP = FPBits(4.0q).uintval();
+  constexpr StorageType X_STEP = (X_STOP - X_START) / X_COUNT;
+
+  constexpr StorageType Y_COUNT = 137;
+  constexpr StorageType Y_START = FPBits(0.25q).uintval();
+  constexpr StorageType Y_STOP = FPBits(4.0q).uintval();
+  constexpr StorageType Y_STEP = (Y_STOP - Y_START) / Y_COUNT;
+
+  auto test = [&](mpfr::RoundingMode rounding_mode) {
+    mpfr::ForceRoundingMode __r(rounding_mode);
+    if (!__r.success)
+      return;
+
+    uint64_t fails = 0;
+    uint64_t finite_count = 0;
+    uint64_t total_count = 0;
+    float128 failed_x = 0.0, failed_y = 0.0, failed_r = 0.0;
+    double tol = 0.5;
+
+    for (StorageType i = 0, v = X_START; i <= X_COUNT; ++i, v += X_STEP) {
+      float128 x = FPBits(v).get_val();
+      if (FPBits(x).is_inf_or_nan() || x < 0.0q)
+        continue;
+
+      for (StorageType j = 0, w = Y_START; j <= Y_COUNT; ++j, w += Y_STEP) {
+        float128 y = FPBits(w).get_val();
+        if (FPBits(y).is_inf_or_nan())
+          continue;
+
+        float128 result = LIBC_NAMESPACE::atan2f128(x, y);
+        ++total_count;
+        if (FPBits(result).is_inf_or_nan())
+          continue;
+
+        ++finite_count;
+        mpfr::BinaryInput<float128> inputs{x, y};
+
+        if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Atan2, inputs,
+                                               result, 2.0, rounding_mode)) {
+          ++fails;
+          while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(
+              mpfr::Operation::Atan2, inputs, result, tol, rounding_mode)) {
+            failed_x = x;
+            failed_y = y;
+            failed_r = result;
+
+            if (tol > 1000.0)
+              break;
+
+            tol *= 2.0;
+          }
+        }
+      }
+    }
+    if (fails || (finite_count < total_count)) {
+      tlog << " Atan2 failed: " << fails << "/" << finite_count << "/"
+           << total_count << " tests.\n"
+           << "   Max ULPs is at most: " << static_cast<uint64_t>(tol) << ".\n";
+    }
+    if (fails) {
+      mpfr::BinaryInput<float128> inputs{failed_x, failed_y};
+      EXPECT_MPFR_MATCH(mpfr::Operation::Atan2, inputs, failed_r, 0.5,
+                        rounding_mode);
+    }
+  };
+
+  tlog << " Test Rounding To Nearest...\n";
+  test(mpfr::RoundingMode::Nearest);
+
+  tlog << " Test Rounding Downward...\n";
+  test(mpfr::RoundingMode::Downward);
+
+  tlog << " Test Rounding Upward...\n";
+  test(mpfr::RoundingMode::Upward);
+
+  tlog << " Test Rounding Toward Zero...\n";
+  test(mpfr::RoundingMode::TowardZero);
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index a8c602b388504..bf6999d5d5649 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -4084,6 +4084,16 @@ add_fp_unittest(
     libc.src.math.atan2
 )
 
+add_fp_unittest(
+  atan2f128_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    atan2f128_test.cpp
+  DEPENDS
+    libc.src.math.atan2f128
+)
+
 add_fp_unittest(
   scalbln_test
   SUITE
diff --git a/libc/test/src/math/smoke/atan2f128_test.cpp b/libc/test/src/math/smoke/atan2f128_test.cpp
new file mode 100644
index 0000000000000..9d539f80bbd79
--- /dev/null
+++ b/libc/test/src/math/smoke/atan2f128_test.cpp
@@ -0,0 +1,28 @@
+//===-- Unittests for atan2f128 -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2f128.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcAtan2f128Test = LIBC_NAMESPACE::testing::FPTest<float128>;
+
+TEST_F(LlvmLibcAtan2f128Test, SpecialNumbers) {
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2f128(aNaN, zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2f128(1.0, aNaN));
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::atan2f128(zero, zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero,
+                            LIBC_NAMESPACE::atan2f128(neg_zero, zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::atan2f128(1.0, inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::atan2f128(-1.0, inf));
+
+  float128 x = 0x1.ffffffffffffffffffffffffffe7p1q;
+  float128 y = 0x1.fffffffffffffffffffffffffff2p1q;
+  float128 r = 0x1.921fb54442d18469898cc51701b3p-1q;
+  EXPECT_FP_EQ(r, LIBC_NAMESPACE::atan2f128(x, y));
+}

From 19fb4b04a605c4a5759c72f61e8e5f5cac406a0b Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 1 Apr 2025 08:04:05 -0700
Subject: [PATCH 0248/1029] [RISCV] Validate the end of register ranges in
 Zcmp register lists. (#133866)

We were only checking that the last register was a register, not that
it was a legal register for a register list. This caused the encoder
function to hit an llvm_unreachable.

The error messages are not good, but this is only one of multiple things
that need to be fixed in this function. I'll focus on error messages
later once I have the other issues fixed.
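To make the intent of the new checks concrete, here is a minimal standalone
model of the first one (a hypothetical helper, not the parser's actual code;
the numbers are architectural x-register indices, so 9 is s1 and 18..27 are
s2..s11):

  // A Zcmp register list is {ra} or {ra, s0-sN}; a range spelled with
  // architectural register names can therefore only end at x9 (s1) or at
  // one of x18..x27 (s2..s11).
  static bool isValidZcmpRangeEnd(unsigned XRegNum) {
    return XRegNum == 9 || (XRegNum >= 18 && XRegNum <= 27);
  }

Any other end register, such as the t1 or x28 used in the tests below, now
produces a parse error instead of reaching the encoder.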
--- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 5 +++-- llvm/test/MC/RISCV/rv32zcmp-invalid.s | 22 +++++++++++++++++-- llvm/test/MC/RISCV/rv64zcmp-invalid.s | 21 ++++++++++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d65eaac3716a1..6384a1c44196b 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2620,7 +2620,8 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, StringRef EndName = getLexer().getTok().getIdentifier(); // FIXME: the register mapping and checks of EABI is wrong RegEnd = matchRegisterNameHelper(EndName); - if (!RegEnd) + if (!(RegEnd == RISCV::X9 || + (RegEnd >= RISCV::X18 && RegEnd <= RISCV::X27))) return Error(getLoc(), "invalid register"); if (IsEABI && RegEnd != RISCV::X9) return Error(getLoc(), "contiguous register list of EABI can only be " @@ -2653,7 +2654,7 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, return Error(getLoc(), "invalid register"); EndName = getLexer().getTok().getIdentifier(); RegEnd = MatchRegisterName(EndName); - if (!RegEnd) + if (!(RegEnd >= RISCV::X19 && RegEnd <= RISCV::X27)) return Error(getLoc(), "invalid register"); getLexer().Lex(); } diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s index 4115333fc738b..f89829a33bd9e 100644 --- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s @@ -25,5 +25,23 @@ cm.pop {ra, s0-s1}, -32 # CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, -8 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] -cm.pop {ra, s0-s1}, -40 +# CHECK-ERROR: :[[@LINE+1]]:9: error: register list must start from 'ra' or 'x1' +cm.pop {s0}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:13: error: continuous register list must start from 's0' or 'x8' +cm.pop {ra, t1}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:16: error: invalid register +cm.pop {ra, s0-t1}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:20: error: second contiguous registers pair of register list must start from 'x18' +cm.pop {ra, x8-x9, x28}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:24: error: invalid register +cm.pop {ra, x8-x9, x18-x28}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:24: error: invalid register +cm.pop {ra, x8-x9, x18-x17}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:16: error: invalid register +cm.pop {ra, x8-f8, x18-x17}, -40 diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s index 804234d2c11e6..7f90bf73ac713 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s @@ -27,3 +27,24 @@ cm.push {ra}, -15 # CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] cm.pop {ra, s0-s1}, -33 + +# CHECK-ERROR: :[[@LINE+1]]:9: error: register list must start from 'ra' or 'x1' +cm.pop {s0}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:13: error: continuous register list must start from 's0' or 'x8' +cm.pop {ra, t1}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:16: error: invalid register +cm.pop {ra, s0-t1}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:20: error: second contiguous registers pair of register list must start from 'x18' +cm.pop {ra, x8-x9, x28}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:24: error: invalid register +cm.pop {ra, x8-x9, x18-x28}, -40 + 
+# CHECK-ERROR: :[[@LINE+1]]:24: error: invalid register
+cm.pop {ra, x8-x9, x18-x17}, -40
+
+# CHECK-ERROR: :[[@LINE+1]]:16: error: invalid register
+cm.pop {ra, x8-f8, x18-x17}, -40

From 4e6c48f1e74c46e62342f24ac879fd32d9a6c783 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 1 Apr 2025 08:04:32 -0700
Subject: [PATCH 0249/1029] [RISCV] Merge RegStart with RegEnd in
 parseRegListCommon. NFC (#133867)

We only need to keep track of the last register seen. We never need the
first register once we've parsed it.

Currently if s0/x8 is used RegStart will point to that and not ra/x1 so
it already isn't the start.
---
 llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 6384a1c44196b..f1ccf0cd052ba 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -2583,9 +2583,8 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands,
     return Error(getLoc(), "register list must start from 'ra' or 'x1'");
 
   StringRef RegName = getLexer().getTok().getIdentifier();
-  MCRegister RegStart = matchRegisterNameHelper(RegName);
-  MCRegister RegEnd;
-  if (RegStart != RISCV::X1)
+  MCRegister RegEnd = matchRegisterNameHelper(RegName);
+  if (RegEnd != RISCV::X1)
     return Error(getLoc(), "register list must start from 'ra' or 'x1'");
   getLexer().Lex();
 
@@ -2606,10 +2605,10 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands,
     if (getLexer().isNot(AsmToken::Identifier))
       return Error(getLoc(), "invalid register");
     StringRef RegName = getLexer().getTok().getIdentifier();
-    RegStart = matchRegisterNameHelper(RegName);
-    if (!RegStart)
+    RegEnd = matchRegisterNameHelper(RegName);
+    if (!RegEnd)
       return Error(getLoc(), "invalid register");
-    if (RegStart != RISCV::X8)
+    if (RegEnd != RISCV::X8)
      return Error(getLoc(),
                   "continuous register list must start from 's0' or 'x8'");
     getLexer().Lex(); // eat reg
@@ -2668,9 +2667,6 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands,
   if (parseToken(AsmToken::RCurly, "register list must end with '}'"))
     return ParseStatus::Failure;
 
-  if (!RegEnd)
-    RegEnd = RegStart;
-
   auto Encode = RISCVZC::encodeRlist(RegEnd, IsEABI);
   assert(Encode != RISCVZC::INVALID_RLIST);
   if (MustIncludeS0)

From 1ab3a4f234e38904f9935d02a696a52d1e5dff71 Mon Sep 17 00:00:00 2001
From: Slava Zakharin
Date: Tue, 1 Apr 2025 08:04:45 -0700
Subject: [PATCH 0250/1029] [flang-rt][NFC] Work around CTK12.8 compilation
 failure. (#133833)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It happened in https://lab.llvm.org/buildbot/#/builders/152/builds/1131
when the buildbot was switched from CTK12.3 to CTK12.8.
The logs are gone by now, so the above link is useless.
The error was:
error: ‘auto’ not permitted in template argument

This workaround helps, but I also reported the issue to NVCC devs.
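The construct nvcc rejects can be reduced to roughly the following standalone
C++17 sketch (hypothetical names throughout; `ResultTy` stands in for the
runtime's optional pair of type category and kind):

  #include <optional>
  #include <utility>

  using ResultTy = std::optional<std::pair<int, int>>;

  constexpr ResultTy getResultType(bool ok) {
    return ok ? ResultTy{std::pair{1, 4}} : ResultTy{};
  }

  template <bool OK> constexpr int secondOrZero() {
    // nvcc 12.8 reportedly chokes on the 'constexpr auto' spelling of this
    // declaration; naming the type, as the patch does, sidesteps the bug.
    if constexpr (constexpr ResultTy resultType{getResultType(OK)}) {
      return resultType->second;
    } else {
      return 0;
    }
  }

Both spellings are valid C++17, so the change is purely a compiler workaround.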
---
 flang-rt/lib/runtime/matmul-transpose.cpp | 3 ++-
 flang-rt/lib/runtime/matmul.cpp           | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/flang-rt/lib/runtime/matmul-transpose.cpp b/flang-rt/lib/runtime/matmul-transpose.cpp
index e20abbdddcd30..8666167e19472 100644
--- a/flang-rt/lib/runtime/matmul-transpose.cpp
+++ b/flang-rt/lib/runtime/matmul-transpose.cpp
@@ -331,6 +331,7 @@ template <TypeCategory XCAT, int XKIND, TypeCategory YCAT, int YKIND,
 struct MatmulTransposeHelper {
   using ResultDescriptor =
       std::conditional_t<IS_ALLOCATING, Descriptor, StaticDescriptor<2>>;
+  using ResultTy = Fortran::common::optional<std::pair<TypeCategory, int>>;
   RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
       const Descriptor &y, const char *sourceFile, int line) const {
     Terminator terminator{sourceFile, line};
@@ -339,7 +340,7 @@ struct MatmulTransposeHelper {
     RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
     RUNTIME_CHECK(terminator, xCatKind->first == XCAT);
     RUNTIME_CHECK(terminator, yCatKind->first == YCAT);
-    if constexpr (constexpr auto resultType{
+    if constexpr (constexpr ResultTy resultType{
                       GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
       return DoMatmulTranspose<IS_ALLOCATING, resultType->first, resultType->second,
           CppTypeFor<XCAT, XKIND>, CppTypeFor<YCAT, YKIND>>(
diff --git a/flang-rt/lib/runtime/matmul.cpp b/flang-rt/lib/runtime/matmul.cpp
index f14cea922d21e..693e51082bd43 100644
--- a/flang-rt/lib/runtime/matmul.cpp
+++ b/flang-rt/lib/runtime/matmul.cpp
@@ -424,6 +424,7 @@ static inline RT_API_ATTRS void DoMatmul(
 template <TypeCategory XCAT, int XKIND, TypeCategory YCAT, int YKIND,
     bool IS_ALLOCATING>
 struct MatmulHelper {
+  using ResultTy = Fortran::common::optional<std::pair<TypeCategory, int>>;
   using ResultDescriptor =
       std::conditional_t<IS_ALLOCATING, Descriptor, StaticDescriptor<2>>;
   RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
@@ -439,7 +440,7 @@ struct MatmulHelper {
                   xCatKind->first == TypeCategory::Unsigned) &&
                  (yCatKind->first == TypeCategory::Integer ||
                   yCatKind->first == TypeCategory::Unsigned))));
-    if constexpr (constexpr auto resultType{
+    if constexpr (constexpr ResultTy resultType{
                       GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
      return DoMatmul<IS_ALLOCATING, resultType->first, resultType->second,
          CppTypeFor<XCAT, XKIND>, CppTypeFor<YCAT, YKIND>>(

From aa73124e51d89a22b2ba89380d3a1403e4f1c385 Mon Sep 17 00:00:00 2001
From: Zahira Ammarguellat
Date: Tue, 1 Apr 2025 08:10:51 -0700
Subject: [PATCH 0251/1029] Fix complex long double division with -mno-x87.
 (#133152)

The combination of `-fcomplex-arithmetic=promoted` and `-mno-x87` for
`double` complex division is leading to a crash. See
https://godbolt.org/z/189G957oY
This patch fixes that.
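In rough terms, `-fcomplex-arithmetic=promoted` carries out a narrow complex
division in the next wider format and rounds back, along the lines of this
hand-written model (illustrative only; the crash occurred on the analogous
`double` to `long double` step when the target has no x87 `long double`):

  #include <complex>

  static std::complex<float> promoted_div(std::complex<float> a,
                                          std::complex<float> b) {
    // Do the divide in double, whose exponent range is wide enough that the
    // intermediate products of a float division cannot overflow or
    // underflow, then round the result back to float.
    std::complex<double> wide =
        std::complex<double>(a) / std::complex<double>(b);
    return {static_cast<float>(wide.real()), static_cast<float>(wide.imag())};
  }

When the wider type does not exist (or, as here, the frontend still believed
it did), the compiler must instead fall back to the range-reduced algorithm
seen in the NOX87 checks below.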
--- clang/lib/CodeGen/CGExprComplex.cpp | 4 ++ clang/lib/Sema/SemaExpr.cpp | 6 +- clang/test/CodeGen/promoted-complex-div.c | 83 +++++++++++++++++++++++ 3 files changed, 91 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/promoted-complex-div.c diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index a7c8b96da6853..184a355734046 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -303,6 +303,10 @@ class ComplexExprEmitter // doubles the exponent of SmallerType.LargestFiniteVal) if (llvm::APFloat::semanticsMaxExponent(ElementTypeSemantics) * 2 + 1 <= llvm::APFloat::semanticsMaxExponent(HigherElementTypeSemantics)) { + if (!Ctx.getTargetInfo().hasLongDoubleType() && + HigherElementType.getCanonicalType().getUnqualifiedType() == + Ctx.LongDoubleTy) + return QualType(); FPHasBeenPromoted = true; return Ctx.getComplexType(HigherElementType); } else { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 48dab1c1c94a8..1c0ef39878d7f 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -15329,8 +15329,10 @@ static void DetectPrecisionLossInComplexDivision(Sema &S, SourceLocation OpLoc, Ctx.getFloatTypeSemantics(ElementType); const llvm::fltSemantics &HigherElementTypeSemantics = Ctx.getFloatTypeSemantics(HigherElementType); - if (llvm::APFloat::semanticsMaxExponent(ElementTypeSemantics) * 2 + 1 > - llvm::APFloat::semanticsMaxExponent(HigherElementTypeSemantics)) { + if ((llvm::APFloat::semanticsMaxExponent(ElementTypeSemantics) * 2 + 1 > + llvm::APFloat::semanticsMaxExponent(HigherElementTypeSemantics)) || + (HigherElementType == Ctx.LongDoubleTy && + !Ctx.getTargetInfo().hasLongDoubleType())) { // Retain the location of the first use of higher precision type. 
if (!S.LocationOfExcessPrecisionNotSatisfied.isValid()) S.LocationOfExcessPrecisionNotSatisfied = OpLoc; diff --git a/clang/test/CodeGen/promoted-complex-div.c b/clang/test/CodeGen/promoted-complex-div.c new file mode 100644 index 0000000000000..7ed7b07db83ae --- /dev/null +++ b/clang/test/CodeGen/promoted-complex-div.c @@ -0,0 +1,83 @@ +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown \ +// RUN: -verify -complex-range=promoted -o - | FileCheck %s + +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown \ +// RUN: -verify=nopromotion -complex-range=promoted -target-feature -x87 \ +// RUN: -o - | FileCheck %s --check-prefix=NOX87 + +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-windows \ +// RUN: -verify=nopromotion -complex-range=promoted -o - \ +// RUN: | FileCheck %s --check-prefix=NOX87 + +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-windows \ +// RUN: -verify=nopromotion -complex-range=promoted -target-feature -x87 \ +// RUN: -o - | FileCheck %s --check-prefix=NOX87 + + + +// expected-no-diagnostics + +// CHECK-LABEL: define dso_local <2 x float> @divd +_Complex float divd(_Complex float a, _Complex float b) { + // CHECK: fpext float {{.*}} to double + // CHECK: fpext float {{.*}} to double + // CHECK: fdiv double + // CHECK: fdiv double + // CHECK: fptrunc double {{.*}} to float + // CHECK: fptrunc double {{.*}} to float + + // NOX87: fpext float {{.*}} to double + // NOX87: fpext float {{.*}} to double + // NOX87: fdiv double + // NOX87: fdiv double + // NOX87: fptrunc double {{.*}} to float + // NOX87: fptrunc double {{.*}} to float + + return a / b; +} + +// CHECK-LABEL: define dso_local { double, double } @divf +_Complex double divf(_Complex double a, _Complex double b) { + // CHECK: fpext double {{.*}} to x86_fp80 + // CHECK: fpext double {{.*}} to x86_fp80 + // CHECK: fdiv x86_fp80 + // CHECK: fdiv x86_fp80 + // CHECK: fptrunc x86_fp80 + // CHECK: fptrunc x86_fp80 + + // NOX87: call double @llvm.fabs.f64(double {{.*}}) + // NOX87-NEXT: call double @llvm.fabs.f64(double {{.*}}) + // NOX87-NEXT: fcmp ugt double %{{.*}}, {{.*}} + // NOX87-NEXT: br i1 {{.*}}, label + // NOX87: abs_rhsr_greater_or_equal_abs_rhsi: + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fsub double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: br label {{.*}} + // NOX87: abs_rhsr_less_than_abs_rhsi: + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fadd double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: fmul double + // NOX87-NEXT: fsub double + // NOX87-NEXT: fdiv double + // NOX87-NEXT: br label + // NOX87: complex_div: + // NOX87-NEXT: phi double + // NOX87-NEXT: phi double + // NOX87-NEXT: getelementptr inbounds nuw { double, double }, ptr {{.*}}, i32 0, i32 0 + // NOX87-NEXT: getelementptr inbounds nuw { double, double }, ptr {{.*}}, i32 0, i32 1 + // NOX87-NEXT: store double + // NOX87-NEXT: store double + + return a / b; // nopromotion-warning{{excess precision is requested but the target does not support excess precision which may result in observable differences in complex division behavior}} +} From 664e847916ff7329da9e5295ecc17ec169e647eb Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 22:14:47 +0700 Subject: [PATCH 0252/1029] llvm-reduce: Fix invalid reduction on 
tokens in operands-to-args (#133855) --- .../reduce-operands-to-args-token.ll | 27 +++++++++++++++++++ .../deltas/ReduceOperandsToArgs.cpp | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-reduce/reduce-operands-to-args-token.ll diff --git a/llvm/test/tools/llvm-reduce/reduce-operands-to-args-token.ll b/llvm/test/tools/llvm-reduce/reduce-operands-to-args-token.ll new file mode 100644 index 0000000000000..6dba9c266551a --- /dev/null +++ b/llvm/test/tools/llvm-reduce/reduce-operands-to-args-token.ll @@ -0,0 +1,27 @@ +; Check no invalid reduction caused by introducing a token typed +; function argument + +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operands-to-args --test FileCheck --test-arg --check-prefix=INTERESTING --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck -check-prefix=RESULT %s < %t + +; INTERESTING-LABEL: define void @tokens( +; INTERESTING: call void @llvm.token.consumer + +; RESULT-LABEL: define void @tokens(ptr %local.ptr) { +; RESULT-NEXT: %local.ptr1 = alloca i32, align 4 +; RESULT-NEXT: %token = call token @llvm.token.producer() +; RESULT-NEXT: store i32 0, ptr %local.ptr, align 4 +; RESULT-NEXT: call void @llvm.token.consumer(token %token) +; RESULT-NEXT: store i32 1, ptr %local.ptr, align 4 +; RESULT-NEXT: ret void +define void @tokens() { + %local.ptr = alloca i32 + %token = call token @llvm.token.producer() + store i32 0, ptr %local.ptr + call void @llvm.token.consumer(token %token) + store i32 1, ptr %local.ptr + ret void +} + +declare token @llvm.token.producer() +declare void @llvm.token.consumer(token) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp index 4bae588d60c14..c23fc6742bfeb 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandsToArgs.cpp @@ -36,7 +36,7 @@ static bool canReduceUse(Use &Op) { return false; // Don't pass labels/metadata as arguments. - if (Ty->isLabelTy() || Ty->isMetadataTy()) + if (Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isTokenTy()) return false; // No need to replace values that are already arguments. From ac55688482637ce625edaa8a25ad6eced8992a22 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 22:17:44 +0700 Subject: [PATCH 0253/1029] llvm-reduce: Add test for token handling in operands-skip (#133857) Seems to work correctly but wasn't tested. 
--- .../llvm-reduce/reduce-operands-skip-token.ll | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 llvm/test/tools/llvm-reduce/reduce-operands-skip-token.ll diff --git a/llvm/test/tools/llvm-reduce/reduce-operands-skip-token.ll b/llvm/test/tools/llvm-reduce/reduce-operands-skip-token.ll new file mode 100644 index 0000000000000..1202974333b5b --- /dev/null +++ b/llvm/test/tools/llvm-reduce/reduce-operands-skip-token.ll @@ -0,0 +1,27 @@ +; Check token values are correctly handled by operands-skip + +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operands-skip --test FileCheck --test-arg --check-prefix=INTERESTING --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck -check-prefix=RESULT %s < %t + +; INTERESTING-LABEL: define void @tokens( +; INTERESTING: call void @llvm.token.consumer + +; RESULT-LABEL: define void @tokens(ptr %ptr) { +; RESULT-NEXT: %token = call token @llvm.token.producer() +; RESULT-NEXT:store i32 0, ptr %ptr, align 4 +; RESULT-NEXT:%chain = call token @llvm.token.chain(token %token) +; RESULT-NEXT:call void @llvm.token.consumer(token %token) +; RESULT-NEXT:store i32 1, ptr %ptr, align 4 +; RESULT-NEXT:ret void +define void @tokens(ptr %ptr) { + %token = call token @llvm.token.producer() + store i32 0, ptr %ptr + %chain = call token @llvm.token.chain(token %token) + call void @llvm.token.consumer(token %chain) ; -> rewrite to use %token directly + store i32 1, ptr %ptr + ret void +} + +declare token @llvm.token.producer() +declare token @llvm.token.chain(token) +declare void @llvm.token.consumer(token) From 4cb41d136cd4e2caef724e35b337f888036f8645 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 1 Apr 2025 16:24:54 +0100 Subject: [PATCH 0254/1029] [AArch64] Prefer zip over ushll for anyext. (#133433) Many CPUs have a higher throughput of ZIP instructions vs USHLL. This adds some tablegen patterns for preferring zip in anyext patterns. --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 17 +++++++++++++++++ llvm/test/CodeGen/AArch64/andorxor.ll | 12 ++++++------ .../CodeGen/AArch64/bitcast-promote-widen.ll | 3 +-- llvm/test/CodeGen/AArch64/bitcast.ll | 3 +-- llvm/test/CodeGen/AArch64/extbinopload.ll | 4 ++-- .../CodeGen/AArch64/extract-subvec-combine.ll | 19 +++++++++++++------ llvm/test/CodeGen/AArch64/neon-bitcast.ll | 5 ++--- .../sve-fixed-length-extract-subvector.ll | 4 +--- .../AArch64/vec3-loads-ext-trunc-stores.ll | 2 +- llvm/test/CodeGen/AArch64/zext.ll | 2 +- 10 files changed, 45 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 6c61e3a613f6f..f291589e04c6b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6751,6 +6751,23 @@ def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))), (UZP2v4i32 V128:$Vn, V128:$Vm)>; +// extract_subvec(anyext) can use zip. Check for one use on the anyext, otherwise +// the extract_subvector can be free. 
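The substitution is sound because an any-extend leaves the high half of every
widened lane unspecified, so interleaving a vector with itself is as good as
shifting in zeros. A one-lane scalar model (illustrative, not compiler code):

  #include <cstdint>

  // ushll widens a byte lane b to 0x00bb (zeros shifted into the top half);
  // zip1 of a vector with itself widens it to 0xbbbb. The low byte agrees,
  // and an any-extend does not care what the high byte holds.
  static uint16_t lane_ushll(uint8_t b) { return b; }
  static uint16_t lane_zip_self(uint8_t b) {
    return static_cast<uint16_t>(b) | (static_cast<uint16_t>(b) << 8);
  }

The patterns below are restricted to one-use any-extends so that a shared
extract_subvector, which may otherwise be free, is not pessimized.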
+let HasOneUse = 1 in +def anyext_oneuse: PatFrag<(ops node:$src0), (anyext $src0)>; +def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 0))), + (ZIP1v8i8 V64:$Vn, V64:$Vn)>; +def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 0))), + (ZIP1v4i16 V64:$Vn, V64:$Vn)>; +def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 0))), + (ZIP1v2i32 V64:$Vn, V64:$Vn)>; +def : Pat<(v4i16 (extract_subvector (v8i16 (anyext_oneuse (v8i8 V64:$Vn))), (i64 4))), + (ZIP2v8i8 V64:$Vn, V64:$Vn)>; +def : Pat<(v2i32 (extract_subvector (v4i32 (anyext_oneuse (v4i16 V64:$Vn))), (i64 2))), + (ZIP2v4i16 V64:$Vn, V64:$Vn)>; +def : Pat<(v1i64 (extract_subvector (v2i64 (anyext_oneuse (v2i32 V64:$Vn))), (i64 1))), + (ZIP2v2i32 V64:$Vn, V64:$Vn)>; + //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX instructions //---------------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index 24f2549cce785..0384848082caa 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -433,8 +433,8 @@ define void @and_v4i8(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x0] @@ -482,8 +482,8 @@ define void @or_v4i8(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x0] @@ -531,8 +531,8 @@ define void @xor_v4i8(ptr %p1, ptr %p2) { ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: ldr s0, [x0] ; CHECK-SD-NEXT: ldr s1, [x1] -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x0] diff --git a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll index 864ddc2967c18..90fa294505c84 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-promote-widen.ll @@ -6,8 +6,7 @@ define <2 x i16> @bitcast_v2i16_v2f16(<2 x half> %x) { ; CHECK-LABEL: bitcast_v2i16_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-NEXT: ret %y = bitcast <2 x half> %x to <2 x i16> ret <2 x i16> %y diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index d9199ce2c79de..d54cc4adb81b3 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -125,8 +125,7 @@ define <2 x i16> @bitcast_i32_v2i16(i32 %a, i32 %b){ ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: add w8, w0, w1 ; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; 
CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: bitcast_i32_v2i16: diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index 72f4d58a425e7..82114d60c4a93 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -649,7 +649,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: str s1, [x4] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: zip1 v1.8b, v1.8b, v1.8b ; CHECK-NEXT: ldr s0, [x2] ; CHECK-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-NEXT: umov w9, v2.h[0] @@ -659,7 +659,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: mov v0.b[9], w10 ; CHECK-NEXT: umov w10, v2.h[3] ; CHECK-NEXT: ldr s2, [x1] -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: zip1 v2.8b, v2.8b, v2.8b ; CHECK-NEXT: mov v0.b[10], w9 ; CHECK-NEXT: add x9, x1, #4 ; CHECK-NEXT: mov v1.d[1], v2.d[0] diff --git a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll index 75d55773b3681..368103bf2f2fe 100644 --- a/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll +++ b/llvm/test/CodeGen/AArch64/extract-subvec-combine.ll @@ -104,12 +104,19 @@ define <2 x i32> @sext_extract_zext_idx0(<4 x i16> %vec) nounwind { ; Negative test, combine should not fire if sign extension is for a different width. define <2 x i32> @sext_extract_zext_idx0_negtest(<4 x i16> %vec) nounwind { -; CHECK-LABEL: sext_extract_zext_idx0_negtest: -; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v0.2s, v0.2s, #17 -; CHECK-NEXT: sshr v0.2s, v0.2s, #17 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_extract_zext_idx0_negtest: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: shl v0.2s, v0.2s, #17 +; CHECK-SD-NEXT: sshr v0.2s, v0.2s, #17 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_extract_zext_idx0_negtest: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: shl v0.2s, v0.2s, #17 +; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #17 +; CHECK-GI-NEXT: ret %zext = zext <4 x i16> %vec to <4 x i32> %extract = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %zext, i64 0) %sext_inreg_step0 = shl <2 x i32> %extract, diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll index d06612e2332e6..07772b716ec58 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll @@ -518,15 +518,14 @@ define <2 x i16> @bitcast_i32_to_v2i16(i32 %word) { ; CHECK-LE-LABEL: bitcast_i32_to_v2i16: ; CHECK-LE: // %bb.0: ; CHECK-LE-NEXT: fmov s0, w0 -; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: bitcast_i32_to_v2i16: ; CHECK-BE: // %bb.0: ; CHECK-BE-NEXT: fmov s0, w0 ; CHECK-BE-NEXT: rev32 v0.4h, v0.4h -; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %ret = bitcast i32 %word to <2 x i16> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll index 8fac0e1067684..bda7ff9115e09 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -88,9 +88,7 @@ define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: zip2 v0.4h, v0.4h, v0.4h ; CHECK-NEXT: ret %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2) ret <2 x i16> %ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index b52cbfe08156b..45b7a2759b0b3 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -303,7 +303,7 @@ define <3 x i32> @load_v3i8_zext_to_3xi32(ptr %src) { ; BE-NEXT: add x8, x0, #2 ; BE-NEXT: ldr s0, [sp, #12] ; BE-NEXT: rev32 v0.8b, v0.8b -; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b ; BE-NEXT: ld1 { v0.b }[4], [x8] ; BE-NEXT: ushll v0.4s, v0.4h, #0 ; BE-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index e40b9cb5c8482..962486afa3bb8 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -447,7 +447,7 @@ define <3 x i64> @zext_v3i10_v3i64(<3 x i10> %a) { ; CHECK-SD-NEXT: mov w8, #1023 // =0x3ff ; CHECK-SD-NEXT: dup v2.2d, x8 ; CHECK-SD-NEXT: mov v0.s[1], w1 -; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-SD-NEXT: zip1 v3.2s, v1.2s, v1.2s ; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-SD-NEXT: and v2.8b, v3.8b, v2.8b From a8a33bab69ccdeca9b0e1e6730ad17230431d2ab Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 1 Apr 2025 08:35:06 -0700 Subject: [PATCH 0255/1029] [NFC][SPIRV] Misc code cleanup in SPIRV Target (#133764) - Use static instead of anonymous namespace for file local functions. - Enclose file-local classes in anonymous namespace. - Eliminate `llvm::` qualifier when file has `using namespace llvm`. - Eliminate namespace surrounding entire code in SPIRVConvergenceRegionAnalysis.cpp file. 
- Eliminate call to `initializeSPIRVStructurizerPass` from the pass
constructor (https://github.com/llvm/llvm-project/issues/111767)
---
 .../SPIRVConvergenceRegionAnalysis.cpp        | 30 ++++----
 .../Analysis/SPIRVConvergenceRegionAnalysis.h |  3 +-
 llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp       |  2 +-
 llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp    | 10 +--
 llvm/lib/Target/SPIRV/SPIRVCommandLine.h      |  2 +-
 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp |  5 +-
 .../Target/SPIRV/SPIRVEmitNonSemanticDI.cpp   |  8 +--
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 11 +--
 llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp   | 71 ++++++++-----------
 .../Target/SPIRV/SPIRVStructurizerWrapper.h   |  6 +-
 .../SPIRVConvergenceRegionAnalysisTests.cpp   |  1 +
 11 files changed, 63 insertions(+), 86 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
index abacd4b4ef857..48b327deaba84 100644
--- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp
@@ -1,4 +1,4 @@
-//===- ConvergenceRegionAnalysis.h -----------------------------*- C++ -*--===//
+//===----------------------------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -24,6 +24,7 @@
 #define DEBUG_TYPE "spirv-convergence-region-analysis"
 
 using namespace llvm;
+using namespace SPIRV;
 
 namespace llvm {
 void initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PassRegistry &);
@@ -39,8 +40,6 @@ INITIALIZE_PASS_END(SPIRVConvergenceRegionAnalysisWrapperPass,
                     "convergence-region", "SPIRV convergence regions analysis",
                     true, true)
 
-namespace llvm {
-namespace SPIRV {
 namespace {
 
 template <typename IntrinsicInstType, typename BasicBlockType>
@@ -74,12 +73,13 @@ getConvergenceTokenInternal(BasicBlockType *BB) {
   return std::nullopt;
 }
 
+} // anonymous namespace
+
 // Given a ConvergenceRegion tree with |Start| as its root, finds the smallest
 // region |Entry| belongs to. If |Entry| does not belong to the region defined
 // by |Start|, this function returns |nullptr|.
-ConvergenceRegion *findParentRegion(ConvergenceRegion *Start,
-                                    BasicBlock *Entry) {
+static ConvergenceRegion *findParentRegion(ConvergenceRegion *Start,
+                                           BasicBlock *Entry) {
   ConvergenceRegion *Candidate = nullptr;
   ConvergenceRegion *NextCandidate = Start;
@@ -102,13 +102,13 @@ ConvergenceRegion *findParentRegion(ConvergenceRegion *Start,
   return Candidate;
 }
 
-} // anonymous namespace
-
-std::optional<IntrinsicInst *> getConvergenceToken(BasicBlock *BB) {
+std::optional<IntrinsicInst *>
+llvm::SPIRV::getConvergenceToken(BasicBlock *BB) {
   return getConvergenceTokenInternal<IntrinsicInst, BasicBlock>(BB);
 }
 
-std::optional<const IntrinsicInst *> getConvergenceToken(const BasicBlock *BB) {
+std::optional<const IntrinsicInst *>
+llvm::SPIRV::getConvergenceToken(const BasicBlock *BB) {
   return getConvergenceTokenInternal<const IntrinsicInst, const BasicBlock>(BB);
 }
 
@@ -187,8 +187,8 @@ void ConvergenceRegion::dump(const unsigned IndentSize) const {
   dbgs() << Indent << "}\n";
 }
 
+namespace {
 class ConvergenceRegionAnalyzer {
-
 public:
   ConvergenceRegionAnalyzer(Function &F, DominatorTree &DT, LoopInfo &LI)
       : DT(DT), LI(LI), F(F) {}
@@ -305,15 +305,15 @@ class ConvergenceRegionAnalyzer {
   LoopInfo &LI;
   Function &F;
 };
+} // anonymous namespace
 
-ConvergenceRegionInfo getConvergenceRegions(Function &F, DominatorTree &DT,
-                                            LoopInfo &LI) {
+ConvergenceRegionInfo llvm::SPIRV::getConvergenceRegions(Function &F,
+                                                         DominatorTree &DT,
+                                                         LoopInfo &LI) {
   ConvergenceRegionAnalyzer Analyzer(F, DT, LI);
   return Analyzer.analyze();
 }
 
-} // namespace SPIRV
-
 char SPIRVConvergenceRegionAnalysisWrapperPass::ID = 0;
 
 SPIRVConvergenceRegionAnalysisWrapperPass::
@@ -339,5 +339,3 @@ SPIRVConvergenceRegionAnalysis::run(Function &F, FunctionAnalysisManager &AM) {
 }
 
 AnalysisKey SPIRVConvergenceRegionAnalysis::Key;
-
-} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h
index e435c88c919c9..78a066bef8abc 100644
--- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h
+++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.h
@@ -19,12 +19,11 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include <iostream>
 #include <optional>
 #include <unordered_set>
 
 namespace llvm {
+class IntrinsicInst;
 class SPIRVSubtarget;
 class MachineFunction;
 class MachineModuleInfo;
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 1ed92400fc577..334580fac73b4 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -3048,7 +3048,7 @@ static SPIRVType *getInlineSpirvType(const TargetExtType *ExtensionType,
   auto Opcode = ExtensionType->getIntParameter(0);
 
   SmallVector<MCOperand> Operands;
-  for (llvm::Type *Param : ExtensionType->type_params()) {
+  for (Type *Param : ExtensionType->type_params()) {
     if (const TargetExtType *ParamEType = dyn_cast<TargetExtType>(Param)) {
       if (ParamEType->getName() == "spirv.IntegralConstant") {
         assert(ParamEType->getNumTypeParameters() == 1 &&
- llvm::SmallVector Tokens; + SmallVector Tokens; ArgValue.split(Tokens, ",", -1, false); std::sort(Tokens.begin(), Tokens.end()); @@ -114,7 +114,7 @@ bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName, if (Token.empty() || (!Token.starts_with("+") && !Token.starts_with("-"))) return O.error("Invalid extension list format: " + Token.str()); - llvm::StringRef ExtensionName = Token.substr(1); + StringRef ExtensionName = Token.substr(1); auto NameValuePair = SPIRVExtensionMap.find(ExtensionName); if (NameValuePair == SPIRVExtensionMap.end()) @@ -137,7 +137,7 @@ bool SPIRVExtensionsParser::parse(cl::Option &O, llvm::StringRef ArgName, return false; } -llvm::StringRef SPIRVExtensionsParser::checkExtensions( +StringRef SPIRVExtensionsParser::checkExtensions( const std::vector &ExtNames, std::set &AllowedExtensions) { for (const auto &Ext : ExtNames) { diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h index 8df2968eb6fe1..3e3b22bde8603 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.h +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.h @@ -39,7 +39,7 @@ struct SPIRVExtensionsParser /// /// \return Returns a reference to the unknown SPIR-V extension name from the /// list if present, or an empty StringRef on success. - static llvm::StringRef + static StringRef checkExtensions(const std::vector &ExtNames, std::set &AllowedExtensions); }; diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 6afbac5f7c3cb..0a6a54b4a2f67 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -293,8 +293,7 @@ static void setInsertPointAfterDef(IRBuilder<> &B, Instruction *I) { } static bool requireAssignType(Instruction *I) { - IntrinsicInst *Intr = dyn_cast(I); - if (Intr) { + if (const auto *Intr = dyn_cast(I)) { switch (Intr->getIntrinsicID()) { case Intrinsic::invariant_start: case Intrinsic::invariant_end: @@ -725,7 +724,7 @@ Type *SPIRVEmitIntrinsics::deduceNestedTypeHelper( if (!Visited.insert(U).second) return OrigTy; - if (dyn_cast(OrigTy)) { + if (isa(OrigTy)) { SmallVector Tys; bool Change = false; for (unsigned i = 0; i < U->getNumOperands(); ++i) { diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp index 3c0d9cc3b91c0..7858f44a054d7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp @@ -27,7 +27,9 @@ #define DEBUG_TYPE "spirv-nonsemantic-debug-info" -namespace llvm { +using namespace llvm; + +namespace { struct SPIRVEmitNonSemanticDI : public MachineFunctionPass { static char ID; SPIRVTargetMachine *TM; @@ -40,9 +42,7 @@ struct SPIRVEmitNonSemanticDI : public MachineFunctionPass { bool IsGlobalDIEmitted = false; bool emitGlobalDI(MachineFunction &MF); }; -} // namespace llvm - -using namespace llvm; +} // anonymous namespace INITIALIZE_PASS(SPIRVEmitNonSemanticDI, DEBUG_TYPE, "SPIRV NonSemantic.Shader.DebugInfo.100 emitter", false, false) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index bb71da49316f3..2c167ac226dea 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -32,13 +32,10 @@ using namespace llvm; -namespace { - -bool allowEmitFakeUse(const Value *Arg) { +static bool allowEmitFakeUse(const Value *Arg) { if (isSpvIntrinsic(Arg)) return false; - if (dyn_cast(Arg) || 
-      dyn_cast<UndefValue>(Arg))
+  if (isa<AtomicCmpXchgInst, InsertValueInst, UndefValue>(Arg))
     return false;
   if (const auto *LI = dyn_cast<LoadInst>(Arg))
     if (LI->getType()->isAggregateType())
       return false;
   return true;
 }
 
-inline unsigned typeToAddressSpace(const Type *Ty) {
+static unsigned typeToAddressSpace(const Type *Ty) {
   if (auto PType = dyn_cast<TypedPointerType>(Ty))
     return PType->getAddressSpace();
   if (auto PType = dyn_cast<PointerType>(Ty))
@@ -57,8 +54,6 @@ inline unsigned typeToAddressSpace(const Type *Ty) {
   report_fatal_error("Unable to convert LLVM type to SPIRVType", true);
 }
 
-} // anonymous namespace
-
 SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize)
     : PointerSize(PointerSize), Bound(0) {}
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index 7ead4c82fb7e6..324b80fe2efc6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -38,27 +38,21 @@
 using namespace llvm;
 using namespace SPIRV;
 
-namespace llvm {
-
-void initializeSPIRVStructurizerPass(PassRegistry &);
-
-namespace {
-
 using BlockSet = std::unordered_set<BasicBlock *>;
 using Edge = std::pair<BasicBlock *, BasicBlock *>;
 
 // Helper function to do a partial order visit from the block |Start|, calling
 // |Op| on each visited node.
-void partialOrderVisit(BasicBlock &Start,
-                       std::function<bool(BasicBlock *)> Op) {
+static void partialOrderVisit(BasicBlock &Start,
+                              std::function<bool(BasicBlock *)> Op) {
   PartialOrderingVisitor V(*Start.getParent());
   V.partialOrderVisit(Start, Op);
 }
 
 // Returns the exact convergence region in the tree defined by `Node` for which
 // `BB` is the header, nullptr otherwise.
-const ConvergenceRegion *getRegionForHeader(const ConvergenceRegion *Node,
-                                            BasicBlock *BB) {
+static const ConvergenceRegion *
+getRegionForHeader(const ConvergenceRegion *Node, BasicBlock *BB) {
   if (Node->Entry == BB)
     return Node;
 
@@ -72,7 +66,7 @@ const ConvergenceRegion *getRegionForHeader(const ConvergenceRegion *Node,
 
 // Returns the single BasicBlock exiting the convergence region `CR`,
 // nullptr if no such exit exists.
-BasicBlock *getExitFor(const ConvergenceRegion *CR) {
+static BasicBlock *getExitFor(const ConvergenceRegion *CR) {
   std::unordered_set<BasicBlock *> ExitTargets;
   for (BasicBlock *Exit : CR->Exits) {
     for (BasicBlock *Successor : successors(Exit)) {
@@ -90,7 +84,7 @@ BasicBlock *getExitFor(const ConvergenceRegion *CR) {
 
 // Returns the merge block designated by I if I is a merge instruction, nullptr
 // otherwise.
-BasicBlock *getDesignatedMergeBlock(Instruction *I) {
+static BasicBlock *getDesignatedMergeBlock(Instruction *I) {
   IntrinsicInst *II = dyn_cast_or_null<IntrinsicInst>(I);
   if (II == nullptr)
     return nullptr;
@@ -105,7 +99,7 @@ BasicBlock *getDesignatedMergeBlock(Instruction *I) {
 
 // Returns the continue block designated by I if I is an OpLoopMerge, nullptr
 // otherwise.
-BasicBlock *getDesignatedContinueBlock(Instruction *I) {
+static BasicBlock *getDesignatedContinueBlock(Instruction *I) {
   IntrinsicInst *II = dyn_cast_or_null<IntrinsicInst>(I);
   if (II == nullptr)
     return nullptr;
@@ -119,7 +113,7 @@ BasicBlock *getDesignatedContinueBlock(Instruction *I) {
 
 // Returns true if Header has one merge instruction which designated Merge as
 // merge block.
-bool isDefinedAsSelectionMergeBy(BasicBlock &Header, BasicBlock &Merge) {
+static bool isDefinedAsSelectionMergeBy(BasicBlock &Header, BasicBlock &Merge) {
   for (auto &I : Header) {
     BasicBlock *MB = getDesignatedMergeBlock(&I);
     if (MB == &Merge)
@@ -129,7 +123,7 @@ bool isDefinedAsSelectionMergeBy(BasicBlock &Header, BasicBlock &Merge) {
 }
 
 // Returns true if the BB has one OpLoopMerge instruction.
-bool hasLoopMergeInstruction(BasicBlock &BB) {
+static bool hasLoopMergeInstruction(BasicBlock &BB) {
   for (auto &I : BB)
     if (getDesignatedContinueBlock(&I))
      return true;
@@ -138,13 +132,13 @@ bool hasLoopMergeInstruction(BasicBlock &BB) {
 
 // Returns true is I is an OpSelectionMerge or OpLoopMerge instruction, false
 // otherwise.
-bool isMergeInstruction(Instruction *I) {
+static bool isMergeInstruction(Instruction *I) {
   return getDesignatedMergeBlock(I) != nullptr;
 }
 
 // Returns all blocks in F having at least one OpLoopMerge or OpSelectionMerge
 // instruction.
-SmallPtrSet<BasicBlock *, 2> getHeaderBlocks(Function &F) {
+static SmallPtrSet<BasicBlock *, 2> getHeaderBlocks(Function &F) {
   SmallPtrSet<BasicBlock *, 2> Output;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
@@ -157,7 +151,7 @@ SmallPtrSet<BasicBlock *, 2> getHeaderBlocks(Function &F) {
 
 // Returns all basic blocks in |F| referenced by at least 1
 // OpSelectionMerge/OpLoopMerge instruction.
-SmallPtrSet<BasicBlock *, 2> getMergeBlocks(Function &F) {
+static SmallPtrSet<BasicBlock *, 2> getMergeBlocks(Function &F) {
   SmallPtrSet<BasicBlock *, 2> Output;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
@@ -172,7 +166,7 @@ SmallPtrSet<BasicBlock *, 2> getMergeBlocks(Function &F) {
 // Return all the merge instructions contained in BB.
 // Note: the SPIR-V spec doesn't allow a single BB to contain more than 1 merge
 // instruction, but this can happen while we structurize the CFG.
-std::vector<Instruction *> getMergeInstructions(BasicBlock &BB) {
+static std::vector<Instruction *> getMergeInstructions(BasicBlock &BB) {
   std::vector<Instruction *> Output;
   for (Instruction &I : BB)
     if (isMergeInstruction(&I))
@@ -182,7 +176,7 @@ std::vector<Instruction *> getMergeInstructions(BasicBlock &BB) {
 
 // Returns all basic blocks in |F| referenced as continue target by at least 1
 // OpLoopMerge instruction.
-SmallPtrSet<BasicBlock *, 2> getContinueBlocks(Function &F) {
+static SmallPtrSet<BasicBlock *, 2> getContinueBlocks(Function &F) {
   SmallPtrSet<BasicBlock *, 2> Output;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
@@ -196,7 +190,7 @@ SmallPtrSet<BasicBlock *, 2> getContinueBlocks(Function &F) {
 
 // Do a preorder traversal of the CFG starting from the BB |Start|.
 // point. Calls |op| on each basic block encountered during the traversal.
-void visit(BasicBlock &Start, std::function<void(BasicBlock *)> op) {
+static void visit(BasicBlock &Start, std::function<void(BasicBlock *)> op) {
   std::stack<BasicBlock *> ToVisit;
   SmallPtrSet<BasicBlock *, 8> Seen;
 
@@ -221,8 +215,8 @@ void visit(BasicBlock &Start, std::function<void(BasicBlock *)> op) {
 // Replaces the conditional and unconditional branch targets of |BB| by
 // |NewTarget| if the target was |OldTarget|. This function also makes sure the
 // associated merge instruction gets updated accordingly.
-void replaceIfBranchTargets(BasicBlock *BB, BasicBlock *OldTarget,
-                            BasicBlock *NewTarget) {
+static void replaceIfBranchTargets(BasicBlock *BB, BasicBlock *OldTarget,
+                                   BasicBlock *NewTarget) {
   auto *BI = cast<BranchInst>(BB->getTerminator());
 
   // 1. Replace all matching successors.
@@ -268,8 +262,8 @@ void replaceIfBranchTargets(BasicBlock *BB, BasicBlock *OldTarget,
 // was |OldTarget|. This function also fixes the associated merge instruction.
 // Note: this function does not simplify branching instructions, it only updates
 // targets. See also: simplifyBranches.
-void replaceBranchTargets(BasicBlock *BB, BasicBlock *OldTarget, - BasicBlock *NewTarget) { +static void replaceBranchTargets(BasicBlock *BB, BasicBlock *OldTarget, + BasicBlock *NewTarget) { auto *T = BB->getTerminator(); if (isa(T)) return; @@ -288,12 +282,10 @@ void replaceBranchTargets(BasicBlock *BB, BasicBlock *OldTarget, assert(false && "Unhandled terminator type."); } -} // anonymous namespace - +namespace { // Given a reducible CFG, produces a structurized CFG in the SPIR-V sense, // adding merge instructions when required. class SPIRVStructurizer : public FunctionPass { - struct DivergentConstruct; // Represents a list of condition/loops/switch constructs. // See SPIR-V 2.11.2. Structured Control-flow Constructs for the list of @@ -504,8 +496,7 @@ class SPIRVStructurizer : public FunctionPass { replaceBranchTargets(Src, Dst, NewExit); } - llvm::Value *Load = - ExitBuilder.CreateLoad(ExitBuilder.getInt32Ty(), Variable); + Value *Load = ExitBuilder.CreateLoad(ExitBuilder.getInt32Ty(), Variable); // If we can avoid an OpSwitch, generate an OpBranch. Reason is some // OpBranch are allowed to exist without a new OpSelectionMerge if one of @@ -595,9 +586,7 @@ class SPIRVStructurizer : public FunctionPass { // adding an unreachable merge block. if (Merge == nullptr) { BranchInst *Br = cast(BB.getTerminator()); - assert(Br && - "This assumes the branch is not a switch. Maybe that's wrong?"); - assert(cast(BB.getTerminator())->isUnconditional()); + assert(Br->isUnconditional()); Merge = CreateUnreachable(F); Builder.SetInsertPoint(Br); @@ -614,7 +603,7 @@ class SPIRVStructurizer : public FunctionPass { SmallVector LoopControlImms = getSpirvLoopControlOperandsFromLoopMetadata(L); for (unsigned Imm : LoopControlImms) - Args.emplace_back(llvm::ConstantInt::get(Builder.getInt32Ty(), Imm)); + Args.emplace_back(ConstantInt::get(Builder.getInt32Ty(), Imm)); Builder.CreateIntrinsic(Intrinsic::spv_loop_merge, {Args}); Modified = true; } @@ -1127,9 +1116,7 @@ class SPIRVStructurizer : public FunctionPass { public: static char ID; - SPIRVStructurizer() : FunctionPass(ID) { - initializeSPIRVStructurizerPass(*PassRegistry::getPassRegistry()); - }; + SPIRVStructurizer() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F) override { bool Modified = false; @@ -1211,23 +1198,21 @@ class SPIRVStructurizer : public FunctionPass { MDNode *MDNode = BBTerminatorInst->getMetadata("hlsl.controlflow.hint"); - ConstantInt *BranchHint = llvm::ConstantInt::get(Builder->getInt32Ty(), 0); + ConstantInt *BranchHint = ConstantInt::get(Builder->getInt32Ty(), 0); if (MDNode) { assert(MDNode->getNumOperands() == 2 && "invalid metadata hlsl.controlflow.hint"); BranchHint = mdconst::extract(MDNode->getOperand(1)); - - assert(BranchHint && "invalid metadata value for hlsl.controlflow.hint"); } - llvm::SmallVector Args = {MergeAddress, BranchHint}; + SmallVector Args = {MergeAddress, BranchHint}; Builder->CreateIntrinsic(Intrinsic::spv_selection_merge, - {MergeAddress->getType()}, {Args}); + {MergeAddress->getType()}, Args); } }; -} // namespace llvm +} // anonymous namespace char SPIRVStructurizer::ID = 0; diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizerWrapper.h b/llvm/lib/Target/SPIRV/SPIRVStructurizerWrapper.h index cc69e20847c19..a0df0405bfd23 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStructurizerWrapper.h +++ b/llvm/lib/Target/SPIRV/SPIRVStructurizerWrapper.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef 
LLVM_LIB_TARGET_DIRECTX_SPIRVSTRUCTURIZER_H -#define LLVM_LIB_TARGET_DIRECTX_SPIRVSTRUCTURIZER_H +#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVSTRUCTURIZER_H +#define LLVM_LIB_TARGET_SPIRV_SPIRVSTRUCTURIZER_H #include "llvm/IR/PassManager.h" @@ -26,4 +26,4 @@ class SPIRVStructurizerWrapper } // namespace llvm -#endif // LLVM_LIB_TARGET_DIRECTX_SPIRVSTRUCTURIZER_H +#endif // LLVM_LIB_TARGET_SPIRV_SPIRVSTRUCTURIZER_H diff --git a/llvm/unittests/Target/SPIRV/SPIRVConvergenceRegionAnalysisTests.cpp b/llvm/unittests/Target/SPIRV/SPIRVConvergenceRegionAnalysisTests.cpp index d45b4c0b9630d..b3d5f0a32fc35 100644 --- a/llvm/unittests/Target/SPIRV/SPIRVConvergenceRegionAnalysisTests.cpp +++ b/llvm/unittests/Target/SPIRV/SPIRVConvergenceRegionAnalysisTests.cpp @@ -11,6 +11,7 @@ #include "llvm/Analysis/PostDominators.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" From 65ad6267e82f6d8532e4b9de4716bb8fdbdd9ac0 Mon Sep 17 00:00:00 2001 From: lntue Date: Tue, 1 Apr 2025 11:37:57 -0400 Subject: [PATCH 0256/1029] [libc] Fix atan2f128 test for aarch64. (#133924) --- libc/test/src/math/atan2f128_test.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/libc/test/src/math/atan2f128_test.cpp b/libc/test/src/math/atan2f128_test.cpp index c03d8703d7c7b..0bfec1bd276ed 100644 --- a/libc/test/src/math/atan2f128_test.cpp +++ b/libc/test/src/math/atan2f128_test.cpp @@ -18,13 +18,15 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcAtan2f128Test, InQuadRange) { constexpr StorageType X_COUNT = 123; - constexpr StorageType X_START = FPBits(0.25q).uintval(); - constexpr StorageType X_STOP = FPBits(4.0q).uintval(); + constexpr StorageType X_START = + FPBits(static_cast(0.25q)).uintval(); + constexpr StorageType X_STOP = FPBits(static_cast(4.0q)).uintval(); constexpr StorageType X_STEP = (X_STOP - X_START) / X_COUNT; constexpr StorageType Y_COUNT = 137; - constexpr StorageType Y_START = FPBits(0.25q).uintval(); - constexpr StorageType Y_STOP = FPBits(4.0q).uintval(); + constexpr StorageType Y_START = + FPBits(static_cast(0.25q)).uintval(); + constexpr StorageType Y_STOP = FPBits(static_cast(4.0q)).uintval(); constexpr StorageType Y_STEP = (Y_STOP - Y_START) / Y_COUNT; auto test = [&](mpfr::RoundingMode rounding_mode) { From 58551faaf130de52f574b15d053eca1afc83b82b Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 1 Apr 2025 08:41:11 -0700 Subject: [PATCH 0257/1029] [flang] Inline fir.is_contiguous_box in some cases. (#133812) Added inlining for `rank == 1` and `innermost` cases. --- .../Transforms/SimplifyFIROperations.cpp | 59 +++++-- .../Transforms/simplify-fir-operations.fir | 160 ++++++++++-------- 2 files changed, 132 insertions(+), 87 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp index b6baae501f87e..212de2f2286db 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp @@ -75,27 +75,50 @@ mlir::LogicalResult IsContiguousBoxCoversion::matchAndRewrite( fir::IsContiguousBoxOp op, mlir::PatternRewriter &rewriter) const { mlir::Location loc = op.getLoc(); fir::FirOpBuilder builder(rewriter, op.getOperation()); - // TODO: support preferInlineImplementation. 
- bool doInline = options.preferInlineImplementation && false; - if (!doInline) { - // Generate Fortran runtime call. - mlir::Value result; - if (op.getInnermost()) { - mlir::Value one = - builder.createIntegerConstant(loc, builder.getI32Type(), 1); - result = - fir::runtime::genIsContiguousUpTo(builder, loc, op.getBox(), one); - } else { - result = fir::runtime::genIsContiguous(builder, loc, op.getBox()); + mlir::Value box = op.getBox(); + + if (options.preferInlineImplementation) { + auto boxType = mlir::cast(box.getType()); + unsigned rank = fir::getBoxRank(boxType); + + // If rank is one, or 'innermost' attribute is set and + // it is not a scalar, then generate a simple comparison + // for the leading dimension: (stride == elem_size || extent == 0). + // + // The scalar cases are supposed to be optimized by the canonicalization. + if (rank == 1 || (op.getInnermost() && rank > 0)) { + mlir::Type idxTy = builder.getIndexType(); + auto eleSize = builder.create(loc, idxTy, box); + mlir::Value zero = fir::factory::createZeroValue(builder, loc, idxTy); + auto dimInfo = + builder.create(loc, idxTy, idxTy, idxTy, box, zero); + mlir::Value stride = dimInfo.getByteStride(); + mlir::Value pred1 = builder.create( + loc, mlir::arith::CmpIPredicate::eq, eleSize, stride); + mlir::Value extent = dimInfo.getExtent(); + mlir::Value pred2 = builder.create( + loc, mlir::arith::CmpIPredicate::eq, extent, zero); + mlir::Value result = + builder.create(loc, pred1, pred2); + result = builder.createConvert(loc, op.getType(), result); + rewriter.replaceOp(op, result); + return mlir::success(); } - result = builder.createConvert(loc, op.getType(), result); - rewriter.replaceOp(op, result); - return mlir::success(); + // TODO: support arrays with multiple dimensions. } - // Generate inline implementation. - TODO(loc, "inline IsContiguousBoxOp"); - return mlir::failure(); + // Generate Fortran runtime call. 
+ mlir::Value result; + if (op.getInnermost()) { + mlir::Value one = + builder.createIntegerConstant(loc, builder.getI32Type(), 1); + result = fir::runtime::genIsContiguousUpTo(builder, loc, box, one); + } else { + result = fir::runtime::genIsContiguous(builder, loc, box); + } + result = builder.createConvert(loc, op.getType(), result); + rewriter.replaceOp(op, result); + return mlir::success(); } /// Generate a call to Size runtime function or an inline diff --git a/flang/test/Transforms/simplify-fir-operations.fir b/flang/test/Transforms/simplify-fir-operations.fir index f712efde846ad..e0a71dd0ae5a7 100644 --- a/flang/test/Transforms/simplify-fir-operations.fir +++ b/flang/test/Transforms/simplify-fir-operations.fir @@ -1,17 +1,15 @@ -// RUN: fir-opt --split-input-file --simplify-fir-operations %s | FileCheck %s - -// ----- +// RUN: fir-opt --split-input-file --simplify-fir-operations %s | FileCheck --check-prefixes=ALL,NOOPT %s +// RUN: fir-opt --split-input-file --simplify-fir-operations=prefer-inline-implementation=true %s | FileCheck --check-prefixes=ALL,OPT %s func.func @test_none_innermost(%arg0: !fir.box) -> i1 { %0 = fir.is_contiguous_box %arg0 innermost : (!fir.box) -> i1 return %0 : i1 } -// CHECK-LABEL: func.func @test_none_innermost( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box) -> i1 { -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguousUpTo(%[[VAL_0]], %[[VAL_1]]) : (!fir.box, i32) -> i1 -// CHECK: return %[[VAL_2]] : i1 -// CHECK: } +// ALL-LABEL: func.func @test_none_innermost( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box) -> i1 { +// ALL: %[[VAL_1:.*]] = arith.constant 1 : i32 +// ALL: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguousUpTo(%[[VAL_0]], %[[VAL_1]]) : (!fir.box, i32) -> i1 +// ALL: return %[[VAL_2]] : i1 // ----- @@ -19,11 +17,11 @@ func.func @test_none_whole(%arg0: !fir.box) -> i1 { %0 = fir.is_contiguous_box %arg0 whole : (!fir.box) -> i1 return %0 : i1 } -// CHECK-LABEL: func.func @test_none_whole( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box) -> i1 { -// CHECK: %[[VAL_1:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_0]]) : (!fir.box) -> i1 -// CHECK: return %[[VAL_1]] : i1 -// CHECK: } +// ALL-LABEL: func.func @test_none_whole( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box) -> i1 { +// ALL: %[[VAL_1:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_0]]) : (!fir.box) -> i1 +// ALL: return %[[VAL_1]] : i1 +// ALL: } // ----- @@ -31,13 +29,19 @@ func.func @test_array_innermost(%arg0: !fir.box>) -> i1 { %0 = fir.is_contiguous_box %arg0 innermost : (!fir.box>) -> i1 return %0 : i1 } -// CHECK-LABEL: func.func @test_array_innermost( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_3:.*]] = fir.call @_FortranAIsContiguousUpTo(%[[VAL_2]], %[[VAL_1]]) : (!fir.box, i32) -> i1 -// CHECK: return %[[VAL_3]] : i1 -// CHECK: } +// ALL-LABEL: func.func @test_array_innermost( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { +// NOOPT: %[[VAL_1:.*]] = arith.constant 1 : i32 +// NOOPT: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// NOOPT: %[[VAL_3:.*]] = fir.call @_FortranAIsContiguousUpTo(%[[VAL_2]], %[[VAL_1]]) : (!fir.box, i32) -> i1 +// NOOPT: return %[[VAL_3]] : i1 +// OPT: %[[VAL_1:.*]] 
= arith.constant 0 : index +// OPT: %[[VAL_2:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>) -> index +// OPT: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_1]] : (!fir.box>, index) -> (index, index, index) +// OPT: %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]]#2 : index +// OPT: %[[VAL_5:.*]] = arith.cmpi eq, %[[VAL_3]]#1, %[[VAL_1]] : index +// OPT: %[[VAL_6:.*]] = arith.ori %[[VAL_4]], %[[VAL_5]] : i1 +// OPT: return %[[VAL_6]] : i1 // ----- @@ -45,12 +49,18 @@ func.func @test_array_whole(%arg0: !fir.box>) -> i1 { %0 = fir.is_contiguous_box %arg0 whole : (!fir.box>) -> i1 return %0 : i1 } -// CHECK-LABEL: func.func @test_array_whole( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { -// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_1]]) : (!fir.box) -> i1 -// CHECK: return %[[VAL_2]] : i1 -// CHECK: } +// ALL-LABEL: func.func @test_array_whole( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { +// NOOPT: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// NOOPT: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_1]]) : (!fir.box) -> i1 +// NOOPT: return %[[VAL_2]] : i1 +// OPT: %[[VAL_1:.*]] = arith.constant 0 : index +// OPT: %[[VAL_2:.*]] = fir.box_elesize %[[VAL_0]] : (!fir.box>) -> index +// OPT: %[[VAL_3:.*]]:3 = fir.box_dims %[[VAL_0]], %[[VAL_1]] : (!fir.box>, index) -> (index, index, index) +// OPT: %[[VAL_4:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]]#2 : index +// OPT: %[[VAL_5:.*]] = arith.cmpi eq, %[[VAL_3]]#1, %[[VAL_1]] : index +// OPT: %[[VAL_6:.*]] = arith.ori %[[VAL_4]], %[[VAL_5]] : i1 +// OPT: return %[[VAL_6]] : i1 // ----- @@ -58,13 +68,12 @@ func.func @test_assumed_rank_innermost(%arg0: !fir.box>) -> i1 %0 = fir.is_contiguous_box %arg0 innermost : (!fir.box>) -> i1 return %0 : i1 } -// CHECK-LABEL: func.func @test_assumed_rank_innermost( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { -// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 -// CHECK: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_3:.*]] = fir.call @_FortranAIsContiguousUpTo(%[[VAL_2]], %[[VAL_1]]) : (!fir.box, i32) -> i1 -// CHECK: return %[[VAL_3]] : i1 -// CHECK: } +// ALL-LABEL: func.func @test_assumed_rank_innermost( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { +// ALL: %[[VAL_1:.*]] = arith.constant 1 : i32 +// ALL: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// ALL: %[[VAL_3:.*]] = fir.call @_FortranAIsContiguousUpTo(%[[VAL_2]], %[[VAL_1]]) : (!fir.box, i32) -> i1 +// ALL: return %[[VAL_3]] : i1 // ----- @@ -72,12 +81,25 @@ func.func @test_assumed_rank_whole(%arg0: !fir.box>) -> i1 { %0 = fir.is_contiguous_box %arg0 whole : (!fir.box>) -> i1 return %0 : i1 } -// CHECK-LABEL: func.func @test_assumed_rank_whole( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { -// CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_1]]) : (!fir.box) -> i1 -// CHECK: return %[[VAL_2]] : i1 -// CHECK: } +// ALL-LABEL: func.func @test_assumed_rank_whole( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i1 { +// ALL: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// ALL: %[[VAL_2:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_1]]) : (!fir.box) -> 
i1 +// ALL: return %[[VAL_2]] : i1 +// ALL: } + +// ----- + +func.func @test_scalar_upoly(%arg0: !fir.class) -> i1 { + %0 = fir.is_contiguous_box %arg0 innermost : (!fir.class) -> i1 + return %0 : i1 +} +// ALL-LABEL: func.func @test_scalar_upoly( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.class) -> i1 { +// ALL: %[[VAL_1:.*]] = arith.constant 1 : i32 +// ALL: %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.class) -> !fir.box +// ALL: %[[VAL_3:.*]] = fir.call @_FortranAIsContiguousUpTo(%[[VAL_2]], %[[VAL_1]]) : (!fir.box, i32) -> i1 +// ALL: return %[[VAL_3]] : i1 // ----- @@ -85,15 +107,15 @@ func.func @test_none(%arg0: !fir.box) -> i16 { %0 = fir.box_total_elements %arg0 : (!fir.box) -> i16 return %0 : i16 } -// CHECK-LABEL: func.func @test_none( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box) -> i16 { -// CHECK: %[[VAL_3:.*]] = arith.constant {{.*}} : i32 -// CHECK: %[[VAL_1:.*]] = fir.address_of(@{{.*}}) : !fir.ref> -// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_5:.*]] = fir.call @_FortranASize(%[[VAL_0]], %[[VAL_4]], %[[VAL_3]]) : (!fir.box, !fir.ref, i32) -> i64 -// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i64) -> i16 -// CHECK: return %[[VAL_6]] : i16 -// CHECK: } +// ALL-LABEL: func.func @test_none( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box) -> i16 { +// ALL: %[[VAL_3:.*]] = arith.constant {{.*}} : i32 +// ALL: %[[VAL_1:.*]] = fir.address_of(@{{.*}}) : !fir.ref> +// ALL: %[[VAL_4:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref +// ALL: %[[VAL_5:.*]] = fir.call @_FortranASize(%[[VAL_0]], %[[VAL_4]], %[[VAL_3]]) : (!fir.box, !fir.ref, i32) -> i64 +// ALL: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i64) -> i16 +// ALL: return %[[VAL_6]] : i16 +// ALL: } // ----- @@ -101,16 +123,16 @@ func.func @test_array(%arg0: !fir.box>) -> i32 { %0 = fir.box_total_elements %arg0 : (!fir.box>) -> i32 return %0 : i32 } -// CHECK-LABEL: func.func @test_array( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i32 { -// CHECK: %[[VAL_3:.*]] = arith.constant {{.*}} : i32 -// CHECK: %[[VAL_1:.*]] = fir.address_of({{.*}}) : !fir.ref> -// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_6:.*]] = fir.call @_FortranASize(%[[VAL_4]], %[[VAL_5]], %[[VAL_3]]) : (!fir.box, !fir.ref, i32) -> i64 -// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> i32 -// CHECK: return %[[VAL_7]] : i32 -// CHECK: } +// ALL-LABEL: func.func @test_array( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> i32 { +// ALL: %[[VAL_3:.*]] = arith.constant {{.*}} : i32 +// ALL: %[[VAL_1:.*]] = fir.address_of({{.*}}) : !fir.ref> +// ALL: %[[VAL_4:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// ALL: %[[VAL_5:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref +// ALL: %[[VAL_6:.*]] = fir.call @_FortranASize(%[[VAL_4]], %[[VAL_5]], %[[VAL_3]]) : (!fir.box, !fir.ref, i32) -> i64 +// ALL: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> i32 +// ALL: return %[[VAL_7]] : i32 +// ALL: } // ----- @@ -118,13 +140,13 @@ func.func @test_assumed_rank(%arg0: !fir.box>) -> index { %0 = fir.box_total_elements %arg0 : (!fir.box>) -> index return %0 : index } -// CHECK-LABEL: func.func @test_assumed_rank( -// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> index { -// CHECK: %[[VAL_3:.*]] = 
arith.constant {{.*}} : i32 -// CHECK: %[[VAL_1:.*]] = fir.address_of({{.*}}) : !fir.ref> -// CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box -// CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_6:.*]] = fir.call @_FortranASize(%[[VAL_4]], %[[VAL_5]], %[[VAL_3]]) : (!fir.box, !fir.ref, i32) -> i64 -// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> index -// CHECK: return %[[VAL_7]] : index -// CHECK: } +// ALL-LABEL: func.func @test_assumed_rank( +// ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box>) -> index { +// ALL: %[[VAL_3:.*]] = arith.constant {{.*}} : i32 +// ALL: %[[VAL_1:.*]] = fir.address_of({{.*}}) : !fir.ref> +// ALL: %[[VAL_4:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box +// ALL: %[[VAL_5:.*]] = fir.convert %[[VAL_1]] : (!fir.ref>) -> !fir.ref +// ALL: %[[VAL_6:.*]] = fir.call @_FortranASize(%[[VAL_4]], %[[VAL_5]], %[[VAL_3]]) : (!fir.box, !fir.ref, i32) -> i64 +// ALL: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> index +// ALL: return %[[VAL_7]] : index +// ALL: } From 66b540d861ecc5fef0fc398c9c3590c3a7dc6ff9 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 1 Apr 2025 11:45:56 -0400 Subject: [PATCH 0258/1029] [C11] Claim conformance to WG14 N1518 (#133749) This paper introduced ranges of valid start and continuation characters for identifiers. C23 made further changes to these sets. --- clang/test/C/C11/n1518.c | 24 ++++++++++++++++++++++++ clang/www/c_status.html | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 clang/test/C/C11/n1518.c diff --git a/clang/test/C/C11/n1518.c b/clang/test/C/C11/n1518.c new file mode 100644 index 0000000000000..e950d22b20cf9 --- /dev/null +++ b/clang/test/C/C11/n1518.c @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -verify=c11,both -std=c11 %s +// RUN: %clang_cc1 -verify=c23,both -std=c23 %s + +/* WG14 N1518: Clang 15 + * Recommendations for extended identifier characters for C and C++ + * + * This paper effectively adopts UAX #31, which was later officially adopted + * for C23 via WG14 N2836 and supersedes N1518. + */ + +// This file takes test cases from clang/test/C/C23/n2836_n2939.c. +// This file contains Unicode characters; please do not "fix" them! + +// This was fine in C11, is now an error in C23. +extern int ٢; // c23-error {{character not allowed at the start of an identifier}} \ + c23-warning {{declaration does not declare anything}} + +// This was an error in C11 but is an extension in C23. +extern int ∞; // c11-error {{unexpected character }} \ + c11-warning {{declaration does not declare anything}} \ + c23-warning {{mathematical notation character in an identifier is a Clang extension}} + +int \u1DC0; // both-error {{expected identifier or '('}} +int e\u1DC0; // Ok diff --git a/clang/www/c_status.html b/clang/www/c_status.html index f4f00ac6dd808..8434ed9ecf6f8 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -1105,7 +1105,7 @@
C11 implementation status
 Recommendations for extended identifier characters for C and C++
 N1518
- Unknown
+ Clang 15

 Atomic C1x/C++0x compatibility refinements (1st part only)

From a03fce4e200d47b9133897b8f0d4688b30b42689 Mon Sep 17 00:00:00 2001
From: Doeke Wartena
Date: Tue, 1 Apr 2025 17:49:36 +0200
Subject: [PATCH 0259/1029] Update README.md - fixed invalid json in example
 (#133890)

A comma (`,`) is required or you get an error.
---
 lldb/tools/lldb-dap/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md
index 725e9cf63e02a..2f6a51a591914 100644
--- a/lldb/tools/lldb-dap/README.md
+++ b/lldb/tools/lldb-dap/README.md
@@ -37,7 +37,7 @@ adds `FOO=1` and `bar` to the environment:
   "program": "/tmp/a.out",
   "args": [ "one", "two", "three" ],
   "env": {
-    "FOO": "1"
+    "FOO": "1",
     "BAR": ""
   }
 }

From 179062b2dc9405a81cf44dbe676817806a4e7c6a Mon Sep 17 00:00:00 2001
From: Jeremy Kun
Date: Tue, 1 Apr 2025 08:58:32 -0700
Subject: [PATCH 0260/1029] [mlir][bazel] add alwayslink=True to mlir-runner
 utils (#133787)

macOS platforms using mlir-runner in lit tests consistently hit the following
error:

```
# .---command stderr------------
# | JIT session error: Symbols not found: [ __mlir_ciface_printMemrefI32 ]
# | Error: Failed to materialize symbols: { (main, { __mlir_printMemrefI32, ... }) }
# `-----------------------------
```

https://github.com/google/heir/issues/1521#issuecomment-2751303404 confirms
the issue is fixed by using `alwayslink` on these two targets, and I confirmed
this on a separate Apple M1 (macOS Sequoia 15.3.2).

I'm not an expert on the mlir-runner internals, but given that mlir-runner is
purely for testing, and alwayslink at worst adds some overhead by not removing
symbols, this seems low risk.
---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 40073eb59fd7a..0c89b7bf18e0f 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -10330,6 +10330,7 @@ cc_library(
         ":mlir_float16_utils",
         "//llvm:Support",
     ],
+    alwayslink = True,
 )

 # Indirection to avoid 'libmlir_c_runner_utils.so' filename clash.
@@ -10359,6 +10360,7 @@ cc_library(
         ":mlir_c_runner_utils",
         ":mlir_float16_utils",
     ],
+    alwayslink = True,
 )

 # Indirection to avoid 'libmlir_runner_utils.so' filename clash.

From e8711436b3419cc9e0e8a70c6eb41dbb2a1bf132 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Tue, 1 Apr 2025 09:02:33 -0700
Subject: [PATCH 0261/1029] [SCEV] Remove EqCacheSCEV (#133186)

This was added in https://reviews.llvm.org/D26389 to help with extremely deep
SCEV expressions. However, this is wrong: we may cache two sub-SCEVs as
equivalent even though CompareValueComplexity only returned 0 because it hit
the maximum comparison depth.

This also improves compile time in some builds:
https://llvm-compile-time-tracker.com/compare.php?from=34fa037c4fd7f38faada5beedc63ad234e904247&to=e241ecf999f4dd42d4b951d4a5d4f8eabeafcff0&stat=instructions:u

Similar to #100721.

Fixes #130688.
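To make the bug concrete, here is a minimal standalone C++ sketch (not the
SCEV code itself; `compareDeep`, `MaxDepth`, and the string inputs are
invented for this illustration) of a depth-limited three-way comparison. Such
a comparator has three possible answers: ordered, provably equal, and "gave up
at the depth limit"; only the first two are facts, so only a true 0 could
safely be cached as equivalence.

```
// Minimal sketch, assuming invented names: a recursive three-way comparison
// with a depth cutoff, mirroring how CompareSCEVComplexity can give up.
#include <cassert>
#include <optional>
#include <string>

constexpr unsigned MaxDepth = 4;

// Returns <0 / 0 / >0 like an ordinary comparator, or std::nullopt once the
// recursion depth is exhausted, meaning "result unknown".
std::optional<int> compareDeep(const std::string &L, const std::string &R,
                               unsigned Depth = 0) {
  if (Depth > MaxDepth)
    return std::nullopt; // gave up: not the same thing as "equal"
  if (L.size() != R.size())
    return static_cast<int>(L.size()) - static_cast<int>(R.size());
  if (L.empty())
    return 0; // provably equal: the only result safe to cache as equivalence
  if (L.front() != R.front())
    return L.front() - R.front();
  return compareDeep(L.substr(1), R.substr(1), Depth + 1);
}

int main() {
  // Equal-length inputs that differ only past the depth cutoff.
  std::string A = "aaaaaX";
  std::string B = "aaaaaY";
  // The recursion bails out, so the result is unknown, not "equivalent".
  assert(!compareDeep(A, B).has_value());
  // Recording A == B in an equivalence cache at this point (as EqCacheSCEV
  // effectively did when MaxSCEVCompareDepth was hit) would let later,
  // shallower comparisons treat two distinct values as interchangeable,
  // breaking the strict weak ordering that sorting requires.
  return 0;
}
```

Keeping the bail-out result local to the one comparison that hit the limit,
instead of replaying it as a proof of equivalence, is exactly what removing
the cache accomplishes in the diff below.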
--- llvm/lib/Analysis/ScalarEvolution.cpp | 16 ++-------- .../Analysis/ScalarEvolutionTest.cpp | 31 +++++++++++++++++++ 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 14f9a1bec8939..0e234795837d6 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -653,8 +653,7 @@ static int CompareValueComplexity(const LoopInfo *const LI, Value *LV, // If the max analysis depth was reached, return std::nullopt, assuming we do // not know if they are equivalent for sure. static std::optional -CompareSCEVComplexity(EquivalenceClasses &EqCacheSCEV, - const LoopInfo *const LI, const SCEV *LHS, +CompareSCEVComplexity(const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS, DominatorTree &DT, unsigned Depth = 0) { // Fast-path: SCEVs are uniqued so we can do a quick equality check. if (LHS == RHS) @@ -665,9 +664,6 @@ CompareSCEVComplexity(EquivalenceClasses &EqCacheSCEV, if (LType != RType) return (int)LType - (int)RType; - if (EqCacheSCEV.isEquivalent(LHS, RHS)) - return 0; - if (Depth > MaxSCEVCompareDepth) return std::nullopt; @@ -681,8 +677,6 @@ CompareSCEVComplexity(EquivalenceClasses &EqCacheSCEV, int X = CompareValueComplexity(LI, LU->getValue(), RU->getValue(), Depth + 1); - if (X == 0) - EqCacheSCEV.unionSets(LHS, RHS); return X; } @@ -747,12 +741,10 @@ CompareSCEVComplexity(EquivalenceClasses &EqCacheSCEV, return (int)LNumOps - (int)RNumOps; for (unsigned i = 0; i != LNumOps; ++i) { - auto X = CompareSCEVComplexity(EqCacheSCEV, LI, LOps[i], ROps[i], DT, - Depth + 1); + auto X = CompareSCEVComplexity(LI, LOps[i], ROps[i], DT, Depth + 1); if (X != 0) return X; } - EqCacheSCEV.unionSets(LHS, RHS); return 0; } @@ -775,11 +767,9 @@ static void GroupByComplexity(SmallVectorImpl &Ops, LoopInfo *LI, DominatorTree &DT) { if (Ops.size() < 2) return; // Noop - EquivalenceClasses EqCacheSCEV; - // Whether LHS has provably less complexity than RHS. auto IsLessComplex = [&](const SCEV *LHS, const SCEV *RHS) { - auto Complexity = CompareSCEVComplexity(EqCacheSCEV, LI, LHS, RHS, DT); + auto Complexity = CompareSCEVComplexity(LI, LHS, RHS, DT); return Complexity && *Complexity < 0; }; if (Ops.size() == 2) { diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp index c72cecbba3cb8..95a4affdd7789 100644 --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -1706,4 +1706,35 @@ TEST_F(ScalarEvolutionsTest, ComplexityComparatorIsStrictWeakOrdering) { }); } +TEST_F(ScalarEvolutionsTest, ComplexityComparatorIsStrictWeakOrdering2) { + // Regression test for a case where caching of equivalent values caused the + // comparator to get inconsistent. 
+ + Type *Int64Ty = Type::getInt64Ty(Context); + Type *PtrTy = PointerType::get(Context, 0); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(Context), + {PtrTy, PtrTy, PtrTy, Int64Ty}, false); + Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M); + BasicBlock *BB = BasicBlock::Create(Context, "entry", F); + ReturnInst::Create(Context, nullptr, BB); + + ScalarEvolution SE = buildSE(*F); + + const SCEV *S0 = SE.getSCEV(F->getArg(0)); + const SCEV *S1 = SE.getSCEV(F->getArg(1)); + const SCEV *S2 = SE.getSCEV(F->getArg(2)); + + const SCEV *P0 = SE.getPtrToIntExpr(S0, Int64Ty); + const SCEV *P1 = SE.getPtrToIntExpr(S1, Int64Ty); + const SCEV *P2 = SE.getPtrToIntExpr(S2, Int64Ty); + + const SCEV *M0 = SE.getNegativeSCEV(P0); + const SCEV *M2 = SE.getNegativeSCEV(P2); + + SmallVector Ops = {M2, P0, M0, P1, P2}; + // When _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG, this will + // crash if the comparator has the specific caching bug. + SE.getAddExpr(Ops); +} + } // end namespace llvm From 558ce50ebc31bbcd5ec5bfad0c0126adfde8bbb0 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Tue, 1 Apr 2025 17:04:59 +0100 Subject: [PATCH 0262/1029] [Clang][LLVM] Implement multi-single vectors MOP4{A/S} (#129226) Implement all multi-single {BF/F/S/U/SU/US}MOP4{A/S} instructions in clang and llvm following the ACLE in https://github.com/ARM-software/acle/pull/381/files --- clang/include/clang/Basic/arm_sme.td | 9 + .../sme2-intrinsics/acle_sme2_mop4_2x1.c | 304 ++++++++++++++ .../acle_sme2p2_imm.cpp | 84 ++++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 4 +- llvm/lib/Target/AArch64/SMEInstrFormats.td | 75 +++- .../AArch64/sme2-intrinsics-mop4a_2x1.ll | 393 ++++++++++++++++++ 6 files changed, 858 insertions(+), 11 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 6312223f5d112..3958ed70f6ad0 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -295,6 +295,7 @@ defm SVMOPS : ZAFPOuterProd<"mops">; multiclass MOP4 checks> { def _1x1 : Inst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{d}]", "vidd", t, MergeNone, i # "_1x1", [IsInOutZA, IsStreaming], checks>; def _1x2 : Inst<"svmop4" # mode # "[_1x2]" # za # "[_{d}_{d}]", "vid2", t, MergeNone, i # "_1x2", [IsInOutZA, IsStreaming], checks>; + def _2x1 : Inst<"svmop4" # mode # "[_2x1]" # za # "[_{d}_{d}]", "vi2d", t, MergeNone, i # "_2x1", [IsInOutZA, IsStreaming], checks>; } let SMETargetGuard = "sme2,sme-mop4" in { @@ -350,6 +351,10 @@ multiclass SUMOP4 che "vid2.u", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_1x2", [IsStreaming, IsInOutZA], checks>; + def _2x1 : SInst<"svmop4" # mode # "[_2x1]" # za # "[_{d}_{3}]", + "vi2u", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_2x1", + [IsStreaming, IsInOutZA], + checks>; } multiclass USMOP4 checks> { @@ -361,6 +366,10 @@ multiclass USMOP4 che "vid2.x", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_1x2", [IsStreaming, IsInOutZA], checks>; + def _2x1 : SInst<"svmop4" # mode # "[_2x1]" # za # "[_{d}_{3}]", + "vi2x", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_2x1", + [IsStreaming, IsInOutZA], + checks>; } let SMETargetGuard = "sme2,sme-mop4" in { diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c new file 
mode 100644 index 0000000000000..e42ed95b9b52c --- /dev/null +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c @@ -0,0 +1,304 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SME_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3, A4_UNUSED) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmop4a_2x1_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_s8_s8(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_s8_s8(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_u8_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_u8_u8(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_s8_u8(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.sumop4s.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_s8_u8(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_u8_s8(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.2x1.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_u8_s8(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
tail call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_s16_s16(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_u16_u16(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_s16_u16(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_s16_u16(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_u16_s16(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_u16_s16)(1, zn, zm); +} + 
+// CHECK-LABEL: @test_svmop4s_2x1_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.2x1.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_u16_s16(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_u16_s16)(1, zn, zm); +} + + +// CHECK-LABEL: @test_svmop4a_2x1_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za16_f16_f16(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za32_f32_f32(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za32_f32_f32(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv2f64(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za64_f64_f64(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv2f64(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za64_f64_f64(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x1,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x1_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x1.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4a_2x1_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x1,_za16,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x1_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x1.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +void test_svmop4s_2x1_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + 
SME_ACLE_FUNC(svmop4s,_2x1,_za16,_bf16_bf16)(1, zn, zm); +} diff --git a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp index 47ce2a0f5f80f..f8e57e9b24332 100644 --- a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp @@ -166,3 +166,87 @@ void tests_mop4_imm_f64_f64_1x2(svfloat64_t zn, svfloat64x2_t zm) __arm_streamin svmop4s_1x2_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} return; } + +void tests_mop4_imm_s8_s8_2x1(svint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_u8_2x1(svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s8_u8_2x1(svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_s8_2x1(svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_u8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_u8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s16_s16_2x1(svint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_u16_2x1(svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x1_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_s16_u16_2x1(svint16x2_t zn, svuint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za64_s16_u16(-1, zn, zm); // 
expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_s16_2x1(svuint16x2_t zn, svint16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za64_u16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_u16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_f16_f16_2x1(svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x1_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_2x1_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_bf16_bf16_2x1(svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_2x1_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_f32_f32_2x1(svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x1_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_f64_f64_2x1(svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index f08bdf78b5f96..6c25e6582b836 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3083,6 +3083,7 @@ let TargetPrefix = "aarch64" in { foreach ty = ["s", "u", "su", "us"] in { def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x1" : SME_OuterProduct_QuarterTile_Single_Single; def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x2" : SME_OuterProduct_QuarterTile_Single_Multi; + def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_2x1" : SME_OuterProduct_QuarterTile_Single_Multi; } } } @@ -3092,9 +3093,10 @@ let TargetPrefix = "aarch64" in { foreach wide = ["", "_wide"] in { def int_aarch64_sme_mop4 # mode # wide # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single; def 
int_aarch64_sme_mop4 # mode # wide # "_1x2" : SME_OuterProduct_QuarterTile_Single_Multi; + def int_aarch64_sme_mop4 # mode # wide # "_2x1" : SME_OuterProduct_QuarterTile_Single_Multi; } } - + class SME_AddVectorToTile_Intrinsic : DefaultAttrsIntrinsic<[], [llvm_i32_ty, diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 87a8f068083d5..ccc061da0be9a 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -273,6 +273,11 @@ class SME2_ZA_Tile_Vec_Single_Single_Pat : Pat<(intrinsic imm_ty:$tile, vt:$Zn, vt:$Zm1, vt:$Zm2), (!cast(name # _PSEUDO) $tile, $Zn, (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>; + +class SME2_ZA_Tile_Vec_Multi_Single_Pat + : Pat<(intrinsic imm_ty:$tile, vt:$Zn1, vt:$Zn2, vt:$Zm), + (!cast(name # _PSEUDO) $tile, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), $Zm)>; + //===----------------------------------------------------------------------===// // SME pattern match helpers. //===----------------------------------------------------------------------===// @@ -616,6 +621,7 @@ class sme_quarter_outer_product_i16_i32{ + // Single vectors def _MZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 0}, subtr, ZPR8Mul2_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr; @@ -623,8 +629,15 @@ multiclass sme_quarter_outer_product_i8_i32(op # "_1x1"), timm32_0_3, nxv16i8>; + // Multiple and single vectors def _M2ZZ_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 0}, subtr, - ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>; + ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _M2ZZ_BToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_3, nxv16i8>; + + // Single and multiple vectors def _MZ2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 0}, {zm_u, 1}, subtr, ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, mnemonic>, SMEPseudo2Instr; @@ -632,11 +645,13 @@ multiclass sme_quarter_outer_product_i8_i32(op # "_1x2"), timm32_0_3, nxv16i8>; + // Multiple vectors def _M2Z2Z_BToS : sme_quarter_outer_product_i8_i32<{zn_u, 1}, {zm_u, 1}, subtr, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>; } multiclass sme_quarter_outer_product_i16_i32{ + // Single vectors def _MZZ_HToS : sme_quarter_outer_product_i16_i32, SMEPseudo2Instr; @@ -644,8 +659,15 @@ multiclass sme_quarter_outer_product_i16_i32(op # "_1x1"), timm32_0_3, nxv8i16>; + // Multiple and single vectors def _M2ZZ_HToS : sme_quarter_outer_product_i16_i32; + ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _M2ZZ_HToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_3, nxv8i16>; + + // Single and multiple vectors def _MZ2Z_HToS : sme_quarter_outer_product_i16_i32, SMEPseudo2Instr; @@ -653,11 +675,13 @@ multiclass sme_quarter_outer_product_i16_i32(op # "_1x2"), timm32_0_3, nxv8i16>; + // Multiple vectors def _M2Z2Z_HToS : sme_quarter_outer_product_i16_i32; } multiclass sme_quarter_outer_product_i64{ + // Single vectors def _MZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 0}, subtr, ZPR16Mul2_Lo, ZPR16Mul2_Hi, mnemonic>, SMEPseudo2Instr; @@ -665,8 +689,15 @@ multiclass sme_quarter_outer_product_i64(op # "_1x1"), timm32_0_7, nxv8i16>; + // Multiple and single vectors def _M2ZZ_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 0}, subtr, - ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>; + ZZ_h_mul_r_Lo, ZPR16Mul2_Hi, mnemonic>, 
SMEPseudo2Instr; + + def NAME # _M2ZZ_HtoD_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_7, nxv8i16>; + + // Single and multiple vectors def _MZ2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 0}, {zm_u, 1}, subtr, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr; @@ -674,6 +705,7 @@ multiclass sme_quarter_outer_product_i64(op # "_1x2"), timm32_0_7, nxv8i16>; + // Multiple vectors def _M2Z2Z_HtoD : sme_quarter_outer_product_i64<{zn_u, 1}, {zm_u, 1}, subtr, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>; } @@ -5524,7 +5556,11 @@ multiclass sme2_bfmop4as_widening { def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv8bf16>; // Multiple and single vectors - def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _M2ZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_3, nxv8bf16>; // Single and multiple vectors def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; @@ -5533,7 +5569,6 @@ multiclass sme2_bfmop4as_widening { def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv8bf16>; - // Multiple vectors def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; } @@ -5680,7 +5715,11 @@ multiclass sme2_fmop4as_fp16_non_widening { def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_1, nxv8f16>; // Multiple and single vectors - def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_H : sme2_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _M2ZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_1, nxv8f16>; // Single and multiple vectors def _MZ2Z_H : sme2_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; @@ -5760,7 +5799,11 @@ multiclass sme2_bfmop4as_non_widening { def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_1, nxv8bf16>; // Multiple and single vectors - def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_H : sme2_bf16_fp16_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _M2ZZ_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_1, nxv8bf16>; // Single and multiple vectors def _MZ2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; @@ -5805,7 +5848,11 @@ multiclass sme2_fmop4as_fp32_non_widening { def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv4f32>; // Multiple and single vectors - def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>; + def _M2ZZ_S : sme2_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZPR32Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _M2ZZ_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # 
"_2x1"), timm32_0_3, nxv4f32>; // Single and multiple vectors def _MZ2Z_S : sme2_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR32Mul2_Lo, ZZ_s_mul_r_Hi>, SMEPseudo2Instr; @@ -5850,7 +5897,11 @@ multiclass sme2_fmop4as_fp64_non_widening { def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_7, nxv2f64>; // Multiple and single vectors - def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>; + def _M2ZZ_D : sme2_fp64_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZPR64Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _M2ZZ_D_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_7, nxv2f64>; // Single and multiple vectors def _MZ2Z_D : sme2_fp64_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR64Mul2_Lo, ZZ_d_mul_r_Hi>, SMEPseudo2Instr; @@ -5895,7 +5946,11 @@ multiclass sme2_fmop4as_fp16_fp32_widening { def : SME2_ZA_Tile_Vec_Single_Single_Pat(op # "_1x1"), timm32_0_3, nxv8f16>; // Multiple and single vectors - def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>; + def _M2ZZ_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>, SMEPseudo2Instr; + + def NAME # _M2ZZ_HtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Single_Pat(op # "_2x1"), timm32_0_3, nxv8f16>; // Single and multiple vectors def _MZ2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll new file mode 100644 index 0000000000000..ef1536fae6496 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x1.ll @@ -0,0 +1,393 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +; Widening +define void @mop4a_za32_s8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_s8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za32_u8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_u8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, 
z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za32_s8_u8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4a za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_s8_u8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za32_u8_s8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4a za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_u8_s8( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4s za0.s, { z0.b, z1.b }, z24.b +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.wide.2x1.nxv16i8(i32 0, %zn1, %zn2, %zm) + ret void +} + + +define void @mop4a_za32_s16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_s16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za32_u16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_u16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 
killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za32_f16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8f16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_f16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8f16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za32_bf16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4a za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.2x1.nxv8bf16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_bf16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4s za0.s, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x1.nxv8bf16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za64_s16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za64_s16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za64_u16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za64_u16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def 
$z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za64_s16_u16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4a za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za64_s16_u16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za64_u16_s16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4a za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za64_u16_s16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4s za0.d, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.za64.wide.2x1.nxv8i16(i32 0, %zn1, %zn2, %zm) + ret void +} + +; Non-widening +define void @mop4a_za16_f16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv8f16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za16_f16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv8f16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za32_f32( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.s, { z0.s, z1.s }, z24.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv4f32(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za32_f32( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed 
$z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.s, { z0.s, z1.s }, z24.s +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv4f32(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za64_f64( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.d, { z0.d, z1.d }, z24.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv2f64(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za64_f64( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.d, { z0.d, z1.d }, z24.d +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv2f64(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4a_za16_bf16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4a_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4a za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x1.nxv8bf16(i32 0, %zn1, %zn2, %zm) + ret void +} + +define void @mop4s_za16_bf16( %zn1, %zn2, %zm) #0 { +; CHECK-LABEL: mop4s_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4s za0.h, { z0.h, z1.h }, z24.h +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x1.nxv8bf16(i32 0, %zn1, %zn2, %zm) + ret void +} + +attributes #0 = {nounwind "target-features" = "+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" } From 105c8c38dcb7fd6077c92c5e0f838c9b7ad9971e Mon Sep 17 00:00:00 2001 From: lorenzo chelini Date: Tue, 1 Apr 2025 10:22:40 -0600 Subject: [PATCH 0263/1029] [MLIR][NFC] Retire let constructor for EmitC (#133732) `let constructor` is legacy (do not use in tree!) since the TableGen backend emits most of the glue logic to build a pass. --- mlir/include/mlir/Dialect/EmitC/Transforms/Passes.h | 8 ++------ mlir/include/mlir/Dialect/EmitC/Transforms/Passes.td | 3 +-- mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp | 8 ++------ 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.h b/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.h index 5cd27149d366e..5a103f181c76b 100644 --- a/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.h @@ -14,12 +14,8 @@ namespace mlir { namespace emitc { -//===----------------------------------------------------------------------===// -// Passes -//===----------------------------------------------------------------------===// - -/// Creates an instance of the C-style expressions forming pass.
-std::unique_ptr createFormExpressionsPass(); +#define GEN_PASS_DECL_FORMEXPRESSIONSPASS +#include "mlir/Dialect/EmitC/Transforms/Passes.h.inc" //===----------------------------------------------------------------------===// // Registration diff --git a/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.td b/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.td index fd083abc95715..f46b705ca2dfe 100644 --- a/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/EmitC/Transforms/Passes.td @@ -11,13 +11,12 @@ include "mlir/Pass/PassBase.td" -def FormExpressions : Pass<"form-expressions"> { +def FormExpressionsPass : Pass<"form-expressions"> { let summary = "Form C-style expressions from C-operator ops"; let description = [{ The pass wraps emitc ops modelling C operators in emitc.expression ops and then folds single-use expressions into their users where possible. }]; - let constructor = "mlir::emitc::createFormExpressionsPass()"; let dependentDialects = ["emitc::EmitCDialect"]; } diff --git a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp index 3385514375804..224d68ab8b4a6 100644 --- a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp +++ b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp @@ -18,7 +18,7 @@ namespace mlir { namespace emitc { -#define GEN_PASS_DEF_FORMEXPRESSIONS +#define GEN_PASS_DEF_FORMEXPRESSIONSPASS #include "mlir/Dialect/EmitC/Transforms/Passes.h.inc" } // namespace emitc } // namespace mlir @@ -28,7 +28,7 @@ using namespace emitc; namespace { struct FormExpressionsPass - : public emitc::impl::FormExpressionsBase { + : public emitc::impl::FormExpressionsPassBase { void runOnOperation() override { Operation *rootOp = getOperation(); MLIRContext *context = rootOp->getContext(); @@ -56,7 +56,3 @@ struct FormExpressionsPass } }; } // namespace - -std::unique_ptr mlir::emitc::createFormExpressionsPass() { - return std::make_unique(); -} From 7e25b240731413d2cfca2b78ab1d0ed33d851622 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 23:27:20 +0700 Subject: [PATCH 0264/1029] IRNormalizer: Replace cl::opts with pass parameters (#133874) Not sure why the "fold-all" option naming didn't match the variable "FoldPreOutputs", but I've preserved the difference. More annoyingly, the pass name "normalize" does not match the pass name IRNormalizer and should probably be fixed one way or the other. Also the existing test coverage for the flags is lacking. I've added a test that shows they parse, but we should have tests that they do something. --- .../llvm/Transforms/Utils/IRNormalizer.h | 21 +++++++++ llvm/lib/Passes/PassBuilder.cpp | 25 ++++++++++ llvm/lib/Passes/PassRegistry.def | 6 ++- llvm/lib/Transforms/Utils/IRNormalizer.cpp | 47 +++++-------------- .../IRNormalizer/pass-parameters.ll | 21 +++++++++ .../Transforms/IRNormalizer/reordering.ll | 2 +- 6 files changed, 86 insertions(+), 36 deletions(-) create mode 100644 llvm/test/Transforms/IRNormalizer/pass-parameters.ll diff --git a/llvm/include/llvm/Transforms/Utils/IRNormalizer.h b/llvm/include/llvm/Transforms/Utils/IRNormalizer.h index af1f715d4940d..65f03240f316a 100644 --- a/llvm/include/llvm/Transforms/Utils/IRNormalizer.h +++ b/llvm/include/llvm/Transforms/Utils/IRNormalizer.h @@ -5,8 +5,29 @@ namespace llvm { +struct IRNormalizerOptions { + /// Preserves original instruction order. 
+ bool PreserveOrder = false; + + /// Renames all instructions (including user-named) + bool RenameAll = true; + + /// Folds all regular instructions (including pre-outputs) + bool FoldPreOutputs = true; + + /// Sorts and reorders operands in commutative instructions + bool ReorderOperands = true; +}; + /// IRNormalizer aims to transform LLVM IR into normal form. struct IRNormalizerPass : public PassInfoMixin { +private: + const IRNormalizerOptions Options; + +public: + IRNormalizerPass(IRNormalizerOptions Options = IRNormalizerOptions()) + : Options(Options) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) const; }; diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 536a7fcb67b5e..5cda1517e127d 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -793,6 +793,31 @@ Expected parseLowerMatrixIntrinsicsPassOptions(StringRef Params) { "LowerMatrixIntrinsics"); } +Expected parseIRNormalizerPassOptions(StringRef Params) { + IRNormalizerOptions Result; + while (!Params.empty()) { + StringRef ParamName; + std::tie(ParamName, Params) = Params.split(';'); + + bool Enable = !ParamName.consume_front("no-"); + if (ParamName == "preserve-order") + Result.PreserveOrder = Enable; + else if (ParamName == "rename-all") + Result.RenameAll = Enable; + else if (ParamName == "fold-all") // FIXME: Name mismatch + Result.FoldPreOutputs = Enable; + else if (ParamName == "reorder-operands") + Result.ReorderOperands = Enable; + else { + return make_error( + formatv("invalid normalize pass parameter '{0}' ", ParamName).str(), + inconvertibleErrorCode()); + } + } + + return Result; +} + Expected parseASanPassOptions(StringRef Params) { AddressSanitizerOptions Result; while (!Params.empty()) { diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index a43be480d6194..510a505995304 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -422,7 +422,6 @@ FUNCTION_PASS("move-auto-init", MoveAutoInitPass()) FUNCTION_PASS("nary-reassociate", NaryReassociatePass()) FUNCTION_PASS("newgvn", NewGVNPass()) FUNCTION_PASS("no-op-function", NoOpFunctionPass()) -FUNCTION_PASS("normalize", IRNormalizerPass()) FUNCTION_PASS("objc-arc", ObjCARCOptPass()) FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass()) FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass()) @@ -568,6 +567,11 @@ FUNCTION_PASS_WITH_PARAMS( "lower-matrix-intrinsics", "LowerMatrixIntrinsicsPass", [](bool Minimal) { return LowerMatrixIntrinsicsPass(Minimal); }, parseLowerMatrixIntrinsicsPassOptions, "minimal") +FUNCTION_PASS_WITH_PARAMS( + "normalize", "IRNormalizerPass", + [](IRNormalizerOptions Options) { return IRNormalizerPass(Options); }, + parseIRNormalizerPassOptions, + "no-preserve-order;preserve-order;no-rename-all;rename-all;no-fold-all;fold-all;no-reorder-operands;reorder-operands") FUNCTION_PASS_WITH_PARAMS( "mldst-motion", "MergedLoadStoreMotionPass", [](MergedLoadStoreMotionOptions Opts) { diff --git a/llvm/lib/Transforms/Utils/IRNormalizer.cpp b/llvm/lib/Transforms/Utils/IRNormalizer.cpp index 7b9f193cbf514..75e775086493a 100644 --- a/llvm/lib/Transforms/Utils/IRNormalizer.cpp +++ b/llvm/lib/Transforms/Utils/IRNormalizer.cpp @@ -40,21 +40,13 @@ namespace { /// IRNormalizer aims to transform LLVM IR into normal form. class IRNormalizer { public: - /// \name Normalizer flags. - /// @{ - /// Preserves original order of instructions. 
- static cl::opt PreserveOrder; - /// Renames all instructions (including user-named). - static cl::opt RenameAll; // TODO: Don't rename on empty name - /// Folds all regular instructions (including pre-outputs). - static cl::opt FoldPreOutputs; - /// Sorts and reorders operands in commutative instructions. - static cl::opt ReorderOperands; - /// @} - bool runOnFunction(Function &F); + IRNormalizer(IRNormalizerOptions Options) : Options(Options) {} + private: + const IRNormalizerOptions Options; + // Random constant for hashing, so the state isn't zero. const uint64_t MagicHashConstant = 0x6acaa36bef8325c5ULL; DenseSet NamedInstructions; @@ -96,19 +88,6 @@ class IRNormalizer { }; } // namespace -cl::opt IRNormalizer::PreserveOrder( - "norm-preserve-order", cl::Hidden, cl::init(false), - cl::desc("Preserves original instruction order")); -cl::opt IRNormalizer::RenameAll( - "norm-rename-all", cl::Hidden, cl::init(true), - cl::desc("Renames all instructions (including user-named)")); -cl::opt IRNormalizer::FoldPreOutputs( - "norm-fold-all", cl::Hidden, cl::init(true), - cl::desc("Folds all regular instructions (including pre-outputs)")); -cl::opt IRNormalizer::ReorderOperands( - "norm-reorder-operands", cl::Hidden, cl::init(true), - cl::desc("Sorts and reorders operands in commutative instructions")); - /// Entry method to the IRNormalizer. /// /// \param F Function to normalize. @@ -118,7 +97,7 @@ bool IRNormalizer::runOnFunction(Function &F) { Outputs = collectOutputInstructions(F); - if (!PreserveOrder) + if (!Options.PreserveOrder) reorderInstructions(F); // TODO: Reorder basic blocks via a topological sort. @@ -127,8 +106,8 @@ bool IRNormalizer::runOnFunction(Function &F) { nameInstruction(I); for (auto &I : instructions(F)) { - if (!PreserveOrder) { - if (ReorderOperands) + if (!Options.PreserveOrder) { + if (Options.ReorderOperands) reorderInstructionOperandsByNames(&I); if (auto *Phi = dyn_cast(&I)) @@ -146,7 +125,7 @@ bool IRNormalizer::runOnFunction(Function &F) { void IRNormalizer::nameFunctionArguments(Function &F) const { int ArgumentCounter = 0; for (auto &A : F.args()) { - if (RenameAll || A.getName().empty()) { + if (Options.RenameAll || A.getName().empty()) { A.setName("a" + Twine(ArgumentCounter)); ArgumentCounter += 1; } @@ -167,7 +146,7 @@ void IRNormalizer::nameBasicBlocks(Function &F) const { if (isOutput(&I)) Hash = hashing::detail::hash_16_bytes(Hash, I.getOpcode()); - if (RenameAll || B.getName().empty()) { + if (Options.RenameAll || B.getName().empty()) { // Name basic block. Substring hash to make diffs more readable. B.setName("bb" + std::to_string(Hash).substr(0, 5)); } @@ -219,7 +198,7 @@ void IRNormalizer::sortCommutativeOperands(Instruction *I, T &Operands) const { void IRNormalizer::nameAsInitialInstruction(Instruction *I) const { if (I->getType()->isVoidTy()) return; - if (!(I->getName().empty() || RenameAll)) + if (!(I->getName().empty() || Options.RenameAll)) return; LLVM_DEBUG(dbgs() << "Naming initial instruction: " << *I << "\n"); @@ -359,7 +338,7 @@ void IRNormalizer::nameAsRegularInstruction(Instruction *I) { } Name.append(")"); - if ((I->getName().empty() || RenameAll) && !I->getType()->isVoidTy()) + if ((I->getName().empty() || Options.RenameAll) && !I->getType()->isVoidTy()) I->setName(Name); } @@ -379,7 +358,7 @@ void IRNormalizer::nameAsRegularInstruction(Instruction *I) { void IRNormalizer::foldInstructionName(Instruction *I) const { // If this flag is raised, fold all regular // instructions (including pre-outputs). 
- if (!FoldPreOutputs) { + if (!Options.FoldPreOutputs) { // Don't fold if one of the users is an output instruction. for (auto *U : I->users()) if (auto *IU = dyn_cast(U)) @@ -690,7 +669,7 @@ SetVector IRNormalizer::getOutputFootprint( PreservedAnalyses IRNormalizerPass::run(Function &F, FunctionAnalysisManager &AM) const { - IRNormalizer{}.runOnFunction(F); + IRNormalizer(Options).runOnFunction(F); PreservedAnalyses PA; PA.preserveSet(); return PA; diff --git a/llvm/test/Transforms/IRNormalizer/pass-parameters.ll b/llvm/test/Transforms/IRNormalizer/pass-parameters.ll new file mode 100644 index 0000000000000..f1bca3233dfee --- /dev/null +++ b/llvm/test/Transforms/IRNormalizer/pass-parameters.ll @@ -0,0 +1,21 @@ +; RUN: not opt -S -passes='normalize' %s 2>&1 | FileCheck -check-prefix=ERR %s +; RUN: opt -S -passes=normalize < %s | FileCheck %s +; RUN: opt -S -passes='normalize<>' < %s | FileCheck %s +; RUN: opt -S -passes='normalize' < %s | FileCheck %s +; RUN: opt -S -passes='normalize' < %s | FileCheck %s + +; FIXME: This verifies all the pass parameter names parse, but not +; that they work as expected. + +; ERR: invalid normalize pass parameter 'invalid' + +; CHECK: define i32 @0(i32 %a0, i32 %a1) { +; CHECK-NEXT: bb17254: +; CHECK-NEXT: %"vl12603(%a0, %a1)" = add i32 %a0, %a1 +; CHECK-NEXT: ret i32 %"vl12603(%a0, %a1)" +; CHECK-NEXT: } +define i32 @0(i32, i32) { + %3 = add i32 %0, %1 + ret i32 %3 +} + diff --git a/llvm/test/Transforms/IRNormalizer/reordering.ll b/llvm/test/Transforms/IRNormalizer/reordering.ll index 313d44a88e3cb..64abe8eb56ce1 100644 --- a/llvm/test/Transforms/IRNormalizer/reordering.ll +++ b/llvm/test/Transforms/IRNormalizer/reordering.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -passes=normalize -verify-each -norm-rename-all=false < %s | FileCheck %s +; RUN: opt -S -passes='normalize' -verify-each < %s | FileCheck %s define void @foo() { ; CHECK-LABEL: define void @foo() { From 1f194ff34e4e861a18f7108c7874bccbd6459f30 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 1 Apr 2025 18:28:53 +0200 Subject: [PATCH 0265/1029] [mlir] Expose `simplifyAffineExpr` through python api (#133926) --- mlir/include/mlir-c/AffineExpr.h | 10 ++++++++++ mlir/lib/Bindings/Python/IRAffine.cpp | 10 ++++++++++ mlir/lib/CAPI/IR/AffineExpr.cpp | 5 +++++ mlir/test/python/ir/affine_expr.py | 8 ++++++++ 4 files changed, 33 insertions(+) diff --git a/mlir/include/mlir-c/AffineExpr.h b/mlir/include/mlir-c/AffineExpr.h index ab768eb2ec870..161db6266cd30 100644 --- a/mlir/include/mlir-c/AffineExpr.h +++ b/mlir/include/mlir-c/AffineExpr.h @@ -104,6 +104,16 @@ MLIR_CAPI_EXPORTED MlirAffineExpr mlirAffineExprShiftSymbols(MlirAffineExpr affineExpr, uint32_t numSymbols, uint32_t shift, uint32_t offset); +/// Simplify an affine expression by flattening and some amount of simple +/// analysis. This has complexity linear in the number of nodes in 'expr'. +/// Returns the simplified expression, which is the same as the input expression +/// if it can't be simplified. When `expr` is semi-affine, a simplified +/// semi-affine expression is constructed in the sorted order of dimension and +/// symbol positions. +MLIR_CAPI_EXPORTED MlirAffineExpr mlirSimplifyAffineExpr(MlirAffineExpr expr, + uint32_t numDims, + uint32_t numSymbols); + //===----------------------------------------------------------------------===// // Affine Dimension Expression. 
//===----------------------------------------------------------------------===// diff --git a/mlir/lib/Bindings/Python/IRAffine.cpp b/mlir/lib/Bindings/Python/IRAffine.cpp index 3c95d29c4bcca..50f2a4f956883 100644 --- a/mlir/lib/Bindings/Python/IRAffine.cpp +++ b/mlir/lib/Bindings/Python/IRAffine.cpp @@ -599,6 +599,16 @@ void mlir::python::populateIRAffine(nb::module_ &m) { }, nb::arg("num_symbols"), nb::arg("shift"), nb::arg("offset").none() = 0) + .def_static( + "simplify_affine_expr", + [](PyAffineExpr &self, uint32_t numDims, uint32_t numSymbols) { + return PyAffineExpr( + self.getContext(), + mlirSimplifyAffineExpr(self, numDims, numSymbols)); + }, + nb::arg("expr"), nb::arg("num_dims"), nb::arg("num_symbols"), + "Simplify an affine expression by flattening and some amount of " + "simple analysis.") .def_static( "get_add", &PyAffineAddExpr::get, "Gets an affine expression containing a sum of two expressions.") diff --git a/mlir/lib/CAPI/IR/AffineExpr.cpp b/mlir/lib/CAPI/IR/AffineExpr.cpp index bc3dcd4174736..5a0a03b11ae45 100644 --- a/mlir/lib/CAPI/IR/AffineExpr.cpp +++ b/mlir/lib/CAPI/IR/AffineExpr.cpp @@ -73,6 +73,11 @@ MlirAffineExpr mlirAffineExprShiftSymbols(MlirAffineExpr affineExpr, return wrap(unwrap(affineExpr).shiftSymbols(numSymbols, shift, offset)); } +MlirAffineExpr mlirSimplifyAffineExpr(MlirAffineExpr expr, uint32_t numDims, + uint32_t numSymbols) { + return wrap(simplifyAffineExpr(unwrap(expr), numDims, numSymbols)); +} + //===----------------------------------------------------------------------===// // Affine Dimension Expression. //===----------------------------------------------------------------------===// diff --git a/mlir/test/python/ir/affine_expr.py b/mlir/test/python/ir/affine_expr.py index 2f64aff143420..c2a2ab3509ca6 100644 --- a/mlir/test/python/ir/affine_expr.py +++ b/mlir/test/python/ir/affine_expr.py @@ -416,3 +416,11 @@ def testAffineExprShift(): assert (dims[2] + dims[3]) == (dims[0] + dims[1]).shift_dims(2, 2) assert (syms[2] + syms[3]) == (syms[0] + syms[1]).shift_symbols(2, 2, 0) + + +# CHECK-LABEL: TEST: testAffineExprSimplify +@run +def testAffineExprSimplify(): + with Context() as ctx: + expr = AffineExpr.get_dim(0) + AffineExpr.get_symbol(0) + assert expr == AffineExpr.simplify_affine_expr(expr, 1, 1) From 00e6d4fe064bb48e16c1eda018e6ed5e9f99cba9 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 1 Apr 2025 17:25:04 +0100 Subject: [PATCH 0266/1029] [libclc][NFC] Delete three unused .inc files --- libclc/generic/lib/clc_unary.inc | 11 -------- libclc/generic/lib/math/binary_impl.inc | 29 ------------------- libclc/generic/lib/math/clc_sw_binary.inc | 34 ----------------------- 3 files changed, 74 deletions(-) delete mode 100644 libclc/generic/lib/clc_unary.inc delete mode 100644 libclc/generic/lib/math/binary_impl.inc delete mode 100644 libclc/generic/lib/math/clc_sw_binary.inc diff --git a/libclc/generic/lib/clc_unary.inc b/libclc/generic/lib/clc_unary.inc deleted file mode 100644 index 702ddd4c0da8b..0000000000000 --- a/libclc/generic/lib/clc_unary.inc +++ /dev/null @@ -1,11 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x) { - return __CLC_IMPL_FUNC(x); -} diff --git a/libclc/generic/lib/math/binary_impl.inc b/libclc/generic/lib/math/binary_impl.inc deleted file mode 100644 index e7637d991cdeb..0000000000000 --- a/libclc/generic/lib/math/binary_impl.inc +++ /dev/null @@ -1,29 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef __CLC_SCALAR - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y) { - return FUNCTION_IMPL(x, y); -} - -#endif - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, float y) { - __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y); - return FUNCTION_IMPL(x, vec_y); -} - -#ifdef cl_khr_fp64 - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, double y) { - __CLC_GENTYPE vec_y = (__CLC_GENTYPE) (y); - return FUNCTION_IMPL(x, vec_y); -} - -#endif diff --git a/libclc/generic/lib/math/clc_sw_binary.inc b/libclc/generic/lib/math/clc_sw_binary.inc deleted file mode 100644 index fb787c2e2df2d..0000000000000 --- a/libclc/generic/lib/math/clc_sw_binary.inc +++ /dev/null @@ -1,34 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -#define __CLC_SW_FUNC(x) __CLC_CONCAT(__clc_, x) - -#if __CLC_FPSIZE > 16 -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x, __CLC_GENTYPE y) { - return __CLC_SW_FUNC(__CLC_FUNC)(x, y); -} -#elif __CLC_FPSIZE == 16 -#ifdef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x, - __CLC_GENTYPE y) { - return convert_half( - __CLC_SW_FUNC(__CLC_FUNC)(convert_float(x), convert_float(y))); -} -#else -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNC(__CLC_GENTYPE x, - __CLC_GENTYPE y) { - return __CLC_XCONCAT(convert_half, __CLC_VECSIZE)(__CLC_SW_FUNC(__CLC_FUNC)( - __CLC_XCONCAT(convert_float, __CLC_VECSIZE)(x), - __CLC_XCONCAT(convert_float, __CLC_VECSIZE)(y))); -} -#endif -#endif - -#undef __CLC_SW_FUNC From 4b19db6db965d95d7259e224f518c5404a7989ba Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Tue, 1 Apr 2025 09:39:07 -0700 Subject: [PATCH 0267/1029] Revert "AsmPrinter: Remove ELF's special lowerRelativeReference for unnamed_addr function" (#133935) Reverts llvm/llvm-project#132684 --- .../CodeGen/TargetLoweringObjectFileImpl.h | 4 ++++ .../CodeGen/TargetLoweringObjectFileImpl.cpp | 18 ++++++++++++++++++ ...relative-reloc.ll => plt-relative-reloc.ll} | 5 ++--- ...relative-reloc.ll => plt-relative-reloc.ll} | 5 ++--- ...eloc-64.ll => x86-64-plt-relative-reloc.ll} | 5 ++--- ...e-reloc-32.ll => x86-plt-relative-reloc.ll} | 4 ++-- 6 files changed, 30 insertions(+), 11 deletions(-) rename llvm/test/CodeGen/ARM/{relative-reloc.ll => plt-relative-reloc.ll} (78%) rename llvm/test/CodeGen/RISCV/{relative-reloc.ll => plt-relative-reloc.ll} (84%) rename llvm/test/CodeGen/X86/{relative-reloc-64.ll => x86-64-plt-relative-reloc.ll} (84%) rename llvm/test/CodeGen/X86/{relative-reloc-32.ll => x86-plt-relative-reloc.ll} (89%) diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index f035d81e85ddb..8b0e5798d1b61 100644 --- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -125,6 +125,10 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile { lowerSymbolDifference(const MCSymbol *LHS, const MCSymbol *RHS, int64_t Addend, std::optional PCRelativeOffset) const; + const MCExpr *lowerRelativeReference(const GlobalValue *LHS, + const GlobalValue *RHS, int64_t Addend, + std::optional PCRelativeOffset, + const TargetMachine &TM) const override; const MCExpr *lowerDSOLocalEquivalent(const MCSymbol *LHS, const MCSymbol *RHS, int64_t Addend, diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index c9415292e88f7..4c20c5dc74d9a 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -1233,6 +1233,24 @@ const MCExpr *TargetLoweringObjectFileELF::lowerSymbolDifference( return Res; } +const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference( + const GlobalValue *LHS, const GlobalValue *RHS, int64_t Addend, + std::optional PCRelativeOffset, const TargetMachine &TM) const { + // We may only use a PLT-relative relocation to refer to unnamed_addr + // functions. + if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy()) + return nullptr; + + // Basic correctness checks. 
+ if (LHS->getType()->getPointerAddressSpace() != 0 || + RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() || + RHS->isThreadLocal()) + return nullptr; + + return lowerSymbolDifference(TM.getSymbol(LHS), TM.getSymbol(RHS), Addend, + PCRelativeOffset); +} + // Reference the PLT entry of a function, optionally with a subtrahend (`RHS`). const MCExpr *TargetLoweringObjectFileELF::lowerDSOLocalEquivalent( const MCSymbol *LHS, const MCSymbol *RHS, int64_t Addend, diff --git a/llvm/test/CodeGen/ARM/relative-reloc.ll b/llvm/test/CodeGen/ARM/plt-relative-reloc.ll similarity index 78% rename from llvm/test/CodeGen/ARM/relative-reloc.ll rename to llvm/test/CodeGen/ARM/plt-relative-reloc.ll index 65053726e66bf..ede891900e6d0 100644 --- a/llvm/test/CodeGen/ARM/relative-reloc.ll +++ b/llvm/test/CodeGen/ARM/plt-relative-reloc.ll @@ -10,8 +10,7 @@ declare void @fn1() unnamed_addr declare void @fn2() unnamed_addr declare void @fn3() -;; Create a PC-relative relocation that the linker might decline if the addend symbol is preemptible. ; CHECK: .long 0 -; CHECK-NEXT: .long fn1-vtable-4 -; CHECK-NEXT: .long fn2-vtable-4 +; CHECK-NEXT: .long fn1(prel31)-vtable-4 +; CHECK-NEXT: .long fn2(prel31)-vtable-4 ; CHECK-NEXT: .long fn3-vtable-4 diff --git a/llvm/test/CodeGen/RISCV/relative-reloc.ll b/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll similarity index 84% rename from llvm/test/CodeGen/RISCV/relative-reloc.ll rename to llvm/test/CodeGen/RISCV/plt-relative-reloc.ll index 6c94b9fce9308..d2dceb773b2e9 100644 --- a/llvm/test/CodeGen/RISCV/relative-reloc.ll +++ b/llvm/test/CodeGen/RISCV/plt-relative-reloc.ll @@ -12,11 +12,10 @@ declare void @fn2() unnamed_addr declare void @fn3() @global4 = external unnamed_addr global i8 -;; Create a PC-relative relocation that the linker might decline if the addend symbol is preemptible. ; CHECK: vtable: ; CHECK-NEXT: .word 0 # 0x0 -; CHECK-NEXT: .word fn1-vtable-4 -; CHECK-NEXT: .word fn2-vtable-4 +; CHECK-NEXT: .word %pltpcrel(fn1) +; CHECK-NEXT: .word %pltpcrel(fn2+4) ; CHECK-NEXT: .word fn3-vtable-4 ; CHECK-NEXT: .word global4-vtable-4 ; CHECK-NEXT: .size vtable, 20 diff --git a/llvm/test/CodeGen/X86/relative-reloc-64.ll b/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll similarity index 84% rename from llvm/test/CodeGen/X86/relative-reloc-64.ll rename to llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll index 6f88edfa075b8..54736c94af248 100644 --- a/llvm/test/CodeGen/X86/relative-reloc-64.ll +++ b/llvm/test/CodeGen/X86/x86-64-plt-relative-reloc.ll @@ -12,9 +12,8 @@ declare void @fn2() unnamed_addr declare void @fn3() @global4 = external unnamed_addr global i8 -;; Create a PC-relative relocation that the linker might decline if the addend symbol is preemptible. 
; CHECK: .long 0 -; CHECK-NEXT: .long fn1-vtable-4 -; CHECK-NEXT: .long fn2-vtable-4 +; CHECK-NEXT: .long fn1@PLT-vtable-4 +; CHECK-NEXT: .long fn2@PLT-vtable-4 ; CHECK-NEXT: .long fn3-vtable-4 ; CHECK-NEXT: .long global4-vtable-4 diff --git a/llvm/test/CodeGen/X86/relative-reloc-32.ll b/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll similarity index 89% rename from llvm/test/CodeGen/X86/relative-reloc-32.ll rename to llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll index 7d0b1fd546a00..d5e80285b160d 100644 --- a/llvm/test/CodeGen/X86/relative-reloc-32.ll +++ b/llvm/test/CodeGen/X86/x86-plt-relative-reloc.ll @@ -11,6 +11,6 @@ declare void @fn2() unnamed_addr declare void @fn3() ; CHECK: .long 0 -; CHECK-NEXT: .long fn1-vtable-4 -; CHECK-NEXT: .long fn2-vtable-4 +; CHECK-NEXT: .long fn1@PLT-vtable-4 +; CHECK-NEXT: .long fn2@PLT-vtable-4 ; CHECK-NEXT: .long fn3-vtable-4 From dd1d41f833c9b28d8a940ba5a0b85b0d47e44e43 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Tue, 1 Apr 2025 12:40:18 -0400 Subject: [PATCH 0268/1029] [AMDGPU][True16][CodeGen] fix moveToVALU with proper subreg access in true16 (#132089) There are V2S copies between vgpr16 and sgpr32 in true16 mode. This is caused by vgpr16 and sgpr32 both being selectable for 16-bit sources in ISel. When a V2S copy and its useMI are lowered to VALU, this patch checks: 1. If the generated new VALU is used by a true16 instruction, add subreg access if necessary. 2. Legalize the V2S copy by replacing it with subreg_to_reg. An example in MIR looks like: ``` %2:sgpr_32 = COPY %1:vgpr_16 %3:sgpr_32 = S_OR_B32 %2:sgpr_32, ... %4:vgpr_16 = V_ADD_F16_t16 %3:sgpr_32, ... ``` currently lowered to ``` %2:vgpr_32 = COPY %1:vgpr_16 %3:vgpr_32 = V_OR_B32 %2:vgpr_32, ... %4:vgpr_16 = V_ADD_F16_t16 %3:vgpr_32, ... ``` after this patch ``` %2:vgpr_32 = SUBREG_TO_REG 0, %1:vgpr_16, lo16 %3:vgpr_32 = V_OR_B32 %2:vgpr_32, ... %4:vgpr_16 = V_ADD_F16_t16 %3.lo16:vgpr_32, ...
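; here SUBREG_TO_REG records that the 16-bit value lives in the lo16 half of a 32-bit VGPR, and the true16 user reads %3.lo16 directly, so no cross-class COPY survives lowering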
``` --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 39 +++++++++++++ .../AMDGPU/fix-sgpr-copies-f16-true16.mir | 38 +++++------- llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 10 ++-- llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 10 ++-- llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll | 10 ++-- llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 8 +-- llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 10 ++-- llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 8 +-- llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 58 +++++++++---------- llvm/test/CodeGen/AMDGPU/select.f16.ll | 56 +++++++++--------- 10 files changed, 137 insertions(+), 110 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 4acbc201ec58e..260f80a5f532e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7744,6 +7744,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Inst.setDesc(get(AMDGPU::IMPLICIT_DEF)); return; } + + // If this is a v2s copy src from vgpr16 to sgpr32, + // replace vgpr copy to subreg_to_reg + if (ST.useRealTrue16Insts() && Inst.isCopy() && + Inst.getOperand(1).getReg().isVirtual() && + RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { + const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1); + if (16 == RI.getRegSizeInBits(*SrcRegRC) && + 32 == RI.getRegSizeInBits(*NewDstRC)) { + Register NewDstReg = MRI.createVirtualRegister(NewDstRC); + BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), + get(TargetOpcode::SUBREG_TO_REG), NewDstReg) + .add(MachineOperand::CreateImm(0)) + .add(Inst.getOperand(1)) + .add(MachineOperand::CreateImm(AMDGPU::lo16)); + Inst.eraseFromParent(); + + MRI.replaceRegWith(DstReg, NewDstReg); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + return; + } + } + Register NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); legalizeOperands(Inst, MDT); @@ -7837,6 +7860,22 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, assert(NewDstRC); NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); + + // Check useMI of NewInstr. 
If used by a true16 instruction, + // add a lo16 subreg access if size mismatched + if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), + E = MRI.use_end(); + I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + unsigned UseMIOpcode = UseMI.getOpcode(); + if (AMDGPU::isTrue16Inst(UseMIOpcode) && + (16 == + RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) { + I->setSubReg(AMDGPU::lo16); + } + } + } } fixImplicitOperands(*NewInstr); // Legalize the operands diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index 466f28805dfcf..419f57972a485 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -1,41 +1,35 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s -# XFAIL: * -# FIXME-TRUE16 reenable after fix-sgpr-copies is updated for true16 flow --- -name: cmp_f16 +name: cvt_hi_f32_f16 body: | - bb.0.entry: - ; GCN-LABEL: name: cmp_f16 + bb.0: + ; GCN-LABEL: name: cvt_hi_f32_f16 ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]] - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY killed [[COPY]] - ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[COPY1]], 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec + ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16 + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[SUBREG_TO_REG]] + ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY]].hi16, 0, 0, 0, implicit $mode, implicit $exec %0:vgpr_16 = IMPLICIT_DEF - %1:sreg_32 = IMPLICIT_DEF - %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec - %3:sreg_32 = COPY %2:vgpr_16 - nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode - %4:sreg_32_xm0_xexec = COPY $scc - %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec + %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec + %2:sreg_32 = COPY %1:vgpr_16 + %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode ... 
--- -name: cvt_hi_f32_f16 +name: s_or_b32 body: | bb.0: - ; GCN-LABEL: name: cvt_hi_f32_f16 + ; GCN-LABEL: name: s_or_b32 ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]] - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16 + ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec + ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec %0:vgpr_16 = IMPLICIT_DEF %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec %2:sreg_32 = COPY %1:vgpr_16 - %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode + %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc + %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 644c88457714b..8c5bc4a33a303 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -255,15 +255,15 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] +; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3] ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l -; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 622a335015eba..297e4f0927204 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -1093,13 +1093,13 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, 
v2.l, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: ;;#ASMSTART ; GFX11-TRUE16-NEXT: ; use v0 ; GFX11-TRUE16-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 999282bf60539..ffbb9fde26e55 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -906,13 +906,13 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: ;;#ASMSTART ; GFX11-TRUE16-NEXT: ; use v0 ; GFX11-TRUE16-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 27ec1cfadd9d2..de12f2b246f57 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -259,13 +259,13 @@ define amdgpu_kernel void @rint_v2f16( ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index e16540fec0229..1a426096da197 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -255,15 +255,15 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; 
GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] +; GFX12-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3] ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l -; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h -; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) ; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index ae41f4381251d..0f709b044f63a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -238,13 +238,13 @@ define amdgpu_kernel void @trunc_v2f16( ; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 ; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 ; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null +; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null ; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX12-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index f6e9f152dca5e..51dfbda53ad4c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -736,43 +736,37 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; GFX12-TRUE16-LABEL: constant_load_v16i16_align2: ; GFX12-TRUE16: ; %bb.0: ; %entry ; GFX12-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v9, 0 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, 0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_clause 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v9, s[0:1] offset:16 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:12 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v9, s[0:1] offset:8 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:4 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v9, s[0:1] offset:28 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:24 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v9, s[0:1] offset:20 -; GFX12-TRUE16-NEXT: global_load_d16_b16 v8, v9, s[0:1] -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x5 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; 
GFX12-TRUE16-NEXT: global_load_d16_b16 v3, v8, s[0:1] offset:28 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v2, v8, s[0:1] offset:24 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v8, s[0:1] offset:20 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v8, s[0:1] offset:16 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v7, v8, s[0:1] offset:12 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v6, v8, s[0:1] offset:8 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v8, s[0:1] offset:4 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v8, s[0:1] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x4 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.l -; GFX12-TRUE16-NEXT: s_clause 0x7 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v9, s[0:1] offset:30 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v9, s[0:1] offset:26 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v9, s[0:1] offset:22 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:18 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v9, s[0:1] offset:14 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v9, s[0:1] offset:10 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:6 -; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:2 +; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: s_clause 0x1 ; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX12-TRUE16-NEXT: s_endpgm ; ; GFX12-FAKE16-LABEL: constant_load_v16i16_align2: diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 9dd7b946ff5bd..7339b545686f5 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -880,17 +880,17 @@ define amdgpu_kernel void @select_v2f16( ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v2.l, v2.l, v3.l, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -1066,17 +1066,17 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -1245,17 +1245,17 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -1428,15 +1428,15 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l 
-; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -1609,15 +1609,15 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3800, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3900, v0.l, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; From ec290a43f68b469197abce65949fde84ecdc9146 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 1 Apr 2025 23:51:45 +0700 Subject: [PATCH 0269/1029] llvm-reduce: Reduce externally_initialized (#133859) Not sure this is the right place to put it. This is a property of GlobalVariable, not GlobalValue. But the ReduceGlobalVars reduction tries to delete the value entirely. 
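For illustration, here is a hypothetical before/after sketch of what this reduction does (not taken from the patch itself): when the interestingness oracle says the flag can go, the pass simply clears it in place rather than deleting the global.

  ; hypothetical input, before the reduction
  @g = externally_initialized global i32 1, align 4
  ; after the reduction, if dropping the flag keeps the test interesting
  @g = global i32 1, align 4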
---
 .../llvm-reduce/reduce-externally-initialized.ll | 11 +++++++++++
 llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp | 8 ++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 llvm/test/tools/llvm-reduce/reduce-externally-initialized.ll

diff --git a/llvm/test/tools/llvm-reduce/reduce-externally-initialized.ll b/llvm/test/tools/llvm-reduce/reduce-externally-initialized.ll
new file mode 100644
index 0000000000000..edd98fa60e5fe
--- /dev/null
+++ b/llvm/test/tools/llvm-reduce/reduce-externally-initialized.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-reduce -abort-on-invalid-reduction --delta-passes=global-values --test FileCheck --test-arg --check-prefix=INTERESTING --test-arg %s --test-arg --input-file %s -o %t.0
+; RUN: FileCheck --implicit-check-not=define --check-prefix=RESULT %s < %t.0
+
+; INTERESTING: @externally_initialized_keep = externally_initialized global i32 0
+; INTERESTING: @externally_initialized_drop
+
+; RESULT: @externally_initialized_keep = externally_initialized global i32 0, align 4
+; RESULT: @externally_initialized_drop = global i32 1, align 4
+@externally_initialized_keep = externally_initialized global i32 0, align 4
+@externally_initialized_drop = externally_initialized global i32 1, align 4
+
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp
index 577e0f5d16b63..e56876c38032e 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp
@@ -64,5 +64,13 @@ void llvm::reduceGlobalValuesDeltaPass(Oracle &O, ReducerWorkItem &Program) {
     if (IsImplicitDSOLocal)
       GV.setDSOLocal(false);
   }
+
+  // TODO: Should this go in a separate reduction?
+  if (auto *GVar = dyn_cast<GlobalVariable>(&GV)) {
+    if (GVar->isExternallyInitialized() && !O.shouldKeep())
+      GVar->setExternallyInitialized(false);
+
+    // TODO: Reduce code model
+  }
 }
}

From 347c5a7af5adfe81b79dd77f7f88c626b09e8534 Mon Sep 17 00:00:00 2001
From: jimingham
Date: Tue, 1 Apr 2025 09:54:06 -0700
Subject: [PATCH 0270/1029] Add a new affordance that the Python module in a
 dSYM (#133290)

So the dSYM can be told what target it has been loaded into.

When lldb is loading modules, while creating a target, it will run
"command script import" on any Python modules in Resources/Python in the
dSYM. However, this happens WHILE the target is being created, so it is
not yet in the target list. That means that these scripts can't act on
the target that they are a part of when they get loaded.

This patch adds a new Python API that lldb will call:
__lldb_module_added_to_target, if it is defined in the module, passing in
the Target the module was being added to, so that code in these dSYMs
doesn't have to guess.
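As a rough illustration of the new hook (a hypothetical module sketch, separate from the has_dsym.py test added below), a script shipped in a dSYM's Contents/Resources/Python directory could define both entry points like this:

  import lldb

  def __lldb_init_module(debugger, internal_dict):
      # Runs at "command script import" time; note that the owning
      # target is not yet in the debugger's target list at this point.
      pass

  def __lldb_module_added_to_target(target, internal_dict):
      # The new hook from this patch: receives the SBTarget the dSYM's
      # module was added to, so the script does not have to guess.
      # (Illustrative only; any use of the target goes here.)
      print("loaded into", target.executable.fullpath)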
---
 lldb/bindings/python/python-wrapper.swig | 22 +++++++
 .../lldb/Interpreter/ScriptInterpreter.h | 3 +-
 lldb/source/Core/Module.cpp | 4 +-
 lldb/source/Interpreter/ScriptInterpreter.cpp | 9 ++-
 .../Python/SWIGPythonBridge.h | 5 ++
 .../Python/ScriptInterpreterPython.cpp | 8 ++-
 .../Python/ScriptInterpreterPythonImpl.h | 3 +-
 lldb/test/API/macosx/dsym_modules/Makefile | 4 ++
 .../macosx/dsym_modules/TestdSYMModuleInit.py | 63 +++++++++++++++++++
 lldb/test/API/macosx/dsym_modules/has_dsym.py | 28 +++++++++
 lldb/test/API/macosx/dsym_modules/main.c | 9 +++
 .../Python/PythonTestSuite.cpp | 6 ++
 12 files changed, 155 insertions(+), 9 deletions(-)
 create mode 100644 lldb/test/API/macosx/dsym_modules/Makefile
 create mode 100644 lldb/test/API/macosx/dsym_modules/TestdSYMModuleInit.py
 create mode 100644 lldb/test/API/macosx/dsym_modules/has_dsym.py
 create mode 100644 lldb/test/API/macosx/dsym_modules/main.c

diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig
index 57c7ac387145e..3d1d04e47e70b 100644
--- a/lldb/bindings/python/python-wrapper.swig
+++ b/lldb/bindings/python/python-wrapper.swig
@@ -966,6 +966,28 @@ bool lldb_private::python::SWIGBridge::LLDBSWIGPythonRunScriptKeywordValue(
   return true;
 }

+bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallModuleNewTarget(
+    const char *python_module_name, const char *session_dictionary_name,
+    lldb::TargetSP target_sp) {
+  std::string python_function_name_string = python_module_name;
+  python_function_name_string += ".__lldb_module_added_to_target";
+  const char *python_function_name = python_function_name_string.c_str();
+
+  PyErr_Cleaner py_err_cleaner(true);
+
+  auto dict = PythonModule::MainModule().ResolveName<PythonDictionary>(
+      session_dictionary_name);
+  auto pfunc = PythonObject::ResolveNameWithDictionary<PythonCallable>(
+      python_function_name, dict);
+
+  if (!pfunc.IsAllocated())
+    return true;
+
+  pfunc(SWIGBridge::ToSWIGWrapper(std::move(target_sp)), dict);
+
+  return true;
+}
+
 bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallModuleInit(
     const char *python_module_name, const char *session_dictionary_name,
     lldb::DebuggerSP debugger) {
diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
index c5aa19959aa61..25e82779f05c6 100644
--- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h
+++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h
@@ -522,7 +522,8 @@ class ScriptInterpreter : public PluginInterface {
   LoadScriptingModule(const char *filename, const LoadScriptOptions &options,
                       lldb_private::Status &error,
                       StructuredData::ObjectSP *module_sp = nullptr,
-                      FileSpec extra_search_dir = {});
+                      FileSpec extra_search_dir = {},
+                      lldb::TargetSP loaded_into_target_sp = {});

   virtual bool IsReservedWord(const char *word) { return false; }

diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp
index d70f292abaea4..53dc6fcde0381 100644
--- a/lldb/source/Core/Module.cpp
+++ b/lldb/source/Core/Module.cpp
@@ -1485,7 +1485,9 @@ bool Module::LoadScriptingResourceInTarget(Target *target, Status &error,
         scripting_fspec.Dump(scripting_stream.AsRawOstream());
         LoadScriptOptions options;
         bool did_load = script_interpreter->LoadScriptingModule(
-            scripting_stream.GetData(), options, error);
+            scripting_stream.GetData(), options, error,
+            /*module_sp*/ nullptr, /*extra_path*/ {},
+            target->shared_from_this());
         if (!did_load)
           return false;
       }
diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp
index
4424b6c894356..63655cc5a50c6 100644
--- a/lldb/source/Interpreter/ScriptInterpreter.cpp
+++ b/lldb/source/Interpreter/ScriptInterpreter.cpp
@@ -48,11 +48,10 @@ StructuredData::DictionarySP ScriptInterpreter::GetInterpreterInfo() {
   return nullptr;
 }

-bool ScriptInterpreter::LoadScriptingModule(const char *filename,
-                                            const LoadScriptOptions &options,
-                                            lldb_private::Status &error,
-                                            StructuredData::ObjectSP *module_sp,
-                                            FileSpec extra_search_dir) {
+bool ScriptInterpreter::LoadScriptingModule(
+    const char *filename, const LoadScriptOptions &options,
+    lldb_private::Status &error, StructuredData::ObjectSP *module_sp,
+    FileSpec extra_search_dir, lldb::TargetSP loaded_into_target_sp) {
   error = Status::FromErrorString(
       "This script interpreter does not support importing modules.");
   return false;
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
index a2252d164ab83..504b3aa0a4df1 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h
@@ -215,6 +215,11 @@ class SWIGBridge {
                                            const char *session_dictionary_name,
                                            lldb::DebuggerSP debugger);

+  static bool
+  LLDBSwigPythonCallModuleNewTarget(const char *python_module_name,
+                                    const char *session_dictionary_name,
+                                    lldb::TargetSP target);
+
   static python::PythonObject
   LLDBSWIGPythonCreateOSPlugin(const char *python_class_name,
                                const char *session_dictionary_name,
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index 4b7694de697c1..a9c81273c1302 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -2316,7 +2316,7 @@ uint64_t replace_all(std::string &str, const std::string &oldStr,
 bool ScriptInterpreterPythonImpl::LoadScriptingModule(
     const char *pathname, const LoadScriptOptions &options,
     lldb_private::Status &error, StructuredData::ObjectSP *module_sp,
-    FileSpec extra_search_dir) {
+    FileSpec extra_search_dir, lldb::TargetSP target_sp) {
   namespace fs = llvm::sys::fs;
   namespace path = llvm::sys::path;

@@ -2495,6 +2495,12 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule(
         PyRefType::Owned, static_cast<PyObject *>(module_pyobj)));
   }

+  // Finally, if we got a target passed in, then we should tell the new module
+  // about this target:
+  if (target_sp)
+    return SWIGBridge::LLDBSwigPythonCallModuleNewTarget(
+        module_name.c_str(), m_dictionary_name.c_str(), target_sp);
+
   return true;
 }

diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
index 2dc784777151b..0f2902813397a 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h
@@ -245,7 +245,8 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython {
                            const LoadScriptOptions &options,
                            lldb_private::Status &error,
                            StructuredData::ObjectSP *module_sp = nullptr,
-                           FileSpec extra_search_dir = {}) override;
+                           FileSpec extra_search_dir = {},
+                           lldb::TargetSP loaded_into_target_sp = {}) override;

   bool IsReservedWord(const char *word) override;

diff --git a/lldb/test/API/macosx/dsym_modules/Makefile b/lldb/test/API/macosx/dsym_modules/Makefile
new file mode 100644
index
0000000000000..695335e068c0c
--- /dev/null
+++ b/lldb/test/API/macosx/dsym_modules/Makefile
@@ -0,0 +1,4 @@
+C_SOURCES := main.c
+CFLAGS_EXTRAS := -std=c99
+
+include Makefile.rules
diff --git a/lldb/test/API/macosx/dsym_modules/TestdSYMModuleInit.py b/lldb/test/API/macosx/dsym_modules/TestdSYMModuleInit.py
new file mode 100644
index 0000000000000..cd2293acbc82a
--- /dev/null
+++ b/lldb/test/API/macosx/dsym_modules/TestdSYMModuleInit.py
@@ -0,0 +1,63 @@
+"""
+Test that we read in the Python module from a dSYM, and run the
+init in debugger and the init in target routines.
+"""
+
+import os, shutil
+
+import lldb
+import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+
+
+@skipUnlessDarwin
+class TestdSYMModuleInit(TestBase):
+    @no_debug_info_test
+    def test_add_module(self):
+        """This loads a file into a target and ensures that the Python module was
+        correctly added and the two initialization functions are called."""
+        self.exe_name = "has_dsym"
+        self.py_name = self.exe_name + ".py"
+
+        # Now load the target the first time into the debugger:
+        self.runCmd("settings set target.load-script-from-symbol-file true")
+        self.interp = self.dbg.GetCommandInterpreter()
+
+        executable = self.build_dsym(self.exe_name + "_1")
+        target = self.createTestTarget(file_path=executable)
+        self.check_answers(executable, ["1", "1", "has_dsym_1"])
+
+        # Now make a second target and make sure both get called:
+        executable_2 = self.build_dsym(self.exe_name + "_2")
+        target_2 = self.createTestTarget(file_path=executable_2)
+        self.check_answers(executable_2, ["2", "2", "has_dsym_2"])
+
+    def check_answers(self, name, answers):
+        result = lldb.SBCommandReturnObject()
+        self.interp.HandleCommand("report_command", result)
+        self.assertTrue(
+            result.Succeeded(), f"report_command succeeded {result.GetError()}"
+        )
+
+        cmd_results = result.GetOutput().split()
+        self.assertEqual(answers[0], cmd_results[0], "Right number of module imports")
+        self.assertEqual(answers[1], cmd_results[1], "Right number of target notices")
+        self.assertIn(answers[2], name, "Right target name")
+
+    def build_dsym(self, name):
+        self.build(debug_info="dsym", dictionary={"EXE": name})
+        executable = self.getBuildArtifact(name)
+        dsym_path = self.getBuildArtifact(name + ".dSYM")
+        python_dir_path = dsym_path
+        python_dir_path = os.path.join(dsym_path, "Contents", "Resources", "Python")
+        if not os.path.exists(python_dir_path):
+            os.mkdir(python_dir_path)
+
+        python_file_name = name + ".py"
+
+        module_dest_path = os.path.join(python_dir_path, python_file_name)
+        module_origin_path = os.path.join(self.getSourceDir(), self.py_name)
+        shutil.copy(module_origin_path, module_dest_path)
+
+        return executable
diff --git a/lldb/test/API/macosx/dsym_modules/has_dsym.py b/lldb/test/API/macosx/dsym_modules/has_dsym.py
new file mode 100644
index 0000000000000..babaa9e64cdb8
--- /dev/null
+++ b/lldb/test/API/macosx/dsym_modules/has_dsym.py
@@ -0,0 +1,28 @@
+import lldb
+
+
+def report_command(debugger, command, exe_ctx, result, internal_dict):
+    result.AppendMessage(
+        f'{lldb.num_module_inits} {lldb.num_target_inits} "{lldb.target_name}"'
+    )
+    result.SetStatus(lldb.eReturnStatusSuccessFinishResult)
+
+
+def __lldb_init_module(debugger, internal_dict):
+    # We only want to make one copy of the report command so it will be shared
+    if "has_dsym_1" in __name__:
+        # lldb is a convenient place to store our counters.
+        lldb.num_module_inits = 0
+        lldb.num_target_inits = 0
+        lldb.target_name = ""
+
+    debugger.HandleCommand(
+        f"command script add -o -f '{__name__}.report_command' report_command"
+    )
+
+    lldb.num_module_inits += 1
+
+
+def __lldb_module_added_to_target(target, internal_dict):
+    lldb.num_target_inits += 1
+    target_name = target.executable.fullpath
diff --git a/lldb/test/API/macosx/dsym_modules/main.c b/lldb/test/API/macosx/dsym_modules/main.c
new file mode 100644
index 0000000000000..97dc01f98693b
--- /dev/null
+++ b/lldb/test/API/macosx/dsym_modules/main.c
@@ -0,0 +1,9 @@
+#include <stdio.h>
+
+int global_test_var = 10;
+
+int main() {
+  int test_var = 10;
+  printf("Set a breakpoint here: %d.\n", test_var);
+  return global_test_var;
+}
diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
index 3faeb587c3a91..f7b5e3aeefe17 100644
--- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
+++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp
@@ -230,6 +230,12 @@ bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallModuleInit(
   return false;
 }

+bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallModuleNewTarget(
+    const char *python_module_name, const char *session_dictionary_name,
+    lldb::TargetSP target) {
+  return false;
+}
+
 python::PythonObject
 lldb_private::python::SWIGBridge::LLDBSWIGPythonCreateOSPlugin(
     const char *python_class_name, const char *session_dictionary_name,

From 5c4302442bb07de01c533f6ec766cf14dfdf8b02 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 1 Apr 2025 23:54:10 +0700
Subject: [PATCH 0271/1029] llvm-reduce: Reduce global variable code model
 (#133865)

The current API doesn't have a way to unset it. The query returns an
optional, but the set doesn't. Alternatively I could switch the set to
also use optional.

---
 llvm/include/llvm/IR/GlobalVariable.h | 4 ++++
 llvm/lib/IR/Globals.cpp | 9 +++++++++
 .../tools/llvm-reduce/reduce-code-model.ll | 18 ++++++++++++++++++
 .../llvm-reduce/deltas/ReduceGlobalValues.cpp | 3 ++-
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/tools/llvm-reduce/reduce-code-model.ll

diff --git a/llvm/include/llvm/IR/GlobalVariable.h b/llvm/include/llvm/IR/GlobalVariable.h
index 83e484816d7d4..5ea5d3b11cd9a 100644
--- a/llvm/include/llvm/IR/GlobalVariable.h
+++ b/llvm/include/llvm/IR/GlobalVariable.h
@@ -289,6 +289,10 @@ class GlobalVariable : public GlobalObject, public ilist_node<GlobalVariable> {
   ///
   void setCodeModel(CodeModel::Model CM);

+  /// Remove the code model for this global.
+  ///
+  void clearCodeModel();
+
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const Value *V) {
     return V->getValueID() == Value::GlobalVariableVal;
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 8ca44719a3f94..401f8ac58bce8 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -557,6 +557,15 @@ void GlobalVariable::setCodeModel(CodeModel::Model CM) {
   assert(getCodeModel() == CM && "Code model representation error!");
 }

+void GlobalVariable::clearCodeModel() {
+  unsigned CodeModelData = 0;
+  unsigned OldData = getGlobalValueSubClassData();
+  unsigned NewData = (OldData & ~(CodeModelMask << CodeModelShift)) |
+                     (CodeModelData << CodeModelShift);
+  setGlobalValueSubClassData(NewData);
+  assert(getCodeModel() == std::nullopt && "Code model representation error!");
+}
+
 //===----------------------------------------------------------------------===//
 // GlobalAlias Implementation
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/tools/llvm-reduce/reduce-code-model.ll b/llvm/test/tools/llvm-reduce/reduce-code-model.ll
new file mode 100644
index 0000000000000..898f5995d9826
--- /dev/null
+++ b/llvm/test/tools/llvm-reduce/reduce-code-model.ll
@@ -0,0 +1,18 @@
+; RUN: llvm-reduce -abort-on-invalid-reduction --delta-passes=global-values --test FileCheck --test-arg --check-prefix=INTERESTING --test-arg %s --test-arg --input-file %s -o %t.0
+; RUN: FileCheck --implicit-check-not=define --check-prefix=RESULT %s < %t.0
+
+; INTERESTING: @code_model_large_keep = global i32 0, code_model "large", align 4
+; INTERESTING: @code_model_large_drop = global i32 0
+
+; RESULT: @code_model_large_keep = global i32 0, code_model "large", align 4{{$}}
+; RESULT: @code_model_large_drop = global i32 0, align 4{{$}}
+@code_model_large_keep = global i32 0, code_model "large", align 4
+@code_model_large_drop = global i32 0, code_model "large", align 4
+
+; INTERESTING: @code_model_tiny_keep = global i32 0, code_model "tiny", align 4
+; INTERESTING: @code_model_tiny_drop = global i32 0
+
+; RESULT: @code_model_tiny_keep = global i32 0, code_model "tiny", align 4{{$}}
+; RESULT: @code_model_tiny_drop = global i32 0, align 4{{$}}
+@code_model_tiny_keep = global i32 0, code_model "tiny", align 4
+@code_model_tiny_drop = global i32 0, code_model "tiny", align 4
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp
index e56876c38032e..659bf8dd23eff 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceGlobalValues.cpp
@@ -70,7 +70,8 @@ void llvm::reduceGlobalValuesDeltaPass(Oracle &O, ReducerWorkItem &Program) {
     if (GVar->isExternallyInitialized() && !O.shouldKeep())
       GVar->setExternallyInitialized(false);

-    // TODO: Reduce code model
+    if (GVar->getCodeModel() && !O.shouldKeep())
+      GVar->clearCodeModel();
   }
 }
}

From 69c5049826711022f40d7ce699ffe1d81c3e6f08 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com>
Date: Tue, 1 Apr 2025 11:55:40 -0500
Subject: [PATCH 0272/1029] [NFC][mlir] Update generate script for
 conv_3d_ncdhw_fcdhw (#133927)

https://github.com/llvm/llvm-project/pull/129547 changed the IR directly
without updating the auto-generation script.
Signed-off-by: Nirvedh
---
 mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
index 040663c882a08..48e724d80c926 100644
--- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
+++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py
@@ -1140,7 +1140,7 @@ def conv_3d_ncdhw_fcdhw(
     them to the same data type as the accumulator/output.
     """
     implements(ConvolutionOpInterface)
-    domain(D.n, D.od, D.oh, D.ow, D.f, D.kd, D.kh, D.kw, D.c)
+    domain(D.n, D.f, D.od, D.oh, D.ow, D.c, D.kd, D.kh, D.kw)
     O[D.n, D.f, D.od, D.oh, D.ow] += TypeFn.cast_signed(
         U,
         I[

From 25622aa745671d8d1885250f559c6dac0a8c2aeb Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Tue, 1 Apr 2025 11:59:09 -0500
Subject: [PATCH 0273/1029] [mlir][AMDGPU] Add gfx950 MFMAs to the amdgpu.mfma
 op (#133553)

This commit extends the lowering of amdgpu.mfma to handle the new
double-rate MFMAs in gfx950 and adds tests for these operations.

It also adds support for MFMAs on small floats (f6 and f4), which are
implemented using the "scaled" MFMA intrinsic with a scale value of 0 in
order to have an unscaled MFMA.

This commit does not add an `amdgpu.scaled_mfma` operation, as that is
future work.

---------

Co-authored-by: Jakub Kuderski
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 10 +-
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 163 +++++++++++++++---
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 14 +-
 .../Conversion/AMDGPUToROCDL/mfma-gfx950.mlir | 53 ++++++
 mlir/test/Dialect/AMDGPU/invalid.mlir | 4 +-
 5 files changed, 204 insertions(+), 40 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index c0b3e5540b1df..9cdd961d96ff5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -650,10 +650,12 @@ def AMDGPU_MFMAPermBAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_MFMAPermB,
 
 def MFMAInTypes : AnyTypeOf<[F32, F64, I32, I64,
-                             VectorOfLengthAndType<[4], [F16]>,
-                             VectorOfLengthAndType<[2, 4], [BF16]>,
-                             VectorOfLengthAndType<[4, 8], [I8]>,
-                             VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ, F8E5M2, F8E4M3FN]>]>;
+                             VectorOfLengthAndType<[4, 8], [F16]>,
+                             VectorOfLengthAndType<[2, 4, 8], [BF16]>,
+                             VectorOfLengthAndType<[4, 8, 16], [I8]>,
+                             VectorOfLengthAndType<[8], [F8E5M2FNUZ, F8E4M3FNUZ]>,
+                             VectorOfLengthAndType<[8, 32], [F8E5M2, F8E4M3FN]>,
+                             VectorOfLengthAndType<[32], [F6E2M3FN, F6E3M2FN, F4E2M1FN]>]>;
 def MFMAOutTypes : AnyTypeOf<[F64,
                               VectorOfLengthAndType<[4, 16, 32], [F32]>,
                               VectorOfLengthAndType<[4, 16, 32], [I32]>,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 3acd470cff7f5..56d40d6d123bf 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -22,6 +22,7 @@
 #include "../LLVMCommon/MemRefDescriptor.h"
 
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TypeSwitch.h"
 #include <optional>
 
 namespace mlir {
@@ -36,6 +37,7 @@ using namespace mlir::amdgpu;
 constexpr Chipset kGfx908 = Chipset(9, 0, 8);
 constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
 constexpr Chipset kGfx942 = Chipset(9, 4, 2);
+constexpr Chipset kGfx950 = Chipset(9, 5, 0);
 
 /// Convert an unsigned number `val` to i32.
 static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
@@ -494,8 +496,15 @@ struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
 /// Converts a MFMA vector operand from MLIR AMDGPU dialect convention to ROCDL
 /// and LLVM AMDGPU intrinsics convention.
 ///
 /// Specifically:
-/// 1. If `input` is a vector of N bytes, bitcast it to a (N * 8)-bit integer.
-/// 2. If the element type is bfloat16, bitcast it to i16.
+/// 1. If the element type is bfloat16, bitcast it to i16.
+/// 2. If instead we have a more than 64-bit quantity, use a <N / 4 x i32>
+/// instead, which is what the f8f6f4 intrinsics use.
+/// 3. If `input` is a vector of N <= 8 bytes, bitcast it to a (N * 8)-bit
+/// integer.
+///
+/// Note that the type of `input` has already been LLVM type converted:
+/// therefore 8-bit and smaller floats are represented as their corresponding
+/// `iN` integers.
 static Value convertMFMAVectorOperand(ConversionPatternRewriter &rewriter,
                                       Location loc, Value input) {
   Type inputType = input.getType();
   if (auto vectorType = dyn_cast<VectorType>(inputType)) {
     if (vectorType.getElementType().isBF16())
       return rewriter.create<LLVM::BitcastOp>(
           loc, vectorType.clone(rewriter.getI16Type()), input);
-    if (vectorType.getElementType().isInteger(8)) {
+    if (vectorType.getElementType().isInteger(8) &&
+        vectorType.getNumElements() <= 8)
       return rewriter.create<LLVM::BitcastOp>(
           loc, rewriter.getIntegerType(vectorType.getNumElements() * 8), input);
+    if (isa<IntegerType>(vectorType.getElementType()) &&
+        vectorType.getElementTypeBitWidth() <= 8) {
+      int64_t numWords = llvm::divideCeil(
+          vectorType.getNumElements() * vectorType.getElementTypeBitWidth(),
+          32);
+      return rewriter.create<LLVM::BitcastOp>(
+          loc, VectorType::get(numWords, rewriter.getI32Type()), input);
     }
   }
   return input;
 }
@@ -622,12 +639,8 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
                                                   Chipset chipset) {
   uint32_t m = mfma.getM(), n = mfma.getN(), k = mfma.getK(),
            b = mfma.getBlocks();
-  Type sourceElem = mfma.getSourceA().getType();
-  if (auto sourceType = dyn_cast<VectorType>(sourceElem))
-    sourceElem = sourceType.getElementType();
-  Type destElem = mfma.getDestC().getType();
-  if (auto destType = dyn_cast<VectorType>(destElem))
-    destElem = destType.getElementType();
+  Type sourceElem = getElementTypeOrSelf(mfma.getSourceA().getType());
+  Type destElem = getElementTypeOrSelf(mfma.getDestC().getType());
 
   if (sourceElem.isF32() && destElem.isF32()) {
     if (mfma.getReducePrecision() && chipset >= kGfx942) {
@@ -649,6 +662,12 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   }
 
   if (sourceElem.isF16() && destElem.isF32()) {
+    if (chipset >= kGfx950) {
+      if (m == 32 && n == 32 && k == 16 && b == 1)
+        return ROCDL::mfma_f32_32x32x16_f16::getOperationName();
+      if (m == 16 && n == 16 && k == 32 && b == 1)
+        return ROCDL::mfma_f32_16x16x32_f16::getOperationName();
+    }
     if (m == 32 && n == 32 && k == 4 && b == 2)
       return ROCDL::mfma_f32_32x32x4f16::getOperationName();
     if (m == 16 && n == 16 && k == 4 && b == 4)
@@ -661,20 +680,25 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
       return ROCDL::mfma_f32_16x16x16f16::getOperationName();
   }
 
-  if (sourceElem.isBF16() && destElem.isF32() && chipset >= kGfx90a) {
-    if (m == 32 && n == 32 && k == 4 && b == 2)
-      return ROCDL::mfma_f32_32x32x4bf16_1k::getOperationName();
-    if (m == 16 && n == 16 && k == 4 && b == 4)
-      return ROCDL::mfma_f32_16x16x4bf16_1k::getOperationName();
-    if (m == 4 && n == 4 && k == 4 && b == 16)
-      return ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName();
-    if (m == 32 && n == 32 && k == 8 && b == 1)
-      return ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName();
-    if (m == 16 && n == 16 && k == 16 && b == 1)
-      return ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName();
-  }
-
   if (sourceElem.isBF16() && destElem.isF32()) {
+    if (chipset >= kGfx950) {
+      if (m == 32 && n == 32 && k == 16 && b == 1)
+        return ROCDL::mfma_f32_32x32x16_bf16::getOperationName();
+      if (m == 16 && n == 16 && k == 32 && b == 1)
+        return ROCDL::mfma_f32_16x16x32_bf16::getOperationName();
+    }
+    if (chipset >= kGfx90a) {
+      if (m == 32 && n == 32 && k == 4 && b == 2)
+        return ROCDL::mfma_f32_32x32x4bf16_1k::getOperationName();
+      if (m == 16 && n == 16 && k == 4 && b == 4)
+        return ROCDL::mfma_f32_16x16x4bf16_1k::getOperationName();
+      if (m == 4 && n == 4 && k == 4 && b == 16)
+        return ROCDL::mfma_f32_4x4x4bf16_1k::getOperationName();
+      if (m == 32 && n == 32 && k == 8 && b == 1)
+        return ROCDL::mfma_f32_32x32x8bf16_1k::getOperationName();
+      if (m == 16 && n == 16 && k == 16 && b == 1)
+        return ROCDL::mfma_f32_16x16x16bf16_1k::getOperationName();
+    }
     if (m == 32 && n == 32 && k == 2 && b == 2)
       return ROCDL::mfma_f32_32x32x2bf16::getOperationName();
     if (m == 16 && n == 16 && k == 2 && b == 4)
@@ -687,7 +711,13 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
       return ROCDL::mfma_f32_16x16x8bf16::getOperationName();
   }
 
-  if (isa<IntegerType>(sourceElem) && destElem.isInteger(32)) {
+  if (sourceElem.isInteger(8) && destElem.isInteger(32)) {
+    if (chipset >= kGfx950) {
+      if (m == 32 && n == 32 && k == 32 && b == 1)
+        return ROCDL::mfma_i32_32x32x32_i8::getOperationName();
+      if (m == 16 && n == 16 && k == 64 && b == 1)
+        return ROCDL::mfma_i32_16x16x64_i8::getOperationName();
+    }
     if (m == 32 && n == 32 && k == 4 && b == 2)
       return ROCDL::mfma_i32_32x32x4i8::getOperationName();
     if (m == 16 && n == 16 && k == 4 && b == 4)
@@ -750,6 +780,59 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
   return std::nullopt;
 }
 
+static std::optional<uint32_t> mfmaTypeSelectCode(Type mlirElemType) {
+  return llvm::TypeSwitch<Type, std::optional<uint32_t>>(mlirElemType)
+      .Case([](Float8E4M3FNType) { return 0u; })
+      .Case([](Float8E5M2Type) { return 1u; })
+      .Case([](Float6E2M3FNType) { return 2u; })
+      .Case([](Float6E3M2FNType) { return 3u; })
+      .Case([](Float4E2M1FNType) { return 4u; })
+      .Default([](Type) { return std::nullopt; });
+}
+
+/// If there is a scaled MFMA instruction for the input element types `aType`
+/// and `bType`, output type `destType`, problem size M, N, K, and B (number of
+/// blocks) on the given `chipset`, return a tuple consisting of the
+/// OperationName of the intrinsic and the type codes that need to be passed to
+/// that intrinsic. Note that this is also used to implement some un-scaled
+/// MFMAs, since the compiler represents the ordinary instruction as a "scaled"
+/// MFMA with a scale of 0.
+static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
+mfmaOpToScaledIntrinsic(Type aType, Type bType, Type destType, uint32_t m,
+                        uint32_t n, uint32_t k, uint32_t b, Chipset chipset) {
+  aType = getElementTypeOrSelf(aType);
+  bType = getElementTypeOrSelf(bType);
+  destType = getElementTypeOrSelf(destType);
+
+  if (chipset < kGfx950)
+    return std::nullopt;
+  if (!isa<Float32Type>(destType))
+    return std::nullopt;
+
+  std::optional<uint32_t> aTypeCode = mfmaTypeSelectCode(aType);
+  std::optional<uint32_t> bTypeCode = mfmaTypeSelectCode(bType);
+  if (!aTypeCode || !bTypeCode)
+    return std::nullopt;
+
+  if (m == 32 && n == 32 && k == 64 && b == 1)
+    return std::tuple{ROCDL::mfma_scale_f32_32x32x64_f8f6f4::getOperationName(),
+                      *aTypeCode, *bTypeCode};
+  if (m == 16 && n == 16 && k == 128 && b == 1)
+    return std::tuple{
+        ROCDL::mfma_scale_f32_16x16x128_f8f6f4::getOperationName(), *aTypeCode,
+        *bTypeCode};
+
+  return std::nullopt;
+}
+
+static std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
+mfmaOpToScaledIntrinsic(MFMAOp mfma, Chipset chipset) {
+  return mfmaOpToScaledIntrinsic(
+      mfma.getSourceA().getType(), mfma.getSourceB().getType(),
+      mfma.getDestC().getType(), mfma.getM(), mfma.getN(), mfma.getK(),
+      mfma.getBlocks(), chipset);
+}
+
 /// Return the `rocdl` intrinsic corresponding to a WMMA operation `wmma`
 /// if one exists. This includes checking to ensure the intrinsic is supported
 /// on the architecture you are compiling for.
@@ -829,16 +912,40 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
           op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
     }
     std::optional<StringRef> maybeIntrinsic = mfmaOpToIntrinsic(op, chipset);
-    if (!maybeIntrinsic.has_value())
+    std::optional<std::tuple<StringRef, uint32_t, uint32_t>>
+        maybeScaledIntrinsic = mfmaOpToScaledIntrinsic(op, chipset);
+    if (!maybeIntrinsic.has_value() && !maybeScaledIntrinsic.has_value())
       return op.emitOpError("no intrinsic matching MFMA size on given chipset");
-    OperationState loweredOp(loc, *maybeIntrinsic);
+
+    bool isScaled =
+        !maybeIntrinsic.has_value() && maybeScaledIntrinsic.has_value();
+    if (isScaled &&
+        (adaptor.getAbid() > 0 || getBlgpField > 0 || op.getCbsz() > 0)) {
+      return op.emitOpError(
+          "non-default abid, blgp, and cbsz aren't supported on MFMAs that can "
+          "be scaled as those fields are used for type information");
+    }
+
+    StringRef intrinsicName =
+        isScaled ? std::get<0>(*maybeScaledIntrinsic) : *maybeIntrinsic;
+    OperationState loweredOp(loc, intrinsicName);
     loweredOp.addTypes(intrinsicOutType);
     loweredOp.addOperands(
         {convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceA()),
         convertMFMAVectorOperand(rewriter, loc, adaptor.getSourceB()),
-         adaptor.getDestC(), createI32Constant(rewriter, loc, op.getCbsz()),
-         createI32Constant(rewriter, loc, op.getAbid()),
-         createI32Constant(rewriter, loc, getBlgpField)});
+         adaptor.getDestC()});
+    if (isScaled) {
+      Value zero = createI32Constant(rewriter, loc, 0);
+      auto [_scaledName, aTypeCode, bTypeCode] = *maybeScaledIntrinsic;
+      loweredOp.addOperands({createI32Constant(rewriter, loc, aTypeCode),
+                             createI32Constant(rewriter, loc, bTypeCode),
+                             /*scale A byte=*/zero, /*scale A=*/zero,
+                             /*scale B byte=*/zero, /*scale B=*/zero});
+    } else {
+      loweredOp.addOperands({createI32Constant(rewriter, loc, op.getCbsz()),
+                             createI32Constant(rewriter, loc, op.getAbid()),
+                             createI32Constant(rewriter, loc, getBlgpField)});
+    };
     Value lowered = rewriter.create(loweredOp)->getResult(0);
     if (outType != intrinsicOutType)
       lowered = rewriter.create<LLVM::BitcastOp>(loc, outType, lowered);
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 2b2a167b90c82..1e482515a4ee0 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -341,22 +341,24 @@ LogicalResult MFMAOp::verify() {
   }
 
   Type sourceBType = getSourceB().getType();
-  if (sourceElem.isFloat(8)) {
+  if (sourceElem.isFloat(8) || sourceElem.isFloat(6) || sourceElem.isFloat(4)) {
     int64_t sourceBLen = 1;
     Type sourceBElem = sourceBType;
     if (auto sourceBVector = llvm::dyn_cast<VectorType>(sourceBType)) {
       sourceBLen = sourceBVector.getNumElements();
       sourceBElem = sourceBVector.getElementType();
     }
-    if (!sourceBElem.isFloat(8))
-      return emitOpError("expected both source operands to have f8 elements");
+    if (!sourceBElem.isFloat(8) && !sourceBElem.isFloat(6) &&
+        !sourceBElem.isFloat(4))
+      return emitOpError("expected both source operands to have small-float "
+                         "elements if one does");
     if (sourceLen != sourceBLen)
       return emitOpError(
-          "expected both f8 source vectors to have the same length");
+          "expected both small-float source vectors to have the same length");
   } else {
     if (sourceType != sourceBType)
-      return emitOpError(
-          "expected both non-f8 source operand types to match exactly");
+      return emitOpError("expected both non-small-float source operand types "
+                         "to match exactly");
   }
   // Normalize the wider integer types the compiler expects to i8
   if (sourceElem.isInteger(32)) {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
new file mode 100644
index 0000000000000..de63f249bb530
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma-gfx950.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx950 -cse | FileCheck %s
+func.func @mfma_to_rocdl(%arg0 : vector<8xf16>, %arg1 : vector<16xf32>,
+                         %arg2 : vector<4xf32>, %arg3 : vector<8xbf16>,
+                         %arg4 : vector<16xi8>, %arg5 : vector<16xi32>,
+                         %arg6 : vector<4xi32>, %arg7 : vector<32xf8E4M3FN>,
+                         %arg8 : vector<32xf8E5M2>, %arg9 : vector<32xf6E2M3FN>,
+                         %arg10 : vector<32xf6E3M2FN>, %arg11 : vector<32xf4E2M1FN>) {
+  // CHECK: %[[c0:.+]] = llvm.mlir.constant(0 : i32) : i32
+
+  // CHECK: rocdl.mfma.f32.32x32x16.f16{{.*}}: (vector<8xf16>, vector<8xf16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32>
+  amdgpu.mfma %arg0 * %arg0 + %arg1 { abid = 0 :
i32, cbsz = 0 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<8xf16>, vector<8xf16>, vector<16xf32> + // CHECK: rocdl.mfma.f32.16x16x32.f16{{.*}}: (vector<8xf16>, vector<8xf16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg0 * %arg0 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<8xf16>, vector<8xf16>, vector<4xf32> + // CHECK: rocdl.mfma.f32.32x32x16.bf16{{.*}}: (vector<8xi16>, vector<8xi16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + amdgpu.mfma %arg3 * %arg3 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 16 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<8xbf16>, vector<8xbf16>, vector<16xf32> + // CHECK: rocdl.mfma.f32.16x16x32.bf16{{.*}}: (vector<8xi16>, vector<8xi16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg3 * %arg3 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 32 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<8xbf16>, vector<8xbf16>, vector<4xf32> + // CHECK: rocdl.mfma.i32.32x32x32.i8{{.*}}: (vector<4xi32>, vector<4xi32>, vector<16xi32>, i32, i32, i32) -> vector<16xi32> + amdgpu.mfma %arg4 * %arg4 + %arg5 { abid = 0 : i32, cbsz = 0 : i32, k = 32 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<16xi8>, vector<16xi8>, vector<16xi32> + // CHECK: rocdl.mfma.i32.16x16x64.i8{{.*}}: (vector<4xi32>, vector<4xi32>, vector<4xi32>, i32, i32, i32) -> vector<4xi32> + amdgpu.mfma %arg4 * %arg4 + %arg6 { abid = 0 : i32, cbsz = 0 : i32, k = 64 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<16xi8>, vector<16xi8>, vector<4xi32> + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c0]], %[[c0]], %[[c0]], %[[c0]]{{.*}}: (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + amdgpu.mfma %arg7 * %arg7 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 64 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<16xf32> + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c0]], %[[c0]], %[[c0]], %[[c0]]{{.*}}: (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg7 * %arg7 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 128 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<32xf8E4M3FN>, vector<32xf8E4M3FN>, vector<4xf32> + // CHECK: %[[c1:.+]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c1]], %[[c1]], %[[c0]], %[[c0]]{{.*}}: (vector<8xi32>, vector<8xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + amdgpu.mfma %arg8 * %arg8 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 64 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<16xf32> + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c1]], %[[c1]], %[[c0]], %[[c0]]{{.*}}: (vector<8xi32>, vector<8xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg8 * %arg8 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 128 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<32xf8E5M2>, vector<32xf8E5M2>, vector<4xf32> + // CHECK: %[[c2:.+]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c2]], %[[c2]], %[[c0]], %[[c0]]{{.*}}: (vector<6xi32>, vector<6xi32>, vector<16xf32>, 
i32, i32, i32, i32, i32, i32) -> vector<16xf32> + amdgpu.mfma %arg9 * %arg9 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 64 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<32xf6E2M3FN>, vector<32xf6E2M3FN>, vector<16xf32> + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c2]], %[[c2]], %[[c0]], %[[c0]]{{.*}}: (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg9 * %arg9 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 128 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<32xf6E2M3FN>, vector<32xf6E2M3FN>, vector<4xf32> + // CHECK: %[[c3:.+]] = llvm.mlir.constant(3 : i32) : i32 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c3]], %[[c3]], %[[c0]], %[[c0]]{{.*}}: (vector<6xi32>, vector<6xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + amdgpu.mfma %arg10 * %arg10 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 64 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<32xf6E3M2FN>, vector<32xf6E3M2FN>, vector<16xf32> + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c3]], %[[c3]], %[[c0]], %[[c0]]{{.*}}: (vector<6xi32>, vector<6xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg10 * %arg10 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 128 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<32xf6E3M2FN>, vector<32xf6E3M2FN>, vector<4xf32> + // CHECK-DAG: %[[c4:.+]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c4]], %[[c4]], %[[c0]], %[[c0]]{{.*}}: (vector<4xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + amdgpu.mfma %arg11 * %arg11 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 64 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<32xf4E2M1FN>, vector<32xf4E2M1FN>, vector<16xf32> + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c4]], %[[c4]], %[[c0]], %[[c0]]{{.*}}: (vector<4xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg11 * %arg11 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 128 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<32xf4E2M1FN>, vector<32xf4E2M1FN>, vector<4xf32> + + // CHECK: rocdl.mfma.scale.f32.32x32x64.f8f6f4{{.*}}, %[[c2]], %[[c4]], %[[c0]], %[[c0]]{{.*}}: (vector<6xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32, i32, i32, i32) -> vector<16xf32> + amdgpu.mfma %arg9 * %arg11 + %arg1 { abid = 0 : i32, cbsz = 0 : i32, k = 64 : i32, m = 32 : i32, n = 32 : i32, blocks = 1 : i32 } blgp = none : vector<32xf6E2M3FN>, vector<32xf4E2M1FN>, vector<16xf32> + // CHECK: rocdl.mfma.scale.f32.16x16x128.f8f6f4{{.*}}, %[[c2]], %[[c4]], %[[c0]], %[[c0]]{{.*}}: (vector<6xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32, i32, i32, i32) -> vector<4xf32> + amdgpu.mfma %arg9 * %arg11 + %arg2 { abid = 0 : i32, cbsz = 0 : i32, k = 128 : i32, m = 16 : i32, n = 16 : i32, blocks = 1 : i32 } blgp = none : vector<32xf6E2M3FN>, vector<32xf4E2M1FN>, vector<4xf32> + + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 7cb16f5259070..74a421f6dd50f 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -18,7 +18,7 @@ func.func @mixing_packed_stoch_round_types(%arg0: f32, %arg1: i32, %arg2: vector func.func @bad_source_types(%a: vector<2xf32>, %b: vector<4xf16>, %c: 
vector<32xf32>) -> vector<32xf32> { - // expected-error@+1 {{'amdgpu.mfma' op expected both non-f8 source operand types to match exactly}} + // expected-error@+1 {{'amdgpu.mfma' op expected both non-small-float source operand types to match exactly}} %d = amdgpu.mfma %a * %b + %c { m = 32 : i32, n = 32 : i32, k = 1 : i32, blocks = 2 : i32, abid = 0 : i32, cbsz = 0 : i32} blgp = none : vector<2xf32>, vector<4xf16>, vector<32xf32> @@ -29,7 +29,7 @@ func.func @bad_source_types(%a: vector<2xf32>, %b: vector<4xf16>, func.func @bad_source_types_f8(%a: vector<8xf8E5M2FNUZ>, %b: vector<8xi8>, %c: vector<32xf32>) -> vector<32xf32> { - // expected-error@+1 {{'amdgpu.mfma' op expected both source operands to have f8 elements}} + // expected-error@+1 {{'amdgpu.mfma' op expected both source operands to have small-float elements if one does}} %d = amdgpu.mfma %a * %b + %c { m = 32 : i32, n = 32 : i32, k = 1 : i32, blocks = 2 : i32, abid = 0 : i32, cbsz = 0 : i32} blgp = none : vector<8xf8E5M2FNUZ>, vector<8xi8>, vector<32xf32> From f60eed934493c2d0305d4d597d39813a12db42ef Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 2 Apr 2025 00:03:43 +0700 Subject: [PATCH 0274/1029] llvm-reduce: Add target-features-attr reduction (#133887) Try to reduce individual subtarget features in the "target-features" attribute. This attempts a textual removal of the fields in the string, not a semantic removal. Typically there's a lot of redundant feature spam in the feature list implied by the target-cpu (which I really wish clang would stop emitting). If we could parse these out, we could easily drop the fields without testing anything. --- .../reduce-target-features-attr.ll | 72 +++++++++++++++++++ llvm/tools/llvm-reduce/CMakeLists.txt | 1 + llvm/tools/llvm-reduce/DeltaManager.cpp | 1 + llvm/tools/llvm-reduce/DeltaPasses.def | 1 + .../deltas/ReduceTargetFeaturesAttr.cpp | 56 +++++++++++++++ .../deltas/ReduceTargetFeaturesAttr.h | 23 ++++++ .../secondary/llvm/tools/llvm-reduce/BUILD.gn | 1 + 7 files changed, 155 insertions(+) create mode 100644 llvm/test/tools/llvm-reduce/reduce-target-features-attr.ll create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.cpp create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.h diff --git a/llvm/test/tools/llvm-reduce/reduce-target-features-attr.ll b/llvm/test/tools/llvm-reduce/reduce-target-features-attr.ll new file mode 100644 index 0000000000000..b497758437358 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/reduce-target-features-attr.ll @@ -0,0 +1,72 @@ +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=target-features-attr --test FileCheck --test-arg -enable-var-scope --test-arg --check-prefixes=INTERESTING,CHECK --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck -check-prefixes=RESULT,CHECK %s < %t + +; CHECK: @keep_none_from_one() [[$KEEP_NONE_FROM_ONE:#[0-9]+]] +define void @keep_none_from_one() #0 { + ret void +} + +; CHECK: @keep_one_from_one() [[$KEEP_ONE_FROM_ONE:#[0-9]+]] +define void @keep_one_from_one() #1 { + ret void +} + +; CHECK: @keep_first_from_two() [[$KEEP_FIRST_FROM_TWO:#[0-9]+]] +define void @keep_first_from_two() #2 { + ret void +} + +; CHECK: @keep_second_from_two() [[$KEEP_SECOND_FROM_TWO:#[0-9]+]] +define void @keep_second_from_two() #3 { + ret void +} + +; CHECK: @keep_all_of_two() [[$KEEP_ALL_OF_TWO:#[0-9]+]] +define void @keep_all_of_two() #4 { + ret void +} + +; CHECK: @drop_empty_element() [[$DROP_EMPTY_ELEMENT:#[0-9]+]] +define void @drop_empty_element() #5 { + ret void +} + +; 
CHECK: @keep_second_from_three() [[$KEEP_SECOND_FROM_THREE:#[0-9]+]] +define void @keep_second_from_three() #6 { + ret void +} + +; RESULT: define void @no_target_features() { +define void @no_target_features() { + ret void +} + +; IR verifier should probably reject this +; RESULT: define void @no_target_features_value() { +define void @no_target_features_value() #7 { + ret void +} + +attributes #0 = { "target-features"="+foo" "unique-attr-0" } +attributes #1 = { "target-features"="+foo" "unique-attr-1" } +attributes #2 = { "target-features"="+first,+second" "unique-attr-2" } +attributes #3 = { "target-features"="+first,+second" "unique-attr-3" } +attributes #4 = { "target-features"="+first,+second" "unique-attr-4" } +attributes #5 = { "target-features"="+dead,,+beef" "unique-attr-5" } +attributes #6 = { "target-features"="+a,+b,+c" "unique-attr-6" } +attributes #7 = { "target-features" } + +; INTERESTING-DAG: [[$KEEP_ONE_FROM_ONE]] = { "target-features"="+foo" +; INTERESTING-DAG: [[$KEEP_FIRST_FROM_TWO]] = { "target-features"="{{.*}}+first +; INTERESTING-DAG: [[$KEEP_SECOND_FROM_TWO]] = { "target-features"="{{.*}}+second +; INTERESTING-DAG: [[$KEEP_ALL_OF_TWO]] = { "target-features"="{{.*}}+first,+second +; INTERESTING-DAG: [[$DROP_EMPTY_ELEMENT]] = { "target-features"="{{.*}}+dead{{.*}}+beef +; INTERESTING-DAG: [[$KEEP_SECOND_FROM_THREE]] = { "target-features"="{{.*}}+b + + +; RESULT-DAG: attributes [[$KEEP_NONE_FROM_ONE]] = { "unique-attr-0" } +; RESULT-DAG: [[$KEEP_FIRST_FROM_TWO]] = { "target-features"="+first" "unique-attr-2" } +; RESULT-DAG: [[$KEEP_SECOND_FROM_TWO]] = { "target-features"="+second" "unique-attr-3" } +; RESULT-DAG: [[$KEEP_ALL_OF_TWO]] = { "target-features"="+first,+second" "unique-attr-4" } +; RESULT-DAG: [[$DROP_EMPTY_ELEMENT]] = { "target-features"="+dead,+beef" "unique-attr-5" } +; RESULT-DAG: [[$KEEP_SECOND_FROM_THREE]] = { "target-features"="+b" "unique-attr-6" } diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt index b8ad6f71b41e5..d1423e4c0895d 100644 --- a/llvm/tools/llvm-reduce/CMakeLists.txt +++ b/llvm/tools/llvm-reduce/CMakeLists.txt @@ -58,6 +58,7 @@ add_llvm_tool(llvm-reduce deltas/ReduceRegisterMasks.cpp deltas/ReduceRegisterDefs.cpp deltas/ReduceRegisterUses.cpp + deltas/ReduceTargetFeaturesAttr.cpp deltas/ReduceUsingSimplifyCFG.cpp deltas/RunIRPasses.cpp deltas/SimplifyInstructions.cpp diff --git a/llvm/tools/llvm-reduce/DeltaManager.cpp b/llvm/tools/llvm-reduce/DeltaManager.cpp index 5281b1d5aebf2..74c1f1bfa5ea8 100644 --- a/llvm/tools/llvm-reduce/DeltaManager.cpp +++ b/llvm/tools/llvm-reduce/DeltaManager.cpp @@ -45,6 +45,7 @@ #include "deltas/ReduceRegisterMasks.h" #include "deltas/ReduceRegisterUses.h" #include "deltas/ReduceSpecialGlobals.h" +#include "deltas/ReduceTargetFeaturesAttr.h" #include "deltas/ReduceUsingSimplifyCFG.h" #include "deltas/ReduceVirtualRegisters.h" #include "deltas/RunIRPasses.h" diff --git a/llvm/tools/llvm-reduce/DeltaPasses.def b/llvm/tools/llvm-reduce/DeltaPasses.def index 060daf198c76a..4c9c581924321 100644 --- a/llvm/tools/llvm-reduce/DeltaPasses.def +++ b/llvm/tools/llvm-reduce/DeltaPasses.def @@ -45,6 +45,7 @@ DELTA_PASS_IR("operands-to-args", reduceOperandsToArgsDeltaPass, "Converting ope DELTA_PASS_IR("operands-skip", reduceOperandsSkipDeltaPass, "Reducing operands by skipping over instructions") DELTA_PASS_IR("operand-bundles", reduceOperandBundesDeltaPass, "Reducing Operand Bundles") DELTA_PASS_IR("attributes", reduceAttributesDeltaPass, "Reducing Attributes") 
+DELTA_PASS_IR("target-features-attr", reduceTargetFeaturesAttrDeltaPass, "Reducing target-features") DELTA_PASS_IR("module-data", reduceModuleDataDeltaPass, "Reducing Module Data") DELTA_PASS_IR("opcodes", reduceOpcodesDeltaPass, "Reducing Opcodes") DELTA_PASS_IR("volatile", reduceVolatileInstructionsDeltaPass, "Reducing Volatile Instructions") diff --git a/llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.cpp b/llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.cpp new file mode 100644 index 0000000000000..d4c25f23a2689 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.cpp @@ -0,0 +1,56 @@ +//===- ReduceTargetFeaturesAttr.cpp - Specialized Delta Pass --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Attempt to remove individual elements of the "target-features" attribute on +// functions. +// +//===----------------------------------------------------------------------===// + +#include "ReduceTargetFeaturesAttr.h" + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Function.h" + +// TODO: We could maybe do better if we did a semantic parse of the attributes +// through MCSubtargetInfo. Features can be flipped on and off in the string, +// some are implied by target-cpu and can't be meaningfully re-added. +void llvm::reduceTargetFeaturesAttrDeltaPass(Oracle &O, + ReducerWorkItem &WorkItem) { + Module &M = WorkItem.getModule(); + SmallString<256> NewValueString; + SmallVector SplitFeatures; + + for (Function &F : M) { + Attribute TargetFeaturesAttr = F.getFnAttribute("target-features"); + if (!TargetFeaturesAttr.isValid()) + continue; + + StringRef TargetFeatures = TargetFeaturesAttr.getValueAsString(); + TargetFeatures.split(SplitFeatures, ',', /*MaxSplit=*/-1, + /*KeepEmpty=*/false); + + ListSeparator LS(","); + + { + raw_svector_ostream OS(NewValueString); + for (StringRef Feature : SplitFeatures) { + if (O.shouldKeep()) + OS << LS << Feature; + } + } + + if (NewValueString.empty()) + F.removeFnAttr("target-features"); + else + F.addFnAttr("target-features", NewValueString); + + SplitFeatures.clear(); + NewValueString.clear(); + } +} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.h b/llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.h new file mode 100644 index 0000000000000..6bb435131f496 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceTargetFeaturesAttr.h @@ -0,0 +1,23 @@ +//===- ReduceTargetFeaturesAttr.h - Specialized Delta Pass ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting attributes. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCETARGETFEATURESATTR_H +#define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCETARGETFEATURESATTR_H + +#include "Delta.h" + +namespace llvm { +void reduceTargetFeaturesAttrDeltaPass(Oracle &O, ReducerWorkItem &WorkItem); +} // namespace llvm + +#endif diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn index a576ea8af4374..ceff84dfe640a 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn @@ -47,6 +47,7 @@ executable("llvm-reduce") { "deltas/ReduceRegisterDefs.cpp", "deltas/ReduceRegisterMasks.cpp", "deltas/ReduceRegisterUses.cpp", + "deltas/ReduceTargetFeaturesAttr.cpp", "deltas/ReduceSpecialGlobals.cpp", "deltas/ReduceUsingSimplifyCFG.cpp", "deltas/ReduceVirtualRegisters.cpp", From 75242a8a1d38815b5533c3951d5beceb8033d9fa Mon Sep 17 00:00:00 2001 From: Paul Bowen-Huggett Date: Tue, 1 Apr 2025 19:05:30 +0200 Subject: [PATCH 0275/1029] [RISCV] Fix the c_slli disassembler test (NFC) (#133921) This change fixes the exhaustive test for the c.slli instruction so that each hex pattern now appears only once. The problem was spotted [here](https://github.com/llvm/llvm-project/pull/133713#discussion_r2021577609) by @topperc (for which, thank you). --- llvm/test/MC/Disassembler/RISCV/c_slli.txt | 10368 +++++++++++++------ 1 file changed, 7200 insertions(+), 3168 deletions(-) diff --git a/llvm/test/MC/Disassembler/RISCV/c_slli.txt b/llvm/test/MC/Disassembler/RISCV/c_slli.txt index 98ff2b132cc85..25204555d84e0 100644 --- a/llvm/test/MC/Disassembler/RISCV/c_slli.txt +++ b/llvm/test/MC/Disassembler/RISCV/c_slli.txt @@ -14,3202 +14,7234 @@ # RUN: -M no-aliases --show-encoding < %s 2>&1 | \ # RUN: FileCheck --check-prefix=NOHINTS %s -0x02 0x00 # GOOD: c.slli64 zero -0x02 0x00 # NOHINTS: invalid instruction encoding -0x06 0x00 # GOOD: c.slli zero, 1 -0x06 0x00 # NOHINTS: invalid instruction encoding -0x0A 0x00 # GOOD: c.slli zero, 2 -0x0A 0x00 # NOHINTS: invalid instruction encoding -0x0E 0x00 # GOOD: c.slli zero, 3 -0x0E 0x00 # NOHINTS: invalid instruction encoding -0x12 0x00 # GOOD: c.slli zero, 4 -0x12 0x00 # NOHINTS: invalid instruction encoding -0x16 0x00 # GOOD: c.slli zero, 5 -0x16 0x00 # NOHINTS: invalid instruction encoding -0x1A 0x00 # GOOD: c.slli zero, 6 -0x1A 0x00 # NOHINTS: invalid instruction encoding -0x1E 0x00 # GOOD: c.slli zero, 7 -0x1E 0x00 # NOHINTS: invalid instruction encoding -0x22 0x00 # GOOD: c.slli zero, 8 -0x22 0x00 # NOHINTS: invalid instruction encoding -0x26 0x00 # GOOD: c.slli zero, 9 -0x26 0x00 # NOHINTS: invalid instruction encoding -0x2A 0x00 # GOOD: c.slli zero, 10 -0x2A 0x00 # NOHINTS: invalid instruction encoding -0x2E 0x00 # GOOD: c.slli zero, 11 -0x2E 0x00 # NOHINTS: invalid instruction encoding -0x32 0x00 # GOOD: c.slli zero, 12 -0x32 0x00 # NOHINTS: invalid instruction encoding -0x36 0x00 # GOOD: c.slli zero, 13 -0x36 0x00 # NOHINTS: invalid instruction encoding -0x3A 0x00 # GOOD: c.slli zero, 14 -0x3A 0x00 # NOHINTS: invalid instruction encoding -0x3E 0x00 # GOOD: c.slli zero, 15 -0x3E 0x00 # NOHINTS: invalid instruction encoding -0x42 0x00 # GOOD: c.slli zero, 16 -0x42 0x00 # NOHINTS: invalid instruction encoding -0x46 0x00 # GOOD: c.slli zero, 17 -0x46 0x00 # NOHINTS: invalid instruction encoding -0x4A 0x00 # GOOD: c.slli zero, 18 -0x4A 0x00 # NOHINTS: invalid instruction encoding -0x4E 0x00 # 
GOOD: c.slli zero, 19 -0x4E 0x00 # NOHINTS: invalid instruction encoding -0x52 0x00 # GOOD: c.slli zero, 20 -0x52 0x00 # NOHINTS: invalid instruction encoding -0x56 0x00 # GOOD: c.slli zero, 21 -0x56 0x00 # NOHINTS: invalid instruction encoding -0x5A 0x00 # GOOD: c.slli zero, 22 -0x5A 0x00 # NOHINTS: invalid instruction encoding -0x5E 0x00 # GOOD: c.slli zero, 23 -0x5E 0x00 # NOHINTS: invalid instruction encoding -0x62 0x00 # GOOD: c.slli zero, 24 -0x62 0x00 # NOHINTS: invalid instruction encoding -0x66 0x00 # GOOD: c.slli zero, 25 -0x66 0x00 # NOHINTS: invalid instruction encoding -0x6A 0x00 # GOOD: c.slli zero, 26 -0x6A 0x00 # NOHINTS: invalid instruction encoding -0x6E 0x00 # GOOD: c.slli zero, 27 -0x6E 0x00 # NOHINTS: invalid instruction encoding -0x72 0x00 # GOOD: c.slli zero, 28 -0x72 0x00 # NOHINTS: invalid instruction encoding -0x76 0x00 # GOOD: c.slli zero, 29 -0x76 0x00 # NOHINTS: invalid instruction encoding -0x7A 0x00 # GOOD: c.slli zero, 30 -0x7A 0x00 # NOHINTS: invalid instruction encoding -0x7E 0x00 # GOOD: c.slli zero, 31 -0x7E 0x00 # NOHINTS: invalid instruction encoding -0x02 0x10 # BAD32: invalid instruction encoding -0x02 0x10 # GOOD64: c.slli zero, 32 -0x02 0x10 # NOHINTS: invalid instruction encoding -0x06 0x10 # BAD32: invalid instruction encoding -0x06 0x10 # GOOD64: c.slli zero, 33 -0x06 0x10 # NOHINTS: invalid instruction encoding -0x0A 0x10 # BAD32: invalid instruction encoding -0x0A 0x10 # GOOD64: c.slli zero, 34 -0x0A 0x10 # NOHINTS: invalid instruction encoding -0x0E 0x10 # BAD32: invalid instruction encoding -0x0E 0x10 # GOOD64: c.slli zero, 35 -0x0E 0x10 # NOHINTS: invalid instruction encoding -0x12 0x10 # BAD32: invalid instruction encoding -0x12 0x10 # GOOD64: c.slli zero, 36 -0x12 0x10 # NOHINTS: invalid instruction encoding -0x16 0x10 # BAD32: invalid instruction encoding -0x16 0x10 # GOOD64: c.slli zero, 37 -0x16 0x10 # NOHINTS: invalid instruction encoding -0x1A 0x10 # BAD32: invalid instruction encoding -0x1A 0x10 # GOOD64: c.slli zero, 38 -0x1A 0x10 # NOHINTS: invalid instruction encoding -0x1E 0x10 # BAD32: invalid instruction encoding -0x1E 0x10 # GOOD64: c.slli zero, 39 -0x1E 0x10 # NOHINTS: invalid instruction encoding -0x22 0x10 # BAD32: invalid instruction encoding -0x22 0x10 # GOOD64: c.slli zero, 40 -0x22 0x10 # NOHINTS: invalid instruction encoding -0x26 0x10 # BAD32: invalid instruction encoding -0x26 0x10 # GOOD64: c.slli zero, 41 -0x26 0x10 # NOHINTS: invalid instruction encoding -0x2A 0x10 # BAD32: invalid instruction encoding -0x2A 0x10 # GOOD64: c.slli zero, 42 -0x2A 0x10 # NOHINTS: invalid instruction encoding -0x2E 0x10 # BAD32: invalid instruction encoding -0x2E 0x10 # GOOD64: c.slli zero, 43 -0x2E 0x10 # NOHINTS: invalid instruction encoding -0x32 0x10 # BAD32: invalid instruction encoding -0x32 0x10 # GOOD64: c.slli zero, 44 -0x32 0x10 # NOHINTS: invalid instruction encoding -0x36 0x10 # BAD32: invalid instruction encoding -0x36 0x10 # GOOD64: c.slli zero, 45 -0x36 0x10 # NOHINTS: invalid instruction encoding -0x3A 0x10 # BAD32: invalid instruction encoding -0x3A 0x10 # GOOD64: c.slli zero, 46 -0x3A 0x10 # NOHINTS: invalid instruction encoding -0x3E 0x10 # BAD32: invalid instruction encoding -0x3E 0x10 # GOOD64: c.slli zero, 47 -0x3E 0x10 # NOHINTS: invalid instruction encoding -0x42 0x10 # BAD32: invalid instruction encoding -0x42 0x10 # GOOD64: c.slli zero, 48 -0x42 0x10 # NOHINTS: invalid instruction encoding -0x46 0x10 # BAD32: invalid instruction encoding -0x46 0x10 # GOOD64: c.slli zero, 49 -0x46 0x10 # NOHINTS: invalid 
instruction encoding -0x4A 0x10 # BAD32: invalid instruction encoding -0x4A 0x10 # GOOD64: c.slli zero, 50 -0x4A 0x10 # NOHINTS: invalid instruction encoding -0x4E 0x10 # BAD32: invalid instruction encoding -0x4E 0x10 # GOOD64: c.slli zero, 51 -0x4E 0x10 # NOHINTS: invalid instruction encoding -0x52 0x10 # BAD32: invalid instruction encoding -0x52 0x10 # GOOD64: c.slli zero, 52 -0x52 0x10 # NOHINTS: invalid instruction encoding -0x56 0x10 # BAD32: invalid instruction encoding -0x56 0x10 # GOOD64: c.slli zero, 53 -0x56 0x10 # NOHINTS: invalid instruction encoding -0x5A 0x10 # BAD32: invalid instruction encoding -0x5A 0x10 # GOOD64: c.slli zero, 54 -0x5A 0x10 # NOHINTS: invalid instruction encoding -0x5E 0x10 # BAD32: invalid instruction encoding -0x5E 0x10 # GOOD64: c.slli zero, 55 -0x5E 0x10 # NOHINTS: invalid instruction encoding -0x62 0x10 # BAD32: invalid instruction encoding -0x62 0x10 # GOOD64: c.slli zero, 56 -0x62 0x10 # NOHINTS: invalid instruction encoding -0x66 0x10 # BAD32: invalid instruction encoding -0x66 0x10 # GOOD64: c.slli zero, 57 -0x66 0x10 # NOHINTS: invalid instruction encoding -0x6A 0x10 # BAD32: invalid instruction encoding -0x6A 0x10 # GOOD64: c.slli zero, 58 -0x6A 0x10 # NOHINTS: invalid instruction encoding -0x6E 0x10 # BAD32: invalid instruction encoding -0x6E 0x10 # GOOD64: c.slli zero, 59 -0x6E 0x10 # NOHINTS: invalid instruction encoding -0x72 0x10 # BAD32: invalid instruction encoding -0x72 0x10 # GOOD64: c.slli zero, 60 -0x72 0x10 # NOHINTS: invalid instruction encoding -0x76 0x10 # BAD32: invalid instruction encoding -0x76 0x10 # GOOD64: c.slli zero, 61 -0x76 0x10 # NOHINTS: invalid instruction encoding -0x7A 0x10 # BAD32: invalid instruction encoding -0x7A 0x10 # GOOD64: c.slli zero, 62 -0x7A 0x10 # NOHINTS: invalid instruction encoding -0x7E 0x10 # BAD32: invalid instruction encoding -0x7E 0x10 # GOOD64: c.slli zero, 63 -0x7E 0x10 # NOHINTS: invalid instruction encoding -# GOOD: c.slli64 ra +# GOOD: c.slli64 zero # NOHINTS: invalid instruction encoding -0x82 0x00 -0x86 0x00 # GOOD: c.slli ra, 1 -0x86 0x00 # GOOD: c.slli ra, 1 -0x8A 0x00 # GOOD: c.slli ra, 2 -0x8E 0x00 # GOOD: c.slli ra, 3 -0x92 0x00 # GOOD: c.slli ra, 4 -0x96 0x00 # GOOD: c.slli ra, 5 -0x9A 0x00 # GOOD: c.slli ra, 6 -0x9E 0x00 # GOOD: c.slli ra, 7 -0xA2 0x00 # GOOD: c.slli ra, 8 -0xA6 0x00 # GOOD: c.slli ra, 9 -0xAA 0x00 # GOOD: c.slli ra, 10 -0xAE 0x00 # GOOD: c.slli ra, 11 -0xB2 0x00 # GOOD: c.slli ra, 12 -0xB6 0x00 # GOOD: c.slli ra, 13 -0xBA 0x00 # GOOD: c.slli ra, 14 -0xBE 0x00 # GOOD: c.slli ra, 15 -0xC2 0x00 # GOOD: c.slli ra, 16 -0xC6 0x00 # GOOD: c.slli ra, 17 -0xCA 0x00 # GOOD: c.slli ra, 18 -0xCE 0x00 # GOOD: c.slli ra, 19 -0xD2 0x00 # GOOD: c.slli ra, 20 -0xD6 0x00 # GOOD: c.slli ra, 21 -0xDA 0x00 # GOOD: c.slli ra, 22 -0xDE 0x00 # GOOD: c.slli ra, 23 -0xE2 0x00 # GOOD: c.slli ra, 24 -0xE6 0x00 # GOOD: c.slli ra, 25 -0xEA 0x00 # GOOD: c.slli ra, 26 -0xEE 0x00 # GOOD: c.slli ra, 27 -0xF2 0x00 # GOOD: c.slli ra, 28 -0xF6 0x00 # GOOD: c.slli ra, 29 -0xFA 0x00 # GOOD: c.slli ra, 30 -0xFE 0x00 # GOOD: c.slli ra, 31 -0x82 0x10 # BAD32: invalid instruction encoding -0x82 0x10 # GOOD64: c.slli ra, 32 -0x86 0x10 # BAD32: invalid instruction encoding -0x86 0x10 # GOOD64: c.slli ra, 33 -0x8A 0x10 # BAD32: invalid instruction encoding -0x8A 0x10 # GOOD64: c.slli ra, 34 -0x8E 0x10 # BAD32: invalid instruction encoding -0x8E 0x10 # GOOD64: c.slli ra, 35 -0x92 0x10 # BAD32: invalid instruction encoding -0x92 0x10 # GOOD64: c.slli ra, 36 -0x96 0x10 # BAD32: invalid instruction encoding -0x96 
0x10 # GOOD64: c.slli ra, 37 -0x9A 0x10 # BAD32: invalid instruction encoding -0x9A 0x10 # GOOD64: c.slli ra, 38 -0x9E 0x10 # BAD32: invalid instruction encoding -0x9E 0x10 # GOOD64: c.slli ra, 39 -0xA2 0x10 # BAD32: invalid instruction encoding -0xA2 0x10 # GOOD64: c.slli ra, 40 -0xA6 0x10 # BAD32: invalid instruction encoding -0xA6 0x10 # GOOD64: c.slli ra, 41 -0xAA 0x10 # BAD32: invalid instruction encoding -0xAA 0x10 # GOOD64: c.slli ra, 42 -0xAE 0x10 # BAD32: invalid instruction encoding -0xAE 0x10 # GOOD64: c.slli ra, 43 -0xB2 0x10 # BAD32: invalid instruction encoding -0xB2 0x10 # GOOD64: c.slli ra, 44 -0xB6 0x10 # BAD32: invalid instruction encoding -0xB6 0x10 # GOOD64: c.slli ra, 45 -0xBA 0x10 # BAD32: invalid instruction encoding -0xBA 0x10 # GOOD64: c.slli ra, 46 -0xBE 0x10 # BAD32: invalid instruction encoding -0xBE 0x10 # GOOD64: c.slli ra, 47 -0xC2 0x10 # BAD32: invalid instruction encoding -0xC2 0x10 # GOOD64: c.slli ra, 48 -0xC6 0x10 # BAD32: invalid instruction encoding -0xC6 0x10 # GOOD64: c.slli ra, 49 -0xCA 0x10 # BAD32: invalid instruction encoding -0xCA 0x10 # GOOD64: c.slli ra, 50 -0xCE 0x10 # BAD32: invalid instruction encoding -0xCE 0x10 # GOOD64: c.slli ra, 51 -0xD2 0x10 # BAD32: invalid instruction encoding -0xD2 0x10 # GOOD64: c.slli ra, 52 -0xD6 0x10 # BAD32: invalid instruction encoding -0xD6 0x10 # GOOD64: c.slli ra, 53 -0xDA 0x10 # BAD32: invalid instruction encoding -0xDA 0x10 # GOOD64: c.slli ra, 54 -0xDE 0x10 # BAD32: invalid instruction encoding -0xDE 0x10 # GOOD64: c.slli ra, 55 -0xE2 0x10 # BAD32: invalid instruction encoding -0xE2 0x10 # GOOD64: c.slli ra, 56 -0xE6 0x10 # BAD32: invalid instruction encoding -0xE6 0x10 # GOOD64: c.slli ra, 57 -0xEA 0x10 # BAD32: invalid instruction encoding -0xEA 0x10 # GOOD64: c.slli ra, 58 -0xEE 0x10 # BAD32: invalid instruction encoding -0xEE 0x10 # GOOD64: c.slli ra, 59 -0xF2 0x10 # BAD32: invalid instruction encoding -0xF2 0x10 # GOOD64: c.slli ra, 60 -0xF6 0x10 # BAD32: invalid instruction encoding -0xF6 0x10 # GOOD64: c.slli ra, 61 -0xFA 0x10 # BAD32: invalid instruction encoding -0xFA 0x10 # GOOD64: c.slli ra, 62 -0xFE 0x10 # BAD32: invalid instruction encoding -0xFE 0x10 # GOOD64: c.slli ra, 63 -# GOOD: c.slli64 sp +0x02 0x00 + +# GOOD: c.slli zero, 1 # NOHINTS: invalid instruction encoding -0x02 0x01 -0x06 0x01 # GOOD: c.slli sp, 1 -0x0A 0x01 # GOOD: c.slli sp, 2 -0x0E 0x01 # GOOD: c.slli sp, 3 -0x12 0x01 # GOOD: c.slli sp, 4 -0x16 0x01 # GOOD: c.slli sp, 5 -0x1A 0x01 # GOOD: c.slli sp, 6 -0x1E 0x01 # GOOD: c.slli sp, 7 -0x22 0x01 # GOOD: c.slli sp, 8 -0x26 0x01 # GOOD: c.slli sp, 9 -0x2A 0x01 # GOOD: c.slli sp, 10 -0x2E 0x01 # GOOD: c.slli sp, 11 -0x32 0x01 # GOOD: c.slli sp, 12 -0x36 0x01 # GOOD: c.slli sp, 13 -0x3A 0x01 # GOOD: c.slli sp, 14 -0x3E 0x01 # GOOD: c.slli sp, 15 -0x42 0x01 # GOOD: c.slli sp, 16 -0x46 0x01 # GOOD: c.slli sp, 17 -0x4A 0x01 # GOOD: c.slli sp, 18 -0x4E 0x01 # GOOD: c.slli sp, 19 -0x52 0x01 # GOOD: c.slli sp, 20 -0x56 0x01 # GOOD: c.slli sp, 21 -0x5A 0x01 # GOOD: c.slli sp, 22 -0x5E 0x01 # GOOD: c.slli sp, 23 -0x62 0x01 # GOOD: c.slli sp, 24 -0x66 0x01 # GOOD: c.slli sp, 25 -0x6A 0x01 # GOOD: c.slli sp, 26 -0x6E 0x01 # GOOD: c.slli sp, 27 -0x72 0x01 # GOOD: c.slli sp, 28 -0x76 0x01 # GOOD: c.slli sp, 29 -0x7A 0x01 # GOOD: c.slli sp, 30 -0x7E 0x01 # GOOD: c.slli sp, 31 -0x02 0x11 # BAD32: invalid instruction encoding -0x02 0x11 # GOOD64: c.slli sp, 32 -0x06 0x11 # BAD32: invalid instruction encoding -0x06 0x11 # GOOD64: c.slli sp, 33 -0x0A 0x11 # BAD32: invalid instruction encoding 
-0x0A 0x11 # GOOD64: c.slli sp, 34 -0x0E 0x11 # BAD32: invalid instruction encoding -0x0E 0x11 # GOOD64: c.slli sp, 35 -0x12 0x11 # BAD32: invalid instruction encoding -0x12 0x11 # GOOD64: c.slli sp, 36 -0x16 0x11 # BAD32: invalid instruction encoding -0x16 0x11 # GOOD64: c.slli sp, 37 -0x1A 0x11 # BAD32: invalid instruction encoding -0x1A 0x11 # GOOD64: c.slli sp, 38 -0x1E 0x11 # BAD32: invalid instruction encoding -0x1E 0x11 # GOOD64: c.slli sp, 39 -0x22 0x11 # BAD32: invalid instruction encoding -0x22 0x11 # GOOD64: c.slli sp, 40 -0x26 0x11 # BAD32: invalid instruction encoding -0x26 0x11 # GOOD64: c.slli sp, 41 -0x2A 0x11 # BAD32: invalid instruction encoding -0x2A 0x11 # GOOD64: c.slli sp, 42 -0x2E 0x11 # BAD32: invalid instruction encoding -0x2E 0x11 # GOOD64: c.slli sp, 43 -0x32 0x11 # BAD32: invalid instruction encoding -0x32 0x11 # GOOD64: c.slli sp, 44 -0x36 0x11 # BAD32: invalid instruction encoding -0x36 0x11 # GOOD64: c.slli sp, 45 -0x3A 0x11 # BAD32: invalid instruction encoding -0x3A 0x11 # GOOD64: c.slli sp, 46 -0x3E 0x11 # BAD32: invalid instruction encoding -0x3E 0x11 # GOOD64: c.slli sp, 47 -0x42 0x11 # BAD32: invalid instruction encoding -0x42 0x11 # GOOD64: c.slli sp, 48 -0x46 0x11 # BAD32: invalid instruction encoding -0x46 0x11 # GOOD64: c.slli sp, 49 -0x4A 0x11 # BAD32: invalid instruction encoding -0x4A 0x11 # GOOD64: c.slli sp, 50 -0x4E 0x11 # BAD32: invalid instruction encoding -0x4E 0x11 # GOOD64: c.slli sp, 51 -0x52 0x11 # BAD32: invalid instruction encoding -0x52 0x11 # GOOD64: c.slli sp, 52 -0x56 0x11 # BAD32: invalid instruction encoding -0x56 0x11 # GOOD64: c.slli sp, 53 -0x5A 0x11 # BAD32: invalid instruction encoding -0x5A 0x11 # GOOD64: c.slli sp, 54 -0x5E 0x11 # BAD32: invalid instruction encoding -0x5E 0x11 # GOOD64: c.slli sp, 55 -0x62 0x11 # BAD32: invalid instruction encoding -0x62 0x11 # GOOD64: c.slli sp, 56 -0x66 0x11 # BAD32: invalid instruction encoding -0x66 0x11 # GOOD64: c.slli sp, 57 -0x6A 0x11 # BAD32: invalid instruction encoding -0x6A 0x11 # GOOD64: c.slli sp, 58 -0x6E 0x11 # BAD32: invalid instruction encoding -0x6E 0x11 # GOOD64: c.slli sp, 59 -0x72 0x11 # BAD32: invalid instruction encoding -0x72 0x11 # GOOD64: c.slli sp, 60 -0x76 0x11 # BAD32: invalid instruction encoding -0x76 0x11 # GOOD64: c.slli sp, 61 -0x7A 0x11 # BAD32: invalid instruction encoding -0x7A 0x11 # GOOD64: c.slli sp, 62 -0x7E 0x11 # BAD32: invalid instruction encoding -0x7E 0x11 # GOOD64: c.slli sp, 63 -# GOOD: c.slli64 gp +0x06 0x00 + +# GOOD: c.slli zero, 2 # NOHINTS: invalid instruction encoding -0x82 0x01 -0x86 0x01 # GOOD: c.slli gp, 1 -0x8A 0x01 # GOOD: c.slli gp, 2 -0x8E 0x01 # GOOD: c.slli gp, 3 -0x92 0x01 # GOOD: c.slli gp, 4 -0x96 0x01 # GOOD: c.slli gp, 5 -0x9A 0x01 # GOOD: c.slli gp, 6 -0x9E 0x01 # GOOD: c.slli gp, 7 -0xA2 0x01 # GOOD: c.slli gp, 8 -0xA6 0x01 # GOOD: c.slli gp, 9 -0xAA 0x01 # GOOD: c.slli gp, 10 -0xAE 0x01 # GOOD: c.slli gp, 11 -0xB2 0x01 # GOOD: c.slli gp, 12 -0xB6 0x01 # GOOD: c.slli gp, 13 -0xBA 0x01 # GOOD: c.slli gp, 14 -0xBE 0x01 # GOOD: c.slli gp, 15 -0xC2 0x01 # GOOD: c.slli gp, 16 -0xC6 0x01 # GOOD: c.slli gp, 17 -0xCA 0x01 # GOOD: c.slli gp, 18 -0xCE 0x01 # GOOD: c.slli gp, 19 -0xD2 0x01 # GOOD: c.slli gp, 20 -0xD6 0x01 # GOOD: c.slli gp, 21 -0xDA 0x01 # GOOD: c.slli gp, 22 -0xDE 0x01 # GOOD: c.slli gp, 23 -0xE2 0x01 # GOOD: c.slli gp, 24 -0xE6 0x01 # GOOD: c.slli gp, 25 -0xEA 0x01 # GOOD: c.slli gp, 26 -0xEE 0x01 # GOOD: c.slli gp, 27 -0xF2 0x01 # GOOD: c.slli gp, 28 -0xF6 0x01 # GOOD: c.slli gp, 29 -0xFA 0x01 # GOOD: c.slli 
gp, 30 -0xFE 0x01 # GOOD: c.slli gp, 31 -0x82 0x11 # BAD32: invalid instruction encoding -0x82 0x11 # GOOD64: c.slli gp, 32 -0x86 0x11 # BAD32: invalid instruction encoding -0x86 0x11 # GOOD64: c.slli gp, 33 -0x8A 0x11 # BAD32: invalid instruction encoding -0x8A 0x11 # GOOD64: c.slli gp, 34 -0x8E 0x11 # BAD32: invalid instruction encoding -0x8E 0x11 # GOOD64: c.slli gp, 35 -0x92 0x11 # BAD32: invalid instruction encoding -0x92 0x11 # GOOD64: c.slli gp, 36 -0x96 0x11 # BAD32: invalid instruction encoding -0x96 0x11 # GOOD64: c.slli gp, 37 -0x9A 0x11 # BAD32: invalid instruction encoding -0x9A 0x11 # GOOD64: c.slli gp, 38 -0x9E 0x11 # BAD32: invalid instruction encoding -0x9E 0x11 # GOOD64: c.slli gp, 39 -0xA2 0x11 # BAD32: invalid instruction encoding -0xA2 0x11 # GOOD64: c.slli gp, 40 -0xA6 0x11 # BAD32: invalid instruction encoding -0xA6 0x11 # GOOD64: c.slli gp, 41 -0xAA 0x11 # BAD32: invalid instruction encoding -0xAA 0x11 # GOOD64: c.slli gp, 42 -0xAE 0x11 # BAD32: invalid instruction encoding -0xAE 0x11 # GOOD64: c.slli gp, 43 -0xB2 0x11 # BAD32: invalid instruction encoding -0xB2 0x11 # GOOD64: c.slli gp, 44 -0xB6 0x11 # BAD32: invalid instruction encoding -0xB6 0x11 # GOOD64: c.slli gp, 45 -0xBA 0x11 # BAD32: invalid instruction encoding -0xBA 0x11 # GOOD64: c.slli gp, 46 -0xBE 0x11 # BAD32: invalid instruction encoding -0xBE 0x11 # GOOD64: c.slli gp, 47 -0xC2 0x11 # BAD32: invalid instruction encoding -0xC2 0x11 # GOOD64: c.slli gp, 48 -0xC6 0x11 # BAD32: invalid instruction encoding -0xC6 0x11 # GOOD64: c.slli gp, 49 -0xCA 0x11 # BAD32: invalid instruction encoding -0xCA 0x11 # GOOD64: c.slli gp, 50 -0xCE 0x11 # BAD32: invalid instruction encoding -0xCE 0x11 # GOOD64: c.slli gp, 51 -0xD2 0x11 # BAD32: invalid instruction encoding -0xD2 0x11 # GOOD64: c.slli gp, 52 -0xD6 0x11 # BAD32: invalid instruction encoding -0xD6 0x11 # GOOD64: c.slli gp, 53 -0xDA 0x11 # BAD32: invalid instruction encoding -0xDA 0x11 # GOOD64: c.slli gp, 54 -0xDE 0x11 # BAD32: invalid instruction encoding -0xDE 0x11 # GOOD64: c.slli gp, 55 -0xE2 0x11 # BAD32: invalid instruction encoding -0xE2 0x11 # GOOD64: c.slli gp, 56 -0xE6 0x11 # BAD32: invalid instruction encoding -0xE6 0x11 # GOOD64: c.slli gp, 57 -0xEA 0x11 # BAD32: invalid instruction encoding -0xEA 0x11 # GOOD64: c.slli gp, 58 -0xEE 0x11 # BAD32: invalid instruction encoding -0xEE 0x11 # GOOD64: c.slli gp, 59 -0xF2 0x11 # BAD32: invalid instruction encoding -0xF2 0x11 # GOOD64: c.slli gp, 60 -0xF6 0x11 # BAD32: invalid instruction encoding -0xF6 0x11 # GOOD64: c.slli gp, 61 -0xFA 0x11 # BAD32: invalid instruction encoding -0xFA 0x11 # GOOD64: c.slli gp, 62 -0xFE 0x11 # BAD32: invalid instruction encoding -0xFE 0x11 # GOOD64: c.slli gp, 63 -# GOOD: c.slli64 tp +0x0A 0x00 + +# GOOD: c.slli zero, 3 # NOHINTS: invalid instruction encoding -0x02 0x02 -0x06 0x02 # GOOD: c.slli tp, 1 -0x0A 0x02 # GOOD: c.slli tp, 2 -0x0E 0x02 # GOOD: c.slli tp, 3 -0x12 0x02 # GOOD: c.slli tp, 4 -0x16 0x02 # GOOD: c.slli tp, 5 -0x1A 0x02 # GOOD: c.slli tp, 6 -0x1E 0x02 # GOOD: c.slli tp, 7 -0x22 0x02 # GOOD: c.slli tp, 8 -0x26 0x02 # GOOD: c.slli tp, 9 -0x2A 0x02 # GOOD: c.slli tp, 10 -0x2E 0x02 # GOOD: c.slli tp, 11 -0x32 0x02 # GOOD: c.slli tp, 12 -0x36 0x02 # GOOD: c.slli tp, 13 -0x3A 0x02 # GOOD: c.slli tp, 14 -0x3E 0x02 # GOOD: c.slli tp, 15 -0x42 0x02 # GOOD: c.slli tp, 16 -0x46 0x02 # GOOD: c.slli tp, 17 -0x4A 0x02 # GOOD: c.slli tp, 18 -0x4E 0x02 # GOOD: c.slli tp, 19 -0x52 0x02 # GOOD: c.slli tp, 20 -0x56 0x02 # GOOD: c.slli tp, 21 -0x5A 0x02 # GOOD: c.slli tp, 22 
-0x5E 0x02 # GOOD: c.slli tp, 23 -0x62 0x02 # GOOD: c.slli tp, 24 -0x66 0x02 # GOOD: c.slli tp, 25 -0x6A 0x02 # GOOD: c.slli tp, 26 -0x6E 0x02 # GOOD: c.slli tp, 27 -0x72 0x02 # GOOD: c.slli tp, 28 -0x76 0x02 # GOOD: c.slli tp, 29 -0x7A 0x02 # GOOD: c.slli tp, 30 -0x7E 0x02 # GOOD: c.slli tp, 31 -0x02 0x12 # BAD32: invalid instruction encoding -0x02 0x12 # GOOD64: c.slli tp, 32 -0x06 0x12 # BAD32: invalid instruction encoding -0x06 0x12 # GOOD64: c.slli tp, 33 -0x0A 0x12 # BAD32: invalid instruction encoding -0x0A 0x12 # GOOD64: c.slli tp, 34 -0x0E 0x12 # BAD32: invalid instruction encoding -0x0E 0x12 # GOOD64: c.slli tp, 35 -0x12 0x12 # BAD32: invalid instruction encoding -0x12 0x12 # GOOD64: c.slli tp, 36 -0x16 0x12 # BAD32: invalid instruction encoding -0x16 0x12 # GOOD64: c.slli tp, 37 -0x1A 0x12 # BAD32: invalid instruction encoding -0x1A 0x12 # GOOD64: c.slli tp, 38 -0x1E 0x12 # BAD32: invalid instruction encoding -0x1E 0x12 # GOOD64: c.slli tp, 39 -0x22 0x12 # BAD32: invalid instruction encoding -0x22 0x12 # GOOD64: c.slli tp, 40 -0x26 0x12 # BAD32: invalid instruction encoding -0x26 0x12 # GOOD64: c.slli tp, 41 -0x2A 0x12 # BAD32: invalid instruction encoding -0x2A 0x12 # GOOD64: c.slli tp, 42 -0x2E 0x12 # BAD32: invalid instruction encoding -0x2E 0x12 # GOOD64: c.slli tp, 43 -0x32 0x12 # BAD32: invalid instruction encoding -0x32 0x12 # GOOD64: c.slli tp, 44 -0x36 0x12 # BAD32: invalid instruction encoding -0x36 0x12 # GOOD64: c.slli tp, 45 -0x3A 0x12 # BAD32: invalid instruction encoding -0x3A 0x12 # GOOD64: c.slli tp, 46 -0x3E 0x12 # BAD32: invalid instruction encoding -0x3E 0x12 # GOOD64: c.slli tp, 47 -0x42 0x12 # BAD32: invalid instruction encoding -0x42 0x12 # GOOD64: c.slli tp, 48 -0x46 0x12 # BAD32: invalid instruction encoding -0x46 0x12 # GOOD64: c.slli tp, 49 -0x4A 0x12 # BAD32: invalid instruction encoding -0x4A 0x12 # GOOD64: c.slli tp, 50 -0x4E 0x12 # BAD32: invalid instruction encoding -0x4E 0x12 # GOOD64: c.slli tp, 51 -0x52 0x12 # BAD32: invalid instruction encoding -0x52 0x12 # GOOD64: c.slli tp, 52 -0x56 0x12 # BAD32: invalid instruction encoding -0x56 0x12 # GOOD64: c.slli tp, 53 -0x5A 0x12 # BAD32: invalid instruction encoding -0x5A 0x12 # GOOD64: c.slli tp, 54 -0x5E 0x12 # BAD32: invalid instruction encoding -0x5E 0x12 # GOOD64: c.slli tp, 55 -0x62 0x12 # BAD32: invalid instruction encoding -0x62 0x12 # GOOD64: c.slli tp, 56 -0x66 0x12 # BAD32: invalid instruction encoding -0x66 0x12 # GOOD64: c.slli tp, 57 -0x6A 0x12 # BAD32: invalid instruction encoding -0x6A 0x12 # GOOD64: c.slli tp, 58 -0x6E 0x12 # BAD32: invalid instruction encoding -0x6E 0x12 # GOOD64: c.slli tp, 59 -0x72 0x12 # BAD32: invalid instruction encoding -0x72 0x12 # GOOD64: c.slli tp, 60 -0x76 0x12 # BAD32: invalid instruction encoding -0x76 0x12 # GOOD64: c.slli tp, 61 -0x7A 0x12 # BAD32: invalid instruction encoding -0x7A 0x12 # GOOD64: c.slli tp, 62 -0x7E 0x12 # BAD32: invalid instruction encoding -0x7E 0x12 # GOOD64: c.slli tp, 63 -# GOOD: c.slli64 t0 +0x0E 0x00 + +# GOOD: c.slli zero, 4 # NOHINTS: invalid instruction encoding -0x82 0x02 -0x86 0x02 # GOOD: c.slli t0, 1 -0x8A 0x02 # GOOD: c.slli t0, 2 -0x8E 0x02 # GOOD: c.slli t0, 3 -0x92 0x02 # GOOD: c.slli t0, 4 -0x96 0x02 # GOOD: c.slli t0, 5 -0x9A 0x02 # GOOD: c.slli t0, 6 -0x9E 0x02 # GOOD: c.slli t0, 7 -0xA2 0x02 # GOOD: c.slli t0, 8 -0xA6 0x02 # GOOD: c.slli t0, 9 -0xAA 0x02 # GOOD: c.slli t0, 10 -0xAE 0x02 # GOOD: c.slli t0, 11 -0xB2 0x02 # GOOD: c.slli t0, 12 -0xB6 0x02 # GOOD: c.slli t0, 13 -0xBA 0x02 # GOOD: c.slli t0, 14 -0xBE 
0x02 # GOOD: c.slli t0, 15 -0xC2 0x02 # GOOD: c.slli t0, 16 -0xC6 0x02 # GOOD: c.slli t0, 17 -0xCA 0x02 # GOOD: c.slli t0, 18 -0xCE 0x02 # GOOD: c.slli t0, 19 -0xD2 0x02 # GOOD: c.slli t0, 20 -0xD6 0x02 # GOOD: c.slli t0, 21 -0xDA 0x02 # GOOD: c.slli t0, 22 -0xDE 0x02 # GOOD: c.slli t0, 23 -0xE2 0x02 # GOOD: c.slli t0, 24 -0xE6 0x02 # GOOD: c.slli t0, 25 -0xEA 0x02 # GOOD: c.slli t0, 26 -0xEE 0x02 # GOOD: c.slli t0, 27 -0xF2 0x02 # GOOD: c.slli t0, 28 -0xF6 0x02 # GOOD: c.slli t0, 29 -0xFA 0x02 # GOOD: c.slli t0, 30 -0xFE 0x02 # GOOD: c.slli t0, 31 -0x82 0x12 # BAD32: invalid instruction encoding -0x82 0x12 # GOOD64: c.slli t0, 32 -0x86 0x12 # BAD32: invalid instruction encoding -0x86 0x12 # GOOD64: c.slli t0, 33 -0x8A 0x12 # BAD32: invalid instruction encoding -0x8A 0x12 # GOOD64: c.slli t0, 34 -0x8E 0x12 # BAD32: invalid instruction encoding -0x8E 0x12 # GOOD64: c.slli t0, 35 -0x92 0x12 # BAD32: invalid instruction encoding -0x92 0x12 # GOOD64: c.slli t0, 36 -0x96 0x12 # BAD32: invalid instruction encoding -0x96 0x12 # GOOD64: c.slli t0, 37 -0x9A 0x12 # BAD32: invalid instruction encoding -0x9A 0x12 # GOOD64: c.slli t0, 38 -0x9E 0x12 # BAD32: invalid instruction encoding -0x9E 0x12 # GOOD64: c.slli t0, 39 -0xA2 0x12 # BAD32: invalid instruction encoding -0xA2 0x12 # GOOD64: c.slli t0, 40 -0xA6 0x12 # BAD32: invalid instruction encoding -0xA6 0x12 # GOOD64: c.slli t0, 41 -0xAA 0x12 # BAD32: invalid instruction encoding -0xAA 0x12 # GOOD64: c.slli t0, 42 -0xAE 0x12 # BAD32: invalid instruction encoding -0xAE 0x12 # GOOD64: c.slli t0, 43 -0xB2 0x12 # BAD32: invalid instruction encoding -0xB2 0x12 # GOOD64: c.slli t0, 44 -0xB6 0x12 # BAD32: invalid instruction encoding -0xB6 0x12 # GOOD64: c.slli t0, 45 -0xBA 0x12 # BAD32: invalid instruction encoding -0xBA 0x12 # GOOD64: c.slli t0, 46 -0xBE 0x12 # BAD32: invalid instruction encoding -0xBE 0x12 # GOOD64: c.slli t0, 47 -0xC2 0x12 # BAD32: invalid instruction encoding -0xC2 0x12 # GOOD64: c.slli t0, 48 -0xC6 0x12 # BAD32: invalid instruction encoding -0xC6 0x12 # GOOD64: c.slli t0, 49 -0xCA 0x12 # BAD32: invalid instruction encoding -0xCA 0x12 # GOOD64: c.slli t0, 50 -0xCE 0x12 # BAD32: invalid instruction encoding -0xCE 0x12 # GOOD64: c.slli t0, 51 -0xD2 0x12 # BAD32: invalid instruction encoding -0xD2 0x12 # GOOD64: c.slli t0, 52 -0xD6 0x12 # BAD32: invalid instruction encoding -0xD6 0x12 # GOOD64: c.slli t0, 53 -0xDA 0x12 # BAD32: invalid instruction encoding -0xDA 0x12 # GOOD64: c.slli t0, 54 -0xDE 0x12 # BAD32: invalid instruction encoding -0xDE 0x12 # GOOD64: c.slli t0, 55 -0xE2 0x12 # BAD32: invalid instruction encoding -0xE2 0x12 # GOOD64: c.slli t0, 56 -0xE6 0x12 # BAD32: invalid instruction encoding -0xE6 0x12 # GOOD64: c.slli t0, 57 -0xEA 0x12 # BAD32: invalid instruction encoding -0xEA 0x12 # GOOD64: c.slli t0, 58 -0xEE 0x12 # BAD32: invalid instruction encoding -0xEE 0x12 # GOOD64: c.slli t0, 59 -0xF2 0x12 # BAD32: invalid instruction encoding -0xF2 0x12 # GOOD64: c.slli t0, 60 -0xF6 0x12 # BAD32: invalid instruction encoding -0xF6 0x12 # GOOD64: c.slli t0, 61 -0xFA 0x12 # BAD32: invalid instruction encoding -0xFA 0x12 # GOOD64: c.slli t0, 62 -0xFE 0x12 # BAD32: invalid instruction encoding -0xFE 0x12 # GOOD64: c.slli t0, 63 -# GOOD: c.slli64 t1 +0x12 0x00 + +# GOOD: c.slli zero, 5 # NOHINTS: invalid instruction encoding -0x02 0x03 -0x06 0x03 # GOOD: c.slli t1, 1 -0x0A 0x03 # GOOD: c.slli t1, 2 -0x0E 0x03 # GOOD: c.slli t1, 3 -0x12 0x03 # GOOD: c.slli t1, 4 -0x16 0x03 # GOOD: c.slli t1, 5 -0x1A 0x03 # GOOD: c.slli t1, 6 -0x1E 0x03 
# GOOD: c.slli t1, 7 -0x22 0x03 # GOOD: c.slli t1, 8 -0x26 0x03 # GOOD: c.slli t1, 9 -0x2A 0x03 # GOOD: c.slli t1, 10 -0x2E 0x03 # GOOD: c.slli t1, 11 -0x32 0x03 # GOOD: c.slli t1, 12 -0x36 0x03 # GOOD: c.slli t1, 13 -0x3A 0x03 # GOOD: c.slli t1, 14 -0x3E 0x03 # GOOD: c.slli t1, 15 -0x42 0x03 # GOOD: c.slli t1, 16 -0x46 0x03 # GOOD: c.slli t1, 17 -0x4A 0x03 # GOOD: c.slli t1, 18 -0x4E 0x03 # GOOD: c.slli t1, 19 -0x52 0x03 # GOOD: c.slli t1, 20 -0x56 0x03 # GOOD: c.slli t1, 21 -0x5A 0x03 # GOOD: c.slli t1, 22 -0x5E 0x03 # GOOD: c.slli t1, 23 -0x62 0x03 # GOOD: c.slli t1, 24 -0x66 0x03 # GOOD: c.slli t1, 25 -0x6A 0x03 # GOOD: c.slli t1, 26 -0x6E 0x03 # GOOD: c.slli t1, 27 -0x72 0x03 # GOOD: c.slli t1, 28 -0x76 0x03 # GOOD: c.slli t1, 29 -0x7A 0x03 # GOOD: c.slli t1, 30 -0x7E 0x03 # GOOD: c.slli t1, 31 -0x02 0x13 # BAD32: invalid instruction encoding -0x02 0x13 # GOOD64: c.slli t1, 32 -0x06 0x13 # BAD32: invalid instruction encoding -0x06 0x13 # GOOD64: c.slli t1, 33 -0x0A 0x13 # BAD32: invalid instruction encoding -0x0A 0x13 # GOOD64: c.slli t1, 34 -0x0E 0x13 # BAD32: invalid instruction encoding -0x0E 0x13 # GOOD64: c.slli t1, 35 -0x12 0x13 # BAD32: invalid instruction encoding -0x12 0x13 # GOOD64: c.slli t1, 36 -0x16 0x13 # BAD32: invalid instruction encoding -0x16 0x13 # GOOD64: c.slli t1, 37 -0x1A 0x13 # BAD32: invalid instruction encoding -0x1A 0x13 # GOOD64: c.slli t1, 38 -0x1E 0x13 # BAD32: invalid instruction encoding -0x1E 0x13 # GOOD64: c.slli t1, 39 -0x22 0x13 # BAD32: invalid instruction encoding -0x22 0x13 # GOOD64: c.slli t1, 40 -0x26 0x13 # BAD32: invalid instruction encoding -0x26 0x13 # GOOD64: c.slli t1, 41 -0x2A 0x13 # BAD32: invalid instruction encoding -0x2A 0x13 # GOOD64: c.slli t1, 42 -0x2E 0x13 # BAD32: invalid instruction encoding -0x2E 0x13 # GOOD64: c.slli t1, 43 -0x32 0x13 # BAD32: invalid instruction encoding -0x32 0x13 # GOOD64: c.slli t1, 44 -0x36 0x13 # BAD32: invalid instruction encoding -0x36 0x13 # GOOD64: c.slli t1, 45 -0x3A 0x13 # BAD32: invalid instruction encoding -0x3A 0x13 # GOOD64: c.slli t1, 46 -0x3E 0x13 # BAD32: invalid instruction encoding -0x3E 0x13 # GOOD64: c.slli t1, 47 -0x42 0x13 # BAD32: invalid instruction encoding -0x42 0x13 # GOOD64: c.slli t1, 48 -0x46 0x13 # BAD32: invalid instruction encoding -0x46 0x13 # GOOD64: c.slli t1, 49 -0x4A 0x13 # BAD32: invalid instruction encoding -0x4A 0x13 # GOOD64: c.slli t1, 50 -0x4E 0x13 # BAD32: invalid instruction encoding -0x4E 0x13 # GOOD64: c.slli t1, 51 -0x52 0x13 # BAD32: invalid instruction encoding -0x52 0x13 # GOOD64: c.slli t1, 52 -0x56 0x13 # BAD32: invalid instruction encoding -0x56 0x13 # GOOD64: c.slli t1, 53 -0x5A 0x13 # BAD32: invalid instruction encoding -0x5A 0x13 # GOOD64: c.slli t1, 54 -0x5E 0x13 # BAD32: invalid instruction encoding -0x5E 0x13 # GOOD64: c.slli t1, 55 -0x62 0x13 # BAD32: invalid instruction encoding -0x62 0x13 # GOOD64: c.slli t1, 56 -0x66 0x13 # BAD32: invalid instruction encoding -0x66 0x13 # GOOD64: c.slli t1, 57 -0x6A 0x13 # BAD32: invalid instruction encoding -0x6A 0x13 # GOOD64: c.slli t1, 58 -0x6E 0x13 # BAD32: invalid instruction encoding -0x6E 0x13 # GOOD64: c.slli t1, 59 -0x72 0x13 # BAD32: invalid instruction encoding -0x72 0x13 # GOOD64: c.slli t1, 60 -0x76 0x13 # BAD32: invalid instruction encoding -0x76 0x13 # GOOD64: c.slli t1, 61 -0x7A 0x13 # BAD32: invalid instruction encoding -0x7A 0x13 # GOOD64: c.slli t1, 62 -0x7E 0x13 # BAD32: invalid instruction encoding -0x7E 0x13 # GOOD64: c.slli t1, 63 -# GOOD: c.slli64 t2 +0x16 0x00 + +# GOOD: c.slli zero, 
6 # NOHINTS: invalid instruction encoding -0x82 0x03 -0x86 0x03 # GOOD: c.slli t2, 1 -0x8A 0x03 # GOOD: c.slli t2, 2 -0x8E 0x03 # GOOD: c.slli t2, 3 -0x92 0x03 # GOOD: c.slli t2, 4 -0x96 0x03 # GOOD: c.slli t2, 5 -0x9A 0x03 # GOOD: c.slli t2, 6 -0x9E 0x03 # GOOD: c.slli t2, 7 -0xA2 0x03 # GOOD: c.slli t2, 8 -0xA6 0x03 # GOOD: c.slli t2, 9 -0xAA 0x03 # GOOD: c.slli t2, 10 -0xAE 0x03 # GOOD: c.slli t2, 11 -0xB2 0x03 # GOOD: c.slli t2, 12 -0xB6 0x03 # GOOD: c.slli t2, 13 -0xBA 0x03 # GOOD: c.slli t2, 14 -0xBE 0x03 # GOOD: c.slli t2, 15 -0xC2 0x03 # GOOD: c.slli t2, 16 -0xC6 0x03 # GOOD: c.slli t2, 17 -0xCA 0x03 # GOOD: c.slli t2, 18 -0xCE 0x03 # GOOD: c.slli t2, 19 -0xD2 0x03 # GOOD: c.slli t2, 20 -0xD6 0x03 # GOOD: c.slli t2, 21 -0xDA 0x03 # GOOD: c.slli t2, 22 -0xDE 0x03 # GOOD: c.slli t2, 23 -0xE2 0x03 # GOOD: c.slli t2, 24 -0xE6 0x03 # GOOD: c.slli t2, 25 -0xEA 0x03 # GOOD: c.slli t2, 26 -0xEE 0x03 # GOOD: c.slli t2, 27 -0xF2 0x03 # GOOD: c.slli t2, 28 -0xF6 0x03 # GOOD: c.slli t2, 29 -0xFA 0x03 # GOOD: c.slli t2, 30 -0xFE 0x03 # GOOD: c.slli t2, 31 -0x82 0x13 # BAD32: invalid instruction encoding -0x82 0x13 # GOOD64: c.slli t2, 32 -0x86 0x13 # BAD32: invalid instruction encoding -0x86 0x13 # GOOD64: c.slli t2, 33 -0x8A 0x13 # BAD32: invalid instruction encoding -0x8A 0x13 # GOOD64: c.slli t2, 34 -0x8E 0x13 # BAD32: invalid instruction encoding -0x8E 0x13 # GOOD64: c.slli t2, 35 -0x92 0x13 # BAD32: invalid instruction encoding -0x92 0x13 # GOOD64: c.slli t2, 36 -0x96 0x13 # BAD32: invalid instruction encoding -0x96 0x13 # GOOD64: c.slli t2, 37 -0x9A 0x13 # BAD32: invalid instruction encoding -0x9A 0x13 # GOOD64: c.slli t2, 38 -0x9E 0x13 # BAD32: invalid instruction encoding -0x9E 0x13 # GOOD64: c.slli t2, 39 -0xA2 0x13 # BAD32: invalid instruction encoding -0xA2 0x13 # GOOD64: c.slli t2, 40 -0xA6 0x13 # BAD32: invalid instruction encoding -0xA6 0x13 # GOOD64: c.slli t2, 41 -0xAA 0x13 # BAD32: invalid instruction encoding -0xAA 0x13 # GOOD64: c.slli t2, 42 -0xAE 0x13 # BAD32: invalid instruction encoding -0xAE 0x13 # GOOD64: c.slli t2, 43 -0xB2 0x13 # BAD32: invalid instruction encoding -0xB2 0x13 # GOOD64: c.slli t2, 44 -0xB6 0x13 # BAD32: invalid instruction encoding -0xB6 0x13 # GOOD64: c.slli t2, 45 -0xBA 0x13 # BAD32: invalid instruction encoding -0xBA 0x13 # GOOD64: c.slli t2, 46 -0xBE 0x13 # BAD32: invalid instruction encoding -0xBE 0x13 # GOOD64: c.slli t2, 47 -0xC2 0x13 # BAD32: invalid instruction encoding -0xC2 0x13 # GOOD64: c.slli t2, 48 -0xC6 0x13 # BAD32: invalid instruction encoding -0xC6 0x13 # GOOD64: c.slli t2, 49 -0xCA 0x13 # BAD32: invalid instruction encoding -0xCA 0x13 # GOOD64: c.slli t2, 50 -0xCE 0x13 # BAD32: invalid instruction encoding -0xCE 0x13 # GOOD64: c.slli t2, 51 -0xD2 0x13 # BAD32: invalid instruction encoding -0xD2 0x13 # GOOD64: c.slli t2, 52 -0xD6 0x13 # BAD32: invalid instruction encoding -0xD6 0x13 # GOOD64: c.slli t2, 53 -0xDA 0x13 # BAD32: invalid instruction encoding -0xDA 0x13 # GOOD64: c.slli t2, 54 -0xDE 0x13 # BAD32: invalid instruction encoding -0xDE 0x13 # GOOD64: c.slli t2, 55 -0xE2 0x13 # BAD32: invalid instruction encoding -0xE2 0x13 # GOOD64: c.slli t2, 56 -0xE6 0x13 # BAD32: invalid instruction encoding -0xE6 0x13 # GOOD64: c.slli t2, 57 -0xEA 0x13 # BAD32: invalid instruction encoding -0xEA 0x13 # GOOD64: c.slli t2, 58 -0xEE 0x13 # BAD32: invalid instruction encoding -0xEE 0x13 # GOOD64: c.slli t2, 59 -0xF2 0x13 # BAD32: invalid instruction encoding -0xF2 0x13 # GOOD64: c.slli t2, 60 -0xF6 0x13 # BAD32: invalid instruction encoding 
-0xF6 0x13 # GOOD64: c.slli t2, 61 -0xFA 0x13 # BAD32: invalid instruction encoding -0xFA 0x13 # GOOD64: c.slli t2, 62 -0xFE 0x13 # BAD32: invalid instruction encoding -0xFE 0x13 # GOOD64: c.slli t2, 63 -# GOOD: c.slli64 s0 +0x1A 0x00 + +# GOOD: c.slli zero, 7 # NOHINTS: invalid instruction encoding -0x02 0x04 -0x06 0x04 # GOOD: c.slli s0, 1 -0x0A 0x04 # GOOD: c.slli s0, 2 -0x0E 0x04 # GOOD: c.slli s0, 3 -0x12 0x04 # GOOD: c.slli s0, 4 -0x16 0x04 # GOOD: c.slli s0, 5 -0x1A 0x04 # GOOD: c.slli s0, 6 -0x1E 0x04 # GOOD: c.slli s0, 7 -0x22 0x04 # GOOD: c.slli s0, 8 -0x26 0x04 # GOOD: c.slli s0, 9 -0x2A 0x04 # GOOD: c.slli s0, 10 -0x2E 0x04 # GOOD: c.slli s0, 11 -0x32 0x04 # GOOD: c.slli s0, 12 -0x36 0x04 # GOOD: c.slli s0, 13 -0x3A 0x04 # GOOD: c.slli s0, 14 -0x3E 0x04 # GOOD: c.slli s0, 15 -0x42 0x04 # GOOD: c.slli s0, 16 -0x46 0x04 # GOOD: c.slli s0, 17 -0x4A 0x04 # GOOD: c.slli s0, 18 -0x4E 0x04 # GOOD: c.slli s0, 19 -0x52 0x04 # GOOD: c.slli s0, 20 -0x56 0x04 # GOOD: c.slli s0, 21 -0x5A 0x04 # GOOD: c.slli s0, 22 -0x5E 0x04 # GOOD: c.slli s0, 23 -0x62 0x04 # GOOD: c.slli s0, 24 -0x66 0x04 # GOOD: c.slli s0, 25 -0x6A 0x04 # GOOD: c.slli s0, 26 -0x6E 0x04 # GOOD: c.slli s0, 27 -0x72 0x04 # GOOD: c.slli s0, 28 -0x76 0x04 # GOOD: c.slli s0, 29 -0x7A 0x04 # GOOD: c.slli s0, 30 -0x7E 0x04 # GOOD: c.slli s0, 31 -0x02 0x14 # BAD32: invalid instruction encoding -0x02 0x14 # GOOD64: c.slli s0, 32 -0x06 0x14 # BAD32: invalid instruction encoding -0x06 0x14 # GOOD64: c.slli s0, 33 -0x0A 0x14 # BAD32: invalid instruction encoding -0x0A 0x14 # GOOD64: c.slli s0, 34 -0x0E 0x14 # BAD32: invalid instruction encoding -0x0E 0x14 # GOOD64: c.slli s0, 35 -0x12 0x14 # BAD32: invalid instruction encoding -0x12 0x14 # GOOD64: c.slli s0, 36 -0x16 0x14 # BAD32: invalid instruction encoding -0x16 0x14 # GOOD64: c.slli s0, 37 -0x1A 0x14 # BAD32: invalid instruction encoding -0x1A 0x14 # GOOD64: c.slli s0, 38 -0x1E 0x14 # BAD32: invalid instruction encoding -0x1E 0x14 # GOOD64: c.slli s0, 39 -0x22 0x14 # BAD32: invalid instruction encoding -0x22 0x14 # GOOD64: c.slli s0, 40 -0x26 0x14 # BAD32: invalid instruction encoding -0x26 0x14 # GOOD64: c.slli s0, 41 -0x2A 0x14 # BAD32: invalid instruction encoding -0x2A 0x14 # GOOD64: c.slli s0, 42 -0x2E 0x14 # BAD32: invalid instruction encoding -0x2E 0x14 # GOOD64: c.slli s0, 43 -0x32 0x14 # BAD32: invalid instruction encoding -0x32 0x14 # GOOD64: c.slli s0, 44 -0x36 0x14 # BAD32: invalid instruction encoding -0x36 0x14 # GOOD64: c.slli s0, 45 -0x3A 0x14 # BAD32: invalid instruction encoding -0x3A 0x14 # GOOD64: c.slli s0, 46 -0x3E 0x14 # BAD32: invalid instruction encoding -0x3E 0x14 # GOOD64: c.slli s0, 47 -0x42 0x14 # BAD32: invalid instruction encoding -0x42 0x14 # GOOD64: c.slli s0, 48 -0x46 0x14 # BAD32: invalid instruction encoding -0x46 0x14 # GOOD64: c.slli s0, 49 -0x4A 0x14 # BAD32: invalid instruction encoding -0x4A 0x14 # GOOD64: c.slli s0, 50 -0x4E 0x14 # BAD32: invalid instruction encoding -0x4E 0x14 # GOOD64: c.slli s0, 51 -0x52 0x14 # BAD32: invalid instruction encoding -0x52 0x14 # GOOD64: c.slli s0, 52 -0x56 0x14 # BAD32: invalid instruction encoding -0x56 0x14 # GOOD64: c.slli s0, 53 -0x5A 0x14 # BAD32: invalid instruction encoding -0x5A 0x14 # GOOD64: c.slli s0, 54 -0x5E 0x14 # BAD32: invalid instruction encoding -0x5E 0x14 # GOOD64: c.slli s0, 55 -0x62 0x14 # BAD32: invalid instruction encoding -0x62 0x14 # GOOD64: c.slli s0, 56 -0x66 0x14 # BAD32: invalid instruction encoding -0x66 0x14 # GOOD64: c.slli s0, 57 -0x6A 0x14 # BAD32: invalid instruction 
encoding -0x6A 0x14 # GOOD64: c.slli s0, 58 -0x6E 0x14 # BAD32: invalid instruction encoding -0x6E 0x14 # GOOD64: c.slli s0, 59 -0x72 0x14 # BAD32: invalid instruction encoding -0x72 0x14 # GOOD64: c.slli s0, 60 -0x76 0x14 # BAD32: invalid instruction encoding -0x76 0x14 # GOOD64: c.slli s0, 61 -0x7A 0x14 # BAD32: invalid instruction encoding -0x7A 0x14 # GOOD64: c.slli s0, 62 -0x7E 0x14 # BAD32: invalid instruction encoding -0x7E 0x14 # GOOD64: c.slli s0, 63 -# GOOD: c.slli64 s1 +0x1E 0x00 + +# GOOD: c.slli zero, 8 # NOHINTS: invalid instruction encoding -0x82 0x04 -0x86 0x04 # GOOD: c.slli s1, 1 -0x8A 0x04 # GOOD: c.slli s1, 2 -0x8E 0x04 # GOOD: c.slli s1, 3 -0x92 0x04 # GOOD: c.slli s1, 4 -0x96 0x04 # GOOD: c.slli s1, 5 -0x9A 0x04 # GOOD: c.slli s1, 6 -0x9E 0x04 # GOOD: c.slli s1, 7 -0xA2 0x04 # GOOD: c.slli s1, 8 -0xA6 0x04 # GOOD: c.slli s1, 9 -0xAA 0x04 # GOOD: c.slli s1, 10 -0xAE 0x04 # GOOD: c.slli s1, 11 -0xB2 0x04 # GOOD: c.slli s1, 12 -0xB6 0x04 # GOOD: c.slli s1, 13 -0xBA 0x04 # GOOD: c.slli s1, 14 -0xBE 0x04 # GOOD: c.slli s1, 15 -0xC2 0x04 # GOOD: c.slli s1, 16 -0xC6 0x04 # GOOD: c.slli s1, 17 -0xCA 0x04 # GOOD: c.slli s1, 18 -0xCE 0x04 # GOOD: c.slli s1, 19 -0xD2 0x04 # GOOD: c.slli s1, 20 -0xD6 0x04 # GOOD: c.slli s1, 21 -0xDA 0x04 # GOOD: c.slli s1, 22 -0xDE 0x04 # GOOD: c.slli s1, 23 -0xE2 0x04 # GOOD: c.slli s1, 24 -0xE6 0x04 # GOOD: c.slli s1, 25 -0xEA 0x04 # GOOD: c.slli s1, 26 -0xEE 0x04 # GOOD: c.slli s1, 27 -0xF2 0x04 # GOOD: c.slli s1, 28 -0xF6 0x04 # GOOD: c.slli s1, 29 -0xFA 0x04 # GOOD: c.slli s1, 30 -0xFE 0x04 # GOOD: c.slli s1, 31 -0x82 0x14 # BAD32: invalid instruction encoding -0x82 0x14 # GOOD64: c.slli s1, 32 -0x86 0x14 # BAD32: invalid instruction encoding -0x86 0x14 # GOOD64: c.slli s1, 33 -0x8A 0x14 # BAD32: invalid instruction encoding -0x8A 0x14 # GOOD64: c.slli s1, 34 -0x8E 0x14 # BAD32: invalid instruction encoding -0x8E 0x14 # GOOD64: c.slli s1, 35 -0x92 0x14 # BAD32: invalid instruction encoding -0x92 0x14 # GOOD64: c.slli s1, 36 -0x96 0x14 # BAD32: invalid instruction encoding -0x96 0x14 # GOOD64: c.slli s1, 37 -0x9A 0x14 # BAD32: invalid instruction encoding -0x9A 0x14 # GOOD64: c.slli s1, 38 -0x9E 0x14 # BAD32: invalid instruction encoding -0x9E 0x14 # GOOD64: c.slli s1, 39 -0xA2 0x14 # BAD32: invalid instruction encoding -0xA2 0x14 # GOOD64: c.slli s1, 40 -0xA6 0x14 # BAD32: invalid instruction encoding -0xA6 0x14 # GOOD64: c.slli s1, 41 -0xAA 0x14 # BAD32: invalid instruction encoding -0xAA 0x14 # GOOD64: c.slli s1, 42 -0xAE 0x14 # BAD32: invalid instruction encoding -0xAE 0x14 # GOOD64: c.slli s1, 43 -0xB2 0x14 # BAD32: invalid instruction encoding -0xB2 0x14 # GOOD64: c.slli s1, 44 -0xB6 0x14 # BAD32: invalid instruction encoding -0xB6 0x14 # GOOD64: c.slli s1, 45 -0xBA 0x14 # BAD32: invalid instruction encoding -0xBA 0x14 # GOOD64: c.slli s1, 46 -0xBE 0x14 # BAD32: invalid instruction encoding -0xBE 0x14 # GOOD64: c.slli s1, 47 -0xC2 0x14 # BAD32: invalid instruction encoding -0xC2 0x14 # GOOD64: c.slli s1, 48 -0xC6 0x14 # BAD32: invalid instruction encoding -0xC6 0x14 # GOOD64: c.slli s1, 49 -0xCA 0x14 # BAD32: invalid instruction encoding -0xCA 0x14 # GOOD64: c.slli s1, 50 -0xCE 0x14 # BAD32: invalid instruction encoding -0xCE 0x14 # GOOD64: c.slli s1, 51 -0xD2 0x14 # BAD32: invalid instruction encoding -0xD2 0x14 # GOOD64: c.slli s1, 52 -0xD6 0x14 # BAD32: invalid instruction encoding -0xD6 0x14 # GOOD64: c.slli s1, 53 -0xDA 0x14 # BAD32: invalid instruction encoding -0xDA 0x14 # GOOD64: c.slli s1, 54 -0xDE 0x14 # BAD32: invalid 
instruction encoding -0xDE 0x14 # GOOD64: c.slli s1, 55 -0xE2 0x14 # BAD32: invalid instruction encoding -0xE2 0x14 # GOOD64: c.slli s1, 56 -0xE6 0x14 # BAD32: invalid instruction encoding -0xE6 0x14 # GOOD64: c.slli s1, 57 -0xEA 0x14 # BAD32: invalid instruction encoding -0xEA 0x14 # GOOD64: c.slli s1, 58 -0xEE 0x14 # BAD32: invalid instruction encoding -0xEE 0x14 # GOOD64: c.slli s1, 59 -0xF2 0x14 # BAD32: invalid instruction encoding -0xF2 0x14 # GOOD64: c.slli s1, 60 -0xF6 0x14 # BAD32: invalid instruction encoding -0xF6 0x14 # GOOD64: c.slli s1, 61 -0xFA 0x14 # BAD32: invalid instruction encoding -0xFA 0x14 # GOOD64: c.slli s1, 62 -0xFE 0x14 # BAD32: invalid instruction encoding -0xFE 0x14 # GOOD64: c.slli s1, 63 -# GOOD: c.slli64 a0 +0x22 0x00 + +# GOOD: c.slli zero, 9 # NOHINTS: invalid instruction encoding -0x02 0x05 -0x06 0x05 # GOOD: c.slli a0, 1 -0x0A 0x05 # GOOD: c.slli a0, 2 -0x0E 0x05 # GOOD: c.slli a0, 3 -0x12 0x05 # GOOD: c.slli a0, 4 -0x16 0x05 # GOOD: c.slli a0, 5 -0x1A 0x05 # GOOD: c.slli a0, 6 -0x1E 0x05 # GOOD: c.slli a0, 7 -0x22 0x05 # GOOD: c.slli a0, 8 -0x26 0x05 # GOOD: c.slli a0, 9 -0x2A 0x05 # GOOD: c.slli a0, 10 -0x2E 0x05 # GOOD: c.slli a0, 11 -0x32 0x05 # GOOD: c.slli a0, 12 -0x36 0x05 # GOOD: c.slli a0, 13 -0x3A 0x05 # GOOD: c.slli a0, 14 -0x3E 0x05 # GOOD: c.slli a0, 15 -0x42 0x05 # GOOD: c.slli a0, 16 -0x46 0x05 # GOOD: c.slli a0, 17 -0x4A 0x05 # GOOD: c.slli a0, 18 -0x4E 0x05 # GOOD: c.slli a0, 19 -0x52 0x05 # GOOD: c.slli a0, 20 -0x56 0x05 # GOOD: c.slli a0, 21 -0x5A 0x05 # GOOD: c.slli a0, 22 -0x5E 0x05 # GOOD: c.slli a0, 23 -0x62 0x05 # GOOD: c.slli a0, 24 -0x66 0x05 # GOOD: c.slli a0, 25 -0x6A 0x05 # GOOD: c.slli a0, 26 -0x6E 0x05 # GOOD: c.slli a0, 27 -0x72 0x05 # GOOD: c.slli a0, 28 -0x76 0x05 # GOOD: c.slli a0, 29 -0x7A 0x05 # GOOD: c.slli a0, 30 -0x7E 0x05 # GOOD: c.slli a0, 31 -0x02 0x15 # BAD32: invalid instruction encoding -0x02 0x15 # GOOD64: c.slli a0, 32 -0x06 0x15 # BAD32: invalid instruction encoding -0x06 0x15 # GOOD64: c.slli a0, 33 -0x0A 0x15 # BAD32: invalid instruction encoding -0x0A 0x15 # GOOD64: c.slli a0, 34 -0x0E 0x15 # BAD32: invalid instruction encoding -0x0E 0x15 # GOOD64: c.slli a0, 35 -0x12 0x15 # BAD32: invalid instruction encoding -0x12 0x15 # GOOD64: c.slli a0, 36 -0x16 0x15 # BAD32: invalid instruction encoding -0x16 0x15 # GOOD64: c.slli a0, 37 -0x1A 0x15 # BAD32: invalid instruction encoding -0x1A 0x15 # GOOD64: c.slli a0, 38 -0x1E 0x15 # BAD32: invalid instruction encoding -0x1E 0x15 # GOOD64: c.slli a0, 39 -0x22 0x15 # BAD32: invalid instruction encoding -0x22 0x15 # GOOD64: c.slli a0, 40 -0x26 0x15 # BAD32: invalid instruction encoding -0x26 0x15 # GOOD64: c.slli a0, 41 -0x2A 0x15 # BAD32: invalid instruction encoding -0x2A 0x15 # GOOD64: c.slli a0, 42 -0x2E 0x15 # BAD32: invalid instruction encoding -0x2E 0x15 # GOOD64: c.slli a0, 43 -0x32 0x15 # BAD32: invalid instruction encoding -0x32 0x15 # GOOD64: c.slli a0, 44 -0x36 0x15 # BAD32: invalid instruction encoding -0x36 0x15 # GOOD64: c.slli a0, 45 -0x3A 0x15 # BAD32: invalid instruction encoding -0x3A 0x15 # GOOD64: c.slli a0, 46 -0x3E 0x15 # BAD32: invalid instruction encoding -0x3E 0x15 # GOOD64: c.slli a0, 47 -0x42 0x15 # BAD32: invalid instruction encoding -0x42 0x15 # GOOD64: c.slli a0, 48 -0x46 0x15 # BAD32: invalid instruction encoding -0x46 0x15 # GOOD64: c.slli a0, 49 -0x4A 0x15 # BAD32: invalid instruction encoding -0x4A 0x15 # GOOD64: c.slli a0, 50 -0x4E 0x15 # BAD32: invalid instruction encoding -0x4E 0x15 # GOOD64: c.slli a0, 51 -0x52 0x15 # BAD32: 
invalid instruction encoding
-0x52 0x15 # GOOD64: c.slli a0, 52
-0x56 0x15 # BAD32: invalid instruction encoding
-0x56 0x15 # GOOD64: c.slli a0, 53
-0x5A 0x15 # BAD32: invalid instruction encoding
-0x5A 0x15 # GOOD64: c.slli a0, 54
-0x5E 0x15 # BAD32: invalid instruction encoding
-0x5E 0x15 # GOOD64: c.slli a0, 55
-0x62 0x15 # BAD32: invalid instruction encoding
-0x62 0x15 # GOOD64: c.slli a0, 56
-0x66 0x15 # BAD32: invalid instruction encoding
-0x66 0x15 # GOOD64: c.slli a0, 57
-0x6A 0x15 # BAD32: invalid instruction encoding
-0x6A 0x15 # GOOD64: c.slli a0, 58
-0x6E 0x15 # BAD32: invalid instruction encoding
-0x6E 0x15 # GOOD64: c.slli a0, 59
-0x72 0x15 # BAD32: invalid instruction encoding
-0x72 0x15 # GOOD64: c.slli a0, 60
-0x76 0x15 # BAD32: invalid instruction encoding
-0x76 0x15 # GOOD64: c.slli a0, 61
-0x7A 0x15 # BAD32: invalid instruction encoding
-0x7A 0x15 # GOOD64: c.slli a0, 62
-0x7E 0x15 # BAD32: invalid instruction encoding
-0x7E 0x15 # GOOD64: c.slli a0, 63
-# GOOD: c.slli64 a1
+0x26 0x00
+
+# GOOD: c.slli zero, 10
 # NOHINTS: invalid instruction encoding
-0x82 0x05
-0x86 0x05 # GOOD: c.slli a1, 1
-0x8A 0x05 # GOOD: c.slli a1, 2
-0x8E 0x05 # GOOD: c.slli a1, 3
-0x92 0x05 # GOOD: c.slli a1, 4
-0x96 0x05 # GOOD: c.slli a1, 5
-0x9A 0x05 # GOOD: c.slli a1, 6
-0x9E 0x05 # GOOD: c.slli a1, 7
-0xA2 0x05 # GOOD: c.slli a1, 8
-0xA6 0x05 # GOOD: c.slli a1, 9
-0xAA 0x05 # GOOD: c.slli a1, 10
-0xAE 0x05 # GOOD: c.slli a1, 11
-0xB2 0x05 # GOOD: c.slli a1, 12
-0xB6 0x05 # GOOD: c.slli a1, 13
-0xBA 0x05 # GOOD: c.slli a1, 14
-0xBE 0x05 # GOOD: c.slli a1, 15
-0xC2 0x05 # GOOD: c.slli a1, 16
-0xC6 0x05 # GOOD: c.slli a1, 17
-0xCA 0x05 # GOOD: c.slli a1, 18
-0xCE 0x05 # GOOD: c.slli a1, 19
-0xD2 0x05 # GOOD: c.slli a1, 20
-0xD6 0x05 # GOOD: c.slli a1, 21
-0xDA 0x05 # GOOD: c.slli a1, 22
-0xDE 0x05 # GOOD: c.slli a1, 23
-0xE2 0x05 # GOOD: c.slli a1, 24
-0xE6 0x05 # GOOD: c.slli a1, 25
-0xEA 0x05 # GOOD: c.slli a1, 26
-0xEE 0x05 # GOOD: c.slli a1, 27
-0xF2 0x05 # GOOD: c.slli a1, 28
-0xF6 0x05 # GOOD: c.slli a1, 29
-0xFA 0x05 # GOOD: c.slli a1, 30
-0xFE 0x05 # GOOD: c.slli a1, 31
-0x82 0x15 # BAD32: invalid instruction encoding
-0x82 0x15 # GOOD64: c.slli a1, 32
-0x86 0x15 # BAD32: invalid instruction encoding
-0x86 0x15 # GOOD64: c.slli a1, 33
-0x8A 0x15 # BAD32: invalid instruction encoding
-0x8A 0x15 # GOOD64: c.slli a1, 34
-0x8E 0x15 # BAD32: invalid instruction encoding
-0x8E 0x15 # GOOD64: c.slli a1, 35
-0x92 0x15 # BAD32: invalid instruction encoding
-0x92 0x15 # GOOD64: c.slli a1, 36
-0x96 0x15 # BAD32: invalid instruction encoding
-0x96 0x15 # GOOD64: c.slli a1, 37
-0x9A 0x15 # BAD32: invalid instruction encoding
-0x9A 0x15 # GOOD64: c.slli a1, 38
-0x9E 0x15 # BAD32: invalid instruction encoding
-0x9E 0x15 # GOOD64: c.slli a1, 39
-0xA2 0x15 # BAD32: invalid instruction encoding
-0xA2 0x15 # GOOD64: c.slli a1, 40
-0xA6 0x15 # BAD32: invalid instruction encoding
-0xA6 0x15 # GOOD64: c.slli a1, 41
-0xAA 0x15 # BAD32: invalid instruction encoding
-0xAA 0x15 # GOOD64: c.slli a1, 42
-0xAE 0x15 # BAD32: invalid instruction encoding
-0xAE 0x15 # GOOD64: c.slli a1, 43
-0xB2 0x15 # BAD32: invalid instruction encoding
-0xB2 0x15 # GOOD64: c.slli a1, 44
-0xB6 0x15 # BAD32: invalid instruction encoding
-0xB6 0x15 # GOOD64: c.slli a1, 45
-0xBA 0x15 # BAD32: invalid instruction encoding
-0xBA 0x15 # GOOD64: c.slli a1, 46
-0xBE 0x15 # BAD32: invalid instruction encoding
-0xBE 0x15 # GOOD64: c.slli a1, 47
-0xC2 0x15 # BAD32: invalid instruction encoding
-0xC2 0x15 # GOOD64: c.slli a1, 48
-0xC6 0x15 # BAD32: invalid instruction encoding
-0xC6 0x15 # GOOD64: c.slli a1, 49
-0xCA 0x15 # BAD32: invalid instruction encoding
-0xCA 0x15 # GOOD64: c.slli a1, 50
-0xCE 0x15 # BAD32: invalid instruction encoding
-0xCE 0x15 # GOOD64: c.slli a1, 51
-0xD2 0x15 # BAD32: invalid instruction encoding
-0xD2 0x15 # GOOD64: c.slli a1, 52
-0xD6 0x15 # BAD32: invalid instruction encoding
-0xD6 0x15 # GOOD64: c.slli a1, 53
-0xDA 0x15 # BAD32: invalid instruction encoding
-0xDA 0x15 # GOOD64: c.slli a1, 54
-0xDE 0x15 # BAD32: invalid instruction encoding
-0xDE 0x15 # GOOD64: c.slli a1, 55
-0xE2 0x15 # BAD32: invalid instruction encoding
-0xE2 0x15 # GOOD64: c.slli a1, 56
-0xE6 0x15 # BAD32: invalid instruction encoding
-0xE6 0x15 # GOOD64: c.slli a1, 57
-0xEA 0x15 # BAD32: invalid instruction encoding
-0xEA 0x15 # GOOD64: c.slli a1, 58
-0xEE 0x15 # BAD32: invalid instruction encoding
-0xEE 0x15 # GOOD64: c.slli a1, 59
-0xF2 0x15 # BAD32: invalid instruction encoding
-0xF2 0x15 # GOOD64: c.slli a1, 60
-0xF6 0x15 # BAD32: invalid instruction encoding
-0xF6 0x15 # GOOD64: c.slli a1, 61
-0xFA 0x15 # BAD32: invalid instruction encoding
-0xFA 0x15 # GOOD64: c.slli a1, 62
-0xFE 0x15 # BAD32: invalid instruction encoding
-0xFE 0x15 # GOOD64: c.slli a1, 63
-# GOOD: c.slli64 a2
+0x2A 0x00
+
+# GOOD: c.slli zero, 11
 # NOHINTS: invalid instruction encoding
-0x02 0x06
-0x06 0x06 # GOOD: c.slli a2, 1
-0x0A 0x06 # GOOD: c.slli a2, 2
-0x0E 0x06 # GOOD: c.slli a2, 3
-0x12 0x06 # GOOD: c.slli a2, 4
-0x16 0x06 # GOOD: c.slli a2, 5
-0x1A 0x06 # GOOD: c.slli a2, 6
-0x1E 0x06 # GOOD: c.slli a2, 7
-0x22 0x06 # GOOD: c.slli a2, 8
-0x26 0x06 # GOOD: c.slli a2, 9
-0x2A 0x06 # GOOD: c.slli a2, 10
-0x2E 0x06 # GOOD: c.slli a2, 11
-0x32 0x06 # GOOD: c.slli a2, 12
-0x36 0x06 # GOOD: c.slli a2, 13
-0x3A 0x06 # GOOD: c.slli a2, 14
-0x3E 0x06 # GOOD: c.slli a2, 15
-0x42 0x06 # GOOD: c.slli a2, 16
-0x46 0x06 # GOOD: c.slli a2, 17
-0x4A 0x06 # GOOD: c.slli a2, 18
-0x4E 0x06 # GOOD: c.slli a2, 19
-0x52 0x06 # GOOD: c.slli a2, 20
-0x56 0x06 # GOOD: c.slli a2, 21
-0x5A 0x06 # GOOD: c.slli a2, 22
-0x5E 0x06 # GOOD: c.slli a2, 23
-0x62 0x06 # GOOD: c.slli a2, 24
-0x66 0x06 # GOOD: c.slli a2, 25
-0x6A 0x06 # GOOD: c.slli a2, 26
-0x6E 0x06 # GOOD: c.slli a2, 27
-0x72 0x06 # GOOD: c.slli a2, 28
-0x76 0x06 # GOOD: c.slli a2, 29
-0x7A 0x06 # GOOD: c.slli a2, 30
-0x7E 0x06 # GOOD: c.slli a2, 31
-0x02 0x16 # BAD32: invalid instruction encoding
-0x02 0x16 # GOOD64: c.slli a2, 32
-0x06 0x16 # BAD32: invalid instruction encoding
-0x06 0x16 # GOOD64: c.slli a2, 33
-0x0A 0x16 # BAD32: invalid instruction encoding
-0x0A 0x16 # GOOD64: c.slli a2, 34
-0x0E 0x16 # BAD32: invalid instruction encoding
-0x0E 0x16 # GOOD64: c.slli a2, 35
-0x12 0x16 # BAD32: invalid instruction encoding
-0x12 0x16 # GOOD64: c.slli a2, 36
-0x16 0x16 # BAD32: invalid instruction encoding
-0x16 0x16 # GOOD64: c.slli a2, 37
-0x1A 0x16 # BAD32: invalid instruction encoding
-0x1A 0x16 # GOOD64: c.slli a2, 38
-0x1E 0x16 # BAD32: invalid instruction encoding
-0x1E 0x16 # GOOD64: c.slli a2, 39
-0x22 0x16 # BAD32: invalid instruction encoding
-0x22 0x16 # GOOD64: c.slli a2, 40
-0x26 0x16 # BAD32: invalid instruction encoding
-0x26 0x16 # GOOD64: c.slli a2, 41
-0x2A 0x16 # BAD32: invalid instruction encoding
-0x2A 0x16 # GOOD64: c.slli a2, 42
-0x2E 0x16 # BAD32: invalid instruction encoding
-0x2E 0x16 # GOOD64: c.slli a2, 43
-0x32 0x16 # BAD32: invalid instruction encoding
-0x32 0x16 # GOOD64: c.slli a2, 44
-0x36 0x16 # BAD32: invalid instruction encoding
-0x36 0x16 # GOOD64: c.slli a2, 45
-0x3A 0x16 # BAD32: invalid instruction encoding
-0x3A 0x16 # GOOD64: c.slli a2, 46
-0x3E 0x16 # BAD32: invalid instruction encoding
-0x3E 0x16 # GOOD64: c.slli a2, 47
-0x42 0x16 # BAD32: invalid instruction encoding
-0x42 0x16 # GOOD64: c.slli a2, 48
-0x46 0x16 # BAD32: invalid instruction encoding
-0x46 0x16 # GOOD64: c.slli a2, 49
-0x4A 0x16 # BAD32: invalid instruction encoding
-0x4A 0x16 # GOOD64: c.slli a2, 50
-0x4E 0x16 # BAD32: invalid instruction encoding
-0x4E 0x16 # GOOD64: c.slli a2, 51
-0x52 0x16 # BAD32: invalid instruction encoding
-0x52 0x16 # GOOD64: c.slli a2, 52
-0x56 0x16 # BAD32: invalid instruction encoding
-0x56 0x16 # GOOD64: c.slli a2, 53
-0x5A 0x16 # BAD32: invalid instruction encoding
-0x5A 0x16 # GOOD64: c.slli a2, 54
-0x5E 0x16 # BAD32: invalid instruction encoding
-0x5E 0x16 # GOOD64: c.slli a2, 55
-0x62 0x16 # BAD32: invalid instruction encoding
-0x62 0x16 # GOOD64: c.slli a2, 56
-0x66 0x16 # BAD32: invalid instruction encoding
-0x66 0x16 # GOOD64: c.slli a2, 57
-0x6A 0x16 # BAD32: invalid instruction encoding
-0x6A 0x16 # GOOD64: c.slli a2, 58
-0x6E 0x16 # BAD32: invalid instruction encoding
-0x6E 0x16 # GOOD64: c.slli a2, 59
-0x72 0x16 # BAD32: invalid instruction encoding
-0x72 0x16 # GOOD64: c.slli a2, 60
-0x76 0x16 # BAD32: invalid instruction encoding
-0x76 0x16 # GOOD64: c.slli a2, 61
-0x7A 0x16 # BAD32: invalid instruction encoding
-0x7A 0x16 # GOOD64: c.slli a2, 62
-0x7E 0x16 # BAD32: invalid instruction encoding
-0x7E 0x16 # GOOD64: c.slli a2, 63
-# GOOD: c.slli64 a3
+0x2E 0x00
+
+# GOOD: c.slli zero, 12
 # NOHINTS: invalid instruction encoding
-0x82 0x06
-0x86 0x06 # GOOD: c.slli a3, 1
-0x8A 0x06 # GOOD: c.slli a3, 2
-0x8E 0x06 # GOOD: c.slli a3, 3
-0x92 0x06 # GOOD: c.slli a3, 4
-0x96 0x06 # GOOD: c.slli a3, 5
-0x9A 0x06 # GOOD: c.slli a3, 6
-0x9E 0x06 # GOOD: c.slli a3, 7
-0xA2 0x06 # GOOD: c.slli a3, 8
-0xA6 0x06 # GOOD: c.slli a3, 9
-0xAA 0x06 # GOOD: c.slli a3, 10
-0xAE 0x06 # GOOD: c.slli a3, 11
-0xB2 0x06 # GOOD: c.slli a3, 12
-0xB6 0x06 # GOOD: c.slli a3, 13
-0xBA 0x06 # GOOD: c.slli a3, 14
-0xBE 0x06 # GOOD: c.slli a3, 15
-0xC2 0x06 # GOOD: c.slli a3, 16
-0xC6 0x06 # GOOD: c.slli a3, 17
-0xCA 0x06 # GOOD: c.slli a3, 18
-0xCE 0x06 # GOOD: c.slli a3, 19
-0xD2 0x06 # GOOD: c.slli a3, 20
-0xD6 0x06 # GOOD: c.slli a3, 21
-0xDA 0x06 # GOOD: c.slli a3, 22
-0xDE 0x06 # GOOD: c.slli a3, 23
-0xE2 0x06 # GOOD: c.slli a3, 24
-0xE6 0x06 # GOOD: c.slli a3, 25
-0xEA 0x06 # GOOD: c.slli a3, 26
-0xEE 0x06 # GOOD: c.slli a3, 27
-0xF2 0x06 # GOOD: c.slli a3, 28
-0xF6 0x06 # GOOD: c.slli a3, 29
-0xFA 0x06 # GOOD: c.slli a3, 30
-0xFE 0x06 # GOOD: c.slli a3, 31
-0x82 0x16 # BAD32: invalid instruction encoding
-0x82 0x16 # GOOD64: c.slli a3, 32
-0x86 0x16 # BAD32: invalid instruction encoding
-0x86 0x16 # GOOD64: c.slli a3, 33
-0x8A 0x16 # BAD32: invalid instruction encoding
-0x8A 0x16 # GOOD64: c.slli a3, 34
-0x8E 0x16 # BAD32: invalid instruction encoding
-0x8E 0x16 # GOOD64: c.slli a3, 35
-0x92 0x16 # BAD32: invalid instruction encoding
-0x92 0x16 # GOOD64: c.slli a3, 36
-0x96 0x16 # BAD32: invalid instruction encoding
-0x96 0x16 # GOOD64: c.slli a3, 37
-0x9A 0x16 # BAD32: invalid instruction encoding
-0x9A 0x16 # GOOD64: c.slli a3, 38
-0x9E 0x16 # BAD32: invalid instruction encoding
-0x9E 0x16 # GOOD64: c.slli a3, 39
-0xA2 0x16 # BAD32: invalid instruction encoding
-0xA2 0x16 # GOOD64: c.slli a3, 40
-0xA6 0x16 # BAD32: invalid instruction encoding
-0xA6 0x16 # GOOD64: c.slli a3, 41
-0xAA 0x16 # BAD32: invalid instruction encoding
-0xAA 0x16 # GOOD64: c.slli a3, 42
-0xAE 0x16 # BAD32: invalid instruction encoding
-0xAE 0x16 # GOOD64: c.slli a3, 43
-0xB2 0x16 # BAD32: invalid instruction encoding
-0xB2 0x16 # GOOD64: c.slli a3, 44
-0xB6 0x16 # BAD32: invalid instruction encoding
-0xB6 0x16 # GOOD64: c.slli a3, 45
-0xBA 0x16 # BAD32: invalid instruction encoding
-0xBA 0x16 # GOOD64: c.slli a3, 46
-0xBE 0x16 # BAD32: invalid instruction encoding
-0xBE 0x16 # GOOD64: c.slli a3, 47
-0xC2 0x16 # BAD32: invalid instruction encoding
-0xC2 0x16 # GOOD64: c.slli a3, 48
-0xC6 0x16 # BAD32: invalid instruction encoding
-0xC6 0x16 # GOOD64: c.slli a3, 49
-0xCA 0x16 # BAD32: invalid instruction encoding
-0xCA 0x16 # GOOD64: c.slli a3, 50
-0xCE 0x16 # BAD32: invalid instruction encoding
-0xCE 0x16 # GOOD64: c.slli a3, 51
-0xD2 0x16 # BAD32: invalid instruction encoding
-0xD2 0x16 # GOOD64: c.slli a3, 52
-0xD6 0x16 # BAD32: invalid instruction encoding
-0xD6 0x16 # GOOD64: c.slli a3, 53
-0xDA 0x16 # BAD32: invalid instruction encoding
-0xDA 0x16 # GOOD64: c.slli a3, 54
-0xDE 0x16 # BAD32: invalid instruction encoding
-0xDE 0x16 # GOOD64: c.slli a3, 55
-0xE2 0x16 # BAD32: invalid instruction encoding
-0xE2 0x16 # GOOD64: c.slli a3, 56
-0xE6 0x16 # BAD32: invalid instruction encoding
-0xE6 0x16 # GOOD64: c.slli a3, 57
-0xEA 0x16 # BAD32: invalid instruction encoding
-0xEA 0x16 # GOOD64: c.slli a3, 58
-0xEE 0x16 # BAD32: invalid instruction encoding
-0xEE 0x16 # GOOD64: c.slli a3, 59
-0xF2 0x16 # BAD32: invalid instruction encoding
-0xF2 0x16 # GOOD64: c.slli a3, 60
-0xF6 0x16 # BAD32: invalid instruction encoding
-0xF6 0x16 # GOOD64: c.slli a3, 61
-0xFA 0x16 # BAD32: invalid instruction encoding
-0xFA 0x16 # GOOD64: c.slli a3, 62
-0xFE 0x16 # BAD32: invalid instruction encoding
-0xFE 0x16 # GOOD64: c.slli a3, 63
-# GOOD: c.slli64 a4
+0x32 0x00
+
+# GOOD: c.slli zero, 13
 # NOHINTS: invalid instruction encoding
-0x02 0x07
-0x06 0x07 # GOOD: c.slli a4, 1
-0x0A 0x07 # GOOD: c.slli a4, 2
-0x0E 0x07 # GOOD: c.slli a4, 3
-0x12 0x07 # GOOD: c.slli a4, 4
-0x16 0x07 # GOOD: c.slli a4, 5
-0x1A 0x07 # GOOD: c.slli a4, 6
-0x1E 0x07 # GOOD: c.slli a4, 7
-0x22 0x07 # GOOD: c.slli a4, 8
-0x26 0x07 # GOOD: c.slli a4, 9
-0x2A 0x07 # GOOD: c.slli a4, 10
-0x2E 0x07 # GOOD: c.slli a4, 11
-0x32 0x07 # GOOD: c.slli a4, 12
-0x36 0x07 # GOOD: c.slli a4, 13
-0x3A 0x07 # GOOD: c.slli a4, 14
-0x3E 0x07 # GOOD: c.slli a4, 15
-0x42 0x07 # GOOD: c.slli a4, 16
-0x46 0x07 # GOOD: c.slli a4, 17
-0x4A 0x07 # GOOD: c.slli a4, 18
-0x4E 0x07 # GOOD: c.slli a4, 19
-0x52 0x07 # GOOD: c.slli a4, 20
-0x56 0x07 # GOOD: c.slli a4, 21
-0x5A 0x07 # GOOD: c.slli a4, 22
-0x5E 0x07 # GOOD: c.slli a4, 23
-0x62 0x07 # GOOD: c.slli a4, 24
-0x66 0x07 # GOOD: c.slli a4, 25
-0x6A 0x07 # GOOD: c.slli a4, 26
-0x6E 0x07 # GOOD: c.slli a4, 27
-0x72 0x07 # GOOD: c.slli a4, 28
-0x76 0x07 # GOOD: c.slli a4, 29
-0x7A 0x07 # GOOD: c.slli a4, 30
-0x7E 0x07 # GOOD: c.slli a4, 31
-0x02 0x17 # BAD32: invalid instruction encoding
-0x02 0x17 # GOOD64: c.slli a4, 32
-0x06 0x17 # BAD32: invalid instruction encoding
-0x06 0x17 # GOOD64: c.slli a4, 33
-0x0A 0x17 # BAD32: invalid instruction encoding
-0x0A 0x17 # GOOD64: c.slli a4, 34
-0x0E 0x17 # BAD32: invalid instruction encoding
-0x0E 0x17 # GOOD64: c.slli a4, 35
-0x12 0x17 # BAD32: invalid instruction encoding
-0x12 0x17 # GOOD64: c.slli a4, 36
-0x16 0x17 # BAD32: invalid instruction encoding
-0x16 0x17 # GOOD64: c.slli a4, 37
-0x1A 0x17 # BAD32: invalid instruction encoding
-0x1A 0x17 # GOOD64: c.slli a4, 38
-0x1E 0x17 # BAD32: invalid instruction encoding
-0x1E 0x17 # GOOD64: c.slli a4, 39
-0x22 0x17 # BAD32: invalid instruction encoding
-0x22 0x17 # GOOD64: c.slli a4, 40
-0x26 0x17 # BAD32: invalid instruction encoding
-0x26 0x17 # GOOD64: c.slli a4, 41
-0x2A 0x17 # BAD32: invalid instruction encoding
-0x2A 0x17 # GOOD64: c.slli a4, 42
-0x2E 0x17 # BAD32: invalid instruction encoding
-0x2E 0x17 # GOOD64: c.slli a4, 43
-0x32 0x17 # BAD32: invalid instruction encoding
-0x32 0x17 # GOOD64: c.slli a4, 44
-0x36 0x17 # BAD32: invalid instruction encoding
-0x36 0x17 # GOOD64: c.slli a4, 45
-0x3A 0x17 # BAD32: invalid instruction encoding
-0x3A 0x17 # GOOD64: c.slli a4, 46
-0x3E 0x17 # BAD32: invalid instruction encoding
-0x3E 0x17 # GOOD64: c.slli a4, 47
-0x42 0x17 # BAD32: invalid instruction encoding
-0x42 0x17 # GOOD64: c.slli a4, 48
-0x46 0x17 # BAD32: invalid instruction encoding
-0x46 0x17 # GOOD64: c.slli a4, 49
-0x4A 0x17 # BAD32: invalid instruction encoding
-0x4A 0x17 # GOOD64: c.slli a4, 50
-0x4E 0x17 # BAD32: invalid instruction encoding
-0x4E 0x17 # GOOD64: c.slli a4, 51
-0x52 0x17 # BAD32: invalid instruction encoding
-0x52 0x17 # GOOD64: c.slli a4, 52
-0x56 0x17 # BAD32: invalid instruction encoding
-0x56 0x17 # GOOD64: c.slli a4, 53
-0x5A 0x17 # BAD32: invalid instruction encoding
-0x5A 0x17 # GOOD64: c.slli a4, 54
-0x5E 0x17 # BAD32: invalid instruction encoding
-0x5E 0x17 # GOOD64: c.slli a4, 55
-0x62 0x17 # BAD32: invalid instruction encoding
-0x62 0x17 # GOOD64: c.slli a4, 56
-0x66 0x17 # BAD32: invalid instruction encoding
-0x66 0x17 # GOOD64: c.slli a4, 57
-0x6A 0x17 # BAD32: invalid instruction encoding
-0x6A 0x17 # GOOD64: c.slli a4, 58
-0x6E 0x17 # BAD32: invalid instruction encoding
-0x6E 0x17 # GOOD64: c.slli a4, 59
-0x72 0x17 # BAD32: invalid instruction encoding
-0x72 0x17 # GOOD64: c.slli a4, 60
-0x76 0x17 # BAD32: invalid instruction encoding
-0x76 0x17 # GOOD64: c.slli a4, 61
-0x7A 0x17 # BAD32: invalid instruction encoding
-0x7A 0x17 # GOOD64: c.slli a4, 62
-0x7E 0x17 # BAD32: invalid instruction encoding
-0x7E 0x17 # GOOD64: c.slli a4, 63
-# GOOD: c.slli64 a5
+0x36 0x00
+
+# GOOD: c.slli zero, 14
 # NOHINTS: invalid instruction encoding
-0x82 0x07
-0x86 0x07 # GOOD: c.slli a5, 1
-0x8A 0x07 # GOOD: c.slli a5, 2
-0x8E 0x07 # GOOD: c.slli a5, 3
-0x92 0x07 # GOOD: c.slli a5, 4
-0x96 0x07 # GOOD: c.slli a5, 5
-0x9A 0x07 # GOOD: c.slli a5, 6
-0x9E 0x07 # GOOD: c.slli a5, 7
-0xA2 0x07 # GOOD: c.slli a5, 8
-0xA6 0x07 # GOOD: c.slli a5, 9
-0xAA 0x07 # GOOD: c.slli a5, 10
-0xAE 0x07 # GOOD: c.slli a5, 11
-0xB2 0x07 # GOOD: c.slli a5, 12
-0xB6 0x07 # GOOD: c.slli a5, 13
-0xBA 0x07 # GOOD: c.slli a5, 14
-0xBE 0x07 # GOOD: c.slli a5, 15
-0xC2 0x07 # GOOD: c.slli a5, 16
-0xC6 0x07 # GOOD: c.slli a5, 17
-0xCA 0x07 # GOOD: c.slli a5, 18
-0xCE 0x07 # GOOD: c.slli a5, 19
-0xD2 0x07 # GOOD: c.slli a5, 20
-0xD6 0x07 # GOOD: c.slli a5, 21
-0xDA 0x07 # GOOD: c.slli a5, 22
-0xDE 0x07 # GOOD: c.slli a5, 23
-0xE2 0x07 # GOOD: c.slli a5, 24
-0xE6 0x07 # GOOD: c.slli a5, 25
-0xEA 0x07 # GOOD: c.slli a5, 26
-0xEE 0x07 # GOOD: c.slli a5, 27
-0xF2 0x07 # GOOD: c.slli a5, 28
-0xF6 0x07 # GOOD: c.slli a5, 29
-0xFA 0x07 # GOOD: c.slli a5, 30
-0xFE 0x07 # GOOD: c.slli a5, 31
-0x82 0x17 # BAD32: invalid instruction encoding
-0x82 0x17 # GOOD64: c.slli a5, 32
-0x86 0x17 # BAD32: invalid instruction encoding
-0x86 0x17 # GOOD64: c.slli a5, 33
-0x8A 0x17 # BAD32: invalid instruction encoding
-0x8A 0x17 # GOOD64: c.slli a5, 34
-0x8E 0x17 # BAD32: invalid instruction encoding
-0x8E 0x17 # GOOD64: c.slli a5, 35
-0x92 0x17 # BAD32: invalid instruction encoding
-0x92 0x17 # GOOD64: c.slli a5, 36
-0x96 0x17 # BAD32: invalid instruction encoding
-0x96 0x17 # GOOD64: c.slli a5, 37
-0x9A 0x17 # BAD32: invalid instruction encoding
-0x9A 0x17 # GOOD64: c.slli a5, 38
-0x9E 0x17 # BAD32: invalid instruction encoding
-0x9E 0x17 # GOOD64: c.slli a5, 39
-0xA2 0x17 # BAD32: invalid instruction encoding
-0xA2 0x17 # GOOD64: c.slli a5, 40
-0xA6 0x17 # BAD32: invalid instruction encoding
-0xA6 0x17 # GOOD64: c.slli a5, 41
-0xAA 0x17 # BAD32: invalid instruction encoding
-0xAA 0x17 # GOOD64: c.slli a5, 42
-0xAE 0x17 # BAD32: invalid instruction encoding
-0xAE 0x17 # GOOD64: c.slli a5, 43
-0xB2 0x17 # BAD32: invalid instruction encoding
-0xB2 0x17 # GOOD64: c.slli a5, 44
-0xB6 0x17 # BAD32: invalid instruction encoding
-0xB6 0x17 # GOOD64: c.slli a5, 45
-0xBA 0x17 # BAD32: invalid instruction encoding
-0xBA 0x17 # GOOD64: c.slli a5, 46
-0xBE 0x17 # BAD32: invalid instruction encoding
-0xBE 0x17 # GOOD64: c.slli a5, 47
-0xC2 0x17 # BAD32: invalid instruction encoding
-0xC2 0x17 # GOOD64: c.slli a5, 48
-0xC6 0x17 # BAD32: invalid instruction encoding
-0xC6 0x17 # GOOD64: c.slli a5, 49
-0xCA 0x17 # BAD32: invalid instruction encoding
-0xCA 0x17 # GOOD64: c.slli a5, 50
-0xCE 0x17 # BAD32: invalid instruction encoding
-0xCE 0x17 # GOOD64: c.slli a5, 51
-0xD2 0x17 # BAD32: invalid instruction encoding
-0xD2 0x17 # GOOD64: c.slli a5, 52
-0xD6 0x17 # BAD32: invalid instruction encoding
-0xD6 0x17 # GOOD64: c.slli a5, 53
-0xDA 0x17 # BAD32: invalid instruction encoding
-0xDA 0x17 # GOOD64: c.slli a5, 54
-0xDE 0x17 # BAD32: invalid instruction encoding
-0xDE 0x17 # GOOD64: c.slli a5, 55
-0xE2 0x17 # BAD32: invalid instruction encoding
-0xE2 0x17 # GOOD64: c.slli a5, 56
-0xE6 0x17 # BAD32: invalid instruction encoding
-0xE6 0x17 # GOOD64: c.slli a5, 57
-0xEA 0x17 # BAD32: invalid instruction encoding
-0xEA 0x17 # GOOD64: c.slli a5, 58
-0xEE 0x17 # BAD32: invalid instruction encoding
-0xEE 0x17 # GOOD64: c.slli a5, 59
-0xF2 0x17 # BAD32: invalid instruction encoding
-0xF2 0x17 # GOOD64: c.slli a5, 60
-0xF6 0x17 # BAD32: invalid instruction encoding
-0xF6 0x17 # GOOD64: c.slli a5, 61
-0xFA 0x17 # BAD32: invalid instruction encoding
-0xFA 0x17 # GOOD64: c.slli a5, 62
-0xFE 0x17 # BAD32: invalid instruction encoding
-0xFE 0x17 # GOOD64: c.slli a5, 63
-# GOOD: c.slli64 a6
+0x3A 0x00
+
+# GOOD: c.slli zero, 15
 # NOHINTS: invalid instruction encoding
-0x02 0x08
-0x06 0x08 # GOOD: c.slli a6, 1
-0x0A 0x08 # GOOD: c.slli a6, 2
-0x0E 0x08 # GOOD: c.slli a6, 3
-0x12 0x08 # GOOD: c.slli a6, 4
-0x16 0x08 # GOOD: c.slli a6, 5
-0x1A 0x08 # GOOD: c.slli a6, 6
-0x1E 0x08 # GOOD: c.slli a6, 7
-0x22 0x08 # GOOD: c.slli a6, 8
-0x26 0x08 # GOOD: c.slli a6, 9
-0x2A 0x08 # GOOD: c.slli a6, 10
-0x2E 0x08 # GOOD: c.slli a6, 11
-0x32 0x08 # GOOD: c.slli a6, 12
-0x36 0x08 # GOOD: c.slli a6, 13
-0x3A 0x08 # GOOD: c.slli a6, 14
-0x3E 0x08 # GOOD: c.slli a6, 15
-0x42 0x08 # GOOD: c.slli a6, 16
-0x46 0x08 # GOOD: c.slli a6, 17
-0x4A 0x08 # GOOD: c.slli a6, 18
-0x4E 0x08 # GOOD: c.slli a6, 19
-0x52 0x08 # GOOD: c.slli a6, 20
-0x56 0x08 # GOOD: c.slli a6, 21
-0x5A 0x08 # GOOD: c.slli a6, 22
-0x5E 0x08 # GOOD: c.slli a6, 23
-0x62 0x08 # GOOD: c.slli a6, 24
-0x66 0x08 # GOOD: c.slli a6, 25
-0x6A 0x08 # GOOD: c.slli a6, 26
-0x6E 0x08 # GOOD: c.slli a6, 27
-0x72 0x08 # GOOD: c.slli a6, 28
-0x76 0x08 # GOOD: c.slli a6, 29
-0x7A 0x08 # GOOD: c.slli a6, 30
-0x7E 0x08 # GOOD: c.slli a6, 31
-0x02 0x18 # BAD32: invalid instruction encoding
-0x02 0x18 # GOOD64: c.slli a6, 32
-0x06 0x18 # BAD32: invalid instruction encoding
-0x06 0x18 # GOOD64: c.slli a6, 33
-0x0A 0x18 # BAD32: invalid instruction encoding
-0x0A 0x18 # GOOD64: c.slli a6, 34
-0x0E 0x18 # BAD32: invalid instruction encoding
-0x0E 0x18 # GOOD64: c.slli a6, 35
-0x12 0x18 # BAD32: invalid instruction encoding
-0x12 0x18 # GOOD64: c.slli a6, 36
-0x16 0x18 # BAD32: invalid instruction encoding
-0x16 0x18 # GOOD64: c.slli a6, 37
-0x1A 0x18 # BAD32: invalid instruction encoding
-0x1A 0x18 # GOOD64: c.slli a6, 38
-0x1E 0x18 # BAD32: invalid instruction encoding
-0x1E 0x18 # GOOD64: c.slli a6, 39
-0x22 0x18 # BAD32: invalid instruction encoding
-0x22 0x18 # GOOD64: c.slli a6, 40
-0x26 0x18 # BAD32: invalid instruction encoding
-0x26 0x18 # GOOD64: c.slli a6, 41
-0x2A 0x18 # BAD32: invalid instruction encoding
-0x2A 0x18 # GOOD64: c.slli a6, 42
-0x2E 0x18 # BAD32: invalid instruction encoding
-0x2E 0x18 # GOOD64: c.slli a6, 43
-0x32 0x18 # BAD32: invalid instruction encoding
-0x32 0x18 # GOOD64: c.slli a6, 44
-0x36 0x18 # BAD32: invalid instruction encoding
-0x36 0x18 # GOOD64: c.slli a6, 45
-0x3A 0x18 # BAD32: invalid instruction encoding
-0x3A 0x18 # GOOD64: c.slli a6, 46
-0x3E 0x18 # BAD32: invalid instruction encoding
-0x3E 0x18 # GOOD64: c.slli a6, 47
-0x42 0x18 # BAD32: invalid instruction encoding
-0x42 0x18 # GOOD64: c.slli a6, 48
-0x46 0x18 # BAD32: invalid instruction encoding
-0x46 0x18 # GOOD64: c.slli a6, 49
-0x4A 0x18 # BAD32: invalid instruction encoding
-0x4A 0x18 # GOOD64: c.slli a6, 50
-0x4E 0x18 # BAD32: invalid instruction encoding
-0x4E 0x18 # GOOD64: c.slli a6, 51
-0x52 0x18 # BAD32: invalid instruction encoding
-0x52 0x18 # GOOD64: c.slli a6, 52
-0x56 0x18 # BAD32: invalid instruction encoding
-0x56 0x18 # GOOD64: c.slli a6, 53
-0x5A 0x18 # BAD32: invalid instruction encoding
-0x5A 0x18 # GOOD64: c.slli a6, 54
-0x5E 0x18 # BAD32: invalid instruction encoding
-0x5E 0x18 # GOOD64: c.slli a6, 55
-0x62 0x18 # BAD32: invalid instruction encoding
-0x62 0x18 # GOOD64: c.slli a6, 56
-0x66 0x18 # BAD32: invalid instruction encoding
-0x66 0x18 # GOOD64: c.slli a6, 57
-0x6A 0x18 # BAD32: invalid instruction encoding
-0x6A 0x18 # GOOD64: c.slli a6, 58
-0x6E 0x18 # BAD32: invalid instruction encoding
-0x6E 0x18 # GOOD64: c.slli a6, 59
-0x72 0x18 # BAD32: invalid instruction encoding
-0x72 0x18 # GOOD64: c.slli a6, 60
-0x76 0x18 # BAD32: invalid instruction encoding
-0x76 0x18 # GOOD64: c.slli a6, 61
-0x7A 0x18 # BAD32: invalid instruction encoding
-0x7A 0x18 # GOOD64: c.slli a6, 62
-0x7E 0x18 # BAD32: invalid instruction encoding
-0x7E 0x18 # GOOD64: c.slli a6, 63
-# GOOD: c.slli64 a7
+0x3E 0x00
+
+# GOOD: c.slli zero, 16
 # NOHINTS: invalid instruction encoding
-0x82 0x08
-0x86 0x08 # GOOD: c.slli a7, 1
-0x8A 0x08 # GOOD: c.slli a7, 2
-0x8E 0x08 # GOOD: c.slli a7, 3
-0x92 0x08 # GOOD: c.slli a7, 4
-0x96 0x08 # GOOD: c.slli a7, 5
-0x9A 0x08 # GOOD: c.slli a7, 6
-0x9E 0x08 # GOOD: c.slli a7, 7
-0xA2 0x08 # GOOD: c.slli a7, 8
-0xA6 0x08 # GOOD: c.slli a7, 9
-0xAA 0x08 # GOOD: c.slli a7, 10
-0xAE 0x08 # GOOD: c.slli a7, 11
-0xB2 0x08 # GOOD: c.slli a7, 12
-0xB6 0x08 # GOOD: c.slli a7, 13
-0xBA 0x08 # GOOD: c.slli a7, 14
-0xBE 0x08 # GOOD: c.slli a7, 15
-0xC2 0x08 # GOOD: c.slli a7, 16
-0xC6 0x08 # GOOD: c.slli a7, 17
-0xCA 0x08 # GOOD: c.slli a7, 18
-0xCE 0x08 # GOOD: c.slli a7, 19
-0xD2 0x08 # GOOD: c.slli a7, 20
-0xD6 0x08 # GOOD: c.slli a7, 21
-0xDA 0x08 # GOOD: c.slli a7, 22
-0xDE 0x08 # GOOD: c.slli a7, 23
-0xE2 0x08 # GOOD: c.slli a7, 24
-0xE6 0x08 # GOOD: c.slli a7, 25
-0xEA 0x08 # GOOD: c.slli a7, 26
-0xEE 0x08 # GOOD: c.slli a7, 27
-0xF2 0x08 # GOOD: c.slli a7, 28
-0xF6 0x08 # GOOD: c.slli a7, 29
-0xFA 0x08 # GOOD: c.slli a7, 30
-0xFE 0x08 # GOOD: c.slli a7, 31
-0x82 0x18 # BAD32: invalid instruction encoding
-0x82 0x18 # GOOD64: c.slli a7, 32
-0x86 0x18 # BAD32: invalid instruction encoding
-0x86 0x18 # GOOD64: c.slli a7, 33
-0x8A 0x18 # BAD32: invalid instruction encoding
-0x8A 0x18 # GOOD64: c.slli a7, 34
-0x8E 0x18 # BAD32: invalid instruction encoding
-0x8E 0x18 # GOOD64: c.slli a7, 35
-0x92 0x18 # BAD32: invalid instruction encoding
-0x92 0x18 # GOOD64: c.slli a7, 36
-0x96 0x18 # BAD32: invalid instruction encoding
-0x96 0x18 # GOOD64: c.slli a7, 37
-0x9A 0x18 # BAD32: invalid instruction encoding
-0x9A 0x18 # GOOD64: c.slli a7, 38
-0x9E 0x18 # BAD32: invalid instruction encoding
-0x9E 0x18 # GOOD64: c.slli a7, 39
-0xA2 0x18 # BAD32: invalid instruction encoding
-0xA2 0x18 # GOOD64: c.slli a7, 40
-0xA6 0x18 # BAD32: invalid instruction encoding
-0xA6 0x18 # GOOD64: c.slli a7, 41
-0xAA 0x18 # BAD32: invalid instruction encoding
-0xAA 0x18 # GOOD64: c.slli a7, 42
-0xAE 0x18 # BAD32: invalid instruction encoding
-0xAE 0x18 # GOOD64: c.slli a7, 43
-0xB2 0x18 # BAD32: invalid instruction encoding
-0xB2 0x18 # GOOD64: c.slli a7, 44
-0xB6 0x18 # BAD32: invalid instruction encoding
-0xB6 0x18 # GOOD64: c.slli a7, 45
-0xBA 0x18 # BAD32: invalid instruction encoding
-0xBA 0x18 # GOOD64: c.slli a7, 46
-0xBE 0x18 # BAD32: invalid instruction encoding
-0xBE 0x18 # GOOD64: c.slli a7, 47
-0xC2 0x18 # BAD32: invalid instruction encoding
-0xC2 0x18 # GOOD64: c.slli a7, 48
-0xC6 0x18 # BAD32: invalid instruction encoding
-0xC6 0x18 # GOOD64: c.slli a7, 49
-0xCA 0x18 # BAD32: invalid instruction encoding
-0xCA 0x18 # GOOD64: c.slli a7, 50
-0xCE 0x18 # BAD32: invalid instruction encoding
-0xCE 0x18 # GOOD64: c.slli a7, 51
-0xD2 0x18 # BAD32: invalid instruction encoding
-0xD2 0x18 # GOOD64: c.slli a7, 52
-0xD6 0x18 # BAD32: invalid instruction encoding
-0xD6 0x18 # GOOD64: c.slli a7, 53
-0xDA 0x18 # BAD32: invalid instruction encoding
-0xDA 0x18 # GOOD64: c.slli a7, 54
-0xDE 0x18 # BAD32: invalid instruction encoding
-0xDE 0x18 # GOOD64: c.slli a7, 55
-0xE2 0x18 # BAD32: invalid instruction encoding
-0xE2 0x18 # GOOD64: c.slli a7, 56
-0xE6 0x18 # BAD32: invalid instruction encoding
-0xE6 0x18 # GOOD64: c.slli a7, 57
-0xEA 0x18 # BAD32: invalid instruction encoding
-0xEA 0x18 # GOOD64: c.slli a7, 58
-0xEE 0x18 # BAD32: invalid instruction encoding
-0xEE 0x18 # GOOD64: c.slli a7, 59
-0xF2 0x18 # BAD32: invalid instruction encoding
-0xF2 0x18 # GOOD64: c.slli a7, 60
-0xF6 0x18 # BAD32: invalid instruction encoding
-0xF6 0x18 # GOOD64: c.slli a7, 61
-0xFA 0x18 # BAD32: invalid instruction encoding
-0xFA 0x18 # GOOD64: c.slli a7, 62
-0xFE 0x18 # BAD32: invalid instruction encoding
-0xFE 0x18 # GOOD64: c.slli a7, 63
-# GOOD: c.slli64 s2
+0x42 0x00
+
+# GOOD: c.slli zero, 17
 # NOHINTS: invalid instruction encoding
-0x02 0x09
-0x06 0x09 # GOOD: c.slli s2, 1
-0x0A 0x09 # GOOD: c.slli s2, 2
-0x0E 0x09 # GOOD: c.slli s2, 3
-0x12 0x09 # GOOD: c.slli s2, 4
-0x16 0x09 # GOOD: c.slli s2, 5
-0x1A 0x09 # GOOD: c.slli s2, 6
-0x1E 0x09 # GOOD: c.slli s2, 7
-0x22 0x09 # GOOD: c.slli s2, 8
-0x26 0x09 # GOOD: c.slli s2, 9
-0x2A 0x09 # GOOD: c.slli s2, 10
-0x2E 0x09 # GOOD: c.slli s2, 11
-0x32 0x09 # GOOD: c.slli s2, 12
-0x36 0x09 # GOOD: c.slli s2, 13
-0x3A 0x09 # GOOD: c.slli s2, 14
-0x3E 0x09 # GOOD: c.slli s2, 15
-0x42 0x09 # GOOD: c.slli s2, 16
-0x46 0x09 # GOOD: c.slli s2, 17
-0x4A 0x09 # GOOD: c.slli s2, 18
-0x4E 0x09 # GOOD: c.slli s2, 19
-0x52 0x09 # GOOD: c.slli s2, 20
-0x56 0x09 # GOOD: c.slli s2, 21
-0x5A 0x09 # GOOD: c.slli s2, 22
-0x5E 0x09 # GOOD: c.slli s2, 23
-0x62 0x09 # GOOD: c.slli s2, 24
-0x66 0x09 # GOOD: c.slli s2, 25
-0x6A 0x09 # GOOD: c.slli s2, 26
-0x6E 0x09 # GOOD: c.slli s2, 27
-0x72 0x09 # GOOD: c.slli s2, 28
-0x76 0x09 # GOOD: c.slli s2, 29
-0x7A 0x09 # GOOD: c.slli s2, 30
-0x7E 0x09 # GOOD: c.slli s2, 31
-0x02 0x19 # BAD32: invalid instruction encoding
-0x02 0x19 # GOOD64: c.slli s2, 32
-0x06 0x19 # BAD32: invalid instruction encoding
-0x06 0x19 # GOOD64: c.slli s2, 33
-0x0A 0x19 # BAD32: invalid instruction encoding
-0x0A 0x19 # GOOD64: c.slli s2, 34
-0x0E 0x19 # BAD32: invalid instruction encoding
-0x0E 0x19 # GOOD64: c.slli s2, 35
-0x12 0x19 # BAD32: invalid instruction encoding
-0x12 0x19 # GOOD64: c.slli s2, 36
-0x16 0x19 # BAD32: invalid instruction encoding
-0x16 0x19 # GOOD64: c.slli s2, 37
-0x1A 0x19 # BAD32: invalid instruction encoding
-0x1A 0x19 # GOOD64: c.slli s2, 38
-0x1E 0x19 # BAD32: invalid instruction encoding
-0x1E 0x19 # GOOD64: c.slli s2, 39
-0x22 0x19 # BAD32: invalid instruction encoding
-0x22 0x19 # GOOD64: c.slli s2, 40
-0x26 0x19 # BAD32: invalid instruction encoding
-0x26 0x19 # GOOD64: c.slli s2, 41
-0x2A 0x19 # BAD32: invalid instruction encoding
-0x2A 0x19 # GOOD64: c.slli s2, 42
-0x2E 0x19 # BAD32: invalid instruction encoding
-0x2E 0x19 # GOOD64: c.slli s2, 43
-0x32 0x19 # BAD32: invalid instruction encoding
-0x32 0x19 # GOOD64: c.slli s2, 44
-0x36 0x19 # BAD32: invalid instruction encoding
-0x36 0x19 # GOOD64: c.slli s2, 45
-0x3A 0x19 # BAD32: invalid instruction encoding
-0x3A 0x19 # GOOD64: c.slli s2, 46
-0x3E 0x19 # BAD32: invalid instruction encoding
-0x3E 0x19 # GOOD64: c.slli s2, 47
-0x42 0x19 # BAD32: invalid instruction encoding
-0x42 0x19 # GOOD64: c.slli s2, 48
-0x46 0x19 # BAD32: invalid instruction encoding
-0x46 0x19 # GOOD64: c.slli s2, 49
-0x4A 0x19 # BAD32: invalid instruction encoding
-0x4A 0x19 # GOOD64: c.slli s2, 50
-0x4E 0x19 # BAD32: invalid instruction encoding
-0x4E 0x19 # GOOD64: c.slli s2, 51
-0x52 0x19 # BAD32: invalid instruction encoding
-0x52 0x19 # GOOD64: c.slli s2, 52
-0x56 0x19 # BAD32: invalid instruction encoding
-0x56 0x19 # GOOD64: c.slli s2, 53
-0x5A 0x19 # BAD32: invalid instruction encoding
-0x5A 0x19 # GOOD64: c.slli s2, 54
-0x5E 0x19 # BAD32: invalid instruction encoding
-0x5E 0x19 # GOOD64: c.slli s2, 55
-0x62 0x19 # BAD32: invalid instruction encoding
-0x62 0x19 # GOOD64: c.slli s2, 56
-0x66 0x19 # BAD32: invalid instruction encoding
-0x66 0x19 # GOOD64: c.slli s2, 57
-0x6A 0x19 # BAD32: invalid instruction encoding
-0x6A 0x19 # GOOD64: c.slli s2, 58
-0x6E 0x19 # BAD32: invalid instruction encoding
-0x6E 0x19 # GOOD64: c.slli s2, 59
-0x72 0x19 # BAD32: invalid instruction encoding
-0x72 0x19 # GOOD64: c.slli s2, 60
-0x76 0x19 # BAD32: invalid instruction encoding
-0x76 0x19 # GOOD64: c.slli s2, 61
-0x7A 0x19 # BAD32: invalid instruction encoding
-0x7A 0x19 # GOOD64: c.slli s2, 62
-0x7E 0x19 # BAD32: invalid instruction encoding
-0x7E 0x19 # GOOD64: c.slli s2, 63
-# GOOD: c.slli64 s3
+0x46 0x00
+
+# GOOD: c.slli zero, 18
 # NOHINTS: invalid instruction encoding
-0x82 0x09
-0x86 0x09 # GOOD: c.slli s3, 1
-0x8A 0x09 # GOOD: c.slli s3, 2
-0x8E 0x09 # GOOD: c.slli s3, 3
-0x92 0x09 # GOOD: c.slli s3, 4
-0x96 0x09 # GOOD: c.slli s3, 5
-0x9A 0x09 # GOOD: c.slli s3, 6
-0x9E 0x09 # GOOD: c.slli s3, 7
-0xA2 0x09 # GOOD: c.slli s3, 8
-0xA6 0x09 # GOOD: c.slli s3, 9
-0xAA 0x09 # GOOD: c.slli s3, 10
-0xAE 0x09 # GOOD: c.slli s3, 11
-0xB2 0x09 # GOOD: c.slli s3, 12
-0xB6 0x09 # GOOD: c.slli s3, 13
-0xBA 0x09 # GOOD: c.slli s3, 14
-0xBE 0x09 # GOOD: c.slli s3, 15
-0xC2 0x09 # GOOD: c.slli s3, 16
-0xC6 0x09 # GOOD: c.slli s3, 17
-0xCA 0x09 # GOOD: c.slli s3, 18
-0xCE 0x09 # GOOD: c.slli s3, 19
-0xD2 0x09 # GOOD: c.slli s3, 20
-0xD6 0x09 # GOOD: c.slli s3, 21
-0xDA 0x09 # GOOD: c.slli s3, 22
-0xDE 0x09 # GOOD: c.slli s3, 23
-0xE2 0x09 # GOOD: c.slli s3, 24
-0xE6 0x09 # GOOD: c.slli s3, 25
-0xEA 0x09 # GOOD: c.slli s3, 26
-0xEE 0x09 # GOOD: c.slli s3, 27
-0xF2 0x09 # GOOD: c.slli s3, 28
-0xF6 0x09 # GOOD: c.slli s3, 29
-0xFA 0x09 # GOOD: c.slli s3, 30
-0xFE 0x09 # GOOD: c.slli s3, 31
-0x82 0x19 # BAD32: invalid instruction encoding
-0x82 0x19 # GOOD64: c.slli s3, 32
-0x86 0x19 # BAD32: invalid instruction encoding
-0x86 0x19 # GOOD64: c.slli s3, 33
-0x8A 0x19 # BAD32: invalid instruction encoding
-0x8A 0x19 # GOOD64: c.slli s3, 34
-0x8E 0x19 # BAD32: invalid instruction encoding
-0x8E 0x19 # GOOD64: c.slli s3, 35
-0x92 0x19 # BAD32: invalid instruction encoding
-0x92 0x19 # GOOD64: c.slli s3, 36
-0x96 0x19 # BAD32: invalid instruction encoding
-0x96 0x19 # GOOD64: c.slli s3, 37
-0x9A 0x19 # BAD32: invalid instruction encoding
-0x9A 0x19 # GOOD64: c.slli s3, 38
-0x9E 0x19 # BAD32: invalid instruction encoding
-0x9E 0x19 # GOOD64: c.slli s3, 39
-0xA2 0x19 # BAD32: invalid instruction encoding
-0xA2 0x19 # GOOD64: c.slli s3, 40
-0xA6 0x19 # BAD32: invalid instruction encoding
-0xA6 0x19 # GOOD64: c.slli s3, 41
-0xAA 0x19 # BAD32: invalid instruction encoding
-0xAA 0x19 # GOOD64: c.slli s3, 42
-0xAE 0x19 # BAD32: invalid instruction encoding
-0xAE 0x19 # GOOD64: c.slli s3, 43
-0xB2 0x19 # BAD32: invalid instruction encoding
-0xB2 0x19 # GOOD64: c.slli s3, 44
-0xB6 0x19 # BAD32: invalid instruction encoding
-0xB6 0x19 # GOOD64: c.slli s3, 45
-0xBA 0x19 # BAD32: invalid instruction encoding
-0xBA 0x19 # GOOD64: c.slli s3, 46
-0xBE 0x19 # BAD32: invalid instruction encoding
-0xBE 0x19 # GOOD64: c.slli s3, 47
-0xC2 0x19 # BAD32: invalid instruction encoding
-0xC2 0x19 # GOOD64: c.slli s3, 48
-0xC6 0x19 # BAD32: invalid instruction encoding
-0xC6 0x19 # GOOD64: c.slli s3, 49
-0xCA 0x19 # BAD32: invalid instruction encoding
-0xCA 0x19 # GOOD64: c.slli s3, 50
-0xCE 0x19 # BAD32: invalid instruction encoding
-0xCE 0x19 # GOOD64: c.slli s3, 51
-0xD2 0x19 # BAD32: invalid instruction encoding
-0xD2 0x19 # GOOD64: c.slli s3, 52
-0xD6 0x19 # BAD32: invalid instruction encoding
-0xD6 0x19 # GOOD64: c.slli s3, 53
-0xDA 0x19 # BAD32: invalid instruction encoding
-0xDA 0x19 # GOOD64: c.slli s3, 54
-0xDE 0x19 # BAD32: invalid instruction encoding
-0xDE 0x19 # GOOD64: c.slli s3, 55
-0xE2 0x19 # BAD32: invalid instruction encoding
-0xE2 0x19 # GOOD64: c.slli s3, 56
-0xE6 0x19 # BAD32: invalid instruction encoding
-0xE6 0x19 # GOOD64: c.slli s3, 57
-0xEA 0x19 # BAD32: invalid instruction encoding
-0xEA 0x19 # GOOD64: c.slli s3, 58
-0xEE 0x19 # BAD32: invalid instruction encoding
-0xEE 0x19 # GOOD64: c.slli s3, 59
-0xF2 0x19 # BAD32: invalid instruction encoding
-0xF2 0x19 # GOOD64: c.slli s3, 60
-0xF6 0x19 # BAD32: invalid instruction encoding
-0xF6 0x19 # GOOD64: c.slli s3, 61
-0xFA 0x19 # BAD32: invalid instruction encoding
-0xFA 0x19 # GOOD64: c.slli s3, 62
-0xFE 0x19 # BAD32: invalid instruction encoding
-0xFE 0x19 # GOOD64: c.slli s3, 63
-# GOOD: c.slli64 s4
+0x4A 0x00
+
+# GOOD: c.slli zero, 19
 # NOHINTS: invalid instruction encoding
-0x02 0x0A
-0x06 0x0A # GOOD: c.slli s4, 1
-0x0A 0x0A # GOOD: c.slli s4, 2
-0x0E 0x0A # GOOD: c.slli s4, 3
-0x12 0x0A # GOOD: c.slli s4, 4
-0x16 0x0A # GOOD: c.slli s4, 5
-0x1A 0x0A # GOOD: c.slli s4, 6
-0x1E 0x0A # GOOD: c.slli s4, 7
-0x22 0x0A # GOOD: c.slli s4, 8
-0x26 0x0A # GOOD: c.slli s4, 9
-0x2A 0x0A # GOOD: c.slli s4, 10
-0x2E 0x0A # GOOD: c.slli s4, 11
-0x32 0x0A # GOOD: c.slli s4, 12
-0x36 0x0A # GOOD: c.slli s4, 13
-0x3A 0x0A # GOOD: c.slli s4, 14
-0x3E 0x0A # GOOD: c.slli s4, 15
-0x42 0x0A # GOOD: c.slli s4, 16
-0x46 0x0A # GOOD: c.slli s4, 17
-0x4A 0x0A # GOOD: c.slli s4, 18
-0x4E 0x0A # GOOD: c.slli s4, 19
-0x52 0x0A # GOOD: c.slli s4, 20
-0x56 0x0A # GOOD: c.slli s4, 21
-0x5A 0x0A # GOOD: c.slli s4, 22
-0x5E 0x0A # GOOD: c.slli s4, 23
-0x62 0x0A # GOOD: c.slli s4, 24
-0x66 0x0A # GOOD: c.slli s4, 25
-0x6A 0x0A # GOOD: c.slli s4, 26
-0x6E 0x0A # GOOD: c.slli s4, 27
-0x72 0x0A # GOOD: c.slli s4, 28
-0x76 0x0A # GOOD: c.slli s4, 29
-0x7A 0x0A # GOOD: c.slli s4, 30
-0x7E 0x0A # GOOD: c.slli s4, 31
-0x02 0x1A # BAD32: invalid instruction encoding
-0x02 0x1A # GOOD64: c.slli s4, 32
-0x06 0x1A # BAD32: invalid instruction encoding
-0x06 0x1A # GOOD64: c.slli s4, 33
-0x0A 0x1A # BAD32: invalid instruction encoding
-0x0A 0x1A # GOOD64: c.slli s4, 34
-0x0E 0x1A # BAD32: invalid instruction encoding
-0x0E 0x1A # GOOD64: c.slli s4, 35
-0x12 0x1A # BAD32: invalid instruction encoding
-0x12 0x1A # GOOD64: c.slli s4, 36
-0x16 0x1A # BAD32: invalid instruction encoding
-0x16 0x1A # GOOD64: c.slli s4, 37
-0x1A 0x1A # BAD32: invalid instruction encoding
-0x1A 0x1A # GOOD64: c.slli s4, 38
-0x1E 0x1A # BAD32: invalid instruction encoding
-0x1E 0x1A # GOOD64: c.slli s4, 39
-0x22 0x1A # BAD32: invalid instruction encoding
-0x22 0x1A # GOOD64: c.slli s4, 40
-0x26 0x1A # BAD32: invalid instruction encoding
-0x26 0x1A # GOOD64: c.slli s4, 41
-0x2A 0x1A # BAD32: invalid instruction encoding
-0x2A 0x1A # GOOD64: c.slli s4, 42
-0x2E 0x1A # BAD32: invalid instruction encoding
-0x2E 0x1A # GOOD64: c.slli s4, 43
-0x32 0x1A # BAD32: invalid instruction encoding
-0x32 0x1A # GOOD64: c.slli s4, 44
-0x36 0x1A # BAD32: invalid instruction encoding
-0x36 0x1A # GOOD64: c.slli s4, 45
-0x3A 0x1A # BAD32: invalid instruction encoding
-0x3A 0x1A # GOOD64: c.slli s4, 46
-0x3E 0x1A # BAD32: invalid instruction encoding
-0x3E 0x1A # GOOD64: c.slli s4, 47
-0x42 0x1A # BAD32: invalid instruction encoding
-0x42 0x1A # GOOD64: c.slli s4, 48
-0x46 0x1A # BAD32: invalid instruction encoding
-0x46 0x1A # GOOD64: c.slli s4, 49
-0x4A 0x1A # BAD32: invalid instruction encoding
-0x4A 0x1A # GOOD64: c.slli s4, 50
-0x4E 0x1A # BAD32: invalid instruction encoding
-0x4E 0x1A # GOOD64: c.slli s4, 51
-0x52 0x1A # BAD32: invalid instruction encoding
-0x52 0x1A # GOOD64: c.slli s4, 52
-0x56 0x1A # BAD32: invalid instruction encoding
-0x56 0x1A # GOOD64: c.slli s4, 53
-0x5A 0x1A # BAD32: invalid instruction encoding
-0x5A 0x1A # GOOD64: c.slli s4, 54
-0x5E 0x1A # BAD32: invalid instruction encoding
-0x5E 0x1A # GOOD64: c.slli s4, 55
-0x62 0x1A # BAD32: invalid instruction encoding
-0x62 0x1A # GOOD64: c.slli s4, 56
-0x66 0x1A # BAD32: invalid instruction encoding
-0x66 0x1A # GOOD64: c.slli s4, 57
-0x6A 0x1A # BAD32: invalid instruction encoding
-0x6A 0x1A # GOOD64: c.slli s4, 58
-0x6E 0x1A # BAD32: invalid instruction encoding
-0x6E 0x1A # GOOD64: c.slli s4, 59
-0x72 0x1A # BAD32: invalid instruction encoding
-0x72 0x1A # GOOD64: c.slli s4, 60
-0x76 0x1A # BAD32: invalid instruction encoding
-0x76 0x1A # GOOD64: c.slli s4, 61
-0x7A 0x1A # BAD32: invalid instruction encoding
-0x7A 0x1A # GOOD64: c.slli s4, 62
-0x7E 0x1A # BAD32: invalid instruction encoding
-0x7E 0x1A # GOOD64: c.slli s4, 63
-# GOOD: c.slli64 s5
+0x4E 0x00
+
+# GOOD: c.slli zero, 20
 # NOHINTS: invalid instruction encoding
-0x82 0x0A
-0x86 0x0A # GOOD: c.slli s5, 1
-0x8A 0x0A # GOOD: c.slli s5, 2
-0x8E 0x0A # GOOD: c.slli s5, 3
-0x92 0x0A # GOOD: c.slli s5, 4
-0x96 0x0A # GOOD: c.slli s5, 5
-0x9A 0x0A # GOOD: c.slli s5, 6
-0x9E 0x0A # GOOD: c.slli s5, 7
-0xA2 0x0A # GOOD: c.slli s5, 8
-0xA6 0x0A # GOOD: c.slli s5, 9
-0xAA 0x0A # GOOD: c.slli s5, 10
-0xAE 0x0A # GOOD: c.slli s5, 11
-0xB2 0x0A # GOOD: c.slli s5, 12
-0xB6 0x0A # GOOD: c.slli s5, 13
-0xBA 0x0A # GOOD: c.slli s5, 14
-0xBE 0x0A # GOOD: c.slli s5, 15
-0xC2 0x0A # GOOD: c.slli s5, 16
-0xC6 0x0A # GOOD: c.slli s5, 17
-0xCA 0x0A # GOOD: c.slli s5, 18
-0xCE 0x0A # GOOD: c.slli s5, 19
-0xD2 0x0A # GOOD: c.slli s5, 20
-0xD6 0x0A # GOOD: c.slli s5, 21
-0xDA 0x0A # GOOD: c.slli s5, 22
-0xDE 0x0A # GOOD: c.slli s5, 23
-0xE2 0x0A # GOOD: c.slli s5, 24
-0xE6 0x0A # GOOD: c.slli s5, 25
-0xEA 0x0A # GOOD: c.slli s5, 26
-0xEE 0x0A # GOOD: c.slli s5, 27
-0xF2 0x0A # GOOD: c.slli s5, 28
-0xF6 0x0A # GOOD: c.slli s5, 29
-0xFA 0x0A # GOOD: c.slli s5, 30
-0xFE 0x0A # GOOD: c.slli s5, 31
-0x82 0x1A # BAD32: invalid instruction encoding
-0x82 0x1A # GOOD64: c.slli s5, 32
-0x86 0x1A # BAD32: invalid instruction encoding
-0x86 0x1A # GOOD64: c.slli s5, 33
-0x8A 0x1A # BAD32: invalid instruction encoding
-0x8A 0x1A # GOOD64: c.slli s5, 34
-0x8E 0x1A # BAD32: invalid instruction encoding
-0x8E 0x1A # GOOD64: c.slli s5, 35
-0x92 0x1A # BAD32: invalid instruction encoding
-0x92 0x1A # GOOD64: c.slli s5, 36
-0x96 0x1A # BAD32: invalid instruction encoding
-0x96 0x1A # GOOD64: c.slli s5, 37
-0x9A 0x1A # BAD32: invalid instruction encoding
-0x9A 0x1A # GOOD64: c.slli s5, 38
-0x9E 0x1A # BAD32: invalid instruction encoding
-0x9E 0x1A # GOOD64: c.slli s5, 39
-0xA2 0x1A # BAD32: invalid instruction encoding
-0xA2 0x1A # GOOD64: c.slli s5, 40
-0xA6 0x1A # BAD32: invalid instruction encoding
-0xA6 0x1A # GOOD64: c.slli s5, 41
-0xAA 0x1A # BAD32: invalid instruction encoding
-0xAA 0x1A # GOOD64: c.slli s5, 42
-0xAE 0x1A # BAD32: invalid instruction encoding
-0xAE 0x1A # GOOD64: c.slli s5, 43
-0xB2 0x1A # BAD32: invalid instruction encoding
-0xB2 0x1A # GOOD64: c.slli s5, 44
-0xB6 0x1A # BAD32: invalid instruction encoding
-0xB6 0x1A # GOOD64: c.slli s5, 45
-0xBA 0x1A # BAD32: invalid instruction encoding
-0xBA 0x1A # GOOD64: c.slli s5, 46
-0xBE 0x1A # BAD32: invalid instruction encoding
-0xBE 0x1A # GOOD64: c.slli s5, 47
-0xC2 0x1A # BAD32: invalid instruction encoding
-0xC2 0x1A # GOOD64: c.slli s5, 48
-0xC6 0x1A # BAD32: invalid instruction encoding
-0xC6 0x1A # GOOD64: c.slli s5, 49
-0xCA 0x1A # BAD32: invalid instruction encoding
-0xCA 0x1A # GOOD64: c.slli s5, 50
-0xCE 0x1A # BAD32: invalid instruction encoding
-0xCE 0x1A # GOOD64: c.slli s5, 51
-0xD2 0x1A # BAD32: invalid instruction encoding
-0xD2 0x1A # GOOD64: c.slli s5, 52
-0xD6 0x1A # BAD32: invalid instruction encoding
-0xD6 0x1A # GOOD64: c.slli s5, 53
-0xDA 0x1A # BAD32: invalid instruction encoding
-0xDA 0x1A # GOOD64: c.slli s5, 54
-0xDE 0x1A # BAD32: invalid instruction encoding
-0xDE 0x1A # GOOD64: c.slli s5, 55
-0xE2 0x1A # BAD32: invalid instruction encoding
-0xE2 0x1A # GOOD64: c.slli s5, 56
-0xE6 0x1A # BAD32: invalid instruction encoding
-0xE6 0x1A # GOOD64: c.slli s5, 57
-0xEA 0x1A # BAD32: invalid instruction encoding
-0xEA 0x1A # GOOD64: c.slli s5, 58
-0xEE 0x1A # BAD32: invalid instruction encoding
-0xEE 0x1A # GOOD64: c.slli s5, 59
-0xF2 0x1A # BAD32: invalid instruction encoding
-0xF2 0x1A # GOOD64: c.slli s5, 60
-0xF6 0x1A # BAD32: invalid instruction encoding
-0xF6 0x1A # GOOD64: c.slli s5, 61
-0xFA 0x1A # BAD32: invalid instruction encoding
-0xFA 0x1A # GOOD64: c.slli s5, 62
-0xFE 0x1A # BAD32: invalid instruction encoding
-0xFE 0x1A # GOOD64: c.slli s5, 63
-# GOOD: c.slli64 s6
+0x52 0x00
+
+# GOOD: c.slli zero, 21
 # NOHINTS: invalid instruction encoding
-0x02 0x0B
-0x06 0x0B # GOOD: c.slli s6, 1
-0x0A 0x0B # GOOD: c.slli s6, 2
-0x0E 0x0B # GOOD: c.slli s6, 3
-0x12 0x0B # GOOD: c.slli s6, 4
-0x16 0x0B # GOOD: c.slli s6, 5
-0x1A 0x0B # GOOD: c.slli s6, 6
-0x1E 0x0B # GOOD: c.slli s6, 7
-0x22 0x0B # GOOD: c.slli s6, 8
-0x26 0x0B # GOOD: c.slli s6, 9
-0x2A 0x0B # GOOD: c.slli s6, 10
-0x2E 0x0B # GOOD: c.slli s6, 11
-0x32 0x0B # GOOD: c.slli s6, 12
-0x36 0x0B # GOOD: c.slli s6, 13
-0x3A 0x0B # GOOD: c.slli s6, 14
-0x3E 0x0B # GOOD: c.slli s6, 15
-0x42 0x0B # GOOD: c.slli s6, 16
-0x46 0x0B # GOOD: c.slli s6, 17
-0x4A 0x0B # GOOD: c.slli s6, 18
-0x4E 0x0B # GOOD: c.slli s6, 19
-0x52 0x0B # GOOD: c.slli s6, 20
-0x56 0x0B # GOOD: c.slli s6, 21
-0x5A 0x0B # GOOD: c.slli s6, 22
-0x5E 0x0B # GOOD: c.slli s6, 23
-0x62 0x0B # GOOD: c.slli s6, 24
-0x66 0x0B # GOOD: c.slli s6, 25
-0x6A 0x0B # GOOD: c.slli s6, 26
-0x6E 0x0B # GOOD: c.slli s6, 27
-0x72 0x0B # GOOD: c.slli s6, 28
-0x76 0x0B # GOOD: c.slli s6, 29
-0x7A 0x0B # GOOD: c.slli s6, 30
-0x7E 0x0B # GOOD: c.slli s6, 31
-0x02 0x1B # BAD32: invalid instruction encoding
-0x02 0x1B # GOOD64: c.slli s6, 32
-0x06 0x1B # BAD32: invalid instruction encoding
-0x06 0x1B # GOOD64: c.slli s6, 33
-0x0A 0x1B # BAD32: invalid instruction encoding
-0x0A 0x1B # GOOD64: c.slli s6, 34
-0x0E 0x1B # BAD32: invalid instruction encoding
-0x0E 0x1B # GOOD64: c.slli s6, 35
-0x12 0x1B # BAD32: invalid instruction encoding
-0x12 0x1B # GOOD64: c.slli s6, 36
-0x16 0x1B # BAD32: invalid instruction encoding
-0x16 0x1B # GOOD64: c.slli s6, 37
-0x1A 0x1B # BAD32: invalid instruction encoding
-0x1A 0x1B # GOOD64: c.slli s6, 38
-0x1E 0x1B # BAD32: invalid instruction encoding
-0x1E 0x1B # GOOD64: c.slli s6, 39
-0x22 0x1B # BAD32: invalid instruction encoding
-0x22 0x1B # GOOD64: c.slli s6, 40
-0x26 0x1B # BAD32: invalid instruction encoding
-0x26 0x1B # GOOD64: c.slli s6, 41
-0x2A 0x1B # BAD32: invalid instruction encoding
-0x2A 0x1B # GOOD64: c.slli s6, 42
-0x2E 0x1B # BAD32: invalid instruction encoding
-0x2E 0x1B # GOOD64: c.slli s6, 43
-0x32 0x1B # BAD32: invalid instruction encoding
-0x32 0x1B # GOOD64: c.slli s6, 44
-0x36 0x1B # BAD32: invalid instruction encoding
-0x36 0x1B # GOOD64: c.slli s6, 45
-0x3A 0x1B # BAD32: invalid instruction encoding
-0x3A 0x1B # GOOD64: c.slli s6, 46
-0x3E 0x1B # BAD32: invalid instruction encoding
-0x3E 0x1B # GOOD64: c.slli s6, 47
-0x42 0x1B # BAD32: invalid instruction encoding
-0x42 0x1B # GOOD64: c.slli s6, 48
-0x46 0x1B # BAD32: invalid instruction encoding
-0x46 0x1B # GOOD64: c.slli s6, 49
-0x4A 0x1B # BAD32: invalid instruction encoding
-0x4A 0x1B # GOOD64: c.slli s6, 50
-0x4E 0x1B # BAD32: invalid instruction encoding
-0x4E 0x1B # GOOD64: c.slli s6, 51
-0x52 0x1B # BAD32: invalid instruction encoding
-0x52 0x1B # GOOD64: c.slli s6, 52
-0x56 0x1B # BAD32: invalid instruction encoding
-0x56 0x1B # GOOD64: c.slli s6, 53
-0x5A 0x1B # BAD32: invalid instruction encoding
-0x5A 0x1B # GOOD64: c.slli s6, 54
-0x5E 0x1B # BAD32: invalid instruction encoding
-0x5E 0x1B # GOOD64: c.slli s6, 55
-0x62 0x1B # BAD32: invalid instruction encoding
-0x62 0x1B # GOOD64: c.slli s6, 56
-0x66 0x1B # BAD32: invalid instruction encoding
-0x66 0x1B # GOOD64: c.slli s6, 57
-0x6A 0x1B # BAD32: invalid instruction encoding
-0x6A 0x1B # GOOD64: c.slli s6, 58
-0x6E 0x1B # BAD32: invalid instruction encoding
-0x6E 0x1B # GOOD64: c.slli s6, 59
-0x72 0x1B # BAD32: invalid instruction encoding
-0x72 0x1B # GOOD64: c.slli s6, 60
-0x76 0x1B # BAD32: invalid instruction encoding
-0x76 0x1B # GOOD64: c.slli s6, 61
-0x7A 0x1B # BAD32: invalid instruction encoding
-0x7A 0x1B # GOOD64: c.slli s6, 62
-0x7E 0x1B # BAD32: invalid instruction encoding
-0x7E 0x1B # GOOD64: c.slli s6, 63
-# GOOD: c.slli64 s7
+0x56 0x00
+
+# GOOD: c.slli zero, 22
 # NOHINTS: invalid instruction encoding
-0x82 0x0B
-0x86 0x0B # GOOD: c.slli s7, 1
-0x8A 0x0B # GOOD: c.slli s7, 2
-0x8E 0x0B # GOOD: c.slli s7, 3
-0x92 0x0B # GOOD: c.slli s7, 4
-0x96 0x0B # GOOD: c.slli s7, 5
-0x9A 0x0B # GOOD: c.slli s7, 6
-0x9E 0x0B # GOOD: c.slli s7, 7
-0xA2 0x0B # GOOD: c.slli s7, 8
-0xA6 0x0B # GOOD: c.slli s7, 9
-0xAA 0x0B # GOOD: c.slli s7, 10
-0xAE 0x0B # GOOD: c.slli s7, 11
-0xB2 0x0B # GOOD: c.slli s7, 12
-0xB6 0x0B # GOOD: c.slli s7, 13
-0xBA 0x0B # GOOD: c.slli s7, 14
-0xBE 0x0B # GOOD: c.slli s7, 15
-0xC2 0x0B # GOOD: c.slli s7, 16
-0xC6 0x0B # GOOD: c.slli s7, 17
-0xCA 0x0B # GOOD: c.slli s7, 18
-0xCE 0x0B # GOOD: c.slli s7, 19
-0xD2 0x0B # GOOD: c.slli s7, 20
-0xD6 0x0B # GOOD: c.slli s7, 21
-0xDA 0x0B # GOOD: c.slli s7, 22
-0xDE 0x0B # GOOD: c.slli s7, 23
-0xE2 0x0B # GOOD: c.slli s7, 24
-0xE6 0x0B # GOOD: c.slli s7, 25
-0xEA 0x0B # GOOD: c.slli s7, 26
-0xEE 0x0B # GOOD: c.slli s7, 27
-0xF2 0x0B # GOOD: c.slli s7, 28
-0xF6 0x0B # GOOD: c.slli s7, 29
-0xFA 0x0B # GOOD: c.slli s7, 30
-0xFE 0x0B # GOOD: c.slli s7, 31
-0x82 0x1B # BAD32: invalid instruction encoding
-0x82 0x1B # GOOD64: c.slli s7, 32
-0x86 0x1B # BAD32: invalid instruction encoding
-0x86 0x1B # GOOD64: c.slli s7, 33
-0x8A 0x1B # BAD32: invalid instruction encoding
-0x8A 0x1B # GOOD64: c.slli s7, 34
-0x8E 0x1B # BAD32: invalid instruction encoding
-0x8E 0x1B # GOOD64: c.slli s7, 35
-0x92 0x1B # BAD32: invalid instruction encoding
-0x92 0x1B # GOOD64: c.slli s7, 36
-0x96 0x1B # BAD32: invalid instruction encoding
-0x96 0x1B # GOOD64: c.slli s7, 37
-0x9A 0x1B # BAD32: invalid instruction encoding
-0x9A 0x1B # GOOD64: c.slli s7, 38
-0x9E 0x1B # BAD32: invalid instruction encoding
-0x9E 0x1B # GOOD64: c.slli s7, 39
-0xA2 0x1B # BAD32: invalid instruction encoding
-0xA2 0x1B # GOOD64: c.slli s7, 40
-0xA6 0x1B # BAD32: invalid instruction encoding
-0xA6 0x1B # GOOD64: c.slli s7, 41
-0xAA 0x1B # BAD32: invalid instruction encoding
-0xAA 0x1B # GOOD64: c.slli s7, 42
-0xAE 0x1B # BAD32: invalid instruction encoding
-0xAE 0x1B # GOOD64: c.slli s7, 43
-0xB2 0x1B # BAD32: invalid instruction encoding
-0xB2 0x1B # GOOD64: c.slli s7, 44
-0xB6 0x1B # BAD32: invalid instruction encoding
-0xB6 0x1B # GOOD64: c.slli s7, 45
-0xBA 0x1B # BAD32: invalid instruction encoding
-0xBA 0x1B # GOOD64: c.slli s7, 46
-0xBE 0x1B # BAD32: invalid instruction encoding
-0xBE 0x1B # GOOD64: c.slli s7, 47
-0xC2 0x1B # BAD32: invalid instruction encoding
-0xC2 0x1B # GOOD64: c.slli s7, 48
-0xC6 0x1B # BAD32: invalid instruction encoding
-0xC6 0x1B # GOOD64: c.slli s7, 49
-0xCA 0x1B # BAD32: invalid instruction encoding
-0xCA 0x1B # GOOD64: c.slli s7, 50
-0xCE 0x1B # BAD32: invalid instruction encoding
-0xCE 0x1B # GOOD64: c.slli s7, 51
-0xD2 0x1B # BAD32: invalid instruction encoding
-0xD2 0x1B # GOOD64: c.slli s7, 52
-0xD6 0x1B # BAD32: invalid instruction encoding
-0xD6 0x1B # GOOD64: c.slli s7, 53
-0xDA 0x1B # BAD32: invalid instruction encoding
-0xDA 0x1B # GOOD64: c.slli s7, 54
-0xDE 0x1B # BAD32: invalid instruction encoding
-0xDE 0x1B # GOOD64: c.slli s7, 55
-0xE2 0x1B # BAD32: invalid instruction encoding
-0xE2 0x1B # GOOD64: c.slli s7, 56
-0xE6 0x1B # BAD32: invalid instruction encoding
-0xE6 0x1B # GOOD64: c.slli s7, 57
-0xEA 0x1B # BAD32: invalid instruction encoding
-0xEA 0x1B # GOOD64: c.slli s7, 58
-0xEE 0x1B # BAD32: invalid instruction encoding
-0xEE 0x1B # GOOD64: c.slli s7, 59
-0xF2 0x1B # BAD32: invalid instruction encoding
-0xF2 0x1B # GOOD64: c.slli s7, 60
-0xF6 0x1B # BAD32: invalid instruction encoding
-0xF6 0x1B # GOOD64: c.slli s7, 61
-0xFA 0x1B # BAD32: invalid instruction encoding
-0xFA 0x1B # GOOD64: c.slli s7, 62
-0xFE 0x1B # BAD32: invalid instruction encoding
-0xFE 0x1B # GOOD64: c.slli s7, 63
-# GOOD: c.slli64 s8
+0x5A 0x00
+
+# GOOD: c.slli zero, 23
 # NOHINTS: invalid instruction encoding
-0x02 0x0C
-0x06 0x0C # GOOD: c.slli s8, 1
-0x0A 0x0C # GOOD: c.slli s8, 2
-0x0E 0x0C # GOOD: c.slli s8, 3
-0x12 0x0C # GOOD: c.slli s8, 4
-0x16 0x0C # GOOD: c.slli s8, 5
-0x1A 0x0C # GOOD: c.slli s8, 6
-0x1E 0x0C # GOOD: c.slli s8, 7
-0x22 0x0C # GOOD: c.slli s8, 8
-0x26 0x0C # GOOD: c.slli s8, 9
-0x2A 0x0C # GOOD: c.slli s8, 10
-0x2E 0x0C # GOOD: c.slli s8, 11
-0x32 0x0C # GOOD: c.slli s8, 12
-0x36 0x0C # GOOD: c.slli s8, 13
-0x3A 0x0C # GOOD: c.slli s8, 14
-0x3E 0x0C # GOOD: c.slli s8, 15
-0x42 0x0C # GOOD: c.slli s8, 16
-0x46 0x0C # GOOD: c.slli s8, 17
-0x4A 0x0C # GOOD: c.slli s8, 18
-0x4E 0x0C # GOOD: c.slli s8, 19
-0x52 0x0C # GOOD: c.slli s8, 20
-0x56 0x0C # GOOD: c.slli s8, 21
-0x5A 0x0C # GOOD: c.slli s8, 22
-0x5E 0x0C # GOOD: c.slli s8, 23
-0x62 0x0C # GOOD: c.slli s8, 24
-0x66 0x0C # GOOD: c.slli s8, 25
-0x6A 0x0C # GOOD: c.slli s8, 26
-0x6E 0x0C # GOOD: c.slli s8, 27
-0x72 0x0C # GOOD: c.slli s8, 28
-0x76 0x0C # GOOD: c.slli s8, 29
-0x7A 0x0C # GOOD: c.slli s8, 30
-0x7E 0x0C # GOOD: c.slli s8, 31
-0x02 0x1C # BAD32: invalid instruction encoding
-0x02 0x1C # GOOD64: c.slli s8, 32
-0x06 0x1C # BAD32: invalid instruction encoding
-0x06 0x1C # GOOD64: c.slli s8, 33
-0x0A 0x1C # BAD32: invalid instruction encoding
-0x0A 0x1C # GOOD64: c.slli s8, 34
-0x0E 0x1C # BAD32: invalid instruction encoding
-0x0E 0x1C # GOOD64: c.slli s8, 35
-0x12 0x1C # BAD32: invalid instruction encoding
-0x12 0x1C # GOOD64: c.slli s8, 36
-0x16 0x1C # BAD32: invalid instruction encoding
-0x16 0x1C # GOOD64: c.slli s8, 37
-0x1A 0x1C # BAD32: invalid instruction encoding
-0x1A 0x1C # GOOD64: c.slli s8, 38
-0x1E 0x1C # BAD32: invalid instruction encoding
-0x1E 0x1C # GOOD64: c.slli s8, 39
-0x22 0x1C # BAD32: invalid instruction encoding
-0x22 0x1C # GOOD64: c.slli s8, 40
-0x26 0x1C # BAD32: invalid instruction encoding
-0x26 0x1C # GOOD64: c.slli s8, 41
-0x2A 0x1C # BAD32: invalid instruction encoding
-0x2A 0x1C # GOOD64: c.slli s8, 42
-0x2E 0x1C # BAD32: invalid instruction encoding
-0x2E 0x1C # GOOD64: c.slli s8, 43
-0x32 0x1C # BAD32: invalid instruction encoding
-0x32 0x1C # GOOD64: c.slli s8, 44
-0x36 0x1C # BAD32: invalid instruction encoding
-0x36 0x1C # GOOD64: c.slli s8, 45
-0x3A 0x1C # BAD32: invalid instruction encoding
-0x3A 0x1C # GOOD64: c.slli s8, 46
-0x3E 0x1C # BAD32: invalid instruction encoding
-0x3E 0x1C # GOOD64: c.slli s8, 47
-0x42 0x1C # BAD32: invalid instruction encoding
-0x42 0x1C # GOOD64: c.slli s8, 48
-0x46 0x1C # BAD32: invalid instruction encoding
-0x46 0x1C # GOOD64: c.slli s8, 49
-0x4A 0x1C # BAD32: invalid instruction encoding
-0x4A 0x1C # GOOD64: c.slli s8, 50
-0x4E 0x1C # BAD32: invalid instruction encoding
-0x4E 0x1C # GOOD64: c.slli s8, 51
-0x52 0x1C # BAD32: invalid instruction encoding
-0x52 0x1C # GOOD64: c.slli s8, 52
-0x56 0x1C # BAD32: invalid instruction encoding
-0x56 0x1C # GOOD64: c.slli s8, 53
-0x5A 0x1C # BAD32: invalid instruction encoding
-0x5A 0x1C # GOOD64: c.slli s8, 54
-0x5E 0x1C # BAD32: invalid instruction encoding
-0x5E 0x1C # GOOD64: c.slli s8, 55
-0x62 0x1C # BAD32: invalid instruction encoding
-0x62 0x1C # GOOD64: c.slli s8, 56
-0x66 0x1C # BAD32: invalid instruction encoding
-0x66 0x1C # GOOD64: c.slli s8, 57
-0x6A 0x1C # BAD32: invalid instruction encoding
-0x6A 0x1C # GOOD64: c.slli s8, 58
-0x6E 0x1C # BAD32: invalid instruction encoding
-0x6E 0x1C # GOOD64: c.slli s8, 59
-0x72 0x1C # BAD32: invalid instruction encoding
-0x72 0x1C # GOOD64: c.slli s8, 60
-0x76 0x1C # BAD32: invalid instruction encoding
-0x76 0x1C # GOOD64: c.slli s8, 61
-0x7A 0x1C # BAD32: invalid instruction encoding
-0x7A 0x1C # GOOD64: c.slli s8, 62
-0x7E 0x1C # BAD32: invalid instruction encoding
-0x7E 0x1C # GOOD64: c.slli s8, 63
-# GOOD: c.slli64 s9
+0x5E 0x00
+
+# GOOD: c.slli zero, 24
 # NOHINTS: invalid instruction encoding
-0x82 0x0C
-0x86 0x0C # GOOD: c.slli s9, 1
-0x8A 0x0C # GOOD: c.slli s9, 2
-0x8E 0x0C # GOOD: c.slli s9, 3
-0x92 0x0C # GOOD: c.slli s9, 4
-0x96 0x0C # GOOD: c.slli s9, 5
-0x9A 0x0C # GOOD: c.slli s9, 6
-0x9E 0x0C # GOOD: c.slli s9, 7
-0xA2 0x0C # GOOD: c.slli s9, 8
-0xA6 0x0C # GOOD: c.slli s9, 9
-0xAA 0x0C # GOOD: c.slli s9, 10
-0xAE 0x0C # GOOD: c.slli s9, 11
-0xB2 0x0C # GOOD: c.slli s9, 12
-0xB6 0x0C # GOOD: c.slli s9, 13
-0xBA 0x0C # GOOD: c.slli s9, 14
-0xBE 0x0C # GOOD: c.slli s9, 15
-0xC2 0x0C # GOOD: c.slli s9, 16
-0xC6 0x0C # GOOD: c.slli s9, 17
-0xCA 0x0C # GOOD: c.slli s9, 18
-0xCE 0x0C # GOOD: c.slli s9, 19
-0xD2 0x0C # GOOD: c.slli s9, 20
-0xD6 0x0C # GOOD: c.slli s9, 21
-0xDA 0x0C # GOOD: c.slli s9, 22
-0xDE 0x0C # GOOD: c.slli s9, 23
-0xE2 0x0C # GOOD: c.slli s9, 24
-0xE6 0x0C # GOOD: c.slli s9, 25
-0xEA 0x0C # GOOD: c.slli s9, 26
-0xEE 0x0C # GOOD: c.slli s9, 27
-0xF2 0x0C # GOOD: c.slli s9, 28
-0xF6 0x0C # GOOD: c.slli s9, 29
-0xFA 0x0C # GOOD: c.slli s9, 30
-0xFE 0x0C # GOOD: c.slli s9, 31
-0x82 0x1C # BAD32: invalid instruction encoding
-0x82 0x1C # GOOD64: c.slli s9, 32
-0x86 0x1C # BAD32: invalid instruction encoding
-0x86 0x1C # GOOD64: c.slli s9, 33
-0x8A 0x1C # BAD32: invalid instruction encoding
-0x8A 0x1C # GOOD64: c.slli s9, 34
-0x8E 0x1C # BAD32: invalid instruction encoding
-0x8E 0x1C # GOOD64: c.slli s9, 35
-0x92 0x1C # BAD32: invalid instruction encoding
-0x92 0x1C # GOOD64: c.slli s9, 36
-0x96 0x1C # BAD32: invalid instruction encoding
-0x96 0x1C # GOOD64: c.slli s9, 37
-0x9A 0x1C # BAD32: invalid instruction encoding
-0x9A 0x1C # GOOD64: c.slli s9, 38
-0x9E 0x1C # BAD32: invalid instruction encoding
-0x9E 0x1C # GOOD64: c.slli s9, 39
-0xA2 0x1C # BAD32: invalid instruction encoding
-0xA2 0x1C # GOOD64: c.slli s9, 40
-0xA6 0x1C # BAD32: invalid instruction encoding
-0xA6 0x1C # GOOD64: c.slli s9, 41
-0xAA 0x1C # BAD32: invalid instruction encoding
-0xAA 0x1C # GOOD64: c.slli s9, 42
-0xAE 0x1C # BAD32: invalid instruction encoding
-0xAE 0x1C # GOOD64: c.slli s9, 43
-0xB2 0x1C # BAD32: invalid instruction encoding
-0xB2 0x1C # GOOD64: c.slli s9, 44
-0xB6 0x1C # BAD32: invalid instruction encoding
-0xB6 0x1C # GOOD64: c.slli s9, 45
-0xBA 0x1C # BAD32: invalid instruction encoding
-0xBA 0x1C # GOOD64: c.slli s9, 46
-0xBE 0x1C # BAD32: invalid instruction encoding
-0xBE 0x1C # GOOD64: c.slli s9, 47
-0xC2 0x1C # BAD32: invalid instruction encoding
-0xC2 0x1C # GOOD64: c.slli s9, 48
-0xC6 0x1C # BAD32: invalid instruction encoding
-0xC6 0x1C # GOOD64: c.slli s9, 49
-0xCA 0x1C # BAD32: invalid instruction encoding
-0xCA 0x1C # GOOD64: c.slli s9, 50
-0xCE 0x1C # BAD32: invalid instruction encoding
-0xCE 0x1C # GOOD64: c.slli s9, 51
-0xD2 0x1C # BAD32: invalid instruction encoding
-0xD2 0x1C # GOOD64: c.slli s9, 52
-0xD6 0x1C # BAD32: invalid instruction encoding
-0xD6 0x1C # GOOD64: c.slli s9, 53
-0xDA 0x1C # BAD32: invalid instruction encoding
-0xDA 0x1C # GOOD64: c.slli s9, 54
-0xDE 0x1C # BAD32: invalid instruction encoding
-0xDE 0x1C # GOOD64: c.slli s9, 55
-0xE2 0x1C # BAD32: invalid instruction encoding
-0xE2 0x1C # GOOD64: c.slli s9, 56
-0xE6 0x1C # BAD32: invalid instruction encoding
-0xE6 0x1C # GOOD64: c.slli s9, 57
-0xEA 0x1C # BAD32: invalid instruction encoding
-0xEA 0x1C # GOOD64: c.slli s9, 58
-0xEE 0x1C # BAD32: invalid instruction encoding
-0xEE 0x1C # GOOD64: c.slli s9, 59
-0xF2 0x1C # BAD32: invalid instruction encoding
-0xF2 0x1C # GOOD64: c.slli s9, 60
-0xF6 0x1C # BAD32: invalid instruction encoding
-0xF6 0x1C # GOOD64: c.slli s9, 61
-0xFA 0x1C # BAD32: invalid instruction encoding
-0xFA 0x1C # GOOD64: c.slli s9, 62
-0xFE 0x1C # BAD32: invalid instruction encoding
-0xFE 0x1C # GOOD64: c.slli s9, 63
-# GOOD: c.slli64 s10
+0x62 0x00
+
+# GOOD: c.slli zero, 25
 # NOHINTS: invalid instruction encoding
-0x02 0x0D
-0x06 0x0D # GOOD: c.slli s10, 1
-0x0A 0x0D # GOOD: c.slli s10, 2
-0x0E 0x0D # GOOD: c.slli s10, 3
-0x12 0x0D # GOOD: c.slli s10, 4
-0x16 0x0D # GOOD: c.slli s10, 5
-0x1A 0x0D # GOOD: c.slli s10, 6
-0x1E 0x0D # GOOD: c.slli s10, 7
-0x22 0x0D # GOOD: c.slli s10, 8
-0x26 0x0D # GOOD: c.slli s10, 9
-0x2A 0x0D # GOOD: c.slli s10, 10
-0x2E 0x0D # GOOD: c.slli s10, 11
-0x32 0x0D # GOOD: c.slli s10, 12
-0x36 0x0D # GOOD: c.slli s10, 13
-0x3A 0x0D # GOOD: c.slli s10, 14
-0x3E 0x0D # GOOD: c.slli s10, 15
-0x42 0x0D # GOOD: c.slli s10, 16
-0x46 0x0D # GOOD: c.slli s10, 17
-0x4A 0x0D # GOOD: c.slli s10, 18
-0x4E 0x0D # GOOD: c.slli s10, 19
-0x52 0x0D # GOOD: c.slli s10, 20
-0x56 0x0D # GOOD: c.slli s10, 21
-0x5A 0x0D # GOOD: c.slli s10, 22
-0x5E 0x0D # GOOD: c.slli s10, 23
-0x62 0x0D # GOOD: c.slli s10, 24
-0x66 0x0D # GOOD: c.slli s10, 25
-0x6A 0x0D # GOOD: c.slli s10, 26
-0x6E 0x0D # GOOD: c.slli s10, 27
-0x72 0x0D # GOOD: c.slli s10, 28
-0x76 0x0D # GOOD: c.slli s10, 29
-0x7A 0x0D # GOOD: c.slli s10, 30
-0x7E 0x0D # GOOD: c.slli s10, 31
-0x02 0x1D # BAD32: invalid instruction encoding
-0x02 0x1D # GOOD64: c.slli s10, 32
-0x06 0x1D # BAD32: invalid instruction encoding
-0x06 0x1D # GOOD64: c.slli s10, 33
-0x0A 0x1D # BAD32: invalid instruction encoding
-0x0A 0x1D # GOOD64: c.slli s10, 34
-0x0E 0x1D # BAD32: invalid instruction encoding
-0x0E 0x1D # GOOD64: c.slli s10, 35
-0x12 0x1D # BAD32: invalid instruction encoding
-0x12 0x1D # GOOD64: c.slli s10, 36
-0x16 0x1D # BAD32: invalid instruction encoding
-0x16 0x1D # GOOD64: c.slli s10, 37
-0x1A 0x1D # BAD32: invalid instruction encoding
-0x1A 0x1D # GOOD64: c.slli s10, 38
-0x1E 0x1D # BAD32: invalid instruction encoding
-0x1E 0x1D # GOOD64: c.slli s10, 39
-0x22 0x1D # BAD32: invalid instruction encoding
-0x22 0x1D # GOOD64: c.slli s10, 40
-0x26 0x1D # BAD32: invalid instruction encoding
-0x26 0x1D # GOOD64: c.slli s10, 41
-0x2A 0x1D # BAD32: invalid instruction encoding
-0x2A 0x1D # GOOD64: c.slli s10, 42
-0x2E 0x1D # BAD32: invalid instruction encoding
-0x2E 0x1D # GOOD64: c.slli s10, 43
-0x32 0x1D # BAD32: invalid instruction encoding
-0x32 0x1D # GOOD64: c.slli s10, 44
-0x36 0x1D # BAD32: invalid instruction encoding
-0x36 0x1D # GOOD64: c.slli s10, 45
-0x3A 0x1D # BAD32: invalid instruction encoding
-0x3A 0x1D # GOOD64: c.slli s10, 46
-0x3E 0x1D # BAD32: invalid instruction encoding
-0x3E 0x1D # GOOD64: c.slli s10, 47
-0x42 0x1D # BAD32: invalid instruction encoding
-0x42 0x1D # GOOD64: c.slli s10, 48
-0x46 0x1D # BAD32: invalid instruction encoding
-0x46 0x1D # GOOD64: c.slli s10, 49
-0x4A 0x1D # BAD32: invalid instruction encoding
-0x4A 0x1D # GOOD64: c.slli s10, 50
-0x4E 0x1D # BAD32: invalid instruction encoding
-0x4E 0x1D # GOOD64: c.slli s10, 51
-0x52 0x1D # BAD32: invalid instruction encoding
-0x52 0x1D # GOOD64: c.slli s10, 52
-0x56 0x1D # BAD32: invalid instruction encoding
-0x56 0x1D # GOOD64: c.slli s10, 53
-0x5A 0x1D # BAD32: invalid instruction encoding
-0x5A 0x1D # GOOD64: c.slli s10, 54
-0x5E 0x1D # BAD32: invalid instruction encoding
-0x5E 0x1D # GOOD64: c.slli s10, 55
-0x62 0x1D # BAD32: invalid instruction encoding
-0x62 0x1D # GOOD64: c.slli s10, 56
-0x66 0x1D # BAD32: invalid instruction encoding
-0x66 0x1D # GOOD64: c.slli s10, 57
-0x6A 0x1D # BAD32: invalid instruction encoding
-0x6A 0x1D # GOOD64: c.slli s10, 58
-0x6E 0x1D # BAD32: invalid instruction encoding
-0x6E 0x1D # GOOD64: c.slli s10, 59
-0x72 0x1D # BAD32: invalid instruction encoding
-0x72 0x1D # GOOD64: c.slli s10, 60
-0x76 0x1D # BAD32: invalid instruction encoding
-0x76 0x1D # GOOD64: c.slli s10, 61
-0x7A 0x1D # BAD32: invalid instruction encoding
-0x7A 0x1D # GOOD64: c.slli s10, 62
-0x7E 0x1D # BAD32: invalid instruction encoding
-0x7E 0x1D # GOOD64: c.slli s10, 63
-# GOOD: c.slli64 s11
+0x66 0x00
+
+# GOOD: c.slli zero, 26
 # NOHINTS: invalid instruction encoding
-0x82 0x0D
-0x86 0x0D # GOOD: c.slli s11, 1
-0x8A 0x0D # GOOD: c.slli s11, 2
-0x8E 0x0D # GOOD: c.slli s11, 3
-0x92 0x0D # GOOD: c.slli s11, 4
-0x96 0x0D # GOOD: c.slli s11, 5
-0x9A 0x0D # GOOD: c.slli s11, 6
-0x9E 0x0D # GOOD: c.slli s11, 7
-0xA2 0x0D # GOOD: c.slli s11, 8
-0xA6 0x0D # GOOD: c.slli s11, 9
-0xAA 0x0D # GOOD: c.slli s11, 10
-0xAE 0x0D # GOOD: c.slli s11, 11
-0xB2 0x0D # GOOD: c.slli s11, 12
-0xB6 0x0D # GOOD: c.slli s11, 13
-0xBA 0x0D # GOOD: c.slli s11, 14
-0xBE 0x0D # GOOD: c.slli s11, 15
-0xC2 0x0D # GOOD: c.slli s11, 16
-0xC6 0x0D # GOOD: c.slli s11, 17
-0xCA 0x0D # GOOD: c.slli s11, 18
-0xCE 0x0D # GOOD: c.slli s11, 19
-0xD2 0x0D # GOOD: c.slli s11, 20
-0xD6 0x0D # GOOD: c.slli s11, 21
-0xDA 0x0D # GOOD: c.slli s11, 22
-0xDE 0x0D # GOOD: c.slli s11, 23
-0xE2 0x0D # GOOD: c.slli s11, 24
-0xE6 0x0D # GOOD: c.slli s11, 25
-0xEA 0x0D # GOOD: c.slli s11, 26
-0xEE 0x0D # GOOD: c.slli s11, 27
-0xF2 0x0D # GOOD: c.slli s11, 28
-0xF6 0x0D # GOOD: c.slli s11, 29
-0xFA 0x0D # GOOD: c.slli s11, 30
-0xFE 0x0D # GOOD: c.slli s11, 31
-0x82 0x1D # BAD32: invalid instruction encoding
-0x82 0x1D # GOOD64: c.slli s11, 32
-0x86 0x1D # BAD32: invalid instruction encoding
-0x86 0x1D # GOOD64: c.slli s11, 33
-0x8A 0x1D # BAD32: invalid instruction encoding
-0x8A 0x1D # GOOD64: c.slli s11, 34
-0x8E 0x1D # BAD32: invalid instruction encoding
-0x8E 0x1D # GOOD64: c.slli s11, 35
-0x92 0x1D # BAD32: invalid instruction encoding
-0x92 0x1D # GOOD64: c.slli s11, 36
-0x96 0x1D # BAD32: invalid instruction encoding
-0x96 0x1D # GOOD64: c.slli s11, 37
-0x9A 0x1D # BAD32: invalid instruction encoding
-0x9A 0x1D # GOOD64: c.slli s11, 38
-0x9E 0x1D # BAD32: invalid instruction encoding
-0x9E 0x1D # GOOD64: c.slli s11, 39
-0xA2 0x1D # BAD32: invalid instruction encoding
-0xA2 0x1D # GOOD64: c.slli s11, 40
-0xA6 0x1D # BAD32: invalid instruction encoding
-0xA6 0x1D # GOOD64: c.slli s11, 41
-0xAA 0x1D # BAD32: invalid instruction encoding
-0xAA 0x1D # GOOD64: c.slli s11, 42
-0xAE 0x1D # BAD32: invalid instruction encoding
-0xAE 0x1D # GOOD64: c.slli s11, 43
-0xB2 0x1D # BAD32: invalid instruction encoding
-0xB2 0x1D # GOOD64: c.slli s11, 44
-0xB6 0x1D # BAD32: invalid instruction encoding
-0xB6 0x1D # GOOD64: c.slli s11, 45
-0xBA 0x1D # BAD32: invalid instruction encoding
-0xBA 0x1D # GOOD64: c.slli s11, 46
-0xBE 0x1D # BAD32: invalid instruction encoding
-0xBE 0x1D # GOOD64: c.slli s11, 47
-0xC2 0x1D # BAD32: invalid instruction encoding
-0xC2 0x1D # GOOD64: c.slli s11, 48
-0xC6 0x1D # BAD32: invalid instruction encoding
-0xC6 0x1D # GOOD64: c.slli s11, 49
-0xCA 0x1D # BAD32: invalid instruction encoding
-0xCA 0x1D # GOOD64: c.slli s11, 50
-0xCE 0x1D # BAD32: invalid instruction encoding
-0xCE 0x1D # GOOD64: c.slli s11, 51
-0xD2 0x1D # BAD32: invalid instruction encoding
-0xD2 0x1D # GOOD64: c.slli s11, 52
-0xD6 0x1D # BAD32: invalid instruction encoding
-0xD6 0x1D # GOOD64: c.slli s11, 53
-0xDA 0x1D # BAD32: invalid instruction encoding
-0xDA 0x1D # GOOD64: c.slli s11, 54
-0xDE 0x1D # BAD32: invalid instruction encoding
-0xDE 0x1D # GOOD64: c.slli s11, 55
-0xE2 0x1D # BAD32: invalid instruction encoding
-0xE2 0x1D # GOOD64: c.slli s11, 56
-0xE6 0x1D # BAD32: invalid instruction encoding
-0xE6 0x1D # GOOD64: c.slli s11, 57
-0xEA 0x1D # BAD32: invalid instruction encoding
-0xEA 0x1D # GOOD64: c.slli s11, 58
-0xEE 0x1D # BAD32: invalid instruction encoding
-0xEE 0x1D # GOOD64: c.slli s11, 59
-0xF2 0x1D # BAD32: invalid instruction encoding
-0xF2 0x1D # GOOD64: c.slli s11, 60
-0xF6 0x1D # BAD32: invalid instruction encoding
-0xF6 0x1D # GOOD64: c.slli s11, 61
-0xFA 0x1D # BAD32: invalid instruction encoding
-0xFA 0x1D # GOOD64: c.slli s11, 62
-0xFE 0x1D # BAD32: invalid instruction encoding
-0xFE 0x1D # GOOD64: c.slli s11, 63
-# GOOD: c.slli64 t3
+0x6A 0x00
+
+# GOOD: c.slli zero, 27
 # NOHINTS: invalid instruction encoding
-0x02 0x0E
-0x06 0x0E # GOOD: c.slli t3, 1
-0x0A 0x0E # GOOD: c.slli t3, 2
-0x0E 0x0E # GOOD: c.slli t3, 3
-0x12 0x0E # GOOD: c.slli t3, 4
-0x16 0x0E # GOOD: c.slli t3, 5
-0x1A 0x0E # GOOD: c.slli t3, 6
-0x1E 0x0E # GOOD: c.slli t3, 7
-0x22 0x0E # GOOD: c.slli t3, 8
-0x26 0x0E # GOOD: c.slli t3, 9
-0x2A 0x0E # GOOD: c.slli t3, 10
-0x2E 0x0E # GOOD: c.slli t3, 11
-0x32 0x0E # GOOD: c.slli t3, 12
-0x36 0x0E # GOOD: c.slli t3, 13
-0x3A 0x0E # GOOD: c.slli t3, 14
-0x3E 0x0E # GOOD: c.slli t3, 15
-0x42 0x0E # GOOD: c.slli t3, 16
-0x46 0x0E # GOOD: c.slli t3, 17
-0x4A 0x0E # GOOD: c.slli t3, 18
-0x4E 0x0E # GOOD: c.slli t3, 19
-0x52 0x0E # GOOD: c.slli t3, 20
-0x56 0x0E # GOOD: c.slli t3, 21
-0x5A 0x0E # GOOD: c.slli t3, 22
-0x5E 0x0E # GOOD: c.slli t3, 23
-0x62 0x0E # GOOD: c.slli t3, 24
-0x66 0x0E # GOOD: c.slli t3, 25
-0x6A 0x0E # GOOD: c.slli t3, 26
-0x6E 0x0E # GOOD: c.slli t3, 27
-0x72 0x0E # GOOD: c.slli t3, 28
-0x76 0x0E # GOOD: c.slli t3, 29
-0x7A 0x0E # GOOD: c.slli t3, 30
-0x7E 0x0E # GOOD: c.slli t3, 31
-0x02 0x1E # BAD32: invalid instruction encoding
-0x02 0x1E # GOOD64: c.slli t3, 32
-0x06 0x1E # BAD32: invalid instruction encoding
-0x06 0x1E # GOOD64: c.slli t3, 33
-0x0A 0x1E # BAD32: invalid instruction encoding
-0x0A 0x1E # GOOD64: c.slli t3, 34
-0x0E 0x1E # BAD32: invalid instruction encoding
-0x0E 0x1E # GOOD64: c.slli t3, 35
-0x12 0x1E # BAD32: invalid instruction encoding
-0x12 0x1E # GOOD64: c.slli t3, 36
-0x16 0x1E # BAD32: invalid instruction encoding
-0x16 0x1E # GOOD64: c.slli t3, 37
-0x1A 0x1E # BAD32: invalid instruction encoding
-0x1A 0x1E # GOOD64: c.slli t3, 38
-0x1E 0x1E # BAD32: invalid instruction encoding
-0x1E 0x1E # GOOD64: c.slli t3, 39
-0x22 0x1E # BAD32: invalid instruction encoding
-0x22 0x1E # GOOD64: c.slli t3, 40
-0x26 0x1E # BAD32: invalid instruction encoding
-0x26 0x1E # GOOD64: c.slli t3, 41
-0x2A 0x1E # BAD32: invalid instruction encoding
-0x2A 0x1E # GOOD64: c.slli t3, 42
-0x2E 0x1E # BAD32: invalid instruction encoding
-0x2E 0x1E # GOOD64: c.slli t3, 43
-0x32 0x1E # BAD32: invalid instruction encoding
-0x32 0x1E # GOOD64: c.slli t3, 44
-0x36 0x1E # BAD32: invalid instruction encoding
-0x36 0x1E # GOOD64: c.slli t3, 45
-0x3A 0x1E # BAD32: invalid instruction encoding
-0x3A 0x1E # GOOD64: c.slli t3, 46
-0x3E 0x1E # BAD32: invalid instruction encoding
-0x3E 0x1E # GOOD64: c.slli t3, 47
-0x42 0x1E # BAD32: invalid instruction encoding
-0x42 0x1E # GOOD64: c.slli t3, 48
-0x46 0x1E # BAD32: invalid instruction encoding
-0x46 0x1E # GOOD64: c.slli t3, 49
-0x4A 0x1E # BAD32: invalid instruction encoding
-0x4A 0x1E # GOOD64: c.slli t3, 50
-0x4E 0x1E # BAD32: invalid instruction encoding
-0x4E 0x1E # GOOD64: c.slli t3, 51
-0x52 0x1E # BAD32: invalid instruction encoding
-0x52 0x1E # GOOD64: c.slli t3, 52
-0x56 0x1E # BAD32: invalid instruction encoding
-0x56 0x1E # GOOD64: c.slli t3, 53
-0x5A 0x1E # BAD32: invalid instruction encoding
-0x5A 0x1E # GOOD64: c.slli t3, 54
-0x5E 0x1E # BAD32: invalid instruction encoding
-0x5E 0x1E # GOOD64: c.slli t3, 55
-0x62 0x1E # BAD32: invalid instruction encoding
-0x62 0x1E # GOOD64: c.slli t3, 56
-0x66 0x1E # BAD32: invalid instruction encoding
-0x66 0x1E # GOOD64: c.slli t3, 57
-0x6A 0x1E # BAD32: invalid instruction encoding
-0x6A 0x1E # GOOD64: c.slli t3, 58
-0x6E 0x1E # BAD32: invalid instruction encoding
-0x6E 0x1E # GOOD64: c.slli t3, 59
-0x72 0x1E # BAD32: invalid instruction encoding
-0x72 0x1E # GOOD64: c.slli t3, 60
-0x76 0x1E # BAD32: invalid instruction encoding
-0x76 0x1E # GOOD64: c.slli t3, 61
-0x7A 0x1E # BAD32: invalid instruction encoding
-0x7A 0x1E # GOOD64: c.slli t3, 62
-0x7E 0x1E # BAD32: invalid instruction encoding
-0x7E 0x1E # GOOD64: c.slli t3, 63
-# GOOD: c.slli64 t4
+0x6E 0x00
+
+# GOOD: c.slli zero, 28
 # NOHINTS: invalid instruction encoding
-0x82 0x0E
-0x86 0x0E # GOOD: c.slli t4, 1
-0x8A 0x0E # GOOD: c.slli t4, 2
-0x8E 0x0E # GOOD: c.slli t4, 3
-0x92 0x0E # GOOD: c.slli t4, 4
-0x96 0x0E # GOOD: c.slli t4, 5
-0x9A 0x0E # GOOD: c.slli t4, 6
-0x9E 0x0E # GOOD: c.slli t4, 7
-0xA2 0x0E # GOOD: c.slli t4, 8
-0xA6 0x0E # GOOD: c.slli t4, 9
-0xAA 0x0E # GOOD: c.slli t4, 10
-0xAE 0x0E # GOOD: c.slli t4, 11
-0xB2 0x0E # GOOD: c.slli t4, 12
-0xB6 0x0E # GOOD: c.slli t4, 13
-0xBA 0x0E # GOOD: c.slli t4, 14
-0xBE 0x0E # GOOD: c.slli t4, 15
-0xC2 0x0E # GOOD: c.slli t4, 16
-0xC6 0x0E # GOOD: c.slli t4, 17
-0xCA 0x0E # GOOD: c.slli t4, 18
-0xCE 0x0E # GOOD: c.slli t4, 19
-0xD2 0x0E # GOOD: c.slli t4, 20
-0xD6 0x0E # GOOD: c.slli t4, 21
-0xDA 0x0E # GOOD: c.slli t4, 22
-0xDE 0x0E # GOOD: c.slli t4, 23
-0xE2 0x0E # GOOD: c.slli t4, 24
-0xE6 0x0E # GOOD: c.slli t4, 25
-0xEA 0x0E # GOOD: c.slli t4, 26
-0xEE 0x0E # GOOD: c.slli t4, 27
-0xF2 0x0E # GOOD: c.slli t4, 28
-0xF6 0x0E # GOOD: c.slli t4, 29
-0xFA 0x0E # GOOD: c.slli t4, 30
-0xFE 0x0E # GOOD: c.slli t4, 31
-0x82 0x1E # BAD32: invalid instruction encoding
-0x82 0x1E # GOOD64: c.slli t4, 32
-0x86 0x1E # BAD32: invalid instruction encoding
-0x86 0x1E # GOOD64: c.slli t4, 33
-0x8A 0x1E # BAD32: invalid instruction encoding
-0x8A 0x1E # GOOD64: c.slli t4, 34
-0x8E 0x1E # BAD32: invalid instruction encoding
-0x8E 0x1E # GOOD64: c.slli t4, 35
-0x92 0x1E # BAD32: invalid instruction encoding
-0x92 0x1E # GOOD64: c.slli t4, 36
-0x96 0x1E # BAD32: invalid instruction encoding
-0x96 0x1E # GOOD64: c.slli t4, 37
-0x9A 0x1E # BAD32: invalid instruction encoding
-0x9A 0x1E # GOOD64: c.slli t4, 38
-0x9E 0x1E # BAD32: invalid instruction encoding
-0x9E 0x1E # GOOD64: c.slli t4, 39
-0xA2 0x1E # BAD32: invalid instruction encoding
-0xA2 0x1E # GOOD64: c.slli t4, 40
-0xA6 0x1E # BAD32: invalid instruction encoding
-0xA6 0x1E # GOOD64: c.slli t4, 41
-0xAA 0x1E # BAD32: invalid instruction encoding
-0xAA 0x1E # GOOD64: c.slli t4, 42
-0xAE 0x1E # BAD32: invalid instruction encoding
-0xAE 0x1E # GOOD64: c.slli t4, 43
-0xB2 0x1E # BAD32: invalid instruction encoding
-0xB2 0x1E # GOOD64: c.slli t4, 44
-0xB6 0x1E # BAD32: invalid instruction encoding
-0xB6 0x1E # GOOD64: c.slli t4, 45
-0xBA 0x1E # BAD32: invalid instruction encoding
-0xBA 0x1E # GOOD64: c.slli t4, 46
-0xBE 0x1E # BAD32: invalid instruction encoding
-0xBE 0x1E # GOOD64: c.slli t4, 47
-0xC2 0x1E # BAD32: invalid instruction encoding
-0xC2 0x1E # GOOD64: c.slli t4, 48
-0xC6 0x1E # BAD32: invalid instruction encoding
-0xC6 0x1E # GOOD64: c.slli t4, 49
-0xCA 0x1E # BAD32: invalid instruction encoding
-0xCA 0x1E # GOOD64: c.slli t4, 50
-0xCE 0x1E # BAD32: invalid instruction encoding
-0xCE 0x1E # GOOD64: c.slli t4, 51
-0xD2 0x1E # BAD32: invalid instruction encoding
-0xD2 0x1E # GOOD64: c.slli t4, 52
-0xD6 0x1E # BAD32: invalid instruction encoding
-0xD6 0x1E # GOOD64: c.slli t4, 53
-0xDA 0x1E # BAD32: invalid instruction encoding
-0xDA 0x1E # GOOD64: c.slli t4, 54
-0xDE 0x1E # BAD32: invalid instruction encoding
-0xDE 0x1E # GOOD64: c.slli t4, 55
-0xE2 0x1E # BAD32: invalid instruction encoding
-0xE2 0x1E # GOOD64: c.slli t4, 56
-0xE6 0x1E # BAD32: invalid instruction encoding
-0xE6 0x1E # GOOD64: c.slli t4, 57
-0xEA 0x1E # BAD32: invalid instruction encoding
-0xEA 0x1E # GOOD64: c.slli t4, 58
-0xEE 0x1E # BAD32: invalid instruction encoding
-0xEE 0x1E # GOOD64: c.slli t4, 59
-0xF2 0x1E # BAD32: invalid instruction encoding
-0xF2 0x1E # GOOD64: c.slli t4, 60
-0xF6 0x1E # BAD32: invalid instruction encoding
-0xF6 0x1E # GOOD64: c.slli t4, 61
-0xFA 0x1E # BAD32: invalid instruction encoding
-0xFA 0x1E # GOOD64: c.slli t4, 62
-0xFE 0x1E # BAD32: invalid instruction encoding
-0xFE 0x1E # GOOD64: c.slli t4, 63
-# GOOD: c.slli64 t5
+0x72 0x00
+
+# GOOD: c.slli zero, 29
 # NOHINTS: invalid instruction encoding
-0x02 0x0F
-0x06 0x0F # GOOD: c.slli t5, 1
-0x0A 0x0F # GOOD: c.slli t5, 2
-0x0E 0x0F # GOOD: c.slli t5, 3
-0x12 0x0F # GOOD: c.slli t5, 4
-0x16 0x0F # GOOD: c.slli t5, 5
-0x1A 0x0F # GOOD: c.slli t5, 6
-0x1E 0x0F # GOOD: c.slli t5, 7
-0x22 0x0F # GOOD: c.slli t5, 8
-0x26 0x0F # GOOD: c.slli t5, 9
-0x2A 0x0F # GOOD: c.slli t5, 10
-0x2E 0x0F # GOOD: c.slli t5, 11
-0x32 0x0F # GOOD: c.slli t5, 12
-0x36 0x0F # GOOD: c.slli t5, 13
-0x3A 0x0F # GOOD: c.slli t5, 14
-0x3E 0x0F # GOOD: c.slli t5, 15
-0x42 0x0F # GOOD: c.slli t5, 16
-0x46 0x0F # GOOD: c.slli t5, 17
-0x4A 0x0F # GOOD: c.slli t5, 18
-0x4E 0x0F # GOOD: c.slli t5, 19
-0x52 0x0F # GOOD: c.slli t5, 20
-0x56 0x0F # GOOD: c.slli t5, 21
-0x5A 0x0F # GOOD: c.slli t5, 22
-0x5E 0x0F # GOOD: c.slli t5, 23
-0x62 0x0F # GOOD: c.slli t5, 24
-0x66 0x0F # GOOD: c.slli t5, 25
-0x6A 0x0F # GOOD: c.slli t5, 26
-0x6E 0x0F # GOOD: c.slli t5, 27
-0x72 0x0F # GOOD: c.slli t5, 28
-0x76 0x0F # GOOD: c.slli t5, 29
-0x7A 0x0F # GOOD: c.slli t5, 30 -0x7E 0x0F # GOOD: c.slli t5, 31 -0x02 0x1F # BAD32: invalid instruction encoding -0x02 0x1F # GOOD64: c.slli t5, 32 -0x06 0x1F # BAD32: invalid instruction encoding -0x06 0x1F # GOOD64: c.slli t5, 33 -0x0A 0x1F # BAD32: invalid instruction encoding -0x0A 0x1F # GOOD64: c.slli t5, 34 -0x0E 0x1F # BAD32: invalid instruction encoding -0x0E 0x1F # GOOD64: c.slli t5, 35 -0x12 0x1F # BAD32: invalid instruction encoding -0x12 0x1F # GOOD64: c.slli t5, 36 -0x16 0x1F # BAD32: invalid instruction encoding -0x16 0x1F # GOOD64: c.slli t5, 37 -0x1A 0x1F # BAD32: invalid instruction encoding -0x1A 0x1F # GOOD64: c.slli t5, 38 -0x1E 0x1F # BAD32: invalid instruction encoding -0x1E 0x1F # GOOD64: c.slli t5, 39 -0x22 0x1F # BAD32: invalid instruction encoding -0x22 0x1F # GOOD64: c.slli t5, 40 -0x26 0x1F # BAD32: invalid instruction encoding -0x26 0x1F # GOOD64: c.slli t5, 41 -0x2A 0x1F # BAD32: invalid instruction encoding -0x2A 0x1F # GOOD64: c.slli t5, 42 -0x2E 0x1F # BAD32: invalid instruction encoding -0x2E 0x1F # GOOD64: c.slli t5, 43 -0x32 0x1F # BAD32: invalid instruction encoding -0x32 0x1F # GOOD64: c.slli t5, 44 -0x36 0x1F # BAD32: invalid instruction encoding -0x36 0x1F # GOOD64: c.slli t5, 45 -0x3A 0x1F # BAD32: invalid instruction encoding -0x3A 0x1F # GOOD64: c.slli t5, 46 -0x3E 0x1F # BAD32: invalid instruction encoding -0x3E 0x1F # GOOD64: c.slli t5, 47 -0x42 0x1F # BAD32: invalid instruction encoding -0x42 0x1F # GOOD64: c.slli t5, 48 -0x46 0x1F # BAD32: invalid instruction encoding -0x46 0x1F # GOOD64: c.slli t5, 49 -0x4A 0x1F # BAD32: invalid instruction encoding -0x4A 0x1F # GOOD64: c.slli t5, 50 -0x4E 0x1F # BAD32: invalid instruction encoding -0x4E 0x1F # GOOD64: c.slli t5, 51 -0x52 0x1F # BAD32: invalid instruction encoding -0x52 0x1F # GOOD64: c.slli t5, 52 -0x56 0x1F # BAD32: invalid instruction encoding -0x56 0x1F # GOOD64: c.slli t5, 53 -0x5A 0x1F # BAD32: invalid instruction encoding -0x5A 0x1F # GOOD64: c.slli t5, 54 -0x5E 0x1F # BAD32: invalid instruction encoding -0x5E 0x1F # GOOD64: c.slli t5, 55 -0x62 0x1F # BAD32: invalid instruction encoding -0x62 0x1F # GOOD64: c.slli t5, 56 -0x66 0x1F # BAD32: invalid instruction encoding -0x66 0x1F # GOOD64: c.slli t5, 57 -0x6A 0x1F # BAD32: invalid instruction encoding -0x6A 0x1F # GOOD64: c.slli t5, 58 -0x6E 0x1F # BAD32: invalid instruction encoding -0x6E 0x1F # GOOD64: c.slli t5, 59 -0x72 0x1F # BAD32: invalid instruction encoding -0x72 0x1F # GOOD64: c.slli t5, 60 -0x76 0x1F # BAD32: invalid instruction encoding -0x76 0x1F # GOOD64: c.slli t5, 61 -0x7A 0x1F # BAD32: invalid instruction encoding -0x7A 0x1F # GOOD64: c.slli t5, 62 -0x7E 0x1F # BAD32: invalid instruction encoding -0x7E 0x1F # GOOD64: c.slli t5, 63 -# GOOD: c.slli64 t6 +0x76 0x00 + +# GOOD: c.slli zero, 30 # NOHINTS: invalid instruction encoding -0x82 0x0F -0x86 0x0F # GOOD: c.slli t6, 1 -0x8A 0x0F # GOOD: c.slli t6, 2 -0x8E 0x0F # GOOD: c.slli t6, 3 -0x92 0x0F # GOOD: c.slli t6, 4 -0x96 0x0F # GOOD: c.slli t6, 5 -0x9A 0x0F # GOOD: c.slli t6, 6 -0x9E 0x0F # GOOD: c.slli t6, 7 -0xA2 0x0F # GOOD: c.slli t6, 8 -0xA6 0x0F # GOOD: c.slli t6, 9 -0xAA 0x0F # GOOD: c.slli t6, 10 -0xAE 0x0F # GOOD: c.slli t6, 11 -0xB2 0x0F # GOOD: c.slli t6, 12 -0xB6 0x0F # GOOD: c.slli t6, 13 -0xBA 0x0F # GOOD: c.slli t6, 14 -0xBE 0x0F # GOOD: c.slli t6, 15 -0xC2 0x0F # GOOD: c.slli t6, 16 -0xC6 0x0F # GOOD: c.slli t6, 17 -0xCA 0x0F # GOOD: c.slli t6, 18 -0xCE 0x0F # GOOD: c.slli t6, 19 -0xD2 0x0F # GOOD: c.slli t6, 20 -0xD6 0x0F # GOOD: c.slli t6, 21 -0xDA 
0x0F # GOOD: c.slli t6, 22 -0xDE 0x0F # GOOD: c.slli t6, 23 -0xE2 0x0F # GOOD: c.slli t6, 24 -0xE6 0x0F # GOOD: c.slli t6, 25 -0xEA 0x0F # GOOD: c.slli t6, 26 -0xEE 0x0F # GOOD: c.slli t6, 27 -0xF2 0x0F # GOOD: c.slli t6, 28 -0xF6 0x0F # GOOD: c.slli t6, 29 -0xFA 0x0F # GOOD: c.slli t6, 30 -0xFE 0x0F # GOOD: c.slli t6, 31 -0x82 0x1F # BAD32: invalid instruction encoding -0x82 0x1F # GOOD64: c.slli t6, 32 -0x86 0x1F # BAD32: invalid instruction encoding -0x86 0x1F # GOOD64: c.slli t6, 33 -0x8A 0x1F # BAD32: invalid instruction encoding -0x8A 0x1F # GOOD64: c.slli t6, 34 -0x8E 0x1F # BAD32: invalid instruction encoding -0x8E 0x1F # GOOD64: c.slli t6, 35 -0x92 0x1F # BAD32: invalid instruction encoding -0x92 0x1F # GOOD64: c.slli t6, 36 -0x96 0x1F # BAD32: invalid instruction encoding -0x96 0x1F # GOOD64: c.slli t6, 37 -0x9A 0x1F # BAD32: invalid instruction encoding -0x9A 0x1F # GOOD64: c.slli t6, 38 -0x9E 0x1F # BAD32: invalid instruction encoding -0x9E 0x1F # GOOD64: c.slli t6, 39 -0xA2 0x1F # BAD32: invalid instruction encoding -0xA2 0x1F # GOOD64: c.slli t6, 40 -0xA6 0x1F # BAD32: invalid instruction encoding -0xA6 0x1F # GOOD64: c.slli t6, 41 -0xAA 0x1F # BAD32: invalid instruction encoding -0xAA 0x1F # GOOD64: c.slli t6, 42 -0xAE 0x1F # BAD32: invalid instruction encoding -0xAE 0x1F # GOOD64: c.slli t6, 43 -0xB2 0x1F # BAD32: invalid instruction encoding -0xB2 0x1F # GOOD64: c.slli t6, 44 -0xB6 0x1F # BAD32: invalid instruction encoding -0xB6 0x1F # GOOD64: c.slli t6, 45 -0xBA 0x1F # BAD32: invalid instruction encoding -0xBA 0x1F # GOOD64: c.slli t6, 46 -0xBE 0x1F # BAD32: invalid instruction encoding -0xBE 0x1F # GOOD64: c.slli t6, 47 -0xC2 0x1F # BAD32: invalid instruction encoding -0xC2 0x1F # GOOD64: c.slli t6, 48 -0xC6 0x1F # BAD32: invalid instruction encoding -0xC6 0x1F # GOOD64: c.slli t6, 49 -0xCA 0x1F # BAD32: invalid instruction encoding -0xCA 0x1F # GOOD64: c.slli t6, 50 -0xCE 0x1F # BAD32: invalid instruction encoding -0xCE 0x1F # GOOD64: c.slli t6, 51 -0xD2 0x1F # BAD32: invalid instruction encoding -0xD2 0x1F # GOOD64: c.slli t6, 52 -0xD6 0x1F # BAD32: invalid instruction encoding -0xD6 0x1F # GOOD64: c.slli t6, 53 -0xDA 0x1F # BAD32: invalid instruction encoding -0xDA 0x1F # GOOD64: c.slli t6, 54 -0xDE 0x1F # BAD32: invalid instruction encoding -0xDE 0x1F # GOOD64: c.slli t6, 55 -0xE2 0x1F # BAD32: invalid instruction encoding -0xE2 0x1F # GOOD64: c.slli t6, 56 -0xE6 0x1F # BAD32: invalid instruction encoding -0xE6 0x1F # GOOD64: c.slli t6, 57 -0xEA 0x1F # BAD32: invalid instruction encoding -0xEA 0x1F # GOOD64: c.slli t6, 58 -0xEE 0x1F # BAD32: invalid instruction encoding -0xEE 0x1F # GOOD64: c.slli t6, 59 -0xF2 0x1F # BAD32: invalid instruction encoding -0xF2 0x1F # GOOD64: c.slli t6, 60 -0xF6 0x1F # BAD32: invalid instruction encoding -0xF6 0x1F # GOOD64: c.slli t6, 61 -0xFA 0x1F # BAD32: invalid instruction encoding -0xFA 0x1F # GOOD64: c.slli t6, 62 -0xFE 0x1F # BAD32: invalid instruction encoding -0xFE 0x1F # GOOD64: c.slli t6, 63 +0x7A 0x00 + +# GOOD: c.slli zero, 31 +# NOHINTS: invalid instruction encoding +0x7E 0x00 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 32 +# NOHINTS: invalid instruction encoding +0x02 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 33 +# NOHINTS: invalid instruction encoding +0x06 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 34 +# NOHINTS: invalid instruction encoding +0x0A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 35 +# NOHINTS: invalid 
instruction encoding +0x0E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 36 +# NOHINTS: invalid instruction encoding +0x12 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 37 +# NOHINTS: invalid instruction encoding +0x16 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 38 +# NOHINTS: invalid instruction encoding +0x1A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 39 +# NOHINTS: invalid instruction encoding +0x1E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 40 +# NOHINTS: invalid instruction encoding +0x22 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 41 +# NOHINTS: invalid instruction encoding +0x26 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 42 +# NOHINTS: invalid instruction encoding +0x2A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 43 +# NOHINTS: invalid instruction encoding +0x2E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 44 +# NOHINTS: invalid instruction encoding +0x32 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 45 +# NOHINTS: invalid instruction encoding +0x36 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 46 +# NOHINTS: invalid instruction encoding +0x3A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 47 +# NOHINTS: invalid instruction encoding +0x3E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 48 +# NOHINTS: invalid instruction encoding +0x42 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 49 +# NOHINTS: invalid instruction encoding +0x46 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 50 +# NOHINTS: invalid instruction encoding +0x4A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 51 +# NOHINTS: invalid instruction encoding +0x4E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 52 +# NOHINTS: invalid instruction encoding +0x52 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 53 +# NOHINTS: invalid instruction encoding +0x56 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 54 +# NOHINTS: invalid instruction encoding +0x5A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 55 +# NOHINTS: invalid instruction encoding +0x5E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 56 +# NOHINTS: invalid instruction encoding +0x62 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 57 +# NOHINTS: invalid instruction encoding +0x66 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 58 +# NOHINTS: invalid instruction encoding +0x6A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 59 +# NOHINTS: invalid instruction encoding +0x6E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 60 +# NOHINTS: invalid instruction encoding +0x72 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 61 +# NOHINTS: invalid instruction encoding +0x76 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 62 +# NOHINTS: invalid instruction encoding +0x7A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli zero, 63 +# NOHINTS: invalid instruction encoding +0x7E 0x10 + +# GOOD: c.slli64 ra +0x82 0x00 + +# GOOD: c.slli ra, 1 +0x86 0x00 + +# GOOD: c.slli ra, 2 +0x8A 0x00 + +# GOOD: c.slli ra, 3 +0x8E 0x00 + +# GOOD: c.slli ra, 4 
+0x92 0x00 + +# GOOD: c.slli ra, 5 +0x96 0x00 + +# GOOD: c.slli ra, 6 +0x9A 0x00 + +# GOOD: c.slli ra, 7 +0x9E 0x00 + +# GOOD: c.slli ra, 8 +0xA2 0x00 + +# GOOD: c.slli ra, 9 +0xA6 0x00 + +# GOOD: c.slli ra, 10 +0xAA 0x00 + +# GOOD: c.slli ra, 11 +0xAE 0x00 + +# GOOD: c.slli ra, 12 +0xB2 0x00 + +# GOOD: c.slli ra, 13 +0xB6 0x00 + +# GOOD: c.slli ra, 14 +0xBA 0x00 + +# GOOD: c.slli ra, 15 +0xBE 0x00 + +# GOOD: c.slli ra, 16 +0xC2 0x00 + +# GOOD: c.slli ra, 17 +0xC6 0x00 + +# GOOD: c.slli ra, 18 +0xCA 0x00 + +# GOOD: c.slli ra, 19 +0xCE 0x00 + +# GOOD: c.slli ra, 20 +0xD2 0x00 + +# GOOD: c.slli ra, 21 +0xD6 0x00 + +# GOOD: c.slli ra, 22 +0xDA 0x00 + +# GOOD: c.slli ra, 23 +0xDE 0x00 + +# GOOD: c.slli ra, 24 +0xE2 0x00 + +# GOOD: c.slli ra, 25 +0xE6 0x00 + +# GOOD: c.slli ra, 26 +0xEA 0x00 + +# GOOD: c.slli ra, 27 +0xEE 0x00 + +# GOOD: c.slli ra, 28 +0xF2 0x00 + +# GOOD: c.slli ra, 29 +0xF6 0x00 + +# GOOD: c.slli ra, 30 +0xFA 0x00 + +# GOOD: c.slli ra, 31 +0xFE 0x00 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 32 +0x82 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 33 +0x86 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 34 +0x8A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 35 +0x8E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 36 +0x92 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 37 +0x96 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 38 +0x9A 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 39 +0x9E 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 40 +0xA2 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 41 +0xA6 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 42 +0xAA 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 43 +0xAE 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 44 +0xB2 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 45 +0xB6 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 46 +0xBA 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 47 +0xBE 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 48 +0xC2 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 49 +0xC6 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 50 +0xCA 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 51 +0xCE 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 52 +0xD2 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 53 +0xD6 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 54 +0xDA 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 55 +0xDE 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 56 +0xE2 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 57 +0xE6 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 58 +0xEA 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 59 +0xEE 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 60 +0xF2 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 61 +0xF6 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 62 +0xFA 0x10 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli ra, 63 +0xFE 0x10 + +# GOOD: c.slli64 sp +0x02 0x01 + +# GOOD: c.slli sp, 1 +0x06 0x01 + +# GOOD: c.slli sp, 2 +0x0A 0x01 + +# 
GOOD: c.slli sp, 3 +0x0E 0x01 + +# GOOD: c.slli sp, 4 +0x12 0x01 + +# GOOD: c.slli sp, 5 +0x16 0x01 + +# GOOD: c.slli sp, 6 +0x1A 0x01 + +# GOOD: c.slli sp, 7 +0x1E 0x01 + +# GOOD: c.slli sp, 8 +0x22 0x01 + +# GOOD: c.slli sp, 9 +0x26 0x01 + +# GOOD: c.slli sp, 10 +0x2A 0x01 + +# GOOD: c.slli sp, 11 +0x2E 0x01 + +# GOOD: c.slli sp, 12 +0x32 0x01 + +# GOOD: c.slli sp, 13 +0x36 0x01 + +# GOOD: c.slli sp, 14 +0x3A 0x01 + +# GOOD: c.slli sp, 15 +0x3E 0x01 + +# GOOD: c.slli sp, 16 +0x42 0x01 + +# GOOD: c.slli sp, 17 +0x46 0x01 + +# GOOD: c.slli sp, 18 +0x4A 0x01 + +# GOOD: c.slli sp, 19 +0x4E 0x01 + +# GOOD: c.slli sp, 20 +0x52 0x01 + +# GOOD: c.slli sp, 21 +0x56 0x01 + +# GOOD: c.slli sp, 22 +0x5A 0x01 + +# GOOD: c.slli sp, 23 +0x5E 0x01 + +# GOOD: c.slli sp, 24 +0x62 0x01 + +# GOOD: c.slli sp, 25 +0x66 0x01 + +# GOOD: c.slli sp, 26 +0x6A 0x01 + +# GOOD: c.slli sp, 27 +0x6E 0x01 + +# GOOD: c.slli sp, 28 +0x72 0x01 + +# GOOD: c.slli sp, 29 +0x76 0x01 + +# GOOD: c.slli sp, 30 +0x7A 0x01 + +# GOOD: c.slli sp, 31 +0x7E 0x01 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 32 +0x02 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 33 +0x06 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 34 +0x0A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 35 +0x0E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 36 +0x12 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 37 +0x16 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 38 +0x1A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 39 +0x1E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 40 +0x22 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 41 +0x26 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 42 +0x2A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 43 +0x2E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 44 +0x32 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 45 +0x36 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 46 +0x3A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 47 +0x3E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 48 +0x42 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 49 +0x46 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 50 +0x4A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 51 +0x4E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 52 +0x52 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 53 +0x56 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 54 +0x5A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 55 +0x5E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 56 +0x62 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 57 +0x66 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 58 +0x6A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 59 +0x6E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 60 +0x72 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 61 +0x76 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 62 +0x7A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli sp, 63 +0x7E 0x11 + +# GOOD: c.slli64 gp +0x82 0x01 + +# GOOD: c.slli gp, 1 
+0x86 0x01 + +# GOOD: c.slli gp, 2 +0x8A 0x01 + +# GOOD: c.slli gp, 3 +0x8E 0x01 + +# GOOD: c.slli gp, 4 +0x92 0x01 + +# GOOD: c.slli gp, 5 +0x96 0x01 + +# GOOD: c.slli gp, 6 +0x9A 0x01 + +# GOOD: c.slli gp, 7 +0x9E 0x01 + +# GOOD: c.slli gp, 8 +0xA2 0x01 + +# GOOD: c.slli gp, 9 +0xA6 0x01 + +# GOOD: c.slli gp, 10 +0xAA 0x01 + +# GOOD: c.slli gp, 11 +0xAE 0x01 + +# GOOD: c.slli gp, 12 +0xB2 0x01 + +# GOOD: c.slli gp, 13 +0xB6 0x01 + +# GOOD: c.slli gp, 14 +0xBA 0x01 + +# GOOD: c.slli gp, 15 +0xBE 0x01 + +# GOOD: c.slli gp, 16 +0xC2 0x01 + +# GOOD: c.slli gp, 17 +0xC6 0x01 + +# GOOD: c.slli gp, 18 +0xCA 0x01 + +# GOOD: c.slli gp, 19 +0xCE 0x01 + +# GOOD: c.slli gp, 20 +0xD2 0x01 + +# GOOD: c.slli gp, 21 +0xD6 0x01 + +# GOOD: c.slli gp, 22 +0xDA 0x01 + +# GOOD: c.slli gp, 23 +0xDE 0x01 + +# GOOD: c.slli gp, 24 +0xE2 0x01 + +# GOOD: c.slli gp, 25 +0xE6 0x01 + +# GOOD: c.slli gp, 26 +0xEA 0x01 + +# GOOD: c.slli gp, 27 +0xEE 0x01 + +# GOOD: c.slli gp, 28 +0xF2 0x01 + +# GOOD: c.slli gp, 29 +0xF6 0x01 + +# GOOD: c.slli gp, 30 +0xFA 0x01 + +# GOOD: c.slli gp, 31 +0xFE 0x01 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 32 +0x82 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 33 +0x86 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 34 +0x8A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 35 +0x8E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 36 +0x92 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 37 +0x96 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 38 +0x9A 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 39 +0x9E 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 40 +0xA2 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 41 +0xA6 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 42 +0xAA 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 43 +0xAE 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 44 +0xB2 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 45 +0xB6 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 46 +0xBA 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 47 +0xBE 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 48 +0xC2 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 49 +0xC6 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 50 +0xCA 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 51 +0xCE 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 52 +0xD2 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 53 +0xD6 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 54 +0xDA 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 55 +0xDE 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 56 +0xE2 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 57 +0xE6 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 58 +0xEA 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 59 +0xEE 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 60 +0xF2 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 61 +0xF6 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 62 +0xFA 0x11 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli gp, 63 +0xFE 0x11 + +# 
GOOD: c.slli64 tp +0x02 0x02 + +# GOOD: c.slli tp, 1 +0x06 0x02 + +# GOOD: c.slli tp, 2 +0x0A 0x02 + +# GOOD: c.slli tp, 3 +0x0E 0x02 + +# GOOD: c.slli tp, 4 +0x12 0x02 + +# GOOD: c.slli tp, 5 +0x16 0x02 + +# GOOD: c.slli tp, 6 +0x1A 0x02 + +# GOOD: c.slli tp, 7 +0x1E 0x02 + +# GOOD: c.slli tp, 8 +0x22 0x02 + +# GOOD: c.slli tp, 9 +0x26 0x02 + +# GOOD: c.slli tp, 10 +0x2A 0x02 + +# GOOD: c.slli tp, 11 +0x2E 0x02 + +# GOOD: c.slli tp, 12 +0x32 0x02 + +# GOOD: c.slli tp, 13 +0x36 0x02 + +# GOOD: c.slli tp, 14 +0x3A 0x02 + +# GOOD: c.slli tp, 15 +0x3E 0x02 + +# GOOD: c.slli tp, 16 +0x42 0x02 + +# GOOD: c.slli tp, 17 +0x46 0x02 + +# GOOD: c.slli tp, 18 +0x4A 0x02 + +# GOOD: c.slli tp, 19 +0x4E 0x02 + +# GOOD: c.slli tp, 20 +0x52 0x02 + +# GOOD: c.slli tp, 21 +0x56 0x02 + +# GOOD: c.slli tp, 22 +0x5A 0x02 + +# GOOD: c.slli tp, 23 +0x5E 0x02 + +# GOOD: c.slli tp, 24 +0x62 0x02 + +# GOOD: c.slli tp, 25 +0x66 0x02 + +# GOOD: c.slli tp, 26 +0x6A 0x02 + +# GOOD: c.slli tp, 27 +0x6E 0x02 + +# GOOD: c.slli tp, 28 +0x72 0x02 + +# GOOD: c.slli tp, 29 +0x76 0x02 + +# GOOD: c.slli tp, 30 +0x7A 0x02 + +# GOOD: c.slli tp, 31 +0x7E 0x02 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 32 +0x02 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 33 +0x06 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 34 +0x0A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 35 +0x0E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 36 +0x12 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 37 +0x16 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 38 +0x1A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 39 +0x1E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 40 +0x22 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 41 +0x26 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 42 +0x2A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 43 +0x2E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 44 +0x32 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 45 +0x36 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 46 +0x3A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 47 +0x3E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 48 +0x42 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 49 +0x46 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 50 +0x4A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 51 +0x4E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 52 +0x52 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 53 +0x56 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 54 +0x5A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 55 +0x5E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 56 +0x62 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 57 +0x66 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 58 +0x6A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 59 +0x6E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 60 +0x72 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 61 +0x76 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli tp, 62 +0x7A 0x12 + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli tp, 63 +0x7E 0x12 + +# GOOD: c.slli64 t0 +0x82 0x02 + +# GOOD: c.slli t0, 1 +0x86 0x02 + +# GOOD: c.slli t0, 2 +0x8A 0x02 + +# GOOD: c.slli t0, 3 +0x8E 0x02 + +# GOOD: c.slli t0, 4 +0x92 0x02 + +# GOOD: c.slli t0, 5 +0x96 0x02 + +# GOOD: c.slli t0, 6 +0x9A 0x02 + +# GOOD: c.slli t0, 7 +0x9E 0x02 + +# GOOD: c.slli t0, 8 +0xA2 0x02 + +# GOOD: c.slli t0, 9 +0xA6 0x02 + +# GOOD: c.slli t0, 10 +0xAA 0x02 + +# GOOD: c.slli t0, 11 +0xAE 0x02 + +# GOOD: c.slli t0, 12 +0xB2 0x02 + +# GOOD: c.slli t0, 13 +0xB6 0x02 + +# GOOD: c.slli t0, 14 +0xBA 0x02 + +# GOOD: c.slli t0, 15 +0xBE 0x02 + +# GOOD: c.slli t0, 16 +0xC2 0x02 + +# GOOD: c.slli t0, 17 +0xC6 0x02 + +# GOOD: c.slli t0, 18 +0xCA 0x02 + +# GOOD: c.slli t0, 19 +0xCE 0x02 + +# GOOD: c.slli t0, 20 +0xD2 0x02 + +# GOOD: c.slli t0, 21 +0xD6 0x02 + +# GOOD: c.slli t0, 22 +0xDA 0x02 + +# GOOD: c.slli t0, 23 +0xDE 0x02 + +# GOOD: c.slli t0, 24 +0xE2 0x02 + +# GOOD: c.slli t0, 25 +0xE6 0x02 + +# GOOD: c.slli t0, 26 +0xEA 0x02 + +# GOOD: c.slli t0, 27 +0xEE 0x02 + +# GOOD: c.slli t0, 28 +0xF2 0x02 + +# GOOD: c.slli t0, 29 +0xF6 0x02 + +# GOOD: c.slli t0, 30 +0xFA 0x02 + +# GOOD: c.slli t0, 31 +0xFE 0x02 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 32 +0x82 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 33 +0x86 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 34 +0x8A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 35 +0x8E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 36 +0x92 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 37 +0x96 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 38 +0x9A 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 39 +0x9E 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 40 +0xA2 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 41 +0xA6 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 42 +0xAA 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 43 +0xAE 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 44 +0xB2 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 45 +0xB6 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 46 +0xBA 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 47 +0xBE 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 48 +0xC2 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 49 +0xC6 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 50 +0xCA 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 51 +0xCE 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 52 +0xD2 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 53 +0xD6 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 54 +0xDA 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 55 +0xDE 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 56 +0xE2 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 57 +0xE6 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 58 +0xEA 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 59 +0xEE 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 60 +0xF2 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 61 +0xF6 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli 
t0, 62 +0xFA 0x12 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t0, 63 +0xFE 0x12 + +# GOOD: c.slli64 t1 +0x02 0x03 + +# GOOD: c.slli t1, 1 +0x06 0x03 + +# GOOD: c.slli t1, 2 +0x0A 0x03 + +# GOOD: c.slli t1, 3 +0x0E 0x03 + +# GOOD: c.slli t1, 4 +0x12 0x03 + +# GOOD: c.slli t1, 5 +0x16 0x03 + +# GOOD: c.slli t1, 6 +0x1A 0x03 + +# GOOD: c.slli t1, 7 +0x1E 0x03 + +# GOOD: c.slli t1, 8 +0x22 0x03 + +# GOOD: c.slli t1, 9 +0x26 0x03 + +# GOOD: c.slli t1, 10 +0x2A 0x03 + +# GOOD: c.slli t1, 11 +0x2E 0x03 + +# GOOD: c.slli t1, 12 +0x32 0x03 + +# GOOD: c.slli t1, 13 +0x36 0x03 + +# GOOD: c.slli t1, 14 +0x3A 0x03 + +# GOOD: c.slli t1, 15 +0x3E 0x03 + +# GOOD: c.slli t1, 16 +0x42 0x03 + +# GOOD: c.slli t1, 17 +0x46 0x03 + +# GOOD: c.slli t1, 18 +0x4A 0x03 + +# GOOD: c.slli t1, 19 +0x4E 0x03 + +# GOOD: c.slli t1, 20 +0x52 0x03 + +# GOOD: c.slli t1, 21 +0x56 0x03 + +# GOOD: c.slli t1, 22 +0x5A 0x03 + +# GOOD: c.slli t1, 23 +0x5E 0x03 + +# GOOD: c.slli t1, 24 +0x62 0x03 + +# GOOD: c.slli t1, 25 +0x66 0x03 + +# GOOD: c.slli t1, 26 +0x6A 0x03 + +# GOOD: c.slli t1, 27 +0x6E 0x03 + +# GOOD: c.slli t1, 28 +0x72 0x03 + +# GOOD: c.slli t1, 29 +0x76 0x03 + +# GOOD: c.slli t1, 30 +0x7A 0x03 + +# GOOD: c.slli t1, 31 +0x7E 0x03 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 32 +0x02 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 33 +0x06 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 34 +0x0A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 35 +0x0E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 36 +0x12 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 37 +0x16 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 38 +0x1A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 39 +0x1E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 40 +0x22 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 41 +0x26 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 42 +0x2A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 43 +0x2E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 44 +0x32 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 45 +0x36 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 46 +0x3A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 47 +0x3E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 48 +0x42 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 49 +0x46 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 50 +0x4A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 51 +0x4E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 52 +0x52 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 53 +0x56 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 54 +0x5A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 55 +0x5E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 56 +0x62 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 57 +0x66 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 58 +0x6A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 59 +0x6E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 60 +0x72 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 61 +0x76 0x13 + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 62 +0x7A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t1, 63 +0x7E 0x13 + +# GOOD: c.slli64 t2 +0x82 0x03 + +# GOOD: c.slli t2, 1 +0x86 0x03 + +# GOOD: c.slli t2, 2 +0x8A 0x03 + +# GOOD: c.slli t2, 3 +0x8E 0x03 + +# GOOD: c.slli t2, 4 +0x92 0x03 + +# GOOD: c.slli t2, 5 +0x96 0x03 + +# GOOD: c.slli t2, 6 +0x9A 0x03 + +# GOOD: c.slli t2, 7 +0x9E 0x03 + +# GOOD: c.slli t2, 8 +0xA2 0x03 + +# GOOD: c.slli t2, 9 +0xA6 0x03 + +# GOOD: c.slli t2, 10 +0xAA 0x03 + +# GOOD: c.slli t2, 11 +0xAE 0x03 + +# GOOD: c.slli t2, 12 +0xB2 0x03 + +# GOOD: c.slli t2, 13 +0xB6 0x03 + +# GOOD: c.slli t2, 14 +0xBA 0x03 + +# GOOD: c.slli t2, 15 +0xBE 0x03 + +# GOOD: c.slli t2, 16 +0xC2 0x03 + +# GOOD: c.slli t2, 17 +0xC6 0x03 + +# GOOD: c.slli t2, 18 +0xCA 0x03 + +# GOOD: c.slli t2, 19 +0xCE 0x03 + +# GOOD: c.slli t2, 20 +0xD2 0x03 + +# GOOD: c.slli t2, 21 +0xD6 0x03 + +# GOOD: c.slli t2, 22 +0xDA 0x03 + +# GOOD: c.slli t2, 23 +0xDE 0x03 + +# GOOD: c.slli t2, 24 +0xE2 0x03 + +# GOOD: c.slli t2, 25 +0xE6 0x03 + +# GOOD: c.slli t2, 26 +0xEA 0x03 + +# GOOD: c.slli t2, 27 +0xEE 0x03 + +# GOOD: c.slli t2, 28 +0xF2 0x03 + +# GOOD: c.slli t2, 29 +0xF6 0x03 + +# GOOD: c.slli t2, 30 +0xFA 0x03 + +# GOOD: c.slli t2, 31 +0xFE 0x03 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 32 +0x82 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 33 +0x86 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 34 +0x8A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 35 +0x8E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 36 +0x92 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 37 +0x96 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 38 +0x9A 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 39 +0x9E 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 40 +0xA2 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 41 +0xA6 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 42 +0xAA 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 43 +0xAE 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 44 +0xB2 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 45 +0xB6 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 46 +0xBA 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 47 +0xBE 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 48 +0xC2 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 49 +0xC6 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 50 +0xCA 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 51 +0xCE 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 52 +0xD2 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 53 +0xD6 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 54 +0xDA 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 55 +0xDE 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 56 +0xE2 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 57 +0xE6 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 58 +0xEA 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 59 +0xEE 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 60 +0xF2 0x13 + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli t2, 61 +0xF6 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 62 +0xFA 0x13 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t2, 63 +0xFE 0x13 + +# GOOD: c.slli64 s0 +0x02 0x04 + +# GOOD: c.slli s0, 1 +0x06 0x04 + +# GOOD: c.slli s0, 2 +0x0A 0x04 + +# GOOD: c.slli s0, 3 +0x0E 0x04 + +# GOOD: c.slli s0, 4 +0x12 0x04 + +# GOOD: c.slli s0, 5 +0x16 0x04 + +# GOOD: c.slli s0, 6 +0x1A 0x04 + +# GOOD: c.slli s0, 7 +0x1E 0x04 + +# GOOD: c.slli s0, 8 +0x22 0x04 + +# GOOD: c.slli s0, 9 +0x26 0x04 + +# GOOD: c.slli s0, 10 +0x2A 0x04 + +# GOOD: c.slli s0, 11 +0x2E 0x04 + +# GOOD: c.slli s0, 12 +0x32 0x04 + +# GOOD: c.slli s0, 13 +0x36 0x04 + +# GOOD: c.slli s0, 14 +0x3A 0x04 + +# GOOD: c.slli s0, 15 +0x3E 0x04 + +# GOOD: c.slli s0, 16 +0x42 0x04 + +# GOOD: c.slli s0, 17 +0x46 0x04 + +# GOOD: c.slli s0, 18 +0x4A 0x04 + +# GOOD: c.slli s0, 19 +0x4E 0x04 + +# GOOD: c.slli s0, 20 +0x52 0x04 + +# GOOD: c.slli s0, 21 +0x56 0x04 + +# GOOD: c.slli s0, 22 +0x5A 0x04 + +# GOOD: c.slli s0, 23 +0x5E 0x04 + +# GOOD: c.slli s0, 24 +0x62 0x04 + +# GOOD: c.slli s0, 25 +0x66 0x04 + +# GOOD: c.slli s0, 26 +0x6A 0x04 + +# GOOD: c.slli s0, 27 +0x6E 0x04 + +# GOOD: c.slli s0, 28 +0x72 0x04 + +# GOOD: c.slli s0, 29 +0x76 0x04 + +# GOOD: c.slli s0, 30 +0x7A 0x04 + +# GOOD: c.slli s0, 31 +0x7E 0x04 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 32 +0x02 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 33 +0x06 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 34 +0x0A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 35 +0x0E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 36 +0x12 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 37 +0x16 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 38 +0x1A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 39 +0x1E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 40 +0x22 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 41 +0x26 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 42 +0x2A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 43 +0x2E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 44 +0x32 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 45 +0x36 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 46 +0x3A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 47 +0x3E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 48 +0x42 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 49 +0x46 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 50 +0x4A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 51 +0x4E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 52 +0x52 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 53 +0x56 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 54 +0x5A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 55 +0x5E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 56 +0x62 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 57 +0x66 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 58 +0x6A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 59 +0x6E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli 
s0, 60 +0x72 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 61 +0x76 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 62 +0x7A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s0, 63 +0x7E 0x14 + +# GOOD: c.slli64 s1 +0x82 0x04 + +# GOOD: c.slli s1, 1 +0x86 0x04 + +# GOOD: c.slli s1, 2 +0x8A 0x04 + +# GOOD: c.slli s1, 3 +0x8E 0x04 + +# GOOD: c.slli s1, 4 +0x92 0x04 + +# GOOD: c.slli s1, 5 +0x96 0x04 + +# GOOD: c.slli s1, 6 +0x9A 0x04 + +# GOOD: c.slli s1, 7 +0x9E 0x04 + +# GOOD: c.slli s1, 8 +0xA2 0x04 + +# GOOD: c.slli s1, 9 +0xA6 0x04 + +# GOOD: c.slli s1, 10 +0xAA 0x04 + +# GOOD: c.slli s1, 11 +0xAE 0x04 + +# GOOD: c.slli s1, 12 +0xB2 0x04 + +# GOOD: c.slli s1, 13 +0xB6 0x04 + +# GOOD: c.slli s1, 14 +0xBA 0x04 + +# GOOD: c.slli s1, 15 +0xBE 0x04 + +# GOOD: c.slli s1, 16 +0xC2 0x04 + +# GOOD: c.slli s1, 17 +0xC6 0x04 + +# GOOD: c.slli s1, 18 +0xCA 0x04 + +# GOOD: c.slli s1, 19 +0xCE 0x04 + +# GOOD: c.slli s1, 20 +0xD2 0x04 + +# GOOD: c.slli s1, 21 +0xD6 0x04 + +# GOOD: c.slli s1, 22 +0xDA 0x04 + +# GOOD: c.slli s1, 23 +0xDE 0x04 + +# GOOD: c.slli s1, 24 +0xE2 0x04 + +# GOOD: c.slli s1, 25 +0xE6 0x04 + +# GOOD: c.slli s1, 26 +0xEA 0x04 + +# GOOD: c.slli s1, 27 +0xEE 0x04 + +# GOOD: c.slli s1, 28 +0xF2 0x04 + +# GOOD: c.slli s1, 29 +0xF6 0x04 + +# GOOD: c.slli s1, 30 +0xFA 0x04 + +# GOOD: c.slli s1, 31 +0xFE 0x04 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 32 +0x82 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 33 +0x86 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 34 +0x8A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 35 +0x8E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 36 +0x92 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 37 +0x96 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 38 +0x9A 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 39 +0x9E 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 40 +0xA2 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 41 +0xA6 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 42 +0xAA 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 43 +0xAE 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 44 +0xB2 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 45 +0xB6 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 46 +0xBA 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 47 +0xBE 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 48 +0xC2 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 49 +0xC6 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 50 +0xCA 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 51 +0xCE 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 52 +0xD2 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 53 +0xD6 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 54 +0xDA 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 55 +0xDE 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 56 +0xE2 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 57 +0xE6 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 58 +0xEA 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 59 +0xEE 0x14 + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 60 +0xF2 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 61 +0xF6 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 62 +0xFA 0x14 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s1, 63 +0xFE 0x14 + +# GOOD: c.slli64 a0 +0x02 0x05 + +# GOOD: c.slli a0, 1 +0x06 0x05 + +# GOOD: c.slli a0, 2 +0x0A 0x05 + +# GOOD: c.slli a0, 3 +0x0E 0x05 + +# GOOD: c.slli a0, 4 +0x12 0x05 + +# GOOD: c.slli a0, 5 +0x16 0x05 + +# GOOD: c.slli a0, 6 +0x1A 0x05 + +# GOOD: c.slli a0, 7 +0x1E 0x05 + +# GOOD: c.slli a0, 8 +0x22 0x05 + +# GOOD: c.slli a0, 9 +0x26 0x05 + +# GOOD: c.slli a0, 10 +0x2A 0x05 + +# GOOD: c.slli a0, 11 +0x2E 0x05 + +# GOOD: c.slli a0, 12 +0x32 0x05 + +# GOOD: c.slli a0, 13 +0x36 0x05 + +# GOOD: c.slli a0, 14 +0x3A 0x05 + +# GOOD: c.slli a0, 15 +0x3E 0x05 + +# GOOD: c.slli a0, 16 +0x42 0x05 + +# GOOD: c.slli a0, 17 +0x46 0x05 + +# GOOD: c.slli a0, 18 +0x4A 0x05 + +# GOOD: c.slli a0, 19 +0x4E 0x05 + +# GOOD: c.slli a0, 20 +0x52 0x05 + +# GOOD: c.slli a0, 21 +0x56 0x05 + +# GOOD: c.slli a0, 22 +0x5A 0x05 + +# GOOD: c.slli a0, 23 +0x5E 0x05 + +# GOOD: c.slli a0, 24 +0x62 0x05 + +# GOOD: c.slli a0, 25 +0x66 0x05 + +# GOOD: c.slli a0, 26 +0x6A 0x05 + +# GOOD: c.slli a0, 27 +0x6E 0x05 + +# GOOD: c.slli a0, 28 +0x72 0x05 + +# GOOD: c.slli a0, 29 +0x76 0x05 + +# GOOD: c.slli a0, 30 +0x7A 0x05 + +# GOOD: c.slli a0, 31 +0x7E 0x05 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 32 +0x02 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 33 +0x06 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 34 +0x0A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 35 +0x0E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 36 +0x12 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 37 +0x16 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 38 +0x1A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 39 +0x1E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 40 +0x22 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 41 +0x26 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 42 +0x2A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 43 +0x2E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 44 +0x32 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 45 +0x36 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 46 +0x3A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 47 +0x3E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 48 +0x42 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 49 +0x46 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 50 +0x4A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 51 +0x4E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 52 +0x52 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 53 +0x56 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 54 +0x5A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 55 +0x5E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 56 +0x62 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 57 +0x66 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 58 +0x6A 0x15 + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli a0, 59 +0x6E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 60 +0x72 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 61 +0x76 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 62 +0x7A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a0, 63 +0x7E 0x15 + +# GOOD: c.slli64 a1 +0x82 0x05 + +# GOOD: c.slli a1, 1 +0x86 0x05 + +# GOOD: c.slli a1, 2 +0x8A 0x05 + +# GOOD: c.slli a1, 3 +0x8E 0x05 + +# GOOD: c.slli a1, 4 +0x92 0x05 + +# GOOD: c.slli a1, 5 +0x96 0x05 + +# GOOD: c.slli a1, 6 +0x9A 0x05 + +# GOOD: c.slli a1, 7 +0x9E 0x05 + +# GOOD: c.slli a1, 8 +0xA2 0x05 + +# GOOD: c.slli a1, 9 +0xA6 0x05 + +# GOOD: c.slli a1, 10 +0xAA 0x05 + +# GOOD: c.slli a1, 11 +0xAE 0x05 + +# GOOD: c.slli a1, 12 +0xB2 0x05 + +# GOOD: c.slli a1, 13 +0xB6 0x05 + +# GOOD: c.slli a1, 14 +0xBA 0x05 + +# GOOD: c.slli a1, 15 +0xBE 0x05 + +# GOOD: c.slli a1, 16 +0xC2 0x05 + +# GOOD: c.slli a1, 17 +0xC6 0x05 + +# GOOD: c.slli a1, 18 +0xCA 0x05 + +# GOOD: c.slli a1, 19 +0xCE 0x05 + +# GOOD: c.slli a1, 20 +0xD2 0x05 + +# GOOD: c.slli a1, 21 +0xD6 0x05 + +# GOOD: c.slli a1, 22 +0xDA 0x05 + +# GOOD: c.slli a1, 23 +0xDE 0x05 + +# GOOD: c.slli a1, 24 +0xE2 0x05 + +# GOOD: c.slli a1, 25 +0xE6 0x05 + +# GOOD: c.slli a1, 26 +0xEA 0x05 + +# GOOD: c.slli a1, 27 +0xEE 0x05 + +# GOOD: c.slli a1, 28 +0xF2 0x05 + +# GOOD: c.slli a1, 29 +0xF6 0x05 + +# GOOD: c.slli a1, 30 +0xFA 0x05 + +# GOOD: c.slli a1, 31 +0xFE 0x05 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 32 +0x82 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 33 +0x86 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 34 +0x8A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 35 +0x8E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 36 +0x92 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 37 +0x96 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 38 +0x9A 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 39 +0x9E 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 40 +0xA2 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 41 +0xA6 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 42 +0xAA 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 43 +0xAE 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 44 +0xB2 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 45 +0xB6 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 46 +0xBA 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 47 +0xBE 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 48 +0xC2 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 49 +0xC6 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 50 +0xCA 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 51 +0xCE 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 52 +0xD2 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 53 +0xD6 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 54 +0xDA 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 55 +0xDE 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 56 +0xE2 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 57 +0xE6 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli 
a1, 58 +0xEA 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 59 +0xEE 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 60 +0xF2 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 61 +0xF6 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 62 +0xFA 0x15 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a1, 63 +0xFE 0x15 + +# GOOD: c.slli64 a2 +0x02 0x06 + +# GOOD: c.slli a2, 1 +0x06 0x06 + +# GOOD: c.slli a2, 2 +0x0A 0x06 + +# GOOD: c.slli a2, 3 +0x0E 0x06 + +# GOOD: c.slli a2, 4 +0x12 0x06 + +# GOOD: c.slli a2, 5 +0x16 0x06 + +# GOOD: c.slli a2, 6 +0x1A 0x06 + +# GOOD: c.slli a2, 7 +0x1E 0x06 + +# GOOD: c.slli a2, 8 +0x22 0x06 + +# GOOD: c.slli a2, 9 +0x26 0x06 + +# GOOD: c.slli a2, 10 +0x2A 0x06 + +# GOOD: c.slli a2, 11 +0x2E 0x06 + +# GOOD: c.slli a2, 12 +0x32 0x06 + +# GOOD: c.slli a2, 13 +0x36 0x06 + +# GOOD: c.slli a2, 14 +0x3A 0x06 + +# GOOD: c.slli a2, 15 +0x3E 0x06 + +# GOOD: c.slli a2, 16 +0x42 0x06 + +# GOOD: c.slli a2, 17 +0x46 0x06 + +# GOOD: c.slli a2, 18 +0x4A 0x06 + +# GOOD: c.slli a2, 19 +0x4E 0x06 + +# GOOD: c.slli a2, 20 +0x52 0x06 + +# GOOD: c.slli a2, 21 +0x56 0x06 + +# GOOD: c.slli a2, 22 +0x5A 0x06 + +# GOOD: c.slli a2, 23 +0x5E 0x06 + +# GOOD: c.slli a2, 24 +0x62 0x06 + +# GOOD: c.slli a2, 25 +0x66 0x06 + +# GOOD: c.slli a2, 26 +0x6A 0x06 + +# GOOD: c.slli a2, 27 +0x6E 0x06 + +# GOOD: c.slli a2, 28 +0x72 0x06 + +# GOOD: c.slli a2, 29 +0x76 0x06 + +# GOOD: c.slli a2, 30 +0x7A 0x06 + +# GOOD: c.slli a2, 31 +0x7E 0x06 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 32 +0x02 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 33 +0x06 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 34 +0x0A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 35 +0x0E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 36 +0x12 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 37 +0x16 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 38 +0x1A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 39 +0x1E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 40 +0x22 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 41 +0x26 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 42 +0x2A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 43 +0x2E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 44 +0x32 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 45 +0x36 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 46 +0x3A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 47 +0x3E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 48 +0x42 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 49 +0x46 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 50 +0x4A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 51 +0x4E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 52 +0x52 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 53 +0x56 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 54 +0x5A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 55 +0x5E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 56 +0x62 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 57 +0x66 0x16 + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 58 +0x6A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 59 +0x6E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 60 +0x72 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 61 +0x76 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 62 +0x7A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a2, 63 +0x7E 0x16 + +# GOOD: c.slli64 a3 +0x82 0x06 + +# GOOD: c.slli a3, 1 +0x86 0x06 + +# GOOD: c.slli a3, 2 +0x8A 0x06 + +# GOOD: c.slli a3, 3 +0x8E 0x06 + +# GOOD: c.slli a3, 4 +0x92 0x06 + +# GOOD: c.slli a3, 5 +0x96 0x06 + +# GOOD: c.slli a3, 6 +0x9A 0x06 + +# GOOD: c.slli a3, 7 +0x9E 0x06 + +# GOOD: c.slli a3, 8 +0xA2 0x06 + +# GOOD: c.slli a3, 9 +0xA6 0x06 + +# GOOD: c.slli a3, 10 +0xAA 0x06 + +# GOOD: c.slli a3, 11 +0xAE 0x06 + +# GOOD: c.slli a3, 12 +0xB2 0x06 + +# GOOD: c.slli a3, 13 +0xB6 0x06 + +# GOOD: c.slli a3, 14 +0xBA 0x06 + +# GOOD: c.slli a3, 15 +0xBE 0x06 + +# GOOD: c.slli a3, 16 +0xC2 0x06 + +# GOOD: c.slli a3, 17 +0xC6 0x06 + +# GOOD: c.slli a3, 18 +0xCA 0x06 + +# GOOD: c.slli a3, 19 +0xCE 0x06 + +# GOOD: c.slli a3, 20 +0xD2 0x06 + +# GOOD: c.slli a3, 21 +0xD6 0x06 + +# GOOD: c.slli a3, 22 +0xDA 0x06 + +# GOOD: c.slli a3, 23 +0xDE 0x06 + +# GOOD: c.slli a3, 24 +0xE2 0x06 + +# GOOD: c.slli a3, 25 +0xE6 0x06 + +# GOOD: c.slli a3, 26 +0xEA 0x06 + +# GOOD: c.slli a3, 27 +0xEE 0x06 + +# GOOD: c.slli a3, 28 +0xF2 0x06 + +# GOOD: c.slli a3, 29 +0xF6 0x06 + +# GOOD: c.slli a3, 30 +0xFA 0x06 + +# GOOD: c.slli a3, 31 +0xFE 0x06 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 32 +0x82 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 33 +0x86 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 34 +0x8A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 35 +0x8E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 36 +0x92 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 37 +0x96 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 38 +0x9A 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 39 +0x9E 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 40 +0xA2 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 41 +0xA6 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 42 +0xAA 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 43 +0xAE 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 44 +0xB2 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 45 +0xB6 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 46 +0xBA 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 47 +0xBE 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 48 +0xC2 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 49 +0xC6 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 50 +0xCA 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 51 +0xCE 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 52 +0xD2 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 53 +0xD6 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 54 +0xDA 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 55 +0xDE 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 56 +0xE2 0x16 + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli a3, 57 +0xE6 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 58 +0xEA 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 59 +0xEE 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 60 +0xF2 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 61 +0xF6 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 62 +0xFA 0x16 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a3, 63 +0xFE 0x16 + +# GOOD: c.slli64 a4 +0x02 0x07 + +# GOOD: c.slli a4, 1 +0x06 0x07 + +# GOOD: c.slli a4, 2 +0x0A 0x07 + +# GOOD: c.slli a4, 3 +0x0E 0x07 + +# GOOD: c.slli a4, 4 +0x12 0x07 + +# GOOD: c.slli a4, 5 +0x16 0x07 + +# GOOD: c.slli a4, 6 +0x1A 0x07 + +# GOOD: c.slli a4, 7 +0x1E 0x07 + +# GOOD: c.slli a4, 8 +0x22 0x07 + +# GOOD: c.slli a4, 9 +0x26 0x07 + +# GOOD: c.slli a4, 10 +0x2A 0x07 + +# GOOD: c.slli a4, 11 +0x2E 0x07 + +# GOOD: c.slli a4, 12 +0x32 0x07 + +# GOOD: c.slli a4, 13 +0x36 0x07 + +# GOOD: c.slli a4, 14 +0x3A 0x07 + +# GOOD: c.slli a4, 15 +0x3E 0x07 + +# GOOD: c.slli a4, 16 +0x42 0x07 + +# GOOD: c.slli a4, 17 +0x46 0x07 + +# GOOD: c.slli a4, 18 +0x4A 0x07 + +# GOOD: c.slli a4, 19 +0x4E 0x07 + +# GOOD: c.slli a4, 20 +0x52 0x07 + +# GOOD: c.slli a4, 21 +0x56 0x07 + +# GOOD: c.slli a4, 22 +0x5A 0x07 + +# GOOD: c.slli a4, 23 +0x5E 0x07 + +# GOOD: c.slli a4, 24 +0x62 0x07 + +# GOOD: c.slli a4, 25 +0x66 0x07 + +# GOOD: c.slli a4, 26 +0x6A 0x07 + +# GOOD: c.slli a4, 27 +0x6E 0x07 + +# GOOD: c.slli a4, 28 +0x72 0x07 + +# GOOD: c.slli a4, 29 +0x76 0x07 + +# GOOD: c.slli a4, 30 +0x7A 0x07 + +# GOOD: c.slli a4, 31 +0x7E 0x07 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 32 +0x02 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 33 +0x06 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 34 +0x0A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 35 +0x0E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 36 +0x12 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 37 +0x16 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 38 +0x1A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 39 +0x1E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 40 +0x22 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 41 +0x26 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 42 +0x2A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 43 +0x2E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 44 +0x32 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 45 +0x36 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 46 +0x3A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 47 +0x3E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 48 +0x42 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 49 +0x46 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 50 +0x4A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 51 +0x4E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 52 +0x52 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 53 +0x56 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 54 +0x5A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 55 +0x5E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli 
a4, 56 +0x62 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 57 +0x66 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 58 +0x6A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 59 +0x6E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 60 +0x72 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 61 +0x76 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 62 +0x7A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a4, 63 +0x7E 0x17 + +# GOOD: c.slli64 a5 +0x82 0x07 + +# GOOD: c.slli a5, 1 +0x86 0x07 + +# GOOD: c.slli a5, 2 +0x8A 0x07 + +# GOOD: c.slli a5, 3 +0x8E 0x07 + +# GOOD: c.slli a5, 4 +0x92 0x07 + +# GOOD: c.slli a5, 5 +0x96 0x07 + +# GOOD: c.slli a5, 6 +0x9A 0x07 + +# GOOD: c.slli a5, 7 +0x9E 0x07 + +# GOOD: c.slli a5, 8 +0xA2 0x07 + +# GOOD: c.slli a5, 9 +0xA6 0x07 + +# GOOD: c.slli a5, 10 +0xAA 0x07 + +# GOOD: c.slli a5, 11 +0xAE 0x07 + +# GOOD: c.slli a5, 12 +0xB2 0x07 + +# GOOD: c.slli a5, 13 +0xB6 0x07 + +# GOOD: c.slli a5, 14 +0xBA 0x07 + +# GOOD: c.slli a5, 15 +0xBE 0x07 + +# GOOD: c.slli a5, 16 +0xC2 0x07 + +# GOOD: c.slli a5, 17 +0xC6 0x07 + +# GOOD: c.slli a5, 18 +0xCA 0x07 + +# GOOD: c.slli a5, 19 +0xCE 0x07 + +# GOOD: c.slli a5, 20 +0xD2 0x07 + +# GOOD: c.slli a5, 21 +0xD6 0x07 + +# GOOD: c.slli a5, 22 +0xDA 0x07 + +# GOOD: c.slli a5, 23 +0xDE 0x07 + +# GOOD: c.slli a5, 24 +0xE2 0x07 + +# GOOD: c.slli a5, 25 +0xE6 0x07 + +# GOOD: c.slli a5, 26 +0xEA 0x07 + +# GOOD: c.slli a5, 27 +0xEE 0x07 + +# GOOD: c.slli a5, 28 +0xF2 0x07 + +# GOOD: c.slli a5, 29 +0xF6 0x07 + +# GOOD: c.slli a5, 30 +0xFA 0x07 + +# GOOD: c.slli a5, 31 +0xFE 0x07 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 32 +0x82 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 33 +0x86 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 34 +0x8A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 35 +0x8E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 36 +0x92 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 37 +0x96 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 38 +0x9A 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 39 +0x9E 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 40 +0xA2 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 41 +0xA6 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 42 +0xAA 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 43 +0xAE 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 44 +0xB2 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 45 +0xB6 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 46 +0xBA 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 47 +0xBE 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 48 +0xC2 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 49 +0xC6 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 50 +0xCA 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 51 +0xCE 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 52 +0xD2 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 53 +0xD6 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 54 +0xDA 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 55 +0xDE 0x17 + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 56 +0xE2 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 57 +0xE6 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 58 +0xEA 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 59 +0xEE 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 60 +0xF2 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 61 +0xF6 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 62 +0xFA 0x17 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a5, 63 +0xFE 0x17 + +# GOOD: c.slli64 a6 +0x02 0x08 + +# GOOD: c.slli a6, 1 +0x06 0x08 + +# GOOD: c.slli a6, 2 +0x0A 0x08 + +# GOOD: c.slli a6, 3 +0x0E 0x08 + +# GOOD: c.slli a6, 4 +0x12 0x08 + +# GOOD: c.slli a6, 5 +0x16 0x08 + +# GOOD: c.slli a6, 6 +0x1A 0x08 + +# GOOD: c.slli a6, 7 +0x1E 0x08 + +# GOOD: c.slli a6, 8 +0x22 0x08 + +# GOOD: c.slli a6, 9 +0x26 0x08 + +# GOOD: c.slli a6, 10 +0x2A 0x08 + +# GOOD: c.slli a6, 11 +0x2E 0x08 + +# GOOD: c.slli a6, 12 +0x32 0x08 + +# GOOD: c.slli a6, 13 +0x36 0x08 + +# GOOD: c.slli a6, 14 +0x3A 0x08 + +# GOOD: c.slli a6, 15 +0x3E 0x08 + +# GOOD: c.slli a6, 16 +0x42 0x08 + +# GOOD: c.slli a6, 17 +0x46 0x08 + +# GOOD: c.slli a6, 18 +0x4A 0x08 + +# GOOD: c.slli a6, 19 +0x4E 0x08 + +# GOOD: c.slli a6, 20 +0x52 0x08 + +# GOOD: c.slli a6, 21 +0x56 0x08 + +# GOOD: c.slli a6, 22 +0x5A 0x08 + +# GOOD: c.slli a6, 23 +0x5E 0x08 + +# GOOD: c.slli a6, 24 +0x62 0x08 + +# GOOD: c.slli a6, 25 +0x66 0x08 + +# GOOD: c.slli a6, 26 +0x6A 0x08 + +# GOOD: c.slli a6, 27 +0x6E 0x08 + +# GOOD: c.slli a6, 28 +0x72 0x08 + +# GOOD: c.slli a6, 29 +0x76 0x08 + +# GOOD: c.slli a6, 30 +0x7A 0x08 + +# GOOD: c.slli a6, 31 +0x7E 0x08 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 32 +0x02 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 33 +0x06 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 34 +0x0A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 35 +0x0E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 36 +0x12 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 37 +0x16 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 38 +0x1A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 39 +0x1E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 40 +0x22 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 41 +0x26 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 42 +0x2A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 43 +0x2E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 44 +0x32 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 45 +0x36 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 46 +0x3A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 47 +0x3E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 48 +0x42 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 49 +0x46 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 50 +0x4A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 51 +0x4E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 52 +0x52 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 53 +0x56 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 54 +0x5A 0x18 + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli a6, 55 +0x5E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 56 +0x62 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 57 +0x66 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 58 +0x6A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 59 +0x6E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 60 +0x72 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 61 +0x76 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 62 +0x7A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a6, 63 +0x7E 0x18 + +# GOOD: c.slli64 a7 +0x82 0x08 + +# GOOD: c.slli a7, 1 +0x86 0x08 + +# GOOD: c.slli a7, 2 +0x8A 0x08 + +# GOOD: c.slli a7, 3 +0x8E 0x08 + +# GOOD: c.slli a7, 4 +0x92 0x08 + +# GOOD: c.slli a7, 5 +0x96 0x08 + +# GOOD: c.slli a7, 6 +0x9A 0x08 + +# GOOD: c.slli a7, 7 +0x9E 0x08 + +# GOOD: c.slli a7, 8 +0xA2 0x08 + +# GOOD: c.slli a7, 9 +0xA6 0x08 + +# GOOD: c.slli a7, 10 +0xAA 0x08 + +# GOOD: c.slli a7, 11 +0xAE 0x08 + +# GOOD: c.slli a7, 12 +0xB2 0x08 + +# GOOD: c.slli a7, 13 +0xB6 0x08 + +# GOOD: c.slli a7, 14 +0xBA 0x08 + +# GOOD: c.slli a7, 15 +0xBE 0x08 + +# GOOD: c.slli a7, 16 +0xC2 0x08 + +# GOOD: c.slli a7, 17 +0xC6 0x08 + +# GOOD: c.slli a7, 18 +0xCA 0x08 + +# GOOD: c.slli a7, 19 +0xCE 0x08 + +# GOOD: c.slli a7, 20 +0xD2 0x08 + +# GOOD: c.slli a7, 21 +0xD6 0x08 + +# GOOD: c.slli a7, 22 +0xDA 0x08 + +# GOOD: c.slli a7, 23 +0xDE 0x08 + +# GOOD: c.slli a7, 24 +0xE2 0x08 + +# GOOD: c.slli a7, 25 +0xE6 0x08 + +# GOOD: c.slli a7, 26 +0xEA 0x08 + +# GOOD: c.slli a7, 27 +0xEE 0x08 + +# GOOD: c.slli a7, 28 +0xF2 0x08 + +# GOOD: c.slli a7, 29 +0xF6 0x08 + +# GOOD: c.slli a7, 30 +0xFA 0x08 + +# GOOD: c.slli a7, 31 +0xFE 0x08 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 32 +0x82 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 33 +0x86 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 34 +0x8A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 35 +0x8E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 36 +0x92 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 37 +0x96 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 38 +0x9A 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 39 +0x9E 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 40 +0xA2 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 41 +0xA6 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 42 +0xAA 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 43 +0xAE 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 44 +0xB2 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 45 +0xB6 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 46 +0xBA 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 47 +0xBE 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 48 +0xC2 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 49 +0xC6 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 50 +0xCA 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 51 +0xCE 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 52 +0xD2 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 53 +0xD6 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli 
a7, 54 +0xDA 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 55 +0xDE 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 56 +0xE2 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 57 +0xE6 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 58 +0xEA 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 59 +0xEE 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 60 +0xF2 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 61 +0xF6 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 62 +0xFA 0x18 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli a7, 63 +0xFE 0x18 + +# GOOD: c.slli64 s2 +0x02 0x09 + +# GOOD: c.slli s2, 1 +0x06 0x09 + +# GOOD: c.slli s2, 2 +0x0A 0x09 + +# GOOD: c.slli s2, 3 +0x0E 0x09 + +# GOOD: c.slli s2, 4 +0x12 0x09 + +# GOOD: c.slli s2, 5 +0x16 0x09 + +# GOOD: c.slli s2, 6 +0x1A 0x09 + +# GOOD: c.slli s2, 7 +0x1E 0x09 + +# GOOD: c.slli s2, 8 +0x22 0x09 + +# GOOD: c.slli s2, 9 +0x26 0x09 + +# GOOD: c.slli s2, 10 +0x2A 0x09 + +# GOOD: c.slli s2, 11 +0x2E 0x09 + +# GOOD: c.slli s2, 12 +0x32 0x09 + +# GOOD: c.slli s2, 13 +0x36 0x09 + +# GOOD: c.slli s2, 14 +0x3A 0x09 + +# GOOD: c.slli s2, 15 +0x3E 0x09 + +# GOOD: c.slli s2, 16 +0x42 0x09 + +# GOOD: c.slli s2, 17 +0x46 0x09 + +# GOOD: c.slli s2, 18 +0x4A 0x09 + +# GOOD: c.slli s2, 19 +0x4E 0x09 + +# GOOD: c.slli s2, 20 +0x52 0x09 + +# GOOD: c.slli s2, 21 +0x56 0x09 + +# GOOD: c.slli s2, 22 +0x5A 0x09 + +# GOOD: c.slli s2, 23 +0x5E 0x09 + +# GOOD: c.slli s2, 24 +0x62 0x09 + +# GOOD: c.slli s2, 25 +0x66 0x09 + +# GOOD: c.slli s2, 26 +0x6A 0x09 + +# GOOD: c.slli s2, 27 +0x6E 0x09 + +# GOOD: c.slli s2, 28 +0x72 0x09 + +# GOOD: c.slli s2, 29 +0x76 0x09 + +# GOOD: c.slli s2, 30 +0x7A 0x09 + +# GOOD: c.slli s2, 31 +0x7E 0x09 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 32 +0x02 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 33 +0x06 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 34 +0x0A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 35 +0x0E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 36 +0x12 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 37 +0x16 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 38 +0x1A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 39 +0x1E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 40 +0x22 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 41 +0x26 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 42 +0x2A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 43 +0x2E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 44 +0x32 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 45 +0x36 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 46 +0x3A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 47 +0x3E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 48 +0x42 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 49 +0x46 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 50 +0x4A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 51 +0x4E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 52 +0x52 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 53 +0x56 0x19 + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 54 +0x5A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 55 +0x5E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 56 +0x62 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 57 +0x66 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 58 +0x6A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 59 +0x6E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 60 +0x72 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 61 +0x76 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 62 +0x7A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s2, 63 +0x7E 0x19 + +# GOOD: c.slli64 s3 +0x82 0x09 + +# GOOD: c.slli s3, 1 +0x86 0x09 + +# GOOD: c.slli s3, 2 +0x8A 0x09 + +# GOOD: c.slli s3, 3 +0x8E 0x09 + +# GOOD: c.slli s3, 4 +0x92 0x09 + +# GOOD: c.slli s3, 5 +0x96 0x09 + +# GOOD: c.slli s3, 6 +0x9A 0x09 + +# GOOD: c.slli s3, 7 +0x9E 0x09 + +# GOOD: c.slli s3, 8 +0xA2 0x09 + +# GOOD: c.slli s3, 9 +0xA6 0x09 + +# GOOD: c.slli s3, 10 +0xAA 0x09 + +# GOOD: c.slli s3, 11 +0xAE 0x09 + +# GOOD: c.slli s3, 12 +0xB2 0x09 + +# GOOD: c.slli s3, 13 +0xB6 0x09 + +# GOOD: c.slli s3, 14 +0xBA 0x09 + +# GOOD: c.slli s3, 15 +0xBE 0x09 + +# GOOD: c.slli s3, 16 +0xC2 0x09 + +# GOOD: c.slli s3, 17 +0xC6 0x09 + +# GOOD: c.slli s3, 18 +0xCA 0x09 + +# GOOD: c.slli s3, 19 +0xCE 0x09 + +# GOOD: c.slli s3, 20 +0xD2 0x09 + +# GOOD: c.slli s3, 21 +0xD6 0x09 + +# GOOD: c.slli s3, 22 +0xDA 0x09 + +# GOOD: c.slli s3, 23 +0xDE 0x09 + +# GOOD: c.slli s3, 24 +0xE2 0x09 + +# GOOD: c.slli s3, 25 +0xE6 0x09 + +# GOOD: c.slli s3, 26 +0xEA 0x09 + +# GOOD: c.slli s3, 27 +0xEE 0x09 + +# GOOD: c.slli s3, 28 +0xF2 0x09 + +# GOOD: c.slli s3, 29 +0xF6 0x09 + +# GOOD: c.slli s3, 30 +0xFA 0x09 + +# GOOD: c.slli s3, 31 +0xFE 0x09 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 32 +0x82 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 33 +0x86 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 34 +0x8A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 35 +0x8E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 36 +0x92 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 37 +0x96 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 38 +0x9A 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 39 +0x9E 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 40 +0xA2 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 41 +0xA6 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 42 +0xAA 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 43 +0xAE 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 44 +0xB2 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 45 +0xB6 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 46 +0xBA 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 47 +0xBE 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 48 +0xC2 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 49 +0xC6 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 50 +0xCA 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 51 +0xCE 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 52 +0xD2 0x19 + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli s3, 53 +0xD6 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 54 +0xDA 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 55 +0xDE 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 56 +0xE2 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 57 +0xE6 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 58 +0xEA 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 59 +0xEE 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 60 +0xF2 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 61 +0xF6 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 62 +0xFA 0x19 + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s3, 63 +0xFE 0x19 + +# GOOD: c.slli64 s4 +0x02 0x0A + +# GOOD: c.slli s4, 1 +0x06 0x0A + +# GOOD: c.slli s4, 2 +0x0A 0x0A + +# GOOD: c.slli s4, 3 +0x0E 0x0A + +# GOOD: c.slli s4, 4 +0x12 0x0A + +# GOOD: c.slli s4, 5 +0x16 0x0A + +# GOOD: c.slli s4, 6 +0x1A 0x0A + +# GOOD: c.slli s4, 7 +0x1E 0x0A + +# GOOD: c.slli s4, 8 +0x22 0x0A + +# GOOD: c.slli s4, 9 +0x26 0x0A + +# GOOD: c.slli s4, 10 +0x2A 0x0A + +# GOOD: c.slli s4, 11 +0x2E 0x0A + +# GOOD: c.slli s4, 12 +0x32 0x0A + +# GOOD: c.slli s4, 13 +0x36 0x0A + +# GOOD: c.slli s4, 14 +0x3A 0x0A + +# GOOD: c.slli s4, 15 +0x3E 0x0A + +# GOOD: c.slli s4, 16 +0x42 0x0A + +# GOOD: c.slli s4, 17 +0x46 0x0A + +# GOOD: c.slli s4, 18 +0x4A 0x0A + +# GOOD: c.slli s4, 19 +0x4E 0x0A + +# GOOD: c.slli s4, 20 +0x52 0x0A + +# GOOD: c.slli s4, 21 +0x56 0x0A + +# GOOD: c.slli s4, 22 +0x5A 0x0A + +# GOOD: c.slli s4, 23 +0x5E 0x0A + +# GOOD: c.slli s4, 24 +0x62 0x0A + +# GOOD: c.slli s4, 25 +0x66 0x0A + +# GOOD: c.slli s4, 26 +0x6A 0x0A + +# GOOD: c.slli s4, 27 +0x6E 0x0A + +# GOOD: c.slli s4, 28 +0x72 0x0A + +# GOOD: c.slli s4, 29 +0x76 0x0A + +# GOOD: c.slli s4, 30 +0x7A 0x0A + +# GOOD: c.slli s4, 31 +0x7E 0x0A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 32 +0x02 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 33 +0x06 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 34 +0x0A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 35 +0x0E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 36 +0x12 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 37 +0x16 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 38 +0x1A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 39 +0x1E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 40 +0x22 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 41 +0x26 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 42 +0x2A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 43 +0x2E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 44 +0x32 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 45 +0x36 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 46 +0x3A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 47 +0x3E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 48 +0x42 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 49 +0x46 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 50 +0x4A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 51 +0x4E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli 
s4, 52 +0x52 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 53 +0x56 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 54 +0x5A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 55 +0x5E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 56 +0x62 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 57 +0x66 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 58 +0x6A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 59 +0x6E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 60 +0x72 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 61 +0x76 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 62 +0x7A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s4, 63 +0x7E 0x1A + +# GOOD: c.slli64 s5 +0x82 0x0A + +# GOOD: c.slli s5, 1 +0x86 0x0A + +# GOOD: c.slli s5, 2 +0x8A 0x0A + +# GOOD: c.slli s5, 3 +0x8E 0x0A + +# GOOD: c.slli s5, 4 +0x92 0x0A + +# GOOD: c.slli s5, 5 +0x96 0x0A + +# GOOD: c.slli s5, 6 +0x9A 0x0A + +# GOOD: c.slli s5, 7 +0x9E 0x0A + +# GOOD: c.slli s5, 8 +0xA2 0x0A + +# GOOD: c.slli s5, 9 +0xA6 0x0A + +# GOOD: c.slli s5, 10 +0xAA 0x0A + +# GOOD: c.slli s5, 11 +0xAE 0x0A + +# GOOD: c.slli s5, 12 +0xB2 0x0A + +# GOOD: c.slli s5, 13 +0xB6 0x0A + +# GOOD: c.slli s5, 14 +0xBA 0x0A + +# GOOD: c.slli s5, 15 +0xBE 0x0A + +# GOOD: c.slli s5, 16 +0xC2 0x0A + +# GOOD: c.slli s5, 17 +0xC6 0x0A + +# GOOD: c.slli s5, 18 +0xCA 0x0A + +# GOOD: c.slli s5, 19 +0xCE 0x0A + +# GOOD: c.slli s5, 20 +0xD2 0x0A + +# GOOD: c.slli s5, 21 +0xD6 0x0A + +# GOOD: c.slli s5, 22 +0xDA 0x0A + +# GOOD: c.slli s5, 23 +0xDE 0x0A + +# GOOD: c.slli s5, 24 +0xE2 0x0A + +# GOOD: c.slli s5, 25 +0xE6 0x0A + +# GOOD: c.slli s5, 26 +0xEA 0x0A + +# GOOD: c.slli s5, 27 +0xEE 0x0A + +# GOOD: c.slli s5, 28 +0xF2 0x0A + +# GOOD: c.slli s5, 29 +0xF6 0x0A + +# GOOD: c.slli s5, 30 +0xFA 0x0A + +# GOOD: c.slli s5, 31 +0xFE 0x0A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 32 +0x82 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 33 +0x86 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 34 +0x8A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 35 +0x8E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 36 +0x92 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 37 +0x96 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 38 +0x9A 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 39 +0x9E 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 40 +0xA2 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 41 +0xA6 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 42 +0xAA 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 43 +0xAE 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 44 +0xB2 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 45 +0xB6 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 46 +0xBA 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 47 +0xBE 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 48 +0xC2 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 49 +0xC6 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 50 +0xCA 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 51 +0xCE 0x1A + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 52 +0xD2 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 53 +0xD6 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 54 +0xDA 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 55 +0xDE 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 56 +0xE2 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 57 +0xE6 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 58 +0xEA 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 59 +0xEE 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 60 +0xF2 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 61 +0xF6 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 62 +0xFA 0x1A + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s5, 63 +0xFE 0x1A + +# GOOD: c.slli64 s6 +0x02 0x0B + +# GOOD: c.slli s6, 1 +0x06 0x0B + +# GOOD: c.slli s6, 2 +0x0A 0x0B + +# GOOD: c.slli s6, 3 +0x0E 0x0B + +# GOOD: c.slli s6, 4 +0x12 0x0B + +# GOOD: c.slli s6, 5 +0x16 0x0B + +# GOOD: c.slli s6, 6 +0x1A 0x0B + +# GOOD: c.slli s6, 7 +0x1E 0x0B + +# GOOD: c.slli s6, 8 +0x22 0x0B + +# GOOD: c.slli s6, 9 +0x26 0x0B + +# GOOD: c.slli s6, 10 +0x2A 0x0B + +# GOOD: c.slli s6, 11 +0x2E 0x0B + +# GOOD: c.slli s6, 12 +0x32 0x0B + +# GOOD: c.slli s6, 13 +0x36 0x0B + +# GOOD: c.slli s6, 14 +0x3A 0x0B + +# GOOD: c.slli s6, 15 +0x3E 0x0B + +# GOOD: c.slli s6, 16 +0x42 0x0B + +# GOOD: c.slli s6, 17 +0x46 0x0B + +# GOOD: c.slli s6, 18 +0x4A 0x0B + +# GOOD: c.slli s6, 19 +0x4E 0x0B + +# GOOD: c.slli s6, 20 +0x52 0x0B + +# GOOD: c.slli s6, 21 +0x56 0x0B + +# GOOD: c.slli s6, 22 +0x5A 0x0B + +# GOOD: c.slli s6, 23 +0x5E 0x0B + +# GOOD: c.slli s6, 24 +0x62 0x0B + +# GOOD: c.slli s6, 25 +0x66 0x0B + +# GOOD: c.slli s6, 26 +0x6A 0x0B + +# GOOD: c.slli s6, 27 +0x6E 0x0B + +# GOOD: c.slli s6, 28 +0x72 0x0B + +# GOOD: c.slli s6, 29 +0x76 0x0B + +# GOOD: c.slli s6, 30 +0x7A 0x0B + +# GOOD: c.slli s6, 31 +0x7E 0x0B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 32 +0x02 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 33 +0x06 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 34 +0x0A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 35 +0x0E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 36 +0x12 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 37 +0x16 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 38 +0x1A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 39 +0x1E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 40 +0x22 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 41 +0x26 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 42 +0x2A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 43 +0x2E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 44 +0x32 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 45 +0x36 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 46 +0x3A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 47 +0x3E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 48 +0x42 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 49 +0x46 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 50 +0x4A 0x1B + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli s6, 51 +0x4E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 52 +0x52 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 53 +0x56 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 54 +0x5A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 55 +0x5E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 56 +0x62 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 57 +0x66 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 58 +0x6A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 59 +0x6E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 60 +0x72 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 61 +0x76 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 62 +0x7A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s6, 63 +0x7E 0x1B + +# GOOD: c.slli64 s7 +0x82 0x0B + +# GOOD: c.slli s7, 1 +0x86 0x0B + +# GOOD: c.slli s7, 2 +0x8A 0x0B + +# GOOD: c.slli s7, 3 +0x8E 0x0B + +# GOOD: c.slli s7, 4 +0x92 0x0B + +# GOOD: c.slli s7, 5 +0x96 0x0B + +# GOOD: c.slli s7, 6 +0x9A 0x0B + +# GOOD: c.slli s7, 7 +0x9E 0x0B + +# GOOD: c.slli s7, 8 +0xA2 0x0B + +# GOOD: c.slli s7, 9 +0xA6 0x0B + +# GOOD: c.slli s7, 10 +0xAA 0x0B + +# GOOD: c.slli s7, 11 +0xAE 0x0B + +# GOOD: c.slli s7, 12 +0xB2 0x0B + +# GOOD: c.slli s7, 13 +0xB6 0x0B + +# GOOD: c.slli s7, 14 +0xBA 0x0B + +# GOOD: c.slli s7, 15 +0xBE 0x0B + +# GOOD: c.slli s7, 16 +0xC2 0x0B + +# GOOD: c.slli s7, 17 +0xC6 0x0B + +# GOOD: c.slli s7, 18 +0xCA 0x0B + +# GOOD: c.slli s7, 19 +0xCE 0x0B + +# GOOD: c.slli s7, 20 +0xD2 0x0B + +# GOOD: c.slli s7, 21 +0xD6 0x0B + +# GOOD: c.slli s7, 22 +0xDA 0x0B + +# GOOD: c.slli s7, 23 +0xDE 0x0B + +# GOOD: c.slli s7, 24 +0xE2 0x0B + +# GOOD: c.slli s7, 25 +0xE6 0x0B + +# GOOD: c.slli s7, 26 +0xEA 0x0B + +# GOOD: c.slli s7, 27 +0xEE 0x0B + +# GOOD: c.slli s7, 28 +0xF2 0x0B + +# GOOD: c.slli s7, 29 +0xF6 0x0B + +# GOOD: c.slli s7, 30 +0xFA 0x0B + +# GOOD: c.slli s7, 31 +0xFE 0x0B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 32 +0x82 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 33 +0x86 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 34 +0x8A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 35 +0x8E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 36 +0x92 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 37 +0x96 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 38 +0x9A 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 39 +0x9E 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 40 +0xA2 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 41 +0xA6 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 42 +0xAA 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 43 +0xAE 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 44 +0xB2 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 45 +0xB6 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 46 +0xBA 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 47 +0xBE 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 48 +0xC2 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 49 +0xC6 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli 
s7, 50 +0xCA 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 51 +0xCE 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 52 +0xD2 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 53 +0xD6 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 54 +0xDA 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 55 +0xDE 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 56 +0xE2 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 57 +0xE6 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 58 +0xEA 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 59 +0xEE 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 60 +0xF2 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 61 +0xF6 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 62 +0xFA 0x1B + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s7, 63 +0xFE 0x1B + +# GOOD: c.slli64 s8 +0x02 0x0C + +# GOOD: c.slli s8, 1 +0x06 0x0C + +# GOOD: c.slli s8, 2 +0x0A 0x0C + +# GOOD: c.slli s8, 3 +0x0E 0x0C + +# GOOD: c.slli s8, 4 +0x12 0x0C + +# GOOD: c.slli s8, 5 +0x16 0x0C + +# GOOD: c.slli s8, 6 +0x1A 0x0C + +# GOOD: c.slli s8, 7 +0x1E 0x0C + +# GOOD: c.slli s8, 8 +0x22 0x0C + +# GOOD: c.slli s8, 9 +0x26 0x0C + +# GOOD: c.slli s8, 10 +0x2A 0x0C + +# GOOD: c.slli s8, 11 +0x2E 0x0C + +# GOOD: c.slli s8, 12 +0x32 0x0C + +# GOOD: c.slli s8, 13 +0x36 0x0C + +# GOOD: c.slli s8, 14 +0x3A 0x0C + +# GOOD: c.slli s8, 15 +0x3E 0x0C + +# GOOD: c.slli s8, 16 +0x42 0x0C + +# GOOD: c.slli s8, 17 +0x46 0x0C + +# GOOD: c.slli s8, 18 +0x4A 0x0C + +# GOOD: c.slli s8, 19 +0x4E 0x0C + +# GOOD: c.slli s8, 20 +0x52 0x0C + +# GOOD: c.slli s8, 21 +0x56 0x0C + +# GOOD: c.slli s8, 22 +0x5A 0x0C + +# GOOD: c.slli s8, 23 +0x5E 0x0C + +# GOOD: c.slli s8, 24 +0x62 0x0C + +# GOOD: c.slli s8, 25 +0x66 0x0C + +# GOOD: c.slli s8, 26 +0x6A 0x0C + +# GOOD: c.slli s8, 27 +0x6E 0x0C + +# GOOD: c.slli s8, 28 +0x72 0x0C + +# GOOD: c.slli s8, 29 +0x76 0x0C + +# GOOD: c.slli s8, 30 +0x7A 0x0C + +# GOOD: c.slli s8, 31 +0x7E 0x0C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 32 +0x02 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 33 +0x06 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 34 +0x0A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 35 +0x0E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 36 +0x12 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 37 +0x16 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 38 +0x1A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 39 +0x1E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 40 +0x22 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 41 +0x26 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 42 +0x2A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 43 +0x2E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 44 +0x32 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 45 +0x36 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 46 +0x3A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 47 +0x3E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 48 +0x42 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 49 +0x46 0x1C + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 50 +0x4A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 51 +0x4E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 52 +0x52 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 53 +0x56 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 54 +0x5A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 55 +0x5E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 56 +0x62 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 57 +0x66 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 58 +0x6A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 59 +0x6E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 60 +0x72 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 61 +0x76 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 62 +0x7A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s8, 63 +0x7E 0x1C + +# GOOD: c.slli64 s9 +0x82 0x0C + +# GOOD: c.slli s9, 1 +0x86 0x0C + +# GOOD: c.slli s9, 2 +0x8A 0x0C + +# GOOD: c.slli s9, 3 +0x8E 0x0C + +# GOOD: c.slli s9, 4 +0x92 0x0C + +# GOOD: c.slli s9, 5 +0x96 0x0C + +# GOOD: c.slli s9, 6 +0x9A 0x0C + +# GOOD: c.slli s9, 7 +0x9E 0x0C + +# GOOD: c.slli s9, 8 +0xA2 0x0C + +# GOOD: c.slli s9, 9 +0xA6 0x0C + +# GOOD: c.slli s9, 10 +0xAA 0x0C + +# GOOD: c.slli s9, 11 +0xAE 0x0C + +# GOOD: c.slli s9, 12 +0xB2 0x0C + +# GOOD: c.slli s9, 13 +0xB6 0x0C + +# GOOD: c.slli s9, 14 +0xBA 0x0C + +# GOOD: c.slli s9, 15 +0xBE 0x0C + +# GOOD: c.slli s9, 16 +0xC2 0x0C + +# GOOD: c.slli s9, 17 +0xC6 0x0C + +# GOOD: c.slli s9, 18 +0xCA 0x0C + +# GOOD: c.slli s9, 19 +0xCE 0x0C + +# GOOD: c.slli s9, 20 +0xD2 0x0C + +# GOOD: c.slli s9, 21 +0xD6 0x0C + +# GOOD: c.slli s9, 22 +0xDA 0x0C + +# GOOD: c.slli s9, 23 +0xDE 0x0C + +# GOOD: c.slli s9, 24 +0xE2 0x0C + +# GOOD: c.slli s9, 25 +0xE6 0x0C + +# GOOD: c.slli s9, 26 +0xEA 0x0C + +# GOOD: c.slli s9, 27 +0xEE 0x0C + +# GOOD: c.slli s9, 28 +0xF2 0x0C + +# GOOD: c.slli s9, 29 +0xF6 0x0C + +# GOOD: c.slli s9, 30 +0xFA 0x0C + +# GOOD: c.slli s9, 31 +0xFE 0x0C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 32 +0x82 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 33 +0x86 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 34 +0x8A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 35 +0x8E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 36 +0x92 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 37 +0x96 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 38 +0x9A 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 39 +0x9E 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 40 +0xA2 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 41 +0xA6 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 42 +0xAA 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 43 +0xAE 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 44 +0xB2 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 45 +0xB6 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 46 +0xBA 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 47 +0xBE 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 48 +0xC2 0x1C + +# BAD32: invalid instruction 
encoding +# GOOD64: c.slli s9, 49 +0xC6 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 50 +0xCA 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 51 +0xCE 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 52 +0xD2 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 53 +0xD6 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 54 +0xDA 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 55 +0xDE 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 56 +0xE2 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 57 +0xE6 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 58 +0xEA 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 59 +0xEE 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 60 +0xF2 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 61 +0xF6 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 62 +0xFA 0x1C + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s9, 63 +0xFE 0x1C + +# GOOD: c.slli64 s10 +0x02 0x0D + +# GOOD: c.slli s10, 1 +0x06 0x0D + +# GOOD: c.slli s10, 2 +0x0A 0x0D + +# GOOD: c.slli s10, 3 +0x0E 0x0D + +# GOOD: c.slli s10, 4 +0x12 0x0D + +# GOOD: c.slli s10, 5 +0x16 0x0D + +# GOOD: c.slli s10, 6 +0x1A 0x0D + +# GOOD: c.slli s10, 7 +0x1E 0x0D + +# GOOD: c.slli s10, 8 +0x22 0x0D + +# GOOD: c.slli s10, 9 +0x26 0x0D + +# GOOD: c.slli s10, 10 +0x2A 0x0D + +# GOOD: c.slli s10, 11 +0x2E 0x0D + +# GOOD: c.slli s10, 12 +0x32 0x0D + +# GOOD: c.slli s10, 13 +0x36 0x0D + +# GOOD: c.slli s10, 14 +0x3A 0x0D + +# GOOD: c.slli s10, 15 +0x3E 0x0D + +# GOOD: c.slli s10, 16 +0x42 0x0D + +# GOOD: c.slli s10, 17 +0x46 0x0D + +# GOOD: c.slli s10, 18 +0x4A 0x0D + +# GOOD: c.slli s10, 19 +0x4E 0x0D + +# GOOD: c.slli s10, 20 +0x52 0x0D + +# GOOD: c.slli s10, 21 +0x56 0x0D + +# GOOD: c.slli s10, 22 +0x5A 0x0D + +# GOOD: c.slli s10, 23 +0x5E 0x0D + +# GOOD: c.slli s10, 24 +0x62 0x0D + +# GOOD: c.slli s10, 25 +0x66 0x0D + +# GOOD: c.slli s10, 26 +0x6A 0x0D + +# GOOD: c.slli s10, 27 +0x6E 0x0D + +# GOOD: c.slli s10, 28 +0x72 0x0D + +# GOOD: c.slli s10, 29 +0x76 0x0D + +# GOOD: c.slli s10, 30 +0x7A 0x0D + +# GOOD: c.slli s10, 31 +0x7E 0x0D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 32 +0x02 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 33 +0x06 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 34 +0x0A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 35 +0x0E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 36 +0x12 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 37 +0x16 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 38 +0x1A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 39 +0x1E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 40 +0x22 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 41 +0x26 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 42 +0x2A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 43 +0x2E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 44 +0x32 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 45 +0x36 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 46 +0x3A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 47 +0x3E 0x1D + +# 
BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 48 +0x42 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 49 +0x46 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 50 +0x4A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 51 +0x4E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 52 +0x52 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 53 +0x56 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 54 +0x5A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 55 +0x5E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 56 +0x62 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 57 +0x66 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 58 +0x6A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 59 +0x6E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 60 +0x72 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 61 +0x76 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 62 +0x7A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s10, 63 +0x7E 0x1D + +# GOOD: c.slli64 s11 +0x82 0x0D + +# GOOD: c.slli s11, 1 +0x86 0x0D + +# GOOD: c.slli s11, 2 +0x8A 0x0D + +# GOOD: c.slli s11, 3 +0x8E 0x0D + +# GOOD: c.slli s11, 4 +0x92 0x0D + +# GOOD: c.slli s11, 5 +0x96 0x0D + +# GOOD: c.slli s11, 6 +0x9A 0x0D + +# GOOD: c.slli s11, 7 +0x9E 0x0D + +# GOOD: c.slli s11, 8 +0xA2 0x0D + +# GOOD: c.slli s11, 9 +0xA6 0x0D + +# GOOD: c.slli s11, 10 +0xAA 0x0D + +# GOOD: c.slli s11, 11 +0xAE 0x0D + +# GOOD: c.slli s11, 12 +0xB2 0x0D + +# GOOD: c.slli s11, 13 +0xB6 0x0D + +# GOOD: c.slli s11, 14 +0xBA 0x0D + +# GOOD: c.slli s11, 15 +0xBE 0x0D + +# GOOD: c.slli s11, 16 +0xC2 0x0D + +# GOOD: c.slli s11, 17 +0xC6 0x0D + +# GOOD: c.slli s11, 18 +0xCA 0x0D + +# GOOD: c.slli s11, 19 +0xCE 0x0D + +# GOOD: c.slli s11, 20 +0xD2 0x0D + +# GOOD: c.slli s11, 21 +0xD6 0x0D + +# GOOD: c.slli s11, 22 +0xDA 0x0D + +# GOOD: c.slli s11, 23 +0xDE 0x0D + +# GOOD: c.slli s11, 24 +0xE2 0x0D + +# GOOD: c.slli s11, 25 +0xE6 0x0D + +# GOOD: c.slli s11, 26 +0xEA 0x0D + +# GOOD: c.slli s11, 27 +0xEE 0x0D + +# GOOD: c.slli s11, 28 +0xF2 0x0D + +# GOOD: c.slli s11, 29 +0xF6 0x0D + +# GOOD: c.slli s11, 30 +0xFA 0x0D + +# GOOD: c.slli s11, 31 +0xFE 0x0D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 32 +0x82 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 33 +0x86 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 34 +0x8A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 35 +0x8E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 36 +0x92 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 37 +0x96 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 38 +0x9A 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 39 +0x9E 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 40 +0xA2 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 41 +0xA6 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 42 +0xAA 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 43 +0xAE 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 44 +0xB2 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 45 +0xB6 0x1D + +# BAD32: invalid instruction encoding +# 
GOOD64: c.slli s11, 46 +0xBA 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 47 +0xBE 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 48 +0xC2 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 49 +0xC6 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 50 +0xCA 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 51 +0xCE 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 52 +0xD2 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 53 +0xD6 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 54 +0xDA 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 55 +0xDE 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 56 +0xE2 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 57 +0xE6 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 58 +0xEA 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 59 +0xEE 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 60 +0xF2 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 61 +0xF6 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 62 +0xFA 0x1D + +# BAD32: invalid instruction encoding +# GOOD64: c.slli s11, 63 +0xFE 0x1D + +# GOOD: c.slli64 t3 +0x02 0x0E + +# GOOD: c.slli t3, 1 +0x06 0x0E + +# GOOD: c.slli t3, 2 +0x0A 0x0E + +# GOOD: c.slli t3, 3 +0x0E 0x0E + +# GOOD: c.slli t3, 4 +0x12 0x0E + +# GOOD: c.slli t3, 5 +0x16 0x0E + +# GOOD: c.slli t3, 6 +0x1A 0x0E + +# GOOD: c.slli t3, 7 +0x1E 0x0E + +# GOOD: c.slli t3, 8 +0x22 0x0E + +# GOOD: c.slli t3, 9 +0x26 0x0E + +# GOOD: c.slli t3, 10 +0x2A 0x0E + +# GOOD: c.slli t3, 11 +0x2E 0x0E + +# GOOD: c.slli t3, 12 +0x32 0x0E + +# GOOD: c.slli t3, 13 +0x36 0x0E + +# GOOD: c.slli t3, 14 +0x3A 0x0E + +# GOOD: c.slli t3, 15 +0x3E 0x0E + +# GOOD: c.slli t3, 16 +0x42 0x0E + +# GOOD: c.slli t3, 17 +0x46 0x0E + +# GOOD: c.slli t3, 18 +0x4A 0x0E + +# GOOD: c.slli t3, 19 +0x4E 0x0E + +# GOOD: c.slli t3, 20 +0x52 0x0E + +# GOOD: c.slli t3, 21 +0x56 0x0E + +# GOOD: c.slli t3, 22 +0x5A 0x0E + +# GOOD: c.slli t3, 23 +0x5E 0x0E + +# GOOD: c.slli t3, 24 +0x62 0x0E + +# GOOD: c.slli t3, 25 +0x66 0x0E + +# GOOD: c.slli t3, 26 +0x6A 0x0E + +# GOOD: c.slli t3, 27 +0x6E 0x0E + +# GOOD: c.slli t3, 28 +0x72 0x0E + +# GOOD: c.slli t3, 29 +0x76 0x0E + +# GOOD: c.slli t3, 30 +0x7A 0x0E + +# GOOD: c.slli t3, 31 +0x7E 0x0E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 32 +0x02 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 33 +0x06 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 34 +0x0A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 35 +0x0E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 36 +0x12 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 37 +0x16 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 38 +0x1A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 39 +0x1E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 40 +0x22 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 41 +0x26 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 42 +0x2A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 43 +0x2E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 44 +0x32 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: 
c.slli t3, 45 +0x36 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 46 +0x3A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 47 +0x3E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 48 +0x42 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 49 +0x46 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 50 +0x4A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 51 +0x4E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 52 +0x52 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 53 +0x56 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 54 +0x5A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 55 +0x5E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 56 +0x62 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 57 +0x66 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 58 +0x6A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 59 +0x6E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 60 +0x72 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 61 +0x76 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 62 +0x7A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t3, 63 +0x7E 0x1E + +# GOOD: c.slli64 t4 +0x82 0x0E + +# GOOD: c.slli t4, 1 +0x86 0x0E + +# GOOD: c.slli t4, 2 +0x8A 0x0E + +# GOOD: c.slli t4, 3 +0x8E 0x0E + +# GOOD: c.slli t4, 4 +0x92 0x0E + +# GOOD: c.slli t4, 5 +0x96 0x0E + +# GOOD: c.slli t4, 6 +0x9A 0x0E + +# GOOD: c.slli t4, 7 +0x9E 0x0E + +# GOOD: c.slli t4, 8 +0xA2 0x0E + +# GOOD: c.slli t4, 9 +0xA6 0x0E + +# GOOD: c.slli t4, 10 +0xAA 0x0E + +# GOOD: c.slli t4, 11 +0xAE 0x0E + +# GOOD: c.slli t4, 12 +0xB2 0x0E + +# GOOD: c.slli t4, 13 +0xB6 0x0E + +# GOOD: c.slli t4, 14 +0xBA 0x0E + +# GOOD: c.slli t4, 15 +0xBE 0x0E + +# GOOD: c.slli t4, 16 +0xC2 0x0E + +# GOOD: c.slli t4, 17 +0xC6 0x0E + +# GOOD: c.slli t4, 18 +0xCA 0x0E + +# GOOD: c.slli t4, 19 +0xCE 0x0E + +# GOOD: c.slli t4, 20 +0xD2 0x0E + +# GOOD: c.slli t4, 21 +0xD6 0x0E + +# GOOD: c.slli t4, 22 +0xDA 0x0E + +# GOOD: c.slli t4, 23 +0xDE 0x0E + +# GOOD: c.slli t4, 24 +0xE2 0x0E + +# GOOD: c.slli t4, 25 +0xE6 0x0E + +# GOOD: c.slli t4, 26 +0xEA 0x0E + +# GOOD: c.slli t4, 27 +0xEE 0x0E + +# GOOD: c.slli t4, 28 +0xF2 0x0E + +# GOOD: c.slli t4, 29 +0xF6 0x0E + +# GOOD: c.slli t4, 30 +0xFA 0x0E + +# GOOD: c.slli t4, 31 +0xFE 0x0E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 32 +0x82 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 33 +0x86 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 34 +0x8A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 35 +0x8E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 36 +0x92 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 37 +0x96 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 38 +0x9A 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 39 +0x9E 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 40 +0xA2 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 41 +0xA6 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 42 +0xAA 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 43 +0xAE 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 44 +0xB2 0x1E + 
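+# Note: the vectors above and below follow the CI instruction format used by
+# c.slli: funct3=000 in bits 15-13, shamt[5] in bit 12, the destination
+# register number rd in bits 11-7, shamt[4:0] in bits 6-2, and the C2
+# quadrant opcode 0b10 in bits 1-0, i.e.
+#   encoding = ((shamt & 0x20) << 7) | (rd << 7) | ((shamt & 0x1f) << 2) | 0x2
+# Shift amounts 32-63 set bit 12 and are only valid on RV64, hence the paired
+# BAD32/GOOD64 checks; shamt=0 is reserved as a hint and disassembles as
+# c.slli64.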
+# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 45 +0xB6 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 46 +0xBA 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 47 +0xBE 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 48 +0xC2 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 49 +0xC6 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 50 +0xCA 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 51 +0xCE 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 52 +0xD2 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 53 +0xD6 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 54 +0xDA 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 55 +0xDE 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 56 +0xE2 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 57 +0xE6 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 58 +0xEA 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 59 +0xEE 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 60 +0xF2 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 61 +0xF6 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 62 +0xFA 0x1E + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t4, 63 +0xFE 0x1E + +# GOOD: c.slli64 t5 +0x02 0x0F + +# GOOD: c.slli t5, 1 +0x06 0x0F + +# GOOD: c.slli t5, 2 +0x0A 0x0F + +# GOOD: c.slli t5, 3 +0x0E 0x0F + +# GOOD: c.slli t5, 4 +0x12 0x0F + +# GOOD: c.slli t5, 5 +0x16 0x0F + +# GOOD: c.slli t5, 6 +0x1A 0x0F + +# GOOD: c.slli t5, 7 +0x1E 0x0F + +# GOOD: c.slli t5, 8 +0x22 0x0F + +# GOOD: c.slli t5, 9 +0x26 0x0F + +# GOOD: c.slli t5, 10 +0x2A 0x0F + +# GOOD: c.slli t5, 11 +0x2E 0x0F + +# GOOD: c.slli t5, 12 +0x32 0x0F + +# GOOD: c.slli t5, 13 +0x36 0x0F + +# GOOD: c.slli t5, 14 +0x3A 0x0F + +# GOOD: c.slli t5, 15 +0x3E 0x0F + +# GOOD: c.slli t5, 16 +0x42 0x0F + +# GOOD: c.slli t5, 17 +0x46 0x0F + +# GOOD: c.slli t5, 18 +0x4A 0x0F + +# GOOD: c.slli t5, 19 +0x4E 0x0F + +# GOOD: c.slli t5, 20 +0x52 0x0F + +# GOOD: c.slli t5, 21 +0x56 0x0F + +# GOOD: c.slli t5, 22 +0x5A 0x0F + +# GOOD: c.slli t5, 23 +0x5E 0x0F + +# GOOD: c.slli t5, 24 +0x62 0x0F + +# GOOD: c.slli t5, 25 +0x66 0x0F + +# GOOD: c.slli t5, 26 +0x6A 0x0F + +# GOOD: c.slli t5, 27 +0x6E 0x0F + +# GOOD: c.slli t5, 28 +0x72 0x0F + +# GOOD: c.slli t5, 29 +0x76 0x0F + +# GOOD: c.slli t5, 30 +0x7A 0x0F + +# GOOD: c.slli t5, 31 +0x7E 0x0F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 32 +0x02 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 33 +0x06 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 34 +0x0A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 35 +0x0E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 36 +0x12 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 37 +0x16 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 38 +0x1A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 39 +0x1E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 40 +0x22 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 41 +0x26 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 42 +0x2A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 43 +0x2E 0x1F + +# BAD32: invalid 
instruction encoding +# GOOD64: c.slli t5, 44 +0x32 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 45 +0x36 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 46 +0x3A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 47 +0x3E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 48 +0x42 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 49 +0x46 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 50 +0x4A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 51 +0x4E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 52 +0x52 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 53 +0x56 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 54 +0x5A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 55 +0x5E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 56 +0x62 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 57 +0x66 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 58 +0x6A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 59 +0x6E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 60 +0x72 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 61 +0x76 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 62 +0x7A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t5, 63 +0x7E 0x1F + +# GOOD: c.slli64 t6 +0x82 0x0F + +# GOOD: c.slli t6, 1 +0x86 0x0F + +# GOOD: c.slli t6, 2 +0x8A 0x0F + +# GOOD: c.slli t6, 3 +0x8E 0x0F + +# GOOD: c.slli t6, 4 +0x92 0x0F + +# GOOD: c.slli t6, 5 +0x96 0x0F + +# GOOD: c.slli t6, 6 +0x9A 0x0F + +# GOOD: c.slli t6, 7 +0x9E 0x0F + +# GOOD: c.slli t6, 8 +0xA2 0x0F + +# GOOD: c.slli t6, 9 +0xA6 0x0F + +# GOOD: c.slli t6, 10 +0xAA 0x0F + +# GOOD: c.slli t6, 11 +0xAE 0x0F + +# GOOD: c.slli t6, 12 +0xB2 0x0F + +# GOOD: c.slli t6, 13 +0xB6 0x0F + +# GOOD: c.slli t6, 14 +0xBA 0x0F + +# GOOD: c.slli t6, 15 +0xBE 0x0F + +# GOOD: c.slli t6, 16 +0xC2 0x0F + +# GOOD: c.slli t6, 17 +0xC6 0x0F + +# GOOD: c.slli t6, 18 +0xCA 0x0F + +# GOOD: c.slli t6, 19 +0xCE 0x0F + +# GOOD: c.slli t6, 20 +0xD2 0x0F + +# GOOD: c.slli t6, 21 +0xD6 0x0F + +# GOOD: c.slli t6, 22 +0xDA 0x0F + +# GOOD: c.slli t6, 23 +0xDE 0x0F + +# GOOD: c.slli t6, 24 +0xE2 0x0F + +# GOOD: c.slli t6, 25 +0xE6 0x0F + +# GOOD: c.slli t6, 26 +0xEA 0x0F + +# GOOD: c.slli t6, 27 +0xEE 0x0F + +# GOOD: c.slli t6, 28 +0xF2 0x0F + +# GOOD: c.slli t6, 29 +0xF6 0x0F + +# GOOD: c.slli t6, 30 +0xFA 0x0F + +# GOOD: c.slli t6, 31 +0xFE 0x0F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 32 +0x82 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 33 +0x86 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 34 +0x8A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 35 +0x8E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 36 +0x92 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 37 +0x96 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 38 +0x9A 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 39 +0x9E 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 40 +0xA2 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 41 +0xA6 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 42 +0xAA 0x1F + +# BAD32: invalid instruction encoding +# 
GOOD64: c.slli t6, 43 +0xAE 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 44 +0xB2 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 45 +0xB6 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 46 +0xBA 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 47 +0xBE 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 48 +0xC2 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 49 +0xC6 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 50 +0xCA 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 51 +0xCE 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 52 +0xD2 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 53 +0xD6 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 54 +0xDA 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 55 +0xDE 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 56 +0xE2 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 57 +0xE6 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 58 +0xEA 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 59 +0xEE 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 60 +0xF2 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 61 +0xF6 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 62 +0xFA 0x1F + +# BAD32: invalid instruction encoding +# GOOD64: c.slli t6, 63 +0xFE 0x1F From 01889de8e9b16eeed7ed9f6cdc18636ad20a01ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 1 Apr 2025 10:06:45 -0700 Subject: [PATCH 0276/1029] [flang][device] Enable Stop functions on device build (#133803) Update `StopStatement` and `StopStatementText` to be built for the device. --- flang-rt/lib/runtime/CMakeLists.txt | 1 + flang-rt/lib/runtime/stop.cpp | 52 +++++++++++++++++++++++++++-- flang/include/flang/Runtime/stop.h | 7 ++-- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 572b4d54552c1..c5e7bdce5b2fd 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -57,6 +57,7 @@ set(supported_sources pseudo-unit.cpp ragged.cpp stat.cpp + stop.cpp sum.cpp support.cpp terminator.cpp diff --git a/flang-rt/lib/runtime/stop.cpp b/flang-rt/lib/runtime/stop.cpp index 1d70a137377aa..a4ef7104442f4 100644 --- a/flang-rt/lib/runtime/stop.cpp +++ b/flang-rt/lib/runtime/stop.cpp @@ -65,8 +65,33 @@ static void CloseAllExternalUnits(const char *why) { Fortran::runtime::io::ExternalFileUnit::CloseAll(handler); } -[[noreturn]] void RTNAME(StopStatement)( +[[noreturn]] RT_API_ATTRS void RTNAME(StopStatement)( int code, bool isErrorStop, bool quiet) { +#if defined(RT_DEVICE_COMPILATION) + if (Fortran::runtime::executionEnvironment.noStopMessage && code == 0) { + quiet = true; + } + if (!quiet) { + if (isErrorStop) { + std::printf("Fortran ERROR STOP"); + } else { + std::printf("Fortran STOP"); + } + if (code != EXIT_SUCCESS) { + std::printf(": code %d", code); + } + std::printf("\n"); + } +#if defined(__CUDACC__) + // NVCC supports __trap(). + __trap(); +#elif defined(__clang__) + // Clang supports __builtin_trap().
+ __builtin_trap(); +#else +#error "unsupported compiler" +#endif +#else CloseAllExternalUnits("STOP statement"); if (Fortran::runtime::executionEnvironment.noStopMessage && code == 0) { quiet = true; @@ -80,10 +105,32 @@ static void CloseAllExternalUnits(const char *why) { DescribeIEEESignaledExceptions(); } std::exit(code); +#endif } -[[noreturn]] void RTNAME(StopStatementText)( +[[noreturn]] RT_API_ATTRS void RTNAME(StopStatementText)( const char *code, std::size_t length, bool isErrorStop, bool quiet) { +#if defined(RT_DEVICE_COMPILATION) + if (!quiet) { + if (Fortran::runtime::executionEnvironment.noStopMessage && !isErrorStop) { + std::printf("%s\n", code); + } else { + std::printf( + "Fortran %s: %s\n", isErrorStop ? "ERROR STOP" : "STOP", code); + } + } + if (isErrorStop) { +#if defined(__CUDACC__) + // NVCC supports __trap(). + __trap(); +#elif defined(__clang__) + // Clang supports __builtin_trap(). + __builtin_trap(); +#else +#error "unsupported compiler" +#endif + } +#else CloseAllExternalUnits("STOP statement"); if (!quiet) { if (Fortran::runtime::executionEnvironment.noStopMessage && !isErrorStop) { @@ -99,6 +146,7 @@ static void CloseAllExternalUnits(const char *why) { } else { std::exit(EXIT_SUCCESS); } +#endif } static bool StartPause() { diff --git a/flang/include/flang/Runtime/stop.h b/flang/include/flang/Runtime/stop.h index 24ae2cbe01ec6..02bce65765907 100644 --- a/flang/include/flang/Runtime/stop.h +++ b/flang/include/flang/Runtime/stop.h @@ -17,9 +17,10 @@ FORTRAN_EXTERN_C_BEGIN // Program-initiated image stop -NORETURN void RTNAME(StopStatement)(int code DEFAULT_VALUE(EXIT_SUCCESS), - bool isErrorStop DEFAULT_VALUE(false), bool quiet DEFAULT_VALUE(false)); -NORETURN void RTNAME(StopStatementText)(const char *, size_t, +NORETURN RT_API_ATTRS void RTNAME(StopStatement)( + int code DEFAULT_VALUE(EXIT_SUCCESS), bool isErrorStop DEFAULT_VALUE(false), + bool quiet DEFAULT_VALUE(false)); +NORETURN RT_API_ATTRS void RTNAME(StopStatementText)(const char *, size_t, bool isErrorStop DEFAULT_VALUE(false), bool quiet DEFAULT_VALUE(false)); void RTNAME(PauseStatement)(NO_ARGUMENTS); void RTNAME(PauseStatementInt)(int); From 7003f7d23aeca97b0301e605ed8b02436b38f789 Mon Sep 17 00:00:00 2001 From: Arvind Sudarsanam Date: Tue, 1 Apr 2025 13:09:45 -0400 Subject: [PATCH 0277/1029] [clang-sycl-linker] Replace llvm-link with API calls (#133797) This PR has the following changes: (1) replace llvm-link with calls to linkInModule to link device files; (2) add a -print-linked-module option to dump the linked module for testing; (3) add a test to verify that linking works as expected. We will eventually move to using thin LTO for linking device inputs.
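For reference, a minimal sketch of the in-process flow described above (a standalone reduction, not the exact tool code: the helper name linkInputs is hypothetical, error reporting is simplified, and the LinkOnlyNeeded device-library step is elided):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IRReader/IRReader.h"
    #include "llvm/Linker/Linker.h"
    #include "llvm/Support/SourceMgr.h"
    #include <memory>
    #include <string>
    using namespace llvm;

    // Parse each bitcode input into a shared LLVMContext and fold it into
    // one output module; Linker::linkInModule() returns true on error.
    static std::unique_ptr<Module> linkInputs(ArrayRef<std::string> Files,
                                              LLVMContext &Ctx) {
      auto Output = std::make_unique<Module>("sycl-device-link", Ctx);
      Linker L(*Output);
      for (const std::string &F : Files) {
        SMDiagnostic Err;
        std::unique_ptr<Module> M = parseIRFile(F, Err, Ctx);
        if (!M || L.linkInModule(std::move(M)))
          return nullptr;
      }
      return Output;
    }

Device libraries are then folded into the same module with Linker::Flags::LinkOnlyNeeded so that only referenced definitions are pulled in.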
Thanks --------- Signed-off-by: Arvind Sudarsanam --- clang/test/Driver/Inputs/SYCL/bar.ll | 7 + clang/test/Driver/Inputs/SYCL/baz.ll | 15 ++ clang/test/Driver/Inputs/SYCL/foo.ll | 19 ++ clang/test/Driver/Inputs/SYCL/libsycl.ll | 13 ++ clang/test/Driver/clang-sycl-linker-test.cpp | 43 ++--- clang/test/Driver/link-device-code.test | 23 +++ clang/test/Driver/sycl-link-spirv-target.cpp | 4 +- clang/tools/clang-sycl-linker/CMakeLists.txt | 4 + .../clang-sycl-linker/ClangSYCLLinker.cpp | 168 +++++++++--------- clang/tools/clang-sycl-linker/SYCLLinkOpts.td | 16 +- 10 files changed, 201 insertions(+), 111 deletions(-) create mode 100644 clang/test/Driver/Inputs/SYCL/bar.ll create mode 100644 clang/test/Driver/Inputs/SYCL/baz.ll create mode 100644 clang/test/Driver/Inputs/SYCL/foo.ll create mode 100644 clang/test/Driver/Inputs/SYCL/libsycl.ll create mode 100644 clang/test/Driver/link-device-code.test diff --git a/clang/test/Driver/Inputs/SYCL/bar.ll b/clang/test/Driver/Inputs/SYCL/bar.ll new file mode 100644 index 0000000000000..d17221b8dca18 --- /dev/null +++ b/clang/test/Driver/Inputs/SYCL/bar.ll @@ -0,0 +1,7 @@ +target triple = "spirv64" + +define spir_func i32 @bar_func1(i32 %a, i32 %b) { +entry: + %res = add nsw i32 %b, %a + ret i32 %res +} diff --git a/clang/test/Driver/Inputs/SYCL/baz.ll b/clang/test/Driver/Inputs/SYCL/baz.ll new file mode 100644 index 0000000000000..6cdf3735ed77e --- /dev/null +++ b/clang/test/Driver/Inputs/SYCL/baz.ll @@ -0,0 +1,15 @@ +target triple = "spirv64" + +define spir_func i32 @bar_func1(i32 %a, i32 %b) { +entry: + %mul = shl nsw i32 %a, 1 + %res = add nsw i32 %mul, %b + ret i32 %res +} + +define spir_func i32 @baz_func1(i32 %a) { +entry: + %add = add nsw i32 %a, 5 + %res = tail call spir_func i32 @bar_func1(i32 %a, i32 %add) + ret i32 %res +} diff --git a/clang/test/Driver/Inputs/SYCL/foo.ll b/clang/test/Driver/Inputs/SYCL/foo.ll new file mode 100644 index 0000000000000..43aaf1424ee2d --- /dev/null +++ b/clang/test/Driver/Inputs/SYCL/foo.ll @@ -0,0 +1,19 @@ +target triple = "spirv64" + +define spir_func i32 @foo_func1(i32 %a, i32 %b) { +entry: + %call = tail call spir_func i32 @addFive(i32 %b) + %res = tail call spir_func i32 @bar_func1(i32 %a, i32 %call) + ret i32 %res +} + +declare spir_func i32 @bar_func1(i32, i32) + +declare spir_func i32 @addFive(i32) + +define spir_func i32 @foo_func2(i32 %c, i32 %d, i32 %e) { +entry: + %call = tail call spir_func i32 @foo_func1(i32 %c, i32 %d) + %res = mul nsw i32 %call, %e + ret i32 %res +} diff --git a/clang/test/Driver/Inputs/SYCL/libsycl.ll b/clang/test/Driver/Inputs/SYCL/libsycl.ll new file mode 100644 index 0000000000000..fdc4643e97b6a --- /dev/null +++ b/clang/test/Driver/Inputs/SYCL/libsycl.ll @@ -0,0 +1,13 @@ +target triple = "spirv64" + +define spir_func i32 @addFive(i32 %a) { +entry: + %res = add nsw i32 %a, 5 + ret i32 %res +} + +define spir_func i32 @unusedFunc(i32 %a) { +entry: + %res = mul nsw i32 %a, 5 + ret i32 %res +} diff --git a/clang/test/Driver/clang-sycl-linker-test.cpp b/clang/test/Driver/clang-sycl-linker-test.cpp index f358900b4fbd8..729561bd09cd8 100644 --- a/clang/test/Driver/clang-sycl-linker-test.cpp +++ b/clang/test/Driver/clang-sycl-linker-test.cpp @@ -1,48 +1,41 @@ // Tests the clang-sycl-linker tool. // -// Test a simple case without arguments. 
-// RUN: %clangxx -emit-llvm -c %s -o %t_1.bc -// RUN: %clangxx -emit-llvm -c %s -o %t_2.bc -// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SIMPLE -// SIMPLE: "{{.*}}llvm-link{{.*}}" {{.*}}.bc {{.*}}.bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// SIMPLE-NEXT: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv [[FIRSTLLVMLINKOUT]].bc +// Test the dry run of a simple case to link two input files. +// RUN: %clangxx -emit-llvm -c -target spirv64 %s -o %t_1.bc +// RUN: %clangxx -emit-llvm -c -target spirv64 %s -o %t_2.bc +// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t_1.bc %t_2.bc -o a.spv 2>&1 \ +// RUN: | FileCheck %s --check-prefix=SIMPLE-FO +// SIMPLE-FO: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +// SIMPLE-FO-NEXT: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv [[LLVMLINKOUT]].bc // -// Test that llvm-link is not called when only one input is present. -// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SIMPLE-NO-LINK -// SIMPLE-NO-LINK: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv {{.*}}.bc -// -// Test a simple case with device library files specified. +// Test the dry run of a simple case with device library files specified. // RUN: touch %T/lib1.bc // RUN: touch %T/lib2.bc -// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs=lib1.bc,lib2.bc -o a.spv 2>&1 \ +// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs=lib1.bc,lib2.bc -o a.spv 2>&1 \ // RUN: | FileCheck %s --check-prefix=DEVLIBS -// DEVLIBS: "{{.*}}llvm-link{{.*}}" {{.*}}.bc {{.*}}.bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// DEVLIBS-NEXT: "{{.*}}llvm-link{{.*}}" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}lib1.bc {{.*}}lib2.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings -// DEVLIBS-NEXT: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv [[SECONDLLVMLINKOUT]].bc +// DEVLIBS: sycl-device-link: inputs: {{.*}}.bc libfiles: {{.*}}lib1.bc, {{.*}}lib2.bc output: [[LLVMLINKOUT:.*]].bc +// DEVLIBS-NEXT: "{{.*}}llvm-spirv{{.*}}" {{.*}}-o a.spv [[LLVMLINKOUT]].bc // -// Test a simple case with .o (fat object) as input. -// TODO: Remove this test once fat object support is added. -// RUN: %clangxx -c %s -o %t.o -// RUN: not clang-sycl-linker --dry-run -triple spirv64 %t.o -o a.spv 2>&1 \ +// Test a simple case with a random file (not bitcode) as input. +// RUN: touch %t.o +// RUN: not clang-sycl-linker -triple spirv64 %t.o -o a.spv 2>&1 \ // RUN: | FileCheck %s --check-prefix=FILETYPEERROR // FILETYPEERROR: Unsupported file type // // Test to see if device library related errors are emitted. 
-// RUN: not clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs= -o a.spv 2>&1 \ +// RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs= -o a.spv 2>&1 \ // RUN: | FileCheck %s --check-prefix=DEVLIBSERR1 // DEVLIBSERR1: Number of device library files cannot be zero -// RUN: not clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs=lib1.bc,lib2.bc,lib3.bc -o a.spv 2>&1 \ +// RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t_1.bc %t_2.bc --library-path=%T --device-libs=lib1.bc,lib2.bc,lib3.bc -o a.spv 2>&1 \ // RUN: | FileCheck %s --check-prefix=DEVLIBSERR2 // DEVLIBSERR2: '{{.*}}lib3.bc' SYCL device library file is not found // // Test if correct set of llvm-spirv options are emitted for windows environment. -// RUN: clang-sycl-linker --dry-run -triple spirv64 --is-windows-msvc-env %t_1.bc %t_2.bc -o a.spv 2>&1 \ +// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --is-windows-msvc-env %t_1.bc %t_2.bc -o a.spv 2>&1 \ // RUN: | FileCheck %s --check-prefix=LLVMOPTSWIN // LLVMOPTSWIN: -spirv-debug-info-version=ocl-100 -spirv-allow-extra-diexpressions -spirv-allow-unknown-intrinsics=llvm.genx. -spirv-ext= // // Test if correct set of llvm-spirv options are emitted for linux environment. -// RUN: clang-sycl-linker --dry-run -triple spirv64 %t_1.bc %t_2.bc -o a.spv 2>&1 \ +// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t_1.bc %t_2.bc -o a.spv 2>&1 \ // RUN: | FileCheck %s --check-prefix=LLVMOPTSLIN // LLVMOPTSLIN: -spirv-debug-info-version=nonsemantic-shader-200 -spirv-allow-unknown-intrinsics=llvm.genx. -spirv-ext= diff --git a/clang/test/Driver/link-device-code.test b/clang/test/Driver/link-device-code.test new file mode 100644 index 0000000000000..a5f52fdb7d9ae --- /dev/null +++ b/clang/test/Driver/link-device-code.test @@ -0,0 +1,23 @@ +# RUN: llvm-as %S/Inputs/SYCL/foo.ll -o %t.foo.bc +# RUN: llvm-as %S/Inputs/SYCL/bar.ll -o %t.bar.bc +# RUN: llvm-as %S/Inputs/SYCL/baz.ll -o %t.baz.bc +# RUN: llvm-as %S/Inputs/SYCL/libsycl.ll -o %t.libsycl.bc +# RUN: clang-sycl-linker %t.foo.bc %t.bar.bc -triple=spirv64 --dry-run -o a.spv --print-linked-module 2>&1 | FileCheck %s --check-prefix=CHECK-SIMPLE + +# RUN: not clang-sycl-linker %t.bar.bc %t.baz.bc -triple=spirv64 --dry-run -o a.spv --print-linked-module 2>&1 | FileCheck %s --check-prefix=CHECK-MULTIPLE-DEFS + +# RUN: clang-sycl-linker %t.foo.bc %t.bar.bc -device-libs=%t.libsycl.bc -library-path= -triple=spirv64 --dry-run -o a.spv --print-linked-module 2>&1 | FileCheck %s --check-prefix=CHECK-DEVICE-LIB + +; CHECK-SIMPLE: define {{.*}}foo_func1{{.*}} +; CHECK-SIMPLE: define {{.*}}foo_func2{{.*}} +; CHECK-SIMPLE: define {{.*}}bar_func1{{.*}} +; CHECK-SIMPLE-NOT: define {{.*}}addFive{{.*}} +; CHECK-SIMPLE-NOT: define {{.*}}unusedFunc{{.*}} + +; CHECK-MULTIPLE-DEFS: error: Linking globals named {{.*}}bar_func1{{.*}} symbol multiply defined! 
+ +; CHECK-DEVICE-LIB: define {{.*}}foo_func1{{.*}} +; CHECK-DEVICE-LIB: define {{.*}}foo_func2{{.*}} +; CHECK-DEVICE-LIB: define {{.*}}bar_func1{{.*}} +; CHECK-DEVICE-LIB: define {{.*}}addFive{{.*}} +; CHECK-DEVICE-LIB-NOT: define {{.*}}unusedFunc{{.*}} diff --git a/clang/test/Driver/sycl-link-spirv-target.cpp b/clang/test/Driver/sycl-link-spirv-target.cpp index 85566c67ea92b..7585ef8b14a59 100644 --- a/clang/test/Driver/sycl-link-spirv-target.cpp +++ b/clang/test/Driver/sycl-link-spirv-target.cpp @@ -4,6 +4,6 @@ // Test that -Xlinker options are being passed to clang-sycl-linker. // RUN: touch %t.bc // RUN: %clangxx -### --target=spirv64 --sycl-link -Xlinker --llvm-spirv-path=/tmp \ -// RUN: -Xlinker --library-path=/tmp -Xlinker --device-libs=lib1.bc,lib2.bc %t.bc 2>&1 \ +// RUN: -Xlinker -triple=spirv64 -Xlinker --library-path=/tmp -Xlinker --device-libs=lib1.bc,lib2.bc %t.bc 2>&1 \ // RUN: | FileCheck %s -check-prefix=XLINKEROPTS -// XLINKEROPTS: "{{.*}}clang-sycl-linker{{.*}}" "--llvm-spirv-path=/tmp" "--library-path=/tmp" "--device-libs=lib1.bc,lib2.bc" "{{.*}}.bc" "-o" "a.out" +// XLINKEROPTS: "{{.*}}clang-sycl-linker{{.*}}" "--llvm-spirv-path=/tmp" "-triple=spirv64" "--library-path=/tmp" "--device-libs=lib1.bc,lib2.bc" "{{.*}}.bc" "-o" "a.out" diff --git a/clang/tools/clang-sycl-linker/CMakeLists.txt b/clang/tools/clang-sycl-linker/CMakeLists.txt index 5665ad7d7186e..382c0ca441940 100644 --- a/clang/tools/clang-sycl-linker/CMakeLists.txt +++ b/clang/tools/clang-sycl-linker/CMakeLists.txt @@ -1,6 +1,10 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} BinaryFormat + BitWriter + Core + IRReader + Linker Option Object TargetParser diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp index 2bcb3757d49d0..8dd0394e9610e 100644 --- a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp +++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp @@ -21,8 +21,10 @@ #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/CodeGen/CommandFlags.h" #include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IRReader/IRReader.h" #include "llvm/LTO/LTO.h" +#include "llvm/Linker/Linker.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" #include "llvm/Object/Binary.h" @@ -180,7 +182,7 @@ Error executeCommands(StringRef ExecutablePath, ArrayRef Args) { } Expected> getInput(const ArgList &Args) { - // Collect all input bitcode files to be passed to llvm-link. + // Collect all input bitcode files to be passed to the device linking stage. SmallVector BitcodeFiles; for (const opt::Arg *Arg : Args.filtered(OPT_INPUT)) { std::optional Filename = std::string(Arg->getValue()); @@ -191,7 +193,7 @@ Expected> getInput(const ArgList &Args) { if (auto EC = identify_magic(*Filename, Magic)) return createStringError("Failed to open file " + *Filename); // TODO: Current use case involves LLVM IR bitcode files as input. - // This will be extended to support objects and SPIR-V IR files. + // This will be extended to support SPIR-V IR files. if (Magic != file_magic::bitcode) return createStringError("Unsupported file type"); BitcodeFiles.push_back(*Filename); @@ -199,52 +201,28 @@ Expected> getInput(const ArgList &Args) { return BitcodeFiles; } -/// Link all SYCL device input files into one before adding device library -/// files. Device linking is performed using llvm-link tool. -/// 'InputFiles' is the list of all LLVM IR device input files. 
-/// 'Args' encompasses all arguments required for linking device code and will -/// be parsed to generate options required to be passed into llvm-link. -Expected linkDeviceInputFiles(ArrayRef InputFiles, - const ArgList &Args) { - llvm::TimeTraceScope TimeScope("SYCL LinkDeviceInputFiles"); - - assert(InputFiles.size() && "No inputs to llvm-link"); - // Early check to see if there is only one input. - if (InputFiles.size() < 2) - return InputFiles[0]; - - Expected LLVMLinkPath = - findProgram(Args, "llvm-link", {getMainExecutable("llvm-link")}); - if (!LLVMLinkPath) - return LLVMLinkPath.takeError(); - - SmallVector CmdArgs; - CmdArgs.push_back(*LLVMLinkPath); - for (auto &File : InputFiles) - CmdArgs.push_back(File); - // Create a new file to write the linked device file to. - auto OutFileOrErr = - createTempFile(Args, sys::path::filename(OutputFile), "bc"); - if (!OutFileOrErr) - return OutFileOrErr.takeError(); - CmdArgs.push_back("-o"); - CmdArgs.push_back(*OutFileOrErr); - CmdArgs.push_back("--suppress-warnings"); - if (Error Err = executeCommands(*LLVMLinkPath, CmdArgs)) - return std::move(Err); - return Args.MakeArgString(*OutFileOrErr); +/// Handle cases where the input file is an LLVM IR bitcode file. +/// When clang-sycl-linker is called via the clang-linker-wrapper tool, input +/// files are LLVM IR bitcode files. +// TODO: Support SPIR-V IR files. +Expected> getBitcodeModule(StringRef File, + LLVMContext &C) { + SMDiagnostic Err; + + auto M = getLazyIRFileModule(File, Err, C); + if (M) + return M; + return createStringError(Err.getMessage()); } -// This utility function is used to gather all SYCL device library files that -// will be linked with input device files. -// The list of files and its location are passed from driver. +/// Gather all SYCL device library files that will be linked with input device +/// files. +/// The list of files and their locations are passed from the driver. Expected> getSYCLDeviceLibs(const ArgList &Args) { SmallVector DeviceLibFiles; StringRef LibraryPath; if (Arg *A = Args.getLastArg(OPT_library_path_EQ)) LibraryPath = A->getValue(); - if (LibraryPath.empty()) - return DeviceLibFiles; if (Arg *A = Args.getLastArg(OPT_device_libs_EQ)) { if (A->getValues().size() == 0) return createStringError( @@ -264,44 +242,75 @@ Expected> getSYCLDeviceLibs(const ArgList &Args) { return DeviceLibFiles; } -/// Link all device library files and input file into one LLVM IR file. This -/// linking is performed using llvm-link tool. -/// 'InputFiles' is the list of all LLVM IR device input files. -/// 'Args' encompasses all arguments required for linking device code and will -/// be parsed to generate options required to be passed into llvm-link tool. -static Expected linkDeviceLibFiles(StringRef InputFile, - const ArgList &Args) { - llvm::TimeTraceScope TimeScope("LinkDeviceLibraryFiles"); +/// The following tasks are performed: +/// 1. Link all SYCL device bitcode images into one image. Device linking is +/// performed using the linkInModule API. +/// 2. Gather all SYCL device library bitcode images. +/// 3. Link all the images gathered in Step 2 with the output of Step 1 using +/// the linkInModule API. The LinkOnlyNeeded flag is used. +Expected linkDeviceCode(ArrayRef InputFiles, + const ArgList &Args) { + llvm::TimeTraceScope TimeScope("SYCL link device code"); + + assert(InputFiles.size() && "No inputs to link"); + + LLVMContext C; + auto LinkerOutput = std::make_unique("sycl-device-link", C); + Linker L(*LinkerOutput); + // Link SYCL device input files.
+ for (auto &File : InputFiles) { + auto ModOrErr = getBitcodeModule(File, C); + if (!ModOrErr) + return ModOrErr.takeError(); + if (L.linkInModule(std::move(*ModOrErr))) + return createStringError("Could not link IR"); + } + // Get all SYCL device library files, if any. auto SYCLDeviceLibFiles = getSYCLDeviceLibs(Args); if (!SYCLDeviceLibFiles) return SYCLDeviceLibFiles.takeError(); - if ((*SYCLDeviceLibFiles).empty()) - return InputFile; - Expected LLVMLinkPath = - findProgram(Args, "llvm-link", {getMainExecutable("llvm-link")}); - if (!LLVMLinkPath) - return LLVMLinkPath.takeError(); + // Link in SYCL device library files. + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); + for (auto &File : *SYCLDeviceLibFiles) { + auto LibMod = getBitcodeModule(File, C); + if (!LibMod) + return LibMod.takeError(); + if ((*LibMod)->getTargetTriple() == Triple) { + unsigned Flags = Linker::Flags::LinkOnlyNeeded; + if (L.linkInModule(std::move(*LibMod), Flags)) + return createStringError("Could not link IR"); + } + } + + // Dump linked output for testing. + if (Args.hasArg(OPT_print_linked_module)) + outs() << *LinkerOutput; // Create a new file to write the linked device file to. - auto OutFileOrErr = + auto BitcodeOutput = createTempFile(Args, sys::path::filename(OutputFile), "bc"); - if (!OutFileOrErr) - return OutFileOrErr.takeError(); + if (!BitcodeOutput) + return BitcodeOutput.takeError(); + + // Write the final output into 'BitcodeOutput' file. + int FD = -1; + if (std::error_code EC = sys::fs::openFileForWrite(*BitcodeOutput, FD)) + return errorCodeToError(EC); + llvm::raw_fd_ostream OS(FD, true); + WriteBitcodeToFile(*LinkerOutput, OS); + + if (Verbose) { + std::string Inputs = llvm::join(InputFiles.begin(), InputFiles.end(), ", "); + std::string LibInputs = llvm::join((*SYCLDeviceLibFiles).begin(), + (*SYCLDeviceLibFiles).end(), ", "); + errs() << formatv( + "sycl-device-link: inputs: {0} libfiles: {1} output: {2}\n", Inputs, + LibInputs, *BitcodeOutput); + } - SmallVector CmdArgs; - CmdArgs.push_back(*LLVMLinkPath); - CmdArgs.push_back("-only-needed"); - CmdArgs.push_back(InputFile); - for (auto &File : *SYCLDeviceLibFiles) - CmdArgs.push_back(File); - CmdArgs.push_back("-o"); - CmdArgs.push_back(*OutFileOrErr); - CmdArgs.push_back("--suppress-warnings"); - if (Error Err = executeCommands(*LLVMLinkPath, CmdArgs)) - return std::move(Err); - return *OutFileOrErr; + return *BitcodeOutput; } /// Add any llvm-spirv option that relies on a specific Triple in addition @@ -345,7 +354,7 @@ static void getSPIRVTransOpts(const ArgList &Args, ",+SPV_INTEL_arbitrary_precision_fixed_point" ",+SPV_INTEL_arbitrary_precision_floating_point" ",+SPV_INTEL_variable_length_array,+SPV_INTEL_fp_fast_math_mode" - ",+SPV_INTEL_long_constant_composite" + ",+SPV_INTEL_long_composites" ",+SPV_INTEL_arithmetic_fence" ",+SPV_INTEL_global_variable_decorations" ",+SPV_INTEL_cache_controls" @@ -385,7 +394,7 @@ static Expected runLLVMToSPIRVTranslation(StringRef File, SmallVector CmdArgs; CmdArgs.push_back(*LLVMToSPIRVProg); - const llvm::Triple Triple(Args.getLastArgValue(OPT_triple)); + const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); getSPIRVTransOpts(Args, CmdArgs, Triple); StringRef LLVMToSPIRVOptions; if (Arg *A = Args.getLastArg(OPT_llvm_spirv_options_EQ)) @@ -422,20 +431,19 @@ static Expected runLLVMToSPIRVTranslation(StringRef File, return OutputFile; } +/// Performs the following steps: +/// 1. Link input device code (user code and SYCL device library code). +/// 2. 
Run SPIR-V code generation. Error runSYCLLink(ArrayRef Files, const ArgList &Args) { llvm::TimeTraceScope TimeScope("SYCLDeviceLink"); - // First llvm-link step - auto LinkedFile = linkDeviceInputFiles(Files, Args); + + // Link all input bitcode files and SYCL device library files, if any. + auto LinkedFile = linkDeviceCode(Files, Args); if (!LinkedFile) reportError(LinkedFile.takeError()); - // second llvm-link step - auto DeviceLinkedFile = linkDeviceLibFiles(*LinkedFile, Args); - if (!DeviceLinkedFile) - reportError(DeviceLinkedFile.takeError()); - // LLVM to SPIR-V translation step - auto SPVFile = runLLVMToSPIRVTranslation(*DeviceLinkedFile, Args); + auto SPVFile = runLLVMToSPIRVTranslation(*LinkedFile, Args); if (!SPVFile) return SPVFile.takeError(); return Error::success(); } diff --git a/clang/tools/clang-sycl-linker/SYCLLinkOpts.td b/clang/tools/clang-sycl-linker/SYCLLinkOpts.td index 959fd6c3e867c..40c7310076045 100644 --- a/clang/tools/clang-sycl-linker/SYCLLinkOpts.td +++ b/clang/tools/clang-sycl-linker/SYCLLinkOpts.td @@ -24,10 +24,14 @@ def device_libs_EQ : CommaJoined<["--", "-"], "device-libs=">, Flags<[LinkerOnlyOption]>, HelpText<"A comma separated list of device libraries that are linked during the device link.">; -def triple : Joined<["--"], "triple">, - HelpText<"The device target triple">; -def arch : Separate<["--", "-"], "arch">, - HelpText<"Specify the name of the target architecture.">; +def arch_EQ : Joined<["--", "-"], "arch=">, + Flags<[LinkerOnlyOption]>, + MetaVarName<"<arch>">, + HelpText<"The device subarchitecture">; +def triple_EQ : Joined<["--", "-"], "triple=">, + Flags<[LinkerOnlyOption]>, + MetaVarName<"<triple>">, + HelpText<"The device target triple">; def save_temps : Flag<["--", "-"], "save-temps">, Flags<[LinkerOnlyOption]>, HelpText<"Save intermediate results">; @@ -50,3 +54,7 @@ def llvm_spirv_path_EQ : Joined<["--"], "llvm-spirv-path=">, def llvm_spirv_options_EQ : Joined<["--", "-"], "llvm-spirv-options=">, Flags<[LinkerOnlyOption]>, HelpText<"Options that will control llvm-spirv step">; + +def print_linked_module : Flag<["--"], "print-linked-module">, + Flags<[LinkerOnlyOption]>, + HelpText<"Print the linked module's IR for testing">; From 602d05fbe89a5663a07dc51862409778bfe6fec6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 2 Apr 2025 00:11:46 +0700 Subject: [PATCH 0278/1029] llvm-reduce: Make myself maintainer (#133919) --- llvm/Maintainers.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index fbb170cec8737..edba6edb400f5 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -412,6 +412,11 @@ tejohnson@google.com (email), [teresajohnson](https://github.com/teresajohnson) Aiden Grossman \ agrossman154@yahoo.com (email), [boomanaiden154](https://github.com/boomanaiden154) (Github) +#### llvm-reduce + +Matt Arsenault \ +Matthew.Arsenault@amd.com, arsenm2@gmail.com (email), [arsenm](https://github.com/arsenm) (GitHub) + ### Other #### Release management From f14ff59da7f98a405999bcc8481b20446de0d0cd Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 1 Apr 2025 18:15:37 +0100 Subject: [PATCH 0279/1029] [libclc] Move exp, exp2 and expm1 to the CLC library (#133932) These all share the use of a common helper function so are handled in one go. These builtins are also now vectorized.
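For context, all three builtins lean on the same table-driven range reduction (spelled out here as a sketch; it matches the n >> 6 / n & 0x3f split and the two_to_jby64 head/tail tables used by clc_exp_helper.inc, and is not new math):

    n \approx x \cdot \frac{64}{\ln 2}, \qquad
    m = \left\lfloor \frac{n}{64} \right\rfloor, \qquad
    j = n - 64m,

    e^{x} = 2^{m} \cdot 2^{j/64} \cdot e^{r}, \qquad
    r = x - n \cdot \frac{\ln 2}{64},

where 2^{j/64} is read from the split tables and e^{r} is evaluated with a short polynomial. exp and exp2 call the shared helper with their own reductions of n and r, while expm1 inlines a variant of the same scheme.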
--- .../include/clc/math/clc_exp.h} | 14 +- libclc/clc/include/clc/math/clc_exp2.h | 20 +++ libclc/clc/include/clc/math/clc_exp_helper.h | 20 +++ .../clc/include/clc/math/clc_exp_helper.inc | 13 ++ libclc/clc/include/clc/math/clc_expm1.h | 20 +++ libclc/clc/lib/generic/SOURCES | 4 + libclc/clc/lib/generic/math/clc_exp.cl | 19 ++ libclc/clc/lib/generic/math/clc_exp.inc | 76 ++++++++ libclc/clc/lib/generic/math/clc_exp2.cl | 20 +++ libclc/clc/lib/generic/math/clc_exp2.inc | 68 +++++++ libclc/clc/lib/generic/math/clc_exp_helper.cl | 20 +++ .../clc/lib/generic/math/clc_exp_helper.inc | 54 ++++++ libclc/clc/lib/generic/math/clc_expm1.cl | 20 +++ libclc/clc/lib/generic/math/clc_expm1.inc | 169 ++++++++++++++++++ libclc/clspv/lib/SOURCES | 1 - libclc/generic/lib/SOURCES | 1 - libclc/generic/lib/math/exp.cl | 77 +------- libclc/generic/lib/math/exp2.cl | 65 +------ libclc/generic/lib/math/exp_helper.cl | 55 ------ libclc/generic/lib/math/expm1.cl | 151 +--------------- libclc/spirv/lib/SOURCES | 1 - 21 files changed, 545 insertions(+), 343 deletions(-) rename libclc/{generic/lib/math/exp_helper.h => clc/include/clc/math/clc_exp.h} (60%) create mode 100644 libclc/clc/include/clc/math/clc_exp2.h create mode 100644 libclc/clc/include/clc/math/clc_exp_helper.h create mode 100644 libclc/clc/include/clc/math/clc_exp_helper.inc create mode 100644 libclc/clc/include/clc/math/clc_expm1.h create mode 100644 libclc/clc/lib/generic/math/clc_exp.cl create mode 100644 libclc/clc/lib/generic/math/clc_exp.inc create mode 100644 libclc/clc/lib/generic/math/clc_exp2.cl create mode 100644 libclc/clc/lib/generic/math/clc_exp2.inc create mode 100644 libclc/clc/lib/generic/math/clc_exp_helper.cl create mode 100644 libclc/clc/lib/generic/math/clc_exp_helper.inc create mode 100644 libclc/clc/lib/generic/math/clc_expm1.cl create mode 100644 libclc/clc/lib/generic/math/clc_expm1.inc delete mode 100644 libclc/generic/lib/math/exp_helper.cl diff --git a/libclc/generic/lib/math/exp_helper.h b/libclc/clc/include/clc/math/clc_exp.h similarity index 60% rename from libclc/generic/lib/math/exp_helper.h rename to libclc/clc/include/clc/math/clc_exp.h index 84a8febb4bb12..00b5a7f69779a 100644 --- a/libclc/generic/lib/math/exp_helper.h +++ b/libclc/clc/include/clc/math/clc_exp.h @@ -6,9 +6,15 @@ // //===----------------------------------------------------------------------===// -#ifdef cl_khr_fp64 +#ifndef __CLC_MATH_CLC_EXP_H__ +#define __CLC_MATH_CLC_EXP_H__ -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DECL double __clc_exp_helper(double x, double x_min, double x_max, double r, int n); +#define __CLC_BODY +#define __CLC_FUNCTION __clc_exp -#endif +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_EXP_H__ diff --git a/libclc/clc/include/clc/math/clc_exp2.h b/libclc/clc/include/clc/math/clc_exp2.h new file mode 100644 index 0000000000000..20ee54b0a2755 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_exp2.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_EXP2_H__ +#define __CLC_MATH_CLC_EXP2_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_exp2 + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_EXP2_H__ diff --git a/libclc/clc/include/clc/math/clc_exp_helper.h b/libclc/clc/include/clc/math/clc_exp_helper.h new file mode 100644 index 0000000000000..8a2db5de16764 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_exp_helper.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_EXP_HELPER +#define __CLC_MATH_CLC_EXP_HELPER + +#define __DOUBLE_ONLY +#define __CLC_BODY + +#include + +#undef __CLC_BODY +#undef __DOUBLE_ONLY + +#endif // __CLC_MATH_CLC_EXP_HELPER diff --git a/libclc/clc/include/clc/math/clc_exp_helper.inc b/libclc/clc/include/clc/math/clc_exp_helper.inc new file mode 100644 index 0000000000000..cdf650405c815 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_exp_helper.inc @@ -0,0 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE __clc_exp_helper(__CLC_GENTYPE x, + __CLC_GENTYPE x_min, + __CLC_GENTYPE x_max, + __CLC_GENTYPE r, + __CLC_INTN n); diff --git a/libclc/clc/include/clc/math/clc_expm1.h b/libclc/clc/include/clc/math/clc_expm1.h new file mode 100644 index 0000000000000..0359c3916f1c5 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_expm1.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_EXPM1_H__ +#define __CLC_MATH_CLC_EXPM1_H__ + +#define __CLC_BODY +#define __CLC_FUNCTION __clc_expm1 + +#include + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_EXPM1_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index becfa3ff6dbed..8c8932e722693 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -32,7 +32,11 @@ math/clc_ceil.cl math/clc_copysign.cl math/clc_cospi.cl math/clc_ep_log.cl +math/clc_exp.cl math/clc_exp10.cl +math/clc_exp2.cl +math/clc_expm1.cl +math/clc_exp_helper.cl math/clc_fabs.cl math/clc_fma.cl math/clc_fmod.cl diff --git a/libclc/clc/lib/generic/math/clc_exp.cl b/libclc/clc/lib/generic/math/clc_exp.cl new file mode 100644 index 0000000000000..6ff452721881c --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp.cl @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_exp.inc b/libclc/clc/lib/generic/math/clc_exp.inc new file mode 100644 index 0000000000000..5057bf8034e92 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp.inc @@ -0,0 +1,76 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_exp(__CLC_GENTYPE x) { + // Reduce x + const __CLC_GENTYPE ln2HI = 0x1.62e300p-1f; + const __CLC_GENTYPE ln2LO = 0x1.2fefa2p-17f; + const __CLC_GENTYPE invln2 = 0x1.715476p+0f; + + __CLC_GENTYPE fhalF = x < 0.0f ? -0.5f : 0.5f; + __CLC_INTN p = __CLC_CONVERT_INTN(__clc_mad(x, invln2, fhalF)); + __CLC_GENTYPE fp = __CLC_CONVERT_GENTYPE(p); + __CLC_GENTYPE hi = __clc_mad(fp, -ln2HI, x); // t*ln2HI is exact here + __CLC_GENTYPE lo = -fp * ln2LO; + + // Evaluate poly + __CLC_GENTYPE t = hi + lo; + __CLC_GENTYPE tt = t * t; + __CLC_GENTYPE v = __clc_mad( + tt, + -__clc_mad( + tt, + __clc_mad(tt, + __clc_mad(tt, + __clc_mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), + 0x1.1566aap-14f), + -0x1.6c16c2p-9f), + 0x1.555556p-3f), + t); + + __CLC_GENTYPE y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); + + // Scale by 2^p + __CLC_GENTYPE r = __CLC_AS_GENTYPE(__CLC_AS_INTN(y) + (p << 23)); + + // ln(largest_normal) = 88.72283905206835305366 + const __CLC_GENTYPE ulim = 0x1.62e430p+6f; + // ln(smallest_normal) = -87.33654475055310898657 + const __CLC_GENTYPE llim = -0x1.5d589ep+6f; + + r = x < llim ? 0.0f : r; + r = x < ulim ? r : __CLC_AS_GENTYPE((__CLC_UINTN)0x7f800000); + return __clc_isnan(x) ? 
x : r; +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_exp(__CLC_GENTYPE x) { + + const __CLC_GENTYPE X_MIN = -0x1.74910d52d3051p+9; // -1075*ln(2) + const __CLC_GENTYPE X_MAX = 0x1.62e42fefa39efp+9; // 1024*ln(2) + const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.71547652b82fep+6; // 64/ln(2) + const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.62e42fefa0000p-7; // head ln(2)/64 + const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.cf79abc9e3b39p-46; // tail ln(2)/64 + + __CLC_INTN n = __CLC_CONVERT_INTN(x * R_64_BY_LOG2); + __CLC_GENTYPE r = + __clc_fma(-R_LOG2_BY_64_TL, __CLC_CONVERT_GENTYPE(n), + __clc_fma(-R_LOG2_BY_64_LD, __CLC_CONVERT_GENTYPE(n), x)); + return __clc_exp_helper(x, X_MIN, X_MAX, r, n); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_exp(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_exp(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_exp2.cl b/libclc/clc/lib/generic/math/clc_exp2.cl new file mode 100644 index 0000000000000..9635f84e5a9a6 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp2.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_exp2.inc b/libclc/clc/lib/generic/math/clc_exp2.inc new file mode 100644 index 0000000000000..6da361a43ed4c --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp2.inc @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_exp2(__CLC_GENTYPE x) { + // Reduce x + const __CLC_GENTYPE ln2HI = 0x1.62e300p-1f; + const __CLC_GENTYPE ln2LO = 0x1.2fefa2p-17f; + + __CLC_GENTYPE t = __clc_rint(x); + __CLC_INTN p = __CLC_CONVERT_INTN(t); + __CLC_GENTYPE tt = x - t; + __CLC_GENTYPE hi = tt * ln2HI; + __CLC_GENTYPE lo = tt * ln2LO; + + // Evaluate poly + t = hi + lo; + tt = t * t; + __CLC_GENTYPE v = __clc_mad( + tt, + -__clc_mad( + tt, + __clc_mad(tt, + __clc_mad(tt, + __clc_mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), + 0x1.1566aap-14f), + -0x1.6c16c2p-9f), + 0x1.555556p-3f), + t); + + __CLC_GENTYPE y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); + + // Scale by 2^p + __CLC_GENTYPE r = __CLC_AS_FLOATN(__CLC_AS_INTN(y) + (p << 23)); + + const __CLC_GENTYPE ulim = 128.0f; + const __CLC_GENTYPE llim = -126.0f; + + r = x < llim ? 0.0f : r; + r = x < ulim ? r : __CLC_AS_FLOATN((__CLC_UINTN)0x7f800000); + return __clc_isnan(x) ? 
x : r; +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_exp2(__CLC_GENTYPE x) { + const __CLC_GENTYPE R_LN2 = 0x1.62e42fefa39efp-1; // ln(2) + const __CLC_GENTYPE R_1_BY_64 = 1.0 / 64.0; + + __CLC_INTN n = __CLC_CONVERT_INTN(x * 64.0); + __CLC_GENTYPE r = R_LN2 * __clc_fma(-R_1_BY_64, __CLC_CONVERT_GENTYPE(n), x); + + return __clc_exp_helper(x, -1074.0, 1024.0, r, n); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_exp2(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_exp2(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_exp_helper.cl b/libclc/clc/lib/generic/math/clc_exp_helper.cl new file mode 100644 index 0000000000000..92ff8f7fe4e6f --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp_helper.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#define __DOUBLE_ONLY +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/math/clc_exp_helper.inc b/libclc/clc/lib/generic/math/clc_exp_helper.inc new file mode 100644 index 0000000000000..70ced7e9ea485 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_exp_helper.inc @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_exp_helper(__CLC_GENTYPE x, + __CLC_GENTYPE x_min, + __CLC_GENTYPE x_max, + __CLC_GENTYPE r, + __CLC_INTN n) { + + __CLC_INTN j = n & 0x3f; + __CLC_INTN m = n >> 6; + + // 6 term tail of Taylor expansion of e^r + __CLC_GENTYPE z2 = + r * __clc_fma( + r, + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, 0x1.6c16c16c16c17p-10, + 0x1.1111111111111p-7), + 0x1.5555555555555p-5), + 0x1.5555555555555p-3), + 0x1.0000000000000p-1), + 1.0); + + __CLC_GENTYPE tv0 = USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE tv1 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + z2 = __clc_fma(tv0 + tv1, z2, tv1) + tv0; + + __CLC_INTN small_value = + (m < -1022) || ((m == -1022) && __CLC_CONVERT_INTN(z2 < 1.0)); + + __CLC_INTN n1 = m >> 2; + __CLC_INTN n2 = m - n1; + __CLC_GENTYPE z3 = + z2 * __CLC_AS_GENTYPE((__CLC_CONVERT_LONGN(n1) + 1023) << 52); + z3 *= __CLC_AS_GENTYPE((__CLC_CONVERT_LONGN(n2) + 1023) << 52); + + z2 = __clc_ldexp(z2, m); + z2 = __CLC_CONVERT_LONGN(small_value) ? z3 : z2; + + z2 = __clc_isnan(x) ? x : z2; + + z2 = x > x_max ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : z2; + z2 = x < x_min ? 0.0 : z2; + + return z2; +} diff --git a/libclc/clc/lib/generic/math/clc_expm1.cl b/libclc/clc/lib/generic/math/clc_expm1.cl new file mode 100644 index 0000000000000..8695b46eb90ea --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_expm1.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/math/clc_expm1.inc b/libclc/clc/lib/generic/math/clc_expm1.inc new file mode 100644 index 0000000000000..6abee9b3f0cc9 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_expm1.inc @@ -0,0 +1,169 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +/* Refer to the exp routine for the underlying algorithm */ +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { + // 128*log2 : 88.722839111673 + const __CLC_GENTYPE X_MAX = 0x1.62e42ep+6f; + // -149*log2 : -103.27892990343184 + const __CLC_GENTYPE X_MIN = -0x1.9d1da0p+6f; + // 64/log2 : 92.332482616893657 + const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f; + // log2/64 lead: 0.0108032227 + const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f; + // log2/64 tail: 0.0000272020388 + const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f; + + __CLC_UINTN xi = __CLC_AS_UINTN(x); + __CLC_INTN n = __CLC_CONVERT_INTN(x * R_64_BY_LOG2); + __CLC_GENTYPE fn = __CLC_CONVERT_GENTYPE(n); + + __CLC_INTN j = n & 0x3f; + __CLC_INTN m = n >> 6; + + __CLC_GENTYPE r = + __clc_mad(fn, -R_LOG2_BY_64_TL, __clc_mad(fn, -R_LOG2_BY_64_LD, x)); + + // Truncated Taylor series + __CLC_GENTYPE z2 = __clc_mad( + r * r, __clc_mad(r, __clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), 0.5f), + r); + + __CLC_GENTYPE m2 = __CLC_AS_GENTYPE((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); + __CLC_GENTYPE exp_head = USE_TABLE(exp_tbl_ep_head, j); + __CLC_GENTYPE exp_tail = USE_TABLE(exp_tbl_ep_tail, j); + + __CLC_GENTYPE two_to_jby64_h = exp_head * m2; + __CLC_GENTYPE two_to_jby64_t = exp_tail * m2; + __CLC_GENTYPE two_to_jby64 = two_to_jby64_h + two_to_jby64_t; + + z2 = __clc_mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f); + // Make subnormals work + z2 = x == 0.f ? x : z2; + z2 = x < X_MIN || m < -24 ? -1.0f : z2; + z2 = x > X_MAX ? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32) : z2; + z2 = __clc_isnan(x) ? 
x : z2; + + return z2; +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { + const __CLC_GENTYPE max_expm1_arg = 709.8; + const __CLC_GENTYPE min_expm1_arg = -37.42994775023704; + // 0x3FCC8FF7C79A9A22 = log(1+1/4) + const __CLC_GENTYPE log_OnePlus_OneByFour = 0.22314355131420976; + // 0xBFD269621134DB93 = log(1-1/4) + const __CLC_GENTYPE log_OneMinus_OneByFour = -0.28768207245178096; + const __CLC_GENTYPE sixtyfour_by_lnof2 = + 92.33248261689366; // 0x40571547652b82fe + const __CLC_GENTYPE lnof2_by_64_head = + 0.010830424696223417; // 0x3f862e42fefa0000 + const __CLC_GENTYPE lnof2_by_64_tail = + 2.5728046223276688e-14; // 0x3d1cf79abc9e3b39 + + // First, assume log(1-1/4) < x < log(1+1/4) i.e -0.28768 < x < 0.22314 + __CLC_GENTYPE u = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(x) & 0xffffffffff000000UL); + __CLC_GENTYPE v = x - u; + __CLC_GENTYPE y = u * u * 0.5; + __CLC_GENTYPE z = v * (x + u) * 0.5; + + __CLC_GENTYPE q = __clc_fma( + x, + __clc_fma( + x, + __clc_fma( + x, + __clc_fma( + x, + __clc_fma( + x, + __clc_fma(x, + __clc_fma(x, + __clc_fma(x, 2.4360682937111612e-8, + 2.7582184028154370e-7), + 2.7558212415361945e-6), + 2.4801576918453420e-5), + 1.9841269447671544e-4), + 1.3888888890687830e-3), + 8.3333333334012270e-3), + 4.1666666666665560e-2), + 1.6666666666666632e-1); + q *= x * x * x; + + __CLC_GENTYPE z1g = (u + y) + (q + (v + z)); + __CLC_GENTYPE z1 = x + (y + (q + z)); + z1 = y >= 0x1.0p-7 ? z1g : z1; + + // Now assume outside interval around 0 + __CLC_INTN n = __CLC_CONVERT_INTN(x * sixtyfour_by_lnof2); + __CLC_INTN j = n & 0x3f; + __CLC_INTN m = n >> 6; + + __CLC_GENTYPE f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE f = f1 + f2; + + __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(-n); + __CLC_GENTYPE r = + __clc_fma(dn, lnof2_by_64_tail, __clc_fma(dn, lnof2_by_64_head, x)); + + q = __clc_fma(r, + __clc_fma(r, + __clc_fma(r, + __clc_fma(r, 1.38889490863777199667e-03, + 8.33336798434219616221e-03), + 4.16666666662260795726e-02), + 1.66666666665260878863e-01), + 5.00000000000000008883e-01); + q = __clc_fma(r * r, q, r); + + __CLC_GENTYPE twopm = __CLC_AS_GENTYPE(__CLC_CONVERT_LONGN(m + EXPBIAS_DP64) + << EXPSHIFTBITS_DP64); + __CLC_GENTYPE twopmm = __CLC_AS_GENTYPE(__CLC_CONVERT_LONGN(EXPBIAS_DP64 - m) + << EXPSHIFTBITS_DP64); + + // Computations for m > 52, including where result is close to Inf + __CLC_ULONGN uval = __CLC_AS_ULONGN(0x1.0p+1023 * (f1 + (f * q + (f2)))); + __CLC_INTN e = __CLC_CONVERT_INTN(uval >> EXPSHIFTBITS_DP64) + 1; + + __CLC_GENTYPE zme1024 = __CLC_AS_GENTYPE( + (__CLC_CONVERT_ULONGN(e) << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64)); + zme1024 = __CLC_CONVERT_LONGN(e == 2047) + ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) + : zme1024; + + __CLC_GENTYPE zmg52 = twopm * (f1 + __clc_fma(f, q, f2 - twopmm)); + zmg52 = __CLC_CONVERT_LONGN(m == 1024) ? zme1024 : zmg52; + + // For m < 53 + __CLC_GENTYPE zml53 = + twopm * ((f1 - twopmm) + __clc_fma(f1, q, f2 * (1.0 + q))); + + // For m < -7 + __CLC_GENTYPE zmln7 = __clc_fma(twopm, f1 + __clc_fma(f, q, f2), -1.0); + + z = __CLC_CONVERT_LONGN(m < 53) ? zml53 : zmg52; + z = __CLC_CONVERT_LONGN(m < -7) ? zmln7 : z; + z = x > log_OneMinus_OneByFour && x < log_OnePlus_OneByFour ? z1 : z; + z = x > max_expm1_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : z; + z = x < min_expm1_arg ? 
-1.0 : z; + + return z; +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_expm1(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clspv/lib/SOURCES b/libclc/clspv/lib/SOURCES index 64122395438aa..d2fea9d586287 100644 --- a/libclc/clspv/lib/SOURCES +++ b/libclc/clspv/lib/SOURCES @@ -25,7 +25,6 @@ subnormal_config.cl ../../generic/lib/math/exp.cl ../../generic/lib/math/exp10.cl ../../generic/lib/math/exp2.cl -../../generic/lib/math/exp_helper.cl ../../generic/lib/math/expm1.cl ../../generic/lib/math/fdim.cl ../../generic/lib/math/fmod.cl diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index a9dc2304c0d0e..5f473ff8b9424 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -97,7 +97,6 @@ math/cospi.cl math/erf.cl math/erfc.cl math/exp.cl -math/exp_helper.cl math/expm1.cl math/exp2.cl math/exp10.cl diff --git a/libclc/generic/lib/math/exp.cl b/libclc/generic/lib/math/exp.cl index bdf8023f6ec31..cebd630e52ab7 100644 --- a/libclc/generic/lib/math/exp.cl +++ b/libclc/generic/lib/math/exp.cl @@ -7,77 +7,8 @@ //===----------------------------------------------------------------------===// #include -#include -#include +#include -_CLC_OVERLOAD _CLC_DEF float exp(float x) { - - // Reduce x - const float ln2HI = 0x1.62e300p-1f; - const float ln2LO = 0x1.2fefa2p-17f; - const float invln2 = 0x1.715476p+0f; - - float fhalF = x < 0.0f ? -0.5f : 0.5f; - int p = mad(x, invln2, fhalF); - float fp = (float)p; - float hi = mad(fp, -ln2HI, x); // t*ln2HI is exact here - float lo = -fp*ln2LO; - - // Evaluate poly - float t = hi + lo; - float tt = t*t; - float v = mad(tt, - -mad(tt, - mad(tt, - mad(tt, - mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), - 0x1.1566aap-14f), - -0x1.6c16c2p-9f), - 0x1.555556p-3f), - t); - - float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); - - // Scale by 2^p - float r = as_float(as_int(y) + (p << 23)); - - const float ulim = 0x1.62e430p+6f; // ln(largest_normal) = 88.72283905206835305366 - const float llim = -0x1.5d589ep+6f; // ln(smallest_normal) = -87.33654475055310898657 - - r = x < llim ? 0.0f : r; - r = x < ulim ? r : as_float(0x7f800000); - return isnan(x) ? 
x : r; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, exp, float) - -#ifdef cl_khr_fp64 - -#include "exp_helper.h" - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double exp(double x) { - - const double X_MIN = -0x1.74910d52d3051p+9; // -1075*ln(2) - const double X_MAX = 0x1.62e42fefa39efp+9; // 1024*ln(2) - const double R_64_BY_LOG2 = 0x1.71547652b82fep+6; // 64/ln(2) - const double R_LOG2_BY_64_LD = 0x1.62e42fefa0000p-7; // head ln(2)/64 - const double R_LOG2_BY_64_TL = 0x1.cf79abc9e3b39p-46; // tail ln(2)/64 - - int n = convert_int(x * R_64_BY_LOG2); - double r = fma(-R_LOG2_BY_64_TL, (double)n, fma(-R_LOG2_BY_64_LD, (double)n, x)); - return __clc_exp_helper(x, X_MIN, X_MAX, r, n); -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp, double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN_FP16(exp) - -#endif +#define FUNCTION exp +#define __CLC_BODY +#include diff --git a/libclc/generic/lib/math/exp2.cl b/libclc/generic/lib/math/exp2.cl index 611a40d4a8e68..465c39174a734 100644 --- a/libclc/generic/lib/math/exp2.cl +++ b/libclc/generic/lib/math/exp2.cl @@ -7,65 +7,8 @@ //===----------------------------------------------------------------------===// #include -#include -#include +#include -_CLC_OVERLOAD _CLC_DEF float exp2(float x) { - - // Reduce x - const float ln2HI = 0x1.62e300p-1f; - const float ln2LO = 0x1.2fefa2p-17f; - - float t = rint(x); - int p = (int)t; - float tt = x - t; - float hi = tt * ln2HI; - float lo = tt * ln2LO; - - // Evaluate poly - t = hi + lo; - tt = t*t; - float v = mad(tt, - -mad(tt, - mad(tt, - mad(tt, - mad(tt, 0x1.637698p-25f, -0x1.bbd41cp-20f), - 0x1.1566aap-14f), - -0x1.6c16c2p-9f), - 0x1.555556p-3f), - t); - - float y = 1.0f - (((-lo) - MATH_DIVIDE(t * v, 2.0f - v)) - hi); - - // Scale by 2^p - float r = as_float(as_int(y) + (p << 23)); - - const float ulim = 128.0f; - const float llim = -126.0f; - - r = x < llim ? 0.0f : r; - r = x < ulim ? r : as_float(0x7f800000); - return isnan(x) ? x : r; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, exp2, float) - -#ifdef cl_khr_fp64 - -#include "exp_helper.h" - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double exp2(double x) { - const double R_LN2 = 0x1.62e42fefa39efp-1; // ln(2) - const double R_1_BY_64 = 1.0 / 64.0; - - int n = convert_int(x * 64.0); - double r = R_LN2 * fma(-R_1_BY_64, (double)n, x); - return __clc_exp_helper(x, -1074.0, 1024.0, r, n); -} - - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, exp2, double) - -#endif +#define FUNCTION exp2 +#define __CLC_BODY +#include diff --git a/libclc/generic/lib/math/exp_helper.cl b/libclc/generic/lib/math/exp_helper.cl deleted file mode 100644 index b413228719bfb..0000000000000 --- a/libclc/generic/lib/math/exp_helper.cl +++ /dev/null @@ -1,55 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DEF double __clc_exp_helper(double x, double x_min, double x_max, double r, int n) { - - int j = n & 0x3f; - int m = n >> 6; - - // 6 term tail of Taylor expansion of e^r - double z2 = r * fma(r, - fma(r, - fma(r, - fma(r, - fma(r, 0x1.6c16c16c16c17p-10, 0x1.1111111111111p-7), - 0x1.5555555555555p-5), - 0x1.5555555555555p-3), - 0x1.0000000000000p-1), - 1.0); - - double tv0 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - double tv1 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); - z2 = fma(tv0 + tv1, z2, tv1) + tv0; - - int small_value = (m < -1022) || ((m == -1022) && (z2 < 1.0)); - - int n1 = m >> 2; - int n2 = m-n1; - double z3= z2 * as_double(((long)n1 + 1023) << 52); - z3 *= as_double(((long)n2 + 1023) << 52); - - z2 = ldexp(z2, m); - z2 = small_value ? z3: z2; - - z2 = isnan(x) ? x : z2; - - z2 = x > x_max ? as_double(PINFBITPATT_DP64) : z2; - z2 = x < x_min ? 0.0 : z2; - - return z2; -} - -#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/expm1.cl b/libclc/generic/lib/math/expm1.cl index e66020e20fab4..b4eed66d692b4 100644 --- a/libclc/generic/lib/math/expm1.cl +++ b/libclc/generic/lib/math/expm1.cl @@ -7,151 +7,8 @@ //===----------------------------------------------------------------------===// #include -#include -#include -#include +#include -/* Refer to the exp routine for the underlying algorithm */ - -_CLC_OVERLOAD _CLC_DEF float expm1(float x) { - const float X_MAX = 0x1.62e42ep+6f; // 128*log2 : 88.722839111673 - const float X_MIN = -0x1.9d1da0p+6f; // -149*log2 : -103.27892990343184 - - const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 - const float R_LOG2_BY_64_LD = 0x1.620000p-7f; // log2/64 lead: 0.0108032227 - const float R_LOG2_BY_64_TL = 0x1.c85fdep-16f; // log2/64 tail: 0.0000272020388 - - uint xi = as_uint(x); - int n = (int)(x * R_64_BY_LOG2); - float fn = (float)n; - - int j = n & 0x3f; - int m = n >> 6; - - float r = mad(fn, -R_LOG2_BY_64_TL, mad(fn, -R_LOG2_BY_64_LD, x)); - - // Truncated Taylor series - float z2 = mad(r*r, mad(r, mad(r, 0x1.555556p-5f, 0x1.555556p-3f), 0.5f), r); - - float m2 = as_float((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - float exp_head = USE_TABLE(exp_tbl_ep_head, j); - float exp_tail = USE_TABLE(exp_tbl_ep_tail, j); - - float two_to_jby64_h = exp_head * m2; - float two_to_jby64_t = exp_tail * m2; - float two_to_jby64 = two_to_jby64_h + two_to_jby64_t; - - z2 = mad(z2, two_to_jby64, two_to_jby64_t) + (two_to_jby64_h - 1.0f); - //Make subnormals work - z2 = x == 0.f ? x : z2; - z2 = x < X_MIN | m < -24 ? -1.0f : z2; - z2 = x > X_MAX ? as_float(PINFBITPATT_SP32) : z2; - z2 = isnan(x) ? 
x : z2; - - return z2; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, expm1, float) - -#ifdef cl_khr_fp64 - -#include "exp_helper.h" - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double expm1(double x) { - const double max_expm1_arg = 709.8; - const double min_expm1_arg = -37.42994775023704; - const double log_OnePlus_OneByFour = 0.22314355131420976; //0x3FCC8FF7C79A9A22 = log(1+1/4) - const double log_OneMinus_OneByFour = -0.28768207245178096; //0xBFD269621134DB93 = log(1-1/4) - const double sixtyfour_by_lnof2 = 92.33248261689366; //0x40571547652b82fe - const double lnof2_by_64_head = 0.010830424696223417; //0x3f862e42fefa0000 - const double lnof2_by_64_tail = 2.5728046223276688e-14; //0x3d1cf79abc9e3b39 - - // First, assume log(1-1/4) < x < log(1+1/4) i.e -0.28768 < x < 0.22314 - double u = as_double(as_ulong(x) & 0xffffffffff000000UL); - double v = x - u; - double y = u * u * 0.5; - double z = v * (x + u) * 0.5; - - double q = fma(x, - fma(x, - fma(x, - fma(x, - fma(x, - fma(x, - fma(x, - fma(x,2.4360682937111612e-8, 2.7582184028154370e-7), - 2.7558212415361945e-6), - 2.4801576918453420e-5), - 1.9841269447671544e-4), - 1.3888888890687830e-3), - 8.3333333334012270e-3), - 4.1666666666665560e-2), - 1.6666666666666632e-1); - q *= x * x * x; - - double z1g = (u + y) + (q + (v + z)); - double z1 = x + (y + (q + z)); - z1 = y >= 0x1.0p-7 ? z1g : z1; - - // Now assume outside interval around 0 - int n = (int)(x * sixtyfour_by_lnof2); - int j = n & 0x3f; - int m = n >> 6; - - double f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - double f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); - double f = f1 + f2; - - double dn = -n; - double r = fma(dn, lnof2_by_64_tail, fma(dn, lnof2_by_64_head, x)); - - q = fma(r, - fma(r, - fma(r, - fma(r, 1.38889490863777199667e-03, 8.33336798434219616221e-03), - 4.16666666662260795726e-02), - 1.66666666665260878863e-01), - 5.00000000000000008883e-01); - q = fma(r*r, q, r); - - double twopm = as_double((long)(m + EXPBIAS_DP64) << EXPSHIFTBITS_DP64); - double twopmm = as_double((long)(EXPBIAS_DP64 - m) << EXPSHIFTBITS_DP64); - - // Computations for m > 52, including where result is close to Inf - ulong uval = as_ulong(0x1.0p+1023 * (f1 + (f * q + (f2)))); - int e = (int)(uval >> EXPSHIFTBITS_DP64) + 1; - - double zme1024 = as_double(((long)e << EXPSHIFTBITS_DP64) | (uval & MANTBITS_DP64)); - zme1024 = e == 2047 ? as_double(PINFBITPATT_DP64) : zme1024; - - double zmg52 = twopm * (f1 + fma(f, q, f2 - twopmm)); - zmg52 = m == 1024 ? zme1024 : zmg52; - - // For m < 53 - double zml53 = twopm * ((f1 - twopmm) + fma(f1, q, f2*(1.0 + q))); - - // For m < -7 - double zmln7 = fma(twopm, f1 + fma(f, q, f2), -1.0); - - z = m < 53 ? zml53 : zmg52; - z = m < -7 ? zmln7 : z; - z = x > log_OneMinus_OneByFour & x < log_OnePlus_OneByFour ? z1 : z; - z = x > max_expm1_arg ? as_double(PINFBITPATT_DP64) : z; - z = x < min_expm1_arg ? 
-1.0 : z; - - return z; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, expm1, double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN_FP16(expm1) - -#endif +#define FUNCTION expm1 +#define __CLC_BODY +#include diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES index 5c6051398c58f..5446fe13a6d93 100644 --- a/libclc/spirv/lib/SOURCES +++ b/libclc/spirv/lib/SOURCES @@ -35,7 +35,6 @@ subnormal_config.cl ../../generic/lib/math/erf.cl ../../generic/lib/math/erfc.cl ../../generic/lib/math/exp.cl -../../generic/lib/math/exp_helper.cl ../../generic/lib/math/expm1.cl ../../generic/lib/math/exp2.cl ../../generic/lib/math/exp10.cl From 44b87e42066064d74f4a9c69ea02aa9558ca6a1a Mon Sep 17 00:00:00 2001 From: lntue Date: Tue, 1 Apr 2025 13:29:28 -0400 Subject: [PATCH 0280/1029] [libc] Reduce the range of hypotf exhaustive test to be run automatically. (#133944) The current setup of `hypotf` exhaustive tests might take days to finish. --- libc/test/src/math/exhaustive/hypotf_test.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libc/test/src/math/exhaustive/hypotf_test.cpp b/libc/test/src/math/exhaustive/hypotf_test.cpp index f4c63eadb42e9..695a2fbcafa4a 100644 --- a/libc/test/src/math/exhaustive/hypotf_test.cpp +++ b/libc/test/src/math/exhaustive/hypotf_test.cpp @@ -49,11 +49,15 @@ struct HypotfChecker : public virtual LIBC_NAMESPACE::testing::Test { } }; -using LlvmLibcHypotfExhaustiveTest = LlvmLibcExhaustiveMathTest; +using LlvmLibcHypotfExhaustiveTest = + LlvmLibcExhaustiveMathTest; // Range of the first input: [2^23, 2^24]; static constexpr uint32_t START = (23U + 127U) << 23; -static constexpr uint32_t STOP = (24U + 127U) << 23; +// static constexpr uint32_t STOP = (24U + 127U) << 23; +// Use a smaller range for automated tests, since the full range takes too long +// and should only be run manually. +static constexpr uint32_t STOP = ((23U + 127U) << 23) + 1024U; TEST_F(LlvmLibcHypotfExhaustiveTest, PositiveRange) { test_full_range_all_roundings(START, STOP); From 9586117c3ab6f16883a646847cfa65b065ad4ae3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 10:35:50 -0700 Subject: [PATCH 0281/1029] [clang-sycl-linker] Fix a warning This patch fixes: clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp:127:13: error: function 'getMainExecutable' is not needed and will not be emitted [-Werror,-Wunneeded-internal-declaration] --- clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp index 8dd0394e9610e..f4a16549340a1 100644 --- a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp +++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp @@ -124,12 +124,6 @@ const OptTable &getOptTable() { exit(EXIT_FAILURE); } -std::string getMainExecutable(const char *Name) { - void *Ptr = (void *)(intptr_t)&getMainExecutable; - auto COWPath = sys::fs::getMainExecutable(Name, Ptr); - return sys::path::parent_path(COWPath).str(); -} - Expected createTempFile(const ArgList &Args, const Twine &Prefix, StringRef Extension) { SmallString<128> OutputFile; From d8bf0398e5d5a9b9f7f186c0535a069055a03150 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 1 Apr 2025 16:14:52 +0100 Subject: [PATCH 0282/1029] [AArch64] Use tablegen HasOneUse. 
NFC --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 10 ++++------ llvm/lib/Target/AArch64/SVEInstrFormats.td | 16 ++++++---------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3ee71c14c6bd4..a2f326c994c2f 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -392,15 +392,13 @@ def AArch64splice : SDNode<"AArch64ISD::SPLICE", SDT_AArch64Arith>; def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; +let HasOneUse = 1 in def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), - (AArch64mul_p node:$pred, node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; + (AArch64mul_p node:$pred, node:$src1, node:$src2)>; +let HasOneUse = 1 in def AArch64fmul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), - (AArch64fmul_p node:$pred, node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; + (AArch64fmul_p node:$pred, node:$src1, node:$src2)>; def AArch64fabd_p : PatFrags<(ops node:$pg, node:$op1, node:$op2), diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index eafaf1717902e..772c440685072 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -119,10 +119,9 @@ def SDT_AArch64Setcc : SDTypeProfile<1, 4, [ ]>; def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>; +let HasOneUse = 1 in def AArch64setcc_z_oneuse : PatFrag<(ops node:$pg, node:$op1, node:$op2, node:$cc), - (AArch64setcc_z node:$pg, node:$op1, node:$op2, node:$cc), [{ - return N->hasOneUse(); -}]>; + (AArch64setcc_z node:$pg, node:$op1, node:$op2, node:$cc)>; def SVEPatternOperand : AsmOperandClass { let Name = "SVEPattern"; @@ -409,15 +408,12 @@ def sve_ext_imm_0_63 : ComplexPattern">; def sve_ext_imm_0_127 : ComplexPattern">; def sve_ext_imm_0_255 : ComplexPattern">; +let HasOneUse = 1 in def int_aarch64_sve_cntp_oneuse : PatFrag<(ops node:$pred, node:$src2), - (int_aarch64_sve_cntp node:$pred, node:$src2), [{ - return N->hasOneUse(); -}]>; - + (int_aarch64_sve_cntp node:$pred, node:$src2)>; +let HasOneUse = 1 in def step_vector_oneuse : PatFrag<(ops node:$idx), - (step_vector node:$idx), [{ - return N->hasOneUse(); -}]>; + (step_vector node:$idx)>; //===----------------------------------------------------------------------===// From 7d91c4f3eb689c0dc5ae95acbf321fec7e96cca7 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 1 Apr 2025 18:41:21 +0100 Subject: [PATCH 0283/1029] [ARM] Use tablegen HasOneUse. NFC --- llvm/lib/Target/ARM/ARMInstrInfo.td | 15 ++++++--------- llvm/lib/Target/ARM/ARMInstrThumb2.td | 5 ++--- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 2a3a4e91eee4c..d6387ff848593 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -465,19 +465,16 @@ class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; class UnOpFrag : PatFrag<(ops node:$Src), res>; // An 'and' node with a single use. -def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ - return N->hasOneUse(); -}]>; +let HasOneUse = 1 in +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs)>; // An 'xor' node with a single use. 
-def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{ - return N->hasOneUse(); -}]>; +let HasOneUse = 1 in +def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs)>; // An 'fmul' node with a single use. -def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{ - return N->hasOneUse(); -}]>; +let HasOneUse = 1 in +def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs)>; // An 'fadd' node which checks for single non-hazardous use. def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td index 033df9c2fd204..9f80af07df0fc 100644 --- a/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -5735,11 +5735,10 @@ def t2CSINC : CS<"csinc", 0b1001>; def t2CSINV : CS<"csinv", 0b1010>; def t2CSNEG : CS<"csneg", 0b1011>; +let HasOneUse = 1 in def ARMcsinc_su : PatFrag<(ops node:$lhs, node:$rhs, node:$cc, node:$flags), - (ARMcsinc node:$lhs, node:$rhs, node:$cc, node:$flags), [{ - return N->hasOneUse(); -}]>; + (ARMcsinc node:$lhs, node:$rhs, node:$cc, node:$flags)>; let Predicates = [HasV8_1MMainline] in { multiclass CSPats { From a30caa6a73286eabe22983ce5a7cca27a25d0790 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Tue, 1 Apr 2025 10:47:35 -0700 Subject: [PATCH 0284/1029] [WebAssembly] Add missing tests from #133289 (#133938) --- llvm/test/CodeGen/WebAssembly/libcalls64.ll | 352 ++++++++++++++++++++ 1 file changed, 352 insertions(+) create mode 100644 llvm/test/CodeGen/WebAssembly/libcalls64.ll diff --git a/llvm/test/CodeGen/WebAssembly/libcalls64.ll b/llvm/test/CodeGen/WebAssembly/libcalls64.ll new file mode 100644 index 0000000000000..e80a372c1691f --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/libcalls64.ll @@ -0,0 +1,352 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-keep-registers | FileCheck %s + +; Test a subset of compiler-rt/libm libcalls expected to be emitted by the wasm backend + +target triple = "wasm64-unknown-unknown" + +declare fp128 @llvm.sqrt.f128(fp128) +declare fp128 @llvm.floor.f128(fp128) +declare fp128 @llvm.trunc.f128(fp128) +declare fp128 @llvm.nearbyint.f128(fp128) +declare fp128 @llvm.pow.f128(fp128, fp128) +declare fp128 @llvm.powi.f128.i32(fp128, i32) + +declare double @llvm.tan.f64(double) +declare double @llvm.cos.f64(double) +declare double @llvm.log10.f64(double) +declare double @llvm.pow.f64(double, double) +declare double @llvm.powi.f64.i32(double, i32) +declare double @llvm.log.f64(double) +declare double @llvm.exp.f64(double) +declare double @llvm.exp10.f64(double) +declare double @llvm.ldexp.f64.i32(double, i32) +declare {double, i32} @llvm.frexp.f64.i32(double) +declare i32 @llvm.lround(double) +declare {double, double} @llvm.modf.f64(double) + +declare void @escape_value(i32) + +define fp128 @fp128libcalls(fp128 %x, fp128 %y, i32 %z) { + ; compiler-rt call +; CHECK-LABEL: fp128libcalls: +; CHECK: .functype fp128libcalls (i64, i64, i64, i64, i64, i32) -> () +; CHECK-NEXT: .local i64 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get $push18=, __stack_pointer +; CHECK-NEXT: i64.const $push19=, 144 +; CHECK-NEXT: i64.sub $push39=, $pop18, $pop19 +; CHECK-NEXT: local.tee $push38=, 6, $pop39 +; CHECK-NEXT: global.set __stack_pointer, $pop38 +; CHECK-NEXT: local.get $push40=, 6 +; CHECK-NEXT: i64.const $push36=, 128 +; CHECK-NEXT: 
i64.add $push37=, $pop40, $pop36 +; CHECK-NEXT: local.get $push44=, 1 +; CHECK-NEXT: local.get $push43=, 2 +; CHECK-NEXT: local.get $push42=, 3 +; CHECK-NEXT: local.get $push41=, 4 +; CHECK-NEXT: call __addtf3, $pop37, $pop44, $pop43, $pop42, $pop41 +; CHECK-NEXT: local.get $push45=, 6 +; CHECK-NEXT: i64.const $push34=, 112 +; CHECK-NEXT: i64.add $push35=, $pop45, $pop34 +; CHECK-NEXT: local.get $push46=, 6 +; CHECK-NEXT: i64.load $push1=, 128($pop46) +; CHECK-NEXT: local.get $push47=, 6 +; CHECK-NEXT: i64.load $push0=, 136($pop47) +; CHECK-NEXT: local.get $push49=, 3 +; CHECK-NEXT: local.get $push48=, 4 +; CHECK-NEXT: call __multf3, $pop35, $pop1, $pop0, $pop49, $pop48 +; CHECK-NEXT: local.get $push50=, 6 +; CHECK-NEXT: i64.const $push32=, 96 +; CHECK-NEXT: i64.add $push33=, $pop50, $pop32 +; CHECK-NEXT: local.get $push51=, 6 +; CHECK-NEXT: i64.load $push3=, 112($pop51) +; CHECK-NEXT: local.get $push52=, 6 +; CHECK-NEXT: i64.load $push2=, 120($pop52) +; CHECK-NEXT: local.get $push54=, 3 +; CHECK-NEXT: local.get $push53=, 4 +; CHECK-NEXT: call __divtf3, $pop33, $pop3, $pop2, $pop54, $pop53 +; CHECK-NEXT: local.get $push55=, 6 +; CHECK-NEXT: i64.const $push30=, 80 +; CHECK-NEXT: i64.add $push31=, $pop55, $pop30 +; CHECK-NEXT: local.get $push56=, 6 +; CHECK-NEXT: i64.load $push5=, 96($pop56) +; CHECK-NEXT: local.get $push57=, 6 +; CHECK-NEXT: i64.load $push4=, 104($pop57) +; CHECK-NEXT: call sqrtl, $pop31, $pop5, $pop4 +; CHECK-NEXT: local.get $push58=, 6 +; CHECK-NEXT: i64.const $push28=, 64 +; CHECK-NEXT: i64.add $push29=, $pop58, $pop28 +; CHECK-NEXT: local.get $push59=, 6 +; CHECK-NEXT: i64.load $push7=, 80($pop59) +; CHECK-NEXT: local.get $push60=, 6 +; CHECK-NEXT: i64.load $push6=, 88($pop60) +; CHECK-NEXT: call floorl, $pop29, $pop7, $pop6 +; CHECK-NEXT: local.get $push61=, 6 +; CHECK-NEXT: i64.const $push26=, 48 +; CHECK-NEXT: i64.add $push27=, $pop61, $pop26 +; CHECK-NEXT: local.get $push62=, 6 +; CHECK-NEXT: i64.load $push9=, 64($pop62) +; CHECK-NEXT: local.get $push63=, 6 +; CHECK-NEXT: i64.load $push8=, 72($pop63) +; CHECK-NEXT: local.get $push65=, 3 +; CHECK-NEXT: local.get $push64=, 4 +; CHECK-NEXT: call powl, $pop27, $pop9, $pop8, $pop65, $pop64 +; CHECK-NEXT: local.get $push66=, 6 +; CHECK-NEXT: i64.const $push24=, 32 +; CHECK-NEXT: i64.add $push25=, $pop66, $pop24 +; CHECK-NEXT: local.get $push67=, 6 +; CHECK-NEXT: i64.load $push11=, 48($pop67) +; CHECK-NEXT: local.get $push68=, 6 +; CHECK-NEXT: i64.load $push10=, 56($pop68) +; CHECK-NEXT: local.get $push69=, 5 +; CHECK-NEXT: call __powitf2, $pop25, $pop11, $pop10, $pop69 +; CHECK-NEXT: local.get $push70=, 6 +; CHECK-NEXT: i64.const $push22=, 16 +; CHECK-NEXT: i64.add $push23=, $pop70, $pop22 +; CHECK-NEXT: local.get $push71=, 6 +; CHECK-NEXT: i64.load $push13=, 32($pop71) +; CHECK-NEXT: local.get $push72=, 6 +; CHECK-NEXT: i64.load $push12=, 40($pop72) +; CHECK-NEXT: call truncl, $pop23, $pop13, $pop12 +; CHECK-NEXT: local.get $push75=, 6 +; CHECK-NEXT: local.get $push73=, 6 +; CHECK-NEXT: i64.load $push15=, 16($pop73) +; CHECK-NEXT: local.get $push74=, 6 +; CHECK-NEXT: i64.load $push14=, 24($pop74) +; CHECK-NEXT: call nearbyintl, $pop75, $pop15, $pop14 +; CHECK-NEXT: local.get $push77=, 0 +; CHECK-NEXT: local.get $push76=, 6 +; CHECK-NEXT: i64.load $push16=, 8($pop76) +; CHECK-NEXT: i64.store 8($pop77), $pop16 +; CHECK-NEXT: local.get $push79=, 0 +; CHECK-NEXT: local.get $push78=, 6 +; CHECK-NEXT: i64.load $push17=, 0($pop78) +; CHECK-NEXT: i64.store 0($pop79), $pop17 +; CHECK-NEXT: local.get $push80=, 6 +; CHECK-NEXT: 
i64.const $push20=, 144 +; CHECK-NEXT: i64.add $push21=, $pop80, $pop20 +; CHECK-NEXT: global.set __stack_pointer, $pop21 +; CHECK-NEXT: return + %a = fadd fp128 %x, %y + %b = fmul fp128 %a, %y + %c = fdiv fp128 %b, %y + ; libm calls + %d = call fp128 @llvm.sqrt.f128(fp128 %c) + %e = call fp128 @llvm.floor.f128(fp128 %d) + %f = call fp128 @llvm.pow.f128(fp128 %e, fp128 %y) + %g = call fp128 @llvm.powi.f128.i32(fp128 %f, i32 %z) + %h = call fp128 @llvm.trunc.f128(fp128 %g) + %i = call fp128 @llvm.nearbyint.f128(fp128 %h) + ret fp128 %i +} + +define i128 @i128libcalls(i128 %x, i128 %y) { + ; Basic ops should be expanded +; CHECK-LABEL: i128libcalls: +; CHECK: .functype i128libcalls (i64, i64, i64, i64, i64) -> () +; CHECK-NEXT: .local i64, i64 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get $push8=, __stack_pointer +; CHECK-NEXT: i64.const $push9=, 32 +; CHECK-NEXT: i64.sub $push17=, $pop8, $pop9 +; CHECK-NEXT: local.tee $push16=, 5, $pop17 +; CHECK-NEXT: global.set __stack_pointer, $pop16 +; CHECK-NEXT: local.get $push18=, 5 +; CHECK-NEXT: i64.const $push12=, 16 +; CHECK-NEXT: i64.add $push13=, $pop18, $pop12 +; CHECK-NEXT: local.get $push20=, 1 +; CHECK-NEXT: local.get $push19=, 3 +; CHECK-NEXT: i64.add $push15=, $pop20, $pop19 +; CHECK-NEXT: local.tee $push14=, 6, $pop15 +; CHECK-NEXT: local.get $push22=, 2 +; CHECK-NEXT: local.get $push21=, 4 +; CHECK-NEXT: i64.add $push0=, $pop22, $pop21 +; CHECK-NEXT: local.get $push24=, 6 +; CHECK-NEXT: local.get $push23=, 1 +; CHECK-NEXT: i64.lt_u $push1=, $pop24, $pop23 +; CHECK-NEXT: i64.extend_i32_u $push2=, $pop1 +; CHECK-NEXT: i64.add $push3=, $pop0, $pop2 +; CHECK-NEXT: local.get $push26=, 3 +; CHECK-NEXT: local.get $push25=, 4 +; CHECK-NEXT: call __multi3, $pop13, $pop14, $pop3, $pop26, $pop25 +; CHECK-NEXT: local.get $push31=, 5 +; CHECK-NEXT: local.get $push27=, 5 +; CHECK-NEXT: i64.load $push5=, 16($pop27) +; CHECK-NEXT: local.get $push28=, 5 +; CHECK-NEXT: i64.load $push4=, 24($pop28) +; CHECK-NEXT: local.get $push30=, 3 +; CHECK-NEXT: local.get $push29=, 4 +; CHECK-NEXT: call __umodti3, $pop31, $pop5, $pop4, $pop30, $pop29 +; CHECK-NEXT: local.get $push33=, 0 +; CHECK-NEXT: local.get $push32=, 5 +; CHECK-NEXT: i64.load $push6=, 8($pop32) +; CHECK-NEXT: i64.store 8($pop33), $pop6 +; CHECK-NEXT: local.get $push35=, 0 +; CHECK-NEXT: local.get $push34=, 5 +; CHECK-NEXT: i64.load $push7=, 0($pop34) +; CHECK-NEXT: i64.store 0($pop35), $pop7 +; CHECK-NEXT: local.get $push36=, 5 +; CHECK-NEXT: i64.const $push10=, 32 +; CHECK-NEXT: i64.add $push11=, $pop36, $pop10 +; CHECK-NEXT: global.set __stack_pointer, $pop11 +; CHECK-NEXT: return + %a = add i128 %x, %y + %b = mul i128 %a, %y + %c = urem i128 %b, %y + ret i128 %c +} + +define double @f64libcalls(double %x, double %y, i32 %z) { +; CHECK-LABEL: f64libcalls: +; CHECK: .functype f64libcalls (f64, f64, i32) -> (f64) +; CHECK-NEXT: .local i64 +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get $push12=, __stack_pointer +; CHECK-NEXT: i64.const $push13=, 16 +; CHECK-NEXT: i64.sub $push21=, $pop12, $pop13 +; CHECK-NEXT: local.tee $push20=, 3, $pop21 +; CHECK-NEXT: global.set __stack_pointer, $pop20 +; CHECK-NEXT: local.get $push25=, 0 +; CHECK-NEXT: local.get $push22=, 0 +; CHECK-NEXT: call $push0=, tan, $pop22 +; CHECK-NEXT: call $push1=, cos, $pop0 +; CHECK-NEXT: call $push2=, log10, $pop1 +; CHECK-NEXT: local.get $push23=, 1 +; CHECK-NEXT: call $push3=, pow, $pop2, $pop23 +; CHECK-NEXT: local.get $push24=, 2 +; CHECK-NEXT: call $push4=, __powidf2, $pop3, $pop24 +; CHECK-NEXT: call $push5=, log, 
$pop4 +; CHECK-NEXT: call $push6=, exp, $pop5 +; CHECK-NEXT: call $push7=, exp10, $pop6 +; CHECK-NEXT: call $push8=, cbrt, $pop7 +; CHECK-NEXT: call $push9=, lround, $pop8 +; CHECK-NEXT: call $push10=, ldexp, $pop25, $pop9 +; CHECK-NEXT: local.get $push26=, 3 +; CHECK-NEXT: i64.const $push18=, 4 +; CHECK-NEXT: i64.add $push19=, $pop26, $pop18 +; CHECK-NEXT: call $push27=, frexp, $pop10, $pop19 +; CHECK-NEXT: local.set 0, $pop27 +; CHECK-NEXT: local.get $push28=, 3 +; CHECK-NEXT: i32.load $push11=, 4($pop28) +; CHECK-NEXT: call escape_value, $pop11 +; CHECK-NEXT: local.get $push31=, 0 +; CHECK-NEXT: local.get $push29=, 3 +; CHECK-NEXT: i64.const $push16=, 8 +; CHECK-NEXT: i64.add $push17=, $pop29, $pop16 +; CHECK-NEXT: call $push30=, modf, $pop31, $pop17 +; CHECK-NEXT: local.set 0, $pop30 +; CHECK-NEXT: local.get $push32=, 3 +; CHECK-NEXT: i64.const $push14=, 16 +; CHECK-NEXT: i64.add $push15=, $pop32, $pop14 +; CHECK-NEXT: global.set __stack_pointer, $pop15 +; CHECK-NEXT: local.get $push33=, 0 +; CHECK-NEXT: return $pop33 + + + %k = call double @llvm.tan.f64(double %x) + %a = call double @llvm.cos.f64(double %k) + %b = call double @llvm.log10.f64(double %a) + %c = call double @llvm.pow.f64(double %b, double %y) + %d = call double @llvm.powi.f64.i32(double %c, i32 %z) + %e = call double @llvm.log.f64(double %d) + %f = call double @llvm.exp.f64(double %e) + %g = call double @llvm.exp10.f64(double %f) + %h = call fast double @llvm.pow.f64(double %g, double 0x3FD5555555555555) + %i = call i32 @llvm.lround(double %h) + %j = call double @llvm.ldexp.f64.i32(double %x, i32 %i); + %result = call {double, i32} @llvm.frexp.f64.i32(double %j) + %result.0 = extractvalue { double, i32 } %result, 0 + %result.1 = extractvalue { double, i32 } %result, 1 + %resultModf = call {double, double} @llvm.modf.f64(double %result.0) + %resultModf.0 = extractvalue { double, double } %resultModf, 0 + call void @escape_value(i32 %result.1) + ret double %resultModf.0 +} + +; fcmp ord and unord (RTLIB::O_F32 / RTLIB::UO_F32 etc) are a special case (see +; comment in WebAssemblyRunimeLibcallSignatures.cpp) so check them separately. 
+; no libcalls are needed for f32 and f64 + +define i1 @unordd(double %x, double %y) { +; CHECK-LABEL: unordd: +; CHECK: .functype unordd (f64, f64) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push8=, 0 +; CHECK-NEXT: local.get $push7=, 0 +; CHECK-NEXT: f64.ne $push4=, $pop8, $pop7 +; CHECK-NEXT: local.get $push10=, 1 +; CHECK-NEXT: local.get $push9=, 1 +; CHECK-NEXT: f64.ne $push3=, $pop10, $pop9 +; CHECK-NEXT: i32.or $push5=, $pop4, $pop3 +; CHECK-NEXT: local.get $push12=, 0 +; CHECK-NEXT: local.get $push11=, 0 +; CHECK-NEXT: f64.eq $push1=, $pop12, $pop11 +; CHECK-NEXT: local.get $push14=, 1 +; CHECK-NEXT: local.get $push13=, 1 +; CHECK-NEXT: f64.eq $push0=, $pop14, $pop13 +; CHECK-NEXT: i32.and $push2=, $pop1, $pop0 +; CHECK-NEXT: i32.xor $push6=, $pop5, $pop2 +; CHECK-NEXT: return $pop6 + %a = fcmp uno double %x, %y + %b = fcmp ord double %x, %y + %c = xor i1 %a, %b + ret i1 %c +} + +define i1 @unordf(float %x, float %y) { +; CHECK-LABEL: unordf: +; CHECK: .functype unordf (f32, f32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push8=, 0 +; CHECK-NEXT: local.get $push7=, 0 +; CHECK-NEXT: f32.ne $push4=, $pop8, $pop7 +; CHECK-NEXT: local.get $push10=, 1 +; CHECK-NEXT: local.get $push9=, 1 +; CHECK-NEXT: f32.ne $push3=, $pop10, $pop9 +; CHECK-NEXT: i32.or $push5=, $pop4, $pop3 +; CHECK-NEXT: local.get $push12=, 0 +; CHECK-NEXT: local.get $push11=, 0 +; CHECK-NEXT: f32.eq $push1=, $pop12, $pop11 +; CHECK-NEXT: local.get $push14=, 1 +; CHECK-NEXT: local.get $push13=, 1 +; CHECK-NEXT: f32.eq $push0=, $pop14, $pop13 +; CHECK-NEXT: i32.and $push2=, $pop1, $pop0 +; CHECK-NEXT: i32.xor $push6=, $pop5, $pop2 +; CHECK-NEXT: return $pop6 + %a = fcmp uno float %x, %y + %b = fcmp ord float %x, %y + %c = xor i1 %a, %b + ret i1 %c +} + +define i1 @unordt(fp128 %x, fp128 %y) { +; CHECK-LABEL: unordt: +; CHECK: .functype unordt (i64, i64, i64, i64) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push6=, 0 +; CHECK-NEXT: local.get $push5=, 1 +; CHECK-NEXT: local.get $push4=, 2 +; CHECK-NEXT: local.get $push3=, 3 +; CHECK-NEXT: call $push1=, __unordtf2, $pop6, $pop5, $pop4, $pop3 +; CHECK-NEXT: i32.const $push0=, 0 +; CHECK-NEXT: i32.ne $push2=, $pop1, $pop0 +; CHECK-NEXT: return $pop2 + %a = fcmp uno fp128 %x, %y + ret i1 %a +} + +define i1 @ordt(fp128 %x, fp128 %y) { +; CHECK-LABEL: ordt: +; CHECK: .functype ordt (i64, i64, i64, i64) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: local.get $push5=, 0 +; CHECK-NEXT: local.get $push4=, 1 +; CHECK-NEXT: local.get $push3=, 2 +; CHECK-NEXT: local.get $push2=, 3 +; CHECK-NEXT: call $push0=, __unordtf2, $pop5, $pop4, $pop3, $pop2 +; CHECK-NEXT: i32.eqz $push1=, $pop0 +; CHECK-NEXT: return $pop1 + %a = fcmp ord fp128 %x, %y + ret i1 %a +} From afa32d3e0e1a2dddfc76f2e9b80c4a6cc5a5387f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 1 Apr 2025 11:00:50 -0700 Subject: [PATCH 0285/1029] [flang][cuda] Fix char argument This would fail with `error: argument of type "char" is incompatible with parameter of type "const char *"` --- flang-rt/lib/runtime/stop.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang-rt/lib/runtime/stop.cpp b/flang-rt/lib/runtime/stop.cpp index a4ef7104442f4..69a99e6c28955 100644 --- a/flang-rt/lib/runtime/stop.cpp +++ b/flang-rt/lib/runtime/stop.cpp @@ -80,7 +80,7 @@ static void CloseAllExternalUnits(const char *why) { if (code 
!= EXIT_SUCCESS) { std::printf(": code %d\n", code); } - std::printf('\n'); + std::printf("\n"); } #if defined(__CUDACC__) // NVCC supports __trap(). From bd7585bea3906aa2d288f7238382e99b8d5506eb Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Apr 2025 11:14:25 -0700 Subject: [PATCH 0286/1029] [RISCV] Improve error for using x18-x27 in a register list with RVE. (#133936) matchRegisterNameHelper returns MCRegister() for RVE so the first RVE check was dead. For the second check, I've moved the RVE check from the comma parsing to the identifier parsing so the diagnostic points at the register. Note we're using matchRegisterName instead of matchRegisterNameHelper to avoid allowing ABI names so we don't get the RVE check that lives inside matchRegisterNameHelper. The errors for RVE in general should probably say something other than "invalid register", but that's a problem throughout the assembler. --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 57 +++++++++---------- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 4 +- llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s | 7 ++- llvm/test/MC/RISCV/rv32e-zcmp-invalid.s | 5 +- llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s | 7 ++- llvm/test/MC/RISCV/rv64e-zcmp-invalid.s | 5 +- 6 files changed, 46 insertions(+), 39 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index f1ccf0cd052ba..1e07ada1f9701 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2577,7 +2577,7 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, if (parseToken(AsmToken::LCurly, "register list must start with '{'")) return ParseStatus::Failure; - bool IsEABI = isRVE(); + bool IsRVE = isRVE(); if (getLexer().isNot(AsmToken::Identifier)) return Error(getLoc(), "register list must start from 'ra' or 'x1'"); @@ -2617,46 +2617,41 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, // parse case like -s1 if (parseOptionalToken(AsmToken::Minus)) { StringRef EndName = getLexer().getTok().getIdentifier(); - // FIXME: the register mapping and checks of EABI is wrong + // FIXME: the register mapping and checks of RVE is wrong RegEnd = matchRegisterNameHelper(EndName); if (!(RegEnd == RISCV::X9 || (RegEnd >= RISCV::X18 && RegEnd <= RISCV::X27))) return Error(getLoc(), "invalid register"); - if (IsEABI && RegEnd != RISCV::X9) - return Error(getLoc(), "contiguous register list of EABI can only be " - "'s0-s1' or 'x8-x9' pair"); getLexer().Lex(); } - if (!IsEABI) { - // parse extra part like ', x18[-x20]' for XRegList - if (parseOptionalToken(AsmToken::Comma)) { - if (RegEnd != RISCV::X9) - return Error( - getLoc(), - "first contiguous registers pair of register list must be 'x8-x9'"); + // parse extra part like ', x18[-x20]' for XRegList + if (parseOptionalToken(AsmToken::Comma)) { + if (RegEnd != RISCV::X9) + return Error( + getLoc(), + "first contiguous registers pair of register list must be 'x8-x9'"); - // parse ', x18' for extra part - if (getLexer().isNot(AsmToken::Identifier)) + // parse ', x18' for extra part + if (getLexer().isNot(AsmToken::Identifier) || IsRVE) + return Error(getLoc(), "invalid register"); + StringRef EndName = getLexer().getTok().getIdentifier(); + RegEnd = MatchRegisterName(EndName); + if (RegEnd != RISCV::X18) + return Error(getLoc(), + "second contiguous registers pair of register list " + "must start from 'x18'"); + getLexer().Lex(); + + // parse '-x20' for extra part 
+ if (parseOptionalToken(AsmToken::Minus)) { + if (getLexer().isNot(AsmToken::Identifier) || IsRVE) return Error(getLoc(), "invalid register"); - StringRef EndName = getLexer().getTok().getIdentifier(); + EndName = getLexer().getTok().getIdentifier(); RegEnd = MatchRegisterName(EndName); - if (RegEnd != RISCV::X18) - return Error(getLoc(), - "second contiguous registers pair of register list " - "must start from 'x18'"); + if (!(RegEnd >= RISCV::X19 && RegEnd <= RISCV::X27)) + return Error(getLoc(), "invalid register"); getLexer().Lex(); - - // parse '-x20' for extra part - if (parseOptionalToken(AsmToken::Minus)) { - if (getLexer().isNot(AsmToken::Identifier)) - return Error(getLoc(), "invalid register"); - EndName = getLexer().getTok().getIdentifier(); - RegEnd = MatchRegisterName(EndName); - if (!(RegEnd >= RISCV::X19 && RegEnd <= RISCV::X27)) - return Error(getLoc(), "invalid register"); - getLexer().Lex(); - } } } @@ -2667,7 +2662,7 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, if (parseToken(AsmToken::RCurly, "register list must end with '}'")) return ParseStatus::Failure; - auto Encode = RISCVZC::encodeRlist(RegEnd, IsEABI); + auto Encode = RISCVZC::encodeRlist(RegEnd, IsRVE); assert(Encode != RISCVZC::INVALID_RLIST); if (MustIncludeS0) assert(Encode != RISCVZC::RA); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 15c07533bc3a3..d6672de02862d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -603,8 +603,8 @@ enum RLISTENCODE { INVALID_RLIST, }; -inline unsigned encodeRlist(MCRegister EndReg, bool IsRV32E = false) { - assert((!IsRV32E || EndReg <= RISCV::X9) && "Invalid Rlist for RV32E"); +inline unsigned encodeRlist(MCRegister EndReg, bool IsRVE = false) { + assert((!IsRVE || EndReg <= RISCV::X9) && "Invalid Rlist for RV32E"); switch (EndReg) { case RISCV::X1: return RLISTENCODE::RA; diff --git a/llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s b/llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s index 6c3ef3000e77e..f24caa14883ed 100644 --- a/llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32e-xqccmp-invalid.s @@ -14,8 +14,11 @@ qc.cm.push {ra,s0-s2}, -16 # CHECK: :[[@LINE+1]]:21: error: invalid register qc.cm.popret {ra,s0-s2}, 16 # CHECK-DIS: ba72 -# CHECK: :[[@LINE+1]]:21: error: register list must end with '}' +# CHECK: :[[@LINE+1]]:23: error: invalid register qc.cm.pop {x1, x8-x9, x18}, 16 # CHECK-DIS: b972 -# CHECK: :[[@LINE+1]]:24: error: register list must end with '}' +# CHECK: :[[@LINE+1]]:26: error: invalid register qc.cm.pushfp {x1, x8-x9, x18}, -16 +# CHECK-DIS: b972 +# CHECK: :[[@LINE+1]]:22: error: invalid register +qc.cm.pushfp {ra, s0-s2}, -16 diff --git a/llvm/test/MC/RISCV/rv32e-zcmp-invalid.s b/llvm/test/MC/RISCV/rv32e-zcmp-invalid.s index eaf6b350c2341..a2942a84b9d3f 100644 --- a/llvm/test/MC/RISCV/rv32e-zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32e-zcmp-invalid.s @@ -14,5 +14,8 @@ cm.push {ra,s0-s2}, -16 # CHECK: :[[@LINE+1]]:18: error: invalid register cm.popret {ra,s0-s2}, 16 # CHECK-DIS: ba72 -# CHECK: :[[@LINE+1]]:18: error: register list must end with '}' +# CHECK: :[[@LINE+1]]:20: error: invalid register cm.pop {x1, x8-x9, x18}, 16 +# CHECK-DIS: ba72 +# CHECK: :[[@LINE+1]]:16: error: invalid register +cm.pop {ra, s0-s2}, 16 diff --git a/llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s b/llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s index f34ce83448070..39d9179456564 100644 --- 
a/llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s
+++ b/llvm/test/MC/RISCV/rv64e-xqccmp-invalid.s
@@ -14,8 +14,11 @@ qc.cm.push {ra,s0-s2}, -32
 # CHECK: :[[@LINE+1]]:21: error: invalid register
 qc.cm.popret {ra,s0-s2}, 32
 # CHECK-DIS: ba72
-# CHECK: :[[@LINE+1]]:21: error: register list must end with '}'
+# CHECK: :[[@LINE+1]]:23: error: invalid register
 qc.cm.pop {x1, x8-x9, x18}, 32
 # CHECK-DIS: b972
-# CHECK: :[[@LINE+1]]:24: error: register list must end with '}'
+# CHECK: :[[@LINE+1]]:26: error: invalid register
 qc.cm.pushfp {x1, x8-x9, x18}, -32
+# CHECK-DIS: b972
+# CHECK: :[[@LINE+1]]:22: error: invalid register
+qc.cm.pushfp {ra, s0-s2}, -32
diff --git a/llvm/test/MC/RISCV/rv64e-zcmp-invalid.s b/llvm/test/MC/RISCV/rv64e-zcmp-invalid.s
index e99721d96a17c..45081c65a2bc5 100644
--- a/llvm/test/MC/RISCV/rv64e-zcmp-invalid.s
+++ b/llvm/test/MC/RISCV/rv64e-zcmp-invalid.s
@@ -14,5 +14,8 @@ cm.push {ra,s0-s2}, -32
 # CHECK: :[[@LINE+1]]:18: error: invalid register
 cm.popret {ra,s0-s2}, 32
 # CHECK-DIS: ba72
-# CHECK: :[[@LINE+1]]:18: error: register list must end with '}'
+# CHECK: :[[@LINE+1]]:20: error: invalid register
 cm.pop {x1, x8-x9, x18}, 32
+# CHECK-DIS: ba72
+# CHECK: :[[@LINE+1]]:16: error: invalid register
+cm.pop {ra, s0-s2}, 32

From 4d1c82742302e205071a89ff42c0e90e548e861c Mon Sep 17 00:00:00 2001
From: Heejin Ahn 
Date: Wed, 2 Apr 2025 03:15:29 +0900
Subject: [PATCH 0287/1029] [WebAssembly] Support parsing .lto_set_conditional
 (#126546)

In the split-LTO-unit mode in ThinLTO, a compilation module is split
into two, and global variables that meet specific criteria are moved to
the split module.
https://github.com/llvm/llvm-project/blob/d21fc58aeeaa7f0369a24dbe70a0360e0edbf76f/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp#L315-L366
And if there is an originally local-linkage global value defined in the
original module and referenced in the split module, or vice versa, that
value is _promoted_ by attaching a module ID to its name in order to
prevent name clashes, because it can now be referenced from other
modules.
https://github.com/llvm/llvm-project/blob/d21fc58aeeaa7f0369a24dbe70a0360e0edbf76f/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp#L46-L100
And when that promoted global value is a function, a
`.lto_set_conditional` entry is written to the original module to avoid
breaking references from inline assembly:
https://github.com/llvm/llvm-project/blob/d21fc58aeeaa7f0369a24dbe70a0360e0edbf76f/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp#L84-L91
The syntax of this is, if the original function name is `symbolA` and
the module ID is `123`,
```ll
module asm ".lto_set_conditional symbolA,symbolA.123"
```
These symbols are parsed here:
https://github.com/llvm/llvm-project/blob/648981f913431749c4656268ed670677a88511f6/llvm/lib/MC/MCParser/AsmParser.cpp#L6467
The first function symbol in this `.lto_set_conditional` does not exist
as a function in the bitcode anymore because it was renamed to the
second. So such symbols are not assigned as function symbols, but they
are not really data either, so the object writer crashes here:
https://github.com/llvm/llvm-project/blob/5b9e6c7993359c16b4d645c851bb7fe2fd7b78c7/llvm/lib/MC/WasmObjectWriter.cpp#L1820
This PR makes the object writer just skip those symbols.

---

This problem was discovered when I was testing with
`-fwhole-program-vtables`.
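For concreteness, a minimal sketch of the kind of module this leaves
behind (the function name and the module hash `123` here are
illustrative, not taken from a real build):

```ll
target triple = "wasm32-unknown-unknown"

; @symbolA itself no longer exists in this module after promotion; only
; the renamed definition and the conditional-symbol directive remain.
module asm ".lto_set_conditional symbolA,symbolA.123"

define hidden void @symbolA.123() {
  ret void
}
```

When the object writer emits the symbol table for a module like this,
`symbolA` is a defined symbol with no data location, which is exactly
the case the new check skips over.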
The reason we didn't have this problem before with ThinLTO was because `-fsplit-lto-unit`, which splits LTO units when possible, defaults to false, but it defaults to true when `-fwhole-program-vtables` is used. --- llvm/lib/MC/WasmObjectWriter.cpp | 12 ++++++++++++ llvm/test/MC/WebAssembly/lto-set-conditional.s | 8 ++++++++ 2 files changed, 20 insertions(+) create mode 100644 llvm/test/MC/WebAssembly/lto-set-conditional.s diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 9c919696a0ac2..9d5a290f70cad 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -1785,6 +1785,18 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, WS.setIndex(InvalidIndex); continue; } + // In bitcode generated by split-LTO-unit mode in ThinLTO, these lines can + // appear: + // module asm ".lto_set_conditional symbolA,symbolA.[moduleId]" + // ... + // (Here [moduleId] will be replaced by a real module hash ID) + // + // Here the original symbol (symbolA here) has been renamed to the new name + // created by attaching its module ID, so the original symbol does not + // appear in the bitcode anymore, and thus not in DataLocations. We should + // ignore them. + if (WS.isData() && WS.isDefined() && !DataLocations.count(&WS)) + continue; LLVM_DEBUG(dbgs() << "adding to symtab: " << WS << "\n"); uint32_t Flags = 0; diff --git a/llvm/test/MC/WebAssembly/lto-set-conditional.s b/llvm/test/MC/WebAssembly/lto-set-conditional.s new file mode 100644 index 0000000000000..c9519e232c2d8 --- /dev/null +++ b/llvm/test/MC/WebAssembly/lto-set-conditional.s @@ -0,0 +1,8 @@ +# RUN: llvm-mc -triple=wasm32-unknown-unknown + +# Tests if `.lto_set_conditional` directives are parsed without crashing. +.lto_set_conditional a, a.new +.type a.new,@function +a.new: + .functype a.new () -> () + end_function From 79487757b7f4b33a0940753fb02e39d0388e733a Mon Sep 17 00:00:00 2001 From: Virginia Cangelosi Date: Tue, 1 Apr 2025 19:20:27 +0100 Subject: [PATCH 0288/1029] [Clang][LLVM] Implement multi-multi vectors MOP4{A/S} (#129230) Implement all multi-multi {BF/F/S/U/SU/US}MOP4{A/S} instructions in clang and llvm following the acle in https://github.com/ARM-software/acle/pull/381/files --- clang/include/clang/Basic/arm_sme.td | 9 + .../sme2-intrinsics/acle_sme2_mop4_2x2.c | 466 +++++++++++++++ .../acle_sme2p2_imm.cpp | 77 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 10 + llvm/lib/Target/AArch64/SMEInstrFormats.td | 57 +- .../AArch64/sme2-intrinsics-mop4a_2x2.ll | 544 ++++++++++++++++++ 6 files changed, 1154 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x2.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 3958ed70f6ad0..1bfcf4c31d552 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -296,6 +296,7 @@ multiclass MOP4 check def _1x1 : Inst<"svmop4" # mode # "[_1x1]" # za # "[_{d}_{d}]", "vidd", t, MergeNone, i # "_1x1", [IsInOutZA, IsStreaming], checks>; def _1x2 : Inst<"svmop4" # mode # "[_1x2]" # za # "[_{d}_{d}]", "vid2", t, MergeNone, i # "_1x2", [IsInOutZA, IsStreaming], checks>; def _2x1 : Inst<"svmop4" # mode # "[_2x1]" # za # "[_{d}_{d}]", "vi2d", t, MergeNone, i # "_2x1", [IsInOutZA, IsStreaming], checks>; + def _2x2 : Inst<"svmop4" # mode # "[_2x2]" # za # "[_{d}_{d}]", "vi22", t, MergeNone, i # "_2x2", [IsInOutZA, IsStreaming], checks>; 
} let SMETargetGuard = "sme2,sme-mop4" in { @@ -355,6 +356,10 @@ multiclass SUMOP4 che "vi2u", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_2x1", [IsStreaming, IsInOutZA], checks>; + def _2x2 : SInst<"svmop4" # mode # "[_2x2]" # za # "[_{d}_{3}]", + "vi2.x2.u", t, MergeNone, "aarch64_sme_sumop4" # mode # i # "_wide_2x2", + [IsStreaming, IsInOutZA], + checks>; } multiclass USMOP4 checks> { @@ -370,6 +375,10 @@ multiclass USMOP4 che "vi2x", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_2x1", [IsStreaming, IsInOutZA], checks>; + def _2x2 : SInst<"svmop4" # mode # "[_2x2]" # za # "[_{d}_{3}]", + "vi2.u2.x", t, MergeNone, "aarch64_sme_usmop4" # mode # i # "_wide_2x2", + [IsStreaming, IsInOutZA], + checks>; } let SMETargetGuard = "sme2,sme-mop4" in { diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c new file mode 100644 index 0000000000000..3e9612e3cc582 --- /dev/null +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c @@ -0,0 +1,466 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +#ifdef SME_OVERLOADED_FORMS +#define SME_ACLE_FUNC(A1,A2_UNUSED,A3, A4_UNUSED) A1##A3 +#else +#define SME_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 +#endif + +// CHECK-LABEL: @test_svmop4a_2x2_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_2x2_za32_s8_s810svint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail 
call void @llvm.aarch64.sme.smop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_s8_s8(svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_s8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_2x2_za32_s8_s810svint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_s8_s8(svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_s8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_2x2_za32_u8_u811svuint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_u8_u8(svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_u8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_2x2_za32_u8_u811svuint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_u8_u8(svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_u8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_2x2_za32_s8_u810svint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_s8_u8(svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_s8_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_2x2_za32_s8_u810svint8x2_t11svuint8x2_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_s8_u8(svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_s8_u8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4a_2x2_za32_u8_s811svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_u8_s8(svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_u8_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z27test_svmop4s_2x2_za32_u8_s811svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.wide.2x2.nxv16i8(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_u8_s8(svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_u8_s8)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za32_s16_s1611svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_s16_s16(svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za32_s16_s1611svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_s16_s16(svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// 
CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za32_u16_u1612svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_u16_u16(svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za32_u16_u1612svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_u16_u16(svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za32_f16_f1613svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_f16_f16(svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za32_f16_f1613svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_f16_f16(svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4a_2x2_za32_bf16_bf1614svbfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.wide.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], 
[[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4s_2x2_za32_bf16_bf1614svbfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_bf16_bf16(svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za64_s16_s1611svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za64_s16_s16(svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za64_s16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za64_s16_s1611svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za64_s16_s16(svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za64,_s16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za64_u16_u1612svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za64_u16_u16(svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za64_u16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za64_u16_u1612svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za64_u16_u16(svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za64,_u16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za64_s16_u16( 
+// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za64_s16_u1611svint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za64_s16_u16(svint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za64_s16_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za64_s16_u1611svint16x2_t12svuint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za64_s16_u16(svint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za64,_s16_u16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za64_u16_s1612svuint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4a.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za64_u16_s16(svuint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za64,_u16_s16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za64_u16_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za64_u16_s1612svuint16x2_t11svint16x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmop4s.za64.wide.2x2.nxv8i16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za64_u16_s16(svuint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za64,_u16_s16)(1, zn, zm); +} + + +// CHECK-LABEL: @test_svmop4a_2x2_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za16_f16_f1613svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za16_f16_f16(svfloat16x2_t zn, 
svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za16_f16_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za16_f16_f1613svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv8f16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za16_f16_f16(svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za16,_f16_f16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za32_f32_f3213svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za32_f32_f32(svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za32_f32_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za32_f32_f3213svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv4f32(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za32_f32_f32(svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za32,_f32_f32)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv2f64(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4a_2x2_za64_f64_f6413svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv2f64(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za64_f64_f64(svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za64_f64_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv2f64(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svmop4s_2x2_za64_f64_f6413svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv2f64(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svmop4s_2x2_za64_f64_f64(svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za64,_f64_f64)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4a_2x2_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4a_2x2_za16_bf16_bf1614svbfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4a.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4a_2x2_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4a,_2x2,_za16,_bf16_bf16)(1, zn, zm); +} + +// CHECK-LABEL: @test_svmop4s_2x2_za16_bf16_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svmop4s_2x2_za16_bf16_bf1614svbfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.mop4s.2x2.nxv8bf16(i32 1, [[ZN_COERCE0:%.*]], [[ZN_COERCE1:%.*]], [[ZM_COERCE0:%.*]], [[ZM_COERCE1:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmop4s_2x2_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + SME_ACLE_FUNC(svmop4s,_2x2,_za16,_bf16_bf16)(1, zn, zm); +} diff --git a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp index f8e57e9b24332..ae309853c976e 100644 --- a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_imm.cpp @@ -250,3 +250,80 @@ void tests_mop4_imm_f64_f64_2x1(svfloat64x2_t zn, svfloat64_t zm) __arm_streamin svmop4s_2x1_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} return; } + +void tests_mop4_imm_s8_s8_2x2(svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_s8_s8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_u8_u8_2x2(svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_u8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_s8_u8_2x2(svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_s8_u8(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4a_2x2_za32_u8_s8(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_u8_s8(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + 
return; +} + +void tests_mop4_imm_s16_s16_2x2(svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x2_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x2_za64_s16_s16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_u16_u16_2x2(svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x2_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x2_za64_u16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_s16_u16_2x2(svint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x2_za64_s16_u16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4a_2x2_za64_u16_s16(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x2_za64_u16_s16(-1, zm, zn); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} + +void tests_mop4_imm_f16_f16_2x2(svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x2_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_2x2_za16_f16_f16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; +} + +void tests_mop4_imm_bf16_bf16_2x2(svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + + svmop4a_2x2_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + svmop4s_2x2_za16_bf16_bf16(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}} + return; + +} + +void tests_mop4_imm_f32_f32_2x2(svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za32_f32_f32(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + svmop4s_2x2_za32_f32_f32(-1, zn, zm); // 
expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}} + return; +} + +void tests_mop4_imm_f64_f64_2x2(svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { + svmop4a_2x2_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + svmop4s_2x2_za64_f64_f64(-1, zn, zm); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 7]}} + return; +} \ No newline at end of file diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6c25e6582b836..0f7e963f46e77 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3077,6 +3077,14 @@ let TargetPrefix = "aarch64" in { LLVMMatchType<0>, LLVMMatchType<0>], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + class SME_OuterProduct_QuarterTile_Multi_Multi + : DefaultAttrsIntrinsic<[], + [llvm_i32_ty, + llvm_anyvector_ty, + LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>], [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + // 2-way and 4-way multi-vector signed/unsigned Quarter Tile Quarter Product A/S foreach mode = ["s", "a"] in { foreach za = ["", "_za64"] in { @@ -3084,6 +3092,7 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x1" : SME_OuterProduct_QuarterTile_Single_Single; def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_1x2" : SME_OuterProduct_QuarterTile_Single_Multi; def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_2x1" : SME_OuterProduct_QuarterTile_Single_Multi; + def int_aarch64_sme_ # ty # "mop4" # mode # za # "_wide_2x2" : SME_OuterProduct_QuarterTile_Multi_Multi; } } } @@ -3094,6 +3103,7 @@ let TargetPrefix = "aarch64" in { def int_aarch64_sme_mop4 # mode # wide # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single; def int_aarch64_sme_mop4 # mode # wide # "_1x2" : SME_OuterProduct_QuarterTile_Single_Multi; def int_aarch64_sme_mop4 # mode # wide # "_2x1" : SME_OuterProduct_QuarterTile_Single_Multi; + def int_aarch64_sme_mop4 # mode # wide # "_2x2" : SME_OuterProduct_QuarterTile_Multi_Multi; } } diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index ccc061da0be9a..c008cda21cf05 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -278,6 +278,9 @@ class SME2_ZA_Tile_Vec_Multi_Single_Pat(name # _PSEUDO) $tile, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), $Zm)>; +class SME2_ZA_Tile_Vec_Multi_Multi_Pat + : Pat<(intrinsic imm_ty:$tile, vt:$Zn1, vt:$Zn2, vt:$Zm1, vt:$Zm2), + (!cast(name # _PSEUDO) $tile, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1), (REG_SEQUENCE ZPR2Mul2, vt:$Zm1, zsub0, vt:$Zm2, zsub1))>; //===----------------------------------------------------------------------===// // SME pattern match helpers. 
//===----------------------------------------------------------------------===// @@ -647,7 +650,11 @@ multiclass sme_quarter_outer_product_i8_i32; + ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_BToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_3, nxv16i8>; } multiclass sme_quarter_outer_product_i16_i32{ @@ -677,7 +684,11 @@ multiclass sme_quarter_outer_product_i16_i32; + ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_HToS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_3, nxv8i16>; } multiclass sme_quarter_outer_product_i64{ @@ -707,7 +718,11 @@ multiclass sme_quarter_outer_product_i64; + ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi, mnemonic>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_HtoD_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_7, nxv8i16>; } //===----------------------------------------------------------------------===// @@ -5570,7 +5585,11 @@ multiclass sme2_bfmop4as_widening { def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv8bf16>; // Multiple vectors - def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; + def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_3, nxv8bf16>; } class sme2_multi2_fmul_sm size, string mnemonic, RegisterOperand vector_ty, RegisterOperand zpr_ty> @@ -5729,7 +5748,11 @@ multiclass sme2_fmop4as_fp16_non_widening { def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_1, nxv8f16>; // Multiple vectors - def _M2Z2Z_H : sme2_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; + def _M2Z2Z_H : sme2_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_1, nxv8f16>; } class sme2_fp8_fp32_quarter_tile_outer_product @@ -5813,7 +5836,11 @@ multiclass sme2_bfmop4as_non_widening { def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_1, nxv8bf16>; // Multiple vectors - def _M2Z2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; + def _M2Z2Z_H : sme2_bf16_fp16_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_H_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_1, nxv8bf16>; } class sme2_fp32_quarter_tile_outer_product @@ -5862,7 +5889,11 @@ multiclass sme2_fmop4as_fp32_non_widening { def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv4f32>; // Multiple vectors - def _M2Z2Z_S : sme2_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZZ_s_mul_r_Hi>; + def _M2Z2Z_S : sme2_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_s_mul_r_Lo, ZZ_s_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_S_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_3, 
nxv4f32>; } class sme2_fp64_quarter_tile_outer_product @@ -5911,7 +5942,11 @@ multiclass sme2_fmop4as_fp64_non_widening { def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_7, nxv2f64>; // Multiple vectors - def _M2Z2Z_D : sme2_fp64_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZZ_d_mul_r_Hi>; + def _M2Z2Z_D : sme2_fp64_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_d_mul_r_Lo, ZZ_d_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_D_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_7, nxv2f64>; } class sme2_fp16_fp32_quarter_tile_outer_product @@ -5960,7 +5995,11 @@ multiclass sme2_fmop4as_fp16_fp32_widening { def : SME2_ZA_Tile_Vec_Multi_Pat(op # "_1x2"), timm32_0_3, nxv8f16>; // Multiple vectors - def _M2Z2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>; + def _M2Z2Z_HtoS : sme2_fp16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>, SMEPseudo2Instr; + + def NAME # _M2Z2Z_HtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo, SMEPseudo2Instr; + + def : SME2_ZA_Tile_Vec_Multi_Multi_Pat(op # "_2x2"), timm32_0_3, nxv8f16>; } class sme2_fp8_fp16_quarter_tile_outer_product diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x2.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x2.ll new file mode 100644 index 0000000000000..4d0d696dc42cf --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4a_2x2.ll @@ -0,0 +1,544 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s + +target triple = "aarch64-linux" + +; Widening +define void @mop4a_za32_s8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_u8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_u8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; 
CHECK-NEXT: umop4s za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_s8_u8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4a za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s8_u8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s8_u8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4s za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_u8_s8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4a za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_u8_s8( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_u8_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4s za0.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.wide.2x2.nxv16i8(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + + +define void @mop4a_za32_s16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_u16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void 
@llvm.aarch64.sme.umop4a.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_u16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_f16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.2x2.nxv8f16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8f16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_bf16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4a za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.wide.2x2.nxv8bf16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_bf16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4s za0.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8bf16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_s16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4a za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4a.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_s16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void 
@mop4a_za64_u16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4a za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4a.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_u16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: umop4s za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.umop4s.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_s16_u16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4a za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4a.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_s16_u16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_s16_u16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: sumop4s za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.sumop4s.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_u16_s16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4a za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4a.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_u16_s16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_u16_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: usmop4s za0.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.usmop4s.za64.wide.2x2.nxv8i16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +; Non-widening +define void @mop4a_za16_f16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.h, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x2.nxv8f16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za16_f16( %zn1, %zn2, %zm1, 
%zm2) #0 { +; CHECK-LABEL: mop4s_za16_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.h, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x2.nxv8f16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za32_f32( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.s, { z0.s, z1.s }, { z24.s, z25.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x2.nxv4f32(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f32( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.s, { z0.s, z1.s }, { z24.s, z25.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x2.nxv4f32(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za64_f64( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4a za0.d, { z0.d, z1.d }, { z24.d, z25.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x2.nxv2f64(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_f64( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za0.d, { z0.d, z1.d }, { z24.d, z25.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x2.nxv2f64(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4a_za16_bf16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4a_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4a za0.h, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4a.2x2.nxv8bf16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za16_bf16( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za16_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4s za0.h, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x2.nxv8bf16(i32 0, %zn1, %zn2, %zm1, %zm2) + ret void +} + +; Tile limits + +define void @mop4s_za32_s8_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s8_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def 
$z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za3.s, { z0.b, z1.b }, { z24.b, z25.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv16i8(i32 3, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_s16_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_s16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za3.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.wide.2x2.nxv8i16(i32 3, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f16_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za3.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8f16(i32 3, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_bf16_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_bf16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: bfmop4s za3.s, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.wide.2x2.nxv8bf16(i32 3, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_s16_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_s16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: smop4s za7.d, { z0.h, z1.h }, { z24.h, z25.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.smop4s.za64.wide.2x2.nxv8i16(i32 7, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za64_f64_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za64_f64_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za7.d, { z0.d, z1.d }, { z24.d, z25.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x2.nxv2f64(i32 7, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za32_f32_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za32_f32_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov z24.d, z2.d +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: fmop4s za3.s, { z0.s, z1.s }, { z24.s, z25.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.mop4s.2x2.nxv4f32(i32 3, %zn1, %zn2, %zm1, %zm2) + ret void +} + +define void @mop4s_za16_f16_limit( %zn1, %zn2, %zm1, %zm2) #0 { +; CHECK-LABEL: mop4s_za16_f16_limit: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z25.d, z3.d +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def 
$z0_z1
+; CHECK-NEXT:    mov z24.d, z2.d
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    fmop4s za1.h, { z0.h, z1.h }, { z24.h, z25.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.mop4s.2x2.nxv8f16(i32 1, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2)
+  ret void
+}
+
+define void @mop4s_za16_bf16_limit(<vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2) #0 {
+; CHECK-LABEL: mop4s_za16_bf16_limit:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z25.d, z3.d
+; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    mov z24.d, z2.d
+; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT:    bfmop4s za1.h, { z0.h, z1.h }, { z24.h, z25.h }
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.mop4s.2x2.nxv8bf16(i32 1, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2)
+  ret void
+}
+
+attributes #0 = {nounwind "target-features" = "+sme-i16i64,+sme-f64f64,+sme-b16b16,+sme2p1,+bf16,+sme-f16f16,+sme-mop4" }

From 2b064108ed55af510379edc81622983385c7977f Mon Sep 17 00:00:00 2001
From: Arvind Sudarsanam
Date: Tue, 1 Apr 2025 14:24:54 -0400
Subject: [PATCH 0289/1029] Fix a build error (#133957)

This fixes an error reported in post-commit testing of
https://github.com/llvm/llvm-project/pull/133797

LOG: https://lab.llvm.org/buildbot/#/builders/140/builds/20266

Thanks

Signed-off-by: Arvind Sudarsanam
---
 clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp
index f4a16549340a1..fccea538b1dc5 100644
--- a/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp
+++ b/clang/tools/clang-sycl-linker/ClangSYCLLinker.cpp
@@ -205,7 +205,7 @@ Expected<std::unique_ptr<Module>> getBitcodeModule(StringRef File,
   auto M = getLazyIRFileModule(File, Err, C);
   if (M)
-    return M;
+    return std::move(M);
   return createStringError(Err.getMessage());
 }

From 0e3049c562ccdea288c3b1f3b3d1ce5992d284b0 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 1 Apr 2025 14:30:06 -0400
Subject: [PATCH 0290/1029] [SLP]Support revectorization of the previously
 vectorized scalars

If a scalar instruction is marked for vectorization in the tree, it
cannot, in general, be vectorized again as part of another node in the
same tree. This may prevent some potentially profitable vectorization
opportunities, since some nodes end up as buildvector/gather nodes,
which add to the total cost. This patch allows revectorization of
previously vectorized scalars.
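As a rough illustration, consider a reduced, hypothetical example (it
is not one of the tests in this patch): the loads %a and %b are
vectorized as part of the fmul node, and the alternate fadd/fsub node
wants to reuse the same scalars as one of its operands.

define void @reuse(ptr %p, ptr %q) {
  %a = load float, ptr %p
  %p1 = getelementptr inbounds float, ptr %p, i64 1
  %b = load float, ptr %p1
  ; First user: <%a, %b> forms a vectorizable <2 x float> operand here.
  %m0 = fmul float %a, 3.0
  %m1 = fmul float %b, 3.0
  ; Second user: an alternate fadd/fsub node reusing the same scalars.
  ; Previously the reused operand ended up as a gather; with this patch
  ; it may be considered for revectorization instead.
  %s0 = fadd float %a, 1.0
  %s1 = fsub float %b, 1.0
  store float %m0, ptr %q
  %q1 = getelementptr inbounds float, ptr %q, i64 1
  store float %m1, ptr %q1
  %q2 = getelementptr inbounds float, ptr %q, i64 2
  store float %s0, ptr %q2
  %q3 = getelementptr inbounds float, ptr %q, i64 3
  store float %s1, ptr %q3
  ret void
}

Whether the reuse is actually kept vectorized is decided by the rough
cost comparison added below (two-source shuffle plus extracts versus
buildvector), so the example only sketches the shape of the opportunity.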
Reviewers: hiraditya, RKSimon Reviewed By: RKSimon, hiraditya Pull Request: https://github.com/llvm/llvm-project/pull/133091 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 217 +++++++++++------- .../AArch64/reused-scalar-repeated-in-node.ll | 18 +- .../AArch64/transpose-inseltpoison.ll | 21 +- .../SLPVectorizer/AArch64/transpose.ll | 21 +- .../AArch64/vec3-reorder-reshuffle.ll | 4 +- .../X86/cast-operand-extracted.ll | 10 +- .../X86/scatter-vectorize-reorder.ll | 7 +- .../X86/vec3-reorder-reshuffle.ll | 4 +- 8 files changed, 176 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 18c896767b6d2..d282105135566 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3756,11 +3756,6 @@ class BoUpSLP { if (isa(V)) continue; auto It = ScalarToTreeEntries.find(V); - assert( - (It == ScalarToTreeEntries.end() || - (It->getSecond().size() == 1 && It->getSecond().front() == Last) || - doesNotNeedToBeScheduled(V)) && - "Scalar already in tree!"); if (It == ScalarToTreeEntries.end()) { ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last); (void)Processed.insert(V); @@ -4020,6 +4015,9 @@ class BoUpSLP { private: /// Used for getting a "good" final ordering of instructions. int SchedulingPriority = 0; + /// True if this instruction (or bundle) is scheduled (or considered as + /// scheduled in the dry-run). + bool IsScheduled = false; /// The kind of the ScheduleEntity. const Kind K = Kind::ScheduleData; @@ -4033,6 +4031,10 @@ class BoUpSLP { return SD->isReady(); return cast(this)->isReady(); } + /// Gets/sets if the bundle is scheduled. + bool isScheduled() const { return IsScheduled; } + void setScheduled(bool Scheduled) { IsScheduled = Scheduled; } + static bool classof(const ScheduleEntity *) { return true; } }; @@ -4105,10 +4107,6 @@ class BoUpSLP { IsScheduled = false; } - /// Gets/sets if the bundle is scheduled. - bool isScheduled() const { return IsScheduled; } - void setScheduled(bool Scheduled) { IsScheduled = Scheduled; } - /// Gets the number of unscheduled dependencies. int getUnscheduledDeps() const { return UnscheduledDeps; } /// Gets the number of dependencies. @@ -4183,10 +4181,6 @@ class BoUpSLP { /// for scheduling. /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps = InvalidDeps; - - /// True if this instruction is scheduled (or considered as scheduled in the - /// dry-run). - bool IsScheduled = false; }; #ifndef NDEBUG @@ -4231,11 +4225,6 @@ class BoUpSLP { } } - bool isScheduled() const { - return all_of(Bundle, - [](const ScheduleData *SD) { return SD->isScheduled(); }); - } - /// Returns the number of unscheduled dependencies in the bundle. 
int unscheduledDepsInBundle() const { assert(*this && "bundle must not be empty"); @@ -4492,12 +4481,22 @@ class BoUpSLP { ProcessBundleMember(SD, nullptr); } else { ScheduleBundle &Bundle = *cast(Data); - for_each(Bundle.getBundle(), [](ScheduleData *SD) { - SD->setScheduled(/*Scheduled=*/true); - }); + Bundle.setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n"); - for (ScheduleData *SD : Bundle.getBundle()) - ProcessBundleMember(SD, &Bundle); + auto AreAllBundlesScheduled = [&](const ScheduleData *SD) { + ArrayRef SDBundles = + getScheduleBundles(SD->getInst()); + return !SDBundles.empty() && + all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { + return SDBundle->isScheduled(); + }); + }; + for (ScheduleData *SD : Bundle.getBundle()) { + if (AreAllBundlesScheduled(SD)) { + SD->setScheduled(/*Scheduled=*/true); + ProcessBundleMember(SD, &Bundle); + } + } } } @@ -4528,10 +4527,11 @@ class BoUpSLP { SD->verify(); } - for (const ScheduleEntity *Bundle : ReadyInsts) { - assert(Bundle->isReady() && "item in ready list not ready?"); - (void)Bundle; - } + assert(all_of(ReadyInsts, + [](const ScheduleEntity *Bundle) { + return Bundle->isReady(); + }) && + "item in ready list not ready?"); } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -7228,7 +7228,7 @@ void BoUpSLP::buildExternalUses( // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in FoundLane will // be used. - if (any_of(UseEntries, [&](TreeEntry *UseEntry) { + if (all_of(UseEntries, [&](TreeEntry *UseEntry) { return UseEntry->State == TreeEntry::ScatterVectorize || !doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, @@ -9246,14 +9246,47 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // We now know that this is a vector of instructions of the same type from // the same block. - // Check that none of the instructions in the bundle are already in the tree. - for (Value *V : VL) { - if ((!IsScatterVectorizeUserTE && !isa(V)) || - doesNotNeedToBeScheduled(V)) - continue; - if (isVectorized(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is already in tree.\n"); + // Check that none of the instructions in the bundle are already in the tree + // and the node may be not profitable for the vectorization as the small + // alternate node. + if (S && S.isAltShuffle()) { + auto GetNumVectorizedExtracted = [&]() { + APInt Extracted = APInt::getZero(VL.size()); + APInt Vectorized = APInt::getAllOnes(VL.size()); + for (auto [Idx, V] : enumerate(VL)) { + auto *I = dyn_cast(V); + if (!I || doesNotNeedToBeScheduled(I) || + all_of(I->operands(), [&](const Use &U) { + return isa(U.get()); + })) + continue; + if (isVectorized(I)) + Vectorized.clearBit(Idx); + else if (!I->hasOneUser() && !areAllUsersVectorized(I, UserIgnoreList)) + Extracted.setBit(Idx); + } + return std::make_pair(Vectorized, Extracted); + }; + auto [Vectorized, Extracted] = GetNumVectorizedExtracted(); + constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; + bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2; + if (!Vectorized.isAllOnes() && !PreferScalarize) { + // Rough cost estimation, if the vector code (+ potential extracts) is + // more profitable than the scalar + buildvector. 
+ Type *ScalarTy = VL.front()->getType(); + auto *VecTy = getWidenedType(ScalarTy, VL.size()); + InstructionCost VectorizeCostEstimate = + ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) + + ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted, + /*Insert=*/false, /*Extract=*/true, Kind); + InstructionCost ScalarizeCostEstimate = + ::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Vectorized, + /*Insert=*/true, /*Extract=*/false, Kind); + PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate; + } + if (PreferScalarize) { + LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate " + "node is not profitable.\n"); if (TryToFindDuplicates(S)) { auto Invalid = ScheduleBundle::invalid(); newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx, @@ -9342,8 +9375,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, #endif if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || BS.getScheduleBundles(VL0).empty()) && - "tryScheduleBundle should not create bundle on failure"); // Last chance to try to vectorize alternate node. if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(SmallNodeSize, S)) @@ -12120,7 +12151,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallBitVector UsedScalars(Sz, false); for (unsigned I = 0; I < Sz; ++I) { if (isa(UniqueValues[I]) && - is_contained(getTreeEntries(UniqueValues[I]), E)) + getTreeEntries(UniqueValues[I]).front() == E) continue; UsedScalars.set(I); } @@ -13641,6 +13672,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, for (ExternalUser &EU : ExternalUses) { ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane); } + SmallDenseSet, 8> CheckedScalarUser; for (ExternalUser &EU : ExternalUses) { // Uses by ephemeral values are free (because the ephemeral value will be // removed prior to code generation, and so the extraction will be @@ -13648,6 +13680,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, if (EphValues.count(EU.User)) continue; + // Check if the scalar for the given user or all users is accounted already. + if (!CheckedScalarUser.insert(std::make_pair(EU.Scalar, EU.User)).second || + (EU.User && + CheckedScalarUser.contains(std::make_pair(EU.Scalar, nullptr)))) + continue; + // Used in unreachable blocks or in EH pads (rarely executed) or is // terminated with unreachable instruction. if (BasicBlock *UserParent = @@ -14350,10 +14388,16 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize ? dyn_cast(UseEI.UserTE->getMainOp()) : nullptr; - const Instruction *InsertPt = + Instruction *InsertPt = UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() : &getLastInstructionInBundle(UseEI.UserTE); if (TEInsertPt == InsertPt) { + // If the schedulable insertion point is used in multiple entries - just + // exit, no known ordering at this point, available only after real + // scheduling. + if (!doesNotNeedToBeScheduled(InsertPt) && + (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx)) + continue; // If the users are the PHI nodes with the same incoming blocks - skip. 
if (TEUseEI.UserTE->State == TreeEntry::Vectorize && TEUseEI.UserTE->getOpcode() == Instruction::PHI && @@ -15065,19 +15109,29 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // Set the insert point to the beginning of the basic block if the entry // should not be scheduled. - const auto *It = BlocksSchedules.find(BB); - auto IsNotScheduledEntry = [&](const TreeEntry *E) { + auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * { if (E->isGather()) - return false; + return nullptr; // Found previously that the instruction do not need to be scheduled. - return It == BlocksSchedules.end() || all_of(E->Scalars, [&](Value *V) { - if (!isa(V)) - return true; - return It->second->getScheduleBundles(V).empty(); - }); + const auto *It = BlocksSchedules.find(BB); + if (It == BlocksSchedules.end()) + return nullptr; + for (Value *V : E->Scalars) { + auto *I = dyn_cast(V); + if (!I || isa(I) || doesNotNeedToBeScheduled(I)) + continue; + ArrayRef Bundles = It->second->getScheduleBundles(I); + if (Bundles.empty()) + continue; + const auto *It = find_if( + Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; }); + if (It != Bundles.end()) + return *It; + } + return nullptr; }; - if (IsNotScheduledEntry(E) || - (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) { + const ScheduleBundle *Bundle = FindScheduleBundle(E); + if (!E->isGather() && !Bundle) { if ((E->getOpcode() == Instruction::GetElementPtr && any_of(E->Scalars, [](Value *V) { @@ -15103,19 +15157,10 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { // scheduled, and the last instruction is VL.back(). So we start with // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. - if (It != BlocksSchedules.end() && !E->isGather()) { - Value *V = E->isOneOf(E->Scalars.back()); - if (doesNotNeedToBeScheduled(V)) - V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); - if (ArrayRef Bundles = It->second->getScheduleBundles(V); - !Bundles.empty()) { - const auto *It = find_if( - Bundles, [&](ScheduleBundle *B) { return B->getTreeEntry() == E; }); - assert(It != Bundles.end() && "Failed to find bundle"); - Res = (*It)->getBundle().back()->getInst(); - return *Res; - } - assert(E->getOpcode() == Instruction::PHI && "Expected PHI"); + if (Bundle) { + assert(!E->isGather() && "Gathered instructions should not be scheduled"); + Res = Bundle->getBundle().back()->getInst(); + return *Res; } // LastInst can still be null at this point if there's either not an entry @@ -15868,10 +15913,10 @@ BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx, const InstructionsState &S) { if (!S) return nullptr; - if (TreeEntry *VE = getSameValuesTreeEntry(S.getMainOp(), VL); - VE && VE->UserTreeIndex.UserTE == E && - VE->UserTreeIndex.EdgeIdx == NodeIdx) - return VE; + for (TreeEntry *TE : ScalarToTreeEntries.lookup(S.getMainOp())) + if (TE->UserTreeIndex.UserTE == E && TE->UserTreeIndex.EdgeIdx == NodeIdx && + TE->isSame(VL)) + return TE; return nullptr; } @@ -17521,13 +17566,13 @@ Value *BoUpSLP::vectorizeTree( const ExtraValueToDebugLocsMap &ExternallyUsedValues, Instruction *ReductionRoot, ArrayRef> VectorValuesAndScales) { + // Clean Entry-to-LastInstruction table. It can be affected after scheduling, + // need to rebuild it. + EntryToLastInstruction.clear(); // All blocks must be scheduled before any instructions are inserted. 
for (auto &BSIter : BlocksSchedules) { scheduleBlock(BSIter.second.get()); } - // Clean Entry-to-LastInstruction table. It can be affected after scheduling, - // need to rebuild it. - EntryToLastInstruction.clear(); if (ReductionRoot) Builder.SetInsertPoint(ReductionRoot->getParent(), @@ -18366,18 +18411,10 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, // dependencies. As soon as the bundle is "ready" it means that there are no // cyclic dependencies and we can schedule it. Note that's important that we // don't "schedule" the bundle yet. - SmallPtrSet Visited; while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) && !ReadyInsts.empty()) { ScheduleEntity *Picked = ReadyInsts.pop_back_val(); - const auto *PickedBundle = dyn_cast(Picked); - if (PickedBundle && !Visited.insert(PickedBundle).second) { - assert(PickedBundle->isScheduled() && "bundle must be scheduled"); - continue; - } - assert((PickedBundle ? PickedBundle->isReady() - : cast(Picked)->isReady()) && - "must be ready to schedule"); + assert(Picked->isReady() && "must be ready to schedule"); schedule(Picked, ReadyInsts); if (Picked == &Bundle) break; @@ -18431,8 +18468,16 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle.isReady()) { for (ScheduleData *BD : Bundle.getBundle()) { - if (BD->isReady()) - ReadyInsts.insert(BD); + if (BD->isReady()) { + ArrayRef Bundles = getScheduleBundles(BD->getInst()); + if (Bundles.empty()) { + ReadyInsts.insert(BD); + continue; + } + for (ScheduleBundle *B : Bundles) + if (B->isReady()) + ReadyInsts.insert(B); + } } ScheduledBundlesList.pop_back(); for (Value *V : VL) { @@ -18763,6 +18808,11 @@ void BoUpSLP::BlockScheduling::resetSchedule() { SD->setScheduled(/*Scheduled=*/false); SD->resetUnscheduledDeps(); } + for (ScheduleBundle *Bundle : getScheduleBundles(I)) { + assert(isInSchedulingRegion(*Bundle) && + "ScheduleBundle not in scheduling region"); + Bundle->setScheduled(/*Scheduled=*/false); + } } ReadyInsts.clear(); } @@ -18821,6 +18871,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { Instruction *LastScheduledInst = BS->ScheduleEnd; // Do the "real" scheduling. 
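+  // A ScheduleData can belong to several bundles, so remember which
+  // instructions have already been placed and move each one at most once.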
+ SmallPtrSet Scheduled; while (!ReadyInsts.empty()) { auto *Picked = *ReadyInsts.begin(); ReadyInsts.erase(ReadyInsts.begin()); @@ -18830,10 +18881,14 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (auto *Bundle = dyn_cast(Picked)) { for (const ScheduleData *BundleMember : Bundle->getBundle()) { Instruction *PickedInst = BundleMember->getInst(); + if (!Scheduled.insert(PickedInst).second) + continue; if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) PickedInst->moveAfter(LastScheduledInst->getPrevNode()); LastScheduledInst = PickedInst; } + EntryToLastInstruction.try_emplace(Bundle->getTreeEntry(), + LastScheduledInst); } else { auto *SD = cast(Picked); Instruction *PickedInst = SD->getInst(); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 3cab4a4da3f8e..fcd3bfc3f323a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -39,28 +39,26 @@ define void @test() { ; CHECK: [[BB77]]: ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP12]], float [[I70]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x float> [[TMP30]], float [[I66]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7 ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: ; CHECK-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP17]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x float> [ [[TMP31]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP20]], <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <16 x float> [[TMP23]], <16 x float> [[TMP22]], <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x float> [[TMP40]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]] ; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], 
[[TMP18]] ; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison ; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison ; CHECK-NEXT: [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP37]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: ; CHECK-NEXT: [[TMP32:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP29]], %[[BB78]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll index f79db7d7ad0cb..ab6c7443f80e8 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -123,18 +123,17 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1 -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP10]] ; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll index 1330e5557e559..3063d85e122d8 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -123,18 +123,17 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x 
i32> poison, <4 x i32> -; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i64 1 -; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP10]] ; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 2d94babb56874..47153d91956d5 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -206,15 +206,15 @@ define i32 @reorder_indices_1(float %0) { ; POW2-ONLY-SAME: float [[TMP0:%.*]]) { ; POW2-ONLY-NEXT: entry: ; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr float, ptr [[NOR1]], i64 1 ; POW2-ONLY-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 +; POW2-ONLY-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX_I]], align 4 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 ; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; POW2-ONLY-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> -; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll index 860d0ed29332c..fa46bd3d83249 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll @@ -8,19 +8,19 @@ define void @test(ptr %0, i32 %add651) { ; CHECK-NEXT: [[PREDPEL11:%.*]] = alloca [0 x [0 x [25 x i32]]], i32 0, align 16 ; CHECK-NEXT: [[ARRAYIDX469_6:%.*]] = getelementptr i8, ptr [[PREDPEL11]], i64 28 ; CHECK-NEXT: [[ARRAYIDX469_7:%.*]] = getelementptr i8, ptr [[PREDPEL11]], i64 32 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PREDPEL11]], i64 36 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX469_7]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PREDPEL11]], i64 40 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX469_6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX469_7]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX469_7]], align 16 ; CHECK-NEXT: [[CONV470_7:%.*]] = trunc i32 [[TMP2]] to i16 ; CHECK-NEXT: store i16 [[CONV470_7]], ptr [[TMP0]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[ARRAYIDX660:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7800 ; CHECK-NEXT: [[ARRAYIDX689:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7816 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP5]], <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> , <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[TMP5]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[ADD651]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index f875d45db61dd..533b0df21e160 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -8,14 +8,13 @@ define void @test() { ; CHECK-NEXT: [[ARRAYIDX21_I:%.*]] = getelementptr inbounds [4 x float], ptr undef, i64 2 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr undef, align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr undef, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP3]], <2 x float> [[TMP6]]) ; CHECK-NEXT: br i1 
false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 22a59d3da52a6..36151df96bfca 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -205,15 +205,15 @@ define i32 @reorder_indices_1(float %0) { ; POW2-ONLY-SAME: float [[TMP0:%.*]]) { ; POW2-ONLY-NEXT: entry: ; POW2-ONLY-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 +; POW2-ONLY-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr float, ptr [[NOR1]], i64 1 ; POW2-ONLY-NEXT: [[ARRAYIDX2_I265:%.*]] = getelementptr float, ptr [[NOR1]], i64 2 ; POW2-ONLY-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2_I265]], align 4 +; POW2-ONLY-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX_I]], align 4 ; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[NOR1]], align 4 ; POW2-ONLY-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 ; POW2-ONLY-NEXT: [[TMP4:%.*]] = fneg float [[TMP3]] ; POW2-ONLY-NEXT: [[NEG11_I:%.*]] = fmul float [[TMP4]], [[TMP0]] ; POW2-ONLY-NEXT: [[TMP5:%.*]] = call float @llvm.fmuladd.f32(float [[TMP1]], float 0.000000e+00, float [[NEG11_I]]) -; POW2-ONLY-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> -; POW2-ONLY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 ; POW2-ONLY-NEXT: [[TMP8:%.*]] = fneg <2 x float> [[TMP7]] ; POW2-ONLY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 ; POW2-ONLY-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> zeroinitializer From 1407f5bee9aa8e2a8a4fcab63ab0a3030a8b0dcf Mon Sep 17 00:00:00 2001 From: Max191 <44243577+Max191@users.noreply.github.com> Date: Tue, 1 Apr 2025 14:51:58 -0400 Subject: [PATCH 0291/1029] [mlir] Canonicalize extract_slice(unpack) (#133777) Canonicalizes a chain of `linalg.unpack -> tensor.extract_slice` into a `linalg.unpack` with reduced dest sizes. This will only happen when the unpack op's only user is a non rank-reducing slice with zero offset and unit strides. 
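For illustration, a minimal sketch of the rewrite (the shapes below are
hypothetical and chosen only to show the folding):

```mlir
// Before: the slice is taken out of the fully unpacked result.
%unpack = linalg.unpack %src outer_dims_perm = [0, 1]
    inner_dims_pos = [1] inner_tiles = [16]
    into %dest : tensor<4x2x16xf32> -> tensor<4x32xf32>
%slice = tensor.extract_slice %unpack
    [0, 0] [4, 30] [1, 1] : tensor<4x32xf32> to tensor<4x30xf32>

// After: the unpack writes directly into a slice of the original dest.
%dest_slice = tensor.extract_slice %dest
    [0, 0] [4, 30] [1, 1] : tensor<4x32xf32> to tensor<4x30xf32>
%unpack_small = linalg.unpack %src outer_dims_perm = [0, 1]
    inner_dims_pos = [1] inner_tiles = [16]
    into %dest_slice : tensor<4x2x16xf32> -> tensor<4x30xf32>
```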
--------- Signed-off-by: Max Dawkins Signed-off-by: Max Dawkins Co-authored-by: Max Dawkins --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 24 +++++++ mlir/test/Dialect/Linalg/canonicalize.mlir | 75 ++++++++++++++++++++++ 2 files changed, 99 insertions(+) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index ff89ead59981c..d6b093c5fb86b 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -29,6 +29,7 @@ #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" @@ -5243,6 +5244,29 @@ LogicalResult UnPackOp::canonicalize(UnPackOp unPackOp, [&]() { unPackOp.setDpsInitOperand(0, newDest); }); return success(); } + /// extract_slice(unpack(x into y)) -> unpack(x into extract_slice(y)) + if (unPackOp->hasOneUse()) { + auto extractSliceUser = + dyn_cast(*unPackOp->getUsers().begin()); + if (extractSliceUser && + areAllConstantIntValue(extractSliceUser.getMixedOffsets(), 0) && + areAllConstantIntValue(extractSliceUser.getMixedStrides(), 1) && + extractSliceUser.getSourceType().getRank() == + extractSliceUser.getResultType().getRank()) { + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(unPackOp); + auto newDest = rewriter.create( + unPackOp->getLoc(), unPackOp.getDest(), + extractSliceUser.getMixedOffsets(), extractSliceUser.getMixedSizes(), + extractSliceUser.getMixedStrides()); + rewriter.modifyOpInPlace(unPackOp, [&]() { + unPackOp.setDpsInitOperand(0, newDest); + unPackOp.getResult().setType(newDest.getType()); + }); + rewriter.replaceOp(extractSliceUser, unPackOp); + return success(); + } + } // Insert tensor.cast ops if static shape inference is available.. 
SmallVector srcShape, destShape; diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index f99491c25d832..86cb8f58abe02 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -1772,3 +1772,78 @@ func.func @fold_cast_unpack_dynamic_tile_size( into %res {test_attr} : tensor<1x1x?x1xi32> -> tensor<7x?xi32> return %unpack : tensor<7x?xi32> } + +// ----- + +//===----------------------------------------------------------------------===// +// linalg.unpack + tensor.extract_slice +//===----------------------------------------------------------------------===// + +func.func @fold_extract_slice_into_unpack( + %src : tensor<28x2x?x16x16xf32>, %dest : tensor<28x32x?xf32>, %size : index +) -> tensor<28x28x?xf32> { + %unpack = linalg.unpack %src + outer_dims_perm = [0, 1, 2] + inner_dims_pos = [1, 2] + inner_tiles = [16, 16] + into %dest : tensor<28x2x?x16x16xf32> -> tensor<28x32x?xf32> + %extracted_slice = tensor.extract_slice %unpack + [0, 0, 0] [28, 28, %size] [1, 1, 1] : tensor<28x32x?xf32> to tensor<28x28x?xf32> + return %extracted_slice : tensor<28x28x?xf32> +} + +// CHECK-LABEL: func @fold_extract_slice_into_unpack +// CHECK-SAME: %[[SRC:.+]]: tensor<28x2x?x16x16xf32> +// CHECK-SAME: %[[DEST:.+]]: tensor<28x32x?xf32> +// CHECK-SAME: %[[SIZE:.+]]: index +// CHECK: %[[DEST_SLICE:.+]] = tensor.extract_slice %[[DEST]] +// CHECK-SAME: [0, 0, 0] [28, 28, %[[SIZE]]] [1, 1, 1] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SRC]] +// CHECK-SAME: into %[[DEST_SLICE]] +// CHECK: return %[[UNPACK]] + +// ----- + +func.func @no_fold_extract_slice_into_unpack_rank_reducing( + %src : tensor<28x2x16xf32>, %dest : tensor<28x32xf32> +) -> tensor<28xf32> { + %unpack = linalg.unpack %src + outer_dims_perm = [0, 1] + inner_dims_pos = [1] + inner_tiles = [16] + into %dest : tensor<28x2x16xf32> -> tensor<28x32xf32> + %extracted_slice = tensor.extract_slice %unpack + [0, 0] [1, 28] [1, 1] : tensor<28x32xf32> to tensor<28xf32> + return %extracted_slice : tensor<28xf32> +} + +// CHECK-LABEL: func @no_fold_extract_slice_into_unpack_rank_reducing +// CHECK-SAME: %[[SRC:.+]]: tensor<28x2x16xf32> +// CHECK-SAME: %[[DEST:.+]]: tensor<28x32xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SRC]] +// CHECK-SAME: into %[[DEST]] +// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[UNPACK]] +// CHECK: return %[[SLICE]] + +// ----- + +func.func @no_fold_extract_slice_into_unpack_non_zero_offset( + %src : tensor<28x2x16xf32>, %dest : tensor<28x32xf32> +) -> tensor<28x28xf32> { + %unpack = linalg.unpack %src + outer_dims_perm = [0, 1] + inner_dims_pos = [1] + inner_tiles = [16] + into %dest : tensor<28x2x16xf32> -> tensor<28x32xf32> + %extracted_slice = tensor.extract_slice %unpack + [0, 1] [28, 28] [1, 1] : tensor<28x32xf32> to tensor<28x28xf32> + return %extracted_slice : tensor<28x28xf32> +} + +// CHECK-LABEL: func @no_fold_extract_slice_into_unpack_non_zero_offset +// CHECK-SAME: %[[SRC:.+]]: tensor<28x2x16xf32> +// CHECK-SAME: %[[DEST:.+]]: tensor<28x32xf32> +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[SRC]] +// CHECK-SAME: into %[[DEST]] +// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[UNPACK]] +// CHECK: return %[[SLICE]] From ac0649a75a60743faa16b77396db617587d45fea Mon Sep 17 00:00:00 2001 From: Mark Danial Date: Tue, 1 Apr 2025 15:47:19 -0400 Subject: [PATCH 0292/1029] [OpenMP] [AIX] Add missing } in openmp/runtime/src/z_Linux_util.cpp (#133973) Changes from https://github.com/llvm/llvm-project/pull/133034 removed a `}` 
presumably accidentally, which is causing failures in the AIX flang bot.
---
 openmp/runtime/src/z_Linux_util.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp
index 0c89f2346750f..87f2711186420 100644
--- a/openmp/runtime/src/z_Linux_util.cpp
+++ b/openmp/runtime/src/z_Linux_util.cpp
@@ -2520,6 +2520,8 @@ int __kmp_get_load_balance(int max) {
   glb_running_threads = running_threads;

   return running_threads;
+}
+
 #elif KMP_OS_HAIKU

 int __kmp_get_load_balance(int max) { return -1; }

From 23fb048ce35f672d8db3f466a2522354bbce66e5 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Tue, 1 Apr 2025 12:55:07 -0700
Subject: [PATCH 0293/1029] [CI] Fix Monolithic Linux Build in Ubuntu 24.04
 (#133628)

This patch fixes the monolithic Linux build in Ubuntu 24.04. Newer versions
of Debian/Ubuntu emit a warning when installing packages at the system level
using pip, as this can interfere with Python packages installed by the
system package manager. We do not use any Python packages installed by the
system package manager, so we simply ignore the warning (which becomes an
error unless the flag is passed) by passing the --break-system-packages
flag.
---
 .ci/monolithic-linux.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 4b6e56b4a4eda..ec7a85bc5f15f 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -53,9 +53,9 @@ targets="${2}"
 lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests"

 echo "--- cmake"
-pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt
-pip install -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt
-pip install -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt
+pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt
+pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt
+pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt
 cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -D LLVM_ENABLE_PROJECTS="${projects}" \
       -G Ninja \

From 4a73c99329ed36712a6faa8e37aa895cfeac23ce Mon Sep 17 00:00:00 2001
From: Qiongsi Wu
Date: Tue, 1 Apr 2025 12:55:17 -0700
Subject: [PATCH 0294/1029] [clang][Modules] Fix the Size of `RecordDecl`'s
 `BitCodeAbbrevOp` (#133500)

https://github.com/llvm/llvm-project/pull/102040/files#diff-125f472e690aa3d973bc42aa3c5d580226c5c47661551aca2889f960681aa64dR2477
added 1 bit to `RecordDecl`'s serialization format, but did not increment
its abbreviation size. This can lead to rare cases where a record may
overflow if the `RecordDecl`'s `getArgPassingRestrictions()` returns
something bigger than 1 (see
[here](https://github.com/llvm/llvm-project/blob/b3f01a6aa45b00240cec1c64286b85d7ba87e2af/clang/lib/Serialization/ASTWriterDecl.cpp#L688)).
rdar://143763558
---
 clang/lib/Serialization/ASTWriterDecl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index a14b8cf201bba..f377c145a4204 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -2579,7 +2579,7 @@ void ASTWriter::WriteDeclAbbrevs() {
   // RecordDecl
   Abv->Add(BitCodeAbbrevOp(
       BitCodeAbbrevOp::Fixed,
-      13)); // Packed Record Decl Bits: FlexibleArrayMember,
+      14)); // Packed Record Decl Bits: FlexibleArrayMember,
             // AnonymousStructUnion, hasObjectMember, hasVolatileMember,
             // isNonTrivialToPrimitiveDefaultInitialize,
             // isNonTrivialToPrimitiveCopy, isNonTrivialToPrimitiveDestroy,

From 782e0cef762c1346396eb7dd75462f842be350e3 Mon Sep 17 00:00:00 2001
From: David Peixotto
Date: Tue, 1 Apr 2025 12:55:41 -0700
Subject: [PATCH 0295/1029] [lldb] Fix intel trace plugin tests (#133826)

The tests for the [intel-pt](https://github.com/llvm/llvm-project/blob/348374028970c956f2e49ab7553b495d7408ccd9/lldb/docs/use/intel_pt.rst)
trace plugin were failing for multiple reasons.

On machines where tracing is supported, many of the tests were crashing
because of a nullptr dereference. It looks like the `core_file` parameter
in `ProcessTrace::CreateInstance` was once ignored, but was changed to
always be dereferenced. This caused the tests to fail even when tracing
was supported.

On machines where tracing is not supported, we would still run tests that
attempt to take a trace. These would obviously fail because the required
hardware is not present. Note that some of the tests simply read
serialized JSON as trace files, which does not require any special
hardware.

This PR fixes these two issues by guarding the pointer dereference and by
skipping the unsupported tests on machines without tracing support. With
these changes the trace tests pass on both types of machines.

We also add a new unit test to validate that a process can be created with
a nullptr core_file through the generic process trace plugin path.
---
 .../test/tools/intelpt/intelpt_testcase.py    |  7 +++
 lldb/source/Target/ProcessTrace.cpp           |  3 +-
 .../trace/TestTraceDumpFunctionCalls.py       |  2 +
 .../API/commands/trace/TestTraceEvents.py     |  1 +
 lldb/test/API/commands/trace/TestTraceSave.py |  2 +
 .../API/commands/trace/TestTraceStartStop.py  |  1 +
 lldb/test/API/commands/trace/TestTraceTSC.py  |  1 +
 .../TestTraceStartStopMultipleThreads.py      |  1 +
 lldb/unittests/Process/CMakeLists.txt         |  4 +-
 lldb/unittests/Process/ProcessTraceTest.cpp   | 63 +++++++++++++++++++
 10 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 lldb/unittests/Process/ProcessTraceTest.cpp

diff --git a/lldb/packages/Python/lldbsuite/test/tools/intelpt/intelpt_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/intelpt/intelpt_testcase.py
index f1b7d7c33bf07..6e1d7e38f3a0f 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/intelpt/intelpt_testcase.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/intelpt/intelpt_testcase.py
@@ -18,6 +18,13 @@ def wrapper(*args, **kwargs):

     return wrapper


+def skipIfNoIntelPT(func):
+    """Skip tests if the system does not support tracing."""
+
+    supported = os.path.exists("/sys/bus/event_source/devices/intel_pt/type")
+    return unittest.skipIf(not supported, "intel-pt tracing is unsupported")(func)
+
+
 # Class that should be used by all python Intel PT tests.
 #
 # It has a handy check that skips the test if the intel-pt plugin is not enabled.
diff --git a/lldb/source/Target/ProcessTrace.cpp b/lldb/source/Target/ProcessTrace.cpp index f131339905474..02272b1651da5 100644 --- a/lldb/source/Target/ProcessTrace.cpp +++ b/lldb/source/Target/ProcessTrace.cpp @@ -36,7 +36,8 @@ ProcessSP ProcessTrace::CreateInstance(TargetSP target_sp, bool can_connect) { if (can_connect) return nullptr; - return std::make_shared(target_sp, listener_sp, *crash_file); + return std::make_shared(target_sp, listener_sp, + crash_file ? *crash_file : FileSpec()); } bool ProcessTrace::CanDebug(TargetSP target_sp, bool plugin_specified_by_name) { diff --git a/lldb/test/API/commands/trace/TestTraceDumpFunctionCalls.py b/lldb/test/API/commands/trace/TestTraceDumpFunctionCalls.py index 761c262ae4de0..ebfda6226eef0 100644 --- a/lldb/test/API/commands/trace/TestTraceDumpFunctionCalls.py +++ b/lldb/test/API/commands/trace/TestTraceDumpFunctionCalls.py @@ -133,6 +133,7 @@ def testFunctionCallsWithErrors(self): ], ) + @skipIfNoIntelPT def testInlineFunctionCalls(self): self.expect( "file " + os.path.join(self.getSourceDir(), "inline-function", "a.out") @@ -194,6 +195,7 @@ def testInlineFunctionCalls(self): ], ) + @skipIfNoIntelPT def testIncompleteInlineFunctionCalls(self): self.expect( "file " + os.path.join(self.getSourceDir(), "inline-function", "a.out") diff --git a/lldb/test/API/commands/trace/TestTraceEvents.py b/lldb/test/API/commands/trace/TestTraceEvents.py index c20bcc247105b..af23907bb6019 100644 --- a/lldb/test/API/commands/trace/TestTraceEvents.py +++ b/lldb/test/API/commands/trace/TestTraceEvents.py @@ -45,6 +45,7 @@ def testCPUEvents(self): ], ) + @skipIfNoIntelPT @testSBAPIAndCommands def testPauseEvents(self): """ diff --git a/lldb/test/API/commands/trace/TestTraceSave.py b/lldb/test/API/commands/trace/TestTraceSave.py index af38669cb4fce..4e3c70695bcee 100644 --- a/lldb/test/API/commands/trace/TestTraceSave.py +++ b/lldb/test/API/commands/trace/TestTraceSave.py @@ -43,6 +43,7 @@ def testErrorMessages(self): "trace save", substrs=["error: Process is not being traced"], error=True ) + @skipIfNoIntelPT def testSaveToInvalidDir(self): self.expect( "target create " @@ -165,6 +166,7 @@ def checkSessionBundle(session_file_path): copied_cpu = find(lambda cor: cor["id"] == cpu["id"], copy["cpus"]) self.assertIsNotNone(copied_cpu) + @skipIfNoIntelPT def testSaveTrace(self): self.expect( "target create " diff --git a/lldb/test/API/commands/trace/TestTraceStartStop.py b/lldb/test/API/commands/trace/TestTraceStartStop.py index 5add321b4c83f..9450f8b0961a8 100644 --- a/lldb/test/API/commands/trace/TestTraceStartStop.py +++ b/lldb/test/API/commands/trace/TestTraceStartStop.py @@ -5,6 +5,7 @@ from lldbsuite.test.decorators import * +@skipIfNoIntelPT class TestTraceStartStop(TraceIntelPTTestCaseBase): def expectGenericHelpMessageForStartCommand(self): self.expect( diff --git a/lldb/test/API/commands/trace/TestTraceTSC.py b/lldb/test/API/commands/trace/TestTraceTSC.py index 4a19065e60c2b..b20ba5255549c 100644 --- a/lldb/test/API/commands/trace/TestTraceTSC.py +++ b/lldb/test/API/commands/trace/TestTraceTSC.py @@ -5,6 +5,7 @@ from lldbsuite.test.decorators import * +@skipIfNoIntelPT class TestTraceTimestampCounters(TraceIntelPTTestCaseBase): @testSBAPIAndCommands @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"])) diff --git a/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py b/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py index 12f99f07c78a8..017f5c845c7c6 100644 --- 
a/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py
+++ b/lldb/test/API/commands/trace/multiple-threads/TestTraceStartStopMultipleThreads.py
@@ -6,6 +6,7 @@
 from lldbsuite.test.decorators import *


+@skipIfNoIntelPT
 class TestTraceStartStopMultipleThreads(TraceIntelPTTestCaseBase):
     @skipIf(oslist=no_match(["linux"]), archs=no_match(["i386", "x86_64"]))
     @testSBAPIAndCommands
diff --git a/lldb/unittests/Process/CMakeLists.txt b/lldb/unittests/Process/CMakeLists.txt
index d3b37e006fd89..a240d773c3f30 100644
--- a/lldb/unittests/Process/CMakeLists.txt
+++ b/lldb/unittests/Process/CMakeLists.txt
@@ -7,8 +7,9 @@ endif()
 add_subdirectory(Utility)
 add_subdirectory(minidump)

-add_lldb_unittest(ProcessEventDataTests
+add_lldb_unittest(ProcessTests
   ProcessEventDataTest.cpp
+  ProcessTraceTest.cpp

   LINK_LIBS
     lldbCore
@@ -18,5 +19,6 @@ add_lldb_unittest(ProcessEventDataTests
     lldbUtility
     lldbUtilityHelpers
     lldbInterpreter
+    lldbPluginPlatformLinux
     lldbPluginPlatformMacOSX
   )
diff --git a/lldb/unittests/Process/ProcessTraceTest.cpp b/lldb/unittests/Process/ProcessTraceTest.cpp
new file mode 100644
index 0000000000000..fc6b92e868248
--- /dev/null
+++ b/lldb/unittests/Process/ProcessTraceTest.cpp
@@ -0,0 +1,63 @@
+//===-- ProcessTraceTest.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Target/ProcessTrace.h"
+#include "Plugins/Platform/Linux/PlatformLinux.h"
+#include "lldb/Core/Debugger.h"
+#include "lldb/Host/HostInfo.h"
+#include "gtest/gtest.h"
+
+using namespace lldb_private;
+using namespace lldb;
+using namespace platform_linux;
+
+// This is needed for the tests that create a trace process.
+class ProcessTraceTest : public ::testing::Test {
+public:
+  void SetUp() override {
+    ProcessTrace::Initialize();
+    FileSystem::Initialize();
+    HostInfo::Initialize();
+    PlatformLinux::Initialize();
+  }
+  void TearDown() override {
+    PlatformLinux::Terminate();
+    HostInfo::Terminate();
+    FileSystem::Terminate();
+    ProcessTrace::Terminate();
+  }
+};
+
+TargetSP CreateTarget(DebuggerSP &debugger_sp, const ArchSpec &arch) {
+  PlatformSP platform_sp;
+  TargetSP target_sp;
+  debugger_sp->GetTargetList().CreateTarget(
+      *debugger_sp, "", arch, eLoadDependentsNo, platform_sp, target_sp);
+  return target_sp;
+}
+
+// Test that we can create a process trace with a nullptr core file.
+TEST_F(ProcessTraceTest, ConstructorWithNullptrCoreFile) {
+  ArchSpec arch("i386-pc-linux");
+
+  Platform::SetHostPlatform(PlatformLinux::CreateInstance(true, &arch));
+  ASSERT_NE(Platform::GetHostPlatform(), nullptr);
+
+  DebuggerSP debugger_sp = Debugger::CreateInstance();
+  ASSERT_TRUE(debugger_sp);
+
+  TargetSP target_sp = CreateTarget(debugger_sp, arch);
+  ASSERT_TRUE(target_sp);
+
+  ProcessSP process_sp = target_sp->CreateProcess(
+      /*listener*/ nullptr, "trace",
+      /*crash_file*/ nullptr,
+      /*can_connect*/ false);
+
+  ASSERT_NE(process_sp, nullptr);
+}

From ce296f1ebac896d03bdb783d9c5d3c93ab76f1ac Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Tue, 1 Apr 2025 12:58:16 -0700
Subject: [PATCH 0296/1029] [CI] Exclude gn changes from running premerge
 (#133623)

These changes are mostly pushed by the gnsyncbot directly to main and thus
don't go through a PR, but we still test on main to see if main is broken.
Given that these touch llvm/, they end up burning a decent amount of
testing time for no real benefit, so I think it makes sense to exclude
them from premerge testing explicitly.
---
 .ci/compute_projects.py      | 5 +++++
 .ci/compute_projects_test.py | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index 7445e92eba1b1..ff43547c9bbe5 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -200,6 +200,11 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]:
         # documentation builds.
         if len(path_parts) > 2 and path_parts[1] == "docs":
             continue
+        # Exclude files for the gn build. We do not test it within premerge
+        # and changes occur often enough that they otherwise take up
+        # capacity.
+        if len(path_parts) > 3 and path_parts[:3] == ("llvm", "utils", "gn"):
+            continue
         modified_projects.add(pathlib.Path(modified_file).parts[0])
     return modified_projects

diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index 1807337aefed4..e787fd8133c86 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -179,6 +179,15 @@ def test_exclude_docs(self):
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")

+    def test_exclude_gn(self):
+        env_variables = compute_projects.get_env_variables(
+            ["llvm/utils/gn/build/BUILD.gn"], "Linux"
+        )
+        self.assertEqual(env_variables["projects_to_build"], "")
+        self.assertEqual(env_variables["project_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_to_build"], "")
+        self.assertEqual(env_variables["runtimes_check_targets"], "")
+

 if __name__ == "__main__":
     unittest.main()

From 9f3d8e8fb8d389176e12c06de59cce3fd1ab8db2 Mon Sep 17 00:00:00 2001
From: Andy Kaylor
Date: Tue, 1 Apr 2025 13:03:24 -0700
Subject: [PATCH 0297/1029] [CIR] Upstream support for while and do..while
 loops (#133157)

This adds basic support for while and do..while loops. Support for break
and continue is left for a subsequent patch.
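As a rough sketch (schematic CIR with value names, types, and attributes
elided), a C/C++ `while (cond) body;` now lowers to:

```mlir
cir.scope {
  cir.while {
    %c = ...            // emit `cond`, converted to !cir.bool
    cir.condition(%c)
  } do {
    ...                 // emit `body` in a nested scope
    cir.yield
  }
}
```

A do..while loop produces the analogous `cir.do { ... } while { ... }` form,
with the body region first and the condition evaluated after it.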
--- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 16 +++ .../include/clang/CIR/Dialect/IR/CIRDialect.h | 3 + clang/include/clang/CIR/Dialect/IR/CIROps.td | 103 +++++++++++++++- clang/lib/CIR/CodeGen/CIRGenFunction.h | 4 + clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 113 +++++++++++++++++- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 14 --- clang/test/CIR/CodeGen/loop.cpp | 76 ++++++++++++ clang/test/CIR/Transforms/loop.cir | 41 +++++++ 8 files changed, 351 insertions(+), 19 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 8b17cb7446afa..e666be0b25d75 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -111,6 +111,22 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return cir::BoolAttr::get(getContext(), getBoolTy(), state); } + /// Create a do-while operation. + cir::DoWhileOp createDoWhile( + mlir::Location loc, + llvm::function_ref condBuilder, + llvm::function_ref bodyBuilder) { + return create(loc, condBuilder, bodyBuilder); + } + + /// Create a while operation. + cir::WhileOp createWhile( + mlir::Location loc, + llvm::function_ref condBuilder, + llvm::function_ref bodyBuilder) { + return create(loc, condBuilder, bodyBuilder); + } + /// Create a for operation. cir::ForOp createFor( mlir::Location loc, diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h index da3b41371b9ab..4d7f537418a90 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h @@ -32,6 +32,9 @@ #include "clang/CIR/Interfaces/CIRLoopOpInterface.h" #include "clang/CIR/Interfaces/CIROpInterfaces.h" +using BuilderCallbackRef = + llvm::function_ref; + // TableGen'erated files for MLIR dialects require that a macro be defined when // they are included. GET_OP_CLASSES tells the file to define the classes for // the operations of that dialect. 
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 455cc2b8b0277..3965372755685 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -424,7 +424,8 @@ def StoreOp : CIR_Op<"store", [ // ReturnOp //===----------------------------------------------------------------------===// -def ReturnOp : CIR_Op<"return", [ParentOneOf<["FuncOp", "ScopeOp", "ForOp"]>, +def ReturnOp : CIR_Op<"return", [ParentOneOf<["FuncOp", "ScopeOp", "DoWhileOp", + "WhileOp", "ForOp"]>, Terminator]> { let summary = "Return from function"; let description = [{ @@ -511,7 +512,8 @@ def ConditionOp : CIR_Op<"condition", [ //===----------------------------------------------------------------------===// def YieldOp : CIR_Op<"yield", [ReturnLike, Terminator, - ParentOneOf<["ScopeOp", "ForOp"]>]> { + ParentOneOf<["ScopeOp", "WhileOp", "ForOp", + "DoWhileOp"]>]> { let summary = "Represents the default branching behaviour of a region"; let description = [{ The `cir.yield` operation terminates regions on different CIR operations, @@ -759,11 +761,106 @@ def BrCondOp : CIR_Op<"brcond", }]; } +//===----------------------------------------------------------------------===// +// Common loop op definitions +//===----------------------------------------------------------------------===// + +class LoopOpBase : CIR_Op { + let extraClassDefinition = [{ + void $cppClass::getSuccessorRegions( + mlir::RegionBranchPoint point, + llvm::SmallVectorImpl ®ions) { + LoopOpInterface::getLoopOpSuccessorRegions(*this, point, regions); + } + llvm::SmallVector $cppClass::getLoopRegions() { + return {&getBody()}; + } + }]; +} + +//===----------------------------------------------------------------------===// +// While & DoWhileOp +//===----------------------------------------------------------------------===// + +class WhileOpBase : LoopOpBase { + defvar isWhile = !eq(mnemonic, "while"); + let summary = "C/C++ " # !if(isWhile, "while", "do-while") # " loop"; + let builders = [ + OpBuilder<(ins "BuilderCallbackRef":$condBuilder, + "BuilderCallbackRef":$bodyBuilder), [{ + mlir::OpBuilder::InsertionGuard guard($_builder); + $_builder.createBlock($_state.addRegion()); + }] # !if(isWhile, [{ + condBuilder($_builder, $_state.location); + $_builder.createBlock($_state.addRegion()); + bodyBuilder($_builder, $_state.location); + }], [{ + bodyBuilder($_builder, $_state.location); + $_builder.createBlock($_state.addRegion()); + condBuilder($_builder, $_state.location); + }])> + ]; +} + +def WhileOp : WhileOpBase<"while"> { + let regions = (region SizedRegion<1>:$cond, MinSizedRegion<1>:$body); + let assemblyFormat = "$cond `do` $body attr-dict"; + + let description = [{ + Represents a C/C++ while loop. It consists of two regions: + + - `cond`: single block region with the loop's condition. Should be + terminated with a `cir.condition` operation. + - `body`: contains the loop body and an arbitrary number of blocks. + + Example: + + ```mlir + cir.while { + cir.break + ^bb2: + cir.yield + } do { + cir.condition %cond : cir.bool + } + ``` + }]; +} + +def DoWhileOp : WhileOpBase<"do"> { + let regions = (region MinSizedRegion<1>:$body, SizedRegion<1>:$cond); + let assemblyFormat = " $body `while` $cond attr-dict"; + + let extraClassDeclaration = [{ + mlir::Region &getEntry() { return getBody(); } + }]; + + let description = [{ + Represents a C/C++ do-while loop. Identical to `cir.while` but the + condition is evaluated after the body. 
+ + Example: + + ```mlir + cir.do { + cir.break + ^bb2: + cir.yield + } while { + cir.condition %cond : cir.bool + } + ``` + }]; +} + //===----------------------------------------------------------------------===// // ForOp //===----------------------------------------------------------------------===// -def ForOp : CIR_Op<"for", [LoopOpInterface, NoRegionArguments]> { +def ForOp : LoopOpBase<"for"> { let summary = "C/C++ for loop counterpart"; let description = [{ Represents a C/C++ for loop. It consists of three regions: diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 3b8171eea9ee0..5cae4d5da9516 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -395,6 +395,8 @@ class CIRGenFunction : public CIRGenTypeCache { LValue emitBinaryOperatorLValue(const BinaryOperator *e); + mlir::LogicalResult emitDoStmt(const clang::DoStmt &s); + /// Emit an expression as an initializer for an object (variable, field, etc.) /// at the given location. The expression is not necessarily the normal /// initializer for the object, and the address is not necessarily @@ -493,6 +495,8 @@ class CIRGenFunction : public CIRGenTypeCache { /// inside a function, including static vars etc. void emitVarDecl(const clang::VarDecl &d); + mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s); + /// ---------------------- /// CIR build helpers /// ----------------- diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index aa04ff6345fc6..b5c1f0ae2a7ef 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -75,6 +75,10 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, case Stmt::ForStmtClass: return emitForStmt(cast(*s)); + case Stmt::WhileStmtClass: + return emitWhileStmt(cast(*s)); + case Stmt::DoStmtClass: + return emitDoStmt(cast(*s)); case Stmt::OMPScopeDirectiveClass: case Stmt::OMPErrorDirectiveClass: @@ -97,8 +101,6 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, case Stmt::SYCLKernelCallStmtClass: case Stmt::IfStmtClass: case Stmt::SwitchStmtClass: - case Stmt::WhileStmtClass: - case Stmt::DoStmtClass: case Stmt::CoroutineBodyStmtClass: case Stmt::CoreturnStmtClass: case Stmt::CXXTryStmtClass: @@ -387,3 +389,110 @@ mlir::LogicalResult CIRGenFunction::emitForStmt(const ForStmt &s) { terminateBody(builder, forOp.getBody(), getLoc(s.getEndLoc())); return mlir::success(); } + +mlir::LogicalResult CIRGenFunction::emitDoStmt(const DoStmt &s) { + cir::DoWhileOp doWhileOp; + + // TODO: pass in array of attributes. + auto doStmtBuilder = [&]() -> mlir::LogicalResult { + mlir::LogicalResult loopRes = mlir::success(); + assert(!cir::MissingFeatures::loopInfoStack()); + // From LLVM: if there are any cleanups between here and the loop-exit + // scope, create a block to stage a loop exit along. + // We probably already do the right thing because of ScopeOp, but make + // sure we handle all cases. + assert(!cir::MissingFeatures::requiresCleanups()); + + doWhileOp = builder.createDoWhile( + getLoc(s.getSourceRange()), + /*condBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + assert(!cir::MissingFeatures::createProfileWeightsForLoop()); + assert(!cir::MissingFeatures::emitCondLikelihoodViaExpectIntrinsic()); + // C99 6.8.5p2/p4: The first substatement is executed if the + // expression compares unequal to 0. The condition must be a + // scalar type. 
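+          // evaluateExprAsBool emits the condition expression and coerces it
+          // to the !cir.bool consumed by cir.condition.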
+ mlir::Value condVal = evaluateExprAsBool(s.getCond()); + builder.createCondition(condVal); + }, + /*bodyBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // The scope of the do-while loop body is a nested scope. + if (emitStmt(s.getBody(), /*useCurrentScope=*/false).failed()) + loopRes = mlir::failure(); + emitStopPoint(&s); + }); + return loopRes; + }; + + mlir::LogicalResult res = mlir::success(); + mlir::Location scopeLoc = getLoc(s.getSourceRange()); + builder.create(scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{ + *this, loc, builder.getInsertionBlock()}; + res = doStmtBuilder(); + }); + + if (res.failed()) + return res; + + terminateBody(builder, doWhileOp.getBody(), getLoc(s.getEndLoc())); + return mlir::success(); +} + +mlir::LogicalResult CIRGenFunction::emitWhileStmt(const WhileStmt &s) { + cir::WhileOp whileOp; + + // TODO: pass in array of attributes. + auto whileStmtBuilder = [&]() -> mlir::LogicalResult { + mlir::LogicalResult loopRes = mlir::success(); + assert(!cir::MissingFeatures::loopInfoStack()); + // From LLVM: if there are any cleanups between here and the loop-exit + // scope, create a block to stage a loop exit along. + // We probably already do the right thing because of ScopeOp, but make + // sure we handle all cases. + assert(!cir::MissingFeatures::requiresCleanups()); + + whileOp = builder.createWhile( + getLoc(s.getSourceRange()), + /*condBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + assert(!cir::MissingFeatures::createProfileWeightsForLoop()); + assert(!cir::MissingFeatures::emitCondLikelihoodViaExpectIntrinsic()); + mlir::Value condVal; + // If the for statement has a condition scope, + // emit the local variable declaration. + if (s.getConditionVariable()) + emitDecl(*s.getConditionVariable()); + // C99 6.8.5p2/p4: The first substatement is executed if the + // expression compares unequal to 0. The condition must be a + // scalar type. + condVal = evaluateExprAsBool(s.getCond()); + builder.createCondition(condVal); + }, + /*bodyBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // The scope of the while loop body is a nested scope. 
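+          // Passing useCurrentScope=false below is what creates that
+          // nested scope.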
+ if (emitStmt(s.getBody(), /*useCurrentScope=*/false).failed())
+ loopRes = mlir::failure();
+ emitStopPoint(&s);
+ });
+ return loopRes;
+ };
+
+ mlir::LogicalResult res = mlir::success();
+ mlir::Location scopeLoc = getLoc(s.getSourceRange());
+ builder.create(scopeLoc, /*scopeBuilder=*/
+ [&](mlir::OpBuilder &b, mlir::Location loc) {
+ LexicalScope lexScope{
+ *this, loc, builder.getInsertionBlock()};
+ res = whileStmtBuilder();
+ });
+
+ if (res.failed())
+ return res;
+
+ terminateBody(builder, whileOp.getBody(), getLoc(s.getEndLoc()));
+ return mlir::success();
+}
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index cdcfa77b66379..4ace083e3c081 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -538,20 +538,6 @@ Block *cir::BrCondOp::getSuccessorForOperands(ArrayRef operands) {
 return nullptr;
}
-//===----------------------------------------------------------------------===//
-// ForOp
-//===----------------------------------------------------------------------===//
-
-void cir::ForOp::getSuccessorRegions(
- mlir::RegionBranchPoint point,
- llvm::SmallVectorImpl &regions) {
- LoopOpInterface::getLoopOpSuccessorRegions(*this, point, regions);
-}
-
-llvm::SmallVector cir::ForOp::getLoopRegions() {
- return {&getBody()};
-}
-
//===----------------------------------------------------------------------===//
// GlobalOp
//===----------------------------------------------------------------------===//
diff --git a/clang/test/CIR/CodeGen/loop.cpp b/clang/test/CIR/CodeGen/loop.cpp
index f0b570a92964d..a950460e8838d 100644
--- a/clang/test/CIR/CodeGen/loop.cpp
+++ b/clang/test/CIR/CodeGen/loop.cpp
@@ -189,3 +189,79 @@ void l3() {
// OGCG: [[FOR_COND]]:
// OGCG: store i32 0, ptr %[[I]], align 4
// OGCG: br label %[[FOR_COND]]
+
+void test_do_while_false() {
+ do {
+ } while (0);
+}
+
+// CIR: cir.func @test_do_while_false()
+// CIR-NEXT: cir.scope {
+// CIR-NEXT: cir.do {
+// CIR-NEXT: cir.yield
+// CIR-NEXT: } while {
+// CIR-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i
+// CIR-NEXT: %[[FALSE:.*]] = cir.cast(int_to_bool, %[[ZERO]] : !s32i), !cir.bool
+// CIR-NEXT: cir.condition(%[[FALSE]])
+
+// LLVM: define void @test_do_while_false()
+// LLVM: br label %[[LABEL1:.*]]
+// LLVM: [[LABEL1]]:
+// LLVM: br label %[[LABEL3:.*]]
+// LLVM: [[LABEL2:.*]]:
+// LLVM: br i1 false, label %[[LABEL3]], label %[[LABEL4:.*]]
+// LLVM: [[LABEL3]]:
+// LLVM: br label %[[LABEL2]]
+// LLVM: [[LABEL4]]:
+// LLVM: br label %[[LABEL5:.*]]
+// LLVM: [[LABEL5]]:
+// LLVM: ret void
+
+// OGCG: define{{.*}} void @_Z19test_do_while_falsev()
+// OGCG: entry:
+// OGCG: br label %[[DO_BODY:.*]]
+// OGCG: [[DO_BODY]]:
+// OGCG: br label %[[DO_END:.*]]
+// OGCG: [[DO_END]]:
+// OGCG: ret void
+
+void test_empty_while_true() {
+ while (true) {
+ return;
+ }
+}
+
+// CIR: cir.func @test_empty_while_true()
+// CIR-NEXT: cir.scope {
+// CIR-NEXT: cir.while {
+// CIR-NEXT: %[[TRUE:.*]] = cir.const #true
+// CIR-NEXT: cir.condition(%[[TRUE]])
+// CIR-NEXT: } do {
+// CIR-NEXT: cir.scope {
+// CIR-NEXT: cir.return
+// CIR-NEXT: }
+// CIR-NEXT: cir.yield
+
+// LLVM: define void @test_empty_while_true()
+// LLVM: br label %[[LABEL1:.*]]
+// LLVM: [[LABEL1]]:
+// LLVM: br label %[[LABEL2:.*]]
+// LLVM: [[LABEL2]]:
+// LLVM: br i1 true, label %[[LABEL3:.*]], label %[[LABEL6:.*]]
+// LLVM: [[LABEL3]]:
+// LLVM: br label %[[LABEL4:.*]]
+// LLVM: [[LABEL4]]:
+// LLVM: ret void
+// LLVM: [[LABEL5:.*]]:
+// LLVM-SAME: ; No predecessors!
+// LLVM: br label %[[LABEL2:.*]]
+// LLVM: [[LABEL6]]:
+// LLVM: br label %[[LABEL7:.*]]
+// LLVM: [[LABEL7]]:
+// LLVM: ret void
+
+// OGCG: define{{.*}} void @_Z21test_empty_while_truev()
+// OGCG: entry:
+// OGCG: br label %[[WHILE_BODY:.*]]
+// OGCG: [[WHILE_BODY]]:
+// OGCG: ret void
diff --git a/clang/test/CIR/Transforms/loop.cir b/clang/test/CIR/Transforms/loop.cir
index 4fde3a7bb43f1..d02412d049158 100644
--- a/clang/test/CIR/Transforms/loop.cir
+++ b/clang/test/CIR/Transforms/loop.cir
@@ -26,4 +26,45 @@ module {
// CHECK: cir.br ^bb[[#COND:]]
// CHECK: ^bb[[#EXIT]]:
// CHECK: cir.return
+// CHECK: }
+
+ // Test while cir.loop operation lowering.
+ cir.func @testWhile(%arg0 : !cir.bool) {
+ cir.while {
+ cir.condition(%arg0)
+ } do {
+ cir.yield
+ }
+ cir.return
+ }
+
+// CHECK: cir.func @testWhile(%arg0: !cir.bool) {
+// CHECK: cir.br ^bb[[#COND:]]
+// CHECK: ^bb[[#COND]]:
+// CHECK: cir.brcond %arg0 ^bb[[#BODY:]], ^bb[[#EXIT:]]
+// CHECK: ^bb[[#BODY]]:
+// CHECK: cir.br ^bb[[#COND:]]
+// CHECK: ^bb[[#EXIT]]:
+// CHECK: cir.return
+// CHECK: }
+
+
+ // Test do-while cir.loop operation lowering.
+ cir.func @testDoWhile(%arg0 : !cir.bool) {
+ cir.do {
+ cir.yield
+ } while {
+ cir.condition(%arg0)
+ }
+ cir.return
+ }
+
+// CHECK: cir.func @testDoWhile(%arg0: !cir.bool) {
+// CHECK: cir.br ^bb[[#BODY:]]
+// CHECK: ^bb[[#COND:]]:
+// CHECK: cir.brcond %arg0 ^bb[[#BODY:]], ^bb[[#EXIT:]]
+// CHECK: ^bb[[#BODY]]:
+// CHECK: cir.br ^bb[[#COND:]]
+// CHECK: ^bb[[#EXIT]]:
+// CHECK: cir.return
// CHECK: }

From 3c7a0e6c826b8bcfa4ec6154fd4247658ca3a03f Mon Sep 17 00:00:00 2001
From: Alex MacLean
Date: Tue, 1 Apr 2025 13:08:57 -0700
Subject: [PATCH 0298/1029] [NVPTX] Cleanup and refactor atomic lowering
 (#133781)

Cleanup lowering of atomic instructions and intrinsics. The TableGen
changes are primarily a refactor, though sub variants are now lowered via
operation legalization, potentially allowing for more DAG optimization.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |   1 +
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     |  17 +-
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    | 787 ++++----------------
 llvm/test/CodeGen/NVPTX/atomics.ll          |  22 +-
 4 files changed, 184 insertions(+), 643 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8a4b83365ae84..b566cdd4b6bfc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -994,6 +994,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
 // No FPOW or FREM in PTX.
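// Note on the ATOMIC_LOAD_SUB line above: marking it Expand legalizes
// `atomicrmw sub` into a negation feeding atom.add, which is why the custom
// F_ATOMIC_2_NEG machinery is deleted below; the atomics.ll update at the
// end of this patch shows the resulting neg.s32 + atom.add.u32 sequence.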
// Now deduce the information based on the above mentioned diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index fe9bb621b481c..7d0c47fa464c5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -216,16 +216,25 @@ class fpimm_pos_inf // Utility class to wrap up information about a register and DAG type for more // convenient iteration and parameterization -class RegTyInfo { +class RegTyInfo { ValueType Ty = ty; NVPTXRegClass RC = rc; Operand Imm = imm; + SDNode ImmNode = imm_node; + bit SupportsImm = supports_imm; int Size = ty.Size; } -def I16RT : RegTyInfo; -def I32RT : RegTyInfo; -def I64RT : RegTyInfo; +def I16RT : RegTyInfo; +def I32RT : RegTyInfo; +def I64RT : RegTyInfo; + +def F32RT : RegTyInfo; +def F64RT : RegTyInfo; +def F16RT : RegTyInfo; +def BF16RT : RegTyInfo; + // Template for instructions which take three int64, int32, or int16 args. // The instructions are named "" (e.g. "add.s64"). diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index b2e05a567b4fe..34cb63e44ca71 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1975,529 +1975,135 @@ def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$ // Atomic Functions //----------------------------------- -class ATOMIC_GLOBAL_CHK - : PatFrag; -class ATOMIC_SHARED_CHK - : PatFrag; -class ATOMIC_GENERIC_CHK - : PatFrag; - -multiclass F_ATOMIC_2< - ValueType regT, NVPTXRegClass regclass, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - Operand IMMType, SDNode IMM, list Pred = []> { - let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def r : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, regclass:$b), - "atom" # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b;", - [(set (regT regclass:$dst), (IntOp addr:$addr, (regT regclass:$b)))]>, - Requires; - if !not(!or(!eq(TypeStr, ".f16"), !eq(TypeStr, ".bf16"))) then - def i : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, IMMType:$b), - "atom" # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b;", - [(set (regT regclass:$dst), (IntOp addr:$addr, IMM:$b))]>, - Requires; - } -} +class ATOMIC_GLOBAL_CHK + : PatFrag; +class ATOMIC_SHARED_CHK + : PatFrag; +class ATOMIC_GENERIC_CHK + : PatFrag; + -// has 2 operands, neg the second one -multiclass F_ATOMIC_2_NEG< - ValueType regT, NVPTXRegClass regclass, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - list Pred = []> { +multiclass F_ATOMIC_2 preds> { + defvar asm_str = "atom" # sem_str # as_str # "." 
# op_str # " \t$dst, [$addr], $b;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def reg : NVPTXInst<(outs regclass:$dst), (ins ADDR:$addr, regclass:$b), - !strconcat( - "{{ \n\t", - ".reg \t.s", TypeStr, " temp; \n\t", - "neg.s", TypeStr, " \ttemp, $b; \n\t", - "atom", SpaceStr, OpcStr, ".u", TypeStr, " \t$dst, [$addr], temp; \n\t", - "}}"), - [(set (regT regclass:$dst), (IntOp addr:$addr, (regT regclass:$b)))]>, - Requires; + def r : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b))]>, + Requires; + if t.SupportsImm then + def i : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b)))]>, + Requires; } } // has 3 operands -multiclass F_ATOMIC_3< - ValueType regT, NVPTXRegClass regclass, string SemStr, - string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, - Operand IMMType, list Pred = []> { +multiclass F_ATOMIC_3 preds> { + defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;"; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, regclass:$b, regclass:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, regT:$b, regT:$c))]>, - Requires; - - def ir : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, IMMType:$b, regclass:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, imm:$b, regT:$c))]>, - Requires; - - def ri : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, regclass:$b, IMMType:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, regT:$b, imm:$c))]>, - Requires; - - def ii : NVPTXInst<(outs regclass:$dst), - (ins ADDR:$addr, IMMType:$b, IMMType:$c), - "atom" # SemStr # SpaceStr # OpcStr # TypeStr # " \t$dst, [$addr], $b, $c;", - [(set (regT regclass:$dst), (IntOp addr:$addr, imm:$b, imm:$c))]>, - Requires; + def rr : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, + Requires; + + def ir : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, + Requires; + + def ri : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, + Requires; + + def ii : NVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), + asm_str, + [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, + Requires; } } +multiclass F_ATOMIC_2_AS preds = []> { + defvar frag_pat = (frag node:$a, node:$b); + defm _G : F_ATOMIC_2, preds>; + defm _S : F_ATOMIC_2, preds>; + defm _GEN : F_ATOMIC_2, preds>; +} + +multiclass F_ATOMIC_3_AS preds = []> { + defvar frag_pat = (frag node:$a, node:$b, node:$c); + defm _G : F_ATOMIC_3, preds>; + defm _S : F_ATOMIC_3, preds>; + defm _GEN : F_ATOMIC_3, preds>; +} + // atom_add +defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS; -def atomic_load_add_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def 
atomic_load_add_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_i32 node:$a, node:$b)>; -def atomic_load_add_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_add_i64 node:$a, node:$b)>; -def atomic_load_add_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; -def atomic_load_add_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; -def atomic_load_add_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_fadd node:$a, node:$b)>; - -defm INT_PTX_ATOM_ADD_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_32_USE_G : F_ATOMIC_2; - -defm INT_PTX_ATOM_ADD_G_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_64_USE_G : F_ATOMIC_2; - -defm INT_PTX_ATOM_ADD_G_F16 : F_ATOMIC_2, hasPTX<63>]>; -defm INT_PTX_ATOM_ADD_S_F16 : F_ATOMIC_2, hasPTX<63>]>; -defm INT_PTX_ATOM_ADD_GEN_F16 : F_ATOMIC_2, hasPTX<63>]>; - -defm INT_PTX_ATOM_ADD_G_BF16 : F_ATOMIC_2, hasPTX<78>]>; -defm INT_PTX_ATOM_ADD_S_BF16 : F_ATOMIC_2, hasPTX<78>]>; -defm INT_PTX_ATOM_ADD_GEN_BF16 : F_ATOMIC_2, hasPTX<78>]>; - -defm INT_PTX_ATOM_ADD_G_F32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_F32 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_F32 : F_ATOMIC_2; - -defm INT_PTX_ATOM_ADD_G_F64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_S_F64 : F_ATOMIC_2; -defm INT_PTX_ATOM_ADD_GEN_F64 : F_ATOMIC_2; - -// atom_sub - -def atomic_load_sub_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i32 node:$a, node:$b)>; -def atomic_load_sub_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; -def atomic_load_sub_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; -def atomic_load_sub_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_sub_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_SUB_G_32 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_G_64 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_32 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_32_USE_G : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_S_32 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_S_64 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_64 : F_ATOMIC_2_NEG; -defm INT_PTX_ATOM_SUB_GEN_64_USE_G : F_ATOMIC_2_NEG; +defm INT_PTX_ATOM_ADD_F16 : F_ATOMIC_2_AS, hasPTX<63>]>; +defm INT_PTX_ATOM_ADD_BF16 : F_ATOMIC_2_AS, hasPTX<78>]>; +defm INT_PTX_ATOM_ADD_F32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_ADD_F64 : F_ATOMIC_2_AS; // atom_swap - -def atomic_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_i32 node:$a, node:$b)>; -def atomic_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; -def atomic_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - 
(atomic_swap_i64 node:$a, node:$b)>; -def atomic_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_swap_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_SWAP_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_G_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_S_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_64 : F_ATOMIC_2; -defm INT_PTX_ATOM_SWAP_GEN_64_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_SWAP_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_SWAP_64 : F_ATOMIC_2_AS; // atom_max - -def atomic_load_max_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_i32 node:$a, node:$b)>; -def atomic_load_max_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b) - , (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_max_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_max_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_max_i64 node:$a, node:$b)>; -def atomic_load_umax_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i32 node:$a, node:$b)>; -def atomic_load_umax_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; -def atomic_load_umax_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; -def atomic_load_umax_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umax_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_LOAD_MAX_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MAX_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MAX_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MAX_GEN_64_USE_G : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMAX_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMAX_GEN_64_USE_G : F_ATOMIC_2]>; +defm INT_PTX_ATOMIC_MAX_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_MAX_64 : F_ATOMIC_2_AS]>; +defm INT_PTX_ATOMIC_UMAX_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_UMAX_64 : F_ATOMIC_2_AS]>; // atom_min - -def atomic_load_min_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_i32 node:$a, node:$b)>; -def atomic_load_min_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_min_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - 
(atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_min_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_min_i64 node:$a, node:$b)>; -def atomic_load_umin_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i32 node:$a, node:$b)>; -def atomic_load_umin_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; -def atomic_load_umin_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; -def atomic_load_umin_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_umin_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_LOAD_MIN_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_MIN_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MIN_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_MIN_GEN_64_USE_G : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_LOAD_UMIN_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_LOAD_UMIN_GEN_64_USE_G : F_ATOMIC_2]>; +defm INT_PTX_ATOMIC_MIN_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_MIN_64 : F_ATOMIC_2_AS]>; +defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS]>; // atom_inc atom_dec - -def atomic_load_inc_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_inc_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_inc_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_inc_32 node:$a, node:$b)>; -def atomic_load_dec_32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; -def atomic_load_dec_32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; -def atomic_load_dec_32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (int_nvvm_atomic_load_dec_32 node:$a, node:$b)>; - -defm INT_PTX_ATOM_INC_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_INC_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_INC_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_INC_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_DEC_GEN_32_USE_G : F_ATOMIC_2; +defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS; // atom_and - -def atomic_load_and_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_i32 node:$a, node:$b)>; -def atomic_load_and_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; -def atomic_load_and_i64_s: 
ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; -def atomic_load_and_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_and_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_AND_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_AND_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_AND_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_AND_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_AND_GEN_64_USE_G : F_ATOMIC_2]>; +defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_AND_64 : F_ATOMIC_2_AS]>; // atom_or - -def atomic_load_or_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_i32 node:$a, node:$b)>; -def atomic_load_or_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; -def atomic_load_or_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; -def atomic_load_or_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_or_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_OR_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_OR_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_OR_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_OR_GEN_64_USE_G : F_ATOMIC_2]>; -defm INT_PTX_ATOM_OR_S_64 : F_ATOMIC_2]>; +defm INT_PTX_ATOM_OR_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_OR_64 : F_ATOMIC_2_AS]>; // atom_xor +defm INT_PTX_ATOM_XOR_32 : F_ATOMIC_2_AS; +defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS]>; -def atomic_load_xor_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i32 node:$a, node:$b)>; -def atomic_load_xor_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; -def atomic_load_xor_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; -def atomic_load_xor_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b), - (atomic_load_xor_i64 node:$a, node:$b)>; - -defm INT_PTX_ATOM_XOR_G_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_S_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_GEN_32 : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_GEN_32_USE_G : F_ATOMIC_2; -defm INT_PTX_ATOM_XOR_G_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_XOR_S_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2]>; -defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2]>; - -multiclass ternary_atomic_op_as { - // one record per address space - def NAME#_generic: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.generic>; - - def NAME#_global: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.global>; - - def NAME#_shared: PatFrag<(ops node:$ptr, node:$cmp, node:$val), - (!cast(NAME) node:$ptr, node:$cmp, node:$val), - AS_match.shared>; -} - -// generate pattern fragments for size x memory order -// NOTE: i8 cmpxchg is not supported in ptx, and AtomicExpandPass will 
emulate all i8 cmpxchgs -// using larger-bitwidth cas -foreach size = ["i16", "i32", "i64"] in { - foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in { - defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as; - } -} - -// eg. with type = 32, order = ".acquire", addrspace = ".global", -// atomic_cmp_swap_pat = atomic_cmp_swap_i32_acquire_global. -// preds = [hasSM<70>, hasPTX<63>] -// F_ATOMIC_3, hasPTX<63>]> -multiclass INT_PTX_ATOM_CAS preds> - : F_ATOMIC_3("i"#type), - !cast("Int"#type#"Regs"), - order, - addrspace, - ".b"#type, - ".cas", - !cast(atomic_cmp_swap_pat), - !cast("i"#type#"imm"), - preds>; // Define atom.cas for all combinations of size x addrspace x memory order // supported in PTX *and* on the hardware. -foreach size = ["32", "64"] in { - foreach addrspace = ["generic", "global", "shared"] in { - defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace); - foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { - defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. - // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - defm INT_PTX_ATOM_CAS_#size#_#order#addrspace - : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - cas_order_string, cas_addrspace_string, - [hasSM<70>, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace - : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size, - "", cas_addrspace_string, []>; - } +foreach t = [I32RT, I64RT] in { + foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { + defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); + defvar atomic_cmp_swap_pat = !cast("atomic_cmp_swap_i"#t.Size#_#order); + // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. + // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- + // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. + defm INT_PTX_ATOM_CAS_#t.Size#_#order + : F_ATOMIC_3_AS, hasPTX<63>]>; + defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old + : F_ATOMIC_3_AS; } } // Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3, hasPTX<63>]>; -defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3, hasPTX<63>]>; +defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS, hasPTX<63>]>; // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} @@ -2505,185 +2111,116 @@ defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3 Preds, - dag ins, dag Operands> - : NVPTXInst<(outs regclass:$result), ins, - AsmStr, - [(set regT:$result, Operands)]>, - Requires; // Define instruction variants for all addressing modes. -multiclass ATOM2P_impl Preds> { - let AddedComplexity = 1 in { - def : ATOM23_impl; - } - // tablegen can't infer argument types from Intrinsic (though it can - // from Instruction) so we have to enforce specific type on - // immediates via explicit cast to ImmTy. 
- def : ATOM23_impl; -} - -multiclass ATOM3P_impl Preds> { - // Variants for register/immediate permutations of $b and $c - let AddedComplexity = 2 in { - def : ATOM23_impl; - } - let AddedComplexity = 1 in { - def : ATOM23_impl; - def : ATOM23_impl; - } - def : ATOM23_impl; -} // Constructs intrinsic name and instruction asm strings. multiclass ATOM2N_impl Preds> { - defm : ATOM2P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) - # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) - # "." # OpStr # "." # TypeStr - # " \t$result, [$src], $b;", - !cast( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - regT, regclass, ImmType, Imm, ImmTy, Preds>; + RegTyInfo t, list Preds> { + defm "" : F_ATOMIC_2( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!empty(ScopeStr), "", "_" # ScopeStr)), + preds = Preds>; } multiclass ATOM3N_impl Preds> { - defm : ATOM3P_impl<"atom" # !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr) - # !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr) - # "." # OpStr # "." # TypeStr - # " \t$result, [$src], $b, $c;", - !cast( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - regT, regclass, ImmType, Imm, ImmTy, Preds>; -} - -// Constructs variants for different address spaces. -// For now we only need variants for generic space pointers. -multiclass ATOM2A_impl Preds> { - defm _gen_ : ATOM2N_impl; -} -multiclass ATOM3A_impl Preds> { - defm _gen_ : ATOM3N_impl; + RegTyInfo t, list Preds> { + defm "" : F_ATOMIC_3( + "int_nvvm_atomic_" # OpStr + # "_" # SpaceStr # "_" # IntTypeStr + # !if(!empty(ScopeStr), "", "_" # ScopeStr)), + preds = Preds>; } // Constructs variants for different scopes of atomic op. multiclass ATOM2S_impl Preds> { + RegTyInfo t, list Preds> { // .gpu scope is default and is currently covered by existing // atomics w/o explicitly specified scope. - defm _cta : ATOM2A_impl; - defm _sys : ATOM2A_impl; + foreach scope = ["cta", "sys"] in { + // For now we only need variants for generic space pointers. + foreach space = ["gen"] in { + defm _#scope#space : ATOM2N_impl; + } + } } multiclass ATOM3S_impl Preds> { + RegTyInfo t, list Preds> { // No need to define ".gpu"-scoped atomics. They do the same thing // as the regular, non-scoped atomics defined elsewhere. - defm _cta : ATOM3A_impl; - defm _sys : ATOM3A_impl; + foreach scope = ["cta", "sys"] in { + // For now we only need variants for generic space pointers. 
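+ // (e.g. for OpStr = "cas" and i32 this ends up selecting
+ // int_nvvm_atomic_cas_gen_i32_cta / _sys onto "atom.cta.cas.b32" /
+ // "atom.sys.cas.b32"; the exact strings are assembled by ATOM3N_impl
+ // above, so the spellings here are illustrative.)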
+ foreach space = ["gen"] in { + defm _#scope#space : ATOM3N_impl; + } + } } // atom.add multiclass ATOM2_add_impl { - defm _s32 : ATOM2S_impl; - defm _u32 : ATOM2S_impl; - defm _u64 : ATOM2S_impl; - defm _bf16 : ATOM2S_impl, hasPTX<78>]>; - defm _f16 : ATOM2S_impl, hasPTX<63>]>; - defm _f32 : ATOM2S_impl; - defm _f64 : ATOM2S_impl; + defm _s32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; + defm _u64 : ATOM2S_impl; + defm _bf16 : ATOM2S_impl, hasPTX<78>]>; + defm _f16 : ATOM2S_impl; + defm _f32 : ATOM2S_impl; + defm _f64 : ATOM2S_impl; } // atom.{and,or,xor} multiclass ATOM2_bitwise_impl { - defm _b32 : ATOM2S_impl; - defm _b64 : ATOM2S_impl; + defm _b32 : ATOM2S_impl; + defm _b64 : ATOM2S_impl; } // atom.exch multiclass ATOM2_exch_impl { - defm _b32 : ATOM2S_impl; - defm _b64 : ATOM2S_impl; + defm _b32 : ATOM2S_impl; + defm _b64 : ATOM2S_impl; } // atom.{min,max} multiclass ATOM2_minmax_impl { - defm _s32 : ATOM2S_impl; - defm _u32 : ATOM2S_impl; - defm _s64 : ATOM2S_impl; - defm _u64 : ATOM2S_impl; + defm _s32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; + defm _s64 : ATOM2S_impl; + defm _u64 : ATOM2S_impl; } // atom.{inc,dec} multiclass ATOM2_incdec_impl { - defm _u32 : ATOM2S_impl; + defm _u32 : ATOM2S_impl; } // atom.cas multiclass ATOM3_cas_impl { - defm _b16 : ATOM3S_impl; - defm _b32 : ATOM3S_impl; - defm _b64 : ATOM3S_impl; -} - -defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; -defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">; -defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">; -defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">; -defm INT_PTX_SATOM_EXCH: ATOM2_exch_impl<"exch">; -defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">; -defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">; -defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; -defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; -defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; + defm _b16 : ATOM3S_impl; + defm _b32 : ATOM3S_impl; + defm _b64 : ATOM3S_impl; +} + +defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; +defm INT_PTX_SATOM_AND : ATOM2_bitwise_impl<"and">; +defm INT_PTX_SATOM_CAS : ATOM3_cas_impl<"cas">; +defm INT_PTX_SATOM_DEC : ATOM2_incdec_impl<"dec">; +defm INT_PTX_SATOM_EXCH : ATOM2_exch_impl<"exch">; +defm INT_PTX_SATOM_INC : ATOM2_incdec_impl<"inc">; +defm INT_PTX_SATOM_MAX : ATOM2_minmax_impl<"max">; +defm INT_PTX_SATOM_MIN : ATOM2_minmax_impl<"min">; +defm INT_PTX_SATOM_OR : ATOM2_bitwise_impl<"or">; +defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">; //----------------------------------- // Support for ldu on sm_20 or later diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index e1fbb53891902..e1d9aaf7cfb20 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -40,18 +40,15 @@ define i64 @atom1(ptr %addr, i64 %val) { define i32 @atom2(ptr %subr, i32 %val) { ; CHECK-LABEL: atom2( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0]; ; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1]; -; CHECK-NEXT: { -; CHECK-NEXT: .reg .s32 temp; -; CHECK-NEXT: neg.s32 temp, %r1; -; CHECK-NEXT: atom.add.u32 %r2, [%rd1], temp; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: neg.s32 %r2, %r1; +; CHECK-NEXT: atom.add.u32 %r3, [%rd1], %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i32 %val seq_cst ret i32 %ret @@ -61,17 +58,14 @@ define 
i32 @atom2(ptr %subr, i32 %val) { define i64 @atom3(ptr %subr, i64 %val) { ; CHECK-LABEL: atom3( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0]; ; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1]; -; CHECK-NEXT: { -; CHECK-NEXT: .reg .s64 temp; -; CHECK-NEXT: neg.s64 temp, %rd2; -; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], temp; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-NEXT: neg.s64 %rd3, %rd2; +; CHECK-NEXT: atom.add.u64 %rd4, [%rd1], %rd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; %ret = atomicrmw sub ptr %subr, i64 %val seq_cst ret i64 %ret From dc17429ae6961a6783371dcf6749eea657b5446a Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Tue, 1 Apr 2025 17:15:18 -0300 Subject: [PATCH 0299/1029] [clang] improved preservation of template keyword (#133610) --- clang-tools-extra/clangd/AST.cpp | 3 +- clang-tools-extra/clangd/CodeComplete.cpp | 1 - clang-tools-extra/clangd/DumpAST.cpp | 1 - clang-tools-extra/clangd/FindTarget.cpp | 1 - .../include-cleaner/lib/WalkAST.cpp | 1 - clang/docs/ReleaseNotes.rst | 4 + clang/include/clang/AST/ASTContext.h | 17 +- clang/include/clang/AST/ASTImporter.h | 8 + clang/include/clang/AST/ASTNodeTraverser.h | 3 +- clang/include/clang/AST/AbstractBasicReader.h | 4 +- clang/include/clang/AST/AbstractBasicWriter.h | 1 - clang/include/clang/AST/NestedNameSpecifier.h | 20 +- clang/include/clang/AST/ODRHash.h | 1 + clang/include/clang/AST/PropertiesBase.td | 15 +- clang/include/clang/AST/RecursiveASTVisitor.h | 5 +- clang/include/clang/AST/TemplateName.h | 133 ++++----- clang/include/clang/AST/Type.h | 22 +- clang/include/clang/AST/TypeLoc.h | 9 +- clang/include/clang/AST/TypeProperties.td | 47 ++-- clang/include/clang/Sema/DeclSpec.h | 3 +- clang/lib/AST/ASTContext.cpp | 256 ++++++++---------- clang/lib/AST/ASTImporter.cpp | 63 ++--- clang/lib/AST/ASTStructuralEquivalence.cpp | 37 +-- clang/lib/AST/ItaniumMangle.cpp | 35 ++- clang/lib/AST/NestedNameSpecifier.cpp | 102 ++----- clang/lib/AST/ODRHash.cpp | 19 +- clang/lib/AST/QualTypeNames.cpp | 8 +- clang/lib/AST/TemplateName.cpp | 67 ++++- clang/lib/AST/TextNodeDumper.cpp | 4 - clang/lib/AST/Type.cpp | 27 +- clang/lib/AST/TypeLoc.cpp | 5 +- clang/lib/AST/TypePrinter.cpp | 12 +- clang/lib/ExtractAPI/DeclarationFragments.cpp | 7 - clang/lib/Index/IndexTypeSourceInfo.cpp | 1 - clang/lib/Parse/ParseExprCXX.cpp | 8 +- clang/lib/Sema/DeclSpec.cpp | 6 +- clang/lib/Sema/HeuristicResolver.cpp | 6 +- clang/lib/Sema/SemaCXXScopeSpec.cpp | 31 +-- clang/lib/Sema/SemaCodeComplete.cpp | 6 +- clang/lib/Sema/SemaCoroutine.cpp | 3 +- clang/lib/Sema/SemaDecl.cpp | 24 +- clang/lib/Sema/SemaDeclCXX.cpp | 3 +- clang/lib/Sema/SemaExpr.cpp | 2 +- clang/lib/Sema/SemaExprCXX.cpp | 1 - clang/lib/Sema/SemaLookup.cpp | 4 +- clang/lib/Sema/SemaTemplate.cpp | 47 ++-- clang/lib/Sema/SemaTemplateDeduction.cpp | 3 +- clang/lib/Sema/TreeTransform.h | 149 +++++----- clang/lib/Serialization/ASTReader.cpp | 10 +- clang/lib/Serialization/ASTWriter.cpp | 2 - clang/lib/Tooling/Syntax/BuildTree.cpp | 2 - clang/test/AST/ast-dump-decl.cpp | 20 +- clang/test/AST/ast-dump-expr.cpp | 3 +- clang/test/AST/ast-dump-templates.cpp | 6 +- clang/test/CXX/class.access/p6.cpp | 4 +- clang/test/CXX/drs/cwg2xx.cpp | 4 +- .../expr.prim.req/nested-requirement.cpp | 2 +- clang/test/SemaCXX/static-assert.cpp | 4 +- .../aggregate-deduction-candidate.cpp | 8 +- .../dependent-template-recover.cpp | 18 ++ 
.../instantiate-requires-expr.cpp | 4 +- clang/tools/libclang/CIndex.cpp | 2 - .../array/array.overview/nttp.verify.cpp | 2 +- .../adapt/inout_ptr/inout_ptr.verify.cpp | 2 +- .../smartptr/adapt/out_ptr/out_ptr.verify.cpp | 2 +- .../utility/pairs/pairs.pair/nttp.verify.cpp | 2 +- 66 files changed, 612 insertions(+), 720 deletions(-) diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp index f3eee1c6335f9..66b587f00ff4a 100644 --- a/clang-tools-extra/clangd/AST.cpp +++ b/clang-tools-extra/clangd/AST.cpp @@ -119,8 +119,7 @@ getQualification(ASTContext &Context, const DeclContext *DestContext, // There can't be any more tag parents after hitting a namespace. assert(!ReachedNS); (void)ReachedNS; - NNS = NestedNameSpecifier::Create(Context, nullptr, false, - TD->getTypeForDecl()); + NNS = NestedNameSpecifier::Create(Context, nullptr, TD->getTypeForDecl()); } else if (auto *NSD = llvm::dyn_cast(CurContext)) { ReachedNS = true; NNS = NestedNameSpecifier::Create(Context, nullptr, NSD); diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 008cc96c91996..0eb196fbad46a 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1467,7 +1467,6 @@ bool allowIndex(CodeCompletionContext &CC) { return true; case NestedNameSpecifier::Super: case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: // Unresolved inside a template. case NestedNameSpecifier::Identifier: return false; diff --git a/clang-tools-extra/clangd/DumpAST.cpp b/clang-tools-extra/clangd/DumpAST.cpp index e605f82e91fe4..584bb1f088380 100644 --- a/clang-tools-extra/clangd/DumpAST.cpp +++ b/clang-tools-extra/clangd/DumpAST.cpp @@ -157,7 +157,6 @@ class DumpVisitor : public RecursiveASTVisitor { NNS_KIND(Identifier); NNS_KIND(Namespace); NNS_KIND(TypeSpec); - NNS_KIND(TypeSpecWithTemplate); NNS_KIND(Global); NNS_KIND(Super); NNS_KIND(NamespaceAlias); diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index bb4c91b831354..62f220b32bd10 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -500,7 +500,6 @@ struct TargetFinder { } return; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: add(QualType(NNS->getAsType(), 0), Flags); return; case NestedNameSpecifier::Global: diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index 7a140c991925c..dff0c711f04c5 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -144,7 +144,6 @@ class ASTWalker : public RecursiveASTVisitor { case NestedNameSpecifier::Global: return true; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::Super: case NestedNameSpecifier::Identifier: return false; diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 75a173a48e67e..c3b64d84a1b1c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -275,6 +275,10 @@ Improvements to Clang's diagnostics - Diagnostics on chained comparisons (``a < b < c``) are now an error by default. This can be disabled with ``-Wno-error=parentheses``. - Clang now better preserves the sugared types of pointers to member. +- Clang now better preserves the presence of the template keyword with dependent + prefixes. 
+- When printing types for diagnostics, clang now doesn't suppress the scopes of + template arguments contained within nested names. - The ``-Wshift-bool`` warning has been added to warn about shifting a boolean. (#GH28334) - Fixed diagnostics adding a trailing ``::`` when printing some source code constructs, like base classes. diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index af8c49e99a7ce..f386282890b5a 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1837,15 +1837,14 @@ class ASTContext : public RefCountedBase { TagDecl *OwnedTagDecl = nullptr) const; QualType getDependentNameType(ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, - const IdentifierInfo *Name, - QualType Canon = QualType()) const; + const IdentifierInfo *Name) const; QualType getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, - const IdentifierInfo *Name, ArrayRef Args) const; + ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, + ArrayRef Args) const; QualType getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, - const IdentifierInfo *Name, ArrayRef Args) const; + ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, + ArrayRef Args, bool IsCanonical = false) const; TemplateArgument getInjectedTemplateArg(NamedDecl *ParamDecl) const; @@ -2393,11 +2392,9 @@ class ASTContext : public RefCountedBase { TemplateName getQualifiedTemplateName(NestedNameSpecifier *NNS, bool TemplateKeyword, TemplateName Template) const; + TemplateName + getDependentTemplateName(const DependentTemplateStorage &Name) const; - TemplateName getDependentTemplateName(NestedNameSpecifier *NNS, - const IdentifierInfo *Name) const; - TemplateName getDependentTemplateName(NestedNameSpecifier *NNS, - OverloadedOperatorKind Operator) const; TemplateName getSubstTemplateTemplateParm(TemplateName replacement, Decl *AssociatedDecl, unsigned Index, diff --git a/clang/include/clang/AST/ASTImporter.h b/clang/include/clang/AST/ASTImporter.h index 8c3fa842ab8b9..a2550716e3c7f 100644 --- a/clang/include/clang/AST/ASTImporter.h +++ b/clang/include/clang/AST/ASTImporter.h @@ -446,6 +446,14 @@ class TypeSourceInfo; /// returns nullptr only if the FromId was nullptr. IdentifierInfo *Import(const IdentifierInfo *FromId); + /// Import the given identifier or overloaded operator from the "from" + /// context into the "to" context. + /// + /// \returns The equivalent identifier or overloaded operator in the "to" + /// context. + IdentifierOrOverloadedOperator + Import(IdentifierOrOverloadedOperator FromIO); + /// Import the given Objective-C selector from the "from" /// context into the "to" context. /// diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h index 83a6b77704f34..f086d8134a64b 100644 --- a/clang/include/clang/AST/ASTNodeTraverser.h +++ b/clang/include/clang/AST/ASTNodeTraverser.h @@ -396,8 +396,7 @@ class ASTNodeTraverser // FIXME: Provide a NestedNameSpecifier visitor. 
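// Note: with this patch a nested-name-specifier type prefix is always plain
// TypeSpec; whether the name was written with the `template` keyword now
// lives on the template name itself (see
// DependentTemplateStorage::hasTemplateKeyword() in TemplateName.h below).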
NestedNameSpecifier *Qualifier = T->getQualifier(); if (NestedNameSpecifier::SpecifierKind K = Qualifier->getKind(); - K == NestedNameSpecifier::TypeSpec || - K == NestedNameSpecifier::TypeSpecWithTemplate) + K == NestedNameSpecifier::TypeSpec) Visit(Qualifier->getAsType()); if (T->isSugared()) Visit(T->getMostRecentCXXRecordDecl()->getTypeForDecl()); diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h index 4b627c65e276b..5ab438715ecf7 100644 --- a/clang/include/clang/AST/AbstractBasicReader.h +++ b/clang/include/clang/AST/AbstractBasicReader.h @@ -279,10 +279,8 @@ class DataStreamBasicReader : public BasicReaderBase { continue; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: cur = NestedNameSpecifier::Create(ctx, cur, - kind == NestedNameSpecifier::TypeSpecWithTemplate, - asImpl().readQualType().getTypePtr()); + asImpl().readQualType().getTypePtr()); continue; case NestedNameSpecifier::Global: diff --git a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h index b941add8bde88..f65d94abc2ff1 100644 --- a/clang/include/clang/AST/AbstractBasicWriter.h +++ b/clang/include/clang/AST/AbstractBasicWriter.h @@ -260,7 +260,6 @@ class DataStreamBasicWriter : public BasicWriterBase { continue; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: asImpl().writeQualType(QualType(NNS->getAsType(), 0)); continue; diff --git a/clang/include/clang/AST/NestedNameSpecifier.h b/clang/include/clang/AST/NestedNameSpecifier.h index 273e73e7c1e95..d7da3272d0943 100644 --- a/clang/include/clang/AST/NestedNameSpecifier.h +++ b/clang/include/clang/AST/NestedNameSpecifier.h @@ -52,8 +52,7 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { enum StoredSpecifierKind { StoredIdentifier = 0, StoredDecl = 1, - StoredTypeSpec = 2, - StoredTypeSpecWithTemplate = 3 + StoredTypeSpec = 2 }; /// The nested name specifier that precedes this nested name @@ -89,10 +88,6 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { /// A type, stored as a Type*. TypeSpec, - /// A type that was preceded by the 'template' keyword, - /// stored as a Type*. - TypeSpecWithTemplate, - /// The global specifier '::'. There is no stored value. Global, @@ -137,9 +132,8 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { const NamespaceAliasDecl *Alias); /// Builds a nested name specifier that names a type. - static NestedNameSpecifier *Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - bool Template, const Type *T); + static NestedNameSpecifier * + Create(const ASTContext &Context, NestedNameSpecifier *Prefix, const Type *T); /// Builds a specifier that consists of just an identifier. /// @@ -194,8 +188,7 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { /// Retrieve the type stored in this nested name specifier. const Type *getAsType() const { - if (Prefix.getInt() == StoredTypeSpec || - Prefix.getInt() == StoredTypeSpecWithTemplate) + if (Prefix.getInt() == StoredTypeSpec) return (const Type *)Specifier; return nullptr; @@ -401,13 +394,10 @@ class NestedNameSpecifierLocBuilder { /// \param Context The AST context in which this nested-name-specifier /// resides. /// - /// \param TemplateKWLoc The location of the 'template' keyword, if present. - /// /// \param TL The TypeLoc that describes the type preceding the '::'. /// /// \param ColonColonLoc The location of the trailing '::'. 
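 /// For example, a call site that previously wrote (names illustrative)
 ///   Builder.Extend(Context, TemplateKWLoc, TL, ColonColonLoc);
 /// now simply writes
 ///   Builder.Extend(Context, TL, ColonColonLoc);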
- void Extend(ASTContext &Context, SourceLocation TemplateKWLoc, TypeLoc TL, - SourceLocation ColonColonLoc); + void Extend(ASTContext &Context, TypeLoc TL, SourceLocation ColonColonLoc); /// Extend the current nested-name-specifier by another /// nested-name-specifier component of the form 'identifier::'. diff --git a/clang/include/clang/AST/ODRHash.h b/clang/include/clang/AST/ODRHash.h index a1caa6d39a87c..a923901b32dc0 100644 --- a/clang/include/clang/AST/ODRHash.h +++ b/clang/include/clang/AST/ODRHash.h @@ -94,6 +94,7 @@ class ODRHash { void AddStmt(const Stmt *S); void AddIdentifierInfo(const IdentifierInfo *II); void AddNestedNameSpecifier(const NestedNameSpecifier *NNS); + void AddDependentTemplateName(const DependentTemplateStorage &Name); void AddTemplateName(TemplateName Name); void AddDeclarationName(DeclarationName Name, bool TreatAsDecl = false); void AddTemplateArgument(TemplateArgument TA); diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 42883b6419261..178308a24e1a0 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -692,25 +692,26 @@ let Class = PropertyTypeCase in { let Class = PropertyTypeCase in { def : ReadHelper<[{ auto dtn = node.getAsDependentTemplateName(); + auto name = dtn->getName(); }]>; def : Property<"qualifier", NestedNameSpecifier> { let Read = [{ dtn->getQualifier() }]; } def : Property<"identifier", Optional> { - let Read = [{ makeOptionalFromPointer( - dtn->isIdentifier() - ? dtn->getIdentifier() - : nullptr) }]; + let Read = [{ makeOptionalFromPointer(name.getIdentifier()) }]; } def : Property<"operatorKind", OverloadedOperatorKind> { let Conditional = [{ !identifier }]; - let Read = [{ dtn->getOperator() }]; + let Read = [{ name.getOperator() }]; + } + def : Property<"HasTemplateKeyword", Bool> { + let Read = [{ dtn->hasTemplateKeyword() }]; } def : Creator<[{ if (identifier) { - return ctx.getDependentTemplateName(qualifier, *identifier); + return ctx.getDependentTemplateName({qualifier, *identifier, HasTemplateKeyword}); } else { - return ctx.getDependentTemplateName(qualifier, *operatorKind); + return ctx.getDependentTemplateName({qualifier, *operatorKind, HasTemplateKeyword}); } }]>; } diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 0d5d515c0e6f7..0530996ed20d3 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -795,7 +795,6 @@ bool RecursiveASTVisitor::TraverseNestedNameSpecifier( return true; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: TRY_TO(TraverseType(QualType(NNS->getAsType(), 0))); } @@ -820,7 +819,6 @@ bool RecursiveASTVisitor::TraverseNestedNameSpecifierLoc( return true; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: TRY_TO(TraverseTypeLoc(NNS.getTypeLoc())); break; } @@ -1172,7 +1170,8 @@ DEF_TRAVERSE_TYPE(DependentNameType, { TRY_TO(TraverseNestedNameSpecifier(T->getQualifier())); }) DEF_TRAVERSE_TYPE(DependentTemplateSpecializationType, { - TRY_TO(TraverseNestedNameSpecifier(T->getQualifier())); + const DependentTemplateStorage &S = T->getDependentTemplateName(); + TRY_TO(TraverseNestedNameSpecifier(S.getQualifier())); TRY_TO(TraverseTemplateArguments(T->template_arguments())); }) diff --git a/clang/include/clang/AST/TemplateName.h b/clang/include/clang/AST/TemplateName.h index ce97f834bfc1d..1a56133b72d6e 100644 --- 
a/clang/include/clang/AST/TemplateName.h
+++ b/clang/include/clang/AST/TemplateName.h
@@ -16,6 +16,7 @@
 #include "clang/AST/DependenceFlags.h"
 #include "clang/AST/NestedNameSpecifier.h"
 #include "clang/Basic/LLVM.h"
+#include "clang/Basic/OperatorKinds.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/PointerUnion.h"
@@ -537,6 +538,35 @@ class QualifiedTemplateName : public llvm::FoldingSetNode {
 }
};

+struct IdentifierOrOverloadedOperator {
+ IdentifierOrOverloadedOperator() = default;
+ IdentifierOrOverloadedOperator(const IdentifierInfo *II);
+ IdentifierOrOverloadedOperator(OverloadedOperatorKind OOK);
+
+ /// Returns the identifier to which this template name refers.
+ const IdentifierInfo *getIdentifier() const {
+ if (getOperator() != OO_None)
+ return nullptr;
+ return reinterpret_cast(PtrOrOp);
+ }
+
+ /// Return the overloaded operator to which this template name refers.
+ OverloadedOperatorKind getOperator() const {
+ uintptr_t OOK = -PtrOrOp;
+ return OOK < NUM_OVERLOADED_OPERATORS ? OverloadedOperatorKind(OOK)
+ : OO_None;
+ }
+
+ void Profile(llvm::FoldingSetNodeID &ID) const;
+
+ bool operator==(const IdentifierOrOverloadedOperator &Other) const {
+ return PtrOrOp == Other.PtrOrOp;
+ };
+
+private:
+ uintptr_t PtrOrOp = 0;
+};
+
/// Represents a dependent template name that cannot be
/// resolved prior to template instantiation.
///
@@ -545,104 +575,53 @@ class QualifiedTemplateName : public llvm::FoldingSetNode {
/// DependentTemplateName can refer to "MetaFun::template apply",
/// where "MetaFun::" is the nested name specifier and "apply" is the
/// template name referenced. The "template" keyword is implied.
-class DependentTemplateName : public llvm::FoldingSetNode {
- friend class ASTContext;
-
+class DependentTemplateStorage {
 /// The nested name specifier that qualifies the template
 /// name.
 ///
 /// The bit stored in this qualifier describes whether the \c Name field
- /// is interpreted as an IdentifierInfo pointer (when clear) or as an
- /// overloaded operator kind (when set).
+ /// was preceded by a template keyword.
 llvm::PointerIntPair Qualifier;

 /// The dependent template name.
- union {
- /// The identifier template name.
- ///
- /// Only valid when the bit on \c Qualifier is clear.
- const IdentifierInfo *Identifier;
-
- /// The overloaded operator name.
- ///
- /// Only valid when the bit on \c Qualifier is set.
- OverloadedOperatorKind Operator;
- };
-
- /// The canonical template name to which this dependent
- /// template name refers.
- ///
- /// The canonical template name for a dependent template name is
- /// another dependent template name whose nested name specifier is
- /// canonical.
- TemplateName CanonicalTemplateName;
-
- DependentTemplateName(NestedNameSpecifier *Qualifier,
- const IdentifierInfo *Identifier)
- : Qualifier(Qualifier, false), Identifier(Identifier),
- CanonicalTemplateName(this) {}
-
- DependentTemplateName(NestedNameSpecifier *Qualifier,
- const IdentifierInfo *Identifier,
- TemplateName Canon)
- : Qualifier(Qualifier, false), Identifier(Identifier),
- CanonicalTemplateName(Canon) {}
-
- DependentTemplateName(NestedNameSpecifier *Qualifier,
- OverloadedOperatorKind Operator)
- : Qualifier(Qualifier, true), Operator(Operator),
- CanonicalTemplateName(this) {}
-
- DependentTemplateName(NestedNameSpecifier *Qualifier,
- OverloadedOperatorKind Operator,
- TemplateName Canon)
- : Qualifier(Qualifier, true), Operator(Operator),
- CanonicalTemplateName(Canon) {}
+ IdentifierOrOverloadedOperator Name;

public:
+ DependentTemplateStorage(NestedNameSpecifier *Qualifier,
+ IdentifierOrOverloadedOperator Name,
+ bool HasTemplateKeyword);
+
 /// Return the nested name specifier that qualifies this name.
 NestedNameSpecifier *getQualifier() const { return Qualifier.getPointer(); }

- /// Determine whether this template name refers to an identifier.
- bool isIdentifier() const { return !Qualifier.getInt(); }
+ IdentifierOrOverloadedOperator getName() const { return Name; }

- /// Returns the identifier to which this template name refers.
- const IdentifierInfo *getIdentifier() const {
- assert(isIdentifier() && "Template name isn't an identifier?");
- return Identifier;
- }
-
- /// Determine whether this template name refers to an overloaded
- /// operator.
- bool isOverloadedOperator() const { return Qualifier.getInt(); }
+ /// Was this template name preceded by the template keyword?
+ bool hasTemplateKeyword() const { return Qualifier.getInt(); }

- /// Return the overloaded operator to which this template name refers.
- OverloadedOperatorKind getOperator() const {
- assert(isOverloadedOperator() &&
- "Template name isn't an overloaded operator?");
- return Operator;
- }
+ TemplateNameDependence getDependence() const;

- void Profile(llvm::FoldingSetNodeID &ID) {
- if (isIdentifier())
- Profile(ID, getQualifier(), getIdentifier());
- else
- Profile(ID, getQualifier(), getOperator());
+ void Profile(llvm::FoldingSetNodeID &ID) const {
+ Profile(ID, getQualifier(), getName(), hasTemplateKeyword());
 }

 static void Profile(llvm::FoldingSetNodeID &ID, NestedNameSpecifier *NNS,
- const IdentifierInfo *Identifier) {
+ IdentifierOrOverloadedOperator Name,
+ bool HasTemplateKeyword) {
 ID.AddPointer(NNS);
- ID.AddBoolean(false);
- ID.AddPointer(Identifier);
+ ID.AddBoolean(HasTemplateKeyword);
+ Name.Profile(ID);
 }

- static void Profile(llvm::FoldingSetNodeID &ID, NestedNameSpecifier *NNS,
- OverloadedOperatorKind Operator) {
- ID.AddPointer(NNS);
- ID.AddBoolean(true);
- ID.AddInteger(Operator);
- }
+ void print(raw_ostream &OS, const PrintingPolicy &Policy) const;
+};
+
+class DependentTemplateName : public DependentTemplateStorage,
+ public llvm::FoldingSetNode {
+ friend class ASTContext;
+ using DependentTemplateStorage::DependentTemplateStorage;
+ DependentTemplateName(const DependentTemplateStorage &S)
+ : DependentTemplateStorage(S) {}
+};

} // namespace clang.
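The PtrOrOp field above packs two cases into a single integer: an identifier is stored as the raw pointer value, while an overloaded operator kind is stored negated, so it occupies the top few values of the unsigned range, where no valid pointer can live. A minimal standalone sketch of that encoding, with a stub enum standing in for Clang's real OverloadedOperatorKind and the rest of the class elided:

```cpp
#include <cassert>
#include <cstdint>

// Stub standing in for clang::OverloadedOperatorKind (illustrative only).
enum OverloadedOperatorKind : unsigned {
  OO_None, OO_New, OO_Plus, NUM_OVERLOADED_OPERATORS
};

struct IdentifierOrOverloadedOperatorSketch {
  uintptr_t PtrOrOp = 0;

  void set(const void *II) { PtrOrOp = reinterpret_cast<uintptr_t>(II); }
  void set(OverloadedOperatorKind OOK) { PtrOrOp = -uintptr_t(OOK); }

  OverloadedOperatorKind getOperator() const {
    // Undo the negation; a stored pointer wraps around to a huge value and
    // falls through to OO_None, mirroring the logic in the patch.
    uintptr_t OOK = -PtrOrOp;
    return OOK < NUM_OVERLOADED_OPERATORS ? OverloadedOperatorKind(OOK)
                                          : OO_None;
  }
};

int main() {
  int Dummy;
  IdentifierOrOverloadedOperatorSketch N;
  N.set(&Dummy);
  assert(N.getOperator() == OO_None); // a pointer decodes as "not an operator"
  N.set(OO_Plus);
  assert(N.getOperator() == OO_Plus); // an operator kind round-trips
}
```

Storing operators negated keeps the common identifier case a plain pointer load while leaving zero as the natural empty state.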
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index a809102c069a8..988362787a452 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -7098,21 +7098,17 @@ class DependentTemplateSpecializationType : public TypeWithKeyword, public llvm::FoldingSetNode { friend class ASTContext; // ASTContext creates these - /// The nested name specifier containing the qualifier. - NestedNameSpecifier *NNS; - - /// The identifier of the template. - const IdentifierInfo *Name; + DependentTemplateStorage Name; DependentTemplateSpecializationType(ElaboratedTypeKeyword Keyword, - NestedNameSpecifier *NNS, - const IdentifierInfo *Name, + const DependentTemplateStorage &Name, ArrayRef Args, QualType Canon); public: - NestedNameSpecifier *getQualifier() const { return NNS; } - const IdentifierInfo *getIdentifier() const { return Name; } + const DependentTemplateStorage &getDependentTemplateName() const { + return Name; + } ArrayRef template_arguments() const { return {reinterpret_cast(this + 1), @@ -7123,14 +7119,12 @@ class DependentTemplateSpecializationType : public TypeWithKeyword, QualType desugar() const { return QualType(this, 0); } void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context) { - Profile(ID, Context, getKeyword(), NNS, Name, template_arguments()); + Profile(ID, Context, getKeyword(), Name, template_arguments()); } - static void Profile(llvm::FoldingSetNodeID &ID, - const ASTContext &Context, + static void Profile(llvm::FoldingSetNodeID &ID, const ASTContext &Context, ElaboratedTypeKeyword Keyword, - NestedNameSpecifier *Qualifier, - const IdentifierInfo *Name, + const DependentTemplateStorage &Name, ArrayRef Args); static bool classof(const Type *T) { diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index 17ce09fa5da4f..92661b8b13fe0 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -2502,8 +2502,9 @@ class DependentTemplateSpecializationTypeLoc : if (!getLocalData()->QualifierData) return NestedNameSpecifierLoc(); - return NestedNameSpecifierLoc(getTypePtr()->getQualifier(), - getLocalData()->QualifierData); + return NestedNameSpecifierLoc( + getTypePtr()->getDependentTemplateName().getQualifier(), + getLocalData()->QualifierData); } void setQualifierLoc(NestedNameSpecifierLoc QualifierLoc) { @@ -2516,8 +2517,8 @@ class DependentTemplateSpecializationTypeLoc : return; } - assert(QualifierLoc.getNestedNameSpecifier() - == getTypePtr()->getQualifier() && + assert(QualifierLoc.getNestedNameSpecifier() == + getTypePtr()->getDependentTemplateName().getQualifier() && "Inconsistent nested-name-specifier pointer"); getLocalData()->QualifierData = QualifierLoc.getOpaqueData(); } diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 27f71bf5cc62f..10eb40dc90ad4 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -774,22 +774,37 @@ let Class = TemplateSpecializationType in { } let Class = DependentTemplateSpecializationType in { - def : Property<"keyword", ElaboratedTypeKeyword> { - let Read = [{ node->getKeyword() }]; - } + def : ReadHelper<[{ + const auto &dtn = node->getDependentTemplateName(); + auto name = dtn.getName(); + }]>; + def : Property<"qualifier", NestedNameSpecifier> { - let Read = [{ node->getQualifier() }]; + let Read = [{ dtn.getQualifier() }]; + } + def : Property<"identifier", Optional> { + let Read = [{ 
makeOptionalFromPointer(name.getIdentifier()) }]; } - def : Property<"name", Identifier> { - let Read = [{ node->getIdentifier() }]; + def : Property<"operatorKind", OverloadedOperatorKind> { + let Conditional = [{ !identifier }]; + let Read = [{ name.getOperator() }]; + } + def : Property<"HasTemplateKeyword", Bool> { + let Read = [{ dtn.hasTemplateKeyword() }]; + } + + def : Property<"keyword", ElaboratedTypeKeyword> { + let Read = [{ node->getKeyword() }]; } def : Property<"templateArguments", Array> { let Read = [{ node->template_arguments() }]; } def : Creator<[{ - return ctx.getDependentTemplateSpecializationType(keyword, qualifier, - name, templateArguments); + DependentTemplateStorage S(qualifier, identifier ? IdentifierOrOverloadedOperator(*identifier) : + IdentifierOrOverloadedOperator(*operatorKind), + HasTemplateKeyword); + return ctx.getDependentTemplateSpecializationType(keyword, S, templateArguments); }]>; } @@ -926,22 +941,10 @@ let Class = DependentNameType in { def : Property<"qualifier", NestedNameSpecifier> { let Read = [{ node->getQualifier() }]; } - def : Property<"name", Identifier> { - let Read = [{ node->getIdentifier() }]; - } - def : Property<"underlyingType", Optional> { - let Read = [{ - node->isCanonicalUnqualified() - ? std::nullopt - : std::optional(node->getCanonicalTypeInternal()) - }]; - } + def : Property<"name", Identifier> { let Read = [{ node->getIdentifier() }]; } def : Creator<[{ - QualType canon = (underlyingType - ? ctx.getCanonicalType(*underlyingType) - : QualType()); - return ctx.getDependentNameType(keyword, qualifier, name, canon); + return ctx.getDependentNameType(keyword, qualifier, name); }]>; } diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 5f5df3a45d41d..6c4a32c4ac2f0 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -107,8 +107,7 @@ class CXXScopeSpec { /// \param TL The TypeLoc that describes the type preceding the '::'. /// /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, SourceLocation TemplateKWLoc, TypeLoc TL, - SourceLocation ColonColonLoc); + void Extend(ASTContext &Context, TypeLoc TL, SourceLocation ColonColonLoc); /// Extend the current nested-name-specifier by another /// nested-name-specifier component of the form 'identifier::'. 
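The ASTContext.cpp diff that follows reworks getDependentNameType and getDependentTemplateSpecializationType around clang's usual FoldingSet uniquing pattern: profile the key, probe the set, build the canonical variant first if one is needed (which can itself insert nodes and invalidate the insert position, hence the re-probe and the "broken canonicalization" asserts), then allocate and insert. A minimal sketch of that pattern follows; it is not part of the patch, and Node, Key, and getOrCreate are hypothetical stand-ins for the AST node, its profile data, and the ASTContext getter:

#include "llvm/ADT/FoldingSet.h"

// A hypothetical uniqued node keyed by a single integer.
struct Node : llvm::FoldingSetNode {
  int Key;
  explicit Node(int K) : Key(K) {}
  void Profile(llvm::FoldingSetNodeID &ID) { ID.AddInteger(Key); }
};

Node *getOrCreate(llvm::FoldingSet<Node> &Set, int Key) {
  llvm::FoldingSetNodeID ID;
  ID.AddInteger(Key);
  void *InsertPos = nullptr;
  // First probe: if this key was already uniqued, hand back the existing node.
  if (Node *N = Set.FindNodeOrInsertPos(ID, InsertPos))
    return N;
  // The real getters build the canonical variant at this point; since that
  // recursion can touch the set, they probe again and assert the node is
  // still absent before inserting. ASTContext allocates nodes from its
  // arena; plain `new` stands in for that here.
  Node *N = new Node(Key);
  Set.InsertNode(N, InsertPos);
  return N;
}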
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 2d9480ebcf00c..089d01839e1cf 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -4033,7 +4033,6 @@ QualType ASTContext::getMemberPointerType(QualType T, if (!Qualifier) { assert(Cls && "At least one of Qualifier or Cls must be provided"); Qualifier = NestedNameSpecifier::Create(*this, /*Prefix=*/nullptr, - /*Template=*/false, getTypeDeclType(Cls).getTypePtr()); } else if (!Cls) { Cls = Qualifier->getAsRecordDecl(); @@ -4052,8 +4051,7 @@ QualType ASTContext::getMemberPointerType(QualType T, if (!Cls) return getCanonicalNestedNameSpecifier(Qualifier); NestedNameSpecifier *R = NestedNameSpecifier::Create( - *this, /*Prefix=*/nullptr, /*Template=*/false, - Cls->getCanonicalDecl()->getTypeForDecl()); + *this, /*Prefix=*/nullptr, Cls->getCanonicalDecl()->getTypeForDecl()); assert(R == getCanonicalNestedNameSpecifier(R)); return R; }(); @@ -5739,24 +5737,26 @@ ASTContext::getMacroQualifiedType(QualType UnderlyingTy, QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, - const IdentifierInfo *Name, - QualType Canon) const { - if (Canon.isNull()) { - NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); - if (CanonNNS != NNS) - Canon = getDependentNameType(Keyword, CanonNNS, Name); - } - + const IdentifierInfo *Name) const { llvm::FoldingSetNodeID ID; DependentNameType::Profile(ID, Keyword, NNS, Name); void *InsertPos = nullptr; - DependentNameType *T - = DependentNameTypes.FindNodeOrInsertPos(ID, InsertPos); - if (T) + if (DependentNameType *T = + DependentNameTypes.FindNodeOrInsertPos(ID, InsertPos)) return QualType(T, 0); - T = new (*this, alignof(DependentNameType)) + QualType Canon; + if (NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); + CanonNNS != NNS) { + Canon = getDependentNameType(Keyword, CanonNNS, Name); + [[maybe_unused]] DependentNameType *T = + DependentNameTypes.FindNodeOrInsertPos(ID, InsertPos); + assert(!T && "broken canonicalization"); + assert(Canon.isCanonical()); + } + + DependentNameType *T = new (*this, alignof(DependentNameType)) DependentNameType(Keyword, NNS, Name, Canon); Types.push_back(T); DependentNameTypes.InsertNode(T, InsertPos); @@ -5764,61 +5764,63 @@ QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword, } QualType ASTContext::getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, - const IdentifierInfo *Name, ArrayRef Args) const { + ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, + ArrayRef Args) const { // TODO: avoid this copy SmallVector ArgCopy; for (unsigned I = 0, E = Args.size(); I != E; ++I) ArgCopy.push_back(Args[I].getArgument()); - return getDependentTemplateSpecializationType(Keyword, NNS, Name, ArgCopy); + return getDependentTemplateSpecializationType(Keyword, Name, ArgCopy); } -QualType -ASTContext::getDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, - NestedNameSpecifier *NNS, - const IdentifierInfo *Name, - ArrayRef Args) const { - assert((!NNS || NNS->isDependent()) && - "nested-name-specifier must be dependent"); - +QualType ASTContext::getDependentTemplateSpecializationType( + ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, + ArrayRef Args, bool IsCanonical) const { llvm::FoldingSetNodeID ID; - DependentTemplateSpecializationType::Profile(ID, *this, Keyword, NNS, - Name, Args); + DependentTemplateSpecializationType::Profile(ID, 
*this, Keyword, Name, Args); void *InsertPos = nullptr; - DependentTemplateSpecializationType *T - = DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos); - if (T) + if (auto *T = DependentTemplateSpecializationTypes.FindNodeOrInsertPos( + ID, InsertPos)) return QualType(T, 0); - NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); - - ElaboratedTypeKeyword CanonKeyword = Keyword; - if (Keyword == ElaboratedTypeKeyword::None) - CanonKeyword = ElaboratedTypeKeyword::Typename; - - bool AnyNonCanonArgs = false; - auto CanonArgs = - ::getCanonicalTemplateArguments(*this, Args, AnyNonCanonArgs); + NestedNameSpecifier *NNS = Name.getQualifier(); QualType Canon; - if (AnyNonCanonArgs || CanonNNS != NNS || CanonKeyword != Keyword) { - Canon = getDependentTemplateSpecializationType(CanonKeyword, CanonNNS, - Name, - CanonArgs); - - // Find the insert position again. - [[maybe_unused]] auto *Nothing = - DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos); - assert(!Nothing && "canonical type broken"); + if (!IsCanonical) { + ElaboratedTypeKeyword CanonKeyword = Keyword != ElaboratedTypeKeyword::None + ? Keyword + : ElaboratedTypeKeyword::Typename; + NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); + bool AnyNonCanonArgs = false; + auto CanonArgs = + ::getCanonicalTemplateArguments(*this, Args, AnyNonCanonArgs); + + if (AnyNonCanonArgs || CanonNNS != NNS || !Name.hasTemplateKeyword() || + CanonKeyword != Keyword) { + Canon = getDependentTemplateSpecializationType( + CanonKeyword, {CanonNNS, Name.getName(), /*HasTemplateKeyword=*/true}, + CanonArgs, /*IsCanonical=*/true); + // Find the insert position again. + [[maybe_unused]] auto *Nothing = + DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, + InsertPos); + assert(!Nothing && "canonical type broken"); + } + } else { + assert(Keyword != ElaboratedTypeKeyword::None); + assert(Name.hasTemplateKeyword()); + assert(NNS == getCanonicalNestedNameSpecifier(NNS)); +#ifndef NDEBUG + for (const auto &Arg : Args) + assert(Arg.structurallyEquals(getCanonicalTemplateArgument(Arg))); +#endif } - void *Mem = Allocate((sizeof(DependentTemplateSpecializationType) + sizeof(TemplateArgument) * Args.size()), alignof(DependentTemplateSpecializationType)); - T = new (Mem) DependentTemplateSpecializationType(Keyword, NNS, - Name, Args, Canon); + auto *T = + new (Mem) DependentTemplateSpecializationType(Keyword, Name, Args, Canon); Types.push_back(T); DependentTemplateSpecializationTypes.InsertNode(T, InsertPos); return QualType(T, 0); @@ -6916,12 +6918,13 @@ ASTContext::getNameForTemplate(TemplateName Name, case TemplateName::DependentTemplate: { DependentTemplateName *DTN = Name.getAsDependentTemplateName(); + IdentifierOrOverloadedOperator TN = DTN->getName(); DeclarationName DName; - if (DTN->isIdentifier()) { - DName = DeclarationNames.getIdentifier(DTN->getIdentifier()); + if (const IdentifierInfo *II = TN.getIdentifier()) { + DName = DeclarationNames.getIdentifier(II); return DeclarationNameInfo(DName, NameLoc); } else { - DName = DeclarationNames.getCXXOperatorName(DTN->getOperator()); + DName = DeclarationNames.getCXXOperatorName(TN.getOperator()); // DNInfo work in progress: FIXME: source locations? 
DeclarationNameLoc DNLoc = DeclarationNameLoc::makeCXXOperatorNameLoc(SourceRange()); @@ -6996,7 +6999,13 @@ TemplateName ASTContext::getCanonicalTemplateName(TemplateName Name, case TemplateName::DependentTemplate: { DependentTemplateName *DTN = Name.getAsDependentTemplateName(); assert(DTN && "Non-dependent template names must refer to template decls."); - return DTN->CanonicalTemplateName; + NestedNameSpecifier *Qualifier = DTN->getQualifier(); + NestedNameSpecifier *CanonQualifier = + getCanonicalNestedNameSpecifier(Qualifier); + if (Qualifier != CanonQualifier || !DTN->hasTemplateKeyword()) + return getDependentTemplateName({CanonQualifier, DTN->getName(), + /*HasTemplateKeyword=*/true}); + return Name; } case TemplateName::SubstTemplateTemplateParmPack: { @@ -7229,7 +7238,6 @@ static bool isSameQualifier(const NestedNameSpecifier *X, // We've already checked that we named the same namespace. break; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: if (X->getAsType()->getCanonicalTypeInternal() != Y->getAsType()->getCanonicalTypeInternal()) return false; @@ -7608,8 +7616,7 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { // The difference between TypeSpec and TypeSpecWithTemplate is that the // latter will have the 'template' keyword when printed. - case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: { + case NestedNameSpecifier::TypeSpec: { const Type *T = getCanonicalType(NNS->getAsType()); // If we have some kind of dependent-named type (e.g., "typename T::type"), @@ -7622,11 +7629,19 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { if (const auto *DNT = T->getAs()) return NestedNameSpecifier::Create(*this, DNT->getQualifier(), DNT->getIdentifier()); - if (const auto *DTST = T->getAs()) - return NestedNameSpecifier::Create(*this, DTST->getQualifier(), true, T); - - // TODO: Set 'Template' parameter to true for other template types. - return NestedNameSpecifier::Create(*this, nullptr, false, T); + if (const auto *DTST = T->getAs()) { + const DependentTemplateStorage &DTN = DTST->getDependentTemplateName(); + QualType NewT = getDependentTemplateSpecializationType( + ElaboratedTypeKeyword::Typename, + {/*NNS=*/nullptr, DTN.getName(), /*HasTemplateKeyword=*/true}, + DTST->template_arguments(), /*IsCanonical=*/true); + assert(NewT.isCanonical()); + NestedNameSpecifier *Prefix = DTN.getQualifier(); + if (!Prefix) + Prefix = getCanonicalNestedNameSpecifier(NNS->getPrefix()); + return NestedNameSpecifier::Create(*this, Prefix, NewT.getTypePtr()); + } + return NestedNameSpecifier::Create(*this, nullptr, T); } case NestedNameSpecifier::Global: @@ -10056,75 +10071,20 @@ TemplateName ASTContext::getQualifiedTemplateName(NestedNameSpecifier *NNS, return TemplateName(QTN); } -/// Retrieve the template name that represents a dependent -/// template name such as \c MetaFun::template apply. 
-TemplateName -ASTContext::getDependentTemplateName(NestedNameSpecifier *NNS, - const IdentifierInfo *Name) const { - assert((!NNS || NNS->isDependent()) && - "Nested name specifier must be dependent"); - - llvm::FoldingSetNodeID ID; - DependentTemplateName::Profile(ID, NNS, Name); - - void *InsertPos = nullptr; - DependentTemplateName *QTN = - DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos); - - if (QTN) - return TemplateName(QTN); - - NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); - if (CanonNNS == NNS) { - QTN = new (*this, alignof(DependentTemplateName)) - DependentTemplateName(NNS, Name); - } else { - TemplateName Canon = getDependentTemplateName(CanonNNS, Name); - QTN = new (*this, alignof(DependentTemplateName)) - DependentTemplateName(NNS, Name, Canon); - DependentTemplateName *CheckQTN = - DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos); - assert(!CheckQTN && "Dependent type name canonicalization broken"); - (void)CheckQTN; - } - - DependentTemplateNames.InsertNode(QTN, InsertPos); - return TemplateName(QTN); -} - /// Retrieve the template name that represents a dependent /// template name such as \c MetaFun::template operator+. TemplateName -ASTContext::getDependentTemplateName(NestedNameSpecifier *NNS, - OverloadedOperatorKind Operator) const { - assert((!NNS || NNS->isDependent()) && - "Nested name specifier must be dependent"); - +ASTContext::getDependentTemplateName(const DependentTemplateStorage &S) const { llvm::FoldingSetNodeID ID; - DependentTemplateName::Profile(ID, NNS, Operator); + S.Profile(ID); void *InsertPos = nullptr; - DependentTemplateName *QTN - = DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos); - - if (QTN) + if (DependentTemplateName *QTN = + DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos)) return TemplateName(QTN); - NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS); - if (CanonNNS == NNS) { - QTN = new (*this, alignof(DependentTemplateName)) - DependentTemplateName(NNS, Operator); - } else { - TemplateName Canon = getDependentTemplateName(CanonNNS, Operator); - QTN = new (*this, alignof(DependentTemplateName)) - DependentTemplateName(NNS, Operator, Canon); - - DependentTemplateName *CheckQTN - = DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos); - assert(!CheckQTN && "Dependent template name canonicalization broken"); - (void)CheckQTN; - } - + DependentTemplateName *QTN = + new (*this, alignof(DependentTemplateName)) DependentTemplateName(S); DependentTemplateNames.InsertNode(QTN, InsertPos); return TemplateName(QTN); } @@ -13543,19 +13503,12 @@ static NestedNameSpecifier *getCommonNNS(ASTContext &Ctx, R = NestedNameSpecifier::Create(Ctx, P, ::getCommonDeclChecked(N1, N2)); break; } - case NestedNameSpecifier::SpecifierKind::TypeSpec: - case NestedNameSpecifier::SpecifierKind::TypeSpecWithTemplate: { + case NestedNameSpecifier::SpecifierKind::TypeSpec: { // FIXME: See comment below, on Super case. if (K2 == NestedNameSpecifier::SpecifierKind::Super) return Ctx.getCanonicalNestedNameSpecifier(NNS1); - assert(K2 == NestedNameSpecifier::SpecifierKind::TypeSpec || - K2 == NestedNameSpecifier::SpecifierKind::TypeSpecWithTemplate); - - // Only keep the template keyword if both sides have it. 
- bool Template = - K1 == NestedNameSpecifier::SpecifierKind::TypeSpecWithTemplate && - K2 == NestedNameSpecifier::SpecifierKind::TypeSpecWithTemplate; + assert(K2 == NestedNameSpecifier::SpecifierKind::TypeSpec); const Type *T1 = NNS1->getAsType(), *T2 = NNS2->getAsType(); if (T1 == T2) { @@ -13569,13 +13522,12 @@ static NestedNameSpecifier *getCommonNNS(ASTContext &Ctx, bool IsSame = isa(T1); NestedNameSpecifier *P = ::getCommonNNS(Ctx, NNS1->getPrefix(), NNS2->getPrefix(), IsSame); - R = NestedNameSpecifier::Create(Ctx, P, Template, T1); + R = NestedNameSpecifier::Create(Ctx, P, T1); break; } // TODO: Try to salvage the original prefix. // If getCommonSugaredType removed any top level sugar, the original prefix // is not applicable anymore. - NestedNameSpecifier *P = nullptr; const Type *T = Ctx.getCommonSugaredType(QualType(T1, 0), QualType(T2, 0), /*Unqualified=*/true) .getTypePtr(); @@ -13585,7 +13537,7 @@ static NestedNameSpecifier *getCommonNNS(ASTContext &Ctx, case Type::Elaborated: { // An ElaboratedType is stripped off, it's Qualifier becomes the prefix. auto *ET = cast(T); - R = NestedNameSpecifier::Create(Ctx, ET->getQualifier(), Template, + R = NestedNameSpecifier::Create(Ctx, ET->getQualifier(), ET->getNamedType().getTypePtr()); break; } @@ -13600,16 +13552,17 @@ static NestedNameSpecifier *getCommonNNS(ASTContext &Ctx, // A DependentTemplateSpecializationType loses it's Qualifier, which // is turned into the prefix. auto *DTST = cast(T); - T = Ctx.getDependentTemplateSpecializationType( - DTST->getKeyword(), /*NNS=*/nullptr, DTST->getIdentifier(), - DTST->template_arguments()) + const DependentTemplateStorage &DTN = DTST->getDependentTemplateName(); + DependentTemplateStorage NewDTN(/*Qualifier=*/nullptr, DTN.getName(), + DTN.hasTemplateKeyword()); + T = Ctx.getDependentTemplateSpecializationType(DTST->getKeyword(), NewDTN, + DTST->template_arguments()) .getTypePtr(); - P = DTST->getQualifier(); - R = NestedNameSpecifier::Create(Ctx, DTST->getQualifier(), Template, T); + R = NestedNameSpecifier::Create(Ctx, DTN.getQualifier(), T); break; } default: - R = NestedNameSpecifier::Create(Ctx, P, Template, T); + R = NestedNameSpecifier::Create(Ctx, /*Prefix=*/nullptr, T); break; } break; @@ -14052,19 +14005,22 @@ static QualType getCommonNonSugarTypeNode(ASTContext &Ctx, const Type *X, assert(NX->getIdentifier() == NY->getIdentifier()); return Ctx.getDependentNameType( getCommonTypeKeyword(NX, NY), - getCommonQualifier(Ctx, NX, NY, /*IsSame=*/true), NX->getIdentifier(), - NX->getCanonicalTypeInternal()); + getCommonQualifier(Ctx, NX, NY, /*IsSame=*/true), NX->getIdentifier()); } case Type::DependentTemplateSpecialization: { const auto *TX = cast(X), *TY = cast(Y); - assert(TX->getIdentifier() == TY->getIdentifier()); auto As = getCommonTemplateArguments(Ctx, TX->template_arguments(), TY->template_arguments()); + const DependentTemplateStorage &SX = TX->getDependentTemplateName(), + &SY = TY->getDependentTemplateName(); + assert(SX.getName() == SY.getName()); + DependentTemplateStorage Name( + getCommonNNS(Ctx, SX.getQualifier(), SY.getQualifier(), + /*IsSame=*/true), + SX.getName(), SX.hasTemplateKeyword() || SY.hasTemplateKeyword()); return Ctx.getDependentTemplateSpecializationType( - getCommonTypeKeyword(TX, TY), - getCommonQualifier(Ctx, TX, TY, /*IsSame=*/true), TX->getIdentifier(), - As); + getCommonTypeKeyword(TX, TY), Name, As); } case Type::UnaryTransform: { const auto *TX = cast(X), diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 
1db30b3f3f76f..9a84e402e3d69 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1707,11 +1707,10 @@ ASTNodeImporter::VisitPackExpansionType(const PackExpansionType *T) { ExpectedType ASTNodeImporter::VisitDependentTemplateSpecializationType( const DependentTemplateSpecializationType *T) { - auto ToQualifierOrErr = import(T->getQualifier()); - if (!ToQualifierOrErr) - return ToQualifierOrErr.takeError(); - - IdentifierInfo *ToName = Importer.Import(T->getIdentifier()); + const DependentTemplateStorage &DTN = T->getDependentTemplateName(); + auto QualifierOrErr = import(DTN.getQualifier()); + if (!QualifierOrErr) + return QualifierOrErr.takeError(); SmallVector ToPack; ToPack.reserve(T->template_arguments().size()); @@ -1719,7 +1718,10 @@ ExpectedType ASTNodeImporter::VisitDependentTemplateSpecializationType( return std::move(Err); return Importer.getToContext().getDependentTemplateSpecializationType( - T->getKeyword(), *ToQualifierOrErr, ToName, ToPack); + T->getKeyword(), + {*QualifierOrErr, Importer.Import(DTN.getName()), + DTN.hasTemplateKeyword()}, + ToPack); } ExpectedType @@ -1729,18 +1731,8 @@ ASTNodeImporter::VisitDependentNameType(const DependentNameType *T) { return ToQualifierOrErr.takeError(); IdentifierInfo *Name = Importer.Import(T->getIdentifier()); - - QualType Canon; - if (T != T->getCanonicalTypeInternal().getTypePtr()) { - if (ExpectedType TyOrErr = import(T->getCanonicalTypeInternal())) - Canon = (*TyOrErr).getCanonicalType(); - else - return TyOrErr.takeError(); - } - return Importer.getToContext().getDependentNameType(T->getKeyword(), - *ToQualifierOrErr, - Name, Canon); + *ToQualifierOrErr, Name); } ExpectedType @@ -9788,12 +9780,8 @@ ASTImporter::Import(NestedNameSpecifier *FromNNS) { return RDOrErr.takeError(); case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: if (ExpectedTypePtr TyOrErr = Import(FromNNS->getAsType())) { - bool TSTemplate = - FromNNS->getKind() == NestedNameSpecifier::TypeSpecWithTemplate; - return NestedNameSpecifier::Create(ToContext, Prefix, TSTemplate, - *TyOrErr); + return NestedNameSpecifier::Create(ToContext, Prefix, *TyOrErr); } else { return TyOrErr.takeError(); } @@ -9851,21 +9839,13 @@ ASTImporter::Import(NestedNameSpecifierLoc FromNNS) { ToLocalBeginLoc, ToLocalEndLoc); break; - case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: { + case NestedNameSpecifier::TypeSpec: { SourceLocation ToTLoc; if (Error Err = importInto(ToTLoc, NNS.getTypeLoc().getBeginLoc())) return std::move(Err); TypeSourceInfo *TSI = getToContext().getTrivialTypeSourceInfo( - QualType(Spec->getAsType(), 0), ToTLoc); - if (Kind == NestedNameSpecifier::TypeSpecWithTemplate) - // ToLocalBeginLoc is here the location of the 'template' keyword. - Builder.Extend(getToContext(), ToLocalBeginLoc, TSI->getTypeLoc(), - ToLocalEndLoc); - else - // No location for 'template' keyword here. 
- Builder.Extend(getToContext(), SourceLocation{}, TSI->getTypeLoc(), - ToLocalEndLoc); + QualType(Spec->getAsType(), 0), ToTLoc); + Builder.Extend(getToContext(), TSI->getTypeLoc(), ToLocalEndLoc); break; } @@ -9934,14 +9914,8 @@ Expected ASTImporter::Import(TemplateName From) { auto QualifierOrErr = Import(DTN->getQualifier()); if (!QualifierOrErr) return QualifierOrErr.takeError(); - - if (DTN->isIdentifier()) { - return ToContext.getDependentTemplateName(*QualifierOrErr, - Import(DTN->getIdentifier())); - } - - return ToContext.getDependentTemplateName(*QualifierOrErr, - DTN->getOperator()); + return ToContext.getDependentTemplateName( + {*QualifierOrErr, Import(DTN->getName()), DTN->hasTemplateKeyword()}); } case TemplateName::SubstTemplateTemplateParm: { @@ -10312,6 +10286,13 @@ IdentifierInfo *ASTImporter::Import(const IdentifierInfo *FromId) { return ToId; } +IdentifierOrOverloadedOperator +ASTImporter::Import(IdentifierOrOverloadedOperator FromIO) { + if (const IdentifierInfo *FromII = FromIO.getIdentifier()) + return Import(FromII); + return FromIO.getOperator(); +} + Expected ASTImporter::Import(Selector FromSel) { if (FromSel.isNull()) return Selector{}; diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index a4349bdaaf682..c769722521d9c 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -566,7 +566,6 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return IsStructurallyEquivalent(Context, NNS1->getAsNamespaceAlias(), NNS2->getAsNamespaceAlias()); case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: return IsStructurallyEquivalent(Context, QualType(NNS1->getAsType(), 0), QualType(NNS2->getAsType(), 0)); case NestedNameSpecifier::Global: @@ -578,6 +577,19 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return false; } +static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, + const DependentTemplateStorage &S1, + const DependentTemplateStorage &S2) { + if (!IsStructurallyEquivalent(Context, S1.getQualifier(), S2.getQualifier())) + return false; + + IdentifierOrOverloadedOperator IO1 = S1.getName(), IO2 = S2.getName(); + const IdentifierInfo *II1 = IO1.getIdentifier(), *II2 = IO2.getIdentifier(); + if (!II1 || !II2) + return IO1.getOperator() == IO2.getOperator(); + return IsStructurallyEquivalent(II1, II2); +} + static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, const TemplateName &N1, const TemplateName &N2) { @@ -614,19 +626,9 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return TN1->getDeclName() == TN2->getDeclName(); } - case TemplateName::DependentTemplate: { - DependentTemplateName *DN1 = N1.getAsDependentTemplateName(), - *DN2 = N2.getAsDependentTemplateName(); - if (!IsStructurallyEquivalent(Context, DN1->getQualifier(), - DN2->getQualifier())) - return false; - if (DN1->isIdentifier() && DN2->isIdentifier()) - return IsStructurallyEquivalent(DN1->getIdentifier(), - DN2->getIdentifier()); - else if (DN1->isOverloadedOperator() && DN2->isOverloadedOperator()) - return DN1->getOperator() == DN2->getOperator(); - return false; - } + case TemplateName::DependentTemplate: + return IsStructurallyEquivalent(Context, *N1.getAsDependentTemplateName(), + *N2.getAsDependentTemplateName()); case TemplateName::SubstTemplateTemplateParmPack: { SubstTemplateTemplateParmPackStorage @@ -1315,11 +1317,10 @@ 
static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, case Type::DependentTemplateSpecialization: { const auto *Spec1 = cast(T1); const auto *Spec2 = cast(T2); - if (!IsStructurallyEquivalent(Context, Spec1->getQualifier(), - Spec2->getQualifier())) + if (Spec1->getKeyword() != Spec2->getKeyword()) return false; - if (!IsStructurallyEquivalent(Spec1->getIdentifier(), - Spec2->getIdentifier())) + if (!IsStructurallyEquivalent(Context, Spec1->getDependentTemplateName(), + Spec2->getDependentTemplateName())) return false; if (!IsStructurallyEquivalent(Context, Spec1->template_arguments(), Spec2->template_arguments())) diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 49a04861ae25d..b81981606866a 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1327,7 +1327,7 @@ void CXXNameMangler::manglePrefix(QualType type) { type->getAs()) { if (!mangleSubstitution(QualType(DTST, 0))) { TemplateName Template = getASTContext().getDependentTemplateName( - DTST->getQualifier(), DTST->getIdentifier()); + DTST->getDependentTemplateName()); mangleTemplatePrefix(Template); // FIXME: GCC does not appear to mangle the template arguments when @@ -1395,8 +1395,7 @@ void CXXNameMangler::mangleUnresolvedPrefix(NestedNameSpecifier *qualifier, mangleSourceNameWithAbiTags(qualifier->getAsNamespaceAlias()); break; - case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: { + case NestedNameSpecifier::TypeSpec: { const Type *type = qualifier->getAsType(); // We only want to use an unresolved-type encoding if this is one of: @@ -2181,7 +2180,17 @@ void CXXNameMangler::manglePrefix(NestedNameSpecifier *qualifier) { return; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: + if (NestedNameSpecifier *Prefix = qualifier->getPrefix()) { + const auto *DTST = + cast(qualifier->getAsType()); + QualType NewT = getASTContext().getDependentTemplateSpecializationType( + DTST->getKeyword(), + {Prefix, DTST->getDependentTemplateName().getName(), + /*HasTemplateKeyword=*/true}, + DTST->template_arguments(), /*IsCanonical=*/true); + manglePrefix(NewT); + return; + } manglePrefix(QualType(qualifier->getAsType(), 0)); return; @@ -2265,10 +2274,11 @@ void CXXNameMangler::mangleTemplatePrefix(TemplateName Template) { if (Clang11Compat && mangleSubstitution(Template)) return; - if (const IdentifierInfo *Id = Dependent->getIdentifier()) + if (IdentifierOrOverloadedOperator Name = Dependent->getName(); + const IdentifierInfo *Id = Name.getIdentifier()) mangleSourceName(Id); else - mangleOperatorName(Dependent->getOperator(), UnknownArity); + mangleOperatorName(Name.getOperator(), UnknownArity); addSubstitution(Template); } @@ -2376,12 +2386,13 @@ void CXXNameMangler::mangleType(TemplateName TN) { case TemplateName::DependentTemplate: { const DependentTemplateName *Dependent = TN.getAsDependentTemplateName(); - assert(Dependent->isIdentifier()); + const IdentifierInfo *II = Dependent->getName().getIdentifier(); + assert(II); // ::= // ::= mangleUnresolvedPrefix(Dependent->getQualifier()); - mangleSourceName(Dependent->getIdentifier()); + mangleSourceName(II); break; } @@ -2572,8 +2583,8 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, const DependentTemplateSpecializationType *DTST = cast(Ty); TemplateName Template = getASTContext().getDependentTemplateName( - DTST->getQualifier(), DTST->getIdentifier()); - mangleSourceName(DTST->getIdentifier()); + 
DTST->getDependentTemplateName()); + mangleTemplatePrefix(Template); mangleTemplateArgs(Template, DTST->template_arguments()); break; } @@ -4481,10 +4492,8 @@ void CXXNameMangler::mangleType(const DependentTemplateSpecializationType *T) { // Dependently-scoped template types are nested if they have a prefix. Out << 'N'; - // TODO: avoid making this TemplateName. TemplateName Prefix = - getASTContext().getDependentTemplateName(T->getQualifier(), - T->getIdentifier()); + getASTContext().getDependentTemplateName(T->getDependentTemplateName()); mangleTemplatePrefix(Prefix); // FIXME: GCC does not appear to mangle the template arguments when diff --git a/clang/lib/AST/NestedNameSpecifier.cpp b/clang/lib/AST/NestedNameSpecifier.cpp index d3195e6487f0b..51aa2d69d0f0d 100644 --- a/clang/lib/AST/NestedNameSpecifier.cpp +++ b/clang/lib/AST/NestedNameSpecifier.cpp @@ -98,14 +98,13 @@ NestedNameSpecifier::Create(const ASTContext &Context, return FindOrInsert(Context, Mockup); } -NestedNameSpecifier * -NestedNameSpecifier::Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - bool Template, const Type *T) { +NestedNameSpecifier *NestedNameSpecifier::Create(const ASTContext &Context, + NestedNameSpecifier *Prefix, + const Type *T) { assert(T && "Type cannot be NULL"); NestedNameSpecifier Mockup; Mockup.Prefix.setPointer(Prefix); - Mockup.Prefix.setInt(Template? StoredTypeSpecWithTemplate : StoredTypeSpec); + Mockup.Prefix.setInt(StoredTypeSpec); Mockup.Specifier = const_cast(T); return FindOrInsert(Context, Mockup); } @@ -155,9 +154,6 @@ NestedNameSpecifier::SpecifierKind NestedNameSpecifier::getKind() const { case StoredTypeSpec: return TypeSpec; - - case StoredTypeSpecWithTemplate: - return TypeSpecWithTemplate; } llvm_unreachable("Invalid NNS Kind!"); @@ -189,7 +185,6 @@ CXXRecordDecl *NestedNameSpecifier::getAsRecordDecl() const { return dyn_cast(static_cast(Specifier)); case StoredTypeSpec: - case StoredTypeSpecWithTemplate: return getAsType()->getAsCXXRecordDecl(); } @@ -222,9 +217,13 @@ NestedNameSpecifierDependence NestedNameSpecifier::getDependence() const { return NestedNameSpecifierDependence::None; } - case TypeSpec: - case TypeSpecWithTemplate: - return toNestedNameSpecifierDependendence(getAsType()->getDependence()); + case TypeSpec: { + NestedNameSpecifierDependence Dep = + toNestedNameSpecifierDependendence(getAsType()->getDependence()); + if (NestedNameSpecifier *Prefix = getPrefix()) + Dep |= Prefix->getDependence(); + return Dep; + } } llvm_unreachable("Invalid NNS Kind!"); } @@ -254,17 +253,17 @@ NestedNameSpecifier::translateToType(const ASTContext &Context) const { .getDependentNameType(ElaboratedTypeKeyword::None, Prefix, getAsIdentifier()) .getTypePtr(); - case SpecifierKind::TypeSpec: - case SpecifierKind::TypeSpecWithTemplate: { + case SpecifierKind::TypeSpec: { const Type *T = getAsType(); switch (T->getTypeClass()) { case Type::DependentTemplateSpecialization: { const auto *DT = cast(T); - // FIXME: The type node can't represent the template keyword. 
+ const DependentTemplateStorage &DTN = DT->getDependentTemplateName(); return Context - .getDependentTemplateSpecializationType(ElaboratedTypeKeyword::None, - Prefix, DT->getIdentifier(), - DT->template_arguments()) + .getDependentTemplateSpecializationType( + ElaboratedTypeKeyword::None, + {Prefix, DTN.getName(), DTN.hasTemplateKeyword()}, + DT->template_arguments()) .getTypePtr(); } case Type::Record: @@ -324,59 +323,11 @@ void NestedNameSpecifier::print(raw_ostream &OS, const PrintingPolicy &Policy, OS << "__super"; break; - case TypeSpecWithTemplate: - OS << "template "; - // Fall through to print the type. - [[fallthrough]]; - case TypeSpec: { - const auto *Record = - dyn_cast_or_null(getAsRecordDecl()); - if (ResolveTemplateArguments && Record) { - // Print the type trait with resolved template parameters. - Record->printName(OS, Policy); - printTemplateArgumentList( - OS, Record->getTemplateArgs().asArray(), Policy, - Record->getSpecializedTemplate()->getTemplateParameters()); - break; - } - const Type *T = getAsType(); - PrintingPolicy InnerPolicy(Policy); InnerPolicy.SuppressScope = true; InnerPolicy.SuppressTagKeyword = true; - - // Nested-name-specifiers are intended to contain minimally-qualified - // types. An actual ElaboratedType will not occur, since we'll store - // just the type that is referred to in the nested-name-specifier (e.g., - // a TypedefType, TagType, etc.). However, when we are dealing with - // dependent template-id types (e.g., Outer::template Inner), - // the type requires its own nested-name-specifier for uniqueness, so we - // suppress that nested-name-specifier during printing. - assert(!isa(T) && - "Elaborated type in nested-name-specifier"); - if (const TemplateSpecializationType *SpecType - = dyn_cast(T)) { - // Print the template name without its corresponding - // nested-name-specifier. - SpecType->getTemplateName().print(OS, InnerPolicy, - TemplateName::Qualified::None); - - // Print the template argument list. - printTemplateArgumentList(OS, SpecType->template_arguments(), - InnerPolicy); - } else if (const auto *DepSpecType = - dyn_cast(T)) { - // Print the template name without its corresponding - // nested-name-specifier. - OS << DepSpecType->getIdentifier()->getName(); - // Print the template argument list. - printTemplateArgumentList(OS, DepSpecType->template_arguments(), - InnerPolicy); - } else { - // Print the type normally - QualType(T, 0).print(OS, InnerPolicy); - } + QualType(getAsType(), 0).print(OS, InnerPolicy); break; } } @@ -421,7 +372,6 @@ NestedNameSpecifierLoc::getLocalDataLength(NestedNameSpecifier *Qualifier) { Length += sizeof(SourceLocation::UIntTy); break; - case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::TypeSpec: // The "void*" that points at the TypeLoc data. // Note: the 'template' keyword is part of the TypeLoc. @@ -485,7 +435,6 @@ SourceRange NestedNameSpecifierLoc::getLocalSourceRange() const { LoadSourceLocation(Data, Offset), LoadSourceLocation(Data, Offset + sizeof(SourceLocation::UIntTy))); - case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::TypeSpec: { // The "void*" that points at the TypeLoc data. // Note: the 'template' keyword is part of the TypeLoc. 
@@ -500,8 +449,7 @@ SourceRange NestedNameSpecifierLoc::getLocalSourceRange() const { } TypeLoc NestedNameSpecifierLoc::getTypeLoc() const { - if (Qualifier->getKind() != NestedNameSpecifier::TypeSpec && - Qualifier->getKind() != NestedNameSpecifier::TypeSpecWithTemplate) + if (Qualifier->getKind() != NestedNameSpecifier::TypeSpec) return TypeLoc(); // The "void*" that points at the TypeLoc data. @@ -609,13 +557,10 @@ operator=(const NestedNameSpecifierLocBuilder &Other) { return *this; } -void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, - SourceLocation TemplateKWLoc, - TypeLoc TL, +void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, TypeLoc TL, SourceLocation ColonColonLoc) { - Representation = NestedNameSpecifier::Create(Context, Representation, - TemplateKWLoc.isValid(), - TL.getTypePtr()); + Representation = + NestedNameSpecifier::Create(Context, Representation, TL.getTypePtr()); // Push source-location info into the buffer. SavePointer(TL.getOpaqueData(), Buffer, BufferSize, BufferCapacity); @@ -697,8 +642,7 @@ void NestedNameSpecifierLocBuilder::MakeTrivial(ASTContext &Context, SaveSourceLocation(R.getBegin(), Buffer, BufferSize, BufferCapacity); break; - case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: { + case NestedNameSpecifier::TypeSpec: { TypeSourceInfo *TSInfo = Context.getTrivialTypeSourceInfo(QualType(NNS->getAsType(), 0), R.getBegin()); diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index 4c428cce32475..f8446dfbc6859 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -128,7 +128,6 @@ void ODRHash::AddNestedNameSpecifier(const NestedNameSpecifier *NNS) { AddDecl(NNS->getAsNamespaceAlias()); break; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: AddType(NNS->getAsType()); break; case NestedNameSpecifier::Global: @@ -137,6 +136,16 @@ void ODRHash::AddNestedNameSpecifier(const NestedNameSpecifier *NNS) { } } +void ODRHash::AddDependentTemplateName(const DependentTemplateStorage &Name) { + if (NestedNameSpecifier *NNS = Name.getQualifier()) + AddNestedNameSpecifier(NNS); + if (IdentifierOrOverloadedOperator IO = Name.getName(); + const IdentifierInfo *II = IO.getIdentifier()) + AddIdentifierInfo(II); + else + ID.AddInteger(IO.getOperator()); +} + void ODRHash::AddTemplateName(TemplateName Name) { auto Kind = Name.getKind(); ID.AddInteger(Kind); @@ -153,10 +162,13 @@ void ODRHash::AddTemplateName(TemplateName Name) { AddTemplateName(QTN->getUnderlyingTemplate()); break; } + case TemplateName::DependentTemplate: { + AddDependentTemplateName(*Name.getAsDependentTemplateName()); + break; + } // TODO: Support these cases. 
case TemplateName::OverloadedTemplate: case TemplateName::AssumedTemplate: - case TemplateName::DependentTemplate: case TemplateName::SubstTemplateTemplateParm: case TemplateName::SubstTemplateTemplateParmPack: case TemplateName::UsingTemplate: @@ -1221,8 +1233,7 @@ class ODRTypeVisitor : public TypeVisitor { void VisitDependentTemplateSpecializationType( const DependentTemplateSpecializationType *T) { - AddIdentifierInfo(T->getIdentifier()); - AddNestedNameSpecifier(T->getQualifier()); + Hash.AddDependentTemplateName(T->getDependentTemplateName()); ID.AddInteger(T->template_arguments().size()); for (const auto &TA : T->template_arguments()) { Hash.AddTemplateArgument(TA); diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp index 3c814b777f8ab..d8ab1092d3ea4 100644 --- a/clang/lib/AST/QualTypeNames.cpp +++ b/clang/lib/AST/QualTypeNames.cpp @@ -212,6 +212,7 @@ static NestedNameSpecifier *getFullyQualifiedNestedNameSpecifier( bool WithGlobalNsPrefix) { switch (Scope->getKind()) { case NestedNameSpecifier::Global: + case NestedNameSpecifier::Super: // Already fully qualified return Scope; case NestedNameSpecifier::Namespace: @@ -232,9 +233,7 @@ static NestedNameSpecifier *getFullyQualifiedNestedNameSpecifier( // but use the name of it's prefix. return getFullyQualifiedNestedNameSpecifier( Ctx, Scope->getPrefix(), WithGlobalNsPrefix); - case NestedNameSpecifier::Super: - case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: { + case NestedNameSpecifier::TypeSpec: { const Type *Type = Scope->getAsType(); // Find decl context. const TagDecl *TD = nullptr; @@ -366,8 +365,7 @@ NestedNameSpecifier *createNestedNameSpecifier(const ASTContext &Ctx, } return NestedNameSpecifier::Create( - Ctx, createOuterNNS(Ctx, TD, FullyQualify, WithGlobalNsPrefix), - false /*No TemplateKeyword*/, TypePtr); + Ctx, createOuterNNS(Ctx, TD, FullyQualify, WithGlobalNsPrefix), TypePtr); } /// Return the fully qualified type, including fully-qualified diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp index 9e0a7dc2b8cdc..031b58123fc99 100644 --- a/clang/lib/AST/TemplateName.cpp +++ b/clang/lib/AST/TemplateName.cpp @@ -122,6 +122,31 @@ void SubstTemplateTemplateParmPackStorage::Profile( ID.AddBoolean(Final); } +IdentifierOrOverloadedOperator::IdentifierOrOverloadedOperator( + const IdentifierInfo *II) + : PtrOrOp(reinterpret_cast(II)) { + static_assert(NUM_OVERLOADED_OPERATORS <= 4096, + "NUM_OVERLOADED_OPERATORS is too large"); + assert(II); + assert(getIdentifier() == II); +} +IdentifierOrOverloadedOperator::IdentifierOrOverloadedOperator( + OverloadedOperatorKind OOK) + : PtrOrOp(-uintptr_t(OOK)) { + assert(OOK != OO_None); + assert(getOperator() == OOK); +} + +void IdentifierOrOverloadedOperator::Profile(llvm::FoldingSetNodeID &ID) const { + if (auto *Identifier = getIdentifier()) { + ID.AddBoolean(false); + ID.AddPointer(Identifier); + } else { + ID.AddBoolean(true); + ID.AddInteger(getOperator()); + } +} + TemplateName::TemplateName(void *Ptr) { Storage = StorageType::getFromOpaqueValue(Ptr); } @@ -275,6 +300,36 @@ UsingShadowDecl *TemplateName::getAsUsingShadowDecl() const { return nullptr; } +DependentTemplateStorage::DependentTemplateStorage( + NestedNameSpecifier *Qualifier, IdentifierOrOverloadedOperator Name, + bool HasTemplateKeyword) + : Qualifier(Qualifier, HasTemplateKeyword), Name(Name) { + assert((!Qualifier || Qualifier->isDependent()) && + "Qualifier must be dependent"); +} + +TemplateNameDependence 
DependentTemplateStorage::getDependence() const { + auto D = TemplateNameDependence::DependentInstantiation; + if (NestedNameSpecifier *Qualifier = getQualifier()) + D |= toTemplateNameDependence(Qualifier->getDependence()); + return D; +} + +void DependentTemplateStorage::print(raw_ostream &OS, + const PrintingPolicy &Policy) const { + if (NestedNameSpecifier *NNS = getQualifier()) + NNS->print(OS, Policy); + + if (hasTemplateKeyword()) + OS << "template "; + + IdentifierOrOverloadedOperator Name = getName(); + if (const IdentifierInfo *II = Name.getIdentifier()) + OS << II->getName(); + else + OS << "operator " << getOperatorSpelling(Name.getOperator()); +} + DeducedTemplateStorage *TemplateName::getAsDeducedTemplateName() const { if (UncommonTemplateNameStorage *Uncommon = dyn_cast_if_present(Storage)) @@ -313,7 +368,8 @@ TemplateNameDependence TemplateName::getDependence() const { case NameKind::DependentTemplate: { DependentTemplateName *S = getAsDependentTemplateName(); auto D = TemplateNameDependence::DependentInstantiation; - D |= toTemplateNameDependence(S->getQualifier()->getDependence()); + if (NestedNameSpecifier *Qualifier = S->getQualifier()) + D |= toTemplateNameDependence(Qualifier->getDependence()); return D; } case NameKind::SubstTemplateTemplateParm: { @@ -401,14 +457,7 @@ void TemplateName::print(raw_ostream &OS, const PrintingPolicy &Policy, else OS << *UTD; } else if (DependentTemplateName *DTN = getAsDependentTemplateName()) { - if (NestedNameSpecifier *NNS = DTN->getQualifier()) - NNS->print(OS, Policy); - OS << "template "; - - if (DTN->isIdentifier()) - OS << DTN->getIdentifier()->getName(); - else - OS << "operator " << getOperatorSpelling(DTN->getOperator()); + DTN->print(OS, Policy); } else if (SubstTemplateTemplateParmStorage *subst = getAsSubstTemplateTemplateParm()) { subst->getReplacement().print(OS, Policy, Qual); diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index f18cf703bdaa6..1fe6f2c722acf 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1027,10 +1027,6 @@ void clang::TextNodeDumper::dumpNestedNameSpecifier(const NestedNameSpecifier *N OS << " TypeSpec"; dumpType(QualType(NNS->getAsType(), 0)); break; - case NestedNameSpecifier::TypeSpecWithTemplate: - OS << " TypeSpecWithTemplate"; - dumpType(QualType(NNS->getAsType(), 0)); - break; case NestedNameSpecifier::Global: OS << " Global"; break; diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 08798219c0b83..9fda02b430e48 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3270,16 +3270,13 @@ StringRef TypeWithKeyword::getKeywordName(ElaboratedTypeKeyword Keyword) { } DependentTemplateSpecializationType::DependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, - const IdentifierInfo *Name, ArrayRef Args, QualType Canon) + ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, + ArrayRef Args, QualType Canon) : TypeWithKeyword(Keyword, DependentTemplateSpecialization, Canon, - TypeDependence::DependentInstantiation | - (NNS ? 
toTypeDependence(NNS->getDependence()) - : TypeDependence::None)), - NNS(NNS), Name(Name) { + + toTypeDependence(Name.getDependence())), + Name(Name) { DependentTemplateSpecializationTypeBits.NumArgs = Args.size(); - assert((!NNS || NNS->isDependent()) && - "DependentTemplateSpecializatonType requires dependent qualifier"); auto *ArgBuffer = const_cast(template_arguments().data()); for (const TemplateArgument &Arg : Args) { addDependence(toTypeDependence(Arg.getDependence() & @@ -3289,16 +3286,12 @@ DependentTemplateSpecializationType::DependentTemplateSpecializationType( } } -void -DependentTemplateSpecializationType::Profile(llvm::FoldingSetNodeID &ID, - const ASTContext &Context, - ElaboratedTypeKeyword Keyword, - NestedNameSpecifier *Qualifier, - const IdentifierInfo *Name, - ArrayRef Args) { +void DependentTemplateSpecializationType::Profile( + llvm::FoldingSetNodeID &ID, const ASTContext &Context, + ElaboratedTypeKeyword Keyword, const DependentTemplateStorage &Name, + ArrayRef Args) { ID.AddInteger(llvm::to_underlying(Keyword)); - ID.AddPointer(Qualifier); - ID.AddPointer(Name); + Name.Profile(ID); for (const TemplateArgument &Arg : Args) Arg.Profile(ID, Context); } diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp index fbb7fc5cd7690..24726901b8f55 100644 --- a/clang/lib/AST/TypeLoc.cpp +++ b/clang/lib/AST/TypeLoc.cpp @@ -569,9 +569,10 @@ void DependentTemplateSpecializationTypeLoc::initializeLocal(ASTContext &Context, SourceLocation Loc) { setElaboratedKeywordLoc(Loc); - if (getTypePtr()->getQualifier()) { + if (NestedNameSpecifier *Qualifier = + getTypePtr()->getDependentTemplateName().getQualifier()) { NestedNameSpecifierLocBuilder Builder; - Builder.MakeTrivial(Context, getTypePtr()->getQualifier(), Loc); + Builder.MakeTrivial(Context, Qualifier, Loc); setQualifierLoc(Builder.getWithLocInContext(Context)); } else { setQualifierLoc(NestedNameSpecifierLoc()); diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 3982ca3b50604..4ec252e3f89b5 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -1793,9 +1793,7 @@ void TypePrinter::printDependentTemplateSpecializationBefore( if (T->getKeyword() != ElaboratedTypeKeyword::None) OS << " "; - if (T->getQualifier()) - T->getQualifier()->print(OS, Policy); - OS << "template " << T->getIdentifier()->getName(); + T->getDependentTemplateName().print(OS, Policy); printTemplateArgumentList(OS, T->template_arguments(), Policy); spaceBeforePlaceHolder(OS); } @@ -2498,14 +2496,18 @@ void clang::printTemplateArgumentList(raw_ostream &OS, ArrayRef Args, const PrintingPolicy &Policy, const TemplateParameterList *TPL) { - printTo(OS, Args, Policy, TPL, /*isPack*/ false, /*parmIndex*/ 0); + PrintingPolicy InnerPolicy = Policy; + InnerPolicy.SuppressScope = false; + printTo(OS, Args, InnerPolicy, TPL, /*isPack*/ false, /*parmIndex*/ 0); } void clang::printTemplateArgumentList(raw_ostream &OS, ArrayRef Args, const PrintingPolicy &Policy, const TemplateParameterList *TPL) { - printTo(OS, Args, Policy, TPL, /*isPack*/ false, /*parmIndex*/ 0); + PrintingPolicy InnerPolicy = Policy; + InnerPolicy.SuppressScope = false; + printTo(OS, Args, InnerPolicy, TPL, /*isPack*/ false, /*parmIndex*/ 0); } std::string Qualifiers::getAsString() const { diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 480e33f607bb0..d7eebcbc3c2f9 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ 
b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -249,13 +249,6 @@ DeclarationFragmentsBuilder::getFragmentsForNNS(const NestedNameSpecifier *NNS, Fragments.append("__super", DeclarationFragments::FragmentKind::Keyword); break; - case NestedNameSpecifier::TypeSpecWithTemplate: - // A type prefixed by the `template` keyword. - Fragments.append("template", DeclarationFragments::FragmentKind::Keyword); - Fragments.appendSpace(); - // Fallthrough after adding the keyword to handle the actual type. - [[fallthrough]]; - case NestedNameSpecifier::TypeSpec: { const Type *T = NNS->getAsType(); // FIXME: Handle C++ template specialization type diff --git a/clang/lib/Index/IndexTypeSourceInfo.cpp b/clang/lib/Index/IndexTypeSourceInfo.cpp index d5d0a3c422871..98b5513128fbe 100644 --- a/clang/lib/Index/IndexTypeSourceInfo.cpp +++ b/clang/lib/Index/IndexTypeSourceInfo.cpp @@ -277,7 +277,6 @@ void IndexingContext::indexNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS, break; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: indexTypeLoc(NNS.getTypeLoc(), Parent, DC); break; } diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 26be78ee8ca15..941e681247de1 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -587,12 +587,12 @@ bool Parser::ParseOptionalCXXScopeSpecifier( << II.getName() << FixItHint::CreateInsertion(Tok.getLocation(), "template "); } - - SourceLocation TemplateNameLoc = ConsumeToken(); + ConsumeToken(); TemplateNameKind TNK = Actions.ActOnTemplateName( - getCurScope(), SS, TemplateNameLoc, TemplateName, ObjectType, - EnteringContext, Template, /*AllowInjectedClassName*/ true); + getCurScope(), SS, /*TemplateKWLoc=*/SourceLocation(), TemplateName, + ObjectType, EnteringContext, Template, + /*AllowInjectedClassName=*/true); if (AnnotateTemplateIdToken(Template, TNK, SS, SourceLocation(), TemplateName, false)) return true; diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp index 95e14ca0fa3b7..ee5a862c32509 100644 --- a/clang/lib/Sema/DeclSpec.cpp +++ b/clang/lib/Sema/DeclSpec.cpp @@ -48,9 +48,9 @@ void UnqualifiedId::setConstructorTemplateId(TemplateIdAnnotation *TemplateId) { EndLocation = TemplateId->RAngleLoc; } -void CXXScopeSpec::Extend(ASTContext &Context, SourceLocation TemplateKWLoc, - TypeLoc TL, SourceLocation ColonColonLoc) { - Builder.Extend(Context, TemplateKWLoc, TL, ColonColonLoc); +void CXXScopeSpec::Extend(ASTContext &Context, TypeLoc TL, + SourceLocation ColonColonLoc) { + Builder.Extend(Context, TL, ColonColonLoc); if (Range.getBegin().isInvalid()) Range.setBegin(TL.getBeginLoc()); Range.setEnd(ColonColonLoc); diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp index 4544d75ea73c4..f6ee000a58f4b 100644 --- a/clang/lib/Sema/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -365,9 +365,10 @@ HeuristicResolverImpl::resolveDependentNameType(const DependentNameType *DNT) { std::vector HeuristicResolverImpl::resolveTemplateSpecializationType( const DependentTemplateSpecializationType *DTST) { + const DependentTemplateStorage &DTN = DTST->getDependentTemplateName(); return resolveDependentMember( - resolveNestedNameSpecifierToType(DTST->getQualifier()), - DTST->getIdentifier(), TemplateFilter); + resolveNestedNameSpecifierToType(DTN.getQualifier()), + DTN.getName().getIdentifier(), TemplateFilter); } std::vector @@ -409,7 +410,6 @@ QualType HeuristicResolverImpl::resolveNestedNameSpecifierToType( 
// the TypeSpec cases too. switch (NNS->getKind()) { case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: return QualType(NNS->getAsType(), 0); case NestedNameSpecifier::Identifier: { return resolveDeclsToType( diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index f04f7f9929442..545da5c295832 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -145,8 +145,7 @@ DeclContext *Sema::computeDeclContext(const CXXScopeSpec &SS, case NestedNameSpecifier::NamespaceAlias: return NNS->getAsNamespaceAlias()->getNamespace(); - case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: { + case NestedNameSpecifier::TypeSpec: { const TagType *Tag = NNS->getAsType()->getAs(); assert(Tag && "Non-tag type in nested-name-specifier"); return Tag->getDecl(); @@ -687,8 +686,7 @@ bool Sema::BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo, llvm_unreachable("Unhandled TypeDecl node in nested-name-specifier"); } - SS.Extend(Context, SourceLocation(), TLB.getTypeLocInContext(Context, T), - IdInfo.CCLoc); + SS.Extend(Context, TLB.getTypeLocInContext(Context, T), IdInfo.CCLoc); return false; } @@ -735,8 +733,8 @@ bool Sema::BuildCXXNestedNameSpecifier(Scope *S, NestedNameSpecInfo &IdInfo, QualType T = Context.getTypeDeclType(ContainingClass); TypeLocBuilder TLB; TLB.pushTrivial(Context, T, IdInfo.IdentifierLoc); - SS.Extend(Context, /*TemplateKWLoc=*/SourceLocation(), - TLB.getTypeLocInContext(Context, T), IdInfo.IdentifierLoc); + SS.Extend(Context, TLB.getTypeLocInContext(Context, T), + IdInfo.IdentifierLoc); // Add the identifier to form a dependent name. SS.Extend(Context, IdInfo.Identifier, IdInfo.IdentifierLoc, IdInfo.CCLoc); @@ -804,8 +802,7 @@ bool Sema::ActOnCXXNestedNameSpecifierDecltype(CXXScopeSpec &SS, DecltypeTypeLoc DecltypeTL = TLB.push(T); DecltypeTL.setDecltypeLoc(DS.getTypeSpecTypeLoc()); DecltypeTL.setRParenLoc(DS.getTypeofParensRange().getEnd()); - SS.Extend(Context, SourceLocation(), TLB.getTypeLocInContext(Context, T), - ColonColonLoc); + SS.Extend(Context, TLB.getTypeLocInContext(Context, T), ColonColonLoc); return false; } @@ -827,8 +824,7 @@ bool Sema::ActOnCXXNestedNameSpecifierIndexedPack(CXXScopeSpec &SS, DS.getBeginLoc()); PackIndexingTypeLoc PIT = TLB.push(Type); PIT.setEllipsisLoc(DS.getEllipsisLoc()); - SS.Extend(Context, SourceLocation(), TLB.getTypeLocInContext(Context, Type), - ColonColonLoc); + SS.Extend(Context, TLB.getTypeLocInContext(Context, Type), ColonColonLoc); return false; } @@ -862,12 +858,14 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, translateTemplateArguments(TemplateArgsIn, TemplateArgs); DependentTemplateName *DTN = Template.getAsDependentTemplateName(); - if (DTN && DTN->isIdentifier()) { + if (DTN && DTN->getName().getIdentifier()) { // Handle a dependent template specialization for which we cannot resolve // the template name. assert(DTN->getQualifier() == SS.getScopeRep()); QualType T = Context.getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::None, DTN->getQualifier(), DTN->getIdentifier(), + ElaboratedTypeKeyword::None, + {/*Qualifier=*/nullptr, DTN->getName().getIdentifier(), + TemplateKWLoc.isValid()}, TemplateArgs.arguments()); // Create source-location information for this type. 
@@ -875,7 +873,6 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, DependentTemplateSpecializationTypeLoc SpecTL = Builder.push(T); SpecTL.setElaboratedKeywordLoc(SourceLocation()); - SpecTL.setQualifierLoc(SS.getWithLocInContext(Context)); SpecTL.setTemplateKeywordLoc(TemplateKWLoc); SpecTL.setTemplateNameLoc(TemplateNameLoc); SpecTL.setLAngleLoc(LAngleLoc); @@ -883,8 +880,7 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I) SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo()); - SS.Extend(Context, TemplateKWLoc, Builder.getTypeLocInContext(Context, T), - CCLoc); + SS.Extend(Context, Builder.getTypeLocInContext(Context, T), CCLoc); return false; } @@ -932,9 +928,7 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, for (unsigned I = 0, N = TemplateArgs.size(); I != N; ++I) SpecTL.setArgLocInfo(I, TemplateArgs[I].getLocInfo()); - - SS.Extend(Context, TemplateKWLoc, Builder.getTypeLocInContext(Context, T), - CCLoc); + SS.Extend(Context, Builder.getTypeLocInContext(Context, T), CCLoc); return false; } @@ -1007,7 +1001,6 @@ bool Sema::ShouldEnterDeclaratorScope(Scope *S, const CXXScopeSpec &SS) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::Super: // These are never namespace scopes. return true; diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 2003701b65654..54cafc2010f09 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -755,7 +755,7 @@ getRequiredQualification(ASTContext &Context, const DeclContext *CurContext, Result = NestedNameSpecifier::Create(Context, Result, Namespace); } else if (const auto *TD = dyn_cast(Parent)) Result = NestedNameSpecifier::Create( - Context, Result, false, Context.getTypeDeclType(TD).getTypePtr()); + Context, Result, Context.getTypeDeclType(TD).getTypePtr()); } return Result; } @@ -1216,7 +1216,7 @@ void ResultBuilder::MaybeAddResult(Result R, DeclContext *CurContext) { NestedNameSpecifier::Create(SemaRef.Context, nullptr, Namespace); else if (const TagDecl *Tag = dyn_cast(Ctx)) R.Qualifier = NestedNameSpecifier::Create( - SemaRef.Context, nullptr, false, + SemaRef.Context, nullptr, SemaRef.Context.getTypeDeclType(Tag).getTypePtr()); else R.QualifierIsInformative = false; @@ -1405,7 +1405,7 @@ void ResultBuilder::AddResult(Result R, DeclContext *CurContext, NestedNameSpecifier::Create(SemaRef.Context, nullptr, Namespace); else if (const auto *Tag = dyn_cast(Ctx)) R.Qualifier = NestedNameSpecifier::Create( - SemaRef.Context, nullptr, false, + SemaRef.Context, nullptr, SemaRef.Context.getTypeDeclType(Tag).getTypePtr()); else R.QualifierIsInformative = false; diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index 53536b0d14037..6f873cafa98fd 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -116,8 +116,7 @@ static QualType lookupPromiseType(Sema &S, const FunctionDecl *FD, auto buildElaboratedType = [&]() { auto *NNS = NestedNameSpecifier::Create(S.Context, nullptr, S.getStdNamespace()); - NNS = NestedNameSpecifier::Create(S.Context, NNS, false, - CoroTrait.getTypePtr()); + NNS = NestedNameSpecifier::Create(S.Context, NNS, CoroTrait.getTypePtr()); return S.Context.getElaboratedType(ElaboratedTypeKeyword::None, NNS, PromiseType); }; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5527ed5419fc8..2246f0f1b3121 100644 
--- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -252,8 +252,8 @@ static ParsedType recoverFromTypeInKnownDependentBase(Sema &S, S.Diag(NameLoc, diag::ext_found_in_dependent_base) << &II; ASTContext &Context = S.Context; - auto *NNS = NestedNameSpecifier::Create(Context, nullptr, false, - cast(Context.getRecordType(RD))); + auto *NNS = NestedNameSpecifier::Create( + Context, nullptr, cast(Context.getRecordType(RD))); QualType T = Context.getDependentNameType(ElaboratedTypeKeyword::Typename, NNS, &II); @@ -580,10 +580,10 @@ synthesizeCurrentNestedNameSpecifier(ASTContext &Context, DeclContext *DC) { auto *ND = dyn_cast(DC); if (ND && !ND->isInline() && !ND->isAnonymousNamespace()) return NestedNameSpecifier::Create(Context, nullptr, ND); - else if (auto *RD = dyn_cast(DC)) - return NestedNameSpecifier::Create(Context, nullptr, RD->isTemplateDecl(), + if (auto *RD = dyn_cast(DC)) + return NestedNameSpecifier::Create(Context, nullptr, RD->getTypeForDecl()); - else if (isa(DC)) + if (isa(DC)) return NestedNameSpecifier::GlobalSpecifier(Context); } llvm_unreachable("something isn't in TU scope?"); @@ -624,8 +624,7 @@ ParsedType Sema::ActOnMSVCUnknownTypeName(const IdentifierInfo &II, findRecordWithDependentBasesOfEnclosingMethod(CurContext)) { // Build a DependentNameType that will perform lookup into RD at // instantiation time. - NNS = NestedNameSpecifier::Create(Context, nullptr, RD->isTemplateDecl(), - RD->getTypeForDecl()); + NNS = NestedNameSpecifier::Create(Context, nullptr, RD->getTypeForDecl()); // Diagnose that this identifier was undeclared, and retry the lookup during // template instantiation. @@ -6243,11 +6242,12 @@ bool Sema::diagnoseQualifiedDeclaration(CXXScopeSpec &SS, DeclContext *DC, NestedNameSpecifierLoc SpecLoc(SS.getScopeRep(), SS.location_data()); do { - if (SpecLoc.getNestedNameSpecifier()->getKind() == - NestedNameSpecifier::TypeSpecWithTemplate) - Diag(Loc, diag::ext_template_after_declarative_nns) - << FixItHint::CreateRemoval( - SpecLoc.getTypeLoc().getTemplateKeywordLoc()); + if (TypeLoc TL = SpecLoc.getTypeLoc()) { + if (SourceLocation TemplateKeywordLoc = TL.getTemplateKeywordLoc(); + TemplateKeywordLoc.isValid()) + Diag(Loc, diag::ext_template_after_declarative_nns) + << FixItHint::CreateRemoval(TemplateKeywordLoc); + } if (const Type *T = SpecLoc.getNestedNameSpecifier()->getAsType()) { if (const auto *TST = T->getAsAdjusted()) { diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index d724e183b69bd..43bf9b7cd0f95 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -14799,8 +14799,7 @@ buildSingleCopyAssignRecursively(Sema &S, SourceLocation Loc, QualType T, CXXScopeSpec SS; const Type *CanonicalT = S.Context.getCanonicalType(T.getTypePtr()); SS.MakeTrivial(S.Context, - NestedNameSpecifier::Create(S.Context, nullptr, false, - CanonicalT), + NestedNameSpecifier::Create(S.Context, nullptr, CanonicalT), Loc); // Create the reference to operator=. diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 1c0ef39878d7f..7cc8374e69d73 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2684,7 +2684,7 @@ recoverFromMSUnqualifiedLookup(Sema &S, ASTContext &Context, // perform name lookup during template instantiation. 
CXXScopeSpec SS; auto *NNS = - NestedNameSpecifier::Create(Context, nullptr, true, RD->getTypeForDecl()); + NestedNameSpecifier::Create(Context, nullptr, RD->getTypeForDecl()); SS.MakeTrivial(Context, NNS, SourceRange(Loc, Loc)); return DependentScopeDeclRefExpr::Create( Context, SS.getWithLocInContext(Context), TemplateKWLoc, NameInfo, diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 46895db4a0756..19fd51134d160 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -508,7 +508,6 @@ bool Sema::checkLiteralOperatorId(const CXXScopeSpec &SS, switch (SS.getScopeRep()->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: // Per C++11 [over.literal]p2, literal operators can only be declared at // namespace scope. Therefore, this unqualified-id cannot name anything. // Reject it early, because we have no AST representation for this in the diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 876340010cf5f..a77ca779a9ee3 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -4530,7 +4530,6 @@ static void getNestedNameSpecifierIdentifiers( II = NNS->getAsNamespaceAlias()->getIdentifier(); break; - case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::TypeSpec: II = QualType(NNS->getAsType(), 0).getBaseTypeIdentifier(); break; @@ -4895,8 +4894,7 @@ TypoCorrectionConsumer::NamespaceSpecifierSet::buildNestedNameSpecifier( NNS = NestedNameSpecifier::Create(Context, NNS, ND); ++NumSpecifiers; } else if (auto *RD = dyn_cast_or_null(C)) { - NNS = NestedNameSpecifier::Create(Context, NNS, RD->isTemplateDecl(), - RD->getTypeForDecl()); + NNS = NestedNameSpecifier::Create(Context, NNS, RD->getTypeForDecl()); ++NumSpecifiers; } } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index be81b6a46b2c0..de2b1fdbc44e2 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -364,8 +364,8 @@ bool Sema::DiagnoseUnknownTemplateName(const IdentifierInfo &II, // The code is missing a 'template' keyword prior to the dependent template // name. NestedNameSpecifier *Qualifier = (NestedNameSpecifier *)SS->getScopeRep(); - SuggestedTemplate - = TemplateTy::make(Context.getDependentTemplateName(Qualifier, &II)); + SuggestedTemplate = TemplateTy::make(Context.getDependentTemplateName( + {Qualifier, &II, /*HasTemplateKeyword=*/false})); Diag(IILoc, diag::err_template_kw_missing) << SuggestedTemplate.get() << FixItHint::CreateInsertion(IILoc, "template "); @@ -2777,7 +2777,8 @@ TemplateParameterList *Sema::MatchTemplateParametersToScopeSpecifier( // Look one step prior in a dependent template specialization type. if (const DependentTemplateSpecializationType *DependentTST = T->getAs()) { - if (NestedNameSpecifier *NNS = DependentTST->getQualifier()) + if (NestedNameSpecifier *NNS = + DependentTST->getDependentTemplateName().getQualifier()) T = QualType(NNS->getAsType(), 0); else T = QualType(); @@ -3480,16 +3481,17 @@ Sema::findFailedBooleanCondition(Expr *Cond) { QualType Sema::CheckTemplateIdType(TemplateName Name, SourceLocation TemplateLoc, TemplateArgumentListInfo &TemplateArgs) { - DependentTemplateName *DTN = - Name.getUnderlying().getAsDependentTemplateName(); - if (DTN && DTN->isIdentifier()) + // FIXME: 'getUnderlying' loses SubstTemplateTemplateParm nodes from alias + // template substitutions. 
+ if (DependentTemplateName *DTN = + Name.getUnderlying().getAsDependentTemplateName(); + DTN && DTN->getName().getIdentifier()) // When building a template-id where the template-name is dependent, // assume the template is a type template. Either our assumption is // correct, or the code is ill-formed and will be diagnosed when the // dependent name is substituted. return Context.getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::None, DTN->getQualifier(), DTN->getIdentifier(), - TemplateArgs.arguments()); + ElaboratedTypeKeyword::None, *DTN, TemplateArgs.arguments()); if (Name.getAsAssumedTemplateName() && resolveAssumedTemplateNameAsType(/*Scope=*/nullptr, Name, TemplateLoc)) @@ -3824,8 +3826,7 @@ TypeResult Sema::ActOnTemplateIdType( if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) { assert(SS.getScopeRep() == DTN->getQualifier()); QualType T = Context.getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::None, DTN->getQualifier(), DTN->getIdentifier(), - TemplateArgs.arguments()); + ElaboratedTypeKeyword::None, *DTN, TemplateArgs.arguments()); // Build type-source information. TypeLocBuilder TLB; DependentTemplateSpecializationTypeLoc SpecTL @@ -3894,8 +3895,7 @@ TypeResult Sema::ActOnTagTemplateIdType(TagUseKind TUK, if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) { assert(SS.getScopeRep() == DTN->getQualifier()); QualType T = Context.getDependentTemplateSpecializationType( - Keyword, DTN->getQualifier(), DTN->getIdentifier(), - TemplateArgs.arguments()); + Keyword, *DTN, TemplateArgs.arguments()); // Build type-source information. TypeLocBuilder TLB; @@ -4812,13 +4812,14 @@ TemplateNameKind Sema::ActOnTemplateName(Scope *S, switch (Name.getKind()) { case UnqualifiedIdKind::IK_Identifier: - Result = TemplateTy::make( - Context.getDependentTemplateName(Qualifier, Name.Identifier)); + Result = TemplateTy::make(Context.getDependentTemplateName( + {Qualifier, Name.Identifier, TemplateKWLoc.isValid()})); return TNK_Dependent_template_name; case UnqualifiedIdKind::IK_OperatorFunctionId: Result = TemplateTy::make(Context.getDependentTemplateName( - Qualifier, Name.OperatorFunctionId.Operator)); + {Qualifier, Name.OperatorFunctionId.Operator, + TemplateKWLoc.isValid()})); return TNK_Function_template; case UnqualifiedIdKind::IK_LiteralOperatorId: @@ -5332,7 +5333,7 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param, TemplateArgumentLoc &ArgLoc, // know that we need a non-type template argument, convert this // template name into an expression. 
- DeclarationNameInfo NameInfo(DTN->getIdentifier(), + DeclarationNameInfo NameInfo(DTN->getName().getIdentifier(), ArgLoc.getTemplateNameLoc()); CXXScopeSpec SS; @@ -6071,8 +6072,9 @@ bool UnnamedLocalNoLinkageFinder::VisitDependentNameType( bool UnnamedLocalNoLinkageFinder::VisitDependentTemplateSpecializationType( const DependentTemplateSpecializationType* T) { - if (auto *Q = T->getQualifier()) + if (auto *Q = T->getDependentTemplateName().getQualifier()) return VisitNestedNameSpecifier(Q); + return false; } @@ -6154,7 +6156,6 @@ bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier( return false; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: return Visit(QualType(NNS->getAsType(), 0)); } llvm_unreachable("Invalid NestedNameSpecifier::Kind!"); @@ -7526,9 +7527,8 @@ ExprResult Sema::BuildExpressionFromDeclTemplateArgument( isa(VD))); QualType ClassType = Context.getTypeDeclType(cast(VD->getDeclContext())); - NestedNameSpecifier *Qualifier - = NestedNameSpecifier::Create(Context, nullptr, false, - ClassType.getTypePtr()); + NestedNameSpecifier *Qualifier = + NestedNameSpecifier::Create(Context, nullptr, ClassType.getTypePtr()); SS.MakeTrivial(Context, Qualifier, Loc); } @@ -10694,15 +10694,14 @@ Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, assert(DTN && "dependent template has non-dependent name?"); assert(DTN->getQualifier() == SS.getScopeRep()); - if (!DTN->isIdentifier()) { + if (!DTN->getName().getIdentifier()) { Diag(TemplateIILoc, diag::err_template_id_not_a_type) << Template; NoteAllFoundTemplates(Template); return true; } QualType T = Context.getDependentTemplateSpecializationType( - ElaboratedTypeKeyword::Typename, DTN->getQualifier(), - DTN->getIdentifier(), TemplateArgs.arguments()); + ElaboratedTypeKeyword::Typename, *DTN, TemplateArgs.arguments()); // Create source-location information for this type. TypeLocBuilder Builder; diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 740a7a1513975..b39eb8fd5512e 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -7031,7 +7031,8 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T, const DependentTemplateSpecializationType *Spec = cast(T); - MarkUsedTemplateParameters(Ctx, Spec->getQualifier(), + MarkUsedTemplateParameters(Ctx, + Spec->getDependentTemplateName().getQualifier(), OnlyDeduced, Depth, Used); for (const auto &Arg : Spec->template_arguments()) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 8fdb2cf6dce6c..e455b225d7f49 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -705,7 +705,7 @@ class TreeTransform { QualType TransformDependentTemplateSpecializationType( TypeLocBuilder &TLB, DependentTemplateSpecializationTypeLoc TL, - NestedNameSpecifierLoc QualifierLoc); + CXXScopeSpec &SS); /// Transforms the parameters of a function type into the /// given vectors. @@ -1132,38 +1132,21 @@ class TreeTransform { /// nested-name-specifier and the given type. Subclasses may override /// this routine to provide different behavior. QualType RebuildDependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, - NestedNameSpecifierLoc QualifierLoc, - SourceLocation TemplateKWLoc, - const IdentifierInfo *Name, - SourceLocation NameLoc, - TemplateArgumentListInfo &Args, - bool AllowInjectedClassName) { - // Rebuild the template name. 
- // TODO: avoid TemplateName abstraction - CXXScopeSpec SS; - SS.Adopt(QualifierLoc); - TemplateName InstName = getDerived().RebuildTemplateName( - SS, TemplateKWLoc, *Name, NameLoc, QualType(), nullptr, - AllowInjectedClassName); - - if (InstName.isNull()) - return QualType(); - + ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, + SourceLocation TemplateKWLoc, TemplateName Name, SourceLocation NameLoc, + TemplateArgumentListInfo &Args, bool AllowInjectedClassName) { // If it's still dependent, make a dependent specialization. - if (InstName.getAsDependentTemplateName()) + if (const DependentTemplateStorage *S = Name.getAsDependentTemplateName()) return SemaRef.Context.getDependentTemplateSpecializationType( - Keyword, QualifierLoc.getNestedNameSpecifier(), Name, - Args.arguments()); + Keyword, *S, Args.arguments()); // Otherwise, make an elaborated type wrapping a non-dependent // specialization. QualType T = - getDerived().RebuildTemplateSpecializationType(InstName, NameLoc, Args); + getDerived().RebuildTemplateSpecializationType(Name, NameLoc, Args); if (T.isNull()) return QualType(); - return SemaRef.Context.getElaboratedType( - Keyword, QualifierLoc.getNestedNameSpecifier(), T); + return SemaRef.Context.getElaboratedType(Keyword, NNS, T); } /// Build a new typename type that refers to an identifier. @@ -1332,6 +1315,13 @@ class TreeTransform { SourceLocation NameLoc, QualType ObjectType, bool AllowInjectedClassName); + TemplateName RebuildTemplateName(CXXScopeSpec &SS, + SourceLocation TemplateKWLoc, + IdentifierOrOverloadedOperator IO, + SourceLocation NameLoc, QualType ObjectType, + NamedDecl *FirstQualifierInScope, + bool AllowInjectedClassName); + /// Build a new template name given a template template parameter pack /// and the /// @@ -4634,7 +4624,6 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( break; } - case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::TypeSpec: { TypeLoc TL = TransformTypeInObjectScope(Q.getTypeLoc(), ObjectType, FirstQualifierInScope, SS); @@ -4654,8 +4643,7 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( TL = ETL.getNamedTypeLoc(); } - SS.Extend(SemaRef.Context, TL.getTemplateKeywordLoc(), TL, - Q.getLocalEndLoc()); + SS.Extend(SemaRef.Context, TL, Q.getLocalEndLoc()); break; } // If the nested-name-specifier is an invalid type def, don't emit an @@ -4753,6 +4741,22 @@ ::TransformDeclarationNameInfo(const DeclarationNameInfo &NameInfo) { llvm_unreachable("Unknown name kind."); } +template +TemplateName TreeTransform::RebuildTemplateName( + CXXScopeSpec &SS, SourceLocation TemplateKWLoc, + IdentifierOrOverloadedOperator IO, SourceLocation NameLoc, + QualType ObjectType, NamedDecl *FirstQualifierInScope, + bool AllowInjectedClassName) { + if (const IdentifierInfo *II = IO.getIdentifier()) { + return getDerived().RebuildTemplateName(SS, TemplateKWLoc, *II, NameLoc, + ObjectType, FirstQualifierInScope, + AllowInjectedClassName); + } + return getDerived().RebuildTemplateName(SS, TemplateKWLoc, IO.getOperator(), + NameLoc, ObjectType, + AllowInjectedClassName); +} + template TemplateName TreeTransform::TransformTemplateName(CXXScopeSpec &SS, @@ -4794,20 +4798,9 @@ TreeTransform::TransformTemplateName(CXXScopeSpec &SS, // FIXME: Preserve the location of the "template" keyword. 
SourceLocation TemplateKWLoc = NameLoc; - - if (DTN->isIdentifier()) { - return getDerived().RebuildTemplateName(SS, - TemplateKWLoc, - *DTN->getIdentifier(), - NameLoc, - ObjectType, - FirstQualifierInScope, - AllowInjectedClassName); - } - - return getDerived().RebuildTemplateName(SS, TemplateKWLoc, - DTN->getOperator(), NameLoc, - ObjectType, AllowInjectedClassName); + return getDerived().RebuildTemplateName( + SS, TemplateKWLoc, DTN->getName(), NameLoc, ObjectType, + FirstQualifierInScope, AllowInjectedClassName); } // FIXME: Try to preserve more of the TemplateName. @@ -5401,13 +5394,14 @@ TypeSourceInfo *TreeTransform::TransformTSIInObjectScope( DependentTemplateSpecializationTypeLoc SpecTL = TL.castAs(); - TemplateName Template - = getDerived().RebuildTemplateName(SS, - SpecTL.getTemplateKeywordLoc(), - *SpecTL.getTypePtr()->getIdentifier(), - SpecTL.getTemplateNameLoc(), - ObjectType, UnqualLookup, - /*AllowInjectedClassName*/true); + const IdentifierInfo *II = SpecTL.getTypePtr() + ->getDependentTemplateName() + .getName() + .getIdentifier(); + TemplateName Template = getDerived().RebuildTemplateName( + SS, SpecTL.getTemplateKeywordLoc(), *II, SpecTL.getTemplateNameLoc(), + ObjectType, UnqualLookup, + /*AllowInjectedClassName*/ true); if (Template.isNull()) return nullptr; @@ -7430,9 +7424,9 @@ QualType TreeTransform::TransformDependentTemplateSpecializationType( // FIXME: maybe don't rebuild if all the template arguments are the same. if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) { + assert(DTN->getQualifier() == SS.getScopeRep()); QualType Result = getSema().Context.getDependentTemplateSpecializationType( - TL.getTypePtr()->getKeyword(), DTN->getQualifier(), - DTN->getIdentifier(), NewTemplateArgs.arguments()); + TL.getTypePtr()->getKeyword(), *DTN, NewTemplateArgs.arguments()); DependentTemplateSpecializationTypeLoc NewTL = TLB.push(Result); @@ -7755,15 +7749,15 @@ QualType TreeTransform:: return QualType(); } - return getDerived() - .TransformDependentTemplateSpecializationType(TLB, TL, QualifierLoc); + CXXScopeSpec SS; + SS.Adopt(QualifierLoc); + return getDerived().TransformDependentTemplateSpecializationType(TLB, TL, SS); } -template -QualType TreeTransform:: -TransformDependentTemplateSpecializationType(TypeLocBuilder &TLB, - DependentTemplateSpecializationTypeLoc TL, - NestedNameSpecifierLoc QualifierLoc) { +template +QualType TreeTransform::TransformDependentTemplateSpecializationType( + TypeLocBuilder &TLB, DependentTemplateSpecializationTypeLoc TL, + CXXScopeSpec &SS) { const DependentTemplateSpecializationType *T = TL.getTypePtr(); TemplateArgumentListInfo NewTemplateArgs; @@ -7777,13 +7771,25 @@ TransformDependentTemplateSpecializationType(TypeLocBuilder &TLB, NewTemplateArgs)) return QualType(); - QualType Result = getDerived().RebuildDependentTemplateSpecializationType( - T->getKeyword(), QualifierLoc, TL.getTemplateKeywordLoc(), - T->getIdentifier(), TL.getTemplateNameLoc(), NewTemplateArgs, - /*AllowInjectedClassName*/ false); - if (Result.isNull()) - return QualType(); + const DependentTemplateStorage &DTN = T->getDependentTemplateName(); + QualType Result = TL.getType(); + if (getDerived().AlwaysRebuild() || SS.getScopeRep() != DTN.getQualifier()) { + TemplateName Name = getDerived().RebuildTemplateName( + SS, TL.getTemplateKeywordLoc(), DTN.getName(), TL.getTemplateNameLoc(), + /*ObjectType=*/QualType(), /*FirstQualifierInScope=*/nullptr, + /*AllowInjectedClassName=*/false); + if (Name.isNull()) + return QualType(); + Result = 
getDerived().RebuildDependentTemplateSpecializationType( + T->getKeyword(), SS.getScopeRep(), TL.getTemplateKeywordLoc(), Name, + TL.getTemplateNameLoc(), NewTemplateArgs, + /*AllowInjectedClassName=*/false); + if (Result.isNull()) + return QualType(); + } + + NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(SemaRef.Context); if (const ElaboratedType *ElabT = dyn_cast(Result)) { QualType NamedT = ElabT->getNamedType(); @@ -7801,7 +7807,8 @@ TransformDependentTemplateSpecializationType(TypeLocBuilder &TLB, ElaboratedTypeLoc NewTL = TLB.push(Result); NewTL.setElaboratedKeywordLoc(TL.getElaboratedKeywordLoc()); NewTL.setQualifierLoc(QualifierLoc); - } else if (isa(Result)) { + } else { + assert(isa(Result)); DependentTemplateSpecializationTypeLoc SpecTL = TLB.push(Result); SpecTL.setElaboratedKeywordLoc(TL.getElaboratedKeywordLoc()); @@ -7812,15 +7819,6 @@ TransformDependentTemplateSpecializationType(TypeLocBuilder &TLB, SpecTL.setRAngleLoc(TL.getRAngleLoc()); for (unsigned I = 0, E = NewTemplateArgs.size(); I != E; ++I) SpecTL.setArgLocInfo(I, NewTemplateArgs[I].getLocInfo()); - } else { - TemplateSpecializationTypeLoc SpecTL - = TLB.push(Result); - SpecTL.setTemplateKeywordLoc(TL.getTemplateKeywordLoc()); - SpecTL.setTemplateNameLoc(TL.getTemplateNameLoc()); - SpecTL.setLAngleLoc(TL.getLAngleLoc()); - SpecTL.setRAngleLoc(TL.getRAngleLoc()); - for (unsigned I = 0, E = NewTemplateArgs.size(); I != E; ++I) - SpecTL.setArgLocInfo(I, NewTemplateArgs[I].getLocInfo()); } return Result; } @@ -17532,8 +17530,7 @@ TreeTransform::RebuildCXXPseudoDestructorExpr(Expr *Base, << ScopeType->getType() << getSema().getLangOpts().CPlusPlus; return ExprError(); } - SS.Extend(SemaRef.Context, SourceLocation(), ScopeType->getTypeLoc(), - CCLoc); + SS.Extend(SemaRef.Context, ScopeType->getTypeLoc(), CCLoc); } SourceLocation TemplateKWLoc; // FIXME: retrieve it from caller. diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 0cd2cedb48dd9..58a57d6c54523 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9914,18 +9914,12 @@ ASTRecordReader::readNestedNameSpecifierLoc() { break; } - case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: { - bool Template = readBool(); + case NestedNameSpecifier::TypeSpec: { TypeSourceInfo *T = readTypeSourceInfo(); if (!T) return NestedNameSpecifierLoc(); SourceLocation ColonColonLoc = readSourceLocation(); - - // FIXME: 'template' keyword location not saved anywhere, so we fake it. - Builder.Extend(Context, - Template? 
T->getTypeLoc().getBeginLoc() : SourceLocation(), - T->getTypeLoc(), ColonColonLoc); + Builder.Extend(Context, T->getTypeLoc(), ColonColonLoc); break; } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 99ac26cb43cac..84f7f2bc5fce4 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7022,8 +7022,6 @@ void ASTRecordWriter::AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) { break; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: - Record->push_back(Kind == NestedNameSpecifier::TypeSpecWithTemplate); AddTypeRef(NNS.getTypeLoc().getType()); AddTypeLoc(NNS.getTypeLoc()); AddSourceLocation(NNS.getLocalSourceRange().getEnd()); diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index 3e50d67f4d6ef..8b746c02dbfc9 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -959,8 +959,6 @@ class BuildTreeVisitor : public RecursiveASTVisitor { case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Identifier: return syntax::NodeKind::IdentifierNameSpecifier; - case NestedNameSpecifier::TypeSpecWithTemplate: - return syntax::NodeKind::SimpleTemplateNameSpecifier; case NestedNameSpecifier::TypeSpec: { const auto *NNSType = NNS.getAsType(); assert(NNSType); diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp index d79051fb6efaa..08d8fba795daa 100644 --- a/clang/test/AST/ast-dump-decl.cpp +++ b/clang/test/AST/ast-dump-decl.cpp @@ -351,8 +351,8 @@ namespace testClassTemplateDecl { // CHECK-NEXT: | |-CXXDestructorDecl 0x{{.+}} col:5 used ~TestClassTemplate 'void () noexcept' implicit_instantiation instantiated_from 0x[[#TEMPLATE_DESTRUCTOR_DECL]]{{$}} // CHECK-NEXT: | |-CXXMethodDecl 0x{{.+}} col:9 j 'int ()' implicit_instantiation instantiated_from 0x[[#TEMPLATE_METHOD_DECL]]{{$}} // CHECK-NEXT: | |-FieldDecl 0x{{.+}} col:9 i 'int'{{$}} -// CHECK-NEXT: | `-CXXConstructorDecl 0x{{.+}} col:30 implicit constexpr TestClassTemplate 'void (const TestClassTemplate &)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} -// CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:30 'const TestClassTemplate &'{{$}} +// CHECK-NEXT: | `-CXXConstructorDecl 0x{{.+}} col:30 implicit constexpr TestClassTemplate 'void (const TestClassTemplate &)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} +// CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:30 'const TestClassTemplate &'{{$}} // CHECK-NEXT: |-ClassTemplateSpecialization 0x{{.+}} 'TestClassTemplate'{{$}} // CHECK-NEXT: |-ClassTemplateSpecialization 0x{{.+}} 'TestClassTemplate'{{$}} // CHECK-NEXT: `-ClassTemplateSpecialization 0x{{.+}} 'TestClassTemplate'{{$}} @@ -654,10 +654,10 @@ namespace testCanonicalTemplate { // CHECK-NEXT: | `-ClassTemplateSpecialization 0x{{.+}} 'TestClassTemplate'{{$}} // CHECK-NEXT: |-CXXConstructorDecl 0x{{.+}} col:31 implicit used constexpr TestClassTemplate 'void () noexcept' inline default trivial{{$}} // CHECK-NEXT: | `-CompoundStmt 0x{{.+}} {{$}} - // CHECK-NEXT: |-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate 'void (const TestClassTemplate &)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} - // CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:31 'const TestClassTemplate &'{{$}} - // CHECK-NEXT: `-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate 'void (TestClassTemplate &&)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} - // 
CHECK-NEXT: `-ParmVarDecl 0x{{.+}} col:31 'TestClassTemplate &&'{{$}} + // CHECK-NEXT: |-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate 'void (const TestClassTemplate &)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} + // CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:31 'const TestClassTemplate &'{{$}} + // CHECK-NEXT: `-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate 'void (TestClassTemplate &&)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} + // CHECK-NEXT: `-ParmVarDecl 0x{{.+}} col:31 'TestClassTemplate &&'{{$}} template class TestClassTemplate2; @@ -682,10 +682,10 @@ namespace testCanonicalTemplate { // CHECK-NEXT: |-CXXRecordDecl 0x{{.+}} col:31 implicit class TestClassTemplate2{{$}} // CHECK-NEXT: |-CXXConstructorDecl 0x{{.+}} col:31 implicit used constexpr TestClassTemplate2 'void () noexcept' inline default trivial{{$}} // CHECK-NEXT: | `-CompoundStmt 0x{{.+}} {{$}} - // CHECK-NEXT: |-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate2 'void (const TestClassTemplate2 &)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} - // CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:31 'const TestClassTemplate2 &'{{$}} - // CHECK-NEXT: `-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate2 'void (TestClassTemplate2 &&)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} - // CHECK-NEXT: `-ParmVarDecl 0x{{.+}} col:31 'TestClassTemplate2 &&'{{$}} + // CHECK-NEXT: |-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate2 'void (const TestClassTemplate2 &)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} + // CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:31 'const TestClassTemplate2 &'{{$}} + // CHECK-NEXT: `-CXXConstructorDecl 0x{{.+}} col:31 implicit constexpr TestClassTemplate2 'void (TestClassTemplate2 &&)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} + // CHECK-NEXT: `-ParmVarDecl 0x{{.+}} col:31 'TestClassTemplate2 &&'{{$}} // CHECK: ClassTemplateDecl 0x{{.+}} prev 0x{{.+}} <{{.+}}:[[@LINE-26]]:3, col:31> col:31 TestClassTemplate2{{$}} // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:21 typename depth 0 index 0 T1{{$}} diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp index 5da025c229ea3..2efd0b5e8ac21 100644 --- a/clang/test/AST/ast-dump-expr.cpp +++ b/clang/test/AST/ast-dump-expr.cpp @@ -229,11 +229,10 @@ void PostfixExpressions(S a, S *p, U *r) { // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'S *' - // FIXME: there is no mention that this used the template keyword. 
r->template U::~U(); // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'void' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' ->~U 0x{{[^ ]*}} - // CHECK-NEXT: NestedNameSpecifier TypeSpecWithTemplate 'template U':'U' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'template U':'U' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'U *' lvalue ParmVar 0x{{[^ ]*}} 'r' 'U *' diff --git a/clang/test/AST/ast-dump-templates.cpp b/clang/test/AST/ast-dump-templates.cpp index fb0132bf7dbc2..d6982a0927e8c 100644 --- a/clang/test/AST/ast-dump-templates.cpp +++ b/clang/test/AST/ast-dump-templates.cpp @@ -174,7 +174,7 @@ namespace TestDependentMemberPointer { // DUMP-NEXT: | `-BuiltinType {{.+}} 'int' // DUMP-NEXT: `-TypeAliasDecl {{.+}} Z 'int U::template V::*'{{$}} // DUMP-NEXT: `-MemberPointerType {{.+}} 'int U::template V::*' dependent -// DUMP-NEXT: |-DependentTemplateSpecializationType {{.+}} 'U::template V' dependent +// DUMP-NEXT: |-DependentTemplateSpecializationType {{.+}} 'template V' dependent // DUMP-NEXT: `-BuiltinType {{.+}} 'int' } // namespace TestDependentMemberPointer @@ -6588,7 +6588,7 @@ namespace TestDependentMemberPointer { // JSON-NEXT: "tokLen": 9 // JSON-NEXT: }, // JSON-NEXT: "end": { -// JSON-NEXT: "offset": 6359, +// JSON-NEXT: "offset": 6356, // JSON-NEXT: "line": 179, // JSON-NEXT: "col": 1, // JSON-NEXT: "tokLen": 1 @@ -6896,7 +6896,7 @@ namespace TestDependentMemberPointer { // JSON-NEXT: "id": "0x{{.*}}", // JSON-NEXT: "kind": "DependentTemplateSpecializationType", // JSON-NEXT: "type": { -// JSON-NEXT: "qualType": "U::template V" +// JSON-NEXT: "qualType": "template V" // JSON-NEXT: }, // JSON-NEXT: "isDependent": true, // JSON-NEXT: "isInstantiationDependent": true diff --git a/clang/test/CXX/class.access/p6.cpp b/clang/test/CXX/class.access/p6.cpp index 6f266728faa6b..15f2644f6ac1d 100644 --- a/clang/test/CXX/class.access/p6.cpp +++ b/clang/test/CXX/class.access/p6.cpp @@ -92,7 +92,7 @@ namespace test3 { template class Outer::A { public: - static void foo(); // expected-note {{'Outer::A::foo' declared here}} + static void foo(); // expected-note {{'Outer::A::foo' declared here}} }; class B { @@ -102,7 +102,7 @@ namespace test3 { void test() { Outer::A::foo(); - Outer::A::foo(); // expected-error {{no member named 'foo' in 'test3::Outer::A'; did you mean 'Outer::A::foo'?}} + Outer::A::foo(); // expected-error {{no member named 'foo' in 'test3::Outer::A'; did you mean 'Outer::A::foo'?}} } } diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index b621318a9ce41..b2ae8f88ead74 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -98,8 +98,8 @@ template class Templ { // #cwg203-ex3-Templ void foo() { Templ x(&Derived::func); } // expected-error@-1 {{no matching constructor for initialization of 'Templ'}} -// expected-note@#cwg203-ex3-Templ {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int (Derived::*)() const' (aka 'int (Base::*)() const') to 'const Templ' for 1st argument}} -// since-cxx11-note@#cwg203-ex3-Templ {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int (Derived::*)() const' (aka 'int (Base::*)() const') to 'Templ' for 1st argument}} +// expected-note@#cwg203-ex3-Templ {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int (Derived::*)() const' (aka 'int (Base::*)() const') to 'const Templ' for 1st argument}} +// since-cxx11-note@#cwg203-ex3-Templ {{candidate constructor 
(the implicit move constructor) not viable: no known conversion from 'int (Derived::*)() const' (aka 'int (Base::*)() const') to 'Templ' for 1st argument}} // expected-note@#cwg203-ex3-Templ-ctor {{candidate template ignored: could not match 'cwg203::ex3::Derived' against 'cwg203::ex3::Base'}} } // namespace ex3 diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp index 763d983d20f61..651cca927d513 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ -150,7 +150,7 @@ void func() { // expected-note@#bar {{while substituting template arguments into constraint expression here}} // expected-note@#bar {{while checking the satisfaction of nested requirement requested here}} // expected-note@#bar {{candidate template ignored: constraints not satisfied [with T = False]}} - // expected-note@#bar {{because 'X::value' evaluated to false}} + // expected-note@#bar {{because 'X::value' evaluated to false}} bar(); // expected-note@-1 {{while checking constraint satisfaction for template 'bar' required here}} \ diff --git a/clang/test/SemaCXX/static-assert.cpp b/clang/test/SemaCXX/static-assert.cpp index 0d384b6b499f7..bf6a2eeb432a3 100644 --- a/clang/test/SemaCXX/static-assert.cpp +++ b/clang/test/SemaCXX/static-assert.cpp @@ -196,8 +196,10 @@ struct NestedTemplates1 { template void foo2() { + // FIXME: Here the template keyword is dropped because the failed condition + // for a static assert is always printed with canonical types. static_assert(::ns::NestedTemplates1::NestedTemplates2::template NestedTemplates3::value, "message"); - // expected-error@-1{{static assertion failed due to requirement '::ns::NestedTemplates1::NestedTemplates2::template NestedTemplates3::value': message}} + // expected-error@-1{{static assertion failed due to requirement '::ns::NestedTemplates1::NestedTemplates2::NestedTemplates3::value': message}} } template void foo2(); // expected-note@-1{{in instantiation of function template specialization 'foo2' requested here}} diff --git a/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp b/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp index 0854ac9178b4f..2ecd2694ce09d 100644 --- a/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp +++ b/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp @@ -331,7 +331,7 @@ namespace DeduceArity { // CHECK: | |-ParmVarDecl {{.*}} 'Types' // CHECK: | `-ParmVarDecl {{.*}} 'T...' 
pack // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit used - // CHECK-SAME: 'auto (Types, DeduceArity::X, DeduceArity::Y, DeduceArity::Z) -> + // CHECK-SAME: 'auto (Types, DeduceArity::X, DeduceArity::Y, DeduceArity::Z) -> // CHECK-SAME: DeduceArity::F' // CHECK: | |-TemplateArgument pack // CHECK: | | |-TemplateArgument type 'DeduceArity::X' @@ -343,16 +343,16 @@ namespace DeduceArity { // CHECK: | | `-TemplateArgument type 'DeduceArity::Z' // CHECK: | | `-RecordType {{.*}} 'DeduceArity::Z' // CHECK: | | `-CXXRecord {{.*}} 'Z' - // CHECK: | |-ParmVarDecl {{.*}} 'Types':'DeduceArity::Types' + // CHECK: | |-ParmVarDecl {{.*}} 'Types':'DeduceArity::Types' // CHECK: | |-ParmVarDecl {{.*}} 'DeduceArity::X' // CHECK: | |-ParmVarDecl {{.*}} 'DeduceArity::Y' // CHECK: | `-ParmVarDecl {{.*}} 'DeduceArity::Z' - // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit 'auto (Types, DeduceArity::X) -> DeduceArity::F' + // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit 'auto (Types, DeduceArity::X) -> DeduceArity::F' // CHECK: |-TemplateArgument pack // CHECK: | `-TemplateArgument type 'DeduceArity::X' // CHECK: | `-RecordType {{.*}} 'DeduceArity::X' // CHECK: | `-CXXRecord {{.*}} 'X' - // CHECK: |-ParmVarDecl {{.*}} 'Types':'DeduceArity::Types' + // CHECK: |-ParmVarDecl {{.*}} 'Types':'DeduceArity::Types' // CHECK: `-ParmVarDecl {{.*}} 'DeduceArity::X' // CHECK: FunctionProtoType {{.*}} 'auto (Types, T...) -> F' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'F' dependent diff --git a/clang/test/SemaTemplate/dependent-template-recover.cpp b/clang/test/SemaTemplate/dependent-template-recover.cpp index c7e27e8da25f1..251a8f9816417 100644 --- a/clang/test/SemaTemplate/dependent-template-recover.cpp +++ b/clang/test/SemaTemplate/dependent-template-recover.cpp @@ -134,3 +134,21 @@ namespace PR9401 { const D::template B::template E > > A::B::a = typename C::template B::template E >(g); } + +namespace templ_spec { + template using A = void; // expected-note 2{{template parameter is declared here}} + template struct B { + A> t1; + // expected-error@-1 {{'A>' (aka 'void')}} + + A> t2; // expected-error {{use 'template' keyword}} + // expected-error@-1 {{'A>' (aka 'void')}} + + // FIXME: Why error recovery for the non-typename case is so bad? 
+ A> t3; // expected-error {{did you forget 'typename'}} + // expected-error@-1 {{'A' (aka 'void')}} + + A> t4; // expected-error {{use 'template' keyword}} expected-error {{did you forget 'typename'}} + // expected-error@-1 {{'A' (aka 'void')}} + }; +} // namespace templ_spec diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp index a1f5456156a06..ab5fac1f9e63e 100644 --- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp +++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp @@ -72,8 +72,8 @@ namespace type_requirement { template requires false_v; }> - // expected-note@-1 {{because 'false_v::template temp >; }>' evaluated to false}} - // expected-note@-2 {{because 'false_v::template temp >; }>' evaluated to false}} + // expected-note@-1 {{because 'false_v::template temp >; }>' evaluated to false}} + // expected-note@-2 {{because 'false_v::template temp >; }>' evaluated to false}} struct r2 {}; using r2i1 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template]}} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index e498a875bbbe8..197ba2cd6856e 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1457,7 +1457,6 @@ bool CursorVisitor::VisitNestedNameSpecifier(NestedNameSpecifier *NNS, break; } - case NestedNameSpecifier::TypeSpecWithTemplate: case NestedNameSpecifier::Global: case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Super: @@ -1492,7 +1491,6 @@ bool CursorVisitor::VisitNestedNameSpecifierLoc( break; case NestedNameSpecifier::TypeSpec: - case NestedNameSpecifier::TypeSpecWithTemplate: if (Visit(Q.getTypeLoc())) return true; diff --git a/libcxx/test/std/containers/sequences/array/array.overview/nttp.verify.cpp b/libcxx/test/std/containers/sequences/array/array.overview/nttp.verify.cpp index 3eb8e2596f85b..f50febf5f2485 100644 --- a/libcxx/test/std/containers/sequences/array/array.overview/nttp.verify.cpp +++ b/libcxx/test/std/containers/sequences/array/array.overview/nttp.verify.cpp @@ -61,7 +61,7 @@ using E = test{}>; // expected-error@-1 {{non-type template parameter has non-literal type 'std::array'}} using F = test{}>; -// expected-error@-1 {{type 'std::array' (aka 'std::array') of non-type template parameter is not a structural type}} +// expected-error-re@-1 {{type 'std::array<{{(std::)?}}string, 2>'{{( \(aka 'std::array'\))?}} of non-type template parameter is not a structural type}} } // namespace test_ctad namespace test_auto { diff --git a/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.verify.cpp b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.verify.cpp index d6d70782c5eb5..fba0f61b76060 100644 --- a/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.verify.cpp +++ b/libcxx/test/std/utilities/smartptr/adapt/inout_ptr/inout_ptr.verify.cpp @@ -26,7 +26,7 @@ int main(int, char**) { // expected-error-re@*:* {{static assertion failed due to requirement {{.*}}std::shared_ptr<> is not supported}} std::ignore = std::inout_ptr(sPtr); - // expected-error@*:* {{no matching conversion for functional-style cast from 'std::shared_ptr' to 'std::inout_ptr_t, _Ptr>' (aka 'inout_ptr_t, int *>'}} + // expected-error-re@*:* {{no matching conversion for functional-style cast from 'std::shared_ptr' to 'std::inout_ptr_t<{{(std::)?}}shared_ptr, _Ptr>'{{( \(aka 'inout_ptr_t, int *>')?}}}} std::ignore = std::inout_ptr(sPtr); } diff --git 
a/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.verify.cpp b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.verify.cpp index 1fe78ecb22789..da3ffbba94a8f 100644 --- a/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.verify.cpp +++ b/libcxx/test/std/utilities/smartptr/adapt/out_ptr/out_ptr.verify.cpp @@ -26,7 +26,7 @@ int main(int, char**) { // expected-error-re@*:* {{static assertion failed due to requirement {{.*}}Using std::shared_ptr<> without a deleter in std::out_ptr is not supported.}} std::ignore = std::out_ptr(sPtr); - // expected-error@*:* {{no matching conversion for functional-style cast from 'std::shared_ptr' to 'std::out_ptr_t, _Ptr>' (aka 'out_ptr_t, int *>')}} + // expected-error-re@*:* {{no matching conversion for functional-style cast from 'std::shared_ptr' to 'std::out_ptr_t<{{(std::)?}}shared_ptr, _Ptr>'{{( \(aka 'out_ptr_t, int *>'\))?}}}} std::ignore = std::out_ptr(sPtr); } diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/nttp.verify.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/nttp.verify.cpp index b9b5432a30b21..9a39c18f138fb 100644 --- a/libcxx/test/std/utilities/utility/pairs/pairs.pair/nttp.verify.cpp +++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/nttp.verify.cpp @@ -74,7 +74,7 @@ using H = test{}>; // expected-error@-1 {{non-type template parameter has non-literal type 'std::pair'}} using I = test{}>; -// expected-error@-1 {{type 'std::pair' (aka 'std::pair') of non-type template parameter is not a structural type}} +// expected-error-re@-1 {{type 'std::pair<{{(std::)?}}string, {{(std::)?}}string>'{{( \(aka 'std::pair'\))?}} of non-type template parameter is not a structural type}} } // namespace test_ctad namespace test_auto { From adba14acea99cc6a17d837763a3248c9d4a2fadf Mon Sep 17 00:00:00 2001 From: Matthias Braun Date: Tue, 1 Apr 2025 13:20:43 -0700 Subject: [PATCH 0300/1029] Stop using __attribute__((retain)) in GCC builds (#133793) GCC sometimes produces warnings about `__attribute__((retain))` despite `__has_attribute(retain)` being 1. See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99587 The number of users who benefit from the attribute is probably very small compared to the number of `-Werror`-enabled builds or the desire to keep `-Wattributes` enabled in the LLVM build. So, for now, drop usage of the `retain` attribute in GCC builds. --- llvm/include/llvm/Support/Compiler.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h index ff6f5e44ae2f1..d3772896069cc 100644 --- a/llvm/include/llvm/Support/Compiler.h +++ b/llvm/include/llvm/Support/Compiler.h @@ -230,7 +230,11 @@ #define LLVM_ATTRIBUTE_USED #endif -#if __has_attribute(retain) +// Only enabled for clang: +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99587 +// GCC may produce "warning: ‘retain’ attribute ignored" (despite +// __has_attribute(retain) being 1). +#if defined(__clang__) && __has_attribute(retain) #define LLVM_ATTRIBUTE_RETAIN __attribute__((__retain__)) #else #define LLVM_ATTRIBUTE_RETAIN From 537b6541e8067d7ef7aa38791989fca6303b7fdf Mon Sep 17 00:00:00 2001 From: Amy Huang Date: Tue, 1 Apr 2025 13:32:43 -0700 Subject: [PATCH 0301/1029] Fix libc BUILD.bazel after commit 8741412 (#133980) Recent changes add dependencies to some atan functions. Edit the Bazel build file to look more like the CMake file.
See https://github.com/llvm/llvm-project/commit/8741412bdfc0a60719f116add7d828694ef48c02 --- .../bazel/llvm-project-overlay/libc/BUILD.bazel | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 77aa75362c71d..5298c625c5d66 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1914,6 +1914,19 @@ libc_support_library( ], ) +libc_support_library( + name = "atan_utils", + hdrs = ["src/math/generic/atan_utils.h"], + deps = [ + ":__support_integer_literals", + ":__support_fputil_double_double", + ":__support_fputil_dyadic_float", + ":__support_fputil_multiply_add", + ":__support_fputil_polyeval", + ":__support_macros_optimization", + ], +) + libc_support_library( name = "log_range_reduction", hdrs = ["src/math/generic/log_range_reduction.h"], @@ -2313,7 +2326,7 @@ libc_math_function( ":__support_fputil_double_double", ":__support_fputil_nearest_integer", ":__support_macros_optimization", - ":inv_trigf_utils", + ":atan_utils", ], ) @@ -2331,7 +2344,7 @@ libc_math_function( additional_deps = [ ":__support_fputil_double_double", ":__support_fputil_nearest_integer", - ":inv_trigf_utils", + ":atan_utils", ], ) From ec59313c0416018dc008c586478b825fe42bf323 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 1 Apr 2025 21:45:00 +0100 Subject: [PATCH 0302/1029] [EquivalenceClasses] Use range-based for loops (NFC). --- llvm/include/llvm/ADT/EquivalenceClasses.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index 4f692052847aa..46f186a71f5ce 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -146,9 +146,9 @@ class EquivalenceClasses { EquivalenceClasses &operator=(const EquivalenceClasses &RHS) { TheMapping.clear(); - for (iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) - if (I->isLeader()) { - member_iterator MI = RHS.member_begin(*I); + for (const auto &E : RHS) + if (E.isLeader()) { + member_iterator MI = RHS.member_begin(E); member_iterator LeaderIt = member_begin(insert(*MI)); for (++MI; MI != member_end(); ++MI) unionSets(LeaderIt, member_begin(insert(*MI))); @@ -207,8 +207,9 @@ class EquivalenceClasses { /// Note that this is a linear time operation. 
unsigned getNumClasses() const { unsigned NC = 0; - for (iterator I = begin(), E = end(); I != E; ++I) - if (I->isLeader()) ++NC; + for (const auto &E : *this) + if (E.isLeader()) + ++NC; return NC; } From bb179c483aa2f709a4e53a8c8860f90b8e2e6f47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 1 Apr 2025 13:50:42 -0700 Subject: [PATCH 0303/1029] [flang][rt] Allow ReportFatalUserError to be built on device (#133979) --- flang-rt/lib/runtime/stop.cpp | 2 +- flang/include/flang/Runtime/stop.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flang-rt/lib/runtime/stop.cpp b/flang-rt/lib/runtime/stop.cpp index 69a99e6c28955..c4d25813d5e93 100644 --- a/flang-rt/lib/runtime/stop.cpp +++ b/flang-rt/lib/runtime/stop.cpp @@ -240,7 +240,7 @@ static RT_NOINLINE_ATTR void PrintBacktrace() { RT_OPTNONE_ATTR void FORTRAN_PROCEDURE_NAME(backtrace)() { PrintBacktrace(); } -[[noreturn]] void RTNAME(ReportFatalUserError)( +[[noreturn]] RT_API_ATTRS void RTNAME(ReportFatalUserError)( const char *message, const char *source, int line) { Fortran::runtime::Terminator{source, line}.Crash(message); } diff --git a/flang/include/flang/Runtime/stop.h b/flang/include/flang/Runtime/stop.h index 02bce65765907..4ddc5cf49ec8f 100644 --- a/flang/include/flang/Runtime/stop.h +++ b/flang/include/flang/Runtime/stop.h @@ -35,7 +35,7 @@ void FORTRAN_PROCEDURE_NAME(backtrace)(NO_ARGUMENTS); // Crash with an error message when the program dynamically violates a Fortran // constraint. -NORETURN void RTNAME(ReportFatalUserError)( +NORETURN RT_API_ATTRS void RTNAME(ReportFatalUserError)( const char *message, const char *source, int line); FORTRAN_EXTERN_C_END From 16603d838c0dfa6881f9b8118c5a0b5ac2789752 Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Tue, 1 Apr 2025 16:59:46 -0400 Subject: [PATCH 0304/1029] [HLSL] Add SPIR-V target type for RWStructuredBuffers (#133468) This PR adds the target type for the main storage of HLSL raw buffer types. It does not handle the counter variables that are associated with those buffers. This implements part of https://github.com/llvm/wg-hlsl/blob/main/proposals/0018-spirv-resource-representation.md. We do not handle other HLSL raw buffer types.
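For illustration, the handle for a raw buffer is lowered to a `spirv.VulkanBuffer` target extension type that wraps a runtime array of the element type plus two integer parameters: the storage class (12, StorageBuffer) and a writability flag. A minimal sketch of that construction (the helper name is hypothetical; the type string and parameters follow the change below):

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdint>

// Sketch: build the SPIR-V handle type for an HLSL raw buffer.
static llvm::TargetExtType *getVulkanBufferType(llvm::LLVMContext &Ctx,
                                                llvm::Type *ElemType,
                                                bool IsWritable) {
  // The buffer contents are modeled as a runtime array of the element type.
  llvm::ArrayType *RuntimeArrayType = llvm::ArrayType::get(ElemType, 0);
  uint32_t StorageClass = 12; // SPIR-V StorageBuffer storage class.
  return llvm::TargetExtType::get(Ctx, "spirv.VulkanBuffer",
                                  {RuntimeArrayType},
                                  {StorageClass, IsWritable});
}
```

With this, `RWStructuredBuffer<float>` lowers to `target("spirv.VulkanBuffer", [0 x float], 12, 1)`, matching the CHECK-SPIRV lines in the updated test.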
--- clang/lib/CodeGen/Targets/SPIR.cpp | 15 ++-- .../StructuredBuffers-constructors.hlsl | 69 +++++++++++-------- 2 files changed, 51 insertions(+), 33 deletions(-) diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 43f511e572d37..225d9dfbd980b 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -386,14 +386,21 @@ llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType( if (ContainedTy.isNull()) return nullptr; - assert(!ResAttrs.RawBuffer && - "Raw buffers handles are not implemented for SPIR-V yet"); assert(!ResAttrs.IsROV && "Rasterizer order views not implemented for SPIR-V yet"); - // convert element type llvm::Type *ElemType = CGM.getTypes().ConvertType(ContainedTy); - return getSPIRVImageTypeFromHLSLResource(ResAttrs, ElemType, Ctx); + if (!ResAttrs.RawBuffer) { + // convert element type + return getSPIRVImageTypeFromHLSLResource(ResAttrs, ElemType, Ctx); + } + + llvm::ArrayType *RuntimeArrayType = llvm::ArrayType::get(ElemType, 0); + uint32_t StorageClass = /* StorageBuffer storage class */ 12; + bool IsWritable = ResAttrs.ResourceClass == llvm::dxil::ResourceClass::UAV; + return llvm::TargetExtType::get(Ctx, "spirv.VulkanBuffer", + {RuntimeArrayType}, + {StorageClass, IsWritable}); } case llvm::dxil::ResourceClass::CBuffer: llvm_unreachable("CBuffer handles are not implemented for SPIR-V yet"); diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl index 04534c5550252..8a1429fd1a6fc 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl @@ -1,59 +1,70 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV +// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -DSPIRV -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -// NOTE: SPIRV codegen for resource types is not yet implemented StructuredBuffer Buf : register(t10); RWStructuredBuffer Buf2 : register(u5, space1); + +#ifndef SPIRV +// NOTE: SPIRV codegen for these resource types is not implemented yet. 
AppendStructuredBuffer Buf3 : register(u3); ConsumeStructuredBuffer Buf4 : register(u4); RasterizerOrderedStructuredBuffer Buf5 : register(u1, space2); +#endif + +// CHECK-DXIL: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } +// CHECK-DXIL: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK-DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK-DXIL: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } +// CHECK-DXIL: %"class.hlsl::RasterizerOrderedStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 1) } + +// CHECK-SPIRV: %"class.hlsl::StructuredBuffer" = type { target("spirv.VulkanBuffer", [0 x float], 12, 0) } +// CHECK-SPIRV: %"class.hlsl::RWStructuredBuffer" = type { target("spirv.VulkanBuffer", [0 x float], 12, 1) } -// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) } -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) } -// CHECK: %"class.hlsl::RasterizerOrderedStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 1) } -// CHECK: @_ZL3Buf = internal global %"class.hlsl::StructuredBuffer" poison, align 4 -// CHECK: @_ZL4Buf2 = internal global %"class.hlsl::RWStructuredBuffer" poison, align 4 -// CHECK: @_ZL4Buf3 = internal global %"class.hlsl::AppendStructuredBuffer" poison, align 4 -// CHECK: @_ZL4Buf4 = internal global %"class.hlsl::ConsumeStructuredBuffer" poison, align 4 -// CHECK: @_ZL4Buf5 = internal global %"class.hlsl::RasterizerOrderedStructuredBuffer" poison, align 4 +// CHECK: @_ZL3Buf = internal global %"class.hlsl::StructuredBuffer" poison +// CHECK: @_ZL4Buf2 = internal global %"class.hlsl::RWStructuredBuffer" poison +// CHECK-DXIL: @_ZL4Buf3 = internal global %"class.hlsl::AppendStructuredBuffer" poison, align 4 +// CHECK-DXIL: @_ZL4Buf4 = internal global %"class.hlsl::ConsumeStructuredBuffer" poison, align 4 +// CHECK-DXIL: @_ZL4Buf5 = internal global %"class.hlsl::RasterizerOrderedStructuredBuffer" poison, align 4 // CHECK: define internal void @_init_resource__ZL3Buf() // CHECK-DXIL: [[H:%.*]] = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_0_0t(i32 0, i32 10, i32 1, i32 0, i1 false) // CHECK-DXIL: store target("dx.RawBuffer", float, 0, 0) [[H]], ptr @_ZL3Buf, align 4 +// CHECK-SPIRV: [[H:%.*]] = call target("spirv.VulkanBuffer", [0 x float], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0f32_12_0t(i32 0, i32 10, i32 1, i32 0, i1 false) +// CHECK-SPIRV: store target("spirv.VulkanBuffer", [0 x float], 12, 0) [[H]], ptr @_ZL3Buf, align 8 // CHECK: define internal void @_init_resource__ZL4Buf2() // CHECK-DXIL: [[H:%.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 5, i32 1, i32 0, i1 false) // CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) [[H]], ptr @_ZL4Buf2, align 4 +// CHECK-SPIRV: [[H:%.*]] = call target("spirv.VulkanBuffer", [0 x float], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0f32_12_1t(i32 1, i32 5, i32 1, i32 0, i1 false) +// CHECK-SPIRV: store target("spirv.VulkanBuffer", [0 x float], 12, 1) [[H]], ptr @_ZL4Buf2, align 8 -// CHECK: define internal void 
@_init_resource__ZL4Buf3() +// CHECK-DXIL: define internal void @_init_resource__ZL4Buf3() // CHECK-DXIL: [[H:%.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 3, i32 1, i32 0, i1 false) // CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) [[H]], ptr @_ZL4Buf3, align 4 -// CHECK: define internal void @_init_resource__ZL4Buf4() +// CHECK-DXIL: define internal void @_init_resource__ZL4Buf4() // CHECK-DXIL: [[H:%.*]] = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 4, i32 1, i32 0, i1 false) // CHECK-DXIL: store target("dx.RawBuffer", float, 1, 0) [[H]], ptr @_ZL4Buf4, align 4 -// CHECK: define internal void @_init_resource__ZL4Buf5() +// CHECK-DXIL: define internal void @_init_resource__ZL4Buf5() // CHECK-DXIL: [[H:%.*]] = call target("dx.RawBuffer", float, 1, 1) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_1t(i32 2, i32 1, i32 1, i32 0, i1 false) // CHECK-DXIL: store target("dx.RawBuffer", float, 1, 1) [[H]], ptr @_ZL4Buf5, align 4 -// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK-NEXT: entry: -// CHECK: define linkonce_odr void @_ZN4hlsl23ConsumeStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) -// CHECK: define linkonce_odr void @_ZN4hlsl33RasterizerOrderedStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2Ev(ptr noundef nonnull align {{[48]}} dereferenceable({{[48]}}) %this) // CHECK-NEXT: entry: +// CHECK-DXIL: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK-DXIL-NEXT: entry: +// CHECK-DXIL: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK-DXIL-NEXT: entry: +// CHECK-DXIL: define linkonce_odr void @_ZN4hlsl23ConsumeStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK-DXIL: define linkonce_odr void @_ZN4hlsl33RasterizerOrderedStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK-DXIL-NEXT: entry: -// CHECK: define internal void @_GLOBAL__sub_I_StructuredBuffers_constructors.hlsl() -// CHECK: call void @_init_resource__ZL3Buf() -// CHECK: call void @_init_resource__ZL4Buf2() -// CHECK: call void @_init_resource__ZL4Buf3() -// CHECK: call void @_init_resource__ZL4Buf4() -// CHECK: call void @_init_resource__ZL4Buf5() +// CHECK: define {{.*}} void @_GLOBAL__sub_I_StructuredBuffers_constructors.hlsl() +// CHECK: call {{.*}} @_init_resource__ZL3Buf() +// CHECK: call {{.*}} @_init_resource__ZL4Buf2() +// CHECK-DXIL: call void @_init_resource__ZL4Buf3() +// CHECK-DXIL: call void @_init_resource__ZL4Buf4() +// CHECK-DXIL: call void @_init_resource__ZL4Buf5() From 96d60c00e5ed5bddedad0eab83a089957a9cf388 Mon Sep 17 00:00:00 2001 From: AdityaK Date: Tue, 1 Apr 2025 14:10:17 -0700 Subject: [PATCH 0305/1029] [mlir][spirv] Verify matching of entry block arguments and function signature (#133167) Fixes: #132894 --- 
mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp | 18 ++++++++++++++++++ .../Dialect/SPIRV/IR/function-decorations.mlir | 17 +++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp index da9855b02860d..16e91b0cb2cfc 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVOps.cpp @@ -1021,6 +1021,24 @@ LogicalResult spirv::FuncOp::verifyType() { LogicalResult spirv::FuncOp::verifyBody() { FunctionType fnType = getFunctionType(); + if (!isExternal()) { + Block &entryBlock = front(); + + unsigned numArguments = this->getNumArguments(); + if (entryBlock.getNumArguments() != numArguments) + return emitOpError("entry block must have ") + << numArguments << " arguments to match function signature"; + + for (auto [index, fnArgType, blockArgType] : + llvm::enumerate(getArgumentTypes(), entryBlock.getArgumentTypes())) { + if (blockArgType != fnArgType) { + return emitOpError("type of entry block argument #") + << index << '(' << blockArgType + << ") must match the type of the corresponding argument in " + << "function signature(" << fnArgType << ')'; + } + } + } auto walkResult = walk([fnType](Operation *op) -> WalkResult { if (auto retOp = dyn_cast(op)) { diff --git a/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir b/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir index 07e187e6a7d68..f09767a416f6b 100644 --- a/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir +++ b/mlir/test/Dialect/SPIRV/IR/function-decorations.mlir @@ -73,3 +73,20 @@ spirv.func @no_decoration_name_attr(%arg0 : !spirv.ptr { spirv.decoration = #spirv.decoration, random_attr = #spirv.decoration }) "None" { spirv.Return } + +// ----- + +// expected-error @+1 {{'spirv.func' op entry block must have 1 arguments to match function signature}} +spirv.func @f(f32) "None" { + %c0 = arith.constant 0 : index + spirv.Return +} + +// ----- + +// expected-error @+1 {{'spirv.func' op type of entry block argument #0('f64') must match the type of the corresponding argument in function signature('f32')}} +spirv.func @f(f32) "None" { + ^bb0(%arg0: f64): + %c0 = arith.constant 0 : index + spirv.Return +} From 5ffd9bdb50b5753bbf668e4eab3647dfb46cd0d6 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 1 Apr 2025 14:12:44 -0700 Subject: [PATCH 0306/1029] [cmake] Refactor clang unittest cmake (#133545) Pass all the dependencies into add_clang_unittest. This is consistent with how it is done for LLDB. I borrowed the same named argument list structure from add_lldb_unittest. This is a necessary step towards consolidating unit tests into fewer binaries, but seems like a good refactoring in its own right. 
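For illustration, a unit test target declared under the new scheme looks
roughly like this (the FooTests target and file name are hypothetical; the
CLANG_LIBS/LINK_LIBS/LLVM_COMPONENTS keywords are the ones introduced below):

  add_clang_unittest(FooTests
    FooTest.cpp
    CLANG_LIBS
      clangAST
      clangBasic
    LINK_LIBS
      clangTesting
    LLVM_COMPONENTS
      Support
    )

CLANG_LIBS entries resolve to the clang dylib when CLANG_LINK_CLANG_DYLIB is
set and to the individual clang libraries otherwise, LINK_LIBS entries are
always linked directly, and LLVM_COMPONENTS is appended to
LLVM_LINK_COMPONENTS before add_unittest runs.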
--- clang/unittests/AST/ByteCode/CMakeLists.txt | 12 ++----- clang/unittests/AST/CMakeLists.txt | 23 ++++--------- clang/unittests/ASTMatchers/CMakeLists.txt | 22 ++++--------- .../ASTMatchers/Dynamic/CMakeLists.txt | 18 +++-------- clang/unittests/Analysis/CMakeLists.txt | 18 +++-------- .../Analysis/FlowSensitive/CMakeLists.txt | 18 +++-------- clang/unittests/Basic/CMakeLists.txt | 18 +++-------- clang/unittests/CMakeLists.txt | 32 ++++++++++++++++--- clang/unittests/CodeGen/CMakeLists.txt | 15 +++------ clang/unittests/CrossTU/CMakeLists.txt | 12 ++----- .../unittests/DirectoryWatcher/CMakeLists.txt | 11 ++----- clang/unittests/Driver/CMakeLists.txt | 19 ++++------- clang/unittests/Format/CMakeLists.txt | 11 ++----- clang/unittests/Frontend/CMakeLists.txt | 12 +++---- clang/unittests/Index/CMakeLists.txt | 13 +++----- clang/unittests/InstallAPI/CMakeLists.txt | 9 ++---- clang/unittests/Interpreter/CMakeLists.txt | 25 +++++++-------- .../Interpreter/ExceptionTests/CMakeLists.txt | 20 ++++++------ clang/unittests/Lex/CMakeLists.txt | 16 +++------- clang/unittests/Rewrite/CMakeLists.txt | 10 ++---- clang/unittests/Sema/CMakeLists.txt | 18 +++-------- clang/unittests/Serialization/CMakeLists.txt | 17 ++++------ clang/unittests/StaticAnalyzer/CMakeLists.txt | 18 +++-------- clang/unittests/Support/CMakeLists.txt | 11 ++----- clang/unittests/Tooling/CMakeLists.txt | 28 +++++++--------- clang/unittests/Tooling/Syntax/CMakeLists.txt | 15 +++------ clang/unittests/libclang/CMakeLists.txt | 5 +-- .../libclang/CrashTests/CMakeLists.txt | 5 +-- 28 files changed, 162 insertions(+), 289 deletions(-) diff --git a/clang/unittests/AST/ByteCode/CMakeLists.txt b/clang/unittests/AST/ByteCode/CMakeLists.txt index b862fb4834fbd..7ccadda2eeb26 100644 --- a/clang/unittests/AST/ByteCode/CMakeLists.txt +++ b/clang/unittests/AST/ByteCode/CMakeLists.txt @@ -2,19 +2,13 @@ add_clang_unittest(InterpTests BitcastBuffer.cpp Descriptor.cpp toAPValue.cpp - ) - -clang_target_link_libraries(InterpTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic clangFrontend clangSerialization clangTooling - ) - - target_link_libraries(InterpTests - PRIVATE + LINK_LIBS clangTesting -) + ) diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt index bfa6082a6ffa4..f27d34e8a0719 100644 --- a/clang/unittests/AST/CMakeLists.txt +++ b/clang/unittests/AST/CMakeLists.txt @@ -1,10 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - TargetParser - ) - - add_subdirectory(ByteCode) add_clang_unittest(ASTTests @@ -43,10 +36,7 @@ add_clang_unittest(ASTTests TemplateNameTest.cpp TypePrinterTest.cpp UnresolvedSetTest.cpp - ) - -clang_target_link_libraries(ASTTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -54,11 +44,12 @@ clang_target_link_libraries(ASTTests clangLex clangSerialization clangTooling - ) - -target_link_libraries(ASTTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingAnnotations LLVMTestingSupport -) + LLVM_COMPONENTS + FrontendOpenMP + Support + TargetParser + ) diff --git a/clang/unittests/ASTMatchers/CMakeLists.txt b/clang/unittests/ASTMatchers/CMakeLists.txt index 6a1e629d81b65..47bd5c108bb5a 100644 --- a/clang/unittests/ASTMatchers/CMakeLists.txt +++ b/clang/unittests/ASTMatchers/CMakeLists.txt @@ -1,31 +1,23 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - TargetParser - ) - add_clang_unittest(ASTMatchersTests ASTMatchersInternalTest.cpp ASTMatchersNodeTest.cpp ASTMatchersNarrowingTest.cpp ASTMatchersTraversalTest.cpp GtestMatchersTest.cpp - ) - 
-clang_target_link_libraries(ASTMatchersTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic clangFrontend clangSerialization clangTooling - ) - -target_link_libraries(ASTMatchersTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingSupport -) + LLVM_COMPONENTS + FrontendOpenMP + Support + TargetParser + ) add_subdirectory(Dynamic) diff --git a/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt b/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt index 6d0e12bcb0759..b6db7ce62afe7 100644 --- a/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt +++ b/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt @@ -1,16 +1,8 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(DynamicASTMatchersTests VariantValueTest.cpp ParserTest.cpp RegistryTest.cpp - ) - -clang_target_link_libraries(DynamicASTMatchersTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -18,9 +10,9 @@ clang_target_link_libraries(DynamicASTMatchersTests clangFrontend clangSerialization clangTooling - ) - -target_link_libraries(DynamicASTMatchersTests - PRIVATE + LINK_LIBS clangTesting + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Analysis/CMakeLists.txt b/clang/unittests/Analysis/CMakeLists.txt index cfea57f53f033..059a74843155c 100644 --- a/clang/unittests/Analysis/CMakeLists.txt +++ b/clang/unittests/Analysis/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(ClangAnalysisTests CFGDominatorTree.cpp CFGTest.cpp @@ -11,10 +6,7 @@ add_clang_unittest(ClangAnalysisTests IntervalPartitionTest.cpp MacroExpansionContextTest.cpp UnsafeBufferUsageTest.cpp - ) - -clang_target_link_libraries(ClangAnalysisTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangAnalysis @@ -23,12 +15,12 @@ clang_target_link_libraries(ClangAnalysisTests clangLex clangSerialization clangTooling - ) - -target_link_libraries(ClangAnalysisTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingSupport + LLVM_COMPONENTS + FrontendOpenMP + Support ) add_subdirectory(FlowSensitive) diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt index 6c01ae8fc2e54..4ac563143cd68 100644 --- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(ClangAnalysisFlowSensitiveTests ArenaTest.cpp ASTOpsTest.cpp @@ -30,10 +25,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests UncheckedOptionalAccessModelTest.cpp ValueTest.cpp WatchedLiteralsSolverTest.cpp - ) - -clang_target_link_libraries(ClangAnalysisFlowSensitiveTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangAnalysis @@ -44,11 +36,11 @@ clang_target_link_libraries(ClangAnalysisFlowSensitiveTests clangLex clangSerialization clangTooling - ) - -target_link_libraries(ClangAnalysisFlowSensitiveTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingAnnotations LLVMTestingSupport + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Basic/CMakeLists.txt b/clang/unittests/Basic/CMakeLists.txt index 3844ba49add8d..e818bd3e2c372 100644 --- a/clang/unittests/Basic/CMakeLists.txt +++ b/clang/unittests/Basic/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(BasicTests CharInfoTest.cpp DarwinSDKInfoTest.cpp @@ -12,15 +8,11 @@ add_clang_unittest(BasicTests SanitizersTest.cpp SarifTest.cpp SourceManagerTest.cpp 
- ) - -clang_target_link_libraries(BasicTests - PRIVATE + CLANG_LIBS clangBasic clangLex - ) - -target_link_libraries(BasicTests - PRIVATE + LINK_LIBS LLVMTestingSupport -) + LLVM_COMPONENTS + Support + ) diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index 85d265426ec80..580533a97d700 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -15,12 +15,36 @@ if(CLANG_BUILT_STANDALONE) endif() endif() -# add_clang_unittest(test_dirname file1.cpp file2.cpp) +# add_clang_unittest(test_name file1.cpp file2.cpp) # # Will compile the list of files together and link against the clang -# Produces a binary named 'basename(test_dirname)'. -function(add_clang_unittest test_dirname) - add_unittest(ClangUnitTests ${test_dirname} ${ARGN}) +# Produces a binary named 'basename(test_name)'. +function(add_clang_unittest test_name) + cmake_parse_arguments(ARG + "" + "" + "CLANG_LIBS;LINK_LIBS;LLVM_COMPONENTS" + ${ARGN}) + + if (NOT ${test_name} MATCHES "Tests$") + message(FATAL_ERROR "Unit test name must end with 'Tests' for lit to find it.") + endif() + + # LLVM_COMPONENTS is for LLVM_LINK_COMPONENTS deps, and must be before + # add_unittest. + list(APPEND LLVM_LINK_COMPONENTS ${ARG_LLVM_COMPONENTS}) + + add_unittest(ClangUnitTests ${test_name} ${ARG_UNPARSED_ARGUMENTS}) + + # Clang libs either come from the entire dylib, or individual libraries. + if (CLANG_LINK_CLANG_DYLIB) + list(APPEND ARG_LINK_LIBS clang-cpp) + else() + list(APPEND ARG_LINK_LIBS ${ARG_CLANG_LIBS}) + endif() + + # LINK_LIBS is for normal library dependencies. + target_link_libraries(${test_name} PRIVATE ${ARG_LINK_LIBS}) endfunction() add_subdirectory(Basic) diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt index a437f441568f2..f5bcecb0b08a3 100644 --- a/clang/unittests/CodeGen/CMakeLists.txt +++ b/clang/unittests/CodeGen/CMakeLists.txt @@ -1,18 +1,9 @@ -set(LLVM_LINK_COMPONENTS - Core - Support - TargetParser - ) - add_clang_unittest(ClangCodeGenTests BufferSourceTest.cpp CodeGenExternalTest.cpp TBAAMetadataTest.cpp CheckTargetFeaturesTest.cpp - ) - -clang_target_link_libraries(ClangCodeGenTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangCodeGen @@ -20,4 +11,8 @@ clang_target_link_libraries(ClangCodeGenTests clangLex clangParse clangSerialization + LLVM_COMPONENTS + Core + Support + TargetParser ) diff --git a/clang/unittests/CrossTU/CMakeLists.txt b/clang/unittests/CrossTU/CMakeLists.txt index 222b7e83dc38c..ee81c57ca1dce 100644 --- a/clang/unittests/CrossTU/CMakeLists.txt +++ b/clang/unittests/CrossTU/CMakeLists.txt @@ -1,18 +1,12 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Support - ) - add_clang_unittest(CrossTUTests CrossTranslationUnitTest.cpp - ) - -clang_target_link_libraries(CrossTUTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangCrossTU clangFrontend clangSerialization clangTooling + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/DirectoryWatcher/CMakeLists.txt b/clang/unittests/DirectoryWatcher/CMakeLists.txt index 38882c9ec2162..58e0aee2d1076 100644 --- a/clang/unittests/DirectoryWatcher/CMakeLists.txt +++ b/clang/unittests/DirectoryWatcher/CMakeLists.txt @@ -1,17 +1,12 @@ if(APPLE OR CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME STREQUAL Windows) - set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(DirectoryWatcherTests DirectoryWatcherTest.cpp - ) - - target_link_libraries(DirectoryWatcherTests - PRIVATE + LINK_LIBS LLVMTestingSupport clangDirectoryWatcher + 
LLVM_COMPONENTS + Support ) endif() diff --git a/clang/unittests/Driver/CMakeLists.txt b/clang/unittests/Driver/CMakeLists.txt index efdd07ea23889..fa0e87c3318df 100644 --- a/clang/unittests/Driver/CMakeLists.txt +++ b/clang/unittests/Driver/CMakeLists.txt @@ -1,11 +1,3 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - MC - Option - Support - TargetParser - ) - add_clang_unittest(ClangDriverTests DistroTest.cpp DXCModeTest.cpp @@ -15,12 +7,15 @@ add_clang_unittest(ClangDriverTests MultilibBuilderTest.cpp MultilibTest.cpp SanitizerArgsTest.cpp - ) - -clang_target_link_libraries(ClangDriverTests - PRIVATE + CLANG_LIBS clangDriver clangBasic clangFrontend # For TextDiagnosticPrinter. clangSerialization + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + MC + Option + Support + TargetParser ) diff --git a/clang/unittests/Format/CMakeLists.txt b/clang/unittests/Format/CMakeLists.txt index 71f5886d946c8..5bd6a17182d29 100644 --- a/clang/unittests/Format/CMakeLists.txt +++ b/clang/unittests/Format/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(FormatTests BracesInserterTest.cpp BracesRemoverTest.cpp @@ -36,12 +32,11 @@ add_clang_unittest(FormatTests SortIncludesTest.cpp UsingDeclarationsSorterTest.cpp TokenAnnotatorTest.cpp - ) - -clang_target_link_libraries(FormatTests - PRIVATE + CLANG_LIBS clangBasic clangFormat clangRewrite clangToolingCore + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Frontend/CMakeLists.txt b/clang/unittests/Frontend/CMakeLists.txt index 3c94846243870..bbf0396014fa9 100644 --- a/clang/unittests/Frontend/CMakeLists.txt +++ b/clang/unittests/Frontend/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - TargetParser - ) - add_clang_unittest(FrontendTests ASTUnitTest.cpp CompilerInvocationTest.cpp @@ -17,9 +12,7 @@ add_clang_unittest(FrontendTests OutputStreamTest.cpp TextDiagnosticTest.cpp UtilsTest.cpp - ) -clang_target_link_libraries(FrontendTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -29,4 +22,7 @@ clang_target_link_libraries(FrontendTests clangFrontendTool clangSerialization clangTooling + LLVM_COMPONENTS + Support + TargetParser ) diff --git a/clang/unittests/Index/CMakeLists.txt b/clang/unittests/Index/CMakeLists.txt index ea940e9d7a9ef..15e9ba0643eaf 100644 --- a/clang/unittests/Index/CMakeLists.txt +++ b/clang/unittests/Index/CMakeLists.txt @@ -1,14 +1,6 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Support - ) - add_clang_unittest(IndexTests IndexTests.cpp - ) - -clang_target_link_libraries(IndexTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -16,4 +8,7 @@ clang_target_link_libraries(IndexTests clangLex clangSerialization clangTooling + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Support ) diff --git a/clang/unittests/InstallAPI/CMakeLists.txt b/clang/unittests/InstallAPI/CMakeLists.txt index 4255001ff51f1..c174fa3f87161 100644 --- a/clang/unittests/InstallAPI/CMakeLists.txt +++ b/clang/unittests/InstallAPI/CMakeLists.txt @@ -1,11 +1,8 @@ add_clang_unittest(InstallAPITests HeaderFileTest.cpp FileListTest.cpp - ) - -clang_target_link_libraries(InstallAPITests - PRIVATE + CLANG_LIBS clangInstallAPI + LINK_LIBS + LLVMTestingSupport ) - -target_link_libraries(InstallAPITests PRIVATE LLVMTestingSupport) diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt index 95378f9cfe737..9df1a4b03da47 100644 --- a/clang/unittests/Interpreter/CMakeLists.txt +++ b/clang/unittests/Interpreter/CMakeLists.txt @@ 
-1,12 +1,3 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Core - MC - OrcJIT - Support - TargetParser - ) - add_clang_unittest(ClangReplInterpreterTests IncrementalCompilerBuilderTest.cpp IncrementalProcessingTest.cpp @@ -15,16 +6,24 @@ add_clang_unittest(ClangReplInterpreterTests CodeCompletionTest.cpp EXPORT_SYMBOLS - ) - -target_link_libraries(ClangReplInterpreterTests PUBLIC LLVMTestingSupport) -clang_target_link_libraries(ClangReplInterpreterTests PRIVATE + CLANG_LIBS clangAST clangBasic clangInterpreter clangFrontend clangSema + + LINK_LIBS + LLVMTestingSupport + + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Core + MC + OrcJIT + Support + TargetParser ) # Exceptions on Windows are not yet supported. diff --git a/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt b/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt index 24ae9cd78b5ca..eb366a860661c 100644 --- a/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt +++ b/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt @@ -3,24 +3,22 @@ set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Core - OrcJIT - Support - ) - add_clang_unittest(ClangReplInterpreterExceptionTests InterpreterExceptionTest.cpp - EXPORT_SYMBOLS - ) -llvm_update_compile_flags(ClangReplInterpreterExceptionTests) -target_link_libraries(ClangReplInterpreterExceptionTests PUBLIC + CLANG_LIBS clangAST clangBasic clangInterpreter clangFrontend + + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Core + OrcJIT + Support ) + +llvm_update_compile_flags(ClangReplInterpreterExceptionTests) add_dependencies(ClangReplInterpreterExceptionTests clang-resource-headers) diff --git a/clang/unittests/Lex/CMakeLists.txt b/clang/unittests/Lex/CMakeLists.txt index 5ec93946594b7..96ca6dda9cd85 100644 --- a/clang/unittests/Lex/CMakeLists.txt +++ b/clang/unittests/Lex/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(LexTests DependencyDirectivesScannerTest.cpp HeaderMapTest.cpp @@ -13,19 +9,15 @@ add_clang_unittest(LexTests PPConditionalDirectiveRecordTest.cpp PPDependencyDirectivesTest.cpp PPMemoryAllocationsTest.cpp - ) - -clang_target_link_libraries(LexTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangLex clangParse clangSema - ) - -target_link_libraries(LexTests - PRIVATE + LINK_LIBS LLVMTestingAnnotations LLVMTestingSupport + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Rewrite/CMakeLists.txt b/clang/unittests/Rewrite/CMakeLists.txt index 3c5e2f8e5354b..498613254e72b 100644 --- a/clang/unittests/Rewrite/CMakeLists.txt +++ b/clang/unittests/Rewrite/CMakeLists.txt @@ -1,14 +1,10 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(RewriteTests RewriterTest.cpp - ) -clang_target_link_libraries(RewriteTests - PRIVATE + CLANG_LIBS clangFrontend clangRewrite clangSerialization clangTooling + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Sema/CMakeLists.txt b/clang/unittests/Sema/CMakeLists.txt index 17d39408000a4..acc76c932afeb 100644 --- a/clang/unittests/Sema/CMakeLists.txt +++ b/clang/unittests/Sema/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(SemaTests ExternalSemaSourceTest.cpp CodeCompleteTest.cpp @@ -10,10 +5,7 @@ add_clang_unittest(SemaTests GslOwnerPointerInference.cpp SemaLookupTest.cpp SemaNoloadLookupTest.cpp - ) - -clang_target_link_libraries(SemaTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -22,11 +14,11 @@ 
clang_target_link_libraries(SemaTests clangSema clangSerialization clangTooling - ) - -target_link_libraries(SemaTests - PRIVATE + LINK_LIBS LLVMTestingAnnotations LLVMTestingSupport clangTesting + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Serialization/CMakeLists.txt b/clang/unittests/Serialization/CMakeLists.txt index e7005b5d511eb..6782e6b4d7330 100644 --- a/clang/unittests/Serialization/CMakeLists.txt +++ b/clang/unittests/Serialization/CMakeLists.txt @@ -1,10 +1,3 @@ -set(LLVM_LINK_COMPONENTS - BitReader - BitstreamReader - FrontendOpenMP - Support - ) - add_clang_unittest(SerializationTests ForceCheckFileInputTest.cpp InMemoryModuleCacheTest.cpp @@ -14,10 +7,7 @@ add_clang_unittest(SerializationTests LoadSpecLazilyTest.cpp SourceLocationEncodingTest.cpp VarDeclConstantInitTest.cpp - ) - -clang_target_link_libraries(SerializationTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -26,4 +16,9 @@ clang_target_link_libraries(SerializationTests clangSerialization clangTooling clangASTMatchers + LLVM_COMPONENTS + BitReader + BitstreamReader + FrontendOpenMP + Support ) diff --git a/clang/unittests/StaticAnalyzer/CMakeLists.txt b/clang/unittests/StaticAnalyzer/CMakeLists.txt index 3b01a4e9e5327..143b7eedbfe05 100644 --- a/clang/unittests/StaticAnalyzer/CMakeLists.txt +++ b/clang/unittests/StaticAnalyzer/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(StaticAnalysisTests AnalyzerOptionsTest.cpp APSIntTypeTest.cpp @@ -25,10 +20,7 @@ add_clang_unittest(StaticAnalysisTests SValTest.cpp TestReturnValueUnderConstruction.cpp Z3CrosscheckOracleTest.cpp - ) - -clang_target_link_libraries(StaticAnalysisTests - PRIVATE + CLANG_LIBS clangBasic clangAnalysis clangAST @@ -39,9 +31,9 @@ clang_target_link_libraries(StaticAnalysisTests clangStaticAnalyzerCore clangStaticAnalyzerFrontend clangTooling - ) - -target_link_libraries(StaticAnalysisTests - PRIVATE + LINK_LIBS clangTesting + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Support/CMakeLists.txt b/clang/unittests/Support/CMakeLists.txt index 22be5ed18cc7a..d0ce4f6d10617 100644 --- a/clang/unittests/Support/CMakeLists.txt +++ b/clang/unittests/Support/CMakeLists.txt @@ -1,15 +1,10 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(ClangSupportTests TimeProfilerTest.cpp - ) - -clang_target_link_libraries(ClangSupportTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend clangSerialization + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt index 401978c31863c..106c6b9dc38bd 100644 --- a/clang/unittests/Tooling/CMakeLists.txt +++ b/clang/unittests/Tooling/CMakeLists.txt @@ -1,13 +1,3 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - MC - Option - FrontendOpenMP - Support - TargetParser - ) - - add_clang_unittest(ToolingTests ASTSelectionTest.cpp CastExprTest.cpp @@ -69,10 +59,8 @@ add_clang_unittest(ToolingTests StencilTest.cpp ToolingTest.cpp TransformerTest.cpp - ) -clang_target_link_libraries(ToolingTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -89,13 +77,19 @@ clang_target_link_libraries(ToolingTests clangToolingInclusionsStdlib clangToolingRefactoring clangTransformer - ) -target_link_libraries(ToolingTests - PRIVATE + LINK_LIBS LLVMTestingAnnotations LLVMTestingSupport clangTesting -) + + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + MC + Option + FrontendOpenMP + Support + TargetParser + ) 
add_subdirectory(Syntax) diff --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt b/clang/unittests/Tooling/Syntax/CMakeLists.txt index ff3b6176f879f..db110fefa954f 100644 --- a/clang/unittests/Tooling/Syntax/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(SyntaxTests TreeTestBase.cpp BuildTreeTest.cpp @@ -9,10 +5,8 @@ add_clang_unittest(SyntaxTests SynthesisTest.cpp TreeTest.cpp TokensTest.cpp -) -clang_target_link_libraries(SyntaxTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -21,11 +15,12 @@ clang_target_link_libraries(SyntaxTests clangTooling clangToolingCore clangToolingSyntax - ) -target_link_libraries(SyntaxTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingAnnotations LLVMTestingSupport + + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/libclang/CMakeLists.txt b/clang/unittests/libclang/CMakeLists.txt index b3644a0e710e1..ba86c3c4d91e0 100644 --- a/clang/unittests/libclang/CMakeLists.txt +++ b/clang/unittests/libclang/CMakeLists.txt @@ -1,9 +1,6 @@ add_clang_unittest(libclangTests LibclangTest.cpp - ) - -target_link_libraries(libclangTests - PRIVATE + LINK_LIBS libclang ) diff --git a/clang/unittests/libclang/CrashTests/CMakeLists.txt b/clang/unittests/libclang/CrashTests/CMakeLists.txt index 82f0e4c16e901..de7b5a8f6ee91 100644 --- a/clang/unittests/libclang/CrashTests/CMakeLists.txt +++ b/clang/unittests/libclang/CrashTests/CMakeLists.txt @@ -1,8 +1,5 @@ add_clang_unittest(libclangCrashTests LibclangCrashTest.cpp - ) - -target_link_libraries(libclangCrashTests - PRIVATE + LINK_LIBS libclang ) From b6edd25f1787fe7deb4491462227719fb7bda5b1 Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Tue, 1 Apr 2025 14:30:10 -0700 Subject: [PATCH 0307/1029] [flang][intrinsics] NFC: make comment consistent (#133972) Just makes this named argument comment consistent with all the others in the file. 
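For reference, the file-wide convention is to spell such comments as
/*parameterName=*/value, with no space before the '=', so that tooling which
checks argument comments against the callee can match them. A minimal
self-contained sketch of the style (the emitCall helper is hypothetical, not
from this file):

  // Hypothetical callee; the call-site comment below names this parameter.
  static bool emitCall(bool isElemental) { return isElemental; }

  int main() {
    // Matches the convention this patch restores, e.g. /*isElemental=*/false.
    return emitCall(/*isElemental=*/false) ? 1 : 0;
  }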
--- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 0948396ac3fb8..8bbec6d6a7535 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -756,7 +756,7 @@ static constexpr IntrinsicHandler handlers[]{ {"perror", &I::genPerror, {{{"string", asBox}}}, - /*isElemental*/ false}, + /*isElemental=*/false}, {"popcnt", &I::genPopcnt}, {"poppar", &I::genPoppar}, {"present", From 86e66d2820ff50d56544d7350761adbfe27aa164 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 1 Apr 2025 14:36:50 -0700 Subject: [PATCH 0308/1029] [gn] port 7003f7d23aeca --- .../utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn index 6199b9aaa1a8f..224a1d8bb684f 100644 --- a/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn @@ -10,6 +10,10 @@ executable("clang-sycl-linker") { ":SYCLLinkOpts", "//clang/lib/Basic", "//llvm/lib/BinaryFormat", + "//llvm/lib/Bitcode/Writer", + "//llvm/lib/IR", + "//llvm/lib/IRReader", + "//llvm/lib/Linker", "//llvm/lib/Option", "//llvm/lib/Object", "//llvm/lib/Support", From e55164ae1098bbf8ceb87b83a4b282b08bb7bef9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Apr 2025 14:31:40 -0700 Subject: [PATCH 0309/1029] [RISCV] Use AsmToken::getEndLoc(). NFC --- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 1e07ada1f9701..8f9a5ae75fca7 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1659,7 +1659,7 @@ ParseStatus RISCVAsmParser::parseRegister(OperandVector &Operands, if (HadParens) Operands.push_back(RISCVOperand::createToken("(", FirstS)); SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); + SMLoc E = getTok().getEndLoc(); getLexer().Lex(); Operands.push_back(RISCVOperand::createReg(Reg, S, E)); } @@ -2272,7 +2272,7 @@ ParseStatus RISCVAsmParser::parseMaskReg(OperandVector &Operands) { if (Reg != RISCV::V0) return ParseStatus::NoMatch; SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); + SMLoc E = getTok().getEndLoc(); getLexer().Lex(); Operands.push_back(RISCVOperand::createReg(Reg, S, E)); return ParseStatus::Success; @@ -2295,7 +2295,7 @@ ParseStatus RISCVAsmParser::parseGPRAsFPR(OperandVector &Operands) { if (!Reg) return ParseStatus::NoMatch; SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); + SMLoc E = getTok().getEndLoc(); getLexer().Lex(); Operands.push_back(RISCVOperand::createReg( Reg, S, E, !getSTI().hasFeature(RISCV::FeatureStdExtF))); @@ -2328,7 +2328,7 @@ ParseStatus RISCVAsmParser::parseGPRPairAsFPR64(OperandVector &Operands) { } SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); + SMLoc E = getTok().getEndLoc(); getLexer().Lex(); const MCRegisterInfo *RI = getContext().getRegisterInfo(); @@ -2370,7 +2370,7 @@ ParseStatus RISCVAsmParser::parseGPRPair(OperandVector &Operands, return 
TokError("register must be even"); SMLoc S = getLoc(); - SMLoc E = SMLoc::getFromPointer(S.getPointer() + Name.size()); + SMLoc E = getTok().getEndLoc(); getLexer().Lex(); const MCRegisterInfo *RI = getContext().getRegisterInfo(); From 676755561d5a2f074411ad289fed55c977571a32 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Tue, 1 Apr 2025 14:58:30 -0700 Subject: [PATCH 0310/1029] Reland "[HLSL][RootSignature] Implement parsing of a DescriptorTable with empty clauses" (#133958) This pr relands https://github.com/llvm/llvm-project/pull/133302. It resolves two issues: - Linking error during build, [here](https://github.com/llvm/llvm-project/pull/133302#issuecomment-2767259848). There was a missing dependency for `clangLex` for the `ParseHLSLRootSignatureTest.cpp` unit testing. This library was added to the dependencies to resolve the error. It wasn't caught previously as the library was transitively linked in most build environments - Warning of unused declaration, [here](https://github.com/llvm/llvm-project/pull/133302#issuecomment-2767091368). There was a usability line in `LexHLSLRootSignature.h` of the form `using TokenKind = enum RootSignatureToken::Kind` which causes this error. The declaration is removed from the header file to be used locally in the `.cpp` files that use it. Notably, the original pr would also exposed `clang::hlsl::TokenKind` to everywhere it was included, which had a name clash with `tok::TokenKind`. This is another motivation to change to the proposed resolution. --------- Co-authored-by: Finn Plummer --- .../clang/Basic/DiagnosticParseKinds.td | 4 + .../clang/Lex/HLSLRootSignatureTokenKinds.def | 23 +- .../include/clang/Lex/LexHLSLRootSignature.h | 22 +- .../clang/Parse/ParseHLSLRootSignature.h | 107 ++++++++ clang/lib/Lex/LexHLSLRootSignature.cpp | 18 +- clang/lib/Parse/CMakeLists.txt | 1 + clang/lib/Parse/ParseHLSLRootSignature.cpp | 168 ++++++++++++ clang/unittests/CMakeLists.txt | 1 + .../Lex/LexHLSLRootSignatureTest.cpp | 60 ++--- clang/unittests/Parse/CMakeLists.txt | 20 ++ .../Parse/ParseHLSLRootSignatureTest.cpp | 245 ++++++++++++++++++ .../llvm/Frontend/HLSL/HLSLRootSignature.h | 44 ++++ 12 files changed, 660 insertions(+), 53 deletions(-) create mode 100644 clang/include/clang/Parse/ParseHLSLRootSignature.h create mode 100644 clang/lib/Parse/ParseHLSLRootSignature.cpp create mode 100644 clang/unittests/Parse/CMakeLists.txt create mode 100644 clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp create mode 100644 llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 86c361b4dbcf7..2582e1e5ef0f6 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1830,4 +1830,8 @@ def err_hlsl_virtual_function def err_hlsl_virtual_inheritance : Error<"virtual inheritance is unsupported in HLSL">; +// HLSL Root Siganture diagnostic messages +def err_hlsl_unexpected_end_of_params + : Error<"expected %0 to denote end of parameters, or, another valid parameter of %1">; + } // end of Parser diagnostics diff --git a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def index e6df763920430..c514d3456146a 100644 --- a/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def +++ b/clang/include/clang/Lex/HLSLRootSignatureTokenKinds.def @@ -14,16 +14,16 @@ //===----------------------------------------------------------------------===// 
#ifndef TOK -#define TOK(X) +#define TOK(X, SPELLING) #endif #ifndef PUNCTUATOR -#define PUNCTUATOR(X,Y) TOK(pu_ ## X) +#define PUNCTUATOR(X,Y) TOK(pu_ ## X, Y) #endif #ifndef KEYWORD -#define KEYWORD(X) TOK(kw_ ## X) +#define KEYWORD(X) TOK(kw_ ## X, #X) #endif #ifndef ENUM -#define ENUM(NAME, LIT) TOK(en_ ## NAME) +#define ENUM(NAME, LIT) TOK(en_ ## NAME, LIT) #endif // Defines the various types of enum @@ -49,15 +49,15 @@ #endif // General Tokens: -TOK(invalid) -TOK(end_of_stream) -TOK(int_literal) +TOK(invalid, "invalid identifier") +TOK(end_of_stream, "end of stream") +TOK(int_literal, "integer literal") // Register Tokens: -TOK(bReg) -TOK(tReg) -TOK(uReg) -TOK(sReg) +TOK(bReg, "b register") +TOK(tReg, "t register") +TOK(uReg, "u register") +TOK(sReg, "s register") // Punctuators: PUNCTUATOR(l_paren, '(') @@ -69,6 +69,7 @@ PUNCTUATOR(plus, '+') PUNCTUATOR(minus, '-') // RootElement Keywords: +KEYWORD(RootSignature) // used only for diagnostic messaging KEYWORD(DescriptorTable) // DescriptorTable Keywords: diff --git a/clang/include/clang/Lex/LexHLSLRootSignature.h b/clang/include/clang/Lex/LexHLSLRootSignature.h index 21c44e0351d9e..4dc80ff546aa0 100644 --- a/clang/include/clang/Lex/LexHLSLRootSignature.h +++ b/clang/include/clang/Lex/LexHLSLRootSignature.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_LEX_LEXHLSLROOTSIGNATURE_H #define LLVM_CLANG_LEX_LEXHLSLROOTSIGNATURE_H +#include "clang/Basic/Diagnostic.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/SmallVector.h" @@ -24,11 +25,11 @@ namespace hlsl { struct RootSignatureToken { enum Kind { -#define TOK(X) X, +#define TOK(X, SPELLING) X, #include "clang/Lex/HLSLRootSignatureTokenKinds.def" }; - Kind Kind = Kind::invalid; + Kind TokKind = Kind::invalid; // Retain the SouceLocation of the token for diagnostics clang::SourceLocation TokLoc; @@ -38,10 +39,21 @@ struct RootSignatureToken { // Constructors RootSignatureToken(clang::SourceLocation TokLoc) : TokLoc(TokLoc) {} - RootSignatureToken(enum Kind Kind, clang::SourceLocation TokLoc) - : Kind(Kind), TokLoc(TokLoc) {} + RootSignatureToken(Kind TokKind, clang::SourceLocation TokLoc) + : TokKind(TokKind), TokLoc(TokLoc) {} }; -using TokenKind = enum RootSignatureToken::Kind; + +inline const DiagnosticBuilder & +operator<<(const DiagnosticBuilder &DB, const RootSignatureToken::Kind Kind) { + switch (Kind) { +#define TOK(X, SPELLING) \ + case RootSignatureToken::Kind::X: \ + DB << SPELLING; \ + break; +#include "clang/Lex/HLSLRootSignatureTokenKinds.def" + } + return DB; +} class RootSignatureLexer { public: diff --git a/clang/include/clang/Parse/ParseHLSLRootSignature.h b/clang/include/clang/Parse/ParseHLSLRootSignature.h new file mode 100644 index 0000000000000..18cc2c6692551 --- /dev/null +++ b/clang/include/clang/Parse/ParseHLSLRootSignature.h @@ -0,0 +1,107 @@ +//===--- ParseHLSLRootSignature.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the RootSignatureParser interface. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H
+#define LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H
+
+#include "clang/Basic/DiagnosticParse.h"
+#include "clang/Lex/LexHLSLRootSignature.h"
+#include "clang/Lex/Preprocessor.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+
+#include "llvm/Frontend/HLSL/HLSLRootSignature.h"
+
+namespace clang {
+namespace hlsl {
+
+class RootSignatureParser {
+public:
+  RootSignatureParser(SmallVector &Elements,
+                      RootSignatureLexer &Lexer, clang::Preprocessor &PP);
+
+  /// Consumes tokens from the Lexer and constructs the in-memory
+  /// representations of the RootElements. Tokens are consumed until an
+  /// error is encountered or the end of the buffer.
+  ///
+  /// Returns true if a parsing error is encountered.
+  bool parse();
+
+private:
+  DiagnosticsEngine &getDiags() { return PP.getDiagnostics(); }
+
+  // All private Parse.* methods follow a similar pattern:
+  // - Each method will start with an assert to denote what the CurToken is
+  //   expected to be and will parse from that token forward
+  //
+  // - Therefore, it is the caller's responsibility to ensure that you are
+  //   at the correct CurToken. This should be done with the pattern of:
+  //
+  //  if (TryConsumeExpectedToken(RootSignatureToken::Kind))
+  //    if (Parse.*())
+  //      return true;
+  //
+  //  or,
+  //
+  //  if (ConsumeExpectedToken(RootSignatureToken::Kind, ...))
+  //    return true;
+  //  if (Parse.*())
+  //    return true;
+  //
+  // - All methods return true if a parsing error is encountered. It is the
+  //   caller's responsibility to propagate this error up, or deal with it
+  //   otherwise
+  //
+  // - An error will be raised if the following tokens are not what is
+  //   expected, or there is a lexing error
+
+  /// Root Element parse methods:
+  bool parseDescriptorTable();
+  bool parseDescriptorTableClause();
+
+  /// Invoke the Lexer to consume a token and update CurToken with the result
+  void consumeNextToken() { CurToken = Lexer.ConsumeToken(); }
+
+  /// Return true if the next token is one of the expected kinds
+  bool peekExpectedToken(RootSignatureToken::Kind Expected);
+  bool peekExpectedToken(ArrayRef AnyExpected);
+
+  /// Consumes the next token and reports an error if it is not of the expected
+  /// kind.
+  ///
+  /// Returns true if there was an error reported.
+  bool consumeExpectedToken(
+      RootSignatureToken::Kind Expected, unsigned DiagID = diag::err_expected,
+      RootSignatureToken::Kind Context = RootSignatureToken::Kind::invalid);
+
+  /// Peek if the next token is of the expected kind and if it is then consume
+  /// it.
+  ///
+  /// Returns true if it successfully matches the expected kind and the token
+  /// was consumed.
+ bool tryConsumeExpectedToken(RootSignatureToken::Kind Expected); + bool tryConsumeExpectedToken(ArrayRef Expected); + +private: + SmallVector &Elements; + RootSignatureLexer &Lexer; + + clang::Preprocessor &PP; + + RootSignatureToken CurToken; +}; + +} // namespace hlsl +} // namespace clang + +#endif // LLVM_CLANG_PARSE_PARSEHLSLROOTSIGNATURE_H diff --git a/clang/lib/Lex/LexHLSLRootSignature.cpp b/clang/lib/Lex/LexHLSLRootSignature.cpp index fb4aab20c7275..b065d9855ddac 100644 --- a/clang/lib/Lex/LexHLSLRootSignature.cpp +++ b/clang/lib/Lex/LexHLSLRootSignature.cpp @@ -11,6 +11,8 @@ namespace clang { namespace hlsl { +using TokenKind = RootSignatureToken::Kind; + // Lexer Definitions static bool IsNumberChar(char C) { @@ -34,7 +36,7 @@ RootSignatureToken RootSignatureLexer::LexToken() { switch (C) { #define PUNCTUATOR(X, Y) \ case Y: { \ - Result.Kind = TokenKind::pu_##X; \ + Result.TokKind = TokenKind::pu_##X; \ AdvanceBuffer(); \ return Result; \ } @@ -45,7 +47,7 @@ RootSignatureToken RootSignatureLexer::LexToken() { // Integer literal if (isdigit(C)) { - Result.Kind = TokenKind::int_literal; + Result.TokKind = TokenKind::int_literal; Result.NumSpelling = Buffer.take_while(IsNumberChar); AdvanceBuffer(Result.NumSpelling.size()); return Result; @@ -65,16 +67,16 @@ RootSignatureToken RootSignatureLexer::LexToken() { // Convert character to the register type. switch (C) { case 'b': - Result.Kind = TokenKind::bReg; + Result.TokKind = TokenKind::bReg; break; case 't': - Result.Kind = TokenKind::tReg; + Result.TokKind = TokenKind::tReg; break; case 'u': - Result.Kind = TokenKind::uReg; + Result.TokKind = TokenKind::uReg; break; case 's': - Result.Kind = TokenKind::sReg; + Result.TokKind = TokenKind::sReg; break; default: llvm_unreachable("Switch for an expected token was not provided"); @@ -100,14 +102,14 @@ RootSignatureToken RootSignatureLexer::LexToken() { #include "clang/Lex/HLSLRootSignatureTokenKinds.def" // Then attempt to retreive a string from it - Result.Kind = Switch.Default(TokenKind::invalid); + Result.TokKind = Switch.Default(TokenKind::invalid); AdvanceBuffer(TokSpelling.size()); return Result; } RootSignatureToken RootSignatureLexer::ConsumeToken() { // If we previously peeked then just return the previous value over - if (NextToken && NextToken->Kind != TokenKind::end_of_stream) { + if (NextToken && NextToken->TokKind != TokenKind::end_of_stream) { RootSignatureToken Result = *NextToken; NextToken = std::nullopt; return Result; diff --git a/clang/lib/Parse/CMakeLists.txt b/clang/lib/Parse/CMakeLists.txt index 22e902f7e1bc5..00fde537bb9c6 100644 --- a/clang/lib/Parse/CMakeLists.txt +++ b/clang/lib/Parse/CMakeLists.txt @@ -14,6 +14,7 @@ add_clang_library(clangParse ParseExpr.cpp ParseExprCXX.cpp ParseHLSL.cpp + ParseHLSLRootSignature.cpp ParseInit.cpp ParseObjc.cpp ParseOpenMP.cpp diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp new file mode 100644 index 0000000000000..93a9689ebdf72 --- /dev/null +++ b/clang/lib/Parse/ParseHLSLRootSignature.cpp @@ -0,0 +1,168 @@ +//=== ParseHLSLRootSignature.cpp - Parse Root Signature -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Parse/ParseHLSLRootSignature.h" + +#include "llvm/Support/raw_ostream.h" + +using namespace llvm::hlsl::rootsig; + +namespace clang { +namespace hlsl { + +using TokenKind = RootSignatureToken::Kind; + +RootSignatureParser::RootSignatureParser(SmallVector &Elements, + RootSignatureLexer &Lexer, + Preprocessor &PP) + : Elements(Elements), Lexer(Lexer), PP(PP), CurToken(SourceLocation()) {} + +bool RootSignatureParser::parse() { + // Iterate as many RootElements as possible + while (tryConsumeExpectedToken(TokenKind::kw_DescriptorTable)) { + // Dispatch onto parser method. + // We guard against the unreachable here as we just ensured that CurToken + // will be one of the kinds in the while condition + switch (CurToken.TokKind) { + case TokenKind::kw_DescriptorTable: + if (parseDescriptorTable()) + return true; + break; + default: + llvm_unreachable("Switch for consumed token was not provided"); + } + + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } + + if (!tryConsumeExpectedToken(TokenKind::end_of_stream)) { + getDiags().Report(CurToken.TokLoc, diag::err_hlsl_unexpected_end_of_params) + << /*expected=*/TokenKind::end_of_stream + << /*param of=*/TokenKind::kw_RootSignature; + return true; + } + return false; +} + +bool RootSignatureParser::parseDescriptorTable() { + assert(CurToken.TokKind == TokenKind::kw_DescriptorTable && + "Expects to only be invoked starting at given keyword"); + + DescriptorTable Table; + + if (consumeExpectedToken(TokenKind::pu_l_paren, diag::err_expected_after, + CurToken.TokKind)) + return true; + + // Iterate as many Clauses as possible + while (tryConsumeExpectedToken({TokenKind::kw_CBV, TokenKind::kw_SRV, + TokenKind::kw_UAV, TokenKind::kw_Sampler})) { + if (parseDescriptorTableClause()) + return true; + + Table.NumClauses++; + + if (!tryConsumeExpectedToken(TokenKind::pu_comma)) + break; + } + + if (!tryConsumeExpectedToken(TokenKind::pu_r_paren)) { + getDiags().Report(CurToken.TokLoc, diag::err_hlsl_unexpected_end_of_params) + << /*expected=*/TokenKind::pu_r_paren + << /*param of=*/TokenKind::kw_DescriptorTable; + return true; + } + + Elements.push_back(Table); + return false; +} + +bool RootSignatureParser::parseDescriptorTableClause() { + assert((CurToken.TokKind == TokenKind::kw_CBV || + CurToken.TokKind == TokenKind::kw_SRV || + CurToken.TokKind == TokenKind::kw_UAV || + CurToken.TokKind == TokenKind::kw_Sampler) && + "Expects to only be invoked starting at given keyword"); + + DescriptorTableClause Clause; + switch (CurToken.TokKind) { + default: + llvm_unreachable("Switch for consumed token was not provided"); + case TokenKind::kw_CBV: + Clause.Type = ClauseType::CBuffer; + break; + case TokenKind::kw_SRV: + Clause.Type = ClauseType::SRV; + break; + case TokenKind::kw_UAV: + Clause.Type = ClauseType::UAV; + break; + case TokenKind::kw_Sampler: + Clause.Type = ClauseType::Sampler; + break; + } + + if (consumeExpectedToken(TokenKind::pu_l_paren, diag::err_expected_after, + CurToken.TokKind)) + return true; + + if (consumeExpectedToken(TokenKind::pu_r_paren, diag::err_expected_after, + CurToken.TokKind)) + return true; + + Elements.push_back(Clause); + return false; +} + +bool RootSignatureParser::peekExpectedToken(TokenKind Expected) { + return peekExpectedToken(ArrayRef{Expected}); +} + +bool RootSignatureParser::peekExpectedToken(ArrayRef AnyExpected) { + RootSignatureToken 
Result = Lexer.PeekNextToken(); + return llvm::is_contained(AnyExpected, Result.TokKind); +} + +bool RootSignatureParser::consumeExpectedToken(TokenKind Expected, + unsigned DiagID, + TokenKind Context) { + if (tryConsumeExpectedToken(Expected)) + return false; + + // Report unexpected token kind error + DiagnosticBuilder DB = getDiags().Report(CurToken.TokLoc, DiagID); + switch (DiagID) { + case diag::err_expected: + DB << Expected; + break; + case diag::err_expected_either: + case diag::err_expected_after: + DB << Expected << Context; + break; + default: + break; + } + return true; +} + +bool RootSignatureParser::tryConsumeExpectedToken(TokenKind Expected) { + return tryConsumeExpectedToken(ArrayRef{Expected}); +} + +bool RootSignatureParser::tryConsumeExpectedToken( + ArrayRef AnyExpected) { + // If not the expected token just return + if (!peekExpectedToken(AnyExpected)) + return false; + consumeNextToken(); + return true; +} + +} // namespace hlsl +} // namespace clang diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index 580533a97d700..f3823ba309420 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -49,6 +49,7 @@ endfunction() add_subdirectory(Basic) add_subdirectory(Lex) +add_subdirectory(Parse) add_subdirectory(Driver) if(CLANG_ENABLE_STATIC_ANALYZER) add_subdirectory(Analysis) diff --git a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp index d72a842922f98..36bd201df1287 100644 --- a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp +++ b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp @@ -10,6 +10,7 @@ #include "gtest/gtest.h" using namespace clang; +using TokenKind = hlsl::RootSignatureToken::Kind; namespace { @@ -20,18 +21,18 @@ class LexHLSLRootSignatureTest : public ::testing::Test { void CheckTokens(hlsl::RootSignatureLexer &Lexer, SmallVector &Computed, - SmallVector &Expected) { + SmallVector &Expected) { for (unsigned I = 0, E = Expected.size(); I != E; ++I) { // Skip these to help with the macro generated test - if (Expected[I] == hlsl::TokenKind::invalid || - Expected[I] == hlsl::TokenKind::end_of_stream) + if (Expected[I] == TokenKind::invalid || + Expected[I] == TokenKind::end_of_stream) continue; hlsl::RootSignatureToken Result = Lexer.ConsumeToken(); - ASSERT_EQ(Result.Kind, Expected[I]); + ASSERT_EQ(Result.TokKind, Expected[I]); Computed.push_back(Result); } hlsl::RootSignatureToken EndOfStream = Lexer.ConsumeToken(); - ASSERT_EQ(EndOfStream.Kind, hlsl::TokenKind::end_of_stream); + ASSERT_EQ(EndOfStream.TokKind, TokenKind::end_of_stream); ASSERT_TRUE(Lexer.EndOfBuffer()); } }; @@ -49,11 +50,10 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexNumbersTest) { hlsl::RootSignatureLexer Lexer(Source, TokLoc); SmallVector Tokens; - SmallVector Expected = { - hlsl::TokenKind::pu_minus, hlsl::TokenKind::int_literal, - hlsl::TokenKind::int_literal, hlsl::TokenKind::pu_plus, - hlsl::TokenKind::int_literal, hlsl::TokenKind::pu_plus, - hlsl::TokenKind::int_literal, + SmallVector Expected = { + TokenKind::pu_minus, TokenKind::int_literal, TokenKind::int_literal, + TokenKind::pu_plus, TokenKind::int_literal, TokenKind::pu_plus, + TokenKind::int_literal, }; CheckTokens(Lexer, Tokens, Expected); @@ -85,6 +85,8 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { (),|=+- + RootSignature + DescriptorTable CBV SRV UAV Sampler @@ -112,8 +114,8 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { hlsl::RootSignatureLexer Lexer(Source, TokLoc); SmallVector Tokens; 
- SmallVector Expected = { -#define TOK(NAME) hlsl::TokenKind::NAME, + SmallVector Expected = { +#define TOK(NAME, SPELLING) TokenKind::NAME, #include "clang/Lex/HLSLRootSignatureTokenKinds.def" }; @@ -134,17 +136,17 @@ TEST_F(LexHLSLRootSignatureTest, ValidCaseInsensitiveKeywordsTest) { hlsl::RootSignatureLexer Lexer(Source, TokLoc); SmallVector Tokens; - SmallVector Expected = { - hlsl::TokenKind::kw_DescriptorTable, - hlsl::TokenKind::kw_CBV, - hlsl::TokenKind::kw_SRV, - hlsl::TokenKind::kw_UAV, - hlsl::TokenKind::kw_Sampler, - hlsl::TokenKind::kw_space, - hlsl::TokenKind::kw_visibility, - hlsl::TokenKind::kw_flags, - hlsl::TokenKind::kw_numDescriptors, - hlsl::TokenKind::kw_offset, + SmallVector Expected = { + TokenKind::kw_DescriptorTable, + TokenKind::kw_CBV, + TokenKind::kw_SRV, + TokenKind::kw_UAV, + TokenKind::kw_Sampler, + TokenKind::kw_space, + TokenKind::kw_visibility, + TokenKind::kw_flags, + TokenKind::kw_numDescriptors, + TokenKind::kw_offset, }; CheckTokens(Lexer, Tokens, Expected); @@ -160,26 +162,26 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexPeekTest) { // Test basic peek hlsl::RootSignatureToken Res = Lexer.PeekNextToken(); - ASSERT_EQ(Res.Kind, hlsl::TokenKind::pu_r_paren); + ASSERT_EQ(Res.TokKind, TokenKind::pu_r_paren); // Ensure it doesn't peek past one element Res = Lexer.PeekNextToken(); - ASSERT_EQ(Res.Kind, hlsl::TokenKind::pu_r_paren); + ASSERT_EQ(Res.TokKind, TokenKind::pu_r_paren); Res = Lexer.ConsumeToken(); - ASSERT_EQ(Res.Kind, hlsl::TokenKind::pu_r_paren); + ASSERT_EQ(Res.TokKind, TokenKind::pu_r_paren); // Invoke after reseting the NextToken Res = Lexer.PeekNextToken(); - ASSERT_EQ(Res.Kind, hlsl::TokenKind::int_literal); + ASSERT_EQ(Res.TokKind, TokenKind::int_literal); // Ensure we can still consume the second token Res = Lexer.ConsumeToken(); - ASSERT_EQ(Res.Kind, hlsl::TokenKind::int_literal); + ASSERT_EQ(Res.TokKind, TokenKind::int_literal); // Ensure end of stream token Res = Lexer.PeekNextToken(); - ASSERT_EQ(Res.Kind, hlsl::TokenKind::end_of_stream); + ASSERT_EQ(Res.TokKind, TokenKind::end_of_stream); } } // anonymous namespace diff --git a/clang/unittests/Parse/CMakeLists.txt b/clang/unittests/Parse/CMakeLists.txt new file mode 100644 index 0000000000000..2a31be625042e --- /dev/null +++ b/clang/unittests/Parse/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLVM_LINK_COMPONENTS + Support + ) +add_clang_unittest(ParseTests + ParseHLSLRootSignatureTest.cpp + ) +clang_target_link_libraries(ParseTests + PRIVATE + clangAST + clangBasic + clangLex + clangParse + clangSema + ) +target_link_libraries(ParseTests + PRIVATE + LLVMTestingAnnotations + LLVMTestingSupport + clangTesting + ) diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp new file mode 100644 index 0000000000000..acdf455a5d6aa --- /dev/null +++ b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp @@ -0,0 +1,245 @@ +//=== ParseHLSLRootSignatureTest.cpp - Parse Root Signature tests ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/HeaderSearchOptions.h" +#include "clang/Lex/Lexer.h" +#include "clang/Lex/ModuleLoader.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/PreprocessorOptions.h" + +#include "clang/Lex/LexHLSLRootSignature.h" +#include "clang/Parse/ParseHLSLRootSignature.h" +#include "gtest/gtest.h" + +using namespace clang; +using namespace llvm::hlsl::rootsig; + +namespace { + +// Diagnostic helper for helper tests +class ExpectedDiagConsumer : public DiagnosticConsumer { + virtual void anchor() {} + + void HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, + const Diagnostic &Info) override { + if (!FirstDiag || !ExpectedDiagID.has_value()) { + Satisfied = false; + return; + } + FirstDiag = false; + + Satisfied = ExpectedDiagID.value() == Info.getID(); + } + + bool FirstDiag = true; + bool Satisfied = false; + std::optional ExpectedDiagID; + +public: + void setNoDiag() { + Satisfied = true; + ExpectedDiagID = std::nullopt; + } + + void setExpected(unsigned DiagID) { + Satisfied = false; + ExpectedDiagID = DiagID; + } + + bool isSatisfied() { return Satisfied; } +}; + +// The test fixture. +class ParseHLSLRootSignatureTest : public ::testing::Test { +protected: + ParseHLSLRootSignatureTest() + : FileMgr(FileMgrOpts), DiagID(new DiagnosticIDs()), + Consumer(new ExpectedDiagConsumer()), + Diags(DiagID, new DiagnosticOptions, Consumer), + SourceMgr(Diags, FileMgr), TargetOpts(new TargetOptions) { + // This is an arbitrarily chosen target triple to create the target info. 
+ TargetOpts->Triple = "dxil"; + Target = TargetInfo::CreateTargetInfo(Diags, TargetOpts); + } + + std::unique_ptr createPP(StringRef Source, + TrivialModuleLoader &ModLoader) { + std::unique_ptr Buf = + llvm::MemoryBuffer::getMemBuffer(Source); + SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); + + HeaderSearchOptions SearchOpts; + HeaderSearch HeaderInfo(SearchOpts, SourceMgr, Diags, LangOpts, + Target.get()); + std::unique_ptr PP = std::make_unique( + std::make_shared(), Diags, LangOpts, SourceMgr, + HeaderInfo, ModLoader, + /*IILookup =*/nullptr, + /*OwnsHeaderSearch =*/false); + PP->Initialize(*Target); + PP->EnterMainSourceFile(); + return PP; + } + + FileSystemOptions FileMgrOpts; + FileManager FileMgr; + IntrusiveRefCntPtr DiagID; + ExpectedDiagConsumer *Consumer; + DiagnosticsEngine Diags; + SourceManager SourceMgr; + LangOptions LangOpts; + std::shared_ptr TargetOpts; + IntrusiveRefCntPtr Target; +}; + +// Valid Parser Tests + +TEST_F(ParseHLSLRootSignatureTest, ValidParseEmptyTest) { + const llvm::StringLiteral Source = R"cc()cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test no diagnostics produced + Consumer->setNoDiag(); + + ASSERT_FALSE(Parser.parse()); + ASSERT_EQ((int)Elements.size(), 0); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, ValidParseDTClausesTest) { + const llvm::StringLiteral Source = R"cc( + DescriptorTable( + CBV(), + SRV(), + Sampler(), + UAV() + ), + DescriptorTable() + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test no diagnostics produced + Consumer->setNoDiag(); + + ASSERT_FALSE(Parser.parse()); + + // First Descriptor Table with 4 elements + RootElement Elem = Elements[0]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::CBuffer); + + Elem = Elements[1]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::SRV); + + Elem = Elements[2]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::Sampler); + + Elem = Elements[3]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).Type, ClauseType::UAV); + + Elem = Elements[4]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).NumClauses, (uint32_t)4); + + // Empty Descriptor Table + Elem = Elements[5]; + ASSERT_TRUE(std::holds_alternative(Elem)); + ASSERT_EQ(std::get(Elem).NumClauses, 0u); + ASSERT_TRUE(Consumer->isSatisfied()); +} + +// Invalid Parser Tests + +TEST_F(ParseHLSLRootSignatureTest, InvalidParseUnexpectedTokenTest) { + const llvm::StringLiteral Source = R"cc( + DescriptorTable() + space + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test correct diagnostic produced + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, 
InvalidParseInvalidTokenTest) { + const llvm::StringLiteral Source = R"cc( + notAnIdentifier + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test correct diagnostic produced - invalid token + Consumer->setExpected(diag::err_hlsl_unexpected_end_of_params); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +TEST_F(ParseHLSLRootSignatureTest, InvalidParseUnexpectedEndOfStreamTest) { + const llvm::StringLiteral Source = R"cc( + DescriptorTable + )cc"; + + TrivialModuleLoader ModLoader; + auto PP = createPP(Source, ModLoader); + auto TokLoc = SourceLocation(); + + hlsl::RootSignatureLexer Lexer(Source, TokLoc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, *PP); + + // Test correct diagnostic produced - end of stream + Consumer->setExpected(diag::err_expected_after); + ASSERT_TRUE(Parser.parse()); + + ASSERT_TRUE(Consumer->isSatisfied()); +} + +} // anonymous namespace diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h new file mode 100644 index 0000000000000..c1b67844c747f --- /dev/null +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h @@ -0,0 +1,44 @@ +//===- HLSLRootSignature.h - HLSL Root Signature helper objects -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file contains helper objects for working with HLSL Root +/// Signatures. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H +#define LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H + +#include "llvm/Support/DXILABI.h" +#include + +namespace llvm { +namespace hlsl { +namespace rootsig { + +// Definitions of the in-memory data layout structures + +// Models the end of a descriptor table and stores its visibility +struct DescriptorTable { + uint32_t NumClauses = 0; // The number of clauses in the table +}; + +// Models DTClause : CBV | SRV | UAV | Sampler, by collecting like parameters +using ClauseType = llvm::dxil::ResourceClass; +struct DescriptorTableClause { + ClauseType Type; +}; + +// Models RootElement : DescriptorTable | DescriptorTableClause +using RootElement = std::variant; + +} // namespace rootsig +} // namespace hlsl +} // namespace llvm + +#endif // LLVM_FRONTEND_HLSL_HLSLROOTSIGNATURE_H From e0c8fc793c2a6142ce86575290fe7933812b6f36 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 1 Apr 2025 15:25:01 -0700 Subject: [PATCH 0311/1029] Reapply "[sanitizer] intercept getservent_r, getservbyname_r, getservbyport_r" (#133358) (#133528) This reverts commit 52d7f14a895eb8669d72cd02754e5586de3e61d8. 
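For illustration, a minimal glibc usage sketch (not part of this patch) of one
of the newly intercepted functions; with the interceptor in place, MSan treats
*result, result_buf, and buf as initialized after a successful call, so the
read of result->s_port below is not reported as a use of uninitialized memory:

  #include <arpa/inet.h>
  #include <netdb.h>
  #include <cstdio>

  int main() {
    struct servent result_buf;
    struct servent *result;
    char buf[1024];
    // On success glibc fills buf/result_buf and points result at result_buf;
    // the interceptor unpoisons all three.
    if (getservbyname_r("ssh", nullptr, &result_buf, buf, sizeof(buf),
                        &result) == 0 &&
        result != nullptr)
      std::printf("ssh -> port %d\n", ntohs(result->s_port));
  }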
--- compiler-rt/lib/msan/tests/msan_test.cpp | 95 +++++++++++++++++++ .../sanitizer_common_interceptors.inc | 68 +++++++++++++ .../sanitizer_platform_interceptors.h | 4 + .../sanitizer_platform_limits_posix.h | 9 ++ .../TestCases/Linux/getservent_r.cpp | 44 +++++++++ .../test/sanitizer_common/lit.common.cfg.py | 3 + 6 files changed, 223 insertions(+) create mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/getservent_r.cpp diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp index a126dd4fdd55e..d1c481483dfad 100644 --- a/compiler-rt/lib/msan/tests/msan_test.cpp +++ b/compiler-rt/lib/msan/tests/msan_test.cpp @@ -4908,5 +4908,100 @@ TEST(MemorySanitizer, timer_create) { EXPECT_POISONED(timer2); timer_delete(timer); } + +TEST(MemorySanitizer, getservent_r) { + if (access("/etc/services", O_RDONLY) != 0) + GTEST_SKIP() << "Missing /etc/services"; + struct servent result_buf; + struct servent *result; + char buf[1024]; + EXPECT_POISONED(result_buf); + EXPECT_POISONED(result); + EXPECT_POISONED(buf); + ASSERT_EQ(getservent_r(&result_buf, buf, sizeof(buf), &result), 0); + EXPECT_NOT_POISONED(result); + ASSERT_NE(result, nullptr); + EXPECT_NOT_POISONED(result_buf); + EXPECT_NOT_POISONED(buf); +} + +TEST(MemorySanitizer, getservbyname_r) { + if (access("/etc/services", O_RDONLY) != 0) + GTEST_SKIP() << "Missing /etc/services"; + struct servent result_buf; + struct servent *result; + char buf[1024]; + EXPECT_POISONED(result_buf); + EXPECT_POISONED(result); + EXPECT_POISONED(buf); + ASSERT_EQ( + getservbyname_r("ssh", nullptr, &result_buf, buf, sizeof(buf), &result), + 0); + EXPECT_NOT_POISONED(result); + // If this fails, check /etc/services if "ssh" exists. I picked this because + // it should exist everywhere, if it doesn't, I am sorry. Disable the test + // then please. + ASSERT_NE(result, nullptr); + EXPECT_NOT_POISONED(result_buf); + EXPECT_NOT_POISONED(buf); +} + +TEST(MemorySanitizer, getservbyname_r_unknown) { + if (access("/etc/services", O_RDONLY) != 0) + GTEST_SKIP() << "Missing /etc/services"; + struct servent result_buf; + struct servent *result; + char buf[1024]; + EXPECT_POISONED(result_buf); + EXPECT_POISONED(result); + EXPECT_POISONED(buf); + ASSERT_EQ(getservbyname_r("invalidhadfuiasdhi", nullptr, &result_buf, buf, + sizeof(buf), &result), + 0); + EXPECT_NOT_POISONED(result); + ASSERT_EQ(result, nullptr); + EXPECT_POISONED(result_buf); + EXPECT_POISONED(buf); +} + +TEST(MemorySanitizer, getservbyport_r) { + if (access("/etc/services", O_RDONLY) != 0) + GTEST_SKIP() << "Missing /etc/services"; + struct servent result_buf; + struct servent *result; + char buf[1024]; + EXPECT_POISONED(result_buf); + EXPECT_POISONED(result); + EXPECT_POISONED(buf); + ASSERT_EQ(getservbyport_r(htons(22), nullptr, &result_buf, buf, sizeof(buf), + &result), + 0); + EXPECT_NOT_POISONED(result); + // If this fails, check /etc/services if "ssh" exists. I picked this because + // it should exist everywhere, if it doesn't, I am sorry. Disable the test + // then please. 
+ ASSERT_NE(result, nullptr); + EXPECT_NOT_POISONED(result_buf); + EXPECT_NOT_POISONED(buf); +} + +TEST(MemorySanitizer, getservbyport_r_smallbuf) { + if (access("/etc/services", O_RDONLY) != 0) + GTEST_SKIP() << "Missing /etc/services"; + struct servent result_buf; + struct servent *result; + char buf[1]; + EXPECT_POISONED(result_buf); + EXPECT_POISONED(result); + EXPECT_POISONED(buf); + ASSERT_EQ(getservbyport_r(htons(22), nullptr, &result_buf, buf, sizeof(buf), + &result), + ERANGE); + EXPECT_NOT_POISONED(result); + ASSERT_EQ(result, nullptr); + EXPECT_POISONED(result_buf); + EXPECT_POISONED(buf); +} + #endif } // namespace diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 761dbd3f5a679..5a15d75f0c86a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -10279,6 +10279,71 @@ INTERCEPTOR(SSIZE_T, freadlink, int fd, char *buf, SIZE_T bufsiz) { # define INIT_FREADLINK #endif +#if SANITIZER_INTERCEPT_GETSERVENT_R || SANITIZER_INTERCEPT_GETSERVBYNAME_R || \ + SANITIZER_INTERCEPT_GETSERVBYPORT_R + +UNUSED static void HandleGetServentReentrantResult( + void *ctx, int res, struct __sanitizer_servent *result_buf, char *buf, + SIZE_T buflen, struct __sanitizer_servent **result) { + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (char *)result, sizeof(void *)); + if (res) + return; + if (*result) { + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (char *)*result, + sizeof(__sanitizer_servent)); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, buf, buflen); + } +} + +#endif + +#if SANITIZER_INTERCEPT_GETSERVENT_R +INTERCEPTOR(int, getservent_r, struct __sanitizer_servent *result_buf, + char *buf, SIZE_T buflen, struct __sanitizer_servent **result) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, getservent_r, result_buf, buf, buflen, result); + int res = REAL(getservent_r)(result_buf, buf, buflen, result); + HandleGetServentReentrantResult(ctx, res, result_buf, buf, buflen, result); + return res; +} +# define INIT_GETSERVENT_R COMMON_INTERCEPT_FUNCTION(getservent_r) +#else +# define INIT_GETSERVENT_R +#endif + +#if SANITIZER_INTERCEPT_GETSERVBYNAME_R +INTERCEPTOR(int, getservbyname_r, const char *name, const char *proto, + struct __sanitizer_servent *result_buf, char *buf, SIZE_T buflen, + struct __sanitizer_servent **result) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, getservbyname_r, name, proto, result_buf, buf, + buflen, result); + COMMON_INTERCEPTOR_READ_STRING(ctx, name, internal_strlen(name)); + int res = REAL(getservbyname_r)(name, proto, result_buf, buf, buflen, result); + HandleGetServentReentrantResult(ctx, res, result_buf, buf, buflen, result); + return res; +} +# define INIT_GETSERVBYNAME_R COMMON_INTERCEPT_FUNCTION(getservbyname_r) +#else +# define INIT_GETSERVBYNAME_R +#endif + +#if SANITIZER_INTERCEPT_GETSERVBYPORT_R +INTERCEPTOR(int, getservbyport_r, int port, const char *proto, + struct __sanitizer_servent *result_buf, char *buf, SIZE_T buflen, + struct __sanitizer_servent **result) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, getservbyport_r, port, proto, result_buf, buf, + buflen, result); + int res = REAL(getservbyport_r)(port, proto, result_buf, buf, buflen, result); + HandleGetServentReentrantResult(ctx, res, result_buf, buf, buflen, result); + return res; +} +# define INIT_GETSERVBYPORT_R COMMON_INTERCEPT_FUNCTION(getservbyport_r) +#else +# define INIT_GETSERVBYPORT_R +#endif + #include 
"sanitizer_common_interceptors_netbsd_compat.inc" namespace __sanitizer { @@ -10604,4 +10669,7 @@ static void InitializeCommonInterceptors() { INIT_FREADLINK; INIT___PRINTF_CHK; + INIT_GETSERVENT_R; + INIT_GETSERVBYNAME_R; + INIT_GETSERVBYPORT_R; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 468b5494d0092..b8f2f738e7478 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -645,6 +645,10 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, # define SI_MAC_OS_DEPLOYMENT_MIN_13_00 0 #endif #define SANITIZER_INTERCEPT_FREADLINK (SI_MAC && SI_MAC_OS_DEPLOYMENT_MIN_13_00) +#define SANITIZER_INTERCEPT_GETSERVENT_R SI_GLIBC +#define SANITIZER_INTERCEPT_GETSERVBYNAME_R SI_GLIBC +#define SANITIZER_INTERCEPT_GETSERVBYPORT_R SI_GLIBC + // This macro gives a way for downstream users to override the above // interceptor macros irrespective of the platform they are on. They have // to do two things: diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index 67f00ff6f9e72..1f7e3d21b6a6f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -1509,6 +1509,15 @@ extern unsigned IOCTL_KIOCSOUND; extern unsigned IOCTL_PIO_SCRNMAP; #endif +# if SANITIZER_GLIBC +struct __sanitizer_servent { + char *s_name; + char **s_aliases; + int s_port; + char *s_proto; +}; +# endif + extern const int si_SEGV_MAPERR; extern const int si_SEGV_ACCERR; } // namespace __sanitizer diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/getservent_r.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/getservent_r.cpp new file mode 100644 index 0000000000000..b356c0ed807f6 --- /dev/null +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/getservent_r.cpp @@ -0,0 +1,44 @@ +// RUN: %clangxx -O0 %s -o %t && %run %t + +// REQUIRES: glibc, netbase + +#include +#include +#include +#include +#include +#include +#include +#include + +void CheckResult(const char *file, int line, int ret) { + if (ret != 0) { + fprintf(stderr, "ERROR: %s:%d - %s\n", file, line, strerror(ret)); + } + assert(ret == 0); +} + +#define CHECK_RESULT(ret) CheckResult(__FILE__, __LINE__, ret) + +int main(void) { + assert(access("/etc/services", O_RDONLY) == 0); + struct servent result_buf; + struct servent *result; + char buf[1024]; + // If these fail, check /etc/services if "ssh" exists. I picked this because + // it should exist everywhere, if it doesn't, I am sorry. Disable the test + // then please. 
+ CHECK_RESULT( + getservbyname_r("ssh", nullptr, &result_buf, buf, sizeof(buf), &result)); + assert(result != nullptr); + CHECK_RESULT(getservbyport_r(htons(22), nullptr, &result_buf, buf, + sizeof(buf), &result)); + assert(result != nullptr); + + CHECK_RESULT(getservent_r(&result_buf, buf, sizeof(buf), &result)); + assert(result != nullptr); + + CHECK_RESULT(getservbyname_r("invalidhadfuiasdhi", nullptr, &result_buf, buf, + sizeof(buf), &result)); + assert(result == nullptr); +} diff --git a/compiler-rt/test/sanitizer_common/lit.common.cfg.py b/compiler-rt/test/sanitizer_common/lit.common.cfg.py index 5406e8838f2fc..c3c1336bacd53 100644 --- a/compiler-rt/test/sanitizer_common/lit.common.cfg.py +++ b/compiler-rt/test/sanitizer_common/lit.common.cfg.py @@ -100,3 +100,6 @@ def build_invocation(compile_flags): if config.host_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) + +if os.path.exists("/etc/services"): + config.available_features.add("netbase") From 69f59d59cb02c06f1fac93ea5b19c2df9a684109 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Tue, 1 Apr 2025 15:30:13 -0700 Subject: [PATCH 0312/1029] [mlir][IR] Delete `match` and `rewrite` functions (#130259) The `match` and `rewrite` functions have been deprecated in #130031. This commit deletes them entirely. Note for LLVM integration: Update your patterns to use `matchAndRewrite` instead of separate `match` / `rewrite`. --- .../mlir/Conversion/LLVMCommon/Pattern.h | 11 --- mlir/include/mlir/IR/PatternMatch.h | 50 ------------- .../mlir/Transforms/DialectConversion.h | 74 ------------------- 3 files changed, 135 deletions(-) diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h index e78f174ff8586..c65f7d7217be5 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -40,11 +40,6 @@ LogicalResult oneToOneRewrite( /// during the entire pattern lifetime. class ConvertToLLVMPattern : public ConversionPattern { public: - /// `SplitMatchAndRewrite` is deprecated. Use `matchAndRewrite` instead of - /// separate `match` and `rewrite`. - using SplitMatchAndRewrite = - detail::ConversionSplitMatchAndRewriteImpl; - ConvertToLLVMPattern(StringRef rootOpName, MLIRContext *context, const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1); @@ -147,16 +142,10 @@ class ConvertToLLVMPattern : public ConversionPattern { template class ConvertOpToLLVMPattern : public ConvertToLLVMPattern { public: - using OperationT = SourceOp; using OpAdaptor = typename SourceOp::Adaptor; using OneToNOpAdaptor = typename SourceOp::template GenericAdaptor>; - /// `SplitMatchAndRewrite` is deprecated. Use `matchAndRewrite` instead of - /// separate `match` and `rewrite`. 
- using SplitMatchAndRewrite = detail::ConversionSplitMatchAndRewriteImpl< - ConvertOpToLLVMPattern>; - explicit ConvertOpToLLVMPattern(const LLVMTypeConverter &typeConverter, PatternBenefit benefit = 1) : ConvertToLLVMPattern(SourceOp::getOperationName(), diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index d1f00c34f87b4..fc6ae8fb55fec 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -234,48 +234,9 @@ class Pattern { // RewritePattern //===----------------------------------------------------------------------===// -namespace detail { -/// Helper class that derives from a RewritePattern class and provides separate -/// `match` and `rewrite` entry points instead of a combined `matchAndRewrite`. -/// -/// This class is deprecated. Use `matchAndRewrite` instead of separate `match` -/// and `rewrite`. -template -class SplitMatchAndRewriteImpl : public PatternT { - using PatternT::PatternT; - - /// Attempt to match against IR rooted at the specified operation, which is - /// the same operation kind as getRootKind(). - /// - /// Note: This function must not modify the IR. - virtual LogicalResult match(typename PatternT::OperationT op) const = 0; - - /// Rewrite the IR rooted at the specified operation with the result of - /// this pattern, generating any new operations with the specified - /// rewriter. - virtual void rewrite(typename PatternT::OperationT op, - PatternRewriter &rewriter) const = 0; - - LogicalResult matchAndRewrite(typename PatternT::OperationT op, - PatternRewriter &rewriter) const final { - if (succeeded(match(op))) { - rewrite(op, rewriter); - return success(); - } - return failure(); - } -}; -} // namespace detail - /// RewritePattern is the common base class for all DAG to DAG replacements. class RewritePattern : public Pattern { public: - using OperationT = Operation *; - - /// `SplitMatchAndRewrite` is deprecated. Use `matchAndRewrite` instead of - /// separate `match` and `rewrite`. - using SplitMatchAndRewrite = detail::SplitMatchAndRewriteImpl; - virtual ~RewritePattern() = default; /// Attempt to match against code rooted at the specified operation, @@ -334,7 +295,6 @@ namespace detail { /// class or Interface. template struct OpOrInterfaceRewritePatternBase : public RewritePattern { - using OperationT = SourceOp; using RewritePattern::RewritePattern; /// Wrapper around the RewritePattern method that passes the derived op type. @@ -357,11 +317,6 @@ template struct OpRewritePattern : public detail::OpOrInterfaceRewritePatternBase { - /// `SplitMatchAndRewrite` is deprecated. Use `matchAndRewrite` instead of - /// separate `match` and `rewrite`. - using SplitMatchAndRewrite = - detail::SplitMatchAndRewriteImpl>; - /// Patterns must specify the root operation name they match against, and can /// also specify the benefit of the pattern matching and a list of generated /// ops. @@ -378,11 +333,6 @@ template struct OpInterfaceRewritePattern : public detail::OpOrInterfaceRewritePatternBase { - /// `SplitMatchAndRewrite` is deprecated. Use `matchAndRewrite` instead of - /// separate `match` and `rewrite`. 
- using SplitMatchAndRewrite = - detail::SplitMatchAndRewriteImpl>; - OpInterfaceRewritePattern(MLIRContext *context, PatternBenefit benefit = 1) : detail::OpOrInterfaceRewritePatternBase( Pattern::MatchInterfaceOpTypeTag(), SourceOp::getInterfaceID(), diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 6a9316cbc690f..ecfa5248b7559 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -531,82 +531,14 @@ class TypeConverter { // Conversion Patterns //===----------------------------------------------------------------------===// -namespace detail { -/// Helper class that derives from a ConversionRewritePattern class and -/// provides separate `match` and `rewrite` entry points instead of a combined -/// `matchAndRewrite`. -template -class ConversionSplitMatchAndRewriteImpl : public PatternT { - using PatternT::PatternT; - - /// Attempt to match against IR rooted at the specified operation, which is - /// the same operation kind as getRootKind(). - /// - /// Note: This function must not modify the IR. - virtual LogicalResult match(typename PatternT::OperationT op) const = 0; - - /// Rewrite the IR rooted at the specified operation with the result of - /// this pattern, generating any new operations with the specified - /// rewriter. - virtual void rewrite(typename PatternT::OperationT op, - typename PatternT::OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - // One of the two `rewrite` functions must be implemented. - llvm_unreachable("rewrite is not implemented"); - } - - virtual void rewrite(typename PatternT::OperationT op, - typename PatternT::OneToNOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const { - if constexpr (std::is_same>::value) { - rewrite(op, PatternT::getOneToOneAdaptorOperands(adaptor), rewriter); - } else { - SmallVector oneToOneOperands = - PatternT::getOneToOneAdaptorOperands(adaptor.getOperands()); - rewrite(op, typename PatternT::OpAdaptor(oneToOneOperands, adaptor), - rewriter); - } - } - - LogicalResult - matchAndRewrite(typename PatternT::OperationT op, - typename PatternT::OneToNOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - if (succeeded(match(op))) { - rewrite(op, adaptor, rewriter); - return success(); - } - return failure(); - } - - LogicalResult - matchAndRewrite(typename PatternT::OperationT op, - typename PatternT::OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const final { - // Users would normally override this function in conversion patterns to - // implement a 1:1 pattern. Patterns that are derived from this class have - // separate `match` and `rewrite` functions, so this `matchAndRewrite` - // overload is obsolete. - llvm_unreachable("this function is unreachable"); - } -}; -} // namespace detail - /// Base class for the conversion patterns. This pattern class enables type /// conversions, and other uses specific to the conversion framework. As such, /// patterns of this type can only be used with the 'apply*' methods below. class ConversionPattern : public RewritePattern { public: - using OperationT = Operation *; using OpAdaptor = ArrayRef; using OneToNOpAdaptor = ArrayRef; - /// `SplitMatchAndRewrite` is deprecated. Use `matchAndRewrite` instead of - /// separate `match` and `rewrite`. - using SplitMatchAndRewrite = - detail::ConversionSplitMatchAndRewriteImpl; - /// Hook for derived classes to implement combined matching and rewriting. 
/// This overload supports only 1:1 replacements. The 1:N overload is called /// by the driver. By default, it calls this 1:1 overload or reports a fatal @@ -671,16 +603,10 @@ class ConversionPattern : public RewritePattern { template class OpConversionPattern : public ConversionPattern { public: - using OperationT = SourceOp; using OpAdaptor = typename SourceOp::Adaptor; using OneToNOpAdaptor = typename SourceOp::template GenericAdaptor>; - /// `SplitMatchAndRewrite` is deprecated. Use `matchAndRewrite` instead of - /// separate `match` and `rewrite`. - using SplitMatchAndRewrite = - detail::ConversionSplitMatchAndRewriteImpl>; - OpConversionPattern(MLIRContext *context, PatternBenefit benefit = 1) : ConversionPattern(SourceOp::getOperationName(), benefit, context) {} OpConversionPattern(const TypeConverter &typeConverter, MLIRContext *context, From 7b2b3faeb1d82148872720e254cc34b3e6d48b31 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 1 Apr 2025 15:31:48 -0700 Subject: [PATCH 0313/1029] [gn build] Port 676755561d5a (ParseTests) --- llvm/utils/gn/secondary/clang/lib/Parse/BUILD.gn | 1 + llvm/utils/gn/secondary/clang/unittests/BUILD.gn | 1 + .../gn/secondary/clang/unittests/Parse/BUILD.gn | 15 +++++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 llvm/utils/gn/secondary/clang/unittests/Parse/BUILD.gn diff --git a/llvm/utils/gn/secondary/clang/lib/Parse/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Parse/BUILD.gn index 12d2cadd6c1b7..370a7ce92f090 100644 --- a/llvm/utils/gn/secondary/clang/lib/Parse/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Parse/BUILD.gn @@ -22,6 +22,7 @@ static_library("Parse") { "ParseExpr.cpp", "ParseExprCXX.cpp", "ParseHLSL.cpp", + "ParseHLSLRootSignature.cpp", "ParseInit.cpp", "ParseObjc.cpp", "ParseOpenACC.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn index 4aa844ac5a3c2..5ba8f2171eab0 100644 --- a/llvm/utils/gn/secondary/clang/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/BUILD.gn @@ -16,6 +16,7 @@ group("unittests") { "InstallAPI:InstallAPITests", "Interpreter:ClangReplInterpreterTests", "Lex:LexTests", + "Parse:ParseTests", "Rewrite:RewriteTests", "Sema:SemaTests", "Serialization:SerializationTests", diff --git a/llvm/utils/gn/secondary/clang/unittests/Parse/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Parse/BUILD.gn new file mode 100644 index 0000000000000..a67f5e7fa71ef --- /dev/null +++ b/llvm/utils/gn/secondary/clang/unittests/Parse/BUILD.gn @@ -0,0 +1,15 @@ +import("//third-party/unittest/unittest.gni") + +unittest("ParseTests") { + configs += [ "//llvm/utils/gn/build:clang_code" ] + deps = [ + "//clang/lib/AST", + "//clang/lib/Basic", + "//clang/lib/Lex", + "//clang/lib/Parse", + "//clang/lib/Sema", + "//llvm/lib/Support", + "//llvm/lib/Testing/Support", + ] + sources = [ "ParseHLSLRootSignatureTest.cpp" ] +} From c8764f0c655b2edb139896ecbb9f5bfd932fbe4b Mon Sep 17 00:00:00 2001 From: Julian Lettner Date: Tue, 1 Apr 2025 15:36:35 -0700 Subject: [PATCH 0314/1029] Fix handling of auto_continue for stop hooks (#129622) Follow-up fix discussed here: https://github.com/llvm/llvm-project/pull/129578#issuecomment-2695838042 --------- Co-authored-by: Jim Ingham --- lldb/source/Target/Target.cpp | 17 +++++++---------- .../commands/target/stop-hooks/TestStopHooks.py | 13 ++++++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 09c0c0b8a5db0..42b1561fb2993 
100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3094,7 +3094,6 @@ bool Target::RunStopHooks() { bool print_hook_header = (m_stop_hooks.size() != 1); bool print_thread_header = (num_exe_ctx != 1); - bool auto_continue = false; bool should_stop = false; bool requested_continue = false; @@ -3108,10 +3107,6 @@ bool Target::RunStopHooks() { if (!cur_hook_sp->ExecutionContextPasses(exc_ctx)) continue; - // We only consult the auto-continue for a stop hook if it matched the - // specifier. - auto_continue |= cur_hook_sp->GetAutoContinue(); - if (print_hook_header && !any_thread_matched) { StreamString s; cur_hook_sp->GetDescription(s, eDescriptionLevelBrief); @@ -3130,7 +3125,10 @@ bool Target::RunStopHooks() { auto result = cur_hook_sp->HandleStop(exc_ctx, output_sp); switch (result) { case StopHook::StopHookResult::KeepStopped: - should_stop = true; + if (cur_hook_sp->GetAutoContinue()) + requested_continue = true; + else + should_stop = true; break; case StopHook::StopHookResult::RequestContinue: requested_continue = true; @@ -3155,10 +3153,9 @@ bool Target::RunStopHooks() { } } - // Resume iff: - // 1) At least one hook requested to continue and no hook asked to stop, or - // 2) at least one hook had auto continue on. - if ((requested_continue && !should_stop) || auto_continue) { + // Resume iff at least one hook requested to continue and no hook asked to + // stop. + if (requested_continue && !should_stop) { Log *log = GetLog(LLDBLog::Process); Status error = m_process_sp->PrivateResume(); if (error.Success()) { diff --git a/lldb/test/API/commands/target/stop-hooks/TestStopHooks.py b/lldb/test/API/commands/target/stop-hooks/TestStopHooks.py index 0c42fda260d1b..7d52676121827 100644 --- a/lldb/test/API/commands/target/stop-hooks/TestStopHooks.py +++ b/lldb/test/API/commands/target/stop-hooks/TestStopHooks.py @@ -9,9 +9,6 @@ class TestStopHooks(TestBase): - # If your test case doesn't stress debug info, then - # set this to true. That way it won't be run once for - # each debug info format. NO_DEBUG_INFO_TESTCASE = True def setUp(self): @@ -42,12 +39,18 @@ def step_out_test(self): interp = self.dbg.GetCommandInterpreter() result = lldb.SBCommandReturnObject() - interp.HandleCommand("target stop-hook add -o 'expr g_var++'", result) + # Add two stop hooks here, one to auto-continue and one not. Make sure + # that we still stop in that case. + interp.HandleCommand("target stop-hook add -G false -o 'expr g_var++'", result) self.assertTrue(result.Succeeded(), "Set the target stop hook") + + interp.HandleCommand("target stop-hook add -G true -o 'expr g_var++'", result) + self.assertTrue(result.Succeeded(), "Set the second target stop hook") + thread.StepOut() var = target.FindFirstGlobalVariable("g_var") self.assertTrue(var.IsValid()) - self.assertEqual(var.GetValueAsUnsigned(), 1, "Updated g_var") + self.assertEqual(var.GetValueAsUnsigned(), 2, "Updated g_var") def after_expr_test(self): interp = self.dbg.GetCommandInterpreter() From 749535ba2808e133682074f712ac6829335f8875 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Apr 2025 15:51:39 -0700 Subject: [PATCH 0315/1029] [RISCV] Use tablegen HasOneUse. 
NFC (#133974) --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 12 +- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 135 +++++++----------- 2 files changed, 54 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index c9386f2307175..89e5ad8067c1b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1299,9 +1299,9 @@ def ext : PatFrags<(ops node:$A), [(sext node:$A), (zext node:$A)]>; class binop_oneuse : PatFrag<(ops node:$A, node:$B), - (operator node:$A, node:$B), [{ - return N->hasOneUse(); -}]>; + (operator node:$A, node:$B)> { + let HasOneUse = 1; +} def and_oneuse : binop_oneuse; def mul_oneuse : binop_oneuse; @@ -1315,9 +1315,9 @@ def mul_const_oneuse : PatFrag<(ops node:$A, node:$B), class unop_oneuse : PatFrag<(ops node:$A), - (operator node:$A), [{ - return N->hasOneUse(); -}]>; + (operator node:$A)> { + let HasOneUse = 1; +} def sext_oneuse : unop_oneuse; def zext_oneuse : unop_oneuse; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 5d98ffedcbb9a..f80cbc9e2fb5e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -492,101 +492,62 @@ def SDTRVVVecReduce : SDTypeProfile<1, 6, [ SDTCisVT<6, XLenVT> ]>; -def riscv_add_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_add_vl node:$A, node:$B, node:$C, - node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_sub_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_sub_vl node:$A, node:$B, node:$C, - node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_mul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_mul_vl node:$A, node:$B, node:$C, - node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_vwmul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, +let HasOneUse = 1 in { + def riscv_add_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, node:$E), - (riscv_vwmul_vl node:$A, node:$B, node:$C, - node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_vwmulu_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_vwmulu_vl node:$A, node:$B, node:$C, - node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_vwmulsu_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + (riscv_add_vl node:$A, node:$B, node:$C, + node:$D, node:$E)>; + def riscv_sub_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_sub_vl node:$A, node:$B, node:$C, + node:$D, node:$E)>; + def riscv_mul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_mul_vl node:$A, node:$B, node:$C, + node:$D, node:$E)>; + def riscv_vwmul_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, node:$E), - (riscv_vwmulsu_vl node:$A, node:$B, node:$C, - node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_sext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), - (riscv_sext_vl node:$A, node:$B, node:$C), [{ - return N->hasOneUse(); -}]>; - -def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), - (riscv_zext_vl node:$A, node:$B, node:$C), [{ - return N->hasOneUse(); -}]>; - -def riscv_ext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), - (riscv_ext_vl node:$A, node:$B, node:$C), [{ - return N->hasOneUse(); -}]>; - -def 
riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), - (riscv_fpextend_vl node:$A, node:$B, node:$C), [{ - return N->hasOneUse(); -}]>; + (riscv_vwmul_vl node:$A, node:$B, node:$C, + node:$D, node:$E)>; + def riscv_vwmulu_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_vwmulu_vl node:$A, node:$B, node:$C, + node:$D, node:$E)>; + def riscv_vwmulsu_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_vwmulsu_vl node:$A, node:$B, node:$C, + node:$D, node:$E)>; + def riscv_sext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_sext_vl node:$A, node:$B, node:$C)>; + def riscv_zext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_zext_vl node:$A, node:$B, node:$C)>; + def riscv_ext_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_ext_vl node:$A, node:$B, node:$C)>; + def riscv_fpextend_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C), + (riscv_fpextend_vl node:$A, node:$B, node:$C)>; + def riscv_vfmadd_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_vfmadd_vl node:$A, node:$B, + node:$C, node:$D, node:$E)>; + def riscv_vfnmadd_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_vfnmadd_vl node:$A, node:$B, + node:$C, node:$D, node:$E)>; + def riscv_vfmsub_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_vfmsub_vl node:$A, node:$B, + node:$C, node:$D, node:$E)>; + def riscv_vfnmsub_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, + node:$E), + (riscv_vfnmsub_vl node:$A, node:$B, + node:$C, node:$D, node:$E)>; +} // HasOneUse = 1 def riscv_fpextend_vl_sameuser : PatFrag<(ops node:$A, node:$B, node:$C), (riscv_fpextend_vl node:$A, node:$B, node:$C), [{ return !N->use_empty() && all_equal(N->users()); }]>; -def riscv_vfmadd_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_vfmadd_vl node:$A, node:$B, - node:$C, node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_vfnmadd_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_vfnmadd_vl node:$A, node:$B, - node:$C, node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_vfmsub_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_vfmsub_vl node:$A, node:$B, - node:$C, node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - -def riscv_vfnmsub_vl_oneuse : PatFrag<(ops node:$A, node:$B, node:$C, node:$D, - node:$E), - (riscv_vfnmsub_vl node:$A, node:$B, - node:$C, node:$D, node:$E), [{ - return N->hasOneUse(); -}]>; - foreach kind = ["ADD", "UMAX", "SMAX", "UMIN", "SMIN", "AND", "OR", "XOR", "FADD", "SEQ_FADD", "FMIN", "FMAX"] in def rvv_vecreduce_#kind#_vl : SDNode<"RISCVISD::VECREDUCE_"#kind#"_VL", SDTRVVVecReduce>; From 07504afc42bd295ca290c8c462869759b97e0fdc Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Tue, 1 Apr 2025 16:23:19 -0700 Subject: [PATCH 0316/1029] [libc] Stop depending on .cpp files in libcxx_shared_headers library. (#133999) Fix two instances of libcxx_shared_headers depending on .cpp files (in Bazel build): * Don't depend on exit syscall in LIBC_ASSERT implementation. This dependency is not used, since LIBC_ASSERT always uses system in the overlay mode, which is the only mode supported by Bazel. * Don't depend on libc_errno in str-to-float and str-to-integer conversions. We only need the ERANGE value, which can be obtained from the proxy header instead. 
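Sketch of the header swap behind the second bullet (see the str_to_float.h and
str_to_integer.h hunks below); the proxy header supplies only the macro
values, so no .cpp target gets pulled into the header-only library:

  // Before: depended on the errno object just to spell one macro.
  // #include "src/errno/libc_errno.h" // For ERANGE
  // After: macros only, no object file behind it.
  #include "hdr/errno_macros.h" // For ERANGE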
--- libc/src/__support/CMakeLists.txt | 4 +-- libc/src/__support/libc_assert.h | 2 +- libc/src/__support/str_to_float.h | 2 +- libc/src/__support/str_to_integer.h | 2 +- .../llvm-project-overlay/libc/BUILD.bazel | 26 +++---------------- 5 files changed, 8 insertions(+), 28 deletions(-) diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 17f03a6b6c4a0..f92499fdbf451 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -165,7 +165,7 @@ add_header_library( DEPENDS .ctype_utils .str_to_num_result - libc.src.errno.errno + libc.hdr.errno_macros libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits libc.src.__support.common @@ -217,6 +217,7 @@ add_header_library( .str_to_integer .str_to_num_result .uint128 + libc.hdr.errno_macros libc.src.__support.common libc.src.__support.CPP.bit libc.src.__support.CPP.limits @@ -226,7 +227,6 @@ add_header_library( libc.src.__support.macros.config libc.src.__support.macros.null_check libc.src.__support.macros.optimization - libc.src.errno.errno ) add_header_library( diff --git a/libc/src/__support/libc_assert.h b/libc/src/__support/libc_assert.h index 3db179ff67212..ada1795ccb80a 100644 --- a/libc/src/__support/libc_assert.h +++ b/libc/src/__support/libc_assert.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_LIBC_ASSERT_H #define LLVM_LIBC_SRC___SUPPORT_LIBC_ASSERT_H -#include "src/__support/macros/config.h" #if defined(LIBC_COPT_USE_C_ASSERT) || !defined(LIBC_FULL_BUILD) // The build is configured to just use the public API @@ -25,6 +24,7 @@ #include "src/__support/OSUtil/io.h" #include "src/__support/integer_to_string.h" #include "src/__support/macros/attributes.h" // For LIBC_INLINE +#include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // For LIBC_UNLIKELY namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 48c88309c58e9..0748e1cb8a8b4 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_STR_TO_FLOAT_H #define LLVM_LIBC_SRC___SUPPORT_STR_TO_FLOAT_H +#include "hdr/errno_macros.h" // For ERANGE #include "src/__support/CPP/bit.h" #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" @@ -31,7 +32,6 @@ #include "src/__support/str_to_integer.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" -#include "src/errno/libc_errno.h" // For ERANGE #include diff --git a/libc/src/__support/str_to_integer.h b/libc/src/__support/str_to_integer.h index 9212ad25d0820..76a99a8948941 100644 --- a/libc/src/__support/str_to_integer.h +++ b/libc/src/__support/str_to_integer.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_STR_TO_INTEGER_H #define LLVM_LIBC_SRC___SUPPORT_STR_TO_INTEGER_H +#include "hdr/errno_macros.h" // For ERANGE #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/CPP/type_traits/make_unsigned.h" @@ -24,7 +25,6 @@ #include "src/__support/macros/config.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" -#include "src/errno/libc_errno.h" // For ERANGE namespace LIBC_NAMESPACE_DECL { namespace internal { diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 5298c625c5d66..1cb48974f4905 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ 
b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -841,12 +841,6 @@ libc_support_library( libc_support_library( name = "__support_libc_assert", hdrs = ["src/__support/libc_assert.h"], - deps = [ - ":__support_integer_to_string", - ":__support_macros_attributes", - ":__support_osutil_exit", - ":__support_osutil_io", - ], ) libc_support_library( @@ -868,7 +862,7 @@ libc_support_library( ":__support_ctype_utils", ":__support_str_to_num_result", ":__support_uint128", - ":errno", + ":hdr_errno_macros", ], ) @@ -892,7 +886,7 @@ libc_support_library( ":__support_str_to_integer", ":__support_str_to_num_result", ":__support_uint128", - ":errno", + ":hdr_errno_macros", ], ) @@ -1590,21 +1584,6 @@ libc_support_library( ########################## externally shared targets ########################### -# TODO: Remove this once downstream users are migrated to libcxx_shared_headers. -libc_support_library( - name = "libc_external_common", - hdrs = glob( - ["shared/*.h"], - exclude = ["shared/rpc_server.h"], - ), - deps = [ - ":__support_common", - ":__support_fputil_fp_bits", - ":__support_str_to_float", - ":__support_str_to_integer", - ], -) - libc_header_library( name = "libcxx_shared_headers", hdrs = [ @@ -1911,6 +1890,7 @@ libc_support_library( ":__support_fputil_fma", ":__support_fputil_multiply_add", ":__support_fputil_polyeval", + ":__support_integer_literals", ], ) From d53555499f85b7aedb765c66d6f3850c7bc6126d Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Wed, 2 Apr 2025 01:27:29 +0200 Subject: [PATCH 0317/1029] [lldb] Prefer PointerType::get with LLVMContext over Type (NFC) (#133869) Part of #123569 --- .../Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp | 4 ++-- lldb/source/Plugins/ExpressionParser/Clang/IRForTarget.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp index ae0682d717948..c7c292a8a7e42 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/IRDynamicChecks.cpp @@ -240,7 +240,7 @@ class Instrumenter { FunctionType *fun_ty = FunctionType::get( llvm::Type::getVoidTy(m_module.getContext()), params, true); - PointerType *fun_ptr_ty = PointerType::getUnqual(fun_ty); + PointerType *fun_ptr_ty = PointerType::getUnqual(m_module.getContext()); Constant *fun_addr_int = ConstantInt::get(GetIntptrTy(), start_address, false); return {fun_ty, ConstantExpr::getIntToPtr(fun_addr_int, fun_ptr_ty)}; @@ -264,7 +264,7 @@ class Instrumenter { FunctionType *fun_ty = FunctionType::get( llvm::Type::getVoidTy(m_module.getContext()), params, true); - PointerType *fun_ptr_ty = PointerType::getUnqual(fun_ty); + PointerType *fun_ptr_ty = PointerType::getUnqual(m_module.getContext()); Constant *fun_addr_int = ConstantInt::get(GetIntptrTy(), start_address, false); return {fun_ty, ConstantExpr::getIntToPtr(fun_addr_int, fun_ptr_ty)}; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/IRForTarget.cpp b/lldb/source/Plugins/ExpressionParser/Clang/IRForTarget.cpp index 879f006336ba5..a343766ce9c4f 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/IRForTarget.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/IRForTarget.cpp @@ -462,7 +462,7 @@ bool IRForTarget::RewriteObjCConstString(llvm::GlobalVariable *ns_str, FunctionType::get(ns_str_ty, CFSCWB_arg_types, false); // Build the constant containing the pointer to the function - PointerType *CFSCWB_ptr_ty = 
PointerType::getUnqual(CFSCWB_ty); + PointerType *CFSCWB_ptr_ty = PointerType::getUnqual(m_module->getContext()); Constant *CFSCWB_addr_int = ConstantInt::get(m_intptr_ty, CFStringCreateWithBytes_addr, false); m_CFStringCreateWithBytes = { @@ -814,7 +814,7 @@ bool IRForTarget::RewriteObjCSelector(Instruction *selector_load) { FunctionType::get(sel_ptr_type, srN_arg_types, false); // Build the constant containing the pointer to the function - PointerType *srN_ptr_ty = PointerType::getUnqual(srN_type); + PointerType *srN_ptr_ty = PointerType::getUnqual(m_module->getContext()); Constant *srN_addr_int = ConstantInt::get(m_intptr_ty, sel_registerName_addr, false); m_sel_registerName = {srN_type, @@ -1031,7 +1031,7 @@ bool IRForTarget::MaybeHandleVariable(Value *llvm_value_ptr) { // // We also do this for any user-declared persistent variables. compiler_type = compiler_type.GetPointerType(); - value_type = PointerType::get(global_variable->getType(), 0); + value_type = PointerType::getUnqual(global_variable->getContext()); } else { value_type = global_variable->getType(); } From d72be157823d41e7eaf457cc37ea99c07431a25c Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 1 Apr 2025 23:43:35 +0000 Subject: [PATCH 0318/1029] Revert "[CI] Fix Monolithic Linux Build in Ubuntu 24.04 (#133628)" This reverts commit 23fb048ce35f672d8db3f466a2522354bbce66e5. This broke the new premerge system as it appears the pip installations within the CI image do not support this option. Buildkite was unaffected. --- .ci/monolithic-linux.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index ec7a85bc5f15f..4b6e56b4a4eda 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -53,9 +53,9 @@ targets="${2}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" echo "--- cmake" -pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt -pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt -pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt +pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt +pip install -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt +pip install -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ From 1a4dc189ad26ced9d9a74e40fa50b7d81428edba Mon Sep 17 00:00:00 2001 From: Petr Sumbera Date: Wed, 2 Apr 2025 01:57:14 +0200 Subject: [PATCH 0319/1029] [OpenMP] allow openmp build for sparc (#133239) --- openmp/runtime/CMakeLists.txt | 8 ++++++-- .../runtime/cmake/LibompGetArchitecture.cmake | 4 ++++ openmp/runtime/cmake/config-ix.cmake | 4 +++- openmp/runtime/src/kmp_gsupport.cpp | 2 +- openmp/runtime/src/kmp_os.h | 7 ++++--- openmp/runtime/src/kmp_platform.h | 19 +++++++++++++++++-- openmp/runtime/src/kmp_runtime.cpp | 2 +- 7 files changed, 36 insertions(+), 10 deletions(-) diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt index bcae02eba6a59..6041fb2b23a0f 100644 --- a/openmp/runtime/CMakeLists.txt +++ b/openmp/runtime/CMakeLists.txt @@ -30,7 +30,7 @@ if(${OPENMP_STANDALONE_BUILD}) # If adding a new architecture, take a look at cmake/LibompGetArchitecture.cmake libomp_get_architecture(LIBOMP_DETECTED_ARCH) set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH} CACHE STRING - "The architecture to build for 
(x86_64/i386/arm/ppc/ppc64/ppc64le/aarch64/aarch64_32/mic/mips/mips64/riscv64/loongarch64/ve/s390x/wasm32).") + "The architecture to build for (x86_64/i386/arm/ppc/ppc64/ppc64le/aarch64/aarch64_32/mic/mips/mips64/riscv64/loongarch64/ve/s390x/sparc/sparcv9/wasm32).") # Should assertions be enabled? They are on by default. set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL "enable assertions?") @@ -71,6 +71,10 @@ else() # Part of LLVM build set(LIBOMP_ARCH ve) elseif(LIBOMP_NATIVE_ARCH MATCHES "s390x") set(LIBOMP_ARCH s390x) + elseif(LIBOMP_NATIVE_ARCH MATCHES "sparcv9") + set(LIBOMP_ARCH sparcv9) + elseif(LIBOMP_NATIVE_ARCH MATCHES "sparc") + set(LIBOMP_ARCH sparc) elseif(LIBOMP_NATIVE_ARCH MATCHES "wasm") set(LIBOMP_ARCH wasm32) else() @@ -93,7 +97,7 @@ if(LIBOMP_ARCH STREQUAL "aarch64") endif() endif() -libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc ppc64 ppc64le aarch64 aarch64_32 aarch64_a64fx mic mips mips64 riscv64 loongarch64 ve s390x wasm32) +libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc ppc64 ppc64le aarch64 aarch64_32 aarch64_a64fx mic mips mips64 riscv64 loongarch64 ve s390x sparc sparcv9 wasm32) set(LIBOMP_LIB_TYPE normal CACHE STRING "Performance,Profiling,Stubs library (normal/profile/stubs)") diff --git a/openmp/runtime/cmake/LibompGetArchitecture.cmake b/openmp/runtime/cmake/LibompGetArchitecture.cmake index 2d5c6622c9f7d..81aa700e3b6db 100644 --- a/openmp/runtime/cmake/LibompGetArchitecture.cmake +++ b/openmp/runtime/cmake/LibompGetArchitecture.cmake @@ -59,6 +59,10 @@ function(libomp_get_architecture return_arch) #error ARCHITECTURE=s390x #elif defined(__wasm32__) #error ARCHITECTURE=wasm32 + #elif defined(__sparcv9) + #error ARCHITECTURE=sparcv9 + #elif defined(__sparc) + #error ARCHITECTURE=sparc #else #error ARCHITECTURE=UnknownArchitecture #endif diff --git a/openmp/runtime/cmake/config-ix.cmake b/openmp/runtime/cmake/config-ix.cmake index ac2bbb902161e..144c657be67e8 100644 --- a/openmp/runtime/cmake/config-ix.cmake +++ b/openmp/runtime/cmake/config-ix.cmake @@ -316,7 +316,9 @@ else() (LIBOMP_ARCH STREQUAL ppc64) OR (LIBOMP_ARCH STREQUAL riscv64) OR (LIBOMP_ARCH STREQUAL loongarch64) OR - (LIBOMP_ARCH STREQUAL s390x)) + (LIBOMP_ARCH STREQUAL s390x) OR + (LIBOMP_ARCH STREQUAL sparc) OR + (LIBOMP_ARCH STREQUAL sparcv9)) AND # OS supported? ((WIN32 AND LIBOMP_HAVE_PSAPI) OR APPLE OR (NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX") AND LIBOMP_HAVE_WEAK_ATTRIBUTE))) diff --git a/openmp/runtime/src/kmp_gsupport.cpp b/openmp/runtime/src/kmp_gsupport.cpp index 86cf16470e14b..0d04045f7b165 100644 --- a/openmp/runtime/src/kmp_gsupport.cpp +++ b/openmp/runtime/src/kmp_gsupport.cpp @@ -358,7 +358,7 @@ void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_ORDERED_END)(void) { // (IA-32 architecture) or 64-bit signed (Intel(R) 64). 
#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \ - KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 + KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC32 #define KMP_DISPATCH_INIT __kmp_aux_dispatch_init_4 #define KMP_DISPATCH_FINI_CHUNK __kmp_aux_dispatch_fini_chunk_4 #define KMP_DISPATCH_NEXT __kmpc_dispatch_next_4 diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 29a281f096855..e8ad2a6fdb78e 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -179,11 +179,11 @@ typedef unsigned long long kmp_uint64; #endif /* KMP_OS_UNIX */ #if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \ - KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 + KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC32 #define KMP_SIZE_T_SPEC KMP_UINT32_SPEC #elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ - KMP_ARCH_VE || KMP_ARCH_S390X + KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_SPARC64 #define KMP_SIZE_T_SPEC KMP_UINT64_SPEC #else #error "Can't determine size_t printf format specifier." @@ -1052,7 +1052,8 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v); #if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \ KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ - KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 + KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || \ + KMP_ARCH_SPARC #if KMP_OS_WINDOWS #undef KMP_MB #define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst) diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h index 9ed14159376cc..80afba6c6af2e 100644 --- a/openmp/runtime/src/kmp_platform.h +++ b/openmp/runtime/src/kmp_platform.h @@ -129,6 +129,7 @@ #define KMP_ARCH_LOONGARCH64 0 #define KMP_ARCH_VE 0 #define KMP_ARCH_S390X 0 +#define KMP_ARCH_SPARC 0 #if KMP_OS_WINDOWS #if defined(_M_AMD64) || defined(__x86_64) @@ -200,6 +201,9 @@ #elif defined __s390x__ #undef KMP_ARCH_S390X #define KMP_ARCH_S390X 1 +#elif defined __sparc || defined __sparc__ +#undef KMP_ARCH_SPARC +#define KMP_ARCH_SPARC 1 #endif #endif @@ -246,6 +250,17 @@ #define KMP_ARCH_PPC64 \ (KMP_ARCH_PPC64_ELFv2 || KMP_ARCH_PPC64_ELFv1 || KMP_ARCH_PPC64_XCOFF) +#if defined(KMP_ARCH_SPARC) +#undef KMP_ARCH_SPARC32 +#undef KMP_ARCH_SPARC64 +#if defined(__sparcv9) || defined(__sparc64__) +#define KMP_ARCH_SPARC64 1 +#endif +#if defined(__sparc) && !defined(__sparcv9) && !defined(__sparc64__) +#define KMP_ARCH_SPARC32 1 +#endif +#endif + #if defined(__MIC__) || defined(__MIC2__) #define KMP_MIC 1 #if __MIC2__ || __KNC__ @@ -264,7 +279,7 @@ /* Specify 32 bit architectures here */ #define KMP_32_BIT_ARCH \ (KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS || KMP_ARCH_WASM || \ - KMP_ARCH_PPC || KMP_ARCH_AARCH64_32) + KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC32) // Platforms which support Intel(R) Many Integrated Core Architecture #define KMP_MIC_SUPPORTED \ @@ -275,7 +290,7 @@ KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \ KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE + \ KMP_ARCH_S390X + KMP_ARCH_WASM + KMP_ARCH_PPC + \ - KMP_ARCH_AARCH64_32) + KMP_ARCH_AARCH64_32 + KMP_ARCH_SPARC) #error Unknown or unsupported architecture #endif diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 64548b1397c34..9f679aa8d334f 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp 
@@ -8978,7 +8978,7 @@ __kmp_determine_reduction_method( // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \ - KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 + KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \ From 2f25345670081f1ca460ea3f42a0585ef3f1e877 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Tue, 1 Apr 2025 21:02:00 -0300 Subject: [PATCH 0320/1029] [clang] fix missing initialization msan failure fixes msan failure reported here: https://lab.llvm.org/buildbot/#/builders/94/builds/5821/steps/17/logs/stdio This was a regression introduced here: https://github.com/llvm/llvm-project/pull/133610 --- clang/lib/Sema/SemaCXXScopeSpec.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index 545da5c295832..ea7936b0e5b8f 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -873,6 +873,7 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S, DependentTemplateSpecializationTypeLoc SpecTL = Builder.push(T); SpecTL.setElaboratedKeywordLoc(SourceLocation()); + SpecTL.setQualifierLoc(NestedNameSpecifierLoc()); SpecTL.setTemplateKeywordLoc(TemplateKWLoc); SpecTL.setTemplateNameLoc(TemplateNameLoc); SpecTL.setLAngleLoc(LAngleLoc); From ad1ca5f4a2bc09f99fd82e5444f5da37c2985e97 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Tue, 1 Apr 2025 21:11:56 -0300 Subject: [PATCH 0321/1029] [clang] Concepts: support pack expansions for type constraints (#132626) This reverts an earlier attempt (adb0d8ddceb143749c519d14b8b31b481071da77 and 50e5411e4247421fd606f0a206682fcdf0303ae3) to support these expansions, which was limited to type arguments and which subverted the purpose of SubstTemplateTypeParmType. This propagates the ArgumentPackSubstitutionIndex along with the AssociatedConstraint, so that the pack expansion works, without needing any new transforms or otherwise any changes to the template instantiation process. This keeps the tests from the reverted commits, and adds a few more showing the new solution also works for NTTPs. 
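A rough sketch (illustrative names, not taken from the test suite) of the kind
of code this enables, with the type-constraint's pack argument expanded by an
enclosing fold and the concept taking a non-type template parameter:

  template <class T, int N>
  concept SmallerThan = sizeof(T) < N; // constraint with an NTTP

  template <int... Ns> void f() {
    // Each expansion element checks SmallerThan<decltype(x), Ns[i]>.
    ([](SmallerThan<Ns> auto x) {}(char{}), ...);
  }
  template void f<2, 4>(); // OK: sizeof(char) < 2 and sizeof(char) < 4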
Fixes https://github.com/llvm/llvm-project/issues/131798 --- .../modernize/UseConstraintsCheck.cpp | 4 +- clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/AST/ASTConcept.h | 11 +- clang/include/clang/AST/ASTContext.h | 4 +- clang/include/clang/AST/Decl.h | 17 +- clang/include/clang/AST/DeclTemplate.h | 26 ++-- clang/include/clang/AST/PropertiesBase.td | 1 - clang/include/clang/AST/Type.h | 29 +--- clang/include/clang/AST/TypeProperties.td | 5 +- clang/include/clang/Sema/Sema.h | 24 +-- clang/include/clang/Sema/SemaConcept.h | 11 +- clang/lib/AST/ASTContext.cpp | 7 +- clang/lib/AST/ASTImporter.cpp | 7 +- clang/lib/AST/DeclTemplate.cpp | 26 ++-- clang/lib/AST/Type.cpp | 6 +- clang/lib/Sema/SemaCodeComplete.cpp | 9 +- clang/lib/Sema/SemaConcept.cpp | 94 +++++------ clang/lib/Sema/SemaDecl.cpp | 5 +- clang/lib/Sema/SemaExprCXX.cpp | 3 +- clang/lib/Sema/SemaOverload.cpp | 4 +- clang/lib/Sema/SemaTemplate.cpp | 16 +- clang/lib/Sema/SemaTemplateDeduction.cpp | 14 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 146 ++---------------- clang/lib/Sema/SemaType.cpp | 8 +- clang/lib/Serialization/ASTReaderDecl.cpp | 4 +- clang/lib/Serialization/ASTWriterDecl.cpp | 1 + clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 2 +- .../SemaCXX/fold_lambda_with_variadics.cpp | 47 ++++++ clang/unittests/AST/SourceLocationTest.cpp | 6 +- 29 files changed, 236 insertions(+), 304 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp index ea4d99586c711..fb82efb4dd211 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp @@ -356,7 +356,7 @@ static std::vector handleReturnType(const FunctionDecl *Function, if (!TypeText) return {}; - SmallVector ExistingConstraints; + SmallVector ExistingConstraints; Function->getAssociatedConstraints(ExistingConstraints); if (!ExistingConstraints.empty()) { // FIXME - Support adding new constraints to existing ones. Do we need to @@ -404,7 +404,7 @@ handleTrailingTemplateType(const FunctionTemplateDecl *FunctionTemplate, if (!ConditionText) return {}; - SmallVector ExistingConstraints; + SmallVector ExistingConstraints; Function->getAssociatedConstraints(ExistingConstraints); if (!ExistingConstraints.empty()) { // FIXME - Support adding new constraints to existing ones. Do we need to diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c3b64d84a1b1c..c4e82678949ff 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -370,6 +370,9 @@ Bug Fixes to C++ Support - Clang now uses the parameter location for abbreviated function templates in ``extern "C"``. (#GH46386) - Clang will emit an error instead of crash when use co_await or co_yield in C++26 braced-init-list template parameter initialization. (#GH78426) +- Improved fix for an issue with pack expansions of type constraints, where this + now also works if the constraint has non-type or template template parameters. + (#GH131798) - Fixes matching of nested template template parameters. (#GH130362) - Correctly diagnoses template template paramters which have a pack parameter not in the last position. diff --git a/clang/include/clang/AST/ASTConcept.h b/clang/include/clang/AST/ASTConcept.h index 00500e214f4ce..f89899c3ea7b1 100644 --- a/clang/include/clang/AST/ASTConcept.h +++ b/clang/include/clang/AST/ASTConcept.h @@ -229,12 +229,15 @@ class TypeConstraint { /// type-constraint. 
Expr *ImmediatelyDeclaredConstraint = nullptr; ConceptReference *ConceptRef; + int ArgumentPackSubstitutionIndex; public: TypeConstraint(ConceptReference *ConceptRef, - Expr *ImmediatelyDeclaredConstraint) + Expr *ImmediatelyDeclaredConstraint, + int ArgumentPackSubstitutionIndex) : ImmediatelyDeclaredConstraint(ImmediatelyDeclaredConstraint), - ConceptRef(ConceptRef) {} + ConceptRef(ConceptRef), + ArgumentPackSubstitutionIndex(ArgumentPackSubstitutionIndex) {} /// \brief Get the immediately-declared constraint expression introduced by /// this type-constraint, that is - the constraint expression that is added to @@ -245,6 +248,10 @@ class TypeConstraint { ConceptReference *getConceptReference() const { return ConceptRef; } + int getArgumentPackSubstitutionIndex() const { + return ArgumentPackSubstitutionIndex; + } + // FIXME: Instead of using these concept related functions the callers should // directly work with the corresponding ConceptReference. ConceptDecl *getNamedConcept() const { return ConceptRef->getNamedConcept(); } diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index f386282890b5a..a24f30815e6b9 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1798,9 +1798,7 @@ class ASTContext : public RefCountedBase { QualType getSubstTemplateTypeParmType(QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, - SubstTemplateTypeParmTypeFlag Flag = - SubstTemplateTypeParmTypeFlag::None) const; + std::optional PackIndex) const; QualType getSubstTemplateTypeParmPackType(Decl *AssociatedDecl, unsigned Index, bool Final, const TemplateArgument &ArgPack); diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index efac36e49351e..9e7e93d98c9d1 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -78,6 +78,18 @@ class UnresolvedSetImpl; class VarTemplateDecl; enum class ImplicitParamKind; +// Holds a constraint expression along with a pack expansion index, if +// expanded. +struct AssociatedConstraint { + const Expr *ConstraintExpr; + int ArgumentPackSubstitutionIndex; + + explicit AssociatedConstraint(const Expr *ConstraintExpr, + int ArgumentPackSubstitutionIndex = -1) + : ConstraintExpr(ConstraintExpr), + ArgumentPackSubstitutionIndex(ArgumentPackSubstitutionIndex) {} +}; + /// The top declaration context. class TranslationUnitDecl : public Decl, public DeclContext, @@ -2631,9 +2643,10 @@ class FunctionDecl : public DeclaratorDecl, /// /// Use this instead of getTrailingRequiresClause for concepts APIs that /// accept an ArrayRef of constraint expressions. - void getAssociatedConstraints(SmallVectorImpl &AC) const { + void + getAssociatedConstraints(SmallVectorImpl &AC) const { if (auto *TRC = getTrailingRequiresClause()) - AC.push_back(TRC); + AC.emplace_back(TRC); } /// Get the message that indicates why this function was deleted. diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index b27e698236c02..37fe0acf5d4d5 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -195,7 +195,8 @@ class TemplateParameterList final /// /// The constraints in the resulting list are to be treated as if in a /// conjunction ("and"). 
- void getAssociatedConstraints(llvm::SmallVectorImpl &AC) const; + void getAssociatedConstraints( + llvm::SmallVectorImpl &AC) const; bool hasAssociatedConstraints() const; @@ -422,7 +423,8 @@ class TemplateDecl : public NamedDecl { /// including constraint-expressions derived from the requires-clause, /// trailing requires-clause (for functions and methods) and constrained /// template parameters. - void getAssociatedConstraints(llvm::SmallVectorImpl &AC) const; + void getAssociatedConstraints( + llvm::SmallVectorImpl &AC) const; bool hasAssociatedConstraints() const; @@ -1341,7 +1343,8 @@ class TemplateTypeParmDecl final : public TypeDecl, } void setTypeConstraint(ConceptReference *CR, - Expr *ImmediatelyDeclaredConstraint); + Expr *ImmediatelyDeclaredConstraint, + int ArgumentPackSubstitutionIndex); /// Determine whether this template parameter has a type-constraint. bool hasTypeConstraint() const { @@ -1353,9 +1356,11 @@ class TemplateTypeParmDecl final : public TypeDecl, /// /// Use this instead of getTypeConstraint for concepts APIs that /// accept an ArrayRef of constraint expressions. - void getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { + void getAssociatedConstraints( + llvm::SmallVectorImpl &AC) const { if (HasTypeConstraint) - AC.push_back(getTypeConstraint()->getImmediatelyDeclaredConstraint()); + AC.emplace_back(getTypeConstraint()->getImmediatelyDeclaredConstraint(), + getTypeConstraint()->getArgumentPackSubstitutionIndex()); } SourceRange getSourceRange() const override LLVM_READONLY; @@ -1574,9 +1579,10 @@ class NonTypeTemplateParmDecl final /// /// Use this instead of getPlaceholderImmediatelyDeclaredConstraint for /// concepts APIs that accept an ArrayRef of constraint expressions. - void getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { + void getAssociatedConstraints( + llvm::SmallVectorImpl &AC) const { if (Expr *E = getPlaceholderTypeConstraint()) - AC.push_back(E); + AC.emplace_back(E); } // Implement isa/cast/dyncast/etc. @@ -2169,7 +2175,8 @@ class ClassTemplatePartialSpecializationDecl /// /// The constraints in the resulting list are to be treated as if in a /// conjunction ("and"). - void getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { + void getAssociatedConstraints( + llvm::SmallVectorImpl &AC) const { TemplateParams->getAssociatedConstraints(AC); } @@ -2943,7 +2950,8 @@ class VarTemplatePartialSpecializationDecl /// /// The constraints in the resulting list are to be treated as if in a /// conjunction ("and"). 
- void getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { + void getAssociatedConstraints( + llvm::SmallVectorImpl &AC) const { TemplateParams->getAssociatedConstraints(AC); } diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 178308a24e1a0..5171555008ac9 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -137,7 +137,6 @@ def Selector : PropertyType; def SourceLocation : PropertyType; def StmtRef : RefPropertyType<"Stmt"> { let ConstWhenWriting = 1; } def ExprRef : SubclassPropertyType<"Expr", StmtRef>; -def SubstTemplateTypeParmTypeFlag : EnumPropertyType; def TemplateArgument : PropertyType; def TemplateArgumentKind : EnumPropertyType<"TemplateArgument::ArgKind">; def TemplateName : DefaultValuePropertyType; diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 988362787a452..cfd417068abb7 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1786,15 +1786,6 @@ enum class AutoTypeKeyword { GNUAutoType }; -enum class SubstTemplateTypeParmTypeFlag { - None, - - /// Whether to expand the pack using the stored PackIndex in place. This is - /// useful for e.g. substituting into an atomic constraint expression, where - /// that expression is part of an unexpanded pack. - ExpandPacksInPlace, -}; - enum class ArraySizeModifier; enum class ElaboratedTypeKeyword; enum class VectorKind; @@ -2164,9 +2155,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { LLVM_PREFERRED_TYPE(bool) unsigned HasNonCanonicalUnderlyingType : 1; - LLVM_PREFERRED_TYPE(SubstTemplateTypeParmTypeFlag) - unsigned SubstitutionFlag : 1; - // The index of the template parameter this substitution represents. unsigned Index : 15; @@ -6409,8 +6397,7 @@ class SubstTemplateTypeParmType final Decl *AssociatedDecl; SubstTemplateTypeParmType(QualType Replacement, Decl *AssociatedDecl, - unsigned Index, std::optional PackIndex, - SubstTemplateTypeParmTypeFlag Flag); + unsigned Index, std::optional PackIndex); public: /// Gets the type that was substituted for the template @@ -6439,31 +6426,21 @@ class SubstTemplateTypeParmType final return SubstTemplateTypeParmTypeBits.PackIndex - 1; } - SubstTemplateTypeParmTypeFlag getSubstitutionFlag() const { - return static_cast( - SubstTemplateTypeParmTypeBits.SubstitutionFlag); - } - bool isSugared() const { return true; } QualType desugar() const { return getReplacementType(); } void Profile(llvm::FoldingSetNodeID &ID) { Profile(ID, getReplacementType(), getAssociatedDecl(), getIndex(), - getPackIndex(), getSubstitutionFlag()); + getPackIndex()); } static void Profile(llvm::FoldingSetNodeID &ID, QualType Replacement, const Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, - SubstTemplateTypeParmTypeFlag Flag) { + std::optional PackIndex) { Replacement.Profile(ID); ID.AddPointer(AssociatedDecl); ID.AddInteger(Index); ID.AddInteger(PackIndex ? 
*PackIndex - 1 : 0); - ID.AddInteger(llvm::to_underlying(Flag)); - assert((Flag != SubstTemplateTypeParmTypeFlag::ExpandPacksInPlace || - PackIndex) && - "ExpandPacksInPlace needs a valid PackIndex"); } static bool classof(const Type *T) { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 10eb40dc90ad4..391fd26a086f7 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -842,14 +842,11 @@ let Class = SubstTemplateTypeParmType in { def : Property<"PackIndex", Optional> { let Read = [{ node->getPackIndex() }]; } - def : Property<"SubstitutionFlag", SubstTemplateTypeParmTypeFlag> { - let Read = [{ node->getSubstitutionFlag() }]; - } // The call to getCanonicalType here existed in ASTReader.cpp, too. def : Creator<[{ return ctx.getSubstTemplateTypeParmType( - replacementType, associatedDecl, Index, PackIndex, SubstitutionFlag); + replacementType, associatedDecl, Index, PackIndex); }]>; } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 09168218a9e36..822cae99ddae7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11351,7 +11351,6 @@ class Sema final : public SemaBase { ConceptDecl *NamedConcept, NamedDecl *FoundDecl, const TemplateArgumentListInfo *TemplateArgs, TemplateTypeParmDecl *ConstrainedParameter, - QualType ConstrainedType, SourceLocation EllipsisLoc); bool AttachTypeConstraint(AutoTypeLoc TL, @@ -14552,13 +14551,14 @@ class Sema final : public SemaBase { /// \returns true if an error occurred and satisfaction could not be checked, /// false otherwise. bool CheckConstraintSatisfaction( - const NamedDecl *Template, ArrayRef ConstraintExprs, + const NamedDecl *Template, + ArrayRef AssociatedConstraints, const MultiLevelTemplateArgumentList &TemplateArgLists, SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) { llvm::SmallVector Converted; - return CheckConstraintSatisfaction(Template, ConstraintExprs, Converted, - TemplateArgLists, TemplateIDRange, - Satisfaction); + return CheckConstraintSatisfaction(Template, AssociatedConstraints, + Converted, TemplateArgLists, + TemplateIDRange, Satisfaction); } /// \brief Check whether the given list of constraint expressions are @@ -14584,7 +14584,8 @@ class Sema final : public SemaBase { /// \returns true if an error occurred and satisfaction could not be checked, /// false otherwise. bool CheckConstraintSatisfaction( - const NamedDecl *Template, ArrayRef ConstraintExprs, + const NamedDecl *Template, + ArrayRef AssociatedConstraints, llvm::SmallVectorImpl &ConvertedConstraints, const MultiLevelTemplateArgumentList &TemplateArgList, SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction); @@ -14662,7 +14663,7 @@ class Sema final : public SemaBase { const NormalizedConstraint *getNormalizedAssociatedConstraints( const NamedDecl *ConstrainedDecl, - ArrayRef AssociatedConstraints); + ArrayRef AssociatedConstraints); /// \brief Check whether the given declaration's associated constraints are /// at least as constrained than another declaration's according to the @@ -14673,17 +14674,18 @@ class Sema final : public SemaBase { /// /// \returns true if an error occurred, false otherwise. 
bool IsAtLeastAsConstrained(const NamedDecl *D1, - MutableArrayRef AC1, + MutableArrayRef AC1, const NamedDecl *D2, - MutableArrayRef AC2, bool &Result); + MutableArrayRef AC2, + bool &Result); /// If D1 was not at least as constrained as D2, but would've been if a pair /// of atomic constraints involved had been declared in a concept and not /// repeated in two separate places in code. /// \returns true if such a diagnostic was emitted, false otherwise. bool MaybeEmitAmbiguousAtomicConstraintsDiagnostic( - const NamedDecl *D1, ArrayRef AC1, const NamedDecl *D2, - ArrayRef AC2); + const NamedDecl *D1, ArrayRef AC1, + const NamedDecl *D2, ArrayRef AC2); private: /// Caches pairs of template-like decls whose associated constraints were diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h index cbb3720c30ee2..648a9c51ae6c1 100644 --- a/clang/include/clang/Sema/SemaConcept.h +++ b/clang/include/clang/Sema/SemaConcept.h @@ -114,7 +114,8 @@ struct NormalizedConstraint { private: static std::optional - fromConstraintExprs(Sema &S, const NamedDecl *D, ArrayRef E); + fromAssociatedConstraints(Sema &S, const NamedDecl *D, + ArrayRef ACs); static std::optional fromConstraintExpr(Sema &S, const NamedDecl *D, const Expr *E); }; @@ -138,7 +139,7 @@ struct alignas(ConstraintAlignment) FoldExpandedConstraint { const NormalizedConstraint *getNormalizedAssociatedConstraints( Sema &S, const NamedDecl *ConstrainedDecl, - ArrayRef AssociatedConstraints); + ArrayRef AssociatedConstraints); /// \brief SubsumptionChecker establishes subsumption /// between two set of constraints. @@ -149,8 +150,10 @@ class SubsumptionChecker { SubsumptionChecker(Sema &SemaRef, SubsumptionCallable Callable = {}); - std::optional Subsumes(const NamedDecl *DP, ArrayRef P, - const NamedDecl *DQ, ArrayRef Q); + std::optional Subsumes(const NamedDecl *DP, + ArrayRef P, + const NamedDecl *DQ, + ArrayRef Q); bool Subsumes(const NormalizedConstraint *P, const NormalizedConstraint *Q); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 089d01839e1cf..552b5823add36 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5447,11 +5447,10 @@ QualType ASTContext::getHLSLAttributedResourceType( /// Retrieve a substitution-result type. 
QualType ASTContext::getSubstTemplateTypeParmType( QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, - SubstTemplateTypeParmTypeFlag Flag) const { + std::optional PackIndex) const { llvm::FoldingSetNodeID ID; SubstTemplateTypeParmType::Profile(ID, Replacement, AssociatedDecl, Index, - PackIndex, Flag); + PackIndex); void *InsertPos = nullptr; SubstTemplateTypeParmType *SubstParm = SubstTemplateTypeParmTypes.FindNodeOrInsertPos(ID, InsertPos); @@ -5461,7 +5460,7 @@ QualType ASTContext::getSubstTemplateTypeParmType( !Replacement.isCanonical()), alignof(SubstTemplateTypeParmType)); SubstParm = new (Mem) SubstTemplateTypeParmType(Replacement, AssociatedDecl, - Index, PackIndex, Flag); + Index, PackIndex); Types.push_back(SubstParm); SubstTemplateTypeParmTypes.InsertNode(SubstParm, InsertPos); } diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 9a84e402e3d69..81acb013b0f7d 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1631,8 +1631,8 @@ ExpectedType ASTNodeImporter::VisitSubstTemplateTypeParmType( return ToReplacementTypeOrErr.takeError(); return Importer.getToContext().getSubstTemplateTypeParmType( - *ToReplacementTypeOrErr, *ReplacedOrErr, T->getIndex(), T->getPackIndex(), - T->getSubstitutionFlag()); + *ToReplacementTypeOrErr, *ReplacedOrErr, T->getIndex(), + T->getPackIndex()); } ExpectedType ASTNodeImporter::VisitSubstTemplateTypeParmPackType( @@ -5975,7 +5975,8 @@ ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { if (Err) return std::move(Err); - ToD->setTypeConstraint(ToConceptRef, ToIDC); + ToD->setTypeConstraint(ToConceptRef, ToIDC, + TC->getArgumentPackSubstitutionIndex()); } if (Error Err = importTemplateParameterDefaultArgument(D, ToD)) diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index c0f5be51db5f3..8f6916aeb4bd6 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -223,20 +223,21 @@ static bool AdoptTemplateParameterList(TemplateParameterList *Params, return Invalid; } -void TemplateParameterList:: -getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { +void TemplateParameterList::getAssociatedConstraints( + llvm::SmallVectorImpl &ACs) const { if (HasConstrainedParameters) for (const NamedDecl *Param : *this) { if (const auto *TTP = dyn_cast(Param)) { if (const auto *TC = TTP->getTypeConstraint()) - AC.push_back(TC->getImmediatelyDeclaredConstraint()); + ACs.emplace_back(TC->getImmediatelyDeclaredConstraint(), + TC->getArgumentPackSubstitutionIndex()); } else if (const auto *NTTP = dyn_cast(Param)) { if (const Expr *E = NTTP->getPlaceholderTypeConstraint()) - AC.push_back(E); + ACs.emplace_back(E); } } if (HasRequiresClause) - AC.push_back(getRequiresClause()); + ACs.emplace_back(getRequiresClause()); } bool TemplateParameterList::hasAssociatedConstraints() const { @@ -286,12 +287,12 @@ TemplateDecl::TemplateDecl(Kind DK, DeclContext *DC, SourceLocation L, void TemplateDecl::anchor() {} -void TemplateDecl:: -getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { - TemplateParams->getAssociatedConstraints(AC); +void TemplateDecl::getAssociatedConstraints( + llvm::SmallVectorImpl &ACs) const { + TemplateParams->getAssociatedConstraints(ACs); if (auto *FD = dyn_cast_or_null(getTemplatedDecl())) if (const Expr *TRC = FD->getTrailingRequiresClause()) - AC.push_back(TRC); + ACs.emplace_back(TRC); } bool TemplateDecl::hasAssociatedConstraints() const { @@ -748,14 +749,15 @@ bool 
TemplateTypeParmDecl::isParameterPack() const { } void TemplateTypeParmDecl::setTypeConstraint( - ConceptReference *Loc, Expr *ImmediatelyDeclaredConstraint) { + ConceptReference *Loc, Expr *ImmediatelyDeclaredConstraint, + int ArgumentPackSubstitutionIndex) { assert(HasTypeConstraint && "HasTypeConstraint=true must be passed at construction in order to " "call setTypeConstraint"); assert(!TypeConstraintInitialized && "TypeConstraint was already initialized!"); - new (getTrailingObjects()) - TypeConstraint(Loc, ImmediatelyDeclaredConstraint); + new (getTrailingObjects()) TypeConstraint( + Loc, ImmediatelyDeclaredConstraint, ArgumentPackSubstitutionIndex); TypeConstraintInitialized = true; } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 9fda02b430e48..667ffc0e599a6 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -4263,7 +4263,7 @@ static const TemplateTypeParmDecl *getReplacedParameter(Decl *D, SubstTemplateTypeParmType::SubstTemplateTypeParmType( QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, SubstTemplateTypeParmTypeFlag Flag) + std::optional PackIndex) : Type(SubstTemplateTypeParm, Replacement.getCanonicalType(), Replacement->getDependence()), AssociatedDecl(AssociatedDecl) { @@ -4274,10 +4274,6 @@ SubstTemplateTypeParmType::SubstTemplateTypeParmType( SubstTemplateTypeParmTypeBits.Index = Index; SubstTemplateTypeParmTypeBits.PackIndex = PackIndex ? *PackIndex + 1 : 0; - SubstTemplateTypeParmTypeBits.SubstitutionFlag = llvm::to_underlying(Flag); - assert((Flag != SubstTemplateTypeParmTypeFlag::ExpandPacksInPlace || - PackIndex) && - "ExpandPacksInPlace needs a valid PackIndex"); assert(AssociatedDecl != nullptr); } diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 54cafc2010f09..44a49a6e3148e 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -5463,8 +5463,9 @@ class ConceptInfo { // that T is attached to in order to gather the relevant constraints. ConceptInfo(const TemplateTypeParmType &BaseType, Scope *S) { auto *TemplatedEntity = getTemplatedEntity(BaseType.getDecl(), S); - for (const Expr *E : constraintsForTemplatedEntity(TemplatedEntity)) - believe(E, &BaseType); + for (const AssociatedConstraint &AC : + constraintsForTemplatedEntity(TemplatedEntity)) + believe(AC.ConstraintExpr, &BaseType); } std::vector members() { @@ -5696,9 +5697,9 @@ class ConceptInfo { // Gets all the type constraint expressions that might apply to the type // variables associated with DC (as returned by getTemplatedEntity()). - static SmallVector + static SmallVector constraintsForTemplatedEntity(DeclContext *DC) { - SmallVector Result; + SmallVector Result; if (DC == nullptr) return Result; // Primary templates can have constraints. 
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index ebee5994bfed2..e6117f97ad1f4 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -567,11 +567,12 @@ static ExprResult calculateConstraintSatisfaction( } static bool CheckConstraintSatisfaction( - Sema &S, const NamedDecl *Template, ArrayRef ConstraintExprs, + Sema &S, const NamedDecl *Template, + ArrayRef AssociatedConstraints, llvm::SmallVectorImpl &Converted, const MultiLevelTemplateArgumentList &TemplateArgsLists, SourceRange TemplateIDRange, ConstraintSatisfaction &Satisfaction) { - if (ConstraintExprs.empty()) { + if (AssociatedConstraints.empty()) { Satisfaction.IsSatisfied = true; return false; } @@ -592,10 +593,12 @@ static bool CheckConstraintSatisfaction( if (Inst.isInvalid()) return true; - for (const Expr *ConstraintExpr : ConstraintExprs) { + for (const AssociatedConstraint &AC : AssociatedConstraints) { + Sema::ArgumentPackSubstitutionIndexRAII _(S, + AC.ArgumentPackSubstitutionIndex); ExprResult Res = calculateConstraintSatisfaction( S, Template, TemplateIDRange.getBegin(), TemplateArgsLists, - ConstraintExpr, Satisfaction); + AC.ConstraintExpr, Satisfaction); if (Res.isInvalid()) return true; @@ -603,7 +606,8 @@ static bool CheckConstraintSatisfaction( if (!Satisfaction.IsSatisfied) { // Backfill the 'converted' list with nulls so we can keep the Converted // and unconverted lists in sync. - Converted.append(ConstraintExprs.size() - Converted.size(), nullptr); + Converted.append(AssociatedConstraints.size() - Converted.size(), + nullptr); // [temp.constr.op] p2 // [...] To determine if a conjunction is satisfied, the satisfaction // of the first operand is checked. If that is not satisfied, the @@ -615,17 +619,18 @@ static bool CheckConstraintSatisfaction( } bool Sema::CheckConstraintSatisfaction( - const NamedDecl *Template, ArrayRef ConstraintExprs, + const NamedDecl *Template, + ArrayRef AssociatedConstraints, llvm::SmallVectorImpl &ConvertedConstraints, const MultiLevelTemplateArgumentList &TemplateArgsLists, SourceRange TemplateIDRange, ConstraintSatisfaction &OutSatisfaction) { - if (ConstraintExprs.empty()) { + if (AssociatedConstraints.empty()) { OutSatisfaction.IsSatisfied = true; return false; } if (!Template) { return ::CheckConstraintSatisfaction( - *this, nullptr, ConstraintExprs, ConvertedConstraints, + *this, nullptr, AssociatedConstraints, ConvertedConstraints, TemplateArgsLists, TemplateIDRange, OutSatisfaction); } // Invalid templates could make their way here. Substituting them could result @@ -654,7 +659,7 @@ bool Sema::CheckConstraintSatisfaction( auto Satisfaction = std::make_unique(Template, FlattenedArgs); - if (::CheckConstraintSatisfaction(*this, Template, ConstraintExprs, + if (::CheckConstraintSatisfaction(*this, Template, AssociatedConstraints, ConvertedConstraints, TemplateArgsLists, TemplateIDRange, *Satisfaction)) { OutSatisfaction = *Satisfaction; @@ -923,8 +928,10 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD, ForOverloadResolution); return CheckConstraintSatisfaction( - FD, {FD->getTrailingRequiresClause()}, *MLTAL, - SourceRange(UsageLoc.isValid() ? UsageLoc : FD->getLocation()), + FD, + AssociatedConstraint(FD->getTrailingRequiresClause(), + ArgumentPackSubstitutionIndex), + *MLTAL, SourceRange(UsageLoc.isValid() ? 
UsageLoc : FD->getLocation()), Satisfaction); } @@ -1099,13 +1106,13 @@ bool Sema::FriendConstraintsDependOnEnclosingTemplate(const FunctionDecl *FD) { assert(FD->getDescribedFunctionTemplate() && "Non-function templates don't need to be checked"); - SmallVector ACs; + SmallVector ACs; FD->getDescribedFunctionTemplate()->getAssociatedConstraints(ACs); unsigned OldTemplateDepth = CalculateTemplateDepthForConstraints(*this, FD); - for (const Expr *Constraint : ACs) + for (const AssociatedConstraint &AC : ACs) if (ConstraintExpressionDependsOnEnclosingTemplate(FD, OldTemplateDepth, - Constraint)) + AC.ConstraintExpr)) return true; return false; @@ -1115,7 +1122,7 @@ bool Sema::EnsureTemplateArgumentListConstraints( TemplateDecl *TD, const MultiLevelTemplateArgumentList &TemplateArgsLists, SourceRange TemplateIDRange) { ConstraintSatisfaction Satisfaction; - llvm::SmallVector AssociatedConstraints; + llvm::SmallVector AssociatedConstraints; TD->getAssociatedConstraints(AssociatedConstraints); if (CheckConstraintSatisfaction(TD, AssociatedConstraints, TemplateArgsLists, TemplateIDRange, Satisfaction)) @@ -1146,7 +1153,7 @@ bool Sema::CheckInstantiatedFunctionTemplateConstraints( FunctionTemplateDecl *Template = Decl->getPrimaryTemplate(); // Note - code synthesis context for the constraints check is created // inside CheckConstraintsSatisfaction. - SmallVector TemplateAC; + SmallVector TemplateAC; Template->getAssociatedConstraints(TemplateAC); if (TemplateAC.empty()) { Satisfaction.IsSatisfied = true; @@ -1438,7 +1445,7 @@ void Sema::DiagnoseUnsatisfiedConstraint( const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints( const NamedDecl *ConstrainedDecl, - ArrayRef AssociatedConstraints) { + ArrayRef AssociatedConstraints) { // In case the ConstrainedDecl comes from modules, it is necessary to use // the canonical decl to avoid different atomic constraints with the 'same' // declarations. 
@@ -1446,9 +1453,8 @@ const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints( auto CacheEntry = NormalizationCache.find(ConstrainedDecl); if (CacheEntry == NormalizationCache.end()) { - auto Normalized = - NormalizedConstraint::fromConstraintExprs(*this, ConstrainedDecl, - AssociatedConstraints); + auto Normalized = NormalizedConstraint::fromAssociatedConstraints( + *this, ConstrainedDecl, AssociatedConstraints); CacheEntry = NormalizationCache .try_emplace(ConstrainedDecl, @@ -1463,7 +1469,7 @@ const NormalizedConstraint *Sema::getNormalizedAssociatedConstraints( const NormalizedConstraint *clang::getNormalizedAssociatedConstraints( Sema &S, const NamedDecl *ConstrainedDecl, - ArrayRef AssociatedConstraints) { + ArrayRef AssociatedConstraints) { return S.getNormalizedAssociatedConstraints(ConstrainedDecl, AssociatedConstraints); } @@ -1593,14 +1599,14 @@ NormalizedConstraint &NormalizedConstraint::getRHS() const { } std::optional -NormalizedConstraint::fromConstraintExprs(Sema &S, const NamedDecl *D, - ArrayRef E) { - assert(E.size() != 0); - auto Conjunction = fromConstraintExpr(S, D, E[0]); +NormalizedConstraint::fromAssociatedConstraints( + Sema &S, const NamedDecl *D, ArrayRef ACs) { + assert(ACs.size() != 0); + auto Conjunction = fromConstraintExpr(S, D, ACs[0].ConstraintExpr); if (!Conjunction) return std::nullopt; - for (unsigned I = 1; I < E.size(); ++I) { - auto Next = fromConstraintExpr(S, D, E[I]); + for (unsigned I = 1; I < ACs.size(); ++I) { + auto Next = fromConstraintExpr(S, D, ACs[I].ConstraintExpr); if (!Next) return std::nullopt; *Conjunction = NormalizedConstraint(S.Context, std::move(*Conjunction), @@ -1655,8 +1661,8 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, const NamedDecl *D, // expression, the program is ill-formed; no diagnostic is required. // [...] ConceptDecl *CD = CSE->getNamedConcept(); - SubNF = S.getNormalizedAssociatedConstraints(CD, - {CD->getConstraintExpr()}); + SubNF = S.getNormalizedAssociatedConstraints( + CD, AssociatedConstraint(CD->getConstraintExpr())); if (!SubNF) return std::nullopt; } @@ -1731,9 +1737,9 @@ bool FoldExpandedConstraint::AreCompatibleForSubsumption( } bool Sema::IsAtLeastAsConstrained(const NamedDecl *D1, - MutableArrayRef AC1, + MutableArrayRef AC1, const NamedDecl *D2, - MutableArrayRef AC2, + MutableArrayRef AC2, bool &Result) { #ifndef NDEBUG if (const auto *FD1 = dyn_cast(D1)) { @@ -1771,13 +1777,15 @@ bool Sema::IsAtLeastAsConstrained(const NamedDecl *D1, for (size_t I = 0; I != AC1.size() && I != AC2.size(); ++I) { if (Depth2 > Depth1) { - AC1[I] = AdjustConstraintDepth(*this, Depth2 - Depth1) - .TransformExpr(const_cast(AC1[I])) - .get(); + AC1[I].ConstraintExpr = + AdjustConstraintDepth(*this, Depth2 - Depth1) + .TransformExpr(const_cast(AC1[I].ConstraintExpr)) + .get(); } else if (Depth1 > Depth2) { - AC2[I] = AdjustConstraintDepth(*this, Depth1 - Depth2) - .TransformExpr(const_cast(AC2[I])) - .get(); + AC2[I].ConstraintExpr = + AdjustConstraintDepth(*this, Depth1 - Depth2) + .TransformExpr(const_cast(AC2[I].ConstraintExpr)) + .get(); } } @@ -1793,9 +1801,8 @@ bool Sema::IsAtLeastAsConstrained(const NamedDecl *D1, } bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic( - const NamedDecl *D1, ArrayRef AC1, const NamedDecl *D2, - ArrayRef AC2) { - + const NamedDecl *D1, ArrayRef AC1, + const NamedDecl *D2, ArrayRef AC2) { if (isSFINAEContext()) // No need to work here because our notes would be discarded. 
return false; @@ -2106,10 +2113,9 @@ void SubsumptionChecker::AddUniqueClauseToFormula(Formula &F, Clause C) { F.push_back(C); } -std::optional SubsumptionChecker::Subsumes(const NamedDecl *DP, - ArrayRef P, - const NamedDecl *DQ, - ArrayRef Q) { +std::optional SubsumptionChecker::Subsumes( + const NamedDecl *DP, ArrayRef P, const NamedDecl *DQ, + ArrayRef Q) { const NormalizedConstraint *PNormalized = getNormalizedAssociatedConstraints(SemaRef, DP, P); if (!PNormalized) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 2246f0f1b3121..bbefbbf294dd1 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -19130,8 +19130,9 @@ static void SetEligibleMethods(Sema &S, CXXRecordDecl *Record, AnotherMethodIsMoreConstrained = true; break; } - if (S.IsAtLeastAsConstrained(OtherMethod, {OtherConstraints}, OrigMethod, - {Constraints}, + AssociatedConstraint Other(OtherConstraints); + AssociatedConstraint Orig(Constraints); + if (S.IsAtLeastAsConstrained(OtherMethod, {Other}, OrigMethod, {Orig}, AnotherMethodIsMoreConstrained)) { // There was an error with the constraints comparison. Exit the loop // and don't consider this function eligible. diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 19fd51134d160..fa492bc124abd 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -9551,7 +9551,8 @@ concepts::NestedRequirement * Sema::BuildNestedRequirement(Expr *Constraint) { ConstraintSatisfaction Satisfaction; if (!Constraint->isInstantiationDependent() && - CheckConstraintSatisfaction(nullptr, {Constraint}, /*TemplateArgs=*/{}, + CheckConstraintSatisfaction(nullptr, AssociatedConstraint(Constraint), + /*TemplateArgs=*/{}, Constraint->getSourceRange(), Satisfaction)) return nullptr; return new (Context) concepts::NestedRequirement(Context, Constraint, diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 6d8006b35dcf4..1802f8f4e1f91 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -11280,12 +11280,12 @@ MaybeDiagnoseAmbiguousConstraints(Sema &S, ArrayRef Cands) { // source-level construct. This behavior is quite confusing and we should try // to help the user figure out what happened. - SmallVector FirstAC, SecondAC; + SmallVector FirstAC, SecondAC; FunctionDecl *FirstCand = nullptr, *SecondCand = nullptr; for (auto I = Cands.begin(), E = Cands.end(); I != E; ++I) { if (!I->Function) continue; - SmallVector AC; + SmallVector AC; if (auto *Template = I->Function->getPrimaryTemplate()) Template->getAssociatedConstraints(AC); else diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index de2b1fdbc44e2..eace9b87a5bfe 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -1133,8 +1133,7 @@ bool Sema::BuildTypeConstraint(const CXXScopeSpec &SS, SS.isSet() ? SS.getWithLocInContext(Context) : NestedNameSpecifierLoc(), ConceptName, CD, /*FoundDecl=*/USD ? cast(USD) : CD, TypeConstr->LAngleLoc.isValid() ? &TemplateArgs : nullptr, - ConstrainedParameter, Context.getTypeDeclType(ConstrainedParameter), - EllipsisLoc); + ConstrainedParameter, EllipsisLoc); } template @@ -1191,7 +1190,6 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS, ConceptDecl *NamedConcept, NamedDecl *FoundDecl, const TemplateArgumentListInfo *TemplateArgs, TemplateTypeParmDecl *ConstrainedParameter, - QualType ConstrainedType, SourceLocation EllipsisLoc) { // C++2a [temp.param]p4: // [...] 
If Q is of the form C, then let E' be @@ -1200,7 +1198,7 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS, TemplateArgs ? ASTTemplateArgumentListInfo::Create(Context, *TemplateArgs) : nullptr; - QualType ParamAsArgument = ConstrainedType; + QualType ParamAsArgument(ConstrainedParameter->getTypeForDecl(), 0); ExprResult ImmediatelyDeclaredConstraint = formImmediatelyDeclaredConstraint( *this, NS, NameInfo, NamedConcept, FoundDecl, @@ -1223,7 +1221,8 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS, /*NamedConcept=*/NamedConcept, /*ArgsWritten=*/ArgsAsWritten); ConstrainedParameter->setTypeConstraint(CL, - ImmediatelyDeclaredConstraint.get()); + ImmediatelyDeclaredConstraint.get(), + /*ArgumentPackSubstitutionIndex=*/-1); return false; } @@ -4062,7 +4061,7 @@ static void checkMoreSpecializedThanPrimary(Sema &S, PartialSpecDecl *Partial) { } S.NoteTemplateLocation(*Template); - SmallVector PartialAC, TemplateAC; + SmallVector PartialAC, TemplateAC; Template->getAssociatedConstraints(TemplateAC); Partial->getAssociatedConstraints(PartialAC); S.MaybeEmitAmbiguousAtomicConstraintsDiagnostic(Partial, PartialAC, Template, @@ -4604,7 +4603,8 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS, if (!AreArgsDependent && CheckConstraintSatisfaction( - NamedConcept, {NamedConcept->getConstraintExpr()}, MLTAL, + NamedConcept, AssociatedConstraint(NamedConcept->getConstraintExpr()), + MLTAL, SourceRange(SS.isSet() ? SS.getBeginLoc() : ConceptNameInfo.getLoc(), TemplateArgs->getRAngleLoc()), Satisfaction)) @@ -7432,7 +7432,7 @@ bool Sema::CheckTemplateTemplateArgument(TemplateTemplateParmDecl *Param, // C++20[temp.func.order]p2 // [...] If both deductions succeed, the partial ordering selects the // more constrained template (if one exists) as determined below. - SmallVector ParamsAC, TemplateAC; + SmallVector ParamsAC, TemplateAC; Params->getAssociatedConstraints(ParamsAC); // C++20[temp.arg.template]p3 // [...] 
In this comparison, if P is unconstrained, the constraints on A diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index b39eb8fd5512e..9969f1762fe36 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3255,7 +3255,7 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, ArrayRef SugaredDeducedArgs, ArrayRef CanonicalDeducedArgs, TemplateDeductionInfo &Info) { - llvm::SmallVector AssociatedConstraints; + llvm::SmallVector AssociatedConstraints; Template->getAssociatedConstraints(AssociatedConstraints); std::optional> Innermost; @@ -5245,9 +5245,9 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, ImplicitConceptSpecializationDecl::Create( S.getASTContext(), Concept->getDeclContext(), Concept->getLocation(), CTAI.CanonicalConverted)); - if (S.CheckConstraintSatisfaction(Concept, {Concept->getConstraintExpr()}, - MLTAL, TypeLoc.getLocalSourceRange(), - Satisfaction)) + if (S.CheckConstraintSatisfaction( + Concept, AssociatedConstraint(Concept->getConstraintExpr()), MLTAL, + TypeLoc.getLocalSourceRange(), Satisfaction)) return true; if (!Satisfaction.IsSatisfied) { std::string Buf; @@ -6121,7 +6121,7 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( !Context.hasSameType(FD1->getReturnType(), FD2->getReturnType())) return nullptr; - llvm::SmallVector AC1, AC2; + llvm::SmallVector AC1, AC2; FT1->getAssociatedConstraints(AC1); FT2->getAssociatedConstraints(AC2); bool AtLeastAsConstrained1, AtLeastAsConstrained2; @@ -6226,7 +6226,7 @@ FunctionDecl *Sema::getMoreConstrainedFunction(FunctionDecl *FD1, if (FunctionDecl *P = FD2->getTemplateInstantiationPattern(false)) F2 = P; - llvm::SmallVector AC1, AC2; + llvm::SmallVector AC1, AC2; F1->getAssociatedConstraints(AC1); F2->getAssociatedConstraints(AC2); bool AtLeastAsConstrained1, AtLeastAsConstrained2; @@ -6458,7 +6458,7 @@ getMoreSpecialized(Sema &S, QualType T1, QualType T2, TemplateLikeDecl *P1, if (!TemplateArgumentListAreEqual(S.getASTContext())(P1, P2)) return nullptr; - llvm::SmallVector AC1, AC2; + llvm::SmallVector AC1, AC2; P1->getAssociatedConstraints(AC1); P2->getAssociatedConstraints(AC2); bool AtLeastAsConstrained1, AtLeastAsConstrained2; diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 00dcadb41e8fb..9f5ca9dca8e89 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1717,24 +1717,6 @@ namespace { SubstTemplateTypeParmPackTypeLoc TL, bool SuppressObjCLifetime); - QualType - TransformSubstTemplateTypeParmType(TypeLocBuilder &TLB, - SubstTemplateTypeParmTypeLoc TL) { - const SubstTemplateTypeParmType *Type = TL.getTypePtr(); - if (Type->getSubstitutionFlag() != - SubstTemplateTypeParmTypeFlag::ExpandPacksInPlace) - return inherited::TransformSubstTemplateTypeParmType(TLB, TL); - - assert(Type->getPackIndex()); - TemplateArgument TA = TemplateArgs( - Type->getReplacedParameter()->getDepth(), Type->getIndex()); - assert(*Type->getPackIndex() + 1 <= TA.pack_size()); - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex( - SemaRef, TA.pack_size() - 1 - *Type->getPackIndex()); - - return inherited::TransformSubstTemplateTypeParmType(TLB, TL); - } - CXXRecordDecl::LambdaDependencyKind ComputeLambdaDependency(LambdaScopeInfo *LSI) { if (auto TypeAlias = @@ -2894,8 +2876,11 @@ TemplateInstantiator::TransformNestedRequirement( return nullptr; llvm::SmallVector Result; if 
(!SemaRef.CheckConstraintSatisfaction( - nullptr, {Req->getConstraintExpr()}, Result, TemplateArgs, - Req->getConstraintExpr()->getSourceRange(), Satisfaction) && + nullptr, + AssociatedConstraint(Req->getConstraintExpr(), + SemaRef.ArgumentPackSubstitutionIndex), + Result, TemplateArgs, Req->getConstraintExpr()->getSourceRange(), + Satisfaction) && !Result.empty()) TransConstraint = Result[0]; assert(!Trap.hasErrorOccurred() && "Substitution failures must be handled " @@ -3169,68 +3154,6 @@ namespace { } // namespace -namespace { - -struct ExpandPackedTypeConstraints - : TreeTransform { - - using inherited = TreeTransform; - - const MultiLevelTemplateArgumentList &TemplateArgs; - - ExpandPackedTypeConstraints( - Sema &SemaRef, const MultiLevelTemplateArgumentList &TemplateArgs) - : inherited(SemaRef), TemplateArgs(TemplateArgs) {} - - using inherited::TransformTemplateTypeParmType; - - QualType TransformTemplateTypeParmType(TypeLocBuilder &TLB, - TemplateTypeParmTypeLoc TL, bool) { - const TemplateTypeParmType *T = TL.getTypePtr(); - if (!T->isParameterPack()) { - TemplateTypeParmTypeLoc NewTL = - TLB.push(TL.getType()); - NewTL.setNameLoc(TL.getNameLoc()); - return TL.getType(); - } - - assert(SemaRef.ArgumentPackSubstitutionIndex != -1); - - TemplateArgument Arg = TemplateArgs(T->getDepth(), T->getIndex()); - - std::optional PackIndex; - if (Arg.getKind() == TemplateArgument::Pack) - PackIndex = Arg.pack_size() - 1 - SemaRef.ArgumentPackSubstitutionIndex; - - QualType Result = SemaRef.Context.getSubstTemplateTypeParmType( - TL.getType(), T->getDecl(), T->getIndex(), PackIndex, - SubstTemplateTypeParmTypeFlag::ExpandPacksInPlace); - SubstTemplateTypeParmTypeLoc NewTL = - TLB.push(Result); - NewTL.setNameLoc(TL.getNameLoc()); - return Result; - } - - QualType TransformSubstTemplateTypeParmType(TypeLocBuilder &TLB, - SubstTemplateTypeParmTypeLoc TL) { - const SubstTemplateTypeParmType *T = TL.getTypePtr(); - if (T->getPackIndex()) { - SubstTemplateTypeParmTypeLoc TypeLoc = - TLB.push(TL.getType()); - TypeLoc.setNameLoc(TL.getNameLoc()); - return TypeLoc.getType(); - } - return inherited::TransformSubstTemplateTypeParmType(TLB, TL); - } - - bool SubstTemplateArguments(ArrayRef Args, - TemplateArgumentListInfo &Out) { - return inherited::TransformTemplateArguments(Args.begin(), Args.end(), Out); - } -}; - -} // namespace - bool Sema::SubstTypeConstraint( TemplateTypeParmDecl *Inst, const TypeConstraint *TC, const MultiLevelTemplateArgumentList &TemplateArgs, @@ -3239,61 +3162,11 @@ bool Sema::SubstTypeConstraint( TC->getTemplateArgsAsWritten(); if (!EvaluateConstraints) { - bool ShouldExpandExplicitTemplateArgs = - TemplArgInfo && ArgumentPackSubstitutionIndex != -1 && - llvm::any_of(TemplArgInfo->arguments(), [](auto &Arg) { - return Arg.getArgument().containsUnexpandedParameterPack(); - }); - - // We want to transform the packs into Subst* nodes for type constraints - // inside a pack expansion. For example, - // - // template void foo() { - // bar([](C auto value) {}...); - // } - // - // As we expand Ts in the process of instantiating foo(), and retain - // the original template depths of Ts until the constraint evaluation, we - // would otherwise have no chance to expand Ts by the time of evaluating - // C. - // - // So we form a Subst* node for Ts along with a proper substitution index - // here, and substitute the node with a complete MLTAL later in evaluation. 
- if (ShouldExpandExplicitTemplateArgs) { - TemplateArgumentListInfo InstArgs; - InstArgs.setLAngleLoc(TemplArgInfo->LAngleLoc); - InstArgs.setRAngleLoc(TemplArgInfo->RAngleLoc); - if (ExpandPackedTypeConstraints(*this, TemplateArgs) - .SubstTemplateArguments(TemplArgInfo->arguments(), InstArgs)) - return true; - - // The type of the original parameter. - auto *ConstraintExpr = TC->getImmediatelyDeclaredConstraint(); - QualType ConstrainedType; - - if (auto *FE = dyn_cast(ConstraintExpr)) { - assert(FE->getLHS()); - ConstraintExpr = FE->getLHS(); - } - auto *CSE = cast(ConstraintExpr); - assert(!CSE->getTemplateArguments().empty() && - "Empty template arguments?"); - ConstrainedType = CSE->getTemplateArguments()[0].getAsType(); - assert(!ConstrainedType.isNull() && - "Failed to extract the original ConstrainedType?"); - - return AttachTypeConstraint( - TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(), - TC->getNamedConcept(), - /*FoundDecl=*/TC->getConceptReference()->getFoundDecl(), &InstArgs, - Inst, ConstrainedType, - Inst->isParameterPack() - ? cast(TC->getImmediatelyDeclaredConstraint()) - ->getEllipsisLoc() - : SourceLocation()); - } + auto Index = TC->getArgumentPackSubstitutionIndex(); + if (Index == -1) + Index = SemaRef.ArgumentPackSubstitutionIndex; Inst->setTypeConstraint(TC->getConceptReference(), - TC->getImmediatelyDeclaredConstraint()); + TC->getImmediatelyDeclaredConstraint(), Index); return false; } @@ -3310,7 +3183,6 @@ bool Sema::SubstTypeConstraint( TC->getNestedNameSpecifierLoc(), TC->getConceptNameInfo(), TC->getNamedConcept(), /*FoundDecl=*/TC->getConceptReference()->getFoundDecl(), &InstArgs, Inst, - Context.getTypeDeclType(Inst), Inst->isParameterPack() ? cast(TC->getImmediatelyDeclaredConstraint()) ->getEllipsisLoc() diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 776d6e55acc18..2df961a48c7c3 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -3052,9 +3052,7 @@ InventTemplateParameter(TypeProcessingState &state, QualType T, AutoLoc.getNestedNameSpecifierLoc(), AutoLoc.getConceptNameInfo(), AutoLoc.getNamedConcept(), /*FoundDecl=*/AutoLoc.getFoundDecl(), AutoLoc.hasExplicitTemplateArgs() ? &TAL : nullptr, - InventedTemplateParam, - S.Context.getTypeDeclType(InventedTemplateParam), - D.getEllipsisLoc()); + InventedTemplateParam, D.getEllipsisLoc()); } } else { // The 'auto' appears in the decl-specifiers; we've not finished forming @@ -3091,9 +3089,7 @@ InventTemplateParameter(TypeProcessingState &state, QualType T, /*FoundDecl=*/ USD ? cast(USD) : CD, TemplateId->LAngleLoc.isValid() ? 
&TemplateArgsInfo : nullptr,
-                         InventedTemplateParam,
-                         S.Context.getTypeDeclType(InventedTemplateParam),
-                         D.getEllipsisLoc());
+                         InventedTemplateParam, D.getEllipsisLoc());
       }
     }
   }
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index c3341e00bacef..77daeaee5dd1f 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -2706,8 +2706,10 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
     if (Record.readBool())
       CR = Record.readConceptReference();
     Expr *ImmediatelyDeclaredConstraint = Record.readExpr();
+    int ArgumentPackSubstitutionIndex = Record.readInt();
 
-    D->setTypeConstraint(CR, ImmediatelyDeclaredConstraint);
+    D->setTypeConstraint(CR, ImmediatelyDeclaredConstraint,
+                         ArgumentPackSubstitutionIndex);
     if ((D->ExpandedParameterPack = Record.readInt()))
       D->NumExpanded = Record.readInt();
   }
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index f377c145a4204..b896a04a0b14b 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -2036,6 +2036,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) {
     if (CR)
       Record.AddConceptReference(CR);
     Record.AddStmt(TC->getImmediatelyDeclaredConstraint());
+    Record.push_back(TC->getArgumentPackSubstitutionIndex());
     Record.push_back(D->isExpandedParameterPack());
     if (D->isExpandedParameterPack())
       Record.push_back(D->getNumExpansionParameters());
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 832ce15e66250..c863cc841af42 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -485,7 +485,7 @@ struct Out {
     A(T2);
   };
   A(int) -> A;
-  
+
   template
   using B = A;
 };
diff --git a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
index 69572bea3664a..106da7d0e2663 100644
--- a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
+++ b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
@@ -238,4 +238,51 @@ static_assert(bar()(123));
 // expected-note@#C {{evaluated to false}}
 // expected-note@#same_as 2{{evaluated to false}}
 
+template
+concept same_as_v = __is_same(T, decltype(U)); // #same_as_v
+
+template constexpr auto baz() {
+  return Overloaded{[](same_as_v auto value) { return value; }...}; // #baz
+}
+
+static_assert(baz<1, 1.>()(123) == 123);
+static_assert(baz<1, 1.>()(2.718) == 2.718);
+
+static_assert(baz<1, 1.>()('c'));
+// expected-error@-1 {{no matching function}}
+
+// expected-note@#baz {{constraints not satisfied}}
+// expected-note@#baz {{'same_as_v' evaluated to false}}
+// expected-note@#same_as_v {{evaluated to false}}
+
+// expected-note@#baz {{constraints not satisfied}}
+// expected-note@#baz {{'same_as_v' evaluated to false}}
+// expected-note@#same_as_v {{evaluated to false}}
+
+template constexpr auto bazz() {
+  return Overloaded{[](same_as_v auto value) { return Ts; }...}; // #bazz
+}
+
+static_assert(bazz<1, 2>()(1));
+// expected-error@-1 {{is ambiguous}}
+// expected-note@#bazz 2{{candidate function [with value:auto = int]}}
+
 } // namespace GH101754
+
+namespace GH131798 {
+  template
+  struct tuple { T0 elem0; };
+
+  template
+  concept C = true;
+
+  template
+  struct Foo {};
+
+  template
+  constexpr tuple fs{[] (C> auto) {}...};
+
+  int main() {
+    fs<0>.elem0(1);
+  }
+} // namespace GH131798
diff --git a/clang/unittests/AST/SourceLocationTest.cpp b/clang/unittests/AST/SourceLocationTest.cpp
index daea2d62fe496..5b461d1cf4400 100644
--- a/clang/unittests/AST/SourceLocationTest.cpp
+++ b/clang/unittests/AST/SourceLocationTest.cpp
@@ -1094,11 +1094,11 @@ class ConceptSpecializationExprConceptReferenceRangeVerifier
 protected:
   SourceRange getRange(const VarTemplateDecl &Node) override {
     assert(Node.hasAssociatedConstraints());
-    SmallVector ACs;
+    SmallVector ACs;
     Node.getAssociatedConstraints(ACs);
-    for (const Expr *Constraint : ACs) {
+    for (const AssociatedConstraint &AC : ACs) {
       if (const ConceptSpecializationExpr *CSConstraint =
-              dyn_cast(Constraint)) {
+              dyn_cast(AC.ConstraintExpr)) {
         return CSConstraint->getConceptReference()->getSourceRange();
       }
     }

From e25187bc3e1459b4eee6d0acd4e46b475a347f5d Mon Sep 17 00:00:00 2001
From: YunQiang Su
Date: Wed, 2 Apr 2025 08:46:02 +0800
Subject: [PATCH 0322/1029] LLVM/Test: Add vectorizing testcases for
 fmaximumnum and fminimumnum (#133843)

Vectorization of fmaximumnum and fminimumnum is not supported yet. Add
test cases for them now; we will update the tests once vectorization
support lands.

---
 .../LoopVectorize/AArch64/fminimumnum.ll      | 255 +++++++++
 .../LoopVectorize/RISCV/fminimumnum.ll        | 255 +++++++++
 .../LoopVectorize/X86/fminimumnum.ll          | 255 +++++++++
 .../SLPVectorizer/AArch64/fminimumnum.ll      | 516 ++++++++++++++++++
 .../SLPVectorizer/RISCV/fminimumnum.ll        | 516 ++++++++++++++++++
 .../SLPVectorizer/X86/fminimumnum.ll          | 510 +++++++++++++++++
 6 files changed, 2307 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
new file mode 100644
index 0000000000000..f5cc0f6e13785
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fminimumnum.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet.
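+; For context, a hedged sketch (illustrative value names, not autogenerated
+; CHECK output) of roughly what a vectorized loop body would contain once
+; support lands:
+;   %v1 = load <4 x float>, ptr %gep1, align 4
+;   %v2 = load <4 x float>, ptr %gep2, align 4
+;   %r  = call <4 x float> @llvm.minimumnum.v4f32(<4 x float> %v1, <4 x float> %v2)
+;   store <4 x float> %r, ptr %gep.out, align 4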
+; RUN: opt --passes=loop-vectorize --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s + +define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin32( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.minimumnum.f32(float [[IN1]], float [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv + %in1 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv + %in2 = load float, ptr %arrayidx2, align 4 + %out = tail call float @llvm.minimumnum.f32(float %in1, float %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv + store float %out, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare float @llvm.minimumnum.f32(float, float) + +define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmax32( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.maximumnum.f32(float [[IN1]], float [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; 
CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv + %in1 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv + %in2 = load float, ptr %arrayidx2, align 4 + %out = tail call float @llvm.maximumnum.f32(float %in1, float %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv + store float %out, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare float @llvm.maximumnum.f32(float, float) + +define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin64( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.minimumnum.f64(double [[IN1]], double [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv + %in1 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv + %in2 = load double, ptr %arrayidx2, align 8 + %out = tail call double @llvm.minimumnum.f64(double %in1, double %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv + store double %out, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare double @llvm.minimumnum.f64(double, double) + +define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { 
+; CHECK-LABEL: define void @fmax64( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.maximumnum.f64(double [[IN1]], double [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv + %in1 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv + %in2 = load double, ptr %arrayidx2, align 8 + %out = tail call double @llvm.maximumnum.f64(double %in1, double %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv + store double %out, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare double @llvm.maximumnum.f64(double, double) + +define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin16( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.minimumnum.f16(half [[IN1]], half [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
%[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv + %in1 = load half, ptr %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv + %in2 = load half, ptr %arrayidx2, align 2 + %out = tail call half @llvm.minimumnum.f16(half %in1, half %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv + store half %out, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare half @llvm.minimumnum.f16(half, half) + +define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmax16( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.maximumnum.f16(half [[IN1]], half [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv + %in1 = load half, ptr %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv + %in2 = load half, ptr %arrayidx2, align 2 + %out = tail call half @llvm.maximumnum.f16(half %in1, half %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv + store half %out, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare half @llvm.maximumnum.f16(half, half) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll new file mode 100644 index 0000000000000..b97fa2499cfd5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet. 
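+;
+; With +v and +zvfh, vectorization would be expected to use the scalable
+; vector overloads of these intrinsics once supported, e.g. (a sketch, not
+; autogenerated output; operand names are illustrative):
+;   %vr = call <vscale x 4 x float> @llvm.minimumnum.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb)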
+; RUN: opt --passes=loop-vectorize --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s + +define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin32( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.minimumnum.f32(float [[IN1]], float [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv + %in1 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv + %in2 = load float, ptr %arrayidx2, align 4 + %out = tail call float @llvm.minimumnum.f32(float %in1, float %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv + store float %out, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare float @llvm.minimumnum.f32(float, float) + +define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmax32( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.maximumnum.f32(float [[IN1]], float [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] 
+; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv + %in1 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv + %in2 = load float, ptr %arrayidx2, align 4 + %out = tail call float @llvm.maximumnum.f32(float %in1, float %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv + store float %out, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare float @llvm.maximumnum.f32(float, float) + +define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin64( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.minimumnum.f64(double [[IN1]], double [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv + %in1 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv + %in2 = load double, ptr %arrayidx2, align 8 + %out = tail call double @llvm.minimumnum.f64(double %in1, double %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv + store double %out, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare double @llvm.minimumnum.f64(double, double) + +define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) 
{ +; CHECK-LABEL: define void @fmax64( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.maximumnum.f64(double [[IN1]], double [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv + %in1 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv + %in2 = load double, ptr %arrayidx2, align 8 + %out = tail call double @llvm.maximumnum.f64(double %in1, double %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv + store double %out, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare double @llvm.maximumnum.f64(double, double) + +define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin16( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.minimumnum.f16(half [[IN1]], half [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
%[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv + %in1 = load half, ptr %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv + %in2 = load half, ptr %arrayidx2, align 2 + %out = tail call half @llvm.minimumnum.f16(half %in1, half %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv + store half %out, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare half @llvm.minimumnum.f16(half, half) + +define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmax16( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.maximumnum.f16(half [[IN1]], half [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv + %in1 = load half, ptr %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv + %in2 = load half, ptr %arrayidx2, align 2 + %out = tail call half @llvm.maximumnum.f16(half %in1, half %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv + store half %out, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare half @llvm.maximumnum.f16(half, half) diff --git a/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll new file mode 100644 index 0000000000000..bbb8c469d79b0 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/X86/fminimumnum.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; FIXME: fmaximumnum/fminimumnum have no vectorizing support yet. 
+; RUN: opt --passes=loop-vectorize --mtriple=x86_64 -S < %s | FileCheck %s + +define void @fmin32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin32( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.minimumnum.f32(float [[IN1]], float [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[OUT]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv + %in1 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv + %in2 = load float, ptr %arrayidx2, align 4 + %out = tail call float @llvm.minimumnum.f32(float %in1, float %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv + store float %out, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare float @llvm.minimumnum.f32(float, float) + +define void @fmax32(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmax32( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[OUT:%.*]] = tail call float @llvm.maximumnum.f32(float [[IN1]], float [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x float], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[OUT]], ptr 
[[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x float], ptr %input1, i64 0, i64 %iv + %in1 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw [4096 x float], ptr %input2, i64 0, i64 %iv + %in2 = load float, ptr %arrayidx2, align 4 + %out = tail call float @llvm.maximumnum.f32(float %in1, float %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x float], ptr %output, i64 0, i64 %iv + store float %out, ptr %arrayidx4, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare float @llvm.maximumnum.f32(float, float) + +define void @fmin64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin64( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.minimumnum.f64(double [[IN1]], double [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv + %in1 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv + %in2 = load double, ptr %arrayidx2, align 8 + %out = tail call double @llvm.minimumnum.f64(double %in1, double %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv + store double %out, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare double @llvm.minimumnum.f64(double, double) + +define void @fmax64(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmax64( +; 
CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[OUT:%.*]] = tail call double @llvm.maximumnum.f64(double [[IN1]], double [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x double], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store double [[OUT]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x double], ptr %input1, i64 0, i64 %iv + %in1 = load double, ptr %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds nuw [4096 x double], ptr %input2, i64 0, i64 %iv + %in2 = load double, ptr %arrayidx2, align 8 + %out = tail call double @llvm.maximumnum.f64(double %in1, double %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x double], ptr %output, i64 0, i64 %iv + store double %out, ptr %arrayidx4, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare double @llvm.maximumnum.f64(double, double) + +define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmin16( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.minimumnum.f16(half [[IN1]], half [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; 
CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv + %in1 = load half, ptr %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv + %in2 = load half, ptr %arrayidx2, align 2 + %out = tail call half @llvm.minimumnum.f16(half %in1, half %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv + store half %out, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare half @llvm.minimumnum.f16(half, half) + +define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef readonly captures(none) %input2, ptr noundef writeonly captures(none) %output) { +; CHECK-LABEL: define void @fmax16( +; CHECK-SAME: ptr noundef readonly captures(none) [[INPUT1:%.*]], ptr noundef readonly captures(none) [[INPUT2:%.*]], ptr noundef writeonly captures(none) [[OUTPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT1]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN1:%.*]] = load half, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[INPUT2]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: [[IN2:%.*]] = load half, ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[OUT:%.*]] = tail call half @llvm.maximumnum.f16(half [[IN1]], half [[IN2]]) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw [4096 x half], ptr [[OUTPUT]], i64 0, i64 [[INDVARS_IV]] +; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw [4096 x half], ptr %input1, i64 0, i64 %iv + %in1 = load half, ptr %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds nuw [4096 x half], ptr %input2, i64 0, i64 %iv + %in2 = load half, ptr %arrayidx2, align 2 + %out = tail call half @llvm.maximumnum.f16(half %in1, half %in2) + %arrayidx4 = getelementptr inbounds nuw [4096 x half], ptr %output, i64 0, i64 %iv + store half %out, ptr %arrayidx4, align 2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 4096 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + +declare half @llvm.maximumnum.f16(half, half) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll new file mode 100644 index 0000000000000..a29e711a84132 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/fminimumnum.ll @@ -0,0 +1,516 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt --passes=slp-vectorizer --mtriple=aarch64 -mattr="+neon" -S < %s | FileCheck %s + +@input1_f32 = global [9 x float] zeroinitializer, align 16 
+@input2_f32 = global [9 x float] zeroinitializer, align 16 +@output_f32 = global [9 x float] zeroinitializer, align 16 +@input1_f64 = global [9 x double] zeroinitializer, align 16 +@input2_f64 = global [9 x double] zeroinitializer, align 16 +@output_f64 = global [9 x double] zeroinitializer, align 16 +@input1_f16 = global [9 x half] zeroinitializer, align 16 +@input2_f16 = global [9 x half] zeroinitializer, align 16 +@output_f16 = global [9 x half] zeroinitializer, align 16 + +define void @fmin32() { +; CHECK-LABEL: define void @fmin32( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]]) +; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]]) +; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]]) +; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]]) +; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]]) +; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr 
getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]]) +; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]]) +; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load float, ptr @input1_f32, align 16 + %input0_1 = load float, ptr @input2_f32, align 16 + %output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1) + store float %output0, ptr @output_f32, align 16 + %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 + %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 + %output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2) + store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 + %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 + %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 + %output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2) + store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 + %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 + %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 + %output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2) + store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 + %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 + %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 + %output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2) + store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 + %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 + %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 + %output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2) + store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 + %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 + %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 + %output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2) + store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 + %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 + %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 + %output7 = tail call float 
@llvm.minimumnum.f32(float %input7_1, float %input7_2) + store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 + %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 + %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 + %output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2) + store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 + ret void +} + +declare float @llvm.minimumnum.f32(float, float) + +define void @fmax32() { +; CHECK-LABEL: define void @fmax32( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]]) +; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]]) +; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]]) +; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]]) +; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]]) +; CHECK-NEXT: store float [[TMP20]], ptr 
getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]]) +; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]]) +; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load float, ptr @input1_f32, align 16 + %input0_1 = load float, ptr @input2_f32, align 16 + %output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1) + store float %output0, ptr @output_f32, align 16 + %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 + %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 + %output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2) + store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 + %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 + %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 + %output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2) + store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 + %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 + %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 + %output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2) + store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 + %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 + %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 + %output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2) + store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 + %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 + %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 + %output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2) + store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 + %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 + %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 + %output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2) + store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 + %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 + %input7_2 = load float, 
ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 + %output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2) + store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 + %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 + %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 + %output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2) + store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 + ret void +} + +declare float @llvm.maximumnum.f32(float, float) + +define void @fmin64() { +; CHECK-LABEL: define void @fmin64( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]]) +; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]]) +; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]]) +; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]]) +; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 +; 
CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]]) +; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]]) +; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]]) +; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load double, ptr @input1_f64, align 16 + %input0_1 = load double, ptr @input2_f64, align 16 + %output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1) + store double %output0, ptr @output_f64, align 16 + %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 + %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 + %output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2) + store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 + %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 + %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 + %output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2) + store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 + %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 + %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 + %output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2) + store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 + %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 + %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 + %output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2) + store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 + %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 + %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 + %output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2) + store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 + %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 + %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 + %output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2) + store 
double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 + %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 + %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 + %output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2) + store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 + %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 + %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 + %output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2) + store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 + ret void +} + +declare double @llvm.minimumnum.f64(double, double) + +define void @fmax64() { +; CHECK-LABEL: define void @fmax64( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]]) +; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]]) +; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]]) +; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]]) +; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 +; CHECK-NEXT: 
[[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]]) +; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]]) +; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]]) +; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load double, ptr @input1_f64, align 16 + %input0_1 = load double, ptr @input2_f64, align 16 + %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1) + store double %output0, ptr @output_f64, align 16 + %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 + %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 + %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2) + store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 + %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 + %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 + %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2) + store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 + %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 + %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 + %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2) + store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 + %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 + %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 + %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2) + store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 + %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 + %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 + %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2) + store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 + %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr 
@input1_f64, i64 48), align 16 + %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 + %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2) + store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 + %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 + %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 + %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2) + store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 + %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 + %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 + %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2) + store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 + ret void +} + +declare double @llvm.maximumnum.f64(double, double) + +define void @fmin16() { +; CHECK-LABEL: define void @fmin16( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]]) +; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]]) +; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]]) +; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]]) +; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]]) +; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], 
half [[TMP16]]) +; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]]) +; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]]) +; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]]) +; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load half, ptr @input1_f16, align 16 + %input0_1 = load half, ptr @input2_f16, align 16 + %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1) + store half %output0, ptr @output_f16, align 16 + %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 + %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 + %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2) + store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 + %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 + %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 + %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2) + store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 + %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 + %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 + %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2) + store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 + %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 + %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 + %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2) + store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 + %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 + %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 + %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2) + store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 + %input6_1 = load half, ptr getelementptr inbounds nuw (i8, 
ptr @input1_f16, i64 12), align 4 + %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 + %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2) + store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 + %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 + %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 + %output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2) + store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 + %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 + %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 + %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2) + store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 + ret void +} + +declare half @llvm.minimumnum.f16(half, half) + +define void @fmax16() { +; CHECK-LABEL: define void @fmax16( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]]) +; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]]) +; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]]) +; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]]) +; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]]) +; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]]) +; CHECK-NEXT: store half 
[[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]]) +; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]]) +; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]]) +; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load half, ptr @input1_f16, align 16 + %input0_1 = load half, ptr @input2_f16, align 16 + %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1) + store half %output0, ptr @output_f16, align 16 + %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 + %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 + %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2) + store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 + %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 + %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 + %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2) + store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 + %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 + %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 + %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2) + store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 + %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 + %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 + %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2) + store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 + %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 + %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 + %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2) + store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 + %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 + 
%input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4
+ %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2)
+ store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4
+ %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2
+ %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2
+ %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2)
+ store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+ %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+ %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+ %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
+ store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+ ret void
+}
+
+declare half @llvm.maximumnum.f16(half, half)
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
new file mode 100644
index 0000000000000..920abfad776e0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/fminimumnum.ll
@@ -0,0 +1,516 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --passes=slp-vectorizer --mtriple=riscv64 -mattr="+zvfh,+v,+zfh" -S < %s | FileCheck %s
+
+@input1_f32 = global [9 x float] zeroinitializer, align 16
+@input2_f32 = global [9 x float] zeroinitializer, align 16
+@output_f32 = global [9 x float] zeroinitializer, align 16
+@input1_f64 = global [9 x double] zeroinitializer, align 16
+@input2_f64 = global [9 x double] zeroinitializer, align 16
+@output_f64 = global [9 x double] zeroinitializer, align 16
+@input1_f16 = global [9 x half] zeroinitializer, align 16
+@input2_f16 = global [9 x half] zeroinitializer, align 16
+@output_f16 = global [9 x half] zeroinitializer, align 16
+
+define void @fmin32() {
+; CHECK-LABEL: define void @fmin32(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16
+; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]])
+; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4
+; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]])
+; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr
@input2_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]]) +; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]]) +; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]]) +; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]]) +; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]]) +; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load float, ptr @input1_f32, align 16 + %input0_1 = load float, ptr @input2_f32, align 16 + %output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1) + store float %output0, ptr @output_f32, align 16 + %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 + %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 + %output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2) + store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 + %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 + %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 + %output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2) + store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 + %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 + %input3_2 = load float, 
ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 + %output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2) + store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 + %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 + %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 + %output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2) + store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 + %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 + %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 + %output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2) + store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 + %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 + %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 + %output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2) + store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 + %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 + %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 + %output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2) + store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 + %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 + %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 + %output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2) + store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 + ret void +} + +declare float @llvm.minimumnum.f32(float, float) + +define void @fmax32() { +; CHECK-LABEL: define void @fmax32( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]]) +; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]]) +; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 
4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]]) +; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]]) +; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]]) +; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]]) +; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]]) +; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load float, ptr @input1_f32, align 16 + %input0_1 = load float, ptr @input2_f32, align 16 + %output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1) + store float %output0, ptr @output_f32, align 16 + %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 + %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 + %output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2) + store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 + %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 + %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 + %output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2) + store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 + %input3_1 = load float, ptr 
getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 + %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 + %output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2) + store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 + %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 + %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 + %output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2) + store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 + %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 + %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 + %output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2) + store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 + %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 + %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 + %output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2) + store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 + %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 + %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 + %output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2) + store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 + %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 + %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 + %output8 = tail call float @llvm.maximumnum.f32(float %input8_1, float %input8_2) + store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 + ret void +} + +declare float @llvm.maximumnum.f32(float, float) + +define void @fmin64() { +; CHECK-LABEL: define void @fmin64( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]]) +; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]]) +; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), 
align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]]) +; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]]) +; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]]) +; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]]) +; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]]) +; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load double, ptr @input1_f64, align 16 + %input0_1 = load double, ptr @input2_f64, align 16 + %output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1) + store double %output0, ptr @output_f64, align 16 + %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 + %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 + %output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2) + store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 + %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 + %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 + %output2 = tail call double 
@llvm.minimumnum.f64(double %input2_1, double %input2_2) + store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 + %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 + %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 + %output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2) + store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 + %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 + %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 + %output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2) + store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 + %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 + %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 + %output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2) + store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 + %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 + %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 + %output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2) + store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 + %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 + %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 + %output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2) + store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 + %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 + %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 + %output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2) + store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 + ret void +} + +declare double @llvm.minimumnum.f64(double, double) + +define void @fmax64() { +; CHECK-LABEL: define void @fmax64( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]]) +; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]]) +; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr 
@input2_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]]) +; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]]) +; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]]) +; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]]) +; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]]) +; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load double, ptr @input1_f64, align 16 + %input0_1 = load double, ptr @input2_f64, align 16 + %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1) + store double %output0, ptr @output_f64, align 16 + %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 + %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 + %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2) + store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 + 
%input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 + %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 + %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2) + store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 + %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 + %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 + %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2) + store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 + %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 + %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 + %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2) + store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 + %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 + %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 + %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2) + store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 + %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 + %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 + %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2) + store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 + %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 + %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 + %output7 = tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2) + store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 + %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 + %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 + %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2) + store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 + ret void +} + +declare double @llvm.maximumnum.f64(double, double) + +define void @fmin16() { +; CHECK-LABEL: define void @fmin16( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]]) +; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]]) +; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 +; 
CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]]) +; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]]) +; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]]) +; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]]) +; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]]) +; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]]) +; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]]) +; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load half, ptr @input1_f16, align 16 + %input0_1 = load half, ptr @input2_f16, align 16 + %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1) + store half %output0, ptr @output_f16, align 16 + %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 + %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 + %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2) + store half %output1, 
ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 + %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 + %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 + %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2) + store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 + %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 + %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 + %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2) + store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 + %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 + %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 + %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2) + store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 + %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 + %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 + %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2) + store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 + %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 + %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 + %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2) + store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 + %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 + %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 + %output7 = tail call half @llvm.minimumnum.f16(half %input7_1, half %input7_2) + store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 + %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 + %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 + %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2) + store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 + ret void +} + +declare half @llvm.minimumnum.f16(half, half) + +define void @fmax16() { +; CHECK-LABEL: define void @fmax16( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]]) +; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]]) +; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr 
getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]]) +; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]]) +; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]]) +; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]]) +; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]]) +; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]]) +; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]]) +; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load half, ptr @input1_f16, align 16 + %input0_1 = load half, ptr @input2_f16, align 16 + %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1) + store half %output0, ptr @output_f16, align 16 + %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 + %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 + %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2) + store half %output1, ptr getelementptr inbounds nuw (i8, ptr 
@output_f16, i64 2), align 2 + %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 + %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 + %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2) + store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 + %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 + %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 + %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2) + store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 + %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 + %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 + %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2) + store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 + %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 + %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 + %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2) + store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 + %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 + %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 + %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2) + store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 + %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 + %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 + %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2) + store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 + %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 + %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 + %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2) + store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 + ret void +} + +declare half @llvm.maximumnum.f16(half, half) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll new file mode 100644 index 0000000000000..f058636034bcf --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/fminimumnum.ll @@ -0,0 +1,510 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt --passes=slp-vectorizer --mtriple=x86_64 -S < %s | FileCheck %s + +@input1_f32 = global [9 x float] zeroinitializer, align 16 +@input2_f32 = global [9 x float] zeroinitializer, align 16 +@output_f32 = global [9 x float] zeroinitializer, align 16 +@input1_f64 = global [9 x double] zeroinitializer, align 16 +@input2_f64 = global [9 x double] zeroinitializer, align 16 +@output_f64 = global [9 x double] zeroinitializer, align 16 +@input1_f16 = global [9 x half] zeroinitializer, align 16 +@input2_f16 = global [9 x half] zeroinitializer, 
align 16 +@output_f16 = global [9 x half] zeroinitializer, align 16 + +define void @fmin32() { +; CHECK-LABEL: define void @fmin32() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP0]], float [[TMP1]]) +; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP3]], float [[TMP4]]) +; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP9]], float [[TMP10]]) +; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP12]], float [[TMP13]]) +; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP18]], float [[TMP19]]) +; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP21]], float [[TMP22]]) +; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr 
getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.minimumnum.f32(float [[TMP24]], float [[TMP25]]) +; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load float, ptr @input1_f32, align 16 + %input0_1 = load float, ptr @input2_f32, align 16 + %output0 = tail call float @llvm.minimumnum.f32(float %input0_0, float %input0_1) + store float %output0, ptr @output_f32, align 16 + %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 + %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 + %output1 = tail call float @llvm.minimumnum.f32(float %input1_1, float %input1_2) + store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 + %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 + %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 + %output2 = tail call float @llvm.minimumnum.f32(float %input2_1, float %input2_2) + store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 + %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 + %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 + %output3 = tail call float @llvm.minimumnum.f32(float %input3_1, float %input3_2) + store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 + %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 + %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 + %output4 = tail call float @llvm.minimumnum.f32(float %input4_1, float %input4_2) + store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 + %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 + %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 + %output5 = tail call float @llvm.minimumnum.f32(float %input5_1, float %input5_2) + store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 + %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 + %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 + %output6 = tail call float @llvm.minimumnum.f32(float %input6_1, float %input6_2) + store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 + %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 + %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 + %output7 = tail call float @llvm.minimumnum.f32(float %input7_1, float %input7_2) + store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 + %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 + %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 + %output8 = tail call float @llvm.minimumnum.f32(float %input8_1, float %input8_2) + store float %output8, 
ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 + ret void +} + +declare float @llvm.minimumnum.f32(float, float) + +define void @fmax32() { +; CHECK-LABEL: define void @fmax32() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @input1_f32, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @input2_f32, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP0]], float [[TMP1]]) +; CHECK-NEXT: store float [[TMP2]], ptr @output_f32, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP5:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP3]], float [[TMP4]]) +; CHECK-NEXT: store float [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: store float [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP9]], float [[TMP10]]) +; CHECK-NEXT: store float [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP12]], float [[TMP13]]) +; CHECK-NEXT: store float [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP17:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: store float [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP20:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP18]], float [[TMP19]]) +; CHECK-NEXT: store float [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 +; CHECK-NEXT: [[TMP23:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP21]], float [[TMP22]]) +; CHECK-NEXT: store float [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 
28), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call float @llvm.maximumnum.f32(float [[TMP24]], float [[TMP25]]) +; CHECK-NEXT: store float [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load float, ptr @input1_f32, align 16 + %input0_1 = load float, ptr @input2_f32, align 16 + %output0 = tail call float @llvm.maximumnum.f32(float %input0_0, float %input0_1) + store float %output0, ptr @output_f32, align 16 + %input1_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 4), align 4 + %input1_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 4), align 4 + %output1 = tail call float @llvm.maximumnum.f32(float %input1_1, float %input1_2) + store float %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 4), align 4 + %input2_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 8), align 8 + %input2_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 8), align 8 + %output2 = tail call float @llvm.maximumnum.f32(float %input2_1, float %input2_2) + store float %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 8), align 8 + %input3_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 12), align 4 + %input3_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 12), align 4 + %output3 = tail call float @llvm.maximumnum.f32(float %input3_1, float %input3_2) + store float %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 12), align 4 + %input4_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 16), align 16 + %input4_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 16), align 16 + %output4 = tail call float @llvm.maximumnum.f32(float %input4_1, float %input4_2) + store float %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 16), align 16 + %input5_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 20), align 4 + %input5_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 20), align 4 + %output5 = tail call float @llvm.maximumnum.f32(float %input5_1, float %input5_2) + store float %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 20), align 4 + %input6_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 24), align 8 + %input6_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 24), align 8 + %output6 = tail call float @llvm.maximumnum.f32(float %input6_1, float %input6_2) + store float %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 24), align 8 + %input7_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 28), align 4 + %input7_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 28), align 4 + %output7 = tail call float @llvm.maximumnum.f32(float %input7_1, float %input7_2) + store float %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 28), align 4 + %input8_1 = load float, ptr getelementptr inbounds nuw (i8, ptr @input1_f32, i64 32), align 16 + %input8_2 = load float, ptr getelementptr inbounds nuw (i8, ptr @input2_f32, i64 32), align 16 + %output8 = tail call float 
@llvm.maximumnum.f32(float %input8_1, float %input8_2) + store float %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f32, i64 32), align 16 + ret void +} + +declare float @llvm.maximumnum.f32(float, float) + +define void @fmin64() { +; CHECK-LABEL: define void @fmin64() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP0]], double [[TMP1]]) +; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP3]], double [[TMP4]]) +; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP9]], double [[TMP10]]) +; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP12]], double [[TMP13]]) +; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP18]], double [[TMP19]]) +; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP23:%.*]] = tail call double 
@llvm.minimumnum.f64(double [[TMP21]], double [[TMP22]]) +; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.minimumnum.f64(double [[TMP24]], double [[TMP25]]) +; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load double, ptr @input1_f64, align 16 + %input0_1 = load double, ptr @input2_f64, align 16 + %output0 = tail call double @llvm.minimumnum.f64(double %input0_0, double %input0_1) + store double %output0, ptr @output_f64, align 16 + %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 + %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 + %output1 = tail call double @llvm.minimumnum.f64(double %input1_1, double %input1_2) + store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 + %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 + %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 + %output2 = tail call double @llvm.minimumnum.f64(double %input2_1, double %input2_2) + store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 + %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 + %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 + %output3 = tail call double @llvm.minimumnum.f64(double %input3_1, double %input3_2) + store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 + %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 + %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 + %output4 = tail call double @llvm.minimumnum.f64(double %input4_1, double %input4_2) + store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 + %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 + %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 + %output5 = tail call double @llvm.minimumnum.f64(double %input5_1, double %input5_2) + store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 + %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 + %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 + %output6 = tail call double @llvm.minimumnum.f64(double %input6_1, double %input6_2) + store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 + %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 + %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 + %output7 = tail call double @llvm.minimumnum.f64(double %input7_1, double %input7_2) + store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 + %input8_1 = load double, 
ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 + %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 + %output8 = tail call double @llvm.minimumnum.f64(double %input8_1, double %input8_2) + store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 + ret void +} + +declare double @llvm.minimumnum.f64(double, double) + +define void @fmax64() { +; CHECK-LABEL: define void @fmax64() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr @input1_f64, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @input2_f64, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP0]], double [[TMP1]]) +; CHECK-NEXT: store double [[TMP2]], ptr @output_f64, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP3]], double [[TMP4]]) +; CHECK-NEXT: store double [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: store double [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP11:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP9]], double [[TMP10]]) +; CHECK-NEXT: store double [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP12]], double [[TMP13]]) +; CHECK-NEXT: store double [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: store double [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP18]], double [[TMP19]]) +; CHECK-NEXT: store double [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 +; CHECK-NEXT: [[TMP21:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr 
@input1_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP21]], double [[TMP22]]) +; CHECK-NEXT: store double [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call double @llvm.maximumnum.f64(double [[TMP24]], double [[TMP25]]) +; CHECK-NEXT: store double [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load double, ptr @input1_f64, align 16 + %input0_1 = load double, ptr @input2_f64, align 16 + %output0 = tail call double @llvm.maximumnum.f64(double %input0_0, double %input0_1) + store double %output0, ptr @output_f64, align 16 + %input1_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 8), align 8 + %input1_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 8), align 8 + %output1 = tail call double @llvm.maximumnum.f64(double %input1_1, double %input1_2) + store double %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 8), align 8 + %input2_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 16), align 16 + %input2_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 16), align 16 + %output2 = tail call double @llvm.maximumnum.f64(double %input2_1, double %input2_2) + store double %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 16), align 16 + %input3_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 24), align 8 + %input3_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 24), align 8 + %output3 = tail call double @llvm.maximumnum.f64(double %input3_1, double %input3_2) + store double %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 24), align 8 + %input4_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 32), align 16 + %input4_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 32), align 16 + %output4 = tail call double @llvm.maximumnum.f64(double %input4_1, double %input4_2) + store double %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 32), align 16 + %input5_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 40), align 8 + %input5_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 40), align 8 + %output5 = tail call double @llvm.maximumnum.f64(double %input5_1, double %input5_2) + store double %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 40), align 8 + %input6_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 48), align 16 + %input6_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 48), align 16 + %output6 = tail call double @llvm.maximumnum.f64(double %input6_1, double %input6_2) + store double %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 48), align 16 + %input7_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 56), align 8 + %input7_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 56), align 8 + %output7 = 
tail call double @llvm.maximumnum.f64(double %input7_1, double %input7_2) + store double %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 56), align 8 + %input8_1 = load double, ptr getelementptr inbounds nuw (i8, ptr @input1_f64, i64 64), align 16 + %input8_2 = load double, ptr getelementptr inbounds nuw (i8, ptr @input2_f64, i64 64), align 16 + %output8 = tail call double @llvm.maximumnum.f64(double %input8_1, double %input8_2) + store double %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f64, i64 64), align 16 + ret void +} + +declare double @llvm.maximumnum.f64(double, double) + +define void @fmin16() { +; CHECK-LABEL: define void @fmin16() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP0]], half [[TMP1]]) +; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP3]], half [[TMP4]]) +; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP6]], half [[TMP7]]) +; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP9]], half [[TMP10]]) +; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP12]], half [[TMP13]]) +; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP15]], half [[TMP16]]) +; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP18]], half [[TMP19]]) +; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), 
align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP21]], half [[TMP22]]) +; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.minimumnum.f16(half [[TMP24]], half [[TMP25]]) +; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load half, ptr @input1_f16, align 16 + %input0_1 = load half, ptr @input2_f16, align 16 + %output0 = tail call half @llvm.minimumnum.f16(half %input0_0, half %input0_1) + store half %output0, ptr @output_f16, align 16 + %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 + %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 + %output1 = tail call half @llvm.minimumnum.f16(half %input1_1, half %input1_2) + store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 + %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 + %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 + %output2 = tail call half @llvm.minimumnum.f16(half %input2_1, half %input2_2) + store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 + %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 + %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 + %output3 = tail call half @llvm.minimumnum.f16(half %input3_1, half %input3_2) + store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 + %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 + %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 + %output4 = tail call half @llvm.minimumnum.f16(half %input4_1, half %input4_2) + store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 + %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 + %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 + %output5 = tail call half @llvm.minimumnum.f16(half %input5_1, half %input5_2) + store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 + %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 + %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 + %output6 = tail call half @llvm.minimumnum.f16(half %input6_1, half %input6_2) + store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 + %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 + %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 + %output7 = tail call half @llvm.minimumnum.f16(half 
%input7_1, half %input7_2) + store half %output7, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 + %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 + %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 + %output8 = tail call half @llvm.minimumnum.f16(half %input8_1, half %input8_2) + store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 + ret void +} + +declare half @llvm.minimumnum.f16(half, half) + +define void @fmax16() { +; CHECK-LABEL: define void @fmax16() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load half, ptr @input1_f16, align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load half, ptr @input2_f16, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP0]], half [[TMP1]]) +; CHECK-NEXT: store half [[TMP2]], ptr @output_f16, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP3]], half [[TMP4]]) +; CHECK-NEXT: store half [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 +; CHECK-NEXT: [[TMP6:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP6]], half [[TMP7]]) +; CHECK-NEXT: store half [[TMP8]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 +; CHECK-NEXT: [[TMP9:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP10:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP11:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP9]], half [[TMP10]]) +; CHECK-NEXT: store half [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 +; CHECK-NEXT: [[TMP12:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP13:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP14:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP12]], half [[TMP13]]) +; CHECK-NEXT: store half [[TMP14]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP16:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP17:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP15]], half [[TMP16]]) +; CHECK-NEXT: store half [[TMP17]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 +; CHECK-NEXT: [[TMP18:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP19:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP20:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP18]], half [[TMP19]]) +; CHECK-NEXT: store half [[TMP20]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = load half, ptr getelementptr 
inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP22:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP23:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP21]], half [[TMP22]]) +; CHECK-NEXT: store half [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 14), align 2 +; CHECK-NEXT: [[TMP24:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP25:%.*]] = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16 +; CHECK-NEXT: [[TMP26:%.*]] = tail call half @llvm.maximumnum.f16(half [[TMP24]], half [[TMP25]]) +; CHECK-NEXT: store half [[TMP26]], ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16 +; CHECK-NEXT: ret void +; +entry: + %input0_0 = load half, ptr @input1_f16, align 16 + %input0_1 = load half, ptr @input2_f16, align 16 + %output0 = tail call half @llvm.maximumnum.f16(half %input0_0, half %input0_1) + store half %output0, ptr @output_f16, align 16 + %input1_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 2), align 2 + %input1_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 2), align 2 + %output1 = tail call half @llvm.maximumnum.f16(half %input1_1, half %input1_2) + store half %output1, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 2), align 2 + %input2_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 4), align 4 + %input2_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 4), align 4 + %output2 = tail call half @llvm.maximumnum.f16(half %input2_1, half %input2_2) + store half %output2, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 4), align 4 + %input3_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 6), align 2 + %input3_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 6), align 2 + %output3 = tail call half @llvm.maximumnum.f16(half %input3_1, half %input3_2) + store half %output3, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 6), align 2 + %input4_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 8), align 8 + %input4_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 8), align 8 + %output4 = tail call half @llvm.maximumnum.f16(half %input4_1, half %input4_2) + store half %output4, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 8), align 8 + %input5_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 10), align 2 + %input5_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 10), align 2 + %output5 = tail call half @llvm.maximumnum.f16(half %input5_1, half %input5_2) + store half %output5, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 10), align 2 + %input6_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 12), align 4 + %input6_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 12), align 4 + %output6 = tail call half @llvm.maximumnum.f16(half %input6_1, half %input6_2) + store half %output6, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 12), align 4 + %input7_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 14), align 2 + %input7_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 14), align 2 + %output7 = tail call half @llvm.maximumnum.f16(half %input7_1, half %input7_2) + store half %output7, ptr getelementptr 
inbounds nuw (i8, ptr @output_f16, i64 14), align 2
+  %input8_1 = load half, ptr getelementptr inbounds nuw (i8, ptr @input1_f16, i64 16), align 16
+  %input8_2 = load half, ptr getelementptr inbounds nuw (i8, ptr @input2_f16, i64 16), align 16
+  %output8 = tail call half @llvm.maximumnum.f16(half %input8_1, half %input8_2)
+  store half %output8, ptr getelementptr inbounds nuw (i8, ptr @output_f16, i64 16), align 16
+  ret void
+}
+
+declare half @llvm.maximumnum.f16(half, half)

From ae8dd63681bf93b04ff8a29e3cbbd152bd97c5c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Tue, 1 Apr 2025 17:59:11 -0700
Subject: [PATCH 0323/1029] [flang][cuda] Add interface and lowering for all_sync (#134001)

---
 .../flang/Optimizer/Builder/IntrinsicCall.h   |  1 +
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 19 +++++++++++++++++++
 flang/module/cudadevice.f90                   |  7 +++++++
 flang/test/Lower/CUDA/cuda-device-proc.cuf    |  9 +++++++++
 4 files changed, 36 insertions(+)

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 83f08bb88f7f3..a31bbd0a1bd88 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -441,6 +441,7 @@ struct IntrinsicLibrary {
   fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+  mlir::Value genVoteAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>);
 
   /// Implement all conversion functions like DBLE, the first argument is
   /// the value to convert. There may be an additional KIND arguments that
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 8bbec6d6a7535..9029ea69dd5c4 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -260,6 +260,10 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genAll,
      {{{"mask", asAddr}, {"dim", asValue}}},
      /*isElemental=*/false},
+    {"all_sync",
+     &I::genVoteAllSync,
+     {{{"mask", asValue}, {"pred", asValue}}},
+     /*isElemental=*/false},
     {"allocated",
      &I::genAllocated,
      {{{"array", asInquired}, {"scalar", asInquired}}},
@@ -6495,6 +6499,21 @@ IntrinsicLibrary::genMatchAllSync(mlir::Type resultType,
   return value;
 }
 
+// ALL_SYNC
+mlir::Value
+IntrinsicLibrary::genVoteAllSync(mlir::Type resultType,
+                                 llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 2);
+
+  llvm::StringRef funcName = "llvm.nvvm.vote.all.sync";
+  mlir::MLIRContext *context = builder.getContext();
+  mlir::Type i32Ty = builder.getI32Type();
+  mlir::FunctionType ftype =
+      mlir::FunctionType::get(context, {i32Ty, i32Ty}, {i32Ty});
+  auto funcOp = builder.createFunction(loc, funcName, ftype);
+  llvm::SmallVector<mlir::Value> filteredArgs;
+  return builder.create<fir::CallOp>(loc, funcOp, args).getResult(0);
+}
+
 // MATCH_ANY_SYNC
 mlir::Value
 IntrinsicLibrary::genMatchAnySync(mlir::Type resultType,
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index baaa112f5d8c2..6b8aa4de74240 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -1015,6 +1015,13 @@ attributes(device) integer function match_any_syncjd(mask, val)
     end function
   end interface
 
+  interface all_sync
+    attributes(device) integer function all_sync(mask, pred)
+      !dir$ ignore_tkr(d) mask, (td) pred
+      integer, value :: mask, pred
+    end function
+  end interface
+
   ! LDCG
   interface __ldcg
     attributes(device) pure integer(4) function __ldcg_i4(x) bind(c)
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 617d57d097522..9758107c84031 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -296,6 +296,15 @@ end
 ! CHECK: fir.call @__ldlu_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<2xf64>>) -> ()
 ! CHECK: fir.call @__ldcv_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<2xf64>>, !fir.ref<!fir.array<2xf64>>) -> ()
 
+attributes(device) subroutine testVote()
+  integer :: a, ipred, mask, v32
+  a = all_sync(mask, v32)
+
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtestvote()
+! CHECK: fir.call @llvm.nvvm.vote.all.sync
+
 ! CHECK-DAG: func.func private @__ldca_i4x4_(!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<4xi32>>)
 ! CHECK-DAG: func.func private @__ldcg_i4x4_(!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<4xi32>>)
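Editorial note on the patch above: the all_sync lowering pairs one handler-table row with a small generator that binds the CUDA Fortran intrinsic straight to an NVVM builtin. As a minimal sketch only (not part of this patch; the function name and handler shape below are assumptions for illustration), a companion any_sync vote would differ from genVoteAllSync in nothing but the NVVM intrinsic name:

    // Hypothetical companion to genVoteAllSync above. "llvm.nvvm.vote.any.sync"
    // is the NVVM any-vote intrinsic; the i32-based signature mirrors all_sync.
    mlir::Value
    IntrinsicLibrary::genVoteAnySync(mlir::Type resultType,
                                     llvm::ArrayRef<mlir::Value> args) {
      assert(args.size() == 2);
      llvm::StringRef funcName = "llvm.nvvm.vote.any.sync";
      mlir::MLIRContext *context = builder.getContext();
      mlir::Type i32Ty = builder.getI32Type();
      // Both the mask and the predicate are lowered as i32, as for all_sync.
      mlir::FunctionType ftype =
          mlir::FunctionType::get(context, {i32Ty, i32Ty}, {i32Ty});
      auto funcOp = builder.createFunction(loc, funcName, ftype);
      return builder.create<fir::CallOp>(loc, funcOp, args).getResult(0);
    }

The matching handler-table entry would mirror the all_sync row shown above, with both "mask" and "pred" passed asValue.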
From 46968310cb837e4b32859edef2107080b828b117 Mon Sep 17 00:00:00 2001
From: ZhaoQi
Date: Wed, 2 Apr 2025 09:11:20 +0800
Subject: [PATCH 0324/1029] [LoongArch] Move fix-tle-le-sym-type test to test/MC. NFC (#133839)

---
 .../CodeGen/LoongArch/fix-tle-le-sym-type.ll  | 24 -----------------
 .../Relocations/relocation-specifier.s        | 26 +++++++++++++++++++
 2 files changed, 26 insertions(+), 24 deletions(-)
 delete mode 100644 llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll
 create mode 100644 llvm/test/MC/LoongArch/Relocations/relocation-specifier.s

diff --git a/llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll b/llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll
deleted file mode 100644
index d39454a51a445..0000000000000
--- a/llvm/test/CodeGen/LoongArch/fix-tle-le-sym-type.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; RUN: llc --mtriple=loongarch32 --filetype=obj %s -o %t-la32
-; RUN: llvm-readelf -s %t-la32 | FileCheck %s --check-prefix=LA32
-
-; RUN: llc --mtriple=loongarch64 --filetype=obj %s -o %t-la64
-; RUN: llvm-readelf -s %t-la64 | FileCheck %s --check-prefix=LA64
-
-; LA32: Symbol table '.symtab' contains [[#]] entries:
-; LA32-NEXT: Num: Value Size Type Bind Vis Ndx Name
-; LA32: 00000000 0 TLS GLOBAL DEFAULT UND tls_sym
-
-; LA64: Symbol table '.symtab' contains [[#]] entries:
-; LA64-NEXT: Num: Value Size Type Bind Vis Ndx Name
-; LA64: 0000000000000000 0 TLS GLOBAL DEFAULT UND tls_sym
-
-@tls_sym = external thread_local(localexec) global i32
-
-define dso_local signext i32 @test_tlsle() nounwind {
-entry:
-  %0 = call ptr @llvm.threadlocal.address.p0(ptr @tls_sym)
-  %1 = load i32, ptr %0
-  ret i32 %1
-}
-
-declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
diff --git a/llvm/test/MC/LoongArch/Relocations/relocation-specifier.s b/llvm/test/MC/LoongArch/Relocations/relocation-specifier.s
new file mode 100644
index 0000000000000..d0898aaab92fe
--- /dev/null
+++ b/llvm/test/MC/LoongArch/Relocations/relocation-specifier.s
@@ -0,0 +1,26 @@
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 %s -o %t-la32
+# RUN: llvm-readelf -rs %t-la32 | FileCheck %s --check-prefixes=CHECK,RELOC32
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 %s -o %t-la64
+# RUN: llvm-readelf -rs %t-la64 | FileCheck %s --check-prefixes=CHECK,RELOC64
+
+## This test is similar to test/MC/CSKY/relocation-specifier.s.
+
+# RELOC32: '.rela.data'
+# RELOC32: R_LARCH_32 00000000 .data + 0
+
+# RELOC64: '.rela.data'
+# RELOC64: R_LARCH_32 0000000000000000 .data + 0
+
+# CHECK: TLS GLOBAL DEFAULT UND gd
+# CHECK: TLS GLOBAL DEFAULT UND ld
+# CHECK: TLS GLOBAL DEFAULT UND ie
+# CHECK: TLS GLOBAL DEFAULT UND le
+
+pcalau12i $t1, %gd_pc_hi20(gd)
+pcalau12i $t1, %ld_pc_hi20(ld)
+pcalau12i $t1, %ie_pc_hi20(ie)
+lu12i.w $t1, %le_hi20_r(le)
+
+.data
+local:
+.long local

From 222297b02034914499458cfa4772c412ea35573d Mon Sep 17 00:00:00 2001
From: Longsheng Mou
Date: Wed, 2 Apr 2025 09:40:22 +0800
Subject: [PATCH 0325/1029] [mlir] Use Region::hasOneBlock (NFC) (#133879)

---
 mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp | 4 ++--
 mlir/lib/Transforms/Utils/RegionUtils.cpp    | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp
index 2959d67b366b9..6ffe7afbc727c 100644
--- a/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp
+++ b/mlir/lib/Dialect/SPIRV/IR/ControlFlowOps.cpp
@@ -308,7 +308,7 @@ LogicalResult LoopOp::verifyRegions() {
     return emitOpError(
         "should not have 'spirv.mlir.merge' op outside the merge block");
 
-  if (std::next(region.begin()) == region.end())
+  if (region.hasOneBlock())
     return emitOpError(
         "must have an entry block branching to the loop header block");
   // The first block is the entry block.
@@ -502,7 +502,7 @@ LogicalResult SelectionOp::verifyRegions() {
     return emitOpError(
         "should not have 'spirv.mlir.merge' op outside the merge block");
 
-  if (std::next(region.begin()) == region.end())
+  if (region.hasOneBlock())
     return emitOpError("must have a selection header block");
 
   return success();
diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp
index fc9492efa5805..4985d718c1780 100644
--- a/mlir/lib/Transforms/Utils/RegionUtils.cpp
+++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp
@@ -200,7 +200,7 @@ LogicalResult mlir::eraseUnreachableBlocks(RewriterBase &rewriter,
       continue;
 
     // If this is a single block region, just collect the nested regions.
-    if (std::next(region->begin()) == region->end()) {
+    if (region->hasOneBlock()) {
       for (Operation &op : region->front())
        for (Region &region : op.getRegions())
          worklist.push_back(&region);
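Editorial note on the NFC above: both spellings test the same thing, namely that the block after the region's first block is already the end iterator. A minimal free-function sketch of the equivalence this cleanup relies on (the function name and standalone form are illustrative, not part of the patch; mlir::Region's real implementation lives in mlir/IR/Region.h):

    #include "mlir/IR/Region.h"
    #include <iterator>

    // Equivalent to mlir::Region::hasOneBlock(): the region is non-empty and
    // advancing one past the first block reaches the end iterator.
    bool hasExactlyOneBlock(mlir::Region &region) {
      return !region.empty() && std::next(region.begin()) == region.end();
    }

The member function reads better at call sites and cannot be accidentally inverted, which is the whole point of the change.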
--- .../remove-operand-bundles-convergencectrl.ll | 36 +++++++++++++++++++ .../deltas/ReduceOperandBundles.cpp | 14 ++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-reduce/remove-operand-bundles-convergencectrl.ll diff --git a/llvm/test/tools/llvm-reduce/remove-operand-bundles-convergencectrl.ll b/llvm/test/tools/llvm-reduce/remove-operand-bundles-convergencectrl.ll new file mode 100644 index 0000000000000..cd970f5fa0fa5 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/remove-operand-bundles-convergencectrl.ll @@ -0,0 +1,36 @@ +; Check that invalid reductions aren't introduced by deleting +; convergencectrl bundles in convergent functions +; +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operand-bundles --test FileCheck --test-arg --check-prefixes=CHECK,INTERESTING --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck --check-prefixes=CHECK,RESULT %s < %t + +; CHECK-LABEL: define float @convergentctrl_one_interesting( +; INTERESTING: %interesting = call float @convergent.extern.func( +; RESULT: %entry.token = call token @llvm.experimental.convergence.entry() +; RESULT: %interesting = call float @convergent.extern.func(float %x) [ "convergencectrl"(token %entry.token) ] +; RESULT: %boring = call float @convergent.extern.func(float %x) [ "convergencectrl"(token %entry.token) ] +define float @convergentctrl_one_interesting(float %x, float %y) #0 { + %entry.token = call token @llvm.experimental.convergence.entry() + %interesting = call float @convergent.extern.func(float %x) [ "convergencectrl"(token %entry.token) ] + %boring = call float @convergent.extern.func(float %x) [ "convergencectrl"(token %entry.token) ] + %add = fadd float %interesting, %boring + ret float %add +} + +; In theory we could remove the bundle here, since all convergencectrl +; in the function will be removed. + +; CHECK-LABEL: define float @convergentctrl_can_remove_all( +; RESULT: %entry.token = call token @llvm.experimental.convergence.entry() +; RESULT: %val = call float @convergent.extern.func(float %x) [ "convergencectrl"(token %entry.token) ] +define float @convergentctrl_can_remove_all(float %x, float %y) #0 { + %entry.token = call token @llvm.experimental.convergence.entry() + %val = call float @convergent.extern.func(float %x) [ "convergencectrl"(token %entry.token) ] + ret float %val +} + +declare float @convergent.extern.func(float) #0 +declare token @llvm.experimental.convergence.entry() #1 + +attributes #0 = { convergent } +attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp index a3e24f33dc77c..9ea7351d642c1 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp @@ -31,6 +31,13 @@ using namespace llvm; namespace { +/// Return true if stripping the bundle from a call will result in invalid IR. +static bool shouldKeepBundleTag(uint32_t BundleTagID) { + // In convergent functions using convergencectrl bundles, all convergent calls + // must use the convergence bundles so don't try to remove them. + return BundleTagID == LLVMContext::OB_convergencectrl; +} + /// Given ChunksToKeep, produce a map of calls and indexes of operand bundles /// to be preserved for each call. 
class OperandBundleRemapper : public InstVisitor { @@ -52,9 +59,12 @@ class OperandBundleRemapper : public InstVisitor { OperandBundlesToKeepIndexes.reserve(Call.getNumOperandBundles()); // Enumerate every operand bundle on this call. - for (unsigned BundleIndex : seq(Call.getNumOperandBundles())) - if (O.shouldKeep()) // Should we keep this one? + for (unsigned BundleIndex : seq(Call.getNumOperandBundles())) { + if (shouldKeepBundleTag( + Call.getOperandBundleAt(BundleIndex).getTagID()) || + O.shouldKeep()) // Should we keep this one? OperandBundlesToKeepIndexes.emplace_back(BundleIndex); + } } }; From 97dcbdef6089175c45e14fcbcf5c88b10233a79a Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Tue, 1 Apr 2025 18:56:19 -0700 Subject: [PATCH 0327/1029] Revert "[clang-format] Handle C++ keywords in other languages better (#132941)" This reverts commit ab7cee8a0ecf29fdb47c64c8d431a694d63390d2 which had formatting errors. --- clang/lib/Format/FormatTokenLexer.cpp | 3 ++ clang/unittests/Format/FormatTestJS.cpp | 42 ++++++++--------------- clang/unittests/Format/FormatTestJava.cpp | 2 -- 3 files changed, 18 insertions(+), 29 deletions(-) diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 014b10b206d90..eed54a11684b5 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -1306,12 +1306,15 @@ FormatToken *FormatTokenLexer::getNextToken() { FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete, tok::kw_operator)) { FormatTok->Tok.setKind(tok::identifier); + FormatTok->Tok.setIdentifierInfo(nullptr); } else if (Style.isJavaScript() && FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator)) { FormatTok->Tok.setKind(tok::identifier); + FormatTok->Tok.setIdentifierInfo(nullptr); } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) { FormatTok->Tok.setKind(tok::identifier); + FormatTok->Tok.setIdentifierInfo(nullptr); } } else if (FormatTok->is(tok::greatergreater)) { FormatTok->Tok.setKind(tok::greater); diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp index 3dae67fbcdfcb..78c9f887a159b 100644 --- a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -828,18 +828,12 @@ TEST_F(FormatTestJS, AsyncFunctions) { "} "); // clang-format must not insert breaks between async and function, otherwise // automatic semicolon insertion may trigger (in particular in a class body). - auto Style = getGoogleJSStyleWithColumns(10); verifyFormat("async function\n" "hello(\n" " myparamnameiswaytooloooong) {\n" "}", "async function hello(myparamnameiswaytooloooong) {}", - Style); - verifyFormat("async function\n" - "union(\n" - " myparamnameiswaytooloooong) {\n" - "}", - Style); + getGoogleJSStyleWithColumns(10)); verifyFormat("class C {\n" " async hello(\n" " myparamnameiswaytooloooong) {\n" @@ -847,7 +841,7 @@ TEST_F(FormatTestJS, AsyncFunctions) { "}", "class C {\n" " async hello(myparamnameiswaytooloooong) {} }", - Style); + getGoogleJSStyleWithColumns(10)); verifyFormat("async function* f() {\n" " yield fetch(x);\n" "}"); @@ -1344,16 +1338,15 @@ TEST_F(FormatTestJS, WrapRespectsAutomaticSemicolonInsertion) { // The following statements must not wrap, as otherwise the program meaning // would change due to automatic semicolon insertion. // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.9.1. 
- auto Style =getGoogleJSStyleWithColumns(10); - verifyFormat("return aaaaa;", Style); - verifyFormat("yield aaaaa;", Style); - verifyFormat("return /* hello! */ aaaaa;", Style); - verifyFormat("continue aaaaa;", Style); - verifyFormat("continue /* hello! */ aaaaa;", Style); - verifyFormat("break aaaaa;", Style); - verifyFormat("throw aaaaa;", Style); - verifyFormat("aaaaaaaaa++;", Style); - verifyFormat("aaaaaaaaa--;", Style); + verifyFormat("return aaaaa;", getGoogleJSStyleWithColumns(10)); + verifyFormat("yield aaaaa;", getGoogleJSStyleWithColumns(10)); + verifyFormat("return /* hello! */ aaaaa;", getGoogleJSStyleWithColumns(10)); + verifyFormat("continue aaaaa;", getGoogleJSStyleWithColumns(10)); + verifyFormat("continue /* hello! */ aaaaa;", getGoogleJSStyleWithColumns(10)); + verifyFormat("break aaaaa;", getGoogleJSStyleWithColumns(10)); + verifyFormat("throw aaaaa;", getGoogleJSStyleWithColumns(10)); + verifyFormat("aaaaaaaaa++;", getGoogleJSStyleWithColumns(10)); + verifyFormat("aaaaaaaaa--;", getGoogleJSStyleWithColumns(10)); verifyFormat("return [\n" " aaa\n" "];", @@ -1373,13 +1366,12 @@ TEST_F(FormatTestJS, WrapRespectsAutomaticSemicolonInsertion) { // Ideally the foo() bit should be indented relative to the async function(). verifyFormat("async function\n" "foo() {}", - Style); - verifyFormat("await theReckoning;", Style); - verifyFormat("some['a']['b']", Style); - verifyFormat("union['a']['b']", Style); + getGoogleJSStyleWithColumns(10)); + verifyFormat("await theReckoning;", getGoogleJSStyleWithColumns(10)); + verifyFormat("some['a']['b']", getGoogleJSStyleWithColumns(10)); verifyFormat("x = (a['a']\n" " ['b']);", - Style); + getGoogleJSStyleWithColumns(10)); verifyFormat("function f() {\n" " return foo.bar(\n" " (param): param is {\n" @@ -2508,10 +2500,6 @@ TEST_F(FormatTestJS, NonNullAssertionOperator) { TEST_F(FormatTestJS, CppKeywords) { // Make sure we don't mess stuff up because of C++ keywords. verifyFormat("return operator && (aa);"); - verifyFormat("enum operator {\n" - " A = 1,\n" - " B\n" - "}"); // .. or QT ones. verifyFormat("const slots: Slot[];"); // use the "!" assertion operator to validate that clang-format understands diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp index e01c1d6d7e684..33998bc7ff858 100644 --- a/clang/unittests/Format/FormatTestJava.cpp +++ b/clang/unittests/Format/FormatTestJava.cpp @@ -158,8 +158,6 @@ TEST_F(FormatTestJava, AnonymousClasses) { TEST_F(FormatTestJava, EnumDeclarations) { verifyFormat("enum SomeThing { ABC, CDE }"); - // A C++ keyword should not mess things up. - verifyFormat("enum union { ABC, CDE }"); verifyFormat("enum SomeThing {\n" " ABC,\n" " CDE,\n" From 03a791f70364921ec3d3b7de8ddc6be8279c2fba Mon Sep 17 00:00:00 2001 From: dpalermo Date: Tue, 1 Apr 2025 22:19:27 -0500 Subject: [PATCH 0328/1029] Revert "[cmake] Refactor clang unittest cmake" (#134022) Reverts llvm/llvm-project#133545 This change is breaking several buildbots as well as developer's builds. Reverting to allow people to make progress. 
--- clang/unittests/AST/ByteCode/CMakeLists.txt | 12 +++++-- clang/unittests/AST/CMakeLists.txt | 23 +++++++++---- clang/unittests/ASTMatchers/CMakeLists.txt | 22 +++++++++---- .../ASTMatchers/Dynamic/CMakeLists.txt | 18 ++++++++--- clang/unittests/Analysis/CMakeLists.txt | 18 ++++++++--- .../Analysis/FlowSensitive/CMakeLists.txt | 18 ++++++++--- clang/unittests/Basic/CMakeLists.txt | 18 ++++++++--- clang/unittests/CMakeLists.txt | 32 +++---------------- clang/unittests/CodeGen/CMakeLists.txt | 15 ++++++--- clang/unittests/CrossTU/CMakeLists.txt | 12 +++++-- .../unittests/DirectoryWatcher/CMakeLists.txt | 11 +++++-- clang/unittests/Driver/CMakeLists.txt | 19 +++++++---- clang/unittests/Format/CMakeLists.txt | 11 +++++-- clang/unittests/Frontend/CMakeLists.txt | 12 ++++--- clang/unittests/Index/CMakeLists.txt | 13 +++++--- clang/unittests/InstallAPI/CMakeLists.txt | 9 ++++-- clang/unittests/Interpreter/CMakeLists.txt | 25 ++++++++------- .../Interpreter/ExceptionTests/CMakeLists.txt | 20 ++++++------ clang/unittests/Lex/CMakeLists.txt | 16 +++++++--- clang/unittests/Rewrite/CMakeLists.txt | 10 ++++-- clang/unittests/Sema/CMakeLists.txt | 18 ++++++++--- clang/unittests/Serialization/CMakeLists.txt | 17 ++++++---- clang/unittests/StaticAnalyzer/CMakeLists.txt | 18 ++++++++--- clang/unittests/Support/CMakeLists.txt | 11 +++++-- clang/unittests/Tooling/CMakeLists.txt | 28 +++++++++------- clang/unittests/Tooling/Syntax/CMakeLists.txt | 15 ++++++--- clang/unittests/libclang/CMakeLists.txt | 5 ++- .../libclang/CrashTests/CMakeLists.txt | 5 ++- 28 files changed, 289 insertions(+), 162 deletions(-) diff --git a/clang/unittests/AST/ByteCode/CMakeLists.txt b/clang/unittests/AST/ByteCode/CMakeLists.txt index 7ccadda2eeb26..b862fb4834fbd 100644 --- a/clang/unittests/AST/ByteCode/CMakeLists.txt +++ b/clang/unittests/AST/ByteCode/CMakeLists.txt @@ -2,13 +2,19 @@ add_clang_unittest(InterpTests BitcastBuffer.cpp Descriptor.cpp toAPValue.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(InterpTests + PRIVATE clangAST clangASTMatchers clangBasic clangFrontend clangSerialization clangTooling - LINK_LIBS - clangTesting ) + + target_link_libraries(InterpTests + PRIVATE + clangTesting +) diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt index f27d34e8a0719..bfa6082a6ffa4 100644 --- a/clang/unittests/AST/CMakeLists.txt +++ b/clang/unittests/AST/CMakeLists.txt @@ -1,3 +1,10 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + Support + TargetParser + ) + + add_subdirectory(ByteCode) add_clang_unittest(ASTTests @@ -36,7 +43,10 @@ add_clang_unittest(ASTTests TemplateNameTest.cpp TypePrinterTest.cpp UnresolvedSetTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(ASTTests + PRIVATE clangAST clangASTMatchers clangBasic @@ -44,12 +54,11 @@ add_clang_unittest(ASTTests clangLex clangSerialization clangTooling - LINK_LIBS + ) + +target_link_libraries(ASTTests + PRIVATE clangTesting LLVMTestingAnnotations LLVMTestingSupport - LLVM_COMPONENTS - FrontendOpenMP - Support - TargetParser - ) +) diff --git a/clang/unittests/ASTMatchers/CMakeLists.txt b/clang/unittests/ASTMatchers/CMakeLists.txt index 47bd5c108bb5a..6a1e629d81b65 100644 --- a/clang/unittests/ASTMatchers/CMakeLists.txt +++ b/clang/unittests/ASTMatchers/CMakeLists.txt @@ -1,23 +1,31 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + Support + TargetParser + ) + add_clang_unittest(ASTMatchersTests ASTMatchersInternalTest.cpp ASTMatchersNodeTest.cpp ASTMatchersNarrowingTest.cpp ASTMatchersTraversalTest.cpp GtestMatchersTest.cpp - 
CLANG_LIBS + ) + +clang_target_link_libraries(ASTMatchersTests + PRIVATE clangAST clangASTMatchers clangBasic clangFrontend clangSerialization clangTooling - LINK_LIBS + ) + +target_link_libraries(ASTMatchersTests + PRIVATE clangTesting LLVMTestingSupport - LLVM_COMPONENTS - FrontendOpenMP - Support - TargetParser - ) +) add_subdirectory(Dynamic) diff --git a/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt b/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt index b6db7ce62afe7..6d0e12bcb0759 100644 --- a/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt +++ b/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt @@ -1,8 +1,16 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + Support + ) + add_clang_unittest(DynamicASTMatchersTests VariantValueTest.cpp ParserTest.cpp RegistryTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(DynamicASTMatchersTests + PRIVATE clangAST clangASTMatchers clangBasic @@ -10,9 +18,9 @@ add_clang_unittest(DynamicASTMatchersTests clangFrontend clangSerialization clangTooling - LINK_LIBS + ) + +target_link_libraries(DynamicASTMatchersTests + PRIVATE clangTesting - LLVM_COMPONENTS - FrontendOpenMP - Support ) diff --git a/clang/unittests/Analysis/CMakeLists.txt b/clang/unittests/Analysis/CMakeLists.txt index 059a74843155c..cfea57f53f033 100644 --- a/clang/unittests/Analysis/CMakeLists.txt +++ b/clang/unittests/Analysis/CMakeLists.txt @@ -1,3 +1,8 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + Support + ) + add_clang_unittest(ClangAnalysisTests CFGDominatorTree.cpp CFGTest.cpp @@ -6,7 +11,10 @@ add_clang_unittest(ClangAnalysisTests IntervalPartitionTest.cpp MacroExpansionContextTest.cpp UnsafeBufferUsageTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(ClangAnalysisTests + PRIVATE clangAST clangASTMatchers clangAnalysis @@ -15,12 +23,12 @@ add_clang_unittest(ClangAnalysisTests clangLex clangSerialization clangTooling - LINK_LIBS + ) + +target_link_libraries(ClangAnalysisTests + PRIVATE clangTesting LLVMTestingSupport - LLVM_COMPONENTS - FrontendOpenMP - Support ) add_subdirectory(FlowSensitive) diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt index 4ac563143cd68..6c01ae8fc2e54 100644 --- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt @@ -1,3 +1,8 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + Support + ) + add_clang_unittest(ClangAnalysisFlowSensitiveTests ArenaTest.cpp ASTOpsTest.cpp @@ -25,7 +30,10 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests UncheckedOptionalAccessModelTest.cpp ValueTest.cpp WatchedLiteralsSolverTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(ClangAnalysisFlowSensitiveTests + PRIVATE clangAST clangASTMatchers clangAnalysis @@ -36,11 +44,11 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests clangLex clangSerialization clangTooling - LINK_LIBS + ) + +target_link_libraries(ClangAnalysisFlowSensitiveTests + PRIVATE clangTesting LLVMTestingAnnotations LLVMTestingSupport - LLVM_COMPONENTS - FrontendOpenMP - Support ) diff --git a/clang/unittests/Basic/CMakeLists.txt b/clang/unittests/Basic/CMakeLists.txt index e818bd3e2c372..3844ba49add8d 100644 --- a/clang/unittests/Basic/CMakeLists.txt +++ b/clang/unittests/Basic/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + add_clang_unittest(BasicTests CharInfoTest.cpp DarwinSDKInfoTest.cpp @@ -8,11 +12,15 @@ add_clang_unittest(BasicTests SanitizersTest.cpp SarifTest.cpp SourceManagerTest.cpp - CLANG_LIBS + ) + 
+clang_target_link_libraries(BasicTests + PRIVATE clangBasic clangLex - LINK_LIBS - LLVMTestingSupport - LLVM_COMPONENTS - Support ) + +target_link_libraries(BasicTests + PRIVATE + LLVMTestingSupport +) diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index f3823ba309420..9b3ce8aa7de73 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -15,36 +15,12 @@ if(CLANG_BUILT_STANDALONE) endif() endif() -# add_clang_unittest(test_name file1.cpp file2.cpp) +# add_clang_unittest(test_dirname file1.cpp file2.cpp) # # Will compile the list of files together and link against the clang -# Produces a binary named 'basename(test_name)'. -function(add_clang_unittest test_name) - cmake_parse_arguments(ARG - "" - "" - "CLANG_LIBS;LINK_LIBS;LLVM_COMPONENTS" - ${ARGN}) - - if (NOT ${test_name} MATCHES "Tests$") - message(FATAL_ERROR "Unit test name must end with 'Tests' for lit to find it.") - endif() - - # LLVM_COMPONENTS is for LLVM_LINK_COMPONENTS deps, and must be before - # add_unittest. - list(APPEND LLVM_LINK_COMPONENTS ${ARG_LLVM_COMPONENTS}) - - add_unittest(ClangUnitTests ${test_name} ${ARG_UNPARSED_ARGUMENTS}) - - # Clang libs either come from the entire dylib, or individual libraries. - if (CLANG_LINK_CLANG_DYLIB) - list(APPEND ARG_LINK_LIBS clang-cpp) - else() - list(APPEND ARG_LINK_LIBS ${ARG_CLANG_LIBS}) - endif() - - # LINK_LIBS is for normal library dependencies. - target_link_libraries(${test_name} PRIVATE ${ARG_LINK_LIBS}) +# Produces a binary named 'basename(test_dirname)'. +function(add_clang_unittest test_dirname) + add_unittest(ClangUnitTests ${test_dirname} ${ARGN}) endfunction() add_subdirectory(Basic) diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt index f5bcecb0b08a3..a437f441568f2 100644 --- a/clang/unittests/CodeGen/CMakeLists.txt +++ b/clang/unittests/CodeGen/CMakeLists.txt @@ -1,9 +1,18 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + TargetParser + ) + add_clang_unittest(ClangCodeGenTests BufferSourceTest.cpp CodeGenExternalTest.cpp TBAAMetadataTest.cpp CheckTargetFeaturesTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(ClangCodeGenTests + PRIVATE clangAST clangBasic clangCodeGen @@ -11,8 +20,4 @@ add_clang_unittest(ClangCodeGenTests clangLex clangParse clangSerialization - LLVM_COMPONENTS - Core - Support - TargetParser ) diff --git a/clang/unittests/CrossTU/CMakeLists.txt b/clang/unittests/CrossTU/CMakeLists.txt index ee81c57ca1dce..222b7e83dc38c 100644 --- a/clang/unittests/CrossTU/CMakeLists.txt +++ b/clang/unittests/CrossTU/CMakeLists.txt @@ -1,12 +1,18 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Support + ) + add_clang_unittest(CrossTUTests CrossTranslationUnitTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(CrossTUTests + PRIVATE clangAST clangBasic clangCrossTU clangFrontend clangSerialization clangTooling - LLVM_COMPONENTS - Support ) diff --git a/clang/unittests/DirectoryWatcher/CMakeLists.txt b/clang/unittests/DirectoryWatcher/CMakeLists.txt index 58e0aee2d1076..38882c9ec2162 100644 --- a/clang/unittests/DirectoryWatcher/CMakeLists.txt +++ b/clang/unittests/DirectoryWatcher/CMakeLists.txt @@ -1,12 +1,17 @@ if(APPLE OR CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME STREQUAL Windows) + set(LLVM_LINK_COMPONENTS + Support + ) + add_clang_unittest(DirectoryWatcherTests DirectoryWatcherTest.cpp - LINK_LIBS + ) + + target_link_libraries(DirectoryWatcherTests + PRIVATE LLVMTestingSupport clangDirectoryWatcher - LLVM_COMPONENTS - Support 
) endif() diff --git a/clang/unittests/Driver/CMakeLists.txt b/clang/unittests/Driver/CMakeLists.txt index fa0e87c3318df..efdd07ea23889 100644 --- a/clang/unittests/Driver/CMakeLists.txt +++ b/clang/unittests/Driver/CMakeLists.txt @@ -1,3 +1,11 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + MC + Option + Support + TargetParser + ) + add_clang_unittest(ClangDriverTests DistroTest.cpp DXCModeTest.cpp @@ -7,15 +15,12 @@ add_clang_unittest(ClangDriverTests MultilibBuilderTest.cpp MultilibTest.cpp SanitizerArgsTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(ClangDriverTests + PRIVATE clangDriver clangBasic clangFrontend # For TextDiagnosticPrinter. clangSerialization - LLVM_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - MC - Option - Support - TargetParser ) diff --git a/clang/unittests/Format/CMakeLists.txt b/clang/unittests/Format/CMakeLists.txt index 5bd6a17182d29..71f5886d946c8 100644 --- a/clang/unittests/Format/CMakeLists.txt +++ b/clang/unittests/Format/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + add_clang_unittest(FormatTests BracesInserterTest.cpp BracesRemoverTest.cpp @@ -32,11 +36,12 @@ add_clang_unittest(FormatTests SortIncludesTest.cpp UsingDeclarationsSorterTest.cpp TokenAnnotatorTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(FormatTests + PRIVATE clangBasic clangFormat clangRewrite clangToolingCore - LLVM_COMPONENTS - Support ) diff --git a/clang/unittests/Frontend/CMakeLists.txt b/clang/unittests/Frontend/CMakeLists.txt index bbf0396014fa9..3c94846243870 100644 --- a/clang/unittests/Frontend/CMakeLists.txt +++ b/clang/unittests/Frontend/CMakeLists.txt @@ -1,3 +1,8 @@ +set(LLVM_LINK_COMPONENTS + Support + TargetParser + ) + add_clang_unittest(FrontendTests ASTUnitTest.cpp CompilerInvocationTest.cpp @@ -12,7 +17,9 @@ add_clang_unittest(FrontendTests OutputStreamTest.cpp TextDiagnosticTest.cpp UtilsTest.cpp - CLANG_LIBS + ) +clang_target_link_libraries(FrontendTests + PRIVATE clangAST clangBasic clangFrontend @@ -22,7 +29,4 @@ add_clang_unittest(FrontendTests clangFrontendTool clangSerialization clangTooling - LLVM_COMPONENTS - Support - TargetParser ) diff --git a/clang/unittests/Index/CMakeLists.txt b/clang/unittests/Index/CMakeLists.txt index 15e9ba0643eaf..ea940e9d7a9ef 100644 --- a/clang/unittests/Index/CMakeLists.txt +++ b/clang/unittests/Index/CMakeLists.txt @@ -1,6 +1,14 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Support + ) + add_clang_unittest(IndexTests IndexTests.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(IndexTests + PRIVATE clangAST clangBasic clangFrontend @@ -8,7 +16,4 @@ add_clang_unittest(IndexTests clangLex clangSerialization clangTooling - LLVM_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Support ) diff --git a/clang/unittests/InstallAPI/CMakeLists.txt b/clang/unittests/InstallAPI/CMakeLists.txt index c174fa3f87161..4255001ff51f1 100644 --- a/clang/unittests/InstallAPI/CMakeLists.txt +++ b/clang/unittests/InstallAPI/CMakeLists.txt @@ -1,8 +1,11 @@ add_clang_unittest(InstallAPITests HeaderFileTest.cpp FileListTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(InstallAPITests + PRIVATE clangInstallAPI - LINK_LIBS - LLVMTestingSupport ) + +target_link_libraries(InstallAPITests PRIVATE LLVMTestingSupport) diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt index 9df1a4b03da47..95378f9cfe737 100644 --- a/clang/unittests/Interpreter/CMakeLists.txt +++ b/clang/unittests/Interpreter/CMakeLists.txt @@ -1,3 +1,12 @@ +set(LLVM_LINK_COMPONENTS + 
${LLVM_TARGETS_TO_BUILD} + Core + MC + OrcJIT + Support + TargetParser + ) + add_clang_unittest(ClangReplInterpreterTests IncrementalCompilerBuilderTest.cpp IncrementalProcessingTest.cpp @@ -6,24 +15,16 @@ add_clang_unittest(ClangReplInterpreterTests CodeCompletionTest.cpp EXPORT_SYMBOLS + ) + +target_link_libraries(ClangReplInterpreterTests PUBLIC LLVMTestingSupport) - CLANG_LIBS +clang_target_link_libraries(ClangReplInterpreterTests PRIVATE clangAST clangBasic clangInterpreter clangFrontend clangSema - - LINK_LIBS - LLVMTestingSupport - - LLVM_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Core - MC - OrcJIT - Support - TargetParser ) # Exceptions on Windows are not yet supported. diff --git a/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt b/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt index eb366a860661c..24ae9cd78b5ca 100644 --- a/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt +++ b/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt @@ -3,22 +3,24 @@ set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Core + OrcJIT + Support + ) + add_clang_unittest(ClangReplInterpreterExceptionTests InterpreterExceptionTest.cpp + EXPORT_SYMBOLS + ) - CLANG_LIBS +llvm_update_compile_flags(ClangReplInterpreterExceptionTests) +target_link_libraries(ClangReplInterpreterExceptionTests PUBLIC clangAST clangBasic clangInterpreter clangFrontend - - LLVM_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Core - OrcJIT - Support ) - -llvm_update_compile_flags(ClangReplInterpreterExceptionTests) add_dependencies(ClangReplInterpreterExceptionTests clang-resource-headers) diff --git a/clang/unittests/Lex/CMakeLists.txt b/clang/unittests/Lex/CMakeLists.txt index 96ca6dda9cd85..5ec93946594b7 100644 --- a/clang/unittests/Lex/CMakeLists.txt +++ b/clang/unittests/Lex/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + add_clang_unittest(LexTests DependencyDirectivesScannerTest.cpp HeaderMapTest.cpp @@ -9,15 +13,19 @@ add_clang_unittest(LexTests PPConditionalDirectiveRecordTest.cpp PPDependencyDirectivesTest.cpp PPMemoryAllocationsTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(LexTests + PRIVATE clangAST clangBasic clangLex clangParse clangSema - LINK_LIBS + ) + +target_link_libraries(LexTests + PRIVATE LLVMTestingAnnotations LLVMTestingSupport - LLVM_COMPONENTS - Support ) diff --git a/clang/unittests/Rewrite/CMakeLists.txt b/clang/unittests/Rewrite/CMakeLists.txt index 498613254e72b..3c5e2f8e5354b 100644 --- a/clang/unittests/Rewrite/CMakeLists.txt +++ b/clang/unittests/Rewrite/CMakeLists.txt @@ -1,10 +1,14 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + add_clang_unittest(RewriteTests RewriterTest.cpp - CLANG_LIBS + ) +clang_target_link_libraries(RewriteTests + PRIVATE clangFrontend clangRewrite clangSerialization clangTooling - LLVM_COMPONENTS - Support ) diff --git a/clang/unittests/Sema/CMakeLists.txt b/clang/unittests/Sema/CMakeLists.txt index acc76c932afeb..17d39408000a4 100644 --- a/clang/unittests/Sema/CMakeLists.txt +++ b/clang/unittests/Sema/CMakeLists.txt @@ -1,3 +1,8 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + Support + ) + add_clang_unittest(SemaTests ExternalSemaSourceTest.cpp CodeCompleteTest.cpp @@ -5,7 +10,10 @@ add_clang_unittest(SemaTests GslOwnerPointerInference.cpp SemaLookupTest.cpp SemaNoloadLookupTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(SemaTests + PRIVATE clangAST clangASTMatchers clangBasic @@ -14,11 +22,11 @@ add_clang_unittest(SemaTests clangSema clangSerialization 
clangTooling - LINK_LIBS + ) + +target_link_libraries(SemaTests + PRIVATE LLVMTestingAnnotations LLVMTestingSupport clangTesting - LLVM_COMPONENTS - FrontendOpenMP - Support ) diff --git a/clang/unittests/Serialization/CMakeLists.txt b/clang/unittests/Serialization/CMakeLists.txt index 6782e6b4d7330..e7005b5d511eb 100644 --- a/clang/unittests/Serialization/CMakeLists.txt +++ b/clang/unittests/Serialization/CMakeLists.txt @@ -1,3 +1,10 @@ +set(LLVM_LINK_COMPONENTS + BitReader + BitstreamReader + FrontendOpenMP + Support + ) + add_clang_unittest(SerializationTests ForceCheckFileInputTest.cpp InMemoryModuleCacheTest.cpp @@ -7,7 +14,10 @@ add_clang_unittest(SerializationTests LoadSpecLazilyTest.cpp SourceLocationEncodingTest.cpp VarDeclConstantInitTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(SerializationTests + PRIVATE clangAST clangBasic clangFrontend @@ -16,9 +26,4 @@ add_clang_unittest(SerializationTests clangSerialization clangTooling clangASTMatchers - LLVM_COMPONENTS - BitReader - BitstreamReader - FrontendOpenMP - Support ) diff --git a/clang/unittests/StaticAnalyzer/CMakeLists.txt b/clang/unittests/StaticAnalyzer/CMakeLists.txt index 143b7eedbfe05..3b01a4e9e5327 100644 --- a/clang/unittests/StaticAnalyzer/CMakeLists.txt +++ b/clang/unittests/StaticAnalyzer/CMakeLists.txt @@ -1,3 +1,8 @@ +set(LLVM_LINK_COMPONENTS + FrontendOpenMP + Support + ) + add_clang_unittest(StaticAnalysisTests AnalyzerOptionsTest.cpp APSIntTypeTest.cpp @@ -20,7 +25,10 @@ add_clang_unittest(StaticAnalysisTests SValTest.cpp TestReturnValueUnderConstruction.cpp Z3CrosscheckOracleTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(StaticAnalysisTests + PRIVATE clangBasic clangAnalysis clangAST @@ -31,9 +39,9 @@ add_clang_unittest(StaticAnalysisTests clangStaticAnalyzerCore clangStaticAnalyzerFrontend clangTooling - LINK_LIBS + ) + +target_link_libraries(StaticAnalysisTests + PRIVATE clangTesting - LLVM_COMPONENTS - FrontendOpenMP - Support ) diff --git a/clang/unittests/Support/CMakeLists.txt b/clang/unittests/Support/CMakeLists.txt index d0ce4f6d10617..22be5ed18cc7a 100644 --- a/clang/unittests/Support/CMakeLists.txt +++ b/clang/unittests/Support/CMakeLists.txt @@ -1,10 +1,15 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + add_clang_unittest(ClangSupportTests TimeProfilerTest.cpp - CLANG_LIBS + ) + +clang_target_link_libraries(ClangSupportTests + PRIVATE clangAST clangBasic clangFrontend clangSerialization - LLVM_COMPONENTS - Support ) diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt index 106c6b9dc38bd..401978c31863c 100644 --- a/clang/unittests/Tooling/CMakeLists.txt +++ b/clang/unittests/Tooling/CMakeLists.txt @@ -1,3 +1,13 @@ +set(LLVM_LINK_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + MC + Option + FrontendOpenMP + Support + TargetParser + ) + + add_clang_unittest(ToolingTests ASTSelectionTest.cpp CastExprTest.cpp @@ -59,8 +69,10 @@ add_clang_unittest(ToolingTests StencilTest.cpp ToolingTest.cpp TransformerTest.cpp + ) - CLANG_LIBS +clang_target_link_libraries(ToolingTests + PRIVATE clangAST clangASTMatchers clangBasic @@ -77,19 +89,13 @@ add_clang_unittest(ToolingTests clangToolingInclusionsStdlib clangToolingRefactoring clangTransformer + ) - LINK_LIBS +target_link_libraries(ToolingTests + PRIVATE LLVMTestingAnnotations LLVMTestingSupport clangTesting - - LLVM_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - MC - Option - FrontendOpenMP - Support - TargetParser - ) +) add_subdirectory(Syntax) diff --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt 
b/clang/unittests/Tooling/Syntax/CMakeLists.txt index db110fefa954f..ff3b6176f879f 100644 --- a/clang/unittests/Tooling/Syntax/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_LINK_COMPONENTS + Support + ) + add_clang_unittest(SyntaxTests TreeTestBase.cpp BuildTreeTest.cpp @@ -5,8 +9,10 @@ add_clang_unittest(SyntaxTests SynthesisTest.cpp TreeTest.cpp TokensTest.cpp +) - CLANG_LIBS +clang_target_link_libraries(SyntaxTests + PRIVATE clangAST clangBasic clangFrontend @@ -15,12 +21,11 @@ add_clang_unittest(SyntaxTests clangTooling clangToolingCore clangToolingSyntax + ) - LINK_LIBS +target_link_libraries(SyntaxTests + PRIVATE clangTesting LLVMTestingAnnotations LLVMTestingSupport - - LLVM_COMPONENTS - Support ) diff --git a/clang/unittests/libclang/CMakeLists.txt b/clang/unittests/libclang/CMakeLists.txt index ba86c3c4d91e0..b3644a0e710e1 100644 --- a/clang/unittests/libclang/CMakeLists.txt +++ b/clang/unittests/libclang/CMakeLists.txt @@ -1,6 +1,9 @@ add_clang_unittest(libclangTests LibclangTest.cpp - LINK_LIBS + ) + +target_link_libraries(libclangTests + PRIVATE libclang ) diff --git a/clang/unittests/libclang/CrashTests/CMakeLists.txt b/clang/unittests/libclang/CrashTests/CMakeLists.txt index de7b5a8f6ee91..82f0e4c16e901 100644 --- a/clang/unittests/libclang/CrashTests/CMakeLists.txt +++ b/clang/unittests/libclang/CrashTests/CMakeLists.txt @@ -1,5 +1,8 @@ add_clang_unittest(libclangCrashTests LibclangCrashTest.cpp - LINK_LIBS + ) + +target_link_libraries(libclangCrashTests + PRIVATE libclang ) From d40bab359c408b0084cd3c115213205050401a9e Mon Sep 17 00:00:00 2001 From: donald chen Date: Wed, 2 Apr 2025 11:56:13 +0800 Subject: [PATCH 0329/1029] [mlir][liveness] fix bugs in liveness analysis (#133416) This patch fixes the following bugs: - In SparseBackwardAnalysis, the setToExitState function should propagate changes if it modifies the lattice. Previously, this issue was masked because multi-block scenarios were not tested, and the traversal order of backward data flow analysis starts from the end of the program. - The method in liveness analysis for determining whether the non-forwarded operand in branch/region branch operations is live is incorrect, which may cause originally live variables to be marked as not live. --- .../mlir/Analysis/DataFlow/SparseAnalysis.h | 6 +- .../Analysis/DataFlow/LivenessAnalysis.cpp | 93 ++++++++++++++----- .../DataFlow/test-liveness-analysis.mlir | 37 +++++++- 3 files changed, 107 insertions(+), 29 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h index b9cb549a0e438..1b2c679176107 100644 --- a/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/SparseAnalysis.h @@ -413,10 +413,12 @@ class AbstractSparseBackwardDataFlowAnalysis : public DataFlowAnalysis { // Visit operands on call instructions that are not forwarded. virtual void visitCallOperand(OpOperand &operand) = 0; - /// Set the given lattice element(s) at control flow exit point(s). + /// Set the given lattice element(s) at control flow exit point(s) and + /// propagate the update if it changed. virtual void setToExitState(AbstractSparseLattice *lattice) = 0; - /// Set the given lattice element(s) at control flow exit point(s). + /// Set the given lattice element(s) at control flow exit point(s) and + /// propagate the update if it changed. void setAllToExitStates(ArrayRef<AbstractSparseLattice *> lattices); /// Get the lattice element for a value. 
diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp index 9fb4d9df2530d..c12149a1a0242 100644 --- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp @@ -59,7 +59,9 @@ ChangeResult Liveness::meet(const AbstractSparseLattice &other) { /// (1.a) is an operand of an op with memory effects OR /// (1.b) is a non-forwarded branch operand and its branch op could take the /// control to a block that has an op with memory effects OR -/// (1.c) is a non-forwarded call operand. +/// (1.c) is a non-forwarded branch operand and its branch op could result +/// in a different live result OR +/// (1.d) is a non-forwarded call operand. /// /// A value `A` is said to be "used to compute" value `B` iff `B` cannot be /// computed in the absence of `A`. Thus, in this implementation, we say that @@ -106,51 +108,88 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { // the forwarded branch operands or the non-branch operands. Thus they need // to be handled separately. This is where we handle them. - // This marks values of type (1.b) liveness as "live". A non-forwarded + // This marks values of type (1.b/1.c) liveness as "live". A non-forwarded // branch operand will be live if a block where its op could take the control - // has an op with memory effects. + // has an op with memory effects or could result in different results. // Populating such blocks in `blocks`. + bool mayLive = false; SmallVector<Block *> blocks; if (isa<RegionBranchOpInterface>(op)) { - // When the op is a `RegionBranchOpInterface`, like an `scf.for` or an - // `scf.index_switch` op, its branch operand controls the flow into this - // op's regions. - for (Region &region : op->getRegions()) { - for (Block &block : region) - blocks.push_back(&block); + if (op->getNumResults() != 0) { + // This marks values of type (1.c) liveness as "may live": the region + // branch operation has results, and the non-forwarded operand can + // determine the region to jump to, so it can control the results of + // the region branch operation. + // Therefore, if any result value is live, we conservatively consider the + // non-forwarded operand of the region branch operation "may live" and + // check all results. + for (Value result : op->getResults()) { + if (getLatticeElement(result)->isLive) { + mayLive = true; + break; + } + } + } else { + // When the op is a `RegionBranchOpInterface`, like an `scf.for` or an + // `scf.index_switch` op, its branch operand controls the flow into this + // op's regions. + for (Region &region : op->getRegions()) { + for (Block &block : region) + blocks.push_back(&block); + } } } else if (isa<BranchOpInterface>(op)) { - // When the op is a `BranchOpInterface`, like a `cf.cond_br` or a - // `cf.switch` op, its branch operand controls the flow into this op's - // successors. - blocks = op->getSuccessors(); + // We cannot track all successor blocks of the branch operation (more + // specifically, the successors' successors). Additionally, different + // successors might also lead to the different block arguments described + // in (1.c). Therefore, we conservatively consider the non-forwarded + // operand of the branch operation "may live". + mayLive = true; } else { - // When the op is a `RegionBranchTerminatorOpInterface`, like an - // `scf.condition` op or return-like, like an `scf.yield` op, its branch - // operand controls the flow into this op's parent's (which is a - // `RegionBranchOpInterface`'s) regions. 
Operation *parentOp = op->getParentOp(); assert(isa<RegionBranchOpInterface>(parentOp) && "expected parent op to implement `RegionBranchOpInterface`"); - for (Region &region : parentOp->getRegions()) { - for (Block &block : region) - blocks.push_back(&block); + if (parentOp->getNumResults() != 0) { + // This marks values of type (1.c) liveness as "may live": the region + // branch operation has results, and the non-forwarded operand can + // determine the region to jump to, so it can control the results of + // the region branch operation. + // Therefore, if any result value is live, we conservatively consider the + // non-forwarded operand of the region branch operation "may live" and + // check all results. + for (Value result : parentOp->getResults()) { + if (getLatticeElement(result)->isLive) { + mayLive = true; + break; + } + } + } else { + // When the op is a `RegionBranchTerminatorOpInterface`, like an + // `scf.condition` op or return-like, like an `scf.yield` op, its branch + // operand controls the flow into this op's parent's (which is a + // `RegionBranchOpInterface`'s) regions. + for (Region &region : parentOp->getRegions()) { + for (Block &block : region) + blocks.push_back(&block); + } } } - bool foundMemoryEffectingOp = false; for (Block *block : blocks) { - if (foundMemoryEffectingOp) + if (mayLive) break; for (Operation &nestedOp : *block) { if (!isMemoryEffectFree(&nestedOp)) { - Liveness *operandLiveness = getLatticeElement(operand.get()); - propagateIfChanged(operandLiveness, operandLiveness->markLive()); - foundMemoryEffectingOp = true; + mayLive = true; break; } } } + if (mayLive) { + Liveness *operandLiveness = getLatticeElement(operand.get()); + propagateIfChanged(operandLiveness, operandLiveness->markLive()); + } + // Now that we have checked for memory-effecting ops in the blocks of concern, // we will simply visit the op with this non-forwarded operand to potentially // mark it "live" due to type (1.a/3) liveness. @@ -191,8 +230,12 @@ void LivenessAnalysis::visitCallOperand(OpOperand &operand) { } void LivenessAnalysis::setToExitState(Liveness *lattice) { + if (lattice->isLive) { + return; + } // This marks values of type (2) liveness as "live". 
(void)lattice->markLive(); + propagateIfChanged(lattice, ChangeResult::Change); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir b/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir index b6aed1c0b054e..a89a0f4084e99 100644 --- a/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir +++ b/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir @@ -59,16 +59,49 @@ func.func @test_3_BranchOpInterface_type_1.b(%arg0: i32, %arg1: memref, %ar // ----- +// Positive test: Type(1.c) "is a non-forwarded branch operand and its branch +// op could result in different result" +// CHECK-LABEL: test_tag: cond_br: +// CHECK-NEXT: operand #0: live +// CHECK-NEXT: operand #1: live +// CHECK-NEXT: operand #2: live +func.func @test_branch_result_in_different_result_1.c(%arg0 : tensor, %arg1 : tensor, %arg2 : i1) -> tensor { + cf.cond_br %arg2, ^bb1(%arg0 : tensor), ^bb2(%arg1 : tensor) {tag = "cond_br"} +^bb1(%0 : tensor): + cf.br ^bb3(%0 : tensor) +^bb2(%1 : tensor): + cf.br ^bb3(%1 : tensor) +^bb3(%2 : tensor): + return %2 : tensor +} + +// ----- + +// Positive test: Type(1.c) "is a non-forwarded branch operand and its branch +// op could result in different result" +// CHECK-LABEL: test_tag: region_branch: +// CHECK-NEXT: operand #0: live +func.func @test_region_branch_result_in_different_result_1.c(%arg0 : tensor, %arg1 : tensor, %arg2 : i1) -> tensor { + %0 = scf.if %arg2 -> tensor { + scf.yield %arg0 : tensor + } else { + scf.yield %arg1 : tensor + } {tag="region_branch"} + return %0 : tensor +} + +// ----- + func.func private @private(%arg0 : i32, %arg1 : i32) { func.return } -// Positive test: Type (1.c) "is a non-forwarded call operand" +// Positive test: Type (1.d) "is a non-forwarded call operand" // CHECK-LABEL: test_tag: call // CHECK-LABEL: operand #0: not live // CHECK-LABEL: operand #1: not live // CHECK-LABEL: operand #2: live -func.func @test_4_type_1.c(%arg0: i32, %arg1: i32, %device: i32, %m0: memref) { +func.func @test_4_type_1.d(%arg0: i32, %arg1: i32, %device: i32, %m0: memref) { test.call_on_device @private(%arg0, %arg1), %device {tag = "call"} : (i32, i32, i32) -> () return } From 0cfabd37df9940346f3bf8a4d74c19e1f48a00e9 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Tue, 1 Apr 2025 21:41:30 -0700 Subject: [PATCH 0330/1029] [RISCV] Add Xqci Insn Formats (#132986) --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 22 +- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 8 +- llvm/lib/Target/RISCV/RISCVInstrFormats.td | 8 +- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 45 ++-- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 6 - llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 215 ++++++++++++++++++ llvm/test/MC/RISCV/insn_xqci-invalid.s | 111 +++++++++ llvm/test/MC/RISCV/insn_xqci.s | 41 ++++ llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s | 2 +- 9 files changed, 429 insertions(+), 29 deletions(-) create mode 100644 llvm/test/MC/RISCV/insn_xqci-invalid.s create mode 100644 llvm/test/MC/RISCV/insn_xqci.s diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 8f9a5ae75fca7..5c940f95a0a41 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -867,6 +867,16 @@ struct RISCVOperand final : public MCParsedAsmOperand { [](int64_t Imm) { return Imm != 0 && isShiftedInt<6, 4>(Imm); }); } + bool isSImm16() const { + if (!isImm()) + return false; + 
RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None; + int64_t Imm; + bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); + return IsConstantImm && isInt<16>(fixImmediateForRV32(Imm, isRV64Imm())) && + VK == RISCVMCExpr::VK_None; + } + bool isSImm16NonZero() const { return isSImmPred([](int64_t Imm) { return Imm != 0 && isInt<16>(Imm); }); } @@ -1511,6 +1521,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 12), (1 << 12) - 2, "immediate must be a multiple of 2 bytes in the range"); + case Match_InvalidSImm16: + return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 15), + (1 << 15) - 1); case Match_InvalidSImm16NonZero: return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 15), (1 << 15) - 1, @@ -3150,10 +3163,13 @@ bool RISCVAsmParser::parseDirectiveAttribute() { return false; } -bool isValidInsnFormat(StringRef Format, bool AllowC) { +bool isValidInsnFormat(StringRef Format, const MCSubtargetInfo &STI) { return StringSwitch(Format) .Cases("r", "r4", "i", "b", "sb", "u", "j", "uj", "s", true) - .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj", AllowC) + .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj", + STI.hasFeature(RISCV::FeatureStdExtZca)) + .Cases("qc.eai", "qc.ei", "qc.eb", "qc.ej", "qc.es", + !STI.hasFeature(RISCV::Feature64Bit)) .Default(false); } @@ -3243,7 +3259,7 @@ bool RISCVAsmParser::parseDirectiveInsn(SMLoc L) { return false; } - if (!isValidInsnFormat(Format, AllowC)) + if (!isValidInsnFormat(Format, getSTI())) return Error(ErrorLoc, "invalid instruction format"); std::string FormatName = (".insn_" + Format).str(); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index d6672de02862d..adccd1e6c5002 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -51,7 +51,12 @@ enum { InstFormatCLH = 19, InstFormatCSB = 20, InstFormatCSH = 21, - InstFormatOther = 22, + InstFormatQC_EAI = 22, + InstFormatQC_EI = 23, + InstFormatQC_EB = 24, + InstFormatQC_EJ = 25, + InstFormatQC_ES = 26, + InstFormatOther = 31, InstFormatMask = 31, InstFormatShift = 0, @@ -333,6 +338,7 @@ enum OperandType : unsigned { OPERAND_SIMM11, OPERAND_SIMM12, OPERAND_SIMM12_LSB00000, + OPERAND_SIMM16, OPERAND_SIMM16_NONZERO, OPERAND_SIMM20, OPERAND_SIMM26, diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index d95e806b79f25..0bb0ba57ff50d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -52,7 +52,13 @@ def InstFormatCLB : InstFormat<18>; def InstFormatCLH : InstFormat<19>; def InstFormatCSB : InstFormat<20>; def InstFormatCSH : InstFormat<21>; -def InstFormatOther : InstFormat<22>; +def InstFormatQC_EAI : InstFormat<22>; +def InstFormatQC_EI : InstFormat<23>; +def InstFormatQC_EB : InstFormat<24>; +def InstFormatQC_EJ : InstFormat<25>; +def InstFormatQC_ES : InstFormat<26>; +def InstFormatOther : InstFormat<31>; + class RISCVVConstraint val> { bits<3> Value = val; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 89e5ad8067c1b..c87452171f090 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1144,6 +1144,33 @@ def AnyReg : Operand { let ParserMatchClass = AnyRegOperand; } +// isCodeGenOnly = 1 to hide them from the tablegened 
assembly parser. +let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, + hasNoSchedulingInfo = 1 in { +def Insn16 : RVInst16<(outs), (ins uimm16:$value), "", "", [], InstFormatOther> { + bits<16> value; + + let Inst{15-0} = value; + let AsmString = ".insn 0x2, $value"; +} +def Insn32 : RVInst<(outs), (ins uimm32:$value), "", "", [], InstFormatOther> { + bits<32> value; + + let Inst{31-0} = value; + let AsmString = ".insn 0x4, $value"; +} +def Insn48 : RVInst48<(outs), (ins uimm48:$value), "", "", [], InstFormatOther> { + bits<48> value; + let Inst{47-0} = value; + let AsmString = ".insn 0x6, $value"; +} +def Insn64 : RVInst64<(outs), (ins uimm64:$value), "", "", [], InstFormatOther> { + bits<64> value; + let Inst{63-0} = value; + let AsmString = ".insn 0x8, $value"; +} +} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo + // isCodeGenOnly = 1 to hide them from the tablegened assembly parser. let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, hasNoSchedulingInfo = 1 in { @@ -1179,23 +1206,7 @@ def InsnS : DirectiveInsnS<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rs2, ${imm12}(${rs1})">; -def Insn32 : RVInst<(outs), (ins uimm32:$value), "", "", [], InstFormatOther> { - bits<32> value; - - let Inst{31-0} = value; - let AsmString = ".insn 0x4, $value"; -} -def Insn48 : RVInst48<(outs), (ins uimm48:$value), "", "", [], InstFormatOther> { - bits<48> value; - let Inst{47-0} = value; - let AsmString = ".insn 0x6, $value"; -} -def Insn64 : RVInst64<(outs), (ins uimm64:$value), "", "", [], InstFormatOther> { - bits<64> value; - let Inst{63-0} = value; - let AsmString = ".insn 0x8, $value"; -} -} +} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo // Use InstAliases to match these so that we can combine the insn and format // into a mnemonic to use as the key for the tablegened asm matcher table. 
The diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 718d95aa1a4bc..1c94af58880f2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -799,12 +799,6 @@ def InsnCJ : DirectiveInsnCJ<(outs), (ins uimm2_opcode:$opcode, uimm3:$funct3, bare_simm12_lsb0:$imm11), "$opcode, $funct3, $imm11">; -def Insn16 : RVInst16<(outs), (ins uimm16:$value), "", "", [], InstFormatOther> { - bits<16> value; - - let Inst{15-0} = value; - let AsmString = ".insn 0x2, $value"; -} } // Use InstAliases to match these so that we can combine the insn and format diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 23feb52a0c2ca..a99cebe666808 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -94,6 +94,8 @@ def simm5nonzero : RISCVOp, def simm11 : RISCVSImmLeafOp<11>; +def simm16 : RISCVSImmOp<16>; + def simm16nonzero : RISCVOp, ImmLeaf(Imm);}]> { let ParserMatchClass = SImmAsmOperand<16, "NonZero">; @@ -139,6 +141,219 @@ def simm32_lsb0 : Operand { // Instruction Formats //===----------------------------------------------------------------------===// + +class DirectiveInsnQC_EAI + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<1> func1; + + bits<5> rd; + bits<32> imm32; + + let Inst{47-16} = imm32; + let Inst{15} = func1; + let Inst{14-12} = func3; + let Inst{11-7} = rd; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.eai " # argstr; +} + +class DirectiveInsnQC_EI + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<2> func2; + + bits<5> rd; + bits<5> rs1; + bits<26> imm26; + + let Inst{47-32} = imm26{25-10}; + let Inst{31-30} = func2; + let Inst{29-20} = imm26{9-0}; + let Inst{19-15} = rs1; + let Inst{14-12} = func3; + let Inst{11-7} = rd; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.ei " # argstr; +} + +class DirectiveInsnQC_EB + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<5> func5; + + bits<5> rs1; + bits<12> imm12; // This one is the PC-relative offset + bits<16> imm16; + + let Inst{47-32} = imm16; + let Inst{31} = imm12{11}; + let Inst{30-25} = imm12{9-4}; + let Inst{24-20} = func5; + let Inst{19-15} = rs1; + let Inst{14-12} = func3; + let Inst{11-8} = imm12{3-0}; + let Inst{7} = imm12{10}; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.eb " # argstr; +} + +class DirectiveInsnQC_EJ + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<2> func2; + bits<5> func5; + + bits<31> imm31; + + let Inst{47-32} = imm31{30-15}; + let Inst{31} = imm31{11}; + let Inst{30-25} = imm31{9-4}; + let Inst{24-20} = func5; + let Inst{19-17} = imm31{14-12}; + let Inst{16-15} = func2; + let Inst{14-12} = func3; + let Inst{11-8} = imm31{3-0}; + let Inst{7} = imm31{10}; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.ej " # argstr; +} + +class DirectiveInsnQC_ES + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<2> func2; + + bits<5> rs1; + bits<5> rs2; + bits<26> imm26; + + let Inst{47-32} = imm26{25-10}; + let Inst{31-30} = func2; + let Inst{29-25} = imm26{9-5}; + let Inst{24-20} = rs2; + let Inst{19-15} = rs1; + let Inst{14-12} = func3; + let Inst{11-7} = imm26{4-0}; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.es " # argstr; +} + + +let isCodeGenOnly = true, hasSideEffects = true, mayLoad = true, + mayStore = true, hasNoSchedulingInfo = true, Predicates=[IsRV32] in { +def InsnQC_EAI : DirectiveInsnQC_EAI<(outs AnyReg:$rd), + (ins 
uimm7_opcode:$opcode, + uimm3:$func3, + uimm1:$func1, + simm32:$imm32), + "$opcode, $func3, $func1, $rd, $imm32">; +def InsnQC_EI : DirectiveInsnQC_EI<(outs AnyReg:$rd), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26), + "$opcode, $func3, $func2, $rd, $rs1, $imm26">; +def InsnQC_EI_Mem : DirectiveInsnQC_EI<(outs AnyReg:$rd), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26), + "$opcode, $func3, $func2, $rd, ${imm26}(${rs1})">; +def InsnQC_EB : DirectiveInsnQC_EB<(outs), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm5:$func5, + AnyReg:$rs1, + simm16:$imm16, + simm13_lsb0:$imm12), + "$opcode, $func3, $func5, $rs1, $imm16, $imm12">; +def InsnQC_EJ : DirectiveInsnQC_EJ<(outs), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + uimm5:$func5, + simm32_lsb0:$imm31), + "$opcode, $func3, $func2, $func5, $imm31">; +def InsnQC_ES : DirectiveInsnQC_ES<(outs), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs2, + AnyReg:$rs1, + simm26:$imm26), + "$opcode, $func3, $func2, $rs2, ${imm26}(${rs1})">; +} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo, Predicates + +let EmitPriority = 0, Predicates = [IsRV32] in { +def : InstAlias<".insn_qc.eai $opcode, $func3, $func1, $rd, $imm32", + (InsnQC_EAI AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm1:$func1, + simm32:$imm32)>; +def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, $rs1, $imm26", + (InsnQC_EI AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26)>; +def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, ${imm26}(${rs1})", + (InsnQC_EI_Mem AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26)>; +def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, (${rs1})", + (InsnQC_EI_Mem AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + 0)>; +def : InstAlias<".insn_qc.eb $opcode, $func3, $func5, $rs1, $imm16, $imm12", + (InsnQC_EB uimm7_opcode:$opcode, + uimm3:$func3, + uimm5:$func5, + AnyReg:$rs1, + simm16:$imm16, + simm13_lsb0:$imm12)>; +def : InstAlias<".insn_qc.ej $opcode, $func3, $func2, $func5, $imm31", + (InsnQC_EJ uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + uimm5:$func5, + simm32_lsb0:$imm31)>; +def : InstAlias<".insn_qc.es $opcode, $func3, $func2, $rs2, ${imm26}(${rs1})", + (InsnQC_ES uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs2, + AnyReg:$rs1, + simm26:$imm26)>; +def : InstAlias<".insn_qc.es $opcode, $func3, $func2, $rs2, (${rs1})", + (InsnQC_ES uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs2, + AnyReg:$rs1, + 0)>; +} // EmitPriority = 0, Predicates = [IsRV32] + //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/RISCV/insn_xqci-invalid.s b/llvm/test/MC/RISCV/insn_xqci-invalid.s new file mode 100644 index 0000000000000..8177adaf8ac50 --- /dev/null +++ b/llvm/test/MC/RISCV/insn_xqci-invalid.s @@ -0,0 +1,111 @@ +# RUN: not llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ +# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s + +.insn qc.eai 128, 0, 0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:14: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.eai 127, 8, 0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:19: 
error: immediate must be an integer in the range [0, 7] + +.insn qc.eai 127, 7, 2, x0, 0 +# CHECK-ERR: [[@LINE-1]]:22: error: immediate must be an integer in the range [0, 1] + +.insn qc.eai 127, 7, 1, not_a_reg, 0 +# CHECK-ERR: [[@LINE-1]]:25: error: invalid operand for instruction + +.insn qc.eai 127, 7, 1, x31, 0x100000000 +# CHECK-ERR: [[@LINE-1]]:30: error: immediate must be an integer in the range [-2147483648, 4294967295] + +.insn qc.eai 126, 7, 1, x31, 0xFFFFFFFF, extra +# CHECK-ERR: [[@LINE-1]]:42: error: invalid operand for instruction + +.insn qc.ei 128, 0, 0, x31, x0, 0 +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.ei 127, 8, 0, x0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.ei 127, 7, 4, x0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] + +.insn qc.ei 127, 7, 3, not_a_reg, x0, 0 +# CHECK-ERR: [[@LINE-1]]:24: error: invalid operand for instruction + +.insn qc.ei 127, 7, 3, x31, not_a_reg, 0 +# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.ei 127, 7, 3, x31, x31, 0x2000000 +# CHECK-ERR: [[@LINE-1]]:34: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.ei 127, 7, 3, x31, x31, 0x1000000, extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction + +.insn qc.ei 126, 7, 3, x31, 0x2000000(x0) +# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.ei 126, 7, 3, x31, 0x1000000(not_a_reg) +# CHECK-ERR: [[@LINE-1]]:39: error: expected register + +.insn qc.ei 126, 7, 3, x31, 0x1000000(x31), extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction + +.insn qc.eb 128, 0, 0, x0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.eb 127, 8, 0, x0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.eb 127, 7, 32, x0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] + +.insn qc.eb 127, 7, 31, not_a_reg, 0, 0 +# CHECK-ERR: [[@LINE-1]]:25: error: invalid operand for instruction + +.insn qc.eb 127, 7, 31, x31, 0x8000, 0 +# CHECK-ERR: [[@LINE-1]]:30: error: immediate must be an integer in the range [-32768, 32767] + +.insn qc.eb 127, 7, 31, x31, 0x4000, 0x1000 +# CHECK-ERR: [[@LINE-1]]:38: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] + +.insn qc.eb 127, 7, 31, x31, 0x4000, 0x800, extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction + + +.insn qc.ej 128, 0, 0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.ej 127, 8, 0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.ej 127, 7, 4, 0, 0 +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] + +.insn qc.ej 127, 7, 3, 32, 0 +# CHECK-ERR: [[@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] + +.insn qc.ej 127, 7, 3, 31, 0x100000000 +# CHECK-ERR: [[@LINE-1]]:28: error: operand must be a multiple of 2 bytes in the range [-2147483648, 2147483646] + +.insn qc.ej 127, 7, 3, 31, 0x80000000, extra +# CHECK-ERR: [[@LINE-1]]:40: error: invalid operand for instruction + +.insn qc.es 128, 0, 0, x0, 
0(x0) +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.es 127, 8, 0, x0, 0(x0) +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.es 127, 7, 4, x0, 0(x0) +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] + +.insn qc.es 127, 7, 3, not_a_reg, 0(x0) +# CHECK-ERR: [[@LINE-1]]:24: error: invalid operand for instruction + +.insn qc.es 127, 7, 3, x31, 0x2000000(x0) +# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.es 127, 7, 3, x31, 0x1000000(not_a_reg) +# CHECK-ERR: [[@LINE-1]]:39: error: expected register + +.insn qc.es 127, 7, 3, x31, 0x1000000(x31), extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/insn_xqci.s b/llvm/test/MC/RISCV/insn_xqci.s new file mode 100644 index 0000000000000..098745ec22294 --- /dev/null +++ b/llvm/test/MC/RISCV/insn_xqci.s @@ -0,0 +1,41 @@ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcilia,+experimental-xqcilo,+experimental-xqcibi,+experimental-xqcilb \ +# RUN: -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s + +# CHECK-ASM: .insn qc.eai 31, 2, 0, a0, 16711935 +# CHECK-ASM: encoding: [0x1f,0x25,0xff,0x00,0xff,0x00] +# CHECK-OBJ: qc.e.addai a0, 0xff00ff +.insn qc.eai 0x1f, 2, 0, a0, 0x00FF00FF + +# CHECK-ASM: .insn qc.ei 31, 3, 2, a0, a1, 16711935 +# CHECK-ASM: encoding: [0x1f,0xb5,0xf5,0x8f,0xc0,0x3f] +# CHECK-OBJ: qc.e.addi a0, a1, 0xff00ff +.insn qc.ei 0x1f, 3, 2, a0, a1, 0x00FF00FF + +# CHECK-ASM: .insn qc.ei 31, 5, 0, a1, 16711935(a0) +# CHECK-ASM: encoding: [0x9f,0x55,0xf5,0x0f,0xc0,0x3f] +# CHECK-OBJ: qc.e.lb a1, 0xff00ff(a0) +.insn qc.ei 0x1f, 5, 0, a1, 0x00FF00FF(a0) + +# CHECK-ASM: .insn qc.ei 31, 5, 0, a1, 0(a0) +# CHECK-ASM: encoding: [0x9f,0x55,0x05,0x00,0x00,0x00] +# CHECK-OBJ: qc.e.lb a1, 0x0(a0) +.insn qc.ei 0x1f, 5, 0, a1, (a0) + +# CHECK-ASM: .insn qc.eb 31, 4, 24, a0, 17476, 22 +# CHECK-ASM: encoding: [0x1f,0x4b,0x85,0x01,0x44,0x44] +# CHECK-OBJ: qc.e.beqi a0, 0x4444, 0x2e +.insn qc.eb 0x1f, 4, 24, a0, 0x4444, 22 + +# CHECK-ASM: .insn qc.ej 31, 4, 0, 0, 22 +# CHECK-ASM: encoding: [0x1f,0x4b,0x00,0x00,0x00,0x00] +# CHECK-OBJ: qc.e.j 0x34 +.insn qc.ej 0x1f, 4, 0, 0, 22 + +# CHECK-ASM: .insn qc.es 31, 6, 1, a1, 0(a0) +# CHECK-ASM: encoding: [0x1f,0x60,0xb5,0x40,0x00,0x00] +# CHECK-OBJ: qc.e.sb a1, 0x0(a0) +.insn qc.es 0x1f, 6, 1, a1, (a0) diff --git a/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s b/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s index fe6d0de0a4b00..e45c43a50048a 100644 --- a/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s +++ b/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s @@ -1,7 +1,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+xtheadmemidx < %s 2>&1 | FileCheck %s # RUN: not llvm-mc -triple riscv64 -mattr=+xtheadmemidx < %s 2>&1 | FileCheck %s -th.ldia 0(a0), (a1), 0, 0 # CHECK: :[[@LINE]]:23: error: invalid operand for instruction +th.ldia 0(a0), (a1), 0, 0 # CHECK: :[[@LINE]]:26: error: invalid operand for instruction th.ldib a0, 2(a1), 15, 1 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction th.lwia a0, (a1), 30, 2 # CHECK: :[[@LINE]]:20: error: immediate must be an integer in the range [-16, 15] th.lwib a0, (a1), -16, 43 # CHECK: :[[@LINE]]:25: error: 
immediate must be an integer in the range [0, 3] From adbe9e20bf157fda89a8896d3832dc9e3cd4351c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 2 Apr 2025 11:41:47 +0700 Subject: [PATCH 0331/1029] llvm-reduce: Skip setting br conditions on already constant branches (#133841) If we are trying to simplify branch conditions to true, ignore branches already set to a constant true. If we are simplifying to constant false, ignore the already constant false cases. This saves steps in this edge case, and avoids the side effect of running simplifycfg on blocks we did not intend to modify. --- ...fy-conditionals-already-constant-brcond.ll | 139 ++++++++++++++++++ .../deltas/ReduceUsingSimplifyCFG.cpp | 13 +- 2 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 llvm/test/tools/llvm-reduce/simplify-conditionals-already-constant-brcond.ll diff --git a/llvm/test/tools/llvm-reduce/simplify-conditionals-already-constant-brcond.ll b/llvm/test/tools/llvm-reduce/simplify-conditionals-already-constant-brcond.ll new file mode 100644 index 0000000000000..bcb1effd1fb71 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/simplify-conditionals-already-constant-brcond.ll @@ -0,0 +1,139 @@ +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=simplify-conditionals-false --test FileCheck --test-arg --check-prefixes=INTERESTING --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck -check-prefixes=RESULT-FALSE,CHECK %s < %t + +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=simplify-conditionals-true --test FileCheck --test-arg --check-prefixes=INTERESTING --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck -check-prefixes=RESULT-TRUE,CHECK %s < %t + +; Check that simplify-conditionals-true/false do not attempt block +; simplification in cases that happened to already use a constant +; true/false branch. We should not get the side effect of running +; simplifycfg on blocks where we did not change the terminator value, +; and not introduce unreachable code. 
+ + +; CHECK-LABEL: @br_false( +; RESULT-FALSE: br i1 false, label %will_be_unreachable, label %exit +; RESULT-TRUE: br label %will_be_unreachable +define i1 @br_false(i64 %int) { +entry: + %i2p = inttoptr i64 %int to ptr + br i1 false, label %will_be_unreachable, label %exit + +will_be_unreachable: + %load = load ptr, ptr %i2p, align 8 + br label %for.body + +for.body: + br label %for.body + +exit: + ret i1 false +} + +; CHECK-LABEL: @br_false_keep_in_unreachable( +; CHECK: entry +; INTERESTING: [[I2P:%.+]] = inttoptr i64 %int to ptr +; INTERESTING: load ptr, ptr [[I2P]] + +; RESULT-FALSE: br i1 false, label %will_be_unreachable, label %exit +; RESULT-TRUE: br label %will_be_unreachable +define i1 @br_false_keep_in_unreachable(i64 %int) { +entry: + br i1 false, label %will_be_unreachable, label %exit + +will_be_unreachable: + %i2p = inttoptr i64 %int to ptr + %load = load ptr, ptr %i2p, align 8 + br label %for.body + +for.body: + br label %for.body + +exit: + ret i1 false +} + +; CHECK-LABEL: @br_true( + +; RESULT-FALSE: br label %will_be_unreachable +; RESULT-TRUE: br i1 true, label %exit, label %will_be_unreachable +define i1 @br_true(i64 %int) { +entry: + %i2p = inttoptr i64 %int to ptr + br i1 true, label %exit, label %will_be_unreachable + +will_be_unreachable: + %load = load ptr, ptr %i2p, align 8 + br label %for.body + +for.body: + br label %for.body + +exit: + ret i1 false +} + +; CHECK-LABEL: @br_true_keep_in_unreachable( +; CHECK: entry: +; INTERESTING: [[I2P:%.+]] = inttoptr i64 %int to ptr +; INTERESTING: load ptr, ptr [[I2P]] + +; RESULT-FALSE: br label %will_be_unreachable +; RESULT-TRUE: br i1 true, label %exit, label %will_be_unreachable +define i1 @br_true_keep_in_unreachable(i64 %int) { +entry: + %i2p = inttoptr i64 %int to ptr + br i1 true, label %exit, label %will_be_unreachable + +will_be_unreachable: + %load = load ptr, ptr %i2p, align 8 + br label %for.body + +for.body: + br label %for.body + +exit: + ret i1 false +} + +; CHECK-LABEL: @br_poison( +; RESULT-FALSE: br label %will_be_unreachable +; RESULT-TRUE: br label %exit +define i1 @br_poison(i64 %int) { +entry: + %i2p = inttoptr i64 %int to ptr + br i1 poison, label %exit, label %will_be_unreachable + +will_be_unreachable: + %load = load ptr, ptr %i2p, align 8 + br label %for.body + +for.body: + br label %for.body + +exit: + ret i1 false +} + +; CHECK-LABEL: @br_poison_keep_in_unreachable( +; CHECK: entry: +; INTERESTING: [[I2P:%.+]] = inttoptr i64 %int to ptr +; INTERESTING: load ptr, ptr [[I2P]] + +; RESULT-FALSE: br label %will_be_unreachable +; RESULT-TRUE: br label %exit +define i1 @br_poison_keep_in_unreachable(i64 %int) { +entry: + %i2p = inttoptr i64 %int to ptr + br i1 poison, label %exit, label %will_be_unreachable + +will_be_unreachable: + %load = load ptr, ptr %i2p, align 8 + br label %for.body + +for.body: + br label %for.body + +exit: + ret i1 false +} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp index ec37e248da8ed..3d6b35d1895e7 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp @@ -37,17 +37,18 @@ static void reduceConditionals(Oracle &O, ReducerWorkItem &WorkItem, Module &M = WorkItem.getModule(); SmallVector ToSimplify; + LLVMContext &Ctx = M.getContext(); + ConstantInt *ConstValToSet = + Direction ? 
ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
+
   for (auto &F : M) {
     for (auto &BB : F) {
       auto *BR = dyn_cast<BranchInst>(BB.getTerminator());
-      if (!BR || !BR->isConditional() || O.shouldKeep())
+      if (!BR || !BR->isConditional() || BR->getCondition() == ConstValToSet ||
+          O.shouldKeep())
         continue;
 
-      if (Direction)
-        BR->setCondition(ConstantInt::getTrue(BR->getContext()));
-      else
-        BR->setCondition(ConstantInt::getFalse(BR->getContext()));
-
+      BR->setCondition(ConstValToSet);
       ToSimplify.push_back(&BB);
     }
   }

From 23d894e0a3cf5cbc1ac754761da4f82608e14747 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 2 Apr 2025 10:27:12 +0700
Subject: [PATCH 0332/1029] llvm-reduce: Fix comment typo

---
 llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
index 44f1e52204f2f..6022b53fae29c 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceFunctions.cpp
@@ -42,7 +42,7 @@ void llvm::reduceFunctionsDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) {
       });
 
   // Then, drop body of each of them. We want to batch this and do nothing else
-  // here so that minimal number of remaining exteranal uses will remain.
+  // here so that minimal number of remaining external uses will remain.
   for (Constant *F : FuncsToRemove)
     F->dropAllReferences();
 
From 68fb7a5a1d203dde7badf67031bdd9eb650eef5d Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Tue, 1 Apr 2025 21:51:54 -0700
Subject: [PATCH 0333/1029] Revert "[RISCV] Add Xqci Insn Formats (#132986)"

This reverts commit 0cfabd37df9940346f3bf8a4d74c19e1f48a00e9.

Multiple buildbot failures have been reported:

https://github.com/llvm/llvm-project/pull/132986

---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp |  22 +-
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |   8 +-
 llvm/lib/Target/RISCV/RISCVInstrFormats.td    |   8 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.td       |  45 ++--
 llvm/lib/Target/RISCV/RISCVInstrInfoC.td      |   6 +
 llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td   | 215 ------------------
 llvm/test/MC/RISCV/insn_xqci-invalid.s        | 111 ---------
 llvm/test/MC/RISCV/insn_xqci.s                |  41 ----
 llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s |   2 +-
 9 files changed, 29 insertions(+), 429 deletions(-)
 delete mode 100644 llvm/test/MC/RISCV/insn_xqci-invalid.s
 delete mode 100644 llvm/test/MC/RISCV/insn_xqci.s

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 5c940f95a0a41..8f9a5ae75fca7 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -867,16 +867,6 @@ struct RISCVOperand final : public MCParsedAsmOperand {
         [](int64_t Imm) { return Imm != 0 && isShiftedInt<6, 4>(Imm); });
   }
 
-  bool isSImm16() const {
-    if (!isImm())
-      return false;
-    RISCVMCExpr::Specifier VK = RISCVMCExpr::VK_None;
-    int64_t Imm;
-    bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
-    return IsConstantImm && isInt<16>(fixImmediateForRV32(Imm, isRV64Imm())) &&
-           VK == RISCVMCExpr::VK_None;
-  }
-
   bool isSImm16NonZero() const {
     return isSImmPred([](int64_t Imm) { return Imm != 0 && isInt<16>(Imm); });
   }
@@ -1521,9 +1511,6 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, -(1 << 12), (1 << 12) - 2,
         "immediate must be a multiple of 2 bytes in the range");
-  case Match_InvalidSImm16:
-    return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 15),
-                                      (1 << 15) - 1);
   case Match_InvalidSImm16NonZero:
     return generateImmOutOfRangeError(
         Operands, ErrorInfo, -(1 << 15), (1 << 15) - 1,
@@ -3163,13 +3150,10 @@ bool RISCVAsmParser::parseDirectiveAttribute() {
   return false;
 }
 
-bool isValidInsnFormat(StringRef Format, const MCSubtargetInfo &STI) {
+bool isValidInsnFormat(StringRef Format, bool AllowC) {
   return StringSwitch<bool>(Format)
       .Cases("r", "r4", "i", "b", "sb", "u", "j", "uj", "s", true)
-      .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj",
-             STI.hasFeature(RISCV::FeatureStdExtZca))
-      .Cases("qc.eai", "qc.ei", "qc.eb", "qc.ej", "qc.es",
-             !STI.hasFeature(RISCV::Feature64Bit))
+      .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj", AllowC)
       .Default(false);
 }
 
@@ -3259,7 +3243,7 @@ bool RISCVAsmParser::parseDirectiveInsn(SMLoc L) {
     return false;
   }
 
-  if (!isValidInsnFormat(Format, getSTI()))
+  if (!isValidInsnFormat(Format, AllowC))
     return Error(ErrorLoc, "invalid instruction format");
 
   std::string FormatName = (".insn_" + Format).str();
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index adccd1e6c5002..d6672de02862d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -51,12 +51,7 @@ enum {
   InstFormatCLH = 19,
   InstFormatCSB = 20,
   InstFormatCSH = 21,
-  InstFormatQC_EAI = 22,
-  InstFormatQC_EI = 23,
-  InstFormatQC_EB = 24,
-  InstFormatQC_EJ = 25,
-  InstFormatQC_ES = 26,
-  InstFormatOther = 31,
+  InstFormatOther = 22,
 
   InstFormatMask = 31,
   InstFormatShift = 0,
@@ -338,7 +333,6 @@ enum OperandType : unsigned {
   OPERAND_SIMM11,
   OPERAND_SIMM12,
   OPERAND_SIMM12_LSB00000,
-  OPERAND_SIMM16,
   OPERAND_SIMM16_NONZERO,
   OPERAND_SIMM20,
   OPERAND_SIMM26,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 0bb0ba57ff50d..d95e806b79f25 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -52,13 +52,7 @@ def InstFormatCLB : InstFormat<18>;
 def InstFormatCLH : InstFormat<19>;
 def InstFormatCSB : InstFormat<20>;
 def InstFormatCSH : InstFormat<21>;
-def InstFormatQC_EAI : InstFormat<22>;
-def InstFormatQC_EI : InstFormat<23>;
-def InstFormatQC_EB : InstFormat<24>;
-def InstFormatQC_EJ : InstFormat<25>;
-def InstFormatQC_ES : InstFormat<26>;
-def InstFormatOther : InstFormat<31>;
-
+def InstFormatOther : InstFormat<22>;
 
 class RISCVVConstraint<bits<3> val> {
   bits<3> Value = val;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index c87452171f090..89e5ad8067c1b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1144,33 +1144,6 @@ def AnyReg : Operand<XLenVT> {
   let ParserMatchClass = AnyRegOperand;
 }
 
-// isCodeGenOnly = 1 to hide them from the tablegened assembly parser.
-let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, - hasNoSchedulingInfo = 1 in { -def Insn16 : RVInst16<(outs), (ins uimm16:$value), "", "", [], InstFormatOther> { - bits<16> value; - - let Inst{15-0} = value; - let AsmString = ".insn 0x2, $value"; -} -def Insn32 : RVInst<(outs), (ins uimm32:$value), "", "", [], InstFormatOther> { - bits<32> value; - - let Inst{31-0} = value; - let AsmString = ".insn 0x4, $value"; -} -def Insn48 : RVInst48<(outs), (ins uimm48:$value), "", "", [], InstFormatOther> { - bits<48> value; - let Inst{47-0} = value; - let AsmString = ".insn 0x6, $value"; -} -def Insn64 : RVInst64<(outs), (ins uimm64:$value), "", "", [], InstFormatOther> { - bits<64> value; - let Inst{63-0} = value; - let AsmString = ".insn 0x8, $value"; -} -} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo - // isCodeGenOnly = 1 to hide them from the tablegened assembly parser. let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, hasNoSchedulingInfo = 1 in { @@ -1206,7 +1179,23 @@ def InsnS : DirectiveInsnS<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rs2, ${imm12}(${rs1})">; -} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo +def Insn32 : RVInst<(outs), (ins uimm32:$value), "", "", [], InstFormatOther> { + bits<32> value; + + let Inst{31-0} = value; + let AsmString = ".insn 0x4, $value"; +} +def Insn48 : RVInst48<(outs), (ins uimm48:$value), "", "", [], InstFormatOther> { + bits<48> value; + let Inst{47-0} = value; + let AsmString = ".insn 0x6, $value"; +} +def Insn64 : RVInst64<(outs), (ins uimm64:$value), "", "", [], InstFormatOther> { + bits<64> value; + let Inst{63-0} = value; + let AsmString = ".insn 0x8, $value"; +} +} // Use InstAliases to match these so that we can combine the insn and format // into a mnemonic to use as the key for the tablegened asm matcher table. 
The diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 1c94af58880f2..718d95aa1a4bc 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -799,6 +799,12 @@ def InsnCJ : DirectiveInsnCJ<(outs), (ins uimm2_opcode:$opcode, uimm3:$funct3, bare_simm12_lsb0:$imm11), "$opcode, $funct3, $imm11">; +def Insn16 : RVInst16<(outs), (ins uimm16:$value), "", "", [], InstFormatOther> { + bits<16> value; + + let Inst{15-0} = value; + let AsmString = ".insn 0x2, $value"; +} } // Use InstAliases to match these so that we can combine the insn and format diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index a99cebe666808..23feb52a0c2ca 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -94,8 +94,6 @@ def simm5nonzero : RISCVOp, def simm11 : RISCVSImmLeafOp<11>; -def simm16 : RISCVSImmOp<16>; - def simm16nonzero : RISCVOp, ImmLeaf(Imm);}]> { let ParserMatchClass = SImmAsmOperand<16, "NonZero">; @@ -141,219 +139,6 @@ def simm32_lsb0 : Operand { // Instruction Formats //===----------------------------------------------------------------------===// - -class DirectiveInsnQC_EAI - : RVInst48 { - bits<7> opcode; - bits<3> func3; - bits<1> func1; - - bits<5> rd; - bits<32> imm32; - - let Inst{47-16} = imm32; - let Inst{15} = func1; - let Inst{14-12} = func3; - let Inst{11-7} = rd; - let Inst{6-0} = opcode; - - let AsmString = ".insn qc.eai " # argstr; -} - -class DirectiveInsnQC_EI - : RVInst48 { - bits<7> opcode; - bits<3> func3; - bits<2> func2; - - bits<5> rd; - bits<5> rs1; - bits<26> imm26; - - let Inst{47-32} = imm26{25-10}; - let Inst{31-30} = func2; - let Inst{29-20} = imm26{9-0}; - let Inst{19-15} = rs1; - let Inst{14-12} = func3; - let Inst{11-7} = rd; - let Inst{6-0} = opcode; - - let AsmString = ".insn qc.ei " # argstr; -} - -class DirectiveInsnQC_EB - : RVInst48 { - bits<7> opcode; - bits<3> func3; - bits<5> func5; - - bits<5> rs1; - bits<12> imm12; // This one is the PC-relative offset - bits<16> imm16; - - let Inst{47-32} = imm16; - let Inst{31} = imm12{11}; - let Inst{30-25} = imm12{9-4}; - let Inst{24-20} = func5; - let Inst{19-15} = rs1; - let Inst{14-12} = func3; - let Inst{11-8} = imm12{3-0}; - let Inst{7} = imm12{10}; - let Inst{6-0} = opcode; - - let AsmString = ".insn qc.eb " # argstr; -} - -class DirectiveInsnQC_EJ - : RVInst48 { - bits<7> opcode; - bits<3> func3; - bits<2> func2; - bits<5> func5; - - bits<31> imm31; - - let Inst{47-32} = imm31{30-15}; - let Inst{31} = imm31{11}; - let Inst{30-25} = imm31{9-4}; - let Inst{24-20} = func5; - let Inst{19-17} = imm31{14-12}; - let Inst{16-15} = func2; - let Inst{14-12} = func3; - let Inst{11-8} = imm31{3-0}; - let Inst{7} = imm31{10}; - let Inst{6-0} = opcode; - - let AsmString = ".insn qc.ej " # argstr; -} - -class DirectiveInsnQC_ES - : RVInst48 { - bits<7> opcode; - bits<3> func3; - bits<2> func2; - - bits<5> rs1; - bits<5> rs2; - bits<26> imm26; - - let Inst{47-32} = imm26{25-10}; - let Inst{31-30} = func2; - let Inst{29-25} = imm26{9-5}; - let Inst{24-20} = rs2; - let Inst{19-15} = rs1; - let Inst{14-12} = func3; - let Inst{11-7} = imm26{4-0}; - let Inst{6-0} = opcode; - - let AsmString = ".insn qc.es " # argstr; -} - - -let isCodeGenOnly = true, hasSideEffects = true, mayLoad = true, - mayStore = true, hasNoSchedulingInfo = true, Predicates=[IsRV32] in { -def InsnQC_EAI : DirectiveInsnQC_EAI<(outs AnyReg:$rd), - (ins 
uimm7_opcode:$opcode, - uimm3:$func3, - uimm1:$func1, - simm32:$imm32), - "$opcode, $func3, $func1, $rd, $imm32">; -def InsnQC_EI : DirectiveInsnQC_EI<(outs AnyReg:$rd), - (ins uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs1, - simm26:$imm26), - "$opcode, $func3, $func2, $rd, $rs1, $imm26">; -def InsnQC_EI_Mem : DirectiveInsnQC_EI<(outs AnyReg:$rd), - (ins uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs1, - simm26:$imm26), - "$opcode, $func3, $func2, $rd, ${imm26}(${rs1})">; -def InsnQC_EB : DirectiveInsnQC_EB<(outs), - (ins uimm7_opcode:$opcode, - uimm3:$func3, - uimm5:$func5, - AnyReg:$rs1, - simm16:$imm16, - simm13_lsb0:$imm12), - "$opcode, $func3, $func5, $rs1, $imm16, $imm12">; -def InsnQC_EJ : DirectiveInsnQC_EJ<(outs), - (ins uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - uimm5:$func5, - simm32_lsb0:$imm31), - "$opcode, $func3, $func2, $func5, $imm31">; -def InsnQC_ES : DirectiveInsnQC_ES<(outs), - (ins uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs2, - AnyReg:$rs1, - simm26:$imm26), - "$opcode, $func3, $func2, $rs2, ${imm26}(${rs1})">; -} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo, Predicates - -let EmitPriority = 0, Predicates = [IsRV32] in { -def : InstAlias<".insn_qc.eai $opcode, $func3, $func1, $rd, $imm32", - (InsnQC_EAI AnyReg:$rd, - uimm7_opcode:$opcode, - uimm3:$func3, - uimm1:$func1, - simm32:$imm32)>; -def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, $rs1, $imm26", - (InsnQC_EI AnyReg:$rd, - uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs1, - simm26:$imm26)>; -def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, ${imm26}(${rs1})", - (InsnQC_EI_Mem AnyReg:$rd, - uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs1, - simm26:$imm26)>; -def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, (${rs1})", - (InsnQC_EI_Mem AnyReg:$rd, - uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs1, - 0)>; -def : InstAlias<".insn_qc.eb $opcode, $func3, $func5, $rs1, $imm16, $imm12", - (InsnQC_EB uimm7_opcode:$opcode, - uimm3:$func3, - uimm5:$func5, - AnyReg:$rs1, - simm16:$imm16, - simm13_lsb0:$imm12)>; -def : InstAlias<".insn_qc.ej $opcode, $func3, $func2, $func5, $imm31", - (InsnQC_EJ uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - uimm5:$func5, - simm32_lsb0:$imm31)>; -def : InstAlias<".insn_qc.es $opcode, $func3, $func2, $rs2, ${imm26}(${rs1})", - (InsnQC_ES uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs2, - AnyReg:$rs1, - simm26:$imm26)>; -def : InstAlias<".insn_qc.es $opcode, $func3, $func2, $rs2, (${rs1})", - (InsnQC_ES uimm7_opcode:$opcode, - uimm3:$func3, - uimm2:$func2, - AnyReg:$rs2, - AnyReg:$rs1, - 0)>; -} // EmitPriority = 0, Predicates = [IsRV32] - //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/RISCV/insn_xqci-invalid.s b/llvm/test/MC/RISCV/insn_xqci-invalid.s deleted file mode 100644 index 8177adaf8ac50..0000000000000 --- a/llvm/test/MC/RISCV/insn_xqci-invalid.s +++ /dev/null @@ -1,111 +0,0 @@ -# RUN: not llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ -# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s - -.insn qc.eai 128, 0, 0, x0, 0 -# CHECK-ERR: [[@LINE-1]]:14: error: opcode must be a valid opcode name or an immediate in the range [0, 127] - -.insn qc.eai 127, 8, 0, x0, 0 -# CHECK-ERR: 
[[@LINE-1]]:19: error: immediate must be an integer in the range [0, 7] - -.insn qc.eai 127, 7, 2, x0, 0 -# CHECK-ERR: [[@LINE-1]]:22: error: immediate must be an integer in the range [0, 1] - -.insn qc.eai 127, 7, 1, not_a_reg, 0 -# CHECK-ERR: [[@LINE-1]]:25: error: invalid operand for instruction - -.insn qc.eai 127, 7, 1, x31, 0x100000000 -# CHECK-ERR: [[@LINE-1]]:30: error: immediate must be an integer in the range [-2147483648, 4294967295] - -.insn qc.eai 126, 7, 1, x31, 0xFFFFFFFF, extra -# CHECK-ERR: [[@LINE-1]]:42: error: invalid operand for instruction - -.insn qc.ei 128, 0, 0, x31, x0, 0 -# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] - -.insn qc.ei 127, 8, 0, x0, x0, 0 -# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] - -.insn qc.ei 127, 7, 4, x0, x0, 0 -# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] - -.insn qc.ei 127, 7, 3, not_a_reg, x0, 0 -# CHECK-ERR: [[@LINE-1]]:24: error: invalid operand for instruction - -.insn qc.ei 127, 7, 3, x31, not_a_reg, 0 -# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] - -.insn qc.ei 127, 7, 3, x31, x31, 0x2000000 -# CHECK-ERR: [[@LINE-1]]:34: error: immediate must be an integer in the range [-33554432, 33554431] - -.insn qc.ei 127, 7, 3, x31, x31, 0x1000000, extra -# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction - -.insn qc.ei 126, 7, 3, x31, 0x2000000(x0) -# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] - -.insn qc.ei 126, 7, 3, x31, 0x1000000(not_a_reg) -# CHECK-ERR: [[@LINE-1]]:39: error: expected register - -.insn qc.ei 126, 7, 3, x31, 0x1000000(x31), extra -# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction - -.insn qc.eb 128, 0, 0, x0, 0, 0 -# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] - -.insn qc.eb 127, 8, 0, x0, 0, 0 -# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] - -.insn qc.eb 127, 7, 32, x0, 0, 0 -# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] - -.insn qc.eb 127, 7, 31, not_a_reg, 0, 0 -# CHECK-ERR: [[@LINE-1]]:25: error: invalid operand for instruction - -.insn qc.eb 127, 7, 31, x31, 0x8000, 0 -# CHECK-ERR: [[@LINE-1]]:30: error: immediate must be an integer in the range [-32768, 32767] - -.insn qc.eb 127, 7, 31, x31, 0x4000, 0x1000 -# CHECK-ERR: [[@LINE-1]]:38: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] - -.insn qc.eb 127, 7, 31, x31, 0x4000, 0x800, extra -# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction - - -.insn qc.ej 128, 0, 0, 0, 0 -# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] - -.insn qc.ej 127, 8, 0, 0, 0 -# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] - -.insn qc.ej 127, 7, 4, 0, 0 -# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] - -.insn qc.ej 127, 7, 3, 32, 0 -# CHECK-ERR: [[@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] - -.insn qc.ej 127, 7, 3, 31, 0x100000000 -# CHECK-ERR: [[@LINE-1]]:28: error: operand must be a multiple of 2 bytes in the range [-2147483648, 2147483646] - -.insn qc.ej 127, 7, 3, 31, 0x80000000, extra -# CHECK-ERR: [[@LINE-1]]:40: error: invalid operand for instruction - -.insn qc.es 
128, 0, 0, x0, 0(x0) -# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] - -.insn qc.es 127, 8, 0, x0, 0(x0) -# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] - -.insn qc.es 127, 7, 4, x0, 0(x0) -# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] - -.insn qc.es 127, 7, 3, not_a_reg, 0(x0) -# CHECK-ERR: [[@LINE-1]]:24: error: invalid operand for instruction - -.insn qc.es 127, 7, 3, x31, 0x2000000(x0) -# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] - -.insn qc.es 127, 7, 3, x31, 0x1000000(not_a_reg) -# CHECK-ERR: [[@LINE-1]]:39: error: expected register - -.insn qc.es 127, 7, 3, x31, 0x1000000(x31), extra -# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/insn_xqci.s b/llvm/test/MC/RISCV/insn_xqci.s deleted file mode 100644 index 098745ec22294..0000000000000 --- a/llvm/test/MC/RISCV/insn_xqci.s +++ /dev/null @@ -1,41 +0,0 @@ -# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ -# RUN: | FileCheck -check-prefixes=CHECK-ASM %s -# RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \ -# RUN: | llvm-objdump --mattr=+experimental-xqcilia,+experimental-xqcilo,+experimental-xqcibi,+experimental-xqcilb \ -# RUN: -M no-aliases -d -r - \ -# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s - -# CHECK-ASM: .insn qc.eai 31, 2, 0, a0, 16711935 -# CHECK-ASM: encoding: [0x1f,0x25,0xff,0x00,0xff,0x00] -# CHECK-OBJ: qc.e.addai a0, 0xff00ff -.insn qc.eai 0x1f, 2, 0, a0, 0x00FF00FF - -# CHECK-ASM: .insn qc.ei 31, 3, 2, a0, a1, 16711935 -# CHECK-ASM: encoding: [0x1f,0xb5,0xf5,0x8f,0xc0,0x3f] -# CHECK-OBJ: qc.e.addi a0, a1, 0xff00ff -.insn qc.ei 0x1f, 3, 2, a0, a1, 0x00FF00FF - -# CHECK-ASM: .insn qc.ei 31, 5, 0, a1, 16711935(a0) -# CHECK-ASM: encoding: [0x9f,0x55,0xf5,0x0f,0xc0,0x3f] -# CHECK-OBJ: qc.e.lb a1, 0xff00ff(a0) -.insn qc.ei 0x1f, 5, 0, a1, 0x00FF00FF(a0) - -# CHECK-ASM: .insn qc.ei 31, 5, 0, a1, 0(a0) -# CHECK-ASM: encoding: [0x9f,0x55,0x05,0x00,0x00,0x00] -# CHECK-OBJ: qc.e.lb a1, 0x0(a0) -.insn qc.ei 0x1f, 5, 0, a1, (a0) - -# CHECK-ASM: .insn qc.eb 31, 4, 24, a0, 17476, 22 -# CHECK-ASM: encoding: [0x1f,0x4b,0x85,0x01,0x44,0x44] -# CHECK-OBJ: qc.e.beqi a0, 0x4444, 0x2e -.insn qc.eb 0x1f, 4, 24, a0, 0x4444, 22 - -# CHECK-ASM: .insn qc.ej 31, 4, 0, 0, 22 -# CHECK-ASM: encoding: [0x1f,0x4b,0x00,0x00,0x00,0x00] -# CHECK-OBJ: qc.e.j 0x34 -.insn qc.ej 0x1f, 4, 0, 0, 22 - -# CHECK-ASM: .insn qc.es 31, 6, 1, a1, 0(a0) -# CHECK-ASM: encoding: [0x1f,0x60,0xb5,0x40,0x00,0x00] -# CHECK-OBJ: qc.e.sb a1, 0x0(a0) -.insn qc.es 0x1f, 6, 1, a1, (a0) diff --git a/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s b/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s index e45c43a50048a..fe6d0de0a4b00 100644 --- a/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s +++ b/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s @@ -1,7 +1,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+xtheadmemidx < %s 2>&1 | FileCheck %s # RUN: not llvm-mc -triple riscv64 -mattr=+xtheadmemidx < %s 2>&1 | FileCheck %s -th.ldia 0(a0), (a1), 0, 0 # CHECK: :[[@LINE]]:26: error: invalid operand for instruction +th.ldia 0(a0), (a1), 0, 0 # CHECK: :[[@LINE]]:23: error: invalid operand for instruction th.ldib a0, 2(a1), 15, 1 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction th.lwia a0, (a1), 30, 2 # CHECK: :[[@LINE]]:20: error: immediate must be an integer in the range [-16, 15] th.lwib a0, (a1), -16, 43 # CHECK: 
:[[@LINE]]:25: error: immediate must be an integer in the range [0, 3] From e060acbd3b0fe362b81b7edd8741eee854aa3d99 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 21:58:32 -0700 Subject: [PATCH 0334/1029] [Sema] Use llvm::erase_if (NFC) (#134017) --- clang/lib/Sema/TreeTransform.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index e455b225d7f49..916b8e2735cd0 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -11963,10 +11963,10 @@ void OpenACCClauseTransform::VisitAttachClause( llvm::SmallVector VarList = VisitVarList(C.getVarList()); // Ensure each var is a pointer type. - VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) { + llvm::erase_if(VarList, [&](Expr *E) { return Self.getSema().OpenACC().CheckVarIsPointerType( OpenACCClauseKind::Attach, E); - }), VarList.end()); + }); ParsedClause.setVarListDetails(VarList, /*IsReadOnly=*/false, /*IsZero=*/false); @@ -12026,10 +12026,10 @@ void OpenACCClauseTransform::VisitDevicePtrClause( llvm::SmallVector VarList = VisitVarList(C.getVarList()); // Ensure each var is a pointer type. - VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) { + llvm::erase_if(VarList, [&](Expr *E) { return Self.getSema().OpenACC().CheckVarIsPointerType( OpenACCClauseKind::DevicePtr, E); - }), VarList.end()); + }); ParsedClause.setVarListDetails(VarList, /*IsReadOnly=*/false, /*IsZero=*/false); From cc10896fa2cc4096387f5cff09c23ff22deabd97 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 21:58:57 -0700 Subject: [PATCH 0335/1029] [SandboxVectorizer] Use llvm::erase (NFC) (#134018) --- .../llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h index 2af3c6d0ea517..681cf881bce0f 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h @@ -112,9 +112,7 @@ class SchedBundle { ContainerTy Nodes; /// Called by the DGNode destructor to avoid accessing freed memory. - void eraseFromBundle(DGNode *N) { - Nodes.erase(std::remove(Nodes.begin(), Nodes.end(), N), Nodes.end()); - } + void eraseFromBundle(DGNode *N) { llvm::erase(Nodes, N); } friend void DGNode::setSchedBundle(SchedBundle &); // For eraseFromBunde(). friend DGNode::~DGNode(); // For eraseFromBundle(). From 86c382514e68b9b58a6cad266d99ca0a1ba1c856 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 21:59:19 -0700 Subject: [PATCH 0336/1029] [Target] Construct SmallVector with ArrayRef (NFC) (#134019) --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 3 +-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index e2ce5e4fc17e1..5ed14cd21840c 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14992,8 +14992,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, } } - SmallVector Ops(PromOp.getNode()->op_begin(), - PromOp.getNode()->op_end()); + SmallVector Ops(PromOp.getNode()->ops()); // If this node has constant inputs, then they'll need to be promoted here. 
for (unsigned i = 0; i < 2; ++i) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0dc62ef04ec0f..967a6cf82433f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -17787,8 +17787,7 @@ static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, ConcatOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ConcatVT, ConcatOp, InVal, NewIdx); - SmallVector ConcatOps; - ConcatOps.append(InVec->op_begin(), InVec->op_end()); + SmallVector ConcatOps(InVec->ops()); ConcatOps[ConcatOpIdx] = ConcatOp; return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } From d760dbe6ebcf87c82aea7b1adb10d41e65c57830 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 21:59:51 -0700 Subject: [PATCH 0337/1029] [mlir] Remove extraneous calls to make_range (NFC) (#134020) --- mlir/lib/Dialect/Mesh/IR/MeshOps.cpp | 34 ++++++++-------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index 65475b69dbdb1..1a1334f0ea474 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -693,10 +693,7 @@ bool MeshSharding::equalSplitAndPartialAxes(const MeshSharding &rhs) const { if (getPartialAxes().size() != rhs.getPartialAxes().size() || (!getPartialAxes().empty() && getPartialType() != rhs.getPartialType()) || - !llvm::equal( - llvm::make_range(getPartialAxes().begin(), getPartialAxes().end()), - llvm::make_range(rhs.getPartialAxes().begin(), - rhs.getPartialAxes().end()))) { + !llvm::equal(getPartialAxes(), rhs.getPartialAxes())) { return false; } @@ -708,11 +705,9 @@ bool MeshSharding::equalSplitAndPartialAxes(const MeshSharding &rhs) const { return false; } - return llvm::all_of(llvm::make_range(getSplitAxes().begin() + minSize, - getSplitAxes().end()), + return llvm::all_of(llvm::drop_begin(getSplitAxes(), minSize), std::mem_fn(&MeshAxesAttr::empty)) && - llvm::all_of(llvm::make_range(rhs.getSplitAxes().begin() + minSize, - rhs.getSplitAxes().end()), + llvm::all_of(llvm::drop_begin(rhs.getSplitAxes(), minSize), std::mem_fn(&MeshAxesAttr::empty)); } @@ -723,19 +718,14 @@ bool MeshSharding::equalHaloAndShardSizes(const MeshSharding &rhs) const { bool MeshSharding::equalShardSizes(const MeshSharding &rhs) const { if (rhs.getStaticShardedDimsOffsets().size() != getStaticShardedDimsOffsets().size() || - !llvm::equal(llvm::make_range(getStaticShardedDimsOffsets().begin(), - getStaticShardedDimsOffsets().end()), - llvm::make_range(rhs.getStaticShardedDimsOffsets().begin(), - rhs.getStaticShardedDimsOffsets().end()))) { + !llvm::equal(getStaticShardedDimsOffsets(), + rhs.getStaticShardedDimsOffsets())) { return false; } if (rhs.getDynamicShardedDimsOffsets().size() != getDynamicShardedDimsOffsets().size() || - !llvm::equal( - llvm::make_range(getDynamicShardedDimsOffsets().begin(), - getDynamicShardedDimsOffsets().end()), - llvm::make_range(rhs.getDynamicShardedDimsOffsets().begin(), - rhs.getDynamicShardedDimsOffsets().end()))) { + !llvm::equal(getDynamicShardedDimsOffsets(), + rhs.getDynamicShardedDimsOffsets())) { return false; } return true; @@ -743,17 +733,11 @@ bool MeshSharding::equalShardSizes(const MeshSharding &rhs) const { bool MeshSharding::equalHaloSizes(const MeshSharding &rhs) const { if (rhs.getStaticHaloSizes().size() != getStaticHaloSizes().size() || - !llvm::equal(llvm::make_range(getStaticHaloSizes().begin(), - 
getStaticHaloSizes().end()), - llvm::make_range(rhs.getStaticHaloSizes().begin(), - rhs.getStaticHaloSizes().end()))) { + !llvm::equal(getStaticHaloSizes(), rhs.getStaticHaloSizes())) { return false; } if (rhs.getDynamicHaloSizes().size() != getDynamicHaloSizes().size() || - !llvm::equal(llvm::make_range(getDynamicHaloSizes().begin(), - getDynamicHaloSizes().end()), - llvm::make_range(rhs.getDynamicHaloSizes().begin(), - rhs.getDynamicHaloSizes().end()))) { + !llvm::equal(getDynamicHaloSizes(), rhs.getDynamicHaloSizes())) { return false; } return true; From f1025c0e8752057dc19b922040055d81ce3b6f31 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 1 Apr 2025 22:00:01 -0700 Subject: [PATCH 0338/1029] [mlir] Construct SmallVector with ArrayRef (NFC) (#134023) --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 78c242571935c..6868c393f99e6 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -301,7 +301,7 @@ LogicalResult TensorDescType::verify( // For 1D tensor, pad the shape with an outer unit dimension to allow common // validation logic. - SmallVector tensorShape(shape.begin(), shape.end()); + SmallVector tensorShape(shape); if (rank == 1) tensorShape = {1, tensorShape.back()}; From 28b300d546948baf4396c3467507dea8b9e34881 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 1 Apr 2025 22:12:09 -0700 Subject: [PATCH 0339/1029] [lldb] Update ScriptInterpreterLua::LoadScriptingModule Update the ScriptInterpreterLua::LoadScriptingModule signature after the TargetSP argument was added in #133290. --- .../Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp | 2 +- .../Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index 191863ae25d7b..0a0f19f7d09b9 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -228,7 +228,7 @@ void ScriptInterpreterLua::ExecuteInterpreterLoop() { bool ScriptInterpreterLua::LoadScriptingModule( const char *filename, const LoadScriptOptions &options, lldb_private::Status &error, StructuredData::ObjectSP *module_sp, - FileSpec extra_search_dir) { + FileSpec extra_search_dir, lldb::TargetSP loaded_into_target_sp) { if (llvm::Error e = m_lua->LoadModule(filename)) { error = Status::FromErrorStringWithFormatv( diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h index ca14e189acd84..7bfbac0ef9d00 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h @@ -47,7 +47,8 @@ class ScriptInterpreterLua : public ScriptInterpreter { const LoadScriptOptions &options, lldb_private::Status &error, StructuredData::ObjectSP *module_sp = nullptr, - FileSpec extra_search_dir = {}) override; + FileSpec extra_search_dir = {}, + lldb::TargetSP loaded_into_target_sp = {}) override; StructuredData::DictionarySP GetInterpreterInfo() override; From e020fc18959a71c75257dd89ce3d6c86eb388043 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 1 Apr 2025 22:14:17 -0700 Subject: [PATCH 
0340/1029] [TableGen] Directly use SDNode functions to implement HasOneUse and HasNoUse. NFC (#133976) The SDValue functions we were calling wrap SDNode functions we can call directly. --- llvm/test/TableGen/HasNoUse.td | 2 +- llvm/test/TableGen/predicate-patfags.td | 2 +- llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/TableGen/HasNoUse.td b/llvm/test/TableGen/HasNoUse.td index 6e6bcc2a81df7..0947be11caa4c 100644 --- a/llvm/test/TableGen/HasNoUse.td +++ b/llvm/test/TableGen/HasNoUse.td @@ -13,7 +13,7 @@ def NO_RET_ATOMIC_ADD : I<(outs), (ins GPR32Op:$src0, GPR32Op:$src1), []>; // SDAG-NEXT: SDNode *N = Node; // SDAG-NEXT: (void)N; // SDAG-NEXT: if (cast(N)->getMemoryVT() != MVT::i32) return false; -// SDAG-NEXT: if (!SDValue(N, 0).use_empty()) return false; +// SDAG-NEXT: if (N->hasAnyUseOfValue(0)) return false; // SDAG-NEXT: return true; // GISEL: GIM_CheckOpcode, /*MI*/0, GIMT_Encode2(TargetOpcode::G_ATOMICRMW_ADD), diff --git a/llvm/test/TableGen/predicate-patfags.td b/llvm/test/TableGen/predicate-patfags.td index 39133f324f305..b69c4acfd6431 100644 --- a/llvm/test/TableGen/predicate-patfags.td +++ b/llvm/test/TableGen/predicate-patfags.td @@ -52,7 +52,7 @@ def TGTmul24_oneuse : PatFrag< // SDAG: OPC_CheckPredicate0, // Predicate_TGTmul24_oneuse // SCUSTOM: return N->hasOneUse(); -// SBUILTIN: if (!SDValue(N, 0).hasOneUse()) return false; +// SBUILTIN: if (!N->hasNUsesOfValue(1, 0)) return false; // GISEL: GIM_CheckOpcode, /*MI*/1, GIMT_Encode2(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS), // GISEL: GIM_CheckIntrinsicID, /*MI*/1, /*Op*/1, GIMT_Encode2(Intrinsic::tgt_mul24), diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index 5ee8c50d6e51b..7f58c4a88c76d 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -1127,9 +1127,9 @@ std::string TreePredicateFn::getPredCode() const { } if (hasNoUse()) - Code += "if (!SDValue(N, 0).use_empty()) return false;\n"; + Code += "if (N->hasAnyUseOfValue(0)) return false;\n"; if (hasOneUse()) - Code += "if (!SDValue(N, 0).hasOneUse()) return false;\n"; + Code += "if (!N->hasNUsesOfValue(1, 0)) return false;\n"; std::string PredicateCode = std::string(PatFragRec->getRecord()->getValueAsString("PredicateCode")); From 9d06e0879b5600b19cd8cebd98e4f92b5e62400f Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 2 Apr 2025 08:06:29 +0200 Subject: [PATCH 0341/1029] [Clang] [NFC] Introduce a helper for emitting compatibility diagnostics (#132348) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a follow-up to #132129. Currently, only `Parser` and `SemaBase` get a `DiagCompat()` helper; I’m planning to keep refactoring compatibility warnings and add more helpers to other classes as needed. I also refactored a single parser compat warning just to make sure everything works properly when diagnostics across multiple components (i.e. Sema and Parser in this case) are involved. 
--- clang/include/clang/Basic/CMakeLists.txt | 10 +++ clang/include/clang/Basic/Diagnostic.td | 18 +++++ clang/include/clang/Basic/DiagnosticAST.h | 12 +++ .../include/clang/Basic/DiagnosticAnalysis.h | 12 +++ clang/include/clang/Basic/DiagnosticComment.h | 12 +++ clang/include/clang/Basic/DiagnosticCrossTU.h | 12 +++ clang/include/clang/Basic/DiagnosticDriver.h | 12 +++ .../include/clang/Basic/DiagnosticFrontend.h | 12 +++ clang/include/clang/Basic/DiagnosticIDs.h | 18 +++++ .../clang/Basic/DiagnosticInstallAPI.h | 12 +++ clang/include/clang/Basic/DiagnosticLex.h | 12 +++ clang/include/clang/Basic/DiagnosticParse.h | 12 +++ .../clang/Basic/DiagnosticParseKinds.td | 12 +-- .../clang/Basic/DiagnosticRefactoring.h | 12 +++ clang/include/clang/Basic/DiagnosticSema.h | 13 ++++ .../clang/Basic/DiagnosticSemaKinds.td | 2 +- .../clang/Basic/DiagnosticSerialization.h | 12 +++ clang/include/clang/Parse/Parser.h | 6 ++ clang/include/clang/Sema/SemaBase.h | 4 + clang/lib/Basic/DiagnosticIDs.cpp | 46 ++++++++++++ clang/lib/Parse/ParseDecl.cpp | 7 +- clang/lib/Parse/Parser.cpp | 10 +++ clang/lib/Sema/SemaBase.cpp | 7 ++ clang/lib/Sema/SemaDecl.cpp | 11 +-- clang/lib/Sema/SemaDeclCXX.cpp | 75 ++++++------------- clang/lib/Sema/SemaExpr.cpp | 4 +- clang/lib/Sema/SemaTemplate.cpp | 19 +---- clang/test/Misc/show-diag-options.c | 2 +- .../TableGen/ClangDiagnosticsEmitter.cpp | 44 +++++++++++ clang/utils/TableGen/TableGen.cpp | 6 ++ clang/utils/TableGen/TableGenBackends.h | 3 + 31 files changed, 356 insertions(+), 93 deletions(-) diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index a671d5c764c22..6be6d063c20b4 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -8,6 +8,11 @@ macro(clang_diag_gen component) -gen-clang-diags-enums -clang-component=${component} SOURCE Diagnostic.td TARGET ClangDiagnostic${component}Enums) + + clang_tablegen(Diagnostic${component}CompatIDs.inc + -gen-clang-diags-compat-ids -clang-component=${component} + SOURCE Diagnostic.td + TARGET ClangDiagnostic${component}CompatIDs) endmacro(clang_diag_gen) clang_diag_gen(Analysis) @@ -31,6 +36,11 @@ clang_tablegen(DiagnosticIndexName.inc -gen-clang-diags-index-name SOURCE Diagnostic.td TARGET ClangDiagnosticIndexName) +clang_tablegen(DiagnosticAllCompatIDs.inc + -gen-clang-diags-compat-ids + SOURCE Diagnostic.td + TARGET ClangDiagnosticAllCompatIDs) + clang_tablegen(AttrList.inc -gen-clang-attr-list -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE Attr.td diff --git a/clang/include/clang/Basic/Diagnostic.td b/clang/include/clang/Basic/Diagnostic.td index b31d846210a8e..65b19f3feea4f 100644 --- a/clang/include/clang/Basic/Diagnostic.td +++ b/clang/include/clang/Basic/Diagnostic.td @@ -155,6 +155,19 @@ class DefaultWarnNoWerror { } class DefaultRemark { Severity DefaultSeverity = SEV_Remark; } +class CompatWarningId { + string Component = ?; + string Name = name; + string Diag = diag; + string DiagPre = diag_pre; + int Std = std; + + // This is unused, but Tablegen will complain if it's missing because we define + // the compatibility ids in the same place as the other diagnostics (which means + // that we'll be inside a 'let CategoryName = "" in { ... }' block). + string CategoryName = ?; +} + // C++ compatibility warnings. 
multiclass CXXCompat< string message, @@ -178,6 +191,11 @@ multiclass CXXCompat< "CXX98Compat", "CXXPre"#std_ver#"Compat"))>, DefaultIgnore; + + def : CompatWarningId< + NAME, std_ver, + "compat_cxx"#std_ver#"_"#NAME, + "compat_pre_cxx"#std_ver#"_"#NAME>; } // These generate pairs of C++ compatibility warnings of the form: diff --git a/clang/include/clang/Basic/DiagnosticAST.h b/clang/include/clang/Basic/DiagnosticAST.h index 4f82114b7406b..41e2598f7cc3b 100644 --- a/clang/include/clang/Basic/DiagnosticAST.h +++ b/clang/include/clang/Basic/DiagnosticAST.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticASTCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICAST_H diff --git a/clang/include/clang/Basic/DiagnosticAnalysis.h b/clang/include/clang/Basic/DiagnosticAnalysis.h index 1a49461bcd173..5ead092b946c5 100644 --- a/clang/include/clang/Basic/DiagnosticAnalysis.h +++ b/clang/include/clang/Basic/DiagnosticAnalysis.h @@ -35,6 +35,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticAnalysisCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICANALYSIS_H diff --git a/clang/include/clang/Basic/DiagnosticComment.h b/clang/include/clang/Basic/DiagnosticComment.h index 53143ef132e4b..08e66e8051834 100644 --- a/clang/include/clang/Basic/DiagnosticComment.h +++ b/clang/include/clang/Basic/DiagnosticComment.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticCommentCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICCOMMENT_H diff --git a/clang/include/clang/Basic/DiagnosticCrossTU.h b/clang/include/clang/Basic/DiagnosticCrossTU.h index 428da95011027..761716d781446 100644 --- a/clang/include/clang/Basic/DiagnosticCrossTU.h +++ b/clang/include/clang/Basic/DiagnosticCrossTU.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) 
NAME = IDX, +#include "clang/Basic/DiagnosticCrossTUCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICCROSSTU_H diff --git a/clang/include/clang/Basic/DiagnosticDriver.h b/clang/include/clang/Basic/DiagnosticDriver.h index c472afa3f6e96..864a23a49e4cd 100644 --- a/clang/include/clang/Basic/DiagnosticDriver.h +++ b/clang/include/clang/Basic/DiagnosticDriver.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticDriverCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICDRIVER_H diff --git a/clang/include/clang/Basic/DiagnosticFrontend.h b/clang/include/clang/Basic/DiagnosticFrontend.h index 766cac3d655b3..3506f05daae54 100644 --- a/clang/include/clang/Basic/DiagnosticFrontend.h +++ b/clang/include/clang/Basic/DiagnosticFrontend.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticFrontendCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICFRONTEND_H diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index 017ef7065610f..f2bd19f9b6e8a 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -25,6 +25,7 @@ namespace clang { class DiagnosticsEngine; class DiagnosticBuilder; + class LangOptions; class SourceLocation; // Import the diagnostic enums themselves. @@ -104,6 +105,18 @@ namespace clang { }; } + namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticCommonCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END + } // end namespace diag_compat + class DiagnosticMapping { LLVM_PREFERRED_TYPE(diag::Severity) unsigned Severity : 3; @@ -464,6 +477,11 @@ class DiagnosticIDs : public RefCountedBase { /// given group name. static StringRef getNearestOption(diag::Flavor Flavor, StringRef Group); + /// Get the appropriate diagnostic Id to use for issuing a compatibility + /// diagnostic. For use by the various DiagCompat() helpers. + static unsigned getCXXCompatDiagId(const LangOptions &LangOpts, + unsigned CompatDiagId); + private: /// Classify the specified diagnostic ID into a Level, consumable by /// the DiagnosticClient. 
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPI.h b/clang/include/clang/Basic/DiagnosticInstallAPI.h index cbdb00362624b..4619bfeea05a2 100644 --- a/clang/include/clang/Basic/DiagnosticInstallAPI.h +++ b/clang/include/clang/Basic/DiagnosticInstallAPI.h @@ -35,5 +35,17 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticInstallAPICompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H diff --git a/clang/include/clang/Basic/DiagnosticLex.h b/clang/include/clang/Basic/DiagnosticLex.h index d14bf97e8642e..6fa90f785bbf8 100644 --- a/clang/include/clang/Basic/DiagnosticLex.h +++ b/clang/include/clang/Basic/DiagnosticLex.h @@ -35,6 +35,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticLexCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICLEX_H diff --git a/clang/include/clang/Basic/DiagnosticParse.h b/clang/include/clang/Basic/DiagnosticParse.h index 275e1a4c39b3f..e2a4368a59c4b 100644 --- a/clang/include/clang/Basic/DiagnosticParse.h +++ b/clang/include/clang/Basic/DiagnosticParse.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticParseCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICPARSE_H diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 2582e1e5ef0f6..954f538e15026 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -11,6 +11,12 @@ //===----------------------------------------------------------------------===// let Component = "Parse" in { +let CategoryName = "Parse Issue" in { +// C++11 compatibility with C++98. 
+defm enum_fixed_underlying_type : CXX11Compat< + "enumeration types with a fixed underlying type are", + /*ext_warn=*/false>; +} def err_asm_qualifier_ignored : Error< "expected 'volatile', 'inline', 'goto', or '('">, CatInlineAsm; @@ -107,9 +113,6 @@ def err_enumerator_list_missing_comma : Error< "missing ',' between enumerators">; def err_enumerator_unnamed_no_def : Error< "unnamed enumeration must be a definition">; -def ext_cxx11_enum_fixed_underlying_type : Extension< - "enumeration types with a fixed underlying type are a C++11 extension">, - InGroup; def ext_ms_c_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a Microsoft extension">, InGroup; @@ -119,9 +122,6 @@ def ext_c23_enum_fixed_underlying_type : Extension< def warn_c17_compat_enum_fixed_underlying_type : Warning< "enumeration types with a fixed underlying type are incompatible with C standards before C23">, DefaultIgnore, InGroup; -def warn_cxx98_compat_enum_fixed_underlying_type : Warning< - "enumeration types with a fixed underlying type are incompatible with C++98">, - InGroup, DefaultIgnore; def ext_enum_base_in_type_specifier : ExtWarn< "non-defining declaration of enumeration with a fixed underlying type is " "only permitted as a standalone declaration" diff --git a/clang/include/clang/Basic/DiagnosticRefactoring.h b/clang/include/clang/Basic/DiagnosticRefactoring.h index 59d4bc912733a..b3f3a10925f09 100644 --- a/clang/include/clang/Basic/DiagnosticRefactoring.h +++ b/clang/include/clang/Basic/DiagnosticRefactoring.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticRefactoringCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICREFACTORING_H diff --git a/clang/include/clang/Basic/DiagnosticSema.h b/clang/include/clang/Basic/DiagnosticSema.h index 84986c7bccf71..943b2f64f427e 100644 --- a/clang/include/clang/Basic/DiagnosticSema.h +++ b/clang/include/clang/Basic/DiagnosticSema.h @@ -35,7 +35,20 @@ enum { #undef DIAG_ENUM_END #undef DIAG_ENUM_ITEM #undef DIAG_ENUM + } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticSemaCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICSEMA_H diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5e45482584946..265bed2df43cf 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -61,7 +61,7 @@ defm decomp_decl_cond : CXX26Compat<"structured binding declaration in a conditi // Compatibility warnings duplicated across multiple language versions. 
foreach std = [14, 20, 23] in { - defm constexpr_body_invalid_stmt : CXXCompat< + defm cxx#std#_constexpr_body_invalid_stmt : CXXCompat< "use of this statement in a constexpr %select{function|constructor}0 is", std>; } diff --git a/clang/include/clang/Basic/DiagnosticSerialization.h b/clang/include/clang/Basic/DiagnosticSerialization.h index 6fb836dca1b04..c8fb034e9bd4a 100644 --- a/clang/include/clang/Basic/DiagnosticSerialization.h +++ b/clang/include/clang/Basic/DiagnosticSerialization.h @@ -36,6 +36,18 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag + +namespace diag_compat { +#define DIAG_COMPAT_IDS_BEGIN() enum { +#define DIAG_COMPAT_IDS_END() \ + } \ + ; +#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, +#include "clang/Basic/DiagnosticSerializationCompatIDs.inc" +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END +} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICSERIALIZATION_H diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index c8ceef8f8987d..5770692c42f13 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1263,6 +1263,12 @@ class Parser : public CodeCompletionHandler { return Diag(Tok, DiagID); } + DiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId); + DiagnosticBuilder DiagCompat(const Token &Tok, unsigned CompatDiagId); + DiagnosticBuilder DiagCompat(unsigned CompatDiagId) { + return DiagCompat(Tok, CompatDiagId); + } + private: void SuggestParentheses(SourceLocation Loc, unsigned DK, SourceRange ParenRange); diff --git a/clang/include/clang/Sema/SemaBase.h b/clang/include/clang/Sema/SemaBase.h index 463cae83c7e81..550f530af72f5 100644 --- a/clang/include/clang/Sema/SemaBase.h +++ b/clang/include/clang/Sema/SemaBase.h @@ -219,6 +219,10 @@ class SemaBase { SemaDiagnosticBuilder Diag(SourceLocation Loc, const PartialDiagnostic &PD, bool DeferHint = false); + /// Emit a compatibility diagnostic. + SemaDiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId, + bool DeferHint = false); + /// Build a partial diagnostic. PartialDiagnostic PDiag(unsigned DiagID = 0); }; diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index ca5b8d2da769e..d5928431f41a2 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -13,6 +13,7 @@ #include "clang/Basic/DiagnosticIDs.h" #include "clang/Basic/AllDiagnostics.h" #include "clang/Basic/DiagnosticCategories.h" +#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceManager.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -769,6 +770,51 @@ StringRef DiagnosticIDs::getNearestOption(diag::Flavor Flavor, return Best; } +unsigned DiagnosticIDs::getCXXCompatDiagId(const LangOptions &LangOpts, + unsigned CompatDiagId) { + struct CompatDiag { + unsigned StdVer; + unsigned DiagId; + unsigned PreDiagId; + }; + + // We encode the standard version such that C++98 < C++11 < C++14 etc. The + // actual numbers don't really matter for this, but the definitions of the + // compat diags in the Tablegen file use the standard version number (i.e. + // 98, 11, 14, etc.), so we base the encoding here on that. +#define DIAG_COMPAT_IDS_BEGIN() +#define DIAG_COMPAT_IDS_END() +#define DIAG_COMPAT_ID(Value, Name, Std, Diag, DiagPre) \ + {Std == 98 ? 
1998 : 2000 + Std, diag::Diag, diag::DiagPre}, + static constexpr CompatDiag Diags[]{ +#include "clang/Basic/DiagnosticAllCompatIDs.inc" + }; +#undef DIAG_COMPAT_ID +#undef DIAG_COMPAT_IDS_BEGIN +#undef DIAG_COMPAT_IDS_END + + assert(CompatDiagId < std::size(Diags) && "Invalid compat diag id"); + + unsigned StdVer = [&] { + if (LangOpts.CPlusPlus26) + return 2026; + if (LangOpts.CPlusPlus23) + return 2023; + if (LangOpts.CPlusPlus20) + return 2020; + if (LangOpts.CPlusPlus17) + return 2017; + if (LangOpts.CPlusPlus14) + return 2014; + if (LangOpts.CPlusPlus11) + return 2011; + return 1998; + }(); + + const CompatDiag &D = Diags[CompatDiagId]; + return StdVer >= D.StdVer ? D.DiagId : D.PreDiagId; +} + /// ProcessDiag - This is the method used to report a diagnostic that is /// finally fully formed. bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 3f156407edc99..d77400e0f8272 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5452,11 +5452,8 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, BaseRange = SourceRange(ColonLoc, DeclaratorInfo.getSourceRange().getEnd()); if (!getLangOpts().ObjC) { - if (getLangOpts().CPlusPlus11) - Diag(ColonLoc, diag::warn_cxx98_compat_enum_fixed_underlying_type) - << BaseRange; - else if (getLangOpts().CPlusPlus) - Diag(ColonLoc, diag::ext_cxx11_enum_fixed_underlying_type) + if (getLangOpts().CPlusPlus) + DiagCompat(ColonLoc, diag_compat::enum_fixed_underlying_type) << BaseRange; else if (getLangOpts().MicrosoftExt && !getLangOpts().C23) Diag(ColonLoc, diag::ext_ms_c_enum_fixed_underlying_type) diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 2eca89179453b..1227514121973 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -89,6 +89,16 @@ DiagnosticBuilder Parser::Diag(const Token &Tok, unsigned DiagID) { return Diag(Tok.getLocation(), DiagID); } +DiagnosticBuilder Parser::DiagCompat(SourceLocation Loc, + unsigned CompatDiagId) { + return Diag(Loc, + DiagnosticIDs::getCXXCompatDiagId(getLangOpts(), CompatDiagId)); +} + +DiagnosticBuilder Parser::DiagCompat(const Token &Tok, unsigned CompatDiagId) { + return DiagCompat(Tok.getLocation(), CompatDiagId); +} + /// Emits a diagnostic suggesting parentheses surrounding a /// given range. /// diff --git a/clang/lib/Sema/SemaBase.cpp b/clang/lib/Sema/SemaBase.cpp index 85c4a0ab40fed..9b677f446f3e6 100644 --- a/clang/lib/Sema/SemaBase.cpp +++ b/clang/lib/Sema/SemaBase.cpp @@ -88,4 +88,11 @@ Sema::SemaDiagnosticBuilder SemaBase::Diag(SourceLocation Loc, return Diag(Loc, PD.getDiagID(), DeferHint) << PD; } +SemaBase::SemaDiagnosticBuilder SemaBase::DiagCompat(SourceLocation Loc, + unsigned CompatDiagId, + bool DeferHint) { + return Diag(Loc, + DiagnosticIDs::getCXXCompatDiagId(getLangOpts(), CompatDiagId), + DeferHint); +} } // namespace clang diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index bbefbbf294dd1..9b7b3f856cc55 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7649,10 +7649,7 @@ NamedDecl *Sema::ActOnVariableDeclarator( IsVariableTemplate = true; // Only C++1y supports variable templates (N3651). - Diag(D.getIdentifierLoc(), - getLangOpts().CPlusPlus14 - ? 
diag::compat_cxx14_variable_template - : diag::compat_pre_cxx14_variable_template); + DiagCompat(D.getIdentifierLoc(), diag_compat::variable_template); } } } else { @@ -7718,10 +7715,8 @@ NamedDecl *Sema::ActOnVariableDeclarator( } else if (RD->isUnion()) { // C++98 [class.union]p1: If a union contains a static data member, // the program is ill-formed. C++11 drops this restriction. - Diag(D.getIdentifierLoc(), - getLangOpts().CPlusPlus11 - ? diag::compat_cxx11_static_data_member_in_union - : diag::compat_pre_cxx11_static_data_member_in_union) + DiagCompat(D.getIdentifierLoc(), + diag_compat::static_data_member_in_union) << Name; } } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 43bf9b7cd0f95..b6ba7231d9a26 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -807,10 +807,8 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D, for (auto Loc : BadSpecifierLocs) Err << SourceRange(Loc, Loc); } else if (!CPlusPlus20Specifiers.empty()) { - auto &&Warn = Diag(CPlusPlus20SpecifierLocs.front(), - getLangOpts().CPlusPlus20 - ? diag::compat_cxx20_decomp_decl_spec - : diag::compat_pre_cxx20_decomp_decl_spec); + auto &&Warn = DiagCompat(CPlusPlus20SpecifierLocs.front(), + diag_compat::decomp_decl_spec); Warn << (int)CPlusPlus20Specifiers.size() << llvm::join(CPlusPlus20Specifiers.begin(), CPlusPlus20Specifiers.end(), " "); @@ -2040,10 +2038,8 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, // C++1y allows types to be defined, not just declared. if (cast(DclIt)->isThisDeclarationADefinition()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.Diag(DS->getBeginLoc(), - SemaRef.getLangOpts().CPlusPlus14 - ? diag::compat_cxx14_constexpr_type_definition - : diag::compat_pre_cxx14_constexpr_type_definition) + SemaRef.DiagCompat(DS->getBeginLoc(), + diag_compat::constexpr_type_definition) << isa(Dcl); } else if (!SemaRef.getLangOpts().CPlusPlus14) { return false; @@ -2068,10 +2064,8 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, if (VD->isThisDeclarationADefinition()) { if (VD->isStaticLocal()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.Diag(VD->getLocation(), - SemaRef.getLangOpts().CPlusPlus23 - ? diag::compat_cxx23_constexpr_static_var - : diag::compat_pre_cxx23_constexpr_static_var) + SemaRef.DiagCompat(VD->getLocation(), + diag_compat::constexpr_static_var) << isa(Dcl) << (VD->getTLSKind() == VarDecl::TLS_Dynamic); } else if (!SemaRef.getLangOpts().CPlusPlus23) { @@ -2091,11 +2085,8 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, if (!VD->getType()->isDependentType() && !VD->hasInit() && !VD->isCXXForRangeDecl()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.Diag( - VD->getLocation(), - SemaRef.getLangOpts().CPlusPlus20 - ? diag::compat_cxx20_constexpr_local_var_no_init - : diag::compat_pre_cxx20_constexpr_local_var_no_init) + SemaRef.DiagCompat(VD->getLocation(), + diag_compat::constexpr_local_var_no_init) << isa(Dcl); } else if (!SemaRef.getLangOpts().CPlusPlus20) { return false; @@ -2104,10 +2095,7 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, } } if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.Diag(VD->getLocation(), - SemaRef.getLangOpts().CPlusPlus14 - ? 
diag::compat_cxx14_constexpr_local_var - : diag::compat_pre_cxx14_constexpr_local_var) + SemaRef.DiagCompat(VD->getLocation(), diag_compat::constexpr_local_var) << isa(Dcl); } else if (!SemaRef.getLangOpts().CPlusPlus14) { return false; @@ -2177,10 +2165,8 @@ static bool CheckConstexprCtorInitializer(Sema &SemaRef, if (!Inits.count(Field)) { if (Kind == Sema::CheckConstexprKind::Diagnose) { if (!Diagnosed) { - SemaRef.Diag(Dcl->getLocation(), - SemaRef.getLangOpts().CPlusPlus20 - ? diag::compat_cxx20_constexpr_ctor_missing_init - : diag::compat_pre_cxx20_constexpr_ctor_missing_init); + SemaRef.DiagCompat(Dcl->getLocation(), + diag_compat::constexpr_ctor_missing_init); Diagnosed = true; } SemaRef.Diag(Field->getLocation(), @@ -2391,10 +2377,8 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, break; case Sema::CheckConstexprKind::Diagnose: - SemaRef.Diag(Body->getBeginLoc(), - SemaRef.getLangOpts().CPlusPlus20 - ? diag::compat_cxx20_constexpr_function_try_block - : diag::compat_pre_cxx20_constexpr_function_try_block) + SemaRef.DiagCompat(Body->getBeginLoc(), + diag_compat::constexpr_function_try_block) << isa(Dcl); break; } @@ -2421,22 +2405,13 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, (Cxx1yLoc.isValid() && !SemaRef.getLangOpts().CPlusPlus17)) return false; } else if (Cxx2bLoc.isValid()) { - SemaRef.Diag(Cxx2bLoc, - SemaRef.getLangOpts().CPlusPlus23 - ? diag::compat_cxx23_constexpr_body_invalid_stmt - : diag::compat_pre_cxx23_constexpr_body_invalid_stmt) + SemaRef.DiagCompat(Cxx2bLoc, diag_compat::cxx23_constexpr_body_invalid_stmt) << isa(Dcl); } else if (Cxx2aLoc.isValid()) { - SemaRef.Diag(Cxx2aLoc, - SemaRef.getLangOpts().CPlusPlus20 - ? diag::compat_cxx20_constexpr_body_invalid_stmt - : diag::compat_pre_cxx20_constexpr_body_invalid_stmt) + SemaRef.DiagCompat(Cxx2aLoc, diag_compat::cxx20_constexpr_body_invalid_stmt) << isa(Dcl); } else if (Cxx1yLoc.isValid()) { - SemaRef.Diag(Cxx1yLoc, - SemaRef.getLangOpts().CPlusPlus14 - ? diag::compat_cxx14_constexpr_body_invalid_stmt - : diag::compat_pre_cxx14_constexpr_body_invalid_stmt) + SemaRef.DiagCompat(Cxx1yLoc, diag_compat::cxx14_constexpr_body_invalid_stmt) << isa(Dcl); } @@ -2453,11 +2428,8 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, if (Constructor->getNumCtorInitializers() == 0 && RD->hasVariantMembers()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.Diag( - Dcl->getLocation(), - SemaRef.getLangOpts().CPlusPlus20 - ? diag::compat_cxx20_constexpr_union_ctor_no_init - : diag::compat_pre_cxx20_constexpr_union_ctor_no_init); + SemaRef.DiagCompat(Dcl->getLocation(), + diag_compat::constexpr_union_ctor_no_init); } else if (!SemaRef.getLangOpts().CPlusPlus20) { return false; } @@ -2520,11 +2492,8 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, } else if (ReturnStmts.size() > 1) { switch (Kind) { case Sema::CheckConstexprKind::Diagnose: - SemaRef.Diag( - ReturnStmts.back(), - SemaRef.getLangOpts().CPlusPlus14 - ? 
diag::compat_cxx14_constexpr_body_multiple_return - : diag::compat_pre_cxx14_constexpr_body_multiple_return); + SemaRef.DiagCompat(ReturnStmts.back(), + diag_compat::constexpr_body_multiple_return); for (unsigned I = 0; I < ReturnStmts.size() - 1; ++I) SemaRef.Diag(ReturnStmts[I], diag::note_constexpr_body_previous_return); @@ -17825,9 +17794,7 @@ Decl *Sema::ActOnFriendTypeDecl(Scope *S, const DeclSpec &DS, << FixItHint::CreateInsertion(getLocForEndOfToken(FriendLoc), InsertionText); } else { - Diag(FriendLoc, getLangOpts().CPlusPlus11 - ? diag::compat_cxx11_nonclass_type_friend - : diag::compat_pre_cxx11_nonclass_type_friend) + DiagCompat(FriendLoc, diag_compat::nonclass_type_friend) << T << DS.getSourceRange(); } } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 7cc8374e69d73..e7f418ae6802e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6508,9 +6508,7 @@ ExprResult Sema::ActOnCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (const auto *ULE = dyn_cast(Fn); ULE && ULE->hasExplicitTemplateArgs() && ULE->decls_begin() == ULE->decls_end()) { - Diag(Fn->getExprLoc(), getLangOpts().CPlusPlus20 - ? diag::compat_cxx20_adl_only_template_id - : diag::compat_pre_cxx20_adl_only_template_id) + DiagCompat(Fn->getExprLoc(), diag_compat::adl_only_template_id) << ULE->getName(); } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index eace9b87a5bfe..1f87ef4b27bab 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2234,10 +2234,7 @@ static bool DiagnoseDefaultTemplateArgument(Sema &S, // template-argument, that declaration shall be a definition and shall be // the only declaration of the function template in the translation unit. // (C++98/03 doesn't have this wording; see DR226). - S.Diag(ParamLoc, - S.getLangOpts().CPlusPlus11 - ? diag::compat_cxx11_templ_default_in_function_templ - : diag::compat_pre_cxx11_templ_default_in_function_templ) + S.DiagCompat(ParamLoc, diag_compat::templ_default_in_function_templ) << DefArgRange; return false; @@ -6432,10 +6429,7 @@ static bool CheckTemplateArgumentAddressOfObjectOrFunction( bool ExtraParens = false; while (ParenExpr *Parens = dyn_cast(Arg)) { if (!Invalid && !ExtraParens) { - S.Diag(Arg->getBeginLoc(), - S.getLangOpts().CPlusPlus11 - ? diag::compat_cxx11_template_arg_extra_parens - : diag::compat_pre_cxx11_template_arg_extra_parens) + S.DiagCompat(Arg->getBeginLoc(), diag_compat::template_arg_extra_parens) << Arg->getSourceRange(); ExtraParens = true; } @@ -6655,10 +6649,7 @@ CheckTemplateArgumentPointerToMember(Sema &S, NonTypeTemplateParmDecl *Param, bool ExtraParens = false; while (ParenExpr *Parens = dyn_cast(Arg)) { if (!Invalid && !ExtraParens) { - S.Diag(Arg->getBeginLoc(), - S.getLangOpts().CPlusPlus11 - ? diag::compat_cxx11_template_arg_extra_parens - : diag::compat_pre_cxx11_template_arg_extra_parens) + S.DiagCompat(Arg->getBeginLoc(), diag_compat::template_arg_extra_parens) << Arg->getSourceRange(); ExtraParens = true; } @@ -10639,9 +10630,7 @@ TypeResult Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, return true; if (TypenameLoc.isValid() && S && !S->getTemplateParamParent()) - Diag(TypenameLoc, getLangOpts().CPlusPlus11 - ? 
diag::compat_cxx11_typename_outside_of_template - : diag::compat_pre_cxx11_typename_outside_of_template) + DiagCompat(TypenameLoc, diag_compat::typename_outside_of_template) << FixItHint::CreateRemoval(TypenameLoc); NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(Context); diff --git a/clang/test/Misc/show-diag-options.c b/clang/test/Misc/show-diag-options.c index 4e98d63195f10..b09820379da36 100644 --- a/clang/test/Misc/show-diag-options.c +++ b/clang/test/Misc/show-diag-options.c @@ -18,7 +18,7 @@ void test(int x, int y) { // BASE: {{.*}}: warning: {{[a-z ]+$}} // OPTION: {{.*}}: warning: {{[a-z ]+}} [-Wparentheses] // OPTION_ERROR: {{.*}}: error: {{[a-z ]+}} [-Werror,-Wparentheses] - // CATEGORY_ID: {{.*}}: warning: {{[a-z ]+}} [2] + // CATEGORY_ID: {{.*}}: warning: {{[a-z ]+}} [{{[0-9]+}}] // CATEGORY_NAME: {{.*}}: warning: {{[a-z ]+}} [Semantic Issue] // OPTION_ERROR_CATEGORY: {{.*}}: error: {{[a-z ]+}} [-Werror,-Wparentheses,Semantic Issue] diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index 8f846a4744bbf..73facbc916714 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -1518,6 +1518,50 @@ static void verifyDiagnosticWording(const Record &Diag) { // runs into odd situations like [[clang::warn_unused_result]], // #pragma clang, or --unwindlib=libgcc. } + +/// ClangDiagsCompatIDsEmitter - Emit a set of 'compatibility diagnostic ids' +/// that map to a set of 2 regular diagnostic ids each and which are used to +/// simplify emitting compatibility warnings. +void clang::EmitClangDiagsCompatIDs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS, + const std::string &Component) { + ArrayRef Ids = + Records.getAllDerivedDefinitions("CompatWarningId"); + + StringRef PrevComponent = ""; + for (auto [I, R] : enumerate(make_pointee_range(Ids))) { + StringRef DiagComponent = R.getValueAsString("Component"); + if (!Component.empty() && Component != DiagComponent) + continue; + + StringRef CompatDiagName = R.getValueAsString("Name"); + StringRef Diag = R.getValueAsString("Diag"); + StringRef DiagPre = R.getValueAsString("DiagPre"); + int64_t CXXStdVer = R.getValueAsInt("Std"); + + // We don't want to create empty enums since some compilers (including + // Clang) warn about that, so these macros are used to avoid having to + // unconditionally write 'enum {' and '};' in the headers. + if (PrevComponent != DiagComponent) { + if (!PrevComponent.empty()) + OS << "DIAG_COMPAT_IDS_END()\n"; + OS << "DIAG_COMPAT_IDS_BEGIN()\n"; + PrevComponent = DiagComponent; + } + + // FIXME: We sometimes define multiple compat diagnostics with the same + // name, e.g. 'constexpr_body_invalid_stmt' exists for C++14/20/23. It would + // be nice if we could combine all of them into a single compatibility diag + // id. + OS << "DIAG_COMPAT_ID(" << I << ","; + OS << CompatDiagName << "," << CXXStdVer << "," << Diag << "," << DiagPre; + OS << ")\n"; + } + + if (!PrevComponent.empty()) + OS << "DIAG_COMPAT_IDS_END()\n"; +} + /// ClangDiagsEnumsEmitter - The top-level class emits .def files containing /// declarations of Clang diagnostic enums for selects. 
void clang::EmitClangDiagsEnums(const RecordKeeper &Records, raw_ostream &OS, diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 7ffe6d2a913a9..4d3d56ed4b9d7 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -48,6 +48,7 @@ enum ActionType { GenClangBasicWriter, GenClangBuiltins, GenClangBuiltinTemplates, + GenClangDiagsCompatIDs, GenClangDiagsDefs, GenClangDiagsEnums, GenClangDiagGroups, @@ -176,6 +177,8 @@ cl::opt Action( "Generate clang builtins list"), clEnumValN(GenClangBuiltinTemplates, "gen-clang-builtin-templates", "Generate clang builtins list"), + clEnumValN(GenClangDiagsCompatIDs, "gen-clang-diags-compat-ids", + "Generate Clang diagnostic compatibility ids"), clEnumValN(GenClangDiagsDefs, "gen-clang-diags-defs", "Generate Clang diagnostics definitions"), clEnumValN(GenClangDiagsEnums, "gen-clang-diags-enums", @@ -399,6 +402,9 @@ bool ClangTableGenMain(raw_ostream &OS, const RecordKeeper &Records) { case GenClangBuiltinTemplates: EmitClangBuiltinTemplates(Records, OS); break; + case GenClangDiagsCompatIDs: + EmitClangDiagsCompatIDs(Records, OS, ClangComponent); + break; case GenClangDiagsDefs: EmitClangDiagsDefs(Records, OS, ClangComponent); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index ae8ea3ad34aa5..c26ce2825ea99 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -91,6 +91,9 @@ void EmitClangBuiltins(const llvm::RecordKeeper &Records, void EmitClangBuiltinTemplates(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangDiagsCompatIDs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS, + const std::string &Component); void EmitClangDiagsDefs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS, const std::string &Component); void EmitClangDiagsEnums(const llvm::RecordKeeper &Records, From a2ca2f3f10002da61e9860d0ce11e0272482baba Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Wed, 2 Apr 2025 02:10:55 -0400 Subject: [PATCH 0342/1029] [CIR] Fix cir-canonicalize pass upstreaming issues. NFC - Fix typos in 'RemoveEmptyScope' pattern rewriting and combine 'match' and 'rewrite' into 'matchAndRewrite' as they are deprecated. 
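For reference, the combined form keeps every match check ahead of any IR
mutation, since a pattern that returns failure() must leave the IR untouched.
A minimal sketch of that shape (illustrative only: `MyOp` and the
`isRemovable` predicate are placeholders, not names from this change):

```cpp
#include "mlir/IR/PatternMatch.h"

struct EraseRemovableOp : public mlir::OpRewritePattern<MyOp> {
  using OpRewritePattern::OpRewritePattern;

  mlir::LogicalResult
  matchAndRewrite(MyOp op, mlir::PatternRewriter &rewriter) const override {
    // Decide first; mutate the IR only once the match is known to succeed.
    if (!isRemovable(op)) // hypothetical predicate
      return mlir::failure();
    rewriter.eraseOp(op);
    return mlir::success();
  }
};
```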
--- .../Dialect/Transforms/CIRCanonicalize.cpp | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp index 1d2d02312d941..cdac69e66dba3 100644 --- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp +++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp @@ -61,26 +61,27 @@ struct RemoveRedundantBranches : public OpRewritePattern { } }; -struct RemoveEmptyScope - : public OpRewritePattern::SplitMatchAndRewrite { - using SplitMatchAndRewrite::SplitMatchAndRewrite; +struct RemoveEmptyScope : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult match(ScopeOp op) const final { + LogicalResult matchAndRewrite(ScopeOp op, + PatternRewriter &rewriter) const final { // TODO: Remove this logic once CIR uses MLIR infrastructure to remove // trivially dead operations - if (op.isEmpty()) + if (op.isEmpty()) { + rewriter.eraseOp(op); return success(); + } Region ®ion = op.getScopeRegion(); - if (region.getBlocks().front().getOperations().size() == 1) - return success(isa(region.getBlocks().front().front())); + if (region.getBlocks().front().getOperations().size() == 1 && + isa(region.getBlocks().front().front())) { + rewriter.eraseOp(op); + return success(); + } return failure(); } - - void rewrite(ScopeOp op, PatternRewriter &rewriter) const final { - rewriter.eraseOp(op); - } }; //===----------------------------------------------------------------------===// From 076397ff3217cf45fd08024dd7bcd2bc8fb229ab Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 2 Apr 2025 08:29:05 +0200 Subject: [PATCH 0343/1029] Revert "[Clang] [NFC] Introduce a helper for emitting compatibility diagnostics" (#134036) Reverts llvm/llvm-project#132348 Some tests are failing and I still need to figure out what is going on here. 
--- clang/include/clang/Basic/CMakeLists.txt | 10 --- clang/include/clang/Basic/Diagnostic.td | 18 ----- clang/include/clang/Basic/DiagnosticAST.h | 12 --- .../include/clang/Basic/DiagnosticAnalysis.h | 12 --- clang/include/clang/Basic/DiagnosticComment.h | 12 --- clang/include/clang/Basic/DiagnosticCrossTU.h | 12 --- clang/include/clang/Basic/DiagnosticDriver.h | 12 --- .../include/clang/Basic/DiagnosticFrontend.h | 12 --- clang/include/clang/Basic/DiagnosticIDs.h | 18 ----- .../clang/Basic/DiagnosticInstallAPI.h | 12 --- clang/include/clang/Basic/DiagnosticLex.h | 12 --- clang/include/clang/Basic/DiagnosticParse.h | 12 --- .../clang/Basic/DiagnosticParseKinds.td | 12 +-- .../clang/Basic/DiagnosticRefactoring.h | 12 --- clang/include/clang/Basic/DiagnosticSema.h | 13 ---- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- .../clang/Basic/DiagnosticSerialization.h | 12 --- clang/include/clang/Parse/Parser.h | 6 -- clang/include/clang/Sema/SemaBase.h | 4 - clang/lib/Basic/DiagnosticIDs.cpp | 46 ------------ clang/lib/Parse/ParseDecl.cpp | 7 +- clang/lib/Parse/Parser.cpp | 10 --- clang/lib/Sema/SemaBase.cpp | 7 -- clang/lib/Sema/SemaDecl.cpp | 11 ++- clang/lib/Sema/SemaDeclCXX.cpp | 75 +++++++++++++------ clang/lib/Sema/SemaExpr.cpp | 4 +- clang/lib/Sema/SemaTemplate.cpp | 19 ++++- clang/test/Misc/show-diag-options.c | 2 +- .../TableGen/ClangDiagnosticsEmitter.cpp | 44 ----------- clang/utils/TableGen/TableGen.cpp | 6 -- clang/utils/TableGen/TableGenBackends.h | 3 - 31 files changed, 93 insertions(+), 356 deletions(-) diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 6be6d063c20b4..a671d5c764c22 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -8,11 +8,6 @@ macro(clang_diag_gen component) -gen-clang-diags-enums -clang-component=${component} SOURCE Diagnostic.td TARGET ClangDiagnostic${component}Enums) - - clang_tablegen(Diagnostic${component}CompatIDs.inc - -gen-clang-diags-compat-ids -clang-component=${component} - SOURCE Diagnostic.td - TARGET ClangDiagnostic${component}CompatIDs) endmacro(clang_diag_gen) clang_diag_gen(Analysis) @@ -36,11 +31,6 @@ clang_tablegen(DiagnosticIndexName.inc -gen-clang-diags-index-name SOURCE Diagnostic.td TARGET ClangDiagnosticIndexName) -clang_tablegen(DiagnosticAllCompatIDs.inc - -gen-clang-diags-compat-ids - SOURCE Diagnostic.td - TARGET ClangDiagnosticAllCompatIDs) - clang_tablegen(AttrList.inc -gen-clang-attr-list -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE Attr.td diff --git a/clang/include/clang/Basic/Diagnostic.td b/clang/include/clang/Basic/Diagnostic.td index 65b19f3feea4f..b31d846210a8e 100644 --- a/clang/include/clang/Basic/Diagnostic.td +++ b/clang/include/clang/Basic/Diagnostic.td @@ -155,19 +155,6 @@ class DefaultWarnNoWerror { } class DefaultRemark { Severity DefaultSeverity = SEV_Remark; } -class CompatWarningId { - string Component = ?; - string Name = name; - string Diag = diag; - string DiagPre = diag_pre; - int Std = std; - - // This is unused, but Tablegen will complain if it's missing because we define - // the compatibility ids in the same place as the other diagnostics (which means - // that we'll be inside a 'let CategoryName = "" in { ... }' block). - string CategoryName = ?; -} - // C++ compatibility warnings. 
multiclass CXXCompat< string message, @@ -191,11 +178,6 @@ multiclass CXXCompat< "CXX98Compat", "CXXPre"#std_ver#"Compat"))>, DefaultIgnore; - - def : CompatWarningId< - NAME, std_ver, - "compat_cxx"#std_ver#"_"#NAME, - "compat_pre_cxx"#std_ver#"_"#NAME>; } // These generate pairs of C++ compatibility warnings of the form: diff --git a/clang/include/clang/Basic/DiagnosticAST.h b/clang/include/clang/Basic/DiagnosticAST.h index 41e2598f7cc3b..4f82114b7406b 100644 --- a/clang/include/clang/Basic/DiagnosticAST.h +++ b/clang/include/clang/Basic/DiagnosticAST.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticASTCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICAST_H diff --git a/clang/include/clang/Basic/DiagnosticAnalysis.h b/clang/include/clang/Basic/DiagnosticAnalysis.h index 5ead092b946c5..1a49461bcd173 100644 --- a/clang/include/clang/Basic/DiagnosticAnalysis.h +++ b/clang/include/clang/Basic/DiagnosticAnalysis.h @@ -35,18 +35,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticAnalysisCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICANALYSIS_H diff --git a/clang/include/clang/Basic/DiagnosticComment.h b/clang/include/clang/Basic/DiagnosticComment.h index 08e66e8051834..53143ef132e4b 100644 --- a/clang/include/clang/Basic/DiagnosticComment.h +++ b/clang/include/clang/Basic/DiagnosticComment.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticCommentCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICCOMMENT_H diff --git a/clang/include/clang/Basic/DiagnosticCrossTU.h b/clang/include/clang/Basic/DiagnosticCrossTU.h index 761716d781446..428da95011027 100644 --- a/clang/include/clang/Basic/DiagnosticCrossTU.h +++ b/clang/include/clang/Basic/DiagnosticCrossTU.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) 
NAME = IDX, -#include "clang/Basic/DiagnosticCrossTUCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICCROSSTU_H diff --git a/clang/include/clang/Basic/DiagnosticDriver.h b/clang/include/clang/Basic/DiagnosticDriver.h index 864a23a49e4cd..c472afa3f6e96 100644 --- a/clang/include/clang/Basic/DiagnosticDriver.h +++ b/clang/include/clang/Basic/DiagnosticDriver.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticDriverCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICDRIVER_H diff --git a/clang/include/clang/Basic/DiagnosticFrontend.h b/clang/include/clang/Basic/DiagnosticFrontend.h index 3506f05daae54..766cac3d655b3 100644 --- a/clang/include/clang/Basic/DiagnosticFrontend.h +++ b/clang/include/clang/Basic/DiagnosticFrontend.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticFrontendCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICFRONTEND_H diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index f2bd19f9b6e8a..017ef7065610f 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -25,7 +25,6 @@ namespace clang { class DiagnosticsEngine; class DiagnosticBuilder; - class LangOptions; class SourceLocation; // Import the diagnostic enums themselves. @@ -105,18 +104,6 @@ namespace clang { }; } - namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticCommonCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END - } // end namespace diag_compat - class DiagnosticMapping { LLVM_PREFERRED_TYPE(diag::Severity) unsigned Severity : 3; @@ -477,11 +464,6 @@ class DiagnosticIDs : public RefCountedBase { /// given group name. static StringRef getNearestOption(diag::Flavor Flavor, StringRef Group); - /// Get the appropriate diagnostic Id to use for issuing a compatibility - /// diagnostic. For use by the various DiagCompat() helpers. - static unsigned getCXXCompatDiagId(const LangOptions &LangOpts, - unsigned CompatDiagId); - private: /// Classify the specified diagnostic ID into a Level, consumable by /// the DiagnosticClient. 
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPI.h b/clang/include/clang/Basic/DiagnosticInstallAPI.h index 4619bfeea05a2..cbdb00362624b 100644 --- a/clang/include/clang/Basic/DiagnosticInstallAPI.h +++ b/clang/include/clang/Basic/DiagnosticInstallAPI.h @@ -35,17 +35,5 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticInstallAPICompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H diff --git a/clang/include/clang/Basic/DiagnosticLex.h b/clang/include/clang/Basic/DiagnosticLex.h index 6fa90f785bbf8..d14bf97e8642e 100644 --- a/clang/include/clang/Basic/DiagnosticLex.h +++ b/clang/include/clang/Basic/DiagnosticLex.h @@ -35,18 +35,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticLexCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICLEX_H diff --git a/clang/include/clang/Basic/DiagnosticParse.h b/clang/include/clang/Basic/DiagnosticParse.h index e2a4368a59c4b..275e1a4c39b3f 100644 --- a/clang/include/clang/Basic/DiagnosticParse.h +++ b/clang/include/clang/Basic/DiagnosticParse.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticParseCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICPARSE_H diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 954f538e15026..2582e1e5ef0f6 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -11,12 +11,6 @@ //===----------------------------------------------------------------------===// let Component = "Parse" in { -let CategoryName = "Parse Issue" in { -// C++11 compatibility with C++98. 
-defm enum_fixed_underlying_type : CXX11Compat< - "enumeration types with a fixed underlying type are", - /*ext_warn=*/false>; -} def err_asm_qualifier_ignored : Error< "expected 'volatile', 'inline', 'goto', or '('">, CatInlineAsm; @@ -113,6 +107,9 @@ def err_enumerator_list_missing_comma : Error< "missing ',' between enumerators">; def err_enumerator_unnamed_no_def : Error< "unnamed enumeration must be a definition">; +def ext_cxx11_enum_fixed_underlying_type : Extension< + "enumeration types with a fixed underlying type are a C++11 extension">, + InGroup; def ext_ms_c_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a Microsoft extension">, InGroup; @@ -122,6 +119,9 @@ def ext_c23_enum_fixed_underlying_type : Extension< def warn_c17_compat_enum_fixed_underlying_type : Warning< "enumeration types with a fixed underlying type are incompatible with C standards before C23">, DefaultIgnore, InGroup; +def warn_cxx98_compat_enum_fixed_underlying_type : Warning< + "enumeration types with a fixed underlying type are incompatible with C++98">, + InGroup, DefaultIgnore; def ext_enum_base_in_type_specifier : ExtWarn< "non-defining declaration of enumeration with a fixed underlying type is " "only permitted as a standalone declaration" diff --git a/clang/include/clang/Basic/DiagnosticRefactoring.h b/clang/include/clang/Basic/DiagnosticRefactoring.h index b3f3a10925f09..59d4bc912733a 100644 --- a/clang/include/clang/Basic/DiagnosticRefactoring.h +++ b/clang/include/clang/Basic/DiagnosticRefactoring.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticRefactoringCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICREFACTORING_H diff --git a/clang/include/clang/Basic/DiagnosticSema.h b/clang/include/clang/Basic/DiagnosticSema.h index 943b2f64f427e..84986c7bccf71 100644 --- a/clang/include/clang/Basic/DiagnosticSema.h +++ b/clang/include/clang/Basic/DiagnosticSema.h @@ -35,20 +35,7 @@ enum { #undef DIAG_ENUM_END #undef DIAG_ENUM_ITEM #undef DIAG_ENUM - } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticSemaCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICSEMA_H diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 265bed2df43cf..5e45482584946 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -61,7 +61,7 @@ defm decomp_decl_cond : CXX26Compat<"structured binding declaration in a conditi // Compatibility warnings duplicated across multiple language versions. 
foreach std = [14, 20, 23] in { - defm cxx#std#_constexpr_body_invalid_stmt : CXXCompat< + defm constexpr_body_invalid_stmt : CXXCompat< "use of this statement in a constexpr %select{function|constructor}0 is", std>; } diff --git a/clang/include/clang/Basic/DiagnosticSerialization.h b/clang/include/clang/Basic/DiagnosticSerialization.h index c8fb034e9bd4a..6fb836dca1b04 100644 --- a/clang/include/clang/Basic/DiagnosticSerialization.h +++ b/clang/include/clang/Basic/DiagnosticSerialization.h @@ -36,18 +36,6 @@ enum { #undef DIAG_ENUM_ITEM #undef DIAG_ENUM } // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticSerializationCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat } // end namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICSERIALIZATION_H diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 5770692c42f13..c8ceef8f8987d 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1263,12 +1263,6 @@ class Parser : public CodeCompletionHandler { return Diag(Tok, DiagID); } - DiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId); - DiagnosticBuilder DiagCompat(const Token &Tok, unsigned CompatDiagId); - DiagnosticBuilder DiagCompat(unsigned CompatDiagId) { - return DiagCompat(Tok, CompatDiagId); - } - private: void SuggestParentheses(SourceLocation Loc, unsigned DK, SourceRange ParenRange); diff --git a/clang/include/clang/Sema/SemaBase.h b/clang/include/clang/Sema/SemaBase.h index 550f530af72f5..463cae83c7e81 100644 --- a/clang/include/clang/Sema/SemaBase.h +++ b/clang/include/clang/Sema/SemaBase.h @@ -219,10 +219,6 @@ class SemaBase { SemaDiagnosticBuilder Diag(SourceLocation Loc, const PartialDiagnostic &PD, bool DeferHint = false); - /// Emit a compatibility diagnostic. - SemaDiagnosticBuilder DiagCompat(SourceLocation Loc, unsigned CompatDiagId, - bool DeferHint = false); - /// Build a partial diagnostic. PartialDiagnostic PDiag(unsigned DiagID = 0); }; diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index d5928431f41a2..ca5b8d2da769e 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -13,7 +13,6 @@ #include "clang/Basic/DiagnosticIDs.h" #include "clang/Basic/AllDiagnostics.h" #include "clang/Basic/DiagnosticCategories.h" -#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceManager.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -770,51 +769,6 @@ StringRef DiagnosticIDs::getNearestOption(diag::Flavor Flavor, return Best; } -unsigned DiagnosticIDs::getCXXCompatDiagId(const LangOptions &LangOpts, - unsigned CompatDiagId) { - struct CompatDiag { - unsigned StdVer; - unsigned DiagId; - unsigned PreDiagId; - }; - - // We encode the standard version such that C++98 < C++11 < C++14 etc. The - // actual numbers don't really matter for this, but the definitions of the - // compat diags in the Tablegen file use the standard version number (i.e. - // 98, 11, 14, etc.), so we base the encoding here on that. -#define DIAG_COMPAT_IDS_BEGIN() -#define DIAG_COMPAT_IDS_END() -#define DIAG_COMPAT_ID(Value, Name, Std, Diag, DiagPre) \ - {Std == 98 ? 
1998 : 2000 + Std, diag::Diag, diag::DiagPre}, - static constexpr CompatDiag Diags[]{ -#include "clang/Basic/DiagnosticAllCompatIDs.inc" - }; -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END - - assert(CompatDiagId < std::size(Diags) && "Invalid compat diag id"); - - unsigned StdVer = [&] { - if (LangOpts.CPlusPlus26) - return 2026; - if (LangOpts.CPlusPlus23) - return 2023; - if (LangOpts.CPlusPlus20) - return 2020; - if (LangOpts.CPlusPlus17) - return 2017; - if (LangOpts.CPlusPlus14) - return 2014; - if (LangOpts.CPlusPlus11) - return 2011; - return 1998; - }(); - - const CompatDiag &D = Diags[CompatDiagId]; - return StdVer >= D.StdVer ? D.DiagId : D.PreDiagId; -} - /// ProcessDiag - This is the method used to report a diagnostic that is /// finally fully formed. bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index d77400e0f8272..3f156407edc99 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5452,8 +5452,11 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, BaseRange = SourceRange(ColonLoc, DeclaratorInfo.getSourceRange().getEnd()); if (!getLangOpts().ObjC) { - if (getLangOpts().CPlusPlus) - DiagCompat(ColonLoc, diag_compat::enum_fixed_underlying_type) + if (getLangOpts().CPlusPlus11) + Diag(ColonLoc, diag::warn_cxx98_compat_enum_fixed_underlying_type) + << BaseRange; + else if (getLangOpts().CPlusPlus) + Diag(ColonLoc, diag::ext_cxx11_enum_fixed_underlying_type) << BaseRange; else if (getLangOpts().MicrosoftExt && !getLangOpts().C23) Diag(ColonLoc, diag::ext_ms_c_enum_fixed_underlying_type) diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 1227514121973..2eca89179453b 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -89,16 +89,6 @@ DiagnosticBuilder Parser::Diag(const Token &Tok, unsigned DiagID) { return Diag(Tok.getLocation(), DiagID); } -DiagnosticBuilder Parser::DiagCompat(SourceLocation Loc, - unsigned CompatDiagId) { - return Diag(Loc, - DiagnosticIDs::getCXXCompatDiagId(getLangOpts(), CompatDiagId)); -} - -DiagnosticBuilder Parser::DiagCompat(const Token &Tok, unsigned CompatDiagId) { - return DiagCompat(Tok.getLocation(), CompatDiagId); -} - /// Emits a diagnostic suggesting parentheses surrounding a /// given range. /// diff --git a/clang/lib/Sema/SemaBase.cpp b/clang/lib/Sema/SemaBase.cpp index 9b677f446f3e6..85c4a0ab40fed 100644 --- a/clang/lib/Sema/SemaBase.cpp +++ b/clang/lib/Sema/SemaBase.cpp @@ -88,11 +88,4 @@ Sema::SemaDiagnosticBuilder SemaBase::Diag(SourceLocation Loc, return Diag(Loc, PD.getDiagID(), DeferHint) << PD; } -SemaBase::SemaDiagnosticBuilder SemaBase::DiagCompat(SourceLocation Loc, - unsigned CompatDiagId, - bool DeferHint) { - return Diag(Loc, - DiagnosticIDs::getCXXCompatDiagId(getLangOpts(), CompatDiagId), - DeferHint); -} } // namespace clang diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 9b7b3f856cc55..bbefbbf294dd1 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7649,7 +7649,10 @@ NamedDecl *Sema::ActOnVariableDeclarator( IsVariableTemplate = true; // Only C++1y supports variable templates (N3651). - DiagCompat(D.getIdentifierLoc(), diag_compat::variable_template); + Diag(D.getIdentifierLoc(), + getLangOpts().CPlusPlus14 + ? 
diag::compat_cxx14_variable_template + : diag::compat_pre_cxx14_variable_template); } } } else { @@ -7715,8 +7718,10 @@ NamedDecl *Sema::ActOnVariableDeclarator( } else if (RD->isUnion()) { // C++98 [class.union]p1: If a union contains a static data member, // the program is ill-formed. C++11 drops this restriction. - DiagCompat(D.getIdentifierLoc(), - diag_compat::static_data_member_in_union) + Diag(D.getIdentifierLoc(), + getLangOpts().CPlusPlus11 + ? diag::compat_cxx11_static_data_member_in_union + : diag::compat_pre_cxx11_static_data_member_in_union) << Name; } } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index b6ba7231d9a26..43bf9b7cd0f95 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -807,8 +807,10 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D, for (auto Loc : BadSpecifierLocs) Err << SourceRange(Loc, Loc); } else if (!CPlusPlus20Specifiers.empty()) { - auto &&Warn = DiagCompat(CPlusPlus20SpecifierLocs.front(), - diag_compat::decomp_decl_spec); + auto &&Warn = Diag(CPlusPlus20SpecifierLocs.front(), + getLangOpts().CPlusPlus20 + ? diag::compat_cxx20_decomp_decl_spec + : diag::compat_pre_cxx20_decomp_decl_spec); Warn << (int)CPlusPlus20Specifiers.size() << llvm::join(CPlusPlus20Specifiers.begin(), CPlusPlus20Specifiers.end(), " "); @@ -2038,8 +2040,10 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, // C++1y allows types to be defined, not just declared. if (cast(DclIt)->isThisDeclarationADefinition()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.DiagCompat(DS->getBeginLoc(), - diag_compat::constexpr_type_definition) + SemaRef.Diag(DS->getBeginLoc(), + SemaRef.getLangOpts().CPlusPlus14 + ? diag::compat_cxx14_constexpr_type_definition + : diag::compat_pre_cxx14_constexpr_type_definition) << isa(Dcl); } else if (!SemaRef.getLangOpts().CPlusPlus14) { return false; @@ -2064,8 +2068,10 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, if (VD->isThisDeclarationADefinition()) { if (VD->isStaticLocal()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.DiagCompat(VD->getLocation(), - diag_compat::constexpr_static_var) + SemaRef.Diag(VD->getLocation(), + SemaRef.getLangOpts().CPlusPlus23 + ? diag::compat_cxx23_constexpr_static_var + : diag::compat_pre_cxx23_constexpr_static_var) << isa(Dcl) << (VD->getTLSKind() == VarDecl::TLS_Dynamic); } else if (!SemaRef.getLangOpts().CPlusPlus23) { @@ -2085,8 +2091,11 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, if (!VD->getType()->isDependentType() && !VD->hasInit() && !VD->isCXXForRangeDecl()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.DiagCompat(VD->getLocation(), - diag_compat::constexpr_local_var_no_init) + SemaRef.Diag( + VD->getLocation(), + SemaRef.getLangOpts().CPlusPlus20 + ? diag::compat_cxx20_constexpr_local_var_no_init + : diag::compat_pre_cxx20_constexpr_local_var_no_init) << isa(Dcl); } else if (!SemaRef.getLangOpts().CPlusPlus20) { return false; @@ -2095,7 +2104,10 @@ static bool CheckConstexprDeclStmt(Sema &SemaRef, const FunctionDecl *Dcl, } } if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.DiagCompat(VD->getLocation(), diag_compat::constexpr_local_var) + SemaRef.Diag(VD->getLocation(), + SemaRef.getLangOpts().CPlusPlus14 + ? 
diag::compat_cxx14_constexpr_local_var + : diag::compat_pre_cxx14_constexpr_local_var) << isa(Dcl); } else if (!SemaRef.getLangOpts().CPlusPlus14) { return false; @@ -2165,8 +2177,10 @@ static bool CheckConstexprCtorInitializer(Sema &SemaRef, if (!Inits.count(Field)) { if (Kind == Sema::CheckConstexprKind::Diagnose) { if (!Diagnosed) { - SemaRef.DiagCompat(Dcl->getLocation(), - diag_compat::constexpr_ctor_missing_init); + SemaRef.Diag(Dcl->getLocation(), + SemaRef.getLangOpts().CPlusPlus20 + ? diag::compat_cxx20_constexpr_ctor_missing_init + : diag::compat_pre_cxx20_constexpr_ctor_missing_init); Diagnosed = true; } SemaRef.Diag(Field->getLocation(), @@ -2377,8 +2391,10 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, break; case Sema::CheckConstexprKind::Diagnose: - SemaRef.DiagCompat(Body->getBeginLoc(), - diag_compat::constexpr_function_try_block) + SemaRef.Diag(Body->getBeginLoc(), + SemaRef.getLangOpts().CPlusPlus20 + ? diag::compat_cxx20_constexpr_function_try_block + : diag::compat_pre_cxx20_constexpr_function_try_block) << isa(Dcl); break; } @@ -2405,13 +2421,22 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, (Cxx1yLoc.isValid() && !SemaRef.getLangOpts().CPlusPlus17)) return false; } else if (Cxx2bLoc.isValid()) { - SemaRef.DiagCompat(Cxx2bLoc, diag_compat::cxx23_constexpr_body_invalid_stmt) + SemaRef.Diag(Cxx2bLoc, + SemaRef.getLangOpts().CPlusPlus23 + ? diag::compat_cxx23_constexpr_body_invalid_stmt + : diag::compat_pre_cxx23_constexpr_body_invalid_stmt) << isa(Dcl); } else if (Cxx2aLoc.isValid()) { - SemaRef.DiagCompat(Cxx2aLoc, diag_compat::cxx20_constexpr_body_invalid_stmt) + SemaRef.Diag(Cxx2aLoc, + SemaRef.getLangOpts().CPlusPlus20 + ? diag::compat_cxx20_constexpr_body_invalid_stmt + : diag::compat_pre_cxx20_constexpr_body_invalid_stmt) << isa(Dcl); } else if (Cxx1yLoc.isValid()) { - SemaRef.DiagCompat(Cxx1yLoc, diag_compat::cxx14_constexpr_body_invalid_stmt) + SemaRef.Diag(Cxx1yLoc, + SemaRef.getLangOpts().CPlusPlus14 + ? diag::compat_cxx14_constexpr_body_invalid_stmt + : diag::compat_pre_cxx14_constexpr_body_invalid_stmt) << isa(Dcl); } @@ -2428,8 +2453,11 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, if (Constructor->getNumCtorInitializers() == 0 && RD->hasVariantMembers()) { if (Kind == Sema::CheckConstexprKind::Diagnose) { - SemaRef.DiagCompat(Dcl->getLocation(), - diag_compat::constexpr_union_ctor_no_init); + SemaRef.Diag( + Dcl->getLocation(), + SemaRef.getLangOpts().CPlusPlus20 + ? diag::compat_cxx20_constexpr_union_ctor_no_init + : diag::compat_pre_cxx20_constexpr_union_ctor_no_init); } else if (!SemaRef.getLangOpts().CPlusPlus20) { return false; } @@ -2492,8 +2520,11 @@ static bool CheckConstexprFunctionBody(Sema &SemaRef, const FunctionDecl *Dcl, } else if (ReturnStmts.size() > 1) { switch (Kind) { case Sema::CheckConstexprKind::Diagnose: - SemaRef.DiagCompat(ReturnStmts.back(), - diag_compat::constexpr_body_multiple_return); + SemaRef.Diag( + ReturnStmts.back(), + SemaRef.getLangOpts().CPlusPlus14 + ? 
diag::compat_cxx14_constexpr_body_multiple_return + : diag::compat_pre_cxx14_constexpr_body_multiple_return); for (unsigned I = 0; I < ReturnStmts.size() - 1; ++I) SemaRef.Diag(ReturnStmts[I], diag::note_constexpr_body_previous_return); @@ -17794,7 +17825,9 @@ Decl *Sema::ActOnFriendTypeDecl(Scope *S, const DeclSpec &DS, << FixItHint::CreateInsertion(getLocForEndOfToken(FriendLoc), InsertionText); } else { - DiagCompat(FriendLoc, diag_compat::nonclass_type_friend) + Diag(FriendLoc, getLangOpts().CPlusPlus11 + ? diag::compat_cxx11_nonclass_type_friend + : diag::compat_pre_cxx11_nonclass_type_friend) << T << DS.getSourceRange(); } } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e7f418ae6802e..7cc8374e69d73 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -6508,7 +6508,9 @@ ExprResult Sema::ActOnCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc, if (const auto *ULE = dyn_cast(Fn); ULE && ULE->hasExplicitTemplateArgs() && ULE->decls_begin() == ULE->decls_end()) { - DiagCompat(Fn->getExprLoc(), diag_compat::adl_only_template_id) + Diag(Fn->getExprLoc(), getLangOpts().CPlusPlus20 + ? diag::compat_cxx20_adl_only_template_id + : diag::compat_pre_cxx20_adl_only_template_id) << ULE->getName(); } diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 1f87ef4b27bab..eace9b87a5bfe 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2234,7 +2234,10 @@ static bool DiagnoseDefaultTemplateArgument(Sema &S, // template-argument, that declaration shall be a definition and shall be // the only declaration of the function template in the translation unit. // (C++98/03 doesn't have this wording; see DR226). - S.DiagCompat(ParamLoc, diag_compat::templ_default_in_function_templ) + S.Diag(ParamLoc, + S.getLangOpts().CPlusPlus11 + ? diag::compat_cxx11_templ_default_in_function_templ + : diag::compat_pre_cxx11_templ_default_in_function_templ) << DefArgRange; return false; @@ -6429,7 +6432,10 @@ static bool CheckTemplateArgumentAddressOfObjectOrFunction( bool ExtraParens = false; while (ParenExpr *Parens = dyn_cast(Arg)) { if (!Invalid && !ExtraParens) { - S.DiagCompat(Arg->getBeginLoc(), diag_compat::template_arg_extra_parens) + S.Diag(Arg->getBeginLoc(), + S.getLangOpts().CPlusPlus11 + ? diag::compat_cxx11_template_arg_extra_parens + : diag::compat_pre_cxx11_template_arg_extra_parens) << Arg->getSourceRange(); ExtraParens = true; } @@ -6649,7 +6655,10 @@ CheckTemplateArgumentPointerToMember(Sema &S, NonTypeTemplateParmDecl *Param, bool ExtraParens = false; while (ParenExpr *Parens = dyn_cast(Arg)) { if (!Invalid && !ExtraParens) { - S.DiagCompat(Arg->getBeginLoc(), diag_compat::template_arg_extra_parens) + S.Diag(Arg->getBeginLoc(), + S.getLangOpts().CPlusPlus11 + ? diag::compat_cxx11_template_arg_extra_parens + : diag::compat_pre_cxx11_template_arg_extra_parens) << Arg->getSourceRange(); ExtraParens = true; } @@ -10630,7 +10639,9 @@ TypeResult Sema::ActOnTypenameType(Scope *S, SourceLocation TypenameLoc, return true; if (TypenameLoc.isValid() && S && !S->getTemplateParamParent()) - DiagCompat(TypenameLoc, diag_compat::typename_outside_of_template) + Diag(TypenameLoc, getLangOpts().CPlusPlus11 + ? 
diag::compat_cxx11_typename_outside_of_template + : diag::compat_pre_cxx11_typename_outside_of_template) << FixItHint::CreateRemoval(TypenameLoc); NestedNameSpecifierLoc QualifierLoc = SS.getWithLocInContext(Context); diff --git a/clang/test/Misc/show-diag-options.c b/clang/test/Misc/show-diag-options.c index b09820379da36..4e98d63195f10 100644 --- a/clang/test/Misc/show-diag-options.c +++ b/clang/test/Misc/show-diag-options.c @@ -18,7 +18,7 @@ void test(int x, int y) { // BASE: {{.*}}: warning: {{[a-z ]+$}} // OPTION: {{.*}}: warning: {{[a-z ]+}} [-Wparentheses] // OPTION_ERROR: {{.*}}: error: {{[a-z ]+}} [-Werror,-Wparentheses] - // CATEGORY_ID: {{.*}}: warning: {{[a-z ]+}} [{{[0-9]+}}] + // CATEGORY_ID: {{.*}}: warning: {{[a-z ]+}} [2] // CATEGORY_NAME: {{.*}}: warning: {{[a-z ]+}} [Semantic Issue] // OPTION_ERROR_CATEGORY: {{.*}}: error: {{[a-z ]+}} [-Werror,-Wparentheses,Semantic Issue] diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index 73facbc916714..8f846a4744bbf 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -1518,50 +1518,6 @@ static void verifyDiagnosticWording(const Record &Diag) { // runs into odd situations like [[clang::warn_unused_result]], // #pragma clang, or --unwindlib=libgcc. } - -/// ClangDiagsCompatIDsEmitter - Emit a set of 'compatibility diagnostic ids' -/// that map to a set of 2 regular diagnostic ids each and which are used to -/// simplify emitting compatibility warnings. -void clang::EmitClangDiagsCompatIDs(const llvm::RecordKeeper &Records, - llvm::raw_ostream &OS, - const std::string &Component) { - ArrayRef Ids = - Records.getAllDerivedDefinitions("CompatWarningId"); - - StringRef PrevComponent = ""; - for (auto [I, R] : enumerate(make_pointee_range(Ids))) { - StringRef DiagComponent = R.getValueAsString("Component"); - if (!Component.empty() && Component != DiagComponent) - continue; - - StringRef CompatDiagName = R.getValueAsString("Name"); - StringRef Diag = R.getValueAsString("Diag"); - StringRef DiagPre = R.getValueAsString("DiagPre"); - int64_t CXXStdVer = R.getValueAsInt("Std"); - - // We don't want to create empty enums since some compilers (including - // Clang) warn about that, so these macros are used to avoid having to - // unconditionally write 'enum {' and '};' in the headers. - if (PrevComponent != DiagComponent) { - if (!PrevComponent.empty()) - OS << "DIAG_COMPAT_IDS_END()\n"; - OS << "DIAG_COMPAT_IDS_BEGIN()\n"; - PrevComponent = DiagComponent; - } - - // FIXME: We sometimes define multiple compat diagnostics with the same - // name, e.g. 'constexpr_body_invalid_stmt' exists for C++14/20/23. It would - // be nice if we could combine all of them into a single compatibility diag - // id. - OS << "DIAG_COMPAT_ID(" << I << ","; - OS << CompatDiagName << "," << CXXStdVer << "," << Diag << "," << DiagPre; - OS << ")\n"; - } - - if (!PrevComponent.empty()) - OS << "DIAG_COMPAT_IDS_END()\n"; -} - /// ClangDiagsEnumsEmitter - The top-level class emits .def files containing /// declarations of Clang diagnostic enums for selects. 
void clang::EmitClangDiagsEnums(const RecordKeeper &Records, raw_ostream &OS, diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 4d3d56ed4b9d7..7ffe6d2a913a9 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -48,7 +48,6 @@ enum ActionType { GenClangBasicWriter, GenClangBuiltins, GenClangBuiltinTemplates, - GenClangDiagsCompatIDs, GenClangDiagsDefs, GenClangDiagsEnums, GenClangDiagGroups, @@ -177,8 +176,6 @@ cl::opt Action( "Generate clang builtins list"), clEnumValN(GenClangBuiltinTemplates, "gen-clang-builtin-templates", "Generate clang builtins list"), - clEnumValN(GenClangDiagsCompatIDs, "gen-clang-diags-compat-ids", - "Generate Clang diagnostic compatibility ids"), clEnumValN(GenClangDiagsDefs, "gen-clang-diags-defs", "Generate Clang diagnostics definitions"), clEnumValN(GenClangDiagsEnums, "gen-clang-diags-enums", @@ -402,9 +399,6 @@ bool ClangTableGenMain(raw_ostream &OS, const RecordKeeper &Records) { case GenClangBuiltinTemplates: EmitClangBuiltinTemplates(Records, OS); break; - case GenClangDiagsCompatIDs: - EmitClangDiagsCompatIDs(Records, OS, ClangComponent); - break; case GenClangDiagsDefs: EmitClangDiagsDefs(Records, OS, ClangComponent); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index c26ce2825ea99..ae8ea3ad34aa5 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -91,9 +91,6 @@ void EmitClangBuiltins(const llvm::RecordKeeper &Records, void EmitClangBuiltinTemplates(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangDiagsCompatIDs(const llvm::RecordKeeper &Records, - llvm::raw_ostream &OS, - const std::string &Component); void EmitClangDiagsDefs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS, const std::string &Component); void EmitClangDiagsEnums(const llvm::RecordKeeper &Records, From 964650b69e57d5e9e6102a4b400c8da16f6a1e27 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 2 Apr 2025 13:38:46 +0700 Subject: [PATCH 0344/1029] llvm-reduce: Add reduceOperandsToPoison reduction (#132862) For now use it only for TargetExtTypes, which do not always support zero initializers. 
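A rough usage sketch of the new pass (the input file and test script names are
illustrative, not part of this patch):

```
$ llvm-reduce --delta-passes=operands-poison --test ./interesting.sh input.ll
```

Operands of target extension type that the interestingness test tolerates are
then rewritten to poison, as exercised by the test updates below.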
--- .../reduce-operands-target-ext-ty.ll | 5 +++++ llvm/tools/llvm-reduce/DeltaPasses.def | 1 + .../tools/llvm-reduce/deltas/ReduceOperands.cpp | 17 +++++++++++++++-- llvm/tools/llvm-reduce/deltas/ReduceOperands.h | 1 + 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/llvm-reduce/reduce-operands-target-ext-ty.ll b/llvm/test/tools/llvm-reduce/reduce-operands-target-ext-ty.ll index 1e1a8e7ec5e07..b3548dbb569db 100644 --- a/llvm/test/tools/llvm-reduce/reduce-operands-target-ext-ty.ll +++ b/llvm/test/tools/llvm-reduce/reduce-operands-target-ext-ty.ll @@ -4,12 +4,16 @@ ; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operands-one --test FileCheck --test-arg %s --test-arg --input-file %s -o %t ; RUN: FileCheck --check-prefixes=CHECK,ONE %s < %t +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operands-poison --test FileCheck --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck --check-prefixes=CHECK,POISON %s < %t + declare void @uses_ext_ty(target("sometarget.sometype")) ; TODO: Should support reduce to poison ; CHECK-LABEL: @foo( ; ZERO: call void @uses_ext_ty(target("sometarget.sometype") %arg) ; ONE: call void @uses_ext_ty(target("sometarget.sometype") %arg) +; POISON: call void @uses_ext_ty(target("sometarget.sometype") poison) define void @foo(target("sometarget.sometype") %arg) { call void @uses_ext_ty(target("sometarget.sometype") %arg) ret void @@ -20,6 +24,7 @@ declare void @uses_zeroinit_ext_ty(target("sometarget.sometype")) ; CHECK-LABEL: @bar( ; ZERO: call void @uses_zeroinit_ext_ty(target("spirv.sometype") zeroinitializer) ; ONE: call void @uses_zeroinit_ext_ty(target("spirv.sometype") %arg) +; POISON: call void @uses_zeroinit_ext_ty(target("spirv.sometype") poison) define void @bar(target("spirv.sometype") %arg) { call void @uses_zeroinit_ext_ty(target("spirv.sometype") %arg) ret void diff --git a/llvm/tools/llvm-reduce/DeltaPasses.def b/llvm/tools/llvm-reduce/DeltaPasses.def index 4c9c581924321..84d01ab6646b8 100644 --- a/llvm/tools/llvm-reduce/DeltaPasses.def +++ b/llvm/tools/llvm-reduce/DeltaPasses.def @@ -41,6 +41,7 @@ DELTA_PASS_IR("ir-passes", runIRPassesDeltaPass, "Running passes") DELTA_PASS_IR("operands-zero", reduceOperandsZeroDeltaPass, "Reducing Operands to zero") DELTA_PASS_IR("operands-one", reduceOperandsOneDeltaPass, "Reducing Operands to one") DELTA_PASS_IR("operands-nan", reduceOperandsNaNDeltaPass, "Reducing Operands to NaN") +DELTA_PASS_IR("operands-poison", reduceOperandsPoisonDeltaPass, "Reducing Operands to poison") DELTA_PASS_IR("operands-to-args", reduceOperandsToArgsDeltaPass, "Converting operands to function arguments") DELTA_PASS_IR("operands-skip", reduceOperandsSkipDeltaPass, "Reducing operands by skipping over instructions") DELTA_PASS_IR("operand-bundles", reduceOperandBundesDeltaPass, "Reducing Operand Bundles") diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp index c135f0c9e5c36..a4fdd9ce8033b 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp @@ -135,8 +135,6 @@ void llvm::reduceOperandsZeroDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { return nullptr; if (TET->hasProperty(TargetExtType::HasZeroInit)) return ConstantTargetNone::get(TET); - - // TODO: Poison reduction for this case return nullptr; } @@ -168,3 +166,18 @@ void llvm::reduceOperandsNaNDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) { }; extractOperandsFromModule(O, WorkItem, 
ReduceValue);
 }
+
+void llvm::reduceOperandsPoisonDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) {
+  auto ReduceValue = [](Use &Op) -> Value * {
+    Type *Ty = Op->getType();
+    if (auto *TET = dyn_cast<TargetExtType>(Ty)) {
+      if (isa<PoisonValue>(Op))
+        return nullptr;
+      return PoisonValue::get(TET);
+    }
+
+    return nullptr;
+  };
+
+  extractOperandsFromModule(O, WorkItem, ReduceValue);
+}
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperands.h b/llvm/tools/llvm-reduce/deltas/ReduceOperands.h
index 2c86ba920442b..cdd5a08056ca5 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceOperands.h
+++ b/llvm/tools/llvm-reduce/deltas/ReduceOperands.h
@@ -15,6 +15,7 @@ namespace llvm {
 void reduceOperandsOneDeltaPass(Oracle &, ReducerWorkItem &);
 void reduceOperandsZeroDeltaPass(Oracle &, ReducerWorkItem &);
 void reduceOperandsNaNDeltaPass(Oracle &, ReducerWorkItem &);
+void reduceOperandsPoisonDeltaPass(Oracle &, ReducerWorkItem &);
 } // namespace llvm

 #endif

From c47023dceb11fcb06c2405ea11eca10ea1139aa0 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga
Date: Wed, 2 Apr 2025 15:40:37 +0900
Subject: [PATCH 0345/1029] [clang][CodeGen] Make pragma-loop test more robust
 (NFC) (#133707)

pragma-loop.cpp contains tests for loop metadata generated by pragma
directives. These tests did not work as expected, because the regex `.*`
can consume multiple elements in the metadata. For example, there was a
check directive like this.

```
// CHECK: ![[LOOP_9]] = distinct !{![[LOOP_9]], ![[WIDTH_8:.*]], ![[FIXED_VEC]], ...}
```

In the above case, `[[WIDTH_8]]` would have been expected to match a
node like `[[WIDTH_8]] = !{!"llvm.loop.vectorize.width", i32 8}`.
However, since there is no check directive to verify the contents of
`[[WIDTH_8]]`, the regex `.*` consumed more than one element. There were
other similar cases. This patch fixes the problem by not using regex
matchers in the metadata content (except for follow-up metadata).
Instead, it uses string variables whose contents are validated
elsewhere.
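For example, after this change the contents of each node are pinned by a
definition check (this directive is taken from the diff below):

```
// CHECK-DAG: ![[WIDTH_8:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 8}
```

and every later reference uses plain `![[WIDTH_8]]` with no regex, so the
variable can only bind to that exact node.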
Related: https://github.com/llvm/llvm-project/pull/131985#discussion_r2014369699 --- clang/test/CodeGenCXX/pragma-loop.cpp | 92 +++++++++++++++------------ 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/clang/test/CodeGenCXX/pragma-loop.cpp b/clang/test/CodeGenCXX/pragma-loop.cpp index 76bdcc4a5a9c9..4857299f1c037 100644 --- a/clang/test/CodeGenCXX/pragma-loop.cpp +++ b/clang/test/CodeGenCXX/pragma-loop.cpp @@ -203,60 +203,70 @@ void for_test_scalable_1(int *List, int Length) { } } -// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], [[MP:![0-9]+]], ![[UNROLL_FULL:.*]]} -// CHECK: ![[UNROLL_FULL]] = !{!"llvm.loop.unroll.full"} +// CHECK-DAG: ![[MP:[0-9]+]] = !{!"llvm.loop.mustprogress"} -// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2]], [[MP]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]], ![[WIDTH_8:.*]], ![[FIXED_VEC:.*]], ![[INTERLEAVE_4:.*]], ![[VECTORIZE_ENABLE:.*]]} -// CHECK: ![[UNROLL_DISABLE]] = !{!"llvm.loop.unroll.disable"} -// CHECK: ![[DISTRIBUTE_DISABLE]] = !{!"llvm.loop.distribute.enable", i1 false} -// CHECK: ![[WIDTH_8]] = !{!"llvm.loop.vectorize.width", i32 8} -// CHECK: ![[FIXED_VEC]] = !{!"llvm.loop.vectorize.scalable.enable", i1 false} -// CHECK: ![[INTERLEAVE_4]] = !{!"llvm.loop.interleave.count", i32 4} -// CHECK: ![[VECTORIZE_ENABLE]] = !{!"llvm.loop.vectorize.enable", i1 true} +// CHECK-DAG: ![[UNROLL_DISABLE:[0-9]+]] = !{!"llvm.loop.unroll.disable"} +// CHECK-DAG: ![[UNROLL_8:[0-9]+]] = !{!"llvm.loop.unroll.count", i32 8} +// CHECK-DAG: ![[UNROLL_24:[0-9]+]] = !{!"llvm.loop.unroll.count", i32 24} +// CHECK-DAG: ![[UNROLL_32:[0-9]+]] = !{!"llvm.loop.unroll.count", i32 32} +// CHECK-DAG: ![[UNROLL_FULL:[0-9]+]] = !{!"llvm.loop.unroll.full"} -// CHECK: ![[LOOP_3]] = distinct !{![[LOOP_3]], [[MP]], ![[INTERLEAVE_4:.*]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_3:.*]]} -// CHECK: ![[FOLLOWUP_VECTOR_3]] = !{!"llvm.loop.vectorize.followup_all", [[MP]], ![[ISVECTORIZED:.*]], ![[UNROLL_8:.*]]} -// CHECK: ![[ISVECTORIZED]] = !{!"llvm.loop.isvectorized"} -// CHECK: ![[UNROLL_8]] = !{!"llvm.loop.unroll.count", i32 8} +// CHECK-DAG: ![[DISTRIBUTE_DISABLE:[0-9]+]] = !{!"llvm.loop.distribute.enable", i1 false} -// CHECK: ![[LOOP_4]] = distinct !{![[LOOP_4]], ![[WIDTH_2:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_2:.*]], ![[VECTORIZE_ENABLE]]} -// CHECK: ![[WIDTH_2]] = !{!"llvm.loop.vectorize.width", i32 2} -// CHECK: ![[INTERLEAVE_2]] = !{!"llvm.loop.interleave.count", i32 2} +// CHECK-DAG: ![[INTERLEAVE_2:[0-9]+]] = !{!"llvm.loop.interleave.count", i32 2} +// CHECK-DAG: ![[INTERLEAVE_4:[0-9]+]] = !{!"llvm.loop.interleave.count", i32 4} +// CHECK-DAG: ![[INTERLEAVE_8:[0-9]+]] = !{!"llvm.loop.interleave.count", i32 8} +// CHECK-DAG: ![[INTERLEAVE_10:[0-9]+]] = !{!"llvm.loop.interleave.count", i32 10} +// CHECK-DAG: ![[INTERLEAVE_16:[0-9]+]] = !{!"llvm.loop.interleave.count", i32 16} -// CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]], ![[WIDTH_1:.*]]} -// CHECK: ![[WIDTH_1]] = !{!"llvm.loop.vectorize.width", i32 1} +// CHECK-DAG: ![[VECTORIZE_ENABLE:[0-9]+]] = !{!"llvm.loop.vectorize.enable", i1 true} +// CHECK-DAG: ![[FIXED_VEC:[0-9]+]] = !{!"llvm.loop.vectorize.scalable.enable", i1 false} +// CHECK-DAG: ![[SCALABLE_VEC:[0-9]+]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +// CHECK-DAG: ![[WIDTH_1:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 1} +// CHECK-DAG: ![[WIDTH_2:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 2} +// CHECK-DAG: ![[WIDTH_5:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 5} +// CHECK-DAG: 
![[WIDTH_6:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 6} +// CHECK-DAG: ![[WIDTH_8:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 8} +// CHECK-DAG: ![[WIDTH_10:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 10} +// CHECK-DAG: ![[WIDTH_16:[0-9]+]] = !{!"llvm.loop.vectorize.width", i32 16} -// CHECK: ![[LOOP_6]] = distinct !{![[LOOP_6]], [[MP]], ![[WIDTH_2:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_2:.*]], ![[FOLLOWUP_VECTOR_3]]} +// CHECK-DAG: ![[ISVECTORIZED:[0-9]+]] = !{!"llvm.loop.isvectorized"} -// CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], [[MP]], ![[WIDTH_5:.*]], ![[FIXED_VEC]], ![[VECTORIZE_ENABLE]]} -// CHECK: ![[WIDTH_5]] = !{!"llvm.loop.vectorize.width", i32 5} +// CHECK-DAG: ![[LOOP_1]] = distinct !{![[LOOP_1]], ![[MP]], ![[UNROLL_FULL]]} -// CHECK: ![[LOOP_8]] = distinct !{![[LOOP_8]], [[MP]], ![[WIDTH_5:.*]], ![[FIXED_VEC]], ![[VECTORIZE_ENABLE]]} +// CHECK-DAG: ![[LOOP_2]] = distinct !{![[LOOP_2]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_8]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} -// CHECK: ![[LOOP_9]] = distinct !{![[LOOP_9]], ![[WIDTH_8:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_8:.*]], ![[FOLLOWUP_VECTOR_3]]} +// CHECK-DAG: ![[LOOP_3]] = distinct !{![[LOOP_3]], ![[MP]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_3:[0-9]+]]} +// CHECK-DAG: ![[FOLLOWUP_VECTOR_3]] = !{!"llvm.loop.vectorize.followup_all", ![[MP]], ![[ISVECTORIZED]], ![[UNROLL_8]]} -// CHECK: ![[LOOP_10]] = distinct !{![[LOOP_10]], ![[WIDTH_2:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_2:.*]], ![[FOLLOWUP_VECTOR_3]]} +// CHECK-DAG: ![[LOOP_4]] = distinct !{![[LOOP_4]], ![[WIDTH_2]], ![[FIXED_VEC]], ![[INTERLEAVE_2]], ![[VECTORIZE_ENABLE]]} -// CHECK: ![[LOOP_11]] = distinct !{![[LOOP_11]], ![[WIDTH_2:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_4:.*]], ![[FOLLOWUP_VECTOR_3]]} +// CHECK-DAG: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]]} -// CHECK: ![[LOOP_12]] = distinct !{![[LOOP_12]], ![[WIDTH_6:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_10:.*]], ![[FOLLOWUP_VECTOR_12:.*]]} -// CHECK: ![[FOLLOWUP_VECTOR_12]] = !{!"llvm.loop.vectorize.followup_all", ![[ISVECTORIZED:.*]], ![[UNROLL_24:.*]]} -// CHECK: ![[UNROLL_24]] = !{!"llvm.loop.unroll.count", i32 24} +// CHECK-DAG: ![[LOOP_6]] = distinct !{![[LOOP_6]], ![[MP]], ![[WIDTH_2]], ![[FIXED_VEC]], ![[INTERLEAVE_2]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_3]]} -// CHECK: ![[LOOP_13]] = distinct !{![[LOOP_13]], ![[WIDTH_8:.*]], ![[INTERLEAVE_16:.*]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_13:.*]]} -// CHECK: ![[INTERLEAVE_16]] = !{!"llvm.loop.interleave.count", i32 16} -// CHECK: ![[FOLLOWUP_VECTOR_13]] = !{!"llvm.loop.vectorize.followup_all", ![[ISVECTORIZED:.*]], ![[UNROLL_32:.*]]} -// CHECK: ![[UNROLL_32]] = !{!"llvm.loop.unroll.count", i32 32} +// CHECK-DAG: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[MP]], ![[WIDTH_5]], ![[FIXED_VEC]], ![[VECTORIZE_ENABLE]]} -// CHECK: ![[LOOP_14]] = distinct !{![[LOOP_14]], [[MP]], ![[WIDTH_10:.*]], ![[FIXED_VEC]], ![[VECTORIZE_ENABLE]]} -// CHECK: ![[WIDTH_10]] = !{!"llvm.loop.vectorize.width", i32 10} +// CHECK-DAG: ![[LOOP_8]] = distinct !{![[LOOP_8]], ![[MP]], ![[WIDTH_5]], ![[FIXED_VEC]], ![[VECTORIZE_ENABLE]]} -// CHECK: ![[LOOP_15]] = distinct !{![[LOOP_15]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]], ![[WIDTH_16:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_4:.*]], ![[VECTORIZE_ENABLE:.*]]} -// CHECK: ![[WIDTH_16]] = !{!"llvm.loop.vectorize.width", i32 16} +// CHECK-DAG: ![[LOOP_9]] = distinct !{![[LOOP_9]], ![[MP]], 
![[WIDTH_8]], ![[FIXED_VEC]], ![[INTERLEAVE_8]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_3]]} -// CHECK: ![[LOOP_16]] = distinct !{![[LOOP_16]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]], ![[WIDTH_16]], ![[SCALABLE_VEC:.*]], ![[INTERLEAVE_4:.*]], ![[VECTORIZE_ENABLE:.*]]} -// CHECK: ![[SCALABLE_VEC]] = !{!"llvm.loop.vectorize.scalable.enable", i1 true} +// CHECK-DAG: ![[LOOP_10]] = distinct !{![[LOOP_10]], ![[MP]], ![[WIDTH_2]], ![[FIXED_VEC]], ![[INTERLEAVE_2]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_3]]} -// CHECK: ![[LOOP_17]] = distinct !{![[LOOP_17]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]], ![[FIXED_VEC]], ![[INTERLEAVE_4:.*]], ![[VECTORIZE_ENABLE:.*]]} -// CHECK: ![[LOOP_18]] = distinct !{![[LOOP_18]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4:.*]], ![[VECTORIZE_ENABLE:.*]]} -// CHECK: ![[LOOP_19]] = distinct !{![[LOOP_19]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]], ![[WIDTH_1]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4:.*]], ![[VECTORIZE_ENABLE:.*]]} +// CHECK-DAG: ![[LOOP_11]] = distinct !{![[LOOP_11]], ![[MP]], ![[WIDTH_2]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_3]]} + +// CHECK-DAG: ![[LOOP_12]] = distinct !{![[LOOP_12]], ![[MP]], ![[WIDTH_6]], ![[FIXED_VEC]], ![[INTERLEAVE_10]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_12:[0-9]+]]} +// CHECK-DAG: ![[FOLLOWUP_VECTOR_12]] = !{!"llvm.loop.vectorize.followup_all", ![[MP]], ![[ISVECTORIZED]], ![[UNROLL_24]]} + +// CHECK-DAG: ![[LOOP_13]] = distinct !{![[LOOP_13]], ![[MP]], ![[WIDTH_8]], ![[FIXED_VEC]], ![[INTERLEAVE_16]], ![[VECTORIZE_ENABLE]], ![[FOLLOWUP_VECTOR_13:[0-9]+]]} +// CHECK-DAG: ![[FOLLOWUP_VECTOR_13]] = !{!"llvm.loop.vectorize.followup_all", ![[MP]], ![[ISVECTORIZED]], ![[UNROLL_32]]} + +// CHECK-DAG: ![[LOOP_14]] = distinct !{![[LOOP_14]], ![[MP]], ![[WIDTH_10]], ![[FIXED_VEC]], ![[VECTORIZE_ENABLE]]} + +// CHECK-DAG: ![[LOOP_15]] = distinct !{![[LOOP_15]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_16]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} + +// CHECK-DAG: ![[LOOP_16]] = distinct !{![[LOOP_16]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_16]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} + +// CHECK-DAG: ![[LOOP_17]] = distinct !{![[LOOP_17]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} +// CHECK-DAG: ![[LOOP_18]] = distinct !{![[LOOP_18]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} +// CHECK-DAG: ![[LOOP_19]] = distinct !{![[LOOP_19]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} From 528e408b94cd093e582de8352acebf85a41f5d56 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Wed, 2 Apr 2025 15:41:40 +0900 Subject: [PATCH 0346/1029] [LoopInterchange] Add an option to control the cost heuristics applied (#133664) LoopInterchange has several heuristic functions to determine if exchanging two loops is profitable or not. Whether or not to use each heuristic and the order in which to use them were fixed, but #125830 allows them to be changed internally at will. This patch adds a new option to control them via the compiler option. The previous patch also added an option to prioritize the vectorization heuristic. 
This patch also removes it to avoid conflicts between it and the newly introduced one, e.g., both `-loop-interchange-prioritize-vectorization=1` and `-loop-interchange-profitabilities='cache,vectorization'` are specified. --- .../lib/Transforms/Scalar/LoopInterchange.cpp | 56 +++++++++++-------- .../profitability-vectorization.ll | 2 +- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 4366418b2379d..e777f950a7c5a 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar/LoopInterchange.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -72,6 +73,13 @@ using LoopVector = SmallVector; // TODO: Check if we can use a sparse matrix here. using CharMatrix = std::vector>; +/// Types of rules used in profitability check. +enum class RuleTy { + PerLoopCacheAnalysis, + PerInstrOrderCost, + ForVectorization, +}; + } // end anonymous namespace // Minimum loop depth supported. @@ -84,12 +92,31 @@ static cl::opt MaxLoopNestDepth( "loop-interchange-max-loop-nest-depth", cl::init(10), cl::Hidden, cl::desc("Maximum depth of loop nest considered for the transform")); -static cl::opt PrioritizeVectorization( - "loop-interchange-prioritize-vectorization", cl::init(false), cl::Hidden, - cl::desc("Prioritize increasing vectorization opportunity over cache cost " - "when determining profitability")); +// We prefer cache cost to vectorization by default. +static cl::list Profitabilities( + "loop-interchange-profitabilities", cl::ZeroOrMore, + cl::MiscFlags::CommaSeparated, cl::Hidden, + cl::desc("List of profitability heuristics to be used. They are applied in " + "the given order"), + cl::list_init({RuleTy::PerLoopCacheAnalysis, + RuleTy::PerInstrOrderCost, + RuleTy::ForVectorization}), + cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache", + "Prioritize loop cache cost"), + clEnumValN(RuleTy::PerInstrOrderCost, "instorder", + "Prioritize the IVs order of each instruction"), + clEnumValN(RuleTy::ForVectorization, "vectorize", + "Prioritize vectorization"))); #ifndef NDEBUG +static bool noDuplicateRules(ArrayRef Rules) { + SmallSet Set; + for (RuleTy Rule : Rules) + if (!Set.insert(Rule).second) + return false; + return true; +} + static void printDepMatrix(CharMatrix &DepMatrix) { for (auto &Row : DepMatrix) { for (auto D : Row) @@ -1204,26 +1231,9 @@ bool LoopInterchangeProfitability::isProfitable( // second highest priority rule (isProfitablePerInstrOrderCost by default). // Likewise, if it failed to analysis the profitability then only, the last // rule (isProfitableForVectorization by default) will decide. - enum class RuleTy { - PerLoopCacheAnalysis, - PerInstrOrderCost, - ForVectorization, - }; - - // We prefer cache cost to vectorization by default. - RuleTy RuleOrder[3] = {RuleTy::PerLoopCacheAnalysis, - RuleTy::PerInstrOrderCost, RuleTy::ForVectorization}; - - // If we prefer vectorization to cache cost, change the order of application - // of each rule. 
- if (PrioritizeVectorization) { - RuleOrder[0] = RuleTy::ForVectorization; - RuleOrder[1] = RuleTy::PerLoopCacheAnalysis; - RuleOrder[2] = RuleTy::PerInstrOrderCost; - } - + assert(noDuplicateRules(Profitabilities) && "Detect duplicate rules"); std::optional shouldInterchange; - for (RuleTy RT : RuleOrder) { + for (RuleTy RT : Profitabilities) { switch (RT) { case RuleTy::PerLoopCacheAnalysis: shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC); diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll index 0018aa0308f28..85be48cb9a710 100644 --- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll +++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization.ll @@ -3,7 +3,7 @@ ; RUN: FileCheck -input-file %t --check-prefix=PROFIT-CACHE %s ; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 \ -; RUN: -pass-remarks-output=%t -disable-output -loop-interchange-prioritize-vectorization=1 +; RUN: -pass-remarks-output=%t -disable-output -loop-interchange-profitabilities=vectorize,cache,instorder ; RUN: FileCheck -input-file %t --check-prefix=PROFIT-VEC %s @A = dso_local global [256 x [256 x float]] zeroinitializer From 09e19cfacfe5478a69f19014156deb384e5163c7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 2 Apr 2025 13:44:45 +0700 Subject: [PATCH 0347/1029] llvm-reduce: Do not reduce alloca array sizes to 0 (#132864) Fixes #64340 --- .../llvm-reduce/reduce-operands-alloca.ll | 69 +++++++++++++++++++ .../llvm-reduce/deltas/ReduceOperands.cpp | 5 ++ 2 files changed, 74 insertions(+) create mode 100644 llvm/test/tools/llvm-reduce/reduce-operands-alloca.ll diff --git a/llvm/test/tools/llvm-reduce/reduce-operands-alloca.ll b/llvm/test/tools/llvm-reduce/reduce-operands-alloca.ll new file mode 100644 index 0000000000000..61c46185b3378 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/reduce-operands-alloca.ll @@ -0,0 +1,69 @@ +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operands-zero --test FileCheck --test-arg --check-prefix=CHECK --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck %s --check-prefixes=CHECK,ZERO < %t + +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operands-one --test FileCheck --test-arg --check-prefix=CHECK --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck %s --check-prefixes=CHECK,ONE < %t + +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=operands-poison --test FileCheck --test-arg --check-prefix=CHECK --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck %s --check-prefixes=CHECK,POISON < %t + + +; CHECK-LABEL: @dyn_alloca( +; ZERO: %alloca = alloca i32, i32 %size, align 4 +; ONE: %alloca = alloca i32, align 4 +; POISON: %alloca = alloca i32, i32 %size, align 4 +define void @dyn_alloca(i32 %size) { + %alloca = alloca i32, i32 %size + store i32 0, ptr %alloca + ret void +} + +; CHECK-LABEL: @alloca_0_elt( +; ZERO: %alloca = alloca i32, i32 0, align 4 +; ONE: %alloca = alloca i32, i32 0, align 4 +; POISON: %alloca = alloca i32, i32 0, align 4 +define void @alloca_0_elt() { + %alloca = alloca i32, i32 0 + store i32 0, ptr %alloca + ret void +} + +; CHECK-LABEL: @alloca_1_elt( +; ZERO: %alloca = alloca i32, align 4 +; ONE: %alloca = alloca i32, align 4 +; POISON: %alloca = alloca i32, align 4 +define void @alloca_1_elt() { + %alloca = alloca i32, i32 1 + store i32 0, ptr %alloca + ret void +} + +; CHECK-LABEL: @alloca_1024_elt( +; ZERO: %alloca = 
alloca i32, i32 1024, align 4
+; ONE: %alloca = alloca i32, align 4
+; POISON: %alloca = alloca i32, i32 1024, align 4
+define void @alloca_1024_elt() {
+  %alloca = alloca i32, i32 1024
+  store i32 0, ptr %alloca
+  ret void
+}
+
+; CHECK-LABEL: @alloca_poison_elt(
+; ZERO: %alloca = alloca i32, i32 poison, align 4
+; ONE: %alloca = alloca i32, align 4
+; POISON: %alloca = alloca i32, i32 poison, align 4
+define void @alloca_poison_elt() {
+  %alloca = alloca i32, i32 poison
+  store i32 0, ptr %alloca
+  ret void
+}
+
+; CHECK-LABEL: @alloca_constexpr_elt(
+; ZERO: %alloca = alloca i32, i32 ptrtoint (ptr @alloca_constexpr_elt to i32)
+; ONE: %alloca = alloca i32, align 4
+; POISON: %alloca = alloca i32, i32 ptrtoint (ptr @alloca_constexpr_elt to i32)
+define void @alloca_constexpr_elt() {
+  %alloca = alloca i32, i32 ptrtoint (ptr @alloca_constexpr_elt to i32)
+  store i32 0, ptr %alloca
+  ret void
+}
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp
index a4fdd9ce8033b..b0bca015434fa 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp
@@ -125,6 +125,11 @@ void llvm::reduceOperandsZeroDeltaPass(Oracle &O, ReducerWorkItem &WorkItem) {
   auto ReduceValue = [](Use &Op) -> Value * {
     if (!shouldReduceOperand(Op))
       return nullptr;
+
+    // Avoid introducing 0-sized allocations.
+    if (isa<AllocaInst>(Op.getUser()))
+      return nullptr;
+
     // Don't duplicate an existing switch case.
     if (auto *IntTy = dyn_cast<IntegerType>(Op->getType()))
       if (switchCaseExists(Op, ConstantInt::get(IntTy, 0)))

From 536fe74aaac437e147fc64dada6af8aab79a8b54 Mon Sep 17 00:00:00 2001
From: Sudharsan Veeravalli
Date: Wed, 2 Apr 2025 12:14:50 +0530
Subject: [PATCH 0348/1029] [RISCV] Modify register type of extd* Xqcibm
 instructions (#134027)

The v0.8 spec specifies that rs1 cannot be x31 (t6) since these
instructions operate on a pair of registers (rs1 and rs1 + 1) with no
wrap around.
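For instance (operands chosen purely for illustration), `qc.extdu x1, x30, 8, 8`
reads the register pair {x30, x31}, while rs1 = x31 would require the
non-existent x32, hence the tightened operand class.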
The latest spec can be found here: https://github.com/quic/riscv-unified-db/releases/tag/Xqci-0.8.0 --- .../RISCV/Disassembler/RISCVDisassembler.cpp | 10 +++++ llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 16 ++++---- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 5 +++ .../RISCV/rvv/vsetvli-insert-zve64f.mir | 4 +- llvm/test/MC/RISCV/xqcibm-invalid.s | 40 +++++++++++++++---- llvm/test/MC/RISCV/xqcibm-valid.s | 6 +-- 6 files changed, 61 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 5f268006c6fdd..099490173bf08 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -197,6 +197,16 @@ DecodeGPRNoX0X2RegisterClass(MCInst &Inst, uint64_t RegNo, uint32_t Address, return DecodeGPRNoX0RegisterClass(Inst, RegNo, Address, Decoder); } +static DecodeStatus DecodeGPRNoX31RegisterClass(MCInst &Inst, uint32_t RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo == 31) { + return MCDisassembler::Fail; + } + + return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder); +} + static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint32_t RegNo, uint64_t Address, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 23feb52a0c2ca..2479bbd1258a4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -584,15 +584,15 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { def QC_INSBPR : QCIRVInstRR<0b00010, GPR, "qc.insbpr">; def QC_INSBPRH : QCIRVInstRR<0b00011, GPR, "qc.insbprh">; def QC_EXTU : QCIBitManipRII<0b010, 0b00, GPRNoX0, "qc.extu">; - def QC_EXTDU : QCIBitManipRII<0b010, 0b10, GPR, "qc.extdu">; - def QC_EXTDUR : QCIRVInstRR<0b00100, GPR, "qc.extdur">; - def QC_EXTDUPR : QCIRVInstRR<0b00110, GPR, "qc.extdupr">; - def QC_EXTDUPRH : QCIRVInstRR<0b00111, GPR, "qc.extduprh">; + def QC_EXTDU : QCIBitManipRII<0b010, 0b10, GPRNoX31, "qc.extdu">; + def QC_EXTDUR : QCIRVInstRR<0b00100, GPRNoX31, "qc.extdur">; + def QC_EXTDUPR : QCIRVInstRR<0b00110, GPRNoX31, "qc.extdupr">; + def QC_EXTDUPRH : QCIRVInstRR<0b00111, GPRNoX31, "qc.extduprh">; def QC_EXT : QCIBitManipRII<0b010, 0b01, GPRNoX0, "qc.ext">; - def QC_EXTD : QCIBitManipRII<0b010, 0b11, GPR, "qc.extd">; - def QC_EXTDR : QCIRVInstRR<0b00101, GPR, "qc.extdr">; - def QC_EXTDPR : QCIRVInstRR<0b01000, GPR, "qc.extdpr">; - def QC_EXTDPRH : QCIRVInstRR<0b01001, GPR, "qc.extdprh">; + def QC_EXTD : QCIBitManipRII<0b010, 0b11, GPRNoX31, "qc.extd">; + def QC_EXTDR : QCIRVInstRR<0b00101, GPRNoX31, "qc.extdr">; + def QC_EXTDPR : QCIRVInstRR<0b01000, GPRNoX31, "qc.extdpr">; + def QC_EXTDPRH : QCIRVInstRR<0b01001, GPRNoX31, "qc.extdprh">; def QC_COMPRESS2 : QCIRVInstI<0b0000, "qc.compress2">; def QC_COMPRESS3 : QCIRVInstI<0b0001, "qc.compress3">; def QC_EXPAND2 : QCIRVInstI<0b0010, "qc.expand2">; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 2332260ff1ca6..8d09caf1da2d5 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -302,6 +302,11 @@ def GPRX1X5 : GPRRegisterClass<(add X1, X5)> { let DiagnosticString = "register must be ra or t0 (x1 or x5)"; } +def GPRNoX31 : GPRRegisterClass<(sub GPR, X31)> { + let DiagnosticType = "InvalidRegClassGPRX31"; + let DiagnosticString = "register must be a 
GPR excluding t6 (x31)"; +} + //===----------------------------------------------------------------------===// // Even-Odd GPR Pairs //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-zve64f.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-zve64f.mir index f65bba1b7b9c7..fc3bb13df77d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-zve64f.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-zve64f.mir @@ -24,7 +24,7 @@ body: | ; CHECK-NEXT: renamable $v8 = PseudoVLE64_V_M1 undef renamable $v8, [[COPY1]], 1, 6 /* e64 */, 2 /* tu, ma */, implicit $vl, implicit $vtype :: (load unknown-size, align 8) ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 8, 208 /* e32, m1, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: renamable $v9 = PseudoVLE32_V_M1 undef renamable $v9, [[COPY]], 8, 5 /* e32 */, 2 /* tu, ma */, implicit $vl, implicit $vtype :: (load unknown-size, align 4) - ; CHECK-NEXT: INLINEASM &"# use $0 $1 $2 $3", 1 /* sideeffect attdialect */, 3145737 /* reguse:VR */, killed renamable $v10, 3145737 /* reguse:VR */, killed renamable $v11, 3145737 /* reguse:VR */, killed renamable $v8, 3145737 /* reguse:VR */, killed renamable $v9 + ; CHECK-NEXT: INLINEASM &"# use $0 $1 $2 $3", 1 /* sideeffect attdialect */, 3997705 /* reguse:VR */, killed renamable $v10, 3997705 /* reguse:VR */, killed renamable $v11, 3997705 /* reguse:VR */, killed renamable $v8, 3997705 /* reguse:VR */, killed renamable $v9 ; CHECK-NEXT: PseudoRET %3:gpr = COPY $x12 %2:gpr = COPY $x11 @@ -34,7 +34,7 @@ body: | renamable $v11 = PseudoVMV_S_X undef renamable $v11, %1, 8, 5 /* e32 */ renamable $v8 = PseudoVLE64_V_M1 undef renamable $v8, %2, 1, 6 /* e64 */, 2 /* tu, ma */ :: (load unknown-size, align 8) renamable $v9 = PseudoVLE32_V_M1 undef renamable $v9, %3, 8, 5 /* e32 */, 2 /* tu, ma */ :: (load unknown-size, align 4) - INLINEASM &"# use $0 $1 $2 $3", 1 /* sideeffect attdialect */, 3145737 /* reguse:VR */, killed renamable $v10, 3145737 /* reguse:VR */, killed renamable $v11, 3145737 /* reguse:VR */, killed renamable $v8, 3145737 /* reguse:VR */, killed renamable $v9 + INLINEASM &"# use $0 $1 $2 $3", 1 /* sideeffect attdialect */, 3997705 /* reguse:VR */, killed renamable $v10, 3997705 /* reguse:VR */, killed renamable $v11, 3997705 /* reguse:VR */, killed renamable $v8, 3997705 /* reguse:VR */, killed renamable $v9 PseudoRET ... 
diff --git a/llvm/test/MC/RISCV/xqcibm-invalid.s b/llvm/test/MC/RISCV/xqcibm-invalid.s index 7bb305fa9fa30..6ed3ec4c7f65c 100644 --- a/llvm/test/MC/RISCV/xqcibm-invalid.s +++ b/llvm/test/MC/RISCV/xqcibm-invalid.s @@ -269,7 +269,8 @@ qc.ext x27, x6, 31, 41 qc.ext x27, x6, 31, 1 -# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +# CHECK-PLUS: :[[@LINE+2]]:14: error: register must be a GPR excluding t6 (x31) +# CHECK-MINUS: :[[@LINE+1]]:14: error: invalid operand for instruction qc.extdu x1, 8, 8, 8 # CHECK: :[[@LINE+1]]:1: error: too few operands for instruction @@ -289,7 +290,8 @@ qc.extdu x1, x8, 8, 78 qc.extdu x1, x8, 8, 8 -# CHECK: :[[@LINE+1]]:14: error: invalid operand for instruction +# CHECK-PLUS: :[[@LINE+2]]:14: error: register must be a GPR excluding t6 (x31) +# CHECK-MINUS: :[[@LINE+1]]:14: error: invalid operand for instruction qc.extd x13, 21, 10, 15 # CHECK: :[[@LINE+1]]:1: error: too few operands for instruction @@ -396,6 +398,10 @@ qc.extdur x9, x19 # CHECK-MINUS: :[[@LINE+1]]:11: error: invalid operand for instruction qc.extdur x0, x19, x29 +# CHECK-PLUS: :[[@LINE+2]]:15: error: register must be a GPR excluding t6 (x31) +# CHECK-MINUS: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.extdur x9, x31, x29 + # CHECK-PLUS: :[[@LINE+2]]:20: error: register must be a GPR excluding zero (x0) # CHECK-MINUS: :[[@LINE+1]]:20: error: invalid operand for instruction qc.extdur x9, x19, x0 @@ -406,21 +412,25 @@ qc.extdur x9, x19, x29 # CHECK-PLUS: :[[@LINE+2]]:20: error: register must be a GPR excluding zero (x0) # CHECK-MINUS: :[[@LINE+1]]:20: error: invalid operand for instruction -qc.extdr x12, x31, 30 +qc.extdr x12, x29, 30 # CHECK: :[[@LINE+1]]:1: error: too few operands for instruction -qc.extdr x12, x31 +qc.extdr x12, x29 # CHECK-PLUS: :[[@LINE+2]]:10: error: register must be a GPR excluding zero (x0) # CHECK-MINUS: :[[@LINE+1]]:10: error: invalid operand for instruction -qc.extdr x0, x31, x30 +qc.extdr x0, x29, x30 + +# CHECK-PLUS: :[[@LINE+2]]:15: error: register must be a GPR excluding t6 (x31) +# CHECK-MINUS: :[[@LINE+1]]:15: error: invalid operand for instruction +qc.extdr x12, x31, x30 # CHECK-PLUS: :[[@LINE+2]]:20: error: register must be a GPR excluding zero (x0) # CHECK-MINUS: :[[@LINE+1]]:20: error: invalid operand for instruction -qc.extdr x12, x31, x0 +qc.extdr x12, x29, x0 # CHECK-MINUS: :[[@LINE+1]]:1: error: instruction requires the following: 'Xqcibm' (Qualcomm uC Bit Manipulation Extension) -qc.extdr x12, x31, x30 +qc.extdr x12, x29, x30 # CHECK-PLUS: :[[@LINE+2]]:22: error: register must be a GPR excluding zero (x0) @@ -434,6 +444,10 @@ qc.extdupr x13, x23 # CHECK-MINUS: :[[@LINE+1]]:12: error: invalid operand for instruction qc.extdupr x0, x23, x3 +# CHECK-PLUS: :[[@LINE+2]]:17: error: register must be a GPR excluding t6 (x31) +# CHECK-MINUS: :[[@LINE+1]]:17: error: invalid operand for instruction +qc.extdupr x13, x31, x3 + # CHECK-PLUS: :[[@LINE+2]]:22: error: register must be a GPR excluding zero (x0) # CHECK-MINUS: :[[@LINE+1]]:22: error: invalid operand for instruction qc.extdupr x13, x23, x0 @@ -453,6 +467,10 @@ qc.extduprh x18, x8 # CHECK-MINUS: :[[@LINE+1]]:13: error: invalid operand for instruction qc.extduprh x0, x8, x9 +# CHECK-PLUS: :[[@LINE+2]]:18: error: register must be a GPR excluding t6 (x31) +# CHECK-MINUS: :[[@LINE+1]]:18: error: invalid operand for instruction +qc.extduprh x18, x31, x9 + # CHECK-PLUS: :[[@LINE+2]]:22: error: register must be a GPR excluding zero (x0) # CHECK-MINUS: :[[@LINE+1]]:22: error: invalid 
operand for instruction
 qc.extduprh x18, x8, x0

@@ -472,6 +490,10 @@
 qc.extdpr x1, x4

 # CHECK-MINUS: :[[@LINE+1]]:11: error: invalid operand for instruction
 qc.extdpr x0, x4, x15

+# CHECK-PLUS: :[[@LINE+2]]:15: error: register must be a GPR excluding t6 (x31)
+# CHECK-MINUS: :[[@LINE+1]]:15: error: invalid operand for instruction
+qc.extdpr x1, x31, x15
+
 # CHECK-PLUS: :[[@LINE+2]]:19: error: register must be a GPR excluding zero (x0)
 # CHECK-MINUS: :[[@LINE+1]]:19: error: invalid operand for instruction
 qc.extdpr x1, x4, x0
@@ -491,6 +513,10 @@
 qc.extdprh x6, x24

 # CHECK-MINUS: :[[@LINE+1]]:12: error: invalid operand for instruction
 qc.extdprh x0, x24, x25

+# CHECK-PLUS: :[[@LINE+2]]:16: error: register must be a GPR excluding t6 (x31)
+# CHECK-MINUS: :[[@LINE+1]]:16: error: invalid operand for instruction
+qc.extdprh x6, x31, x25
+
 # CHECK-PLUS: :[[@LINE+2]]:21: error: register must be a GPR excluding zero (x0)
 # CHECK-MINUS: :[[@LINE+1]]:21: error: invalid operand for instruction
 qc.extdprh x6, x24, x0
diff --git a/llvm/test/MC/RISCV/xqcibm-valid.s b/llvm/test/MC/RISCV/xqcibm-valid.s
index d5603c6d52c90..70248ad00cb76 100644
--- a/llvm/test/MC/RISCV/xqcibm-valid.s
+++ b/llvm/test/MC/RISCV/xqcibm-valid.s
@@ -90,9 +90,9 @@ qc.insbprh x2, x3, x11
 # CHECK-ENC: encoding: [0x8b,0xb4,0xd9,0x09]
 qc.extdur x9, x19, x29

-# CHECK-INST: qc.extdr a2, t6, t5
-# CHECK-ENC: encoding: [0x0b,0xb6,0xef,0x0b]
-qc.extdr x12, x31, x30
+# CHECK-INST: qc.extdr a2, t4, t5
+# CHECK-ENC: encoding: [0x0b,0xb6,0xee,0x0b]
+qc.extdr x12, x29, x30

 # CHECK-INST: qc.extdupr a3, s7, gp
 # CHECK-ENC: encoding: [0x8b,0xb6,0x3b,0x0c]

From 730e8a4a59a5398f61c526eb00eb409e9306d19c Mon Sep 17 00:00:00 2001
From: nawrinsu
Date: Wed, 2 Apr 2025 00:17:50 -0700
Subject: [PATCH 0349/1029] [OpenMP] Add memory allocation using hwloc (#132843)

This patch adds support for memory allocation using hwloc. To enable
memory allocation using hwloc, the environment variable
KMP_TOPOLOGY_METHOD=hwloc needs to be set. If hwloc is not supported or
not available, allocation will fall back to the default path.
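A minimal sketch of exercising the new path by hand (the compiler invocation is
illustrative; the test added below does the equivalent via the libomp test
harness):

```
$ clang -fopenmp alloc.c -o alloc
$ KMP_TOPOLOGY_METHOD=hwloc ./alloc
```

where alloc.c allocates and frees via omp_alloc()/omp_free() with
omp_get_default_allocator().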
--- openmp/runtime/src/kmp.h | 4 + openmp/runtime/src/kmp_affinity.cpp | 1 + openmp/runtime/src/kmp_alloc.cpp | 416 ++++++++++++++++------ openmp/runtime/src/kmp_global.cpp | 1 + openmp/runtime/src/kmp_settings.cpp | 4 +- openmp/runtime/test/api/omp_alloc_hwloc.c | 25 ++ 6 files changed, 343 insertions(+), 108 deletions(-) create mode 100644 openmp/runtime/test/api/omp_alloc_hwloc.c diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 3d34513491154..28a5522f3a582 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1107,6 +1107,7 @@ extern omp_allocator_handle_t __kmp_def_allocator; #endif extern int __kmp_memkind_available; +extern bool __kmp_hwloc_available; typedef omp_memspace_handle_t kmp_memspace_t; // placeholder @@ -1119,6 +1120,9 @@ typedef struct kmp_allocator_t { kmp_uint64 pool_size; kmp_uint64 pool_used; bool pinned; +#if KMP_USE_HWLOC + omp_alloctrait_value_t membind; +#endif } kmp_allocator_t; extern omp_allocator_handle_t __kmpc_init_allocator(int gtid, diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index c3d5ecf1345e8..f2520db145552 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -1444,6 +1444,7 @@ void KMPAffinity::pick_api() { if (__kmp_affinity_top_method == affinity_top_method_hwloc && __kmp_affinity.type != affinity_disabled) { affinity_dispatch = new KMPHwlocAffinity(); + __kmp_hwloc_available = true; } else #endif { diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp index fb1b0eb5f0fe5..783d9ffe88aa3 100644 --- a/openmp/runtime/src/kmp_alloc.cpp +++ b/openmp/runtime/src/kmp_alloc.cpp @@ -14,6 +14,20 @@ #include "kmp_io.h" #include "kmp_wrapper_malloc.h" +#if KMP_USE_HWLOC +#if HWLOC_API_VERSION > 0x00020300 +#define KMP_HWLOC_LOCATION_TYPE_CPUSET HWLOC_LOCATION_TYPE_CPUSET +#elif HWLOC_API_VERSION == 0x00020300 +#define KMP_HWLOC_LOCATION_TYPE_CPUSET \ + hwloc_location::HWLOC_LOCATION_TYPE_CPUSET +#else +enum hwloc_memattr_id_e { + HWLOC_MEMATTR_ID_BANDWIDTH, + HWLOC_MEMATTR_ID_CAPACITY +}; +#endif +#endif // KMP_USE_HWLOC + // Disable bget when it is not used #if KMP_USE_BGET @@ -1356,6 +1370,74 @@ void __kmp_fini_memkind() { #endif } +#if KMP_USE_HWLOC +static bool __kmp_is_hwloc_membind_supported(hwloc_membind_policy_t policy) { +#if HWLOC_API_VERSION >= 0x00020300 + const hwloc_topology_support *support; + support = hwloc_topology_get_support(__kmp_hwloc_topology); + if (support) { + if (policy == HWLOC_MEMBIND_BIND) + return (support->membind->alloc_membind && + support->membind->bind_membind); + if (policy == HWLOC_MEMBIND_INTERLEAVE) + return (support->membind->alloc_membind && + support->membind->interleave_membind); + } + return false; +#else + return false; +#endif +} + +void *__kmp_hwloc_alloc_membind(hwloc_memattr_id_e attr, size_t size, + hwloc_membind_policy_t policy) { +#if HWLOC_API_VERSION >= 0x00020300 + void *ptr = NULL; + hwloc_obj_t node; + struct hwloc_location initiator; + int ret; + // TODO: We should make this more efficient by getting rid of the OS syscall + // 'hwloc_bitmap_alloc' and 'hwloc_get_cpubind' to get affinity and instead + // use th_affin_mask field when it's capable of getting the underlying + // mask implementation. 
+ hwloc_cpuset_t mask = hwloc_bitmap_alloc(); + ret = hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD); + if (ret < 0) { + hwloc_bitmap_free(mask); + return ptr; + } + initiator.type = KMP_HWLOC_LOCATION_TYPE_CPUSET; + initiator.location.cpuset = mask; + ret = hwloc_memattr_get_best_target(__kmp_hwloc_topology, attr, &initiator, 0, + &node, NULL); + if (ret < 0) { + return ptr; + } + return hwloc_alloc_membind(__kmp_hwloc_topology, size, node->nodeset, policy, + HWLOC_MEMBIND_BYNODESET); +#else + return NULL; +#endif +} + +void *__kmp_hwloc_membind_policy(omp_memspace_handle_t ms, size_t size, + hwloc_membind_policy_t policy) { +#if HWLOC_API_VERSION >= 0x00020300 + void *ptr = NULL; + if (ms == omp_high_bw_mem_space) { + ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH, size, policy); + } else if (ms == omp_large_cap_mem_space) { + ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY, size, policy); + } else { + ptr = hwloc_alloc(__kmp_hwloc_topology, size); + } + return ptr; +#else + return NULL; +#endif +} +#endif // KMP_USE_HWLOC + void __kmp_init_target_mem() { *(void **)(&kmp_target_alloc_host) = KMP_DLSYM("llvm_omp_target_alloc_host"); *(void **)(&kmp_target_alloc_shared) = @@ -1412,6 +1494,13 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, al->fb_data = RCAST(kmp_allocator_t *, traits[i].value); break; case omp_atk_partition: +#if KMP_USE_HWLOC + al->membind = (omp_alloctrait_value_t)traits[i].value; + KMP_DEBUG_ASSERT(al->membind == omp_atv_environment || + al->membind == omp_atv_nearest || + al->membind == omp_atv_blocked || + al->membind == omp_atv_interleaved); +#endif al->memkind = RCAST(void **, traits[i].value); break; default: @@ -1466,7 +1555,8 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, __kmp_free(al); return omp_null_allocator; } else { - if (ms == omp_high_bw_mem_space) { + if (!__kmp_hwloc_available && + (ms == omp_high_bw_mem_space || ms == omp_large_cap_mem_space)) { // cannot detect HBW memory presence without memkind library __kmp_free(al); return omp_null_allocator; @@ -1573,8 +1663,9 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size, if (allocator > kmp_max_mem_alloc) is_pinned = al->pinned; - // Use default allocator if libmemkind is not available - int use_default_allocator = (__kmp_memkind_available) ? false : true; + // Use default allocator if hwloc and libmemkind are not available + int use_default_allocator = + (!__kmp_hwloc_available && !__kmp_memkind_available); if (KMP_IS_TARGET_MEM_ALLOC(allocator)) { // Use size input directly as the memory may not be accessible on host. 
@@ -1610,38 +1701,152 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size, } } - if (__kmp_memkind_available) { - if (allocator < kmp_max_mem_alloc) { - // pre-defined allocator - if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) { - ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a); - } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) { - ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a); +#if KMP_USE_HWLOC + if (__kmp_hwloc_available) { + if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_BIND)) { + if (allocator < kmp_max_mem_alloc) { + // pre-defined allocator + if (allocator == omp_high_bw_mem_alloc) { + ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_BANDWIDTH, + desc.size_a, HWLOC_MEMBIND_BIND); + if (ptr == NULL) + use_default_allocator = true; + } else if (allocator == omp_large_cap_mem_alloc) { + ptr = __kmp_hwloc_alloc_membind(HWLOC_MEMATTR_ID_CAPACITY, + desc.size_a, HWLOC_MEMBIND_BIND); + if (ptr == NULL) + use_default_allocator = true; + } else { + use_default_allocator = true; + } + if (use_default_allocator) { + ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a); + } + } else if (al->pool_size > 0) { + // custom allocator with pool size requested + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a); + if (used + desc.size_a > al->pool_size) { + // not enough space, need to go fallback path + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + } // else ptr == NULL; + } else { + // pool has enough space + if (al->membind == omp_atv_interleaved) { + if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_INTERLEAVE)) { + ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a, + HWLOC_MEMBIND_INTERLEAVE); + } + } else if (al->membind == omp_atv_environment) { + ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a, + HWLOC_MEMBIND_DEFAULT); + } else { + ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a); + } + if (ptr == NULL) { + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + } + } + } } else { - ptr = kmp_mk_alloc(*mk_default, desc.size_a); + // custom allocator, pool size not requested + if (al->membind == omp_atv_interleaved) { + if (__kmp_is_hwloc_membind_supported(HWLOC_MEMBIND_INTERLEAVE)) { + ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a, + HWLOC_MEMBIND_INTERLEAVE); + } + } else if (al->membind == omp_atv_environment) { + ptr = __kmp_hwloc_membind_policy(al->memspace, desc.size_a, + HWLOC_MEMBIND_DEFAULT); + } else { + ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a); + } + if (ptr == NULL) { + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested 
+ } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + return __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + } + } } - } else if (al->pool_size > 0) { - // custom allocator with pool size requested - kmp_uint64 used = - KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a); - if (used + desc.size_a > al->pool_size) { - // not enough space, need to go fallback path - KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == omp_atv_default_mem_fb) { - al = (kmp_allocator_t *)omp_default_mem_alloc; + } else { // alloc membind not supported, use hwloc_alloc + ptr = hwloc_alloc(__kmp_hwloc_topology, desc.size_a); + } + } else { +#endif + if (__kmp_memkind_available) { + if (allocator < kmp_max_mem_alloc) { + // pre-defined allocator + if (allocator == omp_high_bw_mem_alloc && mk_hbw_preferred) { + ptr = kmp_mk_alloc(*mk_hbw_preferred, desc.size_a); + } else if (allocator == omp_large_cap_mem_alloc && mk_dax_kmem_all) { + ptr = kmp_mk_alloc(*mk_dax_kmem_all, desc.size_a); + } else { ptr = kmp_mk_alloc(*mk_default, desc.size_a); - } else if (al->fb == omp_atv_abort_fb) { - KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == omp_atv_allocator_fb) { - KMP_ASSERT(al != al->fb_data); - al = al->fb_data; - ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); - if (is_pinned && kmp_target_lock_mem) - kmp_target_lock_mem(ptr, size, default_device); - return ptr; - } // else ptr == NULL; + } + } else if (al->pool_size > 0) { + // custom allocator with pool size requested + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a); + if (used + desc.size_a > al->pool_size) { + // not enough space, need to go fallback path + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = kmp_mk_alloc(*mk_default, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + if (is_pinned && kmp_target_lock_mem) + kmp_target_lock_mem(ptr, size, default_device); + return ptr; + } // else ptr == NULL; + } else { + // pool has enough space + ptr = kmp_mk_alloc(*al->memkind, desc.size_a); + if (ptr == NULL) { + if (al->fb == omp_atv_default_mem_fb) { + al = (kmp_allocator_t *)omp_default_mem_alloc; + ptr = kmp_mk_alloc(*mk_default, desc.size_a); + } else if (al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } else if (al->fb == omp_atv_allocator_fb) { + KMP_ASSERT(al != al->fb_data); + al = al->fb_data; + ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); + if (is_pinned && kmp_target_lock_mem) + kmp_target_lock_mem(ptr, size, default_device); + return ptr; + } + } + } } else { - // pool has enough space + // custom allocator, pool size not requested ptr = kmp_mk_alloc(*al->memkind, desc.size_a); if (ptr == NULL) { if (al->fb == omp_atv_default_mem_fb) { @@ -1659,13 +1864,39 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size, } } } - } else { - // custom allocator, pool size not requested - ptr = kmp_mk_alloc(*al->memkind, desc.size_a); - if (ptr == NULL) { + } else if (allocator < kmp_max_mem_alloc) { + // pre-defined allocator + if (allocator == omp_high_bw_mem_alloc) { + KMP_WARNING(OmpNoAllocator, 
"omp_high_bw_mem_alloc"); + } else if (allocator == omp_large_cap_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc"); + } else if (allocator == omp_const_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc"); + } else if (allocator == omp_low_lat_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc"); + } else if (allocator == omp_cgroup_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc"); + } else if (allocator == omp_pteam_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc"); + } else if (allocator == omp_thread_mem_alloc) { + KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc"); + } else { // default allocator requested + use_default_allocator = true; + } + if (use_default_allocator) { + ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); + use_default_allocator = false; + } + } else if (al->pool_size > 0) { + // custom allocator with pool size requested + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a); + if (used + desc.size_a > al->pool_size) { + // not enough space, need to go fallback path + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); if (al->fb == omp_atv_default_mem_fb) { al = (kmp_allocator_t *)omp_default_mem_alloc; - ptr = kmp_mk_alloc(*mk_default, desc.size_a); + ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); } else if (al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } else if (al->fb == omp_atv_allocator_fb) { @@ -1675,66 +1906,25 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size, if (is_pinned && kmp_target_lock_mem) kmp_target_lock_mem(ptr, size, default_device); return ptr; - } - } - } - } else if (allocator < kmp_max_mem_alloc) { - // pre-defined allocator - if (allocator == omp_high_bw_mem_alloc) { - KMP_WARNING(OmpNoAllocator, "omp_high_bw_mem_alloc"); - } else if (allocator == omp_large_cap_mem_alloc) { - KMP_WARNING(OmpNoAllocator, "omp_large_cap_mem_alloc"); - } else if (allocator == omp_const_mem_alloc) { - KMP_WARNING(OmpNoAllocator, "omp_const_mem_alloc"); - } else if (allocator == omp_low_lat_mem_alloc) { - KMP_WARNING(OmpNoAllocator, "omp_low_lat_mem_alloc"); - } else if (allocator == omp_cgroup_mem_alloc) { - KMP_WARNING(OmpNoAllocator, "omp_cgroup_mem_alloc"); - } else if (allocator == omp_pteam_mem_alloc) { - KMP_WARNING(OmpNoAllocator, "omp_pteam_mem_alloc"); - } else if (allocator == omp_thread_mem_alloc) { - KMP_WARNING(OmpNoAllocator, "omp_thread_mem_alloc"); - } else { // default allocator requested - use_default_allocator = true; - } - if (use_default_allocator) { - ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - use_default_allocator = false; - } - } else if (al->pool_size > 0) { - // custom allocator with pool size requested - kmp_uint64 used = - KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, desc.size_a); - if (used + desc.size_a > al->pool_size) { - // not enough space, need to go fallback path - KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - if (al->fb == omp_atv_default_mem_fb) { - al = (kmp_allocator_t *)omp_default_mem_alloc; + } // else ptr == NULL + } else { + // pool has enough space ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - } else if (al->fb == omp_atv_abort_fb) { - KMP_ASSERT(0); // abort fallback requested - } else if (al->fb == omp_atv_allocator_fb) { - KMP_ASSERT(al != al->fb_data); - al = al->fb_data; - ptr = __kmp_alloc(gtid, algn, size, (omp_allocator_handle_t)al); - 
if (is_pinned && kmp_target_lock_mem) - kmp_target_lock_mem(ptr, size, default_device); - return ptr; - } // else ptr == NULL; + if (ptr == NULL && al->fb == omp_atv_abort_fb) { + KMP_ASSERT(0); // abort fallback requested + } // no sense to look for another fallback because of same internal + // alloc + } } else { - // pool has enough space + // custom allocator, pool size not requested ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); if (ptr == NULL && al->fb == omp_atv_abort_fb) { KMP_ASSERT(0); // abort fallback requested } // no sense to look for another fallback because of same internal alloc } - } else { - // custom allocator, pool size not requested - ptr = __kmp_thread_malloc(__kmp_thread_from_gtid(gtid), desc.size_a); - if (ptr == NULL && al->fb == omp_atv_abort_fb) { - KMP_ASSERT(0); // abort fallback requested - } // no sense to look for another fallback because of same internal alloc +#if KMP_USE_HWLOC } +#endif KE_TRACE(10, ("__kmp_alloc: T#%d %p=alloc(%d)\n", gtid, ptr, desc.size_a)); if (ptr == NULL) return NULL; @@ -1864,34 +2054,48 @@ void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { kmp_target_unlock_mem(desc.ptr_alloc, device); } - if (__kmp_memkind_available) { - if (oal < kmp_max_mem_alloc) { - // pre-defined allocator - if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) { - kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc); - } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) { - kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc); +#if KMP_USE_HWLOC + if (__kmp_hwloc_available) { + if (oal > kmp_max_mem_alloc && al->pool_size > 0) { + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + (void)used; // to suppress compiler warning + KMP_DEBUG_ASSERT(used >= desc.size_a); + } + hwloc_free(__kmp_hwloc_topology, desc.ptr_alloc, desc.size_a); + } else { +#endif + if (__kmp_memkind_available) { + if (oal < kmp_max_mem_alloc) { + // pre-defined allocator + if (oal == omp_high_bw_mem_alloc && mk_hbw_preferred) { + kmp_mk_free(*mk_hbw_preferred, desc.ptr_alloc); + } else if (oal == omp_large_cap_mem_alloc && mk_dax_kmem_all) { + kmp_mk_free(*mk_dax_kmem_all, desc.ptr_alloc); + } else { + kmp_mk_free(*mk_default, desc.ptr_alloc); + } } else { - kmp_mk_free(*mk_default, desc.ptr_alloc); + if (al->pool_size > 0) { // custom allocator with pool size requested + kmp_uint64 used = + KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); + (void)used; // to suppress compiler warning + KMP_DEBUG_ASSERT(used >= desc.size_a); + } + kmp_mk_free(*al->memkind, desc.ptr_alloc); } } else { - if (al->pool_size > 0) { // custom allocator with pool size requested + if (oal > kmp_max_mem_alloc && al->pool_size > 0) { kmp_uint64 used = KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); (void)used; // to suppress compiler warning KMP_DEBUG_ASSERT(used >= desc.size_a); } - kmp_mk_free(*al->memkind, desc.ptr_alloc); + __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc); } - } else { - if (oal > kmp_max_mem_alloc && al->pool_size > 0) { - kmp_uint64 used = - KMP_TEST_THEN_ADD64((kmp_int64 *)&al->pool_used, -desc.size_a); - (void)used; // to suppress compiler warning - KMP_DEBUG_ASSERT(used >= desc.size_a); - } - __kmp_thread_free(__kmp_thread_from_gtid(gtid), desc.ptr_alloc); +#if KMP_USE_HWLOC } +#endif } /* If LEAK_MEMORY is defined, __kmp_free() will *not* free memory. 
It causes
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 52e0fdbdfb1da..eb077bca4ce21 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -296,6 +296,7 @@ kmp_int32 __kmp_max_task_priority = 0;
 kmp_uint64 __kmp_taskloop_min_tasks = 0;
 
 int __kmp_memkind_available = 0;
+bool __kmp_hwloc_available = false;
 omp_allocator_handle_t const omp_null_allocator = NULL;
 omp_allocator_handle_t const omp_default_mem_alloc =
     (omp_allocator_handle_t const)1;
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index 8b6092cb1085c..392a02ebbd9aa 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -3767,7 +3767,7 @@ static void __kmp_stg_parse_allocator(char const *name, char const *value,
   if (__kmp_match_str("omp_high_bw_mem_alloc", scan, &next)) {
     SKIP_WS(next);
     if (is_memalloc) {
-      if (__kmp_memkind_available) {
+      if (__kmp_hwloc_available || __kmp_memkind_available) {
         __kmp_def_allocator = omp_high_bw_mem_alloc;
         return;
       } else {
@@ -3780,7 +3780,7 @@ static void __kmp_stg_parse_allocator(char const *name, char const *value,
   } else if (__kmp_match_str("omp_large_cap_mem_alloc", scan, &next)) {
     SKIP_WS(next);
     if (is_memalloc) {
-      if (__kmp_memkind_available) {
+      if (__kmp_hwloc_available || __kmp_memkind_available) {
         __kmp_def_allocator = omp_large_cap_mem_alloc;
         return;
       } else {
diff --git a/openmp/runtime/test/api/omp_alloc_hwloc.c b/openmp/runtime/test/api/omp_alloc_hwloc.c
new file mode 100644
index 0000000000000..7cdcae9b36e8a
--- /dev/null
+++ b/openmp/runtime/test/api/omp_alloc_hwloc.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile && env KMP_TOPOLOGY_METHOD=hwloc %libomp-run
+// REQUIRES: hwloc
+
+#include <omp.h>
+#include <stdio.h>
+
+int main() {
+  void *p[2];
+#pragma omp parallel num_threads(2)
+  {
+    int i = omp_get_thread_num();
+    p[i] = omp_alloc(1024 * 1024, omp_get_default_allocator());
+#pragma omp barrier
+    printf("th %d, ptr %p\n", i, p[i]);
+    omp_free(p[i], omp_get_default_allocator());
+  }
+  // Both pointers should be non-NULL
+  if (p[0] != NULL && p[1] != NULL) {
+    printf("passed\n");
+    return 0;
+  } else {
+    printf("failed: pointers %p %p\n", p[0], p[1]);
+    return 1;
+  }
+}

From 5d364481e36871584affa54c58a803a936b9a5d6 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy
Date: Wed, 2 Apr 2025 09:24:38 +0200
Subject: [PATCH 0350/1029] [flang][OpenMP] Upstream first part of `do
 concurrent` mapping (#126026)

This PR starts the effort to upstream AMD's internal implementation of
`do concurrent` to OpenMP mapping. This replaces #77285 since we
extended this WIP quite a bit on our fork over the past year.

An important part of this PR is a document that describes the current
status downstream, the upstreaming status, and next steps to make this
pass much more useful.

In addition to this document, this PR also contains the skeleton of the
pass (no useful transformations are done yet) and some testing for the
added command line options.

This looks like a huge PR but a lot of the added stuff is
documentation.

It is also worth noting that the downstream pass has been validated on
https://github.com/BerkeleyLab/fiats. For the CPU mapping, this
achieved performance speed-ups that match pure OpenMP; for GPU mapping,
we are still working on extending our support for implicit memory
mapping and locality specifiers.
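For illustration only (a hedged sketch, not taken from this PR's tests):
with `-fopenmp -fdo-concurrent-to-openmp=host`, a loop of the following
shape is intended to behave as if it had been written with
`!$omp parallel do`:

```fortran
subroutine saxpy(a, x, y, n)
  integer, intent(in) :: n
  real, intent(in) :: a, x(n)
  real, intent(inout) :: y(n)
  integer :: i
  do concurrent (i = 1:n)
    y(i) = y(i) + a * x(i)
  end do
end subroutine saxpy
```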
PR stack: - https://github.com/llvm/llvm-project/pull/126026 (this PR) - https://github.com/llvm/llvm-project/pull/127595 - https://github.com/llvm/llvm-project/pull/127633 - https://github.com/llvm/llvm-project/pull/127634 - https://github.com/llvm/llvm-project/pull/127635 --- clang/include/clang/Driver/Options.td | 4 + clang/lib/Driver/ToolChains/Flang.cpp | 3 +- flang/docs/DoConcurrentConversionToOpenMP.md | 155 ++++++++++++++++++ flang/docs/index.md | 1 + .../include/flang/Frontend/CodeGenOptions.def | 2 + flang/include/flang/Frontend/CodeGenOptions.h | 5 + flang/include/flang/Optimizer/OpenMP/Passes.h | 2 + .../include/flang/Optimizer/OpenMP/Passes.td | 30 ++++ flang/include/flang/Optimizer/OpenMP/Utils.h | 26 +++ .../flang/Optimizer/Passes/Pipelines.h | 18 +- flang/lib/Frontend/CompilerInvocation.cpp | 28 ++++ flang/lib/Frontend/FrontendActions.cpp | 39 ++++- flang/lib/Optimizer/OpenMP/CMakeLists.txt | 1 + .../OpenMP/DoConcurrentConversion.cpp | 99 +++++++++++ flang/lib/Optimizer/Passes/Pipelines.cpp | 12 +- .../test/Driver/do_concurrent_to_omp_cli.f90 | 20 +++ .../Transforms/DoConcurrent/basic_host.f90 | 53 ++++++ flang/tools/bbc/bbc.cpp | 20 ++- 18 files changed, 506 insertions(+), 12 deletions(-) create mode 100644 flang/docs/DoConcurrentConversionToOpenMP.md create mode 100644 flang/include/flang/Optimizer/OpenMP/Utils.h create mode 100644 flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp create mode 100644 flang/test/Driver/do_concurrent_to_omp_cli.f90 create mode 100644 flang/test/Transforms/DoConcurrent/basic_host.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 89cb03cc33b98..4c01088076818 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6976,6 +6976,10 @@ defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stri def fhermetic_module_files : Flag<["-"], "fhermetic-module-files">, Group, HelpText<"Emit hermetic module files (no nested USE association)">; + +def fdo_concurrent_to_openmp_EQ : Joined<["-"], "fdo-concurrent-to-openmp=">, + HelpText<"Try to map `do concurrent` loops to OpenMP [none|host|device]">, + Values<"none, host, device">; } // let Visibility = [FC1Option, FlangOption] def J : JoinedOrSeparate<["-"], "J">, diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index a44513a83a2d7..8312234e33a64 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -158,7 +158,8 @@ void Flang::addCodegenOptions(const ArgList &Args, CmdArgs.push_back("-fversion-loops-for-stride"); Args.addAllArgs(CmdArgs, - {options::OPT_flang_experimental_hlfir, + {options::OPT_fdo_concurrent_to_openmp_EQ, + options::OPT_flang_experimental_hlfir, options::OPT_flang_deprecated_no_hlfir, options::OPT_fno_ppc_native_vec_elem_order, options::OPT_fppc_native_vec_elem_order, diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md new file mode 100644 index 0000000000000..62bc3172f8e3b --- /dev/null +++ b/flang/docs/DoConcurrentConversionToOpenMP.md @@ -0,0 +1,155 @@ + + +# `DO CONCURRENT` mapping to OpenMP + +```{contents} +--- +local: +--- +``` + +This document seeks to describe the effort to parallelize `do concurrent` loops +by mapping them to OpenMP worksharing constructs. The goals of this document +are: +* Describing how to instruct `flang` to map `DO CONCURRENT` loops to OpenMP + constructs. +* Tracking the current status of such mapping. 
+* Describing the limitations of the current implementation. +* Describing next steps. +* Tracking the current upstreaming status (from the AMD ROCm fork). + +## Usage + +In order to enable `do concurrent` to OpenMP mapping, `flang` adds a new +compiler flag: `-fdo-concurrent-to-openmp`. This flag has 3 possible values: +1. `host`: this maps `do concurrent` loops to run in parallel on the host CPU. + This maps such loops to the equivalent of `omp parallel do`. +2. `device`: this maps `do concurrent` loops to run in parallel on a target device. + This maps such loops to the equivalent of + `omp target teams distribute parallel do`. +3. `none`: this disables `do concurrent` mapping altogether. In that case, such + loops are emitted as sequential loops. + +The `-fdo-concurrent-to-openmp` compiler switch is currently available only when +OpenMP is also enabled. So you need to provide the following options to flang in +order to enable it: +``` +flang ... -fopenmp -fdo-concurrent-to-openmp=[host|device|none] ... +``` +For mapping to device, the target device architecture must be specified as well. +See `-fopenmp-targets` and `--offload-arch` for more info. + +## Current status + +Under the hood, `do concurrent` mapping is implemented in the +`DoConcurrentConversionPass`. This is still an experimental pass which means +that: +* It has been tested in a very limited way so far. +* It has been tested mostly on simple synthetic inputs. + + + +## Next steps + +This section describes some of the open questions/issues that are not tackled yet +even in the downstream implementation. + +### Delayed privatization + +So far, we emit the privatization logic for IVs inline in the parallel/target +region. This is enough for our purposes right now since we don't +localize/privatize any sophisticated types of variables yet. Once we have need +for more advanced localization through `do concurrent`'s locality specifiers +(see below), delayed privatization will enable us to have a much cleaner IR. +Once delayed privatization's implementation upstream is supported for the +required constructs by the pass, we will move to it rather than inlined/early +privatization. + +### Locality specifiers for `do concurrent` + +Locality specifiers will enable the user to control the data environment of the +loop nest in a more fine-grained way. Implementing these specifiers on the +`FIR` dialect level is needed in order to support this in the +`DoConcurrentConversionPass`. + +Such specifiers will also unlock a potential solution to the +non-perfectly-nested loops' IVs issue described above. In particular, for a +non-perfectly nested loop, one middle-ground proposal/solution would be to: +* Emit the loop's IV as shared/mapped just like we do currently. +* Emit a warning that the IV of the loop is emitted as shared/mapped. +* Given support for `LOCAL`, we can recommend the user to explicitly + localize/privatize the loop's IV if they choose to. + +#### Sharing TableGen clause records from the OpenMP dialect + +At the moment, the FIR dialect does not have a way to model locality specifiers +on the IR level. Instead, something similar to early/eager privatization in OpenMP +is done for the locality specifiers in `fir.do_loop` ops. Having locality specifier +modelled in a way similar to delayed privatization (i.e. the `omp.private` op) and +reductions (i.e. the `omp.declare_reduction` op) can make mapping `do concurrent` +to OpenMP (and other parallel programming models) much easier. 
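+For illustration, the following (standard Fortran 2018 locality-specifier
+syntax, a hedged example rather than one of this PR's tests) shows the kind
+of loop such IR-level modelling would eventually have to represent:
+
+```fortran
+do concurrent (i = 1:n) local(tmp) shared(a)
+  tmp = a(i) * 2
+  a(i) = tmp
+end do
+```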
+
+Therefore, one way to approach this problem is to extract the TableGen records
+for relevant OpenMP clauses in a shared dialect for "data environment management"
+and use these shared records for OpenMP, `do concurrent`, and possibly OpenACC
+as well.
+
+#### Supporting reductions
+
+Similar to locality specifiers, mapping reductions from `do concurrent` to OpenMP
+is also still an open TODO. We can potentially extend the MLIR infrastructure
+proposed in the previous section to share reduction records among the different
+relevant dialects as well.
+
+### More advanced detection of loop nests
+
+As pointed out earlier, any intervening code between the headers of 2 nested
+`do concurrent` loops prevents us from detecting this as a loop nest. In some
+cases this is overly conservative. Therefore, a more flexible detection logic
+of loop nests needs to be implemented.
+
+### Data-dependence analysis
+
+Right now, we map loop nests without analysing whether such mapping is safe to
+do or not. We probably need to at least warn the user of unsafe loop nests due
+to loop-carried dependencies.
+
+### Non-rectangular loop nests
+
+So far, we did not need to use the pass for non-rectangular loop nests. For
+example:
+```fortran
+do concurrent(i=1:n)
+  do concurrent(j=i:n)
+    ...
+  end do
+end do
+```
+We defer this to the (hopefully) near future when we get the conversion in a
+good shape for the samples/projects at hand.
+
+### Generalizing the pass to other parallel programming models
+
+Once we have a stable and capable `do concurrent` to OpenMP mapping, we can take
+this in a more generalized direction and allow the pass to target other models;
+e.g. OpenACC. This goal should be kept in mind from the get-go even while only
+targeting OpenMP.
+
+
+## Upstreaming status
+
+- [x] Command line options for `flang` and `bbc`.
+- [x] Conversion pass skeleton (no transformations happen yet).
+- [x] Status description and tracking document (this document).
+- [ ] Basic host/CPU mapping support.
+- [ ] Basic device/GPU mapping support.
+- [ ] More advanced host and device support (expanded to multiple items as needed).
diff --git a/flang/docs/index.md b/flang/docs/index.md
index 1de0ee2e6f0d6..8ab5c0bcb2123 100644
--- a/flang/docs/index.md
+++ b/flang/docs/index.md
@@ -51,6 +51,7 @@ on how to get in touch with us and to learn more about the current status.
   DebugGeneration
   Directives
   DoConcurrent
+   DoConcurrentConversionToOpenMP
   Extensions
   F202X
   FIRArrayOperations
diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def
index 5d6af4271d4f6..57830bf51a1b3 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.def
+++ b/flang/include/flang/Frontend/CodeGenOptions.def
@@ -43,5 +43,7 @@ ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codeg
 ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use
 ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers
 
+ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP
+
 #undef CODEGENOPT
 #undef ENUM_CODEGENOPT
diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h
index f19943335737b..23d99e1f0897a 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.h
+++ b/flang/include/flang/Frontend/CodeGenOptions.h
@@ -15,6 +15,7 @@
 #ifndef FORTRAN_FRONTEND_CODEGENOPTIONS_H
 #define FORTRAN_FRONTEND_CODEGENOPTIONS_H
 
+#include "flang/Optimizer/OpenMP/Utils.h"
 #include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Frontend/Driver/CodeGenOptions.h"
 #include "llvm/Support/CodeGen.h"
@@ -143,6 +144,10 @@ class CodeGenOptions : public CodeGenOptionsBase {
   /// (-mlarge-data-threshold).
   uint64_t LargeDataThreshold;
 
+  /// Optionally map `do concurrent` loops to OpenMP. This is only valid if
+  /// OpenMP is enabled.
+  using DoConcurrentMappingKind = flangomp::DoConcurrentMappingKind;
+
   // Define accessors/mutators for code generation options of enumeration type.
 #define CODEGENOPT(Name, Bits, Default)
 #define ENUM_CODEGENOPT(Name, Type, Bits, Default) \
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.h b/flang/include/flang/Optimizer/OpenMP/Passes.h
index feb395f1a12db..c67bddbcd2704 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.h
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.h
@@ -13,6 +13,7 @@
 #ifndef FORTRAN_OPTIMIZER_OPENMP_PASSES_H
 #define FORTRAN_OPTIMIZER_OPENMP_PASSES_H
 
+#include "flang/Optimizer/OpenMP/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/Pass/Pass.h"
@@ -30,6 +31,7 @@ namespace flangomp {
 /// divided into units of work.
 bool shouldUseWorkshareLowering(mlir::Operation *op);
 
+std::unique_ptr<mlir::Pass> createDoConcurrentConversionPass(bool mapToDevice);
 } // namespace flangomp
 
 #endif // FORTRAN_OPTIMIZER_OPENMP_PASSES_H
diff --git a/flang/include/flang/Optimizer/OpenMP/Passes.td b/flang/include/flang/Optimizer/OpenMP/Passes.td
index 3add0c560f88d..fcc7a4ca31fef 100644
--- a/flang/include/flang/Optimizer/OpenMP/Passes.td
+++ b/flang/include/flang/Optimizer/OpenMP/Passes.td
@@ -50,6 +50,36 @@ def FunctionFilteringPass : Pass<"omp-function-filtering"> {
   ];
 }
 
+def DoConcurrentConversionPass : Pass<"omp-do-concurrent-conversion", "mlir::func::FuncOp"> {
+  let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops.";
+
+  let description = [{ This is an experimental pass to map `DO CONCURRENT` loops
+     to their corresponding equivalent OpenMP worksharing constructs.
+
+     For now the following is supported:
+       - Mapping simple loops to `parallel do`.
+
+     Still TODO:
+       - More extensive testing.
+ }]; + + let dependentDialects = ["mlir::omp::OpenMPDialect"]; + + let options = [ + Option<"mapTo", "map-to", + "flangomp::DoConcurrentMappingKind", + /*default=*/"flangomp::DoConcurrentMappingKind::DCMK_None", + "Try to map `do concurrent` loops to OpenMP [none|host|device]", + [{::llvm::cl::values( + clEnumValN(flangomp::DoConcurrentMappingKind::DCMK_None, + "none", "Do not lower `do concurrent` to OpenMP"), + clEnumValN(flangomp::DoConcurrentMappingKind::DCMK_Host, + "host", "Lower to run in parallel on the CPU"), + clEnumValN(flangomp::DoConcurrentMappingKind::DCMK_Device, + "device", "Lower to run in parallel on the GPU") + )}]>, + ]; +} // Needs to be scheduled on Module as we create functions in it def LowerWorkshare : Pass<"lower-workshare", "::mlir::ModuleOp"> { diff --git a/flang/include/flang/Optimizer/OpenMP/Utils.h b/flang/include/flang/Optimizer/OpenMP/Utils.h new file mode 100644 index 0000000000000..636c768b016b7 --- /dev/null +++ b/flang/include/flang/Optimizer/OpenMP/Utils.h @@ -0,0 +1,26 @@ +//===-- Optimizer/OpenMP/Utils.h --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_OPENMP_UTILS_H +#define FORTRAN_OPTIMIZER_OPENMP_UTILS_H + +namespace flangomp { + +enum class DoConcurrentMappingKind { + DCMK_None, ///< Do not lower `do concurrent` to OpenMP. + DCMK_Host, ///< Lower to run in parallel on the CPU. + DCMK_Device ///< Lower to run in parallel on the GPU. +}; + +} // namespace flangomp + +#endif // FORTRAN_OPTIMIZER_OPENMP_UTILS_H diff --git a/flang/include/flang/Optimizer/Passes/Pipelines.h b/flang/include/flang/Optimizer/Passes/Pipelines.h index ef5d44ded706c..a3f59ee8dd013 100644 --- a/flang/include/flang/Optimizer/Passes/Pipelines.h +++ b/flang/include/flang/Optimizer/Passes/Pipelines.h @@ -128,6 +128,17 @@ void createHLFIRToFIRPassPipeline( mlir::PassManager &pm, bool enableOpenMP, llvm::OptimizationLevel optLevel = defaultOptLevel); +struct OpenMPFIRPassPipelineOpts { + /// Whether code is being generated for a target device rather than the host + /// device + bool isTargetDevice; + + /// Controls how to map `do concurrent` loops; to device, host, or none at + /// all. + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind + doConcurrentMappingKind; +}; + /// Create a pass pipeline for handling certain OpenMP transformations needed /// prior to FIR lowering. /// @@ -135,9 +146,10 @@ void createHLFIRToFIRPassPipeline( /// that the FIR is correct with respect to OpenMP operations/attributes. /// /// \param pm - MLIR pass manager that will hold the pipeline definition. -/// \param isTargetDevice - Whether code is being generated for a target device -/// rather than the host device. -void createOpenMPFIRPassPipeline(mlir::PassManager &pm, bool isTargetDevice); +/// \param opts - options to control OpenMP code-gen; see struct docs for more +/// details. 
+void createOpenMPFIRPassPipeline(mlir::PassManager &pm,
+                                 OpenMPFIRPassPipelineOpts opts);
 
 #if !defined(FLANG_EXCLUDE_CODEGEN)
 void createDebugPasses(mlir::PassManager &pm,
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 229695b18d278..1ea7834746540 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -158,6 +158,32 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts,
   return true;
 }
 
+static void parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts,
+                                     llvm::opt::ArgList &args,
+                                     clang::DiagnosticsEngine &diags) {
+  llvm::opt::Arg *arg =
+      args.getLastArg(clang::driver::options::OPT_fdo_concurrent_to_openmp_EQ);
+  if (!arg)
+    return;
+
+  using DoConcurrentMappingKind =
+      Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind;
+  std::optional<DoConcurrentMappingKind> val =
+      llvm::StringSwitch<std::optional<DoConcurrentMappingKind>>(
+          arg->getValue())
+          .Case("none", DoConcurrentMappingKind::DCMK_None)
+          .Case("host", DoConcurrentMappingKind::DCMK_Host)
+          .Case("device", DoConcurrentMappingKind::DCMK_Device)
+          .Default(std::nullopt);
+
+  if (!val.has_value()) {
+    diags.Report(clang::diag::err_drv_invalid_value)
+        << arg->getAsString(args) << arg->getValue();
+  }
+
+  opts.setDoConcurrentMapping(val.value());
+}
+
 static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts,
                               llvm::opt::ArgList &args,
                               clang::DiagnosticsEngine &diags) {
@@ -433,6 +459,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
                         clang::driver::options::OPT_funderscoring, false)) {
     opts.Underscoring = 0;
   }
+
+  parseDoConcurrentMapping(opts, args, diags);
 }
 
 /// Parses all target input arguments and populates the target
diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 2cb5260334a0f..bd2c0632cb35d 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -287,16 +287,45 @@ bool CodeGenAction::beginSourceFileAction() {
   // Add OpenMP-related passes
   // WARNING: These passes must be run immediately after the lowering to ensure
   // that the FIR is correct with respect to OpenMP operations/attributes.
-  if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
-          Fortran::common::LanguageFeature::OpenMP)) {
-    bool isDevice = false;
+  bool isOpenMPEnabled =
+      ci.getInvocation().getFrontendOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::OpenMP);
+
+  fir::OpenMPFIRPassPipelineOpts opts;
+
+  using DoConcurrentMappingKind =
+      Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind;
+  opts.doConcurrentMappingKind =
+      ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping();
+
+  if (opts.doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_None &&
+      !isOpenMPEnabled) {
+    unsigned diagID = ci.getDiagnostics().getCustomDiagID(
+        clang::DiagnosticsEngine::Warning,
+        "OpenMP is required for lowering `do concurrent` loops to OpenMP."
+        "Enable OpenMP using `-fopenmp`."
+ "`do concurrent` loops will be serialized."); + ci.getDiagnostics().Report(diagID); + opts.doConcurrentMappingKind = DoConcurrentMappingKind::DCMK_None; + } + + if (opts.doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_None) { + unsigned diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "Mapping `do concurrent` to OpenMP is still experimental."); + ci.getDiagnostics().Report(diagID); + } + + if (isOpenMPEnabled) { + opts.isTargetDevice = false; if (auto offloadMod = llvm::dyn_cast( mlirModule->getOperation())) - isDevice = offloadMod.getIsTargetDevice(); + opts.isTargetDevice = offloadMod.getIsTargetDevice(); + // WARNING: This pipeline must be run immediately after the lowering to // ensure that the FIR is correct with respect to OpenMP operations/ // attributes. - fir::createOpenMPFIRPassPipeline(pm, isDevice); + fir::createOpenMPFIRPassPipeline(pm, opts); } pm.enableVerifier(/*verifyPasses=*/true); diff --git a/flang/lib/Optimizer/OpenMP/CMakeLists.txt b/flang/lib/Optimizer/OpenMP/CMakeLists.txt index 4a48d6e0936db..3acf143594356 100644 --- a/flang/lib/Optimizer/OpenMP/CMakeLists.txt +++ b/flang/lib/Optimizer/OpenMP/CMakeLists.txt @@ -1,6 +1,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_flang_library(FlangOpenMPTransforms + DoConcurrentConversion.cpp FunctionFiltering.cpp GenericLoopConversion.cpp MapsForPrivatizedSymbols.cpp diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp new file mode 100644 index 0000000000000..cebf6cd8ed0df --- /dev/null +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -0,0 +1,99 @@ +//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/OpenMP/Passes.h" +#include "flang/Optimizer/OpenMP/Utils.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace flangomp { +#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS +#include "flang/Optimizer/OpenMP/Passes.h.inc" +} // namespace flangomp + +#define DEBUG_TYPE "do-concurrent-conversion" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ") + +namespace { +class DoConcurrentConversion : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice) + : OpConversionPattern(context), mapToDevice(mapToDevice) {} + + mlir::LogicalResult + matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + // TODO This will be filled in with the next PRs that upstreams the rest of + // the ROCm implementaion. 
+    return mlir::success();
+  }
+
+  bool mapToDevice;
+};
+
+class DoConcurrentConversionPass
+    : public flangomp::impl::DoConcurrentConversionPassBase<
+          DoConcurrentConversionPass> {
+public:
+  DoConcurrentConversionPass() = default;
+
+  DoConcurrentConversionPass(
+      const flangomp::DoConcurrentConversionPassOptions &options)
+      : DoConcurrentConversionPassBase(options) {}
+
+  void runOnOperation() override {
+    mlir::func::FuncOp func = getOperation();
+
+    if (func.isDeclaration())
+      return;
+
+    mlir::MLIRContext *context = &getContext();
+
+    if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host &&
+        mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) {
+      mlir::emitWarning(mlir::UnknownLoc::get(context),
+                        "DoConcurrentConversionPass: invalid `map-to` value. "
+                        "Valid values are: `host` or `device`");
+      return;
+    }
+
+    mlir::RewritePatternSet patterns(context);
+    patterns.insert<DoConcurrentConversion>(
+        context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device);
+    mlir::ConversionTarget target(*context);
+    target.addDynamicallyLegalOp<fir::DoLoopOp>([&](fir::DoLoopOp op) {
+      // The goal is to handle constructs that eventually get lowered to
+      // `fir.do_loop` with the `unordered` attribute (e.g. array expressions).
+      // Currently, this is only enabled for the `do concurrent` construct since
+      // the pass runs early in the pipeline.
+      return !op.getUnordered();
+    });
+    target.markUnknownOpDynamicallyLegal(
+        [](mlir::Operation *) { return true; });
+
+    if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
+                                               std::move(patterns)))) {
+      mlir::emitError(mlir::UnknownLoc::get(context),
+                      "error in converting do-concurrent op");
+      signalPassFailure();
+    }
+  }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass>
+flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
+  DoConcurrentConversionPassOptions options;
+  options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device
+                              : flangomp::DoConcurrentMappingKind::DCMK_Host;
+
+  return std::make_unique<DoConcurrentConversionPass>(options);
+}
diff --git a/flang/lib/Optimizer/Passes/Pipelines.cpp b/flang/lib/Optimizer/Passes/Pipelines.cpp
index 6ec19556625bc..81ff6bf9b2c6a 100644
--- a/flang/lib/Optimizer/Passes/Pipelines.cpp
+++ b/flang/lib/Optimizer/Passes/Pipelines.cpp
@@ -287,12 +287,20 @@ void createHLFIRToFIRPassPipeline(mlir::PassManager &pm, bool enableOpenMP,
 /// \param pm - MLIR pass manager that will hold the pipeline definition.
 /// \param isTargetDevice - Whether code is being generated for a target device
 /// rather than the host device.
-void createOpenMPFIRPassPipeline(mlir::PassManager &pm, bool isTargetDevice) {
+void createOpenMPFIRPassPipeline(mlir::PassManager &pm,
+                                 OpenMPFIRPassPipelineOpts opts) {
+  using DoConcurrentMappingKind =
+      Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind;
+
+  if (opts.doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_None)
+    pm.addPass(flangomp::createDoConcurrentConversionPass(
+        opts.doConcurrentMappingKind == DoConcurrentMappingKind::DCMK_Device));
+
   pm.addPass(flangomp::createMapInfoFinalizationPass());
   pm.addPass(flangomp::createMapsForPrivatizedSymbolsPass());
   pm.addPass(flangomp::createMarkDeclareTargetPass());
   pm.addPass(flangomp::createGenericLoopConversionPass());
-  if (isTargetDevice)
+  if (opts.isTargetDevice)
     pm.addPass(flangomp::createFunctionFilteringPass());
 }
 
diff --git a/flang/test/Driver/do_concurrent_to_omp_cli.f90 b/flang/test/Driver/do_concurrent_to_omp_cli.f90
new file mode 100644
index 0000000000000..41b7575e206af
--- /dev/null
+++ b/flang/test/Driver/do_concurrent_to_omp_cli.f90
@@ -0,0 +1,20 @@
+! UNSUPPORTED: system-windows
+
+! RUN: %flang --help | FileCheck %s --check-prefix=FLANG
+
+! FLANG: -fdo-concurrent-to-openmp=
+! FLANG-NEXT: Try to map `do concurrent` loops to OpenMP [none|host|device]
+
+! RUN: bbc --help | FileCheck %s --check-prefix=BBC
+
+! BBC: -fdo-concurrent-to-openmp=
+! BBC-SAME: Try to map `do concurrent` loops to OpenMP [none|host|device]
+
+! RUN: %flang -fdo-concurrent-to-openmp=host %s 2>&1 \
+! RUN:   | FileCheck %s --check-prefix=OPT
+
+! OPT: warning: OpenMP is required for lowering `do concurrent` loops to OpenMP.
+! OPT-SAME: Enable OpenMP using `-fopenmp`.
+
+program test_cli
+end program
diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90
new file mode 100644
index 0000000000000..b569668ab0f0e
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic_host.f90
@@ -0,0 +1,53 @@
+! Mark as xfail for now until we upstream the relevant part. This is just for
+! demo purposes at this point. Upstreaming this is the next step.
+! XFAIL: *
+
+! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
+! RUN:   | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
+! RUN:   | FileCheck %s
+
+! CHECK-LABEL: do_concurrent_basic
+program do_concurrent_basic
+    ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+
+    implicit none
+    integer :: a(10)
+    integer :: i
+
+    ! CHECK-NOT: fir.do_loop
+
+    ! CHECK: omp.parallel {
+
+    ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+    ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+    ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+    ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+    ! CHECK: %[[C10:.*]] = arith.constant 10 : i32
+    ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+    ! CHECK: %[[STEP:.*]] = arith.constant 1 : index
+
+    ! CHECK: omp.wsloop {
+    ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+    ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+    ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
+    ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+    ! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+    ! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+    ! CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+    ! CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+    ! CHECK-NEXT: omp.yield
+    ! CHECK-NEXT: }
+    ! CHECK-NEXT: }
+
+    ! CHECK-NEXT: omp.terminator
+    ! CHECK-NEXT: }
+    do concurrent (i=1:10)
+        a(i) = i
+    end do
+
+    ! CHECK-NOT: fir.do_loop
+end program do_concurrent_basic
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index 2cc75b7aa4e87..c38e59f47c542 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -142,6 +142,12 @@ static llvm::cl::opt<bool>
                    llvm::cl::desc("enable openmp device compilation"),
                    llvm::cl::init(false));
 
+static llvm::cl::opt<std::string> enableDoConcurrentToOpenMPConversion(
+    "fdo-concurrent-to-openmp",
+    llvm::cl::desc(
+        "Try to map `do concurrent` loops to OpenMP [none|host|device]"),
+    llvm::cl::init("none"));
+
 static llvm::cl::opt<bool>
     enableOpenMPGPU("fopenmp-is-gpu",
                     llvm::cl::desc("enable openmp GPU target codegen"),
                     llvm::cl::init(false));
@@ -315,7 +321,19 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) {
 static llvm::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) {
   mlir::PassManager pm(mlirModule->getName(),
                        mlir::OpPassManager::Nesting::Implicit);
-  fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice);
+  using DoConcurrentMappingKind =
+      Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind;
+
+  fir::OpenMPFIRPassPipelineOpts opts;
+  opts.isTargetDevice = enableOpenMPDevice;
+  opts.doConcurrentMappingKind =
+      llvm::StringSwitch<DoConcurrentMappingKind>(
+          enableDoConcurrentToOpenMPConversion)
+          .Case("host", DoConcurrentMappingKind::DCMK_Host)
+          .Case("device", DoConcurrentMappingKind::DCMK_Device)
+          .Default(DoConcurrentMappingKind::DCMK_None);
+
+  fir::createOpenMPFIRPassPipeline(pm, opts);
   (void)mlir::applyPassManagerCLOptions(pm);
   if (mlir::failed(pm.run(mlirModule))) {
     llvm::errs() << "FATAL: failed to correctly apply OpenMP pass pipeline";

From 8a691cc6157b2c3bc91af767eb1154d7a715562a Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva
Date: Wed, 2 Apr 2025 09:25:43 +0200
Subject: [PATCH 0351/1029] [MS][clang] Make sure vector deleting dtor calls
 correct operator delete (#133950)

During additional testing I spotted that vector deleting dtor calls
operator delete, not operator delete[] when performing array deletion.
This patch fixes that.

---
 clang/include/clang/AST/DeclCXX.h             |  6 +++++
 clang/include/clang/Sema/Sema.h               |  3 ++-
 clang/lib/AST/DeclCXX.cpp                     |  7 ++++++
 clang/lib/CodeGen/CGClass.cpp                 |  2 +-
 clang/lib/Sema/SemaDeclCXX.cpp                | 13 ++++++++++-
 clang/lib/Sema/SemaExprCXX.cpp                |  8 +++----
 .../microsoft-vector-deleting-dtors.cpp       | 23 +++++++++++++++++--
 7 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index dbd02ef7f8011..7dbefeea4b1a3 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -2852,6 +2852,7 @@ class CXXDestructorDecl : public CXXMethodDecl {
   // FIXME: Don't allocate storage for these except in the first declaration
   // of a virtual destructor.
FunctionDecl *OperatorDelete = nullptr; + FunctionDecl *OperatorArrayDelete = nullptr; Expr *OperatorDeleteThisArg = nullptr; CXXDestructorDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, @@ -2877,11 +2878,16 @@ class CXXDestructorDecl : public CXXMethodDecl { static CXXDestructorDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg); + void setOperatorArrayDelete(FunctionDecl *OD, Expr *ThisArg); const FunctionDecl *getOperatorDelete() const { return getCanonicalDecl()->OperatorDelete; } + const FunctionDecl *getArrayOperatorDelete() const { + return getCanonicalDecl()->OperatorArrayDelete; + } + Expr *getOperatorDeleteThisArg() const { return getCanonicalDecl()->OperatorDeleteThisArg; } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 822cae99ddae7..6e504b7211567 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -8335,7 +8335,8 @@ class Sema final : public SemaBase { bool Overaligned, DeclarationName Name); FunctionDecl *FindDeallocationFunctionForDestructor(SourceLocation StartLoc, - CXXRecordDecl *RD); + CXXRecordDecl *RD, + DeclarationName Name); /// ActOnCXXDelete - Parsed a C++ 'delete' expression (C++ 5.3.5), as in: /// @code ::delete ptr; @endcode diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 7eff776882629..3c447a905a83c 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3026,6 +3026,13 @@ void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) { } } +void CXXDestructorDecl::setOperatorArrayDelete(FunctionDecl *OD, + Expr *ThisArg) { + auto *First = cast(getFirstDecl()); + if (OD && !First->OperatorArrayDelete) + First->OperatorArrayDelete = OD; +} + bool CXXDestructorDecl::isCalledByDelete(const FunctionDecl *OpDel) const { // C++20 [expr.delete]p6: If the value of the operand of the delete- // expression is not a null pointer value and the selected deallocation diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index f508930cc9f2b..c683dbb0af825 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -1489,7 +1489,7 @@ static void EmitConditionalArrayDtorCall(const CXXDestructorDecl *DD, CGF.EmitBlock(callDeleteBB); const CXXDestructorDecl *Dtor = cast(CGF.CurCodeDecl); const CXXRecordDecl *ClassDecl = Dtor->getParent(); - CGF.EmitDeleteCall(Dtor->getOperatorDelete(), allocatedPtr, + CGF.EmitDeleteCall(Dtor->getArrayOperatorDelete(), allocatedPtr, CGF.getContext().getTagDeclType(ClassDecl)); CGF.EmitBranchThroughCleanup(CGF.ReturnBlock); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 43bf9b7cd0f95..cbd37aa0dd6a4 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11042,9 +11042,11 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) { else Loc = RD->getLocation(); + DeclarationName Name = + Context.DeclarationNames.getCXXOperatorName(OO_Delete); // If we have a virtual destructor, look up the deallocation function if (FunctionDecl *OperatorDelete = - FindDeallocationFunctionForDestructor(Loc, RD)) { + FindDeallocationFunctionForDestructor(Loc, RD, Name)) { Expr *ThisArg = nullptr; // If the notional 'delete this' expression requires a non-trivial @@ -11075,6 +11077,15 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) { DiagnoseUseOfDecl(OperatorDelete, Loc); MarkFunctionReferenced(Loc, OperatorDelete); 
Destructor->setOperatorDelete(OperatorDelete, ThisArg); + // Lookup delete[] too in case we have to emit a vector deleting dtor; + DeclarationName VDeleteName = + Context.DeclarationNames.getCXXOperatorName(OO_Array_Delete); + FunctionDecl *ArrOperatorDelete = + FindDeallocationFunctionForDestructor(Loc, RD, VDeleteName); + // delete[] in the TU will make sure the operator is referenced and its + // uses diagnosed, otherwise vector deleting dtor won't be called anyway, + // so just record it in the destructor. + Destructor->setOperatorArrayDelete(ArrOperatorDelete, ThisArg); } } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index fa492bc124abd..78eba8e262771 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -3265,9 +3265,8 @@ FunctionDecl *Sema::FindUsualDeallocationFunction(SourceLocation StartLoc, return Result.FD; } -FunctionDecl *Sema::FindDeallocationFunctionForDestructor(SourceLocation Loc, - CXXRecordDecl *RD) { - DeclarationName Name = Context.DeclarationNames.getCXXOperatorName(OO_Delete); +FunctionDecl *Sema::FindDeallocationFunctionForDestructor( + SourceLocation Loc, CXXRecordDecl *RD, DeclarationName Name) { FunctionDecl *OperatorDelete = nullptr; if (FindDeallocationFunction(Loc, RD, Name, OperatorDelete)) @@ -3275,8 +3274,7 @@ FunctionDecl *Sema::FindDeallocationFunctionForDestructor(SourceLocation Loc, if (OperatorDelete) return OperatorDelete; - // If there's no class-specific operator delete, look up the global - // non-array delete. + // If there's no class-specific operator delete, look up the global delete. return FindUsualDeallocationFunction( Loc, true, hasNewExtendedAlignment(*this, Context.getRecordType(RD)), Name); diff --git a/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp index ebff4f6a851b0..439ff84456033 100644 --- a/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp +++ b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp @@ -28,6 +28,13 @@ struct JustAWeirdBird { } }; +int i = 0; +struct HasOperatorDelete : public Bird{ +~HasOperatorDelete() { } +void operator delete(void *p) { i-=2; } +void operator delete[](void *p) { i--; } +}; + // Vector deleting dtor for Bird is an alias because no new Bird[] expressions // in the TU. 
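// (Aside, not part of this change: in the MSVC ABI, `??_E` mangles the vector
// deleting destructor and `??_G` the scalar deleting destructor; `??_V` is
// `operator delete[]` and `??3` is `operator delete`, which is what the CHECK
// lines below are distinguishing.)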
// X64: @"??_EBird@@UEAAPEAXI@Z" = weak dso_local unnamed_addr alias ptr (ptr, i32), ptr @"??_GBird@@UEAAPEAXI@Z" @@ -53,6 +60,9 @@ void bar() { JustAWeirdBird B; B.doSmth(38); + + Bird *p = new HasOperatorDelete[2]; + dealloc(p); } // CHECK-LABEL: define dso_local void @{{.*}}dealloc{{.*}}( @@ -129,8 +139,8 @@ void bar() { // CHECK-NEXT: %[[ISFIRSTBITZERO:.*]] = icmp eq i32 %[[FIRSTBIT]], 0 // CHECK-NEXT: br i1 %[[ISFIRSTBITZERO]], label %dtor.continue, label %dtor.call_delete_after_array_destroy // CHECK: dtor.call_delete_after_array_destroy: -// X64-NEXT: call void @"??3@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef 8) -// X86-NEXT: call void @"??3@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef 4) +// X64-NEXT: call void @"??_V@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef 8) +// X86-NEXT: call void @"??_V@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef 4) // CHECK-NEXT: br label %dtor.continue // CHECK: dtor.scalar: // X64-NEXT: call void @"??1Parrot@@UEAA@XZ"(ptr noundef nonnull align 8 dereferenceable(8) %[[LTHIS]]) @@ -150,3 +160,12 @@ void bar() { // X64-SAME: ptr noundef nonnull align 8 dereferenceable(8) %this, i32 noundef %should_call_delete) // X86: define weak dso_local x86_thiscallcc noundef ptr @"??_EJustAWeirdBird@@UAEPAXI@Z"( // X86-SAME: ptr noundef nonnull align 4 dereferenceable(4) %this, i32 noundef %should_call_delete) unnamed_addr + +// X64-LABEL: define weak dso_local noundef ptr @"??_EHasOperatorDelete@@UEAAPEAXI@Z" +// X86-LABEL: define weak dso_local x86_thiscallcc noundef ptr @"??_EHasOperatorDelete@@UAEPAXI@Z" +// CHECK: dtor.call_delete_after_array_destroy: +// X64-NEXT: call void @"??_VHasOperatorDelete@@SAXPEAX@Z" +// X86-NEXT: call void @"??_VHasOperatorDelete@@SAXPAX@Z" +// CHECK: dtor.call_delete: +// X64-NEXT: call void @"??3HasOperatorDelete@@SAXPEAX@Z" +// X86-NEXT: call void @"??3HasOperatorDelete@@SAXPAX@Z" From bd788dbf516be98044254336f54b72d077f69771 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Wed, 2 Apr 2025 09:52:27 +0200 Subject: [PATCH 0352/1029] [AMDGPU] Remove detection of hip runtime for Spack (#133263) There is special logic to detect the hip runtime when llvm is installed with Spack. It works by matching the install prefix of llvm against `llvm-amdgpu-*` followed by effectively globbing for ``` /../hip-x.y.z-*/ ``` and checking there is exactly one such directory. I would suggest to remove autodetection for the following reasons: 1. In the Spack ecosystem it's by design that every package lives in its own prefix, and can only know where its dependencies are installed, it has no clue what its dependents are and where they are installed. This heuristic detection breaks that invariant, since `hip` is a dependent of `llvm`, and can be surprising to Spack users. 2. The detection can lead to false positives, since users can be using an llvm installed "upstream" with their own build of hip locally, and they may not realize that clang is picking up upstream hip instead of their local copy. 3. It only works if the directory name is `llvm-amdgpu-*` which happens to be the name of AMD's fork of `llvm`, so it makes no sense that this code lives in the main LLVM repo for which the Spack package name is `llvm`. Feels wrong that LLVM knows about Spack package names, which can change over time. 4. Users can change the install directory structure, meaning that this detection is not robust under config changes in Spack. 
--- clang/lib/Driver/ToolChains/AMDGPU.cpp | 67 +------------------ .../bin/.hipVersion | 5 -- .../include/hip/hip_runtime.h | 0 .../amdgcn/bitcode/asanrtl.bc | 0 .../amdgcn/bitcode/hip.bc | 0 .../amdgcn/bitcode/ockl.bc | 0 .../amdgcn/bitcode/oclc_abi_version_400.bc | 0 .../amdgcn/bitcode/oclc_abi_version_500.bc | 0 .../amdgcn/bitcode/oclc_abi_version_600.bc | 0 .../oclc_correctly_rounded_sqrt_off.bc | 0 .../bitcode/oclc_correctly_rounded_sqrt_on.bc | 0 .../amdgcn/bitcode/oclc_daz_opt_off.bc | 0 .../amdgcn/bitcode/oclc_daz_opt_on.bc | 0 .../amdgcn/bitcode/oclc_finite_only_off.bc | 0 .../amdgcn/bitcode/oclc_finite_only_on.bc | 0 .../amdgcn/bitcode/oclc_isa_version_1010.bc | 0 .../amdgcn/bitcode/oclc_isa_version_1011.bc | 0 .../amdgcn/bitcode/oclc_isa_version_1012.bc | 0 .../amdgcn/bitcode/oclc_isa_version_803.bc | 0 .../amdgcn/bitcode/oclc_isa_version_900.bc | 0 .../amdgcn/bitcode/oclc_isa_version_908.bc | 0 .../amdgcn/bitcode/oclc_unsafe_math_off.bc | 0 .../amdgcn/bitcode/oclc_unsafe_math_on.bc | 0 .../bitcode/oclc_wavefrontsize64_off.bc | 0 .../amdgcn/bitcode/oclc_wavefrontsize64_on.bc | 0 .../amdgcn/bitcode/ocml.bc | 0 .../amdgcn/bitcode/opencl.bc | 0 .../bin/.keep | 0 clang/test/Driver/rocm-detect.hip | 47 ------------- 29 files changed, 1 insertion(+), 118 deletions(-) delete mode 100644 clang/test/Driver/Inputs/rocm-spack/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/bin/.hipVersion delete mode 100644 clang/test/Driver/Inputs/rocm-spack/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/include/hip/hip_runtime.h delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/asanrtl.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/hip.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/ockl.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_400.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_500.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_600.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_daz_opt_off.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_daz_opt_on.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_finite_only_off.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_finite_only_on.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1010.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1011.bc delete mode 100644 
clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1012.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_803.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_900.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_908.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_unsafe_math_off.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_unsafe_math_on.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_wavefrontsize64_off.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_wavefrontsize64_on.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/ocml.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/opencl.bc delete mode 100644 clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin/.keep diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 72c03fb3154e2..dffc70d5e5b69 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -32,48 +32,6 @@ using namespace clang::driver::toolchains; using namespace clang; using namespace llvm::opt; -// Look for sub-directory starts with PackageName under ROCm candidate path. -// If there is one and only one matching sub-directory found, append the -// sub-directory to Path. If there is no matching sub-directory or there are -// more than one matching sub-directories, diagnose them. Returns the full -// path of the package if there is only one matching sub-directory, otherwise -// returns an empty string. 
-llvm::SmallString<0> -RocmInstallationDetector::findSPACKPackage(const Candidate &Cand, - StringRef PackageName) { - if (!Cand.isSPACK()) - return {}; - std::error_code EC; - std::string Prefix = Twine(PackageName + "-" + Cand.SPACKReleaseStr).str(); - llvm::SmallVector> SubDirs; - for (llvm::vfs::directory_iterator File = D.getVFS().dir_begin(Cand.Path, EC), - FileEnd; - File != FileEnd && !EC; File.increment(EC)) { - llvm::StringRef FileName = llvm::sys::path::filename(File->path()); - if (FileName.starts_with(Prefix)) { - SubDirs.push_back(FileName); - if (SubDirs.size() > 1) - break; - } - } - if (SubDirs.size() == 1) { - auto PackagePath = Cand.Path; - llvm::sys::path::append(PackagePath, SubDirs[0]); - return PackagePath; - } - if (SubDirs.size() == 0 && Verbose) { - llvm::errs() << "SPACK package " << Prefix << " not found at " << Cand.Path - << '\n'; - return {}; - } - - if (SubDirs.size() > 1 && Verbose) { - llvm::errs() << "Cannot use SPACK package " << Prefix << " at " << Cand.Path - << " due to multiple installations for the same version\n"; - } - return {}; -} - void RocmInstallationDetector::scanLibDevicePath(llvm::StringRef Path) { assert(!Path.empty()); @@ -187,10 +145,7 @@ RocmInstallationDetector::getInstallationPathCandidates() { auto DoPrintROCmSearchDirs = [&]() { if (PrintROCmSearchDirs) for (auto Cand : ROCmSearchDirs) { - llvm::errs() << "ROCm installation search path"; - if (Cand.isSPACK()) - llvm::errs() << " (Spack " << Cand.SPACKReleaseStr << ")"; - llvm::errs() << ": " << Cand.Path << '\n'; + llvm::errs() << "ROCm installation search path: " << Cand.Path << '\n'; } }; @@ -226,22 +181,6 @@ RocmInstallationDetector::getInstallationPathCandidates() { ParentName = llvm::sys::path::filename(ParentDir); } - // Detect ROCm packages built with SPACK. - // clang is installed at - // /llvm-amdgpu--/bin directory. - // We only consider the parent directory of llvm-amdgpu package as ROCm - // installation candidate for SPACK. - if (ParentName.starts_with("llvm-amdgpu-")) { - auto SPACKPostfix = - ParentName.drop_front(strlen("llvm-amdgpu-")).split('-'); - auto SPACKReleaseStr = SPACKPostfix.first; - if (!SPACKReleaseStr.empty()) { - ParentDir = llvm::sys::path::parent_path(ParentDir); - return Candidate(ParentDir.str(), /*StrictChecking=*/true, - SPACKReleaseStr); - } - } - // Some versions of the rocm llvm package install to /opt/rocm/llvm/bin // Some versions of the aomp package install to /opt/rocm/aomp/bin if (ParentName == "llvm" || ParentName.starts_with("aomp")) @@ -462,10 +401,6 @@ void RocmInstallationDetector::detectHIPRuntime() { InstallPath = Candidate.Path; if (InstallPath.empty() || !FS.exists(InstallPath)) continue; - // HIP runtime built by SPACK is installed to - // /hip-- directory. - auto SPACKPath = findSPACKPackage(Candidate, "hip"); - InstallPath = SPACKPath.empty() ? InstallPath : SPACKPath; BinPath = InstallPath; llvm::sys::path::append(BinPath, "bin"); diff --git a/clang/test/Driver/Inputs/rocm-spack/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/bin/.hipVersion b/clang/test/Driver/Inputs/rocm-spack/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/bin/.hipVersion deleted file mode 100644 index 95d5b2ba83d37..0000000000000 --- a/clang/test/Driver/Inputs/rocm-spack/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/bin/.hipVersion +++ /dev/null @@ -1,5 +0,0 @@ -# NOTE: The trailing whitespace is added on purpose to verify that these -# whitespaces are trimmed before paring. 
-HIP_VERSION_MAJOR=4 -HIP_VERSION_MINOR=0 -HIP_VERSION_PATCH=20214-a2917cd diff --git a/clang/test/Driver/Inputs/rocm-spack/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/include/hip/hip_runtime.h b/clang/test/Driver/Inputs/rocm-spack/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/include/hip/hip_runtime.h deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/asanrtl.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/asanrtl.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/hip.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/hip.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/ockl.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/ockl.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_400.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_400.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_500.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_500.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_600.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_abi_version_600.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_correctly_rounded_sqrt_off.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_daz_opt_off.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_daz_opt_off.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_daz_opt_on.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_daz_opt_on.bc deleted file mode 100644 index 
e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_finite_only_off.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_finite_only_off.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_finite_only_on.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_finite_only_on.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1010.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1010.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1011.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1011.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1012.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_1012.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_803.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_803.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_900.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_900.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_908.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_isa_version_908.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_unsafe_math_off.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_unsafe_math_off.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_unsafe_math_on.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_unsafe_math_on.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_wavefrontsize64_off.bc 
b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_wavefrontsize64_off.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_wavefrontsize64_on.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/oclc_wavefrontsize64_on.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/ocml.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/ocml.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/opencl.bc b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/opencl.bc deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin/.keep b/clang/test/Driver/Inputs/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin/.keep deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang/test/Driver/rocm-detect.hip b/clang/test/Driver/rocm-detect.hip index 4aafeb97c00b5..b28b2bc6379dd 100644 --- a/clang/test/Driver/rocm-detect.hip +++ b/clang/test/Driver/rocm-detect.hip @@ -94,33 +94,6 @@ // RUN: --print-rocm-search-dirs %s 2>&1 \ // RUN: | FileCheck -check-prefixes=ROCM-REL %s -// Test ROCm installation built by SPACK by invoke clang at %t/rocm-spack/llvm-amdgpu-* -// directory through a soft link. - -// RUN: rm -rf %t/rocm-spack -// RUN: cp -r %S/Inputs/rocm-spack %t -// RUN: ln -fs %clang %t/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin/clang -// RUN: %t/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin/clang -### -no-canonical-prefixes -v \ -// RUN: -resource-dir=%t/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/lib/clang \ -// RUN: -target x86_64-linux-gnu --cuda-gpu-arch=gfx900 --print-rocm-search-dirs %s 2>&1 \ -// RUN: | FileCheck -check-prefixes=SPACK %s - -// Test SPACK installation with multiple hip and rocm-device-libs packages of the same -// ROCm release. --hip-path and --rocm-device-lib-path can be used to specify them. - -// RUN: cp -r %t/rocm-spack/hip-* %t/rocm-spack/hip-4.0.0-abcd -// RUN: %t/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin/clang -### -no-canonical-prefixes -v \ -// RUN: -target x86_64-linux-gnu --cuda-gpu-arch=gfx900 \ -// RUN: --hip-path=%t/rocm-spack/hip-4.0.0-abcd \ -// RUN: %s 2>&1 | FileCheck -check-prefixes=SPACK-SET %s - -// Test invalid SPACK ROCm installation missing hip and rocm-device-libs packages. 
-
-// RUN: rm -rf %t/rocm-spack/hip-*
-// RUN: rm -rf %t/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn
-// RUN: %t/rocm-spack/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin/clang --version 2>&1 \
-// RUN:   | FileCheck -check-prefixes=SPACK-MISS-SILENT %s
-
 // GFX902-DEFAULTLIBS: error: cannot find ROCm device library for gfx902; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
 
 // NODEFAULTLIBS-NOT: error: cannot find
@@ -145,23 +118,3 @@
 
 // ROCM-REL: ROCm installation search path: {{.*}}/opt/rocm
 // ROCM-REL: ROCm installation search path: {{.*}}/opt/rocm-3.10.0
-
-// SPACK: InstalledDir: [[DIR:.*]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin
-// SPACK: ROCm installation search path (Spack 4.0.0): [[DIR]]
-// SPACK: ROCm installation search path: [[CLANG:.*]]
-// SPACK: ROCm installation search path: [[DIR]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z
-// SPACK: ROCm installation search path: [[DIR]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/lib/clang
-// SPACK: ROCm installation search path: /opt/rocm
-// SPACK: Found HIP installation: [[DIR]]/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5, version 4.0.20214-a2917cd
-// SPACK: "-triple" "amdgcn-amd-amdhsa"
-// SPACK-SAME: "-mlink-builtin-bitcode" "[[DIR]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/hip.bc"
-// SPACK-SAME: "-idirafter" "[[DIR]]/hip-4.0.0-5f63slrursbrvfe2txrrjkynbsywsob5/include"
-
-// SPACK-SET: InstalledDir: [[DIR:.*]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/bin
-// SPACK-SET: Found HIP installation: [[DIR]]/hip-4.0.0-abcd, version 4.0.20214-a2917cd
-// SPACK-SET: "-triple" "amdgcn-amd-amdhsa"
-// SPACK-SET-SAME: "-mlink-builtin-bitcode" "[[DIR]]/llvm-amdgpu-4.0.0-ieagcs7inf7runpyfvepqkurasoglq4z/amdgcn/bitcode/hip.bc"
-// SPACK-SET-SAME: "-idirafter" "[[DIR]]/hip-4.0.0-abcd/include"
-
-// SPACK-MISS-SILENT-NOT: SPACK package hip-{{.*}} not found at
-// SPACK-MISS-SILENT-NOT: Found HIP installation

From cde2ea377d457e272ce1572d588643e5ee533c30 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 2 Apr 2025 15:00:41 +0700
Subject: [PATCH 0353/1029] llvm-reduce: Defer a shouldKeep call in operand reduction (#133387)

Ideally shouldKeep is only called in contexts that will successfully do
something.
---
 llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp
index b0bca015434fa..8b6446725b7d4 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceOperands.cpp
@@ -26,8 +26,8 @@ extractOperandsFromModule(Oracle &O, ReducerWorkItem &WorkItem,
   for (auto &I : instructions(&F)) {
     if (PHINode *Phi = dyn_cast<PHINode>(&I)) {
       for (auto &Op : Phi->incoming_values()) {
-        if (!O.shouldKeep()) {
-          if (Value *Reduced = ReduceValue(Op))
+        if (Value *Reduced = ReduceValue(Op)) {
+          if (!O.shouldKeep())
             Phi->setIncomingValueForBlock(Phi->getIncomingBlock(Op), Reduced);
         }
       }

From 41d718b1cf3db952a79c5598dba2e3379ee88efa Mon Sep 17 00:00:00 2001
From: Kareem Ergawy
Date: Wed, 2 Apr 2025 10:12:52 +0200
Subject: [PATCH 0354/1029] [flang][OpenMP] Upstream `do concurrent` loop-nest detection. (#127595)

Upstreams the next part of the do concurrent to OpenMP mapping pass (from
AMD's ROCm implementation). See
https://github.com/llvm/llvm-project/pull/126026 for more context.
This PR adds loop nest detection logic. This enables us to discover
multi-range do concurrent loops and then map them as "collapsed" loop nests
to OpenMP.

This is a follow-up for https://github.com/llvm/llvm-project/pull/126026,
only the latest commit is relevant.

This is a replacement for https://github.com/llvm/llvm-project/pull/127478
using a `/user//` branch.

PR stack:
- https://github.com/llvm/llvm-project/pull/126026
- https://github.com/llvm/llvm-project/pull/127595 (this PR)
- https://github.com/llvm/llvm-project/pull/127633
- https://github.com/llvm/llvm-project/pull/127634
- https://github.com/llvm/llvm-project/pull/127635
---
 flang/docs/DoConcurrentConversionToOpenMP.md |  85 +++++++++++
 .../OpenMP/DoConcurrentConversion.cpp        | 135 ++++++++++++++++++
 .../DoConcurrent/loop_nest_test.f90          |  89 ++++++++++++
 3 files changed, 309 insertions(+)
 create mode 100644 flang/test/Transforms/DoConcurrent/loop_nest_test.f90

diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md
index 62bc3172f8e3b..7b49af742f242 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -53,6 +53,79 @@ that:
 * It has been tested in a very limited way so far.
 * It has been tested mostly on simple synthetic inputs.
 
+### Loop nest detection
+
+On the `FIR` dialect level, the following loop:
+```fortran
+   do concurrent(i=1:n, j=1:m, k=1:o)
+     a(i,j,k) = i + j + k
+   end do
+```
+is modelled as a nest of `fir.do_loop` ops such that an outer loop's region
+contains **only** the following:
+  1. The operations needed to assign/update the outer loop's induction variable.
+  1. The inner loop itself.
+
+So the MLIR structure for the above example looks similar to the following:
+```
+  fir.do_loop %i_idx = %34 to %36 step %c1 unordered {
+    %i_idx_2 = fir.convert %i_idx : (index) -> i32
+    fir.store %i_idx_2 to %i_iv#1 : !fir.ref<i32>
+
+    fir.do_loop %j_idx = %37 to %39 step %c1_3 unordered {
+      %j_idx_2 = fir.convert %j_idx : (index) -> i32
+      fir.store %j_idx_2 to %j_iv#1 : !fir.ref<i32>
+
+      fir.do_loop %k_idx = %40 to %42 step %c1_5 unordered {
+        %k_idx_2 = fir.convert %k_idx : (index) -> i32
+        fir.store %k_idx_2 to %k_iv#1 : !fir.ref<i32>
+
+        ... loop nest body goes here ...
+      }
+    }
+  }
+```
+This applies to multi-range loops in general; they are represented in the IR as
+a nest of `fir.do_loop` ops with the above nesting structure.
+
+Therefore, the pass detects such "perfectly" nested loop ops to identify multi-range
+loops and map them as "collapsed" loops in OpenMP.
+
+#### Further info regarding loop nest detection
+
+Loop nest detection is currently limited to the scenario described in the previous
+section. However, this is quite limited and can be extended in the future to cover
+more cases. At the moment, for the following loop nest, even though both loops are
+perfectly nested, only the outer loop is parallelized:
+```fortran
+do concurrent(i=1:n)
+  do concurrent(j=1:m)
+    a(i,j) = i * j
+  end do
+end do
+```
+
+Similarly, for the following loop nest, even though the intervening statement `x = 41`
+does not have any memory effects that would affect parallelization, this nest is
+not parallelized either (only the outer loop is).
+
+```fortran
+do concurrent(i=1:n)
+  x = 41
+  do concurrent(j=1:m)
+    a(i,j) = i * j
+  end do
+end do
+```
+
+The above also has the consequence that the `j` variable will **not** be
+privatized in the OpenMP parallel/target region. In other words, it will be
+treated as if it was a `shared` variable.
+For more details about privatization, see the "Data environment" section below.
+
+See `flang/test/Transforms/DoConcurrent/loop_nest_test.f90` for more examples
+of what is and is not detected as a perfect loop nest.
+
+
+### Multi-range loops
+
+The pass currently supports multi-range loops as well. Given the following
+example:
+
+```fortran
+   do concurrent(i=1:n, j=1:m)
+     a(i,j) = i * j
+   end do
+```
+
+The generated `omp.loop_nest` operation looks like:
+
+```
+omp.loop_nest (%arg0, %arg1)
+    : index = (%17, %19) to (%18, %20)
+    inclusive step (%c1_2, %c1_4) {
+  fir.store %arg0 to %private_i#1 : !fir.ref<i32>
+  fir.store %arg1 to %private_j#1 : !fir.ref<i32>
+  ...
+  omp.yield
+}
+```
+
+It is worth noting that we have privatized versions for both iteration
+variables: `i` and `j`. These are locally allocated inside the parallel/target
+OpenMP region similar to what the single-range example in the previous section
+shows.
 " << I.second->getData()->getName()
-                      << "\n");
+                      << std::distance(GVtoClusterMap.member_begin(*C),
+                                       GVtoClusterMap.member_end())
+                      << ") ----> " << C->getData()->getName() << "\n");
 
-    for (ClusterMapType::member_iterator MI =
-             GVtoClusterMap.findLeader(*I.second);
+    for (ClusterMapType::member_iterator MI = GVtoClusterMap.findLeader(*C);
          MI != GVtoClusterMap.member_end(); ++MI) {
       if (!Visited.insert(*MI).second)
         continue;
diff --git a/llvm/test/CodeGen/WebAssembly/cfi.ll b/llvm/test/CodeGen/WebAssembly/cfi.ll
index b658cc053bcbe..f032aa50619d2 100644
--- a/llvm/test/CodeGen/WebAssembly/cfi.ll
+++ b/llvm/test/CodeGen/WebAssembly/cfi.ll
@@ -14,13 +14,13 @@ define void @h() !type !0 {
 }
 
 ; CHECK-LABEL: f:
-; CHECK: .indidx 1
+; CHECK: .indidx 2
 define void @f() !type !0 {
   ret void
 }
 
 ; CHECK-LABEL: g:
-; CHECK: .indidx 2
+; CHECK: .indidx 1
 define void @g() !type !1 {
   ret void
 }
diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/type-inference.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/type-inference.td
index ed4e0e411c7af..a965dc7ccdcad 100644
--- a/llvm/test/TableGen/GlobalISelCombinerEmitter/type-inference.td
+++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/type-inference.td
@@ -50,7 +50,7 @@ def infer_complex_tempreg: GICombineRule <
 // CHECK-NEXT: Groups for __infer_variadic_outs_match_1: [dst, x]
 // CHECK-NEXT: Groups for __infer_variadic_outs_apply_0: [tmp, y]
 // CHECK-NEXT: Groups for __infer_variadic_outs_apply_1:
-// CHECK-NEXT: Final Type Equivalence Classes: [tmp, dst, x, y] [vec]
+// CHECK-NEXT: Final Type Equivalence Classes: [vec] [tmp, dst, x, y]
 // CHECK-NEXT: INFER: MachineOperand $tmp -> GITypeOf<$dst>
 // CHECK-NEXT: Apply patterns for rule infer_variadic_outs after inference:
 // CHECK-NEXT: (CodeGenInstructionPattern name:__infer_variadic_outs_apply_0 G_FNEG operands:[GITypeOf<$dst>:$tmp, $y])
diff --git a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
index 92281e274adf0..d7ba3a6814194 100644
--- a/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
+++ b/llvm/test/Transforms/LowerTypeTests/function-disjoint.ll
@@ -5,20 +5,20 @@
 
 target datalayout = "e-p:64:64"
 
-; X64: @f = alias void (), ptr @[[JT0:.*]]
 ; X64: @g = alias void (), ptr @[[JT1:.*]]
+; X64: @f = alias void (), ptr @[[JT0:.*]]
 ; WASM32: private constant [0 x i8] zeroinitializer
 
 @0 = private unnamed_addr constant [2 x ptr] [ptr @f, ptr @g], align 16
 
 ; X64: define hidden void @f.cfi()
-; WASM32: define void @f() !type !{{[0-9]+}} !wasm.index ![[I0:[0-9]+]]
+; WASM32: define void @f() !type !{{[0-9]+}} !wasm.index
![[I1:[0-9]+]] define void @f() !type !0 { ret void } ; X64: define hidden void @g.cfi() -; WASM32: define void @g() !type !{{[0-9]+}} !wasm.index ![[I1:[0-9]+]] +; WASM32: define void @g() !type !{{[0-9]+}} !wasm.index ![[I0:[0-9]+]] define void @g() !type !1 { ret void } @@ -30,20 +30,20 @@ declare i1 @llvm.type.test(ptr %ptr, metadata %bitset) nounwind readnone define i1 @foo(ptr %p) { ; X64: icmp eq i64 {{.*}}, ptrtoint (ptr @[[JT0]] to i64) - ; WASM32: icmp eq i64 {{.*}}, ptrtoint (ptr getelementptr (i8, ptr null, i64 1) to i64) + ; WASM32: icmp eq i64 {{.*}}, ptrtoint (ptr getelementptr (i8, ptr null, i64 2) to i64) %x = call i1 @llvm.type.test(ptr %p, metadata !"typeid1") ; X64: icmp eq i64 {{.*}}, ptrtoint (ptr @[[JT1]] to i64) - ; WASM32: icmp eq i64 {{.*}}, ptrtoint (ptr getelementptr (i8, ptr null, i64 2) to i64) + ; WASM32: icmp eq i64 {{.*}}, ptrtoint (ptr getelementptr (i8, ptr null, i64 1) to i64) %y = call i1 @llvm.type.test(ptr %p, metadata !"typeid2") %z = add i1 %x, %y ret i1 %z } -; X64: define private void @[[JT0]]() #{{.*}} align 8 { -; X64: call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr @f.cfi) - ; X64: define private void @[[JT1]]() #{{.*}} align 8 { ; X64: call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr @g.cfi) -; WASM32: ![[I0]] = !{i64 1} +; X64: define private void @[[JT0]]() #{{.*}} align 8 { +; X64: call void asm sideeffect "jmp ${0:c}@plt\0Aint3\0Aint3\0Aint3\0A", "s"(ptr @f.cfi) + ; WASM32: ![[I1]] = !{i64 2} +; WASM32: ![[I0]] = !{i64 1} diff --git a/llvm/test/Transforms/LowerTypeTests/nonstring.ll b/llvm/test/Transforms/LowerTypeTests/nonstring.ll index ff8cc52d48344..ff7189aa0189c 100644 --- a/llvm/test/Transforms/LowerTypeTests/nonstring.ll +++ b/llvm/test/Transforms/LowerTypeTests/nonstring.ll @@ -4,8 +4,8 @@ target datalayout = "e-p:32:32" -; CHECK: @[[ANAME:.*]] = private constant { i32 } ; CHECK: @[[BNAME:.*]] = private constant { [2 x i32] } +; CHECK: @[[ANAME:.*]] = private constant { i32 } @a = constant i32 1, !type !0 @b = constant [2 x i32] [i32 2, i32 3], !type !1 diff --git a/llvm/test/tools/llvm-split/preserve-locals.ll b/llvm/test/tools/llvm-split/preserve-locals.ll index d128daaf35dd7..ff0610bd65499 100644 --- a/llvm/test/tools/llvm-split/preserve-locals.ll +++ b/llvm/test/tools/llvm-split/preserve-locals.ll @@ -2,14 +2,15 @@ ; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=CHECK0 %s ; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=CHECK1 %s -; The local_var and local_func must not be separated. -; CHECK0: @local_var -; CHECK0: define internal fastcc void @local_func ; The main and a must not be separated. ; The main and local_func must not be together. -; CHECK1: @a -; CHECK1: define i32 @main -; CHECK1: declare dso_local fastcc void @local_func +; CHECK0: @a +; CHECK0: define i32 @main +; CHECK0: declare dso_local fastcc void @local_func + +; The local_var and local_func must not be separated. +; CHECK1: @local_var +; CHECK1: define internal fastcc void @local_func @a = internal global i32 0, align 4 @global_storage = common global i32 0, align 4 diff --git a/llvm/test/tools/llvm-split/scc-const-alias.ll b/llvm/test/tools/llvm-split/scc-const-alias.ll index 20670af416c44..9e66f38f50843 100644 --- a/llvm/test/tools/llvm-split/scc-const-alias.ll +++ b/llvm/test/tools/llvm-split/scc-const-alias.ll @@ -5,12 +5,12 @@ ; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s ; Checks are not critical here - verifier will assert if we fail. 
-; CHECK0: @g1 = global i32 99
-; CHECK0: @c1Alias = external global i8
-; CHECK0: @g1Alias = internal alias i8, ptr @g1
+; CHECK0: @g1 = external global i32
+; CHECK0: @c1Alias = internal alias i8, inttoptr (i64 42 to ptr)
 
-; CHECK1: @g1 = external global i32
-; CHECK1: @c1Alias = internal alias i8, inttoptr (i64 42 to ptr)
+; CHECK1: @g1 = global i32 99
+; CHECK1: @c1Alias = external global i8
+; CHECK1: @g1Alias = internal alias i8, ptr @g1
 
 ; Third file is actually empty.
 ; CHECK2: @g1 = external global i32
diff --git a/llvm/test/tools/llvm-split/scc-global-alias.ll b/llvm/test/tools/llvm-split/scc-global-alias.ll
index ee3b6a1c1ce1a..b3b52ccd535a0 100644
--- a/llvm/test/tools/llvm-split/scc-global-alias.ll
+++ b/llvm/test/tools/llvm-split/scc-global-alias.ll
@@ -5,16 +5,16 @@
 ; RUN: llvm-dis -o - %t2 | FileCheck --check-prefix=CHECK2 %s
 ; Checks are not critical here - verifier will assert if we fail.
 
-; CHECK0: @funInternal2Alias = alias
-; CHECK0: @funExternal2Alias = alias
-; CHECK0: define internal i32 @funInternal2
-; CHECK0: define i32 @funExternal2
+; CHECK0: @funInternalAlias = alias
+; CHECK0: define internal i32 @funInternal
 
-; CHECK1: @funInternalAlias = alias
-; CHECK1: define internal i32 @funInternal
+; CHECK1: @funExternalAlias = alias
+; CHECK1: define i32 @funExternal
 
-; CHECK2: @funExternalAlias = alias
-; CHECK2: define i32 @funExternal
+; CHECK2: @funInternal2Alias = alias
+; CHECK2: @funExternal2Alias = alias
+; CHECK2: define internal i32 @funInternal2
+; CHECK2: define i32 @funExternal2
 
 @funInternalAlias = alias i32 (), ptr @funInternal
 @funExternalAlias = alias i32 (), ptr @funExternal
diff --git a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
index feaa3602f88cb..3259f8caeb773 100644
--- a/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
+++ b/llvm/utils/TableGen/GlobalISelCombinerEmitter.cpp
@@ -583,7 +583,8 @@ CombineRuleOperandTypeChecker::getRuleEqClasses() const {
     errs() << "Final Type Equivalence Classes: ";
     for (const auto &Class : TECs) {
       // only print non-empty classes.
-      if (auto MembIt = TECs.member_begin(Class); MembIt != TECs.member_end()) {
+      if (auto MembIt = TECs.member_begin(*Class);
+          MembIt != TECs.member_end()) {
         errs() << '[';
         StringRef Sep = "";
         for (; MembIt != TECs.member_end(); ++MembIt) {

From c87dc2b7d4ac0131cb97f096be522a50a4b3068b Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Wed, 2 Apr 2025 12:41:47 -0700
Subject: [PATCH 0431/1029] [lldb-dap] Speed up TestDAP_Progress (#134048)

While trying to make progress on #133782, I noticed that TestDAP_Progress
was taking 90 seconds to complete. This patch brings that down to 10
seconds by making the following changes:

1. Don't call `wait_for_event` with a 15 second timeout. By the time we
   call this, all progress events have been emitted, which means that
   we're just sitting there until we hit the timeout.

2. Don't use 10 steps (= 10 seconds) for indeterminate progress. We have
   two indeterminate progress tests so that's 6 seconds instead of 20.

3. Don't launch the process over and over. Once we have a DAP session, we
   can clear the progress vector and emit new progress events.
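Roughly, the consolidated test has the following shape (an illustrative
sketch only, using the helpers visible in the diff below):

```python
# One launched DAP session is shared by all scenarios; verify_progress_events()
# clears dap_server.progress_events when it finishes, so every scenario starts
# from an empty event list instead of relaunching the process.
def test(self):
    self.build_and_launch(self.getBuildArtifact("a.out"))
    for args in ("--total 3 --seconds 1", "--seconds 1 --no-details"):
        self.dap_server.request_evaluate(f"`test-progress {args}", context="repl")
        self.verify_progress_events(expected_title="Progress tester: Initial Detail")
```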
--- .../lldb-dap/progress/Progress_emitter.py | 6 +-- .../lldb-dap/progress/TestDAP_Progress.py | 53 +++---------------- 2 files changed, 10 insertions(+), 49 deletions(-) diff --git a/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py b/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py index 445d1bdf4e496..0bf785e3201b0 100644 --- a/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py +++ b/lldb/test/API/tools/lldb-dap/progress/Progress_emitter.py @@ -88,11 +88,11 @@ def __call__(self, debugger, command, exe_ctx, result): progress = lldb.SBProgress( "Progress tester", "Initial Detail", total, debugger ) - # Check to see if total is set to None to indicate an indeterminate progress - # then default to 10 steps. + # Check to see if total is set to None to indicate an indeterminate + # progress then default to 3 steps. with progress: if total is None: - total = 10 + total = 3 for i in range(1, total): if cmd_options.no_details: diff --git a/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py b/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py index f723a2d254825..ffe3d38eb49a3 100755 --- a/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py +++ b/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py @@ -19,7 +19,6 @@ def verify_progress_events( expected_not_in_message=None, only_verify_first_update=False, ): - self.dap_server.wait_for_event("progressEnd", 15) self.assertTrue(len(self.dap_server.progress_events) > 0) start_found = False update_found = False @@ -45,20 +44,18 @@ def verify_progress_events( self.assertTrue(start_found) self.assertTrue(update_found) self.assertTrue(end_found) + self.dap_server.progress_events.clear() @skipIfWindows - def test_output(self): + def test(self): program = self.getBuildArtifact("a.out") self.build_and_launch(program) progress_emitter = os.path.join(os.getcwd(), "Progress_emitter.py") - source = "main.cpp" - breakpoint_ids = self.set_source_breakpoints( - source, [line_number(source, "// break here")] - ) - self.continue_to_breakpoints(breakpoint_ids) self.dap_server.request_evaluate( f"`command script import {progress_emitter}", context="repl" ) + + # Test details. self.dap_server.request_evaluate( "`test-progress --total 3 --seconds 1", context="repl" ) @@ -68,19 +65,7 @@ def test_output(self): expected_not_in_message="Progress tester", ) - @skipIfWindows - def test_output_nodetails(self): - program = self.getBuildArtifact("a.out") - self.build_and_launch(program) - progress_emitter = os.path.join(os.getcwd(), "Progress_emitter.py") - source = "main.cpp" - breakpoint_ids = self.set_source_breakpoints( - source, [line_number(source, "// break here")] - ) - self.continue_to_breakpoints(breakpoint_ids) - self.dap_server.request_evaluate( - f"`command script import {progress_emitter}", context="repl" - ) + # Test no details. self.dap_server.request_evaluate( "`test-progress --total 3 --seconds 1 --no-details", context="repl" ) @@ -90,19 +75,7 @@ def test_output_nodetails(self): expected_message="Initial Detail", ) - @skipIfWindows - def test_output_indeterminate(self): - program = self.getBuildArtifact("a.out") - self.build_and_launch(program) - progress_emitter = os.path.join(os.getcwd(), "Progress_emitter.py") - source = "main.cpp" - breakpoint_ids = self.set_source_breakpoints( - source, [line_number(source, "// break here")] - ) - self.continue_to_breakpoints(breakpoint_ids) - self.dap_server.request_evaluate( - f"`command script import {progress_emitter}", context="repl" - ) + # Test details indeterminate. 
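+        # (Indeterminate = no --total passed; Progress_emitter.py then
+        # defaults to 3 steps, as changed above.)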
         self.dap_server.request_evaluate("`test-progress --seconds 1", context="repl")
 
         self.verify_progress_events(
@@ -111,19 +84,7 @@ def test_output_indeterminate(self):
             only_verify_first_update=True,
         )
 
-    @skipIfWindows
-    def test_output_nodetails_indeterminate(self):
-        program = self.getBuildArtifact("a.out")
-        self.build_and_launch(program)
-        progress_emitter = os.path.join(os.getcwd(), "Progress_emitter.py")
-        source = "main.cpp"
-        breakpoint_ids = self.set_source_breakpoints(
-            source, [line_number(source, "// break here")]
-        )
-        self.dap_server.request_evaluate(
-            f"`command script import {progress_emitter}", context="repl"
-        )
-
+        # Test no details indeterminate.
         self.dap_server.request_evaluate(
             "`test-progress --seconds 1 --no-details", context="repl"
         )

From 6f1347d57bdaed75b73b2013a96a4a69c8969ebe Mon Sep 17 00:00:00 2001
From: ofri frishman
Date: Wed, 2 Apr 2025 23:06:43 +0300
Subject: [PATCH 0432/1029] [MLIR] Bubble up tensor.extract_slice through
 tensor.collapse_shape (#131982)

Add a pattern that bubbles up tensor.extract_slice through
tensor.collapse_shape. The pattern is registered in a pattern population
function that is used by the transform op
transform.apply_patterns.tensor.bubble_up_extract_slice and by the
transform op transform.structured.fuse as a cleanup pattern.

This pattern enables tiling and fusing op chains which contain
tensor.collapse_shape if added as a cleanup pattern of the tile-and-fuse
utility. Without this pattern that would not be possible, as
tensor.collapse_shape does not implement the tiling interface. This is an
additional pattern to the one added in PR #126898.
---
 .../Tensor/Transforms/ReshapePatterns.cpp     | 254 +++++++++++++++++-
 .../Dialect/Linalg/transform-op-fuse.mlir     |  49 ++++
 .../Tensor/bubble-up-extract-slice-op.mlir    | 174 ++++++++++++
 3 files changed, 476 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
index acedf51d0e240..eed44e60d6591 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/ReshapePatterns.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/LogicalResult.h"
 
@@ -428,6 +429,256 @@ struct BubbleUpExpandShapeThroughExtractSlice
   }
 };
 
+/// Converts `tensor.extract_slice(tensor.collapse_shape)` to
+/// `tensor.collapse_shape(tensor.extract_slice)`.
+///
+/// For this transformation to be possible, after bubbling up the extraction
+/// of the contiguous slice must be representable as a single slice obtained
+/// via tensor.extract_slice within each reassociation group of the src.
+///
+/// In case the size and offset extracted are static, then this is possible if
+/// the following conditions are met within each reassociation group:
+/// Let T be a tensor of shape [A0, A1, ..., An] (these are the sizes of the
+/// dimensions in the reassociation group), and let S = [S0, S1, ..., Sn] be the
+/// shape of a desired slice.
+/// A slice of shape S can be extracted as a contiguous span of elements if and
+/// only if there exists an index k in {0, 1, ..., n} such that:
+///       S_i = 1 for all i < k (that is, all leading dimensions are singleton),
+///       1 <= S_k <= A_k (that is, non-trivial slicing occurs along exactly
+///       one dimension),
+///       S_i = A_i for all i > k (that is, all trailing dimensions are
+///       preserved in full).
+/// In other words, the slice shape S must be of the form:
+///       [ 1, 1, ..., 1, S_k, A_(k+1), A_(k+2), ..., A_n ]
+///
+/// In case the size and/or offset extracted are dynamic, then this is possible
+/// only if there is a single dimension in the reassociation group that has a
+/// size not equal to 1.
+/// In other words, the tensor shape must be of the form:
+///       [ 1, 1, ..., 1, A, 1, ..., 1 ]
+/// Note - it might be possible to enable this pattern for more cases when the
+/// size/offset are dynamic via performing an analysis of the possible values
+/// that could be given to the size/offset.
+///
+/// Example:
+/// The transformation is possible because each reassociation group can be
+/// represented as a contiguous slice (i.e., [8x16->2x16], [1x7->1x?],
+/// [20->10]).
+/// ```
+/// BEFORE:
+/// %collapse = tensor.collapse_shape %src [[0, 1], [2, 3], [4]] ...
+///     tensor<8x16x1x7x20xf32> to tensor<128x7x20xf32>
+/// %slice = tensor.extract_slice %collapse [0, 0, 0][32, %size, 10][1, 1, 1]
+///     tensor<128x7x20xf32> to tensor<32x?x10xf32>
+///
+/// AFTER:
+/// %slice = tensor.extract_slice %src [0, 0, 0, 0, 0][2, 16, 1, %size, 10]
+///     [1, 1, 1, 1, 1] : tensor<8x16x1x7x20xf32> to tensor<2x16x1x?x10xf32>
+/// %collapse = tensor.collapse_shape %slice [[0, 1], [2, 3], [4]] ...
+///     tensor<2x16x1x?x10xf32> to tensor<32x?x10xf32>
+/// ```
+///
+/// Negative example:
+/// The transformation is not possible because we cannot use a single slice to
+/// represent the reassociation group [2x3x10->???]:
+/// ```
+/// %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<2x3x10xf32> into
+///     tensor<60xf32>
+/// %extract = tensor.extract_slice %collapse[0][15][1] :
+///     tensor<60xf32> to tensor<15xf32>
+/// ```
+/// If we wanted the collapse to come after the extraction, a possible
+/// alternate transformation could be to extract multiple slices and concat
+/// them together:
+/// ```
+/// %extract_1 = tensor.extract_slice %src[0, 0, 0][1, 1, 10] :
+///     tensor<2x3x10xf32> to tensor<1x1x10xf32>
+/// %extract_2 = tensor.extract_slice %src[0, 1, 0][1, 1, 5] :
+///     tensor<2x3x10xf32> to tensor<1x1x5xf32>
+/// %concat = tosa.concat %extract_1, %extract_2 {axis = 0 : i32} :
+///     (<1x1x10xf32>, <1x1x5xf32>) -> <1x1x15xf32>
+/// %collapse = tensor.collapse_shape %concat [[0, 1, 2]] : tensor<1x1x15xf32>
+///     to tensor<15xf32>
+/// ```
+/// But this is not the intended purpose of the transformation.
+struct BubbleUpCollapseShapeThroughExtractSlice
+    : public OpRewritePattern<tensor::ExtractSliceOp> {
+  using OpRewritePattern<tensor::ExtractSliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
+                                PatternRewriter &rewriter) const override {
+    auto collapseShapeOp =
+        sliceOp.getSource().getDefiningOp<tensor::CollapseShapeOp>();
+    if (!collapseShapeOp) {
+      return rewriter.notifyMatchFailure(
+          sliceOp,
+          "tensor.extract_slice source not produced by tensor.collapse_shape");
+    }
+
+    if (!sliceOp.hasUnitStride()) {
+      return rewriter.notifyMatchFailure(
+          sliceOp, "unsupported: non-unit stride. Only contiguous slices can "
+                   "be supported in this transformation.");
+    }
+
+    // The tensor.extract_slice before applying the pattern works on the result
+    // of the tensor.collapse_shape, so variables (i.e. inputs for
+    // ExtractSliceOp) referring to the state before applying the pattern are
+    // named with the prefix "collapsed", and ones referring to the state after
+    // applying the pattern are named with the prefix "expanded".
+    SmallVector<OpFoldResult> collapsedOffsets = sliceOp.getMixedOffsets();
+    SmallVector<OpFoldResult> collapsedSizes = sliceOp.getMixedSizes();
+
+    if (static_cast<size_t>(sliceOp.getResultType().getRank()) !=
+        collapsedSizes.size()) {
+      return rewriter.notifyMatchFailure(sliceOp,
+                                         "unimplemented: rank reducing slice");
+    }
+
+    ArrayRef<int64_t> srcShape = collapseShapeOp.getSrcType().getShape();
+    SmallVector<ReassociationIndices> reassociationIndices =
+        collapseShapeOp.getReassociationIndices();
+
+    // Compute new offsets, sizes, and strides for tensor.extract_slice.
+    // The new tensor.extract_slice will work on a tensor that has a rank
+    // equal to the rank of the src of the collapse_shape. In each iteration of
+    // the loop, the offsets and sizes will be computed per reassociation group.
+    SmallVector<OpFoldResult> expandedOffsets, expandedSizes;
+    SmallVector<OpFoldResult> expandedStrides(srcShape.size(),
+                                              rewriter.getIndexAttr(1));
+
+    for (auto [collapsedSize, collapsedOffset, reassocIndices] :
+         llvm::zip_equal(collapsedSizes, collapsedOffsets,
+                         collapseShapeOp.getReassociationIndices())) {
+      // CASE #1 - size and/or offset are dynamic.
+      // In this case, the slice can be represented as a contiguous slice only
+      // if there is a single dimension in the reassociation group that has a
+      // size not equal to 1.
+      if (isa<Value>(collapsedSize) || isa<Value>(collapsedOffset)) {
+        int nonUnitSizeCount = 0;
+        for (int64_t expandedShapeIdx : reassocIndices) {
+          if (srcShape[expandedShapeIdx] != 1) {
+            nonUnitSizeCount++;
+            expandedSizes.push_back(collapsedSize);
+            expandedOffsets.push_back(collapsedOffset);
+            continue;
+          }
+
+          expandedSizes.push_back(rewriter.getIndexAttr(1));
+          expandedOffsets.push_back(rewriter.getIndexAttr(0));
+        }
+
+        if (nonUnitSizeCount != 1) {
+          return rewriter.notifyMatchFailure(
+              sliceOp,
+              "unsupported: slice cannot be verified to be contiguous");
+        }
+        continue;
+      }
+
+      // CASE #2 - size and offset are static.
+      // Verify that the slice can be represented as a contiguous slice of the
+      // src of the collapse_shape.
+      // Checking this is done in order of most internal dimensions first,
+      // so traversal is done in reverse order of the reassociation group.
+      // If the expected slice shape is [1, 1, ..., 1, S_k, A_(k+1), A_(k+2),
+      // ..., A_n], then we first find the size and offset for n...k+1, then
+      // for k, and then for k-1...0.
+
+      // currentCollapsedsize and currentCollapsedOffset are initialized with
+      // the original collapsed size and offset and divided by the expanded
+      // shape size in each dimension as we go along the reassociation group.
+      // In essence we are spreading the original collapsed size and offset over
+      // the various expanded slice dimensions.
+      // The variables are used both to check the validity of the slice and to
+      // compute the expanded sizes and offsets.
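+      // For example, collapsing [8x16] -> [128] and extracting size 32 at
+      // offset 0 (the [8x16->2x16] group in the example above): the inner
+      // dim (16) is taken in full, leaving 32/16 = 2 for the outer dim, i.e.
+      // expanded sizes [2, 16] and offsets [0, 0].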
+      int64_t currentCollapsedsize = getConstantIntValue(collapsedSize).value();
+      int64_t currentCollapsedOffset =
+          getConstantIntValue(collapsedOffset).value();
+
+      SmallVector<OpFoldResult> groupExpandedSizes, groupExpandedOffsets;
+
+      ReassociationIndices reversedReassocIndices(reassocIndices.rbegin(),
+                                                  reassocIndices.rend());
+      int64_t idx = 0;
+      int64_t reassocGroupSize = reassocIndices.size();
+
+      // First handle the trailing dimensions where the slice size should be
+      // equal to the tensor shape and the offset should be 0 (n...k+1).
+      for (; idx < reassocGroupSize; ++idx) {
+        int64_t expandedShapeSize = srcShape[reversedReassocIndices[idx]];
+
+        if (currentCollapsedsize < expandedShapeSize)
+          break;
+
+        // We need to make sure that the slice size can be set to the shape
+        // size and the offset to 0.
+        if ((currentCollapsedsize % expandedShapeSize) != 0 ||
+            (currentCollapsedOffset % expandedShapeSize) != 0) {
+          return rewriter.notifyMatchFailure(
+              sliceOp, "unsupported: cannot be extracted as a contiguous slice "
+                       "of the src of the collapse_shape");
+        }
+
+        groupExpandedSizes.push_back(rewriter.getIndexAttr(expandedShapeSize));
+        groupExpandedOffsets.push_back(rewriter.getIndexAttr(0));
+
+        currentCollapsedsize /= expandedShapeSize;
+        currentCollapsedOffset /= expandedShapeSize;
+      }
+
+      // Now handle the first dim where slicing occurs on (k).
+      if (idx < reassocGroupSize) {
+        int64_t expandedShapeSize = srcShape[reversedReassocIndices[idx]];
+        int64_t offsetInDim = currentCollapsedOffset % expandedShapeSize;
+        // We need to make sure that the slice size in this dim + offset will
+        // not exceed the shape size.
+        if ((currentCollapsedsize + offsetInDim) >= expandedShapeSize) {
+          return rewriter.notifyMatchFailure(
+              sliceOp, "unsupported: slice cannot be extracted as a contiguous "
+                       "slice of the src of the collapse_shape");
+        }
+
+        groupExpandedSizes.push_back(
+            rewriter.getIndexAttr(currentCollapsedsize));
+        groupExpandedOffsets.push_back(rewriter.getIndexAttr(offsetInDim));
+
+        currentCollapsedOffset /= expandedShapeSize;
+      }
+
+      // Now handle the leading dimensions where the slice size is equal to 1
+      // (k-1...0).
+      // The size for these dimensions must be 1 because of how we constructed
+      // the slice size of the expanded shape. We spread the original collapsed
+      // size over the expanded shape sizes until we reach dimension k, where
+      // the remaining size is smaller than the expanded shape size, and spread
+      // the remaining size on it. So, now we are left with only 1s.
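+      // E.g. for collapsing [2x3x10] -> [60] and extracting size 10 at
+      // offset 30: the innermost dim is taken in full (size 10, offset 0),
+      // dim k gets size 1 / offset 0, and the loop below assigns the leading
+      // dim size 1 / offset 1, i.e. expanded sizes [1, 1, 10] and offsets
+      // [1, 0, 0].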
+      for (idx++; idx < reassocGroupSize; ++idx) {
+        int64_t expandedShapeSize = srcShape[reversedReassocIndices[idx]];
+        int64_t offsetInDim = currentCollapsedOffset % expandedShapeSize;
+        groupExpandedSizes.push_back(rewriter.getIndexAttr(1));
+        groupExpandedOffsets.push_back(rewriter.getIndexAttr(offsetInDim));
+        currentCollapsedOffset /= expandedShapeSize;
+      }
+
+      expandedSizes.append(groupExpandedSizes.rbegin(),
+                           groupExpandedSizes.rend());
+      expandedOffsets.append(groupExpandedOffsets.rbegin(),
+                             groupExpandedOffsets.rend());
+    }
+
+    Value newSliceOp = rewriter.create<tensor::ExtractSliceOp>(
+        collapseShapeOp->getLoc(), collapseShapeOp.getSrc(), expandedOffsets,
+        expandedSizes, expandedStrides);
+    rewriter.replaceOpWithNewOp<tensor::CollapseShapeOp>(
+        sliceOp, sliceOp.getResultType(), newSliceOp,
+        collapseShapeOp.getReassociationIndices());
+
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::tensor::populateReassociativeReshapeFoldingPatterns(
@@ -448,5 +699,6 @@ void mlir::tensor::populateBubbleUpExpandShapePatterns(
 
 void mlir::tensor::populateBubbleUpExtractSliceOpPatterns(
     RewritePatternSet &patterns) {
-  patterns.add<BubbleUpExpandShapeThroughExtractSlice>(patterns.getContext());
+  patterns.add<BubbleUpExpandShapeThroughExtractSlice,
+               BubbleUpCollapseShapeThroughExtractSlice>(patterns.getContext());
 }
diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
index 9bcc125ce1ba9..962858076db93 100644
--- a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
@@ -438,3 +438,52 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape(
+// CHECK: scf.for %[[X:[A-Za-z0-9]+]] = {{.*}} -> (tensor<8x1800x32xf32>) {
+// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice
+// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]]
+// CHECK: %[[EXP1:.*]] = linalg.exp ins(%[[COLLAPSE]]
+func.func @bubble_up_extract_slice_through_collapse_shape(%0: tensor<1x8x1800x32xf32>) -> tensor<8x1800x32xf32> {
+  %expand = tensor.collapse_shape %0 [[0, 1], [2], [3]] : tensor<1x8x1800x32xf32> into tensor<8x1800x32xf32>
+  %empty = tensor.empty() : tensor<8x1800x32xf32>
+  %exp = linalg.exp ins(%expand : tensor<8x1800x32xf32>) outs(%empty : tensor<8x1800x32xf32>) -> tensor<8x1800x32xf32>
+  return %exp : tensor<8x1800x32xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.exp"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %transformed, %loops:1 = transform.structured.fuse %0 [1, 0, 0] interchange [0, 1, 2] apply_cleanup = true :
+      (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">)
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_with_collapse_producer(
+// CHECK: scf.for %[[X:[A-Za-z0-9]+]] = {{.*}}
+// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice
+// CHECK: %[[ABS:.*]] = linalg.abs ins(%[[EXTRACT]]
+// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[ABS]]
+// CHECK: %[[EXP:.*]] = linalg.exp ins(%[[COLLAPSE]]
+func.func @bubble_up_extract_slice_through_collapse_shape_with_collapse_producer(%0: tensor<1x8x1800x32xf32>) -> tensor<8x1800x32xf32> {
+  %empty1 = tensor.empty() : tensor<1x8x1800x32xf32>
+  %abs = linalg.abs ins(%0 : tensor<1x8x1800x32xf32>) outs(%empty1 : tensor<1x8x1800x32xf32>) -> tensor<1x8x1800x32xf32>
+  %expand = tensor.collapse_shape %abs [[0, 1], [2], [3]] : tensor<1x8x1800x32xf32> into
tensor<8x1800x32xf32> + %empty2 = tensor.empty() : tensor<8x1800x32xf32> + %exp = linalg.exp ins(%expand : tensor<8x1800x32xf32>) outs(%empty2 : tensor<8x1800x32xf32>) -> tensor<8x1800x32xf32> + return %exp : tensor<8x1800x32xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.exp"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %transformed, %loops:1 = transform.structured.fuse %0 [1, 0, 0] interchange [0, 1, 2] apply_cleanup = true : + (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">) + transform.yield + } +} diff --git a/mlir/test/Dialect/Tensor/bubble-up-extract-slice-op.mlir b/mlir/test/Dialect/Tensor/bubble-up-extract-slice-op.mlir index 3900bc56f433d..34128d6a5ec8b 100644 --- a/mlir/test/Dialect/Tensor/bubble-up-extract-slice-op.mlir +++ b/mlir/test/Dialect/Tensor/bubble-up-extract-slice-op.mlir @@ -1,5 +1,15 @@ // RUN: mlir-opt -split-input-file -transform-interpreter %s | FileCheck %s +///---------------------------------------------------------------------------------------- +/// [Pattern: BubbleUpExpandShapeThroughExtractSlice] +/// +/// IN: tensor.expand_shape(tensor.extract_slice) +/// OUT:tensor.extract_slice(tensor.expand_shape) +/// +/// Note: tensor.extract_slice is bubbled up to be before tensor.expand_shape. +/// Some tests are negative tests for cases where the pattern cannot be applied. +///---------------------------------------------------------------------------------------- + // CHECK-LABEL: func.func @bubble_up_extract_slice_through_expand_shape( // CHECK-SAME: %[[SRC:.*]]: tensor<60xf32>) -> tensor<1x1x5xf32> { // CHECK: %[[C1:.+]] = arith.constant 5 : index @@ -113,6 +123,170 @@ func.func @bubble_up_extract_slice_affine_apply_not_folded(%src: tensor<60xf32>, return %extract : tensor } +///---------------------------------------------------------------------------------------- +/// [Pattern: BubbleUpCollapseShapeThroughExtractSlice] +/// +/// IN: tensor.collapse_shape(tensor.extract_slice) +/// OUT:tensor.extract_slice(tensor.collapse_shape) +/// +/// Note: tensor.extract_slice is bubbled up to be before tensor.collapse_shape. +/// Some tests are negative tests for cases where the pattern cannot be applied. 
+///---------------------------------------------------------------------------------------- + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_single_reassoc_group( +// CHECK-SAME: %[[SRC:.*]]: tensor<6x5x2xf32>) -> tensor<1xf32> { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]][0, 0, 0] [1, 1, 1] [1, 1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1, 2]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_single_reassoc_group(%src: tensor<6x5x2xf32>) -> tensor<1xf32> { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<6x5x2xf32> into tensor<60xf32> + %extract = tensor.extract_slice %collapse[0][1][1] : tensor<60xf32> to tensor<1xf32> + return %extract : tensor<1xf32> +} + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_multiple_reassoc_group( +// CHECK-SAME: %[[SRC:.*]]: tensor<6x5x3x10xf32>) -> tensor<15x10xf32> { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]][1, 0, 1, 0] [3, 5, 1, 10] [1, 1, 1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1], [2, 3]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_multiple_reassoc_group(%src: tensor<6x5x3x10xf32>) -> tensor<15x10xf32> { + %collapse = tensor.collapse_shape %src [[0, 1], [2, 3]] : tensor<6x5x3x10xf32> into tensor<30x30xf32> + %extract = tensor.extract_slice %collapse[5, 10][15, 10][1, 1] : tensor<30x30xf32> to tensor<15x10xf32> + return %extract : tensor<15x10xf32> +} + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_offset_on_leading_dim( +// CHECK-SAME: %[[SRC:.*]]: tensor<6x5x2xf32>) -> tensor<4xf32> { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]][2, 0, 0] [1, 2, 2] [1, 1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1, 2]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_offset_on_leading_dim(%src: tensor<6x5x2xf32>) -> tensor<4xf32> { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<6x5x2xf32> into tensor<60xf32> + %extract = tensor.extract_slice %collapse[20][4][1] : tensor<60xf32> to tensor<4xf32> + return %extract : tensor<4xf32> +} + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_size( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x5x1xf32>, +// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]][0, 0, 0] [1, %[[SIZE]], 1] [1, 1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1, 2]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_size(%src: tensor<1x5x1xf32>, %size : index) -> tensor { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<1x5x1xf32> into tensor<5xf32> + %extract = tensor.extract_slice %collapse[0][%size][1] : tensor<5xf32> to tensor + return %extract : tensor +} + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_size_and_src( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x?x1xf32>, +// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]][0, 0, 0] [1, %[[SIZE]], 1] [1, 1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1, 2]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_size_and_src(%src: tensor<1x?x1xf32>, %size : index) -> 
tensor { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<1x?x1xf32> into tensor + %extract = tensor.extract_slice %collapse[0][%size][1] : tensor to tensor + return %extract : tensor +} + + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_offset( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x5x1xf32>, +// CHECK-SAME: %[[OFFSET:.*]]: index) -> tensor<3xf32> { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]][0, %[[OFFSET]], 0] [1, 3, 1] [1, 1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1, 2]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_offset(%src: tensor<1x5x1xf32>, %offset : index) -> tensor<3xf32> { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<1x5x1xf32> into tensor<5xf32> + %extract = tensor.extract_slice %collapse[%offset][3][1] : tensor<5xf32> to tensor<3xf32> + return %extract : tensor<3xf32> +} + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_offset_and_size( +// CHECK-SAME: %[[SRC:.*]]: tensor<14x1xf32>, +// CHECK-SAME: %[[OFFSET:.*]]: index, +// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]]{{\[}}%[[OFFSET]], 0] {{\[}}%[[SIZE]], 1] [1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_offset_and_size(%src: tensor<14x1xf32>, %offset : index, %size : index) -> tensor { + %collapse = tensor.collapse_shape %src [[0, 1]] : tensor<14x1xf32> into tensor<14xf32> + %extract = tensor.extract_slice %collapse[%offset][%size][1] : tensor<14xf32> to tensor + return %extract : tensor +} + +// CHECK-LABEL: func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_and_static_groups( +// CHECK-SAME: %[[SRC:.*]]: tensor<5x10x1x1x40xf32>, +// CHECK-SAME: %[[OFFSET:.*]]: index, +// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor<20x?xf32> { +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice %[[SRC]][1, 0, 0, 0, %[[OFFSET]]] [2, 10, 1, 1, %[[SIZE]]] [1, 1, 1, 1, 1] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[EXTRACT]] {{\[\[}}0, 1], [2, 3, 4]] +// CHECK: return %[[COLLAPSE]] +func.func @bubble_up_extract_slice_through_collapse_shape_dynamic_and_static_groups(%src: tensor<5x10x1x1x40xf32>, %offset : index, %size : index) -> tensor<20x?xf32> { + %collapse = tensor.collapse_shape %src [[0, 1], [2, 3, 4]] : tensor<5x10x1x1x40xf32> into tensor<50x40xf32> + %extract = tensor.extract_slice %collapse[10, %offset][20, %size][1, 1] : tensor<50x40xf32> to tensor<20x?xf32> + return %extract : tensor<20x?xf32> +} + +/// The 2 following tests are cases where the bubble up cannot occur because the contiguous size extracted +/// from the collapsed shape cannot be expressed via a single extract_slice op. +/// In the first test it is because the size extracted cannot be expressed as a slice +/// of the form [ 1, 1, ..., 1, Sk, Ak + 1, Ak + 2, ...,An ] (see the pattern documentation for more details). +/// In the second test, the size can be expressed as the required form, but the offset is such that the pattern +/// cannot be applied. 
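+/// Concretely, for the tests below: 15 is not a multiple of the innermost
+/// dimension size 10, so the trailing dimension cannot be taken in full; and
+/// in the second test, after consuming the innermost dimension the remaining
+/// size 2 starts at offset 2 within the middle dimension of extent 3, so the
+/// slice would run past that dimension.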
+ +// CHECK-LABEL: func.func @no_bubble_up_extract_slice_through_collapse_shape_on_non_contiguous_1( +// CHECK-SAME: %[[SRC:.*]]: tensor<2x3x10xf32>) -> tensor<15xf32> { +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice +func.func @no_bubble_up_extract_slice_through_collapse_shape_on_non_contiguous_1(%src: tensor<2x3x10xf32>) -> tensor<15xf32> { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<2x3x10xf32> into tensor<60xf32> + %extract = tensor.extract_slice %collapse[0][15][1] : tensor<60xf32> to tensor<15xf32> + return %extract : tensor<15xf32> +} + +// CHECK-LABEL: func.func @no_bubble_up_extract_slice_through_collapse_shape_on_non_contiguous_2( +// CHECK-SAME: %[[SRC:.*]]: tensor<2x3x10xf32>) -> tensor<20xf32> { +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice +func.func @no_bubble_up_extract_slice_through_collapse_shape_on_non_contiguous_2(%src: tensor<2x3x10xf32>) -> tensor<20xf32> { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<2x3x10xf32> into tensor<60xf32> + %extract = tensor.extract_slice %collapse[20][20][1] : tensor<60xf32> to tensor<20xf32> + return %extract : tensor<20xf32> +} + +// CHECK-LABEL: func.func @no_bubble_up_extract_slice_through_collapse_shape_on_stride( +// CHECK-SAME: %[[SRC:.*]]: tensor<2x3x10xf32>) -> tensor<5xf32> { +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice +func.func @no_bubble_up_extract_slice_through_collapse_shape_on_stride(%src: tensor<2x3x10xf32>) -> tensor<5xf32> { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<2x3x10xf32> into tensor<60xf32> + %extract = tensor.extract_slice %collapse[0][5][2] : tensor<60xf32> to tensor<5xf32> + return %extract : tensor<5xf32> +} + +// CHECK-LABEL: func.func @no_bubble_up_extract_slice_through_collapse_shape_on_rank_reducing( +// CHECK-SAME: %[[SRC:.*]]: tensor<6x5x2x1xf32>) -> tensor<1xf32> { +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice +func.func @no_bubble_up_extract_slice_through_collapse_shape_on_rank_reducing(%src: tensor<6x5x2x1xf32>) -> tensor<1xf32> { + %collapse = tensor.collapse_shape %src [[0, 1, 2], [3]] : tensor<6x5x2x1xf32> into tensor<60x1xf32> + %extract = tensor.extract_slice %collapse[0, 0][1, 1][1, 1] : tensor<60x1xf32> to tensor<1xf32> + return %extract : tensor<1xf32> +} + +// CHECK-LABEL: func.func @no_bubble_up_extract_slice_through_collapse_shape_on_unsupported_dynamic( +// CHECK-SAME: %[[SRC:.*]]: tensor<1x5x2xf32>, +// CHECK-SAME: %[[SIZE:.*]]: index) -> tensor { +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape +// CHECK: %[[EXTRACT:.*]] = tensor.extract_slice +func.func @no_bubble_up_extract_slice_through_collapse_shape_on_unsupported_dynamic(%src: tensor<1x5x2xf32>, %size : index) -> tensor { + %collapse = tensor.collapse_shape %src [[0, 1, 2]] : tensor<1x5x2xf32> into tensor<10xf32> + %extract = tensor.extract_slice %collapse[0][%size][1] : tensor<10xf32> to tensor + return %extract : tensor +} + module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func"> From fb0e7b5f161118a24eeef39b05882f6950be43c0 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 2 Apr 2025 16:08:26 -0400 Subject: [PATCH 0433/1029] 
[AMDGPU][True16][CodeGen] Implement sgpr folding in true16 (#128929)

We haven't implemented 16-bit SGPRs yet. For now, allow 32-bit SGPRs to
be folded into True16 instructions that take 16-bit values.

Also use sgpr_32 when an immediate is copied to sgpr_lo16 so that it can
be folded further. This improves the generated code quality.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |  88 +++-
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |   1 +
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  67 ++-
 llvm/test/CodeGen/AMDGPU/bswap.ll             | 145 ++++--
 .../CodeGen/AMDGPU/extract_vector_elt-f16.ll  |  84 ++--
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |   6 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |   4 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     | 380 ++++++----------
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |  12 +-
 llvm/test/CodeGen/AMDGPU/fp-classify.ll       | 117 ++---
 llvm/test/CodeGen/AMDGPU/fpext.f16.ll         |   4 +-
 llvm/test/CodeGen/AMDGPU/fptosi.f16.ll        |   7 +-
 llvm/test/CodeGen/AMDGPU/fptoui.f16.ll        |   7 +-
 llvm/test/CodeGen/AMDGPU/icmp.i16.ll          |  22 +-
 llvm/test/CodeGen/AMDGPU/imm16.ll             |  56 +--
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 415 ++++++------------
 .../CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll   |  13 +-
 .../llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll  |  31 +-
 .../llvm.amdgcn.raw.tbuffer.store.d16.ll      |  70 +--
 ...lvm.amdgcn.struct.ptr.tbuffer.store.d16.ll |  34 +-
 .../llvm.amdgcn.struct.tbuffer.store.d16.ll   |  77 +---
 .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll     |  48 +-
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll  |  26 +-
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll  |  26 +-
 llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll   |  12 +-
 llvm/test/CodeGen/AMDGPU/true16-fold.mir      |  60 +++
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |   7 +-
 27 files changed, 764 insertions(+), 1055 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/true16-fold.mir

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 46bd5d8044c45..2bfc37b68a2ec 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,8 +12,11 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineOperand.h"
 
@@ -576,6 +579,10 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
   }
 
   MachineOperand *New = Fold.OpToFold;
+  // Rework once the VS_16 register class is updated to include proper
+  // 16-bit SGPRs instead of 32-bit ones.
+  if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
+    Old.setSubReg(AMDGPU::NoSubRegister);
   Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
   Old.setIsUndef(New->isUndef());
   return true;
@@ -947,9 +954,15 @@ void SIFoldOperandsImpl::foldOperand(
     return;
 
   // FIXME: Fold operands with subregs.
-  if (UseOp->isReg() && OpToFold.isReg() &&
-      (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
-    return;
+  if (UseOp->isReg() && OpToFold.isReg()) {
+    if (UseOp->isImplicit())
+      return;
+    // Allow folding from SGPRs to 16-bit VGPRs.
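+    // updateOperand() above drops the lo16 subreg index for this case, so
+    // the 32-bit SGPR stands in for its low 16 bits until real 16-bit SGPRs
+    // exist.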
+    if (UseOp->getSubReg() != AMDGPU::NoSubRegister &&
+        (UseOp->getSubReg() != AMDGPU::lo16 ||
+         !TRI->isSGPRReg(*MRI, OpToFold.getReg())))
+      return;
+  }
 
   // Special case for REG_SEQUENCE: We can't fold literals into
   // REG_SEQUENCE instructions, so we have to fold them into the
@@ -1040,6 +1053,14 @@ void SIFoldOperandsImpl::foldOperand(
     }
   }
 
+  // Allow immediates COPY'd into sgpr_lo16 to be folded further, while
+  // still being legal if not folded further.
+  if (DestRC == &AMDGPU::SGPR_LO16RegClass) {
+    assert(ST->useRealTrue16Insts());
+    MRI->setRegClass(DestReg, &AMDGPU::SGPR_32RegClass);
+    DestRC = &AMDGPU::SGPR_32RegClass;
+  }
+
   // In order to fold immediates into copies, we need to change the
   // copy to a MOV.
 
@@ -1073,9 +1094,43 @@ void SIFoldOperandsImpl::foldOperand(
       UseMI->getOperand(0).getReg().isVirtual() &&
       !UseMI->getOperand(1).getSubReg()) {
     LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
+    unsigned Size = TII->getOpSize(*UseMI, 1);
     Register UseReg = OpToFold.getReg();
     UseMI->getOperand(1).setReg(UseReg);
-    UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+    unsigned SubRegIdx = OpToFold.getSubReg();
+    // Hack to allow 32-bit SGPRs to be folded into True16 instructions.
+    // Remove this if 16-bit SGPRs (i.e. SGPR_LO16) are added to the
+    // VS_16RegClass.
+    //
+    // Excerpt from AMDGPUGenRegisterInfo.inc
+    // NoSubRegister, //0
+    // hi16, // 1
+    // lo16, // 2
+    // sub0, // 3
+    // ...
+    // sub1, // 11
+    // sub1_hi16, // 12
+    // sub1_lo16, // 13
+    static_assert(AMDGPU::sub1_hi16 == 12, "Subregister layout has changed");
+    if (Size == 2 && TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+        TRI->isSGPRReg(*MRI, UseReg)) {
+      // Produce the 32-bit subregister index to which the 16-bit subregister
+      // is aligned.
+      if (SubRegIdx > AMDGPU::sub1) {
+        LaneBitmask M = TRI->getSubRegIndexLaneMask(SubRegIdx);
+        M |= M.getLane(M.getHighestLane() - 1);
+        SmallVector<unsigned> Indexes;
+        TRI->getCoveringSubRegIndexes(TRI->getRegClassForReg(*MRI, UseReg), M,
+                                      Indexes);
+        assert(Indexes.size() == 1 && "Expected one 32-bit subreg to cover");
+        SubRegIdx = Indexes[0];
+        // 32-bit registers do not have a sub0 index.
+      } else if (TII->getOpSize(*UseMI, 1) == 4)
+        SubRegIdx = 0;
+      else
+        SubRegIdx = AMDGPU::sub0;
+    }
+    UseMI->getOperand(1).setSubReg(SubRegIdx);
     UseMI->getOperand(1).setIsKill(false);
     CopiesToReplace.push_back(UseMI);
     OpToFold.setIsKill(false);
@@ -1713,6 +1768,31 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
   if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
     return false;
 
+  // True16: Fix a malformed 16-bit sgpr COPY produced by peephole-opt.
+  // This code can be removed once proper 16-bit SGPRs are implemented.
+  // Example: Pre-peephole-opt
+  // %29:sgpr_lo16 = COPY %16.lo16:sreg_32
+  // %32:sreg_32 = COPY %29:sgpr_lo16
+  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+  // Post-peephole-opt and DCE
+  // %32:sreg_32 = COPY %16.lo16:sreg_32
+  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+  // After this transform
+  // %32:sreg_32 = COPY %16:sreg_32
+  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %32:sreg_32
+  // After the fold operands pass
+  // %30:sreg_32 = S_PACK_LL_B32_B16 killed %31:sreg_32, killed %16:sreg_32
+  if (MI.getOpcode() == AMDGPU::COPY && OpToFold.isReg() &&
+      OpToFold.getSubReg()) {
+    const TargetRegisterClass *DstRC =
+        MRI->getRegClass(MI.getOperand(0).getReg());
+    if (DstRC == &AMDGPU::SReg_32RegClass &&
+        DstRC == MRI->getRegClass(OpToFold.getReg())) {
+      
assert(OpToFold.getSubReg() == AMDGPU::lo16); + OpToFold.setSubReg(0); + } + } + // Prevent folding operands backwards in the function. For example, // the COPY opcode must not be replaced by 1 in this example: // diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 170e794af1b4d..071f55ce16403 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -776,6 +776,7 @@ let SubtargetPredicate = isGFX11Plus in { // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, [], /*VOP1Only=*/ 1>; + let isAsCheapAsAMove = 1 in defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 375ae0dee7962..8582b61bbbd82 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38819,16 +38819,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11TRUE16-LABEL: s_select_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, vcc_lo -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v1.l, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; @@ -38936,19 +38934,17 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; ; GFX11TRUE16-LABEL: s_vselect_v2bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: s_lshr_b32 s0, s1, 16 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, s0, v0.l, s2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, s1, v0.h, 
vcc_lo +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_vselect_v2bf16: @@ -40655,30 +40651,25 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; ; GFX11TRUE16-LABEL: s_vselect_v4bf16: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_lshr_b32 s7, s3, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s7, s1, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s9, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 -; GFX11TRUE16-NEXT: s_lshr_b32 s8, s1, 16 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11TRUE16-NEXT: s_lshr_b32 s7, s0, 16 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v2 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s8 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, s2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, s0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, s1 -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s6 -; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v2.l, s4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v3.l, vcc_lo -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v3.h, s5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s7 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s9 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s1 +; GFX11TRUE16-NEXT: s_lshr_b32 s8, s3, 16 +; GFX11TRUE16-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, s8, v0.l, s6 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, s0, v0.h, s4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s2, v1.l, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, s3, v1.h, s5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v4 -; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v2 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; ; GFX11FAKE16-LABEL: s_vselect_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index 4787f21e28598..5730d75568fd7 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -303,18 +303,32 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_bswap_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203 -; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_endpgm +; GFX11-REAL16-LABEL: test_bswap_i64: +; GFX11-REAL16: ; %bb.0: +; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s2, 0x10203 +; GFX11-REAL16-NEXT: s_mov_b32 s2, -1 +; 
GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-REAL16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-REAL16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_bswap_i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[2:3], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %in, align 8 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone store i64 %bswap, ptr addrspace(1) %out, align 8 @@ -364,20 +378,36 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_bswap_v2i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203 -; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203 -; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203 -; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_endpgm +; GFX11-REAL16-LABEL: test_bswap_v2i64: +; GFX11-REAL16: ; %bb.0: +; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-REAL16-NEXT: s_mov_b32 s2, -1 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-REAL16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_bswap_v2i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s6, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s7, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s4, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s5, 0x10203 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) %in, align 16 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone store <2 x i64> %bswap, ptr addrspace(1) %out, align 16 @@ -445,26 +475,49 @@ define amdgpu_kernel void @test_bswap_v4i64(ptr addrspace(1) %out, ptr addrspace ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: test_bswap_v4i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_perm_b32 v7, 0, s6, 0x10203 -; GFX11-NEXT: v_perm_b32 v6, 0, s7, 0x10203 -; GFX11-NEXT: v_perm_b32 v5, 0, s4, 0x10203 -; GFX11-NEXT: v_perm_b32 v4, 0, s5, 0x10203 -; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203 -; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203 -; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203 -; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-REAL16-LABEL: test_bswap_v4i64: +; GFX11-REAL16: ; %bb.0: +; GFX11-REAL16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX11-REAL16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-REAL16-NEXT: s_mov_b32 s10, -1 +; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-REAL16-NEXT: v_perm_b32 v0, 0, s4, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v2, 0, s6, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v4, 0, s0, 0x10203 +; GFX11-REAL16-NEXT: v_perm_b32 v6, 0, s2, 0x10203 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-REAL16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-REAL16-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-REAL16-NEXT: v_mov_b32_e32 v7, v6 +; GFX11-REAL16-NEXT: s_clause 0x1 +; GFX11-REAL16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 offset:16 +; GFX11-REAL16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 +; GFX11-REAL16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_bswap_v4i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v7, 0, s6, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v6, 0, s7, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, 0, s4, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v4, 0, s5, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, 0, s2, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, 0, s3, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, 0, s0, 0x10203 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[4:7], off, s[8:11], 0 offset:16 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm %val = load <4 x i64>, ptr addrspace(1) %in, align 32 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone store <4 x i64> %bswap, ptr addrspace(1) %out, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index fdbe59c88d22e..8aab9ec885f3c 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -216,34 +216,19 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: extract_vector_elt_v3f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_mov_b32 s7, 
0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: extract_vector_elt_v3f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 @@ -284,35 +269,20 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: dynamic_extract_vector_elt_v3f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s6, 4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: dynamic_extract_vector_elt_v3f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s6, 4 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s4, s6, 4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; 
GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_endpgm %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, ptr addrspace(1) %out, i32 1 store half %p0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 93e2d072a6c9b..32f75f3835226 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -297,10 +297,8 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |s2|, s3 ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 9063af4351297..9ef48588a51ae 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -175,9 +175,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s2, s2 ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 7e4b1259db3aa..4f77486794527 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -60,34 +60,19 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag, ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_copysign_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 -; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: 
s_copysign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %out = call half @llvm.copysign.f16(half %mag, half %sign) store half %out, ptr addrspace(1) %arg_out ret void @@ -1928,122 +1913,63 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1) ; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-TRUE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s2 -; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, s5, v0 -; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-TRUE16-NEXT: s_lshl_b32 s3, s2, 12 -; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x1000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, s3, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 7, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0x7e00 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v3 :: v_dual_add_nc_u32 v1, v1, v2 -; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, 
0x7c00, v1, vcc_lo -; GFX11-TRUE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0x1ff -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 8 -; GFX11-FAKE16-NEXT: s_or_b32 s2, s5, s2 -; GFX11-FAKE16-NEXT: s_and_b32 s5, s6, 0xffe -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s3, 0xb0014 -; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s2 -; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0xfc10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s5, v0 -; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-FAKE16-NEXT: s_lshl_b32 s3, s2, 12 -; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 1 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, 0x1000, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, v1, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, s3, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 31 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 7, v1 -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 2, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo -; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo -; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 
v0, s4 -; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff +; GFX11-NEXT: s_lshr_b32 s6, s3, 8 +; GFX11-NEXT: s_or_b32 s2, s5, s2 +; GFX11-NEXT: s_and_b32 s5, s6, 0xffe +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014 +; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2 +; GFX11-NEXT: s_addk_i32 s2, 0xfc10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-NEXT: s_lshl_b32 s3, s2, 12 +; GFX11-NEXT: s_cmp_lt_i32 s2, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x1000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v1, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmp_lt_i32 s2, 31 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 7, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x7e00 :: v_dual_add_nc_u32 v1, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v2, vcc_lo +; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) store half %result, ptr addrspace(1) %arg_out @@ -2114,44 +2040,24 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_copysign_v2f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 
s3, s3, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_v2f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_copysign_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, s3 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: s_endpgm %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign) store <2 x half> %out, ptr addrspace(1) %arg_out ret void @@ -2244,24 +2150,23 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v3, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s3 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5] offset:4 ; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[4:5] @@ -2391,62 +2296,31 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_copysign_v4f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4 -; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[4:5] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_copysign_v4f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 -; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 16 -; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2 -; 
GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0 -; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4 -; GFX11-FAKE16-NEXT: global_store_b64 v5, v[0:1], s[4:5] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_copysign_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-NEXT: s_lshr_b32 s3, s3, 16 +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 +; GFX11-NEXT: s_lshr_b32 s6, s1, 16 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0 +; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4 +; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5] +; GFX11-NEXT: s_endpgm %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign) store <4 x half> %out, ptr addrspace(1) %arg_out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 9642b36ecb7e8..67bec43078803 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -55,10 +55,8 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v0.h| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, s3, |s2| ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -132,10 +130,8 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -|v0.h| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, s3, -|s2| ; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 6a0d52962265d..498df8a65feda 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -620,32 +620,18 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: 
test_isinf_pattern_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x204 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: test_isinf_pattern_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: test_isinf_pattern_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %fabs = tail call half @llvm.fabs.f16(half %x) #1 %cmp = fcmp oeq half %fabs, 0xH7C00 %ext = zext i1 %cmp to i32 @@ -684,32 +670,18 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: test_isfinite_pattern_0_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: test_isfinite_pattern_0_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: test_isfinite_pattern_0_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %ord 
= fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp une half %x.fabs, 0xH7C00 @@ -747,32 +719,18 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: test_isfinite_pattern_4_f16: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 0x1f8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: test_isfinite_pattern_4_f16: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: test_isfinite_pattern_4_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 %ninf = fcmp one half %x.fabs, 0xH7C00 @@ -786,3 +744,6 @@ declare half @llvm.fabs.f16(half) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-FAKE16: {{.*}} +; GFX11-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 59ba9b72e2911..fa358c92e07ea 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -393,10 +393,8 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index f84e14ea62273..97a94edc9205a 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -616,11 +616,10 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index bba3a23df11a5..72ddc32b2ba5c 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -614,11 +614,10 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll index 77575c78fb349..6a4ae7f4e0d78 100644 --- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll @@ -1,8 +1,6 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=GCN -check-prefix=SI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s -; FIXME-TRUE16. 
In true16 flow, the codegen introduces an additional s2v copy and mov, and reverses the operand order, thus picking different cmp instructions -; This should be corrected after the additional mov/copy is removed ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s| FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s ;;;==========================================================================;;; @@ -215,7 +213,7 @@ entry: ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_eq_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_eq_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -233,7 +231,7 @@ entry: ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_ne_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ne_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -251,7 +249,7 @@ entry: ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ugt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -269,7 +267,7 @@ entry: ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_uge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -287,7 +285,7 @@ entry: ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_gt_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ult_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -305,7 +303,7 @@ entry: ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_u16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_ge_u16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_ule_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -323,7 +321,7 @@ entry: ; VI: v_cmp_lt_i16_e32
vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sgt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -341,7 +339,7 @@ entry: ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sge_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -359,7 +357,7 @@ entry: ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_lt_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_gt_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_slt_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -377,7 +375,7 @@ entry: ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GFX11-FAKE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}} -; GFX11-TRUE16: v_cmp_le_i16_e32 vcc_lo, v{{[0-9]+}}.{{(l|h)}}, v{{[0-9]+}}.{{(l|h)}} +; GFX11-TRUE16: v_cmp_ge_i16_e32 vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}.{{(l|h)}} define amdgpu_kernel void @i16_sle_v_s(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll index bc4a8634dbe50..8ca87678a36f3 100644 --- a/llvm/test/CodeGen/AMDGPU/imm16.ll +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -647,10 +647,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l ; encoding: [0x80,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -715,10 +713,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; 
encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -783,10 +779,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -0.5, v0.l ; encoding: [0xf1,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -851,10 +845,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; encoding: [0xf2,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -919,10 +911,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -1.0, v0.l ; encoding: [0xf3,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; 
GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -987,10 +977,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; encoding: [0xf4,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1055,10 +1043,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -2.0, v0.l ; encoding: [0xf5,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1123,10 +1109,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 4.0, v0.l ; encoding: [0xf6,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1191,10 +1175,8 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 
-4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -4.0, v0.l ; encoding: [0xf7,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1455,10 +1437,8 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1, v0.l ; encoding: [0x81,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1523,10 +1503,8 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2, v0.l ; encoding: [0x82,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1591,10 +1569,8 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 16, v0.l ; encoding: [0x90,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1896,10 +1872,8 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x) ; 
GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 63, v0.l ; encoding: [0xbf,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; @@ -1964,10 +1938,8 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x) ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8] ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31] ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e] +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00] ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf] -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 64, v0.l ; encoding: [0xc0,0x00,0x00,0x64] ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80] ; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf] ; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 93052fe76cd6b..97c97ac8a7ad3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -199,9 +199,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2 -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s2 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-TRUE16-NEXT: ;;#ASMSTART @@ -356,42 +354,23 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, s4, s2 -; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use s3 -; GFX11-TRUE16-NEXT: ;;#ASMEND -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, s3, s2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: ;;#ASMSTART -; GFX11-FAKE16-NEXT: ; use s3 -; GFX11-FAKE16-NEXT: ;;#ASMEND -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_pack_lh_b32_b16 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s3 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -468,52 +447,27 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use s3 -; GFX11-TRUE16-NEXT: ;;#ASMEND -; GFX11-TRUE16-NEXT: ;;#ASMSTART -; GFX11-TRUE16-NEXT: ; use s2 -; GFX11-TRUE16-NEXT: ;;#ASMEND -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
| instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s3, s2 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 -; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-FAKE16-NEXT: ;;#ASMSTART -; GFX11-FAKE16-NEXT: ; use s3 -; GFX11-FAKE16-NEXT: ;;#ASMEND -; GFX11-FAKE16-NEXT: ;;#ASMSTART -; GFX11-FAKE16-NEXT: ; use s2 -; GFX11-FAKE16-NEXT: ;;#ASMEND -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-NEXT: s_lshr_b32 s3, s4, 16 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshr_b32 s2, s2, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_pack_ll_b32_b16 s4, s3, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s3 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use s2 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_endpgm %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -1792,34 +1746,19 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v4f16_0: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0 -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v4f16_0: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, s4, v0 -; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v4f16_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -1978,34 +1917,19 @@ 
define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v4f16_2: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1 -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v4f16_2: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 -; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v4f16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext @@ -2164,34 +2088,19 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v4i16_2: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1 -; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v4i16_2: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 -; 
GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v4i16_2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2583,34 +2492,19 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v8i16_6: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v5, v3 -; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v8i16_6: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 -; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v8i16_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -2799,11 +2693,10 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 4, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v5, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3] ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s4 ; 
GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7 ; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 @@ -2816,19 +2709,19 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 3 ; GFX11-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 1 ; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v4.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v4.l, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s5 -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, s4, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, s4, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, s4, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, s4, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, s4, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, s4, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s4, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, s4, s5 +; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_insertelement_v8f16_dynamic: @@ -3078,45 +2971,24 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a ; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; CI-NEXT: s_endpgm ; -; GFX11-TRUE16-LABEL: v_insertelement_v16i16_6: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s4 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v9, v3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-TRUE16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: v_insertelement_v16i16_6: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] -; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: 
global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: v_insertelement_v16i16_6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext @@ -3443,13 +3315,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 5, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v9, s[2:3] -; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v9, s[2:3] offset:16 +; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, s4 ; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7 ; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 @@ -3478,30 +3349,30 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 11 ; GFX11-TRUE16-NEXT: s_cselect_b32 s17, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 8 -; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s18, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 9 ; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v8.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, s4, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v8.l, s12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v8.l, s13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v8.l, s14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v8.l, s15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v8.l, s16 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v8.l, s17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v8.l, s4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v8.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v8.l, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v8.l, s8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v8.l, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v8.l, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, s4, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, s4, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 
v6.l, v6.l, s4, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, s4, s15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, s4, s16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, s4, s17 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, s4, s18 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, s4, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, s4, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, s4, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, s4, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, s4, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, s4, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s4, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, s4, s11 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v9, v[4:7], s[0:1] offset:16 -; GFX11-TRUE16-NEXT: global_store_b128 v9, v[0:3], s[0:1] +; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_insertelement_v16f16_dynamic: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index 07421afde7622..b77b2f7441a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -78,20 +78,19 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[6:7] glc dlc +; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x3c ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbyte_b32 v0, v1, v2, v0.l -; GFX11-TRUE16-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_alignbyte_b32 v0, v1, v0, s2 +; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_alignbyte_b32_2: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll index 839892e38db49..d8e2ce3728a9b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll @@ -35,25 +35,15 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 33, i32 0) ret void @@ -217,3 +207,6 @@ declare void @llvm.amdgcn.raw.ptr.tbuffer.store.f16(half, ptr addrspace(8), i32, declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index c53c491c216e7..052f7f1c8310b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -39,55 +39,25 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) { ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] -; GFX11-PACKED-FAKE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-TRUE16: ; %bb.0: ; %main_body -; 
GFX12-PACKED-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX12-PACKED-SDAG-TRUE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-FAKE16: ; %bb.0: ; %main_body -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-SDAG-FAKE16-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] +; GFX11-PACKED-NEXT: s_endpgm ; -; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body -; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b32 s6, s[4:5], 0x34 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] -; GFX12-PACKED-GISEL-NEXT: s_endpgm +; GFX12-PACKED-LABEL: tbuffer_store_d16_x: +; GFX12-PACKED: ; %bb.0: ; %main_body +; GFX12-PACKED-NEXT: s_clause 0x1 +; GFX12-PACKED-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] +; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 0, i32 0, i32 33, i32 0) ret void @@ -298,5 +268,9 @@ declare void @llvm.amdgcn.raw.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i3 declare void @llvm.amdgcn.raw.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32) declare void @llvm.amdgcn.raw.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} ; GFX12-PACKED-GISEL-FAKE16: {{.*}} ; GFX12-PACKED-GISEL-TRUE16: {{.*}} +; GFX12-PACKED-SDAG-FAKE16: {{.*}} +; GFX12-PACKED-SDAG-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll index 530ace778cdc9..fc8f8afa82c2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.store.d16.ll @@ -38,27 +38,16 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void @@ -242,3 +231,6 @@ declare void @llvm.amdgcn.struct.ptr.tbuffer.store.f16(half, ptr addrspace(8), i declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v3f16(<3 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.ptr.tbuffer.store.v4f16(<4 x half>, ptr addrspace(8), i32, i32, i32, i32, i32) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index bdb82999197d9..d025e7a15e25a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -44,60 +44,27 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 ; GFX10-PACKED-NEXT: tbuffer_store_format_d16_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_11_11_SSCALED] idxen ; GFX10-PACKED-NEXT: s_endpgm ; -; GFX11-PACKED-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-TRUE16: ; %bb.0: ; %main_body -; GFX11-PACKED-TRUE16-NEXT: s_clause 0x1 -; GFX11-PACKED-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX11-PACKED-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-TRUE16-NEXT: s_endpgm -; -; GFX11-PACKED-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX11-PACKED-FAKE16: ; %bb.0: ; %main_body -; GFX11-PACKED-FAKE16-NEXT: s_clause 0x1 -; GFX11-PACKED-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX11-PACKED-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-PACKED-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-PACKED-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-PACKED-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX11-PACKED-FAKE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-TRUE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-TRUE16: ; %bb.0: ; %main_body -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s6 -; GFX12-PACKED-SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX12-PACKED-SDAG-TRUE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-SDAG-TRUE16-NEXT: s_endpgm -; -; GFX12-PACKED-SDAG-FAKE16-LABEL: tbuffer_store_d16_x: -; GFX12-PACKED-SDAG-FAKE16: ; %bb.0: ; %main_body -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_clause 0x1 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, s7 -; GFX12-PACKED-SDAG-FAKE16-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-SDAG-FAKE16-NEXT: s_endpgm +; GFX11-PACKED-LABEL: tbuffer_store_d16_x: +; GFX11-PACKED: ; %bb.0: ; %main_body +; GFX11-PACKED-NEXT: s_clause 0x1 +; GFX11-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX11-PACKED-NEXT: s_endpgm ; -; GFX12-PACKED-GISEL-LABEL: 
tbuffer_store_d16_x: -; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body -; GFX12-PACKED-GISEL-NEXT: s_clause 0x1 -; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 -; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s6 -; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s7 -; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen -; GFX12-PACKED-GISEL-NEXT: s_endpgm +; GFX12-PACKED-LABEL: tbuffer_store_d16_x: +; GFX12-PACKED: ; %bb.0: ; %main_body +; GFX12-PACKED-NEXT: s_clause 0x1 +; GFX12-PACKED-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s7 +; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, v1, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM] idxen +; GFX12-PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 33, i32 0) ret void @@ -331,5 +298,9 @@ declare void @llvm.amdgcn.struct.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, declare void @llvm.amdgcn.struct.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32) declare void @llvm.amdgcn.struct.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-PACKED-FAKE16: {{.*}} +; GFX11-PACKED-TRUE16: {{.*}} ; GFX12-PACKED-GISEL-FAKE16: {{.*}} ; GFX12-PACKED-GISEL-TRUE16: {{.*}} +; GFX12-PACKED-SDAG-FAKE16: {{.*}} +; GFX12-PACKED-SDAG-TRUE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 26bcd61891fa5..18c462ffd0ff5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -77,42 +77,17 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm ; -; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16: -; GFX11SELDAG-TRUE16: ; %bb.0: -; GFX11SELDAG-TRUE16-NEXT: s_clause 0x1 -; GFX11SELDAG-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11SELDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11SELDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 3 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 -; GFX11SELDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11SELDAG-TRUE16-NEXT: s_endpgm -; -; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16: -; GFX11SELDAG-FAKE16: ; %bb.0: -; GFX11SELDAG-FAKE16-NEXT: s_clause 0x1 -; GFX11SELDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11SELDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11SELDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3 -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 -; GFX11SELDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11SELDAG-FAKE16-NEXT: s_endpgm -; -; GFX11GLISEL-LABEL: sgpr_isnan_f16: -; GFX11GLISEL: ; %bb.0: -; GFX11GLISEL-NEXT: s_clause 0x1 -; GFX11GLISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11GLISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 
-; GFX11GLISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11GLISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s2, s2, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 -; GFX11GLISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11GLISEL-NEXT: s_endpgm +; GFX11CHECK-LABEL: sgpr_isnan_f16: +; GFX11CHECK: ; %bb.0: +; GFX11CHECK-NEXT: s_clause 0x1 +; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 +; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) +; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11CHECK-NEXT: s_endpgm %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3) %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -4311,4 +4286,5 @@ attributes #0 = { "denormal-fp-math"="ieee,preserve-sign" } ; Maybe daz attributes #1 = { "denormal-fp-math"="ieee,dynamic" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11GLISEL: {{.*}} ; GFX11SELDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 297e4f0927204..362b9dacaf257 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -1083,23 +1083,19 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-TRUE16-LABEL: s_maximum_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 -; GFX11-TRUE16-NEXT: v_pk_max_f16 v2, s0, s1 -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, s0, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-TRUE16-NEXT: ;;#ASMSTART ; GFX11-TRUE16-NEXT: ; use v0 ; GFX11-TRUE16-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index ffbb9fde26e55..f6d37b34807b1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -896,23 +896,19 @@ define void 
@s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX11-TRUE16-LABEL: s_minimum_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 -; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 -; GFX11-TRUE16-NEXT: v_pk_min_f16 v2, s0, s1 -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v0, s0, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, s0, s1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-TRUE16-NEXT: ;;#ASMSTART ; GFX11-TRUE16-NEXT: ; use v0 ; GFX11-TRUE16-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 452acbc801f44..2f1dfa11fd34d 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -790,14 +790,10 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX11-SDAG-TRUE16: ; %bb.0: -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s2, 16 -; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s3, 16 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, s1 -; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s3, 16 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s2, 16 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v1.l, s1, s0 ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir new file mode 100644 index 0000000000000..ef6e4007b8f7a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -0,0 +1,60 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr=+real-true16 -o - %s | FileCheck %s + +--- +name: fold_16bit_subreg_1 +tracksRegLiveness: true +registers: +body: | 
+ bb.0.entry: + ; CHECK-LABEL: name: fold_16bit_subreg_1 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, [[DEF]].sub1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:sgpr_lo16 = COPY %0.sub1_lo16:sreg_64_xexec + %2:vgpr_16 = COPY %1:sgpr_lo16 + %3:vgpr_16 = IMPLICIT_DEF + %4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %4 +... + +--- +name: fold_16bit_subreg_0 +tracksRegLiveness: true +registers: +body: | + bb.0.entry: + ; CHECK-LABEL: name: fold_16bit_subreg_0 + ; CHECK: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_CMP_EQ_F16_t16_e64_:%[0-9]+]]:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed [[DEF1]], 2, [[DEF]].sub0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_EQ_F16_t16_e64_]] + %0:sreg_64_xexec = IMPLICIT_DEF + %1:sgpr_lo16 = COPY %0.lo16:sreg_64_xexec + %2:vgpr_16 = COPY %1:sgpr_lo16 + %3:vgpr_16 = IMPLICIT_DEF + %4:sreg_32 = nofpexcept V_CMP_EQ_F16_t16_e64 0, killed %3:vgpr_16, 2, killed %2:vgpr_16, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0, implicit %4 +... + +--- +name: sgpr_lo16 +tracksRegLiveness: true +registers: +body: | + bb.0.entry: + ; CHECK-LABEL: name: sgpr_lo16 + ; CHECK: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_ALIGNBIT_B32_t16_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, [[DEF]], 0, killed [[DEF1]], 0, 30, 0, 0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_t16_e64_]] + %0:sreg_32 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:sreg_32 = S_MOV_B32 30 + %3:sgpr_lo16 = COPY %2.lo16:sreg_32 + %4:vgpr_16 = COPY %3:sgpr_lo16 + %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec + S_ENDPGM 0, implicit %5 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 12137bdf25ba4..40a4d4af143a4 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -187,11 +187,8 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 -; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 -; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff -; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, v0.h, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, s2, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, s3, 0, 0xff ; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SDAG-GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 From c59d3a268444b14c2db0fdea7e4bd929bb30630b Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 2 Apr 2025 22:12:59 +0200 Subject: [PATCH 0434/1029] [libc++] Add visibility annotations to the std namespace with GCC (#133233) This allows us to remove the need for `_LIBCPP_TEMPLATE_VIS` and fixes a bunch of missing annotations for RTTI when used across dylib boundaries. `_LIBCPP_TEMPLATE_VIS` itself will be removed in a separate patch, since it touches a lot of code. This patch is a no-op for Clang. Only GCC is affected. --- libcxx/docs/ReleaseNotes/21.rst | 5 ++++- libcxx/include/__config | 19 +++++++++---------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 877aa06f8b7e4..7af109ddc8657 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -77,7 +77,10 @@ LLVM 22 ABI Affecting Changes --------------------- -- TODO +- When using GCC, the ``std`` namespace is now annotated with ``[[gnu::visibility("default")]]``. This may cause more + symbols to be exported from shared libraries when building with ``-fvisibility=hidden``. This also fixes RTTI + comparison between shared libraries, since all RTTI has the correct visibility now. There is no behaviour change on + Clang. Build System Changes diff --git a/libcxx/include/__config b/libcxx/include/__config index 35e62d0a19e85..ce8bc38acfe3e 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -386,7 +386,7 @@ typedef __char32_t char32_t; # define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS # define _LIBCPP_TEMPLATE_VIS # define _LIBCPP_TEMPLATE_DATA_VIS -# define _LIBCPP_TYPE_VISIBILITY_DEFAULT +# define _LIBCPP_NAMESPACE_VISIBILITY # else @@ -414,17 +414,16 @@ typedef __char32_t char32_t; # define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS # endif -// GCC doesn't support the type_visibility attribute, so we have to keep the visibility attribute on templates -# if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) && !__has_attribute(__type_visibility__) -# define _LIBCPP_TEMPLATE_VIS __attribute__((__visibility__("default"))) -# else -# define _LIBCPP_TEMPLATE_VIS -# endif +// This is kept to avoid a huge library-wide diff in the first step. 
+// TODO: Remove this in a follow-up patch +# define _LIBCPP_TEMPLATE_VIS # if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) && __has_attribute(__type_visibility__) -# define _LIBCPP_TYPE_VISIBILITY_DEFAULT __attribute__((__type_visibility__("default"))) +# define _LIBCPP_NAMESPACE_VISIBILITY __attribute__((__type_visibility__("default"))) +# elif !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) +# define _LIBCPP_NAMESPACE_VISIBILITY __attribute__((__visibility__("default"))) # else -# define _LIBCPP_TYPE_VISIBILITY_DEFAULT +# define _LIBCPP_NAMESPACE_VISIBILITY # endif # endif // defined(_LIBCPP_OBJECT_FORMAT_COFF) @@ -583,7 +582,7 @@ typedef __char32_t char32_t; // If it's not clear whether using the unversioned namespace is the correct thing to do, it's not. The versioned // namespace (_LIBCPP_BEGIN_NAMESPACE_STD) should almost always be used. # define _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD \ - _LIBCPP_PUSH_EXTENSION_DIAGNOSTICS namespace _LIBCPP_TYPE_VISIBILITY_DEFAULT std { + _LIBCPP_PUSH_EXTENSION_DIAGNOSTICS namespace _LIBCPP_NAMESPACE_VISIBILITY std { # define _LIBCPP_END_UNVERSIONED_NAMESPACE_STD } _LIBCPP_POP_EXTENSION_DIAGNOSTICS From 2026873fb8a1f654aa920cd5ea8074e55053973b Mon Sep 17 00:00:00 2001 From: David Peixotto Date: Wed, 2 Apr 2025 13:15:31 -0700 Subject: [PATCH 0435/1029] Add enable/disable api for SystemRuntime plugins (#133794) This commit adds support for enabling and disabling plugins by name. The changes are made generically in the `PluginInstances` class, but currently we only expose the ability to SystemRuntime plugins. Other plugin types can be added easily. We had a few design goals for how disabled plugins should work: 1. Plugins that are disabled should still be visible to the system. This allows us to dynamically enable and disable plugins and report their state to the user. 2. Plugin order should be stable across disable and enable changes. We want to avoid changing the order of plugin lookup. When a plugin is re-enabled it should return to its original slot in the creation order. 3. Disabled plugins should not appear in PluginManager operations. Clients should be able to assume that only enabled plugins will be returned from the PluginManager. For the implementation we modify the plugin instance to maintain a bool of its enabled state. Existing clients external to the Instances class expect to iterate over only enabled instances, so we skip over disabled instances in the query and snapshot APIs. This way the client does not have to manually check which instances are enabled.
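To make the bookkeeping concrete, a minimal standalone sketch of the pattern described above (an in-place enabled flag per registered instance, with index-based queries skipping disabled entries) might look like the following. The names here are illustrative stand-ins, not the actual LLDB PluginManager API:

```cpp
// Illustrative sketch only, not the real LLDB types: disabling flips a flag
// in place, so the vector order (and therefore the lookup order) is
// preserved when a plugin is later re-enabled.
#include <algorithm>
#include <string>
#include <vector>

struct Instance {
  std::string name;
  bool enabled = true;
};

class Instances {
public:
  void Register(std::string name) { m_instances.push_back({std::move(name)}); }

  // Index over enabled instances only; disabled entries are invisible here,
  // which mirrors how clients of the query APIs see the world.
  const Instance *GetInstanceAtIndex(unsigned idx) const {
    unsigned count = 0;
    for (const Instance &inst : m_instances)
      if (inst.enabled && count++ == idx)
        return &inst;
    return nullptr;
  }

  // Flip the enabled bit without moving the entry, so re-enabling restores
  // the original slot in the query order. Returns false for unknown names.
  bool SetEnabled(const std::string &name, bool enable) {
    auto it = std::find_if(m_instances.begin(), m_instances.end(),
                           [&](const Instance &i) { return i.name == name; });
    if (it == m_instances.end())
      return false;
    it->enabled = enable;
    return true;
  }

private:
  std::vector<Instance> m_instances;
};
```

Keeping disabled entries in the vector, rather than erasing them, is what satisfies goals 2 and 3 at the same time: queries filter on the flag, and re-enabling is a single bit flip.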
--- lldb/include/lldb/Core/PluginManager.h | 13 + lldb/source/Core/PluginManager.cpp | 55 +++- lldb/unittests/Core/CMakeLists.txt | 1 + lldb/unittests/Core/PluginManagerTest.cpp | 367 ++++++++++++++++++++++ 4 files changed, 433 insertions(+), 3 deletions(-) create mode 100644 lldb/unittests/Core/PluginManagerTest.cpp diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index e4e0c3eea67f8..a6dab045adf27 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -22,6 +22,7 @@ #include #include +#include #define LLDB_PLUGIN_DEFINE_ADV(ClassName, PluginName) \ namespace lldb_private { \ @@ -47,6 +48,12 @@ class CommandInterpreter; class Debugger; class StringList; +struct RegisteredPluginInfo { + llvm::StringRef name = ""; + llvm::StringRef description = ""; + bool enabled = false; +}; + class PluginManager { public: static void Initialize(); @@ -168,6 +175,12 @@ class PluginManager { static SystemRuntimeCreateInstance GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx); + static std::vector GetSystemRuntimePluginInfo(); + + // Modify the enabled state of a SystemRuntime plugin. + // Returns false if the plugin name is not found. + static bool SetSystemRuntimePluginEnabled(llvm::StringRef name, bool enabled); + // ObjectFile static bool RegisterPlugin(llvm::StringRef name, llvm::StringRef description, diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index 95eb940efcef2..e6cb248ef31ce 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -188,11 +188,13 @@ template struct PluginInstance { PluginInstance(llvm::StringRef name, llvm::StringRef description, Callback create_callback, DebuggerInitializeCallback debugger_init_callback = nullptr) - : name(name), description(description), create_callback(create_callback), + : name(name), description(description), enabled(true), + create_callback(create_callback), debugger_init_callback(debugger_init_callback) {} llvm::StringRef name; llvm::StringRef description; + bool enabled; Callback create_callback; DebuggerInitializeCallback debugger_init_callback; }; @@ -250,7 +252,9 @@ template class PluginInstances { } void PerformDebuggerCallback(Debugger &debugger) { - for (auto &instance : m_instances) { + for (const auto &instance : m_instances) { + if (!instance.enabled) + continue; if (instance.debugger_init_callback) instance.debugger_init_callback(debugger); } @@ -260,7 +264,14 @@ template class PluginInstances { // Note that this is a copy of the internal state so modifications // to the returned instances will not be reflected back to instances // stored by the PluginInstances object. - std::vector GetSnapshot() { return m_instances; } + std::vector GetSnapshot() { + std::vector enabled_instances; + for (const auto &instance : m_instances) { + if (instance.enabled) + enabled_instances.push_back(instance); + } + return enabled_instances; + } const Instance *GetInstanceAtIndex(uint32_t idx) { uint32_t count = 0; @@ -280,12 +291,41 @@ template class PluginInstances { const Instance * FindEnabledInstance(std::function predicate) const { for (const auto &instance : m_instances) { + if (!instance.enabled) + continue; if (predicate(instance)) return &instance; } return nullptr; } + // Return a list of all the registered plugin instances. This includes both + // enabled and disabled instances. 
The instances are listed in the order they + // were registered which is the order they would be queried if they were all + // enabled. + std::vector GetPluginInfoForAllInstances() { + // Lookup the plugin info for each instance in the sorted order. + std::vector plugin_infos; + plugin_infos.reserve(m_instances.size()); + for (const Instance &instance : m_instances) + plugin_infos.push_back( + {instance.name, instance.description, instance.enabled}); + + return plugin_infos; + } + + bool SetInstanceEnabled(llvm::StringRef name, bool enable) { + auto it = std::find_if( + m_instances.begin(), m_instances.end(), + [&](const Instance &instance) { return instance.name == name; }); + + if (it == m_instances.end()) + return false; + + it->enabled = enable; + return true; + } + private: std::vector m_instances; }; @@ -627,6 +667,15 @@ PluginManager::GetSystemRuntimeCreateCallbackAtIndex(uint32_t idx) { return GetSystemRuntimeInstances().GetCallbackAtIndex(idx); } +std::vector PluginManager::GetSystemRuntimePluginInfo() { + return GetSystemRuntimeInstances().GetPluginInfoForAllInstances(); +} + +bool PluginManager::SetSystemRuntimePluginEnabled(llvm::StringRef name, + bool enable) { + return GetSystemRuntimeInstances().SetInstanceEnabled(name, enable); +} + #pragma mark ObjectFile struct ObjectFileInstance : public PluginInstance { diff --git a/lldb/unittests/Core/CMakeLists.txt b/lldb/unittests/Core/CMakeLists.txt index 60265f794b5e8..8580f5887ea2b 100644 --- a/lldb/unittests/Core/CMakeLists.txt +++ b/lldb/unittests/Core/CMakeLists.txt @@ -7,6 +7,7 @@ add_lldb_unittest(LLDBCoreTests FormatEntityTest.cpp MangledTest.cpp ModuleSpecTest.cpp + PluginManagerTest.cpp ProgressReportTest.cpp RichManglingContextTest.cpp SourceLocationSpecTest.cpp diff --git a/lldb/unittests/Core/PluginManagerTest.cpp b/lldb/unittests/Core/PluginManagerTest.cpp new file mode 100644 index 0000000000000..ca1003ca9a85a --- /dev/null +++ b/lldb/unittests/Core/PluginManagerTest.cpp @@ -0,0 +1,367 @@ + +#include "lldb/Core/PluginManager.h" + +#include "gtest/gtest.h" + +using namespace lldb; +using namespace lldb_private; + +// Mock system runtime plugin create functions. +SystemRuntime *CreateSystemRuntimePluginA(Process *process) { return nullptr; } + +SystemRuntime *CreateSystemRuntimePluginB(Process *process) { return nullptr; } + +SystemRuntime *CreateSystemRuntimePluginC(Process *process) { return nullptr; } + +// Test class for testing the PluginManager. +// The PluginManager modifies global state when registering new plugins. This +// class is intended to undo those modifications in the destructor to give each +// test a clean slate with no registered plugins at the start of a test. +class PluginManagerTest : public testing::Test { +public: + // Remove any pre-registered plugins so we have a known starting point. + static void SetUpTestSuite() { RemoveAllRegisteredSystemRuntimePlugins(); } + + // Add mock system runtime plugins for testing. + void RegisterMockSystemRuntimePlugins() { + ASSERT_TRUE(PluginManager::RegisterPlugin("a", "test instance A", + CreateSystemRuntimePluginA)); + ASSERT_TRUE(PluginManager::RegisterPlugin("b", "test instance B", + CreateSystemRuntimePluginB)); + ASSERT_TRUE(PluginManager::RegisterPlugin("c", "test instance C", + CreateSystemRuntimePluginC)); + } + + // Remove any plugins added during the tests. 
+ virtual ~PluginManagerTest() override { + RemoveAllRegisteredSystemRuntimePlugins(); + } + +protected: + std::vector m_system_runtime_plugins; + + static void RemoveAllRegisteredSystemRuntimePlugins() { + // Enable all currently registered plugins so we can get a handle to + // their create callbacks in the loop below. Only enabled plugins + // are returned from the PluginManager Get*CreateCallbackAtIndex apis. + for (const RegisteredPluginInfo &PluginInfo : + PluginManager::GetSystemRuntimePluginInfo()) { + PluginManager::SetSystemRuntimePluginEnabled(PluginInfo.name, true); + } + + // Get a handle to the create call backs for all the registered plugins. + std::vector registered_plugin_callbacks; + SystemRuntimeCreateInstance create_callback = nullptr; + for (uint32_t idx = 0; + (create_callback = + PluginManager::GetSystemRuntimeCreateCallbackAtIndex(idx)) != + nullptr; + ++idx) { + registered_plugin_callbacks.push_back((create_callback)); + } + + // Remove all currently registered plugins. + for (SystemRuntimeCreateInstance create_callback : + registered_plugin_callbacks) { + PluginManager::UnregisterPlugin(create_callback); + } + } +}; + +// Test basic register functionality. +TEST_F(PluginManagerTest, RegisterSystemRuntimePlugin) { + RegisterMockSystemRuntimePlugins(); + + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginB); + + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), + CreateSystemRuntimePluginC); + + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(3), nullptr); +} + +// Test basic un-register functionality. +TEST_F(PluginManagerTest, UnRegisterSystemRuntimePlugin) { + RegisterMockSystemRuntimePlugins(); + + ASSERT_TRUE(PluginManager::UnregisterPlugin(CreateSystemRuntimePluginB)); + + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginC); + + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), nullptr); +} + +// Test registered plugin info functionality. +TEST_F(PluginManagerTest, SystemRuntimePluginInfo) { + RegisterMockSystemRuntimePlugins(); + + std::vector plugin_info = + PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 3u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[0].description, "test instance A"); + ASSERT_EQ(plugin_info[0].enabled, true); + ASSERT_EQ(plugin_info[1].name, "b"); + ASSERT_EQ(plugin_info[1].description, "test instance B"); + ASSERT_EQ(plugin_info[1].enabled, true); + ASSERT_EQ(plugin_info[2].name, "c"); + ASSERT_EQ(plugin_info[2].description, "test instance C"); + ASSERT_EQ(plugin_info[2].enabled, true); +} + +// Test basic un-register functionality. +TEST_F(PluginManagerTest, UnRegisterSystemRuntimePluginInfo) { + RegisterMockSystemRuntimePlugins(); + + // Initial plugin info has all three registered plugins. + std::vector plugin_info = + PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 3u); + + ASSERT_TRUE(PluginManager::UnregisterPlugin(CreateSystemRuntimePluginB)); + + // After un-registering a plugin it should be removed from plugin info. 
+ plugin_info = PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 2u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[0].enabled, true); + ASSERT_EQ(plugin_info[1].name, "c"); + ASSERT_EQ(plugin_info[1].enabled, true); +} + +// Test plugin disable functionality. +TEST_F(PluginManagerTest, SystemRuntimePluginDisable) { + RegisterMockSystemRuntimePlugins(); + + // Disable plugin should succeed. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", false)); + + // Disabling a plugin does not remove it from plugin info. + std::vector plugin_info = + PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 3u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[0].enabled, true); + ASSERT_EQ(plugin_info[1].name, "b"); + ASSERT_EQ(plugin_info[1].enabled, false); + ASSERT_EQ(plugin_info[2].name, "c"); + ASSERT_EQ(plugin_info[2].enabled, true); + + // Disabling a plugin does remove it from available plugins. + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginC); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), nullptr); +} + +// Test plugin disable and enable functionality. +TEST_F(PluginManagerTest, SystemRuntimePluginDisableThenEnable) { + RegisterMockSystemRuntimePlugins(); + + // Initially plugin b is available in slot 1. + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginB); + + // Disabling it will remove it from available plugins. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", false)); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginC); + + // We can re-enable the plugin later and it should go back to the original + // slot. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", true)); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginB); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), + CreateSystemRuntimePluginC); + + // And show up in the plugin info correctly. + std::vector plugin_info = + PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 3u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[0].enabled, true); + ASSERT_EQ(plugin_info[1].name, "b"); + ASSERT_EQ(plugin_info[1].enabled, true); + ASSERT_EQ(plugin_info[2].name, "c"); + ASSERT_EQ(plugin_info[2].enabled, true); +} + +// Test calling disable on an already disabled plugin is ok. +TEST_F(PluginManagerTest, SystemRuntimePluginDisableDisabled) { + RegisterMockSystemRuntimePlugins(); + + // Initial call to disable the plugin should succeed. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", false)); + + // The second call should also succeed because the plugin is already disabled. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", false)); + + // The call to re-enable the plugin should succeed. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", true)); + + // The second call should also succeed since the plugin is already enabled. 
+ ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", true)); +} + +// Test that enabling or disabling a non-existent plugin fails. +TEST_F(PluginManagerTest, SystemRuntimePluginDisableNonExistent) { + RegisterMockSystemRuntimePlugins(); + + // Both enable and disable should return false for a non-existent plugin. + ASSERT_FALSE( + PluginManager::SetSystemRuntimePluginEnabled("does_not_exist", true)); + ASSERT_FALSE( + PluginManager::SetSystemRuntimePluginEnabled("does_not_exist", false)); +} + +// Test disabling all plugins and then re-enabling them in a different +// order will restore the original plugin order. +TEST_F(PluginManagerTest, SystemRuntimePluginDisableAll) { + RegisterMockSystemRuntimePlugins(); + + // Validate initial state of registered plugins. + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginB); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), + CreateSystemRuntimePluginC); + + // Disable all the active plugins. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("a", false)); + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", false)); + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("c", false)); + + // Should have no active plugins. + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), nullptr); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), nullptr); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), nullptr); + + // And show up in the plugin info correctly. + std::vector<RegisteredPluginInfo> plugin_info = + PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 3u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[0].enabled, false); + ASSERT_EQ(plugin_info[1].name, "b"); + ASSERT_EQ(plugin_info[1].enabled, false); + ASSERT_EQ(plugin_info[2].name, "c"); + ASSERT_EQ(plugin_info[2].enabled, false); + + // Enable plugins in reverse order and validate expected indices. + // They should show up in the original plugin order. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("c", true)); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginC); + + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("a", true)); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginC); + + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", true)); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginB); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), + CreateSystemRuntimePluginC); +} + +// Test un-registering a disabled plugin works. +TEST_F(PluginManagerTest, UnRegisterDisabledSystemRuntimePlugin) { + RegisterMockSystemRuntimePlugins(); + + // Initial plugin info has all three registered plugins. + std::vector<RegisteredPluginInfo> plugin_info = + PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 3u); + + // First disable a plugin, then unregister it. Both should succeed.
+ ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("b", false)); + ASSERT_TRUE(PluginManager::UnregisterPlugin(CreateSystemRuntimePluginB)); + + // After un-registering a plugin it should be removed from plugin info. + plugin_info = PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(plugin_info.size(), 2u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[0].enabled, true); + ASSERT_EQ(plugin_info[1].name, "c"); + ASSERT_EQ(plugin_info[1].enabled, true); +} + +// Test un-registering and then re-registering a plugin will change the order of +// loaded plugins. +TEST_F(PluginManagerTest, UnRegisterSystemRuntimePluginChangesOrder) { + RegisterMockSystemRuntimePlugins(); + + std::vector plugin_info = + PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginB); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), + CreateSystemRuntimePluginC); + + ASSERT_EQ(plugin_info.size(), 3u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[1].name, "b"); + ASSERT_EQ(plugin_info[2].name, "c"); + + // Unregister and then registering a plugin puts it at the end of the order + // list. + ASSERT_TRUE(PluginManager::UnregisterPlugin(CreateSystemRuntimePluginB)); + ASSERT_TRUE(PluginManager::RegisterPlugin("b", "New test instance B", + CreateSystemRuntimePluginB)); + + // Check the callback indices match as expected. + plugin_info = PluginManager::GetSystemRuntimePluginInfo(); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginC); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), + CreateSystemRuntimePluginB); + + // And plugin info should match as well. + ASSERT_EQ(plugin_info.size(), 3u); + ASSERT_EQ(plugin_info[0].name, "a"); + ASSERT_EQ(plugin_info[1].name, "c"); + ASSERT_EQ(plugin_info[2].name, "b"); + ASSERT_EQ(plugin_info[2].description, "New test instance B"); + + // Disabling and re-enabling the "c" plugin should slot it back + // into the middle of the order. Originally it was last, but after + // un-registering and re-registering "b" it should now stay in + // the middle of the order. + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("c", false)); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginB); + + // And re-enabling + ASSERT_TRUE(PluginManager::SetSystemRuntimePluginEnabled("c", true)); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(0), + CreateSystemRuntimePluginA); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(1), + CreateSystemRuntimePluginC); + ASSERT_EQ(PluginManager::GetSystemRuntimeCreateCallbackAtIndex(2), + CreateSystemRuntimePluginB); +} From 564e04b703dc5df062f862e32c00bf1a1716f96f Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 2 Apr 2025 15:16:33 -0500 Subject: [PATCH 0436/1029] [flang][OpenMP] Use function symbol on DECLARE TARGET (#134107) Consider: ``` function foo() !$omp declare target(foo) ! This `foo` was a function-result symbol ... 
end ``` When resolving symbols, for this case use the symbol corresponding to the function instead of the symbol corresponding to the function result. Currently, this will result in an error: ``` error: A variable that appears in a DECLARE TARGET directive must be declared in the scope of a module or have the SAVE attribute, either explicitly or implicitly ``` --- flang/lib/Semantics/resolve-directives.cpp | 9 +++++ flang/lib/Semantics/unparse-with-symbols.cpp | 8 +++++ .../OpenMP/declare-target-func-and-subr.f90 | 7 ++++ ...lare-target-function-name-with-symbols.f90 | 34 +++++++++++++++++++ 4 files changed, 58 insertions(+) create mode 100644 flang/test/Semantics/OpenMP/declare-target-function-name-with-symbols.f90 diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index f905da0a7239d..a5b3391859500 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2514,6 +2514,15 @@ void OmpAttributeVisitor::ResolveOmpObject( name->ToString()); } } + if (ompFlag == Symbol::Flag::OmpDeclareTarget) { + if (symbol->IsFuncResult()) { + if (Symbol * func{currScope().symbol()}) { + CHECK(func->IsSubprogram()); + func->set(ompFlag); + name->symbol = func; + } + } + } if (GetContext().directive == llvm::omp::Directive::OMPD_target_data) { checkExclusivelists(symbol, Symbol::Flag::OmpUseDevicePtr, diff --git a/flang/lib/Semantics/unparse-with-symbols.cpp b/flang/lib/Semantics/unparse-with-symbols.cpp index 02afb89ae57fa..2716d88efb9fb 100644 --- a/flang/lib/Semantics/unparse-with-symbols.cpp +++ b/flang/lib/Semantics/unparse-with-symbols.cpp @@ -61,6 +61,14 @@ class SymbolDumpVisitor { currStmt_ = std::nullopt; } + bool Pre(const parser::OpenMPDeclareTargetConstruct &x) { + currStmt_ = x.source; + return true; + } + void Post(const parser::OpenMPDeclareTargetConstruct &) { + currStmt_ = std::nullopt; + } + private: std::optional currStmt_; // current statement we are processing std::multimap symbols_; // location to symbol diff --git a/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 b/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 index db8320a598052..1c43f1d09eddb 100644 --- a/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 +++ b/flang/test/Lower/OpenMP/declare-target-func-and-subr.f90 @@ -85,6 +85,13 @@ FUNCTION FUNC_DEFAULT_EXTENDEDLIST() RESULT(I) I = 1 END FUNCTION FUNC_DEFAULT_EXTENDEDLIST +! ALL-LABEL: func.func @_QPfunc_name_as_result() +! ALL-SAME: {{.*}}attributes {omp.declare_target = #omp.declaretarget{{.*}} +FUNCTION FUNC_NAME_AS_RESULT() +!$omp declare target(FUNC_NAME_AS_RESULT) + FUNC_NAME_AS_RESULT = 1.0 +END FUNCTION FUNC_NAME_AS_RESULT + !! ----- ! Check specification valid forms of declare target with subroutines diff --git a/flang/test/Semantics/OpenMP/declare-target-function-name-with-symbols.f90 b/flang/test/Semantics/OpenMP/declare-target-function-name-with-symbols.f90 new file mode 100644 index 0000000000000..9a0acdb3dd100 --- /dev/null +++ b/flang/test/Semantics/OpenMP/declare-target-function-name-with-symbols.f90 @@ -0,0 +1,34 @@ +!RUN: %flang_fc1 -fdebug-unparse-with-symbols -fopenmp %s 2>&1 | FileCheck %s + +! This used to crash. 
+ +module test + contains + function ex(a, b, c) + !$omp declare target(ex) + integer :: a, b, c + ex = a + b + c + end function ex +end module test + +!CHECK: !DEF: /test Module +!CHECK: module test +!CHECK: contains +!CHECK: !DEF: /test/ex PUBLIC (Function, OmpDeclareTarget) Subprogram REAL(4) +!CHECK: !DEF: /test/ex/a ObjectEntity INTEGER(4) +!CHECK: !DEF: /test/ex/b ObjectEntity INTEGER(4) +!CHECK: !DEF: /test/ex/c ObjectEntity INTEGER(4) +!CHECK: function ex(a, b, c) +!CHECK: !$omp declare target (ex) +!CHECK: !REF: /test/ex/a +!CHECK: !REF: /test/ex/b +!CHECK: !REF: /test/ex/c +!CHECK: integer a, b, c +!CHECK: !DEF: /test/ex/ex (Implicit, OmpDeclareTarget) ObjectEntity REAL(4) +!CHECK: !REF: /test/ex/a +!CHECK: !REF: /test/ex/b +!CHECK: !REF: /test/ex/c +!CHECK: ex = a+b+c +!CHECK: end function ex +!CHECK: end module test + From df9e5ae5b40c4d245d904a2565e46f5b7ab9c7c8 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 2 Apr 2025 21:21:52 +0100 Subject: [PATCH 0437/1029] [InstCombine] Match scalable splats in m_ImmConstant (#132522) #118806 fixed an infinite loop in FoldShiftByConstant that could occur when the shift amount was a ConstantExpr. However this meant that FoldShiftByConstant no longer kicked in for scalable vectors because scalable splats are represented by ConstantExprs. This fixes it by allowing scalable splats of non-ConstantExprs in m_ImmConstant, which also fixes a few other test cases where scalable splats were being missed. But I'm also hoping that UseConstantIntForScalableSplat will eventually remove the need for this. I noticed this when trying to reverse a combine on RISC-V in #132245, and saw that the resulting vector and scalar forms were different. --------- Co-authored-by: Yingwei Zheng --- llvm/include/llvm/IR/PatternMatch.h | 51 +++++++++++++++---- llvm/test/Transforms/InstCombine/select.ll | 3 +- llvm/test/Transforms/InstCombine/shl-bo.ll | 11 ++++ .../InstCombine/shl-twice-constant.ll | 11 ++++ llvm/test/Transforms/InstCombine/sub.ll | 4 +- 5 files changed, 66 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index b3eeb1d7ba88a..2d27c19e1b85e 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -858,18 +858,51 @@ inline bind_ty m_BasicBlock(const BasicBlock *&V) { return V; } +// TODO: Remove once UseConstant{Int,FP}ForScalableSplat is enabled by default, +// and use m_Unless(m_ConstantExpr). +struct immconstant_ty { + template static bool isImmConstant(ITy *V) { + if (auto *CV = dyn_cast(V)) { + if (!isa(CV) && !CV->containsConstantExpression()) + return true; + + if (CV->getType()->isVectorTy()) { + if (auto *Splat = CV->getSplatValue(/*AllowPoison=*/true)) { + if (!isa(Splat) && + !Splat->containsConstantExpression()) { + return true; + } + } + } + } + return false; + } +}; + +struct match_immconstant_ty : immconstant_ty { + template bool match(ITy *V) { return isImmConstant(V); } +}; + /// Match an arbitrary immediate Constant and ignore it. -inline match_combine_and, - match_unless> -m_ImmConstant() { - return m_CombineAnd(m_Constant(), m_Unless(m_ConstantExpr())); -} +inline match_immconstant_ty m_ImmConstant() { return match_immconstant_ty(); } + +struct bind_immconstant_ty : immconstant_ty { + Constant *&VR; + + bind_immconstant_ty(Constant *&V) : VR(V) {} + + template bool match(ITy *V) { + if (isImmConstant(V)) { + VR = cast(V); + return true; + } + return false; + } +}; /// Match an immediate Constant, capturing the value if we match. 
-inline match_combine_and, - match_unless> -m_ImmConstant(Constant *&C) { - return m_CombineAnd(m_Constant(C), m_Unless(m_ConstantExpr())); +inline bind_immconstant_ty m_ImmConstant(Constant *&C) { + return bind_immconstant_ty(C); } /// Match a specified Value*. diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 2078b795817f8..3d81b72dd232e 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3519,8 +3519,7 @@ define @scalable_sign_bits( %x) { define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( -; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], splat (i32 1) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[A]], splat (i32 57) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[X:%.*]], splat (i32 56) ; CHECK-NEXT: ret [[CMP]] ; %a = or %x, splat (i32 1) diff --git a/llvm/test/Transforms/InstCombine/shl-bo.ll b/llvm/test/Transforms/InstCombine/shl-bo.ll index c32ac2eacb25a..5ee8716d5d119 100644 --- a/llvm/test/Transforms/InstCombine/shl-bo.ll +++ b/llvm/test/Transforms/InstCombine/shl-bo.ll @@ -656,3 +656,14 @@ define <16 x i8> @test_FoldShiftByConstant_CreateAnd(<16 x i8> %in0) { %vshl_n = shl <16 x i8> %tmp, ret <16 x i8> %vshl_n } + +define @test_FoldShiftByConstant_CreateAnd_scalable( %x) { +; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd_scalable( +; CHECK-NEXT: [[TMP1:%.*]] = shl [[X:%.*]], splat (i8 2) +; CHECK-NEXT: [[TMP2:%.*]] = and [[TMP1]], splat (i8 8) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = and %x, splat (i8 2) + %2 = shl %1, splat (i8 2) + ret %2 +} diff --git a/llvm/test/Transforms/InstCombine/shl-twice-constant.ll b/llvm/test/Transforms/InstCombine/shl-twice-constant.ll index bbdd7fa3d1c40..151db29fe3e5f 100644 --- a/llvm/test/Transforms/InstCombine/shl-twice-constant.ll +++ b/llvm/test/Transforms/InstCombine/shl-twice-constant.ll @@ -14,3 +14,14 @@ define i64 @testfunc() { %shl2 = shl i64 %shl1, ptrtoint (ptr @c to i64) ret i64 %shl2 } + +define @scalable() { +; CHECK-LABEL: @scalable( +; CHECK-NEXT: [[SHL1:%.*]] = shl nuw splat (i64 1), shufflevector ( insertelement ( poison, i64 ptrtoint (ptr @c2 to i64), i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[SHL2:%.*]] = shl [[SHL1]], shufflevector ( insertelement ( poison, i64 ptrtoint (ptr @c to i64), i64 0), poison, zeroinitializer) +; CHECK-NEXT: ret [[SHL2]] +; + %shl1 = shl splat (i64 1), splat (i64 ptrtoint (ptr @c2 to i64)) + %shl2 = shl %shl1, splat (i64 ptrtoint (ptr @c to i64)) + ret %shl2 +} diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index e89419d1f3838..81ecd8506514e 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -857,11 +857,9 @@ define <2 x i16> @test44vecminval(<2 x i16> %x) { ret <2 x i16> %sub } -; FIXME: This isn't combined to xor as above because the pattern in visitSub -; uses m_ImmConstant which matches Constant but (explicitly) not ConstantExpr. 
define @test44scalablevecminval( %x) { ; CHECK-LABEL: @test44scalablevecminval( -; CHECK-NEXT: [[SUB:%.*]] = add [[X:%.*]], splat (i16 -32768) +; CHECK-NEXT: [[SUB:%.*]] = xor [[X:%.*]], splat (i16 -32768) ; CHECK-NEXT: ret [[SUB]] ; %sub = sub nsw %x, splat (i16 -32768) From 2bee24632f38699f1af8fdf4daa5b28053c7ae5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Wed, 2 Apr 2025 21:26:41 +0100 Subject: [PATCH 0438/1029] [mlir][bugfix] Fix erroneous condition in `getEffectsOnResource` (#133638) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch corrects an invalid condition in `getEffectsOnResource` used to identify relevant "resources": ```cpp return it.getResource() != resource; ``` The current implementation assumes that only one instance of each resource will exist, so comparing raw pointers is both safe and sufficient. This assumption stems from constructs like: ```cpp static DerivedResource *get() { static DerivedResource instance; return &instance; } ``` i.e., resource instances returned via static singleton methods. However, as discussed in * https://github.com/llvm/llvm-project/issues/129216, this assumption breaks in practice — notably on macOS (Apple Silicon) when built with: * `-DBUILD_SHARED_LIBS=On`. In such cases, multiple instances of the same logical resource may exist across shared library boundaries, leading to incorrect behavior and causing failures in tests like: * test/Dialect/Transform/check-use-after-free.mlir This patch replaces the pointer comparison with a comparison based on resource identity: ```cpp return it.getResource()->getResourceID() != resource->getResourceID(); ``` This approach aligns better with the intent of `getEffectsOnResource`, which is to: ```cpp /// Collect all of the effect instances that operate on the provided /// resource (...) ``` Fixes #129216 --- mlir/include/mlir/Interfaces/SideEffectInterfaceBase.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Interfaces/SideEffectInterfaceBase.td b/mlir/include/mlir/Interfaces/SideEffectInterfaceBase.td index 45a9ffa94363e..043829c24fda8 100644 --- a/mlir/include/mlir/Interfaces/SideEffectInterfaceBase.td +++ b/mlir/include/mlir/Interfaces/SideEffectInterfaceBase.td @@ -140,7 +140,7 @@ class EffectOpInterfaceBase }] # baseEffect # [{>> & effects) { getEffects(effects); ::llvm::erase_if(effects, [&](auto &it) { - return it.getResource() != resource; + return it.getResource()->getResourceID() != resource->getResourceID(); }); } }]; From bb8a7a7349f9842e587cb43b2a81b3d46c1e70ef Mon Sep 17 00:00:00 2001 From: erichkeane Date: Wed, 2 Apr 2025 13:33:01 -0700 Subject: [PATCH 0439/1029] [OpenACC] Implement 'pqr-list' has at least one item. OpenACC GitHub PR#499 defines the pqr-list as having at least 1 item. We already handle that for all but 'wait', so this patch just does the work to add it for 'wait', plus adds tests.
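As a quick illustration (based on the parser tests updated below, not on any new semantics), after this change an empty wait argument list is rejected, while omitting the list entirely remains valid:

```cpp
// Illustrative only; the diagnostics shown are the ones exercised by
// parse-wait-clause.c and parse-wait-construct.c in this patch.
void uses(int i, int j) {
#pragma acc parallel wait // OK: a bare 'wait' clause has no argument list
  {}
#pragma acc parallel wait(i + j) // OK: list has at least one async-argument
  {}
#if 0 // each of these now diagnoses "expected expression"
#pragma acc parallel wait()
#pragma acc parallel wait(queues:)
#pragma acc parallel wait(devnum : i + j : queues:)
#endif
}
```

The same rule applies to the standalone `#pragma acc wait` directive, which is why the wait-construct tests gain matching "expected expression" diagnostics.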
--- clang/lib/Parse/ParseOpenACC.cpp | 27 +- .../ast-print-openacc-combined-construct.cpp | 4 +- .../ast-print-openacc-compute-construct.cpp | 4 +- .../AST/ast-print-openacc-data-construct.cpp | 6 +- .../AST/ast-print-openacc-wait-construct.cpp | 4 +- clang/test/ParserOpenACC/parse-wait-clause.c | 9 + .../test/ParserOpenACC/parse-wait-construct.c | 9 + .../combined-construct-wait-ast.cpp | 4 +- .../combined-construct-wait-clause.c | 3 - .../compute-construct-intexpr-clause-ast.cpp | 4 +- .../compute-construct-wait-clause.c | 3 - .../SemaOpenACC/data-construct-wait-ast.cpp | 4 +- .../SemaOpenACC/data-construct-wait-clause.c | 2 +- clang/test/SemaOpenACC/no-empty-pqr-list.cpp | 437 ++++++++++++++++++ clang/test/SemaOpenACC/update-construct.cpp | 1 - clang/test/SemaOpenACC/wait-construct-ast.cpp | 4 +- 16 files changed, 493 insertions(+), 32 deletions(-) create mode 100644 clang/test/SemaOpenACC/no-empty-pqr-list.cpp diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 6c854ddcb505b..4f4ae362983d0 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -1274,19 +1274,32 @@ Parser::ParseOpenACCWaitArgument(SourceLocation Loc, bool IsDirective) { ConsumeToken(); } + + // OpenACC 3.3, section 2.16: // the term 'async-argument' means a nonnegative scalar integer expression, or // one of the special values 'acc_async_noval' or 'acc_async_sync', as defined // in the C header file and the Fortran opacc module. - bool FirstArg = true; + OpenACCIntExprParseResult Res = ParseOpenACCAsyncArgument( + IsDirective ? OpenACCDirectiveKind::Wait + : OpenACCDirectiveKind::Invalid, + IsDirective ? OpenACCClauseKind::Invalid : OpenACCClauseKind::Wait, + Loc); + + if (Res.first.isInvalid() && + Res.second == OpenACCParseCanContinue::Cannot) { + Result.Failed = true; + return Result; + } + + if (Res.first.isUsable()) + Result.QueueIdExprs.push_back(Res.first.get()); + while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) { - if (!FirstArg) { - if (ExpectAndConsume(tok::comma)) { - Result.Failed = true; - return Result; - } + if (ExpectAndConsume(tok::comma)) { + Result.Failed = true; + return Result; } - FirstArg = false; OpenACCIntExprParseResult Res = ParseOpenACCAsyncArgument( IsDirective ? 
OpenACCDirectiveKind::Wait diff --git a/clang/test/AST/ast-print-openacc-combined-construct.cpp b/clang/test/AST/ast-print-openacc-combined-construct.cpp index 25fa29cbbe04e..b5afc1515aa18 100644 --- a/clang/test/AST/ast-print-openacc-combined-construct.cpp +++ b/clang/test/AST/ast-print-openacc-combined-construct.cpp @@ -139,8 +139,8 @@ void foo() { #pragma acc kernels loop deviceptr(iPtr, arrayPtr[0]) for(int i = 0;i<5;++i); -// CHECK: #pragma acc parallel loop wait() -#pragma acc parallel loop wait() +// CHECK: #pragma acc parallel loop wait +#pragma acc parallel loop wait for(int i = 0;i<5;++i); // CHECK: #pragma acc parallel loop wait(*iPtr, i) diff --git a/clang/test/AST/ast-print-openacc-compute-construct.cpp b/clang/test/AST/ast-print-openacc-compute-construct.cpp index fe580c86ac8ea..9516bfd843000 100644 --- a/clang/test/AST/ast-print-openacc-compute-construct.cpp +++ b/clang/test/AST/ast-print-openacc-compute-construct.cpp @@ -88,8 +88,8 @@ void foo() { #pragma acc parallel wait while(true); -// CHECK: #pragma acc parallel wait() -#pragma acc parallel wait() +// CHECK: #pragma acc parallel wait +#pragma acc parallel wait while(true); // CHECK: #pragma acc parallel wait(*iPtr, i) diff --git a/clang/test/AST/ast-print-openacc-data-construct.cpp b/clang/test/AST/ast-print-openacc-data-construct.cpp index f568ae5ce6346..6d6f54cb45ada 100644 --- a/clang/test/AST/ast-print-openacc-data-construct.cpp +++ b/clang/test/AST/ast-print-openacc-data-construct.cpp @@ -50,11 +50,11 @@ void foo() { #pragma acc exit data copyout(i) async // CHECK: #pragma acc data default(none) wait -#pragma acc data default(none) wait() +#pragma acc data default(none) wait ; -// CHECK: #pragma acc enter data copyin(Var) wait() -#pragma acc enter data copyin(Var) wait() +// CHECK: #pragma acc enter data copyin(Var) wait +#pragma acc enter data copyin(Var) wait // CHECK: #pragma acc exit data copyout(Var) wait(*iPtr, i) #pragma acc exit data copyout(Var) wait(*iPtr, i) diff --git a/clang/test/AST/ast-print-openacc-wait-construct.cpp b/clang/test/AST/ast-print-openacc-wait-construct.cpp index 35354596be8d0..8223792277d8f 100644 --- a/clang/test/AST/ast-print-openacc-wait-construct.cpp +++ b/clang/test/AST/ast-print-openacc-wait-construct.cpp @@ -5,8 +5,8 @@ void uses() { int I; float array[5]; -// CHECK: #pragma acc wait() if(I == array[I]) -#pragma acc wait() if(I == array[I]) +// CHECK: #pragma acc wait if(I == array[I]) +#pragma acc wait if(I == array[I]) // CHECK: #pragma acc wait(*iPtr, I) async #pragma acc wait(*iPtr, I) async diff --git a/clang/test/ParserOpenACC/parse-wait-clause.c b/clang/test/ParserOpenACC/parse-wait-clause.c index 9c7faa5c02eb3..16e31a67c094f 100644 --- a/clang/test/ParserOpenACC/parse-wait-clause.c +++ b/clang/test/ParserOpenACC/parse-wait-clause.c @@ -10,14 +10,17 @@ void func() { #pragma acc parallel wait clause-list {} + // expected-error@+3{{expected expression}} // expected-error@+2{{expected ')'}} // expected-note@+1{{to match this '('}} #pragma acc parallel wait ( {} + // expected-error@+1{{expected expression}} #pragma acc parallel wait () {} + // expected-error@+2{{expected expression}} // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc parallel wait () clause-list {} @@ -52,26 +55,32 @@ void func() { #pragma acc parallel wait (devnum: i + j) clause-list {} + // expected-error@+3{{expected expression}} // expected-error@+2{{expected ')'}} // expected-note@+1{{to match this '('}} #pragma acc parallel wait (queues: {} + // expected-error@+1{{expected expression}} 
#pragma acc parallel wait (queues:) {} + // expected-error@+2{{expected expression}} // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc parallel wait (queues:) clause-list {} + // expected-error@+3{{expected expression}} // expected-error@+2{{expected ')'}} // expected-note@+1{{to match this '('}} #pragma acc parallel wait (devnum: i + j:queues: {} + // expected-error@+1{{expected expression}} #pragma acc parallel wait (devnum: i + j:queues:) {} + // expected-error@+2{{expected expression}} // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc parallel wait (devnum: i + j:queues:) clause-list {} diff --git a/clang/test/ParserOpenACC/parse-wait-construct.c b/clang/test/ParserOpenACC/parse-wait-construct.c index 17b7ecbde7856..491c3bee4ac5a 100644 --- a/clang/test/ParserOpenACC/parse-wait-construct.c +++ b/clang/test/ParserOpenACC/parse-wait-construct.c @@ -8,12 +8,15 @@ void func() { // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc wait clause-list + // expected-error@+3{{expected expression}} // expected-error@+2{{expected ')'}} // expected-note@+1{{to match this '('}} #pragma acc wait ( + // expected-error@+1{{expected expression}} #pragma acc wait () + // expected-error@+2{{expected expression}} // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc wait () clause-list @@ -41,21 +44,27 @@ void func() { // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc wait (devnum: i + j) clause-list + // expected-error@+3{{expected expression}} // expected-error@+2{{expected ')'}} // expected-note@+1{{to match this '('}} #pragma acc wait (queues: + // expected-error@+1{{expected expression}} #pragma acc wait (queues:) + // expected-error@+2{{expected expression}} // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc wait (queues:) clause-list + // expected-error@+3{{expected expression}} // expected-error@+2{{expected ')'}} // expected-note@+1{{to match this '('}} #pragma acc wait (devnum: i + j:queues: + // expected-error@+1{{expected expression}} #pragma acc wait (devnum: i + j:queues:) + // expected-error@+2{{expected expression}} // expected-error@+1{{invalid OpenACC clause 'clause'}} #pragma acc wait (devnum: i + j:queues:) clause-list diff --git a/clang/test/SemaOpenACC/combined-construct-wait-ast.cpp b/clang/test/SemaOpenACC/combined-construct-wait-ast.cpp index 0620901fd9fdb..ca28e32661ff7 100644 --- a/clang/test/SemaOpenACC/combined-construct-wait-ast.cpp +++ b/clang/test/SemaOpenACC/combined-construct-wait-ast.cpp @@ -20,7 +20,7 @@ void NormalUses() { // CHECK-NEXT: wait clause // CHECK-NEXT: <<>> // CHECK-NEXT: ForStmt -#pragma acc serial loop wait() +#pragma acc serial loop wait for (int i = 0; i < 5; ++i) {} // CHECK: OpenACCCombinedConstruct{{.*}}serial loop // CHECK-NEXT: wait clause @@ -105,7 +105,7 @@ void TemplUses(U u) { // CHECK-NEXT: <<>> // CHECK-NEXT: ForStmt -#pragma acc serial loop wait() +#pragma acc serial loop wait for (int i = 0; i < 5; ++i) {} // CHECK: OpenACCCombinedConstruct{{.*}}serial loop // CHECK-NEXT: wait clause diff --git a/clang/test/SemaOpenACC/combined-construct-wait-clause.c b/clang/test/SemaOpenACC/combined-construct-wait-clause.c index 7551dd1840963..61aa5bc6a4a0a 100644 --- a/clang/test/SemaOpenACC/combined-construct-wait-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-wait-clause.c @@ -10,9 +10,6 @@ void uses() { #pragma acc parallel loop wait for (unsigned i = 0; i < 5; ++i); -#pragma acc serial loop wait() - for (unsigned i = 0; i < 5; ++i); - #pragma 
acc kernels loop wait(getS(), getI()) for (unsigned i = 0; i < 5; ++i); diff --git a/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp b/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp index baf7aa62c5f7f..58476dfcee06d 100644 --- a/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp +++ b/clang/test/SemaOpenACC/compute-construct-intexpr-clause-ast.cpp @@ -144,7 +144,7 @@ void NormalUses() { // CHECK-NEXT: WhileStmt // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: CompoundStmt -#pragma acc parallel wait() +#pragma acc parallel wait while (true){} // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel // CHECK-NEXT: wait clause @@ -378,7 +378,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: CompoundStmt -#pragma acc parallel wait() +#pragma acc parallel wait while (true){} // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel // CHECK-NEXT: wait clause diff --git a/clang/test/SemaOpenACC/compute-construct-wait-clause.c b/clang/test/SemaOpenACC/compute-construct-wait-clause.c index df82740a465a8..7aba24243e8e6 100644 --- a/clang/test/SemaOpenACC/compute-construct-wait-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-wait-clause.c @@ -10,9 +10,6 @@ void uses() { #pragma acc parallel wait while(1); -#pragma acc serial wait() - while(1); - #pragma acc kernels wait(getS(), getI()) while(1); diff --git a/clang/test/SemaOpenACC/data-construct-wait-ast.cpp b/clang/test/SemaOpenACC/data-construct-wait-ast.cpp index 7fb82313669df..2a8e56bc0b363 100644 --- a/clang/test/SemaOpenACC/data-construct-wait-ast.cpp +++ b/clang/test/SemaOpenACC/data-construct-wait-ast.cpp @@ -26,7 +26,7 @@ void NormalUses() { // CHECK-NEXT: wait clause // CHECK-NEXT: <<>> // CHECK-NEXT: NullStmt -#pragma acc enter data copyin(I) wait() +#pragma acc enter data copyin(I) wait // CHECK: OpenACCEnterDataConstruct{{.*}}enter data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'I' 'int' @@ -119,7 +119,7 @@ void TemplUses(U u) { // CHECK-NEXT: <<>> // CHECK-NEXT: NullStmt -#pragma acc enter data copyin(I) wait() +#pragma acc enter data copyin(I) wait // CHECK: OpenACCEnterDataConstruct{{.*}}enter data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'I' 'U' diff --git a/clang/test/SemaOpenACC/data-construct-wait-clause.c b/clang/test/SemaOpenACC/data-construct-wait-clause.c index cef2dbdca29ed..91ef97e3fa749 100644 --- a/clang/test/SemaOpenACC/data-construct-wait-clause.c +++ b/clang/test/SemaOpenACC/data-construct-wait-clause.c @@ -10,7 +10,7 @@ void uses() { #pragma acc data copyin(arr[0]) wait ; -#pragma acc enter data copyin(arr[0]) wait() +#pragma acc enter data copyin(arr[0]) wait #pragma acc exit data copyout(arr[0]) wait(getS(), getI()) diff --git a/clang/test/SemaOpenACC/no-empty-pqr-list.cpp b/clang/test/SemaOpenACC/no-empty-pqr-list.cpp new file mode 100644 index 0000000000000..fdac89646a5f8 --- /dev/null +++ b/clang/test/SemaOpenACC/no-empty-pqr-list.cpp @@ -0,0 +1,437 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +void Compute() { + // expected-error@+1{{expected expression}} +#pragma acc parallel wait() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial wait() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels wait() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel num_gangs() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial num_gangs() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels num_gangs() + ; + + // 
expected-error@+1{{expected expression}} +#pragma acc parallel num_workers() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial num_workers() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels num_workers() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel vector_length() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial vector_length() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels vector_length() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel reduction(+:) + ; + // expected-error@+1{{expected expression}} +#pragma acc serial reduction(+:) + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel copy() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial copy() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels copy() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel copyin() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial copyin(readonly:) + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels copyin() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel copyout() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial copyout(zero:) + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels copyout() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel create() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial create(zero:) + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels create() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel no_create() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial no_create() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels no_create() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel present() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial present() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels present() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel deviceptr() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial deviceptr() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels deviceptr() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel attach() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial attach() + ; + // expected-error@+1{{expected expression}} +#pragma acc kernels attach() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel private() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial private() + ; + + // expected-error@+1{{expected expression}} +#pragma acc parallel firstprivate() + ; + // expected-error@+1{{expected expression}} +#pragma acc serial firstprivate() + ; + + // expected-error@+1{{expected identifier}} +#pragma acc parallel device_type() + ; + // expected-error@+1{{expected identifier}} +#pragma acc serial device_type() + ; + // expected-error@+1{{expected identifier}} +#pragma acc kernels device_type() + ; +} + +void Data(int i) { + // expected-error@+1{{expected expression}} +#pragma acc data default(none) wait() + // expected-error@+1{{expected expression}} +#pragma acc enter data copyin(i) wait() + // expected-error@+1{{expected expression}} +#pragma acc exit data copyout(i) wait() + + // 
expected-error@+1{{expected identifier}} +#pragma acc data default(none) device_type() + + // expected-error@+1{{expected expression}} +#pragma acc data copy() + + // expected-error@+1{{expected expression}} +#pragma acc data copyin() + // expected-error@+1{{expected expression}} +#pragma acc enter data copyin() + + // expected-error@+1{{expected expression}} +#pragma acc data copyout() + // expected-error@+1{{expected expression}} +#pragma acc exit data copyout() + + // expected-error@+1{{expected expression}} +#pragma acc exit data delete() + + // expected-error@+1{{expected expression}} +#pragma acc exit data detach() + + // expected-error@+1{{expected expression}} +#pragma acc data create() + // expected-error@+1{{expected expression}} +#pragma acc enter data create() + + // expected-error@+1{{expected expression}} +#pragma acc data default(none) no_create() + + // expected-error@+1{{expected expression}} +#pragma acc data present() + + // expected-error@+1{{expected expression}} +#pragma acc data deviceptr() + + // expected-error@+1{{expected expression}} +#pragma acc data attach() + // expected-error@+1{{expected expression}} +#pragma acc enter data attach() + + // expected-error@+1{{expected expression}} +#pragma acc host_data use_device() + ; +} + +void Executable(int i) { + // expected-error@+1{{expected identifier}} +#pragma acc init device_type() + // expected-error@+1{{expected identifier}} +#pragma acc shutdown device_type() + // expected-error@+1{{expected identifier}} +#pragma acc set if(true) device_type() + // expected-error@+1{{expected identifier}} +#pragma acc update self(i) device_type() + + // expected-error@+1{{expected expression}} +#pragma acc update self(i) wait() + // expected-error@+1{{expected expression}} +#pragma acc update self() + // expected-error@+1{{expected expression}} +#pragma acc update host() + // expected-error@+1{{expected expression}} +#pragma acc update device() + + // expected-error@+1{{expected expression}} +#pragma acc wait() +} + +void Other() { + // expected-error@+1{{expected expression}} +#pragma acc loop gang() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc loop worker() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc loop vector() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc loop tile() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected identifier}} +#pragma acc loop device_type() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc loop private() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc loop reduction(+:) + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc cache() + + // expected-error@+1{{expected expression}} +#pragma acc declare copy() + // expected-error@+1{{expected expression}} +#pragma acc declare copyin() + // expected-error@+1{{expected expression}} +#pragma acc declare copyout() + // expected-error@+1{{expected expression}} +#pragma acc declare create() + // expected-error@+1{{expected expression}} +#pragma acc declare present() + // expected-error@+1{{expected expression}} +#pragma acc declare deviceptr() + // expected-error@+1{{expected expression}} +#pragma acc declare device_resident() + // expected-error@+1{{expected expression}} +#pragma acc declare link() + + auto L1 =[]{}; + + // expected-error@+1{{expected identifier}} +#pragma acc routine(L1) seq 
device_type() + + // expected-error@+1{{expected identifier}} +#pragma acc routine seq device_type() + auto L2 =[]{}; +} + +void Combined() { + // expected-error@+1{{expected expression}} +#pragma acc parallel loop gang() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop gang() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop gang() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop tile() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop tile() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop tile() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected identifier}} +#pragma acc parallel loop device_type() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected identifier}} +#pragma acc serial loop device_type() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected identifier}} +#pragma acc kernels loop device_type() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop reduction(+:) + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop reduction(+:) + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop reduction(+:) + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop wait() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop wait() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop wait() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop num_gangs() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop num_gangs() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop copy() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop copy() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop copy() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop copyin() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop copyin() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop copyin() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop copyout() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop copyout() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop copyout() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop create() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop create() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop create() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop no_create() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected 
expression}} +#pragma acc serial loop no_create() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop present() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop present() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop present() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop deviceptr() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop deviceptr() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop deviceptr() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop attach() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop attach() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop attach() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop private() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop private() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc kernels loop private() + for(int i = 0; i < 5; ++i); + + // expected-error@+1{{expected expression}} +#pragma acc parallel loop firstprivate() + for(int i = 0; i < 5; ++i); + // expected-error@+1{{expected expression}} +#pragma acc serial loop firstprivate() + for(int i = 0; i < 5; ++i); +} diff --git a/clang/test/SemaOpenACC/update-construct.cpp b/clang/test/SemaOpenACC/update-construct.cpp index 30c079c8befd4..92a8f3a0e2d4c 100644 --- a/clang/test/SemaOpenACC/update-construct.cpp +++ b/clang/test/SemaOpenACC/update-construct.cpp @@ -96,7 +96,6 @@ void uses() { // Checking for 'wait', which has a complicated set arguments. 
#pragma acc update self(Var) wait -#pragma acc update self(Var) wait() #pragma acc update self(Var) wait(getI(), getI()) #pragma acc update self(Var) wait(devnum: getI(): getI()) #pragma acc update self(Var) wait(devnum: getI(): queues: getI(), getI()) diff --git a/clang/test/SemaOpenACC/wait-construct-ast.cpp b/clang/test/SemaOpenACC/wait-construct-ast.cpp index 58214f1f7c886..795c5a2e6401e 100644 --- a/clang/test/SemaOpenACC/wait-construct-ast.cpp +++ b/clang/test/SemaOpenACC/wait-construct-ast.cpp @@ -21,7 +21,7 @@ void NormalFunc() { // CHECK-NEXT: CallExpr{{.*}}'int' // CHECK-NEXT: ImplicitCastExpr{{.*}}'int (*)()' // CHECK-NEXT: DeclRefExpr{{.*}}'some_int' -#pragma acc wait() async +#pragma acc wait async // CHECK-NEXT: OpenACCWaitConstruct{{.*}}wait // CHECK-NEXT: <<> // CHECK-NEXT: async clause @@ -90,7 +90,7 @@ void TemplFunc(T t) { // CHECK-NEXT: async clause // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' // CHECK-NEXT: NestedNameSpecifier{{.*}} 'T' -#pragma acc wait() async +#pragma acc wait async // CHECK-NEXT: OpenACCWaitConstruct{{.*}}wait // CHECK-NEXT: <<> // CHECK-NEXT: async clause From 554859c73688ec11786a060613989d1a333991bb Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 2 Apr 2025 15:38:10 -0500 Subject: [PATCH 0440/1029] [TTI] Make isLegalMasked{Load,Store} take an address space (#134006) In order to facilitate targets that only support masked loads/stores on certain address spaces (AMDGPU will support them in an upcoming patch, but only for address space 7), add an AddressSpace parameter to isLegalMaskedLoad and isLegalMaskedStore --- .../llvm/Analysis/TargetTransformInfo.h | 22 ++++++++++++------- .../llvm/Analysis/TargetTransformInfoImpl.h | 6 +++-- llvm/lib/Analysis/TargetTransformInfo.cpp | 12 +++++----- .../AArch64/AArch64TargetTransformInfo.h | 6 +++-- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 9 +++++--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 7 +++--- .../Hexagon/HexagonTargetTransformInfo.cpp | 6 +++-- .../Hexagon/HexagonTargetTransformInfo.h | 6 +++-- .../Target/RISCV/RISCVTargetTransformInfo.h | 6 +++-- llvm/lib/Target/VE/VETargetTransformInfo.h | 6 +++-- .../lib/Target/X86/X86TargetTransformInfo.cpp | 10 +++++---- llvm/lib/Target/X86/X86TargetTransformInfo.h | 6 +++-- .../Scalar/ScalarizeMaskedMemIntrin.cpp | 8 +++++-- .../Transforms/Vectorize/LoopVectorize.cpp | 20 ++++++++++------- 14 files changed, 82 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 99e21aca97631..4835c66a7a3bc 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -791,9 +791,11 @@ class TargetTransformInfo { ScalarEvolution *SE) const; /// Return true if the target supports masked store. - bool isLegalMaskedStore(Type *DataType, Align Alignment) const; + bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned AddressSpace) const; /// Return true if the target supports masked load. - bool isLegalMaskedLoad(Type *DataType, Align Alignment) const; + bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned AddressSpace) const; /// Return true if the target supports nontemporal store. 
bool isLegalNTStore(Type *DataType, Align Alignment) const; @@ -2015,8 +2017,10 @@ class TargetTransformInfo::Concept { TargetLibraryInfo *LibInfo) = 0; virtual AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0; - virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0; - virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0; + virtual bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned AddressSpace) = 0; + virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned AddressSpace) = 0; virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalBroadcastLoad(Type *ElementTy, @@ -2562,11 +2566,13 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { ScalarEvolution *SE) const override { return Impl.getPreferredAddressingMode(L, SE); } - bool isLegalMaskedStore(Type *DataType, Align Alignment) override { - return Impl.isLegalMaskedStore(DataType, Alignment); + bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned AddressSpace) override { + return Impl.isLegalMaskedStore(DataType, Alignment, AddressSpace); } - bool isLegalMaskedLoad(Type *DataType, Align Alignment) override { - return Impl.isLegalMaskedLoad(DataType, Alignment); + bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned AddressSpace) override { + return Impl.isLegalMaskedLoad(DataType, Alignment, AddressSpace); } bool isLegalNTStore(Type *DataType, Align Alignment) override { return Impl.isLegalNTStore(DataType, Alignment); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 745758426c714..261d5eacc91b0 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -276,11 +276,13 @@ class TargetTransformInfoImplBase { return TTI::AMK_None; } - bool isLegalMaskedStore(Type *DataType, Align Alignment) const { + bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned AddressSpace) const { return false; } - bool isLegalMaskedLoad(Type *DataType, Align Alignment) const { + bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned AddressSpace) const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 4df551aca30a7..e3212135e9b19 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -462,14 +462,14 @@ TargetTransformInfo::getPreferredAddressingMode(const Loop *L, return TTIImpl->getPreferredAddressingMode(L, SE); } -bool TargetTransformInfo::isLegalMaskedStore(Type *DataType, - Align Alignment) const { - return TTIImpl->isLegalMaskedStore(DataType, Alignment); +bool TargetTransformInfo::isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned AddressSpace) const { + return TTIImpl->isLegalMaskedStore(DataType, Alignment, AddressSpace); } -bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, - Align Alignment) const { - return TTIImpl->isLegalMaskedLoad(DataType, Alignment); +bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned AddressSpace) const { + return TTIImpl->isLegalMaskedLoad(DataType, Alignment, AddressSpace); } bool TargetTransformInfo::isLegalNTStore(Type *DataType, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h 
b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 1b8c759fd90b4..ae0df6b895ec8 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -290,11 +290,13 @@ class AArch64TTIImpl : public BasicTTIImplBase { return isElementTypeLegalForScalableVector(DataType->getScalarType()); } - bool isLegalMaskedLoad(Type *DataType, Align Alignment) { + bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned /*AddressSpace*/) { return isLegalMaskedLoadStore(DataType, Alignment); } - bool isLegalMaskedStore(Type *DataType, Align Alignment) { + bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned /*AddressSpace*/) { return isLegalMaskedLoadStore(DataType, Alignment); } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 8f0db457a982e..1b134bbe5ff6a 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1122,7 +1122,8 @@ bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) { return false; } -bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { +bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment, + unsigned /*AddressSpace*/) { if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps()) return false; @@ -1595,9 +1596,11 @@ ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) { if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && isLegalMaskedLoad(Src, Alignment)) + if (Opcode == Instruction::Load && + isLegalMaskedLoad(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); - if (Opcode == Instruction::Store && isLegalMaskedStore(Src, Alignment)) + if (Opcode == Instruction::Store && + isLegalMaskedStore(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); } if (!isa(Src)) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 103d2ed1c6281..ca5129c997fb0 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -184,10 +184,11 @@ class ARMTTIImpl : public BasicTTIImplBase { bool isProfitableLSRChainElement(Instruction *I); - bool isLegalMaskedLoad(Type *DataTy, Align Alignment); + bool isLegalMaskedLoad(Type *DataTy, Align Alignment, unsigned AddressSpace); - bool isLegalMaskedStore(Type *DataTy, Align Alignment) { - return isLegalMaskedLoad(DataTy, Alignment); + bool isLegalMaskedStore(Type *DataTy, Align Alignment, + unsigned AddressSpace) { + return isLegalMaskedLoad(DataTy, Alignment, AddressSpace); } bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index bbb9d065b6243..c3c77b514882b 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -340,13 +340,15 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return 1; } -bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { +bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/, + unsigned /*AddressSpace*/) { // This function is called from scalarize-masked-mem-intrin, which runs // in pre-isel. Use ST directly instead of calling isHVXVectorType. 
return HexagonMaskedVMem && ST.isTypeForHVX(DataType); } -bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { +bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/, + unsigned /*AddressSpace*/) { // This function is called from scalarize-masked-mem-intrin, which runs // in pre-isel. Use ST directly instead of calling isHVXVectorType. return HexagonMaskedVMem && ST.isTypeForHVX(DataType); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 826644d08d1ac..b23369ac054b9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -157,8 +157,10 @@ class HexagonTTIImpl : public BasicTTIImplBase { return 1; } - bool isLegalMaskedStore(Type *DataType, Align Alignment); - bool isLegalMaskedLoad(Type *DataType, Align Alignment); + bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned AddressSpace); + bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned AddressSpace); /// @} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 2b562b5f35ecf..1c5524748b605 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -262,10 +262,12 @@ class RISCVTTIImpl : public BasicTTIImplBase { return TLI->isLegalElementTypeForRVV(ElemType); } - bool isLegalMaskedLoad(Type *DataType, Align Alignment) { + bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned /*AddressSpace*/) { return isLegalMaskedLoadStore(DataType, Alignment); } - bool isLegalMaskedStore(Type *DataType, Align Alignment) { + bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned /*AddressSpace*/) { return isLegalMaskedLoadStore(DataType, Alignment); } diff --git a/llvm/lib/Target/VE/VETargetTransformInfo.h b/llvm/lib/Target/VE/VETargetTransformInfo.h index 7a73280e76d95..f0fa01ef22912 100644 --- a/llvm/lib/Target/VE/VETargetTransformInfo.h +++ b/llvm/lib/Target/VE/VETargetTransformInfo.h @@ -133,10 +133,12 @@ class VETTIImpl : public BasicTTIImplBase { } // Load & Store { - bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment, + unsigned /*AddressSpace*/) { return isVectorLaneType(*getLaneType(DataType)); } - bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) { + bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment, + unsigned /*AddressSpace*/) { return isVectorLaneType(*getLaneType(DataType)); } bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 8bee87a22db16..7d168d33bb3e9 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5368,8 +5368,8 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, unsigned NumElem = SrcVTy->getNumElements(); auto *MaskTy = FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem); - if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) || - (IsStore && !isLegalMaskedStore(SrcVTy, Alignment))) { + if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment, AddressSpace)) || + (IsStore && !isLegalMaskedStore(SrcVTy, Alignment, AddressSpace))) { // Scalarization APInt DemandedElts = APInt::getAllOnes(NumElem); InstructionCost MaskSplitCost = 
getScalarizationOverhead( @@ -6253,7 +6253,8 @@ static bool isLegalMaskedLoadStore(Type *ScalarTy, const X86Subtarget *ST) { ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment, + unsigned AddressSpace) { Type *ScalarTy = DataTy->getScalarType(); // The backend can't handle a single element vector w/o CFCMOV. @@ -6265,7 +6266,8 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { return isLegalMaskedLoadStore(ScalarTy, ST); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment) { +bool X86TTIImpl::isLegalMaskedStore(Type *DataTy, Align Alignment, + unsigned AddressSpace) { Type *ScalarTy = DataTy->getScalarType(); // The backend can't handle a single element vector w/o CFCMOV. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 9a427d4388d0b..5b6204d665206 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -262,8 +262,10 @@ class X86TTIImpl : public BasicTTIImplBase { bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2); bool canMacroFuseCmp(); - bool isLegalMaskedLoad(Type *DataType, Align Alignment); - bool isLegalMaskedStore(Type *DataType, Align Alignment); + bool isLegalMaskedLoad(Type *DataType, Align Alignment, + unsigned AddressSpace); + bool isLegalMaskedStore(Type *DataType, Align Alignment, + unsigned AddressSpace); bool isLegalNTLoad(Type *DataType, Align Alignment); bool isLegalNTStore(Type *DataType, Align Alignment); bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const; diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index 63fcc1760ccaf..e24088c294987 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -1098,14 +1098,18 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, // Scalarize unsupported vector masked load if (TTI.isLegalMaskedLoad( CI->getType(), - cast(CI->getArgOperand(1))->getAlignValue())) + cast(CI->getArgOperand(1))->getAlignValue(), + cast(CI->getArgOperand(0)->getType()) + ->getAddressSpace())) return false; scalarizeMaskedLoad(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; case Intrinsic::masked_store: if (TTI.isLegalMaskedStore( CI->getArgOperand(0)->getType(), - cast(CI->getArgOperand(2))->getAlignValue())) + cast(CI->getArgOperand(2))->getAlignValue(), + cast(CI->getArgOperand(1)->getType()) + ->getAddressSpace())) return false; scalarizeMaskedStore(DL, HasBranchDivergence, CI, DTU, ModifiedDT); return true; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 55cc801e91452..ca77a4295f4f4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1255,16 +1255,18 @@ class LoopVectorizationCostModel { /// Returns true if the target machine supports masked store operation /// for the given \p DataType and kind of access to \p Ptr. 
- bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const { + bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment, + unsigned AddressSpace) const { return Legal->isConsecutivePtr(DataType, Ptr) && - TTI.isLegalMaskedStore(DataType, Alignment); + TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace); } /// Returns true if the target machine supports masked load operation /// for the given \p DataType and kind of access to \p Ptr. - bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const { + bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment, + unsigned AddressSpace) const { return Legal->isConsecutivePtr(DataType, Ptr) && - TTI.isLegalMaskedLoad(DataType, Alignment); + TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace); } /// Returns true if the target machine can represent \p V as a masked gather @@ -3220,13 +3222,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication( case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); + unsigned AS = getLoadStoreAddressSpace(I); Type *VTy = Ty; if (VF.isVector()) VTy = VectorType::get(Ty, VF); const Align Alignment = getLoadStoreAlignment(I); - return isa(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) || + return isa(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment, AS) || TTI.isLegalMaskedGather(VTy, Alignment)) - : !(isLegalMaskedStore(Ty, Ptr, Alignment) || + : !(isLegalMaskedStore(Ty, Ptr, Alignment, AS) || TTI.isLegalMaskedScatter(VTy, Alignment)); } case Instruction::UDiv: @@ -3427,8 +3430,9 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( auto *Ty = getLoadStoreType(I); const Align Alignment = getLoadStoreAlignment(I); - return isa(I) ? TTI.isLegalMaskedLoad(Ty, Alignment) - : TTI.isLegalMaskedStore(Ty, Alignment); + unsigned AS = getLoadStoreAddressSpace(I); + return isa(I) ? TTI.isLegalMaskedLoad(Ty, Alignment, AS) - : TTI.isLegalMaskedStore(Ty, Alignment, AS); } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( From 51d1c7288662ea801b07133fd2d22aff6bac50e2 Mon Sep 17 00:00:00 2001 From: Abhinav Kumar <96587705+kr-2003@users.noreply.github.com> Date: Thu, 3 Apr 2025 02:11:47 +0530 Subject: [PATCH 0441/1029] [libc] Added support for fixed-points in ``is_signed`` and ``is_unsigned``. (#133371) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #133365 ## Changes Done - Changed the signedness check to ```cpp struct is_signed : bool_constant<((is_fixed_point<T> || is_arithmetic_v<T>) && (T(-1) < T(0)))> ``` in ``/libc/src/__support/CPP/type_traits/is_signed.h``. This adds a check for fixed-points. - However, this will fail for ``unsigned _Fract`` or any unsigned fixed-point, because ``unsigned _Fract`` can’t represent -1 in T(-1), while ``unsigned int`` can handle it via wrapping. - That's why I explicitly added an ``is_signed`` check for ``unsigned`` fixed-points. - Same changes to ``/libc/src/__support/CPP/type_traits/is_unsigned.h``. - Added tests for ``is_signed`` and ``is_unsigned``.
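A compilable sketch of the membership-test pattern this message describes (illustrative only: it uses standard-library types and a hypothetical helper name, since ``_Fract``/``_Accum`` support is compiler-dependent and the real helper lives in the headers changed below):

```cpp
// Hedged sketch of the fold-expression membership check. Types that cannot
// represent -1 (e.g. unsigned _Fract) defeat the usual T(-1) < T(0) probe,
// so fixed-point types are matched against an explicit type list instead.
#include <type_traits>

template <typename T, typename... Args>
inline constexpr bool is_unqualified_any_of_v =
    (... || std::is_same_v<std::remove_cv_t<T>, Args>);

static_assert(is_unqualified_any_of_v<const int, int, long>);  // cv is stripped
static_assert(!is_unqualified_any_of_v<float, int, long>);     // not in the list
```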
--- .../src/__support/CPP/type_traits/is_signed.h | 25 ++++++- .../__support/CPP/type_traits/is_unsigned.h | 27 +++++++- .../src/__support/CPP/type_traits_test.cpp | 65 ++++++++++++++++++- 3 files changed, 113 insertions(+), 4 deletions(-) diff --git a/libc/src/__support/CPP/type_traits/is_signed.h b/libc/src/__support/CPP/type_traits/is_signed.h index 3f56fb38aabb0..2ddb43ac4ee3e 100644 --- a/libc/src/__support/CPP/type_traits/is_signed.h +++ b/libc/src/__support/CPP/type_traits/is_signed.h @@ -8,20 +8,43 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_SIGNED_H #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_SIGNED_H +#include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/type_traits/bool_constant.h" #include "src/__support/CPP/type_traits/is_arithmetic.h" +#include "src/__support/CPP/type_traits/is_same.h" +#include "src/__support/CPP/type_traits/remove_cv.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { namespace cpp { -// is_signed +#ifndef LIBC_COMPILER_HAS_FIXED_POINT template struct is_signed : bool_constant<(is_arithmetic_v && (T(-1) < T(0)))> { LIBC_INLINE constexpr operator bool() const { return is_signed::value; } LIBC_INLINE constexpr bool operator()() const { return is_signed::value; } }; +#else +template struct is_signed { +private: + template + LIBC_INLINE static constexpr bool __is_unqualified_any_of() { + return (... || is_same_v, Args>); + } + +public: + LIBC_INLINE_VAR static constexpr bool value = + (is_arithmetic_v && (T(-1) < T(0))) || + __is_unqualified_any_of(); + LIBC_INLINE constexpr operator bool() const { return is_signed::value; } + LIBC_INLINE constexpr bool operator()() const { return is_signed::value; } +}; +#endif // LIBC_COMPILER_HAS_FIXED_POINT + template LIBC_INLINE_VAR constexpr bool is_signed_v = is_signed::value; diff --git a/libc/src/__support/CPP/type_traits/is_unsigned.h b/libc/src/__support/CPP/type_traits/is_unsigned.h index eed519b1c067e..3ae6337ceb50a 100644 --- a/libc/src/__support/CPP/type_traits/is_unsigned.h +++ b/libc/src/__support/CPP/type_traits/is_unsigned.h @@ -8,20 +8,45 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_UNSIGNED_H #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_UNSIGNED_H +#include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/type_traits/bool_constant.h" #include "src/__support/CPP/type_traits/is_arithmetic.h" +#include "src/__support/CPP/type_traits/is_same.h" +#include "src/__support/CPP/type_traits/remove_cv.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { namespace cpp { -// is_unsigned +#ifndef LIBC_COMPILER_HAS_FIXED_POINT template struct is_unsigned : bool_constant<(is_arithmetic_v && (T(-1) > T(0)))> { LIBC_INLINE constexpr operator bool() const { return is_unsigned::value; } LIBC_INLINE constexpr bool operator()() const { return is_unsigned::value; } }; +#else +template struct is_unsigned { +private: + template + LIBC_INLINE static constexpr bool __is_unqualified_any_of() { + return (... 
|| is_same_v, Args>); + } + +public: + LIBC_INLINE_VAR static constexpr bool value = + (is_arithmetic_v && (T(-1) > T(0))) || + __is_unqualified_any_of(); + LIBC_INLINE constexpr operator bool() const { return is_unsigned::value; } + LIBC_INLINE constexpr bool operator()() const { return is_unsigned::value; } +}; +#endif // LIBC_COMPILER_HAS_FIXED_POINT + template LIBC_INLINE_VAR constexpr bool is_unsigned_v = is_unsigned::value; diff --git a/libc/test/src/__support/CPP/type_traits_test.cpp b/libc/test/src/__support/CPP/type_traits_test.cpp index 4b3e48c6a6c0f..3a607ec286051 100644 --- a/libc/test/src/__support/CPP/type_traits_test.cpp +++ b/libc/test/src/__support/CPP/type_traits_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "include/llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/config.h" #include "test/UnitTest/Test.h" @@ -409,7 +410,37 @@ TEST(LlvmLibcTypeTraitsTest, is_object) { // TODO is_scalar -// TODO is_signed +TEST(LlvmLibcTypeTraitsTest, is_signed) { + EXPECT_TRUE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + +#ifdef LIBC_COMPILER_HAS_FIXED_POINT + // for fixed point types + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); + EXPECT_TRUE((is_signed_v)); + EXPECT_FALSE((is_signed_v)); +#endif +} // TODO is_trivially_constructible @@ -419,7 +450,37 @@ TEST(LlvmLibcTypeTraitsTest, is_object) { // TODO is_union -// TODO is_unsigned +TEST(LlvmLibcTypeTraitsTest, is_unsigned) { + EXPECT_FALSE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + +#ifdef LIBC_COMPILER_HAS_FIXED_POINT + // for fixed point types + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); + EXPECT_FALSE((is_unsigned_v)); + EXPECT_TRUE((is_unsigned_v)); +#endif +} // TODO is_void From a8585654c2be671d78206666223dae28fe3ac511 Mon Sep 17 00:00:00 2001 From: George Burgess IV Date: Wed, 2 Apr 2025 14:44:18 -0600 Subject: [PATCH 0442/1029] [llvm][utils] skip revert-checking reverts across branches (#134108) e2ba1b6ffde4ec607342b1b746d1b57f0f04390a references that it reverts a commit that's not a parent of e2ba1b6ffde4ec607342b1b746d1b57f0f04390a. Functionally, this can (and demonstrably does) work(*), but from the standpoint of the revert checker, it's nonsense. 
Print a `logging.error` when it's detected. Tested by running the revert checker against a commit range that includes the aforementioned commit; the logging.error was fired appropriately. (*) - the specifics here are: - the _SHA_ that was referenced was on a non-main branch, but - the commit from the non-main branch was merged into the non-main branch from main - ...so the _functional_ commit being reverted was originally landed on main, but the _SHA_ referenced from main was from a branch that was cut before the reverted-commit was landed on main --- llvm/utils/revert_checker.py | 38 +++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/llvm/utils/revert_checker.py b/llvm/utils/revert_checker.py index b1c6e228e4d41..3c3ff3e6b846f 100755 --- a/llvm/utils/revert_checker.py +++ b/llvm/utils/revert_checker.py @@ -211,7 +211,12 @@ def _rev_parse(git_dir: str, ref: str) -> str: def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str: - """Finds the closest common parent commit between `ref_a` and `ref_b`.""" + """Finds the closest common parent commit between `ref_a` and `ref_b`. + + Returns: + A SHA. Note that `ref_a` will be returned if `ref_a` is a parent of + `ref_b`, and vice-versa. + """ return subprocess.check_output( ["git", "-C", git_dir, "merge-base", ref_a, ref_b], encoding="utf-8", @@ -341,16 +346,31 @@ def find_reverts( ) continue - if object_type == "commit": - all_reverts.append(Revert(sha, reverted_sha)) + if object_type != "commit": + logging.error( + "%s claims to revert the %s %s, which isn't a commit", + sha, + object_type, + reverted_sha, + ) + continue + + # Rarely, reverts will cite SHAs on other branches (e.g., revert + # commit says it reverts a commit with SHA ${X}, but ${X} is not a + # parent of the revert). This can happen if e.g., the revert has + # been mirrored to another branch. Treat them the same as + # reverts of non-commits. 
+ if _find_common_parent_commit(git_dir, sha, reverted_sha) != reverted_sha: + logging.error( + "%s claims to revert %s, which is a commit that is not " + "a parent of the revert", + sha, + reverted_sha, + ) continue - logging.error( - "%s claims to revert %s -- which isn't a commit -- %s", - sha, - object_type, - reverted_sha, - ) + all_reverts.append(Revert(sha, reverted_sha)) + # Since `all_reverts` contains reverts in log order (e.g., newer comes before # older), we need to reverse this to keep with our guarantee of older = From a1b0b4997e1744827ae8c066f677e416c0b6f16d Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 2 Apr 2025 13:46:56 -0700 Subject: [PATCH 0443/1029] [SandboxVec][NFC] Replace std::regex with llvm::Regex (#134110) --- .../Vectorize/SandboxVectorizer/SandboxVectorizer.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index 20186426a5259..ed2f80ba8900a 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -11,9 +11,9 @@ #include "llvm/IR/Module.h" #include "llvm/SandboxIR/Constant.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Regex.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Debug.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" -#include using namespace llvm; @@ -92,8 +92,9 @@ bool SandboxVectorizerPass::allowFile(const std::string &SrcFilePath) { if (FileNameToMatch.empty()) return false; // Note: This only runs when debugging so its OK not to reuse the regex. - std::regex FileNameRegex(std::string(".*") + FileNameToMatch); - if (std::regex_match(SrcFilePath, FileNameRegex)) + Regex FileNameRegex(".*" + FileNameToMatch + "$"); + assert(FileNameRegex.isValid() && "Bad regex!"); + if (FileNameRegex.match(SrcFilePath)) return true; } while (DelimPos != std::string::npos); return false; From 4b67c53e206ea13963acb2452d681aa744344018 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 2 Apr 2025 21:50:24 +0100 Subject: [PATCH 0444/1029] [VPlan] Use recipe debug loc instead of instr DLs in more cases (NFC) Update both VPInterleaveRecipe and VPReplicateRecipe codegen to use debug location directly from the recipe, not the underlying instruction. This removes another dependency on underlying instructions. 
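The shape of the change, as a minimal hedged sketch (hypothetical stand-in types, not the real VPlan classes in VPlan.h/VPlanRecipes.cpp): the recipe captures its debug location once at construction, so codegen queries the recipe and never reaches back to the underlying IR instruction.

```cpp
#include <cassert>

struct DebugLoc { unsigned Line = 0; };

struct Instruction {
  DebugLoc DL;
  DebugLoc getDebugLoc() const { return DL; }
};

struct Recipe {
  DebugLoc DL; // owned by the recipe from construction onward
  explicit Recipe(const Instruction &I) : DL(I.getDebugLoc()) {}
  DebugLoc getDebugLoc() const { return DL; } // what codegen now consults
};

int main() {
  Instruction I{DebugLoc{42}};
  Recipe R(I);
  assert(R.getDebugLoc().Line == 42); // no underlying-instruction dependency
}
```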
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlan.h | 5 +++-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ca77a4295f4f4..d214b2f2fb4cd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2385,7 +2385,7 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr, RepRecipe->setFlags(Cloned); - if (auto DL = Instr->getDebugLoc()) + if (auto DL = RepRecipe->getDebugLoc()) State.setDebugLocFrom(DL); // Replace the operands of the cloned instructions with their scalar diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 37e0a176ab1cc..65a4d0ad406cd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2190,8 +2190,9 @@ class VPInterleaveRecipe : public VPRecipeBase { VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Addr, ArrayRef StoredValues, VPValue *Mask, bool NeedsMaskForGaps) - : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG), - NeedsMaskForGaps(NeedsMaskForGaps) { + : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, + IG->getInsertPos()->getDebugLoc()), + IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) { for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) { if (I->getType()->isVoidTy()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f2d3b1588229a..b16a8fc563f4c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3110,7 +3110,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds); } - State.setDebugLocFrom(Instr->getDebugLoc()); + State.setDebugLocFrom(getDebugLoc()); Value *PoisonVec = PoisonValue::get(VecTy); auto CreateGroupMask = [&BlockInMask, &State, From 38937ac24cc91653c2984e112d548955cb442484 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 2 Apr 2025 13:31:40 -0700 Subject: [PATCH 0445/1029] [RISCV] Check line and column for errors in rv(32/64)zcmp-invalid.s. NFC Same for the Xqccmp version. 
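For context on the FileCheck idiom applied throughout these tests: `[[@LINE+N]]` expands to the line number N lines below the directive, so a check such as `:[[@LINE+1]]:11:` pins the diagnostic to an exact line and column rather than matching the message text alone. A generic illustration (hypothetical, not taken from the patched files):

```
# CHECK-ERROR: :[[@LINE+1]]:11: error: invalid operand for instruction
cm.mvsa01 a1, a2
```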
--- llvm/test/MC/RISCV/rv32xqccmp-invalid.s | 24 ++++++++++++------------ llvm/test/MC/RISCV/rv32zcmp-invalid.s | 16 ++++++++-------- llvm/test/MC/RISCV/rv64xqccmp-invalid.s | 22 +++++++++++----------- llvm/test/MC/RISCV/rv64zcmp-invalid.s | 18 +++++++++--------- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s index 059009de3d830..a13d134100dbe 100644 --- a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s @@ -1,39 +1,39 @@ # RUN: not llvm-mc -triple=riscv32 -mattr=+experimental-xqccmp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:14: error: invalid operand for instruction qc.cm.mvsa01 a1, a2 -# CHECK-ERROR: error: rs1 and rs2 must be different +# CHECK-ERROR: :[[@LINE+1]]:14: error: rs1 and rs2 must be different qc.cm.mvsa01 s0, s0 -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:14: error: invalid operand for instruction qc.cm.mva01s a1, a2 -# CHECK-ERROR: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:26: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported qc.cm.popretz {ra, s0-s10}, 112 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:28: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] qc.cm.popretz {ra, s0-s1}, 112 -# CHECK-ERROR: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:18: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra}, 16 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.pushfp {ra, s0}, 16 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] qc.cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:19: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra}, -8 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.pushfp {ra, s0}, -12 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] qc.cm.pop {ra, s0-s1}, -40 -# CHECK-ERROR: error: register list must include 's0' or 'x8' +# CHECK-ERROR: :[[@LINE+1]]:17: error: register list must include 's0' or 'x8' qc.cm.pushfp {ra}, -16 diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s index f89829a33bd9e..0a62faa80dc7c 
100644 --- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s @@ -1,28 +1,28 @@ # RUN: not llvm-mc -triple=riscv32 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:11: error: invalid operand for instruction cm.mvsa01 a1, a2 -# CHECK-ERROR: error: rs1 and rs2 must be different +# CHECK-ERROR: :[[@LINE+1]]:11: error: rs1 and rs2 must be different cm.mvsa01 s0, s0 -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:11: error: invalid operand for instruction cm.mva01s a1, a2 -# CHECK-ERROR: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:23: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported cm.popretz {ra, s0-s10}, 112 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] cm.popretz {ra, s0-s1}, 112 -# CHECK-ERROR: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, 16 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:22: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:16: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, -8 # CHECK-ERROR: :[[@LINE+1]]:9: error: register list must start from 'ra' or 'x1' diff --git a/llvm/test/MC/RISCV/rv64xqccmp-invalid.s b/llvm/test/MC/RISCV/rv64xqccmp-invalid.s index ba0ed29afa108..124e95a5d0fc2 100644 --- a/llvm/test/MC/RISCV/rv64xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64xqccmp-invalid.s @@ -1,35 +1,35 @@ # RUN: not llvm-mc -triple=riscv64 -mattr=experimental-xqccmp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:14: error: invalid operand for instruction qc.cm.mvsa01 a1, a2 -# CHECK-ERROR: error: rs1 and rs2 must be different +# CHECK-ERROR: :[[@LINE+1]]:14: error: rs1 and rs2 must be different qc.cm.mvsa01 s0, s0 -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:14: error: invalid operand for instruction qc.cm.mva01s a1, a2 -# CHECK-ERROR: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:26: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported qc.cm.popretz {ra, s0-s10}, 112 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:28: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] qc.cm.popretz {ra, s0-s1}, 112 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:18: 
error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra}, 16 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.pushfp {ra, s0}, 16 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] qc.cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:19: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra}, -15 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:23: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra, s0}, -15 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] qc.cm.pop {ra, s0-s1}, -33 diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s index 7f90bf73ac713..6c4f8a9ec3293 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s @@ -1,31 +1,31 @@ # RUN: not llvm-mc -triple=riscv64 -mattr=zcmp -M no-aliases -show-encoding < %s 2>&1 \ # RUN: | FileCheck -check-prefixes=CHECK-ERROR %s -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:11: error: invalid operand for instruction cm.mvsa01 a1, a2 -# CHECK-ERROR: error: rs1 and rs2 must be different +# CHECK-ERROR: :[[@LINE+1]]:11: error: rs1 and rs2 must be different cm.mvsa01 s0, s0 -# CHECK-ERROR: error: invalid operand for instruction +# CHECK-ERROR: :[[@LINE+1]]:11: error: invalid operand for instruction cm.mva01s a1, a2 -# CHECK-ERROR: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:23: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported cm.popretz {ra, s0-s10}, 112 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] cm.popretz {ra, s0-s1}, 112 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, 16 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:22: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:16: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, -15 -# 
CHECK-ERROR: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:22: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] cm.pop {ra, s0-s1}, -33 # CHECK-ERROR: :[[@LINE+1]]:9: error: register list must start from 'ra' or 'x1' From 066787b9bdc4ec5ae7e365b651f37840fd5bb2b5 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 2 Apr 2025 17:10:53 -0400 Subject: [PATCH 0446/1029] [AMDGPU][True16][CodeGen] fold clamp update for true16 (#128919) Check through COPY for possible clamp folding for v_mad_mixhi_f16 isel --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 8 +- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 29 +---- llvm/test/CodeGen/AMDGPU/true16-fold.mir | 149 ++++++++++++++++++++++ 3 files changed, 163 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2bfc37b68a2ec..d6acf9e081b9f 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1899,7 +1899,13 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) { if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg())) return false; - MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg()); + if (!ClampSrc->getReg().isVirtual()) + return false; + + // Look through COPY. COPY only observed with True16. + Register DefSrcReg = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI); + MachineInstr *Def = + MRI->getVRegDef(DefSrcReg.isVirtual() ? DefSrcReg : ClampSrc->getReg()); // The type of clamp must be compatible. if (TII->getClampMask(*Def) != TII->getClampMask(MI)) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 15cb404a3840a..beac41e42e0c6 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2 } define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 { -; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; SDAG-GFX1100-TRUE16: ; %bb.0: -; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp -; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; SDAG-GFX1100-FAKE16: ; %bb.0: -; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GFX900: ; %bb.0: @@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: -; GISEL-GFX1100: ; %bb.0: -; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp -; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1524,10 +1510,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir index ef6e4007b8f7a..93cc12f152cca 100644 --- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -57,4 +57,153 @@ body: | %4:vgpr_16 = COPY %3:sgpr_lo16 %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec S_ENDPGM 0, implicit %5 + +--- +name: fold_16bit_madmix_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_madmix_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]] + ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_16 = COPY %5 + %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %7 + S_ENDPGM 0, implicit $vgpr0 +... 
+ +--- +name: fold_16bit_subreg_1_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_subreg_1_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %6 + S_ENDPGM 0, implicit $vgpr0 +... + +--- +name: fold_16bit_subreg_2_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_subreg_2_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + %6:vgpr_16 = COPY %5.lo16 + %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %7 + S_ENDPGM 0, implicit $vgpr0 +... 
+ +--- +name: fold_16bit_phyreg_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_phyreg_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]] + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sreg_32 = IMPLICIT_DEF + %4:vgpr_32 = COPY %3 + %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec + $vgpr10_lo16 = COPY %5 + %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %6 + S_ENDPGM 0, implicit $vgpr0 +... + +--- +name: fold_16bit_undef_clamp +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_16bit_undef_clamp + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[DEF]], 0, [[DEF]], -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:vgpr_16 = IMPLICIT_DEF + %4:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %3, 0, %3, -1, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %4 + S_ENDPGM 0, implicit $vgpr0 ... 
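The SIFoldOperands.cpp change in the patch above reduces to a small look-through-copy step before the clamp compatibility check. The sketch below is illustrative only, not code from the patch: the free-standing helper and its name are hypothetical, but TargetRegisterInfo::lookThruCopyLike and MachineRegisterInfo::getVRegDef are the existing LLVM APIs the change relies on.

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

// Hypothetical helper: resolve a clamp source register through COPY-like
// instructions to the instruction that defines it.
static llvm::MachineInstr *
lookThruCopyToDef(llvm::Register Reg, const llvm::MachineRegisterInfo &MRI,
                  const llvm::TargetRegisterInfo &TRI) {
  // Physical registers have no unique virtual-register definition to inspect.
  if (!Reg.isVirtual())
    return nullptr;
  // Walk through plain and subregister COPYs; with True16, a vgpr_16 COPY of
  // the vgpr_32 V_FMA_MIXLO_F16 result is the common case.
  llvm::Register Src = TRI.lookThruCopyLike(Reg, &MRI);
  // The walk can end at a physical register (e.g. $vgpr10_lo16 in the tests
  // above); fall back to the original virtual register in that case.
  return MRI.getVRegDef(Src.isVirtual() ? Src : Reg);
}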
From db21ae7803333032e466ead0c2a29c6760739936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 2 Apr 2025 14:26:09 -0700 Subject: [PATCH 0447/1029] [flang][cuda] Support any_sync and ballot_sync (#134135) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 39 ++++++++++++++++--- flang/module/cudadevice.f90 | 14 +++++++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 6 ++- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index a31bbd0a1bd88..4cbef141ced94 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -442,6 +442,8 @@ struct IntrinsicLibrary { fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef); mlir::Value genVoteAllSync(mlir::Type, llvm::ArrayRef); + mlir::Value genVoteAnySync(mlir::Type, llvm::ArrayRef); + mlir::Value genVoteBallotSync(mlir::Type, llvm::ArrayRef); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. There may be an additional KIND arguments that diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 9029ea69dd5c4..8aed288d128b6 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -273,6 +273,10 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAny, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, + {"any_sync", + &I::genVoteAnySync, + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, {"asind", &I::genAsind}, {"associated", &I::genAssociated, @@ -335,6 +339,10 @@ static constexpr IntrinsicHandler handlers[]{ {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, + {"ballot_sync", + &I::genVoteBallotSync, + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -6499,12 +6507,9 @@ IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, return value; } -// ALL_SYNC -mlir::Value IntrinsicLibrary::genVoteAllSync(mlir::Type resultType, - llvm::ArrayRef args) { - assert(args.size() == 2); - - llvm::StringRef funcName = "llvm.nvvm.vote.all.sync"; +static mlir::Value genVoteSync(fir::FirOpBuilder &builder, mlir::Location loc, + llvm::StringRef funcName, + llvm::ArrayRef args) { mlir::MLIRContext *context = builder.getContext(); mlir::Type i32Ty = builder.getI32Type(); mlir::FunctionType ftype = @@ -6514,6 +6519,28 @@ mlir::Value IntrinsicLibrary::genVoteAllSync(mlir::Type resultType, return builder.create(loc, funcOp, args).getResult(0); } +// ALL_SYNC +mlir::Value IntrinsicLibrary::genVoteAllSync(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 2); + return genVoteSync(builder, loc, "llvm.nvvm.vote.all.sync", args); +} + +// ANY_SYNC +mlir::Value IntrinsicLibrary::genVoteAnySync(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 2); + return genVoteSync(builder, loc, "llvm.nvvm.vote.any.sync", args); +} + 
+// BALLOT_SYNC +mlir::Value +IntrinsicLibrary::genVoteBallotSync(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 2); + return genVoteSync(builder, loc, "llvm.nvvm.vote.ballot.sync", args); +} + // MATCH_ANY_SYNC mlir::Value IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 6b8aa4de74240..591e25e4108b2 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1022,6 +1022,20 @@ attributes(device) integer function all_sync(mask, pred) end function end interface + interface any_sync + attributes(device) integer function any_sync(mask, pred) + !dir$ ignore_tkr(d) mask, (td) pred + integer, value :: mask, pred + end function + end interface + + interface ballot_sync + attributes(device) integer function ballot_sync(mask, pred) + !dir$ ignore_tkr(d) mask, (td) pred + integer, value :: mask, pred + end function + end interface + ! LDCG interface __ldcg attributes(device) pure integer(4) function __ldcg_i4(x) bind(c) diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 9758107c84031..6a7fee73f338a 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -299,12 +299,14 @@ end attributes(device) subroutine testVote() integer :: a, ipred, mask, v32 a = all_sync(mask, v32) - + a = any_sync(mask, v32) + a = ballot_sync(mask, v32) end subroutine ! CHECK-LABEL: func.func @_QPtestvote() ! CHECK: fir.call @llvm.nvvm.vote.all.sync - +! CHECK: fir.call @llvm.nvvm.vote.any.sync +! CHECK: fir.call @llvm.nvvm.vote.ballot.sync ! CHECK-DAG: func.func private @__ldca_i4x4_(!fir.ref>, !fir.ref>) ! CHECK-DAG: func.func private @__ldcg_i4x4_(!fir.ref>, !fir.ref>) From 42b3f91fd6c850492f6f899d6a0f12ccb948aa1a Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com> Date: Wed, 2 Apr 2025 16:32:36 -0500 Subject: [PATCH 0448/1029] [mlir] Vectorize tensor.pad with low padding for unit dims (#133808) We currently do not have masked vectorization support for tensor.pad with low padding. However, we can allow this in the special case where the result dimension after padding is a unit dim. The reason is that when we actually have a low pad on a unit dim, the input size of that dimension will be (or should be for correct IR) dynamically zero and hence we will create a zero mask which is correct. If the low pad is dynamically zero then the lowering is correct as well.
--------- Signed-off-by: Nirvedh --- .../Linalg/Transforms/Vectorization.cpp | 22 ++++++++-- .../Linalg/vectorization-unsupported.mlir | 27 ++++++++++++ mlir/test/Dialect/Linalg/vectorization.mlir | 42 +++++++++++++++++++ 3 files changed, 87 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 2dcd897330d1e..8c8b1b85ef5a3 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2178,11 +2178,25 @@ vectorizePadOpPrecondition(tensor::PadOp padOp, inputVectorSizes))) return failure(); - if (llvm::any_of(padOp.getLow(), [](Value v) { - std::optional res = getConstantIntValue(v); - return !res.has_value() || res.value() != 0; + // Padding with non-zero low pad values is not supported, unless the + // corresponding result dim is 1 as this would require shifting the results to + // the right for the low padded dims by the required amount of low padding. + // However, we do support low padding if the dims being low padded have result + // sizes of 1. The reason is when we have a low pad on a unit result dim, the + // input size of that dimension will be dynamically zero (as the sum of the + // low pad and input dim size has to be one) and hence we will create a zero + // mask as the lowering logic just makes the mask one for the input dim size - + // which is zero here. Hence we will load the pad value which is what we want + // in this case. If the low pad is dynamically zero then the lowering is + // correct as well as no shifts are necessary. + if (llvm::any_of(llvm::enumerate(padOp.getLow()), [&](const auto &en) { + Value padValue = en.value(); + unsigned pos = en.index(); + std::optional pad = getConstantIntValue(padValue); + return (!pad.has_value() || pad.value() != 0) && + resultTensorShape[pos] != 1; })) { - LDBG("low pad must all be zero: " << padOp << "\n"); + LDBG("low pad must all be zero for all non unit dims: " << padOp << "\n"); return failure(); } diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 2d1f0191eb798..f653a4852b074 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -305,6 +305,33 @@ module attributes {transform.with_named_sequence} { // ----- +// Padding with non-zero low pad values is not supported, unless the corresponding +// result dim is 1. Here `%l0` being a non-zero low pad applied to a +// non-unit result dimension makes this case unsupported. 
+func.func @tensor_pad_non_zero_low_pad( + %0 : tensor, %h0 : index, %h1 : index, %l0 : index) + -> tensor<2x4xf32> { + // expected-error @+3 {{Attempted to vectorize, but failed}} + %cst = arith.constant 42.43 : f32 + %c0 = arith.constant 0 : index + %1 = tensor.pad %0 low[%l0, %c0] high[%h0, %h1] { + ^bb0(%hh1: index, %hh2: index): + tensor.yield %cst : f32 + } : tensor to tensor<2x4xf32> + return %1: tensor<2x4xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [2, 4] : !transform.any_op + transform.yield + } +} + +// ----- + // With dynamically shaped source, the vectorizer infers the vector size for // xfer Ops from the destination tensor and, conservatively, assumes // out-of-bounds accesses. Out-of-bounds accesses require a pad value, but diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index c6d9ec6215715..299be1296aa66 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -664,6 +664,48 @@ module attributes {transform.with_named_sequence} { } } +// ----- +// This case is supported because low padding `%l0` is applied on +// a unit dimension which is supported, non unit result dimension low +// padding is currently unsupported. +// CHECK-LABEL: func @test_masked_vectorize_non_zero_low_pad_unit_res_dim +func.func @test_masked_vectorize_non_zero_low_pad_unit_res_dim( + %0 : tensor, %h0 : index, %h1 : index, %l0 : index) + -> tensor<1x4xf32> +{ + // CHECK-DAG: %[[C42:.*]] = arith.constant 4.243000e+01 : f32 + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[C0_1:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[D0:.*]] = tensor.dim {{.*}} : tensor + // CHECK-DAG: %[[D1:.*]] = tensor.dim {{.*}} : tensor + // CHECK: %[[MASK:.*]] = vector.create_mask %[[D0]], %[[D1]] : vector<1x4xi1> + // CHECK: %[[MASKED_READ:.*]] = vector.mask %[[MASK]] { + // CHECK-SAME: vector.transfer_read %{{.*}}[%[[C0_1]], %[[C0_1]]], %[[C42]] + // CHECK-SAME: {in_bounds = [true, true]} : tensor, vector<1x4xf32> + // CHECK-SAME: } : vector<1x4xi1> -> vector<1x4xf32> + // CHECK-DAG: %[[EMPTY:.*]] = tensor.empty() : tensor<1x4xf32> + // CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index + // CHECK: %[[MASKED_WRITE:.*]] = vector.transfer_write %[[MASKED_READ]], %[[EMPTY]][%[[C0_2]], %[[C0_2]]] + // CHECK-SAME: {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32> + // CHECK: return %[[MASKED_WRITE]] : tensor<1x4xf32> + %cst = arith.constant 42.43 : f32 + %c0 = arith.constant 0 : index + %1 = tensor.pad %0 low[%l0, %c0] high[%h0, %h1] { + ^bb0(%hh1: index, %hh2: index): + tensor.yield %cst : f32 + } : tensor to tensor<1x4xf32> + return %1: tensor<1x4xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 vector_sizes [1, 4] : !transform.any_op + transform.yield + } +} + // ----- // Input identical as the test in vectorization-with-patterns.mlir. 
Output is From 81601cf3ab5dd09635c0b3025004c3cfc146498d Mon Sep 17 00:00:00 2001 From: Chris B Date: Wed, 2 Apr 2025 16:43:10 -0500 Subject: [PATCH 0449/1029] [Docs] Clarify that `reassoc` isn't just for reassociation (#133168) The `reassoc` fast-math flag allows a much wider array of algebraic transformations than just strictly reassociations. In some cases it does commutations, distributions, and folds away redundant inverse operations... While it might make sense to fix the flag naming at some point, in the meantime we should at least have the docs be accurate to avoid confusion. --- llvm/docs/LangRef.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e1636e154d43b..d242c945816cc 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3971,8 +3971,9 @@ output, given the original flags. for places where this can apply to LLVM's intrinsic math functions. ``reassoc`` - Allow reassociation transformations for floating-point instructions. - This may dramatically change results in floating-point. + Allow algebraically equivalent transformations for floating-point + instructions such as reassociation transformations. This may dramatically + change results in floating-point. .. _uselistorder: From 380defd4b3cd980d8b2031136a02764182eaa85b Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 2 Apr 2025 22:46:38 +0100 Subject: [PATCH 0450/1029] [VPlan] Update VPInterleaveRecipe to take debug loc directly as arg (NFC) --- llvm/lib/Transforms/Vectorize/VPlan.h | 7 ++++--- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 65a4d0ad406cd..50baf220a1002 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -2189,9 +2189,10 @@ class VPInterleaveRecipe : public VPRecipeBase { public: VPInterleaveRecipe(const InterleaveGroup *IG, VPValue *Addr, ArrayRef StoredValues, VPValue *Mask, - bool NeedsMaskForGaps) + bool NeedsMaskForGaps, DebugLoc DL) : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}, - IG->getInsertPos()->getDebugLoc()), + DL), + IG(IG), NeedsMaskForGaps(NeedsMaskForGaps) { for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) { @@ -2211,7 +2212,7 @@ class VPInterleaveRecipe : public VPRecipeBase { VPInterleaveRecipe *clone() override { return new VPInterleaveRecipe(IG, getAddr(), getStoredValues(), getMask(), - NeedsMaskForGaps); + NeedsMaskForGaps, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPInterleaveSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 0c37db7f9d3a3..be3b3d19a3a11 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2283,7 +2283,7 @@ void VPlanTransforms::createInterleaveGroups( : B.createPtrAdd(InsertPos->getAddr(), OffsetVPV); } auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues, - InsertPos->getMask(), NeedsMaskForGaps); + InsertPos->getMask(), NeedsMaskForGaps, InsertPos->getDebugLoc()); VPIG->insertBefore(InsertPos); unsigned J = 0; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index c1897e2c5d277..cb7545171744e 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ 
b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1040,7 +1040,7 @@ TEST_F(VPRecipeTest, CastVPInterleaveRecipeToVPUser) { VPValue *Addr = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); VPValue *Mask = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 2)); InterleaveGroup IG(4, false, Align(4)); - VPInterleaveRecipe Recipe(&IG, Addr, {}, Mask, false); + VPInterleaveRecipe Recipe(&IG, Addr, {}, Mask, false, DebugLoc()); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); From 76fa9530c9ac7f81a49b840556f51f4838efbfe1 Mon Sep 17 00:00:00 2001 From: Theo de Magalhaes Date: Wed, 2 Apr 2025 23:46:58 +0200 Subject: [PATCH 0451/1029] [clang] add support for -Wpadded on Windows (#130182) Implements the -Wpadded warning for --target=x86_64-windows-msvc etc. Fixes #61702 . --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/AST/RecordLayoutBuilder.cpp | 65 +++++++++++++++---- .../test/SemaCXX/windows-Wpadded-bitfield.cpp | 32 +++++++++ clang/test/SemaCXX/windows-Wpadded.cpp | 40 ++++++++++++ 4 files changed, 128 insertions(+), 11 deletions(-) create mode 100644 clang/test/SemaCXX/windows-Wpadded-bitfield.cpp create mode 100644 clang/test/SemaCXX/windows-Wpadded.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 3f7f723bc96ce..7fb6b0baae16b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -190,6 +190,8 @@ Modified Compiler Flags - The compiler flag `-fbracket-depth` default value is increased from 256 to 2048. (#GH94728) +- `-Wpadded` option implemented for the `x86_64-windows-msvc` target. Fixes #61702 + Removed Compiler Flags ------------------------- diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 3e756ab9b9bfe..41e7198cb7581 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -2274,9 +2274,9 @@ static unsigned getPaddingDiagFromTagKind(TagTypeKind Tag) { } } -void ItaniumRecordLayoutBuilder::CheckFieldPadding( - uint64_t Offset, uint64_t UnpaddedOffset, uint64_t UnpackedOffset, - unsigned UnpackedAlign, bool isPacked, const FieldDecl *D) { +static void CheckFieldPadding(const ASTContext &Context, bool IsUnion, + uint64_t Offset, uint64_t UnpaddedOffset, + const FieldDecl *D) { // We let objc ivars without warning, objc interfaces generally are not used // for padding tricks. if (isa(D)) @@ -2300,7 +2300,8 @@ void ItaniumRecordLayoutBuilder::CheckFieldPadding( if (D->getIdentifier()) { auto Diagnostic = D->isBitField() ? diag::warn_padded_struct_bitfield : diag::warn_padded_struct_field; - Diag(D->getLocation(), Diagnostic) + Context.getDiagnostics().Report(D->getLocation(), + Diagnostic) << getPaddingDiagFromTagKind(D->getParent()->getTagKind()) << Context.getTypeDeclType(D->getParent()) << PadSize << (InBits ? 1 : 0) // (byte|bit) @@ -2308,15 +2309,22 @@ void ItaniumRecordLayoutBuilder::CheckFieldPadding( } else { auto Diagnostic = D->isBitField() ? diag::warn_padded_struct_anon_bitfield : diag::warn_padded_struct_anon_field; - Diag(D->getLocation(), Diagnostic) + Context.getDiagnostics().Report(D->getLocation(), + Diagnostic) << getPaddingDiagFromTagKind(D->getParent()->getTagKind()) << Context.getTypeDeclType(D->getParent()) << PadSize << (InBits ? 
1 : 0); // (byte|bit) } - } - if (isPacked && Offset != UnpackedOffset) { - HasPackedField = true; - } + } +} + +void ItaniumRecordLayoutBuilder::CheckFieldPadding( + uint64_t Offset, uint64_t UnpaddedOffset, uint64_t UnpackedOffset, + unsigned UnpackedAlign, bool isPacked, const FieldDecl *D) { + ::CheckFieldPadding(Context, IsUnion, Offset, UnpaddedOffset, D); + if (isPacked && Offset != UnpackedOffset) { + HasPackedField = true; + } } static const CXXMethodDecl *computeKeyFunction(ASTContext &Context, @@ -2642,8 +2650,6 @@ struct MicrosoftRecordLayoutBuilder { /// virtual base classes and their offsets in the record. ASTRecordLayout::VBaseOffsetsMapTy VBases; /// The number of remaining bits in our last bitfield allocation. - /// This value isn't meaningful unless LastFieldIsNonZeroWidthBitfield is - /// true. unsigned RemainingBitsInField; bool IsUnion : 1; /// True if the last field laid out was a bitfield and was not 0 @@ -3004,6 +3010,15 @@ void MicrosoftRecordLayoutBuilder::layoutField(const FieldDecl *FD) { } else { FieldOffset = Size.alignTo(Info.Alignment); } + + uint64_t UnpaddedFielddOffsetInBits = + Context.toBits(DataSize) - RemainingBitsInField; + + ::CheckFieldPadding(Context, IsUnion, Context.toBits(FieldOffset), + UnpaddedFielddOffsetInBits, FD); + + RemainingBitsInField = 0; + placeFieldAtOffset(FieldOffset); if (!IsOverlappingEmptyField) @@ -3049,10 +3064,14 @@ void MicrosoftRecordLayoutBuilder::layoutBitField(const FieldDecl *FD) { } else { // Allocate a new block of memory and place the bitfield in it. CharUnits FieldOffset = Size.alignTo(Info.Alignment); + uint64_t UnpaddedFieldOffsetInBits = + Context.toBits(DataSize) - RemainingBitsInField; placeFieldAtOffset(FieldOffset); Size = FieldOffset + Info.Size; Alignment = std::max(Alignment, Info.Alignment); RemainingBitsInField = Context.toBits(Info.Size) - Width; + ::CheckFieldPadding(Context, IsUnion, Context.toBits(FieldOffset), + UnpaddedFieldOffsetInBits, FD); } DataSize = Size; } @@ -3076,9 +3095,14 @@ MicrosoftRecordLayoutBuilder::layoutZeroWidthBitField(const FieldDecl *FD) { } else { // Round up the current record size to the field's alignment boundary. CharUnits FieldOffset = Size.alignTo(Info.Alignment); + uint64_t UnpaddedFieldOffsetInBits = + Context.toBits(DataSize) - RemainingBitsInField; placeFieldAtOffset(FieldOffset); + RemainingBitsInField = 0; Size = FieldOffset; Alignment = std::max(Alignment, Info.Alignment); + ::CheckFieldPadding(Context, IsUnion, Context.toBits(FieldOffset), + UnpaddedFieldOffsetInBits, FD); } DataSize = Size; } @@ -3203,6 +3227,9 @@ void MicrosoftRecordLayoutBuilder::layoutVirtualBases(const CXXRecordDecl *RD) { } void MicrosoftRecordLayoutBuilder::finalizeLayout(const RecordDecl *RD) { + uint64_t UnpaddedSizeInBits = Context.toBits(DataSize); + UnpaddedSizeInBits -= RemainingBitsInField; + // Respect required alignment. Note that in 32-bit mode Required alignment // may be 0 and cause size not to be updated. 
DataSize = Size; @@ -3231,6 +3258,22 @@ void MicrosoftRecordLayoutBuilder::finalizeLayout(const RecordDecl *RD) { Size = Context.toCharUnitsFromBits(External.Size); if (External.Align) Alignment = Context.toCharUnitsFromBits(External.Align); + return; + } + unsigned CharBitNum = Context.getTargetInfo().getCharWidth(); + uint64_t SizeInBits = Context.toBits(Size); + if (SizeInBits > UnpaddedSizeInBits) { + unsigned int PadSize = SizeInBits - UnpaddedSizeInBits; + bool InBits = true; + if (PadSize % CharBitNum == 0) { + PadSize = PadSize / CharBitNum; + InBits = false; + } + + Context.getDiagnostics().Report(RD->getLocation(), + diag::warn_padded_struct_size) + << Context.getTypeDeclType(RD) << PadSize + << (InBits ? 1 : 0); // (byte|bit) } } diff --git a/clang/test/SemaCXX/windows-Wpadded-bitfield.cpp b/clang/test/SemaCXX/windows-Wpadded-bitfield.cpp new file mode 100644 index 0000000000000..ee5a57124eca5 --- /dev/null +++ b/clang/test/SemaCXX/windows-Wpadded-bitfield.cpp @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -triple x86_64-windows-msvc -fsyntax-only -verify -Wpadded %s + +struct __attribute__((ms_struct)) BitfieldStruct { // expected-warning {{padding size of 'BitfieldStruct' with 3 bytes to alignment boundary}} + char c : 1; + int : 0; // expected-warning {{padding struct 'BitfieldStruct' with 31 bits to align anonymous bit-field}} + char i; +}; + +struct __attribute__((ms_struct)) SevenBitfieldStruct { // expected-warning {{padding size of 'SevenBitfieldStruct' with 3 bytes to alignment boundary}} + char c : 7; + int : 0; // expected-warning {{padding struct 'SevenBitfieldStruct' with 25 bits to align anonymous bit-field}} + char i; +}; + +struct __attribute__((ms_struct)) SameUnitSizeBitfield { + char c : 7; + char : 1; // Same unit size attributes fall in the same unit + they fill the unit -> no padding + char i; +}; + +struct __attribute__((ms_struct)) DifferentUnitSizeBitfield { // expected-warning {{padding size of 'DifferentUnitSizeBitfield' with 3 bytes to alignment boundary}} + char c : 7; + int : 1; // expected-warning {{padding struct 'DifferentUnitSizeBitfield' with 25 bits to align anonymous bit-field}} + char i; // expected-warning {{padding struct 'DifferentUnitSizeBitfield' with 31 bits to align 'i'}} +}; + +int main() { + BitfieldStruct b; + SevenBitfieldStruct s; + SameUnitSizeBitfield su; + DifferentUnitSizeBitfield du; +} diff --git a/clang/test/SemaCXX/windows-Wpadded.cpp b/clang/test/SemaCXX/windows-Wpadded.cpp new file mode 100644 index 0000000000000..da3f2bf08c6b8 --- /dev/null +++ b/clang/test/SemaCXX/windows-Wpadded.cpp @@ -0,0 +1,40 @@ +// RUN: %clang_cc1 -triple x86_64-windows-msvc -fsyntax-only -verify -Wpadded %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify -Wpadded %s + +struct __attribute__((ms_struct)) Foo { // expected-warning {{padding size of 'Foo' with 3 bytes to alignment boundary}} + int b : 1; + char a; // expected-warning {{padding struct 'Foo' with 31 bits to align 'a'}} +}; + +struct __attribute__((ms_struct)) AlignedStruct { // expected-warning {{padding size of 'AlignedStruct' with 4 bytes to alignment boundary}} + char c; + alignas(8) int i; // expected-warning {{padding struct 'AlignedStruct' with 7 bytes to align 'i'}} +}; + + +struct Base { + int b; +}; + +struct Derived : public Base { // expected-warning {{padding size of 'Derived' with 3 bytes to alignment boundary}} + char c; +}; + +union __attribute__((ms_struct)) Union { + char c; + long long u; +}; + +struct __attribute__((ms_struct)) StructWithUnion { // 
expected-warning {{padding size of 'StructWithUnion' with 6 bytes to alignment boundary}} + char c; + int : 0; + Union t; // expected-warning {{padding struct 'StructWithUnion' with 7 bytes to align 't'}} + short i; +}; + +int main() { + Foo f; + AlignedStruct a; + Derived d; + StructWithUnion swu; +} From acc6bcdc504ad2e8c09a628dc18de0067f7344b8 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 2 Apr 2025 14:53:55 -0700 Subject: [PATCH 0452/1029] Support alternative sections for patchable function entries (#131230) With -fpatchable-function-entry (or the patchable_function_entry function attribute), we emit records of patchable entry locations to the __patchable_function_entries section. Add an additional parameter to the command line option that allows one to specify a different default section name for the records, and an identical parameter to the function attribute that allows one to override the section used. The main use case for this change is the Linux kernel using prefix NOPs for ftrace, and thus depending on __patchable_function_entries to locate traceable functions. Functions that are not traceable currently disable entry NOPs using the function attribute, but this creates a compatibility issue with -fsanitize=kcfi, which expects all indirectly callable functions to have a type hash prefix at the same offset from the function entry. Adding a section parameter would allow the kernel to distinguish between traceable and non-traceable functions by adding entry records to separate sections while maintaining a stable function prefix layout for all functions. LKML discussion: https://lore.kernel.org/lkml/Y1QEzk%2FA41PKLEPe@hirez.programming.kicks-ass.net/ --- clang/include/clang/Basic/Attr.td | 3 +- clang/include/clang/Basic/AttrDocs.td | 10 +++-- clang/include/clang/Basic/CodeGenOptions.h | 4 ++ .../clang/Basic/DiagnosticSemaKinds.td | 4 ++ clang/include/clang/Driver/Options.td | 19 +++++++-- clang/lib/CodeGen/CodeGenFunction.cpp | 6 +++ clang/lib/Driver/ToolChains/Clang.cpp | 8 +++- clang/lib/Sema/SemaDeclAttr.cpp | 24 +++++++++-- .../patchable-function-entry-section.c | 41 +++++++++++++++++++ clang/test/Driver/fpatchable-function-entry.c | 3 ++ .../test/Sema/patchable-function-entry-attr.c | 8 +++- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 12 ++++-- llvm/lib/IR/Verifier.cpp | 5 +++ .../CodeGen/X86/patchable-function-entry.ll | 13 ++++++ .../invalid-patchable-function-entry.ll | 4 ++ 15 files changed, 146 insertions(+), 18 deletions(-) create mode 100644 clang/test/CodeGen/patchable-function-entry-section.c diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 0999d8065e9f5..fd9e686485552 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -936,7 +936,8 @@ def PatchableFunctionEntry "riscv64", "x86", "x86_64", "ppc", "ppc64"]>> { let Spellings = [GCC<"patchable_function_entry">]; let Subjects = SubjectList<[Function, ObjCMethod]>; - let Args = [UnsignedArgument<"Count">, DefaultIntArgument<"Offset", 0>]; + let Args = [UnsignedArgument<"Count">, DefaultIntArgument<"Offset", 0>, + StringArgument<"Section", /* optional */ 1>]; let Documentation = [PatchableFunctionEntryDocs]; } diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index a52ece467ec70..c8b371280e35d 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6502,10 +6502,12 @@ only N==1 is supported.
def PatchableFunctionEntryDocs : Documentation { let Category = DocCatFunction; let Content = [{ -``__attribute__((patchable_function_entry(N,M)))`` is used to generate M NOPs -before the function entry and N-M NOPs after the function entry. This attribute -takes precedence over the command line option ``-fpatchable-function-entry=N,M``. -``M`` defaults to 0 if omitted. +``__attribute__((patchable_function_entry(N,M,Section)))`` is used to generate M +NOPs before the function entry and N-M NOPs after the function entry, with a record of +the entry stored in section ``Section``. This attribute takes precedence over the +command line option ``-fpatchable-function-entry=N,M,Section``. ``M`` defaults to 0 +if omitted.``Section`` defaults to the ``-fpatchable-function-entry`` section name if +set, or to ``__patchable_function_entries`` otherwise. This attribute is only supported on aarch64/aarch64-be/loongarch32/loongarch64/riscv32/riscv64/i386/x86-64/ppc/ppc64 targets. diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index c531c656f42b7..e39a73bdb13ac 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -281,6 +281,10 @@ class CodeGenOptions : public CodeGenOptionsBase { /// -fprofile-generate, and -fcs-profile-generate. std::string InstrProfileOutput; + /// Name of the patchable function entry section with + /// -fpatchable-function-entry. + std::string PatchableFunctionEntrySection; + /// Name of the profile file to use with -fprofile-sample-use. std::string SampleProfileFile; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 10568a5ee87fc..3f9ba933582da 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3550,6 +3550,10 @@ def err_conflicting_codeseg_attribute : Error< def warn_duplicate_codeseg_attribute : Warning< "duplicate code segment specifiers">, InGroup
; +def err_attribute_patchable_function_entry_invalid_section + : Error<"section argument to 'patchable_function_entry' attribute is not " + "valid for this target: %0">; + def err_anonymous_property: Error< "anonymous property is not supported">; def err_property_is_variably_modified : Error< diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 534631e43d26d..e69b804de63b5 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3759,10 +3759,16 @@ defm pascal_strings : BoolFOption<"pascal-strings", // Note: This flag has different semantics in the driver and in -cc1. The driver accepts -fpatchable-function-entry=M,N // and forwards it to -cc1 as -fpatchable-function-entry=M and -fpatchable-function-entry-offset=N. In -cc1, both flags // are treated as a single integer. -def fpatchable_function_entry_EQ : Joined<["-"], "fpatchable-function-entry=">, Group, - Visibility<[ClangOption, CC1Option]>, - MetaVarName<"">, HelpText<"Generate M NOPs before function entry and N-M NOPs after function entry">, - MarshallingInfoInt>; +def fpatchable_function_entry_EQ + : Joined<["-"], "fpatchable-function-entry=">, + Group, + Visibility<[ClangOption, CC1Option]>, + MetaVarName<"">, + HelpText<"Generate M NOPs before function entry and N-M NOPs after " + "function entry. " + "If section is specified, use it instead of " + "__patchable_function_entries.">, + MarshallingInfoInt>; def fms_hotpatch : Flag<["-"], "fms-hotpatch">, Group, Visibility<[ClangOption, CC1Option, CLOption]>, HelpText<"Ensure that all functions can be hotpatched at runtime">, @@ -7593,6 +7599,11 @@ def fpatchable_function_entry_offset_EQ : Joined<["-"], "fpatchable-function-entry-offset=">, MetaVarName<"">, HelpText<"Generate M NOPs before function entry">, MarshallingInfoInt>; +def fpatchable_function_entry_section_EQ + : Joined<["-"], "fpatchable-function-entry-section=">, + MetaVarName<"
">, + HelpText<"Use Section instead of __patchable_function_entries">, + MarshallingInfoString>; def fprofile_instrument_EQ : Joined<["-"], "fprofile-instrument=">, HelpText<"Enable PGO instrumentation">, Values<"none,clang,llvm,csllvm">, NormalizedValuesScope<"CodeGenOptions">, diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index dcf523f56bf1e..b55003b9b0bbb 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -965,18 +965,24 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, } unsigned Count, Offset; + StringRef Section; if (const auto *Attr = D ? D->getAttr() : nullptr) { Count = Attr->getCount(); Offset = Attr->getOffset(); + Section = Attr->getSection(); } else { Count = CGM.getCodeGenOpts().PatchableFunctionEntryCount; Offset = CGM.getCodeGenOpts().PatchableFunctionEntryOffset; } + if (Section.empty()) + Section = CGM.getCodeGenOpts().PatchableFunctionEntrySection; if (Count && Offset <= Count) { Fn->addFnAttr("patchable-function-entry", std::to_string(Count - Offset)); if (Offset) Fn->addFnAttr("patchable-function-prefix", std::to_string(Offset)); + if (!Section.empty()) + Fn->addFnAttr("patchable-function-entry-section", Section); } // Instruct that functions for COFF/CodeView targets should start with a // patchable instruction, but only on x86/x64. Don't forward this to ARM/ARM64 diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 5f45cf0865b9e..db4f50fc98f7c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6917,8 +6917,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; else if (S.consumeInteger(10, Size) || - (!S.empty() && (!S.consume_front(",") || - S.consumeInteger(10, Offset) || !S.empty()))) + (!S.empty() && + (!S.consume_front(",") || S.consumeInteger(10, Offset))) || + (!S.empty() && (!S.consume_front(",") || S.empty()))) D.Diag(diag::err_drv_invalid_argument_to_option) << S0 << A->getOption().getName(); else if (Size < Offset) @@ -6927,6 +6928,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Args.MakeArgString(A->getSpelling() + Twine(Size))); CmdArgs.push_back(Args.MakeArgString( "-fpatchable-function-entry-offset=" + Twine(Offset))); + if (!S.empty()) + CmdArgs.push_back( + Args.MakeArgString("-fpatchable-function-entry-section=" + S)); } } diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 6cb6f6d105a32..0b844b44930b9 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5833,9 +5833,10 @@ static void handlePatchableFunctionEntryAttr(Sema &S, Decl *D, return; } uint32_t Count = 0, Offset = 0; + StringRef Section; if (!S.checkUInt32Argument(AL, AL.getArgAsExpr(0), Count, 0, true)) return; - if (AL.getNumArgs() == 2) { + if (AL.getNumArgs() >= 2) { Expr *Arg = AL.getArgAsExpr(1); if (!S.checkUInt32Argument(AL, Arg, Offset, 1, true)) return; @@ -5845,8 +5846,25 @@ static void handlePatchableFunctionEntryAttr(Sema &S, Decl *D, return; } } - D->addAttr(::new (S.Context) - PatchableFunctionEntryAttr(S.Context, AL, Count, Offset)); + if (AL.getNumArgs() == 3) { + SourceLocation LiteralLoc; + if (!S.checkStringLiteralArgumentAttr(AL, 2, Section, &LiteralLoc)) + return; + if (llvm::Error E = S.isValidSectionSpecifier(Section)) { + S.Diag(LiteralLoc, + 
diag::err_attribute_patchable_function_entry_invalid_section) + << toString(std::move(E)); + return; + } + if (Section.empty()) { + S.Diag(LiteralLoc, + diag::err_attribute_patchable_function_entry_invalid_section) + << "section must not be empty"; + return; + } + } + D->addAttr(::new (S.Context) PatchableFunctionEntryAttr(S.Context, AL, Count, + Offset, Section)); } static void handleBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { diff --git a/clang/test/CodeGen/patchable-function-entry-section.c b/clang/test/CodeGen/patchable-function-entry-section.c new file mode 100644 index 0000000000000..4c0d2a1baf77b --- /dev/null +++ b/clang/test/CodeGen/patchable-function-entry-section.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -triple aarch64 -emit-llvm %s -o - | FileCheck --check-prefixes=COMMON,NODEFAULT %s +// RUN: %clang_cc1 -triple x86_64 -emit-llvm %s -fpatchable-function-entry=1 -fpatchable-function-entry-section=__default_section -o - | FileCheck --check-prefixes=COMMON,DEFAULT %s + +// COMMON: define{{.*}} void @f0() #0 +__attribute__((patchable_function_entry(0))) void f0(void) {} + +// COMMON: define{{.*}} void @f00() #0 +__attribute__((patchable_function_entry(0, 0, "__unused_section"))) void f00(void) {} + +// COMMON: define{{.*}} void @f2() #1 +__attribute__((patchable_function_entry(2))) void f2(void) {} + +// COMMON: define{{.*}} void @f20() #2 +__attribute__((patchable_function_entry(2, 0, "__attr_section"))) void f20(void) {} + +// COMMON: define{{.*}} void @f44() #3 +__attribute__((patchable_function_entry(4, 4))) void f44(void) {} + +// COMMON: define{{.*}} void @f52() #4 +__attribute__((patchable_function_entry(5, 2, "__attr_section"))) void f52(void) {} + +// OPT: define{{.*}} void @f() #5 +void f(void) {} + +/// No need to emit "patchable-function-entry" and thus also "patchable-function-entry-section" +// COMMON: attributes #0 = { {{.*}} +// COMMON-NOT: "patchable-function-entry-section" + +// NODEFAULT: attributes #1 = { {{.*}} "patchable-function-entry"="2" +// NODEFAULT-NOT: "patchable-function-entry-section" +// DEFAULT: attributes #1 = { {{.*}} "patchable-function-entry"="2" "patchable-function-entry-section"="__default_section" + +// COMMON: attributes #2 = { {{.*}} "patchable-function-entry"="2" "patchable-function-entry-section"="__attr_section" + +// NODEFAULT: attributes #3 = { {{.*}} "patchable-function-entry"="0" "patchable-function-prefix"="4" +// NODEFAULT-NOT: "patchable-function-entry-section" +// DEFAULT: attributes #3 = { {{.*}} "patchable-function-entry"="0" "patchable-function-entry-section"="__default_section" "patchable-function-prefix"="4" + +// COMMON: attributes #4 = { {{.*}} "patchable-function-entry"="3" "patchable-function-entry-section"="__attr_section" "patchable-function-prefix"="2" + +// DEFAULT: attributes #5 = { {{.*}} "patchable-function-entry"="1" "patchable-function-entry-section"="__default_section" diff --git a/clang/test/Driver/fpatchable-function-entry.c b/clang/test/Driver/fpatchable-function-entry.c index 5f07ca99a69de..43be6c5a47e47 100644 --- a/clang/test/Driver/fpatchable-function-entry.c +++ b/clang/test/Driver/fpatchable-function-entry.c @@ -15,6 +15,9 @@ // RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=2,1 -c -### 2>&1 | FileCheck --check-prefix=21 %s // 21: "-fpatchable-function-entry=2" "-fpatchable-function-entry-offset=1" +// RUN: %clang --target=aarch64 -fsyntax-only %s -fpatchable-function-entry=1,1,__section_name -c -### 2>&1 | FileCheck --check-prefix=SECTION %s +// SECTION: 
"-fpatchable-function-entry=1" "-fpatchable-function-entry-offset=1" "-fpatchable-function-entry-section=__section_name" + // RUN: not %clang --target=powerpc64-ibm-aix-xcoff -fsyntax-only %s -fpatchable-function-entry=1 2>&1 | FileCheck --check-prefix=AIX64 %s // AIX64: error: unsupported option '-fpatchable-function-entry=1' for target 'powerpc64-ibm-aix-xcoff' diff --git a/clang/test/Sema/patchable-function-entry-attr.c b/clang/test/Sema/patchable-function-entry-attr.c index 89e4380c36230..f453e134ab625 100644 --- a/clang/test/Sema/patchable-function-entry-attr.c +++ b/clang/test/Sema/patchable-function-entry-attr.c @@ -3,9 +3,15 @@ // expected-error@+1 {{'patchable_function_entry' attribute takes at least 1 argument}} __attribute__((patchable_function_entry)) void f(void); -// expected-error@+1 {{'patchable_function_entry' attribute takes no more than 2 arguments}} +// expected-error@+1 {{expected string literal as argument of 'patchable_function_entry' attribute}} __attribute__((patchable_function_entry(0, 0, 0))) void f(void); +// expected-error@+1 {{section argument to 'patchable_function_entry' attribute is not valid for this target}} +__attribute__((patchable_function_entry(0, 0, ""))) void f(void); + +// expected-error@+1 {{'patchable_function_entry' attribute takes no more than 3 arguments}} +__attribute__((patchable_function_entry(0, 0, "__section", 0))) void f(void); + // expected-error@+1 {{'patchable_function_entry' attribute requires a non-negative integral compile time constant expression}} __attribute__((patchable_function_entry(-1))) void f(void); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 2d76aa5488333..0deaf94502b11 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -4602,7 +4602,13 @@ void AsmPrinter::emitPatchableFunctionEntries() { if (TM.getTargetTriple().isOSBinFormatELF()) { auto Flags = ELF::SHF_WRITE | ELF::SHF_ALLOC; const MCSymbolELF *LinkedToSym = nullptr; - StringRef GroupName; + StringRef GroupName, SectionName; + + if (F.hasFnAttribute("patchable-function-entry-section")) + SectionName = F.getFnAttribute("patchable-function-entry-section") + .getValueAsString(); + if (SectionName.empty()) + SectionName = "__patchable_function_entries"; // GNU as < 2.35 did not support section flag 'o'. GNU ld < 2.36 did not // support mixed SHF_LINK_ORDER and non-SHF_LINK_ORDER sections. 
@@ -4615,8 +4621,8 @@ void AsmPrinter::emitPatchableFunctionEntries() {
       LinkedToSym = cast<MCSymbolELF>(CurrentFnSym);
     }
     OutStreamer->switchSection(OutContext.getELFSection(
-        "__patchable_function_entries", ELF::SHT_PROGBITS, Flags, 0, GroupName,
-        F.hasComdat(), MCSection::NonUniqueID, LinkedToSym));
+        SectionName, ELF::SHT_PROGBITS, Flags, 0, GroupName, F.hasComdat(),
+        MCSection::NonUniqueID, LinkedToSym));
     emitAlignment(Align(PointerSize));
     OutStreamer->emitSymbolValue(CurrentPatchableFunctionEntrySym, PointerSize);
   }
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 95dd3aa86b428..7c6cd414554e3 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -2409,6 +2409,11 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
 
   checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-prefix", V);
   checkUnsignedBaseTenFuncAttr(Attrs, "patchable-function-entry", V);
+  if (Attrs.hasFnAttr("patchable-function-entry-section"))
+    Check(!Attrs.getFnAttr("patchable-function-entry-section")
+               .getValueAsString()
+               .empty(),
+          "\"patchable-function-entry-section\" must not be empty");
   checkUnsignedBaseTenFuncAttr(Attrs, "warn-stack-size", V);
 
   if (auto A = Attrs.getFnAttr("sign-return-address"); A.isValid()) {
diff --git a/llvm/test/CodeGen/X86/patchable-function-entry.ll b/llvm/test/CodeGen/X86/patchable-function-entry.ll
index 54ecd8b1e5daf..d6dfe00c74991 100644
--- a/llvm/test/CodeGen/X86/patchable-function-entry.ll
+++ b/llvm/test/CodeGen/X86/patchable-function-entry.ll
@@ -98,3 +98,16 @@ define void @f3_2() "patchable-function-entry"="1" "patchable-function-prefix"="
   %frame = alloca i8, i32 16
   ret void
 }
+
+define void @s1() "patchable-function-entry"="1" "patchable-function-entry-section"=".entries" {
+; CHECK-LABEL: s1:
+; CHECK-NEXT: .Lfunc_begin6:
+; CHECK: nop
+; CHECK-NEXT: ret
+; CHECK: .section .entries,"awo",@progbits,s1{{$}}
+; X86: .p2align 2
+; X86-NEXT: .long .Lfunc_begin6
+; X64: .p2align 3
+; X64-NEXT: .quad .Lfunc_begin6
+  ret void
+}
diff --git a/llvm/test/Verifier/invalid-patchable-function-entry.ll b/llvm/test/Verifier/invalid-patchable-function-entry.ll
index e74037a28abe6..a86cd89ae7ef9 100644
--- a/llvm/test/Verifier/invalid-patchable-function-entry.ll
+++ b/llvm/test/Verifier/invalid-patchable-function-entry.ll
@@ -19,3 +19,7 @@ define void @g() "patchable-function-prefix" { ret void }
 define void @ga() "patchable-function-prefix"="a" { ret void }
 define void @g_1() "patchable-function-prefix"="-1" { ret void }
 define void @g3comma() "patchable-function-prefix"="3," { ret void }
+
+; CHECK: "patchable-function-entry-section" must not be empty
+
+define void @s1() "patchable-function-entry"="1" "patchable-function-entry-section" { ret void }

From 8100bd58a3fc87576bf6d57f79e6bd70e10b83d3 Mon Sep 17 00:00:00 2001
From: Hansang Bae
Date: Wed, 2 Apr 2025 17:16:30 -0500
Subject: [PATCH 0453/1029] [OpenMP] 6.0 (TR11) Memory Management Update
 (#97106)

TR11 introduced changes to support target memory management in a
unified way by defining a series of API routines and additional traits.
Host runtime is oblivious to how actual memory resources are mapped
when using the new API routines, so it can only support how the
composed memory space is maintained, and the offload backend must
handle which memory resources are actually used to allocate memory from
the memory space. Here is a summary of the implementation.

* Implemented 12 API routines to get/manipulate memory space/allocator.
* Memory space composed with a list of devices has a state with resource description, and runtime is responsible for maintaining the allocated memory space objects. * Defined interface with offload runtime to access memory resource list, and to redirect calls to omp_alloc/omp_free since it requires backend-specific information. * Value of omp_default_mem_space changed from 0 to 99, and omp_null_mem_space took the value 0 as defined in the language. * New allocator traits were introduced, but how to use them is up to the offload backend. * Added basic tests for the new API routines. --- openmp/runtime/src/dllexports | 13 + openmp/runtime/src/include/omp.h.var | 34 +- openmp/runtime/src/include/omp_lib.F90.var | 106 ++++++- openmp/runtime/src/include/omp_lib.h.var | 120 ++++++- openmp/runtime/src/kmp.h | 48 ++- openmp/runtime/src/kmp_alloc.cpp | 293 +++++++++++++++++- openmp/runtime/src/kmp_ftn_entry.h | 123 ++++++++ openmp/runtime/src/kmp_ftn_os.h | 48 +++ openmp/runtime/src/kmp_global.cpp | 5 +- openmp/runtime/src/kmp_runtime.cpp | 5 +- openmp/runtime/src/kmp_stub.cpp | 3 +- .../runtime/test/api/omp60_memory_routines.c | 228 ++++++++++++++ 12 files changed, 1006 insertions(+), 20 deletions(-) create mode 100644 openmp/runtime/test/api/omp60_memory_routines.c diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports index 0667d53c35a18..3983dae80c9f5 100644 --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -532,6 +532,18 @@ kmp_set_disp_num_buffers 890 omp_get_device_num 896 omp_init_allocator 897 omp_destroy_allocator 898 + omp_get_devices_memspace 810 + omp_get_device_memspace 811 + omp_get_devices_and_host_memspace 812 + omp_get_device_and_host_memspace 813 + omp_get_devices_all_memspace 814 + omp_get_devices_allocator 815 + omp_get_device_allocator 816 + omp_get_devices_and_host_allocator 817 + omp_get_device_and_host_allocator 818 + omp_get_devices_all_allocator 819 + omp_get_memspace_num_resources 820 + omp_get_submemspace 821 %ifndef stub __kmpc_set_default_allocator __kmpc_get_default_allocator @@ -592,6 +604,7 @@ kmp_set_disp_num_buffers 890 llvm_omp_target_host_mem_space DATA llvm_omp_target_shared_mem_space DATA llvm_omp_target_device_mem_space DATA + omp_null_mem_space DATA %ifndef stub # Ordinals between 900 and 999 are reserved diff --git a/openmp/runtime/src/include/omp.h.var b/openmp/runtime/src/include/omp.h.var index 82f9d07657ffb..74f385feb3ea5 100644 --- a/openmp/runtime/src/include/omp.h.var +++ b/openmp/runtime/src/include/omp.h.var @@ -339,7 +339,13 @@ omp_atk_fallback = 5, omp_atk_fb_data = 6, omp_atk_pinned = 7, - omp_atk_partition = 8 + omp_atk_partition = 8, + omp_atk_pin_device = 9, + omp_atk_preferred_device = 10, + omp_atk_device_access = 11, + omp_atk_target_access = 12, + omp_atk_atomic_scope = 13, + omp_atk_part_size = 14 } omp_alloctrait_key_t; typedef enum { @@ -350,7 +356,7 @@ omp_atv_serialized = 5, omp_atv_sequential = omp_atv_serialized, // (deprecated) omp_atv_private = 6, - omp_atv_all = 7, + omp_atv_device = 7, omp_atv_thread = 8, omp_atv_pteam = 9, omp_atv_cgroup = 10, @@ -361,7 +367,11 @@ omp_atv_environment = 15, omp_atv_nearest = 16, omp_atv_blocked = 17, - omp_atv_interleaved = 18 + omp_atv_interleaved = 18, + omp_atv_all = 19, + omp_atv_single = 20, + omp_atv_multiple = 21, + omp_atv_memspace = 22 } omp_alloctrait_value_t; #define omp_atv_default ((omp_uintptr_t)-1) @@ -387,6 +397,7 @@ extern __KMP_IMP omp_allocator_handle_t const llvm_omp_target_device_mem_alloc; typedef omp_uintptr_t 
omp_memspace_handle_t; + extern __KMP_IMP omp_memspace_handle_t const omp_null_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_default_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_large_cap_mem_space; extern __KMP_IMP omp_memspace_handle_t const omp_const_mem_space; @@ -422,7 +433,8 @@ typedef enum omp_memspace_handle_t # endif { - omp_default_mem_space = 0, + omp_null_mem_space = 0, + omp_default_mem_space = 99, omp_large_cap_mem_space = 1, omp_const_mem_space = 2, omp_high_bw_mem_space = 3, @@ -463,6 +475,20 @@ extern void __KAI_KMPC_CONVENTION omp_free(void *ptr, omp_allocator_handle_t a); # endif + /* OpenMP TR11 routines to get memory spaces and allocators */ + extern omp_memspace_handle_t omp_get_devices_memspace(int ndevs, const int *devs, omp_memspace_handle_t memspace); + extern omp_memspace_handle_t omp_get_device_memspace(int dev, omp_memspace_handle_t memspace); + extern omp_memspace_handle_t omp_get_devices_and_host_memspace(int ndevs, const int *devs, omp_memspace_handle_t memspace); + extern omp_memspace_handle_t omp_get_device_and_host_memspace(int dev, omp_memspace_handle_t memspace); + extern omp_memspace_handle_t omp_get_devices_all_memspace(omp_memspace_handle_t memspace); + extern omp_allocator_handle_t omp_get_devices_allocator(int ndevs, const int *devs, omp_memspace_handle_t memspace); + extern omp_allocator_handle_t omp_get_device_allocator(int dev, omp_memspace_handle_t memspace); + extern omp_allocator_handle_t omp_get_devices_and_host_allocator(int ndevs, const int *devs, omp_memspace_handle_t memspace); + extern omp_allocator_handle_t omp_get_device_and_host_allocator(int dev, omp_memspace_handle_t memspace); + extern omp_allocator_handle_t omp_get_devices_all_allocator(omp_memspace_handle_t memspace); + extern int omp_get_memspace_num_resources(omp_memspace_handle_t memspace); + extern omp_memspace_handle_t omp_get_submemspace(omp_memspace_handle_t memspace, int num_resources, int *resources); + /* OpenMP 5.0 Affinity Format */ extern void __KAI_KMPC_CONVENTION omp_set_affinity_format(char const *); extern size_t __KAI_KMPC_CONVENTION omp_get_affinity_format(char *, size_t); diff --git a/openmp/runtime/src/include/omp_lib.F90.var b/openmp/runtime/src/include/omp_lib.F90.var index 5133915c7d8cb..3463b698291e1 100644 --- a/openmp/runtime/src/include/omp_lib.F90.var +++ b/openmp/runtime/src/include/omp_lib.F90.var @@ -145,6 +145,12 @@ integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_fb_data = 6 integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_pinned = 7 integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_partition = 8 + integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_pin_device = 9 + integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_preferred_device = 10 + integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_device_access = 11 + integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_target_access = 12 + integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_atomic_scope = 13 + integer (kind=omp_alloctrait_key_kind), parameter, public :: omp_atk_part_size = 14 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_default = -1 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_false = 0 @@ -154,7 +160,7 @@ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_serialized = 5 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_sequential 
= omp_atv_serialized integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_private = 6 - integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_all = 7 + integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_device = 7 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_thread = 8 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_pteam = 9 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_cgroup = 10 @@ -166,6 +172,10 @@ integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_nearest = 16 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_blocked = 17 integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_interleaved = 18 + integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_all = 19 + integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_single = 20 + integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_multiple = 21 + integer (kind=omp_alloctrait_val_kind), parameter, public :: omp_atv_memspace = 22 integer (kind=omp_allocator_handle_kind), parameter, public :: omp_null_allocator = 0 integer (kind=omp_allocator_handle_kind), parameter, public :: omp_default_mem_alloc = 1 @@ -180,7 +190,8 @@ integer (kind=omp_allocator_handle_kind), parameter, public :: llvm_omp_target_shared_mem_alloc = 101 integer (kind=omp_allocator_handle_kind), parameter, public :: llvm_omp_target_device_mem_alloc = 102 - integer (kind=omp_memspace_handle_kind), parameter, public :: omp_default_mem_space = 0 + integer (kind=omp_memspace_handle_kind), parameter, public :: omp_null_mem_space = 0 + integer (kind=omp_memspace_handle_kind), parameter, public :: omp_default_mem_space = 99 integer (kind=omp_memspace_handle_kind), parameter, public :: omp_large_cap_mem_space = 1 integer (kind=omp_memspace_handle_kind), parameter, public :: omp_const_mem_space = 2 integer (kind=omp_memspace_handle_kind), parameter, public :: omp_high_bw_mem_space = 3 @@ -802,6 +813,97 @@ logical (kind=omp_logical_kind) omp_in_explicit_task end function omp_in_explicit_task + function omp_get_devices_memspace(ndevs, devs, memspace) + use omp_lib_kinds + integer(omp_memspace_handle_kind) :: omp_get_devices_memspace + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_memspace + + function omp_get_device_memspace(dev, memspace) + use omp_lib_kinds + integer(omp_memspace_handle_kind) :: omp_get_device_memspace + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_memspace + + function omp_get_devices_and_host_memspace(ndevs, devs, memspace) + use omp_lib_kinds + integer(omp_memspace_handle_kind) :: & + omp_get_devices_and_host_memspace + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_and_host_memspace + + function omp_get_device_and_host_memspace(dev, memspace) + use omp_lib_kinds + integer(omp_memspace_handle_kind) :: & + omp_get_device_and_host_memspace + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_and_host_memspace + + function omp_get_devices_all_memspace(memspace) + use omp_lib_kinds + integer(omp_memspace_handle_kind) :: omp_get_devices_all_memspace + integer(omp_memspace_handle_kind), intent(in) :: memspace + 
end function omp_get_devices_all_memspace + + function omp_get_devices_allocator(ndevs, devs, memspace) + use omp_lib_kinds + integer(omp_allocator_handle_kind) :: omp_get_devices_allocator + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_allocator + + function omp_get_device_allocator(dev, memspace) + use omp_lib_kinds + integer(omp_allocator_handle_kind) :: omp_get_device_allocator + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_allocator + + function omp_get_devices_and_host_allocator(ndevs, devs, memspace) + use omp_lib_kinds + integer(omp_allocator_handle_kind) :: & + omp_get_devices_and_host_allocator + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_and_host_allocator + + function omp_get_device_and_host_allocator(dev, memspace) + use omp_lib_kinds + integer(omp_allocator_handle_kind) :: & + omp_get_device_and_host_allocator + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_and_host_allocator + + function omp_get_devices_all_allocator(memspace) + use omp_lib_kinds + integer(omp_allocator_handle_kind) :: & + omp_get_devices_all_allocator + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_all_allocator + + function omp_get_memspace_num_resources(memspace) + use omp_lib_kinds + integer omp_get_memspace_num_resources + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_memspace_num_resources + + function omp_get_submemspace(memspace, num_resources, resources) + use omp_lib_kinds + integer(omp_memspace_handle_kind) omp_get_submemspace + integer(omp_memspace_handle_kind), intent(in) :: memspace + integer, intent(in) :: num_resources + integer, intent(in) :: resources(*) + end function omp_get_submemspace + ! *** ! *** kmp_* entry points ! 
*** diff --git a/openmp/runtime/src/include/omp_lib.h.var b/openmp/runtime/src/include/omp_lib.h.var index db1dc889d1299..5793a3ac2e685 100644 --- a/openmp/runtime/src/include/omp_lib.h.var +++ b/openmp/runtime/src/include/omp_lib.h.var @@ -151,6 +151,18 @@ parameter(omp_atk_pinned=7) integer(kind=omp_alloctrait_key_kind)omp_atk_partition parameter(omp_atk_partition=8) + integer(kind=omp_alloctrait_key_kind)omp_atk_pin_device + parameter(omp_atk_pin_device=9) + integer(kind=omp_alloctrait_key_kind)omp_atk_preferred_device + parameter(omp_atk_preferred_device=10) + integer(kind=omp_alloctrait_key_kind)omp_atk_device_access + parameter(omp_atk_device_access=11) + integer(kind=omp_alloctrait_key_kind)omp_atk_target_access + parameter(omp_atk_target_access=12) + integer(kind=omp_alloctrait_key_kind)omp_atk_atomic_scope + parameter(omp_atk_atomic_scope=13) + integer(kind=omp_alloctrait_key_kind)omp_atk_part_size + parameter(omp_atk_part_size=14) integer(kind=omp_alloctrait_val_kind)omp_atv_default parameter(omp_atv_default=-1) @@ -170,8 +182,8 @@ parameter(omp_atv_sequential=5) integer(kind=omp_alloctrait_val_kind)omp_atv_private parameter(omp_atv_private=6) - integer(kind=omp_alloctrait_val_kind)omp_atv_all - parameter(omp_atv_all=7) + integer(kind=omp_alloctrait_val_kind)omp_atv_device + parameter(omp_atv_device=7) integer(kind=omp_alloctrait_val_kind)omp_atv_thread parameter(omp_atv_thread=8) integer(kind=omp_alloctrait_val_kind)omp_atv_pteam @@ -194,6 +206,14 @@ parameter(omp_atv_blocked=17) integer(kind=omp_alloctrait_val_kind)omp_atv_interleaved parameter(omp_atv_interleaved=18) + integer(kind=omp_alloctrait_val_kind)omp_atv_all + parameter(omp_atv_all=19) + integer(kind=omp_alloctrait_val_kind)omp_atv_single + parameter(omp_atv_single=20) + integer(kind=omp_alloctrait_val_kind)omp_atv_multiple + parameter(omp_atv_multiple=21) + integer(kind=omp_alloctrait_val_kind)omp_atv_memspace + parameter(omp_atv_memspace=22) type omp_alloctrait integer (kind=omp_alloctrait_key_kind) key @@ -225,8 +245,10 @@ integer(omp_allocator_handle_kind)llvm_omp_target_device_mem_alloc parameter(llvm_omp_target_device_mem_alloc=102) + integer(kind=omp_memspace_handle_kind)omp_null_mem_space + parameter(omp_null_mem_space=0) integer(kind=omp_memspace_handle_kind)omp_default_mem_space - parameter(omp_default_mem_space=0) + parameter(omp_default_mem_space=99) integer(kind=omp_memspace_handle_kind)omp_large_cap_mem_space parameter(omp_large_cap_mem_space=1) integer(kind=omp_memspace_handle_kind)omp_const_mem_space @@ -863,6 +885,98 @@ logical (kind=omp_logical_kind) omp_in_explicit_task end function omp_in_explicit_task + function omp_get_devices_memspace(ndevs, devs, memspace) + import + integer(omp_memspace_handle_kind) :: omp_get_devices_memspace + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_memspace + + function omp_get_device_memspace(dev, memspace) + import + integer(omp_memspace_handle_kind) :: omp_get_device_memspace + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_memspace + + function omp_get_devices_and_host_memspace(ndevs,devs,memspace) + import + integer(omp_memspace_handle_kind) :: & + & omp_get_devices_and_host_memspace + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_and_host_memspace + + function 
omp_get_device_and_host_memspace(dev, memspace) + import + integer(omp_memspace_handle_kind) :: & + & omp_get_device_and_host_memspace + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_and_host_memspace + + function omp_get_devices_all_memspace(memspace) + import + integer(omp_memspace_handle_kind)::omp_get_devices_all_memspace + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_all_memspace + + function omp_get_devices_allocator(ndevs, devs, memspace) + import + integer(omp_allocator_handle_kind)::omp_get_devices_allocator + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_allocator + + function omp_get_device_allocator(dev, memspace) + import + integer(omp_allocator_handle_kind) :: omp_get_device_allocator + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_allocator + + function omp_get_devices_and_host_allocator(ndevs,devs,memspace) + import + integer(omp_allocator_handle_kind) :: & + & omp_get_devices_and_host_allocator + integer, intent(in) :: ndevs + integer, intent(in) :: devs(*) + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_and_host_allocator + + function omp_get_device_and_host_allocator(dev, memspace) + import + integer(omp_allocator_handle_kind) :: & + & omp_get_device_and_host_allocator + integer, intent(in) :: dev + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_device_and_host_allocator + + function omp_get_devices_all_allocator(memspace) + import + integer(omp_allocator_handle_kind) :: & + & omp_get_devices_all_allocator + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_devices_all_allocator + + function omp_get_memspace_num_resources(memspace) + import + integer omp_get_memspace_num_resources + integer(omp_memspace_handle_kind), intent(in) :: memspace + end function omp_get_memspace_num_resources + + function omp_get_submemspace(memspace, num_resources, resources) + import + integer(omp_memspace_handle_kind) omp_get_submemspace + integer(omp_memspace_handle_kind), intent(in) :: memspace + integer, intent(in) :: num_resources + integer, intent(in) :: resources(*) + end function omp_get_submemspace + + ! *** ! *** kmp_* entry points ! 
***
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 28a5522f3a582..d5d667c32c643 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1046,7 +1046,13 @@
   omp_atk_fallback = 5,
   omp_atk_fb_data = 6,
   omp_atk_pinned = 7,
-  omp_atk_partition = 8
+  omp_atk_partition = 8,
+  omp_atk_pin_device = 9,
+  omp_atk_preferred_device = 10,
+  omp_atk_device_access = 11,
+  omp_atk_target_access = 12,
+  omp_atk_atomic_scope = 13,
+  omp_atk_part_size = 14
 } omp_alloctrait_key_t;
 
 typedef enum {
@@ -1057,7 +1063,7 @@ typedef enum {
   omp_atv_serialized = 5,
   omp_atv_sequential = omp_atv_serialized, // (deprecated)
   omp_atv_private = 6,
-  omp_atv_all = 7,
+  omp_atv_device = 7,
   omp_atv_thread = 8,
   omp_atv_pteam = 9,
   omp_atv_cgroup = 10,
@@ -1068,11 +1074,16 @@ typedef enum {
   omp_atv_environment = 15,
   omp_atv_nearest = 16,
   omp_atv_blocked = 17,
-  omp_atv_interleaved = 18
+  omp_atv_interleaved = 18,
+  omp_atv_all = 19,
+  omp_atv_single = 20,
+  omp_atv_multiple = 21,
+  omp_atv_memspace = 22
 } omp_alloctrait_value_t;
 #define omp_atv_default ((omp_uintptr_t)-1)
 
 typedef void *omp_memspace_handle_t;
+extern omp_memspace_handle_t const omp_null_mem_space;
 extern omp_memspace_handle_t const omp_default_mem_space;
 extern omp_memspace_handle_t const omp_large_cap_mem_space;
 extern omp_memspace_handle_t const omp_const_mem_space;
@@ -1081,6 +1092,7 @@ extern omp_memspace_handle_t const omp_low_lat_mem_space;
 extern omp_memspace_handle_t const llvm_omp_target_host_mem_space;
 extern omp_memspace_handle_t const llvm_omp_target_shared_mem_space;
 extern omp_memspace_handle_t const llvm_omp_target_device_mem_space;
+extern omp_memspace_handle_t const kmp_max_mem_space;
 
 typedef struct {
   omp_alloctrait_key_t key;
@@ -1109,8 +1121,15 @@ extern omp_allocator_handle_t __kmp_def_allocator;
 extern int __kmp_memkind_available;
 extern bool __kmp_hwloc_available;
 
-typedef omp_memspace_handle_t kmp_memspace_t; // placeholder
+/// Memory space information is shared with offload runtime.
+typedef struct kmp_memspace_t {
+  omp_memspace_handle_t memspace; // predefined input memory space
+  int num_resources = 0; // number of available resources
+  int *resources = nullptr; // available resources
+  kmp_memspace_t *next = nullptr; // next memory space handle
+} kmp_memspace_t;
 
+/// Memory allocator information is shared with offload runtime.
 typedef struct kmp_allocator_t {
   omp_memspace_handle_t memspace;
   void **memkind; // pointer to memkind
@@ -1120,6 +1139,12 @@ typedef struct kmp_allocator_t {
   kmp_uint64 pool_size;
   kmp_uint64 pool_used;
   bool pinned;
+  omp_alloctrait_value_t partition;
+  int pin_device;
+  int preferred_device;
+  omp_alloctrait_value_t target_access;
+  omp_alloctrait_value_t atomic_scope;
+  size_t part_size;
 #if KMP_USE_HWLOC
   omp_alloctrait_value_t membind;
 #endif
@@ -1155,6 +1180,21 @@ extern void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t al);
 extern void __kmp_init_memkind();
 extern void __kmp_fini_memkind();
 extern void __kmp_init_target_mem();
+extern void __kmp_fini_target_mem();
+
+// OpenMP 6.0 (TR11) Memory Management support
+extern omp_memspace_handle_t __kmp_get_devices_memspace(int ndevs,
+                                                        const int *devs,
+                                                        omp_memspace_handle_t,
+                                                        int host);
+extern omp_allocator_handle_t __kmp_get_devices_allocator(int ndevs,
+                                                          const int *devs,
+                                                          omp_memspace_handle_t,
+                                                          int host);
+extern int __kmp_get_memspace_num_resources(omp_memspace_handle_t memspace);
+extern omp_memspace_handle_t
+__kmp_get_submemspace(omp_memspace_handle_t memspace, int num_resources,
+                      int *resources);
 
 /* ------------------------------------------------------------------------ */
diff --git a/openmp/runtime/src/kmp_alloc.cpp b/openmp/runtime/src/kmp_alloc.cpp
index 783d9ffe88aa3..801cd06c95502 100644
--- a/openmp/runtime/src/kmp_alloc.cpp
+++ b/openmp/runtime/src/kmp_alloc.cpp
@@ -1265,15 +1265,190 @@ static void *(*kmp_target_free_host)(void *ptr, int device);
 static void *(*kmp_target_free_shared)(void *ptr, int device);
 static void *(*kmp_target_free_device)(void *ptr, int device);
 static bool __kmp_target_mem_available;
+
 #define KMP_IS_TARGET_MEM_SPACE(MS) \
   (MS == llvm_omp_target_host_mem_space || \
    MS == llvm_omp_target_shared_mem_space || \
    MS == llvm_omp_target_device_mem_space)
+
 #define KMP_IS_TARGET_MEM_ALLOC(MA) \
   (MA == llvm_omp_target_host_mem_alloc || \
    MA == llvm_omp_target_shared_mem_alloc || \
    MA == llvm_omp_target_device_mem_alloc)
+#define KMP_IS_PREDEF_MEM_SPACE(MS) \
+  (MS == omp_null_mem_space || MS == omp_default_mem_space || \
+   MS == omp_large_cap_mem_space || MS == omp_const_mem_space || \
+   MS == omp_high_bw_mem_space || MS == omp_low_lat_mem_space || \
+   KMP_IS_TARGET_MEM_SPACE(MS))
+
+/// Support OMP 6.0 target memory management
+/// Expected offload runtime entries.
+///
+/// Returns number of resources and list of unique resource IDs in "resources".
+/// Runtime needs to invoke this twice to get the number of resources, allocate
+/// space for the resource IDs, and finally let offload runtime write resource
+/// IDs in "resources".
+/// int __tgt_get_mem_resources(int num_devices, const int *devices,
+///                             int host_access, omp_memspace_handle_t memspace,
+///                             int *resources);
+///
+/// Redirects omp_alloc call to offload runtime.
+/// void *__tgt_omp_alloc(size_t size, omp_allocator_handle_t allocator);
+///
+/// Redirects omp_free call to offload runtime.
+/// void __tgt_omp_free(void *ptr, omp_allocator_handle_t);
+class kmp_tgt_allocator_t {
+  bool supported = false;
+  using get_mem_resources_t = int (*)(int, const int *, int,
+                                      omp_memspace_handle_t, int *);
+  using omp_alloc_t = void *(*)(size_t, omp_allocator_handle_t);
+  using omp_free_t = void (*)(void *, omp_allocator_handle_t);
+  get_mem_resources_t tgt_get_mem_resources = nullptr;
+  omp_alloc_t tgt_omp_alloc = nullptr;
+  omp_free_t tgt_omp_free = nullptr;
+
+public:
+  /// Initialize interface with offload runtime
+  void init() {
+    tgt_get_mem_resources =
+        (get_mem_resources_t)KMP_DLSYM("__tgt_get_mem_resources");
+    tgt_omp_alloc = (omp_alloc_t)KMP_DLSYM("__tgt_omp_alloc");
+    tgt_omp_free = (omp_free_t)KMP_DLSYM("__tgt_omp_free");
+    supported = tgt_get_mem_resources && tgt_omp_alloc && tgt_omp_free;
+  }
+  /// Obtain resource information from offload runtime. We assume offload
+  /// runtime backends maintain a list of unique resource IDs.
+  int get_mem_resources(int ndevs, const int *devs, int host,
+                        omp_memspace_handle_t memspace, int *resources) {
+    if (supported)
+      return tgt_get_mem_resources(ndevs, devs, host, memspace, resources);
+    return 0;
+  }
+  /// Invoke offload runtime's memory allocation routine
+  void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
+    if (supported)
+      return tgt_omp_alloc(size, allocator);
+    return nullptr;
+  }
+  /// Invoke offload runtime's memory deallocation routine
+  void omp_free(void *ptr, omp_allocator_handle_t allocator) {
+    if (supported)
+      tgt_omp_free(ptr, allocator);
+  }
+} __kmp_tgt_allocator;
+
+extern "C" int omp_get_num_devices(void);
+
+/// Maintain a list of target memory spaces that are identified with the
+/// requested information. There will be only one unique memory space object
+/// that matches the input.
+class kmp_tgt_memspace_list_t {
+  kmp_memspace_t *memspace_list = nullptr;
+  KMP_LOCK_INIT(mtx);
+  /// Find memory space that matches the provided input
+  kmp_memspace_t *find(int num_resources, const int *resources,
+                       omp_memspace_handle_t memspace) {
+    kmp_memspace_t *ms = memspace_list;
+    while (ms) {
+      if (ms->num_resources == num_resources && ms->memspace == memspace &&
+          !memcmp(ms->resources, resources, sizeof(int) * num_resources))
+        break;
+      ms = ms->next;
+    }
+    return ms;
+  }
+  /// Return memory space for the provided input. It tries to find an existing
+  /// memory space that exactly matches the provided input, or creates one if
+  /// not found.
+  omp_memspace_handle_t get(int num_resources, const int *resources,
+                            omp_memspace_handle_t memspace) {
+    int gtid = __kmp_entry_gtid();
+    __kmp_acquire_lock(&mtx, gtid);
+    // Sort absolute IDs in the resource list
+    int *sorted_resources = (int *)__kmp_allocate(sizeof(int) * num_resources);
+    KMP_MEMCPY(sorted_resources, resources, num_resources * sizeof(int));
+    qsort(sorted_resources, (size_t)num_resources, sizeof(int),
+          [](const void *a, const void *b) {
+            const int val_a = *(const int *)a;
+            const int val_b = *(const int *)b;
+            return (val_a > val_b) ? 1 : ((val_a < val_b) ? -1 : 0);
+          });
+    kmp_memspace_t *ms = find(num_resources, sorted_resources, memspace);
+    if (ms) {
+      __kmp_free(sorted_resources);
+      __kmp_release_lock(&mtx, gtid);
+      return ms;
+    }
+    ms = (kmp_memspace_t *)__kmp_allocate(sizeof(kmp_memspace_t));
+    ms->memspace = memspace;
+    ms->num_resources = num_resources;
+    ms->resources = sorted_resources;
+    ms->next = memspace_list;
+    memspace_list = ms;
+    __kmp_release_lock(&mtx, gtid);
+    return ms;
+  }
+
+public:
+  /// Initialize memory space list
+  void init() { __kmp_init_lock(&mtx); }
+  /// Release resources for the memory space list
+  void fini() {
+    kmp_memspace_t *ms = memspace_list;
+    while (ms) {
+      if (ms->resources)
+        __kmp_free(ms->resources);
+      kmp_memspace_t *tmp = ms;
+      ms = ms->next;
+      __kmp_free(tmp);
+    }
+    __kmp_destroy_lock(&mtx);
+  }
+  /// Return memory space for the provided input
+  omp_memspace_handle_t get_memspace(int num_devices, const int *devices,
+                                     int host_access,
+                                     omp_memspace_handle_t memspace) {
+    int actual_num_devices = num_devices;
+    int *actual_devices = const_cast<int *>(devices);
+    if (actual_num_devices == 0) {
+      actual_num_devices = omp_get_num_devices();
+      if (actual_num_devices <= 0)
+        return omp_null_mem_space;
+    }
+    if (actual_devices == NULL) {
+      // Prepare list of all devices in this case.
+      actual_devices = (int *)__kmp_allocate(sizeof(int) * actual_num_devices);
+      for (int i = 0; i < actual_num_devices; i++)
+        actual_devices[i] = i;
+    }
+    // Get the number of available resources first
+    int num_resources = __kmp_tgt_allocator.get_mem_resources(
        actual_num_devices, actual_devices, host_access, memspace, NULL);
+    if (num_resources <= 0)
+      return omp_null_mem_space; // No available resources
+
+    omp_memspace_handle_t ms = omp_null_mem_space;
+    if (num_resources > 0) {
+      int *resources = (int *)__kmp_allocate(sizeof(int) * num_resources);
+      // Let offload runtime write the resource IDs
+      num_resources = __kmp_tgt_allocator.get_mem_resources(
+          actual_num_devices, actual_devices, host_access, memspace, resources);
+      ms = get(num_resources, resources, memspace);
+      __kmp_free(resources);
+    }
+    if (!devices && actual_devices)
+      __kmp_free(actual_devices);
+    return ms;
+  }
+  /// Return sub memory space from the parent memory space
+  omp_memspace_handle_t get_memspace(int num_resources, const int *resources,
+                                     omp_memspace_handle_t parent) {
+    kmp_memspace_t *ms = (kmp_memspace_t *)parent;
+    return get(num_resources, resources, ms->memspace);
+  }
+} __kmp_tgt_memspace_list;
+
 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB && !KMP_OS_DARWIN
 static inline void chk_kind(void ***pkind) {
   KMP_DEBUG_ASSERT(pkind);
@@ -1456,19 +1631,30 @@ void __kmp_init_target_mem() {
   // lock/pin and unlock/unpin target calls
   *(void **)(&kmp_target_lock_mem) = KMP_DLSYM("llvm_omp_target_lock_mem");
   *(void **)(&kmp_target_unlock_mem) = KMP_DLSYM("llvm_omp_target_unlock_mem");
+  __kmp_tgt_allocator.init();
+  __kmp_tgt_memspace_list.init();
 }
 
+/// Finalize target memory support
+void __kmp_fini_target_mem() { __kmp_tgt_memspace_list.fini(); }
+
 omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms,
                                              int ntraits,
                                              omp_alloctrait_t traits[]) {
-  // OpenMP 5.0 only allows predefined memspaces
-  KMP_DEBUG_ASSERT(ms == omp_default_mem_space || ms == omp_low_lat_mem_space ||
-                   ms == omp_large_cap_mem_space || ms == omp_const_mem_space ||
-                   ms == omp_high_bw_mem_space || KMP_IS_TARGET_MEM_SPACE(ms));
   kmp_allocator_t *al;
   int i;
   al = (kmp_allocator_t *)__kmp_allocate(sizeof(kmp_allocator_t)); // zeroed
   al->memspace = ms; // not used
currently + + // Assign default values if applicable + al->alignment = 1; + al->pinned = false; + al->partition = omp_atv_environment; + al->pin_device = -1; + al->preferred_device = -1; + al->target_access = omp_atv_single; + al->atomic_scope = omp_atv_device; + for (i = 0; i < ntraits; ++i) { switch (traits[i].key) { case omp_atk_sync_hint: @@ -1503,10 +1689,33 @@ omp_allocator_handle_t __kmpc_init_allocator(int gtid, omp_memspace_handle_t ms, #endif al->memkind = RCAST(void **, traits[i].value); break; + case omp_atk_pin_device: + __kmp_type_convert(traits[i].value, &(al->pin_device)); + break; + case omp_atk_preferred_device: + __kmp_type_convert(traits[i].value, &(al->preferred_device)); + break; + case omp_atk_target_access: + al->target_access = (omp_alloctrait_value_t)traits[i].value; + break; + case omp_atk_atomic_scope: + al->atomic_scope = (omp_alloctrait_value_t)traits[i].value; + break; + case omp_atk_part_size: + __kmp_type_convert(traits[i].value, &(al->part_size)); + break; default: KMP_ASSERT2(0, "Unexpected allocator trait"); } } + + if (al->memspace > kmp_max_mem_space) { + // Memory space has been allocated for targets. + return (omp_allocator_handle_t)al; + } + + KMP_DEBUG_ASSERT(KMP_IS_PREDEF_MEM_SPACE(al->memspace)); + if (al->fb == 0) { // set default allocator al->fb = omp_atv_default_mem_fb; @@ -1580,6 +1789,71 @@ omp_allocator_handle_t __kmpc_get_default_allocator(int gtid) { return __kmp_threads[gtid]->th.th_def_allocator; } +omp_memspace_handle_t __kmp_get_devices_memspace(int ndevs, const int *devs, + omp_memspace_handle_t memspace, + int host) { + if (!__kmp_init_serial) + __kmp_serial_initialize(); + // Only accept valid device description and predefined memory space + if (ndevs < 0 || (ndevs > 0 && !devs) || memspace > kmp_max_mem_space) + return omp_null_mem_space; + + return __kmp_tgt_memspace_list.get_memspace(ndevs, devs, host, memspace); +} + +omp_allocator_handle_t +__kmp_get_devices_allocator(int ndevs, const int *devs, + omp_memspace_handle_t memspace, int host) { + if (!__kmp_init_serial) + __kmp_serial_initialize(); + // Only accept valid device description and predefined memory space + if (ndevs < 0 || (ndevs > 0 && !devs) || memspace > kmp_max_mem_space) + return omp_null_allocator; + + omp_memspace_handle_t mspace = + __kmp_get_devices_memspace(ndevs, devs, memspace, host); + if (mspace == omp_null_mem_space) + return omp_null_allocator; + + return __kmpc_init_allocator(__kmp_entry_gtid(), mspace, 0, NULL); +} + +int __kmp_get_memspace_num_resources(omp_memspace_handle_t memspace) { + if (!__kmp_init_serial) + __kmp_serial_initialize(); + if (memspace == omp_null_mem_space) + return 0; + if (memspace < kmp_max_mem_space) + return 1; // return 1 for predefined memory space + kmp_memspace_t *ms = (kmp_memspace_t *)memspace; + return ms->num_resources; +} + +omp_memspace_handle_t __kmp_get_submemspace(omp_memspace_handle_t memspace, + int num_resources, int *resources) { + if (!__kmp_init_serial) + __kmp_serial_initialize(); + if (memspace == omp_null_mem_space || memspace < kmp_max_mem_space) + return memspace; // return input memory space for predefined memory space + kmp_memspace_t *ms = (kmp_memspace_t *)memspace; + if (num_resources == 0 || ms->num_resources < num_resources || !resources) + return omp_null_mem_space; // input memory space cannot satisfy the request + + // The stored resource ID is an absolute ID only known to the offload backend, + // and the returned memory space will still keep the property. 
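+  // For example (hypothetical IDs): if this memspace stores sorted absolute
+  // resource IDs {17, 42, 64}, a request for relative IDs {0, 2} builds the
+  // submemspace over absolute IDs {17, 64}.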
+ int *resources_abs = (int *)__kmp_allocate(sizeof(int) * num_resources); + + // Collect absolute resource ID from the relative ID + for (int i = 0; i < num_resources; i++) + resources_abs[i] = ms->resources[resources[i]]; + + omp_memspace_handle_t submemspace = __kmp_tgt_memspace_list.get_memspace( + num_resources, resources_abs, memspace); + __kmp_free(resources_abs); + + return submemspace; +} + typedef struct kmp_mem_desc { // Memory block descriptor void *ptr_alloc; // Pointer returned by allocator size_t size_a; // Size of allocated memory block (initial+descriptor+align) @@ -1667,6 +1941,11 @@ void *__kmp_alloc(int gtid, size_t algn, size_t size, int use_default_allocator = (!__kmp_hwloc_available && !__kmp_memkind_available); + if (al > kmp_max_mem_alloc && al->memspace > kmp_max_mem_space) { + // Memspace has been allocated for targets. + return __kmp_tgt_allocator.omp_alloc(size, allocator); + } + if (KMP_IS_TARGET_MEM_ALLOC(allocator)) { // Use size input directly as the memory may not be accessible on host. // Use default device for now. @@ -2021,6 +2300,12 @@ void ___kmpc_free(int gtid, void *ptr, omp_allocator_handle_t allocator) { kmp_mem_desc_t desc; kmp_uintptr_t addr_align; // address to return to caller kmp_uintptr_t addr_descr; // address of memory block descriptor + + if (al > kmp_max_mem_alloc && al->memspace > kmp_max_mem_space) { + __kmp_tgt_allocator.omp_free(ptr, allocator); + return; + } + if (__kmp_target_mem_available && (KMP_IS_TARGET_MEM_ALLOC(allocator) || (allocator > kmp_max_mem_alloc && KMP_IS_TARGET_MEM_SPACE(al->memspace)))) { diff --git a/openmp/runtime/src/kmp_ftn_entry.h b/openmp/runtime/src/kmp_ftn_entry.h index 9c8be5f953d35..59a9571d59534 100644 --- a/openmp/runtime/src/kmp_ftn_entry.h +++ b/openmp/runtime/src/kmp_ftn_entry.h @@ -428,6 +428,129 @@ omp_allocator_handle_t FTN_STDCALL FTN_GET_DEFAULT_ALLOCATOR(void) { #endif } +/* OpenMP 6.0 (TR11) Memory Management support */ +omp_memspace_handle_t FTN_STDCALL +FTN_GET_DEVICES_MEMSPACE(int KMP_DEREF ndevs, const int *devs, + omp_memspace_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + return __kmp_get_devices_memspace(KMP_DEREF ndevs, devs, KMP_DEREF memspace, + 0 /* host */); +#endif +} + +omp_memspace_handle_t FTN_STDCALL FTN_GET_DEVICE_MEMSPACE( + int KMP_DEREF dev, omp_memspace_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + int dev_num = KMP_DEREF dev; + return __kmp_get_devices_memspace(1, &dev_num, KMP_DEREF memspace, 0); +#endif +} + +omp_memspace_handle_t FTN_STDCALL +FTN_GET_DEVICES_AND_HOST_MEMSPACE(int KMP_DEREF ndevs, const int *devs, + omp_memspace_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + return __kmp_get_devices_memspace(KMP_DEREF ndevs, devs, KMP_DEREF memspace, + 1); +#endif +} + +omp_memspace_handle_t FTN_STDCALL FTN_GET_DEVICE_AND_HOST_MEMSPACE( + int KMP_DEREF dev, omp_memspace_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + int dev_num = KMP_DEREF dev; + return __kmp_get_devices_memspace(1, &dev_num, KMP_DEREF memspace, 1); +#endif +} + +omp_memspace_handle_t FTN_STDCALL +FTN_GET_DEVICES_ALL_MEMSPACE(omp_memspace_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + return __kmp_get_devices_memspace(0, NULL, KMP_DEREF memspace, 1); +#endif +} + +omp_allocator_handle_t FTN_STDCALL +FTN_GET_DEVICES_ALLOCATOR(int KMP_DEREF ndevs, const int *devs, + omp_allocator_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + return 
__kmp_get_devices_allocator(KMP_DEREF ndevs, devs, KMP_DEREF memspace, + 0 /* host */); +#endif +} + +omp_allocator_handle_t FTN_STDCALL FTN_GET_DEVICE_ALLOCATOR( + int KMP_DEREF dev, omp_allocator_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + int dev_num = KMP_DEREF dev; + return __kmp_get_devices_allocator(1, &dev_num, KMP_DEREF memspace, 0); +#endif +} + +omp_allocator_handle_t FTN_STDCALL +FTN_GET_DEVICES_AND_HOST_ALLOCATOR(int KMP_DEREF ndevs, const int *devs, + omp_allocator_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + return __kmp_get_devices_allocator(KMP_DEREF ndevs, devs, KMP_DEREF memspace, + 1); +#endif +} + +omp_allocator_handle_t FTN_STDCALL FTN_GET_DEVICE_AND_HOST_ALLOCATOR( + int KMP_DEREF dev, omp_allocator_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + int dev_num = KMP_DEREF dev; + return __kmp_get_devices_allocator(1, &dev_num, KMP_DEREF memspace, 1); +#endif +} + +omp_allocator_handle_t FTN_STDCALL +FTN_GET_DEVICES_ALL_ALLOCATOR(omp_allocator_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return NULL; +#else + return __kmp_get_devices_allocator(0, NULL, KMP_DEREF memspace, 1); +#endif +} + +int FTN_STDCALL +FTN_GET_MEMSPACE_NUM_RESOURCES(omp_memspace_handle_t KMP_DEREF memspace) { +#ifdef KMP_STUB + return 0; +#else + return __kmp_get_memspace_num_resources(KMP_DEREF memspace); +#endif +} + +omp_memspace_handle_t FTN_STDCALL +FTN_GET_SUBMEMSPACE(omp_memspace_handle_t KMP_DEREF memspace, + int KMP_DEREF num_resources, int *resources) { +#ifdef KMP_STUB + return NULL; +#else + return __kmp_get_submemspace(KMP_DEREF memspace, KMP_DEREF num_resources, + resources); +#endif +} + /* OpenMP 5.0 affinity format support */ #ifndef KMP_STUB static void __kmp_fortran_strncpy_truncate(char *buffer, size_t buf_size, diff --git a/openmp/runtime/src/kmp_ftn_os.h b/openmp/runtime/src/kmp_ftn_os.h index 7d595b947f4a9..ae0ed067235e5 100644 --- a/openmp/runtime/src/kmp_ftn_os.h +++ b/openmp/runtime/src/kmp_ftn_os.h @@ -127,6 +127,18 @@ #define FTN_DESTROY_ALLOCATOR omp_destroy_allocator #define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator #define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator +#define FTN_GET_DEVICES_MEMSPACE omp_get_devices_memspace +#define FTN_GET_DEVICE_MEMSPACE omp_get_device_memspace +#define FTN_GET_DEVICES_AND_HOST_MEMSPACE omp_get_devices_and_host_memspace +#define FTN_GET_DEVICE_AND_HOST_MEMSPACE omp_get_device_and_host_memspace +#define FTN_GET_DEVICES_ALL_MEMSPACE omp_get_devices_all_memspace +#define FTN_GET_DEVICES_ALLOCATOR omp_get_devices_allocator +#define FTN_GET_DEVICE_ALLOCATOR omp_get_device_allocator +#define FTN_GET_DEVICES_AND_HOST_ALLOCATOR omp_get_devices_and_host_allocator +#define FTN_GET_DEVICE_AND_HOST_ALLOCATOR omp_get_device_and_host_allocator +#define FTN_GET_DEVICES_ALL_ALLOCATOR omp_get_devices_all_allocator +#define FTN_GET_MEMSPACE_NUM_RESOURCES omp_get_memspace_num_resources +#define FTN_GET_SUBMEMSPACE omp_get_submemspace #define FTN_GET_DEVICE_NUM omp_get_device_num #define FTN_SET_AFFINITY_FORMAT omp_set_affinity_format #define FTN_GET_AFFINITY_FORMAT omp_get_affinity_format @@ -262,6 +274,18 @@ #define FTN_DESTROY_ALLOCATOR omp_destroy_allocator_ #define FTN_SET_DEFAULT_ALLOCATOR omp_set_default_allocator_ #define FTN_GET_DEFAULT_ALLOCATOR omp_get_default_allocator_ +#define FTN_GET_DEVICES_MEMSPACE omp_get_devices_memspace_ +#define FTN_GET_DEVICE_MEMSPACE omp_get_device_memspace_ +#define FTN_GET_DEVICES_AND_HOST_MEMSPACE 
omp_get_devices_and_host_memspace_ +#define FTN_GET_DEVICE_AND_HOST_MEMSPACE omp_get_device_and_host_memspace_ +#define FTN_GET_DEVICES_ALL_MEMSPACE omp_get_devices_all_memspace_ +#define FTN_GET_DEVICES_ALLOCATOR omp_get_devices_allocator_ +#define FTN_GET_DEVICE_ALLOCATOR omp_get_device_allocator_ +#define FTN_GET_DEVICES_AND_HOST_ALLOCATOR omp_get_devices_and_host_allocator_ +#define FTN_GET_DEVICE_AND_HOST_ALLOCATOR omp_get_device_and_host_allocator_ +#define FTN_GET_DEVICES_ALL_ALLOCATOR omp_get_devices_all_allocator_ +#define FTN_GET_MEMSPACE_NUM_RESOURCES omp_get_memspace_num_resources_ +#define FTN_GET_SUBMEMSPACE omp_get_submemspace_ #define FTN_ALLOC omp_alloc_ #define FTN_FREE omp_free_ #define FTN_GET_DEVICE_NUM omp_get_device_num_ @@ -399,6 +423,18 @@ #define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR #define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR #define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR +#define FTN_GET_DEVICES_MEMSPACE OMP_GET_DEVICES_MEMSPACE +#define FTN_GET_DEVICE_MEMSPACE OMP_GET_DEVICE_MEMSPACE +#define FTN_GET_DEVICES_AND_HOST_MEMSPACE OMP_GET_DEVICES_AND_HOST_MEMSPACE +#define FTN_GET_DEVICE_AND_HOST_MEMSPACE OMP_GET_DEVICE_AND_HOST_MEMSPACE +#define FTN_GET_DEVICES_ALL_MEMSPACE OMP_GET_DEVICES_ALL_MEMSPACE +#define FTN_GET_DEVICES_ALLOCATOR OMP_GET_DEVICES_ALLOCATOR +#define FTN_GET_DEVICE_ALLOCATOR OMP_GET_DEVICE_ALLOCATOR +#define FTN_GET_DEVICES_AND_HOST_ALLOCATOR OMP_GET_DEVICES_AND_HOST_ALLOCATOR +#define FTN_GET_DEVICE_AND_HOST_ALLOCATOR OMP_GET_DEVICE_AND_HOST_ALLOCATOR +#define FTN_GET_DEVICES_ALL_ALLOCATOR OMP_GET_DEVICES_ALL_ALLOCATOR +#define FTN_GET_MEMSPACE_NUM_RESOURCES OMP_GET_MEMSPACE_NUM_RESOURCES +#define FTN_GET_SUBMEMSPACE OMP_GET_SUBMEMSPACE #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM #define FTN_SET_AFFINITY_FORMAT OMP_SET_AFFINITY_FORMAT #define FTN_GET_AFFINITY_FORMAT OMP_GET_AFFINITY_FORMAT @@ -534,6 +570,18 @@ #define FTN_DESTROY_ALLOCATOR OMP_DESTROY_ALLOCATOR_ #define FTN_SET_DEFAULT_ALLOCATOR OMP_SET_DEFAULT_ALLOCATOR_ #define FTN_GET_DEFAULT_ALLOCATOR OMP_GET_DEFAULT_ALLOCATOR_ +#define FTN_GET_DEVICES_MEMSPACE OMP_GET_DEVICES_MEMSPACE_ +#define FTN_GET_DEVICE_MEMSPACE OMP_GET_DEVICE_MEMSPACE_ +#define FTN_GET_DEVICES_AND_HOST_MEMSPACE OMP_GET_DEVICES_AND_HOST_MEMSPACE_ +#define FTN_GET_DEVICE_AND_HOST_MEMSPACE OMP_GET_DEVICE_AND_HOST_MEMSPACE_ +#define FTN_GET_DEVICES_ALL_MEMSPACE OMP_GET_DEVICES_ALL_MEMSPACE_ +#define FTN_GET_DEVICES_ALLOCATOR OMP_GET_DEVICES_ALLOCATOR_ +#define FTN_GET_DEVICE_ALLOCATOR OMP_GET_DEVICE_ALLOCATOR_ +#define FTN_GET_DEVICES_AND_HOST_ALLOCATOR OMP_GET_DEVICES_AND_HOST_ALLOCATOR_ +#define FTN_GET_DEVICE_AND_HOST_ALLOCATOR OMP_GET_DEVICE_AND_HOST_ALLOCATOR_ +#define FTN_GET_DEVICES_ALL_ALLOCATOR OMP_GET_DEVICES_ALL_ALLOCATOR_ +#define FTN_GET_MEMSPACE_NUM_RESOURCES OMP_GET_MEMSPACE_NUM_RESOURCES_ +#define FTN_GET_SUBMEMSPACE OMP_GET_SUBMEMSPACE_ #define FTN_ALLOC OMP_ALLOC_ #define FTN_FREE OMP_FREE_ #define FTN_GET_DEVICE_NUM OMP_GET_DEVICE_NUM_ diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp index eb077bca4ce21..c3bc235a44fa3 100644 --- a/openmp/runtime/src/kmp_global.cpp +++ b/openmp/runtime/src/kmp_global.cpp @@ -324,8 +324,9 @@ omp_allocator_handle_t const kmp_max_mem_alloc = (omp_allocator_handle_t const)1024; omp_allocator_handle_t __kmp_def_allocator = omp_default_mem_alloc; +omp_memspace_handle_t const omp_null_mem_space = (omp_memspace_handle_t const)0; omp_memspace_handle_t const omp_default_mem_space = - (omp_memspace_handle_t 
const)0;
+    (omp_memspace_handle_t const)99;
 omp_memspace_handle_t const omp_large_cap_mem_space =
     (omp_memspace_handle_t const)1;
 omp_memspace_handle_t const omp_const_mem_space =
@@ -340,6 +341,8 @@ omp_memspace_handle_t const llvm_omp_target_shared_mem_space =
     (omp_memspace_handle_t const)101;
 omp_memspace_handle_t const llvm_omp_target_device_mem_space =
     (omp_memspace_handle_t const)102;
+omp_memspace_handle_t const kmp_max_mem_space =
+    (omp_memspace_handle_t const)1024;
 
 /* This check ensures that the compiler is passing the correct data type for
    the flags formal parameter of the function kmpc_omp_task_alloc(). If the type is
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 9f679aa8d334f..417eceb8ebecc 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -579,7 +579,10 @@ static void __kmp_init_allocator() {
   __kmp_init_memkind();
   __kmp_init_target_mem();
 }
-static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
+static void __kmp_fini_allocator() {
+  __kmp_fini_target_mem();
+  __kmp_fini_memkind();
+}
 
 /* ------------------------------------------------------------------------ */
diff --git a/openmp/runtime/src/kmp_stub.cpp b/openmp/runtime/src/kmp_stub.cpp
index f25e24f09a03d..06276d1bed1c7 100644
--- a/openmp/runtime/src/kmp_stub.cpp
+++ b/openmp/runtime/src/kmp_stub.cpp
@@ -357,8 +357,9 @@ omp_allocator_handle_t const llvm_omp_target_shared_mem_alloc =
 omp_allocator_handle_t const llvm_omp_target_device_mem_alloc =
     (omp_allocator_handle_t const)102;
 
+omp_memspace_handle_t const omp_null_mem_space = (omp_memspace_handle_t const)0;
 omp_memspace_handle_t const omp_default_mem_space =
-    (omp_memspace_handle_t const)0;
+    (omp_memspace_handle_t const)99;
 omp_memspace_handle_t const omp_large_cap_mem_space =
     (omp_memspace_handle_t const)1;
 omp_memspace_handle_t const omp_const_mem_space =
diff --git a/openmp/runtime/test/api/omp60_memory_routines.c b/openmp/runtime/test/api/omp60_memory_routines.c
new file mode 100644
index 0000000000000..97b648a7a01bb
--- /dev/null
+++ b/openmp/runtime/test/api/omp60_memory_routines.c
@@ -0,0 +1,228 @@
+// RUN: %libomp-compile -Wl,--export-dynamic && %libomp-run
+
+// REQUIRES: linux
+
+// Test OpenMP 6.0 memory management routines.
+// Test host runtime's basic support with an emulated offload runtime.
+
+#include <omp.h>
+#include <stdlib.h>
+
+#define NUM_DEVICES 4
+
+//
+// Required offload runtime interfaces
+//
+extern int __tgt_get_num_devices(void) { return NUM_DEVICES; }
+
+extern int __tgt_get_mem_resources(int num_devices, const int *devices,
+                                   int host, omp_memspace_handle_t memspace,
+                                   int *resources) {
+  int i;
+  // We expect valid inputs within this test.
+  int num_resources = num_devices;
+  if (resources) {
+    // Simple resource ID mapping example in the backend (=device ID).
+    // This does not represent any real backend.
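+    // For example, a query over devices {1, 3} reports resource IDs {1, 3}
+    // here, so the resource count always matches the device count.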
+    for (i = 0; i < num_devices; i++)
+      resources[i] = devices[i];
+  }
+  return num_resources;
+}
+
+extern void *__tgt_omp_alloc(size_t size, omp_allocator_handle_t allocator) {
+  return malloc(size);
+}
+
+extern void __tgt_omp_free(void *ptr, omp_allocator_handle_t allocator) {
+  free(ptr);
+}
+
+// Code above is also used by the corresponding Fortran test
+
+#define CHECK_OR_RET_FAIL(Expr) \
+  do { \
+    if (!(Expr)) \
+      return EXIT_FAILURE; \
+  } while (0)
+
+// Test user-initialized allocator with the given memory space
+static int test_user_allocator(omp_memspace_handle_t ms) {
+  omp_allocator_handle_t al = omp_null_allocator;
+  al = omp_init_allocator(ms, 0, NULL);
+  CHECK_OR_RET_FAIL(al != omp_null_allocator);
+  void *m = omp_alloc(1024, al);
+  CHECK_OR_RET_FAIL(m != NULL);
+  omp_free(m, al);
+  omp_destroy_allocator(al);
+  return EXIT_SUCCESS;
+}
+
+static int test_allocator(omp_allocator_handle_t al) {
+  void *m = omp_alloc(1024, al);
+  CHECK_OR_RET_FAIL(m != NULL);
+  omp_free(m, al);
+  omp_destroy_allocator(al);
+  return EXIT_SUCCESS;
+}
+
+static int test_mem_space(void) {
+  int i, count;
+  int num_devices = omp_get_num_devices();
+  CHECK_OR_RET_FAIL(num_devices == NUM_DEVICES);
+
+  int *all_devices = (int *)malloc(sizeof(int) * num_devices);
+  for (i = 0; i < num_devices; i++)
+    all_devices[i] = i;
+
+  omp_memspace_handle_t predef = omp_default_mem_space;
+  omp_memspace_handle_t ms1 = omp_null_mem_space;
+  omp_memspace_handle_t ms2 = omp_null_mem_space;
+
+  // Test the following API routines.
+  // * omp_get_device_memspace
+  // * omp_get_device_and_host_memspace
+  // * omp_get_devices_memspace
+  // * omp_get_devices_and_host_memspace
+  // Test if runtime returns the same memory space handle for the same input.
+  // Test if we can use the memory space to initialize allocator.
+  for (i = 0; i < num_devices; i++) {
+    ms1 = omp_get_device_memspace(i, predef);
+    CHECK_OR_RET_FAIL(ms1 != omp_null_mem_space);
+    ms2 = omp_get_device_memspace(i, predef);
+    CHECK_OR_RET_FAIL(ms1 == ms2);
+    CHECK_OR_RET_FAIL(test_user_allocator(ms1) == EXIT_SUCCESS);
+    ms1 = ms2 = omp_null_mem_space;
+
+    ms1 = omp_get_device_and_host_memspace(i, predef);
+    CHECK_OR_RET_FAIL(ms1 != omp_null_mem_space);
+    ms2 = omp_get_device_and_host_memspace(i, predef);
+    CHECK_OR_RET_FAIL(ms1 == ms2);
+    CHECK_OR_RET_FAIL(test_user_allocator(ms1) == EXIT_SUCCESS);
+    ms1 = ms2 = omp_null_mem_space;
+
+    for (count = 1; i + count <= num_devices; count++) {
+      int *devices = &all_devices[i];
+      ms1 = omp_get_devices_memspace(count, devices, predef);
+      CHECK_OR_RET_FAIL(ms1 != omp_null_mem_space);
+      ms2 = omp_get_devices_memspace(count, devices, predef);
+      CHECK_OR_RET_FAIL(ms1 == ms2);
+      CHECK_OR_RET_FAIL(test_user_allocator(ms1) == EXIT_SUCCESS);
+      ms1 = ms2 = omp_null_mem_space;
+
+      ms1 = omp_get_devices_and_host_memspace(count, devices, predef);
+      CHECK_OR_RET_FAIL(ms1 != omp_null_mem_space);
+      ms2 = omp_get_devices_and_host_memspace(count, devices, predef);
+      CHECK_OR_RET_FAIL(ms1 == ms2);
+      CHECK_OR_RET_FAIL(test_user_allocator(ms1) == EXIT_SUCCESS);
+      ms1 = ms2 = omp_null_mem_space;
+    }
+  }
+
+  // Test the following API routines.
+  // * omp_get_devices_all_memspace
+  // Test if runtime returns the same memory space handle for the same input.
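+  // A single handle should cover every device plus the host here, since
+  // omp_get_devices_all_memspace implies host access across all devices.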
+  ms1 = omp_get_devices_all_memspace(predef);
+  CHECK_OR_RET_FAIL(ms1 != omp_null_mem_space);
+  ms2 = omp_get_devices_all_memspace(predef);
+  CHECK_OR_RET_FAIL(ms1 == ms2);
+
+  free(all_devices);
+
+  return EXIT_SUCCESS;
+}
+
+static int test_mem_allocator(void) {
+  int i, count;
+  int num_devices = omp_get_num_devices();
+  CHECK_OR_RET_FAIL(num_devices == NUM_DEVICES);
+
+  int *all_devices = (int *)malloc(sizeof(int) * num_devices);
+  for (i = 0; i < num_devices; i++)
+    all_devices[i] = i;
+
+  omp_memspace_handle_t predef = omp_default_mem_space;
+  omp_allocator_handle_t al = omp_null_allocator;
+
+  // Test the following API routines.
+  // * omp_get_device_allocator
+  // * omp_get_device_and_host_allocator
+  // * omp_get_devices_allocator
+  // * omp_get_devices_and_host_allocator
+  for (i = 0; i < num_devices; i++) {
+    al = omp_get_device_allocator(i, predef);
+    CHECK_OR_RET_FAIL(al != omp_null_allocator);
+    CHECK_OR_RET_FAIL(test_allocator(al) == EXIT_SUCCESS);
+    al = omp_null_allocator;
+
+    al = omp_get_device_and_host_allocator(i, predef);
+    CHECK_OR_RET_FAIL(al != omp_null_allocator);
+    CHECK_OR_RET_FAIL(test_allocator(al) == EXIT_SUCCESS);
+    al = omp_null_allocator;
+
+    for (count = 1; i + count <= num_devices; count++) {
+      int *devices = &all_devices[i];
+      al = omp_get_devices_allocator(count, devices, predef);
+      CHECK_OR_RET_FAIL(al != omp_null_allocator);
+      CHECK_OR_RET_FAIL(test_allocator(al) == EXIT_SUCCESS);
+      al = omp_null_allocator;
+
+      al = omp_get_devices_and_host_allocator(count, devices, predef);
+      CHECK_OR_RET_FAIL(al != omp_null_allocator);
+      CHECK_OR_RET_FAIL(test_allocator(al) == EXIT_SUCCESS);
+      al = omp_null_allocator;
+    }
+  }
+
+  // Test the following API routines.
+  // * omp_get_devices_all_allocator
+  al = omp_get_devices_all_allocator(predef);
+  CHECK_OR_RET_FAIL(al != omp_null_allocator);
+  CHECK_OR_RET_FAIL(test_allocator(al) == EXIT_SUCCESS);
+
+  free(all_devices);
+
+  return EXIT_SUCCESS;
+}
+
+// Just test what we can expect from the emulated backend.
+static int test_sub_mem_space(void) {
+  int i;
+  omp_memspace_handle_t ms = omp_null_mem_space;
+  ms = omp_get_devices_all_memspace(omp_default_mem_space);
+  CHECK_OR_RET_FAIL(ms != omp_null_mem_space);
+  int num_resources = omp_get_memspace_num_resources(ms);
+  CHECK_OR_RET_FAIL(num_resources == NUM_DEVICES);
+
+  // Check if single-resource sub memspace is correctly returned.
+  for (i = 0; i < num_resources; i++) {
+    omp_memspace_handle_t sub = omp_get_submemspace(ms, 1, &i);
+    CHECK_OR_RET_FAIL(sub != omp_null_mem_space);
+    CHECK_OR_RET_FAIL(sub != ms);
+    int num_sub_resources = omp_get_memspace_num_resources(sub);
+    CHECK_OR_RET_FAIL(num_sub_resources == 1);
+  }
+
+  // Check if all-resource sub memspace is correctly returned.
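+  // (Selecting every resource of a memspace is expected to return the
+  // original memspace handle itself; the sub == ms check below verifies
+  // that.)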
+ int *resources = (int *)malloc(sizeof(int) * num_resources); + for (i = 0; i < num_resources; i++) + resources[i] = i; + omp_memspace_handle_t sub = omp_get_submemspace(ms, num_resources, resources); + CHECK_OR_RET_FAIL(sub != omp_null_mem_space); + CHECK_OR_RET_FAIL(sub == ms); + + return EXIT_SUCCESS; +} + +int main() { + int rc = test_mem_space(); + CHECK_OR_RET_FAIL(rc == EXIT_SUCCESS); + + rc = test_mem_allocator(); + CHECK_OR_RET_FAIL(rc == EXIT_SUCCESS); + + rc = test_sub_mem_space(); + CHECK_OR_RET_FAIL(rc == EXIT_SUCCESS); + + return rc; +} From 93d3775da8810e1542873b1cdcec2ea142704561 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Wed, 2 Apr 2025 15:16:58 -0700 Subject: [PATCH 0454/1029] [lldb] Fix tagged-pointer info address parsing (#134123) Change `objc tagged-pointer info` to call `OptionArgParser::ToRawAddress`. Previously `ToAddress` was used, but it calls `FixCodeAddress`, which can erroneously mutate the bits of a tagged pointer. --- .../ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp | 2 +- .../objc/tagged-pointer/TestTaggedPointerCmd.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index c91a29ace1f68..2338367302387 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -1047,7 +1047,7 @@ class CommandObjectMultiwordObjC_TaggedPointer_Info continue; Status error; - lldb::addr_t arg_addr = OptionArgParser::ToAddress( + lldb::addr_t arg_addr = OptionArgParser::ToRawAddress( &exe_ctx, arg_str, LLDB_INVALID_ADDRESS, &error); if (arg_addr == 0 || arg_addr == LLDB_INVALID_ADDRESS || error.Fail()) { result.AppendErrorWithFormatv( diff --git a/lldb/test/API/lang/objc/tagged-pointer/TestTaggedPointerCmd.py b/lldb/test/API/lang/objc/tagged-pointer/TestTaggedPointerCmd.py index d2519359c783b..f2f6026642d20 100644 --- a/lldb/test/API/lang/objc/tagged-pointer/TestTaggedPointerCmd.py +++ b/lldb/test/API/lang/objc/tagged-pointer/TestTaggedPointerCmd.py @@ -8,10 +8,22 @@ class TestTaggedPointerCommand(TestBase): @no_debug_info_test def test(self): self.build() - lldbutil.run_to_source_breakpoint( + _, _, thread, _ = lldbutil.run_to_source_breakpoint( self, "// break here", lldb.SBFileSpec("main.m") ) + n1 = thread.GetSelectedFrame().FindVariable("n1") + self.expect( + f"lang objc tagged-pointer info {n1.addr}", + substrs=[ + f"{n1.addr} is tagged", + "payload = 0x0000000000000012", + "value = 0x0000000000000001", + "info bits = 0x0000000000000002", + "class = __NSCFNumber", + ], + ) + self.expect( "lang objc tagged-pointer info bogus", error=True, From 990a086d9da0bc2fd53a6a4c95ecbbe23a297a83 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 2 Apr 2025 15:16:35 -0700 Subject: [PATCH 0455/1029] [bazel] Add missing dep after 51d1c7288662ea801b07133fd2d22aff6bac50e2 --- .../libc/test/src/__support/CPP/BUILD.bazel | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/CPP/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/CPP/BUILD.bazel index 96dafbc6da485..723ba2eb00935 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/CPP/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/CPP/BUILD.bazel @@ -93,5 +93,8 @@ libc_test( 
libc_test( name = "type_traits_test", srcs = ["type_traits_test.cpp"], - deps = ["//libc:__support_cpp_type_traits"], + deps = [ + "//libc:__support_cpp_type_traits", + "//libc:llvm_libc_macros_stdfix_macros", + ], ) From f302f35526553abcb46dab278c4494c3d01deb45 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Wed, 2 Apr 2025 19:27:29 -0300 Subject: [PATCH 0456/1029] [clang] Track final substitution for Subst* AST nodes (#132748) --- clang/include/clang/AST/ASTContext.h | 17 +++---- clang/include/clang/AST/ExprCXX.h | 25 ++++++++-- clang/include/clang/AST/PropertiesBase.td | 3 +- clang/include/clang/AST/TemplateName.h | 17 +++++-- clang/include/clang/AST/Type.h | 24 ++++++---- clang/include/clang/AST/TypeProperties.td | 4 +- clang/lib/AST/ASTContext.cpp | 15 +++--- clang/lib/AST/ASTImporter.cpp | 9 ++-- clang/lib/AST/ExprCXX.cpp | 6 ++- clang/lib/AST/TemplateName.cpp | 6 ++- clang/lib/AST/TextNodeDumper.cpp | 4 ++ clang/lib/AST/Type.cpp | 22 +++++++-- clang/lib/Sema/SemaTemplate.cpp | 14 ++---- clang/lib/Sema/SemaTemplateInstantiate.cpp | 48 +++++++------------ clang/lib/Sema/TreeTransform.h | 3 +- ...ubst-template-type-parm-type-ast-nodes.cpp | 18 ------- clang/test/AST/ast-dump-template-decls.cpp | 18 ++++--- .../test/Misc/diag-template-diffing-cxx11.cpp | 4 +- clang/test/SemaTemplate/make_integer_seq.cpp | 8 +++- 19 files changed, 146 insertions(+), 119 deletions(-) delete mode 100644 clang/test/AST/ast-dump-retain-subst-template-type-parm-type-ast-nodes.cpp diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index a24f30815e6b9..b3010fa888fa4 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1795,10 +1795,10 @@ class ASTContext : public RefCountedBase { QualType Wrapped, QualType Contained, const HLSLAttributedResourceType::Attributes &Attrs); - QualType - getSubstTemplateTypeParmType(QualType Replacement, Decl *AssociatedDecl, - unsigned Index, - std::optional PackIndex) const; + QualType getSubstTemplateTypeParmType(QualType Replacement, + Decl *AssociatedDecl, unsigned Index, + std::optional PackIndex, + bool Final) const; QualType getSubstTemplateTypeParmPackType(Decl *AssociatedDecl, unsigned Index, bool Final, const TemplateArgument &ArgPack); @@ -2393,10 +2393,11 @@ class ASTContext : public RefCountedBase { TemplateName getDependentTemplateName(const DependentTemplateStorage &Name) const; - TemplateName - getSubstTemplateTemplateParm(TemplateName replacement, Decl *AssociatedDecl, - unsigned Index, - std::optional PackIndex) const; + TemplateName getSubstTemplateTemplateParm(TemplateName replacement, + Decl *AssociatedDecl, + unsigned Index, + std::optional PackIndex, + bool Final) const; TemplateName getSubstTemplateTemplateParmPack(const TemplateArgument &ArgPack, Decl *AssociatedDecl, unsigned Index, diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 223d74993e9e6..028ee82718d50 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4514,7 +4514,9 @@ class SubstNonTypeTemplateParmExpr : public Expr { llvm::PointerIntPair AssociatedDeclAndRef; unsigned Index : 15; - unsigned PackIndex : 16; + unsigned PackIndex : 15; + LLVM_PREFERRED_TYPE(bool) + unsigned Final : 1; explicit SubstNonTypeTemplateParmExpr(EmptyShell Empty) : Expr(SubstNonTypeTemplateParmExprClass, Empty) {} @@ -4523,11 +4525,12 @@ class SubstNonTypeTemplateParmExpr : public Expr { SubstNonTypeTemplateParmExpr(QualType Ty, ExprValueKind ValueKind, 
SourceLocation Loc, Expr *Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, bool RefParam) + std::optional PackIndex, bool RefParam, + bool Final) : Expr(SubstNonTypeTemplateParmExprClass, Ty, ValueKind, OK_Ordinary), Replacement(Replacement), AssociatedDeclAndRef(AssociatedDecl, RefParam), Index(Index), - PackIndex(PackIndex ? *PackIndex + 1 : 0) { + PackIndex(PackIndex ? *PackIndex + 1 : 0), Final(Final) { assert(AssociatedDecl != nullptr); SubstNonTypeTemplateParmExprBits.NameLoc = Loc; setDependence(computeDependence(this)); @@ -4555,6 +4558,10 @@ class SubstNonTypeTemplateParmExpr : public Expr { return PackIndex - 1; } + // This substitution is Final, which means the substitution is fully + // sugared: it doesn't need to be resugared later. + bool getFinal() const { return Final; } + NonTypeTemplateParmDecl *getParameter() const; bool isReferenceParameter() const { return AssociatedDeclAndRef.getInt(); } @@ -4598,7 +4605,10 @@ class SubstNonTypeTemplateParmPackExpr : public Expr { const TemplateArgument *Arguments; /// The number of template arguments in \c Arguments. - unsigned NumArguments : 16; + unsigned NumArguments : 15; + + LLVM_PREFERRED_TYPE(bool) + unsigned Final : 1; unsigned Index : 16; @@ -4612,7 +4622,8 @@ class SubstNonTypeTemplateParmPackExpr : public Expr { SubstNonTypeTemplateParmPackExpr(QualType T, ExprValueKind ValueKind, SourceLocation NameLoc, const TemplateArgument &ArgPack, - Decl *AssociatedDecl, unsigned Index); + Decl *AssociatedDecl, unsigned Index, + bool Final); /// A template-like entity which owns the whole pattern being substituted. /// This will own a set of template parameters. @@ -4622,6 +4633,10 @@ class SubstNonTypeTemplateParmPackExpr : public Expr { /// This should match the result of `getParameterPack()->getIndex()`. unsigned getIndex() const { return Index; } + // This substitution will be Final, which means the substitution will be fully + // sugared: it doesn't need to be resugared later. + bool getFinal() const { return Final; } + /// Retrieve the non-type template parameter pack being substituted. NonTypeTemplateParmDecl *getParameterPack() const; diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 5171555008ac9..0e360de16fdd6 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -730,8 +730,9 @@ let Class = PropertyTypeCase in { def : Property<"packIndex", Optional> { let Read = [{ parm->getPackIndex() }]; } + def : Property<"final", Bool> { let Read = [{ parm->getFinal() }]; } def : Creator<[{ - return ctx.getSubstTemplateTemplateParm(replacement, associatedDecl, index, packIndex); + return ctx.getSubstTemplateTemplateParm(replacement, associatedDecl, index, packIndex, final); }]>; } let Class = PropertyTypeCase in { diff --git a/clang/include/clang/AST/TemplateName.h b/clang/include/clang/AST/TemplateName.h index 1a56133b72d6e..ece2afcfa72ab 100644 --- a/clang/include/clang/AST/TemplateName.h +++ b/clang/include/clang/AST/TemplateName.h @@ -414,9 +414,11 @@ class SubstTemplateTemplateParmStorage SubstTemplateTemplateParmStorage(TemplateName Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex) + std::optional PackIndex, + bool Final) : UncommonTemplateNameStorage(SubstTemplateTemplateParm, Index, - PackIndex ? *PackIndex + 1 : 0), + ((PackIndex ? 
*PackIndex + 1 : 0) << 1) | + Final), Replacement(Replacement), AssociatedDecl(AssociatedDecl) { assert(AssociatedDecl != nullptr); } @@ -430,10 +432,15 @@ class SubstTemplateTemplateParmStorage /// This should match the result of `getParameter()->getIndex()`. unsigned getIndex() const { return Bits.Index; } + // This substitution is Final, which means the substitution is fully + // sugared: it doesn't need to be resugared later. + bool getFinal() const { return Bits.Data & 1; } + std::optional getPackIndex() const { - if (Bits.Data == 0) + auto Data = Bits.Data >> 1; + if (Data == 0) return std::nullopt; - return Bits.Data - 1; + return Data - 1; } TemplateTemplateParmDecl *getParameter() const; @@ -443,7 +450,7 @@ class SubstTemplateTemplateParmStorage static void Profile(llvm::FoldingSetNodeID &ID, TemplateName Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex); + std::optional PackIndex, bool Final); }; class DeducedTemplateStorage : public UncommonTemplateNameStorage, diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index cfd417068abb7..86ae335452980 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2158,12 +2158,15 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { // The index of the template parameter this substitution represents. unsigned Index : 15; + LLVM_PREFERRED_TYPE(bool) + unsigned Final : 1; + /// Represents the index within a pack if this represents a substitution /// from a pack expansion. This index starts at the end of the pack and /// increments towards the beginning. /// Positive non-zero number represents the index + 1. /// Zero means this is not substituted from an expansion. - unsigned PackIndex : 16; + unsigned PackIndex : 15; }; class SubstTemplateTypeParmPackTypeBitfields { @@ -6397,7 +6400,8 @@ class SubstTemplateTypeParmType final Decl *AssociatedDecl; SubstTemplateTypeParmType(QualType Replacement, Decl *AssociatedDecl, - unsigned Index, std::optional PackIndex); + unsigned Index, std::optional PackIndex, + bool Final); public: /// Gets the type that was substituted for the template @@ -6420,6 +6424,10 @@ class SubstTemplateTypeParmType final /// This should match the result of `getReplacedParameter()->getIndex()`. unsigned getIndex() const { return SubstTemplateTypeParmTypeBits.Index; } + // This substitution is Final, which means the substitution is fully + // sugared: it doesn't need to be resugared later. + unsigned getFinal() const { return SubstTemplateTypeParmTypeBits.Final; } + std::optional getPackIndex() const { if (SubstTemplateTypeParmTypeBits.PackIndex == 0) return std::nullopt; @@ -6431,17 +6439,12 @@ class SubstTemplateTypeParmType final void Profile(llvm::FoldingSetNodeID &ID) { Profile(ID, getReplacementType(), getAssociatedDecl(), getIndex(), - getPackIndex()); + getPackIndex(), getFinal()); } static void Profile(llvm::FoldingSetNodeID &ID, QualType Replacement, const Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex) { - Replacement.Profile(ID); - ID.AddPointer(AssociatedDecl); - ID.AddInteger(Index); - ID.AddInteger(PackIndex ? *PackIndex - 1 : 0); - } + std::optional PackIndex, bool Final); static bool classof(const Type *T) { return T->getTypeClass() == SubstTemplateTypeParm; @@ -6488,7 +6491,8 @@ class SubstTemplateTypeParmPackType : public Type, public llvm::FoldingSetNode { /// This should match the result of `getReplacedParameter()->getIndex()`. 
unsigned getIndex() const { return SubstTemplateTypeParmPackTypeBits.Index; } - // When true the substitution will be 'Final' (subst node won't be placed). + // This substitution will be Final, which means the substitution will be fully + // sugared: it doesn't need to be resugared later. bool getFinal() const; unsigned getNumArgs() const { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 391fd26a086f7..477106a152188 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -842,11 +842,11 @@ let Class = SubstTemplateTypeParmType in { def : Property<"PackIndex", Optional> { let Read = [{ node->getPackIndex() }]; } + def : Property<"Final", Bool> { let Read = [{ node->getFinal() }]; } - // The call to getCanonicalType here existed in ASTReader.cpp, too. def : Creator<[{ return ctx.getSubstTemplateTypeParmType( - replacementType, associatedDecl, Index, PackIndex); + replacementType, associatedDecl, Index, PackIndex, Final); }]>; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 552b5823add36..0270a8e012849 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5447,10 +5447,10 @@ QualType ASTContext::getHLSLAttributedResourceType( /// Retrieve a substitution-result type. QualType ASTContext::getSubstTemplateTypeParmType( QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex) const { + std::optional PackIndex, bool Final) const { llvm::FoldingSetNodeID ID; SubstTemplateTypeParmType::Profile(ID, Replacement, AssociatedDecl, Index, - PackIndex); + PackIndex, Final); void *InsertPos = nullptr; SubstTemplateTypeParmType *SubstParm = SubstTemplateTypeParmTypes.FindNodeOrInsertPos(ID, InsertPos); @@ -5460,7 +5460,7 @@ QualType ASTContext::getSubstTemplateTypeParmType( !Replacement.isCanonical()), alignof(SubstTemplateTypeParmType)); SubstParm = new (Mem) SubstTemplateTypeParmType(Replacement, AssociatedDecl, - Index, PackIndex); + Index, PackIndex, Final); Types.push_back(SubstParm); SubstTemplateTypeParmTypes.InsertNode(SubstParm, InsertPos); } @@ -10090,10 +10090,10 @@ ASTContext::getDependentTemplateName(const DependentTemplateStorage &S) const { TemplateName ASTContext::getSubstTemplateTemplateParm( TemplateName Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex) const { + std::optional PackIndex, bool Final) const { llvm::FoldingSetNodeID ID; SubstTemplateTemplateParmStorage::Profile(ID, Replacement, AssociatedDecl, - Index, PackIndex); + Index, PackIndex, Final); void *insertPos = nullptr; SubstTemplateTemplateParmStorage *subst @@ -10101,7 +10101,7 @@ TemplateName ASTContext::getSubstTemplateTemplateParm( if (!subst) { subst = new (*this) SubstTemplateTemplateParmStorage( - Replacement, AssociatedDecl, Index, PackIndex); + Replacement, AssociatedDecl, Index, PackIndex, Final); SubstTemplateTemplateParms.InsertNode(subst, insertPos); } @@ -14202,7 +14202,8 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, if (PackIndex != SY->getPackIndex()) return QualType(); return Ctx.getSubstTemplateTypeParmType(Ctx.getQualifiedType(Underlying), - CD, Index, PackIndex); + CD, Index, PackIndex, + SX->getFinal() && SY->getFinal()); } case Type::ObjCTypeParam: // FIXME: Try to merge these. 
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 81acb013b0f7d..893160e8f5ba9 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -1631,8 +1631,8 @@ ExpectedType ASTNodeImporter::VisitSubstTemplateTypeParmType( return ToReplacementTypeOrErr.takeError(); return Importer.getToContext().getSubstTemplateTypeParmType( - *ToReplacementTypeOrErr, *ReplacedOrErr, T->getIndex(), - T->getPackIndex()); + *ToReplacementTypeOrErr, *ReplacedOrErr, T->getIndex(), T->getPackIndex(), + T->getFinal()); } ExpectedType ASTNodeImporter::VisitSubstTemplateTypeParmPackType( @@ -8937,7 +8937,8 @@ ExpectedStmt ASTNodeImporter::VisitSubstNonTypeTemplateParmExpr( return new (Importer.getToContext()) SubstNonTypeTemplateParmExpr( ToType, E->getValueKind(), ToExprLoc, ToReplacement, ToAssociatedDecl, - E->getIndex(), E->getPackIndex(), E->isReferenceParameter()); + E->getIndex(), E->getPackIndex(), E->isReferenceParameter(), + E->getFinal()); } ExpectedStmt ASTNodeImporter::VisitTypeTraitExpr(TypeTraitExpr *E) { @@ -9932,7 +9933,7 @@ Expected ASTImporter::Import(TemplateName From) { return ToContext.getSubstTemplateTemplateParm( *ReplacementOrErr, *AssociatedDeclOrErr, Subst->getIndex(), - Subst->getPackIndex()); + Subst->getPackIndex(), Subst->getFinal()); } case TemplateName::SubstTemplateTemplateParmPack: { diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index a000e988e6834..a0bc50c449d82 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1760,10 +1760,12 @@ QualType SubstNonTypeTemplateParmExpr::getParameterType( SubstNonTypeTemplateParmPackExpr::SubstNonTypeTemplateParmPackExpr( QualType T, ExprValueKind ValueKind, SourceLocation NameLoc, - const TemplateArgument &ArgPack, Decl *AssociatedDecl, unsigned Index) + const TemplateArgument &ArgPack, Decl *AssociatedDecl, unsigned Index, + bool Final) : Expr(SubstNonTypeTemplateParmPackExprClass, T, ValueKind, OK_Ordinary), AssociatedDecl(AssociatedDecl), Arguments(ArgPack.pack_begin()), - NumArguments(ArgPack.pack_size()), Index(Index), NameLoc(NameLoc) { + NumArguments(ArgPack.pack_size()), Final(Final), Index(Index), + NameLoc(NameLoc) { assert(AssociatedDecl != nullptr); setDependence(ExprDependence::TypeValueInstantiation | ExprDependence::UnexpandedPack); diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp index 031b58123fc99..f8ba5a24c7519 100644 --- a/clang/lib/AST/TemplateName.cpp +++ b/clang/lib/AST/TemplateName.cpp @@ -77,16 +77,18 @@ SubstTemplateTemplateParmStorage::getParameter() const { } void SubstTemplateTemplateParmStorage::Profile(llvm::FoldingSetNodeID &ID) { - Profile(ID, Replacement, getAssociatedDecl(), getIndex(), getPackIndex()); + Profile(ID, Replacement, getAssociatedDecl(), getIndex(), getPackIndex(), + getFinal()); } void SubstTemplateTemplateParmStorage::Profile( llvm::FoldingSetNodeID &ID, TemplateName Replacement, Decl *AssociatedDecl, - unsigned Index, std::optional PackIndex) { + unsigned Index, std::optional PackIndex, bool Final) { Replacement.Profile(ID); ID.AddPointer(AssociatedDecl); ID.AddInteger(Index); ID.AddInteger(PackIndex ? 
*PackIndex + 1 : 0); + ID.AddBoolean(Final); } SubstTemplateTemplateParmPackStorage::SubstTemplateTemplateParmPackStorage( diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 1fe6f2c722acf..d35b2f5d9ab6d 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1298,6 +1298,8 @@ void TextNodeDumper::dumpBareTemplateName(TemplateName TN) { OS << " index " << STS->getIndex(); if (std::optional PackIndex = STS->getPackIndex()) OS << " pack_index " << *PackIndex; + if (STS->getFinal()) + OS << " final"; if (const TemplateTemplateParmDecl *P = STS->getParameter()) AddChild("parameter", [=] { Visit(P); }); dumpDeclRef(STS->getAssociatedDecl(), "associated"); @@ -2124,6 +2126,8 @@ void TextNodeDumper::VisitSubstTemplateTypeParmType( VisitTemplateTypeParmDecl(T->getReplacedParameter()); if (auto PackIndex = T->getPackIndex()) OS << " pack_index " << *PackIndex; + if (T->getFinal()) + OS << " final"; } void TextNodeDumper::VisitSubstTemplateTypeParmPackType( diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 667ffc0e599a6..4669bf5541493 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1294,9 +1294,9 @@ struct SimpleTransformVisitor : public TypeVisitor { == T->getReplacementType().getAsOpaquePtr()) return QualType(T, 0); - return Ctx.getSubstTemplateTypeParmType(replacementType, - T->getAssociatedDecl(), - T->getIndex(), T->getPackIndex()); + return Ctx.getSubstTemplateTypeParmType( + replacementType, T->getAssociatedDecl(), T->getIndex(), + T->getPackIndex(), T->getFinal()); } // FIXME: Non-trivial to implement, but important for C++ @@ -4263,7 +4263,7 @@ static const TemplateTypeParmDecl *getReplacedParameter(Decl *D, SubstTemplateTypeParmType::SubstTemplateTypeParmType( QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex) + std::optional PackIndex, bool Final) : Type(SubstTemplateTypeParm, Replacement.getCanonicalType(), Replacement->getDependence()), AssociatedDecl(AssociatedDecl) { @@ -4273,6 +4273,7 @@ SubstTemplateTypeParmType::SubstTemplateTypeParmType( *getTrailingObjects() = Replacement; SubstTemplateTypeParmTypeBits.Index = Index; + SubstTemplateTypeParmTypeBits.Final = Final; SubstTemplateTypeParmTypeBits.PackIndex = PackIndex ? *PackIndex + 1 : 0; assert(AssociatedDecl != nullptr); } @@ -4282,6 +4283,19 @@ SubstTemplateTypeParmType::getReplacedParameter() const { return ::getReplacedParameter(getAssociatedDecl(), getIndex()); } +void SubstTemplateTypeParmType::Profile(llvm::FoldingSetNodeID &ID, + QualType Replacement, + const Decl *AssociatedDecl, + unsigned Index, + std::optional PackIndex, + bool Final) { + Replacement.Profile(ID); + ID.AddPointer(AssociatedDecl); + ID.AddInteger(Index); + ID.AddInteger(PackIndex ? *PackIndex - 1 : 0); + ID.AddBoolean(Final); +} + SubstTemplateTypeParmPackType::SubstTemplateTypeParmPackType( QualType Canon, Decl *AssociatedDecl, unsigned Index, bool Final, const TemplateArgument &ArgPack) diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 1f87ef4b27bab..8cf65ebe03c07 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -3528,16 +3528,10 @@ QualType Sema::CheckTemplateIdType(TemplateName Name, if (Pattern->isInvalidDecl()) return QualType(); - // Only substitute for the innermost template argument list. NOTE: Some - // external resugarers rely on leaving a Subst* node here. Make the - // substitution non-final in that case. 
Note that these external resugarers - // will still miss some information in this representation, because we don't - // provide enough context in the Subst* nodes in order to tell different - // template type alias specializations apart. + // Only substitute for the innermost template argument list. MultiLevelTemplateArgumentList TemplateArgLists; - TemplateArgLists.addOuterTemplateArguments( - Template, CTAI.SugaredConverted, - /*Final=*/!getLangOpts().RetainSubstTemplateTypeParmTypeAstNodes); + TemplateArgLists.addOuterTemplateArguments(Template, CTAI.SugaredConverted, + /*Final=*/true); TemplateArgLists.addOuterRetainedLevels( AliasTemplate->getTemplateParameters()->getDepth()); @@ -7558,7 +7552,7 @@ ExprResult Sema::BuildExpressionFromDeclTemplateArgument( ParamType->getPointeeType(), RefExpr.get()->getValueKind(), RefExpr.get()->getExprLoc(), RefExpr.get(), VD, NTTP->getIndex(), /*PackIndex=*/std::nullopt, - /*RefParam=*/true); + /*RefParam=*/true, /*Final=*/true); } } } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 9f5ca9dca8e89..d835b3b06893d 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1865,11 +1865,10 @@ namespace { Sema::ExtParameterInfoBuilder &PInfos); private: - ExprResult - transformNonTypeTemplateParmRef(Decl *AssociatedDecl, - const NonTypeTemplateParmDecl *parm, - SourceLocation loc, TemplateArgument arg, - std::optional PackIndex); + ExprResult transformNonTypeTemplateParmRef( + Decl *AssociatedDecl, const NonTypeTemplateParmDecl *parm, + SourceLocation loc, TemplateArgument arg, + std::optional PackIndex, bool Final); }; } @@ -2083,10 +2082,8 @@ TemplateName TemplateInstantiator::TransformTemplateName( TemplateName Template = Arg.getAsTemplate(); assert(!Template.isNull() && "Null template template argument"); - if (Final) - return Template; return getSema().Context.getSubstTemplateTemplateParm( - Template, AssociatedDecl, TTP->getIndex(), PackIndex); + Template, AssociatedDecl, TTP->getIndex(), PackIndex, Final); } } @@ -2098,11 +2095,9 @@ TemplateName TemplateInstantiator::TransformTemplateName( TemplateArgument Pack = SubstPack->getArgumentPack(); TemplateName Template = getPackSubstitutedTemplateArgument(getSema(), Pack).getAsTemplate(); - if (SubstPack->getFinal()) - return Template; return getSema().Context.getSubstTemplateTemplateParm( Template, SubstPack->getAssociatedDecl(), SubstPack->getIndex(), - getPackIndex(Pack)); + getPackIndex(Pack), SubstPack->getFinal()); } return inherited::TransformTemplateName(SS, Name, NameLoc, ObjectType, @@ -2144,7 +2139,8 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, return Arg.getAsExpr(); } - auto [AssociatedDecl, _] = TemplateArgs.getAssociatedDecl(NTTP->getDepth()); + auto [AssociatedDecl, Final] = + TemplateArgs.getAssociatedDecl(NTTP->getDepth()); std::optional PackIndex; if (NTTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && @@ -2163,17 +2159,15 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, QualType ExprType = TargetType.getNonLValueExprType(SemaRef.Context); if (TargetType->isRecordType()) ExprType.addConst(); - // FIXME: Pass in Final. return new (SemaRef.Context) SubstNonTypeTemplateParmPackExpr( ExprType, TargetType->isReferenceType() ? 
VK_LValue : VK_PRValue, - E->getLocation(), Arg, AssociatedDecl, NTTP->getPosition()); + E->getLocation(), Arg, AssociatedDecl, NTTP->getPosition(), Final); } PackIndex = getPackIndex(Arg); Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); } - // FIXME: Don't put subst node on Final replacement. return transformNonTypeTemplateParmRef(AssociatedDecl, NTTP, E->getLocation(), - Arg, PackIndex); + Arg, PackIndex, Final); } const AnnotateAttr * @@ -2268,8 +2262,8 @@ TemplateInstantiator::TransformOpenACCRoutineDeclAttr( ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( Decl *AssociatedDecl, const NonTypeTemplateParmDecl *parm, - SourceLocation loc, TemplateArgument arg, - std::optional PackIndex) { + SourceLocation loc, TemplateArgument arg, std::optional PackIndex, + bool Final) { ExprResult result; // Determine the substituted parameter type. We can usually infer this from @@ -2335,10 +2329,9 @@ ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( return ExprError(); Expr *resultExpr = result.get(); - // FIXME: Don't put subst node on final replacement. return new (SemaRef.Context) SubstNonTypeTemplateParmExpr( resultExpr->getType(), resultExpr->getValueKind(), loc, resultExpr, - AssociatedDecl, parm->getIndex(), PackIndex, refParam); + AssociatedDecl, parm->getIndex(), PackIndex, refParam, Final); } ExprResult @@ -2351,10 +2344,9 @@ TemplateInstantiator::TransformSubstNonTypeTemplateParmPackExpr( TemplateArgument Pack = E->getArgumentPack(); TemplateArgument Arg = getPackSubstitutedTemplateArgument(getSema(), Pack); - // FIXME: Don't put subst node on final replacement. return transformNonTypeTemplateParmRef( E->getAssociatedDecl(), E->getParameterPack(), - E->getParameterPackLocation(), Arg, getPackIndex(Pack)); + E->getParameterPackLocation(), Arg, getPackIndex(Pack), E->getFinal()); } ExprResult @@ -2396,9 +2388,9 @@ TemplateInstantiator::TransformSubstNonTypeTemplateParmExpr( /*PartialOrderingTTP=*/false, Sema::CTAK_Specified) .isInvalid()) return true; - return transformNonTypeTemplateParmRef(E->getAssociatedDecl(), - E->getParameter(), E->getExprLoc(), - SugaredConverted, E->getPackIndex()); + return transformNonTypeTemplateParmRef( + E->getAssociatedDecl(), E->getParameter(), E->getExprLoc(), + SugaredConverted, E->getPackIndex(), E->getFinal()); } ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(ValueDecl *PD, @@ -2554,13 +2546,9 @@ QualType TemplateInstantiator::BuildSubstTemplateTypeParmType( SemaRef.Context.getQualifiedType(Replacement.getUnqualifiedType(), RQs); } - if (Final) { - TLB.pushTrivial(SemaRef.Context, Replacement, NameLoc); - return Replacement; - } // TODO: only do this uniquing once, at the start of instantiation. QualType Result = getSema().Context.getSubstTemplateTypeParmType( - Replacement, AssociatedDecl, Index, PackIndex); + Replacement, AssociatedDecl, Index, PackIndex, Final); SubstTemplateTypeParmTypeLoc NewTL = TLB.push(Result); NewTL.setNameLoc(NameLoc); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 524e73242a2da..3689d323cf25b 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7077,7 +7077,8 @@ QualType TreeTransform::TransformSubstTemplateTypeParmType( return QualType(); QualType Result = SemaRef.Context.getSubstTemplateTypeParmType( - Replacement, NewReplaced, T->getIndex(), T->getPackIndex()); + Replacement, NewReplaced, T->getIndex(), T->getPackIndex(), + T->getFinal()); // Propagate type-source information. 
SubstTemplateTypeParmTypeLoc NewTL diff --git a/clang/test/AST/ast-dump-retain-subst-template-type-parm-type-ast-nodes.cpp b/clang/test/AST/ast-dump-retain-subst-template-type-parm-type-ast-nodes.cpp deleted file mode 100644 index 97dc983e2436c..0000000000000 --- a/clang/test/AST/ast-dump-retain-subst-template-type-parm-type-ast-nodes.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only -fretain-subst-template-type-parm-type-ast-nodes -ast-dump -ast-dump-filter=dump %s | FileCheck -strict-whitespace %s - -namespace t1 { -template using X = T; -using dump = X; - -// CHECK-LABEL: Dumping t1::dump: -// CHECK-NEXT: TypeAliasDecl -// CHECK-NEXT: `-ElaboratedType -// CHECK-NEXT: `-TemplateSpecializationType -// CHECK-NEXT: |-name: 'X':'t1::X' qualified -// CHECK-NEXT: | `-TypeAliasTemplateDecl -// CHECK-NEXT: |-TemplateArgument -// CHECK-NEXT: | `-BuiltinType {{.+}} 'int' -// CHECK-NEXT: `-SubstTemplateTypeParmType 0x{{[0-9a-f]+}} 'int' sugar class depth 0 index 0 T -// CHECK-NEXT: |-TypeAliasTemplate {{.+}} 'X' -// CHECK-NEXT: `-BuiltinType {{.+}} 'int' -} // namespace t1 diff --git a/clang/test/AST/ast-dump-template-decls.cpp b/clang/test/AST/ast-dump-template-decls.cpp index 9f578e5afe561..d5228d4667304 100644 --- a/clang/test/AST/ast-dump-template-decls.cpp +++ b/clang/test/AST/ast-dump-template-decls.cpp @@ -123,6 +123,8 @@ using type2 = typename C::type1; // CHECK-NEXT: TemplateArgument type 'void' // CHECK-NEXT: BuiltinType 0x{{[^ ]*}} 'void' // CHECK-NEXT: FunctionProtoType 0x{{[^ ]*}} 'void (int)' cdecl +// CHECK-NEXT: SubstTemplateTypeParmType 0x{{[^ ]*}} 'void' sugar class depth 0 index 0 U final +// CHECK-NEXT: TypeAliasTemplate 0x{{[^ ]*}} 'type1' // CHECK-NEXT: BuiltinType 0x{{[^ ]*}} 'void' // CHECK-NEXT: SubstTemplateTypeParmType 0x{{[^ ]*}} 'int' sugar class depth 0 index 0 T // CHECK-NEXT: ClassTemplateSpecialization 0x{{[^ ]*}} 'C' @@ -139,14 +141,14 @@ template struct D::bind; // CHECK: TypeAliasDecl 0x{{[^ ]*}} col:11 bound_type 'int (int (*)(float, int), int (*)(char, short))' // CHECK: FunctionProtoType 0x{{[^ ]*}} 'int (int (*)(float, int), int (*)(char, short))' cdecl // CHECK: FunctionProtoType 0x{{[^ ]*}} 'int (float, int)' cdecl -// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'float' sugar typename depth 0 index 0 ... T pack_index 1 +// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'float' sugar typename depth 0 index 0 ... T pack_index 1{{$}} // CHECK-NEXT: ClassTemplateSpecialization 0x{{[^ ]*}} 'D' -// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'int' sugar typename depth 0 index 0 ... U pack_index 1 +// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'int' sugar typename depth 0 index 0 ... U pack_index 1{{$}} // CHECK-NEXT: ClassTemplateSpecialization 0x{{[^ ]*}} 'bind' // CHECK: FunctionProtoType 0x{{[^ ]*}} 'int (char, short)' cdecl -// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'char' sugar typename depth 0 index 0 ... T pack_index 0 +// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'char' sugar typename depth 0 index 0 ... T pack_index 0{{$}} // CHECK-NEXT: ClassTemplateSpecialization 0x{{[^ ]*}} 'D' -// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'short' sugar typename depth 0 index 0 ... U pack_index 0 +// CHECK: SubstTemplateTypeParmType 0x{{[^ ]*}} 'short' sugar typename depth 0 index 0 ... 
U pack_index 0{{$}} // CHECK-NEXT: ClassTemplateSpecialization 0x{{[^ ]*}} 'bind' } // namespace PR56099 @@ -156,12 +158,16 @@ template> class D1, class D2> using D = D1 class E {}; using test1 = D; -// CHECK: TypeAliasDecl 0x{{[^ ]*}} col:7 test1 'D':'subst_default_argument::E>' +// CHECK: TypeAliasDecl 0x{{[^ ]*}} col:7 test1 'D':'subst_default_argument::E>' // CHECK: TemplateSpecializationType 0x{{[^ ]*}} 'A' sugar // CHECK-NEXT: |-name: 'A':'subst_default_argument::A' qualified // CHECK-NEXT: | `-ClassTemplateDecl {{.+}} A // CHECK-NEXT: |-TemplateArgument type 'int' -// CHECK-NEXT: | `-BuiltinType 0x{{[^ ]*}} 'int' +// CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[^ ]*}} 'int' sugar class depth 0 index 0 E1 final +// CHECK-NEXT: | |-ClassTemplate 0x{{[^ ]*}} 'E' +// CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[^ ]*}} 'int' sugar class depth 0 index 1 D2 final +// CHECK-NEXT: | |-TypeAliasTemplate 0x{{[^ ]*}} 'D' +// CHECK-NEXT: | `-BuiltinType 0x{{[^ ]*}} 'int' // CHECK-NEXT: `-RecordType 0x{{[^ ]*}} 'subst_default_argument::A' // CHECK-NEXT: `-ClassTemplateSpecialization 0x{{[^ ]*}} 'A' } // namespace subst_default_argument diff --git a/clang/test/Misc/diag-template-diffing-cxx11.cpp b/clang/test/Misc/diag-template-diffing-cxx11.cpp index ae4fa524e4e44..c62bffe2b458d 100644 --- a/clang/test/Misc/diag-template-diffing-cxx11.cpp +++ b/clang/test/Misc/diag-template-diffing-cxx11.cpp @@ -265,14 +265,14 @@ int k9 = f9(V9()); // CHECK-ELIDE-TREE: S9< // CHECK-ELIDE-TREE: [2 * ...], // CHECK-ELIDE-TREE: U9< -// CHECK-ELIDE-TREE: [(no qualifiers) != const] double>> +// CHECK-ELIDE-TREE: [double != const double]>> // CHECK-NOELIDE-TREE: no matching function for call to 'f9' // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument // CHECK-NOELIDE-TREE: S9< // CHECK-NOELIDE-TREE: int, // CHECK-NOELIDE-TREE: char, // CHECK-NOELIDE-TREE: U9< -// CHECK-NOELIDE-TREE: [(no qualifiers) != const] double>> +// CHECK-NOELIDE-TREE: [double != const double]>> template class class_types {}; void set10(class_types) {} diff --git a/clang/test/SemaTemplate/make_integer_seq.cpp b/clang/test/SemaTemplate/make_integer_seq.cpp index 71e34c5e0e7df..7ca7b55b49964 100644 --- a/clang/test/SemaTemplate/make_integer_seq.cpp +++ b/clang/test/SemaTemplate/make_integer_seq.cpp @@ -48,7 +48,9 @@ using test2 = B; // CHECK-NEXT: |-TemplateArgument template 'A' // CHECK-NEXT: | `-ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} col:38 A // CHECK-NEXT: |-TemplateArgument type 'int' -// CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' +// CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'int' sugar class depth 0 index 0 B1 final +// CHECK-NEXT: | |-TypeAliasTemplate 0x{{[0-9A-Fa-f]+}} 'B' +// CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' // CHECK-NEXT: |-TemplateArgument expr '1' // CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'int' // CHECK-NEXT: | |-value: Int 1 @@ -59,7 +61,9 @@ using test2 = B; // CHECK-NEXT: |-name: 'A' qualified // CHECK-NEXT: | `-ClassTemplateDecl {{.+}} A // CHECK-NEXT: |-TemplateArgument type 'int' -// CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' +// CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'int' sugar class depth 0 index 0 B1 final +// CHECK-NEXT: | |-TypeAliasTemplate 0x{{[0-9A-Fa-f]+}} 'B' +// CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' // CHECK-NEXT: |-TemplateArgument expr '0' // CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'int' // CHECK-NEXT: | |-value: Int 0 From 
dedb632b833f14ef28c6f8a7f5e8983c1be60fa9 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Wed, 2 Apr 2025 18:28:56 -0400 Subject: [PATCH 0457/1029] [HIP] Claim `--offload-compress` for `-M` (#133456) Cmake automatically generates dependency files with all compilation options provided by users. When users use `--offload-compress` for HIP compilation, it causes warnings when cmake generates dependency files. Claim this option to suppress warnings. --- clang/lib/Driver/ToolChains/Clang.cpp | 6 ++++++ clang/test/Driver/hip-options.hip | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index db4f50fc98f7c..70489adf01c94 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1026,6 +1026,12 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, CmdArgs.push_back("-dependency-file"); CmdArgs.push_back(DepFile); } + // Cmake generates dependency files using all compilation options specified + // by users. Claim those not used for dependency files. + if (JA.isOffloading(Action::OFK_HIP)) { + Args.ClaimAllArgs(options::OPT_offload_compress); + Args.ClaimAllArgs(options::OPT_no_offload_compress); + } bool HasTarget = false; for (const Arg *A : Args.filtered(options::OPT_MT, options::OPT_MQ)) { diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip index 0aabc8ad41904..29d23c1b6c8d9 100644 --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -242,3 +242,7 @@ // NO-WARN-ATOMIC: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-Werror=atomic-alignment" {{.*}} "-Wno-error=atomic-alignment" // NO-WARN-ATOMIC-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-Werror=atomic-alignment" // NO-WARN-ATOMIC-NOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-Wno-error=atomic-alignment" + +// Check --offload-compress does not cause warning. +// RUN: %clang -### -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib \ +// RUN: --offload-arch=gfx1100 --offload-compress --offload-host-only -M %s From 3f7ca8826776f32526e948b89816db492435f2e2 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 2 Apr 2025 15:33:07 -0700 Subject: [PATCH 0458/1029] [lldb-dap] Add progress events to the packet list (#134157) Before #134048, TestDAP_Progress relied on wait_for_event to block until the progressEnd came in. However, progress events were not added to the packet list, so this call would always time out. This PR makes it so that packets are added to the packet list, and you can block on them. --- .../packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py | 2 -- lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 01ef4b68f2653..45403e9df8525 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -255,8 +255,6 @@ def handle_recv_packet(self, packet): # and 'progressEnd' events. Keep these around in case test # cases want to verify them. self.progress_events.append(packet) - # No need to add 'progress' event packets to our packets list. 
-                return keepGoing
         elif packet_type == "response":
             if packet["command"] == "disconnect":
diff --git a/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py b/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py
index ffe3d38eb49a3..fee63655de0da 100755
--- a/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py
+++ b/lldb/test/API/tools/lldb-dap/progress/TestDAP_Progress.py
@@ -19,6 +19,7 @@ def verify_progress_events(
         expected_not_in_message=None,
         only_verify_first_update=False,
     ):
+        self.dap_server.wait_for_event("progressEnd")
         self.assertTrue(len(self.dap_server.progress_events) > 0)
         start_found = False
         update_found = False

From c57b9c233a87f37e034445596ed09260cc6b23f5 Mon Sep 17 00:00:00 2001
From: Andy Kaylor
Date: Wed, 2 Apr 2025 15:48:55 -0700
Subject: [PATCH 0459/1029] [CIR] Generate the nsw flag correctly for unary ops
 (#133815)

A previous checkin used a workaround to generate the nsw flag where
needed for unary ops. This change upstreams a subsequent change that
was made in the incubator to generate the flag correctly.
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 29 +++++++++--
 clang/include/clang/CIR/MissingFeatures.h     |  1 -
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp    | 19 ++++---
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 10 +---
 clang/test/CIR/CodeGen/unary.cpp              | 33 +++++++++---
 clang/test/CIR/IR/unary.cir                   | 50 +++++++++++++++++++
 6 files changed, 111 insertions(+), 31 deletions(-)
 create mode 100644 clang/test/CIR/IR/unary.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 3965372755685..c17abfd752a1a 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -697,17 +697,24 @@ def UnaryOp : CIR_Op<"unary", [Pure, SameOperandsAndResultType]> {
   It requires one input operand and has one result, both types should be the
   same.
 
+  If the `nsw` (no signed wrap) attribute is present, the result is poison if
+  signed overflow occurs.
+
   ```mlir
   %7 = cir.unary(inc, %1) : i32 -> i32
-  %8 = cir.unary(dec, %2) : i32 -> i32
+  %8 = cir.unary(dec, %2) nsw : i32 -> i32
   ```
   }];
 
   let results = (outs CIR_AnyType:$result);
-  let arguments = (ins Arg:$kind, Arg:$input);
+  let arguments = (ins Arg:$kind,
+                       Arg:$input,
+                       UnitAttr:$no_signed_wrap);
 
   let assemblyFormat = [{
-      `(` $kind `,` $input `)` `:` type($input) `,` type($result) attr-dict
+      `(` $kind `,` $input `)`
+      (`nsw` $no_signed_wrap^)?
+      `:` type($input) `,` type($result) attr-dict
   }];
 
   let hasVerifier = 1;
@@ -961,9 +968,21 @@ def BinOp : CIR_Op<"binop", [Pure,
   It requires two input operands and has one result, all types
   should be the same.
 
+  If the `nsw` (no signed wrap) or `nuw` (no unsigned wrap) attributes are
+  present, the result is poison if signed or unsigned overflow occurs
+  (respectively).
+
+  If the `sat` (saturated) attribute is present, the result is clamped to
+  the maximum value representable by the type if it would otherwise
+  exceed that value and is clamped to the minimum representable value if
+  it would otherwise be below that value.
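+
+  As an illustration, assuming an 8-bit signed operand type: 127 + 1 wraps
+  to -128 when no attribute is present, is poison under `nsw`, and is
+  clamped to 127 under `sat`.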
+ ```mlir - %7 = cir.binop(add, %1, %2) : !s32i - %7 = cir.binop(mul, %1, %2) : !u8i + %5 = cir.binop(add, %1, %2) : !s32i + %6 = cir.binop(mul, %1, %2) : !u8i + %7 = cir.binop(add, %1, %2) nsw : !s32i + %8 = cir.binop(add, %3, %4) nuw : !u32i + %9 = cir.binop(add, %1, %2) sat : !s32i ``` }]; diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 3a102d90aba8f..23bf826d19a69 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -76,7 +76,6 @@ struct MissingFeatures { static bool opScopeCleanupRegion() { return false; } // Unary operator handling - static bool opUnarySignedOverflow() { return false; } static bool opUnaryPromotionType() { return false; } // Clang early optimizations or things defered to LLVM lowering. diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 2cf92dfbf3a5b..5ac1dc1052c2e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -374,7 +374,7 @@ class ScalarExprEmitter : public StmtVisitor { cir::UnaryOpKind kind = e->isIncrementOp() ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; // NOTE(CIR): clang calls CreateAdd but folds this to a unary op - value = emitUnaryOp(e, kind, input); + value = emitUnaryOp(e, kind, input, /*nsw=*/false); } } else if (isa(type)) { cgf.cgm.errorNYI(e->getSourceRange(), "Unary inc/dec pointer"); @@ -429,19 +429,17 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitIncDecConsiderOverflowBehavior(const UnaryOperator *e, mlir::Value inVal, bool isInc) { - assert(!cir::MissingFeatures::opUnarySignedOverflow()); cir::UnaryOpKind kind = e->isIncrementOp() ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; switch (cgf.getLangOpts().getSignedOverflowBehavior()) { case LangOptions::SOB_Defined: - return emitUnaryOp(e, kind, inVal); + return emitUnaryOp(e, kind, inVal, /*nsw=*/false); case LangOptions::SOB_Undefined: assert(!cir::MissingFeatures::sanitizers()); - return emitUnaryOp(e, kind, inVal); - break; + return emitUnaryOp(e, kind, inVal, /*nsw=*/true); case LangOptions::SOB_Trapping: if (!e->canOverflow()) - return emitUnaryOp(e, kind, inVal); + return emitUnaryOp(e, kind, inVal, /*nsw=*/true); cgf.cgm.errorNYI(e->getSourceRange(), "inc/def overflow SOB_Trapping"); return {}; } @@ -473,18 +471,19 @@ class ScalarExprEmitter : public StmtVisitor { assert(!cir::MissingFeatures::opUnaryPromotionType()); mlir::Value operand = Visit(e->getSubExpr()); - assert(!cir::MissingFeatures::opUnarySignedOverflow()); + bool nsw = + kind == cir::UnaryOpKind::Minus && e->getType()->isSignedIntegerType(); // NOTE: LLVM codegen will lower this directly to either a FNeg // or a Sub instruction. In CIR this will be handled later in LowerToLLVM. 
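+    // For illustration (an assumption, not text from this change): the
+    // classic overflowing case is negating the most negative value, e.g.
+    // -x where x == INT_MIN, so signed minus may carry nsw and is poison
+    // in exactly that case.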
- return emitUnaryOp(e, kind, operand); + return emitUnaryOp(e, kind, operand, nsw); } mlir::Value emitUnaryOp(const UnaryOperator *e, cir::UnaryOpKind kind, - mlir::Value input) { + mlir::Value input, bool nsw = false) { return builder.create( cgf.getLoc(e->getSourceRange().getBegin()), input.getType(), kind, - input); + input, nsw); } mlir::Value VisitUnaryNot(const UnaryOperator *e) { diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index b19be53947f99..48dc09d151dcf 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -860,14 +860,8 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( // Integer unary operations: + - ~ ++ -- if (mlir::isa(elementType)) { mlir::LLVM::IntegerOverflowFlags maybeNSW = - mlir::LLVM::IntegerOverflowFlags::none; - if (mlir::dyn_cast(elementType).isSigned()) { - assert(!cir::MissingFeatures::opUnarySignedOverflow()); - // TODO: For now, assume signed overflow is undefined. We'll need to add - // an attribute to the unary op to control this. - maybeNSW = mlir::LLVM::IntegerOverflowFlags::nsw; - } - + op.getNoSignedWrap() ? mlir::LLVM::IntegerOverflowFlags::nsw + : mlir::LLVM::IntegerOverflowFlags::none; switch (op.getKind()) { case cir::UnaryOpKind::Inc: { assert(!isVector && "++ not allowed on vector types"); diff --git a/clang/test/CIR/CodeGen/unary.cpp b/clang/test/CIR/CodeGen/unary.cpp index 3e041e14ce177..ca47c1068e08d 100644 --- a/clang/test/CIR/CodeGen/unary.cpp +++ b/clang/test/CIR/CodeGen/unary.cpp @@ -83,7 +83,7 @@ int inc0() { // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store %[[ATMP]], %[[A]] : !s32i // CHECK: %[[INPUT:.*]] = cir.load %[[A]] -// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) +// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) nsw // CHECK: cir.store %[[INCREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load %[[A]] @@ -111,8 +111,8 @@ int dec0() { // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store %[[ATMP]], %[[A]] : !s32i // CHECK: %[[INPUT:.*]] = cir.load %[[A]] -// CHECK: %[[INCREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) -// CHECK: cir.store %[[INCREMENTED]], %[[A]] +// CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) nsw +// CHECK: cir.store %[[DECREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load %[[A]] // LLVM: define i32 @dec0() @@ -139,7 +139,7 @@ int inc1() { // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store %[[ATMP]], %[[A]] : !s32i // CHECK: %[[INPUT:.*]] = cir.load %[[A]] -// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) +// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) nsw // CHECK: cir.store %[[INCREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load %[[A]] @@ -167,8 +167,8 @@ int dec1() { // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store %[[ATMP]], %[[A]] : !s32i // CHECK: %[[INPUT:.*]] = cir.load %[[A]] -// CHECK: %[[INCREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) -// CHECK: cir.store %[[INCREMENTED]], %[[A]] +// CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) nsw +// CHECK: cir.store %[[DECREMENTED]], %[[A]] // CHECK: %[[A_TO_OUTPUT:.*]] = cir.load %[[A]] // LLVM: define i32 @dec1() @@ -197,7 +197,7 @@ int inc2() { // CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i // CHECK: cir.store %[[ATMP]], %[[A]] : !s32i // CHECK: %[[ATOB:.*]] = cir.load %[[A]] -// CHECK: %[[INCREMENTED:.*]] = 
cir.unary(inc, %[[ATOB]]) +// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[ATOB]]) nsw // CHECK: cir.store %[[INCREMENTED]], %[[A]] // CHECK: cir.store %[[ATOB]], %[[B]] // CHECK: %[[B_TO_OUTPUT:.*]] = cir.load %[[B]] @@ -405,3 +405,22 @@ float fpPostInc2() { // OGCG: store float %[[A_INC]], ptr %[[A]], align 4 // OGCG: store float %[[A_LOAD]], ptr %[[B]], align 4 // OGCG: %[[B_TO_OUTPUT:.*]] = load float, ptr %[[B]], align 4 + +void chars(char c) { +// CHECK: cir.func @chars + + int c1 = +c; + // CHECK: %[[PROMO:.*]] = cir.cast(integral, %{{.+}} : !s8i), !s32i + // CHECK: cir.unary(plus, %[[PROMO]]) : !s32i, !s32i + int c2 = -c; + // CHECK: %[[PROMO:.*]] = cir.cast(integral, %{{.+}} : !s8i), !s32i + // CHECK: cir.unary(minus, %[[PROMO]]) nsw : !s32i, !s32i + + // Chars can go through some integer promotion codegen paths even when not promoted. + // These should not have nsw attributes because the intermediate promotion makes the + // overflow defined behavior. + ++c; // CHECK: cir.unary(inc, %{{.+}}) : !s8i, !s8i + --c; // CHECK: cir.unary(dec, %{{.+}}) : !s8i, !s8i + c++; // CHECK: cir.unary(inc, %{{.+}}) : !s8i, !s8i + c--; // CHECK: cir.unary(dec, %{{.+}}) : !s8i, !s8i +} diff --git a/clang/test/CIR/IR/unary.cir b/clang/test/CIR/IR/unary.cir new file mode 100644 index 0000000000000..f01121adc106e --- /dev/null +++ b/clang/test/CIR/IR/unary.cir @@ -0,0 +1,50 @@ +// RUN: cir-opt %s | FileCheck %s + +!s32i = !cir.int +!s64i = !cir.int +!u32i = !cir.int +!u64i = !cir.int + +module { + cir.func @test_unary_unsigned() { + %0 = cir.alloca !u32i, !cir.ptr, ["a"] {alignment = 4 : i64} + %1 = cir.load %0 : !cir.ptr, !u32i + %2 = cir.unary(plus, %1) : !u32i, !u32i + %3 = cir.unary(minus, %1) : !u32i, !u32i + %4 = cir.unary(not, %1) : !u32i, !u32i + %5 = cir.unary(inc, %1) : !u32i, !u32i + %6 = cir.unary(dec, %1) : !u32i, !u32i + cir.return + } +// CHECK: cir.func @test_unary_unsigned() { +// CHECK: %0 = cir.alloca !u32i, !cir.ptr, ["a"] {alignment = 4 : i64} +// CHECK: %1 = cir.load %0 : !cir.ptr, !u32i +// CHECK: %2 = cir.unary(plus, %1) : !u32i, !u32i +// CHECK: %3 = cir.unary(minus, %1) : !u32i, !u32i +// CHECK: %4 = cir.unary(not, %1) : !u32i, !u32i +// CHECK: %5 = cir.unary(inc, %1) : !u32i, !u32i +// CHECK: %6 = cir.unary(dec, %1) : !u32i, !u32i +// CHECK: cir.return +// CHECK: } + + cir.func @test_unary_signed() { + %0 = cir.alloca !s32i, !cir.ptr, ["a"] {alignment = 4 : i64} + %1 = cir.load %0 : !cir.ptr, !s32i + %2 = cir.unary(plus, %1) : !s32i, !s32i + %3 = cir.unary(minus, %1) nsw : !s32i, !s32i + %4 = cir.unary(not, %1) : !s32i, !s32i + %5 = cir.unary(inc, %1) nsw : !s32i, !s32i + %6 = cir.unary(dec, %1) nsw : !s32i, !s32i + cir.return + } +// CHECK: cir.func @test_unary_signed() { +// CHECK: %0 = cir.alloca !s32i, !cir.ptr, ["a"] {alignment = 4 : i64} +// CHECK: %1 = cir.load %0 : !cir.ptr, !s32i +// CHECK: %2 = cir.unary(plus, %1) : !s32i, !s32i +// CHECK: %3 = cir.unary(minus, %1) nsw : !s32i, !s32i +// CHECK: %4 = cir.unary(not, %1) : !s32i, !s32i +// CHECK: %5 = cir.unary(inc, %1) nsw : !s32i, !s32i +// CHECK: %6 = cir.unary(dec, %1) nsw : !s32i, !s32i +// CHECK: cir.return +// CHECK: } +} From 1edb6b0af1c47f88aa5c230a1a2b769eeb3c30aa Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 2 Apr 2025 16:06:09 -0700 Subject: [PATCH 0460/1029] [RISCV] Fix crash in parseZcmpStackAdj if token is not an integer. 
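
Without the explicit integer-token check, an operand such as

  cm.pop {ra}, -x1

reached getIntVal() on a non-integer token and crashed the parser; it
now fails with the normal "stack adjustment is invalid for this
instruction and register list" diagnostic, as the new
rv64zcmp-invalid.s cases below show.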
--- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 8 ++++++-- llvm/test/MC/RISCV/rv64zcmp-invalid.s | 6 ++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 8f9a5ae75fca7..27ae33f8339c7 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2676,7 +2676,11 @@ ParseStatus RISCVAsmParser::parseZcmpStackAdj(OperandVector &Operands, bool Negative = parseOptionalToken(AsmToken::Minus); SMLoc S = getLoc(); - int64_t StackAdjustment = getLexer().getTok().getIntVal(); + + if (getTok().isNot(AsmToken::Integer)) + return ParseStatus::NoMatch; + + int64_t StackAdjustment = getTok().getIntVal(); unsigned RlistVal = static_cast(Operands[1].get())->Rlist.Val; assert(RlistVal != RISCVZC::INVALID_RLIST); @@ -2697,7 +2701,7 @@ ParseStatus RISCVAsmParser::parseZcmpStackAdj(OperandVector &Operands, unsigned Spimm = (StackAdjustment - StackAdjBase) / 16; Operands.push_back(RISCVOperand::createSpimm(Spimm << 4, S)); - getLexer().Lex(); + Lex(); return ParseStatus::Success; } diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s index 6c4f8a9ec3293..f7ae69aad6efd 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s @@ -48,3 +48,9 @@ cm.pop {ra, x8-x9, x18-x17}, -40 # CHECK-ERROR: :[[@LINE+1]]:16: error: invalid register cm.pop {ra, x8-f8, x18-x17}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment is invalid for this instruction and register list +cm.pop {ra}, -x1 + +# CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment is invalid for this instruction and register list +cm.push {ra}, x1 From 40a0e3430442c7ec72e84063ea4b679700d4c3d8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 2 Apr 2025 16:19:04 -0700 Subject: [PATCH 0461/1029] [RISCV] Use location of negative sign if present for error in parseZcmpStackAdj As far as the user is concerned the negative sign and the number are a single value so the error should point to the beginning. 
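
For example, for `cm.push {ra}, -8` the reported column in the updated
tests moves from 16 to 15, pointing at the leading `-` instead of the
`8`.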
--- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3 +-- llvm/test/MC/RISCV/rv32xqccmp-invalid.s | 8 ++++---- llvm/test/MC/RISCV/rv32zcmp-invalid.s | 4 ++-- llvm/test/MC/RISCV/rv64xqccmp-invalid.s | 8 ++++---- llvm/test/MC/RISCV/rv64zcmp-invalid.s | 6 +++--- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 27ae33f8339c7..f64df24c40593 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2673,9 +2673,8 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, ParseStatus RISCVAsmParser::parseZcmpStackAdj(OperandVector &Operands, bool ExpectNegative) { - bool Negative = parseOptionalToken(AsmToken::Minus); - SMLoc S = getLoc(); + bool Negative = parseOptionalToken(AsmToken::Minus); if (getTok().isNot(AsmToken::Integer)) return ParseStatus::NoMatch; diff --git a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s index a13d134100dbe..9f43a167ff778 100644 --- a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s @@ -22,16 +22,16 @@ qc.cm.push {ra}, 16 # CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.pushfp {ra, s0}, 16 -# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] qc.cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: :[[@LINE+1]]:19: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:18: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra}, -8 -# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.pushfp {ra, s0}, -12 -# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] qc.cm.pop {ra, s0-s1}, -40 # CHECK-ERROR: :[[@LINE+1]]:17: error: register list must include 's0' or 'x8' diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s index 0a62faa80dc7c..37947fc94d7b9 100644 --- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s @@ -19,10 +19,10 @@ cm.popretz {ra, s0-s1}, 112 # CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, 16 -# CHECK-ERROR: :[[@LINE+1]]:22: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] +# CHECK-ERROR: :[[@LINE+1]]:21: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: :[[@LINE+1]]:16: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment for register list must be a multiple of 16 bytes in the 
range [-64, -16] cm.push {ra}, -8 # CHECK-ERROR: :[[@LINE+1]]:9: error: register list must start from 'ra' or 'x1' diff --git a/llvm/test/MC/RISCV/rv64xqccmp-invalid.s b/llvm/test/MC/RISCV/rv64xqccmp-invalid.s index 124e95a5d0fc2..16b3dd91aea48 100644 --- a/llvm/test/MC/RISCV/rv64xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64xqccmp-invalid.s @@ -22,14 +22,14 @@ qc.cm.push {ra}, 16 # CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.pushfp {ra, s0}, 16 -# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] qc.cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: :[[@LINE+1]]:19: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:18: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra}, -15 -# CHECK-ERROR: :[[@LINE+1]]:23: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:22: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] qc.cm.push {ra, s0}, -15 -# CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] qc.cm.pop {ra, s0-s1}, -33 diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s index f7ae69aad6efd..f77dabce255a0 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s @@ -19,13 +19,13 @@ cm.popretz {ra, s0-s1}, 112 # CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, 16 -# CHECK-ERROR: :[[@LINE+1]]:22: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:21: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] cm.pop {ra, s0-s1}, -32 -# CHECK-ERROR: :[[@LINE+1]]:16: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] +# CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment for register list must be a multiple of 16 bytes in the range [-64, -16] cm.push {ra}, -15 -# CHECK-ERROR: :[[@LINE+1]]:22: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] +# CHECK-ERROR: :[[@LINE+1]]:21: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] cm.pop {ra, s0-s1}, -33 # CHECK-ERROR: :[[@LINE+1]]:9: error: register list must start from 'ra' or 'x1' From be3abfc00f37d07c641b3e2f93eef5d1a446af8e Mon Sep 17 00:00:00 2001 From: Jeremy Day Date: Wed, 2 Apr 2025 16:35:02 -0700 Subject: [PATCH 0462/1029] [lldb] Clear thread name container before writing UTF8 bytes (#134150) `llvm::convertUTF16ToUTF8String` opens with an assertion that the output container is empty: https://github.com/llvm/llvm-project/blob/3bdf9a08804a5b424fd32fef3b0089f3a6db839d/llvm/lib/Support/ConvertUTFWrapper.cpp#L83-L84 It's not clear to me why this function requires the output container to be empty instead of just 
overwriting it, but the callsite in `TargetThreadWindows::GetName` may reuse the container without clearing it out first, resulting in an assertion failure: ``` # Child-SP RetAddr Call Site 00 000000d2`44b8ea48 00007ff8`beefc12e ntdll!NtTerminateProcess+0x14 01 000000d2`44b8ea50 00007ff8`bcf518ab ntdll!RtlExitUserProcess+0x11e 02 000000d2`44b8ea80 00007ff8`bc0e0143 KERNEL32!ExitProcessImplementation+0xb 03 000000d2`44b8eab0 00007ff8`bc0e4c49 ucrtbase!common_exit+0xc7 04 000000d2`44b8eb10 00007ff8`bc102ae6 ucrtbase!abort+0x69 05 000000d2`44b8eb40 00007ff8`bc102cc1 ucrtbase!common_assert_to_stderr+0x6e 06 000000d2`44b8eb80 00007fff`b8e27a80 ucrtbase!wassert+0x71 07 000000d2`44b8ebb0 00007fff`b8b821e1 liblldb!llvm::convertUTF16ToUTF8String+0x30 [D:\r\_work\swift-build\swift-build\SourceCache\llvm-project\llvm\lib\Support\ConvertUTFWrapper.cpp @ 88] 08 000000d2`44b8ec30 00007fff`b83e9aa2 liblldb!lldb_private::TargetThreadWindows::GetName+0x1b1 [D:\r\_work\swift-build\swift-build\SourceCache\llvm-project\lldb\source\Plugins\Process\Windows\Common\TargetThreadWindows.cpp @ 198] 09 000000d2`44b8eca0 00007ff7`2a3c3c14 liblldb!lldb::SBThread::GetName+0x102 [D:\r\_work\swift-build\swift-build\SourceCache\llvm-project\lldb\source\API\SBThread.cpp @ 432] 0a 000000d2`44b8ed70 00007ff7`2a3a5ac6 lldb_dap!lldb_dap::CreateThread+0x1f4 [S:\SourceCache\llvm-project\lldb\tools\lldb-dap\JSONUtils.cpp @ 877] 0b 000000d2`44b8ef10 00007ff7`2a3b0ab5 lldb_dap!`anonymous namespace'::request_threads+0xa6 [S:\SourceCache\llvm-project\lldb\tools\lldb-dap\lldb-dap.cpp @ 3906] 0c 000000d2`44b8f010 00007ff7`2a3b0fe8 lldb_dap!lldb_dap::DAP::HandleObject+0x1c5 [S:\SourceCache\llvm-project\lldb\tools\lldb-dap\DAP.cpp @ 796] 0d 000000d2`44b8f130 00007ff7`2a3a8b96 lldb_dap!lldb_dap::DAP::Loop+0x78 [S:\SourceCache\llvm-project\lldb\tools\lldb-dap\DAP.cpp @ 812] 0e 000000d2`44b8f1d0 00007ff7`2a4b5fbc lldb_dap!main+0x1096 [S:\SourceCache\llvm-project\lldb\tools\lldb-dap\lldb-dap.cpp @ 5319] 0f (Inline Function) --------`-------- lldb_dap!invoke_main+0x22 [D:\a\_work\1\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl @ 78] 10 000000d2`44b8fb80 00007ff8`bcf3e8d7 lldb_dap!__scrt_common_main_seh+0x10c [D:\a\_work\1\s\src\vctools\crt\vcstartup\src\startup\exe_common.inl @ 288] 11 000000d2`44b8fbc0 00007ff8`beefbf6c KERNEL32!BaseThreadInitThunk+0x17 12 000000d2`44b8fbf0 00000000`00000000 ntdll!RtlUserThreadStart+0x2c ``` This stack trace was captured from the lldb distributed in the Swift toolchain. The issue is easy to reproduce by resuming from a breakpoint twice in VS Code. I've verified that clearing out the container here fixes the assertion failure. 
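A self-contained sketch of the invariant being restored; `CopyThreadName` is
a hypothetical wrapper for illustration, not LLDB API:

```c++
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/ConvertUTF.h"
#include <string>

static void CopyThreadName(llvm::ArrayRef<char> Utf16Bytes,
                           std::string &Name) {
  // convertUTF16ToUTF8String() asserts that the output container is empty
  // on entry, so a member that is reused across calls (like m_name in
  // TargetThreadWindows::GetName) must be cleared before each conversion.
  Name.clear();
  llvm::convertUTF16ToUTF8String(Utf16Bytes, Name);
}
```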
--- .../Plugins/Process/Windows/Common/TargetThreadWindows.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp index a0d0f0ea0abc8..b2b66f2927644 100644 --- a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp @@ -192,6 +192,7 @@ const char *TargetThreadWindows::GetName() { if (SUCCEEDED(GetThreadDescription( m_host_thread.GetNativeThread().GetSystemHandle(), &pszThreadName))) { LLDB_LOGF(log, "GetThreadDescription: %ls", pszThreadName); + m_name.clear(); llvm::convertUTF16ToUTF8String( llvm::ArrayRef(reinterpret_cast(pszThreadName), wcslen(pszThreadName) * sizeof(wchar_t)), From ffed17624eb14a52c1db890e4e8e195dbe5f19b6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 2 Apr 2025 16:41:49 -0700 Subject: [PATCH 0463/1029] [RISCV] Correct the error location for the X26 check in parseRegListCommon. We should point to the start of the reglist not the closing parenthesis. I also moved the check after we finishing parsing the closing brace. The diagnostic mentions '{ra, s0-s10} or {x1, x8-x9, x18-x26}' so we should be sure that's what we parsed. --- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 8 ++++---- llvm/test/MC/RISCV/rv32xqccmp-invalid.s | 2 +- llvm/test/MC/RISCV/rv32zcmp-invalid.s | 2 +- llvm/test/MC/RISCV/rv64xqccmp-invalid.s | 2 +- llvm/test/MC/RISCV/rv64zcmp-invalid.s | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index f64df24c40593..7837504751694 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2655,13 +2655,13 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, } } - if (RegEnd == RISCV::X26) - return Error(getLoc(), "invalid register list, {ra, s0-s10} or {x1, x8-x9, " - "x18-x26} is not supported"); - if (parseToken(AsmToken::RCurly, "register list must end with '}'")) return ParseStatus::Failure; + if (RegEnd == RISCV::X26) + return Error(S, "invalid register list, {ra, s0-s10} or {x1, x8-x9, " + "x18-x26} is not supported"); + auto Encode = RISCVZC::encodeRlist(RegEnd, IsRVE); assert(Encode != RISCVZC::INVALID_RLIST); if (MustIncludeS0) diff --git a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s index 9f43a167ff778..e43f86cbb84ee 100644 --- a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s @@ -10,7 +10,7 @@ qc.cm.mvsa01 s0, s0 # CHECK-ERROR: :[[@LINE+1]]:14: error: invalid operand for instruction qc.cm.mva01s a1, a2 -# CHECK-ERROR: :[[@LINE+1]]:26: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:15: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported qc.cm.popretz {ra, s0-s10}, 112 # CHECK-ERROR: :[[@LINE+1]]:28: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s index 37947fc94d7b9..c41cc35a8f8ee 100644 --- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s @@ -10,7 +10,7 @@ cm.mvsa01 s0, s0 # CHECK-ERROR: :[[@LINE+1]]:11: error: invalid operand for instruction cm.mva01s a1, a2 -# 
CHECK-ERROR: :[[@LINE+1]]:23: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:12: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported cm.popretz {ra, s0-s10}, 112 # CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] diff --git a/llvm/test/MC/RISCV/rv64xqccmp-invalid.s b/llvm/test/MC/RISCV/rv64xqccmp-invalid.s index 16b3dd91aea48..953887cc77e47 100644 --- a/llvm/test/MC/RISCV/rv64xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64xqccmp-invalid.s @@ -10,7 +10,7 @@ qc.cm.mvsa01 s0, s0 # CHECK-ERROR: :[[@LINE+1]]:14: error: invalid operand for instruction qc.cm.mva01s a1, a2 -# CHECK-ERROR: :[[@LINE+1]]:26: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:15: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported qc.cm.popretz {ra, s0-s10}, 112 # CHECK-ERROR: :[[@LINE+1]]:28: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s index f77dabce255a0..ffaffdf6a5999 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s @@ -10,7 +10,7 @@ cm.mvsa01 s0, s0 # CHECK-ERROR: :[[@LINE+1]]:11: error: invalid operand for instruction cm.mva01s a1, a2 -# CHECK-ERROR: :[[@LINE+1]]:23: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported +# CHECK-ERROR: :[[@LINE+1]]:12: error: invalid register list, {ra, s0-s10} or {x1, x8-x9, x18-x26} is not supported cm.popretz {ra, s0-s10}, 112 # CHECK-ERROR: :[[@LINE+1]]:25: error: stack adjustment for register list must be a multiple of 16 bytes in the range [32, 80] From f68a5185d0959929bd0b0249a8128d0720315dab Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Wed, 2 Apr 2025 16:49:57 -0700 Subject: [PATCH 0464/1029] Allow this test to pass when the source is on a read-only filesystem (#134179) llc attempts to create an empty file in the current directory, but it can't do that on a read-only file system. Send that empty-output to stdout, which prevents this failure. 
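The change is mechanical: each RUN line gains `-o -` so llc streams its
(empty) output to stdout instead of opening a file in the current directory.
Simplified from the diff below (the real lines also pass `--ignore-case` to
FileCheck):

```
; Before: llc opens an output file in the CWD, which fails when read-only.
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck %s
; After: the output goes to stdout and no file is created.
; RUN: not --crash llc -o - -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck %s
```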
--- .../CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll index 383f6c1288d13..f04c7ec9884bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll @@ -1,7 +1,7 @@ -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s -; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s -; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s -; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -o - -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -o - -global-isel -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -o - -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -o - -global-isel -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s ; ; CHECK: LLVM ERROR: Cannot select From 7559c64c5e97d9f33563f1c6afcfd7f7aac01046 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Apr 2025 07:17:12 +0700 Subject: [PATCH 0465/1029] CloneModule: Map global initializers after mapping the function (#134082) --- llvm/lib/Transforms/Utils/CloneModule.cpp | 52 +++++++++---------- ...e-functions-blockaddress-wrong-function.ll | 8 +-- 2 files changed, 28 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Transforms/Utils/CloneModule.cpp b/llvm/lib/Transforms/Utils/CloneModule.cpp index cabc2ab7933a4..88e2bfe45d2cb 100644 --- a/llvm/lib/Transforms/Utils/CloneModule.cpp +++ b/llvm/lib/Transforms/Utils/CloneModule.cpp @@ -124,32 +124,6 @@ std::unique_ptr llvm::CloneModule( VMap[&I] = GI; } - // Now that all of the things that global variable initializer can refer to - // have been created, loop through and copy the global variable referrers - // over... We also set the attributes on the global now. - // - for (const GlobalVariable &G : M.globals()) { - GlobalVariable *GV = cast(VMap[&G]); - - SmallVector, 1> MDs; - G.getAllMetadata(MDs); - for (auto MD : MDs) - GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap)); - - if (G.isDeclaration()) - continue; - - if (!ShouldCloneDefinition(&G)) { - // Skip after setting the correct linkage for an external reference. - GV->setLinkage(GlobalValue::ExternalLinkage); - continue; - } - if (G.hasInitializer()) - GV->setInitializer(MapValue(G.getInitializer(), VMap)); - - copyComdat(GV, &G); - } - // Similarly, copy over function bodies now... // for (const Function &I : M) { @@ -212,6 +186,32 @@ std::unique_ptr llvm::CloneModule( NewNMD->addOperand(MapMetadata(N, VMap)); } + // Now that all of the things that global variable initializer can refer to + // have been created, loop through and copy the global variable referrers + // over... We also set the attributes on the global now. + // + for (const GlobalVariable &G : M.globals()) { + GlobalVariable *GV = cast(VMap[&G]); + + SmallVector, 1> MDs; + G.getAllMetadata(MDs); + for (auto MD : MDs) + GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap)); + + if (G.isDeclaration()) + continue; + + if (!ShouldCloneDefinition(&G)) { + // Skip after setting the correct linkage for an external reference. 
+ GV->setLinkage(GlobalValue::ExternalLinkage); + continue; + } + if (G.hasInitializer()) + GV->setInitializer(MapValue(G.getInitializer(), VMap)); + + copyComdat(GV, &G); + } + return New; } diff --git a/llvm/test/tools/llvm-reduce/reduce-functions-blockaddress-wrong-function.ll b/llvm/test/tools/llvm-reduce/reduce-functions-blockaddress-wrong-function.ll index f296553759f6b..a757cac0d2bbe 100644 --- a/llvm/test/tools/llvm-reduce/reduce-functions-blockaddress-wrong-function.ll +++ b/llvm/test/tools/llvm-reduce/reduce-functions-blockaddress-wrong-function.ll @@ -1,14 +1,10 @@ ; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=functions --test FileCheck --test-arg --check-prefixes=INTERESTING --test-arg %s --test-arg --input-file %s -o %t ; RUN: FileCheck --check-prefixes=RESULT --input-file=%t %s -; FIXME: This testcase exhibits nonsensical behavior. The first -; function has blockaddress references. When the second function is -; deleted, it causes the blockreferences from the first to be replaced -; with inttoptr. - ; INTERESTING: @blockaddr.table.other -; RESULT: @blockaddr.table.other = private unnamed_addr constant [2 x ptr] [ptr inttoptr (i32 1 to ptr), ptr inttoptr (i32 1 to ptr)] +; RESULT: @blockaddr.table.other = private unnamed_addr constant [2 x ptr] [ptr blockaddress(@bar, %L1), ptr blockaddress(@bar, %L2)] + @blockaddr.table.other = private unnamed_addr constant [2 x ptr] [ptr blockaddress(@bar, %L1), ptr blockaddress(@bar, %L2)] From b55bab229228218341e2f24fc8529c7aaab51e2f Mon Sep 17 00:00:00 2001 From: David Peixotto Date: Wed, 2 Apr 2025 17:22:46 -0700 Subject: [PATCH 0466/1029] [lldb] Fix plugin manager test failure on windows (#134173) This is an attempt to fix a test failure from #133794 when running on windows builds. I suspect we are running into a case where the [ICF](https://learn.microsoft.com/en-us/cpp/build/reference/opt-optimizations?view=msvc-170) optimization kicks in and combines the CreateSystemRuntimePlugin* functions into a single address. This means that we cannot uniquely unregister the plugin based on its create function address. The fix is have each create function return a different (bogus) value. --- lldb/unittests/Core/PluginManagerTest.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lldb/unittests/Core/PluginManagerTest.cpp b/lldb/unittests/Core/PluginManagerTest.cpp index ca1003ca9a85a..9b0ce2286d273 100644 --- a/lldb/unittests/Core/PluginManagerTest.cpp +++ b/lldb/unittests/Core/PluginManagerTest.cpp @@ -7,11 +7,21 @@ using namespace lldb; using namespace lldb_private; // Mock system runtime plugin create functions. -SystemRuntime *CreateSystemRuntimePluginA(Process *process) { return nullptr; } +// Make them all return different values to avoid the ICF optimization +// from combining them into the same function. The values returned +// are not valid SystemRuntime pointers, but they are unique and +// sufficient for testing. +SystemRuntime *CreateSystemRuntimePluginA(Process *process) { + return (SystemRuntime *)0x1; +} -SystemRuntime *CreateSystemRuntimePluginB(Process *process) { return nullptr; } +SystemRuntime *CreateSystemRuntimePluginB(Process *process) { + return (SystemRuntime *)0x2; +} -SystemRuntime *CreateSystemRuntimePluginC(Process *process) { return nullptr; } +SystemRuntime *CreateSystemRuntimePluginC(Process *process) { + return (SystemRuntime *)0x3; +} // Test class for testing the PluginManager. // The PluginManager modifies global state when registering new plugins. 
This @@ -24,6 +34,10 @@ class PluginManagerTest : public testing::Test { // Add mock system runtime plugins for testing. void RegisterMockSystemRuntimePlugins() { + // Make sure the create functions all have different addresses. + ASSERT_NE(CreateSystemRuntimePluginA, CreateSystemRuntimePluginB); + ASSERT_NE(CreateSystemRuntimePluginB, CreateSystemRuntimePluginC); + ASSERT_TRUE(PluginManager::RegisterPlugin("a", "test instance A", CreateSystemRuntimePluginA)); ASSERT_TRUE(PluginManager::RegisterPlugin("b", "test instance B", From e5809f01720b9d016f940bc132278f2f2adf0665 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 2 Apr 2025 19:26:19 -0500 Subject: [PATCH 0467/1029] [LLVM] Only build the GPU loader utility if it has LLVM-libc (#134141) Summary: There were some discussions about this being included by default. I need to fix this up and codify the use of LLVM libc inside of LLVM. For now, just turn it off unless the user requested the `libc` GPU stuff. This matches the old behavior. --- llvm/tools/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/CMakeLists.txt b/llvm/tools/CMakeLists.txt index 9fe6f8c6b9c21..729797aa43f0b 100644 --- a/llvm/tools/CMakeLists.txt +++ b/llvm/tools/CMakeLists.txt @@ -9,7 +9,7 @@ # traversing each directory. create_llvm_tool_options() -if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE) +if(NOT LLVM_COMPILER_IS_GCC_COMPATIBLE OR NOT LLVM_LIBC_GPU_BUILD) set(LLVM_TOOL_LLVM_GPU_LOADER_BUILD OFF) endif() From 749c20b3e0b88792c10d7709874f3ae374e8170e Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 2 Apr 2025 17:35:14 -0700 Subject: [PATCH 0468/1029] [LIT] Add a test for lit.Test.toMetricValue. NFC --- llvm/utils/lit/tests/Inputs/test-data/dummy_format.py | 10 ++++++++-- llvm/utils/lit/tests/Inputs/test-data/metrics.ini | 3 ++- llvm/utils/lit/tests/test-data.py | 1 + llvm/utils/lit/tests/test-output.py | 3 ++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py b/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py index a2d314fdb1a8d..30bd1814a6a42 100644 --- a/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py +++ b/llvm/utils/lit/tests/Inputs/test-data/dummy_format.py @@ -27,10 +27,16 @@ def execute(self, test, lit_config): # Load additional metrics. for key, value_str in cfg.items("results"): value = eval(value_str) + metric = lit.Test.toMetricValue(value) if isinstance(value, int): - metric = lit.Test.IntMetricValue(value) + assert isinstance(metric, lit.Test.IntMetricValue) + assert metric.format() == lit.Test.IntMetricValue(value).format() elif isinstance(value, float): - metric = lit.Test.RealMetricValue(value) + assert isinstance(metric, lit.Test.RealMetricValue) + assert metric.format() == lit.Test.RealMetricValue(value).format() + elif isinstance(value, str): + assert isinstance(metric, lit.Test.JSONMetricValue) + assert metric.format() == lit.Test.JSONMetricValue(value).format() else: raise RuntimeError("unsupported result type") result.addMetric(key, metric) diff --git a/llvm/utils/lit/tests/Inputs/test-data/metrics.ini b/llvm/utils/lit/tests/Inputs/test-data/metrics.ini index 01b09c5c77529..52fa32be71166 100644 --- a/llvm/utils/lit/tests/Inputs/test-data/metrics.ini +++ b/llvm/utils/lit/tests/Inputs/test-data/metrics.ini @@ -4,4 +4,5 @@ result_output = Test passed. 
[results] value0 = 1 -value1 = 2.3456 \ No newline at end of file +value1 = 2.3456 +value2 = "stringy" \ No newline at end of file diff --git a/llvm/utils/lit/tests/test-data.py b/llvm/utils/lit/tests/test-data.py index 628a319dd4f52..6d2df74fd5a4d 100644 --- a/llvm/utils/lit/tests/test-data.py +++ b/llvm/utils/lit/tests/test-data.py @@ -9,4 +9,5 @@ # CHECK-NEXT: *** TEST 'test-data :: metrics.ini' RESULTS *** # CHECK-NEXT: value0: 1 # CHECK-NEXT: value1: 2.3456 +# CHECK-NEXT: value2: "stringy" # CHECK-NEXT: *** diff --git a/llvm/utils/lit/tests/test-output.py b/llvm/utils/lit/tests/test-output.py index d0d01202330f0..86b3bb4c7509e 100644 --- a/llvm/utils/lit/tests/test-output.py +++ b/llvm/utils/lit/tests/test-output.py @@ -10,7 +10,8 @@ # CHECK-NEXT: "elapsed": {{[-+0-9.eE]+}}, # CHECK-NEXT: "metrics": { # CHECK-NEXT: "value0": 1, -# CHECK-NEXT: "value1": 2.3456 +# CHECK-NEXT: "value1": 2.3456, +# CHECK-NEXT: "value2": "stringy" # CHECK-NEXT: } # CHECK-NEXT: "name": "test-data :: metrics.ini", # CHECK-NEXT: "output": "Test passed." From fb7135ec5239a45b43fae6206f7409fd77c50b9f Mon Sep 17 00:00:00 2001 From: Ankur Ahir <69181589+Ankur-0429@users.noreply.github.com> Date: Wed, 2 Apr 2025 18:11:27 -0700 Subject: [PATCH 0469/1029] [Clang] fixed clang frontend crash with friend class declaration and overload == (#133878) --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaDeclCXX.cpp | 3 +-- .../class/class.compare/class.compare.default/p1.cpp | 12 ++++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7fb6b0baae16b..5e8df45e71d54 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -365,6 +365,7 @@ Bug Fixes to Attribute Support Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ +- Clang now supports implicitly defined comparison operators for friend declarations. (#GH132249) - Clang now diagnoses copy constructors taking the class by value in template instantiations. (#GH130866) - Clang is now better at keeping track of friend function template instance contexts. 
(#GH55509) - Clang now prints the correct instantiation context for diagnostics suppressed diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 2142582adf6e9..96c0470198e35 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -9006,8 +9006,7 @@ bool Sema::CheckExplicitlyDefaultedComparison(Scope *S, FunctionDecl *FD, return true; if (llvm::none_of(RD->friends(), [&](const FriendDecl *F) { - return FD->getCanonicalDecl() == - F->getFriendDecl()->getCanonicalDecl(); + return declaresSameEntity(F->getFriendDecl(), FD); })) { Diag(FD->getLocation(), diag::err_defaulted_comparison_not_friend) << int(DCK) << int(0) << RD; diff --git a/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp b/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp index a195e0548152d..f3e241c7bbd51 100644 --- a/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp +++ b/clang/test/CXX/class/class.compare/class.compare.default/p1.cpp @@ -285,3 +285,15 @@ struct j { }; bool j::operator==(const j &) const = default; } + +namespace evil2 { + struct k { + }; + + struct l { + friend bool operator==(const l& a, const l& b); + friend class k; + }; + + bool operator==(const l& a, const l& b) = default; +} From 02467f9e2100d451b52bc63abc9c94829d7b83f4 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 2 Apr 2025 18:15:48 -0700 Subject: [PATCH 0470/1029] [ctxprof] Option to move a whole tree to its own module (#133992) Modules may contain a mix of functions that participate or don't participate in callgraphs covered by a contextual profile. We currently have been importing all the functions under a context root in the module defining that root, but if the other functions there are covered by flat profiles, the result is difficult to reason about. This patch allows moving everything under a context root (and that root) in its own module. For now, we expect a module with a filename matching the GUID of the function be present in the set of modules known by the linker. This mechanism can be improved in a later patch. Subsequent patches will handle implementing "move" instead of "import" semantics for the root function (because we want to make sure only one version of the root exists - so the optimizations we perform are actually the ones being observed at runtime). 
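The handshake implemented below, in condensed form (identifiers match the
patch): with `-thinlto-move-ctxprof-trees` set, a module's workload key is
its filename without extension, and a context root's defining module is its
GUID printed in decimal, so a module literally named `<guid>.bc` must exist
among the linker's inputs.

```c++
// Importing side: key the workload lookup by the filename stem.
StringRef Filename = sys::path::filename(ModName);          // "6019442868614718803.bc"
Filename = Filename.substr(0, Filename.find_last_of('.'));  // drop ".bc"
auto SetIter = Workloads.find(Filename);

// Root side: declare the root as defined in the module named by its GUID.
std::string RootDefiningModule = std::to_string(RootGuid);
```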
--- llvm/lib/Transforms/IPO/FunctionImport.cpp | 32 +++++++++-- .../ThinLTO/X86/ctxprof-separate-module.ll | 56 +++++++++++++++++++ 2 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/ctxprof-separate-module.ll diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index f0daf1a558316..cfefc0e8c43a8 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -38,6 +38,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO/Internalize.h" @@ -175,6 +176,12 @@ static cl::opt WorkloadDefinitions( extern cl::opt UseCtxProfile; +static cl::opt CtxprofMoveRootsToOwnModule( + "thinlto-move-ctxprof-trees", + cl::desc("Move contextual profiling roots and the graphs under them in " + "their own module."), + cl::Hidden, cl::init(false)); + namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; } @@ -535,7 +542,14 @@ class WorkloadImportsManager : public ModuleImportsManager { computeImportForModule(const GVSummaryMapTy &DefinedGVSummaries, StringRef ModName, FunctionImporter::ImportMapTy &ImportList) override { - auto SetIter = Workloads.find(ModName); + StringRef Filename = ModName; + if (CtxprofMoveRootsToOwnModule) { + Filename = sys::path::filename(ModName); + // Drop the file extension. + Filename = Filename.substr(0, Filename.find_last_of('.')); + } + auto SetIter = Workloads.find(Filename); + if (SetIter == Workloads.end()) { LLVM_DEBUG(dbgs() << "[Workload] " << ModName << " does not contain the root of any context.\n"); @@ -748,10 +762,18 @@ class WorkloadImportsManager : public ModuleImportsManager { << RootVI.getSummaryList().size() << ". Skipping.\n"); continue; } - StringRef RootDefiningModule = - RootVI.getSummaryList().front()->modulePath(); - LLVM_DEBUG(dbgs() << "[Workload] Root defining module for " << RootGuid - << " is : " << RootDefiningModule << "\n"); + std::string RootDefiningModule = + RootVI.getSummaryList().front()->modulePath().str(); + if (CtxprofMoveRootsToOwnModule) { + RootDefiningModule = std::to_string(RootGuid); + LLVM_DEBUG( + dbgs() << "[Workload] Moving " << RootGuid + << " to a module with the filename without extension : " + << RootDefiningModule << "\n"); + } else { + LLVM_DEBUG(dbgs() << "[Workload] Root defining module for " << RootGuid + << " is : " << RootDefiningModule << "\n"); + } auto &Set = Workloads[RootDefiningModule]; Root.getContainedGuids(ContainedGUIDs); for (auto Guid : ContainedGUIDs) diff --git a/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll b/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll new file mode 100644 index 0000000000000..c7891d336cc89 --- /dev/null +++ b/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll @@ -0,0 +1,56 @@ +; Test workload based importing via -thinlto-pgo-ctx-prof with moving the whole +; graph to a new module. +; Use external linkage symbols so we don't depend on module paths which are +; used when computing the GUIDs of internal linkage symbols. 
+; +; Set up +; RUN: rm -rf %t +; RUN: mkdir -p %t +; RUN: split-file %s %t +; +; RUN: opt -module-summary -passes=assign-guid,ctx-instr-gen %t/m1.ll -o %t/m1.bc +; RUN: opt -module-summary -passes=assign-guid,ctx-instr-gen %t/m2.ll -o %t/m2.bc +; RUN: opt -module-summary -passes=assign-guid,ctx-instr-gen %t/6019442868614718803.ll -o %t/6019442868614718803.bc + +; RUN: llvm-ctxprof-util fromYAML --input %t/ctxprof.yaml --output %t/ctxprof.bitstream +; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc %t/6019442868614718803.bc -thinlto-move-ctxprof-trees \ +; RUN: -o %t/result.o -save-temps \ +; RUN: -use-ctx-profile=%t/ctxprof.bitstream \ +; RUN: -r %t/m1.bc,m1_f1,plx \ +; RUN: -r %t/m2.bc,m2_f1,plx +; RUN: llvm-dis %t/result.o.3.3.import.bc -o - | FileCheck %s +; +; +; CHECK: m1_f1() +; CHECK: m2_f1() +; +;--- ctxprof.yaml +Contexts: + - + Guid: 6019442868614718803 + TotalRootEntryCount: 5 + Counters: [1] + Callsites: + - - + Guid: 15593096274670919754 + Counters: [1] + +;--- m1.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +define dso_local void @m1_f1() { + ret void +} + +;--- m2.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +define dso_local void @m2_f1() { + ret void +} + +;--- 6019442868614718803.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" From d59b2c4def9fe187317c20f96cc76eda09bc68a0 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 2 Apr 2025 18:18:17 -0700 Subject: [PATCH 0471/1029] [ctxprof][nfc] Make `computeImportForFunction` a member of `ModuleImportsManager` (#134011) --- llvm/lib/Transforms/IPO/FunctionImport.cpp | 26 ++++++++++++---------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index cfefc0e8c43a8..43807a8feb36e 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -497,6 +497,13 @@ static const char *getFailureName(FunctionImporter::ImportFailureReason Reason); /// Determine the list of imports and exports for each module. class ModuleImportsManager { + void computeImportForFunction( + const FunctionSummary &Summary, unsigned Threshold, + const GVSummaryMapTy &DefinedGVSummaries, + SmallVectorImpl &Worklist, GlobalsImporter &GVImporter, + FunctionImporter::ImportMapTy &ImportList, + FunctionImporter::ImportThresholdsTy &ImportThresholds); + protected: function_ref IsPrevailing; @@ -852,14 +859,11 @@ getFailureName(FunctionImporter::ImportFailureReason Reason) { /// Compute the list of functions to import for a given caller. Mark these /// imported functions and the symbols they reference in their source module as /// exported from their source module. 
-static void computeImportForFunction( - const FunctionSummary &Summary, const ModuleSummaryIndex &Index, - const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries, - function_ref - isPrevailing, +void ModuleImportsManager::computeImportForFunction( + const FunctionSummary &Summary, const unsigned Threshold, + const GVSummaryMapTy &DefinedGVSummaries, SmallVectorImpl &Worklist, GlobalsImporter &GVImporter, FunctionImporter::ImportMapTy &ImportList, - DenseMap *ExportLists, FunctionImporter::ImportThresholdsTy &ImportThresholds) { GVImporter.onImportingSummary(Summary); static int ImportCount = 0; @@ -1064,9 +1068,8 @@ void ModuleImportsManager::computeImportForModule( // Skip import for global variables continue; LLVM_DEBUG(dbgs() << "Initialize import for " << VI << "\n"); - computeImportForFunction(*FuncSummary, Index, ImportInstrLimit, - DefinedGVSummaries, IsPrevailing, Worklist, GVI, - ImportList, ExportLists, ImportThresholds); + computeImportForFunction(*FuncSummary, ImportInstrLimit, DefinedGVSummaries, + Worklist, GVI, ImportList, ImportThresholds); } // Process the newly imported functions and add callees to the worklist. @@ -1076,9 +1079,8 @@ void ModuleImportsManager::computeImportForModule( auto Threshold = std::get<1>(GVInfo); if (auto *FS = dyn_cast(Summary)) - computeImportForFunction(*FS, Index, Threshold, DefinedGVSummaries, - IsPrevailing, Worklist, GVI, ImportList, - ExportLists, ImportThresholds); + computeImportForFunction(*FS, Threshold, DefinedGVSummaries, Worklist, + GVI, ImportList, ImportThresholds); } // Print stats about functions considered but rejected for importing From ff0c2fbd8eb66688746476a7ec850fb5afb4d588 Mon Sep 17 00:00:00 2001 From: tangaac Date: Thu, 3 Apr 2025 09:19:59 +0800 Subject: [PATCH 0472/1029] [LoongArch] Pre-commit tests for vector absolute difference (#132898) --- .../LoongArch/lasx/ir-instruction/absd.ll | 557 ++++++++++++++++++ .../LoongArch/lsx/ir-instruction/absd.ll | 557 ++++++++++++++++++ 2 files changed, 1114 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll new file mode 100644 index 0000000000000..bd5b16f5147a2 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/absd.ll @@ -0,0 +1,557 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +;; TODO: Currently LoongArch generates sub-optimal code for these cases +;; 1. trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) or abdu(a,b) +;; 2. abs(sub_nsw(x, y)) -> abds(a,b) +;; 3. sub(smax(a,b),smin(a,b)) -> abds(a,b) or abdu(a,b) +;; 4. select(icmp(a,b, slt|ult),sub(a,b),sub(b,a)) -> abds(a,b) or abdu(a,b) +;; 5. sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) or abdu(a,b) +;; +;; abds / abdu can be lowered to xvabsd.{b/h/w/d} / xvabsd.{b/h/w/d}u instruction. +;; +;; Later patch will address it. 
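+;;
+;; For example, once these patterns are matched, the min/max/sub triples
+;; checked below should each fold to a single instruction; a sketch of
+;; the codegen expected for @xvabsd_b (not produced yet):
+;; xvabsd.b $xr0, $xr0, $xr1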
+ +;; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) +define <32 x i8> @xvabsd_b(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: xvabsd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.sext = sext <32 x i8> %a to <32 x i16> + %b.sext = sext <32 x i8> %b to <32 x i16> + %sub = sub <32 x i16> %a.sext, %b.sext + %abs = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %sub, i1 true) + %trunc = trunc <32 x i16> %abs to <32 x i8> + ret <32 x i8> %trunc +} + +define <16 x i16> @xvabsd_h(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: xvabsd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.sext = sext <16 x i16> %a to <16 x i32> + %b.sext = sext <16 x i16> %b to <16 x i32> + %sub = sub <16 x i32> %a.sext, %b.sext + %abs = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %sub, i1 true) + %trunc = trunc <16 x i32> %abs to <16 x i16> + ret <16 x i16> %trunc +} + +define <8 x i32> @xvabsd_w(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: xvabsd_w: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.sext = sext <8 x i32> %a to <8 x i64> + %b.sext = sext <8 x i32> %b to <8 x i64> + %sub = sub <8 x i64> %a.sext, %b.sext + %abs = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %sub, i1 true) + %trunc = trunc <8 x i64> %abs to <8 x i32> + ret <8 x i32> %trunc +} + +define <4 x i64> @xvabsd_d(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: xvabsd_d: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.sext = sext <4 x i64> %a to <4 x i128> + %b.sext = sext <4 x i64> %b to <4 x i128> + %sub = sub <4 x i128> %a.sext, %b.sext + %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 true) + %trunc = trunc <4 x i128> %abs to <4 x i64> + ret <4 x i64> %trunc +} + +define <32 x i8> @xvabsd_bu(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: xvabsd_bu: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.zext = zext <32 x i8> %a to <32 x i16> + %b.zext = zext <32 x i8> %b to <32 x i16> + %sub = sub <32 x i16> %a.zext, %b.zext + %abs = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %sub, i1 true) + %trunc = trunc <32 x i16> %abs to <32 x i8> + ret <32 x i8> %trunc +} + +define <16 x i16> @xvabsd_hu(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: xvabsd_hu: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.zext = zext <16 x i16> %a to <16 x i32> + %b.zext = zext <16 x i16> %b to <16 x i32> + %sub = sub <16 x i32> %a.zext, %b.zext + %abs = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %sub, i1 true) + %trunc = trunc <16 x i32> %abs to <16 x i16> + ret <16 x i16> %trunc +} + +define <8 x i32> @xvabsd_wu(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: xvabsd_wu: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.zext = zext <8 x i32> %a to <8 x i64> + %b.zext = zext <8 x i32> %b to <8 x i64> + %sub = sub <8 x i64> %a.zext, %b.zext + %abs = call <8 x i64> 
@llvm.abs.v8i64(<8 x i64> %sub, i1 true) + %trunc = trunc <8 x i64> %abs to <8 x i32> + ret <8 x i32> %trunc +} + +define <4 x i64> @xvabsd_du(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: xvabsd_du: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a.zext = zext <4 x i64> %a to <4 x i128> + %b.zext = zext <4 x i64> %b to <4 x i128> + %sub = sub <4 x i128> %a.zext, %b.zext + %abs = call <4 x i128> @llvm.abs.v4i128(<4 x i128> %sub, i1 true) + %trunc = trunc <4 x i128> %abs to <4 x i64> + ret <4 x i64> %trunc +} + +;; abs(sub_nsw(x, y)) -> abds(a,b) +define <32 x i8> @xvabsd_b_nsw(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: xvabsd_b_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvneg.b $xr1, $xr0 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %sub = sub nsw <32 x i8> %a, %b + %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %sub, i1 true) + ret <32 x i8> %abs +} + +define <16 x i16> @xvabsd_h_nsw(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: xvabsd_h_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvneg.h $xr1, $xr0 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %sub = sub nsw <16 x i16> %a, %b + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + ret <16 x i16> %abs +} + +define <8 x i32> @xvabsd_w_nsw(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: xvabsd_w_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvneg.w $xr1, $xr0 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %sub = sub nsw <8 x i32> %a, %b + %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true) + ret <8 x i32> %abs +} + +define <4 x i64> @xvabsd_d_nsw(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: xvabsd_d_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvneg.d $xr1, $xr0 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: ret + %sub = sub nsw <4 x i64> %a, %b + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) + ret <4 x i64> %abs +} + +;; sub(smax(a,b),smin(a,b)) -> abds(a,b) +define <32 x i8> @maxmin_b(<32 x i8> %0, <32 x i8> %1) { +; CHECK-LABEL: maxmin_b: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <32 x i8> @llvm.smax.v32i8(<32 x i8> %0, <32 x i8> %1) + %b = tail call <32 x i8> @llvm.smin.v32i8(<32 x i8> %0, <32 x i8> %1) + %sub = sub <32 x i8> %a, %b + ret <32 x i8> %sub +} + +define <16 x i16> @maxmin_h(<16 x i16> %0, <16 x i16> %1) { +; CHECK-LABEL: maxmin_h: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> %1) + %b = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %0, <16 x i16> %1) + %sub = sub <16 x i16> %a, %b + ret <16 x i16> %sub +} + +define <8 x i32> @maxmin_w(<8 x i32> %0, <8 x i32> %1) { +; CHECK-LABEL: maxmin_w: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %0, <8 x i32> %1) + %b = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %0, <8 x i32> %1) + %sub = sub <8 x i32> %a, %b + ret <8 x i32> %sub +} + +define <4 x i64> 
@maxmin_d(<4 x i64> %0, <4 x i64> %1) { +; CHECK-LABEL: maxmin_d: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %0, <4 x i64> %1) + %b = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %0, <4 x i64> %1) + %sub = sub <4 x i64> %a, %b + ret <4 x i64> %sub +} + +define <32 x i8> @maxmin_bu(<32 x i8> %0, <32 x i8> %1) { +; CHECK-LABEL: maxmin_bu: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> %0, <32 x i8> %1) + %b = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %0, <32 x i8> %1) + %sub = sub <32 x i8> %a, %b + ret <32 x i8> %sub +} + +define <16 x i16> @maxmin_hu(<16 x i16> %0, <16 x i16> %1) { +; CHECK-LABEL: maxmin_hu: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <16 x i16> @llvm.umax.v16i16(<16 x i16> %0, <16 x i16> %1) + %b = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %0, <16 x i16> %1) + %sub = sub <16 x i16> %a, %b + ret <16 x i16> %sub +} + +define <8 x i32> @maxmin_wu(<8 x i32> %0, <8 x i32> %1) { +; CHECK-LABEL: maxmin_wu: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <8 x i32> @llvm.umax.v8i32(<8 x i32> %0, <8 x i32> %1) + %b = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %0, <8 x i32> %1) + %sub = sub <8 x i32> %a, %b + ret <8 x i32> %sub +} + +define <4 x i64> @maxmin_du(<4 x i64> %0, <4 x i64> %1) { +; CHECK-LABEL: maxmin_du: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <4 x i64> @llvm.umax.v4i64(<4 x i64> %0, <4 x i64> %1) + %b = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %0, <4 x i64> %1) + %sub = sub <4 x i64> %a, %b + ret <4 x i64> %sub +} + +define <32 x i8> @maxmin_bu_com1(<32 x i8> %0, <32 x i8> %1) { +; CHECK-LABEL: maxmin_bu_com1: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %a = tail call <32 x i8> @llvm.umax.v32i8(<32 x i8> %0, <32 x i8> %1) + %b = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %1, <32 x i8> %0) + %sub = sub <32 x i8> %a, %b + ret <32 x i8> %sub +} + +;; select(icmp(a,b, slt),sub(a,b),sub(b,a)) -> abds(a,b) +define <32 x i8> @xvabsd_b_cmp(<32 x i8> %a, <32 x i8> %b) nounwind { +; CHECK-LABEL: xvabsd_b_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp slt <32 x i8> %a, %b + %ab = sub <32 x i8> %a, %b + %ba = sub <32 x i8> %b, %a + %sel = select <32 x i1> %cmp, <32 x i8> %ba, <32 x i8> %ab + ret <32 x i8> %sel +} + +define <16 x i16> @xvabsd_h_cmp(<16 x i16> %a, <16 x i16> %b) nounwind { +; CHECK-LABEL: xvabsd_h_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp slt <16 x i16> %a, %b + %ab = sub <16 x i16> %a, %b + %ba = sub <16 x i16> %b, %a + %sel = select <16 x 
i1> %cmp, <16 x i16> %ba, <16 x i16> %ab + ret <16 x i16> %sel +} + +define <8 x i32> @xvabsd_w_cmp(<8 x i32> %a, <8 x i32> %b) nounwind { +; CHECK-LABEL: xvabsd_w_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp slt <8 x i32> %a, %b + %ab = sub <8 x i32> %a, %b + %ba = sub <8 x i32> %b, %a + %sel = select <8 x i1> %cmp, <8 x i32> %ba, <8 x i32> %ab + ret <8 x i32> %sel +} + +define <4 x i64> @xvabsd_d_cmp(<4 x i64> %a, <4 x i64> %b) nounwind { +; CHECK-LABEL: xvabsd_d_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp slt <4 x i64> %a, %b + %ab = sub <4 x i64> %a, %b + %ba = sub <4 x i64> %b, %a + %sel = select <4 x i1> %cmp, <4 x i64> %ba, <4 x i64> %ab + ret <4 x i64> %sel +} + +define <32 x i8> @xvabsd_bu_cmp(<32 x i8> %a, <32 x i8> %b) nounwind { +; CHECK-LABEL: xvabsd_bu_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp ult <32 x i8> %a, %b + %ab = sub <32 x i8> %a, %b + %ba = sub <32 x i8> %b, %a + %sel = select <32 x i1> %cmp, <32 x i8> %ba, <32 x i8> %ab + ret <32 x i8> %sel +} + +define <16 x i16> @xvabsd_hu_cmp(<16 x i16> %a, <16 x i16> %b) nounwind { +; CHECK-LABEL: xvabsd_hu_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp ult <16 x i16> %a, %b + %ab = sub <16 x i16> %a, %b + %ba = sub <16 x i16> %b, %a + %sel = select <16 x i1> %cmp, <16 x i16> %ba, <16 x i16> %ab + ret <16 x i16> %sel +} + +define <8 x i32> @xvabsd_wu_cmp(<8 x i32> %a, <8 x i32> %b) nounwind { +; CHECK-LABEL: xvabsd_wu_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp ult <8 x i32> %a, %b + %ab = sub <8 x i32> %a, %b + %ba = sub <8 x i32> %b, %a + %sel = select <8 x i1> %cmp, <8 x i32> %ba, <8 x i32> %ab + ret <8 x i32> %sel +} + +define <4 x i64> @xvabsd_du_cmp(<4 x i64> %a, <4 x i64> %b) nounwind { +; CHECK-LABEL: xvabsd_du_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp ult <4 x i64> %a, %b + %ab = sub <4 x i64> %a, %b + %ba = sub <4 x i64> %b, %a + %sel = select <4 x i1> %cmp, <4 x i64> %ba, <4 x i64> %ab + ret <4 x i64> %sel +} + +;; sub(select(icmp(a,b, slt),a,b),select(icmp(a,b, slt),b,a)) -> abds(a,b) +define <32 x i8> @xvabsd_b_select(<32 x i8> %a, <32 x i8> %b) nounwind { +; CHECK-LABEL: xvabsd_b_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.b $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp slt <32 x i8> %a, %b + %ab = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %ba = select <32 x i1> %cmp, <32 x i8> %b, <32 x i8> %a + %sub = sub <32 x i8> %ba, %ab + ret <32 x i8> %sub +} + +define <16 x i16> @xvabsd_h_select(<16 x i16> %a, <16 x i16> %b) nounwind { +; CHECK-LABEL: xvabsd_h_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.h $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp 
sle <16 x i16> %a, %b + %ab = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %ba = select <16 x i1> %cmp, <16 x i16> %b, <16 x i16> %a + %sub = sub <16 x i16> %ba, %ab + ret <16 x i16> %sub +} + +define <8 x i32> @xvabsd_w_select(<8 x i32> %a, <8 x i32> %b) nounwind { +; CHECK-LABEL: xvabsd_w_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.w $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp sgt <8 x i32> %a, %b + %ab = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b + %ba = select <8 x i1> %cmp, <8 x i32> %b, <8 x i32> %a + %sub = sub <8 x i32> %ab, %ba + ret <8 x i32> %sub +} + +define <4 x i64> @xvabsd_d_select(<4 x i64> %a, <4 x i64> %b) nounwind { +; CHECK-LABEL: xvabsd_d_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.d $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp sge <4 x i64> %a, %b + %ab = select <4 x i1> %cmp, <4 x i64> %a, <4 x i64> %b + %ba = select <4 x i1> %cmp, <4 x i64> %b, <4 x i64> %a + %sub = sub <4 x i64> %ab, %ba + ret <4 x i64> %sub +} + +define <32 x i8> @xvabsd_bu_select(<32 x i8> %a, <32 x i8> %b) nounwind { +; CHECK-LABEL: xvabsd_bu_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.bu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp ult <32 x i8> %a, %b + %ab = select <32 x i1> %cmp, <32 x i8> %a, <32 x i8> %b + %ba = select <32 x i1> %cmp, <32 x i8> %b, <32 x i8> %a + %sub = sub <32 x i8> %ba, %ab + ret <32 x i8> %sub +} + +define <16 x i16> @xvabsd_hu_select(<16 x i16> %a, <16 x i16> %b) nounwind { +; CHECK-LABEL: xvabsd_hu_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.hu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp ule <16 x i16> %a, %b + %ab = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b + %ba = select <16 x i1> %cmp, <16 x i16> %b, <16 x i16> %a + %sub = sub <16 x i16> %ba, %ab + ret <16 x i16> %sub +} + +define <8 x i32> @xvabsd_wu_select(<8 x i32> %a, <8 x i32> %b) nounwind { +; CHECK-LABEL: xvabsd_wu_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.wu $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp ugt <8 x i32> %a, %b + %ab = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b + %ba = select <8 x i1> %cmp, <8 x i32> %b, <8 x i32> %a + %sub = sub <8 x i32> %ab, %ba + ret <8 x i32> %sub +} + +define <4 x i64> @xvabsd_du_select(<4 x i64> %a, <4 x i64> %b) nounwind { +; CHECK-LABEL: xvabsd_du_select: +; CHECK: # %bb.0: +; CHECK-NEXT: xvmin.du $xr2, $xr0, $xr1 +; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr2 +; CHECK-NEXT: ret + %cmp = icmp uge <4 x i64> %a, %b + %ab = select <4 x i1> %cmp, <4 x i64> %a, <4 x i64> %b + %ba = select <4 x i1> %cmp, <4 x i64> %b, <4 x i64> %a + %sub = sub <4 x i64> %ab, %ba + ret <4 x i64> %sub +} + +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) + +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1) + +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1) + +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1) + +declare <4 x i128> @llvm.abs.v4i128(<4 x i128>, i1) + +declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>) +declare 
<16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>) +declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>) +declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>) +declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>) +declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) +declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll new file mode 100644 index 0000000000000..2cbd74204d5d6 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/absd.ll @@ -0,0 +1,557 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s + +;; TODO: Currently LoongArch generates sub-optimal code for five cases +;; 1. trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) or abdu(a,b) +;; 2. abs(sub_nsw(x, y)) -> abds(a,b) +;; 3. sub(smax(a,b),smin(a,b)) -> abds(a,b) or abdu(a,b) +;; 4. select(icmp(a,b, slt|ult),sub(a,b),sub(b,a)) -> abds(a,b) or abdu(a,b) +;; 5. sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) or abdu(a,b) +;; +;; abds / abdu can be lowered to vabsd.{b/h/w/d} / vabsd.{b/h/w/d}u instruction +;; +;; Later patch will address it. 
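+;; Illustrative only (assumed ideal output, not what llc currently emits):
+;; once these combines land, e.g. @vabsd_b below should lower to the single
+;; instruction "vabsd.b $vr0, $vr0, $vr1" rather than the vmin/vmax/vsub
+;; sequence the CHECK lines in this file currently pin down.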
+ +;; trunc(abs(sub(sext(a),sext(b)))) -> abds(a,b) +define <16 x i8> @vabsd_b(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vabsd_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.sext = sext <16 x i8> %a to <16 x i16> + %b.sext = sext <16 x i8> %b to <16 x i16> + %sub = sub <16 x i16> %a.sext, %b.sext + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + %trunc = trunc <16 x i16> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <8 x i16> @vabsd_h(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vabsd_h: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.sext = sext <8 x i16> %a to <8 x i32> + %b.sext = sext <8 x i16> %b to <8 x i32> + %sub = sub <8 x i32> %a.sext, %b.sext + %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true) + %trunc = trunc <8 x i32> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <4 x i32> @vabsd_w(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vabsd_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.sext = sext <4 x i32> %a to <4 x i64> + %b.sext = sext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %a.sext, %b.sext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) + %trunc = trunc <4 x i64> %abs to <4 x i32> + ret <4 x i32> %trunc +} + +define <2 x i64> @vabsd_d(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vabsd_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.sext = sext <2 x i64> %a to <2 x i128> + %b.sext = sext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %a.sext, %b.sext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true) + %trunc = trunc <2 x i128> %abs to <2 x i64> + ret <2 x i64> %trunc +} + +define <16 x i8> @vabsd_bu(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vabsd_bu: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.zext = zext <16 x i8> %a to <16 x i16> + %b.zext = zext <16 x i8> %b to <16 x i16> + %sub = sub <16 x i16> %a.zext, %b.zext + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + %trunc = trunc <16 x i16> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <8 x i16> @vabsd_hu(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vabsd_hu: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.zext = zext <8 x i16> %a to <8 x i32> + %b.zext = zext <8 x i16> %b to <8 x i32> + %sub = sub <8 x i32> %a.zext, %b.zext + %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true) + %trunc = trunc <8 x i32> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <4 x i32> @vabsd_wu(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vabsd_wu: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.zext = zext <4 x i32> %a to <4 x i64> + %b.zext = zext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %a.zext, %b.zext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) + %trunc = trunc <4 x i64> %abs to 
<4 x i32> + ret <4 x i32> %trunc +} + +define <2 x i64> @vabsd_du(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vabsd_du: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a.zext = zext <2 x i64> %a to <2 x i128> + %b.zext = zext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %a.zext, %b.zext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true) + %trunc = trunc <2 x i128> %abs to <2 x i64> + ret <2 x i64> %trunc +} + +;; abs(sub_nsw(x, y)) -> abds(a,b) +define <16 x i8> @vabsd_b_nsw(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vabsd_b_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vneg.b $vr1, $vr0 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %sub = sub nsw <16 x i8> %a, %b + %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) + ret <16 x i8> %abs +} + +define <8 x i16> @vabsd_h_nsw(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vabsd_h_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vneg.h $vr1, $vr0 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %sub = sub nsw <8 x i16> %a, %b + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) + ret <8 x i16> %abs +} + +define <4 x i32> @vabsd_w_nsw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vabsd_w_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vneg.w $vr1, $vr0 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %sub = sub nsw <4 x i32> %a, %b + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) + ret <4 x i32> %abs +} + +define <2 x i64> @vabsd_d_nsw(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vabsd_d_nsw: +; CHECK: # %bb.0: +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vneg.d $vr1, $vr0 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: ret + %sub = sub nsw <2 x i64> %a, %b + %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true) + ret <2 x i64> %abs +} + +;; sub(smax(a,b),smin(a,b)) -> abds(a,b) +define <16 x i8> @maxmin_b(<16 x i8> %0, <16 x i8> %1) { +; CHECK-LABEL: maxmin_b: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> %0, <16 x i8> %1) + %b = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> %0, <16 x i8> %1) + %sub = sub <16 x i8> %a, %b + ret <16 x i8> %sub +} + +define <8 x i16> @maxmin_h(<8 x i16> %0, <8 x i16> %1) { +; CHECK-LABEL: maxmin_h: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> %0, <8 x i16> %1) + %b = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %0, <8 x i16> %1) + %sub = sub <8 x i16> %a, %b + ret <8 x i16> %sub +} + +define <4 x i32> @maxmin_w(<4 x i32> %0, <4 x i32> %1) { +; CHECK-LABEL: maxmin_w: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1) + %b = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %0, <4 x i32> %1) + %sub = sub <4 x i32> %a, %b + ret <4 x i32> %sub +} + +define <2 x i64> @maxmin_d(<2 x i64> %0, <2 x i64> %1) { +; CHECK-LABEL: maxmin_d: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 +; 
CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <2 x i64> @llvm.smax.v2i64(<2 x i64> %0, <2 x i64> %1) + %b = tail call <2 x i64> @llvm.smin.v2i64(<2 x i64> %0, <2 x i64> %1) + %sub = sub <2 x i64> %a, %b + ret <2 x i64> %sub +} + +define <16 x i8> @maxmin_bu(<16 x i8> %0, <16 x i8> %1) { +; CHECK-LABEL: maxmin_bu: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1) + %b = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %0, <16 x i8> %1) + %sub = sub <16 x i8> %a, %b + ret <16 x i8> %sub +} + +define <8 x i16> @maxmin_hu(<8 x i16> %0, <8 x i16> %1) { +; CHECK-LABEL: maxmin_hu: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %0, <8 x i16> %1) + %b = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %0, <8 x i16> %1) + %sub = sub <8 x i16> %a, %b + ret <8 x i16> %sub +} + +define <4 x i32> @maxmin_wu(<4 x i32> %0, <4 x i32> %1) { +; CHECK-LABEL: maxmin_wu: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %0, <4 x i32> %1) + %b = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %0, <4 x i32> %1) + %sub = sub <4 x i32> %a, %b + ret <4 x i32> %sub +} + +define <2 x i64> @maxmin_du(<2 x i64> %0, <2 x i64> %1) { +; CHECK-LABEL: maxmin_du: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <2 x i64> @llvm.umax.v2i64(<2 x i64> %0, <2 x i64> %1) + %b = tail call <2 x i64> @llvm.umin.v2i64(<2 x i64> %0, <2 x i64> %1) + %sub = sub <2 x i64> %a, %b + ret <2 x i64> %sub +} + +define <16 x i8> @maxmin_bu_com1(<16 x i8> %0, <16 x i8> %1) { +; CHECK-LABEL: maxmin_bu_com1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1) + %b = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %1, <16 x i8> %0) + %sub = sub <16 x i8> %a, %b + ret <16 x i8> %sub +} + +;; select(icmp(a,b, slt),sub(a,b),sub(b,a)) -> abds(a,b) +define <16 x i8> @vabsd_b_cmp(<16 x i8> %a, <16 x i8> %b) nounwind { +; CHECK-LABEL: vabsd_b_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp slt <16 x i8> %a, %b + %ab = sub <16 x i8> %a, %b + %ba = sub <16 x i8> %b, %a + %sel = select <16 x i1> %cmp, <16 x i8> %ba, <16 x i8> %ab + ret <16 x i8> %sel +} + +define <8 x i16> @vabsd_h_cmp(<8 x i16> %a, <8 x i16> %b) nounwind { +; CHECK-LABEL: vabsd_h_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp slt <8 x i16> %a, %b + %ab = sub <8 x i16> %a, %b + %ba = sub <8 x i16> %b, %a + %sel = select <8 x i1> %cmp, <8 x i16> %ba, <8 x i16> %ab + ret <8 x i16> %sel +} + +define <4 x i32> @vabsd_w_cmp(<4 x i32> %a, <4 x i32> %b) nounwind { +; CHECK-LABEL: vabsd_w_cmp: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp slt <4 x i32> %a, %b + %ab = sub <4 x i32> %a, %b + %ba = sub <4 x i32> %b, %a + %sel = select <4 x i1> %cmp, <4 x i32> %ba, <4 x i32> %ab + ret <4 x i32> %sel +} + +define <2 x i64> @vabsd_d_cmp(<2 x i64> %a, <2 x i64> %b) nounwind { +; CHECK-LABEL: vabsd_d_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp slt <2 x i64> %a, %b + %ab = sub <2 x i64> %a, %b + %ba = sub <2 x i64> %b, %a + %sel = select <2 x i1> %cmp, <2 x i64> %ba, <2 x i64> %ab + ret <2 x i64> %sel +} + +define <16 x i8> @vabsd_bu_cmp(<16 x i8> %a, <16 x i8> %b) nounwind { +; CHECK-LABEL: vabsd_bu_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp ult <16 x i8> %a, %b + %ab = sub <16 x i8> %a, %b + %ba = sub <16 x i8> %b, %a + %sel = select <16 x i1> %cmp, <16 x i8> %ba, <16 x i8> %ab + ret <16 x i8> %sel +} + +define <8 x i16> @vabsd_hu_cmp(<8 x i16> %a, <8 x i16> %b) nounwind { +; CHECK-LABEL: vabsd_hu_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp ult <8 x i16> %a, %b + %ab = sub <8 x i16> %a, %b + %ba = sub <8 x i16> %b, %a + %sel = select <8 x i1> %cmp, <8 x i16> %ba, <8 x i16> %ab + ret <8 x i16> %sel +} + +define <4 x i32> @vabsd_wu_cmp(<4 x i32> %a, <4 x i32> %b) nounwind { +; CHECK-LABEL: vabsd_wu_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp ult <4 x i32> %a, %b + %ab = sub <4 x i32> %a, %b + %ba = sub <4 x i32> %b, %a + %sel = select <4 x i1> %cmp, <4 x i32> %ba, <4 x i32> %ab + ret <4 x i32> %sel +} + +define <2 x i64> @vabsd_du_cmp(<2 x i64> %a, <2 x i64> %b) nounwind { +; CHECK-LABEL: vabsd_du_cmp: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp ult <2 x i64> %a, %b + %ab = sub <2 x i64> %a, %b + %ba = sub <2 x i64> %b, %a + %sel = select <2 x i1> %cmp, <2 x i64> %ba, <2 x i64> %ab + ret <2 x i64> %sel +} + +;; sub(select(icmp(a,b, slt),a,b),select(icmp(a,b, slt),b,a)) -> abds(a,b) +define <16 x i8> @vabsd_b_select(<16 x i8> %a, <16 x i8> %b) nounwind { +; CHECK-LABEL: vabsd_b_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.b $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp slt <16 x i8> %a, %b + %ab = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %ba = select <16 x i1> %cmp, <16 x i8> %b, <16 x i8> %a + %sub = sub <16 x i8> %ba, %ab + ret <16 x i8> %sub +} + +define <8 x i16> @vabsd_h_select(<8 x i16> %a, <8 x i16> %b) nounwind { +; CHECK-LABEL: vabsd_h_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.h $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp sle <8 x i16> %a, %b + %ab = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %ba = select <8 x i1> %cmp, <8 x i16> %b, <8 x i16> %a + %sub = sub <8 x i16> %ba, %ab + ret <8 x i16> %sub +} + +define <4 x i32> @vabsd_w_select(<4 x 
i32> %a, <4 x i32> %b) nounwind { +; CHECK-LABEL: vabsd_w_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.w $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp sgt <4 x i32> %a, %b + %ab = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b + %ba = select <4 x i1> %cmp, <4 x i32> %b, <4 x i32> %a + %sub = sub <4 x i32> %ab, %ba + ret <4 x i32> %sub +} + +define <2 x i64> @vabsd_d_select(<2 x i64> %a, <2 x i64> %b) nounwind { +; CHECK-LABEL: vabsd_d_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.d $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp sge <2 x i64> %a, %b + %ab = select <2 x i1> %cmp, <2 x i64> %a, <2 x i64> %b + %ba = select <2 x i1> %cmp, <2 x i64> %b, <2 x i64> %a + %sub = sub <2 x i64> %ab, %ba + ret <2 x i64> %sub +} + +define <16 x i8> @vabsd_bu_select(<16 x i8> %a, <16 x i8> %b) nounwind { +; CHECK-LABEL: vabsd_bu_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.bu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp ult <16 x i8> %a, %b + %ab = select <16 x i1> %cmp, <16 x i8> %a, <16 x i8> %b + %ba = select <16 x i1> %cmp, <16 x i8> %b, <16 x i8> %a + %sub = sub <16 x i8> %ba, %ab + ret <16 x i8> %sub +} + +define <8 x i16> @vabsd_hu_select(<8 x i16> %a, <8 x i16> %b) nounwind { +; CHECK-LABEL: vabsd_hu_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.hu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp ule <8 x i16> %a, %b + %ab = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b + %ba = select <8 x i1> %cmp, <8 x i16> %b, <8 x i16> %a + %sub = sub <8 x i16> %ba, %ab + ret <8 x i16> %sub +} + +define <4 x i32> @vabsd_wu_select(<4 x i32> %a, <4 x i32> %b) nounwind { +; CHECK-LABEL: vabsd_wu_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.wu $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp ugt <4 x i32> %a, %b + %ab = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b + %ba = select <4 x i1> %cmp, <4 x i32> %b, <4 x i32> %a + %sub = sub <4 x i32> %ab, %ba + ret <4 x i32> %sub +} + +define <2 x i64> @vabsd_du_select(<2 x i64> %a, <2 x i64> %b) nounwind { +; CHECK-LABEL: vabsd_du_select: +; CHECK: # %bb.0: +; CHECK-NEXT: vmin.du $vr2, $vr0, $vr1 +; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr2 +; CHECK-NEXT: ret + %cmp = icmp uge <2 x i64> %a, %b + %ab = select <2 x i1> %cmp, <2 x i64> %a, <2 x i64> %b + %ba = select <2 x i1> %cmp, <2 x i64> %b, <2 x i64> %a + %sub = sub <2 x i64> %ab, %ba + ret <2 x i64> %sub +} + +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) + +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) + +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) + +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) + +declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1) + +declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.smax.v2i64(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) +declare 
<4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.umax.v2i64(<2 x i64>, <2 x i64>) +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <2 x i64> @llvm.umin.v2i64(<2 x i64>, <2 x i64>) From 18c43d01fc61648369fef50999e7df62b3ec292f Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 2 Apr 2025 18:40:37 -0700 Subject: [PATCH 0473/1029] [lldb-dap] Add a -v/--version command line argument (#134114) Add a -v/--version command line argument to print the version of both the lldb-dap binary and the liblldb it's linked against. This is motivated by me trying to figure out which lldb-dap I had in my PATH. --- .../Shell/DAP/{TestOptions.test => TestHelp.test} | 2 +- lldb/test/Shell/DAP/TestVersion.test | 3 +++ lldb/tools/lldb-dap/Options.td | 6 ++++++ lldb/tools/lldb-dap/lldb-dap.cpp | 12 ++++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) rename lldb/test/Shell/DAP/{TestOptions.test => TestHelp.test} (88%) create mode 100644 lldb/test/Shell/DAP/TestVersion.test diff --git a/lldb/test/Shell/DAP/TestOptions.test b/lldb/test/Shell/DAP/TestHelp.test similarity index 88% rename from lldb/test/Shell/DAP/TestOptions.test rename to lldb/test/Shell/DAP/TestHelp.test index d290cdae590fd..6033cf15e3835 100644 --- a/lldb/test/Shell/DAP/TestOptions.test +++ b/lldb/test/Shell/DAP/TestHelp.test @@ -4,5 +4,5 @@ # CHECK: --help # CHECK: -h # CHECK: --repl-mode +# CHECK: --version # CHECK: --wait-for-debugger - diff --git a/lldb/test/Shell/DAP/TestVersion.test b/lldb/test/Shell/DAP/TestVersion.test new file mode 100644 index 0000000000000..ad3ff67e45d79 --- /dev/null +++ b/lldb/test/Shell/DAP/TestVersion.test @@ -0,0 +1,3 @@ +# RUN: lldb-dap --version | FileCheck %s +# CHECK: lldb-dap: +# CHECK: liblldb: diff --git a/lldb/tools/lldb-dap/Options.td b/lldb/tools/lldb-dap/Options.td index a1baf2f0370bd..aecf91797ac70 100644 --- a/lldb/tools/lldb-dap/Options.td +++ b/lldb/tools/lldb-dap/Options.td @@ -11,6 +11,12 @@ def: Flag<["-"], "h">, Alias, HelpText<"Alias for --help">; +def version: F<"version">, + HelpText<"Prints out the lldb-dap version.">; +def: Flag<["-"], "v">, + Alias, + HelpText<"Alias for --version">; + def wait_for_debugger: F<"wait-for-debugger">, HelpText<"Pause the program at startup.">; def: Flag<["-"], "g">, diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index b91c62e921428..ec87db6aab330 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -31,6 +31,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Option/OptTable.h" #include "llvm/Option/Option.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InitLLVM.h" @@ -177,6 +178,12 @@ static void PrintHelp(LLDBDAPOptTable &table, llvm::StringRef tool_name) { )___"; } +static void PrintVersion() { + llvm::outs() << "lldb-dap: "; + llvm::cl::PrintVersionMessage(); + llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n'; +} + // If --launch-target is provided, this instance of lldb-dap becomes a // runInTerminal launcher. 
It will ultimately launch the program specified in // the --launch-target argument, which is the original program the user wanted @@ -421,6 +428,11 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } + if (input_args.hasArg(OPT_version)) { + PrintVersion(); + return EXIT_SUCCESS; + } + ReplMode default_repl_mode = ReplMode::Auto; if (input_args.hasArg(OPT_repl_mode)) { llvm::opt::Arg *repl_mode = input_args.getLastArg(OPT_repl_mode); From 94dbe5e405a1e8c9ed1462947e2d5a8e45113d47 Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Wed, 2 Apr 2025 19:40:54 -0700 Subject: [PATCH 0474/1029] [mlir][tosa] Remove extra whitespace in the PadOp example (#134113) Trivial cleanup change. Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 75f167afd9dd0..c5314f8d9d406 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1906,7 +1906,7 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> { ```mlir %pad_const = "tosa.const"() {values = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32> - %padding = tosa.const_shape {values = dense<[1, 2, 3, 4]> : tensor<4xindex> } : () -> !tosa.shape<4> + %padding = tosa.const_shape {values = dense<[1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> tosa.pad %arg0, %padding, %pad_const: (tensor<1x2xf32>, !tosa.shape<4>, tensor<1xf32>) -> (tensor<4x9xf32>) ``` @@ -1914,7 +1914,7 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> { ```mlir %pad_const = "tosa.const"() {values = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32> - %padding = tosa.const_shape {values = dense<[-1, 2, 3, 4]> : tensor<4xindex> } : () -> !tosa.shape<4> + %padding = tosa.const_shape {values = dense<[-1, 2, 3, 4]> : tensor<4xindex>} : () -> !tosa.shape<4> tosa.pad %arg0, %padding, %pad_const : (tensor<1x2xf32>, !tosa.shape<4>, tensor<1xf32>) -> (tensor) ``` }]; From 4fe0d742752c55d5d10b48620aab30fd81db6645 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Wed, 2 Apr 2025 20:08:56 -0700 Subject: [PATCH 0475/1029] [clang-format] Fix a bug in annotating braces (#134039) Fix #133873 --- clang/lib/Format/UnwrappedLineParser.cpp | 6 +++++- clang/unittests/Format/TokenAnnotatorTest.cpp | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index f7712bea01c2c..213b706807b2a 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1887,8 +1887,11 @@ void UnwrappedLineParser::parseStructuralElement( if (FormatTok->isBinaryOperator()) nextToken(); break; - case tok::caret: + case tok::caret: { + const auto *Prev = FormatTok->getPreviousNonComment(); nextToken(); + if (Prev && Prev->is(tok::identifier)) + break; // Block return type. 
if (FormatTok->Tok.isAnyIdentifier() || FormatTok->isTypeName(LangOpts)) { nextToken(); @@ -1903,6 +1906,7 @@ void UnwrappedLineParser::parseStructuralElement( if (FormatTok->is(tok::l_brace)) parseChildBlock(); break; + } case tok::l_brace: if (InRequiresExpression) FormatTok->setFinalizedType(TT_BracedListLBrace); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index af9fd574b068c..7e0af1c7b4c36 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3622,6 +3622,11 @@ TEST_F(TokenAnnotatorTest, BraceKind) { ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_BRACE_KIND(Tokens[7], BK_BracedInit); EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit); + + Tokens = annotate("return lhs ^ Byte{rhs};"); + ASSERT_EQ(Tokens.size(), 9u) << Tokens; + EXPECT_BRACE_KIND(Tokens[4], BK_BracedInit); + EXPECT_BRACE_KIND(Tokens[6], BK_BracedInit); } TEST_F(TokenAnnotatorTest, UnderstandsElaboratedTypeSpecifier) { From 4986a7964858979d00f0c9a98d13db555d8a6f0d Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Thu, 3 Apr 2025 11:11:36 +0800 Subject: [PATCH 0476/1029] [TableGen] Emit `llvm::is_contained` for `CheckOpcode` predicate (#134057) When the list is large, using `llvm::is_contained` is of higher performance than a sequence of comparisons. When the list is small, the `llvm::is_contained` can be inlined and unrolled, which has the same effect as using a sequence of comparisons. And the generated code is more readable. --- llvm/test/TableGen/MacroFusion.td | 4 ++-- .../TableGen/Common/PredicateExpander.cpp | 23 ++++++++----------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/llvm/test/TableGen/MacroFusion.td b/llvm/test/TableGen/MacroFusion.td index 6cf22f5447150..66cff7ec4ef4c 100644 --- a/llvm/test/TableGen/MacroFusion.td +++ b/llvm/test/TableGen/MacroFusion.td @@ -42,7 +42,7 @@ def TestBothFusionPredicate: Fusion<"test-both-fusion-predicate", "HasBothFusion [BothFusionPredicate]>; def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", - CheckOpcode<[Inst0]>, + CheckOpcode<[Inst0, Inst1]>, CheckAll<[ CheckOpcode<[Inst1]>, CheckRegOperand<0, X0> @@ -162,7 +162,7 @@ def TestSingleFusion: SingleFusion<"test-single-fusion", "HasTestSingleFusion", // CHECK-PREDICATOR-NEXT: return true; // CHECK-PREDICATOR-NEXT: { // CHECK-PREDICATOR-NEXT: const MachineInstr *MI = FirstMI; -// CHECK-PREDICATOR-NEXT: if (( MI->getOpcode() != Test::Inst0 )) +// CHECK-PREDICATOR-NEXT: if (!llvm::is_contained({Test::Inst0, Test::Inst1}, MI->getOpcode())) // CHECK-PREDICATOR-NEXT: return false; // CHECK-PREDICATOR-NEXT: } // CHECK-PREDICATOR-NEXT: if (!SecondMI.getOperand(0).getReg().isVirtual()) { diff --git a/llvm/utils/TableGen/Common/PredicateExpander.cpp b/llvm/utils/TableGen/Common/PredicateExpander.cpp index e54df89937c4a..09d953801600d 100644 --- a/llvm/utils/TableGen/Common/PredicateExpander.cpp +++ b/llvm/utils/TableGen/Common/PredicateExpander.cpp @@ -143,7 +143,6 @@ void PredicateExpander::expandCheckOpcode(raw_ostream &OS, const Record *Inst) { void PredicateExpander::expandCheckOpcode(raw_ostream &OS, ArrayRef Opcodes) { assert(!Opcodes.empty() && "Expected at least one opcode to check!"); - bool First = true; if (Opcodes.size() == 1) { OS << "( "; @@ -152,19 +151,15 @@ void PredicateExpander::expandCheckOpcode(raw_ostream &OS, return; } - OS << '('; - ++Indent; - for (const Record *Rec : Opcodes) { - OS << '\n' << Indent; - if (!First) - OS << (shouldNegate() ? 
"&& " : "|| "); - - expandCheckOpcode(OS, Rec); - First = false; - } - - --Indent; - OS << '\n' << Indent << ')'; + if (shouldNegate()) + OS << '!'; + OS << "llvm::is_contained("; + ListSeparator Sep; + OS << '{'; + for (const Record *Inst : Opcodes) + OS << Sep << Inst->getValueAsString("Namespace") << "::" << Inst->getName(); + OS << '}'; + OS << ", MI" << (isByRef() ? "." : "->") << "getOpcode())"; } void PredicateExpander::expandCheckPseudo(raw_ostream &OS, From dcc2182bce3d2ef0e0a991664c51b4b3bfcf7197 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Thu, 3 Apr 2025 11:15:42 +0800 Subject: [PATCH 0477/1029] [Clang] Fix a lambda pattern comparison mismatch after ecc7e6ce4 (#133863) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ecc7e6ce4, we tried to inspect the `LambdaScopeInfo` on stack to recover the instantiating lambda captures. However, there was a mismatch in how we compared the pattern declarations of lambdas: the constraint instantiation used a tailored `getPatternFunctionDecl()` which is localized in SemaLambda that finds the very primal template declaration of a lambda, while `FunctionDecl::getTemplateInstantiationPattern` finds the latest template pattern of a lambda. This difference causes issues when lambdas are nested, as we always want the primary template declaration. This corrects that by moving `Sema::addInstantiatedCapturesToScope` from SemaConcept to SemaLambda, allowing it to use the localized version of `getPatternFunctionDecl`. It is also worth exploring to coalesce the implementation of `getPatternFunctionDecl` with `FunctionDecl::getTemplateInstantiationPattern`. But I’m leaving that for the future, as I’d like to backport this fix (ecc7e6ce4 made the issue more visible in clang 20, sorry!), and changing Sema’s ABI would not be suitable in that regards. Hence, no release note. Fixes https://github.com/llvm/llvm-project/issues/133719 --- clang/lib/Sema/SemaConcept.cpp | 69 --------------------- clang/lib/Sema/SemaLambda.cpp | 68 ++++++++++++++++++++ clang/test/SemaTemplate/concepts-lambda.cpp | 15 +++++ 3 files changed, 83 insertions(+), 69 deletions(-) diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 5e1cd62530c5b..16f9e3d60560e 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -697,75 +697,6 @@ bool Sema::CheckConstraintSatisfaction( .isInvalid(); } -bool Sema::addInstantiatedCapturesToScope( - FunctionDecl *Function, const FunctionDecl *PatternDecl, - LocalInstantiationScope &Scope, - const MultiLevelTemplateArgumentList &TemplateArgs) { - const auto *LambdaClass = cast(Function)->getParent(); - const auto *LambdaPattern = cast(PatternDecl)->getParent(); - - unsigned Instantiated = 0; - - // FIXME: This is a workaround for not having deferred lambda body - // instantiation. - // When transforming a lambda's body, if we encounter another call to a - // nested lambda that contains a constraint expression, we add all of the - // outer lambda's instantiated captures to the current instantiation scope to - // facilitate constraint evaluation. However, these captures don't appear in - // the CXXRecordDecl until after the lambda expression is rebuilt, so we - // pull them out from the corresponding LSI. 
- LambdaScopeInfo *InstantiatingScope = nullptr; - if (LambdaPattern->capture_size() && !LambdaClass->capture_size()) { - for (FunctionScopeInfo *Scope : llvm::reverse(FunctionScopes)) { - auto *LSI = dyn_cast(Scope); - if (!LSI || - LSI->CallOperator->getTemplateInstantiationPattern() != PatternDecl) - continue; - InstantiatingScope = LSI; - break; - } - assert(InstantiatingScope); - } - - auto AddSingleCapture = [&](const ValueDecl *CapturedPattern, - unsigned Index) { - ValueDecl *CapturedVar = - InstantiatingScope ? InstantiatingScope->Captures[Index].getVariable() - : LambdaClass->getCapture(Index)->getCapturedVar(); - assert(CapturedVar->isInitCapture()); - Scope.InstantiatedLocal(CapturedPattern, CapturedVar); - }; - - for (const LambdaCapture &CapturePattern : LambdaPattern->captures()) { - if (!CapturePattern.capturesVariable()) { - Instantiated++; - continue; - } - ValueDecl *CapturedPattern = CapturePattern.getCapturedVar(); - - if (!CapturedPattern->isInitCapture()) { - Instantiated++; - continue; - } - - if (!CapturedPattern->isParameterPack()) { - AddSingleCapture(CapturedPattern, Instantiated++); - } else { - Scope.MakeInstantiatedLocalArgPack(CapturedPattern); - SmallVector Unexpanded; - SemaRef.collectUnexpandedParameterPacks( - dyn_cast(CapturedPattern)->getInit(), Unexpanded); - auto NumArgumentsInExpansion = - getNumArgumentsInExpansionFromUnexpanded(Unexpanded, TemplateArgs); - if (!NumArgumentsInExpansion) - continue; - for (unsigned Arg = 0; Arg < *NumArgumentsInExpansion; ++Arg) - AddSingleCapture(CapturedPattern, Instantiated++); - } - } - return false; -} - bool Sema::SetupConstraintScope( FunctionDecl *FD, std::optional> TemplateArgs, const MultiLevelTemplateArgumentList &MLTAL, diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index 292406f886362..6f114b71981fa 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -2412,6 +2412,74 @@ static FunctionDecl *getPatternFunctionDecl(FunctionDecl *FD) { return FTD->getTemplatedDecl(); } +bool Sema::addInstantiatedCapturesToScope( + FunctionDecl *Function, const FunctionDecl *PatternDecl, + LocalInstantiationScope &Scope, + const MultiLevelTemplateArgumentList &TemplateArgs) { + const auto *LambdaClass = cast(Function)->getParent(); + const auto *LambdaPattern = cast(PatternDecl)->getParent(); + + unsigned Instantiated = 0; + + // FIXME: This is a workaround for not having deferred lambda body + // instantiation. + // When transforming a lambda's body, if we encounter another call to a + // nested lambda that contains a constraint expression, we add all of the + // outer lambda's instantiated captures to the current instantiation scope to + // facilitate constraint evaluation. However, these captures don't appear in + // the CXXRecordDecl until after the lambda expression is rebuilt, so we + // pull them out from the corresponding LSI. + LambdaScopeInfo *InstantiatingScope = nullptr; + if (LambdaPattern->capture_size() && !LambdaClass->capture_size()) { + for (FunctionScopeInfo *Scope : llvm::reverse(FunctionScopes)) { + auto *LSI = dyn_cast(Scope); + if (!LSI || getPatternFunctionDecl(LSI->CallOperator) != PatternDecl) + continue; + InstantiatingScope = LSI; + break; + } + assert(InstantiatingScope); + } + + auto AddSingleCapture = [&](const ValueDecl *CapturedPattern, + unsigned Index) { + ValueDecl *CapturedVar = + InstantiatingScope ? 
InstantiatingScope->Captures[Index].getVariable() + : LambdaClass->getCapture(Index)->getCapturedVar(); + assert(CapturedVar->isInitCapture()); + Scope.InstantiatedLocal(CapturedPattern, CapturedVar); + }; + + for (const LambdaCapture &CapturePattern : LambdaPattern->captures()) { + if (!CapturePattern.capturesVariable()) { + Instantiated++; + continue; + } + ValueDecl *CapturedPattern = CapturePattern.getCapturedVar(); + + if (!CapturedPattern->isInitCapture()) { + Instantiated++; + continue; + } + + if (!CapturedPattern->isParameterPack()) { + AddSingleCapture(CapturedPattern, Instantiated++); + } else { + Scope.MakeInstantiatedLocalArgPack(CapturedPattern); + SmallVector Unexpanded; + SemaRef.collectUnexpandedParameterPacks( + dyn_cast(CapturedPattern)->getInit(), Unexpanded); + auto NumArgumentsInExpansion = + getNumArgumentsInExpansionFromUnexpanded(Unexpanded, TemplateArgs); + if (!NumArgumentsInExpansion) + continue; + for (unsigned Arg = 0; Arg < *NumArgumentsInExpansion; ++Arg) + AddSingleCapture(CapturedPattern, Instantiated++); + } + } + return false; +} + Sema::LambdaScopeForCallOperatorInstantiationRAII:: LambdaScopeForCallOperatorInstantiationRAII( Sema &SemaRef, FunctionDecl *FD, MultiLevelTemplateArgumentList MLTAL, diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp index dcb09c76d26b6..1f67c2511e096 100644 --- a/clang/test/SemaTemplate/concepts-lambda.cpp +++ b/clang/test/SemaTemplate/concepts-lambda.cpp @@ -325,3 +325,18 @@ template void f() { template void f(); } + +namespace GH133719 { + +template +constexpr auto f{[] (auto arg) { + return [a{arg}] { + [] () requires true {}(); + }; +}}; + +void foo() { + f(0); +} + +} From 6a46c6c865270ceb01bcaef4a2e4c8df56a8800a Mon Sep 17 00:00:00 2001 From: LU-JOHN Date: Wed, 2 Apr 2025 22:17:14 -0500 Subject: [PATCH 0478/1029] Ensure KnownBits passed when calculating from range md has right size (#132985) KnownBits passed to computeKnownBitsFromRangeMetadata must have the same bit width as the range metadata bit width. Otherwise the calculated results will be incorrect. --------- Signed-off-by: John Lu --- llvm/lib/Analysis/ValueTracking.cpp | 4 ++++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index fc0c74942c6d8..3b0249f91d6d7 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -430,6 +430,10 @@ void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges, ConstantInt *Upper = mdconst::extract(Ranges.getOperand(2 * i + 1)); ConstantRange Range(Lower->getValue(), Upper->getValue()); + // BitWidth must equal the Ranges BitWidth for the correct number of high + // bits to be set. + assert(BitWidth == Range.getBitWidth() && + "Known bit width must match range bit width!"); // The first CommonPrefixBits of all values in Range are equal. 
unsigned CommonPrefixBits = diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 3526beeb312ce..69548d0462318 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -9176,6 +9176,12 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, "Cannot use an ext load to change the number of vector elements!"); } + assert((!MMO->getRanges() || + (mdconst::extract(MMO->getRanges()->getOperand(0)) + ->getBitWidth() == MemVT.getScalarSizeInBits() && + MemVT.isInteger())) && + "Range metadata and load type must match!"); + bool Indexed = AM != ISD::UNINDEXED; assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!"); From 4e40c7c4bd66d98f529a807dbf410dc46444f4ca Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Wed, 2 Apr 2025 20:37:07 -0700 Subject: [PATCH 0479/1029] [lldb][debugserver] Save and restore the SVE/SME register state (#134184) debugserver isn't saving and restoring the SVE/SME register state around inferior function calls. Making arbitrary function calls while in Streaming SVE mode is generally a poor idea because a NEON instruction can be hit and crash the expression execution, which is how I missed this, but they should be handled correctly if the user knows it is safe to do. rdar://146886210 --- .../source/MacOSX/arm64/DNBArchImplARM64.cpp | 47 ++++++++++++++++--- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp index 34a4ee21f8502..d32a63daa5672 100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp @@ -2952,8 +2952,15 @@ kern_return_t DNBArchMachARM64::SetRegisterState(int set) { return err; switch (set) { - case e_regSetALL: - return SetGPRState() | SetVFPState() | SetEXCState() | SetDBGState(false); + case e_regSetALL: { + kern_return_t ret = + SetGPRState() | SetVFPState() | SetEXCState() | SetDBGState(false); + if (CPUHasSME()) { + ret |= SetSVEState(); + ret |= SetSMEState(); + } + return ret; + } case e_regSetGPR: return SetGPRState(); case e_regSetVFP: @@ -3119,9 +3126,20 @@ uint32_t DNBArchMachARM64::SaveRegisterState() { "error: GPR regs failed to read: %u ", kret); } else if ((kret = GetVFPState(force)) != KERN_SUCCESS) { - DNBLogThreadedIf(LOG_THREAD, "DNBArchMachARM64::SaveRegisterState () " - "error: %s regs failed to read: %u", + DNBLogThreadedIf(LOG_THREAD, + "DNBArchMachARM64::SaveRegisterState () " + "error: %s regs failed to read: %u", "VFP", kret); + } else if (CPUHasSME() && (kret = SetSVEState() != KERN_SUCCESS)) { + DNBLogThreadedIf(LOG_THREAD, + "DNBArchMachARM64::SaveRegisterState () " + "error: %s regs failed to read: %u", + "SVE", kret); + } else if (CPUHasSME() && (kret = SetSMEState() != KERN_SUCCESS)) { + DNBLogThreadedIf(LOG_THREAD, + "DNBArchMachARM64::SaveRegisterState () " + "error: %s regs failed to read: %u", + "SME", kret); } else { const uint32_t save_id = GetNextRegisterStateSaveID(); m_saved_register_states[save_id] = m_state.context; @@ -3144,11 +3162,26 @@ bool DNBArchMachARM64::RestoreRegisterState(uint32_t save_id) { save_id, kret); success = false; } else if ((kret = SetVFPState()) != KERN_SUCCESS) { - DNBLogThreadedIf(LOG_THREAD, "DNBArchMachARM64::RestoreRegisterState " - "(save_id = %u) error: %s regs failed to " - "write: %u", + 
DNBLogThreadedIf(LOG_THREAD, + "DNBArchMachARM64::RestoreRegisterState " + "(save_id = %u) error: %s regs failed to " + "write: %u", save_id, "VFP", kret); success = false; + } else if ((kret = SetSVEState()) != KERN_SUCCESS) { + DNBLogThreadedIf(LOG_THREAD, + "DNBArchMachARM64::RestoreRegisterState " + "(save_id = %u) error: %s regs failed to " + "write: %u", + save_id, "SVE", kret); + success = false; + } else if ((kret = SetSMEState()) != KERN_SUCCESS) { + DNBLogThreadedIf(LOG_THREAD, + "DNBArchMachARM64::RestoreRegisterState " + "(save_id = %u) error: %s regs failed to " + "write: %u", + save_id, "SME", kret); + success = false; } m_saved_register_states.erase(pos); return success; From 3140d51cf3984e83f6480efc6a6f06e4567e7486 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Apr 2025 11:04:02 +0700 Subject: [PATCH 0480/1029] llvm-reduce: Remove unsupported from bitcode uselistorder test (#134185) This was disabled due to flakiness but I'm currently unable to reproduce. I'm nervous the original issue still exists. However, I downgraded the tripped assert in 8c18c25b1b22ea710edb40a4f167a6a8bfe6ff9d to a warning since the same assert can trigger for illegitimate reasons. Fixes #64157 --- llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll b/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll index ed48b25baf35a..ac98d75ef2d3b 100644 --- a/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll +++ b/llvm/test/tools/llvm-reduce/bitcode-uselistorder.ll @@ -1,6 +1,3 @@ -; Sometimes fails with an assert on many targets. -; UNSUPPORTED: target={{.*}} - ; RUN: llvm-as -o %t.bc %s ; RUN: llvm-reduce -j=1 --abort-on-invalid-reduction \ From e3c0565b74b1f5122ab4dbabc3e941924e116330 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Wed, 2 Apr 2025 21:07:30 -0700 Subject: [PATCH 0481/1029] Reapply "[cmake] Refactor clang unittest cmake" (#134195) This reapplies 5ffd9bdb50b57 (#133545) with fixes. The BUILD_SHARED_LIBS=ON build was fixed by adding missing LLVM dependencies to the InterpTests binary in unittests/AST/ByteCode/CMakeLists.txt . 
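As a usage sketch (hypothetical target and source names, not part of this
patch), a unit test under the refactored helper is declared in a single call:

  add_clang_unittest(FooTests
    FooTest.cpp
    CLANG_LIBS
      clangAST
      clangBasic
    LINK_LIBS
      LLVMTestingSupport
    LLVM_COMPONENTS
      Support
    )

The helper appends LLVM_COMPONENTS to LLVM_LINK_COMPONENTS before calling
add_unittest, links clang-cpp in place of the CLANG_LIBS list when
CLANG_LINK_CLANG_DYLIB is on, and requires the target name to end in "Tests"
so lit can discover it.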
--- clang/unittests/AST/ByteCode/CMakeLists.txt | 16 ++++------ clang/unittests/AST/CMakeLists.txt | 23 ++++--------- clang/unittests/ASTMatchers/CMakeLists.txt | 22 ++++--------- .../ASTMatchers/Dynamic/CMakeLists.txt | 18 +++-------- clang/unittests/Analysis/CMakeLists.txt | 18 +++-------- .../Analysis/FlowSensitive/CMakeLists.txt | 18 +++-------- clang/unittests/Basic/CMakeLists.txt | 18 +++-------- clang/unittests/CMakeLists.txt | 32 ++++++++++++++++--- clang/unittests/CodeGen/CMakeLists.txt | 15 +++------ clang/unittests/CrossTU/CMakeLists.txt | 12 ++----- .../unittests/DirectoryWatcher/CMakeLists.txt | 11 ++----- clang/unittests/Driver/CMakeLists.txt | 19 ++++------- clang/unittests/Format/CMakeLists.txt | 11 ++----- clang/unittests/Frontend/CMakeLists.txt | 12 +++---- clang/unittests/Index/CMakeLists.txt | 13 +++----- clang/unittests/InstallAPI/CMakeLists.txt | 9 ++---- clang/unittests/Interpreter/CMakeLists.txt | 25 +++++++-------- .../Interpreter/ExceptionTests/CMakeLists.txt | 20 ++++++------ clang/unittests/Lex/CMakeLists.txt | 16 +++------- clang/unittests/Rewrite/CMakeLists.txt | 10 ++---- clang/unittests/Sema/CMakeLists.txt | 18 +++-------- clang/unittests/Serialization/CMakeLists.txt | 17 ++++------ clang/unittests/StaticAnalyzer/CMakeLists.txt | 18 +++-------- clang/unittests/Support/CMakeLists.txt | 11 ++----- clang/unittests/Tooling/CMakeLists.txt | 28 +++++++--------- clang/unittests/Tooling/Syntax/CMakeLists.txt | 15 +++------ clang/unittests/libclang/CMakeLists.txt | 5 +-- .../libclang/CrashTests/CMakeLists.txt | 5 +-- 28 files changed, 166 insertions(+), 289 deletions(-) diff --git a/clang/unittests/AST/ByteCode/CMakeLists.txt b/clang/unittests/AST/ByteCode/CMakeLists.txt index b862fb4834fbd..1469cd6b2a8ea 100644 --- a/clang/unittests/AST/ByteCode/CMakeLists.txt +++ b/clang/unittests/AST/ByteCode/CMakeLists.txt @@ -2,19 +2,17 @@ add_clang_unittest(InterpTests BitcastBuffer.cpp Descriptor.cpp toAPValue.cpp - ) - -clang_target_link_libraries(InterpTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic clangFrontend clangSerialization clangTooling - ) - - target_link_libraries(InterpTests - PRIVATE + LINK_LIBS clangTesting -) + LLVM_COMPONENTS + FrontendOpenMP + Support + TargetParser + ) diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt index bfa6082a6ffa4..f27d34e8a0719 100644 --- a/clang/unittests/AST/CMakeLists.txt +++ b/clang/unittests/AST/CMakeLists.txt @@ -1,10 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - TargetParser - ) - - add_subdirectory(ByteCode) add_clang_unittest(ASTTests @@ -43,10 +36,7 @@ add_clang_unittest(ASTTests TemplateNameTest.cpp TypePrinterTest.cpp UnresolvedSetTest.cpp - ) - -clang_target_link_libraries(ASTTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -54,11 +44,12 @@ clang_target_link_libraries(ASTTests clangLex clangSerialization clangTooling - ) - -target_link_libraries(ASTTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingAnnotations LLVMTestingSupport -) + LLVM_COMPONENTS + FrontendOpenMP + Support + TargetParser + ) diff --git a/clang/unittests/ASTMatchers/CMakeLists.txt b/clang/unittests/ASTMatchers/CMakeLists.txt index 6a1e629d81b65..47bd5c108bb5a 100644 --- a/clang/unittests/ASTMatchers/CMakeLists.txt +++ b/clang/unittests/ASTMatchers/CMakeLists.txt @@ -1,31 +1,23 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - TargetParser - ) - add_clang_unittest(ASTMatchersTests ASTMatchersInternalTest.cpp ASTMatchersNodeTest.cpp 
ASTMatchersNarrowingTest.cpp ASTMatchersTraversalTest.cpp GtestMatchersTest.cpp - ) - -clang_target_link_libraries(ASTMatchersTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic clangFrontend clangSerialization clangTooling - ) - -target_link_libraries(ASTMatchersTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingSupport -) + LLVM_COMPONENTS + FrontendOpenMP + Support + TargetParser + ) add_subdirectory(Dynamic) diff --git a/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt b/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt index 6d0e12bcb0759..b6db7ce62afe7 100644 --- a/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt +++ b/clang/unittests/ASTMatchers/Dynamic/CMakeLists.txt @@ -1,16 +1,8 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(DynamicASTMatchersTests VariantValueTest.cpp ParserTest.cpp RegistryTest.cpp - ) - -clang_target_link_libraries(DynamicASTMatchersTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -18,9 +10,9 @@ clang_target_link_libraries(DynamicASTMatchersTests clangFrontend clangSerialization clangTooling - ) - -target_link_libraries(DynamicASTMatchersTests - PRIVATE + LINK_LIBS clangTesting + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Analysis/CMakeLists.txt b/clang/unittests/Analysis/CMakeLists.txt index cfea57f53f033..059a74843155c 100644 --- a/clang/unittests/Analysis/CMakeLists.txt +++ b/clang/unittests/Analysis/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(ClangAnalysisTests CFGDominatorTree.cpp CFGTest.cpp @@ -11,10 +6,7 @@ add_clang_unittest(ClangAnalysisTests IntervalPartitionTest.cpp MacroExpansionContextTest.cpp UnsafeBufferUsageTest.cpp - ) - -clang_target_link_libraries(ClangAnalysisTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangAnalysis @@ -23,12 +15,12 @@ clang_target_link_libraries(ClangAnalysisTests clangLex clangSerialization clangTooling - ) - -target_link_libraries(ClangAnalysisTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingSupport + LLVM_COMPONENTS + FrontendOpenMP + Support ) add_subdirectory(FlowSensitive) diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt index 6c01ae8fc2e54..4ac563143cd68 100644 --- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(ClangAnalysisFlowSensitiveTests ArenaTest.cpp ASTOpsTest.cpp @@ -30,10 +25,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests UncheckedOptionalAccessModelTest.cpp ValueTest.cpp WatchedLiteralsSolverTest.cpp - ) - -clang_target_link_libraries(ClangAnalysisFlowSensitiveTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangAnalysis @@ -44,11 +36,11 @@ clang_target_link_libraries(ClangAnalysisFlowSensitiveTests clangLex clangSerialization clangTooling - ) - -target_link_libraries(ClangAnalysisFlowSensitiveTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingAnnotations LLVMTestingSupport + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Basic/CMakeLists.txt b/clang/unittests/Basic/CMakeLists.txt index 3844ba49add8d..e818bd3e2c372 100644 --- a/clang/unittests/Basic/CMakeLists.txt +++ b/clang/unittests/Basic/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(BasicTests CharInfoTest.cpp DarwinSDKInfoTest.cpp @@ -12,15 +8,11 @@ 
add_clang_unittest(BasicTests SanitizersTest.cpp SarifTest.cpp SourceManagerTest.cpp - ) - -clang_target_link_libraries(BasicTests - PRIVATE + CLANG_LIBS clangBasic clangLex - ) - -target_link_libraries(BasicTests - PRIVATE + LINK_LIBS LLVMTestingSupport -) + LLVM_COMPONENTS + Support + ) diff --git a/clang/unittests/CMakeLists.txt b/clang/unittests/CMakeLists.txt index 9b3ce8aa7de73..f3823ba309420 100644 --- a/clang/unittests/CMakeLists.txt +++ b/clang/unittests/CMakeLists.txt @@ -15,12 +15,36 @@ if(CLANG_BUILT_STANDALONE) endif() endif() -# add_clang_unittest(test_dirname file1.cpp file2.cpp) +# add_clang_unittest(test_name file1.cpp file2.cpp) # # Will compile the list of files together and link against the clang -# Produces a binary named 'basename(test_dirname)'. -function(add_clang_unittest test_dirname) - add_unittest(ClangUnitTests ${test_dirname} ${ARGN}) +# Produces a binary named 'basename(test_name)'. +function(add_clang_unittest test_name) + cmake_parse_arguments(ARG + "" + "" + "CLANG_LIBS;LINK_LIBS;LLVM_COMPONENTS" + ${ARGN}) + + if (NOT ${test_name} MATCHES "Tests$") + message(FATAL_ERROR "Unit test name must end with 'Tests' for lit to find it.") + endif() + + # LLVM_COMPONENTS is for LLVM_LINK_COMPONENTS deps, and must be before + # add_unittest. + list(APPEND LLVM_LINK_COMPONENTS ${ARG_LLVM_COMPONENTS}) + + add_unittest(ClangUnitTests ${test_name} ${ARG_UNPARSED_ARGUMENTS}) + + # Clang libs either come from the entire dylib, or individual libraries. + if (CLANG_LINK_CLANG_DYLIB) + list(APPEND ARG_LINK_LIBS clang-cpp) + else() + list(APPEND ARG_LINK_LIBS ${ARG_CLANG_LIBS}) + endif() + + # LINK_LIBS is for normal library dependencies. + target_link_libraries(${test_name} PRIVATE ${ARG_LINK_LIBS}) endfunction() add_subdirectory(Basic) diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt index a437f441568f2..f5bcecb0b08a3 100644 --- a/clang/unittests/CodeGen/CMakeLists.txt +++ b/clang/unittests/CodeGen/CMakeLists.txt @@ -1,18 +1,9 @@ -set(LLVM_LINK_COMPONENTS - Core - Support - TargetParser - ) - add_clang_unittest(ClangCodeGenTests BufferSourceTest.cpp CodeGenExternalTest.cpp TBAAMetadataTest.cpp CheckTargetFeaturesTest.cpp - ) - -clang_target_link_libraries(ClangCodeGenTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangCodeGen @@ -20,4 +11,8 @@ clang_target_link_libraries(ClangCodeGenTests clangLex clangParse clangSerialization + LLVM_COMPONENTS + Core + Support + TargetParser ) diff --git a/clang/unittests/CrossTU/CMakeLists.txt b/clang/unittests/CrossTU/CMakeLists.txt index 222b7e83dc38c..ee81c57ca1dce 100644 --- a/clang/unittests/CrossTU/CMakeLists.txt +++ b/clang/unittests/CrossTU/CMakeLists.txt @@ -1,18 +1,12 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Support - ) - add_clang_unittest(CrossTUTests CrossTranslationUnitTest.cpp - ) - -clang_target_link_libraries(CrossTUTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangCrossTU clangFrontend clangSerialization clangTooling + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/DirectoryWatcher/CMakeLists.txt b/clang/unittests/DirectoryWatcher/CMakeLists.txt index 38882c9ec2162..58e0aee2d1076 100644 --- a/clang/unittests/DirectoryWatcher/CMakeLists.txt +++ b/clang/unittests/DirectoryWatcher/CMakeLists.txt @@ -1,17 +1,12 @@ if(APPLE OR CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME STREQUAL Windows) - set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(DirectoryWatcherTests DirectoryWatcherTest.cpp - ) - - 
target_link_libraries(DirectoryWatcherTests - PRIVATE + LINK_LIBS LLVMTestingSupport clangDirectoryWatcher + LLVM_COMPONENTS + Support ) endif() diff --git a/clang/unittests/Driver/CMakeLists.txt b/clang/unittests/Driver/CMakeLists.txt index efdd07ea23889..fa0e87c3318df 100644 --- a/clang/unittests/Driver/CMakeLists.txt +++ b/clang/unittests/Driver/CMakeLists.txt @@ -1,11 +1,3 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - MC - Option - Support - TargetParser - ) - add_clang_unittest(ClangDriverTests DistroTest.cpp DXCModeTest.cpp @@ -15,12 +7,15 @@ add_clang_unittest(ClangDriverTests MultilibBuilderTest.cpp MultilibTest.cpp SanitizerArgsTest.cpp - ) - -clang_target_link_libraries(ClangDriverTests - PRIVATE + CLANG_LIBS clangDriver clangBasic clangFrontend # For TextDiagnosticPrinter. clangSerialization + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + MC + Option + Support + TargetParser ) diff --git a/clang/unittests/Format/CMakeLists.txt b/clang/unittests/Format/CMakeLists.txt index 71f5886d946c8..5bd6a17182d29 100644 --- a/clang/unittests/Format/CMakeLists.txt +++ b/clang/unittests/Format/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(FormatTests BracesInserterTest.cpp BracesRemoverTest.cpp @@ -36,12 +32,11 @@ add_clang_unittest(FormatTests SortIncludesTest.cpp UsingDeclarationsSorterTest.cpp TokenAnnotatorTest.cpp - ) - -clang_target_link_libraries(FormatTests - PRIVATE + CLANG_LIBS clangBasic clangFormat clangRewrite clangToolingCore + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Frontend/CMakeLists.txt b/clang/unittests/Frontend/CMakeLists.txt index 3c94846243870..bbf0396014fa9 100644 --- a/clang/unittests/Frontend/CMakeLists.txt +++ b/clang/unittests/Frontend/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - TargetParser - ) - add_clang_unittest(FrontendTests ASTUnitTest.cpp CompilerInvocationTest.cpp @@ -17,9 +12,7 @@ add_clang_unittest(FrontendTests OutputStreamTest.cpp TextDiagnosticTest.cpp UtilsTest.cpp - ) -clang_target_link_libraries(FrontendTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -29,4 +22,7 @@ clang_target_link_libraries(FrontendTests clangFrontendTool clangSerialization clangTooling + LLVM_COMPONENTS + Support + TargetParser ) diff --git a/clang/unittests/Index/CMakeLists.txt b/clang/unittests/Index/CMakeLists.txt index ea940e9d7a9ef..15e9ba0643eaf 100644 --- a/clang/unittests/Index/CMakeLists.txt +++ b/clang/unittests/Index/CMakeLists.txt @@ -1,14 +1,6 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Support - ) - add_clang_unittest(IndexTests IndexTests.cpp - ) - -clang_target_link_libraries(IndexTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -16,4 +8,7 @@ clang_target_link_libraries(IndexTests clangLex clangSerialization clangTooling + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Support ) diff --git a/clang/unittests/InstallAPI/CMakeLists.txt b/clang/unittests/InstallAPI/CMakeLists.txt index 4255001ff51f1..c174fa3f87161 100644 --- a/clang/unittests/InstallAPI/CMakeLists.txt +++ b/clang/unittests/InstallAPI/CMakeLists.txt @@ -1,11 +1,8 @@ add_clang_unittest(InstallAPITests HeaderFileTest.cpp FileListTest.cpp - ) - -clang_target_link_libraries(InstallAPITests - PRIVATE + CLANG_LIBS clangInstallAPI + LINK_LIBS + LLVMTestingSupport ) - -target_link_libraries(InstallAPITests PRIVATE LLVMTestingSupport) diff --git a/clang/unittests/Interpreter/CMakeLists.txt b/clang/unittests/Interpreter/CMakeLists.txt index 95378f9cfe737..9df1a4b03da47 
100644 --- a/clang/unittests/Interpreter/CMakeLists.txt +++ b/clang/unittests/Interpreter/CMakeLists.txt @@ -1,12 +1,3 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Core - MC - OrcJIT - Support - TargetParser - ) - add_clang_unittest(ClangReplInterpreterTests IncrementalCompilerBuilderTest.cpp IncrementalProcessingTest.cpp @@ -15,16 +6,24 @@ add_clang_unittest(ClangReplInterpreterTests CodeCompletionTest.cpp EXPORT_SYMBOLS - ) - -target_link_libraries(ClangReplInterpreterTests PUBLIC LLVMTestingSupport) -clang_target_link_libraries(ClangReplInterpreterTests PRIVATE + CLANG_LIBS clangAST clangBasic clangInterpreter clangFrontend clangSema + + LINK_LIBS + LLVMTestingSupport + + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Core + MC + OrcJIT + Support + TargetParser ) # Exceptions on Windows are not yet supported. diff --git a/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt b/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt index 24ae9cd78b5ca..eb366a860661c 100644 --- a/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt +++ b/clang/unittests/Interpreter/ExceptionTests/CMakeLists.txt @@ -3,24 +3,22 @@ set(LLVM_REQUIRES_EH ON) set(LLVM_REQUIRES_RTTI ON) -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - Core - OrcJIT - Support - ) - add_clang_unittest(ClangReplInterpreterExceptionTests InterpreterExceptionTest.cpp - EXPORT_SYMBOLS - ) -llvm_update_compile_flags(ClangReplInterpreterExceptionTests) -target_link_libraries(ClangReplInterpreterExceptionTests PUBLIC + CLANG_LIBS clangAST clangBasic clangInterpreter clangFrontend + + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + Core + OrcJIT + Support ) + +llvm_update_compile_flags(ClangReplInterpreterExceptionTests) add_dependencies(ClangReplInterpreterExceptionTests clang-resource-headers) diff --git a/clang/unittests/Lex/CMakeLists.txt b/clang/unittests/Lex/CMakeLists.txt index 5ec93946594b7..96ca6dda9cd85 100644 --- a/clang/unittests/Lex/CMakeLists.txt +++ b/clang/unittests/Lex/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(LexTests DependencyDirectivesScannerTest.cpp HeaderMapTest.cpp @@ -13,19 +9,15 @@ add_clang_unittest(LexTests PPConditionalDirectiveRecordTest.cpp PPDependencyDirectivesTest.cpp PPMemoryAllocationsTest.cpp - ) - -clang_target_link_libraries(LexTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangLex clangParse clangSema - ) - -target_link_libraries(LexTests - PRIVATE + LINK_LIBS LLVMTestingAnnotations LLVMTestingSupport + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Rewrite/CMakeLists.txt b/clang/unittests/Rewrite/CMakeLists.txt index 3c5e2f8e5354b..498613254e72b 100644 --- a/clang/unittests/Rewrite/CMakeLists.txt +++ b/clang/unittests/Rewrite/CMakeLists.txt @@ -1,14 +1,10 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(RewriteTests RewriterTest.cpp - ) -clang_target_link_libraries(RewriteTests - PRIVATE + CLANG_LIBS clangFrontend clangRewrite clangSerialization clangTooling + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Sema/CMakeLists.txt b/clang/unittests/Sema/CMakeLists.txt index 17d39408000a4..acc76c932afeb 100644 --- a/clang/unittests/Sema/CMakeLists.txt +++ b/clang/unittests/Sema/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(SemaTests ExternalSemaSourceTest.cpp CodeCompleteTest.cpp @@ -10,10 +5,7 @@ add_clang_unittest(SemaTests GslOwnerPointerInference.cpp SemaLookupTest.cpp SemaNoloadLookupTest.cpp - ) - 
-clang_target_link_libraries(SemaTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -22,11 +14,11 @@ clang_target_link_libraries(SemaTests clangSema clangSerialization clangTooling - ) - -target_link_libraries(SemaTests - PRIVATE + LINK_LIBS LLVMTestingAnnotations LLVMTestingSupport clangTesting + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Serialization/CMakeLists.txt b/clang/unittests/Serialization/CMakeLists.txt index e7005b5d511eb..6782e6b4d7330 100644 --- a/clang/unittests/Serialization/CMakeLists.txt +++ b/clang/unittests/Serialization/CMakeLists.txt @@ -1,10 +1,3 @@ -set(LLVM_LINK_COMPONENTS - BitReader - BitstreamReader - FrontendOpenMP - Support - ) - add_clang_unittest(SerializationTests ForceCheckFileInputTest.cpp InMemoryModuleCacheTest.cpp @@ -14,10 +7,7 @@ add_clang_unittest(SerializationTests LoadSpecLazilyTest.cpp SourceLocationEncodingTest.cpp VarDeclConstantInitTest.cpp - ) - -clang_target_link_libraries(SerializationTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -26,4 +16,9 @@ clang_target_link_libraries(SerializationTests clangSerialization clangTooling clangASTMatchers + LLVM_COMPONENTS + BitReader + BitstreamReader + FrontendOpenMP + Support ) diff --git a/clang/unittests/StaticAnalyzer/CMakeLists.txt b/clang/unittests/StaticAnalyzer/CMakeLists.txt index 3b01a4e9e5327..143b7eedbfe05 100644 --- a/clang/unittests/StaticAnalyzer/CMakeLists.txt +++ b/clang/unittests/StaticAnalyzer/CMakeLists.txt @@ -1,8 +1,3 @@ -set(LLVM_LINK_COMPONENTS - FrontendOpenMP - Support - ) - add_clang_unittest(StaticAnalysisTests AnalyzerOptionsTest.cpp APSIntTypeTest.cpp @@ -25,10 +20,7 @@ add_clang_unittest(StaticAnalysisTests SValTest.cpp TestReturnValueUnderConstruction.cpp Z3CrosscheckOracleTest.cpp - ) - -clang_target_link_libraries(StaticAnalysisTests - PRIVATE + CLANG_LIBS clangBasic clangAnalysis clangAST @@ -39,9 +31,9 @@ clang_target_link_libraries(StaticAnalysisTests clangStaticAnalyzerCore clangStaticAnalyzerFrontend clangTooling - ) - -target_link_libraries(StaticAnalysisTests - PRIVATE + LINK_LIBS clangTesting + LLVM_COMPONENTS + FrontendOpenMP + Support ) diff --git a/clang/unittests/Support/CMakeLists.txt b/clang/unittests/Support/CMakeLists.txt index 22be5ed18cc7a..d0ce4f6d10617 100644 --- a/clang/unittests/Support/CMakeLists.txt +++ b/clang/unittests/Support/CMakeLists.txt @@ -1,15 +1,10 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(ClangSupportTests TimeProfilerTest.cpp - ) - -clang_target_link_libraries(ClangSupportTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend clangSerialization + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt index 401978c31863c..106c6b9dc38bd 100644 --- a/clang/unittests/Tooling/CMakeLists.txt +++ b/clang/unittests/Tooling/CMakeLists.txt @@ -1,13 +1,3 @@ -set(LLVM_LINK_COMPONENTS - ${LLVM_TARGETS_TO_BUILD} - MC - Option - FrontendOpenMP - Support - TargetParser - ) - - add_clang_unittest(ToolingTests ASTSelectionTest.cpp CastExprTest.cpp @@ -69,10 +59,8 @@ add_clang_unittest(ToolingTests StencilTest.cpp ToolingTest.cpp TransformerTest.cpp - ) -clang_target_link_libraries(ToolingTests - PRIVATE + CLANG_LIBS clangAST clangASTMatchers clangBasic @@ -89,13 +77,19 @@ clang_target_link_libraries(ToolingTests clangToolingInclusionsStdlib clangToolingRefactoring clangTransformer - ) -target_link_libraries(ToolingTests - PRIVATE + LINK_LIBS LLVMTestingAnnotations LLVMTestingSupport 
clangTesting -) + + LLVM_COMPONENTS + ${LLVM_TARGETS_TO_BUILD} + MC + Option + FrontendOpenMP + Support + TargetParser + ) add_subdirectory(Syntax) diff --git a/clang/unittests/Tooling/Syntax/CMakeLists.txt b/clang/unittests/Tooling/Syntax/CMakeLists.txt index ff3b6176f879f..db110fefa954f 100644 --- a/clang/unittests/Tooling/Syntax/CMakeLists.txt +++ b/clang/unittests/Tooling/Syntax/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_LINK_COMPONENTS - Support - ) - add_clang_unittest(SyntaxTests TreeTestBase.cpp BuildTreeTest.cpp @@ -9,10 +5,8 @@ add_clang_unittest(SyntaxTests SynthesisTest.cpp TreeTest.cpp TokensTest.cpp -) -clang_target_link_libraries(SyntaxTests - PRIVATE + CLANG_LIBS clangAST clangBasic clangFrontend @@ -21,11 +15,12 @@ clang_target_link_libraries(SyntaxTests clangTooling clangToolingCore clangToolingSyntax - ) -target_link_libraries(SyntaxTests - PRIVATE + LINK_LIBS clangTesting LLVMTestingAnnotations LLVMTestingSupport + + LLVM_COMPONENTS + Support ) diff --git a/clang/unittests/libclang/CMakeLists.txt b/clang/unittests/libclang/CMakeLists.txt index b3644a0e710e1..ba86c3c4d91e0 100644 --- a/clang/unittests/libclang/CMakeLists.txt +++ b/clang/unittests/libclang/CMakeLists.txt @@ -1,9 +1,6 @@ add_clang_unittest(libclangTests LibclangTest.cpp - ) - -target_link_libraries(libclangTests - PRIVATE + LINK_LIBS libclang ) diff --git a/clang/unittests/libclang/CrashTests/CMakeLists.txt b/clang/unittests/libclang/CrashTests/CMakeLists.txt index 82f0e4c16e901..de7b5a8f6ee91 100644 --- a/clang/unittests/libclang/CrashTests/CMakeLists.txt +++ b/clang/unittests/libclang/CrashTests/CMakeLists.txt @@ -1,8 +1,5 @@ add_clang_unittest(libclangCrashTests LibclangCrashTest.cpp - ) - -target_link_libraries(libclangCrashTests - PRIVATE + LINK_LIBS libclang ) From b8d8405238387ddd92450d6a3ad84350254e76a3 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Wed, 2 Apr 2025 21:27:44 -0700 Subject: [PATCH 0482/1029] [LLDB] Expose checking if the symbol file exists/is loaded via SBModule (#134163) The motivation for this patch is that in Statistics.cpp we [check to see if the module symfile is loaded](https://github.com/llvm/llvm-project/blob/990a086d9da0bc2fd53a6a4c95ecbbe23a297a83/lldb/source/Target/Statistics.cpp#L353C60-L353C75) to calculate how much debug info has been loaded. I have an external utility that only wants to look at the loaded debug info, which isn't exposed by the SBAPI. --- lldb/include/lldb/API/SBModule.h | 3 +++ lldb/source/API/SBModule.cpp | 12 ++++++++++++ 2 files changed, 15 insertions(+) diff --git a/lldb/include/lldb/API/SBModule.h b/lldb/include/lldb/API/SBModule.h index 85332066ee687..651455bdb78d2 100644 --- a/lldb/include/lldb/API/SBModule.h +++ b/lldb/include/lldb/API/SBModule.h @@ -290,6 +290,9 @@ class LLDB_API SBModule { lldb::SBAddress GetObjectFileHeaderAddress() const; lldb::SBAddress GetObjectFileEntryPointAddress() const; + /// Get if the symbol file for this module is loaded. + bool IsDebugInfoLoaded() const; + /// Get the number of global modules. 
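A usage illustration, separate from the patch itself: the new predicate slots into ordinary SB API iteration, so an external tool can tally loaded debug info without going through Statistics.cpp. The SBTarget plumbing below is pre-existing API surface; only IsDebugInfoLoaded() comes from this patch, and the helper is a sketch rather than shipped code.

    // Count the modules of a target whose debug info has actually been
    // parsed, using the predicate added by this patch.
    #include "lldb/API/SBModule.h"
    #include "lldb/API/SBTarget.h"
    #include <cstdint>

    uint32_t CountModulesWithDebugInfo(lldb::SBTarget &Target) {
      uint32_t Loaded = 0;
      for (uint32_t I = 0; I < Target.GetNumModules(); ++I) {
        lldb::SBModule Module = Target.GetModuleAtIndex(I);
        if (Module.IsDebugInfoLoaded()) // false if the symfile was never parsed
          ++Loaded;
      }
      return Loaded;
    }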
static uint32_t GetNumberAllocatedModules(); diff --git a/lldb/source/API/SBModule.cpp b/lldb/source/API/SBModule.cpp index 985107ec68efd..4978a553f57c7 100644 --- a/lldb/source/API/SBModule.cpp +++ b/lldb/source/API/SBModule.cpp @@ -659,6 +659,18 @@ lldb::SBAddress SBModule::GetObjectFileEntryPointAddress() const { return sb_addr; } +bool SBModule::IsDebugInfoLoaded() const { + LLDB_INSTRUMENT_VA(this); + + ModuleSP module_sp(GetSP()); + if (module_sp) { + SymbolFile *sym_file = module_sp->GetSymbolFile(/*create=*/false); + return sym_file && sym_file->GetLoadDebugInfoEnabled(); + } + + return false; +} + uint32_t SBModule::GetNumberAllocatedModules() { LLDB_INSTRUMENT(); From 499827318971a7d540a2b928f43d782e63762a47 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Wed, 2 Apr 2025 21:37:44 -0700 Subject: [PATCH 0483/1029] Reland [RISCV] Add Xqci Insn Formats (#134134) This adds the following instruction formats from the Xqci Spec: - QC.EAI - QC.EI - QC.EB - QC.EJ - QC.ES The update to the THead test is because the largest number of operands for a valid instruction has been bumped by this change. This reverts commit 68fb7a5a1d203dde7badf67031bdd9eb650eef5d. This relands commit 0cfabd37df9940346f3bf8a4d74c19e1f48a00e9. --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 13 +- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 8 +- llvm/lib/Target/RISCV/RISCVInstrFormats.td | 8 +- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 45 ++-- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 6 - llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 215 ++++++++++++++++++ llvm/test/MC/RISCV/insn_xqci-invalid.s | 111 +++++++++ llvm/test/MC/RISCV/insn_xqci.s | 41 ++++ llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s | 2 +- 9 files changed, 420 insertions(+), 29 deletions(-) create mode 100644 llvm/test/MC/RISCV/insn_xqci-invalid.s create mode 100644 llvm/test/MC/RISCV/insn_xqci.s diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 7837504751694..d2af2951add6f 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -793,6 +793,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isSImm5() const { return isSImm<5>(); } bool isSImm6() const { return isSImm<6>(); } bool isSImm11() const { return isSImm<11>(); } + bool isSImm16() const { return isSImm<16>(); } bool isSImm20() const { return isSImm<20>(); } bool isSImm26() const { return isSImm<26>(); } bool isSImm32() const { return isSImm<32>(); } @@ -1511,6 +1512,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 12), (1 << 12) - 2, "immediate must be a multiple of 2 bytes in the range"); + case Match_InvalidSImm16: + return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 15), + (1 << 15) - 1); case Match_InvalidSImm16NonZero: return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 15), (1 << 15) - 1, @@ -3153,10 +3157,13 @@ bool RISCVAsmParser::parseDirectiveAttribute() { return false; } -bool isValidInsnFormat(StringRef Format, bool AllowC) { +bool isValidInsnFormat(StringRef Format, const MCSubtargetInfo &STI) { return StringSwitch(Format) .Cases("r", "r4", "i", "b", "sb", "u", "j", "uj", "s", true) - .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj", AllowC) + .Cases("cr", "ci", "ciw", "css", "cl", "cs", "ca", "cb", "cj", + STI.hasFeature(RISCV::FeatureStdExtZca)) + .Cases("qc.eai", "qc.ei", "qc.eb", "qc.ej", 
"qc.es", + !STI.hasFeature(RISCV::Feature64Bit)) .Default(false); } @@ -3246,7 +3253,7 @@ bool RISCVAsmParser::parseDirectiveInsn(SMLoc L) { return false; } - if (!isValidInsnFormat(Format, AllowC)) + if (!isValidInsnFormat(Format, getSTI())) return Error(ErrorLoc, "invalid instruction format"); std::string FormatName = (".insn_" + Format).str(); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index d6672de02862d..adccd1e6c5002 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -51,7 +51,12 @@ enum { InstFormatCLH = 19, InstFormatCSB = 20, InstFormatCSH = 21, - InstFormatOther = 22, + InstFormatQC_EAI = 22, + InstFormatQC_EI = 23, + InstFormatQC_EB = 24, + InstFormatQC_EJ = 25, + InstFormatQC_ES = 26, + InstFormatOther = 31, InstFormatMask = 31, InstFormatShift = 0, @@ -333,6 +338,7 @@ enum OperandType : unsigned { OPERAND_SIMM11, OPERAND_SIMM12, OPERAND_SIMM12_LSB00000, + OPERAND_SIMM16, OPERAND_SIMM16_NONZERO, OPERAND_SIMM20, OPERAND_SIMM26, diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index d95e806b79f25..0bb0ba57ff50d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -52,7 +52,13 @@ def InstFormatCLB : InstFormat<18>; def InstFormatCLH : InstFormat<19>; def InstFormatCSB : InstFormat<20>; def InstFormatCSH : InstFormat<21>; -def InstFormatOther : InstFormat<22>; +def InstFormatQC_EAI : InstFormat<22>; +def InstFormatQC_EI : InstFormat<23>; +def InstFormatQC_EB : InstFormat<24>; +def InstFormatQC_EJ : InstFormat<25>; +def InstFormatQC_ES : InstFormat<26>; +def InstFormatOther : InstFormat<31>; + class RISCVVConstraint val> { bits<3> Value = val; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 89e5ad8067c1b..c87452171f090 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1144,6 +1144,33 @@ def AnyReg : Operand { let ParserMatchClass = AnyRegOperand; } +// isCodeGenOnly = 1 to hide them from the tablegened assembly parser. +let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, + hasNoSchedulingInfo = 1 in { +def Insn16 : RVInst16<(outs), (ins uimm16:$value), "", "", [], InstFormatOther> { + bits<16> value; + + let Inst{15-0} = value; + let AsmString = ".insn 0x2, $value"; +} +def Insn32 : RVInst<(outs), (ins uimm32:$value), "", "", [], InstFormatOther> { + bits<32> value; + + let Inst{31-0} = value; + let AsmString = ".insn 0x4, $value"; +} +def Insn48 : RVInst48<(outs), (ins uimm48:$value), "", "", [], InstFormatOther> { + bits<48> value; + let Inst{47-0} = value; + let AsmString = ".insn 0x6, $value"; +} +def Insn64 : RVInst64<(outs), (ins uimm64:$value), "", "", [], InstFormatOther> { + bits<64> value; + let Inst{63-0} = value; + let AsmString = ".insn 0x8, $value"; +} +} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo + // isCodeGenOnly = 1 to hide them from the tablegened assembly parser. 
let isCodeGenOnly = 1, hasSideEffects = 1, mayLoad = 1, mayStore = 1, hasNoSchedulingInfo = 1 in { @@ -1179,23 +1206,7 @@ def InsnS : DirectiveInsnS<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, simm12:$imm12), "$opcode, $funct3, $rs2, ${imm12}(${rs1})">; -def Insn32 : RVInst<(outs), (ins uimm32:$value), "", "", [], InstFormatOther> { - bits<32> value; - - let Inst{31-0} = value; - let AsmString = ".insn 0x4, $value"; -} -def Insn48 : RVInst48<(outs), (ins uimm48:$value), "", "", [], InstFormatOther> { - bits<48> value; - let Inst{47-0} = value; - let AsmString = ".insn 0x6, $value"; -} -def Insn64 : RVInst64<(outs), (ins uimm64:$value), "", "", [], InstFormatOther> { - bits<64> value; - let Inst{63-0} = value; - let AsmString = ".insn 0x8, $value"; -} -} +} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo // Use InstAliases to match these so that we can combine the insn and format // into a mnemonic to use as the key for the tablegened asm matcher table. The diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 718d95aa1a4bc..1c94af58880f2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -799,12 +799,6 @@ def InsnCJ : DirectiveInsnCJ<(outs), (ins uimm2_opcode:$opcode, uimm3:$funct3, bare_simm12_lsb0:$imm11), "$opcode, $funct3, $imm11">; -def Insn16 : RVInst16<(outs), (ins uimm16:$value), "", "", [], InstFormatOther> { - bits<16> value; - - let Inst{15-0} = value; - let AsmString = ".insn 0x2, $value"; -} } // Use InstAliases to match these so that we can combine the insn and format diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 2479bbd1258a4..93eb82b012eb4 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -94,6 +94,8 @@ def simm5nonzero : RISCVOp, def simm11 : RISCVSImmLeafOp<11>; +def simm16 : RISCVSImmOp<16>; + def simm16nonzero : RISCVOp, ImmLeaf(Imm);}]> { let ParserMatchClass = SImmAsmOperand<16, "NonZero">; @@ -139,6 +141,219 @@ def simm32_lsb0 : Operand { // Instruction Formats //===----------------------------------------------------------------------===// + +class DirectiveInsnQC_EAI + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<1> func1; + + bits<5> rd; + bits<32> imm32; + + let Inst{47-16} = imm32; + let Inst{15} = func1; + let Inst{14-12} = func3; + let Inst{11-7} = rd; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.eai " # argstr; +} + +class DirectiveInsnQC_EI + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<2> func2; + + bits<5> rd; + bits<5> rs1; + bits<26> imm26; + + let Inst{47-32} = imm26{25-10}; + let Inst{31-30} = func2; + let Inst{29-20} = imm26{9-0}; + let Inst{19-15} = rs1; + let Inst{14-12} = func3; + let Inst{11-7} = rd; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.ei " # argstr; +} + +class DirectiveInsnQC_EB + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<5> func5; + + bits<5> rs1; + bits<12> imm12; // This one is the PC-relative offset + bits<16> imm16; + + let Inst{47-32} = imm16; + let Inst{31} = imm12{11}; + let Inst{30-25} = imm12{9-4}; + let Inst{24-20} = func5; + let Inst{19-15} = rs1; + let Inst{14-12} = func3; + let Inst{11-8} = imm12{3-0}; + let Inst{7} = imm12{10}; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.eb " # argstr; +} + +class DirectiveInsnQC_EJ + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<2> 
func2; + bits<5> func5; + + bits<31> imm31; + + let Inst{47-32} = imm31{30-15}; + let Inst{31} = imm31{11}; + let Inst{30-25} = imm31{9-4}; + let Inst{24-20} = func5; + let Inst{19-17} = imm31{14-12}; + let Inst{16-15} = func2; + let Inst{14-12} = func3; + let Inst{11-8} = imm31{3-0}; + let Inst{7} = imm31{10}; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.ej " # argstr; +} + +class DirectiveInsnQC_ES + : RVInst48 { + bits<7> opcode; + bits<3> func3; + bits<2> func2; + + bits<5> rs1; + bits<5> rs2; + bits<26> imm26; + + let Inst{47-32} = imm26{25-10}; + let Inst{31-30} = func2; + let Inst{29-25} = imm26{9-5}; + let Inst{24-20} = rs2; + let Inst{19-15} = rs1; + let Inst{14-12} = func3; + let Inst{11-7} = imm26{4-0}; + let Inst{6-0} = opcode; + + let AsmString = ".insn qc.es " # argstr; +} + + +let isCodeGenOnly = true, hasSideEffects = true, mayLoad = true, + mayStore = true, hasNoSchedulingInfo = true, Predicates=[IsRV32] in { +def InsnQC_EAI : DirectiveInsnQC_EAI<(outs AnyReg:$rd), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm1:$func1, + simm32:$imm32), + "$opcode, $func3, $func1, $rd, $imm32">; +def InsnQC_EI : DirectiveInsnQC_EI<(outs AnyReg:$rd), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26), + "$opcode, $func3, $func2, $rd, $rs1, $imm26">; +def InsnQC_EI_Mem : DirectiveInsnQC_EI<(outs AnyReg:$rd), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26), + "$opcode, $func3, $func2, $rd, ${imm26}(${rs1})">; +def InsnQC_EB : DirectiveInsnQC_EB<(outs), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm5:$func5, + AnyReg:$rs1, + simm16:$imm16, + bare_simm13_lsb0:$imm12), + "$opcode, $func3, $func5, $rs1, $imm16, $imm12">; +def InsnQC_EJ : DirectiveInsnQC_EJ<(outs), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + uimm5:$func5, + simm32_lsb0:$imm31), + "$opcode, $func3, $func2, $func5, $imm31">; +def InsnQC_ES : DirectiveInsnQC_ES<(outs), + (ins uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs2, + AnyReg:$rs1, + simm26:$imm26), + "$opcode, $func3, $func2, $rs2, ${imm26}(${rs1})">; +} // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo, Predicates + +let EmitPriority = 0, Predicates = [IsRV32] in { +def : InstAlias<".insn_qc.eai $opcode, $func3, $func1, $rd, $imm32", + (InsnQC_EAI AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm1:$func1, + simm32:$imm32)>; +def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, $rs1, $imm26", + (InsnQC_EI AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26)>; +def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, ${imm26}(${rs1})", + (InsnQC_EI_Mem AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + simm26:$imm26)>; +def : InstAlias<".insn_qc.ei $opcode, $func3, $func2, $rd, (${rs1})", + (InsnQC_EI_Mem AnyReg:$rd, + uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs1, + 0)>; +def : InstAlias<".insn_qc.eb $opcode, $func3, $func5, $rs1, $imm16, $imm12", + (InsnQC_EB uimm7_opcode:$opcode, + uimm3:$func3, + uimm5:$func5, + AnyReg:$rs1, + simm16:$imm16, + bare_simm13_lsb0:$imm12)>; +def : InstAlias<".insn_qc.ej $opcode, $func3, $func2, $func5, $imm31", + (InsnQC_EJ uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + uimm5:$func5, + simm32_lsb0:$imm31)>; +def : InstAlias<".insn_qc.es $opcode, $func3, $func2, $rs2, ${imm26}(${rs1})", + (InsnQC_ES uimm7_opcode:$opcode, + uimm3:$func3, + 
uimm2:$func2, + AnyReg:$rs2, + AnyReg:$rs1, + simm26:$imm26)>; +def : InstAlias<".insn_qc.es $opcode, $func3, $func2, $rs2, (${rs1})", + (InsnQC_ES uimm7_opcode:$opcode, + uimm3:$func3, + uimm2:$func2, + AnyReg:$rs2, + AnyReg:$rs1, + 0)>; +} // EmitPriority = 0, Predicates = [IsRV32] + //===----------------------------------------------------------------------===// // Instruction Class Templates //===----------------------------------------------------------------------===// diff --git a/llvm/test/MC/RISCV/insn_xqci-invalid.s b/llvm/test/MC/RISCV/insn_xqci-invalid.s new file mode 100644 index 0000000000000..8177adaf8ac50 --- /dev/null +++ b/llvm/test/MC/RISCV/insn_xqci-invalid.s @@ -0,0 +1,111 @@ +# RUN: not llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ +# RUN: 2>&1 | FileCheck -check-prefixes=CHECK-ERR %s + +.insn qc.eai 128, 0, 0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:14: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.eai 127, 8, 0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:19: error: immediate must be an integer in the range [0, 7] + +.insn qc.eai 127, 7, 2, x0, 0 +# CHECK-ERR: [[@LINE-1]]:22: error: immediate must be an integer in the range [0, 1] + +.insn qc.eai 127, 7, 1, not_a_reg, 0 +# CHECK-ERR: [[@LINE-1]]:25: error: invalid operand for instruction + +.insn qc.eai 127, 7, 1, x31, 0x100000000 +# CHECK-ERR: [[@LINE-1]]:30: error: immediate must be an integer in the range [-2147483648, 4294967295] + +.insn qc.eai 126, 7, 1, x31, 0xFFFFFFFF, extra +# CHECK-ERR: [[@LINE-1]]:42: error: invalid operand for instruction + +.insn qc.ei 128, 0, 0, x31, x0, 0 +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.ei 127, 8, 0, x0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.ei 127, 7, 4, x0, x0, 0 +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] + +.insn qc.ei 127, 7, 3, not_a_reg, x0, 0 +# CHECK-ERR: [[@LINE-1]]:24: error: invalid operand for instruction + +.insn qc.ei 127, 7, 3, x31, not_a_reg, 0 +# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.ei 127, 7, 3, x31, x31, 0x2000000 +# CHECK-ERR: [[@LINE-1]]:34: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.ei 127, 7, 3, x31, x31, 0x1000000, extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction + +.insn qc.ei 126, 7, 3, x31, 0x2000000(x0) +# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.ei 126, 7, 3, x31, 0x1000000(not_a_reg) +# CHECK-ERR: [[@LINE-1]]:39: error: expected register + +.insn qc.ei 126, 7, 3, x31, 0x1000000(x31), extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction + +.insn qc.eb 128, 0, 0, x0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.eb 127, 8, 0, x0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.eb 127, 7, 32, x0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 31] + +.insn qc.eb 127, 7, 31, not_a_reg, 0, 0 +# CHECK-ERR: [[@LINE-1]]:25: error: invalid operand for instruction + +.insn qc.eb 127, 7, 31, x31, 0x8000, 0 +# CHECK-ERR: [[@LINE-1]]:30: error: immediate must be an integer in the range [-32768, 32767] + +.insn qc.eb 127, 7, 31, x31, 
0x4000, 0x1000 +# CHECK-ERR: [[@LINE-1]]:38: error: immediate must be a multiple of 2 bytes in the range [-4096, 4094] + +.insn qc.eb 127, 7, 31, x31, 0x4000, 0x800, extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction + + +.insn qc.ej 128, 0, 0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.ej 127, 8, 0, 0, 0 +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.ej 127, 7, 4, 0, 0 +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] + +.insn qc.ej 127, 7, 3, 32, 0 +# CHECK-ERR: [[@LINE-1]]:24: error: immediate must be an integer in the range [0, 31] + +.insn qc.ej 127, 7, 3, 31, 0x100000000 +# CHECK-ERR: [[@LINE-1]]:28: error: operand must be a multiple of 2 bytes in the range [-2147483648, 2147483646] + +.insn qc.ej 127, 7, 3, 31, 0x80000000, extra +# CHECK-ERR: [[@LINE-1]]:40: error: invalid operand for instruction + +.insn qc.es 128, 0, 0, x0, 0(x0) +# CHECK-ERR: [[@LINE-1]]:13: error: opcode must be a valid opcode name or an immediate in the range [0, 127] + +.insn qc.es 127, 8, 0, x0, 0(x0) +# CHECK-ERR: [[@LINE-1]]:18: error: immediate must be an integer in the range [0, 7] + +.insn qc.es 127, 7, 4, x0, 0(x0) +# CHECK-ERR: [[@LINE-1]]:21: error: immediate must be an integer in the range [0, 3] + +.insn qc.es 127, 7, 3, not_a_reg, 0(x0) +# CHECK-ERR: [[@LINE-1]]:24: error: invalid operand for instruction + +.insn qc.es 127, 7, 3, x31, 0x2000000(x0) +# CHECK-ERR: [[@LINE-1]]:29: error: immediate must be an integer in the range [-33554432, 33554431] + +.insn qc.es 127, 7, 3, x31, 0x1000000(not_a_reg) +# CHECK-ERR: [[@LINE-1]]:39: error: expected register + +.insn qc.es 127, 7, 3, x31, 0x1000000(x31), extra +# CHECK-ERR: [[@LINE-1]]:45: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/insn_xqci.s b/llvm/test/MC/RISCV/insn_xqci.s new file mode 100644 index 0000000000000..098745ec22294 --- /dev/null +++ b/llvm/test/MC/RISCV/insn_xqci.s @@ -0,0 +1,41 @@ +# RUN: llvm-mc %s -triple=riscv32 -M no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 < %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcilia,+experimental-xqcilo,+experimental-xqcibi,+experimental-xqcilb \ +# RUN: -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s + +# CHECK-ASM: .insn qc.eai 31, 2, 0, a0, 16711935 +# CHECK-ASM: encoding: [0x1f,0x25,0xff,0x00,0xff,0x00] +# CHECK-OBJ: qc.e.addai a0, 0xff00ff +.insn qc.eai 0x1f, 2, 0, a0, 0x00FF00FF + +# CHECK-ASM: .insn qc.ei 31, 3, 2, a0, a1, 16711935 +# CHECK-ASM: encoding: [0x1f,0xb5,0xf5,0x8f,0xc0,0x3f] +# CHECK-OBJ: qc.e.addi a0, a1, 0xff00ff +.insn qc.ei 0x1f, 3, 2, a0, a1, 0x00FF00FF + +# CHECK-ASM: .insn qc.ei 31, 5, 0, a1, 16711935(a0) +# CHECK-ASM: encoding: [0x9f,0x55,0xf5,0x0f,0xc0,0x3f] +# CHECK-OBJ: qc.e.lb a1, 0xff00ff(a0) +.insn qc.ei 0x1f, 5, 0, a1, 0x00FF00FF(a0) + +# CHECK-ASM: .insn qc.ei 31, 5, 0, a1, 0(a0) +# CHECK-ASM: encoding: [0x9f,0x55,0x05,0x00,0x00,0x00] +# CHECK-OBJ: qc.e.lb a1, 0x0(a0) +.insn qc.ei 0x1f, 5, 0, a1, (a0) + +# CHECK-ASM: .insn qc.eb 31, 4, 24, a0, 17476, 22 +# CHECK-ASM: encoding: [0x1f,0x4b,0x85,0x01,0x44,0x44] +# CHECK-OBJ: qc.e.beqi a0, 0x4444, 0x2e +.insn qc.eb 0x1f, 4, 24, a0, 0x4444, 22 + +# CHECK-ASM: .insn qc.ej 31, 4, 0, 0, 22 +# CHECK-ASM: encoding: [0x1f,0x4b,0x00,0x00,0x00,0x00] +# CHECK-OBJ: qc.e.j 0x34 +.insn qc.ej 0x1f, 4, 0, 0, 22 + 
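A brief aside on what these six-byte encodings contain: RISC-V flags 48-bit instructions by the low bits 011111 of the first byte (hence the 0x1f opcodes throughout this test), and the DirectiveInsnQC_EB class earlier in this patch spells out how the remaining fields are packed. The stand-alone sketch below (not from the patch) mirrors that TableGen layout in plain C++, which is handy for checking the CHECK-ASM encodings by hand; the bit ranges come straight from the class definition, while the helper itself is illustrative only.

    // Packs a 48-bit QC.EB word from its fields, following the bit ranges
    // declared by DirectiveInsnQC_EB. Imm12 is the branch offset with its
    // implicit low zero bit already dropped (offset >> 1).
    #include <cstdint>

    uint64_t encodeQCEB(uint64_t Opcode, uint64_t Func3, uint64_t Func5,
                        uint64_t Rs1, uint64_t Imm16, uint64_t Imm12) {
      uint64_t Inst = 0;
      Inst |= (Imm16 & 0xFFFF) << 32;      // Inst{47-32} = imm16
      Inst |= ((Imm12 >> 11) & 0x1) << 31; // Inst{31}    = imm12{11}
      Inst |= ((Imm12 >> 4) & 0x3F) << 25; // Inst{30-25} = imm12{9-4}
      Inst |= (Func5 & 0x1F) << 20;        // Inst{24-20} = func5
      Inst |= (Rs1 & 0x1F) << 15;          // Inst{19-15} = rs1
      Inst |= (Func3 & 0x7) << 12;         // Inst{14-12} = func3
      Inst |= (Imm12 & 0xF) << 8;          // Inst{11-8}  = imm12{3-0}
      Inst |= ((Imm12 >> 10) & 0x1) << 7;  // Inst{7}     = imm12{10}
      Inst |= (Opcode & 0x7F);             // Inst{6-0}   = opcode
      return Inst;
    }

Fed the qc.e.beqi operands from the test above (opcode 0x1f, func3 4, func5 24, rs1 = x10, imm16 0x4444, offset 22 so Imm12 = 11), this reproduces the expected little-endian bytes 1f 4b 85 01 44 44.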
+# CHECK-ASM: .insn qc.es 31, 6, 1, a1, 0(a0) +# CHECK-ASM: encoding: [0x1f,0x60,0xb5,0x40,0x00,0x00] +# CHECK-OBJ: qc.e.sb a1, 0x0(a0) +.insn qc.es 0x1f, 6, 1, a1, (a0) diff --git a/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s b/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s index fe6d0de0a4b00..e45c43a50048a 100644 --- a/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s +++ b/llvm/test/MC/RISCV/rv64xtheadmemidx-invalid.s @@ -1,7 +1,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+xtheadmemidx < %s 2>&1 | FileCheck %s # RUN: not llvm-mc -triple riscv64 -mattr=+xtheadmemidx < %s 2>&1 | FileCheck %s -th.ldia 0(a0), (a1), 0, 0 # CHECK: :[[@LINE]]:23: error: invalid operand for instruction +th.ldia 0(a0), (a1), 0, 0 # CHECK: :[[@LINE]]:26: error: invalid operand for instruction th.ldib a0, 2(a1), 15, 1 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction th.lwia a0, (a1), 30, 2 # CHECK: :[[@LINE]]:20: error: immediate must be an integer in the range [-16, 15] th.lwib a0, (a1), -16, 43 # CHECK: :[[@LINE]]:25: error: immediate must be an integer in the range [0, 3] From 3ea7902494643517c519142002e42a65e81f40d0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 2 Apr 2025 21:48:48 -0700 Subject: [PATCH 0484/1029] [RISCV] Check S0 register list check for qc.cm.pushfp to after we parsed the whole register list. (#134180) This is more of a semantic check. The diagnostic location to has been changed to point at the register list start instead of the closing brace or whatever character might be there instead of a brace if its malformed. --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 20 +++++-------------- llvm/test/MC/RISCV/rv32xqccmp-invalid.s | 2 +- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d2af2951add6f..aa41410c735b7 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2592,20 +2592,8 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, return Error(getLoc(), "register list must start from 'ra' or 'x1'"); getLexer().Lex(); - bool SeenComma = parseOptionalToken(AsmToken::Comma); - - // There are two choices here: - // - `s0` is not required (usual case), so only try to parse `s0` if there is - // a comma - // - `s0` is required (qc.cm.pushfp), and so we must see the comma between - // `ra` and `s0` and must always try to parse `s0`, below - if (MustIncludeS0 && !SeenComma) { - Error(getLoc(), "register list must include 's0' or 'x8'"); - return ParseStatus::Failure; - } - // parse case like ,s0 (knowing the comma must be there if required) - if (SeenComma) { + if (parseOptionalToken(AsmToken::Comma)) { if (getLexer().isNot(AsmToken::Identifier)) return Error(getLoc(), "invalid register"); StringRef RegName = getLexer().getTok().getIdentifier(); @@ -2668,8 +2656,10 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, auto Encode = RISCVZC::encodeRlist(RegEnd, IsRVE); assert(Encode != RISCVZC::INVALID_RLIST); - if (MustIncludeS0) - assert(Encode != RISCVZC::RA); + + if (MustIncludeS0 && Encode == RISCVZC::RA) + return Error(S, "register list must include 's0' or 'x8'"); + Operands.push_back(RISCVOperand::createRlist(Encode, S)); return ParseStatus::Success; diff --git a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s index e43f86cbb84ee..5bfc2e3498bef 100644 --- a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s +++ 
b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s @@ -34,6 +34,6 @@ qc.cm.pushfp {ra, s0}, -12 # CHECK-ERROR: :[[@LINE+1]]:24: error: stack adjustment for register list must be a multiple of 16 bytes in the range [16, 64] qc.cm.pop {ra, s0-s1}, -40 -# CHECK-ERROR: :[[@LINE+1]]:17: error: register list must include 's0' or 'x8' +# CHECK-ERROR: :[[@LINE+1]]:14: error: register list must include 's0' or 'x8' qc.cm.pushfp {ra}, -16 From f4048268427f7a5dab4dea9b2d0fd908b8660644 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 2 Apr 2025 21:51:31 -0700 Subject: [PATCH 0485/1029] [RISCV] Don't allow '-' after 'ra' in Zcmp/Xqccmp register list. (#134182) Move the parsing of '-' under the check that we parsed a comma. Unfortunately, this leads to a poor error, but I still have more known issues in this code and may end up with an overall restructuring and want to think about wording. --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 66 +++++++++---------- llvm/test/MC/RISCV/rv64zcmp-invalid.s | 3 + 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index aa41410c735b7..d90d1dda07081 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2604,46 +2604,46 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, return Error(getLoc(), "continuous register list must start from 's0' or 'x8'"); getLexer().Lex(); // eat reg - } - - // parse case like -s1 - if (parseOptionalToken(AsmToken::Minus)) { - StringRef EndName = getLexer().getTok().getIdentifier(); - // FIXME: the register mapping and checks of RVE is wrong - RegEnd = matchRegisterNameHelper(EndName); - if (!(RegEnd == RISCV::X9 || - (RegEnd >= RISCV::X18 && RegEnd <= RISCV::X27))) - return Error(getLoc(), "invalid register"); - getLexer().Lex(); - } - // parse extra part like ', x18[-x20]' for XRegList - if (parseOptionalToken(AsmToken::Comma)) { - if (RegEnd != RISCV::X9) - return Error( - getLoc(), - "first contiguous registers pair of register list must be 'x8-x9'"); + // parse case like -s1 + if (parseOptionalToken(AsmToken::Minus)) { + StringRef EndName = getLexer().getTok().getIdentifier(); + // FIXME: the register mapping and checks of RVE is wrong + RegEnd = matchRegisterNameHelper(EndName); + if (!(RegEnd == RISCV::X9 || + (RegEnd >= RISCV::X18 && RegEnd <= RISCV::X27))) + return Error(getLoc(), "invalid register"); + getLexer().Lex(); + } - // parse ', x18' for extra part - if (getLexer().isNot(AsmToken::Identifier) || IsRVE) - return Error(getLoc(), "invalid register"); - StringRef EndName = getLexer().getTok().getIdentifier(); - RegEnd = MatchRegisterName(EndName); - if (RegEnd != RISCV::X18) - return Error(getLoc(), - "second contiguous registers pair of register list " - "must start from 'x18'"); - getLexer().Lex(); + // parse extra part like ', x18[-x20]' for XRegList + if (parseOptionalToken(AsmToken::Comma)) { + if (RegEnd != RISCV::X9) + return Error( + getLoc(), + "first contiguous registers pair of register list must be 'x8-x9'"); - // parse '-x20' for extra part - if (parseOptionalToken(AsmToken::Minus)) { + // parse ', x18' for extra part if (getLexer().isNot(AsmToken::Identifier) || IsRVE) return Error(getLoc(), "invalid register"); - EndName = getLexer().getTok().getIdentifier(); + StringRef EndName = getLexer().getTok().getIdentifier(); RegEnd = MatchRegisterName(EndName); - if (!(RegEnd >= RISCV::X19 && RegEnd <= 
RISCV::X27)) - return Error(getLoc(), "invalid register"); + if (RegEnd != RISCV::X18) + return Error(getLoc(), + "second contiguous registers pair of register list " + "must start from 'x18'"); getLexer().Lex(); + + // parse '-x20' for extra part + if (parseOptionalToken(AsmToken::Minus)) { + if (getLexer().isNot(AsmToken::Identifier) || IsRVE) + return Error(getLoc(), "invalid register"); + EndName = getLexer().getTok().getIdentifier(); + RegEnd = MatchRegisterName(EndName); + if (!(RegEnd >= RISCV::X19 && RegEnd <= RISCV::X27)) + return Error(getLoc(), "invalid register"); + getLexer().Lex(); + } } } diff --git a/llvm/test/MC/RISCV/rv64zcmp-invalid.s b/llvm/test/MC/RISCV/rv64zcmp-invalid.s index ffaffdf6a5999..c66415cb49b34 100644 --- a/llvm/test/MC/RISCV/rv64zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv64zcmp-invalid.s @@ -54,3 +54,6 @@ cm.pop {ra}, -x1 # CHECK-ERROR: :[[@LINE+1]]:15: error: stack adjustment is invalid for this instruction and register list cm.push {ra}, x1 + +# CHECK-ERROR: :[[@LINE+1]]:12: error: register list must end with '}' +cm.push {x1-x9}, -32 From 7f2abe8fd10e611696cbc637e715160851b1902b Mon Sep 17 00:00:00 2001 From: Snehasish Kumar Date: Wed, 2 Apr 2025 23:11:17 -0600 Subject: [PATCH 0486/1029] Revert "[Metadata] Preserve MD_prof when merging instructions when one is missing." (#134200) Reverts llvm/llvm-project#132433 I suspect this change caused a failure in the bolt build bot. https://lab.llvm.org/buildbot/#/builders/113/builds/6621 ``` !9185 = !{!"branch_weights", i32 3912, i32 802} Wrong number of operands !9185 = !{!"branch_weights", i32 3912, i32 802} fatal error: error in backend: Broken module found, compilation aborted! ``` --- llvm/lib/Transforms/Utils/Local.cpp | 19 ++---- ...rect-call-branch-weights-preserve-hoist.ll | 62 ------------------ ...irect-call-branch-weights-preserve-sink.ll | 63 ------------------- 3 files changed, 6 insertions(+), 138 deletions(-) delete mode 100644 llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-hoist.ll delete mode 100644 llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-sink.ll diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index c136825d47b9c..edec0e7a94422 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3355,10 +3355,9 @@ static void combineMetadata(Instruction *K, const Instruction *J, case LLVMContext::MD_invariant_group: // Preserve !invariant.group in K. break; - // Keep empty cases for prof, mmra, memprof, and callsite to prevent them - // from being removed as unknown metadata. The actual merging is handled + // Keep empty cases for mmra, memprof, and callsite to prevent them from + // being removed as unknown metadata. The actual merging is handled // separately below. - case LLVMContext::MD_prof: case LLVMContext::MD_mmra: case LLVMContext::MD_memprof: case LLVMContext::MD_callsite: @@ -3387,6 +3386,10 @@ static void combineMetadata(Instruction *K, const Instruction *J, if (!AAOnly) K->setMetadata(Kind, JMD); break; + case LLVMContext::MD_prof: + if (!AAOnly && DoesKMove) + K->setMetadata(Kind, MDNode::getMergedProfMetadata(KMD, JMD, K, J)); + break; case LLVMContext::MD_noalias_addrspace: if (DoesKMove) K->setMetadata(Kind, @@ -3433,16 +3436,6 @@ static void combineMetadata(Instruction *K, const Instruction *J, K->setMetadata(LLVMContext::MD_callsite, MDNode::getMergedCallsiteMetadata(KCallSite, JCallSite)); } - - // Merge prof metadata. 
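For context on the verifier error quoted above: in its basic form a !prof "branch_weights" node is the tag string plus one i32 weight per value the annotated instruction expects, meaning one weight per successor for a br or switch but a single weight on a call, so a two-weight node that ends up attached to a call trips "Wrong number of operands". A hedged sketch of building a well-formed node through MDBuilder, which sizes the operand list for you; this is illustrative usage, not code from this revert:

    // Attach two-successor branch weights to a conditional branch.
    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/MDBuilder.h"
    #include <cstdint>

    void annotateBranch(llvm::BranchInst *BI, uint32_t TakenWeight,
                        uint32_t NotTakenWeight) {
      llvm::MDBuilder MDB(BI->getContext());
      // Produces !{!"branch_weights", i32 TakenWeight, i32 NotTakenWeight}.
      BI->setMetadata(llvm::LLVMContext::MD_prof,
                      MDB.createBranchWeights(TakenWeight, NotTakenWeight));
    }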
- // Handle separately to support cases where only one instruction has the - // metadata. - auto *JProf = J->getMetadata(LLVMContext::MD_prof); - auto *KProf = K->getMetadata(LLVMContext::MD_prof); - if (!AAOnly && (JProf || KProf)) { - K->setMetadata(LLVMContext::MD_prof, - MDNode::getMergedProfMetadata(KProf, JProf, K, J)); - } } void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J, diff --git a/llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-hoist.ll b/llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-hoist.ll deleted file mode 100644 index d6058134f5285..0000000000000 --- a/llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-hoist.ll +++ /dev/null @@ -1,62 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 2 -; RUN: opt < %s -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefix=HOIST - -; Test case based on C++ code with manualy annotated !prof metadata. -; This is to test that when calls to 'func1' from 'if.then' block -; and 'if.else' block are hoisted, the branch_weights are merged and -; attached to merged call rather than dropped. -; -; int func1(int a, int b) ; -; int func2(int a, int b) ; - -; int func(int a, int b, bool c) { -; int sum= 0; -; if(c) { -; sum += func1(a, b); -; } else { -; sum += func1(a, b); -; sum -= func2(a, b); -; } -; return sum; -; } -define i32 @_Z4funciib(i32 %a, i32 %b, i1 %c) { -; HOIST-LABEL: define i32 @_Z4funciib -; HOIST-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i1 [[C:%.*]]) { -; HOIST-NEXT: entry: -; HOIST-NEXT: [[CALL:%.*]] = tail call i32 @_Z5func1ii(i32 [[A]], i32 [[B]]), !prof [[PROF0:![0-9]+]] -; HOIST-NEXT: br i1 [[C]], label [[IF_END:%.*]], label [[IF_ELSE:%.*]] -; HOIST: if.else: -; HOIST-NEXT: [[CALL3:%.*]] = tail call i32 @_Z5func2ii(i32 [[A]], i32 [[B]]) -; HOIST-NEXT: [[SUB:%.*]] = sub i32 [[CALL]], [[CALL3]] -; HOIST-NEXT: br label [[IF_END]] -; HOIST: if.end: -; HOIST-NEXT: [[SUM_0:%.*]] = phi i32 [ [[SUB]], [[IF_ELSE]] ], [ [[CALL]], [[ENTRY:%.*]] ] -; HOIST-NEXT: ret i32 [[SUM_0]] -; -entry: - br i1 %c, label %if.then, label %if.else - -if.then: ; preds = %entry - %call = tail call i32 @_Z5func1ii(i32 %a, i32 %b) - br label %if.end - -if.else: ; preds = %entry - %call1 = tail call i32 @_Z5func1ii(i32 %a, i32 %b), !prof !0 - %call3 = tail call i32 @_Z5func2ii(i32 %a, i32 %b) - %sub = sub i32 %call1, %call3 - br label %if.end - -if.end: ; preds = %if.else, %if.then - %sum.0 = phi i32 [ %call, %if.then ], [ %sub, %if.else ] - ret i32 %sum.0 -} - -declare i32 @_Z5func1ii(i32, i32) - -declare i32 @_Z5func2ii(i32, i32) - -!0 = !{!"branch_weights", i32 10} - -;. -; HOIST: [[PROF0]] = !{!"branch_weights", i32 10} -;. diff --git a/llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-sink.ll b/llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-sink.ll deleted file mode 100644 index c4aed5eb95888..0000000000000 --- a/llvm/test/Transforms/SimplifyCFG/merge-direct-call-branch-weights-preserve-sink.ll +++ /dev/null @@ -1,63 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 2 -; RUN: opt < %s -passes='simplifycfg' -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefix=SINK - - -; Test case based on the following C++ code with manualy annotated !prof metadata. 
-; This is to test that when calls to 'func1' from 'if.then' and 'if.else' are -; sinked, the branch weights are merged and attached to sinked call. -; -; int func1(int a, int b) ; -; int func2(int a, int b) ; - -; int func(int a, int b, bool c) { -; int sum = 0; -; if (c) { -; sum += func1(a,b); -; } else { -; b -= func2(a,b); -; sum += func1(a,b); -; } -; return sum; -; } - -define i32 @_Z4funciib(i32 %a, i32 %b, i1 %c) { -; SINK-LABEL: define i32 @_Z4funciib -; SINK-SAME: (i32 [[A:%.*]], i32 [[B:%.*]], i1 [[C:%.*]]) { -; SINK-NEXT: entry: -; SINK-NEXT: br i1 [[C]], label [[IF_END:%.*]], label [[IF_ELSE:%.*]] -; SINK: if.else: -; SINK-NEXT: [[CALL1:%.*]] = tail call i32 @_Z5func2ii(i32 [[A]], i32 [[B]]) -; SINK-NEXT: [[SUB:%.*]] = sub i32 [[B]], [[CALL1]] -; SINK-NEXT: br label [[IF_END]] -; SINK: if.end: -; SINK-NEXT: [[SUB_SINK:%.*]] = phi i32 [ [[SUB]], [[IF_ELSE]] ], [ [[B]], [[ENTRY:%.*]] ] -; SINK-NEXT: [[CALL2:%.*]] = tail call i32 @_Z5func1ii(i32 [[A]], i32 [[SUB_SINK]]), !prof [[PROF0:![0-9]+]] -; SINK-NEXT: ret i32 [[CALL2]] -; -entry: - br i1 %c, label %if.then, label %if.else - -if.then: ; preds = %entry - %call = tail call i32 @_Z5func1ii(i32 %a, i32 %b), !prof !0 - br label %if.end - -if.else: ; preds = %entry - %call1 = tail call i32 @_Z5func2ii(i32 %a, i32 %b) - %sub = sub i32 %b, %call1 - %call2 = tail call i32 @_Z5func1ii(i32 %a, i32 %sub) - br label %if.end - -if.end: ; preds = %if.else, %if.then - %sum.0 = phi i32 [ %call, %if.then ], [ %call2, %if.else ] - ret i32 %sum.0 -} - -declare i32 @_Z5func1ii(i32, i32) - -declare i32 @_Z5func2ii(i32, i32) - -!0 = !{!"branch_weights", i32 10} - -;. -; SINK: [[PROF0]] = !{!"branch_weights", i32 10} -;. From a19c018379a1d08eceb5db533a19a6bf37423975 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Wed, 2 Apr 2025 23:01:51 -0700 Subject: [PATCH 0487/1029] Revert "[lldb][debugserver] Save and restore the SVE/SME register state (#134184)" This reverts commit 4e40c7c4bd66d98f529a807dbf410dc46444f4ca. arm64 CI is getting a failure in lldb-api.tools/lldb-server.TestGdbRemoteRegisterState.py with this commit, need to investigate and re-land. 
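One detail worth calling out in the reverted hunks below: the register-state setters are combined with bitwise OR, as in SetGPRState() | SetVFPState() | SetEXCState() | SetDBGState(false). That idiom works as a pass/fail aggregate only because KERN_SUCCESS is 0, so any nonzero failure code survives the OR; note the combined value is not a meaningful error code when more than one call fails. A self-contained sketch with stub functions, illustrative only:

    #include <cstdio>

    typedef int kern_return_t;          // stand-in for <mach/kern_return.h>
    const kern_return_t KERN_SUCCESS = 0;

    kern_return_t SetGPRState() { return KERN_SUCCESS; } // stubs standing in
    kern_return_t SetVFPState() { return 5; }            // for the real
    kern_return_t SetEXCState() { return KERN_SUCCESS; } // thread-state calls

    int main() {
      kern_return_t ret = SetGPRState() | SetVFPState() | SetEXCState();
      if (ret != KERN_SUCCESS)
        std::printf("at least one state write failed (aggregate %d)\n", ret);
      return 0;
    }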
--- .../source/MacOSX/arm64/DNBArchImplARM64.cpp | 47 +++---------------- 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp index d32a63daa5672..34a4ee21f8502 100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp @@ -2952,15 +2952,8 @@ kern_return_t DNBArchMachARM64::SetRegisterState(int set) { return err; switch (set) { - case e_regSetALL: { - kern_return_t ret = - SetGPRState() | SetVFPState() | SetEXCState() | SetDBGState(false); - if (CPUHasSME()) { - ret |= SetSVEState(); - ret |= SetSMEState(); - } - return ret; - } + case e_regSetALL: + return SetGPRState() | SetVFPState() | SetEXCState() | SetDBGState(false); case e_regSetGPR: return SetGPRState(); case e_regSetVFP: @@ -3126,20 +3119,9 @@ uint32_t DNBArchMachARM64::SaveRegisterState() { "error: GPR regs failed to read: %u ", kret); } else if ((kret = GetVFPState(force)) != KERN_SUCCESS) { - DNBLogThreadedIf(LOG_THREAD, - "DNBArchMachARM64::SaveRegisterState () " - "error: %s regs failed to read: %u", + DNBLogThreadedIf(LOG_THREAD, "DNBArchMachARM64::SaveRegisterState () " + "error: %s regs failed to read: %u", "VFP", kret); - } else if (CPUHasSME() && (kret = SetSVEState() != KERN_SUCCESS)) { - DNBLogThreadedIf(LOG_THREAD, - "DNBArchMachARM64::SaveRegisterState () " - "error: %s regs failed to read: %u", - "SVE", kret); - } else if (CPUHasSME() && (kret = SetSMEState() != KERN_SUCCESS)) { - DNBLogThreadedIf(LOG_THREAD, - "DNBArchMachARM64::SaveRegisterState () " - "error: %s regs failed to read: %u", - "SME", kret); } else { const uint32_t save_id = GetNextRegisterStateSaveID(); m_saved_register_states[save_id] = m_state.context; @@ -3162,26 +3144,11 @@ bool DNBArchMachARM64::RestoreRegisterState(uint32_t save_id) { save_id, kret); success = false; } else if ((kret = SetVFPState()) != KERN_SUCCESS) { - DNBLogThreadedIf(LOG_THREAD, - "DNBArchMachARM64::RestoreRegisterState " - "(save_id = %u) error: %s regs failed to " - "write: %u", + DNBLogThreadedIf(LOG_THREAD, "DNBArchMachARM64::RestoreRegisterState " + "(save_id = %u) error: %s regs failed to " + "write: %u", save_id, "VFP", kret); success = false; - } else if ((kret = SetSVEState()) != KERN_SUCCESS) { - DNBLogThreadedIf(LOG_THREAD, - "DNBArchMachARM64::RestoreRegisterState " - "(save_id = %u) error: %s regs failed to " - "write: %u", - save_id, "SVE", kret); - success = false; - } else if ((kret = SetSMEState()) != KERN_SUCCESS) { - DNBLogThreadedIf(LOG_THREAD, - "DNBArchMachARM64::RestoreRegisterState " - "(save_id = %u) error: %s regs failed to " - "write: %u", - save_id, "SME", kret); - success = false; } m_saved_register_states.erase(pos); return success; From b384d6d6ccc8f4452cd7086061c657ce76b41224 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 3 Apr 2025 14:04:19 +0800 Subject: [PATCH 0488/1029] [CodeGen] Don't include CGDebugInfo.h in CodeGenFunction.h (NFC) (#134100) This is an expensive header, only include it where needed. Move some functions out of line to achieve that. This reduces time to build clang by ~0.5% in terms of instructions retired. 
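The mechanics of this change are the classic out-of-line-definition trick: when a member function's body is the only thing that needs a heavy header, moving that body into a .cpp file lets the header get by with a forward declaration. A sketch with hypothetical names follows; the patch's real instances are LexicalScope's constructor/destructor and incrementProfileCounter.

    // widget.h -- no longer includes the expensive "debug_info.h"
    class DebugInfo;        // forward declaration is now enough
    class Widget {
    public:
      Widget();
      ~Widget();            // defined out of line, in widget.cpp
    private:
      DebugInfo *DI;        // pointers and references only need the fwd decl
    };

    // widget.cpp -- the single translation unit that pays for the header
    #include "widget.h"
    #include "debug_info.h"
    Widget::Widget() : DI(nullptr) {}
    Widget::~Widget() {}    // may use DebugInfo members here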
--- clang/lib/CodeGen/CGBuiltin.cpp | 1 + clang/lib/CodeGen/CGCall.cpp | 1 + clang/lib/CodeGen/CGCoroutine.cpp | 3 ++- clang/lib/CodeGen/CGDebugInfo.cpp | 20 +++++++++++++++ clang/lib/CodeGen/CGDeclCXX.cpp | 1 + clang/lib/CodeGen/CGException.cpp | 1 + clang/lib/CodeGen/CGExprAgg.cpp | 1 + clang/lib/CodeGen/CGExprComplex.cpp | 1 + clang/lib/CodeGen/CGNonTrivialStruct.cpp | 1 + clang/lib/CodeGen/CGOpenMPRuntime.cpp | 1 + clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 1 + clang/lib/CodeGen/CGStmtOpenMP.cpp | 1 + clang/lib/CodeGen/CGVTables.cpp | 1 + clang/lib/CodeGen/CodeGenFunction.h | 32 +++--------------------- clang/lib/CodeGen/CodeGenPGO.cpp | 12 +++++++++ clang/lib/CodeGen/CodeGenTypes.cpp | 1 + clang/lib/CodeGen/ItaniumCXXABI.cpp | 1 + clang/lib/CodeGen/MicrosoftCXXABI.cpp | 1 + clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 1 + 19 files changed, 53 insertions(+), 29 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 91ac7c5847b02..310addebd50e9 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14,6 +14,7 @@ #include "ABIInfo.h" #include "CGCUDARuntime.h" #include "CGCXXABI.h" +#include "CGDebugInfo.h" #include "CGObjCRuntime.h" #include "CGOpenCLRuntime.h" #include "CGRecordLayout.h" diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 3cefa3b0c585c..b202255c3a15b 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -17,6 +17,7 @@ #include "CGBlocks.h" #include "CGCXXABI.h" #include "CGCleanup.h" +#include "CGDebugInfo.h" #include "CGRecordLayout.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index a9795c2c0dc8f..0fc488e98aaf0 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -11,10 +11,11 @@ //===----------------------------------------------------------------------===// #include "CGCleanup.h" +#include "CGDebugInfo.h" #include "CodeGenFunction.h" -#include "llvm/ADT/ScopeExit.h" #include "clang/AST/StmtCXX.h" #include "clang/AST/StmtVisitor.h" +#include "llvm/ADT/ScopeExit.h" using namespace clang; using namespace CodeGen; diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 52aa956121d73..d659243d38d5f 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -6223,3 +6223,23 @@ CGDebugInfo::createConstantValueExpression(const clang::ValueDecl *VD, return nullptr; } + +CodeGenFunction::LexicalScope::LexicalScope(CodeGenFunction &CGF, + SourceRange Range) + : RunCleanupsScope(CGF), Range(Range), ParentScope(CGF.CurLexicalScope) { + CGF.CurLexicalScope = this; + if (CGDebugInfo *DI = CGF.getDebugInfo()) + DI->EmitLexicalBlockStart(CGF.Builder, Range.getBegin()); +} + +CodeGenFunction::LexicalScope::~LexicalScope() { + if (CGDebugInfo *DI = CGF.getDebugInfo()) + DI->EmitLexicalBlockEnd(CGF.Builder, Range.getEnd()); + + // If we should perform a cleanup, force them now. Note that + // this ends the cleanup scope before rescoping any labels. 
+ if (PerformCleanup) { + ApplyDebugLocation DL(CGF, Range.getEnd()); + ForceCleanup(); + } +} diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 33c048b48795c..e0921993bd14e 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CGCXXABI.h" +#include "CGDebugInfo.h" #include "CGHLSLRuntime.h" #include "CGObjCRuntime.h" #include "CGOpenMPRuntime.h" diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index b4b8c2952b02b..ebecb3aa5241d 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -12,6 +12,7 @@ #include "CGCXXABI.h" #include "CGCleanup.h" +#include "CGDebugInfo.h" #include "CGObjCRuntime.h" #include "CodeGenFunction.h" #include "ConstantEmitter.h" diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index c8bdda375d1b1..87b2a73fb0c03 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CGCXXABI.h" +#include "CGDebugInfo.h" #include "CGHLSLRuntime.h" #include "CGObjCRuntime.h" #include "CGRecordLayout.h" diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index 184a355734046..f556594f4a9ec 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CGDebugInfo.h" #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" diff --git a/clang/lib/CodeGen/CGNonTrivialStruct.cpp b/clang/lib/CodeGen/CGNonTrivialStruct.cpp index d90c44d770d14..c634b5c010e2d 100644 --- a/clang/lib/CodeGen/CGNonTrivialStruct.cpp +++ b/clang/lib/CodeGen/CGNonTrivialStruct.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "CGDebugInfo.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "clang/AST/NonTrivialTypeVisitor.h" diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index b706fa3759c0d..5736864d4cc6b 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -14,6 +14,7 @@ #include "ABIInfoImpl.h" #include "CGCXXABI.h" #include "CGCleanup.h" +#include "CGDebugInfo.h" #include "CGRecordLayout.h" #include "CodeGenFunction.h" #include "TargetInfo.h" diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index feb2448297542..e86fbffe63252 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "CGOpenMPRuntimeGPU.h" +#include "CGDebugInfo.h" #include "CodeGenFunction.h" #include "clang/AST/Attr.h" #include "clang/AST/DeclOpenMP.h" diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index e4d1db264aac9..156f64bb5f508 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CGCleanup.h" +#include "CGDebugInfo.h" #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" 
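The destructor body moved earlier in this patch into CGDebugInfo.cpp leans on an RAII guard: ApplyDebugLocation pins the builder's debug location for exactly as long as the guard object lives, so ForceCleanup() emits under the scope's end location and the previous location snaps back automatically. Below is a stand-alone sketch of that shape using simplified stand-in types rather than the real CodeGen classes:

    #include <string>
    #include <utility>

    struct Builder { std::string CurLoc; }; // stand-in for the IR builder

    // Swap a location in on construction, restore the old one on scope exit.
    class ScopedLocation {
      Builder &B;
      std::string Saved;
    public:
      ScopedLocation(Builder &Bld, std::string Loc)
          : B(Bld), Saved(std::move(Bld.CurLoc)) {
        B.CurLoc = std::move(Loc);
      }
      ~ScopedLocation() { B.CurLoc = std::move(Saved); }
    };

    void forceCleanup(Builder &B) { /* would emit IR at B.CurLoc */ }

    void endScope(Builder &B, std::string EndLoc) {
      ScopedLocation DL(B, std::move(EndLoc)); // like ApplyDebugLocation
      forceCleanup(B);                         // runs with EndLoc applied
    }                                          // old location restored here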
diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index dcd1fa77fa834..0a1cf24fbfa56 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "CGCXXABI.h" +#include "CGDebugInfo.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "clang/AST/Attr.h" diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index dd73d3b3a75f3..af9798b30fbcf 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -14,7 +14,6 @@ #define LLVM_CLANG_LIB_CODEGEN_CODEGENFUNCTION_H #include "CGBuilder.h" -#include "CGDebugInfo.h" #include "CGLoopInfo.h" #include "CGValue.h" #include "CodeGenModule.h" @@ -48,6 +47,7 @@ namespace llvm { class BasicBlock; +class ConvergenceControlInst; class LLVMContext; class MDNode; class SwitchInst; @@ -1105,13 +1105,7 @@ class CodeGenFunction : public CodeGenTypeCache { public: /// Enter a new cleanup scope. - explicit LexicalScope(CodeGenFunction &CGF, SourceRange Range) - : RunCleanupsScope(CGF), Range(Range), - ParentScope(CGF.CurLexicalScope) { - CGF.CurLexicalScope = this; - if (CGDebugInfo *DI = CGF.getDebugInfo()) - DI->EmitLexicalBlockStart(CGF.Builder, Range.getBegin()); - } + explicit LexicalScope(CodeGenFunction &CGF, SourceRange Range); void addLabel(const LabelDecl *label) { assert(PerformCleanup && "adding label to dead scope?"); @@ -1120,17 +1114,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// Exit this cleanup scope, emitting any accumulated /// cleanups. - ~LexicalScope() { - if (CGDebugInfo *DI = CGF.getDebugInfo()) - DI->EmitLexicalBlockEnd(CGF.Builder, Range.getEnd()); - - // If we should perform a cleanup, force them now. Note that - // this ends the cleanup scope before rescoping any labels. - if (PerformCleanup) { - ApplyDebugLocation DL(CGF, Range.getEnd()); - ForceCleanup(); - } - } + ~LexicalScope(); /// Force the emission of cleanups now, instead of waiting /// until this object is destroyed. @@ -1691,15 +1675,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// Increment the profiler's counter for the given statement by \p StepV. /// If \p StepV is null, the default increment is 1. 
-  void incrementProfileCounter(const Stmt *S, llvm::Value *StepV = nullptr) {
-    if (CGM.getCodeGenOpts().hasProfileClangInstr() &&
-        !CurFn->hasFnAttribute(llvm::Attribute::NoProfile) &&
-        !CurFn->hasFnAttribute(llvm::Attribute::SkipProfile)) {
-      auto AL = ApplyDebugLocation::CreateArtificial(*this);
-      PGO.emitCounterSetOrIncrement(Builder, S, StepV);
-    }
-    PGO.setCurrentStmt(S);
-  }
+  void incrementProfileCounter(const Stmt *S, llvm::Value *StepV = nullptr);
 
   bool isMCDCCoverageEnabled() const {
     return (CGM.getCodeGenOpts().hasProfileClangInstr() &&
diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index 83a5f20e79aba..afa1d882545f0 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CodeGenPGO.h"
+#include "CGDebugInfo.h"
 #include "CodeGenFunction.h"
 #include "CoverageMappingGen.h"
 #include "clang/AST/RecursiveASTVisitor.h"
@@ -1512,3 +1513,14 @@ CodeGenFunction::createProfileWeightsForLoop(const Stmt *Cond,
   return createProfileWeights(LoopCount,
                               std::max(*CondCount, LoopCount) - LoopCount);
 }
+
+void CodeGenFunction::incrementProfileCounter(const Stmt *S,
+                                              llvm::Value *StepV) {
+  if (CGM.getCodeGenOpts().hasProfileClangInstr() &&
+      !CurFn->hasFnAttribute(llvm::Attribute::NoProfile) &&
+      !CurFn->hasFnAttribute(llvm::Attribute::SkipProfile)) {
+    auto AL = ApplyDebugLocation::CreateArtificial(*this);
+    PGO.emitCounterSetOrIncrement(Builder, S, StepV);
+  }
+  PGO.setCurrentStmt(S);
+}
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 11cf5758b6d3a..b94c11802a268 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -13,6 +13,7 @@
 #include "CodeGenTypes.h"
 #include "CGCXXABI.h"
 #include "CGCall.h"
+#include "CGDebugInfo.h"
 #include "CGHLSLRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGRecordLayout.h"
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 38e3a63ebfb11..2822d526a54b0 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -19,6 +19,7 @@
 
 #include "CGCXXABI.h"
 #include "CGCleanup.h"
+#include "CGDebugInfo.h"
 #include "CGRecordLayout.h"
 #include "CGVTables.h"
 #include "CodeGenFunction.h"
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 464d4370284fb..91689bb2ec75f 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -16,6 +16,7 @@
 #include "ABIInfo.h"
 #include "CGCXXABI.h"
 #include "CGCleanup.h"
+#include "CGDebugInfo.h"
 #include "CGVTables.h"
 #include "CodeGenModule.h"
 #include "CodeGenTypes.h"
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index afe25b5418424..ec2e9722028f1 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -12,6 +12,7 @@
 
 #include "ABIInfo.h"
 #include "CGBuiltin.h"
+#include "CGDebugInfo.h"
 #include "TargetInfo.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "llvm/IR/InlineAsm.h"

From 7e65944292278cc245e36cc6ca971654d584012d Mon Sep 17 00:00:00 2001
From: Hua Tian
Date: Thu, 3 Apr 2025 14:25:55 +0800
Subject: [PATCH 0489/1029] [llvm][CodeGen] avoid repeated interval calculation in window scheduler (#132352)

Some new registers are reused when replacing some old ones in certain
use cases of ModuloScheduleExpander.
It is necessary to avoid repeated interval calculations for these registers. --- llvm/include/llvm/CodeGen/ModuloSchedule.h | 4 - llvm/lib/CodeGen/ModuloSchedule.cpp | 32 +----- .../AArch64/aarch64-swp-ws-live-intervals.mir | 103 ++++++++++++++++++ 3 files changed, 108 insertions(+), 31 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-swp-ws-live-intervals.mir diff --git a/llvm/include/llvm/CodeGen/ModuloSchedule.h b/llvm/include/llvm/CodeGen/ModuloSchedule.h index b6000ba05d882..49dc746d3ee35 100644 --- a/llvm/include/llvm/CodeGen/ModuloSchedule.h +++ b/llvm/include/llvm/CodeGen/ModuloSchedule.h @@ -188,9 +188,6 @@ class ModuloScheduleExpander { /// Instructions to change when emitting the final schedule. InstrChangesTy InstrChanges; - /// Record the registers that need to compute live intervals. - SmallVector NoIntervalRegs; - void generatePipelinedLoop(); void generateProlog(unsigned LastStage, MachineBasicBlock *KernelBB, ValueMapTy *VRMap, MBBVectorTy &PrologBBs); @@ -214,7 +211,6 @@ class ModuloScheduleExpander { void addBranches(MachineBasicBlock &PreheaderBB, MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs, ValueMapTy *VRMap); - void calculateIntervals(); bool computeDelta(MachineInstr &MI, unsigned &Delta); void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI, unsigned Num); diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index d208a62a99372..352093ab6bdf9 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -181,10 +181,6 @@ void ModuloScheduleExpander::generatePipelinedLoop() { // Add branches between prolog and epilog blocks. addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap); - // The intervals of newly created virtual registers are calculated after the - // kernel expansion. - calculateIntervals(); - delete[] VRMap; delete[] VRMapPhi; } @@ -549,10 +545,8 @@ void ModuloScheduleExpander::generateExistingPhis( if (VRMap[LastStageNum - np - 1].count(LoopVal)) PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal]; - if (IsLast && np == NumPhis - 1) { + if (IsLast && np == NumPhis - 1) replaceRegUsesAfterLoop(Def, NewReg, BB, MRI); - NoIntervalRegs.push_back(NewReg); - } continue; } } @@ -592,10 +586,8 @@ void ModuloScheduleExpander::generateExistingPhis( // Check if we need to rename any uses that occurs after the loop. The // register to replace depends on whether the Phi is scheduled in the // epilog. - if (IsLast && np == NumPhis - 1) { + if (IsLast && np == NumPhis - 1) replaceRegUsesAfterLoop(Def, NewReg, BB, MRI); - NoIntervalRegs.push_back(NewReg); - } // In the kernel, a dependent Phi uses the value from this Phi. if (InKernel) @@ -615,10 +607,8 @@ void ModuloScheduleExpander::generateExistingPhis( if (NumStages == 0 && IsLast) { auto &CurStageMap = VRMap[CurStageNum]; auto It = CurStageMap.find(LoopVal); - if (It != CurStageMap.end()) { + if (It != CurStageMap.end()) replaceRegUsesAfterLoop(Def, It->second, BB, MRI); - NoIntervalRegs.push_back(It->second); - } } } } @@ -738,10 +728,8 @@ void ModuloScheduleExpander::generatePhis( rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def, NewReg); } - if (IsLast && np == NumPhis - 1) { + if (IsLast && np == NumPhis - 1) replaceRegUsesAfterLoop(Def, NewReg, BB, MRI); - NoIntervalRegs.push_back(NewReg); - } } } } @@ -953,14 +941,6 @@ void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB, } } -/// Some registers are generated during the kernel expansion. 
We calculate the -/// live intervals of these registers after the expansion. -void ModuloScheduleExpander::calculateIntervals() { - for (Register Reg : NoIntervalRegs) - LIS.createAndComputeVirtRegInterval(Reg); - NoIntervalRegs.clear(); -} - /// Return true if we can compute the amount the instruction changes /// during each iteration. Set Delta to the amount of the change. bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) { @@ -1081,10 +1061,8 @@ void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI, Register NewReg = MRI.createVirtualRegister(RC); MO.setReg(NewReg); VRMap[CurStageNum][reg] = NewReg; - if (LastDef) { + if (LastDef) replaceRegUsesAfterLoop(reg, NewReg, BB, MRI); - NoIntervalRegs.push_back(NewReg); - } } else if (MO.isUse()) { MachineInstr *Def = MRI.getVRegDef(reg); // Compute the stage that contains the last definition for instruction. diff --git a/llvm/test/CodeGen/AArch64/aarch64-swp-ws-live-intervals.mir b/llvm/test/CodeGen/AArch64/aarch64-swp-ws-live-intervals.mir new file mode 100644 index 0000000000000..48f02452e3597 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-swp-ws-live-intervals.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc --mtriple=aarch64 %s -run-pass=pipeliner -o - | FileCheck %s + +... +--- +name: foo +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[FMOVD0_:%[0-9]+]]:fpr64 = FMOVD0 + ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 1 + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64sp = SUBREG_TO_REG 0, [[MOVi32imm]], %subreg.sub_32 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[FADDDrr:%[0-9]+]]:fpr64 = nofpexcept FADDDrr [[FMOVD0_]], [[FMOVD0_]], implicit $fpcr + ; CHECK-NEXT: [[SUBSXri:%[0-9]+]]:gpr64 = nsw SUBSXri [[SUBREG_TO_REG]], 1, 0, implicit-def $nzcv + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[SUBSXri]] + ; CHECK-NEXT: [[FMOVDi:%[0-9]+]]:fpr64 = FMOVDi 112 + ; CHECK-NEXT: Bcc 0, %bb.7, implicit $nzcv + ; CHECK-NEXT: B %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.5(0x80000000), %bb.6(0x00000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[FADDDrr1:%[0-9]+]]:fpr64 = nofpexcept FADDDrr [[FADDDrr]], [[FMOVD0_]], implicit $fpcr + ; CHECK-NEXT: [[FADDDrr2:%[0-9]+]]:fpr64 = nofpexcept FADDDrr [[FMOVD0_]], [[FMOVD0_]], implicit $fpcr + ; CHECK-NEXT: [[SUBSXri1:%[0-9]+]]:gpr64 = nsw SUBSXri [[COPY1]], 1, 0, implicit-def $nzcv + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64all = COPY [[SUBSXri1]] + ; CHECK-NEXT: [[FMOVDi1:%[0-9]+]]:fpr64 = FMOVDi 112 + ; CHECK-NEXT: Bcc 0, %bb.6, implicit $nzcv + ; CHECK-NEXT: B %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: successors: %bb.6(0x04000000), %bb.5(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr64sp = PHI [[COPY2]], %bb.4, %24, %bb.5 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:fpr64 = PHI [[FMOVDi1]], %bb.4, %25, %bb.5 + ; CHECK-NEXT: [[PHI2:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.4, [[PHI1]], %bb.5 + ; CHECK-NEXT: [[PHI3:%[0-9]+]]:fpr64 = PHI [[FADDDrr2]], %bb.4, %22, %bb.5 + ; CHECK-NEXT: [[PHI4:%[0-9]+]]:fpr64 = PHI [[FADDDrr1]], %bb.4, %23, %bb.5 + ; CHECK-NEXT: [[SUBSXri2:%[0-9]+]]:gpr64 = nsw SUBSXri 
[[PHI]], 1, 0, implicit-def $nzcv + ; CHECK-NEXT: [[FADDDrr3:%[0-9]+]]:fpr64 = nofpexcept FADDDrr [[PHI2]], [[FMOVD0_]], implicit $fpcr + ; CHECK-NEXT: [[FADDDrr4:%[0-9]+]]:fpr64 = nofpexcept FADDDrr [[PHI3]], [[PHI2]], implicit $fpcr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64all = COPY [[SUBSXri2]] + ; CHECK-NEXT: STRDui [[PHI4]], [[COPY]], 0 + ; CHECK-NEXT: [[FMOVDi2:%[0-9]+]]:fpr64 = FMOVDi 112 + ; CHECK-NEXT: Bcc 1, %bb.5, implicit $nzcv + ; CHECK-NEXT: B %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6: + ; CHECK-NEXT: successors: %bb.7(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI5:%[0-9]+]]:fpr64 = PHI [[FMOVDi]], %bb.4, [[PHI1]], %bb.5 + ; CHECK-NEXT: [[PHI6:%[0-9]+]]:fpr64 = PHI [[FADDDrr2]], %bb.4, [[FADDDrr3]], %bb.5 + ; CHECK-NEXT: [[PHI7:%[0-9]+]]:fpr64 = PHI [[FADDDrr1]], %bb.4, [[FADDDrr4]], %bb.5 + ; CHECK-NEXT: STRDui [[PHI7]], [[COPY]], 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.7: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI8:%[0-9]+]]:fpr64 = PHI [[FMOVD0_]], %bb.3, [[PHI5]], %bb.6 + ; CHECK-NEXT: [[PHI9:%[0-9]+]]:fpr64 = PHI [[FADDDrr]], %bb.3, [[PHI6]], %bb.6 + ; CHECK-NEXT: [[FADDDrr5:%[0-9]+]]:fpr64 = nofpexcept FADDDrr [[PHI9]], [[PHI8]], implicit $fpcr + ; CHECK-NEXT: STRDui [[FADDDrr5]], [[COPY]], 0 + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + successors: %bb.1(0x80000000) + liveins: $x0 + + %0:gpr64common = COPY $x0 + %1:fpr64 = FMOVD0 + %2:gpr32 = MOVi32imm 1 + %3:gpr64all = SUBREG_TO_REG 0, killed %2, %subreg.sub_32 + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %4:gpr64sp = PHI %3, %bb.0, %5, %bb.1 + %6:fpr64 = PHI %1, %bb.0, %7, %bb.1 + %8:fpr64 = PHI %1, %bb.0, %6, %bb.1 + %9:fpr64 = nofpexcept FADDDrr %8, %1, implicit $fpcr + %10:fpr64 = nofpexcept FADDDrr killed %9, %6, implicit $fpcr + STRDui killed %10, %0, 0 + %11:gpr64 = nsw SUBSXri %4, 1, 0, implicit-def $nzcv + %5:gpr64all = COPY %11 + %7:fpr64 = FMOVDi 112 + Bcc 1, %bb.1, implicit $nzcv + B %bb.2 + + bb.2: + RET_ReallyLR + +... From 3295970d846b0d820b863f9eeac559b80239297e Mon Sep 17 00:00:00 2001 From: Iris <0.0@owo.li> Date: Thu, 3 Apr 2025 14:34:09 +0800 Subject: [PATCH 0490/1029] [ConstantFolding] Add support for `sinh` and `cosh` intrinsics in constant folding (#132671) Closes #132503. 
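For intuition: sinh(ln 2) = (2 - 1/2)/2 = 0.75 and cosh(ln 2) = (2 + 1/2)/2 = 1.25, which is exactly what the ln2 tests below check. A rough standalone sketch of the rule being added (hypothetical helper, not the actual ConstantFolding.cpp entry points): finite inputs are folded by evaluating the host libm function, while NaN, infinity, poison, and undef operands are left unfolded, matching the tests.

    #include <cmath>
    #include <optional>

    // Sketch of the sinh/cosh constant-folding rule.
    std::optional<double> foldSinhCosh(bool IsSinh, double Arg) {
      if (std::isnan(Arg) || std::isinf(Arg))
        return std::nullopt; // no fold; the intrinsic call stays in the IR
      return IsSinh ? std::sinh(Arg) : std::cosh(Arg);
    }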
--- llvm/lib/Analysis/ConstantFolding.cpp | 6 + .../ConstProp/sinh-cosh-intrinsics.ll | 174 ++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index b0ba25c3c16ac..dc905ab03e861 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1651,6 +1651,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::sincos: + case Intrinsic::sinh: + case Intrinsic::cosh: case Intrinsic::pow: case Intrinsic::powi: case Intrinsic::ldexp: @@ -2513,6 +2515,10 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, return ConstantFoldFP(sin, APF, Ty); case Intrinsic::cos: return ConstantFoldFP(cos, APF, Ty); + case Intrinsic::sinh: + return ConstantFoldFP(sinh, APF, Ty); + case Intrinsic::cosh: + return ConstantFoldFP(cosh, APF, Ty); case Intrinsic::sqrt: return ConstantFoldFP(sqrt, APF, Ty); case Intrinsic::amdgcn_cos: diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll b/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll new file mode 100644 index 0000000000000..a4f318bbc834c --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll @@ -0,0 +1,174 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instsimplify < %s | FileCheck %s + +define double @test_sinh_0() { +; CHECK-LABEL: define double @test_sinh_0() { +; CHECK-NEXT: ret double 0.000000e+00 +; + %result = call double @llvm.sinh.f64(double 0.0) + ret double %result +} + +define double @test_sinh_ln2() { +; CHECK-LABEL: define double @test_sinh_ln2() { +; CHECK-NEXT: ret double 7.500000e-01 +; + %res = call double @llvm.sinh.f64(double 0x3fe62e42fefa39ef) + ret double %res +} + +define <2 x double> @test_sinh_v2() { +; CHECK-LABEL: define <2 x double> @test_sinh_v2() { +; CHECK-NEXT: ret <2 x double> zeroinitializer +; + %result = call <2 x double> @llvm.sinh.v2f64(<2 x double> zeroinitializer) + ret <2 x double> %result +} + +define double @test_sinh_neg0() { +; CHECK-LABEL: define double @test_sinh_neg0() { +; CHECK-NEXT: ret double -0.000000e+00 +; + %res = call double @llvm.sinh.f64(double -0.0) + ret double %res +} + +define double @test_sinh_poison() { +; CHECK-LABEL: define double @test_sinh_poison() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double poison) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.sinh.f64(double poison) + ret double %res +} + +define double @test_sinh_undef() { +; CHECK-LABEL: define double @test_sinh_undef() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double undef) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.sinh.f64(double undef) + ret double %res +} + +define double @test_sinh_snan() { +; CHECK-LABEL: define double @test_sinh_snan() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double 0x7FF0000000000001) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.sinh.f64(double 0x7ff0000000000001) + ret double %res +} + +define double @test_sinh_qnan() { +; CHECK-LABEL: define double @test_sinh_qnan() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double 0x7FF8000000000000) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.sinh.f64(double 0x7ff8000000000000) + 
ret double %res +} + +define double @test_sinh_pos_inf() { +; CHECK-LABEL: define double @test_sinh_pos_inf() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double 0x7FF0000000000000) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.sinh.f64(double 0x7ff0000000000000) + ret double %res +} + +define double @test_sinh_neg_inf() { +; CHECK-LABEL: define double @test_sinh_neg_inf() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double 0xFFF0000000000000) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.sinh.f64(double 0xfff0000000000000) + ret double %res +} + +define double @test_cosh_0() { +; CHECK-LABEL: define double @test_cosh_0() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %result = call double @llvm.cosh.f64(double 0.0) + ret double %result +} + +define double @test_cosh_ln2() { +; CHECK-LABEL: define double @test_cosh_ln2() { +; CHECK-NEXT: ret double 1.250000e+00 +; + %res = call double @llvm.cosh.f64(double 0x3fe62e42fefa39ef) + ret double %res +} + +define <2 x double> @test_cosh_v2() { +; CHECK-LABEL: define <2 x double> @test_cosh_v2() { +; CHECK-NEXT: ret <2 x double> splat (double 1.000000e+00) +; + %result = call <2 x double> @llvm.cosh.v2f64(<2 x double> zeroinitializer) + ret <2 x double> %result +} + +define double @test_cosh_neg0() { +; CHECK-LABEL: define double @test_cosh_neg0() { +; CHECK-NEXT: ret double 1.000000e+00 +; + %res = call double @llvm.cosh.f64(double -0.0) + ret double %res +} + +define double @test_cosh_poison() { +; CHECK-LABEL: define double @test_cosh_poison() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double poison) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.cosh.f64(double poison) + ret double %res +} + +define double @test_cosh_undef() { +; CHECK-LABEL: define double @test_cosh_undef() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double undef) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.cosh.f64(double undef) + ret double %res +} + +define double @test_cosh_snan() { +; CHECK-LABEL: define double @test_cosh_snan() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double 0x7FF0000000000001) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.cosh.f64(double 0x7ff0000000000001) + ret double %res +} + +define double @test_cosh_qnan() { +; CHECK-LABEL: define double @test_cosh_qnan() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double 0x7FF8000000000000) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.cosh.f64(double 0x7ff8000000000000) + ret double %res +} + +define double @test_cosh_pos_inf() { +; CHECK-LABEL: define double @test_cosh_pos_inf() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double 0x7FF0000000000000) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.cosh.f64(double 0x7ff0000000000000) + ret double %res +} + +define double @test_cosh_neg_inf() { +; CHECK-LABEL: define double @test_cosh_neg_inf() { +; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double 0xFFF0000000000000) +; CHECK-NEXT: ret double [[RES]] +; + %res = call double @llvm.cosh.f64(double 0xfff0000000000000) + ret double %res +} From b6c0ce0bb67d822fac1e3b42461f66c261c1157c Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 3 Apr 2025 14:47:47 +0800 Subject: [PATCH 0491/1029] [IR][NFC] Use `SwitchInst::defaultDestUnreachable` (#134199) --- llvm/include/llvm/IR/Instructions.h | 2 +- llvm/include/llvm/SandboxIR/Instruction.h | 4 ++-- llvm/lib/Analysis/InlineCost.cpp | 16 
++++++++-------- .../Scalar/CorrelatedValuePropagation.cpp | 5 ++--- llvm/lib/Transforms/Utils/Local.cpp | 4 +--- llvm/lib/Transforms/Utils/LowerSwitch.cpp | 2 +- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 17 ++++++----------- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 5 +++-- 8 files changed, 24 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 9a4fb2985264b..95f0ef875fc07 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -3366,7 +3366,7 @@ class SwitchInst : public Instruction { /// Returns true if the default branch must result in immediate undefined /// behavior, false otherwise. - bool defaultDestUndefined() const { + bool defaultDestUnreachable() const { return isa(getDefaultDest()->getFirstNonPHIOrDbg()); } diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h index 49ea6707ecd82..ce5a2cbec85bd 100644 --- a/llvm/include/llvm/SandboxIR/Instruction.h +++ b/llvm/include/llvm/SandboxIR/Instruction.h @@ -1865,8 +1865,8 @@ class SwitchInst : public SingleLLVMInstructionImpl { Value *getCondition() const; void setCondition(Value *V); BasicBlock *getDefaultDest() const; - bool defaultDestUndefined() const { - return cast(Val)->defaultDestUndefined(); + bool defaultDestUnreachable() const { + return cast(Val)->defaultDestUnreachable(); } void setDefaultDest(BasicBlock *DefaultCase); unsigned getNumCases() const { diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 9f193b610328b..30e1af602667c 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -344,7 +344,7 @@ class CallAnalyzer : public InstVisitor { /// Called at the end of processing a switch instruction, with the given /// number of case clusters. virtual void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, - bool DefaultDestUndefined) {} + bool DefaultDestUnreachable) {} /// Called to account for any other instruction not specifically accounted /// for. @@ -722,14 +722,14 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { } void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, - bool DefaultDestUndefined) override { + bool DefaultDestUnreachable) override { // If suitable for a jump table, consider the cost for the table size and // branch to destination. // Maximum valid cost increased in this function. if (JumpTableSize) { // Suppose a default branch includes one compare and one conditional // branch if it's reachable. - if (!DefaultDestUndefined) + if (!DefaultDestUnreachable) addCost(2 * InstrCost); // Suppose a jump table requires one load and one jump instruction. int64_t JTCost = @@ -742,7 +742,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { // Suppose a comparison includes one compare and one conditional branch. // We can reduce a set of instructions if the default branch is // undefined. 
- addCost((NumCaseCluster - DefaultDestUndefined) * 2 * InstrCost); + addCost((NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost); return; } @@ -1268,9 +1268,9 @@ class InlineCostFeaturesAnalyzer final : public CallAnalyzer { } void onFinalizeSwitch(unsigned JumpTableSize, unsigned NumCaseCluster, - bool DefaultDestUndefined) override { + bool DefaultDestUnreachable) override { if (JumpTableSize) { - if (!DefaultDestUndefined) + if (!DefaultDestUnreachable) increment(InlineCostFeatureIndex::switch_default_dest_penalty, SwitchDefaultDestCostMultiplier * InstrCost); int64_t JTCost = static_cast(JumpTableSize) * InstrCost + @@ -1281,7 +1281,7 @@ class InlineCostFeaturesAnalyzer final : public CallAnalyzer { if (NumCaseCluster <= 3) { increment(InlineCostFeatureIndex::case_cluster_penalty, - (NumCaseCluster - DefaultDestUndefined) * + (NumCaseCluster - DefaultDestUnreachable) * CaseClusterCostMultiplier * InstrCost); return; } @@ -2508,7 +2508,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); - onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUndefined()); + onFinalizeSwitch(JumpTableSize, NumCaseCluster, SI.defaultDestUnreachable()); return false; } diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 314a5d15f0f88..5226aeb66f65a 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -410,9 +410,8 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, ++ReachableCaseCount; } - BasicBlock *DefaultDest = SI->getDefaultDest(); - if (ReachableCaseCount > 1 && - !isa(DefaultDest->getFirstNonPHIOrDbg())) { + if (ReachableCaseCount > 1 && !SI->defaultDestUnreachable()) { + BasicBlock *DefaultDest = SI->getDefaultDest(); ConstantRange CR = LVI->getConstantRangeAtUse(I->getOperandUse(0), /*UndefAllowed*/ false); // The default dest is unreachable if all cases are covered. diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index edec0e7a94422..2f3ea2266e07f 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -203,10 +203,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, BasicBlock *TheOnlyDest = DefaultDest; // If the default is unreachable, ignore it when searching for TheOnlyDest. - if (isa(DefaultDest->getFirstNonPHIOrDbg()) && - SI->getNumCases() > 0) { + if (SI->defaultDestUnreachable() && SI->getNumCases() > 0) TheOnlyDest = SI->case_begin()->getCaseSuccessor(); - } bool Changed = false; diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp index c1999fed44296..b70310b364598 100644 --- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp +++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp @@ -388,7 +388,7 @@ void ProcessSwitchInst(SwitchInst *SI, ConstantInt *UpperBound = nullptr; bool DefaultIsUnreachableFromSwitch = false; - if (isa(Default->getFirstNonPHIOrDbg())) { + if (SI->defaultDestUnreachable()) { // Make the bounds tightly fitted around the case value range, because we // know that the value passed to the switch must be exactly one of the case // values. 
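To make the InlineCost.cpp accounting above concrete (illustrative arithmetic; InstrCost shown with LLVM's conventional value of 5, an assumption of this sketch):

    // Compare-chain lowering with 3 case clusters:
    constexpr int InstrCost = 5;                  // assumed unit cost
    constexpr unsigned NumCaseCluster = 3;
    constexpr bool DefaultDestUnreachable = true; // bool converts to 1
    constexpr int Cost =
        (NumCaseCluster - DefaultDestUnreachable) * 2 * InstrCost; // 20, not 30

An unreachable default thus saves exactly one compare-plus-branch pair per switch; the rename from defaultDestUndefined to defaultDestUnreachable changes none of this arithmetic.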
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 998677af3411e..e7c550be00b14 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -5715,8 +5715,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { assert(SI->getNumCases() > 1 && "Degenerate switch?"); - bool HasDefault = - !isa(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + bool HasDefault = !SI->defaultDestUnreachable(); auto *BB = SI->getParent(); @@ -5879,8 +5878,7 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, // default destination becomes dead and we can remove it. If we know some // of the bits in the value, we can use that to more precisely compute the // number of possible unique case values. - bool HasDefault = - !isa(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + bool HasDefault = !SI->defaultDestUnreachable(); const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); @@ -6237,11 +6235,8 @@ static bool initializeUniqueCases(SwitchInst *SI, PHINode *&PHI, // is unreachable. DefaultResult = DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr; - if ((!DefaultResult && - !isa(DefaultDest->getFirstNonPHIOrDbg()))) - return false; - return true; + return DefaultResult || SI->defaultDestUnreachable(); } // Helper function that checks if it is possible to transform a switch with only @@ -6948,7 +6943,7 @@ static bool switchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If the default destination is unreachable, or if the lookup table covers // all values of the conditional variable, branch directly to the lookup table // BB. Otherwise, check that the condition is within the case range. - bool DefaultIsReachable = !SI->defaultDestUndefined(); + bool DefaultIsReachable = !SI->defaultDestUnreachable(); bool TableHasHoles = (NumResults < TableSize); @@ -7281,7 +7276,7 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder, // We perform this optimization only for switches with // unreachable default case. // This assumtion will save us from checking if `Condition` is a power of two. - if (!isa(SI->getDefaultDest()->getFirstNonPHIOrDbg())) + if (!SI->defaultDestUnreachable()) return false; // Check that switch cases are powers of two. @@ -7363,7 +7358,7 @@ static bool simplifySwitchOfCmpIntrinsic(SwitchInst *SI, IRBuilderBase &Builder, assert(Missing.size() == 1 && "Should have one case left"); Res = *Missing.begin(); - } else if (SI->getNumCases() == 3 && SI->defaultDestUndefined()) { + } else if (SI->getNumCases() == 3 && SI->defaultDestUnreachable()) { // Normalize so that Succ is taken once and OtherSucc twice. Unreachable = SI->getDefaultDest(); Succ = OtherSucc = nullptr; diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index bdc9c2c222ae5..bac2e888019d4 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -4222,8 +4222,9 @@ define void @foo(i32 %cond0, i32 %cond1) { EXPECT_EQ(Switch->getDefaultDest(), Ctx.getValue(LLVMSwitch->getDefaultDest())); EXPECT_EQ(Switch->getDefaultDest(), Default); - // Check defaultDestUndefined(). - EXPECT_EQ(Switch->defaultDestUndefined(), LLVMSwitch->defaultDestUndefined()); + // Check defaultDestUnreachable(). 
+  EXPECT_EQ(Switch->defaultDestUnreachable(),
+            LLVMSwitch->defaultDestUnreachable());
   // Check setDefaultDest().
   auto *OrigDefaultDest = Switch->getDefaultDest();
   auto *NewDefaultDest = Entry;

From 91f3965be43ccb5291fcb5578b62648a1ece17bc Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga
Date: Thu, 3 Apr 2025 16:21:19 +0900
Subject: [PATCH 0492/1029] [LoopInterchange] Fix the vectorizable check for a loop (#133667)

In the profitability check for vectorization, the dependency matrix was
not handled correctly. This can lead to a wrong decision: the check may
say "this loop can be vectorized" when in fact it cannot. The root
cause is that the check returns early as soon as it finds '=' or 'I' in
the dependency matrix. To make sure that we can actually vectorize the
loop, we need to check all the rows of the matrix. This patch fixes the
vectorizability check so that it no longer makes a wrong decision for a
loop that cannot be vectorized.

Related: #131130
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 44 ++++++++++++-------
 .../profitability-vectorization-heuristic.ll  |  9 ++--
 2 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index e777f950a7c5a..1dccba4cfa7b8 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1197,25 +1197,35 @@ LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
   return std::nullopt;
 }
 
+/// Return true if we can vectorize the loop specified by \p LoopId.
+static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
+  for (unsigned I = 0; I != DepMatrix.size(); I++) {
+    char Dir = DepMatrix[I][LoopId];
+    if (Dir != 'I' && Dir != '=')
+      return false;
+  }
+  return true;
+}
+
 std::optional<bool>
 LoopInterchangeProfitability::isProfitableForVectorization(
     unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
-  for (auto &Row : DepMatrix) {
-    // If the inner loop is loop independent or doesn't carry any dependency
-    // it is not profitable to move this to outer position, since we are
-    // likely able to do inner loop vectorization already.
-    if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
-      return std::optional<bool>(false);
-
-    // If the outer loop is not loop independent it is not profitable to move
-    // this to inner position, since doing so would not enable inner loop
-    // parallelism.
-    if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
-      return std::optional<bool>(false);
-  }
-  // If inner loop has dependence and outer loop is loop independent then it
-  // is/ profitable to interchange to enable inner loop parallelism.
-  // If there are no dependences, interchanging will not improve anything.
-  return std::optional<bool>(!DepMatrix.empty());
+  // If the outer loop is not loop independent it is not profitable to move
+  // this to inner position, since doing so would not enable inner loop
+  // parallelism.
+  if (!canVectorize(DepMatrix, OuterLoopId))
+    return false;
+
+  // If inner loop has dependence and outer loop is loop independent then it is
+  // profitable to interchange to enable inner loop parallelism.
+  if (!canVectorize(DepMatrix, InnerLoopId))
+    return true;
+
+  // If both the inner and the outer loop can be vectorized, it is necessary to
+  // check the cost of each vectorized loop for profitability decision. At this
+  // time we do not have a cost model to estimate them, so return nullopt.
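To see why row-by-row early exits were wrong, consider a hypothetical two-loop nest (columns ordered [OuterLoopId, InnerLoopId]; this example is not from the patch):

    // CharMatrix is LoopInterchange's std::vector<std::vector<char>>;
    // each row describes one dependence, each column one loop.
    CharMatrix DepMatrix = {{'<', '='},   // carried by the outer loop
                            {'=', '<'}};  // carried by the inner loop

The old scan hit Row[InnerLoopId] == '=' in the first row and immediately answered "not profitable, the inner loop is already vectorizable"; but the second row's '<' in the inner column means the inner loop carries a dependence and cannot be vectorized at all. canVectorize avoids this by inspecting the entire column for a loop before drawing any conclusion.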
+ // TODO: Estimate the cost of vectorized loop when both the outer and the + // inner loop can be vectorized. + return std::nullopt; } bool LoopInterchangeProfitability::isProfitable( diff --git a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll index efd2a9c09e7cf..0f5aee582373d 100644 --- a/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll +++ b/llvm/test/Transforms/LoopInterchange/profitability-vectorization-heuristic.ll @@ -15,16 +15,13 @@ ; } ; } ; -; FIXME: These loops are not exchanged at this time due to the problem in -; profitability heuristic calculation for vectorization. -; CHECK: --- !Missed +; CHECK: --- !Passed ; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: InterchangeNotProfitable +; CHECK-NEXT: Name: Interchanged ; CHECK-NEXT: Function: interchange_necessary_for_vectorization ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Interchanging loops is not considered to improve cache locality nor vectorization. -; CHECK-NEXT: ... +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. define void @interchange_necessary_for_vectorization() { entry: br label %for.i.header From 041e84261a502a28401813bf55aa778ee0bbcdeb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Thu, 3 Apr 2025 09:22:38 +0200 Subject: [PATCH 0493/1029] [Clang][AMDGPU] Expose buffer load lds as a clang builtin (#132048) CK is using either inline assembly or inline LLVM-IR builtins to generate buffer_load_dword lds instructions. This patch exposes this instruction as a Clang builtin available on gfx9 and gfx10. Related to SWDEV-519702 and SWDEV-518861 --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 ++ clang/include/clang/Basic/DiagnosticSemaKinds.td | 4 ++-- clang/lib/Sema/SemaAMDGPU.cpp | 7 +++---- .../CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl | 9 +++++++++ .../builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl | 10 ++++++++++ ...tins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl | 6 ++++++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 +++- 7 files changed, 35 insertions(+), 7 deletions(-) create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index c6c0bf7d8388d..cbef637be213a 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -163,6 +163,8 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b64, "V2UiQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n") +TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts") + //===----------------------------------------------------------------------===// // Ballot builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 3f9ba933582da..1993cd5accc22 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13056,6 +13056,6 @@ def err_acc_decl_for_routine : Error<"expected function or lambda declaration for 'routine' construct">; // AMDGCN builtins diagnostics -def err_amdgcn_global_load_lds_size_invalid_value : Error<"invalid size value">; -def note_amdgcn_global_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; +def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; +def note_amdgcn_load_lds_size_valid_value : Note<"size must be %select{1, 2, or 4|1, 2, 4, 12 or 16}0">; } // end of sema component. diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index a4d075dfd0768..7fec099374152 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -35,6 +35,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, Builtin::evaluateRequiredTargetFeatures("gfx950-insts", CallerFeatureMap); switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds: case AMDGPU::BI__builtin_amdgcn_global_load_lds: { constexpr const int SizeIdx = 2; llvm::APSInt Size; @@ -54,11 +55,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, [[fallthrough]]; } default: - Diag(ArgExpr->getExprLoc(), - diag::err_amdgcn_global_load_lds_size_invalid_value) + Diag(ArgExpr->getExprLoc(), diag::err_amdgcn_load_lds_size_invalid_value) << ArgExpr->getSourceRange(); - Diag(ArgExpr->getExprLoc(), - diag::note_amdgcn_global_load_lds_size_valid_value) + Diag(ArgExpr->getExprLoc(), diag::note_amdgcn_load_lds_size_valid_value) << HasGFX950Insts << ArgExpr->getSourceRange(); return true; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl index 3403b69e07e4b..5e3ed9027c17a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load.cl @@ -170,3 +170,12 @@ v3u32 test_amdgcn_raw_ptr_buffer_load_b96_non_const_soffset(__amdgpu_buffer_rsrc v4u32 test_amdgcn_raw_ptr_buffer_load_b128_non_const_soffset(__amdgpu_buffer_rsrc_t rsrc, int offset, int soffset) { return __builtin_amdgcn_raw_buffer_load_b128(rsrc, /*offset=*/0, soffset, /*aux=*/0); } + +// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_lds( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 1, i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3) +// CHECK-NEXT: ret void +// +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int offset, int soffset) { + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl new file mode 100644 index 0000000000000..5915393ae7f56 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -S -verify=gfx90a,expected -o - %s +// RUN: %clang_cc1 -triple 
amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s +// REQUIRES: amdgpu-registered-target + +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) { + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, x, offset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl new file mode 100644 index 0000000000000..768f894e9180d --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) { + __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index ebac0f9029791..217e43fcce4fd 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1863,7 +1863,9 @@ class AMDGPURawBufferLoadLDS : Intrinsic < ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; -class AMDGPURawPtrBufferLoadLDS : Intrinsic < +class AMDGPURawPtrBufferLoadLDS : + ClangBuiltin<"__builtin_amdgcn_raw_ptr_buffer_load_lds">, + Intrinsic < [], [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset From 73e1710a4d5629cce5aaebc01cab1f76e3de5c84 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 3 Apr 2025 15:22:51 +0800 Subject: [PATCH 0494/1029] [SimplifyCFG] Remove unused variable. NFC. (#134211) --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index e7c550be00b14..eac7e7c209c95 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6228,7 +6228,6 @@ static bool initializeUniqueCases(SwitchInst *SI, PHINode *&PHI, } // Find the default result value. 
   SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
-  BasicBlock *DefaultDest = SI->getDefaultDest();
   getCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
                  DefaultResults, DL, TTI);
   // If the default value is not found abort unless the default destination

From e1aaee7ea218f1d89646fa1f43bb4c94c27808b5 Mon Sep 17 00:00:00 2001
From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com>
Date: Thu, 3 Apr 2025 08:27:13 +0100
Subject: [PATCH 0495/1029] [modules] Handle friend function that was a definition but became only a declaration during AST deserialization (#132214)

Fix for regression #130917: the changes in #111992 were too broad. This
change reduces the scope of the previous fix. Added
`ExternalASTSource::wasThisDeclarationADefinition` to detect cases when
a FunctionDecl lost its body due to declaration merges.
---
 clang/include/clang/AST/ExternalASTSource.h   |  4 ++
 .../clang/Sema/MultiplexExternalSemaSource.h  |  2 +
 clang/include/clang/Serialization/ASTReader.h |  6 +++
 clang/lib/AST/ExternalASTSource.cpp           |  4 ++
 .../lib/Sema/MultiplexExternalSemaSource.cpp  |  8 ++++
 .../lib/Sema/SemaTemplateInstantiateDecl.cpp  | 12 +++---
 clang/lib/Serialization/ASTReader.cpp         |  4 ++
 clang/lib/Serialization/ASTReaderDecl.cpp     |  3 ++
 .../friend-default-parameters-modules.cpp     | 39 +++++++++++++++++++
 .../SemaCXX/friend-default-parameters.cpp     | 21 ++++++++++
 10 files changed, 98 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/SemaCXX/friend-default-parameters-modules.cpp
 create mode 100644 clang/test/SemaCXX/friend-default-parameters.cpp

diff --git a/clang/include/clang/AST/ExternalASTSource.h b/clang/include/clang/AST/ExternalASTSource.h
index 42aed56d42e07..f45e3af7602c1 100644
--- a/clang/include/clang/AST/ExternalASTSource.h
+++ b/clang/include/clang/AST/ExternalASTSource.h
@@ -191,6 +191,10 @@ class ExternalASTSource : public RefCountedBase<ExternalASTSource> {
 
   virtual ExtKind hasExternalDefinitions(const Decl *D);
 
+  /// True if this function declaration was a definition before in its own
+  /// module.
+  virtual bool wasThisDeclarationADefinition(const FunctionDecl *FD);
+
   /// Finds all declarations lexically contained within the given
   /// DeclContext, after applying an optional filter predicate.
   ///
diff --git a/clang/include/clang/Sema/MultiplexExternalSemaSource.h b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
index 921bebe3a44af..391c2177d75ec 100644
--- a/clang/include/clang/Sema/MultiplexExternalSemaSource.h
+++ b/clang/include/clang/Sema/MultiplexExternalSemaSource.h
@@ -92,6 +92,8 @@ class MultiplexExternalSemaSource : public ExternalSemaSource {
 
   ExtKind hasExternalDefinitions(const Decl *D) override;
 
+  bool wasThisDeclarationADefinition(const FunctionDecl *FD) override;
+
   /// Find all declarations with the given name in the
   /// given context.
   bool FindExternalVisibleDeclsByName(const DeclContext *DC,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 2779b3d1cf2ea..58fcc06c3696d 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1392,6 +1392,10 @@
   llvm::DenseMap<const Decl *, bool> DefinitionSource;
 
+  /// Friend functions that were defined but might have had their bodies
+  /// removed.
+  llvm::DenseSet<const FunctionDecl *> ThisDeclarationWasADefinitionSet;
+
   bool shouldDisableValidationForFile(const serialization::ModuleFile &M) const;
 
   /// Reads a statement from the specified cursor.
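The new hook is an ordinary virtual on ExternalASTSource with a false default, so only sources that can drop function bodies need to override it. A minimal sketch of such an override (hypothetical MySource; not part of this patch):

    #include "clang/AST/Decl.h"
    #include "clang/AST/ExternalASTSource.h"
    #include "llvm/ADT/DenseSet.h"

    class MySource : public clang::ExternalASTSource {
      // Functions this source serialized as definitions, even if their body
      // was later dropped when declarations from different modules merged.
      llvm::DenseSet<const clang::FunctionDecl *> WasDefinition;

    public:
      bool wasThisDeclarationADefinition(
          const clang::FunctionDecl *FD) override {
        return WasDefinition.contains(FD);
      }
    };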
@@ -2374,6 +2378,8 @@ class ASTReader ExtKind hasExternalDefinitions(const Decl *D) override; + bool wasThisDeclarationADefinition(const FunctionDecl *FD) override; + /// Retrieve a selector from the given module with its local ID /// number. Selector getLocalSelector(ModuleFile &M, unsigned LocalID); diff --git a/clang/lib/AST/ExternalASTSource.cpp b/clang/lib/AST/ExternalASTSource.cpp index e2451f294741d..3e865cb7679b5 100644 --- a/clang/lib/AST/ExternalASTSource.cpp +++ b/clang/lib/AST/ExternalASTSource.cpp @@ -38,6 +38,10 @@ ExternalASTSource::hasExternalDefinitions(const Decl *D) { return EK_ReplyHazy; } +bool ExternalASTSource::wasThisDeclarationADefinition(const FunctionDecl *FD) { + return false; +} + void ExternalASTSource::FindFileRegionDecls(FileID File, unsigned Offset, unsigned Length, SmallVectorImpl &Decls) {} diff --git a/clang/lib/Sema/MultiplexExternalSemaSource.cpp b/clang/lib/Sema/MultiplexExternalSemaSource.cpp index 6d945300c386c..fbfb242598c24 100644 --- a/clang/lib/Sema/MultiplexExternalSemaSource.cpp +++ b/clang/lib/Sema/MultiplexExternalSemaSource.cpp @@ -107,6 +107,14 @@ MultiplexExternalSemaSource::hasExternalDefinitions(const Decl *D) { return EK_ReplyHazy; } +bool MultiplexExternalSemaSource::wasThisDeclarationADefinition( + const FunctionDecl *FD) { + for (const auto &S : Sources) + if (S->wasThisDeclarationADefinition(FD)) + return true; + return false; +} + bool MultiplexExternalSemaSource::FindExternalVisibleDeclsByName( const DeclContext *DC, DeclarationName Name, const DeclContext *OriginalDC) { diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 8aaaea0bcdd66..9ea5ecab2d030 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -2604,11 +2604,13 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( // Friend function defined withing class template may stop being function // definition during AST merges from different modules, in this case decl // with function body should be used for instantiation. - if (isFriend) { - const FunctionDecl *Defn = nullptr; - if (D->hasBody(Defn)) { - D = const_cast(Defn); - FunctionTemplate = Defn->getDescribedFunctionTemplate(); + if (ExternalASTSource *Source = SemaRef.Context.getExternalSource()) { + if (isFriend && Source->wasThisDeclarationADefinition(D)) { + const FunctionDecl *Defn = nullptr; + if (D->hasBody(Defn)) { + D = const_cast(Defn); + FunctionTemplate = Defn->getDescribedFunctionTemplate(); + } } } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 58a57d6c54523..8e573a11efd35 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9661,6 +9661,10 @@ ExternalASTSource::ExtKind ASTReader::hasExternalDefinitions(const Decl *FD) { return I->second ? 
EK_Never : EK_Always; } +bool ASTReader::wasThisDeclarationADefinition(const FunctionDecl *FD) { + return ThisDeclarationWasADefinitionSet.contains(FD); +} + Selector ASTReader::getLocalSelector(ModuleFile &M, unsigned LocalID) { return DecodeSelector(getGlobalSelectorID(M, LocalID)); } diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 77daeaee5dd1f..b838f84c973de 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -523,6 +523,9 @@ void ASTDeclReader::ReadFunctionDefinition(FunctionDecl *FD) { } // Store the offset of the body so we can lazily load it later. Reader.PendingBodies[FD] = GetCurrentCursorOffset(); + // For now remember ThisDeclarationWasADefinition only for friend functions. + if (FD->getFriendObjectKind()) + Reader.ThisDeclarationWasADefinitionSet.insert(FD); } void ASTDeclReader::Visit(Decl *D) { diff --git a/clang/test/SemaCXX/friend-default-parameters-modules.cpp b/clang/test/SemaCXX/friend-default-parameters-modules.cpp new file mode 100644 index 0000000000000..9c4aff9f1964a --- /dev/null +++ b/clang/test/SemaCXX/friend-default-parameters-modules.cpp @@ -0,0 +1,39 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -std=c++20 -fmodule-map-file=modules.map -xc++ -emit-module -fmodule-name=foo modules.map -o foo.pcm +// RUN: %clang_cc1 -std=c++20 -fmodule-map-file=modules.map -O1 -emit-obj main.cc -verify -fmodule-file=foo.pcm + +//--- modules.map +module "foo" { + export * + module "foo.h" { + export * + header "foo.h" + } +} + +//--- foo.h +#pragma once + +template +void Create(const void* = nullptr); + +template +struct ObjImpl { + template + friend void ::Create(const void*); +}; + +template +void Create(const void*) { + (void) ObjImpl{}; +} + +//--- main.cc +// expected-no-diagnostics +#include "foo.h" + +int main() { + Create<42>(); +} diff --git a/clang/test/SemaCXX/friend-default-parameters.cpp b/clang/test/SemaCXX/friend-default-parameters.cpp new file mode 100644 index 0000000000000..7190477ac496a --- /dev/null +++ b/clang/test/SemaCXX/friend-default-parameters.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -std=c++20 -verify -emit-llvm-only %s + +template +void Create(const void* = nullptr); + +template +struct ObjImpl { + template + friend void ::Create(const void*); +}; + +template +void Create(const void*) { + (void) ObjImpl{}; +} + +int main() { + Create<42>(); +} + +// expected-no-diagnostics From 6333fa5160fbde4bd2cf6afe8856695c13ab621f Mon Sep 17 00:00:00 2001 From: Carlos Galvez Date: Thu, 3 Apr 2025 09:28:34 +0200 Subject: [PATCH 0496/1029] [clang-tidy] Fix broken HeaderFilterRegex when read from config file (#133582) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR https://github.com/llvm/llvm-project/pull/91400 broke the usage of HeaderFilterRegex via config file, because it is now created at a different point in the execution and leads to a different value. The result of that is that using HeaderFilterRegex only in the config file does NOT work, in other words clang-tidy stops triggering warnings on header files, thereby losing a lot of coverage. This patch reverts the logic so that the header filter is created upon calling the getHeaderFilter() function. Additionally, this patch adds 2 unit tests to prevent regressions in the future: - One of them, "simple", tests the most basic use case with a single top-level .clang-tidy file. 
- The second one, "inheritance", demonstrates that the subfolder only gets warnings from headers within it, and not from parent headers. Fixes #118009 Fixes #121969 Fixes #133453 Co-authored-by: Carlos Gálvez --- .../ClangTidyDiagnosticConsumer.cpp | 36 ++++++++++--------- .../clang-tidy/ClangTidyDiagnosticConsumer.h | 4 +++ .../clang-tidy/ClangTidyOptions.cpp | 4 +-- clang-tools-extra/docs/ReleaseNotes.rst | 3 ++ .../inheritance/.clang-tidy | 1 + .../inheritance/foo.cpp | 3 ++ .../inheritance/foo.h | 1 + .../inheritance/subfolder/.clang-tidy | 2 ++ .../inheritance/subfolder/bar.cpp | 8 +++++ .../inheritance/subfolder/bar.h | 1 + .../simple/.clang-tidy | 1 + .../simple/foo.cpp | 3 ++ .../simple/foo.h | 1 + 13 files changed, 49 insertions(+), 19 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/.clang-tidy create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.h create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/.clang-tidy create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.cpp create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.h create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/.clang-tidy create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp create mode 100644 clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.h diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index 4c75b42270114..71e852545203e 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -311,18 +311,7 @@ ClangTidyDiagnosticConsumer::ClangTidyDiagnosticConsumer( : Context(Ctx), ExternalDiagEngine(ExternalDiagEngine), RemoveIncompatibleErrors(RemoveIncompatibleErrors), GetFixesFromNotes(GetFixesFromNotes), - EnableNolintBlocks(EnableNolintBlocks) { - - if (Context.getOptions().HeaderFilterRegex && - !Context.getOptions().HeaderFilterRegex->empty()) - HeaderFilter = - std::make_unique(*Context.getOptions().HeaderFilterRegex); - - if (Context.getOptions().ExcludeHeaderFilterRegex && - !Context.getOptions().ExcludeHeaderFilterRegex->empty()) - ExcludeHeaderFilter = std::make_unique( - *Context.getOptions().ExcludeHeaderFilterRegex); -} + EnableNolintBlocks(EnableNolintBlocks) {} void ClangTidyDiagnosticConsumer::finalizeLastError() { if (!Errors.empty()) { @@ -571,17 +560,30 @@ void ClangTidyDiagnosticConsumer::checkFilters(SourceLocation Location, } StringRef FileName(File->getName()); - LastErrorRelatesToUserCode = - LastErrorRelatesToUserCode || Sources.isInMainFile(Location) || - (HeaderFilter && - (HeaderFilter->match(FileName) && - !(ExcludeHeaderFilter && ExcludeHeaderFilter->match(FileName)))); + LastErrorRelatesToUserCode = LastErrorRelatesToUserCode || + Sources.isInMainFile(Location) || + (getHeaderFilter()->match(FileName) && + !getExcludeHeaderFilter()->match(FileName)); unsigned LineNumber = 
Sources.getExpansionLineNumber(Location); LastErrorPassesLineFilter = LastErrorPassesLineFilter || passesLineFilter(FileName, LineNumber); } +llvm::Regex *ClangTidyDiagnosticConsumer::getHeaderFilter() { + if (!HeaderFilter) + HeaderFilter = + std::make_unique(*Context.getOptions().HeaderFilterRegex); + return HeaderFilter.get(); +} + +llvm::Regex *ClangTidyDiagnosticConsumer::getExcludeHeaderFilter() { + if (!ExcludeHeaderFilter) + ExcludeHeaderFilter = std::make_unique( + *Context.getOptions().ExcludeHeaderFilterRegex); + return ExcludeHeaderFilter.get(); +} + void ClangTidyDiagnosticConsumer::removeIncompatibleErrors() { // Each error is modelled as the set of intervals in which it applies // replacements. To detect overlapping replacements, we use a sweep line diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h index ff42f96a0477b..d6cf6a2b2731e 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.h @@ -302,6 +302,10 @@ class ClangTidyDiagnosticConsumer : public DiagnosticConsumer { /// context. llvm::Regex *getHeaderFilter(); + /// Returns the \c ExcludeHeaderFilter constructed for the options set in the + /// context. + llvm::Regex *getExcludeHeaderFilter(); + /// Updates \c LastErrorRelatesToUserCode and LastErrorPassesLineFilter /// according to the diagnostic \p Location. void checkFilters(SourceLocation Location, const SourceManager &Sources); diff --git a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp index 8bac6f161fa05..dd1d86882f5d4 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyOptions.cpp @@ -194,8 +194,8 @@ ClangTidyOptions ClangTidyOptions::getDefaults() { Options.WarningsAsErrors = ""; Options.HeaderFileExtensions = {"", "h", "hh", "hpp", "hxx"}; Options.ImplementationFileExtensions = {"c", "cc", "cpp", "cxx"}; - Options.HeaderFilterRegex = std::nullopt; - Options.ExcludeHeaderFilterRegex = std::nullopt; + Options.HeaderFilterRegex = ""; + Options.ExcludeHeaderFilterRegex = ""; Options.SystemHeaders = false; Options.FormatStyle = "none"; Options.User = std::nullopt; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6cb8d572d3a78..6c1f05009df98 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -94,6 +94,9 @@ Improvements to clang-tidy - Improved :program:`clang-tidy-diff.py` script. Add the `-warnings-as-errors` argument to treat warnings as errors. +- Fixed bug in :program:`clang-tidy` by which `HeaderFilterRegex` did not take + effect when passed via the `.clang-tidy` file. 
+ New checks ^^^^^^^^^^ diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/.clang-tidy b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/.clang-tidy new file mode 100644 index 0000000000000..f4210353f94de --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/.clang-tidy @@ -0,0 +1 @@ +HeaderFilterRegex: '.*' diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp new file mode 100644 index 0000000000000..5828c2cafaf7d --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.cpp @@ -0,0 +1,3 @@ +// RUN: clang-tidy -checks=-*,google-explicit-constructor %s 2>&1 | FileCheck %s +#include "foo.h" +// CHECK: foo.h:1:12: warning: single-argument constructors must be marked explicit diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.h b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.h new file mode 100644 index 0000000000000..f61d4c2923b50 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/foo.h @@ -0,0 +1 @@ +struct X { X(int); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/.clang-tidy b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/.clang-tidy new file mode 100644 index 0000000000000..96706c1428047 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/.clang-tidy @@ -0,0 +1,2 @@ +InheritParentConfig: true +HeaderFilterRegex: 'subfolder/.*' diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.cpp new file mode 100644 index 0000000000000..229ba52e2695a --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.cpp @@ -0,0 +1,8 @@ +// shell is required for the "dirname" command +// REQUIRES: shell +// RUN: clang-tidy -checks=-*,google-explicit-constructor %s -- -I "$(dirname %S)" 2>&1 | FileCheck %s +#include "foo.h" +// CHECK-NOT: foo.h:1:12: warning: single-argument constructors must be marked explicit + +#include "bar.h" +// CHECK: bar.h:1:13: warning: single-argument constructors must be marked explicit diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.h b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.h new file mode 100644 index 0000000000000..ee12d00d334dd --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/inheritance/subfolder/bar.h @@ -0,0 +1 @@ +struct XX { XX(int); }; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/.clang-tidy b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/.clang-tidy new file mode 100644 index 0000000000000..f4210353f94de --- /dev/null +++ 
b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/.clang-tidy @@ -0,0 +1 @@ +HeaderFilterRegex: '.*' diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp new file mode 100644 index 0000000000000..5828c2cafaf7d --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.cpp @@ -0,0 +1,3 @@ +// RUN: clang-tidy -checks=-*,google-explicit-constructor %s 2>&1 | FileCheck %s +#include "foo.h" +// CHECK: foo.h:1:12: warning: single-argument constructors must be marked explicit diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.h b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.h new file mode 100644 index 0000000000000..f61d4c2923b50 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/infrastructure/header-filter-from-config-file/simple/foo.h @@ -0,0 +1 @@ +struct X { X(int); }; From edc22c64e527171041876f26a491bb1d03d905d5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Apr 2025 09:42:27 +0100 Subject: [PATCH 0497/1029] [X86] getFauxShuffleMask - only handle VTRUNC nodes with matching src/dst sizes (#134161) Cleanup work for #133947 - we need to handle VTRUNC nodes with large source vectors directly to allow us to widen the size of the shuffle combine. We currently discard these results in combineX86ShufflesRecursively anyhow, as we don't allow inputs from getTargetShuffleInputs to be larger than the shuffle value type. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8e6a891444bf1..aea80120a0481 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6413,9 +6413,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, case X86ISD::VTRUNC: { SDValue Src = N.getOperand(0); EVT SrcVT = Src.getValueType(); - // Truncated source must be a simple vector. - if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || - (SrcVT.getScalarSizeInBits() % 8) != 0) + if (SrcVT.getSizeInBits() != NumSizeInBits) return false; unsigned NumSrcElts = SrcVT.getVectorNumElements(); unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits(); From 7baa7edc00c5c92e2d17bae760db2e6df97dcec6 Mon Sep 17 00:00:00 2001 From: Romaric Jodin Date: Thu, 3 Apr 2025 11:18:39 +0200 Subject: [PATCH 0498/1029] [libclc]: clspv: add a dummy implementation for mul_hi (#134094) clspv uses a better implementation that does not rely on a bigger integer size, which is not always available. Add a dummy implementation for mul_hi to avoid overriding the implementation of clspv with the one in libclc.
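For reference, the generic libclc route that clspv opts out of computes the full product in a type twice as wide and returns the upper half. A minimal OpenCL C sketch of that widening strategy for 32-bit operands follows; the function name is made up for illustration and this is not the actual libclc source:

// Sketch of the widening-based mul_hi that clspv cannot rely on
// when a wider integer type is unavailable (OpenCL C guarantees
// 'long' is 64-bit, so this compiles in plain OpenCL C).
static int mul_hi_widening_sketch(int x, int y) {
  long wide = (long)x * (long)y; // full 64-bit product
  return (int)(wide >> 32);      // mul_hi is the upper 32 bits
}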
--- libclc/clc/lib/clspv/SOURCES | 1 + libclc/clc/lib/clspv/integer/clc_mul_hi.cl | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 libclc/clc/lib/clspv/integer/clc_mul_hi.cl diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES index b1401f8307a4c..b91b0e70a397d 100644 --- a/libclc/clc/lib/clspv/SOURCES +++ b/libclc/clc/lib/clspv/SOURCES @@ -1 +1,2 @@ math/clc_sw_fma.cl +integer/clc_mul_hi.cl diff --git a/libclc/clc/lib/clspv/integer/clc_mul_hi.cl b/libclc/clc/lib/clspv/integer/clc_mul_hi.cl new file mode 100644 index 0000000000000..54a51bbce4303 --- /dev/null +++ b/libclc/clc/lib/clspv/integer/clc_mul_hi.cl @@ -0,0 +1,5 @@ +/* +Opt out of the libclc mul_hi implementation for clspv. +clspv has an internal implementation that does not require using a bigger data size. +That implementation is based on OpMulExtended, which is SPIR-V specific and thus cannot be written in OpenCL-C. +*/ From 6ec66a2292a7321811700ce455cf404bdaa67fc0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Apr 2025 10:23:56 +0100 Subject: [PATCH 0499/1029] [X86] Move VPERMV3(X,M,Y) -> VPERMV(M,CONCAT(X,Y)) fold after general VPERMV3 canonicalization Pulled out of #133923 - this prevents regressions with SimplifyDemandedVectorEltsForTargetNode exposing VPERMV3(X,M,X) repeated operand patterns, which were getting concatenated to wider VPERMV nodes before simpler canonicalizations could clean them up. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 54 ++++++++++++------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index aea80120a0481..52c254c3dd045 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42673,40 +42673,13 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, return SDValue(); } case X86ISD::VPERMV3: { - // Combine VPERMV3 to widened VPERMV if the two source operands can be - // freely concatenated. MVT WideVT = VT.getDoubleNumVectorElementsVT(); bool CanConcat = VT.is128BitVector() || (VT.is256BitVector() && Subtarget.useAVX512Regs()); - if (CanConcat) { - SDValue Ops[] = {N.getOperand(0), N.getOperand(2)}; - if (SDValue ConcatSrc = - combineConcatVectorOps(DL, WideVT, Ops, DAG, Subtarget)) { - SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG, - DL, WideVT.getSizeInBits()); - SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, - DAG.getVectorIdxConstant(0, DL)); - } - } SmallVector SrcOps; SmallVector Mask; if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) { assert(Mask.size() == NumElts && "Unexpected shuffle mask size"); - // See if we can concatenate the commuted operands. - if (CanConcat) { - if (SDValue ConcatSrc = combineConcatVectorOps( - DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG, - Subtarget)) { - ShuffleVectorSDNode::commuteMask(Mask); - Mask.append(NumElts, SM_SentinelUndef); - SDValue Perm = - lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc, - DAG.getUNDEF(WideVT), Subtarget, DAG); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, - DAG.getVectorIdxConstant(0, DL)); - } - } SDValue V1 = peekThroughBitcasts(N.getOperand(0)); SDValue V2 = peekThroughBitcasts(N.getOperand(2)); // Canonicalize to VPERMV if both sources are the same.
@@ -42740,6 +42713,33 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, return lowerShuffleWithPERMV(DL, VT, Mask, N.getOperand(2), N.getOperand(0), Subtarget, DAG); } + // Combine VPERMV3 to widened VPERMV if the two source operands can be + // freely concatenated, with a commuted shuffle mask. + if (CanConcat) { + if (SDValue ConcatSrc = combineConcatVectorOps( + DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG, + Subtarget)) { + ShuffleVectorSDNode::commuteMask(Mask); + Mask.append(NumElts, SM_SentinelUndef); + SDValue Perm = + lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc, + DAG.getUNDEF(WideVT), Subtarget, DAG); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, + DAG.getVectorIdxConstant(0, DL)); + } + } + } + // Combine VPERMV3 to widened VPERMV if the two source operands can be + // freely concatenated. + if (CanConcat) { + if (SDValue ConcatSrc = combineConcatVectorOps( + DL, WideVT, {N.getOperand(0), N.getOperand(2)}, DAG, Subtarget)) { + SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG, + DL, WideVT.getSizeInBits()); + SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm, + DAG.getVectorIdxConstant(0, DL)); + } } return SDValue(); } From 6f324bd39b98659c81aae02595cee1b4c92db8e9 Mon Sep 17 00:00:00 2001 From: Jack Frankland Date: Thu, 3 Apr 2025 10:30:10 +0100 Subject: [PATCH 0500/1029] [mlir][tosa] Remove Convolution Type Verifiers (#134077) Remove the check in the convolution verifier that tests whether the input and output element types of convolution operations conform to the constraints imposed by the TOSA 1.0 specification. These checks are too strict for users of the TOSA dialect who wish to allow more types than those allowed by the spec, and they pose compatibility issues with earlier TOSA implementations, which allowed more type combinations. Users who do wish to constrain the convolution type combinations to only those allowed by the TOSA 1.0 spec should run the TOSA validation pass, which already performs these checks.
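As an illustration, a convolution mixing element types outside the spec's list, such as the hypothetical f32-input/f16-output op sketched below, is now accepted by the op verifier and would only be rejected once the validation pass runs. The shapes, operand names, and attribute values here are made up for the example; only the general tosa.conv2d form is taken from the removed test:

  %0 = tosa.conv2d %input, %weight, %bias, %input_zp, %weight_zp
         {acc_type = f32, dilation = array<i64: 1, 1>,
          pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>}
       : (tensor<1x4x4x4xf32>, tensor<8x1x1x4xf32>, tensor<8xf32>,
          tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf16>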
Signed-off-by: Jack Frankland --- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 12 +----------- mlir/test/Dialect/Tosa/invalid.mlir | 9 --------- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index cdba332792eb0..b8d81213d9004 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -425,17 +425,7 @@ static LogicalResult verifyConvOpModes(T op) { if (auto quantType = llvm::dyn_cast(resultEType)) resultEType = quantType.getStorageType(); - // check allowed input/result element types combinations - if ((inputEType.isInteger(8) && resultEType.isInteger(32)) || - (inputEType.isInteger(16) && resultEType.isInteger(48)) || - (isa(inputEType) && resultEType.isF16()) || - (isa(inputEType) && resultEType.isF16()) || - (inputEType.isF16() && resultEType.isF16()) || - (inputEType.isBF16() && resultEType.isBF16()) || - (inputEType.isF32() && resultEType.isF32())) - return success(); - - return op.emitOpError("input/output element types are incompatible."); + return success(); } // verify that inType and outType have same element types diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index ac8a247da24a7..10b8929b16a88 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -181,15 +181,6 @@ func.func @test_conv2d_quant_any_acc(%arg0: tensor<1x4x4x4x!quant.any>> return %0 : tensor<1x4x4x8x!quant.any>> } -// ----- -// CHECK-LABEL: conv2d_quant_any_result -func.func @test_conv2d_quant_any_result(%arg0: tensor<1x4x4x4x!quant.any>>, %arg1: tensor<8x1x1x4x!quant.any>>, %arg2: tensor<8x!quant.any>>) -> tensor<1x4x4x8x!quant.any>> { - %zp = "tosa.const" () { values = dense<0> : tensor<1xi8> } : () -> tensor<1xi8> - // expected-error@+1 {{'tosa.conv2d' op input/output element types are incompatible}} - %0 = tosa.conv2d %arg0, %arg1, %arg2, %zp, %zp {acc_type = i32, dilation = array, pad = array, stride = array, local_bound = true} : (tensor<1x4x4x4x!quant.any>>, tensor<8x1x1x4x!quant.any>>, tensor<8x!quant.any>>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x4x8x!quant.any>> - return %0 : tensor<1x4x4x8x!quant.any>> -} - // ----- func.func @test_concat(%arg0 : tensor<2x1xf32>, %arg1 : tensor<2x2xf32>) -> tensor { From 094904303d50e0ab14bc5f2586a602f79af95953 Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Thu, 3 Apr 2025 11:33:11 +0200 Subject: [PATCH 0501/1029] Revert "[lldb] Return *const* UnwindPlan pointers from FuncUnwinders (#133247)" This reverts commit d7afafdbc464e65c56a0a1d77bad426aa7538306. Caused remote Linux to Linux buildbot failure https://lab.llvm.org/buildbot/#/builders/195/builds/7046. 
--- lldb/include/lldb/Symbol/FuncUnwinders.h | 78 ++++--- lldb/include/lldb/Symbol/UnwindPlan.h | 4 +- .../lldb/Target/RegisterContextUnwind.h | 16 +- lldb/source/Commands/CommandObjectTarget.cpp | 106 ++++----- lldb/source/Symbol/FuncUnwinders.cpp | 207 +++++++++--------- lldb/source/Symbol/UnwindPlan.cpp | 2 +- lldb/source/Target/RegisterContextUnwind.cpp | 84 +++---- 7 files changed, 250 insertions(+), 247 deletions(-) diff --git a/lldb/include/lldb/Symbol/FuncUnwinders.h b/lldb/include/lldb/Symbol/FuncUnwinders.h index 479ccf87b6e2c..1d4c28324e90f 100644 --- a/lldb/include/lldb/Symbol/FuncUnwinders.h +++ b/lldb/include/lldb/Symbol/FuncUnwinders.h @@ -36,19 +36,18 @@ class FuncUnwinders { ~FuncUnwinders(); - std::shared_ptr GetUnwindPlanAtCallSite(Target &target, - Thread &thread); + lldb::UnwindPlanSP GetUnwindPlanAtCallSite(Target &target, Thread &thread); - std::shared_ptr - GetUnwindPlanAtNonCallSite(Target &target, lldb_private::Thread &thread); + lldb::UnwindPlanSP GetUnwindPlanAtNonCallSite(Target &target, + lldb_private::Thread &thread); - std::shared_ptr - GetUnwindPlanFastUnwind(Target &target, lldb_private::Thread &thread); + lldb::UnwindPlanSP GetUnwindPlanFastUnwind(Target &target, + lldb_private::Thread &thread); - std::shared_ptr + lldb::UnwindPlanSP GetUnwindPlanArchitectureDefault(lldb_private::Thread &thread); - std::shared_ptr + lldb::UnwindPlanSP GetUnwindPlanArchitectureDefaultAtFunctionEntry(lldb_private::Thread &thread); Address &GetFirstNonPrologueInsn(Target &target); @@ -78,34 +77,32 @@ class FuncUnwinders { // used. Instead, clients should ask for the *behavior* they are looking for, // using one of the above UnwindPlan retrieval methods. - std::shared_ptr GetAssemblyUnwindPlan(Target &target, - Thread &thread); + lldb::UnwindPlanSP GetAssemblyUnwindPlan(Target &target, Thread &thread); - std::shared_ptr GetObjectFileUnwindPlan(Target &target); + lldb::UnwindPlanSP GetObjectFileUnwindPlan(Target &target); - std::shared_ptr - GetObjectFileAugmentedUnwindPlan(Target &target, Thread &thread); + lldb::UnwindPlanSP GetObjectFileAugmentedUnwindPlan(Target &target, + Thread &thread); - std::shared_ptr GetEHFrameUnwindPlan(Target &target); + lldb::UnwindPlanSP GetEHFrameUnwindPlan(Target &target); - std::shared_ptr - GetEHFrameAugmentedUnwindPlan(Target &target, Thread &thread); + lldb::UnwindPlanSP GetEHFrameAugmentedUnwindPlan(Target &target, + Thread &thread); - std::shared_ptr GetDebugFrameUnwindPlan(Target &target); + lldb::UnwindPlanSP GetDebugFrameUnwindPlan(Target &target); - std::shared_ptr - GetDebugFrameAugmentedUnwindPlan(Target &target, Thread &thread); + lldb::UnwindPlanSP GetDebugFrameAugmentedUnwindPlan(Target &target, + Thread &thread); - std::shared_ptr GetCompactUnwindUnwindPlan(Target &target); + lldb::UnwindPlanSP GetCompactUnwindUnwindPlan(Target &target); - std::shared_ptr GetArmUnwindUnwindPlan(Target &target); + lldb::UnwindPlanSP GetArmUnwindUnwindPlan(Target &target); - std::shared_ptr GetSymbolFileUnwindPlan(Thread &thread); + lldb::UnwindPlanSP GetSymbolFileUnwindPlan(Thread &thread); - std::shared_ptr GetArchDefaultUnwindPlan(Thread &thread); + lldb::UnwindPlanSP GetArchDefaultUnwindPlan(Thread &thread); - std::shared_ptr - GetArchDefaultAtFuncEntryUnwindPlan(Thread &thread); + lldb::UnwindPlanSP GetArchDefaultAtFuncEntryUnwindPlan(Thread &thread); private: lldb::UnwindAssemblySP GetUnwindAssemblyProfiler(Target &target); @@ -116,8 +113,7 @@ class FuncUnwinders { // unwind rule for the pc, and LazyBoolCalculate if it was unable to // 
determine this for some reason. lldb_private::LazyBool CompareUnwindPlansForIdenticalInitialPCLocation( - Thread &thread, const std::shared_ptr &a, - const std::shared_ptr &b); + Thread &thread, const lldb::UnwindPlanSP &a, const lldb::UnwindPlanSP &b); UnwindTable &m_unwind_table; @@ -133,22 +129,22 @@ class FuncUnwinders { std::recursive_mutex m_mutex; - std::shared_ptr m_unwind_plan_assembly_sp; - std::shared_ptr m_unwind_plan_object_file_sp; - std::shared_ptr m_unwind_plan_eh_frame_sp; - std::shared_ptr m_unwind_plan_debug_frame_sp; + lldb::UnwindPlanSP m_unwind_plan_assembly_sp; + lldb::UnwindPlanSP m_unwind_plan_object_file_sp; + lldb::UnwindPlanSP m_unwind_plan_eh_frame_sp; + lldb::UnwindPlanSP m_unwind_plan_debug_frame_sp; // augmented by assembly inspection so it's valid everywhere - std::shared_ptr m_unwind_plan_object_file_augmented_sp; - std::shared_ptr m_unwind_plan_eh_frame_augmented_sp; - std::shared_ptr m_unwind_plan_debug_frame_augmented_sp; - - std::vector> m_unwind_plan_compact_unwind; - std::shared_ptr m_unwind_plan_arm_unwind_sp; - std::shared_ptr m_unwind_plan_symbol_file_sp; - std::shared_ptr m_unwind_plan_fast_sp; - std::shared_ptr m_unwind_plan_arch_default_sp; - std::shared_ptr m_unwind_plan_arch_default_at_func_entry_sp; + lldb::UnwindPlanSP m_unwind_plan_object_file_augmented_sp; + lldb::UnwindPlanSP m_unwind_plan_eh_frame_augmented_sp; + lldb::UnwindPlanSP m_unwind_plan_debug_frame_augmented_sp; + + std::vector m_unwind_plan_compact_unwind; + lldb::UnwindPlanSP m_unwind_plan_arm_unwind_sp; + lldb::UnwindPlanSP m_unwind_plan_symbol_file_sp; + lldb::UnwindPlanSP m_unwind_plan_fast_sp; + lldb::UnwindPlanSP m_unwind_plan_arch_default_sp; + lldb::UnwindPlanSP m_unwind_plan_arch_default_at_func_entry_sp; // Fetching the UnwindPlans can be expensive - if we've already attempted to // get one & failed, don't try again. diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h index 6640a23a3e868..9adda27b8f928 100644 --- a/lldb/include/lldb/Symbol/UnwindPlan.h +++ b/lldb/include/lldb/Symbol/UnwindPlan.h @@ -482,7 +482,7 @@ class UnwindPlan { m_return_addr_register = regnum; } - uint32_t GetReturnAddressRegister() const { return m_return_addr_register; } + uint32_t GetReturnAddressRegister() { return m_return_addr_register; } uint32_t GetInitialCFARegister() const { if (m_row_list.empty()) @@ -497,7 +497,7 @@ class UnwindPlan { m_plan_valid_ranges = std::move(ranges); } - bool PlanValidAtAddress(Address addr) const; + bool PlanValidAtAddress(Address addr); bool IsValidRowIndex(uint32_t idx) const; diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h index 044a387fe5aa2..c4ae29e657bfb 100644 --- a/lldb/include/lldb/Target/RegisterContextUnwind.h +++ b/lldb/include/lldb/Target/RegisterContextUnwind.h @@ -127,8 +127,7 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { /// Check if the given unwind plan indicates a signal trap handler, and /// update frame type and symbol context if so. 
- void PropagateTrapHandlerFlagFromUnwindPlan( - std::shared_ptr unwind_plan); + void PropagateTrapHandlerFlagFromUnwindPlan(lldb::UnwindPlanSP unwind_plan); // Provide a location for where THIS function saved the CALLER's register // value @@ -195,17 +194,16 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { const UnwindPlan::Row::FAValue &fa, lldb::addr_t &address); - std::shared_ptr GetFastUnwindPlanForFrame(); + lldb::UnwindPlanSP GetFastUnwindPlanForFrame(); - std::shared_ptr GetFullUnwindPlanForFrame(); + lldb::UnwindPlanSP GetFullUnwindPlanForFrame(); void UnwindLogMsg(const char *fmt, ...) __attribute__((format(printf, 2, 3))); void UnwindLogMsgVerbose(const char *fmt, ...) __attribute__((format(printf, 2, 3))); - bool IsUnwindPlanValidForCurrentPC( - std::shared_ptr unwind_plan_sp); + bool IsUnwindPlanValidForCurrentPC(lldb::UnwindPlanSP unwind_plan_sp); lldb::addr_t GetReturnAddressHint(int32_t plan_offset); @@ -217,9 +215,9 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { // i.e. where THIS frame saved them /// - std::shared_ptr m_fast_unwind_plan_sp; // may be NULL - std::shared_ptr m_full_unwind_plan_sp; - std::shared_ptr m_fallback_unwind_plan_sp; // may be NULL + lldb::UnwindPlanSP m_fast_unwind_plan_sp; // may be NULL + lldb::UnwindPlanSP m_full_unwind_plan_sp; + lldb::UnwindPlanSP m_fallback_unwind_plan_sp; // may be NULL bool m_all_registers_available; // Can we retrieve all regs or just // nonvolatile regs? diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 3f7d3007ed168..c77bddb4af061 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -3641,70 +3641,77 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { result.GetOutputStream().Printf("\n"); - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetUnwindPlanAtNonCallSite(*target, *thread)) { + UnwindPlanSP non_callsite_unwind_plan = + func_unwinders_sp->GetUnwindPlanAtNonCallSite(*target, *thread); + if (non_callsite_unwind_plan) { result.GetOutputStream().Printf( "Asynchronous (not restricted to call-sites) UnwindPlan is '%s'\n", - plan_sp->GetSourceName().AsCString()); + non_callsite_unwind_plan->GetSourceName().AsCString()); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetUnwindPlanAtCallSite(*target, *thread)) { + UnwindPlanSP callsite_unwind_plan = + func_unwinders_sp->GetUnwindPlanAtCallSite(*target, *thread); + if (callsite_unwind_plan) { result.GetOutputStream().Printf( "Synchronous (restricted to call-sites) UnwindPlan is '%s'\n", - plan_sp->GetSourceName().AsCString()); + callsite_unwind_plan->GetSourceName().AsCString()); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetUnwindPlanFastUnwind(*target, *thread)) { - result.GetOutputStream().Printf("Fast UnwindPlan is '%s'\n", - plan_sp->GetSourceName().AsCString()); + UnwindPlanSP fast_unwind_plan = + func_unwinders_sp->GetUnwindPlanFastUnwind(*target, *thread); + if (fast_unwind_plan) { + result.GetOutputStream().Printf( + "Fast UnwindPlan is '%s'\n", + fast_unwind_plan->GetSourceName().AsCString()); } result.GetOutputStream().Printf("\n"); - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetAssemblyUnwindPlan(*target, *thread)) { + UnwindPlanSP assembly_sp = + func_unwinders_sp->GetAssemblyUnwindPlan(*target, *thread); + if (assembly_sp) { result.GetOutputStream().Printf( "Assembly language inspection UnwindPlan:\n"); - 
plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + assembly_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetObjectFileUnwindPlan(*target)) { + UnwindPlanSP of_unwind_sp = + func_unwinders_sp->GetObjectFileUnwindPlan(*target); + if (of_unwind_sp) { result.GetOutputStream().Printf("object file UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + of_unwind_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetObjectFileAugmentedUnwindPlan(*target, - *thread)) { + UnwindPlanSP of_unwind_augmented_sp = + func_unwinders_sp->GetObjectFileAugmentedUnwindPlan(*target, *thread); + if (of_unwind_augmented_sp) { result.GetOutputStream().Printf("object file augmented UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + of_unwind_augmented_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetEHFrameUnwindPlan(*target)) { + UnwindPlanSP ehframe_sp = + func_unwinders_sp->GetEHFrameUnwindPlan(*target); + if (ehframe_sp) { result.GetOutputStream().Printf("eh_frame UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + ehframe_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetEHFrameAugmentedUnwindPlan(*target, - *thread)) { + UnwindPlanSP ehframe_augmented_sp = + func_unwinders_sp->GetEHFrameAugmentedUnwindPlan(*target, *thread); + if (ehframe_augmented_sp) { result.GetOutputStream().Printf("eh_frame augmented UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + ehframe_augmented_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = + if (UnwindPlanSP plan_sp = func_unwinders_sp->GetDebugFrameUnwindPlan(*target)) { result.GetOutputStream().Printf("debug_frame UnwindPlan:\n"); plan_sp->Dump(result.GetOutputStream(), thread.get(), @@ -3712,7 +3719,7 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = + if (UnwindPlanSP plan_sp = func_unwinders_sp->GetDebugFrameAugmentedUnwindPlan(*target, *thread)) { result.GetOutputStream().Printf("debug_frame augmented UnwindPlan:\n"); @@ -3721,35 +3728,36 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetArmUnwindUnwindPlan(*target)) { + UnwindPlanSP arm_unwind_sp = + func_unwinders_sp->GetArmUnwindUnwindPlan(*target); + if (arm_unwind_sp) { result.GetOutputStream().Printf("ARM.exidx unwind UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + arm_unwind_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = + if (UnwindPlanSP symfile_plan_sp = func_unwinders_sp->GetSymbolFileUnwindPlan(*thread)) { result.GetOutputStream().Printf("Symbol file 
UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + symfile_plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetCompactUnwindUnwindPlan(*target)) { + UnwindPlanSP compact_unwind_sp = + func_unwinders_sp->GetCompactUnwindUnwindPlan(*target); + if (compact_unwind_sp) { result.GetOutputStream().Printf("Compact unwind UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + compact_unwind_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (std::shared_ptr plan_sp = - func_unwinders_sp->GetUnwindPlanFastUnwind(*target, *thread)) { + if (fast_unwind_plan) { result.GetOutputStream().Printf("Fast UnwindPlan:\n"); - plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + fast_unwind_plan->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } diff --git a/lldb/source/Symbol/FuncUnwinders.cpp b/lldb/source/Symbol/FuncUnwinders.cpp index a74029d8343c7..a5ca7b094c949 100644 --- a/lldb/source/Symbol/FuncUnwinders.cpp +++ b/lldb/source/Symbol/FuncUnwinders.cpp @@ -71,47 +71,40 @@ FuncUnwinders::FuncUnwinders(UnwindTable &unwind_table, Address addr, FuncUnwinders::~FuncUnwinders() = default; -std::shared_ptr -FuncUnwinders::GetUnwindPlanAtCallSite(Target &target, Thread &thread) { +UnwindPlanSP FuncUnwinders::GetUnwindPlanAtCallSite(Target &target, + Thread &thread) { std::lock_guard guard(m_mutex); - if (std::shared_ptr plan_sp = - GetObjectFileUnwindPlan(target)) + if (UnwindPlanSP plan_sp = GetObjectFileUnwindPlan(target)) return plan_sp; - if (std::shared_ptr plan_sp = - GetSymbolFileUnwindPlan(thread)) + if (UnwindPlanSP plan_sp = GetSymbolFileUnwindPlan(thread)) return plan_sp; - if (std::shared_ptr plan_sp = - GetDebugFrameUnwindPlan(target)) + if (UnwindPlanSP plan_sp = GetDebugFrameUnwindPlan(target)) return plan_sp; - if (std::shared_ptr plan_sp = GetEHFrameUnwindPlan(target)) + if (UnwindPlanSP plan_sp = GetEHFrameUnwindPlan(target)) return plan_sp; - if (std::shared_ptr plan_sp = - GetCompactUnwindUnwindPlan(target)) + if (UnwindPlanSP plan_sp = GetCompactUnwindUnwindPlan(target)) return plan_sp; - if (std::shared_ptr plan_sp = - GetArmUnwindUnwindPlan(target)) + if (UnwindPlanSP plan_sp = GetArmUnwindUnwindPlan(target)) return plan_sp; return nullptr; } -std::shared_ptr -FuncUnwinders::GetCompactUnwindUnwindPlan(Target &target) { +UnwindPlanSP FuncUnwinders::GetCompactUnwindUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_compact_unwind.size() > 0) return m_unwind_plan_compact_unwind[0]; // FIXME support multiple compact // unwind plans for one func if (m_tried_unwind_plan_compact_unwind) - return nullptr; + return UnwindPlanSP(); m_tried_unwind_plan_compact_unwind = true; if (m_range.GetBaseAddress().IsValid()) { Address current_pc(m_range.GetBaseAddress()); CompactUnwindInfo *compact_unwind = m_unwind_table.GetCompactUnwindInfo(); if (compact_unwind) { - auto unwind_plan_sp = - std::make_shared(lldb::eRegisterKindGeneric); + UnwindPlanSP unwind_plan_sp(new UnwindPlan(lldb::eRegisterKindGeneric)); if (compact_unwind->GetUnwindPlan(target, current_pc, *unwind_plan_sp)) { m_unwind_plan_compact_unwind.push_back(unwind_plan_sp); return m_unwind_plan_compact_unwind[0]; // FIXME support multiple @@ -120,11 
+113,10 @@ FuncUnwinders::GetCompactUnwindUnwindPlan(Target &target) { } } } - return nullptr; + return UnwindPlanSP(); } -std::shared_ptr -FuncUnwinders::GetObjectFileUnwindPlan(Target &target) { +lldb::UnwindPlanSP FuncUnwinders::GetObjectFileUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_object_file_sp.get() || m_tried_unwind_plan_object_file) @@ -134,16 +126,17 @@ FuncUnwinders::GetObjectFileUnwindPlan(Target &target) { if (m_range.GetBaseAddress().IsValid()) { CallFrameInfo *object_file_frame = m_unwind_table.GetObjectFileUnwindInfo(); if (object_file_frame) { - auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); - if (object_file_frame->GetUnwindPlan(m_range, *plan_sp)) - m_unwind_plan_object_file_sp = std::move(plan_sp); + m_unwind_plan_object_file_sp = + std::make_shared(lldb::eRegisterKindGeneric); + if (!object_file_frame->GetUnwindPlan(m_range, + *m_unwind_plan_object_file_sp)) + m_unwind_plan_object_file_sp.reset(); } } return m_unwind_plan_object_file_sp; } -std::shared_ptr -FuncUnwinders::GetEHFrameUnwindPlan(Target &target) { +UnwindPlanSP FuncUnwinders::GetEHFrameUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_eh_frame_sp.get() || m_tried_unwind_plan_eh_frame) return m_unwind_plan_eh_frame_sp; @@ -152,16 +145,16 @@ FuncUnwinders::GetEHFrameUnwindPlan(Target &target) { if (m_range.GetBaseAddress().IsValid()) { DWARFCallFrameInfo *eh_frame = m_unwind_table.GetEHFrameInfo(); if (eh_frame) { - auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); - if (eh_frame->GetUnwindPlan(m_range, *plan_sp)) - m_unwind_plan_eh_frame_sp = std::move(plan_sp); + m_unwind_plan_eh_frame_sp = + std::make_shared(lldb::eRegisterKindGeneric); + if (!eh_frame->GetUnwindPlan(m_range, *m_unwind_plan_eh_frame_sp)) + m_unwind_plan_eh_frame_sp.reset(); } } return m_unwind_plan_eh_frame_sp; } -std::shared_ptr -FuncUnwinders::GetDebugFrameUnwindPlan(Target &target) { +UnwindPlanSP FuncUnwinders::GetDebugFrameUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_debug_frame_sp || m_tried_unwind_plan_debug_frame) return m_unwind_plan_debug_frame_sp; @@ -170,16 +163,16 @@ FuncUnwinders::GetDebugFrameUnwindPlan(Target &target) { if (m_range.GetBaseAddress().IsValid()) { DWARFCallFrameInfo *debug_frame = m_unwind_table.GetDebugFrameInfo(); if (debug_frame) { - auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); - if (debug_frame->GetUnwindPlan(m_range, *plan_sp)) - m_unwind_plan_debug_frame_sp = std::move(plan_sp); + m_unwind_plan_debug_frame_sp = + std::make_shared(lldb::eRegisterKindGeneric); + if (!debug_frame->GetUnwindPlan(m_range, *m_unwind_plan_debug_frame_sp)) + m_unwind_plan_debug_frame_sp.reset(); } } return m_unwind_plan_debug_frame_sp; } -std::shared_ptr -FuncUnwinders::GetArmUnwindUnwindPlan(Target &target) { +UnwindPlanSP FuncUnwinders::GetArmUnwindUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_arm_unwind_sp.get() || m_tried_unwind_plan_arm_unwind) return m_unwind_plan_arm_unwind_sp; @@ -189,9 +182,11 @@ FuncUnwinders::GetArmUnwindUnwindPlan(Target &target) { Address current_pc(m_range.GetBaseAddress()); ArmUnwindInfo *arm_unwind_info = m_unwind_table.GetArmUnwindInfo(); if (arm_unwind_info) { - auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); - if (arm_unwind_info->GetUnwindPlan(target, current_pc, *plan_sp)) - m_unwind_plan_arm_unwind_sp = std::move(plan_sp); + m_unwind_plan_arm_unwind_sp = + std::make_shared(lldb::eRegisterKindGeneric); + if 
(!arm_unwind_info->GetUnwindPlan(target, current_pc, + *m_unwind_plan_arm_unwind_sp)) + m_unwind_plan_arm_unwind_sp.reset(); } } return m_unwind_plan_arm_unwind_sp; @@ -215,8 +210,7 @@ class RegisterContextToInfo: public SymbolFile::RegisterInfoResolver { }; } // namespace -std::shared_ptr -FuncUnwinders::GetSymbolFileUnwindPlan(Thread &thread) { +UnwindPlanSP FuncUnwinders::GetSymbolFileUnwindPlan(Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_symbol_file_sp.get() || m_tried_unwind_plan_symbol_file) return m_unwind_plan_symbol_file_sp; @@ -230,9 +224,9 @@ FuncUnwinders::GetSymbolFileUnwindPlan(Thread &thread) { return m_unwind_plan_symbol_file_sp; } -std::shared_ptr +UnwindPlanSP FuncUnwinders::GetObjectFileAugmentedUnwindPlan(Target &target, - Thread &thread) { + Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_object_file_augmented_sp.get() || m_tried_unwind_plan_object_file_augmented) @@ -240,27 +234,30 @@ FuncUnwinders::GetObjectFileAugmentedUnwindPlan(Target &target, m_tried_unwind_plan_object_file_augmented = true; - std::shared_ptr object_file_unwind_plan = - GetObjectFileUnwindPlan(target); + UnwindPlanSP object_file_unwind_plan = GetObjectFileUnwindPlan(target); if (!object_file_unwind_plan) return m_unwind_plan_object_file_augmented_sp; + m_unwind_plan_object_file_augmented_sp = + std::make_shared(*object_file_unwind_plan); + // Augment the instructions with epilogue descriptions if necessary // so the UnwindPlan can be used at any instruction in the function. UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - auto plan_sp = std::make_shared(*object_file_unwind_plan); - - if (assembly_profiler_sp->AugmentUnwindPlanFromCallSite(m_range, thread, - *plan_sp)) - m_unwind_plan_object_file_augmented_sp = std::move(plan_sp); + if (!assembly_profiler_sp->AugmentUnwindPlanFromCallSite( + m_range, thread, *m_unwind_plan_object_file_augmented_sp)) { + m_unwind_plan_object_file_augmented_sp.reset(); + } + } else { + m_unwind_plan_object_file_augmented_sp.reset(); } return m_unwind_plan_object_file_augmented_sp; } -std::shared_ptr -FuncUnwinders::GetEHFrameAugmentedUnwindPlan(Target &target, Thread &thread) { +UnwindPlanSP FuncUnwinders::GetEHFrameAugmentedUnwindPlan(Target &target, + Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_eh_frame_augmented_sp.get() || m_tried_unwind_plan_eh_frame_augmented) @@ -278,27 +275,30 @@ FuncUnwinders::GetEHFrameAugmentedUnwindPlan(Target &target, Thread &thread) { m_tried_unwind_plan_eh_frame_augmented = true; - std::shared_ptr eh_frame_plan = - GetEHFrameUnwindPlan(target); + UnwindPlanSP eh_frame_plan = GetEHFrameUnwindPlan(target); if (!eh_frame_plan) return m_unwind_plan_eh_frame_augmented_sp; + m_unwind_plan_eh_frame_augmented_sp = + std::make_shared(*eh_frame_plan); + // Augment the eh_frame instructions with epilogue descriptions if necessary // so the UnwindPlan can be used at any instruction in the function. 
UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - auto plan_sp = std::make_shared(*eh_frame_plan); - if (assembly_profiler_sp->AugmentUnwindPlanFromCallSite(m_range, thread, - *plan_sp)) - m_unwind_plan_eh_frame_augmented_sp = std::move(plan_sp); + if (!assembly_profiler_sp->AugmentUnwindPlanFromCallSite( + m_range, thread, *m_unwind_plan_eh_frame_augmented_sp)) { + m_unwind_plan_eh_frame_augmented_sp.reset(); + } + } else { + m_unwind_plan_eh_frame_augmented_sp.reset(); } return m_unwind_plan_eh_frame_augmented_sp; } -std::shared_ptr -FuncUnwinders::GetDebugFrameAugmentedUnwindPlan(Target &target, - Thread &thread) { +UnwindPlanSP FuncUnwinders::GetDebugFrameAugmentedUnwindPlan(Target &target, + Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_debug_frame_augmented_sp.get() || m_tried_unwind_plan_debug_frame_augmented) @@ -316,28 +316,30 @@ FuncUnwinders::GetDebugFrameAugmentedUnwindPlan(Target &target, m_tried_unwind_plan_debug_frame_augmented = true; - std::shared_ptr debug_frame_plan = - GetDebugFrameUnwindPlan(target); + UnwindPlanSP debug_frame_plan = GetDebugFrameUnwindPlan(target); if (!debug_frame_plan) return m_unwind_plan_debug_frame_augmented_sp; + m_unwind_plan_debug_frame_augmented_sp = + std::make_shared(*debug_frame_plan); + // Augment the debug_frame instructions with epilogue descriptions if // necessary so the UnwindPlan can be used at any instruction in the // function. UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - auto plan_sp = std::make_shared(*debug_frame_plan); - - if (assembly_profiler_sp->AugmentUnwindPlanFromCallSite(m_range, thread, - *plan_sp)) - m_unwind_plan_debug_frame_augmented_sp = std::move(plan_sp); - } + if (!assembly_profiler_sp->AugmentUnwindPlanFromCallSite( + m_range, thread, *m_unwind_plan_debug_frame_augmented_sp)) { + m_unwind_plan_debug_frame_augmented_sp.reset(); + } + } else + m_unwind_plan_debug_frame_augmented_sp.reset(); return m_unwind_plan_debug_frame_augmented_sp; } -std::shared_ptr -FuncUnwinders::GetAssemblyUnwindPlan(Target &target, Thread &thread) { +UnwindPlanSP FuncUnwinders::GetAssemblyUnwindPlan(Target &target, + Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_assembly_sp.get() || m_tried_unwind_plan_assembly || !m_unwind_table.GetAllowAssemblyEmulationUnwindPlans()) { @@ -358,10 +360,12 @@ FuncUnwinders::GetAssemblyUnwindPlan(Target &target, Thread &thread) { UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); - if (assembly_profiler_sp->GetNonCallSiteUnwindPlanFromAssembly( - range, thread, *plan_sp)) - m_unwind_plan_assembly_sp = std::move(plan_sp); + m_unwind_plan_assembly_sp = + std::make_shared(lldb::eRegisterKindGeneric); + if (!assembly_profiler_sp->GetNonCallSiteUnwindPlanFromAssembly( + range, thread, *m_unwind_plan_assembly_sp)) { + m_unwind_plan_assembly_sp.reset(); + } } return m_unwind_plan_assembly_sp; } @@ -370,8 +374,7 @@ FuncUnwinders::GetAssemblyUnwindPlan(Target &target, Thread &thread) { // If they have the same way of getting the pc value (e.g. "CFA - 8" + "CFA is // sp"), then it will return LazyBoolTrue. 
LazyBool FuncUnwinders::CompareUnwindPlansForIdenticalInitialPCLocation( - Thread &thread, const std::shared_ptr &a, - const std::shared_ptr &b) { + Thread &thread, const UnwindPlanSP &a, const UnwindPlanSP &b) { LazyBool plans_are_identical = eLazyBoolCalculate; RegisterNumber pc_reg(thread, eRegisterKindGeneric, LLDB_REGNUM_GENERIC_PC); @@ -401,19 +404,17 @@ LazyBool FuncUnwinders::CompareUnwindPlansForIdenticalInitialPCLocation( return plans_are_identical; } -std::shared_ptr -FuncUnwinders::GetUnwindPlanAtNonCallSite(Target &target, Thread &thread) { - std::shared_ptr eh_frame_sp = GetEHFrameUnwindPlan(target); +UnwindPlanSP FuncUnwinders::GetUnwindPlanAtNonCallSite(Target &target, + Thread &thread) { + UnwindPlanSP eh_frame_sp = GetEHFrameUnwindPlan(target); if (!eh_frame_sp) eh_frame_sp = GetDebugFrameUnwindPlan(target); if (!eh_frame_sp) eh_frame_sp = GetObjectFileUnwindPlan(target); - std::shared_ptr arch_default_at_entry_sp = + UnwindPlanSP arch_default_at_entry_sp = GetUnwindPlanArchitectureDefaultAtFunctionEntry(thread); - std::shared_ptr arch_default_sp = - GetUnwindPlanArchitectureDefault(thread); - std::shared_ptr assembly_sp = - GetAssemblyUnwindPlan(target, thread); + UnwindPlanSP arch_default_sp = GetUnwindPlanArchitectureDefault(thread); + UnwindPlanSP assembly_sp = GetAssemblyUnwindPlan(target, thread); // This point of this code is to detect when a function is using a non- // standard ABI, and the eh_frame correctly describes that alternate ABI. @@ -442,24 +443,20 @@ FuncUnwinders::GetUnwindPlanAtNonCallSite(Target &target, Thread &thread) { return eh_frame_sp; } - if (std::shared_ptr plan_sp = - GetSymbolFileUnwindPlan(thread)) + if (UnwindPlanSP plan_sp = GetSymbolFileUnwindPlan(thread)) return plan_sp; - if (std::shared_ptr plan_sp = - GetDebugFrameAugmentedUnwindPlan(target, thread)) + if (UnwindPlanSP plan_sp = GetDebugFrameAugmentedUnwindPlan(target, thread)) return plan_sp; - if (std::shared_ptr plan_sp = - GetEHFrameAugmentedUnwindPlan(target, thread)) + if (UnwindPlanSP plan_sp = GetEHFrameAugmentedUnwindPlan(target, thread)) return plan_sp; - if (std::shared_ptr plan_sp = - GetObjectFileAugmentedUnwindPlan(target, thread)) + if (UnwindPlanSP plan_sp = GetObjectFileAugmentedUnwindPlan(target, thread)) return plan_sp; return assembly_sp; } -std::shared_ptr -FuncUnwinders::GetUnwindPlanFastUnwind(Target &target, Thread &thread) { +UnwindPlanSP FuncUnwinders::GetUnwindPlanFastUnwind(Target &target, + Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_fast_sp.get() || m_tried_unwind_fast) return m_unwind_plan_fast_sp; @@ -468,15 +465,17 @@ FuncUnwinders::GetUnwindPlanFastUnwind(Target &target, Thread &thread) { UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); - if (assembly_profiler_sp->GetFastUnwindPlan(m_range, thread, *plan_sp)) - m_unwind_plan_fast_sp = std::move(plan_sp); + m_unwind_plan_fast_sp = + std::make_shared(lldb::eRegisterKindGeneric); + if (!assembly_profiler_sp->GetFastUnwindPlan(m_range, thread, + *m_unwind_plan_fast_sp)) { + m_unwind_plan_fast_sp.reset(); + } } return m_unwind_plan_fast_sp; } -std::shared_ptr -FuncUnwinders::GetUnwindPlanArchitectureDefault(Thread &thread) { +UnwindPlanSP FuncUnwinders::GetUnwindPlanArchitectureDefault(Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_arch_default_sp.get() || m_tried_unwind_arch_default) return m_unwind_plan_arch_default_sp; @@ -492,7 +491,7 @@ 
FuncUnwinders::GetUnwindPlanArchitectureDefault(Thread &thread) { return m_unwind_plan_arch_default_sp; } -std::shared_ptr +UnwindPlanSP FuncUnwinders::GetUnwindPlanArchitectureDefaultAtFunctionEntry(Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_arch_default_at_func_entry_sp.get() || @@ -541,8 +540,7 @@ FuncUnwinders::GetUnwindAssemblyProfiler(Target &target) { Address FuncUnwinders::GetLSDAAddress(Target &target) { Address lsda_addr; - std::shared_ptr unwind_plan_sp = - GetEHFrameUnwindPlan(target); + UnwindPlanSP unwind_plan_sp = GetEHFrameUnwindPlan(target); if (unwind_plan_sp.get() == nullptr) { unwind_plan_sp = GetCompactUnwindUnwindPlan(target); } @@ -558,8 +556,7 @@ Address FuncUnwinders::GetLSDAAddress(Target &target) { Address FuncUnwinders::GetPersonalityRoutinePtrAddress(Target &target) { Address personality_addr; - std::shared_ptr unwind_plan_sp = - GetEHFrameUnwindPlan(target); + UnwindPlanSP unwind_plan_sp = GetEHFrameUnwindPlan(target); if (unwind_plan_sp.get() == nullptr) { unwind_plan_sp = GetCompactUnwindUnwindPlan(target); } diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp index cfa8eefaa55bb..f2846eb927bf8 100644 --- a/lldb/source/Symbol/UnwindPlan.cpp +++ b/lldb/source/Symbol/UnwindPlan.cpp @@ -451,7 +451,7 @@ const UnwindPlan::Row *UnwindPlan::GetLastRow() const { return m_row_list.back().get(); } -bool UnwindPlan::PlanValidAtAddress(Address addr) const { +bool UnwindPlan::PlanValidAtAddress(Address addr) { // If this UnwindPlan has no rows, it is an invalid UnwindPlan. if (GetRowCount() == 0) { Log *log = GetLog(LLDBLog::Unwind); diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index b6a4a71bc3356..cb3d7ee479890 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -84,7 +84,7 @@ RegisterContextUnwind::RegisterContextUnwind(Thread &thread, } bool RegisterContextUnwind::IsUnwindPlanValidForCurrentPC( - std::shared_ptr unwind_plan_sp) { + lldb::UnwindPlanSP unwind_plan_sp) { if (!unwind_plan_sp) return false; @@ -141,9 +141,8 @@ void RegisterContextUnwind::InitializeZerothFrame() { if (ABISP abi_sp = process->GetABI()) current_pc = abi_sp->FixCodeAddress(current_pc); - std::shared_ptr lang_runtime_plan_sp = - LanguageRuntime::GetRuntimeUnwindPlan(m_thread, this, - m_behaves_like_zeroth_frame); + UnwindPlanSP lang_runtime_plan_sp = LanguageRuntime::GetRuntimeUnwindPlan( + m_thread, this, m_behaves_like_zeroth_frame); if (lang_runtime_plan_sp.get()) { UnwindLogMsg("This is an async frame"); } @@ -266,7 +265,7 @@ void RegisterContextUnwind::InitializeZerothFrame() { // Try the fall back unwind plan since the // full unwind plan failed. FuncUnwindersSP func_unwinders_sp; - std::shared_ptr call_site_unwind_plan; + UnwindPlanSP call_site_unwind_plan; bool cfa_status = false; if (m_sym_ctx_valid) { @@ -341,9 +340,8 @@ void RegisterContextUnwind::InitializeNonZerothFrame() { // A LanguageRuntime may provide an UnwindPlan that is used in this // stack trace base on the RegisterContext contents, intsead // of the normal UnwindPlans we would use for the return-pc. 
- std::shared_ptr lang_runtime_plan_sp = - LanguageRuntime::GetRuntimeUnwindPlan(m_thread, this, - m_behaves_like_zeroth_frame); + UnwindPlanSP lang_runtime_plan_sp = LanguageRuntime::GetRuntimeUnwindPlan( + m_thread, this, m_behaves_like_zeroth_frame); if (lang_runtime_plan_sp.get()) { UnwindLogMsg("This is an async frame"); } @@ -751,37 +749,39 @@ bool RegisterContextUnwind::BehavesLikeZerothFrame() const { // 4. m_current_offset_backed_up_one should have the current byte offset into // the function, maybe backed up by 1, std::nullopt if unknown -std::shared_ptr -RegisterContextUnwind::GetFastUnwindPlanForFrame() { +UnwindPlanSP RegisterContextUnwind::GetFastUnwindPlanForFrame() { + UnwindPlanSP unwind_plan_sp; ModuleSP pc_module_sp(m_current_pc.GetModule()); if (!m_current_pc.IsValid() || !pc_module_sp || pc_module_sp->GetObjectFile() == nullptr) - return nullptr; + return unwind_plan_sp; if (IsFrameZero()) - return nullptr; + return unwind_plan_sp; FuncUnwindersSP func_unwinders_sp( pc_module_sp->GetUnwindTable().GetFuncUnwindersContainingAddress( m_current_pc, m_sym_ctx)); if (!func_unwinders_sp) - return nullptr; + return unwind_plan_sp; // If we're in _sigtramp(), unwinding past this frame requires special // knowledge. if (m_frame_type == eTrapHandlerFrame || m_frame_type == eDebuggerFrame) - return nullptr; + return unwind_plan_sp; - if (std::shared_ptr unwind_plan_sp = - func_unwinders_sp->GetUnwindPlanFastUnwind( - *m_thread.CalculateTarget(), m_thread)) { + unwind_plan_sp = func_unwinders_sp->GetUnwindPlanFastUnwind( + *m_thread.CalculateTarget(), m_thread); + if (unwind_plan_sp) { if (unwind_plan_sp->PlanValidAtAddress(m_current_pc)) { m_frame_type = eNormalFrame; return unwind_plan_sp; + } else { + unwind_plan_sp.reset(); } } - return nullptr; + return unwind_plan_sp; } // On entry to this method, @@ -793,9 +793,9 @@ RegisterContextUnwind::GetFastUnwindPlanForFrame() { // 4. m_current_offset_backed_up_one should have the current byte offset into // the function, maybe backed up by 1, std::nullopt if unknown -std::shared_ptr -RegisterContextUnwind::GetFullUnwindPlanForFrame() { - std::shared_ptr arch_default_unwind_plan_sp; +UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { + UnwindPlanSP unwind_plan_sp; + UnwindPlanSP arch_default_unwind_plan_sp; ExecutionContext exe_ctx(m_thread.shared_from_this()); Process *process = exe_ctx.GetProcessPtr(); ABI *abi = process ? 
process->GetABI().get() : nullptr; @@ -833,8 +833,9 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { process->GetLoadAddressPermissions(current_pc_addr, permissions) && (permissions & ePermissionsExecutable) == 0)) { if (abi) { + unwind_plan_sp = abi->CreateFunctionEntryUnwindPlan(); m_frame_type = eNormalFrame; - return abi->CreateFunctionEntryUnwindPlan(); + return unwind_plan_sp; } } } @@ -871,29 +872,32 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { DWARFCallFrameInfo *eh_frame = pc_module_sp->GetUnwindTable().GetEHFrameInfo(); if (eh_frame) { - auto unwind_plan_sp = - std::make_shared(lldb::eRegisterKindGeneric); + unwind_plan_sp = std::make_shared(lldb::eRegisterKindGeneric); if (eh_frame->GetUnwindPlan(m_current_pc, *unwind_plan_sp)) return unwind_plan_sp; + else + unwind_plan_sp.reset(); } ArmUnwindInfo *arm_exidx = pc_module_sp->GetUnwindTable().GetArmUnwindInfo(); if (arm_exidx) { - auto unwind_plan_sp = - std::make_shared(lldb::eRegisterKindGeneric); + unwind_plan_sp = std::make_shared(lldb::eRegisterKindGeneric); if (arm_exidx->GetUnwindPlan(exe_ctx.GetTargetRef(), m_current_pc, *unwind_plan_sp)) return unwind_plan_sp; + else + unwind_plan_sp.reset(); } CallFrameInfo *object_file_unwind = pc_module_sp->GetUnwindTable().GetObjectFileUnwindInfo(); if (object_file_unwind) { - auto unwind_plan_sp = - std::make_shared(lldb::eRegisterKindGeneric); + unwind_plan_sp = std::make_shared(lldb::eRegisterKindGeneric); if (object_file_unwind->GetUnwindPlan(m_current_pc, *unwind_plan_sp)) return unwind_plan_sp; + else + unwind_plan_sp.reset(); } return arch_default_unwind_plan_sp; @@ -907,13 +911,15 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { // substitute plan. Otherwise, use eh_frame. if (m_sym_ctx_valid) { lldb::PlatformSP platform = process->GetTarget().GetPlatform(); - if (auto unwind_plan_sp = platform->GetTrapHandlerUnwindPlan( - process->GetTarget().GetArchitecture().GetTriple(), - GetSymbolOrFunctionName(m_sym_ctx))) + unwind_plan_sp = platform->GetTrapHandlerUnwindPlan( + process->GetTarget().GetArchitecture().GetTriple(), + GetSymbolOrFunctionName(m_sym_ctx)); + + if (unwind_plan_sp) return unwind_plan_sp; } - auto unwind_plan_sp = + unwind_plan_sp = func_unwinders_sp->GetEHFrameUnwindPlan(process->GetTarget()); if (!unwind_plan_sp) unwind_plan_sp = @@ -938,7 +944,7 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { // normally we would call GetUnwindPlanAtCallSite() -- because CallSite may // return an unwind plan sourced from either eh_frame (that's what we // intend) or compact unwind (this won't work) - auto unwind_plan_sp = + unwind_plan_sp = func_unwinders_sp->GetEHFrameUnwindPlan(process->GetTarget()); if (!unwind_plan_sp) unwind_plan_sp = @@ -954,7 +960,7 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { // Typically the NonCallSite UnwindPlan is the unwind created by inspecting // the assembly language instructions if (m_behaves_like_zeroth_frame && process) { - auto unwind_plan_sp = func_unwinders_sp->GetUnwindPlanAtNonCallSite( + unwind_plan_sp = func_unwinders_sp->GetUnwindPlanAtNonCallSite( process->GetTarget(), m_thread); if (unwind_plan_sp && unwind_plan_sp->PlanValidAtAddress(m_current_pc)) { if (unwind_plan_sp->GetSourcedFromCompiler() == eLazyBoolNo) { @@ -969,7 +975,7 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { // assembly code it is often written in a way that it valid at all // location what helps in the most common cases when the instruction // emulation fails. 
- std::shared_ptr<UnwindPlan> call_site_unwind_plan = + UnwindPlanSP call_site_unwind_plan = func_unwinders_sp->GetUnwindPlanAtCallSite(process->GetTarget(), m_thread); if (call_site_unwind_plan && @@ -1004,7 +1010,6 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { } } - std::shared_ptr<UnwindPlan> unwind_plan_sp; // Typically this is unwind info from an eh_frame section intended for // exception handling; only valid at call sites if (process) { @@ -1037,7 +1042,7 @@ RegisterContextUnwind::GetFullUnwindPlanForFrame() { // sites then the architecture default plan and for hand written assembly // code it is often written in a way that it valid at all location what // helps in the most common cases when the instruction emulation fails. - std::shared_ptr<UnwindPlan> call_site_unwind_plan = + UnwindPlanSP call_site_unwind_plan = func_unwinders_sp->GetUnwindPlanAtCallSite(process->GetTarget(), m_thread); if (call_site_unwind_plan && @@ -1781,8 +1786,7 @@ bool RegisterContextUnwind::TryFallbackUnwindPlan() { // Switch the full UnwindPlan to be the fallback UnwindPlan. If we decide // this isn't working, we need to restore. We'll also need to save & restore // the value of the m_cfa ivar. Save is down below a bit in 'old_cfa'. - std::shared_ptr<UnwindPlan> original_full_unwind_plan_sp = - m_full_unwind_plan_sp; + UnwindPlanSP original_full_unwind_plan_sp = m_full_unwind_plan_sp; addr_t old_cfa = m_cfa; addr_t old_afa = m_afa; @@ -1911,7 +1915,7 @@ bool RegisterContextUnwind::ForceSwitchToFallbackUnwindPlan() { } void RegisterContextUnwind::PropagateTrapHandlerFlagFromUnwindPlan( - std::shared_ptr<UnwindPlan> unwind_plan) { + lldb::UnwindPlanSP unwind_plan) { if (unwind_plan->GetUnwindPlanForSignalTrap() != eLazyBoolYes) { // Unwind plan does not indicate trap handler. Do nothing. We may // already be flagged as trap handler flag due to the symbol being From 662d385c7b07fc6aba51e73a09c254f551ab93ab Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 3 Apr 2025 11:59:02 +0200 Subject: [PATCH 0502/1029] [lldb/telemetry] Report exit status only once (#134078) SetExitStatus can be called a second time when we reap the debug server process. This shouldn't be interesting, as at that point we've already told everyone that the process has exited. I believe/hope this will also help with sporadic shutdown crashes that have cropped up recently. They happen because the debug server is monitored from a detached thread, so this code can be called after main returns (and starts destroying everything). This isn't a real fix for that though, as the situation can still happen (it's just that it usually happens after the exit status has already been set). I think the real fix for that is to make sure these threads terminate before we start shutting everything down. --- lldb/source/Target/Process.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 369933234ccca..7936cf28467b2 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -1067,6 +1067,20 @@ const char *Process::GetExitDescription() { bool Process::SetExitStatus(int status, llvm::StringRef exit_string) { // Use a mutex to protect setting the exit status.
std::lock_guard<std::mutex> guard(m_exit_status_mutex); + Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); + LLDB_LOG(log, "(plugin = {0} status = {1} ({1:x8}), description=\"{2}\")", + GetPluginName(), status, exit_string); + + // We were already in the exited state + if (m_private_state.GetValue() == eStateExited) { + LLDB_LOG( + log, + "(plugin = {0}) ignoring exit status because state was already set " + "to eStateExited", + GetPluginName()); + return false; + } + telemetry::ScopedDispatcher<telemetry::ProcessExitInfo> helper; UUID module_uuid; @@ -1089,20 +1103,6 @@ bool Process::SetExitStatus(int status, llvm::StringRef exit_string) { info->pid = m_pid; }); - Log *log(GetLog(LLDBLog::State | LLDBLog::Process)); - LLDB_LOG(log, "(plugin = {0} status = {1} ({1:x8}), description=\"{2}\")", - GetPluginName(), status, exit_string); - - // We were already in the exited state - if (m_private_state.GetValue() == eStateExited) { - LLDB_LOG( - log, - "(plugin = {0}) ignoring exit status because state was already set " - "to eStateExited", - GetPluginName()); - return false; - } - m_exit_status = status; if (!exit_string.empty()) m_exit_string = exit_string.str(); From 2e7ed78cff0ad3e3535443ce8c0c3c0e0925ff73 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Thu, 3 Apr 2025 11:00:29 +0100 Subject: [PATCH 0503/1029] [mlir][spirv] Add instruction OpGroupNonUniformRotateKHR (#133428) Add an instruction under the extension SPV_KHR_subgroup_rotate. The specification for the extension is here: https://github.khronos.org/SPIRV-Registry/extensions/KHR/SPV_KHR_subgroup_rotate.html --- .../mlir/Dialect/SPIRV/IR/SPIRVBase.td | 4 +- .../Dialect/SPIRV/IR/SPIRVNonUniformOps.td | 74 +++++++++++++++++++ mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp | 23 ++++++ .../Dialect/SPIRV/IR/non-uniform-ops.mlir | 67 +++++++++++++++++ 4 files changed, 167 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index d5359da2a590e..cd5d201c3d5da 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -4489,6 +4489,7 @@ def SPIRV_OC_OpGroupNonUniformBitwiseXor : I32EnumAttrCase<"OpGroupNonUnifo def SPIRV_OC_OpGroupNonUniformLogicalAnd : I32EnumAttrCase<"OpGroupNonUniformLogicalAnd", 362>; def SPIRV_OC_OpGroupNonUniformLogicalOr : I32EnumAttrCase<"OpGroupNonUniformLogicalOr", 363>; def SPIRV_OC_OpGroupNonUniformLogicalXor : I32EnumAttrCase<"OpGroupNonUniformLogicalXor", 364>; +def SPIRV_OC_OpGroupNonUniformRotateKHR : I32EnumAttrCase<"OpGroupNonUniformRotateKHR", 4431>; def SPIRV_OC_OpSubgroupBallotKHR : I32EnumAttrCase<"OpSubgroupBallotKHR", 4421>; def SPIRV_OC_OpSDot : I32EnumAttrCase<"OpSDot", 4450>; def SPIRV_OC_OpUDot : I32EnumAttrCase<"OpUDot", 4451>; @@ -4598,7 +4599,8 @@ def SPIRV_OpcodeAttr : SPIRV_OC_OpGroupNonUniformFMax, SPIRV_OC_OpGroupNonUniformBitwiseAnd, SPIRV_OC_OpGroupNonUniformBitwiseOr, SPIRV_OC_OpGroupNonUniformBitwiseXor, SPIRV_OC_OpGroupNonUniformLogicalAnd, SPIRV_OC_OpGroupNonUniformLogicalOr, - SPIRV_OC_OpGroupNonUniformLogicalXor, SPIRV_OC_OpSubgroupBallotKHR, + SPIRV_OC_OpGroupNonUniformLogicalXor, SPIRV_OC_OpGroupNonUniformRotateKHR, + SPIRV_OC_OpSubgroupBallotKHR, SPIRV_OC_OpSDot, SPIRV_OC_OpUDot, SPIRV_OC_OpSUDot, SPIRV_OC_OpSDotAccSat, SPIRV_OC_OpUDotAccSat, SPIRV_OC_OpSUDotAccSat, SPIRV_OC_OpTypeCooperativeMatrixKHR, SPIRV_OC_OpCooperativeMatrixLoadKHR, diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td index
98e435c18d3d7..2dd3dbd28d436 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVNonUniformOps.td @@ -1361,4 +1361,78 @@ def SPIRV_GroupNonUniformBallotBitCountOp : SPIRV_Op<"GroupNonUniformBallotBitCo // ----- +def SPIRV_GroupNonUniformRotateKHROp : SPIRV_Op<"GroupNonUniformRotateKHR", [ + Pure, AllTypesMatch<["value", "result"]>]> { + let summary = [{ + Rotate values across invocations within a subgroup. + }]; + + let description = [{ + Return the Value of the invocation whose id within the group is calculated + as follows: + + LocalId = SubgroupLocalInvocationId if Execution is Subgroup or + LocalInvocationId if Execution is Workgroup + RotationGroupSize = ClusterSize when ClusterSize is present, otherwise + RotationGroupSize = SubgroupMaxSize if the Kernel capability is declared + and SubgroupSize if not. + Invocation ID = ( (LocalId + Delta) & (RotationGroupSize - 1) ) + + (LocalId & ~(RotationGroupSize - 1)) + + Result Type must be a scalar or vector of floating-point type, integer + type, or Boolean type. + + Execution is a Scope. It must be either Workgroup or Subgroup. + + The type of Value must be the same as Result Type. + + Delta must be a scalar of integer type, whose Signedness operand is 0. + Delta must be dynamically uniform within Execution. + + Delta is treated as unsigned and the resulting value is undefined if the + selected lane is inactive. + + ClusterSize is the size of cluster to use. ClusterSize must be a scalar of + integer type, whose Signedness operand is 0. ClusterSize must come from a + constant instruction. Behavior is undefined unless ClusterSize is at least + 1 and a power of 2. If ClusterSize is greater than the declared + SubGroupSize, executing this instruction results in undefined behavior. + + + + #### Example: + + ```mlir + %four = spirv.Constant 4 : i32 + %0 = spirv.GroupNonUniformRotateKHR <Subgroup>, %value, %delta : f32, i32 -> f32 + %1 = spirv.GroupNonUniformRotateKHR <Workgroup>, %value, %delta, + cluster_size(%four) : f32, i32, i32 -> f32 + ``` + }]; + + let availability = [ + MinVersion, + MaxVersion, + Extension<[]>, + Capability<[SPIRV_C_GroupNonUniformRotateKHR]> + ]; + + let arguments = (ins + SPIRV_ScopeAttr:$execution_scope, + AnyTypeOf<[SPIRV_ScalarOrVectorOf<SPIRV_Float>, SPIRV_ScalarOrVectorOf<SPIRV_Integer>, SPIRV_ScalarOrVectorOf<SPIRV_Bool>]>:$value, + SPIRV_SignlessOrUnsignedInt:$delta, + Optional<SPIRV_SignlessOrUnsignedInt>:$cluster_size + ); + + let results = (outs + AnyTypeOf<[SPIRV_ScalarOrVectorOf<SPIRV_Float>, SPIRV_ScalarOrVectorOf<SPIRV_Integer>, SPIRV_ScalarOrVectorOf<SPIRV_Bool>]>:$result + ); + + let assemblyFormat = [{ + $execution_scope `,` $value `,` $delta (`,` `cluster_size` `(` $cluster_size^ `)`)? attr-dict `:` type($value) `,` type($delta) (`,` type($cluster_size)^)?
`->` type(results) + }]; +} + +// ----- + #endif // MLIR_DIALECT_SPIRV_IR_NON_UNIFORM_OPS diff --git a/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp b/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp index 8aeafda0eb755..461d037134dae 100644 --- a/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/GroupOps.cpp @@ -304,6 +304,29 @@ LogicalResult GroupNonUniformLogicalXorOp::verify() { return verifyGroupNonUniformArithmeticOp(*this); } +//===----------------------------------------------------------------------===// +// spirv.GroupNonUniformRotateKHR +//===----------------------------------------------------------------------===// + +LogicalResult GroupNonUniformRotateKHROp::verify() { + spirv::Scope scope = getExecutionScope(); + if (scope != spirv::Scope::Workgroup && scope != spirv::Scope::Subgroup) + return emitOpError("execution scope must be 'Workgroup' or 'Subgroup'"); + + if (Value clusterSizeVal = getClusterSize()) { + mlir::Operation *defOp = clusterSizeVal.getDefiningOp(); + int32_t clusterSize = 0; + + if (failed(extractValueFromConstOp(defOp, clusterSize))) + return emitOpError("cluster size operand must come from a constant op"); + + if (!llvm::isPowerOf2_32(clusterSize)) + return emitOpError("cluster size operand must be a power of two"); + } + + return success(); +} + //===----------------------------------------------------------------------===// // Group op verification //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir index 60ae1584d29fb..bf383d3837b6e 100644 --- a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir @@ -604,3 +604,70 @@ func.func @group_non_uniform_logical_xor(%val: i32) -> i32 { %0 = spirv.GroupNonUniformLogicalXor %val : i32 -> i32 return %0: i32 } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.GroupNonUniformRotateKHR +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @group_non_uniform_rotate_khr +func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32) -> f32 { + // CHECK: %{{.+}} = spirv.GroupNonUniformRotateKHR <Subgroup>, %{{.+}} : f32, i32 -> f32 + %0 = spirv.GroupNonUniformRotateKHR <Subgroup>, %val, %delta : f32, i32 -> f32 + return %0: f32 +} + +// ----- + +// CHECK-LABEL: @group_non_uniform_rotate_khr +func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32) -> f32 { + // CHECK: %{{.+}} = spirv.GroupNonUniformRotateKHR <Subgroup>, %{{.+}} : f32, i32, i32 -> f32 + %four = spirv.Constant 4 : i32 + %0 = spirv.GroupNonUniformRotateKHR <Subgroup>, %val, %delta, cluster_size(%four) : f32, i32, i32 -> f32 + return %0: f32 +} + +// ----- + +func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32) -> f32 { + %four = spirv.Constant 4 : i32 + // expected-error @+1 {{execution scope must be 'Workgroup' or 'Subgroup'}} + %0 = spirv.GroupNonUniformRotateKHR <Device>, %val, %delta, cluster_size(%four) : f32, i32, i32 -> f32 + return %0: f32 +} + +// ----- + +func.func @group_non_uniform_rotate_khr(%val: f32, %delta: si32) -> f32 { + %four = spirv.Constant 4 : i32 + // expected-error @+1 {{op operand #1 must be 8/16/32/64-bit signless/unsigned integer, but got 'si32'}} + %0 = spirv.GroupNonUniformRotateKHR <Subgroup>, %val, %delta, cluster_size(%four) : f32, si32, i32 -> f32 + return %0: f32 +} + +// ----- + +func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32) -> f32 { + %four =
spirv.Constant 4 : si32 + // expected-error @+1 {{op operand #2 must be 8/16/32/64-bit signless/unsigned integer, but got 'si32'}} + %0 = spirv.GroupNonUniformRotateKHR <Subgroup>, %val, %delta, cluster_size(%four) : f32, i32, si32 -> f32 + return %0: f32 +} + +// ----- + +func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32, %four: i32) -> f32 { + // expected-error @+1 {{cluster size operand must come from a constant op}} + %0 = spirv.GroupNonUniformRotateKHR <Subgroup>, %val, %delta, cluster_size(%four) : f32, i32, i32 -> f32 + return %0: f32 +} + +// ----- + +func.func @group_non_uniform_rotate_khr(%val: f32, %delta: i32) -> f32 { + %five = spirv.Constant 5 : i32 + // expected-error @+1 {{cluster size operand must be a power of two}} + %0 = spirv.GroupNonUniformRotateKHR <Subgroup>, %val, %delta, cluster_size(%five) : f32, i32, i32 -> f32 + return %0: f32 +} From bf516098fb7c7d428cae03296b92766467f76c9e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Apr 2025 11:01:08 +0100 Subject: [PATCH 0504/1029] [X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded (#133923) With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when we only need the lower elements. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 63 ++++++++++++ .../any_extend_vector_inreg_of_broadcast.ll | 46 ++++----- ...d_vector_inreg_of_broadcast_from_memory.ll | 8 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 12 +-- .../vector-interleaved-load-i16-stride-5.ll | 76 ++++++++------- .../vector-interleaved-store-i64-stride-5.ll | 32 +++---- .../vector-interleaved-store-i64-stride-6.ll | 96 +++++++++---------- .../vector-shuffle-combining-avx512bwvl.ll | 6 +- .../zero_extend_vector_inreg_of_broadcast.ll | 22 ++--- ...d_vector_inreg_of_broadcast_from_memory.ll | 8 +- 10 files changed, 217 insertions(+), 152 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 52c254c3dd045..34716929f61f1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43812,6 +43812,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } break; } + case X86ISD::VPERMV: { + SmallVector<int, 16> Mask; + SmallVector<SDValue, 2> Ops; + if ((VT.is256BitVector() || Subtarget.hasVLX()) && + getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) { + // For lane-crossing shuffles, only split in half in case we're still + // referencing higher elements. + unsigned HalfElts = NumElts / 2; + unsigned HalfSize = SizeInBits / 2; + Mask.resize(HalfElts); + if (all_of(Mask, + [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) { + MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT(); + SDLoc DL(Op); + SDValue Ext; + SDValue M = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize); + SDValue V = + extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize); + // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
+ if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16) + Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V); + else + Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M); + SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false, + Subtarget, TLO.DAG, DL, SizeInBits); + return TLO.CombineTo(Op, Insert); + } + } + break; + } + case X86ISD::VPERMV3: { + SmallVector<int, 16> Mask; + SmallVector<SDValue, 2> Ops; + if (Subtarget.hasVLX() && + getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) { + // For lane-crossing shuffles, only split in half in case we're still + // referencing higher elements. + unsigned HalfElts = NumElts / 2; + unsigned HalfSize = SizeInBits / 2; + Mask.resize(HalfElts); + if (all_of(Mask, [&](int M) { + return isUndefOrInRange(M, 0, HalfElts) || + isUndefOrInRange(M, NumElts, NumElts + HalfElts); + })) { + // Adjust mask elements for 2nd operand to point to half width. + for (int &M : Mask) + M = M <= NumElts ? M : (M - HalfElts); + MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT(); + MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger(); + SDLoc DL(Op); + SDValue Ext = TLO.DAG.getNode( + Opc, DL, HalfVT, + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize), + getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true), + extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize)); + SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false, + Subtarget, TLO.DAG, DL, SizeInBits); + return TLO.CombineTo(Op, Insert); + } + } + break; + } case X86ISD::VPERM2X128: { // Simplify VPERM2F128/VPERM2I128 to extract_subvector. SDLoc DL(Op); diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 6f4e7abda8b00..b075d48627b18 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] +; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] +; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] +; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, 
%ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] +; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 52f856befa130..61e122b1aba36 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll index 26af46263c0e2..a84466bc1ca1a 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-FAST: # %bb.0: -; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79] -; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79] +; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2 ; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 ; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax @@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-SLOW: # %bb.0: -; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79] -; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15] +; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2 +; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2 ; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax ; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx ; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx -; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0 +; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 739e6e2369e36..9b19ec15c6f55 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -593,100 +593,104 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride5_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-NEXT: 
vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm1, (%r9) +; AVX512BW-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; 
AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index 05c111ae5049f..f41123c5c3cfd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -123,8 +123,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa %xmm3, 64(%r9) @@ -140,8 +140,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -157,8 +157,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm3, 64(%r9) @@ -174,8 +174,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -191,8 +191,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -208,8 +208,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -225,8 +225,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -242,8 +242,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index c2f1723d8031e..aac6a1bddd08a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -139,12 +139,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -158,12 +158,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -177,12 +177,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -196,12 +196,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -215,12 +215,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -234,12 +234,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -253,12 +253,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-BW-NEXT: vpermi2q %ymm2, 
%ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -272,12 +272,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index ec09c3117c77f..f5cd3e580d017 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -187,10 +187,8 @@ define <8 x i32> @concat_vrotlv_v4i32(<4 x i32> %a0, <4 x i32> %a1, <8 x i32> %a define <8 x i16> @demandedelts_vpermvar_32i16_v8i16(<32 x i16> %x0) { ; CHECK-LABEL: demandedelts_vpermvar_32i16_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,0,6,1,5,2,4,3] +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %shuffle = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> <i16 7, i16 0, i16 6, i16 1, i16 5, i16 2, i16 4, i16 3, i16 7, i16 0, i16 6, i16 1, i16 5, i16 2, i16 4, i16 3, i16 7, i16 0, i16 6, i16 1, i16 5, i16 2, i16 4, i16 3, i16 7, i16 0, i16 6, i16 1, i16 5, i16 2, i16 4, i16 3>) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 35f25d36cb2e9..ea0e3b3a2b9aa 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index a598e30845579..a3e2fb5321f32 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) From ecc35456d79aac3fb85fe95e30cdaaca916e8722 Mon Sep 17 00:00:00 2001 From: Camsyn Date: Thu, 3 Apr 2025 18:02:03 +0800 Subject: [PATCH 0505/1029] [Utils] Fix incorrect LCSSA PHI nodes when splitting critical edges with MergeIdenticalEdges (#131744) This PR fixes incorrect LCSSA PHI node generation when splitting critical edges with both `PreserveLCSSA` and `MergeIdenticalEdges` enabled. The bug caused PHI nodes in the split block to miss predecessors when multiple identical edges were merged. --- .../Transforms/Utils/BreakCriticalEdges.cpp | 11 +++- .../Transforms/Utils/BasicBlockUtilsTest.cpp | 57 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index d20902c577d3a..0721358eb03bb 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -207,6 +207,8 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, } } + unsigned NumSplitIdenticalEdges = 1; + // If there are any other edges from TIBB to DestBB, update those to go // through the split block, making those edges non-critical as well (and // reducing the number of phi entries in the DestBB if relevant). @@ -219,6 +221,9 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, // We found another edge to DestBB, go to NewBB instead. TI->setSuccessor(i, NewBB); + + // Record the number of split identical edges to DestBB. + NumSplitIdenticalEdges++; } } @@ -290,7 +295,11 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum, // Update LCSSA form in the newly created exit block. if (Options.PreserveLCSSA) { - createPHIsForSplitLoopExit(TIBB, NewBB, DestBB); + // If > 1 identical edges to be split, we need to introduce the same + // number of the incoming blocks for the new PHINode. 
+    createPHIsForSplitLoopExit(
+        SmallVector<BasicBlock *>(NumSplitIdenticalEdges, TIBB), NewBB,
+        DestBB);
   }
 
   if (!LoopPreds.empty()) {
diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
index 56692cf25b797..c9a6a32851775 100644
--- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp
@@ -438,6 +438,63 @@ define void @crit_edge(i1 %cond0, i1 %cond1) {
   EXPECT_TRUE(PDT.verify());
 }
 
+TEST(BasicBlockUtils, SplitLoopCriticalEdge) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = parseIR(C, R"IR(
+declare i1 @predicate(ptr %p)
+
+define ptr @Parse(ptr %gp) {
+entry:
+  br label %for.inc
+
+for.inc:
+  %phi = phi ptr [ %gp, %entry ], [ %cp, %while.cond ], [ %cp, %while.cond ]
+  %cond = call i1 @predicate(ptr %phi)
+  %inc = getelementptr inbounds i8, ptr %phi, i64 1
+  br i1 %cond, label %while.cond, label %exit
+
+while.cond:
+  %cp = phi ptr [ %inc, %for.inc ], [ %incdec, %while.body ]
+  %val = load i8, ptr %cp, align 1
+  switch i8 %val, label %while.body [
+    i8 10, label %for.inc
+    i8 0, label %for.inc
+  ]
+
+while.body:
+  %incdec = getelementptr inbounds i8, ptr %cp, i64 1
+  br label %while.cond
+
+exit:
+  ret ptr %phi
+}
+)IR");
+  Function *F = M->getFunction("Parse");
+  DominatorTree DT(*F);
+  LoopInfo LI(DT);
+
+  CriticalEdgeSplittingOptions CESO =
+      CriticalEdgeSplittingOptions(nullptr, &LI, nullptr)
+          .setMergeIdenticalEdges()
+          .setPreserveLCSSA();
+  EXPECT_EQ(2u, SplitAllCriticalEdges(*F, CESO));
+
+  BasicBlock *WhileBB = getBasicBlockByName(*F, "while.cond");
+  BasicBlock *SplitBB = WhileBB->getTerminator()->getSuccessor(1);
+  // The only successor of SplitBB is %for.inc.
+  EXPECT_EQ(1u, SplitBB->getTerminator()->getNumSuccessors());
+  // MergeIdenticalEdges: SplitBB has two identical predecessors, %while.cond.
+  EXPECT_EQ(WhileBB, SplitBB->getUniquePredecessor());
+  EXPECT_TRUE(SplitBB->hasNPredecessors(2));
+
+  auto *PN = dyn_cast<PHINode>(&SplitBB->front());
+  // PreserveLCSSA: should insert a PHI node in front of SplitBB.
+  EXPECT_NE(nullptr, PN);
+  // The PHI node should have 2 identical incoming blocks.
+  EXPECT_EQ(2u, PN->getNumIncomingValues());
+  EXPECT_EQ(PN->getIncomingBlock(0), PN->getIncomingBlock(1));
+}
+
 TEST(BasicBlockUtils, SplitIndirectBrCriticalEdgesIgnorePHIs) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"IR(
From 61907ebd764afe75aa7134627f41827e6893d6d0 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Thu, 3 Apr 2025 18:03:42 +0800
Subject: [PATCH 0506/1029] [Clang][CodeGen] Do not use the GEP result to
 infer offset and result type (#134221)

If `CreateConstInBoundsGEP2_32` returns a constant null/gep, the cast to
GetElementPtrInst will fail. This patch uses two static helpers
`GEPOperator::accumulateConstantOffset/GetElementPtrInst::getIndexedType`
to infer offset and result type instead of depending on the GEP result.

This patch is extracted from https://github.com/llvm/llvm-project/pull/130734.
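As a minimal illustration of the failure mode (a sketch only, not part of this
patch; it assumes opaque pointers, the default constant folder, and a
hypothetical helper name): a GEP built over a constant base pointer folds to a
plain `Constant`, so no `GetElementPtrInst` ever exists and the old `cast<>`
would assert.

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include <cassert>

void foldedGEPSketch(llvm::LLVMContext &Ctx) {
  llvm::IRBuilder<> B(Ctx); // no insertion point needed for folded constants
  llvm::StructType *STy =
      llvm::StructType::get(B.getInt32Ty(), B.getInt32Ty());
  llvm::Constant *Null = llvm::ConstantPointerNull::get(B.getPtrTy());
  // The default folder collapses a GEP over a constant base, so the result
  // is a Constant rather than a GetElementPtrInst.
  llvm::Value *V = B.CreateConstInBoundsGEP2_32(STy, Null, 0, 0);
  assert(llvm::isa<llvm::Constant>(V) && "no GetElementPtrInst was created");
}
```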
---
 clang/lib/CodeGen/CGBuilder.h | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
index b8036cf6e6a30..090f75d3b5d3c 100644
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -64,21 +64,25 @@ class CGBuilderTy : public CGBuilderBaseTy {
   Address createConstGEP2_32(Address Addr, unsigned Idx0, unsigned Idx1,
                              const llvm::Twine &Name) {
     const llvm::DataLayout &DL = BB->getDataLayout();
-    llvm::GetElementPtrInst *GEP;
+    llvm::Value *V;
     if (IsInBounds)
-      GEP = cast<llvm::GetElementPtrInst>(CreateConstInBoundsGEP2_32(
-          Addr.getElementType(), emitRawPointerFromAddress(Addr), Idx0, Idx1,
-          Name));
+      V = CreateConstInBoundsGEP2_32(Addr.getElementType(),
+                                     emitRawPointerFromAddress(Addr), Idx0,
+                                     Idx1, Name);
     else
-      GEP = cast<llvm::GetElementPtrInst>(CreateConstGEP2_32(
-          Addr.getElementType(), emitRawPointerFromAddress(Addr), Idx0, Idx1,
-          Name));
+      V = CreateConstGEP2_32(Addr.getElementType(),
+                             emitRawPointerFromAddress(Addr), Idx0, Idx1, Name);
     llvm::APInt Offset(
         DL.getIndexSizeInBits(Addr.getType()->getPointerAddressSpace()), 0,
         /*isSigned=*/true);
-    if (!GEP->accumulateConstantOffset(DL, Offset))
-      llvm_unreachable("offset of GEP with constants is always computable");
-    return Address(GEP, GEP->getResultElementType(),
+    if (!llvm::GEPOperator::accumulateConstantOffset(
+            Addr.getElementType(), {getInt32(Idx0), getInt32(Idx1)}, DL,
+            Offset))
+      llvm_unreachable(
+          "accumulateConstantOffset with constant indices should not fail.");
+    llvm::Type *ElementTy = llvm::GetElementPtrInst::getIndexedType(
+        Addr.getElementType(), {Idx0, Idx1});
+    return Address(V, ElementTy,
                    Addr.getAlignment().alignmentAtOffset(
                        CharUnits::fromQuantity(Offset.getSExtValue())),
                    IsInBounds ? Addr.isKnownNonNull() : NotKnownNonNull);
From 554f4d1a5769357ee8438c23f572d595c720ff3c Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Thu, 3 Apr 2025 11:10:16 +0100
Subject: [PATCH 0507/1029] [lldb][Target] RunThreadPlan to save/restore the
 ExecutionContext's frame if one exists (#134097)

When using `SBFrame::EvaluateExpression` on a frame that's not the
currently selected frame, we would sometimes run into errors such as:
```
error: error: The context has changed before we could JIT the expression!
error: errored out in DoExecute, couldn't PrepareToExecuteJITExpression
```

During expression parsing, we call `RunStaticInitializers`. On our
internal fork this happens quite frequently because any usage of, e.g.,
function pointers, will inject ptrauth fixup code into the expression.
The static initializers are run using `RunThreadPlan`. The
`ExecutionContext::m_frame_sp` going into the `RunThreadPlan` is the
`SBFrame` that we called `EvaluateExpression` on. LLDB then tries to
save this frame to restore it after the thread-plan ran (the restore
occurs by unconditionally overwriting whatever is in
`ExecutionContext::m_frame_sp`). However, if the `selected_frame_sp` is
not the same as the `SBFrame`, then `RunThreadPlan` would set the
`ExecutionContext`'s frame to a different frame than what we started
with. When we `PrepareToExecuteJITExpression`, LLDB checks whether the
`ExecutionContext` frame changed from when we initially
`EvaluateExpression`, and if it did, bails out with the error above.

One such test-case is attached. This currently passes regardless of the
fix because our ptrauth static initializers code isn't upstream yet.
But the plan is to upstream it soon.
This patch addresses the issue by saving/restoring the frame of the incoming `ExecutionContext`, if such frame exists. Otherwise, fall back to using the selected frame. rdar://147456589 --- lldb/source/Target/Process.cpp | 8 ++++- .../expr-from-non-zero-frame/Makefile | 3 ++ .../TestExprFromNonZeroFrame.py | 30 +++++++++++++++++++ .../expr-from-non-zero-frame/main.c | 6 ++++ 4 files changed, 46 insertions(+), 1 deletion(-) create mode 100644 lldb/test/API/commands/expression/expr-from-non-zero-frame/Makefile create mode 100644 lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py create mode 100644 lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 7936cf28467b2..2adda309dea9c 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -5080,7 +5080,13 @@ Process::RunThreadPlan(ExecutionContext &exe_ctx, return eExpressionSetupError; } - StackID ctx_frame_id = selected_frame_sp->GetStackID(); + // If the ExecutionContext has a frame, we want to make sure to save/restore + // that frame into exe_ctx. This can happen when we run expressions from a + // non-selected SBFrame, in which case we don't want some thread-plan + // to overwrite the ExecutionContext frame. + StackID ctx_frame_id = exe_ctx.HasFrameScope() + ? exe_ctx.GetFrameRef().GetStackID() + : selected_frame_sp->GetStackID(); // N.B. Running the target may unset the currently selected thread and frame. // We don't want to do that either, so we should arrange to reset them as diff --git a/lldb/test/API/commands/expression/expr-from-non-zero-frame/Makefile b/lldb/test/API/commands/expression/expr-from-non-zero-frame/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/commands/expression/expr-from-non-zero-frame/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py new file mode 100644 index 0000000000000..9b1bcfb177765 --- /dev/null +++ b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py @@ -0,0 +1,30 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class ExprFromNonZeroFrame(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def test(self): + """ + Tests that we can use SBFrame::EvaluateExpression on a frame + that we're not stopped in, even if thread-plans run as part of + parsing the expression (e.g., when running static initializers). + """ + self.build() + + (_, _, thread, _) = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.c") + ) + frame = thread.GetFrameAtIndex(1) + + # Using a function pointer inside the expression ensures we + # emit a ptrauth static initializer on arm64e into the JITted + # expression. The thread-plan that runs for this static + # initializer should save/restore the current execution context + # frame (which in this test is frame #1). 
+ result = frame.EvaluateExpression("int (*fptr)() = &func; fptr()") + self.assertTrue(result.GetError().Success()) + self.assertEqual(result.GetValueAsSigned(), 5) diff --git a/lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c b/lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c new file mode 100644 index 0000000000000..d1675525cf4f2 --- /dev/null +++ b/lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c @@ -0,0 +1,6 @@ +int func(void) { + __builtin_printf("Break here"); + return 5; +} + +int main(int argc, const char *argv[]) { return func(); } From 6c27817294d96705ffd005aea52494ea40b1ef74 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 3 Apr 2025 11:14:08 +0100 Subject: [PATCH 0508/1029] [SelectionDAG] Use SimplifyDemandedBits from SimplifyDemandedVectorElts Bitcast. (#133717) This adds a call to SimplifyDemandedBits from bitcasts with scalar input types in SimplifyDemandedVectorElts, which can help simplify the input scalar. --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 19 ++- llvm/test/CodeGen/AArch64/bitcast-extend.ll | 68 ++++++--- llvm/test/CodeGen/Thumb2/mve-vdup.ll | 4 +- .../WebAssembly/simd-shuffle-bitcast.ll | 4 +- llvm/test/CodeGen/X86/kmov.ll | 136 +++++++----------- .../CodeGen/X86/vector-reduce-fmax-nnan.ll | 1 - 6 files changed, 117 insertions(+), 115 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 10006a9d76785..0f38bbd46cbca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3163,10 +3163,23 @@ bool TargetLowering::SimplifyDemandedVectorElts( SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); - // We only handle vectors here. - // TODO - investigate calling SimplifyDemandedBits/ComputeKnownBits? - if (!SrcVT.isVector()) + if (!SrcVT.isVector()) { + // TODO - bigendian once we have test coverage. + if (IsLE) { + APInt DemandedSrcBits = APInt::getZero(SrcVT.getSizeInBits()); + unsigned EltSize = VT.getScalarSizeInBits(); + for (unsigned I = 0; I != NumElts; ++I) { + if (DemandedElts[I]) { + unsigned Offset = I * EltSize; + DemandedSrcBits.setBits(Offset, Offset + EltSize); + } + } + KnownBits Known; + if (SimplifyDemandedBits(Src, DemandedSrcBits, Known, TLO, Depth + 1)) + return true; + } break; + } // Fast handling of 'identity' bitcasts. 
unsigned NumSrcElts = SrcVT.getVectorNumElements(); diff --git a/llvm/test/CodeGen/AArch64/bitcast-extend.ll b/llvm/test/CodeGen/AArch64/bitcast-extend.ll index 195c740022d10..85daa3ca6623e 100644 --- a/llvm/test/CodeGen/AArch64/bitcast-extend.ll +++ b/llvm/test/CodeGen/AArch64/bitcast-extend.ll @@ -217,17 +217,28 @@ define <4 x i64> @s_i32_v4i64(i32 %x) { } define void @extractbitcastext(i32 %bytes, ptr %output) { -; CHECK-LABEL: extractbitcastext: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll v1.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extractbitcastext: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-SD-NEXT: stp q1, q0, [x1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extractbitcastext: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtw x8, w0 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: ushll v1.2d, v0.2s, #0 +; CHECK-GI-NEXT: ushll2 v0.2d, v0.4s, #0 +; CHECK-GI-NEXT: stp q1, q0, [x1] +; CHECK-GI-NEXT: ret %conv = sext i32 %bytes to i64 %b0 = bitcast i64 %conv to <8 x i8> %b1 = zext <8 x i8> %b0 to <8 x i16> @@ -244,17 +255,28 @@ define void @extractbitcastext(i32 %bytes, ptr %output) { } define void @extractbitcastext_s(i32 %bytes, ptr %output) { -; CHECK-LABEL: extractbitcastext_s: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: fmov d0, x8 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-NEXT: sshll v1.2d, v0.2s, #0 -; CHECK-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: stp q1, q0, [x1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extractbitcastext_s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-SD-NEXT: sshll2 v0.2d, v0.4s, #0 +; CHECK-SD-NEXT: stp q1, q0, [x1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extractbitcastext_s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-GI-NEXT: sxtw x8, w0 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll v1.2d, v0.2s, #0 +; CHECK-GI-NEXT: sshll2 v0.2d, v0.4s, #0 +; CHECK-GI-NEXT: stp q1, q0, [x1] +; CHECK-GI-NEXT: ret %conv = sext i32 %bytes to i64 %b0 = bitcast i64 %conv to <8 x i8> %b1 = sext <8 x i8> %b0 to <8 x i16> @@ -271,3 +293,5 @@ define void @extractbitcastext_s(i32 %bytes, ptr %output) { } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll index 9ba3866ad4730..77fa9f297e678 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -371,7 +371,7 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_i64_v8i16(i64 %a) { ; CHECK-LE: @ %bb.0: ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: strd r0, r1, [sp] +; CHECK-LE-NEXT: str r0, [sp] ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrh.u32 q0, [r0] ; CHECK-LE-NEXT: vmov r0, s0 @@ -420,7 +420,7 @@ define arm_aapcs_vfpcc <8 x i16> @bitcast_i64_v8i16_lane1(i64 %a) { ; CHECK-LE: @ %bb.0: ; CHECK-LE-NEXT: .pad #8 ; CHECK-LE-NEXT: sub sp, #8 -; CHECK-LE-NEXT: strd r0, r1, [sp] +; CHECK-LE-NEXT: str r0, [sp] ; CHECK-LE-NEXT: mov r0, sp ; CHECK-LE-NEXT: vldrh.u32 q0, [r0] ; CHECK-LE-NEXT: vmov r0, s1 diff --git a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll index 1f539f1652004..4eca61d08af7f 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-shuffle-bitcast.ll @@ -27,8 +27,8 @@ define <2 x i2> @i2x2_splat(i1 %x) { ; CHECK-LABEL: not_a_vec: ; CHECK-NEXT: .functype not_a_vec (i64, i64) -> (v128){{$}} -; CHECK-NEXT: i32.wrap_i64 $push[[L:[0-9]+]]=, $0 -; CHECK-NEXT: i32x4.splat $push[[R:[0-9]+]]=, $pop[[L]] +; CHECK-NEXT: i64x2.splat $push[[L:[0-9]+]]=, $0 +; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $pop[[L]], $2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ; CHECK-NEXT: return $pop[[R]] define <4 x i32> @not_a_vec(i128 %x) { %a = bitcast i128 %x to <4 x i32> diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll index 55fb2527722a4..5e31baa1ec72f 100644 --- a/llvm/test/CodeGen/X86/kmov.ll +++ b/llvm/test/CodeGen/X86/kmov.ll @@ -386,36 +386,28 @@ define <32 x i1> @invert_i32_mask_extract_32(i32 %mask) { define <32 x i1> @i64_mask_extract_32(i64 %mask) { ; X64-AVX512-LABEL: i64_mask_extract_32: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: movq %rdi, %rax -; X64-AVX512-NEXT: kmovd %eax, %k0 -; X64-AVX512-NEXT: movzbl %ah, %ecx -; X64-AVX512-NEXT: kmovd %ecx, %k1 -; X64-AVX512-NEXT: kunpckbw %k0, %k1, %k0 -; X64-AVX512-NEXT: movl %eax, %ecx -; X64-AVX512-NEXT: shrl $24, %ecx -; X64-AVX512-NEXT: kmovd %ecx, %k1 -; X64-AVX512-NEXT: shrl $16, %eax -; X64-AVX512-NEXT: movzbl %al, %eax -; X64-AVX512-NEXT: kmovd %eax, %k2 -; X64-AVX512-NEXT: kunpckbw %k2, %k1, %k1 -; X64-AVX512-NEXT: kunpckwd %k0, %k1, %k0 +; X64-AVX512-NEXT: kmovq %rdi, %k0 +; X64-AVX512-NEXT: kshiftrd $8, %k0, %k1 +; X64-AVX512-NEXT: kunpckbw %k0, %k1, %k1 +; X64-AVX512-NEXT: kshiftrd $16, %k0, %k2 +; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0 +; X64-AVX512-NEXT: kunpckbw %k2, %k0, %k0 +; X64-AVX512-NEXT: kunpckwd %k1, %k0, %k0 ; X64-AVX512-NEXT: vpmovm2b %k0, %ymm0 ; X64-AVX512-NEXT: retq ; ; X64-KNL-LABEL: i64_mask_extract_32: ; X64-KNL: # %bb.0: -; X64-KNL-NEXT: movq %rdi, %rax -; X64-KNL-NEXT: movl %eax, %ecx +; X64-KNL-NEXT: movl %edi, %eax +; X64-KNL-NEXT: shrl $16, %eax ; X64-KNL-NEXT: kmovw %eax, %k0 -; X64-KNL-NEXT: movzbl %ah, %edx -; X64-KNL-NEXT: # kill: def $eax killed $eax killed $rax +; X64-KNL-NEXT: movl %edi, %eax ; X64-KNL-NEXT: shrl $24, %eax ; X64-KNL-NEXT: kmovw %eax, %k1 -; X64-KNL-NEXT: shrl $16, %ecx -; X64-KNL-NEXT: movzbl %cl, %eax -; X64-KNL-NEXT: kmovw %eax, %k2 -; X64-KNL-NEXT: kunpckbw %k2, %k1, %k1 -; X64-KNL-NEXT: kmovw %edx, %k2 +; X64-KNL-NEXT: kunpckbw %k0, %k1, 
%k1 +; X64-KNL-NEXT: kmovw %edi, %k0 +; X64-KNL-NEXT: shrl $8, %edi +; X64-KNL-NEXT: kmovw %edi, %k2 ; X64-KNL-NEXT: kunpckbw %k0, %k2, %k2 ; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 ; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0 @@ -480,82 +472,56 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) { define <64 x i1> @i64_mask_extract_64(i64 %mask) { ; X64-AVX512-LABEL: i64_mask_extract_64: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: movq %rdi, %rax -; X64-AVX512-NEXT: kmovd %eax, %k0 -; X64-AVX512-NEXT: movzbl %ah, %ecx -; X64-AVX512-NEXT: kmovd %ecx, %k1 -; X64-AVX512-NEXT: kunpckbw %k0, %k1, %k0 -; X64-AVX512-NEXT: movl %eax, %ecx -; X64-AVX512-NEXT: shrl $24, %ecx -; X64-AVX512-NEXT: kmovd %ecx, %k1 -; X64-AVX512-NEXT: movl %eax, %ecx -; X64-AVX512-NEXT: shrl $16, %ecx -; X64-AVX512-NEXT: movzbl %cl, %ecx -; X64-AVX512-NEXT: kmovd %ecx, %k2 -; X64-AVX512-NEXT: kunpckbw %k2, %k1, %k1 -; X64-AVX512-NEXT: kunpckwd %k0, %k1, %k0 -; X64-AVX512-NEXT: movq %rdi, %rcx -; X64-AVX512-NEXT: shrq $32, %rcx -; X64-AVX512-NEXT: movzbl %cl, %ecx -; X64-AVX512-NEXT: kmovd %ecx, %k1 -; X64-AVX512-NEXT: movq %rdi, %rcx -; X64-AVX512-NEXT: shrq $40, %rcx -; X64-AVX512-NEXT: movzbl %cl, %ecx -; X64-AVX512-NEXT: kmovd %ecx, %k2 +; X64-AVX512-NEXT: kmovq %rdi, %k0 +; X64-AVX512-NEXT: kshiftrq $32, %k0, %k1 +; X64-AVX512-NEXT: kshiftrq $40, %k0, %k2 ; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1 -; X64-AVX512-NEXT: movq %rdi, %rcx -; X64-AVX512-NEXT: shrq $56, %rcx -; X64-AVX512-NEXT: kmovd %ecx, %k2 -; X64-AVX512-NEXT: shrq $48, %rax -; X64-AVX512-NEXT: movzbl %al, %eax -; X64-AVX512-NEXT: kmovd %eax, %k3 -; X64-AVX512-NEXT: kunpckbw %k3, %k2, %k2 +; X64-AVX512-NEXT: kshiftrq $48, %k0, %k2 +; X64-AVX512-NEXT: kshiftrq $56, %k0, %k3 +; X64-AVX512-NEXT: kunpckbw %k2, %k3, %k2 ; X64-AVX512-NEXT: kunpckwd %k1, %k2, %k1 +; X64-AVX512-NEXT: kshiftrd $8, %k0, %k2 +; X64-AVX512-NEXT: kunpckbw %k0, %k2, %k2 +; X64-AVX512-NEXT: kshiftrd $16, %k0, %k3 +; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0 +; X64-AVX512-NEXT: kunpckbw %k3, %k0, %k0 +; X64-AVX512-NEXT: kunpckwd %k2, %k0, %k0 ; X64-AVX512-NEXT: kunpckdq %k0, %k1, %k0 ; X64-AVX512-NEXT: vpmovm2b %k0, %zmm0 ; X64-AVX512-NEXT: retq ; ; X64-KNL-LABEL: i64_mask_extract_64: ; X64-KNL: # %bb.0: -; X64-KNL-NEXT: pushq %rbx -; X64-KNL-NEXT: .cfi_def_cfa_offset 16 -; X64-KNL-NEXT: .cfi_offset %rbx, -16 -; X64-KNL-NEXT: movq %rsi, %rcx ; X64-KNL-NEXT: movq %rdi, %rax -; X64-KNL-NEXT: movl %ecx, %edx -; X64-KNL-NEXT: movq %rsi, %rdi -; X64-KNL-NEXT: movq %rsi, %r8 -; X64-KNL-NEXT: movq %rsi, %r9 -; X64-KNL-NEXT: kmovw %ecx, %k0 -; X64-KNL-NEXT: movzbl %ch, %ebx -; X64-KNL-NEXT: # kill: def $ecx killed $ecx killed $rcx -; X64-KNL-NEXT: shrl $24, %ecx +; X64-KNL-NEXT: kmovw %esi, %k0 +; X64-KNL-NEXT: movl %esi, %ecx +; X64-KNL-NEXT: shrl $8, %ecx +; X64-KNL-NEXT: kmovw %ecx, %k1 +; X64-KNL-NEXT: kunpckbw %k0, %k1, %k0 +; X64-KNL-NEXT: movl %esi, %ecx +; X64-KNL-NEXT: shrl $16, %ecx ; X64-KNL-NEXT: kmovw %ecx, %k1 -; X64-KNL-NEXT: shrl $16, %edx -; X64-KNL-NEXT: movzbl %dl, %ecx +; X64-KNL-NEXT: movl %esi, %ecx +; X64-KNL-NEXT: shrl $24, %ecx ; X64-KNL-NEXT: kmovw %ecx, %k2 -; X64-KNL-NEXT: shrq $32, %rsi -; X64-KNL-NEXT: movzbl %sil, %ecx +; X64-KNL-NEXT: kunpckbw %k1, %k2, %k1 +; X64-KNL-NEXT: movq %rsi, %rcx +; X64-KNL-NEXT: shrq $32, %rcx +; X64-KNL-NEXT: kmovw %ecx, %k2 +; X64-KNL-NEXT: movq %rsi, %rcx +; X64-KNL-NEXT: shrq $40, %rcx +; X64-KNL-NEXT: kmovw %ecx, %k3 +; X64-KNL-NEXT: kunpckbw %k2, %k3, %k2 +; X64-KNL-NEXT: movq %rsi, %rcx +; X64-KNL-NEXT: shrq $48, %rcx ; 
X64-KNL-NEXT: kmovw %ecx, %k3 -; X64-KNL-NEXT: shrq $40, %rdi -; X64-KNL-NEXT: movzbl %dil, %ecx -; X64-KNL-NEXT: kmovw %ecx, %k4 -; X64-KNL-NEXT: kunpckbw %k2, %k1, %k1 -; X64-KNL-NEXT: shrq $56, %r8 -; X64-KNL-NEXT: kmovw %r8d, %k2 +; X64-KNL-NEXT: shrq $56, %rsi +; X64-KNL-NEXT: kmovw %esi, %k4 ; X64-KNL-NEXT: kunpckbw %k3, %k4, %k3 -; X64-KNL-NEXT: shrq $48, %r9 -; X64-KNL-NEXT: movzbl %r9b, %ecx -; X64-KNL-NEXT: kmovw %ecx, %k4 -; X64-KNL-NEXT: kunpckbw %k4, %k2, %k2 -; X64-KNL-NEXT: kmovw %ebx, %k4 -; X64-KNL-NEXT: kunpckbw %k0, %k4, %k0 -; X64-KNL-NEXT: kmovw %k0, (%rax) -; X64-KNL-NEXT: kmovw %k2, 6(%rax) -; X64-KNL-NEXT: kmovw %k3, 4(%rax) -; X64-KNL-NEXT: kmovw %k1, 2(%rax) -; X64-KNL-NEXT: popq %rbx -; X64-KNL-NEXT: .cfi_def_cfa_offset 8 +; X64-KNL-NEXT: kmovw %k3, 6(%rdi) +; X64-KNL-NEXT: kmovw %k2, 4(%rdi) +; X64-KNL-NEXT: kmovw %k1, 2(%rdi) +; X64-KNL-NEXT: kmovw %k0, (%rdi) ; X64-KNL-NEXT: retq %.splatinsert = insertelement <64 x i64> poison, i64 %mask, i64 0 %.splat = shufflevector <64 x i64> %.splatinsert, <64 x i64> poison, <64 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll index f0f430abc48dc..060bd1764d3c4 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -417,7 +417,6 @@ define half @test_v2f16(<2 x half> %a0) nounwind { ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm3 ; AVX512F-NEXT: vucomiss %xmm3, %xmm2 ; AVX512F-NEXT: seta %al -; AVX512F-NEXT: negb %al ; AVX512F-NEXT: kmovd %eax, %k1 ; AVX512F-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 From cb0d1305d12ea6637a541028eb0a4438750164b9 Mon Sep 17 00:00:00 2001 From: Simi Pallipurath Date: Thu, 3 Apr 2025 11:16:05 +0100 Subject: [PATCH 0509/1029] [Clang][ARM] Ensure both -mno-unaligned-access and -munaligned-access are passed to multilib selection logic (#134099) Previously, alignment option was passed to multilib selection logic only when -mno-unaligned-access was explicitly specified on the command line. Now this change ensure both -mno-unaligned-access and -munaligned-access are passed to the multilib selection logic, which now also considers the target architecture when determining alignment access policy. 
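For illustration, the behaviour now mirrors the updated driver tests (a
console sketch; output trimmed to the relevant flag, and the full flag list
depends on the configuration):

```
$ clang --target=aarch64-none-elf -print-multi-flags-experimental
...
-munaligned-access
$ clang --target=arm-none-eabi -print-multi-flags-experimental
...
-mno-unaligned-access
```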
--- clang/lib/Driver/ToolChain.cpp | 22 +++++++------------ .../test/Driver/print-multi-selection-flags.c | 10 +++++++++ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 8a922b283daf5..ad73814b3efba 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -239,13 +239,10 @@ static void getAArch64MultilibFlags(const Driver &D, Result.push_back(BranchProtectionArg->getAsString(Args)); } - if (Arg *AlignArg = Args.getLastArg( - options::OPT_mstrict_align, options::OPT_mno_strict_align, - options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) { - if (AlignArg->getOption().matches(options::OPT_mstrict_align) || - AlignArg->getOption().matches(options::OPT_mno_unaligned_access)) - Result.push_back(AlignArg->getAsString(Args)); - } + if (FeatureSet.contains("+strict-align")) + Result.push_back("-mno-unaligned-access"); + else + Result.push_back("-munaligned-access"); if (Arg *Endian = Args.getLastArg(options::OPT_mbig_endian, options::OPT_mlittle_endian)) { @@ -313,13 +310,10 @@ static void getARMMultilibFlags(const Driver &D, Result.push_back(BranchProtectionArg->getAsString(Args)); } - if (Arg *AlignArg = Args.getLastArg( - options::OPT_mstrict_align, options::OPT_mno_strict_align, - options::OPT_mno_unaligned_access, options::OPT_munaligned_access)) { - if (AlignArg->getOption().matches(options::OPT_mstrict_align) || - AlignArg->getOption().matches(options::OPT_mno_unaligned_access)) - Result.push_back(AlignArg->getAsString(Args)); - } + if (FeatureSet.contains("+strict-align")) + Result.push_back("-mno-unaligned-access"); + else + Result.push_back("-munaligned-access"); if (Arg *Endian = Args.getLastArg(options::OPT_mbig_endian, options::OPT_mlittle_endian)) { diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c index 5a35ae374f011..5f9383fbed8f4 100644 --- a/clang/test/Driver/print-multi-selection-flags.c +++ b/clang/test/Driver/print-multi-selection-flags.c @@ -69,9 +69,19 @@ // CHECK-BRANCH-PROTECTION: -mbranch-protection=standard // RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mno-unaligned-access | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mstrict-align | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s // RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mno-unaligned-access | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mstrict-align | FileCheck --check-prefix=CHECK-NO-UNALIGNED-ACCESS %s // CHECK-NO-UNALIGNED-ACCESS: -mno-unaligned-access +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mno-strict-align | FileCheck --check-prefix=CHECK-UNALIGNED-ACCESS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -munaligned-access | FileCheck --check-prefix=CHECK-UNALIGNED-ACCESS %s +// RUN: %clang 
-multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf | FileCheck --check-prefix=CHECK-UNALIGNED-ACCESS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mno-strict-align | FileCheck --check-prefix=CHECK-UNALIGNED-ACCESS %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -munaligned-access | FileCheck --check-prefix=CHECK-UNALIGNED-ACCESS %s +// CHECK-UNALIGNED-ACCESS: -munaligned-access + // RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -mbig-endian | FileCheck --check-prefix=CHECK-BIG-ENDIAN %s // RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-elf -mbig-endian | FileCheck --check-prefix=CHECK-BIG-ENDIAN %s // CHECK-BIG-ENDIAN: -mbig-endian From ee4e8197fa67dd1ed6e9470e00708e7feeaacd97 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 3 Apr 2025 11:42:07 +0100 Subject: [PATCH 0510/1029] [LLVM][AArch64][SVE] Mark DUP immediate instructions with isAsCheapAsAMove. (#133945) Doing this means we'll regenerate an immediate rather than copy the result of an existing one, reducing instruction dependency chains. --- llvm/lib/Target/AArch64/SVEInstrFormats.td | 3 + ...interleaving-add-mull-scalable-contract.ll | 66 +++++++++---------- ...x-deinterleaving-add-mull-scalable-fast.ll | 36 +++++----- ...complex-deinterleaving-f16-mul-scalable.ll | 32 ++++----- ...complex-deinterleaving-f32-mul-scalable.ll | 32 ++++----- ...complex-deinterleaving-f64-mul-scalable.ll | 32 ++++----- ...complex-deinterleaving-i16-mul-scalable.ll | 32 ++++----- ...complex-deinterleaving-i32-mul-scalable.ll | 32 ++++----- ...complex-deinterleaving-i64-mul-scalable.ll | 56 ++++++++-------- .../complex-deinterleaving-splat-scalable.ll | 59 ++++++++--------- .../AArch64/sme-intrinsics-mova-extract.ll | 17 +++-- .../AArch64/sve-fixed-length-shuffles.ll | 32 ++++----- llvm/test/CodeGen/AArch64/sve-pr92779.ll | 18 ++--- llvm/test/CodeGen/AArch64/sve-split-fcvt.ll | 8 +-- ...ing-mode-fixed-length-insert-vector-elt.ll | 1 - .../sve-streaming-mode-test-register-mov.ll | 2 - 16 files changed, 228 insertions(+), 230 deletions(-) diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 772c440685072..c56713783289e 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2113,6 +2113,7 @@ class sve_int_dup_mask_imm let DecoderMethod = "DecodeSVELogicalImmInstruction"; let hasSideEffects = 0; + let isAsCheapAsAMove = 1; let isReMaterializable = 1; let Uses = [VG]; } @@ -5118,6 +5119,7 @@ class sve_int_dup_imm sz8_64, string asm, let Inst{4-0} = Zd; let hasSideEffects = 0; + let isAsCheapAsAMove = 1; let isReMaterializable = 1; let Uses = [VG]; } @@ -5161,6 +5163,7 @@ class sve_int_dup_fpimm sz8_64, Operand fpimmtype, let Inst{4-0} = Zd; let hasSideEffects = 0; + let isAsCheapAsAMove = 1; let isReMaterializable = 1; let Uses = [VG]; } diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll index 0485d530fd060..98f5b4c19a9b9 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll +++ 
b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -51,20 +51,20 @@ define @mul_add_mull( %a, , } @llvm.vector.deinterleave2.nxv4f64( %a) @@ -102,20 +102,20 @@ define @mul_sub_mull( %a, , } @llvm.vector.deinterleave2.nxv4f64( %a) @@ -153,20 +153,20 @@ define @mul_conj_mull( %a, , } @llvm.vector.deinterleave2.nxv4f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll index c643ae9265c0b..2fc91125bc0ac 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll @@ -42,18 +42,18 @@ define @mul_add_mull( %a, , } @llvm.vector.deinterleave2.nxv4f64( %a) @@ -91,18 +91,18 @@ define @mul_sub_mull( %a, , } @llvm.vector.deinterleave2.nxv4f64( %a) @@ -140,18 +140,18 @@ define @mul_conj_mull( %a, , } @llvm.vector.deinterleave2.nxv4f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll index b42d484ea74c9..80934d2cb98c2 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll @@ -73,14 +73,14 @@ define @complex_mul_v16f16( %a, , } @llvm.vector.deinterleave2.nxv16f16( %a) @@ -104,22 +104,22 @@ define @complex_mul_v32f16( %a, , } @llvm.vector.deinterleave2.nxv32f16( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll index bcd46aa182b55..874b5b538f1fd 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll @@ -35,14 +35,14 @@ define @complex_mul_v8f32( %a, , } @llvm.vector.deinterleave2.nxv8f32( %a) @@ -66,22 +66,22 @@ define @complex_mul_v16f32( %a, , } @llvm.vector.deinterleave2.nxv16f32( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll index db28fa3997cb3..c9a092f52f159 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll @@ -35,14 +35,14 @@ define @complex_mul_v4f64( %a, , } @llvm.vector.deinterleave2.nxv4f64( %a) @@ -66,22 +66,22 @@ define @complex_mul_v8f64( %a, , } @llvm.vector.deinterleave2.nxv8f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll index b4cb548f63088..58a0809ee093f 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -72,13 +72,13 @@ define @complex_mul_v16i16( %a, , } @llvm.vector.deinterleave2.nxv16i16( %a) @@ -102,21 +102,21 @@ define @complex_mul_v32i16( %a, , } @llvm.vector.deinterleave2.nxv32i16( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll index 4cfe4707b9a96..0958c60ed7cb0 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll @@ -34,13 +34,13 @@ define @complex_mul_v8i32( %a, , } 
@llvm.vector.deinterleave2.nxv8i32( %a) @@ -64,21 +64,21 @@ define @complex_mul_v16i32( %a, , } @llvm.vector.deinterleave2.nxv16i32( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll index 5975f3b491d48..30c06838c81bc 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll @@ -34,13 +34,13 @@ define @complex_mul_v4i64( %a, , } @llvm.vector.deinterleave2.nxv4i64( %a) @@ -64,21 +64,21 @@ define @complex_mul_v8i64( %a, , } @llvm.vector.deinterleave2.nxv8i64( %a) @@ -102,21 +102,21 @@ define @complex_minus_mul_v8i64( %a, , } @llvm.vector.deinterleave2.nxv8i64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll index b4425c0c01e17..407da6cd6002b 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll @@ -8,24 +8,24 @@ target triple = "aarch64" define @complex_mul_const( %a, %b) { ; CHECK-LABEL: complex_mul_const: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z5.d, #0 // =0x0 ; CHECK-NEXT: mov z4.d, #0 // =0x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmov z7.d, #3.00000000 -; CHECK-NEXT: fmov z24.d, #11.00000000 -; CHECK-NEXT: mov z6.d, z4.d -; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z6.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z5.d, p0/m, z0.d, z2.d, #0 -; CHECK-NEXT: fcmla z6.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: zip2 z1.d, z24.d, z7.d -; CHECK-NEXT: fcmla z5.d, p0/m, z0.d, z2.d, #90 -; CHECK-NEXT: zip1 z2.d, z24.d, z7.d -; CHECK-NEXT: mov z0.d, z4.d -; CHECK-NEXT: fcmla z4.d, p0/m, z6.d, z1.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z2.d, #0 -; CHECK-NEXT: fcmla z4.d, p0/m, z6.d, z1.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z2.d, #90 -; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: fmov z6.d, #3.00000000 +; CHECK-NEXT: fmov z7.d, #11.00000000 +; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: zip2 z1.d, z7.d, z6.d +; CHECK-NEXT: zip1 z3.d, z7.d, z6.d +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: fcmla z2.d, p0/m, z5.d, z1.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z3.d, #0 +; CHECK-NEXT: fcmla z2.d, p0/m, z5.d, z1.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z3.d, #90 +; CHECK-NEXT: mov z1.d, z2.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.vector.deinterleave2.nxv4f64( %a) @@ -56,25 +56,24 @@ define @complex_mul_non_const( %a, @extract_col_q_v2f64( %zd, @test_sink_offset_operand( %pg, i32 %base, i32 %N) { ; CHECK-LABEL: test_sink_offset_operand: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: mov w12, w0 ; CHECK-NEXT: .LBB26_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: mov z1.s, #0 // =0x0 ; CHECK-NEXT: subs w1, w1, #3 -; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0] -; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 1] -; CHECK-NEXT: mov z3.s, p0/m, za0h.s[w12, 2] +; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 0] +; CHECK-NEXT: mov z1.s, p0/m, 
za0h.s[w12, 1] +; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 2] ; CHECK-NEXT: b.ne .LBB26_1 ; CHECK-NEXT: // %bb.2: // %exit -; CHECK-NEXT: add z0.s, z1.s, z2.s -; CHECK-NEXT: add z0.s, z0.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: %add1 = add i32 %base, 1 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index 3fa7eca02c351..0c47e7e14183a 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -29,58 +29,58 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: tbnz w1, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %vector.body ; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: mov z1.b, #0 // =0x0 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: umov w8, v0.b[8] -; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.b[1], v0.b[1] ; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: mov v2.b[1], v0.b[9] ; CHECK-NEXT: mov v1.b[2], v0.b[2] -; CHECK-NEXT: mov v2.b[2], v0.b[10] +; CHECK-NEXT: mov v2.b[1], v0.b[9] ; CHECK-NEXT: mov v1.b[3], v0.b[3] -; CHECK-NEXT: mov v2.b[3], v0.b[11] +; CHECK-NEXT: mov v2.b[2], v0.b[10] ; CHECK-NEXT: mov v1.b[4], v0.b[4] -; CHECK-NEXT: mov v2.b[4], v0.b[12] +; CHECK-NEXT: mov v2.b[3], v0.b[11] ; CHECK-NEXT: mov v1.b[5], v0.b[5] -; CHECK-NEXT: mov v2.b[5], v0.b[13] +; CHECK-NEXT: mov v2.b[4], v0.b[12] ; CHECK-NEXT: mov v1.b[6], v0.b[6] -; CHECK-NEXT: mov v2.b[6], v0.b[14] +; CHECK-NEXT: mov v2.b[5], v0.b[13] ; CHECK-NEXT: mov v1.b[7], v0.b[7] +; CHECK-NEXT: mov v2.b[6], v0.b[14] +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: mov v2.b[7], v0.b[15] ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 -; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: lsl z1.s, z1.s, #31 +; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: lsl z0.s, z0.s, #31 -; CHECK-NEXT: asr z1.s, z1.s, #31 +; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: lsl z2.s, z2.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 -; CHECK-NEXT: and z1.s, z1.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: lsl z3.s, z3.s, #31 ; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: and z0.s, z0.s, #0x1 -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: asr z3.s, z3.s, #31 ; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 ; CHECK-NEXT: cmpne p2.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, #2, mul vl] ; CHECK-NEXT: and z3.s, z3.s, #0x1 ; CHECK-NEXT: cmpne p4.s, p0/z, z2.s, #0 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: mov z1.s, p1/m, #0 // =0x0 +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: cmpne p3.s, p0/z, z3.s, #0 ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl] ; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z2.s, p4/m, #0 // =0x0 -; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x0, #2, mul vl] ; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 ; CHECK-NEXT: st1w { z2.s }, p0, [x0, #1, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll 
b/llvm/test/CodeGen/AArch64/sve-pr92779.ll index 480f41eb0f81b..1bb7801b6a047 100644 --- a/llvm/test/CodeGen/AArch64/sve-pr92779.ll +++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll @@ -5,16 +5,16 @@ define void @main(ptr %0) { ; CHECK-LABEL: main: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: mov z1.d, #0 // =0x0 ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 -; CHECK-NEXT: uzp1 v1.2s, v0.2s, v1.2s -; CHECK-NEXT: neg v1.2s, v1.2s -; CHECK-NEXT: smov x8, v1.s[0] -; CHECK-NEXT: smov x9, v1.s[1] -; CHECK-NEXT: mov z0.d, p0/m, x8 -; CHECK-NEXT: mov z0.d, p0/m, x9 -; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uzp1 v0.2s, v1.2s, v0.2s +; CHECK-NEXT: neg v0.2s, v0.2s +; CHECK-NEXT: smov x8, v0.s[0] +; CHECK-NEXT: smov x9, v0.s[1] +; CHECK-NEXT: mov z1.d, p0/m, x8 +; CHECK-NEXT: mov z1.d, p0/m, x9 +; CHECK-NEXT: str z1, [x0] ; CHECK-NEXT: ret "entry": %1 = bitcast zeroinitializer to diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll index 5c84551432909..2fe09f8ac7c5d 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -331,12 +331,12 @@ define @scvtf_d_nxv4i32( %a) { define @scvtf_d_nxv4i1( %a) { ; CHECK-LABEL: scvtf_d_nxv4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: mov z1.d, #0 // =0x0 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: fmov z1.d, p0/m, #-1.00000000 ; CHECK-NEXT: fmov z0.d, p1/m, #-1.00000000 +; CHECK-NEXT: fmov z1.d, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -392,12 +392,12 @@ define @ucvtf_d_nxv4i32( %a) { define @ucvtf_d_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_d_nxv4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: mov z1.d, #0 // =0x0 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: fmov z1.d, p0/m, #1.00000000 ; CHECK-NEXT: fmov z0.d, p1/m, #1.00000000 +; CHECK-NEXT: fmov z1.d, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to ret %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index 275d13ebfd949..ad00e99b704dd 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -419,7 +419,6 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { ; CHECK-LABEL: insertelement_v1i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, #5 // =0x5 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: insertelement_v1i64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll index 9c7a3d5046d0e..37435e35ceabf 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll @@ -39,7 +39,6 @@ define <2 x i64> @fixed_vec_zero_constant() { ; CHECK-LABEL: fixed_vec_zero_constant: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fixed_vec_zero_constant: @@ -54,7 +53,6 @@ define <2 x double> 
@fixed_vec_fp_zero_constant() { ; CHECK-LABEL: fixed_vec_fp_zero_constant: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant: From 7febd78f1e6caacb05ea43fa838a3b2b21fdd0bd Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 3 Apr 2025 07:13:30 -0400 Subject: [PATCH 0511/1029] No longer diagnose __auto_type as the auto extension (#134129) Given: __auto_type x = 12; decltype(auto) y = 12; -Wc++98-compat would diagnose both x and y with: 'auto' type specifier is incompatible with C++98 This patch silences the diagnostic in those cases. decltype(auto) is still diagnosed with: 'decltype(auto)' type specifier is incompatible with C++ standards before C++14 as expected but no longer produces the extraneous diagnostic about use of 'auto'. Fixes #47900 --- clang/docs/ReleaseNotes.rst | 3 +++ clang/lib/Sema/SemaType.cpp | 19 ++++++++++++------- clang/test/SemaCXX/cxx98-compat.cpp | 10 ++++++++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5e8df45e71d54..47f9c3caa0e47 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -318,6 +318,9 @@ Improvements to Clang's diagnostics - Split diagnosing base class qualifiers from the ``-Wignored-Qualifiers`` diagnostic group into a new ``-Wignored-base-class-qualifiers`` diagnostic group (which is grouped under ``-Wignored-qualifiers``). Fixes #GH131935. +- ``-Wc++98-compat`` no longer diagnoses use of ``__auto_type`` or + ``decltype(auto)`` as though it was the extension for ``auto``. (#GH47900) + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 2df961a48c7c3..20240fdaf2bbb 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -3376,13 +3376,18 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state, } else if (Auto && D.getContext() != DeclaratorContext::LambdaExpr) { // If there was a trailing return type, we already got // warn_cxx98_compat_trailing_return_type in the parser. - SemaRef.Diag(AutoRange.getBegin(), - D.getContext() == DeclaratorContext::LambdaExprParameter - ? diag::warn_cxx11_compat_generic_lambda - : IsDeducedReturnType - ? diag::warn_cxx11_compat_deduced_return_type - : diag::warn_cxx98_compat_auto_type_specifier) - << AutoRange; + // If there was a decltype(auto), we already got + // warn_cxx11_compat_decltype_auto_type_specifier. 
+ unsigned DiagId = 0; + if (D.getContext() == DeclaratorContext::LambdaExprParameter) + DiagId = diag::warn_cxx11_compat_generic_lambda; + else if (IsDeducedReturnType) + DiagId = diag::warn_cxx11_compat_deduced_return_type; + else if (Auto->getKeyword() == AutoTypeKeyword::Auto) + DiagId = diag::warn_cxx98_compat_auto_type_specifier; + + if (DiagId) + SemaRef.Diag(AutoRange.getBegin(), DiagId) << AutoRange; } } diff --git a/clang/test/SemaCXX/cxx98-compat.cpp b/clang/test/SemaCXX/cxx98-compat.cpp index 43ba208d375cd..8e7acf73923e5 100644 --- a/clang/test/SemaCXX/cxx98-compat.cpp +++ b/clang/test/SemaCXX/cxx98-compat.cpp @@ -14,6 +14,16 @@ namespace std { }; } +void test_other_auto_spellings() { + __auto_type x = 0; // Ok + decltype(auto) y = 0; // expected-warning {{'decltype' type specifier is incompatible with C++98}} +#ifndef CXX14COMPAT + // expected-warning@-2 {{'decltype(auto)' type specifier is a C++14 extension}} +#else + // expected-warning@-4 {{'decltype(auto)' type specifier is incompatible with C++ standards before C++14}} +#endif +} + template // expected-warning {{variadic templates are incompatible with C++98}} class Variadic1 {}; From 739fe980802e17e49ab9cc2e4c18a48c88e15ef5 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 3 Apr 2025 12:21:41 +0100 Subject: [PATCH 0512/1029] [lldb][test] TestExprFromNonZeroFrame.py: fix windows build On Windows this test was failing to link with following error: ``` make: Entering directory 'C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.test' C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe -gdwarf -O0 -IC:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\packages\Python\lldbsuite\test\make/../../../../..//include -IC:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb/include -IC:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\API\commands\expression\expr-from-non-zero-frame -IC:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\packages\Python\lldbsuite\test\make -include C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\packages\Python\lldbsuite\test\make/test_common.h -fno-limit-debug-info -MT main.o -MD -MP -MF main.d -c -o main.o C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\API\commands\expression\expr-from-non-zero-frame/main.c C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\build\bin\clang.exe main.o -gdwarf -O0 -IC:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\packages\Python\lldbsuite\test\make/../../../../..//include -IC:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/tools/lldb/include -IC:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\test\API\commands\expression\expr-from-non-zero-frame -IC:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\packages\Python\lldbsuite\test\make -include C:\Users\tcwg\llvm-worker\lldb-aarch64-windows\llvm-project\lldb\packages\Python\lldbsuite\test\make/test_common.h -fno-limit-debug-info -fuse-ld=lld --driver-mode=g++ -o "a.out" lld-link: error: undefined symbol: printf >>> referenced by main.o:(func) clang: error: linker command failed with exit code 1 (use -v to see invocation) make: *** [Makefile.rules:530: a.out] Error 1 make: Leaving directory 'C:/Users/tcwg/llvm-worker/lldb-aarch64-windows/build/lldb-test-build.noindex/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.test' ``` --- 
.../expr-from-non-zero-frame/TestExprFromNonZeroFrame.py | 2 +- .../API/commands/expression/expr-from-non-zero-frame/main.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py index 9b1bcfb177765..623c5b87f14c7 100644 --- a/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py +++ b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py @@ -16,7 +16,7 @@ def test(self): self.build() (_, _, thread, _) = lldbutil.run_to_source_breakpoint( - self, "Break here", lldb.SBFileSpec("main.c") + self, "return 5", lldb.SBFileSpec("main.c") ) frame = thread.GetFrameAtIndex(1) diff --git a/lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c b/lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c index d1675525cf4f2..abd52aeeb5b0b 100644 --- a/lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c +++ b/lldb/test/API/commands/expression/expr-from-non-zero-frame/main.c @@ -1,6 +1,3 @@ -int func(void) { - __builtin_printf("Break here"); - return 5; -} +int func(void) { return 5; } int main(int argc, const char *argv[]) { return func(); } From 722346c7bc70aa528beccead4119db83f134f5cd Mon Sep 17 00:00:00 2001 From: Ilya Biryukov Date: Thu, 3 Apr 2025 14:14:34 +0200 Subject: [PATCH 0513/1029] [Tooling] Handle AttributedType in getFullyQualifiedType (#134228) Before this change the code used to add extra qualifiers, e.g. `std::unique_ptr _Nonnull` became `::std::std::unique_ptr _Nonnull` when adding a global namespace qualifier was requested. --- clang/lib/AST/QualTypeNames.cpp | 13 +++++++++ clang/unittests/Tooling/QualTypeNamesTest.cpp | 27 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp index d8ab1092d3ea4..5c151254c36e7 100644 --- a/clang/lib/AST/QualTypeNames.cpp +++ b/clang/lib/AST/QualTypeNames.cpp @@ -10,6 +10,7 @@ #include "clang/AST/DeclTemplate.h" #include "clang/AST/DeclarationName.h" #include "clang/AST/Mangle.h" +#include "clang/AST/Type.h" namespace clang { @@ -416,6 +417,18 @@ QualType getFullyQualifiedType(QualType QT, const ASTContext &Ctx, return QT; } + // Handle types with attributes such as `unique_ptr _Nonnull`. 
+  if (auto *AT = dyn_cast<AttributedType>(QT.getTypePtr())) {
+    QualType NewModified =
+        getFullyQualifiedType(AT->getModifiedType(), Ctx, WithGlobalNsPrefix);
+    QualType NewEquivalent =
+        getFullyQualifiedType(AT->getEquivalentType(), Ctx, WithGlobalNsPrefix);
+    Qualifiers Qualifiers = QT.getLocalQualifiers();
+    return Ctx.getQualifiedType(
+        Ctx.getAttributedType(AT->getAttrKind(), NewModified, NewEquivalent),
+        Qualifiers);
+  }
+
   // Remove the part of the type related to the type being a template
   // parameter (we won't report it as part of the 'type name' and it
   // is actually make the code below to be more complex (to handle
diff --git a/clang/unittests/Tooling/QualTypeNamesTest.cpp b/clang/unittests/Tooling/QualTypeNamesTest.cpp
index 5ded64d4fcc8c..dc81f0188b4fc 100644
--- a/clang/unittests/Tooling/QualTypeNamesTest.cpp
+++ b/clang/unittests/Tooling/QualTypeNamesTest.cpp
@@ -297,4 +297,31 @@ TEST(QualTypeNameTest, ConstUsing) {
                      using ::A::S;
                      void foo(const S& param1, const S param2);)");
 }
+
+TEST(QualTypeNameTest, NullableAttributesWithGlobalNs) {
+  TypeNameVisitor Visitor;
+  Visitor.WithGlobalNsPrefix = true;
+  Visitor.ExpectedQualTypeNames["param1"] = "::std::unique_ptr<int> _Nullable";
+  Visitor.ExpectedQualTypeNames["param2"] = "::std::unique_ptr<int> _Nonnull";
+  Visitor.ExpectedQualTypeNames["param3"] =
+      "::std::unique_ptr< ::std::unique_ptr<int> _Nullable> _Nonnull";
+  Visitor.ExpectedQualTypeNames["param4"] =
+      "::std::unique_ptr<int> _Nullable const *";
+  Visitor.ExpectedQualTypeNames["param5"] =
+      "::std::unique_ptr<int> _Nullable const *";
+  Visitor.ExpectedQualTypeNames["param6"] =
+      "::std::unique_ptr<int> _Nullable const *";
+  Visitor.runOver(R"(namespace std {
+                       template <class T> class unique_ptr {};
+                     }
+                     void foo(
+                       std::unique_ptr<int> _Nullable param1,
+                       _Nonnull std::unique_ptr<int> param2,
+                       std::unique_ptr<std::unique_ptr<int> _Nullable> _Nonnull param3,
+                       const std::unique_ptr<int> _Nullable *param4,
+                       _Nullable std::unique_ptr<int> const *param5,
+                       std::unique_ptr<int> _Nullable const *param6
+                     );
+                   )");
+}
 } // end anonymous namespace
From 41a6bb4c055cf08110676d9bc942f369fb19450d Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Thu, 3 Apr 2025 11:42:07 +0100
Subject: [PATCH 0514/1029] [LLVM][CodeGen][SVE] Prefer NEON instructions when
 zeroing Z registers. (#133929)

Several implementations have zero-latency instructions to zero
registers. To-date no implementation has a dedicated SVE instruction
but we can use the NEON equivalent because it is defined to zero bits
128..VL regardless of the immediate used.

NOTE: The relevant instruction is not available in streaming mode,
where the original SVE DUP instruction remains in use.
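For illustration, the effect on a trivial zeroing function (a sketch in the
style of the updated tests, not a test taken from this patch; register
allocation may differ):

```llvm
define <vscale x 2 x i64> @zero() {
; Before: mov  z0.d, #0 // =0x0
; After:  movi v0.2d, #0000000000000000
  ret <vscale x 2 x i64> zeroinitializer
}
```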
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 19 + ...interleaving-add-mull-scalable-contract.ll | 26 +- ...x-deinterleaving-add-mull-scalable-fast.ll | 12 +- ...complex-deinterleaving-f16-mul-scalable.ll | 14 +- ...complex-deinterleaving-f32-mul-scalable.ll | 14 +- ...complex-deinterleaving-f64-mul-scalable.ll | 14 +- ...complex-deinterleaving-i16-mul-scalable.ll | 14 +- ...complex-deinterleaving-i32-mul-scalable.ll | 14 +- ...complex-deinterleaving-i64-mul-scalable.ll | 22 +- ...rleaving-reductions-predicated-scalable.ll | 6 +- ...plex-deinterleaving-reductions-scalable.ll | 18 +- .../complex-deinterleaving-splat-scalable.ll | 37 +- .../AArch64/dag-combine-concat-vectors.ll | 47 +- llvm/test/CodeGen/AArch64/load-insert-zero.ll | 24 +- llvm/test/CodeGen/AArch64/sinksplat.ll | 2 +- .../CodeGen/AArch64/sve-bf16-int-converts.ll | 100 +- llvm/test/CodeGen/AArch64/sve-fcmp.ll | 2 +- llvm/test/CodeGen/AArch64/sve-fcvt.ll | 32 +- .../AArch64/sve-fixed-length-shuffles.ll | 4 +- llvm/test/CodeGen/AArch64/sve-fp-combine.ll | 12 +- .../AArch64/sve-implicit-zero-filling.ll | 6 +- llvm/test/CodeGen/AArch64/sve-int-log.ll | 2 +- llvm/test/CodeGen/AArch64/sve-int-reduce.ll | 22 +- .../AArch64/sve-intrinsics-int-arith-imm.ll | 16 +- .../sve-intrinsics-int-arith-merging.ll | 4 +- .../AArch64/sve-intrinsics-scalar-to-vec.ll | 2 +- .../AArch64/sve-intrinsics-shifts-merging.ll | 18 +- llvm/test/CodeGen/AArch64/sve-knownbits.ll | 2 +- llvm/test/CodeGen/AArch64/sve-ld1r.ll | 12 +- .../CodeGen/AArch64/sve-masked-scatter.ll | 2 +- .../AArch64/sve-partial-reduce-dot-product.ll | 28 +- llvm/test/CodeGen/AArch64/sve-pr92779.ll | 4 +- llvm/test/CodeGen/AArch64/sve-split-fcvt.ll | 8 +- llvm/test/CodeGen/AArch64/sve-vector-splat.ll | 22 +- llvm/test/CodeGen/AArch64/sve-vselect-imm.ll | 36 +- llvm/test/CodeGen/AArch64/sve-zeroinit.ll | 130 ++- .../AArch64/sve2p1-intrinsics-bfadd.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfmax.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfmaxnm.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfmin.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfminnm.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfmla.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfmls.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfmul.ll | 2 +- .../AArch64/sve2p1-intrinsics-bfsub.ll | 2 +- .../CodeGen/AArch64/zeroing-forms-abs-neg.ll | 228 +++-- .../AArch64/zeroing-forms-counts-not.ll | 388 +++++--- .../test/CodeGen/AArch64/zeroing-forms-ext.ll | 196 ++-- .../AArch64/zeroing-forms-fcvt-bfcvt.ll | 116 ++- .../AArch64/zeroing-forms-fcvtlt-fcvtx.ll | 52 +- .../CodeGen/AArch64/zeroing-forms-fcvtzsu.ll | 228 +++-- .../CodeGen/AArch64/zeroing-forms-flogb.ll | 52 +- .../zeroing-forms-frint-frecpx-fsqrt.ll | 868 ++++++++++++------ .../test/CodeGen/AArch64/zeroing-forms-rev.ll | 292 ++++-- ...eroing-forms-urecpe-ursqrte-sqabs-sqneg.ll | 164 ++-- .../CodeGen/AArch64/zeroing-forms-uscvtf.ll | 228 +++-- 56 files changed, 2360 insertions(+), 1217 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f291589e04c6b..a3b1ae55df028 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7731,6 +7731,7 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; +let Predicates = [HasNEON] in { def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; @@ 
-7740,6 +7741,23 @@ def : Pat<(v4f32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8f16 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8bf16 immAllZerosV), (MOVIv2d_ns (i32 0))>; +// Prefer NEON instructions when zeroing ZPRs because they are potentially zero-latency. +let AddedComplexity = 5 in { +def : Pat<(nxv2i64 (splat_vector (i64 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv4i32 (splat_vector (i32 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv8i16 (splat_vector (i32 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv16i8 (splat_vector (i32 0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv2f64 (splat_vector (f64 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv2f32 (splat_vector (f32 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv4f32 (splat_vector (f32 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv2f16 (splat_vector (f16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv4f16 (splat_vector (f16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv8f16 (splat_vector (f16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv2bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv4bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +def : Pat<(nxv8bf16 (splat_vector (bf16 fpimm0))), (SUBREG_TO_REG (i32 0), (MOVIv2d_ns (i32 0)), zsub)>; +} + def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; @@ -7760,6 +7778,7 @@ def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>; def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>; def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>; def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>; +} // EDIT per word & halfword: 2s, 4h, 4s, & 8h let isReMaterializable = 1, isAsCheapAsAMove = 1 in diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll index 98f5b4c19a9b9..533e831de0df8 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -50,10 +50,10 @@ entry: define @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 -; CHECK-NEXT: mov z26.d, #0 // =0x0 -; CHECK-NEXT: mov z27.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #0 @@ -101,10 +101,10 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 -; CHECK-NEXT: mov z26.d, #0 // =0x0 -; CHECK-NEXT: mov z27.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: 
movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #0 @@ -152,10 +152,10 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 -; CHECK-NEXT: mov z26.d, #0 // =0x0 -; CHECK-NEXT: mov z27.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z24.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z3.d, z1.d, #0 @@ -204,7 +204,7 @@ define @mul_add_rot_mull( %a, @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 @@ -90,8 +90,8 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270 @@ -139,8 +139,8 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll index 80934d2cb98c2..a7442cae84c2d 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll @@ -46,7 +46,7 @@ entry: define @complex_mul_v8f16( %a, %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #0 ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #90 @@ -72,8 +72,8 @@ entry: define @complex_mul_v16f16( %a, %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z4.h, #0 // =0x0 -; CHECK-NEXT: mov z5.h, #0 // =0x0 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 ; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0 @@ -103,10 +103,10 @@ entry: define @complex_mul_v32f16( %a, %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.h, #0 // =0x0 -; CHECK-NEXT: mov z25.h, #0 // =0x0 -; CHECK-NEXT: mov z26.h, #0 // =0x0 -; CHECK-NEXT: mov z27.h, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi 
v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fcmla z24.h, p0/m, z4.h, z0.h, #0 ; CHECK-NEXT: fcmla z25.h, p0/m, z5.h, z1.h, #0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll index 874b5b538f1fd..3cad74b7f5fc6 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll @@ -7,7 +7,7 @@ target triple = "aarch64" define @complex_mul_v4f32( %a, %b) { ; CHECK-LABEL: complex_mul_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #0 ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #90 @@ -34,8 +34,8 @@ entry: define @complex_mul_v8f32( %a, %b) { ; CHECK-LABEL: complex_mul_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEXT: mov z5.s, #0 // =0x0 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 ; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0 @@ -65,10 +65,10 @@ entry: define @complex_mul_v16f32( %a, %b) { ; CHECK-LABEL: complex_mul_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.s, #0 // =0x0 -; CHECK-NEXT: mov z25.s, #0 // =0x0 -; CHECK-NEXT: mov z26.s, #0 // =0x0 -; CHECK-NEXT: mov z27.s, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmla z24.s, p0/m, z4.s, z0.s, #0 ; CHECK-NEXT: fcmla z25.s, p0/m, z5.s, z1.s, #0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll index c9a092f52f159..e3d99fa457bbc 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll @@ -7,7 +7,7 @@ target triple = "aarch64" define @complex_mul_v2f64( %a, %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #90 @@ -34,8 +34,8 @@ entry: define @complex_mul_v4f64( %a, %b) { ; CHECK-LABEL: complex_mul_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z4.d, #0 // =0x0 -; CHECK-NEXT: mov z5.d, #0 // =0x0 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #0 @@ -65,10 +65,10 @@ entry: define @complex_mul_v8f64( %a, %b) { ; CHECK-LABEL: complex_mul_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 -; CHECK-NEXT: mov z26.d, #0 // =0x0 -; CHECK-NEXT: mov z27.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z24.d, p0/m, z4.d, z0.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z5.d, z1.d, #0 diff --git 
a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll index 58a0809ee093f..061fd07489284 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -46,7 +46,7 @@ entry: define @complex_mul_v8i16( %a, %b) { ; CHECK-LABEL: complex_mul_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cmla z2.h, z1.h, z0.h, #0 ; CHECK-NEXT: cmla z2.h, z1.h, z0.h, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -71,8 +71,8 @@ entry: define @complex_mul_v16i16( %a, %b) { ; CHECK-LABEL: complex_mul_v16i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z4.h, #0 // =0x0 -; CHECK-NEXT: mov z5.h, #0 // =0x0 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 ; CHECK-NEXT: cmla z5.h, z2.h, z0.h, #0 ; CHECK-NEXT: cmla z4.h, z3.h, z1.h, #0 ; CHECK-NEXT: cmla z5.h, z2.h, z0.h, #90 @@ -101,10 +101,10 @@ entry: define @complex_mul_v32i16( %a, %b) { ; CHECK-LABEL: complex_mul_v32i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.h, #0 // =0x0 -; CHECK-NEXT: mov z25.h, #0 // =0x0 -; CHECK-NEXT: mov z26.h, #0 // =0x0 -; CHECK-NEXT: mov z27.h, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: cmla z24.h, z4.h, z0.h, #0 ; CHECK-NEXT: cmla z25.h, z5.h, z1.h, #0 ; CHECK-NEXT: cmla z27.h, z6.h, z2.h, #0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll index 0958c60ed7cb0..52caa3279b927 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i32-mul-scalable.ll @@ -7,7 +7,7 @@ target triple = "aarch64" define @complex_mul_v4i32( %a, %b) { ; CHECK-LABEL: complex_mul_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cmla z2.s, z1.s, z0.s, #0 ; CHECK-NEXT: cmla z2.s, z1.s, z0.s, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -33,8 +33,8 @@ entry: define @complex_mul_v8i32( %a, %b) { ; CHECK-LABEL: complex_mul_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z4.s, #0 // =0x0 -; CHECK-NEXT: mov z5.s, #0 // =0x0 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 ; CHECK-NEXT: cmla z5.s, z2.s, z0.s, #0 ; CHECK-NEXT: cmla z4.s, z3.s, z1.s, #0 ; CHECK-NEXT: cmla z5.s, z2.s, z0.s, #90 @@ -63,10 +63,10 @@ entry: define @complex_mul_v16i32( %a, %b) { ; CHECK-LABEL: complex_mul_v16i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.s, #0 // =0x0 -; CHECK-NEXT: mov z25.s, #0 // =0x0 -; CHECK-NEXT: mov z26.s, #0 // =0x0 -; CHECK-NEXT: mov z27.s, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: cmla z24.s, z4.s, z0.s, #0 ; CHECK-NEXT: cmla z25.s, z5.s, z1.s, #0 ; CHECK-NEXT: cmla z27.s, z6.s, z2.s, #0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll index 30c06838c81bc..bdc21e7828277 100644 --- 
a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i64-mul-scalable.ll @@ -7,7 +7,7 @@ target triple = "aarch64" define @complex_mul_v2i64( %a, %b) { ; CHECK-LABEL: complex_mul_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cmla z2.d, z1.d, z0.d, #0 ; CHECK-NEXT: cmla z2.d, z1.d, z0.d, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -33,8 +33,8 @@ entry: define @complex_mul_v4i64( %a, %b) { ; CHECK-LABEL: complex_mul_v4i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z4.d, #0 // =0x0 -; CHECK-NEXT: mov z5.d, #0 // =0x0 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 ; CHECK-NEXT: cmla z5.d, z2.d, z0.d, #0 ; CHECK-NEXT: cmla z4.d, z3.d, z1.d, #0 ; CHECK-NEXT: cmla z5.d, z2.d, z0.d, #90 @@ -63,10 +63,10 @@ entry: define @complex_mul_v8i64( %a, %b) { ; CHECK-LABEL: complex_mul_v8i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 -; CHECK-NEXT: mov z26.d, #0 // =0x0 -; CHECK-NEXT: mov z27.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: cmla z24.d, z4.d, z0.d, #0 ; CHECK-NEXT: cmla z25.d, z5.d, z1.d, #0 ; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #0 @@ -101,10 +101,10 @@ entry: define @complex_minus_mul_v8i64( %a, %b) { ; CHECK-LABEL: complex_minus_mul_v8i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z24.d, #0 // =0x0 -; CHECK-NEXT: mov z25.d, #0 // =0x0 -; CHECK-NEXT: mov z26.d, #0 // =0x0 -; CHECK-NEXT: mov z27.d, #0 // =0x0 +; CHECK-NEXT: movi v24.2d, #0000000000000000 +; CHECK-NEXT: movi v25.2d, #0000000000000000 +; CHECK-NEXT: movi v26.2d, #0000000000000000 +; CHECK-NEXT: movi v27.2d, #0000000000000000 ; CHECK-NEXT: cmla z24.d, z4.d, z0.d, #270 ; CHECK-NEXT: cmla z25.d, z5.d, z1.d, #270 ; CHECK-NEXT: cmla z27.d, z6.d, z2.d, #270 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index dcc11609ca231..880bd2904154c 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -14,7 +14,7 @@ target triple = "aarch64" define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #100 // =0x64 ; CHECK-NEXT: cntd x9 ; CHECK-NEXT: whilelo p1.d, xzr, x8 @@ -111,7 +111,7 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %cond) { ; CHECK-LABEL: complex_mul_predicated_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: cntd x9 ; CHECK-NEXT: mov w11, #100 // =0x64 ; CHECK-NEXT: neg x10, x9 @@ -213,7 +213,7 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) { ; CHECK-LABEL: complex_mul_predicated_x2_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #100 // =0x64 ; 
CHECK-NEXT: cntd x9 ; CHECK-NEXT: whilelo p1.d, xzr, x8 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 89f790210e193..29be231920305 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -14,7 +14,7 @@ target triple = "aarch64" define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: neg x9, x8 @@ -98,20 +98,20 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_nonzero_init_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov d0, #1.00000000 -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: fmov d1, #1.00000000 ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: fmov d2, #2.00000000 ; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: neg x9, x8 ; CHECK-NEXT: mov w10, #100 // =0x64 -; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d ; CHECK-NEXT: and x9, x9, x10 ; CHECK-NEXT: rdvl x10, #2 -; CHECK-NEXT: mov z1.d, p0/m, z2.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d +; CHECK-NEXT: sel z2.d, p0, z2.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: zip2 z0.d, z1.d, z3.d -; CHECK-NEXT: zip1 z1.d, z1.d, z3.d +; CHECK-NEXT: zip2 z0.d, z2.d, z1.d +; CHECK-NEXT: zip1 z1.d, z2.d, z1.d ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr z2, [x0, #1, mul vl] @@ -183,7 +183,7 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: cntw x8 ; CHECK-NEXT: mov w10, #1000 // =0x3e8 ; CHECK-NEXT: neg x9, x8 @@ -309,7 +309,7 @@ exit.block: ; preds = %vector.body define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalias nocapture noundef readnone %c, [2 x double] %d.coerce, ptr nocapture noundef readonly %s, ptr nocapture noundef writeonly %outs) local_unnamed_addr #0 { ; CHECK-LABEL: reduction_mix: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: cntd x9 ; CHECK-NEXT: mov w11, #100 // =0x64 ; CHECK-NEXT: neg x10, x9 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll index 407da6cd6002b..6615313613153 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll @@ -8,19 +8,19 @@ target triple = "aarch64" define @complex_mul_const( %a, %b) { ; CHECK-LABEL: complex_mul_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z5.d, #0 // =0x0 -; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: movi v5.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov z6.d, #3.00000000 ; CHECK-NEXT: fmov z7.d, #11.00000000 -; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #0 -; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #90 +; 
CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #90 -; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #90 ; CHECK-NEXT: zip2 z1.d, z7.d, z6.d +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: zip1 z3.d, z7.d, z6.d -; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: fcmla z2.d, p0/m, z5.d, z1.d, #0 ; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z3.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z5.d, z1.d, #90 @@ -55,25 +55,26 @@ entry: define @complex_mul_non_const( %a, %b, [2 x double] %c) { ; CHECK-LABEL: complex_mul_non_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z6.d, #0 // =0x0 -; CHECK-NEXT: mov z7.d, #0 // =0x0 +; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d5 killed $d5 def $z5 ; CHECK-NEXT: // kill: def $d4 killed $d4 def $z4 -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movi v6.2d, #0000000000000000 ; CHECK-NEXT: mov z5.d, d5 -; CHECK-NEXT: mov z4.d, d4 -; CHECK-NEXT: fcmla z6.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: mov z24.d, d4 +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: fcmla z7.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: zip2 z24.d, z4.d, z5.d -; CHECK-NEXT: fcmla z6.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z6.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: zip1 z2.d, z4.d, z5.d -; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: zip2 z1.d, z24.d, z5.d +; CHECK-NEXT: fcmla z6.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: zip1 z2.d, z24.d, z5.d +; CHECK-NEXT: fcmla z4.d, p0/m, z7.d, z1.d, #0 ; CHECK-NEXT: fcmla z0.d, p0/m, z6.d, z2.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z24.d, #0 +; CHECK-NEXT: fcmla z4.d, p0/m, z7.d, z1.d, #90 ; CHECK-NEXT: fcmla z0.d, p0/m, z6.d, z2.d, #90 -; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z24.d, #90 +; CHECK-NEXT: mov z1.d, z4.d ; CHECK-NEXT: ret entry: %c.coerce.fca.0.extract = extractvalue [2 x double] %c, 0 diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 4cb1d5b2fb345..53126a08db86f 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -10,37 +10,22 @@ define fastcc i8 @allocno_reload_assign(ptr %p) { ; CHECK: // %bb.0: ; CHECK-NEXT: fmov d0, xzr ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z16.d, #0 // =0x0 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s ; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h ; CHECK-NEXT: uzp1 p8.b, p0.b, p0.b ; CHECK-NEXT: mov z0.b, p8/z, #1 // =0x1 ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: uunpklo z1.h, z0.b -; CHECK-NEXT: uunpkhi z0.h, z0.b +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: mvn w8, w8 ; CHECK-NEXT: sbfx x8, x8, #0, #1 ; CHECK-NEXT: whilelo p0.b, xzr, x8 -; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: uunpkhi z3.s, z1.h -; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: uunpkhi z7.s, z0.h ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpklo p2.h, p1.b ; CHECK-NEXT: punpkhi p4.h, p1.b -; CHECK-NEXT: uunpklo z0.d, z2.s -; CHECK-NEXT: uunpkhi z1.d, z2.s ; CHECK-NEXT: punpklo p6.h, p0.b -; CHECK-NEXT: uunpklo z2.d, z3.s -; CHECK-NEXT: uunpkhi z3.d, z3.s ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: uunpklo z4.d, z5.s -; CHECK-NEXT: uunpkhi z5.d, z5.s -; 
CHECK-NEXT: uunpklo z6.d, z7.s -; CHECK-NEXT: uunpkhi z7.d, z7.s ; CHECK-NEXT: punpklo p1.h, p2.b ; CHECK-NEXT: punpkhi p2.h, p2.b ; CHECK-NEXT: punpklo p3.h, p4.b @@ -50,14 +35,28 @@ define fastcc i8 @allocno_reload_assign(ptr %p) { ; CHECK-NEXT: punpklo p7.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: st1b { z0.d }, p1, [z16.d] -; CHECK-NEXT: st1b { z1.d }, p2, [z16.d] -; CHECK-NEXT: st1b { z2.d }, p3, [z16.d] -; CHECK-NEXT: st1b { z3.d }, p4, [z16.d] -; CHECK-NEXT: st1b { z4.d }, p5, [z16.d] -; CHECK-NEXT: st1b { z5.d }, p6, [z16.d] -; CHECK-NEXT: st1b { z6.d }, p7, [z16.d] -; CHECK-NEXT: st1b { z7.d }, p0, [z16.d] +; CHECK-NEXT: uunpklo z1.h, z0.b +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z3.d, z2.s +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: st1b { z3.d }, p1, [z0.d] +; CHECK-NEXT: st1b { z2.d }, p2, [z0.d] +; CHECK-NEXT: uunpklo z2.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: st1b { z2.d }, p3, [z0.d] +; CHECK-NEXT: uunpkhi z2.h, z0.b +; CHECK-NEXT: uunpklo z3.s, z2.h +; CHECK-NEXT: uunpkhi z2.s, z2.h +; CHECK-NEXT: st1b { z1.d }, p4, [z0.d] +; CHECK-NEXT: uunpklo z1.d, z3.s +; CHECK-NEXT: st1b { z1.d }, p5, [z0.d] +; CHECK-NEXT: uunpkhi z1.d, z3.s +; CHECK-NEXT: st1b { z1.d }, p6, [z0.d] +; CHECK-NEXT: uunpklo z1.d, z2.s +; CHECK-NEXT: st1b { z1.d }, p7, [z0.d] +; CHECK-NEXT: uunpkhi z1.d, z2.s +; CHECK-NEXT: st1b { z1.d }, p0, [z0.d] ; CHECK-NEXT: str p8, [x0] ; CHECK-NEXT: b .LBB0_1 br label %1 diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll index d66944e646dab..8b4cc7bcc0311 100644 --- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll +++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll @@ -921,7 +921,7 @@ define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noun define @loadnxv8i8(ptr %p) { ; CHECK-LABEL: loadnxv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldrb w8, [x0] ; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 @@ -944,7 +944,7 @@ define @loadnxv16i8(ptr %p) { define @loadnxv4i16(ptr %p) { ; CHECK-LABEL: loadnxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldrh w8, [x0] ; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 @@ -967,7 +967,7 @@ define @loadnxv8i16(ptr %p) { define @loadnxv2i32(ptr %p) { ; CHECK-LABEL: loadnxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr w8, [x0] ; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 @@ -1006,7 +1006,7 @@ define @loadnxv4f16(ptr %p) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr h1, [x0] ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: ret @@ -1033,7 +1033,7 @@ define @loadnxv4bf16(ptr %p) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr h1, [x0] ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: ret @@ -1060,7 +1060,7 @@ define @loadnxv2f32(ptr %p) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: mov z0.s, #0 // 
=0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: mov z0.s, p0/m, s1 ; CHECK-NEXT: ret @@ -1095,7 +1095,7 @@ define @loadnxv2f64(ptr %p) { define @loadnxv8i8_offset(ptr %p) { ; CHECK-LABEL: loadnxv8i8_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldrb w8, [x0, #1] ; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 @@ -1120,7 +1120,7 @@ define @loadnxv16i8_offset(ptr %p) { define @loadnxv4i16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4i16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldurh w8, [x0, #1] ; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 @@ -1145,7 +1145,7 @@ define @loadnxv8i16_offset(ptr %p) { define @loadnxv2i32_offset(ptr %p) { ; CHECK-LABEL: loadnxv2i32_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldur w8, [x0, #1] ; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 @@ -1187,7 +1187,7 @@ define @loadnxv4f16_offset(ptr %p) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldur h1, [x0, #1] ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: ret @@ -1216,7 +1216,7 @@ define @loadnxv4bf16_offset(ptr %p) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldur h1, [x0, #1] ; CHECK-NEXT: mov z0.h, p0/m, h1 ; CHECK-NEXT: ret @@ -1245,7 +1245,7 @@ define @loadnxv2f32_offset(ptr %p) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ldur s1, [x0, #1] ; CHECK-NEXT: mov z0.s, p0/m, s1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sinksplat.ll b/llvm/test/CodeGen/AArch64/sinksplat.ll index e329548f84d24..5743dc7cce580 100644 --- a/llvm/test/CodeGen/AArch64/sinksplat.ll +++ b/llvm/test/CodeGen/AArch64/sinksplat.ll @@ -510,7 +510,7 @@ define @fmul_scalable(ptr %x, ptr %y) "target-features"="+s ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: rdvl x8, #1 -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: sxtw x8, w8 ; CHECK-NEXT: mov w9, #1 // =0x1 ; CHECK-NEXT: ld1rw { z1.s }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll index d6484c2483f49..bdfe90c8a6bb7 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,SVE +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,STREAMING-SVE target triple = "aarch64-unknown-linux-gnu" @@ -430,11 +430,17 @@ define @fptoui_nxv8bf16_to_nxv8i64( %a) ; NOTE: f16(-1.875) == bf16(-1.0) define @sitofp_nxv2i1_to_nxv2bf16( %a) { -; CHECK-LABEL: sitofp_nxv2i1_to_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: 
mov z0.h, #0 // =0x0 -; CHECK-NEXT: fmov z0.h, p0/m, #-1.87500000 -; CHECK-NEXT: ret +; SVE-LABEL: sitofp_nxv2i1_to_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fmov z0.h, p0/m, #-1.87500000 +; SVE-NEXT: ret +; +; STREAMING-SVE-LABEL: sitofp_nxv2i1_to_nxv2bf16: +; STREAMING-SVE: // %bb.0: +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fmov z0.h, p0/m, #-1.87500000 +; STREAMING-SVE-NEXT: ret %res = sitofp %a to ret %res } @@ -486,11 +492,17 @@ define @sitofp_nxv2i64_to_nxv2bf16( %a) } define @sitofp_nxv4i1_to_nxv4bf16( %a) { -; CHECK-LABEL: sitofp_nxv4i1_to_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fmov z0.h, p0/m, #-1.87500000 -; CHECK-NEXT: ret +; SVE-LABEL: sitofp_nxv4i1_to_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fmov z0.h, p0/m, #-1.87500000 +; SVE-NEXT: ret +; +; STREAMING-SVE-LABEL: sitofp_nxv4i1_to_nxv4bf16: +; STREAMING-SVE: // %bb.0: +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fmov z0.h, p0/m, #-1.87500000 +; STREAMING-SVE-NEXT: ret %res = sitofp %a to ret %res } @@ -545,11 +557,17 @@ define @sitofp_nxv4i64_to_nxv4bf16( %a) } define @sitofp_nxv8i1_to_nxv8bf16( %a) { -; CHECK-LABEL: sitofp_nxv8i1_to_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fmov z0.h, p0/m, #-1.87500000 -; CHECK-NEXT: ret +; SVE-LABEL: sitofp_nxv8i1_to_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fmov z0.h, p0/m, #-1.87500000 +; SVE-NEXT: ret +; +; STREAMING-SVE-LABEL: sitofp_nxv8i1_to_nxv8bf16: +; STREAMING-SVE: // %bb.0: +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fmov z0.h, p0/m, #-1.87500000 +; STREAMING-SVE-NEXT: ret %res = sitofp %a to ret %res } @@ -624,11 +642,17 @@ define @sitofp_nxv8i64_to_nxv8bf16( %a) ; NOTE: f16(1.875) == bf16(1.0) define @uitofp_nxv2i1_to_nxv2bf16( %a) { -; CHECK-LABEL: uitofp_nxv2i1_to_nxv2bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fmov z0.h, p0/m, #1.87500000 -; CHECK-NEXT: ret +; SVE-LABEL: uitofp_nxv2i1_to_nxv2bf16: +; SVE: // %bb.0: +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fmov z0.h, p0/m, #1.87500000 +; SVE-NEXT: ret +; +; STREAMING-SVE-LABEL: uitofp_nxv2i1_to_nxv2bf16: +; STREAMING-SVE: // %bb.0: +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fmov z0.h, p0/m, #1.87500000 +; STREAMING-SVE-NEXT: ret %res = uitofp %a to ret %res } @@ -680,11 +704,17 @@ define @uitofp_nxv2i64_to_nxv2bf16( %a) } define @uitofp_nxv4i1_to_nxv4bf16( %a) { -; CHECK-LABEL: uitofp_nxv4i1_to_nxv4bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fmov z0.h, p0/m, #1.87500000 -; CHECK-NEXT: ret +; SVE-LABEL: uitofp_nxv4i1_to_nxv4bf16: +; SVE: // %bb.0: +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fmov z0.h, p0/m, #1.87500000 +; SVE-NEXT: ret +; +; STREAMING-SVE-LABEL: uitofp_nxv4i1_to_nxv4bf16: +; STREAMING-SVE: // %bb.0: +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fmov z0.h, p0/m, #1.87500000 +; STREAMING-SVE-NEXT: ret %res = uitofp %a to ret %res } @@ -739,11 +769,17 @@ define @uitofp_nxv4i64_to_nxv4bf16( %a) } define @uitofp_nxv8i1_to_nxv8bf16( %a) { -; CHECK-LABEL: uitofp_nxv8i1_to_nxv8bf16: -; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fmov z0.h, p0/m, #1.87500000 -; CHECK-NEXT: ret +; SVE-LABEL: uitofp_nxv8i1_to_nxv8bf16: +; SVE: // %bb.0: +; SVE-NEXT: movi v0.2d, #0000000000000000 +; 
SVE-NEXT: fmov z0.h, p0/m, #1.87500000 +; SVE-NEXT: ret +; +; STREAMING-SVE-LABEL: uitofp_nxv8i1_to_nxv8bf16: +; STREAMING-SVE: // %bb.0: +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fmov z0.h, p0/m, #1.87500000 +; STREAMING-SVE-NEXT: ret %res = uitofp %a to ret %res } diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index fc5e640aed4ae..607cc92eb4505 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -374,7 +374,7 @@ define @one_zero( %x) { define @ueq_zero( %x) { ; CHECK-LABEL: ueq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll index 8b8ddb624a040..743623b86f1b0 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -682,7 +682,7 @@ define @fcvtzu_d_nxv2f64( %a) { define @scvtf_h_nxv2i1( %a) { ; CHECK-LABEL: scvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -722,7 +722,7 @@ define @scvtf_h_nxv2i64( %a) { define @scvtf_h_nxv3i1( %a) { ; CHECK-LABEL: scvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -742,7 +742,7 @@ define @scvtf_h_nxv3i16( %a) { define @scvtf_h_nxv4i1( %a) { ; CHECK-LABEL: scvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -772,7 +772,7 @@ define @scvtf_h_nxv4i32( %a) { define @scvtf_h_nxv7i1( %a) { ; CHECK-LABEL: scvtf_h_nxv7i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -792,7 +792,7 @@ define @scvtf_h_nxv7i16( %a) { define @scvtf_h_nxv8i1( %a) { ; CHECK-LABEL: scvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -812,7 +812,7 @@ define @scvtf_h_nxv8i16( %a) { define @scvtf_s_nxv2i1( %a) { ; CHECK-LABEL: scvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.s, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -842,7 +842,7 @@ define @scvtf_s_nxv2i64( %a) { define @scvtf_s_nxv3i1( %a) { ; CHECK-LABEL: scvtf_s_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.s, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -862,7 +862,7 @@ define @scvtf_s_nxv3i32( %a) { define @scvtf_s_nxv4i1( %a) { ; CHECK-LABEL: scvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.s, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -882,7 +882,7 @@ define @scvtf_s_nxv4i32( %a) { define @scvtf_d_nxv2i1( %a) { ; CHECK-LABEL: scvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, #0 // 
=0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.d, p0/m, #-1.00000000 ; CHECK-NEXT: ret %res = sitofp %a to @@ -914,7 +914,7 @@ define @scvtf_d_nxv2i64( %a) { define @ucvtf_h_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to @@ -954,7 +954,7 @@ define @ucvtf_h_nxv2i64( %a) { define @ucvtf_h_nxv3i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to @@ -984,7 +984,7 @@ define @ucvtf_h_nxv3i32( %a) { define @ucvtf_h_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to @@ -1014,7 +1014,7 @@ define @ucvtf_h_nxv4i32( %a) { define @ucvtf_h_nxv8i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.h, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to @@ -1034,7 +1034,7 @@ define @ucvtf_h_nxv8i16( %a) { define @ucvtf_s_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.s, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to @@ -1064,7 +1064,7 @@ define @ucvtf_s_nxv2i64( %a) { define @ucvtf_s_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.s, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to @@ -1084,7 +1084,7 @@ define @ucvtf_s_nxv4i32( %a) { define @ucvtf_d_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: fmov z0.d, p0/m, #1.00000000 ; CHECK-NEXT: ret %res = uitofp %a to diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index 0c47e7e14183a..b24a9513b83e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -28,8 +28,8 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK: // %bb.0: ; CHECK-NEXT: tbnz w1, #0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %vector.body -; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: mov z1.b, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: umov w8, v0.b[8] ; CHECK-NEXT: mov v1.b[1], v0.b[1] diff --git a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll index ddede0feca16a..53aba04028d62 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-combine.ll @@ -956,9 +956,9 @@ define @fsub_d_sel_negzero( %a, @fadd_sel_fmul_h( %a, %b, %c, %mask) { ; CHECK-LABEL: fadd_sel_fmul_h: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: fmul z1.h, z1.h, z2.h -; CHECK-NEXT: mov z2.h, #0 // =0x0 -; CHECK-NEXT: sel z1.h, p0, z1.h, z2.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z3.h ; CHECK-NEXT: fadd z0.h, z0.h, z1.h ; 
CHECK-NEXT: ret %fmul = fmul %b, %c @@ -970,9 +970,9 @@ define @fadd_sel_fmul_h( %a, @fadd_sel_fmul_s( %a, %b, %c, %mask) { ; CHECK-LABEL: fadd_sel_fmul_s: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: fmul z1.s, z1.s, z2.s -; CHECK-NEXT: mov z2.s, #0 // =0x0 -; CHECK-NEXT: sel z1.s, p0, z1.s, z2.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z3.s ; CHECK-NEXT: fadd z0.s, z0.s, z1.s ; CHECK-NEXT: ret %fmul = fmul %b, %c @@ -984,9 +984,9 @@ define @fadd_sel_fmul_s( %a, @fadd_sel_fmul_d( %a, %b, %c, %mask) { ; CHECK-LABEL: fadd_sel_fmul_d: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: fmul z1.d, z1.d, z2.d -; CHECK-NEXT: mov z2.d, #0 // =0x0 -; CHECK-NEXT: sel z1.d, p0, z1.d, z2.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z3.d ; CHECK-NEXT: fadd z0.d, z0.d, z1.d ; CHECK-NEXT: ret %fmul = fmul %b, %c diff --git a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll index 73bbee094827e..ebec275c92c52 100644 --- a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll +++ b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll @@ -180,7 +180,7 @@ define @zero_fill_non_zero_index( %pg, @zero_fill_type_mismatch( %pg, %pg, %a) %t2 = insertelement zeroinitializer, i64 %t1, i64 0 @@ -211,7 +211,7 @@ define @zero_fill_no_zero_upper_lanes( %pg, ; CHECK-LABEL: zero_fill_no_zero_upper_lanes: ; CHECK: // %bb.0: ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z0.d -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: mov z1.d, p0/m, x8 diff --git a/llvm/test/CodeGen/AArch64/sve-int-log.ll b/llvm/test/CodeGen/AArch64/sve-int-log.ll index c45d0f437760f..cc5e5e5ddf86c 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-log.ll @@ -40,7 +40,7 @@ define @and_b( %a, %b) { define @and_b_zero( %a) { ; CHECK-LABEL: and_b_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.b, #0 // =0x0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ret %res = and %a, zeroinitializer ret %res diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll index 6ec18477fe1a0..be936f0fd6d4a 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -411,12 +411,12 @@ declare i8 @llvm.vector.reduce.add.nxv12i8() define i8 @uaddv_nxv12i8( %a) { ; CHECK-LABEL: uaddv_nxv12i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z1.h, z0.b -; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: uunpkhi z2.h, z0.b +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: uunpklo z2.s, z2.h +; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: uaddv d0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -430,15 +430,15 @@ declare i8 @llvm.vector.reduce.umax.nxv14i8() define i8 @umax_nxv14i8( %a) { ; CHECK-LABEL: umax_nxv14i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z1.h, z0.b -; CHECK-NEXT: mov z3.d, #0 // =0x0 +; CHECK-NEXT: uunpkhi z2.h, z0.b +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpkhi z2.s, z1.h -; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s -; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: uunpkhi z3.s, z2.h +; CHECK-NEXT: uunpklo z2.s, 
z2.h +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll index 73a2292b183ba..36761a344018e 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll @@ -1352,7 +1352,7 @@ define @asr_i8_all_active( %a) { define @asr_i8_too_small( %pg, %a) { ; CHECK-LABEL: asr_i8_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.asr.nxv16i8( %pg, @@ -1388,7 +1388,7 @@ define @asr_i16_all_active( %a) { define @asr_i16_too_small( %pg, %a) { ; CHECK-LABEL: asr_i16_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.asr.nxv8i16( %pg, @@ -1424,7 +1424,7 @@ define @asr_i32_all_active( %a) { define @asr_i32_too_small( %pg, %a) { ; CHECK-LABEL: asr_i32_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.asr.nxv4i32( %pg, @@ -1460,7 +1460,7 @@ define @asr_i64_all_active( %a) { define @asr_i64_too_small( %pg, %a) { ; CHECK-LABEL: asr_i64_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.asr.nxv2i64( %pg, @@ -1688,7 +1688,7 @@ define @lsr_i8_all_active( %a) { define @lsr_i8_too_small( %pg, %a) { ; CHECK-LABEL: lsr_i8_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.b, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.lsr.nxv16i8( %pg, @@ -1724,7 +1724,7 @@ define @lsr_i16_all_active( %a) { define @lsr_i16_too_small( %pg, %a) { ; CHECK-LABEL: lsr_i16_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.h, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.lsr.nxv8i16( %pg, @@ -1760,7 +1760,7 @@ define @lsr_i32_all_active( %a) { define @lsr_i32_too_small( %pg, %a) { ; CHECK-LABEL: lsr_i32_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.lsr.nxv4i32( %pg, @@ -1796,7 +1796,7 @@ define @lsr_i64_all_active( %a) { define @lsr_i64_too_small( %pg, %a) { ; CHECK-LABEL: lsr_i64_too_small: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %out = call @llvm.aarch64.sve.lsr.nxv2i64( %pg, diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll index ed820e0fc8a25..19facd2ef8993 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll @@ -397,7 +397,7 @@ define @bic_i64_zero( %pg, 
@bic_i64_zero_no_unique_reg( %pg, %a) {
 ; CHECK-LABEL: bic_i64_zero_no_unique_reg:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: movi v1.2d, #0000000000000000
 ; CHECK-NEXT: mov z1.d, p0/m, z0.d
 ; CHECK-NEXT: movprfx z0.d, p0/z, z0.d
 ; CHECK-NEXT: bic z0.d, p0/m, z0.d, z1.d
@@ -414,7 +414,7 @@ define @bic_i64_zero_no_unique_reg( %pg,
 @bic_i64_zero_no_comm( %pg, %a, %b) {
 ; CHECK-LABEL: bic_i64_zero_no_comm:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: mov z2.d, p0/m, z0.d
 ; CHECK-NEXT: mov z0.d, z1.d
 ; CHECK-NEXT: bic z0.d, p0/m, z0.d, z2.d
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
index 238f188f93815..53106b330efce 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scalar-to-vec.ll
@@ -96,7 +96,7 @@ define @dup_f64( %a,
 @test_svdup_n_bf16_z( %pg, bfloat %op) #0 {
 ; CHECK-LABEL: test_svdup_n_bf16_z:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.h, #0 // =0x0
+; CHECK-NEXT: movi v1.2d, #0000000000000000
 ; CHECK-NEXT: mov z1.h, p0/m, h0
 ; CHECK-NEXT: mov z0.d, z1.d
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll
index 2324e3074a420..3a1c6f6731b08 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll
@@ -60,7 +60,7 @@ define @asr_i64_zero( %pg,
 @asr_wide_i8_zero( %pg, %a, %b) {
 ; CHECK-LABEL: asr_wide_i8_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.b, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.b, p0, z0.b, z2.b
 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.d
 ; CHECK-NEXT: ret
@@ -74,7 +74,7 @@ define @asr_wide_i8_zero( %pg,
 @asr_wide_i16_zero( %pg, %a, %b) {
 ; CHECK-LABEL: asr_wide_i16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.d
 ; CHECK-NEXT: ret
@@ -88,7 +88,7 @@ define @asr_wide_i16_zero( %pg,
 @asr_wide_i32_zero( %pg, %a, %b) {
 ; CHECK-LABEL: asr_wide_i32_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s
 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.d
 ; CHECK-NEXT: ret
@@ -214,7 +214,7 @@ define @lsl_i64_zero( %pg,
 @lsl_wide_i8_zero( %pg, %a, %b) {
 ; CHECK-LABEL: lsl_wide_i8_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.b, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.b, p0, z0.b, z2.b
 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.d
 ; CHECK-NEXT: ret
@@ -228,7 +228,7 @@ define @lsl_wide_i8_zero( %pg,
 @lsl_wide_i16_zero( %pg, %a, %b) {
 ; CHECK-LABEL: lsl_wide_i16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.d
 ; CHECK-NEXT: ret
@@ -242,7 +242,7 @@ define @lsl_wide_i16_zero( %pg,
 @lsl_wide_i32_zero( %pg, %a, %b) {
 ; CHECK-LABEL: lsl_wide_i32_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s
 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.d
 ; CHECK-NEXT: ret
@@ -312,7 +312,7 @@ define @lsr_i64_zero( %pg,
 @lsr_wide_i8_zero( %pg, %a, %b) {
 ; CHECK-LABEL: lsr_wide_i8_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.b, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.b, p0, z0.b, z2.b
 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.d
 ; CHECK-NEXT: ret
@@ -326,7 +326,7 @@ define @lsr_wide_i8_zero( %pg,
 @lsr_wide_i16_zero( %pg, %a, %b) {
 ; CHECK-LABEL: lsr_wide_i16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.d
 ; CHECK-NEXT: ret
@@ -340,7 +340,7 @@ define @lsr_wide_i16_zero( %pg,
 @lsr_wide_i32_zero( %pg, %a, %b) {
 ; CHECK-LABEL: lsr_wide_i32_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s
 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.d
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-knownbits.ll b/llvm/test/CodeGen/AArch64/sve-knownbits.ll
index c22d18c7e2ede..7d6ed08173bf4 100644
--- a/llvm/test/CodeGen/AArch64/sve-knownbits.ll
+++ b/llvm/test/CodeGen/AArch64/sve-knownbits.ll
@@ -4,7 +4,7 @@
 define @test_knownzero( %x) {
 ; CHECK-LABEL: test_knownzero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 %a1 = shl %x, splat (i16 8)
 %a2 = and %a1, splat (i16 8)
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
index 43391c16e7cce..a6d7d17fd9eef 100644
@@ -1250,7 +1250,7 @@ define @dup_ld1rh_half_passthruzero_nxv8f16(
 @dup_ld1rs_float_passthruzero_nxv4f32(
 @dup_ld1rd_double_passthruzero_nxv2f64(
 @dup_ld1rh_half_passthruzero_nxv4f16(
 @dup_ld1rh_half_passthruzero_nxv2f16(
 @dup_ld1rs_float_passthruzero_nxv2f32(
 %data, %pg) {
 ; CHECK-LABEL: masked_scatter_splat_constant_pointer:
 ; CHECK: // %bb.0: // %vector.body
-; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1w { z0.d }, p1, [z0.d]
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index d7bab3297cf29..8d3b12e359f3f 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -264,7 +264,7 @@ entry:
 define @udot_8to64( %acc, %a, %b) {
 ; CHECK-LABEL: udot_8to64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEXT: movi v4.2d, #0000000000000000
 ; CHECK-NEXT: udot z4.s, z2.b, z3.b
 ; CHECK-NEXT: sunpklo z2.d, z4.s
 ; CHECK-NEXT: sunpkhi z3.d, z4.s
@@ -327,7 +327,7 @@ entry:
 define @sdot_8to64( %acc, %a, %b){
 ; CHECK-LABEL: sdot_8to64:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEXT: movi v4.2d, #0000000000000000
 ; CHECK-NEXT: sdot z4.s, z2.b, z3.b
 ; CHECK-NEXT: sunpklo z2.d, z4.s
 ; CHECK-NEXT: sunpkhi z3.d, z4.s
@@ -390,7 +390,7 @@ entry:
 define @usdot_8to64( %acc, %a, %b){
 ; CHECK-I8MM-LABEL: usdot_8to64:
 ; CHECK-I8MM: // %bb.0: // %entry
-; CHECK-I8MM-NEXT: mov z4.s, #0 // =0x0
+; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000
 ; CHECK-I8MM-NEXT: usdot z4.s, z2.b, z3.b
 ; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s
 ; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s
@@ -523,7 +523,7 @@ entry:
 define @sudot_8to64( %acc, %a, %b) {
 ; CHECK-I8MM-LABEL: sudot_8to64:
 ; CHECK-I8MM: // %bb.0: // %entry
-; CHECK-I8MM-NEXT: mov z4.s, #0 // =0x0
+; CHECK-I8MM-NEXT: movi v4.2d, #0000000000000000
 ; CHECK-I8MM-NEXT: usdot z4.s, z3.b, z2.b
 ; CHECK-I8MM-NEXT: sunpklo z2.d, z4.s
 ; CHECK-I8MM-NEXT: sunpkhi z3.d, z4.s
@@ -758,11 +758,11 @@ entry:
 define @udot_no_bin_op_8to64( %acc, %a){
 ; CHECK-LABEL: udot_no_bin_op_8to64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z3.b, #1 // =0x1
-; CHECK-NEXT: mov z4.s, #0 // =0x0
-; CHECK-NEXT: udot z4.s, z2.b, z3.b
-; CHECK-NEXT: sunpklo z2.d, z4.s
-; CHECK-NEXT: sunpkhi z3.d, z4.s
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: mov z4.b, #1 // =0x1
+; CHECK-NEXT: udot z3.s, z2.b, z4.b
+; CHECK-NEXT: sunpklo z2.d, z3.s
+; CHECK-NEXT: sunpkhi z3.d, z3.s
 ; CHECK-NEXT: add z0.d, z0.d, z2.d
 ; CHECK-NEXT: add z1.d, z1.d, z3.d
 ; CHECK-NEXT: ret
@@ -800,11 +800,11 @@ define @udot_no_bin_op_8to64( %acc,
 @sdot_no_bin_op_8to64( %acc, %a){
 ; CHECK-LABEL: sdot_no_bin_op_8to64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z3.b, #1 // =0x1
-; CHECK-NEXT: mov z4.s, #0 // =0x0
-; CHECK-NEXT: sdot z4.s, z2.b, z3.b
-; CHECK-NEXT: sunpklo z2.d, z4.s
-; CHECK-NEXT: sunpkhi z3.d, z4.s
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: mov z4.b, #1 // =0x1
+; CHECK-NEXT: sdot z3.s, z2.b, z4.b
+; CHECK-NEXT: sunpklo z2.d, z3.s
+; CHECK-NEXT: sunpkhi z3.d, z3.s
 ; CHECK-NEXT: add z0.d, z0.d, z2.d
 ; CHECK-NEXT: add z1.d, z1.d, z3.d
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve-pr92779.ll b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
index 1bb7801b6a047..3f34d79b3bb49 100644
--- a/llvm/test/CodeGen/AArch64/sve-pr92779.ll
+++ b/llvm/test/CodeGen/AArch64/sve-pr92779.ll
@@ -4,8 +4,8 @@ define void @main(ptr %0) {
 ; CHECK-LABEL: main:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: movi v1.2d, #0000000000000000
 ; CHECK-NEXT: ptrue p0.d, vl1
 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
 ; CHECK-NEXT: uzp1 v0.2s, v1.2s, v0.2s
diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
index 2fe09f8ac7c5d..2378b226c05e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -331,8 +331,8 @@ define @scvtf_d_nxv4i32( %a) {
 define @scvtf_d_nxv4i1( %a) {
 ; CHECK-LABEL: scvtf_d_nxv4i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: movi v1.2d, #0000000000000000
 ; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: fmov z0.d, p1/m, #-1.00000000
@@ -392,8 +392,8 @@ define @ucvtf_d_nxv4i32( %a) {
 define @ucvtf_d_nxv4i1( %a) {
 ; CHECK-LABEL: ucvtf_d_nxv4i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: mov z1.d, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: movi v1.2d, #0000000000000000
 ; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: fmov z0.d, p1/m, #1.00000000
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
index 4a75242848343..5cca5539048b5 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-splat.ll
@@ -365,7 +365,7 @@ define @splat_nxv2f64(double %val) {
 define @splat_nxv8bf16_zero() #0 {
 ; CHECK-LABEL: splat_nxv8bf16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -373,7 +373,7 @@ define @splat_nxv8bf16_zero() #0 {
 define @splat_nxv4bf16_zero() #0 {
 ; CHECK-LABEL: splat_nxv4bf16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -381,7 +381,7 @@ define @splat_nxv4bf16_zero() #0 {
 define @splat_nxv2bf16_zero() #0 {
 ; CHECK-LABEL: splat_nxv2bf16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -389,7 +389,7 @@ define @splat_nxv2bf16_zero() #0 {
 define @splat_nxv8f16_zero() {
 ; CHECK-LABEL: splat_nxv8f16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -397,7 +397,7 @@ define @splat_nxv8f16_zero() {
 define @splat_nxv4f16_zero() {
 ; CHECK-LABEL: splat_nxv4f16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -405,7 +405,7 @@ define @splat_nxv4f16_zero() {
 define @splat_nxv2f16_zero() {
 ; CHECK-LABEL: splat_nxv2f16_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -413,7 +413,7 @@ define @splat_nxv2f16_zero() {
 define @splat_nxv4f32_zero() {
 ; CHECK-LABEL: splat_nxv4f32_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -421,7 +421,7 @@ define @splat_nxv4f32_zero() {
 define @splat_nxv2f32_zero() {
 ; CHECK-LABEL: splat_nxv2f32_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -429,7 +429,7 @@ define @splat_nxv2f32_zero() {
 define @splat_nxv2f64_zero() {
 ; CHECK-LABEL: splat_nxv2f64_zero:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 ret zeroinitializer
 }
@@ -512,7 +512,7 @@ define @splat_nxv2bf16_imm() {
 define @splat_nxv4i32_fold( %x) {
 ; CHECK-LABEL: splat_nxv4i32_fold:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 %r = sub %x, %x
 ret %r
@@ -522,7 +522,7 @@ define @splat_nxv4i32_fold( %x) {
 define @splat_nxv4f32_fold( %x) {
 ; CHECK-LABEL: splat_nxv4f32_fold:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, #0 // =0x0
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
 %r = fsub nnan %x, %x
 ret %r
diff --git a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
index f16b6a4d50bca..6b5b3d6d436cb 100644
--- a/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vselect-imm.ll
@@ -107,9 +107,9 @@ define @sel_64_shifted( %p) {
 define @sel_16_illegal_wrong_extension( %p) {
 ; CHECK-LABEL: sel_16_illegal_wrong_extension:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.h, #128 // =0x80
-; CHECK-NEXT: mov z1.h, #0 // =0x0
-; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov z1.h, #128 // =0x80
+; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 %sel = select %p, splat (i16 128), zeroinitializer
 ret %sel
@@ -118,9 +118,9 @@ define @sel_16_illegal_wrong_extension( %p)
 define @sel_32_illegal_wrong_extension( %p) {
 ; CHECK-LABEL: sel_32_illegal_wrong_extension:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.s, #128 // =0x80
-; CHECK-NEXT: mov z1.s, #0 // =0x0
-; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov z1.s, #128 // =0x80
+; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 %sel = select %p, splat (i32 128), zeroinitializer
 ret %sel
@@ -129,9 +129,9 @@ define @sel_32_illegal_wrong_extension( %p)
 define @sel_64_illegal_wrong_extension( %p) {
 ; CHECK-LABEL: sel_64_illegal_wrong_extension:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z0.d, #128 // =0x80
-; CHECK-NEXT: mov z1.d, #0 // =0x0
-; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov z1.d, #128 // =0x80
+; CHECK-NEXT: mov z0.d, p0/m, z1.d
 ; CHECK-NEXT: ret
 %sel = select %p, splat (i64 128), zeroinitializer
 ret %sel
@@ -140,10 +140,10 @@ define @sel_64_illegal_wrong_extension( %p)
 define @sel_16_illegal_shifted( %p) {
 ; CHECK-LABEL: sel_16_illegal_shifted:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: mov w8, #513 // =0x201
-; CHECK-NEXT: mov z1.h, #0 // =0x0
-; CHECK-NEXT: mov z0.h, w8
-; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
+; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: mov z0.h, p0/m, z1.h
 ; CHECK-NEXT: ret
 %sel = select %p, splat (i16 513), zeroinitializer
 ret %sel
@@ -152,10 +152,10 @@ define @sel_16_illegal_shifted( %p) {
 define @sel_32_illegal_shifted( %p) {
 ; CHECK-LABEL: sel_32_illegal_shifted:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: mov w8, #513 // =0x201
-; CHECK-NEXT: mov z1.s, #0 // =0x0
-; CHECK-NEXT: mov z0.s, w8
-; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: mov z1.s, w8
+; CHECK-NEXT: mov z0.s, p0/m, z1.s
 ; CHECK-NEXT: ret
 %sel = select %p, splat (i32 513), zeroinitializer
 ret %sel
@@ -164,10 +164,10 @@ define @sel_32_illegal_shifted( %p) {
 define @sel_64_illegal_shifted( %p) {
 ; CHECK-LABEL: sel_64_illegal_shifted:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: mov w8, #513 // =0x201
-; CHECK-NEXT: mov z1.d, #0 // =0x0
-; CHECK-NEXT: mov z0.d, x8
-; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
+; CHECK-NEXT: mov z1.d, x8
+; CHECK-NEXT: mov z0.d, p0/m, z1.d
 ; CHECK-NEXT: ret
 %sel = select %p, splat (i64 513), zeroinitializer
 ret %sel
diff --git a/llvm/test/CodeGen/AArch64/sve-zeroinit.ll b/llvm/test/CodeGen/AArch64/sve-zeroinit.ll
index eab39d0ef4025..3d40fd920cfee 100644
--- a/llvm/test/CodeGen/AArch64/sve-zeroinit.ll
+++ b/llvm/test/CodeGen/AArch64/sve-zeroinit.ll
@@ -1,95 +1,145 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,SVE
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,STREAMING-SVE
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-linux-gnu"
 define @test_zeroinit_2xi64() {
-; CHECK-LABEL: test_zeroinit_2xi64
-; CHECK: mov z0.d, #0
-; CHECK-NEXT: ret
+; SVE-LABEL: test_zeroinit_2xi64:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_zeroinit_2xi64:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_4xi32() {
-; CHECK-LABEL: test_zeroinit_4xi32
-; CHECK: mov z0.s, #0
-; CHECK-NEXT: ret
+; SVE-LABEL: test_zeroinit_4xi32:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_zeroinit_4xi32:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_8xi16() {
-; CHECK-LABEL: test_zeroinit_8xi16
-; CHECK: mov z0.h, #0
-; CHECK-NEXT: ret
+; SVE-LABEL: test_zeroinit_8xi16:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_zeroinit_8xi16:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_16xi8() {
-; CHECK-LABEL: test_zeroinit_16xi8
-; CHECK: mov z0.b, #0
-; CHECK-NEXT: ret
+; SVE-LABEL: test_zeroinit_16xi8:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_zeroinit_16xi8:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_2xf64() {
-; CHECK-LABEL: test_zeroinit_2xf64
-; CHECK: mov z0.d, #0
-; CHECK-NEXT: ret
+; SVE-LABEL: test_zeroinit_2xf64:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_zeroinit_2xf64:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_4xf32() {
-; CHECK-LABEL: test_zeroinit_4xf32
-; CHECK: mov z0.s, #0
-; CHECK-NEXT: ret
+; SVE-LABEL: test_zeroinit_4xf32:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_zeroinit_4xf32:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_8xf16() {
-; CHECK-LABEL: test_zeroinit_8xf16
-; CHECK: mov z0.h, #0
-; CHECK-NEXT: ret
+; SVE-LABEL: test_zeroinit_8xf16:
+; SVE: // %bb.0:
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_zeroinit_8xf16:
+; STREAMING-SVE: // %bb.0:
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_1xi1() {
-; CHECK-LABEL: test_zeroinit_1xi1
-; CHECK: pfalse p0.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_zeroinit_1xi1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p0.b
+; CHECK-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_2xi1() {
-; CHECK-LABEL: test_zeroinit_2xi1
-; CHECK: pfalse p0.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_zeroinit_2xi1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p0.b
+; CHECK-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_4xi1() {
-; CHECK-LABEL: test_zeroinit_4xi1
-; CHECK: pfalse p0.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_zeroinit_4xi1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p0.b
+; CHECK-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_8xi1() {
-; CHECK-LABEL: test_zeroinit_8xi1
-; CHECK: pfalse p0.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_zeroinit_8xi1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p0.b
+; CHECK-NEXT: ret
 ret zeroinitializer
 }
 define @test_zeroinit_16xi1() {
-; CHECK-LABEL: test_zeroinit_16xi1
-; CHECK: pfalse p0.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_zeroinit_16xi1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p0.b
+; CHECK-NEXT: ret
 ret zeroinitializer
 }
 define target("aarch64.svcount") @test_zeroinit_svcount() "target-features"="+sme2" {
-; CHECK-LABEL: test_zeroinit_svcount
-; CHECK: pfalse p0.b
-; CHECK-NEXT: ret
+; CHECK-LABEL: test_zeroinit_svcount:
+; CHECK: // %bb.0:
+; CHECK-NEXT: pfalse p0.b
+; CHECK-NEXT: ret
 ret target("aarch64.svcount") zeroinitializer
 }
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
index 7b921d71cbfb4..0a18ce054bcaf 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfadd.ll
@@ -46,7 +46,7 @@ define @bfadd_u_ptrue( %a,
 @bfadd_u_zeroing( %pg, %a, %b) {
 ; CHECK-LABEL: bfadd_u_zeroing:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: bfadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
index 55ef452b60308..dcf6d3c9f6bdf 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmax.ll
@@ -58,7 +58,7 @@ define @bfmax_u( %a,
 @bfmax_u_zeroing( %pg, %a, %b) {
 ; CHECK-LABEL: bfmax_u_zeroing:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: bfmax z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
index 9b0f7e039f2e5..cff013fa26083 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmaxnm.ll
@@ -58,7 +58,7 @@ define @bfmaxnm_u( %a,
 @bfmaxnm_u_zeroing( %pg, %a, %b) {
 ; CHECK-LABEL: bfmaxnm_u_zeroing:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
index 8c586fd47f5a8..23c554f65da84 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmin.ll
@@ -58,7 +58,7 @@ define @bfmin_u( %a,
 @bfmin_u_zeroing( %pg, %a, %b) {
 ; CHECK-LABEL: bfmin_u_zeroing:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: bfmin z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
index 90132224e0223..cbbb372b921a8 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfminnm.ll
@@ -58,7 +58,7 @@ define @bfminnm_u( %a,
 @bfminnm_u_zeroing( %pg, %a, %b) {
 ; CHECK-LABEL: bfminnm_u_zeroing:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
index eb7e99f332da3..e0d65c1a0dc0a 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmla.ll
@@ -22,7 +22,7 @@ define @bfmla_x( %pg,
 @bfmla_z( %pg, %a, %b, %c){
 ; CHECK-LABEL: bfmla_z:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z3.h, #0 // =0x0
+; CHECK-NEXT: movi v3.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h
 ; CHECK-NEXT: bfmla z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
index 8ff1afcc9b4ab..9a90755711882 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmls.ll
@@ -23,7 +23,7 @@ define @bfmls_x( %pg,
 @bfmls_z( %pg, %a, %b, %c){
 ; CHECK-LABEL: bfmls_z:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z3.h, #0 // =0x0
+; CHECK-NEXT: movi v3.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z3.h
 ; CHECK-NEXT: bfmls z0.h, p0/m, z1.h, z2.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
index 8b6a087578ed8..a5e5bb694679d 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfmul.ll
@@ -46,7 +46,7 @@ define @bfmul_u( %a,
 @bfmul_u_zeroing( %pg, %a, %b) {
 ; CHECK-LABEL: bfmul_u_zeroing:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: bfmul z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
index 1b1304312ceb7..0fed30e74f4a8 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-bfsub.ll
@@ -46,7 +46,7 @@ define @bfsub_u( %a,
 @bfsub_u_zeroing( %pg, %a, %b) {
 ; CHECK-LABEL: bfsub_u_zeroing:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: movi v2.2d, #0000000000000000
 ; CHECK-NEXT: sel z0.h, p0, z0.h, z2.h
 ; CHECK-NEXT: bfsub z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
index 510d4576646f1..ba8a606b331e0 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-abs-neg.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE
 ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2
 target triple = "aarch64-linux"
@@ -39,16 +39,22 @@ entry:
 }
 define @test_svabs_f64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svabs_f64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: fabs z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svabs_f64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fabs z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svabs_f64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fabs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svabs_f64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: fabs z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fabs.nxv2f64( zeroinitializer, %pg, %x)
 ret %0
@@ -86,16 +92,22 @@ entry:
 }
 define @test_svabs_f32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svabs_f32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: fabs z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svabs_f32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fabs z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svabs_f32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fabs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svabs_f32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: fabs z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fabs.nxv4f32( zeroinitializer, %pg, %x)
 ret %0
@@ -133,16 +145,22 @@ entry:
 }
 define @test_svabs_f16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svabs_f16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: fabs z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svabs_f16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fabs z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svabs_f16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fabs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svabs_f16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: fabs z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fabs.nxv8f16( zeroinitializer, %pg, %x)
 ret %0
@@ -180,16 +198,22 @@ entry:
 }
 define @test_svabs_s8_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svabs_s8_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, #0 // =0x0
-; CHECK-NEXT: abs z0.b, p0/m, z1.b
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svabs_s8_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: abs z0.b, p0/m, z1.b
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svabs_s8_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: abs z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svabs_s8_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: abs z0.b, p0/m, z1.b
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.abs.nxv16i8( zeroinitializer, %pg, %x)
 ret %0
@@ -227,16 +251,22 @@ entry:
 }
 define @test_svabs_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svabs_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: abs z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svabs_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: abs z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svabs_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: abs z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svabs_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: abs z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.abs.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -274,16 +304,22 @@ entry:
 }
 define @test_svabs_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svabs_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: abs z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svabs_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: abs z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svabs_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: abs z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svabs_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: abs z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.abs.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -321,16 +357,22 @@ entry:
 }
 define @test_svabs_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svabs_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: abs z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svabs_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: abs z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svabs_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: abs z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svabs_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: abs z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.abs.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -368,16 +410,22 @@ entry:
 }
 define @test_svneg_f64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svneg_f64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: fneg z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svneg_f64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fneg z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svneg_f64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fneg z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svneg_f64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: fneg z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fneg.nxv2f64( zeroinitializer, %pg, %x)
 ret %0
@@ -415,16 +463,22 @@ entry:
 }
 define @test_svneg_f32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svneg_f32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: fneg z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svneg_f32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fneg z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svneg_f32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fneg z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svneg_f32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: fneg z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fneg.nxv4f32( zeroinitializer, %pg, %x)
 ret %0
@@ -462,16 +516,22 @@ entry:
 }
 define @test_svneg_f16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svneg_f16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: fneg z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svneg_f16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fneg z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svneg_f16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fneg z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svneg_f16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: fneg z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fneg.nxv8f16( zeroinitializer, %pg, %x)
 ret %0
@@ -509,16 +569,22 @@ entry:
 }
 define @test_svneg_s8_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svneg_s8_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, #0 // =0x0
-; CHECK-NEXT: neg z0.b, p0/m, z1.b
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svneg_s8_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: neg z0.b, p0/m, z1.b
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svneg_s8_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: neg z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svneg_s8_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: neg z0.b, p0/m, z1.b
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.neg.nxv16i8( zeroinitializer, %pg, %x)
 ret %0
@@ -556,16 +622,22 @@ entry:
 }
 define @test_svneg_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svneg_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: neg z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svneg_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: neg z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svneg_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: neg z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svneg_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: neg z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.neg.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -603,16 +675,22 @@ entry:
 }
 define @test_svneg_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svneg_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: neg z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svneg_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: neg z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svneg_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: neg z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svneg_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: neg z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.neg.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -650,16 +728,22 @@ entry:
 }
 define @test_svneg_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svneg_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: neg z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svneg_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: neg z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svneg_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: neg z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svneg_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: neg z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.neg.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-counts-not.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-counts-not.ll
index f7970ca81f608..b904634d0b76c 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-counts-not.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-counts-not.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+bf16,+sve < %s | FileCheck %s
+; RUN: llc -mattr=+bf16,+sve < %s | FileCheck %s -check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+bf16,+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2
-; RUN: llc -mattr=+bf16,+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+bf16,+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE
 ; RUN: llc -mattr=+bf16,+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2
 target triple = "aarch64-linux"
@@ -39,16 +39,22 @@ entry:
 }
 define @test_svcls_s8_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcls_s8_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, #0 // =0x0
-; CHECK-NEXT: cls z0.b, p0/m, z1.b
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcls_s8_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cls z0.b, p0/m, z1.b
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcls_s8_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cls z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcls_s8_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: cls z0.b, p0/m, z1.b
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cls.nxv16i8( zeroinitializer, %pg, %x)
 ret %0
@@ -86,16 +92,22 @@ entry:
 }
 define @test_svcls_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcls_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: cls z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcls_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cls z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcls_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cls z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcls_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: cls z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cls.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -133,16 +145,22 @@ entry:
 }
 define @test_svcls_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcls_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: cls z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcls_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cls z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcls_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cls z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcls_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: cls z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cls.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -180,16 +198,22 @@ entry:
 }
 define @test_svcls_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcls_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: cls z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcls_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cls z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcls_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cls z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcls_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: cls z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cls.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -227,16 +251,22 @@ entry:
 }
 define @test_svclz_s8_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svclz_s8_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, #0 // =0x0
-; CHECK-NEXT: clz z0.b, p0/m, z1.b
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svclz_s8_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: clz z0.b, p0/m, z1.b
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svclz_s8_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: clz z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svclz_s8_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: clz z0.b, p0/m, z1.b
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.clz.nxv16i8( zeroinitializer, %pg, %x)
 ret %0
@@ -274,16 +304,22 @@ entry:
 }
 define @test_svclz_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svclz_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: clz z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svclz_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: clz z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svclz_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: clz z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svclz_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: clz z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.clz.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -321,16 +357,22 @@ entry:
 }
 define @test_svclz_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svclz_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: clz z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svclz_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: clz z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svclz_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: clz z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svclz_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: clz z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.clz.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -368,16 +410,22 @@ entry:
 }
 define @test_svclz_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svclz_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: clz z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svclz_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: clz z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svclz_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: clz z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svclz_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: clz z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.clz.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -415,16 +463,22 @@ entry:
 }
 define @test_svcnt_s8_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_s8_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, #0 // =0x0
-; CHECK-NEXT: cnt z0.b, p0/m, z1.b
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_s8_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.b, p0/m, z1.b
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_s8_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_s8_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.b, p0/m, z1.b
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv16i8( zeroinitializer, %pg, %x)
 ret %0
@@ -462,16 +516,22 @@ entry:
 }
 define @test_svcnt_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: cnt z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -509,16 +569,22 @@ entry:
 }
 define @test_svcnt_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: cnt z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -556,16 +622,22 @@ entry:
 }
 define @test_svcnt_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: cnt z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -603,16 +675,22 @@ entry:
 }
 define @test_svcnt_f16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_f16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: cnt z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_f16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_f16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_f16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv8f16( zeroinitializer, %pg, %x)
 ret %0
@@ -650,16 +728,22 @@ entry:
 }
 define @test_svcnt_bf16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_bf16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: cnt z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_bf16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_bf16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_bf16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv8bf16( zeroinitializer, %pg, %x)
 ret %0
@@ -697,16 +781,22 @@ entry:
 }
 define @test_svcnt_f32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_f32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: cnt z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_f32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_f32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_f32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv4f32( zeroinitializer, %pg, %x)
 ret %0
@@ -744,16 +834,22 @@ entry:
 }
 define @test_svcnt_f64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnt_f64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: cnt z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnt_f64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnt z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnt_f64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnt z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnt_f64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: cnt z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnt.nxv2f64( zeroinitializer, %pg, %x)
 ret %0
@@ -791,16 +887,22 @@ entry:
 }
 define @test_svcnot_s8_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnot_s8_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, #0 // =0x0
-; CHECK-NEXT: cnot z0.b, p0/m, z1.b
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnot_s8_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnot z0.b, p0/m, z1.b
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnot_s8_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnot z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnot_s8_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: cnot z0.b, p0/m, z1.b
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnot.nxv16i8( zeroinitializer, %pg, %x)
 ret %0
@@ -838,16 +940,22 @@ entry:
 }
 define @test_svcnot_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnot_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: cnot z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnot_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnot z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnot_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnot z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnot_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: cnot z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnot.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -885,16 +993,22 @@ entry:
 }
 define @test_svcnot_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnot_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: cnot z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnot_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnot z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnot_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnot z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnot_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: cnot z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnot.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -932,16 +1046,22 @@ entry:
 }
 define @test_svcnot_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcnot_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: cnot z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcnot_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: cnot z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcnot_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: cnot z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcnot_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: cnot z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.cnot.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -979,16 +1099,22 @@ entry:
 }
 define @test_svnot_s8_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svnot_s8_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.b, #0 // =0x0
-; CHECK-NEXT: not z0.b, p0/m, z1.b
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svnot_s8_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: not z0.b, p0/m, z1.b
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svnot_s8_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: not z0.b, p0/z, z1.b
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svnot_s8_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0
+; STREAMING-SVE-NEXT: not z0.b, p0/m, z1.b
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.not.nxv16i8( zeroinitializer, %pg, %x)
 ret %0
@@ -1026,16 +1152,22 @@ entry:
 }
 define @test_svnot_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svnot_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: not z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svnot_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: not z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svnot_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: not z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svnot_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: not z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.not.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -1073,16 +1205,22 @@ entry:
 }
 define @test_svnot_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svnot_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: not z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svnot_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: not z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svnot_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: not z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svnot_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: not z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.not.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -1120,16 +1258,22 @@ entry:
 }
 define @test_svnot_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svnot_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: not z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svnot_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: not z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svnot_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: not z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svnot_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: not z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.not.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-ext.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-ext.ll
index b29805c2b8f05..20a5475706c9c 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-ext.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-ext.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2
-; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE
 ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2
 target triple = "aarch64-linux"
@@ -39,16 +39,22 @@ entry:
 }
 define @test_svextb_s16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextb_s16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: sxtb z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextb_s16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: sxtb z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextb_s16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: sxtb z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextb_s16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: sxtb z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.sxtb.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -86,16 +92,22 @@ entry:
 }
 define @test_svextb_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextb_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: sxtb z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextb_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: sxtb z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextb_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: sxtb z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextb_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: sxtb z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.sxtb.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -133,16 +145,22 @@ entry:
 }
 define @test_svextb_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextb_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: sxtb z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextb_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: sxtb z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextb_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: sxtb z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextb_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: sxtb z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.sxtb.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -180,16 +198,22 @@ entry:
 }
 define @test_svextb_u16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextb_u16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: uxtb z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextb_u16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: uxtb z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextb_u16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: uxtb z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextb_u16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: uxtb z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.uxtb.nxv8i16( zeroinitializer, %pg, %x)
 ret %0
@@ -227,16 +251,22 @@ entry:
 }
 define @test_svextb_u32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextb_u32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: uxtb z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextb_u32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: uxtb z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextb_u32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: uxtb z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextb_u32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: uxtb z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.uxtb.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -274,16 +304,22 @@ entry:
 }
 define @test_svextb_u64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextb_u64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: uxtb z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextb_u64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: uxtb z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextb_u64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: uxtb z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextb_u64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: uxtb z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.uxtb.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -321,16 +357,22 @@ entry:
 }
 define @test_svexth_s32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svexth_s32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: sxth z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svexth_s32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: sxth z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svexth_s32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: sxth z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svexth_s32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: sxth z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.sxth.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -368,16 +410,22 @@ entry:
 }
 define @test_svexth_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svexth_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: sxth z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svexth_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: sxth z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svexth_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: sxth z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svexth_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: sxth z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.sxth.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -415,16 +463,22 @@ entry:
 }
 define @test_svexth_u32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svexth_u32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: uxth z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svexth_u32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: uxth z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svexth_u32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: uxth z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svexth_u32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: uxth z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.uxth.nxv4i32( zeroinitializer, %pg, %x)
 ret %0
@@ -462,16 +516,22 @@ entry:
 }
 define @test_svexth_u64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svexth_u64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: uxth z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svexth_u64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: uxth z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svexth_u64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: uxth z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svexth_u64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: uxth z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.uxth.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -509,16 +569,22 @@ entry:
 }
 define @test_svextw_s64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextw_s64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: sxtw z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextw_s64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: sxtw z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextw_s64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: sxtw z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextw_s64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: sxtw z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.sxtw.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
@@ -556,16 +622,22 @@ entry:
 }
 define @test_svextw_u64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svextw_u64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: uxtw z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svextw_u64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: uxtw z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svextw_u64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: uxtw z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svextw_u64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: uxtw z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.uxtw.nxv2i64( zeroinitializer, %pg, %x)
 ret %0
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
index 855bf9a3b3c49..3ce8376394b2f 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvt-bfcvt.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s -check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2p2,+bf16 < %s | FileCheck %s -check-prefix CHECK-2p2
-; RUN: llc -mattr=+sme,+bf16 -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+bf16 -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE
 ; RUN: llc -mattr=+sme2p2,+bf16 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2
 target triple = "aarch64-linux"
@@ -38,16 +38,22 @@ entry:
 }
 define @test_svcvt_f16_f32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcvt_f16_f32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: fcvt z0.h, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcvt_f16_f32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcvt_f16_f32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fcvt z0.h, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcvt_f16_f32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: fcvt z0.h, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fcvt.f16f32( zeroinitializer, %pg, %x)
 ret %0
@@ -84,16 +90,22 @@ entry:
 }
 define @test_svcvt_bf16_f32_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcvt_bf16_f32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: bfcvt z0.h, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcvt_bf16_f32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcvt_bf16_f32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: bfcvt z0.h, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcvt_bf16_f32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: bfcvt z0.h, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fcvt.bf16f32.v2( zeroinitializer, %pg, %x)
 ret %0
@@ -130,16 +142,22 @@ entry:
 }
 define @test_svcvt_f16_f64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcvt_f16_f64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: fcvt z0.h, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcvt_f16_f64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fcvt z0.h, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcvt_f16_f64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fcvt z0.h, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcvt_f16_f64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: fcvt z0.h, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fcvt.f16f64( zeroinitializer, %pg, %x)
 ret %0
@@ -176,16 +194,22 @@ entry:
 }
 define @test_svcvt_f32_f64_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcvt_f32_f64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: fcvt z0.s, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcvt_f32_f64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fcvt z0.s, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcvt_f32_f64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fcvt z0.s, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcvt_f32_f64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: fcvt z0.s, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call @llvm.aarch64.sve.fcvt.f32f64( zeroinitializer, %pg, %x)
 ret %0
@@ -222,16 +246,22 @@ entry:
 }
 define @test_svcvt_f32_f16_z( %pg, double %z0, %x) {
-; CHECK-LABEL: test_svcvt_f32_f16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: fcvt z0.s, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svcvt_f32_f16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: fcvt z0.s, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svcvt_f32_f16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: fcvt z0.s, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svcvt_f32_f16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: fcvt z0.s, p0/m, z1.h
+;
STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvt.f32f16( zeroinitializer, %pg, %x) ret %0 @@ -268,16 +298,22 @@ entry: } define @test_svcvt_f64_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f64_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvt z0.d, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f64_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvt z0.d, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f64_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvt z0.d, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f64_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvt z0.d, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvt.f64f16( zeroinitializer, %pg, %x) ret %0 @@ -314,16 +350,22 @@ entry: } define @test_svcvt_f64_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f64_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvt z0.d, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f64_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvt z0.d, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f64_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvt z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f64_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvt z0.d, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvt.f64f32( zeroinitializer, %pg, %x) ret %0 diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll index c7431e11c21ca..114f2163d94fc 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtlt-fcvtx.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve2 < %s | FileCheck %s +; RUN: llc -mattr=+sve2 < %s | FileCheck %s -check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 -; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 target triple = "aarch64-linux" @@ -38,16 +38,22 @@ entry: } define @test_svcvtlt_f32_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvtlt_f32_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtlt z0.s, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvtlt_f32_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtlt z0.s, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvtlt_f32_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtlt z0.s, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvtlt_f32_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtlt z0.s, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtlt.f32f16( zeroinitializer, %pg, %x) ret %0 @@ -84,16 +90,22 @@ entry: } define @test_svcvtlt_f64_f32_z( 
%pg, double %z0, %x) { -; CHECK-LABEL: test_svcvtlt_f64_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvtlt z0.d, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvtlt_f64_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtlt z0.d, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvtlt_f64_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtlt z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvtlt_f64_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtlt z0.d, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtlt.f64f32( zeroinitializer, %pg, %x) ret %0 @@ -130,16 +142,22 @@ entry: } define @test_svcvtx_f32_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvtx_f32_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtx z0.s, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvtx_f32_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtx z0.s, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvtx_f32_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtx z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvtx_f32_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtx z0.s, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtx.f32f64( zeroinitializer, %pg, %x) ret %0 diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll index 7259502bf4400..e03e7ca14871a 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-fcvtzsu.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve < %s | FileCheck %s +; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 target triple = "aarch64-linux" @@ -38,16 +38,22 @@ entry: } define @test_fcvtzs_s32_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzs_s32_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzs_s32_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzs z0.s, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_fcvtzs_s32_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzs_s32_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzs z0.s, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f64( zeroinitializer, %pg, %x) ret %0 @@ -84,16 +90,22 @@ entry: } define @test_fcvtzs_s64_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzs_s64_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s -; 
CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzs_s64_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzs z0.d, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_fcvtzs_s64_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzs_s64_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzs z0.d, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f32( zeroinitializer, %pg, %x) ret %0 @@ -130,16 +142,22 @@ entry: } define @test_fcvtzs_s32_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzs_s32_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzs_s32_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzs z0.s, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_fcvtzs_s32_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzs_s32_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzs z0.s, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzs.i32f16( zeroinitializer, %pg, %x) ret %0 @@ -176,16 +194,22 @@ entry: } define @test_fcvtzs_s64_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzs_s64_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzs_s64_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzs z0.d, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_fcvtzs_s64_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzs_s64_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzs z0.d, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzs.i64f16( zeroinitializer, %pg, %x) ret %0 @@ -222,16 +246,22 @@ entry: } define @test_fcvtzu_u32_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzu_u32_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzu_u32_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzu z0.s, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_fcvtzu_u32_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzu_u32_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzu z0.s, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f64( zeroinitializer, %pg, %x) ret %0 @@ -268,16 +298,22 @@ entry: } define @test_fcvtzu_u64_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzu_u64_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzu_u64_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzu z0.d, p0/m, z1.s +; SVE-NEXT: 
ret ; ; CHECK-2p2-LABEL: test_fcvtzu_u64_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzu_u64_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzu z0.d, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f32( zeroinitializer, %pg, %x) ret %0 @@ -314,16 +350,22 @@ entry: } define @test_fcvtzu_u32_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzu_u32_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzu_u32_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzu z0.s, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_fcvtzu_u32_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzu_u32_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzu z0.s, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzu.i32f16( zeroinitializer, %pg, %x) ret %0 @@ -360,16 +402,22 @@ entry: } define @test_fcvtzu_u64_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_fcvtzu_u64_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_fcvtzu_u64_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzu z0.d, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_fcvtzu_u64_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_fcvtzu_u64_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzu z0.d, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzu.i64f16( zeroinitializer, %pg, %x) ret %0 @@ -408,16 +456,22 @@ entry: } define @test_svcvt_s16_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_s16_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fcvtzs z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_s16_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzs z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_s16_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzs z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_s16_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzs z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -455,16 +509,22 @@ entry: } define @test_svcvt_u16_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_u16_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fcvtzu z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_u16_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzu z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_u16_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzu z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: 
test_svcvt_u16_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzu z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -502,16 +562,22 @@ entry: } define @test_svcvt_s32_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_s32_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_s32_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzs z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_s32_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzs z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_s32_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzs z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -549,16 +615,22 @@ entry: } define @test_svcvt_u32_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_u32_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fcvtzu z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_u32_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzu z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_u32_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzu z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_u32_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzu z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -596,16 +668,22 @@ entry: } define @test_svcvt_s64_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_s64_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_s64_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzs z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_s64_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzs z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_s64_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzs z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -643,16 +721,22 @@ entry: } define @test_svcvt_u64_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_u64_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_u64_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fcvtzu z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_u64_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fcvtzu z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_u64_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fcvtzu z0.d, p0/m, z1.d +; 
STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
 ret <vscale x 2 x i64> %0
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll
index 23620a3419b99..79d66b0e04824 100644
--- a/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll
+++ b/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sve2 < %s | FileCheck %s
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s -check-prefixes=CHECK,SVE
 ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2
-; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s
+; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE
 ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2
 
 target triple = "aarch64-linux"
 
@@ -38,16 +38,22 @@ entry:
 }
 
 define <vscale x 8 x i16> @test_svlogb_f16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x half> %x) {
-; CHECK-LABEL: test_svlogb_f16_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.h, #0 // =0x0
-; CHECK-NEXT: flogb z0.h, p0/m, z1.h
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svlogb_f16_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: flogb z0.h, p0/m, z1.h
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svlogb_f16_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: flogb z0.h, p0/z, z1.h
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svlogb_f16_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0
+; STREAMING-SVE-NEXT: flogb z0.h, p0/m, z1.h
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.flogb.nxv8f16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
 ret <vscale x 8 x i16> %0
@@ -84,16 +90,22 @@ entry:
 }
 
 define <vscale x 4 x i32> @test_svlogb_f32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
-; CHECK-LABEL: test_svlogb_f32_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: flogb z0.s, p0/m, z1.s
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svlogb_f32_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: flogb z0.s, p0/m, z1.s
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svlogb_f32_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: flogb z0.s, p0/z, z1.s
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svlogb_f32_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0
+; STREAMING-SVE-NEXT: flogb z0.s, p0/m, z1.s
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.flogb.nxv4f32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
 ret <vscale x 4 x i32> %0
@@ -130,16 +142,22 @@ entry:
 }
 
 define <vscale x 2 x i64> @test_svlogb_f64_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
-; CHECK-LABEL: test_svlogb_f64_z:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: flogb z0.d, p0/m, z1.d
-; CHECK-NEXT: ret
+; SVE-LABEL: test_svlogb_f64_z:
+; SVE: // %bb.0: // %entry
+; SVE-NEXT: movi v0.2d, #0000000000000000
+; SVE-NEXT: flogb z0.d, p0/m, z1.d
+; SVE-NEXT: ret
 ;
 ; CHECK-2p2-LABEL: test_svlogb_f64_z:
 ; CHECK-2p2: // %bb.0: // %entry
 ; CHECK-2p2-NEXT: flogb z0.d, p0/z, z1.d
 ; CHECK-2p2-NEXT: ret
+;
+; STREAMING-SVE-LABEL: test_svlogb_f64_z:
+; STREAMING-SVE: // %bb.0: // %entry
+; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0
+; STREAMING-SVE-NEXT: flogb z0.d, p0/m, z1.d
+; STREAMING-SVE-NEXT: ret
 entry:
 %0 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.flogb.nxv2f64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
 ret <vscale x 2 x i64> %0
diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll
b/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll index c493ec2dcc95d..25252d222db26 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve < %s | FileCheck %s +; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 target triple = "aarch64-linux" @@ -39,16 +39,22 @@ entry: } define @test_svrinta_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinta_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frinta z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinta_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinta z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinta_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinta_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frinta z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -86,16 +92,22 @@ entry: } define @test_svrinta_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinta_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frinta z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinta_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinta z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinta_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinta_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frinta z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -133,16 +145,22 @@ entry: } define @test_svrinta_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinta_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frinta z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinta_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinta z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinta_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinta_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frinta z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -180,16 +198,22 @@ entry: } define @test_svrinta_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinta_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frinta z0.s, p0/m, z1.s -; 
CHECK-NEXT: ret +; SVE-LABEL: test_svrinta_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinta z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinta_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinta_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frinta z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -227,16 +251,22 @@ entry: } define @test_svrinta_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinta_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frinta z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinta_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinta z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinta_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinta_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frinta z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -274,16 +304,22 @@ entry: } define @test_svrinta_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinta_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frinta z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinta_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinta z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinta_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinta z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinta_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: frinta z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -321,16 +357,22 @@ entry: } define @test_svrinti_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinti_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frinti z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinti_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinti z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinti_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinti_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frinti z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -368,16 +410,22 @@ entry: } define @test_svrinti_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinti_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frinti z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinti_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinti z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinti_4f16_z: ; 
CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinti_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frinti z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -415,16 +463,22 @@ entry: } define @test_svrinti_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinti_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frinti z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinti_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinti z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinti_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinti_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frinti z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -462,16 +516,22 @@ entry: } define @test_svrinti_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinti_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frinti z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinti_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinti z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinti_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinti_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frinti z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -509,16 +569,22 @@ entry: } define @test_svrinti_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinti_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frinti z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinti_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinti z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinti_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinti_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frinti z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -557,16 +623,22 @@ entry: define @test_svrinti_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrinti_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frinti z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrinti_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frinti z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrinti_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frinti z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrinti_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // 
=0x0 +; STREAMING-SVE-NEXT: frinti z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -606,16 +678,22 @@ entry: define @test_svrintm_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintm_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintm z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintm_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintm z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintm_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintm_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintm z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -653,16 +731,22 @@ entry: } define @test_svrintm_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintm_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintm z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintm_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintm z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintm_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintm_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintm z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -700,16 +784,22 @@ entry: } define @test_svrintm_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintm_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintm z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintm_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintm z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintm_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintm_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintm z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -747,16 +837,22 @@ entry: } define @test_svrintm_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintm_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintm z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintm_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintm z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintm_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintm z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintm_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintm z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintm.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -794,16 +890,22 @@ entry: } define 
@test_svrintm_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintm_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintm z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintm_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintm z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintm_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintm z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintm_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintm z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintm.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -841,16 +943,22 @@ entry: } define @test_svrintm_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintm_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frintm z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintm_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintm z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintm_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintm z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintm_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: frintm z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintm.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -888,16 +996,22 @@ entry: } define @test_svrintn_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintn_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintn z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintn_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintn z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintn_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintn_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintn z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintn.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -935,16 +1049,22 @@ entry: } define @test_svrintn_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintn_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintn z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintn_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintn z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintn_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintn_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintn z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintn.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -982,16 +1102,22 @@ entry: } define @test_svrintn_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintn_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintn z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: 
test_svrintn_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintn z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintn_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintn_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintn z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintn.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -1029,16 +1155,22 @@ entry: } define @test_svrintn_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintn_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintn z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintn_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintn z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintn_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintn z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintn_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintn z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintn.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -1076,16 +1208,22 @@ entry: } define @test_svrintn_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintn_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintn z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintn_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintn z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintn_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintn z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintn_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintn z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintn.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -1123,16 +1261,22 @@ entry: } define @test_svrintn_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintn_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frintn z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintn_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintn z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintn_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintn z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintn_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: frintn z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -1170,16 +1314,22 @@ entry: } define @test_svrintp_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintp_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintp z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintp_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintp z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintp_f16_z: ; CHECK-2p2: // %bb.0: // 
%entry ; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintp_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintp z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -1217,16 +1367,22 @@ entry: } define @test_svrintp_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintp_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintp z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintp_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintp z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintp_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintp_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintp z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -1264,16 +1420,22 @@ entry: } define @test_svrintp_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintp_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintp z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintp_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintp z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintp_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintp_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintp z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -1311,16 +1473,22 @@ entry: } define @test_svrintp_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintp_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintp z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintp_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintp z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintp_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintp_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintp z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -1358,16 +1526,22 @@ entry: } define @test_svrintp_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintp_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintp z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintp_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintp z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintp_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintp_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; 
STREAMING-SVE-NEXT: frintp z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -1406,16 +1580,22 @@ entry: define @test_svrintp_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintp_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frintp z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintp_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintp z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintp_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintp z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintp_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: frintp z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -1453,16 +1633,22 @@ entry: } define @test_svrintx_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintx_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintx z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintx_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintx z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintx_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintx_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintx z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -1500,16 +1686,22 @@ entry: } define @test_svrintx_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintx_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintx z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintx_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintx z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintx_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintx_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintx z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -1547,16 +1739,22 @@ entry: } define @test_svrintx_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintx_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintx z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintx_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintx z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintx_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintx_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintx z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -1594,16 +1792,22 @@ entry: } define 
@test_svrintx_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintx_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintx z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintx_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintx z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintx_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintx_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintx z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -1641,16 +1845,22 @@ entry: } define @test_svrintx_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintx_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintx z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintx_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintx z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintx_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintx_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintx z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -1688,16 +1898,22 @@ entry: } define @test_svrintx_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintx_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frintx z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintx_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintx z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintx_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintx z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintx_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: frintx z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -1735,16 +1951,22 @@ entry: } define @test_svrintz_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintz_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintz z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintz_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintz z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintz_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintz_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintz z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -1782,16 +2004,22 @@ entry: } define @test_svrintz_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintz_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintz z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; 
SVE-LABEL: test_svrintz_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintz z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintz_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintz_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintz z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -1829,16 +2057,22 @@ entry: } define @test_svrintz_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintz_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frintz z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintz_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintz z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintz_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintz_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frintz z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -1876,16 +2110,22 @@ entry: } define @test_svrintz_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintz_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintz z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintz_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintz z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintz_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintz_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintz z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -1923,16 +2163,22 @@ entry: } define @test_svrintz_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintz_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frintz z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintz_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintz z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintz_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintz_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frintz z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -1970,16 +2216,22 @@ entry: } define @test_svrintz_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrintz_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frintz z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrintz_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frintz z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrintz_f64_z: ; CHECK-2p2: 
// %bb.0: // %entry ; CHECK-2p2-NEXT: frintz z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrintz_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: frintz z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -2017,16 +2269,22 @@ entry: } define @test_svrecpx_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrecpx_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frecpx z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrecpx_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frecpx z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrecpx_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrecpx_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frecpx z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -2064,16 +2322,22 @@ entry: } define @test_svrecpx_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrecpx_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frecpx z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrecpx_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frecpx z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrecpx_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrecpx_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frecpx z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -2111,16 +2375,22 @@ entry: } define @test_svrecpx_2f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrecpx_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: frecpx z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrecpx_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frecpx z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrecpx_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrecpx_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: frecpx z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -2158,16 +2428,22 @@ entry: } define @test_svrecpx_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrecpx_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frecpx z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrecpx_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frecpx z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrecpx_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrecpx_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // 
=0x0 +; STREAMING-SVE-NEXT: frecpx z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -2205,16 +2481,22 @@ entry: } define @test_svrecpx_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrecpx_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: frecpx z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrecpx_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frecpx z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrecpx_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrecpx_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: frecpx z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -2252,16 +2534,22 @@ entry: } define @test_svrecpx_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrecpx_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: frecpx z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrecpx_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: frecpx z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrecpx_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrecpx_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: frecpx z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( zeroinitializer, %pg, %x) ret %0 @@ -2299,16 +2587,22 @@ entry: } define @test_svsqrt_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svsqrt_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svsqrt_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fsqrt z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svsqrt_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svsqrt_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fsqrt z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -2346,16 +2640,22 @@ entry: } define @test_svsqrt_4f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svsqrt_4f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svsqrt_4f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fsqrt z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svsqrt_4f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svsqrt_4f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fsqrt z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( zeroinitializer, %pg, %x) ret %0 @@ -2393,16 +2693,22 @@ entry: } define @test_svsqrt_2f16_z( %pg, 
double %z0, %x) { -; CHECK-LABEL: test_svsqrt_2f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svsqrt_2f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fsqrt z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svsqrt_2f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svsqrt_2f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: fsqrt z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( zeroinitializer, %pg, %x) ret %0 @@ -2440,16 +2746,22 @@ entry: } define @test_svsqrt_2f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svsqrt_2f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svsqrt_2f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fsqrt z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svsqrt_2f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svsqrt_2f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fsqrt z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( zeroinitializer, %pg, %x) ret %0 @@ -2487,16 +2799,22 @@ entry: } define @test_svsqrt_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svsqrt_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svsqrt_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fsqrt z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svsqrt_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svsqrt_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: fsqrt z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -2534,16 +2852,22 @@ entry: } define @test_svsqrt_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svsqrt_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fsqrt z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svsqrt_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: fsqrt z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svsqrt_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svsqrt_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: fsqrt z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( zeroinitializer, %pg, %x) ret %0 diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-rev.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-rev.ll index d7a51c8cf8062..fb95047a41205 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-rev.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-rev.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s -check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 target triple = "aarch64-linux" @@ -38,16 +38,22 @@ entry: } define @test_svrbit_s8_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrbit_s8_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: rbit z0.b, p0/m, z1.b -; CHECK-NEXT: ret +; SVE-LABEL: test_svrbit_s8_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: rbit z0.b, p0/m, z1.b +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrbit_s8_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: rbit z0.b, p0/z, z1.b ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrbit_s8_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0 +; STREAMING-SVE-NEXT: rbit z0.b, p0/m, z1.b +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.rbit.nxv16i8( zeroinitializer, %pg, %x) ret %0 @@ -84,16 +90,22 @@ entry: } define @test_svrbit_s16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrbit_s16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: rbit z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrbit_s16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: rbit z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrbit_s16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: rbit z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrbit_s16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: rbit z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.rbit.nxv8i16( zeroinitializer, %pg, %x) ret %0 @@ -130,16 +142,22 @@ entry: } define @test_svrbit_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrbit_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: rbit z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrbit_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: rbit z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrbit_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: rbit z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrbit_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: rbit z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.rbit.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -176,16 +194,22 @@ entry: } define @test_svrbit_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrbit_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: rbit z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrbit_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: rbit z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrbit_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: rbit z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrbit_s64_z: +; STREAMING-SVE: 
// %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: rbit z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.rbit.nxv2i64( zeroinitializer, %pg, %x) ret %0 @@ -222,16 +246,22 @@ entry: } define @test_svrevb_s16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevb_s16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: revb z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevb_s16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revb z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevb_s16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revb z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevb_s16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: revb z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revb.nxv8i16( zeroinitializer, %pg, %x) ret %0 @@ -268,16 +298,22 @@ entry: } define @test_svrevb_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevb_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: revb z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevb_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revb z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevb_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revb z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevb_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: revb z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revb.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -314,16 +350,22 @@ entry: } define @test_svrevb_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevb_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: revb z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevb_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revb z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevb_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revb z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevb_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: revb z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revb.nxv2i64( zeroinitializer, %pg, %x) ret %0 @@ -360,16 +402,22 @@ entry: } define @test_svrevh_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevh_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: revh z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevh_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revh z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevh_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revh z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevh_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: revh z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revh.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -406,16 +454,22 @@ entry: } define @test_svrevh_s64_z( %pg, 
double %z0, %x) { -; CHECK-LABEL: test_svrevh_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: revh z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevh_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revh z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevh_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revh z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevh_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: revh z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revh.nxv2i64( zeroinitializer, %pg, %x) ret %0 @@ -452,16 +506,22 @@ entry: } define @test_svrevw_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevw_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: revw z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevw_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revw z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevw_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revw z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevw_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: revw z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revw.nxv2i64( zeroinitializer, %pg, %x) ret %0 @@ -498,16 +558,22 @@ entry: } define @test_svrevd_s8_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_s8_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_s8_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_s8_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_s8_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv16i8( zeroinitializer, %pg, %x) ret %0 @@ -544,16 +610,22 @@ entry: } define @test_svrevd_s16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_s16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_s16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_s16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_s16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv8i16( zeroinitializer, %pg, %x) ret %0 @@ -590,16 +662,22 @@ entry: } define @test_svrevd_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: 
revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -636,16 +714,22 @@ entry: } define @test_svrevd_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv2i64( zeroinitializer, %pg, %x) ret %0 @@ -682,16 +766,22 @@ entry: } define @test_svrevd_f16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_f16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_f16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_f16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_f16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv8f16( zeroinitializer, %pg, %x) ret %0 @@ -728,16 +818,22 @@ entry: } define @test_svrevd_bf16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_bf16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_bf16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_bf16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_bf16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv8bf16( zeroinitializer, %pg, %x) ret %0 @@ -774,16 +870,22 @@ entry: } define @test_svrevd_f32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_f32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_f32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_f32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_f32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov 
z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv4f32( zeroinitializer, %pg, %x) ret %0 @@ -820,16 +922,22 @@ entry: } define @test_svrevd_f64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrevd_f64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: revd z0.q, p0/m, z1.q -; CHECK-NEXT: ret +; SVE-LABEL: test_svrevd_f64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: revd z0.q, p0/m, z1.q +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrevd_f64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: revd z0.q, p0/z, z1.q ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrevd_f64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: revd z0.q, p0/m, z1.q +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.revd.nxv2f64( zeroinitializer, %pg, %x) ret %0 diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-urecpe-ursqrte-sqabs-sqneg.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-urecpe-ursqrte-sqabs-sqneg.ll index 787ac4458079c..50c73ae198ab2 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-urecpe-ursqrte-sqabs-sqneg.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-urecpe-ursqrte-sqabs-sqneg.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve2 < %s | FileCheck %s +; RUN: llc -mattr=+sve2 < %s | FileCheck %s -check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 -; RUN: llc -mattr=+sme --force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme --force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE ; RUN: llc -mattr=+sme2p2 --force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 target triple = "aarch64-linux" @@ -39,16 +39,22 @@ entry: } define @test_svrecpe_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrecpe_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: urecpe z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrecpe_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: urecpe z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrecpe_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: urecpe z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrecpe_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: urecpe z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.urecpe.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -86,16 +92,22 @@ entry: } define @test_svrsqrte_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svrsqrte_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: ursqrte z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svrsqrte_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ursqrte z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svrsqrte_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ursqrte z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svrsqrte_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: ursqrte z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ursqrte.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ 
-133,16 +145,22 @@ entry: } define @test_svqabs_s8_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqabs_s8_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: sqabs z0.b, p0/m, z1.b -; CHECK-NEXT: ret +; SVE-LABEL: test_svqabs_s8_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqabs z0.b, p0/m, z1.b +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqabs_s8_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqabs z0.b, p0/z, z1.b ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqabs_s8_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0 +; STREAMING-SVE-NEXT: sqabs z0.b, p0/m, z1.b +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqabs.nxv16i8( zeroinitializer, %pg, %x) ret %0 @@ -180,16 +198,22 @@ entry: } define @test_svqabs_s16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqabs_s16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: sqabs z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svqabs_s16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqabs z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqabs_s16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqabs z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqabs_s16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: sqabs z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqabs.nxv8i16( zeroinitializer, %pg, %x) ret %0 @@ -227,16 +251,22 @@ entry: } define @test_svqabs_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqabs_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: sqabs z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svqabs_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqabs z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqabs_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqabs z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqabs_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: sqabs z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqabs.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -274,16 +304,22 @@ entry: } define @test_svqabs_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqabs_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: sqabs z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svqabs_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqabs z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqabs_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqabs z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqabs_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: sqabs z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqabs.nxv2i64( zeroinitializer, %pg, %x) ret %0 @@ -321,16 +357,22 @@ entry: } define @test_svqneg_s8_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqneg_s8_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: sqneg z0.b, p0/m, z1.b -; CHECK-NEXT: ret +; SVE-LABEL: test_svqneg_s8_z: +; SVE: 
// %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqneg z0.b, p0/m, z1.b +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqneg_s8_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqneg z0.b, p0/z, z1.b ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqneg_s8_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.b, #0 // =0x0 +; STREAMING-SVE-NEXT: sqneg z0.b, p0/m, z1.b +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqneg.nxv16i8( zeroinitializer, %pg, %x) ret %0 @@ -368,16 +410,22 @@ entry: } define @test_svqneg_s16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqneg_s16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: sqneg z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svqneg_s16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqneg z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqneg_s16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqneg z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqneg_s16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: sqneg z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqneg.nxv8i16( zeroinitializer, %pg, %x) ret %0 @@ -415,16 +463,22 @@ entry: } define @test_svqneg_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqneg_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: sqneg z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svqneg_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqneg z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqneg_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqneg z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqneg_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: sqneg z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqneg.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -462,16 +516,22 @@ entry: } define @test_svqneg_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svqneg_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: sqneg z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svqneg_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: sqneg z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svqneg_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: sqneg z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svqneg_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: sqneg z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.sqneg.nxv2i64( zeroinitializer, %pg, %x) ret %0 diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-uscvtf.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-uscvtf.ll index fd0126f3166dd..e5447071cbfd2 100644 --- a/llvm/test/CodeGen/AArch64/zeroing-forms-uscvtf.ll +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-uscvtf.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve < %s | FileCheck %s +; RUN: llc -mattr=+sve < %s | FileCheck %s -check-prefixes=CHECK,SVE ; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s 
-check-prefix CHECK-2p2 -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,STREAMING-SVE ; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 target triple = "aarch64-linux" @@ -38,16 +38,22 @@ entry: } define @test_scvtf_f32_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_scvtf_f32_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: scvtf z0.s, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_scvtf_f32_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: scvtf z0.s, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_scvtf_f32_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: scvtf z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_scvtf_f32_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: scvtf z0.s, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.scvtf.f32i64( zeroinitializer, %pg, %x) ret %0 @@ -84,16 +90,22 @@ entry: } define @test_scvtf_f64_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_scvtf_f64_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: scvtf z0.d, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_scvtf_f64_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: scvtf z0.d, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_scvtf_f64_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: scvtf z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_scvtf_f64_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: scvtf z0.d, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.scvtf.f64i32( zeroinitializer, %pg, %x) ret %0 @@ -130,16 +142,22 @@ entry: } define @test_scvtf_f16_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_scvtf_f16_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: scvtf z0.h, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_scvtf_f16_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: scvtf z0.h, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_scvtf_f16_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: scvtf z0.h, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_scvtf_f16_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: scvtf z0.h, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.scvtf.f16i32( zeroinitializer, %pg, %x) ret %0 @@ -176,16 +194,22 @@ entry: } define @test_scvtf_f16_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_scvtf_f16_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: scvtf z0.h, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_scvtf_f16_s64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: scvtf z0.h, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_scvtf_f16_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: scvtf z0.h, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_scvtf_f16_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: scvtf z0.h, p0/m, z1.d +; STREAMING-SVE-NEXT: ret 
entry: %0 = tail call @llvm.aarch64.sve.scvtf.f16i64( zeroinitializer, %pg, %x) ret %0 @@ -222,16 +246,22 @@ entry: } define @test_ucvtf_f32_u64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_ucvtf_f32_u64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: ucvtf z0.s, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_ucvtf_f32_u64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ucvtf z0.s, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_ucvtf_f32_u64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ucvtf z0.s, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_ucvtf_f32_u64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: ucvtf z0.s, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ucvtf.f32i64( zeroinitializer, %pg, %x) ret %0 @@ -268,16 +298,22 @@ entry: } define @test_ucvtf_f64_u32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_ucvtf_f64_u32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: ucvtf z0.d, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_ucvtf_f64_u32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ucvtf z0.d, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_ucvtf_f64_u32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ucvtf z0.d, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_ucvtf_f64_u32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: ucvtf z0.d, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ucvtf.f64i32( zeroinitializer, %pg, %x) ret %0 @@ -314,16 +350,22 @@ entry: } define @test_ucvtf_f16_u32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_ucvtf_f16_u32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: ucvtf z0.h, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_ucvtf_f16_u32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ucvtf z0.h, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_ucvtf_f16_u32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ucvtf z0.h, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_ucvtf_f16_u32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: ucvtf z0.h, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ucvtf.f16i32( zeroinitializer, %pg, %x) ret %0 @@ -360,16 +402,22 @@ entry: } define @test_ucvtf_f16_u64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_ucvtf_f16_u64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: ucvtf z0.h, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_ucvtf_f16_u64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ucvtf z0.h, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_ucvtf_f16_u64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ucvtf z0.h, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_ucvtf_f16_u64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: ucvtf z0.h, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ucvtf.f16i64( zeroinitializer, %pg, %x) ret %0 @@ -407,16 +455,22 @@ entry: } define @test_svcvt_f16_s16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f16_s16_z: -; 
CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: scvtf z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f16_s16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: scvtf z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f16_s16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: scvtf z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f16_s16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: scvtf z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16( zeroinitializer, %pg, %x) ret %0 @@ -454,16 +508,22 @@ entry: } define @test_svcvt_f16_u16_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f16_u16_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.h, #0 // =0x0 -; CHECK-NEXT: ucvtf z0.h, p0/m, z1.h -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f16_u16_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ucvtf z0.h, p0/m, z1.h +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f16_u16_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ucvtf z0.h, p0/z, z1.h ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f16_u16_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.h, #0 // =0x0 +; STREAMING-SVE-NEXT: ucvtf z0.h, p0/m, z1.h +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16( zeroinitializer, %pg, %x) ret %0 @@ -501,16 +561,22 @@ entry: } define @test_svcvt_f32_s32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f32_s32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: scvtf z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f32_s32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: scvtf z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f32_s32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: scvtf z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f32_s32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: scvtf z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.scvtf.nxv4f32.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -548,16 +614,22 @@ entry: } define @test_svcvt_f32_u32_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f32_u32_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.s, #0 // =0x0 -; CHECK-NEXT: ucvtf z0.s, p0/m, z1.s -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f32_u32_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ucvtf z0.s, p0/m, z1.s +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f32_u32_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ucvtf z0.s, p0/z, z1.s ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f32_u32_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.s, #0 // =0x0 +; STREAMING-SVE-NEXT: ucvtf z0.s, p0/m, z1.s +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ucvtf.nxv4f32.nxv4i32( zeroinitializer, %pg, %x) ret %0 @@ -595,16 +667,22 @@ entry: } define @test_svcvt_f64_s64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f64_s64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: scvtf z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f64_s64_z: +; SVE: // %bb.0: // 
%entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: scvtf z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f64_s64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: scvtf z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f64_s64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: scvtf z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.scvtf.nxv2f64.nxv2i64( zeroinitializer, %pg, %x) ret %0 @@ -642,16 +720,22 @@ entry: } define @test_svcvt_f64_u64_z( %pg, double %z0, %x) { -; CHECK-LABEL: test_svcvt_f64_u64_z: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d -; CHECK-NEXT: ret +; SVE-LABEL: test_svcvt_f64_u64_z: +; SVE: // %bb.0: // %entry +; SVE-NEXT: movi v0.2d, #0000000000000000 +; SVE-NEXT: ucvtf z0.d, p0/m, z1.d +; SVE-NEXT: ret ; ; CHECK-2p2-LABEL: test_svcvt_f64_u64_z: ; CHECK-2p2: // %bb.0: // %entry ; CHECK-2p2-NEXT: ucvtf z0.d, p0/z, z1.d ; CHECK-2p2-NEXT: ret +; +; STREAMING-SVE-LABEL: test_svcvt_f64_u64_z: +; STREAMING-SVE: // %bb.0: // %entry +; STREAMING-SVE-NEXT: mov z0.d, #0 // =0x0 +; STREAMING-SVE-NEXT: ucvtf z0.d, p0/m, z1.d +; STREAMING-SVE-NEXT: ret entry: %0 = tail call @llvm.aarch64.sve.ucvtf.nxv2f64.nxv2i64( zeroinitializer, %pg, %x) ret %0 From 52f3cad9ffa35a472699d541736bd72dd01d6e62 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Apr 2025 13:17:14 +0100 Subject: [PATCH 0515/1029] [X86] getFauxShuffleMask - move INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)) matching behind common one use bitcast checks (#134227) No need to ignore one use checks for the INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)) fold Noticed while working on the #133947 regressions --- llvm/lib/Target/X86/X86ISelLowering.cpp | 41 +++++++++++++------------ 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 34716929f61f1..8271d9c486650 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6185,18 +6185,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } if (!N->isOnlyUserOf(Sub.getNode())) return false; - SDValue SubBC = peekThroughBitcasts(Sub); + + SmallVector SubMask; + SmallVector SubInputs; + SDValue SubSrc = peekThroughOneUseBitcasts(Sub); + EVT SubSrcVT = SubSrc.getValueType(); + if (!SubSrcVT.isVector()) + return false; + // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)). 
- if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR && - SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) { - uint64_t ExtractIdx = SubBC.getConstantOperandVal(1); - SDValue SubBCSrc = SubBC.getOperand(0); - unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements(); - unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts); - assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 && + if (SubSrc.getOpcode() == ISD::EXTRACT_SUBVECTOR && + SubSrc.getOperand(0).getValueSizeInBits() == NumSizeInBits) { + uint64_t ExtractIdx = SubSrc.getConstantOperandVal(1); + SDValue SubSrcSrc = SubSrc.getOperand(0); + unsigned NumSubSrcSrcElts = + SubSrcSrc.getValueType().getVectorNumElements(); + unsigned MaxElts = std::max(NumElts, NumSubSrcSrcElts); + assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcSrcElts) == 0 && "Subvector valuetype mismatch"); InsertIdx *= (MaxElts / NumElts); - ExtractIdx *= (MaxElts / NumSubSrcBCElts); + ExtractIdx *= (MaxElts / NumSubSrcSrcElts); NumSubElts *= (MaxElts / NumElts); bool SrcIsUndef = Src.isUndef(); for (int i = 0; i != (int)MaxElts; ++i) @@ -6205,17 +6213,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i; if (!SrcIsUndef) Ops.push_back(Src); - Ops.push_back(SubBCSrc); + Ops.push_back(SubSrcSrc); return true; } - // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). - SmallVector SubMask; - SmallVector SubInputs; - SDValue SubSrc = peekThroughOneUseBitcasts(Sub); - EVT SubSrcVT = SubSrc.getValueType(); - if (!SubSrcVT.isVector()) - return false; + // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)). APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements()); if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG, Depth + 1, ResolveKnownElts)) @@ -6230,10 +6232,11 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, if (SubMask.size() != NumSubElts) { assert(((SubMask.size() % NumSubElts) == 0 || - (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale"); + (NumSubElts % SubMask.size()) == 0) && + "Illegal submask scale"); if ((NumSubElts % SubMask.size()) == 0) { int Scale = NumSubElts / SubMask.size(); - SmallVector ScaledSubMask; + SmallVector ScaledSubMask; narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask); SubMask = ScaledSubMask; } else { From 2a9948f0384d30fa07522f29ddb2de62560d75a4 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Thu, 3 Apr 2025 13:47:59 +0100 Subject: [PATCH 0516/1029] Revert "[CLANG-CL] ignores wpadded" (#134239) Reverts llvm/llvm-project#130182 This is causing failures on RISC-V and ppc builders as mentioned on https://github.com/llvm/llvm-project/pull/130182#issuecomment-2775516899 Reverting so the issue can be fixed by the patch author without time pressure (as noted in that PR, it seems a value is uninitialised).
--- clang/docs/ReleaseNotes.rst | 2 - clang/lib/AST/RecordLayoutBuilder.cpp | 65 ++++--------------- .../test/SemaCXX/windows-Wpadded-bitfield.cpp | 32 --------- clang/test/SemaCXX/windows-Wpadded.cpp | 40 ------------ 4 files changed, 11 insertions(+), 128 deletions(-) delete mode 100644 clang/test/SemaCXX/windows-Wpadded-bitfield.cpp delete mode 100644 clang/test/SemaCXX/windows-Wpadded.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 47f9c3caa0e47..fdf9a246d6373 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -190,8 +190,6 @@ Modified Compiler Flags - The compiler flag `-fbracket-depth` default value is increased from 256 to 2048. (#GH94728) -- `-Wpadded` option implemented for the `x86_64-windows-msvc` target. Fixes #61702 - Removed Compiler Flags ------------------------- diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp index 41e7198cb7581..3e756ab9b9bfe 100644 --- a/clang/lib/AST/RecordLayoutBuilder.cpp +++ b/clang/lib/AST/RecordLayoutBuilder.cpp @@ -2274,9 +2274,9 @@ static unsigned getPaddingDiagFromTagKind(TagTypeKind Tag) { } } -static void CheckFieldPadding(const ASTContext &Context, bool IsUnion, - uint64_t Offset, uint64_t UnpaddedOffset, - const FieldDecl *D) { +void ItaniumRecordLayoutBuilder::CheckFieldPadding( + uint64_t Offset, uint64_t UnpaddedOffset, uint64_t UnpackedOffset, + unsigned UnpackedAlign, bool isPacked, const FieldDecl *D) { // We let objc ivars without warning, objc interfaces generally are not used // for padding tricks. if (isa(D)) @@ -2300,8 +2300,7 @@ static void CheckFieldPadding(const ASTContext &Context, bool IsUnion, if (D->getIdentifier()) { auto Diagnostic = D->isBitField() ? diag::warn_padded_struct_bitfield : diag::warn_padded_struct_field; - Context.getDiagnostics().Report(D->getLocation(), - Diagnostic) + Diag(D->getLocation(), Diagnostic) << getPaddingDiagFromTagKind(D->getParent()->getTagKind()) << Context.getTypeDeclType(D->getParent()) << PadSize << (InBits ? 1 : 0) // (byte|bit) @@ -2309,22 +2308,15 @@ static void CheckFieldPadding(const ASTContext &Context, bool IsUnion, } else { auto Diagnostic = D->isBitField() ? diag::warn_padded_struct_anon_bitfield : diag::warn_padded_struct_anon_field; - Context.getDiagnostics().Report(D->getLocation(), - Diagnostic) + Diag(D->getLocation(), Diagnostic) << getPaddingDiagFromTagKind(D->getParent()->getTagKind()) << Context.getTypeDeclType(D->getParent()) << PadSize << (InBits ? 1 : 0); // (byte|bit) } - } -} - -void ItaniumRecordLayoutBuilder::CheckFieldPadding( - uint64_t Offset, uint64_t UnpaddedOffset, uint64_t UnpackedOffset, - unsigned UnpackedAlign, bool isPacked, const FieldDecl *D) { - ::CheckFieldPadding(Context, IsUnion, Offset, UnpaddedOffset, D); - if (isPacked && Offset != UnpackedOffset) { - HasPackedField = true; - } + } + if (isPacked && Offset != UnpackedOffset) { + HasPackedField = true; + } } static const CXXMethodDecl *computeKeyFunction(ASTContext &Context, @@ -2650,6 +2642,8 @@ struct MicrosoftRecordLayoutBuilder { /// virtual base classes and their offsets in the record. ASTRecordLayout::VBaseOffsetsMapTy VBases; /// The number of remaining bits in our last bitfield allocation. + /// This value isn't meaningful unless LastFieldIsNonZeroWidthBitfield is + /// true. 
unsigned RemainingBitsInField; bool IsUnion : 1; /// True if the last field laid out was a bitfield and was not 0 @@ -3010,15 +3004,6 @@ void MicrosoftRecordLayoutBuilder::layoutField(const FieldDecl *FD) { } else { FieldOffset = Size.alignTo(Info.Alignment); } - - uint64_t UnpaddedFielddOffsetInBits = - Context.toBits(DataSize) - RemainingBitsInField; - - ::CheckFieldPadding(Context, IsUnion, Context.toBits(FieldOffset), - UnpaddedFielddOffsetInBits, FD); - - RemainingBitsInField = 0; - placeFieldAtOffset(FieldOffset); if (!IsOverlappingEmptyField) @@ -3064,14 +3049,10 @@ void MicrosoftRecordLayoutBuilder::layoutBitField(const FieldDecl *FD) { } else { // Allocate a new block of memory and place the bitfield in it. CharUnits FieldOffset = Size.alignTo(Info.Alignment); - uint64_t UnpaddedFieldOffsetInBits = - Context.toBits(DataSize) - RemainingBitsInField; placeFieldAtOffset(FieldOffset); Size = FieldOffset + Info.Size; Alignment = std::max(Alignment, Info.Alignment); RemainingBitsInField = Context.toBits(Info.Size) - Width; - ::CheckFieldPadding(Context, IsUnion, Context.toBits(FieldOffset), - UnpaddedFieldOffsetInBits, FD); } DataSize = Size; } @@ -3095,14 +3076,9 @@ MicrosoftRecordLayoutBuilder::layoutZeroWidthBitField(const FieldDecl *FD) { } else { // Round up the current record size to the field's alignment boundary. CharUnits FieldOffset = Size.alignTo(Info.Alignment); - uint64_t UnpaddedFieldOffsetInBits = - Context.toBits(DataSize) - RemainingBitsInField; placeFieldAtOffset(FieldOffset); - RemainingBitsInField = 0; Size = FieldOffset; Alignment = std::max(Alignment, Info.Alignment); - ::CheckFieldPadding(Context, IsUnion, Context.toBits(FieldOffset), - UnpaddedFieldOffsetInBits, FD); } DataSize = Size; } @@ -3227,9 +3203,6 @@ void MicrosoftRecordLayoutBuilder::layoutVirtualBases(const CXXRecordDecl *RD) { } void MicrosoftRecordLayoutBuilder::finalizeLayout(const RecordDecl *RD) { - uint64_t UnpaddedSizeInBits = Context.toBits(DataSize); - UnpaddedSizeInBits -= RemainingBitsInField; - // Respect required alignment. Note that in 32-bit mode Required alignment // may be 0 and cause size not to be updated. DataSize = Size; @@ -3258,22 +3231,6 @@ void MicrosoftRecordLayoutBuilder::finalizeLayout(const RecordDecl *RD) { Size = Context.toCharUnitsFromBits(External.Size); if (External.Align) Alignment = Context.toCharUnitsFromBits(External.Align); - return; - } - unsigned CharBitNum = Context.getTargetInfo().getCharWidth(); - uint64_t SizeInBits = Context.toBits(Size); - if (SizeInBits > UnpaddedSizeInBits) { - unsigned int PadSize = SizeInBits - UnpaddedSizeInBits; - bool InBits = true; - if (PadSize % CharBitNum == 0) { - PadSize = PadSize / CharBitNum; - InBits = false; - } - - Context.getDiagnostics().Report(RD->getLocation(), - diag::warn_padded_struct_size) - << Context.getTypeDeclType(RD) << PadSize - << (InBits ? 
1 : 0); // (byte|bit) } } diff --git a/clang/test/SemaCXX/windows-Wpadded-bitfield.cpp b/clang/test/SemaCXX/windows-Wpadded-bitfield.cpp deleted file mode 100644 index ee5a57124eca5..0000000000000 --- a/clang/test/SemaCXX/windows-Wpadded-bitfield.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-windows-msvc -fsyntax-only -verify -Wpadded %s - -struct __attribute__((ms_struct)) BitfieldStruct { // expected-warning {{padding size of 'BitfieldStruct' with 3 bytes to alignment boundary}} - char c : 1; - int : 0; // expected-warning {{padding struct 'BitfieldStruct' with 31 bits to align anonymous bit-field}} - char i; -}; - -struct __attribute__((ms_struct)) SevenBitfieldStruct { // expected-warning {{padding size of 'SevenBitfieldStruct' with 3 bytes to alignment boundary}} - char c : 7; - int : 0; // expected-warning {{padding struct 'SevenBitfieldStruct' with 25 bits to align anonymous bit-field}} - char i; -}; - -struct __attribute__((ms_struct)) SameUnitSizeBitfield { - char c : 7; - char : 1; // Same unit size attributes fall in the same unit + they fill the unit -> no padding - char i; -}; - -struct __attribute__((ms_struct)) DifferentUnitSizeBitfield { // expected-warning {{padding size of 'DifferentUnitSizeBitfield' with 3 bytes to alignment boundary}} - char c : 7; - int : 1; // expected-warning {{padding struct 'DifferentUnitSizeBitfield' with 25 bits to align anonymous bit-field}} - char i; // expected-warning {{padding struct 'DifferentUnitSizeBitfield' with 31 bits to align 'i'}} -}; - -int main() { - BitfieldStruct b; - SevenBitfieldStruct s; - SameUnitSizeBitfield su; - DifferentUnitSizeBitfield du; -} diff --git a/clang/test/SemaCXX/windows-Wpadded.cpp b/clang/test/SemaCXX/windows-Wpadded.cpp deleted file mode 100644 index da3f2bf08c6b8..0000000000000 --- a/clang/test/SemaCXX/windows-Wpadded.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: %clang_cc1 -triple x86_64-windows-msvc -fsyntax-only -verify -Wpadded %s -// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify -Wpadded %s - -struct __attribute__((ms_struct)) Foo { // expected-warning {{padding size of 'Foo' with 3 bytes to alignment boundary}} - int b : 1; - char a; // expected-warning {{padding struct 'Foo' with 31 bits to align 'a'}} -}; - -struct __attribute__((ms_struct)) AlignedStruct { // expected-warning {{padding size of 'AlignedStruct' with 4 bytes to alignment boundary}} - char c; - alignas(8) int i; // expected-warning {{padding struct 'AlignedStruct' with 7 bytes to align 'i'}} -}; - - -struct Base { - int b; -}; - -struct Derived : public Base { // expected-warning {{padding size of 'Derived' with 3 bytes to alignment boundary}} - char c; -}; - -union __attribute__((ms_struct)) Union { - char c; - long long u; -}; - -struct __attribute__((ms_struct)) StructWithUnion { // expected-warning {{padding size of 'StructWithUnion' with 6 bytes to alignment boundary}} - char c; - int : 0; - Union t; // expected-warning {{padding struct 'StructWithUnion' with 7 bytes to align 't'}} - short i; -}; - -int main() { - Foo f; - AlignedStruct a; - Derived d; - StructWithUnion swu; -} From ebacd46996a7f041be73cf31b5776503e8061e8b Mon Sep 17 00:00:00 2001 From: Koakuma Date: Thu, 3 Apr 2025 19:55:18 +0700 Subject: [PATCH 0517/1029] [SPARC][MC] Add tests for VIS family instructions Also fix up any mistakes/typos in instruction definitions. 
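Assuming a built llvm-mc, any single encoding below can be spot-checked from a shell in both directions, mirroring the RUN lines of the new tests (output trimmed to the relevant line):

  $ echo 'fpadd16 %f0, %f2, %f4' | llvm-mc -triple=sparcv9 -mattr=+vis -show-encoding
          fpadd16 %f0, %f2, %f4          ! encoding: [0x89,0xb0,0x0a,0x02]
  $ echo '0x89,0xb0,0x0a,0x02' | llvm-mc --disassemble -triple=sparcv9-unknown-linux -mattr=+vis
          fpadd16 %f0, %f2, %f4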
Reviewers: rorth, s-barannikov, brad0, MaskRay Reviewed By: s-barannikov Pull Request: https://github.com/llvm/llvm-project/pull/130967 --- llvm/lib/Target/Sparc/SparcInstrFormats.td | 15 + llvm/lib/Target/Sparc/SparcInstrInfo.td | 3 + llvm/lib/Target/Sparc/SparcInstrVIS.td | 113 ++++--- llvm/test/MC/Disassembler/Sparc/sparc-vis.txt | 291 ++++++++++++++++++ llvm/test/MC/Sparc/sparc-vis.s | 239 +++++++++++++- llvm/test/MC/Sparc/sparc-vis2.s | 55 ++++ llvm/test/MC/Sparc/sparc-vis3.s | 133 ++++++++ 7 files changed, 798 insertions(+), 51 deletions(-) create mode 100644 llvm/test/MC/Disassembler/Sparc/sparc-vis.txt create mode 100644 llvm/test/MC/Sparc/sparc-vis2.s create mode 100644 llvm/test/MC/Sparc/sparc-vis3.s diff --git a/llvm/lib/Target/Sparc/SparcInstrFormats.td b/llvm/lib/Target/Sparc/SparcInstrFormats.td index 3939f4ed94276..4ff902b190a3b 100644 --- a/llvm/lib/Target/Sparc/SparcInstrFormats.td +++ b/llvm/lib/Target/Sparc/SparcInstrFormats.td @@ -201,6 +201,21 @@ class F3_3c opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins, let Inst{4-0} = rs2; } +// SIAM instruction +class F3_3_siam opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins, + string asmstr, list pattern, InstrItinClass itin = NoItinerary> + : F3 { + bits<3> mode; + + let op = opVal; + let op3 = op3val; + let rd = 0; + let rs1 = 0; + let Inst{13-5} = opfval; // fp opcode + let Inst{4-3} = 0; + let Inst{2-0} = mode; +} + // Shift by register rs2. class F3_Sr opVal, bits<6> op3val, bit xVal, dag outs, dag ins, string asmstr, list pattern, InstrItinClass itin = IIC_iu_instr> diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index c3b1fdf14d73e..d5af2000d0481 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -82,6 +82,9 @@ def UseDeprecatedInsts : Predicate<"Subtarget->useV8DeprecatedInsts()">; // Instruction Pattern Stuff //===----------------------------------------------------------------------===// +// FIXME these should have AsmOperandClass. +def uimm3 : PatLeaf<(imm), [{ return isUInt<3>(N->getZExtValue()); }]>; + def simm10 : PatLeaf<(imm), [{ return isInt<10>(N->getSExtValue()); }]>; def simm11 : PatLeaf<(imm), [{ return isInt<11>(N->getSExtValue()); }]>; diff --git a/llvm/lib/Target/Sparc/SparcInstrVIS.td b/llvm/lib/Target/Sparc/SparcInstrVIS.td index bdefc70869d74..8ce8f37f34040 100644 --- a/llvm/lib/Target/Sparc/SparcInstrVIS.td +++ b/llvm/lib/Target/Sparc/SparcInstrVIS.td @@ -7,76 +7,91 @@ //===----------------------------------------------------------------------===// // // This file contains instruction formats, definitions and patterns needed for -// VIS, VIS II, VIS II instructions on SPARC. +// VIS, VIS II, VIS III instructions on SPARC. //===----------------------------------------------------------------------===// // VIS Instruction Format. -class VISInstFormat opfval, dag outs, dag ins, string asmstr, - list pattern> - : F3_3<0b10, 0b110110, opfval, outs, ins, asmstr, pattern>; +class VISInstFormat opfval, dag outs, dag ins, string asmstr> + : F3_3<0b10, 0b110110, opfval, outs, ins, asmstr, []>; class VISInst opfval, string OpcStr, RegisterClass RC = DFPRegs> : VISInstFormat; + !strconcat(OpcStr, " $rs1, $rs2, $rd")>; // VIS Instruction with integer destination register. class VISInstID opfval, string OpcStr> : VISInstFormat; + !strconcat(OpcStr, " $rs1, $rs2, $rd")>; // For VIS Instructions with no operand. 
let rd = 0, rs1 = 0, rs2 = 0 in class VISInst0 opfval, string asmstr> - : VISInstFormat; + : VISInstFormat; // For VIS Instructions with only rs1, rd operands. let rs2 = 0 in class VISInst1 opfval, string OpcStr, RegisterClass RC = DFPRegs> : VISInstFormat; + !strconcat(OpcStr, " $rs1, $rd")>; // For VIS Instructions with only rs2, rd operands. let rs1 = 0 in class VISInst2 opfval, string OpcStr, RegisterClass RC = DFPRegs> : VISInstFormat; + !strconcat(OpcStr, " $rs2, $rd")>; // For VIS Instructions with only rd operand. let Constraints = "$rd = $f", rs1 = 0, rs2 = 0 in class VISInstD opfval, string OpcStr, RegisterClass RC = DFPRegs> : VISInstFormat; + !strconcat(OpcStr, " $rd")>; // VIS 1 Instructions let Predicates = [HasVIS] in { def FPADD16 : VISInst<0b001010000, "fpadd16">; -def FPADD16S : VISInst<0b001010001, "fpadd16s">; +def FPADD16S : VISInst<0b001010001, "fpadd16s", FPRegs>; def FPADD32 : VISInst<0b001010010, "fpadd32">; -def FPADD32S : VISInst<0b001010011, "fpadd32s">; +def FPADD32S : VISInst<0b001010011, "fpadd32s", FPRegs>; def FPSUB16 : VISInst<0b001010100, "fpsub16">; -def FPSUB16S : VISInst<0b001010101, "fpsub16S">; +def FPSUB16S : VISInst<0b001010101, "fpsub16s", FPRegs>; def FPSUB32 : VISInst<0b001010110, "fpsub32">; -def FPSUB32S : VISInst<0b001010111, "fpsub32S">; +def FPSUB32S : VISInst<0b001010111, "fpsub32s", FPRegs>; def FPACK16 : VISInst2<0b000111011, "fpack16">; def FPACK32 : VISInst <0b000111010, "fpack32">; -def FPACKFIX : VISInst2<0b000111101, "fpackfix">; -def FEXPAND : VISInst2<0b001001101, "fexpand">; -def FPMERGE : VISInst <0b001001011, "fpmerge">; - -def FMUL8X16 : VISInst<0b000110001, "fmul8x16">; -def FMUL8X16AU : VISInst<0b000110011, "fmul8x16au">; -def FMUL8X16AL : VISInst<0b000110101, "fmul8x16al">; +let rs1 = 0 in +def FPACKFIX : VISInstFormat<0b000111101, + (outs FPRegs:$rd), (ins DFPRegs:$rs2), "fpackfix $rs2, $rd">; +let rs1 = 0 in +def FEXPAND : VISInstFormat<0b001001101, + (outs DFPRegs:$rd), (ins FPRegs:$rs2), "fexpand $rs2, $rd">; +def FPMERGE : VISInstFormat<0b001001011, + (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fpmerge $rs1, $rs2, $rd">; + +def FMUL8X16 : VISInstFormat<0b000110001, + (outs DFPRegs:$rd), (ins FPRegs:$rs1, DFPRegs:$rs2), + "fmul8x16 $rs1, $rs2, $rd">; +def FMUL8X16AU : VISInstFormat<0b000110011, + (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fmul8x16au $rs1, $rs2, $rd">; +def FMUL8X16AL : VISInstFormat<0b000110101, + (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fmul8x16al $rs1, $rs2, $rd">; def FMUL8SUX16 : VISInst<0b000110110, "fmul8sux16">; def FMUL8ULX16 : VISInst<0b000110111, "fmul8ulx16">; -def FMULD8SUX16 : VISInst<0b000111000, "fmuld8sux16">; -def FMULD8ULX16 : VISInst<0b000111001, "fmuld8ulx16">; +def FMULD8SUX16 : VISInstFormat<0b000111000, + (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fmuld8sux16 $rs1, $rs2, $rd">; +def FMULD8ULX16 : VISInstFormat<0b000111001, + (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fmuld8ulx16 $rs1, $rs2, $rd">; def ALIGNADDR : VISInst<0b000011000, "alignaddr", I64Regs>; def ALIGNADDRL : VISInst<0b000011010, "alignaddrl", I64Regs>; @@ -148,9 +163,11 @@ def SHUTDOWN : VISInst0<0b010000000, "shutdown">; let Predicates = [HasVIS2] in { def BMASK : VISInst<0b000011001, "bmask", I64Regs>; -def BSHUFFLE : VISInst<0b000011100, "bshuffle">; +def BSHUFFLE : VISInst<0b001001100, "bshuffle">; -def SIAM : VISInst0<0b010000001, "siam">; +let rd = 0, rs1 = 0 in +def SIAM : F3_3_siam<0b10, 0b110110, 0b010000001, (outs), + (ins i32imm:$mode), "siam 
$mode", []>; def EDGE8N : VISInst<0b000000001, "edge8n", I64Regs>; def EDGE8LN : VISInst<0b000000011, "edge8ln", I64Regs>; @@ -172,59 +189,59 @@ def ADDXCCC : VISInst<0b000010011, "addxccc", I64Regs>; let rd = 0, rs1 = 0 in { def CMASK8 : VISInstFormat<0b000011011, (outs), (ins I64Regs:$rs2), - "cmask8 $rs2", []>; + "cmask8 $rs2">; def CMASK16 : VISInstFormat<0b000011101, (outs), (ins I64Regs:$rs2), - "cmask16 $rs2", []>; + "cmask16 $rs2">; def CMASK32 : VISInstFormat<0b000011111, (outs), (ins I64Regs:$rs2), - "cmask32 $rs2", []>; + "cmask32 $rs2">; } def FCHKSM16 : VISInst<0b001000100, "fchksm16">; def FHADDS : F3_3<0b10, 0b110100, 0b001100001, - (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), "fhadds $rs1, $rs2, $rd", []>; def FHADDD : F3_3<0b10, 0b110100, 0b001100010, (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), "fhaddd $rs1, $rs2, $rd", []>; def FHSUBS : F3_3<0b10, 0b110100, 0b001100101, - (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), "fhsubs $rs1, $rs2, $rd", []>; def FHSUBD : F3_3<0b10, 0b110100, 0b001100110, (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), "fhsubd $rs1, $rs2, $rd", []>; def FLCMPS : VISInstFormat<0b101010001, (outs FCCRegs:$rd), - (ins DFPRegs:$rs1, DFPRegs:$rs2), - "flcmps $rd, $rs1, $rs2", []>; + (ins FPRegs:$rs1, FPRegs:$rs2), + "flcmps $rd, $rs1, $rs2">; def FLCMPD : VISInstFormat<0b101010010, (outs FCCRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), - "flcmpd $rd, $rs1, $rs2", []>; + "flcmpd $rd, $rs1, $rs2">; def FMEAN16 : VISInst<0b001000000, "fmean16">; def FNADDS : F3_3<0b10, 0b110100, 0b001010001, - (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), "fnadds $rs1, $rs2, $rd", []>; def FNADDD : F3_3<0b10, 0b110100, 0b001010010, (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), "fnaddd $rs1, $rs2, $rd", []>; def FNHADDS : F3_3<0b10, 0b110100, 0b001110001, - (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), "fnhadds $rs1, $rs2, $rd", []>; def FNHADDD : F3_3<0b10, 0b110100, 0b001110010, (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), "fnhaddd $rs1, $rs2, $rd", []>; def FNMULS : F3_3<0b10, 0b110100, 0b001011001, - (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), - "fnhadds $rs1, $rs2, $rd", []>; + (outs FPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fnmuls $rs1, $rs2, $rd", []>; def FNMULD : F3_3<0b10, 0b110100, 0b001011010, (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), - "fnhaddd $rs1, $rs2, $rd", []>; + "fnmuld $rs1, $rs2, $rd", []>; def FNSMULD : F3_3<0b10, 0b110100, 0b001111001, - (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), - "fnhadds $rs1, $rs2, $rd", []>; + (outs DFPRegs:$rd), (ins FPRegs:$rs1, FPRegs:$rs2), + "fnsmuld $rs1, $rs2, $rd", []>; def FPADD64 : VISInst<0b001000010, "fpadd64">; @@ -239,24 +256,24 @@ def FSRA32 : VISInst<0b000101111, "fsra32">; let rs1 = 0 in def LZCNT : VISInstFormat<0b000010111, (outs I64Regs:$rd), - (ins I64Regs:$rs2), "lzcnt $rs2, $rd", []>; + (ins I64Regs:$rs2), "lzcnt $rs2, $rd">; let rs1 = 0 in { def MOVSTOSW : VISInstFormat<0b100010011, (outs I64Regs:$rd), - (ins DFPRegs:$rs2), "movstosw $rs2, $rd", []>; + (ins DFPRegs:$rs2), "movstosw $rs2, $rd">; def MOVSTOUW : VISInstFormat<0b100010001, (outs I64Regs:$rd), - (ins DFPRegs:$rs2), "movstouw $rs2, $rd", []>; + (ins DFPRegs:$rs2), "movstouw $rs2, $rd">; def MOVDTOX : VISInstFormat<0b100010000, (outs 
I64Regs:$rd), - (ins DFPRegs:$rs2), "movdtox $rs2, $rd", []>; + (ins DFPRegs:$rs2), "movdtox $rs2, $rd">; def MOVWTOS : VISInstFormat<0b100011001, (outs DFPRegs:$rd), - (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>; + (ins I64Regs:$rs2), "movwtos $rs2, $rd">; def MOVXTOD : VISInstFormat<0b100011000, (outs DFPRegs:$rd), - (ins I64Regs:$rs2), "movdtox $rs2, $rd", []>; + (ins I64Regs:$rs2), "movxtod $rs2, $rd">; } -def PDISTN : VISInst<0b000111111, "pdistn">; +def PDISTN : VISInstID<0b000111111, "pdistn">; def UMULXHI : VISInst<0b000010110, "umulxhi", I64Regs>; def XMULX : VISInst<0b100010101, "xmulx", I64Regs>; -def XMULXHI : VISInst<0b100010111, "xmulxhi", I64Regs>; +def XMULXHI : VISInst<0b100010110, "xmulxhi", I64Regs>; } // Predicates = [IsVIS3] diff --git a/llvm/test/MC/Disassembler/Sparc/sparc-vis.txt b/llvm/test/MC/Disassembler/Sparc/sparc-vis.txt new file mode 100644 index 0000000000000..56105fb41e8a5 --- /dev/null +++ b/llvm/test/MC/Disassembler/Sparc/sparc-vis.txt @@ -0,0 +1,291 @@ +# RUN: llvm-mc --disassemble %s -triple=sparcv9-unknown-linux -mattr=+vis,+vis2,+vis3 | FileCheck %s + +## VIS 1 instructions. + +# CHECK: fpadd16 %f0, %f2, %f4 +0x89,0xb0,0x0a,0x02 +# CHECK: fpadd16s %f1, %f3, %f5 +0x8b,0xb0,0x4a,0x23 +# CHECK: fpadd32 %f0, %f2, %f4 +0x89,0xb0,0x0a,0x42 +# CHECK: fpadd32s %f1, %f3, %f5 +0x8b,0xb0,0x4a,0x63 +# CHECK: fpsub16 %f0, %f2, %f4 +0x89,0xb0,0x0a,0x82 +# CHECK: fpsub16s %f1, %f3, %f5 +0x8b,0xb0,0x4a,0xa3 +# CHECK: fpsub32 %f0, %f2, %f4 +0x89,0xb0,0x0a,0xc2 +# CHECK: fpsub32s %f1, %f3, %f5 +0x8b,0xb0,0x4a,0xe3 + +# CHECK: fpack16 %f0, %f2 +0x85,0xb0,0x07,0x60 +# CHECK: fpack32 %f0, %f2, %f4 +0x89,0xb0,0x07,0x42 +# CHECK: fpackfix %f0, %f3 +0x87,0xb0,0x07,0xa0 +# CHECK: fexpand %f1, %f2 +0x85,0xb0,0x09,0xa1 +# CHECK: fpmerge %f1, %f3, %f4 +0x89,0xb0,0x49,0x63 + +# CHECK: fmul8x16 %f1, %f2, %f4 +0x89,0xb0,0x46,0x22 +# CHECK: fmul8x16au %f1, %f3, %f4 +0x89,0xb0,0x46,0x63 +# CHECK: fmul8x16al %f1, %f3, %f4 +0x89,0xb0,0x46,0xa3 +# CHECK: fmul8sux16 %f0, %f2, %f4 +0x89,0xb0,0x06,0xc2 +# CHECK: fmul8ulx16 %f0, %f2, %f4 +0x89,0xb0,0x06,0xe2 +# CHECK: fmuld8sux16 %f1, %f3, %f4 +0x89,0xb0,0x47,0x03 +# CHECK: fmuld8ulx16 %f1, %f3, %f4 +0x89,0xb0,0x47,0x23 + +# CHECK: alignaddr %o0, %o1, %o2 +0x95,0xb2,0x03,0x09 +# CHECK: alignaddrl %o0, %o1, %o2 +0x95,0xb2,0x03,0x49 +# CHECK: faligndata %f0, %f2, %f4 +0x89,0xb0,0x09,0x02 + +# CHECK: fzero %f0 +0x81,0xb0,0x0c,0x00 +# CHECK: fzeros %f1 +0x83,0xb0,0x0c,0x20 +# CHECK: fone %f0 +0x81,0xb0,0x0f,0xc0 +# CHECK: fones %f1 +0x83,0xb0,0x0f,0xe0 +# CHECK: fsrc1 %f0, %f2 +0x85,0xb0,0x0e,0x80 +# CHECK: fsrc1s %f1, %f3 +0x87,0xb0,0x4e,0xa0 +# CHECK: fsrc2 %f0, %f2 +0x85,0xb0,0x0f,0x00 +# CHECK: fsrc2s %f1, %f3 +0x87,0xb0,0x0f,0x21 +# CHECK: fnot1 %f0, %f2 +0x85,0xb0,0x0d,0x40 +# CHECK: fnot1s %f1, %f3 +0x87,0xb0,0x4d,0x60 +# CHECK: fnot2 %f0, %f2 +0x85,0xb0,0x0c,0xc0 +# CHECK: fnot2s %f1, %f3 +0x87,0xb0,0x0c,0xe1 +# CHECK: for %f0, %f2, %f4 +0x89,0xb0,0x0f,0x82 +# CHECK: fors %f1, %f3, %f5 +0x8b,0xb0,0x4f,0xa3 +# CHECK: fnor %f0, %f2, %f4 +0x89,0xb0,0x0c,0x42 +# CHECK: fnors %f1, %f3, %f5 +0x8b,0xb0,0x4c,0x63 +# CHECK: fand %f0, %f2, %f4 +0x89,0xb0,0x0e,0x02 +# CHECK: fands %f1, %f3, %f5 +0x8b,0xb0,0x4e,0x23 +# CHECK: fnand %f0, %f2, %f4 +0x89,0xb0,0x0d,0xc2 +# CHECK: fnands %f1, %f3, %f5 +0x8b,0xb0,0x4d,0xe3 +# CHECK: fxor %f0, %f2, %f4 +0x89,0xb0,0x0d,0x82 +# CHECK: fxors %f1, %f3, %f5 +0x8b,0xb0,0x4d,0xa3 +# CHECK: fxnor %f0, %f2, %f4 +0x89,0xb0,0x0e,0x42 +# CHECK: fxnors %f1, %f3, %f5 +0x8b,0xb0,0x4e,0x63 + +# CHECK: fornot1 %f0, %f2, 
%f4 +0x89,0xb0,0x0f,0x42 +# CHECK: fornot1s %f1, %f3, %f5 +0x8b,0xb0,0x4f,0x63 +# CHECK: fornot2 %f0, %f2, %f4 +0x89,0xb0,0x0e,0xc2 +# CHECK: fornot2s %f1, %f3, %f5 +0x8b,0xb0,0x4e,0xe3 +# CHECK: fandnot1 %f0, %f2, %f4 +0x89,0xb0,0x0d,0x02 +# CHECK: fandnot1s %f1, %f3, %f5 +0x8b,0xb0,0x4d,0x23 +# CHECK: fandnot2 %f0, %f2, %f4 +0x89,0xb0,0x0c,0x82 +# CHECK: fandnot2s %f1, %f3, %f5 +0x8b,0xb0,0x4c,0xa3 + +# CHECK: fcmpgt16 %f0, %f2, %o0 +0x91,0xb0,0x05,0x02 +# CHECK: fcmpgt32 %f0, %f2, %o0 +0x91,0xb0,0x05,0x82 +# CHECK: fcmple16 %f0, %f2, %o0 +0x91,0xb0,0x04,0x02 +# CHECK: fcmple32 %f0, %f2, %o0 +0x91,0xb0,0x04,0x82 +# CHECK: fcmpne16 %f0, %f2, %o0 +0x91,0xb0,0x04,0x42 +# CHECK: fcmpne32 %f0, %f2, %o0 +0x91,0xb0,0x04,0xc2 +# CHECK: fcmpeq16 %f0, %f2, %o0 +0x91,0xb0,0x05,0x42 +# CHECK: fcmpeq32 %f0, %f2, %o0 +0x91,0xb0,0x05,0xc2 + +# CHECK: edge8 %o0, %o1, %o2 +0x95,0xb2,0x00,0x09 +# CHECK: edge8l %o0, %o1, %o2 +0x95,0xb2,0x00,0x49 +# CHECK: edge16 %o0, %o1, %o2 +0x95,0xb2,0x00,0x89 +# CHECK: edge16l %o0, %o1, %o2 +0x95,0xb2,0x00,0xc9 +# CHECK: edge32 %o0, %o1, %o2 +0x95,0xb2,0x01,0x09 +# CHECK: edge32l %o0, %o1, %o2 +0x95,0xb2,0x01,0x49 + +# CHECK: pdist %f0, %f2, %f4 +0x89,0xb0,0x07,0xc2 + +# CHECK: array8 %o0, %o1, %o2 +0x95,0xb2,0x02,0x09 +# CHECK: array16 %o0, %o1, %o2 +0x95,0xb2,0x02,0x49 +# CHECK: array32 %o0, %o1, %o2 +0x95,0xb2,0x02,0x89 + +# CHECK: shutdown +0x81,0xb0,0x10,0x00 + +## VIS 2 instructions. + +# CHECK: bmask %o0, %o1, %o2 +0x95,0xb2,0x03,0x29 +# CHECK: bshuffle %f0, %f2, %f4 +0x89,0xb0,0x09,0x82 + +# CHECK: siam 0 +0x81,0xb0,0x10,0x20 +# CHECK: siam 1 +0x81,0xb0,0x10,0x21 +# CHECK: siam 2 +0x81,0xb0,0x10,0x22 +# CHECK: siam 3 +0x81,0xb0,0x10,0x23 +# CHECK: siam 4 +0x81,0xb0,0x10,0x24 +# CHECK: siam 5 +0x81,0xb0,0x10,0x25 +# CHECK: siam 6 +0x81,0xb0,0x10,0x26 +# CHECK: siam 7 +0x81,0xb0,0x10,0x27 + +# CHECK: edge8n %o0, %o1, %o2 +0x95,0xb2,0x00,0x29 +# CHECK: edge8ln %o0, %o1, %o2 +0x95,0xb2,0x00,0x69 +# CHECK: edge16n %o0, %o1, %o2 +0x95,0xb2,0x00,0xa9 +# CHECK: edge16ln %o0, %o1, %o2 +0x95,0xb2,0x00,0xe9 +# CHECK: edge32n %o0, %o1, %o2 +0x95,0xb2,0x01,0x29 +# CHECK: edge32ln %o0, %o1, %o2 +0x95,0xb2,0x01,0x69 + +## VIS 3 instructions. 
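+# (Annotation, not part of the upstream test.) Assuming the F3_3 format from
+# SparcInstrFormats.td, each word in this file decodes as op[31:30]=0b10,
+# rd[29:25], op3[24:19]=0b110110, rs1[18:14], opf[13:5], rs2[4:0]; e.g.
+# 0x89,0xb0,0x0a,0x02 above gives rd=4 (%f4), rs1=0 (%f0), opf=0b001010000
+# (fpadd16), rs2=2 (%f2).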
+ +# CHECK: addxc %o0, %o1, %o2 +0x95,0xb2,0x02,0x29 +# CHECK: addxccc %o0, %o1, %o2 +0x95,0xb2,0x02,0x69 + +# CHECK: cmask8 %o0 +0x81,0xb0,0x03,0x68 +# CHECK: cmask16 %o0 +0x81,0xb0,0x03,0xa8 +# CHECK: cmask32 %o0 +0x81,0xb0,0x03,0xe8 + +# CHECK: fchksm16 %f0, %f2, %f4 +0x89,0xb0,0x08,0x82 +# CHECK: fmean16 %f0, %f2, %f4 +0x89,0xb0,0x08,0x02 + +# CHECK: fhadds %f1, %f3, %f5 +0x8b,0xa0,0x4c,0x23 +# CHECK: fhaddd %f0, %f2, %f4 +0x89,0xa0,0x0c,0x42 +# CHECK: fhsubs %f1, %f3, %f5 +0x8b,0xa0,0x4c,0xa3 +# CHECK: fhsubd %f0, %f2, %f4 +0x89,0xa0,0x0c,0xc2 +# CHECK: flcmps %fcc0, %f3, %f5 +0x81,0xb0,0xea,0x25 +# CHECK: flcmpd %fcc0, %f2, %f4 +0x81,0xb0,0xaa,0x44 + +# CHECK: fnadds %f1, %f3, %f5 +0x8b,0xa0,0x4a,0x23 +# CHECK: fnaddd %f0, %f2, %f4 +0x89,0xa0,0x0a,0x42 +# CHECK: fnhadds %f1, %f3, %f5 +0x8b,0xa0,0x4e,0x23 +# CHECK: fnhaddd %f0, %f2, %f4 +0x89,0xa0,0x0e,0x42 + +# CHECK: fnmuls %f1, %f3, %f5 +0x8b,0xa0,0x4b,0x23 +# CHECK: fnmuld %f0, %f2, %f4 +0x89,0xa0,0x0b,0x42 +# CHECK: fnsmuld %f1, %f3, %f4 +0x89,0xa0,0x4f,0x23 + +# CHECK: fpadd64 %f0, %f2, %f4 +0x89,0xb0,0x08,0x42 + +# CHECK: fsll16 %f0, %f2, %f4 +0x89,0xb0,0x04,0x22 +# CHECK: fsrl16 %f0, %f2, %f4 +0x89,0xb0,0x04,0x62 +# CHECK: fsll32 %f0, %f2, %f4 +0x89,0xb0,0x04,0xa2 +# CHECK: fsrl32 %f0, %f2, %f4 +0x89,0xb0,0x04,0xe2 +# CHECK: fslas16 %f0, %f2, %f4 +0x89,0xb0,0x05,0x22 +# CHECK: fsra16 %f0, %f2, %f4 +0x89,0xb0,0x05,0x62 +# CHECK: fslas32 %f0, %f2, %f4 +0x89,0xb0,0x05,0xa2 +# CHECK: fsra32 %f0, %f2, %f4 +0x89,0xb0,0x05,0xe2 + +# CHECK: lzcnt %o0, %o1 +0x93,0xb0,0x02,0xe8 + +# CHECK: movstosw %f0, %o0 +0x91,0xb0,0x22,0x60 +# CHECK: movstouw %f0, %o0 +0x91,0xb0,0x22,0x20 +# CHECK: movdtox %f0, %o0 +0x91,0xb0,0x22,0x00 +# CHECK: movwtos %o0, %f0 +0x81,0xb0,0x23,0x28 +# CHECK: movxtod %o0, %f0 +0x81,0xb0,0x23,0x08 + +# CHECK: pdistn %f0, %f2, %o0 +0x91,0xb0,0x07,0xe2 + +# CHECK: umulxhi %o0, %o1, %o2 +0x95,0xb2,0x02,0xc9 +# CHECK: xmulx %o0, %o1, %o2 +0x95,0xb2,0x22,0xa9 +# CHECK: xmulxhi %o0, %o1, %o2 +0x95,0xb2,0x22,0xc9 diff --git a/llvm/test/MC/Sparc/sparc-vis.s b/llvm/test/MC/Sparc/sparc-vis.s index 77e1ab1432eed..771d036e32db5 100644 --- a/llvm/test/MC/Sparc/sparc-vis.s +++ b/llvm/test/MC/Sparc/sparc-vis.s @@ -1,4 +1,237 @@ -! RUN: llvm-mc %s -triple=sparcv9 -mcpu=niagara -show-encoding | FileCheck %s +! RUN: not llvm-mc %s -triple=sparcv9 -show-encoding 2>&1 | FileCheck %s --check-prefixes=NO-VIS --implicit-check-not=error: +! RUN: llvm-mc %s -triple=sparcv9 -mattr=+vis -show-encoding | FileCheck %s --check-prefixes=VIS - ! CHECK: fzeros %f31 ! encoding: [0xbf,0xb0,0x0c,0x20] - fzeros %f31 +!! VIS 1 instructions. + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpadd16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0a,0x02] +fpadd16 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpadd16s %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4a,0x23] +fpadd16s %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpadd32 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0a,0x42] +fpadd32 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpadd32s %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4a,0x63] +fpadd32s %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpsub16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0a,0x82] +fpsub16 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpsub16s %f1, %f3, %f5 ! 
encoding: [0x8b,0xb0,0x4a,0xa3] +fpsub16s %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpsub32 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0a,0xc2] +fpsub32 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpsub32s %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4a,0xe3] +fpsub32s %f1, %f3, %f5 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpack16 %f0, %f2 ! encoding: [0x85,0xb0,0x07,0x60] +fpack16 %f0, %f2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpack32 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x07,0x42] +fpack32 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpackfix %f0, %f3 ! encoding: [0x87,0xb0,0x07,0xa0] +fpackfix %f0, %f3 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fexpand %f1, %f2 ! encoding: [0x85,0xb0,0x09,0xa1] +fexpand %f1, %f2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fpmerge %f1, %f3, %f4 ! encoding: [0x89,0xb0,0x49,0x63] +fpmerge %f1, %f3, %f4 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fmul8x16 %f1, %f2, %f4 ! encoding: [0x89,0xb0,0x46,0x22] +fmul8x16 %f1, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fmul8x16au %f1, %f3, %f4 ! encoding: [0x89,0xb0,0x46,0x63] +fmul8x16au %f1, %f3, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fmul8x16al %f1, %f3, %f4 ! encoding: [0x89,0xb0,0x46,0xa3] +fmul8x16al %f1, %f3, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fmul8sux16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x06,0xc2] +fmul8sux16 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fmul8ulx16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x06,0xe2] +fmul8ulx16 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fmuld8sux16 %f1, %f3, %f4 ! encoding: [0x89,0xb0,0x47,0x03] +fmuld8sux16 %f1, %f3, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fmuld8ulx16 %f1, %f3, %f4 ! encoding: [0x89,0xb0,0x47,0x23] +fmuld8ulx16 %f1, %f3, %f4 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: alignaddr %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x03,0x09] +alignaddr %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: alignaddrl %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x03,0x49] +alignaddrl %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: faligndata %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x09,0x02] +faligndata %f0, %f2, %f4 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fzero %f0 ! encoding: [0x81,0xb0,0x0c,0x00] +fzero %f0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fzeros %f1 ! encoding: [0x83,0xb0,0x0c,0x20] +fzeros %f1 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fone %f0 ! encoding: [0x81,0xb0,0x0f,0xc0] +fone %f0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fones %f1 ! encoding: [0x83,0xb0,0x0f,0xe0] +fones %f1 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fsrc1 %f0, %f2 ! encoding: [0x85,0xb0,0x0e,0x80] +fsrc1 %f0, %f2 +! 
NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fsrc1s %f1, %f3 ! encoding: [0x87,0xb0,0x4e,0xa0] +fsrc1s %f1, %f3 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fsrc2 %f0, %f2 ! encoding: [0x85,0xb0,0x0f,0x00] +fsrc2 %f0, %f2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fsrc2s %f1, %f3 ! encoding: [0x87,0xb0,0x0f,0x21] +fsrc2s %f1, %f3 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnot1 %f0, %f2 ! encoding: [0x85,0xb0,0x0d,0x40] +fnot1 %f0, %f2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnot1s %f1, %f3 ! encoding: [0x87,0xb0,0x4d,0x60] +fnot1s %f1, %f3 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnot2 %f0, %f2 ! encoding: [0x85,0xb0,0x0c,0xc0] +fnot2 %f0, %f2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnot2s %f1, %f3 ! encoding: [0x87,0xb0,0x0c,0xe1] +fnot2s %f1, %f3 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: for %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0f,0x82] +for %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fors %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4f,0xa3] +fors %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnor %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0c,0x42] +fnor %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnors %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4c,0x63] +fnors %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fand %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0e,0x02] +fand %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fands %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4e,0x23] +fands %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnand %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0d,0xc2] +fnand %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fnands %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4d,0xe3] +fnands %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fxor %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0d,0x82] +fxor %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fxors %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4d,0xa3] +fxors %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fxnor %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0e,0x42] +fxnor %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fxnors %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4e,0x63] +fxnors %f1, %f3, %f5 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fornot1 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0f,0x42] +fornot1 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fornot1s %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4f,0x63] +fornot1s %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fornot2 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0e,0xc2] +fornot2 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fornot2s %f1, %f3, %f5 ! 
encoding: [0x8b,0xb0,0x4e,0xe3] +fornot2s %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fandnot1 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0d,0x02] +fandnot1 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fandnot1s %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4d,0x23] +fandnot1s %f1, %f3, %f5 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fandnot2 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x0c,0x82] +fandnot2 %f0, %f2, %f4 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fandnot2s %f1, %f3, %f5 ! encoding: [0x8b,0xb0,0x4c,0xa3] +fandnot2s %f1, %f3, %f5 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmpgt16 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x05,0x02] +fcmpgt16 %f0, %f2, %o0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmpgt32 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x05,0x82] +fcmpgt32 %f0, %f2, %o0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmple16 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x04,0x02] +fcmple16 %f0, %f2, %o0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmple32 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x04,0x82] +fcmple32 %f0, %f2, %o0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmpne16 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x04,0x42] +fcmpne16 %f0, %f2, %o0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmpne32 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x04,0xc2] +fcmpne32 %f0, %f2, %o0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmpeq16 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x05,0x42] +fcmpeq16 %f0, %f2, %o0 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: fcmpeq32 %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x05,0xc2] +fcmpeq32 %f0, %f2, %o0 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: edge8 %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0x09] +edge8 %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: edge8l %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0x49] +edge8l %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: edge16 %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0x89] +edge16 %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: edge16l %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0xc9] +edge16l %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: edge32 %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x01,0x09] +edge32 %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: edge32l %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x01,0x49] +edge32l %o0, %o1, %o2 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: pdist %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x07,0xc2] +pdist %f0, %f2, %f4 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: array8 %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x02,0x09] +array8 %o0, %o1, %o2 +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: array16 %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x02,0x49] +array16 %o0, %o1, %o2 +! 
NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: array32 %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x02,0x89] +array32 %o0, %o1, %o2 + +! NO-VIS: error: instruction requires a CPU feature not currently enabled +! VIS: shutdown ! encoding: [0x81,0xb0,0x10,0x00] +shutdown diff --git a/llvm/test/MC/Sparc/sparc-vis2.s b/llvm/test/MC/Sparc/sparc-vis2.s new file mode 100644 index 0000000000000..3318884388562 --- /dev/null +++ b/llvm/test/MC/Sparc/sparc-vis2.s @@ -0,0 +1,55 @@ +! RUN: not llvm-mc %s -triple=sparcv9 -show-encoding 2>&1 | FileCheck %s --check-prefixes=NO-VIS2 +! RUN: llvm-mc %s -triple=sparcv9 -mattr=+vis2 -show-encoding | FileCheck %s --check-prefixes=VIS2 --implicit-check-not=error: + +!! VIS 2 instructions. + +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: bmask %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x03,0x29] +bmask %o0, %o1, %o2 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: bshuffle %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x09,0x82] +bshuffle %f0, %f2, %f4 + +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 0 ! encoding: [0x81,0xb0,0x10,0x20] +siam 0 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 1 ! encoding: [0x81,0xb0,0x10,0x21] +siam 1 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 2 ! encoding: [0x81,0xb0,0x10,0x22] +siam 2 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 3 ! encoding: [0x81,0xb0,0x10,0x23] +siam 3 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 4 ! encoding: [0x81,0xb0,0x10,0x24] +siam 4 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 5 ! encoding: [0x81,0xb0,0x10,0x25] +siam 5 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 6 ! encoding: [0x81,0xb0,0x10,0x26] +siam 6 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: siam 7 ! encoding: [0x81,0xb0,0x10,0x27] +siam 7 + +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: edge8n %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0x29] +edge8n %o0, %o1, %o2 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: edge8ln %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0x69] +edge8ln %o0, %o1, %o2 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: edge16n %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0xa9] +edge16n %o0, %o1, %o2 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: edge16ln %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x00,0xe9] +edge16ln %o0, %o1, %o2 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: edge32n %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x01,0x29] +edge32n %o0, %o1, %o2 +! NO-VIS2: error: instruction requires a CPU feature not currently enabled +! VIS2: edge32ln %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x01,0x69] +edge32ln %o0, %o1, %o2 diff --git a/llvm/test/MC/Sparc/sparc-vis3.s b/llvm/test/MC/Sparc/sparc-vis3.s new file mode 100644 index 0000000000000..ea189fb344767 --- /dev/null +++ b/llvm/test/MC/Sparc/sparc-vis3.s @@ -0,0 +1,133 @@ +! RUN: not llvm-mc %s -triple=sparcv9 -show-encoding 2>&1 | FileCheck %s --check-prefixes=NO-VIS3 +! 
RUN: llvm-mc %s -triple=sparcv9 -mattr=+vis3 -show-encoding | FileCheck %s --check-prefixes=VIS3 --implicit-check-not=error: + +!! VIS 3 instructions. + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: addxc %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x02,0x29] +addxc %o0, %o1, %o2 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: addxccc %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x02,0x69] +addxccc %o0, %o1, %o2 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: cmask8 %o0 ! encoding: [0x81,0xb0,0x03,0x68] +cmask8 %o0 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: cmask16 %o0 ! encoding: [0x81,0xb0,0x03,0xa8] +cmask16 %o0 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: cmask32 %o0 ! encoding: [0x81,0xb0,0x03,0xe8] +cmask32 %o0 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fchksm16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x08,0x82] +fchksm16 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fmean16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x08,0x02] +fmean16 %f0, %f2, %f4 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fhadds %f1, %f3, %f5 ! encoding: [0x8b,0xa0,0x4c,0x23] +fhadds %f1, %f3, %f5 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fhaddd %f0, %f2, %f4 ! encoding: [0x89,0xa0,0x0c,0x42] +fhaddd %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fhsubs %f1, %f3, %f5 ! encoding: [0x8b,0xa0,0x4c,0xa3] +fhsubs %f1, %f3, %f5 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fhsubd %f0, %f2, %f4 ! encoding: [0x89,0xa0,0x0c,0xc2] +fhsubd %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: flcmps %fcc0, %f3, %f5 ! encoding: [0x81,0xb0,0xea,0x25] +flcmps %fcc0, %f3, %f5 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: flcmpd %fcc0, %f2, %f4 ! encoding: [0x81,0xb0,0xaa,0x44] +flcmpd %fcc0, %f2, %f4 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fnadds %f1, %f3, %f5 ! encoding: [0x8b,0xa0,0x4a,0x23] +fnadds %f1, %f3, %f5 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fnaddd %f0, %f2, %f4 ! encoding: [0x89,0xa0,0x0a,0x42] +fnaddd %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fnhadds %f1, %f3, %f5 ! encoding: [0x8b,0xa0,0x4e,0x23] +fnhadds %f1, %f3, %f5 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fnhaddd %f0, %f2, %f4 ! encoding: [0x89,0xa0,0x0e,0x42] +fnhaddd %f0, %f2, %f4 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fnmuls %f1, %f3, %f5 ! encoding: [0x8b,0xa0,0x4b,0x23] +fnmuls %f1, %f3, %f5 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fnmuld %f0, %f2, %f4 ! encoding: [0x89,0xa0,0x0b,0x42] +fnmuld %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fnsmuld %f1, %f3, %f4 ! encoding: [0x89,0xa0,0x4f,0x23] +fnsmuld %f1, %f3, %f4 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fpadd64 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x08,0x42] +fpadd64 %f0, %f2, %f4 + +! 
NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fsll16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x04,0x22] +fsll16 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fsrl16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x04,0x62] +fsrl16 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fsll32 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x04,0xa2] +fsll32 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fsrl32 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x04,0xe2] +fsrl32 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fslas16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x05,0x22] +fslas16 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fsra16 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x05,0x62] +fsra16 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fslas32 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x05,0xa2] +fslas32 %f0, %f2, %f4 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: fsra32 %f0, %f2, %f4 ! encoding: [0x89,0xb0,0x05,0xe2] +fsra32 %f0, %f2, %f4 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: lzcnt %o0, %o1 ! encoding: [0x93,0xb0,0x02,0xe8] +lzcnt %o0, %o1 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: movstosw %f0, %o0 ! encoding: [0x91,0xb0,0x22,0x60] +movstosw %f0, %o0 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: movstouw %f0, %o0 ! encoding: [0x91,0xb0,0x22,0x20] +movstouw %f0, %o0 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: movdtox %f0, %o0 ! encoding: [0x91,0xb0,0x22,0x00] +movdtox %f0, %o0 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: movwtos %o0, %f0 ! encoding: [0x81,0xb0,0x23,0x28] +movwtos %o0, %f0 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: movxtod %o0, %f0 ! encoding: [0x81,0xb0,0x23,0x08] +movxtod %o0, %f0 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: pdistn %f0, %f2, %o0 ! encoding: [0x91,0xb0,0x07,0xe2] +pdistn %f0, %f2, %o0 + +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: umulxhi %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x02,0xc9] +umulxhi %o0, %o1, %o2 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: xmulx %o0, %o1, %o2 ! encoding: [0x95,0xb2,0x22,0xa9] +xmulx %o0, %o1, %o2 +! NO-VIS3: error: instruction requires a CPU feature not currently enabled +! VIS3: xmulxhi %o0, %o1, %o2 ! 
encoding: [0x95,0xb2,0x22,0xc9] +xmulxhi %o0, %o1, %o2 From c2355892a4bad78b8ac520a11a7a63371c84d11e Mon Sep 17 00:00:00 2001 From: Elen Kalda Date: Thu, 3 Apr 2025 14:04:28 +0100 Subject: [PATCH 0518/1029] [mlir][tosa] Add ERROR_IF checks to TRANSPOSE_CONV2D verifier (#133234) This patch extends the verifier with following checks: ERROR_IF(out_pad_top <= -KH || out_pad_bottom <= -KH); ERROR_IF(out_pad_left <= -KW || out_pad_right <= -KW); ERROR_IF(stride_y < 1 || stride_x < 1); ERROR_IF(OH != (IH - 1) * stride_y + out_pad_top + out_pad_bottom + KH); ERROR_IF(OW != (IW - 1) * stride_x + out_pad_left + out_pad_right + KW); ERROR_IF(BC != OC && BC != 1); Signed-off-by: Elen Kalda --- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 112 ++++++++++++++++++ mlir/test/Dialect/Tosa/invalid.mlir | 72 +++++++++++ mlir/test/Dialect/Tosa/invalid_extension.mlir | 6 +- mlir/test/Dialect/Tosa/level_check.mlir | 48 ++++---- mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir | 4 +- 5 files changed, 213 insertions(+), 29 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index b8d81213d9004..8ae67a25498ad 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -2886,6 +2886,118 @@ LogicalResult TransposeConv2DOp::inferReturnTypeComponents( LogicalResult TransposeConv2DOp::verify() { if (verifyConvOp(*this).failed() || verifyConvOpModes(*this).failed()) return failure(); + + const llvm::ArrayRef strides = getStride(); + const int64_t strideY = strides[0]; + const int64_t strideX = strides[1]; + + if (strideY < 1 || strideX < 1) + return emitOpError("expect all stride values to be >= 1, got [") + << strides << "]"; + + const auto inputType = llvm::dyn_cast(getInput().getType()); + + const auto outputType = + llvm::dyn_cast(getOutput().getType()); + + const auto weightType = + llvm::dyn_cast(getWeight().getType()); + + const auto checkPadAgainstKernelDim = + [this](int64_t pad_value, int64_t kernel_dim_size, + llvm::StringRef pad_name, + llvm::StringRef kernel_dim_name) -> LogicalResult { + if (pad_value <= -kernel_dim_size) + return emitOpError("expected ") + << pad_name << " > -" << kernel_dim_name + << ", but got: " << pad_name << "=" << pad_value << " and " + << kernel_dim_name << "=" << kernel_dim_size; + return success(); + }; + + const llvm::ArrayRef padding = getOutPad(); + + const int64_t outPadTop = padding[0]; + const int64_t outPadBottom = padding[1]; + + const int64_t kernelHeight = weightType.getDimSize(1); + + if (!ShapedType::isDynamic(kernelHeight)) { + if (failed(checkPadAgainstKernelDim(outPadTop, kernelHeight, "out_pad_top", + "KH"))) + return failure(); + + if (failed(checkPadAgainstKernelDim(outPadBottom, kernelHeight, + "out_pad_bottom", "KH"))) + return failure(); + } + + const int64_t kernelWidth = weightType.getDimSize(2); + + const int64_t outPadLeft = padding[2]; + const int64_t outPadRight = padding[3]; + + if (!ShapedType::isDynamic(kernelWidth)) { + if (failed(checkPadAgainstKernelDim(outPadLeft, kernelWidth, "out_pad_left", + "KW"))) + return failure(); + + if (failed(checkPadAgainstKernelDim(outPadRight, kernelWidth, + "out_pad_right", "KW"))) + return failure(); + } + + // Rest of the checks depend on the output type being a RankedTensorType + if (!outputType) + return success(); + + const int64_t inputHeight = inputType.getDimSize(1); + const int64_t outputHeight = outputType.getDimSize(1); + + if (!ShapedType::isDynamic(inputHeight) && + !ShapedType::isDynamic(outputHeight)) { + if (outputHeight != 
+ (inputHeight - 1) * strideY + outPadTop + outPadBottom + kernelHeight) + return emitOpError( + "dimension mismatch: expected OH == (IH - 1) * stride_y " + "+ out_pad_top + out_pad_bottom + KH, but got ") + << outputHeight << " != (" << inputHeight << " - 1) * " << strideY + << " + " << outPadTop << " + " << outPadBottom << " + " + << kernelHeight; + } + + const int64_t inputWidth = inputType.getDimSize(2); + const int64_t outputWidth = outputType.getDimSize(2); + + if (!ShapedType::isDynamic(inputWidth) && + !ShapedType::isDynamic(outputWidth)) { + if (outputWidth != + (inputWidth - 1) * strideX + outPadLeft + outPadRight + kernelWidth) + return emitOpError( + "dimension mismatch: expected OW == (IW - 1) * stride_x " + "+ out_pad_left + out_pad_right + KW, but got ") + << outputWidth << " != (" << inputWidth << " - 1) * " << strideX + << " + " << outPadLeft << " + " << outPadRight << " + " + << kernelWidth; + } + + const auto biasType = llvm::dyn_cast(getBias().getType()); + + if (!biasType) + return success(); + + const int64_t biasChannels = biasType.getDimSize(0); + + // Skip further checks if bias is dynamic + if (biasChannels == ShapedType::kDynamic) + return success(); + + const int64_t outputChannels = outputType.getDimSize(3); + if (biasChannels != outputChannels && biasChannels != 1) + return emitOpError( + "bias channels expected to be equal to output channels (") + << outputChannels << ") or 1, got " << biasChannels; + return success(); } diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index 10b8929b16a88..8cf6d4b154792 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -172,6 +172,78 @@ func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xi8>, %arg1: tensor<16x1 return %0 : tensor<1x32x32x16xi8> } +// ----- + +func.func @test_transpose_conv2d_invalid_padding_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op expected out_pad_top > -KH, but got: out_pad_top=-3 and KH=1}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + return %0 : tensor<1x32x32x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_padding_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op expected out_pad_bottom > -KH, but got: out_pad_bottom=-1 and KH=1}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + return %0 : tensor<1x32x32x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_padding_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op expected out_pad_left > -KW, but got: out_pad_left=-8 and KW=1}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride 
= array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + return %0 : tensor<1x32x32x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_padding_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op expected out_pad_right > -KW, but got: out_pad_right=-9 and KW=1}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + return %0 : tensor<1x32x32x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op expect all stride values to be >= 1, got [0, 1]}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + return %0 : tensor<1x32x32x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op expect all stride values to be >= 1, got [1, 0]}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + return %0 : tensor<1x32x32x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_output_height(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x33x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op dimension mismatch: expected OH == (IH - 1) * stride_y + out_pad_top + out_pad_bottom + KH, but got 33 != (32 - 1) * 1 + 0 + 0 + 1}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x33x32x16xf32> + return %0 : tensor<1x33x32x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_output_width(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x40x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op dimension mismatch: expected OW == (IW - 1) * stride_x + out_pad_left + out_pad_right + KW, but got 40 != (32 - 1) * 1 + 0 + 0 + 1}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x40x16xf32> + return %0 : tensor<1x32x40x16xf32> +} + +// ----- + +func.func @test_transpose_conv2d_invalid_bias(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, 
+  // expected-error@+1 {{'tosa.transpose_conv2d' op bias channels expected to be equal to output channels (16) or 1, got 5}}
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
+  return %0 : tensor<1x32x32x16xf32>
+}
+
 // -----
 
 // CHECK-LABEL: conv2d_quant_any_acc
 func.func @test_conv2d_quant_any_acc(%arg0: tensor<1x4x4x4x!quant.any<i8<-8:7>>>, %arg1: tensor<8x1x1x4x!quant.any<i8<-8:7>>>, %arg2: tensor<8x!quant.any<i8<-8:7>>>) -> tensor<1x4x4x8x!quant.any<i8<-8:7>>> {
diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir
index d1594232e4e1d..dd3d114218309 100644
--- a/mlir/test/Dialect/Tosa/invalid_extension.mlir
+++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir
@@ -165,11 +165,11 @@ func.func @test_depthwise_conv2d_non_const_input_zp(%arg0: tensor<1x4x4x4xi8>, %
 
 // -----
 
-func.func @test_transpose_conv2d_non_const_weight_zp(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<1x1x4x2xi8>, %arg2: tensor<8xi32>, %arg3: tensor<1xi8>) -> tensor<1x4x4x8xi32> {
+func.func @test_transpose_conv2d_non_const_weight_zp(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<1x1x4x2xi8>, %arg2: tensor<8xi32>, %arg3: tensor<1xi8>) -> tensor<1x4x7x8xi32> {
   %input_zp = "tosa.const"() {values = dense<0> : tensor<1xi8> } : () -> tensor<1xi8>
   // expected-error@+1 {{'tosa.transpose_conv2d' op expected compile time resolvable constant, but got variable value for operand #4}}
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %input_zp, %arg3 {acc_type = i32, out_pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<1x4x4x4xi8>, tensor<1x1x4x2xi8>, tensor<8xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x4x8xi32>
-  return %0 : tensor<1x4x4x8xi32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %input_zp, %arg3 {acc_type = i32, out_pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>} : (tensor<1x4x4x4xi8>, tensor<1x1x4x2xi8>, tensor<8xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x7x8xi32>
+  return %0 : tensor<1x4x7x8xi32>
 }
 
 // -----
diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir
index 0f469761d89e3..12addcd315449 100644
--- a/mlir/test/Dialect/Tosa/level_check.mlir
+++ b/mlir/test/Dialect/Tosa/level_check.mlir
@@ -887,74 +887,74 @@ func.func @test_rfft2d_input_w(%arg0: tensor<13x8x16384xf32>) -> (tensor<13x8x81
 
 // -----
 
-func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x8193x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x8193x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x8224x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KH <= MAX_KERNEL}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} :
-            (tensor<1x32x32x8xf32>, tensor<16x8193x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x8193x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x8224x32x16xf32>
+  return %0 : tensor<1x8224x32x16xf32>
 }
 
 // -----
 
-func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x8193x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x8193x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x8224x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KW <= MAX_KERNEL}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} :
-            (tensor<1x32x32x8xf32>, tensor<16x1x8193x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x1x8193x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x8224x16xf32>
+  return %0 : tensor<1x32x8224x16xf32>
 }
 
 // -----
 
-func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x8225x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 8193, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} :
-            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x8225x32x16xf32>
+  return %0 : tensor<1x8225x32x16xf32>
 }
 
 // -----
 
-func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x8225x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 8193, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} :
-            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x8225x32x16xf32>
+  return %0 : tensor<1x8225x32x16xf32>
 }
 
 // -----
 
-func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x8225x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 8193, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} :
-            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x8225x16xf32>
+  return %0 : tensor<1x32x8225x16xf32>
 }
 
 // -----
 
-func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x8225x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 8193>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 1>} :
-            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x8225x16xf32>
+  return %0 : tensor<1x32x8225x16xf32>
 }
 
 // -----
 
-func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x253984x32x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 8193, 1>} :
-            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x253984x32x16xf32>
+  return %0 : tensor<1x253984x32x16xf32>
 }
 
 // -----
 
-func.func @test_transpose_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> {
+func.func @test_transpose_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x253984x16xf32> {
   // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}}
   %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 0, 0, 0, 0>, out_shape = array<i64: -1, -1, -1, -1>, stride = array<i64: 1, 8193>} :
-            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32>
-  return %0 : tensor<1x32x32x16xf32>
+            (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x253984x16xf32>
+  return %0 : tensor<1x32x253984x16xf32>
 }
 
 // -----
 
diff --git a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
index 037d51dccd1cd..761e489bdeae5 100644
--- a/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
+++ b/mlir/test/Dialect/Tosa/tosa-infer-shapes.mlir
@@ -994,8 +994,8 @@ func.func @transpose_conv2d_dynamic_bias(%arg0: tensor<2x6x4x3xf32>, %arg1: tens
 
 // CHECK-LABEL: @transpose_conv2d_padded
 func.func @transpose_conv2d_padded(%arg0: tensor<2x9x11x3xf32>, %arg1: tensor<5x3x6x3xf32>, %arg2: tensor<5xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) {
-  // CHECK: -> tensor<2x10x13x5xf32>
-  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 1, 0, 3, 0>, stride = array<i64: 1, 1>} : (tensor<2x9x11x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x10x13x5xf32>
+  // CHECK: -> tensor<2x12x19x5xf32>
+  %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array<i64: 1, 0, 3, 0>, stride = array<i64: 1, 1>} : (tensor<2x9x11x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x12x19x5xf32>
   return
 }

From efbbdd69c7974d4fe08ccbbc1d8a206f3f317a1a Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 3 Apr 2025 21:14:45 +0800
Subject: [PATCH 0519/1029] [ADT] Make DenseMap::init() private (NFC) (#134229)

I believe this method was not supposed to be public, as it has
additional preconditions (it will misbehave when called on a non-empty
DenseMap). The public API for this is reserve().
---
 llvm/include/llvm/ADT/DenseMap.h              | 20 +++++++++----------
 .../CodeGen/AssignmentTrackingAnalysis.cpp    |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h
index ea9ba6f47ac1a..bb99a41646b08 100644
--- a/llvm/include/llvm/ADT/DenseMap.h
+++ b/llvm/include/llvm/ADT/DenseMap.h
@@ -806,16 +806,6 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
     }
   }
 
-  void init(unsigned InitNumEntries) {
-    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
-    if (allocateBuckets(InitBuckets)) {
-      this->BaseT::initEmpty();
-    } else {
-      NumEntries = 0;
-      NumTombstones = 0;
-    }
-  }
-
   void grow(unsigned AtLeast) {
     unsigned OldNumBuckets = NumBuckets;
     BucketT *OldBuckets = Buckets;
@@ -878,6 +868,16 @@ class DenseMap : public DenseMapBase<DenseMap<KeyT, ValueT, KeyInfoT, BucketT>,
         allocate_buffer(sizeof(BucketT) * NumBuckets, alignof(BucketT)));
     return true;
   }
+
+  void init(unsigned InitNumEntries) {
+    auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries);
+    if (allocateBuckets(InitBuckets)) {
+      this->BaseT::initEmpty();
+    } else {
+      NumEntries = 0;
+      NumTombstones = 0;
+    }
+  }
 };
 
 template Date: Thu, 3 Apr 2025 14:53:41 +0200
Subject: [PATCH 0520/1029] [lldb] Initialize active_row pointer variable

Its value is not set on all control flow paths. I believe this should
fix the failure on some buildbots after #133247.
---
 lldb/source/Target/RegisterContextUnwind.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp
index cb3d7ee479890..4f8b8a281a020 100644
--- a/lldb/source/Target/RegisterContextUnwind.cpp
+++ b/lldb/source/Target/RegisterContextUnwind.cpp
@@ -208,7 +208,7 @@ void RegisterContextUnwind::InitializeZerothFrame() {
   m_fast_unwind_plan_sp = GetFastUnwindPlanForFrame();
   m_full_unwind_plan_sp = GetFullUnwindPlanForFrame();
 
-  const UnwindPlan::Row *active_row;
+  const UnwindPlan::Row *active_row = nullptr;
   lldb::RegisterKind row_register_kind = eRegisterKindGeneric;
 
   // If we have LanguageRuntime UnwindPlan for this unwind, use those

From ae8ad8649da7f69dae2b19db79b69c460be01916 Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Thu, 3 Apr 2025 14:22:48 +0100
Subject: [PATCH 0521/1029] [Clang][AArch64] Model ZT0 table using inaccessible
 memory (#133727)

This patch changes how the ZT0 table is modelled at LLVM-IR level.
Currently, accesses to ZT0 are represented at LLVM-IR level as memory
reads and writes. This patch changes that and models them as purely
inaccessible memory accesses, without any unmodeled side effects.
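
To illustrate the effect (a sketch, not an excerpt from this patch; the
attribute shown is what IntrInaccessibleMemOnly plus IntrWriteMem lowers to),
a ZT0 intrinsic call now only touches state that IR-visible memory cannot
alias:

    declare void @llvm.aarch64.sme.zero.zt(i32 immarg)

    define void @clear_zt0(ptr %p) {
      %v = load i32, ptr %p
      ; The call carries memory(inaccessiblemem: write), so it is no longer
      ; treated as reading or clobbering ordinary memory such as %p.
      call void @llvm.aarch64.sme.zero.zt(i32 0)
      store i32 %v, ptr %p
      ret void
    }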
---
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 24 +++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 0f7e963f46e77..77ea0bcaa4b5f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3031,11 +3031,11 @@ let TargetPrefix = "aarch64" in {
 
   def int_aarch64_sme_write_lane_zt
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty],
-                            [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects]>;
+                            [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly]>;
 
   def int_aarch64_sme_write_zt
     : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_anyvector_ty],
-                            [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+                            [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrWriteMem]>;
 
   def int_aarch64_sme_zero : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>]>;
 
@@ -3851,50 +3851,50 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sve_sel_x4  : SVE2_VG4_Sel_Intrinsic;
 
   class SME_LDR_STR_ZT_Intrinsic
-    : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty]>;
+    : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty], [IntrInaccessibleMemOrArgMemOnly]>;
   def int_aarch64_sme_ldr_zt : SME_LDR_STR_ZT_Intrinsic;
   def int_aarch64_sme_str_zt : SME_LDR_STR_ZT_Intrinsic;
 
   //
   // Zero ZT0
   //
-  def int_aarch64_sme_zero_zt : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrWriteMem]>;
+  def int_aarch64_sme_zero_zt : DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrWriteMem]>;
 
   //
   // Lookup table expand one register
   //
   def int_aarch64_sme_luti2_lane_zt
    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
-                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
+                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
   def int_aarch64_sme_luti4_lane_zt
    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
-                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
+                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
 
   // Lookup table expand two registers
   //
   def int_aarch64_sme_luti2_lane_zt_x2
    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
-                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
+                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
   def int_aarch64_sme_luti4_lane_zt_x2
    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
-                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
-
+                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
+
   //
   // Lookup table expand four registers
   //
   def int_aarch64_sme_luti2_lane_zt_x4
    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
-                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
+                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
   def int_aarch64_sme_luti4_lane_zt_x4
    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
-                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;
+                           [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrInaccessibleMemOnly, IntrReadMem]>;
 
   def int_aarch64_sme_luti4_zt_x4
    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                            [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
-                           [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
+                           [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrReadMem]>;
 
   //
From c818ae739902f3c46466b67bd4d1bd4d09b38e84 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Thu, 3 Apr 2025 16:40:34 +0300 Subject: [PATCH 0522/1029] [BOLT] Gadget scanner: detect non-protected indirect calls (#131899) Implement the detection of non-protected indirect calls and branches similar to pac-ret scanner. --- bolt/include/bolt/Core/MCPlusBuilder.h | 10 + bolt/include/bolt/Passes/PAuthGadgetScanner.h | 6 +- bolt/include/bolt/Utils/CommandLineOpts.h | 4 +- bolt/lib/Passes/PAuthGadgetScanner.cpp | 37 +- bolt/lib/Rewrite/RewriteInstance.cpp | 32 +- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 27 + .../binary-analysis/AArch64/cmdline-args.test | 5 +- .../binary-analysis/AArch64/gs-pauth-calls.s | 782 ++++++++++++++++++ .../AArch64/gs-pauth-debug-output.s | 51 +- 9 files changed, 923 insertions(+), 31 deletions(-) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-calls.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 1458d36d4813a..bbef65700b2a5 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -577,6 +577,16 @@ class MCPlusBuilder { return getNoRegister(); } + /// Returns the register used as call destination, or no-register, if not + /// an indirect call. Sets IsAuthenticatedInternally if the instruction + /// accepts a signed pointer as its operand and authenticates it internally. + virtual MCPhysReg + getRegUsedAsCallDest(const MCInst &Inst, + bool &IsAuthenticatedInternally) const { + llvm_unreachable("not implemented"); + return getNoRegister(); + } + virtual bool isTerminator(const MCInst &Inst) const; virtual bool isNoop(const MCInst &Inst) const { diff --git a/bolt/include/bolt/Passes/PAuthGadgetScanner.h b/bolt/include/bolt/Passes/PAuthGadgetScanner.h index 700059b814ab9..622e6721dea55 100644 --- a/bolt/include/bolt/Passes/PAuthGadgetScanner.h +++ b/bolt/include/bolt/Passes/PAuthGadgetScanner.h @@ -248,6 +248,9 @@ struct FunctionAnalysisResult { }; class Analysis : public BinaryFunctionPass { + /// Only search for pac-ret violations. + bool PacRetGadgetsOnly; + void runOnFunction(BinaryFunction &Function, MCPlusBuilder::AllocatorIdTy AllocatorId); FunctionAnalysisResult findGadgets(BinaryFunction &BF, @@ -261,7 +264,8 @@ class Analysis : public BinaryFunctionPass { std::mutex AnalysisResultsMutex; public: - explicit Analysis() : BinaryFunctionPass(false) {} + explicit Analysis(bool PacRetGadgetsOnly) + : BinaryFunctionPass(false), PacRetGadgetsOnly(PacRetGadgetsOnly) {} const char *getName() const override { return "pauth-gadget-scanner"; } diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h index 19f8c6b2646d7..3de945f6a1507 100644 --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -81,9 +81,9 @@ extern llvm::cl::opt Verbosity; /// Return true if we should process all functions in the binary. 
 bool processAllFunctions();
 
-enum GadgetScannerKind { GS_PACRET, GS_ALL };
+enum GadgetScannerKind { GS_PACRET, GS_PAUTH, GS_ALL };
 
-extern llvm::cl::list<GadgetScannerKind> GadgetScannersToRun;
+extern llvm::cl::bits<GadgetScannerKind> GadgetScannersToRun;
 
 } // namespace opts
 
diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index e9940372f5c92..a3b320c545734 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -401,11 +401,11 @@ class PacRetAnalysis
 
 public:
   std::vector<MCInstReference>
-  getLastClobberingInsts(const MCInst Ret, BinaryFunction &BF,
-                         const ArrayRef<MCPhysReg> UsedDirtyRegs) const {
+  getLastClobberingInsts(const MCInst &Inst, BinaryFunction &BF,
+                         const ArrayRef<MCPhysReg> UsedDirtyRegs) {
     if (RegsToTrackInstsFor.empty())
       return {};
-    auto MaybeState = getStateAt(Ret);
+    auto MaybeState = getStateBefore(Inst);
     if (!MaybeState)
       llvm_unreachable("Expected State to be present");
     const State &S = *MaybeState;
@@ -453,6 +453,29 @@ shouldReportReturnGadget(const BinaryContext &BC, const MCInstReference &Inst,
   return std::make_shared<GadgetReport>(RetKind, Inst, RetReg);
 }
 
+static std::shared_ptr<Report>
+shouldReportCallGadget(const BinaryContext &BC, const MCInstReference &Inst,
+                       const State &S) {
+  static const GadgetKind CallKind("non-protected call found");
+  if (!BC.MIB->isCall(Inst) && !BC.MIB->isBranch(Inst))
+    return nullptr;
+
+  bool IsAuthenticated = false;
+  MCPhysReg DestReg = BC.MIB->getRegUsedAsCallDest(Inst, IsAuthenticated);
+  if (IsAuthenticated || DestReg == BC.MIB->getNoRegister())
+    return nullptr;
+
+  LLVM_DEBUG({
+    traceInst(BC, "Found call inst", Inst);
+    traceReg(BC, "Call destination reg", DestReg);
+    traceRegMask(BC, "SafeToDerefRegs", S.SafeToDerefRegs);
+  });
+  if (S.SafeToDerefRegs[DestReg])
+    return nullptr;
+
+  return std::make_shared<GadgetReport>(CallKind, Inst, DestReg);
+}
+
 FunctionAnalysisResult
 Analysis::findGadgets(BinaryFunction &BF,
                       MCPlusBuilder::AllocatorIdTy AllocatorId) {
@@ -469,7 +492,7 @@ Analysis::findGadgets(BinaryFunction &BF,
   for (BinaryBasicBlock &BB : BF) {
     for (int64_t I = 0, E = BB.size(); I < E; ++I) {
       MCInstReference Inst(&BB, I);
-      const State &S = *PRA.getStateAt(Inst);
+      const State &S = *PRA.getStateBefore(Inst);
 
       // If non-empty state was never propagated from the entry basic block
       // to Inst, assume it to be unreachable and report a warning.
@@ -481,6 +504,12 @@ Analysis::findGadgets(BinaryFunction &BF,
 
       if (auto Report = shouldReportReturnGadget(BC, Inst, S))
         Result.Diagnostics.push_back(Report);
+
+      if (PacRetGadgetsOnly)
+        continue;
+
+      if (auto Report = shouldReportCallGadget(BC, Inst, S))
+        Result.Diagnostics.push_back(Report);
     }
   }
   return Result;
 }
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index f204aa3eb8a38..3217dd4324bc7 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -247,12 +247,14 @@ static cl::opt<bool> WriteBoltInfoSection(
     "bolt-info", cl::desc("write bolt info section in the output binary"),
     cl::init(true), cl::Hidden, cl::cat(BoltOutputCategory));
 
-cl::list<GadgetScannerKind>
-    GadgetScannersToRun("scanners", cl::desc("which gadget scanners to run"),
-                        cl::values(clEnumValN(GS_PACRET, "pacret", "pac-ret"),
-                                   clEnumValN(GS_ALL, "all", "all")),
-                        cl::ZeroOrMore, cl::CommaSeparated,
-                        cl::cat(BinaryAnalysisCategory));
+cl::bits<GadgetScannerKind> GadgetScannersToRun(
+    "scanners", cl::desc("which gadget scanners to run"),
+    cl::values(
+        clEnumValN(GS_PACRET, "pacret",
+                   "pac-ret: return address protection (subset of \"pauth\")"),
+        clEnumValN(GS_PAUTH, "pauth", "All Pointer Authentication scanners"),
+        clEnumValN(GS_ALL, "all", "All implemented scanners")),
+    cl::ZeroOrMore, cl::CommaSeparated, cl::cat(BinaryAnalysisCategory));
 
 } // namespace opts
 
@@ -3539,12 +3541,18 @@ void RewriteInstance::runBinaryAnalyses() {
   // FIXME: add a pass that warns about which functions do not have CFG,
   // and therefore, analysis is most likely to be less accurate.
   using GSK = opts::GadgetScannerKind;
-  // if no command line option was given, act as if "all" was specified.
-  if (opts::GadgetScannersToRun.empty())
-    opts::GadgetScannersToRun.addValue(GSK::GS_ALL);
-  for (GSK ScannerToRun : opts::GadgetScannersToRun) {
-    if (ScannerToRun == GSK::GS_PACRET || ScannerToRun == GSK::GS_ALL)
-      Manager.registerPass(std::make_unique<PAuthGadgetScanner::Analysis>());
+  using PAuthScanner = PAuthGadgetScanner::Analysis;
+
+  // If no command line option was given, act as if "all" was specified.
+  bool RunAll = !opts::GadgetScannersToRun.getBits() ||
+                opts::GadgetScannersToRun.isSet(GSK::GS_ALL);
+
+  if (RunAll || opts::GadgetScannersToRun.isSet(GSK::GS_PAUTH)) {
+    Manager.registerPass(
+        std::make_unique<PAuthScanner>(/*OnlyPacRetChecks=*/false));
+  } else if (RunAll || opts::GadgetScannersToRun.isSet(GSK::GS_PACRET)) {
+    Manager.registerPass(
+        std::make_unique<PAuthScanner>(/*OnlyPacRetChecks=*/true));
   }
 
   BC->logBOLTErrorsAndQuitOnFatal(Manager.runPasses());
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 0fd127bfeba41..2a648baa4d514 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -277,6 +277,33 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     }
   }
 
+  MCPhysReg
+  getRegUsedAsCallDest(const MCInst &Inst,
+                       bool &IsAuthenticatedInternally) const override {
+    assert(isCall(Inst) || isBranch(Inst));
+    IsAuthenticatedInternally = false;
+
+    switch (Inst.getOpcode()) {
+    case AArch64::BR:
+    case AArch64::BLR:
+      return Inst.getOperand(0).getReg();
+    case AArch64::BRAA:
+    case AArch64::BRAB:
+    case AArch64::BRAAZ:
+    case AArch64::BRABZ:
+    case AArch64::BLRAA:
+    case AArch64::BLRAB:
+    case AArch64::BLRAAZ:
+    case AArch64::BLRABZ:
+      IsAuthenticatedInternally = true;
+      return Inst.getOperand(0).getReg();
+    default:
+      if (isIndirectCall(Inst) || isIndirectBranch(Inst))
+        llvm_unreachable("Unhandled indirect branch");
+      return getNoRegister();
+    }
+  }
+
   bool isADRP(const MCInst &Inst) const override {
     return Inst.getOpcode() == AArch64::ADRP;
   }
diff --git a/bolt/test/binary-analysis/AArch64/cmdline-args.test b/bolt/test/binary-analysis/AArch64/cmdline-args.test
index 1204d5b1289af..76f7c3ba0a1c7 100644
--- a/bolt/test/binary-analysis/AArch64/cmdline-args.test
+++ b/bolt/test/binary-analysis/AArch64/cmdline-args.test
@@ -33,7 +33,8 @@ HELP-EMPTY:
 HELP-NEXT: BinaryAnalysis options:
 HELP-EMPTY:
 HELP-NEXT: --scanners=<value> - which gadget scanners to run
-HELP-NEXT: =pacret - pac-ret
-HELP-NEXT: =all - all
+HELP-NEXT: =pacret - pac-ret: return address protection (subset of "pauth")
+HELP-NEXT: =pauth - All Pointer Authentication scanners
+HELP-NEXT: =all - All implemented scanners
 HELP-EMPTY:
 HELP-NEXT: Generic Options:
diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s b/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s
new file mode 100644
index 0000000000000..0f6c850583dda
--- /dev/null
+++ b/bolt/test/binary-analysis/AArch64/gs-pauth-calls.s
@@ -0,0 +1,782 @@
+// RUN: %clang %cflags -march=armv8.3-a %s -o %t.exe
+// RUN: llvm-bolt-binary-analysis --scanners=pacret %t.exe 2>&1 | FileCheck -check-prefix=PACRET %s
+// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s
+
+// PACRET-NOT: non-protected call found in function
+
+ .text
+
+ .globl callee
+ .type callee,@function
+callee:
+ ret
+ .size callee, .-callee
+
+ .globl good_direct_call
+ .type good_direct_call,@function
+good_direct_call:
+// CHECK-NOT: good_direct_call
+ paciasp
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ bl callee
+
+ ldp x29, x30, [sp], #16
+ autiasp
+ ret
+ .size good_direct_call, .-good_direct_call
+
+ .globl good_indirect_call_arg
+ .type good_indirect_call_arg,@function
+good_indirect_call_arg:
+// CHECK-NOT: good_indirect_call_arg
+ paciasp
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp + + autia x0, x1 + blr x0 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_arg, .-good_indirect_call_arg + + .globl good_indirect_call_mem + .type good_indirect_call_mem,@function +good_indirect_call_mem: +// CHECK-NOT: good_indirect_call_mem + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autia x16, x0 + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_mem, .-good_indirect_call_mem + + .globl good_indirect_call_arg_v83 + .type good_indirect_call_arg_v83,@function +good_indirect_call_arg_v83: +// CHECK-NOT: good_indirect_call_arg_v83 + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + blraa x0, x1 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_arg_v83, .-good_indirect_call_arg_v83 + + .globl good_indirect_call_mem_v83 + .type good_indirect_call_mem_v83,@function +good_indirect_call_mem_v83: +// CHECK-NOT: good_indirect_call_mem_v83 + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + blraa x16, x0 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_mem_v83, .-good_indirect_call_mem_v83 + + .globl bad_indirect_call_arg + .type bad_indirect_call_arg,@function +bad_indirect_call_arg: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_arg, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + blr x0 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_arg, .-bad_indirect_call_arg + + .globl bad_indirect_call_mem + .type bad_indirect_call_mem,@function +bad_indirect_call_mem: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_mem, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: paciasp +// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// CHECK-NEXT: {{[0-9a-f]+}}: mov x29, sp +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: {{[0-9a-f]+}}: autiasp +// CHECK-NEXT: {{[0-9a-f]+}}: ret + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_mem, .-bad_indirect_call_mem + + .globl bad_indirect_call_arg_clobber + .type bad_indirect_call_arg_clobber,@function +bad_indirect_call_arg_clobber: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_arg_clobber, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w0, w2 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: paciasp +// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! 
+// CHECK-NEXT: {{[0-9a-f]+}}: mov x29, sp +// CHECK-NEXT: {{[0-9a-f]+}}: autia x0, x1 +// CHECK-NEXT: {{[0-9a-f]+}}: mov w0, w2 +// CHECK-NEXT: {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: {{[0-9a-f]+}}: autiasp +// CHECK-NEXT: {{[0-9a-f]+}}: ret + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + autia x0, x1 + mov w0, w2 + blr x0 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_arg_clobber, .-bad_indirect_call_arg_clobber + + .globl bad_indirect_call_mem_clobber + .type bad_indirect_call_mem_clobber,@function +bad_indirect_call_mem_clobber: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_mem_clobber, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w16, w2 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: paciasp +// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// CHECK-NEXT: {{[0-9a-f]+}}: mov x29, sp +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: autia x16, x0 +// CHECK-NEXT: {{[0-9a-f]+}}: mov w16, w2 +// CHECK-NEXT: {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: {{[0-9a-f]+}}: autiasp +// CHECK-NEXT: {{[0-9a-f]+}}: ret + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autia x16, x0 + mov w16, w2 + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_mem_clobber, .-bad_indirect_call_mem_clobber + + .globl good_indirect_call_mem_chain_of_auts + .type good_indirect_call_mem_chain_of_auts,@function +good_indirect_call_mem_chain_of_auts: +// CHECK-NOT: good_indirect_call_mem_chain_of_auts + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autda x16, x1 + ldr x16, [x16] + autia x16, x0 + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_mem_chain_of_auts, .-good_indirect_call_mem_chain_of_auts + + .globl bad_indirect_call_mem_chain_of_auts + .type bad_indirect_call_mem_chain_of_auts,@function +bad_indirect_call_mem_chain_of_auts: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_mem_chain_of_auts, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x16] +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: paciasp +// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// CHECK-NEXT: {{[0-9a-f]+}}: mov x29, sp +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: autda x16, x1 +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x16] +// CHECK-NEXT: {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: {{[0-9a-f]+}}: autiasp +// CHECK-NEXT: {{[0-9a-f]+}}: ret + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autda x16, x1 + ldr x16, [x16] + // Missing AUT of x16. The fact that x16 was authenticated above has nothing to do with it. + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_mem_chain_of_auts, .-bad_indirect_call_mem_chain_of_auts + +// Multi-BB test cases. 
+// +// Positive ("good") test cases are designed so that the register is made safe +// in one BB and used in the other. Negative ("bad") ones are designed so that +// there are two predecessors, one of them ends with the register in a safe +// state and the other ends with that register being unsafe. + + .globl good_indirect_call_arg_multi_bb + .type good_indirect_call_arg_multi_bb,@function +good_indirect_call_arg_multi_bb: +// CHECK-NOT: good_indirect_call_arg_multi_bb + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + autia x0, x1 + cbz x2, 1f + blr x0 +1: + ldr x1, [x0] // prevent authentication oracle + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_arg_multi_bb, .-good_indirect_call_arg_multi_bb + + .globl good_indirect_call_mem_multi_bb + .type good_indirect_call_mem_multi_bb,@function +good_indirect_call_mem_multi_bb: +// CHECK-NOT: good_indirect_call_mem_multi_bb + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autia x16, x0 + cbz x2, 1f + blr x16 +1: + ldr w0, [x16] // prevent authentication oracle + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_mem_multi_bb, .-good_indirect_call_mem_multi_bb + + .globl bad_indirect_call_arg_multi_bb + .type bad_indirect_call_arg_multi_bb,@function +bad_indirect_call_arg_multi_bb: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_arg_multi_bb, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: The 0 instructions that write to the affected registers after any authentication are: + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + cbz x2, 1f + autia x0, x1 +1: + blr x0 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_arg_multi_bb, .-bad_indirect_call_arg_multi_bb + + .globl bad_indirect_call_mem_multi_bb + .type bad_indirect_call_mem_multi_bb,@function +bad_indirect_call_mem_multi_bb: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_mem_multi_bb, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x0] + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + cbz x2, 1f + autia x16, x1 +1: + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_mem_multi_bb, .-bad_indirect_call_mem_multi_bb + + .globl bad_indirect_call_arg_clobber_multi_bb + .type bad_indirect_call_arg_clobber_multi_bb,@function +bad_indirect_call_arg_clobber_multi_bb: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_arg_clobber_multi_bb, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w0, w3 + paciasp + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + + autia x0, x1 + cbz x2, 1f + mov w0, w3 +1: + blr x0 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_arg_clobber_multi_bb, .-bad_indirect_call_arg_clobber_multi_bb + + .globl bad_indirect_call_mem_clobber_multi_bb + .type bad_indirect_call_mem_clobber_multi_bb,@function +bad_indirect_call_mem_clobber_multi_bb: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_mem_clobber_multi_bb, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w16, w2 + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autia x16, x0 + cbz x2, 1f + mov w16, w2 +1: + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_mem_clobber_multi_bb, .-bad_indirect_call_mem_clobber_multi_bb + + .globl good_indirect_call_mem_chain_of_auts_multi_bb + .type good_indirect_call_mem_chain_of_auts_multi_bb,@function +good_indirect_call_mem_chain_of_auts_multi_bb: +// CHECK-NOT: good_indirect_call_mem_chain_of_auts_multi_bb + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autda x16, x1 + ldr x16, [x16] + autia x16, x0 + cbz x2, 1f + blr x16 +1: + ldr w0, [x16] // prevent authentication oracle + + ldp x29, x30, [sp], #16 + autiasp + ret + .size good_indirect_call_mem_chain_of_auts_multi_bb, .-good_indirect_call_mem_chain_of_auts_multi_bb + + .globl bad_indirect_call_mem_chain_of_auts_multi_bb + .type bad_indirect_call_mem_chain_of_auts_multi_bb,@function +bad_indirect_call_mem_chain_of_auts_multi_bb: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_call_mem_chain_of_auts_multi_bb, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x16] + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + autda x16, x1 + ldr x16, [x16] + cbz x2, 1f + autia x16, x0 +1: + blr x16 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size bad_indirect_call_mem_chain_of_auts_multi_bb, .-bad_indirect_call_mem_chain_of_auts_multi_bb + +// Test tail calls. To somewhat decrease the number of test cases and not +// duplicate all of the above, only implement "mem" variant of test cases and +// mostly test negative cases. 
+ + .globl good_direct_tailcall + .type good_direct_tailcall,@function +good_direct_tailcall: +// CHECK-NOT: good_direct_tailcall + b callee + .size good_direct_tailcall, .-good_direct_tailcall + + .globl good_indirect_tailcall_mem + .type good_indirect_tailcall_mem,@function +good_indirect_tailcall_mem: +// CHECK-NOT: good_indirect_tailcall_mem + ldr x16, [x0] + autia x16, x0 + br x16 + .size good_indirect_tailcall_mem, .-good_indirect_tailcall_mem + + .globl good_indirect_tailcall_mem_v83 + .type good_indirect_tailcall_mem_v83,@function +good_indirect_tailcall_mem_v83: +// CHECK-NOT: good_indirect_tailcall_mem_v83 + ldr x16, [x0] + braa x16, x0 + .size good_indirect_tailcall_mem_v83, .-good_indirect_tailcall_mem_v83 + + .globl bad_indirect_tailcall_mem + .type bad_indirect_tailcall_mem,@function +bad_indirect_tailcall_mem: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_tailcall_mem, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: br x16 + ldr x16, [x0] + br x16 + .size bad_indirect_tailcall_mem, .-bad_indirect_tailcall_mem + + .globl bad_indirect_tailcall_mem_clobber + .type bad_indirect_tailcall_mem_clobber,@function +bad_indirect_tailcall_mem_clobber: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_tailcall_mem_clobber, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w16, w2 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: autia x16, x0 +// CHECK-NEXT: {{[0-9a-f]+}}: mov w16, w2 +// CHECK-NEXT: {{[0-9a-f]+}}: br x16 + ldr x16, [x0] + autia x16, x0 + mov w16, w2 + br x16 + .size bad_indirect_tailcall_mem_clobber, .-bad_indirect_tailcall_mem_clobber + + .globl bad_indirect_tailcall_mem_chain_of_auts + .type bad_indirect_tailcall_mem_chain_of_auts,@function +bad_indirect_tailcall_mem_chain_of_auts: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_tailcall_mem_chain_of_auts, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x16] +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: autda x16, x1 +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x16] +// CHECK-NEXT: {{[0-9a-f]+}}: br x16 + ldr x16, [x0] + autda x16, x1 + ldr x16, [x16] + // Missing AUT of x16. The fact that x16 was authenticated above has nothing to do with it. 
+ br x16 + .size bad_indirect_tailcall_mem_chain_of_auts, .-bad_indirect_tailcall_mem_chain_of_auts + + .globl bad_indirect_tailcall_mem_multi_bb + .type bad_indirect_tailcall_mem_multi_bb,@function +bad_indirect_tailcall_mem_multi_bb: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_tailcall_mem_multi_bb, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x0] + ldr x16, [x0] + cbz x2, 1f + autia x16, x1 +1: + br x16 + .size bad_indirect_tailcall_mem_multi_bb, .-bad_indirect_tailcall_mem_multi_bb + + .globl bad_indirect_tailcall_mem_clobber_multi_bb + .type bad_indirect_tailcall_mem_clobber_multi_bb,@function +bad_indirect_tailcall_mem_clobber_multi_bb: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_indirect_tailcall_mem_clobber_multi_bb, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: mov w16, w2 + ldr x16, [x0] + autia x16, x0 + cbz x2, 1f + mov w16, w2 +1: + br x16 + .size bad_indirect_tailcall_mem_clobber_multi_bb, .-bad_indirect_tailcall_mem_clobber_multi_bb + +// Test that calling a function is considered as invalidating safety of every +// register. Note that we only have to consider "returning" function calls +// (via branch-with-link), but both direct and indirect variants. +// Checking different registers: +// * x2 - function argument +// * x8 - indirect result location +// * x10 - temporary +// * x16 - intra-procedure-call scratch register +// * x18 - platform register +// * x20 - callee-saved register + + .globl direct_call_invalidates_safety + .type direct_call_invalidates_safety,@function +direct_call_invalidates_safety: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function direct_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x2 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: bl callee +// CHECK-LABEL: GS-PAUTH: non-protected call found in function direct_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x8 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: bl callee +// CHECK-LABEL: GS-PAUTH: non-protected call found in function direct_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x10 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: bl callee +// CHECK-LABEL: GS-PAUTH: non-protected call found in function direct_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. 
{{[0-9a-f]+}}: bl callee +// CHECK-LABEL: GS-PAUTH: non-protected call found in function direct_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x18 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: bl callee +// CHECK-LABEL: GS-PAUTH: non-protected call found in function direct_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x20 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: bl callee + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + mov x2, x0 + autiza x2 + bl callee + blr x2 + + mov x8, x0 + autiza x8 + bl callee + blr x8 + + mov x10, x0 + autiza x10 + bl callee + blr x10 + + mov x16, x0 + autiza x16 + bl callee + blr x16 + + mov x18, x0 + autiza x18 + bl callee + blr x18 + + mov x20, x0 + autiza x20 + bl callee + blr x20 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size direct_call_invalidates_safety, .-direct_call_invalidates_safety + + .globl indirect_call_invalidates_safety + .type indirect_call_invalidates_safety,@function +indirect_call_invalidates_safety: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function indirect_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x2 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: blr x2 +// Check that only one error is reported per pair of BLRs. +// CHECK-NOT: The instruction is {{[0-9a-f]+}}: blr x2 + +// CHECK-LABEL: GS-PAUTH: non-protected call found in function indirect_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x8 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: blr x8 +// CHECK-NOT: The instruction is {{[0-9a-f]+}}: blr x8 + +// CHECK-LABEL: GS-PAUTH: non-protected call found in function indirect_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x10 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: blr x10 +// CHECK-NOT: The instruction is {{[0-9a-f]+}}: blr x10 + +// CHECK-LABEL: GS-PAUTH: non-protected call found in function indirect_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: blr x16 +// CHECK-NOT: The instruction is {{[0-9a-f]+}}: blr x16 + +// CHECK-LABEL: GS-PAUTH: non-protected call found in function indirect_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x18 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. 
{{[0-9a-f]+}}: blr x18 +// CHECK-NOT: The instruction is {{[0-9a-f]+}}: blr x18 + +// CHECK-LABEL: GS-PAUTH: non-protected call found in function indirect_call_invalidates_safety, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x20 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: blr x20 +// CHECK-NOT: The instruction is {{[0-9a-f]+}}: blr x20 + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + mov x2, x0 + autiza x2 + blr x2 // protected call, but makes x2 unsafe + blr x2 // unprotected call + + mov x8, x0 + autiza x8 + blr x8 // protected call, but makes x8 unsafe + blr x8 // unprotected call + + mov x10, x0 + autiza x10 + blr x10 // protected call, but makes x10 unsafe + blr x10 // unprotected call + + mov x16, x0 + autiza x16 + blr x16 // protected call, but makes x16 unsafe + blr x16 // unprotected call + + mov x18, x0 + autiza x18 + blr x18 // protected call, but makes x18 unsafe + blr x18 // unprotected call + + mov x20, x0 + autiza x20 + blr x20 // protected call, but makes x20 unsafe + blr x20 // unprotected call + + ldp x29, x30, [sp], #16 + autiasp + ret + .size indirect_call_invalidates_safety, .-indirect_call_invalidates_safety + +// Test that fused auth+use Armv8.3 instruction do not mark register as safe. + + .globl blraa_no_mark_safe + .type blraa_no_mark_safe,@function +blraa_no_mark_safe: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function blraa_no_mark_safe, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: blraa x0, x1 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: paciasp +// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// CHECK-NEXT: {{[0-9a-f]+}}: mov x29, sp +// CHECK-NEXT: {{[0-9a-f]+}}: blraa x0, x1 +// CHECK-NEXT: {{[0-9a-f]+}}: blr x0 +// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: {{[0-9a-f]+}}: autiasp +// CHECK-NEXT: {{[0-9a-f]+}}: ret + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + blraa x0, x1 // safe, no write-back, clobbers everything + blr x0 // unsafe + + ldp x29, x30, [sp], #16 + autiasp + ret + .size blraa_no_mark_safe, .-blraa_no_mark_safe + +// Check that the correct set of registers is used to compute the set of last +// writing instructions: both x16 and x17 are tracked in this function, but +// only one particular register is used to compute the set of clobbering +// instructions in each report. + + .globl last_insts_writing_to_reg + .type last_insts_writing_to_reg,@function +last_insts_writing_to_reg: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function last_insts_writing_to_reg, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: paciasp +// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! 
+// CHECK-NEXT: {{[0-9a-f]+}}: mov x29, sp +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x17, [x1] +// CHECK-NEXT: {{[0-9a-f]+}}: blr x17 +// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: {{[0-9a-f]+}}: autiasp +// CHECK-NEXT: {{[0-9a-f]+}}: ret +// CHECK-LABEL: GS-PAUTH: non-protected call found in function last_insts_writing_to_reg, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: blr x17 +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: ldr x17, [x1] +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: paciasp +// CHECK-NEXT: {{[0-9a-f]+}}: stp x29, x30, [sp, #-0x10]! +// CHECK-NEXT: {{[0-9a-f]+}}: mov x29, sp +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x16, [x0] +// CHECK-NEXT: {{[0-9a-f]+}}: blr x16 +// CHECK-NEXT: {{[0-9a-f]+}}: ldr x17, [x1] +// CHECK-NEXT: {{[0-9a-f]+}}: blr x17 +// CHECK-NEXT: {{[0-9a-f]+}}: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: {{[0-9a-f]+}}: autiasp +// CHECK-NEXT: {{[0-9a-f]+}}: ret + paciasp + stp x29, x30, [sp, #-16]! + mov x29, sp + + ldr x16, [x0] + blr x16 + ldr x17, [x1] + blr x17 + + ldp x29, x30, [sp], #16 + autiasp + ret + .size last_insts_writing_to_reg, .-last_insts_writing_to_reg + + .globl main + .type main,@function +main: + mov x0, 0 + ret + .size main, .-main diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s index 30b70b060b94b..b271cda9da62f 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-debug-output.s @@ -3,6 +3,8 @@ // RUN: %clang %cflags -march=armv8.3-a %s -o %t.exe // RUN: llvm-bolt-binary-analysis --scanners=pacret -no-threads \ // RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck %s +// RUN: llvm-bolt-binary-analysis --scanners=pauth -no-threads \ +// RUN: -debug-only bolt-pauth-scanner %t.exe 2>&1 | FileCheck -check-prefixes=CHECK,PAUTH %s // Check the debug output generated by PAuth gadget scanner to make sure the // that output is kept meaningful and to provide an overview of what happens @@ -12,8 +14,12 @@ .type simple,@function simple: paciasp + stp x29, x30, [sp, #-0x10]! b 1f 1: + autiza x0 + blr x0 + ldp x29, x30, [sp], #0x10 autiasp ret .size simple, .-simple @@ -25,16 +31,20 @@ simple: // ... // CHECK: BB Layout : [[BB0:[0-9a-zA-Z.]+]], [[BB1:[0-9a-zA-Z.]+]] // CHECK-NEXT: } -// CHECK-NEXT: [[BB0]] (2 instructions, align : 1) +// CHECK-NEXT: [[BB0]] (3 instructions, align : 1) // CHECK-NEXT: Entry Point // CHECK-NEXT: 00000000: paciasp -// CHECK-NEXT: 00000004: b [[BB1]] +// CHECK-NEXT: 00000004: stp x29, x30, [sp, #-0x10]! +// CHECK-NEXT: 00000008: b [[BB1]] // CHECK-NEXT: Successors: [[BB1]] // CHECK-EMPTY: -// CHECK-NEXT: [[BB1]] (2 instructions, align : 1) +// CHECK-NEXT: [[BB1]] (5 instructions, align : 1) // CHECK-NEXT: Predecessors: [[BB0]] -// CHECK-NEXT: 00000008: autiasp -// CHECK-NEXT: 0000000c: ret +// CHECK-NEXT: 0000000c: autiza x0 +// CHECK-NEXT: 00000010: blr x0 +// CHECK-NEXT: 00000014: ldp x29, x30, [sp], #0x10 +// CHECK-NEXT: 00000018: autiasp +// CHECK-NEXT: 0000001c: ret // CHECK-EMPTY: // CHECK-NEXT: DWARF CFI Instructions: // CHECK-NEXT: @@ -42,12 +52,20 @@ simple: // CHECK-EMPTY: // CHECK-NEXT: PacRetAnalysis::ComputeNext( hint #25, pacret-state) // CHECK-NEXT: .. 
result: (pacret-state) +// CHECK-NEXT: PacRetAnalysis::ComputeNext( stp x29, x30, [sp, #-0x10]!, pacret-state) +// CHECK-NEXT: .. result: (pacret-state) // CHECK-NEXT: PacRetAnalysis::ComputeNext( b [[BB1]], pacret-state) // CHECK-NEXT: .. result: (pacret-state) // CHECK-NEXT: PacRetAnalysis::Confluence( // CHECK-NEXT: State 1: pacret-state // CHECK-NEXT: State 2: pacret-state) // CHECK-NEXT: merged state: pacret-state +// CHECK-NEXT: PacRetAnalysis::ComputeNext( autiza x0, pacret-state) +// CHECK-NEXT: .. result: (pacret-state) +// CHECK-NEXT: PacRetAnalysis::ComputeNext( blr x0, pacret-state) +// CHECK-NEXT: .. result: (pacret-state) +// CHECK-NEXT: PacRetAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, pacret-state) +// CHECK-NEXT: .. result: (pacret-state) // CHECK-NEXT: PacRetAnalysis::ComputeNext( hint #29, pacret-state) // CHECK-NEXT: .. result: (pacret-state) // CHECK-NEXT: PacRetAnalysis::ComputeNext( ret x30, pacret-state) @@ -56,6 +74,12 @@ simple: // CHECK-NEXT: State 1: pacret-state // CHECK-NEXT: State 2: pacret-state) // CHECK-NEXT: merged state: pacret-state +// CHECK-NEXT: PacRetAnalysis::ComputeNext( autiza x0, pacret-state) +// CHECK-NEXT: .. result: (pacret-state) +// CHECK-NEXT: PacRetAnalysis::ComputeNext( blr x0, pacret-state) +// CHECK-NEXT: .. result: (pacret-state) +// CHECK-NEXT: PacRetAnalysis::ComputeNext( ldp x29, x30, [sp], #0x10, pacret-state) +// CHECK-NEXT: .. result: (pacret-state) // CHECK-NEXT: PacRetAnalysis::ComputeNext( hint #29, pacret-state) // CHECK-NEXT: .. result: (pacret-state) // CHECK-NEXT: PacRetAnalysis::ComputeNext( ret x30, pacret-state) @@ -67,21 +91,28 @@ simple: // ... // CHECK: BB Layout : [[BB0]], [[BB1]] // CHECK-NEXT: } -// CHECK-NEXT: [[BB0]] (2 instructions, align : 1) +// CHECK-NEXT: [[BB0]] (3 instructions, align : 1) // CHECK-NEXT: Entry Point // CHECK-NEXT: 00000000: paciasp # PacRetAnalysis: pacret-state -// CHECK-NEXT: 00000004: b [[BB1]] # PacRetAnalysis: pacret-state +// CHECK-NEXT: 00000004: stp x29, x30, [sp, #-0x10]! # PacRetAnalysis: pacret-state +// CHECK-NEXT: 00000008: b [[BB1]] # PacRetAnalysis: pacret-state // CHECK-NEXT: Successors: [[BB1]] // CHECK-EMPTY: -// CHECK-NEXT: [[BB1]] (2 instructions, align : 1) +// CHECK-NEXT: [[BB1]] (5 instructions, align : 1) // CHECK-NEXT: Predecessors: [[BB0]] -// CHECK-NEXT: 00000008: autiasp # PacRetAnalysis: pacret-state -// CHECK-NEXT: 0000000c: ret # PacRetAnalysis: pacret-state +// CHECK-NEXT: 0000000c: autiza x0 # PacRetAnalysis: pacret-state +// CHECK-NEXT: 00000010: blr x0 # PacRetAnalysis: pacret-state +// CHECK-NEXT: 00000014: ldp x29, x30, [sp], #0x10 # PacRetAnalysis: pacret-state +// CHECK-NEXT: 00000018: autiasp # PacRetAnalysis: pacret-state +// CHECK-NEXT: 0000001c: ret # PacRetAnalysis: pacret-state // CHECK-EMPTY: // CHECK-NEXT: DWARF CFI Instructions: // CHECK-NEXT: // CHECK-NEXT: End of Function "simple" // CHECK-EMPTY: +// PAUTH-NEXT: Found call inst: 00000000: blr x0 # PacRetAnalysis: pacret-state +// PAUTH-NEXT: Call destination reg: X0 +// PAUTH-NEXT: SafeToDerefRegs: W0 X0 W0_HI{{[ \t]*$}} // CHECK-NEXT: Found RET inst: 00000000: ret # PacRetAnalysis: pacret-state // CHECK-NEXT: RetReg: LR // CHECK-NEXT: Authenticated reg: (none) From 9df324e90b32f91e0b2866dffb78b0be0db4f37c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Apr 2025 14:41:24 +0100 Subject: [PATCH 0523/1029] [X86] Add growShuffleMask helper to grow the shuffle mask for a larger value type. NFC. 
 (#134243)

Prep work for #133947
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 28 ++++++++++++++++++-------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8271d9c486650..d1be19539b642 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3941,6 +3941,25 @@ static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
   return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
 }
 
+// Helper to grow the shuffle mask for a larger value type.
+// NOTE: This is different to scaleShuffleElements, which rescales the mask
+// within the same value type size.
+// e.g. with Scale = 2, <0,3,4,7> grows to <0,3,8,11,-1,-1,-1,-1> (-1 = undef).
+static void growShuffleMask(ArrayRef<int> SrcMask,
+                            SmallVectorImpl<int> &DstMask,
+                            unsigned SrcSizeInBits, unsigned DstSizeInBits) {
+  assert(DstMask.empty() && "Expected an empty shuffle mask");
+  assert((DstSizeInBits % SrcSizeInBits) == 0 && "Illegal shuffle scale");
+  unsigned Scale = DstSizeInBits / SrcSizeInBits;
+  unsigned NumSrcElts = SrcMask.size();
+  DstMask.assign(SrcMask.begin(), SrcMask.end());
+  for (int &M : DstMask) {
+    if (M < 0)
+      continue;
+    M = (M % NumSrcElts) + ((M / NumSrcElts) * Scale * NumSrcElts);
+  }
+  DstMask.append((Scale - 1) * NumSrcElts, SM_SentinelUndef);
+}
+
 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
   return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -40456,19 +40475,13 @@ static SDValue combineX86ShuffleChainWithExtract(
   }
 
   // Bail if we fail to find a source larger than the existing root.
-  unsigned Scale = WideSizeInBits / RootSizeInBits;
   if (WideSizeInBits <= RootSizeInBits ||
       (WideSizeInBits % RootSizeInBits) != 0)
     return SDValue();
 
   // Create new mask for larger type.
-  SmallVector<int, 64> WideMask(BaseMask);
-  for (int &M : WideMask) {
-    if (M < 0)
-      continue;
-    M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
-  }
-  WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
+  SmallVector<int, 64> WideMask;
+  growShuffleMask(BaseMask, WideMask, RootSizeInBits, WideSizeInBits);
 
   // Attempt to peek through inputs and adjust mask when we extract from an
   // upper subvector.

From a77d8077815627d188ae26fda6e88f0d0d61c990 Mon Sep 17 00:00:00 2001
From: Steven Perron
Date: Thu, 3 Apr 2025 09:44:07 -0400
Subject: [PATCH 0524/1029] [SPIRV] Add spirv.VulkanBuffer types to the backend
 (#133475)

Adds code to expand the `llvm.spv.resource.handlefrombinding` and
`llvm.spv.resource.getpointer` intrinsics when the resource type is
`spirv.VulkanBuffer`. It gets expanded as a storage buffer or uniform
buffer depending on the storage class used.

This implements part of
https://github.com/llvm/wg-hlsl/blob/main/proposals/0018-spirv-resource-representation.md.
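For orientation: the new type carries one type parameter (the buffer contents) and two integer parameters (the SPIR-V storage class and a writability flag). A minimal sketch of building such a type through LLVM's generic TargetExtType API follows; this is illustrative only, the helper name is invented, and storage class 12 is StorageBuffer, matching the test added by this patch.

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    // Builds target("spirv.VulkanBuffer", [0 x i32], 12, 1): a runtime-sized
    // array of i32 in the StorageBuffer storage class (12), writable (1).
    static Type *getRWStructuredBufferTy(LLVMContext &Ctx) {
      Type *Contents = ArrayType::get(Type::getInt32Ty(Ctx), /*NumElements=*/0);
      return TargetExtType::get(Ctx, "spirv.VulkanBuffer", {Contents},
                                {/*StorageClass=*/12, /*IsWritable=*/1});
    }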
--- llvm/docs/SPIRVUsage.rst | 30 ++-- .../SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 3 + llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 18 +++ llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 23 ++- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 96 +++++++++--- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 5 + llvm/lib/Target/SPIRV/SPIRVIRMapping.h | 8 + .../Target/SPIRV/SPIRVInstructionSelector.cpp | 145 +++++++++++------- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 16 +- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 33 +++- llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 26 ++++ llvm/lib/Target/SPIRV/SPIRVUtils.h | 13 ++ .../SPIRV/hlsl-resources/StructuredBuffer.ll | 90 +++++++++++ 13 files changed, 404 insertions(+), 102 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index d3d9fbb3ba294..f58587314e6b5 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -243,19 +243,20 @@ using target extension types and are represented as follows: .. table:: SPIR-V Opaque Types - ================== ====================== =========================================================================================== - SPIR-V Type LLVM type name LLVM type arguments - ================== ====================== =========================================================================================== - OpTypeImage ``spirv.Image`` sampled type, dimensionality, depth, arrayed, MS, sampled, image format, [access qualifier] - OpTypeSampler ``spirv.Sampler`` (none) - OpTypeSampledImage ``spirv.SampledImage`` sampled type, dimensionality, depth, arrayed, MS, sampled, image format, [access qualifier] - OpTypeEvent ``spirv.Event`` (none) - OpTypeDeviceEvent ``spirv.DeviceEvent`` (none) - OpTypeReserveId ``spirv.ReserveId`` (none) - OpTypeQueue ``spirv.Queue`` (none) - OpTypePipe ``spirv.Pipe`` access qualifier - OpTypePipeStorage ``spirv.PipeStorage`` (none) - ================== ====================== =========================================================================================== + ================== ======================= =========================================================================================== + SPIR-V Type LLVM type name LLVM type arguments + ================== ======================= =========================================================================================== + OpTypeImage ``spirv.Image`` sampled type, dimensionality, depth, arrayed, MS, sampled, image format, [access qualifier] + OpTypeSampler ``spirv.Sampler`` (none) + OpTypeSampledImage ``spirv.SampledImage`` sampled type, dimensionality, depth, arrayed, MS, sampled, image format, [access qualifier] + OpTypeEvent ``spirv.Event`` (none) + OpTypeDeviceEvent ``spirv.DeviceEvent`` (none) + OpTypeReserveId ``spirv.ReserveId`` (none) + OpTypeQueue ``spirv.Queue`` (none) + OpTypePipe ``spirv.Pipe`` access qualifier + OpTypePipeStorage ``spirv.PipeStorage`` (none) + NA ``spirv.VulkanBuffer`` ElementType, StorageClass, IsWriteable + ================== ======================= =========================================================================================== All integer arguments take the same value as they do in their `corresponding SPIR-V instruction `_. @@ -266,6 +267,9 @@ parameters of its underlying image type, so that a sampled image for the previous type has the representation ``target("spirv.SampledImage, void, 1, 1, 0, 0, 0, 0, 0)``. 
+See `wg-hlsl proposal 0018
+<https://github.com/llvm/wg-hlsl/blob/main/proposals/0018-spirv-resource-representation.md>`_
+for details on ``spirv.VulkanBuffer``.
+
 .. _inline-spirv-types:
 
 Inline SPIR-V Types
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index cd65985a4229c..e559aa2483f26 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -144,6 +144,9 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
       printRemainingVariableOps(MI, NumFixedOps, OS, false, true);
       break;
     }
+    case SPIRV::OpMemberDecorate:
+      printRemainingVariableOps(MI, NumFixedOps, OS);
+      break;
     case SPIRV::OpExecutionMode:
     case SPIRV::OpExecutionModeId:
     case SPIRV::OpLoopMerge: {
diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
index 334580fac73b4..e3ba0fb80979f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp
@@ -3090,6 +3090,22 @@ static SPIRVType *getInlineSpirvType(const TargetExtType *ExtensionType,
       Operands);
 }
 
+static SPIRVType *getVulkanBufferType(const TargetExtType *ExtensionType,
+                                      MachineIRBuilder &MIRBuilder,
+                                      SPIRVGlobalRegistry *GR) {
+  assert(ExtensionType->getNumTypeParameters() == 1 &&
+         "Vulkan buffers have exactly one type parameter: the type of the "
+         "buffer.");
+  assert(ExtensionType->getNumIntParameters() == 2 &&
+         "Vulkan buffers have 2 integer parameters: storage class and "
+         "whether it is writable.");
+
+  auto *T = ExtensionType->getTypeParameter(0);
+  auto SC = static_cast<SPIRV::StorageClass::StorageClass>(
+      ExtensionType->getIntParameter(0));
+  bool IsWritable = ExtensionType->getIntParameter(1);
+  return GR->getOrCreateVulkanBufferType(MIRBuilder, T, SC, IsWritable);
+}
+
 namespace SPIRV {
 TargetExtType *parseBuiltinTypeNameToTargetExtType(std::string TypeName,
                                                    LLVMContext &Context) {
@@ -3165,6 +3181,8 @@ SPIRVType *lowerBuiltinType(const Type *OpaqueType,
   SPIRVType *TargetType;
   if (Name == "spirv.Type") {
     TargetType = getInlineSpirvType(BuiltinType, MIRBuilder, GR);
+  } else if (Name == "spirv.VulkanBuffer") {
+    TargetType = getVulkanBufferType(BuiltinType, MIRBuilder, GR);
   } else {
     // Lookup the demangled builtin type in the TableGen records.
const SPIRV::BuiltinType *TypeRecord = SPIRV::lookupBuiltinType(Name); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 0a6a54b4a2f67..68b69fe6f62b6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -670,13 +670,22 @@ Type *SPIRVEmitIntrinsics::deduceElementTypeHelper( auto *II = dyn_cast(I); if (II && II->getIntrinsicID() == Intrinsic::spv_resource_getpointer) { - auto *ImageType = cast(II->getOperand(0)->getType()); - assert(ImageType->getTargetExtName() == "spirv.Image"); - (void)ImageType; - if (II->hasOneUse()) { - auto *U = *II->users().begin(); - Ty = cast(U)->getAccessType(); - assert(Ty && "Unable to get type for resource pointer."); + auto *HandleType = cast(II->getOperand(0)->getType()); + if (HandleType->getTargetExtName() == "spirv.Image") { + if (II->hasOneUse()) { + auto *U = *II->users().begin(); + Ty = cast(U)->getAccessType(); + assert(Ty && "Unable to get type for resource pointer."); + } + } else if (HandleType->getTargetExtName() == "spirv.VulkanBuffer") { + // This call is supposed to index into an array + Ty = HandleType->getTypeParameter(0); + assert(Ty->isArrayTy() && + "spv_resource_getpointer indexes into an array, so the type of " + "the buffer should be an array."); + Ty = Ty->getArrayElementType(); + } else { + llvm_unreachable("Unknown handle type for spv_resource_getpointer."); } } else if (Function *CalledF = CI->getCalledFunction()) { std::string DemangledName = diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 2c167ac226dea..60ec1c9f15a0c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -762,23 +762,25 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( static std::string GetSpirvImageTypeName(const SPIRVType *Type, MachineIRBuilder &MIRBuilder, - const std::string &Prefix); + const std::string &Prefix, + SPIRVGlobalRegistry &GR); static std::string buildSpirvTypeName(const SPIRVType *Type, - MachineIRBuilder &MIRBuilder) { + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry &GR) { switch (Type->getOpcode()) { case SPIRV::OpTypeSampledImage: { - return GetSpirvImageTypeName(Type, MIRBuilder, "sampled_image_"); + return GetSpirvImageTypeName(Type, MIRBuilder, "sampled_image_", GR); } case SPIRV::OpTypeImage: { - return GetSpirvImageTypeName(Type, MIRBuilder, "image_"); + return GetSpirvImageTypeName(Type, MIRBuilder, "image_", GR); } case SPIRV::OpTypeArray: { MachineRegisterInfo *MRI = MIRBuilder.getMRI(); Register ElementTypeReg = Type->getOperand(1).getReg(); auto *ElementType = MRI->getUniqueVRegDef(ElementTypeReg); uint32_t ArraySize = getArrayComponentCount(MRI, Type); - return (buildSpirvTypeName(ElementType, MIRBuilder) + Twine("[") + + return (buildSpirvTypeName(ElementType, MIRBuilder, GR) + Twine("[") + Twine(ArraySize) + Twine("]")) .str(); } @@ -790,6 +792,22 @@ static std::string buildSpirvTypeName(const SPIRVType *Type, if (Type->getOperand(2).getImm()) return ("i" + Twine(Type->getOperand(1).getImm())).str(); return ("u" + Twine(Type->getOperand(1).getImm())).str(); + case SPIRV::OpTypePointer: { + uint32_t StorageClass = GR.getPointerStorageClass(Type); + SPIRVType *PointeeType = GR.getPointeeType(Type); + return ("p_" + Twine(StorageClass) + Twine("_") + + buildSpirvTypeName(PointeeType, MIRBuilder, GR)) + .str(); + } + case SPIRV::OpTypeStruct: { + std::string TypeName = "{"; + for 
(uint32_t I = 2; I < Type->getNumOperands(); ++I) {
+      SPIRVType *MemberType =
+          GR.getSPIRVTypeForVReg(Type->getOperand(I).getReg());
+      TypeName += '_' + buildSpirvTypeName(MemberType, MIRBuilder, GR);
+    }
+    return TypeName + "}";
+  }
   default:
     llvm_unreachable("Trying to the the name of an unknown type.");
   }
 }
@@ -797,10 +815,12 @@ static std::string buildSpirvTypeName(const SPIRVType *Type,
 
 static std::string GetSpirvImageTypeName(const SPIRVType *Type,
                                          MachineIRBuilder &MIRBuilder,
-                                         const std::string &Prefix) {
+                                         const std::string &Prefix,
+                                         SPIRVGlobalRegistry &GR) {
   Register SampledTypeReg = Type->getOperand(1).getReg();
   auto *SampledType = MIRBuilder.getMRI()->getUniqueVRegDef(SampledTypeReg);
-  std::string TypeName = Prefix + buildSpirvTypeName(SampledType, MIRBuilder);
+  std::string TypeName =
+      Prefix + buildSpirvTypeName(SampledType, MIRBuilder, GR);
   for (uint32_t I = 2; I < Type->getNumOperands(); ++I) {
     TypeName = (TypeName + '_' + Twine(Type->getOperand(I).getImm())).str();
   }
@@ -810,20 +830,19 @@ static std::string GetSpirvImageTypeName(const SPIRVType *Type,
 Register SPIRVGlobalRegistry::getOrCreateGlobalVariableWithBinding(
     const SPIRVType *VarType, uint32_t Set, uint32_t Binding,
     MachineIRBuilder &MIRBuilder) {
-  SPIRVType *VarPointerTypeReg = getOrCreateSPIRVPointerType(
-      VarType, MIRBuilder, SPIRV::StorageClass::UniformConstant);
   Register VarReg =
       MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::iIDRegClass);
 
   // TODO: The name should come from the llvm-ir, but how that name will be
   // passed from the HLSL to the backend has not been decided. Using this place
   // holder for now.
-  std::string Name = ("__resource_" + buildSpirvTypeName(VarType, MIRBuilder) +
-                      "_" + Twine(Set) + "_" + Twine(Binding))
-                         .str();
-  buildGlobalVariable(VarReg, VarPointerTypeReg, Name, nullptr,
-                      SPIRV::StorageClass::UniformConstant, nullptr, false,
-                      false, SPIRV::LinkageType::Import, MIRBuilder, false);
+  std::string Name =
+      ("__resource_" + buildSpirvTypeName(VarType, MIRBuilder, *this) + "_" +
+       Twine(Set) + "_" + Twine(Binding))
+          .str();
+  buildGlobalVariable(VarReg, VarType, Name, nullptr,
+                      getPointerStorageClass(VarType), nullptr, false, false,
+                      SPIRV::LinkageType::Import, MIRBuilder, false);
 
   buildOpDecorate(VarReg, MIRBuilder, SPIRV::Decoration::DescriptorSet, {Set});
   buildOpDecorate(VarReg, MIRBuilder, SPIRV::Decoration::Binding, {Binding});
@@ -837,13 +856,22 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems,
   assert((ElemType->getOpcode() != SPIRV::OpTypeVoid) &&
          "Invalid array element type");
   SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder);
-  Register NumElementsVReg =
-      buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR);
+
+  if (NumElems != 0) {
+    Register NumElementsVReg =
+        buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR);
+    return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
+      return MIRBuilder.buildInstr(SPIRV::OpTypeArray)
+          .addDef(createTypeVReg(MIRBuilder))
+          .addUse(getSPIRVTypeID(ElemType))
+          .addUse(NumElementsVReg);
+    });
+  }
+
   return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) {
-    return MIRBuilder.buildInstr(SPIRV::OpTypeArray)
+    return MIRBuilder.buildInstr(SPIRV::OpTypeRuntimeArray)
         .addDef(createTypeVReg(MIRBuilder))
-        .addUse(getSPIRVTypeID(ElemType))
-        .addUse(NumElementsVReg);
+        .addUse(getSPIRVTypeID(ElemType));
   });
 }
 
@@ -1291,6 +1319,34 @@ SPIRVGlobalRegistry::getPointerStorageClass(const SPIRVType *Type) const {
       Type->getOperand(1).getImm());
 }
 
+SPIRVType 
*SPIRVGlobalRegistry::getOrCreateVulkanBufferType( + MachineIRBuilder &MIRBuilder, Type *ElemType, + SPIRV::StorageClass::StorageClass SC, bool IsWritable, bool EmitIr) { + auto Key = SPIRV::irhandle_vkbuffer(ElemType, SC, IsWritable); + if (const MachineInstr *MI = findMI(Key, &MIRBuilder.getMF())) + return MI; + + // TODO(134119): The SPIRVType for `ElemType` will not have an explicit + // layout. This generates invalid SPIR-V. + auto *T = StructType::create(ElemType); + auto *BlockType = + getOrCreateSPIRVType(T, MIRBuilder, SPIRV::AccessQualifier::None, EmitIr); + + buildOpDecorate(BlockType->defs().begin()->getReg(), MIRBuilder, + SPIRV::Decoration::Block, {}); + buildOpMemberDecorate(BlockType->defs().begin()->getReg(), MIRBuilder, + SPIRV::Decoration::Offset, 0, {0}); + + if (!IsWritable) { + buildOpMemberDecorate(BlockType->defs().begin()->getReg(), MIRBuilder, + SPIRV::Decoration::NonWritable, 0, {}); + } + + SPIRVType *R = getOrCreateSPIRVPointerType(BlockType, MIRBuilder, SC); + add(Key, R); + return R; +} + SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeImage( MachineIRBuilder &MIRBuilder, SPIRVType *SampledType, SPIRV::Dim::Dim Dim, uint32_t Depth, uint32_t Arrayed, uint32_t Multisampled, uint32_t Sampled, diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index 09f567a9d1866..c18f17d1f3d23 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -547,6 +547,11 @@ class SPIRVGlobalRegistry : public SPIRVIRMapping { SPIRVType *BaseType, MachineInstr &I, const SPIRVInstrInfo &TII, SPIRV::StorageClass::StorageClass SClass = SPIRV::StorageClass::Function); + SPIRVType *getOrCreateVulkanBufferType(MachineIRBuilder &MIRBuilder, + Type *ElemType, + SPIRV::StorageClass::StorageClass SC, + bool IsWritable, bool EmitIr = false); + SPIRVType * getOrCreateOpTypeImage(MachineIRBuilder &MIRBuilder, SPIRVType *SampledType, SPIRV::Dim::Dim Dim, uint32_t Depth, uint32_t Arrayed, diff --git a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h index 66d5d9ae9dad3..5e8e1c55d91c6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVIRMapping.h +++ b/llvm/lib/Target/SPIRV/SPIRVIRMapping.h @@ -65,6 +65,7 @@ enum SpecialTypeKind { STK_Type, STK_Value, STK_MachineInstr, + STK_VkBuffer, STK_Last = -1 }; @@ -142,6 +143,13 @@ inline IRHandle irhandle_ptr(const void *Ptr, unsigned Arg, return std::make_tuple(Ptr, Arg, STK); } +inline IRHandle irhandle_vkbuffer(const Type *ElementType, + StorageClass::StorageClass SC, + bool IsWriteable) { + return std::make_tuple(ElementType, (SC << 1) | IsWriteable, + SpecialTypeKind::STK_VkBuffer); +} + inline IRHandle handle(const Type *Ty) { const Type *WrpTy = unifyPtrType(Ty); return irhandle_ptr(WrpTy, Ty->getTypeID(), STK_Type); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 4f94d9c5ebb11..946a295c2df25 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -322,9 +322,11 @@ class SPIRVInstructionSelector : public InstructionSelector { uint32_t Opcode) const; MachineInstrBuilder buildConstGenericPtr(MachineInstr &I, Register SrcPtr, SPIRVType *SrcPtrTy) const; - Register buildPointerToResource(const SPIRVType *ResType, uint32_t Set, - uint32_t Binding, uint32_t ArraySize, - Register IndexReg, bool IsNonUniform, + Register buildPointerToResource(const SPIRVType *ResType, + 
SPIRV::StorageClass::StorageClass SC, + uint32_t Set, uint32_t Binding, + uint32_t ArraySize, Register IndexReg, + bool IsNonUniform, MachineIRBuilder MIRBuilder) const; SPIRVType *widenTypeToVec4(const SPIRVType *Type, MachineInstr &I) const; bool extractSubvector(Register &ResVReg, const SPIRVType *ResType, @@ -1145,18 +1147,20 @@ bool SPIRVInstructionSelector::selectLoad(Register ResVReg, auto *IntPtrDef = dyn_cast(PtrDef); if (IntPtrDef && IntPtrDef->getIntrinsicID() == Intrinsic::spv_resource_getpointer) { - Register ImageReg = IntPtrDef->getOperand(2).getReg(); - Register NewImageReg = - MRI->createVirtualRegister(MRI->getRegClass(ImageReg)); - auto *ImageDef = cast(getVRegDef(*MRI, ImageReg)); - if (!loadHandleBeforePosition(NewImageReg, GR.getSPIRVTypeForVReg(ImageReg), - *ImageDef, I)) { - return false; - } + Register HandleReg = IntPtrDef->getOperand(2).getReg(); + SPIRVType *HandleType = GR.getSPIRVTypeForVReg(HandleReg); + if (HandleType->getOpcode() == SPIRV::OpTypeImage) { + Register NewHandleReg = + MRI->createVirtualRegister(MRI->getRegClass(HandleReg)); + auto *HandleDef = cast(getVRegDef(*MRI, HandleReg)); + if (!loadHandleBeforePosition(NewHandleReg, HandleType, *HandleDef, I)) { + return false; + } - Register IdxReg = IntPtrDef->getOperand(3).getReg(); - return generateImageRead(ResVReg, ResType, NewImageReg, IdxReg, - I.getDebugLoc(), I); + Register IdxReg = IntPtrDef->getOperand(3).getReg(); + return generateImageRead(ResVReg, ResType, NewHandleReg, IdxReg, + I.getDebugLoc(), I); + } } auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad)) @@ -1184,22 +1188,24 @@ bool SPIRVInstructionSelector::selectStore(MachineInstr &I) const { auto *IntPtrDef = dyn_cast(PtrDef); if (IntPtrDef && IntPtrDef->getIntrinsicID() == Intrinsic::spv_resource_getpointer) { - Register ImageReg = IntPtrDef->getOperand(2).getReg(); - Register NewImageReg = - MRI->createVirtualRegister(MRI->getRegClass(ImageReg)); - auto *ImageDef = cast(getVRegDef(*MRI, ImageReg)); - if (!loadHandleBeforePosition(NewImageReg, GR.getSPIRVTypeForVReg(ImageReg), - *ImageDef, I)) { + Register HandleReg = IntPtrDef->getOperand(2).getReg(); + Register NewHandleReg = + MRI->createVirtualRegister(MRI->getRegClass(HandleReg)); + auto *HandleDef = cast(getVRegDef(*MRI, HandleReg)); + SPIRVType *HandleType = GR.getSPIRVTypeForVReg(HandleReg); + if (!loadHandleBeforePosition(NewHandleReg, HandleType, *HandleDef, I)) { return false; } Register IdxReg = IntPtrDef->getOperand(3).getReg(); - return BuildMI(*I.getParent(), I, I.getDebugLoc(), - TII.get(SPIRV::OpImageWrite)) - .addUse(NewImageReg) - .addUse(IdxReg) - .addUse(StoreVal) - .constrainAllUses(TII, TRI, RBI); + if (HandleType->getOpcode() == SPIRV::OpTypeImage) { + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpImageWrite)) + .addUse(NewHandleReg) + .addUse(IdxReg) + .addUse(StoreVal) + .constrainAllUses(TII, TRI, RBI); + } } MachineBasicBlock &BB = *I.getParent(); @@ -3191,7 +3197,13 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, bool SPIRVInstructionSelector::selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { - return true; + // The images need to be loaded in the same basic block as their use. We defer + // loading the image to the intrinsic that uses it. 
+ if (ResType->getOpcode() == SPIRV::OpTypeImage) + return true; + + return loadHandleBeforePosition(ResVReg, GR.getSPIRVTypeForVReg(ResVReg), + *cast(&I), I); } bool SPIRVInstructionSelector::selectReadImageIntrinsic( @@ -3259,20 +3271,30 @@ bool SPIRVInstructionSelector::generateImageRead(Register &ResVReg, bool SPIRVInstructionSelector::selectResourceGetPointer( Register &ResVReg, const SPIRVType *ResType, MachineInstr &I) const { -#ifdef ASSERT - // For now, the operand is an image. This will change once we start handling - // more resource types. Register ResourcePtr = I.getOperand(2).getReg(); - SPIRVType *RegType = GR.getResultType(ResourcePtr); - assert(RegType->getOpcode() == SPIRV::OpTypeImage && - "Can only handle texel buffers for now."); -#endif - - // For texel buffers, the index into the image is part of the OpImageRead or - // OpImageWrite instructions. So we will do nothing in this case. This - // intrinsic will be combined with the load or store when selecting the load - // or store. - return true; + SPIRVType *RegType = GR.getSPIRVTypeForVReg(ResourcePtr, I.getMF()); + if (RegType->getOpcode() == SPIRV::OpTypeImage) { + // For texel buffers, the index into the image is part of the OpImageRead or + // OpImageWrite instructions. So we will do nothing in this case. This + // intrinsic will be combined with the load or store when selecting the load + // or store. + return true; + } + + assert(ResType->getOpcode() == SPIRV::OpTypePointer); + MachineIRBuilder MIRBuilder(I); + + Register IndexReg = I.getOperand(3).getReg(); + Register ZeroReg = + buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I); + return BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(SPIRV::OpAccessChain)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(ResourcePtr) + .addUse(ZeroReg) + .addUse(IndexReg) + .constrainAllUses(TII, TRI, RBI); } bool SPIRVInstructionSelector::extractSubvector( @@ -3344,22 +3366,27 @@ bool SPIRVInstructionSelector::selectImageWriteIntrinsic( } Register SPIRVInstructionSelector::buildPointerToResource( - const SPIRVType *ResType, uint32_t Set, uint32_t Binding, - uint32_t ArraySize, Register IndexReg, bool IsNonUniform, - MachineIRBuilder MIRBuilder) const { - if (ArraySize == 1) - return GR.getOrCreateGlobalVariableWithBinding(ResType, Set, Binding, + const SPIRVType *ResType, SPIRV::StorageClass::StorageClass SC, + uint32_t Set, uint32_t Binding, uint32_t ArraySize, Register IndexReg, + bool IsNonUniform, MachineIRBuilder MIRBuilder) const { + if (ArraySize == 1) { + SPIRVType *PtrType = + GR.getOrCreateSPIRVPointerType(ResType, MIRBuilder, SC); + return GR.getOrCreateGlobalVariableWithBinding(PtrType, Set, Binding, MIRBuilder); + } const SPIRVType *VarType = GR.getOrCreateSPIRVArrayType( ResType, ArraySize, *MIRBuilder.getInsertPt(), TII); + SPIRVType *VarPointerType = + GR.getOrCreateSPIRVPointerType(VarType, MIRBuilder, SC); Register VarReg = GR.getOrCreateGlobalVariableWithBinding( - VarType, Set, Binding, MIRBuilder); + VarPointerType, Set, Binding, MIRBuilder); - SPIRVType *ResPointerType = GR.getOrCreateSPIRVPointerType( - ResType, MIRBuilder, SPIRV::StorageClass::UniformConstant); + SPIRVType *ResPointerType = + GR.getOrCreateSPIRVPointerType(ResType, MIRBuilder, SC); - Register AcReg = MRI->createVirtualRegister(&SPIRV::iIDRegClass); + Register AcReg = MRI->createVirtualRegister(GR.getRegClass(ResPointerType)); if (IsNonUniform) { // It is unclear which value needs to be marked an non-uniform, so both // the index and the access 
changed are decorated as non-uniform. @@ -4052,19 +4079,29 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition( uint32_t ArraySize = foldImm(HandleDef.getOperand(4), MRI); Register IndexReg = HandleDef.getOperand(5).getReg(); bool IsNonUniform = ArraySize > 1 && foldImm(HandleDef.getOperand(6), MRI); - + bool IsStructuredBuffer = ResType->getOpcode() == SPIRV::OpTypePointer; MachineIRBuilder MIRBuilder(HandleDef); - Register VarReg = buildPointerToResource(ResType, Set, Binding, ArraySize, + SPIRVType *VarType = ResType; + SPIRV::StorageClass::StorageClass SC = SPIRV::StorageClass::UniformConstant; + + if (IsStructuredBuffer) { + VarType = GR.getPointeeType(ResType); + SC = GR.getPointerStorageClass(ResType); + } + + Register VarReg = buildPointerToResource(VarType, SC, Set, Binding, ArraySize, IndexReg, IsNonUniform, MIRBuilder); if (IsNonUniform) buildOpDecorate(HandleReg, HandleDef, TII, SPIRV::Decoration::NonUniformEXT, {}); - // TODO: For now we assume the resource is an image, which needs to be - // loaded to get the handle. That will not be true for storage buffers. + // The handle for the buffer is the pointer to the resource. For an image, the + // handle is the image object. So images get an extra load. + uint32_t LoadOpcode = + IsStructuredBuffer ? SPIRV::OpCopyObject : SPIRV::OpLoad; return BuildMI(*Pos.getParent(), Pos, HandleDef.getDebugLoc(), - TII.get(SPIRV::OpLoad)) + TII.get(LoadOpcode)) .addDef(HandleReg) .addUse(GR.getSPIRVTypeID(ResType)) .addUse(VarReg) diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 008aecf4cda85..578e82881f6e8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -85,13 +85,15 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT p7 = LLT::pointer(7, PSize); // Input const LLT p8 = LLT::pointer(8, PSize); // Output const LLT p10 = LLT::pointer(10, PSize); // Private + const LLT p11 = LLT::pointer(11, PSize); // StorageBuffer // TODO: remove copy-pasting here by using concatenation in some way. 
 auto allPtrsScalarsAndVectors = {
-      p0, p1, p2, p3, p4, p5, p6, p7, p8, p10,
-      s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64,
-      v3s1, v3s8, v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64,
-      v8s1, v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
+      p0, p1, p2, p3, p4, p5, p6, p7, p8,
+      p10, p11, s1, s8, s16, s32, s64, v2s1, v2s8,
+      v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, v3s32, v3s64, v4s1,
+      v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16, v8s32, v8s64,
+      v16s1, v16s8, v16s16, v16s32, v16s64};
 
   auto allVectors = {v2s1, v2s8, v2s16, v2s32, v2s64, v3s1, v3s8,
                      v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32,
@@ -118,10 +120,10 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
       s16, s32, s64, v2s16, v2s32, v2s64, v3s16, v3s32, v3s64,
       v4s16, v4s32, v4s64, v8s16, v8s32, v8s64, v16s16, v16s32, v16s64};
 
-  auto allFloatAndIntScalarsAndPtrs = {s8, s16, s32, s64, p0, p1, p2,
-                                       p3, p4, p5, p6, p7, p8, p10};
+  auto allFloatAndIntScalarsAndPtrs = {s8, s16, s32, s64, p0, p1, p2, p3,
+                                       p4, p5, p6, p7, p8, p10, p11};
 
-  auto allPtrs = {p0, p1, p2, p3, p4, p5, p6, p7, p8, p10};
+  auto allPtrs = {p0, p1, p2, p3, p4, p5, p6, p7, p8, p10, p11};
 
   bool IsExtendedInts =
       ST.canUseExtension(
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index acc8c014cb26b..f9e64f118a277 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -214,6 +214,26 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
   }
 }
 
+// Appends the signature of the decoration instructions that decorate R to
+// Signature.
+static void appendDecorationsForReg(const MachineRegisterInfo &MRI, Register R,
+                                    InstrSignature &Signature) {
+  for (MachineInstr &UseMI : MRI.use_instructions(R)) {
+    // We don't handle OpDecorateId because getting the register alias for the
+    // ID can cause problems, and we do not need it for now.
+    if (UseMI.getOpcode() != SPIRV::OpDecorate &&
+        UseMI.getOpcode() != SPIRV::OpMemberDecorate)
+      continue;
+
+    for (unsigned I = 0; I < UseMI.getNumOperands(); ++I) {
+      const MachineOperand &MO = UseMI.getOperand(I);
+      if (MO.isReg())
+        continue;
+      Signature.push_back(hash_value(MO));
+    }
+  }
+}
+
 // Returns a representation of an instruction as a vector of MachineOperand
 // hash values, see llvm::hash_value(const MachineOperand &MO) for details.
 // This creates a signature of the instruction with the same content
@@ -221,13 +241,17 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
 static InstrSignature instrToSignature(const MachineInstr &MI,
                                        SPIRV::ModuleAnalysisInfo &MAI,
                                        bool UseDefReg) {
+  Register DefReg;
   InstrSignature Signature{MI.getOpcode()};
   for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
     const MachineOperand &MO = MI.getOperand(i);
     size_t h;
     if (MO.isReg()) {
-      if (!UseDefReg && MO.isDef())
+      if (!UseDefReg && MO.isDef()) {
+        assert(!DefReg.isValid() && "Multiple def registers.");
+        DefReg = MO.getReg();
         continue;
+      }
       Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg());
       if (!RegAlias.isValid()) {
         LLVM_DEBUG({
@@ -247,6 +271,13 @@ static InstrSignature instrToSignature(const MachineInstr &MI,
     }
     Signature.push_back(h);
   }
+
+  if (DefReg.isValid()) {
+    // Decorations change the semantics of the current instruction. So two
+    // identical instructions with different decorations cannot be merged. That
+    // is why we add the decorations to the signature.
+    appendDecorationsForReg(MI.getMF()->getRegInfo(), DefReg, Signature);
+  }
   return Signature;
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index 60b67a4f5ec5e..f38794afab436 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -146,6 +146,30 @@ void buildOpDecorate(Register Reg, MachineInstr &I, const SPIRVInstrInfo &TII,
   finishBuildOpDecorate(MIB, DecArgs, StrImm);
 }
 
+void buildOpMemberDecorate(Register Reg, MachineIRBuilder &MIRBuilder,
+                           SPIRV::Decoration::Decoration Dec, uint32_t Member,
+                           const std::vector<uint32_t> &DecArgs,
+                           StringRef StrImm) {
+  auto MIB = MIRBuilder.buildInstr(SPIRV::OpMemberDecorate)
+                 .addUse(Reg)
+                 .addImm(Member)
+                 .addImm(static_cast<uint32_t>(Dec));
+  finishBuildOpDecorate(MIB, DecArgs, StrImm);
+}
+
+void buildOpMemberDecorate(Register Reg, MachineInstr &I,
+                           const SPIRVInstrInfo &TII,
+                           SPIRV::Decoration::Decoration Dec, uint32_t Member,
+                           const std::vector<uint32_t> &DecArgs,
+                           StringRef StrImm) {
+  MachineBasicBlock &MBB = *I.getParent();
+  auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(SPIRV::OpMemberDecorate))
+                 .addUse(Reg)
+                 .addImm(Member)
+                 .addImm(static_cast<uint32_t>(Dec));
+  finishBuildOpDecorate(MIB, DecArgs, StrImm);
+}
+
 void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder,
                              const MDNode *GVarMD) {
   for (unsigned I = 0, E = GVarMD->getNumOperands(); I != E; ++I) {
@@ -236,6 +260,8 @@ addressSpaceToStorageClass(unsigned AddrSpace, const SPIRVSubtarget &STI) {
     return SPIRV::StorageClass::CodeSectionINTEL;
   case 10:
     return SPIRV::StorageClass::Private;
+  case 11:
+    return SPIRV::StorageClass::StorageBuffer;
   default:
     report_fatal_error("Unknown address space");
   }
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index b094184f34fb0..0498c7beb073c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -144,6 +144,17 @@ void buildOpDecorate(Register Reg, MachineInstr &I, const SPIRVInstrInfo &TII,
                      const std::vector<uint32_t> &DecArgs,
                      StringRef StrImm = "");
 
+// Add an OpMemberDecorate instruction for the given Reg.
+void buildOpMemberDecorate(Register Reg, MachineIRBuilder &MIRBuilder,
+                           SPIRV::Decoration::Decoration Dec, uint32_t Member,
+                           const std::vector<uint32_t> &DecArgs,
+                           StringRef StrImm = "");
+void buildOpMemberDecorate(Register Reg, MachineInstr &I,
+                           const SPIRVInstrInfo &TII,
+                           SPIRV::Decoration::Decoration Dec, uint32_t Member,
+                           const std::vector<uint32_t> &DecArgs,
+                           StringRef StrImm = "");
+
 // Add an OpDecorate instruction by "spirv.Decorations" metadata node.
void buildOpSpirvDecorations(Register Reg, MachineIRBuilder &MIRBuilder, const MDNode *GVarMD); @@ -184,6 +195,8 @@ storageClassToAddressSpace(SPIRV::StorageClass::StorageClass SC) { return 9; case SPIRV::StorageClass::Private: return 10; + case SPIRV::StorageClass::StorageBuffer: + return 11; default: report_fatal_error("Unable to get address space id"); } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll new file mode 100644 index 0000000000000..fc8faa7300534 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StructuredBuffer.ll @@ -0,0 +1,90 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %} + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32, i32, i32, i32, i1) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32, i32, i32, i32, i1) #0 + +; CHECK: OpDecorate [[BufferVar:%.+]] DescriptorSet 0 +; CHECK: OpDecorate [[BufferVar]] Binding 0 +; CHECK: OpDecorate [[BufferType:%.+]] Block +; CHECK: OpMemberDecorate [[BufferType]] 0 Offset 0 +; CHECK: OpMemberDecorate [[BufferType]] 0 NonWritable +; CHECK: OpDecorate [[RWBufferVar:%.+]] DescriptorSet 0 +; CHECK: OpDecorate [[RWBufferVar]] Binding 1 +; CHECK: OpDecorate [[RWBufferType:%.+]] Block +; CHECK: OpMemberDecorate [[RWBufferType]] 0 Offset 0 + + +; CHECK: [[int:%[0-9]+]] = OpTypeInt 32 0 +; CHECK: [[ArrayType:%.+]] = OpTypeRuntimeArray +; CHECK: [[RWBufferType]] = OpTypeStruct [[ArrayType]] +; CHECK: [[RWBufferPtrType:%.+]] = OpTypePointer StorageBuffer [[RWBufferType]] +; CHECK: [[BufferType]] = OpTypeStruct [[ArrayType]] +; CHECK: [[BufferPtrType:%.+]] = OpTypePointer StorageBuffer [[BufferType]] +; CHECK-DAG: [[zero:%[0-9]+]] = OpConstant [[int]] 0 +; CHECK-DAG: [[one:%[0-9]+]] = OpConstant [[int]] 1 +; CHECK-DAG: [[two:%[0-9]+]] = OpConstant [[int]] 2 +; CHECK-DAG: [[BufferVar]] = OpVariable [[BufferPtrType]] StorageBuffer +; CHECK-DAG: [[RWBufferVar]] = OpVariable [[RWBufferPtrType]] StorageBuffer + +; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +define void @main() local_unnamed_addr #1 { +entry: + +; CHECK-DAG: [[BufferHandle:%.+]] = OpCopyObject [[BufferPtrType]] [[BufferVar]] +; CHECK-DAG: [[BufferHandle2:%.+]] = OpCopyObject [[BufferPtrType]] [[BufferVar]] +; CHECK-DAG: [[RWBufferHandle:%.+]] = OpCopyObject [[RWBufferPtrType]] [[RWBufferVar]] + %BufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false) + %BufferHandle2 = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 0) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_0t(i32 0, i32 0, i32 1, i32 0, i1 false) + %RWBufferHandle = tail call target("spirv.VulkanBuffer", [0 x i32], 12, 1) @llvm.spv.resource.handlefrombinding.tspirv.VulkanBuffer_a0i32_12_1t(i32 0, 
i32 1, i32 1, i32 0, i1 false) + +; CHECK: [[AC:%.+]] = OpAccessChain {{.*}} [[BufferHandle]] [[zero]] [[one]] + %0 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_0t(target("spirv.VulkanBuffer", [0 x i32], 12, 0) %BufferHandle, i32 1) + +; CHECK: [[LD:%.+]] = OpLoad [[int]] [[AC]] Aligned 4 + %1 = load i32, ptr addrspace(11) %0, align 4, !tbaa !3 + +; CHECK: [[AC:%.+]] = OpAccessChain {{.*}} [[RWBufferHandle]] [[zero]] [[zero]] + %2 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %RWBufferHandle, i32 0) + +; CHECK: OpStore [[AC]] [[LD]] + store i32 %1, ptr addrspace(11) %2, align 4, !tbaa !3 + +; CHECK: [[AC:%.+]] = OpAccessChain {{.*}} [[BufferHandle2]] [[zero]] [[two]] + %3 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_0t(target("spirv.VulkanBuffer", [0 x i32], 12, 0) %BufferHandle2, i32 2) + +; CHECK: [[LD:%.+]] = OpLoad [[int]] [[AC]] Aligned 4 + %4 = load i32, ptr addrspace(11) %3, align 4, !tbaa !3 + +; CHECK: [[AC:%.+]] = OpAccessChain {{.*}} [[RWBufferHandle]] [[zero]] [[zero]] + %5 = tail call noundef nonnull align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1) %RWBufferHandle, i32 0) + +; CHECK: OpStore [[AC]] [[LD]] + store i32 %4, ptr addrspace(11) %5, align 4, !tbaa !3 + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_1t(target("spirv.VulkanBuffer", [0 x i32], 12, 1), i32) #0 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) +declare ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.VulkanBuffer_a0i32_12_0t(target("spirv.VulkanBuffer", [0 x i32], 12, 0), i32) #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) } +attributes #1 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) "approx-func-fp-math"="false" "frame-pointer"="all" "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"frame-pointer", i32 2} +!2 = !{!"clang version 21.0.0git (git@github.com:s-perron/llvm-project.git 6e86add06c03e328dbb4b83f99406cc832a22f86)"} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C++ TBAA"} From 6bbdc70066c2bf46ed3d88293e2abfa3d0ffffa3 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 3 Apr 2025 14:53:19 +0100 Subject: [PATCH 0525/1029] [LV] Use getCallWideningDecision in more places (NFC) (#134236) --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d214b2f2fb4cd..54ccaefdad246 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1202,7 +1202,7 @@ class LoopVectorizationCostModel { CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const { assert(!VF.isScalar() && "Expected vector VF"); - return CallWideningDecisions.at(std::make_pair(CI, VF)); + return CallWideningDecisions.at({CI, VF}); } /// Return True if instruction \p I is an optimizable truncate whose operand @@ -2817,7 +2817,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, // We only need to calculate a cost if the VF is scalar; for actual vectors // we should already have a pre-calculated cost at each VF. if (!VF.isScalar()) - return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost; + return getCallWideningDecision(CI, VF).Cost; Type *RetTy = CI->getType(); if (RecurrenceDescriptor::isFMulAddIntrinsic(CI)) @@ -3216,8 +3216,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication( case Instruction::Call: if (VF.isScalar()) return true; - return CallWideningDecisions.at(std::make_pair(cast(I), VF)) - .Kind == CM_Scalarize; + return getCallWideningDecision(cast(I), VF).Kind == CM_Scalarize; case Instruction::Load: case Instruction::Store: { auto *Ptr = getLoadStorePointerOperand(I); From 586c5e3083428e7473e880dafd5939e8707bc1c9 Mon Sep 17 00:00:00 2001 From: Frank Schlimbach Date: Thu, 3 Apr 2025 15:53:40 +0200 Subject: [PATCH 0526/1029] [mlir][mpi] fixing in-place and 0d mpi.all_reduce (#134225) * inplace allreduce needs special MPI token MPI_IN_PLACE as send buffer * 0d tensors have no sizes/strides in LLVM memref struct --- mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp | 34 ++++++++++++++++--- mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir | 8 +++-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp index 9df5e992e8ebd..5575b295ae20a 100644 --- a/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp +++ b/mlir/lib/Conversion/MPIToLLVM/MPIToLLVM.cpp @@ -15,8 +15,10 @@ #include "mlir/Conversion/MPIToLLVM/MPIToLLVM.h" #include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/MPI/IR/MPI.h" #include "mlir/Transforms/DialectConversion.h" #include @@ -57,9 +59,14 @@ std::pair getRawPtrAndSize(const Location loc, loc, rewriter.getI64Type(), memRef, 2); Value resPtr = rewriter.create(loc, ptrType, elType, dataPtr, offset); - Value size = rewriter.create(loc, memRef, - ArrayRef{3, 0}); - size = rewriter.create(loc, rewriter.getI32Type(), size); + Value size; + if (cast(memRef.getType()).getBody().size() > 3) { + size = rewriter.create(loc, memRef, + ArrayRef{3, 0}); + size = rewriter.create(loc, rewriter.getI32Type(), size); + } else { + size = rewriter.create(loc, 1, 32); + } return {resPtr, size}; } @@ -97,6 +104,9 @@ class MPIImplTraits { /// Get the MPI_STATUS_IGNORE value (typically a pointer type). virtual intptr_t getStatusIgnore() = 0; + /// Get the MPI_IN_PLACE value (void *). + virtual void *getInPlace() = 0; + /// Gets or creates an MPI datatype as a value which corresponds to the given /// type. 
virtual Value getDataType(const Location loc, @@ -158,6 +168,8 @@ class MPICHImplTraits : public MPIImplTraits { intptr_t getStatusIgnore() override { return 1; } + void *getInPlace() override { return reinterpret_cast(-1); } + Value getDataType(const Location loc, ConversionPatternRewriter &rewriter, Type type) override { int32_t mtype = 0; @@ -283,6 +295,8 @@ class OMPIImplTraits : public MPIImplTraits { intptr_t getStatusIgnore() override { return 0; } + void *getInPlace() override { return reinterpret_cast(1); } + Value getDataType(const Location loc, ConversionPatternRewriter &rewriter, Type type) override { StringRef mtype; @@ -516,7 +530,8 @@ struct CommSplitOpLowering : public ConvertOpToLLVMPattern { outPtr.getRes()}); // load the communicator into a register - auto res = rewriter.create(loc, i32, outPtr.getResult()); + Value res = rewriter.create(loc, i32, outPtr.getResult()); + res = rewriter.create(loc, rewriter.getI64Type(), res); // if retval is checked, replace uses of retval with the results from the // call op @@ -525,7 +540,7 @@ struct CommSplitOpLowering : public ConvertOpToLLVMPattern { replacements.push_back(callOp.getResult()); // replace op - replacements.push_back(res.getRes()); + replacements.push_back(res); rewriter.replaceOp(op, replacements); return success(); @@ -709,6 +724,7 @@ struct AllReduceOpLowering : public ConvertOpToLLVMPattern { Location loc = op.getLoc(); MLIRContext *context = rewriter.getContext(); Type i32 = rewriter.getI32Type(); + Type i64 = rewriter.getI64Type(); Type elemType = op.getSendbuf().getType().getElementType(); // ptrType `!llvm.ptr` @@ -719,6 +735,14 @@ struct AllReduceOpLowering : public ConvertOpToLLVMPattern { getRawPtrAndSize(loc, rewriter, adaptor.getSendbuf(), elemType); auto [recvPtr, recvSize] = getRawPtrAndSize(loc, rewriter, adaptor.getRecvbuf(), elemType); + + // If input and output are the same, request in-place operation. 
+ if (adaptor.getSendbuf() == adaptor.getRecvbuf()) { + sendPtr = rewriter.create( + loc, i64, reinterpret_cast(mpiTraits->getInPlace())); + sendPtr = rewriter.create(loc, ptrType, sendPtr); + } + Value dataType = mpiTraits->getDataType(loc, rewriter, elemType); Value mpiOp = mpiTraits->getMPIOp(loc, rewriter, op.getOp()); Value commWorld = mpiTraits->castComm(loc, rewriter, adaptor.getComm()); diff --git a/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir b/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir index 174f7c79b9d50..35fc0f5d2e754 100644 --- a/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir +++ b/mlir/test/Conversion/MPIToLLVM/mpitollvm.mlir @@ -98,10 +98,12 @@ module attributes {dlti.map = #dlti.map<"MPI:Implementation" = "MPICH">} { // CHECK: [[v66:%.*]] = llvm.getelementptr [[v64]][[[v65]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 // CHECK: [[v67:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v68:%.*]] = llvm.trunc [[v67]] : i64 to i32 + // CHECK: [[ip:%.*]] = llvm.mlir.constant(-1 : i64) : i64 + // CHECK: [[ipp:%.*]] = llvm.inttoptr [[ip]] : i64 to !llvm.ptr // CHECK: [[v69:%.*]] = llvm.mlir.constant(1275069450 : i32) : i32 // CHECK: [[v70:%.*]] = llvm.mlir.constant(1476395011 : i32) : i32 // CHECK: [[v71:%.*]] = llvm.trunc [[comm]] : i64 to i32 - // CHECK: [[v72:%.*]] = llvm.call @MPI_Allreduce([[v61]], [[v66]], [[v63]], [[v69]], [[v70]], [[v71]]) : (!llvm.ptr, !llvm.ptr, i32, i32, i32, i32) -> i32 + // CHECK: [[v72:%.*]] = llvm.call @MPI_Allreduce([[ipp]], [[v66]], [[v63]], [[v69]], [[v70]], [[v71]]) : (!llvm.ptr, !llvm.ptr, i32, i32, i32, i32) -> i32 mpi.allreduce(%arg0, %arg0, MPI_SUM, %comm) : memref<100xf32>, memref<100xf32> // CHECK: llvm.call @MPI_Finalize() : () -> i32 @@ -202,10 +204,12 @@ module attributes { dlti.map = #dlti.map<"MPI:Implementation" = "OpenMPI"> } { // CHECK: [[v56:%.*]] = llvm.getelementptr [[v54]][[[v55]]] : (!llvm.ptr, i64) -> !llvm.ptr, f32 // CHECK: [[v57:%.*]] = llvm.extractvalue [[v5]][3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // CHECK: [[v58:%.*]] = llvm.trunc [[v57]] : i64 to i32 + // CHECK: [[ip:%.*]] = llvm.mlir.constant(1 : i64) : i64 + // CHECK: [[ipp:%.*]] = llvm.inttoptr [[ip]] : i64 to !llvm.ptr // CHECK: [[v59:%.*]] = llvm.mlir.addressof @ompi_mpi_float : !llvm.ptr // CHECK: [[v60:%.*]] = llvm.mlir.addressof @ompi_mpi_sum : !llvm.ptr // CHECK: [[v61:%.*]] = llvm.inttoptr [[comm]] : i64 to !llvm.ptr - // CHECK: [[v62:%.*]] = llvm.call @MPI_Allreduce([[v51]], [[v56]], [[v53]], [[v59]], [[v60]], [[v61]]) : (!llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 + // CHECK: [[v62:%.*]] = llvm.call @MPI_Allreduce([[ipp]], [[v56]], [[v53]], [[v59]], [[v60]], [[v61]]) : (!llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 mpi.allreduce(%arg0, %arg0, MPI_SUM, %comm) : memref<100xf32>, memref<100xf32> // CHECK: [[v71:%.*]] = llvm.mlir.constant(10 : i32) : i32 From 9e0ca5720bee96f4b19eeb69a119b5eda3ab5528 Mon Sep 17 00:00:00 2001 From: Aaron Puchert Date: Thu, 3 Apr 2025 15:56:53 +0200 Subject: [PATCH 0527/1029] [X86] When expanding LCMPXCHG16B_SAVE_RBX, substitute RBX in base (#134109) The pseudo-instruction LCMPXCHG16B_SAVE_RBX is used when RBX serves as frame base pointer. At a very late stage it is then translated into a regular LCMPXCHG16B, preceded by copying the actual argument into RBX, and followed by restoring the register to the base pointer. 
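To make the failure mode concrete before the description below, a hypothetical C++ reproducer (names invented; it mirrors the cmp_and_swap16_frame IR test added by this patch; compile for x86-64 with -mcx16):

    // The over-aligned local plus the dynamic alloca make the backend keep a
    // separate frame base pointer in RBX, while cmpxchg16b also needs RBX for
    // the low half of the desired value.
    bool cmp_and_swap16_frame(__int128 a, __int128 b, int n) {
      alignas(32) __int128 local = 0;
      int *dummy = (int *)__builtin_alloca(n * sizeof(int));
      bool res = __sync_bool_compare_and_swap(&local, a, b);
      dummy[5] = n; // keep the dynamic allocation live
      return res;
    }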
However, in case the `cmpxchg` operates on a local variable, RBX might also be used as a base for the memory operand in frame finalization, and we've overwritten RBX with the input operand for `cmpxchg16b`. So we have to rewrite the memory operand base to use the saved value of RBX. Fixes #119959. --- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 14 ++++++-- .../CodeGen/X86/base-pointer-and-cmpxchg.ll | 34 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index c202f7fa93db6..398b738b85697 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -439,8 +439,18 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false); // Create the actual instruction. MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B)); - // Copy the operands related to the address. - for (unsigned Idx = 1; Idx < 6; ++Idx) + // Copy the operands related to the address. If we access a frame variable, + // we need to replace the RBX base with SaveRbx, as RBX has another value. + const MachineOperand &Base = MBBI->getOperand(1); + if (Base.getReg() == X86::RBX || Base.getReg() == X86::EBX) + NewInstr->addOperand(MachineOperand::CreateReg( + Base.getReg() == X86::RBX + ? SaveRbx + : Register(TRI->getSubReg(SaveRbx, X86::sub_32bit)), + /*IsDef=*/false)); + else + NewInstr->addOperand(Base); + for (unsigned Idx = 1 + 1; Idx < 1 + X86::AddrNumOperands; ++Idx) NewInstr->addOperand(MBBI->getOperand(Idx)); // Finally, restore the value of RBX. TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, diff --git a/llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll b/llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll index 498be7c9e1144..5e8da5818fe97 100644 --- a/llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll +++ b/llvm/test/CodeGen/X86/base-pointer-and-cmpxchg.ll @@ -49,5 +49,39 @@ tail call void asm sideeffect "nop", "~{rax},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp}, store i32 %n, ptr %idx ret i1 %res } + +; If we compare-and-exchange a frame variable, we additionally need to rewrite +; the memory operand to use the SAVE_rbx instead of rbx, which already contains +; the input operand. +; +; CHECK-LABEL: cmp_and_swap16_frame: +; Check that we actually use rbx. +; gnux32 use the 32bit variant of the registers. +; USE_BASE_64: movq %rsp, %rbx +; USE_BASE_32: movl %esp, %ebx +; Here we drop the inline assembly because the frame pointer is used anyway. So +; rbx is not spilled to the stack but goes into a (hopefully numbered) register. +; USE_BASE: movq %rbx, [[SAVE_rbx:%r[0-9]+]] +; +; USE_BASE: movq {{[^ ]+}}, %rbx +; The use of the frame variable expands to N(%rbx) or N(%ebx). But we've just +; overwritten that with the input operand. We need to use SAVE_rbx instead. 
+; USE_BASE_64-NEXT: cmpxchg16b {{[0-9]*}}([[SAVE_rbx]])
+; USE_BASE_32-NEXT: cmpxchg16b {{[0-9]*}}([[SAVE_rbx]]d)
+; USE_BASE-NEXT: movq [[SAVE_rbx]], %rbx
+;
+; DONT_USE_BASE-NOT: movq %rsp, %rbx
+; DONT_USE_BASE-NOT: movl %esp, %ebx
+; DONT_USE_BASE: cmpxchg
+define i1 @cmp_and_swap16_frame(i128 %a, i128 %b, i32 %n) {
+  %local = alloca i128, align 16
+  %dummy = alloca i32, i32 %n
+  %cmp = cmpxchg ptr %local, i128 %a, i128 %b seq_cst seq_cst
+  %res = extractvalue { i128, i1 } %cmp, 1
+  %idx = getelementptr i32, ptr %dummy, i32 5
+  store i32 %n, ptr %idx
+  ret i1 %res
+}
+
 !llvm.module.flags = !{!0}
 !0 = !{i32 2, !"override-stack-alignment", i32 32}

From 7145ead280ba10d08fe48e7859f1a61a64653104 Mon Sep 17 00:00:00 2001
From: GeorgeKA
Date: Thu, 3 Apr 2025 09:58:52 -0400
Subject: [PATCH 0528/1029] [Clang] Add warning message for C++17 alias
 template CTAD (#133806)

Class template argument deduction for alias templates is a documented
C++20 feature. Clang also happens to accept it in C++17 mode, but until
now emitted no diagnostic indicating the officially supported version.
This PR adds that warning. Also updated relevant CTAD test cases.

Closes #125913
---
 clang/docs/ReleaseNotes.rst                              | 1 +
 clang/include/clang/Basic/DiagnosticSemaKinds.td         | 7 ++-----
 clang/lib/Sema/SemaInit.cpp                              | 3 +--
 clang/test/SemaCXX/cxx17-compat.cpp                      | 2 +-
 .../SemaCXX/cxx1z-class-template-argument-deduction.cpp  | 4 ++--
 5 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index fdf9a246d6373..3055394dd8b6c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -403,6 +403,7 @@ Bug Fixes to C++ Support
 - Clang no longer crashes when establishing subsumption between some constraint
   expressions. (#GH122581)
 - Clang now issues an error when placement new is used to modify a const-qualified variable
   in a ``constexpr`` function. (#GH131432)
+- Clang now emits a warning when class template argument deduction for alias templates is used in C++17. (#GH133806)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 1993cd5accc22..52dc477039129 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -49,6 +49,8 @@ defm constexpr_ctor_missing_init : CXX20Compat<
 defm adl_only_template_id : CXX20Compat<
   "use of function template name with no prior declaration in function call "
   "with explicit template arguments is">;
+defm ctad_for_alias_templates
+    : CXX20Compat<"class template argument deduction for alias templates is">;
 
 // C++23 compatibility with C++20 and earlier.
 defm constexpr_static_var : CXX23Compat<
@@ -8448,11 +8450,6 @@ let CategoryName = "Lambda Issue" in {
   def warn_cxx17_compat_lambda_def_ctor_assign : Warning<
     "%select{default construction|assignment}0 of lambda is incompatible with "
     "C++ standards before C++20">, InGroup<CXXPre20Compat>, DefaultIgnore;
-
-  // C++20 class template argument deduction for alias templates.
- def warn_cxx17_compat_ctad_for_alias_templates : Warning< - "class template argument deduction for alias templates is incompatible with " - "C++ standards before C++20">, InGroup, DefaultIgnore; } def err_return_in_captured_stmt : Error< diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 9814c3f456f0d..87a4244f2fb76 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9919,8 +9919,7 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( if (!Template) { if (auto *AliasTemplate = dyn_cast_or_null( TemplateName.getAsTemplateDecl())) { - Diag(Kind.getLocation(), - diag::warn_cxx17_compat_ctad_for_alias_templates); + DiagCompat(Kind.getLocation(), diag_compat::ctad_for_alias_templates); LookupTemplateDecl = AliasTemplate; auto UnderlyingType = AliasTemplate->getTemplatedDecl() ->getUnderlyingType() diff --git a/clang/test/SemaCXX/cxx17-compat.cpp b/clang/test/SemaCXX/cxx17-compat.cpp index 54ea3384022d4..81b3e1fde5493 100644 --- a/clang/test/SemaCXX/cxx17-compat.cpp +++ b/clang/test/SemaCXX/cxx17-compat.cpp @@ -137,7 +137,7 @@ template struct A { A(T); }; template using B = A; B b = {1}; #if __cplusplus <= 201703L - // FIXME: diagnose as well + // expected-warning@-2 {{class template argument deduction for alias templates is a C++20 extension}} #else // expected-warning@-4 {{class template argument deduction for alias templates is incompatible with C++ standards before C++20}} #endif diff --git a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp index 9aaa13d7ac41a..a7d740e66ba63 100644 --- a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp +++ b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp @@ -113,9 +113,9 @@ namespace dependent { }; template void f() { typename T::X tx = 0; - typename T::Y ty = 0; + typename T::Y ty = 0; // expected-warning {{class template argument deduction for alias templates is a C++20 extension}} } - template void f(); + template void f(); // expected-note {{in instantiation of function template specialization 'dependent::f' requested here}} template struct C { C(T); }; template C(T) -> C; From 18dd299fb109792d0716156af0a2d8c0ca781c57 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 3 Apr 2025 15:06:19 +0100 Subject: [PATCH 0529/1029] [Flang][MLIR][OpenMP] Host-evaluation of omp.loop bounds (#133908) This patch updates Flang lowering and kernel flags identification in MLIR so that loop bounds on `target teams loop` constructs are evaluated on the host, making the trip count available to the corresponding `__tgt_target_kernel` call emitted for the target region. This is necessary in order to properly execute these constructs as `target teams distribute parallel do`. 
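For illustration, a minimal C++ sketch of the kind of construct this change affects (the patch's own tests use Fortran, and the function and variable names below are illustrative only, not code from this patch). With host evaluation, the loop's lower bound, upper bound, and step are computed on the host and passed to the target region via `host_eval`, so the trip count is already known when the kernel is launched:

    // Hedged sketch: a combined `target teams loop` whose bounds (0, n, 1)
    // must be host-evaluated so the runtime can launch the kernel SPMD-style
    // with a known trip count.
    void saxpy(int n, float a, const float *x, float *y) {
    #pragma omp target teams loop map(to: x[0:n]) map(tofrom: y[0:n])
      for (int i = 0; i < n; ++i) // bounds become host_eval arguments
        y[i] = a * x[i] + y[i];
    }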
Co-authored-by: Kareem Ergawy --- flang/lib/Lower/OpenMP/OpenMP.cpp | 19 +++++++++----- .../Lower/OpenMP/generic-loop-rewriting.f90 | 13 ++++------ flang/test/Lower/OpenMP/host-eval.f90 | 25 +++++++++++++++++++ mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 7 ++++-- mlir/test/Dialect/OpenMP/ops.mlir | 16 ++++++++++++ 5 files changed, 64 insertions(+), 16 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index ab90b4609e855..b04d57ec30e4f 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -557,7 +557,6 @@ static void processHostEvalClauses(lower::AbstractConverter &converter, HostEvalInfo &hostInfo = hostEvalInfo.back(); switch (extractOmpDirective(*ompEval)) { - // Cases where 'teams' and target SPMD clauses might be present. case OMPD_teams_distribute_parallel_do: case OMPD_teams_distribute_parallel_do_simd: cp.processThreadLimit(stmtCtx, hostInfo.ops); @@ -575,18 +574,16 @@ static void processHostEvalClauses(lower::AbstractConverter &converter, cp.processCollapse(loc, eval, hostInfo.ops, hostInfo.iv); break; - // Cases where 'teams' clauses might be present, and target SPMD is - // possible by looking at nested evaluations. case OMPD_teams: cp.processThreadLimit(stmtCtx, hostInfo.ops); [[fallthrough]]; case OMPD_target_teams: cp.processNumTeams(stmtCtx, hostInfo.ops); - processSingleNestedIf( - [](Directive nestedDir) { return topDistributeSet.test(nestedDir); }); + processSingleNestedIf([](Directive nestedDir) { + return topDistributeSet.test(nestedDir) || topLoopSet.test(nestedDir); + }); break; - // Cases where only 'teams' host-evaluated clauses might be present. case OMPD_teams_distribute: case OMPD_teams_distribute_simd: cp.processThreadLimit(stmtCtx, hostInfo.ops); @@ -597,6 +594,16 @@ static void processHostEvalClauses(lower::AbstractConverter &converter, cp.processNumTeams(stmtCtx, hostInfo.ops); break; + case OMPD_teams_loop: + cp.processThreadLimit(stmtCtx, hostInfo.ops); + [[fallthrough]]; + case OMPD_target_teams_loop: + cp.processNumTeams(stmtCtx, hostInfo.ops); + [[fallthrough]]; + case OMPD_loop: + cp.processCollapse(loc, eval, hostInfo.ops, hostInfo.iv); + break; + // Standalone 'target' case. 
case OMPD_target: { processSingleNestedIf( diff --git a/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 b/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 index e1adf5afb0eba..eaf31e3ffb779 100644 --- a/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 +++ b/flang/test/Lower/OpenMP/generic-loop-rewriting.f90 @@ -11,7 +11,7 @@ subroutine target_teams_loop implicit none integer :: x, i - !$omp target teams loop + !$omp teams loop do i = 0, 10 x = x + i end do @@ -22,19 +22,15 @@ subroutine target_teams_loop implicit none integer :: x, i - !$omp target teams loop bind(teams) + !$omp teams loop bind(teams) do i = 0, 10 x = x + i end do end subroutine target_teams_loop !CHECK-LABEL: func.func @_QPtarget_teams_loop -!CHECK: omp.target map_entries( -!CHECK-SAME: %{{.*}} -> %[[I_ARG:[^[:space:]]+]], -!CHECK-SAME: %{{.*}} -> %[[X_ARG:[^[:space:]]+]] : {{.*}}) { - -!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_ARG]] -!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_ARG]] +!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}i"} +!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "{{.*}}x"} !CHECK: omp.teams { @@ -51,6 +47,7 @@ end subroutine target_teams_loop !CHECK-SAME: (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ARG]] !CHECK: hlfir.assign %{{.*}} to %[[I_PRIV_DECL]]#0 : i32, !fir.ref +!CHECK: hlfir.assign %{{.*}} to %[[X_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: } !CHECK: } diff --git a/flang/test/Lower/OpenMP/host-eval.f90 b/flang/test/Lower/OpenMP/host-eval.f90 index 65258c91e5daf..fe5b9597f8620 100644 --- a/flang/test/Lower/OpenMP/host-eval.f90 +++ b/flang/test/Lower/OpenMP/host-eval.f90 @@ -258,3 +258,28 @@ subroutine distribute_simd() !$omp end distribute simd !$omp end teams end subroutine distribute_simd + +! BOTH-LABEL: func.func @_QPloop +subroutine loop() + ! BOTH: omp.target + + ! HOST-SAME: host_eval(%{{.*}} -> %[[LB:.*]], %{{.*}} -> %[[UB:.*]], %{{.*}} -> %[[STEP:.*]] : i32, i32, i32) + + ! DEVICE-NOT: host_eval({{.*}}) + ! DEVICE-SAME: { + + ! BOTH: omp.teams + !$omp target teams + + ! BOTH: omp.parallel + + ! BOTH: omp.distribute + ! BOTH-NEXT: omp.wsloop + ! BOTH-NEXT: omp.loop_nest + + ! HOST-SAME: (%{{.*}}) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) + !$omp loop + do i=1,10 + end do + !$omp end target teams +end subroutine loop diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 882bc4071482f..4ac9f49f12161 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -2058,8 +2058,9 @@ TargetOp::getKernelExecFlags(Operation *capturedOp) { long numWrappers = std::distance(innermostWrapper, wrappers.end()); // Detect Generic-SPMD: target-teams-distribute[-simd]. + // Detect SPMD: target-teams-loop. if (numWrappers == 1) { - if (!isa(innermostWrapper)) + if (!isa(innermostWrapper)) return OMP_TGT_EXEC_MODE_GENERIC; Operation *teamsOp = (*innermostWrapper)->getParentOp(); @@ -2067,7 +2068,9 @@ TargetOp::getKernelExecFlags(Operation *capturedOp) { return OMP_TGT_EXEC_MODE_GENERIC; if (teamsOp->getParentOp() == targetOp.getOperation()) - return OMP_TGT_EXEC_MODE_GENERIC_SPMD; + return isa(innermostWrapper) + ? OMP_TGT_EXEC_MODE_GENERIC_SPMD + : OMP_TGT_EXEC_MODE_SPMD; } // Detect SPMD: target-teams-distribute-parallel-wsloop[-simd]. 
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index a5cf789402726..0a10626cd4877 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -2879,6 +2879,22 @@ func.func @omp_target_host_eval(%x : i32) { } omp.terminator } + + // CHECK: omp.target host_eval(%{{.*}} -> %[[HOST_ARG:.*]] : i32) { + // CHECK: omp.teams { + // CHECK: omp.loop { + // CHECK: omp.loop_nest (%{{.*}}) : i32 = (%[[HOST_ARG]]) to (%[[HOST_ARG]]) step (%[[HOST_ARG]]) { + omp.target host_eval(%x -> %arg0 : i32) { + omp.teams { + omp.loop { + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } + } + omp.terminator + } + omp.terminator + } return } From b61e3874fa97c5ead2c27e8245fe123370a21e81 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 3 Apr 2025 15:15:55 +0100 Subject: [PATCH 0530/1029] Revert "[InstCombine] Match scalable splats in m_ImmConstant (#132522)" This reverts commit df9e5ae5b40c4d245d904a2565e46f5b7ab9c7c8. This is triggering an assertion failure on llvm-test-suite with -enable-vplan-native-path: https://lab.llvm.org/buildbot/#/builders/198/builds/3365 --- llvm/include/llvm/IR/PatternMatch.h | 51 ++++--------------- llvm/test/Transforms/InstCombine/select.ll | 3 +- llvm/test/Transforms/InstCombine/shl-bo.ll | 11 ---- .../InstCombine/shl-twice-constant.ll | 11 ---- llvm/test/Transforms/InstCombine/sub.ll | 4 +- 5 files changed, 14 insertions(+), 66 deletions(-) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 2d27c19e1b85e..b3eeb1d7ba88a 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -858,51 +858,18 @@ inline bind_ty m_BasicBlock(const BasicBlock *&V) { return V; } -// TODO: Remove once UseConstant{Int,FP}ForScalableSplat is enabled by default, -// and use m_Unless(m_ConstantExpr). -struct immconstant_ty { - template static bool isImmConstant(ITy *V) { - if (auto *CV = dyn_cast(V)) { - if (!isa(CV) && !CV->containsConstantExpression()) - return true; - - if (CV->getType()->isVectorTy()) { - if (auto *Splat = CV->getSplatValue(/*AllowPoison=*/true)) { - if (!isa(Splat) && - !Splat->containsConstantExpression()) { - return true; - } - } - } - } - return false; - } -}; - -struct match_immconstant_ty : immconstant_ty { - template bool match(ITy *V) { return isImmConstant(V); } -}; - /// Match an arbitrary immediate Constant and ignore it. -inline match_immconstant_ty m_ImmConstant() { return match_immconstant_ty(); } - -struct bind_immconstant_ty : immconstant_ty { - Constant *&VR; - - bind_immconstant_ty(Constant *&V) : VR(V) {} - - template bool match(ITy *V) { - if (isImmConstant(V)) { - VR = cast(V); - return true; - } - return false; - } -}; +inline match_combine_and, + match_unless> +m_ImmConstant() { + return m_CombineAnd(m_Constant(), m_Unless(m_ConstantExpr())); +} /// Match an immediate Constant, capturing the value if we match. -inline bind_immconstant_ty m_ImmConstant(Constant *&C) { - return bind_immconstant_ty(C); +inline match_combine_and, + match_unless> +m_ImmConstant(Constant *&C) { + return m_CombineAnd(m_Constant(C), m_Unless(m_ConstantExpr())); } /// Match a specified Value*. 
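For context, a minimal sketch of how InstCombine-style code typically consumes this matcher; the helper name is illustrative and not code from this revert. With the revert, `m_ImmConstant` once again means "any `Constant` that is not a `ConstantExpr`", so a scalable-vector splat of a constant expression no longer matches, as the FIXME reinstated in sub.ll below documents:

    // Hedged sketch: bind X and C when V is `add X, C` with C an immediate
    // constant (a Constant that is not a ConstantExpr).
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    static bool matchAddOfImmConstant(Value *V, Value *&X, Constant *&C) {
      return match(V, m_Add(m_Value(X), m_ImmConstant(C)));
    }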
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 3d81b72dd232e..2078b795817f8 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3519,7 +3519,8 @@ define @scalable_sign_bits( %x) { define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( -; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[X:%.*]], splat (i32 56) +; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], splat (i32 1) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[A]], splat (i32 57) ; CHECK-NEXT: ret [[CMP]] ; %a = or %x, splat (i32 1) diff --git a/llvm/test/Transforms/InstCombine/shl-bo.ll b/llvm/test/Transforms/InstCombine/shl-bo.ll index 5ee8716d5d119..c32ac2eacb25a 100644 --- a/llvm/test/Transforms/InstCombine/shl-bo.ll +++ b/llvm/test/Transforms/InstCombine/shl-bo.ll @@ -656,14 +656,3 @@ define <16 x i8> @test_FoldShiftByConstant_CreateAnd(<16 x i8> %in0) { %vshl_n = shl <16 x i8> %tmp, ret <16 x i8> %vshl_n } - -define @test_FoldShiftByConstant_CreateAnd_scalable( %x) { -; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd_scalable( -; CHECK-NEXT: [[TMP1:%.*]] = shl [[X:%.*]], splat (i8 2) -; CHECK-NEXT: [[TMP2:%.*]] = and [[TMP1]], splat (i8 8) -; CHECK-NEXT: ret [[TMP2]] -; - %1 = and %x, splat (i8 2) - %2 = shl %1, splat (i8 2) - ret %2 -} diff --git a/llvm/test/Transforms/InstCombine/shl-twice-constant.ll b/llvm/test/Transforms/InstCombine/shl-twice-constant.ll index 151db29fe3e5f..bbdd7fa3d1c40 100644 --- a/llvm/test/Transforms/InstCombine/shl-twice-constant.ll +++ b/llvm/test/Transforms/InstCombine/shl-twice-constant.ll @@ -14,14 +14,3 @@ define i64 @testfunc() { %shl2 = shl i64 %shl1, ptrtoint (ptr @c to i64) ret i64 %shl2 } - -define @scalable() { -; CHECK-LABEL: @scalable( -; CHECK-NEXT: [[SHL1:%.*]] = shl nuw splat (i64 1), shufflevector ( insertelement ( poison, i64 ptrtoint (ptr @c2 to i64), i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[SHL2:%.*]] = shl [[SHL1]], shufflevector ( insertelement ( poison, i64 ptrtoint (ptr @c to i64), i64 0), poison, zeroinitializer) -; CHECK-NEXT: ret [[SHL2]] -; - %shl1 = shl splat (i64 1), splat (i64 ptrtoint (ptr @c2 to i64)) - %shl2 = shl %shl1, splat (i64 ptrtoint (ptr @c to i64)) - ret %shl2 -} diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index 81ecd8506514e..e89419d1f3838 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -857,9 +857,11 @@ define <2 x i16> @test44vecminval(<2 x i16> %x) { ret <2 x i16> %sub } +; FIXME: This isn't combined to xor as above because the pattern in visitSub +; uses m_ImmConstant which matches Constant but (explicitly) not ConstantExpr. define @test44scalablevecminval( %x) { ; CHECK-LABEL: @test44scalablevecminval( -; CHECK-NEXT: [[SUB:%.*]] = xor [[X:%.*]], splat (i16 -32768) +; CHECK-NEXT: [[SUB:%.*]] = add [[X:%.*]], splat (i16 -32768) ; CHECK-NEXT: ret [[SUB]] ; %sub = sub nsw %x, splat (i16 -32768) From 59074a376099333a2546c4e3a8b30693c8a3ee92 Mon Sep 17 00:00:00 2001 From: gbMattN Date: Thu, 3 Apr 2025 15:27:14 +0100 Subject: [PATCH 0531/1029] =?UTF-8?q?[ASan]=20Add=20metadata=20to=20rename?= =?UTF-8?q?d=20instructions=20so=20ASan=20doesn't=20use=20the=20i=E2=80=A6?= =?UTF-8?q?=20(#119387)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ncorrect name Clang needs variables to be represented with unique names. 
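For example (this mirrors the regression test added by this patch), both of the locals below are spelled `x` in the source, so Clang must rename one of them internally:

    int main() {
      int x;       // outer x
      {
        int x;     // shadows the outer x; renamed internally for uniqueness
        delete &x; // invalid delete of a stack address; ASan reports the frame
      }
    }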
This means that if a variable shadows another, it's given a different name internally to ensure it has a unique name. If ASan tries to use this name when printing an error, it will print the modified unique name, rather than the variable's source code name.

Fixes #47326
---
 clang/lib/CodeGen/CGExpr.cpp                  |  3 +++
 .../shadowed-stack-serialization.cpp          | 12 +++++++++
 .../TestCases/use-after-scope-inlined.cpp     |  2 +-
 .../Instrumentation/AddressSanitizer.cpp      | 26 ++++++++++++++++++-
 4 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 3d3a111f0514a..73020389b5e45 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -143,6 +143,9 @@ llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty,
     Alloca = new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(),
                                   ArraySize, Name, AllocaInsertPt->getIterator());
+  if (SanOpts.Mask & SanitizerKind::Address) {
+    Alloca->addAnnotationMetadata({"alloca_name_altered", Name.str()});
+  }
   if (Allocas) {
     Allocas->Add(Alloca);
   }
diff --git a/compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp b/compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp
new file mode 100644
index 0000000000000..f2706c671c261
--- /dev/null
+++ b/compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp
@@ -0,0 +1,12 @@
+// RUN: %clangxx_asan -O0 %s -o %t
+// RUN: not %run %t 2>&1 | FileCheck %s
+
+int main() {
+  int x;
+  {
+    int x;
+    delete &x;
+  }
+}
+
+// CHECK: [32, 36) 'x'
diff --git a/compiler-rt/test/asan/TestCases/use-after-scope-inlined.cpp b/compiler-rt/test/asan/TestCases/use-after-scope-inlined.cpp
index d0154ef744241..fdb6b9868102d 100644
--- a/compiler-rt/test/asan/TestCases/use-after-scope-inlined.cpp
+++ b/compiler-rt/test/asan/TestCases/use-after-scope-inlined.cpp
@@ -27,5 +27,5 @@ int main(int argc, char *argv[]) {
   // CHECK: Address 0x{{.*}} is located in stack of thread T0 at offset [[OFFSET:[^ ]*]] in frame
   // CHECK: {{.*}} in main
   // CHECK: This frame has
-  // CHECK: {{\[}}[[OFFSET]], {{.*}}) 'x.i' (line [[@LINE-15]])
+  // CHECK: {{\[}}[[OFFSET]], {{.*}}) 'x' (line [[@LINE-15]])
 }
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 62bfb7cec4ff0..fcac686b4cd10 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -3437,6 +3437,29 @@ static void findStoresToUninstrumentedArgAllocas(
   }
 }
 
+static StringRef getAllocaName(AllocaInst *AI) {
+  // Alloca could have been renamed for uniqueness. Its true name will have been
+  // recorded as an annotation.
+ if (AI->hasMetadata(LLVMContext::MD_annotation)) { + MDTuple *AllocaAnnotations = + cast(AI->getMetadata(LLVMContext::MD_annotation)); + for (auto &Annotation : AllocaAnnotations->operands()) { + if (!isa(Annotation)) + continue; + auto AnnotationTuple = cast(Annotation); + for (int Index = 0; Index < AnnotationTuple->getNumOperands(); Index++) { + // All annotations are strings + auto MetadataString = + cast(AnnotationTuple->getOperand(Index)); + if (MetadataString->getString() == "alloca_name_altered") + return cast(AnnotationTuple->getOperand(Index + 1)) + ->getString(); + } + } + } + return AI->getName(); +} + void FunctionStackPoisoner::processStaticAllocas() { if (AllocaVec.empty()) { assert(StaticAllocaPoisonCallVec.empty()); @@ -3477,7 +3500,8 @@ void FunctionStackPoisoner::processStaticAllocas() { SmallVector SVD; SVD.reserve(AllocaVec.size()); for (AllocaInst *AI : AllocaVec) { - ASanStackVariableDescription D = {AI->getName().data(), + StringRef Name = getAllocaName(AI); + ASanStackVariableDescription D = {Name.data(), ASan.getAllocaSizeInBytes(*AI), 0, AI->getAlign().value(), From 008040482b15aa76699e61e59218e92d3786e17a Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 3 Apr 2025 07:36:46 -0700 Subject: [PATCH 0532/1029] [clang] Add SPIR-V to some OpenMP clang tests (#133503) Just to get some more coverage. Some of the behavior might be weird and change in the future, but let's lock down what happens today to at least prevent regressions. Signed-off-by: Sarnie, Nick --- .../test/Headers/openmp_device_math_isnan.cpp | 11 ++++++++ .../declare_variant_construct_codegen_1.c | 6 +++++ clang/test/OpenMP/interop_codegen.cpp | 1 + clang/test/OpenMP/ompx_attributes_codegen.cpp | 26 ++++++++++++++----- ...arget_num_teams_num_threads_attributes.cpp | 2 ++ .../OpenMP/target_parallel_no_exceptions.cpp | 1 + .../OpenMP/target_team_variable_codegen.cpp | 4 +++ clang/test/OpenMP/target_visibility.cpp | 9 ++++--- 8 files changed, 50 insertions(+), 10 deletions(-) diff --git a/clang/test/Headers/openmp_device_math_isnan.cpp b/clang/test/Headers/openmp_device_math_isnan.cpp index a297cfc5b9293..3fd98813f2480 100644 --- a/clang/test/Headers/openmp_device_math_isnan.cpp +++ b/clang/test/Headers/openmp_device_math_isnan.cpp @@ -1,18 +1,25 @@ // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=BOOL_RETURN +// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple spirv64 -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o 
- | FileCheck %s --check-prefix=SPIRV_BOOL_RETURN // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple amdgcn-amd-amdhsa -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD_BOOL_RETURN_SAFE // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast +// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=BOOL_RETURN +// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple spirv64 -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=SPIRV_BOOL_RETURN // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple amdgcn-amd-amdhsa -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast | FileCheck %s --check-prefix=AMD_BOOL_RETURN_FAST // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -DUSE_ISNAN_WITH_INT_RETURN +// RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm-bc %s -o %t-ppc-host.bc -DUSE_ISNAN_WITH_INT_RETURN // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc -DUSE_ISNAN_WITH_INT_RETURN // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=INT_RETURN +// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem 
%S/Inputs/include -fopenmp -triple spirv64 -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=SPIRV_INT_RETURN // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple amdgcn-amd-amdhsa -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=AMD_INT_RETURN_SAFE // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast -DUSE_ISNAN_WITH_INT_RETURN // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc -ffast-math -ffp-contract=fast -DUSE_ISNAN_WITH_INT_RETURN // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=INT_RETURN +// RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple spirv64 -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=SPIRV_INT_RETURN // RUN: %clang_cc1 -x c++ -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -fopenmp -triple amdgcn-amd-amdhsa -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -ffast-math -ffp-contract=fast -DUSE_ISNAN_WITH_INT_RETURN | FileCheck %s --check-prefix=AMD_INT_RETURN_FAST // expected-no-diagnostics @@ -23,14 +30,18 @@ double math(float f, double d) { // INT_RETURN: call noundef i32 @__nv_isnanf(float // AMD_INT_RETURN_SAFE: call i1 @llvm.is.fpclass.f32(float{{.*}}, i32 3) // AMD_INT_RETURN_FAST: sitofp i32 {{.*}} to double + // SPIRV_INT_RETURN: call spir_func noundef i32 @_Z5isnanf(float // BOOL_RETURN: call noundef i32 @__nv_isnanf(float + // SPIRV_BOOL_RETURN: call spir_func noundef zeroext i1 @_Z5isnanf(float // AMD_BOOL_RETURN_SAFE: call i1 @llvm.is.fpclass.f32(float{{.*}}, i32 3) // AMD_BOOL_RETURN_FAST: icmp ne i32 {{.*}}, 0 r += std::isnan(f); // INT_RETURN: call noundef i32 @__nv_isnand(double + // SPIRV_INT_RETURN: call spir_func noundef i32 @_Z5isnand(double // AMD_INT_RETURN_SAFE: call i1 @llvm.is.fpclass.f64(double{{.*}}, i32 3) // AMD_INT_RETURN_FAST: sitofp i32 {{.*}} to double // BOOL_RETURN: call noundef i32 @__nv_isnand(double + // SPIRV_BOOL_RETURN: call spir_func noundef zeroext i1 @_Z5isnand(double // AMD_BOOL_RETURN_SAFE: call i1 
@llvm.is.fpclass.f64(double{{.*}}, i32 3) // AMD_BOOL_RETURN_FAST: icmp ne i32 {{.*}}, 0 r += std::isnan(d); diff --git a/clang/test/OpenMP/declare_variant_construct_codegen_1.c b/clang/test/OpenMP/declare_variant_construct_codegen_1.c index 3cd1ed8dbb320..5e659f05773d1 100644 --- a/clang/test/OpenMP/declare_variant_construct_codegen_1.c +++ b/clang/test/OpenMP/declare_variant_construct_codegen_1.c @@ -7,6 +7,7 @@ // RUN: %clang_cc1 -DCK1 -fopenmp -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=45 %s // RUN: %clang_cc1 -DCK1 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --check-prefix=CK1 // RUN: %clang_cc1 -DCK1 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=CK1 +// RUN: %clang_cc1 -DCK1 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=spirv64 -emit-llvm %s -o - | FileCheck %s --check-prefix=CK1 // RUN: %clang_cc1 -DCK1 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=CK1 // RUN: %clang_cc1 -DCK1 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-pch -o %t %s // RUN: %clang_cc1 -DCK1 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CK1 @@ -15,6 +16,7 @@ // RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=45 %s // RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=spirv64 -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-pch -o %t %s // RUN: %clang_cc1 -DCK1 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" @@ -90,6 +92,7 @@ int test(void) { // RUN: %clang_cc1 -DCK2 -fopenmp -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=45 %s // RUN: %clang_cc1 -DCK2 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --check-prefix=CK2 // RUN: %clang_cc1 -DCK2 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=CK2 +// RUN: %clang_cc1 -DCK2 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=spirv64 -emit-llvm %s -o - | FileCheck %s --check-prefix=CK2 // RUN: %clang_cc1 -DCK2 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=CK2 // RUN: %clang_cc1 -DCK2 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-pch -o %t %s // RUN: %clang_cc1 
-DCK2 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CK2 @@ -195,6 +198,7 @@ void test(int ***v1, int ***v2, int ***v3, int n) { // RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=45 %s // RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=spirv64 -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-pch -o %t %s // RUN: %clang_cc1 -DCK3 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" @@ -252,6 +256,7 @@ void test(void) { // RUN: %clang_cc1 -DCK4 -fopenmp -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=45 %s // RUN: %clang_cc1 -DCK4 -fopenmp -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --check-prefix=CK4 // RUN: %clang_cc1 -DCK4 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=CK4 +// RUN: %clang_cc1 -DCK4 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=spirv64 -emit-llvm %s -o - | FileCheck %s --check-prefix=CK4 // RUN: %clang_cc1 -DCK4 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=CK4 // RUN: %clang_cc1 -DCK4 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-pch -o %t %s // RUN: %clang_cc1 -DCK4 -fopenmp -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix=CK4 @@ -260,6 +265,7 @@ void test(void) { // RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c -triple x86_64-unknown-linux -emit-pch -o %t -fopenmp-version=45 %s // RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c -triple x86_64-unknown-linux -include-pch %t -verify %s -emit-llvm -o - -fopenmp-version=45 | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" +// RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=spirv64 -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" // RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c -triple x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -emit-pch -o %t %s // RUN: %clang_cc1 -DCK4 -fopenmp-simd -x c -triple 
x86_64-unknown-linux -fopenmp-targets=amdgcn-amd-amdhsa -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --implicit-check-not="{{__kmpc|__tgt}}" diff --git a/clang/test/OpenMP/interop_codegen.cpp b/clang/test/OpenMP/interop_codegen.cpp index 31df2f1ba58c5..1d0b56da6ff0b 100644 --- a/clang/test/OpenMP/interop_codegen.cpp +++ b/clang/test/OpenMP/interop_codegen.cpp @@ -1,6 +1,7 @@ // expected-no-diagnostics // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm %s -o - | FileCheck %s // RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s #ifndef HEADER diff --git a/clang/test/OpenMP/ompx_attributes_codegen.cpp b/clang/test/OpenMP/ompx_attributes_codegen.cpp index d68f00a81335c..8b4e38600d41b 100644 --- a/clang/test/OpenMP/ompx_attributes_codegen.cpp +++ b/clang/test/OpenMP/ompx_attributes_codegen.cpp @@ -6,18 +6,24 @@ // RUN: %clang_cc1 -target-cpu gfx900 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -dwarf-version=5 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=AMD // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -dwarf-version=5 -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=NVIDIA +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple spirv64 -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SPIRV +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple spirv64 -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -dwarf-version=5 -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=SPIRV // expected-no-diagnostics // Check that the target attributes are set on the generated kernel void func() { - // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #0 - // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) - // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #4 + // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l28(ptr {{[^,]+}}) #0 + // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l30(ptr {{[^,]+}}) + // AMD: amdgpu_kernel void @__omp_offloading[[HASH:.*]]_l32(ptr {{[^,]+}}) #4 - // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l22(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]] - // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l24(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]] - // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l26(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]] + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l28(ptr {{[^,]+}}) #[[ATTR0:[0-9]+]] + // NVIDIA: ptx_kernel void @__omp_offloading[[HASH:.*]]_l30(ptr {{[^,]+}}) #[[ATTR1:[0-9]+]] + // NVIDIA: ptx_kernel void 
@__omp_offloading[[HASH:.*]]_l32(ptr {{[^,]+}}) #[[ATTR2:[0-9]+]] + + // SPIRV: spir_kernel void @__omp_offloading[[HASH:.*]]_l28(ptr {{[^,]+}}) #0 + // SPIRV: spir_kernel void @__omp_offloading[[HASH:.*]]_l30(ptr {{[^,]+}}) + // SPIRV: spir_kernel void @__omp_offloading[[HASH:.*]]_l32(ptr {{[^,]+}}) #4 #pragma omp target ompx_attribute([[clang::amdgpu_flat_work_group_size(10, 20)]]) {} @@ -28,6 +34,14 @@ void func() { {} } +// SPIRV: attributes #0 +// SPIRV-SAME: "nvvm.maxntid"="20" +// SPIRV-SAME: "omp_target_thread_limit"="20" +// SPIRV: attributes #4 +// SPIRV-SAME: "amdgpu-waves-per-eu"="3,7" +// SPIRV-SAME: "nvvm.maxntid"="17" +// SPIRV-SAME: "omp_target_thread_limit"="17" + // AMD: attributes #0 // AMD-SAME: "amdgpu-flat-work-group-size"="10,20" // AMD-SAME: "omp_target_thread_limit"="20" diff --git a/clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp b/clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp index 613b21ff7f75f..bbbacea2d3fc3 100644 --- a/clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp +++ b/clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp @@ -4,6 +4,8 @@ // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=nvptx64 -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s // RUN: %clang_cc1 -target-cpu sm_80 -fopenmp -x c++ -std=c++11 -triple nvptx64 -fopenmp-targets=nvptx64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple spirv64 -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s // expected-no-diagnostics diff --git a/clang/test/OpenMP/target_parallel_no_exceptions.cpp b/clang/test/OpenMP/target_parallel_no_exceptions.cpp index 82fcc1700b7cc..5fcb389cc1606 100644 --- a/clang/test/OpenMP/target_parallel_no_exceptions.cpp +++ b/clang/test/OpenMP/target_parallel_no_exceptions.cpp @@ -2,6 +2,7 @@ // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHK-EXCEPTION // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHK-EXCEPTION +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHK-EXCEPTION void test_increment() { #pragma omp target diff --git a/clang/test/OpenMP/target_team_variable_codegen.cpp b/clang/test/OpenMP/target_team_variable_codegen.cpp index c7d86edef3074..281036cf703be 100644 --- a/clang/test/OpenMP/target_team_variable_codegen.cpp +++ b/clang/test/OpenMP/target_team_variable_codegen.cpp @@ -9,6 +9,9 @@ // RUN: %clang_cc1 
-verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host-nvidia.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-unknown-unknown -emit-llvm %s -fopenmp-target-debug -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host-nvidia.bc -o - | FileCheck %s --check-prefix=CHECK-NVIDIA
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm-bc %s -o %t-ppc-host-spirv.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-unknown-unknown -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-target-debug -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host-spirv.bc -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+
 // expected-no-diagnostics
 
 #ifndef HEADER
@@ -50,3 +53,4 @@ int main()
 //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 // CHECK-AMD: {{.*}}
 // CHECK-NVIDIA: {{.*}}
+// CHECK-SPIRV: {{.*}}
diff --git a/clang/test/OpenMP/target_visibility.cpp b/clang/test/OpenMP/target_visibility.cpp
index 2554f653170b9..b30f4e7ffd3c5 100644
--- a/clang/test/OpenMP/target_visibility.cpp
+++ b/clang/test/OpenMP/target_visibility.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
 // RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -verify -fopenmp -x c++ -triple spirv64 -fopenmp-targets=spirv64 -emit-llvm %s -fopenmp-is-target-device -o - | FileCheck %s
 
 // expected-no-diagnostics
 
@@ -29,7 +30,7 @@ void B::sbar() { A::sfoo(); }
 
 // CHECK-DAG: @x = hidden{{.*}} constant i32 0
 // CHECK-DAG: @y = protected{{.*}} i32 0
-// CHECK-DAG: define hidden void @_ZN1B4sbarEv()
-// CHECK-DAG: define linkonce_odr hidden void @_ZN1A4sfooEv()
-// CHECK-DAG: define hidden void @_ZN1B3barEv(
-// CHECK-DAG: define linkonce_odr hidden void @_ZN1A3fooEv(
+// CHECK-DAG: define hidden{{.*}} void @_ZN1B4sbarEv()
+// CHECK-DAG: define linkonce_odr hidden{{.*}} void @_ZN1A4sfooEv()
+// CHECK-DAG: define hidden{{.*}} void @_ZN1B3barEv(
+// CHECK-DAG: define linkonce_odr hidden{{.*}}void @_ZN1A3fooEv(

From b8fc288c46abeea8d330fabdf6ab1ebf1c5b283d Mon Sep 17 00:00:00 2001
From: Stephen Tozer
Date: Thu, 3 Apr 2025 15:37:43 +0100
Subject: [PATCH 0533/1029] [Dexter] Replace clang with clang++ in various
 cross-project tests (#65987)

This patch replaces invocations of clang with clang++ for a set of C++ files
in the dexter cross-project tests.

As a small additional change, this patch removes -lstdc++ from a test that
did not appear to require it.
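A representative change, taken from the aggregate-indirect-arg.cpp diff below; the clang++ driver adds the C++ standard library to the link line itself, which is why the explicit -lstdc++ becomes redundant:

    // Before: built with the C driver, manually linking the C++ runtime.
    // RUN: %clang -std=gnu++11 -O0 -g -lstdc++ %s -o %t
    // After: built with the C++ driver; no explicit -lstdc++ needed.
    // RUN: %clang++ -std=gnu++11 -O0 -g %s -o %t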
--- .../dexter-tests/aggregate-indirect-arg.cpp | 2 +- .../dexter-tests/asan-deque.cpp | 2 +- .../debuginfo-tests/dexter-tests/ctor.cpp | 2 +- .../dexter-tests/deferred_globals.cpp | 2 +- .../dexter-tests/namespace.cpp | 2 +- .../dexter-tests/nrvo-string.cpp | 4 +- .../dexter-tests/optnone-fastmath.cpp | 4 +- .../dexter-tests/optnone-loops.cpp | 2 +- .../commands/penalty/dex_declare_file.cpp | 2 +- .../commands/penalty/expect_program_state.cpp | 2 +- .../commands/penalty/expect_step_kinds.cpp | 2 +- .../commands/penalty/expect_step_order.cpp | 2 +- .../commands/penalty/expect_watch_type.cpp | 2 +- .../commands/penalty/expect_watch_value.cpp | 2 +- .../penalty/float_range_out_range.cpp | 2 +- .../penalty/float_range_zero_nonmatch.cpp | 2 +- .../commands/penalty/missing_dex_address.cpp | 2 +- .../commands/penalty/unreachable.cpp | 2 +- .../penalty/unreachable_line_range.cpp | 2 +- .../commands/penalty/unreachable_on_line.cpp | 2 +- .../commands/perfect/command_line.c | 2 +- .../dex_declare_address/address_after_ref.cpp | 2 +- .../dex_declare_address/address_hit_count.cpp | 2 +- .../expression_address.cpp | 2 +- .../dex_declare_address/identical_address.cpp | 2 +- .../dex_declare_address/multiple_address.cpp | 2 +- .../dex_declare_address/offset_address.cpp | 2 +- .../dex_declare_address/self_comparison.cpp | 2 +- .../dex_declare_file/dex_and_source/test.cpp | 2 +- .../dex_finish_test/default_conditional.cpp | 2 +- .../default_conditional_hit_count.cpp | 2 +- .../dex_finish_test/default_hit_count.cpp | 2 +- .../dex_finish_test/default_simple.cpp | 2 +- .../limit_steps_conditional.cpp | 2 +- .../limit_steps_conditional_hit_count.cpp | 2 +- .../dex_finish_test/limit_steps_hit_count.cpp | 2 +- .../dex_finish_test/limit_steps_simple.cpp | 2 +- .../commands/perfect/expect_program_state.cpp | 2 +- .../perfect/expect_step_kind/direction.cpp | 2 +- .../perfect/expect_step_kind/func.cpp | 2 +- .../expect_step_kind/func_external.cpp | 2 +- .../perfect/expect_step_kind/recursive.cpp | 2 +- .../perfect/expect_step_kind/small_loop.cpp | 2 +- .../commands/perfect/expect_step_order.cpp | 2 +- .../commands/perfect/expect_watch_type.cpp | 2 +- .../commands/perfect/expect_watch_value.cpp | 2 +- .../float_range_multiple.cpp | 2 +- .../float_range_watch/float_range_no_arg.cpp | 2 +- .../float_range_watch/float_range_small.cpp | 2 +- .../float_range_zero_match.cpp | 2 +- .../perfect/limit_steps/hit_count.cpp | 2 +- .../limit_steps_check_json_step_count.cpp | 2 +- .../limit_steps/limit_steps_expect_loop.cpp | 2 +- .../limit_steps/limit_steps_expect_value.cpp | 2 +- .../limit_steps/limit_steps_line_mismatch.cpp | 2 +- .../limit_steps_overlapping_ranges.cpp | 2 +- .../limit_steps_same_line_conditional.cpp | 2 +- .../perfect/limit_steps/unconditional.cpp | 2 +- .../commands/perfect/unreachable.cpp | 2 +- .../perfect/unreachable_not_cmd_lineno.cpp | 2 +- .../commands/perfect/unreachable_on_line.cpp | 2 +- .../subtools/test/address_printing.cpp | 2 +- .../subtools/test/err_bad_label_ref.cpp | 2 +- .../subtools/test/err_duplicate_address.cpp | 2 +- .../subtools/test/err_duplicate_label.cpp | 2 +- .../feature_tests/subtools/test/err_paren.cpp | 2 +- .../subtools/test/err_paren_mline.cpp | 2 +- .../subtools/test/err_syntax.cpp | 2 +- .../subtools/test/err_syntax_mline.cpp | 2 +- .../feature_tests/subtools/test/err_type.cpp | 2 +- .../subtools/test/err_type_mline.cpp | 2 +- .../subtools/test/err_undeclared_addr.cpp | 2 +- .../subtools/test/label_another_line.cpp | 2 +- .../subtools/test/label_offset.cpp | 2 +- 
.../subtools/test/source-root-dir.cpp | 2 +- .../subtools/test/target_run_args.c | 2 +- .../test/target_run_args_with_command.c | 2 +- .../dexter/feature_tests/subtools/view.cpp | 2 +- cross-project-tests/lit.cfg.py | 48 +++++++++++++++---- 79 files changed, 118 insertions(+), 90 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/aggregate-indirect-arg.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/aggregate-indirect-arg.cpp index 801e4851cfa81..f20b7ce1d3e4d 100644 --- a/cross-project-tests/debuginfo-tests/dexter-tests/aggregate-indirect-arg.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/aggregate-indirect-arg.cpp @@ -1,7 +1,7 @@ // REQUIRES: lldb // UNSUPPORTED: system-windows // -// RUN: %clang -std=gnu++11 -O0 -g -lstdc++ %s -o %t +// RUN: %clang++ -std=gnu++11 -O0 -g %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s // Radar 8945514 diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/asan-deque.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/asan-deque.cpp index cd1ca21957d31..08540145dc4ee 100644 --- a/cross-project-tests/debuginfo-tests/dexter-tests/asan-deque.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/asan-deque.cpp @@ -8,7 +8,7 @@ // lldb-8, even outside of dexter, will sometimes trigger an asan fault in // the debugged process and generally freak out. -// RUN: %clang -std=gnu++11 -O1 -glldb -fsanitize=address -arch x86_64 %s -o %t +// RUN: %clang++ -std=gnu++11 -O1 -glldb -fsanitize=address -arch x86_64 %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s #include diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/ctor.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/ctor.cpp index a9b3e38692183..48482ceb31b95 100644 --- a/cross-project-tests/debuginfo-tests/dexter-tests/ctor.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/ctor.cpp @@ -1,7 +1,7 @@ // REQUIRES: lldb // UNSUPPORTED: system-windows // -// RUN: %clang -std=gnu++11 -O0 -glldb %s -o %t +// RUN: %clang++ -std=gnu++11 -O0 -glldb %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/deferred_globals.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/deferred_globals.cpp index d78c7293cb89c..5954f5297be0c 100644 --- a/cross-project-tests/debuginfo-tests/dexter-tests/deferred_globals.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/deferred_globals.cpp @@ -4,7 +4,7 @@ // REQUIRES: lldb // UNSUPPORTED: system-windows -// RUN: %clang -std=gnu++11 -O0 -g %s -o %t +// RUN: %clang++ -std=gnu++11 -O0 -g %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -v -- %s diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/namespace.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/namespace.cpp index c94939157ff7e..e6709cc807db6 100644 --- a/cross-project-tests/debuginfo-tests/dexter-tests/namespace.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/namespace.cpp @@ -5,7 +5,7 @@ // REQUIRES: lldb // UNSUPPORTED: system-windows -// RUN: %clang -g -O0 %s -o %t +// RUN: %clang++ -g -O0 %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -v -- %s diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/nrvo-string.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/nrvo-string.cpp index 822e832f9a16a..5df190a07a4e9 100644 --- 
a/cross-project-tests/debuginfo-tests/dexter-tests/nrvo-string.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/nrvo-string.cpp @@ -7,11 +7,11 @@ // Zorg configures the ASAN stage2 bots to not build the asan // compiler-rt. Only run this test on non-asanified configurations. // -// RUN: %clang -std=gnu++11 -O0 -glldb -fno-exceptions %s -o %t +// RUN: %clang++ -std=gnu++11 -O0 -glldb -fno-exceptions %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s // -// RUN: %clang -std=gnu++11 -O1 -glldb -fno-exceptions %s -o %t +// RUN: %clang++ -std=gnu++11 -O1 -glldb -fno-exceptions %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s // diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/optnone-fastmath.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/optnone-fastmath.cpp index 9f47f6a5e0cc4..6053488dc6808 100644 --- a/cross-project-tests/debuginfo-tests/dexter-tests/optnone-fastmath.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/optnone-fastmath.cpp @@ -1,7 +1,7 @@ -// RUN: %clang -std=gnu++11 -O2 -ffast-math -g %s -o %t +// RUN: %clang++ -std=gnu++11 -O2 -ffast-math -g %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s -// RUN: %clang -std=gnu++11 -O0 -ffast-math -g %s -o %t +// RUN: %clang++ -std=gnu++11 -O0 -ffast-math -g %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s diff --git a/cross-project-tests/debuginfo-tests/dexter-tests/optnone-loops.cpp b/cross-project-tests/debuginfo-tests/dexter-tests/optnone-loops.cpp index 32395342bddd1..230998c6d4e0d 100644 --- a/cross-project-tests/debuginfo-tests/dexter-tests/optnone-loops.cpp +++ b/cross-project-tests/debuginfo-tests/dexter-tests/optnone-loops.cpp @@ -5,7 +5,7 @@ // UNSUPPORTED: system-windows // UNSUPPORTED: system-darwin -// RUN: %clang -std=gnu++11 -O2 -g %s -o %t +// RUN: %clang++ -std=gnu++11 -O2 -g %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t --debugger 'lldb' -- %s diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/dex_declare_file.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/dex_declare_file.cpp index adad78040947d..4ee8effda39e4 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/dex_declare_file.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/dex_declare_file.cpp @@ -5,7 +5,7 @@ // UNSUPPORTED: system-darwin // // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: dex_declare_file.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_program_state.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_program_state.cpp index db81de2e9853c..934b4e978073f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_program_state.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_program_state.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_program_state.cpp: diff --git 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_kinds.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_kinds.cpp index bab2de642a1c3..b059c993ba8bf 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_kinds.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_kinds.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_step_kinds.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_order.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_order.cpp index 76296caf13eca..391153ba18291 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_order.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_step_order.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_step_order.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_type.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_type.cpp index 9aef64c813427..507f821cfd08c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_type.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_type.cpp @@ -9,7 +9,7 @@ // TODO: Reduce this test's coverage and be more specific about // expected behaviour. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_watch_type.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_value.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_value.cpp index ca8f862a5eda6..4cd2b390623c1 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_value.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/expect_watch_value.cpp @@ -5,7 +5,7 @@ // UNSUPPORTED: system-darwin // // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_watch_value.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_out_range.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_out_range.cpp index ee9b7b5669c85..466dff34efc6f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_out_range.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_out_range.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: float_range_out_range.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_zero_nonmatch.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_zero_nonmatch.cpp index 89108789bdb91..e4f9e116d93fd 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_zero_nonmatch.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/float_range_zero_nonmatch.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: float_range_zero_nonmatch.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/missing_dex_address.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/missing_dex_address.cpp index f07a43b91cbcb..c5803dd11d134 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/missing_dex_address.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/missing_dex_address.cpp @@ -5,7 +5,7 @@ // The dbgeng driver doesn't support \DexDeclareAddress yet. 
// UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: missing_dex_address.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable.cpp index ea98e431dced7..ddccf68caf9e4 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable.cpp @@ -5,7 +5,7 @@ // UNSUPPORTED: system-darwin // // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: unreachable.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_line_range.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_line_range.cpp index 3038277d26e32..b6925eb4e6c20 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_line_range.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_line_range.cpp @@ -5,7 +5,7 @@ // UNSUPPORTED: system-darwin // // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: unreachable_line_range.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_on_line.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_on_line.cpp index f91a2611cecaf..ba741a48850b7 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_on_line.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/penalty/unreachable_on_line.cpp @@ -5,7 +5,7 @@ // UNSUPPORTED: system-darwin // // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: unreachable_on_line.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/command_line.c b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/command_line.c index d4fae47fdd6cf..b7f29abb86575 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/command_line.c +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/command_line.c @@ -1,7 +1,7 @@ // The dbgeng driver doesn't support \DexCommandLine yet. 
// UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_c_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: command_line.c: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_after_ref.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_after_ref.cpp index 6b461686f3930..67afd6687c3dc 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_after_ref.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_after_ref.cpp @@ -2,7 +2,7 @@ // Test that a \DexDeclareAddress value can have its value defined after // the first reference to that value. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: address_after_ref.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_hit_count.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_hit_count.cpp index 1bb995c942201..9fd1b24774285 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_hit_count.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/address_hit_count.cpp @@ -4,7 +4,7 @@ // expression after the target line has been stepped on a given number of // times. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: address_hit_count.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/expression_address.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/expression_address.cpp index 52a0f8891bc2e..2bb83850e2a16 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/expression_address.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/expression_address.cpp @@ -2,7 +2,7 @@ // Test that a \DexDeclareAddress value can be used to compare the // addresses of two local variables that refer to the same address. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expression_address.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/identical_address.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/identical_address.cpp index 71d8f9944be3b..edcda2c94a441 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/identical_address.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/identical_address.cpp @@ -2,7 +2,7 @@ // Test that a \DexDeclareAddress value can be used to compare two equal // pointer variables. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: identical_address.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/multiple_address.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/multiple_address.cpp index 487d95399dcf2..66dcdb3ff42ab 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/multiple_address.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/multiple_address.cpp @@ -2,7 +2,7 @@ // Test that multiple \DexDeclareAddress references that point to different // addresses can be used within a single \DexExpectWatchValue. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: multiple_address.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/offset_address.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/offset_address.cpp index ab60c254462cb..b6e4f2cb535ad 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/offset_address.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/offset_address.cpp @@ -2,7 +2,7 @@ // Test that a \DexDeclareAddress value can be used to compare two pointer // variables that have a fixed offset between them. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: offset_address.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/self_comparison.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/self_comparison.cpp index 5c54723446439..295a05556d287 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/self_comparison.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_address/self_comparison.cpp @@ -2,7 +2,7 @@ // Test that a \DexDeclareAddress value can be used to check the change in // value of a variable over time, relative to its initial value. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: self_comparison.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/dex_and_source/test.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/dex_and_source/test.cpp index 71df65215a342..9e720ff8aa557 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/dex_and_source/test.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/dex_and_source/test.cpp @@ -6,7 +6,7 @@ // UNSUPPORTED: system-darwin // // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: dex_and_source diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional.cpp index 8138b894af3aa..f419577e2d02e 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional.cpp @@ -6,7 +6,7 @@ // condition (x == 5) is satisfied. // Tests using the default controller (no \DexLimitSteps). // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: default_conditional.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional_hit_count.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional_hit_count.cpp index bbf6fcfd26259..cd18523468267 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional_hit_count.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_conditional_hit_count.cpp @@ -7,7 +7,7 @@ // given number of times. // Tests using the default controller (no \DexLimitSteps). // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: default_conditional_hit_count.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_hit_count.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_hit_count.cpp index efcfcff8db904..efc22a6d7d816 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_hit_count.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_hit_count.cpp @@ -4,7 +4,7 @@ // specific number of times. // Tests using the default controller (no \DexLimitSteps). 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: default_hit_count.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_simple.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_simple.cpp index 3e3edbf66ff2a..ee4cf1decf285 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_simple.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/default_simple.cpp @@ -4,7 +4,7 @@ // is stepped on. // Tests using the default controller (no \DexLimitSteps). // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: default_simple.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional.cpp index 0473edee1dcd1..253e984e51c4b 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional.cpp @@ -7,7 +7,7 @@ // The dbgeng driver doesn't support \DexLimitSteps yet. // UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_conditional.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional_hit_count.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional_hit_count.cpp index 2869b70c46c1e..ac64d49d5392d 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional_hit_count.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_conditional_hit_count.cpp @@ -8,7 +8,7 @@ // The dbgeng driver doesn't support \DexLimitSteps yet. // UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_conditional_hit_count.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_hit_count.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_hit_count.cpp index 5928d43a9b258..ce9432f47bfaa 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_hit_count.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_hit_count.cpp @@ -7,7 +7,7 @@ // The dbgeng driver doesn't support \DexLimitSteps yet. 
// UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_hit_count.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_simple.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_simple.cpp index b3d61c500156e..27505d5a3f5a0 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_simple.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_finish_test/limit_steps_simple.cpp @@ -7,7 +7,7 @@ // The dbgeng driver doesn't support \DexLimitSteps yet. // UNSUPPORTED: system-windows, system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_simple.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_program_state.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_program_state.cpp index 6f822f7451eb9..58bdf40d9e112 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_program_state.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_program_state.cpp @@ -5,7 +5,7 @@ // UNSUPPORTED: system-darwin // // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_program_state.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/direction.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/direction.cpp index 5d8f45eb66ea0..c155cfcad5c3c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/direction.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/direction.cpp @@ -10,7 +10,7 @@ // TODO: The dbgeng debugger does not support column step reporting at present. 
// XFAIL: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: direction.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func.cpp index 2126bd8776dd3..0ea8875a9e21a 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func.cpp @@ -5,7 +5,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: func.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func_external.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func_external.cpp index 41e7695cbdffd..eeb1e8cf7d26c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func_external.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/func_external.cpp @@ -8,7 +8,7 @@ // This fails right now on my linux and windows machine, needs examining as to // why. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: func_external.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/recursive.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/recursive.cpp index 432b35181d899..3642c9baf7a4f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/recursive.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/recursive.cpp @@ -5,7 +5,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: recursive.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/small_loop.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/small_loop.cpp index d47058e67584e..18859bd50089c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/small_loop.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_kind/small_loop.cpp @@ -5,7 +5,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: small_loop.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_order.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_order.cpp index 48735911d92d1..22883a7ce0c66 100644 --- 
a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_order.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_step_order.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_step_order.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_type.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_type.cpp index d59f7206cc9b5..ddc5e7cce6e98 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_type.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_type.cpp @@ -8,7 +8,7 @@ // in the same manner as LLDB. // XFAIL: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_watch_type.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_value.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_value.cpp index 9286a2c704d5c..593488b809959 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_value.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/expect_watch_value.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: expect_watch_value.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_multiple.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_multiple.cpp index 46610b2cab6d4..0ef1724c97eb5 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_multiple.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_multiple.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: float_range_multiple.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp index 320a400d9264e..06454880b0fe9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_no_arg.cpp @@ -7,7 +7,7 @@ // work for both dbgeng and lldb, which output floats differently. 
// UNSUPPORTED: system-darwin, system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: float_range_no_arg.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_small.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_small.cpp index 8034a3702636c..2e105a4ef3fbf 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_small.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_small.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: float_range_small.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_zero_match.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_zero_match.cpp index 0d55ab4405b6e..3c1926fd3cede 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_zero_match.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/float_range_watch/float_range_zero_match.cpp @@ -3,7 +3,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: float_range_zero_match.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/hit_count.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/hit_count.cpp index 50cc3e21a85b7..9f91145eaa3a5 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/hit_count.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/hit_count.cpp @@ -2,7 +2,7 @@ // Test that \DexLimitSteps keyword argument hit_count correctly limits // the number of times the command can trigger. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: hit_count.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_check_json_step_count.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_check_json_step_count.cpp index a89d316744781..ce28423b0c57f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_check_json_step_count.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_check_json_step_count.cpp @@ -1,7 +1,7 @@ // Purpose: // Check number of step lines are correctly reported in json output. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t --verbose -- %s | FileCheck %s // CHECK: limit_steps_check_json_step_count.cpp // CHECK: ## BEGIN ## diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_loop.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_loop.cpp index b60c5cafb8fb8..15c8c87d906fb 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_loop.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_loop.cpp @@ -2,7 +2,7 @@ // Check the DexLimit steps only gathers step info for 2 iterations of a // for loop. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_expect_loop.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_value.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_value.cpp index 76c6be665aaaf..03b01cf574c1e 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_value.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_expect_value.cpp @@ -1,7 +1,7 @@ // Purpose: // Ensure that limited stepping breaks for all expected values. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_expect_value.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_line_mismatch.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_line_mismatch.cpp index 863782a5948b8..8b242c4e288f6 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_line_mismatch.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_line_mismatch.cpp @@ -3,7 +3,7 @@ // doesn't exist. This can happen due to optimisations or label is on an // empty line. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_line_mismatch.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_overlapping_ranges.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_overlapping_ranges.cpp index a250e5377f3c8..a99f6bbbe545a 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_overlapping_ranges.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_overlapping_ranges.cpp @@ -1,7 +1,7 @@ // Purpose: // Ensure that multiple overlapping \DexLimitSteps ranges do not interfere. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_overlapping_ranges.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_same_line_conditional.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_same_line_conditional.cpp index bc1a690111c0e..6613ad2355c7c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_same_line_conditional.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/limit_steps_same_line_conditional.cpp @@ -1,7 +1,7 @@ // Purpose: // Test that LimitStep commands can exist on the same from line. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: limit_steps_same_line_conditional.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/unconditional.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/unconditional.cpp index d7bf8ca4097ff..3a145eaa400a5 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/unconditional.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/limit_steps/unconditional.cpp @@ -2,7 +2,7 @@ // Test that \DexLimitSteps can be used without a condition (i.e. the // breakpoint range is set any time from_line is stepped on). // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: unconditional.cpp diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable.cpp index 5e754c0cf5cab..7f42c50c1c791 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: unreachable.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_not_cmd_lineno.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_not_cmd_lineno.cpp index cd98561d2a63a..58ab2e6734057 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_not_cmd_lineno.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_not_cmd_lineno.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: unreachable_not_cmd_lineno.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_on_line.cpp 
b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_on_line.cpp index a59b8b2802f6a..f414729855177 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_on_line.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/unreachable_on_line.cpp @@ -4,7 +4,7 @@ // // UNSUPPORTED: system-darwin // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: unreachable_on_line.cpp: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/address_printing.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/address_printing.cpp index cdde328511196..133679ee6950f 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/address_printing.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/address_printing.cpp @@ -11,7 +11,7 @@ // The dbgeng driver doesn't support \DexLimitSteps yet. // UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -v -- %s | FileCheck %s // CHECK: Resolved Addresses: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_bad_label_ref.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_bad_label_ref.cpp index e1a2791e50c13..0921d7991bb6e 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_bad_label_ref.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_bad_label_ref.cpp @@ -1,7 +1,7 @@ // Purpose: // Check that referencing an undefined label gives a useful error message. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -v -- %s | FileCheck %s --match-full-lines // // CHECK: parser error:{{.*}}err_bad_label_ref.cpp(15): Unresolved label: 'label_does_not_exist' diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_address.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_address.cpp index 2120550bb81ff..6839360a7f3bc 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_address.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_address.cpp @@ -1,7 +1,7 @@ // Purpose: // Check that declaring duplicate addresses gives a useful error message. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -v -- %s | FileCheck %s --match-full-lines diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_label.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_label.cpp index d8cef2be3322b..89d6fb85d8d60 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_label.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_duplicate_label.cpp @@ -1,7 +1,7 @@ // Purpose: // Check that defining duplicate labels gives a useful error message. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -v -- %s | FileCheck %s --match-full-lines // // CHECK: parser error:{{.*}}err_duplicate_label.cpp(12): Found duplicate line label: 'oops' diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren.cpp index bac1baec259da..5a35b3a512bef 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren.cpp @@ -7,7 +7,7 @@ // Note: Despite using 'lldb' as the debugger, lldb is not actually required // as the test should finish before lldb would be invoked. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_base test --binary %t --debugger 'lldb' \ // RUN: -v -- %s | FileCheck %s --match-full-lines --strict-whitespace // diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren_mline.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren_mline.cpp index d48d0a0fcc3bd..0044b3b6eff01 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren_mline.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_paren_mline.cpp @@ -7,7 +7,7 @@ // Note: Despite using 'lldb' as the debugger, lldb is not actually required // as the test should finish before lldb would be invoked. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_base test --binary %t --debugger "lldb" \ // RUN: -v -- %s | FileCheck %s --match-full-lines --strict-whitespace // diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax.cpp index 732baef66701a..5992025442132 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax.cpp @@ -7,7 +7,7 @@ // Note: Despite using 'lldb' as the debugger, lldb is not actually required // as the test should finish before lldb would be invoked. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_base test --binary %t --debugger "lldb" \ // RUN: -v -- %s | FileCheck %s --match-full-lines --strict-whitespace // diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_mline.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_mline.cpp index 3ede5e90caaf8..71b23a2a3a8b9 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_mline.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_mline.cpp @@ -7,7 +7,7 @@ // Note: Despite using 'lldb' as the debugger, lldb is not actually required // as the test should finish before lldb would be invoked. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_base test --binary %t --debugger "lldb" \ // RUN: -v -- %s | FileCheck %s --match-full-lines --strict-whitespace // diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type.cpp index 01c19330a4f52..264515496f1c1 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type.cpp @@ -7,7 +7,7 @@ // Note: Despite using 'lldb' as the debugger, lldb is not actually required // as the test should finish before lldb would be invoked. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_base test --binary %t --debugger "lldb" \ // RUN: -v -- %s | FileCheck %s --match-full-lines --strict-whitespace // diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type_mline.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type_mline.cpp index 38a163152007f..5cbcd2d88808e 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type_mline.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_type_mline.cpp @@ -7,7 +7,7 @@ // Note: Despite using 'lldb' as the debugger, lldb is not actually required // as the test should finish before lldb would be invoked. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_base test --binary %t --debugger "lldb" \ // RUN: -v -- %s | FileCheck %s --match-full-lines --strict-whitespace // diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_undeclared_addr.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_undeclared_addr.cpp index 7939c9ac98a3a..66a5e46254579 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_undeclared_addr.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_undeclared_addr.cpp @@ -1,7 +1,7 @@ // Purpose: // Check that using an undeclared address gives a useful error message. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_regression_test_run --binary %t -v -- %s | FileCheck %s --match-full-lines diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_another_line.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_another_line.cpp index 0d2fc0b8821e8..a28758a8fb53b 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_another_line.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_another_line.cpp @@ -2,7 +2,7 @@ // Check that the optional keyword argument 'on_line' makes a \DexLabel label // that line instead of the line the command is found on. 
// -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -- %s | FileCheck %s // CHECK: label_another_line.cpp: (1.0000) diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_offset.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_offset.cpp index 138979269c39b..334b6a565ee2e 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_offset.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/label_offset.cpp @@ -1,7 +1,7 @@ // Purpose: // Check that we can use label-relative line numbers. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t -v -- %s | FileCheck %s // // CHECK: label_offset.cpp: (1.0000) diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/source-root-dir.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/source-root-dir.cpp index 36db3eb3b2750..af24c5d8e572c 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/source-root-dir.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/source-root-dir.cpp @@ -1,6 +1,6 @@ // This test started failing recently for unknown reasons. // XFAIL:* -// RUN: %dexter_regression_test_build \ +// RUN: %dexter_regression_test_cxx_build \ // RUN: -fdebug-prefix-map=%S=/changed %s -o %t // RUN: %dexter --fail-lt 1.0 -w \ // RUN: --binary %t \ diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args.c b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args.c index 7efd3c08c6237..d8a253e70f6b3 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args.c +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args.c @@ -1,7 +1,7 @@ // The dbgeng driver doesn't support --target-run-args yet. // UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_c_build %s -o %t // RUN: %dexter_regression_test_run --binary %t --target-run-args "a b 'c d'" -- %s | FileCheck %s // CHECK: target_run_args.c: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args_with_command.c b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args_with_command.c index 34ce63fa033ec..1f8ed5f189936 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args_with_command.c +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/target_run_args_with_command.c @@ -1,7 +1,7 @@ // The dbgeng driver doesn't support --target-run-args yet. 
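The tests above all follow the same two-step pattern: build the test binary with one substitution, then drive it under the debugger with another. A minimal sketch of how such a RUN pair expands after lit substitution; the long substitutions mirror fragments visible in the lit.cfg.py hunk later in this patch, while %s and %t are lit's standard source-file and temporary-file placeholders, given hypothetical values here, and the exact dexter command line is illustrative.

# Sketch of lit substitution for a dexter regression RUN pair (illustrative).
build_cmd = "%dexter_regression_test_c_build %s -o %t"
run_cmd = "%dexter_regression_test_run --binary %t -- %s"

substitutions = {
    "%dexter_regression_test_c_build": "clang -O0 -glldb -std=gnu11",
    "%dexter_regression_test_run": "dexter test --fail-lt 1.0 -w --debugger lldb",
    "%s": "target_run_args.c",    # the test source file (hypothetical)
    "%t": "target_run_args.tmp",  # lit's per-test temporary path (hypothetical)
}
for placeholder, value in substitutions.items():
    build_cmd = build_cmd.replace(placeholder, value)
    run_cmd = run_cmd.replace(placeholder, value)

print(build_cmd)  # clang -O0 -glldb -std=gnu11 target_run_args.c -o target_run_args.tmp
print(run_cmd)    # dexter test --fail-lt 1.0 -w --debugger lldb --binary target_run_args.tmp -- target_run_args.c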
// UNSUPPORTED: system-windows // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_c_build %s -o %t // RUN: %dexter_regression_test_run --binary %t --target-run-args "a b 'c d'" -- %s | FileCheck %s // CHECK: target_run_args_with_command.c: diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/view.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/view.cpp index 9373c0447ac70..58e3644be5620 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/view.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/view.cpp @@ -1,7 +1,7 @@ // Purpose: // Check the `view` subtool works with typical inputs. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: %dexter_regression_test_run --binary %t --results %t.results -- %s // // RUN: %dexter_base view %t.results/view.cpp.dextIR | FileCheck %s diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index ccd3d01023c9a..b35c643ac898c 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -131,18 +131,31 @@ def configure_dexter_substitutions(): if platform.system() == "Windows": # The Windows builder script uses lld. dependencies = ["clang", "lld-link"] - dexter_regression_test_builder = "clang-cl" + dexter_regression_test_c_builder = "clang-cl" + dexter_regression_test_cxx_builder = "clang-cl" dexter_regression_test_debugger = "dbgeng" - dexter_regression_test_flags = "/Zi /Od" + dexter_regression_test_c_flags = "/Zi /Od" + dexter_regression_test_cxx_flags = "/Zi /Od" + dexter_regression_test_additional_flags = "" else: # Use lldb as the debugger on non-Windows platforms. dependencies = ["clang", "lldb"] - dexter_regression_test_builder = "clang++" + dexter_regression_test_c_builder = "clang" + dexter_regression_test_cxx_builder = "clang++" dexter_regression_test_debugger = "lldb" - dexter_regression_test_flags = "-O0 -glldb -std=gnu++11" + dexter_regression_test_c_flags = "-O0 -glldb -std=gnu11" + dexter_regression_test_cxx_flags = "-O0 -glldb -std=gnu++11" + dexter_regression_test_additional_flags = '--lldb-executable "{}"'.format( + lldb_path + ) tools.append( - ToolSubst("%dexter_regression_test_builder", dexter_regression_test_builder) + ToolSubst("%dexter_regression_test_c_builder", dexter_regression_test_c_builder) + ) + tools.append( + ToolSubst( + "%dexter_regression_test_cxx_builder", dexter_regression_test_cxx_builder + ) ) tools.append( ToolSubst("%dexter_regression_test_debugger", dexter_regression_test_debugger) @@ -151,7 +164,10 @@ def configure_dexter_substitutions(): # regression tests we use clang to drive the linker, and so all flags will be # passed in a single command. tools.append( - ToolSubst("%dexter_regression_test_flags", dexter_regression_test_flags) + ToolSubst("%dexter_regression_test_c_flags", dexter_regression_test_c_flags) + ) + tools.append( + ToolSubst("%dexter_regression_test_cxx_flags", dexter_regression_test_cxx_flags) ) # Typical command would take the form: @@ -165,18 +181,30 @@ def configure_dexter_substitutions(): "--fail-lt 1.0 -w", "--debugger", dexter_regression_test_debugger, + dexter_regression_test_additional_flags, ] ) tools.append(ToolSubst("%dexter_regression_test_run", dexter_regression_test_run)) # Include build flags for %dexter_regression_test. 
- dexter_regression_test_build = " ".join( + dexter_regression_test_c_build = " ".join( [ - dexter_regression_test_builder, - dexter_regression_test_flags, + dexter_regression_test_c_builder, + dexter_regression_test_c_flags, ] ) - tools.append(ToolSubst("%dexter_regression_test_build", dexter_regression_test_build)) + dexter_regression_test_cxx_build = " ".join( + [ + dexter_regression_test_cxx_builder, + dexter_regression_test_cxx_flags, + ] + ) + tools.append( + ToolSubst("%dexter_regression_test_c_build", dexter_regression_test_c_build) + ) + tools.append( + ToolSubst("%dexter_regression_test_cxx_build", dexter_regression_test_cxx_build) + ) return dependencies From 50fe5b90e7a17700dd265d3cd917c4332b806152 Mon Sep 17 00:00:00 2001 From: Jake Egan Date: Thu, 3 Apr 2025 10:39:49 -0400 Subject: [PATCH 0534/1029] [sanitizer_common][NFC] Fix sanitizer_symbolizer_libcdep.cpp formatting (#133930) --- .../sanitizer_symbolizer_libcdep.cpp | 86 ++++++++++--------- 1 file changed, 45 insertions(+), 41 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp index 74458028ae8f5..565701c85d978 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp @@ -31,11 +31,12 @@ Symbolizer *Symbolizer::GetOrInit() { const char *ExtractToken(const char *str, const char *delims, char **result) { uptr prefix_len = internal_strcspn(str, delims); - *result = (char*)InternalAlloc(prefix_len + 1); + *result = (char *)InternalAlloc(prefix_len + 1); internal_memcpy(*result, str, prefix_len); (*result)[prefix_len] = '\0'; const char *prefix_end = str + prefix_len; - if (*prefix_end != '\0') prefix_end++; + if (*prefix_end != '\0') + prefix_end++; return prefix_end; } @@ -78,7 +79,8 @@ const char *ExtractTokenUpToDelimiter(const char *str, const char *delimiter, internal_memcpy(*result, str, prefix_len); (*result)[prefix_len] = '\0'; const char *prefix_end = str + prefix_len; - if (*prefix_end != '\0') prefix_end += internal_strlen(delimiter); + if (*prefix_end != '\0') + prefix_end += internal_strlen(delimiter); return prefix_end; } @@ -215,18 +217,20 @@ const LoadedModule *Symbolizer::FindModuleForAddress(uptr address) { modules_were_reloaded = true; } const LoadedModule *module = SearchForModule(modules_, address); - if (module) return module; + if (module) + return module; // dlopen/dlclose interceptors invalidate the module list, but when // interception is disabled, we need to retry if the lookup fails in // case the module list changed. -#if !SANITIZER_INTERCEPT_DLOPEN_DLCLOSE +# if !SANITIZER_INTERCEPT_DLOPEN_DLCLOSE if (!modules_were_reloaded) { RefreshModules(); module = SearchForModule(modules_, address); - if (module) return module; + if (module) + return module; } -#endif +# endif if (fallback_modules_.size()) { module = SearchForModule(fallback_modules_, address); @@ -260,31 +264,31 @@ class LLVMSymbolizerProcess final : public SymbolizerProcess { // script/asan_symbolize.py and sanitizer_common.h. 
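The lit.cfg.py change above replaces the single %dexter_regression_test_build substitution with separate C and C++ variants, each composed from a builder and its flags. A minimal sketch of that composition; the builder and flag values are the non-Windows defaults taken from the patch, while the file names are hypothetical.

# Sketch of how the split build substitutions compose (values from the patch).
c_build = " ".join(["clang", "-O0 -glldb -std=gnu11"])
cxx_build = " ".join(["clang++", "-O0 -glldb -std=gnu++11"])

# A RUN line such as "%dexter_regression_test_cxx_build %s -o %t" therefore
# expands to a single compile-and-link command:
print(cxx_build + " example.cpp -o example.out")
# clang++ -O0 -glldb -std=gnu++11 example.cpp -o example.out
print(c_build + " example.c -o example.out")
# clang -O0 -glldb -std=gnu11 example.c -o example.out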
void GetArgV(const char *path_to_binary, const char *(&argv)[kArgVMax]) const override { -#if defined(__x86_64h__) - const char* const kSymbolizerArch = "--default-arch=x86_64h"; -#elif defined(__x86_64__) - const char* const kSymbolizerArch = "--default-arch=x86_64"; -#elif defined(__i386__) - const char* const kSymbolizerArch = "--default-arch=i386"; -#elif SANITIZER_LOONGARCH64 +# if defined(__x86_64h__) + const char *const kSymbolizerArch = "--default-arch=x86_64h"; +# elif defined(__x86_64__) + const char *const kSymbolizerArch = "--default-arch=x86_64"; +# elif defined(__i386__) + const char *const kSymbolizerArch = "--default-arch=i386"; +# elif SANITIZER_LOONGARCH64 const char *const kSymbolizerArch = "--default-arch=loongarch64"; -#elif SANITIZER_RISCV64 +# elif SANITIZER_RISCV64 const char *const kSymbolizerArch = "--default-arch=riscv64"; -#elif defined(__aarch64__) - const char* const kSymbolizerArch = "--default-arch=arm64"; -#elif defined(__arm__) - const char* const kSymbolizerArch = "--default-arch=arm"; -#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - const char* const kSymbolizerArch = "--default-arch=powerpc64"; -#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - const char* const kSymbolizerArch = "--default-arch=powerpc64le"; -#elif defined(__s390x__) - const char* const kSymbolizerArch = "--default-arch=s390x"; -#elif defined(__s390__) - const char* const kSymbolizerArch = "--default-arch=s390"; -#else - const char* const kSymbolizerArch = "--default-arch=unknown"; -#endif +# elif defined(__aarch64__) + const char *const kSymbolizerArch = "--default-arch=arm64"; +# elif defined(__arm__) + const char *const kSymbolizerArch = "--default-arch=arm"; +# elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + const char *const kSymbolizerArch = "--default-arch=powerpc64"; +# elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + const char *const kSymbolizerArch = "--default-arch=powerpc64le"; +# elif defined(__s390x__) + const char *const kSymbolizerArch = "--default-arch=s390x"; +# elif defined(__s390__) + const char *const kSymbolizerArch = "--default-arch=s390"; +# else + const char *const kSymbolizerArch = "--default-arch=unknown"; +# endif const char *const demangle_flag = common_flags()->demangle ? "--demangle" : "--no-demangle"; @@ -315,7 +319,8 @@ static const char *ParseFileLineInfo(AddressInfo *info, const char *str) { char *back = file_line_info + size - 1; for (int i = 0; i < 2; ++i) { while (back > file_line_info && IsDigit(*back)) --back; - if (*back != ':' || !IsDigit(back[1])) break; + if (*back != ':' || !IsDigit(back[1])) + break; info->column = info->line; info->line = internal_atoll(back + 1); // Truncate the string at the colon to keep only filename. @@ -436,7 +441,7 @@ bool LLVMSymbolizer::SymbolizeData(uptr addr, DataInfo *info) { if (!buf) return false; ParseSymbolizeDataOutput(buf, info); - info->start += (addr - info->module_offset); // Add the base address. + info->start += (addr - info->module_offset); // Add the base address. 
return true; } @@ -459,10 +464,9 @@ const char *LLVMSymbolizer::FormatAndSendCommand(const char *command_prefix, size_needed = internal_snprintf(buffer_, kBufferSize, "%s \"%s\" 0x%zx\n", command_prefix, module_name, module_offset); else - size_needed = internal_snprintf(buffer_, kBufferSize, - "%s \"%s:%s\" 0x%zx\n", command_prefix, - module_name, ModuleArchToString(arch), - module_offset); + size_needed = internal_snprintf( + buffer_, kBufferSize, "%s \"%s:%s\" 0x%zx\n", command_prefix, + module_name, ModuleArchToString(arch), module_offset); if (size_needed >= static_cast(kBufferSize)) { Report("WARNING: Command buffer too small"); @@ -484,9 +488,9 @@ SymbolizerProcess::SymbolizerProcess(const char *path, bool use_posix_spawn) CHECK_NE(path_[0], '\0'); } -static bool IsSameModule(const char* path) { - if (const char* ProcessName = GetProcessName()) { - if (const char* SymbolizerName = StripModuleName(path)) { +static bool IsSameModule(const char *path) { + if (const char *ProcessName = GetProcessName()) { + if (const char *SymbolizerName = StripModuleName(path)) { return !internal_strcmp(ProcessName, SymbolizerName); } } @@ -516,9 +520,9 @@ const char *SymbolizerProcess::SendCommand(const char *command) { const char *SymbolizerProcess::SendCommandImpl(const char *command) { if (input_fd_ == kInvalidFd || output_fd_ == kInvalidFd) - return nullptr; + return nullptr; if (!WriteToSymbolizer(command, internal_strlen(command))) - return nullptr; + return nullptr; if (!ReadFromSymbolizer()) return nullptr; return buffer_.data(); From 51c2750599d2472dafce0231aa8b95f5137c7de8 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 3 Apr 2025 07:40:00 -0700 Subject: [PATCH 0535/1029] [lldb] Update examples in docs/use/python-reference.rst to work with Python 3 (#134204) The examples on this page were using the Python 2-style print. I ran the updated code examples under Python 3 to confirm they are still up-to-date. --- lldb/docs/use/python-reference.rst | 68 ++++++++++++++++-------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/lldb/docs/use/python-reference.rst b/lldb/docs/use/python-reference.rst index 02e09e10c0f6a..4bf0cb075064b 100644 --- a/lldb/docs/use/python-reference.rst +++ b/lldb/docs/use/python-reference.rst @@ -153,16 +153,16 @@ pass them to the Python print function: (lldb) script Python Interactive Interpreter. To exit, type 'quit()', 'exit()' or Ctrl-D. 
- >>> print lldb.debugger + >>> print(lldb.debugger) Debugger (instance: "debugger_1", id: 1) - >>> print lldb.target + >>> print(lldb.target) a.out - >>> print lldb.process - SBProcess: pid = 59289, state = stopped, threads = 1, executable = a.out - >>> print lldb.thread - SBThread: tid = 0x1f03 - >>> print lldb.frame - frame #0: 0x0000000100000bb6 a.out main + 54 at main.c:16 + >>> print(lldb.process) + SBProcess: pid = 58842, state = stopped, threads = 1, executable = a.out + >>> print(lldb.thread) + thread #1: tid = 0x2265ce3, 0x0000000100000334 a.out`main at t.c:2:3, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1 + >>> print(lldb.frame) + frame #0: 0x0000000100000334 a.out`main at t.c:2:3 Running a python script when a breakpoint gets hit @@ -252,7 +252,7 @@ Here is the code: > # Get the name of the function > name = frame.GetFunctionName() > # Print the order and the function name - > print '[%i] %s' % (counter, name) + > print('[%i] %s' % (counter, name)) > # Disable the current breakpoint location so it doesn't get hit again > bp_loc.SetEnabled(False) > # No need to stop here @@ -588,7 +588,7 @@ say .. code-block:: python - print >>result, "my command does lots of cool stuff" + print("my command does lots of cool stuff", file=result) SBCommandReturnObject and SBStream both support this file-like behavior by providing write() and flush() calls at the Python layer. @@ -712,7 +712,7 @@ your lldb.ParsedCommand subclass should implement: """ And to handle the completion of arguments: - + .. code-block:: python def handle_argument_completion(self, args, arg_pos, cursor_pos): @@ -826,7 +826,7 @@ a function that can be used by LLDB's python command code: # And the initialization code to add your commands def __lldb_init_module(debugger, internal_dict): debugger.HandleCommand('command script add -f ls.ls ls') - print 'The "ls" python command has been installed and is ready for use.' + print('The "ls" python command has been installed and is ready for use.') Now we can load the module into LLDB and use it @@ -964,16 +964,18 @@ script that will launch a program from the current working directory called "a.out", set a breakpoint at "main", and then run and hit the breakpoint, and print the process, thread and frame objects if the process stopped: -:: +.. code-block:: python - #!/usr/bin/env python + #!/usr/bin/env python3 import lldb import os + def disassemble_instructions(insts): for i in insts: - print i + print(i) + # Set the path to the executable to debug exe = "./a.out" @@ -983,54 +985,56 @@ print the process, thread and frame objects if the process stopped: # When we step or continue, don't return from the function until the process # stops. Otherwise we would have to handle the process events ourselves which, while doable is - #a little tricky. We do this by setting the async mode to false. - debugger.SetAsync (False) + # a little tricky. We do this by setting the async mode to false. 
+ debugger.SetAsync(False) # Create a target from a file and arch - print "Creating a target for '%s'" % exe + print("Creating a target for '%s'" % exe) - target = debugger.CreateTargetWithFileAndArch (exe, lldb.LLDB_ARCH_DEFAULT) + target = debugger.CreateTargetWithFileAndArch(exe, lldb.LLDB_ARCH_DEFAULT) if target: # If the target is valid set a breakpoint at main - main_bp = target.BreakpointCreateByName ("main", target.GetExecutable().GetFilename()); + main_bp = target.BreakpointCreateByName( + "main", target.GetExecutable().GetFilename() + ) - print main_bp + print(main_bp) # Launch the process. Since we specified synchronous mode, we won't return # from this function until we hit the breakpoint at main - process = target.LaunchSimple (None, None, os.getcwd()) + process = target.LaunchSimple(None, None, os.getcwd()) # Make sure the launch went ok if process: # Print some simple process info - state = process.GetState () - print process + state = process.GetState() + print(process) if state == lldb.eStateStopped: # Get the first thread - thread = process.GetThreadAtIndex (0) + thread = process.GetThreadAtIndex(0) if thread: # Print some simple thread info - print thread + print(thread) # Get the first frame - frame = thread.GetFrameAtIndex (0) + frame = thread.GetFrameAtIndex(0) if frame: # Print some simple frame info - print frame + print(frame) function = frame.GetFunction() # See if we have debug info (a function) if function: # We do have a function, print some info for the function - print function + print(function) # Now get all instructions for this function and print them insts = function.GetInstructions(target) - disassemble_instructions (insts) + disassemble_instructions(insts) else: # See if we have a symbol in the symbol table for where we stopped - symbol = frame.GetSymbol(); + symbol = frame.GetSymbol() if symbol: # We do have a symbol, print some info for the symbol - print symbol + print(symbol) Writing lldb frame recognizers in Python ---------------------------------------- From f59b5b8d597d52336a59d2c0555212242e29a45b Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 3 Apr 2025 15:41:00 +0100 Subject: [PATCH 0536/1029] [MLIR][OpenMP] Fix standalone distribute on the device (#133094) This patch updates the handling of target regions to set trip counts and kernel execution modes properly, based on clang's behavior. This fixes a race condition on `target teams distribute` constructs with no `parallel do` loop inside. This is how kernels are classified, after changes introduced in this patch: ```f90 ! Exec mode: SPMD. ! Trip count: Set. !$omp target teams distribute parallel do do i=... end do ! Exec mode: Generic-SPMD. ! Trip count: Set (outer loop). !$omp target teams distribute do i=... !$omp parallel do private(idx, y) do j=... end do end do ! Exec mode: Generic-SPMD. ! Trip count: Set (outer loop). !$omp target teams distribute do i=... !$omp parallel ... !$omp end parallel end do ! Exec mode: Generic. ! Trip count: Set. !$omp target teams distribute do i=... end do ! Exec mode: SPMD. ! Trip count: Not set. !$omp target parallel do do i=... end do ! Exec mode: Generic. ! Trip count: Not set. !$omp target ... !$omp end target ``` For the split `target teams distribute + parallel do` case, clang produces a Generic kernel which gets promoted to Generic-SPMD by the openmp-opt pass. 
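In terms of the new `TargetRegionFlags` bits, a minimal sketch of how these
properties fold back into the device runtime's execution mode (this restates
what the LLVM IR translation change below does; the free-standing helper, its
name, and the include paths are illustrative assumptions, not part of the
patch):

```cpp
// Sketch only: map the MLIR-level TargetRegionFlags bitmask onto the device
// runtime's kernel execution mode. generic and spmd set together encode the
// Generic-SPMD case; the trip_count bit does not affect the mode, it only
// requests that the trip count be evaluated on the host.
#include "llvm/Frontend/OpenMP/OMPConstants.h"  // exec-mode enum; path assumed
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"  // TargetRegionFlags; path assumed

static llvm::omp::OMPTgtExecModeFlags
execModeFromFlags(mlir::omp::TargetRegionFlags flags) {
  using mlir::omp::TargetRegionFlags;
  using mlir::omp::bitEnumContainsAny;
  if (bitEnumContainsAny(flags, TargetRegionFlags::generic))
    return bitEnumContainsAny(flags, TargetRegionFlags::spmd)
               ? llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD
               : llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
  return llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
}
```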
We can't currently replicate that behavior in flang because our codegen for these constructs results in the introduction of calls to the `kmpc_distribute_static_loop` family of functions, instead of `kmpc_distribute_static_init`, which currently prevent promotion of the kernel to Generic-SPMD. For the time being, instead of relying on the openmp-opt pass, we look at the MLIR representation to find the Generic-SPMD pattern and directly tag the kernel as such during codegen. This is what we were already doing, but incorrectly matching other kinds of kernels as such in the process. --- .../mlir/Dialect/OpenMP/OpenMPEnums.td | 18 ++ mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 2 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 204 +++++++++++------- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 16 +- mlir/test/Dialect/OpenMP/invalid.mlir | 4 +- mlir/test/Dialect/OpenMP/ops.mlir | 17 ++ .../LLVMIR/openmp-target-generic-spmd.mlir | 111 ++++++++++ 7 files changed, 285 insertions(+), 87 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index 690e3df1f685e..9dbe6897a3304 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -222,6 +222,24 @@ def ScheduleModifier : OpenMP_I32EnumAttr< def ScheduleModifierAttr : OpenMP_EnumAttr; +//===----------------------------------------------------------------------===// +// target_region_flags enum. +//===----------------------------------------------------------------------===// + +def TargetRegionFlagsNone : I32BitEnumAttrCaseNone<"none">; +def TargetRegionFlagsGeneric : I32BitEnumAttrCaseBit<"generic", 0>; +def TargetRegionFlagsSpmd : I32BitEnumAttrCaseBit<"spmd", 1>; +def TargetRegionFlagsTripCount : I32BitEnumAttrCaseBit<"trip_count", 2>; + +def TargetRegionFlags : OpenMP_BitEnumAttr< + "TargetRegionFlags", + "target region property flags", [ + TargetRegionFlagsNone, + TargetRegionFlagsGeneric, + TargetRegionFlagsSpmd, + TargetRegionFlagsTripCount + ]>; + //===----------------------------------------------------------------------===// // variable_capture_kind enum. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 65095932be627..11530c0fa3620 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -1312,7 +1312,7 @@ def TargetOp : OpenMP_Op<"target", traits = [ /// /// \param capturedOp result of a still valid (no modifications made to any /// nested operations) previous call to `getInnermostCapturedOmpOp()`. - static llvm::omp::OMPTgtExecModeFlags + static ::mlir::omp::TargetRegionFlags getKernelExecFlags(Operation *capturedOp); }] # clausesExtraClassDeclaration; diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 4ac9f49f12161..ecadf16e1e9f6 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1908,8 +1908,8 @@ LogicalResult TargetOp::verifyRegions() { return emitError("target containing multiple 'omp.teams' nested ops"); // Check that host_eval values are only used in legal ways. 
- llvm::omp::OMPTgtExecModeFlags execFlags = - getKernelExecFlags(getInnermostCapturedOmpOp()); + Operation *capturedOp = getInnermostCapturedOmpOp(); + TargetRegionFlags execFlags = getKernelExecFlags(capturedOp); for (Value hostEvalArg : cast(getOperation()).getHostEvalBlockArgs()) { for (Operation *user : hostEvalArg.getUsers()) { @@ -1924,7 +1924,8 @@ LogicalResult TargetOp::verifyRegions() { "and 'thread_limit' in 'omp.teams'"; } if (auto parallelOp = dyn_cast(user)) { - if (execFlags == llvm::omp::OMP_TGT_EXEC_MODE_SPMD && + if (bitEnumContainsAny(execFlags, TargetRegionFlags::spmd) && + parallelOp->isAncestor(capturedOp) && hostEvalArg == parallelOp.getNumThreads()) continue; @@ -1933,15 +1934,16 @@ LogicalResult TargetOp::verifyRegions() { "'omp.parallel' when representing target SPMD"; } if (auto loopNestOp = dyn_cast(user)) { - if (execFlags != llvm::omp::OMP_TGT_EXEC_MODE_GENERIC && + if (bitEnumContainsAny(execFlags, TargetRegionFlags::trip_count) && + loopNestOp.getOperation() == capturedOp && (llvm::is_contained(loopNestOp.getLoopLowerBounds(), hostEvalArg) || llvm::is_contained(loopNestOp.getLoopUpperBounds(), hostEvalArg) || llvm::is_contained(loopNestOp.getLoopSteps(), hostEvalArg))) continue; return emitOpError() << "host_eval argument only legal as loop bounds " - "and steps in 'omp.loop_nest' when " - "representing target SPMD or Generic-SPMD"; + "and steps in 'omp.loop_nest' when trip count " + "must be evaluated in the host"; } return emitOpError() << "host_eval argument illegal use in '" @@ -1951,33 +1953,12 @@ LogicalResult TargetOp::verifyRegions() { return success(); } -/// Only allow OpenMP terminators and non-OpenMP ops that have known memory -/// effects, but don't include a memory write effect. -static bool siblingAllowedInCapture(Operation *op) { - if (!op) - return false; +static Operation * +findCapturedOmpOp(Operation *rootOp, bool checkSingleMandatoryExec, + llvm::function_ref siblingAllowedFn) { + assert(rootOp && "expected valid operation"); - bool isOmpDialect = - op->getContext()->getLoadedDialect() == - op->getDialect(); - - if (isOmpDialect) - return op->hasTrait(); - - if (auto memOp = dyn_cast(op)) { - SmallVector, 4> effects; - memOp.getEffects(effects); - return !llvm::any_of(effects, [&](MemoryEffects::EffectInstance &effect) { - return isa(effect.getEffect()) && - isa( - effect.getResource()); - }); - } - return true; -} - -Operation *TargetOp::getInnermostCapturedOmpOp() { - Dialect *ompDialect = (*this)->getDialect(); + Dialect *ompDialect = rootOp->getDialect(); Operation *capturedOp = nullptr; DominanceInfo domInfo; @@ -1985,8 +1966,8 @@ Operation *TargetOp::getInnermostCapturedOmpOp() { // ensuring we only enter the region of an operation if it meets the criteria // for being captured. We stop the exploration of nested operations as soon as // we process a region holding no operations to be captured. - walk([&](Operation *op) { - if (op == *this) + rootOp->walk([&](Operation *op) { + if (op == rootOp) return WalkResult::advance(); // Ignore operations of other dialects or omp operations with no regions, @@ -2001,22 +1982,24 @@ Operation *TargetOp::getInnermostCapturedOmpOp() { // (i.e. its block's successors can reach it) or if it's not guaranteed to // be executed before all exits of the region (i.e. it doesn't dominate all // blocks with no successors reachable from the entry block). 
- Region *parentRegion = op->getParentRegion(); - Block *parentBlock = op->getBlock(); - - for (Block *successor : parentBlock->getSuccessors()) - if (successor->isReachable(parentBlock)) - return WalkResult::interrupt(); - - for (Block &block : *parentRegion) - if (domInfo.isReachableFromEntry(&block) && block.hasNoSuccessors() && - !domInfo.dominates(parentBlock, &block)) - return WalkResult::interrupt(); + if (checkSingleMandatoryExec) { + Region *parentRegion = op->getParentRegion(); + Block *parentBlock = op->getBlock(); + + for (Block *successor : parentBlock->getSuccessors()) + if (successor->isReachable(parentBlock)) + return WalkResult::interrupt(); + + for (Block &block : *parentRegion) + if (domInfo.isReachableFromEntry(&block) && block.hasNoSuccessors() && + !domInfo.dominates(parentBlock, &block)) + return WalkResult::interrupt(); + } // Don't capture this op if it has a not-allowed sibling, and stop recursing // into nested operations. for (Operation &sibling : op->getParentRegion()->getOps()) - if (&sibling != op && !siblingAllowedInCapture(&sibling)) + if (&sibling != op && !siblingAllowedFn(&sibling)) return WalkResult::interrupt(); // Don't continue capturing nested operations if we reach an omp.loop_nest. @@ -2029,10 +2012,35 @@ Operation *TargetOp::getInnermostCapturedOmpOp() { return capturedOp; } -llvm::omp::OMPTgtExecModeFlags -TargetOp::getKernelExecFlags(Operation *capturedOp) { - using namespace llvm::omp; +Operation *TargetOp::getInnermostCapturedOmpOp() { + auto *ompDialect = getContext()->getLoadedDialect(); + + // Only allow OpenMP terminators and non-OpenMP ops that have known memory + // effects, but don't include a memory write effect. + return findCapturedOmpOp( + *this, /*checkSingleMandatoryExec=*/true, [&](Operation *sibling) { + if (!sibling) + return false; + + if (ompDialect == sibling->getDialect()) + return sibling->hasTrait(); + + if (auto memOp = dyn_cast(sibling)) { + SmallVector, 4> + effects; + memOp.getEffects(effects); + return !llvm::any_of( + effects, [&](MemoryEffects::EffectInstance &effect) { + return isa(effect.getEffect()) && + isa( + effect.getResource()); + }); + } + return true; + }); +} +TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { // A non-null captured op is only valid if it resides inside of a TargetOp // and is the result of calling getInnermostCapturedOmpOp() on it. TargetOp targetOp = @@ -2041,60 +2049,94 @@ TargetOp::getKernelExecFlags(Operation *capturedOp) { (targetOp && targetOp.getInnermostCapturedOmpOp() == capturedOp)) && "unexpected captured op"); - // Make sure this region is capturing a loop. Otherwise, it's a generic - // kernel. + // If it's not capturing a loop, it's a default target region. if (!isa_and_present(capturedOp)) - return OMP_TGT_EXEC_MODE_GENERIC; + return TargetRegionFlags::generic; - SmallVector wrappers; - cast(capturedOp).gatherWrappers(wrappers); - assert(!wrappers.empty()); + // Get the innermost non-simd loop wrapper. + SmallVector loopWrappers; + cast(capturedOp).gatherWrappers(loopWrappers); + assert(!loopWrappers.empty()); - // Ignore optional SIMD leaf construct. - auto *innermostWrapper = wrappers.begin(); + LoopWrapperInterface *innermostWrapper = loopWrappers.begin(); if (isa(innermostWrapper)) innermostWrapper = std::next(innermostWrapper); - long numWrappers = std::distance(innermostWrapper, wrappers.end()); - - // Detect Generic-SPMD: target-teams-distribute[-simd]. - // Detect SPMD: target-teams-loop. 
- if (numWrappers == 1) { - if (!isa(innermostWrapper)) - return OMP_TGT_EXEC_MODE_GENERIC; - - Operation *teamsOp = (*innermostWrapper)->getParentOp(); - if (!isa_and_present(teamsOp)) - return OMP_TGT_EXEC_MODE_GENERIC; + auto numWrappers = std::distance(innermostWrapper, loopWrappers.end()); + if (numWrappers != 1 && numWrappers != 2) + return TargetRegionFlags::generic; - if (teamsOp->getParentOp() == targetOp.getOperation()) - return isa(innermostWrapper) - ? OMP_TGT_EXEC_MODE_GENERIC_SPMD - : OMP_TGT_EXEC_MODE_SPMD; - } - - // Detect SPMD: target-teams-distribute-parallel-wsloop[-simd]. + // Detect target-teams-distribute-parallel-wsloop[-simd]. if (numWrappers == 2) { if (!isa(innermostWrapper)) - return OMP_TGT_EXEC_MODE_GENERIC; + return TargetRegionFlags::generic; innermostWrapper = std::next(innermostWrapper); if (!isa(innermostWrapper)) - return OMP_TGT_EXEC_MODE_GENERIC; + return TargetRegionFlags::generic; Operation *parallelOp = (*innermostWrapper)->getParentOp(); if (!isa_and_present(parallelOp)) - return OMP_TGT_EXEC_MODE_GENERIC; + return TargetRegionFlags::generic; Operation *teamsOp = parallelOp->getParentOp(); if (!isa_and_present(teamsOp)) - return OMP_TGT_EXEC_MODE_GENERIC; + return TargetRegionFlags::generic; if (teamsOp->getParentOp() == targetOp.getOperation()) - return OMP_TGT_EXEC_MODE_SPMD; + return TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + } + // Detect target-teams-distribute[-simd] and target-teams-loop. + else if (isa(innermostWrapper)) { + Operation *teamsOp = (*innermostWrapper)->getParentOp(); + if (!isa_and_present(teamsOp)) + return TargetRegionFlags::generic; + + if (teamsOp->getParentOp() != targetOp.getOperation()) + return TargetRegionFlags::generic; + + if (isa(innermostWrapper)) + return TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + + // Find single immediately nested captured omp.parallel and add spmd flag + // (generic-spmd case). + // + // TODO: This shouldn't have to be done here, as it is too easy to break. + // The openmp-opt pass should be updated to be able to promote kernels like + // this from "Generic" to "Generic-SPMD". However, the use of the + // `kmpc_distribute_static_loop` family of functions produced by the + // OMPIRBuilder for these kernels prevents that from working. + Dialect *ompDialect = targetOp->getDialect(); + Operation *nestedCapture = findCapturedOmpOp( + capturedOp, /*checkSingleMandatoryExec=*/false, + [&](Operation *sibling) { + return sibling && (ompDialect != sibling->getDialect() || + sibling->hasTrait()); + }); + + TargetRegionFlags result = + TargetRegionFlags::generic | TargetRegionFlags::trip_count; + + if (!nestedCapture) + return result; + + while (nestedCapture->getParentOp() != capturedOp) + nestedCapture = nestedCapture->getParentOp(); + + return isa(nestedCapture) ? result | TargetRegionFlags::spmd + : result; + } + // Detect target-parallel-wsloop[-simd]. 
+ else if (isa(innermostWrapper)) { + Operation *parallelOp = (*innermostWrapper)->getParentOp(); + if (!isa_and_present(parallelOp)) + return TargetRegionFlags::generic; + + if (parallelOp->getParentOp() == targetOp.getOperation()) + return TargetRegionFlags::spmd; } - return OMP_TGT_EXEC_MODE_GENERIC; + return TargetRegionFlags::generic; } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index d41489921bd13..4d610d6e2656d 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -4646,7 +4646,17 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, combinedMaxThreadsVal = maxThreadsVal; // Update kernel bounds structure for the `OpenMPIRBuilder` to use. - attrs.ExecFlags = targetOp.getKernelExecFlags(capturedOp); + omp::TargetRegionFlags kernelFlags = targetOp.getKernelExecFlags(capturedOp); + assert( + omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic | + omp::TargetRegionFlags::spmd) && + "invalid kernel flags"); + attrs.ExecFlags = + omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic) + ? omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::spmd) + ? llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD + : llvm::omp::OMP_TGT_EXEC_MODE_GENERIC + : llvm::omp::OMP_TGT_EXEC_MODE_SPMD; attrs.MinTeams = minTeamsVal; attrs.MaxTeams.front() = maxTeamsVal; attrs.MinThreads = 1; @@ -4691,8 +4701,8 @@ initTargetRuntimeAttrs(llvm::IRBuilderBase &builder, if (numThreads) attrs.MaxThreads = moduleTranslation.lookupValue(numThreads); - if (targetOp.getKernelExecFlags(capturedOp) != - llvm::omp::OMP_TGT_EXEC_MODE_GENERIC) { + if (omp::bitEnumContainsAny(targetOp.getKernelExecFlags(capturedOp), + omp::TargetRegionFlags::trip_count)) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); attrs.LoopTripCount = nullptr; diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 403128bb2300e..bd0541987339a 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -2320,7 +2320,7 @@ func.func @omp_target_host_eval_parallel(%x : i32) { // ----- func.func @omp_target_host_eval_loop1(%x : i32) { - // expected-error @below {{op host_eval argument only legal as loop bounds and steps in 'omp.loop_nest' when representing target SPMD or Generic-SPMD}} + // expected-error @below {{op host_eval argument only legal as loop bounds and steps in 'omp.loop_nest' when trip count must be evaluated in the host}} omp.target host_eval(%x -> %arg0 : i32) { omp.wsloop { omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { @@ -2335,7 +2335,7 @@ func.func @omp_target_host_eval_loop1(%x : i32) { // ----- func.func @omp_target_host_eval_loop2(%x : i32) { - // expected-error @below {{op host_eval argument only legal as loop bounds and steps in 'omp.loop_nest' when representing target SPMD or Generic-SPMD}} + // expected-error @below {{op host_eval argument only legal as loop bounds and steps in 'omp.loop_nest' when trip count must be evaluated in the host}} omp.target host_eval(%x -> %arg0 : i32) { omp.teams { ^bb0: diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 0a10626cd4877..6bc2500471997 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ 
b/mlir/test/Dialect/OpenMP/ops.mlir @@ -2864,6 +2864,23 @@ func.func @omp_target_host_eval(%x : i32) { omp.terminator } + // CHECK: omp.target host_eval(%{{.*}} -> %[[HOST_ARG:.*]] : i32) { + // CHECK: omp.parallel num_threads(%[[HOST_ARG]] : i32) { + // CHECK: omp.wsloop { + // CHECK: omp.loop_nest + omp.target host_eval(%x -> %arg0 : i32) { + %y = arith.constant 2 : i32 + omp.parallel num_threads(%arg0 : i32) { + omp.wsloop { + omp.loop_nest (%iv) : i32 = (%y) to (%y) step (%y) { + omp.yield + } + } + omp.terminator + } + omp.terminator + } + // CHECK: omp.target host_eval(%{{.*}} -> %[[HOST_ARG:.*]] : i32) { // CHECK: omp.teams { // CHECK: omp.distribute { diff --git a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir new file mode 100644 index 0000000000000..8101660e571e4 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir @@ -0,0 +1,111 @@ +// RUN: split-file %s %t +// RUN: mlir-translate -mlir-to-llvmir %t/host.mlir | FileCheck %s --check-prefix=HOST +// RUN: mlir-translate -mlir-to-llvmir %t/device.mlir | FileCheck %s --check-prefix=DEVICE + +//--- host.mlir + +module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} { + llvm.func @main(%arg0 : !llvm.ptr) { + %x = llvm.load %arg0 : !llvm.ptr -> i32 + %0 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr + omp.target host_eval(%x -> %lb, %x -> %ub, %x -> %step : i32, i32, i32) map_entries(%0 -> %ptr : !llvm.ptr) { + %x.map = llvm.load %ptr : !llvm.ptr -> i32 + omp.teams { + omp.distribute { + omp.loop_nest (%iv1) : i32 = (%lb) to (%ub) step (%step) { + omp.parallel { + omp.wsloop { + omp.loop_nest (%iv2) : i32 = (%x.map) to (%x.map) step (%x.map) { + omp.yield + } + } + omp.terminator + } + omp.yield + } + } + omp.terminator + } + omp.terminator + } + llvm.return + } +} + +// HOST-LABEL: define void @main +// HOST: %omp_loop.tripcount = {{.*}} +// HOST-NEXT: br label %[[ENTRY:.*]] +// HOST: [[ENTRY]]: +// HOST: %[[TRIPCOUNT:.*]] = zext i32 %omp_loop.tripcount to i64 +// HOST: %[[TRIPCOUNT_KARG:.*]] = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %[[KARGS:.*]], i32 0, i32 8 +// HOST-NEXT: store i64 %[[TRIPCOUNT]], ptr %[[TRIPCOUNT_KARG]] +// HOST: %[[RESULT:.*]] = call i32 @__tgt_target_kernel({{.*}}, ptr %[[KARGS]]) +// HOST-NEXT: %[[CMP:.*]] = icmp ne i32 %[[RESULT]], 0 +// HOST-NEXT: br i1 %[[CMP]], label %[[OFFLOAD_FAILED:.*]], label %{{.*}} +// HOST: [[OFFLOAD_FAILED]]: +// HOST: call void @[[TARGET_OUTLINE:.*]]({{.*}}) + +// HOST: define internal void @[[TARGET_OUTLINE]] +// HOST: call void{{.*}}@__kmpc_fork_teams({{.*}}, ptr @[[TEAMS_OUTLINE:.*]], {{.*}}) + +// HOST: define internal void @[[TEAMS_OUTLINE]] +// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}}) + +// HOST: define internal void @[[DISTRIBUTE_OUTLINE]] +// HOST: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 92, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}}) +// HOST: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}})
+
+// HOST: define internal void @[[PARALLEL_OUTLINE]]
+// HOST: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 34, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}})
+
+//--- device.mlir
+
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true, omp.is_gpu = true} {
+  llvm.func @main(%arg0 : !llvm.ptr) {
+    %0 = omp.map.info var_ptr(%arg0 : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
+    omp.target map_entries(%0 -> %ptr : !llvm.ptr) {
+      %x = llvm.load %ptr : !llvm.ptr -> i32
+      omp.teams {
+        omp.distribute {
+          omp.loop_nest (%iv1) : i32 = (%x) to (%x) step (%x) {
+            omp.parallel {
+              omp.wsloop {
+                omp.loop_nest (%iv2) : i32 = (%x) to (%x) step (%x) {
+                  omp.yield
+                }
+              }
+              omp.terminator
+            }
+            omp.yield
+          }
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// DEVICE: @[[KERNEL_NAME:.*]]_exec_mode = weak protected constant i8 [[EXEC_MODE:3]]
+// DEVICE: @llvm.compiler.used = appending global [1 x ptr] [ptr @[[KERNEL_NAME]]_exec_mode], section "llvm.metadata"
+// DEVICE: @[[KERNEL_NAME]]_kernel_environment = weak_odr protected constant %struct.KernelEnvironmentTy {
+// DEVICE-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE]], {{.*}}},
+// DEVICE-SAME: ptr @{{.*}}, ptr @{{.*}} }
+
+// DEVICE: define weak_odr protected amdgpu_kernel void @[[KERNEL_NAME]]({{.*}})
+// DEVICE: %{{.*}} = call i32 @__kmpc_target_init(ptr @[[KERNEL_NAME]]_kernel_environment, {{.*}})
+// DEVICE: call void @[[TARGET_OUTLINE:.*]]({{.*}})
+// DEVICE: call void @__kmpc_target_deinit()
+
+// DEVICE: define internal void @[[TARGET_OUTLINE]]({{.*}})
+// DEVICE: call void @[[TEAMS_OUTLINE:.*]]({{.*}})
+
+// DEVICE: define internal void @[[TEAMS_OUTLINE]]({{.*}})
+// DEVICE: call void @__kmpc_distribute_static_loop{{.*}}({{.*}}, ptr @[[DISTRIBUTE_OUTLINE:[^,]*]], {{.*}})
+
+// DEVICE: define internal void @[[DISTRIBUTE_OUTLINE]]({{.*}})
+// DEVICE: call void @__kmpc_parallel_51(ptr {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], ptr {{.*}}, ptr {{.*}}, i64 {{.*}})
+
+// DEVICE: define internal void @[[PARALLEL_OUTLINE]]({{.*}})
+// DEVICE: call void @__kmpc_for_static_loop{{.*}}({{.*}})

From c14b6e90bd140c2290258fa9dbe0fc1ad8939111 Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan
Date: Thu, 3 Apr 2025 07:41:29 -0700
Subject: [PATCH 0537/1029] [lldb][NFC] Move ShouldShow/ShouldSelect logic into StopInfo (#134160)

This NFC patch simplifies the main loop in HandleProcessStateChangedEvent
by moving duplicated code into the StopInfo class, also allowing StopInfo
subclasses to override behavior.

More specifically, two functions are created:

* ShouldShow: should a Thread with such StopInfo be printed when the
  debugger stops? Currently, no StopInfo subclasses override this, but a
  subsequent patch will fix a bug by making StopInfoBreakpoint check
  whether the breakpoint is internal.

* ShouldSelect: should a Thread with such a StopInfo be selected? This is
  currently overridden by StopInfoUnixSignal but will, in the future, be
  overridden by StopInfoBreakpoint.
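As a sketch of the intended usage (mirroring the rewritten selection loop in
Process.cpp below; the free-standing helper and its name are illustrative
assumptions, not part of the patch):

```cpp
// Sketch only: choose which thread to select after a stop by delegating
// the policy to each thread's StopInfo instead of switching over every
// StopReason at the call site.
#include "lldb/Target/StopInfo.h"
#include "lldb/Target/Thread.h"
#include "lldb/Target/ThreadList.h"

static lldb::ThreadSP
SelectThreadAfterStop(lldb_private::ThreadList &thread_list) {
  lldb::ThreadSP plan_thread, other_thread;
  for (const lldb::ThreadSP &thread : thread_list.Threads()) {
    lldb::StopInfoSP stop_info = thread->GetStopInfo();
    if (!stop_info || !stop_info->ShouldSelect())
      continue; // e.g. StopInfoUnixSignal for a signal we don't stop on
    if (stop_info->GetStopReason() == lldb::eStopReasonPlanComplete) {
      if (!plan_thread)
        plan_thread = thread; // a completed plan takes precedence
    } else if (!other_thread) {
      other_thread = thread;
    }
  }
  return plan_thread ? plan_thread : other_thread;
}
```

Keeping the policy virtual on StopInfo lets subclasses opt in or out without
the caller enumerating stop reasons.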
--- lldb/include/lldb/Target/StopInfo.h | 13 +++++ lldb/source/Target/Process.cpp | 77 ++++++----------------------- lldb/source/Target/StopInfo.cpp | 15 +++--- 3 files changed, 36 insertions(+), 69 deletions(-) diff --git a/lldb/include/lldb/Target/StopInfo.h b/lldb/include/lldb/Target/StopInfo.h index 9a13371708be5..368ec51d81891 100644 --- a/lldb/include/lldb/Target/StopInfo.h +++ b/lldb/include/lldb/Target/StopInfo.h @@ -118,6 +118,19 @@ class StopInfo : public std::enable_shared_from_this { StructuredData::ObjectSP GetExtendedInfo() { return m_extended_info; } + /// Returns true if this is a stop reason that should be shown to a user when + /// viewing the thread with this stop info. + virtual bool ShouldShow() const { return IsValid(); } + + /// Returns true if this is a stop reason that should cause a thread to be + /// selected when stopping. + virtual bool ShouldSelect() const { + lldb::StopReason reason = GetStopReason(); + return reason != lldb::eStopReasonNone && + reason != lldb::eStopReasonHistoryBoundary && + reason != lldb::eStopReasonInvalid; + } + static lldb::StopInfoSP CreateStopReasonWithBreakpointSiteID(Thread &thread, lldb::break_id_t break_id); diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 2adda309dea9c..0b7ba343c11f2 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -808,30 +808,11 @@ bool Process::HandleProcessStateChangedEvent( std::lock_guard guard(thread_list.GetMutex()); ThreadSP curr_thread(thread_list.GetSelectedThread()); - ThreadSP thread; - StopReason curr_thread_stop_reason = eStopReasonInvalid; - bool prefer_curr_thread = false; - if (curr_thread && curr_thread->IsValid()) { - curr_thread_stop_reason = curr_thread->GetStopReason(); - switch (curr_thread_stop_reason) { - case eStopReasonNone: - case eStopReasonInvalid: - // Don't prefer the current thread if it didn't stop for a reason. - break; - case eStopReasonSignal: { - // We need to do the same computation we do for other threads - // below in case the current thread happens to be the one that - // stopped for the no-stop signal. - uint64_t signo = curr_thread->GetStopInfo()->GetValue(); - if (process_sp->GetUnixSignals()->GetShouldStop(signo)) - prefer_curr_thread = true; - } break; - default: - prefer_curr_thread = true; - break; - } + + if (curr_thread && curr_thread->IsValid()) curr_thread_stop_info_sp = curr_thread->GetStopInfo(); - } + bool prefer_curr_thread = curr_thread_stop_info_sp && + curr_thread_stop_info_sp->ShouldSelect(); if (!prefer_curr_thread) { // Prefer a thread that has just completed its plan over another @@ -839,47 +820,16 @@ bool Process::HandleProcessStateChangedEvent( ThreadSP plan_thread; ThreadSP other_thread; - const size_t num_threads = thread_list.GetSize(); - size_t i; - for (i = 0; i < num_threads; ++i) { - thread = thread_list.GetThreadAtIndex(i); - StopReason thread_stop_reason = thread->GetStopReason(); - switch (thread_stop_reason) { - case eStopReasonInvalid: - case eStopReasonNone: - case eStopReasonHistoryBoundary: - break; - - case eStopReasonSignal: { - // Don't select a signal thread if we weren't going to stop at - // that signal. We have to have had another reason for stopping - // here, and the user doesn't want to see this thread. 
- uint64_t signo = thread->GetStopInfo()->GetValue(); - if (process_sp->GetUnixSignals()->GetShouldStop(signo)) { - if (!other_thread) - other_thread = thread; - } - break; - } - case eStopReasonTrace: - case eStopReasonBreakpoint: - case eStopReasonWatchpoint: - case eStopReasonException: - case eStopReasonExec: - case eStopReasonFork: - case eStopReasonVFork: - case eStopReasonVForkDone: - case eStopReasonThreadExiting: - case eStopReasonInstrumentation: - case eStopReasonProcessorTrace: - case eStopReasonInterrupt: - if (!other_thread) - other_thread = thread; - break; - case eStopReasonPlanComplete: + for (ThreadSP thread : thread_list.Threads()) { + StopInfoSP stop_info = thread->GetStopInfo(); + if (!stop_info || !stop_info->ShouldSelect()) + continue; + StopReason thread_stop_reason = stop_info->GetStopReason(); + if (thread_stop_reason == eStopReasonPlanComplete) { if (!plan_thread) plan_thread = thread; - break; + } else if (!other_thread) { + other_thread = thread; } } if (plan_thread) @@ -887,6 +837,7 @@ bool Process::HandleProcessStateChangedEvent( else if (other_thread) thread_list.SetSelectedThreadByID(other_thread->GetID()); else { + ThreadSP thread; if (curr_thread && curr_thread->IsValid()) thread = curr_thread; else @@ -5832,7 +5783,7 @@ size_t Process::GetThreadStatus(Stream &strm, if (thread_sp) { if (only_threads_with_stop_reason) { StopInfoSP stop_info_sp = thread_sp->GetStopInfo(); - if (!stop_info_sp || !stop_info_sp->IsValid()) + if (!stop_info_sp || !stop_info_sp->ShouldShow()) continue; } thread_sp->GetStatus(strm, start_frame, num_frames, diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index e9e3603e55316..f1272a723a8cb 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -1080,12 +1080,7 @@ class StopInfoUnixSignal : public StopInfo { return false; } - bool ShouldStop(Event *event_ptr) override { - ThreadSP thread_sp(m_thread_wp.lock()); - if (thread_sp) - return thread_sp->GetProcess()->GetUnixSignals()->GetShouldStop(m_value); - return false; - } + bool ShouldStop(Event *event_ptr) override { return IsShouldStopSignal(); } // If should stop returns false, check if we should notify of this event bool DoShouldNotify(Event *event_ptr) override { @@ -1137,9 +1132,17 @@ class StopInfoUnixSignal : public StopInfo { return m_description.c_str(); } + bool ShouldSelect() const override { return IsShouldStopSignal(); } + private: // In siginfo_t terms, if m_value is si_signo, m_code is si_code. 
std::optional m_code; + + bool IsShouldStopSignal() const { + if (ThreadSP thread_sp = m_thread_wp.lock()) + return thread_sp->GetProcess()->GetUnixSignals()->GetShouldStop(m_value); + return false; + } }; // StopInfoInterrupt From 61ef28650626dd4f651a250005c77255d8086f69 Mon Sep 17 00:00:00 2001 From: gbMattN Date: Thu, 3 Apr 2025 15:56:33 +0100 Subject: [PATCH 0538/1029] Fix signed/unsigned mismatch warning (#134255) --- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index fcac686b4cd10..a4c6c61e57998 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -3447,7 +3447,8 @@ static StringRef getAllocaName(AllocaInst *AI) { if (!isa(Annotation)) continue; auto AnnotationTuple = cast(Annotation); - for (int Index = 0; Index < AnnotationTuple->getNumOperands(); Index++) { + for (unsigned Index = 0; Index < AnnotationTuple->getNumOperands(); + Index++) { // All annotations are strings auto MetadataString = cast(AnnotationTuple->getOperand(Index)); From 2334fd2ea3a0a391ca88da36764d895d9c34e1bc Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 3 Apr 2025 16:05:42 +0100 Subject: [PATCH 0539/1029] [Dexter] Update Dexter tests to use new dexter test substitutions Following commit b8fc288, which changed some dexter test substitutions to be specific to C and C++, some tests that had been added since the original patch was written were still using the old substitution; this patch updates them to use the new. --- .../perfect/dex_declare_file/precompiled_binary/commands.dex | 2 +- .../perfect/dex_declare_file/windows_noncanonical_path/test.dex | 2 +- .../feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/precompiled_binary/commands.dex b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/precompiled_binary/commands.dex index 99b0a50d31b3b..970106cc6fc25 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/precompiled_binary/commands.dex +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/precompiled_binary/commands.dex @@ -4,7 +4,7 @@ # # UNSUPPORTED: system-darwin # -# RUN: %dexter_regression_test_build %S/test.cpp -o %t +# RUN: %dexter_regression_test_cxx_build %S/test.cpp -o %t # RUN: %dexter_regression_base --binary %t %s | FileCheck %s # CHECK: commands.dex # diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/windows_noncanonical_path/test.dex b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/windows_noncanonical_path/test.dex index ec48bc365441d..b96562e59060a 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/windows_noncanonical_path/test.dex +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/commands/perfect/dex_declare_file/windows_noncanonical_path/test.dex @@ -3,7 +3,7 @@ # # REQUIRES: system-windows # -# RUN: %dexter_regression_test_build "%S/source/test file.cpp" -o %t +# RUN: %dexter_regression_test_cxx_build "%S/source/test file.cpp" -o %t # RUN: 
%dexter_regression_base --binary %t %s | FileCheck %s # CHECK: test.dex # diff --git a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp index e3f08af204e76..40cc1581f85fa 100644 --- a/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp +++ b/cross-project-tests/debuginfo-tests/dexter/feature_tests/subtools/test/err_syntax_dexdeclarefile.cpp @@ -2,7 +2,7 @@ // Check that Dexter command syntax errors associate with the line and file // they appeared in rather than the current declared file. // -// RUN: %dexter_regression_test_build %s -o %t +// RUN: %dexter_regression_test_cxx_build %s -o %t // RUN: not %dexter_base test --binary %t --debugger 'lldb' -v -- %s \ // RUN: | FileCheck %s --implicit-check-not=FAIL-FILENAME-MATCH From f23bb530cfad1ca72812d0daf599834ea2291219 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Thu, 3 Apr 2025 10:12:18 -0500 Subject: [PATCH 0540/1029] [AMDGPULowerBufferFatPointers] Use InstSimplifyFolder during rewrites (#134137) This PR updates AMDGPULowerBufferFatPointers to use the InstSimplifyFolder when creating IR during buffer fat pointer lowering. This shouldn't cause any large functional changes and might improve the quality of the generated code. --- .../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 54 ++++++++------- ...tor-non-integral-address-spaces-vectors.ll | 2 - .../AMDGPU/lower-buffer-fat-pointers-calls.ll | 10 ++- ...ffer-fat-pointers-contents-legalization.ll | 5 +- .../lower-buffer-fat-pointers-mem-transfer.ll | 12 ++-- .../lower-buffer-fat-pointers-p7-in-memory.ll | 60 +++-------------- .../lower-buffer-fat-pointers-pointer-ops.ll | 65 +++++++------------ ...fer-fat-pointers-unoptimized-debug-data.ll | 42 +++++------- 8 files changed, 92 insertions(+), 158 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 5dd1fe14e5626..e6250ddf2c26b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -224,7 +224,7 @@ #include "SIDefines.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstSimplifyFolder.h" #include "llvm/Analysis/Utils/Local.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/AttributeMask.h" @@ -445,7 +445,7 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor ValueToValueMapTy ConvertedForStore; - IRBuilder<> IRB; + IRBuilder IRB; const TargetMachine *TM; @@ -459,9 +459,10 @@ class StoreFatPtrsAsIntsAndExpandMemcpyVisitor public: StoreFatPtrsAsIntsAndExpandMemcpyVisitor(BufferFatPtrToIntTypeMap *TypeMap, + const DataLayout &DL, LLVMContext &Ctx, const TargetMachine *TM) - : TypeMap(TypeMap), IRB(Ctx), TM(TM) {} + : TypeMap(TypeMap), IRB(Ctx, InstSimplifyFolder(DL)), TM(TM) {} bool processFunction(Function &F); bool visitInstruction(Instruction &I) { return false; } @@ -683,7 +684,7 @@ class LegalizeBufferContentTypesVisitor : public InstVisitor { friend class InstVisitor; - IRBuilder<> IRB; + IRBuilder IRB; const DataLayout &DL; @@ -743,7 +744,7 @@ class LegalizeBufferContentTypesVisitor public: LegalizeBufferContentTypesVisitor(const DataLayout &DL, LLVMContext &Ctx) - : IRB(Ctx), DL(DL) {} + : IRB(Ctx, InstSimplifyFolder(DL)), DL(DL) {} bool processFunction(Function 
&F); }; } // namespace @@ -1326,7 +1327,7 @@ class SplitPtrStructs : public InstVisitor { const TargetMachine *TM; const GCNSubtarget *ST = nullptr; - IRBuilder<> IRB; + IRBuilder IRB; // Copy metadata between instructions if applicable. void copyMetadata(Value *Dest, Value *Src); @@ -1363,8 +1364,9 @@ class SplitPtrStructs : public InstVisitor { bool IsVolatile, SyncScope::ID SSID); public: - SplitPtrStructs(LLVMContext &Ctx, const TargetMachine *TM) - : TM(TM), IRB(Ctx) {} + SplitPtrStructs(const DataLayout &DL, LLVMContext &Ctx, + const TargetMachine *TM) + : TM(TM), IRB(Ctx, InstSimplifyFolder(DL)) {} void processFunction(Function &F); @@ -1415,7 +1417,7 @@ PtrParts SplitPtrStructs::getPtrParts(Value *V) { return {*RsrcEntry = Rsrc, *OffEntry = Off}; } - IRBuilder<>::InsertPointGuard Guard(IRB); + IRBuilder::InsertPointGuard Guard(IRB); if (auto *I = dyn_cast(V)) { LLVM_DEBUG(dbgs() << "Recursing to split parts of " << *I << "\n"); auto [Rsrc, Off] = visit(*I); @@ -1479,7 +1481,7 @@ void SplitPtrStructs::getPossibleRsrcRoots(Instruction *I, } void SplitPtrStructs::processConditionals() { - SmallDenseMap FoundRsrcs; + SmallDenseMap FoundRsrcs; SmallPtrSet Roots; SmallPtrSet Seen; for (Instruction *I : Conditionals) { @@ -1493,7 +1495,7 @@ void SplitPtrStructs::processConditionals() { if (MaybeFoundRsrc != FoundRsrcs.end()) { MaybeRsrc = MaybeFoundRsrc->second; } else { - IRBuilder<>::InsertPointGuard Guard(IRB); + IRBuilder::InsertPointGuard Guard(IRB); Roots.clear(); Seen.clear(); getPossibleRsrcRoots(I, Roots, Seen); @@ -1558,21 +1560,29 @@ void SplitPtrStructs::processConditionals() { // to put the corrections maps in an inconstent state. That'll be handed // during the rest of the killing. Also, `ValueToValueMapTy` guarantees // that references in that map will be updated as well. - ConditionalTemps.push_back(cast(Rsrc)); - ConditionalTemps.push_back(cast(Off)); - Rsrc->replaceAllUsesWith(NewRsrc); - Off->replaceAllUsesWith(NewOff); + // Note that if the temporary instruction got `InstSimplify`'d away, it + // might be something like a block argument. + if (auto *RsrcInst = dyn_cast(Rsrc)) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(NewRsrc); + } + if (auto *OffInst = dyn_cast(Off)) { + ConditionalTemps.push_back(OffInst); + OffInst->replaceAllUsesWith(NewOff); + } // Save on recomputing the cycle traversals in known-root cases. 
if (MaybeRsrc) for (Value *V : Seen) - FoundRsrcs[cast(V)] = NewRsrc; + FoundRsrcs[V] = NewRsrc; } else if (isa(I)) { if (MaybeRsrc) { - ConditionalTemps.push_back(cast(Rsrc)); - Rsrc->replaceAllUsesWith(*MaybeRsrc); + if (auto *RsrcInst = dyn_cast(Rsrc)) { + ConditionalTemps.push_back(RsrcInst); + RsrcInst->replaceAllUsesWith(*MaybeRsrc); + } for (Value *V : Seen) - FoundRsrcs[cast(V)] = *MaybeRsrc; + FoundRsrcs[V] = *MaybeRsrc; } } else { llvm_unreachable("Only PHIs and selects go in the conditionals list"); @@ -2426,8 +2436,8 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { /*RemoveDeadConstants=*/false, /*IncludeSelf=*/true); } - StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, M.getContext(), - &TM); + StoreFatPtrsAsIntsAndExpandMemcpyVisitor MemOpsRewrite(&IntTM, DL, + M.getContext(), &TM); LegalizeBufferContentTypesVisitor BufferContentsTypeRewrite(DL, M.getContext()); for (Function &F : M.functions()) { @@ -2472,7 +2482,7 @@ bool AMDGPULowerBufferFatPointers::run(Module &M, const TargetMachine &TM) { IntTM.clear(); CloneMap.clear(); - SplitPtrStructs Splitter(M.getContext(), &TM); + SplitPtrStructs Splitter(DL, M.getContext(), &TM); for (Function *F : NeedsPostProcess) Splitter.processFunction(*F); for (Function *F : Intrinsics) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll index c509cf4b1bf37..2450ca5063de0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll @@ -49,8 +49,6 @@ define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i6 ; CHECK-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY10]](s32), [[COPY11]](s32) ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<2 x s64>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<2 x p8>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF2:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<2 x s64>) = G_INSERT_VECTOR_ELT [[DEF]], [[MV2]](s64), [[C]](s32) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<2 x s64>) = G_SHUFFLE_VECTOR [[IVEC]](<2 x s64>), [[DEF]], shufflemask(0, 0) ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[SHUF]](<2 x s64>) diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll index 022094bc633c8..3765bb0af79ba 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-calls.ll @@ -48,9 +48,8 @@ define ptr addrspace(7) @recur.inner.2(i32 %v, ptr addrspace(7) %x) { ; CHECK-NEXT: [[X_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 0 ; CHECK-NEXT: [[X_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[X]], 1 ; CHECK-NEXT: [[INC:%.*]] = add i32 [[X_OFF]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[INC]], 1 -; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } [[TMP2]], i32 [[V]]) +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[X]], i32 [[INC]], 1 +; CHECK-NEXT: [[RET:%.*]] = call { ptr addrspace(8), i32 } @recur.inner.1({ ptr addrspace(8), i32 } 
[[TMP1]], i32 [[V]]) ; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] ; %inc = getelementptr i32, ptr addrspace(7) %x, i32 1 @@ -110,9 +109,8 @@ define internal noalias noundef nonnull ptr addrspace(7) @foo(ptr addrspace(7) n ; CHECK-NEXT: [[ARG_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 0 ; CHECK-NEXT: [[ARG_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[ARG]], 1 ; CHECK-NEXT: [[RET:%.*]] = add nuw i32 [[ARG_OFF]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[ARG_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[ARG]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %ret = getelementptr inbounds i32, ptr addrspace(7) %arg, i32 1 ret ptr addrspace(7) %ret diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll index d18f0f8bd1ff9..a8e67a4a61816 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-contents-legalization.ll @@ -1898,10 +1898,9 @@ define void @store_v32i6(<6 x i32> %data.abi, ptr addrspace(8) inreg %buf) { ; CHECK-LABEL: define void @store_v32i6( ; CHECK-SAME: <6 x i32> [[DATA_ABI:%.*]], ptr addrspace(8) inreg [[BUF:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[DATA:%.*]] = bitcast <6 x i32> [[DATA_ABI]] to <32 x i6> -; CHECK-NEXT: [[DATA_LEGAL:%.*]] = bitcast <32 x i6> [[DATA]] to <6 x i32> -; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <4 x i32> +; CHECK-NEXT: [[DATA_SLICE_0:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DATA_SLICE_0]], ptr addrspace(8) align 32 [[BUF]], i32 0, i32 0, i32 0) -; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_LEGAL]], <6 x i32> poison, <2 x i32> +; CHECK-NEXT: [[DATA_SLICE_4:%.*]] = shufflevector <6 x i32> [[DATA_ABI]], <6 x i32> poison, <2 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> [[DATA_SLICE_4]], ptr addrspace(8) align 16 [[BUF]], i32 16, i32 0, i32 0) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll index e6c2d1907068f..ee51b0b84554e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll @@ -133,8 +133,7 @@ define void @memcpy_known(ptr addrspace(7) inreg %src, ptr addrspace(7) inreg %d ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 -; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> 
[[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] @@ -328,8 +327,7 @@ define void @memcpy_known_i64(ptr addrspace(7) inreg %src, ptr addrspace(7) inre ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 -; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] @@ -792,8 +790,7 @@ define void @memcpy.inline_known(ptr addrspace(7) inreg %src, ptr addrspace(7) i ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 -; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 16 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] @@ -987,8 +984,7 @@ define void @memcpy.inline_known_i64(ptr addrspace(7) inreg %src, ptr addrspace( ; CHECK-NEXT: [[DOTSLICE_56:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_56]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0) ; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP3]], 240 -; CHECK-NEXT: [[DOTSLICE_60:%.*]] = shufflevector <64 x i32> [[TMP2]], <64 x i32> poison, <4 x i32> -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTSLICE_60]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[DOTOFF_240]], ptr addrspace(8) align 1 [[DST_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0) ; CHECK-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 256 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 8192 ; CHECK-NEXT: br i1 [[TMP5]], label %[[LOAD_STORE_LOOP]], label %[[MEMCPY_SPLIT:.*]] diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll 
b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll index 9b2e2f950a39d..a8473927e1bd8 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-p7-in-memory.ll @@ -14,11 +14,7 @@ define void @scalar_copy(ptr %a, ptr %b) { ; CHECK-NEXT: [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) ; CHECK-NEXT: [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32 ; CHECK-NEXT: [[B1:%.*]] = getelementptr i160, ptr [[B]], i64 1 -; CHECK-NEXT: [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32 -; CHECK-NEXT: [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160 -; CHECK-NEXT: [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]] -; CHECK-NEXT: store i160 [[X_PTR_INT]], ptr [[B1]], align 32 +; CHECK-NEXT: store i160 [[X]], ptr [[B1]], align 32 ; CHECK-NEXT: ret void ; %x = load ptr addrspace(7), ptr %a @@ -36,11 +32,7 @@ define void @vector_copy(ptr %a, ptr %b) { ; CHECK-NEXT: [[X_PTR_RSRC:%.*]] = inttoptr <4 x i128> [[TMP2]] to <4 x ptr addrspace(8)> ; CHECK-NEXT: [[X_PTR_OFF:%.*]] = trunc <4 x i160> [[X]] to <4 x i32> ; CHECK-NEXT: [[B1:%.*]] = getelementptr <4 x i160>, ptr [[B]], i64 2 -; CHECK-NEXT: [[X_PTR_INT_RSRC:%.*]] = ptrtoint <4 x ptr addrspace(8)> [[X_PTR_RSRC]] to <4 x i160> -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw <4 x i160> [[X_PTR_INT_RSRC]], splat (i160 32) -; CHECK-NEXT: [[X_PTR_INT_OFF:%.*]] = zext <4 x i32> [[X_PTR_OFF]] to <4 x i160> -; CHECK-NEXT: [[X_PTR_INT:%.*]] = or <4 x i160> [[TMP3]], [[X_PTR_INT_OFF]] -; CHECK-NEXT: store <4 x i160> [[X_PTR_INT]], ptr [[B1]], align 128 +; CHECK-NEXT: store <4 x i160> [[X]], ptr [[B1]], align 128 ; CHECK-NEXT: ret void ; %x = load <4 x ptr addrspace(7)>, ptr %a @@ -59,21 +51,13 @@ define void @alloca(ptr %a, ptr %b) { ; CHECK-NEXT: [[X_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) ; CHECK-NEXT: [[X_PTR_OFF:%.*]] = trunc i160 [[X]] to i32 ; CHECK-NEXT: [[L:%.*]] = getelementptr i160, ptr addrspace(5) [[ALLOCA]], i32 1 -; CHECK-NEXT: [[X_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[X_PTR_RSRC]] to i160 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[X_PTR_INT_RSRC]], 32 -; CHECK-NEXT: [[X_PTR_INT_OFF:%.*]] = zext i32 [[X_PTR_OFF]] to i160 -; CHECK-NEXT: [[X_PTR_INT:%.*]] = or i160 [[TMP3]], [[X_PTR_INT_OFF]] -; CHECK-NEXT: store i160 [[X_PTR_INT]], ptr addrspace(5) [[L]], align 32 +; CHECK-NEXT: store i160 [[X]], ptr addrspace(5) [[L]], align 32 ; CHECK-NEXT: [[Y:%.*]] = load i160, ptr addrspace(5) [[L]], align 32 -; CHECK-NEXT: [[TMP4:%.*]] = lshr i160 [[Y]], 32 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128 -; CHECK-NEXT: [[Y_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8) +; CHECK-NEXT: [[TMP3:%.*]] = lshr i160 [[Y]], 32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i160 [[TMP3]] to i128 +; CHECK-NEXT: [[Y_PTR_RSRC:%.*]] = inttoptr i128 [[TMP4]] to ptr addrspace(8) ; CHECK-NEXT: [[Y_PTR_OFF:%.*]] = trunc i160 [[Y]] to i32 -; CHECK-NEXT: [[Y_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[Y_PTR_RSRC]] to i160 -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i160 [[Y_PTR_INT_RSRC]], 32 -; CHECK-NEXT: [[Y_PTR_INT_OFF:%.*]] = zext i32 [[Y_PTR_OFF]] to i160 -; CHECK-NEXT: [[Y_PTR_INT:%.*]] = or i160 [[TMP6]], [[Y_PTR_INT_OFF]] -; CHECK-NEXT: store i160 [[Y_PTR_INT]], ptr [[B]], align 32 +; CHECK-NEXT: store i160 [[Y]], ptr [[B]], align 32 ; CHECK-NEXT: ret void ; %alloca = alloca [5 x ptr addrspace(7)], addrspace(5) @@ -117,35 +101,7 @@ define 
void @complex_copy(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[X_2_PTR_RSRC]], 0 ; CHECK-NEXT: [[X_2_PTR:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP18]], i32 [[X_2_PTR_OFF]], 1 ; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP14]], { ptr addrspace(8), i32 } [[X_2_PTR]], 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractvalue [2 x { ptr addrspace(8), i32 }] [[TMP20]], 0 -; CHECK-NEXT: [[DOTRSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP21]], 0 -; CHECK-NEXT: [[DOTOFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP21]], 1 -; CHECK-NEXT: [[DOT0_0_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[DOTRSRC]] to i160 -; CHECK-NEXT: [[TMP22:%.*]] = shl nuw i160 [[DOT0_0_INT_RSRC]], 32 -; CHECK-NEXT: [[DOT0_0_INT_OFF:%.*]] = zext i32 [[DOTOFF]] to i160 -; CHECK-NEXT: [[DOT0_0_INT:%.*]] = or i160 [[TMP22]], [[DOT0_0_INT_OFF]] -; CHECK-NEXT: [[TMP23:%.*]] = insertvalue [2 x i160] poison, i160 [[DOT0_0_INT]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = extractvalue [2 x { ptr addrspace(8), i32 }] [[TMP20]], 1 -; CHECK-NEXT: [[DOTRSRC1:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP24]], 0 -; CHECK-NEXT: [[DOTOFF2:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP24]], 1 -; CHECK-NEXT: [[DOT0_1_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[DOTRSRC1]] to i160 -; CHECK-NEXT: [[TMP25:%.*]] = shl nuw i160 [[DOT0_1_INT_RSRC]], 32 -; CHECK-NEXT: [[DOT0_1_INT_OFF:%.*]] = zext i32 [[DOTOFF2]] to i160 -; CHECK-NEXT: [[DOT0_1_INT:%.*]] = or i160 [[TMP25]], [[DOT0_1_INT_OFF]] -; CHECK-NEXT: [[TMP26:%.*]] = insertvalue [2 x i160] [[TMP23]], i160 [[DOT0_1_INT]], 1 -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { [2 x i160], i32, i160 } poison, [2 x i160] [[TMP26]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP19]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { [2 x i160], i32, i160 } [[TMP27]], i32 [[TMP28]], 1 -; CHECK-NEXT: [[TMP30:%.*]] = extractvalue { [2 x { ptr addrspace(8), i32 }], i32, { ptr addrspace(8), i32 } } [[TMP19]], 2 -; CHECK-NEXT: [[DOTRSRC3:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP30]], 0 -; CHECK-NEXT: [[DOTOFF4:%.*]] = extractvalue { ptr addrspace(8), i32 } [[TMP30]], 1 -; CHECK-NEXT: [[DOT2_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[DOTRSRC3]] to i160 -; CHECK-NEXT: [[TMP31:%.*]] = shl nuw i160 [[DOT2_INT_RSRC]], 32 -; CHECK-NEXT: [[DOT2_INT_OFF:%.*]] = zext i32 [[DOTOFF4]] to i160 -; CHECK-NEXT: [[DOT2_INT:%.*]] = or i160 [[TMP31]], [[DOT2_INT_OFF]] -; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { [2 x i160], i32, i160 } [[TMP29]], i160 [[DOT2_INT]], 2 -; CHECK-NEXT: store { [2 x i160], i32, i160 } [[TMP32]], ptr [[B]], align 32 +; CHECK-NEXT: store { [2 x i160], i32, i160 } [[X]], ptr [[B]], align 32 ; CHECK-NEXT: ret void ; %x = load {[2 x ptr addrspace(7)], i32, ptr addrspace(7)}, ptr %a diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll index ea4117b418959..b0658031356bb 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll @@ -14,9 +14,8 @@ define ptr addrspace(7) @gep(ptr addrspace(7) %in, i32 %idx) { ; CHECK-NEXT: [[RET_OFFS:%.*]] = add nsw i32 [[RET_IDX]], 8 
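Aside: the p7-in-memory checks above encode a ptr addrspace(7) in memory as an i160, with the 128-bit resource descriptor in the high bits and the 32-bit offset in the low bits; the removed lines show the pass no longer re-packing a value it has just unpacked. A minimal sketch of that packing in plain integer limbs (the helper name and 32-bit-limb representation are illustrative, not pass code):

#include <array>
#include <cstdint>

// Packs {rsrc:i128, off:i32} into the 160-bit in-memory form, modeled as five
// little-endian 32-bit limbs: (zext(rsrc) << 32) | zext(off) puts the offset
// in limb 0 and the resource descriptor in limbs 1..4, matching the
// shl/or/lshr/trunc sequences in the CHECK lines.
std::array<uint32_t, 5> packFatPtr(const std::array<uint32_t, 4> &Rsrc,
                                   uint32_t Off) {
  return {Off, Rsrc[0], Rsrc[1], Rsrc[2], Rsrc[3]};
}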
; CHECK-NEXT: [[RET_OFFS1:%.*]] = add nsw i32 [[RET_OFFS]], 24 ; CHECK-NEXT: [[RET:%.*]] = add i32 [[IN_OFF]], [[RET_OFFS1]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[IN_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[IN]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %ret = getelementptr inbounds {i32, [4 x ptr]}, ptr addrspace(7) %in, i32 %idx, i32 1, i32 3 ret ptr addrspace(7) %ret @@ -31,9 +30,8 @@ define <2 x ptr addrspace(7)> @gep_vectors(<2 x ptr addrspace(7)> %in, <2 x i32> ; CHECK-NEXT: [[RET_OFFS:%.*]] = add nsw <2 x i32> [[RET_IDX]], splat (i32 8) ; CHECK-NEXT: [[RET_OFFS1:%.*]] = add nsw <2 x i32> [[RET_OFFS]], splat (i32 24) ; CHECK-NEXT: [[RET:%.*]] = add <2 x i32> [[IN_OFF]], [[RET_OFFS1]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[IN_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]], <2 x i32> [[RET]], 1 -; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[IN]], <2 x i32> [[RET]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]] ; %ret = getelementptr inbounds {i32, [4 x ptr]}, <2 x ptr addrspace(7)> %in, <2 x i32> %idx, i32 1, i32 3 ret <2 x ptr addrspace(7)> %ret @@ -51,9 +49,8 @@ define <2 x ptr addrspace(7)> @gep_vector_scalar(<2 x ptr addrspace(7)> %in, i64 ; CHECK-NEXT: [[RET_OFFS:%.*]] = add nsw <2 x i32> [[RET_IDX]], splat (i32 8) ; CHECK-NEXT: [[RET_OFFS1:%.*]] = add nsw <2 x i32> [[RET_OFFS]], splat (i32 24) ; CHECK-NEXT: [[RET:%.*]] = add <2 x i32> [[IN_OFF]], [[RET_OFFS1]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } poison, <2 x ptr addrspace(8)> [[IN_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]], <2 x i32> [[RET]], 1 -; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { <2 x ptr addrspace(8)>, <2 x i32> } [[IN]], <2 x i32> [[RET]], 1 +; CHECK-NEXT: ret { <2 x ptr addrspace(8)>, <2 x i32> } [[TMP1]] ; %ret = getelementptr inbounds {i32, [4 x ptr]}, <2 x ptr addrspace(7)> %in, i64 %idx, i32 1, i32 3 ret <2 x ptr addrspace(7)> %ret @@ -84,9 +81,8 @@ define ptr addrspace(7) @simple_gep(ptr addrspace(7) %ptr, i32 %off) { ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: [[RET_IDX:%.*]] = mul i32 [[OFF]], 4 ; CHECK-NEXT: [[RET:%.*]] = add i32 [[PTR_OFF]], [[RET_IDX]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[PTR_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[PTR]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %ret = getelementptr i32, ptr addrspace(7) %ptr, i32 %off ret ptr addrspace(7) %ret @@ -99,9 +95,8 @@ define ptr addrspace(7) @simple_inbounds_gep(ptr addrspace(7) %ptr, i32 %off) { ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: [[RET_IDX:%.*]] = mul nsw i32 [[OFF]], 4 ; CHECK-NEXT: 
[[RET:%.*]] = add i32 [[PTR_OFF]], [[RET_IDX]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[PTR_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[PTR]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %ret = getelementptr inbounds i32, ptr addrspace(7) %ptr, i32 %off ret ptr addrspace(7) %ret @@ -114,9 +109,8 @@ define ptr addrspace(7) @simple_nuw_gep(ptr addrspace(7) %ptr, i32 %off) { ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: [[RET_IDX:%.*]] = mul nuw i32 [[OFF]], 4 ; CHECK-NEXT: [[RET:%.*]] = add nuw i32 [[PTR_OFF]], [[RET_IDX]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[PTR_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[PTR]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %ret = getelementptr nuw i32, ptr addrspace(7) %ptr, i32 %off ret ptr addrspace(7) %ret @@ -129,9 +123,8 @@ define ptr addrspace(7) @simple_nusw_gep(ptr addrspace(7) %ptr, i32 %off) { ; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 ; CHECK-NEXT: [[RET_IDX:%.*]] = mul nsw i32 [[OFF]], 4 ; CHECK-NEXT: [[RET:%.*]] = add i32 [[PTR_OFF]], [[RET_IDX]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[PTR_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[PTR]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %ret = getelementptr nusw i32, ptr addrspace(7) %ptr, i32 %off ret ptr addrspace(7) %ret @@ -145,9 +138,8 @@ define ptr addrspace(7) @nusw_gep_pair(ptr addrspace(7) %ptr, i32 %off) { ; CHECK-NEXT: [[P1_IDX:%.*]] = mul nsw i32 [[OFF]], 4 ; CHECK-NEXT: [[P1:%.*]] = add i32 [[PTR_OFF]], [[P1_IDX]] ; CHECK-NEXT: [[RET:%.*]] = add nuw i32 [[P1]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[PTR_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[PTR]], i32 [[RET]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %p1 = getelementptr nusw i32, ptr addrspace(7) %ptr, i32 %off %ret = getelementptr nusw i32, ptr addrspace(7) %p1, i32 4 @@ -156,11 +148,9 @@ define ptr addrspace(7) @nusw_gep_pair(ptr addrspace(7) %ptr, i32 %off) { define ptr addrspace(7) @zero_gep(ptr addrspace(7) %ptr) { ; CHECK-LABEL: define { ptr addrspace(8), i32 } @zero_gep -; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 -; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[PTR_RSRC]], 0 -; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[PTR_OFF]], 1 
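Aside: the common thread in these gep checks is that the rewritten pass now reuses the incoming { ptr addrspace(8), i32 } aggregate and inserts only the updated offset, instead of rebuilding the pair from poison. A rough IRBuilder-level sketch of that shape (helper name hypothetical; not the pass's actual code):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Produces the lowered GEP result from the original fat-pointer aggregate:
// a single insertvalue updating field 1 (the offset) while keeping field 0
// (the resource descriptor) from the original value.
static Value *rebaseFatPtr(IRBuilder<> &B, Value *OrigFatPtr, Value *NewOff) {
  return B.CreateInsertValue(OrigFatPtr, NewOff, 1);
}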
+; CHECK-SAME: ({ ptr addrspace(8), i32 } [[RET:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 0 +; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[RET]], 1 ; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] ; %ret = getelementptr i8, ptr addrspace(7) %ptr, i32 0 @@ -173,8 +163,7 @@ define ptr addrspace(7) @zero_gep_goes_second(ptr addrspace(7) %v0, i32 %arg) { ; CHECK-NEXT: [[V0_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[V0]], 0 ; CHECK-NEXT: [[V0_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[V0]], 1 ; CHECK-NEXT: [[V1:%.*]] = add i32 [[V0_OFF]], [[ARG]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[V0_RSRC]], 0 -; CHECK-NEXT: [[V2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[V1]], 1 +; CHECK-NEXT: [[V2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[V0]], i32 [[V1]], 1 ; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[V2]] ; %v1 = getelementptr i8, ptr addrspace(7) %v0, i32 %arg @@ -188,9 +177,8 @@ define ptr addrspace(7) @zero_gep_goes_first(ptr addrspace(7) %v0, i32 %arg) { ; CHECK-NEXT: [[V0_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[V0]], 0 ; CHECK-NEXT: [[V0_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[V0]], 1 ; CHECK-NEXT: [[V2:%.*]] = add i32 [[V0_OFF]], [[ARG]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[V0_RSRC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[V2]], 1 -; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } [[V0]], i32 [[V2]], 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP1]] ; %v1 = getelementptr i8, ptr addrspace(7) %v0, i32 0 %v2 = getelementptr i8, ptr addrspace(7) %v1, i32 %arg @@ -316,11 +304,7 @@ define ptr addrspace(7) @inttoptr_long(i256 %v) { define ptr addrspace(7) @inttoptr_offset(i32 %v) { ; CHECK-LABEL: define { ptr addrspace(8), i32 } @inttoptr_offset ; CHECK-SAME: (i32 [[V:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[V]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i128 -; CHECK-NEXT: [[RET_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8) -; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET_RSRC]], 0 -; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP3]], i32 [[V]], 1 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, i32 [[V]], 1 ; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] ; %ret = inttoptr i32 %v to ptr addrspace(7) @@ -514,8 +498,7 @@ define ptr addrspace(7) @ptrmask(ptr addrspace(7) %p, i32 %mask) { ; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 ; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 ; CHECK-NEXT: [[RET_OFF:%.*]] = and i32 [[P_OFF]], [[MASK]] -; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[P_RSRC]], 0 -; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 [[RET_OFF]], 1 +; CHECK-NEXT: [[RET:%.*]] = insertvalue { ptr addrspace(8), i32 } [[P]], i32 [[RET_OFF]], 1 ; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[RET]] ; %ret = call ptr addrspace(7) @llvm.ptrmask.p7.i32(ptr addrspace(7) %p, i32 %mask) diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll 
b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll index 95679a593259a..ef3026356f5fe 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll @@ -14,52 +14,46 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[AUX_PTR_VAR]], [[META12:![0-9]+]], !DIExpression(), [[DBG22]]) ; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META13:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) ; CHECK-NEXT: [[BUF_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF]] to i160, !dbg [[DBG24:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i160 [[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG24]] -; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = or i160 [[TMP1]], 0, !dbg [[DBG24]] +; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = shl nuw i160 [[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG24]] ; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG24]] ; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META15:![0-9]+]], !DIExpression(), [[META25:![0-9]+]]) ; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG26:![0-9]+]] -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG26]] -; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = or i160 [[TMP2]], 0, !dbg [[DBG26]] +; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG26]] ; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG26]] ; CHECK-NEXT: [[BUF_PTR_2:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG27:![0-9]+]] -; CHECK-NEXT: [[TMP3:%.*]] = lshr i160 [[BUF_PTR_2]], 32, !dbg [[DBG27]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i160 [[TMP3]] to i128, !dbg [[DBG27]] -; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP4]] to ptr addrspace(8), !dbg [[DBG27]] +; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[BUF_PTR_2]], 32, !dbg [[DBG27]] +; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128, !dbg [[DBG27]] +; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8), !dbg [[DBG27]] ; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG27]] ; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META16:![0-9]+]], !DIExpression(), [[DBG27]]) ; CHECK-NEXT: [[BUF_PTR_3_IDX:%.*]] = mul i32 [[IDX]], 4, !dbg [[DBG28:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[BUF_PTR_3_IDX]], !dbg [[DBG28]] ; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META17:![0-9]+]], !DIExpression(), [[DBG28]]) ; CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG29:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]] ; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG29]] -; CHECK-NEXT: [[BUF_PTR_3_INT:%.*]] = or i160 [[TMP5]], [[BUF_PTR_3_INT_OFF]], !dbg [[DBG29]] +; CHECK-NEXT: [[BUF_PTR_3_INT:%.*]] = or i160 [[TMP3]], [[BUF_PTR_3_INT_OFF]], !dbg [[DBG29]] ; CHECK-NEXT: store i160 [[BUF_PTR_3_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG29]] ; CHECK-NEXT: [[BUF_PTR_4:%.*]] = load i160, ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG30:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] 
= lshr i160 [[BUF_PTR_4]], 32, !dbg [[DBG30]] -; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG30]] -; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG30]] +; CHECK-NEXT: [[TMP4:%.*]] = lshr i160 [[BUF_PTR_4]], 32, !dbg [[DBG30]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128, !dbg [[DBG30]] +; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8), !dbg [[DBG30]] ; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG30]] ; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META18:![0-9]+]], !DIExpression(), [[DBG30]]) ; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG31:![0-9]+]] ; CHECK-NEXT: #dbg_value(float [[RET]], [[META19:![0-9]+]], !DIExpression(), [[DBG31]]) ; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG32:![0-9]+]] -; CHECK-NEXT: [[TMP8:%.*]] = lshr i160 [[AUX_PTR_2]], 32, !dbg [[DBG32]] -; CHECK-NEXT: [[TMP9:%.*]] = trunc i160 [[TMP8]] to i128, !dbg [[DBG32]] -; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP9]] to ptr addrspace(8), !dbg [[DBG32]] +; CHECK-NEXT: [[TMP6:%.*]] = lshr i160 [[AUX_PTR_2]], 32, !dbg [[DBG32]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG32]] +; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG32]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG32]] ; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META20:![0-9]+]], !DIExpression(), [[DBG32]]) -; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]] to i160, !dbg [[DBG33:![0-9]+]] -; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i160 [[BUF_PTR_4_PTR_INT_RSRC]], 32, !dbg [[DBG33]] -; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_OFF:%.*]] = zext i32 [[BUF_PTR_4_PTR_OFF]] to i160, !dbg [[DBG33]] -; CHECK-NEXT: [[BUF_PTR_4_PTR_INT:%.*]] = or i160 [[TMP10]], [[BUF_PTR_4_PTR_INT_OFF]], !dbg [[DBG33]] -; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4_PTR_INT]] to <5 x i32>, !dbg [[DBG33]] -; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], <5 x i32> poison, <4 x i32> , !dbg [[DBG33]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_PTR_INT_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]] +; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> , !dbg [[DBG33]] +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_PART_4:%.*]] = add nuw i32 [[AUX_PTR_2_PTR_OFF]], 16, !dbg [[DBG33]] -; CHECK-NEXT: [[BUF_PTR_4_PTR_INT_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_PTR_INT_LEGAL]], i64 4, !dbg [[DBG33]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_PTR_INT_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]] +; CHECK-NEXT: 
[[BUF_PTR_4_SLICE_4:%.*]] = extractelement <5 x i32> [[BUF_PTR_4_LEGAL]], i64 4, !dbg [[DBG33]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[BUF_PTR_4_SLICE_4]], ptr addrspace(8) align 16 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_PART_4]], i32 0, i32 0), !dbg [[DBG33]]
 ; CHECK-NEXT: ret float [[RET]], !dbg [[DBG34:![0-9]+]]
 ;
 %buf.ptr.var = alloca ptr addrspace(7), align 32, addrspace(5), !dbg !20

From 2080334574a88a7ac4102007b56809cd0a19b905 Mon Sep 17 00:00:00 2001
From: Daniel Chen
Date: Thu, 3 Apr 2025 11:21:19 -0400
Subject: [PATCH 0541/1029] [flang-rt] Pass the whole path of libflang_rt.runtime.a to linker on AIX and LoP (#131041)

This PR improves the driver code that builds the `flang-rt` path by reusing
the existing `compiler-rt` logic.

1. Moved `addFortranRuntimeLibraryPath` and `addFortranRuntimeLibs` to
`ToolChain.h` and made them virtual so that they can be overridden if
customization is needed. The current implementation of the two procedures
moves to `ToolChain.cpp` as the base implementation to default to.
2. Both AIX and PPCLinux now override `addFortranRuntimeLibs`. The overrides
on both AIX and PPCLinux call `getCompilerRTArgString` => `getCompilerRT` =>
`buildCompilerRTBasename` to get the path to `flang-rt`. This code handles
the `LLVM_ENABLE_PER_TARGET_RUNTIME_DIR` setting. As shown in `PPCLinux.cpp`,
`FT_Static` is the default; if the static library is not found, the driver
searches for the `FT_Shared` variant instead.

To differentiate the `flang_rt.` libraries from the `clang_rt.` ones, a
boolean flag `IsFortran` is passed down the chain of functions to reach
`buildCompilerRTBasename`.
---
 clang/include/clang/Driver/ToolChain.h | 23 ++++--
 clang/lib/Driver/ToolChain.cpp | 76 ++++++++++++++++---
 clang/lib/Driver/ToolChains/AIX.cpp | 12 ++-
 clang/lib/Driver/ToolChains/AIX.h | 3 +
 clang/lib/Driver/ToolChains/AVR.cpp | 7 +-
 clang/lib/Driver/ToolChains/AVR.h | 3 +-
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 55 --------------
 clang/lib/Driver/ToolChains/CommonArgs.h | 9 ---
 clang/lib/Driver/ToolChains/Darwin.cpp | 8 +-
 clang/lib/Driver/ToolChains/Darwin.h | 12 +--
 clang/lib/Driver/ToolChains/DragonFly.cpp | 4 +-
 clang/lib/Driver/ToolChains/FreeBSD.cpp | 4 +-
 clang/lib/Driver/ToolChains/Gnu.cpp | 4 +-
 clang/lib/Driver/ToolChains/Haiku.cpp | 4 +-
 clang/lib/Driver/ToolChains/MSVC.cpp | 4 +-
 clang/lib/Driver/ToolChains/MinGW.cpp | 4 +-
 clang/lib/Driver/ToolChains/MipsLinux.cpp | 4 +-
 clang/lib/Driver/ToolChains/MipsLinux.h | 6 +-
 clang/lib/Driver/ToolChains/NetBSD.cpp | 4 +-
 clang/lib/Driver/ToolChains/OHOS.cpp | 2 +-
 clang/lib/Driver/ToolChains/OHOS.h | 6 +-
 clang/lib/Driver/ToolChains/OpenBSD.cpp | 12 +--
 clang/lib/Driver/ToolChains/OpenBSD.h | 3 +-
 clang/lib/Driver/ToolChains/PPCLinux.cpp | 16 ++++
 clang/lib/Driver/ToolChains/PPCLinux.h | 3 +
 clang/lib/Driver/ToolChains/Solaris.cpp | 4 +-
 .../powerpc64-ibm-aix/libflang_rt.runtime.a | 0
 .../libflang_rt.runtime.a | 0
 .../libflang_rt.runtime.so | 0
 flang-rt/cmake/modules/GetToolchainDirs.cmake | 3 +-
 flang/test/Driver/flang-ld-powerpc.f90 | 41 ++++++++++
 31 files changed, 206 insertions(+), 130 deletions(-)
 create mode 100644 clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64-ibm-aix/libflang_rt.runtime.a
 create mode 100644 clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64le-unknown-linux-gnu/libflang_rt.runtime.a
 create mode 100644 clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64le-unknown-linux-gnu/libflang_rt.runtime.so
 create mode
100644 flang/test/Driver/flang-ld-powerpc.f90 diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index f2e8fa306e3a5..076e4296c3090 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -216,8 +216,8 @@ class ToolChain { virtual std::string buildCompilerRTBasename(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type, - bool AddArch) const; + FileType Type, bool AddArch, + bool IsFortran = false) const; /// Find the target-specific subdirectory for the current target triple under /// \p BaseDir, doing fallback triple searches as necessary. @@ -509,11 +509,22 @@ class ToolChain { virtual std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const; + FileType Type = ToolChain::FT_Static, + bool IsFortran = false) const; + + /// Adds Fortran runtime libraries to \p CmdArgs. + virtual void addFortranRuntimeLibs(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const; + + /// Adds the path for the Fortran runtime libraries to \p CmdArgs. + virtual void + addFortranRuntimeLibraryPath(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const; - const char * - getCompilerRTArgString(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const; + const char *getCompilerRTArgString(const llvm::opt::ArgList &Args, + StringRef Component, + FileType Type = ToolChain::FT_Static, + bool IsFortran = false) const; std::string getCompilerRTBasename(const llvm::opt::ArgList &Args, StringRef Component, diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index ad73814b3efba..36d0ae34dec86 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -727,8 +727,8 @@ std::string ToolChain::getCompilerRTBasename(const ArgList &Args, std::string ToolChain::buildCompilerRTBasename(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type, - bool AddArch) const { + FileType Type, bool AddArch, + bool IsFortran) const { const llvm::Triple &TT = getTriple(); bool IsITANMSVCWindows = TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment(); @@ -756,14 +756,16 @@ std::string ToolChain::buildCompilerRTBasename(const llvm::opt::ArgList &Args, const char *Env = TT.isAndroid() ? "-android" : ""; ArchAndEnv = ("-" + Arch + Env).str(); } - return (Prefix + Twine("clang_rt.") + Component + ArchAndEnv + Suffix).str(); + + std::string LibName = IsFortran ? "flang_rt." : "clang_rt."; + return (Prefix + Twine(LibName) + Component + ArchAndEnv + Suffix).str(); } std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component, - FileType Type) const { + FileType Type, bool IsFortran) const { // Check for runtime files in the new layout without the architecture first. - std::string CRTBasename = - buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false); + std::string CRTBasename = buildCompilerRTBasename( + Args, Component, Type, /*AddArch=*/false, IsFortran); SmallString<128> Path; for (const auto &LibPath : getLibraryPaths()) { SmallString<128> P(LibPath); @@ -775,8 +777,8 @@ std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component, } // Check the filename for the old layout if the new one does not exist. 
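Aside: a hedged illustration of the two layouts this lookup probes, assuming LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON and a powerpc64le-unknown-linux-gnu triple (paths are examples only; the real code walks getLibraryPaths() and getCompilerRTPath()):

#include <string>
#include <vector>

// Candidate locations for the Fortran runtime, in probe order: the new
// per-target layout first (no arch in the basename), then the old layout
// under lib/<os>. AddArch stays false for Fortran, so the basename is the
// same libflang_rt.runtime.a in both layouts.
std::vector<std::string> flangRTCandidates(const std::string &ResourceDir) {
  return {
      ResourceDir + "/lib/powerpc64le-unknown-linux-gnu/libflang_rt.runtime.a",
      ResourceDir + "/lib/linux/libflang_rt.runtime.a"};
}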
- CRTBasename = - buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/true); + CRTBasename = buildCompilerRTBasename(Args, Component, Type, + /*AddArch=*/!IsFortran, IsFortran); SmallString<128> OldPath(getCompilerRTPath()); llvm::sys::path::append(OldPath, CRTBasename); if (Path.empty() || getVFS().exists(OldPath)) @@ -790,8 +792,62 @@ std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component, const char *ToolChain::getCompilerRTArgString(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type) const { - return Args.MakeArgString(getCompilerRT(Args, Component, Type)); + FileType Type, + bool isFortran) const { + return Args.MakeArgString(getCompilerRT(Args, Component, Type, isFortran)); +} + +/// Add Fortran runtime libs +void ToolChain::addFortranRuntimeLibs(const ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const { + // Link flang_rt.runtime + // These are handled earlier on Windows by telling the frontend driver to + // add the correct libraries to link against as dependents in the object + // file. + if (!getTriple().isKnownWindowsMSVCEnvironment()) { + StringRef F128LibName = getDriver().getFlangF128MathLibrary(); + F128LibName.consume_front_insensitive("lib"); + if (!F128LibName.empty()) { + bool AsNeeded = !getTriple().isOSAIX(); + CmdArgs.push_back("-lflang_rt.quadmath"); + if (AsNeeded) + addAsNeededOption(*this, Args, CmdArgs, /*as_needed=*/true); + CmdArgs.push_back(Args.MakeArgString("-l" + F128LibName)); + if (AsNeeded) + addAsNeededOption(*this, Args, CmdArgs, /*as_needed=*/false); + } + CmdArgs.push_back("-lflang_rt.runtime"); + addArchSpecificRPath(*this, Args, CmdArgs); + + // needs libexecinfo for backtrace functions + if (getTriple().isOSFreeBSD() || getTriple().isOSNetBSD() || + getTriple().isOSOpenBSD() || getTriple().isOSDragonFly()) + CmdArgs.push_back("-lexecinfo"); + } + + // libomp needs libatomic for atomic operations if using libgcc + if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false)) { + Driver::OpenMPRuntimeKind OMPRuntime = getDriver().getOpenMPRuntime(Args); + ToolChain::RuntimeLibType RuntimeLib = GetRuntimeLibType(Args); + if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc) + CmdArgs.push_back("-latomic"); + } +} + +void ToolChain::addFortranRuntimeLibraryPath(const llvm::opt::ArgList &Args, + ArgStringList &CmdArgs) const { + // Default to the /../lib directory. This works fine on the + // platforms that we have tested so far. We will probably have to re-fine + // this in the future. In particular, on some platforms, we may need to use + // lib64 instead of lib. + SmallString<256> DefaultLibPath = + llvm::sys::path::parent_path(getDriver().Dir); + llvm::sys::path::append(DefaultLibPath, "lib"); + if (getTriple().isKnownWindowsMSVCEnvironment()) + CmdArgs.push_back(Args.MakeArgString("-libpath:" + DefaultLibPath)); + else + CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath)); } // Android target triples contain a target version. 
If we don't have libraries diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 001f3a5178943..26b9d4c772be6 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -358,8 +358,8 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); - addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); CmdArgs.push_back("-lm"); CmdArgs.push_back("-lpthread"); } @@ -608,6 +608,14 @@ void AIX::addProfileRTLibs(const llvm::opt::ArgList &Args, ToolChain::addProfileRTLibs(Args, CmdArgs); } +void AIX::addFortranRuntimeLibs(const ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const { + // Link flang_rt.runtime.a. On AIX, the static and shared library are all + // named .a + CmdArgs.push_back( + getCompilerRTArgString(Args, "runtime", ToolChain::FT_Static, true)); +} + ToolChain::CXXStdlibType AIX::GetDefaultCXXStdlibType() const { return ToolChain::CST_Libcxx; } diff --git a/clang/lib/Driver/ToolChains/AIX.h b/clang/lib/Driver/ToolChains/AIX.h index 8f130f6b54547..17e8370cd1218 100644 --- a/clang/lib/Driver/ToolChains/AIX.h +++ b/clang/lib/Driver/ToolChains/AIX.h @@ -87,6 +87,9 @@ class LLVM_LIBRARY_VISIBILITY AIX : public ToolChain { void addProfileRTLibs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; + void addFortranRuntimeLibs(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const override; + CXXStdlibType GetDefaultCXXStdlibType() const override; RuntimeLibType GetDefaultRuntimeLibType() const override; diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index 08e906ac9e806..8b8956a0a15ef 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -424,9 +424,10 @@ Tool *AVRToolChain::buildLinker() const { return new tools::AVR::Linker(getTriple(), *this); } -std::string -AVRToolChain::getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const { +std::string AVRToolChain::getCompilerRT(const llvm::opt::ArgList &Args, + StringRef Component, + FileType Type = ToolChain::FT_Static, + bool IsFortran) const { assert(Type == ToolChain::FT_Static && "AVR only supports static libraries"); // Since AVR can never be a host environment, its compiler-rt library files // should always have ".a" suffix, even on windows. 
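Aside: stripped to its essentials, the basename selection that IsFortran drives in buildCompilerRTBasename comes down to a prefix choice. A simplified sketch for the ELF static-archive case (the real function also handles per-platform prefixes and shared/object suffixes):

#include <string>

// "lib" + ("flang_rt." | "clang_rt.") + component [+ arch/env] + ".a".
std::string runtimeBasename(const std::string &Component, bool IsFortran,
                            const std::string &ArchAndEnv /* may be empty */) {
  const std::string LibName = IsFortran ? "flang_rt." : "clang_rt.";
  return "lib" + LibName + Component + ArchAndEnv + ".a";
}

// runtimeBasename("runtime", /*IsFortran=*/true, "") yields
// "libflang_rt.runtime.a", the file the new driver tests look for.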
diff --git a/clang/lib/Driver/ToolChains/AVR.h b/clang/lib/Driver/ToolChains/AVR.h index 247188b7eaad7..f4d74eb944257 100644 --- a/clang/lib/Driver/ToolChains/AVR.h +++ b/clang/lib/Driver/ToolChains/AVR.h @@ -34,7 +34,8 @@ class LLVM_LIBRARY_VISIBILITY AVRToolChain : public Generic_ELF { std::optional findAVRLibcInstallation() const; StringRef getGCCInstallPath() const { return GCCInstallPath; } std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type) const override; + FileType Type, + bool IsFortran = false) const override; bool HasNativeLLVMSupport() const override { return true; } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 5aac20e1cdf44..ddeadff8f6dfb 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1339,61 +1339,6 @@ void tools::addOpenMPHostOffloadingArgs(const Compilation &C, Args.MakeArgString(Twine(Targets) + llvm::join(Triples, ","))); } -/// Add Fortran runtime libs -void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args, - llvm::opt::ArgStringList &CmdArgs) { - // Link flang_rt.runtime - // These are handled earlier on Windows by telling the frontend driver to - // add the correct libraries to link against as dependents in the object - // file. - if (!TC.getTriple().isKnownWindowsMSVCEnvironment()) { - StringRef F128LibName = TC.getDriver().getFlangF128MathLibrary(); - F128LibName.consume_front_insensitive("lib"); - if (!F128LibName.empty()) { - bool AsNeeded = !TC.getTriple().isOSAIX(); - CmdArgs.push_back("-lflang_rt.quadmath"); - if (AsNeeded) - addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/true); - CmdArgs.push_back(Args.MakeArgString("-l" + F128LibName)); - if (AsNeeded) - addAsNeededOption(TC, Args, CmdArgs, /*as_needed=*/false); - } - CmdArgs.push_back("-lflang_rt.runtime"); - addArchSpecificRPath(TC, Args, CmdArgs); - - // needs libexecinfo for backtrace functions - if (TC.getTriple().isOSFreeBSD() || TC.getTriple().isOSNetBSD() || - TC.getTriple().isOSOpenBSD() || TC.getTriple().isOSDragonFly()) - CmdArgs.push_back("-lexecinfo"); - } - - // libomp needs libatomic for atomic operations if using libgcc - if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, - options::OPT_fno_openmp, false)) { - Driver::OpenMPRuntimeKind OMPRuntime = - TC.getDriver().getOpenMPRuntime(Args); - ToolChain::RuntimeLibType RuntimeLib = TC.GetRuntimeLibType(Args); - if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc) - CmdArgs.push_back("-latomic"); - } -} - -void tools::addFortranRuntimeLibraryPath(const ToolChain &TC, - const llvm::opt::ArgList &Args, - ArgStringList &CmdArgs) { - // Default to the /../lib directory. This works fine on the - // platforms that we have tested so far. We will probably have to re-fine - // this in the future. In particular, on some platforms, we may need to use - // lib64 instead of lib. 
- SmallString<256> DefaultLibPath = - llvm::sys::path::parent_path(TC.getDriver().Dir); - llvm::sys::path::append(DefaultLibPath, "lib"); - if (TC.getTriple().isKnownWindowsMSVCEnvironment()) - CmdArgs.push_back(Args.MakeArgString("-libpath:" + DefaultLibPath)); - else - CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath)); -} - static void addSanitizerRuntime(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs, StringRef Sanitizer, bool IsShared, bool IsWhole) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 9b280e5d871c7..96bc0619dcbc0 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -121,15 +121,6 @@ void addOpenMPHostOffloadingArgs(const Compilation &C, const JobAction &JA, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs); -/// Adds Fortran runtime libraries to \p CmdArgs. -void addFortranRuntimeLibs(const ToolChain &TC, const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs); - -/// Adds the path for the Fortran runtime libraries to \p CmdArgs. -void addFortranRuntimeLibraryPath(const ToolChain &TC, - const llvm::opt::ArgList &Args, - llvm::opt::ArgStringList &CmdArgs); - void addHIPRuntimeLibArgs(const ToolChain &TC, Compilation &C, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs); diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 32a5fe68e8cff..e41720a824380 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -706,8 +706,8 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, // to generate executables. if (getToolChain().getDriver().IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(getToolChain(), Args, CmdArgs); - addFortranRuntimeLibs(getToolChain(), Args, CmdArgs); + getToolChain().addFortranRuntimeLibraryPath(Args, CmdArgs); + getToolChain().addFortranRuntimeLibs(Args, CmdArgs); } if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) @@ -1348,7 +1348,7 @@ void MachO::AddLinkRuntimeLib(const ArgList &Args, ArgStringList &CmdArgs, } std::string MachO::getCompilerRT(const ArgList &, StringRef Component, - FileType Type) const { + FileType Type, bool IsFortran) const { assert(Type != ToolChain::FT_Object && "it doesn't make sense to ask for the compiler-rt library name as an " "object file"); @@ -1367,7 +1367,7 @@ std::string MachO::getCompilerRT(const ArgList &, StringRef Component, } std::string Darwin::getCompilerRT(const ArgList &, StringRef Component, - FileType Type) const { + FileType Type, bool IsFortran) const { assert(Type != ToolChain::FT_Object && "it doesn't make sense to ask for the compiler-rt library name as an " "object file"); diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h index 751a3ba2ad5e4..76523d636ce07 100644 --- a/clang/lib/Driver/ToolChains/Darwin.h +++ b/clang/lib/Driver/ToolChains/Darwin.h @@ -232,9 +232,9 @@ class LLVM_LIBRARY_VISIBILITY MachO : public ToolChain { // Return the full path of the compiler-rt library on a non-Darwin MachO // system. Those are under // /lib/darwin/macho_embedded/<...>(.dylib|.a). 
- std::string - getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const override; + std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, + FileType Type = ToolChain::FT_Static, + bool IsFortran = false) const override; /// } /// @name ToolChain Implementation @@ -412,9 +412,9 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public AppleMachO { // Return the full path of the compiler-rt library on a Darwin MachO system. // Those are under /lib/darwin/<...>(.dylib|.a). - std::string - getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const override; + std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, + FileType Type = ToolChain::FT_Static, + bool IsFortran = false) const override; protected: /// } diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp index 1e0a4159bf4ad..38a29ae49e8d9 100644 --- a/clang/lib/Driver/ToolChains/DragonFly.cpp +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp @@ -153,8 +153,8 @@ void dragonfly::Linker::ConstructJob(Compilation &C, const JobAction &JA, // AddRunTimeLibs). if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); - addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); CmdArgs.push_back("-lm"); } diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index 62206c5fb3c59..e7efe22aa59a8 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -319,8 +319,8 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, // AddRunTimeLibs). if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); - addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); if (Profiling) CmdArgs.push_back("-lm_p"); else diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index a0fa3c66d7dec..d53039f6302d2 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -572,8 +572,8 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, // AddRunTimeLibs). if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); - addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); CmdArgs.push_back("-lm"); } diff --git a/clang/lib/Driver/ToolChains/Haiku.cpp b/clang/lib/Driver/ToolChains/Haiku.cpp index 17fb724b6a7cc..0e55a71280aff 100644 --- a/clang/lib/Driver/ToolChains/Haiku.cpp +++ b/clang/lib/Driver/ToolChains/Haiku.cpp @@ -123,8 +123,8 @@ void haiku::Linker::ConstructJob(Compilation &C, const JobAction &JA, // AddRunTimeLibs). 
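Aside: because the hooks are now virtual ToolChain methods, a downstream toolchain can customize the runtime link step the same way AIX does above. A hypothetical override (class and context invented for illustration; constructor and the other required ToolChain members omitted):

#include "clang/Driver/ToolChain.h"

// Hypothetical out-of-tree toolchain following the AIX pattern: push the
// full path of the static archive rather than a bare -lflang_rt.runtime.
class MyOSToolChain : public clang::driver::ToolChain {
  void addFortranRuntimeLibs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs) const override {
    CmdArgs.push_back(getCompilerRTArgString(Args, "runtime",
                                             ToolChain::FT_Static,
                                             /*IsFortran=*/true));
  }
};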
if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); - addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); } if (NeedsSanitizerDeps) diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index d5a7fc7e85230..9ae61a528eb12 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -146,8 +146,8 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (C.getDriver().IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(TC, Args, CmdArgs); - addFortranRuntimeLibs(TC, Args, CmdArgs); + TC.addFortranRuntimeLibraryPath(Args, CmdArgs); + TC.addFortranRuntimeLibs(Args, CmdArgs); // Inform the MSVC linker that we're generating a console application, i.e. // one with `main` as the "user-defined" entry point. The `main` function is diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 9f0c6160a309e..031240610eef3 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -259,8 +259,8 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (C.getDriver().IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(TC, Args, CmdArgs); - addFortranRuntimeLibs(TC, Args, CmdArgs); + TC.addFortranRuntimeLibraryPath(Args, CmdArgs); + TC.addFortranRuntimeLibs(Args, CmdArgs); } // TODO: Add profile stuff here diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp index f61ae471b86d5..0d025937cec9a 100644 --- a/clang/lib/Driver/ToolChains/MipsLinux.cpp +++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp @@ -118,8 +118,8 @@ void MipsLLVMToolChain::AddCXXStdlibLibArgs(const ArgList &Args, } std::string MipsLLVMToolChain::getCompilerRT(const ArgList &Args, - StringRef Component, - FileType Type) const { + StringRef Component, FileType Type, + bool IsFortran) const { SmallString<128> Path(getDriver().ResourceDir); llvm::sys::path::append(Path, SelectedMultilibs.back().osSuffix(), "lib" + LibSuffix, getOS()); diff --git a/clang/lib/Driver/ToolChains/MipsLinux.h b/clang/lib/Driver/ToolChains/MipsLinux.h index a968804f2a6ec..f9bf2e1fcd363 100644 --- a/clang/lib/Driver/ToolChains/MipsLinux.h +++ b/clang/lib/Driver/ToolChains/MipsLinux.h @@ -37,9 +37,9 @@ class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux { void AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; - std::string - getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const override; + std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, + FileType Type = ToolChain::FT_Static, + bool IsFortran = false) const override; std::string computeSysRoot() const override; diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp index c5469f32ac80b..ae164be1b4e8b 100644 --- a/clang/lib/Driver/ToolChains/NetBSD.cpp +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp @@ -328,8 +328,8 @@ void netbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, // AddRunTimeLibs). 
if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); - addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); CmdArgs.push_back("-lm"); } diff --git a/clang/lib/Driver/ToolChains/OHOS.cpp b/clang/lib/Driver/ToolChains/OHOS.cpp index e213c695a9fef..1cfa2a8f43b9d 100644 --- a/clang/lib/Driver/ToolChains/OHOS.cpp +++ b/clang/lib/Driver/ToolChains/OHOS.cpp @@ -341,7 +341,7 @@ std::string OHOS::getDynamicLinker(const ArgList &Args) const { } std::string OHOS::getCompilerRT(const ArgList &Args, StringRef Component, - FileType Type) const { + FileType Type, bool IsFortran) const { SmallString<128> Path(getDriver().ResourceDir); llvm::sys::path::append(Path, "lib", getMultiarchTriple(getTriple()), SelectedMultilib.gccSuffix()); diff --git a/clang/lib/Driver/ToolChains/OHOS.h b/clang/lib/Driver/ToolChains/OHOS.h index 2a380420922de..0e0543b406069 100644 --- a/clang/lib/Driver/ToolChains/OHOS.h +++ b/clang/lib/Driver/ToolChains/OHOS.h @@ -56,9 +56,9 @@ class LLVM_LIBRARY_VISIBILITY OHOS : public Generic_ELF { std::string computeSysRoot() const override; std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override; - std::string - getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const override; + std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, + FileType Type = ToolChain::FT_Static, + bool IsFortran = false) const override; const char *getDefaultLinker() const override { return "ld.lld"; diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index f668a11e78f81..a5b1f06449b73 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -241,8 +241,8 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, // AddRunTimeLibs). if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); - addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); if (Profiling) CmdArgs.push_back("-lm_p"); else @@ -372,7 +372,7 @@ void OpenBSD::AddCXXStdlibLibArgs(const ArgList &Args, } std::string OpenBSD::getCompilerRT(const ArgList &Args, StringRef Component, - FileType Type) const { + FileType Type, bool IsFortran) const { if (Component == "builtins") { SmallString<128> Path(getDriver().SysRoot); llvm::sys::path::append(Path, "/usr/lib/libcompiler_rt.a"); @@ -380,13 +380,13 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args, StringRef Component, return std::string(Path); } SmallString<128> P(getDriver().ResourceDir); - std::string CRTBasename = - buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/false); + std::string CRTBasename = buildCompilerRTBasename( + Args, Component, Type, /*AddArch=*/false, IsFortran); llvm::sys::path::append(P, "lib", CRTBasename); // Checks if this is the base system case which uses a different location. 
if (getVFS().exists(P)) return std::string(P); - return ToolChain::getCompilerRT(Args, Component, Type); + return ToolChain::getCompilerRT(Args, Component, Type, IsFortran); } Tool *OpenBSD::buildAssembler() const { diff --git a/clang/lib/Driver/ToolChains/OpenBSD.h b/clang/lib/Driver/ToolChains/OpenBSD.h index b4350e72d5d26..11b873cb30032 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.h +++ b/clang/lib/Driver/ToolChains/OpenBSD.h @@ -80,7 +80,8 @@ class LLVM_LIBRARY_VISIBILITY OpenBSD : public Generic_ELF { llvm::opt::ArgStringList &CmdArgs) const override; std::string getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, - FileType Type = ToolChain::FT_Static) const override; + FileType Type = ToolChain::FT_Static, + bool IsFortran = false) const override; UnwindTableLevel getDefaultUnwindTableLevel(const llvm::opt::ArgList &Args) const override; diff --git a/clang/lib/Driver/ToolChains/PPCLinux.cpp b/clang/lib/Driver/ToolChains/PPCLinux.cpp index 0ed0f91ad166c..575e88c6ab124 100644 --- a/clang/lib/Driver/ToolChains/PPCLinux.cpp +++ b/clang/lib/Driver/ToolChains/PPCLinux.cpp @@ -12,6 +12,7 @@ #include "clang/Driver/Options.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/VirtualFileSystem.h" using namespace clang::driver; using namespace clang::driver::toolchains; @@ -101,3 +102,18 @@ bool PPCLinuxToolChain::SupportIEEEFloat128( return GlibcSupportsFloat128((Twine(D.DyldPrefix) + Linker).str()) && !(D.CCCIsCXX() && HasUnsupportedCXXLib); } + +void PPCLinuxToolChain::addFortranRuntimeLibs( + const ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const { + // Link static flang_rt.runtime.a or shared flang_rt.runtime.so + const char *Path; + if (getVFS().exists(Twine(Path = getCompilerRTArgString( + Args, "runtime", ToolChain::FT_Static, true)))) + CmdArgs.push_back(Path); + else if (getVFS().exists( + Twine(Path = getCompilerRTArgString( + Args, "runtime", ToolChain::FT_Shared, true)))) + CmdArgs.push_back(Path); + else + CmdArgs.push_back("-lflang_rt.runtime"); +} diff --git a/clang/lib/Driver/ToolChains/PPCLinux.h b/clang/lib/Driver/ToolChains/PPCLinux.h index 63adaff6be9c2..910df3d16e6a5 100644 --- a/clang/lib/Driver/ToolChains/PPCLinux.h +++ b/clang/lib/Driver/ToolChains/PPCLinux.h @@ -24,6 +24,9 @@ class LLVM_LIBRARY_VISIBILITY PPCLinuxToolChain : public Linux { AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + void addFortranRuntimeLibs(const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) const override; + private: bool SupportIEEEFloat128(const Driver &D, const llvm::Triple &Triple, const llvm::opt::ArgList &Args) const; diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index fd3232b7c1b06..639497b8fbad2 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -225,8 +225,8 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, // these dependencies need to be listed before the C runtime below. 
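[Editor's note] The IsFortran flag threaded through getCompilerRT and buildCompilerRTBasename in the hunks above presumably switches the runtime naming scheme from the usual clang_rt prefix to the flang_rt one that the new tests below expect. A rough self-contained illustration; the helper name and the exact naming rules here are assumptions made for readability, not the upstream implementation:

#include <string>

// Assumed convention: compiler-rt components are named libclang_rt.<component>,
// while the Fortran runtime is libflang_rt.<component> (e.g. libflang_rt.runtime.a).
std::string buildCompilerRTBasenameSketch(const std::string &Component,
                                          bool IsFortran, bool Shared) {
  std::string Name = IsFortran ? "libflang_rt." : "libclang_rt.";
  Name += Component;
  Name += Shared ? ".so" : ".a";
  return Name;
}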
if (D.IsFlangMode() && !Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { - addFortranRuntimeLibraryPath(getToolChain(), Args, CmdArgs); - addFortranRuntimeLibs(getToolChain(), Args, CmdArgs); + ToolChain.addFortranRuntimeLibraryPath(Args, CmdArgs); + ToolChain.addFortranRuntimeLibs(Args, CmdArgs); CmdArgs.push_back("-lm"); } if (Args.hasArg(options::OPT_fstack_protector) || diff --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64-ibm-aix/libflang_rt.runtime.a b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64-ibm-aix/libflang_rt.runtime.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64le-unknown-linux-gnu/libflang_rt.runtime.a b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64le-unknown-linux-gnu/libflang_rt.runtime.a new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64le-unknown-linux-gnu/libflang_rt.runtime.so b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/powerpc64le-unknown-linux-gnu/libflang_rt.runtime.so new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang-rt/cmake/modules/GetToolchainDirs.cmake b/flang-rt/cmake/modules/GetToolchainDirs.cmake index 8b384180bcc31..fba12502b5946 100644 --- a/flang-rt/cmake/modules/GetToolchainDirs.cmake +++ b/flang-rt/cmake/modules/GetToolchainDirs.cmake @@ -34,9 +34,8 @@ function (get_toolchain_library_subdir outvar) set(outval "lib") - if (APPLE OR (UNIX AND CMAKE_SYSTEM_NAME MATCHES "AIX")) + if (APPLE) # Required to be "darwin" for MachO toolchain. - # AIX uses lib/${os_dir} as if LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=OFF get_toolchain_os_dirname(os_dirname) set(outval "${outval}/${os_dirname}") else () diff --git a/flang/test/Driver/flang-ld-powerpc.f90 b/flang/test/Driver/flang-ld-powerpc.f90 new file mode 100644 index 0000000000000..9a6ee453a22e4 --- /dev/null +++ b/flang/test/Driver/flang-ld-powerpc.f90 @@ -0,0 +1,41 @@ +!! Testing ld command with flang on POWERPC. +!! TODO: The AIX test case is meant to test the behavior of linking the static +!! libflang_rt.runtime.a, which will be enabled by a new compiler option +!! -static-libflang_rt in the future. Need to add that option here. + +!! Because flang-rt currently only supports +!! LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON, use +!! resource_dir_with_per_target_subdir as inputs. + +! Check powerpc64-ibm-aix 64-bit linking to static flang-rt +! RUN: %flang %s -### 2>&1 \ +! RUN: --target=powerpc64-ibm-aix \ +! RUN: -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \ +! RUN: | FileCheck %s --check-prefix=AIX64-LD-PER-TARGET + +! AIX64-LD-PER-TARGET-NOT: warning: +! AIX64-LD-PER-TARGET: "-fc1" "-triple" "powerpc64-ibm-aix" +! AIX64-LD-PER-TARGET-SAME: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" +! AIX64-LD-PER-TARGET: "{{.*}}ld{{(.exe)?}}" +! AIX64-LD-PER-TARGET-NOT: "-bnso" +! AIX64-LD-PER-TARGET-SAME: "-b64" +! AIX64-LD-PER-TARGET-SAME: "-bpT:0x100000000" "-bpD:0x110000000" +! AIX64-LD-PER-TARGET-SAME: "-lc" +! AIX64-LD-PER-TARGET-SAME: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64-ibm-aix{{/|\\\\}}libflang_rt.runtime.a" +! AIX64-LD-PER-TARGET-SAME: "-lm" +! AIX64-LD-PER-TARGET-SAME: "-lpthread" + +! Check powerpc64le-unknown-linux-gnu 64-bit linking to static flang-rt +! RUN: %flang %s -### 2>&1 \ +! 
RUN: --target=powerpc64le-unknown-linux-gnu \
! RUN: -resource-dir=%S/../../../clang/test/Driver/Inputs/resource_dir_with_per_target_subdir \
! RUN: | FileCheck %s --check-prefixes=LOP64-LD-PER-TARGET

! LOP64-LD-PER-TARGET-NOT: warning:
! LOP64-LD-PER-TARGET: "-fc1" "-triple" "powerpc64le-unknown-linux-gnu"
! LOP64-LD-PER-TARGET-SAME: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
! LOP64-LD-PER-TARGET: "{{.*}}ld{{(.exe)?}}"
! LOP64-LD-PER-TARGET-NOT: "-bnso"
! LOP64-LD-PER-TARGET-SAME: "[[RESOURCE_DIR]]{{/|\\\\}}lib{{/|\\\\}}powerpc64le-unknown-linux-gnu{{/|\\\\}}libflang_rt.runtime.a"
! LOP64-LD-PER-TARGET-SAME: "-lm"
! LOP64-LD-PER-TARGET-SAME: "-lc"

From 79435de8a51a3df4b74f858a604b7ff56b342ae7 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Thu, 3 Apr 2025 16:24:56 +0100
Subject: [PATCH 0542/1029] [ConstantFold] Support scalable constant splats in
 ConstantFoldCastInstruction (#133207)

Previously only fixed vector splats were handled. This adds support for
scalable vectors too by allowing ConstantExpr splats.

We need to add the extra V->getType()->isVectorTy() check because a
ConstantExpr might be a scalar to vector bitcast.

By allowing ConstantExprs this also allows fixed vector ConstantExprs to
be folded, which causes the diffs in
llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll
and llvm/test/Transforms/InstSimplify/ConstProp/cast-vector.ll. I can
remove them from this PR if reviewers would prefer.

Fixes #132922
---
 llvm/lib/IR/ConstantFold.cpp                      |  9 +++++----
 .../known-bits-from-operator-constexpr.ll         |  2 +-
 llvm/test/Transforms/InstCombine/addrspacecast.ll |  2 +-
 .../InstCombine/scalable-const-fp-splat.ll        |  3 +--
 .../test/Transforms/InstCombine/scalable-trunc.ll | 15 +++++++++++++++
 .../InstSimplify/ConstProp/cast-vector.ll         |  4 ++--
 .../InstSimplify/ConstProp/vscale-inseltpoison.ll |  2 +-
 .../Transforms/InstSimplify/ConstProp/vscale.ll   |  2 +-
 .../InstSimplify/vscale-inseltpoison.ll           |  2 +-
 llvm/test/Transforms/InstSimplify/vscale.ll       |  2 +-
 .../LoopVectorize/AArch64/induction-costs-sve.ll  |  6 +++---
 .../RISCV/truncate-to-minimal-bitwidth-cost.ll    |  2 +-
 .../truncate-to-minimal-bitwidth-evl-crash.ll     |  2 +-
 llvm/test/Transforms/MemCpyOpt/crash.ll           |  2 +-
 llvm/test/Transforms/VectorCombine/pr88796.ll     |  2 +-
 15 files changed, 36 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index b577f69eeaba0..7e5fda229b858 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -160,10 +160,9 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
   // If the cast operand is a constant vector, perform the cast by
   // operating on each element. In the cast of bitcasts, the element
   // count may be mismatched; don't attempt to handle that here.
-  if ((isa<ConstantVector>(V) || isa<ConstantDataVector>(V)) &&
-      DestTy->isVectorTy() &&
-      cast<FixedVectorType>(DestTy)->getNumElements() ==
-          cast<FixedVectorType>(V->getType())->getNumElements()) {
+  if (DestTy->isVectorTy() && V->getType()->isVectorTy() &&
+      cast<VectorType>(DestTy)->getElementCount() ==
+          cast<VectorType>(V->getType())->getElementCount()) {
     VectorType *DestVecTy = cast<VectorType>(DestTy);
     Type *DstEltTy = DestVecTy->getElementType();
     // Fast path for splatted constants.
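[Editor's note] Taken together, the two hunks of this change (the one above and the one that follows) make the vector cast folder (a) accept any vector constant whose element count matches, including scalable splats, and (b) bail out on scalable vectors once the splat fast path has been tried, since a scalable vector has no compile-time element count to iterate over. A condensed sketch of the resulting control flow; this is a readability paraphrase of the patched function, not the verbatim upstream code:

// Inside ConstantFoldCastInstruction(opc, V, DestTy), sketched:
if (DestTy->isVectorTy() && V->getType()->isVectorTy() &&
    cast<VectorType>(DestTy)->getElementCount() ==
        cast<VectorType>(V->getType())->getElementCount()) {
  Type *DstEltTy = cast<VectorType>(DestTy)->getElementType();
  // Fast path: fold the cast on the single splatted value, then re-splat.
  // This now also covers ConstantExpr splats of scalable vectors.
  if (Constant *Splat = V->getSplatValue())
    if (Constant *Res = ConstantFoldCastInstruction(opc, Splat, DstEltTy))
      return ConstantVector::getSplat(
          cast<VectorType>(DestTy)->getElementCount(), Res);
  // Non-splat scalable constants cannot be folded element by element.
  if (isa<ScalableVectorType>(DestTy))
    return nullptr;
  // ... fixed-width vectors fall through to the per-element loop ...
}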
@@ -174,6 +173,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V, return ConstantVector::getSplat( cast(DestTy)->getElementCount(), Res); } + if (isa(DestTy)) + return nullptr; SmallVector res; Type *Ty = IntegerType::get(V->getContext(), 32); for (unsigned i = 0, diff --git a/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll b/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll index e3e30e052ee58..4dd9106898390 100644 --- a/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll +++ b/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll @@ -7,7 +7,7 @@ @g = global [21 x i32] zeroinitializer define i32 @test1(i32 %a) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[T:%.*]] = sub i32 [[A:%.*]], extractelement (<4 x i32> ptrtoint (<4 x ptr> getelementptr inbounds ([21 x i32], ptr @g, <4 x i32> zeroinitializer, <4 x i32> ) to <4 x i32>), i32 3) +; CHECK-NEXT: [[T:%.*]] = sub i32 [[A:%.*]], ptrtoint (ptr getelementptr inbounds ([21 x i32], ptr @g, i32 0, i32 17) to i32) ; CHECK-NEXT: ret i32 [[T]] ; %t = sub i32 %a, extractelement (<4 x i32> ptrtoint (<4 x ptr> getelementptr inbounds ([21 x i32], ptr @g, <4 x i32> zeroinitializer, <4 x i32> ) to <4 x i32>), i32 3) diff --git a/llvm/test/Transforms/InstCombine/addrspacecast.ll b/llvm/test/Transforms/InstCombine/addrspacecast.ll index 00df545064069..8f3270cd60609 100644 --- a/llvm/test/Transforms/InstCombine/addrspacecast.ll +++ b/llvm/test/Transforms/InstCombine/addrspacecast.ll @@ -191,7 +191,7 @@ define ptr addrspace(4) @constant_fold_undef() #0 { define <4 x ptr addrspace(4)> @constant_fold_null_vector() #0 { ; CHECK-LABEL: @constant_fold_null_vector( -; CHECK-NEXT: ret <4 x ptr addrspace(4)> addrspacecast (<4 x ptr addrspace(3)> zeroinitializer to <4 x ptr addrspace(4)>) +; CHECK-NEXT: ret <4 x ptr addrspace(4)> ; %cast = addrspacecast <4 x ptr addrspace(3)> zeroinitializer to <4 x ptr addrspace(4)> ret <4 x ptr addrspace(4)> %cast diff --git a/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll b/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll index 595486361d16e..0982ecfbd3ea3 100644 --- a/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll +++ b/llvm/test/Transforms/InstCombine/scalable-const-fp-splat.ll @@ -17,8 +17,7 @@ define @shrink_splat_scalable_extend( % define @shrink_splat_scalable_extend_rhs_constexpr( %a) { ; CHECK-LABEL: define @shrink_splat_scalable_extend_rhs_constexpr( ; CHECK-SAME: [[A:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = fptrunc splat (double -1.000000e+00) to -; CHECK-NEXT: [[TMP3:%.*]] = fadd [[A]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd [[A]], splat (float -1.000000e+00) ; CHECK-NEXT: ret [[TMP3]] ; %2 = fpext %a to diff --git a/llvm/test/Transforms/InstCombine/scalable-trunc.ll b/llvm/test/Transforms/InstCombine/scalable-trunc.ll index dcf4abe10425b..6272ccfe9cdbd 100644 --- a/llvm/test/Transforms/InstCombine/scalable-trunc.ll +++ b/llvm/test/Transforms/InstCombine/scalable-trunc.ll @@ -20,6 +20,21 @@ entry: ret void } +define @constant_splat_trunc() { +; CHECK-LABEL: @constant_splat_trunc( +; CHECK-NEXT: ret splat (i8 1) +; + %1 = trunc splat (i64 1) to + ret %1 +} + +define @constant_splat_trunc_constantexpr() { +; CHECK-LABEL: @constant_splat_trunc_constantexpr( +; CHECK-NEXT: ret splat (i8 1) +; + ret trunc ( splat (i64 1) to ) +} + declare void @llvm.aarch64.sve.st1.nxv2i32(, , ptr) declare @llvm.aarch64.sve.convert.from.svbool.nxv2i1() declare @llvm.aarch64.sve.ptrue.nxv16i1(i32 
%pattern) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/cast-vector.ll b/llvm/test/Transforms/InstSimplify/ConstProp/cast-vector.ll index 3e4504a166366..f42f4071ac239 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/cast-vector.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/cast-vector.ll @@ -8,7 +8,7 @@ define <2 x i16> @test1() { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <2 x i16> ptrtoint (<2 x ptr> getelementptr inbounds ([10 x i32], ptr null, <2 x i64> zeroinitializer, <2 x i64> ) to <2 x i16>) +; CHECK-NEXT: ret <2 x i16> ; entry: %gep = getelementptr inbounds [10 x i32], ptr null, i16 0, <2 x i16> @@ -23,7 +23,7 @@ entry: define <2 x i16> @test2() { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: -; CHECK-NEXT: ret <2 x i16> ptrtoint (<2 x ptr> getelementptr (i32, ptr null, <2 x i64> ) to <2 x i16>) +; CHECK-NEXT: ret <2 x i16> ; entry: %gep = getelementptr i32, ptr null, <2 x i16> diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll index a38dfaf8f5819..edc1260eca821 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale-inseltpoison.ll @@ -208,7 +208,7 @@ define @shufflevector() { define @bitcast() { ; CHECK-LABEL: @bitcast( -; CHECK-NEXT: ret bitcast ( splat (i32 1) to ) +; CHECK-NEXT: ret splat (float 0x36A0000000000000) ; %i1 = insertelement poison, i32 1, i32 0 %i2 = shufflevector %i1, poison, zeroinitializer diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll index e24f57445a4d1..8ee6fa6e5f37f 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vscale.ll @@ -208,7 +208,7 @@ define @shufflevector() { define @bitcast() { ; CHECK-LABEL: @bitcast( -; CHECK-NEXT: ret bitcast ( splat (i32 1) to ) +; CHECK-NEXT: ret splat (float 0x36A0000000000000) ; %i1 = insertelement undef, i32 1, i32 0 %i2 = shufflevector %i1, undef, zeroinitializer diff --git a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll index 70ca39da95310..593f334abac1e 100644 --- a/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll +++ b/llvm/test/Transforms/InstSimplify/vscale-inseltpoison.ll @@ -140,7 +140,7 @@ define @cmp_le_smax_always_true( %x) { define @bitcast() { ; CHECK-LABEL: @bitcast( -; CHECK-NEXT: ret bitcast ( splat (i32 1) to ) +; CHECK-NEXT: ret splat (float 0x36A0000000000000) ; %i1 = insertelement poison, i32 1, i32 0 %i2 = shufflevector %i1, poison, zeroinitializer diff --git a/llvm/test/Transforms/InstSimplify/vscale.ll b/llvm/test/Transforms/InstSimplify/vscale.ll index 47cd88f4d5e4a..c09a0c201d761 100644 --- a/llvm/test/Transforms/InstSimplify/vscale.ll +++ b/llvm/test/Transforms/InstSimplify/vscale.ll @@ -152,7 +152,7 @@ define @cmp_le_smax_always_true( %x) { define @bitcast() { ; CHECK-LABEL: @bitcast( -; CHECK-NEXT: ret bitcast ( splat (i32 1) to ) +; CHECK-NEXT: ret splat (float 0x36A0000000000000) ; %i1 = insertelement undef, i32 1, i32 0 %i2 = shufflevector %i1, undef, zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index d7b9d4eba2462..08fea4bfc9b2e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -51,8 +51,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; DEFAULT-NEXT: [[TMP31:%.*]] = zext [[WIDE_LOAD4]] to ; DEFAULT-NEXT: [[TMP32:%.*]] = or [[TMP28]], [[TMP30]] ; DEFAULT-NEXT: [[TMP33:%.*]] = or [[TMP29]], [[TMP31]] -; DEFAULT-NEXT: [[TMP34:%.*]] = lshr [[TMP32]], trunc ( splat (i32 1) to ) -; DEFAULT-NEXT: [[TMP35:%.*]] = lshr [[TMP33]], trunc ( splat (i32 1) to ) +; DEFAULT-NEXT: [[TMP34:%.*]] = lshr [[TMP32]], splat (i16 1) +; DEFAULT-NEXT: [[TMP35:%.*]] = lshr [[TMP33]], splat (i16 1) ; DEFAULT-NEXT: [[TMP36:%.*]] = trunc [[TMP34]] to ; DEFAULT-NEXT: [[TMP37:%.*]] = trunc [[TMP35]] to ; DEFAULT-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] @@ -131,7 +131,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: [[TMP22:%.*]] = mul [[TMP17]], [[TMP16]] ; PRED-NEXT: [[TMP24:%.*]] = zext [[WIDE_MASKED_LOAD]] to ; PRED-NEXT: [[TMP20:%.*]] = or [[TMP22]], [[TMP24]] -; PRED-NEXT: [[TMP21:%.*]] = lshr [[TMP20]], trunc ( splat (i32 1) to ) +; PRED-NEXT: [[TMP21:%.*]] = lshr [[TMP20]], splat (i16 1) ; PRED-NEXT: [[TMP23:%.*]] = trunc [[TMP21]] to ; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] ; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 12347103f64d4..dc2e99d33c377 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -169,7 +169,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[N_VEC]] to i8 ; CHECK-NEXT: [[TMP7:%.*]] = trunc [[BROADCAST_SPLAT]] to -; CHECK-NEXT: [[TMP8:%.*]] = or trunc ( splat (i8 23) to ), [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = or splat (i1 true), [[TMP7]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[DST]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll index dc7dd2c388731..656ae1cc03a80 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll @@ -30,7 +30,7 @@ define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) { ; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv8i8.p0(ptr align 1 [[TMP6]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext [[VP_OP_LOAD]] to ; CHECK-NEXT: [[TMP12:%.*]] = mul zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = lshr [[TMP12]], trunc ( splat (i32 1) to ) +; CHECK-NEXT: [[TMP13:%.*]] = lshr [[TMP12]], splat (i16 1) ; CHECK-NEXT: [[TMP14:%.*]] = trunc [[TMP13]] to ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0( [[TMP14]], align 1 zeroinitializer, splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64 diff --git a/llvm/test/Transforms/MemCpyOpt/crash.ll 
b/llvm/test/Transforms/MemCpyOpt/crash.ll index 494140a6d9262..0a5e76d99caf4 100644 --- a/llvm/test/Transforms/MemCpyOpt/crash.ll +++ b/llvm/test/Transforms/MemCpyOpt/crash.ll @@ -85,7 +85,7 @@ define void @test2(i32 %cmd) nounwind { define void @inttoptr_constexpr_crash(ptr %p) { ; CHECK-LABEL: @inttoptr_constexpr_crash( -; CHECK-NEXT: store <1 x ptr> inttoptr (<1 x i16> bitcast (<2 x i8> to <1 x i16>) to <1 x ptr>), ptr [[P:%.*]], align 1 +; CHECK-NEXT: store <1 x ptr> bitcast (<2 x i8> to <1 x i16>), i32 0) to ptr)>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: ret void ; store <1 x ptr> inttoptr (<1 x i16> bitcast (<2 x i8> to <1 x i16>) to <1 x ptr>), ptr %p, align 1 diff --git a/llvm/test/Transforms/VectorCombine/pr88796.ll b/llvm/test/Transforms/VectorCombine/pr88796.ll index 6f988922f2cc0..3ca0786a6e803 100644 --- a/llvm/test/Transforms/VectorCombine/pr88796.ll +++ b/llvm/test/Transforms/VectorCombine/pr88796.ll @@ -4,7 +4,7 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.vector.reduce.and.nxv8i16( trunc ( splat (i32 268435456) to )) +; CHECK-NEXT: [[TMP0:%.*]] = tail call i16 @llvm.vector.reduce.and.nxv8i16( zeroinitializer) ; CHECK-NEXT: ret i32 0 ; entry: From 12f75bba41d7b9752df799349b2b32bdf68e9765 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Apr 2025 16:28:24 +0100 Subject: [PATCH 0543/1029] Revert "[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded" (#134256) Found a typo in the VPERMV3 mask adjustment - I'm going to revert and re-apply the patch with a fix Reverts llvm/llvm-project#133923 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 63 ------------ .../any_extend_vector_inreg_of_broadcast.ll | 46 ++++----- ...d_vector_inreg_of_broadcast_from_memory.ll | 8 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 12 +-- .../vector-interleaved-load-i16-stride-5.ll | 76 +++++++-------- .../vector-interleaved-store-i64-stride-5.ll | 32 +++---- .../vector-interleaved-store-i64-stride-6.ll | 96 +++++++++---------- .../vector-shuffle-combining-avx512bwvl.ll | 6 +- .../zero_extend_vector_inreg_of_broadcast.ll | 22 ++--- ...d_vector_inreg_of_broadcast_from_memory.ll | 8 +- 10 files changed, 152 insertions(+), 217 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d1be19539b642..546a2d22fa58e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43827,69 +43827,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } break; } - case X86ISD::VPERMV: { - SmallVector Mask; - SmallVector Ops; - if ((VT.is256BitVector() || Subtarget.hasVLX()) && - getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) { - // For lane-crossing shuffles, only split in half in case we're still - // referencing higher elements. - unsigned HalfElts = NumElts / 2; - unsigned HalfSize = SizeInBits / 2; - Mask.resize(HalfElts); - if (all_of(Mask, - [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) { - MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT(); - SDLoc DL(Op); - SDValue Ext; - SDValue M = - extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize); - SDValue V = - extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize); - // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS. 
- if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16) - Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V); - else - Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M); - SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false, - Subtarget, TLO.DAG, DL, SizeInBits); - return TLO.CombineTo(Op, Insert); - } - } - break; - } - case X86ISD::VPERMV3: { - SmallVector Mask; - SmallVector Ops; - if (Subtarget.hasVLX() && - getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) { - // For lane-crossing shuffles, only split in half in case we're still - // referencing higher elements. - unsigned HalfElts = NumElts / 2; - unsigned HalfSize = SizeInBits / 2; - Mask.resize(HalfElts); - if (all_of(Mask, [&](int M) { - return isUndefOrInRange(M, 0, HalfElts) || - isUndefOrInRange(M, NumElts, NumElts + HalfElts); - })) { - // Adjust mask elements for 2nd operand to point to half width. - for (int &M : Mask) - M = M <= NumElts ? M : (M - HalfElts); - MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT(); - MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger(); - SDLoc DL(Op); - SDValue Ext = TLO.DAG.getNode( - Opc, DL, HalfVT, - extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize), - getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true), - extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize)); - SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false, - Subtarget, TLO.DAG, DL, SizeInBits); - return TLO.CombineTo(Op, Insert); - } - } - break; - } case X86ISD::VPERM2X128: { // Simplify VPERM2F128/VPERM2I128 to extract_subvector. SDLoc DL(Op); diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index b075d48627b18..6f4e7abda8b00 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
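[Editor's note] As for the typo the revert message mentions, the most plausible culprit is the comparison in the removed VPERMV3 mask adjustment above: with M <= NumElts, a mask index equal to NumElts (the first element of the second operand) is left unadjusted and would then point past the narrowed first operand. That reading is an inference from the commit message, not something this patch states; the corrected adjustment would presumably read:

  for (int &M : Mask)
    M = M < NumElts ? M : (M - HalfElts);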
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] -; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] +; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] -; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] -; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpaddb (%rdx), 
%ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] -; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] +; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 61e122b1aba36..52f856befa130 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll index a84466bc1ca1a..26af46263c0e2 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-FAST: # %bb.0: -; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79] -; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2 +; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79] +; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 ; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax @@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-SLOW: # %bb.0: -; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15] -; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2 -; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2 +; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79] +; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 ; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax ; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx ; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx -; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0 +; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..739e6e2369e36 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -593,104 +593,100 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride5_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-NEXT: 
vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; AVX512BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; 
AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index f41123c5c3cfd..05c111ae5049f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -123,8 +123,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa %xmm3, 64(%r9) @@ -140,8 +140,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -157,8 +157,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm3, 64(%r9) @@ -174,8 +174,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -191,8 +191,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -208,8 +208,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -225,8 +225,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -242,8 +242,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index aac6a1bddd08a..c2f1723d8031e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -139,12 +139,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -158,12 +158,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -177,12 +177,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -196,12 +196,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -215,12 +215,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -234,12 +234,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -253,12 +253,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, 
%zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -272,12 +272,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index f5cd3e580d017..ec09c3117c77f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -187,8 +187,10 @@ define <8 x i32> @concat_vrotlv_v4i32(<4 x i32> %a0, <4 x i32> %a1, <8 x i32> %a define <8 x i16> @demandedelts_vpermvar_32i16_v8i16(<32 x i16> %x0) { ; CHECK-LABEL: demandedelts_vpermvar_32i16_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,0,6,1,5,2,4,3] -; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3] +; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %shuffle = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> ) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index ea0e3b3a2b9aa..35f25d36cb2e9 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 +; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] +; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index a3e2fb5321f32..a598e30845579 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) From c1ada72b0995844299ef40433314124266015428 Mon Sep 17 00:00:00 2001 From: Julian Brown Date: Thu, 3 Apr 2025 16:34:35 +0100 Subject: [PATCH 0544/1029] [OpenMP] Mark 'map-type modifiers in arbitrary position' done (#133906) I think #90499 already implements support for the listed OpenMP 6.0 feature mentioned in the title. This patch just marks it done (for C/C++). --- clang/docs/OpenMPSupport.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 83d90ffef6bc7..f39987caf5c43 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -456,7 +456,7 @@ implementation. 
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | ref modifier for map clauses | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| map-type modifiers in arbitrary position | :none:`unclaimed` | :none:`unclaimed` | | +| map-type modifiers in arbitrary position | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/90499 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Lift nesting restriction on concurrent loop | :good:`done` | :none:`unclaimed` | https://github.com/llvm/llvm-project/pull/125621 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ From 49fd0bf35d2e04a0d76ac7fd13b3e3439a91f76f Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 3 Apr 2025 12:36:15 -0300 Subject: [PATCH 0545/1029] [clang] support pack expansions for trailing requires clauses (#133190) --- .../refactor/tweaks/ExtractVariable.cpp | 6 +-- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/AST/ASTContext.h | 8 +++ clang/include/clang/AST/ASTNodeTraverser.h | 4 +- clang/include/clang/AST/Decl.h | 37 +++++++------- clang/include/clang/AST/DeclCXX.h | 20 ++++---- clang/include/clang/AST/ExprCXX.h | 2 +- clang/include/clang/AST/RecursiveASTVisitor.h | 9 ++-- clang/include/clang/Sema/Sema.h | 14 ++--- clang/lib/AST/ASTContext.cpp | 13 ++++- clang/lib/AST/ASTImporter.cpp | 5 +- clang/lib/AST/Decl.cpp | 15 +++--- clang/lib/AST/DeclCXX.cpp | 33 +++++++----- clang/lib/AST/DeclPrinter.cpp | 10 ++-- clang/lib/AST/DeclTemplate.cpp | 4 +- clang/lib/AST/ExprCXX.cpp | 2 +- clang/lib/AST/ItaniumMangle.cpp | 2 +- clang/lib/ASTMatchers/ASTMatchFinder.cpp | 3 +- clang/lib/Index/IndexDecl.cpp | 4 +- clang/lib/Sema/SemaConcept.cpp | 6 +-- clang/lib/Sema/SemaDecl.cpp | 21 ++++---- clang/lib/Sema/SemaDeclCXX.cpp | 4 +- clang/lib/Sema/SemaFunctionEffects.cpp | 2 +- clang/lib/Sema/SemaLambda.cpp | 18 ++++--- clang/lib/Sema/SemaOverload.cpp | 12 +++-- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 51 ++++++++++++------- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 4 +- clang/lib/Sema/TreeTransform.h | 7 ++- clang/lib/Serialization/ASTReaderDecl.cpp | 3 +- clang/lib/Serialization/ASTWriterDecl.cpp | 5 +- .../SemaCXX/fold_lambda_with_variadics.cpp | 9 ++++ clang/tools/libclang/CIndex.cpp | 2 +- 32 files changed, 204 insertions(+), 133 deletions(-) diff --git a/clang-tools-extra/clangd/refactor/tweaks/ExtractVariable.cpp b/clang-tools-extra/clangd/refactor/tweaks/ExtractVariable.cpp index d84e501b87ce7..90dac3b76c648 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/ExtractVariable.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/ExtractVariable.cpp @@ -100,9 +100,9 @@ computeReferencedDecls(const clang::Expr *Expr) { TraverseLambdaCapture(LExpr, &Capture, Initializer); } - if (clang::Expr *const RequiresClause = - LExpr->getTrailingRequiresClause()) { - TraverseStmt(RequiresClause); + if (const clang::Expr *RequiresClause = + LExpr->getTrailingRequiresClause().ConstraintExpr) { + 
TraverseStmt(const_cast(RequiresClause)); } for (auto *const TemplateParam : LExpr->getExplicitTemplateParameters()) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 3055394dd8b6c..c521b56a98606 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -385,6 +385,8 @@ Bug Fixes to C++ Support - Improved fix for an issue with pack expansions of type constraints, where this now also works if the constraint has non-type or template template parameters. (#GH131798) +- Fix crash when evaluating the trailing requires clause of generic lambdas which are part of + a pack expansion. - Fixes matching of nested template template parameters. (#GH130362) - Correctly diagnoses template template paramters which have a pack parameter not in the last position. diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index b3010fa888fa4..6bdafbdafda94 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -2907,6 +2907,14 @@ class ASTContext : public RefCountedBase { /// that they may be used in declarations of the same template. bool isSameTemplateParameter(const NamedDecl *X, const NamedDecl *Y) const; + /// Determine whether two 'requires' expressions are similar enough that they + /// may be used in re-declarations. + /// + /// Use of 'requires' isn't mandatory, works with constraints expressed in + /// other ways too. + bool isSameAssociatedConstraint(const AssociatedConstraint &ACX, + const AssociatedConstraint &ACY) const; + /// Determine whether two 'requires' expressions are similar enough that they /// may be used in re-declarations. /// diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h index f086d8134a64b..7bb435146f752 100644 --- a/clang/include/clang/AST/ASTNodeTraverser.h +++ b/clang/include/clang/AST/ASTNodeTraverser.h @@ -538,8 +538,8 @@ class ASTNodeTraverser for (const auto *Parameter : D->parameters()) Visit(Parameter); - if (const Expr *TRC = D->getTrailingRequiresClause()) - Visit(TRC); + if (const AssociatedConstraint &TRC = D->getTrailingRequiresClause()) + Visit(TRC.ConstraintExpr); if (Traversal == TK_IgnoreUnlessSpelledInSource && D->isDefaulted()) return; diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 9e7e93d98c9d1..e4f1e2921bef8 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -81,13 +81,19 @@ enum class ImplicitParamKind; // Holds a constraint expression along with a pack expansion index, if // expanded. struct AssociatedConstraint { - const Expr *ConstraintExpr; - int ArgumentPackSubstitutionIndex; + const Expr *ConstraintExpr = nullptr; + int ArgumentPackSubstitutionIndex = -1; + + constexpr AssociatedConstraint() = default; explicit AssociatedConstraint(const Expr *ConstraintExpr, int ArgumentPackSubstitutionIndex = -1) : ConstraintExpr(ConstraintExpr), ArgumentPackSubstitutionIndex(ArgumentPackSubstitutionIndex) {} + + explicit operator bool() const { return ConstraintExpr != nullptr; } + + bool isNull() const { return !operator bool(); } }; /// The top declaration context. @@ -754,7 +760,7 @@ class DeclaratorDecl : public ValueDecl { // and constrained function decls. 
struct ExtInfo : public QualifierInfo { TypeSourceInfo *TInfo = nullptr; - Expr *TrailingRequiresClause = nullptr; + AssociatedConstraint TrailingRequiresClause; }; llvm::PointerUnion DeclInfo; @@ -823,17 +829,12 @@ class DeclaratorDecl : public ValueDecl { /// \brief Get the constraint-expression introduced by the trailing /// requires-clause in the function/member declaration, or null if no /// requires-clause was provided. - Expr *getTrailingRequiresClause() { - return hasExtInfo() ? getExtInfo()->TrailingRequiresClause - : nullptr; - } - - const Expr *getTrailingRequiresClause() const { - return hasExtInfo() ? getExtInfo()->TrailingRequiresClause - : nullptr; + const AssociatedConstraint &getTrailingRequiresClause() const { + static constexpr AssociatedConstraint Null; + return hasExtInfo() ? getExtInfo()->TrailingRequiresClause : Null; } - void setTrailingRequiresClause(Expr *TrailingRequiresClause); + void setTrailingRequiresClause(const AssociatedConstraint &AC); unsigned getNumTemplateParameterLists() const { return hasExtInfo() ? getExtInfo()->NumTemplParamLists : 0; @@ -2102,7 +2103,7 @@ class FunctionDecl : public DeclaratorDecl, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass S, bool UsesFPIntrin, bool isInlineSpecified, ConstexprSpecKind ConstexprKind, - Expr *TrailingRequiresClause = nullptr); + const AssociatedConstraint &TrailingRequiresClause); using redeclarable_base = Redeclarable; @@ -2138,7 +2139,7 @@ class FunctionDecl : public DeclaratorDecl, TypeSourceInfo *TInfo, StorageClass SC, bool UsesFPIntrin = false, bool isInlineSpecified = false, bool hasWrittenPrototype = true, ConstexprSpecKind ConstexprKind = ConstexprSpecKind::Unspecified, - Expr *TrailingRequiresClause = nullptr) { + const AssociatedConstraint &TrailingRequiresClause = {}) { DeclarationNameInfo NameInfo(N, NLoc); return FunctionDecl::Create(C, DC, StartLoc, NameInfo, T, TInfo, SC, UsesFPIntrin, isInlineSpecified, @@ -2151,7 +2152,7 @@ class FunctionDecl : public DeclaratorDecl, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool UsesFPIntrin, bool isInlineSpecified, bool hasWrittenPrototype, ConstexprSpecKind ConstexprKind, - Expr *TrailingRequiresClause); + const AssociatedConstraint &TrailingRequiresClause); static FunctionDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); @@ -2644,9 +2645,9 @@ class FunctionDecl : public DeclaratorDecl, /// Use this instead of getTrailingRequiresClause for concepts APIs that /// accept an ArrayRef of constraint expressions. void - getAssociatedConstraints(SmallVectorImpl &AC) const { - if (auto *TRC = getTrailingRequiresClause()) - AC.emplace_back(TRC); + getAssociatedConstraints(SmallVectorImpl &ACs) const { + if (const AssociatedConstraint &AC = getTrailingRequiresClause()) + ACs.emplace_back(AC); } /// Get the message that indicates why this function was deleted. 
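[Note on the Decl.h hunks above: they replace a bare `Expr *` with a small value type. A standalone sketch of that wrapper pattern — a pointer plus a pack-substitution index, testable like a pointer and handed out by const reference through a shared empty instance — could look as follows. `Constraint`, `Assoc`, and `Decl` here are illustrative stand-ins, not the clang classes:]

```cpp
#include <cassert>

struct Constraint { int Id; }; // stand-in for clang's Expr

struct Assoc {
  const Constraint *ConstraintExpr = nullptr; // null => no constraint attached
  int ArgumentPackSubstitutionIndex = -1;     // -1 => not expanded from a pack

  constexpr Assoc() = default;
  explicit Assoc(const Constraint *E, int Index = -1)
      : ConstraintExpr(E), ArgumentPackSubstitutionIndex(Index) {}

  explicit operator bool() const { return ConstraintExpr != nullptr; }
  bool isNull() const { return !ConstraintExpr; }
};

struct Decl {
  Assoc TrailingRequiresClause;

  // Hand out a const reference; a shared constexpr empty instance covers
  // the "no trailing requires-clause" case without copies or allocation.
  const Assoc &getTrailingRequiresClause() const {
    static constexpr Assoc Null;
    return TrailingRequiresClause ? TrailingRequiresClause : Null;
  }
};

int main() {
  Decl D;
  assert(D.getTrailingRequiresClause().isNull()); // empty by default

  Constraint C{42};
  D.TrailingRequiresClause = Assoc(&C, /*Index=*/0);
  assert(static_cast<bool>(D.getTrailingRequiresClause()));
}
```

[Returning a reference to a function-local constexpr null keeps the common "no clause" path cheap, which is presumably why the patch prefers it over returning the wrapper by value.]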
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 7dbefeea4b1a3..764f85b04e6a0 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1974,7 +1974,7 @@ class CXXDeductionGuideDecl : public FunctionDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, SourceLocation EndLocation, CXXConstructorDecl *Ctor, DeductionCandidate Kind, - Expr *TrailingRequiresClause, + const AssociatedConstraint &TrailingRequiresClause, const CXXDeductionGuideDecl *GeneratedFrom, SourceDeductionGuideKind SourceKind) : FunctionDecl(CXXDeductionGuide, C, DC, StartLoc, NameInfo, T, TInfo, @@ -2007,7 +2007,7 @@ class CXXDeductionGuideDecl : public FunctionDecl { TypeSourceInfo *TInfo, SourceLocation EndLocation, CXXConstructorDecl *Ctor = nullptr, DeductionCandidate Kind = DeductionCandidate::Normal, - Expr *TrailingRequiresClause = nullptr, + const AssociatedConstraint &TrailingRequiresClause = {}, const CXXDeductionGuideDecl *SourceDG = nullptr, SourceDeductionGuideKind SK = SourceDeductionGuideKind::None); @@ -2115,7 +2115,7 @@ class CXXMethodDecl : public FunctionDecl { QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool UsesFPIntrin, bool isInline, ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, - Expr *TrailingRequiresClause = nullptr) + const AssociatedConstraint &TrailingRequiresClause = {}) : FunctionDecl(DK, C, RD, StartLoc, NameInfo, T, TInfo, SC, UsesFPIntrin, isInline, ConstexprKind, TrailingRequiresClause) { if (EndLocation.isValid()) @@ -2128,7 +2128,7 @@ class CXXMethodDecl : public FunctionDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool UsesFPIntrin, bool isInline, ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, - Expr *TrailingRequiresClause = nullptr); + const AssociatedConstraint &TrailingRequiresClause = {}); static CXXMethodDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); @@ -2596,7 +2596,7 @@ class CXXConstructorDecl final bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited, - Expr *TrailingRequiresClause); + const AssociatedConstraint &TrailingRequiresClause); void anchor() override; @@ -2639,7 +2639,7 @@ class CXXConstructorDecl final ExplicitSpecifier ES, bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited = InheritedConstructor(), - Expr *TrailingRequiresClause = nullptr); + const AssociatedConstraint &TrailingRequiresClause = {}); void setExplicitSpecifier(ExplicitSpecifier ES) { assert((!ES.getExpr() || @@ -2859,7 +2859,7 @@ class CXXDestructorDecl : public CXXMethodDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - Expr *TrailingRequiresClause = nullptr) + const AssociatedConstraint &TrailingRequiresClause = {}) : CXXMethodDecl(CXXDestructor, C, RD, StartLoc, NameInfo, T, TInfo, SC_None, UsesFPIntrin, isInline, ConstexprKind, SourceLocation(), TrailingRequiresClause) { @@ -2874,7 +2874,7 @@ class CXXDestructorDecl : public CXXMethodDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - Expr *TrailingRequiresClause = nullptr); + const AssociatedConstraint &TrailingRequiresClause = {}); static CXXDestructorDecl 
*CreateDeserialized(ASTContext &C, GlobalDeclID ID); void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg); @@ -2925,7 +2925,7 @@ class CXXConversionDecl : public CXXMethodDecl { TypeSourceInfo *TInfo, bool UsesFPIntrin, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, - Expr *TrailingRequiresClause = nullptr) + const AssociatedConstraint &TrailingRequiresClause = {}) : CXXMethodDecl(CXXConversion, C, RD, StartLoc, NameInfo, T, TInfo, SC_None, UsesFPIntrin, isInline, ConstexprKind, EndLocation, TrailingRequiresClause), @@ -2943,7 +2943,7 @@ class CXXConversionDecl : public CXXMethodDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool UsesFPIntrin, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, - Expr *TrailingRequiresClause = nullptr); + const AssociatedConstraint &TrailingRequiresClause = {}); static CXXConversionDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); ExplicitSpecifier getExplicitSpecifier() { diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 028ee82718d50..0f1455d8be3ca 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -2129,7 +2129,7 @@ class LambdaExpr final : public Expr, ArrayRef getExplicitTemplateParameters() const; /// Get the trailing requires clause, if any. - Expr *getTrailingRequiresClause() const; + const AssociatedConstraint &getTrailingRequiresClause() const; /// Whether this is a generic lambda. bool isGenericLambda() const { return getTemplateParameterList(); } diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 0530996ed20d3..3edc8684d0a19 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2253,8 +2253,10 @@ bool RecursiveASTVisitor::TraverseFunctionHelper(FunctionDecl *D) { } // Visit the trailing requires clause, if any. 
- if (Expr *TrailingRequiresClause = D->getTrailingRequiresClause()) { - TRY_TO(TraverseStmt(TrailingRequiresClause)); + if (const AssociatedConstraint &TrailingRequiresClause = + D->getTrailingRequiresClause()) { + TRY_TO(TraverseStmt( + const_cast(TrailingRequiresClause.ConstraintExpr))); } if (CXXConstructorDecl *Ctor = dyn_cast(D)) { @@ -2768,7 +2770,8 @@ DEF_TRAVERSE_STMT(LambdaExpr, { if (S->hasExplicitResultType()) TRY_TO(TraverseTypeLoc(Proto.getReturnLoc())); - TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getTrailingRequiresClause()); + TRY_TO_TRAVERSE_OR_ENQUEUE_STMT( + const_cast(S->getTrailingRequiresClause().ConstraintExpr)); TRY_TO_TRAVERSE_OR_ENQUEUE_STMT(S->getBody()); } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a62562bb134f5..1aa0e4a9917de 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -8867,12 +8867,14 @@ class Sema final : public SemaBase { CXXMethodDecl *CallOperator, CXXRecordDecl *Class, TemplateParameterList *TemplateParams); - void CompleteLambdaCallOperator( - CXXMethodDecl *Method, SourceLocation LambdaLoc, - SourceLocation CallOperatorLoc, Expr *TrailingRequiresClause, - TypeSourceInfo *MethodTyInfo, ConstexprSpecKind ConstexprKind, - StorageClass SC, ArrayRef Params, - bool HasExplicitResultType); + void + CompleteLambdaCallOperator(CXXMethodDecl *Method, SourceLocation LambdaLoc, + SourceLocation CallOperatorLoc, + const AssociatedConstraint &TrailingRequiresClause, + TypeSourceInfo *MethodTyInfo, + ConstexprSpecKind ConstexprKind, StorageClass SC, + ArrayRef Params, + bool HasExplicitResultType); /// Returns true if the explicit object parameter was invalid. bool DiagnoseInvalidExplicitObjectParameterInLambda(CXXMethodDecl *Method, diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0270a8e012849..a73c15ae6bcc7 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -7070,6 +7070,15 @@ bool ASTContext::hasSameTemplateName(const TemplateName &X, getCanonicalTemplateName(Y, IgnoreDeduced); } +bool ASTContext::isSameAssociatedConstraint( + const AssociatedConstraint &ACX, const AssociatedConstraint &ACY) const { + if (ACX.ArgumentPackSubstitutionIndex != ACY.ArgumentPackSubstitutionIndex) + return false; + if (!isSameConstraintExpr(ACX.ConstraintExpr, ACY.ConstraintExpr)) + return false; + return true; +} + bool ASTContext::isSameConstraintExpr(const Expr *XCE, const Expr *YCE) const { if (!XCE != !YCE) return false; @@ -7386,8 +7395,8 @@ bool ASTContext::isSameEntity(const NamedDecl *X, const NamedDecl *Y) const { return false; } - if (!isSameConstraintExpr(FuncX->getTrailingRequiresClause(), - FuncY->getTrailingRequiresClause())) + if (!isSameAssociatedConstraint(FuncX->getTrailingRequiresClause(), + FuncY->getTrailingRequiresClause())) return false; auto GetTypeAsWritten = [](const FunctionDecl *FD) { diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 893160e8f5ba9..73dc355003e6b 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -3915,8 +3915,9 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { auto ToEndLoc = importChecked(Err, D->getEndLoc()); auto ToDefaultLoc = importChecked(Err, D->getDefaultLoc()); auto ToQualifierLoc = importChecked(Err, D->getQualifierLoc()); - auto TrailingRequiresClause = - importChecked(Err, D->getTrailingRequiresClause()); + AssociatedConstraint TrailingRequiresClause = D->getTrailingRequiresClause(); + 
TrailingRequiresClause.ConstraintExpr = + importChecked(Err, TrailingRequiresClause.ConstraintExpr); if (Err) return std::move(Err); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 5f5568518e0b9..568d74cc7df0b 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2009,8 +2009,8 @@ void DeclaratorDecl::setQualifierInfo(NestedNameSpecifierLoc QualifierLoc) { } } -void DeclaratorDecl::setTrailingRequiresClause(Expr *TrailingRequiresClause) { - assert(TrailingRequiresClause); +void DeclaratorDecl::setTrailingRequiresClause(const AssociatedConstraint &AC) { + assert(AC); // Make sure the extended decl info is allocated. if (!hasExtInfo()) { // Save (non-extended) type source info pointer. @@ -2021,7 +2021,7 @@ void DeclaratorDecl::setTrailingRequiresClause(Expr *TrailingRequiresClause) { getExtInfo()->TInfo = savedTInfo; } // Set requires clause info. - getExtInfo()->TrailingRequiresClause = TrailingRequiresClause; + getExtInfo()->TrailingRequiresClause = AC; } void DeclaratorDecl::setTemplateParameterListsInfo( @@ -3047,7 +3047,7 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, TypeSourceInfo *TInfo, StorageClass S, bool UsesFPIntrin, bool isInlineSpecified, ConstexprSpecKind ConstexprKind, - Expr *TrailingRequiresClause) + const AssociatedConstraint &TrailingRequiresClause) : DeclaratorDecl(DK, DC, NameInfo.getLoc(), NameInfo.getName(), T, TInfo, StartLoc), DeclContext(DK), redeclarable_base(C), Body(), ODRHash(0), @@ -3571,7 +3571,7 @@ bool FunctionDecl::isMemberLikeConstrainedFriend() const { // If these friends don't have constraints, they aren't constrained, and // thus don't fall under temp.friend p9. Else the simple presence of a // constraint makes them unique. - return getTrailingRequiresClause(); + return !getTrailingRequiresClause().isNull(); } return FriendConstraintRefersToEnclosingTemplate(); @@ -5453,7 +5453,7 @@ FunctionDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, TypeSourceInfo *TInfo, StorageClass SC, bool UsesFPIntrin, bool isInlineSpecified, bool hasWrittenPrototype, ConstexprSpecKind ConstexprKind, - Expr *TrailingRequiresClause) { + const AssociatedConstraint &TrailingRequiresClause) { FunctionDecl *New = new (C, DC) FunctionDecl( Function, C, DC, StartLoc, NameInfo, T, TInfo, SC, UsesFPIntrin, isInlineSpecified, ConstexprKind, TrailingRequiresClause); @@ -5464,7 +5464,8 @@ FunctionDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { return new (C, ID) FunctionDecl( Function, C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), - nullptr, SC_None, false, false, ConstexprSpecKind::Unspecified, nullptr); + nullptr, SC_None, false, false, ConstexprSpecKind::Unspecified, + /*TrailingRequiresClause=*/{}); } BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) { diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 3c447a905a83c..7aa710ad7309b 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2304,7 +2304,7 @@ CXXDeductionGuideDecl *CXXDeductionGuideDecl::Create( ASTContext &C, DeclContext *DC, SourceLocation StartLoc, ExplicitSpecifier ES, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, SourceLocation EndLocation, CXXConstructorDecl *Ctor, - DeductionCandidate Kind, Expr *TrailingRequiresClause, + DeductionCandidate Kind, const AssociatedConstraint &TrailingRequiresClause, const 
CXXDeductionGuideDecl *GeneratedFrom, SourceDeductionGuideKind SourceKind) { return new (C, DC) CXXDeductionGuideDecl( @@ -2318,7 +2318,7 @@ CXXDeductionGuideDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { C, /*DC=*/nullptr, SourceLocation(), ExplicitSpecifier(), DeclarationNameInfo(), QualType(), /*TInfo=*/nullptr, SourceLocation(), /*Ctor=*/nullptr, DeductionCandidate::Normal, - /*TrailingRequiresClause=*/nullptr, + /*TrailingRequiresClause=*/{}, /*GeneratedFrom=*/nullptr, SourceDeductionGuideKind::None); } @@ -2427,7 +2427,7 @@ CXXMethodDecl::Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, TypeSourceInfo *TInfo, StorageClass SC, bool UsesFPIntrin, bool isInline, ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, - Expr *TrailingRequiresClause) { + const AssociatedConstraint &TrailingRequiresClause) { return new (C, RD) CXXMethodDecl( CXXMethod, C, RD, StartLoc, NameInfo, T, TInfo, SC, UsesFPIntrin, isInline, ConstexprKind, EndLocation, TrailingRequiresClause); @@ -2435,10 +2435,11 @@ CXXMethodDecl::Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { - return new (C, ID) CXXMethodDecl( - CXXMethod, C, nullptr, SourceLocation(), DeclarationNameInfo(), - QualType(), nullptr, SC_None, false, false, - ConstexprSpecKind::Unspecified, SourceLocation(), nullptr); + return new (C, ID) + CXXMethodDecl(CXXMethod, C, nullptr, SourceLocation(), + DeclarationNameInfo(), QualType(), nullptr, SC_None, false, + false, ConstexprSpecKind::Unspecified, SourceLocation(), + /*TrailingRequiresClause=*/{}); } CXXMethodDecl *CXXMethodDecl::getDevirtualizedMethod(const Expr *Base, @@ -2834,7 +2835,8 @@ CXXConstructorDecl::CXXConstructorDecl( const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - InheritedConstructor Inherited, Expr *TrailingRequiresClause) + InheritedConstructor Inherited, + const AssociatedConstraint &TrailingRequiresClause) : CXXMethodDecl(CXXConstructor, C, RD, StartLoc, NameInfo, T, TInfo, SC_None, UsesFPIntrin, isInline, ConstexprKind, SourceLocation(), TrailingRequiresClause) { @@ -2861,7 +2863,7 @@ CXXConstructorDecl *CXXConstructorDecl::CreateDeserialized(ASTContext &C, auto *Result = new (C, ID, Extra) CXXConstructorDecl( C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, ExplicitSpecifier(), false, false, false, ConstexprSpecKind::Unspecified, - InheritedConstructor(), nullptr); + InheritedConstructor(), /*TrailingRequiresClause=*/{}); Result->setInheritingConstructor(isInheritingConstructor); Result->CXXConstructorDeclBits.HasTrailingExplicitSpecifier = hasTrailingExplicit; @@ -2874,7 +2876,8 @@ CXXConstructorDecl *CXXConstructorDecl::Create( const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - InheritedConstructor Inherited, Expr *TrailingRequiresClause) { + InheritedConstructor Inherited, + const AssociatedConstraint &TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXConstructorName && "Name must refer to a constructor"); @@ -3000,14 +3003,16 @@ CXXDestructorDecl *CXXDestructorDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { return new (C, ID) CXXDestructorDecl( C, nullptr, SourceLocation(), 
DeclarationNameInfo(), QualType(), nullptr, - false, false, false, ConstexprSpecKind::Unspecified, nullptr); + false, false, false, ConstexprSpecKind::Unspecified, + /*TrailingRequiresClause=*/{}); } CXXDestructorDecl *CXXDestructorDecl::Create( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool UsesFPIntrin, bool isInline, bool isImplicitlyDeclared, - ConstexprSpecKind ConstexprKind, Expr *TrailingRequiresClause) { + ConstexprSpecKind ConstexprKind, + const AssociatedConstraint &TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXDestructorName && "Name must refer to a destructor"); @@ -3062,7 +3067,7 @@ CXXConversionDecl *CXXConversionDecl::CreateDeserialized(ASTContext &C, return new (C, ID) CXXConversionDecl( C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, false, false, ExplicitSpecifier(), ConstexprSpecKind::Unspecified, - SourceLocation(), nullptr); + SourceLocation(), /*TrailingRequiresClause=*/{}); } CXXConversionDecl *CXXConversionDecl::Create( @@ -3070,7 +3075,7 @@ CXXConversionDecl *CXXConversionDecl::Create( const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool UsesFPIntrin, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, - Expr *TrailingRequiresClause) { + const AssociatedConstraint &TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXConversionFunctionName && "Name must refer to a conversion function"); diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp index 6368531cef3be..28098b242d494 100644 --- a/clang/lib/AST/DeclPrinter.cpp +++ b/clang/lib/AST/DeclPrinter.cpp @@ -842,10 +842,14 @@ void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) { } Out << Proto; - if (Expr *TrailingRequiresClause = D->getTrailingRequiresClause()) { + if (const AssociatedConstraint &TrailingRequiresClause = + D->getTrailingRequiresClause()) { Out << " requires "; - TrailingRequiresClause->printPretty(Out, nullptr, SubPolicy, Indentation, - "\n", &Context); + // FIXME: The printer could support printing expressions and types as if + // expanded by an index. Pass in the ArgumentPackSubstitutionIndex when + // that's supported. 
+ TrailingRequiresClause.ConstraintExpr->printPretty( + Out, nullptr, SubPolicy, Indentation, "\n", &Context); } } else { Ty.print(Out, Policy, Proto); diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 8f6916aeb4bd6..b0bba8408f2b9 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -291,7 +291,7 @@ void TemplateDecl::getAssociatedConstraints( llvm::SmallVectorImpl &ACs) const { TemplateParams->getAssociatedConstraints(ACs); if (auto *FD = dyn_cast_or_null(getTemplatedDecl())) - if (const Expr *TRC = FD->getTrailingRequiresClause()) + if (const AssociatedConstraint &TRC = FD->getTrailingRequiresClause()) ACs.emplace_back(TRC); } @@ -299,7 +299,7 @@ bool TemplateDecl::hasAssociatedConstraints() const { if (TemplateParams->hasAssociatedConstraints()) return true; if (auto *FD = dyn_cast_or_null(getTemplatedDecl())) - return FD->getTrailingRequiresClause(); + return static_cast(FD->getTrailingRequiresClause()); return false; } diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index a0bc50c449d82..77add7b0b6abe 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1402,7 +1402,7 @@ ArrayRef LambdaExpr::getExplicitTemplateParameters() const { return Record->getLambdaExplicitTemplateParameters(); } -Expr *LambdaExpr::getTrailingRequiresClause() const { +const AssociatedConstraint &LambdaExpr::getTrailingRequiresClause() const { return getCallOperator()->getTrailingRequiresClause(); } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index b81981606866a..eb9c9c30622ad 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3781,7 +3781,7 @@ void CXXNameMangler::mangleBareFunctionType(const FunctionProtoType *Proto, if (FD) { FunctionTypeDepth.enterResultType(); - mangleRequiresClause(FD->getTrailingRequiresClause()); + mangleRequiresClause(FD->getTrailingRequiresClause().ConstraintExpr); } FunctionTypeDepth.pop(saved); diff --git a/clang/lib/ASTMatchers/ASTMatchFinder.cpp b/clang/lib/ASTMatchers/ASTMatchFinder.cpp index e9ec7eff1e0ab..6d0ba0b7907a1 100644 --- a/clang/lib/ASTMatchers/ASTMatchFinder.cpp +++ b/clang/lib/ASTMatchers/ASTMatchFinder.cpp @@ -584,7 +584,8 @@ class MatchASTVisitor : public RecursiveASTVisitor, if (LE->hasExplicitResultType()) TraverseTypeLoc(Proto.getReturnLoc()); - TraverseStmt(LE->getTrailingRequiresClause()); + TraverseStmt( + const_cast(LE->getTrailingRequiresClause().ConstraintExpr)); } TraverseStmt(LE->getBody()); diff --git a/clang/lib/Index/IndexDecl.cpp b/clang/lib/Index/IndexDecl.cpp index 6c971bf0f381b..df875e0b40079 100644 --- a/clang/lib/Index/IndexDecl.cpp +++ b/clang/lib/Index/IndexDecl.cpp @@ -132,8 +132,8 @@ class IndexingDeclVisitor : public ConstDeclVisitor { } } } - if (auto *C = D->getTrailingRequiresClause()) - IndexCtx.indexBody(C, Parent); + if (const AssociatedConstraint &C = D->getTrailingRequiresClause()) + IndexCtx.indexBody(C.ConstraintExpr, Parent); } bool handleObjCMethod(const ObjCMethodDecl *D, diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 16f9e3d60560e..e10c49203725f 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -848,10 +848,8 @@ bool Sema::CheckFunctionConstraints(const FunctionDecl *FD, ForOverloadResolution); return CheckConstraintSatisfaction( - FD, - AssociatedConstraint(FD->getTrailingRequiresClause(), - ArgumentPackSubstitutionIndex), - *MLTAL, SourceRange(UsageLoc.isValid() ? 
UsageLoc : FD->getLocation()), + FD, FD->getTrailingRequiresClause(), *MLTAL, + SourceRange(UsageLoc.isValid() ? UsageLoc : FD->getLocation()), Satisfaction); } diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 9b7b3f856cc55..a675feaf50ce3 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -9353,7 +9353,7 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, SemaRef.Context, DC, D.getBeginLoc(), NameInfo, R, TInfo, SC, SemaRef.getCurFPFeatures().isFPConstrained(), isInline, HasPrototype, ConstexprSpecKind::Unspecified, - /*TrailingRequiresClause=*/nullptr); + /*TrailingRequiresClause=*/{}); if (D.isInvalidType()) NewFD->setInvalidDecl(); @@ -9361,7 +9361,7 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, } ExplicitSpecifier ExplicitSpecifier = D.getDeclSpec().getExplicitSpecifier(); - Expr *TrailingRequiresClause = D.getTrailingRequiresClause(); + AssociatedConstraint TrailingRequiresClause(D.getTrailingRequiresClause()); SemaRef.CheckExplicitObjectMemberFunction(DC, D, Name, R); @@ -10531,7 +10531,7 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, diag::ext_operator_new_delete_declared_inline) << NewFD->getDeclName(); - if (Expr *TRC = NewFD->getTrailingRequiresClause()) { + if (const Expr *TRC = NewFD->getTrailingRequiresClause().ConstraintExpr) { // C++20 [dcl.decl.general]p4: // The optional requires-clause in an init-declarator or // member-declarator shall be present only if the declarator declares a @@ -12261,7 +12261,7 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, if (Method->isVirtual() && NewFD->getTrailingRequiresClause()) // C++2a [class.virtual]p6 // A virtual method shall not have a requires-clause. - Diag(NewFD->getTrailingRequiresClause()->getBeginLoc(), + Diag(NewFD->getTrailingRequiresClause().ConstraintExpr->getBeginLoc(), diag::err_constrained_virtual_method); if (Method->isStatic()) @@ -19085,8 +19085,7 @@ static void SetEligibleMethods(Sema &S, CXXRecordDecl *Record, SmallVector SatisfactionStatus; for (CXXMethodDecl *Method : Methods) { - const Expr *Constraints = Method->getTrailingRequiresClause(); - if (!Constraints) + if (!Method->getTrailingRequiresClause()) SatisfactionStatus.push_back(true); else { ConstraintSatisfaction Satisfaction; @@ -19105,7 +19104,7 @@ static void SetEligibleMethods(Sema &S, CXXRecordDecl *Record, if (FunctionDecl *MF = OrigMethod->getInstantiatedFromMemberFunction()) OrigMethod = cast(MF); - const Expr *Constraints = OrigMethod->getTrailingRequiresClause(); + AssociatedConstraint Orig = OrigMethod->getTrailingRequiresClause(); bool AnotherMethodIsMoreConstrained = false; for (size_t j = 0; j < Methods.size(); j++) { if (i == j || !SatisfactionStatus[j]) @@ -19118,15 +19117,13 @@ static void SetEligibleMethods(Sema &S, CXXRecordDecl *Record, CSM)) continue; - const Expr *OtherConstraints = OtherMethod->getTrailingRequiresClause(); - if (!OtherConstraints) + AssociatedConstraint Other = OtherMethod->getTrailingRequiresClause(); + if (!Other) continue; - if (!Constraints) { + if (!Orig) { AnotherMethodIsMoreConstrained = true; break; } - AssociatedConstraint Other(OtherConstraints); - AssociatedConstraint Orig(Constraints); if (S.IsAtLeastAsConstrained(OtherMethod, {Other}, OrigMethod, {Orig}, AnotherMethodIsMoreConstrained)) { // There was an error with the constraints comparison. 
Exit the loop diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 96c0470198e35..47c472b35463e 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -18995,8 +18995,8 @@ bool Sema::checkThisInStaticMemberFunctionType(CXXMethodDecl *Method) { return true; // Check the trailing requires clause - if (Expr *E = Method->getTrailingRequiresClause()) - if (!Finder.TraverseStmt(E)) + if (const AssociatedConstraint &TRC = Method->getTrailingRequiresClause()) + if (!Finder.TraverseStmt(const_cast(TRC.ConstraintExpr))) return true; return checkThisInStaticMemberFunctionAttributes(Method); diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 31980abd23fd1..1592862416bf9 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -990,7 +990,7 @@ class Analyzer { followDestructor(dyn_cast(Dtor->getParent()), Dtor); if (auto *FD = dyn_cast(CurrentCaller.CDecl)) { - TrailingRequiresClause = FD->getTrailingRequiresClause(); + TrailingRequiresClause = FD->getTrailingRequiresClause().ConstraintExpr; // Note that FD->getType->getAs() can yield a // noexcept Expr which has been boiled down to a constant expression. diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index 6f114b71981fa..f38198e1feab8 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -1015,7 +1015,7 @@ CXXMethodDecl *Sema::CreateLambdaCallOperator(SourceRange IntroducerRange, QualType(), /*Tinfo=*/nullptr, SC_None, getCurFPFeatures().isFPConstrained(), /*isInline=*/true, ConstexprSpecKind::Unspecified, SourceLocation(), - /*TrailingRequiresClause=*/nullptr); + /*TrailingRequiresClause=*/{}); Method->setAccess(AS_public); return Method; } @@ -1033,7 +1033,8 @@ void Sema::AddTemplateParametersToLambdaCallOperator( void Sema::CompleteLambdaCallOperator( CXXMethodDecl *Method, SourceLocation LambdaLoc, - SourceLocation CallOperatorLoc, Expr *TrailingRequiresClause, + SourceLocation CallOperatorLoc, + const AssociatedConstraint &TrailingRequiresClause, TypeSourceInfo *MethodTyInfo, ConstexprSpecKind ConstexprKind, StorageClass SC, ArrayRef Params, bool HasExplicitResultType) { @@ -1461,8 +1462,9 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, CompleteLambdaCallOperator( Method, Intro.Range.getBegin(), CallOperatorLoc, - ParamInfo.getTrailingRequiresClause(), MethodTyInfo, - ParamInfo.getDeclSpec().getConstexprSpecifier(), + AssociatedConstraint(ParamInfo.getTrailingRequiresClause(), + /*ArgumentPackSubstitutionIndex=*/-1), + MethodTyInfo, ParamInfo.getDeclSpec().getConstexprSpecifier(), IsLambdaStatic ? SC_Static : SC_None, Params, ExplicitResultType); CheckCXXDefaultArguments(Method); @@ -1545,7 +1547,7 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, // The optional requires-clause ([temp.pre]) in an init-declarator or // member-declarator shall be present only if the declarator declares a // templated function ([dcl.fct]). - if (Expr *TRC = Method->getTrailingRequiresClause()) { + if (const AssociatedConstraint &TRC = Method->getTrailingRequiresClause()) { // [temp.pre]/8: // An entity is templated if it is // - a template, @@ -1568,7 +1570,8 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, // applies to the call operator, which we already know is a member function, // AND defined. 
if (!Method->getDescribedFunctionTemplate() && !Method->isTemplated()) { - Diag(TRC->getBeginLoc(), diag::err_constrained_non_templated_function); + Diag(TRC.ConstraintExpr->getBeginLoc(), + diag::err_constrained_non_templated_function); } } @@ -1791,7 +1794,8 @@ static void addFunctionPointerConversion(Sema &S, SourceRange IntroducerRange, // A non-generic lambda may still be a templated entity. We need to preserve // constraints when converting the lambda to a function pointer. See GH63181. - if (Expr *Requires = CallOperator->getTrailingRequiresClause()) + if (const AssociatedConstraint &Requires = + CallOperator->getTrailingRequiresClause()) Conversion->setTrailingRequiresClause(Requires); if (Class->isGenericLambda()) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1802f8f4e1f91..d9a79bc802b56 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1551,12 +1551,16 @@ static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New, if (!UseOverrideRules && New->getTemplateSpecializationKind() != TSK_ExplicitSpecialization) { - Expr *NewRC = New->getTrailingRequiresClause(), - *OldRC = Old->getTrailingRequiresClause(); - if ((NewRC != nullptr) != (OldRC != nullptr)) + AssociatedConstraint NewRC = New->getTrailingRequiresClause(), + OldRC = Old->getTrailingRequiresClause(); + if (!NewRC != !OldRC) + return true; + if (NewRC.ArgumentPackSubstitutionIndex != + OldRC.ArgumentPackSubstitutionIndex) return true; if (NewRC && - !SemaRef.AreConstraintExpressionsEqual(OldDecl, OldRC, NewDecl, NewRC)) + !SemaRef.AreConstraintExpressionsEqual(OldDecl, OldRC.ConstraintExpr, + NewDecl, NewRC.ConstraintExpr)) return true; } diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 3b2129e0df815..99bd9d0fb79af 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -200,7 +200,7 @@ buildDeductionGuide(Sema &SemaRef, TemplateDecl *OriginalTemplate, TypeSourceInfo *TInfo, SourceLocation LocStart, SourceLocation Loc, SourceLocation LocEnd, bool IsImplicit, llvm::ArrayRef MaterializedTypedefs = {}, - Expr *FunctionTrailingRC = nullptr) { + const AssociatedConstraint &FunctionTrailingRC = {}) { DeclContext *DC = OriginalTemplate->getDeclContext(); auto DeductionGuideName = SemaRef.Context.DeclarationNames.getCXXDeductionGuideName( @@ -356,7 +356,8 @@ struct ConvertConstructorToDeductionGuideTransform { TemplateParameterList *TemplateParams = SemaRef.GetTemplateParameterList(Template); SmallVector Depth1Args; - Expr *OuterRC = TemplateParams->getRequiresClause(); + AssociatedConstraint OuterRC(TemplateParams->getRequiresClause(), + /*ArgumentPackSubstitutionIndex=*/-1); if (FTD) { TemplateParameterList *InnerParams = FTD->getTemplateParameters(); SmallVector AllParams; @@ -456,18 +457,20 @@ struct ConvertConstructorToDeductionGuideTransform { // At this point, the function parameters are already 'instantiated' in the // current scope. Substitute into the constructor's trailing // requires-clause, if any. 
- Expr *FunctionTrailingRC = nullptr; - if (Expr *RC = CD->getTrailingRequiresClause()) { + AssociatedConstraint FunctionTrailingRC; + if (const AssociatedConstraint &RC = CD->getTrailingRequiresClause()) { MultiLevelTemplateArgumentList Args; Args.setKind(TemplateSubstitutionKind::Rewrite); Args.addOuterTemplateArguments(Depth1Args); Args.addOuterRetainedLevel(); if (NestedPattern) Args.addOuterRetainedLevels(NestedPattern->getTemplateDepth()); - ExprResult E = SemaRef.SubstConstraintExprWithoutSatisfaction(RC, Args); + ExprResult E = SemaRef.SubstConstraintExprWithoutSatisfaction( + const_cast(RC.ConstraintExpr), Args); if (!E.isUsable()) return nullptr; - FunctionTrailingRC = E.get(); + FunctionTrailingRC = + AssociatedConstraint(E.get(), RC.ArgumentPackSubstitutionIndex); } // C++ [over.match.class.deduct]p1: @@ -480,13 +483,19 @@ struct ConvertConstructorToDeductionGuideTransform { if (OuterRC) { // The outer template parameters are not transformed, so their // associated constraints don't need substitution. + // FIXME: Should simply add another field for the OuterRC, instead of + // combining them like this. if (!FunctionTrailingRC) FunctionTrailingRC = OuterRC; else - FunctionTrailingRC = BinaryOperator::Create( - SemaRef.Context, /*lhs=*/OuterRC, /*rhs=*/FunctionTrailingRC, - BO_LAnd, SemaRef.Context.BoolTy, VK_PRValue, OK_Ordinary, - TemplateParams->getTemplateLoc(), FPOptionsOverride()); + FunctionTrailingRC = AssociatedConstraint( + BinaryOperator::Create( + SemaRef.Context, + /*lhs=*/const_cast(OuterRC.ConstraintExpr), + /*rhs=*/const_cast(FunctionTrailingRC.ConstraintExpr), + BO_LAnd, SemaRef.Context.BoolTy, VK_PRValue, OK_Ordinary, + TemplateParams->getTemplateLoc(), FPOptionsOverride()), + FunctionTrailingRC.ArgumentPackSubstitutionIndex); } return buildDeductionGuide( @@ -1238,14 +1247,20 @@ void DeclareImplicitDeductionGuidesForTypeAlias( // FIXME: Here the synthesized deduction guide is not a templated // function. Per [dcl.decl]p4, the requires-clause shall be present only // if the declarator declares a templated function, a bug in standard? 
- auto *Constraint = buildIsDeducibleConstraint( - SemaRef, AliasTemplate, Transformed->getReturnType(), {}); - if (auto *RC = DG->getTrailingRequiresClause()) { - auto Conjunction = - SemaRef.BuildBinOp(SemaRef.getCurScope(), SourceLocation{}, - BinaryOperatorKind::BO_LAnd, RC, Constraint); - if (!Conjunction.isInvalid()) - Constraint = Conjunction.getAs(); + AssociatedConstraint Constraint( + buildIsDeducibleConstraint(SemaRef, AliasTemplate, + Transformed->getReturnType(), {}), + /*ArgumentPackSubstitutionIndex=*/-1); + if (const AssociatedConstraint &RC = DG->getTrailingRequiresClause()) { + auto Conjunction = SemaRef.BuildBinOp( + SemaRef.getCurScope(), SourceLocation{}, + BinaryOperatorKind::BO_LAnd, const_cast(RC.ConstraintExpr), + const_cast(Constraint.ConstraintExpr)); + if (!Conjunction.isInvalid()) { + Constraint.ConstraintExpr = Conjunction.getAs(); + Constraint.ArgumentPackSubstitutionIndex = + RC.ArgumentPackSubstitutionIndex; + } } Transformed->setTrailingRequiresClause(Constraint); continue; diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 9ea5ecab2d030..fcb4ee5650f91 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -2671,7 +2671,7 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( return nullptr; } - Expr *TrailingRequiresClause = D->getTrailingRequiresClause(); + AssociatedConstraint TrailingRequiresClause = D->getTrailingRequiresClause(); // If we're instantiating a local function declaration, put the result // in the enclosing namespace; otherwise we need to find the instantiated @@ -3102,7 +3102,7 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl( } CXXRecordDecl *Record = cast(DC); - Expr *TrailingRequiresClause = D->getTrailingRequiresClause(); + AssociatedConstraint TrailingRequiresClause = D->getTrailingRequiresClause(); DeclarationNameInfo NameInfo = SemaRef.SubstDeclarationNameInfo(D->getNameInfo(), TemplateArgs); diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 3689d323cf25b..12ec97ca8c3e0 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -15659,10 +15659,13 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { auto FPTL = NewCallOpTSI->getTypeLoc().getAsAdjusted(); assert(FPTL && "Not a FunctionProtoType?"); + AssociatedConstraint TRC = E->getCallOperator()->getTrailingRequiresClause(); + if (TRC.ArgumentPackSubstitutionIndex == -1) + TRC.ArgumentPackSubstitutionIndex = SemaRef.ArgumentPackSubstitutionIndex; + getSema().CompleteLambdaCallOperator( NewCallOperator, E->getCallOperator()->getLocation(), - E->getCallOperator()->getInnerLocStart(), - E->getCallOperator()->getTrailingRequiresClause(), NewCallOpTSI, + E->getCallOperator()->getInnerLocStart(), TRC, NewCallOpTSI, E->getCallOperator()->getConstexprKind(), E->getCallOperator()->getStorageClass(), FPTL.getParams(), E->hasExplicitResultType()); diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index b838f84c973de..1a2b8be7e2b8a 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -904,7 +904,8 @@ void ASTDeclReader::VisitDeclaratorDecl(DeclaratorDecl *DD) { if (Record.readInt()) { // hasExtInfo auto *Info = new (Reader.getContext()) DeclaratorDecl::ExtInfo(); Record.readQualifierInfo(*Info); - Info->TrailingRequiresClause = Record.readExpr(); + Info->TrailingRequiresClause = + 
AssociatedConstraint(Record.readExpr(), int(Record.readInt()));
     DD->DeclInfo = Info;
   }
   QualType TSIType = Record.readType();
diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp
index b896a04a0b14b..a4b89d0d9ed5e 100644
--- a/clang/lib/Serialization/ASTWriterDecl.cpp
+++ b/clang/lib/Serialization/ASTWriterDecl.cpp
@@ -728,7 +728,10 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) {
   if (D->hasExtInfo()) {
     DeclaratorDecl::ExtInfo *Info = D->getExtInfo();
     Record.AddQualifierInfo(*Info);
-    Record.AddStmt(Info->TrailingRequiresClause);
+    Record.AddStmt(
+        const_cast<Expr *>(Info->TrailingRequiresClause.ConstraintExpr));
+    Record.push_back(
+        Info->TrailingRequiresClause.ArgumentPackSubstitutionIndex);
   }
   // The location information is deferred until the end of the record.
   Record.AddTypeRef(D->getTypeSourceInfo() ? D->getTypeSourceInfo()->getType()
diff --git a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
index 106da7d0e2663..980d71b2142a2 100644
--- a/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
+++ b/clang/test/SemaCXX/fold_lambda_with_variadics.cpp
@@ -267,6 +267,15 @@ static_assert(bazz<1, 2>()(1));
 // expected-error@-1 {{is ambiguous}}
 // expected-note@#bazz 2{{candidate function [with value:auto = int]}}
 
+template concept C2 = sizeof(T) >= sizeof(int);
+template static constexpr auto trailing() {
+  return Overloaded{[](auto) requires (C2 && C2) { return 0; }...}; // #trailing
+}
+static_assert(trailing()(0));
+// expected-error@-1 {{is ambiguous}}
+// expected-note@#trailing 2{{candidate function [with auto:1 = int]}}
+
+
 } // namespace GH101754
 
 namespace GH131798 {
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 6ea6447d1d590..c8db6c92bb4d4 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -872,7 +872,7 @@ bool CursorVisitor::VisitFunctionDecl(FunctionDecl *ND) {
     // FIXME: Attributes?
   }
 
-  if (auto *E = ND->getTrailingRequiresClause()) {
+  if (auto *E = ND->getTrailingRequiresClause().ConstraintExpr) {
    if (Visit(E))
       return true;
   }

From d7d91500b6ef7efb059f660ff7e4aa44553643e6 Mon Sep 17 00:00:00 2001
From: Asher Mancinelli
Date: Thu, 3 Apr 2025 08:37:40 -0700
Subject: [PATCH 0546/1029] [flang][nfc] Initial changes needed to use llvm intrinsics instead of regular calls (#134170)

Flang uses `fir.call <llvm intrinsic>` in a few places. This means consumers
of the IR need to strcmp every fir.call if they want to find a
particular LLVM intrinsic. Emit LLVM memcpy intrinsics instead.
---
 .../Optimizer/Builder/LowLevelIntrinsics.h    |  3 ---
 flang/lib/Lower/ConvertExpr.cpp               | 23 ++++++++-----------
 .../Optimizer/Builder/LowLevelIntrinsics.cpp  | 10 --------
 flang/test/Lower/array-constructor-2.f90      | 16 ++++++-------
 4 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h b/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h
index 9be051632f93d..be106f7ea33b7 100644
--- a/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h
+++ b/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h
@@ -24,9 +24,6 @@ class FirOpBuilder;
 
 namespace fir::factory {
 
-/// Get the LLVM intrinsic for `memcpy`. Use the 64 bit version.
-mlir::func::FuncOp getLlvmMemcpy(FirOpBuilder &builder);
-
 /// Get the LLVM intrinsic for `memmove`. Use the 64 bit version. 
mlir::func::FuncOp getLlvmMemmove(FirOpBuilder &builder); diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index b677a136a74aa..2d61c2ee8dd8e 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -6184,17 +6184,16 @@ class ArrayExprLowering { /// Get the function signature of the LLVM memcpy intrinsic. mlir::FunctionType memcpyType() { - return fir::factory::getLlvmMemcpy(builder).getFunctionType(); + auto ptrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + llvm::SmallVector args = {ptrTy, ptrTy, builder.getI64Type()}; + return mlir::FunctionType::get(builder.getContext(), args, std::nullopt); } /// Create a call to the LLVM memcpy intrinsic. - void createCallMemcpy(llvm::ArrayRef args) { + void createCallMemcpy(llvm::ArrayRef args, bool isVolatile) { mlir::Location loc = getLoc(); - mlir::func::FuncOp memcpyFunc = fir::factory::getLlvmMemcpy(builder); - mlir::SymbolRefAttr funcSymAttr = - builder.getSymbolRefAttr(memcpyFunc.getName()); - mlir::FunctionType funcTy = memcpyFunc.getFunctionType(); - builder.create(loc, funcSymAttr, funcTy.getResults(), args); + builder.create(loc, args[0], args[1], args[2], + isVolatile); } // Construct code to check for a buffer overrun and realloc the buffer when @@ -6306,9 +6305,8 @@ class ArrayExprLowering { auto buff = builder.createConvert(loc, fir::HeapType::get(resTy), mem); mlir::Value buffi = computeCoordinate(buff, off); llvm::SmallVector args = fir::runtime::createArguments( - builder, loc, memcpyType(), buffi, v.getAddr(), byteSz, - /*volatile=*/builder.createBool(loc, false)); - createCallMemcpy(args); + builder, loc, memcpyType(), buffi, v.getAddr(), byteSz); + createCallMemcpy(args, /*isVolatile=*/false); // Save the incremented buffer position. 
builder.create(loc, endOff, buffPos); @@ -6357,9 +6355,8 @@ class ArrayExprLowering { builder.createConvert(loc, fir::HeapType::get(resTy), mem); mlir::Value buffi = computeCoordinate(buff, off); llvm::SmallVector args = fir::runtime::createArguments( - builder, loc, memcpyType(), buffi, v.getAddr(), eleSz, - /*volatile=*/builder.createBool(loc, false)); - createCallMemcpy(args); + builder, loc, memcpyType(), buffi, v.getAddr(), eleSz); + createCallMemcpy(args, /*isVolatile=*/false); builder.create(loc, plusOne, buffPos); } diff --git a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp index 411a48614af6c..e8547cf2b1e1b 100644 --- a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp +++ b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp @@ -21,16 +21,6 @@ #include "flang/Optimizer/Builder/LowLevelIntrinsics.h" #include "flang/Optimizer/Builder/FIRBuilder.h" -mlir::func::FuncOp fir::factory::getLlvmMemcpy(fir::FirOpBuilder &builder) { - auto ptrTy = builder.getRefType(builder.getIntegerType(8)); - llvm::SmallVector args = {ptrTy, ptrTy, builder.getI64Type(), - builder.getI1Type()}; - auto memcpyTy = - mlir::FunctionType::get(builder.getContext(), args, std::nullopt); - return builder.createFunction(builder.getUnknownLoc(), - "llvm.memcpy.p0.p0.i64", memcpyTy); -} - mlir::func::FuncOp fir::factory::getLlvmMemmove(fir::FirOpBuilder &builder) { auto ptrTy = builder.getRefType(builder.getIntegerType(8)); llvm::SmallVector args = {ptrTy, ptrTy, builder.getI64Type(), diff --git a/flang/test/Lower/array-constructor-2.f90 b/flang/test/Lower/array-constructor-2.f90 index ae75a3b425202..c026c0673fbbd 100644 --- a/flang/test/Lower/array-constructor-2.f90 +++ b/flang/test/Lower/array-constructor-2.f90 @@ -78,12 +78,12 @@ end function test3c ! CHECK-DAG: %[[rep:.*]] = fir.convert %{{.*}} : (!fir.heap) -> !fir.ref ! CHECK-DAG: %[[res:.*]] = fir.convert %{{.*}} : (index) -> i64 ! CHECK: %{{.*}} = fir.call @realloc(%[[rep]], %[[res]]) {{.*}}: (!fir.ref, i64) -> !fir.ref - ! CHECK: fir.call @llvm.memcpy.p0.p0.i64(%{{.*}}, %{{.*}}, %{{.*}}, %false{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: fir.call @_QPtest3c ! CHECK: fir.save_result ! CHECK: %[[tmp2:.*]] = fir.allocmem !fir.array, %{{.*}}#1 {uniq_name = ".array.expr"} ! CHECK: fir.call @realloc - ! CHECK: fir.call @llvm.memcpy.p0.p0.i64(% + ! CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: fir.array_coor %[[tmp:.*]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref ! CHECK-NEXT: fir.load ! CHECK-NEXT: fir.array_coor %arg0 %{{.*}} : (!fir.box>, index) -> !fir.ref @@ -130,11 +130,11 @@ subroutine test5(a, array2) ! CHECK: %[[res:.*]] = fir.allocmem !fir.array<4xf32> ! CHECK: fir.address_of(@_QQro.2xr4.2) : !fir.ref> ! CHECK: %[[tmp1:.*]] = fir.allocmem !fir.array<2xf32> - ! CHECK: fir.call @llvm.memcpy.p0.p0.i64(%{{.*}}, %{{.*}}, %{{.*}}, %false{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: %[[tmp2:.*]] = fir.allocmem !fir.array<2xf32> ! CHECK: = fir.array_coor %[[array2]](%{{.*}}) %{{.*}} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref ! CHECK: = fir.array_coor %[[tmp2]](%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref - ! 
CHECK: fir.call @llvm.memcpy.p0.p0.i64(%{{.*}}, %{{.*}}, %{{.*}}, %false{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: = fir.array_coor %{{.*}}(%{{.*}}) %{{.*}} : (!fir.heap>, !fir.shape<1>, index) -> !fir.ref ! CHECK: = fir.array_coor %[[a]] %{{.*}} : (!fir.box>, index) -> !fir.ref ! CHECK-DAG: fir.freemem %{{.*}} : !fir.heap> @@ -151,12 +151,12 @@ subroutine test6(c, d, e) ! CHECK: = fir.allocmem !fir.array<2x!fir.char<1,5>> ! CHECK: fir.call @realloc ! CHECK: %[[t:.*]] = fir.coordinate_of %{{.*}}, %{{.*}} : (!fir.heap>>, index) -> !fir.ref> - ! CHECK: %[[to:.*]] = fir.convert %[[t]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @llvm.memcpy.p0.p0.i64(%[[to]], %{{.*}}, %{{.*}}, %false) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: %[[to:.*]] = fir.convert %[[t]] : (!fir.ref>) -> !llvm.ptr + ! CHECK: "llvm.intr.memcpy"(%[[to]], %{{.*}}, %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: fir.call @realloc ! CHECK: %[[t:.*]] = fir.coordinate_of %{{.*}}, %{{.*}} : (!fir.heap>>, index) -> !fir.ref> - ! CHECK: %[[to:.*]] = fir.convert %[[t]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @llvm.memcpy.p0.p0.i64(%[[to]], %{{.*}}, %{{.*}}, %false) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: %[[to:.*]] = fir.convert %[[t]] : (!fir.ref>) -> !llvm.ptr + ! CHECK: "llvm.intr.memcpy"(%[[to]], %{{.*}}, %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: fir.freemem %{{.*}} : !fir.heap>> c = (/ d, e /) end subroutine test6 From 3801bf6164f570a145e3ebd20cf9114782ae0329 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 3 Apr 2025 08:50:31 -0700 Subject: [PATCH 0547/1029] [NFC] Cleanup pass initialization for SPIRV passes (#134189) - Do not call pass initialization functions from pass constructors. - Instead, call them from SPIRV target initialization.
- https://github.com/llvm/llvm-project/issues/111767 --- .../Analysis/SPIRVConvergenceRegionAnalysis.cpp | 5 +---- llvm/lib/Target/SPIRV/SPIRV.h | 5 +++++ llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 15 ++++----------- llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp | 13 ++----------- .../lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp | 10 +++------- .../Target/SPIRV/SPIRVMergeRegionExitTargets.cpp | 9 +++------ llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 4 ---- llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp | 4 +--- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 4 +--- .../Target/SPIRV/SPIRVPreLegalizerCombiner.cpp | 2 -- llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 11 +++-------- llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp | 8 +------- .../SPIRV/SPIRVStripConvergentIntrinsics.cpp | 11 +++-------- llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 9 +++++++++ 14 files changed, 36 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp index 48b327deaba84..88d5d0d503aa3 100644 --- a/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/Analysis/SPIRVConvergenceRegionAnalysis.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "SPIRVConvergenceRegionAnalysis.h" +#include "SPIRV.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" @@ -26,10 +27,6 @@ using namespace llvm; using namespace SPIRV; -namespace llvm { -void initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PassRegistry &); -} // namespace llvm - INITIALIZE_PASS_BEGIN(SPIRVConvergenceRegionAnalysisWrapperPass, "convergence-region", "SPIRV convergence regions analysis", true, true) diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h index d765dfe370be2..51728d1aa678d 100644 --- a/llvm/lib/Target/SPIRV/SPIRV.h +++ b/llvm/lib/Target/SPIRV/SPIRV.h @@ -43,6 +43,11 @@ void initializeSPIRVPostLegalizerPass(PassRegistry &); void initializeSPIRVStructurizerPass(PassRegistry &); void initializeSPIRVEmitIntrinsicsPass(PassRegistry &); void initializeSPIRVEmitNonSemanticDIPass(PassRegistry &); +void initializeSPIRVLegalizePointerCastPass(PassRegistry &); +void initializeSPIRVRegularizerPass(PassRegistry &); +void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &); +void initializeSPIRVPrepareFunctionsPass(PassRegistry &); +void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &); } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRV_H diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 68b69fe6f62b6..0067d2400529a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -47,13 +47,10 @@ using namespace llvm; -namespace llvm { -namespace SPIRV { +namespace llvm::SPIRV { #define GET_BuiltinGroup_DECL #include "SPIRVGenTables.inc" -} // namespace SPIRV -void initializeSPIRVEmitIntrinsicsPass(PassRegistry &); -} // namespace llvm +} // namespace llvm::SPIRV namespace { @@ -200,12 +197,8 @@ class SPIRVEmitIntrinsics public: static char ID; - SPIRVEmitIntrinsics() : ModulePass(ID) { - initializeSPIRVEmitIntrinsicsPass(*PassRegistry::getPassRegistry()); - } - SPIRVEmitIntrinsics(SPIRVTargetMachine *_TM) : ModulePass(ID), TM(_TM) { - initializeSPIRVEmitIntrinsicsPass(*PassRegistry::getPassRegistry()); - } + 
SPIRVEmitIntrinsics(SPIRVTargetMachine *TM = nullptr) + : ModulePass(ID), TM(TM) {} Instruction *visitInstruction(Instruction &I) { return &I; } Instruction *visitSwitchInst(SwitchInst &I); Instruction *visitGetElementPtrInst(GetElementPtrInst &I); diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp index 7858f44a054d7..725fcdb46f56d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitNonSemanticDI.cpp @@ -33,8 +33,8 @@ namespace { struct SPIRVEmitNonSemanticDI : public MachineFunctionPass { static char ID; SPIRVTargetMachine *TM; - SPIRVEmitNonSemanticDI(SPIRVTargetMachine *TM); - SPIRVEmitNonSemanticDI(); + SPIRVEmitNonSemanticDI(SPIRVTargetMachine *TM = nullptr) + : MachineFunctionPass(ID), TM(TM) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -54,15 +54,6 @@ llvm::createSPIRVEmitNonSemanticDIPass(SPIRVTargetMachine *TM) { return new SPIRVEmitNonSemanticDI(TM); } -SPIRVEmitNonSemanticDI::SPIRVEmitNonSemanticDI(SPIRVTargetMachine *TM) - : MachineFunctionPass(ID), TM(TM) { - initializeSPIRVEmitNonSemanticDIPass(*PassRegistry::getPassRegistry()); -} - -SPIRVEmitNonSemanticDI::SPIRVEmitNonSemanticDI() : MachineFunctionPass(ID) { - initializeSPIRVEmitNonSemanticDIPass(*PassRegistry::getPassRegistry()); -} - enum BaseTypeAttributeEncoding { Unspecified = 0, Address = 1, diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 2ccff9dd321ec..560869f9fe62a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -56,10 +56,7 @@ using namespace llvm; -namespace llvm { -void initializeSPIRVLegalizePointerCastPass(PassRegistry &); -} - +namespace { class SPIRVLegalizePointerCast : public FunctionPass { // Builds the `spv_assign_type` assigning |Ty| to |Value| at the current @@ -182,9 +179,7 @@ class SPIRVLegalizePointerCast : public FunctionPass { } public: - SPIRVLegalizePointerCast(SPIRVTargetMachine *TM) : FunctionPass(ID), TM(TM) { - initializeSPIRVLegalizePointerCastPass(*PassRegistry::getPassRegistry()); - }; + SPIRVLegalizePointerCast(SPIRVTargetMachine *TM) : FunctionPass(ID), TM(TM) {} virtual bool runOnFunction(Function &F) override { const SPIRVSubtarget &ST = TM->getSubtarget(F); @@ -217,6 +212,7 @@ class SPIRVLegalizePointerCast : public FunctionPass { public: static char ID; }; +} // namespace char SPIRVLegalizePointerCast::ID = 0; INITIALIZE_PASS(SPIRVLegalizePointerCast, "spirv-legalize-bitcast", diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp index 267ab7b6376bd..67e73f1bd2198 100644 --- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp @@ -34,16 +34,13 @@ using namespace llvm; -namespace llvm { -void initializeSPIRVMergeRegionExitTargetsPass(PassRegistry &); +namespace { class SPIRVMergeRegionExitTargets : public FunctionPass { public: static char ID; - SPIRVMergeRegionExitTargets() : FunctionPass(ID) { - initializeSPIRVMergeRegionExitTargetsPass(*PassRegistry::getPassRegistry()); - }; + SPIRVMergeRegionExitTargets() : FunctionPass(ID) {} // Gather all the successors of |BB|. // This function asserts if the terminator neither a branch, switch or return. 
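// A minimal sketch (illustrative only, not part of this patch) of the idiom
// the cleanup converges on. Pass constructors stay trivial:
//   SPIRVFooPass() : FunctionPass(ID) {}
// and the one-time registration moves to target initialization, where
// initializeSPIRVFooPassPass() stands for the function INITIALIZE_PASS
// generates for a hypothetical pass named SPIRVFooPass:
//   extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() {
//     PassRegistry &PR = *PassRegistry::getPassRegistry();
//     initializeSPIRVFooPassPass(PR);
//   }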
@@ -273,7 +270,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass { FunctionPass::getAnalysisUsage(AU); } }; -} // namespace llvm +} // namespace char SPIRVMergeRegionExitTargets::ID = 0; diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index f9e64f118a277..8ba163ed57ed2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -49,10 +49,6 @@ struct AvoidCapabilitiesSet { char llvm::SPIRVModuleAnalysis::ID = 0; -namespace llvm { -void initializeSPIRVModuleAnalysisPass(PassRegistry &); -} // namespace llvm - INITIALIZE_PASS(SPIRVModuleAnalysis, DEBUG_TYPE, "SPIRV module analysis", true, true) diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index de9be33d68c34..c9b78c521f504 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -34,9 +34,7 @@ namespace { class SPIRVPostLegalizer : public MachineFunctionPass { public: static char ID; - SPIRVPostLegalizer() : MachineFunctionPass(ID) { - initializeSPIRVPostLegalizerPass(*PassRegistry::getPassRegistry()); - } + SPIRVPostLegalizer() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; }; } // namespace diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 3a68def3df058..e4cc03eff1035 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -32,9 +32,7 @@ namespace { class SPIRVPreLegalizer : public MachineFunctionPass { public: static char ID; - SPIRVPreLegalizer() : MachineFunctionPass(ID) { - initializeSPIRVPreLegalizerPass(*PassRegistry::getPassRegistry()); - } + SPIRVPreLegalizer() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; }; diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp index ec688762ca0a5..d2de71ce10c10 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizerCombiner.cpp @@ -197,8 +197,6 @@ void SPIRVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { SPIRVPreLegalizerCombiner::SPIRVPreLegalizerCombiner() : MachineFunctionPass(ID) { - initializeSPIRVPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 028699e56a946..628688d83a314 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -35,10 +35,6 @@ using namespace llvm; -namespace llvm { -void initializeSPIRVPrepareFunctionsPass(PassRegistry &); -} - namespace { class SPIRVPrepareFunctions : public ModulePass { @@ -48,9 +44,8 @@ class SPIRVPrepareFunctions : public ModulePass { public: static char ID; - SPIRVPrepareFunctions(const SPIRVTargetMachine &TM) : ModulePass(ID), TM(TM) { - initializeSPIRVPrepareFunctionsPass(*PassRegistry::getPassRegistry()); - } + SPIRVPrepareFunctions(const SPIRVTargetMachine &TM) + : ModulePass(ID), TM(TM) {} bool runOnModule(Module &M) override; @@ -68,7 +63,7 @@ char SPIRVPrepareFunctions::ID = 0; INITIALIZE_PASS(SPIRVPrepareFunctions, "prepare-functions", "SPIRV prepare 
functions", false, false) -std::string lowerLLVMIntrinsicName(IntrinsicInst *II) { +static std::string lowerLLVMIntrinsicName(IntrinsicInst *II) { Function *IntrinsicFunc = II->getCalledFunction(); assert(IntrinsicFunc && "Missing function"); std::string FuncName = IntrinsicFunc->getName().str(); diff --git a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp index a956fad5487c1..0e01430d3b863 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVRegularizer.cpp @@ -25,19 +25,13 @@ using namespace llvm; -namespace llvm { -void initializeSPIRVRegularizerPass(PassRegistry &); -} - namespace { struct SPIRVRegularizer : public FunctionPass, InstVisitor { DenseMap Old2NewFuncs; public: static char ID; - SPIRVRegularizer() : FunctionPass(ID) { - initializeSPIRVRegularizerPass(*PassRegistry::getPassRegistry()); - } + SPIRVRegularizer() : FunctionPass(ID) {} bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "SPIR-V Regularizer"; } diff --git a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp index c87048b93f80f..78bb6973f3896 100644 --- a/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVStripConvergentIntrinsics.cpp @@ -25,18 +25,12 @@ using namespace llvm; -namespace llvm { -void initializeSPIRVStripConvergentIntrinsicsPass(PassRegistry &); -} - +namespace { class SPIRVStripConvergentIntrinsics : public FunctionPass { public: static char ID; - SPIRVStripConvergentIntrinsics() : FunctionPass(ID) { - initializeSPIRVStripConvergentIntrinsicsPass( - *PassRegistry::getPassRegistry()); - }; + SPIRVStripConvergentIntrinsics() : FunctionPass(ID) {} virtual bool runOnFunction(Function &F) override { DenseSet ToRemove; @@ -85,6 +79,7 @@ class SPIRVStripConvergentIntrinsics : public FunctionPass { return ToRemove.size() != 0; } }; +} // namespace char SPIRVStripConvergentIntrinsics::ID = 0; INITIALIZE_PASS(SPIRVStripConvergentIntrinsics, "strip-convergent-intrinsics", diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 0aa214dd354ee..4399f080f1f81 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -49,6 +49,15 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSPIRVTarget() { initializeSPIRVConvergenceRegionAnalysisWrapperPassPass(PR); initializeSPIRVStructurizerPass(PR); initializeSPIRVPreLegalizerCombinerPass(PR); + initializeSPIRVLegalizePointerCastPass(PR); + initializeSPIRVRegularizerPass(PR); + initializeSPIRVPreLegalizerPass(PR); + initializeSPIRVPostLegalizerPass(PR); + initializeSPIRVMergeRegionExitTargetsPass(PR); + initializeSPIRVEmitIntrinsicsPass(PR); + initializeSPIRVEmitNonSemanticDIPass(PR); + initializeSPIRVPrepareFunctionsPass(PR); + initializeSPIRVStripConvergentIntrinsicsPass(PR); } static std::string computeDataLayout(const Triple &TT) { From bec5cfd970c5882c54a9e8d9f3da430dc39d0dd0 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 3 Apr 2025 09:08:23 -0700 Subject: [PATCH 0548/1029] [lldb-dap] Protect SetBreakpoint with the API mutex (#134030) Protect the various SetBreakpoint functions with the API mutex. This fixes a race condition between the breakpoint being created and the DAP label getting added. This was causing `TestDAP_breakpointEvents.py` to be flaky. Fixes #131242. 
--- lldb/tools/lldb-dap/Breakpoint.cpp | 6 ++++++ lldb/tools/lldb-dap/DAP.h | 3 +++ lldb/tools/lldb-dap/ExceptionBreakpoint.cpp | 5 +++++ lldb/tools/lldb-dap/FunctionBreakpoint.cpp | 5 +++++ lldb/tools/lldb-dap/SourceBreakpoint.cpp | 5 +++++ 5 files changed, 24 insertions(+) diff --git a/lldb/tools/lldb-dap/Breakpoint.cpp b/lldb/tools/lldb-dap/Breakpoint.cpp index e02f62076f935..5679fd545d53f 100644 --- a/lldb/tools/lldb-dap/Breakpoint.cpp +++ b/lldb/tools/lldb-dap/Breakpoint.cpp @@ -7,14 +7,17 @@ //===----------------------------------------------------------------------===// #include "Breakpoint.h" +#include "DAP.h" #include "JSONUtils.h" #include "lldb/API/SBAddress.h" #include "lldb/API/SBBreakpointLocation.h" #include "lldb/API/SBLineEntry.h" +#include "lldb/API/SBMutex.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/JSON.h" #include #include +#include #include using namespace lldb_dap; @@ -74,6 +77,9 @@ bool Breakpoint::MatchesName(const char *name) { } void Breakpoint::SetBreakpoint() { + lldb::SBMutex lock = m_dap.GetAPIMutex(); + std::lock_guard guard(lock); + m_bp.AddName(kDAPBreakpointLabel); if (!m_condition.empty()) SetCondition(); diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index 4357bdd5cc80f..3ce6498632479 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -27,6 +27,7 @@ #include "lldb/API/SBFile.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBFrame.h" +#include "lldb/API/SBMutex.h" #include "lldb/API/SBTarget.h" #include "lldb/API/SBThread.h" #include "lldb/API/SBValue.h" @@ -404,6 +405,8 @@ struct DAP { InstructionBreakpoint *GetInstructionBreakpoint(const lldb::break_id_t bp_id); InstructionBreakpoint *GetInstructionBPFromStopReason(lldb::SBThread &thread); + + lldb::SBMutex GetAPIMutex() const { return target.GetAPIMutex(); } }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp index d8109daf89129..9772e7344ced6 100644 --- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp @@ -9,11 +9,16 @@ #include "ExceptionBreakpoint.h" #include "BreakpointBase.h" #include "DAP.h" +#include "lldb/API/SBMutex.h" #include "lldb/API/SBTarget.h" +#include namespace lldb_dap { void ExceptionBreakpoint::SetBreakpoint() { + lldb::SBMutex lock = m_dap.GetAPIMutex(); + std::lock_guard guard(lock); + if (m_bp.IsValid()) return; bool catch_value = m_filter.find("_catch") != std::string::npos; diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp index 2fb6e8fafc2fa..d87723f7557bd 100644 --- a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp @@ -9,6 +9,8 @@ #include "FunctionBreakpoint.h" #include "DAP.h" #include "JSONUtils.h" +#include "lldb/API/SBMutex.h" +#include namespace lldb_dap { @@ -17,6 +19,9 @@ FunctionBreakpoint::FunctionBreakpoint(DAP &d, const llvm::json::Object &obj) m_function_name(std::string(GetString(obj, "name").value_or(""))) {} void FunctionBreakpoint::SetBreakpoint() { + lldb::SBMutex lock = m_dap.GetAPIMutex(); + std::lock_guard guard(lock); + if (m_function_name.empty()) return; m_bp = m_dap.target.BreakpointCreateByName(m_function_name.c_str()); diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.cpp b/lldb/tools/lldb-dap/SourceBreakpoint.cpp index 150fa6af44d3a..6d8d3470668c8 100644 --- a/lldb/tools/lldb-dap/SourceBreakpoint.cpp +++ b/lldb/tools/lldb-dap/SourceBreakpoint.cpp @@ -13,6 +13,7 @@ 
#include "lldb/API/SBBreakpoint.h" #include "lldb/API/SBFileSpecList.h" #include "lldb/API/SBFrame.h" +#include "lldb/API/SBMutex.h" #include "lldb/API/SBTarget.h" #include "lldb/API/SBThread.h" #include "lldb/API/SBValue.h" @@ -20,6 +21,7 @@ #include #include #include +#include #include namespace lldb_dap { @@ -33,6 +35,9 @@ SourceBreakpoint::SourceBreakpoint(DAP &dap, const llvm::json::Object &obj) .value_or(LLDB_INVALID_COLUMN_NUMBER)) {} void SourceBreakpoint::SetBreakpoint(const llvm::StringRef source_path) { + lldb::SBMutex lock = m_dap.GetAPIMutex(); + std::lock_guard guard(lock); + lldb::SBFileSpecList module_list; m_bp = m_dap.target.BreakpointCreateByLocation( source_path.str().c_str(), m_line, m_column, 0, module_list); From bf388f8a43c26264dfa96a91bead440d19f58bc4 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Thu, 3 Apr 2025 12:26:41 -0400 Subject: [PATCH 0549/1029] [AMDGPU][True16][CodeGen] legalize operands when move16bit SALU to VALU (#133985) This is a follow up PR from https://github.com/llvm/llvm-project/pull/132089. When a V2S copy and its useMI are lowered to VALU, this patch check: If the generated new VALU is a true16 inst. Add subreg access on all operands if necessary. an example MIR looks like: ``` %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ... %2:sreg_32 = COPY %1:vgpr_32 %3:sreg_32 = S_FLOOR_F16 %2:sreg_32, ... ``` currently lowered to ``` %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ... %2:vgpr_16 = V_FLOOR_F16_t16_e64 0, %1:vgpr_32, 0, 0, 0 ... ``` after this patch ``` %1:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0 ... %2:vgpr_16 = V_FLOOR_F16_t16_e64 0, %1.lo16:vgpr_32, 0, 0, 0 ... ``` --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 36 ++++++++++++++++--- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 4 +++ .../AMDGPU/fix-sgpr-copies-f16-true16.mir | 20 +++++++++++ .../CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 27 ++++++++------ 4 files changed, 72 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 260f80a5f532e..61fda0eef6314 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7228,6 +7228,29 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) { return DeferredList.contains(MI); } +// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode, +// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add +// subreg access properly. 
This can be removed after we have sgpr16 in place +void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst, + MachineRegisterInfo &MRI) const { + unsigned Opcode = Inst.getOpcode(); + if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts()) + return; + + for (MachineOperand &Op : Inst.explicit_operands()) { + unsigned OpIdx = Op.getOperandNo(); + if (!OpIdx) + continue; + if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) { + unsigned RCID = get(Opcode).operands()[OpIdx].RegClass; + const TargetRegisterClass *RC = RI.getRegClass(RCID); + if (RI.getRegSizeInBits(*RC) == 16) { + Op.setSubReg(AMDGPU::lo16); + } + } + } +} + void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const { @@ -7613,6 +7636,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, .add(Inst.getOperand(0)) .add(Inst.getOperand(1)); } + legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); MachineOperand SCCOp = Inst.getOperand(SCCIdx); @@ -7682,6 +7706,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, .addImm(0) // omod .addImm(0); // opsel0 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); + legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); Inst.eraseFromParent(); @@ -7747,6 +7772,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // If this is a v2s copy src from vgpr16 to sgpr32, // replace vgpr copy to subreg_to_reg + // This can be remove after we have sgpr16 in place if (ST.useRealTrue16Insts() && Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() && RI.isVGPR(MRI, Inst.getOperand(1).getReg())) { @@ -7785,11 +7811,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { MachineOperand Src = Inst.getOperand(1); - if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() && - Src.isReg() && RI.isVGPR(MRI, Src.getReg())) - NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16); - else - NewInstr->addOperand(Src); + NewInstr->addOperand(Src); } if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { @@ -7863,6 +7885,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Check useMI of NewInstr. If used by a true16 instruction, // add a lo16 subreg access if size mismatched + // This can be remove after we have sgpr16 in place if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), E = MRI.use_end(); @@ -7878,6 +7901,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } } fixImplicitOperands(*NewInstr); + + legalizeOperandsVALUt16(*NewInstr, MRI); + // Legalize the operands legalizeOperands(*NewInstr, MDT); if (NewDstReg) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 79ef1432d512a..d63225c067c9d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1279,6 +1279,10 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { /// was moved to VGPR. \returns true if succeeded. bool moveFlatAddrToVGPR(MachineInstr &Inst) const; + /// Fix operands in Inst to fix 16bit SALU to VALU lowering. + void legalizeOperandsVALUt16(MachineInstr &Inst, + MachineRegisterInfo &MRI) const; + /// Replace the instructions opcode with the equivalent VALU /// opcode. 
This function will also move the users of MachineInstruntions /// in the \p WorkList to the VALU if necessary. If present, \p MDT is diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir index 419f57972a485..137a9aaea6a77 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir @@ -1,6 +1,26 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s +--- +name: cmp_f16 +body: | + bb.0.entry: + ; GCN-LABEL: name: cmp_f16 + ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16 + ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec + %0:vgpr_16 = IMPLICIT_DEF + %1:sreg_32 = IMPLICIT_DEF + %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec + %3:sreg_32 = COPY %2:vgpr_16 + nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode + %4:sreg_32_xm0_xexec = COPY $scc + %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec +... + --- name: cvt_hi_f32_f16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index 23e4b80b61f69..8bc8eefad6bf7 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -1,19 +1,26 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# FIXME-TRUE16. 
reenable after fix-sgpr-copies is fixed for true16 flow -# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=REAL16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=FAKE16 %s --- name: fmac_f16 body: | bb.0: - ; GCN-LABEL: name: fmac_f16 - ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec + ; REAL16-LABEL: name: fmac_f16 + ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec + ; REAL16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; REAL16-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec + ; + ; FAKE16-LABEL: name: fmac_f16 + ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec + ; FAKE16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; FAKE16-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec %0:vgpr_32 = IMPLICIT_DEF %1:sreg_32 = IMPLICIT_DEF %2:sreg_32 = IMPLICIT_DEF From 65fa57bdcc9d745dd8c222426e79618fb7cf1c91 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Thu, 3 Apr 2025 09:27:54 -0700 Subject: [PATCH 0550/1029] [HLSL][RootSignature] Define and integrate `HLSLRootSignatureAttr` (#134124) - Defines the HLSLRootSignature attribute in `Attr.td` - Defines and implements handleHLSLRootSignature in `SemaHLSL` - Adds a sample test case showing the AST node is generated, in `RootSignatures-AST.hlsl` This commit hooks up the separately defined RootSignature parser: it is invoked to create the RootElements, which are then stored on the ASTContext, and a reference to the Elements is stored in RootSignatureAttr. Resolves https://github.com/llvm/llvm-project/issues/119011 --------- Co-authored-by: Finn Plummer --- clang/include/clang/AST/Attr.h | 1 + clang/include/clang/Basic/Attr.td | 19 +++++++++++ clang/include/clang/Basic/AttrDocs.td | 11 +++++++ clang/include/clang/Sema/SemaHLSL.h | 1 + clang/lib/Sema/SemaDeclAttr.cpp | 3 ++ clang/lib/Sema/SemaHLSL.cpp | 35 +++++++++++++++++++++
clang/test/AST/HLSL/RootSignatures-AST.hlsl | 24 ++++++++++++++ clang/test/SemaHLSL/RootSignature-err.hlsl | 9 ++++++ 8 files changed, 103 insertions(+) create mode 100644 clang/test/AST/HLSL/RootSignatures-AST.hlsl create mode 100644 clang/test/SemaHLSL/RootSignature-err.hlsl diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index 994f236337b99..37c3f8bbfb5f9 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -26,6 +26,7 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Support/Compiler.h" #include "llvm/Frontend/HLSL/HLSLResource.h" +#include "llvm/Frontend/HLSL/HLSLRootSignature.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/VersionTuple.h" diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index fd9e686485552..9ef4f2b6b91ed 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4710,6 +4710,25 @@ def Error : InheritableAttr { let Documentation = [ErrorAttrDocs]; } +def HLSLRootSignature : Attr { + /// [RootSignature(Signature)] + let Spellings = [Microsoft<"RootSignature">]; + let Args = [StringArgument<"Signature">]; + let Subjects = SubjectList<[Function], + ErrorDiag, "'function'">; + let LangOpts = [HLSL]; + let Documentation = [HLSLRootSignatureDocs]; + let AdditionalMembers = [{ +private: + ArrayRef RootElements; +public: + void setElements(ArrayRef Elements) { + RootElements = Elements; + } + auto getElements() const { return RootElements; } +}]; +} + def HLSLNumThreads: InheritableAttr { let Spellings = [Microsoft<"numthreads">]; let Args = [IntArgument<"X">, IntArgument<"Y">, IntArgument<"Z">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index c8b371280e35d..1b969e456b910 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -8145,6 +8145,17 @@ and https://microsoft.github.io/hlsl-specs/proposals/0013-wave-size-range.html }]; } +def HLSLRootSignatureDocs : Documentation { + let Category = DocCatFunction; + let Content = [{ +The ``RootSignature`` attribute applies to HLSL entry functions to define what +types of resources are bound to the graphics pipeline. 
+ +For details about the use and specification of Root Signatures please see here: +https://learn.microsoft.com/en-us/windows/win32/direct3d12/root-signatures + }]; +} + def NumThreadsDocs : Documentation { let Category = DocCatFunction; let Content = [{ diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index f333fe30e8da0..1bd35332612cd 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -118,6 +118,7 @@ class SemaHLSL : public SemaBase { bool IsCompAssign); void emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, BinaryOperatorKind Opc); + void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL); void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL); void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL); void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 0b844b44930b9..b36d327f5bd0a 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7498,6 +7498,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, break; // HLSL attributes: + case ParsedAttr::AT_HLSLRootSignature: + S.HLSL().handleRootSignatureAttr(D, AL); + break; case ParsedAttr::AT_HLSLNumThreads: S.HLSL().handleNumThreadsAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index fe600386e6fa9..bed14c111b544 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -28,6 +28,7 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" #include "clang/Basic/TargetInfo.h" +#include "clang/Parse/ParseHLSLRootSignature.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" @@ -941,6 +942,40 @@ void SemaHLSL::emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, << NewFnName << FixItHint::CreateReplacement(FullRange, OS.str()); } +void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) { + if (AL.getNumArgs() != 1) { + Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1; + return; + } + + StringRef Signature; + if (!SemaRef.checkStringLiteralArgumentAttr(AL, 0, Signature)) + return; + + SourceLocation Loc = AL.getArgAsExpr(0)->getExprLoc(); + // TODO(#126565): pass down below to lexer when fp is supported + // llvm::RoundingMode RM = SemaRef.CurFPFeatures.getRoundingMode(); + hlsl::RootSignatureLexer Lexer(Signature, Loc); + SmallVector Elements; + hlsl::RootSignatureParser Parser(Elements, Lexer, SemaRef.getPreprocessor()); + + if (Parser.parse()) + return; + + // Allocate elements onto AST context + unsigned N = Elements.size(); + auto RootElements = MutableArrayRef( + ::new (getASTContext()) llvm::hlsl::rootsig::RootElement[N], N); + for (unsigned I = 0; I < N; ++I) + RootElements[I] = Elements[I]; + + // Set elements + auto *Result = ::new (getASTContext()) + HLSLRootSignatureAttr(getASTContext(), AL, Signature); + Result->setElements(ArrayRef(RootElements)); + D->addAttr(Result); +} + void SemaHLSL::handleNumThreadsAttr(Decl *D, const ParsedAttr &AL) { llvm::VersionTuple SMVersion = getASTContext().getTargetInfo().getTriple().getOSVersion(); diff --git a/clang/test/AST/HLSL/RootSignatures-AST.hlsl b/clang/test/AST/HLSL/RootSignatures-AST.hlsl new file mode 100644 index 0000000000000..948f2484ff5d0 --- /dev/null +++ b/clang/test/AST/HLSL/RootSignatures-AST.hlsl @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library 
-ast-dump \ +// RUN: -disable-llvm-passes -o - %s | FileCheck %s + +// This test ensures that the sample root signature is parsed without error and +// the Attr AST Node is created succesfully. If an invalid root signature was +// passed in then we would exit out of Sema before the Attr is created. + +#define SampleRS \ + "DescriptorTable( " \ + " CBV(), " \ + " SRV(), " \ + " UAV()" \ + "), " \ + "DescriptorTable(Sampler())" + +// CHECK: HLSLRootSignatureAttr +// CHECK-SAME: "DescriptorTable( +// CHECK-SAME: CBV(), +// CHECK-SAME: SRV(), +// CHECK-SAME: UAV() +// CHECK-SAME: ), +// CHECK-SAME: DescriptorTable(Sampler())" +[RootSignature(SampleRS)] +void main() {} diff --git a/clang/test/SemaHLSL/RootSignature-err.hlsl b/clang/test/SemaHLSL/RootSignature-err.hlsl new file mode 100644 index 0000000000000..647a4ba2470a7 --- /dev/null +++ b/clang/test/SemaHLSL/RootSignature-err.hlsl @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - %s -verify + +// Attr test + +[RootSignature()] // expected-error {{'RootSignature' attribute takes one argument}} +void bad_root_signature_0() {} + +[RootSignature("Arg1", "Arg2")] // expected-error {{'RootSignature' attribute takes one argument}} +void bad_root_signature_1() {} From b738b82699e58fa50d15d76e1e26b58100ad344e Mon Sep 17 00:00:00 2001 From: Connector Switch Date: Fri, 4 Apr 2025 00:36:23 +0800 Subject: [PATCH 0551/1029] [libc] Combine the function prototype `int (*compar)(const void *, const void *)` (#134238) Closes #134118. --- libc/include/CMakeLists.txt | 20 +++++++++---------- libc/include/llvm-libc-types/CMakeLists.txt | 3 +-- .../llvm-libc-types/__lsearchcompare_t.h | 14 ------------- ...searchcompare_t.h => __search_compare_t.h} | 10 +++++----- libc/include/search.yaml | 10 +++++----- libc/include/stdlib.yaml | 6 +++--- 6 files changed, 24 insertions(+), 39 deletions(-) delete mode 100644 libc/include/llvm-libc-types/__lsearchcompare_t.h rename libc/include/llvm-libc-types/{__bsearchcompare_t.h => __search_compare_t.h} (53%) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index d1e116ac547d7..e407de2f16959 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -240,13 +240,13 @@ add_header_macro( ../libc/include/search.yaml search.h DEPENDS - .llvm_libc_common_h .llvm-libc-types.ACTION .llvm-libc-types.ENTRY - .llvm-libc-types.struct_hsearch_data - .llvm-libc-types.size_t .llvm-libc-types.VISIT - .llvm-libc-types.__lsearchcompare_t + .llvm-libc-types.__search_compare_t + .llvm-libc-types.size_t + .llvm-libc-types.struct_hsearch_data + .llvm_libc_common_h ) add_header_macro( @@ -343,17 +343,17 @@ add_header_macro( ../libc/include/stdlib.yaml stdlib.h DEPENDS - .llvm_libc_common_h .llvm-libc-macros.stdlib_macros + .llvm-libc-types.__atexithandler_t + .llvm-libc-types.__qsortcompare_t + .llvm-libc-types.__qsortrcompare_t + .llvm-libc-types.__search_compare_t .llvm-libc-types.div_t .llvm-libc-types.ldiv_t .llvm-libc-types.lldiv_t - .llvm-libc-types.size_t - .llvm-libc-types.__bsearchcompare_t - .llvm-libc-types.__qsortcompare_t - .llvm-libc-types.__qsortrcompare_t - .llvm-libc-types.__atexithandler_t .llvm-libc-types.locale_t + .llvm-libc-types.size_t + .llvm_libc_common_h ) add_header_macro( diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 66e8527701873..9ed39bcd05190 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -2,8 +2,7 @@ 
add_header(off64_t HDR off64_t.h) add_header(size_t HDR size_t.h) add_header(ssize_t HDR ssize_t.h) add_header(__atfork_callback_t HDR __atfork_callback_t.h) -add_header(__bsearchcompare_t HDR __bsearchcompare_t.h) -add_header(__lsearchcompare_t HDR __lsearchcompare_t.h) +add_header(__search_compare_t HDR __search_compare_t.h) add_header(__call_once_func_t HDR __call_once_func_t.h) add_header(__dl_iterate_phdr_callback_t HDR __dl_iterate_phdr_callback_t.h DEPENDS .size_t) add_header(__exec_argv_t HDR __exec_argv_t.h) diff --git a/libc/include/llvm-libc-types/__lsearchcompare_t.h b/libc/include/llvm-libc-types/__lsearchcompare_t.h deleted file mode 100644 index 08dc2db274d0c..0000000000000 --- a/libc/include/llvm-libc-types/__lsearchcompare_t.h +++ /dev/null @@ -1,14 +0,0 @@ -//===-- Definition of type __lsearchcompare_t -----------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_TYPES___LSEARCHCOMPARE_T_H -#define LLVM_LIBC_TYPES___LSEARCHCOMPARE_T_H - -typedef int (*__lsearchcompare_t)(const void *, const void *); - -#endif // LLVM_LIBC_TYPES___LSEARCHCOMPARE_T_H diff --git a/libc/include/llvm-libc-types/__bsearchcompare_t.h b/libc/include/llvm-libc-types/__search_compare_t.h similarity index 53% rename from libc/include/llvm-libc-types/__bsearchcompare_t.h rename to libc/include/llvm-libc-types/__search_compare_t.h index 0b1987be1fdd6..7033fef49b49a 100644 --- a/libc/include/llvm-libc-types/__bsearchcompare_t.h +++ b/libc/include/llvm-libc-types/__search_compare_t.h @@ -1,4 +1,4 @@ -//===-- Definition of type __bsearchcompare_t -----------------------------===// +//===-- Definition of type __search_compare_t -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
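// A minimal usage sketch (hypothetical caller, not part of this header): with
// the merged typedef, one comparator matches the __search_compare_t shape now
// shared by bsearch, lfind, and lsearch:
//   static int cmp_int(const void *a, const void *b) {
//     int l = *(const int *)a, r = *(const int *)b;
//     return (l > r) - (l < r);
//   }
//   int keys[] = {1, 3, 5, 7};
//   int key = 5;
//   size_t n = 4;
//   void *hit = lfind(&key, keys, &n, sizeof(int), cmp_int);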
@@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_TYPES___BSEARCHCOMPARE_T_H -#define LLVM_LIBC_TYPES___BSEARCHCOMPARE_T_H +#ifndef LLVM_LIBC_TYPES___SEARCH_COMPARE_T_H +#define LLVM_LIBC_TYPES___SEARCH_COMPARE_T_H -typedef int (*__bsearchcompare_t)(const void *, const void *); +typedef int (*__search_compare_t)(const void *, const void *); -#endif // LLVM_LIBC_TYPES___BSEARCHCOMPARE_T_H +#endif // LLVM_LIBC_TYPES___SEARCH_COMPARE_T_H diff --git a/libc/include/search.yaml b/libc/include/search.yaml index e2e711cc93f4d..e0247afad2cd6 100644 --- a/libc/include/search.yaml +++ b/libc/include/search.yaml @@ -2,11 +2,11 @@ header: search.h header_template: search.h.def macros: [] types: - - type_name: struct_hsearch_data - - type_name: ENTRY - type_name: ACTION + - type_name: ENTRY - type_name: VISIT - - type_name: __lsearchcompare_t + - type_name: __search_compare_t + - type_name: struct_hsearch_data enums: [] objects: [] functions: @@ -69,7 +69,7 @@ functions: - type: const void * - type: size_t * - type: size_t - - type: __lsearchcompare_t + - type: __search_compare_t - name: lsearch standards: - POSIX @@ -79,4 +79,4 @@ functions: - type: void * - type: size_t * - type: size_t - - type: __lsearchcompare_t + - type: __search_compare_t diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml index a6204ce3afee3..5690a942e2570 100644 --- a/libc/include/stdlib.yaml +++ b/libc/include/stdlib.yaml @@ -7,9 +7,9 @@ merge_yaml_files: macros: [] types: - type_name: __atexithandler_t - - type_name: __qsortrcompare_t - type_name: __qsortcompare_t - - type_name: __bsearchcompare_t + - type_name: __qsortrcompare_t + - type_name: __search_compare_t - type_name: div_t - type_name: ldiv_t - type_name: lldiv_t @@ -87,7 +87,7 @@ functions: - type: const void * - type: size_t - type: size_t - - type: __bsearchcompare_t + - type: __search_compare_t - name: div standards: - stdc From 2190808f5d010a91e17c0dd8143466042b5a4028 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 3 Apr 2025 17:39:38 +0100 Subject: [PATCH 0552/1029] [X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded (REAPPLIED) (#134263) With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when we only need the lower elements. 
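For illustration, a minimal sketch of the demanded-elements guard (hypothetical helper name; the in-tree change expresses this with isUndefOrInRange inside SimplifyDemandedVectorEltsForTargetNode):

```cpp
#include "llvm/ADT/ArrayRef.h"

// Shrinking a 512-bit VPERMV to its 256-bit form is safe only when every
// demanded lane of the low half reads from the low half of the source.
static bool lowHalfReadsLowHalfOnly(llvm::ArrayRef<int> Mask,
                                    unsigned NumElts) {
  unsigned HalfElts = NumElts / 2;
  for (int M : Mask.take_front(HalfElts))
    if (M != -1 && !(M >= 0 && M < (int)HalfElts))
      return false; // a demanded lane reaches into the upper half
  return true;      // -1 marks an undef (don't-care) lane
}
```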
Reapplied version of #133923 with fix for typo in the VPERMV3 mask adjustment --- llvm/lib/Target/X86/X86ISelLowering.cpp | 63 ++++++++++++ .../any_extend_vector_inreg_of_broadcast.ll | 46 ++++----- ...d_vector_inreg_of_broadcast_from_memory.ll | 8 +- llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 12 +-- .../vector-interleaved-load-i16-stride-5.ll | 76 ++++++++------- .../vector-interleaved-store-i64-stride-5.ll | 32 +++---- .../vector-interleaved-store-i64-stride-6.ll | 96 +++++++++---------- .../vector-shuffle-combining-avx512bwvl.ll | 6 +- .../zero_extend_vector_inreg_of_broadcast.ll | 22 ++--- ...d_vector_inreg_of_broadcast_from_memory.ll | 8 +- 10 files changed, 217 insertions(+), 152 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 546a2d22fa58e..d2d022ab52c41 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43827,6 +43827,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( } break; } + case X86ISD::VPERMV: { + SmallVector Mask; + SmallVector Ops; + if ((VT.is256BitVector() || Subtarget.hasVLX()) && + getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) { + // For lane-crossing shuffles, only split in half in case we're still + // referencing higher elements. + unsigned HalfElts = NumElts / 2; + unsigned HalfSize = SizeInBits / 2; + Mask.resize(HalfElts); + if (all_of(Mask, + [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) { + MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT(); + SDLoc DL(Op); + SDValue Ext; + SDValue M = + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize); + SDValue V = + extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize); + // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS. + if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16) + Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V); + else + Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M); + SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false, + Subtarget, TLO.DAG, DL, SizeInBits); + return TLO.CombineTo(Op, Insert); + } + } + break; + } + case X86ISD::VPERMV3: { + SmallVector Mask; + SmallVector Ops; + if (Subtarget.hasVLX() && + getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) { + // For lane-crossing shuffles, only split in half in case we're still + // referencing higher elements. + unsigned HalfElts = NumElts / 2; + unsigned HalfSize = SizeInBits / 2; + Mask.resize(HalfElts); + if (all_of(Mask, [&](int M) { + return isUndefOrInRange(M, 0, HalfElts) || + isUndefOrInRange(M, NumElts, NumElts + HalfElts); + })) { + // Adjust mask elements for 2nd operand to point to half width. + for (int &M : Mask) + M = (M < NumElts) ? M : (M - HalfElts); + MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT(); + MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger(); + SDLoc DL(Op); + SDValue Ext = TLO.DAG.getNode( + Opc, DL, HalfVT, + extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize), + getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true), + extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize)); + SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false, + Subtarget, TLO.DAG, DL, SizeInBits); + return TLO.CombineTo(Op, Insert); + } + } + break; + } case X86ISD::VPERM2X128: { // Simplify VPERM2F128/VPERM2I128 to extract_subvector. 
SDLoc DL(Op); diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 6f4e7abda8b00..b075d48627b18 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. 
; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] +; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15] +; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper @@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] +; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512F-NEXT: vzeroupper @@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11] -; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7] +; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 52f856befa130..61e122b1aba36 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ 
b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll index 26af46263c0e2..a84466bc1ca1a 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-FAST: # %bb.0: -; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79] -; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79] +; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2 ; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 ; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax @@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind ; ; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8: ; AVX512VBMI-SLOW: # %bb.0: -; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79] -; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 +; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15] +; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2 +; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2 ; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax ; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx ; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx -; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0 +; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 ; AVX512VBMI-SLOW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 739e6e2369e36..9b19ec15c6f55 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -593,100 +593,104 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride5_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-NEXT: 
vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-NEXT: vmovq %xmm1, (%r9) +; AVX512BW-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; 
AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax -; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <20 x i16>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll index 05c111ae5049f..f41123c5c3cfd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -123,8 +123,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa %xmm3, 64(%r9) @@ -140,8 +140,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -157,8 +157,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm3, 64(%r9) @@ -174,8 +174,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -191,8 +191,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -208,8 +208,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) @@ -225,8 +225,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9) @@ -242,8 +242,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5] ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index c2f1723d8031e..aac6a1bddd08a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -139,12 +139,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -158,12 +158,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -177,12 +177,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -196,12 +196,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -215,12 +215,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -234,12 +234,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -253,12 +253,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-BW-NEXT: vpermi2q %ymm2, 
%ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -272,12 +272,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3] -; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll index ec09c3117c77f..f5cd3e580d017 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll @@ -187,10 +187,8 @@ define <8 x i32> @concat_vrotlv_v4i32(<4 x i32> %a0, <4 x i32> %a1, <8 x i32> %a define <8 x i16> @demandedelts_vpermvar_32i16_v8i16(<32 x i16> %x0) { ; CHECK-LABEL: demandedelts_vpermvar_32i16_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3,7,0,6,1,5,2,4,3] -; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vpmovsxbw {{.*#+}} xmm1 = [7,0,6,1,5,2,4,3] +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %shuffle = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> ) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 35f25d36cb2e9..ea0e3b3a2b9aa 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] +; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax ; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index a598e30845579..a3e2fb5321f32 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in. 
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 @@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7] -; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in. ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2: ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7] -; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7] ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx) From 73e8d67a200beaa554a72cdd50e4d1a5a55caf69 Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Thu, 3 Apr 2025 09:40:50 -0700 Subject: [PATCH 0553/1029] Revert "[HLSL][RootSignature] Define and integrate `HLSLRootSignatureAttr`" (#134273) Reverts llvm/llvm-project#134124 The build is again failing due to a linking error: [here](https://github.com/llvm/llvm-project/pull/134124#issuecomment-2776370486). Again, the error was not present locally or in any of the pre-merge builds; the required library must have been transitively linked in those build environments...
--- clang/include/clang/AST/Attr.h | 1 - clang/include/clang/Basic/Attr.td | 19 ----------- clang/include/clang/Basic/AttrDocs.td | 11 ------- clang/include/clang/Sema/SemaHLSL.h | 1 - clang/lib/Sema/SemaDeclAttr.cpp | 3 -- clang/lib/Sema/SemaHLSL.cpp | 35 --------------------- clang/test/AST/HLSL/RootSignatures-AST.hlsl | 24 -------------- clang/test/SemaHLSL/RootSignature-err.hlsl | 9 ------ 8 files changed, 103 deletions(-) delete mode 100644 clang/test/AST/HLSL/RootSignatures-AST.hlsl delete mode 100644 clang/test/SemaHLSL/RootSignature-err.hlsl diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index 37c3f8bbfb5f9..994f236337b99 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -26,7 +26,6 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Support/Compiler.h" #include "llvm/Frontend/HLSL/HLSLResource.h" -#include "llvm/Frontend/HLSL/HLSLRootSignature.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/VersionTuple.h" diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 9ef4f2b6b91ed..fd9e686485552 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4710,25 +4710,6 @@ def Error : InheritableAttr { let Documentation = [ErrorAttrDocs]; } -def HLSLRootSignature : Attr { - /// [RootSignature(Signature)] - let Spellings = [Microsoft<"RootSignature">]; - let Args = [StringArgument<"Signature">]; - let Subjects = SubjectList<[Function], - ErrorDiag, "'function'">; - let LangOpts = [HLSL]; - let Documentation = [HLSLRootSignatureDocs]; - let AdditionalMembers = [{ -private: - ArrayRef RootElements; -public: - void setElements(ArrayRef Elements) { - RootElements = Elements; - } - auto getElements() const { return RootElements; } -}]; -} - def HLSLNumThreads: InheritableAttr { let Spellings = [Microsoft<"numthreads">]; let Args = [IntArgument<"X">, IntArgument<"Y">, IntArgument<"Z">]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 1b969e456b910..c8b371280e35d 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -8145,17 +8145,6 @@ and https://microsoft.github.io/hlsl-specs/proposals/0013-wave-size-range.html }]; } -def HLSLRootSignatureDocs : Documentation { - let Category = DocCatFunction; - let Content = [{ -The ``RootSignature`` attribute applies to HLSL entry functions to define what -types of resources are bound to the graphics pipeline. 
- -For details about the use and specification of Root Signatures please see here: -https://learn.microsoft.com/en-us/windows/win32/direct3d12/root-signatures - }]; -} - def NumThreadsDocs : Documentation { let Category = DocCatFunction; let Content = [{ diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 1bd35332612cd..f333fe30e8da0 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -118,7 +118,6 @@ class SemaHLSL : public SemaBase { bool IsCompAssign); void emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, BinaryOperatorKind Opc); - void handleRootSignatureAttr(Decl *D, const ParsedAttr &AL); void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL); void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL); void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index b36d327f5bd0a..0b844b44930b9 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7498,9 +7498,6 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, break; // HLSL attributes: - case ParsedAttr::AT_HLSLRootSignature: - S.HLSL().handleRootSignatureAttr(D, AL); - break; case ParsedAttr::AT_HLSLNumThreads: S.HLSL().handleNumThreadsAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index bed14c111b544..fe600386e6fa9 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -28,7 +28,6 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" #include "clang/Basic/TargetInfo.h" -#include "clang/Parse/ParseHLSLRootSignature.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" @@ -942,40 +941,6 @@ void SemaHLSL::emitLogicalOperatorFixIt(Expr *LHS, Expr *RHS, << NewFnName << FixItHint::CreateReplacement(FullRange, OS.str()); } -void SemaHLSL::handleRootSignatureAttr(Decl *D, const ParsedAttr &AL) { - if (AL.getNumArgs() != 1) { - Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments) << AL << 1; - return; - } - - StringRef Signature; - if (!SemaRef.checkStringLiteralArgumentAttr(AL, 0, Signature)) - return; - - SourceLocation Loc = AL.getArgAsExpr(0)->getExprLoc(); - // TODO(#126565): pass down below to lexer when fp is supported - // llvm::RoundingMode RM = SemaRef.CurFPFeatures.getRoundingMode(); - hlsl::RootSignatureLexer Lexer(Signature, Loc); - SmallVector Elements; - hlsl::RootSignatureParser Parser(Elements, Lexer, SemaRef.getPreprocessor()); - - if (Parser.parse()) - return; - - // Allocate elements onto AST context - unsigned N = Elements.size(); - auto RootElements = MutableArrayRef( - ::new (getASTContext()) llvm::hlsl::rootsig::RootElement[N], N); - for (unsigned I = 0; I < N; ++I) - RootElements[I] = Elements[I]; - - // Set elements - auto *Result = ::new (getASTContext()) - HLSLRootSignatureAttr(getASTContext(), AL, Signature); - Result->setElements(ArrayRef(RootElements)); - D->addAttr(Result); -} - void SemaHLSL::handleNumThreadsAttr(Decl *D, const ParsedAttr &AL) { llvm::VersionTuple SMVersion = getASTContext().getTargetInfo().getTriple().getOSVersion(); diff --git a/clang/test/AST/HLSL/RootSignatures-AST.hlsl b/clang/test/AST/HLSL/RootSignatures-AST.hlsl deleted file mode 100644 index 948f2484ff5d0..0000000000000 --- a/clang/test/AST/HLSL/RootSignatures-AST.hlsl +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library 
-ast-dump \ -// RUN: -disable-llvm-passes -o - %s | FileCheck %s - -// This test ensures that the sample root signature is parsed without error and -// the Attr AST Node is created succesfully. If an invalid root signature was -// passed in then we would exit out of Sema before the Attr is created. - -#define SampleRS \ - "DescriptorTable( " \ - " CBV(), " \ - " SRV(), " \ - " UAV()" \ - "), " \ - "DescriptorTable(Sampler())" - -// CHECK: HLSLRootSignatureAttr -// CHECK-SAME: "DescriptorTable( -// CHECK-SAME: CBV(), -// CHECK-SAME: SRV(), -// CHECK-SAME: UAV() -// CHECK-SAME: ), -// CHECK-SAME: DescriptorTable(Sampler())" -[RootSignature(SampleRS)] -void main() {} diff --git a/clang/test/SemaHLSL/RootSignature-err.hlsl b/clang/test/SemaHLSL/RootSignature-err.hlsl deleted file mode 100644 index 647a4ba2470a7..0000000000000 --- a/clang/test/SemaHLSL/RootSignature-err.hlsl +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - %s -verify - -// Attr test - -[RootSignature()] // expected-error {{'RootSignature' attribute takes one argument}} -void bad_root_signature_0() {} - -[RootSignature("Arg1", "Arg2")] // expected-error {{'RootSignature' attribute takes one argument}} -void bad_root_signature_1() {} From bc6cd825ecea94f015c590c877a1401d3a4a46b8 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 3 Apr 2025 09:45:00 -0700 Subject: [PATCH 0554/1029] [lldb-dap] Creating a common configuration structure for launch and attach requests. (#133960) This moves all the common settings of the launch and attach operations into `lldb_dap::protocol::Configuration`. These common settings can appear in both `launch` and `attach` requests, which allows us to isolate the DAP configuration operations into a single common location. This is split out from #133624.
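For illustration, a minimal sketch (not part of the patch) of the pattern this change adopts: both request handlers fill one shared struct, and helpers read their settings from that single place. The type and member names follow the patch; the stand-in types and handler shapes are simplified assumptions.

#include <string>
#include <utility>
#include <vector>

// Pared-down stand-ins for the types this patch touches.
struct Configuration {
  std::vector<std::string> initCommands;
  std::vector<std::string> stopCommands;
  std::string commandEscapePrefix = "`";
};

struct DAP {
  Configuration configuration;
};

// Previously launch and attach each copied these settings into separate DAP
// members; now both write to the one shared struct, so helpers such as
// RunInitCommands() have a single source of truth.
void HandleLaunch(DAP &dap, std::vector<std::string> initCommands) {
  dap.configuration.initCommands = std::move(initCommands);
}

void HandleAttach(DAP &dap, std::vector<std::string> initCommands) {
  dap.configuration.initCommands = std::move(initCommands); // same destination
}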
--- lldb/tools/lldb-dap/DAP.cpp | 41 +++++----- lldb/tools/lldb-dap/DAP.h | 23 ++---- .../lldb-dap/Handler/AttachRequestHandler.cpp | 21 +++--- .../lldb-dap/Handler/CompletionsHandler.cpp | 7 +- .../Handler/EvaluateRequestHandler.cpp | 3 +- .../lldb-dap/Handler/LaunchRequestHandler.cpp | 21 +++--- .../tools/lldb-dap/Handler/RequestHandler.cpp | 2 +- .../Handler/SetVariableRequestHandler.cpp | 3 +- .../Handler/StackTraceRequestHandler.cpp | 2 +- .../Handler/VariablesRequestHandler.cpp | 20 ++--- lldb/tools/lldb-dap/JSONUtils.cpp | 3 +- lldb/tools/lldb-dap/JSONUtils.h | 5 -- .../lldb-dap/Protocol/ProtocolRequests.h | 75 +++++++++++++++++++ lldb/tools/lldb-dap/SourceBreakpoint.cpp | 6 +- lldb/tools/lldb-dap/lldb-dap.cpp | 15 ++-- 15 files changed, 156 insertions(+), 91 deletions(-) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 8951384212f11..9361ba968e9c2 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -69,20 +69,20 @@ const char DEV_NULL[] = "/dev/null"; namespace lldb_dap { -DAP::DAP(llvm::StringRef path, Log *log, const ReplMode default_repl_mode, +llvm::StringRef DAP::debug_adapter_path = ""; + +DAP::DAP(Log *log, const ReplMode default_repl_mode, std::vector pre_init_commands, Transport &transport) - : debug_adapter_path(path), log(log), transport(transport), - broadcaster("lldb-dap"), exception_breakpoints(), - pre_init_commands(std::move(pre_init_commands)), - focus_tid(LLDB_INVALID_THREAD_ID), stop_at_entry(false), is_attach(false), - enable_auto_variable_summaries(false), - enable_synthetic_child_debugging(false), - display_extended_backtrace(false), + : log(log), transport(transport), broadcaster("lldb-dap"), + exception_breakpoints(), focus_tid(LLDB_INVALID_THREAD_ID), + stop_at_entry(false), is_attach(false), restarting_process_id(LLDB_INVALID_PROCESS_ID), configuration_done_sent(false), waiting_for_run_in_terminal(false), progress_event_reporter( [&](const ProgressEvent &event) { SendJSON(event.ToJSON()); }), - reverse_request_seq(0), repl_mode(default_repl_mode) {} + reverse_request_seq(0), repl_mode(default_repl_mode) { + configuration.preInitCommands = std::move(pre_init_commands); +} DAP::~DAP() = default; @@ -505,8 +505,9 @@ ReplMode DAP::DetectReplMode(lldb::SBFrame frame, std::string &expression, bool partial_expression) { // Check for the escape hatch prefix. if (!expression.empty() && - llvm::StringRef(expression).starts_with(command_escape_prefix)) { - expression = expression.substr(command_escape_prefix.size()); + llvm::StringRef(expression) + .starts_with(configuration.commandEscapePrefix)) { + expression = expression.substr(configuration.commandEscapePrefix.size()); return ReplMode::Command; } @@ -546,7 +547,7 @@ ReplMode DAP::DetectReplMode(lldb::SBFrame frame, std::string &expression, << "Warning: Expression '" << term << "' is both an LLDB command and variable. It will be evaluated as " "a variable. 
To evaluate the expression as an LLDB command, use '" - << command_escape_prefix << "' as a prefix.\n"; + << configuration.commandEscapePrefix << "' as a prefix.\n"; } // Variables take preference to commands in auto, since commands can always @@ -593,36 +594,38 @@ DAP::RunLaunchCommands(llvm::ArrayRef launch_commands) { } llvm::Error DAP::RunInitCommands() { - if (!RunLLDBCommands("Running initCommands:", init_commands)) + if (!RunLLDBCommands("Running initCommands:", configuration.initCommands)) return createRunLLDBCommandsErrorMessage("initCommands"); return llvm::Error::success(); } llvm::Error DAP::RunPreInitCommands() { - if (!RunLLDBCommands("Running preInitCommands:", pre_init_commands)) + if (!RunLLDBCommands("Running preInitCommands:", + configuration.preInitCommands)) return createRunLLDBCommandsErrorMessage("preInitCommands"); return llvm::Error::success(); } llvm::Error DAP::RunPreRunCommands() { - if (!RunLLDBCommands("Running preRunCommands:", pre_run_commands)) + if (!RunLLDBCommands("Running preRunCommands:", configuration.preRunCommands)) return createRunLLDBCommandsErrorMessage("preRunCommands"); return llvm::Error::success(); } void DAP::RunPostRunCommands() { - RunLLDBCommands("Running postRunCommands:", post_run_commands); + RunLLDBCommands("Running postRunCommands:", configuration.postRunCommands); } void DAP::RunStopCommands() { - RunLLDBCommands("Running stopCommands:", stop_commands); + RunLLDBCommands("Running stopCommands:", configuration.stopCommands); } void DAP::RunExitCommands() { - RunLLDBCommands("Running exitCommands:", exit_commands); + RunLLDBCommands("Running exitCommands:", configuration.exitCommands); } void DAP::RunTerminateCommands() { - RunLLDBCommands("Running terminateCommands:", terminate_commands); + RunLLDBCommands("Running terminateCommands:", + configuration.terminateCommands); } lldb::SBTarget diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index 3ce6498632479..fc43d988f3a09 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -32,7 +32,6 @@ #include "lldb/API/SBThread.h" #include "lldb/API/SBValue.h" #include "lldb/API/SBValueList.h" -#include "lldb/lldb-forward.h" #include "lldb/lldb-types.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -149,12 +148,16 @@ struct SendEventRequestHandler : public lldb::SBCommandPluginInterface { }; struct DAP { - llvm::StringRef debug_adapter_path; + /// Path to the lldb-dap binary itself. + static llvm::StringRef debug_adapter_path; + Log *log; Transport &transport; lldb::SBFile in; OutputRedirector out; OutputRedirector err; + /// Configuration specified by the launch or attach commands. + protocol::Configuration configuration; lldb::SBDebugger debugger; lldb::SBTarget target; Variables variables; @@ -166,13 +169,6 @@ struct DAP { InstructionBreakpointMap instruction_breakpoints; std::optional> exception_breakpoints; llvm::once_flag init_exception_breakpoints_flag; - std::vector pre_init_commands; - std::vector init_commands; - std::vector pre_run_commands; - std::vector post_run_commands; - std::vector exit_commands; - std::vector stop_commands; - std::vector terminate_commands; // Map step in target id to list of function targets that user can choose. 
llvm::DenseMap step_in_targets; // A copy of the last LaunchRequest or AttachRequest so we can reuse its @@ -183,9 +179,6 @@ struct DAP { llvm::once_flag terminated_event_flag; bool stop_at_entry; bool is_attach; - bool enable_auto_variable_summaries; - bool enable_synthetic_child_debugging; - bool display_extended_backtrace; // The process event thread normally responds to process exited events by // shutting down the entire adapter. When we're restarting, we keep the id of // the old process here so we can detect this case and keep running. @@ -202,7 +195,7 @@ struct DAP { llvm::SmallDenseMap> inflight_reverse_requests; ReplMode repl_mode; - std::string command_escape_prefix = "`"; + lldb::SBFormat frame_format; lldb::SBFormat thread_format; // This is used to allow request_evaluate to handle empty expressions @@ -216,8 +209,6 @@ struct DAP { /// Creates a new DAP sessions. /// - /// \param[in] path - /// Path to the lldb-dap binary. /// \param[in] log /// Log stream, if configured. /// \param[in] default_repl_mode @@ -226,7 +217,7 @@ struct DAP { /// LLDB commands to execute as soon as the debugger instance is allocaed. /// \param[in] transport /// Transport for this debug session. - DAP(llvm::StringRef path, Log *log, const ReplMode default_repl_mode, + DAP(Log *log, const ReplMode default_repl_mode, std::vector pre_init_commands, Transport &transport); ~DAP(); diff --git a/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp index 20f7c80a1ed90..5e622f3d3dcd4 100644 --- a/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/AttachRequestHandler.cpp @@ -63,11 +63,12 @@ void AttachRequestHandler::operator()(const llvm::json::Object &request) const { attach_info.SetProcessID(pid); const auto wait_for = GetBoolean(arguments, "waitFor").value_or(false); attach_info.SetWaitForLaunch(wait_for, false /*async*/); - dap.init_commands = GetStrings(arguments, "initCommands"); - dap.pre_run_commands = GetStrings(arguments, "preRunCommands"); - dap.stop_commands = GetStrings(arguments, "stopCommands"); - dap.exit_commands = GetStrings(arguments, "exitCommands"); - dap.terminate_commands = GetStrings(arguments, "terminateCommands"); + dap.configuration.initCommands = GetStrings(arguments, "initCommands"); + dap.configuration.preRunCommands = GetStrings(arguments, "preRunCommands"); + dap.configuration.stopCommands = GetStrings(arguments, "stopCommands"); + dap.configuration.exitCommands = GetStrings(arguments, "exitCommands"); + dap.configuration.terminateCommands = + GetStrings(arguments, "terminateCommands"); auto attachCommands = GetStrings(arguments, "attachCommands"); llvm::StringRef core_file = GetString(arguments, "coreFile").value_or(""); const uint64_t timeout_seconds = @@ -75,16 +76,16 @@ void AttachRequestHandler::operator()(const llvm::json::Object &request) const { dap.stop_at_entry = core_file.empty() ? 
GetBoolean(arguments, "stopOnEntry").value_or(false) : true; - dap.post_run_commands = GetStrings(arguments, "postRunCommands"); + dap.configuration.postRunCommands = GetStrings(arguments, "postRunCommands"); const llvm::StringRef debuggerRoot = GetString(arguments, "debuggerRoot").value_or(""); - dap.enable_auto_variable_summaries = + dap.configuration.enableAutoVariableSummaries = GetBoolean(arguments, "enableAutoVariableSummaries").value_or(false); - dap.enable_synthetic_child_debugging = + dap.configuration.enableSyntheticChildDebugging = GetBoolean(arguments, "enableSyntheticChildDebugging").value_or(false); - dap.display_extended_backtrace = + dap.configuration.displayExtendedBacktrace = GetBoolean(arguments, "displayExtendedBacktrace").value_or(false); - dap.command_escape_prefix = + dap.configuration.commandEscapePrefix = GetString(arguments, "commandEscapePrefix").value_or("`"); dap.SetFrameFormat(GetString(arguments, "customFrameFormat").value_or("")); dap.SetThreadFormat(GetString(arguments, "customThreadFormat").value_or("")); diff --git a/lldb/tools/lldb-dap/Handler/CompletionsHandler.cpp b/lldb/tools/lldb-dap/Handler/CompletionsHandler.cpp index 5414aaeb2c317..c72fc5686cd5b 100644 --- a/lldb/tools/lldb-dap/Handler/CompletionsHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/CompletionsHandler.cpp @@ -157,19 +157,20 @@ void CompletionsRequestHandler::operator()( llvm::json::Array targets; bool had_escape_prefix = - llvm::StringRef(text).starts_with(dap.command_escape_prefix); + llvm::StringRef(text).starts_with(dap.configuration.commandEscapePrefix); ReplMode completion_mode = dap.DetectReplMode(frame, text, true); // Handle the offset change introduced by stripping out the // `command_escape_prefix`. if (had_escape_prefix) { - if (offset < static_cast(dap.command_escape_prefix.size())) { + if (offset < + static_cast(dap.configuration.commandEscapePrefix.size())) { body.try_emplace("targets", std::move(targets)); response.try_emplace("body", std::move(body)); dap.SendJSON(llvm::json::Value(std::move(response))); return; } - offset -= dap.command_escape_prefix.size(); + offset -= dap.configuration.commandEscapePrefix.size(); } // While the user is typing then we likely have an incomplete input and cannot diff --git a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp index e9f08a1017abc..8ed09fa2a931a 100644 --- a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp @@ -205,7 +205,8 @@ void EvaluateRequestHandler::operator()( else EmplaceSafeString(response, "message", "evaluate failed"); } else { - VariableDescription desc(value, dap.enable_auto_variable_summaries); + VariableDescription desc(value, + dap.configuration.enableAutoVariableSummaries); EmplaceSafeString(body, "result", desc.GetResult(context)); EmplaceSafeString(body, "type", desc.display_type_name); int64_t var_ref = 0; diff --git a/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp index f64c186376a36..5f14cb074e37e 100644 --- a/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/LaunchRequestHandler.cpp @@ -54,22 +54,23 @@ void LaunchRequestHandler::operator()(const llvm::json::Object &request) const { llvm::json::Object response; FillResponse(request, response); const auto *arguments = request.getObject("arguments"); - dap.init_commands = GetStrings(arguments, "initCommands"); - dap.pre_run_commands = 
GetStrings(arguments, "preRunCommands"); - dap.stop_commands = GetStrings(arguments, "stopCommands"); - dap.exit_commands = GetStrings(arguments, "exitCommands"); - dap.terminate_commands = GetStrings(arguments, "terminateCommands"); - dap.post_run_commands = GetStrings(arguments, "postRunCommands"); + dap.configuration.initCommands = GetStrings(arguments, "initCommands"); + dap.configuration.preRunCommands = GetStrings(arguments, "preRunCommands"); + dap.configuration.stopCommands = GetStrings(arguments, "stopCommands"); + dap.configuration.exitCommands = GetStrings(arguments, "exitCommands"); + dap.configuration.terminateCommands = + GetStrings(arguments, "terminateCommands"); + dap.configuration.postRunCommands = GetStrings(arguments, "postRunCommands"); dap.stop_at_entry = GetBoolean(arguments, "stopOnEntry").value_or(false); const llvm::StringRef debuggerRoot = GetString(arguments, "debuggerRoot").value_or(""); - dap.enable_auto_variable_summaries = + dap.configuration.enableAutoVariableSummaries = GetBoolean(arguments, "enableAutoVariableSummaries").value_or(false); - dap.enable_synthetic_child_debugging = + dap.configuration.enableSyntheticChildDebugging = GetBoolean(arguments, "enableSyntheticChildDebugging").value_or(false); - dap.display_extended_backtrace = + dap.configuration.displayExtendedBacktrace = GetBoolean(arguments, "displayExtendedBacktrace").value_or(false); - dap.command_escape_prefix = + dap.configuration.commandEscapePrefix = GetString(arguments, "commandEscapePrefix").value_or("`"); dap.SetFrameFormat(GetString(arguments, "customFrameFormat").value_or("")); dap.SetThreadFormat(GetString(arguments, "customThreadFormat").value_or("")); diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp index f067dfc5544fe..576f0dda64cf4 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.cpp @@ -119,7 +119,7 @@ static llvm::Error RunInTerminal(DAP &dap, debugger_pid = getpid(); #endif llvm::json::Object reverse_request = CreateRunInTerminalReverseRequest( - launch_request, dap.debug_adapter_path, comm_file.m_path, debugger_pid); + launch_request, comm_file.m_path, debugger_pid); dap.SendReverseRequest("runInTerminal", std::move(reverse_request)); diff --git a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp index a7896b7fefa29..c48bcd84c9ddc 100644 --- a/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/SetVariableRequestHandler.cpp @@ -145,7 +145,8 @@ void SetVariableRequestHandler::operator()( lldb::SBError error; bool success = variable.SetValueFromCString(value.data(), error); if (success) { - VariableDescription desc(variable, dap.enable_auto_variable_summaries); + VariableDescription desc(variable, + dap.configuration.enableAutoVariableSummaries); EmplaceSafeString(body, "value", desc.display_value); EmplaceSafeString(body, "type", desc.display_type_name); diff --git a/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp index 220be0f99be6b..a58e3325af100 100644 --- a/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/StackTraceRequestHandler.cpp @@ -70,7 +70,7 @@ static bool FillStackFrames(DAP &dap, lldb::SBThread &thread, stack_frames.emplace_back(CreateStackFrame(frame, dap.frame_format)); } - if (dap.display_extended_backtrace && 
reached_end_of_stack) { + if (dap.configuration.displayExtendedBacktrace && reached_end_of_stack) { // Check for any extended backtraces. for (uint32_t bt = 0; bt < thread.GetProcess().GetNumExtendedBacktraceTypes(); bt++) { diff --git a/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp index 6bb0a0f160499..19bcca2b22b9b 100644 --- a/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/VariablesRequestHandler.cpp @@ -180,10 +180,10 @@ void VariablesRequestHandler::operator()( return_var_ref = dap.variables.InsertVariable(stop_return_value, /*is_permanent=*/false); } - variables.emplace_back( - CreateVariable(renamed_return_value, return_var_ref, hex, - dap.enable_auto_variable_summaries, - dap.enable_synthetic_child_debugging, false)); + variables.emplace_back(CreateVariable( + renamed_return_value, return_var_ref, hex, + dap.configuration.enableAutoVariableSummaries, + dap.configuration.enableSyntheticChildDebugging, false)); } } @@ -197,8 +197,8 @@ void VariablesRequestHandler::operator()( int64_t var_ref = dap.variables.InsertVariable(variable, /*is_permanent=*/false); variables.emplace_back(CreateVariable( - variable, var_ref, hex, dap.enable_auto_variable_summaries, - dap.enable_synthetic_child_debugging, + variable, var_ref, hex, dap.configuration.enableAutoVariableSummaries, + dap.configuration.enableSyntheticChildDebugging, variable_name_counts[GetNonNullVariableName(variable)] > 1)); } } else { @@ -214,8 +214,8 @@ void VariablesRequestHandler::operator()( dap.variables.IsPermanentVariableReference(variablesReference); int64_t var_ref = dap.variables.InsertVariable(child, is_permanent); variables.emplace_back(CreateVariable( - child, var_ref, hex, dap.enable_auto_variable_summaries, - dap.enable_synthetic_child_debugging, + child, var_ref, hex, dap.configuration.enableAutoVariableSummaries, + dap.configuration.enableSyntheticChildDebugging, /*is_name_duplicated=*/false, custom_name)); }; const int64_t num_children = variable.GetNumChildren(); @@ -228,8 +228,8 @@ void VariablesRequestHandler::operator()( // "[raw]" child that can be used to inspect the raw version of a // synthetic member. That eliminates the need for the user to go to the // debug console and type `frame var to get these values. 
- if (dap.enable_synthetic_child_debugging && variable.IsSynthetic() && - i == num_children) + if (dap.configuration.enableSyntheticChildDebugging && + variable.IsSynthetic() && i == num_children) addChild(variable.GetNonSyntheticValue(), "[raw]"); } } diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 590137e48199d..7660403666150 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -1400,7 +1400,6 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit) { /// https://microsoft.github.io/debug-adapter-protocol/specification#Reverse_Requests_RunInTerminal llvm::json::Object CreateRunInTerminalReverseRequest(const llvm::json::Object &launch_request, - llvm::StringRef debug_adapter_path, llvm::StringRef comm_file, lldb::pid_t debugger_pid) { llvm::json::Object run_in_terminal_args; @@ -1410,7 +1409,7 @@ CreateRunInTerminalReverseRequest(const llvm::json::Object &launch_request, const auto *launch_request_arguments = launch_request.getObject("arguments"); // The program path must be the first entry in the "args" field - std::vector<std::string> args = {debug_adapter_path.str(), "--comm-file", + std::vector<std::string> args = {DAP::debug_adapter_path.str(), "--comm-file", comm_file.str()}; if (debugger_pid != LLDB_INVALID_PROCESS_ID) { args.push_back("--debugger-pid"); diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 5d403d39a76d4..da91797290ff0 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -565,10 +565,6 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit); /// The original launch_request object whose fields are used to construct /// the reverse request object. /// -/// \param[in] debug_adapter_path -/// Path to the current debug adapter. It will be used to delegate the -/// launch of the target. -/// /// \param[in] comm_file /// The fifo file used to communicate with the target launcher. /// @@ -582,7 +578,6 @@ llvm::json::Value CreateCompileUnit(lldb::SBCompileUnit &unit); /// Microsoft. llvm::json::Object CreateRunInTerminalReverseRequest(const llvm::json::Object &launch_request, - llvm::StringRef debug_adapter_path, llvm::StringRef comm_file, lldb::pid_t debugger_pid); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index 927106997953a..64c5116315239 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -118,6 +118,81 @@ bool fromJSON(const llvm::json::Value &, InitializeRequestArguments &, /// Response to `initialize` request. The capabilities of this debug adapter. using InitializeResponseBody = std::optional<Capabilities>; +/// DAP Launch and Attach common configurations. +struct Configuration { + /// Specify a working directory to use when launching `lldb-dap`. If the debug + /// information in your executable contains relative paths, this option can be + /// used so that `lldb-dap` can find source files and object files that have + /// relative paths. + std::optional<std::string> debuggerRoot; + + /// Enable auto-generated summaries for variables when no summaries exist for + /// a given type. This feature can cause performance delays in large projects + /// when viewing variables. + bool enableAutoVariableSummaries = false; + + /// If a variable is displayed using synthetic children, also display the + /// actual contents of the variable at the end under a [raw] entry. 
This is + /// useful when creating synthetic child plug-ins as it lets you see the actual + /// contents of the variable. + bool enableSyntheticChildDebugging = false; + + /// Enable language-specific extended backtraces. + bool displayExtendedBacktrace = false; + + /// The escape prefix to use for executing regular LLDB commands in the Debug + /// Console, instead of printing variables. Defaults to a backtick. If it's an + /// empty string, then all expressions in the Debug Console are treated as + /// regular LLDB commands. + std::string commandEscapePrefix = "`"; + + /// If non-empty, stack frames will have descriptions generated based on the + /// provided format. See https://lldb.llvm.org/use/formatting.html for an + /// explanation of format strings for frames. If the format string contains + /// errors, an error message will be displayed on the Debug Console and the + /// default frame names will be used. This might come with a performance cost + /// because debug information might need to be processed to generate the + /// description. + std::optional<std::string> customFrameFormat; + + /// Same as `customFrameFormat`, but for threads instead of stack frames. + std::optional<std::string> customThreadFormat; + + /// Specify a source path to remap "./" to allow full paths to be used when + /// setting breakpoints in binaries that have relative source paths. + std::optional<std::string> sourcePath; + + /// Specify an array of path re-mappings. Each element in the array must be a + /// two-element array containing a source and destination pathname. Overrides + /// sourcePath. + std::vector<std::pair<std::string, std::string>> sourceMap; + + /// LLDB commands executed upon debugger startup prior to creating the LLDB + /// target. + std::vector<std::string> preInitCommands; + + /// LLDB commands executed upon debugger startup prior to creating the LLDB + /// target. + std::vector<std::string> initCommands; + + /// LLDB commands executed just before launching/attaching, after the LLDB + /// target has been created. + std::vector<std::string> preRunCommands; + + /// LLDB commands executed just after launching/attaching, after the LLDB + /// target has been created. + std::vector<std::string> postRunCommands; + + /// LLDB commands executed just after each stop. + std::vector<std::string> stopCommands; + + /// LLDB commands executed when the program exits. + std::vector<std::string> exitCommands; + + /// LLDB commands executed when the debugging session ends. + std::vector<std::string> terminateCommands; +}; + /// Arguments for `source` request. struct SourceArguments { /// Specifies the source content to load. 
Either `source.path` or diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.cpp b/lldb/tools/lldb-dap/SourceBreakpoint.cpp index 6d8d3470668c8..a7e00cae36fbc 100644 --- a/lldb/tools/lldb-dap/SourceBreakpoint.cpp +++ b/lldb/tools/lldb-dap/SourceBreakpoint.cpp @@ -324,9 +324,9 @@ bool SourceBreakpoint::BreakpointHitCallback( frame.GetValueForVariablePath(expr, lldb::eDynamicDontRunTarget); if (value.GetError().Fail()) value = frame.EvaluateExpression(expr); - output += - VariableDescription(value, bp->m_dap.enable_auto_variable_summaries) - .display_value; + output += VariableDescription( + value, bp->m_dap.configuration.enableAutoVariableSummaries) + .display_value; } else { output += messagePart.text; } diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index ec87db6aab330..16a90dd20707e 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -289,8 +288,7 @@ validateConnection(llvm::StringRef conn) { static llvm::Error serveConnection(const Socket::SocketProtocol &protocol, const std::string &name, - Log *log, llvm::StringRef program_path, - const ReplMode default_repl_mode, + Log *log, const ReplMode default_repl_mode, const std::vector<std::string> &pre_init_commands) { Status status; static std::unique_ptr<Socket> listener = Socket::Create(protocol, status); @@ -335,8 +333,7 @@ serveConnection(const Socket::SocketProtocol &protocol, const std::string &name, &dap_sessions]() { llvm::set_thread_name(client_name + ".runloop"); Transport transport(client_name, log, io, io); - DAP dap(program_path, log, default_repl_mode, pre_init_commands, - transport); + DAP dap(log, default_repl_mode, pre_init_commands, transport); if (auto Err = dap.ConfigureIO()) { llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), @@ -417,6 +414,7 @@ int main(int argc, char *argv[]) { llvm::SmallString<256> program_path(argv[0]); llvm::sys::fs::make_absolute(program_path); + DAP::debug_adapter_path = program_path; LLDBDAPOptTable T; unsigned MAI, MAC; @@ -553,8 +551,8 @@ int main(int argc, char *argv[]) { Socket::SocketProtocol protocol; std::string name; std::tie(protocol, name) = *maybeProtoclAndName; - if (auto Err = serveConnection(protocol, name, log.get(), program_path, - default_repl_mode, pre_init_commands)) { + if (auto Err = serveConnection(protocol, name, log.get(), default_repl_mode, + pre_init_commands)) { llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), "Connection failed: "); return EXIT_FAILURE; @@ -589,8 +587,7 @@ int main(int argc, char *argv[]) { constexpr llvm::StringLiteral client_name = "stdin/stdout"; Transport transport(client_name, log.get(), input, output); - DAP dap(program_path, log.get(), default_repl_mode, pre_init_commands, - transport); + DAP dap(log.get(), default_repl_mode, pre_init_commands, transport); // stdout/stderr redirection to the IDE's console if (auto Err = dap.ConfigureIO(stdout, stderr)) { From a1bc979aa854c600e64e7500f1b79cd1d2655eb4 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Thu, 3 Apr 2025 09:47:36 -0700 Subject: [PATCH 0555/1029] [mlir][Bufferization] Do not have read semantics for destination of `tensor.parallel_insert_slice`. (#134169) `tensor.insert_slice` needs to have read semantics on its destination operand. Since it has a return value, its semantics are:
- Copy dest to result.
- Copy source to subview of destination.
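To make the read-semantics argument concrete, here is a rough C++ analogy of those two copies (a sketch only; `Tensor` and `Range` are hypothetical stand-ins, not MLIR or LLVM types):

```
#include <cstddef>
#include <vector>

// Hypothetical stand-ins, for illustration only.
struct Range { std::size_t offset, size; };
struct Tensor {
  std::vector<float> data;
  // Copies `src` into the [offset, offset + size) window of this tensor.
  void copyIntoSubview(const Tensor &src, Range r) {
    for (std::size_t i = 0; i < r.size; ++i)
      data[r.offset + i] = src.data[i];
  }
};

// Value semantics of tensor.insert_slice: the destination is read because
// it flows, by copy, into the result.
Tensor insert_slice(const Tensor &source, const Tensor &dest, Range offsets) {
  Tensor result = dest;                    // copy dest to result
  result.copyIntoSubview(source, offsets); // copy source to subview of result
  return result;
}
```

Because `dest` is copied wholesale into `result`, every element of the destination may be observed, which is why the analysis must treat it as read.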
`tensor.parallel_insert_slice`, though, has no result, so it does not need to have read semantics. The op description [here](https://github.com/llvm/llvm-project/blob/a3ac318e5f8668ec5b79dd86639881dfb2e88b69/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td#L1524) also says that it is expected to lower to a `memref.subview`, which does not have read semantics on the destination (it's just a view). This patch drops the read semantics for the destination of `tensor.parallel_insert_slice`, but also makes the `shared_outs` operands of `scf.forall` have read semantics. Earlier, the read semantics of the destination operand of `tensor.parallel_insert_slice` were relied on, indirectly, to propagate read semantics for `shared_outs`. Now that is specified directly. Fixes #133964 --------- Signed-off-by: MaheshRavishankar --- .../BufferizableOpInterfaceImpl.cpp | 28 +++------------ .../BufferizableOpInterfaceImpl.cpp | 3 +- mlir/test/Dialect/SCF/one-shot-bufferize.mlir | 35 +++++++++++++++++++ 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp index f48d2a2df9c3c..cf62ee8bc45b5 100644 --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -1186,18 +1186,6 @@ struct YieldOpInterface } }; -/// Return `true` if the given loop may have 0 iterations. -bool mayHaveZeroIterations(scf::ForallOp forallOp) { - for (auto [lb, ub] : llvm::zip(forallOp.getMixedLowerBound(), - forallOp.getMixedUpperBound())) { - std::optional<int64_t> lbConst = getConstantIntValue(lb); - std::optional<int64_t> ubConst = getConstantIntValue(ub); - if (!lbConst.has_value() || !ubConst.has_value() || *lbConst >= *ubConst) - return true; - } - return false; -} - /// Bufferization of ForallOp. This also bufferizes the terminator of the /// region. There are op interfaces for the terminators (InParallelOp /// and ParallelInsertSliceOp), but these are only used during analysis. Not @@ -1207,17 +1195,11 @@ struct ForallOpInterface : public BufferizableOpInterface::ExternalModel<ForallOpInterface, ForallOp> { bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { - auto forallOp = cast<ForallOp>(op); - - // If the loop has zero iterations, the results of the op are their - // corresponding shared_outs, meaning that the shared_outs bufferize to a - // read. - if (mayHaveZeroIterations(forallOp)) - return true; - - // scf::ForallOp alone doesn't bufferize to a memory read, one of the - // uses of its matching bbArg may. - return state.isValueRead(forallOp.getTiedBlockArgument(&opOperand)); + // All tensor operands to `scf.forall` are `shared_outs`, and all + // shared outs are assumed to be read by the loop. This does not + // account for the case where the entire value is over-written, + // but we err on the conservative side here. 
+ return true; } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp index 4ac6eca586961..31014172a9555 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -930,8 +930,7 @@ struct ParallelInsertSliceOpInterface bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand, const AnalysisState &state) const { - return insertSliceOpRequiresRead(cast<ParallelInsertSliceOp>(op), - opOperand); + return opOperand == cast<ParallelInsertSliceOp>(op).getSourceMutable(); } bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir index bb9f7dfdba83f..a1067ec3ba05f 100644 --- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir @@ -946,3 +946,38 @@ func.func @index_switch(%pred: index, %b: tensor<5xf32>, %c: tensor<5xf32>) -> t // CHECK: return %[[r]] return %0 : tensor<5xf32> } + +// ----- + +// See Issue https://github.com/llvm/llvm-project/issues/133964 . Checks that +// tensor.parallel_insert_slice dest operand does not have read semantics. +func.func @check_scfforall_inplace_bufferizer(%arg0 : tensor<?x?xf32>, + %arg1 : tensor<?x?xf32>, + %arg2 : tensor<?xf32> {bufferization.writable = true}) -> tensor<?xf32> { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %d0 = tensor.dim %arg2, %c0 : tensor<?xf32> + %d1 = tensor.dim %arg1, %c1 : tensor<?x?xf32> + %0 = scf.forall (%arg3) in (%c1) shared_outs(%arg4 = %arg2) -> (tensor<?xf32>) { + %1 = tensor.extract_slice %arg0[0, 0][%d0, %d1][1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + %2 = tensor.extract_slice %arg1[0, 0][%d0, %d1][1, 1] : tensor<?x?xf32> to tensor<?x?xf32> + %3 = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0)>], + iterator_types = ["parallel", "reduction"]} + ins(%1, %2 : tensor<?x?xf32>, tensor<?x?xf32>) + outs(%arg4 : tensor<?xf32>) { + ^bb0(%b0 : f32, %b1: f32, %b2 : f32): + %4 = arith.mulf %b0, %b1 : f32 + %5 = arith.addf %4, %b2 : f32 + linalg.yield %5 : f32 + } -> tensor<?xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg4[0] [%d0] [1] : tensor<?xf32> into tensor<?xf32> + } + } + return %0 : tensor<?xf32> +} +// CHECK-LABEL: func @check_scfforall_inplace_bufferizer +// CHECK-NOT: memref.alloc From 6ddf7cf7808e7e60314ed003d52c215dedc0924b Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Thu, 3 Apr 2025 18:47:56 +0200 Subject: [PATCH 0556/1029] [mlir][bazel] Allow `gentbl_cc_library(tbl_outs)` to be a dict. (#134271) This makes the BUILD file shorter and more readable. I will follow up with converting the other instances. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 14 ++++---------- utils/bazel/llvm-project-overlay/mlir/tblgen.bzl | 8 +++++--- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 0c89b7bf18e0f..83891dcb1e3d2 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -128,16 +128,10 @@ gentbl_cc_library( gentbl_cc_library( name = "TensorEncodingIncGen", - tbl_outs = [ - ( - ["-gen-attr-interface-decls"], - "include/mlir/IR/TensorEncInterfaces.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "include/mlir/IR/TensorEncInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/TensorEncInterfaces.h.inc": ["-gen-attr-interface-decls"], + "include/mlir/IR/TensorEncInterfaces.cpp.inc": ["-gen-attr-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/TensorEncoding.td", deps = [":TensorOpsTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl index e45ba1fe0ef72..b0012848100be 100644 --- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl +++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl @@ -397,9 +397,9 @@ def gentbl_cc_library( name: The name of the generated cc_library rule for use in dependencies. tblgen: The binary used to produce the output. td_file: The primary table definitions file. - tbl_outs: A list of tuples ([opts], out), where each 'opts' is a list of - options passed to tblgen, each option being a string, and 'out' is the - corresponding output file produced. + tbl_outs: Either a dict {out: [opts]} or a list of tuples ([opts], out), + where each 'opts' is a list of options passed to tblgen, each option + being a string, and 'out' is the corresponding output file produced. td_srcs: See gentbl_rule.td_srcs includes: See gentbl_rule.includes deps: See gentbl_rule.deps @@ -409,6 +409,8 @@ def gentbl_cc_library( **kwargs: Extra keyword arguments to pass to all generated rules. """ + if type(tbl_outs) == type({}): + tbl_outs = [(v, k) for k, v in tbl_outs.items()] filegroup_name = name + "_filegroup" gentbl_filegroup( name = filegroup_name, From a54736afd5b8f8ed25550a9f456afd36e49c04e0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Apr 2025 23:52:25 +0700 Subject: [PATCH 0557/1029] CloneFunction: Do not delete blocks with address taken (#134209) If a block with a single predecessor also had its address taken, it was getting deleted in this post-inline cleanup step. This would result in the blockaddress in the resulting function getting deleted and replaced with `inttoptr 1`. This fixes one bug required to permit inlining of functions with blockaddress uses. At the moment this is not testable (at least without an annoyingly complex unit test), and is a preparatory bug fix for future patches. Functions with blockaddress uses are rejected in isInlineViable, so we don't get this far with the current InlineFunction uses (some of the existing cases seem to reproduce this part of the rejection logic, like PartialInliner). This will be tested in a pending llvm-reduce change. 
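To illustrate the hazard (a hedged sketch, not a test from this patch), the GNU labels-as-values extension is one way to get a `blockaddress` into IR:

```
// Sketch only; relies on the GNU labels-as-values extension, which Clang
// and GCC support in both C and C++.
static void *saved;

void remember_label() {
  saved = &&target; // Lowers to a blockaddress constant referring to %target.
  goto *saved;      // Indirect branch through the saved block address.
target:
  return;
}
```

If `remember_label` were inlined and the cloned `target` block were then deleted because it has a single predecessor, the `blockaddress` user would be left dangling, which is exactly the breakage described above.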
Prerequisite for #38908 --- llvm/lib/Transforms/Utils/CloneFunction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index e58585705e82f..9387797019023 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -928,7 +928,7 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, } BasicBlock *Dest = BI->getSuccessor(0); - if (!Dest->getSinglePredecessor()) { + if (!Dest->getSinglePredecessor() || Dest->hasAddressTaken()) { ++I; continue; } From 9a5b0f302be2ce155d5a4a0f0ca997ecb9bca497 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 3 Apr 2025 18:03:16 +0100 Subject: [PATCH 0558/1029] Reapply "[InstCombine] Match scalable splats in m_ImmConstant (#132522)" (#134262) This reapplies #132522. Previously casts of scalable m_ImmConstant splats weren't being folded by ConstantFoldCastOperand, triggering the "Constant-fold of ImmConstant should not fail" assertion. There are no changes to the code in this PR; instead, we just needed #133207 to land first. A test for the assertion has been added in llvm/test/Transforms/InstSimplify/vec-icmp-of-cast.ll (@icmp_ult_sext_scalable_splat_is_true).
#118806 fixed an infinite loop in FoldShiftByConstant that could occur when the shift amount was a ConstantExpr. However, this meant that FoldShiftByConstant no longer kicked in for scalable vectors, because scalable splats are represented by ConstantExprs. This fixes it by allowing scalable splats of non-ConstantExprs in m_ImmConstant, which also fixes a few other test cases where scalable splats were being missed. But I'm also hoping that UseConstantIntForScalableSplat will eventually remove the need for this. I noticed this when trying to reverse a combine on RISC-V in #132245, and saw that the resulting vector and scalar forms were different. --- llvm/include/llvm/IR/PatternMatch.h | 51 +++++++++++++++---- llvm/test/Transforms/InstCombine/select.ll | 3 +- llvm/test/Transforms/InstCombine/shl-bo.ll | 11 ++++ .../InstCombine/shl-twice-constant.ll | 11 ++++ llvm/test/Transforms/InstCombine/sub.ll | 4 +- .../InstSimplify/vec-icmp-of-cast.ll | 9 ++++ 6 files changed, 75 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index b3eeb1d7ba88a..2d27c19e1b85e 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -858,18 +858,51 @@ inline bind_ty<const BasicBlock> m_BasicBlock(const BasicBlock *&V) { return V; } +// TODO: Remove once UseConstant{Int,FP}ForScalableSplat is enabled by default, +// and use m_Unless(m_ConstantExpr). +struct immconstant_ty { + template <typename ITy> static bool isImmConstant(ITy *V) { + if (auto *CV = dyn_cast<Constant>(V)) { + if (!isa<ConstantExpr>(CV) && !CV->containsConstantExpression()) + return true; + + if (CV->getType()->isVectorTy()) { + if (auto *Splat = CV->getSplatValue(/*AllowPoison=*/true)) { + if (!isa<ConstantExpr>(Splat) && + !Splat->containsConstantExpression()) { + return true; + } + } + } + } + return false; + } +}; + +struct match_immconstant_ty : immconstant_ty { + template <typename ITy> bool match(ITy *V) { return isImmConstant(V); } +}; + /// Match an arbitrary immediate Constant and ignore it. -inline match_combine_and<class_match<Constant>, - match_unless<constantexpr_match>> -m_ImmConstant() { - return m_CombineAnd(m_Constant(), m_Unless(m_ConstantExpr())); -} +inline match_immconstant_ty m_ImmConstant() { return match_immconstant_ty(); } + +struct bind_immconstant_ty : immconstant_ty { + Constant *&VR; + + bind_immconstant_ty(Constant *&V) : VR(V) {} + + template <typename ITy> bool match(ITy *V) { + if (isImmConstant(V)) { + VR = cast<Constant>(V); + return true; + } + return false; + } +}; /// Match an immediate Constant, capturing the value if we match. -inline match_combine_and<bind_ty<Constant>, - match_unless<constantexpr_match>> -m_ImmConstant(Constant *&C) { - return m_CombineAnd(m_Constant(C), m_Unless(m_ConstantExpr())); +inline bind_immconstant_ty m_ImmConstant(Constant *&C) { + return bind_immconstant_ty(C); } /// Match a specified Value*. 
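As a concrete illustration of the new matcher behavior, a fold might use it like this (a hedged sketch; `isFoldableImm` is an illustrative helper, not a function from this patch):

```
#include "llvm/IR/Constants.h"
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true for immediate constants. With this patch it also returns true
// for a scalable splat such as `splat (i16 -32768)`: the vector constant is
// represented as a ConstantExpr splat, but its splat value is a plain
// ConstantInt, which is what isImmConstant now inspects.
static bool isFoldableImm(Value *V) {
  Constant *C;
  return match(V, m_ImmConstant(C));
}
```

Previously the scalable splat was rejected outright because the whole vector constant is a ConstantExpr; that is why folds such as the `sub nsw ... splat (i16 -32768)` to `xor` rewrite in the sub.ll test below now fire for scalable vectors too.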
diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 2078b795817f8..3d81b72dd232e 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3519,8 +3519,7 @@ define @scalable_sign_bits( %x) { define @scalable_non_zero( %x) { ; CHECK-LABEL: @scalable_non_zero( -; CHECK-NEXT: [[A:%.*]] = or [[X:%.*]], splat (i32 1) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[A]], splat (i32 57) +; CHECK-NEXT: [[CMP:%.*]] = icmp ult [[X:%.*]], splat (i32 56) ; CHECK-NEXT: ret [[CMP]] ; %a = or %x, splat (i32 1) diff --git a/llvm/test/Transforms/InstCombine/shl-bo.ll b/llvm/test/Transforms/InstCombine/shl-bo.ll index c32ac2eacb25a..5ee8716d5d119 100644 --- a/llvm/test/Transforms/InstCombine/shl-bo.ll +++ b/llvm/test/Transforms/InstCombine/shl-bo.ll @@ -656,3 +656,14 @@ define <16 x i8> @test_FoldShiftByConstant_CreateAnd(<16 x i8> %in0) { %vshl_n = shl <16 x i8> %tmp, ret <16 x i8> %vshl_n } + +define @test_FoldShiftByConstant_CreateAnd_scalable( %x) { +; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd_scalable( +; CHECK-NEXT: [[TMP1:%.*]] = shl [[X:%.*]], splat (i8 2) +; CHECK-NEXT: [[TMP2:%.*]] = and [[TMP1]], splat (i8 8) +; CHECK-NEXT: ret [[TMP2]] +; + %1 = and %x, splat (i8 2) + %2 = shl %1, splat (i8 2) + ret %2 +} diff --git a/llvm/test/Transforms/InstCombine/shl-twice-constant.ll b/llvm/test/Transforms/InstCombine/shl-twice-constant.ll index bbdd7fa3d1c40..151db29fe3e5f 100644 --- a/llvm/test/Transforms/InstCombine/shl-twice-constant.ll +++ b/llvm/test/Transforms/InstCombine/shl-twice-constant.ll @@ -14,3 +14,14 @@ define i64 @testfunc() { %shl2 = shl i64 %shl1, ptrtoint (ptr @c to i64) ret i64 %shl2 } + +define @scalable() { +; CHECK-LABEL: @scalable( +; CHECK-NEXT: [[SHL1:%.*]] = shl nuw splat (i64 1), shufflevector ( insertelement ( poison, i64 ptrtoint (ptr @c2 to i64), i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[SHL2:%.*]] = shl [[SHL1]], shufflevector ( insertelement ( poison, i64 ptrtoint (ptr @c to i64), i64 0), poison, zeroinitializer) +; CHECK-NEXT: ret [[SHL2]] +; + %shl1 = shl splat (i64 1), splat (i64 ptrtoint (ptr @c2 to i64)) + %shl2 = shl %shl1, splat (i64 ptrtoint (ptr @c to i64)) + ret %shl2 +} diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index e89419d1f3838..81ecd8506514e 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -857,11 +857,9 @@ define <2 x i16> @test44vecminval(<2 x i16> %x) { ret <2 x i16> %sub } -; FIXME: This isn't combined to xor as above because the pattern in visitSub -; uses m_ImmConstant which matches Constant but (explicitly) not ConstantExpr. 
define <vscale x 2 x i16> @test44scalablevecminval(<vscale x 2 x i16> %x) { ; CHECK-LABEL: @test44scalablevecminval( ; CHECK-NEXT: [[SUB:%.*]] = xor <vscale x 2 x i16> [[X:%.*]], splat (i16 -32768) ; CHECK-NEXT: ret <vscale x 2 x i16> [[SUB]] ; %sub = sub nsw <vscale x 2 x i16> %x, splat (i16 -32768) ret <vscale x 2 x i16> %sub } diff --git a/llvm/test/Transforms/InstSimplify/vec-icmp-of-cast.ll b/llvm/test/Transforms/InstSimplify/vec-icmp-of-cast.ll index 8b27ab1f0ef26..3314ef6ff9715 100644 --- a/llvm/test/Transforms/InstSimplify/vec-icmp-of-cast.ll +++ b/llvm/test/Transforms/InstSimplify/vec-icmp-of-cast.ll @@ -174,3 +174,12 @@ define <2 x i1> @icmp_slt_sext_is_true_false(<2 x i8> %x) { %cmp = icmp slt <2 x i32> %xext, ret <2 x i1> %cmp } + +define <vscale x 2 x i1> @icmp_ult_sext_scalable_splat_is_true(<vscale x 2 x i8> %x) { +; CHECK-LABEL: @icmp_ult_sext_scalable_splat_is_true( +; CHECK-NEXT: ret <vscale x 2 x i1> splat (i1 true) +; + %s = sext <vscale x 2 x i8> %x to <vscale x 2 x i64> + %cmp = icmp slt <vscale x 2 x i64> %s, splat (i64 257) + ret <vscale x 2 x i1> %cmp +} From 2abcdd8cf08b9a170e6e5ad1b9facbf71135522f Mon Sep 17 00:00:00 2001 From: Austin Schuh Date: Thu, 3 Apr 2025 10:08:02 -0700 Subject: [PATCH 0559/1029] [CUDA] Add support for CUDA surfaces (#132883) This adds support for all the surface read and write calls to clang. It extends the pattern used for textures to surfaces too. I tested this by generating all the various permutations of the calls and argument types in a Python script, compiling them with both clang and nvcc, and comparing the generated PTX for equivalence. They all agree, ignoring register allocation and some places where Clang picks different memory write instructions. An example kernel is:
```
__global__ void testKernel(cudaSurfaceObject_t surfObj, int x, float2* result) {
  *result = surf1Dread<float2>(surfObj, x, cudaBoundaryModeZero);
}
```
--------- Signed-off-by: Austin Schuh --- .../Headers/__clang_cuda_runtime_wrapper.h | 1 + .../Headers/__clang_cuda_texture_intrinsics.h | 439 ++- clang/test/CodeGen/nvptx-surface.cu | 3329 +++++++++++++++++ clang/test/Headers/Inputs/include/cuda.h | 47 + .../include/surface_indirect_functions.h | 2 + 5 files changed, 3816 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/nvptx-surface.cu create mode 100644 clang/test/Headers/Inputs/include/surface_indirect_functions.h diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h index d369c86fe1064..44934ba2c2d67 100644 --- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h +++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h @@ -384,6 +384,7 @@ __host__ __device__ void __nv_tex_surf_handler(const char *name, T *ptr, // will continue to fail as it does now. 
#endif // CUDA_VERSION #endif // __cplusplus >= 201103L && CUDA_VERSION >= 9000 +#include "surface_indirect_functions.h" #include "texture_fetch_functions.h" #include "texture_indirect_functions.h" diff --git a/clang/lib/Headers/__clang_cuda_texture_intrinsics.h b/clang/lib/Headers/__clang_cuda_texture_intrinsics.h index a71952211237b..8b914ed50b5fc 100644 --- a/clang/lib/Headers/__clang_cuda_texture_intrinsics.h +++ b/clang/lib/Headers/__clang_cuda_texture_intrinsics.h @@ -28,6 +28,7 @@ #pragma push_macro("__Args") #pragma push_macro("__ID") #pragma push_macro("__IDV") +#pragma push_macro("__OP_TYPE_SURFACE") #pragma push_macro("__IMPL_2DGATHER") #pragma push_macro("__IMPL_ALIAS") #pragma push_macro("__IMPL_ALIASI") @@ -45,6 +46,63 @@ #pragma push_macro("__IMPL_SI") #pragma push_macro("__L") #pragma push_macro("__STRIP_PARENS") +#pragma push_macro("__SURF_WRITE_V2") +#pragma push_macro("__SW_ASM_ARGS") +#pragma push_macro("__SW_ASM_ARGS1") +#pragma push_macro("__SW_ASM_ARGS2") +#pragma push_macro("__SW_ASM_ARGS4") +#pragma push_macro("__SURF_WRITE_V2") +#pragma push_macro("__SURF_READ_V2") +#pragma push_macro("__SW_ASM_ARGS") +#pragma push_macro("__SW_ASM_ARGS1") +#pragma push_macro("__SW_ASM_ARGS2") +#pragma push_macro("__SW_ASM_ARGS4") +#pragma push_macro("__SURF_READ1D"); +#pragma push_macro("__SURF_READ2D"); +#pragma push_macro("__SURF_READ3D"); +#pragma push_macro("__SURF_READ1DLAYERED"); +#pragma push_macro("__SURF_READ2DLAYERED"); +#pragma push_macro("__SURF_READCUBEMAP"); +#pragma push_macro("__SURF_READCUBEMAPLAYERED"); +#pragma push_macro("__1DV1"); +#pragma push_macro("__1DV2"); +#pragma push_macro("__1DV4"); +#pragma push_macro("__2DV1"); +#pragma push_macro("__2DV2"); +#pragma push_macro("__2DV4"); +#pragma push_macro("__1DLAYERV1"); +#pragma push_macro("__1DLAYERV2"); +#pragma push_macro("__1DLAYERV4"); +#pragma push_macro("__3DV1"); +#pragma push_macro("__3DV2"); +#pragma push_macro("__3DV4"); +#pragma push_macro("__2DLAYERV1"); +#pragma push_macro("__2DLAYERV2"); +#pragma push_macro("__2DLAYERV4"); +#pragma push_macro("__CUBEMAPV1"); +#pragma push_macro("__CUBEMAPV2"); +#pragma push_macro("__CUBEMAPV4"); +#pragma push_macro("__CUBEMAPLAYERV1"); +#pragma push_macro("__CUBEMAPLAYERV2"); +#pragma push_macro("__CUBEMAPLAYERV4"); +#pragma push_macro("__SURF_READXD_ALL"); +#pragma push_macro("__SURF_WRITE1D_V2"); +#pragma push_macro("__SURF_WRITE1DLAYERED_V2"); +#pragma push_macro("__SURF_WRITE2D_V2"); +#pragma push_macro("__SURF_WRITE2DLAYERED_V2"); +#pragma push_macro("__SURF_WRITE3D_V2"); +#pragma push_macro("__SURF_CUBEMAPWRITE_V2"); +#pragma push_macro("__SURF_CUBEMAPLAYEREDWRITE_V2"); +#pragma push_macro("__SURF_WRITEXD_V2_ALL"); +#pragma push_macro("__1DV1"); +#pragma push_macro("__1DV2"); +#pragma push_macro("__1DV4"); +#pragma push_macro("__2DV1"); +#pragma push_macro("__2DV2"); +#pragma push_macro("__2DV4"); +#pragma push_macro("__3DV1"); +#pragma push_macro("__3DV2"); +#pragma push_macro("__3DV4"); // Put all functions into anonymous namespace so they have internal linkage. 
// The device-only function here must be internal in order to avoid ODR @@ -186,6 +244,21 @@ template struct __TypeInfoT { using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t; }; +// Tag structs to distinguish operation types +struct __texture_op_tag {}; +struct __surface_op_tag {}; + +// Template specialization to determine operation type based on tag value +template struct __op_type_traits { + using type = __texture_op_tag; +}; + +// Specialize for known surface operation tags +#define __OP_TYPE_SURFACE(__op) \ + template <> struct __op_type_traits<__op> { \ + using type = __surface_op_tag; \ + } + // Classes that implement specific texture ops. template struct __tex_fetch_v4; @@ -649,6 +722,302 @@ template struct __convert { } }; +// There are a couple of layers here. First, __op_type_traits is used to +// dispatch to either surface write calls, or to the texture read calls. +// +// Then, that dispatches to __tex_fetch_impl below, which dispatches by both tag +// and datatype to the appropriate +// __surf_read_write_v2. +// TODO(austin): Do the reads too. + +// Mark which of the ids we should be dispatching to surface write calls. +__OP_TYPE_SURFACE(__ID("__isurf1Dread")); +__OP_TYPE_SURFACE(__ID("__isurf2Dread")); +__OP_TYPE_SURFACE(__ID("__isurf3Dread")); +__OP_TYPE_SURFACE(__ID("__isurf1DLayeredread")); +__OP_TYPE_SURFACE(__ID("__isurf2DLayeredread")); +__OP_TYPE_SURFACE(__ID("__isurfCubemapread")); +__OP_TYPE_SURFACE(__ID("__isurfCubemapLayeredread")); +__OP_TYPE_SURFACE(__ID("__isurf1Dwrite_v2")); +__OP_TYPE_SURFACE(__ID("__isurf2Dwrite_v2")); +__OP_TYPE_SURFACE(__ID("__isurf3Dwrite_v2")); +__OP_TYPE_SURFACE(__ID("__isurf1DLayeredwrite_v2")); +__OP_TYPE_SURFACE(__ID("__isurf2DLayeredwrite_v2")); +__OP_TYPE_SURFACE(__ID("__isurfCubemapwrite_v2")); +__OP_TYPE_SURFACE(__ID("__isurfCubemapLayeredwrite_v2")); + +template struct __surf_read_write_v2; + +// For the various write calls, we need to be able to generate variations with +// different IDs, different numbers of arguments, and different numbers of +// outputs. + +#define __SURF_WRITE_V2(__op, __asm_dim, __asmtype, __type, __index_op_args, \ + __index_args, __index_asm_args, __asm_op_args, \ + __asm_args) \ + template <> struct __surf_read_write_v2<__op, __type> { \ + static __device__ void __run(__type *__ptr, cudaSurfaceObject_t obj, \ + __L(__index_args), \ + cudaSurfaceBoundaryMode mode) { \ + switch (mode) { \ + case cudaBoundaryModeZero: \ + asm volatile("sust.b." __asm_dim "." __asmtype \ + ".zero [%0, " __index_op_args "], " __asm_op_args ";" \ + : \ + : "l"(obj), __L(__index_asm_args), __L(__asm_args)); \ + break; \ + case cudaBoundaryModeClamp: \ + asm volatile("sust.b." __asm_dim "." __asmtype \ + ".clamp [%0, " __index_op_args "], " __asm_op_args ";" \ + : \ + : "l"(obj), __L(__index_asm_args), __L(__asm_args)); \ + break; \ + case cudaBoundaryModeTrap: \ + asm volatile("sust.b." __asm_dim "." __asmtype \ + ".trap [%0, " __index_op_args "], " __asm_op_args ";" \ + : \ + : "l"(obj), __L(__index_asm_args), __L(__asm_args)); \ + break; \ + } \ + } \ + } + +#define __SURF_READ_V2(__op, __asm_dim, __asmtype, __type, __asm_op_args, \ + __asm_args, __index_args, __index_asm_args) \ + template <> struct __surf_read_write_v2<__op, __type> { \ + static __device__ void __run(__type *__ptr, cudaSurfaceObject_t obj, \ + __L(__index_args), \ + cudaSurfaceBoundaryMode mode) { \ + switch (mode) { \ + case cudaBoundaryModeZero: \ + asm("suld.b." __asm_dim "." 
__asmtype ".zero " __asm_op_args ";" \ + : __L(__asm_args) \ + : "l"(obj), __L(__index_asm_args)); \ + break; \ + case cudaBoundaryModeClamp: \ + asm("suld.b." __asm_dim "." __asmtype ".clamp " __asm_op_args ";" \ + : __L(__asm_args) \ + : "l"(obj), __L(__index_asm_args)); \ + break; \ + case cudaBoundaryModeTrap: \ + asm("suld.b." __asm_dim "." __asmtype ".trap " __asm_op_args ";" \ + : __L(__asm_args) \ + : "l"(obj), __L(__index_asm_args)); \ + break; \ + } \ + } \ + } + +// Amazing, the read side should follow the same flow, I just need to change the +// generated assembly calls, and the rest should fall in line. + +#define __SW_ASM_ARGS(__type) (__type(*__ptr)) +#define __SW_ASM_ARGS1(__type) (__type(__ptr->x)) +#define __SW_ASM_ARGS2(__type) (__type(__ptr->x), __type(__ptr->y)) +#define __SW_ASM_ARGS4(__type) \ + (__type(__ptr->x), __type(__ptr->y), __type(__ptr->z), __type(__ptr->w)) + +#define __SURF_READ1D(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_READ_V2(__ID("__isurf1Dread"), "1d", __asmtype, __type, \ + __asm_op_args, __asm_args, (int x), ("r"(x))) +#define __SURF_READ2D(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_READ_V2(__ID("__isurf2Dread"), "2d", __asmtype, __type, \ + __asm_op_args, __asm_args, (int x, int y), ("r"(x), "r"(y))) +#define __SURF_READ3D(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_READ_V2(__ID("__isurf3Dread"), "3d", __asmtype, __type, \ + __asm_op_args, __asm_args, (int x, int y, int z), \ + ("r"(x), "r"(y), "r"(z))) + +#define __SURF_READ1DLAYERED(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_READ_V2(__ID("__isurf1DLayeredread"), "a1d", __asmtype, __type, \ + __asm_op_args, __asm_args, (int x, int layer), \ + ("r"(x), "r"(layer))) +#define __SURF_READ2DLAYERED(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_READ_V2(__ID("__isurf2DLayeredread"), "a2d", __asmtype, __type, \ + __asm_op_args, __asm_args, (int x, int y, int layer), \ + ("r"(x), "r"(y), "r"(layer))) +#define __SURF_READCUBEMAP(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_READ_V2(__ID("__isurfCubemapread"), "a2d", __asmtype, __type, \ + __asm_op_args, __asm_args, (int x, int y, int face), \ + ("r"(x), "r"(y), "r"(face))) +#define __SURF_READCUBEMAPLAYERED(__asmtype, __type, __asm_op_args, \ + __asm_args) \ + __SURF_READ_V2(__ID("__isurfCubemapLayeredread"), "a2d", __asmtype, __type, \ + __asm_op_args, __asm_args, (int x, int y, int layerface), \ + ("r"(x), "r"(y), "r"(layerface))) + +#define __1DV1 "{%0}, [%1, {%2}]" +#define __1DV2 "{%0, %1}, [%2, {%3}]" +#define __1DV4 "{%0, %1, %2, %3}, [%4, {%5}]" + +#define __2DV1 "{%0}, [%1, {%2, %3}]" +#define __2DV2 "{%0, %1}, [%2, {%3, %4}]" +#define __2DV4 "{%0, %1, %2, %3}, [%4, {%5, %6}]" + +#define __1DLAYERV1 "{%0}, [%1, {%3, %2}]" +#define __1DLAYERV2 "{%0, %1}, [%2, {%4, %3}]" +#define __1DLAYERV4 "{%0, %1, %2, %3}, [%4, {%6, %5}]" + +#define __3DV1 "{%0}, [%1, {%2, %3, %4, %4}]" +#define __3DV2 "{%0, %1}, [%2, {%3, %4, %5, %5}]" +#define __3DV4 "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}]" + +#define __2DLAYERV1 "{%0}, [%1, {%4, %2, %3, %3}]" +#define __2DLAYERV2 "{%0, %1}, [%2, {%5, %3, %4, %4}]" +#define __2DLAYERV4 "{%0, %1, %2, %3}, [%4, {%7, %5, %6, %6}]" + +#define __CUBEMAPV1 "{%0}, [%1, {%4, %2, %3, %3}]" +#define __CUBEMAPV2 "{%0, %1}, [%2, {%5, %3, %4, %4}]" +#define __CUBEMAPV4 "{%0, %1, %2, %3}, [%4, {%7, %5, %6, %6}]" + +#define __CUBEMAPLAYERV1 "{%0}, [%1, {%4, %2, %3, %3}]" +#define __CUBEMAPLAYERV2 "{%0, %1}, [%2, {%5, %3, %4, %4}]" +#define __CUBEMAPLAYERV4 "{%0, 
%1, %2, %3}, [%4, {%7, %5, %6, %6}]" + +#define __SURF_READXD_ALL(__xdv1, __xdv2, __xdv4, __surf_readxd_v2) \ + __surf_readxd_v2("b8", char, __xdv1, __SW_ASM_ARGS("=h")); \ + __surf_readxd_v2("b8", signed char, __xdv1, __SW_ASM_ARGS("=h")); \ + __surf_readxd_v2("b8", char1, __xdv1, __SW_ASM_ARGS1("=h")); \ + __surf_readxd_v2("b8", unsigned char, __xdv1, __SW_ASM_ARGS("=h")); \ + __surf_readxd_v2("b8", uchar1, __xdv1, __SW_ASM_ARGS1("=h")); \ + __surf_readxd_v2("b16", short, __xdv1, __SW_ASM_ARGS("=h")); \ + __surf_readxd_v2("b16", short1, __xdv1, __SW_ASM_ARGS1("=h")); \ + __surf_readxd_v2("b16", unsigned short, __xdv1, __SW_ASM_ARGS("=h")); \ + __surf_readxd_v2("b16", ushort1, __xdv1, __SW_ASM_ARGS1("=h")); \ + __surf_readxd_v2("b32", int, __xdv1, __SW_ASM_ARGS("=r")); \ + __surf_readxd_v2("b32", int1, __xdv1, __SW_ASM_ARGS1("=r")); \ + __surf_readxd_v2("b32", unsigned int, __xdv1, __SW_ASM_ARGS("=r")); \ + __surf_readxd_v2("b32", uint1, __xdv1, __SW_ASM_ARGS1("=r")); \ + __surf_readxd_v2("b64", long long, __xdv1, __SW_ASM_ARGS("=l")); \ + __surf_readxd_v2("b64", longlong1, __xdv1, __SW_ASM_ARGS1("=l")); \ + __surf_readxd_v2("b64", unsigned long long, __xdv1, __SW_ASM_ARGS("=l")); \ + __surf_readxd_v2("b64", ulonglong1, __xdv1, __SW_ASM_ARGS1("=l")); \ + __surf_readxd_v2("b32", float, __xdv1, __SW_ASM_ARGS("=r")); \ + __surf_readxd_v2("b32", float1, __xdv1, __SW_ASM_ARGS1("=r")); \ + \ + __surf_readxd_v2("v2.b8", char2, __xdv2, __SW_ASM_ARGS2("=h")); \ + __surf_readxd_v2("v2.b8", uchar2, __xdv2, __SW_ASM_ARGS2("=h")); \ + __surf_readxd_v2("v2.b16", short2, __xdv2, __SW_ASM_ARGS2("=h")); \ + __surf_readxd_v2("v2.b16", ushort2, __xdv2, __SW_ASM_ARGS2("=h")); \ + __surf_readxd_v2("v2.b32", int2, __xdv2, __SW_ASM_ARGS2("=r")); \ + __surf_readxd_v2("v2.b32", uint2, __xdv2, __SW_ASM_ARGS2("=r")); \ + __surf_readxd_v2("v2.b64", longlong2, __xdv2, __SW_ASM_ARGS2("=l")); \ + __surf_readxd_v2("v2.b64", ulonglong2, __xdv2, __SW_ASM_ARGS2("=l")); \ + __surf_readxd_v2("v2.b32", float2, __xdv2, __SW_ASM_ARGS2("=r")); \ + \ + __surf_readxd_v2("v4.b8", char4, __xdv4, __SW_ASM_ARGS4("=h")); \ + __surf_readxd_v2("v4.b8", uchar4, __xdv4, __SW_ASM_ARGS4("=h")); \ + __surf_readxd_v2("v4.b16", short4, __xdv4, __SW_ASM_ARGS4("=h")); \ + __surf_readxd_v2("v4.b16", ushort4, __xdv4, __SW_ASM_ARGS4("=h")); \ + __surf_readxd_v2("v4.b32", int4, __xdv4, __SW_ASM_ARGS4("=r")); \ + __surf_readxd_v2("v4.b32", uint4, __xdv4, __SW_ASM_ARGS4("=r")); \ + __surf_readxd_v2("v4.b32", float4, __xdv4, __SW_ASM_ARGS4("=r")) + +__SURF_READXD_ALL(__1DV1, __1DV2, __1DV4, __SURF_READ1D); +__SURF_READXD_ALL(__2DV1, __2DV2, __2DV4, __SURF_READ2D); +__SURF_READXD_ALL(__3DV1, __3DV2, __3DV4, __SURF_READ3D); +__SURF_READXD_ALL(__1DLAYERV1, __1DLAYERV2, __1DLAYERV4, __SURF_READ1DLAYERED); +__SURF_READXD_ALL(__2DLAYERV1, __2DLAYERV2, __2DLAYERV4, __SURF_READ2DLAYERED); +__SURF_READXD_ALL(__CUBEMAPV1, __CUBEMAPV2, __CUBEMAPV4, __SURF_READCUBEMAP); +__SURF_READXD_ALL(__CUBEMAPLAYERV1, __CUBEMAPLAYERV2, __CUBEMAPLAYERV4, + __SURF_READCUBEMAPLAYERED); + +#define __SURF_WRITE1D_V2(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_WRITE_V2(__ID("__isurf1Dwrite_v2"), "1d", __asmtype, __type, "{%1}", \ + (int x), ("r"(x)), __asm_op_args, __asm_args) +#define __SURF_WRITE1DLAYERED_V2(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_WRITE_V2(__ID("__isurf1DLayeredwrite_v2"), "a1d", __asmtype, __type, \ + "{%2, %1}", (int x, int layer), ("r"(x), "r"(layer)), \ + __asm_op_args, __asm_args) +#define __SURF_WRITE2D_V2(__asmtype, __type, 
__asm_op_args, __asm_args) \ + __SURF_WRITE_V2(__ID("__isurf2Dwrite_v2"), "2d", __asmtype, __type, \ + "{%1, %2}", (int x, int y), ("r"(x), "r"(y)), __asm_op_args, \ + __asm_args) +#define __SURF_WRITE2DLAYERED_V2(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_WRITE_V2(__ID("__isurf2DLayeredwrite_v2"), "a2d", __asmtype, __type, \ + "{%3, %1, %2, %2}", (int x, int y, int layer), \ + ("r"(x), "r"(y), "r"(layer)), __asm_op_args, __asm_args) +#define __SURF_WRITE3D_V2(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_WRITE_V2(__ID("__isurf3Dwrite_v2"), "3d", __asmtype, __type, \ + "{%1, %2, %3, %3}", (int x, int y, int z), \ + ("r"(x), "r"(y), "r"(z)), __asm_op_args, __asm_args) + +#define __SURF_CUBEMAPWRITE_V2(__asmtype, __type, __asm_op_args, __asm_args) \ + __SURF_WRITE_V2(__ID("__isurfCubemapwrite_v2"), "a2d", __asmtype, __type, \ + "{%3, %1, %2, %2}", (int x, int y, int face), \ + ("r"(x), "r"(y), "r"(face)), __asm_op_args, __asm_args) +#define __SURF_CUBEMAPLAYEREDWRITE_V2(__asmtype, __type, __asm_op_args, \ + __asm_args) \ + __SURF_WRITE_V2(__ID("__isurfCubemapLayeredwrite_v2"), "a2d", __asmtype, \ + __type, "{%3, %1, %2, %2}", (int x, int y, int layerface), \ + ("r"(x), "r"(y), "r"(layerface)), __asm_op_args, __asm_args) + +#define __SURF_WRITEXD_V2_ALL(__xdv1, __xdv2, __xdv4, __surf_writexd_v2) \ + __surf_writexd_v2("b8", char, __xdv1, __SW_ASM_ARGS("h")); \ + __surf_writexd_v2("b8", signed char, __xdv1, __SW_ASM_ARGS("h")); \ + __surf_writexd_v2("b8", char1, __xdv1, __SW_ASM_ARGS1("h")); \ + __surf_writexd_v2("b8", unsigned char, __xdv1, __SW_ASM_ARGS("h")); \ + __surf_writexd_v2("b8", uchar1, __xdv1, __SW_ASM_ARGS1("h")); \ + __surf_writexd_v2("b16", short, __xdv1, __SW_ASM_ARGS("h")); \ + __surf_writexd_v2("b16", short1, __xdv1, __SW_ASM_ARGS1("h")); \ + __surf_writexd_v2("b16", unsigned short, __xdv1, __SW_ASM_ARGS("h")); \ + __surf_writexd_v2("b16", ushort1, __xdv1, __SW_ASM_ARGS1("h")); \ + __surf_writexd_v2("b32", int, __xdv1, __SW_ASM_ARGS("r")); \ + __surf_writexd_v2("b32", int1, __xdv1, __SW_ASM_ARGS1("r")); \ + __surf_writexd_v2("b32", unsigned int, __xdv1, __SW_ASM_ARGS("r")); \ + __surf_writexd_v2("b32", uint1, __xdv1, __SW_ASM_ARGS1("r")); \ + __surf_writexd_v2("b64", long long, __xdv1, __SW_ASM_ARGS("l")); \ + __surf_writexd_v2("b64", longlong1, __xdv1, __SW_ASM_ARGS1("l")); \ + __surf_writexd_v2("b64", unsigned long long, __xdv1, __SW_ASM_ARGS("l")); \ + __surf_writexd_v2("b64", ulonglong1, __xdv1, __SW_ASM_ARGS1("l")); \ + __surf_writexd_v2("b32", float, __xdv1, __SW_ASM_ARGS("r")); \ + __surf_writexd_v2("b32", float1, __xdv1, __SW_ASM_ARGS1("r")); \ + \ + __surf_writexd_v2("v2.b8", char2, __xdv2, __SW_ASM_ARGS2("h")); \ + __surf_writexd_v2("v2.b8", uchar2, __xdv2, __SW_ASM_ARGS2("h")); \ + __surf_writexd_v2("v2.b16", short2, __xdv2, __SW_ASM_ARGS2("h")); \ + __surf_writexd_v2("v2.b16", ushort2, __xdv2, __SW_ASM_ARGS2("h")); \ + __surf_writexd_v2("v2.b32", int2, __xdv2, __SW_ASM_ARGS2("r")); \ + __surf_writexd_v2("v2.b32", uint2, __xdv2, __SW_ASM_ARGS2("r")); \ + __surf_writexd_v2("v2.b64", longlong2, __xdv2, __SW_ASM_ARGS2("l")); \ + __surf_writexd_v2("v2.b64", ulonglong2, __xdv2, __SW_ASM_ARGS2("l")); \ + __surf_writexd_v2("v2.b32", float2, __xdv2, __SW_ASM_ARGS2("r")); \ + \ + __surf_writexd_v2("v4.b8", char4, __xdv4, __SW_ASM_ARGS4("h")); \ + __surf_writexd_v2("v4.b8", uchar4, __xdv4, __SW_ASM_ARGS4("h")); \ + __surf_writexd_v2("v4.b16", short4, __xdv4, __SW_ASM_ARGS4("h")); \ + __surf_writexd_v2("v4.b16", ushort4, __xdv4, __SW_ASM_ARGS4("h")); \ 
+ __surf_writexd_v2("v4.b32", int4, __xdv4, __SW_ASM_ARGS4("r")); \ + __surf_writexd_v2("v4.b32", uint4, __xdv4, __SW_ASM_ARGS4("r")); \ + __surf_writexd_v2("v4.b32", float4, __xdv4, __SW_ASM_ARGS4("r")) + +#define __1DV1 "{%2}" +#define __1DV2 "{%2, %3}" +#define __1DV4 "{%2, %3, %4, %5}" + +#define __2DV1 "{%3}" +#define __2DV2 "{%3, %4}" +#define __2DV4 "{%3, %4, %5, %6}" + +#define __3DV1 "{%4}" +#define __3DV2 "{%4, %5}" +#define __3DV4 "{%4, %5, %6, %7}" + +__SURF_WRITEXD_V2_ALL(__1DV1, __1DV2, __1DV4, __SURF_WRITE1D_V2); +__SURF_WRITEXD_V2_ALL(__2DV1, __2DV2, __2DV4, __SURF_WRITE2D_V2); +__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_WRITE3D_V2); +__SURF_WRITEXD_V2_ALL(__2DV1, __2DV2, __2DV4, __SURF_WRITE1DLAYERED_V2); +__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_WRITE2DLAYERED_V2); +__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_CUBEMAPWRITE_V2); +__SURF_WRITEXD_V2_ALL(__3DV1, __3DV2, __3DV4, __SURF_CUBEMAPLAYEREDWRITE_V2); + +template +__device__ static void __tex_fetch_impl(__surface_op_tag, __DataT *__ptr, + cudaSurfaceObject_t __handle, + __Args... __args) { + __surf_read_write_v2<__op, __DataT>::__run(__ptr, __handle, __args...); +} + // These are the top-level function overloads the __nv_tex_surf_handler expands // to. Each overload deals with one of the several ways __nv_tex_surf_handler // is called by CUDA headers. In the end, each of the overloads does the same @@ -659,13 +1028,21 @@ template struct __convert { // __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...); // Data type and return type are based on ret. template -__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle, - __Args... __args) { +__device__ static void __tex_fetch_impl(__texture_op_tag, __T *__ptr, + cudaTextureObject_t __handle, + __Args... __args) { using __FetchT = typename __TypeInfoT<__T>::__fetch_t; *__ptr = __convert<__T, __FetchT>::__run( __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...)); } +template +__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle, + __Args... __args) { + using op_type = typename __op_type_traits<__op>::type; + __tex_fetch_impl<__op>(op_type{}, __ptr, __handle, __args...); +} + #if CUDA_VERSION < 12000 // texture<> objects get magically converted into a texture reference. However, // there's no way to convert them to cudaTextureObject_t on C++ level. 
So, we @@ -722,6 +1099,7 @@ __tex_fetch(__DataT *, __RetT *__ptr, #pragma pop_macro("__Args") #pragma pop_macro("__ID") #pragma pop_macro("__IDV") +#pragma pop_macro("__OP_TYPE_SURFACE") #pragma pop_macro("__IMPL_2DGATHER") #pragma pop_macro("__IMPL_ALIAS") #pragma pop_macro("__IMPL_ALIASI") @@ -739,4 +1117,61 @@ __tex_fetch(__DataT *, __RetT *__ptr, #pragma pop_macro("__IMPL_SI") #pragma pop_macro("__L") #pragma pop_macro("__STRIP_PARENS") +#pragma pop_macro("__SURF_WRITE_V2") +#pragma pop_macro("__SW_ASM_ARGS") +#pragma pop_macro("__SW_ASM_ARGS1") +#pragma pop_macro("__SW_ASM_ARGS2") +#pragma pop_macro("__SW_ASM_ARGS4") +#pragma pop_macro("__SURF_WRITE_V2") +#pragma pop_macro("__SURF_READ_V2") +#pragma pop_macro("__SW_ASM_ARGS") +#pragma pop_macro("__SW_ASM_ARGS1") +#pragma pop_macro("__SW_ASM_ARGS2") +#pragma pop_macro("__SW_ASM_ARGS4") +#pragma pop_macro("__SURF_READ1D"); +#pragma pop_macro("__SURF_READ2D"); +#pragma pop_macro("__SURF_READ3D"); +#pragma pop_macro("__SURF_READ1DLAYERED"); +#pragma pop_macro("__SURF_READ2DLAYERED"); +#pragma pop_macro("__SURF_READCUBEMAP"); +#pragma pop_macro("__SURF_READCUBEMAPLAYERED"); +#pragma pop_macro("__1DV1"); +#pragma pop_macro("__1DV2"); +#pragma pop_macro("__1DV4"); +#pragma pop_macro("__2DV1"); +#pragma pop_macro("__2DV2"); +#pragma pop_macro("__2DV4"); +#pragma pop_macro("__1DLAYERV1"); +#pragma pop_macro("__1DLAYERV2"); +#pragma pop_macro("__1DLAYERV4"); +#pragma pop_macro("__3DV1"); +#pragma pop_macro("__3DV2"); +#pragma pop_macro("__3DV4"); +#pragma pop_macro("__2DLAYERV1"); +#pragma pop_macro("__2DLAYERV2"); +#pragma pop_macro("__2DLAYERV4"); +#pragma pop_macro("__CUBEMAPV1"); +#pragma pop_macro("__CUBEMAPV2"); +#pragma pop_macro("__CUBEMAPV4"); +#pragma pop_macro("__CUBEMAPLAYERV1"); +#pragma pop_macro("__CUBEMAPLAYERV2"); +#pragma pop_macro("__CUBEMAPLAYERV4"); +#pragma pop_macro("__SURF_READXD_ALL"); +#pragma pop_macro("__SURF_WRITE1D_V2"); +#pragma pop_macro("__SURF_WRITE1DLAYERED_V2"); +#pragma pop_macro("__SURF_WRITE2D_V2"); +#pragma pop_macro("__SURF_WRITE2DLAYERED_V2"); +#pragma pop_macro("__SURF_WRITE3D_V2"); +#pragma pop_macro("__SURF_CUBEMAPWRITE_V2"); +#pragma pop_macro("__SURF_CUBEMAPLAYEREDWRITE_V2"); +#pragma pop_macro("__SURF_WRITEXD_V2_ALL"); +#pragma pop_macro("__1DV1"); +#pragma pop_macro("__1DV2"); +#pragma pop_macro("__1DV4"); +#pragma pop_macro("__2DV1"); +#pragma pop_macro("__2DV2"); +#pragma pop_macro("__2DV4"); +#pragma pop_macro("__3DV1"); +#pragma pop_macro("__3DV2"); +#pragma pop_macro("__3DV4"); #endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__ diff --git a/clang/test/CodeGen/nvptx-surface.cu b/clang/test/CodeGen/nvptx-surface.cu new file mode 100644 index 0000000000000..7c42e5d118153 --- /dev/null +++ b/clang/test/CodeGen/nvptx-surface.cu @@ -0,0 +1,3329 @@ +// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -O3 -o - %s -emit-llvm | FileCheck %s +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -O3 -o - %s -emit-llvm | FileCheck %s +#include "../Headers/Inputs/include/cuda.h" + +#include "__clang_cuda_texture_intrinsics.h" + +__device__ void surfchar(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + char val; + + // CHECK: %0 = tail call i8 asm "suld.b.1d.b8.zero {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.zero [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // 
CHECK: %1 = tail call i8 asm "suld.b.1d.b8.clamp {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.clamp [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i8 asm "suld.b.1d.b8.trap {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.trap [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i8 asm "suld.b.2d.b8.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.zero [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i8 asm "suld.b.2d.b8.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.clamp [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i8 asm "suld.b.2d.b8.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i8 asm "suld.b.3d.b8.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i8 asm "suld.b.3d.b8.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i8 asm "suld.b.3d.b8.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i8 asm "suld.b.a1d.b8.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.zero [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i8 asm "suld.b.a1d.b8.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.clamp [$0, {$2, $1}], {$3};", "l,r,r,h" + 
__nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i8 asm "suld.b.a1d.b8.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.trap [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect 
"sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfsignedchar(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + signed char val; + + // CHECK: %0 = tail call i8 asm "suld.b.1d.b8.zero {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.zero [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i8 asm "suld.b.1d.b8.clamp {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.clamp [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i8 asm "suld.b.1d.b8.trap {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.trap [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i8 asm "suld.b.2d.b8.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.zero [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i8 asm "suld.b.2d.b8.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.clamp [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i8 asm "suld.b.2d.b8.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i8 asm "suld.b.3d.b8.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i8 asm "suld.b.3d.b8.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, 
+  __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp);
+  // CHECK: %8 = tail call i8 asm "suld.b.3d.b8.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.3d.b8.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap);
+
+  // CHECK: %9 = tail call i8 asm "suld.b.a1d.b8.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.zero [$0, {$2, $1}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero);
+  // CHECK: %10 = tail call i8 asm "suld.b.a1d.b8.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.clamp [$0, {$2, $1}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp);
+  // CHECK: %11 = tail call i8 asm "suld.b.a1d.b8.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.trap [$0, {$2, $1}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap);
+
+  // CHECK: %12 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero);
+  // CHECK: %13 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp);
+  // CHECK: %14 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap);
+
+  // CHECK: %15 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero);
+  // CHECK: %16 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp);
+  // CHECK: %17 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap);
+
+  // CHECK: %18 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero);
+  // CHECK: %19 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp);
+  // CHECK: %20 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap);
+}
+
+__device__ void surfchar1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) {
+  char1 val;
+
+  // CHECK: %0 = tail call i8 asm "suld.b.1d.b8.zero {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b8.zero [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero);
+  // CHECK: %1 = tail call i8 asm "suld.b.1d.b8.clamp {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b8.clamp [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp);
+  // CHECK: %2 = tail call i8 asm "suld.b.1d.b8.trap {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b8.trap [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap);
+
+  // CHECK: %3 = tail call i8 asm "suld.b.2d.b8.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.2d.b8.zero [$0, {$1, $2}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero);
+  // CHECK: %4 = tail call i8 asm "suld.b.2d.b8.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.2d.b8.clamp [$0, {$1, $2}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp);
{$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i8 asm "suld.b.3d.b8.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i8 asm "suld.b.3d.b8.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i8 asm "suld.b.3d.b8.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i8 asm "suld.b.a1d.b8.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.zero [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i8 asm "suld.b.a1d.b8.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.clamp [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i8 asm "suld.b.a1d.b8.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.trap [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, 
+  __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap);
+
+  // CHECK: %15 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero);
+  // CHECK: %16 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp);
+  // CHECK: %17 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap);
+
+  // CHECK: %18 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero);
+  // CHECK: %19 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp);
+  // CHECK: %20 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap);
+}
+
+__device__ void surfunsignedchar(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) {
+  unsigned char val;
+
+  // CHECK: %0 = tail call i8 asm "suld.b.1d.b8.zero {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b8.zero [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero);
+  // CHECK: %1 = tail call i8 asm "suld.b.1d.b8.clamp {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b8.clamp [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp);
"suld.b.1d.b8.trap {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.trap [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i8 asm "suld.b.2d.b8.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.zero [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i8 asm "suld.b.2d.b8.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.clamp [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i8 asm "suld.b.2d.b8.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i8 asm "suld.b.3d.b8.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i8 asm "suld.b.3d.b8.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i8 asm "suld.b.3d.b8.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i8 asm "suld.b.a1d.b8.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.zero [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i8 asm "suld.b.a1d.b8.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.clamp [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i8 asm "suld.b.a1d.b8.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.trap [$0, {$2, $1}], {$3};", "l,r,r,h" + 
__nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + 
// CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfuchar1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + uchar1 val; + + // CHECK: %0 = tail call i8 asm "suld.b.1d.b8.zero {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.zero [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i8 asm "suld.b.1d.b8.clamp {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.clamp [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i8 asm "suld.b.1d.b8.trap {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b8.trap [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i8 asm "suld.b.2d.b8.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.zero [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i8 asm "suld.b.2d.b8.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.clamp [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i8 asm "suld.b.2d.b8.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b8.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i8 asm "suld.b.3d.b8.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i8 asm "suld.b.3d.b8.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i8 asm "suld.b.3d.b8.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b8.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // 
+  // CHECK: %9 = tail call i8 asm "suld.b.a1d.b8.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.zero [$0, {$2, $1}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero);
+  // CHECK: %10 = tail call i8 asm "suld.b.a1d.b8.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.clamp [$0, {$2, $1}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp);
+  // CHECK: %11 = tail call i8 asm "suld.b.a1d.b8.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a1d.b8.trap [$0, {$2, $1}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap);
+
+  // CHECK: %12 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero);
+  // CHECK: %13 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp);
+  // CHECK: %14 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap);
+
+  // CHECK: %15 = tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero);
+  // CHECK: %16 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp);
+  // CHECK: %17 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap);
+
tail call i8 asm "suld.b.a2d.b8.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i8 asm "suld.b.a2d.b8.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i8 asm "suld.b.a2d.b8.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b8.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfshort(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + short val; + + // CHECK: %0 = tail call i16 asm "suld.b.1d.b16.zero {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.zero [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i16 asm "suld.b.1d.b16.clamp {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.clamp [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i16 asm "suld.b.1d.b16.trap {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.trap [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i16 asm "suld.b.2d.b16.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.zero [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i16 asm "suld.b.2d.b16.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.clamp [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i16 asm "suld.b.2d.b16.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i16 asm "suld.b.3d.b16.zero {$0}, [$1, {$2, $3, $4, $4}];", 
"=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i16 asm "suld.b.3d.b16.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i16 asm "suld.b.3d.b16.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i16 asm "suld.b.a1d.b16.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.zero [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i16 asm "suld.b.a1d.b16.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.clamp [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i16 asm "suld.b.a1d.b16.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.trap [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + 
__nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfshort1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + short1 val; + + // CHECK: %0 = tail call i16 asm "suld.b.1d.b16.zero {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.zero [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i16 asm "suld.b.1d.b16.clamp {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.clamp [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i16 asm "suld.b.1d.b16.trap {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.trap [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i16 asm 
"suld.b.2d.b16.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.zero [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i16 asm "suld.b.2d.b16.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.clamp [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i16 asm "suld.b.2d.b16.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i16 asm "suld.b.3d.b16.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i16 asm "suld.b.3d.b16.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i16 asm "suld.b.3d.b16.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i16 asm "suld.b.a1d.b16.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.zero [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i16 asm "suld.b.a1d.b16.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.clamp [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i16 asm "suld.b.a1d.b16.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.trap [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm 
sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfunsignedshort(cudaSurfaceObject_t surf, int x, int y, int z, 
+__device__ void surfunsignedshort(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) {
+  unsigned short val;
+
+  // CHECK: %0 = tail call i16 asm "suld.b.1d.b16.zero {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b16.zero [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero);
+  // CHECK: %1 = tail call i16 asm "suld.b.1d.b16.clamp {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b16.clamp [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp);
+  // CHECK: %2 = tail call i16 asm "suld.b.1d.b16.trap {$0}, [$1, {$2}];", "=h,l,r"
+  __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.1d.b16.trap [$0, {$1}], {$2};", "l,r,h"
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap);
+
+  // CHECK: %3 = tail call i16 asm "suld.b.2d.b16.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.2d.b16.zero [$0, {$1, $2}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero);
+  // CHECK: %4 = tail call i16 asm "suld.b.2d.b16.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.2d.b16.clamp [$0, {$1, $2}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp);
+  // CHECK: %5 = tail call i16 asm "suld.b.2d.b16.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.2d.b16.trap [$0, {$1, $2}], {$3};", "l,r,r,h"
+  __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap);
+
+  // CHECK: %6 = tail call i16 asm "suld.b.3d.b16.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero);
+  // CHECK: tail call void asm sideeffect "sust.b.3d.b16.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero);
+  // CHECK: %7 = tail call i16 asm "suld.b.3d.b16.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp);
+  // CHECK: tail call void asm sideeffect "sust.b.3d.b16.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp);
+  // CHECK: %8 = tail call i16 asm "suld.b.3d.b16.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r"
+  __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.3d.b16.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h"
+  __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap);
+
+  // CHECK: %9 = tail call i16 asm "suld.b.a1d.b16.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r"
+  __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero);
{$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i16 asm "suld.b.a1d.b16.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.clamp [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i16 asm "suld.b.a1d.b16.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.trap [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect 
"sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfushort1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + ushort1 val; + + // CHECK: %0 = tail call i16 asm "suld.b.1d.b16.zero {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.zero [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i16 asm "suld.b.1d.b16.clamp {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.clamp [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i16 asm "suld.b.1d.b16.trap {$0}, [$1, {$2}];", "=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b16.trap [$0, {$1}], {$2};", "l,r,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i16 asm "suld.b.2d.b16.zero {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.zero [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i16 asm "suld.b.2d.b16.clamp {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.clamp [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i16 asm "suld.b.2d.b16.trap {$0}, [$1, {$2, $3}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b16.trap [$0, {$1, $2}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i16 asm "suld.b.3d.b16.zero {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + 
__nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i16 asm "suld.b.3d.b16.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i16 asm "suld.b.3d.b16.trap {$0}, [$1, {$2, $3, $4, $4}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b16.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i16 asm "suld.b.a1d.b16.zero {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.zero [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i16 asm "suld.b.a1d.b16.clamp {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.clamp [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i16 asm "suld.b.a1d.b16.trap {$0}, [$1, {$3, $2}];", "=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b16.trap [$0, {$2, $1}], {$3};", "l,r,r,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + 
__nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i16 asm "suld.b.a2d.b16.zero {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i16 asm "suld.b.a2d.b16.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i16 asm "suld.b.a2d.b16.trap {$0}, [$1, {$4, $2, $3, $3}];", "=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b16.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfint(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + int val; + + // CHECK: %0 = tail call i32 asm "suld.b.1d.b32.zero {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.zero [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i32 asm "suld.b.1d.b32.clamp {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.clamp [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i32 asm "suld.b.1d.b32.trap {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.trap [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i32 asm "suld.b.2d.b32.zero {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.zero [$0, 
{$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i32 asm "suld.b.2d.b32.clamp {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.clamp [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i32 asm "suld.b.2d.b32.trap {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.trap [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i32 asm "suld.b.3d.b32.zero {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i32 asm "suld.b.3d.b32.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i32 asm "suld.b.3d.b32.trap {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i32 asm "suld.b.a1d.b32.zero {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.zero [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i32 asm "suld.b.a1d.b32.clamp {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.clamp [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i32 asm "suld.b.a1d.b32.trap {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.trap [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i32 
asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfint1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + int1 val; + + // CHECK: %0 = tail call i32 asm "suld.b.1d.b32.zero {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + 
// CHECK: tail call void asm sideeffect "sust.b.1d.b32.zero [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i32 asm "suld.b.1d.b32.clamp {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.clamp [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i32 asm "suld.b.1d.b32.trap {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.trap [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i32 asm "suld.b.2d.b32.zero {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.zero [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i32 asm "suld.b.2d.b32.clamp {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.clamp [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i32 asm "suld.b.2d.b32.trap {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.trap [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i32 asm "suld.b.3d.b32.zero {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i32 asm "suld.b.3d.b32.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i32 asm "suld.b.3d.b32.trap {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i32 asm "suld.b.a1d.b32.zero {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.zero [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i32 asm "suld.b.a1d.b32.clamp {$0}, [$1, {$3, $2}];", "=r,l,r,r" + 
__nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.clamp [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i32 asm "suld.b.a1d.b32.trap {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.trap [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i32 asm 
"suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfunsignedint(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + unsigned int val; + + // CHECK: %0 = tail call i32 asm "suld.b.1d.b32.zero {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.zero [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i32 asm "suld.b.1d.b32.clamp {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.clamp [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i32 asm "suld.b.1d.b32.trap {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.trap [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i32 asm "suld.b.2d.b32.zero {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.zero [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i32 asm "suld.b.2d.b32.clamp {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.clamp [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i32 asm "suld.b.2d.b32.trap {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.trap [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i32 asm "suld.b.3d.b32.zero {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i32 asm "suld.b.3d.b32.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + 
__nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i32 asm "suld.b.3d.b32.trap {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i32 asm "suld.b.a1d.b32.zero {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.zero [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i32 asm "suld.b.a1d.b32.clamp {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.clamp [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i32 asm "suld.b.a1d.b32.trap {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.trap [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + 
__nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfuint1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + uint1 val; + + // CHECK: %0 = tail call i32 asm "suld.b.1d.b32.zero {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.zero [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i32 asm "suld.b.1d.b32.clamp {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.clamp [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i32 asm "suld.b.1d.b32.trap {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.trap [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i32 asm "suld.b.2d.b32.zero {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.zero [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i32 asm "suld.b.2d.b32.clamp {$0}, [$1, {$2, $3}];", 
"=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.clamp [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i32 asm "suld.b.2d.b32.trap {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.trap [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i32 asm "suld.b.3d.b32.zero {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i32 asm "suld.b.3d.b32.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i32 asm "suld.b.3d.b32.trap {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i32 asm "suld.b.a1d.b32.zero {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.zero [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i32 asm "suld.b.a1d.b32.clamp {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.clamp [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i32 asm "suld.b.a1d.b32.trap {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.trap [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void 
asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i32 asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i32 asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i32 asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surflonglong(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + long long val; + + // CHECK: %0 = tail call i64 asm "suld.b.1d.b64.zero {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.zero [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 
= tail call i64 asm "suld.b.1d.b64.clamp {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.clamp [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i64 asm "suld.b.1d.b64.trap {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.trap [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i64 asm "suld.b.2d.b64.zero {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.zero [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i64 asm "suld.b.2d.b64.clamp {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.clamp [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i64 asm "suld.b.2d.b64.trap {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.trap [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i64 asm "suld.b.3d.b64.zero {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i64 asm "suld.b.3d.b64.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i64 asm "suld.b.3d.b64.trap {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i64 asm "suld.b.a1d.b64.zero {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.zero [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i64 asm "suld.b.a1d.b64.clamp {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.clamp [$0, {$2, $1}], {$3};", "l,r,r,l" + 
__nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i64 asm "suld.b.a1d.b64.trap {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.trap [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call 
void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surflonglong1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + longlong1 val; + + // CHECK: %0 = tail call i64 asm "suld.b.1d.b64.zero {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.zero [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i64 asm "suld.b.1d.b64.clamp {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.clamp [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i64 asm "suld.b.1d.b64.trap {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.trap [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i64 asm "suld.b.2d.b64.zero {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.zero [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i64 asm "suld.b.2d.b64.clamp {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.clamp [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i64 asm "suld.b.2d.b64.trap {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.trap [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i64 asm "suld.b.3d.b64.zero {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i64 asm "suld.b.3d.b64.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + 
__nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i64 asm "suld.b.3d.b64.trap {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i64 asm "suld.b.a1d.b64.zero {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.zero [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i64 asm "suld.b.a1d.b64.clamp {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.clamp [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i64 asm "suld.b.a1d.b64.trap {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.trap [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + 
__nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfunsignedlonglong(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + unsigned long long val; + + // CHECK: %0 = tail call i64 asm "suld.b.1d.b64.zero {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.zero [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i64 asm "suld.b.1d.b64.clamp {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.clamp [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i64 asm "suld.b.1d.b64.trap {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.trap [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i64 asm "suld.b.2d.b64.zero {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.zero [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i64 asm "suld.b.2d.b64.clamp {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.clamp [$0, {$1, $2}], {$3};", 
"l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i64 asm "suld.b.2d.b64.trap {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.trap [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i64 asm "suld.b.3d.b64.zero {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i64 asm "suld.b.3d.b64.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i64 asm "suld.b.3d.b64.trap {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i64 asm "suld.b.a1d.b64.zero {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.zero [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i64 asm "suld.b.a1d.b64.clamp {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.clamp [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i64 asm "suld.b.a1d.b64.trap {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.trap [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); 
+ // CHECK: %14 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfulonglong1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + ulonglong1 val; + + // CHECK: %0 = tail call i64 asm "suld.b.1d.b64.zero {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.zero [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call i64 asm "suld.b.1d.b64.clamp {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void 
asm sideeffect "sust.b.1d.b64.clamp [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call i64 asm "suld.b.1d.b64.trap {$0}, [$1, {$2}];", "=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b64.trap [$0, {$1}], {$2};", "l,r,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call i64 asm "suld.b.2d.b64.zero {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.zero [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call i64 asm "suld.b.2d.b64.clamp {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.clamp [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call i64 asm "suld.b.2d.b64.trap {$0}, [$1, {$2, $3}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b64.trap [$0, {$1, $2}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call i64 asm "suld.b.3d.b64.zero {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call i64 asm "suld.b.3d.b64.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call i64 asm "suld.b.3d.b64.trap {$0}, [$1, {$2, $3, $4, $4}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b64.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call i64 asm "suld.b.a1d.b64.zero {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.zero [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call i64 asm "suld.b.a1d.b64.clamp {$0}, [$1, {$3, $2}];", "=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.clamp [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call i64 asm "suld.b.a1d.b64.trap {$0}, [$1, {$3, $2}];", "=l,l,r,r" + 
__nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b64.trap [$0, {$2, $1}], {$3};", "l,r,r,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call i64 asm "suld.b.a2d.b64.zero {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call i64 asm "suld.b.a2d.b64.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, 
cudaBoundaryModeClamp); + // CHECK: %20 = tail call i64 asm "suld.b.a2d.b64.trap {$0}, [$1, {$4, $2, $3, $3}];", "=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b64.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surffloat(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + float val; + + // CHECK: %0 = tail call contract float asm "suld.b.1d.b32.zero {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.zero [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call contract float asm "suld.b.1d.b32.clamp {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.clamp [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call contract float asm "suld.b.1d.b32.trap {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.trap [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call contract float asm "suld.b.2d.b32.zero {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.zero [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call contract float asm "suld.b.2d.b32.clamp {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.clamp [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call contract float asm "suld.b.2d.b32.trap {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.trap [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call contract float asm "suld.b.3d.b32.zero {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call contract float asm "suld.b.3d.b32.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call contract float 
asm "suld.b.3d.b32.trap {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call contract float asm "suld.b.a1d.b32.zero {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.zero [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call contract float asm "suld.b.a1d.b32.clamp {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.clamp [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call contract float asm "suld.b.a1d.b32.trap {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.trap [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call contract float asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call contract float asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call contract float asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call contract float asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call contract float asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + 
__nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call contract float asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call contract float asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call contract float asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call contract float asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surffloat1(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + float1 val; + + // CHECK: %0 = tail call contract float asm "suld.b.1d.b32.zero {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.zero [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call contract float asm "suld.b.1d.b32.clamp {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.clamp [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call contract float asm "suld.b.1d.b32.trap {$0}, [$1, {$2}];", "=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.b32.trap [$0, {$1}], {$2};", "l,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call contract float asm "suld.b.2d.b32.zero {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.zero [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call contract float asm "suld.b.2d.b32.clamp {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // 
CHECK: tail call void asm sideeffect "sust.b.2d.b32.clamp [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call contract float asm "suld.b.2d.b32.trap {$0}, [$1, {$2, $3}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.b32.trap [$0, {$1, $2}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call contract float asm "suld.b.3d.b32.zero {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.zero [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call contract float asm "suld.b.3d.b32.clamp {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.clamp [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call contract float asm "suld.b.3d.b32.trap {$0}, [$1, {$2, $3, $4, $4}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.b32.trap [$0, {$1, $2, $3, $3}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call contract float asm "suld.b.a1d.b32.zero {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.zero [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call contract float asm "suld.b.a1d.b32.clamp {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.clamp [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call contract float asm "suld.b.a1d.b32.trap {$0}, [$1, {$3, $2}];", "=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.b32.trap [$0, {$2, $1}], {$3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call contract float asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call contract float asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void 
asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call contract float asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call contract float asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call contract float asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call contract float asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call contract float asm "suld.b.a2d.b32.zero {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.zero [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call contract float asm "suld.b.a2d.b32.clamp {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.clamp [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call contract float asm "suld.b.a2d.b32.trap {$0}, [$1, {$4, $2, $3, $3}];", "=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.b32.trap [$0, {$3, $1, $2, $2}], {$4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfchar2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + char2 val; + + // CHECK: %0 = tail call { i8, i8 } asm "suld.b.1d.v2.b8.zero {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b8.zero [$0, {$1}], {$2, $3};", "l,r,h,h" + 
__nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i8, i8 } asm "suld.b.1d.v2.b8.clamp {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b8.clamp [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i8, i8 } asm "suld.b.1d.v2.b8.trap {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b8.trap [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i8, i8 } asm "suld.b.2d.v2.b8.zero {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b8.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i8, i8 } asm "suld.b.2d.v2.b8.clamp {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b8.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i8, i8 } asm "suld.b.2d.v2.b8.trap {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b8.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i8, i8 } asm "suld.b.3d.v2.b8.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b8.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i8, i8 } asm "suld.b.3d.v2.b8.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b8.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i8, i8 } asm "suld.b.3d.v2.b8.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b8.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i8, i8 } asm "suld.b.a1d.v2.b8.zero {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b8.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, 
cudaBoundaryModeZero); + // CHECK: %10 = tail call { i8, i8 } asm "suld.b.a1d.v2.b8.clamp {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b8.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i8, i8 } asm "suld.b.a1d.v2.b8.trap {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b8.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + 
__nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfuchar2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + uchar2 val; + + // CHECK: %0 = tail call { i8, i8 } asm "suld.b.1d.v2.b8.zero {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b8.zero [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i8, i8 } asm "suld.b.1d.v2.b8.clamp {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b8.clamp [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i8, i8 } asm "suld.b.1d.v2.b8.trap {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b8.trap [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i8, i8 } asm "suld.b.2d.v2.b8.zero {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b8.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i8, i8 } asm "suld.b.2d.v2.b8.clamp {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b8.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i8, i8 } asm "suld.b.2d.v2.b8.trap {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b8.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, 
x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i8, i8 } asm "suld.b.3d.v2.b8.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b8.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i8, i8 } asm "suld.b.3d.v2.b8.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b8.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i8, i8 } asm "suld.b.3d.v2.b8.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b8.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i8, i8 } asm "suld.b.a1d.v2.b8.zero {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b8.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i8, i8 } asm "suld.b.a1d.v2.b8.clamp {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b8.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i8, i8 } asm "suld.b.a1d.v2.b8.trap {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b8.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: 
tail call void asm sideeffect "sust.b.a2d.v2.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i8, i8 } asm "suld.b.a2d.v2.b8.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfshort2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + short2 val; + + // CHECK: %0 = tail call { i16, i16 } asm "suld.b.1d.v2.b16.zero {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b16.zero [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i16, i16 } asm "suld.b.1d.v2.b16.clamp {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect 
"sust.b.1d.v2.b16.clamp [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i16, i16 } asm "suld.b.1d.v2.b16.trap {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b16.trap [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i16, i16 } asm "suld.b.2d.v2.b16.zero {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b16.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i16, i16 } asm "suld.b.2d.v2.b16.clamp {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b16.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i16, i16 } asm "suld.b.2d.v2.b16.trap {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b16.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i16, i16 } asm "suld.b.3d.v2.b16.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b16.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i16, i16 } asm "suld.b.3d.v2.b16.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b16.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i16, i16 } asm "suld.b.3d.v2.b16.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b16.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i16, i16 } asm "suld.b.a1d.v2.b16.zero {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b16.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i16, i16 } asm "suld.b.a1d.v2.b16.clamp {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect 
"sust.b.a1d.v2.b16.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i16, i16 } asm "suld.b.a1d.v2.b16.trap {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b16.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + 
__nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} +__device__ void surfushort2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + ushort2 val; + + // CHECK: %0 = tail call { i16, i16 } asm "suld.b.1d.v2.b16.zero {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b16.zero [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i16, i16 } asm "suld.b.1d.v2.b16.clamp {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b16.clamp [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i16, i16 } asm "suld.b.1d.v2.b16.trap {$0, $1}, [$2, {$3}];", "=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b16.trap [$0, {$1}], {$2, $3};", "l,r,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i16, i16 } asm "suld.b.2d.v2.b16.zero {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b16.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i16, i16 } asm "suld.b.2d.v2.b16.clamp {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b16.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i16, i16 } asm "suld.b.2d.v2.b16.trap {$0, $1}, [$2, {$3, $4}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b16.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i16, i16 } asm "suld.b.3d.v2.b16.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + 
__nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b16.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i16, i16 } asm "suld.b.3d.v2.b16.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b16.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i16, i16 } asm "suld.b.3d.v2.b16.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b16.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i16, i16 } asm "suld.b.a1d.v2.b16.zero {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b16.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i16, i16 } asm "suld.b.a1d.v2.b16.clamp {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b16.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i16, i16 } asm "suld.b.a1d.v2.b16.trap {$0, $1}, [$2, {$4, $3}];", "=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b16.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + 
__nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i16, i16 } asm "suld.b.a2d.v2.b16.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfint2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + int2 val; + + // CHECK: %0 = tail call { i32, i32 } asm "suld.b.1d.v2.b32.zero {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.zero [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i32, i32 } asm "suld.b.1d.v2.b32.clamp {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.clamp [$0, {$1}], {$2, $3};", "l,r,r,r" + 
__nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i32, i32 } asm "suld.b.1d.v2.b32.trap {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.trap [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i32, i32 } asm "suld.b.2d.v2.b32.zero {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i32, i32 } asm "suld.b.2d.v2.b32.clamp {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i32, i32 } asm "suld.b.2d.v2.b32.trap {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i32, i32 } asm "suld.b.3d.v2.b32.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i32, i32 } asm "suld.b.3d.v2.b32.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i32, i32 } asm "suld.b.3d.v2.b32.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i32, i32 } asm "suld.b.a1d.v2.b32.zero {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i32, i32 } asm "suld.b.a1d.v2.b32.clamp {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + 
__nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i32, i32 } asm "suld.b.a1d.v2.b32.trap {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // 
CHECK: %19 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfuint2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + uint2 val; + + // CHECK: %0 = tail call { i32, i32 } asm "suld.b.1d.v2.b32.zero {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.zero [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i32, i32 } asm "suld.b.1d.v2.b32.clamp {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.clamp [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i32, i32 } asm "suld.b.1d.v2.b32.trap {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.trap [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i32, i32 } asm "suld.b.2d.v2.b32.zero {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i32, i32 } asm "suld.b.2d.v2.b32.clamp {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i32, i32 } asm "suld.b.2d.v2.b32.trap {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i32, i32 } asm "suld.b.3d.v2.b32.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.zero 
[$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i32, i32 } asm "suld.b.3d.v2.b32.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i32, i32 } asm "suld.b.3d.v2.b32.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i32, i32 } asm "suld.b.a1d.v2.b32.zero {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i32, i32 } asm "suld.b.a1d.v2.b32.clamp {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i32, i32 } asm "suld.b.a1d.v2.b32.trap {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i32, i32 } asm 
"suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i32, i32 } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surflonglong2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + longlong2 val; + + // CHECK: %0 = tail call { i64, i64 } asm "suld.b.1d.v2.b64.zero {$0, $1}, [$2, {$3}];", "=l,=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b64.zero [$0, {$1}], {$2, $3};", "l,r,l,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i64, i64 } asm "suld.b.1d.v2.b64.clamp {$0, $1}, [$2, {$3}];", "=l,=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b64.clamp [$0, {$1}], {$2, $3};", "l,r,l,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i64, i64 } asm "suld.b.1d.v2.b64.trap {$0, $1}, [$2, 
{$3}];", "=l,=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b64.trap [$0, {$1}], {$2, $3};", "l,r,l,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i64, i64 } asm "suld.b.2d.v2.b64.zero {$0, $1}, [$2, {$3, $4}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b64.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i64, i64 } asm "suld.b.2d.v2.b64.clamp {$0, $1}, [$2, {$3, $4}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b64.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i64, i64 } asm "suld.b.2d.v2.b64.trap {$0, $1}, [$2, {$3, $4}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b64.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i64, i64 } asm "suld.b.3d.v2.b64.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b64.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i64, i64 } asm "suld.b.3d.v2.b64.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b64.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i64, i64 } asm "suld.b.3d.v2.b64.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b64.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i64, i64 } asm "suld.b.a1d.v2.b64.zero {$0, $1}, [$2, {$4, $3}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b64.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i64, i64 } asm "suld.b.a1d.v2.b64.clamp {$0, $1}, [$2, {$4, $3}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b64.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i64, i64 } asm "suld.b.a1d.v2.b64.trap 
{$0, $1}, [$2, {$4, $3}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b64.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + 
__nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfulonglong2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + ulonglong2 val; + + // CHECK: %0 = tail call { i64, i64 } asm "suld.b.1d.v2.b64.zero {$0, $1}, [$2, {$3}];", "=l,=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b64.zero [$0, {$1}], {$2, $3};", "l,r,l,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i64, i64 } asm "suld.b.1d.v2.b64.clamp {$0, $1}, [$2, {$3}];", "=l,=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b64.clamp [$0, {$1}], {$2, $3};", "l,r,l,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i64, i64 } asm "suld.b.1d.v2.b64.trap {$0, $1}, [$2, {$3}];", "=l,=l,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b64.trap [$0, {$1}], {$2, $3};", "l,r,l,l" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i64, i64 } asm "suld.b.2d.v2.b64.zero {$0, $1}, [$2, {$3, $4}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b64.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i64, i64 } asm "suld.b.2d.v2.b64.clamp {$0, $1}, [$2, {$3, $4}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b64.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i64, i64 } asm "suld.b.2d.v2.b64.trap {$0, $1}, [$2, {$3, $4}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b64.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i64, i64 } asm "suld.b.3d.v2.b64.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b64.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, 
y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i64, i64 } asm "suld.b.3d.v2.b64.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b64.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i64, i64 } asm "suld.b.3d.v2.b64.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b64.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i64, i64 } asm "suld.b.a1d.v2.b64.zero {$0, $1}, [$2, {$4, $3}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b64.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i64, i64 } asm "suld.b.a1d.v2.b64.clamp {$0, $1}, [$2, {$4, $3}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b64.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i64, i64 } asm "suld.b.a1d.v2.b64.trap {$0, $1}, [$2, {$4, $3}];", "=l,=l,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b64.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,l,l" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, 
surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i64, i64 } asm "suld.b.a2d.v2.b64.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=l,=l,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b64.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,l,l" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surffloat2(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + float2 val; + + // CHECK: %0 = tail call contract { float, float } asm "suld.b.1d.v2.b32.zero {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.zero [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call contract { float, float } asm "suld.b.1d.v2.b32.clamp {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.clamp [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call contract { float, float } asm "suld.b.1d.v2.b32.trap {$0, $1}, [$2, {$3}];", "=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, 
cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v2.b32.trap [$0, {$1}], {$2, $3};", "l,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call contract { float, float } asm "suld.b.2d.v2.b32.zero {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.zero [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call contract { float, float } asm "suld.b.2d.v2.b32.clamp {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.clamp [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call contract { float, float } asm "suld.b.2d.v2.b32.trap {$0, $1}, [$2, {$3, $4}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v2.b32.trap [$0, {$1, $2}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call contract { float, float } asm "suld.b.3d.v2.b32.zero {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.zero [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call contract { float, float } asm "suld.b.3d.v2.b32.clamp {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.clamp [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call contract { float, float } asm "suld.b.3d.v2.b32.trap {$0, $1}, [$2, {$3, $4, $5, $5}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v2.b32.trap [$0, {$1, $2, $3, $3}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call contract { float, float } asm "suld.b.a1d.v2.b32.zero {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.zero [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call contract { float, float } asm "suld.b.a1d.v2.b32.clamp {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.clamp [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call contract { 
float, float } asm "suld.b.a1d.v2.b32.trap {$0, $1}, [$2, {$4, $3}];", "=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v2.b32.trap [$0, {$2, $1}], {$3, $4};", "l,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.zero {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call contract 
{ float, float } asm "suld.b.a2d.v2.b32.clamp {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call contract { float, float } asm "suld.b.a2d.v2.b32.trap {$0, $1}, [$2, {$5, $3, $4, $4}];", "=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v2.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfchar4(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + char4 val; + + // CHECK: %0 = tail call { i8, i8, i8, i8 } asm "suld.b.1d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b8.zero [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i8, i8, i8, i8 } asm "suld.b.1d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b8.clamp [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i8, i8, i8, i8 } asm "suld.b.1d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b8.trap [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i8, i8, i8, i8 } asm "suld.b.2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b8.zero [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i8, i8, i8, i8 } asm "suld.b.2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b8.clamp [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i8, i8, i8, i8 } asm "suld.b.2d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b8.trap [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i8, i8, i8, i8 } asm "suld.b.3d.v4.b8.zero {$0, $1, $2, $3}, [$4, 
{$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b8.zero [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i8, i8, i8, i8 } asm "suld.b.3d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b8.clamp [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i8, i8, i8, i8 } asm "suld.b.3d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b8.trap [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i8, i8, i8, i8 } asm "suld.b.a1d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b8.zero [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i8, i8, i8, i8 } asm "suld.b.a1d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b8.clamp [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i8, i8, i8, i8 } asm "suld.b.a1d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b8.trap [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.trap {$0, $1, 
$2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfuchar4(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + uchar4 val; + + // CHECK: %0 = tail call { i8, i8, i8, i8 } asm "suld.b.1d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: 
tail call void asm sideeffect "sust.b.1d.v4.b8.zero [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i8, i8, i8, i8 } asm "suld.b.1d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b8.clamp [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i8, i8, i8, i8 } asm "suld.b.1d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b8.trap [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i8, i8, i8, i8 } asm "suld.b.2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b8.zero [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i8, i8, i8, i8 } asm "suld.b.2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b8.clamp [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i8, i8, i8, i8 } asm "suld.b.2d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b8.trap [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i8, i8, i8, i8 } asm "suld.b.3d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b8.zero [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i8, i8, i8, i8 } asm "suld.b.3d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b8.clamp [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i8, i8, i8, i8 } asm "suld.b.3d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b8.trap [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = 
tail call { i8, i8, i8, i8 } asm "suld.b.a1d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b8.zero [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i8, i8, i8, i8 } asm "suld.b.a1d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b8.clamp [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i8, i8, i8, i8 } asm "suld.b.a1d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b8.trap [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + 
__nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i8, i8, i8, i8 } asm "suld.b.a2d.v4.b8.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b8.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfshort4(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + short4 val; + + // CHECK: %0 = tail call { i16, i16, i16, i16 } asm "suld.b.1d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b16.zero [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i16, i16, i16, i16 } asm "suld.b.1d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b16.clamp [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i16, i16, i16, i16 } asm "suld.b.1d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b16.trap [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i16, i16, i16, i16 } asm "suld.b.2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, 
cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b16.zero [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i16, i16, i16, i16 } asm "suld.b.2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b16.clamp [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i16, i16, i16, i16 } asm "suld.b.2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b16.trap [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i16, i16, i16, i16 } asm "suld.b.3d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b16.zero [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i16, i16, i16, i16 } asm "suld.b.3d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b16.clamp [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i16, i16, i16, i16 } asm "suld.b.3d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b16.trap [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i16, i16, i16, i16 } asm "suld.b.a1d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b16.zero [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i16, i16, i16, i16 } asm "suld.b.a1d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b16.clamp [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i16, i16, i16, i16 } asm "suld.b.a1d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm 
sideeffect "sust.b.a1d.v4.b16.trap [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i16, i16, i16, i16 } asm 
"suld.b.a2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfushort4(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + ushort4 val; + + // CHECK: %0 = tail call { i16, i16, i16, i16 } asm "suld.b.1d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b16.zero [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i16, i16, i16, i16 } asm "suld.b.1d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b16.clamp [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i16, i16, i16, i16 } asm "suld.b.1d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$5}];", "=h,=h,=h,=h,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b16.trap [$0, {$1}], {$2, $3, $4, $5};", "l,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i16, i16, i16, i16 } asm "suld.b.2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b16.zero [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i16, i16, i16, i16 } asm "suld.b.2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b16.clamp [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i16, i16, i16, i16 } asm "suld.b.2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$5, $6}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b16.trap [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { 
i16, i16, i16, i16 } asm "suld.b.3d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b16.zero [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i16, i16, i16, i16 } asm "suld.b.3d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b16.clamp [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i16, i16, i16, i16 } asm "suld.b.3d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b16.trap [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i16, i16, i16, i16 } asm "suld.b.a1d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b16.zero [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i16, i16, i16, i16 } asm "suld.b.a1d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b16.clamp [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i16, i16, i16, i16 } asm "suld.b.a1d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$6, $5}];", "=h,=h,=h,=h,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b16.trap [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, 
layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i16, i16, i16, i16 } asm "suld.b.a2d.v4.b16.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=h,=h,=h,=h,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b16.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,h,h,h,h" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfint4(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + int4 val; + + // CHECK: %0 = tail call { i32, i32, i32, i32 } asm 
"suld.b.1d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.zero [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i32, i32, i32, i32 } asm "suld.b.1d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.clamp [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i32, i32, i32, i32 } asm "suld.b.1d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.trap [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i32, i32, i32, i32 } asm "suld.b.2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.zero [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i32, i32, i32, i32 } asm "suld.b.2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.clamp [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i32, i32, i32, i32 } asm "suld.b.2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.trap [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i32, i32, i32, i32 } asm "suld.b.3d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.zero [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i32, i32, i32, i32 } asm "suld.b.3d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.clamp [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i32, i32, i32, i32 } asm "suld.b.3d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call 
void asm sideeffect "sust.b.3d.v4.b32.trap [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i32, i32, i32, i32 } asm "suld.b.a1d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.zero [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i32, i32, i32, i32 } asm "suld.b.a1d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.clamp [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { i32, i32, i32, i32 } asm "suld.b.a1d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.trap [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + 
__nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surfuint4(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + uint4 val; + + // CHECK: %0 = tail call { i32, i32, i32, i32 } asm "suld.b.1d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.zero [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call { i32, i32, i32, i32 } asm "suld.b.1d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.clamp [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call { i32, i32, i32, i32 } asm "suld.b.1d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.trap [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, 
surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call { i32, i32, i32, i32 } asm "suld.b.2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.zero [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call { i32, i32, i32, i32 } asm "suld.b.2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.clamp [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call { i32, i32, i32, i32 } asm "suld.b.2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.trap [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call { i32, i32, i32, i32 } asm "suld.b.3d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.zero [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call { i32, i32, i32, i32 } asm "suld.b.3d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.clamp [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call { i32, i32, i32, i32 } asm "suld.b.3d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.trap [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call { i32, i32, i32, i32 } asm "suld.b.a1d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.zero [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call { i32, i32, i32, i32 } asm "suld.b.a1d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.clamp [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call { 
i32, i32, i32, i32 } asm "suld.b.a1d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.trap [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: %13 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, 
$2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call { i32, i32, i32, i32 } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap); +} + +__device__ void surffloat4(cudaSurfaceObject_t surf, int x, int y, int z, int layer, int face, int layerface) { + float4 val; + + // CHECK: %0 = tail call contract { float, float, float, float } asm "suld.b.1d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.zero [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeZero); + // CHECK: %1 = tail call contract { float, float, float, float } asm "suld.b.1d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.clamp [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeClamp); + // CHECK: %2 = tail call contract { float, float, float, float } asm "suld.b.1d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5}];", "=r,=r,=r,=r,l,r" + __nv_tex_surf_handler("__isurf1Dread", &val, surf, x, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.1d.v4.b32.trap [$0, {$1}], {$2, $3, $4, $5};", "l,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, surf, x, cudaBoundaryModeTrap); + + // CHECK: %3 = tail call contract { float, float, float, float } asm "suld.b.2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.zero [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeZero); + // CHECK: %4 = tail call contract { float, float, float, float } asm "suld.b.2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.clamp [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeClamp); + // CHECK: %5 = tail call contract { float, float, float, float } asm "suld.b.2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5, $6}];", "=r,=r,=r,=r,l,r,r" + 
__nv_tex_surf_handler("__isurf2Dread", &val, surf, x, y, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.2d.v4.b32.trap [$0, {$1, $2}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, surf, x, y, cudaBoundaryModeTrap); + + // CHECK: %6 = tail call contract { float, float, float, float } asm "suld.b.3d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.zero [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeZero); + // CHECK: %7 = tail call contract { float, float, float, float } asm "suld.b.3d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.clamp [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeClamp); + // CHECK: %8 = tail call contract { float, float, float, float } asm "suld.b.3d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$5, $6, $7, $7}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf3Dread", &val, surf, x, y, z, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.3d.v4.b32.trap [$0, {$1, $2, $3, $3}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, surf, x, y, z, cudaBoundaryModeTrap); + + // CHECK: %9 = tail call contract { float, float, float, float } asm "suld.b.a1d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.zero [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeZero); + // CHECK: %10 = tail call contract { float, float, float, float } asm "suld.b.a1d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.clamp [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeClamp); + // CHECK: %11 = tail call contract { float, float, float, float } asm "suld.b.a1d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$6, $5}];", "=r,=r,=r,=r,l,r,r" + __nv_tex_surf_handler("__isurf1DLayeredread", &val, surf, x, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a1d.v4.b32.trap [$0, {$2, $1}], {$3, $4, $5, $6};", "l,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, surf, x, layer, cudaBoundaryModeTrap); + + // CHECK: %12 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeZero); + // 
CHECK: %13 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeClamp); + // CHECK: %14 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredread", &val, surf, x, y, layer, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, surf, x, y, layer, cudaBoundaryModeTrap); + + // CHECK: %15 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeZero); + // CHECK: %16 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeClamp); + // CHECK: %17 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapread", &val, surf, x, y, face, cudaBoundaryModeTrap); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, surf, x, y, face, cudaBoundaryModeTrap); + + // CHECK: %18 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.zero {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.zero [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeZero); + // CHECK: %19 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.clamp {$0, $1, $2, $3}, [$4, {$7, $5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.clamp [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r" + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeClamp); + // CHECK: %20 = tail call contract { float, float, float, float } asm "suld.b.a2d.v4.b32.trap {$0, $1, $2, $3}, [$4, {$7, 
$5, $6, $6}];", "=r,=r,=r,=r,l,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", &val, surf, x, y, layerface, cudaBoundaryModeTrap);
+  // CHECK: tail call void asm sideeffect "sust.b.a2d.v4.b32.trap [$0, {$3, $1, $2, $2}], {$4, $5, $6, $7};", "l,r,r,r,r,r,r,r"
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, surf, x, y, layerface, cudaBoundaryModeTrap);
+}
diff --git a/clang/test/Headers/Inputs/include/cuda.h b/clang/test/Headers/Inputs/include/cuda.h
index d84029759c165..40a00b5af295a 100644
--- a/clang/test/Headers/Inputs/include/cuda.h
+++ b/clang/test/Headers/Inputs/include/cuda.h
@@ -25,6 +25,10 @@ __device__ void *operator new[](__SIZE_TYPE__, void *p) { return p; }
 
 #define CUDA_VERSION 10100
 
+struct char1 {
+  char x;
+  __host__ __device__ char1(char x = 0) : x(x) {}
+};
 struct char2 {
   char x, y;
   __host__ __device__ char2(char x = 0, char y = 0) : x(x), y(y) {}
@@ -34,6 +38,10 @@ struct char4 {
   __host__ __device__ char4(char x = 0, char y = 0, char z = 0, char w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct uchar1 {
+  unsigned char x;
+  __host__ __device__ uchar1(unsigned char x = 0) : x(x) {}
+};
 struct uchar2 {
   unsigned char x, y;
   __host__ __device__ uchar2(unsigned char x = 0, unsigned char y = 0) : x(x), y(y) {}
@@ -43,6 +51,10 @@ struct uchar4 {
   __host__ __device__ uchar4(unsigned char x = 0, unsigned char y = 0, unsigned char z = 0, unsigned char w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct short1 {
+  short x;
+  __host__ __device__ short1(short x = 0) : x(x) {}
+};
 struct short2 {
   short x, y;
   __host__ __device__ short2(short x = 0, short y = 0) : x(x), y(y) {}
@@ -52,6 +64,10 @@ struct short4 {
   __host__ __device__ short4(short x = 0, short y = 0, short z = 0, short w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct ushort1 {
+  unsigned short x;
+  __host__ __device__ ushort1(unsigned short x = 0) : x(x) {}
+};
 struct ushort2 {
   unsigned short x, y;
   __host__ __device__ ushort2(unsigned short x = 0, unsigned short y = 0) : x(x), y(y) {}
@@ -61,6 +77,10 @@ struct ushort4 {
   __host__ __device__ ushort4(unsigned short x = 0, unsigned short y = 0, unsigned short z = 0, unsigned short w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct int1 {
+  int x;
+  __host__ __device__ int1(int x = 0) : x(x) {}
+};
 struct int2 {
   int x, y;
   __host__ __device__ int2(int x = 0, int y = 0) : x(x), y(y) {}
@@ -70,6 +90,10 @@ struct int4 {
   __host__ __device__ int4(int x = 0, int y = 0, int z = 0, int w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct uint1 {
+  unsigned x;
+  __host__ __device__ uint1(unsigned x = 0) : x(x) {}
+};
 struct uint2 {
   unsigned x, y;
   __host__ __device__ uint2(unsigned x = 0, unsigned y = 0) : x(x), y(y) {}
@@ -83,6 +107,10 @@ struct uint4 {
   __host__ __device__ uint4(unsigned x = 0, unsigned y = 0, unsigned z = 0, unsigned w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct longlong1 {
+  long long x;
+  __host__ __device__ longlong1(long long x = 0) : x(x) {}
+};
 struct longlong2 {
   long long x, y;
   __host__ __device__ longlong2(long long x = 0, long long y = 0) : x(x), y(y) {}
@@ -92,6 +120,10 @@ struct longlong4 {
   __host__ __device__ longlong4(long long x = 0, long long y = 0, long long z = 0, long long w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct ulonglong1 {
+  unsigned long long x;
+  __host__ __device__ ulonglong1(unsigned long long x = 0) : x(x) {}
+};
 struct ulonglong2 {
   unsigned long long x, y;
   __host__ __device__ ulonglong2(unsigned long long x = 0, unsigned long long y = 0) : x(x), y(y) {}
@@ -101,6 +133,10 @@ struct ulonglong4 {
   __host__ __device__ ulonglong4(unsigned long long x = 0, unsigned long long y = 0, unsigned long long z = 0, unsigned long long w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct float1 {
+  float x;
+  __host__ __device__ float1(float x = 0) : x(x) {}
+};
 struct float2 {
   float x, y;
   __host__ __device__ float2(float x = 0, float y = 0) : x(x), y(y) {}
@@ -110,6 +146,10 @@ struct float4 {
   __host__ __device__ float4(float x = 0, float y = 0, float z = 0, float w = 0) : x(x), y(y), z(z), w(w) {}
 };
 
+struct double1 {
+  double x;
+  __host__ __device__ double1(double x = 0) : x(x) {}
+};
 struct double2 {
   double x, y;
   __host__ __device__ double2(double x = 0, double y = 0) : x(x), y(y) {}
@@ -120,12 +160,19 @@ struct double4 {
 };
 
 typedef unsigned long long cudaTextureObject_t;
+typedef unsigned long long cudaSurfaceObject_t;
 
 enum cudaTextureReadMode {
   cudaReadModeNormalizedFloat,
   cudaReadModeElementType
 };
 
+enum cudaSurfaceBoundaryMode {
+  cudaBoundaryModeZero,
+  cudaBoundaryModeClamp,
+  cudaBoundaryModeTrap
+};
+
 enum {
   cudaTextureType1D,
   cudaTextureType2D,
diff --git a/clang/test/Headers/Inputs/include/surface_indirect_functions.h b/clang/test/Headers/Inputs/include/surface_indirect_functions.h
new file mode 100644
index 0000000000000..bffa775cb2822
--- /dev/null
+++ b/clang/test/Headers/Inputs/include/surface_indirect_functions.h
@@ -0,0 +1,2 @@
+// required for __clang_cuda_runtime_wrapper.h tests
+#pragma once

From 0bec0f5c059af5f920fe22ecda469b666b5971b0 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Thu, 3 Apr 2025 13:21:22 -0400
Subject: [PATCH 0560/1029] [SLP] Initial support for (masked) loads + compress
 and (masked) interleaved loads

Added initial support for (masked) loads + compress and (masked)
interleaved loads.

Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/132099
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 359 ++++++++++++++++--
 .../X86/entries-shuffled-diff-sizes.ll        |  17 +-
 .../X86/gep-nodes-with-non-gep-inst.ll        |  22 +-
 .../Transforms/SLPVectorizer/X86/pr47623.ll   |  16 +-
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 162 +++-----
 .../Transforms/SLPVectorizer/X86/pr47629.ll   | 162 +++-----
 .../X86/remark_gather-load-redux-cost.ll      |  11 +-
 .../X86/reorder-possible-strided-node.ll      |  52 +--
 .../X86/reorder-reused-masked-gather.ll       |  12 +-
 .../X86/reorder-reused-masked-gather2.ll      |  11 +-
 .../X86/scatter-vectorize-reused-pointer.ll   |  12 +-
 .../Transforms/SLPVectorizer/X86/sin-sqrt.ll  |   8 +-
 .../SLPVectorizer/X86/split-load8_2-unord.ll  |  11 +-
 .../X86/split-load8_2_unord_geps.ll           |  11 +-
 14 files changed, 518 insertions(+), 348 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b82a66ca3b889..a115fec47aeec 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -1380,7 +1381,8 @@ class BoUpSLP {
     Gather,
     Vectorize,
     ScatterVectorize,
-    StridedVectorize
+    StridedVectorize,
+    CompressVectorize
   };
 
   using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
     Vectorize,         ///< The node is regularly vectorized.
     ScatterVectorize,  ///< Masked scatter/gather node.
     StridedVectorize,  ///< Strided loads (and stores)
+    CompressVectorize, ///< (Masked) load with compress.
     NeedToGather,      ///< Gather/buildvector node.
     CombinedVectorize, ///< Vectorized node, combined with its user into more
                        ///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
       case StridedVectorize:
         dbgs() << "StridedVectorize\n";
         break;
+      case CompressVectorize:
+        dbgs() << "CompressVectorize\n";
+        break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
@@ -4819,7 +4825,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
     if (Entry->isGather())
       return "color=red";
     if (Entry->State == TreeEntry::ScatterVectorize ||
-        Entry->State == TreeEntry::StridedVectorize)
+        Entry->State == TreeEntry::StridedVectorize ||
+        Entry->State == TreeEntry::CompressVectorize)
       return "color=blue";
     return "";
   }
@@ -5419,6 +5426,157 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
   return Builder.CreateShuffleVector(Vec, Mask);
 }
 
+/// Builds a compress-like mask for shuffles for the given \p PointerOps,
+/// ordered with \p Order.
+/// \return true if the mask represents strided access, false otherwise.
+static bool buildCompressMask(ArrayRef<Value *> PointerOps,
+                              ArrayRef<unsigned> Order, Type *ScalarTy,
+                              const DataLayout &DL, ScalarEvolution &SE,
+                              SmallVectorImpl<int> &CompressMask) {
+  const unsigned Sz = PointerOps.size();
+  CompressMask.assign(Sz, PoisonMaskElem);
+  // The first element is always set.
+  CompressMask[0] = 0;
+  // Check if the mask represents strided access.
+  std::optional<unsigned> Stride = 0;
+  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+  for (unsigned I : seq<unsigned>(1, Sz)) {
+    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+    unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+    CompressMask[I] = Pos;
+    if (!Stride)
+      continue;
+    if (*Stride == 0) {
+      *Stride = Pos;
+      continue;
+    }
+    if (Pos != *Stride * I)
+      Stride.reset();
+  }
+  return Stride.has_value();
+}
+
+/// Checks if the \p VL can be transformed to a (masked) load + compress or
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+    const DominatorTree &DT, const TargetLibraryInfo &TLI,
+    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+    VectorType *&LoadVecTy) {
+  InterleaveFactor = 0;
+  Type *ScalarTy = VL.front()->getType();
+  const unsigned Sz = VL.size();
+  auto *VecTy = getWidenedType(ScalarTy, Sz);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // Check external uses.
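In other words, buildCompressMask reduces to: the compress mask is each pointer's element distance from the first pointer, and the access is strided exactly when those distances form the progression 0, s, 2s, ... A standalone recap of that stride detection on plain offsets; buildMask is a hypothetical stand-in, not the patch's API:

#include <cstdio>
#include <optional>
#include <vector>

static bool buildMask(const std::vector<int> &Offsets,
                      std::vector<int> &Mask) {
  Mask = Offsets; // CompressMask[I] = distance of pointer I from Ptr0.
  std::optional<int> Stride = 0;
  for (size_t I = 1; I < Offsets.size() && Stride; ++I) {
    if (*Stride == 0)
      *Stride = Offsets[I];           // The first gap fixes the stride.
    else if (Offsets[I] != *Stride * (int)I)
      Stride.reset();                 // Gap pattern broken: not strided.
  }
  return Stride.has_value();
}

int main() {
  std::vector<int> Mask;
  printf("%d\n", buildMask({0, 2, 4, 6}, Mask)); // 1: stride 2
  printf("%d\n", buildMask({0, 1, 5, 6}, Mask)); // 0: irregular gaps
}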
+  for (const auto [I, V] : enumerate(VL)) {
+    if (AreAllUsersVectorized(V))
+      continue;
+    InstructionCost ExtractCost =
+        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+    InstructionCost ScalarCost =
+        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+    if (ExtractCost <= ScalarCost)
+      return false;
+  }
+  Value *Ptr0;
+  Value *PtrN;
+  if (Order.empty()) {
+    Ptr0 = PointerOps.front();
+    PtrN = PointerOps.back();
+  } else {
+    Ptr0 = PointerOps[Order.front()];
+    PtrN = PointerOps[Order.back()];
+  }
+  std::optional<int> Diff =
+      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+  if (!Diff)
+    return false;
+  const unsigned MaxRegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+  // Check for very large distances between elements.
+  if (*Diff / Sz >= MaxRegSize / 8)
+    return false;
+  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+  IsMasked = !isSafeToLoadUnconditionally(
+      Ptr0, LoadVecTy, CommonAlignment, DL,
+      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+      &TLI);
+  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
+                                         LI->getPointerAddressSpace()))
+    return false;
+  // TODO: perform the analysis of each scalar load for better
+  // safe-load-unconditionally analysis.
+  bool IsStrided =
+      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+  assert(CompressMask.size() >= 2 && "At least two elements are required");
+  auto [ScalarGEPCost, VectorGEPCost] =
+      getGEPCosts(TTI, PointerOps, PointerOps.front(),
+                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+  // The cost of scalar loads.
+  InstructionCost ScalarLoadsCost =
+      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+                      [&](InstructionCost C, Value *V) {
+                        return C + TTI.getInstructionCost(cast<Instruction>(V),
+                                                          CostKind);
+                      }) +
+      ScalarGEPCost;
+  APInt DemandedElts = APInt::getAllOnes(Sz);
+  InstructionCost GatherCost =
+      getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true,
+                               /*Extract=*/false, CostKind) +
+      ScalarLoadsCost;
+  InstructionCost LoadCost = 0;
+  if (IsMasked) {
+    LoadCost =
+        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                                  LI->getPointerAddressSpace(), CostKind);
+  } else {
+    CommonAlignment = LI->getAlign();
+    LoadCost =
+        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                            LI->getPointerAddressSpace(), CostKind);
+  }
+  SmallVector<int> Mask;
+  if (!Order.empty())
+    inversePermutation(Order, Mask);
+  if (IsStrided) {
+    // Check for potential segmented (interleaved) loads.
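What follows is a straight cost race: scalar loads plus lane inserts on the gather side versus one wide (masked) load plus the compress shuffle on the other. With made-up unit costs, purely for illustration (the real numbers come from the TTI hooks used above):

#include <cstdio>

int main() {
  // Hypothetical target: scalar load = 1, inserting 4 lanes = 4,
  // masked <7 x i32> load = 2, one-source shuffle = 1.
  int ScalarLoads = 4 * 1, InsertLanes = 4;
  int GatherCost = ScalarLoads + InsertLanes;      // 8: scalarized gather
  int MaskedLoad = 2, CompressShuffle = 1;
  int CompressCost = MaskedLoad + CompressShuffle; // 3: wide load + shuffle
  printf("gather=%d compress=%d -> %s\n", GatherCost, CompressCost,
         CompressCost < GatherCost ? "CompressVectorize" : "Gather");
}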
+    if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
+                                         CommonAlignment,
+                                         LI->getPointerAddressSpace())) {
+      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+          Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
+          CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
+      if (!Mask.empty())
+        InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+                                            VecTy, Mask, CostKind);
+      if (InterleavedCost < GatherCost) {
+        InterleaveFactor = CompressMask[1];
+        return true;
+      }
+    }
+  }
+  if (!Order.empty()) {
+    SmallVector<int> NewMask(Sz, PoisonMaskElem);
+    for (unsigned I : seq<unsigned>(Sz)) {
+      NewMask[I] = CompressMask[Mask[I]];
+    }
+    CompressMask.swap(NewMask);
+  }
+  InstructionCost CompressCost = ::getShuffleCost(
+      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+  return TotalVecCost < GatherCost;
+}
+
 BoUpSLP::LoadsState
 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                            SmallVectorImpl<unsigned> &Order,
@@ -5490,9 +5648,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   // Check that the sorted loads are consecutive.
   if (static_cast<unsigned>(*Diff) == Sz - 1)
     return LoadsState::Vectorize;
-  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
-      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
-    return LoadsState::Gather;
   // Simple check if not a strided access - clear order.
   bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
   // Try to generate strided load node if:
@@ -5548,7 +5703,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       }
     }
   }
+  bool IsMasked;
+  unsigned InterleaveFactor;
+  SmallVector<int> CompressMask;
+  VectorType *LoadVecTy;
+  if (isMaskedLoadCompress(
+          VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+          [&](Value *V) {
+            return areAllUsersVectorized(cast<Instruction>(V),
+                                         UserIgnoreList);
+          },
+          IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+    return LoadsState::CompressVectorize;
   }
+  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+    return LoadsState::Gather;
   // Correctly identify compare the cost of loads + shuffles rather than
   // strided/masked gather loads. Returns true if vectorized + shuffles
   // representation is better than just gather.
@@ -5641,7 +5811,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   }
   // If need the reorder - consider as high-cost masked gather for now.
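One detail worth restating: when the pointers had to be sorted first, the compress mask computed in sorted order is composed with the inverse permutation, so a single shuffle both compresses and restores source order. A standalone recap of the NewMask[I] = CompressMask[Mask[I]] step above, with a hypothetical order:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Order = {1, 0, 3, 2};        // hypothetical sort order
  std::vector<int> CompressMask = {0, 2, 4, 6}; // per sorted element
  std::vector<int> Inv(Order.size()), NewMask(Order.size());
  for (size_t I = 0; I < Order.size(); ++I)
    Inv[Order[I]] = (int)I;                     // inversePermutation(Order)
  for (size_t I = 0; I < NewMask.size(); ++I)
    NewMask[I] = CompressMask[Inv[I]];          // compose the two shuffles
  for (int M : NewMask)
    printf("%d ", M);                           // prints: 2 0 6 4
}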
       if ((LS == LoadsState::Vectorize ||
-           LS == LoadsState::StridedVectorize) &&
+           LS == LoadsState::StridedVectorize ||
+           LS == LoadsState::CompressVectorize) &&
           !Order.empty() && !isReverseOrder(Order))
         LS = LoadsState::ScatterVectorize;
       States.push_back(LS);
@@ -5706,6 +5877,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                        CommonAlignment, CostKind) +
                         VectorGEPCost;
           break;
+        case LoadsState::CompressVectorize:
+          VecLdCost += TTI.getMaskedMemoryOpCost(
+                           Instruction::Load, SubVecTy, CommonAlignment,
+                           LI0->getPointerAddressSpace(), CostKind) +
+                       VectorGEPCost +
+                       ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+                                        {}, CostKind);
+          break;
         case LoadsState::ScatterVectorize:
           VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                   LI0->getPointerOperand(),
@@ -6079,7 +6258,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
     return std::nullopt;
   if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
-       TE.State == TreeEntry::StridedVectorize) &&
+       TE.State == TreeEntry::StridedVectorize ||
+       TE.State == TreeEntry::CompressVectorize) &&
       (isa(TE.getMainOp()) ||
       (TopToBottom && isa(TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6266,7 +6446,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
       OrdersType CurrentOrder;
       LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                          CurrentOrder, PointerOps);
-      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+          Res == LoadsState::CompressVectorize)
         return std::move(CurrentOrder);
     }
   // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6506,7 +6687,8 @@ void BoUpSLP::reorderTopToBottom() {
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize) ||
+            TE->State == TreeEntry::SplitVectorize ||
+            TE->State == TreeEntry::CompressVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
       if (TE->State == TreeEntry::Vectorize &&
@@ -6680,7 +6862,8 @@ void BoUpSLP::reorderTopToBottom() {
       if ((TE->State == TreeEntry::SplitVectorize &&
            TE->ReuseShuffleIndices.empty()) ||
           ((TE->State == TreeEntry::Vectorize ||
-            TE->State == TreeEntry::StridedVectorize) &&
+            TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::CompressVectorize) &&
           (isa(TE->getMainOp()) ||
            (SLPReVec && isa(TE->getMainOp()))))) {
@@ -6728,6 +6911,7 @@ bool BoUpSLP::canReorderOperands(
                      return OpData.first == I &&
                             (OpData.second->State == TreeEntry::Vectorize ||
                              OpData.second->State == TreeEntry::StridedVectorize ||
+                             OpData.second->State == TreeEntry::CompressVectorize ||
                              OpData.second->State == TreeEntry::SplitVectorize);
                    }))
        continue;
@@ -6742,6 +6926,7 @@ bool BoUpSLP::canReorderOperands(
     // node, just reorder reuses mask.
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
+        TE->State != TreeEntry::CompressVectorize &&
         TE->State != TreeEntry::SplitVectorize &&
         TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
       GatherOps.push_back(TE);
@@ -6752,6 +6937,7 @@ bool BoUpSLP::canReorderOperands(
                    [&Gather, UserTE, I](TreeEntry *TE) {
                      assert(TE->State != TreeEntry::Vectorize &&
                             TE->State != TreeEntry::StridedVectorize &&
+                            TE->State != TreeEntry::CompressVectorize &&
                             TE->State != TreeEntry::SplitVectorize &&
                             "Only non-vectorized nodes are expected.");
                      if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6788,6 +6974,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
+        TE->State != TreeEntry::CompressVectorize &&
         TE->State != TreeEntry::SplitVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
@@ -6795,6 +6982,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       Queue.push(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::CompressVectorize ||
             TE->State == TreeEntry::SplitVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.insert(TE.get());
@@ -6823,6 +7011,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     for (TreeEntry *TE : OrderedOps) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::CompressVectorize ||
             TE->State == TreeEntry::SplitVectorize ||
             (TE->isGather() && GathersToOrders.contains(TE))) ||
           !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -7117,6 +7306,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     // Gathers are processed separately.
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
+        TE->State != TreeEntry::CompressVectorize &&
         TE->State != TreeEntry::SplitVectorize &&
         (TE->State != TreeEntry::ScatterVectorize ||
          TE->ReorderIndices.empty()))
@@ -7149,7 +7339,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       Data.first->reorderOperands(Mask);
       if (!isa(Data.first->getMainOp()) ||
           Data.first->isAltShuffle() ||
-          Data.first->State == TreeEntry::StridedVectorize) {
+          Data.first->State == TreeEntry::StridedVectorize ||
+          Data.first->State == TreeEntry::CompressVectorize) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder,
                      /*BottomOrder=*/true);
@@ -7927,8 +8118,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                 });
                 if (It == Slice.end())
                   return false;
-                ArrayRef<Value *> VL =
-                    VectorizableTree[std::get<0>(P)]->Scalars;
+                const TreeEntry &TE =
+                    *VectorizableTree[std::get<0>(P)];
+                ArrayRef<Value *> VL = TE.Scalars;
+                OrdersType Order;
+                SmallVector<Value *> PointerOps;
+                LoadsState State = canVectorizeLoads(
+                    VL, VL.front(), Order, PointerOps);
+                if (State == LoadsState::ScatterVectorize ||
+                    State == LoadsState::CompressVectorize)
+                  return false;
                 ConsecutiveNodesSize += VL.size();
                 unsigned Start = std::distance(Slice.begin(), It);
                 unsigned Sz = Slice.size() - Start;
@@ -8393,23 +8592,44 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     // treats loading/storing it as an i8 struct. If we vectorize loads/stores
     // from such a struct, we read/write packed bits disagreeing with the
     // unvectorized version.
+    auto IsGatheredNode = [&]() {
+      if (!GatheredLoadsEntriesFirst)
+        return false;
+      return all_of(VL, [&](Value *V) {
+        if (isa<PoisonValue>(V))
+          return true;
+        return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
+          return TE->Idx >= *GatheredLoadsEntriesFirst;
+        });
+      });
+    };
     switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
     case LoadsState::Vectorize:
       return TreeEntry::Vectorize;
+    case LoadsState::CompressVectorize:
+      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+        // Delay slow vectorized nodes for better vectorization attempts.
+        LoadEntriesToVectorize.insert(VectorizableTree.size());
+        return TreeEntry::NeedToGather;
+      }
+      return IsGatheredNode() ? TreeEntry::NeedToGather
+                              : TreeEntry::CompressVectorize;
     case LoadsState::ScatterVectorize:
       if (!IsGraphTransformMode && !VectorizableTree.empty()) {
         // Delay slow vectorized nodes for better vectorization attempts.
         LoadEntriesToVectorize.insert(VectorizableTree.size());
         return TreeEntry::NeedToGather;
       }
-      return TreeEntry::ScatterVectorize;
+      return IsGatheredNode() ? TreeEntry::NeedToGather
+                              : TreeEntry::ScatterVectorize;
     case LoadsState::StridedVectorize:
       if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
         // Delay slow vectorized nodes for better vectorization attempts.
         LoadEntriesToVectorize.insert(VectorizableTree.size());
         return TreeEntry::NeedToGather;
       }
-      return TreeEntry::StridedVectorize;
+      return IsGatheredNode() ? TreeEntry::NeedToGather
+                              : TreeEntry::StridedVectorize;
     case LoadsState::Gather:
 #ifndef NDEBUG
       Type *ScalarTy = VL0->getType();
@@ -9510,6 +9730,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                      << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                  TE->dump());
       break;
+    case TreeEntry::CompressVectorize:
+      // Vectorizing non-consecutive loads with (masked)load + compress.
+      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
+                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+      LLVM_DEBUG(
+          dbgs()
+              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
+          TE->dump());
+      break;
     case TreeEntry::StridedVectorize:
       // Vectorizing non-consecutive loads with `llvm.masked.gather`.
       TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
@@ -12041,6 +12270,8 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
   if (TE.State == TreeEntry::ScatterVectorize ||
       TE.State == TreeEntry::StridedVectorize)
     return TTI::CastContextHint::GatherScatter;
+  if (TE.State == TreeEntry::CompressVectorize)
+    return TTI::CastContextHint::Masked;
   if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
       !TE.isAltShuffle()) {
     if (TE.ReorderIndices.empty())
@@ -12134,7 +12365,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize ||
-          E->State == TreeEntry::StridedVectorize) &&
+          E->State == TreeEntry::StridedVectorize ||
+          E->State == TreeEntry::CompressVectorize) &&
          "Unhandled state");
   assert(E->getOpcode() &&
          ((allSameType(VL) && allSameBlock(VL)) ||
@@ -12225,8 +12457,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // Negative value means vectorizing is profitable.
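For the masked flavor, the vectorizeTree changes further below derive the <N x i1> mask constant straight from the compress mask: exactly the lanes the mask names are loaded, everything else stays poison. A plain standalone sketch of that bit-pattern construction, illustrative only:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> CompressMask = {0, 2, 4, 6};
  int WideLanes = 7;                  // lanes in the wide LoadVecTy
  std::vector<bool> MaskValues(WideLanes, false);
  for (int Lane : CompressMask)
    MaskValues[Lane] = true;          // every lane the mask names is live
  for (bool Bit : MaskValues)
    printf("%d", (int)Bit);           // prints: 1010101
}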
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
     assert((E->State == TreeEntry::Vectorize ||
-            E->State == TreeEntry::StridedVectorize) &&
-           "Entry state expected to be Vectorize or StridedVectorize here.");
+            E->State == TreeEntry::StridedVectorize ||
+            E->State == TreeEntry::CompressVectorize) &&
+           "Entry state expected to be Vectorize, StridedVectorize or "
+           "CompressVectorize here.");
     InstructionCost ScalarCost = 0;
     InstructionCost VecCost = 0;
     std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -12689,6 +12923,46 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                          /*VariableMask=*/false,
                                          CommonAlignment, CostKind);
       break;
     }
+    case TreeEntry::CompressVectorize: {
+      SmallVector<Value *> PointerOps(VL.size());
+      for (auto [I, V] : enumerate(VL))
+        PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+      bool IsMasked;
+      unsigned InterleaveFactor;
+      SmallVector<int> CompressMask;
+      VectorType *LoadVecTy;
+      [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
+          VL, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, *TLI,
+          [](Value *) { return true; }, IsMasked, InterleaveFactor,
+          CompressMask, LoadVecTy);
+      assert(IsVectorized && "Expected to be vectorized");
+      Align CommonAlignment;
+      if (IsMasked)
+        CommonAlignment =
+            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
+      else
+        CommonAlignment = LI0->getAlign();
+      if (InterleaveFactor) {
+        VecLdCost = TTI->getInterleavedMemoryOpCost(
+            Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt,
+            CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
+      } else if (IsMasked) {
+        VecLdCost = TTI->getMaskedMemoryOpCost(
+            Instruction::Load, LoadVecTy, CommonAlignment,
+            LI0->getPointerAddressSpace(), CostKind);
+        // TODO: include this cost into CommonCost.
+        VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+                                      LoadVecTy, CompressMask, CostKind);
+      } else {
+        VecLdCost = TTI->getMemoryOpCost(
+            Instruction::Load, LoadVecTy, CommonAlignment,
+            LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+        // TODO: include this cost into CommonCost.
+        VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+                                      LoadVecTy, CompressMask, CostKind);
+      }
+      break;
+    }
     case TreeEntry::ScatterVectorize: {
       Align CommonAlignment =
           computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
@@ -12978,6 +13252,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   if (VectorizableTree.size() == 1 &&
       (VectorizableTree[0]->State == TreeEntry::Vectorize ||
        VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
+       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
        (ForReduction &&
         AreVectorizableGathers(VectorizableTree[0].get(),
                                VectorizableTree[0]->Scalars.size()) &&
@@ -13001,7 +13276,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   if (VectorizableTree[0]->isGather() ||
       (VectorizableTree[1]->isGather() &&
        VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
-       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
+       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
+       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
     return false;
 
   return true;
@@ -17183,6 +17459,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+      } else if (E->State == TreeEntry::CompressVectorize) {
+        SmallVector<Value *> PointerOps(E->Scalars.size());
+        for (auto [I, V] : enumerate(E->Scalars))
+          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+        bool IsMasked;
+        unsigned InterleaveFactor;
+        SmallVector<int> CompressMask;
+        VectorType *LoadVecTy;
+        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
+            E->Scalars, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT,
+            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
+            CompressMask, LoadVecTy);
+        assert(IsVectorized && "Expected to be vectorized");
+        Align CommonAlignment;
+        if (IsMasked)
+          CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+        else
+          CommonAlignment = LI->getAlign();
+        if (IsMasked) {
+          SmallVector<Constant *> MaskValues(
+              getNumElements(LoadVecTy) / getNumElements(LI->getType()),
+              ConstantInt::getFalse(VecTy->getContext()));
+          for (int I : CompressMask)
+            MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
+          Constant *MaskValue = ConstantVector::get(MaskValues);
+          NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
+                                           MaskValue);
+        } else {
+          NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
+        }
+        NewLI = ::propagateMetadata(NewLI, E->Scalars);
+        // TODO: include this cost into CommonCost.
+        NewLI =
+            cast<Instruction>(Builder.CreateShuffleVector(NewLI, CompressMask));
       } else if (E->State == TreeEntry::StridedVectorize) {
         Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
         Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
@@ -17252,7 +17562,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
         NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
       }
-      Value *V = ::propagateMetadata(NewLI, E->Scalars);
+      Value *V = E->State == TreeEntry::CompressVectorize
+                     ?
NewLI + : ::propagateMetadata(NewLI, E->Scalars); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -17854,11 +18166,14 @@ Value *BoUpSLP::vectorizeTree( ArrayRef UseEntries = getTreeEntries(U); return !UseEntries.empty() && (E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize) && + E->State == TreeEntry::StridedVectorize || + E->State == TreeEntry::CompressVectorize) && any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) { return (UseEntry->State == TreeEntry::Vectorize || UseEntry->State == - TreeEntry::StridedVectorize) && + TreeEntry::StridedVectorize || + UseEntry->State == + TreeEntry::CompressVectorize) && doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index aa9195f8c48ce..b99a1c2d83394 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,19 +15,16 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP12]], <8 x float> [[TMP13]], i64 8) -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP15]], <4 x float> [[TMP7]], i64 0) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP16]], <2 x float> [[TMP9]], i64 6) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] -; CHECK-NEXT: store <16 x float> [[TMP18]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> 
[[TMP18]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 ; CHECK-NEXT: ret void ; alloca_0: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll index 12263b065d89c..80ba7a40fb193 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll @@ -9,17 +9,9 @@ define void @test() { ; CHECK-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[COND_IN_V]], align 8 -; CHECK-NEXT: [[BV:%.*]] = icmp eq i64 [[V]], 0 -; CHECK-NEXT: [[IN_1:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 4 -; CHECK-NEXT: [[V_1:%.*]] = load i64, ptr [[IN_1]], align 8 -; CHECK-NEXT: [[BV_1:%.*]] = icmp eq i64 [[V_1]], 0 -; CHECK-NEXT: [[IN_2:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 8 -; CHECK-NEXT: [[V_2:%.*]] = load i64, ptr [[IN_2]], align 8 -; CHECK-NEXT: [[BV_2:%.*]] = icmp eq i64 [[V_2]], 0 -; CHECK-NEXT: [[IN_3:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 12 -; CHECK-NEXT: [[V_3:%.*]] = load i64, ptr [[IN_3]], align 8 -; CHECK-NEXT: [[BV_3:%.*]] = icmp eq i64 [[V_3]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret void ; ; CHECK-SLP-THRESHOLD-LABEL: define void @test @@ -28,11 +20,9 @@ define void @test() { ; CHECK-SLP-THRESHOLD-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-SLP-THRESHOLD-NEXT: br label [[BB:%.*]] ; CHECK-SLP-THRESHOLD: bb: -; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0 -; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> -; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) -; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer +; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) +; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> +; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-SLP-THRESHOLD-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index f249394c91788..a9c0eb3f9f2b9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -24,20 +24,16 @@ define void @foo() { ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( -; AVX-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 -; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 
[[TMP2]], i64 1 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo( -; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 -; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 925c334cb5f20..a0e52c13ec621 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; 
AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 -; AVX2-NEXT: 
[[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 +; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; AVX2-NEXT: [[T14:%.*]] = 
getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 -; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 +; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; 
AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call 
<45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index dc1ba4ec7e7ab..6c5638819dcea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: 
[[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 -; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 +; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> 
poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 -; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: 
[[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 +; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> 
[[TMP4]], -; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x 
float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index 0807a1bd4cdea..bbb1b87fc3dfa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -5,10 +5,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[OFF0_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[OFF0_1]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]] @@ -22,9 +21,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-1' + ; YAML-NEXT: - Cost: '-10' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '8' + ; YAML-NEXT: - TreeSize: '5' entry: %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 %idx0 = load i32, ptr %off0.1, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 5bd954e741d43..02058b1fe8578 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,16 +5,17 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> -; 
CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP6]], <2 x i32> [[TMP10]], i64 0) ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> @@ -64,16 +65,17 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP12]], <2 x i32> [[TMP10]], i64 0) ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer @@ -125,16 +127,17 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; 
CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> @@ -184,16 +187,17 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index c7c67d31f9ded..73b6c80730935 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -3,14 +3,12 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x ptr> [[TMP2]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> +; 
CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> , <16 x float> poison) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index c114c5dee78e9..92d5506977aeb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -8,14 +8,11 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> , <6 x float> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index 1294a87ff6967..d487e3616956c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,16 +5,12 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 
8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG:%.*]], i32 8, <5 x i1> , <5 x i64> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x i64> [[TMP1]], <5 x i64> poison, <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG]], i32 8, <5 x i1> , <5 x i64> poison) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <5 x i64> [[TMP3]], <5 x i64> poison, <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index e1e80d96d416d..b4996eb58b47e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -19,11 +19,11 @@ define void @test() { ; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]]) ; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) ; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP13]], double [[SIN3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 202ec9633712f..3f684e414c8ba 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -8,15 +8,10 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[TMP0:%.*]] = load 
<2 x i32>, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr [[ARRAYIDX20]], i32 4, <12 x i1> , <12 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index 8fe7d15b69cb1..fdc0bc0e00eb8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,16 +4,15 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[ADDR]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[GEP2]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <15 x i32> [[TMP11]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] From 1a540c3b8bcefaf6b36f261341ce55a1a24eca21 Mon Sep 17 00:00:00 2001 From: zhijian lin Date: Thu, 3 Apr 2025 13:22:49 -0400 
Subject: [PATCH 0561/1029] [PowerPC] Deprecate uses of ISD::ADDC/ISD::ADDE/ISD::SUBC/ISD::SUBE (#133155)

ISD::ADDC, ISD::ADDE, ISD::SUBC, and ISD::SUBE are being deprecated;
ISD::UADDO_CARRY and ISD::USUBO_CARRY are used instead. This patch lowers
UADDO, UADDO_CARRY, USUBO, and USUBO_CARRY for PowerPC.

---
 llvm/include/llvm/CodeGen/LivePhysRegs.h      |   3 +
 llvm/lib/CodeGen/LivePhysRegs.cpp             |  24 ++
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   | 192 ++++++++++-----
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |   9 +-
 llvm/lib/Target/PowerPC/PPCInstr64Bit.td      |  20 +-
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp      |  17 ++
 llvm/lib/Target/PowerPC/PPCInstrInfo.td       |  44 +++-
 llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp   |   7 +
 llvm/lib/Target/PowerPC/PPCRegisterInfo.h     |   3 +
 llvm/lib/Target/PowerPC/PPCRegisterInfo.td    |   1 +
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  27 +--
 llvm/test/CodeGen/PowerPC/adde_return_type.ll |   2 +-
 llvm/test/CodeGen/PowerPC/addegluecrash.ll    |  24 +-
 llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll   |  16 +-
 llvm/test/CodeGen/PowerPC/aix-cc-abi.ll       |   8 +-
 .../CodeGen/PowerPC/aix-cc-byval-split.ll     |   8 +-
 .../CodeGen/PowerPC/aix-tls-gd-longlong.ll    |  48 ++--
 .../PowerPC/aix-tls-le-ldst-longlong.ll       | 120 +++++-----
 .../PowerPC/aix-tls-le-xcoff-reloc-large32.ll |  24 +-
 .../PowerPC/atomicrmw-cond-sub-clamp.ll       |   6 +-
 .../carry-liveness-after-expand-isel.ll       |  82 +++++++
 llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll    |   6 +-
 llvm/test/CodeGen/PowerPC/inc-of-add.ll       |   2 +-
 llvm/test/CodeGen/PowerPC/pr35688.ll          |   3 +-
 llvm/test/CodeGen/PowerPC/pr36292.ll          |   7 +-
 llvm/test/CodeGen/PowerPC/pr40922.ll          |   9 +-
 llvm/test/CodeGen/PowerPC/pr45448.ll          |  12 +-
 llvm/test/CodeGen/PowerPC/sat-add.ll          |  35 +--
 llvm/test/CodeGen/PowerPC/select.ll           |  20 +-
 llvm/test/CodeGen/PowerPC/uaddo-32.ll         |  50 ++--
 llvm/test/CodeGen/PowerPC/uaddo-64.ll         |  82 ++++---
 .../umulo-128-legalisation-lowering.ll        | 219 +++++++++---------
 .../PowerPC/urem-seteq-illegal-types.ll       |  23 +-
 33 files changed, 718 insertions(+), 435 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/carry-liveness-after-expand-isel.ll

diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h
index 3b3608e4641e7..2a719571fde2d 100644
--- a/llvm/include/llvm/CodeGen/LivePhysRegs.h
+++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h
@@ -195,6 +195,9 @@ void addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs);
 
 void computeAndAddLiveIns(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB);
 
+/// Check if physical register \p Reg is used after \p MBI.
+bool isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI);
+
 /// Convenience function for recomputing live-in's for a MBB. Returns true if
 /// any changes were made.
 static inline bool recomputeLiveIns(MachineBasicBlock &MBB) {
diff --git a/llvm/lib/CodeGen/LivePhysRegs.cpp b/llvm/lib/CodeGen/LivePhysRegs.cpp
index 7a06d108c66ca..bc711382420be 100644
--- a/llvm/lib/CodeGen/LivePhysRegs.cpp
+++ b/llvm/lib/CodeGen/LivePhysRegs.cpp
@@ -338,3 +338,27 @@ void llvm::computeAndAddLiveIns(LivePhysRegs &LiveRegs,
   computeLiveIns(LiveRegs, MBB);
   addLiveIns(MBB, LiveRegs);
 }
+
+// Returns true if `Reg` is used after this iterator in the rest of the
+// basic block or any successors of the basic block.
+bool llvm::isPhysRegUsedAfter(Register Reg, MachineBasicBlock::iterator MBI) { + assert(Reg.isPhysical() && "Apply to physical register only"); + + MachineBasicBlock *MBB = MBI->getParent(); + // Scan forward through BB for a use/def of Reg + for (const MachineInstr &MI : llvm::make_range(std::next(MBI), MBB->end())) { + if (MI.readsRegister(Reg, /*TRI=*/nullptr)) + return true; + // If we found a def, we can stop searching. + if (MI.definesRegister(Reg, /*TRI=*/nullptr)) + return false; + } + + // If we hit the end of the block, check whether Reg is live into a + // successor. + for (MachineBasicBlock *Succ : MBB->successors()) + if (Succ->isLiveIn(Reg)) + return true; + + return false; +} diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 5ed14cd21840c..7f4ddae5db463 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -36,6 +36,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -197,6 +198,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } setOperationAction(ISD::UADDO, RegVT, Custom); + setOperationAction(ISD::USUBO, RegVT, Custom); + + // PowerPC uses addo_carry,subo_carry to propagate carry. + setOperationAction(ISD::UADDO_CARRY, RegVT, Custom); + setOperationAction(ISD::USUBO_CARRY, RegVT, Custom); // On P10, the default lowering generates better code using the // setbc instruction. @@ -266,15 +272,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); } - // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry. - const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; - for (MVT VT : ScalarIntVTs) { - setOperationAction(ISD::ADDC, VT, Legal); - setOperationAction(ISD::ADDE, VT, Legal); - setOperationAction(ISD::SUBC, VT, Legal); - setOperationAction(ISD::SUBE, VT, Legal); - } - if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -1864,6 +1861,14 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return "PPCISD::SETBC"; case PPCISD::SETBCR: return "PPCISD::SETBCR"; + case PPCISD::ADDC: + return "PPCISD::ADDC"; + case PPCISD::ADDE: + return "PPCISD::ADDE"; + case PPCISD::SUBC: + return "PPCISD::SUBC"; + case PPCISD::SUBE: + return "PPCISD::SUBE"; } return nullptr; } @@ -12150,43 +12155,74 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("ERROR:Should return for all cases within swtich."); } -SDValue PPCTargetLowering::LowerUaddo(SDValue Op, SelectionDAG &DAG) const { - // Default to target independent lowering if there is a logical user of the - // carry-bit. - for (SDNode *U : Op->users()) { - if (U->getOpcode() == ISD::SELECT) - return SDValue(); - if (ISD::isBitwiseLogicOp(U->getOpcode())) { - for (unsigned i = 0, ie = U->getNumOperands(); i != ie; ++i) { - if (U->getOperand(i).getOpcode() != ISD::UADDO && - U->getOperand(i).getOpcode() != ISD::MERGE_VALUES) - return SDValue(); - } - } - } - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDLoc dl(Op); - - // Default to target independent lowering for special cases handled there. 
- if (isOneConstant(RHS) || isAllOnesConstant(RHS)) - return SDValue(); +static SDValue ConvertCarryValueToCarryFlag(EVT SumType, SDValue Value, + SelectionDAG &DAG, + const PPCSubtarget &STI) { + SDLoc DL(Value); + if (STI.useCRBits()) + Value = DAG.getNode(ISD::SELECT, DL, SumType, Value, + DAG.getConstant(1, DL, SumType), + DAG.getConstant(0, DL, SumType)); + else + Value = DAG.getZExtOrTrunc(Value, DL, SumType); + SDValue Sum = DAG.getNode(PPCISD::ADDC, DL, DAG.getVTList(SumType, MVT::i32), + Value, DAG.getAllOnesConstant(DL, SumType)); + return Sum.getValue(1); +} - EVT VT = Op.getNode()->getValueType(0); +static SDValue ConvertCarryFlagToCarryValue(EVT SumType, SDValue Flag, + EVT CarryType, SelectionDAG &DAG, + const PPCSubtarget &STI) { + SDLoc DL(Flag); + SDValue Zero = DAG.getConstant(0, DL, SumType); + SDValue Carry = DAG.getNode( + PPCISD::ADDE, DL, DAG.getVTList(SumType, MVT::i32), Zero, Zero, Flag); + if (STI.useCRBits()) + return DAG.getSetCC(DL, CarryType, Carry, Zero, ISD::SETNE); + return DAG.getZExtOrTrunc(Carry, DL, CarryType); +} - SDValue ADDC; - SDValue Overflow; - SDVTList VTs = Op.getNode()->getVTList(); +SDValue PPCTargetLowering::LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const { - ADDC = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), LHS, RHS); - Overflow = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(VT, MVT::Glue), - DAG.getConstant(0, dl, VT), DAG.getConstant(0, dl, VT), - ADDC.getValue(1)); - SDValue OverflowTrunc = - DAG.getNode(ISD::TRUNCATE, dl, Op.getNode()->getValueType(1), Overflow); - SDValue Res = - DAG.getNode(ISD::MERGE_VALUES, dl, VTs, ADDC.getValue(0), OverflowTrunc); - return Res; + SDLoc DL(Op); + SDNode *N = Op.getNode(); + EVT VT = N->getValueType(0); + EVT CarryType = N->getValueType(1); + unsigned Opc = N->getOpcode(); + bool IsAdd = Opc == ISD::UADDO; + Opc = IsAdd ? PPCISD::ADDC : PPCISD::SUBC; + SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), + N->getOperand(0), N->getOperand(1)); + SDValue Carry = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, + DAG, Subtarget); + if (!IsAdd) + Carry = DAG.getNode(ISD::XOR, DL, CarryType, Carry, + DAG.getConstant(1UL, DL, CarryType)); + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, Carry); +} + +SDValue PPCTargetLowering::LowerADDSUBO_CARRY(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDNode *N = Op.getNode(); + unsigned Opc = N->getOpcode(); + EVT VT = N->getValueType(0); + EVT CarryType = N->getValueType(1); + SDValue CarryOp = N->getOperand(2); + bool IsAdd = Opc == ISD::UADDO_CARRY; + Opc = IsAdd ? 
PPCISD::ADDE : PPCISD::SUBE; + if (!IsAdd) + CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, + DAG.getAllOnesConstant(DL, CarryOp.getValueType())); + CarryOp = ConvertCarryValueToCarryFlag(VT, CarryOp, DAG, Subtarget); + SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), + Op.getOperand(0), Op.getOperand(1), CarryOp); + CarryOp = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, DAG, + Subtarget); + if (!IsAdd) + CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, + DAG.getConstant(1UL, DL, CarryOp.getValueType())); + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, CarryOp); } SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { @@ -12217,8 +12253,8 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - default: llvm_unreachable("Wasn't expecting to be able to lower this!"); - case ISD::UADDO: return LowerUaddo(Op, DAG); + default: + llvm_unreachable("Wasn't expecting to be able to lower this!"); case ISD::FPOW: return lowerPow(Op, DAG); case ISD::FSIN: return lowerSin(Op, DAG); case ISD::FCOS: return lowerCos(Op, DAG); @@ -12311,6 +12347,12 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerATOMIC_LOAD_STORE(Op, DAG); case ISD::IS_FPCLASS: return LowerIS_FPCLASS(Op, DAG); + case ISD::UADDO: + case ISD::USUBO: + return LowerADDSUBO(Op, DAG); + case ISD::UADDO_CARRY: + case ISD::USUBO_CARRY: + return LowerADDSUBO_CARRY(Op, DAG); } } @@ -13393,6 +13435,11 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, F->insert(It, copy0MBB); F->insert(It, sinkMBB); + if (isPhysRegUsedAfter(PPC::CARRY, MI.getIterator())) { + copy0MBB->addLiveIn(PPC::CARRY); + sinkMBB->addLiveIn(PPC::CARRY); + } + // Set the call frame size on entry to the new basic blocks. // See https://reviews.llvm.org/D156113. 
unsigned CallFrameSize = TII->getCallFrameSizeAt(MI); @@ -16245,6 +16292,21 @@ static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) { return true; } +static SDValue DAGCombineAddc(SDNode *N, + llvm::PPCTargetLowering::DAGCombinerInfo &DCI) { + if (N->getOpcode() == PPCISD::ADDC && N->hasAnyUseOfValue(1)) { + // (ADDC (ADDE 0, 0, C), -1) -> C + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if (LHS->getOpcode() == PPCISD::ADDE && + isNullConstant(LHS->getOperand(0)) && + isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) { + return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); + } + } + return SDValue(); +} + SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -17033,6 +17095,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); + case PPCISD::ADDC: + return DAGCombineAddc(N, DCI); } return SDValue(); @@ -17086,6 +17150,16 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero = 0xFFFF0000; break; } + case PPCISD::ADDE: { + if (Op.getResNo() == 0) { + // (0|1), _ = ADDE 0, 0, CARRY + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + if (isNullConstant(LHS) && isNullConstant(RHS)) + Known.Zero = ~1ULL; + } + break; + } case ISD::INTRINSIC_WO_CHAIN: { switch (Op.getConstantOperandVal(0)) { default: break; @@ -18355,7 +18429,8 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, return SDValue(); SDLoc DL(N); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue); + EVT CarryType = Subtarget.useCRBits() ? MVT::i1 : MVT::i32; + SDVTList VTs = DAG.getVTList(MVT::i64, CarryType); SDValue Cmp = RHS.getOperand(0); SDValue Z = Cmp.getOperand(0); auto *Constant = cast(Cmp.getOperand(1)); @@ -18373,11 +18448,14 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? Add : Z; - SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue), - AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64)); - return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), + SDValue Addc = + DAG.getNode(ISD::UADDO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), + AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64), + DAG.getConstant(0, DL, CarryType)); + return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, + DAG.getConstant(0, DL, MVT::i64), SDValue(Addc.getNode(), 1)); - } + } case ISD::SETEQ: { // when C == 0 // --> addze X, (subfic Z, 0).carry @@ -18388,11 +18466,15 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? 
Add : Z; - SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue), - DAG.getConstant(0, DL, MVT::i64), AddOrZ); - return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), - SDValue(Subc.getNode(), 1)); - } + SDValue Subc = + DAG.getNode(ISD::USUBO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), + DAG.getConstant(0, DL, MVT::i64), AddOrZ, + DAG.getConstant(0, DL, CarryType)); + SDValue Invert = DAG.getNode(ISD::XOR, DL, CarryType, Subc.getValue(1), + DAG.getConstant(1UL, DL, CarryType)); + return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, + DAG.getConstant(0, DL, MVT::i64), Invert); + } } return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 1f22aa16a89be..7365f3103276c 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -161,6 +161,12 @@ namespace llvm { SRA, SHL, + /// These nodes represent PPC arithmetic operations with carry. + ADDC, + ADDE, + SUBC, + SUBE, + /// FNMSUB - Negated multiply-subtract instruction. FNMSUB, @@ -1280,7 +1286,6 @@ namespace llvm { SDValue LowerGlobalTLSAddressLinux(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerUaddo(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; @@ -1316,6 +1321,8 @@ namespace llvm { SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerToLibCall(const char *LibCallName, SDValue Op, SelectionDAG &DAG) const; SDValue lowerLibCallBasedOnType(const char *LibCallFloatName, diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index bcac0de55d9d3..4205b3086a3c9 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -760,13 +760,13 @@ def STFDXTLS : XForm_8<31, 727, (outs), (ins f8rc:$RST, ptr_rc_nor0:$RA, tlsreg: let isCommutable = 1 in defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (addc i64:$RA, i64:$RB))]>, + [(set i64:$RT, (PPCaddc i64:$RA, i64:$RB))]>, PPC970_DGroup_Cracked; let Defs = [CARRY] in def ADDIC8 : DForm_2<12, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (addc i64:$RA, imm64SExt16:$D))]>; + [(set i64:$RST, (PPCaddc i64:$RA, imm64SExt16:$D))]>; def ADDI8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), "addi $RST, $RA, $D", IIC_IntSimple, [(set i64:$RST, (add i64:$RA, imm64SExt16:$D))]>; @@ -782,11 +782,11 @@ def LA8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), let Defs = [CARRY] in { def SUBFIC8: DForm_2< 8, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (subc imm64SExt16:$D, i64:$RA))]>; + [(set i64:$RST, (PPCsubc imm64SExt16:$D, i64:$RA))]>; } defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set 
i64:$RT, (subc i64:$RB, i64:$RA))]>, + [(set i64:$RT, (PPCsubc i64:$RB, i64:$RA))]>, PPC970_DGroup_Cracked; defm SUBF8 : XOForm_1rx<31, 40, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subf", "$RT, $RA, $RB", IIC_IntGeneral, @@ -798,22 +798,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (adde i64:$RA, i64:$RB))]>; + [(set i64:$RT, (PPCadde i64:$RA, i64:$RB, CARRY))]>; defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (adde i64:$RA, -1))]>; + [(set i64:$RT, (PPCadde i64:$RA, -1, CARRY))]>; defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (adde i64:$RA, 0))]>; + [(set i64:$RT, (PPCadde i64:$RA, 0, CARRY))]>; defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (sube i64:$RB, i64:$RA))]>; + [(set i64:$RT, (PPCsube i64:$RB, i64:$RA, CARRY))]>; defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (sube -1, i64:$RA))]>; + [(set i64:$RT, (PPCsube -1, i64:$RA, CARRY))]>; defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (sube 0, i64:$RA))]>; + [(set i64:$RT, (PPCsube 0, i64:$RA, CARRY))]>; } } // isCodeGenOnly diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index f017073911950..97e9f59328f7e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1758,6 +1758,23 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; + } else if ((PPC::G8RCRegClass.contains(DestReg) || + PPC::GPRCRegClass.contains(DestReg)) && + SrcReg == PPC::CARRY) { + bool Is64Bit = PPC::G8RCRegClass.contains(DestReg); + BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MFSPR8 : PPC::MFSPR), DestReg) + .addImm(1) + .addReg(PPC::CARRY, RegState::Implicit); + return; + } else if ((PPC::G8RCRegClass.contains(SrcReg) || + PPC::GPRCRegClass.contains(SrcReg)) && + DestReg == PPC::CARRY) { + bool Is64Bit = PPC::G8RCRegClass.contains(SrcReg); + BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MTSPR8 : PPC::MTSPR)) + .addImm(1) + .addReg(SrcReg) + .addReg(PPC::CARRY, RegState::ImplicitDefine); + return; } unsigned Opc; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 175ba6009364a..e2864c2405967 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -124,6 +124,21 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> ]>; +// RES, CARRY = op LHS, RHS +def SDT_PPCBinaryArithWithFlagsOut : SDTypeProfile<2, 2, [ + SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, +]>; + +// RES, CARRY = op LHS, RHS, CARRY +def SDT_PPCBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [ + SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisSameAs<1, 4>, + SDTCisVT<1, i32>, +]>; + //===----------------------------------------------------------------------===// // PowerPC specific DAG Nodes. 
// @@ -401,6 +416,15 @@ def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", SDTIntUnaryOp, []>; +def PPCaddc : SDNode<"PPCISD::ADDC", SDT_PPCBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def PPCadde : SDNode<"PPCISD::ADDE", SDT_PPCBinaryArithWithFlagsInOut, + []>; +def PPCsubc : SDNode<"PPCISD::SUBC", SDT_PPCBinaryArithWithFlagsOut, + []>; +def PPCsube : SDNode<"PPCISD::SUBE", SDT_PPCBinaryArithWithFlagsInOut, + []>; + //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. // @@ -2291,7 +2315,7 @@ let BaseName = "addic" in { let Defs = [CARRY] in def ADDIC : DForm_2<12, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (addc i32:$RA, imm32SExt16:$D))]>, + [(set i32:$RST, (PPCaddc i32:$RA, imm32SExt16:$D))]>, RecFormRel, PPC970_DGroup_Cracked; let Defs = [CARRY, CR0] in def ADDIC_rec : DForm_2<13, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), @@ -2312,7 +2336,7 @@ def MULLI : DForm_2< 7, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), let Defs = [CARRY] in def SUBFIC : DForm_2< 8, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (subc imm32SExt16:$D, i32:$RA))]>; + [(set i32:$RST, (PPCsubc imm32SExt16:$D, i32:$RA))]>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def LI : DForm_2_r0<14, (outs gprc:$RST), (ins s16imm:$D), @@ -2909,7 +2933,7 @@ def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$RT), (ins gprc:$RA, tlsreg32:$RB let isCommutable = 1 in defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (addc i32:$RA, i32:$RB))]>, + [(set i32:$RT, (PPCaddc i32:$RA, i32:$RB))]>, PPC970_DGroup_Cracked; defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), @@ -2942,7 +2966,7 @@ defm SUBF : XOForm_1rx<31, 40, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), [(set i32:$RT, (sub i32:$RB, i32:$RA))]>; defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (subc i32:$RB, i32:$RA))]>, + [(set i32:$RT, (PPCsubc i32:$RB, i32:$RA))]>, PPC970_DGroup_Cracked; defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$RT), (ins gprc:$RA), "neg", "$RT, $RA", IIC_IntSimple, @@ -2951,22 +2975,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (adde i32:$RA, i32:$RB))]>; + [(set i32:$RT, (PPCadde i32:$RA, i32:$RB, CARRY))]>; defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$RT), (ins gprc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (adde i32:$RA, -1))]>; + [(set i32:$RT, (PPCadde i32:$RA, -1, CARRY))]>; defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$RT), (ins gprc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (adde i32:$RA, 0))]>; + [(set i32:$RT, (PPCadde i32:$RA, 0, CARRY))]>; defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (sube i32:$RB, i32:$RA))]>; + [(set i32:$RT, (PPCsube i32:$RB, i32:$RA, CARRY))]>; defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$RT), (ins gprc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (sube -1, i32:$RA))]>; + [(set i32:$RT, (PPCsube -1, i32:$RA, 
CARRY))]>; defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$RT), (ins gprc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (sube 0, i32:$RA))]>; + [(set i32:$RT, (PPCsube 0, i32:$RA, CARRY))]>; } } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index b60a91be82406..2177dba1e5762 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -625,6 +625,13 @@ bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg, return BaseImplRetVal; } +const TargetRegisterClass * +PPCRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &PPC::CARRYRCRegClass) + return TM.isPPC64() ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; + return RC; +} + unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 103059d0e29ab..005d890c57c93 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -76,6 +76,9 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 3cb7cd9d8f229..8b690b7b833b3 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -494,6 +494,7 @@ def LR8RC : RegisterClass<"PPC", [i64], 64, (add LR8)> { def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> { let CopyCost = -1; + let isAllocatable = 0; } // Make AllocationOrder as similar as G8RC's to avoid potential spilling. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d2d022ab52c41..a4381b99dbae0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -28,6 +28,7 @@ #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/IntrinsicLowering.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -35463,28 +35464,6 @@ MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context, // X86 Scheduler Hooks //===----------------------------------------------------------------------===// -// Returns true if EFLAG is consumed after this iterator in the rest of the -// basic block or any successors of the basic block. -static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, - MachineBasicBlock *BB) { - // Scan forward through BB for a use/def of EFLAGS. - for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) { - if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr)) - return true; - // If we found a def, we can stop searching. - if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) - return false; - } - - // If we hit the end of the block, check whether EFLAGS is live into a - // successor. 
- for (MachineBasicBlock *Succ : BB->successors()) - if (Succ->isLiveIn(X86::EFLAGS)) - return true; - - return false; -} - /// Utility function to emit xbegin specifying the start of an RTM region. static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII) { @@ -35517,7 +35496,7 @@ static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, MF->insert(I, fallMBB); MF->insert(I, sinkMBB); - if (isEFLAGSLiveAfter(MI, MBB)) { + if (isPhysRegUsedAfter(X86::EFLAGS, MI)) { mainMBB->addLiveIn(X86::EFLAGS); fallMBB->addLiveIn(X86::EFLAGS); sinkMBB->addLiveIn(X86::EFLAGS); @@ -35856,7 +35835,7 @@ X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock* BB, const TargetRegisterInfo* TRI) { - if (isEFLAGSLiveAfter(SelectItr, BB)) + if (isPhysRegUsedAfter(X86::EFLAGS, SelectItr)) return false; // We found a def, or hit the end of the basic block and EFLAGS wasn't live diff --git a/llvm/test/CodeGen/PowerPC/adde_return_type.ll b/llvm/test/CodeGen/PowerPC/adde_return_type.ll index 7ce11079a6267..47c5efc35afc6 100644 --- a/llvm/test/CodeGen/PowerPC/adde_return_type.ll +++ b/llvm/test/CodeGen/PowerPC/adde_return_type.ll @@ -3,7 +3,7 @@ ; RUN: < %s -o /dev/null 2>&1 | FileCheck %s define i64 @testAddeReturnType(i64 %X, i64 %Z) { -; CHECK: Legally typed node: {{.*}}: i64,glue = adde {{.*}} +; CHECK: Legally typed node: {{.*}}: i64,i1 = uaddo {{.*}} %cmp = icmp ne i64 %Z, 0 %conv1 = zext i1 %cmp to i64 %add = add nsw i64 %conv1, %X diff --git a/llvm/test/CodeGen/PowerPC/addegluecrash.ll b/llvm/test/CodeGen/PowerPC/addegluecrash.ll index a711b09b9bdfd..7cd94c0e4c2d5 100644 --- a/llvm/test/CodeGen/PowerPC/addegluecrash.ll +++ b/llvm/test/CodeGen/PowerPC/addegluecrash.ll @@ -9,20 +9,20 @@ define void @bn_mul_comba8(ptr nocapture %r, ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: std 4, -8(1) # 8-byte Folded Spill ; CHECK-NEXT: mr 4, 3 ; CHECK-NEXT: ld 3, -8(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 9, 0(3) -; CHECK-NEXT: ld 8, 0(5) -; CHECK-NEXT: mulhdu 7, 8, 9 +; CHECK-NEXT: ld 6, 0(3) +; CHECK-NEXT: ld 11, 0(5) +; CHECK-NEXT: mulhdu 8, 11, 6 ; CHECK-NEXT: ld 3, 8(3) -; CHECK-NEXT: mulld 6, 3, 9 -; CHECK-NEXT: mulhdu 3, 3, 9 -; CHECK-NEXT: addc 6, 6, 7 -; CHECK-NEXT: addze 3, 3 -; CHECK-NEXT: ld 5, 8(5) -; CHECK-NEXT: mulld 7, 5, 8 -; CHECK-NEXT: mulhdu 5, 5, 8 -; CHECK-NEXT: addc 6, 6, 7 +; CHECK-NEXT: mulld 7, 3, 6 +; CHECK-NEXT: addc 9, 7, 8 +; CHECK-NEXT: ld 10, 8(5) +; CHECK-NEXT: mulhdu 5, 10, 11 +; CHECK-NEXT: mulld 10, 10, 11 +; CHECK-NEXT: addc 9, 9, 10 ; CHECK-NEXT: addze 5, 5 -; CHECK-NEXT: add 3, 5, 3 +; CHECK-NEXT: addc 7, 7, 8 +; CHECK-NEXT: mulhdu 3, 3, 6 +; CHECK-NEXT: adde 3, 5, 3 ; CHECK-NEXT: cmpld 3, 5 ; CHECK-NEXT: crmove 20, 0 ; CHECK-NEXT: li 5, 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll index 501227c9072c4..aead5762d0921 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll @@ -1103,13 +1103,13 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0) ; 32BIT-NEXT: renamable $r12 = LWZ 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4) ; 32BIT-NEXT: renamable $r0 = LBZ 3, %fixed-stack.1 :: (load (s8) from %fixed-stack.1 + 3, basealign 4) - ; 32BIT-NEXT: renamable $r31 = LWZ 4, 
%fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) - ; 32BIT-NEXT: renamable $r30 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) + ; 32BIT-NEXT: renamable $r31 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) + ; 32BIT-NEXT: renamable $r30 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) ; 32BIT-NEXT: renamable $r29 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5, align 8) ; 32BIT-NEXT: renamable $r28 = LBZ 3, %fixed-stack.6 :: (load (s8) from %fixed-stack.6 + 3, basealign 4) ; 32BIT-NEXT: renamable $r27 = LHA 2, %fixed-stack.7 :: (load (s16) from %fixed-stack.7 + 2, basealign 4) - ; 32BIT-NEXT: renamable $r26 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) - ; 32BIT-NEXT: renamable $r25 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) + ; 32BIT-NEXT: renamable $r26 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) + ; 32BIT-NEXT: renamable $r25 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6 @@ -1120,8 +1120,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10 ; 32BIT-NEXT: renamable $r6 = SRAWI renamable $r3, 31, implicit-def dead $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r26, implicit-def $carry - ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r25, implicit-def dead $carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r25, implicit-def $carry + ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r26, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r7 = SRAWI renamable $r27, 31, implicit-def dead $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r27, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r7, implicit-def dead $carry, implicit $carry @@ -1131,8 +1131,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r6 = ADDZE killed renamable $r6, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r12, implicit-def $carry ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r6, killed renamable $r4, implicit-def dead $carry, implicit $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r31, implicit-def $carry - ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r30, implicit-def dead $carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r30, implicit-def $carry + ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r31, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r0, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDZE killed renamable $r4, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable 
$r4 = ADDC killed renamable $r3, killed renamable $r11, implicit-def $carry diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 79c59e925302a..8f33f5ef863e6 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1213,14 +1213,14 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; ASM32PWR4-NEXT: addc 3, 3, 6 ; ASM32PWR4-NEXT: addze 6, 7 ; ASM32PWR4-NEXT: addc 3, 3, 9 -; ASM32PWR4-NEXT: lwz 5, 84(1) +; ASM32PWR4-NEXT: lwz 7, 84(1) ; ASM32PWR4-NEXT: addze 6, 6 ; ASM32PWR4-NEXT: addc 3, 3, 31 -; ASM32PWR4-NEXT: lwz 7, 80(1) +; ASM32PWR4-NEXT: lwz 5, 80(1) ; ASM32PWR4-NEXT: adde 6, 6, 30 -; ASM32PWR4-NEXT: addc 3, 3, 5 +; ASM32PWR4-NEXT: addc 3, 3, 7 ; ASM32PWR4-NEXT: lbz 8, 91(1) -; ASM32PWR4-NEXT: adde 5, 6, 7 +; ASM32PWR4-NEXT: adde 5, 6, 5 ; ASM32PWR4-NEXT: addc 3, 3, 8 ; ASM32PWR4-NEXT: lbz 6, 103(1) ; ASM32PWR4-NEXT: addze 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll index f1bf7c262317d..9b1893b111556 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll @@ -36,17 +36,17 @@ entry: ; CHECK32: bb.0.entry: ; CHECK32-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 -; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 84, %fixed-stack.0 +; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 80, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4 -; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 80, %fixed-stack.0 +; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 84, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8 ; CHECK32-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12 ; CHECK32-DAG: STW renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16 ; CHECK32-DAG: STW renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20 ; CHECK32-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24 -; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG1]], implicit-def $carry -; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG2]], implicit-def dead $carry, implicit killed $carry +; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG2]], implicit-def $carry +; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG1]], implicit-def dead $carry, implicit killed $carry ; CHECK32 STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28 ; CHECK32: BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll index 53a7cb0aad9ee..5f471ce83828a 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll @@ -325,12 +325,12 @@ define i64 @loadsTGInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 4(3) +; SMALL32-NEXT: lwz 5, 0(3) +; SMALL32-NEXT: lwz 3, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) -; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 
4, 6, 5 -; SMALL32-NEXT: adde 3, 7, 3 +; SMALL32-NEXT: addc 4, 6, 3 +; SMALL32-NEXT: adde 3, 7, 5 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -346,14 +346,14 @@ define i64 @loadsTGInit() #1 { ; LARGE32-NEXT: lwz 3, L..C0@l(3) ; LARGE32-NEXT: lwz 4, L..C1@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 4, 4(3) -; LARGE32-NEXT: lwz 3, 0(3) -; LARGE32-NEXT: addis 5, L..C7@u(2) -; LARGE32-NEXT: lwz 5, L..C7@l(5) -; LARGE32-NEXT: lwz 6, 4(5) -; LARGE32-NEXT: lwz 5, 0(5) -; LARGE32-NEXT: addc 4, 6, 4 -; LARGE32-NEXT: adde 3, 5, 3 +; LARGE32-NEXT: lwz 5, 0(3) +; LARGE32-NEXT: lwz 3, 4(3) +; LARGE32-NEXT: addis 4, L..C7@u(2) +; LARGE32-NEXT: lwz 4, L..C7@l(4) +; LARGE32-NEXT: lwz 6, 4(4) +; LARGE32-NEXT: lwz 7, 0(4) +; LARGE32-NEXT: addc 4, 6, 3 +; LARGE32-NEXT: adde 3, 7, 5 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 @@ -589,12 +589,12 @@ define i64 @loadsTWInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 4(3) +; SMALL32-NEXT: lwz 5, 0(3) +; SMALL32-NEXT: lwz 3, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) -; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 4, 6, 5 -; SMALL32-NEXT: adde 3, 7, 3 +; SMALL32-NEXT: addc 4, 6, 3 +; SMALL32-NEXT: adde 3, 7, 5 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -610,14 +610,14 @@ define i64 @loadsTWInit() #1 { ; LARGE32-NEXT: lwz 3, L..C5@l(3) ; LARGE32-NEXT: lwz 4, L..C6@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 4, 4(3) -; LARGE32-NEXT: lwz 3, 0(3) -; LARGE32-NEXT: addis 5, L..C7@u(2) -; LARGE32-NEXT: lwz 5, L..C7@l(5) -; LARGE32-NEXT: lwz 6, 4(5) -; LARGE32-NEXT: lwz 5, 0(5) -; LARGE32-NEXT: addc 4, 6, 4 -; LARGE32-NEXT: adde 3, 5, 3 +; LARGE32-NEXT: lwz 5, 0(3) +; LARGE32-NEXT: lwz 3, 4(3) +; LARGE32-NEXT: addis 4, L..C7@u(2) +; LARGE32-NEXT: lwz 4, L..C7@l(4) +; LARGE32-NEXT: lwz 6, 4(4) +; LARGE32-NEXT: lwz 7, 0(4) +; LARGE32-NEXT: addc 4, 6, 3 +; LARGE32-NEXT: adde 3, 7, 5 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll index c2d7325107a84..533c866eb4e12 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll @@ -304,15 +304,15 @@ define i64 @loadITLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C0(r2) # target-flags(ppc-tprel) @IThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] -; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r6, 4(r5) -; SMALL32-NEXT: lwz r5, 0(r5) -; SMALL32-NEXT: lwz r4, 4(r3) -; SMALL32-NEXT: lwz r3, 0(r3) -; SMALL32-NEXT: addc r4, r6, r4 -; SMALL32-NEXT: adde r3, r5, r3 +; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit +; SMALL32-NEXT: lwz r5, 0(r3) +; SMALL32-NEXT: lwz r3, 4(r3) +; SMALL32-NEXT: lwz r6, 0(r4) +; SMALL32-NEXT: lwz r4, 4(r4) +; SMALL32-NEXT: addc r4, r4, r3 +; SMALL32-NEXT: adde r3, r6, r5 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -327,14 +327,14 @@ define i64 @loadITLUninit2() { ; LARGE32-NEXT: lwz r4, L..C0@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r4, 4(r3) -; LARGE32-NEXT: lwz r3, 0(r3) -; LARGE32-NEXT: addis r5, 
L..C4@u(r2) -; LARGE32-NEXT: lwz r5, L..C4@l(r5) -; LARGE32-NEXT: lwz r6, 4(r5) -; LARGE32-NEXT: lwz r5, 0(r5) -; LARGE32-NEXT: addc r4, r6, r4 -; LARGE32-NEXT: adde r3, r5, r3 +; LARGE32-NEXT: lwz r5, 0(r3) +; LARGE32-NEXT: lwz r3, 4(r3) +; LARGE32-NEXT: addis r4, L..C4@u(r2) +; LARGE32-NEXT: lwz r4, L..C4@l(r4) +; LARGE32-NEXT: lwz r6, 0(r4) +; LARGE32-NEXT: lwz r4, 4(r4) +; LARGE32-NEXT: addc r4, r4, r3 +; LARGE32-NEXT: adde r3, r6, r5 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -424,15 +424,15 @@ define i64 @loadITLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C1(r2) # target-flags(ppc-tprel) @IThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] -; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r6, 4(r5) -; SMALL32-NEXT: lwz r5, 0(r5) -; SMALL32-NEXT: lwz r4, 4(r3) -; SMALL32-NEXT: lwz r3, 0(r3) -; SMALL32-NEXT: addc r4, r6, r4 -; SMALL32-NEXT: adde r3, r5, r3 +; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit +; SMALL32-NEXT: lwz r5, 0(r3) +; SMALL32-NEXT: lwz r3, 4(r3) +; SMALL32-NEXT: lwz r6, 0(r4) +; SMALL32-NEXT: lwz r4, 4(r4) +; SMALL32-NEXT: addc r4, r4, r3 +; SMALL32-NEXT: adde r3, r6, r5 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -447,14 +447,14 @@ define i64 @loadITLInit2() { ; LARGE32-NEXT: lwz r4, L..C1@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r4, 4(r3) -; LARGE32-NEXT: lwz r3, 0(r3) -; LARGE32-NEXT: addis r5, L..C4@u(r2) -; LARGE32-NEXT: lwz r5, L..C4@l(r5) -; LARGE32-NEXT: lwz r6, 4(r5) -; LARGE32-NEXT: lwz r5, 0(r5) -; LARGE32-NEXT: addc r4, r6, r4 -; LARGE32-NEXT: adde r3, r5, r3 +; LARGE32-NEXT: lwz r5, 0(r3) +; LARGE32-NEXT: lwz r3, 4(r3) +; LARGE32-NEXT: addis r4, L..C4@u(r2) +; LARGE32-NEXT: lwz r4, L..C4@l(r4) +; LARGE32-NEXT: lwz r6, 0(r4) +; LARGE32-NEXT: lwz r4, 4(r4) +; LARGE32-NEXT: addc r4, r4, r3 +; LARGE32-NEXT: adde r3, r6, r5 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -544,15 +544,15 @@ define i64 @loadTLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C2(r2) # target-flags(ppc-tprel) @ThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] -; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r6, 4(r5) -; SMALL32-NEXT: lwz r5, 0(r5) -; SMALL32-NEXT: lwz r4, 4(r3) -; SMALL32-NEXT: lwz r3, 0(r3) -; SMALL32-NEXT: addc r4, r6, r4 -; SMALL32-NEXT: adde r3, r5, r3 +; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit +; SMALL32-NEXT: lwz r5, 0(r3) +; SMALL32-NEXT: lwz r3, 4(r3) +; SMALL32-NEXT: lwz r6, 0(r4) +; SMALL32-NEXT: lwz r4, 4(r4) +; SMALL32-NEXT: addc r4, r4, r3 +; SMALL32-NEXT: adde r3, r6, r5 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -567,14 +567,14 @@ define i64 @loadTLUninit2() { ; LARGE32-NEXT: lwz r4, L..C2@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r4, 4(r3) -; LARGE32-NEXT: lwz r3, 0(r3) -; LARGE32-NEXT: addis r5, L..C4@u(r2) -; LARGE32-NEXT: lwz r5, L..C4@l(r5) -; LARGE32-NEXT: lwz r6, 4(r5) -; LARGE32-NEXT: lwz r5, 0(r5) -; LARGE32-NEXT: addc r4, r6, r4 -; LARGE32-NEXT: adde r3, r5, r3 +; LARGE32-NEXT: lwz r5, 0(r3) +; LARGE32-NEXT: lwz r3, 4(r3) +; LARGE32-NEXT: addis r4, L..C4@u(r2) +; LARGE32-NEXT: lwz r4, L..C4@l(r4) +; LARGE32-NEXT: lwz r6, 0(r4) +; 
LARGE32-NEXT: lwz r4, 4(r4) +; LARGE32-NEXT: addc r4, r4, r3 +; LARGE32-NEXT: adde r3, r6, r5 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -664,15 +664,15 @@ define i64 @loadTLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C3(r2) # target-flags(ppc-tprel) @ThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] -; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r6, 4(r5) -; SMALL32-NEXT: lwz r5, 0(r5) -; SMALL32-NEXT: lwz r4, 4(r3) -; SMALL32-NEXT: lwz r3, 0(r3) -; SMALL32-NEXT: addc r4, r6, r4 -; SMALL32-NEXT: adde r3, r5, r3 +; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit +; SMALL32-NEXT: lwz r5, 0(r3) +; SMALL32-NEXT: lwz r3, 4(r3) +; SMALL32-NEXT: lwz r6, 0(r4) +; SMALL32-NEXT: lwz r4, 4(r4) +; SMALL32-NEXT: addc r4, r4, r3 +; SMALL32-NEXT: adde r3, r6, r5 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -687,14 +687,14 @@ define i64 @loadTLInit2() { ; LARGE32-NEXT: lwz r4, L..C3@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r4, 4(r3) -; LARGE32-NEXT: lwz r3, 0(r3) -; LARGE32-NEXT: addis r5, L..C4@u(r2) -; LARGE32-NEXT: lwz r5, L..C4@l(r5) -; LARGE32-NEXT: lwz r6, 4(r5) -; LARGE32-NEXT: lwz r5, 0(r5) -; LARGE32-NEXT: addc r4, r6, r4 -; LARGE32-NEXT: adde r3, r5, r3 +; LARGE32-NEXT: lwz r5, 0(r3) +; LARGE32-NEXT: lwz r3, 4(r3) +; LARGE32-NEXT: addis r4, L..C4@u(r2) +; LARGE32-NEXT: lwz r4, L..C4@l(r4) +; LARGE32-NEXT: lwz r6, 0(r4) +; LARGE32-NEXT: lwz r4, 4(r4) +; LARGE32-NEXT: addc r4, r4, r3 +; LARGE32-NEXT: adde r3, r6, r5 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll index 6c0ea782c2a38..268402170063e 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll @@ -290,16 +290,16 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} bla 0 ; DIS-NEXT: {{0*}}[[#ADDR]]: R_RBA (idx: [[#NFA+1]]) .__get_tpointer[PR] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 5, 2, 0 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 4, 2, 0 ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCU (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 8(5) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 8(4) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 4(5) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(5) -; DIS-NEXT: addc 4, 6, 4 -; DIS-NEXT: adde 3, 5, 3 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 0(4) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(4) +; DIS-NEXT: addc 4, 4, 3 +; DIS-NEXT: adde 3, 6, 5 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: lwz 0, 8(1) ; DIS-NEXT: mtlr 0 @@ -324,10 +324,10 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 12(4) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+27]]) IThreadLocalVarUninit2[TE] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) -; DIS-NEXT: addic 4, 4, 1 -; DIS-NEXT: addze 3, 3 +; DIS-NEXT: [[#%x, 
ADDR:]]: {{.*}} lwz 5, 0(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) +; DIS-NEXT: addic 4, 3, 1 +; DIS-NEXT: addze 3, 5 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: lwz 0, 8(1) ; DIS-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll index 0ff2f28207ed4..4f00cff83942a 100644 --- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll +++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll @@ -357,10 +357,10 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: .LBB7_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB7_4 Depth 2 -; CHECK-NEXT: sub 5, 6, 4 -; CHECK-NEXT: cmpld 5, 6 +; CHECK-NEXT: subc 5, 6, 4 ; CHECK-NEXT: li 7, 0 -; CHECK-NEXT: bgt 0, .LBB7_4 +; CHECK-NEXT: addze. 8, 7 +; CHECK-NEXT: beq 0, .LBB7_4 ; CHECK-NEXT: # %bb.3: # %atomicrmw.start ; CHECK-NEXT: # ; CHECK-NEXT: mr 7, 5 diff --git a/llvm/test/CodeGen/PowerPC/carry-liveness-after-expand-isel.ll b/llvm/test/CodeGen/PowerPC/carry-liveness-after-expand-isel.ll new file mode 100644 index 0000000000000..15ab8aa05b329 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/carry-liveness-after-expand-isel.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck %s + +target datalayout = "E-m:e-p:32:32-Fn32-i64:64-n32" +target triple = "powerpc-unknown-linux-gnu" + +@md_seq_show___trans_tmp_57 = external global i8 + +define i32 @md_seq_show(i64 %0, i32 %1) #0 { + ; CHECK-LABEL: name: md_seq_show + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; CHECK-NEXT: liveins: $r3, $r4, $r5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprc = COPY $r5 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprc = COPY $r4 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprc = COPY $r3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprc = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gprc = COPY [[COPY2]] + ; CHECK-NEXT: [[ADDIC:%[0-9]+]]:gprc = ADDIC [[COPY1]], 1, implicit-def $carry + ; CHECK-NEXT: [[CMPLWI:%[0-9]+]]:crrc = CMPLWI killed [[ADDIC]], 1 + ; CHECK-NEXT: [[LI:%[0-9]+]]:gprc_and_gprc_nor0 = LI 0 + ; CHECK-NEXT: [[LI1:%[0-9]+]]:gprc_and_gprc_nor0 = LI 1 + ; CHECK-NEXT: BCC 44, [[CMPLWI]], %bb.4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.entry: + ; CHECK-NEXT: successors: %bb.4(0x80000000) + ; CHECK-NEXT: liveins: $carry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4.entry: + ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) + ; CHECK-NEXT: liveins: $carry + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gprc_and_gprc_nor0 = PHI [[LI]], %bb.3, [[LI1]], %bb.0 + ; CHECK-NEXT: [[ADDZE:%[0-9]+]]:gprc = ADDZE [[COPY2]], implicit-def dead $carry, implicit $carry + ; CHECK-NEXT: [[ADDIC1:%[0-9]+]]:gprc = ADDIC [[ADDZE]], -1, implicit-def $carry + ; CHECK-NEXT: [[SUBFE:%[0-9]+]]:gprc_and_gprc_nor0 = SUBFE killed [[ADDIC1]], [[ADDZE]], implicit-def dead $carry, implicit $carry + ; CHECK-NEXT: [[CMPLWI1:%[0-9]+]]:crrc = CMPLWI [[ADDZE]], 0 + ; CHECK-NEXT: BCC 76, [[CMPLWI1]], %bb.6 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5.entry: + ; CHECK-NEXT: successors: %bb.6(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.6.entry: + ; CHECK-NEXT: successors: %bb.1(0x55555556), %bb.2(0x2aaaaaaa) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gprc = PHI [[SUBFE]], %bb.5, [[PHI]], %bb.4 + ; CHECK-NEXT: [[CMPLWI2:%[0-9]+]]:crrc = 
CMPLWI killed [[PHI1]], 0 + ; CHECK-NEXT: BCC 68, killed [[CMPLWI2]], %bb.2 + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.for.cond.i.preheader: + ; CHECK-NEXT: [[LI2:%[0-9]+]]:gprc = LI 0 + ; CHECK-NEXT: $r3 = COPY [[LI2]] + ; CHECK-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.status_resync.exit: + ; CHECK-NEXT: [[ADDIC2:%[0-9]+]]:gprc = ADDIC [[COPY]], -1, implicit-def $carry + ; CHECK-NEXT: [[SUBFE1:%[0-9]+]]:gprc = SUBFE killed [[ADDIC2]], [[COPY]], implicit-def dead $carry, implicit $carry + ; CHECK-NEXT: [[LIS:%[0-9]+]]:gprc_and_gprc_nor0 = LIS target-flags(ppc-ha) @md_seq_show___trans_tmp_57 + ; CHECK-NEXT: STB killed [[SUBFE1]], target-flags(ppc-lo) @md_seq_show___trans_tmp_57, killed [[LIS]] :: (store (s8) into @md_seq_show___trans_tmp_57) + ; CHECK-NEXT: [[LI3:%[0-9]+]]:gprc = LI 0 + ; CHECK-NEXT: $r3 = COPY [[LI3]] + ; CHECK-NEXT: BLR implicit $lr, implicit $rm, implicit $r3 + + entry: + switch i64 %0, label %status_resync.exit [ + i64 -1, label %for.cond.i.preheader + i64 0, label %for.cond.i.preheader + ] + + for.cond.i.preheader: ; preds = %entry, %entry + ret i32 0 + + status_resync.exit: ; preds = %entry + %tobool = icmp ne i32 %1, 0 + %storedv = zext i1 %tobool to i8 + store i8 %storedv, ptr @md_seq_show___trans_tmp_57, align 1 + ret i32 0 +} + +attributes #0 = { "target-features"="-aix-shared-lib-tls-model-opt,-aix-small-local-dynamic-tls,-aix-small-local-exec-tls,-altivec,-bpermd,-crbits,-crypto,-direct-move,-extdiv,-htm,-isa-v206-instructions,-isa-v207-instructions,-isa-v30-instructions,-power8-vector,-power9-vector,-privileged,-quadword-atomics,-rop-protect,-spe,-vsx" } diff --git a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll index 34091ba46c3f6..29e7a16739864 100644 --- a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll +++ b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll @@ -12,11 +12,11 @@ define double @postinctodbl(ptr nocapture %llp) #0 { ; CHECK-NEXT: addic 4, 4, 1 ; CHECK-NEXT: lwz 5, 0(3) ; CHECK-NEXT: stw 5, 8(1) -; CHECK-NEXT: addze 5, 5 ; CHECK-NEXT: lfd 0, 8(1) -; CHECK-NEXT: stw 5, 0(3) -; CHECK-NEXT: fcfid 1, 0 ; CHECK-NEXT: stw 4, 4(3) +; CHECK-NEXT: addze 4, 5 +; CHECK-NEXT: fcfid 1, 0 +; CHECK-NEXT: stw 4, 0(3) ; CHECK-NEXT: addi 1, 1, 16 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/inc-of-add.ll b/llvm/test/CodeGen/PowerPC/inc-of-add.ll index 98b812e7845a5..432b5a6b362fe 100644 --- a/llvm/test/CodeGen/PowerPC/inc-of-add.ll +++ b/llvm/test/CodeGen/PowerPC/inc-of-add.ll @@ -412,8 +412,8 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; PPC32-NEXT: not 4, 4 ; PPC32-NEXT: not 3, 3 ; PPC32-NEXT: subc 4, 8, 4 -; PPC32-NEXT: not 6, 6 ; PPC32-NEXT: subfe 3, 3, 7 +; PPC32-NEXT: not 6, 6 ; PPC32-NEXT: not 5, 5 ; PPC32-NEXT: subc 6, 10, 6 ; PPC32-NEXT: subfe 5, 5, 9 diff --git a/llvm/test/CodeGen/PowerPC/pr35688.ll b/llvm/test/CodeGen/PowerPC/pr35688.ll index 8a4351b229fd1..5746934802eb2 100644 --- a/llvm/test/CodeGen/PowerPC/pr35688.ll +++ b/llvm/test/CodeGen/PowerPC/pr35688.ll @@ -8,10 +8,9 @@ define void @ec_GFp_nistp256_points_mul() { ; CHECK-LABEL: ec_GFp_nistp256_points_mul: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: ld 3, 0(3) +; CHECK-NEXT: subfic 4, 3, 0 ; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: subfic 5, 3, 0 ; CHECK-NEXT: subfze 5, 4 -; CHECK-NEXT: sradi 5, 5, 63 ; CHECK-NEXT: subc 3, 5, 3 ; CHECK-NEXT: subfe 3, 4, 5 ; CHECK-NEXT: sradi 3, 3, 63 diff --git a/llvm/test/CodeGen/PowerPC/pr36292.ll 
b/llvm/test/CodeGen/PowerPC/pr36292.ll index 1794b3ba526ed..98d94646bce65 100644 --- a/llvm/test/CodeGen/PowerPC/pr36292.ll +++ b/llvm/test/CodeGen/PowerPC/pr36292.ll @@ -12,11 +12,12 @@ define void @test() nounwind comdat { ; CHECK-NEXT: std 30, -16(1) # 8-byte Folded Spill ; CHECK-NEXT: stdu 1, -64(1) ; CHECK-NEXT: std 0, 80(1) +; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: ld 3, 0(3) ; CHECK-NEXT: ld 30, 32(1) -; CHECK-NEXT: sub 4, 3, 30 -; CHECK-NEXT: cmpld 4, 3 -; CHECK-NEXT: iselgt 3, 0, 4 +; CHECK-NEXT: subc 3, 3, 30 +; CHECK-NEXT: addze. 4, 4 +; CHECK-NEXT: iseleq 3, 0, 3 ; CHECK-NEXT: addi 29, 3, 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %forcond diff --git a/llvm/test/CodeGen/PowerPC/pr40922.ll b/llvm/test/CodeGen/PowerPC/pr40922.ll index 9252e9a3e3aa4..ed840ad12b7ed 100644 --- a/llvm/test/CodeGen/PowerPC/pr40922.ll +++ b/llvm/test/CodeGen/PowerPC/pr40922.ll @@ -23,11 +23,10 @@ define i32 @a() { ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: addic 6, 4, 6 -; CHECK-NEXT: addze 5, 5 -; CHECK-NEXT: rlwinm 6, 6, 0, 28, 26 -; CHECK-NEXT: andi. 5, 5, 1 -; CHECK-NEXT: cmplw 1, 6, 4 -; CHECK-NEXT: crorc 20, 1, 4 +; CHECK-NEXT: addze. 5, 5 +; CHECK-NEXT: rlwinm 5, 6, 0, 28, 26 +; CHECK-NEXT: cmplw 1, 5, 4 +; CHECK-NEXT: crnand 20, 4, 2 ; CHECK-NEXT: bc 12, 20, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: bl e diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll index 0f2dcb3ccc8a0..0edbae47e9378 100644 --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -22,12 +22,14 @@ define hidden void @julia_tryparse_internal_45896() #0 { ; CHECK-NEXT: li r5, -3 ; CHECK-NEXT: sradi r4, r3, 63 ; CHECK-NEXT: rldic r5, r5, 4, 32 +; CHECK-NEXT: mulld r6, r4, r5 ; CHECK-NEXT: mulhdu r3, r3, r5 -; CHECK-NEXT: maddld r6, r4, r5, r3 -; CHECK-NEXT: cmpld cr1, r6, r3 -; CHECK-NEXT: mulhdu. r3, r4, r5 -; CHECK-NEXT: crorc 4*cr5+lt, 4*cr1+lt, eq -; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_9 +; CHECK-NEXT: mulhdu r4, r4, r5 +; CHECK-NEXT: addc r3, r3, r6 +; CHECK-NEXT: li r3, 0 +; CHECK-NEXT: addze r3, r3 +; CHECK-NEXT: or. r3, r4, r3 +; CHECK-NEXT: beq cr0, .LBB0_9 ; CHECK-NEXT: # %bb.8: # %L917 ; CHECK-NEXT: .LBB0_9: # %L994 top: diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll index 8fff2c28da245..d9b22bda85e44 100644 --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -156,10 +156,11 @@ define i64 @unsigned_sat_constant_i64_using_min(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: addi 4, 3, 42 -; CHECK-NEXT: cmpld 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: isellt 3, 3, 4 +; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: addic 3, 3, 42 +; CHECK-NEXT: addze. 4, 4 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: iseleq 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, %a @@ -170,10 +171,11 @@ define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: addi 4, 3, 42 -; CHECK-NEXT: cmpld 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: isellt 3, 3, 4 +; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: addic 3, 3, 42 +; CHECK-NEXT: addze. 
4, 4 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: iseleq 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, -43 @@ -346,10 +348,11 @@ define i64 @unsigned_sat_variable_i64_using_min(i64 %x, i64 %y) { define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) { ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: add 4, 3, 4 -; CHECK-NEXT: cmpld 4, 3 -; CHECK-NEXT: li 3, -1 -; CHECK-NEXT: isellt 3, 3, 4 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: addze. 4, 4 +; CHECK-NEXT: li 4, -1 +; CHECK-NEXT: iseleq 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, %y %c = icmp ugt i64 %x, %a @@ -859,9 +862,11 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { define i64 @unsigned_sat_constant_i64_with_single_use(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_with_single_use: ; CHECK: # %bb.0: -; CHECK-NEXT: addi 4, 3, -4 -; CHECK-NEXT: cmpld 4, 3 -; CHECK-NEXT: iselgt 3, 0, 4 +; CHECK-NEXT: li 4, 4 +; CHECK-NEXT: subc 3, 3, 4 +; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: addze. 4, 4 +; CHECK-NEXT: iseleq 3, 0, 3 ; CHECK-NEXT: blr %umin = call i64 @llvm.umin.i64(i64 %x, i64 4) %sub = sub i64 %x, %umin diff --git a/llvm/test/CodeGen/PowerPC/select.ll b/llvm/test/CodeGen/PowerPC/select.ll index 289f83c475ff3..10661030da8d8 100644 --- a/llvm/test/CodeGen/PowerPC/select.ll +++ b/llvm/test/CodeGen/PowerPC/select.ll @@ -135,18 +135,22 @@ define i64 @f4_sge_0(i64 %x) { ; ; CHECK-32-LABEL: f4_sge_0: ; CHECK-32: # %bb.0: -; CHECK-32-NEXT: mr r5, r4 +; CHECK-32-NEXT: mr r6, r4 ; CHECK-32-NEXT: subfic r4, r4, 0 -; CHECK-32-NEXT: mr r6, r3 ; CHECK-32-NEXT: cmpwi r3, -1 -; CHECK-32-NEXT: subfze r3, r3 -; CHECK-32-NEXT: bgt cr0, .LBB5_2 +; CHECK-32-NEXT: subfze r5, r3 +; CHECK-32-NEXT: ble cr0, .LBB5_3 ; CHECK-32-NEXT: # %bb.1: -; CHECK-32-NEXT: mr r3, r6 +; CHECK-32-NEXT: ble cr0, .LBB5_4 ; CHECK-32-NEXT: .LBB5_2: -; CHECK-32-NEXT: bgtlr cr0 -; CHECK-32-NEXT: # %bb.3: -; CHECK-32-NEXT: mr r4, r5 +; CHECK-32-NEXT: mr r3, r5 +; CHECK-32-NEXT: blr +; CHECK-32-NEXT: .LBB5_3: +; CHECK-32-NEXT: mr r4, r6 +; CHECK-32-NEXT: bgt cr0, .LBB5_2 +; CHECK-32-NEXT: .LBB5_4: +; CHECK-32-NEXT: mr r5, r3 +; CHECK-32-NEXT: mr r3, r5 ; CHECK-32-NEXT: blr %c = icmp sge i64 %x, 0 %x.neg = sub i64 0, %x diff --git a/llvm/test/CodeGen/PowerPC/uaddo-32.ll b/llvm/test/CodeGen/PowerPC/uaddo-32.ll index b5989fc2ee2da..5dd5a2672b166 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-32.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-32.ll @@ -1,15 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s -; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM +; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM define noundef i32 @add(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; CHECK-LABEL: add: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 6, 0 -; CHECK-NEXT: addc 3, 3, 4 -; CHECK-NEXT: addze 4, 6 -; CHECK-NEXT: stw 4, 0(5) -; CHECK-NEXT: blr +; LINUXASM-LABEL: add: +; LINUXASM: # %bb.0: # %entry +; LINUXASM-NEXT: li 6, 0 +; LINUXASM-NEXT: addc 3, 3, 4 +; LINUXASM-NEXT: addze 4, 6 +; LINUXASM-NEXT: stw 4, 0(5) +; LINUXASM-NEXT: blr + +; AIXASM-LABEL: .add: +; AIXASM: # %bb.0: # %entry +; AIXASM-NEXT: addc 3, 3, 4 +; AIXASM-NEXT: li 4, 0 +; AIXASM-NEXT: addze 4, 4 +; AIXASM-NEXT: stw 4, 0(5) 
+; AIXASM-NEXT: blr + entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %0, 1 @@ -22,13 +31,22 @@ entry: declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) define noundef zeroext i1 @add_overflow(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; CHECK-LABEL: add_overflow: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 6, 0 -; CHECK-NEXT: addc 4, 3, 4 -; CHECK-NEXT: addze 3, 6 -; CHECK-NEXT: stw 4, 0(5) -; CHECK-NEXT: blr +; LINUXASM-LABEL: add_overflow: +; LINUXASM: # %bb.0: # %entry +; LINUXASM-NEXT: li 6, 0 +; LINUXASM-NEXT: addc 4, 3, 4 +; LINUXASM-NEXT: addze 3, 6 +; LINUXASM-NEXT: stw 4, 0(5) +; LINUXASM-NEXT: blr + +; AIXASM-LABEL: .add_overflow: +; AIXASM: # %bb.0: # %entry +; AIXASM-NEXT: addc 4, 3, 4 +; AIXASM-NEXT: li 3, 0 +; AIXASM-NEXT: addze 3, 3 +; AIXASM-NEXT: stw 4, 0(5) +; AIXASM-NEXT: blr + entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/uaddo-64.ll b/llvm/test/CodeGen/PowerPC/uaddo-64.ll index 3c7ab2c2bab79..98e834f29467c 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-64.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-64.ll @@ -1,15 +1,24 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s +; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM +; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM define noundef i64 @add(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; CHECK-LABEL: add: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 6, 0 -; CHECK-NEXT: addc 3, 3, 4 -; CHECK-NEXT: addze 4, 6 -; CHECK-NEXT: std 4, 0(5) -; CHECK-NEXT: blr +; LINUXASM-LABEL: add: +; LINUXASM: # %bb.0: # %entry +; LINUXASM-NEXT: li 6, 0 +; LINUXASM-NEXT: addc 3, 3, 4 +; LINUXASM-NEXT: addze 4, 6 +; LINUXASM-NEXT: std 4, 0(5) +; LINUXASM-NEXT: blr + +; AIXASM-LABEL: .add: +; AIXASM: # %bb.0: # %entry +; AIXASM-NEXT: addc 3, 3, 4 +; AIXASM-NEXT: li 4, 0 +; AIXASM-NEXT: addze 4, 4 +; AIXASM-NEXT: std 4, 0(5) +; AIXASM-NEXT: blr + entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -22,13 +31,22 @@ entry: declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) define noundef zeroext i1 @add_overflow(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; CHECK-LABEL: add_overflow: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: li 6, 0 -; CHECK-NEXT: addc 4, 3, 4 -; CHECK-NEXT: addze 3, 6 -; CHECK-NEXT: std 4, 0(5) -; CHECK-NEXT: blr +; LINUXASM-LABEL: add_overflow: +; LINUXASM: # %bb.0: # %entry +; LINUXASM-NEXT: li 6, 0 +; LINUXASM-NEXT: addc 4, 3, 4 +; LINUXASM-NEXT: addze 3, 6 +; LINUXASM-NEXT: std 4, 0(5) +; LINUXASM-NEXT: blr + +; AIXASM-LABEL: .add_overflow: +; AIXASM: # %bb.0: # %entry +; AIXASM-NEXT: addc 4, 3, 4 +; AIXASM-NEXT: li 3, 0 +; AIXASM-NEXT: addze 3, 3 +; AIXASM-NEXT: std 4, 0(5) +; AIXASM-NEXT: blr + entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -38,16 +56,28 @@ entry: } define noundef i64 @addWithCarryIn (i64 noundef %a, i64 noundef %b, i64 noundef %c, ptr nocapture noundef writeonly %ovf) { -; CHECK-LABEL: addWithCarryIn: -; CHECK: # %bb.0: # 
%entry -; CHECK-NEXT: li 7, 0 -; CHECK-NEXT: addc 3, 3, 4 -; CHECK-NEXT: addze 4, 7 -; CHECK-NEXT: addc 3, 3, 5 -; CHECK-NEXT: addze 5, 7 -; CHECK-NEXT: or 4, 4, 5 -; CHECK-NEXT: std 4, 0(6) -; CHECK-NEXT: blr +; LINUXASM-LABEL: addWithCarryIn: +; LINUXASM: # %bb.0: # %entry +; LINUXASM-NEXT: li 7, 0 +; LINUXASM-NEXT: addc 3, 3, 4 +; LINUXASM-NEXT: addze 4, 7 +; LINUXASM-NEXT: addc 3, 3, 5 +; LINUXASM-NEXT: addze 5, 7 +; LINUXASM-NEXT: or 4, 4, 5 +; LINUXASM-NEXT: std 4, 0(6) +; LINUXASM-NEXT: blr + +; AIXASM-LABEL: .addWithCarryIn: +; AIXASM: # %bb.0: # %entry +; AIXASM-NEXT: addc 3, 3, 4 +; AIXASM-NEXT: li 4, 0 +; AIXASM-NEXT: addze 7, 4 +; AIXASM-NEXT: addc 3, 3, 5 +; AIXASM-NEXT: addze 4, 4 +; AIXASM-NEXT: or 4, 7, 4 +; AIXASM-NEXT: std 4, 0(6) +; AIXASM-NEXT: blr + entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index 84895e74f18d5..f573fdab1b153 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -5,137 +5,134 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC64-LABEL: muloti_test: ; PPC64: # %bb.0: # %start -; PPC64-NEXT: addic 8, 5, -1 -; PPC64-NEXT: mulhdu 9, 5, 4 +; PPC64-NEXT: addic 9, 5, -1 ; PPC64-NEXT: mulld 10, 5, 4 +; PPC64-NEXT: mulld 11, 3, 6 +; PPC64-NEXT: subfe 9, 9, 5 +; PPC64-NEXT: add 10, 11, 10 +; PPC64-NEXT: addic 11, 3, -1 +; PPC64-NEXT: mulhdu 8, 3, 6 +; PPC64-NEXT: subfe 3, 11, 3 +; PPC64-NEXT: and 3, 3, 9 +; PPC64-NEXT: addic 9, 8, -1 +; PPC64-NEXT: subfe 8, 9, 8 +; PPC64-NEXT: or 3, 3, 8 +; PPC64-NEXT: mulhdu 5, 5, 4 +; PPC64-NEXT: addic 8, 5, -1 ; PPC64-NEXT: subfe 5, 8, 5 -; PPC64-NEXT: mulld 8, 3, 6 -; PPC64-NEXT: add 8, 8, 10 -; PPC64-NEXT: addic 10, 3, -1 -; PPC64-NEXT: mulhdu 7, 3, 6 -; PPC64-NEXT: subfe 3, 10, 3 -; PPC64-NEXT: and 5, 3, 5 -; PPC64-NEXT: addic 3, 7, -1 -; PPC64-NEXT: subfe 7, 3, 7 -; PPC64-NEXT: or 5, 5, 7 -; PPC64-NEXT: mulhdu 10, 4, 6 -; PPC64-NEXT: addic 7, 9, -1 -; PPC64-NEXT: add 3, 10, 8 -; PPC64-NEXT: subfe 7, 7, 9 -; PPC64-NEXT: or 5, 5, 7 -; PPC64-NEXT: subc 7, 3, 10 -; PPC64-NEXT: subfe 7, 3, 3 -; PPC64-NEXT: neg 7, 7 +; PPC64-NEXT: li 7, 0 +; PPC64-NEXT: or 5, 3, 5 +; PPC64-NEXT: mulhdu 8, 4, 6 +; PPC64-NEXT: addc 3, 8, 10 +; PPC64-NEXT: addze 7, 7 +; PPC64-NEXT: addic 8, 7, -1 +; PPC64-NEXT: subfe 7, 8, 7 ; PPC64-NEXT: or 5, 5, 7 ; PPC64-NEXT: mulld 4, 4, 6 ; PPC64-NEXT: blr ; ; PPC32-LABEL: muloti_test: ; PPC32: # %bb.0: # %start -; PPC32-NEXT: stwu 1, -80(1) -; PPC32-NEXT: mr 11, 7 -; PPC32-NEXT: stw 26, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu. 26, 11, 6 -; PPC32-NEXT: stw 24, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: stwu 1, -64(1) +; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill ; PPC32-NEXT: mfcr 12 -; PPC32-NEXT: stw 27, 60(1) # 4-byte Folded Spill -; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: stw 19, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu 27, 6, 10 -; PPC32-NEXT: stw 20, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 6, 11, 0 -; PPC32-NEXT: stw 21, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill +; PPC32-NEXT: mullw 27, 9, 4 +; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill +; PPC32-NEXT: mr 11, 7 +; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill ; PPC32-NEXT: li 7, 0 -; PPC32-NEXT: stw 22, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu. 
26, 5, 8 -; PPC32-NEXT: stw 23, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: stw 25, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 5, 0 -; PPC32-NEXT: stw 28, 64(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 24, 5, 10 -; PPC32-NEXT: stw 29, 68(1) # 4-byte Folded Spill -; PPC32-NEXT: crnor 20, 2, 26 -; PPC32-NEXT: stw 30, 72(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 3, 0 -; PPC32-NEXT: stw 12, 24(1) -; PPC32-NEXT: mulhwu 30, 5, 10 -; PPC32-NEXT: cmpwi 6, 9, 0 -; PPC32-NEXT: crnor 21, 26, 2 -; PPC32-NEXT: crorc 20, 20, 6 -; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: mulhwu 12, 5, 9 -; PPC32-NEXT: mullw 26, 5, 9 -; PPC32-NEXT: mullw 22, 5, 8 -; PPC32-NEXT: addc 5, 24, 27 -; PPC32-NEXT: addze 30, 30 +; PPC32-NEXT: mullw 26, 3, 10 +; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: add 27, 26, 27 +; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 7, 11, 0 +; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: mullw 24, 11, 6 +; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu 0, 8, 6 +; PPC32-NEXT: stw 12, 16(1) +; PPC32-NEXT: mr 12, 5 +; PPC32-NEXT: mulhwu 5, 4, 10 +; PPC32-NEXT: addc 5, 5, 27 +; PPC32-NEXT: addze 27, 7 +; PPC32-NEXT: cmpwi 2, 27, 0 +; PPC32-NEXT: mullw 25, 12, 8 +; PPC32-NEXT: add 26, 24, 25 +; PPC32-NEXT: addc 0, 0, 26 +; PPC32-NEXT: addze 26, 7 +; PPC32-NEXT: mullw 23, 8, 6 +; PPC32-NEXT: mullw 22, 4, 10 +; PPC32-NEXT: addc 24, 22, 23 +; PPC32-NEXT: adde 22, 5, 0 +; PPC32-NEXT: mulhwu 29, 6, 10 +; PPC32-NEXT: mullw 21, 12, 10 +; PPC32-NEXT: addc 5, 21, 29 +; PPC32-NEXT: mulhwu 30, 12, 10 +; PPC32-NEXT: addze 0, 30 ; PPC32-NEXT: mullw 23, 6, 9 ; PPC32-NEXT: addc 5, 23, 5 -; PPC32-NEXT: mullw 21, 11, 6 -; PPC32-NEXT: add 27, 21, 22 -; PPC32-NEXT: mulhwu 28, 8, 6 -; PPC32-NEXT: add 27, 28, 27 -; PPC32-NEXT: cmplw 7, 27, 28 -; PPC32-NEXT: mulhwu. 23, 3, 10 +; PPC32-NEXT: mulhwu 28, 6, 9 +; PPC32-NEXT: addze 29, 28 +; PPC32-NEXT: addc 0, 0, 29 +; PPC32-NEXT: addze 29, 7 +; PPC32-NEXT: mullw 30, 12, 9 +; PPC32-NEXT: addc 0, 30, 0 +; PPC32-NEXT: mulhwu 25, 12, 9 +; PPC32-NEXT: adde 30, 25, 29 +; PPC32-NEXT: addc 0, 0, 24 +; PPC32-NEXT: adde 30, 30, 22 +; PPC32-NEXT: addze. 29, 7 +; PPC32-NEXT: mcrf 1, 0 +; PPC32-NEXT: mulhwu. 29, 11, 6 ; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: cror 24, 20, 28 -; PPC32-NEXT: crorc 25, 21, 26 -; PPC32-NEXT: mulhwu 0, 6, 9 -; PPC32-NEXT: mullw 20, 9, 4 +; PPC32-NEXT: mulhwu. 29, 12, 8 +; PPC32-NEXT: mcrf 5, 0 +; PPC32-NEXT: cmpwi 12, 0 +; PPC32-NEXT: crnor 20, 2, 30 +; PPC32-NEXT: cmpwi 3, 0 +; PPC32-NEXT: cmpwi 7, 9, 0 +; PPC32-NEXT: crnor 24, 30, 2 +; PPC32-NEXT: mulhwu. 12, 3, 10 +; PPC32-NEXT: crorc 20, 20, 26 +; PPC32-NEXT: mcrf 7, 0 +; PPC32-NEXT: crorc 20, 20, 22 +; PPC32-NEXT: cmpwi 26, 0 +; PPC32-NEXT: crorc 28, 20, 2 ; PPC32-NEXT: mulhwu. 9, 9, 4 -; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: addze 9, 0 -; PPC32-NEXT: mullw 19, 3, 10 -; PPC32-NEXT: or. 3, 4, 3 ; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: addc 3, 30, 9 -; PPC32-NEXT: add 24, 19, 20 -; PPC32-NEXT: mulhwu 29, 4, 10 -; PPC32-NEXT: add 28, 29, 24 -; PPC32-NEXT: cmplw 2, 28, 29 -; PPC32-NEXT: crorc 20, 25, 6 -; PPC32-NEXT: cror 20, 20, 8 -; PPC32-NEXT: mullw 22, 4, 10 -; PPC32-NEXT: or. 4, 8, 11 -; PPC32-NEXT: addze 4, 7 -; PPC32-NEXT: crnor 21, 2, 22 +; PPC32-NEXT: crorc 20, 24, 30 +; PPC32-NEXT: or. 3, 4, 3 +; PPC32-NEXT: mcrf 6, 0 +; PPC32-NEXT: crorc 20, 20, 22 +; PPC32-NEXT: or. 
3, 8, 11 +; PPC32-NEXT: crorc 20, 20, 10 +; PPC32-NEXT: crnor 21, 2, 26 ; PPC32-NEXT: cror 20, 21, 20 -; PPC32-NEXT: mullw 25, 8, 6 -; PPC32-NEXT: addc 8, 26, 3 -; PPC32-NEXT: adde 9, 12, 4 -; PPC32-NEXT: addc 3, 22, 25 -; PPC32-NEXT: adde 11, 28, 27 -; PPC32-NEXT: addc 4, 8, 3 -; PPC32-NEXT: adde 3, 9, 11 -; PPC32-NEXT: cmplw 1, 3, 9 -; PPC32-NEXT: cmplw 4, 8 -; PPC32-NEXT: crandc 22, 4, 6 +; PPC32-NEXT: cror 20, 20, 28 +; PPC32-NEXT: crandc 20, 6, 20 ; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: bc 12, 22, .LBB0_3 +; PPC32-NEXT: bc 12, 20, .LBB0_2 ; PPC32-NEXT: # %bb.1: # %start -; PPC32-NEXT: crand 21, 6, 0 -; PPC32-NEXT: bc 12, 21, .LBB0_3 -; PPC32-NEXT: # %bb.2: # %start -; PPC32-NEXT: cror 20, 20, 24 -; PPC32-NEXT: bc 4, 20, .LBB0_4 -; PPC32-NEXT: .LBB0_3: # %start ; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: .LBB0_4: # %start -; PPC32-NEXT: lwz 12, 24(1) -; PPC32-NEXT: lwz 30, 72(1) # 4-byte Folded Reload +; PPC32-NEXT: .LBB0_2: # %start +; PPC32-NEXT: lwz 12, 16(1) +; PPC32-NEXT: mr 3, 30 +; PPC32-NEXT: mr 4, 0 +; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload ; PPC32-NEXT: mtcrf 32, 12 # cr2 -; PPC32-NEXT: lwz 29, 68(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 28, 64(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 27, 60(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 26, 56(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 25, 52(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 24, 48(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 23, 44(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 22, 40(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 21, 36(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 20, 32(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 19, 28(1) # 4-byte Folded Reload -; PPC32-NEXT: addi 1, 1, 80 +; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 28, 48(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 27, 44(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 26, 40(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 25, 36(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 24, 32(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload +; PPC32-NEXT: addi 1, 1, 64 ; PPC32-NEXT: blr start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll index e5c5356ce50a4..515dd0f70e948 100644 --- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll @@ -207,33 +207,32 @@ define i1 @test_urem_oversized(i66 %X) nounwind { ; PPC: # %bb.0: ; PPC-NEXT: lis 6, -12795 ; PPC-NEXT: ori 6, 6, 40665 -; PPC-NEXT: mulhwu 7, 5, 6 +; PPC-NEXT: mulhwu 8, 5, 6 ; PPC-NEXT: lis 9, 12057 ; PPC-NEXT: ori 9, 9, 37186 ; PPC-NEXT: mullw 11, 4, 6 -; PPC-NEXT: addc 7, 11, 7 +; PPC-NEXT: addc 8, 11, 8 ; PPC-NEXT: lis 11, -5526 ; PPC-NEXT: ori 11, 11, 61135 -; PPC-NEXT: mulhwu 8, 4, 6 -; PPC-NEXT: addze 8, 8 +; PPC-NEXT: mulhwu 7, 4, 6 +; PPC-NEXT: addze 7, 7 ; PPC-NEXT: mulhwu 10, 5, 9 ; PPC-NEXT: mullw 4, 4, 9 ; PPC-NEXT: mullw 9, 5, 9 -; PPC-NEXT: addc 7, 9, 7 -; PPC-NEXT: addze 9, 10 -; PPC-NEXT: rotlwi 10, 7, 31 +; PPC-NEXT: addc 8, 9, 8 +; PPC-NEXT: adde 7, 7, 10 +; PPC-NEXT: add 4, 4, 7 +; PPC-NEXT: rotlwi 9, 8, 31 ; PPC-NEXT: mullw 3, 3, 6 ; PPC-NEXT: mullw 6, 5, 6 ; PPC-NEXT: slwi 5, 5, 1 ; PPC-NEXT: add 3, 5, 3 ; PPC-NEXT: rotlwi 5, 6, 31 -; PPC-NEXT: rlwimi 5, 7, 31, 0, 0 -; PPC-NEXT: add 7, 8, 9 -; 
PPC-NEXT: add 4, 4, 7 ; PPC-NEXT: add 3, 4, 3 -; PPC-NEXT: rlwimi 10, 3, 31, 0, 0 +; PPC-NEXT: rlwimi 5, 8, 31, 0, 0 +; PPC-NEXT: rlwimi 9, 3, 31, 0, 0 ; PPC-NEXT: cmplw 5, 11 -; PPC-NEXT: cmplwi 1, 10, 13 +; PPC-NEXT: cmplwi 1, 9, 13 ; PPC-NEXT: rlwinm 3, 3, 31, 31, 31 ; PPC-NEXT: crandc 20, 4, 6 ; PPC-NEXT: crand 21, 6, 0 From 75142250527a97fcf0c721148705ae415a2f2d3a Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Thu, 3 Apr 2025 10:24:10 -0700 Subject: [PATCH 0562/1029] Use a more proper idiom for "the output file doesn't matter". NFC. (#134280) As in the description. Follow up to PR #134179. --- .../CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll index f04c7ec9884bd..3abc5bd33d8af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.lds.err.ll @@ -1,7 +1,7 @@ -; RUN: not --crash llc -o - -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s -; RUN: not --crash llc -o - -global-isel -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s -; RUN: not --crash llc -o - -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s -; RUN: not --crash llc -o - -global-isel -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -filetype=null -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -filetype=null -global-isel -mtriple=amdgcn -mcpu=gfx810 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -filetype=null -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s +; RUN: not --crash llc -filetype=null -global-isel -mtriple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck --ignore-case %s ; ; CHECK: LLVM ERROR: Cannot select From 262b9b515330daf7c446cc7983bf5f89185b3666 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Thu, 3 Apr 2025 19:25:25 +0200 Subject: [PATCH 0563/1029] [CIR][Upstream] Local initialization for ArrayType (#132974) This change adds local initialization for ArrayType Issue #130197 --- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 15 + .../include/clang/CIR/Dialect/IR/CIRDialect.h | 25 ++ clang/include/clang/CIR/Dialect/IR/CIROps.td | 41 +++ clang/include/clang/CIR/LoweringHelpers.h | 40 +++ clang/include/clang/CIR/MissingFeatures.h | 1 + clang/lib/CIR/CodeGen/Address.h | 10 + clang/lib/CIR/CodeGen/CIRGenDecl.cpp | 2 +- clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 277 ++++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp | 22 ++ clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 43 +++ clang/lib/CIR/CodeGen/CIRGenFunction.h | 4 + clang/lib/CIR/CodeGen/CIRGenModule.cpp | 12 + clang/lib/CIR/CodeGen/CIRGenModule.h | 4 + clang/lib/CIR/CodeGen/CIRGenTypeCache.h | 13 + clang/lib/CIR/CodeGen/CIRGenTypes.cpp | 27 ++ clang/lib/CIR/CodeGen/CIRGenTypes.h | 4 + clang/lib/CIR/CodeGen/CIRGenValue.h | 44 ++- clang/lib/CIR/CodeGen/CMakeLists.txt | 1 + clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 42 ++- clang/lib/CIR/Lowering/CMakeLists.txt | 1 + .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 94 ++++++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h | 15 + clang/lib/CIR/Lowering/LoweringHelpers.cpp | 146 +++++++++ clang/test/CIR/CodeGen/array.cpp | 112 ++++++- clang/test/CIR/Lowering/array.cpp | 90 +++++- 25 files changed, 1064 insertions(+), 21 
deletions(-)
 create mode 100644 clang/include/clang/CIR/LoweringHelpers.h
 create mode 100644 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
 create mode 100644 clang/lib/CIR/Lowering/LoweringHelpers.cpp

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index e666be0b25d75..51939e3af833d 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -67,6 +67,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return create<cir::ConstantOp>(loc, attr.getType(), attr);
   }

+  cir::ConstantOp getConstantInt(mlir::Location loc, mlir::Type ty,
+                                 int64_t value) {
+    return getConstant(loc, cir::IntAttr::get(ty, value));
+  }
+
+  // Creates constant null value for integral type ty.
+  cir::ConstantOp getNullValue(mlir::Type ty, mlir::Location loc) {
+    return getConstant(loc, getZeroInitAttr(ty));
+  }
+
   mlir::TypedAttr getConstNullPtrAttr(mlir::Type t) {
     assert(mlir::isa<cir::PointerType>(t) && "expected cir.ptr");
     return getConstPtrAttr(t, 0);
@@ -171,6 +181,11 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return createLoad(loc, addr);
   }

+  cir::PtrStrideOp createPtrStride(mlir::Location loc, mlir::Value base,
+                                   mlir::Value stride) {
+    return create<cir::PtrStrideOp>(loc, base.getType(), base, stride);
+  }
+
   //===--------------------------------------------------------------------===//
   // Cast/Conversion Operators
   //===--------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
index 4d7f537418a90..986f682a19f4f 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.h
@@ -32,6 +32,31 @@
 #include "clang/CIR/Interfaces/CIRLoopOpInterface.h"
 #include "clang/CIR/Interfaces/CIROpInterfaces.h"

+namespace mlir {
+namespace OpTrait {
+
+namespace impl {
+// These functions are out-of-line implementations of the methods in the
+// corresponding trait classes. This avoids them being template
+// instantiated/duplicated.
+LogicalResult verifySameFirstOperandAndResultType(Operation *op);
+} // namespace impl
+
+/// This class provides verification for ops that are known to have the same
+/// first operand and result type.
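+/// For example, cir.ptr_stride (declared below with this trait) must produce
+/// a result whose type matches that of its base pointer operand.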
+///
+template <typename ConcreteType>
+class SameFirstOperandAndResultType
+    : public TraitBase<ConcreteType, SameFirstOperandAndResultType> {
+public:
+  static llvm::LogicalResult verifyTrait(Operation *op) {
+    return impl::verifySameFirstOperandAndResultType(op);
+  }
+};
+
+} // namespace OpTrait
+} // namespace mlir
+
 using BuilderCallbackRef =
     llvm::function_ref<void(mlir::OpBuilder &, mlir::Location)>;
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index c17abfd752a1a..562493888e10c 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -79,6 +79,13 @@ class LLVMLoweringInfo {
 class CIR_Op<string mnemonic, list<Trait> traits = []> :
     Op<CIR_Dialect, mnemonic, traits>, LLVMLoweringInfo;

+//===----------------------------------------------------------------------===//
+// CIR Op Traits
+//===----------------------------------------------------------------------===//
+
+def SameFirstOperandAndResultType :
+    NativeOpTrait<"SameFirstOperandAndResultType">;
+
 //===----------------------------------------------------------------------===//
 // CastOp
 //===----------------------------------------------------------------------===//
@@ -229,6 +236,40 @@ def CastOp : CIR_Op<"cast",
   let hasFolder = 1;
 }

+
+//===----------------------------------------------------------------------===//
+// PtrStrideOp
+//===----------------------------------------------------------------------===//
+
+def PtrStrideOp : CIR_Op<"ptr_stride",
+                         [Pure, SameFirstOperandAndResultType]> {
+  let summary = "Pointer access with stride";
+  let description = [{
+    Given a base pointer as first operand, provides a new pointer after applying
+    a stride (second operand).
+
+    ```mlir
+    %3 = cir.const 0 : i32
+    %4 = cir.ptr_stride(%2 : !cir.ptr<i32>, %3 : i32), !cir.ptr<i32>
+    ```
+  }];
+
+  let arguments = (ins CIR_PointerType:$base, PrimitiveInt:$stride);
+  let results = (outs CIR_PointerType:$result);
+
+  let assemblyFormat = [{
+    `(` $base `:` qualified(type($base)) `,` $stride `:`
+    qualified(type($stride)) `)` `,` qualified(type($result)) attr-dict
+  }];
+
+  let extraClassDeclaration = [{
+    // Get type pointed by the base pointer.
+    mlir::Type getElementTy() {
+      return mlir::cast<cir::PointerType>(getBase().getType()).getPointee();
+    }
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // ConstantOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/LoweringHelpers.h b/clang/include/clang/CIR/LoweringHelpers.h
new file mode 100644
index 0000000000000..3077010ee5ffe
--- /dev/null
+++ b/clang/include/clang/CIR/LoweringHelpers.h
@@ -0,0 +1,40 @@
+//====- LoweringHelpers.h - Lowering helper functions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares helper functions for lowering from CIR to LLVM or MLIR.
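+// The declarations below chiefly support turning cir::ConstArrayAttr
+// initializers into mlir::DenseElementsAttr values during lowering.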
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_CIR_LOWERINGHELPERS_H
+#define LLVM_CLANG_CIR_LOWERINGHELPERS_H
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+
+mlir::DenseElementsAttr
+convertStringAttrToDenseElementsAttr(cir::ConstArrayAttr attr, mlir::Type type);
+
+template <typename StorageTy> StorageTy getZeroInitFromType(mlir::Type ty);
+template <> mlir::APInt getZeroInitFromType(mlir::Type ty);
+template <> mlir::APFloat getZeroInitFromType(mlir::Type ty);
+
+template <typename AttrTy, typename StorageTy>
+void convertToDenseElementsAttrImpl(cir::ConstArrayAttr attr,
+                                    llvm::SmallVectorImpl<StorageTy> &values);
+
+template <typename AttrTy, typename StorageTy>
+mlir::DenseElementsAttr
+convertToDenseElementsAttr(cir::ConstArrayAttr attr,
+                           const llvm::SmallVectorImpl<int64_t> &dims,
+                           mlir::Type type);
+
+std::optional<mlir::DenseElementsAttr>
+lowerConstArrayAttr(cir::ConstArrayAttr constArr,
+                    const mlir::TypeConverter *converter);
+
+#endif
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 23bf826d19a69..21a1d99c7c218 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -118,6 +118,7 @@ struct MissingFeatures {
   static bool vectorType() { return false; }
   static bool complexType() { return false; }
   static bool fixedPointType() { return false; }
+  static bool stringTypeWithDifferentArraySize() { return false; }

   // Future CIR operations
   static bool awaitOp() { return false; }
diff --git a/clang/lib/CIR/CodeGen/Address.h b/clang/lib/CIR/CodeGen/Address.h
index fba1ffd90877b..2cc8ada783197 100644
--- a/clang/lib/CIR/CodeGen/Address.h
+++ b/clang/lib/CIR/CodeGen/Address.h
@@ -70,6 +70,14 @@ class Address {
     return pointerAndKnownNonNull.getPointer();
   }

+  mlir::Type getType() const {
+    assert(mlir::cast<cir::PointerType>(
+               pointerAndKnownNonNull.getPointer().getType())
+               .getPointee() == elementType);
+
+    return mlir::cast<cir::PointerType>(getPointer().getType());
+  }
+
   mlir::Type getElementType() const {
     assert(isValid());
     assert(mlir::cast<cir::PointerType>(
                pointerAndKnownNonNull.getPointer().getType())
                .getPointee() == elementType);
     return elementType;
   }
+
+  clang::CharUnits getAlignment() const { return alignment; }
 };

 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index f2153c23ebb43..5b832b463e752 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -251,7 +251,7 @@ void CIRGenFunction::emitExprAsInit(const Expr *init, const ValueDecl *d,
     return;
   }
   case cir::TEK_Aggregate:
-    cgm.errorNYI(init->getSourceRange(), "emitExprAsInit: aggregate type");
+    emitAggExpr(init, AggValueSlot::forLValue(lvalue));
     return;
   }
   llvm_unreachable("bad evaluation kind");
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
new file mode 100644
index 0000000000000..36da63d5f7d76
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -0,0 +1,277 @@
+//===--- CIRGenExprAgg.cpp - Emit CIR Code from Aggregate Expressions -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This contains code to emit Aggregate Expr nodes as CIR code.
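+// The initial focus is array initialization from initializer lists; cases
+// that are not handled yet are reported through cgm.errorNYI.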
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenBuilder.h"
+#include "CIRGenFunction.h"
+#include "CIRGenValue.h"
+#include "clang/CIR/Dialect/IR/CIRAttrs.h"
+
+#include "clang/AST/Expr.h"
+#include "clang/AST/StmtVisitor.h"
+#include <cstdint>
+
+using namespace clang;
+using namespace clang::CIRGen;
+
+namespace {
+class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
+
+  CIRGenFunction &cgf;
+  AggValueSlot dest;
+
+  AggValueSlot ensureSlot(mlir::Location loc, QualType t) {
+    if (!dest.isIgnored())
+      return dest;
+
+    cgf.cgm.errorNYI(loc, "Slot for ignored address");
+    return dest;
+  }
+
+public:
+  AggExprEmitter(CIRGenFunction &cgf, AggValueSlot dest)
+      : cgf(cgf), dest(dest) {}
+
+  void emitArrayInit(Address destPtr, cir::ArrayType arrayTy, QualType arrayQTy,
+                     Expr *exprToVisit, ArrayRef<Expr *> args,
+                     Expr *arrayFiller);
+
+  void emitInitializationToLValue(Expr *e, LValue lv);
+
+  void emitNullInitializationToLValue(mlir::Location loc, LValue lv);
+
+  void Visit(Expr *e) { StmtVisitor<AggExprEmitter>::Visit(e); }
+
+  void VisitInitListExpr(InitListExpr *e);
+
+  void visitCXXParenListOrInitListExpr(Expr *e, ArrayRef<Expr *> args,
+                                       FieldDecl *initializedFieldInUnion,
+                                       Expr *arrayFiller);
+};
+
+} // namespace
+
+static bool isTrivialFiller(Expr *e) {
+  if (!e)
+    return true;
+
+  if (isa<ImplicitValueInitExpr>(e))
+    return true;
+
+  if (auto *ile = dyn_cast<InitListExpr>(e)) {
+    if (ile->getNumInits())
+      return false;
+    return isTrivialFiller(ile->getArrayFiller());
+  }
+
+  if (const auto *cons = dyn_cast_or_null<CXXConstructExpr>(e))
+    return cons->getConstructor()->isDefaultConstructor() &&
+           cons->getConstructor()->isTrivial();
+
+  return false;
+}
+
+void AggExprEmitter::emitArrayInit(Address destPtr, cir::ArrayType arrayTy,
+                                   QualType arrayQTy, Expr *e,
+                                   ArrayRef<Expr *> args, Expr *arrayFiller) {
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  const mlir::Location loc = cgf.getLoc(e->getSourceRange());
+
+  const uint64_t numInitElements = args.size();
+
+  const QualType elementType =
+      cgf.getContext().getAsArrayType(arrayQTy)->getElementType();
+
+  if (elementType.isDestructedType()) {
+    cgf.cgm.errorNYI(loc, "dtorKind NYI");
+    return;
+  }
+
+  const QualType elementPtrType = cgf.getContext().getPointerType(elementType);
+
+  const mlir::Type cirElementType = cgf.convertType(elementType);
+  const cir::PointerType cirElementPtrType =
+      builder.getPointerTo(cirElementType);
+
+  auto begin = builder.create<cir::CastOp>(loc, cirElementPtrType,
+                                           cir::CastKind::array_to_ptrdecay,
+                                           destPtr.getPointer());
+
+  const CharUnits elementSize =
+      cgf.getContext().getTypeSizeInChars(elementType);
+  const CharUnits elementAlign =
+      destPtr.getAlignment().alignmentOfArrayElement(elementSize);
+
+  // The 'current element to initialize'. The invariants on this
+  // variable are complicated. Essentially, after each iteration of
+  // the loop, it points to the last initialized element, except
+  // that it points to the beginning of the array before any
+  // elements have been initialized.
+  mlir::Value element = begin;
+
+  // Don't build the 'one' before the loop to avoid
+  // emitting redundant `cir.const 1` instructions.
+  mlir::Value one;
+
+  // Emit the explicit initializers.
+  for (uint64_t i = 0; i != numInitElements; ++i) {
+    // Advance to the next element.
+    if (i > 0) {
+      one = builder.getConstantInt(loc, cgf.PtrDiffTy, i);
+      element = builder.createPtrStride(loc, begin, one);
+    }
+
+    const Address address = Address(element, cirElementType, elementAlign);
+    const LValue elementLV = LValue::makeAddr(address, elementType);
+    emitInitializationToLValue(args[i], elementLV);
+  }
+
+  const uint64_t numArrayElements = arrayTy.getSize();
+
+  // Check whether there's a non-trivial array-fill expression.
+  const bool hasTrivialFiller = isTrivialFiller(arrayFiller);
+
+  // Any remaining elements need to be zero-initialized, possibly
+  // using the filler expression. We can skip this if we're
+  // emitting to zeroed memory.
+  if (numInitElements != numArrayElements &&
+      !(dest.isZeroed() && hasTrivialFiller &&
+        cgf.getTypes().isZeroInitializable(elementType))) {
+    // Advance to the start of the rest of the array.
+    if (numInitElements) {
+      one = builder.getConstantInt(loc, cgf.PtrDiffTy, 1);
+      element = builder.create<cir::PtrStrideOp>(loc, cirElementPtrType,
+                                                 element, one);
+    }
+
+    // Allocate the temporary variable
+    // to store the pointer to the first uninitialized element
+    const Address tmpAddr = cgf.createTempAlloca(
+        cirElementPtrType, cgf.getPointerAlign(), loc, "arrayinit.temp",
+        /*insertIntoFnEntryBlock=*/false);
+    LValue tmpLV = LValue::makeAddr(tmpAddr, elementPtrType);
+    cgf.emitStoreThroughLValue(RValue::get(element), tmpLV);
+
+    // TODO(CIR): Replace this part later with cir::DoWhileOp
+    for (unsigned i = numInitElements; i != numArrayElements; ++i) {
+      cir::LoadOp currentElement =
+          builder.createLoad(loc, tmpAddr.getPointer());
+
+      // Emit the actual filler expression.
+      const LValue elementLV = LValue::makeAddr(
+          Address(currentElement, cirElementType, elementAlign), elementType);
+
+      if (arrayFiller)
+        emitInitializationToLValue(arrayFiller, elementLV);
+      else
+        emitNullInitializationToLValue(loc, elementLV);
+
+      // Advance the pointer and store it in the temporary variable
+      one = builder.getConstantInt(loc, cgf.PtrDiffTy, 1);
+      cir::PtrStrideOp nextElement =
+          builder.createPtrStride(loc, currentElement, one);
+      cgf.emitStoreThroughLValue(RValue::get(nextElement), tmpLV);
+    }
+  }
+}
+
+void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) {
+  const QualType type = lv.getType();
+
+  if (isa<ImplicitValueInitExpr>(e)) {
+    const mlir::Location loc = e->getSourceRange().isValid()
+                                   ? cgf.getLoc(e->getSourceRange())
+                                   : *cgf.currSrcLoc;
+    return emitNullInitializationToLValue(loc, lv);
+  }
+
+  if (isa<NoInitExpr>(e))
+    return;
+
+  if (type->isReferenceType())
+    cgf.cgm.errorNYI("emitInitializationToLValue ReferenceType");
+
+  switch (cgf.getEvaluationKind(type)) {
+  case cir::TEK_Complex:
+    cgf.cgm.errorNYI("emitInitializationToLValue TEK_Complex");
+    break;
+  case cir::TEK_Aggregate:
+    cgf.emitAggExpr(e, AggValueSlot::forLValue(lv));
+    return;
+  case cir::TEK_Scalar:
+    if (lv.isSimple())
+      cgf.emitScalarInit(e, cgf.getLoc(e->getSourceRange()), lv);
+    else
+      cgf.emitStoreThroughLValue(RValue::get(cgf.emitScalarExpr(e)), lv);
+    return;
+  }
+}
+
+void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc,
+                                                    LValue lv) {
+  const QualType type = lv.getType();
+
+  // If the destination slot is already zeroed out before the aggregate is
+  // copied into it, we don't have to emit any zeros here.
+  if (dest.isZeroed() && cgf.getTypes().isZeroInitializable(type))
+    return;
+
+  if (cgf.hasScalarEvaluationKind(type)) {
+    // For non-aggregates, we can store the appropriate null constant.
+    mlir::Value null = cgf.cgm.emitNullConstant(type, loc);
+    if (lv.isSimple()) {
+      cgf.emitStoreOfScalar(null, lv, /* isInitialization */ true);
+      return;
+    }
+
+    cgf.cgm.errorNYI("emitStoreThroughBitfieldLValue");
+    return;
+  }
+
+  // There's a potential optimization opportunity in combining
+  // memsets; that would be easy for arrays, but relatively
+  // difficult for structures with the current code.
+  cgf.emitNullInitialization(loc, lv.getAddress(), lv.getType());
+}
+
+void AggExprEmitter::VisitInitListExpr(InitListExpr *e) {
+  if (e->hadArrayRangeDesignator())
+    llvm_unreachable("GNU array range designator extension");
+
+  if (e->isTransparent())
+    return Visit(e->getInit(0));
+
+  visitCXXParenListOrInitListExpr(
+      e, e->inits(), e->getInitializedFieldInUnion(), e->getArrayFiller());
+}
+
+void AggExprEmitter::visitCXXParenListOrInitListExpr(
+    Expr *e, ArrayRef<Expr *> args, FieldDecl *initializedFieldInUnion,
+    Expr *arrayFiller) {
+
+  const AggValueSlot dest =
+      ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType());
+
+  if (e->getType()->isConstantArrayType()) {
+    cir::ArrayType arrayTy =
+        cast<cir::ArrayType>(dest.getAddress().getElementType());
+    emitArrayInit(dest.getAddress(), arrayTy, e->getType(), e, args,
+                  arrayFiller);
+    return;
+  }
+
+  cgf.cgm.errorNYI(
+      "visitCXXParenListOrInitListExpr Record or VariableSizeArray type");
+}
+
+void CIRGenFunction::emitAggExpr(const Expr *e, AggValueSlot slot) {
+  AggExprEmitter(*this, slot).Visit(const_cast<Expr *>(e));
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
index fc49d6da97206..50fa029851f33 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp
@@ -412,3 +412,25 @@ mlir::Attribute ConstantEmitter::tryEmitPrivate(const APValue &value,
   }
   llvm_unreachable("Unknown APValue kind");
 }
+
+mlir::Value CIRGenModule::emitNullConstant(QualType t, mlir::Location loc) {
+  if (t->getAs<PointerType>()) {
+    return builder.getNullPtr(getTypes().convertTypeForMem(t), loc);
+  }
+
+  if (getTypes().isZeroInitializable(t))
+    return builder.getNullValue(getTypes().convertTypeForMem(t), loc);
+
+  if (getASTContext().getAsConstantArrayType(t)) {
+    errorNYI("CIRGenModule::emitNullConstant ConstantArrayType");
+  }
+
+  if (t->getAs<RecordType>())
+    errorNYI("CIRGenModule::emitNullConstant RecordType");
+
+  assert(t->isMemberDataPointerType() &&
+         "Should only see pointers to data members here!");
+
+  errorNYI("CIRGenModule::emitNullConstant unsupported type");
+  return {};
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 47fc90836fca6..2465ccffd19d6 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -453,4 +453,47 @@ LValue CIRGenFunction::emitLValue(const Expr *e) {
   }
 }
 
+void CIRGenFunction::emitNullInitialization(mlir::Location loc, Address destPtr,
+                                            QualType ty) {
+  // Ignore empty classes in C++.
+  if (getLangOpts().CPlusPlus) {
+    if (const RecordType *rt = ty->getAs<RecordType>()) {
+      if (cast<CXXRecordDecl>(rt->getDecl())->isEmpty())
+        return;
+    }
+  }
+
+  // Cast the dest ptr to the appropriate i8 pointer type.
+  if (builder.isInt8Ty(destPtr.getElementType())) {
+    cgm.errorNYI(loc, "Cast the dest ptr to the appropriate i8 pointer type");
+  }
+
+  // Get size and alignment info for this aggregate.
+  const CharUnits size = getContext().getTypeSizeInChars(ty);
+  if (size.isZero()) {
+    // But note that getTypeInfo returns 0 for a VLA.
+    if (isa<VariableArrayType>(getContext().getAsArrayType(ty))) {
+      cgm.errorNYI(loc,
+                   "emitNullInitialization for zero size VariableArrayType");
+    } else {
+      return;
+    }
+  }
+
+  // If the type contains a pointer to data member we can't memset it to zero.
+  // Instead, create a null constant and copy it to the destination.
+  // TODO: there are other patterns besides zero that we can usefully memset,
+  // like -1, which happens to be the pattern used by member-pointers.
+  if (!cgm.getTypes().isZeroInitializable(ty)) {
+    cgm.errorNYI(loc, "type is not zero initializable");
+  }
+
+  // In LLVM Codegen: otherwise, just memset the whole thing to zero using
+  // Builder.CreateMemSet. In CIR just emit a store of #cir.zero to the
+  // respective address.
+  // Builder.CreateMemSet(DestPtr, Builder.getInt8(0), SizeVal, false);
+  const mlir::Value zeroValue = builder.getNullValue(convertType(ty), loc);
+  builder.createStore(loc, zeroValue, destPtr.getPointer());
+}
+
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 5cae4d5da9516..4889c3ce4ca9c 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -110,6 +110,8 @@ class CIRGenFunction : public CIRGenTypeCache {
 public:
   mlir::Value createDummyValue(mlir::Location loc, clang::QualType qt);
 
+  void emitNullInitialization(mlir::Location loc, Address destPtr, QualType ty);
+
 private:
   // Track current variable initialization (if there's one)
   const clang::VarDecl *currVarDecl = nullptr;
@@ -377,6 +379,8 @@ class CIRGenFunction : public CIRGenTypeCache {
                                  mlir::OpBuilder::InsertPoint ip,
                                  mlir::Value arraySize = nullptr);
 
+  void emitAggExpr(const clang::Expr *e, AggValueSlot slot);
+
   /// Emit code to compute the specified expression which can have any type. The
   /// result is returned as an RValue struct. If this is an aggregate
   /// expression, the aggloc/agglocvolatile arguments indicate where the result
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 2a37d6c7d1888..d3b3b0632c2f0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -57,6 +57,18 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
   FP80Ty = cir::FP80Type::get(&getMLIRContext());
   FP128Ty = cir::FP128Type::get(&getMLIRContext());
 
+  PointerAlignInBytes =
+      astContext
+          .toCharUnitsFromBits(
+              astContext.getTargetInfo().getPointerAlign(LangAS::Default))
+          .getQuantity();
+
+  // TODO(CIR): Should be updated once TypeSizeInfoAttr is upstreamed
+  const unsigned sizeTypeSize =
+      astContext.getTypeSize(astContext.getSignedSizeType());
+  PtrDiffTy =
+      cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/true);
+
   theModule->setAttr(cir::CIRDialect::getTripleAttrName(),
                      builder.getStringAttr(getTriple().str()));
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 734cafa2e07bb..6ba1ccc4ddd9f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -113,6 +113,10 @@ class CIRGenModule : public CIRGenTypeCache {
   void emitGlobalVarDefinition(const clang::VarDecl *vd,
                                bool isTentative = false);
 
+  /// Return the result of value-initializing the given type, i.e. a null
+  /// expression of the given type.
+  mlir::Value emitNullConstant(QualType t, mlir::Location loc);
+
   cir::FuncOp getOrCreateCIRFunction(llvm::StringRef mangledName,
                                      mlir::Type funcType, clang::GlobalDecl gd,
                                      bool forVTable,
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
index 99c0123c64b28..a5b7f0c9579b4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H
 #define LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H
 
+#include "clang/AST/CharUnits.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 
 namespace clang::CIRGen {
@@ -47,6 +48,18 @@ struct CIRGenTypeCache {
   cir::DoubleType DoubleTy;
   cir::FP80Type FP80Ty;
   cir::FP128Type FP128Ty;
+
+  mlir::Type PtrDiffTy;
+
+  /// The size and alignment of a pointer into the generic address space.
+  union {
+    unsigned char PointerAlignInBytes;
+    unsigned char PointerSizeInBytes;
+  };
+
+  clang::CharUnits getPointerAlign() const {
+    return clang::CharUnits::fromQuantity(PointerAlignInBytes);
+  }
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index aaf3fe240f3c3..1e47ccc451b86 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -254,3 +254,30 @@ mlir::Type CIRGenTypes::convertTypeForMem(clang::QualType qualType,
 
   return convertedType;
 }
+
+bool CIRGenTypes::isZeroInitializable(clang::QualType t) {
+  if (t->getAs<PointerType>())
+    return astContext.getTargetNullPointerValue(t) == 0;
+
+  if (const auto *at = astContext.getAsArrayType(t)) {
+    if (isa<IncompleteArrayType>(at))
+      return true;
+
+    if (const auto *cat = dyn_cast<ConstantArrayType>(at))
+      if (astContext.getConstantArrayElementCount(cat) == 0)
+        return true;
+  }
+
+  if (t->getAs<RecordType>()) {
+    cgm.errorNYI(SourceLocation(), "isZeroInitializable for RecordType", t);
+    return false;
+  }
+
+  if (t->getAs<MemberPointerType>()) {
+    cgm.errorNYI(SourceLocation(), "isZeroInitializable for MemberPointerType",
+                 t);
+    return false;
+  }
+
+  return true;
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.h b/clang/lib/CIR/CodeGen/CIRGenTypes.h
index f280e17ebddc6..73948f5c63e6a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.h
@@ -71,6 +71,10 @@ class CIRGenTypes {
   /// representation is usually i8 or i32, depending on the target.
   // TODO: convert this comment to account for MLIR's equivalence
   mlir::Type convertTypeForMem(clang::QualType, bool forBitField = false);
+
+  /// Return whether a type can be zero-initialized (in the C++ sense) with an
+  /// LLVM zeroinitializer.
+  bool isZeroInitializable(clang::QualType t);
 };
 
 } // namespace clang::CIRGen
diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h
index c559e853aad39..d22d518ef4904 100644
--- a/clang/lib/CIR/CodeGen/CIRGenValue.h
+++ b/clang/lib/CIR/CodeGen/CIRGenValue.h
@@ -85,11 +85,15 @@ class LValue {
     MatrixElt // This is a matrix element, use getVector*
   } lvType;
   clang::QualType type;
+  clang::Qualifiers quals;
 
   mlir::Value v;
   mlir::Type elementType;
 
-  void initialize(clang::QualType type) { this->type = type; }
+  void initialize(clang::QualType type, clang::Qualifiers quals) {
+    this->type = type;
+    this->quals = quals;
+  }
 
 public:
   bool isSimple() const { return lvType == Simple; }
@@ -111,16 +115,52 @@ class LValue {
     return Address(getPointer(), elementType, getAlignment());
   }
 
+  const clang::Qualifiers &getQuals() const { return quals; }
+
   static LValue makeAddr(Address address, clang::QualType t) {
     LValue r;
     r.lvType = Simple;
     r.v = address.getPointer();
     r.elementType = address.getElementType();
-    r.initialize(t);
+    r.initialize(t, t.getQualifiers());
     return r;
   }
 };
 
+/// An aggregate value slot.
+class AggValueSlot {
+
+  Address addr;
+  clang::Qualifiers quals;
+
+  /// This is set to true if the memory in the slot is known to be zero before
+  /// the assignment into it. This means that zero fields don't need to be set.
+  bool zeroedFlag : 1;
+
+public:
+  enum IsZeroed_t { IsNotZeroed, IsZeroed };
+
+  AggValueSlot(Address addr, clang::Qualifiers quals, bool zeroedFlag)
+      : addr(addr), quals(quals), zeroedFlag(zeroedFlag) {}
+
+  static AggValueSlot forAddr(Address addr, clang::Qualifiers quals,
+                              IsZeroed_t isZeroed = IsNotZeroed) {
+    return AggValueSlot(addr, quals, isZeroed);
+  }
+
+  static AggValueSlot forLValue(const LValue &lv) {
+    return forAddr(lv.getAddress(), lv.getQuals());
+  }
+
+  clang::Qualifiers getQualifiers() const { return quals; }
+
+  Address getAddress() const { return addr; }
+
+  bool isIgnored() const { return !addr.isValid(); }
+
+  IsZeroed_t isZeroed() const { return IsZeroed_t(zeroedFlag); }
+};
+
 } // namespace clang::CIRGen
 
 #endif // CLANG_LIB_CIR_CIRGENVALUE_H
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index 8ee65c2763e70..da8d63ca569af 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -10,6 +10,7 @@ add_clang_library(clangCIR
   CIRGenerator.cpp
   CIRGenDecl.cpp
   CIRGenExpr.cpp
+  CIRGenExprAggregate.cpp
   CIRGenExprConstant.cpp
   CIRGenExprScalar.cpp
   CIRGenFunction.cpp
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 4ace083e3c081..143ed5544375f 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -246,8 +246,8 @@ OpFoldResult cir::ConstantOp::fold(FoldAdaptor /*adaptor*/) {
 //===----------------------------------------------------------------------===//
 
 LogicalResult cir::CastOp::verify() {
-  auto resType = getResult().getType();
-  auto srcType = getSrc().getType();
+  const mlir::Type resType = getResult().getType();
+  const mlir::Type srcType = getSrc().getType();
 
   switch (getKind()) {
   case cir::CastKind::int_to_bool: {
@@ -271,6 +271,15 @@ LogicalResult cir::CastOp::verify() {
       return emitOpError() << "requires !cir.int type for source";
     return success();
   }
+  case cir::CastKind::array_to_ptrdecay: {
+    const auto arrayPtrTy = mlir::dyn_cast<cir::PointerType>(srcType);
+    const auto flatPtrTy = mlir::dyn_cast<cir::PointerType>(resType);
+    if (!arrayPtrTy || !flatPtrTy)
+      return emitOpError() << "requires !cir.ptr type for source and result";
"requires !cir.ptr type for source and result"; + + // TODO(CIR): Make sure the AddrSpace of both types are equals + return success(); + } case cir::CastKind::bitcast: { // Handle the pointer types first. auto srcPtrTy = mlir::dyn_cast(srcType); @@ -453,9 +462,9 @@ mlir::LogicalResult cir::ReturnOp::verify() { /// Given the region at `index`, or the parent operation if `index` is None, /// return the successor regions. These are the regions that may be selected -/// during the flow of control. `operands` is a set of optional attributes that -/// correspond to a constant value for each operand, or null if that operand is -/// not a constant. +/// during the flow of control. `operands` is a set of optional attributes +/// that correspond to a constant value for each operand, or null if that +/// operand is not a constant. void cir::ScopeOp::getSuccessorRegions( mlir::RegionBranchPoint point, SmallVectorImpl ®ions) { // The only region always branch back to the parent operation. @@ -683,8 +692,8 @@ ParseResult cir::FuncOp::parse(OpAsmParser &parser, OperationState &state) { } bool cir::FuncOp::isDeclaration() { - // TODO(CIR): This function will actually do something once external function - // declarations and aliases are upstreamed. + // TODO(CIR): This function will actually do something once external + // function declarations and aliases are upstreamed. return false; } @@ -710,6 +719,25 @@ void cir::FuncOp::print(OpAsmPrinter &p) { } } +//===----------------------------------------------------------------------===// +// CIR defined traits +//===----------------------------------------------------------------------===// + +LogicalResult +mlir::OpTrait::impl::verifySameFirstOperandAndResultType(Operation *op) { + if (failed(verifyAtLeastNOperands(op, 1)) || failed(verifyOneResult(op))) + return failure(); + + const Type type = op->getResult(0).getType(); + const Type opType = op->getOperand(0).getType(); + + if (type != opType) + return op->emitOpError() + << "requires the same type for first operand and result"; + + return success(); +} + // TODO(CIR): The properties of functions that require verification haven't // been implemented yet. 
 mlir::LogicalResult cir::FuncOp::verify() { return success(); }
diff --git a/clang/lib/CIR/Lowering/CMakeLists.txt b/clang/lib/CIR/Lowering/CMakeLists.txt
index 09e48862df63c..28ec3c551018c 100644
--- a/clang/lib/CIR/Lowering/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/CMakeLists.txt
@@ -7,6 +7,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 
 add_clang_library(clangCIRLoweringCommon
   CIRPasses.cpp
+  LoweringHelpers.cpp
 
   LINK_LIBS
   clangCIR
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 48dc09d151dcf..1c455039269b9 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -13,6 +13,7 @@
 #include "LowerToLLVM.h"
 
 #include <deque>
+#include <optional>
 
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
@@ -28,6 +29,7 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "clang/CIR/Dialect/Passes.h"
+#include "clang/CIR/LoweringHelpers.h"
 #include "clang/CIR/MissingFeatures.h"
 #include "clang/CIR/Passes.h"
 #include "llvm/ADT/TypeSwitch.h"
@@ -523,6 +525,66 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMPtrStrideOpLowering::matchAndRewrite(
+    cir::PtrStrideOp ptrStrideOp, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+
+  const mlir::TypeConverter *tc = getTypeConverter();
+  const mlir::Type resultTy = tc->convertType(ptrStrideOp.getType());
+
+  mlir::Type elementTy =
+      convertTypeForMemory(*tc, dataLayout, ptrStrideOp.getElementTy());
+  mlir::MLIRContext *ctx = elementTy.getContext();
+
+  // void and function types don't really have a layout to use in GEPs,
+  // make it i8 instead.
+  if (mlir::isa<cir::VoidType>(elementTy) ||
+      mlir::isa<cir::FuncType>(elementTy))
+    elementTy = mlir::IntegerType::get(elementTy.getContext(), 8,
+                                       mlir::IntegerType::Signless);
+  // Zero-extend, sign-extend or truncate the stride value.
+  mlir::Value index = adaptor.getStride();
+  const unsigned width =
+      mlir::cast<mlir::IntegerType>(index.getType()).getWidth();
+  const std::optional<std::uint64_t> layoutWidth =
+      dataLayout.getTypeIndexBitwidth(adaptor.getBase().getType());
+
+  mlir::Operation *indexOp = index.getDefiningOp();
+  if (indexOp && layoutWidth && width != *layoutWidth) {
+    // If the index comes from a subtraction, make sure the extension happens
+    // before it. To achieve that, look at unary minus, which already got
+    // lowered to "sub 0, x".
+    const auto sub = dyn_cast<mlir::LLVM::SubOp>(indexOp);
+    auto unary = dyn_cast_if_present<cir::UnaryOp>(
+        ptrStrideOp.getStride().getDefiningOp());
+    bool rewriteSub =
+        unary && unary.getKind() == cir::UnaryOpKind::Minus && sub;
+    if (rewriteSub)
+      index = indexOp->getOperand(1);
+
+    // Handle the cast
+    const auto llvmDstType = mlir::IntegerType::get(ctx, *layoutWidth);
+    index = getLLVMIntCast(rewriter, index, llvmDstType,
+                           ptrStrideOp.getStride().getType().isUnsigned(),
+                           width, *layoutWidth);
+
+    // Rewrite the sub in front of extensions/trunc
+    if (rewriteSub) {
+      index = rewriter.create<mlir::LLVM::SubOp>(
+          index.getLoc(), index.getType(),
+          rewriter.create<mlir::LLVM::ConstantOp>(
+              index.getLoc(), index.getType(),
+              mlir::IntegerAttr::get(index.getType(), 0)),
+          index);
+      rewriter.eraseOp(sub);
+    }
+  }
+
+  rewriter.replaceOpWithNewOp<mlir::LLVM::GEPOp>(
+      ptrStrideOp, resultTy, elementTy, adaptor.getBase(), index);
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
     cir::AllocaOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -603,6 +665,15 @@ mlir::LogicalResult CIRToLLVMStoreOpLowering::matchAndRewrite(
   return mlir::LogicalResult::success();
 }
 
+bool hasTrailingZeros(cir::ConstArrayAttr attr) {
+  auto array = mlir::dyn_cast<mlir::ArrayAttr>(attr.getElts());
+  return attr.hasTrailingZeros() ||
+         (array && std::count_if(array.begin(), array.end(), [](auto elt) {
+            auto ar = dyn_cast<cir::ConstArrayAttr>(elt);
+            return ar && hasTrailingZeros(ar);
+          }));
+}
+
 mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
     cir::ConstantOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -641,6 +712,27 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
     }
     assert(!cir::MissingFeatures::opGlobalViewAttr());
     attr = op.getValue();
+  } else if (const auto arrTy = mlir::dyn_cast<cir::ArrayType>(op.getType())) {
+    const auto constArr = mlir::dyn_cast<cir::ConstArrayAttr>(op.getValue());
+    if (!constArr && !isa<cir::ZeroAttr>(op.getValue()))
+      return op.emitError() << "array does not have a constant initializer";
+
+    std::optional<mlir::Attribute> denseAttr;
+    if (constArr && hasTrailingZeros(constArr)) {
+      const mlir::Value newOp =
+          lowerCirAttrAsValue(op, constArr, rewriter, getTypeConverter());
+      rewriter.replaceOp(op, newOp);
+      return mlir::success();
+    } else if (constArr &&
+               (denseAttr = lowerConstArrayAttr(constArr, typeConverter))) {
+      attr = denseAttr.value();
+    } else {
+      const mlir::Value initVal =
+          lowerCirAttrAsValue(op, op.getValue(), rewriter, typeConverter);
+      rewriter.replaceAllUsesWith(op, initVal);
+      rewriter.eraseOp(op);
+      return mlir::success();
+    }
   } else {
     return op.emitError() << "unsupported constant type " << op.getType();
   }
@@ -1230,6 +1322,8 @@ void ConvertCIRToLLVMPass::runOnOperation() {
   patterns.add(converter, patterns.getContext(), dl);
   patterns.add(converter, patterns.getContext(), dl);
   patterns.add(converter, patterns.getContext(), dl);
+  patterns.add<CIRToLLVMPtrStrideOpLowering>(converter, patterns.getContext(),
+                                             dl);
   patterns.add<
       // clang-format off
               CIRToLLVMBinOpLowering,
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index b2926e75d1303..6f489fb49f44f 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -204,6 +204,21 @@ class CIRToLLVMTrapOpLowering : public mlir::OpConversionPattern<cir::TrapOp> {
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMPtrStrideOpLowering
+    : public mlir::OpConversionPattern<cir::PtrStrideOp> {
+  mlir::DataLayout const &dataLayout;
+
+public:
+  CIRToLLVMPtrStrideOpLowering(const mlir::TypeConverter &typeConverter,
+                               mlir::MLIRContext *context,
+                               mlir::DataLayout const &dataLayout)
+      : OpConversionPattern(typeConverter, context), dataLayout(dataLayout) {}
+  using mlir::OpConversionPattern<cir::PtrStrideOp>::OpConversionPattern;
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::PtrStrideOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
 } // namespace direct
 } // namespace cir
diff --git a/clang/lib/CIR/Lowering/LoweringHelpers.cpp b/clang/lib/CIR/Lowering/LoweringHelpers.cpp
new file mode 100644
index 0000000000000..0320bc40509b0
--- /dev/null
+++ b/clang/lib/CIR/Lowering/LoweringHelpers.cpp
@@ -0,0 +1,146 @@
+//====- LoweringHelpers.cpp - Lowering helper functions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains helper functions for lowering from CIR to LLVM or MLIR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/CIR/LoweringHelpers.h"
+#include "clang/CIR/MissingFeatures.h"
+
+mlir::DenseElementsAttr
+convertStringAttrToDenseElementsAttr(cir::ConstArrayAttr attr,
+                                     mlir::Type type) {
+  auto values = llvm::SmallVector<mlir::APInt, 8>{};
+  const auto stringAttr = mlir::cast<mlir::StringAttr>(attr.getElts());
+
+  for (const char element : stringAttr)
+    values.push_back({8, (uint64_t)element});
+
+  const auto arrayTy = mlir::cast<cir::ArrayType>(attr.getType());
+  if (arrayTy.getSize() != stringAttr.size())
+    assert(!cir::MissingFeatures::stringTypeWithDifferentArraySize());
+
+  return mlir::DenseElementsAttr::get(
+      mlir::RankedTensorType::get({(int64_t)values.size()}, type),
+      llvm::ArrayRef(values));
+}
+
+template <> mlir::APInt getZeroInitFromType<mlir::APInt>(mlir::Type ty) {
+  assert(mlir::isa<cir::IntType>(ty) && "expected int type");
+  const auto intTy = mlir::cast<cir::IntType>(ty);
+  return mlir::APInt::getZero(intTy.getWidth());
+}
+
+template <> mlir::APFloat getZeroInitFromType<mlir::APFloat>(mlir::Type ty) {
+  assert((mlir::isa<cir::SingleType, cir::DoubleType>(ty)) &&
+         "only float and double supported");
+
+  if (ty.isF32() || mlir::isa<cir::SingleType>(ty))
+    return mlir::APFloat(0.f);
+
+  if (ty.isF64() || mlir::isa<cir::DoubleType>(ty))
+    return mlir::APFloat(0.0);
+
+  llvm_unreachable("NYI");
+}
+
+/// \param attr the ConstArrayAttr to convert
+/// \param values the output parameter, the values array to fill
+/// \param currentDims the shape of the tensor we're going to convert to
+/// \param dimIndex the current dimension we're processing
+/// \param currentIndex the current index in the values array
+template <typename AttrTy, typename StorageTy>
+void convertToDenseElementsAttrImpl(
+    cir::ConstArrayAttr attr, llvm::SmallVectorImpl<StorageTy> &values,
+    const llvm::SmallVectorImpl<int64_t> &currentDims, int64_t dimIndex,
+    int64_t currentIndex) {
+  if (auto stringAttr = mlir::dyn_cast<mlir::StringAttr>(attr.getElts())) {
+    if (auto arrayType = mlir::dyn_cast<cir::ArrayType>(attr.getType())) {
+      for (auto element : stringAttr) {
+        auto intAttr = cir::IntAttr::get(arrayType.getEltType(), element);
+        values[currentIndex++] = mlir::dyn_cast<AttrTy>(intAttr).getValue();
+      }
+      return;
+    }
+  }
+
+  dimIndex++;
+  std::size_t elementsSizeInCurrentDim = 1;
+  for (std::size_t i = dimIndex; i < currentDims.size(); i++)
+    elementsSizeInCurrentDim *= currentDims[i];
+
+  auto arrayAttr = mlir::cast<mlir::ArrayAttr>(attr.getElts());
+  for (auto eltAttr : arrayAttr) {
+    if (auto valueAttr = mlir::dyn_cast<AttrTy>(eltAttr)) {
+      values[currentIndex++] = valueAttr.getValue();
+      continue;
+    }
+
+    if (auto subArrayAttr =
+            mlir::dyn_cast<cir::ConstArrayAttr>(eltAttr)) {
+      convertToDenseElementsAttrImpl<AttrTy>(subArrayAttr, values, currentDims,
+                                             dimIndex, currentIndex);
+      currentIndex += elementsSizeInCurrentDim;
+      continue;
+    }
+
+    if (mlir::isa<cir::ZeroAttr>(eltAttr)) {
+      currentIndex += elementsSizeInCurrentDim;
+      continue;
+    }
+
+    llvm_unreachable("unknown element in ConstArrayAttr");
+  }
+}
+
+template <typename AttrTy, typename StorageTy>
+mlir::DenseElementsAttr convertToDenseElementsAttr(
+    cir::ConstArrayAttr attr, const llvm::SmallVectorImpl<int64_t> &dims,
+    mlir::Type elementType, mlir::Type convertedElementType) {
+  unsigned vectorSize = 1;
+  for (auto dim : dims)
+    vectorSize *= dim;
+  auto values = llvm::SmallVector<StorageTy>(
+      vectorSize, getZeroInitFromType<StorageTy>(elementType));
+  convertToDenseElementsAttrImpl<AttrTy>(attr, values, dims, /*dimIndex=*/0,
+                                         /*currentIndex=*/0);
+  return mlir::DenseElementsAttr::get(
+      mlir::RankedTensorType::get(dims, convertedElementType),
+      llvm::ArrayRef(values));
+}
+
+std::optional<mlir::DenseElementsAttr>
+lowerConstArrayAttr(cir::ConstArrayAttr constArr,
+                    const mlir::TypeConverter *converter) {
+  // Ensure ConstArrayAttr has a type.
+  const auto typedConstArr = mlir::cast<mlir::TypedAttr>(constArr);
+
+  // Ensure ConstArrayAttr type is an ArrayType.
+  const auto cirArrayType = mlir::cast<cir::ArrayType>(typedConstArr.getType());
+
+  // Is a ConstArrayAttr with a cir::ArrayType: fetch element type.
+  mlir::Type type = cirArrayType;
+  auto dims = llvm::SmallVector<int64_t>{};
+  while (auto arrayType = mlir::dyn_cast<cir::ArrayType>(type)) {
+    dims.push_back(arrayType.getSize());
+    type = arrayType.getEltType();
+  }
+
+  if (mlir::isa<mlir::StringAttr>(constArr.getElts()))
+    return convertStringAttrToDenseElementsAttr(constArr,
+                                                converter->convertType(type));
+  if (mlir::isa<cir::IntType>(type))
+    return convertToDenseElementsAttr<cir::IntAttr, mlir::APInt>(
+        constArr, dims, type, converter->convertType(type));
+
+  if (mlir::isa<cir::SingleType, cir::DoubleType>(type))
+    return convertToDenseElementsAttr<cir::FPAttr, mlir::APFloat>(
+        constArr, dims, type, converter->convertType(type));
+
+  return std::nullopt;
+}
diff --git a/clang/test/CIR/CodeGen/array.cpp b/clang/test/CIR/CodeGen/array.cpp
index 1e74275eab058..0d28ebc66f83c 100644
--- a/clang/test/CIR/CodeGen/array.cpp
+++ b/clang/test/CIR/CodeGen/array.cpp
@@ -28,14 +28,114 @@ int f[5] = {1, 2};
 // CHECK: cir.global external @f = #cir.const_array<[#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i]> : !cir.array<!s32i x 5>
 
 void func() {
-  int l[10];
-  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["l"]
+  int arr[10];
+
+  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 10>, !cir.ptr<!cir.array<!s32i x 10>>, ["arr"]
+}
+
+void func2() {
+  int arr[2] = {5};
+
+  // CHECK: %[[ARR2:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
+  // CHECK: %[[ELE_ALLOCA:.*]] = cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["arrayinit.temp", init]
+  // CHECK: %[[ARR_2_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR2]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+  // CHECK: %[[V1:.*]] = cir.const #cir.int<5> : !s32i
+  // CHECK: cir.store %[[V1]], %[[ARR_2_PTR]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_2_PTR]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
+  // CHECK: cir.store %[[ELE_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+  // CHECK: %[[LOAD_1:.*]] = cir.load %[[ELE_ALLOCA]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
+  // CHECK: %[[V2:.*]] = cir.const #cir.int<0> : !s32i
+  // CHECK: cir.store %[[V2]], %[[LOAD_1]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[LOAD_1]] : !cir.ptr<!s32i>, %[[OFFSET_1]] : !s64i), !cir.ptr<!s32i>
+  // CHECK: cir.store %[[ELE_1_PTR]], %[[ELE_ALLOCA]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+}
+
+void func3() {
+  int arr[2] = {5, 6};
+
+  // CHECK: %[[ARR3:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
+  // CHECK: %[[ARR_3_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR3]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+  // CHECK: %[[V0:.*]] = cir.const #cir.int<5> : !s32i
+  // CHECK: cir.store %[[V0]], %[[ARR_3_PTR]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: %[[OFFSET_0:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %[[ELE_1_PTR:.*]] = cir.ptr_stride(%[[ARR_3_PTR]] : !cir.ptr<!s32i>, %[[OFFSET_0]] : !s64i), !cir.ptr<!s32i>
+  // CHECK: %[[V1:.*]] = cir.const #cir.int<6> : !s32i
+  // CHECK: cir.store %[[V1]], %[[ELE_1_PTR]] : !s32i, !cir.ptr<!s32i>
+}
+
+void func4() {
+  int arr[2][1] = {{5}, {6}};
+
+  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
+  // CHECK: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>), !cir.ptr<!cir.array<!s32i x 1>>
+  // CHECK: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
+  // CHECK: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
+  // CHECK: cir.store %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %[[ARR_1:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+  // CHECK: %[[ARR_1_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_1]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
+  // CHECK: %[[V_1_0:.*]] = cir.const #cir.int<6> : !s32i
+  // CHECK: cir.store %[[V_1_0]], %[[ARR_1_PTR]] : !s32i, !cir.ptr<!s32i>
+}
+
+void func5() {
+  int arr[2][1] = {{5}};
+
+  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!cir.array<!s32i x 1> x 2>, !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>, ["arr", init]
+  // CHECK: %[[ARR_PTR:.*]] = cir.alloca !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, ["arrayinit.temp", init]
+  // CHECK: %[[ARR_0:.*]] = cir.cast(array_to_ptrdecay, %0 : !cir.ptr<!cir.array<!cir.array<!s32i x 1> x 2>>), !cir.ptr<!cir.array<!s32i x 1>>
+  // CHECK: %[[ARR_0_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>), !cir.ptr<!s32i>
+  // CHECK: %[[V_0_0:.*]] = cir.const #cir.int<5> : !s32i
+  // CHECK: cir.store %[[V_0_0]], %[[ARR_0_PTR]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %6 = cir.ptr_stride(%[[ARR_0]] : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+  // CHECK: cir.store %6, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
+  // CHECK: %7 = cir.load %[[ARR_PTR]] : !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>, !cir.ptr<!cir.array<!s32i x 1>>
+  // CHECK: %8 = cir.const #cir.zero : !cir.array<!s32i x 1>
+  // CHECK: cir.store %8, %7 : !cir.array<!s32i x 1>, !cir.ptr<!cir.array<!s32i x 1>>
+  // CHECK: %[[OFFSET_1:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %10 = cir.ptr_stride(%7 : !cir.ptr<!cir.array<!s32i x 1>>, %[[OFFSET_1]] : !s64i), !cir.ptr<!cir.array<!s32i x 1>>
+  // CHECK: cir.store %10, %[[ARR_PTR]] : !cir.ptr<!cir.array<!s32i x 1>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 1>>>
+}
+
+void func6() {
+  int x = 4;
+  int arr[2] = { x, 5 };
+
+  // CHECK: %[[VAR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init]
+  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!s32i x 2>, !cir.ptr<!cir.array<!s32i x 2>>, ["arr", init]
+  // CHECK: %[[V:.*]] = cir.const #cir.int<4> : !s32i
+  // CHECK: cir.store %[[V]], %[[VAR]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!s32i x 2>>), !cir.ptr<!s32i>
+  // CHECK: %[[TMP:.*]] = cir.load %[[VAR]] : !cir.ptr<!s32i>, !s32i
+  // CHECK: cir.store %[[TMP]], %[[ARR_PTR]] : !s32i, !cir.ptr<!s32i>
+  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[ARR_PTR]] : !cir.ptr<!s32i>, %[[OFFSET]] : !s64i), !cir.ptr<!s32i>
+  // CHECK: %[[V1:.*]] = cir.const #cir.int<5> : !s32i
+  // CHECK: cir.store %[[V1]], %[[ELE_PTR]] : !s32i, !cir.ptr<!s32i>
+}
+
+void func7() {
+  int* arr[1] = {};
+
+  // CHECK: %[[ARR:.*]] = cir.alloca !cir.array<!cir.ptr<!s32i> x 1>, !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>, ["arr", init]
+  // CHECK: %[[ARR_TMP:.*]] = cir.alloca !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, ["arrayinit.temp", init]
+  // CHECK: %[[ARR_PTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARR]] : !cir.ptr<!cir.array<!cir.ptr<!s32i> x 1>>), !cir.ptr<!cir.ptr<!s32i>>
+  // CHECK: cir.store %[[ARR_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
+  // CHECK: %[[TMP:.*]] = cir.load %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
+  // CHECK: %[[NULL_PTR:.*]] = cir.const #cir.ptr<null> : !cir.ptr<!s32i>
+  // CHECK: cir.store %[[NULL_PTR]], %[[TMP]] : !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>
+  // CHECK: %[[OFFSET:.*]] = cir.const #cir.int<1> : !s64i
+  // CHECK: %[[ELE_PTR:.*]] = cir.ptr_stride(%[[TMP]] : !cir.ptr<!cir.ptr<!s32i>>, %[[OFFSET]] : !s64i), !cir.ptr<!cir.ptr<!s32i>>
+  // CHECK: cir.store %[[ELE_PTR]], %[[ARR_TMP]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
 }
 
-void func2(int p[10]) {}
-// CHECK: cir.func @func2(%arg0: !cir.ptr<!s32i>
+void func8(int p[10]) {}
+// CHECK: cir.func @func8(%arg0: !cir.ptr<!s32i>
 // CHECK: cir.alloca !cir.ptr<!s32i>, !cir.ptr<!cir.ptr<!s32i>>, ["p", init]
 
-void func3(int pp[10][5]) {}
-// CHECK: cir.func @func3(%arg0: !cir.ptr<!cir.array<!s32i x 5>>
+void func9(int pp[10][5]) {}
+// CHECK: cir.func @func9(%arg0: !cir.ptr<!cir.array<!s32i x 5>>
 // CHECK: cir.alloca !cir.ptr<!cir.array<!s32i x 5>>, !cir.ptr<!cir.ptr<!cir.array<!s32i x 5>>>
diff --git a/clang/test/CIR/Lowering/array.cpp b/clang/test/CIR/Lowering/array.cpp
index 4fb996aefe79e..e1c977eb43141 100644
--- a/clang/test/CIR/Lowering/array.cpp
+++ b/clang/test/CIR/Lowering/array.cpp
@@ -30,15 +30,95 @@ int f[5] = {1, 2};
 // CHECK: @f = dso_local global [5 x i32] [i32 1, i32 2, i32 0, i32 0, i32 0]
 
 void func() {
-  int l[10];
+  int arr[10];
 }
 // CHECK: define void @func()
 // CHECK-NEXT: alloca [10 x i32], i64 1, align 16
 
-void func2(int p[10]) {}
-// CHECK: define void @func2(ptr {{%.*}})
+void func2() {
+  int arr2[2] = {5};
+}
+// CHECK: define void @func2()
+// CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
+// CHECK: %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// CHECK: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// CHECK: store i32 5, ptr %[[ARR_PTR]], align 4
+// CHECK: %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// CHECK: store ptr %[[ELE_1_PTR]], ptr %[[TMP]], align 8
+// CHECK: %[[TMP2:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK: store i32 0, ptr %[[TMP2]], align 4
+// CHECK: %[[ELE_1:.*]] = getelementptr i32, ptr %[[TMP2]], i64 1
+// CHECK: store ptr %[[ELE_1]], ptr %[[TMP]], align 8
+
+void func3() {
+  int arr3[2] = {5, 6};
+}
+// CHECK: define void @func3()
+// CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x i32], i64 1, align 4
+// CHECK: %[[ARR_PTR:.*]] = getelementptr i32, ptr %[[ARR_ALLOCA]], i32 0
+// CHECK: store i32 5, ptr %[[ARR_PTR]], align 4
+// CHECK: %[[ELE_1_PTR:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i64 1
+// CHECK: store i32 6, ptr %[[ELE_1_PTR]], align 4
+
+void func4() {
+  int arr4[2][1] = {{5}, {6}};
+}
+// CHECK: define void @func4()
+// CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
+// CHECK: %[[ARR_0:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
+// CHECK: %[[ARR_0_ELE_0:.*]] = getelementptr i32, ptr %[[ARR_0]], i32 0
+// CHECK: store i32 5, ptr %[[ARR_0_ELE_0]], align 4
+// CHECK: %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %2, i64 1
+// CHECK: %[[ARR_0_ELE_0:.*]] = getelementptr i32, ptr %[[ARR_1]], i32 0
+// CHECK: store i32 6, ptr %[[ARR_0_ELE_0]], align 4
+
+void func5() {
+  int arr5[2][1] = {{5}};
+}
+// CHECK: define void @func5()
+// CHECK: %[[ARR_ALLOCA:.*]] = alloca [2 x [1 x i32]], i64 1, align 4
+// CHECK: %[[TMP:.*]] = alloca ptr, i64 1, align 8
+// CHECK: %[[ARR_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_ALLOCA]], i32 0
+// CHECK: %[[ARR_0:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0
+// CHECK: store i32 5, ptr %[[ARR_0]], align 4
+// CHECK: %[[ARR_1:.*]] = getelementptr [1 x i32], ptr %[[ARR_PTR]], i64 1
+// CHECK: store ptr %[[ARR_1]], ptr %[[TMP]], align 8
+// CHECK: %[[ARR_1_VAL:.*]] = load ptr, ptr %[[TMP]], align 8
+// CHECK: store [1 x i32] zeroinitializer, ptr %[[ARR_1_VAL]], align 4
+// CHECK: %[[ARR_1_PTR:.*]] = getelementptr [1 x i32], ptr %[[ARR_1_VAL]], i64 1
+// CHECK: store ptr %[[ARR_1_PTR]], ptr %[[TMP]], align 8
+
+void func6() {
+  int x = 4;
+  int arr[2] = { x, 5 };
+}
+// CHECK: define void @func6()
+// CHECK: %[[VAR:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[ARR:.*]] = alloca [2 x i32], i64 1, align 4
+// CHECK: store i32 4, ptr %[[VAR]], align 4
+// CHECK: %[[ELE_0:.*]] = getelementptr i32, ptr %[[ARR]], i32 0
+// CHECK: %[[TMP:.*]] = load i32, ptr %[[VAR]], align 4
+// CHECK: store i32 %[[TMP]], ptr %[[ELE_0]], align 4
+// CHECK: %[[ELE_1:.*]] = getelementptr i32, ptr %[[ELE_0]], i64 1
+// CHECK: store i32 5, ptr %[[ELE_1]], align 4
+
+void func7() {
+  int* arr[1] = {};
+}
+// CHECK: define void @func7()
+// CHECK: %[[ARR:.*]] = alloca [1 x ptr], i64 1, align 8
+// CHECK: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8
+// CHECK: %[[ELE_PTR:.*]] = getelementptr ptr, ptr %[[ARR]], i32 0
+// CHECK: store ptr %[[ELE_PTR]], ptr %[[ALLOCA]], align 8
+// CHECK: %[[TMP:.*]] = load ptr, ptr %[[ALLOCA]], align 8
+// CHECK: store ptr null, ptr %[[TMP]], align 8
+// CHECK: %[[ELE:.*]] = getelementptr ptr, ptr %[[TMP]], i64 1
+// CHECK: store ptr %[[ELE]], ptr %[[ALLOCA]], align 8
+
+void func8(int p[10]) {}
+// CHECK: define void @func8(ptr {{%.*}})
 // CHECK-NEXT: alloca ptr, i64 1, align 8
 
-void func3(int pp[10][5]) {}
-// CHECK: define void @func3(ptr {{%.*}})
+void func9(int pp[10][5]) {}
+// CHECK: define void @func9(ptr {{%.*}})
 // CHECK-NEXT: alloca ptr, i64 1, align 8

From cfee056b4e75cd941591d298e0f8dc303460c57e Mon Sep 17 00:00:00 2001
From: Matheus Izvekov <mizvekov@gmail.com>
Date: Thu, 3 Apr 2025 14:27:18 -0300
Subject: [PATCH 0564/1029] [clang] NFC: introduce UnsignedOrNone as a
 replacement for std::optional<unsigned> (#134142)

This introduces a new class 'UnsignedOrNone', which models a lite
version of `std::optional<unsigned>`, but has the same size as
'unsigned'.

This replaces most uses of `std::optional<unsigned>`, and similar
schemes utilizing 'int' and '-1' as sentinel.

Besides the smaller size advantage, this is simpler to serialize, as
its internal representation is a single unsigned int as well.
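To make the sentinel scheme concrete, below is a minimal sketch of how such a class can be laid out. It is inferred only from the usages visible in this patch (construction from `std::nullopt` or an `unsigned`, `explicit operator bool`, `operator*`, and the `toInternalRepresentation`/`fromInternalRepresentation` round-trip in `SubstNonTypeTemplateParmExpr`); the real definition lives in `clang/include/clang/Basic/UnsignedOrNone.h` and may differ in detail.

```cpp
// Sketch only: everything beyond the member names that appear in this patch
// is an assumption, not the actual Clang implementation.
#include <cassert>
#include <optional>

class UnsignedOrNone {
public:
  // The empty state is stored as 0; a present value v is stored as v + 1,
  // so the whole object is exactly the size of one unsigned int.
  constexpr UnsignedOrNone(std::nullopt_t) : Rep(0) {}
  constexpr UnsignedOrNone(unsigned Value) : Rep(Value + 1) {}

  // Serialization round-trip: the internal representation is a plain
  // unsigned, so reading and writing it is a single integer copy.
  static constexpr UnsignedOrNone fromInternalRepresentation(unsigned Rep) {
    UnsignedOrNone Result(std::nullopt);
    Result.Rep = Rep;
    return Result;
  }
  constexpr unsigned toInternalRepresentation() const { return Rep; }

  constexpr explicit operator bool() const { return Rep != 0; }
  constexpr unsigned operator*() const {
    assert(Rep != 0 && "dereferencing an empty UnsignedOrNone");
    return Rep - 1;
  }

  friend constexpr bool operator==(UnsignedOrNone LHS, UnsignedOrNone RHS) {
    return LHS.Rep == RHS.Rep;
  }

private:
  unsigned Rep; // 0 == empty, otherwise value + 1
};

static_assert(sizeof(UnsignedOrNone) == sizeof(unsigned));
```

The `+ 1` bias is the same encoding this patch removes by hand from `SubstNonTypeTemplateParmExpr`, which previously stored `PackIndex ? *PackIndex + 1 : 0` and decoded it with `PackIndex - 1`; with the class, that bookkeeping collapses into the constructor and `operator*`.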
--- clang/include/clang/AST/ASTConcept.h | 12 +- clang/include/clang/AST/ASTContext.h | 9 +- clang/include/clang/AST/ASTImporter.h | 2 +- .../clang/AST/ASTStructuralEquivalence.h | 3 +- clang/include/clang/AST/Decl.h | 10 +- clang/include/clang/AST/DeclTemplate.h | 36 ++-- clang/include/clang/AST/ExprCXX.h | 33 ++-- clang/include/clang/AST/ExprObjC.h | 2 +- clang/include/clang/AST/Mangle.h | 4 +- clang/include/clang/AST/PropertiesBase.td | 14 +- clang/include/clang/AST/TemplateBase.h | 13 +- clang/include/clang/AST/TemplateName.h | 19 +-- clang/include/clang/AST/Type.h | 23 ++- clang/include/clang/AST/TypeProperties.td | 4 +- clang/include/clang/Basic/UnsignedOrNone.h | 53 ++++++ clang/include/clang/Sema/Sema.h | 59 ++++--- clang/include/clang/Sema/SemaLambda.h | 2 +- clang/include/clang/Sema/SemaOpenACC.h | 2 +- clang/include/clang/Sema/Template.h | 5 +- clang/include/clang/Sema/TemplateDeduction.h | 2 +- clang/include/clang/Serialization/ASTReader.h | 2 +- .../clang/Serialization/ASTRecordReader.h | 4 + .../clang/Serialization/ASTRecordWriter.h | 4 + clang/lib/AST/ASTContext.cpp | 41 +++-- clang/lib/AST/ASTImporter.cpp | 9 +- clang/lib/AST/ASTStructuralEquivalence.cpp | 6 +- clang/lib/AST/ComputeDependence.cpp | 2 +- clang/lib/AST/Decl.cpp | 2 +- clang/lib/AST/DeclTemplate.cpp | 12 +- clang/lib/AST/Expr.cpp | 2 +- clang/lib/AST/ExprCXX.cpp | 13 +- clang/lib/AST/ItaniumMangle.cpp | 6 +- clang/lib/AST/JSONNodeDumper.cpp | 2 +- clang/lib/AST/TemplateBase.cpp | 9 +- clang/lib/AST/TemplateName.cpp | 4 +- clang/lib/AST/TextNodeDumper.cpp | 2 +- clang/lib/AST/Type.cpp | 18 +- clang/lib/Sema/Sema.cpp | 2 +- clang/lib/Sema/SemaConcept.cpp | 16 +- clang/lib/Sema/SemaDecl.cpp | 2 +- clang/lib/Sema/SemaDeclCXX.cpp | 10 +- clang/lib/Sema/SemaExprCXX.cpp | 8 +- clang/lib/Sema/SemaLambda.cpp | 25 ++- clang/lib/Sema/SemaOpenACC.cpp | 5 +- clang/lib/Sema/SemaOverload.cpp | 5 +- clang/lib/Sema/SemaTemplate.cpp | 12 +- clang/lib/Sema/SemaTemplateDeduction.cpp | 60 +++---- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 28 ++-- clang/lib/Sema/SemaTemplateInstantiate.cpp | 100 +++++------ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 59 ++++--- clang/lib/Sema/SemaTemplateVariadic.cpp | 26 +-- clang/lib/Sema/SemaType.cpp | 14 +- clang/lib/Sema/TreeTransform.h | 155 +++++++++--------- clang/lib/Serialization/ASTReader.cpp | 4 +- clang/lib/Serialization/ASTReaderDecl.cpp | 13 +- clang/lib/Serialization/ASTReaderStmt.cpp | 2 +- clang/lib/Serialization/ASTWriter.cpp | 2 +- clang/lib/Serialization/ASTWriterDecl.cpp | 9 +- clang/lib/Serialization/ASTWriterStmt.cpp | 2 +- clang/unittests/AST/ASTImporterTest.cpp | 12 +- 60 files changed, 509 insertions(+), 507 deletions(-) create mode 100644 clang/include/clang/Basic/UnsignedOrNone.h diff --git a/clang/include/clang/AST/ASTConcept.h b/clang/include/clang/AST/ASTConcept.h index f89899c3ea7b1..078e1e848f393 100644 --- a/clang/include/clang/AST/ASTConcept.h +++ b/clang/include/clang/AST/ASTConcept.h @@ -18,6 +18,7 @@ #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/TemplateBase.h" #include "clang/Basic/SourceLocation.h" +#include "clang/Basic/UnsignedOrNone.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/SmallVector.h" @@ -229,15 +230,14 @@ class TypeConstraint { /// type-constraint. 
Expr *ImmediatelyDeclaredConstraint = nullptr; ConceptReference *ConceptRef; - int ArgumentPackSubstitutionIndex; + UnsignedOrNone ArgPackSubstIndex; public: TypeConstraint(ConceptReference *ConceptRef, Expr *ImmediatelyDeclaredConstraint, - int ArgumentPackSubstitutionIndex) + UnsignedOrNone ArgPackSubstIndex) : ImmediatelyDeclaredConstraint(ImmediatelyDeclaredConstraint), - ConceptRef(ConceptRef), - ArgumentPackSubstitutionIndex(ArgumentPackSubstitutionIndex) {} + ConceptRef(ConceptRef), ArgPackSubstIndex(ArgPackSubstIndex) {} /// \brief Get the immediately-declared constraint expression introduced by /// this type-constraint, that is - the constraint expression that is added to @@ -248,9 +248,7 @@ class TypeConstraint { ConceptReference *getConceptReference() const { return ConceptRef; } - int getArgumentPackSubstitutionIndex() const { - return ArgumentPackSubstitutionIndex; - } + UnsignedOrNone getArgPackSubstIndex() const { return ArgPackSubstIndex; } // FIXME: Instead of using these concept related functions the callers should // directly work with the corresponding ConceptReference. diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 6bdafbdafda94..3ff9f308f3a5e 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1797,7 +1797,7 @@ class ASTContext : public RefCountedBase { QualType getSubstTemplateTypeParmType(QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, + UnsignedOrNone PackIndex, bool Final) const; QualType getSubstTemplateTypeParmPackType(Decl *AssociatedDecl, unsigned Index, bool Final, @@ -1853,8 +1853,7 @@ class ASTContext : public RefCountedBase { /// expansion is used in a context where the arity is inferred from /// elsewhere, such as if the pattern contains a placeholder type or /// if this is the canonical type of another pack expansion type. - QualType getPackExpansionType(QualType Pattern, - std::optional NumExpansions, + QualType getPackExpansionType(QualType Pattern, UnsignedOrNone NumExpansions, bool ExpectPackInType = true) const; QualType getObjCInterfaceType(const ObjCInterfaceDecl *Decl, @@ -1898,7 +1897,7 @@ class ASTContext : public RefCountedBase { QualType getPackIndexingType(QualType Pattern, Expr *IndexExpr, bool FullySubstituted = false, ArrayRef Expansions = {}, - int Index = -1) const; + UnsignedOrNone Index = std::nullopt) const; /// Unary type transforms QualType getUnaryTransformType(QualType BaseType, QualType UnderlyingType, @@ -2396,7 +2395,7 @@ class ASTContext : public RefCountedBase { TemplateName getSubstTemplateTemplateParm(TemplateName replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, + UnsignedOrNone PackIndex, bool Final) const; TemplateName getSubstTemplateTemplateParmPack(const TemplateArgument &ArgPack, Decl *AssociatedDecl, diff --git a/clang/include/clang/AST/ASTImporter.h b/clang/include/clang/AST/ASTImporter.h index a2550716e3c7f..c40b92666a2ff 100644 --- a/clang/include/clang/AST/ASTImporter.h +++ b/clang/include/clang/AST/ASTImporter.h @@ -592,7 +592,7 @@ class TypeSourceInfo; /// F should be a field (or indirect field) declaration. /// \returns The index of the field in its parent context (starting from 0). /// On error `std::nullopt` is returned (parent context is non-record). 
- static std::optional getFieldIndex(Decl *F); + static UnsignedOrNone getFieldIndex(Decl *F); }; } // namespace clang diff --git a/clang/include/clang/AST/ASTStructuralEquivalence.h b/clang/include/clang/AST/ASTStructuralEquivalence.h index 67aa0023c25d0..b0caded2f49a6 100644 --- a/clang/include/clang/AST/ASTStructuralEquivalence.h +++ b/clang/include/clang/AST/ASTStructuralEquivalence.h @@ -123,8 +123,7 @@ struct StructuralEquivalenceContext { /// /// FIXME: This is needed by ASTImporter and ASTStructureEquivalence. It /// probably makes more sense in some other common place then here. - static std::optional - findUntaggedStructOrUnionIndex(RecordDecl *Anon); + static UnsignedOrNone findUntaggedStructOrUnionIndex(RecordDecl *Anon); // If ErrorOnTagTypeMismatch is set, return the error, otherwise get the // relevant warning for the input error diagnostic. diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index e4f1e2921bef8..ff1d3497b77c3 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -33,6 +33,7 @@ #include "clang/Basic/PragmaKinds.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" +#include "clang/Basic/UnsignedOrNone.h" #include "clang/Basic/Visibility.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" @@ -82,14 +83,13 @@ enum class ImplicitParamKind; // expanded. struct AssociatedConstraint { const Expr *ConstraintExpr = nullptr; - int ArgumentPackSubstitutionIndex = -1; + UnsignedOrNone ArgPackSubstIndex = std::nullopt; constexpr AssociatedConstraint() = default; explicit AssociatedConstraint(const Expr *ConstraintExpr, - int ArgumentPackSubstitutionIndex = -1) - : ConstraintExpr(ConstraintExpr), - ArgumentPackSubstitutionIndex(ArgumentPackSubstitutionIndex) {} + UnsignedOrNone ArgPackSubstIndex = std::nullopt) + : ConstraintExpr(ConstraintExpr), ArgPackSubstIndex(ArgPackSubstIndex) {} explicit operator bool() const { return ConstraintExpr != nullptr; } @@ -2540,7 +2540,7 @@ class FunctionDecl : public DeclaratorDecl, /// If this function is an allocation/deallocation function that takes /// the `std::nothrow_t` tag, return true through IsNothrow, bool isReplaceableGlobalAllocationFunction( - std::optional *AlignmentParam = nullptr, + UnsignedOrNone *AlignmentParam = nullptr, bool *IsNothrow = nullptr) const; /// Determine if this function provides an inline implementation of a builtin. diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 37fe0acf5d4d5..a8100b642e04c 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -1198,13 +1198,8 @@ class TemplateTypeParmDecl final : public TypeDecl, /// type constraint. bool TypeConstraintInitialized : 1; - /// Whether this type template parameter is an "expanded" - /// parameter pack, meaning that its type is a pack expansion and we - /// already know the set of types that expansion expands to. - bool ExpandedParameterPack : 1; - - /// The number of type parameters in an expanded parameter pack. - unsigned NumExpanded = 0; + /// The number of type parameters in an expanded parameter pack, if any. + UnsignedOrNone NumExpanded = std::nullopt; /// The default template argument, if any. 
using DefArgStorage = @@ -1213,19 +1208,17 @@ class TemplateTypeParmDecl final : public TypeDecl, TemplateTypeParmDecl(DeclContext *DC, SourceLocation KeyLoc, SourceLocation IdLoc, IdentifierInfo *Id, bool Typename, - bool HasTypeConstraint, - std::optional NumExpanded) + bool HasTypeConstraint, UnsignedOrNone NumExpanded) : TypeDecl(TemplateTypeParm, DC, IdLoc, Id, KeyLoc), Typename(Typename), HasTypeConstraint(HasTypeConstraint), TypeConstraintInitialized(false), - ExpandedParameterPack(NumExpanded), - NumExpanded(NumExpanded.value_or(0)) {} + NumExpanded(NumExpanded) {} public: static TemplateTypeParmDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation KeyLoc, SourceLocation NameLoc, unsigned D, unsigned P, IdentifierInfo *Id, bool Typename, bool ParameterPack, bool HasTypeConstraint = false, - std::optional NumExpanded = std::nullopt); + UnsignedOrNone NumExpanded = std::nullopt); static TemplateTypeParmDecl *CreateDeserialized(const ASTContext &C, GlobalDeclID ID); static TemplateTypeParmDecl *CreateDeserialized(const ASTContext &C, @@ -1327,13 +1320,8 @@ class TemplateTypeParmDecl final : public TypeDecl, /// expanded parameter pack. For example, instantiating /// \c X results in \c Convertibles being an expanded /// parameter pack of size 2 (use getNumExpansionTypes() to get this number). - bool isExpandedParameterPack() const { return ExpandedParameterPack; } - - /// Retrieves the number of parameters in an expanded parameter pack. - unsigned getNumExpansionParameters() const { - assert(ExpandedParameterPack && "Not an expansion parameter pack"); - return NumExpanded; - } + /// Retrieves the number of parameters in an expanded parameter pack, if any. + UnsignedOrNone getNumExpansionParameters() const { return NumExpanded; } /// Returns the type constraint associated with this template parameter (if /// any). @@ -1344,7 +1332,7 @@ class TemplateTypeParmDecl final : public TypeDecl, void setTypeConstraint(ConceptReference *CR, Expr *ImmediatelyDeclaredConstraint, - int ArgumentPackSubstitutionIndex); + UnsignedOrNone ArgPackSubstIndex); /// Determine whether this template parameter has a type-constraint. bool hasTypeConstraint() const { @@ -1360,7 +1348,7 @@ class TemplateTypeParmDecl final : public TypeDecl, llvm::SmallVectorImpl &AC) const { if (HasTypeConstraint) AC.emplace_back(getTypeConstraint()->getImmediatelyDeclaredConstraint(), - getTypeConstraint()->getArgumentPackSubstitutionIndex()); + getTypeConstraint()->getArgPackSubstIndex()); } SourceRange getSourceRange() const override LLVM_READONLY; @@ -3379,10 +3367,10 @@ inline TemplateDecl *getAsTypeTemplateDecl(Decl *D) { /// /// In \c A::B, \c NTs and \c TTs have expanded pack size 2, and \c Us /// is not a pack expansion, so returns an empty Optional. 
-inline std::optional getExpandedPackSize(const NamedDecl *Param) { +inline UnsignedOrNone getExpandedPackSize(const NamedDecl *Param) { if (const auto *TTP = dyn_cast(Param)) { - if (TTP->isExpandedParameterPack()) - return TTP->getNumExpansionParameters(); + if (UnsignedOrNone Num = TTP->getNumExpansionParameters()) + return Num; } if (const auto *NTTP = dyn_cast(Param)) { diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 0f1455d8be3ca..ac78d2faefe42 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4210,7 +4210,7 @@ class PackExpansionExpr : public Expr { public: PackExpansionExpr(QualType T, Expr *Pattern, SourceLocation EllipsisLoc, - std::optional NumExpansions) + UnsignedOrNone NumExpansions) : Expr(PackExpansionExprClass, T, Pattern->getValueKind(), Pattern->getObjectKind()), EllipsisLoc(EllipsisLoc), @@ -4233,7 +4233,7 @@ class PackExpansionExpr : public Expr { /// Determine the number of expansions that will be produced when /// this pack expansion is instantiated, if already known. - std::optional getNumExpansions() const { + UnsignedOrNone getNumExpansions() const { if (NumExpansions) return NumExpansions - 1; @@ -4304,8 +4304,7 @@ class SizeOfPackExpr final /// the given parameter pack. SizeOfPackExpr(QualType SizeType, SourceLocation OperatorLoc, NamedDecl *Pack, SourceLocation PackLoc, SourceLocation RParenLoc, - std::optional Length, - ArrayRef PartialArgs) + UnsignedOrNone Length, ArrayRef PartialArgs) : Expr(SizeOfPackExprClass, SizeType, VK_PRValue, OK_Ordinary), OperatorLoc(OperatorLoc), PackLoc(PackLoc), RParenLoc(RParenLoc), Length(Length ? *Length : PartialArgs.size()), Pack(Pack) { @@ -4325,7 +4324,7 @@ class SizeOfPackExpr final static SizeOfPackExpr *Create(ASTContext &Context, SourceLocation OperatorLoc, NamedDecl *Pack, SourceLocation PackLoc, SourceLocation RParenLoc, - std::optional Length = std::nullopt, + UnsignedOrNone Length = std::nullopt, ArrayRef PartialArgs = {}); static SizeOfPackExpr *CreateDeserialized(ASTContext &Context, unsigned NumPartialArgs); @@ -4467,7 +4466,7 @@ class PackIndexingExpr final Expr *getIndexExpr() const { return cast(SubExprs[1]); } - std::optional getSelectedIndex() const { + UnsignedOrNone getSelectedIndex() const { if (isInstantiationDependent()) return std::nullopt; ConstantExpr *CE = cast(getIndexExpr()); @@ -4477,7 +4476,7 @@ class PackIndexingExpr final } Expr *getSelectedExpr() const { - std::optional Index = getSelectedIndex(); + UnsignedOrNone Index = getSelectedIndex(); assert(Index && "extracting the indexed expression of a dependant pack"); return getTrailingObjects()[*Index]; } @@ -4525,12 +4524,12 @@ class SubstNonTypeTemplateParmExpr : public Expr { SubstNonTypeTemplateParmExpr(QualType Ty, ExprValueKind ValueKind, SourceLocation Loc, Expr *Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, bool RefParam, + UnsignedOrNone PackIndex, bool RefParam, bool Final) : Expr(SubstNonTypeTemplateParmExprClass, Ty, ValueKind, OK_Ordinary), Replacement(Replacement), AssociatedDeclAndRef(AssociatedDecl, RefParam), Index(Index), - PackIndex(PackIndex ? *PackIndex + 1 : 0), Final(Final) { + PackIndex(PackIndex.toInternalRepresentation()), Final(Final) { assert(AssociatedDecl != nullptr); SubstNonTypeTemplateParmExprBits.NameLoc = Loc; setDependence(computeDependence(this)); @@ -4552,10 +4551,8 @@ class SubstNonTypeTemplateParmExpr : public Expr { /// This should match the result of `getParameter()->getIndex()`. 
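Both SubstNonTypeTemplateParmExpr above and the accessor that continues below store the optional in a single unsigned, biased by one so that zero means "absent". A small self-check, assuming only the UnsignedOrNone definition added later in this patch, showing that the old hand-rolled encoding (PackIndex ? *PackIndex + 1 : 0) and the new toInternalRepresentation()/fromInternalRepresentation() round trip agree bit for bit:

    #include "clang/Basic/UnsignedOrNone.h"
    #include <cassert>
    #include <optional>
    using clang::UnsignedOrNone;

    // Pre-patch manual encoding of an optional pack index into an unsigned.
    static unsigned encodeOld(UnsignedOrNone PackIndex) {
      return PackIndex ? *PackIndex + 1 : 0;
    }

    void checkEncoding() {
      for (UnsignedOrNone V : {UnsignedOrNone(std::nullopt), UnsignedOrNone(0u),
                               UnsignedOrNone(41u)}) {
        assert(encodeOld(V) == V.toInternalRepresentation());
        assert(UnsignedOrNone::fromInternalRepresentation(
                   V.toInternalRepresentation()) == V);
      }
    }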
unsigned getIndex() const { return Index; } - std::optional getPackIndex() const { - if (PackIndex == 0) - return std::nullopt; - return PackIndex - 1; + UnsignedOrNone getPackIndex() const { + return UnsignedOrNone::fromInternalRepresentation(PackIndex); } // This substitution is Final, which means the substitution is fully @@ -4882,7 +4879,7 @@ class CXXFoldExpr : public Expr { SourceLocation RParenLoc; // When 0, the number of expansions is not known. Otherwise, this is one more // than the number of expansions. - unsigned NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; Stmt *SubExprs[SubExpr::Count]; BinaryOperatorKind Opcode; @@ -4890,7 +4887,7 @@ class CXXFoldExpr : public Expr { CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee, SourceLocation LParenLoc, Expr *LHS, BinaryOperatorKind Opcode, SourceLocation EllipsisLoc, Expr *RHS, SourceLocation RParenLoc, - std::optional NumExpansions); + UnsignedOrNone NumExpansions); CXXFoldExpr(EmptyShell Empty) : Expr(CXXFoldExprClass, Empty) {} @@ -4919,11 +4916,7 @@ class CXXFoldExpr : public Expr { SourceLocation getEllipsisLoc() const { return EllipsisLoc; } BinaryOperatorKind getOperator() const { return Opcode; } - std::optional getNumExpansions() const { - if (NumExpansions) - return NumExpansions - 1; - return std::nullopt; - } + UnsignedOrNone getNumExpansions() const { return NumExpansions; } SourceLocation getBeginLoc() const LLVM_READONLY { if (LParenLoc.isValid()) diff --git a/clang/include/clang/AST/ExprObjC.h b/clang/include/clang/AST/ExprObjC.h index 1fccc26069582..f87fa85569c44 100644 --- a/clang/include/clang/AST/ExprObjC.h +++ b/clang/include/clang/AST/ExprObjC.h @@ -271,7 +271,7 @@ struct ObjCDictionaryElement { /// The number of elements this pack expansion will expand to, if /// this is a pack expansion and is known. - std::optional NumExpansions; + UnsignedOrNone NumExpansions; /// Determines whether this dictionary element is a pack expansion. 
bool isPackExpansion() const { return EllipsisLoc.isValid(); } diff --git a/clang/include/clang/AST/Mangle.h b/clang/include/clang/AST/Mangle.h index 9ed8895cbfff1..a0162fb7125fe 100644 --- a/clang/include/clang/AST/Mangle.h +++ b/clang/include/clang/AST/Mangle.h @@ -182,8 +182,8 @@ class MangleContext { class ItaniumMangleContext : public MangleContext { public: - using DiscriminatorOverrideTy = - std::optional (*)(ASTContext &, const NamedDecl *); + using DiscriminatorOverrideTy = UnsignedOrNone (*)(ASTContext &, + const NamedDecl *); explicit ItaniumMangleContext(ASTContext &C, DiagnosticsEngine &D, bool IsAux = false) : MangleContext(C, D, MK_Itanium, IsAux) {} diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 0e360de16fdd6..90537d47dd9c9 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -144,6 +144,7 @@ def TemplateNameKind : EnumPropertyType<"TemplateName::NameKind">; def TypeOfKind : EnumPropertyType<"TypeOfKind">; def UInt32 : CountPropertyType<"uint32_t">; def UInt64 : CountPropertyType<"uint64_t">; +def UnsignedOrNone : PropertyType; def UnaryTypeTransformKind : EnumPropertyType<"UnaryTransformType::UTTKind">; def VectorKind : EnumPropertyType<"VectorKind">; def TypeCoupledDeclRefInfo : PropertyType; @@ -727,7 +728,7 @@ let Class = PropertyTypeCase in { def : Property<"index", UInt32> { let Read = [{ parm->getIndex() }]; } - def : Property<"packIndex", Optional> { + def : Property<"packIndex", UnsignedOrNone> { let Read = [{ parm->getPackIndex() }]; } def : Property<"final", Bool> { let Read = [{ parm->getFinal() }]; } @@ -860,21 +861,16 @@ let Class = PropertyTypeCase in { def : Property<"name", TemplateName> { let Read = [{ node.getAsTemplateOrTemplatePattern() }]; } - def : Property<"numExpansions", Optional> { + def : Property<"numExpansions", UnsignedOrNone> { let Read = [{ - // Translate unsigned -> uint32_t just in case. - llvm::transformOptional(node.getNumTemplateExpansions(), - [](unsigned i) { return uint32_t(i); }) + node.getNumTemplateExpansions() }]; } def : Property<"isDefaulted", Bool> { let Read = [{ node.getIsDefaulted() }]; } def : Creator<[{ - auto numExpansionsUnsigned = llvm::transformOptional( - numExpansions, [](uint32_t i) { return unsigned(i); }); - - return TemplateArgument(name, numExpansionsUnsigned, isDefaulted); + return TemplateArgument(name, numExpansions, isDefaulted); }]>; } let Class = PropertyTypeCase in { diff --git a/clang/include/clang/AST/TemplateBase.h b/clang/include/clang/AST/TemplateBase.h index 9d0ee24a4f5e3..a800a16fc3e7a 100644 --- a/clang/include/clang/AST/TemplateBase.h +++ b/clang/include/clang/AST/TemplateBase.h @@ -159,7 +159,7 @@ class TemplateArgument { unsigned Kind : 31; LLVM_PREFERRED_TYPE(bool) unsigned IsDefaulted : 1; - unsigned NumExpansions; + UnsignedOrNone NumExpansions; void *Name; }; struct TV { @@ -232,7 +232,7 @@ class TemplateArgument { TemplateArg.Kind = Template; TemplateArg.IsDefaulted = IsDefaulted; TemplateArg.Name = Name.getAsVoidPointer(); - TemplateArg.NumExpansions = 0; + TemplateArg.NumExpansions = std::nullopt; } /// Construct a template argument that is a template pack expansion. 
@@ -249,15 +249,12 @@ class TemplateArgument { /// /// \param IsDefaulted If 'true', implies that this TemplateArgument /// corresponds to a default template parameter - TemplateArgument(TemplateName Name, std::optional NumExpansions, + TemplateArgument(TemplateName Name, UnsignedOrNone NumExpansions, bool IsDefaulted = false) { TemplateArg.Kind = TemplateExpansion; TemplateArg.IsDefaulted = IsDefaulted; TemplateArg.Name = Name.getAsVoidPointer(); - if (NumExpansions) - TemplateArg.NumExpansions = *NumExpansions + 1; - else - TemplateArg.NumExpansions = 0; + TemplateArg.NumExpansions = NumExpansions; } /// Construct a template argument that is an expression. @@ -356,7 +353,7 @@ class TemplateArgument { /// Retrieve the number of expansions that a template template argument /// expansion will produce, if known. - std::optional getNumTemplateExpansions() const; + UnsignedOrNone getNumTemplateExpansions() const; /// Retrieve the template argument as an integral value. // FIXME: Provide a way to read the integral data without copying the value. diff --git a/clang/include/clang/AST/TemplateName.h b/clang/include/clang/AST/TemplateName.h index ece2afcfa72ab..63949f898f6a2 100644 --- a/clang/include/clang/AST/TemplateName.h +++ b/clang/include/clang/AST/TemplateName.h @@ -17,6 +17,7 @@ #include "clang/AST/NestedNameSpecifier.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/OperatorKinds.h" +#include "clang/Basic/UnsignedOrNone.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/PointerUnion.h" @@ -414,11 +415,10 @@ class SubstTemplateTemplateParmStorage SubstTemplateTemplateParmStorage(TemplateName Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, - bool Final) - : UncommonTemplateNameStorage(SubstTemplateTemplateParm, Index, - ((PackIndex ? *PackIndex + 1 : 0) << 1) | - Final), + UnsignedOrNone PackIndex, bool Final) + : UncommonTemplateNameStorage( + SubstTemplateTemplateParm, Index, + ((PackIndex.toInternalRepresentation()) << 1) | Final), Replacement(Replacement), AssociatedDecl(AssociatedDecl) { assert(AssociatedDecl != nullptr); } @@ -436,11 +436,8 @@ class SubstTemplateTemplateParmStorage // sugared: it doesn't need to be resugared later. 
bool getFinal() const { return Bits.Data & 1; } - std::optional getPackIndex() const { - auto Data = Bits.Data >> 1; - if (Data == 0) - return std::nullopt; - return Data - 1; + UnsignedOrNone getPackIndex() const { + return UnsignedOrNone::fromInternalRepresentation(Bits.Data >> 1); } TemplateTemplateParmDecl *getParameter() const; @@ -450,7 +447,7 @@ class SubstTemplateTemplateParmStorage static void Profile(llvm::FoldingSetNodeID &ID, TemplateName Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, bool Final); + UnsignedOrNone PackIndex, bool Final); }; class DeducedTemplateStorage : public UncommonTemplateNameStorage, diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 86ae335452980..06d60f618ddcb 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -5962,7 +5962,7 @@ class PackIndexingType final return *(getExpansionsPtr() + *getSelectedIndex()); } - std::optional getSelectedIndex() const; + UnsignedOrNone getSelectedIndex() const; bool hasSelectedType() const { return getSelectedIndex() != std::nullopt; } @@ -6400,7 +6400,7 @@ class SubstTemplateTypeParmType final Decl *AssociatedDecl; SubstTemplateTypeParmType(QualType Replacement, Decl *AssociatedDecl, - unsigned Index, std::optional PackIndex, + unsigned Index, UnsignedOrNone PackIndex, bool Final); public: @@ -6428,10 +6428,9 @@ class SubstTemplateTypeParmType final // sugared: it doesn't need to be resugared later. unsigned getFinal() const { return SubstTemplateTypeParmTypeBits.Final; } - std::optional getPackIndex() const { - if (SubstTemplateTypeParmTypeBits.PackIndex == 0) - return std::nullopt; - return SubstTemplateTypeParmTypeBits.PackIndex - 1; + UnsignedOrNone getPackIndex() const { + return UnsignedOrNone::fromInternalRepresentation( + SubstTemplateTypeParmTypeBits.PackIndex); } bool isSugared() const { return true; } @@ -6444,7 +6443,7 @@ class SubstTemplateTypeParmType final static void Profile(llvm::FoldingSetNodeID &ID, QualType Replacement, const Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, bool Final); + UnsignedOrNone PackIndex, bool Final); static bool classof(const Type *T) { return T->getTypeClass() == SubstTemplateTypeParm; @@ -7142,7 +7141,7 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { QualType Pattern; PackExpansionType(QualType Pattern, QualType Canon, - std::optional NumExpansions) + UnsignedOrNone NumExpansions) : Type(PackExpansion, Canon, (Pattern->getDependence() | TypeDependence::Dependent | TypeDependence::Instantiation) & @@ -7160,7 +7159,7 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { /// Retrieve the number of expansions that this pack expansion will /// generate, if known. 
- std::optional getNumExpansions() const { + UnsignedOrNone getNumExpansions() const { if (PackExpansionTypeBits.NumExpansions) return PackExpansionTypeBits.NumExpansions - 1; return std::nullopt; @@ -7174,11 +7173,9 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { } static void Profile(llvm::FoldingSetNodeID &ID, QualType Pattern, - std::optional NumExpansions) { + UnsignedOrNone NumExpansions) { ID.AddPointer(Pattern.getAsOpaquePtr()); - ID.AddBoolean(NumExpansions.has_value()); - if (NumExpansions) - ID.AddInteger(*NumExpansions); + ID.AddInteger(NumExpansions.toInternalRepresentation()); } static bool classof(const Type *T) { diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 477106a152188..66d490850678a 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -839,7 +839,7 @@ let Class = SubstTemplateTypeParmType in { def : Property<"Index", UInt32> { let Read = [{ node->getIndex() }]; } - def : Property<"PackIndex", Optional> { + def : Property<"PackIndex", UnsignedOrNone> { let Read = [{ node->getPackIndex() }]; } def : Property<"Final", Bool> { let Read = [{ node->getFinal() }]; } @@ -854,7 +854,7 @@ let Class = PackExpansionType in { def : Property<"pattern", QualType> { let Read = [{ node->getPattern() }]; } - def : Property<"numExpansions", Optional> { + def : Property<"numExpansions", UnsignedOrNone> { let Read = [{ node->getNumExpansions() }]; } diff --git a/clang/include/clang/Basic/UnsignedOrNone.h b/clang/include/clang/Basic/UnsignedOrNone.h new file mode 100644 index 0000000000000..659fd8c6487d2 --- /dev/null +++ b/clang/include/clang/Basic/UnsignedOrNone.h @@ -0,0 +1,53 @@ +//===- UnsignedOrNone.h - simple optional index-----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Defines clang::UnsignedOrNone. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_BASIC_UNSIGNED_OR_NONE_H
+#define LLVM_CLANG_BASIC_UNSIGNED_OR_NONE_H
+
+#include <cassert>
+#include <optional>
+
+namespace clang {
+
+struct UnsignedOrNone {
+  constexpr UnsignedOrNone(std::nullopt_t) : Rep(0) {}
+  UnsignedOrNone(unsigned Val) : Rep(Val + 1) { assert(operator bool()); }
+  UnsignedOrNone(int) = delete;
+
+  constexpr static UnsignedOrNone fromInternalRepresentation(unsigned Rep) {
+    return {std::nullopt, Rep};
+  }
+  constexpr unsigned toInternalRepresentation() const { return Rep; }
+
+  explicit constexpr operator bool() const { return Rep != 0; }
+  unsigned operator*() const {
+    assert(operator bool());
+    return Rep - 1;
+  }
+
+  friend constexpr bool operator==(UnsignedOrNone LHS, UnsignedOrNone RHS) {
+    return LHS.Rep == RHS.Rep;
+  }
+  friend constexpr bool operator!=(UnsignedOrNone LHS, UnsignedOrNone RHS) {
+    return LHS.Rep != RHS.Rep;
+  }
+
+private:
+  constexpr UnsignedOrNone(std::nullopt_t, unsigned Rep) : Rep(Rep) {};
+
+  unsigned Rep;
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_BASIC_UNSIGNED_OR_NONE_H
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 1aa0e4a9917de..b835697f99670 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -6124,8 +6124,8 @@ class Sema final : public SemaBase {
                                  RecordDecl *ClassDecl,
                                  const IdentifierInfo *Name);
 
-  std::optional<unsigned> GetDecompositionElementCount(QualType DecompType,
-                                                       SourceLocation Loc);
+  UnsignedOrNone GetDecompositionElementCount(QualType DecompType,
+                                              SourceLocation Loc);
   void CheckCompleteDecompositionDeclaration(DecompositionDecl *DD);
 
   /// Stack containing information needed when in C++2a an 'auto' is encountered
@@ -8890,10 +8890,11 @@ class Sema final : public SemaBase {
         Loc, ByRef, EllipsisLoc, std::nullopt, Id,
         InitKind != LambdaCaptureInitKind::CopyInit, Init));
   }
-  QualType buildLambdaInitCaptureInitialization(
-      SourceLocation Loc, bool ByRef, SourceLocation EllipsisLoc,
-      std::optional<unsigned> NumExpansions, IdentifierInfo *Id,
-      bool DirectInit, Expr *&Init);
+  QualType buildLambdaInitCaptureInitialization(SourceLocation Loc, bool ByRef,
+                                                SourceLocation EllipsisLoc,
+                                                UnsignedOrNone NumExpansions,
+                                                IdentifierInfo *Id,
+                                                bool DirectInit, Expr *&Init);
 
   /// Create a dummy variable within the declcontext of the lambda's
   /// call operator, for name lookup purposes for a lambda init capture.
@@ -13344,28 +13345,25 @@ class Sema final : public SemaBase {
   /// The current index into pack expansion arguments that will be
   /// used for substitution of parameter packs.
   ///
-  /// The pack expansion index will be -1 to indicate that parameter packs
+  /// The pack expansion index will be none to indicate that parameter packs
   /// should be instantiated as themselves. Otherwise, the index specifies
   /// which argument within the parameter pack will be used for substitution.
-  int ArgumentPackSubstitutionIndex;
+  UnsignedOrNone ArgPackSubstIndex;
 
   /// RAII object used to change the argument pack substitution index
   /// within a \c Sema object.
   ///
-  /// See \c ArgumentPackSubstitutionIndex for more information.
-  class ArgumentPackSubstitutionIndexRAII {
+  /// See \c ArgPackSubstIndex for more information.
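Before the RAII helper continues below, a quick demonstration of the observable behavior of the type defined in this hunk; demoUnsignedOrNone is a hypothetical test function, not part of the patch:

    #include "clang/Basic/UnsignedOrNone.h"
    #include <cassert>
    #include <optional>

    void demoUnsignedOrNone() {
      clang::UnsignedOrNone None = std::nullopt; // the "absent" state
      assert(!None);                             // explicit operator bool
      clang::UnsignedOrNone Two = 2u;            // unsigned converts implicitly
      assert(Two && *Two == 2);                  // operator* recovers the value
      assert(Two != None);                       // friend comparison operators
      assert(None.toInternalRepresentation() == 0); // zero is reserved for "none"
      assert(Two.toInternalRepresentation() == 3);  // values stored biased by one
    }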
+ class ArgPackSubstIndexRAII { Sema &Self; - int OldSubstitutionIndex; + UnsignedOrNone OldSubstIndex; public: - ArgumentPackSubstitutionIndexRAII(Sema &Self, int NewSubstitutionIndex) - : Self(Self), OldSubstitutionIndex(Self.ArgumentPackSubstitutionIndex) { - Self.ArgumentPackSubstitutionIndex = NewSubstitutionIndex; - } + ArgPackSubstIndexRAII(Sema &Self, UnsignedOrNone NewSubstIndex) + : Self(Self), + OldSubstIndex(std::exchange(Self.ArgPackSubstIndex, NewSubstIndex)) {} - ~ArgumentPackSubstitutionIndexRAII() { - Self.ArgumentPackSubstitutionIndex = OldSubstitutionIndex; - } + ~ArgPackSubstIndexRAII() { Self.ArgPackSubstIndex = OldSubstIndex; } }; friend class ArgumentPackSubstitutionRAII; @@ -13465,7 +13463,7 @@ class Sema final : public SemaBase { ParmVarDecl * SubstParmVarDecl(ParmVarDecl *D, const MultiLevelTemplateArgumentList &TemplateArgs, - int indexAdjustment, std::optional NumExpansions, + int indexAdjustment, UnsignedOrNone NumExpansions, bool ExpectParameterPack, bool EvaluateConstraints = true); /// Substitute the given template arguments into the given set of @@ -14315,13 +14313,13 @@ class Sema final : public SemaBase { /// expansion. TypeSourceInfo *CheckPackExpansion(TypeSourceInfo *Pattern, SourceLocation EllipsisLoc, - std::optional NumExpansions); + UnsignedOrNone NumExpansions); /// Construct a pack expansion type from the pattern of the pack /// expansion. QualType CheckPackExpansion(QualType Pattern, SourceRange PatternRange, SourceLocation EllipsisLoc, - std::optional NumExpansions); + UnsignedOrNone NumExpansions); /// Invoked when parsing an expression followed by an ellipsis, which /// creates a pack expansion. @@ -14340,7 +14338,7 @@ class Sema final : public SemaBase { /// /// \param EllipsisLoc The location of the ellipsis. ExprResult CheckPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc, - std::optional NumExpansions); + UnsignedOrNone NumExpansions); /// Determine whether we could expand a pack expansion with the /// given set of parameter packs into separate arguments by repeatedly @@ -14380,7 +14378,7 @@ class Sema final : public SemaBase { SourceLocation EllipsisLoc, SourceRange PatternRange, ArrayRef Unexpanded, const MultiLevelTemplateArgumentList &TemplateArgs, bool &ShouldExpand, - bool &RetainExpansion, std::optional &NumExpansions); + bool &RetainExpansion, UnsignedOrNone &NumExpansions); /// Determine the number of arguments in the given pack expansion /// type. @@ -14389,10 +14387,10 @@ class Sema final : public SemaBase { /// consistent across all of the unexpanded parameter packs in its pattern. /// /// Returns an empty Optional if the type can't be expanded. - std::optional getNumArgumentsInExpansion( + UnsignedOrNone getNumArgumentsInExpansion( QualType T, const MultiLevelTemplateArgumentList &TemplateArgs); - std::optional getNumArgumentsInExpansionFromUnexpanded( + UnsignedOrNone getNumArgumentsInExpansionFromUnexpanded( llvm::ArrayRef Unexpanded, const MultiLevelTemplateArgumentList &TemplateArgs); @@ -14421,9 +14419,10 @@ class Sema final : public SemaBase { /// /// \param NumExpansions Will be set to the number of expansions that will /// be generated from this pack expansion, if known a priori. 
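The rewritten constructor above folds the old save-then-set statement pair into a single member initializer via std::exchange, which stores the new index and hands back the previous one in one expression. A stand-alone sketch of the same save/restore idiom; ScopedIndex and ScopedIndexRAII are hypothetical stand-ins for Sema and the RAII class:

    #include "clang/Basic/UnsignedOrNone.h"
    #include <optional>
    #include <utility>

    struct ScopedIndex {
      clang::UnsignedOrNone Current = std::nullopt;
    };

    struct ScopedIndexRAII {
      ScopedIndex &S;
      clang::UnsignedOrNone Old;
      ScopedIndexRAII(ScopedIndex &S, clang::UnsignedOrNone New)
          : S(S), Old(std::exchange(S.Current, New)) {} // set New, remember Old
      ~ScopedIndexRAII() { S.Current = Old; }           // restore on scope exit
    };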
- TemplateArgumentLoc getTemplateArgumentPackExpansionPattern( - TemplateArgumentLoc OrigLoc, SourceLocation &Ellipsis, - std::optional &NumExpansions) const; + TemplateArgumentLoc + getTemplateArgumentPackExpansionPattern(TemplateArgumentLoc OrigLoc, + SourceLocation &Ellipsis, + UnsignedOrNone &NumExpansions) const; /// Given a template argument that contains an unexpanded parameter pack, but /// which has already been substituted, attempt to determine the number of @@ -14431,7 +14430,7 @@ class Sema final : public SemaBase { /// /// This is intended for use when transforming 'sizeof...(Arg)' in order to /// avoid actually expanding the pack where possible. - std::optional getFullyPackExpandedSize(TemplateArgument Arg); + UnsignedOrNone getFullyPackExpandedSize(TemplateArgument Arg); /// Called when an expression computing the size of a parameter pack /// is parsed. @@ -14473,7 +14472,7 @@ class Sema final : public SemaBase { BinaryOperatorKind Operator, SourceLocation EllipsisLoc, Expr *RHS, SourceLocation RParenLoc, - std::optional NumExpansions); + UnsignedOrNone NumExpansions); ExprResult BuildEmptyCXXFoldExpr(SourceLocation EllipsisLoc, BinaryOperatorKind Operator); diff --git a/clang/include/clang/Sema/SemaLambda.h b/clang/include/clang/Sema/SemaLambda.h index 3c9d22df70c0d..a1d016f6ca289 100644 --- a/clang/include/clang/Sema/SemaLambda.h +++ b/clang/include/clang/Sema/SemaLambda.h @@ -31,7 +31,7 @@ class Sema; /// of the capture-capable lambda's LambdaScopeInfo. /// See Implementation for more detailed comments. -std::optional getStackIndexOfNearestEnclosingCaptureCapableLambda( +UnsignedOrNone getStackIndexOfNearestEnclosingCaptureCapableLambda( ArrayRef FunctionScopes, ValueDecl *VarToCapture, Sema &S); diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 3291028c1b621..18d92e62d71a7 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -105,7 +105,7 @@ class SemaOpenACC : public SemaBase { /// This is the number of expressions on a 'tile' clause. This doesn't have /// to be an APSInt because it isn't the result of a constexpr, just by our /// own counting of elements. 
- std::optional CurTileCount; + UnsignedOrNone CurTileCount = std::nullopt; /// Records whether we've hit a 'CurTileCount' of '0' on the wya down, /// which allows us to diagnose if the number of arguments is too large for diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index 647c4cfa341e1..f9a10cfafb1f7 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -569,7 +569,7 @@ enum class TemplateSubstitutionKind : char { : public DeclVisitor { Sema &SemaRef; - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex; + Sema::ArgPackSubstIndexRAII SubstIndex; DeclContext *Owner; const MultiLevelTemplateArgumentList &TemplateArgs; Sema::LateInstantiatedAttrVec* LateAttrs = nullptr; @@ -595,8 +595,7 @@ enum class TemplateSubstitutionKind : char { public: TemplateDeclInstantiator(Sema &SemaRef, DeclContext *Owner, const MultiLevelTemplateArgumentList &TemplateArgs) - : SemaRef(SemaRef), - SubstIndex(SemaRef, SemaRef.ArgumentPackSubstitutionIndex), + : SemaRef(SemaRef), SubstIndex(SemaRef, SemaRef.ArgPackSubstIndex), Owner(Owner), TemplateArgs(TemplateArgs) {} void setEvaluateConstraints(bool B) { diff --git a/clang/include/clang/Sema/TemplateDeduction.h b/clang/include/clang/Sema/TemplateDeduction.h index 020e19bc7a608..39c909d73f565 100644 --- a/clang/include/clang/Sema/TemplateDeduction.h +++ b/clang/include/clang/Sema/TemplateDeduction.h @@ -301,7 +301,7 @@ struct DeductionFailureInfo { /// Return the index of the call argument that this deduction /// failure refers to, if any. - std::optional getCallArgIndex(); + UnsignedOrNone getCallArgIndex(); /// Free any memory associated with this deduction failure. void Destroy(); diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 58fcc06c3696d..57ae4aa104d9a 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2659,7 +2659,7 @@ inline bool shouldSkipCheckingODR(const Decl *D) { /// Calculate a hash value for the primary module name of the given module. /// \returns std::nullopt if M is not a C++ standard module. -std::optional getPrimaryModuleHash(const Module *M); +UnsignedOrNone getPrimaryModuleHash(const Module *M); } // namespace clang diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index 7117b7246739b..141804185083f 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -319,6 +319,10 @@ class ASTRecordReader return readInt(); } + UnsignedOrNone readUnsignedOrNone() { + return UnsignedOrNone::fromInternalRepresentation(unsigned(readInt())); + } + /// Read a string, advancing Idx. std::string readString() { return Reader->ReadString(Record, Idx); diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h index 84d77e46016b7..e1fb239a9ce49 100644 --- a/clang/include/clang/Serialization/ASTRecordWriter.h +++ b/clang/include/clang/Serialization/ASTRecordWriter.h @@ -168,6 +168,10 @@ class ASTRecordWriter Record->push_back(Value); } + void writeUnsignedOrNone(UnsignedOrNone Value) { + Record->push_back(Value.toInternalRepresentation()); + } + /// Emit an integral value. 
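The readUnsignedOrNone/writeUnsignedOrNone helpers above can ship the raw internal representation directly because the biased encoding is lossless: "none" and every value map to distinct unsigneds. A sketch of the round trip, assuming a plain vector as a stand-in for the AST record rather than the real reader and writer (the integral-value emitter continues below):

    #include "clang/Basic/UnsignedOrNone.h"
    #include <cassert>
    #include <cstdint>
    #include <vector>

    void roundTrip(clang::UnsignedOrNone V) {
      std::vector<uint64_t> Record;
      Record.push_back(V.toInternalRepresentation()); // writeUnsignedOrNone
      clang::UnsignedOrNone Read =                    // readUnsignedOrNone
          clang::UnsignedOrNone::fromInternalRepresentation(
              static_cast<unsigned>(Record.back()));
      assert(Read == V);
    }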
void AddAPInt(const llvm::APInt &Value) { writeAPInt(Value); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index a73c15ae6bcc7..1b6b3d06ddc1e 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -732,11 +732,8 @@ ASTContext::CanonicalTemplateTemplateParm::Profile(llvm::FoldingSetNodeID &ID, if (const auto *TTP = dyn_cast(*P)) { ID.AddInteger(0); ID.AddBoolean(TTP->isParameterPack()); - if (TTP->isExpandedParameterPack()) { - ID.AddBoolean(true); - ID.AddInteger(TTP->getNumExpansionParameters()); - } else - ID.AddBoolean(false); + ID.AddInteger( + TTP->getNumExpansionParameters().toInternalRepresentation()); continue; } @@ -789,9 +786,7 @@ ASTContext::getCanonicalTemplateTemplateParmDecl( *this, getTranslationUnitDecl(), SourceLocation(), SourceLocation(), TTP->getDepth(), TTP->getIndex(), nullptr, false, TTP->isParameterPack(), /*HasTypeConstraint=*/false, - TTP->isExpandedParameterPack() - ? std::optional(TTP->getNumExpansionParameters()) - : std::nullopt); + TTP->getNumExpansionParameters()); CanonParams.push_back(NewTTP); } else if (const auto *NTTP = dyn_cast(*P)) { QualType T = getUnconstrainedType(getCanonicalType(NTTP->getType())); @@ -5445,9 +5440,11 @@ QualType ASTContext::getHLSLAttributedResourceType( return QualType(Ty, 0); } /// Retrieve a substitution-result type. -QualType ASTContext::getSubstTemplateTypeParmType( - QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, bool Final) const { +QualType ASTContext::getSubstTemplateTypeParmType(QualType Replacement, + Decl *AssociatedDecl, + unsigned Index, + UnsignedOrNone PackIndex, + bool Final) const { llvm::FoldingSetNodeID ID; SubstTemplateTypeParmType::Profile(ID, Replacement, AssociatedDecl, Index, PackIndex, Final); @@ -5863,7 +5860,7 @@ TemplateArgument ASTContext::getInjectedTemplateArg(NamedDecl *Param) const { TemplateName Name = getQualifiedTemplateName( nullptr, /*TemplateKeyword=*/false, TemplateName(TTP)); if (TTP->isParameterPack()) - Arg = TemplateArgument(Name, std::optional()); + Arg = TemplateArgument(Name, /*NumExpansions=*/std::nullopt); else Arg = TemplateArgument(Name); } @@ -5876,7 +5873,7 @@ TemplateArgument ASTContext::getInjectedTemplateArg(NamedDecl *Param) const { } QualType ASTContext::getPackExpansionType(QualType Pattern, - std::optional NumExpansions, + UnsignedOrNone NumExpansions, bool ExpectPackInType) const { assert((!ExpectPackInType || Pattern->containsUnexpandedParameterPack()) && "Pack expansions must expand one or more parameter packs"); @@ -6371,10 +6368,10 @@ QualType ASTContext::getDecltypeType(Expr *e, QualType UnderlyingType) const { QualType ASTContext::getPackIndexingType(QualType Pattern, Expr *IndexExpr, bool FullySubstituted, ArrayRef Expansions, - int Index) const { + UnsignedOrNone Index) const { QualType Canonical; - if (FullySubstituted && Index != -1) { - Canonical = getCanonicalType(Expansions[Index]); + if (FullySubstituted && Index) { + Canonical = getCanonicalType(Expansions[*Index]); } else { llvm::FoldingSetNodeID ID; PackIndexingType::Profile(ID, *this, Pattern.getCanonicalType(), IndexExpr, @@ -7072,7 +7069,7 @@ bool ASTContext::hasSameTemplateName(const TemplateName &X, bool ASTContext::isSameAssociatedConstraint( const AssociatedConstraint &ACX, const AssociatedConstraint &ACY) const { - if (ACX.ArgumentPackSubstitutionIndex != ACY.ArgumentPackSubstitutionIndex) + if (ACX.ArgPackSubstIndex != ACY.ArgPackSubstIndex) return false; if (!isSameConstraintExpr(ACX.ConstraintExpr, 
ACY.ConstraintExpr)) return false; @@ -10097,9 +10094,11 @@ ASTContext::getDependentTemplateName(const DependentTemplateStorage &S) const { return TemplateName(QTN); } -TemplateName ASTContext::getSubstTemplateTemplateParm( - TemplateName Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, bool Final) const { +TemplateName ASTContext::getSubstTemplateTemplateParm(TemplateName Replacement, + Decl *AssociatedDecl, + unsigned Index, + UnsignedOrNone PackIndex, + bool Final) const { llvm::FoldingSetNodeID ID; SubstTemplateTemplateParmStorage::Profile(ID, Replacement, AssociatedDecl, Index, PackIndex, Final); @@ -12974,7 +12973,7 @@ MangleContext *ASTContext::createDeviceMangleContext(const TargetInfo &T) { case TargetCXXABI::XL: return ItaniumMangleContext::create( *this, getDiagnostics(), - [](ASTContext &, const NamedDecl *ND) -> std::optional { + [](ASTContext &, const NamedDecl *ND) -> UnsignedOrNone { if (const auto *RD = dyn_cast(ND)) return RD->getDeviceLambdaManglingNumber(); return std::nullopt; diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 73dc355003e6b..8c91cce22f78e 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -4354,7 +4354,7 @@ static bool IsEquivalentFriend(ASTImporter &Importer, FriendDecl *FD1, static FriendCountAndPosition getFriendCountAndPosition(ASTImporter &Importer, FriendDecl *FD) { unsigned int FriendCount = 0; - std::optional FriendPosition; + UnsignedOrNone FriendPosition = std::nullopt; const auto *RD = cast(FD->getLexicalDeclContext()); for (FriendDecl *FoundFriend : RD->friends()) { @@ -5976,8 +5976,7 @@ ASTNodeImporter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { if (Err) return std::move(Err); - ToD->setTypeConstraint(ToConceptRef, ToIDC, - TC->getArgumentPackSubstitutionIndex()); + ToD->setTypeConstraint(ToConceptRef, ToIDC, TC->getArgPackSubstIndex()); } if (Error Err = importTemplateParameterDefaultArgument(D, ToD)) @@ -8293,7 +8292,7 @@ ExpectedStmt ASTNodeImporter::VisitSizeOfPackExpr(SizeOfPackExpr *E) { if (Err) return std::move(Err); - std::optional Length; + UnsignedOrNone Length = std::nullopt; if (!E->isValueDependent()) Length = E->getPackLength(); @@ -9043,7 +9042,7 @@ ASTImporter::ASTImporter(ASTContext &ToContext, FileManager &ToFileManager, ASTImporter::~ASTImporter() = default; -std::optional ASTImporter::getFieldIndex(Decl *F) { +UnsignedOrNone ASTImporter::getFieldIndex(Decl *F) { assert(F && (isa(*F) || isa(*F)) && "Try to get field index for non-field."); diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index c769722521d9c..2c7cb581ccfaa 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -1656,9 +1656,9 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, if (!D1->getDeclName() && !D2->getDeclName()) { // If both anonymous structs/unions are in a record context, make sure // they occur in the same location in the context records. 
- if (std::optional Index1 = + if (UnsignedOrNone Index1 = StructuralEquivalenceContext::findUntaggedStructOrUnionIndex(D1)) { - if (std::optional Index2 = + if (UnsignedOrNone Index2 = StructuralEquivalenceContext::findUntaggedStructOrUnionIndex( D2)) { if (*Index1 != *Index2) @@ -2345,7 +2345,7 @@ DiagnosticBuilder StructuralEquivalenceContext::Diag2(SourceLocation Loc, return ToCtx.getDiagnostics().Report(Loc, DiagID); } -std::optional +UnsignedOrNone StructuralEquivalenceContext::findUntaggedStructOrUnionIndex(RecordDecl *Anon) { ASTContext &Context = Anon->getASTContext(); QualType AnonTy = Context.getRecordType(Anon); diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp index 07c4419e3cf40..fd2eefa1cf076 100644 --- a/clang/lib/AST/ComputeDependence.cpp +++ b/clang/lib/AST/ComputeDependence.cpp @@ -391,7 +391,7 @@ ExprDependence clang::computeDependence(PackIndexingExpr *E) { if (Exprs.empty() || !E->isFullySubstituted()) D |= PatternDep | ExprDependence::Instantiation; else if (!E->getIndexExpr()->isInstantiationDependent()) { - std::optional Index = E->getSelectedIndex(); + UnsignedOrNone Index = E->getSelectedIndex(); assert(Index && *Index < Exprs.size() && "pack index out of bound"); D |= Exprs[*Index]->getDependence(); } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 568d74cc7df0b..0e4d69392e8c7 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3386,7 +3386,7 @@ bool FunctionDecl::isReservedGlobalPlacementOperator() const { } bool FunctionDecl::isReplaceableGlobalAllocationFunction( - std::optional *AlignmentParam, bool *IsNothrow) const { + UnsignedOrNone *AlignmentParam, bool *IsNothrow) const { if (getDeclName().getNameKind() != DeclarationName::CXXOperatorName) return false; if (getDeclName().getCXXOverloadedOperator() != OO_New && diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index b0bba8408f2b9..e8e2cad721981 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -172,7 +172,7 @@ unsigned TemplateParameterList::getMinRequiredArguments() const { unsigned NumRequiredArgs = 0; for (const NamedDecl *P : asArray()) { if (P->isTemplateParameterPack()) { - if (std::optional Expansions = getExpandedPackSize(P)) { + if (UnsignedOrNone Expansions = getExpandedPackSize(P)) { NumRequiredArgs += *Expansions; continue; } @@ -230,7 +230,7 @@ void TemplateParameterList::getAssociatedConstraints( if (const auto *TTP = dyn_cast(Param)) { if (const auto *TC = TTP->getTypeConstraint()) ACs.emplace_back(TC->getImmediatelyDeclaredConstraint(), - TC->getArgumentPackSubstitutionIndex()); + TC->getArgPackSubstIndex()); } else if (const auto *NTTP = dyn_cast(Param)) { if (const Expr *E = NTTP->getPlaceholderTypeConstraint()) ACs.emplace_back(E); @@ -684,7 +684,7 @@ TemplateTypeParmDecl *TemplateTypeParmDecl::Create( const ASTContext &C, DeclContext *DC, SourceLocation KeyLoc, SourceLocation NameLoc, unsigned D, unsigned P, IdentifierInfo *Id, bool Typename, bool ParameterPack, bool HasTypeConstraint, - std::optional NumExpanded) { + UnsignedOrNone NumExpanded) { auto *TTPDecl = new (C, DC, additionalSizeToAlloc(HasTypeConstraint ? 
1 : 0)) @@ -750,14 +750,14 @@ bool TemplateTypeParmDecl::isParameterPack() const { void TemplateTypeParmDecl::setTypeConstraint( ConceptReference *Loc, Expr *ImmediatelyDeclaredConstraint, - int ArgumentPackSubstitutionIndex) { + UnsignedOrNone ArgPackSubstIndex) { assert(HasTypeConstraint && "HasTypeConstraint=true must be passed at construction in order to " "call setTypeConstraint"); assert(!TypeConstraintInitialized && "TypeConstraint was already initialized!"); - new (getTrailingObjects()) TypeConstraint( - Loc, ImmediatelyDeclaredConstraint, ArgumentPackSubstitutionIndex); + new (getTrailingObjects()) + TypeConstraint(Loc, ImmediatelyDeclaredConstraint, ArgPackSubstIndex); TypeConstraintInitialized = true; } diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 9d5b4a60c9fe7..389fa70a61b4b 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -596,7 +596,7 @@ std::string SYCLUniqueStableNameExpr::ComputeName(ASTContext &Context) const { std::string SYCLUniqueStableNameExpr::ComputeName(ASTContext &Context, QualType Ty) { auto MangleCallback = [](ASTContext &Ctx, - const NamedDecl *ND) -> std::optional { + const NamedDecl *ND) -> UnsignedOrNone { if (const auto *RD = dyn_cast(ND)) return RD->getDeviceLambdaManglingNumber(); return std::nullopt; diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 77add7b0b6abe..b12f655c4b386 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1694,7 +1694,7 @@ SizeOfPackExpr *SizeOfPackExpr::Create(ASTContext &Context, SourceLocation OperatorLoc, NamedDecl *Pack, SourceLocation PackLoc, SourceLocation RParenLoc, - std::optional Length, + UnsignedOrNone Length, ArrayRef PartialArgs) { void *Storage = Context.Allocate(totalSizeToAlloc(PartialArgs.size())); @@ -1981,14 +1981,13 @@ CXXParenListInitExpr *CXXParenListInitExpr::CreateEmpty(ASTContext &C, } CXXFoldExpr::CXXFoldExpr(QualType T, UnresolvedLookupExpr *Callee, - SourceLocation LParenLoc, Expr *LHS, - BinaryOperatorKind Opcode, - SourceLocation EllipsisLoc, Expr *RHS, - SourceLocation RParenLoc, - std::optional NumExpansions) + SourceLocation LParenLoc, Expr *LHS, + BinaryOperatorKind Opcode, SourceLocation EllipsisLoc, + Expr *RHS, SourceLocation RParenLoc, + UnsignedOrNone NumExpansions) : Expr(CXXFoldExprClass, T, VK_PRValue, OK_Ordinary), LParenLoc(LParenLoc), EllipsisLoc(EllipsisLoc), RParenLoc(RParenLoc), - NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Opcode(Opcode) { + NumExpansions(NumExpansions), Opcode(Opcode) { // We rely on asserted invariant to distinguish left and right folds. assert(((LHS && LHS->containsUnexpandedParameterPack()) != (RHS && RHS->containsUnexpandedParameterPack())) && diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index eb9c9c30622ad..fdd84d0bf7c5c 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1641,7 +1641,7 @@ void CXXNameMangler::mangleUnqualifiedName( // ::= * + // # Parameter types or 'v' for 'void'. if (const CXXRecordDecl *Record = dyn_cast(TD)) { - std::optional DeviceNumber = + UnsignedOrNone DeviceNumber = Context.getDiscriminatorOverride()(Context.getASTContext(), Record); // If we have a device-number via the discriminator, use that to mangle @@ -2136,7 +2136,7 @@ void CXXNameMangler::mangleLambda(const CXXRecordDecl *Lambda) { // if the host-side CXX ABI has different numbering for lambda. 
In such case, // if the mangle context is that device-side one, use the device-side lambda // mangling number for this lambda. - std::optional DeviceNumber = + UnsignedOrNone DeviceNumber = Context.getDiscriminatorOverride()(Context.getASTContext(), Lambda); unsigned Number = DeviceNumber ? *DeviceNumber : Lambda->getLambdaManglingNumber(); @@ -7540,7 +7540,7 @@ ItaniumMangleContext *ItaniumMangleContext::create(ASTContext &Context, bool IsAux) { return new ItaniumMangleContextImpl( Context, Diags, - [](ASTContext &, const NamedDecl *) -> std::optional { + [](ASTContext &, const NamedDecl *) -> UnsignedOrNone { return std::nullopt; }, IsAux); diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index e5e7bd31f73e9..3420c1f343cf5 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -818,7 +818,7 @@ void JSONNodeDumper::VisitObjCInterfaceType(const ObjCInterfaceType *OIT) { } void JSONNodeDumper::VisitPackExpansionType(const PackExpansionType *PET) { - if (std::optional N = PET->getNumExpansions()) + if (UnsignedOrNone N = PET->getNumExpansions()) JOS.attribute("numExpansions", *N); } diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 0eef8f305fcb3..0be0a83b7010d 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -344,12 +344,9 @@ bool TemplateArgument::containsUnexpandedParameterPack() const { return getDependence() & TemplateArgumentDependence::UnexpandedPack; } -std::optional TemplateArgument::getNumTemplateExpansions() const { +UnsignedOrNone TemplateArgument::getNumTemplateExpansions() const { assert(getKind() == TemplateExpansion); - if (TemplateArg.NumExpansions) - return TemplateArg.NumExpansions - 1; - - return std::nullopt; + return TemplateArg.NumExpansions; } QualType TemplateArgument::getNonTypeTemplateArgumentType() const { @@ -401,7 +398,7 @@ void TemplateArgument::Profile(llvm::FoldingSetNodeID &ID, break; case TemplateExpansion: - ID.AddInteger(TemplateArg.NumExpansions); + ID.AddInteger(TemplateArg.NumExpansions.toInternalRepresentation()); [[fallthrough]]; case Template: ID.AddPointer(TemplateArg.Name); diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp index f8ba5a24c7519..4404552f84fbb 100644 --- a/clang/lib/AST/TemplateName.cpp +++ b/clang/lib/AST/TemplateName.cpp @@ -83,11 +83,11 @@ void SubstTemplateTemplateParmStorage::Profile(llvm::FoldingSetNodeID &ID) { void SubstTemplateTemplateParmStorage::Profile( llvm::FoldingSetNodeID &ID, TemplateName Replacement, Decl *AssociatedDecl, - unsigned Index, std::optional PackIndex, bool Final) { + unsigned Index, UnsignedOrNone PackIndex, bool Final) { Replacement.Profile(ID); ID.AddPointer(AssociatedDecl); ID.AddInteger(Index); - ID.AddInteger(PackIndex ? 
*PackIndex + 1 : 0); + ID.AddInteger(PackIndex.toInternalRepresentation()); ID.AddBoolean(Final); } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index d35b2f5d9ab6d..05f1953aa473c 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1296,7 +1296,7 @@ void TextNodeDumper::dumpBareTemplateName(TemplateName TN) { const SubstTemplateTemplateParmStorage *STS = TN.getAsSubstTemplateTemplateParm(); OS << " index " << STS->getIndex(); - if (std::optional PackIndex = STS->getPackIndex()) + if (UnsignedOrNone PackIndex = STS->getPackIndex()) OS << " pack_index " << *PackIndex; if (STS->getFinal()) OS << " final"; diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 4669bf5541493..879ad1a7eaa84 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -4059,7 +4059,7 @@ PackIndexingType::PackIndexingType(const ASTContext &Context, getTrailingObjects()); } -std::optional PackIndexingType::getSelectedIndex() const { +UnsignedOrNone PackIndexingType::getSelectedIndex() const { if (isInstantiationDependentType()) return std::nullopt; // Should only be not a constant for error recovery. @@ -4261,9 +4261,11 @@ static const TemplateTypeParmDecl *getReplacedParameter(Decl *D, getReplacedTemplateParameterList(D)->getParam(Index)); } -SubstTemplateTypeParmType::SubstTemplateTypeParmType( - QualType Replacement, Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, bool Final) +SubstTemplateTypeParmType::SubstTemplateTypeParmType(QualType Replacement, + Decl *AssociatedDecl, + unsigned Index, + UnsignedOrNone PackIndex, + bool Final) : Type(SubstTemplateTypeParm, Replacement.getCanonicalType(), Replacement->getDependence()), AssociatedDecl(AssociatedDecl) { @@ -4274,7 +4276,8 @@ SubstTemplateTypeParmType::SubstTemplateTypeParmType( SubstTemplateTypeParmTypeBits.Index = Index; SubstTemplateTypeParmTypeBits.Final = Final; - SubstTemplateTypeParmTypeBits.PackIndex = PackIndex ? *PackIndex + 1 : 0; + SubstTemplateTypeParmTypeBits.PackIndex = + PackIndex.toInternalRepresentation(); assert(AssociatedDecl != nullptr); } @@ -4287,12 +4290,11 @@ void SubstTemplateTypeParmType::Profile(llvm::FoldingSetNodeID &ID, QualType Replacement, const Decl *AssociatedDecl, unsigned Index, - std::optional PackIndex, - bool Final) { + UnsignedOrNone PackIndex, bool Final) { Replacement.Profile(ID); ID.AddPointer(AssociatedDecl); ID.AddInteger(Index); - ID.AddInteger(PackIndex ? 
*PackIndex - 1 : 0); + ID.AddInteger(PackIndex.toInternalRepresentation()); ID.AddBoolean(Final); } diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 93a2d797679d4..64f5633f380ec 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -263,7 +263,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, TyposCorrected(0), IsBuildingRecoveryCallExpr(false), NumSFINAEErrors(0), AccessCheckingSFINAE(false), CurrentInstantiationScope(nullptr), InNonInstantiationSFINAEContext(false), NonInstantiationEntries(0), - ArgumentPackSubstitutionIndex(-1), SatisfactionCache(Context) { + ArgPackSubstIndex(std::nullopt), SatisfactionCache(Context) { assert(pp.TUKind == TUKind); TUScope = nullptr; diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index e10c49203725f..011a6d072d35c 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -289,7 +289,7 @@ static ExprResult EvaluateAtomicConstraint( return SubstitutedExpression; } -std::optional static EvaluateFoldExpandedConstraintSize( +static UnsignedOrNone EvaluateFoldExpandedConstraintSize( Sema &S, const CXXFoldExpr *FE, const NamedDecl *Template, SourceLocation TemplateNameLoc, const MultiLevelTemplateArgumentList &MLTAL, ConstraintSatisfaction &Satisfaction) { @@ -304,15 +304,14 @@ std::optional static EvaluateFoldExpandedConstraintSize( assert(!Unexpanded.empty() && "Pack expansion without parameter packs?"); bool Expand = true; bool RetainExpansion = false; - std::optional OrigNumExpansions = FE->getNumExpansions(), - NumExpansions = OrigNumExpansions; + UnsignedOrNone NumExpansions = FE->getNumExpansions(); if (S.CheckParameterPacksForExpansion( FE->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, MLTAL, Expand, RetainExpansion, NumExpansions) || !Expand || RetainExpansion) return std::nullopt; - if (NumExpansions && S.getLangOpts().BracketDepth < NumExpansions) { + if (NumExpansions && S.getLangOpts().BracketDepth < *NumExpansions) { S.Diag(FE->getEllipsisLoc(), clang::diag::err_fold_expression_limit_exceeded) << *NumExpansions << S.getLangOpts().BracketDepth @@ -413,12 +412,12 @@ static ExprResult calculateConstraintSatisfaction( if (Conjunction != Satisfaction.IsSatisfied) return Out; } - std::optional NumExpansions = EvaluateFoldExpandedConstraintSize( + UnsignedOrNone NumExpansions = EvaluateFoldExpandedConstraintSize( S, FE, Template, TemplateNameLoc, MLTAL, Satisfaction); if (!NumExpansions) return ExprError(); for (unsigned I = 0; I < *NumExpansions; I++) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(S, I); + Sema::ArgPackSubstIndexRAII SubstIndex(S, I); ExprResult Res = calculateConstraintSatisfaction( S, FE->getPattern(), Template, TemplateNameLoc, MLTAL, Satisfaction); if (Res.isInvalid()) @@ -589,8 +588,7 @@ static bool CheckConstraintSatisfaction( return true; for (const AssociatedConstraint &AC : AssociatedConstraints) { - Sema::ArgumentPackSubstitutionIndexRAII _(S, - AC.ArgumentPackSubstitutionIndex); + Sema::ArgPackSubstIndexRAII _(S, AC.ArgPackSubstIndex); ExprResult Res = calculateConstraintSatisfaction( S, Template, TemplateIDRange.getBegin(), TemplateArgsLists, AC.ConstraintExpr, Satisfaction); @@ -1406,7 +1404,7 @@ substituteParameterMappings(Sema &S, NormalizedConstraint &N, } if (N.isFoldExpanded()) { - Sema::ArgumentPackSubstitutionIndexRAII _(S, -1); + Sema::ArgPackSubstIndexRAII _(S, std::nullopt); return substituteParameterMappings( S, N.getFoldExpandedConstraint()->Constraint, Concept, MLTAL, 
ArgsAsWritten); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index a675feaf50ce3..d630f9bd409fd 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -16749,7 +16749,7 @@ void Sema::AddKnownFunctionAttributesForReplaceableGlobalAllocationFunction( FD->getDeclName().getCXXOverloadedOperator() != OO_Array_New) return; - std::optional AlignmentParam; + UnsignedOrNone AlignmentParam = std::nullopt; bool IsNothrow = false; if (!FD->isReplaceableGlobalAllocationFunction(&AlignmentParam, &IsNothrow)) return; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 47c472b35463e..07379c6876731 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -1660,8 +1660,8 @@ void Sema::CheckCompleteDecompositionDeclaration(DecompositionDecl *DD) { DD->setInvalidDecl(); } -std::optional Sema::GetDecompositionElementCount(QualType T, - SourceLocation Loc) { +UnsignedOrNone Sema::GetDecompositionElementCount(QualType T, + SourceLocation Loc) { const ASTContext &Ctx = getASTContext(); assert(!T->isDependentType()); @@ -1671,18 +1671,18 @@ std::optional Sema::GetDecompositionElementCount(QualType T, T = Context.getQualifiedType(Unqual, Quals); if (auto *CAT = Ctx.getAsConstantArrayType(T)) - return CAT->getSize().getZExtValue(); + return static_cast(CAT->getSize().getZExtValue()); if (auto *VT = T->getAs()) return VT->getNumElements(); if (T->getAs()) - return 2; + return 2u; llvm::APSInt TupleSize(Ctx.getTypeSize(Ctx.getSizeType())); switch (isTupleLike(*this, Loc, T, TupleSize)) { case IsTupleLike::Error: return std::nullopt; case IsTupleLike::TupleLike: - return TupleSize.getExtValue(); + return static_cast(TupleSize.getExtValue()); case IsTupleLike::NotTupleLike: break; } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 78eba8e262771..e43f5e3f75bfe 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1995,7 +1995,7 @@ Sema::isUnavailableAlignedAllocationFunction(const FunctionDecl &FD) const { return false; if (FD.isDefined()) return false; - std::optional AlignmentParam; + UnsignedOrNone AlignmentParam = std::nullopt; if (FD.isReplaceableGlobalAllocationFunction(&AlignmentParam) && AlignmentParam) return true; @@ -5818,7 +5818,7 @@ static APValue EvaluateSizeTTypeTrait(Sema &S, TypeTrait Kind, case TypeTrait::UTT_StructuredBindingSize: { QualType T = Args[0]->getType(); SourceRange ArgRange = Args[0]->getTypeLoc().getSourceRange(); - std::optional Size = + UnsignedOrNone Size = S.GetDecompositionElementCount(T, ArgRange.getBegin()); if (!Size) { S.Diag(KWLoc, diag::err_arg_is_not_destructurable) << T << ArgRange; @@ -8777,7 +8777,7 @@ static void CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures( // If we have a capture-capable lambda for the variable, go ahead and // capture the variable in that lambda (and all its enclosing lambdas). - if (const std::optional Index = + if (const UnsignedOrNone Index = getStackIndexOfNearestEnclosingCaptureCapableLambda( S.FunctionScopes, Var, S)) S.MarkCaptureUsedInEnclosingContext(Var, VarExpr->getExprLoc(), *Index); @@ -8810,7 +8810,7 @@ static void CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures( if (CurrentLSI->hasPotentialThisCapture()) { // If we have a capture-capable lambda for 'this', go ahead and capture // 'this' in that lambda (and all its enclosing lambdas). 
- if (const std::optional Index = + if (const UnsignedOrNone Index = getStackIndexOfNearestEnclosingCaptureCapableLambda( S.FunctionScopes, /*0 is 'this'*/ nullptr, S)) { const unsigned FunctionScopeIndexOfCapturableLambda = *Index; diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index f38198e1feab8..1183a04d3bf33 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -62,17 +62,16 @@ using namespace sema; /// is at the top of the stack and has the highest index. /// \param VarToCapture - the variable to capture. If NULL, capture 'this'. /// -/// \returns An std::optional Index that if evaluates to 'true' +/// \returns An UnsignedOrNone Index that if evaluates to 'true' /// contains the index (into Sema's FunctionScopeInfo stack) of the innermost /// lambda which is capture-ready. If the return value evaluates to 'false' /// then no lambda is capture-ready for \p VarToCapture. -static inline std::optional -getStackIndexOfNearestEnclosingCaptureReadyLambda( +static inline UnsignedOrNone getStackIndexOfNearestEnclosingCaptureReadyLambda( ArrayRef FunctionScopes, ValueDecl *VarToCapture) { // Label failure to capture. - const std::optional NoLambdaIsCaptureReady; + const UnsignedOrNone NoLambdaIsCaptureReady = std::nullopt; // Ignore all inner captured regions. unsigned CurScopeIndex = FunctionScopes.size() - 1; @@ -173,19 +172,18 @@ getStackIndexOfNearestEnclosingCaptureReadyLambda( /// \param VarToCapture - the variable to capture. If NULL, capture 'this'. /// /// -/// \returns An std::optional Index that if evaluates to 'true' +/// \returns An UnsignedOrNone Index that if evaluates to 'true' /// contains the index (into Sema's FunctionScopeInfo stack) of the innermost /// lambda which is capture-capable. If the return value evaluates to 'false' /// then no lambda is capture-capable for \p VarToCapture. -std::optional -clang::getStackIndexOfNearestEnclosingCaptureCapableLambda( +UnsignedOrNone clang::getStackIndexOfNearestEnclosingCaptureCapableLambda( ArrayRef FunctionScopes, ValueDecl *VarToCapture, Sema &S) { - const std::optional NoLambdaIsCaptureCapable; + const UnsignedOrNone NoLambdaIsCaptureCapable = std::nullopt; - const std::optional OptionalStackIndex = + const UnsignedOrNone OptionalStackIndex = getStackIndexOfNearestEnclosingCaptureReadyLambda(FunctionScopes, VarToCapture); if (!OptionalStackIndex) @@ -808,8 +806,8 @@ void Sema::deduceClosureReturnType(CapturingScopeInfo &CSI) { QualType Sema::buildLambdaInitCaptureInitialization( SourceLocation Loc, bool ByRef, SourceLocation EllipsisLoc, - std::optional NumExpansions, IdentifierInfo *Id, - bool IsDirectInit, Expr *&Init) { + UnsignedOrNone NumExpansions, IdentifierInfo *Id, bool IsDirectInit, + Expr *&Init) { // Create an 'auto' or 'auto&' TypeSourceInfo that we can use to // deduce against. QualType DeductType = Context.getAutoDeductType(); @@ -1462,9 +1460,8 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, CompleteLambdaCallOperator( Method, Intro.Range.getBegin(), CallOperatorLoc, - AssociatedConstraint(ParamInfo.getTrailingRequiresClause(), - /*ArgumentPackSubstitutionIndex=*/-1), - MethodTyInfo, ParamInfo.getDeclSpec().getConstexprSpecifier(), + AssociatedConstraint(ParamInfo.getTrailingRequiresClause()), MethodTyInfo, + ParamInfo.getDeclSpec().getConstexprSpecifier(), IsLambdaStatic ? 
   CheckCXXDefaultArguments(Method);
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 231e73fdbc9ec..819fb0853e8f5 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -285,7 +285,8 @@ void SemaOpenACC::AssociatedStmtRAII::SetTileInfoBeforeAssociatedStmt(
   OpenACCTileClause *TileClause = cast<OpenACCTileClause>(*TileClauseItr);
   SemaRef.TileInfo.ActiveTile = TileClause;
   SemaRef.TileInfo.TileDepthSatisfied = false;
-  SemaRef.TileInfo.CurTileCount = TileClause->getSizeExprs().size();
+  SemaRef.TileInfo.CurTileCount =
+      static_cast<unsigned>(TileClause->getSizeExprs().size());
   SemaRef.TileInfo.DirectiveKind = DirKind;
 }
 
@@ -911,7 +912,7 @@ void SemaOpenACC::ForStmtBeginHelper(SourceLocation ForLoc,
                diag::note_acc_active_clause_here)
           << OpenACCClauseKind::Tile;
     } else {
-      --(*TileInfo.CurTileCount);
+      TileInfo.CurTileCount = *TileInfo.CurTileCount - 1;
       // Once we've hit zero here, we know we have deep enough 'for' loops to
       // get to the bottom.
       if (*TileInfo.CurTileCount == 0)
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index d9a79bc802b56..0564557738170 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -1002,7 +1002,7 @@ const TemplateArgument *DeductionFailureInfo::getSecondArg() {
   return nullptr;
 }
 
-std::optional<unsigned> DeductionFailureInfo::getCallArgIndex() {
+UnsignedOrNone DeductionFailureInfo::getCallArgIndex() {
   switch (static_cast<TemplateDeductionResult>(Result)) {
   case TemplateDeductionResult::DeducedMismatch:
   case TemplateDeductionResult::DeducedMismatchNested:
@@ -1555,8 +1555,7 @@ static bool IsOverloadOrOverrideImpl(Sema &SemaRef, FunctionDecl *New,
     OldRC = Old->getTrailingRequiresClause();
   if (!NewRC != !OldRC)
     return true;
-  if (NewRC.ArgumentPackSubstitutionIndex !=
-      OldRC.ArgumentPackSubstitutionIndex)
+  if (NewRC.ArgPackSubstIndex != OldRC.ArgPackSubstIndex)
     return true;
   if (NewRC &&
       !SemaRef.AreConstraintExpressionsEqual(OldDecl, OldRC.ConstraintExpr,
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 8cf65ebe03c07..153f44f8ec67a 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -916,7 +916,7 @@ static TemplateArgumentLoc translateTemplateArgument(Sema &SemaRef,
     TemplateName Template = Arg.getAsTemplate().get();
     TemplateArgument TArg;
     if (Arg.getEllipsisLoc().isValid())
-      TArg = TemplateArgument(Template, std::optional<unsigned int>());
+      TArg = TemplateArgument(Template, /*NumExpansions=*/std::nullopt);
     else
       TArg = Template;
     return TemplateArgumentLoc(
@@ -1220,9 +1220,8 @@ bool Sema::AttachTypeConstraint(NestedNameSpecifierLoc NS,
       /*FoundDecl=*/FoundDecl,
       /*NamedConcept=*/NamedConcept,
      /*ArgsWritten=*/ArgsAsWritten);
-  ConstrainedParameter->setTypeConstraint(CL,
-                                          ImmediatelyDeclaredConstraint.get(),
-                                          /*ArgumentPackSubstitutionIndex=*/-1);
+  ConstrainedParameter->setTypeConstraint(
+      CL, ImmediatelyDeclaredConstraint.get(), std::nullopt);
   return false;
 }
 
@@ -5244,8 +5243,7 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param, TemplateArgumentLoc &ArgLoc,
                                        /*Final=*/true);
     // If the parameter is a pack expansion, expand this slice of the pack.
     if (auto *PET = NTTPType->getAs<PackExpansionType>()) {
-      Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this,
-                                                         ArgumentPackIndex);
+      Sema::ArgPackSubstIndexRAII SubstIndex(*this, ArgumentPackIndex);
       NTTPType = SubstType(PET->getPattern(), MLTAL, NTTP->getLocation(),
                            NTTP->getDeclName());
     } else {
@@ -5563,7 +5561,7 @@ bool Sema::CheckTemplateArgumentList(
 
       // If we have an expanded parameter pack, make sure we don't have too
       // many arguments.
-      if (std::optional<unsigned> Expansions = getExpandedPackSize(*Param)) {
+      if (UnsignedOrNone Expansions = getExpandedPackSize(*Param)) {
         if (*Expansions == SugaredArgumentPack.size()) {
           // We're done with this parameter pack. Pack up its arguments and add
           // them to the list.
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 9969f1762fe36..ab6e18aee7206 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -834,7 +834,7 @@ class PackDeductionScope {
       // FIXME: What if we encounter multiple packs with different numbers of
       // pre-expanded expansions? (This should already have been diagnosed
       // during substitution.)
-      if (std::optional<unsigned> ExpandedPackExpansions =
+      if (UnsignedOrNone ExpandedPackExpansions =
               getExpandedPackSize(TemplateParams->getParam(Index)))
         FixedNumExpansions = ExpandedPackExpansions;
 
@@ -961,14 +961,14 @@ class PackDeductionScope {
   }
 
   // Return the size of the saved packs if all of them has the same size.
-  std::optional<unsigned> getSavedPackSizeIfAllEqual() const {
+  UnsignedOrNone getSavedPackSizeIfAllEqual() const {
     unsigned PackSize = Packs[0].Saved.pack_size();
 
     if (std::all_of(Packs.begin() + 1, Packs.end(), [&PackSize](const auto &P) {
           return P.Saved.pack_size() == PackSize;
         }))
       return PackSize;
-    return {};
+    return std::nullopt;
   }
 
   /// Determine whether this pack has already been deduced from a previous
@@ -984,7 +984,7 @@ class PackDeductionScope {
   /// Determine whether this pack expansion scope has a known, fixed arity.
   /// This happens if it involves a pack from an outer template that has
   /// (notionally) already been expanded.
-  bool hasFixedArity() { return FixedNumExpansions.has_value(); }
+  bool hasFixedArity() { return static_cast<bool>(FixedNumExpansions); }
 
   /// Determine whether the next element of the argument is still part of this
   /// pack. This is the case unless the pack is already expanded to a fixed
@@ -1105,7 +1105,7 @@ class PackDeductionScope {
 
     // If we have a pre-expanded pack and we didn't deduce enough elements
     // for it, fail deduction.
-    if (std::optional<unsigned> Expansions = getExpandedPackSize(Param)) {
+    if (UnsignedOrNone Expansions = getExpandedPackSize(Param)) {
       if (*Expansions != PackElements) {
         Info.Param = makeTemplateParameter(Param);
         Info.FirstArg = Result;
@@ -1130,7 +1130,7 @@ class PackDeductionScope {
   bool DeducedFromEarlierParameter = false;
   bool FinishingDeduction = false;
   /// The number of expansions, if we have a fully-expanded pack in this scope.
-  std::optional<unsigned> FixedNumExpansions;
+  UnsignedOrNone FixedNumExpansions = std::nullopt;
 
   SmallVector<DeducedPack, 2> Packs;
 };
 
@@ -1222,7 +1222,7 @@ static TemplateDeductionResult DeduceForEachType(
       // If the parameter type contains an explicitly-specified pack that we
       // could not expand, skip the number of parameters notionally created
      // by the expansion.
- std::optional NumExpansions = Expansion->getNumExpansions(); + UnsignedOrNone NumExpansions = Expansion->getNumExpansions(); if (NumExpansions && !PackScope.isPartiallyExpanded()) { for (unsigned I = 0; I != *NumExpansions && ArgIdx < Args.size(); ++I, ++ArgIdx) @@ -3735,7 +3735,7 @@ TemplateDeductionResult Sema::SubstituteExplicitTemplateArguments( auto *Param = TemplateParams->getParam(CTAI.SugaredConverted.size() - 1); // If this is a fully-saturated fixed-size pack, it should be // fully-substituted, not partially-substituted. - std::optional Expansions = getExpandedPackSize(Param); + UnsignedOrNone Expansions = getExpandedPackSize(Param); if (!Expansions || Arg.pack_size() < *Expansions) { PartiallySubstitutedPackIndex = CTAI.SugaredConverted.size() - 1; CurrentInstantiationScope->SetPartiallySubstitutedPack( @@ -3967,21 +3967,22 @@ CheckOriginalCallArgDeduction(Sema &S, TemplateDeductionInfo &Info, /// ArgumentPackSubstitutionIndex for further substitutions. // FIXME: We should track this in OriginalCallArgs so we don't need to // reconstruct it here. -static unsigned getPackIndexForParam(Sema &S, - FunctionTemplateDecl *FunctionTemplate, - const MultiLevelTemplateArgumentList &Args, - unsigned ParamIdx) { +static UnsignedOrNone +getPackIndexForParam(Sema &S, FunctionTemplateDecl *FunctionTemplate, + const MultiLevelTemplateArgumentList &Args, + unsigned ParamIdx) { unsigned Idx = 0; for (auto *PD : FunctionTemplate->getTemplatedDecl()->parameters()) { if (PD->isParameterPack()) { - unsigned NumExpansions = - S.getNumArgumentsInExpansion(PD->getType(), Args).value_or(1); + UnsignedOrNone NumArgs = + S.getNumArgumentsInExpansion(PD->getType(), Args); + unsigned NumExpansions = NumArgs ? *NumArgs : 1; if (Idx + NumExpansions > ParamIdx) return ParamIdx - Idx; Idx += NumExpansions; } else { if (Idx == ParamIdx) - return -1; // Not a pack expansion + return std::nullopt; // Not a pack expansion ++Idx; } } @@ -4183,7 +4184,7 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( QualType &CacheEntry = DeducedATypes[{ParamIdx, OriginalArg.OriginalParamType}]; if (CacheEntry.isNull()) { - ArgumentPackSubstitutionIndexRAII PackIndex( + ArgPackSubstIndexRAII PackIndex( *this, getPackIndexForParam(*this, FunctionTemplate, SubstArgs, ParamIdx)); CacheEntry = @@ -4743,8 +4744,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( // If the parameter type contains an explicitly-specified pack that we // could not expand, skip the number of parameters notionally created // by the expansion. - std::optional NumExpansions = - ParamExpansion->getNumExpansions(); + UnsignedOrNone NumExpansions = ParamExpansion->getNumExpansions(); if (NumExpansions && !PackScope.isPartiallyExpanded()) { for (unsigned I = 0; I != *NumExpansions && ArgIdx < Args.size(); ++I, ++ArgIdx) { @@ -4767,7 +4767,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( // that, in this case we are not processing all of the remaining // arguments. We are only process as many arguments as we have in // the already deduced parameter. 
- std::optional ArgPosAfterSubstitution = + UnsignedOrNone ArgPosAfterSubstitution = PackScope.getSavedPackSizeIfAllEqual(); if (!ArgPosAfterSubstitution) continue; @@ -5628,12 +5628,13 @@ static QualType GetImplicitObjectParameterType(ASTContext &Context, } static TemplateDeductionResult CheckDeductionConsistency( - Sema &S, FunctionTemplateDecl *FTD, int ArgIdx, QualType P, QualType A, - ArrayRef DeducedArgs, bool CheckConsistency) { + Sema &S, FunctionTemplateDecl *FTD, UnsignedOrNone ArgIdx, QualType P, + QualType A, ArrayRef DeducedArgs, bool CheckConsistency) { MultiLevelTemplateArgumentList MLTAL(FTD, DeducedArgs, /*Final=*/true); - Sema::ArgumentPackSubstitutionIndexRAII PackIndex( - S, ArgIdx != -1 ? ::getPackIndexForParam(S, FTD, MLTAL, ArgIdx) : -1); + Sema::ArgPackSubstIndexRAII PackIndex( + S, + ArgIdx ? ::getPackIndexForParam(S, FTD, MLTAL, *ArgIdx) : std::nullopt); bool IsIncompleteSubstitution = false; // FIXME: A substitution can be incomplete on a non-structural part of the // type. Use the canonical type for now, until the TemplateInstantiator can @@ -5785,8 +5786,9 @@ static bool isAtLeastAsSpecializedAs( // in deduction. We will still try to substitute them though. if (TPOC != TPOC_Call) { if (auto TDR = ::CheckDeductionConsistency( - S, FTD, /*ArgIdx=*/-1, Proto2->getReturnType(), - Proto1->getReturnType(), DeducedArgs, + S, FTD, /*ArgIdx=*/std::nullopt, + Proto2->getReturnType(), Proto1->getReturnType(), + DeducedArgs, /*CheckConsistency=*/HasDeducedAnyParamFromReturnType); TDR != TemplateDeductionResult::Success) return TDR; @@ -5799,12 +5801,14 @@ static bool isAtLeastAsSpecializedAs( S, TemplateParams, Args2, Args1, Info, Deduced, PartialOrderingKind::Call, /*FinishingDeduction=*/true, [&](Sema &S, TemplateParameterList *, int ParamIdx, - int ArgIdx, QualType P, QualType A, + UnsignedOrNone ArgIdx, QualType P, QualType A, TemplateDeductionInfo &Info, SmallVectorImpl &Deduced, PartialOrderingKind) { - if (ArgIdx != -1) - ArgIdx -= Args1Offset; + if (ArgIdx && *ArgIdx >= Args1Offset) + ArgIdx = *ArgIdx - Args1Offset; + else + ArgIdx = std::nullopt; return ::CheckDeductionConsistency( S, FTD, ArgIdx, P, A, DeducedArgs, /*CheckConsistency=*/HasDeducedParam[ParamIdx]); diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index 99bd9d0fb79af..b4863cefc3fb4 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -252,9 +252,7 @@ TemplateTypeParmDecl *transformTemplateTypeParam( SemaRef.Context, DC, TTP->getBeginLoc(), TTP->getLocation(), NewDepth, NewIndex, TTP->getIdentifier(), TTP->wasDeclaredWithTypename(), TTP->isParameterPack(), TTP->hasTypeConstraint(), - TTP->isExpandedParameterPack() - ? 
std::optional(TTP->getNumExpansionParameters()) - : std::nullopt); + TTP->getNumExpansionParameters()); if (const auto *TC = TTP->getTypeConstraint()) SemaRef.SubstTypeConstraint(NewTTP, TC, Args, /*EvaluateConstraint=*/EvaluateConstraint); @@ -356,8 +354,7 @@ struct ConvertConstructorToDeductionGuideTransform { TemplateParameterList *TemplateParams = SemaRef.GetTemplateParameterList(Template); SmallVector Depth1Args; - AssociatedConstraint OuterRC(TemplateParams->getRequiresClause(), - /*ArgumentPackSubstitutionIndex=*/-1); + AssociatedConstraint OuterRC(TemplateParams->getRequiresClause()); if (FTD) { TemplateParameterList *InnerParams = FTD->getTemplateParameters(); SmallVector AllParams; @@ -469,8 +466,7 @@ struct ConvertConstructorToDeductionGuideTransform { const_cast(RC.ConstraintExpr), Args); if (!E.isUsable()) return nullptr; - FunctionTrailingRC = - AssociatedConstraint(E.get(), RC.ArgumentPackSubstitutionIndex); + FunctionTrailingRC = AssociatedConstraint(E.get(), RC.ArgPackSubstIndex); } // C++ [over.match.class.deduct]p1: @@ -495,7 +491,7 @@ struct ConvertConstructorToDeductionGuideTransform { /*rhs=*/const_cast(FunctionTrailingRC.ConstraintExpr), BO_LAnd, SemaRef.Context.BoolTy, VK_PRValue, OK_Ordinary, TemplateParams->getTemplateLoc(), FPOptionsOverride()), - FunctionTrailingRC.ArgumentPackSubstitutionIndex); + FunctionTrailingRC.ArgPackSubstIndex); } return buildDeductionGuide( @@ -630,7 +626,7 @@ struct ConvertConstructorToDeductionGuideTransform { TypeSourceInfo *NewDI; if (auto PackTL = OldDI->getTypeLoc().getAs()) { // Expand out the one and only element in each inner pack. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, 0); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, 0u); NewDI = SemaRef.SubstType(PackTL.getPatternLoc(), Args, OldParam->getLocation(), OldParam->getDeclName()); @@ -1247,10 +1243,8 @@ void DeclareImplicitDeductionGuidesForTypeAlias( // FIXME: Here the synthesized deduction guide is not a templated // function. Per [dcl.decl]p4, the requires-clause shall be present only // if the declarator declares a templated function, a bug in standard? - AssociatedConstraint Constraint( - buildIsDeducibleConstraint(SemaRef, AliasTemplate, - Transformed->getReturnType(), {}), - /*ArgumentPackSubstitutionIndex=*/-1); + AssociatedConstraint Constraint(buildIsDeducibleConstraint( + SemaRef, AliasTemplate, Transformed->getReturnType(), {})); if (const AssociatedConstraint &RC = DG->getTrailingRequiresClause()) { auto Conjunction = SemaRef.BuildBinOp( SemaRef.getCurScope(), SourceLocation{}, @@ -1258,8 +1252,7 @@ void DeclareImplicitDeductionGuidesForTypeAlias( const_cast(Constraint.ConstraintExpr)); if (!Conjunction.isInvalid()) { Constraint.ConstraintExpr = Conjunction.getAs(); - Constraint.ArgumentPackSubstitutionIndex = - RC.ArgumentPackSubstitutionIndex; + Constraint.ArgPackSubstIndex = RC.ArgPackSubstIndex; } } Transformed->setTrailingRequiresClause(Constraint); @@ -1355,8 +1348,7 @@ FunctionTemplateDecl *Sema::DeclareAggregateDeductionGuideFromInitList( // In case we were expanding a pack when we attempted to declare deduction // guides, turn off pack expansion for everything we're about to do. - ArgumentPackSubstitutionIndexRAII SubstIndex(*this, - /*NewSubstitutionIndex=*/-1); + ArgPackSubstIndexRAII SubstIndex(*this, std::nullopt); // Create a template instantiation record to track the "instantiation" of // constructors into deduction guides. 
InstantiatingTemplate BuildingDeductionGuides( @@ -1405,7 +1397,7 @@ void Sema::DeclareImplicitDeductionGuides(TemplateDecl *Template, // In case we were expanding a pack when we attempted to declare deduction // guides, turn off pack expansion for everything we're about to do. - ArgumentPackSubstitutionIndexRAII SubstIndex(*this, -1); + ArgPackSubstIndexRAII SubstIndex(*this, std::nullopt); // Create a template instantiation record to track the "instantiation" of // constructors into deduction guides. InstantiatingTemplate BuildingDeductionGuides( diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index d835b3b06893d..dd493a083d86d 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1350,9 +1350,9 @@ std::optional Sema::isSFINAEContext() const { static TemplateArgument getPackSubstitutedTemplateArgument(Sema &S, TemplateArgument Arg) { - assert(S.ArgumentPackSubstitutionIndex >= 0); - assert(S.ArgumentPackSubstitutionIndex < (int)Arg.pack_size()); - Arg = Arg.pack_begin()[S.ArgumentPackSubstitutionIndex]; + assert(S.ArgPackSubstIndex); + assert(*S.ArgPackSubstIndex < Arg.pack_size()); + Arg = Arg.pack_begin()[*S.ArgPackSubstIndex]; if (Arg.isPackExpansion()) Arg = Arg.getPackExpansionPattern(); return Arg; @@ -1418,18 +1418,18 @@ namespace { return TemplateArgs.getNewDepth(Depth); } - std::optional getPackIndex(TemplateArgument Pack) { - int Index = getSema().ArgumentPackSubstitutionIndex; - if (Index == -1) + UnsignedOrNone getPackIndex(TemplateArgument Pack) { + UnsignedOrNone Index = getSema().ArgPackSubstIndex; + if (!Index) return std::nullopt; - return Pack.pack_size() - 1 - Index; + return Pack.pack_size() - 1 - *Index; } bool TryExpandParameterPacks(SourceLocation EllipsisLoc, SourceRange PatternRange, ArrayRef Unexpanded, bool &ShouldExpand, bool &RetainExpansion, - std::optional &NumExpansions) { + UnsignedOrNone &NumExpansions) { return getSema().CheckParameterPacksForExpansion(EllipsisLoc, PatternRange, Unexpanded, TemplateArgs, @@ -1481,7 +1481,7 @@ namespace { getTemplateArgumentPackPatternForRewrite(const TemplateArgument &TA) { if (TA.getKind() != TemplateArgument::Pack) return TA; - if (SemaRef.ArgumentPackSubstitutionIndex != -1) + if (SemaRef.ArgPackSubstIndex) return getPackSubstitutedTemplateArgument(SemaRef, TA); assert(TA.pack_size() == 1 && TA.pack_begin()->isPackExpansion() && "unexpected pack arguments in template rewrite"); @@ -1667,7 +1667,7 @@ namespace { return inherited::TransformTemplateArgument(Input, Output, Uneval); } - std::optional ComputeSizeOfPackExprWithoutSubstitution( + UnsignedOrNone ComputeSizeOfPackExprWithoutSubstitution( ArrayRef PackArgs) { // Don't do this when rewriting template parameters for CTAD: // 1) The heuristic needs the unpacked Subst* nodes to figure out the @@ -1691,10 +1691,10 @@ namespace { Qualifiers ThisTypeQuals, Fn TransformExceptionSpec); - ParmVarDecl * - TransformFunctionTypeParam(ParmVarDecl *OldParm, int indexAdjustment, - std::optional NumExpansions, - bool ExpectParameterPack); + ParmVarDecl *TransformFunctionTypeParam(ParmVarDecl *OldParm, + int indexAdjustment, + UnsignedOrNone NumExpansions, + bool ExpectParameterPack); using inherited::TransformTemplateTypeParmType; /// Transforms a template type parameter type by performing @@ -1705,7 +1705,7 @@ namespace { QualType BuildSubstTemplateTypeParmType( TypeLocBuilder &TLB, bool SuppressObjCLifetime, bool Final, - Decl *AssociatedDecl, unsigned Index, 
std::optional PackIndex, + Decl *AssociatedDecl, unsigned Index, UnsignedOrNone PackIndex, TemplateArgument Arg, SourceLocation NameLoc); /// Transforms an already-substituted template type parameter pack @@ -1865,10 +1865,11 @@ namespace { Sema::ExtParameterInfoBuilder &PInfos); private: - ExprResult transformNonTypeTemplateParmRef( - Decl *AssociatedDecl, const NonTypeTemplateParmDecl *parm, - SourceLocation loc, TemplateArgument arg, - std::optional PackIndex, bool Final); + ExprResult + transformNonTypeTemplateParmRef(Decl *AssociatedDecl, + const NonTypeTemplateParmDecl *parm, + SourceLocation loc, TemplateArgument arg, + UnsignedOrNone PackIndex, bool Final); }; } @@ -1957,7 +1958,7 @@ TemplateInstantiator::TransformFirstQualifierInScope(NamedDecl *D, assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - if (getSema().ArgumentPackSubstitutionIndex == -1) + if (!getSema().ArgPackSubstIndex) return nullptr; Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); @@ -2062,12 +2063,12 @@ TemplateName TemplateInstantiator::TransformTemplateName( auto [AssociatedDecl, Final] = TemplateArgs.getAssociatedDecl(TTP->getDepth()); - std::optional PackIndex; + UnsignedOrNone PackIndex = std::nullopt; if (TTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - if (getSema().ArgumentPackSubstitutionIndex == -1) { + if (!getSema().ArgPackSubstIndex) { // We have the template argument pack to substitute, but we're not // actually expanding the enclosing pack expansion yet. So, just // keep the entire argument pack. @@ -2089,7 +2090,7 @@ TemplateName TemplateInstantiator::TransformTemplateName( if (SubstTemplateTemplateParmPackStorage *SubstPack = Name.getAsSubstTemplateTemplateParmPack()) { - if (getSema().ArgumentPackSubstitutionIndex == -1) + if (!getSema().ArgPackSubstIndex) return Name; TemplateArgument Pack = SubstPack->getArgumentPack(); @@ -2141,12 +2142,12 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, auto [AssociatedDecl, Final] = TemplateArgs.getAssociatedDecl(NTTP->getDepth()); - std::optional PackIndex; + UnsignedOrNone PackIndex = std::nullopt; if (NTTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - if (getSema().ArgumentPackSubstitutionIndex == -1) { + if (!getSema().ArgPackSubstIndex) { // We have an argument pack, but we can't select a particular argument // out of it yet. Therefore, we'll build an expression to hold on to that // argument pack. 
@@ -2262,7 +2263,7 @@ TemplateInstantiator::TransformOpenACCRoutineDeclAttr( ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( Decl *AssociatedDecl, const NonTypeTemplateParmDecl *parm, - SourceLocation loc, TemplateArgument arg, std::optional PackIndex, + SourceLocation loc, TemplateArgument arg, UnsignedOrNone PackIndex, bool Final) { ExprResult result; @@ -2271,7 +2272,7 @@ ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( auto SubstParamType = [&] { QualType T; if (parm->isExpandedParameterPack()) - T = parm->getExpansionType(SemaRef.ArgumentPackSubstitutionIndex); + T = parm->getExpansionType(*SemaRef.ArgPackSubstIndex); else T = parm->getType(); if (parm->isParameterPack() && isa(T)) @@ -2337,7 +2338,7 @@ ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( ExprResult TemplateInstantiator::TransformSubstNonTypeTemplateParmPackExpr( SubstNonTypeTemplateParmPackExpr *E) { - if (getSema().ArgumentPackSubstitutionIndex == -1) { + if (!getSema().ArgPackSubstIndex) { // We aren't expanding the parameter pack, so just return ourselves. return E; } @@ -2401,9 +2402,9 @@ ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(ValueDecl *PD, ExprResult TemplateInstantiator::TransformFunctionParmPackExpr(FunctionParmPackExpr *E) { - if (getSema().ArgumentPackSubstitutionIndex != -1) { + if (getSema().ArgPackSubstIndex) { // We can expand this parameter pack now. - ValueDecl *D = E->getExpansion(getSema().ArgumentPackSubstitutionIndex); + ValueDecl *D = E->getExpansion(*getSema().ArgPackSubstIndex); ValueDecl *VD = cast_or_null(TransformDecl(E->getExprLoc(), D)); if (!VD) return ExprError(); @@ -2445,7 +2446,7 @@ TemplateInstantiator::TransformFunctionParmPackRefExpr(DeclRefExpr *E, if (DeclArgumentPack *Pack = dyn_cast(*Found)) { // If this is a reference to a function parameter pack which we can // substitute but can't yet expand, build a FunctionParmPackExpr for it. 
- if (getSema().ArgumentPackSubstitutionIndex == -1) { + if (!getSema().ArgPackSubstIndex) { QualType T = TransformType(E->getType()); if (T.isNull()) return ExprError(); @@ -2455,7 +2456,7 @@ TemplateInstantiator::TransformFunctionParmPackRefExpr(DeclRefExpr *E, return PackExpr; } - TransformedDecl = (*Pack)[getSema().ArgumentPackSubstitutionIndex]; + TransformedDecl = (*Pack)[*getSema().ArgPackSubstIndex]; } else { TransformedDecl = cast(*Found); } @@ -2520,8 +2521,8 @@ QualType TemplateInstantiator::TransformFunctionProtoType(TypeLocBuilder &TLB, } ParmVarDecl *TemplateInstantiator::TransformFunctionTypeParam( - ParmVarDecl *OldParm, int indexAdjustment, - std::optional NumExpansions, bool ExpectParameterPack) { + ParmVarDecl *OldParm, int indexAdjustment, UnsignedOrNone NumExpansions, + bool ExpectParameterPack) { auto NewParm = SemaRef.SubstParmVarDecl( OldParm, TemplateArgs, indexAdjustment, NumExpansions, ExpectParameterPack, EvaluateConstraints); @@ -2532,7 +2533,7 @@ ParmVarDecl *TemplateInstantiator::TransformFunctionTypeParam( QualType TemplateInstantiator::BuildSubstTemplateTypeParmType( TypeLocBuilder &TLB, bool SuppressObjCLifetime, bool Final, - Decl *AssociatedDecl, unsigned Index, std::optional PackIndex, + Decl *AssociatedDecl, unsigned Index, UnsignedOrNone PackIndex, TemplateArgument Arg, SourceLocation NameLoc) { QualType Replacement = Arg.getAsType(); @@ -2594,12 +2595,12 @@ TemplateInstantiator::TransformTemplateTypeParmType(TypeLocBuilder &TLB, auto [AssociatedDecl, Final] = TemplateArgs.getAssociatedDecl(T->getDepth()); - std::optional PackIndex; + UnsignedOrNone PackIndex = std::nullopt; if (T->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - if (getSema().ArgumentPackSubstitutionIndex == -1) { + if (!getSema().ArgPackSubstIndex) { // We have the template argument pack, but we're not expanding the // enclosing pack expansion yet. Just save the template argument // pack for later substitution. @@ -2647,7 +2648,7 @@ QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( Decl *NewReplaced = TransformDecl(TL.getNameLoc(), T->getAssociatedDecl()); - if (getSema().ArgumentPackSubstitutionIndex == -1) { + if (!getSema().ArgPackSubstIndex) { // We aren't expanding the parameter pack, so just return ourselves. 
QualType Result = TL.getType(); if (NewReplaced != T->getAssociatedDecl()) @@ -2866,7 +2867,7 @@ TemplateInstantiator::TransformNestedRequirement( if (!SemaRef.CheckConstraintSatisfaction( nullptr, AssociatedConstraint(Req->getConstraintExpr(), - SemaRef.ArgumentPackSubstitutionIndex), + SemaRef.ArgPackSubstIndex), Result, TemplateArgs, Req->getConstraintExpr()->getSourceRange(), Satisfaction) && !Result.empty()) @@ -3150,9 +3151,9 @@ bool Sema::SubstTypeConstraint( TC->getTemplateArgsAsWritten(); if (!EvaluateConstraints) { - auto Index = TC->getArgumentPackSubstitutionIndex(); - if (Index == -1) - Index = SemaRef.ArgumentPackSubstitutionIndex; + UnsignedOrNone Index = TC->getArgPackSubstIndex(); + if (!Index) + Index = SemaRef.ArgPackSubstIndex; Inst->setTypeConstraint(TC->getConceptReference(), TC->getImmediatelyDeclaredConstraint(), Index); return false; @@ -3177,10 +3178,11 @@ bool Sema::SubstTypeConstraint( : SourceLocation()); } -ParmVarDecl *Sema::SubstParmVarDecl( - ParmVarDecl *OldParm, const MultiLevelTemplateArgumentList &TemplateArgs, - int indexAdjustment, std::optional NumExpansions, - bool ExpectParameterPack, bool EvaluateConstraint) { +ParmVarDecl * +Sema::SubstParmVarDecl(ParmVarDecl *OldParm, + const MultiLevelTemplateArgumentList &TemplateArgs, + int indexAdjustment, UnsignedOrNone NumExpansions, + bool ExpectParameterPack, bool EvaluateConstraint) { TypeSourceInfo *OldDI = OldParm->getTypeSourceInfo(); TypeSourceInfo *NewDI = nullptr; @@ -3422,7 +3424,7 @@ Sema::SubstBaseSpecifiers(CXXRecordDecl *Instantiation, Unexpanded); bool ShouldExpand = false; bool RetainExpansion = false; - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (CheckParameterPacksForExpansion(Base.getEllipsisLoc(), Base.getSourceRange(), Unexpanded, @@ -3436,7 +3438,7 @@ Sema::SubstBaseSpecifiers(CXXRecordDecl *Instantiation, // If we should expand this pack expansion now, do so. if (ShouldExpand) { for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this, I); + Sema::ArgPackSubstIndexRAII SubstIndex(*this, I); TypeSourceInfo *BaseTypeLoc = SubstType(Base.getTypeSourceInfo(), TemplateArgs, @@ -3464,7 +3466,7 @@ Sema::SubstBaseSpecifiers(CXXRecordDecl *Instantiation, // The resulting base specifier will (still) be a pack expansion. EllipsisLoc = Base.getEllipsisLoc(); - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this, -1); + Sema::ArgPackSubstIndexRAII SubstIndex(*this, std::nullopt); BaseTypeLoc = SubstType(Base.getTypeSourceInfo(), TemplateArgs, Base.getSourceRange().getBegin(), diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index fcb4ee5650f91..2d6f2ca67af8a 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -130,7 +130,7 @@ static void instantiateDependentAlignedAttr( // Determine whether we can expand this attribute pack yet. bool Expand = true, RetainExpansion = false; - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; // FIXME: Use the actual location of the ellipsis. 
SourceLocation EllipsisLoc = Aligned->getLocation(); if (S.CheckParameterPacksForExpansion(EllipsisLoc, Aligned->getRange(), @@ -139,11 +139,11 @@ static void instantiateDependentAlignedAttr( return; if (!Expand) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(S, -1); + Sema::ArgPackSubstIndexRAII SubstIndex(S, std::nullopt); instantiateDependentAlignedAttr(S, TemplateArgs, Aligned, New, true); } else { for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(S, I); + Sema::ArgPackSubstIndexRAII SubstIndex(S, I); instantiateDependentAlignedAttr(S, TemplateArgs, Aligned, New, false); } } @@ -1888,7 +1888,7 @@ Decl *TemplateDeclInstantiator::VisitFriendDecl(FriendDecl *D) { bool ShouldExpand = true; bool RetainExpansion = false; - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (SemaRef.CheckParameterPacksForExpansion( D->getEllipsisLoc(), D->getSourceRange(), Unexpanded, TemplateArgs, ShouldExpand, RetainExpansion, NumExpansions)) @@ -1900,7 +1900,7 @@ Decl *TemplateDeclInstantiator::VisitFriendDecl(FriendDecl *D) { if (ShouldExpand) { SmallVector Decls; for (unsigned I = 0; I != *NumExpansions; I++) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, I); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I); TypeSourceInfo *TSI = SemaRef.SubstType( Ty, TemplateArgs, D->getEllipsisLoc(), DeclarationName()); if (!TSI) @@ -3418,10 +3418,10 @@ Decl *TemplateDeclInstantiator::VisitTemplateTypeParmDecl( TemplateTypeParmDecl *D) { assert(D->getTypeForDecl()->isTemplateTypeParmType()); - std::optional NumExpanded; + UnsignedOrNone NumExpanded = std::nullopt; if (const TypeConstraint *TC = D->getTypeConstraint()) { - if (D->isPackExpansion() && !D->isExpandedParameterPack()) { + if (D->isPackExpansion() && !D->getNumExpansionParameters()) { assert(TC->getTemplateArgsAsWritten() && "type parameter can only be an expansion when explicit arguments " "are specified"); @@ -3528,9 +3528,9 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl( // be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional OrigNumExpansions = + UnsignedOrNone OrigNumExpansions = Expansion.getTypePtr()->getNumExpansions(); - std::optional NumExpansions = OrigNumExpansions; + UnsignedOrNone NumExpansions = OrigNumExpansions; if (SemaRef.CheckParameterPacksForExpansion(Expansion.getEllipsisLoc(), Pattern.getSourceRange(), Unexpanded, @@ -3541,7 +3541,7 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl( if (Expand) { for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, I); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I); TypeSourceInfo *NewDI = SemaRef.SubstType(Pattern, TemplateArgs, D->getLocation(), D->getDeclName()); @@ -3566,7 +3566,7 @@ Decl *TemplateDeclInstantiator::VisitNonTypeTemplateParmDecl( } else { // We cannot fully expand the pack expansion now, so substitute into the // pattern and create a new pack expansion type. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, -1); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, std::nullopt); TypeSourceInfo *NewPattern = SemaRef.SubstType(Pattern, TemplateArgs, D->getLocation(), D->getDeclName()); @@ -3701,7 +3701,7 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( // be expanded. 
bool Expand = true; bool RetainExpansion = false; - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (SemaRef.CheckParameterPacksForExpansion(D->getLocation(), TempParams->getSourceRange(), Unexpanded, @@ -3712,7 +3712,7 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( if (Expand) { for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, I); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I); LocalInstantiationScope Scope(SemaRef); TemplateParameterList *Expansion = SubstTemplateParams(TempParams); if (!Expansion) @@ -3728,7 +3728,7 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( } else { // We cannot fully expand the pack expansion now, so just substitute // into the pattern. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, -1); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, std::nullopt); LocalInstantiationScope Scope(SemaRef); InstParams = SubstTemplateParams(TempParams); @@ -3978,7 +3978,7 @@ Decl *TemplateDeclInstantiator::instantiateUnresolvedUsingDecl( // be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (SemaRef.CheckParameterPacksForExpansion( D->getEllipsisLoc(), D->getSourceRange(), Unexpanded, TemplateArgs, Expand, RetainExpansion, NumExpansions)) @@ -3992,7 +3992,7 @@ Decl *TemplateDeclInstantiator::instantiateUnresolvedUsingDecl( if (!Expand) { // We cannot fully expand the pack expansion now, so substitute into the // pattern and create a new pack expansion. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, -1); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, std::nullopt); return instantiateUnresolvedUsingDecl(D, true); } @@ -4012,7 +4012,7 @@ Decl *TemplateDeclInstantiator::instantiateUnresolvedUsingDecl( // Instantiate the slices of this pack and build a UsingPackDecl. SmallVector Expansions; for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, I); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I); Decl *Slice = instantiateUnresolvedUsingDecl(D, true); if (!Slice) return nullptr; @@ -4046,8 +4046,8 @@ Decl *TemplateDeclInstantiator::instantiateUnresolvedUsingDecl( // Produce a pack expansion only if we're not instantiating a particular // slice of a pack expansion. - bool InstantiatingSlice = D->getEllipsisLoc().isValid() && - SemaRef.ArgumentPackSubstitutionIndex != -1; + bool InstantiatingSlice = + D->getEllipsisLoc().isValid() && SemaRef.ArgPackSubstIndex; SourceLocation EllipsisLoc = InstantiatingSlice ? SourceLocation() : D->getEllipsisLoc(); @@ -4998,7 +4998,7 @@ TemplateDeclInstantiator::SubstFunctionType(FunctionDecl *D, LocalInstantiationScope *Scope = SemaRef.CurrentInstantiationScope; - std::optional NumArgumentsInExpansion; + UnsignedOrNone NumArgumentsInExpansion = std::nullopt; if (OldParam->isParameterPack()) NumArgumentsInExpansion = SemaRef.getNumArgumentsInExpansion(OldParam->getType(), @@ -5125,7 +5125,7 @@ bool Sema::addInstantiatedParametersToScope( // Expand the parameter pack. 
Scope.MakeInstantiatedLocalArgPack(PatternParam); - std::optional NumArgumentsInExpansion = + UnsignedOrNone NumArgumentsInExpansion = getNumArgumentsInExpansion(PatternParam->getType(), TemplateArgs); if (NumArgumentsInExpansion) { QualType PatternType = @@ -5134,7 +5134,7 @@ bool Sema::addInstantiatedParametersToScope( ParmVarDecl *FunctionParam = Function->getParamDecl(FParamIdx); FunctionParam->setDeclName(PatternParam->getDeclName()); if (!PatternDecl->getType()->isDependentType()) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this, Arg); + Sema::ArgPackSubstIndexRAII SubstIndex(*this, Arg); QualType T = SubstType(PatternType, TemplateArgs, FunctionParam->getLocation(), FunctionParam->getDeclName()); @@ -6306,7 +6306,7 @@ Sema::InstantiateMemInitializers(CXXConstructorDecl *New, collectUnexpandedParameterPacks(Init->getInit(), Unexpanded); bool ShouldExpand = false; bool RetainExpansion = false; - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (CheckParameterPacksForExpansion(Init->getEllipsisLoc(), BaseTL.getSourceRange(), Unexpanded, @@ -6321,7 +6321,7 @@ Sema::InstantiateMemInitializers(CXXConstructorDecl *New, // Loop over all of the arguments in the argument pack(s), for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(*this, I); + Sema::ArgPackSubstIndexRAII SubstIndex(*this, I); // Instantiate the initializer. ExprResult TempInit = SubstInitializer(Init->getInit(), TemplateArgs, @@ -6694,18 +6694,17 @@ NamedDecl *Sema::FindInstantiatedDecl(SourceLocation Loc, NamedDecl *D, if (auto Found = CurrentInstantiationScope->findInstantiationOf(D)) { if (Decl *FD = Found->dyn_cast()) { if (auto *BD = dyn_cast(FD); - BD && BD->isParameterPack() && - ArgumentPackSubstitutionIndex != -1) { - return BD->getBindingPackDecls()[ArgumentPackSubstitutionIndex]; + BD && BD->isParameterPack() && ArgPackSubstIndex) { + return BD->getBindingPackDecls()[*ArgPackSubstIndex]; } return cast(FD); } - int PackIdx = ArgumentPackSubstitutionIndex; - assert(PackIdx != -1 && + assert(ArgPackSubstIndex && "found declaration pack but not pack expanding"); typedef LocalInstantiationScope::DeclArgumentPack DeclArgumentPack; - return cast((*cast(*Found))[PackIdx]); + return cast( + (*cast(*Found))[*ArgPackSubstIndex]); } } diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index d9256dbd07d7a..ef0e6ee23e942 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -685,9 +685,9 @@ TypeResult Sema::ActOnPackExpansion(ParsedType Type, return CreateParsedType(TSResult->getType(), TSResult); } -TypeSourceInfo * -Sema::CheckPackExpansion(TypeSourceInfo *Pattern, SourceLocation EllipsisLoc, - std::optional NumExpansions) { +TypeSourceInfo *Sema::CheckPackExpansion(TypeSourceInfo *Pattern, + SourceLocation EllipsisLoc, + UnsignedOrNone NumExpansions) { // Create the pack expansion type and source-location information. 
QualType Result = CheckPackExpansion(Pattern->getType(), Pattern->getTypeLoc().getSourceRange(), @@ -705,7 +705,7 @@ Sema::CheckPackExpansion(TypeSourceInfo *Pattern, SourceLocation EllipsisLoc, QualType Sema::CheckPackExpansion(QualType Pattern, SourceRange PatternRange, SourceLocation EllipsisLoc, - std::optional NumExpansions) { + UnsignedOrNone NumExpansions) { // C++11 [temp.variadic]p5: // The pattern of a pack expansion shall name one or more // parameter packs that are not expanded by a nested pack @@ -729,7 +729,7 @@ ExprResult Sema::ActOnPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc) { } ExprResult Sema::CheckPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc, - std::optional NumExpansions) { + UnsignedOrNone NumExpansions) { if (!Pattern) return ExprError(); @@ -753,12 +753,12 @@ bool Sema::CheckParameterPacksForExpansion( SourceLocation EllipsisLoc, SourceRange PatternRange, ArrayRef Unexpanded, const MultiLevelTemplateArgumentList &TemplateArgs, bool &ShouldExpand, - bool &RetainExpansion, std::optional &NumExpansions) { + bool &RetainExpansion, UnsignedOrNone &NumExpansions) { ShouldExpand = true; RetainExpansion = false; std::pair FirstPack; bool HaveFirstPack = false; - std::optional NumPartialExpansions; + UnsignedOrNone NumPartialExpansions = std::nullopt; SourceLocation PartiallySubstitutedPackLoc; typedef LocalInstantiationScope::DeclArgumentPack DeclArgumentPack; @@ -942,10 +942,10 @@ bool Sema::CheckParameterPacksForExpansion( return false; } -std::optional Sema::getNumArgumentsInExpansionFromUnexpanded( +UnsignedOrNone Sema::getNumArgumentsInExpansionFromUnexpanded( llvm::ArrayRef Unexpanded, const MultiLevelTemplateArgumentList &TemplateArgs) { - std::optional Result; + UnsignedOrNone Result = std::nullopt; for (unsigned I = 0, N = Unexpanded.size(); I != N; ++I) { // Compute the depth and index for this parameter pack. unsigned Depth; @@ -992,7 +992,7 @@ std::optional Sema::getNumArgumentsInExpansionFromUnexpanded( return Result; } -std::optional Sema::getNumArgumentsInExpansion( +UnsignedOrNone Sema::getNumArgumentsInExpansion( QualType T, const MultiLevelTemplateArgumentList &TemplateArgs) { QualType Pattern = cast(T)->getPattern(); SmallVector Unexpanded; @@ -1252,7 +1252,7 @@ ExprResult Sema::BuildPackIndexingExpr(Expr *PackExpression, TemplateArgumentLoc Sema::getTemplateArgumentPackExpansionPattern( TemplateArgumentLoc OrigLoc, SourceLocation &Ellipsis, - std::optional &NumExpansions) const { + UnsignedOrNone &NumExpansions) const { const TemplateArgument &Argument = OrigLoc.getArgument(); assert(Argument.isPackExpansion()); switch (Argument.getKind()) { @@ -1310,7 +1310,7 @@ TemplateArgumentLoc Sema::getTemplateArgumentPackExpansionPattern( llvm_unreachable("Invalid TemplateArgument Kind!"); } -std::optional Sema::getFullyPackExpandedSize(TemplateArgument Arg) { +UnsignedOrNone Sema::getFullyPackExpandedSize(TemplateArgument Arg) { assert(Arg.containsUnexpandedParameterPack()); // If this is a substituted pack, grab that pack. 
If not, we don't know @@ -1464,7 +1464,7 @@ ExprResult Sema::BuildCXXFoldExpr(UnresolvedLookupExpr *Callee, BinaryOperatorKind Operator, SourceLocation EllipsisLoc, Expr *RHS, SourceLocation RParenLoc, - std::optional NumExpansions) { + UnsignedOrNone NumExpansions) { return new (Context) CXXFoldExpr(Context.DependentTy, Callee, LParenLoc, LHS, Operator, EllipsisLoc, RHS, RParenLoc, NumExpansions); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 20240fdaf2bbb..4e7726e258110 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -9685,7 +9685,7 @@ QualType Sema::BuildPackIndexingType(QualType Pattern, Expr *IndexExpr, bool FullySubstituted, ArrayRef Expansions) { - std::optional Index; + UnsignedOrNone Index = std::nullopt; if (FullySubstituted && !IndexExpr->isValueDependent() && !IndexExpr->isTypeDependent()) { llvm::APSInt Value(Context.getIntWidth(Context.getSizeType())); @@ -9693,20 +9693,18 @@ QualType Sema::BuildPackIndexingType(QualType Pattern, Expr *IndexExpr, IndexExpr, Context.getSizeType(), Value, CCEK_ArrayBound); if (!Res.isUsable()) return QualType(); - Index = Value.getExtValue(); IndexExpr = Res.get(); - } - - if (FullySubstituted && Index) { - if (*Index < 0 || *Index >= int64_t(Expansions.size())) { + int64_t V = Value.getExtValue(); + if (FullySubstituted && (V < 0 || V >= int64_t(Expansions.size()))) { Diag(IndexExpr->getBeginLoc(), diag::err_pack_index_out_of_bound) - << *Index << Pattern << Expansions.size(); + << V << Pattern << Expansions.size(); return QualType(); } + Index = static_cast(V); } return Context.getPackIndexingType(Pattern, IndexExpr, FullySubstituted, - Expansions, Index.value_or(-1)); + Expansions, Index); } static QualType GetEnumUnderlyingType(Sema &S, QualType BaseType, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 12ec97ca8c3e0..237c5a9ef501b 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -116,11 +116,11 @@ class TreeTransform { TemplateArgument Old; // Set the pack expansion index to -1 to avoid pack substitution and // indicate that parameter packs should be instantiated as themselves. - Sema::ArgumentPackSubstitutionIndexRAII ResetPackSubstIndex; + Sema::ArgPackSubstIndexRAII ResetPackSubstIndex; public: ForgetPartiallySubstitutedPackRAII(Derived &Self) - : Self(Self), ResetPackSubstIndex(Self.getSema(), -1) { + : Self(Self), ResetPackSubstIndex(Self.getSema(), std::nullopt) { Old = Self.ForgetPartiallySubstitutedPack(); } @@ -165,7 +165,7 @@ class TreeTransform { /// We must always rebuild all AST nodes when performing variadic template /// pack expansion, in order to avoid violating the AST invariant that each /// statement node appears at most once in its containing declaration. - bool AlwaysRebuild() { return SemaRef.ArgumentPackSubstitutionIndex != -1; } + bool AlwaysRebuild() { return static_cast(SemaRef.ArgPackSubstIndex); } /// Whether the transformation is forming an expression or statement that /// replaces the original. 
In this case, we'll reuse mangling numbers from @@ -293,7 +293,7 @@ class TreeTransform { SourceRange PatternRange, ArrayRef Unexpanded, bool &ShouldExpand, bool &RetainExpansion, - std::optional &NumExpansions) { + UnsignedOrNone &NumExpansions) { ShouldExpand = false; return false; } @@ -764,7 +764,7 @@ class TreeTransform { /// scope index; can be negative ParmVarDecl *TransformFunctionTypeParam(ParmVarDecl *OldParm, int indexAdjustment, - std::optional NumExpansions, + UnsignedOrNone NumExpansions, bool ExpectParameterPack); /// Transform the body of a lambda-expression. @@ -1255,7 +1255,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. QualType RebuildPackExpansionType(QualType Pattern, SourceRange PatternRange, SourceLocation EllipsisLoc, - std::optional NumExpansions) { + UnsignedOrNone NumExpansions) { return getSema().CheckPackExpansion(Pattern, PatternRange, EllipsisLoc, NumExpansions); } @@ -3653,14 +3653,14 @@ class TreeTransform { return SemaRef.BuildCXXNoexceptExpr(Range.getBegin(), Arg, Range.getEnd()); } - std::optional + UnsignedOrNone ComputeSizeOfPackExprWithoutSubstitution(ArrayRef PackArgs); /// Build a new expression to compute the length of a parameter pack. ExprResult RebuildSizeOfPackExpr(SourceLocation OperatorLoc, NamedDecl *Pack, SourceLocation PackLoc, SourceLocation RParenLoc, - std::optional Length, + UnsignedOrNone Length, ArrayRef PartialArgs) { return SizeOfPackExpr::Create(SemaRef.Context, OperatorLoc, Pack, PackLoc, RParenLoc, Length, PartialArgs); @@ -3970,9 +3970,9 @@ class TreeTransform { /// By default, performs semantic analysis to build a new pack expansion /// for a template argument. Subclasses may override this routine to provide /// different behavior. - TemplateArgumentLoc - RebuildPackExpansion(TemplateArgumentLoc Pattern, SourceLocation EllipsisLoc, - std::optional NumExpansions) { + TemplateArgumentLoc RebuildPackExpansion(TemplateArgumentLoc Pattern, + SourceLocation EllipsisLoc, + UnsignedOrNone NumExpansions) { switch (Pattern.getArgument().getKind()) { case TemplateArgument::Expression: { ExprResult Result @@ -4020,7 +4020,7 @@ class TreeTransform { /// for an expression. Subclasses may override this routine to provide /// different behavior. ExprResult RebuildPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc, - std::optional NumExpansions) { + UnsignedOrNone NumExpansions) { return getSema().CheckPackExpansion(Pattern, EllipsisLoc, NumExpansions); } @@ -4033,7 +4033,7 @@ class TreeTransform { BinaryOperatorKind Operator, SourceLocation EllipsisLoc, Expr *RHS, SourceLocation RParenLoc, - std::optional NumExpansions) { + UnsignedOrNone NumExpansions) { return getSema().BuildCXXFoldExpr(ULE, LParenLoc, LHS, Operator, EllipsisLoc, RHS, RParenLoc, NumExpansions); @@ -4452,8 +4452,8 @@ bool TreeTransform::TransformExprs(Expr *const *Inputs, // be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional OrigNumExpansions = Expansion->getNumExpansions(); - std::optional NumExpansions = OrigNumExpansions; + UnsignedOrNone OrigNumExpansions = Expansion->getNumExpansions(); + UnsignedOrNone NumExpansions = OrigNumExpansions; if (getDerived().TryExpandParameterPacks(Expansion->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, @@ -4465,7 +4465,7 @@ bool TreeTransform::TransformExprs(Expr *const *Inputs, // The transform has determined that we should perform a simple // transformation on the pack expansion, producing another pack // expansion. 
- Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); ExprResult OutPattern = getDerived().TransformExpr(Pattern); if (OutPattern.isInvalid()) return true; @@ -4489,7 +4489,7 @@ bool TreeTransform::TransformExprs(Expr *const *Inputs, // The transform has determined that we should perform an elementwise // expansion of the pattern. Do so. for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); ExprResult Out = getDerived().TransformExpr(Pattern); if (Out.isInvalid()) return true; @@ -5039,7 +5039,7 @@ bool TreeTransform::TransformTemplateArguments( // We have a pack expansion, for which we will be substituting into // the pattern. SourceLocation Ellipsis; - std::optional OrigNumExpansions; + UnsignedOrNone OrigNumExpansions = std::nullopt; TemplateArgumentLoc Pattern = getSema().getTemplateArgumentPackExpansionPattern( In, Ellipsis, OrigNumExpansions); @@ -5052,7 +5052,7 @@ bool TreeTransform::TransformTemplateArguments( // be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional NumExpansions = OrigNumExpansions; + UnsignedOrNone NumExpansions = OrigNumExpansions; if (getDerived().TryExpandParameterPacks(Ellipsis, Pattern.getSourceRange(), Unexpanded, @@ -5066,7 +5066,7 @@ bool TreeTransform::TransformTemplateArguments( // transformation on the pack expansion, producing another pack // expansion. TemplateArgumentLoc OutPattern; - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); if (getDerived().TransformTemplateArgument(Pattern, OutPattern, Uneval)) return true; @@ -5082,7 +5082,7 @@ bool TreeTransform::TransformTemplateArguments( // The transform has determined that we should perform an elementwise // expansion of the pattern. Do so. for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); if (getDerived().TransformTemplateArgument(Pattern, Out, Uneval)) return true; @@ -6080,8 +6080,8 @@ QualType TreeTransform::TransformExtVectorType(TypeLocBuilder &TLB, template ParmVarDecl *TreeTransform::TransformFunctionTypeParam( - ParmVarDecl *OldParm, int indexAdjustment, - std::optional NumExpansions, bool ExpectParameterPack) { + ParmVarDecl *OldParm, int indexAdjustment, UnsignedOrNone NumExpansions, + bool ExpectParameterPack) { TypeSourceInfo *OldDI = OldParm->getTypeSourceInfo(); TypeSourceInfo *NewDI = nullptr; @@ -6152,7 +6152,7 @@ bool TreeTransform::TransformFunctionTypeParams( if (ParmVarDecl *OldParm = Params[i]) { assert(OldParm->getFunctionScopeIndex() == i); - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; ParmVarDecl *NewParm = nullptr; if (OldParm->isParameterPack()) { // We have a function parameter pack that may need to be expanded. @@ -6167,7 +6167,7 @@ bool TreeTransform::TransformFunctionTypeParams( // Determine whether we should expand the parameter packs. bool ShouldExpand = false; bool RetainExpansion = false; - std::optional OrigNumExpansions; + UnsignedOrNone OrigNumExpansions = std::nullopt; if (Unexpanded.size() > 0) { OrigNumExpansions = ExpansionTL.getTypePtr()->getNumExpansions(); NumExpansions = OrigNumExpansions; @@ -6193,7 +6193,7 @@ bool TreeTransform::TransformFunctionTypeParams( // parameters. 
getDerived().ExpandingFunctionParameterPack(OldParm); for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); ParmVarDecl *NewParm = getDerived().TransformFunctionTypeParam(OldParm, indexAdjustment++, @@ -6240,7 +6240,7 @@ bool TreeTransform::TransformFunctionTypeParams( // We'll substitute the parameter now without expanding the pack // expansion. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); NewParm = getDerived().TransformFunctionTypeParam(OldParm, indexAdjustment, NumExpansions, @@ -6270,7 +6270,7 @@ bool TreeTransform::TransformFunctionTypeParams( assert(ParamTypes); QualType OldType = ParamTypes[i]; bool IsPackExpansion = false; - std::optional NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; QualType NewType; if (const PackExpansionType *Expansion = dyn_cast(OldType)) { @@ -6294,7 +6294,7 @@ bool TreeTransform::TransformFunctionTypeParams( // Expand the function parameter pack into multiple, separate // parameters. for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); QualType NewType = getDerived().TransformType(Pattern); if (NewType.isNull()) return true; @@ -6337,7 +6337,7 @@ bool TreeTransform::TransformFunctionTypeParams( // expansion. OldType = Expansion->getPattern(); IsPackExpansion = true; - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); NewType = getDerived().TransformType(OldType); } else { NewType = getDerived().TransformType(OldType); @@ -6579,7 +6579,7 @@ bool TreeTransform::TransformExceptionSpec( // be expanded. bool Expand = false; bool RetainExpansion = false; - std::optional NumExpansions = PackExpansion->getNumExpansions(); + UnsignedOrNone NumExpansions = PackExpansion->getNumExpansions(); // FIXME: Track the location of the ellipsis (and track source location // information for the types in the exception specification in general). if (getDerived().TryExpandParameterPacks( @@ -6591,7 +6591,7 @@ bool TreeTransform::TransformExceptionSpec( // We can't expand this pack expansion into separate arguments yet; // just substitute into the pattern and create a new pack expansion // type. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); QualType U = getDerived().TransformType(PackExpansion->getPattern()); if (U.isNull()) return true; @@ -6604,7 +6604,7 @@ bool TreeTransform::TransformExceptionSpec( // Substitute into the pack expansion pattern for each slice of the // pack. for (unsigned ArgIdx = 0; ArgIdx != *NumExpansions; ++ArgIdx) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), ArgIdx); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), ArgIdx); QualType U = getDerived().TransformType(PackExpansion->getPattern()); if (U.isNull() || SemaRef.CheckSpecifiedExceptionType(U, Loc)) @@ -6857,14 +6857,13 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, // be expanded. 
bool ShouldExpand = true; bool RetainExpansion = false; - std::optional OrigNumExpansions; - std::optional NumExpansions = OrigNumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (getDerived().TryExpandParameterPacks(TL.getEllipsisLoc(), SourceRange(), Unexpanded, ShouldExpand, RetainExpansion, NumExpansions)) return QualType(); if (!ShouldExpand) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); // FIXME: should we keep TypeLoc for individual expansions in // PackIndexingTypeLoc? TypeSourceInfo *TI = @@ -6888,7 +6887,7 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, continue; } for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); QualType Out = getDerived().TransformType(T); if (Out.isNull()) return QualType(); @@ -6910,7 +6909,7 @@ TreeTransform::TransformPackIndexingType(TypeLocBuilder &TLB, // A pack indexing type can appear in a larger pack expansion, // e.g. `Pack...[pack_of_indexes]...` // so we need to temporarily disable substitution of pack elements - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); QualType Result = getDerived().TransformType(TLB, TL.getPatternLoc()); QualType Out = getDerived().RebuildPackIndexingType( @@ -7922,7 +7921,7 @@ TreeTransform::TransformObjCObjectType(TypeLocBuilder &TLB, TypeLoc PatternLoc = PackExpansionLoc.getPatternLoc(); bool Expand = false; bool RetainExpansion = false; - std::optional NumExpansions = PackExpansion->getNumExpansions(); + UnsignedOrNone NumExpansions = PackExpansion->getNumExpansions(); if (getDerived().TryExpandParameterPacks( PackExpansionLoc.getEllipsisLoc(), PatternLoc.getSourceRange(), Unexpanded, Expand, RetainExpansion, NumExpansions)) @@ -7932,7 +7931,7 @@ TreeTransform::TransformObjCObjectType(TypeLocBuilder &TLB, // We can't expand this pack expansion into separate arguments yet; // just substitute into the pattern and create a new pack expansion // type. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); TypeLocBuilder TypeArgBuilder; TypeArgBuilder.reserve(PatternLoc.getFullDataSize()); @@ -7953,7 +7952,7 @@ TreeTransform::TransformObjCObjectType(TypeLocBuilder &TLB, // Substitute into the pack expansion pattern for each slice of the // pack. for (unsigned ArgIdx = 0; ArgIdx != *NumExpansions; ++ArgIdx) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), ArgIdx); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), ArgIdx); TypeLocBuilder TypeArgBuilder; TypeArgBuilder.reserve(PatternLoc.getFullDataSize()); @@ -14833,9 +14832,9 @@ TreeTransform::TransformTypeTraitExpr(TypeTraitExpr *E) { // be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional OrigNumExpansions = + UnsignedOrNone OrigNumExpansions = ExpansionTL.getTypePtr()->getNumExpansions(); - std::optional NumExpansions = OrigNumExpansions; + UnsignedOrNone NumExpansions = OrigNumExpansions; if (getDerived().TryExpandParameterPacks(ExpansionTL.getEllipsisLoc(), PatternTL.getSourceRange(), Unexpanded, @@ -14847,7 +14846,7 @@ TreeTransform::TransformTypeTraitExpr(TypeTraitExpr *E) { // The transform has determined that we should perform a simple // transformation on the pack expansion, producing another pack // expansion. 
- Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); TypeLocBuilder TLB; TLB.reserve(From->getTypeLoc().getFullDataSize()); @@ -14873,7 +14872,7 @@ TreeTransform<Derived>::TransformTypeTraitExpr(TypeTraitExpr *E) { // Expand the pack expansion by substituting for each argument in the // pack(s). for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(SemaRef, I); + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I); TypeLocBuilder TLB; TLB.reserve(PatternTL.getFullDataSize()); QualType To = getDerived().TransformType(TLB, PatternTL); @@ -15377,7 +15376,7 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) { auto *OldVD = cast<VarDecl>(C->getCapturedVar()); auto SubstInitCapture = [&](SourceLocation EllipsisLoc, - std::optional<unsigned> NumExpansions) { + UnsignedOrNone NumExpansions) { ExprResult NewExprInitResult = getDerived().TransformInitializer( OldVD->getInit(), OldVD->getInitStyle() == VarDecl::CallInit); @@ -15410,9 +15409,9 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) { // be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional<unsigned> OrigNumExpansions = + UnsignedOrNone OrigNumExpansions = ExpansionTL.getTypePtr()->getNumExpansions(); - std::optional<unsigned> NumExpansions = OrigNumExpansions; + UnsignedOrNone NumExpansions = OrigNumExpansions; if (getDerived().TryExpandParameterPacks( ExpansionTL.getEllipsisLoc(), OldVD->getInit()->getSourceRange(), Unexpanded, Expand, @@ -15422,7 +15421,7 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) { "capture since it cannot be extended"); if (Expand) { for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); SubstInitCapture(SourceLocation(), std::nullopt); } } else { @@ -15575,7 +15574,7 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) { UnexpandedParameterPack Unexpanded(C->getCapturedVar(), C->getLocation()); bool ShouldExpand = false; bool RetainExpansion = false; - std::optional<unsigned> NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (getDerived().TryExpandParameterPacks(C->getEllipsisLoc(), C->getLocation(), Unexpanded, @@ -15591,7 +15590,7 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) { // expansion of the pattern. Do so.
auto *Pack = cast<VarDecl>(C->getCapturedVar()); for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); ValueDecl *CapturedVar = cast_if_present<ValueDecl>( getDerived().TransformDecl(C->getLocation(), Pack)); if (!CapturedVar) { @@ -15660,8 +15659,8 @@ TreeTransform<Derived>::TransformLambdaExpr(LambdaExpr *E) { assert(FPTL && "Not a FunctionProtoType?"); AssociatedConstraint TRC = E->getCallOperator()->getTrailingRequiresClause(); - if (TRC.ArgumentPackSubstitutionIndex == -1) - TRC.ArgumentPackSubstitutionIndex = SemaRef.ArgumentPackSubstitutionIndex; + if (!TRC.ArgPackSubstIndex) + TRC.ArgPackSubstIndex = SemaRef.ArgPackSubstIndex; getSema().CompleteLambdaCallOperator( NewCallOperator, E->getCallOperator()->getLocation(), @@ -16042,10 +16041,9 @@ TreeTransform<Derived>::TransformPackExpansionExpr(PackExpansionExpr *E) { } template <typename Derived> -std::optional<unsigned> -TreeTransform<Derived>::ComputeSizeOfPackExprWithoutSubstitution( +UnsignedOrNone TreeTransform<Derived>::ComputeSizeOfPackExprWithoutSubstitution( ArrayRef<TemplateArgument> PackArgs) { - std::optional<unsigned> Result = 0; + UnsignedOrNone Result = 0u; for (const TemplateArgument &Arg : PackArgs) { if (!Arg.isPackExpansion()) { Result = *Result + 1; @@ -16057,20 +16055,20 @@ TreeTransform<Derived>::ComputeSizeOfPackExprWithoutSubstitution( // Find the pattern of the pack expansion. SourceLocation Ellipsis; - std::optional<unsigned> OrigNumExpansions; + UnsignedOrNone OrigNumExpansions = std::nullopt; TemplateArgumentLoc Pattern = getSema().getTemplateArgumentPackExpansionPattern(ArgLoc, Ellipsis, OrigNumExpansions); // Substitute under the pack expansion. Do not expand the pack (yet). TemplateArgumentLoc OutPattern; - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); if (getDerived().TransformTemplateArgument(Pattern, OutPattern, /*Uneval*/ true)) - return true; + return 1u; // See if we can determine the number of arguments from the result. - std::optional<unsigned> NumExpansions = + UnsignedOrNone NumExpansions = getSema().getFullyPackExpandedSize(OutPattern.getArgument()); if (!NumExpansions) { // No: we must be in an alias template expansion, and we're going to @@ -16105,7 +16103,7 @@ TreeTransform<Derived>::TransformSizeOfPackExpr(SizeOfPackExpr *E) { UnexpandedParameterPack Unexpanded(E->getPack(), E->getPackLoc()); bool ShouldExpand = false; bool RetainExpansion = false; - std::optional<unsigned> NumExpansions; + UnsignedOrNone NumExpansions = std::nullopt; if (getDerived().TryExpandParameterPacks(E->getOperatorLoc(), E->getPackLoc(), Unexpanded, ShouldExpand, RetainExpansion, @@ -16149,7 +16147,7 @@ TreeTransform<Derived>::TransformSizeOfPackExpr(SizeOfPackExpr *E) { } // Try to compute the result without performing a partial substitution.
- std::optional<unsigned> Result = + UnsignedOrNone Result = getDerived().ComputeSizeOfPackExprWithoutSubstitution(PackArgs); // Common case: we could determine the number of expansions without @@ -16186,9 +16184,10 @@ TreeTransform<Derived>::TransformSizeOfPackExpr(SizeOfPackExpr *E) { E->getOperatorLoc(), E->getPack(), E->getPackLoc(), E->getRParenLoc(), std::nullopt, Args); - return getDerived().RebuildSizeOfPackExpr(E->getOperatorLoc(), E->getPack(), - E->getPackLoc(), E->getRParenLoc(), - Args.size(), {}); + return getDerived().RebuildSizeOfPackExpr( + E->getOperatorLoc(), E->getPack(), E->getPackLoc(), E->getRParenLoc(), + /*Length=*/static_cast<unsigned>(Args.size()), + /*PartialArgs=*/std::nullopt); } template <typename Derived> @@ -16220,14 +16219,14 @@ TreeTransform<Derived>::TransformPackIndexingExpr(PackIndexingExpr *E) { // be expanded. bool ShouldExpand = true; bool RetainExpansion = false; - std::optional<unsigned> OrigNumExpansions; - std::optional<unsigned> NumExpansions = OrigNumExpansions; + UnsignedOrNone OrigNumExpansions = std::nullopt, + NumExpansions = std::nullopt; if (getDerived().TryExpandParameterPacks( E->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, ShouldExpand, RetainExpansion, NumExpansions)) return true; if (!ShouldExpand) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); ExprResult Pack = getDerived().TransformExpr(Pattern); if (Pack.isInvalid()) return ExprError(); @@ -16236,7 +16235,7 @@ TreeTransform<Derived>::TransformPackIndexingExpr(PackIndexingExpr *E) { {}, /*FullySubstituted=*/false); } for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); ExprResult Out = getDerived().TransformExpr(Pattern); if (Out.isInvalid()) return true; @@ -16328,8 +16327,8 @@ TreeTransform<Derived>::TransformCXXFoldExpr(CXXFoldExpr *E) { // be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional<unsigned> OrigNumExpansions = E->getNumExpansions(), - NumExpansions = OrigNumExpansions; + UnsignedOrNone OrigNumExpansions = E->getNumExpansions(), + NumExpansions = OrigNumExpansions; if (getDerived().TryExpandParameterPacks(E->getEllipsisLoc(), Pattern->getSourceRange(), Unexpanded, @@ -16340,7 +16339,7 @@ TreeTransform<Derived>::TransformCXXFoldExpr(CXXFoldExpr *E) { if (!Expand) { // Do not expand any packs here, just transform and rebuild a fold // expression. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); ExprResult LHS = E->getLHS() ? getDerived().TransformExpr(E->getLHS()) : ExprResult(); @@ -16364,7 +16363,7 @@ TreeTransform<Derived>::TransformCXXFoldExpr(CXXFoldExpr *E) { // Formally a fold expression expands to nested parenthesized expressions. // Enforce this limit to avoid creating trees so deep we can't safely traverse // them. - if (NumExpansions && SemaRef.getLangOpts().BracketDepth < NumExpansions) { + if (NumExpansions && SemaRef.getLangOpts().BracketDepth < *NumExpansions) { SemaRef.Diag(E->getEllipsisLoc(), clang::diag::err_fold_expression_limit_exceeded) << *NumExpansions << SemaRef.getLangOpts().BracketDepth @@ -16397,7 +16396,7 @@ TreeTransform<Derived>::TransformCXXFoldExpr(CXXFoldExpr *E) { } for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex( + Sema::ArgPackSubstIndexRAII SubstIndex( getSema(), LeftFold ?
I : *NumExpansions - I - 1); ExprResult Out = getDerived().TransformExpr(Pattern); if (Out.isInvalid()) @@ -16544,8 +16543,8 @@ TreeTransform<Derived>::TransformObjCDictionaryLiteral( // and should be expanded. bool Expand = true; bool RetainExpansion = false; - std::optional<unsigned> OrigNumExpansions = OrigElement.NumExpansions; - std::optional<unsigned> NumExpansions = OrigNumExpansions; + UnsignedOrNone OrigNumExpansions = OrigElement.NumExpansions; + UnsignedOrNone NumExpansions = OrigNumExpansions; SourceRange PatternRange(OrigElement.Key->getBeginLoc(), OrigElement.Value->getEndLoc()); if (getDerived().TryExpandParameterPacks(OrigElement.EllipsisLoc, @@ -16557,7 +16556,7 @@ TreeTransform<Derived>::TransformObjCDictionaryLiteral( // The transform has determined that we should perform a simple // transformation on the pack expansion, producing another pack // expansion. - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), -1); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), std::nullopt); ExprResult Key = getDerived().TransformExpr(OrigElement.Key); if (Key.isInvalid()) return ExprError(); @@ -16586,7 +16585,7 @@ TreeTransform<Derived>::TransformObjCDictionaryLiteral( // The transform has determined that we should perform an elementwise // expansion of the pattern. Do so. for (unsigned I = 0; I != *NumExpansions; ++I) { - Sema::ArgumentPackSubstitutionIndexRAII SubstIndex(getSema(), I); + Sema::ArgPackSubstIndexRAII SubstIndex(getSema(), I); ExprResult Key = getDerived().TransformExpr(OrigElement.Key); if (Key.isInvalid()) return ExprError(); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 8e573a11efd35..38697eb835134 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -1321,7 +1321,7 @@ ModuleLocalNameLookupTrait::internal_key_type ModuleLocalNameLookupTrait::GetInternalKey(const external_key_type &Key) { DeclarationNameKey Name(Key.first); - std::optional<unsigned> ModuleHash = getPrimaryModuleHash(Key.second); + UnsignedOrNone ModuleHash = getPrimaryModuleHash(Key.second); if (!ModuleHash) return {Name, 0}; @@ -12884,7 +12884,7 @@ static unsigned getStableHashForModuleName(StringRef PrimaryModuleName) { return ID.computeStableHash(); } -std::optional<unsigned> clang::getPrimaryModuleHash(const Module *M) { +UnsignedOrNone clang::getPrimaryModuleHash(const Module *M) { if (!M) return std::nullopt; diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 1a2b8be7e2b8a..5545cbc8d608c 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -904,8 +904,9 @@ void ASTDeclReader::VisitDeclaratorDecl(DeclaratorDecl *DD) { if (Record.readInt()) { // hasExtInfo auto *Info = new (Reader.getContext()) DeclaratorDecl::ExtInfo(); Record.readQualifierInfo(*Info); - Info->TrailingRequiresClause = - AssociatedConstraint(Record.readExpr(), int(Record.readInt())); + Info->TrailingRequiresClause = AssociatedConstraint( + Record.readExpr(), + UnsignedOrNone::fromInternalRepresentation(Record.readUInt32())); DD->DeclInfo = Info; } QualType TSIType = Record.readType(); @@ -2710,12 +2711,10 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { if (Record.readBool()) CR = Record.readConceptReference(); Expr *ImmediatelyDeclaredConstraint = Record.readExpr(); - int ArgumentPackSubstitutionIndex = Record.readInt(); + UnsignedOrNone ArgPackSubstIndex = Record.readUnsignedOrNone(); - D->setTypeConstraint(CR, ImmediatelyDeclaredConstraint,
ArgumentPackSubstitutionIndex); - if ((D->ExpandedParameterPack = Record.readInt())) - D->NumExpanded = Record.readInt(); + D->setTypeConstraint(CR, ImmediatelyDeclaredConstraint, ArgPackSubstIndex); + D->NumExpanded = Record.readUnsignedOrNone(); } if (Record.readInt()) diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 7c7abcb2d49d9..d26152f3780ed 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2271,7 +2271,7 @@ void ASTStmtReader::VisitCXXFoldExpr(CXXFoldExpr *E) { E->LParenLoc = readSourceLocation(); E->EllipsisLoc = readSourceLocation(); E->RParenLoc = readSourceLocation(); - E->NumExpansions = Record.readInt(); + E->NumExpansions = Record.readUnsignedOrNone(); E->SubExprs[0] = Record.readSubExpr(); E->SubExprs[1] = Record.readSubExpr(); E->SubExprs[2] = Record.readSubExpr(); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 84f7f2bc5fce4..f27be5fb4c76c 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4267,7 +4267,7 @@ class ASTDeclContextNameLookupTrait : public ASTDeclContextNameLookupTraitBase { auto ID = Writer.GetDeclRef(DeclForLocalLookup); if (isModuleLocalDecl(D)) { - if (std::optional<unsigned> PrimaryModuleHash = + if (UnsignedOrNone PrimaryModuleHash = getPrimaryModuleHash(D->getOwningModule())) { auto Key = std::make_pair(D->getDeclName(), *PrimaryModuleHash); auto Iter = ModuleLocalDeclsMap.find(Key); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index a4b89d0d9ed5e..3a7a23481ea98 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -730,8 +730,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { Record.AddQualifierInfo(*Info); Record.AddStmt( const_cast<Expr *>(Info->TrailingRequiresClause.ConstraintExpr)); - Record.push_back( - Info->TrailingRequiresClause.ArgumentPackSubstitutionIndex); + Record.writeUnsignedOrNone(Info->TrailingRequiresClause.ArgPackSubstIndex); } // The location information is deferred until the end of the record. Record.AddTypeRef(D->getTypeSourceInfo() ?
D->getTypeSourceInfo()->getType() @@ -2039,10 +2038,8 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { if (CR) Record.AddConceptReference(CR); Record.AddStmt(TC->getImmediatelyDeclaredConstraint()); - Record.push_back(TC->getArgumentPackSubstitutionIndex()); - Record.push_back(D->isExpandedParameterPack()); - if (D->isExpandedParameterPack()) - Record.push_back(D->getNumExpansionParameters()); + Record.writeUnsignedOrNone(TC->getArgPackSubstIndex()); + Record.writeUnsignedOrNone(D->getNumExpansionParameters()); } bool OwnsDefaultArg = D->hasDefaultArgument() && diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 0860704368f3b..23bb5ff22efaf 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2271,7 +2271,7 @@ void ASTStmtWriter::VisitCXXFoldExpr(CXXFoldExpr *E) { Record.AddSourceLocation(E->LParenLoc); Record.AddSourceLocation(E->EllipsisLoc); Record.AddSourceLocation(E->RParenLoc); - Record.push_back(E->NumExpansions); + Record.push_back(E->NumExpansions.toInternalRepresentation()); Record.AddStmt(E->SubExprs[0]); Record.AddStmt(E->SubExprs[1]); Record.AddStmt(E->SubExprs[2]); diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 0bac95eb40b20..40e1197bc21f1 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -3589,7 +3589,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportUnnamedFieldsInCorrectOrder) { ASSERT_FALSE(FromField->getDeclName()); auto *ToField = cast_or_null<FieldDecl>(Import(FromField, Lang_CXX11)); EXPECT_TRUE(ToField); - std::optional<unsigned> ToIndex = ASTImporter::getFieldIndex(ToField); + UnsignedOrNone ToIndex = ASTImporter::getFieldIndex(ToField); EXPECT_TRUE(ToIndex); EXPECT_EQ(*ToIndex, FromIndex); ++FromIndex; @@ -5472,7 +5472,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportSubstTemplateTypeParmType) { FromTU, classTemplateSpecializationDecl()); auto testType = [&](ASTContext &Ctx, const char *Name, - std::optional<unsigned> PackIndex) { + UnsignedOrNone PackIndex) { const auto *Subst = selectFirst<SubstTemplateTypeParmType>( "sttp", match(substTemplateTypeParmType( hasReplacementType(hasCanonicalType(asString(Name)))) @@ -5486,10 +5486,10 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportSubstTemplateTypeParmType) { }; auto tests = [&](ASTContext &Ctx) { testType(Ctx, "void", std::nullopt); - testType(Ctx, "char", 3); - testType(Ctx, "float", 2); - testType(Ctx, "int", 1); - testType(Ctx, "short", 0); + testType(Ctx, "char", 3u); + testType(Ctx, "float", 2u); + testType(Ctx, "int", 1u); + testType(Ctx, "short", 0u); }; tests(FromTU->getASTContext()); From 3e59ff27e5d4506086dfe02ec600adf0baa9543e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 3 Apr 2025 10:33:09 -0700 Subject: [PATCH 0565/1029] [flang][cuda] Fix pred type for vote functions (#134166) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 14 +++++++++----- flang/test/Lower/CUDA/cuda-device-proc.cuf | 9 +++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 8aed288d128b6..4988b6bfb3d3f 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6508,12 +6508,13 @@
IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, } static mlir::Value genVoteSync(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef funcName, + llvm::StringRef funcName, mlir::Type resTy, llvm::ArrayRef<mlir::Value> args) { mlir::MLIRContext *context = builder.getContext(); mlir::Type i32Ty = builder.getI32Type(); + mlir::Type i1Ty = builder.getI1Type(); mlir::FunctionType ftype = - mlir::FunctionType::get(context, {i32Ty, i32Ty}, {i32Ty}); + mlir::FunctionType::get(context, {i32Ty, i1Ty}, {resTy}); auto funcOp = builder.createFunction(loc, funcName, ftype); llvm::SmallVector<mlir::Value> filteredArgs; return builder.create<fir::CallOp>(loc, funcOp, args).getResult(0); @@ -6523,14 +6524,16 @@ static mlir::Value genVoteSync(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value IntrinsicLibrary::genVoteAllSync(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); - return genVoteSync(builder, loc, "llvm.nvvm.vote.all.sync", args); + return genVoteSync(builder, loc, "llvm.nvvm.vote.all.sync", + builder.getI1Type(), args); } // ANY_SYNC mlir::Value IntrinsicLibrary::genVoteAnySync(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); - return genVoteSync(builder, loc, "llvm.nvvm.vote.any.sync", args); + return genVoteSync(builder, loc, "llvm.nvvm.vote.any.sync", + builder.getI1Type(), args); } // BALLOT_SYNC @@ -6538,7 +6541,8 @@ mlir::Value IntrinsicLibrary::genVoteBallotSync(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { assert(args.size() == 2); - return genVoteSync(builder, loc, "llvm.nvvm.vote.ballot.sync", args); + return genVoteSync(builder, loc, "llvm.nvvm.vote.ballot.sync", + builder.getI32Type(), args); } // MATCH_ANY_SYNC diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 6a7fee73f338a..a4a4750dd61e6 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -297,10 +297,11 @@ end ! CHECK: fir.call @__ldcv_r8x2_(%{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref>, !fir.ref>) -> () attributes(device) subroutine testVote() - integer :: a, ipred, mask, v32 - a = all_sync(mask, v32) - a = any_sync(mask, v32) - a = ballot_sync(mask, v32) + integer :: a, ipred, mask + logical(4) :: pred + a = all_sync(mask, pred) + a = any_sync(mask, pred) + a = ballot_sync(mask, pred) end subroutine ! CHECK-LABEL: func.func @_QPtestvote() From 3f6ae3f0a81ac32aee7633b7c240ce8eb25192ff Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Thu, 3 Apr 2025 10:43:28 -0700 Subject: [PATCH 0566/1029] [flang] Added driver options for arrays repacking.
(#134002) Added options: * -f[no-]repack-arrays * -f[no-]stack-repack-arrays * -frepack-arrays-contiguity=whole/innermost --- clang/include/clang/Driver/Options.td | 63 ++++++++++++++++++- clang/lib/Driver/ToolChains/Flang.cpp | 30 ++++++--- flang/docs/ArrayRepacking.md | 19 +++--- flang/include/flang/Lower/LoweringOptions.def | 9 +-- flang/lib/Frontend/CompilerInvocation.cpp | 13 ++++ flang/lib/Lower/ConvertVariable.cpp | 2 +- .../test/Driver/frepack-arrays-contiguity.f90 | 32 ++++++++++ flang/test/Driver/frepack-arrays.f90 | 24 +++++++ flang/test/Driver/fstack-repack-arrays.f90 | 24 +++++++ flang/test/Lower/repack-arrays.f90 | 8 +-- flang/tools/bbc/bbc.cpp | 15 ++--- 11 files changed, 201 insertions(+), 38 deletions(-) create mode 100644 flang/test/Driver/frepack-arrays-contiguity.f90 create mode 100644 flang/test/Driver/frepack-arrays.f90 create mode 100644 flang/test/Driver/fstack-repack-arrays.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index e69b804de63b5..2ca5f99e4ca63 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6825,7 +6825,6 @@ defm real_8_real_10 : BooleanFFlag<"real-8-real-10">, Group<gfortran_Group>; defm real_8_real_16 : BooleanFFlag<"real-8-real-16">, Group<gfortran_Group>; defm real_8_real_4 : BooleanFFlag<"real-8-real-4">, Group<gfortran_Group>; defm recursive : BooleanFFlag<"recursive">, Group<gfortran_Group>; -defm repack_arrays : BooleanFFlag<"repack-arrays">, Group<gfortran_Group>; defm second_underscore : BooleanFFlag<"second-underscore">, Group<gfortran_Group>; defm sign_zero : BooleanFFlag<"sign-zero">, Group<gfortran_Group>; defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>; @@ -6967,6 +6966,52 @@ defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">; def fno_automatic : Flag<["-"], "fno-automatic">, Group<f_Group>, HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">; +defm repack_arrays + : BoolOptionWithoutMarshalling< + "f", "repack-arrays", PosFlag, + NegFlag, + BothFlags<[], [], + " non-contiguous assumed shape dummy arrays into " + "contiguous memory">>, + DocBrief<[{Create temporary copies of non-contiguous assumed shape dummy +arrays in subprogram prologues, and destroy them in subprogram epilogues. +The temporary copy is initialized with values from the original array +in the prologue, if needed. In the epilogue, the current values +in the temporary array are copied into the original array, if needed. + +Accessing the contiguous temporary in the program code may result +in faster execution compared to accessing elements of the original array, +when they are sparse in memory. At the same time, the overhead +of copying values between the original and the temporary arrays +may be significant, which may slow down some programs. + +Enabling array repacking may also change the behavior of certain +programs: + +* The copy actions may introduce a data race in valid OpenACC/OpenMP programs. + For example, if different threads execute the same subprogram + with a non-contiguous assumed shape dummy array, and the different threads + access unrelated parts of the array, then the whole array copy + made in each thread will cause a data race. +* OpenACC/OpenMP offload programs may behave incorrectly with regard + to the device data environment, due to the fact that the original + array and the temporary may have different presence status on the device. +* ``IS_CONTIGUOUS`` intrinsic may return ``TRUE`` with the array repacking + enabled, whereas it would return ``FALSE`` with the repacking disabled.
+* The result of ``LOC`` intrinsic applied to an actual argument associated + with a non-contiguous assumed shape dummy array, may be different + from the result of ``LOC`` applied to the dummy array.}]>; + +def frepack_arrays_contiguity_EQ + : Joined<["-"], "frepack-arrays-contiguity=">, + Group, + Values<"whole,innermost">, + HelpText< + "When -frepack-arrays is in effect, 'whole' enables " + "repacking for arrays that are non-contiguous in any dimension, " + "'innermost' enables repacking for arrays that are non-contiguous " + "in the innermost dimension (the default)">; + defm save_main_program : BoolOptionWithoutMarshalling<"f", "save-main-program", PosFlag, @@ -6980,6 +7025,22 @@ defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stri PosFlag, NegFlag>; +defm stack_repack_arrays + : BoolOptionWithoutMarshalling< + "f", "stack-repack-arrays", + PosFlag, + NegFlag< + SetFalse, [], [], + "Allocate -frepack-arrays temporaries on the heap (default)">>, + DocBrief<[{Controls whether the array temporaries created under +**-frepack-arrays** are allocated on the stack or on the heap. + +By default, the heap is used. Allocations of polymorphic types +are always done on the heap, though this may change in future releases. + }]>; + def fhermetic_module_files : Flag<["-"], "fhermetic-module-files">, Group, HelpText<"Emit hermetic module files (no nested USE association)">; diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 8312234e33a64..96e2486da764c 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -157,16 +157,26 @@ void Flang::addCodegenOptions(const ArgList &Args, if (shouldLoopVersion(Args)) CmdArgs.push_back("-fversion-loops-for-stride"); - Args.addAllArgs(CmdArgs, - {options::OPT_fdo_concurrent_to_openmp_EQ, - options::OPT_flang_experimental_hlfir, - options::OPT_flang_deprecated_no_hlfir, - options::OPT_fno_ppc_native_vec_elem_order, - options::OPT_fppc_native_vec_elem_order, - options::OPT_finit_global_zero, - options::OPT_fno_init_global_zero, options::OPT_ftime_report, - options::OPT_ftime_report_EQ, options::OPT_funroll_loops, - options::OPT_fno_unroll_loops}); + for (const auto &arg : + Args.getAllArgValues(options::OPT_frepack_arrays_contiguity_EQ)) + if (arg.compare("whole") != 0 && arg.compare("innermost") != 0) { + getToolChain().getDriver().Diag(diag::err_drv_unsupported_option_argument) + << "-frepack-arrays-contiguity=" << arg; + } + + Args.addAllArgs( + CmdArgs, + {options::OPT_fdo_concurrent_to_openmp_EQ, + options::OPT_flang_experimental_hlfir, + options::OPT_flang_deprecated_no_hlfir, + options::OPT_fno_ppc_native_vec_elem_order, + options::OPT_fppc_native_vec_elem_order, options::OPT_finit_global_zero, + options::OPT_fno_init_global_zero, options::OPT_frepack_arrays, + options::OPT_fno_repack_arrays, + options::OPT_frepack_arrays_contiguity_EQ, + options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays, + options::OPT_ftime_report, options::OPT_ftime_report_EQ, + options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/docs/ArrayRepacking.md b/flang/docs/ArrayRepacking.md index 87cfc5d1bb4bc..7de599f293e40 100755 --- a/flang/docs/ArrayRepacking.md +++ b/flang/docs/ArrayRepacking.md @@ -39,13 +39,13 @@ Having these results it seems reasonable to provide support for arrays repacking #### Facts and guesses about the implementation -The 
dynamic checks for continuity and the array copy code is located completely in the [runtime](https://github.com/gcc-mirror/gcc/blob/3e08a4ecea27c54fda90e8f58641b1986ad957e1/libgfortran/generated/in_pack_r8.c#L35), so the compiler inserts unconditional calls in the subprogram prologue/epilogue. +The dynamic checks for contiguity and the array copy code is located completely in the [runtime](https://github.com/gcc-mirror/gcc/blob/3e08a4ecea27c54fda90e8f58641b1986ad957e1/libgfortran/generated/in_pack_r8.c#L35), so the compiler inserts unconditional calls in the subprogram prologue/epilogue. It looks like `gfortran` ignores `intent(out)/intent(in)` which could have helped to avoid some of the `pack/unpack` overhead. It looks like the `pack`/`unpack` actions are inserted early in the compilation pipeline, and these extra calls affect behavior of the later optimization passes. For example, `Polyhedron/fatigue2` slows down by about 2x with `-frepack-arrays`: this slowdown is not caused by the `pack`/`unpack` overhead, but is a consequence of worse function inlining decisions made after the calls insertion. The benchmarks becomes even faster than the original version with `-frepack-arrays` and proper `-finline-limit=` settings, but it does not look like the benchmark contains code that would benefit from the array repacking. -It does not look like `gfortran` is able to eliminate the `pack`/`unpack` code after the function inlining, if the actual argument is statically known to be contiguous. So the overhead from the dynamic continuity checks is inevitable when `-frepack-arrays` is specified. +It does not look like `gfortran` is able to eliminate the `pack`/`unpack` code after the function inlining, if the actual argument is statically known to be contiguous. So the overhead from the dynamic contiguity checks is inevitable when `-frepack-arrays` is specified. It does not look like `gfortran` tries to optimize the insertion of `pack`/`unpack` code. For example, if a dummy array is only used under a condition within the subprogram, the repacking code might be inserted under the same condition to minimize the overhead on the unconditional path through the subprogram. @@ -59,7 +59,7 @@ It does not look like `gfortran` tries to optimize the insertion of `pack`/`unpa #### Facts and guesses about the implementation -The `pack` code is only generated if the actual argument may be non-contiguous in the innermost dimension, as determined statically, i.e. the compiler does not generate any dynamic continuity checks. For example: +The `pack` code is only generated if the actual argument may be non-contiguous in the innermost dimension, as determined statically, i.e. the compiler does not generate any dynamic contiguity checks. For example: ```Fortran interface @@ -132,8 +132,8 @@ So it does not seem practical/reasonable to enable the array repacking by defaul ### Performance 1. Minimize the overhead of array repacking, e.g. avoid copy-in/out whenever possible, execute copy-in/out only on the execution paths where the array is accessed. -2. Provide different modes of repacking depending on the "continuity" meaning, i.e. one - array is contiguous in the innermost dimension, two - array is contiguous in all dimensions. -3. Avoid generating repacking code, when the "continuity" can be statically proven (including after optimization passes like constant propagation, function inlining, etc.). +2. Provide different modes of repacking depending on the "contiguity" meaning, i.e. 
one - array is contiguous in the innermost dimension, two - array is contiguous in all dimensions. +3. Avoid generating repacking code, when the "contiguity" can be statically proven (including after optimization passes like constant propagation, function inlining, etc.). 4. Use a set of heuristics to avoid generating repacking code based on the array usage pattern, e.g. if an array is proven not to be used in an array expression or a loop, etc. 5. Use a set of heuristics to avoid repacking actions dynamically, e.g. based on the array size, element size, byte stride(s) of the [innermost] dimension(s), etc. 6. Minimize the impact of the IR changes, introduced by repacking, on the later optimization passes. @@ -156,7 +156,7 @@ Controlled by cli options, Lowering will generate a `fir.pack_array` operation i The new operations will hold all the information that customizes further handling of the `pack`/`unpack` actions, such as: * Optional array of attributes supporting an interface to generate a predicate that says if the repacking is safe in the current context. -* The continuity mode: `innermost` vs `whole`. +* The contiguity mode: `innermost` vs `whole`. * Attributes selecting the heuristics (both compiler and runtime ones) that may be applied to avoid `pack`/`unpack` actions. * Other attributes, like `stack` vs `heap` to manage the temporary allocation according to `-fstack-arrays`, etc. @@ -195,7 +195,7 @@ The operation creates a new `!fir.box/class>` value to represent ei Arguments: * `stack` - indicates if `-fstack-arrays` is in effect for compiling this function. -* `innermost` - tells that the repacking has to be done iff the array is not contiguous in the innermost dimension. This also describes what type of continuity can be expected from `%new_var`, i.e. `innermost` means that the resulting array is definitely contiguous in the innermost dimension, but may be non-contiguous in other dimensions (unless additional analysis proves otherwise). For 1-D arrays, `innermost` attribute is not valid. +* `innermost` - tells that the repacking has to be done iff the array is not contiguous in the innermost dimension. This also describes what type of contiguity can be expected from `%new_var`, i.e. `innermost` means that the resulting array is definitely contiguous in the innermost dimension, but may be non-contiguous in other dimensions (unless additional analysis proves otherwise). For 1-D arrays, `innermost` attribute is not valid. * `no_copy` - indicates that, in case a temporary array is created, `%var` to `%new_var` copy is not required (`intent(out)` dummy argument case). * `heuristics` * `loop-only` - `fir.pack_array` can be optimized away, if the array is not used in a loop. @@ -351,7 +351,7 @@ The `fir.pack_array`'s copy-in action cannot be skipped for `INTENT(OUT)` dummy #### Optional behavior -In case of the `whole` continuity mode or with 1-D array, Flang can propagate this information to `hlfir.declare` - this may improve optimizations down the road. This can be done iff the repacking has no dynamic constraints and/or heuristics. For example: +In case of the `whole` contiguity mode or with 1-D array, Flang can propagate this information to `hlfir.declare` - this may improve optimizations down the road. This can be done iff the repacking has no dynamic constraints and/or heuristics. 
For example: ``` %c0 = arith.constant 0 : index @@ -441,10 +441,11 @@ In cases where `fir.pack_array` is statically known to produce a copy that is co The following user options are proposed: * `-frepack-arrays` - the option forces Flang to repack a non-contiguous assumed-shape dummy array into a temporary contiguous memory, which may result in faster accesses of the array. The compiler will insert special code in subprogram prologue to allocate a temporary array and copy the original array into the temporary; in subprogram epilogue, it will insert a copy from the temporary array into the original array and deallocate the temporary. The overhead of the allocation/deallocation and the copies may be significant depending on the array size. The compiler will try to optimize the unnecessary/unprofitable repacking. +* `-fstack-repack-arrays` - attempt allocating the temporary arrays in stack memory. By default, they are allocated in heap memory (note that `-fstack-arrays` does not affect the allocation of the temporaries created for the arrays repacking). * `-frepack-arrays-opts=[none|loop-only]` - the option enables optimizations that may eliminate the array repacking code depending on the array usage pattern: * `none` - no optimizations. * `loop-only` - the array repacking code will be removed in any subprogram where the array is not used inside a loop or an array expression. -* `-frepack-arrays-continuity=[whole|innermost]`: +* `-frepack-arrays-contiguity=[whole|innermost]`: * `whole` - the option will repack arrays that are non-contiguous in any dimension (default). * `innermost` - the option will repack arrays that are non-contiguous in the innermost dimension. * `-frepack-arrays-max-size=` - arrays bigger than the specified size will not be repacked. diff --git a/flang/include/flang/Lower/LoweringOptions.def b/flang/include/flang/Lower/LoweringOptions.def index 6735bea551414..b062ea1a805ac 100644 --- a/flang/include/flang/Lower/LoweringOptions.def +++ b/flang/include/flang/Lower/LoweringOptions.def @@ -48,14 +48,15 @@ ENUM_LOWERINGOPT(ReallocateLHS, unsigned, 1, 1) /// On by default. ENUM_LOWERINGOPT(InitGlobalZero, unsigned, 1, 1) -/// If true, the arrays of unknown size and array temporaries -/// are requested to be allocated in stack memory. -ENUM_LOWERINGOPT(StackArrays, unsigned, 1, 0) - /// If true, the dummy assumed shape arrays are conditionally /// packed into contiguous memory. ENUM_LOWERINGOPT(RepackArrays, unsigned, 1, 0) +/// If true, the temporary arrays created under RepackArrays +/// control will be allocated in stack memory. If false, +/// they will be allocated in heap memory. 
+ENUM_LOWERINGOPT(StackRepackArrays, unsigned, 1, 0) + /// If true, the repacking (RepackArrays option above) /// will be done for arrays non-contiguous in any dimension, /// otherwise, it will be done only for arrays non-contiguous diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 466d939b7b840..6f87a18d69c3d 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1476,6 +1476,19 @@ bool CompilerInvocation::createFromArgs( clang::driver::options::OPT_fno_realloc_lhs, true)) invoc.loweringOpts.setReallocateLHS(false); + invoc.loweringOpts.setRepackArrays( + args.hasFlag(clang::driver::options::OPT_frepack_arrays, + clang::driver::options::OPT_fno_repack_arrays, + /*default=*/false)); + invoc.loweringOpts.setStackRepackArrays( + args.hasFlag(clang::driver::options::OPT_fstack_repack_arrays, + clang::driver::options::OPT_fno_stack_repack_arrays, + /*default=*/false)); + if (auto *arg = args.getLastArg( + clang::driver::options::OPT_frepack_arrays_contiguity_EQ)) + invoc.loweringOpts.setRepackArraysWhole(arg->getValue() == + llvm::StringRef{"whole"}); + success &= parseFrontendArgs(invoc.getFrontendOpts(), args, diags); parseTargetArgs(invoc.getTargetOpts(), args); parsePreprocessorArgs(invoc.getPreprocessorOpts(), args); diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 0b22b743edee9..366ff328bfa27 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -2630,7 +2630,7 @@ Fortran::lower::genPackArray(Fortran::lower::AbstractConverter &converter, }); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); const mlir::Location loc = genLocation(converter, sym); - bool stackAlloc = opts.getStackArrays(); + bool stackAlloc = opts.getStackRepackArrays(); // 1D arrays must always use 'whole' mode. bool isInnermostMode = !opts.getRepackArraysWhole() && sym.Rank() > 1; // Avoid copy-in for 'intent(out)' variable, unless this is a dummy diff --git a/flang/test/Driver/frepack-arrays-contiguity.f90 b/flang/test/Driver/frepack-arrays-contiguity.f90 new file mode 100644 index 0000000000000..88e5af4129eda --- /dev/null +++ b/flang/test/Driver/frepack-arrays-contiguity.f90 @@ -0,0 +1,32 @@ +! Test forwarding of -frepack-arrays-contiguity options: +! RUN: %flang -frepack-arrays-contiguity=whole %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=WHOLECMD %s +! RUN: %flang -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=INNERMOSTCMD %s +! RUN: %flang -frepack-arrays-contiguity=innermost -frepack-arrays-contiguity=whole %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=WHOLECMD %s +! RUN: %flang -frepack-arrays-contiguity=whole -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=INNERMOSTCMD %s +! RUN: not %flang -frepack-arrays-contiguity= -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=ERROR %s +! RUN: not %flang -frepack-arrays-contiguity=whole3 -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=ERROR %s +! RUN: not %flang -frepack-arrays-contiguity=innermostg -frepack-arrays-contiguity=innermost %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=ERROR %s + +! Test proper setting of the lowering options: +! RUN: %flang_fc1 -frepack-arrays -frepack-arrays-contiguity=whole %s -emit-hlfir -o - | FileCheck --check-prefix=WHOLE %s +! 
RUN: %flang_fc1 -frepack-arrays-contiguity=whole %s -emit-hlfir -o - | FileCheck --check-prefix=NOREPACK %s +! RUN: %flang_fc1 -frepack-arrays -frepack-arrays-contiguity=innermost %s -emit-hlfir -o - | FileCheck --check-prefix=INNERMOST %s +! RUN: %flang_fc1 -frepack-arrays-contiguity=innermost %s -emit-hlfir -o - | FileCheck --check-prefix=NOREPACK %s + +! Default setting is 'innermost': +! RUN: %flang_fc1 -frepack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=INNERMOST %s + +! ERROR: error: unsupported argument '{{.*}}' to option '-frepack-arrays-contiguity=' + +! WHOLECMD: "-fc1"{{.*}}"-frepack-arrays-contiguity=whole" +! INNERMOSTCMD: "-fc1"{{.*}}"-frepack-arrays-contiguity=innermost" + +subroutine test(x) + real :: x(:,:) + ! WHOLE: fir.pack_array{{.*}}whole + ! WHOLE: fir.unpack_array + ! INNERMOST: fir.pack_array{{.*}}innermost + ! INNERMOST: fir.unpack_array + ! NOREPACK-NOT: fir.pack_array + ! NOREPACK-NOT: fir.unpack_array +end subroutine diff --git a/flang/test/Driver/frepack-arrays.f90 b/flang/test/Driver/frepack-arrays.f90 new file mode 100644 index 0000000000000..0d1913d282446 --- /dev/null +++ b/flang/test/Driver/frepack-arrays.f90 @@ -0,0 +1,24 @@ +! Test forwarding of -f[no-]repack-arrays options: +! RUN: %flang -frepack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=REPACKCMD %s +! RUN: %flang -fno-repack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=NOREPACKCMD %s +! RUN: %flang -frepack-arrays -fno-repack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=NOREPACKCMD %s +! RUN: %flang -fno-repack-arrays -frepack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=REPACKCMD %s + +! Test proper setting of the lowering options: +! RUN: %flang_fc1 -frepack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=REPACK %s +! RUN: %flang_fc1 -fno-repack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=NOREPACK %s +! RUN: %flang_fc1 -frepack-arrays -fno-repack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=NOREPACK %s +! RUN: %flang_fc1 -fno-repack-arrays -frepack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=REPACK %s + +! REPACKCMD: "-fc1"{{.*}}"-frepack-arrays" +! REPACKCMD-NOT: -fno-repack-arrays +! NOREPACKCMD: "-fc1"{{.*}}"-fno-repack-arrays" +! NOREPACKCMD-NOT: -frepack-arrays + +subroutine test(x) + real :: x(:) + ! REPACK: fir.pack_array + ! REPACK: fir.unpack_array + ! NOREPACK-NOT: fir.pack_array + ! NOREPACK-NOT: fir.unpack_array +end subroutine diff --git a/flang/test/Driver/fstack-repack-arrays.f90 b/flang/test/Driver/fstack-repack-arrays.f90 new file mode 100644 index 0000000000000..406228cfe6105 --- /dev/null +++ b/flang/test/Driver/fstack-repack-arrays.f90 @@ -0,0 +1,24 @@ +! Test forwarding of -f[no-]stack-repack-arrays options: +! RUN: %flang -fstack-repack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=STACKCMD %s +! RUN: %flang -fno-stack-repack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=HEAPCMD %s +! RUN: %flang -fstack-repack-arrays -fno-stack-repack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=HEAPCMD %s +! RUN: %flang -fno-stack-repack-arrays -fstack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=HEAPCMD %s +! RUN: %flang -fno-stack-repack-arrays -fstack-repack-arrays %s -### -fsyntax-only 2>&1 | FileCheck --check-prefix=STACKCMD %s + +! Test proper setting of the lowering options: +! RUN: %flang_fc1 -frepack-arrays -fstack-repack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=STACK %s +!
RUN: %flang_fc1 -frepack-arrays -fno-stack-repack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=HEAP %s + +! Default setting is 'heap': +! RUN: %flang_fc1 -frepack-arrays %s -emit-hlfir -o - | FileCheck --check-prefix=HEAP %s + +! STACKCMD: "-fc1"{{.*}}"-fstack-repack-arrays" +! HEAPCMD: "-fc1"{{.*}}"-fno-stack-repack-arrays" + +subroutine test(x) + real :: x(:,:) + ! STACK: fir.pack_array{{.*}}stack + ! STACK: fir.unpack_array{{.*}}stack + ! HEAP: fir.pack_array{{.*}}heap + ! HEAP: fir.unpack_array{{.*}}heap +end subroutine diff --git a/flang/test/Lower/repack-arrays.f90 b/flang/test/Lower/repack-arrays.f90 index 19ea93a3521a3..ff89df82793a3 100644 --- a/flang/test/Lower/repack-arrays.f90 +++ b/flang/test/Lower/repack-arrays.f90 @@ -1,7 +1,7 @@ -! RUN: bbc -emit-hlfir -frepack-arrays -fstack-arrays -frepack-arrays-continuity-whole %s -o - -I nowhere | FileCheck --check-prefixes=ALL,STACK,WHOLE %s -! RUN: bbc -emit-hlfir -frepack-arrays -fstack-arrays=false -frepack-arrays-continuity-whole %s -o - -I nowhere | FileCheck --check-prefixes=ALL,HEAP,WHOLE %s -! RUN: bbc -emit-hlfir -frepack-arrays -fstack-arrays -frepack-arrays-continuity-whole=false %s -o - -I nowhere | FileCheck --check-prefixes=ALL,STACK,INNER %s -! RUN: bbc -emit-hlfir -frepack-arrays -fstack-arrays=false -frepack-arrays-continuity-whole=false %s -o - -I nowhere | FileCheck --check-prefixes=ALL,HEAP,INNER %s +! RUN: bbc -emit-hlfir -frepack-arrays -fstack-repack-arrays -frepack-arrays-continuity-whole %s -o - -I nowhere | FileCheck --check-prefixes=ALL,STACK,WHOLE %s +! RUN: bbc -emit-hlfir -frepack-arrays -fstack-repack-arrays=false -frepack-arrays-continuity-whole %s -o - -I nowhere | FileCheck --check-prefixes=ALL,HEAP,WHOLE %s +! RUN: bbc -emit-hlfir -frepack-arrays -fstack-repack-arrays -frepack-arrays-continuity-whole=false %s -o - -I nowhere | FileCheck --check-prefixes=ALL,STACK,INNER %s +! RUN: bbc -emit-hlfir -frepack-arrays -fstack-repack-arrays=false -frepack-arrays-continuity-whole=false %s -o - -I nowhere | FileCheck --check-prefixes=ALL,HEAP,INNER %s ! ALL-LABEL: func.func @_QPtest1( ! ALL-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.box> {fir.bindc_name = "x"}) { diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index 78ce510968ca5..c544008a24d56 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -251,14 +251,11 @@ static llvm::cl::opt "the LHS of the intrinsic assignment"), llvm::cl::init(true)); -// TODO: -fstack-arrays is currently only used for fir.pack_array, -// but it should probably be used for deciding how arrays/temporaries -// are allocated during lowering. 
-static llvm::cl::opt<bool> - stackArrays("fstack-arrays", - llvm::cl::desc("Allocate all arrays of unknown size and " - "temporary arrays in stack memory"), - llvm::cl::init(false)); +static llvm::cl::opt<bool> stackRepackArrays( + "fstack-repack-arrays", + llvm::cl::desc("Allocate temporary arrays for -frepack-arrays " + "in stack memory"), + llvm::cl::init(false)); static llvm::cl::opt<bool> repackArrays("frepack-arrays", @@ -429,7 +426,7 @@ static llvm::LogicalResult convertFortranSourceToMLIR( loweringOptions.setIntegerWrapAround(integerWrapAround); loweringOptions.setInitGlobalZero(initGlobalZero); loweringOptions.setReallocateLHS(reallocateLHS); - loweringOptions.setStackArrays(stackArrays); + loweringOptions.setStackRepackArrays(stackRepackArrays); loweringOptions.setRepackArrays(repackArrays); loweringOptions.setRepackArraysWhole(repackArraysWhole); std::vector envDefaults = {}; From b8b752db2b89a730aff250a83b64c7ed10d95ca0 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Thu, 3 Apr 2025 10:43:49 -0700 Subject: [PATCH 0567/1029] [flang][NFC] Create required Source dir for flang-doc. (#134000)
--- .../Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp | 6 +----- .../Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp | 6 +----- .../Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp | 6 +----- llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp | 10 ++-------- .../lib/Target/RISCV/RISCVRedundantCopyElimination.cpp | 5 +---- llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 ++ 6 files changed, 8 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp index b3f27ea95b79c..1450d5f092f9c 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVO0PreLegalizerCombiner.cpp @@ -110,8 +110,6 @@ void RISCVO0PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { RISCVO0PreLegalizerCombiner::RISCVO0PreLegalizerCombiner() : MachineFunctionPass(ID) { - initializeRISCVO0PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -150,8 +148,6 @@ INITIALIZE_PASS_END(RISCVO0PreLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V machine instrs before legalization", false, false) -namespace llvm { -FunctionPass *createRISCVO0PreLegalizerCombiner() { +FunctionPass *llvm::createRISCVO0PreLegalizerCombiner() { return new RISCVO0PreLegalizerCombiner(); } -} // end namespace llvm diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp index 29136d8b8bf04..eaccf6d67dcc4 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp @@ -118,8 +118,6 @@ void RISCVPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { RISCVPostLegalizerCombiner::RISCVPostLegalizerCombiner() : MachineFunctionPass(ID) { - initializeRISCVPostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -164,8 +162,6 @@ INITIALIZE_PASS_END(RISCVPostLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V MachineInstrs after legalization", false, false) -namespace llvm { -FunctionPass *createRISCVPostLegalizerCombiner() { +FunctionPass *llvm::createRISCVPostLegalizerCombiner() { return new RISCVPostLegalizerCombiner(); } -} // end namespace llvm diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp index 0c5a09a925bb6..afd25676a89eb 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp @@ -116,8 +116,6 @@ void RISCVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { RISCVPreLegalizerCombiner::RISCVPreLegalizerCombiner() : MachineFunctionPass(ID) { - initializeRISCVPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -167,8 +165,6 @@ INITIALIZE_PASS_END(RISCVPreLegalizerCombiner, DEBUG_TYPE, "Combine RISC-V machine instrs before legalization", false, false) -namespace llvm { -FunctionPass *createRISCVPreLegalizerCombiner() { +FunctionPass *llvm::createRISCVPreLegalizerCombiner() { return new RISCVPreLegalizerCombiner(); } -} // end namespace llvm diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp 
b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp index bb772fc5da922..a537904b27744 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp @@ -34,9 +34,7 @@ class RISCVExpandAtomicPseudo : public MachineFunctionPass { const RISCVInstrInfo *TII; static char ID; - RISCVExpandAtomicPseudo() : MachineFunctionPass(ID) { - initializeRISCVExpandAtomicPseudoPass(*PassRegistry::getPassRegistry()); - } + RISCVExpandAtomicPseudo() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -733,10 +731,6 @@ bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg( INITIALIZE_PASS(RISCVExpandAtomicPseudo, "riscv-expand-atomic-pseudo", RISCV_EXPAND_ATOMIC_PSEUDO_NAME, false, false) -namespace llvm { - -FunctionPass *createRISCVExpandAtomicPseudoPass() { +FunctionPass *llvm::createRISCVExpandAtomicPseudoPass() { return new RISCVExpandAtomicPseudo(); } - -} // end of namespace llvm diff --git a/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp b/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp index 65ff67b424796..167db9f50bcb8 100644 --- a/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp +++ b/llvm/lib/Target/RISCV/RISCVRedundantCopyElimination.cpp @@ -44,10 +44,7 @@ class RISCVRedundantCopyElimination : public MachineFunctionPass { public: static char ID; - RISCVRedundantCopyElimination() : MachineFunctionPass(ID) { - initializeRISCVRedundantCopyEliminationPass( - *PassRegistry::getPassRegistry()); - } + RISCVRedundantCopyElimination() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; MachineFunctionProperties getRequiredProperties() const override { diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index ec8ba3322a6e1..7fb64be3975d5 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -147,6 +147,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVMoveMergePass(*PR); initializeRISCVPushPopOptPass(*PR); initializeRISCVLoadStoreOptPass(*PR); + initializeRISCVExpandAtomicPseudoPass(*PR); + initializeRISCVRedundantCopyEliminationPass(*PR); } static StringRef computeDataLayout(const Triple &TT, From 7288f1bc32c1964c4de50aa305b696b32d0c0f1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 3 Apr 2025 12:08:30 -0700 Subject: [PATCH 0570/1029] [flang][cuda] Use nvvm operation for match any (#134283) The string used for intrinsic was not the correct one "llvm.nvvm.match.any.sync.i32p". There was an extra `p` at the end. Use the NVVM operation instead so we don't duplicate it. 
--- .../include/flang/Optimizer/Support/InitFIR.h | 4 ++- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 25 +++++++------------ flang/test/Lower/CUDA/cuda-device-proc.cuf | 10 +++----- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index e999796c23718..f509fdfcf4918 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -22,6 +22,7 @@ #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/Func/Extensions/InlinerExtension.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" #include "mlir/InitAllDialects.h" #include "mlir/Pass/Pass.h" @@ -37,7 +38,8 @@ namespace fir::support { mlir::scf::SCFDialect, mlir::arith::ArithDialect, \ mlir::cf::ControlFlowDialect, mlir::func::FuncDialect, \ mlir::vector::VectorDialect, mlir::math::MathDialect, \ - mlir::complex::ComplexDialect, mlir::DLTIDialect, cuf::CUFDialect + mlir::complex::ComplexDialect, mlir::DLTIDialect, cuf::CUFDialect, \ + mlir::NVVM::NVVMDialect #define FLANG_CODEGEN_DIALECT_LIST FIRCodeGenDialect, mlir::LLVM::LLVMDialect diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 4988b6bfb3d3f..a562d9b7e461c 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -48,6 +48,7 @@ #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" @@ -6552,23 +6553,15 @@ IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, assert(args.size() == 2); bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - llvm::StringRef funcName = - is32 ? "llvm.nvvm.match.any.sync.i32p" : "llvm.nvvm.match.any.sync.i64p"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32Ty = builder.getI32Type(); - mlir::Type i64Ty = builder.getI64Type(); - mlir::Type valTy = is32 ? i32Ty : i64Ty; + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = builder.create( + loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {i32Ty, valTy}, {i32Ty}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - llvm::SmallVector filteredArgs; - filteredArgs.push_back(args[0]); - if (args[1].getType().isF32() || args[1].getType().isF64()) - filteredArgs.push_back(builder.create(loc, valTy, args[1])); - else - filteredArgs.push_back(args[1]); - return builder.create(loc, funcOp, filteredArgs).getResult(0); + return builder + .create(loc, resultType, args[0], arg1, + mlir::NVVM::MatchSyncKind::any) + .getResult(); } // MATMUL diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index a4a4750dd61e6..dbce4a5fa47dd 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -143,12 +143,10 @@ attributes(device) subroutine testMatchAny() end subroutine ! CHECK-LABEL: func.func @_QPtestmatchany() -! CHECK: fir.call @llvm.nvvm.match.any.sync.i32p -! CHECK: fir.call @llvm.nvvm.match.any.sync.i64p -! 
CHECK: fir.convert %{{.*}} : (f32) -> i32 -! CHECK: fir.call @llvm.nvvm.match.any.sync.i32p -! CHECK: fir.convert %{{.*}} : (f64) -> i64 -! CHECK: fir.call @llvm.nvvm.match.any.sync.i64p +! CHECK: %{{.*}} = nvvm.match.sync any %{{.*}}, %{{.*}} : i32 -> i32 +! CHECK: %{{.*}} = nvvm.match.sync any %{{.*}}, %{{.*}} : i64 -> i32 +! CHECK: %{{.*}} = nvvm.match.sync any %{{.*}}, %{{.*}} : i32 -> i32 +! CHECK: %{{.*}} = nvvm.match.sync any %{{.*}}, %{{.*}} : i64 -> i32 attributes(device) subroutine testAtomic(aa, n) integer :: aa(*)

From 229ca7dbcb5a6bcbcdc87fb0feb29362375c2843 Mon Sep 17 00:00:00 2001
From: zcfh <1412805291@qq.com>
Date: Fri, 4 Apr 2025 03:48:27 +0800
Subject: [PATCH 0571/1029] [memprof] Report an error when the build IDs of
 binary and profile do not match (#132504)

## Problem

When the build IDs of the profile and the binary do not match, the error
reported by llvm-profdata is `no entries in callstack map after
symbolization`, but the root cause of the problem is the **build ID
mismatch**.

## Trigger scenario

For example, when performing `memprof` optimization on `clang`, the raw
profile is collected through `ninja clang`. In addition to clang itself,
some other programs are executed during the build, and these programs also
generate raw profiles. When `no entries in callstack map after
symbolization` then appears during `llvm-profdata merge`, users may
mistakenly suspect a failed instrumentation or some other cause, and **not
realize that the binary and profile do not match**.

## Changed

Currently, a build ID mismatch triggers an assertion failure only in debug
builds. Change it to directly return an error when the build IDs do not
match.

--- llvm/lib/ProfileData/MemProfReader.cpp | 6 +++++- llvm/test/tools/llvm-profdata/memprof-buildid.test | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp index 16502a4f1e8af..c57f9b22273d4 100644 --- a/llvm/lib/ProfileData/MemProfReader.cpp +++ b/llvm/lib/ProfileData/MemProfReader.cpp @@ -444,7 +444,11 @@ Error RawMemProfReader::setupForSymbolization() { ProfiledTextSegmentEnd = Entry.End; } } - assert(NumMatched != 0 && "No matching executable segments in segment info."); + if (NumMatched == 0) + return make_error<StringError>( + Twine("No matching executable segments found in binary ") + + Binary.getBinary()->getFileName(), + inconvertibleErrorCode()); assert((PreferredTextSegmentAddress == 0 || (PreferredTextSegmentAddress == ProfiledTextSegmentStart)) && "Expect text segment address to be 0 or equal to profiled text " diff --git a/llvm/test/tools/llvm-profdata/memprof-buildid.test b/llvm/test/tools/llvm-profdata/memprof-buildid.test index a5abe6ea7dcb6..75c3da2506796 100644 --- a/llvm/test/tools/llvm-profdata/memprof-buildid.test +++ b/llvm/test/tools/llvm-profdata/memprof-buildid.test @@ -14,3 +14,9 @@ CHECK: Build ID: [[ID:[[:xdigit:]]+]] COM: Then match it with the profdata output. CHECK-COUNT-1: BuildId: {{.*}}[[ID]] + +Test error message when profile build id does not match build id in a different binary.
+RUN: not llvm-profdata show --memory %p/Inputs/buildid.memprofraw --profiled-binary %p/Inputs/basic.memprofexe -o - 2>&1 | FileCheck %s -check-prefix=BUILDID-NOT-MATCH +RUN: not llvm-profdata merge %p/Inputs/buildid.memprofraw %p/Inputs/basic.memprofraw --profiled-binary %p/Inputs/basic.memprofexe -o %t4.prof 2>&1 | FileCheck %s -check-prefix=BUILDID-NOT-MATCH + +BUILDID-NOT-MATCH: No matching executable segments found in binary

From 7c4013d5912d2163147b46d6f97f4e2b0d976f3a Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Thu, 3 Apr 2025 12:58:49 -0700
Subject: [PATCH 0572/1029] Revert "[SLP]Initial support for (masked)loads +
 compress and (masked)interleaved"

This reverts commit 0bec0f5c059af5f920fe22ecda469b666b5971b0 to fix a crash
reported in https://lab.llvm.org/buildbot/#/builders/143/builds/6668.

--- .../Transforms/Vectorize/SLPVectorizer.cpp | 359 ++---------------- .../X86/entries-shuffled-diff-sizes.ll | 17 +- .../X86/gep-nodes-with-non-gep-inst.ll | 22 +- .../Transforms/SLPVectorizer/X86/pr47623.ll | 16 +- .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 162 +++++--- .../Transforms/SLPVectorizer/X86/pr47629.ll | 162 +++++--- .../X86/remark_gather-load-redux-cost.ll | 11 +- .../X86/reorder-possible-strided-node.ll | 52 ++- .../X86/reorder-reused-masked-gather.ll | 12 +- .../X86/reorder-reused-masked-gather2.ll | 11 +- .../X86/scatter-vectorize-reused-pointer.ll | 12 +- .../Transforms/SLPVectorizer/X86/sin-sqrt.ll | 8 +- .../SLPVectorizer/X86/split-load8_2-unord.ll | 11 +- .../X86/split-load8_2_unord_geps.ll | 11 +- 14 files changed, 348 insertions(+), 518 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a115fec47aeec..b82a66ca3b889 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -38,7 +38,6 @@ #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVDescriptors.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" @@ -1381,8 +1380,7 @@ class BoUpSLP { Gather, Vectorize, ScatterVectorize, - StridedVectorize, - CompressVectorize + StridedVectorize }; using ValueList = SmallVector<Value *, 8>; @@ -3380,7 +3378,6 @@ class BoUpSLP { Vectorize, ///< The node is regularly vectorized. ScatterVectorize, ///< Masked scatter/gather node. StridedVectorize, ///< Strided loads (and stores) - CompressVectorize, ///< (Masked) load with compress. NeedToGather, ///< Gather/buildvector node.
CombinedVectorize, ///< Vectorized node, combined with its user into more ///< complex node like select/cmp to minmax, mul/add to @@ -3607,9 +3604,6 @@ class BoUpSLP { case StridedVectorize: dbgs() << "StridedVectorize\n"; break; - case CompressVectorize: - dbgs() << "CompressVectorize\n"; - break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -4825,8 +4819,7 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { if (Entry->isGather()) return "color=red"; if (Entry->State == TreeEntry::ScatterVectorize || - Entry->State == TreeEntry::StridedVectorize || - Entry->State == TreeEntry::CompressVectorize) + Entry->State == TreeEntry::StridedVectorize) return "color=blue"; return ""; } @@ -5426,157 +5419,6 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec, return Builder.CreateShuffleVector(Vec, Mask); } -/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered -/// with \p Order. -/// \return true if the mask represents strided access, false - otherwise. -static bool buildCompressMask(ArrayRef PointerOps, - ArrayRef Order, Type *ScalarTy, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl &CompressMask) { - const unsigned Sz = PointerOps.size(); - CompressMask.assign(Sz, PoisonMaskElem); - // The first element always set. - CompressMask[0] = 0; - // Check if the mask represents strided access. - std::optional Stride = 0; - Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()]; - for (unsigned I : seq(1, Sz)) { - Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]]; - unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); - CompressMask[I] = Pos; - if (!Stride) - continue; - if (*Stride == 0) { - *Stride = Pos; - continue; - } - if (Pos != *Stride * I) - Stride.reset(); - } - return Stride.has_value(); -} - -/// Checks if the \p VL can be transformed to a (masked)load + compress or -/// (masked) interleaved load. -static bool isMaskedLoadCompress( - ArrayRef VL, ArrayRef PointerOps, - ArrayRef Order, const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, - const DominatorTree &DT, const TargetLibraryInfo &TLI, - const function_ref AreAllUsersVectorized, bool &IsMasked, - unsigned &InterleaveFactor, SmallVectorImpl &CompressMask, - VectorType *&LoadVecTy) { - InterleaveFactor = 0; - Type *ScalarTy = VL.front()->getType(); - const unsigned Sz = VL.size(); - auto *VecTy = getWidenedType(ScalarTy, Sz); - constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // Check external uses. - for (const auto [I, V] : enumerate(VL)) { - if (AreAllUsersVectorized(V)) - continue; - InstructionCost ExtractCost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I); - InstructionCost ScalarCost = - TTI.getInstructionCost(cast(V), CostKind); - if (ExtractCost <= ScalarCost) - return false; - } - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; - } - std::optional Diff = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); - if (!Diff) - return false; - const unsigned MaxRegSize = - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue(); - // Check for very large distances between elements. 
- if (*Diff / Sz >= MaxRegSize / 8) - return false; - Align CommonAlignment = computeCommonAlignment(VL); - LoadVecTy = getWidenedType(ScalarTy, *Diff + 1); - auto *LI = cast(Order.empty() ? VL.front() : VL[Order.front()]); - IsMasked = !isSafeToLoadUnconditionally( - Ptr0, LoadVecTy, CommonAlignment, DL, - cast(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT, - &TLI); - if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace())) - return false; - // TODO: perform the analysis of each scalar load for better - // safe-load-unconditionally analysis. - bool IsStrided = - buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask); - assert(CompressMask.size() >= 2 && "At least two elements are required"); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, PointerOps.front(), - Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy); - // The cost of scalar loads. - InstructionCost ScalarLoadsCost = - std::accumulate(VL.begin(), VL.end(), InstructionCost(), - [&](InstructionCost C, Value *V) { - return C + TTI.getInstructionCost(cast(V), - CostKind); - }) + - ScalarGEPCost; - APInt DemandedElts = APInt::getAllOnes(Sz); - InstructionCost GatherCost = - getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts, - /*Insert=*/true, - /*Extract=*/false, CostKind) + - ScalarLoadsCost; - InstructionCost LoadCost = 0; - if (IsMasked) { - LoadCost = - TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); - } else { - CommonAlignment = LI->getAlign(); - LoadCost = - TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); - } - SmallVector Mask; - if (!Order.empty()) - inversePermutation(Order, Mask); - if (IsStrided) { - // Check for potential segmented(interleaved) loads. - if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1], - CommonAlignment, - LI->getPointerAddressSpace())) { - InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost( - Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt, - CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked); - if (!Mask.empty()) - InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, - VecTy, Mask, CostKind); - if (InterleavedCost < GatherCost) { - InterleaveFactor = CompressMask[1]; - return true; - } - } - } - if (!Order.empty()) { - SmallVector NewMask(Sz, PoisonMaskElem); - for (unsigned I : seq(Sz)) { - NewMask[I] = CompressMask[Mask[I]]; - } - CompressMask.swap(NewMask); - } - InstructionCost CompressCost = ::getShuffleCost( - TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind); - InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost; - return TotalVecCost < GatherCost; -} - BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, @@ -5648,6 +5490,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, // Check that the sorted loads are consecutive. if (static_cast(*Diff) == Sz - 1) return LoadsState::Vectorize; + if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || + TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) + return LoadsState::Gather; // Simple check if not a strided access - clear order. 
bool IsPossibleStrided = *Diff % (Sz - 1) == 0; // Try to generate strided load node if: @@ -5703,22 +5548,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, } } } - bool IsMasked; - unsigned InterleaveFactor; - SmallVector CompressMask; - VectorType *LoadVecTy; - if (isMaskedLoadCompress( - VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI, - [&](Value *V) { - return areAllUsersVectorized(cast(V), - UserIgnoreList); - }, - IsMasked, InterleaveFactor, CompressMask, LoadVecTy)) - return LoadsState::CompressVectorize; } - if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || - TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return LoadsState::Gather; // Correctly identify compare the cost of loads + shuffles rather than // strided/masked gather loads. Returns true if vectorized + shuffles // representation is better than just gather. @@ -5811,8 +5641,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, } // If need the reorder - consider as high-cost masked gather for now. if ((LS == LoadsState::Vectorize || - LS == LoadsState::StridedVectorize || - LS == LoadsState::CompressVectorize) && + LS == LoadsState::StridedVectorize) && !Order.empty() && !isReverseOrder(Order)) LS = LoadsState::ScatterVectorize; States.push_back(LS); @@ -5877,14 +5706,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, CommonAlignment, CostKind) + VectorGEPCost; break; - case LoadsState::CompressVectorize: - VecLdCost += TTI.getMaskedMemoryOpCost( - Instruction::Load, SubVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind) + - VectorGEPCost + - ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy, - {}, CostKind); - break; case LoadsState::ScatterVectorize: VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, LI0->getPointerOperand(), @@ -6258,8 +6079,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, return std::nullopt; if (TE.State == TreeEntry::SplitVectorize || ((TE.State == TreeEntry::Vectorize || - TE.State == TreeEntry::StridedVectorize || - TE.State == TreeEntry::CompressVectorize) && + TE.State == TreeEntry::StridedVectorize) && (isa(TE.getMainOp()) || (TopToBottom && isa(TE.getMainOp()))))) { assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) && @@ -6446,8 +6266,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), CurrentOrder, PointerOps); - if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize || - Res == LoadsState::CompressVectorize) + if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize) return std::move(CurrentOrder); } // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars @@ -6687,8 +6506,7 @@ void BoUpSLP::reorderTopToBottom() { VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::SplitVectorize || - TE->State == TreeEntry::CompressVectorize) || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && @@ -6862,8 +6680,7 @@ void BoUpSLP::reorderTopToBottom() { if ((TE->State == TreeEntry::SplitVectorize && TE->ReuseShuffleIndices.empty()) || ((TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::CompressVectorize) && + 
TE->State == TreeEntry::StridedVectorize) && (isa(TE->getMainOp()) || (SLPReVec && isa(TE->getMainOp()))))) { @@ -6911,7 +6728,6 @@ bool BoUpSLP::canReorderOperands( return OpData.first == I && (OpData.second->State == TreeEntry::Vectorize || OpData.second->State == TreeEntry::StridedVectorize || - OpData.second->State == TreeEntry::CompressVectorize || OpData.second->State == TreeEntry::SplitVectorize); })) continue; @@ -6926,7 +6742,6 @@ bool BoUpSLP::canReorderOperands( // node, just reorder reuses mask. if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize && TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); @@ -6937,7 +6752,6 @@ bool BoUpSLP::canReorderOperands( [&Gather, UserTE, I](TreeEntry *TE) { assert(TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize && "Only non-vectorized nodes are expected."); if (TE->UserTreeIndex.UserTE == UserTE && @@ -6974,7 +6788,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { for (const std::unique_ptr &TE : VectorizableTree) { if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize) NonVectorized.push_back(TE.get()); if (std::optional CurrentOrder = @@ -6982,7 +6795,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Queue.push(TE.get()); if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::CompressVectorize || TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.insert(TE.get()); @@ -7011,7 +6823,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { for (TreeEntry *TE : OrderedOps) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::CompressVectorize || TE->State == TreeEntry::SplitVectorize || (TE->isGather() && GathersToOrders.contains(TE))) || !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() || @@ -7306,7 +7117,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Gathers are processed separately. 
if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize && (TE->State != TreeEntry::ScatterVectorize || TE->ReorderIndices.empty())) @@ -7339,8 +7149,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.first->reorderOperands(Mask); if (!isa(Data.first->getMainOp()) || Data.first->isAltShuffle() || - Data.first->State == TreeEntry::StridedVectorize || - Data.first->State == TreeEntry::CompressVectorize) { + Data.first->State == TreeEntry::StridedVectorize) { reorderScalars(Data.first->Scalars, Mask); reorderOrder(Data.first->ReorderIndices, MaskOrder, /*BottomOrder=*/true); @@ -8118,16 +7927,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( }); if (It == Slice.end()) return false; - const TreeEntry &TE = - *VectorizableTree[std::get<0>(P)]; - ArrayRef VL = TE.Scalars; - OrdersType Order; - SmallVector PointerOps; - LoadsState State = canVectorizeLoads( - VL, VL.front(), Order, PointerOps); - if (State == LoadsState::ScatterVectorize || - State == LoadsState::CompressVectorize) - return false; + ArrayRef VL = + VectorizableTree[std::get<0>(P)]->Scalars; ConsecutiveNodesSize += VL.size(); unsigned Start = std::distance(Slice.begin(), It); unsigned Sz = Slice.size() - Start; @@ -8592,44 +8393,23 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. - auto IsGatheredNode = [&]() { - if (!GatheredLoadsEntriesFirst) - return false; - return all_of(VL, [&](Value *V) { - if (isa(V)) - return true; - return any_of(getTreeEntries(V), [&](const TreeEntry *TE) { - return TE->Idx >= *GatheredLoadsEntriesFirst; - }); - }); - }; switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) { case LoadsState::Vectorize: return TreeEntry::Vectorize; - case LoadsState::CompressVectorize: - if (!IsGraphTransformMode && !VectorizableTree.empty()) { - // Delay slow vectorized nodes for better vectorization attempts. - LoadEntriesToVectorize.insert(VectorizableTree.size()); - return TreeEntry::NeedToGather; - } - return IsGatheredNode() ? TreeEntry::NeedToGather - : TreeEntry::CompressVectorize; case LoadsState::ScatterVectorize: if (!IsGraphTransformMode && !VectorizableTree.empty()) { // Delay slow vectorized nodes for better vectorization attempts. LoadEntriesToVectorize.insert(VectorizableTree.size()); return TreeEntry::NeedToGather; } - return IsGatheredNode() ? TreeEntry::NeedToGather - : TreeEntry::ScatterVectorize; + return TreeEntry::ScatterVectorize; case LoadsState::StridedVectorize: if (!IsGraphTransformMode && VectorizableTree.size() > 1) { // Delay slow vectorized nodes for better vectorization attempts. LoadEntriesToVectorize.insert(VectorizableTree.size()); return TreeEntry::NeedToGather; } - return IsGatheredNode() ? TreeEntry::NeedToGather - : TreeEntry::StridedVectorize; + return TreeEntry::StridedVectorize; case LoadsState::Gather: #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -9730,15 +9510,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, << "SLP: added a new TreeEntry (jumbled LoadInst).\n"; TE->dump()); break; - case TreeEntry::CompressVectorize: - // Vectorizing non-consecutive loads with (masked)load + compress. 
- TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndices, CurrentOrder); - LLVM_DEBUG( - dbgs() - << "SLP: added a new TreeEntry (masked LoadInst + compress).\n"; - TE->dump()); - break; case TreeEntry::StridedVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, @@ -12270,8 +12041,6 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { if (TE.State == TreeEntry::ScatterVectorize || TE.State == TreeEntry::StridedVectorize) return TTI::CastContextHint::GatherScatter; - if (TE.State == TreeEntry::CompressVectorize) - return TTI::CastContextHint::Masked; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) { if (TE.ReorderIndices.empty()) @@ -12365,8 +12134,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::StridedVectorize || - E->State == TreeEntry::CompressVectorize) && + E->State == TreeEntry::StridedVectorize) && "Unhandled state"); assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || @@ -12457,10 +12225,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // Negative value means vectorizing is profitable. auto GetGEPCostDiff = [=](ArrayRef Ptrs, Value *BasePtr) { assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize || - E->State == TreeEntry::CompressVectorize) && - "Entry state expected to be Vectorize, StridedVectorize or " - "MaskedLoadCompressVectorize here."); + E->State == TreeEntry::StridedVectorize) && + "Entry state expected to be Vectorize or StridedVectorize here."); InstructionCost ScalarCost = 0; InstructionCost VecCost = 0; std::tie(ScalarCost, VecCost) = getGEPCosts( @@ -12923,46 +12689,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, /*VariableMask=*/false, CommonAlignment, CostKind); break; } - case TreeEntry::CompressVectorize: { - SmallVector PointerOps(VL.size()); - for (auto [I, V] : enumerate(VL)) - PointerOps[I] = cast(V)->getPointerOperand(); - bool IsMasked; - unsigned InterleaveFactor; - SmallVector CompressMask; - VectorType *LoadVecTy; - [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress( - VL, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, *TLI, - [](Value *) { return true; }, IsMasked, InterleaveFactor, - CompressMask, LoadVecTy); - assert(IsVectorized && "Expected to be vectorized"); - Align CommonAlignment; - if (IsMasked) - CommonAlignment = - computeCommonAlignment(UniqueValues.getArrayRef()); - else - CommonAlignment = LI0->getAlign(); - if (InterleaveFactor) { - VecLdCost = TTI->getInterleavedMemoryOpCost( - Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt, - CommonAlignment, LI0->getPointerAddressSpace(), CostKind); - } else if (IsMasked) { - VecLdCost = TTI->getMaskedMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind); - // TODO: include this cost into CommonCost. - VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - LoadVecTy, CompressMask, CostKind); - } else { - VecLdCost = TTI->getMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); - // TODO: include this cost into CommonCost. 
- VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - LoadVecTy, CompressMask, CostKind); - } - break; - } case TreeEntry::ScatterVectorize: { Align CommonAlignment = computeCommonAlignment(UniqueValues.getArrayRef()); @@ -13252,7 +12978,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { if (VectorizableTree.size() == 1 && (VectorizableTree[0]->State == TreeEntry::Vectorize || VectorizableTree[0]->State == TreeEntry::StridedVectorize || - VectorizableTree[0]->State == TreeEntry::CompressVectorize || (ForReduction && AreVectorizableGathers(VectorizableTree[0].get(), VectorizableTree[0]->Scalars.size()) && @@ -13276,8 +13001,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { if (VectorizableTree[0]->isGather() || (VectorizableTree[1]->isGather() && VectorizableTree[0]->State != TreeEntry::ScatterVectorize && - VectorizableTree[0]->State != TreeEntry::StridedVectorize && - VectorizableTree[0]->State != TreeEntry::CompressVectorize)) + VectorizableTree[0]->State != TreeEntry::StridedVectorize)) return false; return true; @@ -17459,40 +17183,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); - } else if (E->State == TreeEntry::CompressVectorize) { - SmallVector PointerOps(E->Scalars.size()); - for (auto [I, V] : enumerate(E->Scalars)) - PointerOps[I] = cast(V)->getPointerOperand(); - bool IsMasked; - unsigned InterleaveFactor; - SmallVector CompressMask; - VectorType *LoadVecTy; - [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress( - E->Scalars, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, - *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor, - CompressMask, LoadVecTy); - assert(IsVectorized && "Expected to be vectorized"); - Align CommonAlignment; - if (IsMasked) - CommonAlignment = computeCommonAlignment(E->Scalars); - else - CommonAlignment = LI->getAlign(); - if (IsMasked) { - SmallVector MaskValues( - getNumElements(LoadVecTy) / getNumElements(LI->getType()), - ConstantInt::getFalse(VecTy->getContext())); - for (int I : CompressMask) - MaskValues[I] = ConstantInt::getTrue(VecTy->getContext()); - Constant *MaskValue = ConstantVector::get(MaskValues); - NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment, - MaskValue); - } else { - NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment); - } - NewLI = ::propagateMetadata(NewLI, E->Scalars); - // TODO: include this cost into CommonCost. - NewLI = - cast(Builder.CreateShuffleVector(NewLI, CompressMask)); } else if (E->State == TreeEntry::StridedVectorize) { Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); @@ -17562,9 +17252,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Align CommonAlignment = computeCommonAlignment(E->Scalars); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } - Value *V = E->State == TreeEntry::CompressVectorize - ? 
NewLI - : ::propagateMetadata(NewLI, E->Scalars); + Value *V = ::propagateMetadata(NewLI, E->Scalars); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -18166,14 +17854,11 @@ Value *BoUpSLP::vectorizeTree( ArrayRef UseEntries = getTreeEntries(U); return !UseEntries.empty() && (E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize || - E->State == TreeEntry::CompressVectorize) && + E->State == TreeEntry::StridedVectorize) && any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) { return (UseEntry->State == TreeEntry::Vectorize || UseEntry->State == - TreeEntry::StridedVectorize || - UseEntry->State == - TreeEntry::CompressVectorize) && + TreeEntry::StridedVectorize) && doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index b99a1c2d83394..aa9195f8c48ce 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,16 +15,19 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP8]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP12]], <8 x float> [[TMP13]], i64 8) +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP15]], <4 x float> [[TMP7]], i64 0) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP16]], <2 x float> [[TMP9]], i64 6) ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: store <16 
x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 +; CHECK-NEXT: store <16 x float> [[TMP18]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 ; CHECK-NEXT: ret void ; alloca_0: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll index 80ba7a40fb193..12263b065d89c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll @@ -9,9 +9,17 @@ define void @test() { ; CHECK-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[COND_IN_V]], align 8 +; CHECK-NEXT: [[BV:%.*]] = icmp eq i64 [[V]], 0 +; CHECK-NEXT: [[IN_1:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 4 +; CHECK-NEXT: [[V_1:%.*]] = load i64, ptr [[IN_1]], align 8 +; CHECK-NEXT: [[BV_1:%.*]] = icmp eq i64 [[V_1]], 0 +; CHECK-NEXT: [[IN_2:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 8 +; CHECK-NEXT: [[V_2:%.*]] = load i64, ptr [[IN_2]], align 8 +; CHECK-NEXT: [[BV_2:%.*]] = icmp eq i64 [[V_2]], 0 +; CHECK-NEXT: [[IN_3:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 12 +; CHECK-NEXT: [[V_3:%.*]] = load i64, ptr [[IN_3]], align 8 +; CHECK-NEXT: [[BV_3:%.*]] = icmp eq i64 [[V_3]], 0 ; CHECK-NEXT: ret void ; ; CHECK-SLP-THRESHOLD-LABEL: define void @test @@ -20,9 +28,11 @@ define void @test() { ; CHECK-SLP-THRESHOLD-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-SLP-THRESHOLD-NEXT: br label [[BB:%.*]] ; CHECK-SLP-THRESHOLD: bb: -; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) -; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> -; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer +; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0 +; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> +; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer ; CHECK-SLP-THRESHOLD-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index a9c0eb3f9f2b9..f249394c91788 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -24,16 +24,20 @@ define void @foo() { ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( -; AVX-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX-NEXT: 
[[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo( -; AVX512-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index a0e52c13ec621..925c334cb5f20 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -164,20 +164,36 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> 
@llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -274,30 +290,49 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 -; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; 
AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -412,30 +447,49 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 -; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], 
i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 +; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 +; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 +; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: 
ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -633,21 +687,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; 
AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 6c5638819dcea..dc1ba4ec7e7ab 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -164,20 +164,36 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; 
AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -274,30 +290,49 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 -; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = 
insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -412,30 +447,49 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 -; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds 
nuw i8, ptr [[T1:%.*]], i64 44 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 +; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 +; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 +; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; 
AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -633,21 +687,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x 
float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index bbb1b87fc3dfa..0807a1bd4cdea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -5,9 +5,10 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OFF0_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[OFF0_1]], i32 8, <15 x i1> , <15 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]] @@ -21,9 +22,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-10' + ; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '5' + ; YAML-NEXT: - TreeSize: '8' entry: %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 %idx0 = load i32, ptr %off0.1, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 02058b1fe8578..5bd954e741d43 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,17 +5,16 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP6]], <2 x i32> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> @@ -65,17 +64,16 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP12]], <2 x i32> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer @@ -127,17 +125,16 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 -; 
CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> @@ -187,17 +184,16 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index 73b6c80730935..c7c67d31f9ded 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -3,12 +3,14 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x ptr> [[TMP2]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> , <16 x float> poison) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> -; 
CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> , <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 92d5506977aeb..c114c5dee78e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -8,11 +8,14 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> , <6 x float> poison) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index d487e3616956c..1294a87ff6967 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,12 +5,16 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG:%.*]], i32 8, <5 x i1> , <5 x i64> poison) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x i64> [[TMP1]], <5 x i64> 
poison, <4 x i32> +; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG]], i32 8, <5 x i1> , <5 x i64> poison) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <5 x i64> [[TMP3]], <5 x i64> poison, <4 x i32> +; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index b4996eb58b47e..e1e80d96d416d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -19,11 +19,11 @@ define void @test() { ; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]]) ; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) ; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) -; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP13]], double [[SIN3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 3f684e414c8ba..202ec9633712f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -8,10 +8,15 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 +; CHECK-NEXT: 
[[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr [[ARRAYIDX20]], i32 4, <12 x i1> , <12 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index fdc0bc0e00eb8..8fe7d15b69cb1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,15 +4,16 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[ADDR]], i32 8, <15 x i1> , <15 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) -; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[GEP2]], i32 8, <15 x i1> , <15 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <15 x i32> [[TMP11]], <15 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] From 012e574d4dc3f2441fae05e3fa9c35f3fe1e310e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 3 Apr 2025 
21:01:58 +0100 Subject: [PATCH 0573/1029] [LV] Add FindLastIV test with truncated IV and epilogue vectorization. This adds missing test coverage for https://github.com/llvm/llvm-project/pull/132691. --- .../AArch64/epilog-iv-select-cmp.ll | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index 25404964d8058..7296cc0840dc0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -120,3 +120,129 @@ loop: exit: ret i8 %sel } + +define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { +; CHECK-LABEL: define i32 @select_icmp_var_start_iv_trunc( +; CHECK-SAME: i32 [[N:%.*]], i32 [[START:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]]) +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ splat (i32 -2147483648), %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i32> [[STEP_ADD_2]], splat (i32 4) +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3]] = select i1 [[TMP2]], <4 x i32> [[VEC_IND]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP4]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD]], <4 x i32> [[VEC_PHI2]] +; CHECK-NEXT: [[TMP5]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_2]], <4 x i32> [[VEC_PHI3]] +; CHECK-NEXT: [[TMP6]] = select i1 [[TMP2]], <4 x i32> [[STEP_ADD_3]], <4 x i32> [[VEC_PHI4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] 
= add <4 x i32> [[STEP_ADD_3]], splat (i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP4]]) +; CHECK-NEXT: [[RDX_MINMAX5:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX]], <4 x i32> [[TMP5]]) +; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX5]], <4 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]]) +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[START]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]] +; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i32> poison, i32 [[START]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT9]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT10]], zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[BC_RESUME_VAL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT13]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[DOTSPLAT14]], +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <4 x i32> [ [[DOTSPLAT]], %[[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i32> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP14]] = select i1 [[TMP13]], <4 x i32> [[VEC_IND15]], <4 x i32> [[VEC_PHI12]] +; CHECK-NEXT: [[INDEX_NEXT17]] = add nuw i64 [[INDEX11]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT16]] 
= add <4 x i32> [[VEC_IND15]], splat (i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT17]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]]) +; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648 +; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[START]] +; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL21:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX22:%.*]] = phi i32 [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL21]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX22]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[START]], 0 +; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-NEXT: [[RED_NEXT]] = select i1 [[C]], i32 [[IV_TRUNC]], i32 [[RED]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT19]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + %N.pos = icmp sgt i32 %N, 0 + call void @llvm.assume(i1 %N.pos) + %N.ext = zext i32 %N to i64 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i32 [ %start, %entry ], [ %red.next, %loop ] + %c = icmp eq i32 %start, 0 + %iv.trunc = trunc i64 %iv to i32 + %red.next = select i1 %c, i32 %iv.trunc, i32 %red + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %N.ext + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +declare void @llvm.assume(i1 noundef) + +attributes #0 = { "target-cpu"="apple-m1" } From daab7d08078bb7cd37c66b78a56f4773e6b12fba Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 3 Apr 2025 13:21:22 -0400 Subject: [PATCH 0574/1029] [SLP]Initial support for (masked)loads + compress and (masked)interleaved Added initial support for (masked)loads + compress and (masked)interleaved loads. 
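For illustration (a hand-written sketch, not IR taken from this patch's
tests; the base pointer %p, the element offsets and the vector widths are
made up): where a group of strided loads such as %p[0], %p[2], %p[4] and
%p[6] previously had to become a masked gather or stay scalar, SLP can now
emit one wide load over the covered range plus a compressing shuffle:

  %wide = load <7 x i32>, ptr %p, align 4
  %v = shufflevector <7 x i32> %wide, <7 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>

If the tail of the covered range is not known to be dereferenceable, the
wide load is emitted as a call to llvm.masked.load instead, and for suitable
strides the same analysis can select an interleaved load.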
Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/132099
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 358 ++++++++++++++++--
 .../X86/entries-shuffled-diff-sizes.ll        |  17 +-
 .../X86/gep-nodes-with-non-gep-inst.ll        |  22 +-
 .../Transforms/SLPVectorizer/X86/pr47623.ll   |  16 +-
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 162 +++-----
 .../Transforms/SLPVectorizer/X86/pr47629.ll   | 162 +++-----
 .../X86/remark_gather-load-redux-cost.ll      |  11 +-
 .../X86/reorder-possible-strided-node.ll      |  52 +--
 .../X86/reorder-reused-masked-gather.ll       |  12 +-
 .../X86/reorder-reused-masked-gather2.ll      |  11 +-
 .../X86/scatter-vectorize-reused-pointer.ll   |  12 +-
 .../Transforms/SLPVectorizer/X86/sin-sqrt.ll  |   8 +-
 .../SLPVectorizer/X86/split-load8_2-unord.ll  |  11 +-
 .../X86/split-load8_2_unord_geps.ll           |  11 +-
 14 files changed, 517 insertions(+), 348 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b82a66ca3b889..c384b11bbc1a5 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -1380,7 +1381,8 @@ class BoUpSLP {
     Gather,
     Vectorize,
     ScatterVectorize,
-    StridedVectorize
+    StridedVectorize,
+    CompressVectorize
   };

   using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
     Vectorize,         ///< The node is regularly vectorized.
     ScatterVectorize,  ///< Masked scatter/gather node.
     StridedVectorize,  ///< Strided loads (and stores)
+    CompressVectorize, ///< (Masked) load with compress.
     NeedToGather,      ///< Gather/buildvector node.
     CombinedVectorize, ///< Vectorized node, combined with its user into more
                        ///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
       case StridedVectorize:
         dbgs() << "StridedVectorize\n";
         break;
+      case CompressVectorize:
+        dbgs() << "CompressVectorize\n";
+        break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
@@ -4819,7 +4825,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
     if (Entry->isGather())
       return "color=red";
     if (Entry->State == TreeEntry::ScatterVectorize ||
-        Entry->State == TreeEntry::StridedVectorize)
+        Entry->State == TreeEntry::StridedVectorize ||
+        Entry->State == TreeEntry::CompressVectorize)
       return "color=blue";
     return "";
   }
@@ -5419,6 +5426,157 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
   return Builder.CreateShuffleVector(Vec, Mask);
 }

+/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
+/// with \p Order.
+/// \return true if the mask represents strided access, false - otherwise.
+static bool buildCompressMask(ArrayRef<Value *> PointerOps,
+                              ArrayRef<unsigned> Order, Type *ScalarTy,
+                              const DataLayout &DL, ScalarEvolution &SE,
+                              SmallVectorImpl<int> &CompressMask) {
+  const unsigned Sz = PointerOps.size();
+  CompressMask.assign(Sz, PoisonMaskElem);
+  // The first element is always set.
+  CompressMask[0] = 0;
+  // Check if the mask represents strided access.
+  std::optional<unsigned> Stride = 0;
+  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+  for (unsigned I : seq<unsigned>(1, Sz)) {
+    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+    unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+    CompressMask[I] = Pos;
+    if (!Stride)
+      continue;
+    if (*Stride == 0) {
+      *Stride = Pos;
+      continue;
+    }
+    if (Pos != *Stride * I)
+      Stride.reset();
+  }
+  return Stride.has_value();
+}
+
+/// Checks if the \p VL can be transformed to a (masked)load + compress or
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+    const DominatorTree &DT, const TargetLibraryInfo &TLI,
+    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+    VectorType *&LoadVecTy) {
+  InterleaveFactor = 0;
+  Type *ScalarTy = VL.front()->getType();
+  const unsigned Sz = VL.size();
+  auto *VecTy = getWidenedType(ScalarTy, Sz);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // Check external uses.
+  for (const auto [I, V] : enumerate(VL)) {
+    if (AreAllUsersVectorized(V))
+      continue;
+    InstructionCost ExtractCost =
+        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+    InstructionCost ScalarCost =
+        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+    if (ExtractCost <= ScalarCost)
+      return false;
+  }
+  Value *Ptr0;
+  Value *PtrN;
+  if (Order.empty()) {
+    Ptr0 = PointerOps.front();
+    PtrN = PointerOps.back();
+  } else {
+    Ptr0 = PointerOps[Order.front()];
+    PtrN = PointerOps[Order.back()];
+  }
+  std::optional<int> Diff =
+      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+  if (!Diff)
+    return false;
+  const unsigned MaxRegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+  // Check for very large distances between elements.
+  if (*Diff / Sz >= MaxRegSize / 8)
+    return false;
+  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+  IsMasked = !isSafeToLoadUnconditionally(
+      Ptr0, LoadVecTy, CommonAlignment, DL,
+      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+      &TLI);
+  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
+                                         LI->getPointerAddressSpace()))
+    return false;
+  // TODO: perform the analysis of each scalar load for better
+  // safe-load-unconditionally analysis.
+  bool IsStrided =
+      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+  assert(CompressMask.size() >= 2 && "At least two elements are required");
+  auto [ScalarGEPCost, VectorGEPCost] =
+      getGEPCosts(TTI, PointerOps, PointerOps.front(),
+                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+  // The cost of scalar loads.
+  InstructionCost ScalarLoadsCost =
+      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+                      [&](InstructionCost C, Value *V) {
+                        return C + TTI.getInstructionCost(cast<Instruction>(V),
+                                                          CostKind);
+                      }) +
+      ScalarGEPCost;
+  APInt DemandedElts = APInt::getAllOnes(Sz);
+  InstructionCost GatherCost =
+      getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true,
+                               /*Extract=*/false, CostKind) +
+      ScalarLoadsCost;
+  InstructionCost LoadCost = 0;
+  if (IsMasked) {
+    LoadCost =
+        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                                  LI->getPointerAddressSpace(), CostKind);
+  } else {
+    CommonAlignment = LI->getAlign();
+    LoadCost =
+        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                            LI->getPointerAddressSpace(), CostKind);
+  }
+  SmallVector<int> Mask;
+  if (!Order.empty())
+    inversePermutation(Order, Mask);
+  if (IsStrided) {
+    // Check for potential segmented(interleaved) loads.
+    if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
+                                         CommonAlignment,
+                                         LI->getPointerAddressSpace())) {
+      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+          Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
+          CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
+      if (!Mask.empty())
+        InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+                                            VecTy, Mask, CostKind);
+      if (InterleavedCost < GatherCost) {
+        InterleaveFactor = CompressMask[1];
+        return true;
+      }
+    }
+  }
+  if (!Order.empty()) {
+    SmallVector<int> NewMask(Sz, PoisonMaskElem);
+    for (unsigned I : seq<unsigned>(Sz)) {
+      NewMask[I] = CompressMask[Mask[I]];
+    }
+    CompressMask.swap(NewMask);
+  }
+  InstructionCost CompressCost = ::getShuffleCost(
+      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+  return TotalVecCost < GatherCost;
+}
+
 BoUpSLP::LoadsState
 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                            SmallVectorImpl<unsigned> &Order,
@@ -5490,9 +5648,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
     // Check that the sorted loads are consecutive.
     if (static_cast<unsigned>(*Diff) == Sz - 1)
       return LoadsState::Vectorize;
-    if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
-        TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
-      return LoadsState::Gather;
     // Simple check if not a strided access - clear order.
     bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
     // Try to generate strided load node if:
@@ -5548,7 +5703,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         }
       }
     }
+    bool IsMasked;
+    unsigned InterleaveFactor;
+    SmallVector<int> CompressMask;
+    VectorType *LoadVecTy;
+    if (isMaskedLoadCompress(
+            VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+            [&](Value *V) {
+              return areAllUsersVectorized(cast<Instruction>(V),
+                                           UserIgnoreList);
+            },
+            IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+      return LoadsState::CompressVectorize;
   }
+  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+    return LoadsState::Gather;
   // Correctly identify compare the cost of loads + shuffles rather than
   // strided/masked gather loads. Returns true if vectorized + shuffles
   // representation is better than just gather.
@@ -5641,7 +5811,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
       }
       // If need the reorder - consider as high-cost masked gather for now.
       if ((LS == LoadsState::Vectorize ||
-           LS == LoadsState::StridedVectorize) &&
+           LS == LoadsState::StridedVectorize ||
+           LS == LoadsState::CompressVectorize) &&
          !Order.empty() && !isReverseOrder(Order))
        LS = LoadsState::ScatterVectorize;
      States.push_back(LS);
@@ -5706,6 +5877,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                   CommonAlignment, CostKind) +
                 VectorGEPCost;
             break;
+          case LoadsState::CompressVectorize:
+            VecLdCost += TTI.getMaskedMemoryOpCost(
+                             Instruction::Load, SubVecTy, CommonAlignment,
+                             LI0->getPointerAddressSpace(), CostKind) +
+                         VectorGEPCost +
+                         ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+                                          SubVecTy, {}, CostKind);
+            break;
           case LoadsState::ScatterVectorize:
             VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                     LI0->getPointerOperand(),
@@ -6079,7 +6258,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
     return std::nullopt;
   if (TE.State == TreeEntry::SplitVectorize ||
       ((TE.State == TreeEntry::Vectorize ||
-        TE.State == TreeEntry::StridedVectorize) &&
+        TE.State == TreeEntry::StridedVectorize ||
+        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
       (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6266,7 +6446,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
     OrdersType CurrentOrder;
     LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                        CurrentOrder, PointerOps);
-    if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+    if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+        Res == LoadsState::CompressVectorize)
       return std::move(CurrentOrder);
   }
   // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6506,7 +6687,8 @@ void BoUpSLP::reorderTopToBottom() {
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize) ||
+            TE->State == TreeEntry::SplitVectorize ||
+            TE->State == TreeEntry::CompressVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
       if (TE->State == TreeEntry::Vectorize &&
@@ -6680,7 +6862,8 @@ void BoUpSLP::reorderTopToBottom() {
     if ((TE->State == TreeEntry::SplitVectorize &&
          TE->ReuseShuffleIndices.empty()) ||
         ((TE->State == TreeEntry::Vectorize ||
-          TE->State == TreeEntry::StridedVectorize) &&
+          TE->State == TreeEntry::StridedVectorize ||
+          TE->State == TreeEntry::CompressVectorize) &&
         (isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
              InsertElementInst>(TE->getMainOp()) ||
          (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6728,6 +6911,7 @@ bool BoUpSLP::canReorderOperands(
                  return OpData.first == I &&
                         (OpData.second->State == TreeEntry::Vectorize ||
                          OpData.second->State == TreeEntry::StridedVectorize ||
+                         OpData.second->State == TreeEntry::CompressVectorize ||
                          OpData.second->State == TreeEntry::SplitVectorize);
                }))
       continue;
@@ -6742,6 +6926,7 @@ bool BoUpSLP::canReorderOperands(
       // node, just reorder reuses mask.
       if (TE->State != TreeEntry::Vectorize &&
           TE->State != TreeEntry::StridedVectorize &&
+          TE->State != TreeEntry::CompressVectorize &&
           TE->State != TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
         GatherOps.push_back(TE);
@@ -6752,6 +6937,7 @@ bool BoUpSLP::canReorderOperands(
                    [&Gather, UserTE, I](TreeEntry *TE) {
                      assert(TE->State != TreeEntry::Vectorize &&
                             TE->State != TreeEntry::StridedVectorize &&
+                            TE->State != TreeEntry::CompressVectorize &&
                             TE->State != TreeEntry::SplitVectorize &&
                             "Only non-vectorized nodes are expected.");
                      if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6788,6 +6974,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
+        TE->State != TreeEntry::CompressVectorize &&
         TE->State != TreeEntry::SplitVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
@@ -6795,6 +6982,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       Queue.push(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::CompressVectorize ||
             TE->State == TreeEntry::SplitVectorize) ||
           !TE->ReuseShuffleIndices.empty())
         GathersToOrders.insert(TE.get());
@@ -6823,6 +7011,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     for (TreeEntry *TE : OrderedOps) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::CompressVectorize ||
             TE->State == TreeEntry::SplitVectorize ||
             (TE->isGather() && GathersToOrders.contains(TE))) ||
           !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -7117,6 +7306,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       // Gathers are processed separately.
       if (TE->State != TreeEntry::Vectorize &&
           TE->State != TreeEntry::StridedVectorize &&
+          TE->State != TreeEntry::CompressVectorize &&
           TE->State != TreeEntry::SplitVectorize &&
           (TE->State != TreeEntry::ScatterVectorize ||
            TE->ReorderIndices.empty()))
@@ -7149,7 +7339,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       Data.first->reorderOperands(Mask);
       if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
           Data.first->isAltShuffle() ||
-          Data.first->State == TreeEntry::StridedVectorize) {
+          Data.first->State == TreeEntry::StridedVectorize ||
+          Data.first->State == TreeEntry::CompressVectorize) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder,
                      /*BottomOrder=*/true);
@@ -7927,8 +8118,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                 });
                 if (It == Slice.end())
                   return false;
-                ArrayRef<Value *> VL =
-                    VectorizableTree[std::get<0>(P)]->Scalars;
+                const TreeEntry &TE =
+                    *VectorizableTree[std::get<0>(P)];
+                ArrayRef<Value *> VL = TE.Scalars;
+                OrdersType Order;
+                SmallVector<Value *> PointerOps;
+                LoadsState State = canVectorizeLoads(
+                    VL, VL.front(), Order, PointerOps);
+                if (State == LoadsState::ScatterVectorize ||
+                    State == LoadsState::CompressVectorize)
+                  return false;
                 ConsecutiveNodesSize += VL.size();
                 unsigned Start = std::distance(Slice.begin(), It);
                 unsigned Sz = Slice.size() - Start;
@@ -8393,23 +8592,44 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   // treats loading/storing it as an i8 struct. If we vectorize loads/stores
   // from such a struct, we read/write packed bits disagreeing with the
   // unvectorized version.
+ auto IsGatheredNode = [&]() { + if (!GatheredLoadsEntriesFirst) + return false; + return all_of(VL, [&](Value *V) { + if (isa(V)) + return true; + return any_of(getTreeEntries(V), [&](const TreeEntry *TE) { + return TE->Idx >= *GatheredLoadsEntriesFirst; + }); + }); + }; switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) { case LoadsState::Vectorize: return TreeEntry::Vectorize; + case LoadsState::CompressVectorize: + if (!IsGraphTransformMode && !VectorizableTree.empty()) { + // Delay slow vectorized nodes for better vectorization attempts. + LoadEntriesToVectorize.insert(VectorizableTree.size()); + return TreeEntry::NeedToGather; + } + return IsGatheredNode() ? TreeEntry::NeedToGather + : TreeEntry::CompressVectorize; case LoadsState::ScatterVectorize: if (!IsGraphTransformMode && !VectorizableTree.empty()) { // Delay slow vectorized nodes for better vectorization attempts. LoadEntriesToVectorize.insert(VectorizableTree.size()); return TreeEntry::NeedToGather; } - return TreeEntry::ScatterVectorize; + return IsGatheredNode() ? TreeEntry::NeedToGather + : TreeEntry::ScatterVectorize; case LoadsState::StridedVectorize: if (!IsGraphTransformMode && VectorizableTree.size() > 1) { // Delay slow vectorized nodes for better vectorization attempts. LoadEntriesToVectorize.insert(VectorizableTree.size()); return TreeEntry::NeedToGather; } - return TreeEntry::StridedVectorize; + return IsGatheredNode() ? TreeEntry::NeedToGather + : TreeEntry::StridedVectorize; case LoadsState::Gather: #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -9510,6 +9730,15 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, << "SLP: added a new TreeEntry (jumbled LoadInst).\n"; TE->dump()); break; + case TreeEntry::CompressVectorize: + // Vectorizing non-consecutive loads with (masked)load + compress. + TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S, + UserTreeIdx, ReuseShuffleIndices, CurrentOrder); + LLVM_DEBUG( + dbgs() + << "SLP: added a new TreeEntry (masked LoadInst + compress).\n"; + TE->dump()); + break; case TreeEntry::StridedVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, @@ -12041,6 +12270,8 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { if (TE.State == TreeEntry::ScatterVectorize || TE.State == TreeEntry::StridedVectorize) return TTI::CastContextHint::GatherScatter; + if (TE.State == TreeEntry::CompressVectorize) + return TTI::CastContextHint::Masked; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) { if (TE.ReorderIndices.empty()) @@ -12134,7 +12365,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::StridedVectorize) && + E->State == TreeEntry::StridedVectorize || + E->State == TreeEntry::CompressVectorize) && "Unhandled state"); assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || @@ -12225,8 +12457,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // Negative value means vectorizing is profitable. 
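At codegen time (the vectorizeTree changes further below) a CompressVectorize entry becomes one contiguous load over the footprint of the scalar loads (masked when the footprint has gaps), followed by a single-source shufflevector that compresses the demanded lanes: the same masked.load plus shufflevector shape visible in the updated tests. A freestanding IRBuilder sketch of that shape; the 6-lane footprint, the mask and the lane choices are made up purely for illustration:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("compress-demo", Ctx);
  auto *WideTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 6);
  Type *PtrTy = PointerType::getUnqual(Ctx);
  auto *FnTy = FunctionType::get(Type::getVoidTy(Ctx), {PtrTy}, false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  // Demand lanes 0, 2 and 5 of a 6-element footprint; the mask keeps the
  // gaps from being dereferenced and leaves them poison after the load.
  SmallVector<Constant *, 6> MaskBits(6, B.getFalse());
  for (int I : {0, 2, 5})
    MaskBits[I] = B.getTrue();
  Value *Load = B.CreateMaskedLoad(WideTy, F->getArg(0), Align(4),
                                   ConstantVector::get(MaskBits));
  // Compress the demanded lanes to the front of a <3 x i32> result.
  B.CreateShuffleVector(Load, {0, 2, 5}, "compressed");
  B.CreateRetVoid();
  M.print(outs(), nullptr); // Prints the masked.load + shufflevector pair.
}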
auto GetGEPCostDiff = [=](ArrayRef Ptrs, Value *BasePtr) { assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize) && - "Entry state expected to be Vectorize or StridedVectorize here."); + E->State == TreeEntry::StridedVectorize || + E->State == TreeEntry::CompressVectorize) && + "Entry state expected to be Vectorize, StridedVectorize or " + "MaskedLoadCompressVectorize here."); InstructionCost ScalarCost = 0; InstructionCost VecCost = 0; std::tie(ScalarCost, VecCost) = getGEPCosts( @@ -12689,6 +12923,45 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, /*VariableMask=*/false, CommonAlignment, CostKind); break; } + case TreeEntry::CompressVectorize: { + SmallVector PointerOps(VL.size()); + for (auto [I, V] : enumerate(VL)) + PointerOps[I] = cast(V)->getPointerOperand(); + bool IsMasked; + unsigned InterleaveFactor; + SmallVector CompressMask; + VectorType *LoadVecTy; + [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress( + VL, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, *TLI, + [](Value *) { return true; }, IsMasked, InterleaveFactor, + CompressMask, LoadVecTy); + assert(IsVectorized && "Expected to be vectorized"); + Align CommonAlignment; + if (IsMasked) + CommonAlignment = computeCommonAlignment(VL); + else + CommonAlignment = LI0->getAlign(); + if (InterleaveFactor) { + VecLdCost = TTI->getInterleavedMemoryOpCost( + Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt, + CommonAlignment, LI0->getPointerAddressSpace(), CostKind); + } else if (IsMasked) { + VecLdCost = TTI->getMaskedMemoryOpCost( + Instruction::Load, LoadVecTy, CommonAlignment, + LI0->getPointerAddressSpace(), CostKind); + // TODO: include this cost into CommonCost. + VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + LoadVecTy, CompressMask, CostKind); + } else { + VecLdCost = TTI->getMemoryOpCost( + Instruction::Load, LoadVecTy, CommonAlignment, + LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); + // TODO: include this cost into CommonCost. 
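(The statement just below finishes the plain-load branch of this case.) Overall, the CompressVectorize cost case prices one of three strategies reported by isMaskedLoadCompress: an interleaved wide load when the demanded lanes form a regular every-Nth pattern, a masked wide load plus a compressing permute when the footprint has gaps, and a plain wide load plus the permute otherwise. A toy restatement of that choice; the struct, the cost parameters and the numbers in main() are hypothetical stand-ins, not TTI queries:

#include <cstdio>

struct CompressShape {
  unsigned InterleaveFactor; // Non-zero: a strided/interleaved access.
  bool IsMasked;             // Gaps in the footprint need a masked load.
};

static int compressLoadCost(const CompressShape &S, int InterleavedCost,
                            int MaskedLoadCost, int PlainLoadCost,
                            int ShuffleCost) {
  if (S.InterleaveFactor)
    return InterleavedCost; // Lane selection folds into the load itself.
  if (S.IsMasked)
    return MaskedLoadCost + ShuffleCost; // Masked load, then compress.
  return PlainLoadCost + ShuffleCost;    // Plain wide load, then compress.
}

int main() {
  CompressShape S{/*InterleaveFactor=*/0, /*IsMasked=*/true};
  std::printf("VecLdCost = %d\n", compressLoadCost(S, 6, 4, 2, 1));
}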
+ VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + LoadVecTy, CompressMask, CostKind); + } + break; + } case TreeEntry::ScatterVectorize: { Align CommonAlignment = computeCommonAlignment(UniqueValues.getArrayRef()); @@ -12978,6 +13251,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { if (VectorizableTree.size() == 1 && (VectorizableTree[0]->State == TreeEntry::Vectorize || VectorizableTree[0]->State == TreeEntry::StridedVectorize || + VectorizableTree[0]->State == TreeEntry::CompressVectorize || (ForReduction && AreVectorizableGathers(VectorizableTree[0].get(), VectorizableTree[0]->Scalars.size()) && @@ -13001,7 +13275,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { if (VectorizableTree[0]->isGather() || (VectorizableTree[1]->isGather() && VectorizableTree[0]->State != TreeEntry::ScatterVectorize && - VectorizableTree[0]->State != TreeEntry::StridedVectorize)) + VectorizableTree[0]->State != TreeEntry::StridedVectorize && + VectorizableTree[0]->State != TreeEntry::CompressVectorize)) return false; return true; @@ -17183,6 +17458,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); + } else if (E->State == TreeEntry::CompressVectorize) { + SmallVector PointerOps(E->Scalars.size()); + for (auto [I, V] : enumerate(E->Scalars)) + PointerOps[I] = cast(V)->getPointerOperand(); + bool IsMasked; + unsigned InterleaveFactor; + SmallVector CompressMask; + VectorType *LoadVecTy; + [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress( + E->Scalars, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, + *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor, + CompressMask, LoadVecTy); + assert(IsVectorized && "Expected to be vectorized"); + Align CommonAlignment; + if (IsMasked) + CommonAlignment = computeCommonAlignment(E->Scalars); + else + CommonAlignment = LI->getAlign(); + if (IsMasked) { + SmallVector MaskValues( + getNumElements(LoadVecTy) / getNumElements(LI->getType()), + ConstantInt::getFalse(VecTy->getContext())); + for (int I : CompressMask) + MaskValues[I] = ConstantInt::getTrue(VecTy->getContext()); + Constant *MaskValue = ConstantVector::get(MaskValues); + NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment, + MaskValue); + } else { + NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment); + } + NewLI = ::propagateMetadata(NewLI, E->Scalars); + // TODO: include this cost into CommonCost. + NewLI = + cast(Builder.CreateShuffleVector(NewLI, CompressMask)); } else if (E->State == TreeEntry::StridedVectorize) { Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); @@ -17252,7 +17561,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Align CommonAlignment = computeCommonAlignment(E->Scalars); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } - Value *V = ::propagateMetadata(NewLI, E->Scalars); + Value *V = E->State == TreeEntry::CompressVectorize + ? 
NewLI + : ::propagateMetadata(NewLI, E->Scalars); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -17854,11 +18165,14 @@ Value *BoUpSLP::vectorizeTree( ArrayRef UseEntries = getTreeEntries(U); return !UseEntries.empty() && (E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize) && + E->State == TreeEntry::StridedVectorize || + E->State == TreeEntry::CompressVectorize) && any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) { return (UseEntry->State == TreeEntry::Vectorize || UseEntry->State == - TreeEntry::StridedVectorize) && + TreeEntry::StridedVectorize || + UseEntry->State == + TreeEntry::CompressVectorize) && doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index aa9195f8c48ce..b99a1c2d83394 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,19 +15,16 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP12]], <8 x float> [[TMP13]], i64 8) -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP15]], <4 x float> [[TMP7]], i64 0) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP16]], <2 x float> [[TMP9]], i64 6) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] -; CHECK-NEXT: store <16 x float> [[TMP18]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> 
[[TMP18]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 ; CHECK-NEXT: ret void ; alloca_0: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll index 12263b065d89c..80ba7a40fb193 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll @@ -9,17 +9,9 @@ define void @test() { ; CHECK-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[COND_IN_V]], align 8 -; CHECK-NEXT: [[BV:%.*]] = icmp eq i64 [[V]], 0 -; CHECK-NEXT: [[IN_1:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 4 -; CHECK-NEXT: [[V_1:%.*]] = load i64, ptr [[IN_1]], align 8 -; CHECK-NEXT: [[BV_1:%.*]] = icmp eq i64 [[V_1]], 0 -; CHECK-NEXT: [[IN_2:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 8 -; CHECK-NEXT: [[V_2:%.*]] = load i64, ptr [[IN_2]], align 8 -; CHECK-NEXT: [[BV_2:%.*]] = icmp eq i64 [[V_2]], 0 -; CHECK-NEXT: [[IN_3:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 12 -; CHECK-NEXT: [[V_3:%.*]] = load i64, ptr [[IN_3]], align 8 -; CHECK-NEXT: [[BV_3:%.*]] = icmp eq i64 [[V_3]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret void ; ; CHECK-SLP-THRESHOLD-LABEL: define void @test @@ -28,11 +20,9 @@ define void @test() { ; CHECK-SLP-THRESHOLD-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-SLP-THRESHOLD-NEXT: br label [[BB:%.*]] ; CHECK-SLP-THRESHOLD: bb: -; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0 -; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> -; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) -; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer +; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) +; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> +; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-SLP-THRESHOLD-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index f249394c91788..a9c0eb3f9f2b9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -24,20 +24,16 @@ define void @foo() { ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( -; AVX-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 -; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 
[[TMP2]], i64 1 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo( -; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 -; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 925c334cb5f20..a0e52c13ec621 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; 
AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 -; AVX2-NEXT: 
[[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 +; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; AVX2-NEXT: [[T14:%.*]] = 
getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 -; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 +; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; 
AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call 
<45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index dc1ba4ec7e7ab..6c5638819dcea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: 
[[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 -; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 +; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> 
poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 -; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: 
[[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 +; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> 
[[TMP4]], -; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x 
float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index 0807a1bd4cdea..bbb1b87fc3dfa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -5,10 +5,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[OFF0_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[OFF0_1]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]] @@ -22,9 +21,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-1' + ; YAML-NEXT: - Cost: '-10' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '8' + ; YAML-NEXT: - TreeSize: '5' entry: %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 %idx0 = load i32, ptr %off0.1, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 5bd954e741d43..02058b1fe8578 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,16 +5,17 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> -; 
CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP6]], <2 x i32> [[TMP10]], i64 0) ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> @@ -64,16 +65,17 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP12]], <2 x i32> [[TMP10]], i64 0) ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer @@ -125,16 +127,17 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; 
CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> @@ -184,16 +187,17 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index c7c67d31f9ded..73b6c80730935 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -3,14 +3,12 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x ptr> [[TMP2]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> +; 
CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> , <16 x float> poison) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index c114c5dee78e9..92d5506977aeb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -8,14 +8,11 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> , <6 x float> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index 1294a87ff6967..d487e3616956c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,16 +5,12 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 
8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG:%.*]], i32 8, <5 x i1> , <5 x i64> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x i64> [[TMP1]], <5 x i64> poison, <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG]], i32 8, <5 x i1> , <5 x i64> poison) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <5 x i64> [[TMP3]], <5 x i64> poison, <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index e1e80d96d416d..b4996eb58b47e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -19,11 +19,11 @@ define void @test() { ; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]]) ; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) ; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP13]], double [[SIN3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 202ec9633712f..3f684e414c8ba 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -8,15 +8,10 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[TMP0:%.*]] = load 
<2 x i32>, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr [[ARRAYIDX20]], i32 4, <12 x i1> , <12 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index 8fe7d15b69cb1..fdc0bc0e00eb8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,16 +4,15 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[ADDR]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[GEP2]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <15 x i32> [[TMP11]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] From b93376f899824ad8543b961f2f79de6841034090 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Thu, 3 Apr 2025 23:18:04 +0300 
Subject: [PATCH 0575/1029] [libc++][type_traits]
 `reference_{constructs|converts}_from_temporary` with
 `-Winvalid-specialization` tests (#133946)

Addresses comment:
https://github.com/llvm/llvm-project/pull/128649/files#r2022341035

---------

Co-authored-by: Hristo Hristov
---
 .../libcxx/type_traits/no_specializations.verify.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp
index 807d01e381b49..38560161f162e 100644
--- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp
+++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp
@@ -156,6 +156,12 @@ SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be speciali
 # if TEST_STD_VER >= 23
 SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}}
+# if __has_builtin(__reference_constructs_from_temporary)
+SPECIALIZE_BTT(reference_constructs_from_temporary); // expected-error 2 {{cannot be specialized}}
+# endif
+# if __has_builtin(__reference_converts_from_temporary)
+SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}}
+# endif
 # endif

 # if TEST_STD_VER >= 26
@@ -177,8 +183,8 @@ struct std::conditional; // expected-error {{cannot be specialized}}
 template <>
 struct std::enable_if; // expected-error {{cannot be specialized}}

-#if TEST_STD_VER >= 20
+# if TEST_STD_VER >= 20
 template <>
 struct std::integral_constant; // expected-error {{cannot be specialized}}
-#endif
+# endif
 #endif

From 61768b35285ef73b88146946dddfef045b353708 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Thu, 3 Apr 2025 13:21:39 -0700
Subject: [PATCH 0576/1029] [ctxprof] Don't import roots elsewhere (#134012)

Block a context root from being imported by its callers. Suppose that
happened. Its caller - usually a message pump - inlines its copy of the
root. Then it (the root) and whatever it calls will be the
non-contextually optimized callee versions.

---
 llvm/lib/Transforms/IPO/FunctionImport.cpp | 18 ++++++++++++++
 .../ThinLTO/X86/ctxprof-separate-module.ll | 24 ++++++++++++++++---
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 43807a8feb36e..05c41eb8d908b 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -516,6 +516,7 @@ class ModuleImportsManager {
                        const ModuleSummaryIndex &Index,
                        DenseMap *ExportLists = nullptr)
       : IsPrevailing(IsPrevailing), Index(Index), ExportLists(ExportLists) {}
+  virtual bool canImport(ValueInfo VI) { return true; }

 public:
   virtual ~ModuleImportsManager() = default;
@@ -544,6 +545,11 @@ class WorkloadImportsManager : public ModuleImportsManager {
   // determine if a module's import list should be done by the base
   // ModuleImportsManager or by us.
   StringMap> Workloads;
+  // Track the roots to avoid importing them due to other callers. We want
+  // there to be only one variant, which we optimize according to the
+  // contextual profile. "Variants" refers to copies created by importing -
+  // we want there to be just one instance of this function.
+ DenseSet Roots; void computeImportForModule(const GVSummaryMapTy &DefinedGVSummaries, @@ -783,12 +789,15 @@ class WorkloadImportsManager : public ModuleImportsManager { } auto &Set = Workloads[RootDefiningModule]; Root.getContainedGuids(ContainedGUIDs); + Roots.insert(RootVI); for (auto Guid : ContainedGUIDs) if (auto VI = Index.getValueInfo(Guid)) Set.insert(VI); } } + bool canImport(ValueInfo VI) override { return !Roots.contains(VI); } + public: WorkloadImportsManager( function_ref @@ -886,6 +895,15 @@ void ModuleImportsManager::computeImportForFunction( continue; } + if (!canImport(VI)) { + LLVM_DEBUG( + dbgs() << "Skipping over " << VI.getGUID() + << " because its import is handled in a different module."); + assert(VI.getSummaryList().size() == 1 && + "The root was expected to be an external symbol"); + continue; + } + auto GetBonusMultiplier = [](CalleeInfo::HotnessType Hotness) -> float { if (Hotness == CalleeInfo::HotnessType::Hot) return ImportHotMultiplier; diff --git a/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll b/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll index c7891d336cc89..391fe21a1b638 100644 --- a/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll +++ b/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll @@ -1,3 +1,4 @@ +; REQUIRES: asserts ; Test workload based importing via -thinlto-pgo-ctx-prof with moving the whole ; graph to a new module. ; Use external linkage symbols so we don't depend on module paths which are @@ -10,19 +11,25 @@ ; ; RUN: opt -module-summary -passes=assign-guid,ctx-instr-gen %t/m1.ll -o %t/m1.bc ; RUN: opt -module-summary -passes=assign-guid,ctx-instr-gen %t/m2.ll -o %t/m2.bc +; RUN: opt -module-summary -passes=assign-guid,ctx-instr-gen %t/m3.ll -o %t/m3.bc ; RUN: opt -module-summary -passes=assign-guid,ctx-instr-gen %t/6019442868614718803.ll -o %t/6019442868614718803.bc ; RUN: llvm-ctxprof-util fromYAML --input %t/ctxprof.yaml --output %t/ctxprof.bitstream -; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc %t/6019442868614718803.bc -thinlto-move-ctxprof-trees \ +; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc %t/m3.bc %t/6019442868614718803.bc -thinlto-move-ctxprof-trees \ ; RUN: -o %t/result.o -save-temps \ ; RUN: -use-ctx-profile=%t/ctxprof.bitstream \ ; RUN: -r %t/m1.bc,m1_f1,plx \ -; RUN: -r %t/m2.bc,m2_f1,plx -; RUN: llvm-dis %t/result.o.3.3.import.bc -o - | FileCheck %s +; RUN: -r %t/m2.bc,m2_f1,plx \ +; RUN: -r %t/m3.bc,m1_f1 \ +; RUN: -r %t/m3.bc,m3_f1,plx -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=ABSENT-MSG +; RUN: llvm-dis %t/result.o.4.3.import.bc -o - | FileCheck %s +; RUN: llvm-dis %t/result.o.3.3.import.bc -o - | FileCheck %s --check-prefix=ABSENT ; ; ; CHECK: m1_f1() ; CHECK: m2_f1() +; ABSENT: declare void @m1_f1() +; ABSENT-MSG: Skipping over 6019442868614718803 because its import is handled in a different module. 
; ;--- ctxprof.yaml Contexts: @@ -51,6 +58,17 @@ define dso_local void @m2_f1() { ret void } +;--- m3.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +declare void @m1_f1() + +define dso_local void @m3_f1() { + call void @m1_f1() + ret void +} + ;--- 6019442868614718803.ll target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" From cdff7f0b6ebe48c7d99079db002855be7716a7a9 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 3 Apr 2025 21:46:48 +0100 Subject: [PATCH 0577/1029] [LV] Retrieve middle VPBB via scalar ph to fix epilogue resumephis (NFC) If ScalarPH has predecessors, we may need to update its reduction resume values. If there is a middle block, it must be the first predecessor. Note that the first predecessor may not be the middle block, if the middle block doesn't branch to the scalar preheader. In that case, fixReductionScalarResumeWhenVectorizingEpilog will be a no-op. In preparation for https://github.com/llvm/llvm-project/pull/106748. --- .../Transforms/Vectorize/LoopVectorize.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 54ccaefdad246..0291a8bfd9674 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7786,7 +7786,6 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan.execute(&State); - auto *MiddleVPBB = BestVPlan.getMiddleBlock(); // 2.5 When vectorizing the epilogue, fix reduction resume values from the // additional bypass block. if (VectorizingEpilogue) { @@ -7801,10 +7800,20 @@ DenseMap LoopVectorizationPlanner::executePlan( Phi.addIncoming(Phi.getIncomingValueForBlock(BypassBlock), Pred); } } - - for (VPRecipeBase &R : *MiddleVPBB) { - fixReductionScalarResumeWhenVectorizingEpilog( - &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); + VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader(); + ArrayRef ScalarPreds = ScalarPH->getPredecessors(); + if (!ScalarPreds.empty()) { + // If ScalarPH has predecessors, we may need to update its reduction + // resume values. If there is a middle block, it must be the first + // predecessor. Note that the first predecessor may not be the middle + // block, if the middle block doesn't branch to the scalar preheader. In + // that case, fixReductionScalarResumeWhenVectorizingEpilog will be a + // no-op. + auto *MiddleVPBB = cast(ScalarPreds[0]); + for (VPRecipeBase &R : *MiddleVPBB) { + fixReductionScalarResumeWhenVectorizingEpilog( + &R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock); + } } } From 5f99e0d4b9ea071e29a9cba75619d26811ff76c2 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 3 Apr 2025 13:51:17 -0700 Subject: [PATCH 0578/1029] [lldb] Use the "reverse video" effect when colors are disabled. (#134203) When you run lldb without colors (`-X`), the status line looks weird because it doesn't have a background. You end up with what appears to be floating text at the bottom of your terminal. This patch changes the statusline to use the reverse video effect, even when colors are off. The effect doesn't introduce any new colors and just inverts the foreground and background color. 
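For illustration, the change relies only on standard SGR attributes: ESC[7m
enables reverse video and ESC[0m restores the normal rendition. A minimal
standalone sketch of the effect (hypothetical demo code, not lldb's):

    #include <cstdio>

    // Print StatusText with the terminal's foreground and background
    // swapped, then reset. "\x1b[7m" = SGR 7 (reverse video),
    // "\x1b[0m" = SGR 0 (normal rendition).
    static void printReversed(const char *StatusText) {
      std::printf("\x1b[7m%s\x1b[0m\n", StatusText);
    }

    int main() { printReversed("a.out | stopped | breakpoint 1.1"); return 0; }

The sketch hard-codes the escape sequences; lldb itself emits them through
its own ANSI definitions, so the statusline format can still control the
appearance when colors are enabled.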
I considered an alternative approach which changes the behavior of the `-X` option, so that turning off colors doesn't prevent emitting non-color related control characters such as bold, underline, and reverse video. I decided to go with this more targeted fix as (1) nobody is asking for this more general change and (2) it introduces significant complexity to plumb this through using a setting and driver flag so that it can be disabled when running the tests. Fixes #134112. --- lldb/source/Core/Statusline.cpp | 7 ++++++ .../statusline/TestStatusline.py | 22 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/lldb/source/Core/Statusline.cpp b/lldb/source/Core/Statusline.cpp index c18fbb6c5561e..b7650503e16bc 100644 --- a/lldb/source/Core/Statusline.cpp +++ b/lldb/source/Core/Statusline.cpp @@ -27,6 +27,7 @@ #define ANSI_CLEAR_LINE ESCAPE "[2K" #define ANSI_SET_SCROLL_ROWS ESCAPE "[0;%ur" #define ANSI_TO_START_OF_ROW ESCAPE "[%u;0f" +#define ANSI_REVERSE_VIDEO ESCAPE "[7m" #define ANSI_UP_ROWS ESCAPE "[%dA" using namespace lldb; @@ -74,6 +75,12 @@ void Statusline::Draw(std::string str) { locked_stream << ANSI_SAVE_CURSOR; locked_stream.Printf(ANSI_TO_START_OF_ROW, static_cast(m_terminal_height)); + + // Use "reverse video" to make sure the statusline has a background. Only do + // this when colors are disabled, and rely on the statusline format otherwise. + if (!m_debugger.GetUseColor()) + locked_stream << ANSI_REVERSE_VIDEO; + locked_stream << str; locked_stream << ANSI_NORMAL; locked_stream << ANSI_RESTORE_CURSOR; diff --git a/lldb/test/API/functionalities/statusline/TestStatusline.py b/lldb/test/API/functionalities/statusline/TestStatusline.py index a58dc5470ed6d..747a7a14e0629 100644 --- a/lldb/test/API/functionalities/statusline/TestStatusline.py +++ b/lldb/test/API/functionalities/statusline/TestStatusline.py @@ -55,3 +55,25 @@ def test(self): self.expect( "set set show-statusline false", ["\x1b[0;{}r".format(terminal_height)] ) + + # PExpect uses many timeouts internally and doesn't play well + # under ASAN on a loaded machine.. + @skipIfAsan + def test_no_color(self): + """Basic test for the statusline with colors disabled.""" + self.build() + self.launch(use_colors=False) + self.do_setup() + + # Change the terminal dimensions. + terminal_height = 10 + terminal_width = 60 + self.child.setwinsize(terminal_height, terminal_width) + + # Enable the statusline and check for the "reverse video" control character. + self.expect( + "set set show-statusline true", + [ + "\x1b[7m", + ], + ) From 7d3dfc862d283319d01997c0672c50b4a082aa4e Mon Sep 17 00:00:00 2001 From: Henry Jiang Date: Thu, 3 Apr 2025 17:01:18 -0400 Subject: [PATCH 0579/1029] [JITLink][XCOFF] Setup initial build support for XCOFF (#127266) This patch starts the initial implementation of JITLink for XCOFF (Object format for AIX). 
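As a rough usage sketch of the two entry points added here (a hypothetical
caller, not part of this patch; Ctx stands in for a caller-supplied
JITLinkContext and error handling is abbreviated):

    // Build a LinkGraph from an XCOFF64 object and hand it to JITLink.
    auto Buf = llvm::MemoryBuffer::getFile("prog.o"); // XCOFF64 relocatable
    if (!Buf)
      llvm::report_fatal_error("cannot open prog.o");
    auto SSP = std::make_shared<llvm::orc::SymbolStringPool>();
    auto G = llvm::jitlink::createLinkGraphFromXCOFFObject(
        (*Buf)->getMemBufferRef(), std::move(SSP));
    if (!G)
      llvm::report_fatal_error(G.takeError());
    // Ctx: a caller-supplied std::unique_ptr<JITLinkContext> (assumed).
    llvm::jitlink::link_XCOFF(std::move(*G), std::move(Ctx));

createLinkGraphFromXCOFFObject checks the file magic and, for now, always
dispatches to the ppc64 backend, the only XCOFF target in this patch.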
--- .../llvm/ExecutionEngine/JITLink/XCOFF.h | 37 ++ .../ExecutionEngine/JITLink/XCOFF_ppc64.h | 37 ++ .../ExecutionEngine/JITLink/CMakeLists.txt | 5 + llvm/lib/ExecutionEngine/JITLink/JITLink.cpp | 5 + llvm/lib/ExecutionEngine/JITLink/XCOFF.cpp | 43 ++ .../JITLink/XCOFFLinkGraphBuilder.cpp | 415 ++++++++++++++++++ .../JITLink/XCOFFLinkGraphBuilder.h | 63 +++ .../ExecutionEngine/JITLink/XCOFF_ppc64.cpp | 121 +++++ .../ExecutionEngine/Orc/LoadLinkableFile.cpp | 16 + .../Orc/ObjectFileInterface.cpp | 50 +++ llvm/lib/Object/XCOFFObjectFile.cpp | 10 +- .../JITLink/ppc64/XCOFF_ppc64.ll | 24 + 12 files changed, 823 insertions(+), 3 deletions(-) create mode 100644 llvm/include/llvm/ExecutionEngine/JITLink/XCOFF.h create mode 100644 llvm/include/llvm/ExecutionEngine/JITLink/XCOFF_ppc64.h create mode 100644 llvm/lib/ExecutionEngine/JITLink/XCOFF.cpp create mode 100644 llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp create mode 100644 llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.h create mode 100644 llvm/lib/ExecutionEngine/JITLink/XCOFF_ppc64.cpp create mode 100644 llvm/test/ExecutionEngine/JITLink/ppc64/XCOFF_ppc64.ll diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/XCOFF.h b/llvm/include/llvm/ExecutionEngine/JITLink/XCOFF.h new file mode 100644 index 0000000000000..3d181d0786eb7 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/XCOFF.h @@ -0,0 +1,37 @@ +//===------- XCOFF.h - Generic JIT link function for XCOFF ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for XCOFF. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_XCOFF_H +#define LLVM_EXECUTIONENGINE_JITLINK_XCOFF_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm { +namespace jitlink { + +/// Create a LinkGraph from an XCOFF relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +Expected> +createLinkGraphFromXCOFFObject(MemoryBufferRef ObjectBuffer, + std::shared_ptr SSP); + +/// Link the given graph. +void link_XCOFF(std::unique_ptr G, + std::unique_ptr Ctx); + +} // namespace jitlink +} // namespace llvm + +#endif // LLVM_EXECUTIONENGINE_JITLINK_XCOFF_H diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/XCOFF_ppc64.h b/llvm/include/llvm/ExecutionEngine/JITLink/XCOFF_ppc64.h new file mode 100644 index 0000000000000..ec5c8a37bda27 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/JITLink/XCOFF_ppc64.h @@ -0,0 +1,37 @@ +//===------ XCOFF_ppc64.h - JIT link functions for XCOFF/ppc64 ------*- C++ +//-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// jit-link functions for XCOFF/ppc64. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_JITLINK_XCOFF_PPC64_H +#define LLVM_EXECUTIONENGINE_JITLINK_XCOFF_PPC64_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" + +namespace llvm::jitlink { + +/// Create a LinkGraph from an XCOFF/ppc64 relocatable object. +/// +/// Note: The graph does not take ownership of the underlying buffer, nor copy +/// its contents. The caller is responsible for ensuring that the object buffer +/// outlives the graph. +/// +Expected> createLinkGraphFromXCOFFObject_ppc64( + MemoryBufferRef ObjectBuffer, std::shared_ptr SSP); + +/// jit-link the given object buffer, which must be a XCOFF ppc64 object file. +/// +void link_XCOFF_ppc64(std::unique_ptr G, + std::unique_ptr Ctx); + +} // end namespace llvm::jitlink + +#endif // LLVM_EXECUTIONENGINE_JITLINK_XCOFF_PPC64_H diff --git a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt index 65dd0c7468ae1..22e4513e1374c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/JITLink/CMakeLists.txt @@ -35,6 +35,11 @@ add_llvm_component_library(LLVMJITLink COFFLinkGraphBuilder.cpp COFF_x86_64.cpp + # XCOFF + XCOFF.cpp + XCOFF_ppc64.cpp + XCOFFLinkGraphBuilder.cpp + # Architectures: aarch32.cpp aarch64.cpp diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index e8ce9b2b9527d..15a8fcf312ade 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -13,6 +13,7 @@ #include "llvm/ExecutionEngine/JITLink/COFF.h" #include "llvm/ExecutionEngine/JITLink/ELF.h" #include "llvm/ExecutionEngine/JITLink/MachO.h" +#include "llvm/ExecutionEngine/JITLink/XCOFF.h" #include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/ExecutionEngine/JITLink/i386.h" #include "llvm/ExecutionEngine/JITLink/loongarch.h" @@ -501,6 +502,8 @@ createLinkGraphFromObject(MemoryBufferRef ObjectBuffer, return createLinkGraphFromELFObject(ObjectBuffer, std::move(SSP)); case file_magic::coff_object: return createLinkGraphFromCOFFObject(ObjectBuffer, std::move(SSP)); + case file_magic::xcoff_object_64: + return createLinkGraphFromXCOFFObject(ObjectBuffer, std::move(SSP)); default: return make_error("Unsupported file format"); }; @@ -532,6 +535,8 @@ void link(std::unique_ptr G, std::unique_ptr Ctx) { return link_ELF(std::move(G), std::move(Ctx)); case Triple::COFF: return link_COFF(std::move(G), std::move(Ctx)); + case Triple::XCOFF: + return link_XCOFF(std::move(G), std::move(Ctx)); default: Ctx->notifyFailed(make_error("Unsupported object format")); }; diff --git a/llvm/lib/ExecutionEngine/JITLink/XCOFF.cpp b/llvm/lib/ExecutionEngine/JITLink/XCOFF.cpp new file mode 100644 index 0000000000000..cb026538632a9 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/XCOFF.cpp @@ -0,0 +1,43 @@ +//===-------------- XCOFF.cpp - JIT linker function for XCOFF -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// XCOFF jit-link function. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/JITLink/XCOFF.h" +#include "llvm/ExecutionEngine/JITLink/XCOFF_ppc64.h" +#include "llvm/Object/XCOFFObjectFile.h" + +using namespace llvm; + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { + +Expected> +createLinkGraphFromXCOFFObject(MemoryBufferRef ObjectBuffer, + std::shared_ptr SSP) { + // Check magic + file_magic Magic = identify_magic(ObjectBuffer.getBuffer()); + if (Magic != file_magic::xcoff_object_64) + return make_error("Invalid XCOFF 64 Header"); + + // TODO: See if we need to add more checks + // + return createLinkGraphFromXCOFFObject_ppc64(ObjectBuffer, std::move(SSP)); +} + +void link_XCOFF(std::unique_ptr G, + std::unique_ptr Ctx) { + link_XCOFF_ppc64(std::move(G), std::move(Ctx)); +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp new file mode 100644 index 0000000000000..b871ecfb4f0d8 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp @@ -0,0 +1,415 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic XCOFF LinkGraph building code. +// +//===----------------------------------------------------------------------===// + +#include "XCOFFLinkGraphBuilder.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/BinaryFormat/XCOFF.h" +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/JITLink/ppc64.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" +#include "llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/XCOFFObjectFile.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "jitlink" + +namespace llvm { +namespace jitlink { + +XCOFFLinkGraphBuilder::XCOFFLinkGraphBuilder( + const object::XCOFFObjectFile &Obj, + std::shared_ptr SSP, Triple TT, + SubtargetFeatures Features, + LinkGraph::GetEdgeKindNameFunction GetEdgeKindName) + : Obj(Obj), + G(std::make_unique( + std::string(Obj.getFileName()), std::move(SSP), std::move(TT), + std::move(Features), std::move(GetEdgeKindName))) {} + +static llvm::StringRef getStorageClassString(XCOFF::StorageClass SC) { + switch (SC) { + case XCOFF::StorageClass::C_FILE: + return "C_FILE (File name)"; + case XCOFF::StorageClass::C_BINCL: + return "C_BINCL (Beginning of include file)"; + case XCOFF::StorageClass::C_EINCL: + return "C_EINCL (Ending of include file)"; + case XCOFF::StorageClass::C_GSYM: + return "C_GSYM (Global variable)"; + case XCOFF::StorageClass::C_STSYM: + return "C_STSYM (Statically allocated symbol)"; + case XCOFF::StorageClass::C_BCOMM: + return "C_BCOMM (Beginning of common block)"; + case XCOFF::StorageClass::C_ECOMM: + return "C_ECOMM (End of common block)"; + case XCOFF::StorageClass::C_ENTRY: + return "C_ENTRY (Alternate entry)"; + case XCOFF::StorageClass::C_BSTAT: + return "C_BSTAT (Beginning of static block)"; + case XCOFF::StorageClass::C_ESTAT: + return "C_ESTAT (End of static block)"; + case XCOFF::StorageClass::C_GTLS: + 
return "C_GTLS (Global thread-local variable)"; + case XCOFF::StorageClass::C_STTLS: + return "C_STTLS (Static thread-local variable)"; + case XCOFF::StorageClass::C_DWARF: + return "C_DWARF (DWARF section symbol)"; + case XCOFF::StorageClass::C_LSYM: + return "C_LSYM (Automatic variable allocated on stack)"; + case XCOFF::StorageClass::C_PSYM: + return "C_PSYM (Argument to subroutine allocated on stack)"; + case XCOFF::StorageClass::C_RSYM: + return "C_RSYM (Register variable)"; + case XCOFF::StorageClass::C_RPSYM: + return "C_RPSYM (Argument to function stored in register)"; + case XCOFF::StorageClass::C_ECOML: + return "C_ECOML (Local member of common block)"; + case XCOFF::StorageClass::C_FUN: + return "C_FUN (Function or procedure)"; + case XCOFF::StorageClass::C_EXT: + return "C_EXT (External symbol)"; + case XCOFF::StorageClass::C_WEAKEXT: + return "C_WEAKEXT (Weak external symbol)"; + case XCOFF::StorageClass::C_NULL: + return "C_NULL"; + case XCOFF::StorageClass::C_STAT: + return "C_STAT (Static)"; + case XCOFF::StorageClass::C_BLOCK: + return "C_BLOCK (\".bb\" or \".eb\")"; + case XCOFF::StorageClass::C_FCN: + return "C_FCN (\".bf\" or \".ef\")"; + case XCOFF::StorageClass::C_HIDEXT: + return "C_HIDEXT (Un-named external symbol)"; + case XCOFF::StorageClass::C_INFO: + return "C_INFO (Comment string in .info section)"; + case XCOFF::StorageClass::C_DECL: + return "C_DECL (Declaration of object)"; + case XCOFF::StorageClass::C_AUTO: + return "C_AUTO (Automatic variable)"; + case XCOFF::StorageClass::C_REG: + return "C_REG (Register variable)"; + case XCOFF::StorageClass::C_EXTDEF: + return "C_EXTDEF (External definition)"; + case XCOFF::StorageClass::C_LABEL: + return "C_LABEL (Label)"; + case XCOFF::StorageClass::C_ULABEL: + return "C_ULABEL (Undefined label)"; + case XCOFF::StorageClass::C_MOS: + return "C_MOS (Member of structure)"; + case XCOFF::StorageClass::C_ARG: + return "C_ARG (Function argument)"; + case XCOFF::StorageClass::C_STRTAG: + return "C_STRTAG (Structure tag)"; + case XCOFF::StorageClass::C_MOU: + return "C_MOU (Member of union)"; + case XCOFF::StorageClass::C_UNTAG: + return "C_UNTAG (Union tag)"; + case XCOFF::StorageClass::C_TPDEF: + return "C_TPDEF (Type definition)"; + case XCOFF::StorageClass::C_USTATIC: + return "C_USTATIC (Undefined static)"; + case XCOFF::StorageClass::C_ENTAG: + return "C_ENTAG (Enumeration tag)"; + case XCOFF::StorageClass::C_MOE: + return "C_MOE (Member of enumeration)"; + case XCOFF::StorageClass::C_REGPARM: + return "C_REGPARM (Register parameter)"; + case XCOFF::StorageClass::C_FIELD: + return "C_FIELD (Bit field)"; + case XCOFF::StorageClass::C_EOS: + return "C_EOS (End of structure)"; + case XCOFF::StorageClass::C_LINE: + return "C_LINE"; + case XCOFF::StorageClass::C_ALIAS: + return "C_ALIAS (Duplicate tag)"; + case XCOFF::StorageClass::C_HIDDEN: + return "C_HIDDEN (Special storage class for external)"; + case XCOFF::StorageClass::C_EFCN: + return "C_EFCN (Physical end of function)"; + case XCOFF::StorageClass::C_TCSYM: + return "C_TCSYM (Reserved)"; + } +} + +Error XCOFFLinkGraphBuilder::processSections() { + LLVM_DEBUG(dbgs() << " Creating graph sections...\n"); + + UndefSection = &G->createSection("*UND*", orc::MemProt::None); + + for (object::SectionRef Section : Obj.sections()) { + auto SectionName = Section.getName(); + if (!SectionName) + return SectionName.takeError(); + + LLVM_DEBUG({ + dbgs() << " section = " << *SectionName + << ", idx = " << Section.getIndex() + << ", size = " << 
format_hex_no_prefix(Section.getSize(), 8)
+             << ", vma = " << format_hex(Section.getAddress(), 16) << "\n";
+    });
+
+    // We can skip debug (including DWARF) and pad sections
+    if (Section.isDebugSection() || *SectionName == "pad")
+      continue;
+    LLVM_DEBUG(dbgs() << "    creating graph section\n");
+
+    orc::MemProt Prot = orc::MemProt::Read;
+    if (Section.isText())
+      Prot |= orc::MemProt::Exec;
+    if (Section.isData() || Section.isBSS())
+      Prot |= orc::MemProt::Write;
+
+    jitlink::Section *GraphSec = &G->createSection(*SectionName, Prot);
+    // TODO: Check for no_alloc for certain sections
+
+    assert(!SectionTable.contains(Section.getIndex()) &&
+           "Section with same index already exists");
+    SectionTable[Section.getIndex()] = {GraphSec, Section};
+  }
+
+  return Error::success();
+}
+
+static std::optional
+getXCOFFSymbolContainingSymbolRef(const object::XCOFFObjectFile &Obj,
+                                  const object::SymbolRef &Sym) {
+  const object::XCOFFSymbolRef SymRef =
+      Obj.toSymbolRef(Sym.getRawDataRefImpl());
+  if (!SymRef.isCsectSymbol())
+    return std::nullopt;
+
+  Expected CsectAuxEntOrErr =
+      SymRef.getXCOFFCsectAuxRef();
+  if (!CsectAuxEntOrErr || !CsectAuxEntOrErr.get().isLabel())
+    return std::nullopt;
+  uint32_t Idx =
+      static_cast(CsectAuxEntOrErr.get().getSectionOrLength());
+  object::DataRefImpl DRI;
+  DRI.p = Obj.getSymbolByIndex(Idx);
+  return object::XCOFFSymbolRef(DRI, &Obj);
+}
+
+static void printSymbolEntry(raw_ostream &OS,
+                             const object::XCOFFObjectFile &Obj,
+                             const object::XCOFFSymbolRef &Sym) {
+  OS << "  " << format_hex(cantFail(Sym.getAddress()), 16);
+  OS << " " << left_justify(cantFail(Sym.getName()), 10);
+  if (Sym.isCsectSymbol()) {
+    auto CsectAuxEntry = cantFail(Sym.getXCOFFCsectAuxRef());
+    if (!CsectAuxEntry.isLabel()) {
+      std::string MCStr =
+          "[" +
+          XCOFF::getMappingClassString(CsectAuxEntry.getStorageMappingClass())
+              .str() +
+          "]";
+      OS << left_justify(MCStr, 3);
+    }
+  }
+  OS << " " << format_hex(Sym.getSize(), 8);
+  OS << " " << Sym.getSectionNumber();
+  OS << " " << getStorageClassString(Sym.getStorageClass());
+  OS << " (idx: " << Obj.getSymbolIndex(Sym.getRawDataRefImpl().p) << ")";
+  if (Sym.isCsectSymbol()) {
+    if (auto ParentSym = getXCOFFSymbolContainingSymbolRef(Obj, Sym)) {
+      OS << " (csect idx: "
+         << Obj.getSymbolIndex(ParentSym->getRawDataRefImpl().p) << ")";
+    }
+  }
+  OS << "\n";
+}
+
+Error XCOFFLinkGraphBuilder::processCsectsAndSymbols() {
+  LLVM_DEBUG(dbgs() << "  Creating graph blocks and symbols...\n");
+
+  for (auto [K, V] : SectionTable) {
+    LLVM_DEBUG(dbgs() << "    section entry(idx: " << K
+                      << " section: " << V.Section->getName() << ")\n");
+  }
+
+  for (object::XCOFFSymbolRef Symbol : Obj.symbols()) {
+    LLVM_DEBUG({ printSymbolEntry(dbgs(), Obj, Symbol); });
+
+    auto Flags = Symbol.getFlags();
+    if (!Flags)
+      return Flags.takeError();
+
+    bool External = *Flags & object::SymbolRef::SF_Undefined;
+    bool Weak = *Flags & object::SymbolRef::SF_Weak;
+    bool Global = *Flags & object::SymbolRef::SF_Global;
+
+    auto SymbolIndex = Obj.getSymbolIndex(Symbol.getEntryAddress());
+    auto SymbolName = Symbol.getName();
+    if (!SymbolName)
+      return SymbolName.takeError();
+
+    if (External) {
+      LLVM_DEBUG(dbgs() << "      created external symbol\n");
+      SymbolIndexTable[SymbolIndex] =
+          &G->addExternalSymbol(*SymbolName, Symbol.getSize(), Weak);
+      continue;
+    }
+
+    if (!Symbol.isCsectSymbol()) {
+      LLVM_DEBUG(dbgs() << "      skipped: not a csect symbol\n");
+      continue;
+    }
+
+    auto ParentSym = getXCOFFSymbolContainingSymbolRef(Obj, Symbol);
+    object::XCOFFSymbolRef CsectSymbol =
ParentSym ? *ParentSym : Symbol; + + auto CsectSymbolIndex = Obj.getSymbolIndex(CsectSymbol.getEntryAddress()); + auto ParentSectionNumber = CsectSymbol.getSectionNumber(); + + bool IsUndefinedSection = !SectionTable.contains(ParentSectionNumber); + Section *ParentSection = !IsUndefinedSection + ? SectionTable[ParentSectionNumber].Section + : UndefSection; + Block *B = nullptr; + + // TODO: Clean up the logic for handling undefined symbols + if (!CsectTable.contains(CsectSymbolIndex) && !IsUndefinedSection) { + object::SectionRef &SectionRef = + SectionTable[ParentSectionNumber].SectionData; + auto Data = SectionRef.getContents(); + if (!Data) + return Data.takeError(); + auto CsectSymbolAddr = CsectSymbol.getAddress(); + if (!CsectSymbolAddr) + return CsectSymbolAddr.takeError(); + + ArrayRef SectionBuffer{Data->data(), Data->size()}; + auto Offset = *CsectSymbolAddr - SectionRef.getAddress(); + + LLVM_DEBUG(dbgs() << " symbol entry: offset = " << Offset + << ", size = " << CsectSymbol.getSize() + << ", storage class = " + << getStorageClassString(CsectSymbol.getStorageClass()) + << "\n"); + + B = &G->createContentBlock( + *ParentSection, SectionBuffer.slice(Offset, CsectSymbol.getSize()), + orc::ExecutorAddr(*CsectSymbolAddr), CsectSymbol.getAlignment(), 0); + + CsectTable[CsectSymbolIndex] = B; + } else { + B = CsectTable[CsectSymbolIndex]; + } + + Scope S{Scope::Local}; + if (Symbol.getSymbolType() & XCOFF::SYM_V_HIDDEN || + Symbol.getSymbolType() & XCOFF::SYM_V_INTERNAL) + S = Scope::Hidden; + else if (Global) + S = Scope::Default; + // TODO: map all symbols for c++ static initialization to SideEffectOnly + + Linkage L = Weak ? Linkage::Weak : Linkage::Strong; + auto SymbolAddr = Symbol.getAddress(); + if (!SymbolAddr) + return SymbolAddr.takeError(); + auto IsCallableOrErr = Symbol.isFunction(); + if (!IsCallableOrErr) + return IsCallableOrErr.takeError(); + + auto BlockOffset = *SymbolAddr - B->getAddress().getValue(); + + LLVM_DEBUG(dbgs() << " creating with linkage = " << getLinkageName(L) + << ", scope = " << getScopeName(S) << ", B = " + << format_hex(B->getAddress().getValue(), 16) << "\n"); + + SymbolIndexTable[SymbolIndex] = + &G->addDefinedSymbol(*B, BlockOffset, *SymbolName, Symbol.getSize(), L, + S, *IsCallableOrErr, true); + } + + return Error::success(); +} + +Error XCOFFLinkGraphBuilder::processRelocations() { + LLVM_DEBUG(dbgs() << " Creating relocations...\n"); + + for (object::SectionRef Section : Obj.sections()) { + auto SectionName = Section.getName(); + if (!SectionName) + return SectionName.takeError(); + + LLVM_DEBUG(dbgs() << " Relocations for section " << *SectionName + << ":\n"); + + for (object::RelocationRef Relocation : Section.relocations()) { + SmallString<16> RelocName; + Relocation.getTypeName(RelocName); + object::SymbolRef Symbol = *Relocation.getSymbol(); + + auto TargetSymbol = Symbol.getName(); + if (!TargetSymbol) + return TargetSymbol.takeError(); + + auto SymbolIndex = Obj.getSymbolIndex(Symbol.getRawDataRefImpl().p); + + LLVM_DEBUG(dbgs() << " " << format_hex(Relocation.getOffset(), 16) + << " (idx: " << SymbolIndex << ")" + << " " << RelocName << " " << *TargetSymbol << "\n";); + + assert(SymbolIndexTable.contains(SymbolIndex) && + "Relocation needs a record in the symbol table"); + auto *S = SymbolIndexTable[SymbolIndex]; + auto It = find_if(G->blocks(), + [Target = orc::ExecutorAddr(Section.getAddress() + + Relocation.getOffset())]( + const Block *B) -> bool { + return B->getRange().contains(Target); + }); + assert(It != G->blocks().end() 
&& + "Cannot find the target relocation block"); + Block *B = *It; + + auto TargetBlockOffset = Section.getAddress() + Relocation.getOffset() - + B->getAddress().getValue(); + switch (Relocation.getType()) { + case XCOFF::R_POS: + B->addEdge(ppc64::EdgeKind_ppc64::Pointer64, TargetBlockOffset, *S, 0); + break; + default: + SmallString<16> RelocType; + Relocation.getTypeName(RelocType); + return make_error( + "Unsupported Relocation Type: " + RelocType, std::error_code()); + } + } + } + + return Error::success(); +} + +Expected> XCOFFLinkGraphBuilder::buildGraph() { + LLVM_DEBUG(dbgs() << "Building XCOFFLinkGraph...\n"); + + // FIXME: Check to make sure the object is relocatable + + if (auto Err = processSections()) + return Err; + if (auto Err = processCsectsAndSymbols()) + return Err; + if (auto Err = processRelocations()) + return Err; + + return std::move(G); +} + +} // namespace jitlink +} // namespace llvm diff --git a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.h new file mode 100644 index 0000000000000..c6481170637c2 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.h @@ -0,0 +1,63 @@ +//===----- XCOFFLinkGraphBuilder.h - XCOFF LinkGraph builder ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic XCOFF LinkGraph building code. +// +//===----------------------------------------------------------------------===// + +#ifndef LIB_EXECUTIONENGINE_JITLINK_XCOFFLINKGRAPHBUILDER_H +#define LIB_EXECUTIONENGINE_JITLINK_XCOFFLINKGRAPHBUILDER_H + +#include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/XCOFFObjectFile.h" +#include "llvm/TargetParser/SubtargetFeature.h" +#include + +namespace llvm { +namespace jitlink { + +class XCOFFLinkGraphBuilder { +public: + virtual ~XCOFFLinkGraphBuilder() = default; + Expected> buildGraph(); + +public: + XCOFFLinkGraphBuilder(const object::XCOFFObjectFile &Obj, + std::shared_ptr SSP, Triple TT, + SubtargetFeatures Features, + LinkGraph::GetEdgeKindNameFunction GetEdgeKindName); + LinkGraph &getGraph() const { return *G; } + const object::XCOFFObjectFile &getObject() const { return Obj; } + +private: + Error processSections(); + Error processCsectsAndSymbols(); + Error processRelocations(); + +private: + const object::XCOFFObjectFile &Obj; + std::unique_ptr G; + + Section *UndefSection; + + struct SectionEntry { + jitlink::Section *Section; + object::SectionRef SectionData; + }; + + DenseMap SectionTable; + DenseMap CsectTable; + DenseMap SymbolIndexTable; +}; + +} // namespace jitlink +} // namespace llvm + +#endif // LIB_EXECUTIONENGINE_JITLINK_XCOFFLINKGRAPHBUILDER_H diff --git a/llvm/lib/ExecutionEngine/JITLink/XCOFF_ppc64.cpp b/llvm/lib/ExecutionEngine/JITLink/XCOFF_ppc64.cpp new file mode 100644 index 0000000000000..fd6b5f61749b5 --- /dev/null +++ b/llvm/lib/ExecutionEngine/JITLink/XCOFF_ppc64.cpp @@ -0,0 +1,121 @@ +//===------- XCOFF_ppc64.cpp -JIT linker implementation for XCOFF/ppc64 +//-------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// XCOFF/ppc64 jit-link implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITLink/XCOFF_ppc64.h"
+#include "JITLinkGeneric.h"
+#include "XCOFFLinkGraphBuilder.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/ExecutionEngine/JITLink/ppc64.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/XCOFFObjectFile.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "jitlink"
+
+namespace llvm {
+namespace jitlink {
+
+Expected> createLinkGraphFromXCOFFObject_ppc64(
+    MemoryBufferRef ObjectBuffer, std::shared_ptr SSP) {
+  LLVM_DEBUG({
+    dbgs() << "Building jitlink graph for new input "
+           << ObjectBuffer.getBufferIdentifier() << "...\n";
+  });
+
+  auto Obj = object::ObjectFile::createObjectFile(ObjectBuffer);
+  if (!Obj)
+    return Obj.takeError();
+  assert((**Obj).isXCOFF() && "Expects an XCOFF object");
+
+  auto Features = (*Obj)->getFeatures();
+  if (!Features)
+    return Features.takeError();
+  LLVM_DEBUG({
+    dbgs() << "  Features: ";
+    (*Features).print(dbgs());
+  });
+
+  return XCOFFLinkGraphBuilder(cast(**Obj),
+                               std::move(SSP), Triple("powerpc64-ibm-aix"),
+                               std::move(*Features), ppc64::getEdgeKindName)
+      .buildGraph();
+}
+
+class XCOFFJITLinker_ppc64 : public JITLinker {
+  using JITLinkerBase = JITLinker;
+  friend JITLinkerBase;
+
+public:
+  XCOFFJITLinker_ppc64(std::unique_ptr Ctx,
+                       std::unique_ptr G,
+                       PassConfiguration PassConfig)
+      : JITLinkerBase(std::move(Ctx), std::move(G), std::move(PassConfig)) {
+    // FIXME: A post-allocation pass should define the TOC base. Doing it here
+    // is a temporary workaround until we can build the required TOC entries.
+    defineTOCSymbol(getGraph());
+  }
+
+  Error applyFixup(LinkGraph &G, Block &B, const Edge &E) const {
+    LLVM_DEBUG(dbgs() << "  Applying fixup for " << G.getName()
+                      << ", address = " << B.getAddress()
+                      << ", target = " << E.getTarget().getName() << ", kind = "
+                      << ppc64::getEdgeKindName(E.getKind()) << "\n");
+    switch (E.getKind()) {
+    case ppc64::Pointer64:
+      if (auto Err = ppc64::applyFixup(G, B, E, TOCSymbol))
+        return Err;
+      break;
+    default:
+      return make_error("Unsupported relocation type",
+                        std::error_code());
+    }
+    return Error::success();
+  }
+
+private:
+  void defineTOCSymbol(LinkGraph &G) {
+    for (Symbol *S : G.defined_symbols()) {
+      if (S->hasName() && *S->getName() == StringRef("TOC")) {
+        TOCSymbol = S;
+        return;
+      }
+    }
+    llvm_unreachable("LinkGraph does not contain a TOC symbol");
+  }
+
+private:
+  Symbol *TOCSymbol = nullptr;
+};
+
+void link_XCOFF_ppc64(std::unique_ptr G,
+                      std::unique_ptr Ctx) {
+  PassConfiguration Config;
+
+  // Pass insertions
+
+  if (auto Err = Ctx->modifyPassConfig(*G, Config))
+    return Ctx->notifyFailed(std::move(Err));
+
+  XCOFFJITLinker_ppc64::link(std::move(Ctx), std::move(G), std::move(Config));
+}
+
+} // namespace jitlink
+} // namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/LoadLinkableFile.cpp b/llvm/lib/ExecutionEngine/Orc/LoadLinkableFile.cpp
index 77ae7c7ca2e0e..4f01c01da4b9f 100644
--- a/llvm/lib/ExecutionEngine/Orc/LoadLinkableFile.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LoadLinkableFile.cpp
@@ -25,6 +25,13 @@ checkCOFFRelocatableObject(std::unique_ptr Obj,
   return std::move(Obj);
 }
 
+static Expected>
+checkXCOFFRelocatableObject(std::unique_ptr Obj,
+                            const Triple &TT) {
+  // TODO: Actually check the architecture of the file.
+  return std::move(Obj);
+}
+
 static Expected>
 checkELFRelocatableObject(std::unique_ptr Obj, const Triple &TT) {
   // TODO: Actually check the architecture of the file.
@@ -105,6 +112,15 @@ loadLinkableFile(StringRef Path, const Triple &TT, LoadArchives LA,
       return loadLinkableSliceFromMachOUniversalBinary(
           FD, std::move(*Buf), TT, LA, Path, *IdentifierOverride);
     break;
+  case file_magic::xcoff_object_64:
+    if (!RequireFormat || *RequireFormat == Triple::XCOFF) {
+      auto CheckedBuf = checkXCOFFRelocatableObject(std::move(*Buf), TT);
+      if (!CheckedBuf)
+        return CheckedBuf.takeError();
+      return std::make_pair(std::move(*CheckedBuf),
+                            LinkableFileKind::RelocatableObject);
+    }
+    break;
   default:
     break;
   }
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp
index 0b2cafb0bff13..b9cee98d45d77 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectFileInterface.cpp
@@ -7,11 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
 #include "llvm/ExecutionEngine/Orc/Shared/ObjectFormats.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/MachO.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/XCOFFObjectFile.h"
 #include
 
 #define DEBUG_TYPE "orc"
@@ -227,6 +229,52 @@ getCOFFObjectFileSymbolInfo(ExecutionSession &ES,
   return I;
 }
 
+Expected
+getXCOFFObjectFileSymbolInfo(ExecutionSession &ES,
+                             const object::ObjectFile &Obj) {
+
+  MaterializationUnit::Interface I;
+
+  for (auto &Sym : Obj.symbols()) {
+    Expected SymFlagsOrErr = Sym.getFlags();
+    if (!SymFlagsOrErr)
+      return SymFlagsOrErr.takeError();
+    uint32_t Flags = *SymFlagsOrErr;
+
+    // Skip undefined, non-global, and ST_File symbols.
+    if (Flags & object::SymbolRef::SF_Undefined)
+      continue;
+    if (!(Flags & object::SymbolRef::SF_Global))
+      continue;
+
+    auto SymbolType = Sym.getType();
+    if (!SymbolType)
+      return SymbolType.takeError();
+
+    if (*SymbolType == object::SymbolRef::ST_File)
+      continue;
+
+    auto Name = Sym.getName();
+    if (!Name)
+      return Name.takeError();
+    auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym);
+    if (!SymFlags)
+      return SymFlags.takeError();
+
+    // TODO: Revisit symbol visibility.
+    // On AIX, C_EXT and C_WEAKEXT symbols with no specified visibility are
+    // considered to have Default scope for the LinkGraph. When the object is
+    // not a DSO, symbol visibility is not specified. In the absence of an
+    // export list, it is reasonable to roughly mimic the behaviour of
+    // -bexpall or CreateExportList.
+    *SymFlags |= JITSymbolFlags::Exported;
+
+    I.SymbolFlags[ES.intern(std::move(*Name))] = std::move(*SymFlags);
+  }
+  // TODO: Find all initialization symbols for C++ static initializers.
+  return I;
+}
+
 Expected<MaterializationUnit::Interface>
 getGenericObjectFileSymbolInfo(ExecutionSession &ES,
                                const object::ObjectFile &Obj) {
@@ -280,6 +328,8 @@ getObjectFileInterface(ExecutionSession &ES, MemoryBufferRef ObjBuffer) {
     return getELFObjectFileSymbolInfo(ES, *ELFObj);
   else if (auto *COFFObj = dyn_cast<object::COFFObjectFile>(Obj->get()))
     return getCOFFObjectFileSymbolInfo(ES, *COFFObj);
+  else if (auto *XCOFFObj = dyn_cast<object::XCOFFObjectFile>(Obj->get()))
+    return getXCOFFObjectFileSymbolInfo(ES, *XCOFFObj);
 
   return getGenericObjectFileSymbolInfo(ES, **Obj);
 }
diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp
index df3b2a091aec2..5a246438e2c0e 100644
--- a/llvm/lib/Object/XCOFFObjectFile.cpp
+++ b/llvm/lib/Object/XCOFFObjectFile.cpp
@@ -429,9 +429,13 @@ XCOFFObjectFile::getSectionContents(DataRefImpl Sec) const {
 }
 
 uint64_t XCOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const {
-  uint64_t Result = 0;
-  llvm_unreachable("Not yet implemented!");
-  return Result;
+  // TODO: Logic copied from MC/XCOFFObjectWriter.cpp.
+  // Sections other than DWARF sections use DefaultSectionAlign as the
+  // default alignment, while DWARF sections have their own alignments,
+  // which are larger than DefaultSectionAlign.
+  if (isDebugSection(Sec))
+    return 8;
+  return 4;
 }
 
 uint64_t XCOFFObjectFile::getSectionFileOffsetToRawData(DataRefImpl Sec) const {
diff --git a/llvm/test/ExecutionEngine/JITLink/ppc64/XCOFF_ppc64.ll b/llvm/test/ExecutionEngine/JITLink/ppc64/XCOFF_ppc64.ll
new file mode 100644
index 0000000000000..659b0a8959e73
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/ppc64/XCOFF_ppc64.ll
@@ -0,0 +1,24 @@
+; llvm-mc does not yet have enough support for AIX directives like .csect,
+; so we can't use that tool here. llvm-jitlink -check is not available as it
+; requires an implementation of registerXCOFFGraphInfo. We will revisit this
+; test case once support is more complete.
+
+; RUN: mkdir -p %t
+; RUN: llc --filetype=obj -o %t/xcoff_ppc64.o %s
+; RUN: llvm-jitlink -noexec -num-threads=0 -triple=powerpc64-ibm-aix %t/xcoff_ppc64.o
+
+target datalayout = "E-m:a-Fi64-i64:64-i128:128-n32:64-S128-v256:256:256-v512:512:512"
+target triple = "powerpc64-ibm-aix"
+
+define i32 @main() #0 {
+entry:
+  ret i32 0
+}
+
+attributes #0 = { "target-cpu"="pwr7" }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+

From 4f902d2425e59bd182390702de23d5cec3467fc2 Mon Sep 17 00:00:00 2001
From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com>
Date: Thu, 3 Apr 2025 14:02:27 -0700
Subject: [PATCH 0580/1029] [llvm-dwarfdump] Make --verify for .debug_names
 multithreaded. (#127281)

This PR makes verification of the .debug_names acceleration table
multithreaded. In local testing it improves verification of clang's
.debug_names from four minutes to under a minute. This PR relies on the
current mechanism of extracting DIEs into a vector. Future improvements
can include creating an API to extract one DIE at a time, or grouping
Entries into buckets by CU and extracting them before the parallel step.
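For context, the parallel step keeps the usual LLVM sharding shape:
independent entries are handed to parallelForEach, and only the shared
error bookkeeping is serialized (in this patch, behind a mutex inside
OutputCategoryAggregator). A minimal sketch of that pattern only; the
names below are illustrative stand-ins, not the verifier code itself:

```cpp
#include "llvm/Support/Parallel.h"
#include <cstdint>
#include <mutex>
#include <vector>

struct ErrorAggregator {
  std::mutex M;
  uint64_t NumErrors = 0;
  void report() {
    std::lock_guard<std::mutex> Lock(M); // serialize only the bookkeeping
    ++NumErrors;
  }
};

void verifyEntries(const std::vector<uint32_t> &Entries, ErrorAggregator &A) {
  // Each entry is independent, so the loop body can run on worker threads.
  llvm::parallelForEach(Entries, [&](uint32_t E) {
    if (E == 0) // stand-in for a real per-entry verification check
      A.report();
  });
}
```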
Single Thread 4:12.37 real, 246.88 user, 3.54 sys, 0 amem,10232004 mmem Multi Thread 0:49.40 real, 612.84 user, 515.73 sys, 0 amem, 11226292 mmem --- .../DebugInfo/DWARF/DWARFAcceleratorTable.h | 11 + .../llvm/DebugInfo/DWARF/DWARFVerifier.h | 43 ++- llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 356 ++++++++++-------- .../X86/debug-names-verify-completeness.s | 1 - llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp | 18 +- 5 files changed, 261 insertions(+), 168 deletions(-) diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h index 9d7ac12cefdc8..cef5fa1f2ee53 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h @@ -770,6 +770,7 @@ class DWARFDebugNames : public DWARFAcceleratorTable { } public: + using size_type = size_t; using iterator_category = std::input_iterator_tag; using value_type = NameTableEntry; using difference_type = uint32_t; @@ -793,6 +794,16 @@ class DWARFDebugNames : public DWARFAcceleratorTable { next(); return I; } + /// Accesses entry at specific index (1-based internally, 0-based + /// externally). For example how this is used in parallelForEach. + reference operator[](size_type idx) { + return CurrentIndex->getNameTableEntry(idx + 1); + } + /// Computes difference between iterators (used in parallelForEach). + difference_type operator-(const NameIterator &other) const { + assert(CurrentIndex == other.CurrentIndex); + return this->CurrentName - other.CurrentName; + } friend bool operator==(const NameIterator &A, const NameIterator &B) { return A.CurrentIndex == B.CurrentIndex && A.CurrentName == B.CurrentName; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h index 595215ba35dd5..717f9cedc4ee3 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFVerifier.h @@ -9,6 +9,7 @@ #ifndef LLVM_DEBUGINFO_DWARF_DWARFVERIFIER_H #define LLVM_DEBUGINFO_DWARF_DWARFVERIFIER_H +#include "llvm/ADT/StringMap.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" #include "llvm/DebugInfo/DWARF/DWARFAddressRange.h" @@ -16,6 +17,7 @@ #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h" #include #include +#include #include namespace llvm { @@ -37,7 +39,9 @@ struct AggregationData { class OutputCategoryAggregator { private: + std::mutex WriteMutex; std::map> Aggregation; + uint64_t NumErrors = 0; bool IncludeDetail; public: @@ -52,6 +56,8 @@ class OutputCategoryAggregator { void EnumerateDetailedResultsFor( StringRef category, std::function handleCounts); + /// Return the number of errors that have been reported. + uint64_t GetNumErrors() const { return NumErrors; } }; /// A class that verifies DWARF debug information given a DWARF Context. 
@@ -114,6 +120,7 @@ class DWARFVerifier { bool IsObjectFile; bool IsMachOObject; using ReferenceMap = std::map>; + std::mutex AccessMutex; raw_ostream &error() const; raw_ostream &warn() const; @@ -274,21 +281,23 @@ class DWARFVerifier { /// \param SectionName the name of the table we're verifying /// /// \returns The number of errors occurred during verification - unsigned verifyAppleAccelTable(const DWARFSection *AccelSection, - DataExtractor *StrData, - const char *SectionName); - - unsigned verifyDebugNamesCULists(const DWARFDebugNames &AccelTable); - unsigned verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, - const DataExtractor &StrData); - unsigned verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI); - unsigned verifyNameIndexAttribute(const DWARFDebugNames::NameIndex &NI, - const DWARFDebugNames::Abbrev &Abbr, - DWARFDebugNames::AttributeEncoding AttrEnc); - unsigned verifyNameIndexEntries(const DWARFDebugNames::NameIndex &NI, - const DWARFDebugNames::NameTableEntry &NTE); - unsigned verifyNameIndexCompleteness(const DWARFDie &Die, - const DWARFDebugNames::NameIndex &NI); + void verifyAppleAccelTable(const DWARFSection *AccelSection, + DataExtractor *StrData, const char *SectionName); + + void verifyDebugNamesCULists(const DWARFDebugNames &AccelTable); + void verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, + const DataExtractor &StrData); + void verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI); + void verifyNameIndexAttribute(const DWARFDebugNames::NameIndex &NI, + const DWARFDebugNames::Abbrev &Abbr, + DWARFDebugNames::AttributeEncoding AttrEnc); + void verifyNameIndexEntries( + const DWARFDebugNames::NameIndex &NI, + const DWARFDebugNames::NameTableEntry &NTE, + const DenseMap &CUOffsetsToDUMap); + void verifyNameIndexCompleteness( + const DWARFDie &Die, const DWARFDebugNames::NameIndex &NI, + const StringMap> &NamesToDieOffsets); /// Verify that the DWARF v5 accelerator table is valid. 
/// @@ -307,8 +316,8 @@ class DWARFVerifier { /// \param StrData string section /// /// \returns The number of errors occurred during verification - unsigned verifyDebugNames(const DWARFSection &AccelSection, - const DataExtractor &StrData); + void verifyDebugNames(const DWARFSection &AccelSection, + const DataExtractor &StrData); public: DWARFVerifier(raw_ostream &S, DWARFContext &D, diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 69027500ab51d..690a2e5fee9de 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -33,6 +33,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/Parallel.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include @@ -1106,10 +1107,9 @@ bool DWARFVerifier::handleDebugLine() { return NumDebugLineErrors == 0; } -unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, - DataExtractor *StrData, - const char *SectionName) { - unsigned NumErrors = 0; +void DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, + DataExtractor *StrData, + const char *SectionName) { DWARFDataExtractor AccelSectionData(DCtx.getDWARFObj(), *AccelSection, DCtx.isLittleEndian(), 0); AppleAcceleratorTable AccelTable(AccelSectionData, *StrData); @@ -1121,7 +1121,7 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, ErrorCategory.Report("Section is too small to fit a section header", [&]() { error() << "Section is too small to fit a section header.\n"; }); - return 1; + return; } // Verify that the section is not too short. @@ -1129,7 +1129,7 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, std::string Msg = toString(std::move(E)); ErrorCategory.Report("Section is too small to fit a section header", [&]() { error() << Msg << '\n'; }); - return 1; + return; } // Verify that all buckets have a valid hash index or are empty. 
@@ -1147,7 +1147,6 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, error() << format("Bucket[%d] has invalid hash index: %u.\n", BucketIdx, HashIdx); }); - ++NumErrors; } } uint32_t NumAtoms = AccelTable.getAtomsDesc().size(); @@ -1155,13 +1154,13 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, ErrorCategory.Report("No atoms", [&]() { error() << "No atoms: failed to read HashData.\n"; }); - return 1; + return; } if (!AccelTable.validateForms()) { ErrorCategory.Report("Unsupported form", [&]() { error() << "Unsupported form: failed to read HashData.\n"; }); - return 1; + return; } for (uint32_t HashIdx = 0; HashIdx < NumHashes; ++HashIdx) { @@ -1176,7 +1175,6 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, "0x%08" PRIx64 ".\n", HashIdx, HashDataOffset); }); - ++NumErrors; } uint64_t StrpOffset; @@ -1207,8 +1205,6 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, SectionName, BucketIdx, HashIdx, Hash, StringCount, StrpOffset, HashDataIdx, Offset, Name); }); - - ++NumErrors; continue; } if ((Tag != dwarf::DW_TAG_null) && (Die.getTag() != Tag)) { @@ -1218,74 +1214,70 @@ unsigned DWARFVerifier::verifyAppleAccelTable(const DWARFSection *AccelSection, << dwarf::TagString(Die.getTag()) << " of DIE[" << HashDataIdx << "].\n"; }); - ++NumErrors; } } - ++StringCount; } } - return NumErrors; } -unsigned -DWARFVerifier::verifyDebugNamesCULists(const DWARFDebugNames &AccelTable) { +void DWARFVerifier::verifyDebugNamesCULists(const DWARFDebugNames &AccelTable) { // A map from CU offset to the (first) Name Index offset which claims to index // this CU. DenseMap CUMap; - const uint64_t NotIndexed = std::numeric_limits::max(); - CUMap.reserve(DCtx.getNumCompileUnits()); + + DenseSet CUOffsets; for (const auto &CU : DCtx.compile_units()) - CUMap[CU->getOffset()] = NotIndexed; + CUOffsets.insert(CU->getOffset()); - unsigned NumErrors = 0; - for (const DWARFDebugNames::NameIndex &NI : AccelTable) { + parallelForEach(AccelTable, [&](const DWARFDebugNames::NameIndex &NI) { if (NI.getCUCount() == 0) { ErrorCategory.Report("Name Index doesn't index any CU", [&]() { error() << formatv("Name Index @ {0:x} does not index any CU\n", NI.getUnitOffset()); }); - ++NumErrors; - continue; + return; } for (uint32_t CU = 0, End = NI.getCUCount(); CU < End; ++CU) { uint64_t Offset = NI.getCUOffset(CU); - auto Iter = CUMap.find(Offset); - - if (Iter == CUMap.end()) { + if (!CUOffsets.count(Offset)) { ErrorCategory.Report("Name Index references non-existing CU", [&]() { error() << formatv( "Name Index @ {0:x} references a non-existing CU @ {1:x}\n", NI.getUnitOffset(), Offset); }); - ++NumErrors; continue; } - - if (Iter->second != NotIndexed) { + uint64_t DuplicateCUOffset = 0; + { + std::lock_guard Lock(AccessMutex); + auto Iter = CUMap.find(Offset); + if (Iter != CUMap.end()) + DuplicateCUOffset = Iter->second; + else + CUMap[Offset] = NI.getUnitOffset(); + } + if (DuplicateCUOffset) { ErrorCategory.Report("Duplicate Name Index", [&]() { error() << formatv( "Name Index @ {0:x} references a CU @ {1:x}, but " "this CU is already indexed by Name Index @ {2:x}\n", - NI.getUnitOffset(), Offset, Iter->second); + NI.getUnitOffset(), Offset, DuplicateCUOffset); }); continue; } - Iter->second = NI.getUnitOffset(); } - } + }); - for (const auto &KV : CUMap) { - if (KV.second == NotIndexed) - warn() << formatv("CU @ {0:x} not covered by any Name Index\n", KV.first); + for (const auto &CU : 
DCtx.compile_units()) { + if (CUMap.count(CU->getOffset()) == 0) + warn() << formatv("CU @ {0:x} not covered by any Name Index\n", + CU->getOffset()); } - - return NumErrors; } -unsigned -DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, - const DataExtractor &StrData) { +void DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, + const DataExtractor &StrData) { struct BucketInfo { uint32_t Bucket; uint32_t Index; @@ -1295,17 +1287,17 @@ DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, bool operator<(const BucketInfo &RHS) const { return Index < RHS.Index; } }; - uint32_t NumErrors = 0; if (NI.getBucketCount() == 0) { warn() << formatv("Name Index @ {0:x} does not contain a hash table.\n", NI.getUnitOffset()); - return NumErrors; + return; } // Build up a list of (Bucket, Index) pairs. We use this later to verify that // each Name is reachable from the appropriate bucket. std::vector BucketStarts; BucketStarts.reserve(NI.getBucketCount() + 1); + const uint64_t OrigNumberOfErrors = ErrorCategory.GetNumErrors(); for (uint32_t Bucket = 0, End = NI.getBucketCount(); Bucket < End; ++Bucket) { uint32_t Index = NI.getBucketArrayEntry(Bucket); if (Index > NI.getNameCount()) { @@ -1315,7 +1307,6 @@ DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, Bucket, NI.getUnitOffset(), Index, NI.getNameCount()); }); - ++NumErrors; continue; } if (Index > 0) @@ -1325,8 +1316,8 @@ DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, // If there were any buckets with invalid values, skip further checks as they // will likely produce many errors which will only confuse the actual root // problem. - if (NumErrors > 0) - return NumErrors; + if (OrigNumberOfErrors != ErrorCategory.GetNumErrors()) + return; // Sort the list in the order of increasing "Index" entries. 
array_pod_sort(BucketStarts.begin(), BucketStarts.end()); @@ -1352,7 +1343,6 @@ DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, "are not covered by the hash table.\n", NI.getUnitOffset(), NextUncovered, B.Index - 1); }); - ++NumErrors; } uint32_t Idx = B.Index; @@ -1374,7 +1364,6 @@ DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, NI.getUnitOffset(), B.Bucket, FirstHash, FirstHash % NI.getBucketCount()); }); - ++NumErrors; } // This find the end of this bucket and also verifies that all the hashes in @@ -1395,17 +1384,15 @@ DWARFVerifier::verifyNameIndexBuckets(const DWARFDebugNames::NameIndex &NI, "the Name Index hash is {4:x}\n", NI.getUnitOffset(), Str, Idx, caseFoldingDjbHash(Str), Hash); }); - ++NumErrors; } - ++Idx; } NextUncovered = std::max(NextUncovered, Idx); } - return NumErrors; + return; } -unsigned DWARFVerifier::verifyNameIndexAttribute( +void DWARFVerifier::verifyNameIndexAttribute( const DWARFDebugNames::NameIndex &NI, const DWARFDebugNames::Abbrev &Abbr, DWARFDebugNames::AttributeEncoding AttrEnc) { StringRef FormName = dwarf::FormEncodingString(AttrEnc.Form); @@ -1416,7 +1403,7 @@ unsigned DWARFVerifier::verifyNameIndexAttribute( NI.getUnitOffset(), Abbr.Code, AttrEnc.Index, AttrEnc.Form); }); - return 1; + return; } if (AttrEnc.Index == DW_IDX_type_hash) { @@ -1427,9 +1414,9 @@ unsigned DWARFVerifier::verifyNameIndexAttribute( "uses an unexpected form {2} (should be {3}).\n", NI.getUnitOffset(), Abbr.Code, AttrEnc.Form, dwarf::DW_FORM_data8); }); - return 1; + return; } - return 0; + return; } if (AttrEnc.Index == dwarf::DW_IDX_parent) { @@ -1443,9 +1430,9 @@ unsigned DWARFVerifier::verifyNameIndexAttribute( "DW_FORM_ref4 or DW_FORM_flag_present).\n", NI.getUnitOffset(), Abbr.Code, AttrEnc.Form); }); - return 1; + return; } - return 0; + return; } // A list of known index attributes and their expected form classes. 
@@ -1470,7 +1457,7 @@ unsigned DWARFVerifier::verifyNameIndexAttribute( warn() << formatv("NameIndex @ {0:x}: Abbreviation {1:x} contains an " "unknown index attribute: {2}.\n", NI.getUnitOffset(), Abbr.Code, AttrEnc.Index); - return 0; + return; } if (!DWARFFormValue(AttrEnc.Form).isFormClass(Iter->Class)) { @@ -1480,14 +1467,13 @@ unsigned DWARFVerifier::verifyNameIndexAttribute( NI.getUnitOffset(), Abbr.Code, AttrEnc.Index, AttrEnc.Form, Iter->ClassName); }); - return 1; + return; } - return 0; + return; } -unsigned -DWARFVerifier::verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI) { - unsigned NumErrors = 0; +void DWARFVerifier::verifyNameIndexAbbrevs( + const DWARFDebugNames::NameIndex &NI) { for (const auto &Abbrev : NI.getAbbrevs()) { StringRef TagName = dwarf::TagString(Abbrev.Tag); if (TagName.empty()) { @@ -1505,10 +1491,9 @@ DWARFVerifier::verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI) { "multiple {2} attributes.\n", NI.getUnitOffset(), Abbrev.Code, AttrEnc.Index); }); - ++NumErrors; continue; } - NumErrors += verifyNameIndexAttribute(NI, Abbrev, AttrEnc); + verifyNameIndexAttribute(NI, Abbrev, AttrEnc); } if (NI.getCUCount() > 1 && !Attributes.count(dwarf::DW_IDX_compile_unit) && @@ -1519,7 +1504,6 @@ DWARFVerifier::verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI) { "or DW_IDX_type_unit attribute.\n", NI.getUnitOffset(), Abbrev.Code); }); - ++NumErrors; } if (!Attributes.count(dwarf::DW_IDX_die_offset)) { ErrorCategory.Report("Abbreviate in NameIndex missing attribute", [&]() { @@ -1527,12 +1511,12 @@ DWARFVerifier::verifyNameIndexAbbrevs(const DWARFDebugNames::NameIndex &NI) { "NameIndex @ {0:x}: Abbreviation {1:x} has no {2} attribute.\n", NI.getUnitOffset(), Abbrev.Code, dwarf::DW_IDX_die_offset); }); - ++NumErrors; } } - return NumErrors; } +/// Constructs a full name for a DIE. Potentially it does recursive lookup on +/// DIEs. This can lead to extraction of DIEs in a different CU or TU. 
static SmallVector getNames(const DWARFDie &DIE, bool IncludeStrippedTemplateNames, bool IncludeObjCNames = true, @@ -1571,9 +1555,10 @@ static SmallVector getNames(const DWARFDie &DIE, return Result; } -unsigned DWARFVerifier::verifyNameIndexEntries( +void DWARFVerifier::verifyNameIndexEntries( const DWARFDebugNames::NameIndex &NI, - const DWARFDebugNames::NameTableEntry &NTE) { + const DWARFDebugNames::NameTableEntry &NTE, + const DenseMap &CUOffsetsToDUMap) { const char *CStr = NTE.getString(); if (!CStr) { ErrorCategory.Report("Unable to get string associated with name", [&]() { @@ -1581,11 +1566,9 @@ unsigned DWARFVerifier::verifyNameIndexEntries( "with name {1}.\n", NI.getUnitOffset(), NTE.getIndex()); }); - return 1; + return; } StringRef Str(CStr); - - unsigned NumErrors = 0; unsigned NumEntries = 0; uint64_t EntryID = NTE.getEntryOffset(); uint64_t NextEntryID = EntryID; @@ -1601,7 +1584,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( "invalid CU index ({2}).\n", NI.getUnitOffset(), EntryID, *CUIndex); }); - ++NumErrors; continue; } const uint32_t NumLocalTUs = NI.getLocalTUCount(); @@ -1612,7 +1594,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( "invalid TU index ({2}).\n", NI.getUnitOffset(), EntryID, *TUIndex); }); - ++NumErrors; continue; } std::optional UnitOffset; @@ -1640,7 +1621,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( "foreign TU index ({2}) with no CU index.\n", NI.getUnitOffset(), EntryID, *TUIndex); }); - ++NumErrors; continue; } } else { @@ -1668,7 +1648,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( "invalid CU or TU offset {2:x}.\n", NI.getUnitOffset(), EntryID, *UnitOffset); }); - ++NumErrors; continue; } // This function will try to get the non skeleton unit DIE, but if it is @@ -1682,9 +1661,15 @@ unsigned DWARFVerifier::verifyNameIndexEntries( // call to properly deal with it. It isn't clear that getNonSkeletonUnitDIE // will return the unit DIE of DU if we aren't able to get the .dwo file, // but that is what the function currently does. + DWARFUnit *NonSkeletonUnit = nullptr; + if (DU->getDWOId()) { + auto Iter = CUOffsetsToDUMap.find(DU->getOffset()); + NonSkeletonUnit = Iter->second; + } else { + NonSkeletonUnit = DU; + } DWARFDie UnitDie = DU->getUnitDIE(); - DWARFDie NonSkeletonUnitDie = DU->getNonSkeletonUnitDIE(); - if (DU->getDWOId() && UnitDie == NonSkeletonUnitDie) { + if (DU->getDWOId() && !NonSkeletonUnit->isDWOUnit()) { ErrorCategory.Report("Unable to get load .dwo file", [&]() { error() << formatv( "Name Index @ {0:x}: Entry @ {1:x} unable to load " @@ -1693,10 +1678,9 @@ unsigned DWARFVerifier::verifyNameIndexEntries( dwarf::toString(UnitDie.find({DW_AT_dwo_name, DW_AT_GNU_dwo_name})), *UnitOffset); }); - ++NumErrors; continue; } - DWARFUnit *NonSkeletonUnit = nullptr; + if (TUIndex && *TUIndex >= NumLocalTUs) { // We have a foreign TU index, which either means we have a .dwo file // that has one or more type units, or we have a .dwp file with one or @@ -1707,17 +1691,16 @@ unsigned DWARFVerifier::verifyNameIndexEntries( // above, so we know we have the right file. const uint32_t ForeignTUIdx = *TUIndex - NumLocalTUs; const uint64_t TypeSig = NI.getForeignTUSignature(ForeignTUIdx); - llvm::DWARFContext &SkeletonDCtx = - NonSkeletonUnitDie.getDwarfUnit()->getContext(); + llvm::DWARFContext &NonSkeletonDCtx = NonSkeletonUnit->getContext(); // Now find the type unit from the type signature and then update the // NonSkeletonUnitDie to point to the actual type unit in the .dwo/.dwp. 
NonSkeletonUnit = - SkeletonDCtx.getTypeUnitForHash(TypeSig, /*IsDWO=*/true); - NonSkeletonUnitDie = NonSkeletonUnit->getUnitDIE(true); + NonSkeletonDCtx.getTypeUnitForHash(TypeSig, /*IsDWO=*/true); // If we have foreign type unit in a DWP file, then we need to ignore // any entries from type units that don't match the one that made it into // the .dwp file. - if (SkeletonDCtx.isDWP()) { + if (NonSkeletonDCtx.isDWP()) { + DWARFDie NonSkeletonUnitDie = NonSkeletonUnit->getUnitDIE(true); StringRef DUDwoName = dwarf::toStringRef( UnitDie.find({DW_AT_dwo_name, DW_AT_GNU_dwo_name})); StringRef TUDwoName = dwarf::toStringRef( @@ -1725,8 +1708,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( if (DUDwoName != TUDwoName) continue; // Skip this TU, it isn't the one in the .dwp file. } - } else { - NonSkeletonUnit = NonSkeletonUnitDie.getDwarfUnit(); } uint64_t DIEOffset = NonSkeletonUnit->getOffset() + *EntryOr->getDIEUnitOffset(); @@ -1743,14 +1724,12 @@ unsigned DWARFVerifier::verifyNameIndexEntries( continue; } DWARFDie DIE = NonSkeletonUnit->getDIEForOffset(DIEOffset); - if (!DIE) { ErrorCategory.Report("NameIndex references nonexistent DIE", [&]() { error() << formatv("Name Index @ {0:x}: Entry @ {1:x} references a " "non-existing DIE @ {2:x}.\n", NI.getUnitOffset(), EntryID, DIEOffset); }); - ++NumErrors; continue; } // Only compare the DIE we found's DWARFUnit offset if the DIE lives in @@ -1766,7 +1745,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( NI.getUnitOffset(), EntryID, DIEOffset, *UnitOffset, DIE.getDwarfUnit()->getOffset()); }); - ++NumErrors; } if (DIE.getTag() != EntryOr->tag()) { ErrorCategory.Report("Name Index contains mismatched Tag of DIE", [&]() { @@ -1776,7 +1754,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( NI.getUnitOffset(), EntryID, DIEOffset, EntryOr->tag(), DIE.getTag()); }); - ++NumErrors; } // We allow an extra name for functions: their name without any template @@ -1792,7 +1769,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( NI.getUnitOffset(), EntryID, DIEOffset, Str, make_range(EntryNames.begin(), EntryNames.end())); }); - ++NumErrors; } } handleAllErrors( @@ -1806,7 +1782,6 @@ unsigned DWARFVerifier::verifyNameIndexEntries( "not associated with any entries.\n", NI.getUnitOffset(), NTE.getIndex(), Str); }); - ++NumErrors; }, [&](const ErrorInfoBase &Info) { ErrorCategory.Report("Uncategorized NameIndex error", [&]() { @@ -1814,9 +1789,7 @@ unsigned DWARFVerifier::verifyNameIndexEntries( NI.getUnitOffset(), NTE.getIndex(), Str, Info.message()); }); - ++NumErrors; }); - return NumErrors; } static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) { @@ -1844,8 +1817,9 @@ static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) { return false; } -unsigned DWARFVerifier::verifyNameIndexCompleteness( - const DWARFDie &Die, const DWARFDebugNames::NameIndex &NI) { +void DWARFVerifier::verifyNameIndexCompleteness( + const DWARFDie &Die, const DWARFDebugNames::NameIndex &NI, + const StringMap> &NamesToDieOffsets) { // First check, if the Die should be indexed. The code follows the DWARF v5 // wording as closely as possible. @@ -1853,7 +1827,7 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( // "All non-defining declarations (that is, debugging information entries // with a DW_AT_declaration attribute) are excluded." if (Die.find(DW_AT_declaration)) - return 0; + return; // "DW_TAG_namespace debugging information entries without a DW_AT_name // attribute are included with the name “(anonymous namespace)”. 
@@ -1871,7 +1845,7 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( auto EntryNames = getNames(Die, IncludeStrippedTemplateNames, IncludeObjCNames, IncludeLinkageName); if (EntryNames.empty()) - return 0; + return; // We deviate from the specification here, which says: // "The name index must contain an entry for each debugging information entry @@ -1882,7 +1856,7 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( // Compile units and modules have names but shouldn't be indexed. case DW_TAG_compile_unit: case DW_TAG_module: - return 0; + return; // Function and template parameters are not globally visible, so we shouldn't // index them. @@ -1891,22 +1865,22 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( case DW_TAG_template_type_parameter: case DW_TAG_GNU_template_parameter_pack: case DW_TAG_GNU_template_template_param: - return 0; + return; // Object members aren't globally visible. case DW_TAG_member: - return 0; + return; // According to a strict reading of the specification, enumerators should not // be indexed (and LLVM currently does not do that). However, this causes // problems for the debuggers, so we may need to reconsider this. case DW_TAG_enumerator: - return 0; + return; // Imported declarations should not be indexed according to the specification // and LLVM currently does not do that. case DW_TAG_imported_declaration: - return 0; + return; // "DW_TAG_subprogram, DW_TAG_inlined_subroutine, and DW_TAG_label debugging // information entries without an address attribute (DW_AT_low_pc, @@ -1917,7 +1891,7 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( if (Die.findRecursively( {DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_entry_pc})) break; - return 0; + return; // "DW_TAG_variable debugging information entries with a DW_AT_location // attribute that includes a DW_OP_addr or DW_OP_form_tls_address operator are @@ -1927,7 +1901,7 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( case DW_TAG_variable: if (isVariableIndexable(Die, DCtx)) break; - return 0; + return; default: break; @@ -1935,12 +1909,10 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( // Now we know that our Die should be present in the Index. Let's check if // that's the case. - unsigned NumErrors = 0; uint64_t DieUnitOffset = Die.getOffset() - Die.getDwarfUnit()->getOffset(); for (StringRef Name : EntryNames) { - if (none_of(NI.equal_range(Name), [&](const DWARFDebugNames::Entry &E) { - return E.getDIEUnitOffset() == DieUnitOffset; - })) { + auto iter = NamesToDieOffsets.find(Name); + if (iter == NamesToDieOffsets.end() || !iter->second.count(DieUnitOffset)) { ErrorCategory.Report( "Name Index DIE entry missing name", llvm::dwarf::TagString(Die.getTag()), [&]() { @@ -1949,15 +1921,47 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness( "name {3} missing.\n", NI.getUnitOffset(), Die.getOffset(), Die.getTag(), Name); }); - ++NumErrors; } } - return NumErrors; } -unsigned DWARFVerifier::verifyDebugNames(const DWARFSection &AccelSection, - const DataExtractor &StrData) { - unsigned NumErrors = 0; +/// Extracts all the data for CU/TUs so we can access it in parallel without +/// locks. +static void extractCUsTus(DWARFContext &DCtx) { + // Abbrev DeclSet is shared beween the units. 
+ for (auto &CUTU : DCtx.normal_units()) { + CUTU->getUnitDIE(); + CUTU->getBaseAddress(); + } + parallelForEach(DCtx.normal_units(), [&](const auto &CUTU) { + if (Error E = CUTU->tryExtractDIEsIfNeeded(false)) + DCtx.getRecoverableErrorHandler()(std::move(E)); + }); + + // Invoking getNonSkeletonUnitDIE() sets up all the base pointers for DWO + // Units. This is needed for getBaseAddress(). + for (const auto &CU : DCtx.compile_units()) { + if (!CU->getDWOId()) + continue; + DWARFContext &NonSkeletonContext = + CU->getNonSkeletonUnitDIE().getDwarfUnit()->getContext(); + // Iterates over CUs and TUs. + for (auto &CUTU : NonSkeletonContext.dwo_units()) { + CUTU->getUnitDIE(); + CUTU->getBaseAddress(); + } + parallelForEach(NonSkeletonContext.dwo_units(), [&](const auto &CUTU) { + if (Error E = CUTU->tryExtractDIEsIfNeeded(false)) + DCtx.getRecoverableErrorHandler()(std::move(E)); + }); + // If context is for DWP we only need to extract once. + if (NonSkeletonContext.isDWP()) + break; + } +} + +void DWARFVerifier::verifyDebugNames(const DWARFSection &AccelSection, + const DataExtractor &StrData) { DWARFDataExtractor AccelSectionData(DCtx.getDWARFObj(), AccelSection, DCtx.isLittleEndian(), 0); DWARFDebugNames AccelTable(AccelSectionData, StrData); @@ -1970,67 +1974,119 @@ unsigned DWARFVerifier::verifyDebugNames(const DWARFSection &AccelSection, std::string Msg = toString(std::move(E)); ErrorCategory.Report("Accelerator Table Error", [&]() { error() << Msg << '\n'; }); - return 1; + return; } - - NumErrors += verifyDebugNamesCULists(AccelTable); - for (const auto &NI : AccelTable) - NumErrors += verifyNameIndexBuckets(NI, StrData); + const uint64_t OriginalNumErrors = ErrorCategory.GetNumErrors(); + verifyDebugNamesCULists(AccelTable); for (const auto &NI : AccelTable) - NumErrors += verifyNameIndexAbbrevs(NI); + verifyNameIndexBuckets(NI, StrData); + parallelForEach(AccelTable, [&](const DWARFDebugNames::NameIndex &NI) { + verifyNameIndexAbbrevs(NI); + }); // Don't attempt Entry validation if any of the previous checks found errors - if (NumErrors > 0) - return NumErrors; - for (const auto &NI : AccelTable) - for (const DWARFDebugNames::NameTableEntry &NTE : NI) - NumErrors += verifyNameIndexEntries(NI, NTE); + if (OriginalNumErrors != ErrorCategory.GetNumErrors()) + return; + DenseMap CUOffsetsToDUMap; + for (const auto &CU : DCtx.compile_units()) { + if (!(CU->getVersion() >= 5 && CU->getDWOId())) + continue; + CUOffsetsToDUMap[CU->getOffset()] = + CU->getNonSkeletonUnitDIE().getDwarfUnit(); + } + extractCUsTus(DCtx); + for (const DWARFDebugNames::NameIndex &NI : AccelTable) { + parallelForEach(NI, [&](DWARFDebugNames::NameTableEntry NTE) { + verifyNameIndexEntries(NI, NTE, CUOffsetsToDUMap); + }); + } - for (const std::unique_ptr &U : DCtx.info_section_units()) { - if (const DWARFDebugNames::NameIndex *NI = - AccelTable.getCUOrTUNameIndex(U->getOffset())) { - DWARFCompileUnit *CU = dyn_cast(U.get()); + auto populateNameToOffset = + [&](const DWARFDebugNames::NameIndex &NI, + StringMap> &NamesToDieOffsets) { + for (const DWARFDebugNames::NameTableEntry &NTE : NI) { + const char *tName = NTE.getString(); + const std::string Name = tName ? 
std::string(tName) : ""; + uint64_t EntryID = NTE.getEntryOffset(); + Expected EntryOr = NI.getEntry(&EntryID); + auto Iter = NamesToDieOffsets.insert({Name, DenseSet(3)}); + for (; EntryOr; EntryOr = NI.getEntry(&EntryID)) { + if (std::optional DieOffset = EntryOr->getDIEUnitOffset()) + Iter.first->second.insert(*DieOffset); + } + handleAllErrors( + EntryOr.takeError(), + [&](const DWARFDebugNames::SentinelError &) { + if (!NamesToDieOffsets.empty()) + return; + ErrorCategory.Report( + "NameIndex Name is not associated with any entries", [&]() { + error() + << formatv("Name Index @ {0:x}: Name {1} ({2}) is " + "not associated with any entries.\n", + NI.getUnitOffset(), NTE.getIndex(), Name); + }); + }, + [&](const ErrorInfoBase &Info) { + ErrorCategory.Report("Uncategorized NameIndex error", [&]() { + error() << formatv( + "Name Index @ {0:x}: Name {1} ({2}): {3}\n", + NI.getUnitOffset(), NTE.getIndex(), Name, Info.message()); + }); + }); + } + }; + // NameIndex can have multiple CUs. For example if it was created by BOLT. + // So better to iterate over NI, and then over CUs in it. + for (const DWARFDebugNames::NameIndex &NI : AccelTable) { + StringMap> NamesToDieOffsets(NI.getNameCount()); + populateNameToOffset(NI, NamesToDieOffsets); + for (uint32_t i = 0, iEnd = NI.getCUCount(); i < iEnd; ++i) { + const uint64_t CUOffset = NI.getCUOffset(i); + DWARFUnit *U = DCtx.getUnitForOffset(CUOffset); + DWARFCompileUnit *CU = dyn_cast(U); if (CU) { if (CU->getDWOId()) { DWARFDie CUDie = CU->getUnitDIE(true); DWARFDie NonSkeletonUnitDie = CUDie.getDwarfUnit()->getNonSkeletonUnitDIE(false); if (CUDie != NonSkeletonUnitDie) { - for (const DWARFDebugInfoEntry &Die : - NonSkeletonUnitDie.getDwarfUnit()->dies()) - NumErrors += verifyNameIndexCompleteness( - DWARFDie(NonSkeletonUnitDie.getDwarfUnit(), &Die), *NI); + parallelForEach( + NonSkeletonUnitDie.getDwarfUnit()->dies(), + [&](const DWARFDebugInfoEntry &Die) { + verifyNameIndexCompleteness( + DWARFDie(NonSkeletonUnitDie.getDwarfUnit(), &Die), NI, + NamesToDieOffsets); + }); } } else { - for (const DWARFDebugInfoEntry &Die : CU->dies()) - NumErrors += verifyNameIndexCompleteness(DWARFDie(CU, &Die), *NI); + parallelForEach(CU->dies(), [&](const DWARFDebugInfoEntry &Die) { + verifyNameIndexCompleteness(DWARFDie(CU, &Die), NI, + NamesToDieOffsets); + }); } } } } - return NumErrors; + return; } bool DWARFVerifier::handleAccelTables() { const DWARFObject &D = DCtx.getDWARFObj(); DataExtractor StrData(D.getStrSection(), DCtx.isLittleEndian(), 0); - unsigned NumErrors = 0; if (!D.getAppleNamesSection().Data.empty()) - NumErrors += verifyAppleAccelTable(&D.getAppleNamesSection(), &StrData, - ".apple_names"); + verifyAppleAccelTable(&D.getAppleNamesSection(), &StrData, ".apple_names"); if (!D.getAppleTypesSection().Data.empty()) - NumErrors += verifyAppleAccelTable(&D.getAppleTypesSection(), &StrData, - ".apple_types"); + verifyAppleAccelTable(&D.getAppleTypesSection(), &StrData, ".apple_types"); if (!D.getAppleNamespacesSection().Data.empty()) - NumErrors += verifyAppleAccelTable(&D.getAppleNamespacesSection(), &StrData, - ".apple_namespaces"); + verifyAppleAccelTable(&D.getAppleNamespacesSection(), &StrData, + ".apple_namespaces"); if (!D.getAppleObjCSection().Data.empty()) - NumErrors += verifyAppleAccelTable(&D.getAppleObjCSection(), &StrData, - ".apple_objc"); + verifyAppleAccelTable(&D.getAppleObjCSection(), &StrData, ".apple_objc"); if (!D.getNamesSection().Data.empty()) - NumErrors += verifyDebugNames(D.getNamesSection(), StrData); - return 
NumErrors == 0; + verifyDebugNames(D.getNamesSection(), StrData); + return ErrorCategory.GetNumErrors() == 0; } bool DWARFVerifier::handleDebugStrOffsets() { @@ -2176,6 +2232,8 @@ void OutputCategoryAggregator::Report( void OutputCategoryAggregator::Report( StringRef category, StringRef sub_category, std::function detailCallback) { + std::lock_guard Lock(WriteMutex); + ++NumErrors; std::string category_str = std::string(category); AggregationData &Agg = Aggregation[category_str]; Agg.OverallCount++; diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-completeness.s b/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-completeness.s index b16f8658f87ec..9886968fdab99 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-completeness.s +++ b/llvm/test/tools/llvm-dwarfdump/X86/debug-names-verify-completeness.s @@ -177,4 +177,3 @@ .Lnames_abbrev_end0: .Lnames_entries0: .Lnames_end0: - diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 82dda93b7f1ab..5749b19d7ad49 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -30,9 +30,11 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Parallel.h" #include "llvm/Support/Path.h" #include "llvm/Support/Regex.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" @@ -284,6 +286,14 @@ static cl::opt cat(DwarfDumpCategory)); static opt Verify("verify", desc("Verify the DWARF debug info."), cat(DwarfDumpCategory)); +static opt VerifyNumThreads( + "verify-num-threads", init(1), + desc("Number of threads to use for --verify. Single threaded verification " + "is the default unless this option is specified. If 0 is specified, " + "maximum hardware threads will be used. 
This can cause the "
+         "output to be non-deterministic, but can speed up verification and "
+         "is useful when running with the summary only or JSON summary modes."),
+    cat(DwarfDumpCategory));
 static opt<ErrorDetailLevel> ErrorDetails(
     "error-display", init(Unspecified),
     desc("Set the level of detail and summary to display when verifying "
@@ -778,7 +788,8 @@ static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
     if (filterArch(*Obj)) {
       std::unique_ptr<DWARFContext> DICtx = DWARFContext::create(
           *Obj, DWARFContext::ProcessDebugRelocations::Process, nullptr, "",
-          RecoverableErrorHandler);
+          RecoverableErrorHandler, WithColor::defaultWarningHandler,
+          /*ThreadSafe=*/true);
       DICtx->setParseCUTUIndexManually(ManuallyGenerateUnitIndex);
       if (!HandleObj(*Obj, *DICtx, Filename, OS))
         Result = false;
@@ -906,6 +917,11 @@ int main(int argc, char **argv) {
 
   bool Success = true;
   if (Verify) {
+    if (!VerifyNumThreads)
+      parallel::strategy =
+          hardware_concurrency(hardware_concurrency().compute_thread_count());
+    else
+      parallel::strategy = hardware_concurrency(VerifyNumThreads);
     for (StringRef Object : Objects)
       Success &= handleFile(Object, verifyObjectFile, OutputFile.os());
   } else if (Statistics) {

From 13aac46332f607a38067b5ddd466071683b8c255 Mon Sep 17 00:00:00 2001
From: Andy Kaylor
Date: Thu, 3 Apr 2025 14:03:25 -0700
Subject: [PATCH 0581/1029] [clang][NFC] Refactor CodeGen's
 hasBooleanRepresentation (#134159)

The ClangIR upstreaming project needs the same logic for
hasBooleanRepresentation() that is currently implemented in the standard
clang codegen. In order to share this code, this change moves the
implementation of this function into the AST Type class.

No functional change is intended by this change.

The ClangIR use of this function will be added separately in a later
change.
---
 clang/include/clang/AST/Type.h |  4 ++++
 clang/lib/AST/Type.cpp         | 13 +++++++++++++
 clang/lib/CodeGen/CGExpr.cpp   | 23 +++++------------------
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 06d60f618ddcb..9f6189440fabf 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2760,6 +2760,10 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   /// of some sort, e.g., it is a floating-point type or a vector thereof.
   bool hasFloatingRepresentation() const;
 
+  /// Determine whether this type has a boolean representation
+  /// of some sort.
+  bool hasBooleanRepresentation() const;
+
   // Type Checking Functions: Check to see if this type is structurally the
   // specified type, ignoring typedefs and qualifiers, and return a pointer to
   // the best type we can.
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 879ad1a7eaa84..4336fe44b82ad 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2334,6 +2334,19 @@ bool Type::isArithmeticType() const {
   return isa<ComplexType>(CanonicalType) || isBitIntType();
 }
 
+bool Type::hasBooleanRepresentation() const {
+  if (isBooleanType())
+    return true;
+
+  if (const EnumType *ET = getAs<EnumType>())
+    return ET->getDecl()->getIntegerType()->isBooleanType();
+
+  if (const AtomicType *AT = getAs<AtomicType>())
+    return AT->getValueType()->hasBooleanRepresentation();
+
+  return false;
+}
+
 Type::ScalarTypeKind Type::getScalarTypeKind() const {
   assert(isScalarType());
 
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 73020389b5e45..0d7e5a2091146 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1896,19 +1896,6 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(LValue lvalue,
                           lvalue.getTBAAInfo(), lvalue.isNontemporal());
 }
 
-static bool hasBooleanRepresentation(QualType Ty) {
-  if (Ty->isBooleanType())
-    return true;
-
-  if (const EnumType *ET = Ty->getAs<EnumType>())
-    return ET->getDecl()->getIntegerType()->isBooleanType();
-
-  if (const AtomicType *AT = Ty->getAs<AtomicType>())
-    return hasBooleanRepresentation(AT->getValueType());
-
-  return false;
-}
-
 static bool getRangeForType(CodeGenFunction &CGF, QualType Ty, llvm::APInt &Min,
                             llvm::APInt &End, bool StrictEnums, bool IsBool) {
@@ -1931,7 +1918,7 @@ static bool getRangeForType(CodeGenFunction &CGF, QualType Ty,
 llvm::MDNode *CodeGenFunction::getRangeForLoadFromType(QualType Ty) {
   llvm::APInt Min, End;
   if (!getRangeForType(*this, Ty, Min, End, CGM.getCodeGenOpts().StrictEnums,
-                       hasBooleanRepresentation(Ty)))
+                       Ty->hasBooleanRepresentation()))
     return nullptr;
 
   llvm::MDBuilder MDHelper(getLLVMContext());
@@ -1945,7 +1932,7 @@ bool CodeGenFunction::EmitScalarRangeCheck(llvm::Value *Value, QualType Ty,
   if (!HasBoolCheck && !HasEnumCheck)
     return false;
 
-  bool IsBool = hasBooleanRepresentation(Ty) ||
+  bool IsBool = Ty->hasBooleanRepresentation() ||
                 NSAPI(CGM.getContext()).isObjCBOOLType(Ty);
   bool NeedsBoolCheck = HasBoolCheck && IsBool;
   bool NeedsEnumCheck = HasEnumCheck && Ty->getAs<EnumType>();
@@ -2073,7 +2060,7 @@ llvm::Value *CodeGenFunction::EmitLoadOfScalar(Address Addr, bool Volatile,
 /// by ConvertType) to its load/store type (as returned by
 /// convertTypeForLoadStore).
 llvm::Value *CodeGenFunction::EmitToMemory(llvm::Value *Value, QualType Ty) {
-  if (hasBooleanRepresentation(Ty) || Ty->isBitIntType()) {
+  if (Ty->hasBooleanRepresentation() || Ty->isBitIntType()) {
     llvm::Type *StoreTy = convertTypeForLoadStore(Ty, Value->getType());
     bool Signed = Ty->isSignedIntegerOrEnumerationType();
     return Builder.CreateIntCast(Value, StoreTy, Signed, "storedv");
@@ -2114,7 +2101,7 @@ llvm::Value *CodeGenFunction::EmitFromMemory(llvm::Value *Value, QualType Ty) {
   }
 
   llvm::Type *ResTy = ConvertType(Ty);
-  if (hasBooleanRepresentation(Ty) || Ty->isBitIntType() ||
+  if (Ty->hasBooleanRepresentation() || Ty->isBitIntType() ||
       Ty->isExtVectorBoolType())
     return Builder.CreateTrunc(Value, ResTy, "loadedv");
 
@@ -2601,7 +2588,7 @@ void CodeGenFunction::EmitStoreThroughBitfieldLValue(RValue Src, LValue Dst,
       Builder.CreateLoad(Ptr, Dst.isVolatileQualified(), "bf.load");
     // Mask the source value as needed.
- if (!hasBooleanRepresentation(Dst.getType())) + if (!Dst.getType()->hasBooleanRepresentation()) SrcVal = Builder.CreateAnd( SrcVal, llvm::APInt::getLowBitsSet(StorageSize, Info.Size), "bf.value"); From a06ae976dc0d977ca034041a8b91634e32bf6994 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Thu, 3 Apr 2025 14:04:32 -0700 Subject: [PATCH 0582/1029] [CIR] Upstream support for promoted types with unary plus/minus (#133829) The initial upstreaming of unary operations left promoted types unhandled for the unary plus and minus operators. This change implements support for promoted types and performs a bit of related code cleanup. --- clang/lib/CIR/CodeGen/CIRGenBuilder.h | 9 +++++ clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 38 +++++++++----------- clang/test/CIR/CodeGen/unary.cpp | 42 ++++++++++++++++++++++ 3 files changed, 68 insertions(+), 21 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 03fb227a464a1..61a747254b3d0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -160,6 +160,15 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { llvm_unreachable("negation for the given type is NYI"); } + // TODO: split this to createFPExt/createFPTrunc when we have dedicated cast + // operations. + mlir::Value createFloatingCast(mlir::Value v, mlir::Type destType) { + assert(!cir::MissingFeatures::fpConstraints()); + + return create(v.getLoc(), destType, cir::CastKind::floating, + v); + } + mlir::Value createFSub(mlir::Location loc, mlir::Value lhs, mlir::Value rhs) { assert(!cir::MissingFeatures::metaDataNode()); assert(!cir::MissingFeatures::fpConstraints()); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 5ac1dc1052c2e..6289a8f1d2ed7 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -88,13 +88,11 @@ class ScalarExprEmitter : public StmtVisitor { //===--------------------------------------------------------------------===// mlir::Value emitPromotedValue(mlir::Value result, QualType promotionType) { - cgf.cgm.errorNYI(result.getLoc(), "floating cast for promoted value"); - return {}; + return builder.createFloatingCast(result, cgf.convertType(promotionType)); } mlir::Value emitUnPromotedValue(mlir::Value result, QualType exprType) { - cgf.cgm.errorNYI(result.getLoc(), "floating cast for unpromoted value"); - return {}; + return builder.createFloatingCast(result, cgf.convertType(exprType)); } mlir::Value emitPromoted(const Expr *e, QualType promotionType); @@ -446,37 +444,35 @@ class ScalarExprEmitter : public StmtVisitor { llvm_unreachable("Unexpected signed overflow behavior kind"); } - mlir::Value VisitUnaryPlus(const UnaryOperator *e, - QualType promotionType = QualType()) { - if (!promotionType.isNull()) - cgf.cgm.errorNYI(e->getSourceRange(), "VisitUnaryPlus: promotionType"); - assert(!cir::MissingFeatures::opUnaryPromotionType()); - mlir::Value result = emitUnaryPlusOrMinus(e, cir::UnaryOpKind::Plus); - return result; + mlir::Value VisitUnaryPlus(const UnaryOperator *e) { + return emitUnaryPlusOrMinus(e, cir::UnaryOpKind::Plus); } - mlir::Value VisitUnaryMinus(const UnaryOperator *e, - QualType promotionType = QualType()) { - if (!promotionType.isNull()) - cgf.cgm.errorNYI(e->getSourceRange(), "VisitUnaryMinus: promotionType"); - assert(!cir::MissingFeatures::opUnaryPromotionType()); - mlir::Value result = emitUnaryPlusOrMinus(e, cir::UnaryOpKind::Minus); - return 
result; + mlir::Value VisitUnaryMinus(const UnaryOperator *e) { + return emitUnaryPlusOrMinus(e, cir::UnaryOpKind::Minus); } mlir::Value emitUnaryPlusOrMinus(const UnaryOperator *e, cir::UnaryOpKind kind) { ignoreResultAssign = false; - assert(!cir::MissingFeatures::opUnaryPromotionType()); - mlir::Value operand = Visit(e->getSubExpr()); + QualType promotionType = getPromotionType(e->getSubExpr()->getType()); + + mlir::Value operand; + if (!promotionType.isNull()) + operand = cgf.emitPromotedScalarExpr(e->getSubExpr(), promotionType); + else + operand = Visit(e->getSubExpr()); bool nsw = kind == cir::UnaryOpKind::Minus && e->getType()->isSignedIntegerType(); // NOTE: LLVM codegen will lower this directly to either a FNeg // or a Sub instruction. In CIR this will be handled later in LowerToLLVM. - return emitUnaryOp(e, kind, operand, nsw); + mlir::Value result = emitUnaryOp(e, kind, operand, nsw); + if (result && !promotionType.isNull()) + return emitUnPromotedValue(result, e->getType()); + return result; } mlir::Value emitUnaryOp(const UnaryOperator *e, cir::UnaryOpKind kind, diff --git a/clang/test/CIR/CodeGen/unary.cpp b/clang/test/CIR/CodeGen/unary.cpp index ca47c1068e08d..a6405e653a07c 100644 --- a/clang/test/CIR/CodeGen/unary.cpp +++ b/clang/test/CIR/CodeGen/unary.cpp @@ -424,3 +424,45 @@ void chars(char c) { c++; // CHECK: cir.unary(inc, %{{.+}}) : !s8i, !s8i c--; // CHECK: cir.unary(dec, %{{.+}}) : !s8i, !s8i } + +_Float16 fp16UPlus(_Float16 f) { + return +f; +} + +// CHECK: cir.func @fp16UPlus({{.*}}) -> !cir.f16 +// CHECK: %[[INPUT:.*]] = cir.load %[[F:.*]] +// CHECK: %[[PROMOTED:.*]] = cir.cast(floating, %[[INPUT]] : !cir.f16), !cir.float +// CHECK: %[[RESULT:.*]] = cir.unary(plus, %[[PROMOTED]]) +// CHECK: %[[UNPROMOTED:.*]] = cir.cast(floating, %[[RESULT]] : !cir.float), !cir.f16 + +// LLVM: define half @fp16UPlus({{.*}}) +// LLVM: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2 +// LLVM: %[[PROMOTED:.*]] = fpext half %[[F_LOAD]] to float +// LLVM: %[[UNPROMOTED:.*]] = fptrunc float %[[PROMOTED]] to half + +// OGCG: define{{.*}} half @_Z9fp16UPlusDF16_({{.*}}) +// OGCG: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2 +// OGCG: %[[PROMOTED:.*]] = fpext half %[[F_LOAD]] to float +// OGCG: %[[UNPROMOTED:.*]] = fptrunc float %[[PROMOTED]] to half + +_Float16 fp16UMinus(_Float16 f) { + return -f; +} + +// CHECK: cir.func @fp16UMinus({{.*}}) -> !cir.f16 +// CHECK: %[[INPUT:.*]] = cir.load %[[F:.*]] +// CHECK: %[[PROMOTED:.*]] = cir.cast(floating, %[[INPUT]] : !cir.f16), !cir.float +// CHECK: %[[RESULT:.*]] = cir.unary(minus, %[[PROMOTED]]) +// CHECK: %[[UNPROMOTED:.*]] = cir.cast(floating, %[[RESULT]] : !cir.float), !cir.f16 + +// LLVM: define half @fp16UMinus({{.*}}) +// LLVM: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2 +// LLVM: %[[PROMOTED:.*]] = fpext half %[[F_LOAD]] to float +// LLVM: %[[RESULT:.*]] = fneg float %[[PROMOTED]] +// LLVM: %[[UNPROMOTED:.*]] = fptrunc float %[[RESULT]] to half + +// OGCG: define{{.*}} half @_Z10fp16UMinusDF16_({{.*}}) +// OGCG: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2 +// OGCG: %[[PROMOTED:.*]] = fpext half %[[F_LOAD]] to float +// OGCG: %[[RESULT:.*]] = fneg float %[[PROMOTED]] +// OGCG: %[[UNPROMOTED:.*]] = fptrunc float %[[RESULT]] to half From 0f696c2e866d7aa913c7a1eaadde9bd4378e25b1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 3 Apr 2025 22:14:14 +0100 Subject: [PATCH 0583/1029] [LV] Add test where epilogue is vectorized and backedge removed. 
Adds extra test coverage for https://github.com/llvm/llvm-project/pull/106748. --- ...ctor-loop-backedge-elimination-epilogue.ll | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll new file mode 100644 index 0000000000000..2b2ab8da6638b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -mcpu=apple-m1 -S %s | FileCheck %s + +target triple = "arm64-apple-macosx" + +define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c) { +; CHECK-LABEL: define void @test_remove_vector_loop_region_epilogue( +; CHECK-SAME: ptr [[DST:%.*]], i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TC:%.*]] = select i1 [[C]], i64 8, i64 0 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TC]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TC]], 64 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TC]], 64 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TC]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i32 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i32 48 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP1]], align 4 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP2]], align 4 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP3]], align 4 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 4 +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TC]], [[N_VEC]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TC]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TC]], [[N_MOD_VF2]] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[VEC_EPILOG_RESUME_VAL]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP6]], align 4 +; CHECK-NEXT: br label 
%[[VEC_EPILOG_MIDDLE_BLOCK:.*]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TC]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N4]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[TC]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %tc = select i1 %c, i64 8, i64 0 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep = getelementptr i8, ptr %dst, i64 %iv + store i8 0, ptr %gep, align 4 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %tc + br i1 %ec, label %exit, label %loop + +exit: + ret void +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +;. From de40f6101d34dd7964d409366fca663c5515a941 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 3 Apr 2025 14:19:21 -0700 Subject: [PATCH 0584/1029] [flang][cuda][NFC] Use NVVM op for match all (#134303) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 38 ++++++++----------- flang/test/Lower/CUDA/cuda-device-proc.cuf | 10 ++--- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index a562d9b7e461c..349345c1a2ca0 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6478,31 +6478,23 @@ IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, assert(args.size() == 3); bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - llvm::StringRef funcName = - is32 ? "llvm.nvvm.match.all.sync.i32p" : "llvm.nvvm.match.all.sync.i64p"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32Ty = builder.getI32Type(); - mlir::Type i64Ty = builder.getI64Type(); mlir::Type i1Ty = builder.getI1Type(); - mlir::Type retTy = mlir::TupleType::get(context, {resultType, i1Ty}); - mlir::Type valTy = is32 ? 
i32Ty : i64Ty; + mlir::MLIRContext *context = builder.getContext(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {i32Ty, valTy}, {retTy}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - llvm::SmallVector filteredArgs; - filteredArgs.push_back(args[0]); - if (args[1].getType().isF32() || args[1].getType().isF64()) - filteredArgs.push_back(builder.create(loc, valTy, args[1])); - else - filteredArgs.push_back(args[1]); - auto call = builder.create(loc, funcOp, filteredArgs); - auto zero = builder.getIntegerAttr(builder.getIndexType(), 0); - auto value = builder.create( - loc, resultType, call.getResult(0), builder.getArrayAttr(zero)); - auto one = builder.getIntegerAttr(builder.getIndexType(), 1); - auto pred = builder.create(loc, i1Ty, call.getResult(0), - builder.getArrayAttr(one)); + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = builder.create( + loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + + mlir::Type retTy = + mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); + auto match = + builder + .create(loc, retTy, args[0], arg1, + mlir::NVVM::MatchSyncKind::all) + .getResult(); + auto value = builder.create(loc, match, 0); + auto pred = builder.create(loc, match, 1); auto conv = builder.create(loc, resultType, pred); builder.create(loc, conv, args[2]); return value; diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index dbce4a5fa47dd..016d3bd1f1511 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -124,12 +124,10 @@ attributes(device) subroutine testMatch() end subroutine ! CHECK-LABEL: func.func @_QPtestmatch() -! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p -! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p -! CHECK: fir.convert %{{.*}} : (f32) -> i32 -! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p -! CHECK: fir.convert %{{.*}} : (f64) -> i64 -! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p +! CHECK: %{{.*}} = nvvm.match.sync all %{{.*}}, %{{.*}} : i32 -> !llvm.struct<(i32, i1)> +! CHECK: %{{.*}} = nvvm.match.sync all %{{.*}}, %{{.*}} : i64 -> !llvm.struct<(i32, i1)> +! CHECK: %{{.*}} = nvvm.match.sync all %{{.*}}, %{{.*}} : i32 -> !llvm.struct<(i32, i1)> +! CHECK: %{{.*}} = nvvm.match.sync all %{{.*}}, %{{.*}} : i64 -> !llvm.struct<(i32, i1)> attributes(device) subroutine testMatchAny() integer :: a, mask, v32 From fb6f60ddc5f2e386bbeaa2927a6081d15bc159fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 3 Apr 2025 14:19:31 -0700 Subject: [PATCH 0585/1029] [flang][cuda][NFC] Use NVVM VoteBallotOp (#134307) `llvm.nvvm.vote.ballot.sync` has its own operation so use it in lowering. 
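A rough before/after sketch of the change (SSA value names and the exact shape of the old call are illustrative; the new form matches the updated test below):

```mlir
// Before: the ballot was lowered as an opaque runtime call.
%res = fir.call @llvm.nvvm.vote.ballot.sync(%mask, %pred) : (i32, i1) -> i32
// After: the dedicated NVVM operation carries the semantics directly.
%res = nvvm.vote.ballot.sync %mask, %pred : i32
```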
--- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 7 +++++-- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 349345c1a2ca0..ba6743abaeaf7 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6534,8 +6534,11 @@ mlir::Value IntrinsicLibrary::genVoteBallotSync(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 2); - return genVoteSync(builder, loc, "llvm.nvvm.vote.ballot.sync", - builder.getI32Type(), args); + mlir::Value arg1 = + builder.create(loc, builder.getI1Type(), args[1]); + return builder + .create(loc, resultType, args[0], arg1) + .getResult(); } // MATCH_ANY_SYNC diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 016d3bd1f1511..a7f9038761b51 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -303,7 +303,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtestvote() ! CHECK: fir.call @llvm.nvvm.vote.all.sync ! CHECK: fir.call @llvm.nvvm.vote.any.sync -! CHECK: fir.call @llvm.nvvm.vote.ballot.sync +! CHECK: %{{.*}} = nvvm.vote.ballot.sync %{{.*}}, %{{.*}} : i32 ! CHECK-DAG: func.func private @__ldca_i4x4_(!fir.ref>, !fir.ref>) ! CHECK-DAG: func.func private @__ldcg_i4x4_(!fir.ref>, !fir.ref>) From 85fdab33b09ab2246e69f806a85f1846a9ea1e51 Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Thu, 3 Apr 2025 14:33:53 -0700 Subject: [PATCH 0586/1029] [flang][intrinsic] add nonstandard intrinsic unlink (#134162) This PR adds the intrinsic `unlink` to flang. ## Test plan - Added two codegen unit tests and ensured flang-check continues to pass. - Manually compiled and ran the example from the documentation. 
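For reference, a minimal usage sketch of the two forms (the file name is illustrative; as the documentation below notes, only one form may be used in any given program unit):

```Fortran
SUBROUTINE demo_unlink_sub()
  INTEGER :: status
  ! Subroutine form: STATUS receives 0 on success, C's errno on failure.
  CALL UNLINK("scratch.txt", status)
  IF (status .NE. 0) PRINT *, "unlink failed with status", status
END SUBROUTINE

INTEGER FUNCTION demo_unlink_fn()
  ! Function form: the status code is the return value.
  demo_unlink_fn = UNLINK("scratch.txt")
END FUNCTION
```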
--- flang-rt/lib/runtime/command.cpp | 19 ++++++- flang/docs/Intrinsics.md | 42 +++++++++++++++ .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + .../flang/Optimizer/Builder/Runtime/Command.h | 5 ++ flang/include/flang/Runtime/command.h | 7 ++- flang/lib/Evaluate/intrinsics.cpp | 12 ++++- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 37 +++++++++++++ .../lib/Optimizer/Builder/Runtime/Command.cpp | 14 +++++ flang/test/Lower/Intrinsics/unlink-func.f90 | 24 +++++++++ flang/test/Lower/Intrinsics/unlink-sub.f90 | 54 +++++++++++++++++++ 10 files changed, 213 insertions(+), 3 deletions(-) create mode 100644 flang/test/Lower/Intrinsics/unlink-func.f90 create mode 100644 flang/test/Lower/Intrinsics/unlink-sub.f90 diff --git a/flang-rt/lib/runtime/command.cpp b/flang-rt/lib/runtime/command.cpp index d2e09639fdb59..b69143bf458ba 100644 --- a/flang-rt/lib/runtime/command.cpp +++ b/flang-rt/lib/runtime/command.cpp @@ -12,6 +12,7 @@ #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" +#include #include #include @@ -19,6 +20,7 @@ #include "flang/Common/windows-include.h" #include #define getcwd _getcwd +#define unlink _unlink #define PATH_MAX MAX_PATH #ifdef _MSC_VER @@ -27,7 +29,7 @@ inline pid_t getpid() { return GetCurrentProcessId(); } #endif #else -#include //getpid() +#include //getpid() unlink() #ifndef PATH_MAX #define PATH_MAX 4096 @@ -307,4 +309,19 @@ std::int32_t RTNAME(Hostnm)( return status; } +std::int32_t RTNAME(Unlink)( + const char *str, size_t strLength, const char *sourceFile, int line) { + Terminator terminator{sourceFile, line}; + + auto pathLength = TrimTrailingSpaces(str, strLength); + auto path = SaveDefaultCharacter(str, pathLength, terminator); + + std::int32_t status{0}; + + if (unlink(path.get()) != 0) { + status = errno; + } + + return status; +} } // namespace Fortran::runtime diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index ddb053d7a3d0b..e885ceca25aad 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -1091,6 +1091,48 @@ end program rename_proc This intrinsic is an alias for `CPU_TIME`: supporting both a subroutine and a function form. +### Non-Standard Intrinsics: UNLINK + +#### Description +`UNLINK(PATH [, STATUS])` deletes a link to a file. + +This intrinsic is provided in both subroutine and function forms; however, only +one form can be used in any given program unit. + +| ARGUMENT | INTENT | TYPE | KIND | Description | +|----------|--------|-------------|---------|---------------------------------| +| `PATH` | `IN` | `CHARACTER` | default | The path of the file to unlink. | +| `STATUS` | `OUT` | `INTEGER` | default | Optional. Returns 0 on success, C's `errno` on failure. | + +#### Usage and Info + +- **Standard:** GNU extension +- **Class:** Subroutine, function +- **Syntax:** `CALL UNLINK(PATH [, STATUS])`, `STATUS = UNLINK(PATH)` + +#### Example +The following example just prints "hello.txt doesn't exist". +```Fortran +SUBROUTINE try_unlink_hello_again() + INTEGER :: status + CALL UNLINK("hello.txt", status) + IF (status .NE. 0) PRINT *, "hello.txt doesn't exist" +END SUBROUTINE + +PROGRAM example_unlink + INTEGER :: hello + ! Create ./hello.txt + OPEN(newunit=hello, file="hello.txt") + WRITE (hello, *), "Hello!" + CLOSE(hello) + + ! Delete ./hello.txt + IF (UNLINK("hello.txt") .NE. 
0) PRINT *, "didn't create a file" + + CALL try_unlink_hello_again() +END PROGRAM +``` + ### Non-standard Intrinsics: LNBLNK This intrinsic is an alias for `LEN_TRIM`, without the optional KIND argument. diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 4cbef141ced94..a4268e74d9a67 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -439,6 +439,8 @@ struct IntrinsicLibrary { void genThreadFenceSystem(llvm::ArrayRef); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef); + fir::ExtendedValue genUnlink(std::optional resultType, + llvm::ArrayRef args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef); mlir::Value genVoteAllSync(mlir::Type, llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Command.h b/flang/include/flang/Optimizer/Builder/Runtime/Command.h index ba0d3b094f40c..5880a703ed92e 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Command.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Command.h @@ -68,5 +68,10 @@ mlir::Value genHostnm(fir::FirOpBuilder &builder, mlir::Location loc, void genPerror(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value string); +/// Generate a call to the Unlink runtime function which implements +/// the UNLINK intrinsic. +mlir::Value genUnlink(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value path, mlir::Value pathLength); + } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COMMAND_H diff --git a/flang/include/flang/Runtime/command.h b/flang/include/flang/Runtime/command.h index e0069a9bc0321..16854c981ca23 100644 --- a/flang/include/flang/Runtime/command.h +++ b/flang/include/flang/Runtime/command.h @@ -63,7 +63,12 @@ std::int32_t RTNAME(GetCwd)( // Calls hostnm() std::int32_t RTNAME(Hostnm)( const Descriptor &res, const char *sourceFile, int line); -} + +// Calls unlink() +std::int32_t RTNAME(Unlink)( + const char *path, size_t pathLength, const char *sourceFile, int line); + +} // extern "C" } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_COMMAND_H_ diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 0c15ec5473965..0eb8419491a61 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -1010,6 +1010,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ KINDUnsigned}, {"umaskl", {{"i", AnyInt}, DefaultingKIND}, KINDUnsigned}, {"umaskr", {{"i", AnyInt}, DefaultingKIND}, KINDUnsigned}, + {"unlink", {{"path", DefaultChar, Rank::scalar}}, DefaultInt, Rank::scalar, + IntrinsicClass::transformationalFunction}, {"unpack", {{"vector", SameType, Rank::vector}, {"mask", AnyLogical, Rank::array}, {"field", SameType, Rank::conformable}}, @@ -1319,6 +1321,8 @@ static const SpecificIntrinsicInterface specificIntrinsicFunction[]{ "abs"}, }; +// Must be sorted by name. The rank of the return value is ignored since +// subroutines are do not have a return value. 
static const IntrinsicInterface intrinsicSubroutine[]{ {"abort", {}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"atomic_add", @@ -1631,6 +1635,12 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {{"seconds", AnyInt, Rank::scalar, Optionality::required, common::Intent::In}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"unlink", + {{"path", DefaultChar, Rank::scalar, Optionality::required, + common::Intent::In}, + {"status", DefaultInt, Rank::scalar, Optionality::optional, + common::Intent::Out}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, }; // Finds a built-in derived type and returns it as a DynamicType. @@ -2800,7 +2810,7 @@ bool IntrinsicProcTable::Implementation::IsDualIntrinsic( // Collection for some intrinsics with function and subroutine form, // in order to pass the semantic check. static const std::string dualIntrinsic[]{{"chdir"s}, {"etime"s}, {"getcwd"s}, - {"hostnm"s}, {"rename"s}, {"second"s}, {"system"s}}; + {"hostnm"s}, {"rename"s}, {"second"s}, {"system"s}, {"unlink"s}}; return llvm::is_contained(dualIntrinsic, name); } diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ba6743abaeaf7..ad2324d5b5edc 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -937,6 +937,10 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"umaskl", &I::genMask}, {"umaskr", &I::genMask}, + {"unlink", + &I::genUnlink, + {{{"path", asAddr}, {"status", asAddr, handleDynamicOptional}}}, + /*isElemental=*/false}, {"unpack", &I::genUnpack, {{{"vector", asBox}, {"mask", asBox}, {"field", asBox}}}, @@ -8513,6 +8517,39 @@ static mlir::Value createExtremumCompare(mlir::Location loc, return result; } +// UNLINK +fir::ExtendedValue +IntrinsicLibrary::genUnlink(std::optional resultType, + llvm::ArrayRef args) { + assert((resultType.has_value() && args.size() == 1) || + (!resultType.has_value() && args.size() >= 1 && args.size() <= 2)); + + mlir::Value path = fir::getBase(args[0]); + mlir::Value pathLength = fir::getLen(args[0]); + mlir::Value statusValue = + fir::runtime::genUnlink(builder, loc, path, pathLength); + + if (resultType.has_value()) { + // Function form, return status. + return builder.createConvert(loc, *resultType, statusValue); + } + + // Subroutine form, store status and return none. 
+ const fir::ExtendedValue &status = args[1]; + if (!isStaticallyAbsent(status)) { + mlir::Value statusAddr = fir::getBase(status); + mlir::Value statusIsPresentAtRuntime = + builder.genIsNotNullAddr(loc, statusAddr); + builder.genIfThen(loc, statusIsPresentAtRuntime) + .genThen([&]() { + builder.createStoreWithConvert(loc, statusValue, statusAddr); + }) + .end(); + } + + return {}; +} + // UNPACK fir::ExtendedValue IntrinsicLibrary::genUnpack(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/Runtime/Command.cpp b/flang/lib/Optimizer/Builder/Runtime/Command.cpp index 9b814c3395aa1..27ea5961837e6 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Command.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Command.cpp @@ -125,3 +125,17 @@ void fir::runtime::genPerror(fir::FirOpBuilder &builder, mlir::Location loc, fir::runtime::createArguments(builder, loc, runtimeFuncTy, string); builder.create(loc, runtimeFunc, args); } + +mlir::Value fir::runtime::genUnlink(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value path, + mlir::Value pathLength) { + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc(loc, builder); + auto runtimeFuncTy = func.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(1)); + llvm::SmallVector args = fir::runtime::createArguments( + builder, loc, runtimeFuncTy, path, pathLength, sourceFile, sourceLine); + return builder.create(loc, func, args).getResult(0); +} diff --git a/flang/test/Lower/Intrinsics/unlink-func.f90 b/flang/test/Lower/Intrinsics/unlink-func.f90 new file mode 100644 index 0000000000000..15025a7c2f151 --- /dev/null +++ b/flang/test/Lower/Intrinsics/unlink-func.f90 @@ -0,0 +1,24 @@ +!RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPunlink_test +!CHECK-SAME: %[[dummyPath:.*]]: !fir.boxchar<1> {fir.bindc_name = "path"}) -> i32 { +integer function unlink_test(path) +CHARACTER(len=255) :: path + +!CHECK-DAG: %[[func_result:.*]] = fir.alloca i32 {bindc_name = "unlink_test", uniq_name = "_QFunlink_testEunlink_test"} +!CHECK-DAG: %[[func_result_decl:.*]]:{{.*}} = hlfir.declare %[[func_result]] {uniq_name = "_QFunlink_testEunlink_test"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK-DAG: %[[src_path_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref +!CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 +!CHECK-DAG: %[[path:.*]] = fir.convert {{.*}} (!fir.ref>) -> !fir.ref +!CHECK-DAG: %[[path_len:.*]] = fir.convert {{.*}} : (index) -> i64 +!CHECK-DAG: %[[src_path:.*]] = fir.convert %[[src_path_addr]] : (!fir.ref) -> !fir.ref +!CHECK-DAG: %[[line:.*]] = fir.convert %[[line_value]] : (i64) -> i32 +!CHECK: %[[unlink_result:.*]] = fir.call @_FortranAUnlink(%[[path]], %[[path_len]], %[[src_path]], %[[line]]) +!CHECK-SAME: -> i32 + +! 
Check _FortranAUnlink result code handling +!CHECK-DAG: hlfir.assign %[[unlink_result]] to %[[func_result_decl]]#0 : i32, !fir.ref +!CHECK-DAG: %[[load_result:.*]] = fir.load %[[func_result_decl]]#0 : !fir.ref +!CHECK: return %[[load_result]] : i32 +unlink_test = unlink(path) +end function unlink_test diff --git a/flang/test/Lower/Intrinsics/unlink-sub.f90 b/flang/test/Lower/Intrinsics/unlink-sub.f90 new file mode 100644 index 0000000000000..78d2b1096ae82 --- /dev/null +++ b/flang/test/Lower/Intrinsics/unlink-sub.f90 @@ -0,0 +1,54 @@ +!RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPpath_only +!CHECK-SAME: %[[dummyPath:.*]]: !fir.boxchar<1> {fir.bindc_name = "path"}) { +subroutine path_only(path) + CHARACTER(len=*) :: path + !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope + !CHECK-DAG: %[[unbox_path:.*]]:2 = fir.unboxchar %[[dummyPath]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] {uniq_name = "_QFpath_onlyEpath"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + !CHECK-DAG: %[[src_path_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref> + !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 + !CHECK-DAG: %[[path:.*]] = fir.convert %[[path_decl]]#1 : (!fir.ref>) -> !fir.ref + !CHECK-DAG: %[[path_len:.*]] = fir.convert %[[unbox_path]]#1 : (index) -> i64 + !CHECK-DAG: %[[src_path:.*]] = fir.convert %[[src_path_addr]] : (!fir.ref) -> !fir.ref + !CHECK-DAG: %[[line:.*]] = fir.convert %[[line_value]] : (i64) -> i32 + !CHECK: fir.call @_FortranAUnlink(%[[path]], %[[path_len]], %[[src_path]], %[[line]]) + !CHECK-SAME: : (!fir.ref, i64, !fir.ref, i32) + !CHECK-SAME: -> i32 + call unlink(path) + !CHECK: return +end subroutine path_only + !CHECK: } + + !CHECK-LABEL: func.func @_QPall_arguments + !CHECK-SAME: %[[dummyPath:.*]]: !fir.boxchar<1> {fir.bindc_name = "path"} + !CHECK-SAME: %[[dummyStat:.*]]: !fir.ref {fir.bindc_name = "status"} + !CHECK-SAME: ) { +subroutine all_arguments(path, status) + CHARACTER(len=*) :: path + INTEGER :: status + !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope + !CHECK-DAG: %[[unbox_path:.*]]:2 = fir.unboxchar %[[dummyPath]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-DAG: %[[path_decl:.*]]:2 = hlfir.declare %[[unbox_path]]#0 typeparams %[[unbox_path]]#1 dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEpath"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + !CHECK-DAG: %[[src_path_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref> + !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 + !CHECK-DAG: %[[path:.*]] = fir.convert %[[path_decl]]#1 : (!fir.ref>) -> !fir.ref + !CHECK-DAG: %[[path_len:.*]] = fir.convert %[[unbox_path]]#1 : (index) -> i64 + !CHECK-DAG: %[[src_path:.*]] = fir.convert %[[src_path_addr]] : (!fir.ref) -> !fir.ref + !CHECK-DAG: %[[line:.*]] = fir.convert %[[line_value]] : (i64) -> i32 + !CHECK: %[[unlink_result:.*]] = fir.call @_FortranAUnlink(%[[path]], %[[path_len]], %[[src_path]], %[[line]]) + !CHECK-SAME: : (!fir.ref, i64, !fir.ref, i32) + !CHECK-SAME: -> i32 + + !CHECK-DAG: %[[status_i64:.*]] = fir.convert %[[status_decl]]#0 : (!fir.ref) -> i64 + !CHECK-DAG: %[[c_null:.*]] = arith.constant 0 : i64 + !CHECK-DAG: %[[cmp_result:.*]] = arith.cmpi 
ne, %[[status_i64]], %[[c_null]] : i64
+  !CHECK: fir.if %[[cmp_result]] {
+  !CHECK-NEXT: fir.store %[[unlink_result]] to %[[status_decl]]#0 : !fir.ref
+  !CHECK-NEXT: }
+  call unlink(path, status)
+  !CHECK: return
+end subroutine all_arguments
+  !CHECK: }

From 506630d6db7c848f8943fff752039b9e1c91cb63 Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Thu, 3 Apr 2025 14:57:06 -0700
Subject: [PATCH 0587/1029] [clang][deps] Avoid unchecked error assertion
 (#134284)

---
 clang/tools/clang-scan-deps/ClangScanDeps.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
index fa63649bb9028..13dab6e445733 100644
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -929,7 +929,7 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) {
     FileOS.emplace(OutputFileName, EC, llvm::sys::fs::OF_Text);
     if (EC) {
       llvm::errs() << "Failed to open output file '" << OutputFileName
-                   << "': " << llvm::errorCodeToError(EC) << '\n';
+                   << "': " << EC.message() << '\n';
       std::exit(1);
     }
     return *FileOS;

From 109566a3d0cf20fc13a181a844c159032e7042ee Mon Sep 17 00:00:00 2001
From: Jorge Gorbe Moya
Date: Thu, 3 Apr 2025 15:21:26 -0700
Subject: [PATCH 0588/1029] [bazel] Fold "${Target}Analysis" targets into
 their respective CodeGen targets. (#134312)

After 3801bf6164f570a145e3ebd20cf9114782ae0329, SPIRVAnalysis needs to
include SPIRV.h provided by SPIRVCodegen, but the CodeGen target already
depends on Analysis, so that would cause a circular dependency. Analysis is
a subdirectory of CodeGen so it makes sense as a part of the main CodeGen
target too.
---
 .../llvm-project-overlay/llvm/BUILD.bazel | 21 ++-----------------
 1 file changed, 2 insertions(+), 19 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 072147b7b6150..44fb4357c1e1f 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -3371,6 +3371,8 @@ gentbl_cc_library(
         name = target["name"] + "CodeGen",
         srcs = glob(
             [
+                "lib/Target/" + target["name"] + "/Analysis/*.cpp",
+                "lib/Target/" + target["name"] + "/Analysis/*.h",
                 "lib/Target/" + target["name"] + "/GISel/*.cpp",
                 "lib/Target/" + target["name"] + "/GISel/*.h",
                 "lib/Target/" + target["name"] + "/*.cpp",
@@ -3406,7 +3408,6 @@ gentbl_cc_library(
             ":TransformUtils",
             ":Vectorize",
             ":config",
-            ":" + target["name"] + "Analysis",
             ":" + target["name"] + "CommonTableGen",
             ":" + target["name"] + "Info",
             ":" + target["name"] + "UtilsAndDesc",
@@ -3511,24 +3512,6 @@ gentbl_cc_library(
             ":" + target["name"] + "UtilsAndDesc",
         ],
     )],
-    [cc_library(
-        name = target["name"] + "Analysis",
-        srcs = glob(
-            [
-                "lib/Target/" + target["name"] + "/Analysis/*.cpp",
-                "lib/Target/" + target["name"] + "/Analysis/*.h",
-            ],
-            allow_empty = True,
-        ),
-        copts = llvm_copts,
-        features = ["-layering_check"],
-        deps = [
-            ":Analysis",
-            ":Core",
-            ":Support",
-            ":TransformUtils",
-        ],
-    )],
 ] for target in llvm_target_lib_list]

 cc_library(

From 996cf5dc6731d00da6ce5dc7a25b399d0ed29d54 Mon Sep 17 00:00:00 2001
From: Sumit Agarwal
Date: Thu, 3 Apr 2025 15:23:09 -0700
Subject: [PATCH 0589/1029] [HLSL] Implement dot2add intrinsic (#131237)

Resolves #99221

Key points: For the SPIRV backend, it decomposes into a `dot` followed by an
`add`.
- [x] Implement dot2add clang builtin, - [x] Link dot2add clang builtin with hlsl_intrinsics.h - [x] Add sema checks for dot2add to CheckHLSLBuiltinFunctionCall in SemaHLSL.cpp - [x] Add codegen for dot2add to EmitHLSLBuiltinExpr in CGBuiltin.cpp - [x] Add codegen tests to clang/test/CodeGenHLSL/builtins/dot2add.hlsl - [x] Add sema tests to clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl - [x] Create the int_dx_dot2add intrinsic in IntrinsicsDirectX.td - [x] Create the DXILOpMapping of int_dx_dot2add to 162 in DXIL.td - [x] Create the dot2add.ll and dot2add_errors.ll tests in llvm/test/CodeGen/DirectX/ --- clang/include/clang/Basic/Builtins.td | 6 + clang/lib/CodeGen/CGHLSLBuiltins.cpp | 13 ++ .../lib/Headers/hlsl/hlsl_intrinsic_helpers.h | 8 ++ clang/lib/Headers/hlsl/hlsl_intrinsics.h | 15 ++ clang/test/CodeGenHLSL/builtins/dot2add.hlsl | 135 ++++++++++++++++++ .../SemaHLSL/BuiltIns/dot2add-errors.hlsl | 13 ++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 4 + llvm/lib/Target/DirectX/DXIL.td | 11 ++ llvm/lib/Target/DirectX/DXILOpLowering.cpp | 19 ++- llvm/test/CodeGen/DirectX/dot2add.ll | 8 ++ 10 files changed, 228 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGenHLSL/builtins/dot2add.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/dot2add.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index b2c7ddb43de55..c7ca607e4b3d2 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4891,6 +4891,12 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLDot2Add : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_dot2add"]; + let Attributes = [NoThrow, Const]; + let Prototype = "float(_ExtVector<2, _Float16>, _ExtVector<2, _Float16>, float)"; +} + def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_dot4add_i8packed"]; let Attributes = [NoThrow, Const]; diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 99c62808c323d..07f6d0953f026 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -380,6 +380,19 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, getDotProductIntrinsic(CGM.getHLSLRuntime(), VecTy0->getElementType()), ArrayRef{Op0, Op1}, nullptr, "hlsl.dot"); } + case Builtin::BI__builtin_hlsl_dot2add: { + llvm::Triple::ArchType Arch = CGM.getTarget().getTriple().getArch(); + assert(Arch == llvm::Triple::dxil && + "Intrinsic dot2add is only allowed for dxil architecture"); + Value *A = EmitScalarExpr(E->getArg(0)); + Value *B = EmitScalarExpr(E->getArg(1)); + Value *C = EmitScalarExpr(E->getArg(2)); + + Intrinsic::ID ID = llvm ::Intrinsic::dx_dot2add; + return Builder.CreateIntrinsic( + /*ReturnType=*/C->getType(), ID, ArrayRef{A, B, C}, nullptr, + "dx.dot2add"); + } case Builtin::BI__builtin_hlsl_dot4add_i8packed: { Value *A = EmitScalarExpr(E->getArg(0)); Value *B = EmitScalarExpr(E->getArg(1)); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h index 8cdd63d7e07bb..3c15f2b38d80f 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h @@ -45,6 +45,14 @@ distance_vec_impl(vector X, vector Y) { return length_vec_impl(X - Y); } +constexpr float dot2add_impl(half2 a, half2 b, float c) { +#if defined(__DIRECTX__) + return 
__builtin_hlsl_dot2add(a, b, c);
+#else
+  return dot(a, b) + c;
+#endif
+}
+
 template constexpr T reflect_impl(T I, T N) {
   return I - 2 * N * I * N;
 }
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index fd799b8d874ae..1a61fdba4fc19 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -175,6 +175,21 @@ const inline float distance(__detail::HLSL_FIXED_VECTOR X,
   return __detail::distance_vec_impl(X, Y);
 }
 
+//===----------------------------------------------------------------------===//
+// dot2add builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn float dot2add(half2 A, half2 B, float C)
+/// \brief Dot product of 2 vectors of type half, with a float scalar value
+/// added.
+/// \param A The first input value to dot product.
+/// \param B The second input value to dot product.
+/// \param C The input value added to the dot product.
+
+_HLSL_AVAILABILITY(shadermodel, 6.4)
+const inline float dot2add(half2 A, half2 B, float C) {
+  return __detail::dot2add_impl(A, B, C);
+}
+
 //===----------------------------------------------------------------------===//
 // fmod builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/test/CodeGenHLSL/builtins/dot2add.hlsl b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
new file mode 100644
index 0000000000000..2464607dd636c
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/dot2add.hlsl
@@ -0,0 +1,135 @@
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN:   dxil-pc-shadermodel6.3-compute %s -emit-llvm -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-DXIL
+// RUN: %clang_cc1 -finclude-default-header -fnative-half-type -triple \
+// RUN:   spirv-pc-vulkan-compute %s -emit-llvm -o - | \
+// RUN:   FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV
+
+// Test basic lowering to runtime function call.
+ +// CHECK-LABEL: define {{.*}}test_default_parameter_type +float test_default_parameter_type(half2 p1, half2 p2, float p3) { + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_float_arg2_type +float test_float_arg2_type(half2 p1, float2 p2, float p3) { + // CHECK: %conv = fptrunc reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}} to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_float_arg1_type +float test_float_arg1_type(float2 p1, half2 p2, float p3) { + // CHECK: %conv = fptrunc reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}} to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_double_arg3_type +float test_double_arg3_type(half2 p1, half2 p2, double p3) { + // CHECK: %conv = fptrunc reassoc nnan ninf nsz arcp afn double %{{.*}} to float + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_float_arg1_arg2_type +float test_float_arg1_arg2_type(float2 p1, float2 p2, float p3) { + // CHECK: %conv = fptrunc reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}} to <2 x half> + // CHECK: %conv1 = fptrunc reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}} to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp 
afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_double_arg1_arg2_type +float test_double_arg1_arg2_type(double2 p1, double2 p2, float p3) { + // CHECK: %conv = fptrunc reassoc nnan ninf nsz arcp afn <2 x double> %{{.*}} to <2 x half> + // CHECK: %conv1 = fptrunc reassoc nnan ninf nsz arcp afn <2 x double> %{{.*}} to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_int16_arg1_arg2_type +float test_int16_arg1_arg2_type(int16_t2 p1, int16_t2 p2, float p3) { + // CHECK: %conv = sitofp <2 x i16> %{{.*}} to <2 x half> + // CHECK: %conv1 = sitofp <2 x i16> %{{.*}} to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_int32_arg1_arg2_type +float test_int32_arg1_arg2_type(int32_t2 p1, int32_t2 p2, float p3) { + // CHECK: %conv = sitofp <2 x i32> %{{.*}} to <2 x half> + // CHECK: %conv1 = sitofp <2 x i32> %{{.*}} to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_int64_arg1_arg2_type +float test_int64_arg1_arg2_type(int64_t2 p1, int64_t2 p2, float p3) { + // CHECK: %conv = sitofp <2 x i64> %{{.*}} to <2 x half> + // CHECK: %conv1 = sitofp <2 x i64> %{{.*}} to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // 
CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} + +// CHECK-LABEL: define {{.*}}test_bool_arg1_arg2_type +float test_bool_arg1_arg2_type(bool2 p1, bool2 p2, float p3) { + // CHECK: %loadedv = trunc <2 x i32> %{{.*}} to <2 x i1> + // CHECK: %conv = uitofp <2 x i1> %loadedv to <2 x half> + // CHECK: %loadedv1 = trunc <2 x i32> %{{.*}} to <2 x i1> + // CHECK: %conv2 = uitofp <2 x i1> %loadedv1 to <2 x half> + // CHECK-SPIRV: %[[MUL:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.spv.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + // CHECK-SPIRV: %[[CONV:.*]] = fpext reassoc nnan ninf nsz arcp afn half %[[MUL]] to float + // CHECK-SPIRV: %[[C:.*]] = load float, ptr %c.addr.i, align 4 + // CHECK-SPIRV: %[[RES:.*]] = fadd reassoc nnan ninf nsz arcp afn float %[[CONV]], %[[C]] + // CHECK-DXIL: %[[RES:.*]] = call {{.*}} float @llvm.dx.dot2add.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, float %{{.*}}) + // CHECK: ret float %[[RES]] + return dot2add(p1, p2, p3); +} diff --git a/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl new file mode 100644 index 0000000000000..262ceecbf1d90 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/dot2add-errors.hlsl @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify + +float test_too_few_arg() { + return dot2add(); + // expected-error@-1 {{no matching function for call to 'dot2add'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 3 arguments, but 0 were provided}} +} + +float test_too_many_arg(half2 p1, half2 p2, float p3) { + return dot2add(p1, p2, p3, p1); + // expected-error@-1 {{no matching function for call to 'dot2add'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function not viable: requires 3 arguments, but 4 were provided}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index ead7286f4311c..775d325feeb14 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -100,6 +100,10 @@ def int_dx_udot : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, Commutative] >; +def int_dx_dot2add : + DefaultAttrsIntrinsic<[llvm_float_ty], + [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_float_ty], + [IntrNoMem, Commutative]>; def int_dx_dot4add_i8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_dx_dot4add_u8packed : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 1d8904bdf5514..b1e7406ead675 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1077,6 +1077,17 @@ def RawBufferStore : DXILOp<140, rawBufferStore> { let stages = [Stages]; } +def Dot2AddHalf : DXILOp<162, dot2AddHalf> { + let Doc = "dot product of 2 vectors of half having size = 2, returns " + "float"; + let intrinsics = [IntrinSelect]; + let arguments = [FloatTy, HalfTy, HalfTy, HalfTy, HalfTy]; + let result = FloatTy; + let overloads = [Overloads]; + let stages = [Stages]; + let attributes = [Attributes]; +} + def Dot4AddI8Packed : DXILOp<163, 
dot4AddPacked> { let Doc = "signed dot product of 4 x i8 vectors packed into i32, with " "accumulate to i32"; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index dff9f3e03079e..3dcd3d8fd244a 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -54,10 +54,8 @@ static SmallVector populateOperands(Value *Arg, IRBuilder<> &Builder) { return ExtractedElements; } -static SmallVector argVectorFlatten(CallInst *Orig, - IRBuilder<> &Builder) { - // Note: arg[NumOperands-1] is a pointer and is not needed by our flattening. - unsigned NumOperands = Orig->getNumOperands() - 1; +static SmallVector +argVectorFlatten(CallInst *Orig, IRBuilder<> &Builder, unsigned NumOperands) { assert(NumOperands > 0); Value *Arg0 = Orig->getOperand(0); [[maybe_unused]] auto *VecArg0 = dyn_cast(Arg0->getType()); @@ -75,6 +73,12 @@ static SmallVector argVectorFlatten(CallInst *Orig, return NewOperands; } +static SmallVector argVectorFlatten(CallInst *Orig, + IRBuilder<> &Builder) { + // Note: arg[NumOperands-1] is a pointer and is not needed by our flattening. + return argVectorFlatten(Orig, Builder, Orig->getNumOperands() - 1); +} + namespace { class OpLowerer { Module &M; @@ -168,6 +172,13 @@ class OpLowerer { } } else if (IsVectorArgExpansion) { Args = argVectorFlatten(CI, OpBuilder.getIRB()); + } else if (F.getIntrinsicID() == Intrinsic::dx_dot2add) { + // arg[NumOperands-1] is a pointer and is not needed by our flattening. + // arg[NumOperands-2] also does not need to be flattened because it is a + // scalar. + unsigned NumOperands = CI->getNumOperands() - 2; + Args.push_back(CI->getArgOperand(NumOperands)); + Args.append(argVectorFlatten(CI, OpBuilder.getIRB(), NumOperands)); } else { Args.append(CI->arg_begin(), CI->arg_end()); } diff --git a/llvm/test/CodeGen/DirectX/dot2add.ll b/llvm/test/CodeGen/DirectX/dot2add.ll new file mode 100644 index 0000000000000..40c6cdafc83da --- /dev/null +++ b/llvm/test/CodeGen/DirectX/dot2add.ll @@ -0,0 +1,8 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +define noundef float @dot2add_simple(<2 x half> noundef %a, <2 x half> noundef %b, float %c) { +entry: +; CHECK: call float @dx.op.dot2AddHalf(i32 162, float %c, half %0, half %1, half %2, half %3) + %ret = call float @llvm.dx.dot2add(<2 x half> %a, <2 x half> %b, float %c) + ret float %ret +} From b11eece1bb582fa1d06ca61e210f9741e0a357b7 Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Thu, 3 Apr 2025 15:33:40 -0700 Subject: [PATCH 0590/1029] [flang][intrinsics] Implement the time intrinsic (#133823) This PR implements the nonstandard intrinsic time. In addition to running the unit tests, I also double checked that the example code works by manually compiling and running it. 
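A small usage sketch (the 2-second sleep is illustrative; `SLEEP` is the existing GNU extension documented alongside this one):

```Fortran
PROGRAM measure_elapsed
  INTEGER(KIND=8) :: t0, t1
  t0 = TIME()
  CALL SLEEP(2)
  t1 = TIME()
  ! TIME() returns whole seconds, so short intervals round down.
  PRINT *, "elapsed seconds:", t1 - t0
END PROGRAM
```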
---
 flang-rt/lib/runtime/extensions.cpp | 3 +++
 flang/docs/Intrinsics.md | 21 +++++++++++++++++++
 .../flang/Optimizer/Builder/IntrinsicCall.h | 1 +
 .../Optimizer/Builder/Runtime/Intrinsics.h | 3 +++
 flang/include/flang/Runtime/extensions.h | 3 +++
 flang/lib/Evaluate/intrinsics.cpp | 2 ++
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 11 +++++++++-
 .../Optimizer/Builder/Runtime/Intrinsics.cpp | 7 +++++++
 flang/test/Lower/Intrinsics/time.f90 | 17 +++++++++++++++
 9 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/Intrinsics/time.f90

diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp
index 618e184e28519..a73279e445797 100644
--- a/flang-rt/lib/runtime/extensions.cpp
+++ b/flang-rt/lib/runtime/extensions.cpp
@@ -272,5 +272,8 @@ void FORTRAN_PROCEDURE_NAME(qsort)(int *array, int *len, int *isize,
 // PERROR(STRING)
 void RTNAME(Perror)(const char *str) { perror(str); }
 
+// GNU extension function TIME()
+std::int64_t RTNAME(time)() { return time(nullptr); }
+
 } // namespace Fortran::runtime
 } // extern "C"
diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md
index e885ceca25aad..8b675c33b09d1 100644
--- a/flang/docs/Intrinsics.md
+++ b/flang/docs/Intrinsics.md
@@ -1091,6 +1091,27 @@ end program rename_proc
 This intrinsic is an alias for `CPU_TIME`: supporting both a subroutine and a
 function form.
 
+### Non-Standard Intrinsics: TIME
+
+#### Description
+`TIME()` returns the current time of the system as an INTEGER(8).
+
+#### Usage and Info
+
+- **Standard:** GNU extension
+- **Class:** function
+- **Syntax:** `RESULT = TIME()`
+
+#### Example
+```Fortran
+PROGRAM example_time
+  print *, TIME()
+  print *, TIME()
+  call SLEEP(10)
+  print *, TIME()
+END PROGRAM
+```
+
 ### Non-Standard Intrinsics: UNLINK
 
 #### Description
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index a4268e74d9a67..29cde05480173 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -429,6 +429,7 @@ struct IntrinsicLibrary {
                       mlir::ArrayRef args);
   void genSystemClock(llvm::ArrayRef);
   mlir::Value genTand(mlir::Type, llvm::ArrayRef);
+  mlir::Value genTime(mlir::Type, llvm::ArrayRef);
   mlir::Value genTrailz(mlir::Type, llvm::ArrayRef);
   fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h
index 51d2dc82f98ae..2e5adf6bd0ab7 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h
@@ -65,6 +65,9 @@ void genRandomSeed(fir::FirOpBuilder &, mlir::Location, mlir::Value size,
 void genRename(fir::FirOpBuilder &builder, mlir::Location loc,
               mlir::Value path1, mlir::Value path2, mlir::Value status);
 
+/// generate time runtime call
+mlir::Value genTime(fir::FirOpBuilder &builder, mlir::Location loc);
+
 /// generate runtime call to transfer intrinsic with no size argument
 void genTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
                  mlir::Value resultBox, mlir::Value sourceBox,
diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h
index 57de3f8f05948..47ef4c12ef73a 100644
--- a/flang/include/flang/Runtime/extensions.h
+++ b/flang/include/flang/Runtime/extensions.h
@@ -65,6 +65,9 @@ std::int64_t RTNAME(Signal)(std::int64_t number, void (*handler)(int));
 // GNU
extension subroutine SLEEP(SECONDS) void RTNAME(Sleep)(std::int64_t seconds); +// GNU extension function TIME() +std::int64_t RTNAME(time)(); + // GNU extension function ACCESS(NAME, MODE) // TODO: not supported on Windows #ifndef _WIN32 diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 0eb8419491a61..e4f82b7fddb02 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -977,6 +977,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ DefaultInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"this_image", {OptionalTEAM}, DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, + {"time", {}, TypePattern{IntType, KindCode::exactKind, 8}, Rank::scalar, + IntrinsicClass::transformationalFunction}, {"tiny", {{"x", SameReal, Rank::anyOrAssumedRank, Optionality::required, common::Intent::In, {ArgFlag::canBeMoldNull}}}, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ad2324d5b5edc..2df9349269a69 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -769,7 +769,7 @@ static constexpr IntrinsicHandler handlers[]{ {"perror", &I::genPerror, {{{"string", asBox}}}, - /*isElemental=*/false}, + /*isElemental*/ false}, {"popcnt", &I::genPopcnt}, {"poppar", &I::genPoppar}, {"present", @@ -921,6 +921,7 @@ static constexpr IntrinsicHandler handlers[]{ {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, + {"time", &I::genTime, {}, /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -8428,6 +8429,14 @@ void IntrinsicLibrary::genThreadFenceSystem( builder.create(loc, funcOp, noArgs); } +// TIME +mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0); + return builder.createConvert(loc, resultType, + fir::runtime::genTime(builder, loc)); +} + // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index 2f46e7605fe91..3aad0625042a2 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -252,6 +252,13 @@ void fir::runtime::genRename(fir::FirOpBuilder &builder, mlir::Location loc, builder.create(loc, runtimeFunc, args); } +/// generate runtime call to time intrinsic +mlir::Value fir::runtime::genTime(fir::FirOpBuilder &builder, + mlir::Location loc) { + auto func = fir::runtime::getRuntimeFunc(loc, builder); + return builder.create(loc, func, std::nullopt).getResult(0); +} + /// generate runtime call to transfer intrinsic with no size argument void fir::runtime::genTransfer(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value resultBox, mlir::Value sourceBox, diff --git a/flang/test/Lower/Intrinsics/time.f90 b/flang/test/Lower/Intrinsics/time.f90 new file mode 100644 index 0000000000000..eac492572ef89 --- /dev/null +++ b/flang/test/Lower/Intrinsics/time.f90 @@ -0,0 +1,17 @@ +!RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPtime_test() -> i64 +function time_test() + Integer(kind=8) :: time_test + + + !CHECK-DAG: %[[func_result:.*]] = fir.alloca i64 {bindc_name = "time_test", uniq_name = 
"_QFtime_testEtime_test"} + !CHECK-DAG: %[[func_result_decl:.*]]:{{.*}} = hlfir.declare %[[func_result]] {uniq_name = "_QFtime_testEtime_test"} : {{.*}}fir.ref{{.*}} + !CHECK: %[[call_result:.*]] = fir.call @_FortranAtime() + !CHECK-SAME: -> i64 + + !CHECK-DAG: hlfir.assign %[[call_result]] to %[[func_result_decl]]#[[func_result]] : i64, !fir.ref + !CHECK-DAG: %[[load_result:.*]] = fir.load %[[func_result_decl]]#[[func_result]] : !fir.ref + !CHECK: return %[[load_result]] : i64 + time_test = time() +end function time_test From 2cd8edd1fff2a9d82902f70270b4209463a34cba Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 3 Apr 2025 18:34:49 -0400 Subject: [PATCH 0591/1029] [libc++] Add missing release note for LLVM 20 about zip_view (#134144) We should have had a release note in LLVM 20 about implementing P2165R4 since that is technically an ABI and API break for zip_view. We don't expect anyone to actually hit the ABI issue, but we've come across some (fairly small) breakage due to the API change, so this should at least be mentioned in the release notes. --- libcxx/docs/ReleaseNotes/20.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index 57ab0c167544b..06e6e673b5508 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -162,6 +162,12 @@ Deprecations and Removals - Non-conforming extension ``packaged_task::result_type`` is deprecated. It will be removed in LLVM 21. +- The changes for ``ranges::zip_view`` from `P2165R4 `_ have been implemented. This can + lead to code assuming that ``zip_view`` produces ``std::pair`` to stop compiling now that it produces ``std::tuple``. + The cases are rare since ``tuple`` and ``pair`` are compatible for the most part, but this can lead to code that + was previously accepted now being rejected. This is necessary for libc++ to be conforming, so we don't provide any + way to opt-out of that behavior. + Upcoming Deprecations and Removals ---------------------------------- @@ -205,3 +211,8 @@ ABI Affecting Changes - The localization support base API has been reimplemented, leading to different functions being exported from the libc++ built library on Windows and Windows-like platforms. + +- The changes for ``ranges::zip_view`` from `P2165R4 `_ have been implemented. This changes + the element type of ``zip_view`` from a ``std::pair`` to a ``std::tuple`` in some cases. This is technically an ABI + break, however since ``zip_view`` is generally not an ABI sensitive type, we don't expect users to encounter any + issues and we don't provide a way to change this behavior, which would make libc++ non-conforming. From f1c6612202d88cbde224387621327a31609f2177 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Thu, 3 Apr 2025 15:48:43 -0700 Subject: [PATCH 0592/1029] [lldb][debugserver] Save and restore the SVE/SME register state (#134184) debugserver isn't saving and restoring the SVE/SME register state around inferior function calls. Making arbitrary function calls while in Streaming SVE mode is generally a poor idea because a NEON instruction can be hit and crash the expression execution, which is how I missed this, but they should be handled correctly if the user knows it is safe to do. Re-landing this change after fixing an incorrect behavior on systems without SME support. 
rdar://146886210
---
 .../source/MacOSX/arm64/DNBArchImplARM64.cpp | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp
index 34a4ee21f8502..6ee1466612ee1 100644
--- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp
+++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp
@@ -2952,8 +2952,15 @@ kern_return_t DNBArchMachARM64::SetRegisterState(int set) {
     return err;
 
   switch (set) {
-  case e_regSetALL:
-    return SetGPRState() | SetVFPState() | SetEXCState() | SetDBGState(false);
+  case e_regSetALL: {
+    kern_return_t ret =
+        SetGPRState() | SetVFPState() | SetEXCState() | SetDBGState(false);
+    if (CPUHasSME()) {
+      SetSVEState();
+      SetSMEState();
+    }
+    return ret;
+  }
   case e_regSetGPR:
     return SetGPRState();
   case e_regSetVFP:
@@ -3123,6 +3130,12 @@ uint32_t DNBArchMachARM64::SaveRegisterState() {
                                      "error: %s regs failed to read: %u",
                                      "VFP", kret);
   } else {
+    if (CPUHasSME()) {
+      // These can fail when processor is not in streaming SVE mode,
+      // and that failure should be ignored.
+      GetSVEState(force);
+      GetSMEState(force);
+    }
     const uint32_t save_id = GetNextRegisterStateSaveID();
     m_saved_register_states[save_id] = m_state.context;
     return save_id;
@@ -3150,6 +3163,12 @@ bool DNBArchMachARM64::RestoreRegisterState(uint32_t save_id) {
                                        save_id, "VFP", kret);
       success = false;
     }
+    if (CPUHasSME()) {
+      // These can fail when processor is not in streaming SVE mode,
+      // and that failure should be ignored.
+      SetSVEState();
+      SetSMEState();
+    }
     m_saved_register_states.erase(pos);
     return success;
   }

From 9f2feeb1891a4d58823e6cbbb404847d6e43e65b Mon Sep 17 00:00:00 2001
From: modiking
Date: Thu, 3 Apr 2025 15:50:54 -0700
Subject: [PATCH 0593/1029] [mlir][gpu][nvptx] Remove null terminator when
 outputting PTX (#133019)

PTX source files are expected to only contain ASCII text
(https://docs.nvidia.com/cuda/parallel-thread-execution/#source-format) and
no null terminators. `ptxas` has so far not enforced this but is moving
towards doing so.

This revealed a problem where the null terminator was getting printed into
the output file in the MLIR path when outputting PTX directly. Only add the
null on the assembly output path for JIT instead of in the output of
`moduleToObject`.
---
 mlir/lib/Target/LLVM/NVVM/Target.cpp | 8 ++------
 mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp | 5 ++++-
 mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp | 1 +
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 023c65b3dd9df..586748df8d154 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -722,12 +722,8 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
 #undef DEBUG_TYPE
 
   // Return PTX if the compilation target is `assembly`.
-  if (targetOptions.getCompilationTarget() ==
-      gpu::CompilationTarget::Assembly) {
-    // Make sure to include the null terminator.
- StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); - return SmallVector(bin.begin(), bin.end()); - } + if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly) + return SmallVector(serializedISA->begin(), serializedISA->end()); std::optional> result; moduleToObjectTimer.startTimer(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp index ade239c526af8..8d4a0bcf8adbf 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp @@ -116,8 +116,11 @@ LogicalResult SelectObjectAttrImpl::embedBinary( llvm::Module *module = moduleTranslation.getLLVMModule(); // Embed the object as a global string. + // Add null for assembly output for JIT paths that expect null-terminated + // strings. + bool addNull = (object.getFormat() == gpu::CompilationTarget::Assembly); llvm::Constant *binary = llvm::ConstantDataArray::getString( - builder.getContext(), object.getObject().getValue(), false); + builder.getContext(), object.getObject().getValue(), addNull); llvm::GlobalVariable *serializedObj = new llvm::GlobalVariable(*module, binary->getType(), true, llvm::GlobalValue::LinkageTypes::InternalLinkage, diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp index eabfd1c4d32eb..cae713a1ce1d2 100644 --- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp +++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp @@ -130,6 +130,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToPTX)) { ASSERT_TRUE( StringRef(object->data(), object->size()).contains("nvvm_kernel")); + ASSERT_TRUE(StringRef(object->data(), object->size()).count('\0') == 0); } } From bd197ca00365cd35002792324bd149f71e864e1a Mon Sep 17 00:00:00 2001 From: Ian Anderson Date: Thu, 3 Apr 2025 16:09:57 -0700 Subject: [PATCH 0594/1029] [clang][modules] Determine if the SDK supports builtin modules independent of the target (#134005) Whether the SDK supports builtin modules is a property of the SDK itself, and really has nothing to do with the target. This was already worked around for Mac Catalyst, but there are some other more esoteric non-obvious target-to-sdk mappings that aren't handled. Have the SDK parse its OS out of CanonicalName and use that instead of the target to determine if builtin modules are supported. 
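For example, a visionOS simulator SDK is now recognized from the `CanonicalName` key in its `SDKSettings.json` rather than from the target triple (a sketch; the version values are illustrative):

```json
{
  "Version": "2.0",
  "MaximumDeploymentTarget": "2.0.99",
  "CanonicalName": "xrsimulator2.0"
}
```

`parseOS` strips the trailing version from `CanonicalName` and maps the `xrsimulator` prefix to `llvm::Triple::XROS`.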
--- clang/include/clang/Basic/DarwinSDKInfo.h | 6 +- clang/lib/Basic/DarwinSDKInfo.cpp | 26 +++++++- clang/lib/Driver/ToolChains/Darwin.cpp | 60 +++++++++---------- .../Inputs/DriverKit23.0.sdk/SDKSettings.json | 2 +- .../Inputs/MacOSX10.14.sdk/SDKSettings.json | 2 +- .../SDKSettings.json | 1 + .../Inputs/MacOSX15.0.sdk/SDKSettings.json | 2 +- .../Inputs/MacOSX15.1.sdk/SDKSettings.json | 2 +- .../Inputs/WatchOS6.0.sdk/SDKSettings.json | 2 +- .../Inputs/iPhoneOS13.0.sdk/SDKSettings.json | 2 +- .../darwin-ld-platform-version-watchos.c | 4 +- .../Inputs/MacOSX13.0.sdk/SDKSettings.json | 1 + .../Inputs/AppleTVOS15.0.sdk/SDKSettings.json | 1 + .../Inputs/MacOSX11.0.sdk/SDKSettings.json | 1 + .../Inputs/WatchOS7.0.sdk/SDKSettings.json | 1 + 15 files changed, 72 insertions(+), 41 deletions(-) diff --git a/clang/include/clang/Basic/DarwinSDKInfo.h b/clang/include/clang/Basic/DarwinSDKInfo.h index db20b968a898e..bc122c7d21c72 100644 --- a/clang/include/clang/Basic/DarwinSDKInfo.h +++ b/clang/include/clang/Basic/DarwinSDKInfo.h @@ -143,16 +143,19 @@ class DarwinSDKInfo { DarwinSDKInfo( VersionTuple Version, VersionTuple MaximumDeploymentTarget, + llvm::Triple::OSType OS, llvm::DenseMap<OSEnvPair::StorageType, std::optional<RelatedTargetVersionMapping>> VersionMappings = llvm::DenseMap<OSEnvPair::StorageType, std::optional<RelatedTargetVersionMapping>>()) : Version(Version), MaximumDeploymentTarget(MaximumDeploymentTarget), - VersionMappings(std::move(VersionMappings)) {} + OS(OS), VersionMappings(std::move(VersionMappings)) {} const llvm::VersionTuple &getVersion() const { return Version; } + const llvm::Triple::OSType &getOS() const { return OS; } + // Returns the optional, target-specific version mapping that maps from one // target to another target. // @@ -177,6 +180,7 @@ class DarwinSDKInfo { private: VersionTuple Version; VersionTuple MaximumDeploymentTarget; + llvm::Triple::OSType OS; // Need to wrap the value in an optional here as the value has to be default // constructible, and std::unique_ptr doesn't like DarwinSDKInfo being // Optional as Optional is trying to copy it in emplace. diff --git a/clang/lib/Basic/DarwinSDKInfo.cpp b/clang/lib/Basic/DarwinSDKInfo.cpp index 00aa5f9e63cd3..6bcfb9d598377 100644 --- a/clang/lib/Basic/DarwinSDKInfo.cpp +++ b/clang/lib/Basic/DarwinSDKInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Basic/DarwinSDKInfo.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" @@ -62,6 +63,28 @@ DarwinSDKInfo::RelatedTargetVersionMapping::parseJSON( Min, Max, MinValue, MaximumDeploymentTarget, std::move(Mapping)); } +static llvm::Triple::OSType parseOS(const llvm::json::Object &Obj) { + // The CanonicalName is the Xcode platform followed by a version, e.g. + // macosx16.0.
+ auto CanonicalName = Obj.getString("CanonicalName"); + if (!CanonicalName) + return llvm::Triple::UnknownOS; + size_t VersionStart = CanonicalName->find_first_of("0123456789"); + StringRef XcodePlatform = CanonicalName->slice(0, VersionStart); + return llvm::StringSwitch<llvm::Triple::OSType>(XcodePlatform) + .Case("macosx", llvm::Triple::MacOSX) + .Case("iphoneos", llvm::Triple::IOS) + .Case("iphonesimulator", llvm::Triple::IOS) + .Case("appletvos", llvm::Triple::TvOS) + .Case("appletvsimulator", llvm::Triple::TvOS) + .Case("watchos", llvm::Triple::WatchOS) + .Case("watchsimulator", llvm::Triple::WatchOS) + .Case("xros", llvm::Triple::XROS) + .Case("xrsimulator", llvm::Triple::XROS) + .Case("driverkit", llvm::Triple::DriverKit) + .Default(llvm::Triple::UnknownOS); +} + static std::optional<VersionTuple> getVersionKey(const llvm::json::Object &Obj, StringRef Key) { auto Value = Obj.getString(Key); @@ -82,6 +105,7 @@ DarwinSDKInfo::parseDarwinSDKSettingsJSON(const llvm::json::Object *Obj) { getVersionKey(*Obj, "MaximumDeploymentTarget"); if (!MaximumDeploymentVersion) return std::nullopt; + llvm::Triple::OSType OS = parseOS(*Obj); llvm::DenseMap<OSEnvPair::StorageType, std::optional<RelatedTargetVersionMapping>> VersionMappings; @@ -124,7 +148,7 @@ DarwinSDKInfo::parseDarwinSDKSettingsJSON(const llvm::json::Object *Obj) { } return DarwinSDKInfo(std::move(*Version), - std::move(*MaximumDeploymentVersion), + std::move(*MaximumDeploymentVersion), OS, std::move(VersionMappings)); } diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index e41720a824380..63e9fbfd4304c 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1886,7 +1886,8 @@ struct DarwinPlatform { assert(IsValid && "invalid SDK version"); return DarwinSDKInfo( Version, - /*MaximumDeploymentTarget=*/VersionTuple(Version.getMajor(), 0, 99)); + /*MaximumDeploymentTarget=*/VersionTuple(Version.getMajor(), 0, 99), + getOSFromPlatform(Platform)); } private: @@ -1916,6 +1917,23 @@ struct DarwinPlatform { } } + static llvm::Triple::OSType getOSFromPlatform(DarwinPlatformKind Platform) { + switch (Platform) { + case DarwinPlatformKind::MacOS: + return llvm::Triple::MacOSX; + case DarwinPlatformKind::IPhoneOS: + return llvm::Triple::IOS; + case DarwinPlatformKind::TvOS: + return llvm::Triple::TvOS; + case DarwinPlatformKind::WatchOS: + return llvm::Triple::WatchOS; + case DarwinPlatformKind::DriverKit: + return llvm::Triple::DriverKit; + case DarwinPlatformKind::XROS: + return llvm::Triple::XROS; + } + } + SourceKind Kind; DarwinPlatformKind Platform; DarwinEnvironmentKind Environment = DarwinEnvironmentKind::NativeEnvironment; @@ -2966,20 +2984,8 @@ bool Darwin::isAlignedAllocationUnavailable() const { return TargetVersion < alignedAllocMinVersion(OS); } -static bool sdkSupportsBuiltinModules( - const Darwin::DarwinPlatformKind &TargetPlatform, - const Darwin::DarwinEnvironmentKind &TargetEnvironment, - const std::optional<DarwinSDKInfo> &SDKInfo) { - if (TargetEnvironment == Darwin::NativeEnvironment || - TargetEnvironment == Darwin::Simulator || - TargetEnvironment == Darwin::MacCatalyst) { - // Standard xnu/Mach/Darwin based environments - // depend on the SDK version. - } else { - // All other environments support builtin modules from the start. - return true; - } - +static bool +sdkSupportsBuiltinModules(const std::optional<DarwinSDKInfo> &SDKInfo) { if (!SDKInfo) // If there is no SDK info, assume this is building against a // pre-SDK version of macOS (i.e. before Mac OS X 10.4).
Those @@ -2990,26 +2996,18 @@ static bool sdkSupportsBuiltinModules( return false; VersionTuple SDKVersion = SDKInfo->getVersion(); - switch (TargetPlatform) { + switch (SDKInfo->getOS()) { // Existing SDKs added support for builtin modules in the fall // 2024 major releases. - case Darwin::MacOS: + case llvm::Triple::MacOSX: return SDKVersion >= VersionTuple(15U); - case Darwin::IPhoneOS: - switch (TargetEnvironment) { - case Darwin::MacCatalyst: - // Mac Catalyst uses `-target arm64-apple-ios18.0-macabi` so the platform - // is iOS, but it builds with the macOS SDK, so it's the macOS SDK version - // that's relevant. - return SDKVersion >= VersionTuple(15U); - default: - return SDKVersion >= VersionTuple(18U); - } - case Darwin::TvOS: + case llvm::Triple::IOS: return SDKVersion >= VersionTuple(18U); - case Darwin::WatchOS: + case llvm::Triple::TvOS: + return SDKVersion >= VersionTuple(18U); + case llvm::Triple::WatchOS: return SDKVersion >= VersionTuple(11U); - case Darwin::XROS: + case llvm::Triple::XROS: return SDKVersion >= VersionTuple(2U); // New SDKs support builtin modules from the start. @@ -3138,7 +3136,7 @@ void Darwin::addClangTargetOptions( // i.e. when the builtin stdint.h is in the Darwin module too, the cycle // goes away. Note that -fbuiltin-headers-in-system-modules does nothing // to fix the same problem with C++ headers, and is generally fragile. - if (!sdkSupportsBuiltinModules(TargetPlatform, TargetEnvironment, SDKInfo)) + if (!sdkSupportsBuiltinModules(SDKInfo)) CC1Args.push_back("-fbuiltin-headers-in-system-modules"); if (!DriverArgs.hasArgNoClaim(options::OPT_fdefine_target_os_macros, diff --git a/clang/test/Driver/Inputs/DriverKit23.0.sdk/SDKSettings.json b/clang/test/Driver/Inputs/DriverKit23.0.sdk/SDKSettings.json index 7ba6c244df211..edee441adb474 100644 --- a/clang/test/Driver/Inputs/DriverKit23.0.sdk/SDKSettings.json +++ b/clang/test/Driver/Inputs/DriverKit23.0.sdk/SDKSettings.json @@ -1 +1 @@ -{"Version":"23.0", "MaximumDeploymentTarget": "23.0.99"} +{"Version":"23.0", "CanonicalName": "driverkit23.0", "MaximumDeploymentTarget": "23.0.99"} diff --git a/clang/test/Driver/Inputs/MacOSX10.14.sdk/SDKSettings.json b/clang/test/Driver/Inputs/MacOSX10.14.sdk/SDKSettings.json index b612107cef394..e7383550e42bd 100644 --- a/clang/test/Driver/Inputs/MacOSX10.14.sdk/SDKSettings.json +++ b/clang/test/Driver/Inputs/MacOSX10.14.sdk/SDKSettings.json @@ -1 +1 @@ -{"Version":"10.14", "MaximumDeploymentTarget": "10.14.99"} +{"Version":"10.14", "CanonicalName": "macosx10.14", "MaximumDeploymentTarget": "10.14.99"} diff --git a/clang/test/Driver/Inputs/MacOSX10.15.versioned.sdk/SDKSettings.json b/clang/test/Driver/Inputs/MacOSX10.15.versioned.sdk/SDKSettings.json index b0769e9f86045..7325cc45a2808 100644 --- a/clang/test/Driver/Inputs/MacOSX10.15.versioned.sdk/SDKSettings.json +++ b/clang/test/Driver/Inputs/MacOSX10.15.versioned.sdk/SDKSettings.json @@ -1,5 +1,6 @@ { "Version":"10.15", + "CanonicalName": "macosx10.15", "MaximumDeploymentTarget": "10.15.99", "VersionMap" : { "macOS_iOSMac" : { diff --git a/clang/test/Driver/Inputs/MacOSX15.0.sdk/SDKSettings.json b/clang/test/Driver/Inputs/MacOSX15.0.sdk/SDKSettings.json index ced45d5c21996..81d5ee28a5a05 100644 --- a/clang/test/Driver/Inputs/MacOSX15.0.sdk/SDKSettings.json +++ b/clang/test/Driver/Inputs/MacOSX15.0.sdk/SDKSettings.json @@ -1 +1 @@ -{"Version":"15.0", "MaximumDeploymentTarget": "15.0.99"} +{"Version":"15.0", "CanonicalName": "macosx15.0", "MaximumDeploymentTarget": "15.0.99"} diff --git 
a/clang/test/Driver/Inputs/MacOSX15.1.sdk/SDKSettings.json b/clang/test/Driver/Inputs/MacOSX15.1.sdk/SDKSettings.json index d46295b2ab5a1..956cbe4041b9a 100644 --- a/clang/test/Driver/Inputs/MacOSX15.1.sdk/SDKSettings.json +++ b/clang/test/Driver/Inputs/MacOSX15.1.sdk/SDKSettings.json @@ -1 +1 @@ -{"Version":"15.1", "MaximumDeploymentTarget": "15.1.99"} +{"Version":"15.1", "CanonicalName": "macosx15.1", "MaximumDeploymentTarget": "15.1.99"} diff --git a/clang/test/Driver/Inputs/WatchOS6.0.sdk/SDKSettings.json b/clang/test/Driver/Inputs/WatchOS6.0.sdk/SDKSettings.json index 9e30a153cb5fb..314fc22edf7eb 100644 --- a/clang/test/Driver/Inputs/WatchOS6.0.sdk/SDKSettings.json +++ b/clang/test/Driver/Inputs/WatchOS6.0.sdk/SDKSettings.json @@ -1 +1 @@ -{"Version":"6.0.0", "MaximumDeploymentTarget": "6.0.99"} +{"Version":"6.0", "CanonicalName": "watchos6.0", "MaximumDeploymentTarget": "6.0.99"} diff --git a/clang/test/Driver/Inputs/iPhoneOS13.0.sdk/SDKSettings.json b/clang/test/Driver/Inputs/iPhoneOS13.0.sdk/SDKSettings.json index b05260f994868..c122cf47ce0f7 100644 --- a/clang/test/Driver/Inputs/iPhoneOS13.0.sdk/SDKSettings.json +++ b/clang/test/Driver/Inputs/iPhoneOS13.0.sdk/SDKSettings.json @@ -1 +1 @@ -{"Version":"13.0", "MaximumDeploymentTarget": "13.0.99"} +{"Version":"13.0", "CanonicalName": "iphoneos13.0", "MaximumDeploymentTarget": "13.0.99"} diff --git a/clang/test/Driver/darwin-ld-platform-version-watchos.c b/clang/test/Driver/darwin-ld-platform-version-watchos.c index 3d405a7c09f62..fc8e859f30efa 100644 --- a/clang/test/Driver/darwin-ld-platform-version-watchos.c +++ b/clang/test/Driver/darwin-ld-platform-version-watchos.c @@ -18,5 +18,5 @@ // RUN: | FileCheck --check-prefix=SIMUL %s // LINKER-OLD: "-watchos_version_min" "5.2.0" -// LINKER-NEW: "-platform_version" "watchos" "5.2.0" "6.0.0" -// SIMUL: "-platform_version" "watchos-simulator" "6.0.0" "6.0.0" +// LINKER-NEW: "-platform_version" "watchos" "5.2.0" "6.0" +// SIMUL: "-platform_version" "watchos-simulator" "6.0.0" "6.0" diff --git a/clang/test/InstallAPI/Inputs/MacOSX13.0.sdk/SDKSettings.json b/clang/test/InstallAPI/Inputs/MacOSX13.0.sdk/SDKSettings.json index 258d8288fc6b4..d08288ed1af10 100644 --- a/clang/test/InstallAPI/Inputs/MacOSX13.0.sdk/SDKSettings.json +++ b/clang/test/InstallAPI/Inputs/MacOSX13.0.sdk/SDKSettings.json @@ -1,6 +1,7 @@ { "DefaultVariant": "macos", "DisplayName": "macOS 13", "Version": "13.0", + "CanonicalName": "macosx13.0", "MaximumDeploymentTarget": "13.0.99", "PropertyConditionFallbackNames": [], "VersionMap": { "iOSMac_macOS": { diff --git a/clang/test/Sema/Inputs/AppleTVOS15.0.sdk/SDKSettings.json b/clang/test/Sema/Inputs/AppleTVOS15.0.sdk/SDKSettings.json index dc10836f4887e..6cd02f33471ed 100644 --- a/clang/test/Sema/Inputs/AppleTVOS15.0.sdk/SDKSettings.json +++ b/clang/test/Sema/Inputs/AppleTVOS15.0.sdk/SDKSettings.json @@ -1,6 +1,7 @@ { "DisplayName": "tvOS 15.0", "Version": "15.0", + "CanonicalName": "appletvos15.0", "MaximumDeploymentTarget": "15.0.99", "PropertyConditionFallbackNames": [], "VersionMap": { diff --git a/clang/test/Sema/Inputs/MacOSX11.0.sdk/SDKSettings.json b/clang/test/Sema/Inputs/MacOSX11.0.sdk/SDKSettings.json index b40e35e882e60..e6220abfbe573 100644 --- a/clang/test/Sema/Inputs/MacOSX11.0.sdk/SDKSettings.json +++ b/clang/test/Sema/Inputs/MacOSX11.0.sdk/SDKSettings.json @@ -1,6 +1,7 @@ { "DefaultVariant": "macos", "DisplayName": "macOS 11", "Version": "11.0", + "CanonicalName": "macosx11.0", "MaximumDeploymentTarget": "11.0.99", "PropertyConditionFallbackNames": [], 
"VersionMap": { "iOSMac_macOS": { diff --git a/clang/test/Sema/Inputs/WatchOS7.0.sdk/SDKSettings.json b/clang/test/Sema/Inputs/WatchOS7.0.sdk/SDKSettings.json index fff3e3a80286e..84914c105749a 100644 --- a/clang/test/Sema/Inputs/WatchOS7.0.sdk/SDKSettings.json +++ b/clang/test/Sema/Inputs/WatchOS7.0.sdk/SDKSettings.json @@ -1,6 +1,7 @@ { "DisplayName": "watchOS 7.0", "Version": "7.0", + "CanonicalName": "watchos7.0", "MaximumDeploymentTarget": "7.0.99", "PropertyConditionFallbackNames": [], "VersionMap": { From d33ae41c621dfbfb1eda5d469e2fb146ef49fbf9 Mon Sep 17 00:00:00 2001 From: Aditya Tejpaul <97700214+hoarfrost32@users.noreply.github.com> Date: Fri, 4 Apr 2025 04:49:12 +0530 Subject: [PATCH 0595/1029] [libc] Implemented utimes (Issue #133953) (#134167) This pull request implements the `utimes` command in libc ([Issue #133953](https://github.com/llvm/llvm-project/issues/133953)). - [x] Add the implementation of `utimes` in `/src/sys/time`. - [x] Add tests for `utimes` in `/test/src/sys/time`. - [x] Add `utimes` to [entrypoints.txt](https://github.com/llvm/llvm-project/blob/main/libc/config/linux/x86_64/entrypoints.txt) for at least x86_64 and whatever you're building on - [x] Add `utimes` to [include/sys/time.yaml](https://github.com/llvm/llvm-project/blob/main/libc/include/sys/time.yaml) --- libc/config/linux/x86_64/entrypoints.txt | 3 + libc/include/sys/time.yaml | 7 +- libc/src/sys/CMakeLists.txt | 1 + libc/src/sys/time/CMakeLists.txt | 10 +++ libc/src/sys/time/linux/CMakeLists.txt | 16 +++++ libc/src/sys/time/linux/utimes.cpp | 76 ++++++++++++++++++++ libc/src/sys/time/utimes.h | 21 ++++++ libc/test/src/sys/CMakeLists.txt | 1 + libc/test/src/sys/time/CMakeLists.txt | 19 +++++ libc/test/src/sys/time/utimes_test.cpp | 92 ++++++++++++++++++++++++ 10 files changed, 245 insertions(+), 1 deletion(-) create mode 100644 libc/src/sys/time/CMakeLists.txt create mode 100644 libc/src/sys/time/linux/CMakeLists.txt create mode 100644 libc/src/sys/time/linux/utimes.cpp create mode 100644 libc/src/sys/time/utimes.h create mode 100644 libc/test/src/sys/time/CMakeLists.txt create mode 100644 libc/test/src/sys/time/utimes_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index eccd222fa123e..1ac3a781d5279 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -288,6 +288,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.statvfs.fstatvfs libc.src.sys.statvfs.statvfs + # sys/utimes.h entrypoints + libc.src.sys.time.utimes + # sys/utsname.h entrypoints libc.src.sys.utsname.uname diff --git a/libc/include/sys/time.yaml b/libc/include/sys/time.yaml index ca497bbe92995..92ab9a467f33a 100644 --- a/libc/include/sys/time.yaml +++ b/libc/include/sys/time.yaml @@ -5,5 +5,10 @@ macros: [] types: - type_name: struct_timeval enums: [] -functions: [] objects: [] +functions: + - name: utimes + return_type: int + arguments: + - type: const char* + - type: const struct timeval* diff --git a/libc/src/sys/CMakeLists.txt b/libc/src/sys/CMakeLists.txt index bb177f11c6d62..9a73b80d35d2f 100644 --- a/libc/src/sys/CMakeLists.txt +++ b/libc/src/sys/CMakeLists.txt @@ -8,6 +8,7 @@ add_subdirectory(socket) add_subdirectory(sendfile) add_subdirectory(stat) add_subdirectory(statvfs) +add_subdirectory(time) add_subdirectory(utsname) add_subdirectory(wait) add_subdirectory(prctl) diff --git a/libc/src/sys/time/CMakeLists.txt b/libc/src/sys/time/CMakeLists.txt new file mode 100644 index 0000000000000..f599cddaaeeb3 --- /dev/null +++ 
b/libc/src/sys/time/CMakeLists.txt @@ -0,0 +1,10 @@ +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) +endif() + +add_entrypoint_object( + utimes + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.utimes +) diff --git a/libc/src/sys/time/linux/CMakeLists.txt b/libc/src/sys/time/linux/CMakeLists.txt new file mode 100644 index 0000000000000..506001e5c9fd2 --- /dev/null +++ b/libc/src/sys/time/linux/CMakeLists.txt @@ -0,0 +1,16 @@ +add_entrypoint_object( + utimes + SRCS + utimes.cpp + HDRS + ../utimes.h + DEPENDS + libc.hdr.types.struct_timeval + libc.hdr.fcntl_macros + libc.src.__support.OSUtil.osutil + libc.include.sys_stat + libc.include.sys_syscall + libc.include.fcntl + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) diff --git a/libc/src/sys/time/linux/utimes.cpp b/libc/src/sys/time/linux/utimes.cpp new file mode 100644 index 0000000000000..1cc5a8344e91a --- /dev/null +++ b/libc/src/sys/time/linux/utimes.cpp @@ -0,0 +1,76 @@ +//===-- Linux implementation of utimes ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/sys/time/utimes.h" + +#include "hdr/fcntl_macros.h" +#include "hdr/types/struct_timeval.h" + +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/common.h" + +#include "src/errno/libc_errno.h" + +#include <sys/syscall.h> + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, utimes, + (const char *path, const struct timeval times[2])) { + int ret; + +#ifdef SYS_utimes + // No need to define a timespec struct, use the syscall directly. + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_utimes, path, times); +#elif defined(SYS_utimensat) + // The utimensat syscall requires a timespec struct, not timeval. + struct timespec ts[2]; + struct timespec *ts_ptr = nullptr; // default value if times is NULL + + // convert the microsec values in timeval struct times + // to nanosecond values in timespec struct ts + if (times != NULL) { + + // reject out-of-range microsecond values + if ((times[0].tv_usec < 0 || times[1].tv_usec < 0) || + (times[0].tv_usec >= 1000000 || times[1].tv_usec >= 1000000)) { + libc_errno = EINVAL; + return -1; + } + + // set seconds in ts + ts[0].tv_sec = times[0].tv_sec; + ts[1].tv_sec = times[1].tv_sec; + + // convert microseconds to nanoseconds + ts[0].tv_nsec = times[0].tv_usec * 1000; + ts[1].tv_nsec = times[1].tv_usec * 1000; + + ts_ptr = ts; + } + + // If times was NULL, ts_ptr remains NULL, which utimensat interprets + // as setting times to the current time. + + // utimensat syscall. + // flags=0 means follow symlinks, matching utimes() behavior + ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_utimensat, AT_FDCWD, path, ts_ptr, + 0); + +#else +#error "utimensat and utimes syscalls not available." +#endif // SYS_utimensat + + if (ret < 0) { + libc_errno = -ret; + return -1; + } + + return 0; +} +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/sys/time/utimes.h b/libc/src/sys/time/utimes.h new file mode 100644 index 0000000000000..6e19e412d69ac --- /dev/null +++ b/libc/src/sys/time/utimes.h @@ -0,0 +1,21 @@ +//===-- Implementation header for utimes ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SYS_TIME_UTIMES_H +#define LLVM_LIBC_SRC_SYS_TIME_UTIMES_H + +#include "hdr/types/struct_timeval.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int utimes(const char *path, const struct timeval times[2]); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_SYS_TIME_UTIMES_H diff --git a/libc/test/src/sys/CMakeLists.txt b/libc/test/src/sys/CMakeLists.txt index 9e9293aab628f..224cc7905ad31 100644 --- a/libc/test/src/sys/CMakeLists.txt +++ b/libc/test/src/sys/CMakeLists.txt @@ -12,3 +12,4 @@ add_subdirectory(prctl) add_subdirectory(auxv) add_subdirectory(epoll) add_subdirectory(uio) +add_subdirectory(time) diff --git a/libc/test/src/sys/time/CMakeLists.txt b/libc/test/src/sys/time/CMakeLists.txt new file mode 100644 index 0000000000000..c092d33e43d85 --- /dev/null +++ b/libc/test/src/sys/time/CMakeLists.txt @@ -0,0 +1,19 @@ +add_custom_target(libc_sys_time_unittests) + +add_libc_unittest( + utimes_test + SUITE + libc_sys_time_unittests + SRCS + utimes_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.src.errno.errno + libc.src.fcntl.open + libc.src.sys.time.utimes + libc.src.unistd.close + libc.src.unistd.read + libc.src.unistd.write + libc.src.stdio.remove + libc.src.sys.stat.stat +) diff --git a/libc/test/src/sys/time/utimes_test.cpp b/libc/test/src/sys/time/utimes_test.cpp new file mode 100644 index 0000000000000..b97befb8626e3 --- /dev/null +++ b/libc/test/src/sys/time/utimes_test.cpp @@ -0,0 +1,92 @@ +//===-- Unittests for utimes ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fcntl_macros.h" +#include "hdr/types/struct_timeval.h" +#include "src/errno/libc_errno.h" +#include "src/fcntl/open.h" +#include "src/stdio/remove.h" +#include "src/sys/stat/stat.h" +#include "src/sys/time/utimes.h" +#include "src/unistd/close.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +constexpr const char *FILE_PATH = "utimes.test"; + +// SUCCESS: Takes a file and successfully updates +// its last access and modified times. 
+TEST(LlvmLibcUtimesTest, ChangeTimesSpecific) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + + auto TEST_FILE = libc_make_test_file_path(FILE_PATH); + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT); + ASSERT_GT(fd, 0); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + // make a dummy timeval struct + struct timeval times[2]; + times[0].tv_sec = 54321; + times[0].tv_usec = 12345; + times[1].tv_sec = 43210; + times[1].tv_usec = 23456; + + // ensure utimes succeeds + ASSERT_THAT(LIBC_NAMESPACE::utimes(FILE_PATH, times), Succeeds(0)); + + // verify the times values against stat of the TEST_FILE + struct stat statbuf; + ASSERT_EQ(LIBC_NAMESPACE::stat(FILE_PATH, &statbuf), 0); + + // seconds + ASSERT_EQ(statbuf.st_atim.tv_sec, times[0].tv_sec); + ASSERT_EQ(statbuf.st_mtim.tv_sec, times[1].tv_sec); + + // microseconds + ASSERT_EQ(statbuf.st_atim.tv_nsec, + static_cast<long>(times[0].tv_usec * 1000)); + ASSERT_EQ(statbuf.st_mtim.tv_nsec, + static_cast<long>(times[1].tv_usec * 1000)); + + ASSERT_THAT(LIBC_NAMESPACE::remove(TEST_FILE), Succeeds(0)); +} + +// FAILURE: Invalid values in the timeval struct +// to check that utimes rejects them. +TEST(LlvmLibcUtimesTest, InvalidMicroseconds) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + + auto TEST_FILE = libc_make_test_file_path(FILE_PATH); + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT); + ASSERT_GT(fd, 0); + ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); + + // make a dummy timeval struct + // populated with bad usec values + struct timeval times[2]; + times[0].tv_sec = 54321; + times[0].tv_usec = 4567; + times[1].tv_sec = 43210; + times[1].tv_usec = 1000000; // invalid + + // ensure utimes fails + ASSERT_THAT(LIBC_NAMESPACE::utimes(FILE_PATH, times), Fails(EINVAL)); + + // check for failure on + // the other possible bad values + + times[0].tv_sec = 54321; + times[0].tv_usec = -4567; // invalid + times[1].tv_sec = 43210; + times[1].tv_usec = 1000; + + // ensure utimes fails once more + ASSERT_THAT(LIBC_NAMESPACE::utimes(FILE_PATH, times), Fails(EINVAL)); + ASSERT_THAT(LIBC_NAMESPACE::remove(TEST_FILE), Succeeds(0)); +} From ba0a52a04b140bb7ed75cca4e1c27ec7d747fa40 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 3 Apr 2025 16:47:36 -0700 Subject: [PATCH 0596/1029] [InferAS] Support getAssumedAddrSpace for Arguments for NVPTX (#133991) --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 12 +- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 15 + .../Transforms/Scalar/InferAddressSpaces.cpp | 49 +- llvm/test/CodeGen/NVPTX/i1-ext-load.ll | 10 +- .../CodeGen/NVPTX/lower-args-gridconstant.ll | 46 +- llvm/test/CodeGen/NVPTX/lower-args.ll | 34 +- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 975 +++++++++++------- llvm/test/DebugInfo/NVPTX/debug-addr-class.ll | 53 +- llvm/test/DebugInfo/NVPTX/debug-info.ll | 80 +- .../InferAddressSpaces/NVPTX/arguments.ll | 35 + 10 files changed, 785 insertions(+), 524 deletions(-) create mode 100644 llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 2637b9fab0d50..a683726facd0c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -678,11 +678,8 @@ static bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F) { LLVM_DEBUG(dbgs() << "Lowering kernel args of " << F.getName() << "\n"); for (Argument &Arg : F.args()) { - if
(Arg.getType()->isPointerTy()) { - if (Arg.hasByValAttr()) - handleByValParam(TM, &Arg); - else if (TM.getDrvInterface() == NVPTX::CUDA) - markPointerAsGlobal(&Arg); + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) { + handleByValParam(TM, &Arg); } else if (Arg.getType()->isIntegerTy() && TM.getDrvInterface() == NVPTX::CUDA) { HandleIntToPtr(Arg); @@ -699,10 +696,9 @@ static bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F) { cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering()); for (Argument &Arg : F.args()) - if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) { - markPointerAsAS(&Arg, ADDRESS_SPACE_LOCAL); + if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) adjustByValArgAlignment(&Arg, &Arg, TLI); - } + return true; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index a89ca3037c7ff..e359735c20750 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -599,6 +599,21 @@ unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const { if (isa<AllocaInst>(V)) return ADDRESS_SPACE_LOCAL; + if (const Argument *Arg = dyn_cast<Argument>(V)) { + if (isKernelFunction(*Arg->getParent())) { + const NVPTXTargetMachine &TM = + static_cast<const NVPTXTargetMachine &>(getTLI()->getTargetMachine()); + if (TM.getDrvInterface() == NVPTX::CUDA && !Arg->hasByValAttr()) + return ADDRESS_SPACE_GLOBAL; + } else { + // We assume that all device parameters that are passed byval will be + // placed in the local AS. Very simple cases will be updated after ISel to + // use the device param space where possible. + if (Arg->hasByValAttr()) + return ADDRESS_SPACE_LOCAL; + } + } + return -1; } diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 73a3f5e4d3694..b65a08be75640 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -305,10 +305,16 @@ static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL, } // Returns true if V is an address expression. -// TODO: Currently, we consider only phi, bitcast, addrspacecast, and -// getelementptr operators.
+// TODO: Currently, we only consider: +// - arguments +// - phi, bitcast, addrspacecast, and getelementptr operators static bool isAddressExpression(const Value &V, const DataLayout &DL, const TargetTransformInfo *TTI) { + + if (const Argument *Arg = dyn_cast<Argument>(&V)) + return Arg->getType()->isPointerTy() && + TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace; + const Operator *Op = dyn_cast<Operator>(&V); if (!Op) return false; @@ -341,6 +347,9 @@ static bool isAddressExpression(const Value &V, const DataLayout &DL, static SmallVector<Value *, 2> getPointerOperands(const Value &V, const DataLayout &DL, const TargetTransformInfo *TTI) { + if (isa<Argument>(&V)) + return {}; + const Operator &Op = cast<Operator>(V); switch (Op.getOpcode()) { case Instruction::PHI: { @@ -505,13 +514,11 @@ void InferAddressSpacesImpl::appendsFlatAddressExpressionToPostorderStack( if (Visited.insert(V).second) { PostorderStack.emplace_back(V, false); - Operator *Op = cast<Operator>(V); - for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) { - if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) { - if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second) - PostorderStack.emplace_back(CE, false); - } - } + if (auto *Op = dyn_cast<Operator>(V)) + for (auto &O : Op->operands()) + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(O)) + if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second) + PostorderStack.emplace_back(CE, false); } } } @@ -828,6 +835,18 @@ Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace( assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace && isAddressExpression(*V, *DL, TTI)); + if (auto *Arg = dyn_cast<Argument>(V)) { + // Arguments are address space casted in the function body, as we do not + // want to change the function signature. + Function *F = Arg->getParent(); + BasicBlock::iterator Insert = F->getEntryBlock().getFirstNonPHIIt(); + + Type *NewPtrTy = PointerType::get(Arg->getContext(), NewAddrSpace); + auto *NewI = new AddrSpaceCastInst(Arg, NewPtrTy); + NewI->insertBefore(Insert); + return NewI; + } + if (Instruction *I = dyn_cast<Instruction>(V)) { Value *NewV = cloneInstructionWithNewAddressSpace( I, NewAddrSpace, ValueWithNewAddrSpace, PredicatedAS, PoisonUsesToFix); @@ -966,8 +985,12 @@ bool InferAddressSpacesImpl::updateAddressSpace( // of all its pointer operands. unsigned NewAS = UninitializedAddressSpace; - const Operator &Op = cast<Operator>(V); - if (Op.getOpcode() == Instruction::Select) { + // isAddressExpression should guarantee that V is an operator or an argument. + assert(isa<Operator>(V) || isa<Argument>(V)); + + if (isa<Operator>(V) && + cast<Operator>(V).getOpcode() == Instruction::Select) { + const Operator &Op = cast<Operator>(V); Value *Src0 = Op.getOperand(1); Value *Src1 = Op.getOperand(2); @@ -1258,7 +1281,7 @@ void InferAddressSpacesImpl::performPointerReplacement( } // Otherwise, replaces the use with flat(NewV). - if (Instruction *VInst = dyn_cast<Instruction>(V)) { + if (isa<Instruction>(V) || isa<Instruction>(NewV)) { // Don't create a copy of the original addrspacecast.
if (U == V && isa<AddrSpaceCastInst>(V)) return; @@ -1268,7 +1291,7 @@ if (Instruction *NewVInst = dyn_cast<Instruction>(NewV)) InsertPos = std::next(NewVInst->getIterator()); else - InsertPos = std::next(VInst->getIterator()); + InsertPos = std::next(cast<Instruction>(V)->getIterator()); while (isa<PHINode>(InsertPos)) ++InsertPos; diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll index f5f1dd9fcf0ea..44ac46db254a7 100644 --- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll +++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll @@ -12,14 +12,14 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) { ; CHECK: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK: ld.param.u64 %rd1, [foo_param_0]; -; CHECK: ld.param.u64 %rd2, [foo_param_1]; -; CHECK: cvta.to.global.u64 %rd3, %rd2; -; CHECK: cvta.to.global.u64 %rd4, %rd1; -; CHECK: ld.global.nc.u8 %rs1, [%rd4]; +; CHECK: cvta.to.global.u64 %rd2, %rd1; +; CHECK: ld.param.u64 %rd3, [foo_param_1]; +; CHECK: cvta.to.global.u64 %rd4, %rd3; +; CHECK: ld.global.nc.u8 %rs1, [%rd2]; ; CHECK: cvt.u32.u8 %r1, %rs1; ; CHECK: add.s32 %r2, %r1, 1; ; CHECK: and.b32 %r3, %r2, 1; -; CHECK: st.global.u32 [%rd3], %r3; +; CHECK: st.global.u32 [%rd4], %r3; ; CHECK: ret; %ld = load i1, ptr %ptr, align 1 %zext = zext i1 %ld to i32 diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index e4e1f40d0d8b2..38b7400696c54 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -12,9 +12,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly ; OPT-LABEL: define dso_local noundef i32 @non_kernel_function( ; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; OPT-NEXT: [[ENTRY:.*:]] -; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(5) -; OPT-NEXT: [[A1:%.*]] = addrspacecast ptr addrspace(5) [[A2]] to ptr -; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr) +; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A]], ptr addrspacecast (ptr addrspace(1) @gi to ptr) ; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64 ; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]] ; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1 @@ -74,12 +72,10 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_int( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr ; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] -; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4 +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4 ; OPT-NEXT: ret void %tmp = load i32, ptr %input1, align 4 %add = add i32 %tmp, %input2 @@ -105,15 +101,13 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_struct( ; OPT-SAME: ptr
byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 ; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] -; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4 +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4 ; OPT-NEXT: ret void %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 @@ -233,11 +227,9 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) -; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr ; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8 +; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8 ; OPT-NEXT: ret void store ptr %input, ptr %addr, align 8 ret void @@ -263,14 +255,12 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 ; PTX-NOT .local ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1) -; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr ; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) ; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 -; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8 +; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT]], align 8 ; OPT-NEXT: ret void %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 @@ -311,13 +301,11 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape( ; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr ; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) 
[[INPUT1]]) ; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 ; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]] -; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4 +; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT]], align 4 ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) ; OPT-NEXT: ret void %val = load i32, ptr %input @@ -361,15 +349,13 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) -; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr ; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) ; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 ; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4 ; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 ; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4 -; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8 +; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT]], align 8 ; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] ; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) ; OPT-NEXT: ret i32 [[ADD]] @@ -407,11 +393,9 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: @@ -423,7 +407,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; OPT: [[MERGE]]: ; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; OPT-NEXT: ret void %val = load i32, ptr %inout @@ -470,13 +454,11 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) 
[[INPUT2_PARAM]]) ; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: @@ -488,7 +470,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ; OPT: [[MERGE]]: ; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 @@ -531,17 +513,15 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_select( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { -; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) ; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4 ; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 ; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] ; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index a1c0a86e9c4e4..8fa7d5c3e0cbc 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes IR,IRC ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes IR,IRO ; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX,PTXC @@ -17,12 +17,10 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 % ; IR-LABEL: define void @load_alignment( ; IR-SAME: ptr readonly byval([[CLASS_OUTER:%.*]]) align 8 captures(none) [[ARG:%.*]]) { ; IR-NEXT: [[ENTRY:.*:]] -; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5) -; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr -; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, 
ptr [[ARG1]], align 8 -; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 0, i32 1 +; IR-NEXT: [[ARG_IDX_VAL:%.*]] = load ptr, ptr [[ARG]], align 8 +; IR-NEXT: [[ARG_IDX1:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 0, i32 1 ; IR-NEXT: [[ARG_IDX1_VAL:%.*]] = load ptr, ptr [[ARG_IDX1]], align 8 -; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG1]], i64 0, i32 1 +; IR-NEXT: [[ARG_IDX2:%.*]] = getelementptr [[CLASS_OUTER]], ptr [[ARG]], i64 0, i32 1 ; IR-NEXT: [[ARG_IDX2_VAL:%.*]] = load i32, ptr [[ARG_IDX2]], align 8 ; IR-NEXT: [[ARG_IDX_VAL_VAL:%.*]] = load i32, ptr [[ARG_IDX_VAL]], align 4 ; IR-NEXT: [[ADD_I:%.*]] = add nsw i32 [[ARG_IDX_VAL_VAL]], [[ARG_IDX2_VAL]] @@ -77,9 +75,7 @@ entry: define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { ; IR-LABEL: define void @load_padding( ; IR-SAME: ptr readonly byval([[CLASS_PADDED:%.*]]) align 4 captures(none) [[ARG:%.*]]) { -; IR-NEXT: [[ARG2:%.*]] = addrspacecast ptr [[ARG]] to ptr addrspace(5) -; IR-NEXT: [[ARG1:%.*]] = addrspacecast ptr addrspace(5) [[ARG2]] to ptr -; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG1]]) +; IR-NEXT: [[TMP:%.*]] = call ptr @escape(ptr nonnull align 16 [[ARG]]) ; IR-NEXT: ret void ; ; PTX-LABEL: load_padding( @@ -108,21 +104,11 @@ define void @load_padding(ptr nocapture readonly byval(%class.padded) %arg) { ; OpenCL can't make assumptions about incoming pointer, so we should generate ; generic pointers load/store. define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { -; IRC-LABEL: define ptx_kernel void @ptr_generic( -; IRC-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) { -; IRC-NEXT: [[IN3:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; IRC-NEXT: [[IN4:%.*]] = addrspacecast ptr addrspace(1) [[IN3]] to ptr -; IRC-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; IRC-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; IRC-NEXT: [[V:%.*]] = load i32, ptr [[IN4]], align 4 -; IRC-NEXT: store i32 [[V]], ptr [[OUT2]], align 4 -; IRC-NEXT: ret void -; -; IRO-LABEL: define ptx_kernel void @ptr_generic( -; IRO-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) { -; IRO-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4 -; IRO-NEXT: store i32 [[V]], ptr [[OUT]], align 4 -; IRO-NEXT: ret void +; IR-LABEL: define ptx_kernel void @ptr_generic( +; IR-SAME: ptr [[OUT:%.*]], ptr [[IN:%.*]]) { +; IR-NEXT: [[V:%.*]] = load i32, ptr [[IN]], align 4 +; IR-NEXT: store i32 [[V]], ptr [[OUT]], align 4 +; IR-NEXT: ret void ; ; PTXC-LABEL: ptr_generic( ; PTXC: { diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index cfe934544eb3a..4631732b81ea6 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -1,9 +1,11 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60 -; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70 +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_60 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,LOWER-ARGS,SM_70 ; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -passes=nvptx-copy-byval-args -S | FileCheck %s --check-prefixes=COMMON,COPY +; RUN: llc < %s -mcpu=sm_60 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_60 +; RUN: llc < %s -mcpu=sm_70 -mattr=ptx77 | FileCheck %s --check-prefixes=PTX,PTX_70 source_filename = "" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" @@ -27,25 +29,13 @@ declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 +; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @read_only( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { @@ -57,6 +47,17 @@ define dso_local ptx_kernel void @read_only(ptr 
nocapture noundef writeonly %out ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX-LABEL: read_only( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_param_1]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %i = load i32, ptr %s, align 4 store i32 %i, ptr %out, align 4 @@ -65,27 +66,14 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -98,6 +86,17 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX-LABEL: read_only_gep( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_param_0]; +; PTX-NEXT: 
cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_param_1+4]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 %i = load i32, ptr %b, align 4 @@ -107,27 +106,14 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_60-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 -; SM_70-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -141,6 +127,17 @@ define dso_local ptx_kernel void @read_only_gep_asc(ptr nocapture noundef writeo ; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX-LABEL: read_only_gep_asc( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc_param_1+4]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 %asc = addrspacecast ptr %b to ptr addrspace(101) @@ 
-151,49 +148,30 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_60-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; SM_60-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_70-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; SM_70-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COPY-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; COPY-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr -; COPY-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 -; COPY-NEXT: store i32 [[I]], ptr [[OUT]], align 4 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @read_only_gep_asc0( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: 
[[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT]], align 4 +; COMMON-NEXT: ret void ; +; PTX-LABEL: read_only_gep_asc0( +; PTX: { +; PTX-NEXT: .reg .b32 %r<2>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_asc0_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_asc0_param_1+4]; +; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 %asc = addrspacecast ptr %b to ptr addrspace(101) @@ -208,23 +186,19 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out ; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr( ; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr( ; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]] ; SM_70-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr( @@ -233,9 +207,36 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out ; COPY-NEXT: 
[[S1:%.*]] = alloca [[STRUCT_S]], align 4 ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) +; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR5:[0-9]+]] ; COPY-NEXT: ret void ; +; PTX-LABEL: escape_ptr( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot4[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot4; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: add.u64 %rd1, %SP, 0; +; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd2+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_param_1]; +; PTX-NEXT: st.local.u32 [%rd2], %r2; +; PTX-NEXT: { // callseq 0, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0], %rd1; +; PTX-NEXT: call.uni +; PTX-NEXT: _Z6escapePv, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: } // callseq 0 +; PTX-NEXT: ret; entry: call void @_Z6escapePv(ptr noundef nonnull %s) #0 ret void @@ -246,25 +247,21 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( ; SM_60-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; SM_60-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; SM_60-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( ; SM_70-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { ; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; SM_70-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_70-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr 
addrspace(101) +; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; SM_70-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]] ; SM_70-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep( @@ -274,9 +271,37 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) ; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) ; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; COPY-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR5]] ; COPY-NEXT: ret void ; +; PTX-LABEL: escape_ptr_gep( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot5[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot5; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: add.u64 %rd1, %SP, 0; +; PTX-NEXT: add.u64 %rd2, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd2+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_param_1]; +; PTX-NEXT: st.local.u32 [%rd2], %r2; +; PTX-NEXT: add.s64 %rd3, %rd1, 4; +; PTX-NEXT: { // callseq 1, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0], %rd3; +; PTX-NEXT: call.uni +; PTX-NEXT: _Z6escapePv, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: } // callseq 1 +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 call void @_Z6escapePv(ptr noundef nonnull %b) #0 @@ -285,37 +310,36 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_store( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_store( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr 
addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_store( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: store ptr [[S1]], ptr [[OUT]], align 8 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_store( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8 +; COMMON-NEXT: ret void ; +; PTX-LABEL: escape_ptr_store( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot6[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot6; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: add.u64 %rd3, %SP, 0; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_store_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_store_param_1]; +; PTX-NEXT: st.local.u32 [%rd4], %r2; +; PTX-NEXT: st.global.u64 [%rd2], %rd3; +; PTX-NEXT: ret; entry: store ptr %s, ptr %out, align 8 ret void @@ -323,40 +347,38 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_60-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast 
ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 -; SM_70-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 -; COPY-NEXT: store ptr [[B]], ptr [[OUT]], align 8 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptr_gep_store( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4 +; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8 +; COMMON-NEXT: ret void ; +; PTX-LABEL: escape_ptr_gep_store( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot7[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot7; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_gep_store_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: add.u64 %rd3, %SP, 0; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_store_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_store_param_1]; +; PTX-NEXT: st.local.u32 [%rd4], %r2; +; PTX-NEXT: add.s64 %rd5, %rd3, 4; +; PTX-NEXT: st.global.u64 [%rd2], %rd5; +; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 store ptr %b, ptr %out, align 8 @@ -365,40 +387,37 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr 
addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 -; SM_60-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 -; SM_70-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( -; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64 -; COPY-NEXT: store i64 [[I]], ptr [[OUT]], align 8 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @escape_ptrtoint( +; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64 +; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8 +; COMMON-NEXT: ret void ; +; PTX-LABEL: escape_ptrtoint( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot8[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot8; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: add.u64 %rd3, %SP, 0; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [escape_ptrtoint_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [escape_ptrtoint_param_1]; +; PTX-NEXT: st.local.u32 [%rd4], %r2; +; PTX-NEXT: st.global.u64 [%rd2], %rd3; +; PTX-NEXT: ret; entry: %i = ptrtoint ptr %s to i64 store i64 %i, ptr %out, align 8 @@ -407,23 +426,12 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param( -; SM_60-SAME: ptr noundef writeonly captures(none) 
[[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -434,6 +442,46 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo ; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) ; COPY-NEXT: ret void ; +; PTX-LABEL: memcpy_from_param( +; PTX: { +; PTX-NEXT: .reg .b16 %rs<17>; +; PTX-NEXT: .reg .b64 %rd<2>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_param_0]; +; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_param_1+15]; +; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; +; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_param_1+14]; +; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; +; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_param_1+13]; +; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; +; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_param_1+12]; +; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; +; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_param_1+11]; +; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; +; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_param_1+10]; +; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; +; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_param_1+9]; +; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; +; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_param_1+8]; +; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; +; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_param_1+7]; +; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; +; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_param_1+6]; +; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; +; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_param_1+5]; +; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; +; PTX-NEXT: ld.param.u8 %rs12, 
[memcpy_from_param_param_1+4]; +; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; +; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_param_1+3]; +; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; +; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_param_1+2]; +; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; +; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_param_1+1]; +; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; +; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_param_1]; +; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; +; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) ret void @@ -441,23 +489,12 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( -; SM_60-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( -; SM_70-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) -; SM_70-NEXT: ret void +; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( +; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; LOWER-ARGS-NEXT: [[ENTRY:.*:]] +; LOWER-ARGS-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; LOWER-ARGS-NEXT: ret void ; ; COPY-LABEL: define dso_local ptx_kernel void @memcpy_from_param_noalign( ; COPY-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { @@ -468,6 +505,46 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound ; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[OUT]], ptr [[S1]], i64 16, i1 true) ; COPY-NEXT: ret void ; +; PTX-LABEL: memcpy_from_param_noalign( +; PTX: { +; PTX-NEXT: .reg .b16 %rs<17>; +; PTX-NEXT: .reg .b64 %rd<2>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_noalign_param_0]; +; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_noalign_param_1+15]; +; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; +; PTX-NEXT: ld.param.u8 
%rs2, [memcpy_from_param_noalign_param_1+14]; +; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; +; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_noalign_param_1+13]; +; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; +; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_noalign_param_1+12]; +; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; +; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_noalign_param_1+11]; +; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; +; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_noalign_param_1+10]; +; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; +; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_noalign_param_1+9]; +; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; +; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_noalign_param_1+8]; +; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; +; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_noalign_param_1+7]; +; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; +; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_noalign_param_1+6]; +; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; +; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_noalign_param_1+5]; +; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; +; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_noalign_param_1+4]; +; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; +; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_noalign_param_1+3]; +; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; +; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_noalign_param_1+2]; +; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; +; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_noalign_param_1+1]; +; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; +; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_noalign_param_1]; +; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; +; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) ret void @@ -475,37 +552,79 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @memcpy_to_param( -; SM_60-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[ENTRY:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; SM_60-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @memcpy_to_param( -; SM_70-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[ENTRY:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr 
addrspace(1) [[IN1]] to ptr -; SM_70-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @memcpy_to_param( -; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[ENTRY:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true) -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @memcpy_to_param( +; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef readnone byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true) +; COMMON-NEXT: ret void ; +; PTX-LABEL: memcpy_to_param( +; PTX: { +; PTX-NEXT: .local .align 8 .b8 __local_depot11[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<48>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.b64 %SPL, __local_depot11; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u64 %rd1, [memcpy_to_param_param_0]; +; PTX-NEXT: add.u64 %rd3, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [memcpy_to_param_param_1+4]; +; PTX-NEXT: st.local.u32 [%rd3+4], %r1; +; PTX-NEXT: ld.param.u32 %r2, [memcpy_to_param_param_1]; +; PTX-NEXT: st.local.u32 [%rd3], %r2; +; PTX-NEXT: ld.volatile.u8 %rd4, [%rd1]; +; PTX-NEXT: ld.volatile.u8 %rd5, [%rd1+1]; +; PTX-NEXT: shl.b64 %rd6, %rd5, 8; +; PTX-NEXT: or.b64 %rd7, %rd6, %rd4; +; PTX-NEXT: ld.volatile.u8 %rd8, [%rd1+2]; +; PTX-NEXT: shl.b64 %rd9, %rd8, 16; +; PTX-NEXT: ld.volatile.u8 %rd10, [%rd1+3]; +; PTX-NEXT: shl.b64 %rd11, %rd10, 24; +; PTX-NEXT: or.b64 %rd12, %rd11, %rd9; +; PTX-NEXT: or.b64 %rd13, %rd12, %rd7; +; PTX-NEXT: ld.volatile.u8 %rd14, [%rd1+4]; +; PTX-NEXT: ld.volatile.u8 %rd15, [%rd1+5]; +; PTX-NEXT: shl.b64 %rd16, %rd15, 8; +; PTX-NEXT: or.b64 %rd17, %rd16, %rd14; +; PTX-NEXT: ld.volatile.u8 %rd18, [%rd1+6]; +; PTX-NEXT: shl.b64 %rd19, %rd18, 16; +; PTX-NEXT: ld.volatile.u8 %rd20, [%rd1+7]; +; PTX-NEXT: shl.b64 %rd21, %rd20, 24; +; PTX-NEXT: or.b64 %rd22, %rd21, %rd19; +; PTX-NEXT: or.b64 %rd23, %rd22, %rd17; +; PTX-NEXT: shl.b64 %rd24, %rd23, 32; +; PTX-NEXT: or.b64 %rd25, %rd24, %rd13; +; PTX-NEXT: st.volatile.u64 [%SP], %rd25; +; PTX-NEXT: ld.volatile.u8 %rd26, [%rd1+8]; +; PTX-NEXT: ld.volatile.u8 %rd27, [%rd1+9]; +; PTX-NEXT: shl.b64 %rd28, %rd27, 8; +; PTX-NEXT: or.b64 %rd29, %rd28, %rd26; +; PTX-NEXT: ld.volatile.u8 %rd30, [%rd1+10]; +; PTX-NEXT: shl.b64 %rd31, %rd30, 16; +; PTX-NEXT: ld.volatile.u8 %rd32, [%rd1+11]; +; PTX-NEXT: shl.b64 %rd33, %rd32, 24; +; PTX-NEXT: or.b64 %rd34, %rd33, %rd31; +; PTX-NEXT: or.b64 %rd35, %rd34, %rd29; +; PTX-NEXT: ld.volatile.u8 %rd36, [%rd1+12]; +; PTX-NEXT: ld.volatile.u8 %rd37, [%rd1+13]; +; PTX-NEXT: shl.b64 
%rd38, %rd37, 8; +; PTX-NEXT: or.b64 %rd39, %rd38, %rd36; +; PTX-NEXT: ld.volatile.u8 %rd40, [%rd1+14]; +; PTX-NEXT: shl.b64 %rd41, %rd40, 16; +; PTX-NEXT: ld.volatile.u8 %rd42, [%rd1+15]; +; PTX-NEXT: shl.b64 %rd43, %rd42, 24; +; PTX-NEXT: or.b64 %rd44, %rd43, %rd41; +; PTX-NEXT: or.b64 %rd45, %rd44, %rd39; +; PTX-NEXT: shl.b64 %rd46, %rd45, 32; +; PTX-NEXT: or.b64 %rd47, %rd46, %rd35; +; PTX-NEXT: st.volatile.u64 [%SP+8], %rd47; +; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) ret void @@ -513,40 +632,22 @@ entry: ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 { -; SM_60-LABEL: define dso_local ptx_kernel void @copy_on_store( -; SM_60-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_60-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_60-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_60-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; SM_60-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 -; SM_60-NEXT: store i32 [[I]], ptr [[S3]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define dso_local ptx_kernel void @copy_on_store( -; SM_70-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 -; SM_70-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S3]], ptr addrspace(101) align 4 [[S4]], i64 8, i1 false) -; SM_70-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) -; SM_70-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr -; SM_70-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 -; SM_70-NEXT: store i32 [[I]], ptr [[S3]], align 4 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define dso_local ptx_kernel void @copy_on_store( -; COPY-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { -; COPY-NEXT: [[BB:.*:]] -; COPY-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 -; COPY-NEXT: [[S2:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) -; COPY-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4 -; COPY-NEXT: store i32 [[I]], ptr [[S1]], align 4 -; COPY-NEXT: ret void +; COMMON-LABEL: define dso_local ptx_kernel void @copy_on_store( +; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S2:%.*]] = 
addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false) +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4 +; COMMON-NEXT: ret void ; +; PTX-LABEL: copy_on_store( +; PTX: { +; PTX-EMPTY: +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %bb +; PTX-NEXT: ret; bb: %i = load i32, ptr %in, align 4 store i32 %i, ptr %s, align 4 @@ -557,8 +658,6 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; SM_60-LABEL: define ptx_kernel void @test_select( ; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false) @@ -567,21 +666,19 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) ; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define ptx_kernel void @test_select( ; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { ; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr ; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) ; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) ; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) ; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT2]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 ; SM_70-NEXT: ret void ; ; COPY-LABEL: define ptx_kernel void @test_select( @@ -598,6 +695,48 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; COPY-NEXT: store i32 [[VALLOADED]], ptr [[OUT]], align 4 ; COPY-NEXT: ret void ; +; PTX_60-LABEL: test_select( +; PTX_60: { +; PTX_60-NEXT: .reg .pred %p<2>; +; PTX_60-NEXT: .reg .b16 %rs<3>; +; PTX_60-NEXT: .reg .b32 %r<4>; +; PTX_60-NEXT: .reg .b64 %rd<3>; +; PTX_60-EMPTY: +; PTX_60-NEXT: // %bb.0: // %bb +; PTX_60-NEXT: ld.param.u8 %rs1, [test_select_param_3]; +; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_60-NEXT: ld.param.u64 %rd1, [test_select_param_2]; +; PTX_60-NEXT: 
cvta.to.global.u64 %rd2, %rd1; +; PTX_60-NEXT: ld.param.u32 %r1, [test_select_param_1]; +; PTX_60-NEXT: ld.param.u32 %r2, [test_select_param_0]; +; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1; +; PTX_60-NEXT: st.global.u32 [%rd2], %r3; +; PTX_60-NEXT: ret; +; +; PTX_70-LABEL: test_select( +; PTX_70: { +; PTX_70-NEXT: .reg .pred %p<2>; +; PTX_70-NEXT: .reg .b16 %rs<3>; +; PTX_70-NEXT: .reg .b32 %r<2>; +; PTX_70-NEXT: .reg .b64 %rd<10>; +; PTX_70-EMPTY: +; PTX_70-NEXT: // %bb.0: // %bb +; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3]; +; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0; +; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2]; +; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1; +; PTX_70-NEXT: mov.b64 %rd5, %rd4; +; PTX_70-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX_70-NEXT: mov.b64 %rd7, %rd1; +; PTX_70-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX_70-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; +; PTX_70-NEXT: ld.u32 %r1, [%rd9]; +; PTX_70-NEXT: st.global.u32 [%rd3], %r1; +; PTX_70-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 %valloaded = load i32, ptr %ptrnew, align 4 @@ -606,49 +745,45 @@ bb: } define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { -; SM_60-LABEL: define ptx_kernel void @test_select_write( -; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { -; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_60-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr -; SM_60-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; SM_60-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) -; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) -; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] -; SM_60-NEXT: store i32 1, ptr [[PTRNEW]], align 4 -; SM_60-NEXT: ret void -; -; SM_70-LABEL: define ptx_kernel void @test_select_write( -; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { -; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[OUT5:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) -; SM_70-NEXT: [[OUT6:%.*]] = addrspacecast ptr addrspace(1) [[OUT5]] to ptr -; SM_70-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; SM_70-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) -; SM_70-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; SM_70-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; SM_70-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) -; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] -; SM_70-NEXT: store i32 1, ptr [[PTRNEW]], align 4 -; SM_70-NEXT: ret void -; -; COPY-LABEL: define 
ptx_kernel void @test_select_write( -; COPY-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { -; COPY-NEXT: [[BB:.*:]] -; COPY-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 -; COPY-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) -; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 -; COPY-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) -; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] -; COPY-NEXT: store i32 1, ptr [[PTRNEW]], align 4 -; COPY-NEXT: ret void +; COMMON-LABEL: define ptx_kernel void @test_select_write( +; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[INPUT23:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT24:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false) +; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false) +; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void ; +; PTX-LABEL: test_select_write( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot14[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b16 %rs<3>; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %bb +; PTX-NEXT: mov.b64 %SPL, __local_depot14; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u8 %rs1, [test_select_write_param_3]; +; PTX-NEXT: and.b16 %rs2, %rs1, 1; +; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX-NEXT: ld.param.u32 %r1, [test_select_write_param_1]; +; PTX-NEXT: st.u32 [%SP], %r1; +; PTX-NEXT: ld.param.u32 %r2, [test_select_write_param_0]; +; PTX-NEXT: st.u32 [%SP+4], %r2; +; PTX-NEXT: add.u64 %rd2, %SPL, 4; +; PTX-NEXT: add.u64 %rd4, %SPL, 0; +; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1; +; PTX-NEXT: mov.b32 %r3, 1; +; PTX-NEXT: st.local.u32 [%rd5], %r3; +; PTX-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 store i32 1, ptr %ptrnew, align 4 @@ -659,8 +794,6 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_60-LABEL: define ptx_kernel void @test_phi( ; SM_60-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; SM_60-NEXT: [[BB:.*:]] -; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr ; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr 
addrspace(101) align 8 [[INPUT25]], i64 8, i1 false) @@ -677,14 +810,12 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_60: [[MERGE]]: ; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; SM_60-NEXT: ret void ; ; SM_70-LABEL: define ptx_kernel void @test_phi( ; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; SM_70-NEXT: [[BB:.*:]] -; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) -; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr ; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) ; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) ; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) @@ -699,7 +830,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; SM_70: [[MERGE]]: ; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] ; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; SM_70-NEXT: ret void ; ; COPY-LABEL: define ptx_kernel void @test_phi( @@ -724,6 +855,53 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; COPY-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4 ; COPY-NEXT: ret void ; +; PTX_60-LABEL: test_phi( +; PTX_60: { +; PTX_60-NEXT: .reg .pred %p<2>; +; PTX_60-NEXT: .reg .b16 %rs<3>; +; PTX_60-NEXT: .reg .b32 %r<5>; +; PTX_60-NEXT: .reg .b64 %rd<3>; +; PTX_60-EMPTY: +; PTX_60-NEXT: // %bb.0: // %bb +; PTX_60-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; +; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_60-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_60-NEXT: ld.param.u64 %rd2, [test_phi_param_2]; +; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2; +; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_0]; +; PTX_60-NEXT: @%p1 bra $L__BB15_2; +; PTX_60-NEXT: // %bb.1: // %second +; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_1+4]; +; PTX_60-NEXT: $L__BB15_2: // %merge +; PTX_60-NEXT: st.global.u32 [%rd1], %r4; +; PTX_60-NEXT: ret; +; +; PTX_70-LABEL: test_phi( +; PTX_70: { +; PTX_70-NEXT: .reg .pred %p<2>; +; PTX_70-NEXT: .reg .b16 %rs<3>; +; PTX_70-NEXT: .reg .b32 %r<2>; +; PTX_70-NEXT: .reg .b64 %rd<12>; +; PTX_70-EMPTY: +; PTX_70-NEXT: // %bb.0: // %bb +; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; +; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; +; PTX_70-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX_70-NEXT: mov.b64 %rd6, test_phi_param_0; +; PTX_70-NEXT: ld.param.u64 %rd7, [test_phi_param_2]; +; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd7; +; PTX_70-NEXT: mov.b64 %rd10, %rd6; +; PTX_70-NEXT: cvta.param.u64 %rd11, %rd10; +; PTX_70-NEXT: @%p1 bra $L__BB15_2; +; PTX_70-NEXT: // %bb.1: // %second +; PTX_70-NEXT: mov.b64 %rd8, test_phi_param_1; +; PTX_70-NEXT: mov.b64 %rd9, %rd8; +; PTX_70-NEXT: cvta.param.u64 %rd2, %rd9; +; PTX_70-NEXT: add.s64 %rd11, %rd2, 4; +; PTX_70-NEXT: $L__BB15_2: // %merge +; PTX_70-NEXT: ld.u32 %r1, [%rd11]; +; PTX_70-NEXT: st.global.u32 [%rd1], %r1; +; 
PTX_70-NEXT: ret; bb: br i1 %cond, label %first, label %second @@ -744,7 +922,7 @@ merge: ; preds = %second, %first define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) { ; COMMON-LABEL: define ptx_kernel void @test_phi_write( -; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { ; COMMON-NEXT: [[BB:.*:]] ; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 ; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) @@ -764,6 +942,35 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 ; COMMON-NEXT: ret void ; +; PTX-LABEL: test_phi_write( +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot16[8]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b16 %rs<3>; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %bb +; PTX-NEXT: mov.b64 %SPL, __local_depot16; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u8 %rs1, [test_phi_write_param_2]; +; PTX-NEXT: and.b16 %rs2, %rs1, 1; +; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX-NEXT: add.u64 %rd1, %SPL, 0; +; PTX-NEXT: ld.param.u32 %r1, [test_phi_write_param_1+4]; +; PTX-NEXT: st.u32 [%SP], %r1; +; PTX-NEXT: add.u64 %rd6, %SPL, 4; +; PTX-NEXT: ld.param.u32 %r2, [test_phi_write_param_0]; +; PTX-NEXT: st.u32 [%SP+4], %r2; +; PTX-NEXT: @%p1 bra $L__BB16_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: mov.b64 %rd6, %rd1; +; PTX-NEXT: $L__BB16_2: // %merge +; PTX-NEXT: mov.b32 %r3, 1; +; PTX-NEXT: st.local.u32 [%rd6], %r3; +; PTX-NEXT: ret; bb: br i1 %cond, label %first, label %second diff --git a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll index 82301e42f7d06..a257b6cfd77b7 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-addr-class.ll @@ -169,19 +169,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 0 // EOM(1) ; CHECK-NEXT: .b8 0 // EOM(2) ; CHECK-NEXT: .b8 6 // Abbreviation Code -; CHECK-NEXT: .b8 5 // DW_TAG_formal_parameter -; CHECK-NEXT: .b8 0 // DW_CHILDREN_no -; CHECK-NEXT: .b8 3 // DW_AT_name -; CHECK-NEXT: .b8 8 // DW_FORM_string -; CHECK-NEXT: .b8 58 // DW_AT_decl_file -; CHECK-NEXT: .b8 11 // DW_FORM_data1 -; CHECK-NEXT: .b8 59 // DW_AT_decl_line -; CHECK-NEXT: .b8 11 // DW_FORM_data1 -; CHECK-NEXT: .b8 73 // DW_AT_type -; CHECK-NEXT: .b8 19 // DW_FORM_ref4 -; CHECK-NEXT: .b8 0 // EOM(1) -; CHECK-NEXT: .b8 0 // EOM(2) -; CHECK-NEXT: .b8 7 // Abbreviation Code ; CHECK-NEXT: .b8 15 // DW_TAG_pointer_type ; CHECK-NEXT: .b8 0 // DW_CHILDREN_no ; CHECK-NEXT: .b8 73 // DW_AT_type @@ -192,12 +179,12 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: } ; CHECK-NEXT: .section .debug_info ; CHECK-NEXT: { -; CHECK-NEXT: .b32 238 // Length of Unit +; CHECK-NEXT: .b32 254 // Length of Unit ; CHECK-NEXT: .b8 2 // DWARF version number ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b32 .debug_abbrev // Offset Into Abbrev. 
Section ; CHECK-NEXT: .b8 8 // Address Size (in bytes) -; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0xe7 DW_TAG_compile_unit +; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0xf7 DW_TAG_compile_unit ; CHECK-NEXT: .b8 99 // DW_AT_producer ; CHECK-NEXT: .b8 108 ; CHECK-NEXT: .b8 97 @@ -307,7 +294,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 9 // DW_AT_location ; CHECK-NEXT: .b8 3 ; CHECK-NEXT: .b64 SHARED -; CHECK-NEXT: .b8 4 // Abbrev [4] 0x90:0x53 DW_TAG_subprogram +; CHECK-NEXT: .b8 4 // Abbrev [4] 0x90:0x63 DW_TAG_subprogram ; CHECK-NEXT: .b64 $L__func_begin0 // DW_AT_low_pc ; CHECK-NEXT: .b64 $L__func_end0 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_frame_base @@ -337,20 +324,36 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 227 // DW_AT_type -; CHECK-NEXT: .b8 6 // Abbrev [6] 0xc0:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 248 // DW_AT_type +; CHECK-NEXT: .b8 5 // Abbrev [5] 0xc0:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 177 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 120 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 236 // DW_AT_type -; CHECK-NEXT: .b8 6 // Abbrev [6] 0xc9:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 243 // DW_AT_type +; CHECK-NEXT: .b8 5 // Abbrev [5] 0xd1:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 179 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 121 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 236 // DW_AT_type -; CHECK-NEXT: .b8 5 // Abbrev [5] 0xd2:0x10 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 243 // DW_AT_type +; CHECK-NEXT: .b8 5 // Abbrev [5] 0xe2:0x10 DW_TAG_formal_parameter ; CHECK-NEXT: .b8 2 // DW_AT_address_class ; CHECK-NEXT: .b8 5 // DW_AT_location ; CHECK-NEXT: .b8 144 @@ -364,7 +367,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 6 // DW_AT_decl_line ; CHECK-NEXT: .b32 111 // DW_AT_type ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 3 // Abbrev [3] 0xe3:0x9 DW_TAG_base_type +; CHECK-NEXT: .b8 6 // Abbrev [6] 0xf3:0x5 DW_TAG_pointer_type +; CHECK-NEXT: .b32 248 // DW_AT_type +; CHECK-NEXT: .b8 3 // Abbrev [3] 0xf8:0x9 DW_TAG_base_type ; CHECK-NEXT: .b8 102 // DW_AT_name ; CHECK-NEXT: .b8 108 ; CHECK-NEXT: .b8 111 @@ -373,8 +378,6 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 4 // DW_AT_encoding ; CHECK-NEXT: .b8 4 // DW_AT_byte_size -; CHECK-NEXT: .b8 7 // Abbrev [7] 0xec:0x5 DW_TAG_pointer_type -; CHECK-NEXT: .b32 227 // DW_AT_type ; CHECK-NEXT: .b8 0 // End Of Children Mark ; CHECK-NEXT: } ; CHECK-NEXT: .section .debug_macinfo { } diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll index 62b30a1f15aff..fa2925af37971 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -100,8 +100,8 @@ if.end: ; preds = %if.then, %entry ; CHECK: .section .debug_loc ; CHECK-NEXT: { ; 
CHECK-NEXT: $L__debug_loc0: -; CHECK-NEXT: .b64 $L__tmp8 ; CHECK-NEXT: .b64 $L__tmp10 +; CHECK-NEXT: .b64 $L__tmp12 ; CHECK-NEXT: .b8 5 // Loc expr size ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 144 // DW_OP_regx @@ -112,7 +112,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b64 0 ; CHECK-NEXT: .b64 0 ; CHECK-NEXT: $L__debug_loc1: -; CHECK-NEXT: .b64 $L__tmp5 +; CHECK-NEXT: .b64 $L__tmp7 ; CHECK-NEXT: .b64 $L__func_end0 ; CHECK-NEXT: .b8 5 // Loc expr size ; CHECK-NEXT: .b8 0 @@ -586,12 +586,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: } ; CHECK-NEXT: .section .debug_info ; CHECK-NEXT: { -; CHECK-NEXT: .b32 2388 // Length of Unit +; CHECK-NEXT: .b32 2404 // Length of Unit ; CHECK-NEXT: .b8 2 // DWARF version number ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b32 .debug_abbrev // Offset Into Abbrev. Section ; CHECK-NEXT: .b8 8 // Address Size (in bytes) -; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0x94d DW_TAG_compile_unit +; CHECK-NEXT: .b8 1 // Abbrev [1] 0xb:0x95d DW_TAG_compile_unit ; CHECK-NEXT: .b8 0 // DW_AT_producer ; CHECK-NEXT: .b8 4 // DW_AT_language ; CHECK-NEXT: .b8 0 @@ -2481,7 +2481,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 4 // DW_AT_byte_size ; CHECK-NEXT: .b8 12 // Abbrev [12] 0x83d:0x5 DW_TAG_pointer_type ; CHECK-NEXT: .b32 2100 // DW_AT_type -; CHECK-NEXT: .b8 23 // Abbrev [23] 0x842:0xd5 DW_TAG_subprogram +; CHECK-NEXT: .b8 23 // Abbrev [23] 0x842:0xe5 DW_TAG_subprogram ; CHECK-NEXT: .b64 $L__func_begin0 // DW_AT_low_pc ; CHECK-NEXT: .b64 $L__func_end0 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_frame_base @@ -2522,7 +2522,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line -; CHECK-NEXT: .b32 2384 // DW_AT_type +; CHECK-NEXT: .b32 2400 // DW_AT_type ; CHECK-NEXT: .b8 25 // Abbrev [25] 0x87d:0xd DW_TAG_formal_parameter ; CHECK-NEXT: .b32 $L__debug_loc0 // DW_AT_location ; CHECK-NEXT: .b8 97 // DW_AT_name @@ -2530,54 +2530,70 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line ; CHECK-NEXT: .b32 2100 // DW_AT_type -; CHECK-NEXT: .b8 22 // Abbrev [22] 0x88a:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 24 // Abbrev [24] 0x88a:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 179 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 120 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line ; CHECK-NEXT: .b32 2109 // DW_AT_type -; CHECK-NEXT: .b8 22 // Abbrev [22] 0x893:0x9 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 24 // Abbrev [24] 0x89b:0x11 DW_TAG_formal_parameter +; CHECK-NEXT: .b8 2 // DW_AT_address_class +; CHECK-NEXT: .b8 6 // DW_AT_location +; CHECK-NEXT: .b8 144 +; CHECK-NEXT: .b8 180 +; CHECK-NEXT: .b8 200 +; CHECK-NEXT: .b8 201 +; CHECK-NEXT: .b8 171 +; CHECK-NEXT: .b8 2 ; CHECK-NEXT: .b8 121 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 5 // DW_AT_decl_line ; CHECK-NEXT: .b32 2109 // DW_AT_type -; CHECK-NEXT: .b8 26 // Abbrev [26] 0x89c:0xd DW_TAG_variable +; CHECK-NEXT: .b8 26 // Abbrev [26] 0x8ac:0xd DW_TAG_variable ; CHECK-NEXT: .b32 $L__debug_loc1 // DW_AT_location ; CHECK-NEXT: .b8 105 // DW_AT_name ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 1 // DW_AT_decl_file ; CHECK-NEXT: .b8 6 // DW_AT_decl_line -; CHECK-NEXT: .b32 2384 // DW_AT_type -; 
CHECK-NEXT: .b8 27 // Abbrev [27] 0x8a9:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b32 2400 // DW_AT_type +; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8b9:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 691 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp1 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp2 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 6 // DW_AT_call_line ; CHECK-NEXT: .b8 11 // DW_AT_call_column -; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8c1:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8d1:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 1450 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp2 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp5 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 6 // DW_AT_call_line ; CHECK-NEXT: .b8 24 // DW_AT_call_column -; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8d9:0x18 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b8 27 // Abbrev [27] 0x8e9:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 2044 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp3 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp4 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp5 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp6 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 6 // DW_AT_call_line ; CHECK-NEXT: .b8 37 // DW_AT_call_column -; CHECK-NEXT: .b8 28 // Abbrev [28] 0x8f1:0x25 DW_TAG_inlined_subroutine +; CHECK-NEXT: .b8 28 // Abbrev [28] 0x901:0x25 DW_TAG_inlined_subroutine ; CHECK-NEXT: .b32 2050 // DW_AT_abstract_origin -; CHECK-NEXT: .b64 $L__tmp9 // DW_AT_low_pc -; CHECK-NEXT: .b64 $L__tmp10 // DW_AT_high_pc +; CHECK-NEXT: .b64 $L__tmp11 // DW_AT_low_pc +; CHECK-NEXT: .b64 $L__tmp12 // DW_AT_high_pc ; CHECK-NEXT: .b8 1 // DW_AT_call_file ; CHECK-NEXT: .b8 8 // DW_AT_call_line ; CHECK-NEXT: .b8 5 // DW_AT_call_column -; CHECK-NEXT: .b8 29 // Abbrev [29] 0x909:0xc DW_TAG_formal_parameter +; CHECK-NEXT: .b8 29 // Abbrev [29] 0x919:0xc DW_TAG_formal_parameter ; CHECK-NEXT: .b8 2 // DW_AT_address_class ; CHECK-NEXT: .b8 5 // DW_AT_location ; CHECK-NEXT: .b8 144 @@ -2588,17 +2604,17 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b32 2079 // DW_AT_abstract_origin ; CHECK-NEXT: .b8 0 // End Of Children Mark ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 30 // Abbrev [30] 0x917:0xd DW_TAG_namespace +; CHECK-NEXT: .b8 30 // Abbrev [30] 0x927:0xd DW_TAG_namespace ; CHECK-NEXT: .b8 115 // DW_AT_name ; CHECK-NEXT: .b8 116 ; CHECK-NEXT: .b8 100 ; CHECK-NEXT: .b8 0 -; CHECK-NEXT: .b8 31 // Abbrev [31] 0x91c:0x7 DW_TAG_imported_declaration +; CHECK-NEXT: .b8 31 // Abbrev [31] 0x92c:0x7 DW_TAG_imported_declaration ; CHECK-NEXT: .b8 4 // DW_AT_decl_file ; CHECK-NEXT: .b8 202 // DW_AT_decl_line -; CHECK-NEXT: .b32 2340 // DW_AT_import +; CHECK-NEXT: .b32 2356 // DW_AT_import ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 32 // Abbrev [32] 0x924:0x1b DW_TAG_subprogram +; CHECK-NEXT: .b8 32 // Abbrev [32] 0x934:0x1b DW_TAG_subprogram ; CHECK-NEXT: .b8 95 // DW_AT_MIPS_linkage_name ; CHECK-NEXT: .b8 90 ; CHECK-NEXT: .b8 76 @@ -2614,12 +2630,12 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 4 // DW_AT_decl_file ; CHECK-NEXT: .b8 44 // DW_AT_decl_line -; CHECK-NEXT: .b32 2367 // DW_AT_type +; CHECK-NEXT: .b32 2383 // DW_AT_type ; CHECK-NEXT: .b8 1 // 
DW_AT_declaration -; CHECK-NEXT: .b8 7 // Abbrev [7] 0x939:0x5 DW_TAG_formal_parameter -; CHECK-NEXT: .b32 2367 // DW_AT_type +; CHECK-NEXT: .b8 7 // Abbrev [7] 0x949:0x5 DW_TAG_formal_parameter +; CHECK-NEXT: .b32 2383 // DW_AT_type ; CHECK-NEXT: .b8 0 // End Of Children Mark -; CHECK-NEXT: .b8 10 // Abbrev [10] 0x93f:0x11 DW_TAG_base_type +; CHECK-NEXT: .b8 10 // Abbrev [10] 0x94f:0x11 DW_TAG_base_type ; CHECK-NEXT: .b8 108 // DW_AT_name ; CHECK-NEXT: .b8 111 ; CHECK-NEXT: .b8 110 @@ -2636,7 +2652,7 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT: .b8 0 ; CHECK-NEXT: .b8 5 // DW_AT_encoding ; CHECK-NEXT: .b8 8 // DW_AT_byte_size -; CHECK-NEXT: .b8 10 // Abbrev [10] 0x950:0x7 DW_TAG_base_type +; CHECK-NEXT: .b8 10 // Abbrev [10] 0x960:0x7 DW_TAG_base_type ; CHECK-NEXT: .b8 105 // DW_AT_name ; CHECK-NEXT: .b8 110 ; CHECK-NEXT: .b8 116 diff --git a/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll new file mode 100644 index 0000000000000..dbd2662de4274 --- /dev/null +++ b/llvm/test/Transforms/InferAddressSpaces/NVPTX/arguments.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=infer-address-spaces %s | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + + +define ptx_kernel i32 @test_kernel(ptr %a, ptr byval(i32) %b) { +; CHECK-LABEL: define ptx_kernel i32 @test_kernel( +; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1) +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr addrspace(1) [[TMP1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[B]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %v1 = load i32, ptr %a + %v2 = load i32, ptr %b + %sum = add i32 %v1, %v2 + ret i32 %sum +} + +define i32 @test_device(ptr %a, ptr byval(i32) %b) { +; CHECK-LABEL: define i32 @test_device( +; CHECK-SAME: ptr [[A:%.*]], ptr byval(i32) [[B:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(5) +; CHECK-NEXT: [[V1:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr addrspace(5) [[TMP1]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[V1]], [[V2]] +; CHECK-NEXT: ret i32 [[SUM]] +; + %v1 = load i32, ptr %a + %v2 = load i32, ptr %b + %sum = add i32 %v1, %v2 + ret i32 %sum +} From ee1ee1144a20c439432a4f6075bdd7aa0afcfb9f Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Thu, 3 Apr 2025 16:49:59 -0700 Subject: [PATCH 0597/1029] Fix unused variable warning in non-debug build after 7d3dfc862d283319d01997c0672c50b4a082aa4e (NFC) --- llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp index b871ecfb4f0d8..c2dae5e3e5443 100644 --- a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp @@ -235,7 +235,7 @@ static void printSymbolEntry(raw_ostream &OS, Error XCOFFLinkGraphBuilder::processCsectsAndSymbols() { LLVM_DEBUG(dbgs() << " Creating graph blocks and symbols...\n"); - for (auto [K, V] : SectionTable) { + for ([[maybe_unused]] auto [K, V] : SectionTable) { LLVM_DEBUG(dbgs() << " section entry(idx: " << K << " section: " << V.Section->getName() << ")\n"); } From c0079ba3dd31dd928df0c7f0f0e6106260f0ca19 Mon Sep 17 
00:00:00 2001 From: Michael Jones Date: Thu, 3 Apr 2025 16:53:55 -0700 Subject: [PATCH 0598/1029] [libc] Make utimes_test more stable (#134321) The test for utimes added in #134167 might fail if the file for one test hasn't been cleaned up by the OS before the second test starts. This patch makes the tests use different files. --- libc/test/src/sys/time/utimes_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/test/src/sys/time/utimes_test.cpp b/libc/test/src/sys/time/utimes_test.cpp index b97befb8626e3..69607ba928e1e 100644 --- a/libc/test/src/sys/time/utimes_test.cpp +++ b/libc/test/src/sys/time/utimes_test.cpp @@ -17,13 +17,12 @@ #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -constexpr const char *FILE_PATH = "utimes.test"; - // SUCCESS: Takes a file and successfully updates // its last access and modified times. TEST(LlvmLibcUtimesTest, ChangeTimesSpecific) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *FILE_PATH = "utimes_pass.test"; auto TEST_FILE = libc_make_test_file_path(FILE_PATH); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT); ASSERT_GT(fd, 0); @@ -62,6 +61,7 @@ TEST(LlvmLibcUtimesTest, InvalidMicroseconds) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + constexpr const char *FILE_PATH = "utimes_fail.test"; auto TEST_FILE = libc_make_test_file_path(FILE_PATH); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT); ASSERT_GT(fd, 0); From 65b85bf8bcb6c88f249de99c6b568f13b9794c1c Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Thu, 3 Apr 2025 16:58:11 -0700 Subject: [PATCH 0599/1029] [flang] Fixed driver link LIT test for PPC targets. (#134320) After #131041, the F128 libraries are not linked for PPC targets even when the driver is built with FLANG_RUNTIME_F128_MATH_LIB. --- flang/test/Driver/linker-flags.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90 index 4e62a8c32d360..20104276d2e4a 100644 --- a/flang/test/Driver/linker-flags.f90 +++ b/flang/test/Driver/linker-flags.f90 @@ -2,7 +2,7 @@ ! invocation. These libraries are added on top of other standard runtime ! libraries that the Clang driver will include. -! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128%f128-lib +! RUN: %flang -### --target=ppc64le-linux-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,UNIX-F128NONE ! RUN: %flang -### --target=aarch64-apple-darwin %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,DARWIN,DARWIN-F128%f128-lib ! RUN: %flang -### --target=sparc-sun-solaris2.11 %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX,SOLARIS-F128%f128-lib ! 
RUN: %flang -### --target=x86_64-unknown-freebsd %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,BSD,BSD-F128%f128-lib From fcfbef5582f097f3933470755c3f873e9efce03f Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Thu, 3 Apr 2025 18:15:30 -0700 Subject: [PATCH 0600/1029] [mlir][tosa] Remove extra declarations of MulOperandsAndResultElementType in TosaOps.td (#134300) Minor code cleanup Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index c5314f8d9d406..c4050dcd6baf9 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -974,11 +974,6 @@ def Tosa_MinimumOp : Tosa_ElementwiseOp<"minimum", [ ]; } -def MulOperandsAndResultElementType : - NativeOpTrait<"MulOperandsAndResultElementType"> { - let cppNamespace = "mlir::OpTrait::tosa"; -} - //===----------------------------------------------------------------------===// // Operator: mul //===----------------------------------------------------------------------===// From 7d05c2326c579ff93b7b2a412cc2fd2b10598b34 Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Thu, 3 Apr 2025 18:15:39 -0700 Subject: [PATCH 0601/1029] [mlir][tosa] Remove extra trailing whitespace (#134290) Trivial change. Signed-off-by: Jerry Ge --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index c4050dcd6baf9..cda75da57f1ad 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -2246,7 +2246,7 @@ def Tosa_CastOp: Tosa_Op<"cast", [Pure, let description = [{ Casts a tensor from one data type to another. * This table is showing the supported conversions from the TOSA Specification. - * The MLIR dialect here can be used to represent other conversions. + * The MLIR dialect here can be used to represent other conversions. | Mode | Input | Output | |--------------------------|---------|---------| From ae5306f30e17719d7589b7941bd107ce58a04228 Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Fri, 4 Apr 2025 06:57:21 +0530 Subject: [PATCH 0602/1029] [RISCV] Add symbol parsing support for the Xqcibi branch immediate instructions (#134233) This patch adds support for parsing symbols in the Xqcibi branch immediate instructions. While the 32-bit branch instructions use the same instruction format and relocation as the existing branch instructions in RISCV, the 48-bit ones use the `InstFormatQC_EB` instruction format and the `R_RISCV_QC_E_BRANCH` relocation that is defined in `BinaryFormat/ELFRelocs/RISCV_nonstandard.def`. Vendor relocation support will be added in a later patch.
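For context on how these fixups resolve: both fixup_riscv_branch and the new fixup_riscv_qc_e_branch carry a 13-bit signed, 2-byte-aligned PC-relative offset (hence the [-4096, 4095] range check in the backend changes below). A minimal standalone C++ sketch of the value packing; the helper name is illustrative, and the exact bit scatter is the standard RISC-V B-type layout assumed from the ISA spec rather than quoted from this patch:

    #include <cassert>
    #include <cstdint>

    // Scatter a 13-bit signed branch offset into the B-type immediate fields:
    // imm[12] -> bit 31, imm[10:5] -> bits 30:25, imm[4:1] -> bits 11:8,
    // imm[11] -> bit 7. Bit 0 of the offset is always zero and is not stored.
    uint64_t packBTypeImm(int64_t Value) {
      assert(Value >= -4096 && Value <= 4095 && "fixup value out of range");
      assert((Value & 1) == 0 && "fixup value must be 2-byte aligned");
      uint64_t Sbit = (Value >> 12) & 0x1;
      uint64_t Hi1 = (Value >> 11) & 0x1;
      uint64_t Mid6 = (Value >> 5) & 0x3f;
      uint64_t Lo4 = (Value >> 1) & 0xf;
      return (Sbit << 31) | (Mid6 << 25) | (Lo4 << 8) | (Hi1 << 7);
    }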
--- .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 3 ++ .../MCTargetDesc/RISCVELFObjectWriter.cpp | 2 + .../RISCV/MCTargetDesc/RISCVFixupKinds.h | 3 ++ .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 2 + llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 2 +- llvm/test/MC/RISCV/xqcibi-relocations.s | 44 +++++++++++++++++++ 6 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MC/RISCV/xqcibi-relocations.s diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 37cd79e890263..10a26554ed672 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -92,6 +92,7 @@ RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_riscv_tlsdesc_load_lo12", 20, 12, 0}, {"fixup_riscv_tlsdesc_add_lo12", 20, 12, 0}, {"fixup_riscv_tlsdesc_call", 0, 0, 0}, + {"fixup_riscv_qc_e_branch", 0, 48, MCFixupKindInfo::FKF_IsPCRel}, }; static_assert((std::size(Infos)) == RISCV::NumTargetFixupKinds, "Not all fixup kinds added to Infos array"); @@ -165,6 +166,7 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced( // in the range [-2048, 2046]. return Offset > 2046 || Offset < -2048; case RISCV::fixup_riscv_branch: + case RISCV::fixup_riscv_qc_e_branch: // For conditional branch instructions the immediate must be // in the range [-4096, 4095]. return !isInt<13>(Offset); @@ -464,6 +466,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = (Sbit << 19) | (Lo10 << 9) | (Mid1 << 8) | Hi8; return Value; } + case RISCV::fixup_riscv_qc_e_branch: case RISCV::fixup_riscv_branch: { if (!isInt<13>(Value)) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 5faeb98f0abf5..2e73ba54ae21b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -120,6 +120,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_RISCV_CALL_PLT; case RISCV::fixup_riscv_call_plt: return ELF::R_RISCV_CALL_PLT; + case RISCV::fixup_riscv_qc_e_branch: + return ELF::R_RISCV_QC_E_BRANCH; } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 821372d3d39a6..df7916a4490b7 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -77,6 +77,9 @@ enum Fixups { fixup_riscv_tlsdesc_load_lo12, fixup_riscv_tlsdesc_add_lo12, fixup_riscv_tlsdesc_call, + // 12-bit fixup for symbol references in the 48-bit Xqcibi branch immediate + // instructions + fixup_riscv_qc_e_branch, // Used as a sentinel, must be the last fixup_riscv_invalid, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 69ad3d936fbbe..fc98859314680 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -569,6 +569,8 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, FixupKind = RISCV::fixup_riscv_rvc_branch; } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; + } else if (MIFrm == RISCVII::InstFormatQC_EB) { + FixupKind = RISCV::fixup_riscv_qc_e_branch; } } diff --git 
a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 93eb82b012eb4..64d6f6d8f8bbf 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -550,7 +550,7 @@ class QCIBranchInst_rii funct3, DAGOperand InTyImm5, string opcodestr> class QCIBranchInst48_rii funct5, DAGOperand InTyImm16, string opcodestr> : RVInst48<(outs), (ins GPRNoX0:$rs1, InTyImm16:$imm16, bare_simm13_lsb0:$imm12), - opcodestr, "$rs1, $imm16, $imm12", [], InstFormatOther> { + opcodestr, "$rs1, $imm16, $imm12", [], InstFormatQC_EB> { bits<5> rs1; bits<16> imm16; bits<12> imm12; diff --git a/llvm/test/MC/RISCV/xqcibi-relocations.s b/llvm/test/MC/RISCV/xqcibi-relocations.s new file mode 100644 index 0000000000000..4899e5f1eac46 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcibi-relocations.s @@ -0,0 +1,44 @@ +# RUN: llvm-mc -triple riscv32 -mattr=+experimental-xqcibi %s -show-encoding \ +# RUN: | FileCheck -check-prefix=INSTR -check-prefix=FIXUP %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-xqcibi %s -o %t.o +# RUN: llvm-readobj -r %t.o | FileCheck -check-prefix=RELOC %s + +# Check prefixes: +# RELOC - Check the relocation in the object. +# FIXUP - Check the fixup on the instruction. +# INSTR - Check the instruction is handled properly by the ASMPrinter. + +.text + +# Check that a branch to an undefined symbol is handled +# FIXME: This should be relaxed to an inverse branch and jump +qc.bnei x6, 10, foo +# RELOC: R_RISCV_BRANCH foo 0x0 +# INSTR: qc.bnei t1, 10, foo +# FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_branch + +# FIXME: This should be relaxed to an inverse branch and jump +qc.e.bgeui x8, 12, foo +# RELOC: R_RISCV_CUSTOM193 foo 0x0 +# INSTR: qc.e.bgeui s0, 12, foo +# FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_qc_e_branch + +# Check that a label in a different section is handled similarly to an undefined symbol +# FIXME: This should be relaxed to an inverse branch and jump +qc.e.bltui x4, 9, .bar +# RELOC: R_RISCV_CUSTOM193 .bar 0x0 +# INSTR: qc.e.bltui tp, 9, .bar +# FIXUP: fixup A - offset: 0, value: .bar, kind: fixup_riscv_qc_e_branch + +# Check that branches to a defined symbol are handled correctly +qc.e.beqi x7, 8, .L1 +# INSTR: qc.e.beqi t2, 8, .L1 +# FIXUP: fixup A - offset: 0, value: .L1, kind: fixup_riscv_qc_e_branch + +.L1: + ret + +.section .t2 + +.bar: + ret From b9891715af7ab055ccb7ad424aefe54c67d77988 Mon Sep 17 00:00:00 2001 From: Rodrigo Rocha Date: Fri, 4 Apr 2025 02:34:24 +0100 Subject: [PATCH 0603/1029] [BOLT] Handle generation of compare and jump sequences (#131949) This patch fixes the following two issues with createCmpJE for AArch64: 1. Avoids overwriting the value of the input register RegNo by using XZR as the destination register: subs xzr, RegNo, #Imm, which is equivalent to a simple cmp RegNo, #Imm. 2. The immediate operand to the Bcc instruction must be EQ instead of #Imm. This patch also adds a new function for createCmpJNE and unit tests for both createCmpJE and createCmpJNE for X86 and AArch64.
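To make the fix concrete, a sketch of the intended use; BC, Target, and BB are placeholders standing in for a BinaryContext, a branch target symbol, and a basic block, as in the unit tests added below, and the surrounding pass is hypothetical:

    // Emit "if (x0 == 42) goto Target" without clobbering x0.
    InstructionListType Guard =
        BC.MIB->createCmpJE(AArch64::X0, /*Imm=*/42, Target, BC.Ctx.get());
    // With this patch the AArch64 sequence is:
    //   subs xzr, x0, #42   // i.e. cmp x0, #42; result discarded, x0 intact
    //   b.eq Target         // condition code EQ, not the compared immediate
    BB.addInstructions(Guard.begin(), Guard.end());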
--- bolt/include/bolt/Core/MCPlusBuilder.h | 9 ++ .../Target/AArch64/AArch64MCPlusBuilder.cpp | 32 ++++++- bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 12 +++ bolt/unittests/Core/MCPlusBuilder.cpp | 92 +++++++++++++++++++ 4 files changed, 144 insertions(+), 1 deletion(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index bbef65700b2a5..fa942accbea4e 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1751,6 +1751,15 @@ class MCPlusBuilder { return {}; } + /// Create a sequence of instructions to compare contents of a register + /// \p RegNo to immediate \p Imm and jump to \p Target if they are different. + virtual InstructionListType createCmpJNE(MCPhysReg RegNo, int64_t Imm, + const MCSymbol *Target, + MCContext *Ctx) const { + llvm_unreachable("not implemented"); + return {}; + } + /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return /// (dest + n) instead of dest. virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const { diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 2a648baa4d514..b50a37abeda48 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -1361,17 +1361,47 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { int getUncondBranchEncodingSize() const override { return 28; } + // This helper function creates the snippet of code that compares a register + // RegNo with an immediate Imm, and jumps to Target if they are equal. + // cmp RegNo, #Imm + // b.eq Target + // where cmp is an alias for subs, which results in the code below: + // subs xzr, RegNo, #Imm + // b.eq Target. InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm, const MCSymbol *Target, MCContext *Ctx) const override { InstructionListType Code; Code.emplace_back(MCInstBuilder(AArch64::SUBSXri) - .addReg(RegNo) + .addReg(AArch64::XZR) .addReg(RegNo) .addImm(Imm) .addImm(0)); Code.emplace_back(MCInstBuilder(AArch64::Bcc) + .addImm(AArch64CC::EQ) + .addExpr(MCSymbolRefExpr::create( + Target, MCSymbolRefExpr::VK_None, *Ctx))); + return Code; + } + + // This helper function creates the snippet of code that compares a register + // RegNo with an immediate Imm, and jumps to Target if they are not equal. + // cmp RegNo, #Imm + // b.ne Target + // where cmp is an alias for subs, which results in the code below: + // subs xzr, RegNo, #Imm + // b.ne Target.
+ InstructionListType createCmpJNE(MCPhysReg RegNo, int64_t Imm, + const MCSymbol *Target, + MCContext *Ctx) const override { + InstructionListType Code; + Code.emplace_back(MCInstBuilder(AArch64::SUBSXri) + .addReg(AArch64::XZR) + .addReg(RegNo) .addImm(Imm) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::Bcc) + .addImm(AArch64CC::NE) .addExpr(MCSymbolRefExpr::create( Target, MCSymbolRefExpr::VK_None, *Ctx))); return Code; diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 0b2617600f5c0..8e459e10244fd 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -2426,6 +2426,18 @@ class X86MCPlusBuilder : public MCPlusBuilder { return Code; } + InstructionListType createCmpJNE(MCPhysReg RegNo, int64_t Imm, + const MCSymbol *Target, + MCContext *Ctx) const override { + InstructionListType Code; + Code.emplace_back(MCInstBuilder(X86::CMP64ri8).addReg(RegNo).addImm(Imm)); + Code.emplace_back(MCInstBuilder(X86::JCC_1) + .addExpr(MCSymbolRefExpr::create( + Target, MCSymbolRefExpr::VK_None, *Ctx)) + .addImm(X86::COND_NE)); + return Code; + } + std::optional createRelocation(const MCFixup &Fixup, const MCAsmBackend &MAB) const override { diff --git a/bolt/unittests/Core/MCPlusBuilder.cpp b/bolt/unittests/Core/MCPlusBuilder.cpp index d367eb07f7767..a3113cab3d334 100644 --- a/bolt/unittests/Core/MCPlusBuilder.cpp +++ b/bolt/unittests/Core/MCPlusBuilder.cpp @@ -107,6 +107,54 @@ TEST_P(MCPlusBuilderTester, AliasSmallerX0) { testRegAliases(Triple::aarch64, AArch64::X0, AliasesX0, AliasesX0Count, true); } +TEST_P(MCPlusBuilderTester, AArch64_CmpJE) { + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true); + std::unique_ptr BB = BF->createBasicBlock(); + + InstructionListType Instrs = + BC->MIB->createCmpJE(AArch64::X0, 2, BB->getLabel(), BC->Ctx.get()); + BB->addInstructions(Instrs.begin(), Instrs.end()); + BB->addSuccessor(BB.get()); + + auto II = BB->begin(); + ASSERT_EQ(II->getOpcode(), AArch64::SUBSXri); + ASSERT_EQ(II->getOperand(0).getReg(), AArch64::XZR); + ASSERT_EQ(II->getOperand(1).getReg(), AArch64::X0); + ASSERT_EQ(II->getOperand(2).getImm(), 2); + ASSERT_EQ(II->getOperand(3).getImm(), 0); + II++; + ASSERT_EQ(II->getOpcode(), AArch64::Bcc); + ASSERT_EQ(II->getOperand(0).getImm(), AArch64CC::EQ); + const MCSymbol *Label = BC->MIB->getTargetSymbol(*II, 1); + ASSERT_EQ(Label, BB->getLabel()); +} + +TEST_P(MCPlusBuilderTester, AArch64_CmpJNE) { + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true); + std::unique_ptr BB = BF->createBasicBlock(); + + InstructionListType Instrs = + BC->MIB->createCmpJNE(AArch64::X0, 2, BB->getLabel(), BC->Ctx.get()); + BB->addInstructions(Instrs.begin(), Instrs.end()); + BB->addSuccessor(BB.get()); + + auto II = BB->begin(); + ASSERT_EQ(II->getOpcode(), AArch64::SUBSXri); + ASSERT_EQ(II->getOperand(0).getReg(), AArch64::XZR); + ASSERT_EQ(II->getOperand(1).getReg(), AArch64::X0); + ASSERT_EQ(II->getOperand(2).getImm(), 2); + ASSERT_EQ(II->getOperand(3).getImm(), 0); + II++; + ASSERT_EQ(II->getOpcode(), AArch64::Bcc); + ASSERT_EQ(II->getOperand(0).getImm(), AArch64CC::NE); + const MCSymbol *Label = BC->MIB->getTargetSymbol(*II, 1); + ASSERT_EQ(Label, BB->getLabel()); +} + #endif // AARCH64_AVAILABLE #ifdef X86_AVAILABLE @@ -143,6 +191,50 @@ TEST_P(MCPlusBuilderTester, ReplaceRegWithImm) { ASSERT_EQ(II->getOperand(1).getImm(), 1); } 
+TEST_P(MCPlusBuilderTester, X86_CmpJE) { + if (GetParam() != Triple::x86_64) + GTEST_SKIP(); + BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true); + std::unique_ptr BB = BF->createBasicBlock(); + + InstructionListType Instrs = + BC->MIB->createCmpJE(X86::EAX, 2, BB->getLabel(), BC->Ctx.get()); + BB->addInstructions(Instrs.begin(), Instrs.end()); + BB->addSuccessor(BB.get()); + + auto II = BB->begin(); + ASSERT_EQ(II->getOpcode(), X86::CMP64ri8); + ASSERT_EQ(II->getOperand(0).getReg(), X86::EAX); + ASSERT_EQ(II->getOperand(1).getImm(), 2); + II++; + ASSERT_EQ(II->getOpcode(), X86::JCC_1); + const MCSymbol *Label = BC->MIB->getTargetSymbol(*II, 0); + ASSERT_EQ(Label, BB->getLabel()); + ASSERT_EQ(II->getOperand(1).getImm(), X86::COND_E); +} + +TEST_P(MCPlusBuilderTester, X86_CmpJNE) { + if (GetParam() != Triple::x86_64) + GTEST_SKIP(); + BinaryFunction *BF = BC->createInjectedBinaryFunction("BF", true); + std::unique_ptr BB = BF->createBasicBlock(); + + InstructionListType Instrs = + BC->MIB->createCmpJNE(X86::EAX, 2, BB->getLabel(), BC->Ctx.get()); + BB->addInstructions(Instrs.begin(), Instrs.end()); + BB->addSuccessor(BB.get()); + + auto II = BB->begin(); + ASSERT_EQ(II->getOpcode(), X86::CMP64ri8); + ASSERT_EQ(II->getOperand(0).getReg(), X86::EAX); + ASSERT_EQ(II->getOperand(1).getImm(), 2); + II++; + ASSERT_EQ(II->getOpcode(), X86::JCC_1); + const MCSymbol *Label = BC->MIB->getTargetSymbol(*II, 0); + ASSERT_EQ(Label, BB->getLabel()); + ASSERT_EQ(II->getOperand(1).getImm(), X86::COND_NE); +} + #endif // X86_AVAILABLE TEST_P(MCPlusBuilderTester, Annotation) { From 21468261695918cd74e0079153eedbccb689fe20 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 3 Apr 2025 18:36:45 -0700 Subject: [PATCH 0604/1029] [ctxprof] Support for "move" semantics for the contextual root (#134192) This PR finishes what PR #133992 started. --- .../Transforms/Utils/FunctionImportUtils.h | 25 ++++------------ llvm/lib/Transforms/IPO/FunctionImport.cpp | 18 ++++++++++++ .../Transforms/Utils/FunctionImportUtils.cpp | 29 ++++++++++++++++++- .../ThinLTO/X86/ctxprof-separate-module.ll | 22 ++++++++++++-- 4 files changed, 70 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h b/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h index 6d83b615d5f13..28ba20bc18cf9 100644 --- a/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h +++ b/llvm/include/llvm/Transforms/Utils/FunctionImportUtils.h @@ -97,29 +97,14 @@ class FunctionImportGlobalProcessing { /// linkage for a required promotion of a local to global scope. GlobalValue::LinkageTypes getLinkage(const GlobalValue *SGV, bool DoPromote); + /// The symbols with these names are moved to a different module and should be + /// promoted to external linkage where they are defined. + DenseSet SymbolsToMove; + public: FunctionImportGlobalProcessing(Module &M, const ModuleSummaryIndex &Index, SetVector *GlobalsToImport, - bool ClearDSOLocalOnDeclarations) - : M(M), ImportIndex(Index), GlobalsToImport(GlobalsToImport), - ClearDSOLocalOnDeclarations(ClearDSOLocalOnDeclarations) { - // If we have a ModuleSummaryIndex but no function to import, - // then this is the primary module being compiled in a ThinLTO - // backend compilation, and we need to see if it has functions that - // may be exported to another backend compilation. - if (!GlobalsToImport) - HasExportedFunctions = ImportIndex.hasExportedFunctions(M); - -#ifndef NDEBUG - SmallVector Vec; - // First collect those in the llvm.used set. 
- collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/false); - // Next collect those in the llvm.compiler.used set. - collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/true); - Used = {llvm::from_range, Vec}; -#endif - } - + bool ClearDSOLocalOnDeclarations); void run(); }; diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 05c41eb8d908b..d93bd44de52fe 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -182,6 +182,15 @@ static cl::opt CtxprofMoveRootsToOwnModule( "their own module."), cl::Hidden, cl::init(false)); +cl::list MoveSymbolGUID( + "thinlto-move-symbols", + cl::desc( + "Move the symbols with the given name. This will delete these symbols " + "wherever they are originally defined, and make sure their " + "linkage is External where they are imported. It is meant to be " + "used with the name of contextual profiling roots."), + cl::Hidden); + namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; } @@ -1859,6 +1868,15 @@ Expected FunctionImporter::importFunctions( LLVM_DEBUG(dbgs() << "Starting import for Module " << DestModule.getModuleIdentifier() << "\n"); unsigned ImportedCount = 0, ImportedGVCount = 0; + // Before carrying out any imports, see if this module defines functions in + // MoveSymbolGUID. If it does, delete them here (but leave the declaration). + // The function will be imported elsewhere, with external linkage, and the + // destination doesn't yet have its definition. + DenseSet MoveSymbolGUIDSet; + MoveSymbolGUIDSet.insert_range(MoveSymbolGUID); + for (auto &F : DestModule) + if (!F.isDeclaration() && MoveSymbolGUIDSet.contains(F.getGUID())) + F.deleteBody(); IRMover Mover(DestModule); diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index ae1af943bc11c..81e461e28df17 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -24,6 +24,31 @@ static cl::opt UseSourceFilenameForPromotedLocals( "This requires that the source filename has a unique name / " "path to avoid name collisions.")); +extern cl::list MoveSymbolGUID; + +FunctionImportGlobalProcessing::FunctionImportGlobalProcessing( + Module &M, const ModuleSummaryIndex &Index, + SetVector *GlobalsToImport, bool ClearDSOLocalOnDeclarations) + : M(M), ImportIndex(Index), GlobalsToImport(GlobalsToImport), + ClearDSOLocalOnDeclarations(ClearDSOLocalOnDeclarations) { + // If we have a ModuleSummaryIndex but no function to import, + // then this is the primary module being compiled in a ThinLTO + // backend compilation, and we need to see if it has functions that + // may be exported to another backend compilation. + if (!GlobalsToImport) + HasExportedFunctions = ImportIndex.hasExportedFunctions(M); + +#ifndef NDEBUG + SmallVector Vec; + // First collect those in the llvm.used set. + collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/false); + // Next collect those in the llvm.compiler.used set. + collectUsedGlobalVariables(M, Vec, /*CompilerUsed=*/true); + Used = {llvm::from_range, Vec}; +#endif + SymbolsToMove.insert_range(MoveSymbolGUID); +} + /// Checks if we should import SGV as a definition, otherwise import as a /// declaration.
bool FunctionImportGlobalProcessing::doImportAsDefinition( @@ -147,7 +172,9 @@ FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV, // and/or optimization, but are turned into declarations later // during the EliminateAvailableExternally pass. if (doImportAsDefinition(SGV) && !isa(SGV)) - return GlobalValue::AvailableExternallyLinkage; + return SymbolsToMove.contains(SGV->getGUID()) + ? GlobalValue::ExternalLinkage + : GlobalValue::AvailableExternallyLinkage; // An imported external declaration stays external. return SGV->getLinkage(); diff --git a/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll b/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll index 391fe21a1b638..b6824a0f9f08c 100644 --- a/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll +++ b/llvm/test/ThinLTO/X86/ctxprof-separate-module.ll @@ -22,15 +22,31 @@ ; RUN: -r %t/m2.bc,m2_f1,plx \ ; RUN: -r %t/m3.bc,m1_f1 \ ; RUN: -r %t/m3.bc,m3_f1,plx -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=ABSENT-MSG + +; also add the move semantics for the root: +; RUN: llvm-lto2 run %t/m1.bc %t/m2.bc %t/m3.bc %t/6019442868614718803.bc -thinlto-move-ctxprof-trees \ +; RUN: -thinlto-move-symbols=6019442868614718803 \ +; RUN: -o %t/result-with-move.o -save-temps \ +; RUN: -use-ctx-profile=%t/ctxprof.bitstream \ +; RUN: -r %t/m1.bc,m1_f1,plx \ +; RUN: -r %t/m2.bc,m2_f1,plx \ +; RUN: -r %t/m3.bc,m1_f1 \ +; RUN: -r %t/m3.bc,m3_f1,plx -debug-only=function-import 2>&1 | FileCheck %s --check-prefix=ABSENT-MSG + ; RUN: llvm-dis %t/result.o.4.3.import.bc -o - | FileCheck %s ; RUN: llvm-dis %t/result.o.3.3.import.bc -o - | FileCheck %s --check-prefix=ABSENT +; RUN: llvm-dis %t/result-with-move.o.1.3.import.bc -o - | FileCheck %s --check-prefix=WITHMOVE-SRC +; RUN: llvm-dis %t/result-with-move.o.4.3.import.bc -o - | FileCheck %s --check-prefix=WITHMOVE-DEST +; RUN: llvm-dis %t/result.o.1.3.import.bc -o - | FileCheck %s --check-prefix=WITHOUTMOVE-SRC ; -; -; CHECK: m1_f1() -; CHECK: m2_f1() +; CHECK: define available_externally void @m1_f1() +; CHECK: define available_externally void @m2_f1() ; ABSENT: declare void @m1_f1() ; ABSENT-MSG: Skipping over 6019442868614718803 because its import is handled in a different module. ; +; WITHMOVE-SRC: declare dso_local void @m1_f1 +; WITHMOVE-DEST: define dso_local void @m1_f1 +; WITHOUTMOVE-SRC: define dso_local void @m1_f1 ;--- ctxprof.yaml Contexts: - From 4532512f6c2558fa7d5ba9f35fbce7bad76c1380 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 3 Apr 2025 19:02:46 -0700 Subject: [PATCH 0605/1029] [ctxprof] Move `MoveSymbolGUID` to address dependency issues (#134334) See PR #134192 --- llvm/lib/Transforms/IPO/FunctionImport.cpp | 9 +-------- llvm/lib/Transforms/Utils/FunctionImportUtils.cpp | 9 ++++++++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index d93bd44de52fe..f1dce5d7904f9 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -182,14 +182,7 @@ static cl::opt CtxprofMoveRootsToOwnModule( "their own module."), cl::Hidden, cl::init(false)); -cl::list MoveSymbolGUID( - "thinlto-move-symbols", - cl::desc( - "Move the symbols with the given name. This will delete these symbols " - "wherever they are originally defined, and make sure their " - "linkage is External where they are imported. 
It is meant to be " - "used with the name of contextual profiling roots."), - cl::Hidden); +extern cl::list MoveSymbolGUID; namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 81e461e28df17..3bbe875bbe9e5 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -24,7 +24,14 @@ static cl::opt UseSourceFilenameForPromotedLocals( "This requires that the source filename has a unique name / " "path to avoid name collisions.")); -extern cl::list MoveSymbolGUID; +cl::list MoveSymbolGUID( + "thinlto-move-symbols", + cl::desc( + "Move the symbols with the given name. This will delete these symbols " + "wherever they are originally defined, and make sure their " + "linkage is External where they are imported. It is meant to be " + "used with the name of contextual profiling roots."), + cl::Hidden); FunctionImportGlobalProcessing::FunctionImportGlobalProcessing( Module &M, const ModuleSummaryIndex &Index, From 4088c70f4e7eae0bfc2916cc88c7301ec5e19daf Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 4 Apr 2025 10:53:21 +0900 Subject: [PATCH 0606/1029] CGHLSLBuiltins.cpp: Suppress a warning in #131237 [-Wunused-variable] --- clang/lib/CodeGen/CGHLSLBuiltins.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 07f6d0953f026..27d1c69439944 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -381,8 +381,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, ArrayRef{Op0, Op1}, nullptr, "hlsl.dot"); } case Builtin::BI__builtin_hlsl_dot2add: { - llvm::Triple::ArchType Arch = CGM.getTarget().getTriple().getArch(); - assert(Arch == llvm::Triple::dxil && + assert(CGM.getTarget().getTriple().getArch() == llvm::Triple::dxil && "Intrinsic dot2add is only allowed for dxil architecture"); Value *A = EmitScalarExpr(E->getArg(0)); Value *B = EmitScalarExpr(E->getArg(1)); From c9f6d26e049849a8d8d31deaf6c710894a361b0e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 3 Apr 2025 19:22:45 -0700 Subject: [PATCH 0607/1029] [MC] Merge MCAsmLexer.{h,cpp} into AsmLexer.{h,cpp} (#134207) 2b11c7de4ae182496438e166cb6758d41b6e1740 introduced `llvm/include/llvm/MC/MCAsmLexer.h` and made `AsmLexer` inherit from `MCAsmLexer`, likely to allow target-specific parsers to depend solely on `MCAsmLexer`. However, this separation now seems unnecessary and confusing. `MCAsmLexer` defines virtual functions with `AsmLexer` as its only implementation, and `AsmLexer` itself has few extra public methods. To simplify the codebase, this change merges MCAsmLexer.{h,cpp} into AsmLexer.{h,cpp}. MCAsmLexer.h is temporarily kept as a forwarder. Note: I doubt that a downstream lexer handling an assembly syntax significantly different from the standard GNU Assembler syntax would want to inherit from `MCAsmLexer`. Instead, it's more likely they'd extend `AsmLexer` by adding new states and modifying its internal logic, as seen with variables for MASM, M68k, and HLASM. 
--- llvm/include/llvm/MC/MCParser/AsmLexer.h | 170 +++++++++++++++- llvm/include/llvm/MC/MCParser/MCAsmLexer.h | 185 +----------------- llvm/lib/MC/MCParser/AsmLexer.cpp | 86 ++++++++ llvm/lib/MC/MCParser/CMakeLists.txt | 1 - llvm/lib/MC/MCParser/MCAsmLexer.cpp | 105 ---------- .../secondary/llvm/lib/MC/MCParser/BUILD.gn | 1 - 6 files changed, 255 insertions(+), 293 deletions(-) delete mode 100644 llvm/lib/MC/MCParser/MCAsmLexer.cpp diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index 735b0c114f2aa..d1599dc47e76b 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -13,16 +13,182 @@ #ifndef LLVM_MC_MCPARSER_ASMLEXER_H #define LLVM_MC_MCPARSER_ASMLEXER_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCAsmMacro.h" +#include +#include #include +#include namespace llvm { class MCAsmInfo; +/// A callback class which is notified of each comment in an assembly file as +/// it is lexed. +class AsmCommentConsumer { +public: + virtual ~AsmCommentConsumer() = default; + + /// Callback function for when a comment is lexed. Loc is the start of the + /// comment text (excluding the comment-start marker). CommentText is the text + /// of the comment, excluding the comment start and end markers, and the + /// newline for single-line comments. + virtual void HandleComment(SMLoc Loc, StringRef CommentText) = 0; +}; + +/// Generic assembler lexer interface, for use by target specific assembly +/// lexers. +class MCAsmLexer { + /// The current token, stored in the base class for faster access. + SmallVector CurTok; + + /// The location and description of the current error + SMLoc ErrLoc; + std::string Err; + +protected: // Can only create subclasses. + const char *TokStart = nullptr; + bool SkipSpace = true; + bool AllowAtInIdentifier = false; + bool AllowHashInIdentifier = false; + bool IsAtStartOfStatement = true; + bool LexMasmHexFloats = false; + bool LexMasmIntegers = false; + bool LexMasmStrings = false; + bool LexMotorolaIntegers = false; + bool UseMasmDefaultRadix = false; + unsigned DefaultRadix = 10; + bool LexHLASMIntegers = false; + bool LexHLASMStrings = false; + AsmCommentConsumer *CommentConsumer = nullptr; + + MCAsmLexer(); + + virtual AsmToken LexToken() = 0; + + void SetError(SMLoc errLoc, const std::string &err) { + ErrLoc = errLoc; + Err = err; + } + +public: + MCAsmLexer(const MCAsmLexer &) = delete; + MCAsmLexer &operator=(const MCAsmLexer &) = delete; + virtual ~MCAsmLexer(); + + /// Consume the next token from the input stream and return it. + /// + /// The lexer will continuously return the end-of-file token once the end of + /// the main input file has been reached. + const AsmToken &Lex() { + assert(!CurTok.empty()); + // Mark if we parsing out a EndOfStatement. + IsAtStartOfStatement = CurTok.front().getKind() == AsmToken::EndOfStatement; + CurTok.erase(CurTok.begin()); + // LexToken may generate multiple tokens via UnLex but will always return + // the first one. Place returned value at head of CurTok vector. 
+ if (CurTok.empty()) { + AsmToken T = LexToken(); + CurTok.insert(CurTok.begin(), T); + } + return CurTok.front(); + } + + void UnLex(AsmToken const &Token) { + IsAtStartOfStatement = false; + CurTok.insert(CurTok.begin(), Token); + } + + bool isAtStartOfStatement() { return IsAtStartOfStatement; } + + virtual StringRef LexUntilEndOfStatement() = 0; + + /// Get the current source location. + SMLoc getLoc() const; + + /// Get the current (last) lexed token. + const AsmToken &getTok() const { return CurTok[0]; } + + /// Look ahead at the next token to be lexed. + const AsmToken peekTok(bool ShouldSkipSpace = true) { + AsmToken Tok; + + MutableArrayRef Buf(Tok); + size_t ReadCount = peekTokens(Buf, ShouldSkipSpace); + + assert(ReadCount == 1); + (void)ReadCount; + + return Tok; + } + + /// Look ahead an arbitrary number of tokens. + virtual size_t peekTokens(MutableArrayRef Buf, + bool ShouldSkipSpace = true) = 0; + + /// Get the current error location + SMLoc getErrLoc() { return ErrLoc; } + + /// Get the current error string + const std::string &getErr() { return Err; } + + /// Get the kind of current token. + AsmToken::TokenKind getKind() const { return getTok().getKind(); } + + /// Check if the current token has kind \p K. + bool is(AsmToken::TokenKind K) const { return getTok().is(K); } + + /// Check if the current token has kind \p K. + bool isNot(AsmToken::TokenKind K) const { return getTok().isNot(K); } + + /// Set whether spaces should be ignored by the lexer + void setSkipSpace(bool val) { SkipSpace = val; } + + bool getAllowAtInIdentifier() { return AllowAtInIdentifier; } + void setAllowAtInIdentifier(bool v) { AllowAtInIdentifier = v; } + + void setAllowHashInIdentifier(bool V) { AllowHashInIdentifier = V; } + + void setCommentConsumer(AsmCommentConsumer *CommentConsumer) { + this->CommentConsumer = CommentConsumer; + } + + /// Set whether to lex masm-style binary (e.g., 0b1101) and radix-specified + /// literals (e.g., 0ABCh [hex], 576t [decimal], 77o [octal], 1101y [binary]). + void setLexMasmIntegers(bool V) { LexMasmIntegers = V; } + + /// Set whether to use masm-style default-radix integer literals. If disabled, + /// assume decimal unless prefixed (e.g., 0x2c [hex], 077 [octal]). + void useMasmDefaultRadix(bool V) { UseMasmDefaultRadix = V; } + + unsigned getMasmDefaultRadix() const { return DefaultRadix; } + void setMasmDefaultRadix(unsigned Radix) { DefaultRadix = Radix; } + + /// Set whether to lex masm-style hex float literals, such as 3f800000r. + void setLexMasmHexFloats(bool V) { LexMasmHexFloats = V; } + + /// Set whether to lex masm-style string literals, such as 'Can''t find file' + /// and "This ""value"" not found". + void setLexMasmStrings(bool V) { LexMasmStrings = V; } + + /// Set whether to lex Motorola-style integer literals, such as $deadbeef or + /// %01010110. + void setLexMotorolaIntegers(bool V) { LexMotorolaIntegers = V; } + + /// Set whether to lex HLASM-flavour integers. For now this is only [0-9]* + void setLexHLASMIntegers(bool V) { LexHLASMIntegers = V; } + + /// Set whether to "lex" HLASM-flavour character and string literals. For now, + /// setting this option to true, will disable lexing for character and string + /// literals. + void setLexHLASMStrings(bool V) { LexHLASMStrings = V; } +}; + /// AsmLexer - Lexer class for assembly files. 
-class AsmLexer : public MCAsmLexer { +class AsmLexer final : public MCAsmLexer { const MCAsmInfo &MAI; const char *CurPtr = nullptr; diff --git a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h index 61b89b9a103f4..0b004537be4fc 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmLexer.h @@ -6,187 +6,4 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_MC_MCPARSER_MCASMLEXER_H -#define LLVM_MC_MCPARSER_MCASMLEXER_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/MC/MCAsmMacro.h" -#include -#include -#include -#include - -namespace llvm { - -/// A callback class which is notified of each comment in an assembly file as -/// it is lexed. -class AsmCommentConsumer { -public: - virtual ~AsmCommentConsumer() = default; - - /// Callback function for when a comment is lexed. Loc is the start of the - /// comment text (excluding the comment-start marker). CommentText is the text - /// of the comment, excluding the comment start and end markers, and the - /// newline for single-line comments. - virtual void HandleComment(SMLoc Loc, StringRef CommentText) = 0; -}; - - -/// Generic assembler lexer interface, for use by target specific assembly -/// lexers. -class MCAsmLexer { - /// The current token, stored in the base class for faster access. - SmallVector CurTok; - - /// The location and description of the current error - SMLoc ErrLoc; - std::string Err; - -protected: // Can only create subclasses. - const char *TokStart = nullptr; - bool SkipSpace = true; - bool AllowAtInIdentifier = false; - bool AllowHashInIdentifier = false; - bool IsAtStartOfStatement = true; - bool LexMasmHexFloats = false; - bool LexMasmIntegers = false; - bool LexMasmStrings = false; - bool LexMotorolaIntegers = false; - bool UseMasmDefaultRadix = false; - unsigned DefaultRadix = 10; - bool LexHLASMIntegers = false; - bool LexHLASMStrings = false; - AsmCommentConsumer *CommentConsumer = nullptr; - - MCAsmLexer(); - - virtual AsmToken LexToken() = 0; - - void SetError(SMLoc errLoc, const std::string &err) { - ErrLoc = errLoc; - Err = err; - } - -public: - MCAsmLexer(const MCAsmLexer &) = delete; - MCAsmLexer &operator=(const MCAsmLexer &) = delete; - virtual ~MCAsmLexer(); - - /// Consume the next token from the input stream and return it. - /// - /// The lexer will continuously return the end-of-file token once the end of - /// the main input file has been reached. - const AsmToken &Lex() { - assert(!CurTok.empty()); - // Mark if we parsing out a EndOfStatement. - IsAtStartOfStatement = CurTok.front().getKind() == AsmToken::EndOfStatement; - CurTok.erase(CurTok.begin()); - // LexToken may generate multiple tokens via UnLex but will always return - // the first one. Place returned value at head of CurTok vector. - if (CurTok.empty()) { - AsmToken T = LexToken(); - CurTok.insert(CurTok.begin(), T); - } - return CurTok.front(); - } - - void UnLex(AsmToken const &Token) { - IsAtStartOfStatement = false; - CurTok.insert(CurTok.begin(), Token); - } - - bool isAtStartOfStatement() { return IsAtStartOfStatement; } - - virtual StringRef LexUntilEndOfStatement() = 0; - - /// Get the current source location. - SMLoc getLoc() const; - - /// Get the current (last) lexed token. - const AsmToken &getTok() const { - return CurTok[0]; - } - - /// Look ahead at the next token to be lexed. 
- const AsmToken peekTok(bool ShouldSkipSpace = true) { - AsmToken Tok; - - MutableArrayRef Buf(Tok); - size_t ReadCount = peekTokens(Buf, ShouldSkipSpace); - - assert(ReadCount == 1); - (void)ReadCount; - - return Tok; - } - - /// Look ahead an arbitrary number of tokens. - virtual size_t peekTokens(MutableArrayRef Buf, - bool ShouldSkipSpace = true) = 0; - - /// Get the current error location - SMLoc getErrLoc() { - return ErrLoc; - } - - /// Get the current error string - const std::string &getErr() { - return Err; - } - - /// Get the kind of current token. - AsmToken::TokenKind getKind() const { return getTok().getKind(); } - - /// Check if the current token has kind \p K. - bool is(AsmToken::TokenKind K) const { return getTok().is(K); } - - /// Check if the current token has kind \p K. - bool isNot(AsmToken::TokenKind K) const { return getTok().isNot(K); } - - /// Set whether spaces should be ignored by the lexer - void setSkipSpace(bool val) { SkipSpace = val; } - - bool getAllowAtInIdentifier() { return AllowAtInIdentifier; } - void setAllowAtInIdentifier(bool v) { AllowAtInIdentifier = v; } - - void setAllowHashInIdentifier(bool V) { AllowHashInIdentifier = V; } - - void setCommentConsumer(AsmCommentConsumer *CommentConsumer) { - this->CommentConsumer = CommentConsumer; - } - - /// Set whether to lex masm-style binary (e.g., 0b1101) and radix-specified - /// literals (e.g., 0ABCh [hex], 576t [decimal], 77o [octal], 1101y [binary]). - void setLexMasmIntegers(bool V) { LexMasmIntegers = V; } - - /// Set whether to use masm-style default-radix integer literals. If disabled, - /// assume decimal unless prefixed (e.g., 0x2c [hex], 077 [octal]). - void useMasmDefaultRadix(bool V) { UseMasmDefaultRadix = V; } - - unsigned getMasmDefaultRadix() const { return DefaultRadix; } - void setMasmDefaultRadix(unsigned Radix) { DefaultRadix = Radix; } - - /// Set whether to lex masm-style hex float literals, such as 3f800000r. - void setLexMasmHexFloats(bool V) { LexMasmHexFloats = V; } - - /// Set whether to lex masm-style string literals, such as 'Can''t find file' - /// and "This ""value"" not found". - void setLexMasmStrings(bool V) { LexMasmStrings = V; } - - /// Set whether to lex Motorola-style integer literals, such as $deadbeef or - /// %01010110. - void setLexMotorolaIntegers(bool V) { LexMotorolaIntegers = V; } - - /// Set whether to lex HLASM-flavour integers. For now this is only [0-9]* - void setLexHLASMIntegers(bool V) { LexHLASMIntegers = V; } - - /// Set whether to "lex" HLASM-flavour character and string literals. For now, - /// setting this option to true, will disable lexing for character and string - /// literals. 
- void setLexHLASMStrings(bool V) { LexHLASMStrings = V; } -}; - -} // end namespace llvm - -#endif // LLVM_MC_MCPARSER_MCASMLEXER_H +#include "llvm/MC/MCParser/AsmLexer.h" diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index 8715f94d51fe5..3c911dba8cc26 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/SMLoc.h" #include "llvm/Support/SaveAndRestore.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -31,6 +32,91 @@ using namespace llvm; +SMLoc AsmToken::getLoc() const { return SMLoc::getFromPointer(Str.data()); } + +SMLoc AsmToken::getEndLoc() const { + return SMLoc::getFromPointer(Str.data() + Str.size()); +} + +SMRange AsmToken::getLocRange() const { return SMRange(getLoc(), getEndLoc()); } + +void AsmToken::dump(raw_ostream &OS) const { + switch (Kind) { + case AsmToken::Error: + OS << "error"; + break; + case AsmToken::Identifier: + OS << "identifier: " << getString(); + break; + case AsmToken::Integer: + OS << "int: " << getString(); + break; + case AsmToken::Real: + OS << "real: " << getString(); + break; + case AsmToken::String: + OS << "string: " << getString(); + break; + + // clang-format off + case AsmToken::Amp: OS << "Amp"; break; + case AsmToken::AmpAmp: OS << "AmpAmp"; break; + case AsmToken::At: OS << "At"; break; + case AsmToken::BackSlash: OS << "BackSlash"; break; + case AsmToken::BigNum: OS << "BigNum"; break; + case AsmToken::Caret: OS << "Caret"; break; + case AsmToken::Colon: OS << "Colon"; break; + case AsmToken::Comma: OS << "Comma"; break; + case AsmToken::Comment: OS << "Comment"; break; + case AsmToken::Dollar: OS << "Dollar"; break; + case AsmToken::Dot: OS << "Dot"; break; + case AsmToken::EndOfStatement: OS << "EndOfStatement"; break; + case AsmToken::Eof: OS << "Eof"; break; + case AsmToken::Equal: OS << "Equal"; break; + case AsmToken::EqualEqual: OS << "EqualEqual"; break; + case AsmToken::Exclaim: OS << "Exclaim"; break; + case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break; + case AsmToken::Greater: OS << "Greater"; break; + case AsmToken::GreaterEqual: OS << "GreaterEqual"; break; + case AsmToken::GreaterGreater: OS << "GreaterGreater"; break; + case AsmToken::Hash: OS << "Hash"; break; + case AsmToken::HashDirective: OS << "HashDirective"; break; + case AsmToken::LBrac: OS << "LBrac"; break; + case AsmToken::LCurly: OS << "LCurly"; break; + case AsmToken::LParen: OS << "LParen"; break; + case AsmToken::Less: OS << "Less"; break; + case AsmToken::LessEqual: OS << "LessEqual"; break; + case AsmToken::LessGreater: OS << "LessGreater"; break; + case AsmToken::LessLess: OS << "LessLess"; break; + case AsmToken::Minus: OS << "Minus"; break; + case AsmToken::MinusGreater: OS << "MinusGreater"; break; + case AsmToken::Percent: OS << "Percent"; break; + case AsmToken::Pipe: OS << "Pipe"; break; + case AsmToken::PipePipe: OS << "PipePipe"; break; + case AsmToken::Plus: OS << "Plus"; break; + case AsmToken::Question: OS << "Question"; break; + case AsmToken::RBrac: OS << "RBrac"; break; + case AsmToken::RCurly: OS << "RCurly"; break; + case AsmToken::RParen: OS << "RParen"; break; + case AsmToken::Slash: OS << "Slash"; break; + case AsmToken::Space: OS << "Space"; break; + case AsmToken::Star: OS << "Star"; break; + case AsmToken::Tilde: OS << "Tilde"; break; + // clang-format on + } + + // Print the token string. 
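+  // (e.g., an identifier token `foo` is dumped as: identifier: foo ("foo"))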
+ OS << " (\""; + OS.write_escaped(getString()); + OS << "\")"; +} + +MCAsmLexer::MCAsmLexer() { CurTok.emplace_back(AsmToken::Space, StringRef()); } + +MCAsmLexer::~MCAsmLexer() = default; + +SMLoc MCAsmLexer::getLoc() const { return SMLoc::getFromPointer(TokStart); } + AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { // For COFF targets, this is true, while for ELF targets, it should be false. // Currently, @specifier parsing depends on '@' being included in the token. diff --git a/llvm/lib/MC/MCParser/CMakeLists.txt b/llvm/lib/MC/MCParser/CMakeLists.txt index d3fa2675a255e..008a50e9da660 100644 --- a/llvm/lib/MC/MCParser/CMakeLists.txt +++ b/llvm/lib/MC/MCParser/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_component_library(LLVMMCParser GOFFAsmParser.cpp DarwinAsmParser.cpp ELFAsmParser.cpp - MCAsmLexer.cpp MCAsmParser.cpp MCAsmParserExtension.cpp MCTargetAsmParser.cpp diff --git a/llvm/lib/MC/MCParser/MCAsmLexer.cpp b/llvm/lib/MC/MCParser/MCAsmLexer.cpp deleted file mode 100644 index 8bdba218aa9c0..0000000000000 --- a/llvm/lib/MC/MCParser/MCAsmLexer.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//===- MCAsmLexer.cpp - Abstract Asm Lexer Interface ----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/SMLoc.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -MCAsmLexer::MCAsmLexer() { - CurTok.emplace_back(AsmToken::Space, StringRef()); -} - -MCAsmLexer::~MCAsmLexer() = default; - -SMLoc MCAsmLexer::getLoc() const { - return SMLoc::getFromPointer(TokStart); -} - -SMLoc AsmToken::getLoc() const { - return SMLoc::getFromPointer(Str.data()); -} - -SMLoc AsmToken::getEndLoc() const { - return SMLoc::getFromPointer(Str.data() + Str.size()); -} - -SMRange AsmToken::getLocRange() const { - return SMRange(getLoc(), getEndLoc()); -} - -void AsmToken::dump(raw_ostream &OS) const { - switch (Kind) { - case AsmToken::Error: - OS << "error"; - break; - case AsmToken::Identifier: - OS << "identifier: " << getString(); - break; - case AsmToken::Integer: - OS << "int: " << getString(); - break; - case AsmToken::Real: - OS << "real: " << getString(); - break; - case AsmToken::String: - OS << "string: " << getString(); - break; - - case AsmToken::Amp: OS << "Amp"; break; - case AsmToken::AmpAmp: OS << "AmpAmp"; break; - case AsmToken::At: OS << "At"; break; - case AsmToken::BackSlash: OS << "BackSlash"; break; - case AsmToken::BigNum: OS << "BigNum"; break; - case AsmToken::Caret: OS << "Caret"; break; - case AsmToken::Colon: OS << "Colon"; break; - case AsmToken::Comma: OS << "Comma"; break; - case AsmToken::Comment: OS << "Comment"; break; - case AsmToken::Dollar: OS << "Dollar"; break; - case AsmToken::Dot: OS << "Dot"; break; - case AsmToken::EndOfStatement: OS << "EndOfStatement"; break; - case AsmToken::Eof: OS << "Eof"; break; - case AsmToken::Equal: OS << "Equal"; break; - case AsmToken::EqualEqual: OS << "EqualEqual"; break; - case AsmToken::Exclaim: OS << "Exclaim"; break; - case AsmToken::ExclaimEqual: OS << "ExclaimEqual"; break; - case AsmToken::Greater: OS << "Greater"; break; - case AsmToken::GreaterEqual: OS << "GreaterEqual"; break; - case AsmToken::GreaterGreater: OS << "GreaterGreater"; break; - case AsmToken::Hash: 
OS << "Hash"; break; - case AsmToken::HashDirective: OS << "HashDirective"; break; - case AsmToken::LBrac: OS << "LBrac"; break; - case AsmToken::LCurly: OS << "LCurly"; break; - case AsmToken::LParen: OS << "LParen"; break; - case AsmToken::Less: OS << "Less"; break; - case AsmToken::LessEqual: OS << "LessEqual"; break; - case AsmToken::LessGreater: OS << "LessGreater"; break; - case AsmToken::LessLess: OS << "LessLess"; break; - case AsmToken::Minus: OS << "Minus"; break; - case AsmToken::MinusGreater: OS << "MinusGreater"; break; - case AsmToken::Percent: OS << "Percent"; break; - case AsmToken::Pipe: OS << "Pipe"; break; - case AsmToken::PipePipe: OS << "PipePipe"; break; - case AsmToken::Plus: OS << "Plus"; break; - case AsmToken::Question: OS << "Question"; break; - case AsmToken::RBrac: OS << "RBrac"; break; - case AsmToken::RCurly: OS << "RCurly"; break; - case AsmToken::RParen: OS << "RParen"; break; - case AsmToken::Slash: OS << "Slash"; break; - case AsmToken::Space: OS << "Space"; break; - case AsmToken::Star: OS << "Star"; break; - case AsmToken::Tilde: OS << "Tilde"; break; - } - - // Print the token string. - OS << " (\""; - OS.write_escaped(getString()); - OS << "\")"; -} diff --git a/llvm/utils/gn/secondary/llvm/lib/MC/MCParser/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/MC/MCParser/BUILD.gn index 8fa51893a4281..3fd2976be1b9d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/MC/MCParser/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/MC/MCParser/BUILD.gn @@ -14,7 +14,6 @@ static_library("MCParser") { "DarwinAsmParser.cpp", "ELFAsmParser.cpp", "GOFFAsmParser.cpp", - "MCAsmLexer.cpp", "MCAsmParser.cpp", "MCAsmParserExtension.cpp", "MCTargetAsmParser.cpp", From 897f9a51b981c773a63dd94709d9de2442feb008 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 4 Apr 2025 11:27:39 +0800 Subject: [PATCH 0608/1029] [X86][AVX10.2] Replace nepbh with bf16 to match with others, NFCI (#134240) --- clang/include/clang/Basic/BuiltinsX86.td | 6 ++--- clang/lib/CodeGen/TargetBuiltins/X86.cpp | 6 ++--- clang/lib/Headers/avx10_2_512bf16intrin.h | 16 ++++++------ clang/lib/Headers/avx10_2bf16intrin.h | 32 +++++++++++------------ 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index cc4249acbfee9..67cbbfdec7aaf 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -5365,13 +5365,13 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth< let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">; - def vfmaddnepbh512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>)">; + def vfmaddbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Vector<32, __bf16>, _Vector<32, __bf16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vfmaddnepbh256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>)">; + def vfmaddbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>, _Vector<16, __bf16>, _Vector<16, __bf16>)">; } let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vfmaddnepbh128 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>)">; + def vfmaddbf16128 : X86Builtin<"_Vector<8, 
__bf16>(_Vector<8, __bf16>, _Vector<8, __bf16>, _Vector<8, __bf16>)">; } diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index 0466770587a42..3c2a77ab3fe4e 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -1049,9 +1049,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_vfmaddph512_mask: case X86::BI__builtin_ia32_vfmaddph512_maskz: case X86::BI__builtin_ia32_vfmaddph512_mask3: - case X86::BI__builtin_ia32_vfmaddnepbh128: - case X86::BI__builtin_ia32_vfmaddnepbh256: - case X86::BI__builtin_ia32_vfmaddnepbh512: + case X86::BI__builtin_ia32_vfmaddbf16128: + case X86::BI__builtin_ia32_vfmaddbf16256: + case X86::BI__builtin_ia32_vfmaddbf16512: case X86::BI__builtin_ia32_vfmaddps512_mask: case X86::BI__builtin_ia32_vfmaddps512_maskz: case X86::BI__builtin_ia32_vfmaddps512_mask3: diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h index ce43ecbcfe047..75290d22ef259 100644 --- a/clang/lib/Headers/avx10_2_512bf16intrin.h +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -441,8 +441,8 @@ _mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) { static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, - (__v32bf)__C); + return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B, + (__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 @@ -469,8 +469,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh( static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, (__v32bf)__B, - -(__v32bf)__C); + return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B, + -(__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 @@ -497,8 +497,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh( static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, - (__v32bf)__C); + return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B, + (__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh( @@ -527,8 +527,8 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh( static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) { - return (__m512bh)__builtin_ia32_vfmaddnepbh512((__v32bf)__A, -(__v32bf)__B, - -(__v32bf)__C); + return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B, + -(__v32bf)__C); } static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh( diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 199cc13ff7a1c..66797ae00fe4f 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -852,8 +852,8 @@ _mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) { static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, - (__v16bf)__C); + return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B, + 
(__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 @@ -880,8 +880,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pbh( static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, (__v16bf)__B, - -(__v16bf)__C); + return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B, + -(__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 @@ -908,8 +908,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pbh( static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fnmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, - (__v16bf)__C); + return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B, + (__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pbh( @@ -938,8 +938,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pbh( static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_fnmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) { - return (__m256bh)__builtin_ia32_vfmaddnepbh256((__v16bf)__A, -(__v16bf)__B, - -(__v16bf)__C); + return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B, + -(__v16bf)__C); } static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pbh( @@ -969,8 +969,8 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pbh( static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, - (__v8bf)__C); + return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B, + (__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 @@ -997,8 +997,8 @@ _mm_maskz_fmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, (__v8bf)__B, - -(__v8bf)__C); + return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B, + -(__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 @@ -1025,8 +1025,8 @@ _mm_maskz_fmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, - (__v8bf)__C); + return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B, + (__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 @@ -1053,8 +1053,8 @@ _mm_maskz_fnmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C) { - return (__m128bh)__builtin_ia32_vfmaddnepbh128((__v8bf)__A, -(__v8bf)__B, - -(__v8bf)__C); + return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B, + -(__v8bf)__C); } static __inline__ __m128bh __DEFAULT_FN_ATTRS128 From 8e7d6baf0e013408be932758b4a5334c14a34086 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Thu, 3 Apr 2025 20:45:55 -0700 Subject: [PATCH 0609/1029] Revert "[LLDB] Expose checking if the symbol file exists/is loaded via SBModule" (#134341) Reverts llvm/llvm-project#134163 Reverting while @clayborg and I come up with a better API --- 
lldb/include/lldb/API/SBModule.h | 3 --- lldb/source/API/SBModule.cpp | 12 ------------ 2 files changed, 15 deletions(-) diff --git a/lldb/include/lldb/API/SBModule.h b/lldb/include/lldb/API/SBModule.h index 651455bdb78d2..85332066ee687 100644 --- a/lldb/include/lldb/API/SBModule.h +++ b/lldb/include/lldb/API/SBModule.h @@ -290,9 +290,6 @@ class LLDB_API SBModule { lldb::SBAddress GetObjectFileHeaderAddress() const; lldb::SBAddress GetObjectFileEntryPointAddress() const; - /// Get if the symbol file for this module is loaded. - bool IsDebugInfoLoaded() const; - /// Get the number of global modules. static uint32_t GetNumberAllocatedModules(); diff --git a/lldb/source/API/SBModule.cpp b/lldb/source/API/SBModule.cpp index 4978a553f57c7..985107ec68efd 100644 --- a/lldb/source/API/SBModule.cpp +++ b/lldb/source/API/SBModule.cpp @@ -659,18 +659,6 @@ lldb::SBAddress SBModule::GetObjectFileEntryPointAddress() const { return sb_addr; } -bool SBModule::IsDebugInfoLoaded() const { - LLDB_INSTRUMENT_VA(this); - - ModuleSP module_sp(GetSP()); - if (module_sp) { - SymbolFile *sym_file = module_sp->GetSymbolFile(/*create=*/false); - return sym_file && sym_file->GetLoadDebugInfoEnabled(); - } - - return false; -} - uint32_t SBModule::GetNumberAllocatedModules() { LLDB_INSTRUMENT(); From 6333f8457c43f5a2d19d6552689726e86196dea7 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Fri, 4 Apr 2025 06:20:51 +0200 Subject: [PATCH 0610/1029] [flang][OpenMP] Move reductions from `loop` to `teams` when `loop` is mapped to `distribute` (#132920) Follow-up to #132003, in particular, see https://github.com/llvm/llvm-project/pull/132003#issuecomment-2739701936. This PR extends reduction support for `loop` directives. Consider the following scenario: ```fortran subroutine bar implicit none integer :: x, i !$omp teams loop reduction(+: x) DO i = 1, 5 call foo() END DO end subroutine ``` Note the following: * According to the spec, the `reduction` clause will be attached to `loop` during earlier stages in the compiler. * Additionally, `loop` cannot be mapped to `distribute parallel for` due to the call to a foreign function inside the loop's body. * Therefore, `loop` must be mapped to `distribute`. * However, `distribute` does not have `reduction` clauses. * As a result, we have to move the `reduction`s from the `loop` to its parent `teams` directive, which is what is done by this PR. 
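The core of that move can be sketched in a few lines of C++ (condensed from
the pattern-rewrite change below; the op and interface spellings are my
reading of the patch, not a drop-in implementation):

```cpp
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/PatternMatch.h"

// Hoist the reduction block arguments of an `omp.loop` into the enclosing
// `omp.teams` op before the loop is rewritten to `omp.distribute`.
static void hoistReductionsToTeams(mlir::omp::LoopOp loopOp,
                                   mlir::RewriterBase &rewriter) {
  auto teamsOp = llvm::cast<mlir::omp::TeamsOp>(loopOp->getParentOp());
  auto teamsArgs = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*teamsOp);
  auto loopArgs = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*loopOp);

  // Each reduction block argument on `loop` has a counterpart on `teams`;
  // redirect all uses to the parent's argument.
  for (unsigned i = 0; i < loopArgs.numReductionBlockArgs(); ++i)
    rewriter.replaceAllUsesWith(loopArgs.getReductionBlockArgs()[i],
                                teamsArgs.getReductionBlockArgs()[i]);

  // Drop the now-dead block arguments from the loop region.
  for (unsigned i = 0; i < loopArgs.numReductionBlockArgs(); ++i)
    loopOp.getRegion().eraseArgument(loopArgs.getReductionBlockArgsStart());

  // `distribute` cannot carry reduction clauses, so strip them from the loop.
  loopOp.removeReductionModAttr();
  loopOp.getReductionVarsMutable().clear();
  loopOp.removeReductionByrefAttr();
  loopOp.removeReductionSymsAttr();
}
```

After this runs, the `loop` op is reduction-free and can be rewritten to
`distribute` as before, while the `teams` op keeps the reduction semantics.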
--- .../OpenMP/GenericLoopConversion.cpp | 32 +++++++++++++++- flang/test/Lower/OpenMP/loop-directive.f90 | 37 +++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp index 74ad6330b11a7..3009746954984 100644 --- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp @@ -57,10 +57,38 @@ class GenericLoopConversionPattern rewriteToWsloop(loopOp, rewriter); break; case GenericLoopCombinedInfo::TeamsLoop: - if (teamsLoopCanBeParallelFor(loopOp)) + if (teamsLoopCanBeParallelFor(loopOp)) { rewriteToDistributeParallelDo(loopOp, rewriter); - else + } else { + auto teamsOp = llvm::cast(loopOp->getParentOp()); + auto teamsBlockArgIface = + llvm::cast(*teamsOp); + auto loopBlockArgIface = + llvm::cast(*loopOp); + + for (unsigned i = 0; i < loopBlockArgIface.numReductionBlockArgs(); + ++i) { + mlir::BlockArgument loopRedBlockArg = + loopBlockArgIface.getReductionBlockArgs()[i]; + mlir::BlockArgument teamsRedBlockArg = + teamsBlockArgIface.getReductionBlockArgs()[i]; + rewriter.replaceAllUsesWith(loopRedBlockArg, teamsRedBlockArg); + } + + for (unsigned i = 0; i < loopBlockArgIface.numReductionBlockArgs(); + ++i) { + loopOp.getRegion().eraseArgument( + loopBlockArgIface.getReductionBlockArgsStart()); + } + + loopOp.removeReductionModAttr(); + loopOp.getReductionVarsMutable().clear(); + loopOp.removeReductionByrefAttr(); + loopOp.removeReductionSymsAttr(); + rewriteToDistribute(loopOp, rewriter); + } + break; } diff --git a/flang/test/Lower/OpenMP/loop-directive.f90 b/flang/test/Lower/OpenMP/loop-directive.f90 index 954985e2d64f1..a974f264cc040 100644 --- a/flang/test/Lower/OpenMP/loop-directive.f90 +++ b/flang/test/Lower/OpenMP/loop-directive.f90 @@ -358,3 +358,40 @@ subroutine multi_block_teams end select !$omp end target teams end subroutine + + +! Verifies that reductions are hoisted to the parent `teams` directive and removed +! from the `loop` directive when `loop` is mapped to `distribute`. + +! CHECK-LABEL: func.func @_QPteams_loop_cannot_be_parallel_for_with_reductions +subroutine teams_loop_cannot_be_parallel_for_with_reductions + implicit none + integer :: x, y, i, p + + ! CHECK: %[[ADD_RED:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QF{{.*}}Ex"} + ! CHECK: %[[MUL_RED:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QF{{.*}}Ey"} + ! CHECK: omp.teams reduction( + ! CHECK-SAME: @add_reduction_i32 %[[ADD_RED]]#0 -> %[[ADD_RED_ARG:[^[:space:]]*]], + ! CHECK-SAME: @multiply_reduction_i32 %[[MUL_RED]]#0 -> %[[MUL_RED_ARG:.*]] : {{.*}}) { + + ! CHECK: omp.distribute private(@{{.*}} %{{.*}} -> %{{.*}}, @{{.*}} %{{.*}} -> %{{.*}} : {{.*}}) { + ! CHECK: %[[ADD_RED_DECL:.*]]:2 = hlfir.declare %[[ADD_RED_ARG]] {uniq_name = "_QF{{.*}}Ex"} + ! CHECK: %[[MUL_RED_DECL:.*]]:2 = hlfir.declare %[[MUL_RED_ARG]] {uniq_name = "_QF{{.*}}Ey"} + + ! CHECK: %[[ADD_RES:.*]] = arith.addi %{{.*}}, %{{.*}} : i32 + ! CHECK: hlfir.assign %[[ADD_RES]] to %[[ADD_RED_DECL]]#0 : i32, !fir.ref + + ! CHECK: %[[MUL_RES:.*]] = arith.muli %{{.*}}, %{{.*}} : i32 + ! CHECK: hlfir.assign %[[MUL_RES]] to %[[MUL_RED_DECL]]#0 : i32, !fir.ref + ! CHECK: omp.yield + ! CHECK: } + ! CHECK: omp.terminator + ! 
CHECK: } + !$omp teams loop reduction(+: x) reduction(*: y) private(p) + do i = 1, 5 + call foo() + x = x + i + y = y * i + p = 42 + end do +end subroutine From 46e2c07fa28bd42da8f8ca52e93603297114afa2 Mon Sep 17 00:00:00 2001 From: cmtice Date: Thu, 3 Apr 2025 21:39:30 -0700 Subject: [PATCH 0611/1029] [LLDB] Add DIL code for handling plain variable names. (#120971) Add the Data Inspection Language (DIL) implementation pieces for handling plain local and global variable names. See https://discourse.llvm.org/t/rfc-data-inspection-language/69893 for information about DIL. This change includes the basic AST, Lexer, Parser and Evaluator pieces, as well as some tests. --- lldb/docs/dil-expr-lang.ebnf | 42 +++ lldb/include/lldb/ValueObject/DILAST.h | 97 +++++++ lldb/include/lldb/ValueObject/DILEval.h | 63 +++++ lldb/include/lldb/ValueObject/DILLexer.h | 27 +- lldb/include/lldb/ValueObject/DILParser.h | 125 +++++++++ lldb/source/Target/StackFrame.cpp | 43 ++- lldb/source/ValueObject/CMakeLists.txt | 3 + lldb/source/ValueObject/DILAST.cpp | 22 ++ lldb/source/ValueObject/DILEval.cpp | 235 ++++++++++++++++ lldb/source/ValueObject/DILParser.cpp | 260 ++++++++++++++++++ .../basics/GlobalVariableLookup/Makefile | 3 + .../TestFrameVarDILGlobalVariableLookup.py | 51 ++++ .../basics/GlobalVariableLookup/main.cpp | 15 + .../var-dil/basics/InstanceVariables/Makefile | 3 + .../TestFrameVarDILInstanceVariables.py | 29 ++ .../var-dil/basics/InstanceVariables/main.cpp | 23 ++ .../frame/var-dil/basics/LocalVars/Makefile | 3 + .../LocalVars/TestFrameVarDILLocalVars.py | 31 +++ .../frame/var-dil/basics/LocalVars/main.cpp | 9 + lldb/unittests/ValueObject/DILLexerTests.cpp | 10 +- 20 files changed, 1081 insertions(+), 13 deletions(-) create mode 100644 lldb/docs/dil-expr-lang.ebnf create mode 100644 lldb/include/lldb/ValueObject/DILAST.h create mode 100644 lldb/include/lldb/ValueObject/DILEval.h create mode 100644 lldb/include/lldb/ValueObject/DILParser.h create mode 100644 lldb/source/ValueObject/DILAST.cpp create mode 100644 lldb/source/ValueObject/DILEval.cpp create mode 100644 lldb/source/ValueObject/DILParser.cpp create mode 100644 lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/Makefile create mode 100644 lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/TestFrameVarDILGlobalVariableLookup.py create mode 100644 lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/main.cpp create mode 100644 lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/Makefile create mode 100644 lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/TestFrameVarDILInstanceVariables.py create mode 100644 lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/main.cpp create mode 100644 lldb/test/API/commands/frame/var-dil/basics/LocalVars/Makefile create mode 100644 lldb/test/API/commands/frame/var-dil/basics/LocalVars/TestFrameVarDILLocalVars.py create mode 100644 lldb/test/API/commands/frame/var-dil/basics/LocalVars/main.cpp diff --git a/lldb/docs/dil-expr-lang.ebnf b/lldb/docs/dil-expr-lang.ebnf new file mode 100644 index 0000000000000..0bbbecbdc78c1 --- /dev/null +++ b/lldb/docs/dil-expr-lang.ebnf @@ -0,0 +1,42 @@ +(* Data Inspection Language (DIL) definition - LLDB Debug Expressions *) + +(* This is currently a subset of the final DIL Language, matching the current + DIL implementation. 
*) + +expression = primary_expression ; + +primary_expression = id_expression + | "(" expression ")"; + +id_expression = unqualified_id + | qualified_id + | register ; + +unqualified_id = identifier ; + +qualified_id = ["::"] [nested_name_specifier] unqualified_id + | ["::"] identifier ; + +identifier = ? C99 Identifier ? ; + +register = "$" ? Register name ? ; + +nested_name_specifier = type_name "::" + | namespace_name '::' + | nested_name_specifier identifier "::" ; + +type_name = class_name + | enum_name + | typedef_name; + +class_name = identifier ; + +enum_name = identifier ; + +typedef_name = identifier ; + +namespace_name = identifier ; + + + + diff --git a/lldb/include/lldb/ValueObject/DILAST.h b/lldb/include/lldb/ValueObject/DILAST.h new file mode 100644 index 0000000000000..05d87e9cc4b6b --- /dev/null +++ b/lldb/include/lldb/ValueObject/DILAST.h @@ -0,0 +1,97 @@ +//===-- DILAST.h ------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_VALUEOBJECT_DILAST_H +#define LLDB_VALUEOBJECT_DILAST_H + +#include "lldb/ValueObject/ValueObject.h" +#include "llvm/Support/Error.h" +#include +#include + +namespace lldb_private::dil { + +/// The various types DIL AST nodes (used by the DIL parser). +enum class NodeKind { + eErrorNode, + eIdentifierNode, +}; + +/// Forward declaration, for use in DIL AST nodes. Definition is at the very +/// end of this file. +class Visitor; + +/// The rest of the classes in this file, except for the Visitor class at the +/// very end, define all the types of AST nodes used by the DIL parser and +/// expression evaluator. The DIL parser parses the input string and creates +/// the AST parse tree from the AST nodes. The resulting AST node tree gets +/// passed to the DIL expression evaluator, which evaluates the DIL AST nodes +/// and creates/returns a ValueObjectSP containing the result. + +/// Base class for AST nodes used by the Data Inspection Language (DIL) parser. +/// All of the specialized types of AST nodes inherit from this (virtual) base +/// class. +class ASTNode { +public: + ASTNode(uint32_t location, NodeKind kind) + : m_location(location), m_kind(kind) {} + virtual ~ASTNode() = default; + + virtual llvm::Expected Accept(Visitor *v) const = 0; + + uint32_t GetLocation() const { return m_location; } + NodeKind GetKind() const { return m_kind; } + +private: + uint32_t m_location; + const NodeKind m_kind; +}; + +using ASTNodeUP = std::unique_ptr; + +class ErrorNode : public ASTNode { +public: + ErrorNode() : ASTNode(0, NodeKind::eErrorNode) {} + llvm::Expected Accept(Visitor *v) const override; + + static bool classof(const ASTNode *node) { + return node->GetKind() == NodeKind::eErrorNode; + } +}; + +class IdentifierNode : public ASTNode { +public: + IdentifierNode(uint32_t location, std::string name) + : ASTNode(location, NodeKind::eIdentifierNode), m_name(std::move(name)) {} + + llvm::Expected Accept(Visitor *v) const override; + + std::string GetName() const { return m_name; } + + static bool classof(const ASTNode *node) { + return node->GetKind() == NodeKind::eIdentifierNode; + } + +private: + std::string m_name; +}; + +/// This class contains one Visit method for each specialized type of +/// DIL AST node. 
The Visit methods are used to dispatch a DIL AST node to
+/// the correct function in the DIL expression evaluator for evaluating that
+/// type of AST node.
+class Visitor {
+public:
+  virtual ~Visitor() = default;
+  virtual llvm::Expected<lldb::ValueObjectSP>
+  Visit(const IdentifierNode *node) = 0;
+};
+
+} // namespace lldb_private::dil
+
+#endif // LLDB_VALUEOBJECT_DILAST_H
diff --git a/lldb/include/lldb/ValueObject/DILEval.h b/lldb/include/lldb/ValueObject/DILEval.h
new file mode 100644
index 0000000000000..335035d3f9248
--- /dev/null
+++ b/lldb/include/lldb/ValueObject/DILEval.h
@@ -0,0 +1,63 @@
+//===-- DILEval.h -----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILEVAL_H
+#define LLDB_VALUEOBJECT_DILEVAL_H
+
+#include "lldb/ValueObject/DILAST.h"
+#include "lldb/ValueObject/DILParser.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include
+#include
+
+namespace lldb_private::dil {
+
+/// Given the name of an identifier (variable name, member name, type name,
+/// etc.), find the ValueObject for that name (if it exists), excluding global
+/// variables, and create and return an IdentifierInfo object containing all
+/// the relevant information about that object (for DIL parsing and
+/// evaluating).
+lldb::ValueObjectSP LookupIdentifier(llvm::StringRef name_ref,
+                                     std::shared_ptr<StackFrame> frame_sp,
+                                     lldb::DynamicValueType use_dynamic,
+                                     CompilerType *scope_ptr = nullptr);
+
+/// Given the name of an identifier, check to see if it matches the name of a
+/// global variable. If so, find the ValueObject for that global variable, and
+/// create and return an IdentifierInfo object containing all the relevant
+/// information about it.
+lldb::ValueObjectSP LookupGlobalIdentifier(llvm::StringRef name_ref,
+                                           std::shared_ptr<StackFrame> frame_sp,
+                                           lldb::TargetSP target_sp,
+                                           lldb::DynamicValueType use_dynamic,
+                                           CompilerType *scope_ptr = nullptr);
+
+class Interpreter : Visitor {
+public:
+  Interpreter(lldb::TargetSP target, llvm::StringRef expr,
+              lldb::DynamicValueType use_dynamic,
+              std::shared_ptr<StackFrame> frame_sp);
+
+  llvm::Expected<lldb::ValueObjectSP> Evaluate(const ASTNode *node);
+
+private:
+  llvm::Expected<lldb::ValueObjectSP>
+  Visit(const IdentifierNode *node) override;
+
+  // Used by the interpreter to create objects, perform casts, etc.
+  lldb::TargetSP m_target;
+  llvm::StringRef m_expr;
+  lldb::ValueObjectSP m_scope;
+  lldb::DynamicValueType m_default_dynamic;
+  std::shared_ptr<StackFrame> m_exe_ctx_scope;
+};
+
+} // namespace lldb_private::dil
+
+#endif // LLDB_VALUEOBJECT_DILEVAL_H
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
index e1182da5b20ab..d15fc382d1623 100644
--- a/lldb/include/lldb/ValueObject/DILLexer.h
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
 #include
 #include
 #include
@@ -41,10 +42,8 @@ class Token {
 
   bool IsNot(Kind kind) const { return m_kind != kind; }
 
-  bool IsOneOf(Kind kind1, Kind kind2) const { return Is(kind1) || Is(kind2); }
-
-  template <typename... Ts> bool IsOneOf(Kind kind, Ts...
Ks) const { - return Is(kind) || IsOneOf(Ks...); + bool IsOneOf(llvm::ArrayRef kinds) const { + return llvm::is_contained(kinds, m_kind); } uint32_t GetLocation() const { return m_start_pos; } @@ -120,4 +119,24 @@ class DILLexer { } // namespace lldb_private::dil +namespace llvm { + +template <> struct format_provider { + static void format(const lldb_private::dil::Token::Kind &k, raw_ostream &OS, + llvm::StringRef Options) { + OS << "'" << lldb_private::dil::Token::GetTokenName(k) << "'"; + } +}; + +template <> struct format_provider { + static void format(const lldb_private::dil::Token &t, raw_ostream &OS, + llvm::StringRef Options) { + lldb_private::dil::Token::Kind kind = t.GetKind(); + OS << "<'" << t.GetSpelling() << "' (" + << lldb_private::dil::Token::GetTokenName(kind) << ")>"; + } +}; + +} // namespace llvm + #endif // LLDB_VALUEOBJECT_DILLEXER_H diff --git a/lldb/include/lldb/ValueObject/DILParser.h b/lldb/include/lldb/ValueObject/DILParser.h new file mode 100644 index 0000000000000..9b7a6cd487939 --- /dev/null +++ b/lldb/include/lldb/ValueObject/DILParser.h @@ -0,0 +1,125 @@ +//===-- DILParser.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_VALUEOBJECT_DILPARSER_H +#define LLDB_VALUEOBJECT_DILPARSER_H + +#include "lldb/Target/ExecutionContextScope.h" +#include "lldb/Utility/DiagnosticsRendering.h" +#include "lldb/Utility/Status.h" +#include "lldb/ValueObject/DILAST.h" +#include "lldb/ValueObject/DILLexer.h" +#include "llvm/Support/Error.h" +#include +#include +#include +#include +#include +#include + +namespace lldb_private::dil { + +enum class ErrorCode : unsigned char { + kOk = 0, + kInvalidExpressionSyntax, + kUndeclaredIdentifier, + kUnknown, +}; + +// The following is modeled on class OptionParseError. +class DILDiagnosticError + : public llvm::ErrorInfo { + DiagnosticDetail m_detail; + +public: + using llvm::ErrorInfo::ErrorInfo; + DILDiagnosticError(DiagnosticDetail detail) + : ErrorInfo(make_error_code(std::errc::invalid_argument)), + m_detail(std::move(detail)) {} + + DILDiagnosticError(llvm::StringRef expr, const std::string &message, + uint32_t loc, uint16_t err_len); + + std::unique_ptr Clone() const override { + return std::make_unique(m_detail); + } + + llvm::ArrayRef GetDetails() const override { + return {m_detail}; + } + + std::string message() const override { return m_detail.rendered; } +}; + +/// Pure recursive descent parser for C++ like expressions. 
+/// EBNF grammar for the parser is described in lldb/docs/dil-expr-lang.ebnf
+class DILParser {
+public:
+  static llvm::Expected<ASTNodeUP> Parse(llvm::StringRef dil_input_expr,
+                                         DILLexer lexer,
+                                         std::shared_ptr<StackFrame> frame_sp,
+                                         lldb::DynamicValueType use_dynamic,
+                                         bool use_synthetic, bool fragile_ivar,
+                                         bool check_ptr_vs_member);
+
+  ~DILParser() = default;
+
+  bool UseSynthetic() { return m_use_synthetic; }
+
+  lldb::DynamicValueType UseDynamic() { return m_use_dynamic; }
+
+private:
+  explicit DILParser(llvm::StringRef dil_input_expr, DILLexer lexer,
+                     std::shared_ptr<StackFrame> frame_sp,
+                     lldb::DynamicValueType use_dynamic, bool use_synthetic,
+                     bool fragile_ivar, bool check_ptr_vs_member,
+                     llvm::Error &error);
+
+  ASTNodeUP Run();
+
+  ASTNodeUP ParseExpression();
+  ASTNodeUP ParsePrimaryExpression();
+
+  std::string ParseNestedNameSpecifier();
+
+  std::string ParseIdExpression();
+  std::string ParseUnqualifiedId();
+
+  void BailOut(const std::string &error, uint32_t loc, uint16_t err_len);
+
+  void Expect(Token::Kind kind);
+
+  void TentativeParsingRollback(uint32_t saved_idx) {
+    if (m_error)
+      llvm::consumeError(std::move(m_error));
+    m_dil_lexer.ResetTokenIdx(saved_idx);
+  }
+
+  Token CurToken() { return m_dil_lexer.GetCurrentToken(); }
+
+  // Parser doesn't own the evaluation context. The produced AST may depend on
+  // it (for example, for source locations), so it's expected that expression
+  // context will outlive the parser.
+  std::shared_ptr<StackFrame> m_ctx_scope;
+
+  llvm::StringRef m_input_expr;
+
+  DILLexer m_dil_lexer;
+
+  // Holds an error if it occurs during parsing.
+  llvm::Error &m_error;
+
+  lldb::DynamicValueType m_use_dynamic;
+  bool m_use_synthetic;
+  bool m_fragile_ivar;
+  bool m_check_ptr_vs_member;
+}; // class DILParser
+
+} // namespace lldb_private::dil
+
+#endif // LLDB_VALUEOBJECT_DILPARSER_H
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index bab36e9aa1033..0306f68169a98 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -31,6 +31,9 @@
 #include "lldb/Utility/LLDBLog.h"
 #include "lldb/Utility/Log.h"
 #include "lldb/Utility/RegisterValue.h"
+#include "lldb/ValueObject/DILEval.h"
+#include "lldb/ValueObject/DILLexer.h"
+#include "lldb/ValueObject/DILParser.h"
 #include "lldb/ValueObject/ValueObjectConstResult.h"
 #include "lldb/ValueObject/ValueObjectMemory.h"
 #include "lldb/ValueObject/ValueObjectVariable.h"
@@ -523,10 +526,42 @@ ValueObjectSP StackFrame::GetValueForVariableExpressionPath(
 ValueObjectSP StackFrame::DILGetValueForVariableExpressionPath(
     llvm::StringRef var_expr, lldb::DynamicValueType use_dynamic,
     uint32_t options, lldb::VariableSP &var_sp, Status &error) {
-  // This is a place-holder for the calls into the DIL parser and
-  // evaluator. For now, just call the "real" frame variable implementation.
-  return LegacyGetValueForVariableExpressionPath(var_expr, use_dynamic, options,
-                                                 var_sp, error);
+
+  const bool check_ptr_vs_member =
+      (options & eExpressionPathOptionCheckPtrVsMember) != 0;
+  const bool no_fragile_ivar =
+      (options & eExpressionPathOptionsNoFragileObjcIvar) != 0;
+  const bool no_synth_child =
+      (options & eExpressionPathOptionsNoSyntheticChildren) != 0;
+
+  // Lex the expression.
+  auto lex_or_err = dil::DILLexer::Create(var_expr);
+  if (!lex_or_err) {
+    error = Status::FromError(lex_or_err.takeError());
+    return ValueObjectSP();
+  }
+
+  // Parse the expression.
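+  // (On success, parsing yields the root node of the DIL AST; on failure, the
+  // parser's diagnostic is converted to a Status and surfaced through `error`
+  // below.)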
+ auto tree_or_error = dil::DILParser::Parse( + var_expr, std::move(*lex_or_err), shared_from_this(), use_dynamic, + !no_synth_child, !no_fragile_ivar, check_ptr_vs_member); + if (!tree_or_error) { + error = Status::FromError(tree_or_error.takeError()); + return ValueObjectSP(); + } + + // Evaluate the parsed expression. + lldb::TargetSP target = this->CalculateTarget(); + dil::Interpreter interpreter(target, var_expr, use_dynamic, + shared_from_this()); + + auto valobj_or_error = interpreter.Evaluate((*tree_or_error).get()); + if (!valobj_or_error) { + error = Status::FromError(valobj_or_error.takeError()); + return ValueObjectSP(); + } + + return *valobj_or_error; } ValueObjectSP StackFrame::LegacyGetValueForVariableExpressionPath( diff --git a/lldb/source/ValueObject/CMakeLists.txt b/lldb/source/ValueObject/CMakeLists.txt index 30c34472289e7..92683916f5a52 100644 --- a/lldb/source/ValueObject/CMakeLists.txt +++ b/lldb/source/ValueObject/CMakeLists.txt @@ -1,5 +1,8 @@ add_lldb_library(lldbValueObject + DILAST.cpp + DILEval.cpp DILLexer.cpp + DILParser.cpp ValueObject.cpp ValueObjectCast.cpp ValueObjectChild.cpp diff --git a/lldb/source/ValueObject/DILAST.cpp b/lldb/source/ValueObject/DILAST.cpp new file mode 100644 index 0000000000000..e75958d784627 --- /dev/null +++ b/lldb/source/ValueObject/DILAST.cpp @@ -0,0 +1,22 @@ +//===-- DILAST.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILAST.h" +#include "llvm/Support/ErrorHandling.h" + +namespace lldb_private::dil { + +llvm::Expected ErrorNode::Accept(Visitor *v) const { + llvm_unreachable("Attempting to Visit a DIL ErrorNode."); +} + +llvm::Expected IdentifierNode::Accept(Visitor *v) const { + return v->Visit(this); +} + +} // namespace lldb_private::dil diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp new file mode 100644 index 0000000000000..4889834c7a3c1 --- /dev/null +++ b/lldb/source/ValueObject/DILEval.cpp @@ -0,0 +1,235 @@ +//===-- DILEval.cpp -------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILEval.h" +#include "lldb/Symbol/VariableList.h" +#include "lldb/Target/RegisterContext.h" +#include "lldb/ValueObject/DILAST.h" +#include "lldb/ValueObject/ValueObject.h" +#include "lldb/ValueObject/ValueObjectRegister.h" +#include "lldb/ValueObject/ValueObjectVariable.h" +#include "llvm/Support/FormatAdapters.h" +#include + +namespace lldb_private::dil { + +static lldb::ValueObjectSP LookupStaticIdentifier( + VariableList &variable_list, std::shared_ptr exe_scope, + llvm::StringRef name_ref, llvm::StringRef unqualified_name) { + // First look for an exact match to the (possibly) qualified name. 
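+  // (For example, a caller resolving a static member may pass a name_ref of
+  // "C::member_var" together with the unqualified_name "member_var"; the
+  // qualified spelling wins, the bare spelling is the fallback below. The
+  // names here are purely illustrative.)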
+  for (const lldb::VariableSP &var_sp : variable_list) {
+    lldb::ValueObjectSP valobj_sp(
+        ValueObjectVariable::Create(exe_scope.get(), var_sp));
+    if (valobj_sp && valobj_sp->GetVariable() &&
+        (valobj_sp->GetVariable()->NameMatches(ConstString(name_ref))))
+      return valobj_sp;
+  }
+
+  // If the qualified name is the same as the unqualified name, there's nothing
+  // more to be done.
+  if (name_ref == unqualified_name)
+    return nullptr;
+
+  // We didn't match the qualified name; try to match the unqualified name.
+  for (const lldb::VariableSP &var_sp : variable_list) {
+    lldb::ValueObjectSP valobj_sp(
+        ValueObjectVariable::Create(exe_scope.get(), var_sp));
+    if (valobj_sp && valobj_sp->GetVariable() &&
+        (valobj_sp->GetVariable()->NameMatches(ConstString(unqualified_name))))
+      return valobj_sp;
+  }
+
+  return nullptr;
+}
+
+static lldb::VariableSP DILFindVariable(ConstString name,
+                                        lldb::VariableListSP variable_list) {
+  lldb::VariableSP exact_match;
+  std::vector<lldb::VariableSP> possible_matches;
+
+  for (lldb::VariableSP var_sp : *variable_list) {
+    llvm::StringRef str_ref_name = var_sp->GetName().GetStringRef();
+    // Check for global vars, which might start with '::'.
+    str_ref_name.consume_front("::");
+
+    if (str_ref_name == name.GetStringRef())
+      possible_matches.push_back(var_sp);
+    else if (var_sp->NameMatches(name))
+      possible_matches.push_back(var_sp);
+  }
+
+  // Look for exact matches (favors local vars over global vars)
+  auto exact_match_it =
+      llvm::find_if(possible_matches, [&](lldb::VariableSP var_sp) {
+        return var_sp->GetName() == name;
+      });
+
+  if (exact_match_it != possible_matches.end())
+    return *exact_match_it;
+
+  // Look for a global var exact match.
+  for (auto var_sp : possible_matches) {
+    llvm::StringRef str_ref_name = var_sp->GetName().GetStringRef();
+    str_ref_name.consume_front("::");
+    if (str_ref_name == name.GetStringRef())
+      return var_sp;
+  }
+
+  // If there's a single non-exact match, take it.
+  if (possible_matches.size() == 1)
+    return possible_matches[0];
+
+  return nullptr;
+}
+
+lldb::ValueObjectSP LookupGlobalIdentifier(
+    llvm::StringRef name_ref, std::shared_ptr<StackFrame> stack_frame,
+    lldb::TargetSP target_sp, lldb::DynamicValueType use_dynamic,
+    CompilerType *scope_ptr) {
+  // First look for a match in the "local" global variables.
+  lldb::VariableListSP variable_list(stack_frame->GetInScopeVariableList(true));
+  name_ref.consume_front("::");
+
+  lldb::ValueObjectSP value_sp;
+  if (variable_list) {
+    lldb::VariableSP var_sp =
+        DILFindVariable(ConstString(name_ref), variable_list);
+    if (var_sp)
+      value_sp =
+          stack_frame->GetValueObjectForFrameVariable(var_sp, use_dynamic);
+  }
+
+  if (value_sp)
+    return value_sp;
+
+  // Also check for static global vars.
+  if (variable_list) {
+    const char *type_name = "";
+    if (scope_ptr)
+      type_name = scope_ptr->GetCanonicalType().GetTypeName().AsCString();
+    std::string name_with_type_prefix =
+        llvm::formatv("{0}::{1}", type_name, name_ref).str();
+    value_sp = LookupStaticIdentifier(*variable_list, stack_frame,
+                                      name_with_type_prefix, name_ref);
+    if (!value_sp)
+      value_sp = LookupStaticIdentifier(*variable_list, stack_frame, name_ref,
+                                        name_ref);
+  }
+
+  if (value_sp)
+    return value_sp;
+
+  // Check for a match among the modules' global variables.
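+  // (Both the bare and the "::"-qualified spelling are accepted here, so
+  // "globalVar" and "::globalVar" resolve to the same module-level symbol.)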
+  VariableList modules_var_list;
+  target_sp->GetImages().FindGlobalVariables(
+      ConstString(name_ref), std::numeric_limits<uint32_t>::max(),
+      modules_var_list);
+  if (modules_var_list.Empty())
+    return nullptr;
+
+  for (const lldb::VariableSP &var_sp : modules_var_list) {
+    std::string qualified_name = llvm::formatv("::{0}", name_ref).str();
+    if (var_sp->NameMatches(ConstString(name_ref)) ||
+        var_sp->NameMatches(ConstString(qualified_name))) {
+      value_sp = ValueObjectVariable::Create(stack_frame.get(), var_sp);
+      break;
+    }
+  }
+
+  if (value_sp)
+    return value_sp;
+
+  return nullptr;
+}
+
+lldb::ValueObjectSP LookupIdentifier(llvm::StringRef name_ref,
+                                     std::shared_ptr<StackFrame> stack_frame,
+                                     lldb::DynamicValueType use_dynamic,
+                                     CompilerType *scope_ptr) {
+  // Support $rax as a special syntax for accessing registers.
+  // Will return an invalid value in case the requested register doesn't exist.
+  if (name_ref.consume_front("$")) {
+    lldb::RegisterContextSP reg_ctx(stack_frame->GetRegisterContext());
+    if (!reg_ctx)
+      return nullptr;
+
+    if (const RegisterInfo *reg_info = reg_ctx->GetRegisterInfoByName(name_ref))
+      return ValueObjectRegister::Create(stack_frame.get(), reg_ctx, reg_info);
+
+    return nullptr;
+  }
+
+  lldb::VariableListSP variable_list(
+      stack_frame->GetInScopeVariableList(false));
+
+  if (!name_ref.contains("::")) {
+    if (!scope_ptr || !scope_ptr->IsValid()) {
+      // Lookup in the current frame.
+      // Try looking for a local variable in current scope.
+      lldb::ValueObjectSP value_sp;
+      if (variable_list) {
+        lldb::VariableSP var_sp =
+            DILFindVariable(ConstString(name_ref), variable_list);
+        if (var_sp)
+          value_sp =
+              stack_frame->GetValueObjectForFrameVariable(var_sp, use_dynamic);
+      }
+      if (!value_sp)
+        value_sp = stack_frame->FindVariable(ConstString(name_ref));
+
+      if (value_sp)
+        return value_sp;
+
+      // Try looking for an instance variable (class member).
+      SymbolContext sc = stack_frame->GetSymbolContext(
+          lldb::eSymbolContextFunction | lldb::eSymbolContextBlock);
+      llvm::StringRef ivar_name = sc.GetInstanceVariableName();
+      value_sp = stack_frame->FindVariable(ConstString(ivar_name));
+      if (value_sp)
+        value_sp = value_sp->GetChildMemberWithName(name_ref);
+
+      if (value_sp)
+        return value_sp;
+    }
+  }
+  return nullptr;
+}
+
+Interpreter::Interpreter(lldb::TargetSP target, llvm::StringRef expr,
+                         lldb::DynamicValueType use_dynamic,
+                         std::shared_ptr<StackFrame> frame_sp)
+    : m_target(std::move(target)), m_expr(expr), m_default_dynamic(use_dynamic),
+      m_exe_ctx_scope(frame_sp) {}
+
+llvm::Expected<lldb::ValueObjectSP> Interpreter::Evaluate(const ASTNode *node) {
+
+  // Traverse the AST pointed to by `node`.
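+  // (Accept() double-dispatches back into this Interpreter: an IdentifierNode,
+  // for instance, lands in Visit(const IdentifierNode *).)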
+ return node->Accept(this); +} + +llvm::Expected +Interpreter::Visit(const IdentifierNode *node) { + lldb::DynamicValueType use_dynamic = m_default_dynamic; + + lldb::ValueObjectSP identifier = + LookupIdentifier(node->GetName(), m_exe_ctx_scope, use_dynamic); + + if (!identifier) + identifier = LookupGlobalIdentifier(node->GetName(), m_exe_ctx_scope, + m_target, use_dynamic); + if (!identifier) { + std::string errMsg = + llvm::formatv("use of undeclared identifier '{0}'", node->GetName()); + return llvm::make_error( + m_expr, errMsg, node->GetLocation(), node->GetName().size()); + } + + return identifier; +} + +} // namespace lldb_private::dil diff --git a/lldb/source/ValueObject/DILParser.cpp b/lldb/source/ValueObject/DILParser.cpp new file mode 100644 index 0000000000000..a8baba2c06e7a --- /dev/null +++ b/lldb/source/ValueObject/DILParser.cpp @@ -0,0 +1,260 @@ +//===-- DILParser.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This implements the recursive descent parser for the Data Inspection +// Language (DIL), and its helper functions, which will eventually underlie the +// 'frame variable' command. The language that this parser recognizes is +// described in lldb/docs/dil-expr-lang.ebnf +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILParser.h" +#include "lldb/Target/ExecutionContextScope.h" +#include "lldb/Utility/DiagnosticsRendering.h" +#include "lldb/ValueObject/DILAST.h" +#include "lldb/ValueObject/DILEval.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatAdapters.h" +#include +#include +#include +#include +#include + +namespace lldb_private::dil { + +DILDiagnosticError::DILDiagnosticError(llvm::StringRef expr, + const std::string &message, uint32_t loc, + uint16_t err_len) + : ErrorInfo(make_error_code(std::errc::invalid_argument)) { + DiagnosticDetail::SourceLocation sloc = { + FileSpec{}, /*line=*/1, static_cast(loc + 1), + err_len, false, /*in_user_input=*/true}; + std::string rendered_msg = + llvm::formatv(":1:{0}: {1}\n 1 | {2}\n | ^", + loc + 1, message, expr); + m_detail.source_location = sloc; + m_detail.severity = lldb::eSeverityError; + m_detail.message = message; + m_detail.rendered = std::move(rendered_msg); +} + +llvm::Expected +DILParser::Parse(llvm::StringRef dil_input_expr, DILLexer lexer, + std::shared_ptr frame_sp, + lldb::DynamicValueType use_dynamic, bool use_synthetic, + bool fragile_ivar, bool check_ptr_vs_member) { + llvm::Error error = llvm::Error::success(); + DILParser parser(dil_input_expr, lexer, frame_sp, use_dynamic, use_synthetic, + fragile_ivar, check_ptr_vs_member, error); + + ASTNodeUP node_up = parser.Run(); + + if (error) + return error; + + return node_up; +} + +DILParser::DILParser(llvm::StringRef dil_input_expr, DILLexer lexer, + std::shared_ptr frame_sp, + lldb::DynamicValueType use_dynamic, bool use_synthetic, + bool fragile_ivar, bool check_ptr_vs_member, + llvm::Error &error) + : m_ctx_scope(frame_sp), m_input_expr(dil_input_expr), + m_dil_lexer(std::move(lexer)), m_error(error), m_use_dynamic(use_dynamic), + m_use_synthetic(use_synthetic), m_fragile_ivar(fragile_ivar), + m_check_ptr_vs_member(check_ptr_vs_member) {} + +ASTNodeUP DILParser::Run() { + ASTNodeUP expr = ParseExpression(); + + Expect(Token::Kind::eof); + + 
return expr;
+}
+
+// Parse an expression.
+//
+// expression:
+//    primary_expression
+//
+ASTNodeUP DILParser::ParseExpression() { return ParsePrimaryExpression(); }
+
+// Parse a primary_expression.
+//
+// primary_expression:
+//    id_expression
+//    "(" expression ")"
+//
+ASTNodeUP DILParser::ParsePrimaryExpression() {
+  if (CurToken().IsOneOf({Token::coloncolon, Token::identifier})) {
+    // Save the source location for the diagnostics message.
+    uint32_t loc = CurToken().GetLocation();
+    auto identifier = ParseIdExpression();
+
+    return std::make_unique<IdentifierNode>(loc, identifier);
+  }
+
+  if (CurToken().Is(Token::l_paren)) {
+    m_dil_lexer.Advance();
+    auto expr = ParseExpression();
+    Expect(Token::r_paren);
+    m_dil_lexer.Advance();
+    return expr;
+  }
+
+  BailOut(llvm::formatv("Unexpected token: {0}", CurToken()),
+          CurToken().GetLocation(), CurToken().GetSpelling().length());
+  return std::make_unique<ErrorNode>();
+}
+
+// Parse nested_name_specifier.
+//
+// nested_name_specifier:
+//    type_name "::"
+//    namespace_name "::"
+//    nested_name_specifier identifier "::"
+//
+std::string DILParser::ParseNestedNameSpecifier() {
+  // The first token in nested_name_specifier is always an identifier, or
+  // '(anonymous namespace)'.
+  switch (CurToken().GetKind()) {
+  case Token::l_paren: {
+    // Anonymous namespaces need to be treated specially: They are
+    // represented by the string '(anonymous namespace)', which has a
+    // space in it (throwing off normal parsing) and is not actually
+    // proper C++. Check to see if we're looking at
+    // '(anonymous namespace)::...'
+
+    // Look for all the pieces, in order:
+    // l_paren 'anonymous' 'namespace' r_paren coloncolon
+    if (m_dil_lexer.LookAhead(1).Is(Token::identifier) &&
+        (m_dil_lexer.LookAhead(1).GetSpelling() == "anonymous") &&
+        m_dil_lexer.LookAhead(2).Is(Token::identifier) &&
+        (m_dil_lexer.LookAhead(2).GetSpelling() == "namespace") &&
+        m_dil_lexer.LookAhead(3).Is(Token::r_paren) &&
+        m_dil_lexer.LookAhead(4).Is(Token::coloncolon)) {
+      m_dil_lexer.Advance(4);
+
+      assert(
+          (CurToken().Is(Token::identifier) || CurToken().Is(Token::l_paren)) &&
+          "Expected an identifier or anonymous namespace, but not found.");
+      // Continue parsing the nested_namespace_specifier.
+      std::string identifier2 = ParseNestedNameSpecifier();
+      if (identifier2.empty()) {
+        Expect(Token::identifier);
+        identifier2 = CurToken().GetSpelling();
+        m_dil_lexer.Advance();
+      }
+      return "(anonymous namespace)::" + identifier2;
+    }
+
+    return "";
+  } // end of special handling for '(anonymous namespace)'
+  case Token::identifier: {
+    // If the next token is scope ("::"), then this is indeed a
+    // nested_name_specifier
+    if (m_dil_lexer.LookAhead(1).Is(Token::coloncolon)) {
+      // This nested_name_specifier is a single identifier.
+      std::string identifier = CurToken().GetSpelling();
+      m_dil_lexer.Advance(1);
+      Expect(Token::coloncolon);
+      m_dil_lexer.Advance();
+      // Continue parsing the nested_name_specifier.
+      return identifier + "::" + ParseNestedNameSpecifier();
+    }
+
+    return "";
+  }
+  default:
+    return "";
+  }
+}
+
+// Parse an id_expression.
+//
+// id_expression:
+//    unqualified_id
+//    qualified_id
+//
+// qualified_id:
+//    ["::"] [nested_name_specifier] unqualified_id
+//    ["::"] identifier
+//
+// identifier:
+//    ? Token::identifier ?
+//
+std::string DILParser::ParseIdExpression() {
+  // Try parsing optional global scope operator.
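+  // (A leading "::", as in "::globalVar", anchors the lookup at the global
+  // scope rather than the innermost one.)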
+  bool global_scope = false;
+  if (CurToken().Is(Token::coloncolon)) {
+    global_scope = true;
+    m_dil_lexer.Advance();
+  }
+
+  // Try parsing optional nested_name_specifier.
+  std::string nested_name_specifier = ParseNestedNameSpecifier();
+
+  // If nested_name_specifier is present, then it's qualified_id production.
+  // Follow the first production rule.
+  if (!nested_name_specifier.empty()) {
+    // Parse unqualified_id and construct a fully qualified id expression.
+    auto unqualified_id = ParseUnqualifiedId();
+
+    return llvm::formatv("{0}{1}{2}", global_scope ? "::" : "",
+                         nested_name_specifier, unqualified_id);
+  }
+
+  // No nested_name_specifier, but with global scope -- this is also a
+  // qualified_id production. Follow the second production rule.
+  if (global_scope) {
+    Expect(Token::identifier);
+    std::string identifier = CurToken().GetSpelling();
+    m_dil_lexer.Advance();
+    return llvm::formatv("{0}{1}", global_scope ? "::" : "", identifier);
+  }
+
+  // This is unqualified_id production.
+  return ParseUnqualifiedId();
+}
+
+// Parse an unqualified_id.
+//
+// unqualified_id:
+//    identifier
+//
+// identifier:
+//    ? Token::identifier ?
+//
+std::string DILParser::ParseUnqualifiedId() {
+  Expect(Token::identifier);
+  std::string identifier = CurToken().GetSpelling();
+  m_dil_lexer.Advance();
+  return identifier;
+}
+
+void DILParser::BailOut(const std::string &error, uint32_t loc,
+                        uint16_t err_len) {
+  if (m_error)
+    // If error is already set, then the parser is in "bail-out" mode. Don't
+    // do anything and keep the original error.
+    return;
+
+  m_error =
+      llvm::make_error<DILDiagnosticError>(m_input_expr, error, loc, err_len);
+  // Advance the lexer token index to the end of the lexed tokens vector.
+  m_dil_lexer.ResetTokenIdx(m_dil_lexer.NumLexedTokens() - 1);
+}
+
+void DILParser::Expect(Token::Kind kind) {
+  if (CurToken().IsNot(kind)) {
+    BailOut(llvm::formatv("expected {0}, got: {1}", kind, CurToken()),
+            CurToken().GetLocation(), CurToken().GetSpelling().length());
+  }
+}
+
+} // namespace lldb_private::dil
diff --git a/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/Makefile b/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/Makefile
new file mode 100644
index 0000000000000..99998b20bcb05
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/TestFrameVarDILGlobalVariableLookup.py b/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/TestFrameVarDILGlobalVariableLookup.py
new file mode 100644
index 0000000000000..edb013c7b3526
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/TestFrameVarDILGlobalVariableLookup.py
@@ -0,0 +1,51 @@
+"""
+Make sure 'frame var' using DIL parser/evaluator works for global variables.
+"""
+
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+from lldbsuite.test import lldbutil
+
+import os
+import shutil
+import time
+
+
+class TestFrameVarDILGlobalVariableLookup(TestBase):
+    # If your test case doesn't stress debug info, then
+    # set this to true. That way it won't be run once for
+    # each debug info format.
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def test_frame_var(self):
+        self.build()
+        lldbutil.run_to_source_breakpoint(
+            self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp")
+        )
+
+        self.runCmd("settings set target.experimental.use-DIL true")
+        self.expect_var_path("globalVar", type="int", value="-559038737")  # 0xDEADBEEF
+        self.expect_var_path("globalPtr", type="int *")
+        self.expect_var_path("globalRef", type="int &")
+        self.expect_var_path("::globalVar", value="-559038737")
+        self.expect_var_path("::globalPtr", type="int *")
+        self.expect_var_path("::globalRef", type="int &")
+
+        self.expect(
+            "frame variable 'externGlobalVar'",
+            error=True,
+            substrs=["use of undeclared identifier"],
+        )  # 0x00C0FFEE
+        self.expect(
+            "frame variable '::externGlobalVar'",
+            error=True,
+            substrs=["use of undeclared identifier"],
+        )  # ["12648430"])
+
+        self.expect_var_path("ns::globalVar", value="13")
+        self.expect_var_path("ns::globalPtr", type="int *")
+        self.expect_var_path("ns::globalRef", type="int &")
+        self.expect_var_path("::ns::globalVar", value="13")
+        self.expect_var_path("::ns::globalPtr", type="int *")
+        self.expect_var_path("::ns::globalRef", type="int &")
diff --git a/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/main.cpp
new file mode 100644
index 0000000000000..5bae4fd423e32
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/GlobalVariableLookup/main.cpp
@@ -0,0 +1,15 @@
+int globalVar = 0xDEADBEEF;
+extern int externGlobalVar;
+
+int *globalPtr = &globalVar;
+int &globalRef = globalVar;
+
+namespace ns {
+int globalVar = 13;
+int *globalPtr = &globalVar;
+int &globalRef = globalVar;
+} // namespace ns
+
+int main(int argc, char **argv) {
+  return 0; // Set a breakpoint here
+}
diff --git a/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/Makefile b/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/Makefile
new file mode 100644
index 0000000000000..99998b20bcb05
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/TestFrameVarDILInstanceVariables.py b/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/TestFrameVarDILInstanceVariables.py
new file mode 100644
index 0000000000000..cf230928fc117
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/TestFrameVarDILInstanceVariables.py
@@ -0,0 +1,29 @@
+"""
+Make sure 'frame var' using DIL parser/evaluator works for instance variables.
+"""
+
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+from lldbsuite.test import lldbutil
+
+import os
+import shutil
+import time
+
+
+class TestFrameVarDILInstanceVariables(TestBase):
+    # If your test case doesn't stress debug info, then
+    # set this to true.  That way it won't be run once for
+    # each debug info format.
+    NO_DEBUG_INFO_TESTCASE = True
+
+    def test_frame_var(self):
+        self.build()
+        lldbutil.run_to_source_breakpoint(
+            self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp")
+        )
+
+        self.runCmd("settings set target.experimental.use-DIL true")
+        self.expect_var_path("this", type="TestMethods *")
+        self.expect_var_path("c", children=[ValueCheck(name="field_", value="-1")])
diff --git a/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/main.cpp
new file mode 100644
index 0000000000000..7a559c4007415
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/InstanceVariables/main.cpp
@@ -0,0 +1,23 @@
+#include 
+
+class C {
+public:
+  int field_ = 1337;
+};
+
+class TestMethods {
+public:
+  void TestInstanceVariables() {
+    C c;
+    c.field_ = -1;
+
+    return; // Set a breakpoint here
+  }
+};
+
+int main(int argc, char **argv) {
+  TestMethods tm;
+
+  tm.TestInstanceVariables();
+  return 0;
+}
diff --git a/lldb/test/API/commands/frame/var-dil/basics/LocalVars/Makefile b/lldb/test/API/commands/frame/var-dil/basics/LocalVars/Makefile
new file mode 100644
index 0000000000000..99998b20bcb05
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/LocalVars/Makefile
@@ -0,0 +1,3 @@
+CXX_SOURCES := main.cpp
+
+include Makefile.rules
diff --git a/lldb/test/API/commands/frame/var-dil/basics/LocalVars/TestFrameVarDILLocalVars.py b/lldb/test/API/commands/frame/var-dil/basics/LocalVars/TestFrameVarDILLocalVars.py
new file mode 100644
index 0000000000000..0f6618fe47984
--- /dev/null
+++ b/lldb/test/API/commands/frame/var-dil/basics/LocalVars/TestFrameVarDILLocalVars.py
@@ -0,0 +1,31 @@
+"""
+Make sure 'frame var' using DIL parser/evaluator works for local variables.
+"""
+
+import lldb
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test.decorators import *
+from lldbsuite.test import lldbutil
+
+import os
+import shutil
+import time
+
+
+class TestFrameVarDILLocalVars(TestBase):
+    # If your test case doesn't stress debug info, then
+    # set this to true.  That way it won't be run once for
+    # each debug info format.
+ NO_DEBUG_INFO_TESTCASE = True + + def test_frame_var(self): + self.build() + lldbutil.run_to_source_breakpoint( + self, "Set a breakpoint here", lldb.SBFileSpec("main.cpp") + ) + + self.runCmd("settings set target.experimental.use-DIL true") + self.expect_var_path("a", value="1") + self.expect_var_path("b", value="2") + self.expect_var_path("c", value="'\\xfd'") + self.expect_var_path("s", value="4") diff --git a/lldb/test/API/commands/frame/var-dil/basics/LocalVars/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/LocalVars/main.cpp new file mode 100644 index 0000000000000..04c73539c5f89 --- /dev/null +++ b/lldb/test/API/commands/frame/var-dil/basics/LocalVars/main.cpp @@ -0,0 +1,9 @@ +int main(int argc, char **argv) { + int a = 1; + int b = 2; + + char c = -3; + unsigned short s = 4; + + return 0; // Set a breakpoint here +} diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp index 9e5b8efd7af80..9afa957901ae7 100644 --- a/lldb/unittests/ValueObject/DILLexerTests.cpp +++ b/lldb/unittests/ValueObject/DILLexerTests.cpp @@ -54,9 +54,9 @@ TEST(DILLexerTests, TokenKindTest) { EXPECT_TRUE(token.Is(Token::identifier)); EXPECT_FALSE(token.Is(Token::l_paren)); - EXPECT_TRUE(token.IsOneOf(Token::eof, Token::identifier)); - EXPECT_FALSE(token.IsOneOf(Token::l_paren, Token::r_paren, Token::coloncolon, - Token::eof)); + EXPECT_TRUE(token.IsOneOf({Token::eof, Token::identifier})); + EXPECT_FALSE(token.IsOneOf( + {Token::l_paren, Token::r_paren, Token::coloncolon, Token::eof})); } TEST(DILLexerTests, LookAheadTest) { @@ -150,7 +150,7 @@ TEST(DILLexerTests, IdentifiersTest) { DILLexer lexer(*maybe_lexer); Token token = lexer.GetCurrentToken(); EXPECT_TRUE(token.IsNot(Token::identifier)); - EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren, - Token::r_paren)); + EXPECT_TRUE(token.IsOneOf( + {Token::eof, Token::coloncolon, Token::l_paren, Token::r_paren})); } } From 92c93f5286b9ff33f27ff694d2dc33da1c07afdd Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 3 Apr 2025 22:11:48 -0700 Subject: [PATCH 0612/1029] [MC] Merge MCAsmLexer and AsmLexer Follow-up to #134207 Both classes define `IsAtStartOfStatement` but the semantics are confusingly different. Rename the base class one. --- llvm/include/llvm/MC/MCParser/AsmLexer.h | 67 +++++++-------------- llvm/include/llvm/MC/MCParser/MCAsmParser.h | 1 - llvm/lib/MC/MCParser/AsmLexer.cpp | 10 +-- 3 files changed, 25 insertions(+), 53 deletions(-) diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index d1599dc47e76b..bf4aa09df613f 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -39,17 +39,24 @@ class AsmCommentConsumer { virtual void HandleComment(SMLoc Loc, StringRef CommentText) = 0; }; -/// Generic assembler lexer interface, for use by target specific assembly -/// lexers. -class MCAsmLexer { +class AsmLexer { /// The current token, stored in the base class for faster access. SmallVector CurTok; + const char *CurPtr = nullptr; + StringRef CurBuf; + /// The location and description of the current error SMLoc ErrLoc; std::string Err; -protected: // Can only create subclasses. 
+ const MCAsmInfo &MAI; + + bool IsAtStartOfLine = true; + bool AtStartOfStatement = true; + bool IsPeeking = false; + bool EndStatementAtEOF = true; + const char *TokStart = nullptr; bool SkipSpace = true; bool AllowAtInIdentifier = false; @@ -65,9 +72,7 @@ class MCAsmLexer { bool LexHLASMStrings = false; AsmCommentConsumer *CommentConsumer = nullptr; - MCAsmLexer(); - - virtual AsmToken LexToken() = 0; + AsmToken LexToken(); void SetError(SMLoc errLoc, const std::string &err) { ErrLoc = errLoc; @@ -75,9 +80,9 @@ class MCAsmLexer { } public: - MCAsmLexer(const MCAsmLexer &) = delete; - MCAsmLexer &operator=(const MCAsmLexer &) = delete; - virtual ~MCAsmLexer(); + AsmLexer(const MCAsmInfo &MAI); + AsmLexer(const AsmLexer &) = delete; + AsmLexer &operator=(const AsmLexer &) = delete; /// Consume the next token from the input stream and return it. /// @@ -86,7 +91,7 @@ class MCAsmLexer { const AsmToken &Lex() { assert(!CurTok.empty()); // Mark if we parsing out a EndOfStatement. - IsAtStartOfStatement = CurTok.front().getKind() == AsmToken::EndOfStatement; + AtStartOfStatement = CurTok.front().getKind() == AsmToken::EndOfStatement; CurTok.erase(CurTok.begin()); // LexToken may generate multiple tokens via UnLex but will always return // the first one. Place returned value at head of CurTok vector. @@ -98,16 +103,16 @@ class MCAsmLexer { } void UnLex(AsmToken const &Token) { - IsAtStartOfStatement = false; + AtStartOfStatement = false; CurTok.insert(CurTok.begin(), Token); } - bool isAtStartOfStatement() { return IsAtStartOfStatement; } + bool isAtStartOfStatement() { return AtStartOfStatement; } - virtual StringRef LexUntilEndOfStatement() = 0; + StringRef LexUntilEndOfStatement(); /// Get the current source location. - SMLoc getLoc() const; + SMLoc getLoc() const { return SMLoc::getFromPointer(TokStart); } /// Get the current (last) lexed token. const AsmToken &getTok() const { return CurTok[0]; } @@ -126,8 +131,7 @@ class MCAsmLexer { } /// Look ahead an arbitrary number of tokens. - virtual size_t peekTokens(MutableArrayRef Buf, - bool ShouldSkipSpace = true) = 0; + size_t peekTokens(MutableArrayRef Buf, bool ShouldSkipSpace = true); /// Get the current error location SMLoc getErrLoc() { return ErrLoc; } @@ -185,37 +189,10 @@ class MCAsmLexer { /// setting this option to true, will disable lexing for character and string /// literals. void setLexHLASMStrings(bool V) { LexHLASMStrings = V; } -}; - -/// AsmLexer - Lexer class for assembly files. -class AsmLexer final : public MCAsmLexer { - const MCAsmInfo &MAI; - - const char *CurPtr = nullptr; - StringRef CurBuf; - bool IsAtStartOfLine = true; - bool IsAtStartOfStatement = true; - bool IsPeeking = false; - bool EndStatementAtEOF = true; - -protected: - /// LexToken - Read the next token and return its code. 
- AsmToken LexToken() override; - -public: - AsmLexer(const MCAsmInfo &MAI); - AsmLexer(const AsmLexer &) = delete; - AsmLexer &operator=(const AsmLexer &) = delete; - ~AsmLexer() override; void setBuffer(StringRef Buf, const char *ptr = nullptr, bool EndStatementAtEOF = true); - StringRef LexUntilEndOfStatement() override; - - size_t peekTokens(MutableArrayRef Buf, - bool ShouldSkipSpace = true) override; - const MCAsmInfo &getMAI() const { return MAI; } private: @@ -237,6 +214,8 @@ class AsmLexer final : public MCAsmLexer { StringRef LexUntilEndOfLine(); }; +using MCAsmLexer = AsmLexer; + } // end namespace llvm #endif // LLVM_MC_MCPARSER_ASMLEXER_H diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index c65a38c944eea..7cdd99a207468 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -24,7 +24,6 @@ namespace llvm { -class MCAsmLexer; class MCAsmInfo; class MCAsmParserExtension; class MCExpr; diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp index 3c911dba8cc26..b91f1f544d29c 100644 --- a/llvm/lib/MC/MCParser/AsmLexer.cpp +++ b/llvm/lib/MC/MCParser/AsmLexer.cpp @@ -111,21 +111,15 @@ void AsmToken::dump(raw_ostream &OS) const { OS << "\")"; } -MCAsmLexer::MCAsmLexer() { CurTok.emplace_back(AsmToken::Space, StringRef()); } - -MCAsmLexer::~MCAsmLexer() = default; - -SMLoc MCAsmLexer::getLoc() const { return SMLoc::getFromPointer(TokStart); } - AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) { // For COFF targets, this is true, while for ELF targets, it should be false. // Currently, @specifier parsing depends on '@' being included in the token. AllowAtInIdentifier = !StringRef(MAI.getCommentString()).starts_with("@") && MAI.useAtForSpecifier(); LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers(); -} -AsmLexer::~AsmLexer() = default; + CurTok.emplace_back(AsmToken::Space, StringRef()); +} void AsmLexer::setBuffer(StringRef Buf, const char *ptr, bool EndStatementAtEOF) { From 92923e517c2926eb94b7b6e403433ecf62953186 Mon Sep 17 00:00:00 2001 From: Iris <0.0@owo.li> Date: Fri, 4 Apr 2025 14:00:04 +0800 Subject: [PATCH 0613/1029] [mlir][llvm] Add `LLVM_DependentLibrariesAttr` (#133385) https://llvm.org/docs/LangRef.html#dependent-libs-named-metadata --------- Co-authored-by: Tobias Gysi --- .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 19 +++++++++++++++++++ .../mlir/Dialect/LLVMIR/LLVMDialect.td | 5 +++++ .../include/mlir/Target/LLVMIR/ModuleImport.h | 4 ++++ .../mlir/Target/LLVMIR/ModuleTranslation.h | 3 +++ mlir/lib/Target/LLVMIR/ModuleImport.cpp | 19 +++++++++++++++++++ mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 18 ++++++++++++++++++ .../Import/metadata-dependent-libraries.ll | 6 ++++++ mlir/test/Target/LLVMIR/llvmir.mlir | 8 ++++++++ 8 files changed, 82 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/Import/metadata-dependent-libraries.ll diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index 41c30b81770bc..549a37de2e412 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -1327,4 +1327,23 @@ def ModuleFlagAttr let assemblyFormat = "`<` $behavior `,` $key `,` $value `>`"; } +//===----------------------------------------------------------------------===// +// LLVM_DependentLibrariesAttr +//===----------------------------------------------------------------------===// + +def LLVM_DependentLibrariesAttr + : 
LLVM_Attr<"DependentLibraries", "dependent_libraries"> {
+  let summary = "LLVM dependent libraries attribute";
+  let description = [{
+    Represents the list of dependent libraries for the current module.
+    This attribute specifies the libraries that the module depends on,
+    and it can be used for linking purposes.
+
+    See the following link for more details:
+    https://llvm.org/docs/LangRef.html#dependent-libs-named-metadata
+  }];
+  let parameters = (ins OptionalArrayRefParameter<"StringAttr">:$libs);
+  let assemblyFormat = "`<` $libs `>`";
+}
+
 #endif // LLVMIR_ATTRDEFS
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
index 46fae44f7b0fa..7f8b3aa833a37 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
@@ -83,6 +83,11 @@ def LLVM_Dialect : Dialect {
       return "llvm.emit_c_interface";
     }
 
+    /// Name of the dependent libraries attribute.
+    static StringRef getDependentLibrariesAttrName() {
+      return "llvm.dependent_libraries";
+    }
+
     /// Returns `true` if the given type is compatible with the LLVM dialect.
    static bool isCompatibleType(Type);
 
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
index 3b164927d41fd..3dc848c413905 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h
@@ -229,6 +229,10 @@ class ModuleImport {
   /// attribute.
   LogicalResult convertCommandlineMetadata();
 
+  /// Converts !llvm.dependent-libraries metadata to the
+  /// llvm.dependent_libraries LLVM ModuleOp attribute.
+  LogicalResult convertDependentLibrariesMetadata();
+
   /// Converts all LLVM metadata nodes that translate to attributes such as
   /// alias analysis or access group metadata, and builds a map from the
   /// metadata nodes to the converted attributes.
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 01dda6238d8f3..99b1b65aeb6a5 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -363,6 +363,9 @@ class ModuleTranslation {
   /// Process the llvm.commandline LLVM Metadata, if it exists.
   LogicalResult createCommandlineMetadata();
 
+  /// Process the llvm.dependent_libraries LLVM Metadata, if it exists.
+  LogicalResult createDependentLibrariesMetadata();
+
   /// Translates dialect attributes attached to the given operation.
LogicalResult convertDialectAttributes(Operation *op, diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index ea141d8b07284..5f047a59a9828 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -563,6 +563,23 @@ LogicalResult ModuleImport::convertLinkerOptionsMetadata() { return success(); } +LogicalResult ModuleImport::convertDependentLibrariesMetadata() { + for (const llvm::NamedMDNode &named : llvmModule->named_metadata()) { + if (named.getName() != "llvm.dependent-libraries") + continue; + SmallVector libraries; + for (const llvm::MDNode *node : named.operands()) { + if (node->getNumOperands() == 1) + if (auto *mdString = dyn_cast(node->getOperand(0))) + libraries.push_back(mdString->getString()); + } + if (!libraries.empty()) + mlirModule->setAttr(LLVM::LLVMDialect::getDependentLibrariesAttrName(), + builder.getStrArrayAttr(libraries)); + } + return success(); +} + LogicalResult ModuleImport::convertIdentMetadata() { for (const llvm::NamedMDNode &named : llvmModule->named_metadata()) { // llvm.ident should have a single operand. That operand is itself an @@ -625,6 +642,8 @@ LogicalResult ModuleImport::convertMetadata() { } if (failed(convertLinkerOptionsMetadata())) return failure(); + if (failed(convertDependentLibrariesMetadata())) + return failure(); if (failed(convertModuleFlagsMetadata())) return failure(); if (failed(convertIdentMetadata())) diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 1e2f2c0468045..d30cb8a7d7974 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -2036,6 +2036,22 @@ LogicalResult ModuleTranslation::createCommandlineMetadata() { return success(); } +LogicalResult ModuleTranslation::createDependentLibrariesMetadata() { + if (auto dependentLibrariesAttr = mlirModule->getDiscardableAttr( + LLVM::LLVMDialect::getDependentLibrariesAttrName())) { + auto *nmd = + llvmModule->getOrInsertNamedMetadata("llvm.dependent-libraries"); + llvm::LLVMContext &ctx = llvmModule->getContext(); + for (auto libAttr : + cast(dependentLibrariesAttr).getAsRange()) { + auto *md = + llvm::MDNode::get(ctx, llvm::MDString::get(ctx, libAttr.getValue())); + nmd->addOperand(md); + } + } + return success(); +} + void ModuleTranslation::setLoopMetadata(Operation *op, llvm::Instruction *inst) { LoopAnnotationAttr attr = @@ -2201,6 +2217,8 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, return nullptr; if (failed(translator.createCommandlineMetadata())) return nullptr; + if (failed(translator.createDependentLibrariesMetadata())) + return nullptr; // Convert other top-level operations if possible. 
for (Operation &o : getModuleBody(module).getOperations()) { diff --git a/mlir/test/Target/LLVMIR/Import/metadata-dependent-libraries.ll b/mlir/test/Target/LLVMIR/Import/metadata-dependent-libraries.ll new file mode 100644 index 0000000000000..4a6d438046a36 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/metadata-dependent-libraries.ll @@ -0,0 +1,6 @@ +; RUN: mlir-translate -import-llvm %s | FileCheck %s + +; CHECK: llvm.dependent_libraries = ["foo", "bar"] +!llvm.dependent-libraries = !{!0, !1} +!0 = !{!"foo"} +!1 = !{!"bar"} diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index cbd41efdc3015..4dccf0dbdcf2d 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -2825,6 +2825,14 @@ module { // ----- +module attributes {llvm.dependent_libraries = ["foo", "bar"]} {} + +// CHECK: !llvm.dependent-libraries = !{![[#LIBFOO:]], ![[#LIBBAR:]]} +// CHECK: ![[#LIBFOO]] = !{!"foo"} +// CHECK: ![[#LIBBAR]] = !{!"bar"} + +// ----- + llvm.mlir.global external constant @const() {addr_space = 0 : i32, dso_local} : i32 { %0 = llvm.mlir.addressof @const : !llvm.ptr %1 = llvm.ptrtoint %0 : !llvm.ptr to i64 From d579622b1e1a6bc59fbe0135ab30fc0fd9849882 Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Fri, 4 Apr 2025 16:18:01 +0900 Subject: [PATCH 0614/1029] [clang][CGObjC] Prefer PointerType::get with LLVMContext over Type (NFC) (#133871) Part of #123569 --- clang/lib/CodeGen/CGObjCMac.cpp | 55 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 639c38e7c4555..98f988dfecf84 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -2011,7 +2011,7 @@ CodeGen::RValue CGObjCMac::GenerateMessageSendSuper( CGF.Builder.CreateStructGEP(ObjCSuper, 0)); // If this is a class message the metaclass is passed as the target. 
- llvm::Type *ClassTyPtr = llvm::PointerType::getUnqual(ObjCTypes.ClassTy); + llvm::Type *ClassTyPtr = llvm::PointerType::getUnqual(VMContext); llvm::Value *Target; if (IsClassMessage) { if (isCategoryImpl) { @@ -5657,7 +5657,7 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) IvarOffsetVarTy = LongTy; ObjectPtrTy = cast(Types.ConvertType(Ctx.getObjCIdType())); - PtrObjectPtrTy = llvm::PointerType::getUnqual(ObjectPtrTy); + PtrObjectPtrTy = llvm::PointerType::getUnqual(VMContext); SelectorPtrTy = cast(Types.ConvertType(Ctx.getObjCSelType())); @@ -5688,7 +5688,7 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) SuperPtrCTy = Ctx.getPointerType(SuperCTy); SuperTy = cast(Types.ConvertType(SuperCTy)); - SuperPtrTy = llvm::PointerType::getUnqual(SuperTy); + SuperPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _prop_t { // char *name; @@ -5704,7 +5704,7 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) PropertyListTy = llvm::StructType::create( "struct._prop_list_t", IntTy, IntTy, llvm::ArrayType::get(PropertyTy, 0)); // struct _prop_list_t * - PropertyListPtrTy = llvm::PointerType::getUnqual(PropertyListTy); + PropertyListPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _objc_method { // SEL _cmd; @@ -5716,7 +5716,7 @@ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) // struct _objc_cache * CacheTy = llvm::StructType::create(VMContext, "struct._objc_cache"); - CachePtrTy = llvm::PointerType::getUnqual(CacheTy); + CachePtrTy = llvm::PointerType::getUnqual(VMContext); } ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) @@ -5737,8 +5737,7 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) llvm::ArrayType::get(MethodDescriptionTy, 0)); // struct _objc_method_description_list * - MethodDescriptionListPtrTy = - llvm::PointerType::getUnqual(MethodDescriptionListTy); + MethodDescriptionListPtrTy = llvm::PointerType::getUnqual(VMContext); // Protocol description structures @@ -5756,7 +5755,7 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) PropertyListPtrTy); // struct _objc_protocol_extension * - ProtocolExtensionPtrTy = llvm::PointerType::getUnqual(ProtocolExtensionTy); + ProtocolExtensionPtrTy = llvm::PointerType::getUnqual(VMContext); // Handle construction of Protocol and ProtocolList types @@ -5779,9 +5778,9 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) "struct._objc_protocol_list"); // struct _objc_protocol_list * - ProtocolListPtrTy = llvm::PointerType::getUnqual(ProtocolListTy); + ProtocolListPtrTy = llvm::PointerType::getUnqual(VMContext); - ProtocolPtrTy = llvm::PointerType::getUnqual(ProtocolTy); + ProtocolPtrTy = llvm::PointerType::getUnqual(VMContext); // Class description structures @@ -5795,17 +5794,17 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) // struct _objc_ivar_list * IvarListTy = llvm::StructType::create(VMContext, "struct._objc_ivar_list"); - IvarListPtrTy = llvm::PointerType::getUnqual(IvarListTy); + IvarListPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _objc_method_list * MethodListTy = llvm::StructType::create(VMContext, "struct._objc_method_list"); - MethodListPtrTy = llvm::PointerType::getUnqual(MethodListTy); + MethodListPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _objc_class_extension * ClassExtensionTy = llvm::StructType::create( "struct._objc_class_extension", IntTy, Int8PtrTy, PropertyListPtrTy); - ClassExtensionPtrTy = 
llvm::PointerType::getUnqual(ClassExtensionTy); + ClassExtensionPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _objc_class { // Class isa; @@ -5828,7 +5827,7 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) Int8PtrTy, ClassExtensionPtrTy}, "struct._objc_class"); - ClassPtrTy = llvm::PointerType::getUnqual(ClassTy); + ClassPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _objc_category { // char *category_name; @@ -5857,7 +5856,7 @@ ObjCTypesHelper::ObjCTypesHelper(CodeGen::CodeGenModule &cgm) SymtabTy = llvm::StructType::create("struct._objc_symtab", LongTy, SelectorPtrTy, ShortTy, ShortTy, llvm::ArrayType::get(Int8PtrTy, 0)); - SymtabPtrTy = llvm::PointerType::getUnqual(SymtabTy); + SymtabPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _objc_module { // long version; @@ -5892,7 +5891,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( llvm::StructType::create("struct.__method_list_t", IntTy, IntTy, llvm::ArrayType::get(MethodTy, 0)); // struct method_list_t * - MethodListnfABIPtrTy = llvm::PointerType::getUnqual(MethodListnfABITy); + MethodListnfABIPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _protocol_t { // id isa; // NULL @@ -5918,7 +5917,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( PropertyListPtrTy); // struct _protocol_t* - ProtocolnfABIPtrTy = llvm::PointerType::getUnqual(ProtocolnfABITy); + ProtocolnfABIPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _protocol_list_t { // long protocol_count; // Note, this is 32/64 bit @@ -5929,7 +5928,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( "struct._objc_protocol_list"); // struct _objc_protocol_list* - ProtocolListnfABIPtrTy = llvm::PointerType::getUnqual(ProtocolListnfABITy); + ProtocolListnfABIPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _ivar_t { // unsigned [long] int *offset; // pointer to ivar offset location @@ -5939,8 +5938,8 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( // uint32_t size; // } IvarnfABITy = llvm::StructType::create( - "struct._ivar_t", llvm::PointerType::getUnqual(IvarOffsetVarTy), - Int8PtrTy, Int8PtrTy, IntTy, IntTy); + "struct._ivar_t", llvm::PointerType::getUnqual(VMContext), Int8PtrTy, + Int8PtrTy, IntTy, IntTy); // struct _ivar_list_t { // uint32 entsize; // sizeof(struct _ivar_t) @@ -5951,7 +5950,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( llvm::StructType::create("struct._ivar_list_t", IntTy, IntTy, llvm::ArrayType::get(IvarnfABITy, 0)); - IvarListnfABIPtrTy = llvm::PointerType::getUnqual(IvarListnfABITy); + IvarListnfABIPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _class_ro_t { // uint32_t const flags; @@ -5987,12 +5986,12 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( ClassnfABITy = llvm::StructType::create( {llvm::PointerType::getUnqual(VMContext), llvm::PointerType::getUnqual(VMContext), CachePtrTy, - llvm::PointerType::getUnqual(ImpnfABITy), - llvm::PointerType::getUnqual(ClassRonfABITy)}, + llvm::PointerType::getUnqual(VMContext), + llvm::PointerType::getUnqual(VMContext)}, "struct._class_t"); // LLVM for struct _class_t * - ClassnfABIPtrTy = llvm::PointerType::getUnqual(ClassnfABITy); + ClassnfABIPtrTy = llvm::PointerType::getUnqual(VMContext); // struct _category_t { // const char * const name; @@ -6036,7 +6035,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper( MessageRefTy = cast(Types.ConvertType(MessageRefCTy)); // MessageRefPtrTy - LLVM for struct _message_ref_t* - 
MessageRefPtrTy = llvm::PointerType::getUnqual(MessageRefTy);
+  MessageRefPtrTy = llvm::PointerType::getUnqual(VMContext);
 
   // SuperMessageRefTy - LLVM for:
   //   struct _super_message_ref_t {
@@ -6047,7 +6046,7 @@ ObjCNonFragileABITypesHelper::ObjCNonFragileABITypesHelper(
                                               ImpnfABITy, SelectorPtrTy);
 
   // SuperMessageRefPtrTy - LLVM for struct _super_message_ref_t*
-  SuperMessageRefPtrTy = llvm::PointerType::getUnqual(SuperMessageRefTy);
+  SuperMessageRefPtrTy = llvm::PointerType::getUnqual(VMContext);
 
   // struct objc_typeinfo {
   //   const void** vtable; // objc_ehtype_vtable + 2
   //   const char* name;    // c++ typeinfo string
   //   Class        cls;
   // };
   EHTypeTy = llvm::StructType::create("struct._objc_typeinfo",
-                                      llvm::PointerType::getUnqual(Int8PtrTy),
+                                      llvm::PointerType::getUnqual(VMContext),
                                       Int8PtrTy, ClassnfABIPtrTy);
-  EHTypePtrTy = llvm::PointerType::getUnqual(EHTypeTy);
+  EHTypePtrTy = llvm::PointerType::getUnqual(VMContext);
 }
 
 llvm::Function *CGObjCNonFragileABIMac::ModuleInitFunction() {

From 78a4b9d9b46d59e20cf01d6008ab0080052e9938 Mon Sep 17 00:00:00 2001
From: Mats Jun Larsen
Date: Fri, 4 Apr 2025 16:18:13 +0900
Subject: [PATCH 0615/1029] [polly] Prefer PointerType::get with LLVMContext
 over Type (NFC) (#133868)

Part of #123569

---
 polly/lib/CodeGen/LoopGeneratorsGOMP.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
index b98416a92097f..61c153d2ccfa5 100644
--- a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
+++ b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
@@ -30,13 +30,9 @@ void ParallelLoopGeneratorGOMP::createCallSpawnThreads(Value *SubFn,
   if (!F) {
     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
 
-    Type *Params[] = {PointerType::getUnqual(FunctionType::get(
-                          Builder.getVoidTy(), Builder.getPtrTy(), false)),
-                      Builder.getPtrTy(),
-                      Builder.getInt32Ty(),
-                      LongType,
-                      LongType,
-                      LongType};
+    Type *Params[] = {
+        Builder.getPtrTy(), Builder.getPtrTy(), Builder.getInt32Ty(),
+        LongType,           LongType,           LongType};
 
     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
     F = Function::Create(Ty, Linkage, Name, M);

From 22130ca486b2fb43198453a53eb048cd8a97e9a8 Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva
Date: Fri, 4 Apr 2025 09:37:28 +0200
Subject: [PATCH 0616/1029] [MS][clang] Fix crash on deletion of array of
 pointers (#134088)

Sometimes a non-array delete is treated as delete[] when the input
pointer is a pointer to an array. With vector deleting destructor
support we now generate a virtual destructor call instead of a simple
loop over the elements. This patch adjusts the code path that generates
the virtual call to expect the case of a pointer to an array.
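
As a rough illustration (a minimal sketch distilled from the test added
below, reusing the test's `S` and `AllocatedAsArray` names), the pattern
that used to crash is a scalar delete applied through a pointer to an
array of polymorphic objects:

  template <typename C> struct S {
    void foo() {
      void *p = new C(); // with C = AllocatedAsArray[1][3], an array new
      delete (C *)p;     // scalar delete through a pointer-to-array type
    }
  };

Codegen now looks through the array type to find the polymorphic element
class before emitting the virtual (vector deleting) destructor call.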
--- clang/lib/AST/Expr.cpp | 3 ++ clang/lib/CodeGen/MicrosoftCXXABI.cpp | 3 ++ .../microsoft-vector-deleting-dtors.cpp | 47 +++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 389fa70a61b4b..e8e5f2fa0cc12 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -71,6 +71,9 @@ const CXXRecordDecl *Expr::getBestDynamicClassType() const { if (const PointerType *PTy = DerivedType->getAs()) DerivedType = PTy->getPointeeType(); + while (const ArrayType *ATy = DerivedType->getAsArrayTypeUnsafe()) + DerivedType = ATy->getElementType(); + if (DerivedType->isDependentType()) return nullptr; diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 91689bb2ec75f..ba5f74f153d59 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -2034,6 +2034,9 @@ llvm::Value *MicrosoftCXXABI::EmitVirtualDestructorCall( ThisTy = D->getDestroyedType(); } + while (const ArrayType *ATy = Context.getAsArrayType(ThisTy)) + ThisTy = ATy->getElementType(); + This = adjustThisArgumentForVirtualFunctionCall(CGF, GD, This, true); RValue RV = CGF.EmitCXXDestructorCall(GD, Callee, This.emitRawPointer(CGF), ThisTy, diff --git a/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp index 439ff84456033..9d23708602a43 100644 --- a/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp +++ b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors.cpp @@ -35,6 +35,10 @@ void operator delete(void *p) { i-=2; } void operator delete[](void *p) { i--; } }; +struct AllocatedAsArray : public Bird { + +}; + // Vector deleting dtor for Bird is an alias because no new Bird[] expressions // in the TU. 
// X64: @"??_EBird@@UEAAPEAXI@Z" = weak dso_local unnamed_addr alias ptr (ptr, i32), ptr @"??_GBird@@UEAAPEAXI@Z" @@ -55,6 +59,14 @@ Bird* alloc() { return P; } + +template +struct S { + void foo() { void *p = new C(); delete (C *)p; } +}; + +S sp; + void bar() { dealloc(alloc()); @@ -63,6 +75,8 @@ void bar() { Bird *p = new HasOperatorDelete[2]; dealloc(p); + + sp.foo(); } // CHECK-LABEL: define dso_local void @{{.*}}dealloc{{.*}}( @@ -99,6 +113,36 @@ void bar() { // CHECK: delete.end: // CHECK-NEXT: ret void +// Definition of S::foo, check that it has vector deleting destructor call +// X64-LABEL: define linkonce_odr dso_local void @"?foo@?$S@$$BY102UAllocatedAsArray@@@@QEAAXXZ" +// X86-LABEL: define linkonce_odr dso_local x86_thiscallcc void @"?foo@?$S@$$BY102UAllocatedAsArray@@@@QAEXXZ" +// CHECK: delete.notnull: ; preds = %arrayctor.cont +// CHECK-NEXT: %[[DEL_PTR:.*]] = getelementptr inbounds [1 x [3 x %struct.AllocatedAsArray]], ptr %[[THE_ARRAY:.*]], i32 0, i32 0 +// X64-NEXT: %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[DEL_PTR]], i64 -8 +// X86-NEXT: %[[COOKIEGEP:.*]] = getelementptr inbounds i8, ptr %[[DEL_PTR]], i32 -4 +// X64-NEXT: %[[HOWMANY:.*]] = load i64, ptr %[[COOKIEGEP]] +// X86-NEXT: %[[HOWMANY:.*]] = load i32, ptr %[[COOKIEGEP]] +// X64-NEXT: %[[ISNOELEM:.*]] = icmp eq i64 %[[HOWMANY]], 0 +// X86-NEXT: %[[ISNOELEM:.*]] = icmp eq i32 %[[HOWMANY]], 0 +// CHECK-NEXT: br i1 %[[ISNOELEM]], label %vdtor.nocall, label %vdtor.call +// CHECK: vdtor.nocall: ; preds = %delete.notnull +// X64-NEXT: %[[HOWMANYBYTES:.*]] = mul i64 8, %[[HOWMANY]] +// X86-NEXT: %[[HOWMANYBYTES:.*]] = mul i32 4, %[[HOWMANY]] +// X64-NEXT: %[[ADDCOOKIESIZE:.*]] = add i64 %[[HOWMANYBYTES]], 8 +// X86-NEXT: %[[ADDCOOKIESIZE:.*]] = add i32 %[[HOWMANYBYTES]], 4 +// X64-NEXT: call void @"??_V@YAXPEAX_K@Z"(ptr noundef %[[COOKIEGEP]], i64 noundef %[[ADDCOOKIESIZE]]) +// X86-NEXT: call void @"??_V@YAXPAXI@Z"(ptr noundef %[[COOKIEGEP]], i32 noundef %[[ADDCOOKIESIZE]]) +// CHECK-NEXT: br label %delete.end +// CHECK: vdtor.call: ; preds = %delete.notnull +// CHECK-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[DEL_PTR]] +// CHECK-NEXT: %[[FPGEP:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i64 0 +// CHECK-NEXT: %[[FPLOAD:.*]] = load ptr, ptr %[[FPGEP]] +// X64-NEXT: %[[CALL:.*]] = call noundef ptr %[[FPLOAD]](ptr noundef nonnull align 8 dereferenceable(8) %[[DEL_PTR]], i32 noundef 3) +// X86-NEXT: %[[CALL:.*]] = call x86_thiscallcc noundef ptr %[[FPLOAD]](ptr noundef nonnull align 4 dereferenceable(4) %[[DEL_PTR]], i32 noundef 3) +// CHECK-NEXT: br label %delete.end +// CHECK: delete.end: +// CHECK-NEXT: ret void + // Vector dtor definition for Parrot. 
// X64-LABEL: define weak dso_local noundef ptr @"??_EParrot@@UEAAPEAXI@Z"( // X64-SAME: ptr {{.*}} %[[THIS:.*]], i32 {{.*}} %[[IMPLICIT_PARAM:.*]]) unnamed_addr @@ -169,3 +213,6 @@ void bar() { // CHECK: dtor.call_delete: // X64-NEXT: call void @"??3HasOperatorDelete@@SAXPEAX@Z" // X86-NEXT: call void @"??3HasOperatorDelete@@SAXPAX@Z" + +// X64: define weak dso_local noundef ptr @"??_EAllocatedAsArray@@UEAAPEAXI@Z" +// X86: define weak dso_local x86_thiscallcc noundef ptr @"??_EAllocatedAsArray@@UAEPAXI@Z" From e4f76e3a3335dda608c661d76efacc70e607a5e4 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 4 Apr 2025 09:49:27 +0200 Subject: [PATCH 0617/1029] Reapply "[lldb] Return *const* UnwindPlan pointers from FuncUnwinders " (#134246) This reverts commit 094904303d50e0ab14bc5f2586a602f79af95953, reapplying d7afafdbc464e65c56a0a1d77bad426aa7538306 (#133247). The failure ought to be fixed by 0509932bb6a291ba11253f30c465ab3ad164ae08. --- lldb/include/lldb/Symbol/FuncUnwinders.h | 78 +++---- lldb/include/lldb/Symbol/UnwindPlan.h | 4 +- .../lldb/Target/RegisterContextUnwind.h | 16 +- lldb/source/Commands/CommandObjectTarget.cpp | 106 +++++---- lldb/source/Symbol/FuncUnwinders.cpp | 207 +++++++++--------- lldb/source/Symbol/UnwindPlan.cpp | 2 +- lldb/source/Target/RegisterContextUnwind.cpp | 84 ++++--- 7 files changed, 247 insertions(+), 250 deletions(-) diff --git a/lldb/include/lldb/Symbol/FuncUnwinders.h b/lldb/include/lldb/Symbol/FuncUnwinders.h index 1d4c28324e90f..479ccf87b6e2c 100644 --- a/lldb/include/lldb/Symbol/FuncUnwinders.h +++ b/lldb/include/lldb/Symbol/FuncUnwinders.h @@ -36,18 +36,19 @@ class FuncUnwinders { ~FuncUnwinders(); - lldb::UnwindPlanSP GetUnwindPlanAtCallSite(Target &target, Thread &thread); + std::shared_ptr GetUnwindPlanAtCallSite(Target &target, + Thread &thread); - lldb::UnwindPlanSP GetUnwindPlanAtNonCallSite(Target &target, - lldb_private::Thread &thread); + std::shared_ptr + GetUnwindPlanAtNonCallSite(Target &target, lldb_private::Thread &thread); - lldb::UnwindPlanSP GetUnwindPlanFastUnwind(Target &target, - lldb_private::Thread &thread); + std::shared_ptr + GetUnwindPlanFastUnwind(Target &target, lldb_private::Thread &thread); - lldb::UnwindPlanSP + std::shared_ptr GetUnwindPlanArchitectureDefault(lldb_private::Thread &thread); - lldb::UnwindPlanSP + std::shared_ptr GetUnwindPlanArchitectureDefaultAtFunctionEntry(lldb_private::Thread &thread); Address &GetFirstNonPrologueInsn(Target &target); @@ -77,32 +78,34 @@ class FuncUnwinders { // used. Instead, clients should ask for the *behavior* they are looking for, // using one of the above UnwindPlan retrieval methods. 
- lldb::UnwindPlanSP GetAssemblyUnwindPlan(Target &target, Thread &thread); + std::shared_ptr GetAssemblyUnwindPlan(Target &target, + Thread &thread); - lldb::UnwindPlanSP GetObjectFileUnwindPlan(Target &target); + std::shared_ptr GetObjectFileUnwindPlan(Target &target); - lldb::UnwindPlanSP GetObjectFileAugmentedUnwindPlan(Target &target, - Thread &thread); + std::shared_ptr + GetObjectFileAugmentedUnwindPlan(Target &target, Thread &thread); - lldb::UnwindPlanSP GetEHFrameUnwindPlan(Target &target); + std::shared_ptr GetEHFrameUnwindPlan(Target &target); - lldb::UnwindPlanSP GetEHFrameAugmentedUnwindPlan(Target &target, - Thread &thread); + std::shared_ptr + GetEHFrameAugmentedUnwindPlan(Target &target, Thread &thread); - lldb::UnwindPlanSP GetDebugFrameUnwindPlan(Target &target); + std::shared_ptr GetDebugFrameUnwindPlan(Target &target); - lldb::UnwindPlanSP GetDebugFrameAugmentedUnwindPlan(Target &target, - Thread &thread); + std::shared_ptr + GetDebugFrameAugmentedUnwindPlan(Target &target, Thread &thread); - lldb::UnwindPlanSP GetCompactUnwindUnwindPlan(Target &target); + std::shared_ptr GetCompactUnwindUnwindPlan(Target &target); - lldb::UnwindPlanSP GetArmUnwindUnwindPlan(Target &target); + std::shared_ptr GetArmUnwindUnwindPlan(Target &target); - lldb::UnwindPlanSP GetSymbolFileUnwindPlan(Thread &thread); + std::shared_ptr GetSymbolFileUnwindPlan(Thread &thread); - lldb::UnwindPlanSP GetArchDefaultUnwindPlan(Thread &thread); + std::shared_ptr GetArchDefaultUnwindPlan(Thread &thread); - lldb::UnwindPlanSP GetArchDefaultAtFuncEntryUnwindPlan(Thread &thread); + std::shared_ptr + GetArchDefaultAtFuncEntryUnwindPlan(Thread &thread); private: lldb::UnwindAssemblySP GetUnwindAssemblyProfiler(Target &target); @@ -113,7 +116,8 @@ class FuncUnwinders { // unwind rule for the pc, and LazyBoolCalculate if it was unable to // determine this for some reason. 
lldb_private::LazyBool CompareUnwindPlansForIdenticalInitialPCLocation( - Thread &thread, const lldb::UnwindPlanSP &a, const lldb::UnwindPlanSP &b); + Thread &thread, const std::shared_ptr &a, + const std::shared_ptr &b); UnwindTable &m_unwind_table; @@ -129,22 +133,22 @@ class FuncUnwinders { std::recursive_mutex m_mutex; - lldb::UnwindPlanSP m_unwind_plan_assembly_sp; - lldb::UnwindPlanSP m_unwind_plan_object_file_sp; - lldb::UnwindPlanSP m_unwind_plan_eh_frame_sp; - lldb::UnwindPlanSP m_unwind_plan_debug_frame_sp; + std::shared_ptr m_unwind_plan_assembly_sp; + std::shared_ptr m_unwind_plan_object_file_sp; + std::shared_ptr m_unwind_plan_eh_frame_sp; + std::shared_ptr m_unwind_plan_debug_frame_sp; // augmented by assembly inspection so it's valid everywhere - lldb::UnwindPlanSP m_unwind_plan_object_file_augmented_sp; - lldb::UnwindPlanSP m_unwind_plan_eh_frame_augmented_sp; - lldb::UnwindPlanSP m_unwind_plan_debug_frame_augmented_sp; - - std::vector m_unwind_plan_compact_unwind; - lldb::UnwindPlanSP m_unwind_plan_arm_unwind_sp; - lldb::UnwindPlanSP m_unwind_plan_symbol_file_sp; - lldb::UnwindPlanSP m_unwind_plan_fast_sp; - lldb::UnwindPlanSP m_unwind_plan_arch_default_sp; - lldb::UnwindPlanSP m_unwind_plan_arch_default_at_func_entry_sp; + std::shared_ptr m_unwind_plan_object_file_augmented_sp; + std::shared_ptr m_unwind_plan_eh_frame_augmented_sp; + std::shared_ptr m_unwind_plan_debug_frame_augmented_sp; + + std::vector> m_unwind_plan_compact_unwind; + std::shared_ptr m_unwind_plan_arm_unwind_sp; + std::shared_ptr m_unwind_plan_symbol_file_sp; + std::shared_ptr m_unwind_plan_fast_sp; + std::shared_ptr m_unwind_plan_arch_default_sp; + std::shared_ptr m_unwind_plan_arch_default_at_func_entry_sp; // Fetching the UnwindPlans can be expensive - if we've already attempted to // get one & failed, don't try again. diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h index 9adda27b8f928..6640a23a3e868 100644 --- a/lldb/include/lldb/Symbol/UnwindPlan.h +++ b/lldb/include/lldb/Symbol/UnwindPlan.h @@ -482,7 +482,7 @@ class UnwindPlan { m_return_addr_register = regnum; } - uint32_t GetReturnAddressRegister() { return m_return_addr_register; } + uint32_t GetReturnAddressRegister() const { return m_return_addr_register; } uint32_t GetInitialCFARegister() const { if (m_row_list.empty()) @@ -497,7 +497,7 @@ class UnwindPlan { m_plan_valid_ranges = std::move(ranges); } - bool PlanValidAtAddress(Address addr); + bool PlanValidAtAddress(Address addr) const; bool IsValidRowIndex(uint32_t idx) const; diff --git a/lldb/include/lldb/Target/RegisterContextUnwind.h b/lldb/include/lldb/Target/RegisterContextUnwind.h index c4ae29e657bfb..044a387fe5aa2 100644 --- a/lldb/include/lldb/Target/RegisterContextUnwind.h +++ b/lldb/include/lldb/Target/RegisterContextUnwind.h @@ -127,7 +127,8 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { /// Check if the given unwind plan indicates a signal trap handler, and /// update frame type and symbol context if so. 
- void PropagateTrapHandlerFlagFromUnwindPlan(lldb::UnwindPlanSP unwind_plan); + void PropagateTrapHandlerFlagFromUnwindPlan( + std::shared_ptr unwind_plan); // Provide a location for where THIS function saved the CALLER's register // value @@ -194,16 +195,17 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { const UnwindPlan::Row::FAValue &fa, lldb::addr_t &address); - lldb::UnwindPlanSP GetFastUnwindPlanForFrame(); + std::shared_ptr GetFastUnwindPlanForFrame(); - lldb::UnwindPlanSP GetFullUnwindPlanForFrame(); + std::shared_ptr GetFullUnwindPlanForFrame(); void UnwindLogMsg(const char *fmt, ...) __attribute__((format(printf, 2, 3))); void UnwindLogMsgVerbose(const char *fmt, ...) __attribute__((format(printf, 2, 3))); - bool IsUnwindPlanValidForCurrentPC(lldb::UnwindPlanSP unwind_plan_sp); + bool IsUnwindPlanValidForCurrentPC( + std::shared_ptr unwind_plan_sp); lldb::addr_t GetReturnAddressHint(int32_t plan_offset); @@ -215,9 +217,9 @@ class RegisterContextUnwind : public lldb_private::RegisterContext { // i.e. where THIS frame saved them /// - lldb::UnwindPlanSP m_fast_unwind_plan_sp; // may be NULL - lldb::UnwindPlanSP m_full_unwind_plan_sp; - lldb::UnwindPlanSP m_fallback_unwind_plan_sp; // may be NULL + std::shared_ptr m_fast_unwind_plan_sp; // may be NULL + std::shared_ptr m_full_unwind_plan_sp; + std::shared_ptr m_fallback_unwind_plan_sp; // may be NULL bool m_all_registers_available; // Can we retrieve all regs or just // nonvolatile regs? diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index c77bddb4af061..3f7d3007ed168 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -3641,77 +3641,70 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { result.GetOutputStream().Printf("\n"); - UnwindPlanSP non_callsite_unwind_plan = - func_unwinders_sp->GetUnwindPlanAtNonCallSite(*target, *thread); - if (non_callsite_unwind_plan) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetUnwindPlanAtNonCallSite(*target, *thread)) { result.GetOutputStream().Printf( "Asynchronous (not restricted to call-sites) UnwindPlan is '%s'\n", - non_callsite_unwind_plan->GetSourceName().AsCString()); + plan_sp->GetSourceName().AsCString()); } - UnwindPlanSP callsite_unwind_plan = - func_unwinders_sp->GetUnwindPlanAtCallSite(*target, *thread); - if (callsite_unwind_plan) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetUnwindPlanAtCallSite(*target, *thread)) { result.GetOutputStream().Printf( "Synchronous (restricted to call-sites) UnwindPlan is '%s'\n", - callsite_unwind_plan->GetSourceName().AsCString()); + plan_sp->GetSourceName().AsCString()); } - UnwindPlanSP fast_unwind_plan = - func_unwinders_sp->GetUnwindPlanFastUnwind(*target, *thread); - if (fast_unwind_plan) { - result.GetOutputStream().Printf( - "Fast UnwindPlan is '%s'\n", - fast_unwind_plan->GetSourceName().AsCString()); + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetUnwindPlanFastUnwind(*target, *thread)) { + result.GetOutputStream().Printf("Fast UnwindPlan is '%s'\n", + plan_sp->GetSourceName().AsCString()); } result.GetOutputStream().Printf("\n"); - UnwindPlanSP assembly_sp = - func_unwinders_sp->GetAssemblyUnwindPlan(*target, *thread); - if (assembly_sp) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetAssemblyUnwindPlan(*target, *thread)) { result.GetOutputStream().Printf( "Assembly language inspection UnwindPlan:\n"); - 
assembly_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - UnwindPlanSP of_unwind_sp = - func_unwinders_sp->GetObjectFileUnwindPlan(*target); - if (of_unwind_sp) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetObjectFileUnwindPlan(*target)) { result.GetOutputStream().Printf("object file UnwindPlan:\n"); - of_unwind_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - UnwindPlanSP of_unwind_augmented_sp = - func_unwinders_sp->GetObjectFileAugmentedUnwindPlan(*target, *thread); - if (of_unwind_augmented_sp) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetObjectFileAugmentedUnwindPlan(*target, + *thread)) { result.GetOutputStream().Printf("object file augmented UnwindPlan:\n"); - of_unwind_augmented_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - UnwindPlanSP ehframe_sp = - func_unwinders_sp->GetEHFrameUnwindPlan(*target); - if (ehframe_sp) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetEHFrameUnwindPlan(*target)) { result.GetOutputStream().Printf("eh_frame UnwindPlan:\n"); - ehframe_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - UnwindPlanSP ehframe_augmented_sp = - func_unwinders_sp->GetEHFrameAugmentedUnwindPlan(*target, *thread); - if (ehframe_augmented_sp) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetEHFrameAugmentedUnwindPlan(*target, + *thread)) { result.GetOutputStream().Printf("eh_frame augmented UnwindPlan:\n"); - ehframe_augmented_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (UnwindPlanSP plan_sp = + if (std::shared_ptr plan_sp = func_unwinders_sp->GetDebugFrameUnwindPlan(*target)) { result.GetOutputStream().Printf("debug_frame UnwindPlan:\n"); plan_sp->Dump(result.GetOutputStream(), thread.get(), @@ -3719,7 +3712,7 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { result.GetOutputStream().Printf("\n"); } - if (UnwindPlanSP plan_sp = + if (std::shared_ptr plan_sp = func_unwinders_sp->GetDebugFrameAugmentedUnwindPlan(*target, *thread)) { result.GetOutputStream().Printf("debug_frame augmented UnwindPlan:\n"); @@ -3728,36 +3721,35 @@ class CommandObjectTargetModulesShowUnwind : public CommandObjectParsed { result.GetOutputStream().Printf("\n"); } - UnwindPlanSP arm_unwind_sp = - func_unwinders_sp->GetArmUnwindUnwindPlan(*target); - if (arm_unwind_sp) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetArmUnwindUnwindPlan(*target)) { result.GetOutputStream().Printf("ARM.exidx unwind UnwindPlan:\n"); - arm_unwind_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (UnwindPlanSP symfile_plan_sp = + if (std::shared_ptr plan_sp = func_unwinders_sp->GetSymbolFileUnwindPlan(*thread)) { result.GetOutputStream().Printf("Symbol file 
UnwindPlan:\n"); - symfile_plan_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - UnwindPlanSP compact_unwind_sp = - func_unwinders_sp->GetCompactUnwindUnwindPlan(*target); - if (compact_unwind_sp) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetCompactUnwindUnwindPlan(*target)) { result.GetOutputStream().Printf("Compact unwind UnwindPlan:\n"); - compact_unwind_sp->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } - if (fast_unwind_plan) { + if (std::shared_ptr plan_sp = + func_unwinders_sp->GetUnwindPlanFastUnwind(*target, *thread)) { result.GetOutputStream().Printf("Fast UnwindPlan:\n"); - fast_unwind_plan->Dump(result.GetOutputStream(), thread.get(), - LLDB_INVALID_ADDRESS); + plan_sp->Dump(result.GetOutputStream(), thread.get(), + LLDB_INVALID_ADDRESS); result.GetOutputStream().Printf("\n"); } diff --git a/lldb/source/Symbol/FuncUnwinders.cpp b/lldb/source/Symbol/FuncUnwinders.cpp index a5ca7b094c949..a74029d8343c7 100644 --- a/lldb/source/Symbol/FuncUnwinders.cpp +++ b/lldb/source/Symbol/FuncUnwinders.cpp @@ -71,40 +71,47 @@ FuncUnwinders::FuncUnwinders(UnwindTable &unwind_table, Address addr, FuncUnwinders::~FuncUnwinders() = default; -UnwindPlanSP FuncUnwinders::GetUnwindPlanAtCallSite(Target &target, - Thread &thread) { +std::shared_ptr +FuncUnwinders::GetUnwindPlanAtCallSite(Target &target, Thread &thread) { std::lock_guard guard(m_mutex); - if (UnwindPlanSP plan_sp = GetObjectFileUnwindPlan(target)) + if (std::shared_ptr plan_sp = + GetObjectFileUnwindPlan(target)) return plan_sp; - if (UnwindPlanSP plan_sp = GetSymbolFileUnwindPlan(thread)) + if (std::shared_ptr plan_sp = + GetSymbolFileUnwindPlan(thread)) return plan_sp; - if (UnwindPlanSP plan_sp = GetDebugFrameUnwindPlan(target)) + if (std::shared_ptr plan_sp = + GetDebugFrameUnwindPlan(target)) return plan_sp; - if (UnwindPlanSP plan_sp = GetEHFrameUnwindPlan(target)) + if (std::shared_ptr plan_sp = GetEHFrameUnwindPlan(target)) return plan_sp; - if (UnwindPlanSP plan_sp = GetCompactUnwindUnwindPlan(target)) + if (std::shared_ptr plan_sp = + GetCompactUnwindUnwindPlan(target)) return plan_sp; - if (UnwindPlanSP plan_sp = GetArmUnwindUnwindPlan(target)) + if (std::shared_ptr plan_sp = + GetArmUnwindUnwindPlan(target)) return plan_sp; return nullptr; } -UnwindPlanSP FuncUnwinders::GetCompactUnwindUnwindPlan(Target &target) { +std::shared_ptr +FuncUnwinders::GetCompactUnwindUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_compact_unwind.size() > 0) return m_unwind_plan_compact_unwind[0]; // FIXME support multiple compact // unwind plans for one func if (m_tried_unwind_plan_compact_unwind) - return UnwindPlanSP(); + return nullptr; m_tried_unwind_plan_compact_unwind = true; if (m_range.GetBaseAddress().IsValid()) { Address current_pc(m_range.GetBaseAddress()); CompactUnwindInfo *compact_unwind = m_unwind_table.GetCompactUnwindInfo(); if (compact_unwind) { - UnwindPlanSP unwind_plan_sp(new UnwindPlan(lldb::eRegisterKindGeneric)); + auto unwind_plan_sp = + std::make_shared(lldb::eRegisterKindGeneric); if (compact_unwind->GetUnwindPlan(target, current_pc, *unwind_plan_sp)) { m_unwind_plan_compact_unwind.push_back(unwind_plan_sp); return m_unwind_plan_compact_unwind[0]; // FIXME support multiple @@ -113,10 
+120,11 @@ UnwindPlanSP FuncUnwinders::GetCompactUnwindUnwindPlan(Target &target) { } } } - return UnwindPlanSP(); + return nullptr; } -lldb::UnwindPlanSP FuncUnwinders::GetObjectFileUnwindPlan(Target &target) { +std::shared_ptr +FuncUnwinders::GetObjectFileUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_object_file_sp.get() || m_tried_unwind_plan_object_file) @@ -126,17 +134,16 @@ lldb::UnwindPlanSP FuncUnwinders::GetObjectFileUnwindPlan(Target &target) { if (m_range.GetBaseAddress().IsValid()) { CallFrameInfo *object_file_frame = m_unwind_table.GetObjectFileUnwindInfo(); if (object_file_frame) { - m_unwind_plan_object_file_sp = - std::make_shared(lldb::eRegisterKindGeneric); - if (!object_file_frame->GetUnwindPlan(m_range, - *m_unwind_plan_object_file_sp)) - m_unwind_plan_object_file_sp.reset(); + auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + if (object_file_frame->GetUnwindPlan(m_range, *plan_sp)) + m_unwind_plan_object_file_sp = std::move(plan_sp); } } return m_unwind_plan_object_file_sp; } -UnwindPlanSP FuncUnwinders::GetEHFrameUnwindPlan(Target &target) { +std::shared_ptr +FuncUnwinders::GetEHFrameUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_eh_frame_sp.get() || m_tried_unwind_plan_eh_frame) return m_unwind_plan_eh_frame_sp; @@ -145,16 +152,16 @@ UnwindPlanSP FuncUnwinders::GetEHFrameUnwindPlan(Target &target) { if (m_range.GetBaseAddress().IsValid()) { DWARFCallFrameInfo *eh_frame = m_unwind_table.GetEHFrameInfo(); if (eh_frame) { - m_unwind_plan_eh_frame_sp = - std::make_shared(lldb::eRegisterKindGeneric); - if (!eh_frame->GetUnwindPlan(m_range, *m_unwind_plan_eh_frame_sp)) - m_unwind_plan_eh_frame_sp.reset(); + auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + if (eh_frame->GetUnwindPlan(m_range, *plan_sp)) + m_unwind_plan_eh_frame_sp = std::move(plan_sp); } } return m_unwind_plan_eh_frame_sp; } -UnwindPlanSP FuncUnwinders::GetDebugFrameUnwindPlan(Target &target) { +std::shared_ptr +FuncUnwinders::GetDebugFrameUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_debug_frame_sp || m_tried_unwind_plan_debug_frame) return m_unwind_plan_debug_frame_sp; @@ -163,16 +170,16 @@ UnwindPlanSP FuncUnwinders::GetDebugFrameUnwindPlan(Target &target) { if (m_range.GetBaseAddress().IsValid()) { DWARFCallFrameInfo *debug_frame = m_unwind_table.GetDebugFrameInfo(); if (debug_frame) { - m_unwind_plan_debug_frame_sp = - std::make_shared(lldb::eRegisterKindGeneric); - if (!debug_frame->GetUnwindPlan(m_range, *m_unwind_plan_debug_frame_sp)) - m_unwind_plan_debug_frame_sp.reset(); + auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + if (debug_frame->GetUnwindPlan(m_range, *plan_sp)) + m_unwind_plan_debug_frame_sp = std::move(plan_sp); } } return m_unwind_plan_debug_frame_sp; } -UnwindPlanSP FuncUnwinders::GetArmUnwindUnwindPlan(Target &target) { +std::shared_ptr +FuncUnwinders::GetArmUnwindUnwindPlan(Target &target) { std::lock_guard guard(m_mutex); if (m_unwind_plan_arm_unwind_sp.get() || m_tried_unwind_plan_arm_unwind) return m_unwind_plan_arm_unwind_sp; @@ -182,11 +189,9 @@ UnwindPlanSP FuncUnwinders::GetArmUnwindUnwindPlan(Target &target) { Address current_pc(m_range.GetBaseAddress()); ArmUnwindInfo *arm_unwind_info = m_unwind_table.GetArmUnwindInfo(); if (arm_unwind_info) { - m_unwind_plan_arm_unwind_sp = - std::make_shared(lldb::eRegisterKindGeneric); - if (!arm_unwind_info->GetUnwindPlan(target, current_pc, - *m_unwind_plan_arm_unwind_sp)) - 
m_unwind_plan_arm_unwind_sp.reset(); + auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + if (arm_unwind_info->GetUnwindPlan(target, current_pc, *plan_sp)) + m_unwind_plan_arm_unwind_sp = std::move(plan_sp); } } return m_unwind_plan_arm_unwind_sp; @@ -210,7 +215,8 @@ class RegisterContextToInfo: public SymbolFile::RegisterInfoResolver { }; } // namespace -UnwindPlanSP FuncUnwinders::GetSymbolFileUnwindPlan(Thread &thread) { +std::shared_ptr +FuncUnwinders::GetSymbolFileUnwindPlan(Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_symbol_file_sp.get() || m_tried_unwind_plan_symbol_file) return m_unwind_plan_symbol_file_sp; @@ -224,9 +230,9 @@ UnwindPlanSP FuncUnwinders::GetSymbolFileUnwindPlan(Thread &thread) { return m_unwind_plan_symbol_file_sp; } -UnwindPlanSP +std::shared_ptr FuncUnwinders::GetObjectFileAugmentedUnwindPlan(Target &target, - Thread &thread) { + Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_object_file_augmented_sp.get() || m_tried_unwind_plan_object_file_augmented) @@ -234,30 +240,27 @@ FuncUnwinders::GetObjectFileAugmentedUnwindPlan(Target &target, m_tried_unwind_plan_object_file_augmented = true; - UnwindPlanSP object_file_unwind_plan = GetObjectFileUnwindPlan(target); + std::shared_ptr object_file_unwind_plan = + GetObjectFileUnwindPlan(target); if (!object_file_unwind_plan) return m_unwind_plan_object_file_augmented_sp; - m_unwind_plan_object_file_augmented_sp = - std::make_shared(*object_file_unwind_plan); - // Augment the instructions with epilogue descriptions if necessary // so the UnwindPlan can be used at any instruction in the function. UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - if (!assembly_profiler_sp->AugmentUnwindPlanFromCallSite( - m_range, thread, *m_unwind_plan_object_file_augmented_sp)) { - m_unwind_plan_object_file_augmented_sp.reset(); - } - } else { - m_unwind_plan_object_file_augmented_sp.reset(); + auto plan_sp = std::make_shared(*object_file_unwind_plan); + + if (assembly_profiler_sp->AugmentUnwindPlanFromCallSite(m_range, thread, + *plan_sp)) + m_unwind_plan_object_file_augmented_sp = std::move(plan_sp); } return m_unwind_plan_object_file_augmented_sp; } -UnwindPlanSP FuncUnwinders::GetEHFrameAugmentedUnwindPlan(Target &target, - Thread &thread) { +std::shared_ptr +FuncUnwinders::GetEHFrameAugmentedUnwindPlan(Target &target, Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_eh_frame_augmented_sp.get() || m_tried_unwind_plan_eh_frame_augmented) @@ -275,30 +278,27 @@ UnwindPlanSP FuncUnwinders::GetEHFrameAugmentedUnwindPlan(Target &target, m_tried_unwind_plan_eh_frame_augmented = true; - UnwindPlanSP eh_frame_plan = GetEHFrameUnwindPlan(target); + std::shared_ptr eh_frame_plan = + GetEHFrameUnwindPlan(target); if (!eh_frame_plan) return m_unwind_plan_eh_frame_augmented_sp; - m_unwind_plan_eh_frame_augmented_sp = - std::make_shared(*eh_frame_plan); - // Augment the eh_frame instructions with epilogue descriptions if necessary // so the UnwindPlan can be used at any instruction in the function. 
UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - if (!assembly_profiler_sp->AugmentUnwindPlanFromCallSite( - m_range, thread, *m_unwind_plan_eh_frame_augmented_sp)) { - m_unwind_plan_eh_frame_augmented_sp.reset(); - } - } else { - m_unwind_plan_eh_frame_augmented_sp.reset(); + auto plan_sp = std::make_shared(*eh_frame_plan); + if (assembly_profiler_sp->AugmentUnwindPlanFromCallSite(m_range, thread, + *plan_sp)) + m_unwind_plan_eh_frame_augmented_sp = std::move(plan_sp); } return m_unwind_plan_eh_frame_augmented_sp; } -UnwindPlanSP FuncUnwinders::GetDebugFrameAugmentedUnwindPlan(Target &target, - Thread &thread) { +std::shared_ptr +FuncUnwinders::GetDebugFrameAugmentedUnwindPlan(Target &target, + Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_debug_frame_augmented_sp.get() || m_tried_unwind_plan_debug_frame_augmented) @@ -316,30 +316,28 @@ UnwindPlanSP FuncUnwinders::GetDebugFrameAugmentedUnwindPlan(Target &target, m_tried_unwind_plan_debug_frame_augmented = true; - UnwindPlanSP debug_frame_plan = GetDebugFrameUnwindPlan(target); + std::shared_ptr debug_frame_plan = + GetDebugFrameUnwindPlan(target); if (!debug_frame_plan) return m_unwind_plan_debug_frame_augmented_sp; - m_unwind_plan_debug_frame_augmented_sp = - std::make_shared(*debug_frame_plan); - // Augment the debug_frame instructions with epilogue descriptions if // necessary so the UnwindPlan can be used at any instruction in the // function. UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - if (!assembly_profiler_sp->AugmentUnwindPlanFromCallSite( - m_range, thread, *m_unwind_plan_debug_frame_augmented_sp)) { - m_unwind_plan_debug_frame_augmented_sp.reset(); - } - } else - m_unwind_plan_debug_frame_augmented_sp.reset(); + auto plan_sp = std::make_shared(*debug_frame_plan); + + if (assembly_profiler_sp->AugmentUnwindPlanFromCallSite(m_range, thread, + *plan_sp)) + m_unwind_plan_debug_frame_augmented_sp = std::move(plan_sp); + } return m_unwind_plan_debug_frame_augmented_sp; } -UnwindPlanSP FuncUnwinders::GetAssemblyUnwindPlan(Target &target, - Thread &thread) { +std::shared_ptr +FuncUnwinders::GetAssemblyUnwindPlan(Target &target, Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_assembly_sp.get() || m_tried_unwind_plan_assembly || !m_unwind_table.GetAllowAssemblyEmulationUnwindPlans()) { @@ -360,12 +358,10 @@ UnwindPlanSP FuncUnwinders::GetAssemblyUnwindPlan(Target &target, UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - m_unwind_plan_assembly_sp = - std::make_shared(lldb::eRegisterKindGeneric); - if (!assembly_profiler_sp->GetNonCallSiteUnwindPlanFromAssembly( - range, thread, *m_unwind_plan_assembly_sp)) { - m_unwind_plan_assembly_sp.reset(); - } + auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + if (assembly_profiler_sp->GetNonCallSiteUnwindPlanFromAssembly( + range, thread, *plan_sp)) + m_unwind_plan_assembly_sp = std::move(plan_sp); } return m_unwind_plan_assembly_sp; } @@ -374,7 +370,8 @@ UnwindPlanSP FuncUnwinders::GetAssemblyUnwindPlan(Target &target, // If they have the same way of getting the pc value (e.g. "CFA - 8" + "CFA is // sp"), then it will return LazyBoolTrue. 
LazyBool FuncUnwinders::CompareUnwindPlansForIdenticalInitialPCLocation( - Thread &thread, const UnwindPlanSP &a, const UnwindPlanSP &b) { + Thread &thread, const std::shared_ptr &a, + const std::shared_ptr &b) { LazyBool plans_are_identical = eLazyBoolCalculate; RegisterNumber pc_reg(thread, eRegisterKindGeneric, LLDB_REGNUM_GENERIC_PC); @@ -404,17 +401,19 @@ LazyBool FuncUnwinders::CompareUnwindPlansForIdenticalInitialPCLocation( return plans_are_identical; } -UnwindPlanSP FuncUnwinders::GetUnwindPlanAtNonCallSite(Target &target, - Thread &thread) { - UnwindPlanSP eh_frame_sp = GetEHFrameUnwindPlan(target); +std::shared_ptr +FuncUnwinders::GetUnwindPlanAtNonCallSite(Target &target, Thread &thread) { + std::shared_ptr eh_frame_sp = GetEHFrameUnwindPlan(target); if (!eh_frame_sp) eh_frame_sp = GetDebugFrameUnwindPlan(target); if (!eh_frame_sp) eh_frame_sp = GetObjectFileUnwindPlan(target); - UnwindPlanSP arch_default_at_entry_sp = + std::shared_ptr arch_default_at_entry_sp = GetUnwindPlanArchitectureDefaultAtFunctionEntry(thread); - UnwindPlanSP arch_default_sp = GetUnwindPlanArchitectureDefault(thread); - UnwindPlanSP assembly_sp = GetAssemblyUnwindPlan(target, thread); + std::shared_ptr arch_default_sp = + GetUnwindPlanArchitectureDefault(thread); + std::shared_ptr assembly_sp = + GetAssemblyUnwindPlan(target, thread); // This point of this code is to detect when a function is using a non- // standard ABI, and the eh_frame correctly describes that alternate ABI. @@ -443,20 +442,24 @@ UnwindPlanSP FuncUnwinders::GetUnwindPlanAtNonCallSite(Target &target, return eh_frame_sp; } - if (UnwindPlanSP plan_sp = GetSymbolFileUnwindPlan(thread)) + if (std::shared_ptr plan_sp = + GetSymbolFileUnwindPlan(thread)) return plan_sp; - if (UnwindPlanSP plan_sp = GetDebugFrameAugmentedUnwindPlan(target, thread)) + if (std::shared_ptr plan_sp = + GetDebugFrameAugmentedUnwindPlan(target, thread)) return plan_sp; - if (UnwindPlanSP plan_sp = GetEHFrameAugmentedUnwindPlan(target, thread)) + if (std::shared_ptr plan_sp = + GetEHFrameAugmentedUnwindPlan(target, thread)) return plan_sp; - if (UnwindPlanSP plan_sp = GetObjectFileAugmentedUnwindPlan(target, thread)) + if (std::shared_ptr plan_sp = + GetObjectFileAugmentedUnwindPlan(target, thread)) return plan_sp; return assembly_sp; } -UnwindPlanSP FuncUnwinders::GetUnwindPlanFastUnwind(Target &target, - Thread &thread) { +std::shared_ptr +FuncUnwinders::GetUnwindPlanFastUnwind(Target &target, Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_fast_sp.get() || m_tried_unwind_fast) return m_unwind_plan_fast_sp; @@ -465,17 +468,15 @@ UnwindPlanSP FuncUnwinders::GetUnwindPlanFastUnwind(Target &target, UnwindAssemblySP assembly_profiler_sp(GetUnwindAssemblyProfiler(target)); if (assembly_profiler_sp) { - m_unwind_plan_fast_sp = - std::make_shared(lldb::eRegisterKindGeneric); - if (!assembly_profiler_sp->GetFastUnwindPlan(m_range, thread, - *m_unwind_plan_fast_sp)) { - m_unwind_plan_fast_sp.reset(); - } + auto plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + if (assembly_profiler_sp->GetFastUnwindPlan(m_range, thread, *plan_sp)) + m_unwind_plan_fast_sp = std::move(plan_sp); } return m_unwind_plan_fast_sp; } -UnwindPlanSP FuncUnwinders::GetUnwindPlanArchitectureDefault(Thread &thread) { +std::shared_ptr +FuncUnwinders::GetUnwindPlanArchitectureDefault(Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_arch_default_sp.get() || m_tried_unwind_arch_default) return m_unwind_plan_arch_default_sp; @@ -491,7 +492,7 @@ UnwindPlanSP 
FuncUnwinders::GetUnwindPlanArchitectureDefault(Thread &thread) { return m_unwind_plan_arch_default_sp; } -UnwindPlanSP +std::shared_ptr FuncUnwinders::GetUnwindPlanArchitectureDefaultAtFunctionEntry(Thread &thread) { std::lock_guard guard(m_mutex); if (m_unwind_plan_arch_default_at_func_entry_sp.get() || @@ -540,7 +541,8 @@ FuncUnwinders::GetUnwindAssemblyProfiler(Target &target) { Address FuncUnwinders::GetLSDAAddress(Target &target) { Address lsda_addr; - UnwindPlanSP unwind_plan_sp = GetEHFrameUnwindPlan(target); + std::shared_ptr unwind_plan_sp = + GetEHFrameUnwindPlan(target); if (unwind_plan_sp.get() == nullptr) { unwind_plan_sp = GetCompactUnwindUnwindPlan(target); } @@ -556,7 +558,8 @@ Address FuncUnwinders::GetLSDAAddress(Target &target) { Address FuncUnwinders::GetPersonalityRoutinePtrAddress(Target &target) { Address personality_addr; - UnwindPlanSP unwind_plan_sp = GetEHFrameUnwindPlan(target); + std::shared_ptr unwind_plan_sp = + GetEHFrameUnwindPlan(target); if (unwind_plan_sp.get() == nullptr) { unwind_plan_sp = GetCompactUnwindUnwindPlan(target); } diff --git a/lldb/source/Symbol/UnwindPlan.cpp b/lldb/source/Symbol/UnwindPlan.cpp index f2846eb927bf8..cfa8eefaa55bb 100644 --- a/lldb/source/Symbol/UnwindPlan.cpp +++ b/lldb/source/Symbol/UnwindPlan.cpp @@ -451,7 +451,7 @@ const UnwindPlan::Row *UnwindPlan::GetLastRow() const { return m_row_list.back().get(); } -bool UnwindPlan::PlanValidAtAddress(Address addr) { +bool UnwindPlan::PlanValidAtAddress(Address addr) const { // If this UnwindPlan has no rows, it is an invalid UnwindPlan. if (GetRowCount() == 0) { Log *log = GetLog(LLDBLog::Unwind); diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index 4f8b8a281a020..3ed49e12476dd 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -84,7 +84,7 @@ RegisterContextUnwind::RegisterContextUnwind(Thread &thread, } bool RegisterContextUnwind::IsUnwindPlanValidForCurrentPC( - lldb::UnwindPlanSP unwind_plan_sp) { + std::shared_ptr unwind_plan_sp) { if (!unwind_plan_sp) return false; @@ -141,8 +141,9 @@ void RegisterContextUnwind::InitializeZerothFrame() { if (ABISP abi_sp = process->GetABI()) current_pc = abi_sp->FixCodeAddress(current_pc); - UnwindPlanSP lang_runtime_plan_sp = LanguageRuntime::GetRuntimeUnwindPlan( - m_thread, this, m_behaves_like_zeroth_frame); + std::shared_ptr lang_runtime_plan_sp = + LanguageRuntime::GetRuntimeUnwindPlan(m_thread, this, + m_behaves_like_zeroth_frame); if (lang_runtime_plan_sp.get()) { UnwindLogMsg("This is an async frame"); } @@ -265,7 +266,7 @@ void RegisterContextUnwind::InitializeZerothFrame() { // Try the fall back unwind plan since the // full unwind plan failed. FuncUnwindersSP func_unwinders_sp; - UnwindPlanSP call_site_unwind_plan; + std::shared_ptr call_site_unwind_plan; bool cfa_status = false; if (m_sym_ctx_valid) { @@ -340,8 +341,9 @@ void RegisterContextUnwind::InitializeNonZerothFrame() { // A LanguageRuntime may provide an UnwindPlan that is used in this // stack trace based on the RegisterContext contents, instead // of the normal UnwindPlans we would use for the return-pc.
- UnwindPlanSP lang_runtime_plan_sp = LanguageRuntime::GetRuntimeUnwindPlan( - m_thread, this, m_behaves_like_zeroth_frame); + std::shared_ptr lang_runtime_plan_sp = + LanguageRuntime::GetRuntimeUnwindPlan(m_thread, this, + m_behaves_like_zeroth_frame); if (lang_runtime_plan_sp.get()) { UnwindLogMsg("This is an async frame"); } @@ -749,39 +751,37 @@ bool RegisterContextUnwind::BehavesLikeZerothFrame() const { // 4. m_current_offset_backed_up_one should have the current byte offset into // the function, maybe backed up by 1, std::nullopt if unknown -UnwindPlanSP RegisterContextUnwind::GetFastUnwindPlanForFrame() { - UnwindPlanSP unwind_plan_sp; +std::shared_ptr +RegisterContextUnwind::GetFastUnwindPlanForFrame() { ModuleSP pc_module_sp(m_current_pc.GetModule()); if (!m_current_pc.IsValid() || !pc_module_sp || pc_module_sp->GetObjectFile() == nullptr) - return unwind_plan_sp; + return nullptr; if (IsFrameZero()) - return unwind_plan_sp; + return nullptr; FuncUnwindersSP func_unwinders_sp( pc_module_sp->GetUnwindTable().GetFuncUnwindersContainingAddress( m_current_pc, m_sym_ctx)); if (!func_unwinders_sp) - return unwind_plan_sp; + return nullptr; // If we're in _sigtramp(), unwinding past this frame requires special // knowledge. if (m_frame_type == eTrapHandlerFrame || m_frame_type == eDebuggerFrame) - return unwind_plan_sp; + return nullptr; - unwind_plan_sp = func_unwinders_sp->GetUnwindPlanFastUnwind( - *m_thread.CalculateTarget(), m_thread); - if (unwind_plan_sp) { + if (std::shared_ptr unwind_plan_sp = + func_unwinders_sp->GetUnwindPlanFastUnwind( + *m_thread.CalculateTarget(), m_thread)) { if (unwind_plan_sp->PlanValidAtAddress(m_current_pc)) { m_frame_type = eNormalFrame; return unwind_plan_sp; - } else { - unwind_plan_sp.reset(); } } - return unwind_plan_sp; + return nullptr; } // On entry to this method, @@ -793,9 +793,9 @@ UnwindPlanSP RegisterContextUnwind::GetFastUnwindPlanForFrame() { // 4. m_current_offset_backed_up_one should have the current byte offset into // the function, maybe backed up by 1, std::nullopt if unknown -UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { - UnwindPlanSP unwind_plan_sp; - UnwindPlanSP arch_default_unwind_plan_sp; +std::shared_ptr +RegisterContextUnwind::GetFullUnwindPlanForFrame() { + std::shared_ptr arch_default_unwind_plan_sp; ExecutionContext exe_ctx(m_thread.shared_from_this()); Process *process = exe_ctx.GetProcessPtr(); ABI *abi = process ? 
process->GetABI().get() : nullptr; @@ -833,9 +833,8 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { process->GetLoadAddressPermissions(current_pc_addr, permissions) && (permissions & ePermissionsExecutable) == 0)) { if (abi) { - unwind_plan_sp = abi->CreateFunctionEntryUnwindPlan(); m_frame_type = eNormalFrame; - return unwind_plan_sp; + return abi->CreateFunctionEntryUnwindPlan(); } } } @@ -872,32 +871,29 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { DWARFCallFrameInfo *eh_frame = pc_module_sp->GetUnwindTable().GetEHFrameInfo(); if (eh_frame) { - unwind_plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + auto unwind_plan_sp = + std::make_shared(lldb::eRegisterKindGeneric); if (eh_frame->GetUnwindPlan(m_current_pc, *unwind_plan_sp)) return unwind_plan_sp; - else - unwind_plan_sp.reset(); } ArmUnwindInfo *arm_exidx = pc_module_sp->GetUnwindTable().GetArmUnwindInfo(); if (arm_exidx) { - unwind_plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + auto unwind_plan_sp = + std::make_shared(lldb::eRegisterKindGeneric); if (arm_exidx->GetUnwindPlan(exe_ctx.GetTargetRef(), m_current_pc, *unwind_plan_sp)) return unwind_plan_sp; - else - unwind_plan_sp.reset(); } CallFrameInfo *object_file_unwind = pc_module_sp->GetUnwindTable().GetObjectFileUnwindInfo(); if (object_file_unwind) { - unwind_plan_sp = std::make_shared(lldb::eRegisterKindGeneric); + auto unwind_plan_sp = + std::make_shared(lldb::eRegisterKindGeneric); if (object_file_unwind->GetUnwindPlan(m_current_pc, *unwind_plan_sp)) return unwind_plan_sp; - else - unwind_plan_sp.reset(); } return arch_default_unwind_plan_sp; @@ -911,15 +907,13 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { // substitute plan. Otherwise, use eh_frame. 
if (m_sym_ctx_valid) { lldb::PlatformSP platform = process->GetTarget().GetPlatform(); - unwind_plan_sp = platform->GetTrapHandlerUnwindPlan( - process->GetTarget().GetArchitecture().GetTriple(), - GetSymbolOrFunctionName(m_sym_ctx)); - - if (unwind_plan_sp) + if (auto unwind_plan_sp = platform->GetTrapHandlerUnwindPlan( + process->GetTarget().GetArchitecture().GetTriple(), + GetSymbolOrFunctionName(m_sym_ctx))) return unwind_plan_sp; } - unwind_plan_sp = + auto unwind_plan_sp = func_unwinders_sp->GetEHFrameUnwindPlan(process->GetTarget()); if (!unwind_plan_sp) unwind_plan_sp = @@ -944,7 +938,7 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { // normally we would call GetUnwindPlanAtCallSite() -- because CallSite may // return an unwind plan sourced from either eh_frame (that's what we // intend) or compact unwind (this won't work) - unwind_plan_sp = + auto unwind_plan_sp = func_unwinders_sp->GetEHFrameUnwindPlan(process->GetTarget()); if (!unwind_plan_sp) unwind_plan_sp = @@ -960,7 +954,7 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { // Typically the NonCallSite UnwindPlan is the unwind created by inspecting // the assembly language instructions if (m_behaves_like_zeroth_frame && process) { - unwind_plan_sp = func_unwinders_sp->GetUnwindPlanAtNonCallSite( + auto unwind_plan_sp = func_unwinders_sp->GetUnwindPlanAtNonCallSite( process->GetTarget(), m_thread); if (unwind_plan_sp && unwind_plan_sp->PlanValidAtAddress(m_current_pc)) { if (unwind_plan_sp->GetSourcedFromCompiler() == eLazyBoolNo) { @@ -975,7 +969,7 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { // assembly code it is often written in a way that it valid at all // location what helps in the most common cases when the instruction // emulation fails. - UnwindPlanSP call_site_unwind_plan = + std::shared_ptr call_site_unwind_plan = func_unwinders_sp->GetUnwindPlanAtCallSite(process->GetTarget(), m_thread); if (call_site_unwind_plan && @@ -1010,6 +1004,7 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { } } + std::shared_ptr unwind_plan_sp; // Typically this is unwind info from an eh_frame section intended for // exception handling; only valid at call sites if (process) { @@ -1042,7 +1037,7 @@ UnwindPlanSP RegisterContextUnwind::GetFullUnwindPlanForFrame() { // sites then the architecture default plan and for hand written assembly // code it is often written in a way that it valid at all location what // helps in the most common cases when the instruction emulation fails. - UnwindPlanSP call_site_unwind_plan = + std::shared_ptr call_site_unwind_plan = func_unwinders_sp->GetUnwindPlanAtCallSite(process->GetTarget(), m_thread); if (call_site_unwind_plan && @@ -1786,7 +1781,8 @@ bool RegisterContextUnwind::TryFallbackUnwindPlan() { // Switch the full UnwindPlan to be the fallback UnwindPlan. If we decide // this isn't working, we need to restore. We'll also need to save & restore // the value of the m_cfa ivar. Save is down below a bit in 'old_cfa'. 
- UnwindPlanSP original_full_unwind_plan_sp = m_full_unwind_plan_sp; + std::shared_ptr original_full_unwind_plan_sp = + m_full_unwind_plan_sp; addr_t old_cfa = m_cfa; addr_t old_afa = m_afa; @@ -1915,7 +1911,7 @@ bool RegisterContextUnwind::ForceSwitchToFallbackUnwindPlan() { } void RegisterContextUnwind::PropagateTrapHandlerFlagFromUnwindPlan( - lldb::UnwindPlanSP unwind_plan) { + std::shared_ptr unwind_plan) { if (unwind_plan->GetUnwindPlanForSignalTrap() != eLazyBoolYes) { // Unwind plan does not indicate trap handler. Do nothing. We may // already be flagged as trap handler flag due to the symbol being From f528a80d3ed70181483e3ce10aa3a7a4dd812817 Mon Sep 17 00:00:00 2001 From: Vladi Krapp Date: Fri, 4 Apr 2025 08:55:35 +0100 Subject: [PATCH 0618/1029] [Arm] Add more -mtp=cp15 tests (#134098) This patch systematically covers all -mtp=cp15 behaviour options for better code coverage. --- clang/test/Driver/arm-thread-pointer.c | 186 ++++++++++++------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/clang/test/Driver/arm-thread-pointer.c b/clang/test/Driver/arm-thread-pointer.c index 37d1a3e4d7e89..9433cbfa091ff 100644 --- a/clang/test/Driver/arm-thread-pointer.c +++ b/clang/test/Driver/arm-thread-pointer.c @@ -1,93 +1,93 @@ -// Test of the AArch32 values of -mtp=, checking that each one maps to -// the right target features. - -// RUN: %clang --target=armv7-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER-HARD %s -// ARMv7_THREAD_POINTER-HARD: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv7-linux -mtp=tpidruro -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER-HARD %s -// RUN: %clang --target=armv7-linux -mtp=tpidrurw -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER-TPIDRURW %s -// ARMv7_THREAD_POINTER-TPIDRURW: "-target-feature" "+read-tp-tpidrurw" -// RUN: %clang --target=armv7-linux -mtp=tpidrprw -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER-TPIDRPRW %s -// ARMv7_THREAD_POINTER-TPIDRPRW: "-target-feature" "+read-tp-tpidrprw" - -// RUN: %clang --target=armv6k-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARM_THREAD_POINTER-HARD %s -// ARM_THREAD_POINTER-HARD: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv6k-linux -mtp=auto -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARM_THREAD_POINTER_AUTO %s -// ARM_THREAD_POINTER_AUTO-NOT: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=thumbv6k-apple-darwin -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=THUMBv6_THREAD_POINTER_NO_AUTO %s -// THUMBv6_THREAD_POINTER_NO_AUTO-NOT: "-target-feature" "+read-tp-tpidruro" - -// RUN: not %clang --target=thumbv6k-apple-darwin -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=THUMBv6_THREAD_POINTER_NO_HARD %s -// THUMBv6_THREAD_POINTER_NO_HARD: unsupported option '-mtp=' for target 'thumbv6k-apple-darwin' - -// RUN: not %clang --target=thumbv6t2-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARM_THREAD_POINTER_NO_HARD %s -// ARM_THREAD_POINTER_NO_HARD: hardware TLS register is not supported for the armv6t2 sub-architecture - -// RUN: %clang --target=armv5t-linux -mtp=cp15 -x assembler -### %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv5_THREAD_POINTER_ASSEMBLER %s -// ARMv5_THREAD_POINTER_ASSEMBLER-NOT: hardware TLS register is not supported for the armv5 sub-architecture - -// RUN: not %clang --target=armv6-linux -mthumb -mtp=cp15 -### -S 
%s 2>&1 | \ -// RUN: FileCheck -check-prefix=THUMBv6_THREAD_POINTER_UNSUPP %s -// RUN: not %clang --target=thumbv6-linux -mthumb -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=THUMBv6_THREAD_POINTER_UNSUPP %s -// THUMBv6_THREAD_POINTER_UNSUPP: hardware TLS register is not supported for the thumbv6 sub-architecture - -// RUN: %clang --target=armv7-linux -mtp=soft -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER_SOFT %s -// ARMv7_THREAD_POINTER_SOFT-NOT: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv7-linux -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER_NON %s -// ARMv7_THREAD_POINTER_NON: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv7-linux -mtp=auto -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER_Auto %s -// ARMv7_THREAD_POINTER_Auto: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv7-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7_THREAD_POINTER_HARD %s -// ARMv7_THREAD_POINTER_HARD: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv7m-linux -mtp=auto -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7m_THREAD_POINTER_Auto %s -// ARMv7m_THREAD_POINTER_Auto-NOT: "-target-feature" "+read-tp-tpidruro" - -// RUN: not %clang --target=armv7m-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv7m_THREAD_POINTER_HARD %s -// ARMv7m_THREAD_POINTER_HARD: hardware TLS register is not supported for the thumbv7m sub-architecture - -// RUN: %clang --target=armv5t-linux -mtp=auto -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv5t_THREAD_POINTER_Auto %s -// ARMv5t_THREAD_POINTER_Auto-NOT: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv6k-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv6k_THREAD_POINTER_Auto %s -// ARMv6k_THREAD_POINTER_Auto: "-target-feature" "+read-tp-tpidruro" - -// RUN: not %clang --target=armv6t2-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv6t2_THREAD_POINTER_HARD %s -// ARMv6t2_THREAD_POINTER_HARD: hardware TLS register is not supported for the armv6t2 sub-architecture - -// RUN: %clang --target=armv6t2-linux -mtp=auto -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMV6t2_THREAD_POINTER_AUTO %s -// ARMV6t2_THREAD_POINTER_AUTO-NOT: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv6kz-linux -mtp=cp15 -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMv6kz_THREAD_POINTER_HARD %s -// ARMv6kz_THREAD_POINTER_HARD: "-target-feature" "+read-tp-tpidruro" - -// RUN: %clang --target=armv6kz-linux -mtp=auto -### -S %s 2>&1 | \ -// RUN: FileCheck -check-prefix=ARMV6KZ_THREAD_POINTER_AUTO %s -// ARMV6KZ_THREAD_POINTER_AUTO-NOT: "-target-feature" "+read-tp-tpidruro" \ No newline at end of file +// This file tests the -mtp= functionality in Clang’s ARM driver. +// It verifies: +// +// 1. ARMv7 targets: explicit hardware modes, explicit soft mode, and auto mode. +// 2. M Profile variants: explicit hardware mode should fail and auto mode defaults to soft. +// 3. ARMv6 variants: explicit hardware modes on ARMv6K/KZ work, but auto mode falls back to soft when Thumb2 is missing. +// 4. ARMv5 variants: explicit hardware mode is rejected and auto mode defaults to soft. +// 5. Miscellaneous error cases (e.g. empty -mtp value). 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// 1. ARMv7 Targets +//===----------------------------------------------------------------------===// + +// Test explicit hardware mode using "tpidrprw" on an ARMv7 target. +// RUN: %clang --target=armv7-linux -mtp=tpidrprw -### -S %s 2>&1 | FileCheck -check-prefix=ARMv7_TPIDRPRW %s +// ARMv7_TPIDRPRW: "-target-feature" "+read-tp-tpidrprw" + +// Test explicit hardware mode using "tpidrurw" on an ARMv7 target. +// RUN: %clang --target=armv7-linux -mtp=tpidrurw -### -S %s 2>&1 | FileCheck -check-prefix=ARMv7_TPIDRURW %s +// ARMv7_TPIDRURW: "-target-feature" "+read-tp-tpidrurw" + +// Test explicit hardware mode using "tpidruro" on an ARMv7 target. +// RUN: %clang --target=armv7-linux -mtp=tpidruro -### -S %s 2>&1 | FileCheck -check-prefix=ARMv7_TPIDRURO %s +// ARMv7_TPIDRURO: "-target-feature" "+read-tp-tpidruro" + +// Test explicit "soft" mode on an ARMv7 target (forces software mode). +// RUN: %clang --target=armv7-linux -mtp=soft -### -S %s 2>&1 | FileCheck -check-prefix=ARM_Soft %s +// ARM_Soft-NOT: "-target-feature" "+read-tp-" + +// Test auto mode on an ARMv7 target (hardware support and Thumb2 yield HW mode). +// RUN: %clang --target=armv7-linux -mtp=auto -### -S %s 2>&1 | FileCheck -check-prefix=ARMv7_Auto %s +// Default mode is implicitly -mtp=auto +// RUN: %clang --target=armv7-linux -### -S %s 2>&1 | FileCheck -check-prefix=ARMv7_Auto %s +// ARMv7_Auto: "-target-feature" "+read-tp-tpidruro" + +//===----------------------------------------------------------------------===// +// 2. M Profile Variants (e.g. thumbv6t2) +//===----------------------------------------------------------------------===// + +// Test explicit hardware mode on an M Profile target: thumbv6t2 does not support CP15. +// RUN: not %clang --target=thumbv6t2-linux -mtp=cp15 -### -S %s 2>&1 | FileCheck -check-prefix=Thumbv6t2_Error %s +// Thumbv6t2_Error: error: hardware TLS register is not supported for the armv6t2 sub-architecture + +// Test auto mode on an M Profile target: should default to soft mode. +// RUN: %clang --target=thumbv6t2-linux -mtp=auto -### -S %s 2>&1 | FileCheck -check-prefix=Thumbv6t2_Auto %s +// Thumbv6t2_Auto-NOT: "-target-feature" "+read-tp-" + + +//===----------------------------------------------------------------------===// +// 3. ARMv6 Variants +//===----------------------------------------------------------------------===// + +// Test explicit hardware mode using "cp15" on ARMv6K and ARMv6KZ targets. +// RUN: %clang --target=armv6k-linux -mtp=cp15 -### -S %s 2>&1 | FileCheck -check-prefix=ARMv6k_Cp15 %s +// RUN: %clang --target=armv6kz-linux -mtp=cp15 -### -S %s 2>&1 | FileCheck -check-prefix=ARMv6k_Cp15 %s +// ARMv6k_Cp15: "-target-feature" "+read-tp-tpidruro" + + +// Test auto mode on ARMv6K and ARMv6KZ targets: defaults to soft mode due to missing Thumb2 encoding. +// RUN: %clang --target=armv6k-linux -mtp=auto -### -S %s 2>&1 | FileCheck -check-prefix=ARMv6k_Auto %s +// RUN: %clang --target=armv6kz-linux -mtp=auto -### -S %s 2>&1 | FileCheck -check-prefix=ARMv6k_Auto %s +// ARMv6k_Auto-NOT: "-target-feature" "+read-tp-" + + +//===----------------------------------------------------------------------===// +// 4. ARMv5 Variants +//===----------------------------------------------------------------------===// + +// Test explicit hardware mode on an ARMv5T target: hardware TP is not supported.
+// RUN: not %clang --target=armv5t-linux -mtp=cp15 -### -S %s 2>&1 | FileCheck -check-prefix=ARMv5t_Error %s +// ARMv5t_Error: error: hardware TLS register is not supported for the armv5 sub-architecture + +// Test auto mode on an ARMv5T target: should default to soft mode. +// RUN: %clang --target=armv5t-linux -mtp=auto -### -S %s 2>&1 | FileCheck -check-prefix=ARMv5t_Auto %s +// ARMv5t_Auto-NOT: "-target-feature" "+read-tp-" + +//===----------------------------------------------------------------------===// +// 5. Miscellaneous Tests +//===----------------------------------------------------------------------===// + +// Test empty -mtp value on an ARMv7 target: should produce a missing argument error. +// RUN: not %clang --target=armv7-linux -mtp= -### -S %s 2>&1 | FileCheck -check-prefix=Empty_MTP %s +// Empty_MTP: error: {{.*}}missing + +// Test that explicit hardware mode in assembler mode on an unsupported target does not fail with an error. +// RUN: %clang --target=thumbv6t2-linux -mtp=cp15 -x assembler -### %s 2>&1 | FileCheck -check-prefix=Thumbv6t2_Asm %s +// Thumbv6t2_Asm-NOT: "-target-feature" "+read-tp-" + +// A dummy main is provided to form a valid translation unit. +int main(void) { return 0; } + From aaf398c2e76853e890fa3aae35a7ea3aa4edac97 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 4 Apr 2025 09:03:49 +0100 Subject: [PATCH 0619/1029] [AArch64] Regenerate apple-unrolling-multi-exit.ll test checks (#134257) --- .../AArch64/apple-unrolling-multi-exit.ll | 84 +++++++++---------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll index 31b23eae0f866..30fff6c9e293c 100644 --- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 ; RUN: opt -p loop-unroll -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s ; RUN: opt -p loop-unroll -mcpu=apple-m2 -S %s | FileCheck --check-prefix=APPLE %s ; RUN: opt -p loop-unroll -mcpu=apple-m3 -S %s | FileCheck --check-prefix=APPLE %s @@ -26,20 +26,20 @@ define i1 @multi_2_exit_find_i8_loop(ptr %vec, i8 %tgt) { ; APPLE: [[LOOP_HEADER_PROL_PREHEADER]]: ; APPLE-NEXT: br label %[[LOOP_HEADER_PROL:.*]] ; APPLE: [[LOOP_HEADER_PROL]]: -; APPLE-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH_PROL:.*]] ], [ [[START]], %[[LOOP_HEADER_PROL_PREHEADER]] ] +; APPLE-NEXT: [[PTR_IV_PROL:%.*]] = phi ptr [ [[PTR_IV_NEXT_PROL:%.*]], %[[LOOP_LATCH_PROL:.*]] ], [ [[START]], %[[LOOP_HEADER_PROL_PREHEADER]] ] ; APPLE-NEXT: [[PROL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_HEADER_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[LOOP_LATCH_PROL]] ] -; APPLE-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8 -; APPLE-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]] -; APPLE-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT3:.*]], label %[[LOOP_LATCH_PROL]] +; APPLE-NEXT: [[L_PROL:%.*]] = load i8, ptr [[PTR_IV_PROL]], align 8 +; APPLE-NEXT: [[C_1_PROL:%.*]] = icmp eq i8 [[L_PROL]], [[TGT]] +; APPLE-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA_LOOPEXIT3:.*]], label %[[LOOP_LATCH_PROL]] ; APPLE: [[LOOP_LATCH_PROL]]: -; APPLE-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds
nuw i8, ptr [[PTR_IV]], i64 1 -; APPLE-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; APPLE-NEXT: [[PTR_IV_NEXT_PROL]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_PROL]], i64 1 +; APPLE-NEXT: [[C_2_PROL:%.*]] = icmp eq ptr [[PTR_IV_NEXT_PROL]], [[END]] ; APPLE-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; APPLE-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]] ; APPLE-NEXT: br i1 [[PROL_ITER_CMP]], label %[[LOOP_HEADER_PROL]], label %[[LOOP_HEADER_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]] ; APPLE: [[LOOP_HEADER_PROL_LOOPEXIT_UNR_LCSSA]]: ; APPLE-NEXT: [[RES_UNR_PH:%.*]] = phi ptr [ [[END]], %[[LOOP_LATCH_PROL]] ] -; APPLE-NEXT: [[PTR_IV_UNR_PH:%.*]] = phi ptr [ [[PTR_IV_NEXT]], %[[LOOP_LATCH_PROL]] ] +; APPLE-NEXT: [[PTR_IV_UNR_PH:%.*]] = phi ptr [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ] ; APPLE-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]] ; APPLE: [[LOOP_HEADER_PROL_LOOPEXIT]]: ; APPLE-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[RES_UNR_PH]], %[[LOOP_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] @@ -49,17 +49,17 @@ define i1 @multi_2_exit_find_i8_loop(ptr %vec, i8 %tgt) { ; APPLE: [[ENTRY_NEW]]: ; APPLE-NEXT: br label %[[LOOP_HEADER:.*]] ; APPLE: [[LOOP_HEADER]]: -; APPLE-NEXT: [[PTR_IV1:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[RES:%.*]], %[[LOOP_LATCH_3:.*]] ] -; APPLE-NEXT: [[L1:%.*]] = load i8, ptr [[PTR_IV1]], align 8 -; APPLE-NEXT: [[C_4:%.*]] = icmp eq i8 [[L1]], [[TGT]] -; APPLE-NEXT: br i1 [[C_4]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]] +; APPLE-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_3:%.*]], %[[LOOP_LATCH_3:.*]] ] +; APPLE-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 8 +; APPLE-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], [[TGT]] +; APPLE-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]] ; APPLE: [[LOOP_LATCH]]: -; APPLE-NEXT: [[PTR_IV_NEXT1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV1]], i64 1 -; APPLE-NEXT: [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT1]], align 8 +; APPLE-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 1 +; APPLE-NEXT: [[L_1:%.*]] = load i8, ptr [[PTR_IV_NEXT]], align 8 ; APPLE-NEXT: [[C_1_1:%.*]] = icmp eq i8 [[L_1]], [[TGT]] ; APPLE-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1:.*]] ; APPLE: [[LOOP_LATCH_1]]: -; APPLE-NEXT: [[PTR_IV_NEXT_1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT1]], i64 1 +; APPLE-NEXT: [[PTR_IV_NEXT_1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 1 ; APPLE-NEXT: [[L_2:%.*]] = load i8, ptr [[PTR_IV_NEXT_1]], align 8 ; APPLE-NEXT: [[C_1_2:%.*]] = icmp eq i8 [[L_2]], [[TGT]] ; APPLE-NEXT: br i1 [[C_1_2]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_2:.*]] @@ -69,22 +69,22 @@ define i1 @multi_2_exit_find_i8_loop(ptr %vec, i8 %tgt) { ; APPLE-NEXT: [[C_1_3:%.*]] = icmp eq i8 [[L_3]], [[TGT]] ; APPLE-NEXT: br i1 [[C_1_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_3]] ; APPLE: [[LOOP_LATCH_3]]: -; APPLE-NEXT: [[RES]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT_2]], i64 1 -; APPLE-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]] -; APPLE-NEXT: br i1 [[C_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]] +; APPLE-NEXT: [[PTR_IV_NEXT_3]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT_2]], i64 1 +; APPLE-NEXT: [[C_2_3:%.*]] = icmp eq ptr [[PTR_IV_NEXT_3]], [[END]] +; APPLE-NEXT: br i1 [[C_2_3]], label 
%[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]] ; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]: -; APPLE-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV1]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT1]], %[[LOOP_LATCH]] ], [ [[PTR_IV_NEXT_1]], %[[LOOP_LATCH_1]] ], [ [[PTR_IV_NEXT_2]], %[[LOOP_LATCH_2]] ], [ [[END]], %[[LOOP_LATCH_3]] ] +; APPLE-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[PTR_IV_NEXT_1]], %[[LOOP_LATCH_1]] ], [ [[PTR_IV_NEXT_2]], %[[LOOP_LATCH_2]] ], [ [[END]], %[[LOOP_LATCH_3]] ] ; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA:.*]] ; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT3]]: -; APPLE-NEXT: [[RES_PH_PH4:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER_PROL]] ] +; APPLE-NEXT: [[RES_PH_PH4:%.*]] = phi ptr [ [[PTR_IV_PROL]], %[[LOOP_HEADER_PROL]] ] ; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]] ; APPLE: [[EXIT_UNR_LCSSA]]: ; APPLE-NEXT: [[RES_PH:%.*]] = phi ptr [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ], [ [[RES_PH_PH4]], %[[EXIT_UNR_LCSSA_LOOPEXIT3]] ] ; APPLE-NEXT: br label %[[EXIT]] ; APPLE: [[EXIT]]: -; APPLE-NEXT: [[RES1:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ] -; APPLE-NEXT: [[C_5:%.*]] = icmp eq ptr [[RES1]], [[END]] -; APPLE-NEXT: ret i1 [[C_5]] +; APPLE-NEXT: [[RES:%.*]] = phi ptr [ [[RES_UNR]], %[[LOOP_HEADER_PROL_LOOPEXIT]] ], [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ] +; APPLE-NEXT: [[C_3:%.*]] = icmp eq ptr [[RES]], [[END]] +; APPLE-NEXT: ret i1 [[C_3]] ; ; OTHER-LABEL: define i1 @multi_2_exit_find_i8_loop( ; OTHER-SAME: ptr [[VEC:%.*]], i8 [[TGT:%.*]]) #[[ATTR0:[0-9]+]] { @@ -154,20 +154,20 @@ define i1 @multi_2_exit_find_ptr_loop(ptr %vec, ptr %tgt) { ; APPLE: [[LOOP_HEADER_PROL_PREHEADER]]: ; APPLE-NEXT: br label %[[LOOP_HEADER_PROL:.*]] ; APPLE: [[LOOP_HEADER_PROL]]: -; APPLE-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH_PROL:.*]] ], [ [[START]], %[[LOOP_HEADER_PROL_PREHEADER]] ] +; APPLE-NEXT: [[PTR_IV_PROL:%.*]] = phi ptr [ [[PTR_IV_NEXT_PROL:%.*]], %[[LOOP_LATCH_PROL:.*]] ], [ [[START]], %[[LOOP_HEADER_PROL_PREHEADER]] ] ; APPLE-NEXT: [[PROL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_HEADER_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[LOOP_LATCH_PROL]] ] -; APPLE-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8 -; APPLE-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]] -; APPLE-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT3:.*]], label %[[LOOP_LATCH_PROL]] +; APPLE-NEXT: [[L_PROL:%.*]] = load ptr, ptr [[PTR_IV_PROL]], align 8 +; APPLE-NEXT: [[C_1_PROL:%.*]] = icmp eq ptr [[L_PROL]], [[TGT]] +; APPLE-NEXT: br i1 [[C_1_PROL]], label %[[EXIT_UNR_LCSSA_LOOPEXIT3:.*]], label %[[LOOP_LATCH_PROL]] ; APPLE: [[LOOP_LATCH_PROL]]: -; APPLE-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8 -; APPLE-NEXT: [[C_2:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; APPLE-NEXT: [[PTR_IV_NEXT_PROL]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_PROL]], i64 8 +; APPLE-NEXT: [[C_2_PROL:%.*]] = icmp eq ptr [[PTR_IV_NEXT_PROL]], [[END]] ; APPLE-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1 ; APPLE-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]] ; APPLE-NEXT: br i1 [[PROL_ITER_CMP]], label %[[LOOP_HEADER_PROL]], label %[[LOOP_HEADER_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP2:![0-9]+]] ; APPLE: [[LOOP_HEADER_PROL_LOOPEXIT_UNR_LCSSA]]: ; APPLE-NEXT: [[RES_UNR_PH:%.*]] = phi ptr [ [[END]], %[[LOOP_LATCH_PROL]] ] -; APPLE-NEXT: [[PTR_IV_UNR_PH:%.*]] = phi ptr [ [[PTR_IV_NEXT]], %[[LOOP_LATCH_PROL]] ] +; 
APPLE-NEXT: [[PTR_IV_UNR_PH:%.*]] = phi ptr [ [[PTR_IV_NEXT_PROL]], %[[LOOP_LATCH_PROL]] ] ; APPLE-NEXT: br label %[[LOOP_HEADER_PROL_LOOPEXIT]] ; APPLE: [[LOOP_HEADER_PROL_LOOPEXIT]]: ; APPLE-NEXT: [[RES_UNR:%.*]] = phi ptr [ poison, %[[ENTRY]] ], [ [[RES_UNR_PH]], %[[LOOP_HEADER_PROL_LOOPEXIT_UNR_LCSSA]] ] @@ -177,17 +177,17 @@ define i1 @multi_2_exit_find_ptr_loop(ptr %vec, ptr %tgt) { ; APPLE: [[ENTRY_NEW]]: ; APPLE-NEXT: br label %[[LOOP_HEADER:.*]] ; APPLE: [[LOOP_HEADER]]: -; APPLE-NEXT: [[PTR_IV1:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_3:%.*]], %[[LOOP_LATCH_3:.*]] ] -; APPLE-NEXT: [[L1:%.*]] = load ptr, ptr [[PTR_IV1]], align 8 -; APPLE-NEXT: [[C_4:%.*]] = icmp eq ptr [[L1]], [[TGT]] -; APPLE-NEXT: br i1 [[C_4]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]] +; APPLE-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_UNR]], %[[ENTRY_NEW]] ], [ [[PTR_IV_NEXT_3:%.*]], %[[LOOP_LATCH_3:.*]] ] +; APPLE-NEXT: [[L:%.*]] = load ptr, ptr [[PTR_IV]], align 8 +; APPLE-NEXT: [[C_1:%.*]] = icmp eq ptr [[L]], [[TGT]] +; APPLE-NEXT: br i1 [[C_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP_LATCH:.*]] ; APPLE: [[LOOP_LATCH]]: -; APPLE-NEXT: [[PTR_IV_NEXT1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV1]], i64 8 -; APPLE-NEXT: [[L_1:%.*]] = load ptr, ptr [[PTR_IV_NEXT1]], align 8 +; APPLE-NEXT: [[PTR_IV_NEXT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 8 +; APPLE-NEXT: [[L_1:%.*]] = load ptr, ptr [[PTR_IV_NEXT]], align 8 ; APPLE-NEXT: [[C_1_1:%.*]] = icmp eq ptr [[L_1]], [[TGT]] ; APPLE-NEXT: br i1 [[C_1_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_1:.*]] ; APPLE: [[LOOP_LATCH_1]]: -; APPLE-NEXT: [[PTR_IV_NEXT_1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT1]], i64 8 +; APPLE-NEXT: [[PTR_IV_NEXT_1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_NEXT]], i64 8 ; APPLE-NEXT: [[L_2:%.*]] = load ptr, ptr [[PTR_IV_NEXT_1]], align 8 ; APPLE-NEXT: [[C_1_2:%.*]] = icmp eq ptr [[L_2]], [[TGT]] ; APPLE-NEXT: br i1 [[C_1_2]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_LATCH_2:.*]] @@ -201,10 +201,10 @@ define i1 @multi_2_exit_find_ptr_loop(ptr %vec, ptr %tgt) { ; APPLE-NEXT: [[C_2_3:%.*]] = icmp eq ptr [[PTR_IV_NEXT_3]], [[END]] ; APPLE-NEXT: br i1 [[C_2_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT]], label %[[LOOP_HEADER]] ; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]: -; APPLE-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV1]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT1]], %[[LOOP_LATCH]] ], [ [[PTR_IV_NEXT_1]], %[[LOOP_LATCH_1]] ], [ [[PTR_IV_NEXT_2]], %[[LOOP_LATCH_2]] ], [ [[END]], %[[LOOP_LATCH_3]] ] +; APPLE-NEXT: [[RES_PH_PH:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER]] ], [ [[PTR_IV_NEXT]], %[[LOOP_LATCH]] ], [ [[PTR_IV_NEXT_1]], %[[LOOP_LATCH_1]] ], [ [[PTR_IV_NEXT_2]], %[[LOOP_LATCH_2]] ], [ [[END]], %[[LOOP_LATCH_3]] ] ; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA:.*]] ; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT3]]: -; APPLE-NEXT: [[RES_PH_PH4:%.*]] = phi ptr [ [[PTR_IV]], %[[LOOP_HEADER_PROL]] ] +; APPLE-NEXT: [[RES_PH_PH4:%.*]] = phi ptr [ [[PTR_IV_PROL]], %[[LOOP_HEADER_PROL]] ] ; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]] ; APPLE: [[EXIT_UNR_LCSSA]]: ; APPLE-NEXT: [[RES_PH:%.*]] = phi ptr [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ], [ [[RES_PH_PH4]], %[[EXIT_UNR_LCSSA_LOOPEXIT3]] ] @@ -336,6 +336,7 @@ exit: %c.3 = icmp eq ptr %res, %end ret i1 %c.3 } + define i1 @multi_3_exit_find_ptr_loop(ptr %vec, ptr %tgt, ptr %tgt2) { ; APPLE-LABEL: define i1 @multi_3_exit_find_ptr_loop( ; APPLE-SAME: ptr [[VEC:%.*]], ptr [[TGT:%.*]], ptr 
[[TGT2:%.*]]) #[[ATTR0]] { @@ -508,8 +509,3 @@ exit.2: } declare void @llvm.assume(i1 noundef) -;. -; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} -; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"} -; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} -;. From 9eb7e64145f66f59f07edffb4f9c8a648174543e Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Fri, 4 Apr 2025 09:41:54 +0100 Subject: [PATCH 0620/1029] [mlir][spirv] Add verification for Bias operand (#134231) --- mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp | 50 ++++++++++++++++++----- mlir/test/Dialect/SPIRV/IR/image-ops.mlir | 32 ++++++++++++++- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp index b198294b9bdd6..a021931425fb0 100644 --- a/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/ImageOps.cpp @@ -42,6 +42,36 @@ static LogicalResult verifyImageOperands(Operation *imageOp, // The order we process operands is important. In case of multiple argument // taking operands, the arguments are ordered starting with operands having // smaller-numbered bits first. + if (spirv::bitEnumContainsAny(attr.getValue(), spirv::ImageOperands::Bias)) { + if (!isa(imageOp)) + return imageOp->emitError( + "Bias is only valid with implicit-lod instructions"); + + if (index + 1 > operands.size()) + return imageOp->emitError("Bias operand requires 1 argument"); + + if (!isa(operands[index].getType())) + return imageOp->emitError("Bias must be a floating-point type scalar"); + + auto samplingOp = cast(imageOp); + auto sampledImageType = + cast(samplingOp.getSampledImage().getType()); + auto imageType = cast(sampledImageType.getImageType()); + + if (!llvm::is_contained({spirv::Dim::Dim1D, spirv::Dim::Dim2D, + spirv::Dim::Dim3D, spirv::Dim::Cube}, + imageType.getDim())) + return imageOp->emitError( + "Bias must only be used with an image type that has " + "a dim operand of 1D, 2D, 3D, or Cube"); + + if (imageType.getSamplingInfo() != spirv::ImageSamplingInfo::SingleSampled) + return imageOp->emitError("Bias must only be used with an image type " + "that has a MS operand of 0"); + + ++index; + } + if (spirv::bitEnumContainsAny(attr.getValue(), spirv::ImageOperands::Lod)) { if (!isa(imageOp) && !isa(imageOp)) @@ -74,12 +104,13 @@ static LogicalResult verifyImageOperands(Operation *imageOp, if (!llvm::is_contained({spirv::Dim::Dim1D, spirv::Dim::Dim2D, spirv::Dim::Dim3D, spirv::Dim::Cube}, imageType.getDim())) - return imageOp->emitError("Lod only be used with an image type that has " - "a dim operand of 1D, 2D, 3D, or Cube"); + return imageOp->emitError( + "Lod must only be used with an image type that has " + "a dim operand of 1D, 2D, 3D, or Cube"); if (imageType.getSamplingInfo() != spirv::ImageSamplingInfo::SingleSampled) - return imageOp->emitError( - "Lod only be used with an image type that has a MS operand of 0"); + return imageOp->emitError("Lod must only be used with an image type that " + "has a MS operand of 0"); ++index; } @@ -99,8 +130,8 @@ static LogicalResult verifyImageOperands(Operation *imageOp, auto imageType = cast(sampledImageType.getImageType()); if (imageType.getSamplingInfo() != spirv::ImageSamplingInfo::SingleSampled) - return imageOp->emitError( - "Grad only be used with an image type that has a MS operand of 0"); + return imageOp->emitError("Grad must only be used with an image type " + "that has a MS operand of 0"); int64_t numberOfComponents = 0; @@ -147,10 +178,9 @@ static LogicalResult 
verifyImageOperands(Operation *imageOp, // TODO: Add the validation rules for the following Image Operands. spirv::ImageOperands noSupportOperands = - spirv::ImageOperands::Bias | spirv::ImageOperands::ConstOffset | - spirv::ImageOperands::Offset | spirv::ImageOperands::ConstOffsets | - spirv::ImageOperands::Sample | spirv::ImageOperands::MinLod | - spirv::ImageOperands::MakeTexelAvailable | + spirv::ImageOperands::ConstOffset | spirv::ImageOperands::Offset | + spirv::ImageOperands::ConstOffsets | spirv::ImageOperands::Sample | + spirv::ImageOperands::MinLod | spirv::ImageOperands::MakeTexelAvailable | spirv::ImageOperands::MakeTexelVisible | spirv::ImageOperands::SignExtend | spirv::ImageOperands::ZeroExtend; diff --git a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir index 9a0b8b79e3e01..1ebdfdb41de1b 100644 --- a/mlir/test/Dialect/SPIRV/IR/image-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/image-ops.mlir @@ -276,6 +276,36 @@ func.func @sample_implicit_proj_dref(%arg0 : !spirv.sampled_image>, %arg1 : f32, %arg2 : f32) -> () { + // expected-error @+1 {{too many image operand arguments have been provided}} + %0 = spirv.ImageSampleImplicitLod %arg0, %arg1 ["Bias"], %arg2, %arg2 : !spirv.sampled_image>, f32, f32, f32 -> vector<4xf32> + spirv.Return +} + +// ----- + +func.func @bias_too_many_arguments(%arg0 : !spirv.sampled_image>, %arg1 : f32, %arg2 : i32) -> () { + // expected-error @+1 {{Bias must be a floating-point type scalar}} + %0 = spirv.ImageSampleImplicitLod %arg0, %arg1 ["Bias"], %arg2 : !spirv.sampled_image>, f32, i32 -> vector<4xf32> + spirv.Return +} + +// ----- + +func.func @bias_with_rect(%arg0 : !spirv.sampled_image>, %arg1 : f32, %arg2 : f32) -> () { + // expected-error @+1 {{Bias must only be used with an image type that has a dim operand of 1D, 2D, 3D, or Cube}} + %0 = spirv.ImageSampleImplicitLod %arg0, %arg1 ["Bias"], %arg2 : !spirv.sampled_image>, f32, f32 -> vector<4xf32> + spirv.Return +} + +// TODO: We cannot currently test Bias with MS != 0 as all implemented implicit operations already check for that. + +// ----- + //===----------------------------------------------------------------------===// // spirv.ImageOperands: Lod //===----------------------------------------------------------------------===// @@ -305,7 +335,7 @@ func.func @lod_too_many_arguments(%arg0 : !spirv.sampled_image>, %arg1 : vector<2xf32>, %arg2 : f32) -> () { - // expected-error @+1 {{Lod only be used with an image type that has a dim operand of 1D, 2D, 3D, or Cube}} + // expected-error @+1 {{Lod must only be used with an image type that has a dim operand of 1D, 2D, 3D, or Cube}} %0 = spirv.ImageSampleExplicitLod %arg0, %arg1 ["Lod"], %arg2 : !spirv.sampled_image>, vector<2xf32>, f32 -> vector<4xf32> spirv.Return } From c154d66339ed6e3c236849e4999cf0dc6ddec777 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 4 Apr 2025 09:01:56 +0000 Subject: [PATCH 0621/1029] [mlir] Apply ClangTidyBugProne finding (NFC). 
argument name 'outputType' in comment does not match parameter name 'outputTypes' --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 444c505b64232..4554801b3a388 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1391,7 +1391,7 @@ Block *ConversionPatternRewriterImpl::applySignatureConversion( MaterializationKind::Source, OpBuilder::InsertPoint(newBlock, newBlock->begin()), origArg.getLoc(), /*valuesToMap=*/{origArg}, /*inputs=*/ValueRange(), - /*outputType=*/origArgType, /*originalType=*/Type(), converter); + /*outputTypes=*/origArgType, /*originalType=*/Type(), converter); appendRewrite(block, origArg, converter); continue; } @@ -1496,7 +1496,7 @@ Value ConversionPatternRewriterImpl::findOrBuildReplacementValue( buildUnresolvedMaterialization(MaterializationKind::Source, computeInsertPoint(repl), value.getLoc(), /*valuesToMap=*/repl, /*inputs=*/repl, - /*outputType=*/value.getType(), + /*outputTypes=*/value.getType(), /*originalType=*/Type(), converter) .front(); return castValue; @@ -1552,7 +1552,7 @@ void ConversionPatternRewriterImpl::notifyOpReplaced( buildUnresolvedMaterialization( MaterializationKind::Source, computeInsertPoint(result), result.getLoc(), /*valuesToMap=*/{result}, /*inputs=*/ValueRange(), - /*outputType=*/result.getType(), /*originalType=*/Type(), + /*outputTypes=*/result.getType(), /*originalType=*/Type(), currentTypeConverter); continue; } else { From 1302610f03a1f10c2eea4c66445ccba4c52887b6 Mon Sep 17 00:00:00 2001 From: Tobias Stadler Date: Fri, 4 Apr 2025 10:16:40 +0100 Subject: [PATCH 0622/1029] [MergeFunc] Fix crash caused by bitcasting ArrayType (#133259) createCast in MergeFunctions did not consider ArrayTypes, which results in the creation of a bitcast between ArrayTypes in the thunk function, leading to an assertion failure in the provided test case. The version of createCast in GlobalMergeFunctions does handle ArrayTypes, so this common code has been factored out into the IRBuilder. --- llvm/include/llvm/IR/IRBuilder.h | 7 ++ llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 47 ++---------- llvm/lib/IR/IRBuilder.cpp | 34 +++++++++ llvm/lib/Transforms/IPO/MergeFunctions.cpp | 31 +------- .../Transforms/MergeFunc/crash-cast-arrays.ll | 76 +++++++++++++++++++ 5 files changed, 124 insertions(+), 71 deletions(-) create mode 100644 llvm/test/Transforms/MergeFunc/crash-cast-arrays.ll diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 07660e93253da..0e68ffadc6939 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2299,6 +2299,13 @@ class IRBuilderBase { // isSigned parameter. Value *CreateIntCast(Value *, Type *, const char *) = delete; + /// Cast between aggregate types that must have identical structure but may + /// differ in their leaf types. The leaf values are recursively extracted, + /// casted, and then reinserted into a value of type DestTy. The leaf types + /// must be castable using a bitcast or ptrcast, because signedness is + /// not specified. 
+ Value *CreateAggregateCast(Value *V, Type *DestTy); + //===--------------------------------------------------------------------===// // Instruction creation methods: Compare Instructions //===--------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp index e920b1be6822c..d4c53e79ed2e1 100644 --- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp +++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp @@ -140,44 +140,6 @@ static bool ignoreOp(const Instruction *I, unsigned OpIdx) { return true; } -static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) { - Type *SrcTy = V->getType(); - if (SrcTy->isStructTy()) { - assert(DestTy->isStructTy()); - assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements()); - Value *Result = PoisonValue::get(DestTy); - for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) { - Value *Element = - createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)), - DestTy->getStructElementType(I)); - - Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I)); - } - return Result; - } - assert(!DestTy->isStructTy()); - if (auto *SrcAT = dyn_cast(SrcTy)) { - auto *DestAT = dyn_cast(DestTy); - assert(DestAT); - assert(SrcAT->getNumElements() == DestAT->getNumElements()); - Value *Result = PoisonValue::get(DestTy); - for (unsigned int I = 0, E = SrcAT->getNumElements(); I < E; ++I) { - Value *Element = - createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)), - DestAT->getElementType()); - - Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I)); - } - return Result; - } - assert(!DestTy->isArrayTy()); - if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) - return Builder.CreateIntToPtr(V, DestTy); - if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) - return Builder.CreatePtrToInt(V, DestTy); - return Builder.CreateBitCast(V, DestTy); -} - void GlobalMergeFunc::analyze(Module &M) { ++NumAnalyzedModues; for (Function &Func : M) { @@ -268,7 +230,7 @@ static Function *createMergedFunction(FuncMergeInfo &FI, if (OrigC->getType() != NewArg->getType()) { IRBuilder<> Builder(Inst->getParent(), Inst->getIterator()); Inst->setOperand(OpndIndex, - createCast(Builder, NewArg, OrigC->getType())); + Builder.CreateAggregateCast(NewArg, OrigC->getType())); } else { Inst->setOperand(OpndIndex, NewArg); } @@ -297,7 +259,8 @@ static void createThunk(FuncMergeInfo &FI, ArrayRef Params, // Add arguments which are passed through Thunk. 
for (Argument &AI : Thunk->args()) { - Args.push_back(createCast(Builder, &AI, ToFuncTy->getParamType(ParamIdx))); + Args.push_back( + Builder.CreateAggregateCast(&AI, ToFuncTy->getParamType(ParamIdx))); ++ParamIdx; } @@ -305,7 +268,7 @@ static void createThunk(FuncMergeInfo &FI, ArrayRef Params, for (auto *Param : Params) { assert(ParamIdx < ToFuncTy->getNumParams()); Args.push_back( - createCast(Builder, Param, ToFuncTy->getParamType(ParamIdx))); + Builder.CreateAggregateCast(Param, ToFuncTy->getParamType(ParamIdx))); ++ParamIdx; } @@ -319,7 +282,7 @@ static void createThunk(FuncMergeInfo &FI, ArrayRef Params, if (Thunk->getReturnType()->isVoidTy()) Builder.CreateRetVoid(); else - Builder.CreateRet(createCast(Builder, CI, Thunk->getReturnType())); + Builder.CreateRet(Builder.CreateAggregateCast(CI, Thunk->getReturnType())); } // Check if the old merged/optimized IndexOperandHashMap is compatible with diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 421b617a5fb7e..e5a2f08c393c9 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -76,6 +76,40 @@ void IRBuilderBase::SetInstDebugLocation(Instruction *I) const { } } +Value *IRBuilderBase::CreateAggregateCast(Value *V, Type *DestTy) { + Type *SrcTy = V->getType(); + if (SrcTy == DestTy) + return V; + + if (SrcTy->isAggregateType()) { + unsigned NumElements; + if (SrcTy->isStructTy()) { + assert(DestTy->isStructTy() && "Expected StructType"); + assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements() && + "Expected StructTypes with equal number of elements"); + NumElements = SrcTy->getStructNumElements(); + } else { + assert(SrcTy->isArrayTy() && DestTy->isArrayTy() && "Expected ArrayType"); + assert(SrcTy->getArrayNumElements() == DestTy->getArrayNumElements() && + "Expected ArrayTypes with equal number of elements"); + NumElements = SrcTy->getArrayNumElements(); + } + + Value *Result = PoisonValue::get(DestTy); + for (unsigned I = 0; I < NumElements; ++I) { + Type *ElementTy = SrcTy->isStructTy() ? DestTy->getStructElementType(I) + : DestTy->getArrayElementType(); + Value *Element = + CreateAggregateCast(CreateExtractValue(V, ArrayRef(I)), ElementTy); + + Result = CreateInsertValue(Result, Element, ArrayRef(I)); + } + return Result; + } + + return CreateBitOrPointerCast(V, DestTy); +} + CallInst * IRBuilderBase::createCallHelper(Function *Callee, ArrayRef Ops, const Twine &Name, FMFSource FMFSource, diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp index 924db314674d5..c58c0f40c1b23 100644 --- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -511,33 +511,6 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) { } } -// Helper for writeThunk, -// Selects proper bitcast operation, -// but a bit simpler then CastInst::getCastOpcode. 
-static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) { - Type *SrcTy = V->getType(); - if (SrcTy->isStructTy()) { - assert(DestTy->isStructTy()); - assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements()); - Value *Result = PoisonValue::get(DestTy); - for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) { - Value *Element = - createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)), - DestTy->getStructElementType(I)); - - Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I)); - } - return Result; - } - assert(!DestTy->isStructTy()); - if (SrcTy->isIntegerTy() && DestTy->isPointerTy()) - return Builder.CreateIntToPtr(V, DestTy); - else if (SrcTy->isPointerTy() && DestTy->isIntegerTy()) - return Builder.CreatePtrToInt(V, DestTy); - else - return Builder.CreateBitCast(V, DestTy); -} - // Erase the instructions in PDIUnrelatedWL as they are unrelated to the // parameter debug info, from the entry block. void MergeFunctions::eraseInstsUnrelatedToPDI( @@ -789,7 +762,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { unsigned i = 0; FunctionType *FFTy = F->getFunctionType(); for (Argument &AI : H->args()) { - Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i))); + Args.push_back(Builder.CreateAggregateCast(&AI, FFTy->getParamType(i))); ++i; } @@ -804,7 +777,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { if (H->getReturnType()->isVoidTy()) { RI = Builder.CreateRetVoid(); } else { - RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType())); + RI = Builder.CreateRet(Builder.CreateAggregateCast(CI, H->getReturnType())); } if (MergeFunctionsPDI) { diff --git a/llvm/test/Transforms/MergeFunc/crash-cast-arrays.ll b/llvm/test/Transforms/MergeFunc/crash-cast-arrays.ll new file mode 100644 index 0000000000000..6a18feba1263a --- /dev/null +++ b/llvm/test/Transforms/MergeFunc/crash-cast-arrays.ll @@ -0,0 +1,76 @@ +; RUN: opt -S -passes=mergefunc < %s | FileCheck %s + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + +%A = type { double } +; the intermediary struct causes A_arr and B_arr to be different types +%A_struct = type { %A } +%A_arr = type { [1 x %A_struct] } + +%B = type { double } +%B_struct = type { %B } +%B_arr = type { [1 x %B_struct] } + +; conversion between C_arr and D_arr is possible, but requires ptrcast +%C = type { i64 } +%C_struct = type { %C } +%C_arr = type { [1 x %C_struct] } + +%D = type { ptr } +%D_struct = type { %D } +%D_arr = type { [1 x %D_struct] } + +declare void @noop() + +define %A_arr @a() { +; CHECK-LABEL: define %A_arr @a() { +; CHECK-NEXT: call void @noop() +; CHECK-NEXT: ret %A_arr zeroinitializer +; + call void @noop() + ret %A_arr zeroinitializer +} + +define %C_arr @c() { +; CHECK-LABEL: define %C_arr @c() { +; CHECK-NEXT: call void @noop() +; CHECK-NEXT: ret %C_arr zeroinitializer +; + call void @noop() + ret %C_arr zeroinitializer +} + +define %B_arr @b() { +; CHECK-LABEL: define %B_arr @b() { +; CHECK-NEXT: [[TMP1:%.*]] = tail call %A_arr @a +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue %A_arr [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [1 x %A_struct] [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue %A_struct [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue %A [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue %B poison, double [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue %B_struct poison, %B [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue [1 x %B_struct] 
poison, %B_struct [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue %B_arr poison, [1 x %B_struct] [[TMP8]], 0 +; CHECK-NEXT: ret %B_arr [[TMP9]] +; + call void @noop() + ret %B_arr zeroinitializer +} + +define %D_arr @d() { +; CHECK-LABEL: define %D_arr @d() { +; CHECK-NEXT: [[TMP1:%.*]] = tail call %C_arr @c +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue %C_arr [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [1 x %C_struct] [[TMP2]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue %C_struct [[TMP3]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue %C [[TMP4]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = insertvalue %D poison, ptr [[TMP10]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue %D_struct poison, %D [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertvalue [1 x %D_struct] poison, %D_struct [[TMP7]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertvalue %D_arr poison, [1 x %D_struct] [[TMP8]], 0 +; CHECK-NEXT: ret %D_arr [[TMP9]] +; + call void @noop() + ret %D_arr zeroinitializer +} From a4573ee38d4497749d06aedb422159277cccfd66 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 4 Apr 2025 10:16:50 +0100 Subject: [PATCH 0623/1029] [LoopUnroll] UnrollRuntimeMultiExit takes precedence over TTI. (#134259) Update UnrollRuntimeLoopRemainder to always give priority to the UnrollRuntimeMultiExit option, if provided. After ad9da92cf6f7357 (https://github.com/llvm/llvm-project/pull/124462), we would ignore the option if the backend indicates multi-exit is profitable. This means it cannot be used to disable runtime unrolling. To be consistent with canProfitablyRuntimeUnrollMultiExitLoop, always respect the option. This surfaced while discussing https://github.com/llvm/llvm-project/pull/131998. PR: https://github.com/llvm/llvm-project/pull/134259 --- .../Transforms/Utils/LoopUnrollRuntime.cpp | 26 ++++++++++--------- .../AArch64/apple-unrolling-multi-exit.ll | 1 + 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 524b268aee2f3..bf882d7406853 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -465,10 +465,6 @@ static bool canProfitablyRuntimeUnrollMultiExitLoop( Loop *L, SmallVectorImpl &OtherExits, BasicBlock *LatchExit, bool UseEpilogRemainder) { - // Priority goes to UnrollRuntimeMultiExit if it's supplied. - if (UnrollRuntimeMultiExit.getNumOccurrences()) - return UnrollRuntimeMultiExit; - // The main pain point with multi-exit loop unrolling is that once unrolled, // we will not be able to merge all blocks into a straight line code. // There are branches within the unrolled loop that go to the OtherExits. @@ -633,14 +629,20 @@ bool llvm::UnrollRuntimeLoopRemainder( if (!PreserveLCSSA) return false; - if (!RuntimeUnrollMultiExit && - !canProfitablyRuntimeUnrollMultiExitLoop(L, OtherExits, LatchExit, - UseEpilogRemainder)) { - LLVM_DEBUG( - dbgs() - << "Multiple exit/exiting blocks in loop and multi-exit unrolling not " - "enabled!\n"); - return false; + // Priority goes to UnrollRuntimeMultiExit if it's supplied. + if (UnrollRuntimeMultiExit.getNumOccurrences()) { + if (!UnrollRuntimeMultiExit) + return false; + } else { + // Otherwise perform multi-exit unrolling, if either the target indicates + // it is profitable or the general profitability heuristics apply. 
+      if (!RuntimeUnrollMultiExit &&
+          !canProfitablyRuntimeUnrollMultiExitLoop(L, OtherExits, LatchExit,
+                                                   UseEpilogRemainder)) {
+        LLVM_DEBUG(dbgs() << "Multiple exit/exiting blocks in loop and "
+                             "multi-exit unrolling not enabled!\n");
+        return false;
+      }
+    }
   }
   // Use Scalar Evolution to compute the trip count. This allows more loops to
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
index 30fff6c9e293c..90595982a586e 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling-multi-exit.ll
@@ -3,6 +3,7 @@
 ; RUN: opt -p loop-unroll -mcpu=apple-m2 -S %s | FileCheck --check-prefix=APPLE %s
 ; RUN: opt -p loop-unroll -mcpu=apple-m3 -S %s | FileCheck --check-prefix=APPLE %s
 ; RUN: opt -p loop-unroll -mcpu=apple-m4 -S %s | FileCheck --check-prefix=APPLE %s
+; RUN: opt -p loop-unroll -mcpu=apple-m1 -unroll-runtime-multi-exit=false -S %s | FileCheck --check-prefix=OTHER %s
 ; RUN: opt -p loop-unroll -mcpu=cortex-a57 -S %s | FileCheck --check-prefix=OTHER %s

 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"

From 4da5e9dd320e9d48be0fa05ba1a8faf50fb53834 Mon Sep 17 00:00:00 2001
From: gbMattN
Date: Fri, 4 Apr 2025 10:41:00 +0100
Subject: [PATCH 0624/1029] [ASan] Fix shadowed-stack-serialization test on
 targets with different stack layout

---
 .../test/asan/TestCases/shadowed-stack-serialization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp b/compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp
index f2706c671c261..4018c097aed5a 100644
--- a/compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp
+++ b/compiler-rt/test/asan/TestCases/shadowed-stack-serialization.cpp
@@ -6,7 +6,7 @@ int main() {
   {
     int x;
     delete &x;
+    // CHECK: {{.*}}) 'x' (line [[@LINE-2]])
   }
 }
-// CHECK: [32, 36) 'x'

From 5812516ae2e034d70b0cca20b95d627e163b4567 Mon Sep 17 00:00:00 2001
From: Alaa Ali
Date: Fri, 4 Apr 2025 05:46:58 -0400
Subject: [PATCH 0625/1029] [MLIR] Fix canonicalization pattern for
 'shape.shape_of' (#134234)

This PR fixes a bug in a canonicalization pattern (operation
shape.shape_of: shape of reshape).

```
// Before
func.func @f(%arg0: tensor, %arg1: tensor<3xi32>) -> tensor<3xindex> {
  %reshape = tensor.reshape %arg0(%arg1) : (tensor, tensor<3xi32>) -> tensor
  %0 = shape.shape_of %reshape : tensor -> tensor<3xindex>
  return %0 : tensor<3xindex>
}
// This will error out as follows:
error: 'tensor.cast' op operand type 'tensor<3xi32>' and result type 'tensor<3xindex>' are cast incompatible
  %0 = shape.shape_of %reshape : tensor -> tensor<3xindex>
       ^
note: see current operation: %0 = "tensor.cast"(%arg1) : (tensor<3xi32>) -> tensor<3xindex>
```

```
// After
func.func @f(%arg0: tensor, %arg1: tensor<3xi32>) -> tensor<3xindex> {
  %0 = arith.index_cast %arg1 : tensor<3xi32> to tensor<3xindex>
  return %0 : tensor<3xindex>
}
```

See file canonicalize.mlir in the change list for an example.

For context, this bug was found while running a test on Keras 3: the
canonicalizer errored out due to an invalid tensor.cast operation when the
batch size was dynamic. The operands of the op are tensor<3xi32> cast to
tensor<3xindex>.
This change is related to a previous PR: https://github.com/llvm/llvm-project/pull/98531 --------- Co-authored-by: Alaa Ali Co-authored-by: Mehdi Amini --- mlir/lib/Dialect/Shape/IR/Shape.cpp | 19 ++++++++-- mlir/test/Dialect/Shape/canonicalize.mlir | 46 ++++++++++++++++++++++- 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index 10ba808cd26c2..f670614806dbd 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -1734,10 +1734,23 @@ struct ShapeOfFromReshape : public OpRewritePattern { // Operand 'shape' of 'tensor.reshape' may now be used as the result of // 'shape.shape_of'. While its type is guaranteed to be compatible in well- // formed IR, it may not be identical (dynamically vs statically shaped), - // in which case it needs to be cast first. + // in which case it needs to be cast first using 'tensor.cast'. + // Additionally, it may not have identical element type (i32 vs index) + // while it has identical shaped type (dynamic vs static), in which case it + // needs to be cast first using 'arith.index_cast'. Note: 'shape.shape_of' + // op result must be shape or extent tensor. Value shape = tensorReshapeOp.getShape(); - if (op.getType() != shape.getType()) - shape = rewriter.create(op.getLoc(), op.getType(), shape); + + auto opTensorTy = cast(op.getType()); + auto shapeTensorTy = cast(shape.getType()); + + if (opTensorTy != shapeTensorTy) { + if (opTensorTy.getElementType() == shapeTensorTy.getElementType()) + shape = rewriter.create(op.getLoc(), opTensorTy, shape); + else if (!isExtentTensorType(shapeTensorTy)) + shape = + rewriter.create(op.getLoc(), opTensorTy, shape); + } rewriter.replaceOp(op, shape); return success(); diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir index cf439c9c1b854..b42fa75e4112d 100644 --- a/mlir/test/Dialect/Shape/canonicalize.mlir +++ b/mlir/test/Dialect/Shape/canonicalize.mlir @@ -1389,10 +1389,25 @@ func.func @shape_of_from_reshape(%arg0: tensor<*xf32>, %arg1: tensor) - // ----- -// CHECK-LABEL: func @shape_of_from_reshape_compatible_types +// Check statically shaped types, with element types i32 to index. +// CHECK-LABEL: func @shape_of_from_reshape_int_to_index +// CHECK-SAME: %[[INPUT:.*]]: tensor +// CHECK-SAME: %[[SHAPE:.*]]: tensor<3xi32> +func.func @shape_of_from_reshape_int_to_index(%arg0: tensor, %arg1: tensor<3xi32>) -> tensor<3xindex> { + // CHECK: %[[CAST_SHAPE:.*]] = arith.index_cast %[[SHAPE]] : tensor<3xi32> to tensor<3xindex> + // CHECK: return %[[CAST_SHAPE]] : tensor<3xindex> + %0 = tensor.reshape %arg0(%arg1) : (tensor, tensor<3xi32>) -> tensor + %1 = shape.shape_of %0 : tensor -> tensor<3xindex> + return %1 : tensor<3xindex> +} + +// ----- + +// Check similar element types, with statically shaped to dynamically shaped. 
+// CHECK-LABEL: func @shape_of_from_reshape_static_to_dynamic // CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32> // CHECK-SAME: %[[SHAPE:.*]]: tensor<5xindex> -func.func @shape_of_from_reshape_compatible_types(%arg0: tensor<*xf32>, %arg1: tensor<5xindex>) -> tensor { +func.func @shape_of_from_reshape_static_to_dynamic(%arg0: tensor<*xf32>, %arg1: tensor<5xindex>) -> tensor { // CHECK: %[[CAST_SHAPE:.*]] = tensor.cast %[[SHAPE]] : tensor<5xindex> to tensor // CHECK: return %[[CAST_SHAPE]] : tensor %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<5xindex>) -> tensor<*xf32> @@ -1402,6 +1417,33 @@ func.func @shape_of_from_reshape_compatible_types(%arg0: tensor<*xf32>, %arg1: t // ----- +// Check similar element types, with dynamically shaped to statically shaped. +// CHECK-LABEL: func @shape_of_from_reshape_dynamic_to_static +// CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32> +// CHECK-SAME: %[[SHAPE:.*]]: tensor +func.func @shape_of_from_reshape_dynamic_to_static(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<5xindex> { + // CHECK: %[[CAST_SHAPE:.*]] = tensor.cast %[[SHAPE]] : tensor to tensor<5xindex> + // CHECK: return %[[CAST_SHAPE]] : tensor<5xindex> + %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor) -> tensor<*xf32> + %1 = shape.shape_of %0 : tensor<*xf32> -> tensor<5xindex> + return %1 : tensor<5xindex> +} + +// ----- + +// Check similar element types and similar static shape. +// CHECK-LABEL: func @shape_of_from_reshape_identical_types +// CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32> +// CHECK-SAME: %[[SHAPE:.*]]: tensor<5xindex> +func.func @shape_of_from_reshape_identical_types(%arg0: tensor<*xf32>, %arg1: tensor<5xindex>) -> tensor<5xindex> { + // CHECK: return %[[SHAPE]] : tensor<5xindex> + %0 = tensor.reshape %arg0(%arg1) : (tensor<*xf32>, tensor<5xindex>) -> tensor<*xf32> + %1 = shape.shape_of %0 : tensor<*xf32> -> tensor<5xindex> + return %1 : tensor<5xindex> +} + +// ----- + // CHECK-LABEL: func @shape_of_from_reshape_nofold // CHECK-SAME: %[[INPUT:.*]]: tensor<*xf32> // CHECK-SAME: %[[SHAPE:.*]]: tensor From a03b2250db20db7d6416ca37b41f5041e613f632 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Fri, 4 Apr 2025 15:39:25 +0530 Subject: [PATCH 0626/1029] [NVPTX][Docs] [NFC] Update docs on intrinsics (#133136) Recently, we have added a set of complex intrinsics on the TMA, tcgen05, and Cvt family of instructions. This patch captures the key learnings from our experience so far and documents them as guidelines for future design. Signed-off-by: Durgadoss R --- llvm/include/llvm/IR/IntrinsicsNVVM.td | 78 ++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 80e10f33b770d..3e9588a515c9e 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -10,6 +10,84 @@ // //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Guidelines on NVPTX Intrinsic design +//===----------------------------------------------------------------------===// +// +// The NVPTX intrinsics are used to model instructions in the PTX ISA. +// While simpler intrinsics can represent certain features effectively, +// more complex instructions like TMA and MMA are not as straightforward +// to model. A single variant of these complex instructions can expand +// into hundreds of intrinsics. 
Additionally, any expansion in the
+// corresponding ISA can exponentially increase these numbers, making it
+// difficult to manage them in the IR and backend passes. Therefore,
+// a careful design of intrinsic interfaces can ease maintenance and
+// contribute to a sustainable, long-term solution.
+//
+// The default approach is to have a 1:1 match between the intrinsic and
+// the instruction, where the instruction suffixes map to the intrinsic name
+// and the instruction arguments map to the intrinsic arguments or return
+// value.
+//
+// However, when there are too many instruction/intrinsic variants, like
+// the TMA/MMA family, it is desirable to encode some variants as a
+// constant argument, referred to as 'flags'.
+// TODO: Add a guideline to quantify the metric on 'how many intrinsics' here.
+//
+// Below is a set of guidelines that may help in choosing
+// an appropriate design for the complex intrinsics:
+//
+// 1. Each flag argument represents one set of instruction modifiers.
+//    These flags are compile-time integer constants.
+//
+// 2. When an intrinsic uses flags, document it with details of the
+//    flag usage in the ``NVPTXUsage.rst`` file.
+// 3. Annotate all flag arguments with ImmArg<ArgIndex<n>>.
+// 4. Place the flag arguments at the end of the (actual) argument list.
+//
+// 5. Use `i1` for boolean flags and `i8` for others. Usually,
+//    the `i8` types represent an `enum` encoding the family of
+//    modifiers.
+// 6. Note that the specific variant for non-boolean flags may not be
+//    obvious in the IR, so maintain consistency between the enum value
+//    definitions and their usage in the backend.
+//    * Provide a meaningful default value in the enums wherever applicable.
+//    * TODO: Investigate auto-upgrade capability for intrinsics
+//      when only flag value mappings change.
+//
+// 7. Identify the key features of an intrinsic and distinguish between
+//    first-order and supplementary information. Typically, encoding the
+//    first-order information in the intrinsic name while using flags
+//    for supplementary details improves readability.
+//    For example:
+//
+//    i. For MMA intrinsics, 'dense' vs. 'sparse' is a fundamental feature,
+//    whereas an optional scaling applied to matrices is relatively secondary.
+//
+//    ii. For TMAs, the mode of copy (e.g., 'Tile' or 'Im2col') is first-order
+//    information, while features like an optional cache hint tend to be
+//    secondary.
+//
+// 8. If there are invalid combinations within a set of modifiers, avoid
+//    encoding them as flags, as much as possible. This helps reduce the
+//    need for error handling of unsupported cases in the backend.
+//    For example, some 'cvt' intrinsics support only a subset of the
+//    possible rounding modes, so it is preferable not to encode the
+//    rounding modes as flags.
+// 9. Similarly, when there are invalid combinations across a set of
+//    modifiers, avoid encoding them as flags to prevent additional
+//    complexity in error handling.
+//
+// 10. Maintain a consistent design within an intrinsic family, including
+//    argument ordering as well as the usage and ordering of flags.
+// 11. When designing an intrinsic corresponding to an instruction or its variant,
+//    consider the entire instruction family. This may reveal common features
+//    that can be modelled consistently across the family.
+//
+// In summary, strive to balance the aspects mentioned above to achieve
+// a scalable design with maximum readability.
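+//
+// As a purely hypothetical illustration (not an existing intrinsic), an
+// intrinsic following guidelines 1-5 might be declared as:
+//
+//   def int_nvvm_example_op :
+//     Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty,
+//                    llvm_i8_ty,    // flag: modifier family (enum-encoded)
+//                    llvm_i1_ty],   // flag: boolean modifier
+//               [ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+//
+// with the first-order behaviour encoded in the intrinsic name (guideline 7)
+// and the flag meanings documented in ``NVPTXUsage.rst`` (guideline 2).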
+//===----------------------------------------------------------------------===//
+
 // The following intrinsics were once defined here, but are now auto-upgraded
 // to target-generic LLVM intrinsics.
 //

From d6c076eeaa9d57363c6b64989d17fd094363bf9e Mon Sep 17 00:00:00 2001
From: Jerry-Ge
Date: Fri, 4 Apr 2025 03:33:52 -0700
Subject: [PATCH 0627/1029] [mlir][tosa] Reorder Tosa_ExtensionAttrs to match
 with definition order (#134319)

Simple refactor change.

Signed-off-by: Jerry Ge
---
 mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index 4cb2e8006ca57..3a6d3d178ff37 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -250,9 +250,10 @@ def Tosa_EXT_DYNAMIC : I32EnumAttrCase<"dynamic", 11>;

 def Tosa_ExtensionAttr
     : Tosa_I32EnumAttr<"Extension", "supported TOSA extensions", "ext", [
-      Tosa_EXT_INT16, Tosa_EXT_INT4, Tosa_EXT_BF16, Tosa_EXT_FP8E4M3,
-      Tosa_EXT_FP8E5M2, Tosa_EXT_FFT, Tosa_EXT_VARIABLE, Tosa_EXT_CONTROLFLOW,
-      Tosa_EXT_DOUBLEROUND, Tosa_EXT_INEXACTROUND, Tosa_EXT_DYNAMIC, Tosa_EXT_NONE
+      Tosa_EXT_NONE, Tosa_EXT_INT16, Tosa_EXT_INT4, Tosa_EXT_BF16,
+      Tosa_EXT_FP8E4M3, Tosa_EXT_FP8E5M2, Tosa_EXT_FFT, Tosa_EXT_VARIABLE,
+      Tosa_EXT_CONTROLFLOW, Tosa_EXT_DOUBLEROUND, Tosa_EXT_INEXACTROUND,
+      Tosa_EXT_DYNAMIC
    ]>;

 def Tosa_ExtensionArrayAttr

From 2bdc1a1337692a5743658ba6b680e5d914e684a4 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Fri, 4 Apr 2025 11:48:01 +0100
Subject: [PATCH 0628/1029] [LV] Use frozen start value for FindLastIV if
 needed. (#132691)

When the epilogue is vectorized, FindLastIV introduces multiple uses of
the start value where the original source had only a single use.

Each use of undef may produce a different result, so introducing
multiple uses can produce incorrect results when the input is
undef/poison.

If the start value may be undef or poison, freeze it and use the frozen
value, which will be the same at all uses.

See the following scenarios in Alive2:
* Both main and epilogue vector loops execute, go to exit block: https://alive2.llvm.org/ce/z/_TSvRr
* Both main and epilogue vector loops execute, go to scalar loop: https://alive2.llvm.org/ce/z/CsPj5v
* Only epilogue vector loop executes, go to exit block: https://alive2.llvm.org/ce/z/5XqkNV
* Only epilogue vector loop executes, go to scalar loop: https://alive2.llvm.org/ce/z/JUpqRN

The latter two show that the resume phi must be frozen as well, which
means we cannot freeze in the preheader. We could move the freeze to the
main iteration count check, but that would be fragile to find, and other
transforms can sink the freeze if needed.

Depends on https://github.com/llvm/llvm-project/pull/132689 and
https://github.com/llvm/llvm-project/pull/132690.
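For intuition, a minimal IR sketch of the fixed pattern (illustrative only;
names and types are not taken from the actual tests):

```
; %start feeds both the "did the main loop update the reduction?" compare and
; a select arm; with an undef %start those uses could observe different
; values. Freezing once pins a single concrete value for every use:
%start.fr = freeze i32 %start
%cmp = icmp eq i32 %rdx.main, %start.fr
%resume = select i1 %cmp, i32 -2147483648, i32 %rdx.main ; sentinel if no update
```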
Fixes https://github.com/llvm/llvm-project/issues/126836 PR: https://github.com/llvm/llvm-project/pull/132691 --- .../Transforms/Vectorize/LoopVectorize.cpp | 97 ++++++++++++++----- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 ++ .../AArch64/epilog-iv-select-cmp.ll | 18 ++-- .../LoopVectorize/epilog-iv-select-cmp.ll | 9 +- 4 files changed, 95 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0291a8bfd9674..65cce5e7d194d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7659,14 +7659,17 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( RdxDesc.getRecurrenceKind())) { using namespace llvm::PatternMatch; - Value *Cmp, *OrigResumeV; + Value *Cmp, *OrigResumeV, *CmpOp; bool IsExpectedPattern = match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), m_Specific(RdxDesc.getSentinelValue()), m_Value(OrigResumeV))) && - match(Cmp, - m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), - m_Specific(RdxDesc.getRecurrenceStartValue()))); + (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), + m_Value(CmpOp))) && + (match(CmpOp, + m_Freeze(m_Specific(RdxDesc.getRecurrenceStartValue()))) || + (CmpOp == RdxDesc.getRecurrenceStartValue() && + isGuaranteedNotToBeUndefOrPoison(CmpOp)))); assert(IsExpectedPattern && "Unexpected reduction resume pattern"); (void)IsExpectedPattern; MainResumeValue = OrigResumeV; @@ -10374,6 +10377,36 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan); using namespace VPlanPatternMatch; + // When vectorizing the epilogue, FindLastIV reductions can introduce multiple + // uses of undef/poison. If the reduction start value may be undef or poison + // it needs to be frozen and the frozen start has to be used when computing + // the reduction result. We also need to use the frozen value in the resume + // phi generated by the main vector loop, as this is also used to compute the + // reduction result after the epilogue vector loop. 
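+  // Freezing once in the plan's entry block ensures that every user (the
+  // compare, the select and the resume phi) observes the same concrete value.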
+ auto AddFreezeForFindLastIVReductions = [](VPlan &Plan, + bool UpdateResumePhis) { + VPBuilder Builder(Plan.getEntry()); + for (VPRecipeBase &R : *Plan.getMiddleBlock()) { + auto *VPI = dyn_cast(&R); + if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindLastIVResult) + continue; + VPValue *OrigStart = VPI->getOperand(1); + if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue())) + continue; + VPInstruction *Freeze = + Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr"); + VPI->setOperand(1, Freeze); + if (UpdateResumePhis) + OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) { + return Freeze != &U && isa(&U) && + cast(&U)->getOpcode() == + VPInstruction::ResumePhi; + }); + } + }; + AddFreezeForFindLastIVReductions(MainPlan, true); + AddFreezeForFindLastIVReductions(EpiPlan, false); + VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader(); VPValue *VectorTC = &MainPlan.getVectorTripCount(); // If there is a suitable resume value for the canonical induction in the @@ -10401,24 +10434,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); - // Re-use the trip count and steps expanded for the main loop, as - // skeleton creation needs it as a value that dominates both the scalar - // and vector epilogue loops - // TODO: This is a workaround needed for epilogue vectorization and it - // should be removed once induction resume value creation is done - // directly in VPlan. - for (auto &R : make_early_inc_range(*Plan.getEntry())) { - auto *ExpandR = dyn_cast(&R); - if (!ExpandR) - continue; - auto *ExpandedVal = - Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); - ExpandR->replaceAllUsesWith(ExpandedVal); - if (Plan.getTripCount() == ExpandR) - Plan.resetTripCount(ExpandedVal); - ExpandR->eraseFromParent(); - } - + DenseMap ToFrozen; // Ensure that the start values for all header phi recipes are updated before // vectorizing the epilogue loop. for (VPRecipeBase &R : Header->phis()) { @@ -10484,6 +10500,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { + ToFrozen[RdxDesc.getRecurrenceStartValue()] = + cast(ResumeV)->getIncomingValueForBlock( + EPI.MainLoopIterationCountCheck); + // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment // to the resume value. The resume value is adjusted to the sentinel // value when the final value from the main vector loop equals the start @@ -10492,8 +10512,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, // variable. BasicBlock *ResumeBB = cast(ResumeV)->getParent(); IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt()); - Value *Cmp = - Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue()); + Value *Cmp = Builder.CreateICmpEQ( + ResumeV, ToFrozen[RdxDesc.getRecurrenceStartValue()]); ResumeV = Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); } @@ -10509,6 +10529,35 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); cast(&R)->setStartValue(StartVal); } + + // For some VPValues in the epilogue plan we must re-use the generated IR + // values from the main plan. Replace them with live-in VPValues. 
+ // TODO: This is a workaround needed for epilogue vectorization and it + // should be removed once induction resume value creation is done + // directly in VPlan. + for (auto &R : make_early_inc_range(*Plan.getEntry())) { + // Re-use frozen values from the main plan for Freeze VPInstructions in the + // epilogue plan. This ensures all users use the same frozen value. + auto *VPI = dyn_cast(&R); + if (VPI && VPI->getOpcode() == Instruction::Freeze) { + VPI->replaceAllUsesWith(Plan.getOrAddLiveIn( + ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue()))); + continue; + } + + // Re-use the trip count and steps expanded for the main loop, as + // skeleton creation needs it as a value that dominates both the scalar + // and vector epilogue loops + auto *ExpandR = dyn_cast(&R); + if (!ExpandR) + continue; + auto *ExpandedVal = + Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second); + ExpandR->replaceAllUsesWith(ExpandedVal); + if (Plan.getTripCount() == ExpandR) + Plan.resetTripCount(ExpandedVal); + ExpandR->eraseFromParent(); + } } // Generate bypass values from the additional bypass block. Note that when the diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b16a8fc563f4c..a117d82e64ef7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -423,6 +423,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { if (isSingleScalar() || isVectorToScalar()) return true; switch (Opcode) { + case Instruction::Freeze: case Instruction::ICmp: case Instruction::PHI: case Instruction::Select: @@ -474,6 +475,10 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Idx = State.get(getOperand(1), /*IsScalar=*/true); return Builder.CreateExtractElement(Vec, Idx, Name); } + case Instruction::Freeze: { + Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); + return Builder.CreateFreeze(Op, Name); + } case Instruction::ICmp: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); @@ -909,6 +914,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { return false; switch (getOpcode()) { case Instruction::ExtractElement: + case Instruction::Freeze: case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: @@ -941,6 +947,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { case Instruction::ICmp: case Instruction::Select: case Instruction::Or: + case Instruction::Freeze: // TODO: Cover additional opcodes. 
return vputils::onlyFirstLaneUsed(this); case VPInstruction::ActiveLaneMask: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index 7296cc0840dc0..c0806ea16a5fc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -9,6 +9,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8 +; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 32 @@ -42,7 +43,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[RDX_MINMAX]]) ; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP13]], -128 -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: @@ -53,8 +54,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]] ; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i8 -128, i8 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]] @@ -82,7 +83,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]]) ; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128 -; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: @@ -128,6 +129,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]]) ; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: 
[[FR:%.*]] = freeze i32 [[START]] ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] @@ -166,7 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX5]], <4 x i32> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]]) ; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648 -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[START]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[FR]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: @@ -175,8 +177,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[FR]] ; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]] @@ -203,7 +205,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]]) ; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648 -; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[START]] +; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[FR]] ; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]] ; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index ee154ea5a169a..800b6f3f28b7d 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -217,6 +217,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4 +; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 4 @@ -243,7 +244,7 @@ define i8 
@select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]]) ; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128 -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: @@ -254,8 +255,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]] ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i8 -128, i8 [[BC_MERGE_RDX]] ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]] @@ -283,7 +284,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]]) ; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128 -; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[START]] +; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[FR]] ; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: From a9a7b711e46548b2d6e9fef4daf39c455923644a Mon Sep 17 00:00:00 2001 From: Vladi Krapp Date: Fri, 4 Apr 2025 11:51:18 +0100 Subject: [PATCH 0629/1029] [ARM][NFC] Remove lines unnecessary for test (#134359) --- clang/test/Driver/arm-thread-pointer.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/test/Driver/arm-thread-pointer.c b/clang/test/Driver/arm-thread-pointer.c index 9433cbfa091ff..7d87ed2a8e7f7 100644 --- a/clang/test/Driver/arm-thread-pointer.c +++ b/clang/test/Driver/arm-thread-pointer.c @@ -88,6 +88,3 @@ // RUN: %clang --target=thumbv6t2-linux -mtp=cp15 -x assembler -### %s 2>&1 | FileCheck -check-prefix=Thumbv6t2_Asm %s // Thumbv6t2_Asm-NOT: "-target-feature" "+read-tp-" -// A dummy main is provided to form a valid translation unit. -int main(void) { return 0; } - From a17d49687a2f5bc43fa57376a566c65a7fc97d7b Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Fri, 4 Apr 2025 11:54:49 +0100 Subject: [PATCH 0630/1029] [Flang][Driver][AMDGPU] Fix -mcode-object-version (#134230) This patch updates flang to follow clang's behavior when processing the `-mcode-object-version` option. 
It is now used to populate an LLVM module flag called `amdhsa_code_object_version` expected by the backend and also updates the driver to add the `--amdhsa-code-object-version` option to the frontend invocation for device compilation of AMDGPU targets. --- clang/lib/Driver/ToolChains/Flang.cpp | 3 +++ flang/include/flang/Frontend/CodeGenOptions.h | 2 +- flang/lib/Frontend/FrontendActions.cpp | 11 ++++++++++ flang/test/Driver/code-object-version.f90 | 7 ++++++ .../amdgpu-code-object-version.f90 | 22 +++++++++++++++++++ 5 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 flang/test/Integration/amdgpu-code-object-version.f90 diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 96e2486da764c..a8b4688aed09c 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -420,6 +420,9 @@ void Flang::AddAMDGPUTargetArgs(const ArgList &Args, if (Arg *A = Args.getLastArg(options::OPT_mcode_object_version_EQ)) { StringRef Val = A->getValue(); CmdArgs.push_back(Args.MakeArgString("-mcode-object-version=" + Val)); + CmdArgs.push_back(Args.MakeArgString("-mllvm")); + CmdArgs.push_back( + Args.MakeArgString("--amdhsa-code-object-version=" + Val)); } const ToolChain &TC = getToolChain(); diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h index 23d99e1f0897a..2b4e823b3fef4 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.h +++ b/flang/include/flang/Frontend/CodeGenOptions.h @@ -95,7 +95,7 @@ class CodeGenOptions : public CodeGenOptionsBase { /// \brief Code object version for AMDGPU. llvm::CodeObjectVersionKind CodeObjectVersion = - llvm::CodeObjectVersionKind::COV_5; + llvm::CodeObjectVersionKind::COV_None; /// Optimization remark with an optional regular expression pattern. struct OptRemark { diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index bd2c0632cb35d..d304e74f34f5c 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -804,6 +804,17 @@ void CodeGenAction::generateLLVMIR() { llvmModule->addModuleFlag( llvm::Module::Error, "target-abi", llvm::MDString::get(llvmModule->getContext(), targetOpts.abi)); + + if (triple.isAMDGPU() || + (triple.isSPIRV() && triple.getVendor() == llvm::Triple::AMD)) { + // Emit amdhsa_code_object_version module flag, which is code object version + // times 100. + if (opts.CodeObjectVersion != llvm::CodeObjectVersionKind::COV_None) { + llvmModule->addModuleFlag(llvm::Module::Error, + "amdhsa_code_object_version", + opts.CodeObjectVersion); + } + } } static std::unique_ptr diff --git a/flang/test/Driver/code-object-version.f90 b/flang/test/Driver/code-object-version.f90 index e10877563c4d0..430cc864d03ec 100644 --- a/flang/test/Driver/code-object-version.f90 +++ b/flang/test/Driver/code-object-version.f90 @@ -5,5 +5,12 @@ ! RUN: %flang -target x86_64-unknown-linux-gnu -mcode-object-version=3 -S %s -o \ ! RUN: /dev/null 2>&1 | FileCheck --check-prefix=UNUSED_PARAM %s +! RUN: %flang -target amdgcn-amd-amdhsa -mcpu=gfx908 -mcode-object-version=5 -nogpulib -c %s -### 2>&1 \ +! RUN: | FileCheck %s -check-prefix=VALID_USE + ! INVALID_VERSION: error: invalid integral value '3' in '-mcode-object-version=3' ! UNUSED_PARAM: warning: argument unused during compilation: '-mcode-object-version=3' [-Wunused-command-line-argument] + +! VALID_USE: "-fc1" "-triple" "amdgcn-amd-amdhsa" +! VALID_USE-SAME: "-mcode-object-version=5" +! 
VALID_USE-SAME: "-mllvm" "--amdhsa-code-object-version=5" diff --git a/flang/test/Integration/amdgpu-code-object-version.f90 b/flang/test/Integration/amdgpu-code-object-version.f90 new file mode 100644 index 0000000000000..c5194d2007f2a --- /dev/null +++ b/flang/test/Integration/amdgpu-code-object-version.f90 @@ -0,0 +1,22 @@ +!REQUIRES: amdgpu-registered-target + +!RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -target-cpu gfx908 %s -o - | FileCheck --check-prefix=COV-DEFAULT %s +!RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -target-cpu gfx908 -mcode-object-version=none %s -o - | FileCheck --check-prefix=COV-NONE %s +!RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -target-cpu gfx908 -mcode-object-version=4 %s -o - | FileCheck --check-prefix=COV-4 %s +!RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -target-cpu gfx908 -mcode-object-version=5 %s -o - | FileCheck --check-prefix=COV-5 %s +!RUN: %flang_fc1 -emit-llvm -triple amdgcn-amd-amdhsa -target-cpu gfx908 -mcode-object-version=6 %s -o - | FileCheck --check-prefix=COV-6 %s + +!COV-DEFAULT-NOT: !{{.*}} = !{{{.*}}, !"amdhsa_code_object_version", {{.*}}} +!COV-NONE-NOT: !{{.*}} = !{{{.*}}, !"amdhsa_code_object_version", {{.*}}} + +!COV-4: !llvm.module.flags = !{{{.*}}, ![[COV_FLAG:.*]]} +!COV-4: ![[COV_FLAG]] = !{i32 1, !"amdhsa_code_object_version", i32 400} + +!COV-5: !llvm.module.flags = !{{{.*}}, ![[COV_FLAG:.*]]} +!COV-5: ![[COV_FLAG]] = !{i32 1, !"amdhsa_code_object_version", i32 500} + +!COV-6: !llvm.module.flags = !{{{.*}}, ![[COV_FLAG:.*]]} +!COV-6: ![[COV_FLAG]] = !{i32 1, !"amdhsa_code_object_version", i32 600} + +subroutine target_simple +end subroutine From b0b97e3b0507f44c126dd2e6d3e6575cf1ec598d Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 4 Apr 2025 12:13:45 +0100 Subject: [PATCH 0631/1029] [LLVM][AArch64] Refactor lowering of fixed length integer setcc operations. (#132434) The original code is essentially performing isel during legalisation with the AArch64 specific nodes offering no additional value compared to ISD::SETCC. --- .../Target/AArch64/AArch64ISelLowering.cpp | 133 +++----- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 10 - .../lib/Target/AArch64/AArch64InstrFormats.td | 6 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 36 ++- llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 15 +- .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 6 +- .../AArch64/neon-bitwise-instructions.ll | 18 +- .../AArch64/neon-compare-instructions.ll | 301 +++++------------- .../sve-fixed-length-extract-subvector.ll | 5 +- .../AArch64/sve-fixed-length-masked-gather.ll | 4 +- .../sve-fixed-length-masked-scatter.ll | 3 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 39 +-- 12 files changed, 185 insertions(+), 391 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e0be0d83f7513..a1ba3922996a1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2057,6 +2057,15 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) { setOperationAction(ISD::READ_REGISTER, MVT::i128, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i128, Custom); } + + if (VT.isInteger()) { + // Let common code emit inverted variants of compares we do support. 
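+      // (e.g. SETNE is then emitted as a SETEQ compare followed by a NOT.)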
+ setCondCodeAction(ISD::SETNE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + } } bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, @@ -2581,31 +2590,21 @@ unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode( unsigned VTBits = VT.getScalarSizeInBits(); unsigned Opcode = Op.getOpcode(); switch (Opcode) { - case AArch64ISD::CMEQ: - case AArch64ISD::CMGE: - case AArch64ISD::CMGT: - case AArch64ISD::CMHI: - case AArch64ISD::CMHS: - case AArch64ISD::FCMEQ: - case AArch64ISD::FCMGE: - case AArch64ISD::FCMGT: - case AArch64ISD::CMEQz: - case AArch64ISD::CMGEz: - case AArch64ISD::CMGTz: - case AArch64ISD::CMLEz: - case AArch64ISD::CMLTz: - case AArch64ISD::FCMEQz: - case AArch64ISD::FCMGEz: - case AArch64ISD::FCMGTz: - case AArch64ISD::FCMLEz: - case AArch64ISD::FCMLTz: - // Compares return either 0 or all-ones - return VTBits; - case AArch64ISD::VASHR: { - unsigned Tmp = - DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); - return std::min(Tmp + Op.getConstantOperandVal(1), VTBits); - } + case AArch64ISD::FCMEQ: + case AArch64ISD::FCMGE: + case AArch64ISD::FCMGT: + case AArch64ISD::FCMEQz: + case AArch64ISD::FCMGEz: + case AArch64ISD::FCMGTz: + case AArch64ISD::FCMLEz: + case AArch64ISD::FCMLTz: + // Compares return either 0 or all-ones + return VTBits; + case AArch64ISD::VASHR: { + unsigned Tmp = + DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); + return std::min(Tmp + Op.getConstantOperandVal(1), VTBits); + } } return 1; @@ -2812,19 +2811,9 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::VASHR) MAKE_CASE(AArch64ISD::VSLI) MAKE_CASE(AArch64ISD::VSRI) - MAKE_CASE(AArch64ISD::CMEQ) - MAKE_CASE(AArch64ISD::CMGE) - MAKE_CASE(AArch64ISD::CMGT) - MAKE_CASE(AArch64ISD::CMHI) - MAKE_CASE(AArch64ISD::CMHS) MAKE_CASE(AArch64ISD::FCMEQ) MAKE_CASE(AArch64ISD::FCMGE) MAKE_CASE(AArch64ISD::FCMGT) - MAKE_CASE(AArch64ISD::CMEQz) - MAKE_CASE(AArch64ISD::CMGEz) - MAKE_CASE(AArch64ISD::CMGTz) - MAKE_CASE(AArch64ISD::CMLEz) - MAKE_CASE(AArch64ISD::CMLTz) MAKE_CASE(AArch64ISD::FCMEQz) MAKE_CASE(AArch64ISD::FCMGEz) MAKE_CASE(AArch64ISD::FCMGTz) @@ -15840,9 +15829,6 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, SplatBitSize, HasAnyUndefs); bool IsZero = IsCnst && SplatValue == 0; - bool IsOne = - IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1; - bool IsMinusOne = IsCnst && SplatValue.isAllOnes(); if (SrcVT.getVectorElementType().isFloatingPoint()) { switch (CC) { @@ -15889,50 +15875,7 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, } } - switch (CC) { - default: - return SDValue(); - case AArch64CC::NE: { - SDValue Cmeq; - if (IsZero) - Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); - else - Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); - return DAG.getNOT(dl, Cmeq, VT); - } - case AArch64CC::EQ: - if (IsZero) - return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); - return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); - case AArch64CC::GE: - if (IsZero) - return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); - return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); - case AArch64CC::GT: - if (IsZero) - return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); - if (IsMinusOne) - return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); - return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, 
RHS); - case AArch64CC::LE: - if (IsZero) - return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); - return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); - case AArch64CC::LS: - return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); - case AArch64CC::LO: - return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); - case AArch64CC::LT: - if (IsZero) - return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); - if (IsOne) - return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); - return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); - case AArch64CC::HI: - return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); - case AArch64CC::HS: - return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); - } + return SDValue(); } SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, @@ -15950,13 +15893,8 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); - if (LHS.getValueType().getVectorElementType().isInteger()) { - assert(LHS.getValueType() == RHS.getValueType()); - AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); - SDValue Cmp = - EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); - return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); - } + if (LHS.getValueType().getVectorElementType().isInteger()) + return Op; // Lower isnan(x) | isnan(never-nan) to x != x. // Lower !isnan(x) & !isnan(never-nan) to x == x. @@ -18152,7 +18090,9 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) return SDValue(); - return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); + SDLoc DL(N); + SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType()); + return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE); } // Given a vecreduce_add node, detect the below pattern and convert it to the @@ -18763,7 +18703,8 @@ static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0)); - SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In); + SDValue Zero = DAG.getConstant(0, DL, In.getValueType()); + SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT); return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM); } @@ -25292,6 +25233,16 @@ static SDValue performSETCCCombine(SDNode *N, if (SDValue V = performOrXorChainCombine(N, DAG)) return V; + EVT CmpVT = LHS.getValueType(); + + // NOTE: This exists as a combine only because it proved too awkward to match + // splat(1) across all the NEON types during isel. 
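+  // For signed integers, (splat(1) > X) is equivalent to (splat(0) >= X),
+  // which can then be selected as a compare-less-than-or-equal-to-zero
+  // (cmle #0) instruction.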
+ APInt SplatLHSVal; + if (CmpVT.isInteger() && Cond == ISD::SETGT && + ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) && + SplatLHSVal.isOne()) + return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE); + return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index bc0c3a832bb28..ba275e18fa126 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -241,21 +241,11 @@ enum NodeType : unsigned { VSRI, // Vector comparisons - CMEQ, - CMGE, - CMGT, - CMHI, - CMHS, FCMEQ, FCMGE, FCMGT, // Vector zero comparisons - CMEQz, - CMGEz, - CMGTz, - CMLEz, - CMLTz, FCMEQz, FCMGEz, FCMGTz, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 255cd0ec5840c..6d8b84ea4239c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7086,7 +7086,7 @@ multiclass SIMD_FP8_CVTLsz, string asm, ValueType dty, SDPatternOperator class BaseSIMDCmpTwoVector size, bits<2> size2, bits<5> opcode, RegisterOperand regtype, string asm, string kind, string zero, ValueType dty, - ValueType sty, SDNode OpNode> + ValueType sty, SDPatternOperator OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", @@ -7110,7 +7110,7 @@ class BaseSIMDCmpTwoVector size, bits<2> size2, // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector opc, string asm, - SDNode OpNode> { + SDPatternOperator OpNode> { def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; @@ -7981,7 +7981,7 @@ multiclass SIMDCmpTwoScalarD opc, string asm, SDPatternOperator OpNode> { def v1i64rz : BaseSIMDCmpTwoScalar; - def : Pat<(v1i64 (OpNode FPR64:$Rn)), + def : Pat<(v1i64 (OpNode v1i64:$Rn)), (!cast(NAME # v1i64rz) FPR64:$Rn)>; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index a3b1ae55df028..a2d98a0862988 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -846,23 +846,35 @@ def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>; def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>; -def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; -def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; -def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>; -def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>; -def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>; +def AArch64cmeq : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETEQ)>; +def AArch64cmge : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGE)>; +def AArch64cmgt : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETGT)>; +def AArch64cmhi : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGT)>; +def AArch64cmhs : PatFrag<(ops node:$lhs, node:$rhs), + (setcc node:$lhs, node:$rhs, SETUGE)>; def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>; def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>; def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>; -def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>; -def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>; -def 
AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; -def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; -def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; +def AArch64cmeqz : PatFrag<(ops node:$lhs), + (setcc node:$lhs, immAllZerosV, SETEQ)>; +def AArch64cmgez : PatFrags<(ops node:$lhs), + [(setcc node:$lhs, immAllZerosV, SETGE), + (setcc node:$lhs, immAllOnesV, SETGT)]>; +def AArch64cmgtz : PatFrag<(ops node:$lhs), + (setcc node:$lhs, immAllZerosV, SETGT)>; +def AArch64cmlez : PatFrag<(ops node:$lhs), + (setcc immAllZerosV, node:$lhs, SETGE)>; +def AArch64cmltz : PatFrag<(ops node:$lhs), + (setcc immAllZerosV, node:$lhs, SETGT)>; + def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), - (vnot (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + (vnot (AArch64cmeqz (and node:$LHS, node:$RHS)))>; def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; @@ -5671,7 +5683,7 @@ defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; foreach VT = [ v8i8, v16i8, v4i16, v8i16, v2i32, v4i32, v2i64 ] in { -def : Pat<(vnot (AArch64cmeqz VT:$Rn)), (!cast("CMTST"#VT) VT:$Rn, VT:$Rn)>; +def : Pat<(VT (vnot (AArch64cmeqz VT:$Rn))), (!cast("CMTST"#VT) VT:$Rn, VT:$Rn)>; } defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; let Predicates = [HasNEON] in { diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index f0c9dccb21d84..c7a423f2e4f8d 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -352,17 +352,16 @@ define void @typei1_orig(i64 %a, ptr %p, ptr %q) { ; ; CHECK-GI-LABEL: typei1_orig: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldr q1, [x2] +; CHECK-GI-NEXT: ldr q0, [x2] ; CHECK-GI-NEXT: cmp x0, #0 -; CHECK-GI-NEXT: movi v0.2d, #0xffffffffffffffff ; CHECK-GI-NEXT: cset w8, gt -; CHECK-GI-NEXT: neg v1.8h, v1.8h -; CHECK-GI-NEXT: dup v2.8h, w8 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: mul v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: cmeq v1.8h, v1.8h, #0 +; CHECK-GI-NEXT: neg v0.8h, v0.8h +; CHECK-GI-NEXT: dup v1.8h, w8 +; CHECK-GI-NEXT: mul v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-GI-NEXT: cmtst v0.8h, v0.8h, v0.8h ; CHECK-GI-NEXT: mvn v1.16b, v1.16b -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: uzp1 v0.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-GI-NEXT: sshr v0.16b, v0.16b, #7 ; CHECK-GI-NEXT: str q0, [x1] diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 81770a4ebdd4d..c834ca772b6ac 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -2382,11 +2382,11 @@ define <2 x i1> @test_signed_v2f64_v2i1(<2 x double> %f) { ; CHECK-GI-LABEL: test_signed_v2f64_v2i1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: fcvtzs v0.2d, v0.2d -; CHECK-GI-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-GI-NEXT: cmlt v1.2d, v0.2d, #0 ; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: cmgt v1.2d, v0.2d, v2.2d -; CHECK-GI-NEXT: bif v0.16b, v2.16b, v1.16b +; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-GI-NEXT: cmge v2.2d, v0.2d, #0 +; CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-GI-NEXT: xtn v0.2s, v0.2d ; 
CHECK-GI-NEXT: ret %x = call <2 x i1> @llvm.fptosi.sat.v2f64.v2i1(<2 x double> %f) diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll index f6dbf5251fc27..fb65a748c865f 100644 --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -1499,8 +1499,7 @@ define <8 x i8> @vselect_cmpz_ne(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { ; ; CHECK-GI-LABEL: vselect_cmpz_ne: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.8b, v0.8b, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b +; CHECK-GI-NEXT: cmtst v0.8b, v0.8b, v0.8b ; CHECK-GI-NEXT: bsl v0.8b, v1.8b, v2.8b ; CHECK-GI-NEXT: ret %cmp = icmp ne <8 x i8> %a, zeroinitializer @@ -1533,17 +1532,10 @@ define <8 x i8> @vselect_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { } define <8 x i8> @sext_tst(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { -; CHECK-SD-LABEL: sext_tst: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sext_tst: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: cmeq v0.8b, v0.8b, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sext_tst: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret %tmp3 = and <8 x i8> %a, %b %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer %d = sext <8 x i1> %tmp4 to <8 x i8> diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll index 8f7d5dd5588b9..2c2cb72112879 100644 --- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -738,17 +738,10 @@ define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) { } define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) { -; CHECK-SD-LABEL: cmtst8xi8: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.8b, v0.8b, v1.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmtst8xi8: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: cmeq v0.8b, v0.8b, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmtst8xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret %tmp3 = and <8 x i8> %A, %B %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer %tmp5 = sext <8 x i1> %tmp4 to <8 x i8> @@ -756,17 +749,10 @@ define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) { } define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) { -; CHECK-SD-LABEL: cmtst16xi8: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.16b, v0.16b, v1.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmtst16xi8: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: cmeq v0.16b, v0.16b, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmtst16xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret %tmp3 = and <16 x i8> %A, %B %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer %tmp5 = sext <16 x i1> %tmp4 to <16 x i8> @@ -774,17 +760,10 @@ define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) { } define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) { -; CHECK-SD-LABEL: cmtst4xi16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmtst4xi16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b -; 
CHECK-GI-NEXT: ret +; CHECK-LABEL: cmtst4xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret %tmp3 = and <4 x i16> %A, %B %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer %tmp5 = sext <4 x i1> %tmp4 to <4 x i16> @@ -792,17 +771,10 @@ define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) { } define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) { -; CHECK-SD-LABEL: cmtst8xi16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.8h, v0.8h, v1.8h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmtst8xi16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: cmeq v0.8h, v0.8h, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmtst8xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret %tmp3 = and <8 x i16> %A, %B %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer %tmp5 = sext <8 x i1> %tmp4 to <8 x i16> @@ -810,17 +782,10 @@ define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) { } define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) { -; CHECK-SD-LABEL: cmtst2xi32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmtst2xi32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmtst2xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret %tmp3 = and <2 x i32> %A, %B %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer %tmp5 = sext <2 x i1> %tmp4 to <2 x i32> @@ -828,17 +793,10 @@ define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) { } define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) { -; CHECK-SD-LABEL: cmtst4xi32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.4s, v0.4s, v1.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmtst4xi32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmtst4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret %tmp3 = and <4 x i32> %A, %B %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer %tmp5 = sext <4 x i1> %tmp4 to <4 x i32> @@ -846,17 +804,10 @@ define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) { } define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) { -; CHECK-SD-LABEL: cmtst2xi64: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.2d, v0.2d, v1.2d -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmtst2xi64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: cmeq v0.2d, v0.2d, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmtst2xi64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret %tmp3 = and <2 x i64> %A, %B %tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer %tmp5 = sext <2 x i1> %tmp4 to <2 x i64> @@ -1120,112 +1071,70 @@ define <2 x i64> @cmgez2xi64_alt(<2 x i64> %A) { } define <8 x i8> @cmgez8xi8_alt2(<8 x i8> %A) { -; CHECK-SD-LABEL: cmgez8xi8_alt2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmge v0.8b, v0.8b, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmgez8xi8_alt2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi d1, #0xffffffffffffffff -; CHECK-GI-NEXT: cmgt v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmgez8xi8_alt2: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v0.8b, v0.8b, #0 +; CHECK-NEXT: ret %tmp3 = icmp sgt <8 x i8> %A, 
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8> ret <8 x i8> %tmp4 } define <16 x i8> @cmgez16xi8_alt2(<16 x i8> %A) { -; CHECK-SD-LABEL: cmgez16xi8_alt2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmge v0.16b, v0.16b, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmgez16xi8_alt2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: cmgt v0.16b, v0.16b, v1.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmgez16xi8_alt2: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v0.16b, v0.16b, #0 +; CHECK-NEXT: ret %tmp3 = icmp sgt <16 x i8> %A, %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> ret <16 x i8> %tmp4 } define <4 x i16> @cmgez4xi16_alt2(<4 x i16> %A) { -; CHECK-SD-LABEL: cmgez4xi16_alt2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmge v0.4h, v0.4h, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmgez4xi16_alt2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi d1, #0xffffffffffffffff -; CHECK-GI-NEXT: cmgt v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmgez4xi16_alt2: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v0.4h, v0.4h, #0 +; CHECK-NEXT: ret %tmp3 = icmp sgt <4 x i16> %A, %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> ret <4 x i16> %tmp4 } define <8 x i16> @cmgez8xi16_alt2(<8 x i16> %A) { -; CHECK-SD-LABEL: cmgez8xi16_alt2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmge v0.8h, v0.8h, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmgez8xi16_alt2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: cmgt v0.8h, v0.8h, v1.8h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmgez8xi16_alt2: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v0.8h, v0.8h, #0 +; CHECK-NEXT: ret %tmp3 = icmp sgt <8 x i16> %A, %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> ret <8 x i16> %tmp4 } define <2 x i32> @cmgez2xi32_alt2(<2 x i32> %A) { -; CHECK-SD-LABEL: cmgez2xi32_alt2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmge v0.2s, v0.2s, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmgez2xi32_alt2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi d1, #0xffffffffffffffff -; CHECK-GI-NEXT: cmgt v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmgez2xi32_alt2: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v0.2s, v0.2s, #0 +; CHECK-NEXT: ret %tmp3 = icmp sgt <2 x i32> %A, %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ret <2 x i32> %tmp4 } define <4 x i32> @cmgez4xi32_alt2(<4 x i32> %A) { -; CHECK-SD-LABEL: cmgez4xi32_alt2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmge v0.4s, v0.4s, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmgez4xi32_alt2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: cmgt v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmgez4xi32_alt2: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v0.4s, v0.4s, #0 +; CHECK-NEXT: ret %tmp3 = icmp sgt <4 x i32> %A, %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } define <2 x i64> @cmgez2xi64_alt2(<2 x i64> %A) { -; CHECK-SD-LABEL: cmgez2xi64_alt2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmge v0.2d, v0.2d, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmgez2xi64_alt2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: cmgt v0.2d, v0.2d, v1.2d -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmgez2xi64_alt2: +; CHECK: // %bb.0: +; CHECK-NEXT: cmge v0.2d, v0.2d, #0 +; CHECK-NEXT: ret %tmp3 = icmp sgt <2 x i64> %A, %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ret <2 x i64> %tmp4 @@ -1692,112 +1601,70 @@ define <2 x i64> @cmltz2xi64_alt(<2 x i64> %A) { } define <8 x i8> @cmneqz8xi8(<8 x i8> %A) { -; CHECK-SD-LABEL: cmneqz8xi8: -; CHECK-SD: // %bb.0: -; 
CHECK-SD-NEXT: cmtst v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmneqz8xi8: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.8b, v0.8b, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmneqz8xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b +; CHECK-NEXT: ret %tmp3 = icmp ne <8 x i8> %A, zeroinitializer %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> ret <8 x i8> %tmp4 } define <16 x i8> @cmneqz16xi8(<16 x i8> %A) { -; CHECK-SD-LABEL: cmneqz16xi8: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.16b, v0.16b, v0.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmneqz16xi8: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.16b, v0.16b, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmneqz16xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b +; CHECK-NEXT: ret %tmp3 = icmp ne <16 x i8> %A, zeroinitializer %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> ret <16 x i8> %tmp4 } define <4 x i16> @cmneqz4xi16(<4 x i16> %A) { -; CHECK-SD-LABEL: cmneqz4xi16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.4h, v0.4h, v0.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmneqz4xi16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmneqz4xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret %tmp3 = icmp ne <4 x i16> %A, zeroinitializer %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> ret <4 x i16> %tmp4 } define <8 x i16> @cmneqz8xi16(<8 x i16> %A) { -; CHECK-SD-LABEL: cmneqz8xi16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.8h, v0.8h, v0.8h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmneqz8xi16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.8h, v0.8h, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmneqz8xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h +; CHECK-NEXT: ret %tmp3 = icmp ne <8 x i16> %A, zeroinitializer %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> ret <8 x i16> %tmp4 } define <2 x i32> @cmneqz2xi32(<2 x i32> %A) { -; CHECK-SD-LABEL: cmneqz2xi32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.2s, v0.2s, v0.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmneqz2xi32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-GI-NEXT: mvn v0.8b, v0.8b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmneqz2xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s +; CHECK-NEXT: ret %tmp3 = icmp ne <2 x i32> %A, zeroinitializer %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> ret <2 x i32> %tmp4 } define <4 x i32> @cmneqz4xi32(<4 x i32> %A) { -; CHECK-SD-LABEL: cmneqz4xi32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.4s, v0.4s, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmneqz4xi32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmneqz4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s +; CHECK-NEXT: ret %tmp3 = icmp ne <4 x i32> %A, zeroinitializer %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } define <2 x i64> @cmneqz2xi64(<2 x i64> %A) { -; CHECK-SD-LABEL: cmneqz2xi64: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: cmtst v0.2d, v0.2d, v0.2d -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: cmneqz2xi64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: cmeq v0.2d, v0.2d, #0 -; CHECK-GI-NEXT: mvn v0.16b, v0.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: cmneqz2xi64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.2d, v0.2d, 
v0.2d +; CHECK-NEXT: ret %tmp3 = icmp ne <2 x i64> %A, zeroinitializer %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> ret <2 x i64> %tmp4 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll index bda7ff9115e09..55f70b2ffc15b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll @@ -547,13 +547,12 @@ define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: adrp x8, .LCPI40_0 ; CHECK-NEXT: add x8, x8, :lo12:.LCPI40_0 -; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 -; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s -; CHECK-NEXT: cmeq v1.4s, v1.4s, v2.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: cmeq v1.4s, v1.4s, #0 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: cmpne p0.d, p1/z, z1.d, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 5516a4716d59d..a50d0dc37eaf6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -309,8 +309,8 @@ define void @masked_gather_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index e3e06dcdf17f3..a42fce70f4f15 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -295,9 +295,8 @@ define void @masked_scatter_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: sshll v1.2d, v1.2s, #0 +; CHECK-NEXT: cmeq v1.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index b0a30b7150637..4e2ca082e28b5 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -30,8 +30,7 @@ define i16 @convert_to_bitmask16(<16 x i8> %vec) { ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: cmeq.16b v0, v0, #0 -; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: cmtst.16b v0, v0, v0 ; CHECK-GI-NEXT: umov.b w8, v0[1] ; CHECK-GI-NEXT: umov.b w9, v0[0] ; CHECK-GI-NEXT: umov.b w10, v0[2] @@ -106,8 +105,7 @@ define i16 @convert_to_bitmask8(<8 x i16> %vec) { ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 -; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: cmtst.8h v0, v0, v0 ; CHECK-GI-NEXT: xtn.8b v0, 
v0 ; CHECK-GI-NEXT: umov.b w8, v0[1] ; CHECK-GI-NEXT: umov.b w9, v0[0] @@ -158,8 +156,7 @@ define i4 @convert_to_bitmask4(<4 x i32> %vec) { ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 -; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: cmtst.4s v0, v0, v0 ; CHECK-GI-NEXT: mov.s w8, v0[1] ; CHECK-GI-NEXT: mov.s w9, v0[2] ; CHECK-GI-NEXT: fmov w11, s0 @@ -709,10 +706,8 @@ define i8 @convert_large_vector(<8 x i32> %vec) { ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 -; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 -; CHECK-GI-NEXT: cmeq.4s v1, v1, #0 -; CHECK-GI-NEXT: mvn.16b v0, v0 -; CHECK-GI-NEXT: mvn.16b v1, v1 +; CHECK-GI-NEXT: cmtst.4s v0, v0, v0 +; CHECK-GI-NEXT: cmtst.4s v1, v1, v1 ; CHECK-GI-NEXT: uzp1.8h v0, v0, v1 ; CHECK-GI-NEXT: xtn.8b v0, v0 ; CHECK-GI-NEXT: umov.b w8, v0[1] @@ -766,9 +761,7 @@ define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) { ; CHECK-GI-NEXT: sub sp, sp, #16 ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: movi.4s v1, #63, msl #16 -; CHECK-GI-NEXT: and.16b v0, v0, v1 -; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 -; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: cmtst.4s v0, v0, v1 ; CHECK-GI-NEXT: mov.s w8, v0[1] ; CHECK-GI-NEXT: mov.s w9, v0[2] ; CHECK-GI-NEXT: fmov w11, s0 @@ -819,18 +812,11 @@ define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) { } define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) { -; CHECK-SD-LABEL: no_convert_without_direct_bitcast: -; CHECK-SD: ; %bb.0: -; CHECK-SD-NEXT: cmtst.8h v0, v0, v0 -; CHECK-SD-NEXT: xtn.8b v0, v0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: no_convert_without_direct_bitcast: -; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: cmeq.8h v0, v0, #0 -; CHECK-GI-NEXT: mvn.16b v0, v0 -; CHECK-GI-NEXT: xtn.8b v0, v0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: no_convert_without_direct_bitcast: +; CHECK: ; %bb.0: +; CHECK-NEXT: cmtst.8h v0, v0, v0 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: ret %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer ret <8 x i1> %cmp_result @@ -882,8 +868,7 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; CHECK-GI-NEXT: cmeq.4s v1, v1, v2 ; CHECK-GI-NEXT: mvn.16b v1, v1 ; CHECK-GI-NEXT: mov.s v0[3], w3 -; CHECK-GI-NEXT: cmeq.4s v0, v0, #0 -; CHECK-GI-NEXT: mvn.16b v0, v0 +; CHECK-GI-NEXT: cmtst.4s v0, v0, v0 ; CHECK-GI-NEXT: mov.s w8, v0[1] ; CHECK-GI-NEXT: mov.s w9, v0[2] ; CHECK-GI-NEXT: mov.s w10, v0[3] From 0d17547879ffbd50d79907ea23fdac199758d45e Mon Sep 17 00:00:00 2001 From: JaydeepChauhan14 Date: Fri, 4 Apr 2025 17:12:20 +0530 Subject: [PATCH 0632/1029] [X86][NFC] Added POWI function testcases (#134276) - Moved existing llvm/test/CodeGen/X86/powi.ll file to llvm/test/CodeGen/X86/powi-const.ll. - Added new testcases for powi into llvm/test/CodeGen/X86/powi.ll. 
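For reference, the "ideally this would compile to 5 multiplies" note kept in
powi-const.ll refers to expanding llvm.powi(x, 15) with an addition chain; a
minimal C++ sketch of such a chain (hypothetical helper, not part of this
patch) is:

  double powi15(double x) {
    double x2 = x * x;     // x^2   (multiply 1)
    double x3 = x2 * x;    // x^3   (2)
    double x6 = x3 * x3;   // x^6   (3)
    double x12 = x6 * x6;  // x^12  (4)
    return x12 * x3;       // x^15  (5)
  }

The SelectionDAG expansion currently checked in pow_wrapper uses six mulsd
instructions (repeated squaring through x^4 and x^8 intermediates), one more
than this chain needs.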
--- llvm/test/CodeGen/X86/powi-const.ll | 233 ++++++++++++++++ llvm/test/CodeGen/X86/powi.ll | 411 +++++++++++----------------- 2 files changed, 392 insertions(+), 252 deletions(-) create mode 100644 llvm/test/CodeGen/X86/powi-const.ll diff --git a/llvm/test/CodeGen/X86/powi-const.ll b/llvm/test/CodeGen/X86/powi-const.ll new file mode 100644 index 0000000000000..49f0e9a6b1455 --- /dev/null +++ b/llvm/test/CodeGen/X86/powi-const.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86-X87 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 + +; Ideally this would compile to 5 multiplies. + +define double @pow_wrapper(double %a) nounwind readonly ssp noredzone { +; X86-X87-LABEL: pow_wrapper: +; X86-X87: # %bb.0: +; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) +; X86-X87-NEXT: fld %st(0) +; X86-X87-NEXT: fmul %st(1), %st +; X86-X87-NEXT: fmul %st, %st(1) +; X86-X87-NEXT: fmul %st, %st(0) +; X86-X87-NEXT: fmul %st, %st(1) +; X86-X87-NEXT: fmul %st, %st(0) +; X86-X87-NEXT: fmulp %st, %st(1) +; X86-X87-NEXT: retl +; +; X86-SSE-LABEL: pow_wrapper: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: andl $-8, %esp +; X86-SSE-NEXT: subl $8, %esp +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movapd %xmm0, %xmm1 +; X86-SSE-NEXT: mulsd %xmm0, %xmm1 +; X86-SSE-NEXT: mulsd %xmm1, %xmm0 +; X86-SSE-NEXT: mulsd %xmm1, %xmm1 +; X86-SSE-NEXT: mulsd %xmm1, %xmm0 +; X86-SSE-NEXT: mulsd %xmm1, %xmm1 +; X86-SSE-NEXT: mulsd %xmm0, %xmm1 +; X86-SSE-NEXT: movsd %xmm1, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: retl +; +; X64-LABEL: pow_wrapper: +; X64: # %bb.0: +; X64-NEXT: movapd %xmm0, %xmm1 +; X64-NEXT: mulsd %xmm0, %xmm1 +; X64-NEXT: mulsd %xmm1, %xmm0 +; X64-NEXT: mulsd %xmm1, %xmm1 +; X64-NEXT: mulsd %xmm1, %xmm0 +; X64-NEXT: mulsd %xmm1, %xmm1 +; X64-NEXT: mulsd %xmm1, %xmm0 +; X64-NEXT: retq + %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] + ret double %ret +} + +define double @pow_wrapper_optsize(double %a) nounwind optsize { +; X86-X87-LABEL: pow_wrapper_optsize: +; X86-X87: # %bb.0: +; X86-X87-NEXT: subl $12, %esp +; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstpl (%esp) +; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-X87-NEXT: calll __powidf2 +; X86-X87-NEXT: addl $12, %esp +; X86-X87-NEXT: retl +; +; X86-SSE-LABEL: pow_wrapper_optsize: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: calll __powidf2 +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-LABEL: pow_wrapper_optsize: +; X64: # %bb.0: +; X64-NEXT: movl $15, %edi +; X64-NEXT: jmp __powidf2@PLT # TAILCALL + %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] + ret double %ret +} + +define double @pow_wrapper_pgso(double %a) nounwind !prof !14 { +; X86-X87-LABEL: pow_wrapper_pgso: +; X86-X87: # %bb.0: +; X86-X87-NEXT: subl $12, %esp +; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstpl (%esp) +; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-X87-NEXT: calll __powidf2 +; X86-X87-NEXT: addl $12, %esp +; 
X86-X87-NEXT: retl +; +; X86-SSE-LABEL: pow_wrapper_pgso: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: calll __powidf2 +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-LABEL: pow_wrapper_pgso: +; X64: # %bb.0: +; X64-NEXT: movl $15, %edi +; X64-NEXT: jmp __powidf2@PLT # TAILCALL + %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] + ret double %ret +} + +define double @pow_wrapper_minsize(double %a) nounwind minsize { +; X86-X87-LABEL: pow_wrapper_minsize: +; X86-X87: # %bb.0: +; X86-X87-NEXT: subl $12, %esp +; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstpl (%esp) +; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-X87-NEXT: calll __powidf2 +; X86-X87-NEXT: addl $12, %esp +; X86-X87-NEXT: retl +; +; X86-SSE-LABEL: pow_wrapper_minsize: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: calll __powidf2 +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-LABEL: pow_wrapper_minsize: +; X64: # %bb.0: +; X64-NEXT: pushq $15 +; X64-NEXT: popq %rdi +; X64-NEXT: jmp __powidf2@PLT # TAILCALL + %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] + ret double %ret +} + +define <2 x float> @powi_v2f32(<2 x float> %a) nounwind minsize { +; X86-X87-LABEL: powi_v2f32: +; X86-X87: # %bb.0: +; X86-X87-NEXT: pushl %esi +; X86-X87-NEXT: subl $16, %esp +; X86-X87-NEXT: flds {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-X87-NEXT: flds {{[0-9]+}}(%esp) +; X86-X87-NEXT: pushl $15 +; X86-X87-NEXT: popl %esi +; X86-X87-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-X87-NEXT: fstps (%esp) +; X86-X87-NEXT: calll __powisf2 +; X86-X87-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-X87-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-X87-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-X87-NEXT: fstps (%esp) +; X86-X87-NEXT: calll __powisf2 +; X86-X87-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-X87-NEXT: fxch %st(1) +; X86-X87-NEXT: addl $16, %esp +; X86-X87-NEXT: popl %esi +; X86-X87-NEXT: retl +; +; X86-SSE-LABEL: powi_v2f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-SSE-NEXT: pushl $15 +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: calll __powisf2 +; X86-SSE-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: calll __powisf2 +; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: addl $32, %esp +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: retl +; +; X64-LABEL: powi_v2f32: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: subq $32, %rsp +; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; X64-NEXT: pushq $15 +; X64-NEXT: popq %rbx +; 
X64-NEXT: movl %ebx, %edi +; X64-NEXT: callq __powisf2@PLT +; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: movl %ebx, %edi +; X64-NEXT: callq __powisf2@PLT +; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: addq $32, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: retq + %ret = tail call < 2 x float> @llvm.powi.v2f32.i32(<2 x float> %a, i32 15) nounwind ; + ret <2 x float> %ret +} + +declare double @llvm.powi.f64.i32(double, i32) nounwind readonly +declare < 2 x float> @llvm.powi.v2f32.i32(<2 x float>, i32) nounwind readonly + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 0} diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll index 7dc6564e62a85..e4c691bfbd2e5 100644 --- a/llvm/test/CodeGen/X86/powi.ll +++ b/llvm/test/CodeGen/X86/powi.ll @@ -1,268 +1,175 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86-X87 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FAST-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=SDAG-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FAST-X64 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=SDAG-X64 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64 -; Ideally this would compile to 5 multiplies. 
- -define double @pow_wrapper(double %a) nounwind readonly ssp noredzone { -; X86-X87-LABEL: pow_wrapper: -; X86-X87: # %bb.0: -; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) -; X86-X87-NEXT: fld %st(0) -; X86-X87-NEXT: fmul %st(1), %st -; X86-X87-NEXT: fmul %st, %st(1) -; X86-X87-NEXT: fmul %st, %st(0) -; X86-X87-NEXT: fmul %st, %st(1) -; X86-X87-NEXT: fmul %st, %st(0) -; X86-X87-NEXT: fmulp %st, %st(1) -; X86-X87-NEXT: retl +define float @test_powi_f32_i32(float %Val, i32 %x) nounwind { +; FAST-X86-LABEL: test_powi_f32_i32: +; FAST-X86: # %bb.0: +; FAST-X86-NEXT: subl $12, %esp +; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FAST-X86-NEXT: flds {{[0-9]+}}(%esp) +; FAST-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FAST-X86-NEXT: fstps (%esp) +; FAST-X86-NEXT: calll __powisf2 +; FAST-X86-NEXT: addl $12, %esp +; FAST-X86-NEXT: retl ; -; X86-SSE-LABEL: pow_wrapper: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %ebp -; X86-SSE-NEXT: movl %esp, %ebp -; X86-SSE-NEXT: andl $-8, %esp -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movapd %xmm0, %xmm1 -; X86-SSE-NEXT: mulsd %xmm0, %xmm1 -; X86-SSE-NEXT: mulsd %xmm1, %xmm0 -; X86-SSE-NEXT: mulsd %xmm1, %xmm1 -; X86-SSE-NEXT: mulsd %xmm1, %xmm0 -; X86-SSE-NEXT: mulsd %xmm1, %xmm1 -; X86-SSE-NEXT: mulsd %xmm0, %xmm1 -; X86-SSE-NEXT: movsd %xmm1, (%esp) -; X86-SSE-NEXT: fldl (%esp) -; X86-SSE-NEXT: movl %ebp, %esp -; X86-SSE-NEXT: popl %ebp -; X86-SSE-NEXT: retl +; SDAG-X86-LABEL: test_powi_f32_i32: +; SDAG-X86: # %bb.0: +; SDAG-X86-NEXT: subl $12, %esp +; SDAG-X86-NEXT: flds {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: fstps (%esp) +; SDAG-X86-NEXT: calll __powisf2 +; SDAG-X86-NEXT: addl $12, %esp +; SDAG-X86-NEXT: retl ; -; X64-LABEL: pow_wrapper: -; X64: # %bb.0: -; X64-NEXT: movapd %xmm0, %xmm1 -; X64-NEXT: mulsd %xmm0, %xmm1 -; X64-NEXT: mulsd %xmm1, %xmm0 -; X64-NEXT: mulsd %xmm1, %xmm1 -; X64-NEXT: mulsd %xmm1, %xmm0 -; X64-NEXT: mulsd %xmm1, %xmm1 -; X64-NEXT: mulsd %xmm1, %xmm0 -; X64-NEXT: retq - %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] - ret double %ret -} - -define double @pow_wrapper_optsize(double %a) optsize { -; X86-X87-LABEL: pow_wrapper_optsize: -; X86-X87: # %bb.0: -; X86-X87-NEXT: subl $12, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 16 -; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) -; X86-X87-NEXT: fstpl (%esp) -; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) -; X86-X87-NEXT: calll __powidf2 -; X86-X87-NEXT: addl $12, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 4 -; X86-X87-NEXT: retl +; GISEL-X86-LABEL: test_powi_f32_i32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: flds {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstps (%esp) +; GISEL-X86-NEXT: calll __powisf2 +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl ; -; X86-SSE-LABEL: pow_wrapper_optsize: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd %xmm0, (%esp) -; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: calll __powidf2 -; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; FAST-X64-LABEL: test_powi_f32_i32: +; FAST-X64: # %bb.0: +; FAST-X64-NEXT: pushq %rax +; FAST-X64-NEXT: callq __powisf2@PLT +; FAST-X64-NEXT: popq %rax +; FAST-X64-NEXT: 
retq ; -; X64-LABEL: pow_wrapper_optsize: -; X64: # %bb.0: -; X64-NEXT: movl $15, %edi -; X64-NEXT: jmp __powidf2@PLT # TAILCALL - %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] - ret double %ret +; SDAG-X64-LABEL: test_powi_f32_i32: +; SDAG-X64: # %bb.0: +; SDAG-X64-NEXT: jmp __powisf2@PLT # TAILCALL +; +; GISEL-X64-LABEL: test_powi_f32_i32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: jmp __powisf2@PLT # TAILCALL + %res = call float @llvm.powi.f32.i32(float %Val, i32 %x) + ret float %res } -define double @pow_wrapper_pgso(double %a) !prof !14 { -; X86-X87-LABEL: pow_wrapper_pgso: -; X86-X87: # %bb.0: -; X86-X87-NEXT: subl $12, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 16 -; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) -; X86-X87-NEXT: fstpl (%esp) -; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) -; X86-X87-NEXT: calll __powidf2 -; X86-X87-NEXT: addl $12, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 4 -; X86-X87-NEXT: retl +define double @test_powi_f64_i32(double %Val, i32 %x) nounwind { +; FAST-X86-LABEL: test_powi_f64_i32: +; FAST-X86: # %bb.0: +; FAST-X86-NEXT: subl $12, %esp +; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FAST-X86-NEXT: fldl {{[0-9]+}}(%esp) +; FAST-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FAST-X86-NEXT: fstpl (%esp) +; FAST-X86-NEXT: calll __powidf2 +; FAST-X86-NEXT: addl $12, %esp +; FAST-X86-NEXT: retl ; -; X86-SSE-LABEL: pow_wrapper_pgso: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd %xmm0, (%esp) -; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: calll __powidf2 -; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; SDAG-X86-LABEL: test_powi_f64_i32: +; SDAG-X86: # %bb.0: +; SDAG-X86-NEXT: subl $12, %esp +; SDAG-X86-NEXT: fldl {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: fstpl (%esp) +; SDAG-X86-NEXT: calll __powidf2 +; SDAG-X86-NEXT: addl $12, %esp +; SDAG-X86-NEXT: retl ; -; X64-LABEL: pow_wrapper_pgso: -; X64: # %bb.0: -; X64-NEXT: movl $15, %edi -; X64-NEXT: jmp __powidf2@PLT # TAILCALL - %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] - ret double %ret -} - -define double @pow_wrapper_minsize(double %a) minsize { -; X86-X87-LABEL: pow_wrapper_minsize: -; X86-X87: # %bb.0: -; X86-X87-NEXT: subl $12, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 16 -; X86-X87-NEXT: fldl {{[0-9]+}}(%esp) -; X86-X87-NEXT: fstpl (%esp) -; X86-X87-NEXT: movl $15, {{[0-9]+}}(%esp) -; X86-X87-NEXT: calll __powidf2 -; X86-X87-NEXT: addl $12, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 4 -; X86-X87-NEXT: retl +; GISEL-X86-LABEL: test_powi_f64_i32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $12, %esp +; GISEL-X86-NEXT: fldl {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstpl (%esp) +; GISEL-X86-NEXT: calll __powidf2 +; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: retl ; -; X86-SSE-LABEL: pow_wrapper_minsize: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 16 -; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd %xmm0, (%esp) -; X86-SSE-NEXT: movl $15, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: calll __powidf2 -; X86-SSE-NEXT: addl $12, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; FAST-X64-LABEL: test_powi_f64_i32: +; 
FAST-X64: # %bb.0: +; FAST-X64-NEXT: pushq %rax +; FAST-X64-NEXT: callq __powidf2@PLT +; FAST-X64-NEXT: popq %rax +; FAST-X64-NEXT: retq ; -; X64-LABEL: pow_wrapper_minsize: -; X64: # %bb.0: -; X64-NEXT: pushq $15 -; X64-NEXT: .cfi_adjust_cfa_offset 8 -; X64-NEXT: popq %rdi -; X64-NEXT: .cfi_adjust_cfa_offset -8 -; X64-NEXT: jmp __powidf2@PLT # TAILCALL - %ret = tail call double @llvm.powi.f64.i32(double %a, i32 15) nounwind ; [#uses=1] - ret double %ret +; SDAG-X64-LABEL: test_powi_f64_i32: +; SDAG-X64: # %bb.0: +; SDAG-X64-NEXT: jmp __powidf2@PLT # TAILCALL +; +; GISEL-X64-LABEL: test_powi_f64_i32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: jmp __powidf2@PLT # TAILCALL + %res = call double @llvm.powi.f64.i32(double %Val, i32 %x) + ret double %res } -define <2 x float> @powi_v2f32(<2 x float> %a) minsize { -; X86-X87-LABEL: powi_v2f32: -; X86-X87: # %bb.0: -; X86-X87-NEXT: pushl %esi -; X86-X87-NEXT: .cfi_def_cfa_offset 8 -; X86-X87-NEXT: subl $16, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 24 -; X86-X87-NEXT: .cfi_offset %esi, -8 -; X86-X87-NEXT: flds {{[0-9]+}}(%esp) -; X86-X87-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-X87-NEXT: flds {{[0-9]+}}(%esp) -; X86-X87-NEXT: pushl $15 -; X86-X87-NEXT: .cfi_adjust_cfa_offset 4 -; X86-X87-NEXT: popl %esi -; X86-X87-NEXT: .cfi_adjust_cfa_offset -4 -; X86-X87-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-X87-NEXT: fstps (%esp) -; X86-X87-NEXT: calll __powisf2 -; X86-X87-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-X87-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-X87-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-X87-NEXT: fstps (%esp) -; X86-X87-NEXT: calll __powisf2 -; X86-X87-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-X87-NEXT: fxch %st(1) -; X86-X87-NEXT: addl $16, %esp -; X86-X87-NEXT: .cfi_def_cfa_offset 8 -; X86-X87-NEXT: popl %esi -; X86-X87-NEXT: .cfi_def_cfa_offset 4 -; X86-X87-NEXT: retl +define x86_fp80 @test_powi_f80_i32(x86_fp80 %Val, i32 %x) nounwind { +; FAST-X86-LABEL: test_powi_f80_i32: +; FAST-X86: # %bb.0: +; FAST-X86-NEXT: subl $28, %esp +; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; FAST-X86-NEXT: fldt {{[0-9]+}}(%esp) +; FAST-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FAST-X86-NEXT: fstpt (%esp) +; FAST-X86-NEXT: calll __powixf2 +; FAST-X86-NEXT: addl $28, %esp +; FAST-X86-NEXT: retl +; +; SDAG-X86-LABEL: test_powi_f80_i32: +; SDAG-X86: # %bb.0: +; SDAG-X86-NEXT: subl $28, %esp +; SDAG-X86-NEXT: fldt {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SDAG-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; SDAG-X86-NEXT: fstpt (%esp) +; SDAG-X86-NEXT: calll __powixf2 +; SDAG-X86-NEXT: addl $28, %esp +; SDAG-X86-NEXT: retl +; +; GISEL-X86-LABEL: test_powi_f80_i32: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: subl $28, %esp +; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; GISEL-X86-NEXT: fstpt (%esp) +; GISEL-X86-NEXT: calll __powixf2 +; GISEL-X86-NEXT: addl $28, %esp +; GISEL-X86-NEXT: retl ; -; X86-SSE-LABEL: powi_v2f32: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: subl $32, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 40 -; X86-SSE-NEXT: .cfi_offset %esi, -8 -; X86-SSE-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE-NEXT: pushl $15 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: .cfi_adjust_cfa_offset -4 -; X86-SSE-NEXT: movl %esi, 
{{[0-9]+}}(%esp) -; X86-SSE-NEXT: movss %xmm0, (%esp) -; X86-SSE-NEXT: calll __powisf2 -; X86-SSE-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X86-SSE-NEXT: movss %xmm0, (%esp) -; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: calll __powisf2 -; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE-NEXT: addl $32, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl +; FAST-X64-LABEL: test_powi_f80_i32: +; FAST-X64: # %bb.0: +; FAST-X64-NEXT: subq $24, %rsp +; FAST-X64-NEXT: fldt {{[0-9]+}}(%rsp) +; FAST-X64-NEXT: fstpt (%rsp) +; FAST-X64-NEXT: callq __powixf2@PLT +; FAST-X64-NEXT: addq $24, %rsp +; FAST-X64-NEXT: retq ; -; X64-LABEL: powi_v2f32: -; X64: # %bb.0: -; X64-NEXT: pushq %rbx -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: subq $32, %rsp -; X64-NEXT: .cfi_def_cfa_offset 48 -; X64-NEXT: .cfi_offset %rbx, -16 -; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; X64-NEXT: pushq $15 -; X64-NEXT: .cfi_adjust_cfa_offset 8 -; X64-NEXT: popq %rbx -; X64-NEXT: .cfi_adjust_cfa_offset -8 -; X64-NEXT: movl %ebx, %edi -; X64-NEXT: callq __powisf2@PLT -; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; X64-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; X64-NEXT: movl %ebx, %edi -; X64-NEXT: callq __powisf2@PLT -; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movaps %xmm1, %xmm0 -; X64-NEXT: addq $32, %rsp -; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: popq %rbx -; X64-NEXT: .cfi_def_cfa_offset 8 -; X64-NEXT: retq - %ret = tail call < 2 x float> @llvm.powi.v2f32.i32(<2 x float> %a, i32 15) nounwind ; - ret <2 x float> %ret +; SDAG-X64-LABEL: test_powi_f80_i32: +; SDAG-X64: # %bb.0: +; SDAG-X64-NEXT: subq $24, %rsp +; SDAG-X64-NEXT: fldt {{[0-9]+}}(%rsp) +; SDAG-X64-NEXT: fstpt (%rsp) +; SDAG-X64-NEXT: callq __powixf2@PLT +; SDAG-X64-NEXT: addq $24, %rsp +; SDAG-X64-NEXT: retq +; +; GISEL-X64-LABEL: test_powi_f80_i32: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: subq $24, %rsp +; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fstpt (%rsp) +; GISEL-X64-NEXT: callq __powixf2@PLT +; GISEL-X64-NEXT: addq $24, %rsp +; GISEL-X64-NEXT: retq + %res = call x86_fp80 @llvm.powi.f80.i32(x86_fp80 %Val, i32 %x) + ret x86_fp80 %res } - -declare double @llvm.powi.f64.i32(double, i32) nounwind readonly -declare < 2 x float> @llvm.powi.v2f32.i32(<2 x float>, i32) nounwind readonly - -!llvm.module.flags = !{!0} -!0 = !{i32 1, !"ProfileSummary", !1} -!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} -!2 = !{!"ProfileFormat", !"InstrProf"} -!3 = !{!"TotalCount", i64 10000} -!4 = !{!"MaxCount", i64 10} -!5 = !{!"MaxInternalCount", i64 1} -!6 = !{!"MaxFunctionCount", i64 1000} -!7 = !{!"NumCounts", i64 3} -!8 = !{!"NumFunctions", i64 3} -!9 = !{!"DetailedSummary", !10} -!10 = !{!11, !12, !13} -!11 = !{i32 10000, i64 100, i32 1} -!12 = !{i32 999000, i64 100, i32 1} -!13 = !{i32 999999, i64 1, i32 2} -!14 = !{!"function_entry_count", i64 0} From a2e888f5b49113c66b055290cb7069ae88c9d2e1 Mon Sep 17 00:00:00 2001 From: Vy Nguyen Date: Fri, 4 Apr 2025 08:00:46 -0400 
Subject: [PATCH 0633/1029] [LLDB][NFC]Fix stack-use-after free bug. (#134296) Details: detailed_command_telemetry (bool) and command_id (int) could already be freed when the dispatcher's dtor runs. So we should just copy them into the lambda since they are cheap. --- lldb/source/Interpreter/CommandInterpreter.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 949b1191c28f0..112d2f20fda41 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -1918,7 +1918,9 @@ bool CommandInterpreter::HandleCommand(const char *command_line, // Those will be collected by the on-exit-callback. }); - helper.DispatchOnExit([&](lldb_private::telemetry::CommandInfo *info) { + helper.DispatchOnExit([&cmd_obj, &parsed_command_args, &result, + detailed_command_telemetry, command_id]( + lldb_private::telemetry::CommandInfo *info) { // TODO: this is logging the time the command-handler finishes. // But we may want a finer-grain durations too? // (ie., the execute_time recorded below?) From babbc6f8429ca07cdf2f5b6ff5e9516c383079b7 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Fri, 4 Apr 2025 05:04:16 -0700 Subject: [PATCH 0634/1029] [NFC] Fixes proposed by code sanitizer. (#134138) --- clang-tools-extra/clangd/ConfigCompile.cpp | 2 +- clang-tools-extra/clangd/Headers.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clangd/ConfigCompile.cpp b/clang-tools-extra/clangd/ConfigCompile.cpp index 3d7f792aa136b..13c2405e76df7 100644 --- a/clang-tools-extra/clangd/ConfigCompile.cpp +++ b/clang-tools-extra/clangd/ConfigCompile.cpp @@ -535,7 +535,7 @@ struct FragmentCompiler { } if (Filters->empty()) return std::nullopt; - auto Filter = [Filters](llvm::StringRef Path) { + auto Filter = [Filters = std::move(Filters)](llvm::StringRef Path) { for (auto &Regex : *Filters) if (Regex.match(Path)) return true; diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp index 0ffd9ee4d2751..87fd261b906e6 100644 --- a/clang-tools-extra/clangd/Headers.cpp +++ b/clang-tools-extra/clangd/Headers.cpp @@ -305,14 +305,14 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader, if (llvm::sys::path::is_absolute(Suggested)) return std::nullopt; bool IsAngled = false; - for (auto Filter : AngledHeaders) { + for (auto &Filter : AngledHeaders) { if (Filter(Suggested)) { IsAngled = true; break; } } bool IsQuoted = false; - for (auto Filter : QuotedHeaders) { + for (auto &Filter : QuotedHeaders) { if (Filter(Suggested)) { IsQuoted = true; break; From 547d054ef13c2ca5ec42f7061d8ba941c05605d9 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Fri, 4 Apr 2025 15:21:48 +0300 Subject: [PATCH 0635/1029] [clang-tidy][NFC][doc] improve "options" sections of `misc-`, `cppcore-` and other checks (#133694) Improved "options" sections of various checks: 1. Added Options keyword to be a delimiter between "body" and "options" parts of docs 2. Added default values where were absent. 3. Changed double-tick to single-tick in default values. 
--------- Co-authored-by: EugeneZelenko --- .../checks/android/comparison-in-temp-failure-retry.rst | 1 + .../docs/clang-tidy/checks/cert/msc51-cpp.rst | 2 +- .../docs/clang-tidy/checks/concurrency/mt-unsafe.rst | 3 +++ .../clang-tidy/checks/cppcoreguidelines/no-malloc.rst | 6 +++--- .../checks/cppcoreguidelines/owning-memory.rst | 4 ++-- .../pro-bounds-constant-array-index.rst | 1 + .../checks/cppcoreguidelines/pro-type-member-init.rst | 1 + .../clang-tidy/checks/misc/coroutine-hostile-raii.rst | 8 ++++---- .../docs/clang-tidy/checks/misc/include-cleaner.rst | 4 ++-- .../misc/non-private-member-variables-in-classes.rst | 9 +++++---- .../checks/readability/container-data-pointer.rst | 2 +- .../checks/readability/container-size-empty.rst | 3 +++ .../inconsistent-declaration-parameter-name.rst | 3 +++ .../checks/readability/redundant-inline-specifier.rst | 2 +- .../checks/readability/redundant-smartptr-get.rst | 2 ++ 15 files changed, 33 insertions(+), 18 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst b/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst index 93112ee2bea64..31cc72b0579c4 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/android/comparison-in-temp-failure-retry.rst @@ -41,3 +41,4 @@ Options .. option:: RetryMacros A comma-separated list of the names of retry macros to be checked. + Default is `TEMP_FAILURE_RETRY`. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst index 1e0e34efe0a58..99e550aef0e7a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/msc51-cpp.rst @@ -37,4 +37,4 @@ Options .. option:: DisallowedSeedTypes A comma-separated list of the type names which are disallowed. - Default values are ``time_t``, ``std::time_t``. + Default value is `time_t,std::time_t`. diff --git a/clang-tools-extra/docs/clang-tidy/checks/concurrency/mt-unsafe.rst b/clang-tools-extra/docs/clang-tidy/checks/concurrency/mt-unsafe.rst index 4e46ba1edc34f..337be787d962b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/concurrency/mt-unsafe.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/concurrency/mt-unsafe.rst @@ -32,6 +32,9 @@ Examples: sleep(1); // implementation may use SIGALRM +Options +------- + .. option:: FunctionSet Specifies which functions in libc should be considered thread-safe, diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/no-malloc.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/no-malloc.rst index 237520aa6690a..e3a162078a3b8 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/no-malloc.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/no-malloc.rst @@ -35,14 +35,14 @@ Options .. option:: Allocations Semicolon-separated list of fully qualified names of memory allocation functions. - Defaults to ``::malloc;::calloc``. + Defaults to `::malloc;::calloc`. .. option:: Deallocations Semicolon-separated list of fully qualified names of memory allocation functions. - Defaults to ``::free``. + Defaults to `::free`. .. option:: Reallocations Semicolon-separated list of fully qualified names of memory allocation functions. - Defaults to ``::realloc``. + Defaults to `::realloc`. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/owning-memory.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/owning-memory.rst
index 3c91d09dda1f2..4fc49f8bd6eee 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/owning-memory.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/owning-memory.rst
@@ -95,14 +95,14 @@ Options
 
    Semicolon-separated list of fully qualified names of legacy functions that create
    resources but cannot introduce ``gsl::owner<>``.
-   Defaults to ``::malloc;::aligned_alloc;::realloc;::calloc;::fopen;::freopen;::tmpfile``.
+   Defaults to `::malloc;::aligned_alloc;::realloc;::calloc;::fopen;::freopen;::tmpfile`.
 
 .. option:: LegacyResourceConsumers
 
    Semicolon-separated list of fully qualified names of legacy functions expecting
    resource owners as pointer arguments but cannot introduce ``gsl::owner<>``.
-   Defaults to ``::free;::realloc;::freopen;::fclose``.
+   Defaults to `::free;::realloc;::freopen;::fclose`.
 
 
 Limitations
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-constant-array-index.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-constant-array-index.rst
index 4e877676cf1fe..9b82e0c45a314 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-constant-array-index.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-bounds-constant-array-index.rst
@@ -21,6 +21,7 @@ Options
 
    The check can generate fixes after this option has been set to the name of the
    include file that contains ``gsl::at()``, e.g. `"gsl/gsl.h"`.
+   Default is an empty string.
 
 .. option:: IncludeStyle
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-member-init.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-member-init.rst
index 97af01a895e1c..3c6797bce9450 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-member-init.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-member-init.rst
@@ -37,6 +37,7 @@ Options
 
    If set to `true`, the check will provide fix-its with literal initializers
    \( ``int i = 0;`` \) instead of curly braces \( ``int i{};`` \).
+   Default is `false`.
 
 This rule is part of the `Type safety (Type.6)
 `_
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst
index a39c1853b313c..0b054e4e20bd6 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/misc/coroutine-hostile-raii.rst
@@ -45,8 +45,8 @@ Options
 
    A semicolon-separated list of qualified types which should not be allowed
    to persist across suspension points.
-   Eg: ``my::lockable; a::b;::my::other::lockable;``
-   The default value of this option is `"std::lock_guard;std::scoped_lock"`.
+   Eg: `my::lockable;a::b;::my::other::lockable`
+   The default value of this option is `std::lock_guard;std::scoped_lock`.
 
 .. option:: AllowedAwaitablesList
 
@@ -78,6 +78,6 @@ Options
       co_await wait();
     }
 
-   Eg: ``my::safe::awaitable;other::awaitable``
-   The default value of this option is empty string `""`.
+   Eg: `my::safe::awaitable;other::awaitable`
+   Default is an empty string.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst
index e40335b2543b2..d112f01cbc0b1 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/misc/include-cleaner.rst
@@ -31,8 +31,8 @@ Options
 
    A semicolon-separated list of regexes to disable insertion/removal of header
    files that match this regex as a suffix. E.g., `foo/.*` disables
-   insertion/removal for all headers under the directory `foo`. By default, no
-   headers will be ignored.
+   insertion/removal for all headers under the directory `foo`. Default is an
+   empty string, no headers will be ignored.
 
 .. option:: DeduplicateFindings
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/non-private-member-variables-in-classes.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/non-private-member-variables-in-classes.rst
index 57990622e60cd..9d5b7f6673159 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/misc/non-private-member-variables-in-classes.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/misc/non-private-member-variables-in-classes.rst
@@ -17,10 +17,11 @@ Options
 
 .. option:: IgnoreClassesWithAllMemberVariablesBeingPublic
 
-   Allows to completely ignore classes if **all** the member variables in that
-   class a declared with a ``public`` access specifier.
+   When `true`, allows to completely ignore classes if **all** the member
+   variables in that class are declared with a ``public`` access specifier.
+   Default is `false`.
 
 .. option:: IgnorePublicMemberVariables
 
-   Allows to ignore (not diagnose) **all** the member variables declared with
-   a ``public`` access specifier.
+   When `true`, allows to ignore (not diagnose) **all** the member variables
+   declared with a ``public`` access specifier. Default is `false`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-data-pointer.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-data-pointer.rst
index 0d10829ed3c2f..a4eff16cbab14 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-data-pointer.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-data-pointer.rst
@@ -18,4 +18,4 @@ Options
 .. option:: IgnoredContainers
 
    Semicolon-separated list of containers regexp for which this check won't be
-   enforced. Default is `empty`.
+   enforced. Default is an empty string.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
index 43ad74f60dbe5..da6f770b3d74b 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/container-size-empty.rst
@@ -25,6 +25,9 @@ The check issues warning if a container has ``empty()`` and ``size()`` or
 `size_type` can be any kind of integer type.
 
+Options
+-------
+
 .. option:: ExcludedComparisonTypes
 
    A semicolon-separated list of class names for which the check will ignore
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/inconsistent-declaration-parameter-name.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/inconsistent-declaration-parameter-name.rst
index 95341d52da4f6..4661d2cd8c9a4 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/inconsistent-declaration-parameter-name.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/inconsistent-declaration-parameter-name.rst
@@ -52,6 +52,9 @@ In the case of multiple redeclarations or function template specializations,
 a warning is issued for every redeclaration or specialization inconsistent
 with the definition or the first declaration seen in a translation unit.
 
+Options
+-------
+
 .. option:: IgnoreMacros
 
    If this option is set to `true` (default is `true`), the check will not warn
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst
index eee324cddab48..c33c05b42e500 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst
@@ -29,4 +29,4 @@ Options
 .. option:: StrictMode
 
    If set to `true`, the check will also flag functions and variables that
-   already have internal linkage as redundant.
+   already have internal linkage as redundant. Default is `false`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-smartptr-get.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-smartptr-get.rst
index 20851b0acad97..ab8a3681907e3 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-smartptr-get.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-smartptr-get.rst
@@ -14,6 +14,8 @@ Examples:
   *ptr->get() ==> **ptr
   if (ptr.get() == nullptr) ... => if (ptr == nullptr) ...
 
+Options
+-------
 
 .. option:: IgnoreMacros
 

From d02786e7785ffa8c0aae4d89e9f6391bb4645500 Mon Sep 17 00:00:00 2001
From: Ilya Biryukov
Date: Fri, 4 Apr 2025 14:23:55 +0200
Subject: [PATCH 0636/1029] [Sema] Handle AttributedType in template deduction
 with derived-to-base conversions (#134361)

Fix #134356.

We accidentally skipped checking derived-to-base conversions because
deduction did not strip sugar in the relevant code. This caused
deduction failures when a parameter type had an attribute.
---
 clang/lib/Sema/SemaTemplateDeduction.cpp           |  2 +-
 .../Sema/nullability-and-template-deduction.cpp    | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Sema/nullability-and-template-deduction.cpp

diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index ab6e18aee7206..170b9f05002b1 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -4447,7 +4447,7 @@ static bool AdjustFunctionParmAndArgTypesForDeduction(
   //   transformed A can be a pointer to a derived class pointed to by
   //   the deduced A.
   if (isSimpleTemplateIdType(ParamType) ||
-      (isa<PointerType>(ParamType) &&
+      (ParamType->getAs<PointerType>() &&
        isSimpleTemplateIdType(
            ParamType->castAs<PointerType>()->getPointeeType())))
     TDF |= TDF_DerivedClass;
diff --git a/clang/test/Sema/nullability-and-template-deduction.cpp b/clang/test/Sema/nullability-and-template-deduction.cpp
new file mode 100644
index 0000000000000..3ea6d38d26b69
--- /dev/null
+++ b/clang/test/Sema/nullability-and-template-deduction.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -fsyntax-only %s -verify
+// expected-no-diagnostics
+
+template <class T> struct Base {};
+template <class T> struct Derived : Base<T> {};
+
+template <class T> void foo(Base<T> *_Nonnull);
+
+template <class T> void bar(Base<T> *);
+
+
+void test() {
+  Derived<int> d;
+  foo(&d);
+  bar(&d);
+}

From da69eb75cbc634a56886e94de3e546c63c17567e Mon Sep 17 00:00:00 2001
From: Ilya Biryukov
Date: Fri, 4 Apr 2025 14:35:15 +0200
Subject: [PATCH 0637/1029] [NFC] [ASTMatchers] Share code of
 `forEachArgumentWithParamType` with UnsafeBufferUsage (#132387)

This change exposes a low-level helper that is used to implement
`forEachArgumentWithParamType` but can also be used without matchers,
e.g. if performance is a concern.

Commit f5ee10538b68835112323c241ca7db67ca78bf62 introduced a copy of the
implementation of the `forEachArgumentWithParamType` matcher that was
needed for optimizing performance of `-Wunsafe-buffer-usage`.

This change shares the code between the two so that we do not repeat
ourselves and any bugfixes or changes will be picked up by both
implementations in the future.
---
 clang/include/clang/ASTMatchers/ASTMatchers.h |  80 +++----------
 .../clang/ASTMatchers/LowLevelHelpers.h       |  37 ++++++
 clang/lib/ASTMatchers/CMakeLists.txt          |   1 +
 clang/lib/ASTMatchers/LowLevelHelpers.cpp     | 106 ++++++++++++++++++
 clang/lib/Analysis/UnsafeBufferUsage.cpp      |  95 +---------------
 5 files changed, 163 insertions(+), 156 deletions(-)
 create mode 100644 clang/include/clang/ASTMatchers/LowLevelHelpers.h
 create mode 100644 clang/lib/ASTMatchers/LowLevelHelpers.cpp

diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index 738617759eb29..e6b684b24b080 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -71,6 +71,7 @@
 #include "clang/AST/TypeLoc.h"
 #include "clang/ASTMatchers/ASTMatchersInternal.h"
 #include "clang/ASTMatchers/ASTMatchersMacros.h"
+#include "clang/ASTMatchers/LowLevelHelpers.h"
 #include "clang/Basic/AttrKinds.h"
 #include "clang/Basic/ExceptionSpecificationType.h"
 #include "clang/Basic/FileManager.h"
@@ -5211,72 +5212,25 @@ AST_POLYMORPHIC_MATCHER_P2(forEachArgumentWithParamType,
                            internal::Matcher<Expr>, ArgMatcher,
                            internal::Matcher<QualType>, ParamMatcher) {
   BoundNodesTreeBuilder Result;
-  // The first argument of an overloaded member operator is the implicit object
-  // argument of the method which should not be matched against a parameter, so
-  // we skip over it here.
-  BoundNodesTreeBuilder Matches;
-  unsigned ArgIndex =
-      cxxOperatorCallExpr(
-          callee(cxxMethodDecl(unless(isExplicitObjectMemberFunction()))))
-              .matches(Node, Finder, &Matches)
-          ? 1
-          : 0;
-  const FunctionProtoType *FProto = nullptr;
-
-  if (const auto *Call = dyn_cast<CallExpr>(&Node)) {
-    if (const auto *Value =
-            dyn_cast_or_null<ValueDecl>(Call->getCalleeDecl())) {
-      QualType QT = Value->getType().getCanonicalType();
-
-      // This does not necessarily lead to a `FunctionProtoType`,
-      // e.g. K&R functions do not have a function prototype.
-      if (QT->isFunctionPointerType())
-        FProto = QT->getPointeeType()->getAs<FunctionProtoType>();
-
-      if (QT->isMemberFunctionPointerType()) {
-        const auto *MP = QT->getAs<MemberPointerType>();
-        assert(MP && "Must be member-pointer if its a memberfunctionpointer");
-        FProto = MP->getPointeeType()->getAs<FunctionProtoType>();
-        assert(FProto &&
-               "The call must have happened through a member function "
-               "pointer");
-      }
-    }
-  }
-
-  unsigned ParamIndex = 0;
   bool Matched = false;
-  unsigned NumArgs = Node.getNumArgs();
-  if (FProto && FProto->isVariadic())
-    NumArgs = std::min(NumArgs, FProto->getNumParams());
-
-  for (; ArgIndex < NumArgs; ++ArgIndex, ++ParamIndex) {
+  auto ProcessParamAndArg = [&](QualType ParamType, const Expr *Arg) {
     BoundNodesTreeBuilder ArgMatches(*Builder);
-    if (ArgMatcher.matches(*(Node.getArg(ArgIndex)->IgnoreParenCasts()), Finder,
-                           &ArgMatches)) {
-      BoundNodesTreeBuilder ParamMatches(ArgMatches);
+    if (!ArgMatcher.matches(*Arg, Finder, &ArgMatches))
+      return;
+    BoundNodesTreeBuilder ParamMatches(std::move(ArgMatches));
+    if (!ParamMatcher.matches(ParamType, Finder, &ParamMatches))
+      return;
+    Result.addMatch(ParamMatches);
+    Matched = true;
+    return;
+  };
+  if (auto *Call = llvm::dyn_cast<CallExpr>(&Node))
+    matchEachArgumentWithParamType(*Call, ProcessParamAndArg);
+  else if (auto *Construct = llvm::dyn_cast<CXXConstructExpr>(&Node))
+    matchEachArgumentWithParamType(*Construct, ProcessParamAndArg);
+  else
+    llvm_unreachable("expected CallExpr or CXXConstructExpr");
 
-      // This test is cheaper compared to the big matcher in the next if.
-      // Therefore, please keep this order.
-      if (FProto && FProto->getNumParams() > ParamIndex) {
-        QualType ParamType = FProto->getParamType(ParamIndex);
-        if (ParamMatcher.matches(ParamType, Finder, &ParamMatches)) {
-          Result.addMatch(ParamMatches);
-          Matched = true;
-          continue;
-        }
-      }
-      if (expr(anyOf(cxxConstructExpr(hasDeclaration(cxxConstructorDecl(
-                         hasParameter(ParamIndex, hasType(ParamMatcher))))),
-                     callExpr(callee(functionDecl(
-                         hasParameter(ParamIndex, hasType(ParamMatcher)))))))
-              .matches(Node, Finder, &ParamMatches)) {
-        Result.addMatch(ParamMatches);
-        Matched = true;
-        continue;
-      }
-    }
-  }
   *Builder = std::move(Result);
   return Matched;
 }
diff --git a/clang/include/clang/ASTMatchers/LowLevelHelpers.h b/clang/include/clang/ASTMatchers/LowLevelHelpers.h
new file mode 100644
index 0000000000000..ad1fffb5e5e01
--- /dev/null
+++ b/clang/include/clang/ASTMatchers/LowLevelHelpers.h
@@ -0,0 +1,37 @@
+//===- LowLevelHelpers.h - helpers with pure AST interface ---- *- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Collects a number of helpers that are used by matchers, but can be reused
+// outside of them, e.g. when corresponding matchers cannot be used due to
+// performance constraints.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_ASTMATCHERS_LOWLEVELHELPERS_H
+#define LLVM_CLANG_ASTMATCHERS_LOWLEVELHELPERS_H
+
+#include "clang/AST/Expr.h"
+#include "clang/AST/ExprCXX.h"
+#include "clang/AST/Type.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+namespace clang {
+namespace ast_matchers {
+
+void matchEachArgumentWithParamType(
+    const CallExpr &Node,
+    llvm::function_ref<void(QualType, const Expr *)>
+        OnParamAndArg);
+
+void matchEachArgumentWithParamType(
+    const CXXConstructExpr &Node,
+    llvm::function_ref<void(QualType, const Expr *)>
+        OnParamAndArg);
+
+} // namespace ast_matchers
+} // namespace clang
+
+#endif
diff --git a/clang/lib/ASTMatchers/CMakeLists.txt b/clang/lib/ASTMatchers/CMakeLists.txt
index 30303c1e39a00..7769fd656ac06 100644
--- a/clang/lib/ASTMatchers/CMakeLists.txt
+++ b/clang/lib/ASTMatchers/CMakeLists.txt
@@ -9,6 +9,7 @@ add_clang_library(clangASTMatchers
   ASTMatchFinder.cpp
   ASTMatchersInternal.cpp
   GtestMatchers.cpp
+  LowLevelHelpers.cpp
 
   LINK_LIBS
   clangAST
diff --git a/clang/lib/ASTMatchers/LowLevelHelpers.cpp b/clang/lib/ASTMatchers/LowLevelHelpers.cpp
new file mode 100644
index 0000000000000..eb2604c6252dd
--- /dev/null
+++ b/clang/lib/ASTMatchers/LowLevelHelpers.cpp
@@ -0,0 +1,106 @@
+//===- LowLevelHelpers.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/ASTMatchers/LowLevelHelpers.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/ExprCXX.h"
+#include <type_traits>
+
+namespace clang {
+namespace ast_matchers {
+
+static const FunctionDecl *getCallee(const CXXConstructExpr &D) {
+  return D.getConstructor();
+}
+static const FunctionDecl *getCallee(const CallExpr &D) {
+  return D.getDirectCallee();
+}
+
+template <class ExprNode>
+static void matchEachArgumentWithParamTypeImpl(
+    const ExprNode &Node,
+    llvm::function_ref<void(QualType, const Expr *)>
+        OnParamAndArg) {
+  static_assert(std::is_same_v<ExprNode, CallExpr> ||
+                std::is_same_v<ExprNode, CXXConstructExpr>);
+  // The first argument of an overloaded member operator is the implicit object
+  // argument of the method which should not be matched against a parameter, so
+  // we skip over it here.
+  unsigned ArgIndex = 0;
+  if (const auto *CE = dyn_cast<CXXOperatorCallExpr>(&Node)) {
+    const auto *MD = dyn_cast_or_null<CXXMethodDecl>(CE->getDirectCallee());
+    if (MD && !MD->isExplicitObjectMemberFunction()) {
+      // This is an overloaded operator call.
+      // We need to skip the first argument, which is the implicit object
+      // argument of the method which should not be matched against a
+      // parameter.
+      ++ArgIndex;
+    }
+  }
+
+  const FunctionProtoType *FProto = nullptr;
+
+  if (const auto *Call = dyn_cast<CallExpr>(&Node)) {
+    if (const auto *Value =
+            dyn_cast_or_null<ValueDecl>(Call->getCalleeDecl())) {
+      QualType QT = Value->getType().getCanonicalType();
+
+      // This does not necessarily lead to a `FunctionProtoType`,
+      // e.g. K&R functions do not have a function prototype.
+      if (QT->isFunctionPointerType())
+        FProto = QT->getPointeeType()->getAs<FunctionProtoType>();
+
+      if (QT->isMemberFunctionPointerType()) {
+        const auto *MP = QT->getAs<MemberPointerType>();
+        assert(MP && "Must be member-pointer if its a memberfunctionpointer");
+        FProto = MP->getPointeeType()->getAs<FunctionProtoType>();
+        assert(FProto &&
+               "The call must have happened through a member function "
+               "pointer");
+      }
+    }
+  }
+
+  unsigned ParamIndex = 0;
+  unsigned NumArgs = Node.getNumArgs();
+  if (FProto && FProto->isVariadic())
+    NumArgs = std::min(NumArgs, FProto->getNumParams());
+
+  for (; ArgIndex < NumArgs; ++ArgIndex, ++ParamIndex) {
+    QualType ParamType;
+    if (FProto && FProto->getNumParams() > ParamIndex)
+      ParamType = FProto->getParamType(ParamIndex);
+    else if (const FunctionDecl *FD = getCallee(Node);
+             FD && FD->getNumParams() > ParamIndex)
+      ParamType = FD->getParamDecl(ParamIndex)->getType();
+    else
+      continue;
+
+    OnParamAndArg(ParamType, Node.getArg(ArgIndex)->IgnoreParenCasts());
+  }
+}
+
+void matchEachArgumentWithParamType(
+    const CallExpr &Node,
+    llvm::function_ref<void(QualType, const Expr *)>
+        OnParamAndArg) {
+  matchEachArgumentWithParamTypeImpl(Node, OnParamAndArg);
+}
+
+void matchEachArgumentWithParamType(
+    const CXXConstructExpr &Node,
+    llvm::function_ref<void(QualType, const Expr *)>
+        OnParamAndArg) {
+  matchEachArgumentWithParamTypeImpl(Node, OnParamAndArg);
+}
+
+} // namespace ast_matchers
+
+} // namespace clang
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 776cbf6196b60..fbe753de9ef1f 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -20,6 +20,7 @@
 #include "clang/AST/Stmt.h"
 #include "clang/AST/StmtVisitor.h"
 #include "clang/AST/Type.h"
+#include "clang/ASTMatchers/LowLevelHelpers.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/Preprocessor.h"
@@ -300,98 +301,6 @@ static void findStmtsInUnspecifiedLvalueContext(
     OnResult(BO->getLHS());
 }
 
-/// Note: Copied and modified from ASTMatchers.
-/// Matches all arguments and their respective types for a \c CallExpr.
-/// It is very similar to \c forEachArgumentWithParam but
-/// it works on calls through function pointers as well.
-///
-/// The difference is, that function pointers do not provide access to a
-/// \c ParmVarDecl, but only the \c QualType for each argument.
-///
-/// Given
-/// \code
-///   void f(int i);
-///   int y;
-///   f(y);
-///   void (*f_ptr)(int) = f;
-///   f_ptr(y);
-/// \endcode
-/// callExpr(
-///   forEachArgumentWithParamType(
-///     declRefExpr(to(varDecl(hasName("y")))),
-///     qualType(isInteger()).bind("type)
-///   ))
-///   matches f(y) and f_ptr(y)
-/// with declRefExpr(...)
-///   matching int y
-/// and qualType(...)
-///   matching int
-static void forEachArgumentWithParamType(
-    const CallExpr &Node,
-    const llvm::function_ref<void(QualType, const Expr *)>
-        OnParamAndArg) {
-  // The first argument of an overloaded member operator is the implicit object
-  // argument of the method which should not be matched against a parameter, so
-  // we skip over it here.
-  unsigned ArgIndex = 0;
-  if (const auto *CE = dyn_cast<CXXOperatorCallExpr>(&Node)) {
-    const auto *MD = dyn_cast_or_null<CXXMethodDecl>(CE->getDirectCallee());
-    if (MD && !MD->isExplicitObjectMemberFunction()) {
-      // This is an overloaded operator call.
-      // We need to skip the first argument, which is the implicit object
-      // argument of the method which should not be matched against a
-      // parameter.
-      ++ArgIndex;
-    }
-  }
-
-  const FunctionProtoType *FProto = nullptr;
-
-  if (const auto *Call = dyn_cast<CallExpr>(&Node)) {
-    if (const auto *Value =
-            dyn_cast_or_null<ValueDecl>(Call->getCalleeDecl())) {
-      QualType QT = Value->getType().getCanonicalType();
-
-      // This does not necessarily lead to a `FunctionProtoType`,
-      // e.g. K&R functions do not have a function prototype.
-      if (QT->isFunctionPointerType())
-        FProto = QT->getPointeeType()->getAs<FunctionProtoType>();
-
-      if (QT->isMemberFunctionPointerType()) {
-        const auto *MP = QT->getAs<MemberPointerType>();
-        assert(MP && "Must be member-pointer if its a memberfunctionpointer");
-        FProto = MP->getPointeeType()->getAs<FunctionProtoType>();
-        assert(FProto &&
-               "The call must have happened through a member function "
-               "pointer");
-      }
-    }
-  }
-
-  unsigned ParamIndex = 0;
-  unsigned NumArgs = Node.getNumArgs();
-  if (FProto && FProto->isVariadic())
-    NumArgs = std::min(NumArgs, FProto->getNumParams());
-
-  const auto GetParamType =
-      [&FProto, &Node](unsigned int ParamIndex) -> std::optional<QualType> {
-    if (FProto && FProto->getNumParams() > ParamIndex) {
-      return FProto->getParamType(ParamIndex);
-    }
-    const auto *FD = Node.getDirectCallee();
-    if (FD && FD->getNumParams() > ParamIndex) {
-      return FD->getParamDecl(ParamIndex)->getType();
-    }
-    return std::nullopt;
-  };
-
-  for (; ArgIndex < NumArgs; ++ArgIndex, ++ParamIndex) {
-    auto ParamType = GetParamType(ParamIndex);
-    if (ParamType)
-      OnParamAndArg(*ParamType, Node.getArg(ArgIndex)->IgnoreParenCasts());
-  }
-}
-
 // Finds any expression `e` such that `InnerMatcher` matches `e` and
 // `e` is in an Unspecified Pointer Context (UPC).
 static void findStmtsInUnspecifiedPointerContext(
@@ -408,7 +317,7 @@ static void findStmtsInUnspecifiedPointerContext(
     if (const auto *FnDecl = CE->getDirectCallee();
         FnDecl && FnDecl->hasAttr<UnsafeBufferUsageAttr>())
       return;
-    forEachArgumentWithParamType(
+    ast_matchers::matchEachArgumentWithParamType(
         *CE, [&InnerMatcher](QualType Type, const Expr *Arg) {
           if (Type->isAnyPointerType())
             InnerMatcher(Arg);

From 16a1d5d51f6bafa19afc140b033db2cfb090743a Mon Sep 17 00:00:00 2001
From: Mariya Podchishchaeva
Date: Fri, 4 Apr 2025 14:44:44 +0200
Subject: [PATCH 0638/1029] [clang] Do not diagnose unused deleted operator
 delete[] (#134357)

To support vector deleting dtors, we now also search for and save
operator delete[]. While doing that, avoid diagnosing a deleted
operator delete[]: vector deleting dtors are only called when delete[]
is present, and whenever delete[] is present in the TU it will be
diagnosed correctly.
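As a minimal sketch of the intent (it mirrors the `Bar` case in the
regression test added below):

```cpp
struct Bar {
  virtual ~Bar() {}
  // Looked up and saved for a potential vector deleting dtor, but never
  // referenced: being deleted must not produce a diagnostic by itself.
  static void operator delete[](void *ptr) = delete;
};
```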
Fixes https://github.com/llvm/llvm-project/issues/134265
---
 clang/include/clang/AST/DeclCXX.h |  2 +-
 clang/include/clang/Sema/Sema.h   |  3 ++-
 clang/lib/AST/DeclCXX.cpp         |  3 +--
 clang/lib/Sema/SemaDeclCXX.cpp    |  6 +++---
 clang/lib/Sema/SemaExprCXX.cpp    |  8 +++++---
 clang/test/SemaCXX/gh134265.cpp   | 22 ++++++++++++++++++++++
 6 files changed, 34 insertions(+), 10 deletions(-)
 create mode 100644 clang/test/SemaCXX/gh134265.cpp

diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index 764f85b04e6a0..56cec07ec0293 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -2878,7 +2878,7 @@ class CXXDestructorDecl : public CXXMethodDecl {
   static CXXDestructorDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID);
 
   void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg);
-  void setOperatorArrayDelete(FunctionDecl *OD, Expr *ThisArg);
+  void setOperatorArrayDelete(FunctionDecl *OD);
 
   const FunctionDecl *getOperatorDelete() const {
     return getCanonicalDecl()->OperatorDelete;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b835697f99670..6bf1caf6bdd18 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -8336,7 +8336,8 @@ class Sema final : public SemaBase {
                                       DeclarationName Name);
   FunctionDecl *FindDeallocationFunctionForDestructor(SourceLocation StartLoc,
                                                       CXXRecordDecl *RD,
-                                                      DeclarationName Name);
+                                                      DeclarationName Name,
+                                                      bool Diagnose = true);
 
   /// ActOnCXXDelete - Parsed a C++ 'delete' expression (C++ 5.3.5), as in:
   /// @code ::delete ptr; @endcode
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 7aa710ad7309b..fffc50eb0b078 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -3031,8 +3031,7 @@ void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) {
   }
 }
 
-void CXXDestructorDecl::setOperatorArrayDelete(FunctionDecl *OD,
-                                               Expr *ThisArg) {
+void CXXDestructorDecl::setOperatorArrayDelete(FunctionDecl *OD) {
   auto *First = cast<CXXDestructorDecl>(getFirstDecl());
   if (OD && !First->OperatorArrayDelete)
     First->OperatorArrayDelete = OD;
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 07379c6876731..b86f7118e0b34 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -11048,12 +11048,12 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) {
       // Lookup delete[] too in case we have to emit a vector deleting dtor;
       DeclarationName VDeleteName =
          Context.DeclarationNames.getCXXOperatorName(OO_Array_Delete);
-      FunctionDecl *ArrOperatorDelete =
-          FindDeallocationFunctionForDestructor(Loc, RD, VDeleteName);
+      FunctionDecl *ArrOperatorDelete = FindDeallocationFunctionForDestructor(
+          Loc, RD, VDeleteName, /*Diagnose=*/false);
       // delete[] in the TU will make sure the operator is referenced and its
       // uses diagnosed, otherwise vector deleting dtor won't be called anyway,
       // so just record it in the destructor.
-      Destructor->setOperatorArrayDelete(ArrOperatorDelete, ThisArg);
+      Destructor->setOperatorArrayDelete(ArrOperatorDelete);
     }
   }
 
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index e43f5e3f75bfe..d5f52cd5853f0 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -3265,11 +3265,13 @@ FunctionDecl *Sema::FindUsualDeallocationFunction(SourceLocation StartLoc,
   return Result.FD;
 }
 
-FunctionDecl *Sema::FindDeallocationFunctionForDestructor(
-    SourceLocation Loc, CXXRecordDecl *RD, DeclarationName Name) {
+FunctionDecl *Sema::FindDeallocationFunctionForDestructor(SourceLocation Loc,
+                                                          CXXRecordDecl *RD,
+                                                          DeclarationName Name,
+                                                          bool Diagnose) {
   FunctionDecl *OperatorDelete = nullptr;
 
-  if (FindDeallocationFunction(Loc, RD, Name, OperatorDelete))
+  if (FindDeallocationFunction(Loc, RD, Name, OperatorDelete, Diagnose))
     return nullptr;
   if (OperatorDelete)
     return OperatorDelete;
diff --git a/clang/test/SemaCXX/gh134265.cpp b/clang/test/SemaCXX/gh134265.cpp
new file mode 100644
index 0000000000000..c7bdeb2add0cc
--- /dev/null
+++ b/clang/test/SemaCXX/gh134265.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only
+
+struct Foo {
+  virtual ~Foo() {} // expected-error {{attempt to use a deleted function}}
+  static void operator delete(void* ptr) = delete; // expected-note {{explicitly marked deleted here}}
+};
+
+
+struct Bar {
+  virtual ~Bar() {}
+  static void operator delete[](void* ptr) = delete;
+};
+
+struct Baz {
+  virtual ~Baz() {}
+  static void operator delete[](void* ptr) = delete; // expected-note {{explicitly marked deleted here}}
+};
+
+void foobar() {
+  Baz *B = new Baz[10]();
+  delete [] B; // expected-error {{attempt to use a deleted function}}
+}

From d2bcc11067e682a0753c1068e378d66d59edff73 Mon Sep 17 00:00:00 2001
From: Nashe Mncube
Date: Fri, 4 Apr 2025 14:12:44 +0100
Subject: [PATCH 0639/1029] [AArch64][SVE] Use
 FeatureUseFixedOverScalableIfEqualCost for A510 and A520 (#132246)

Inefficient SVE codegen occurs on at least two in-order cores, those
being Cortex-A510 and Cortex-A520. For example, a simple vector add

```
void foo(float *a, float *b, float *dst, unsigned n) {
    for (unsigned i = 0; i < n; ++i)
        dst[i] = a[i] + b[i];
}
```

vectorizes the inner loop into the following interleaved sequence of
instructions.

```
        add     x12, x1, x10
        ld1b    { z0.b }, p0/z, [x1, x10]
        add     x13, x2, x10
        ld1b    { z1.b }, p0/z, [x2, x10]
        ldr     z2, [x12, #1, mul vl]
        ldr     z3, [x13, #1, mul vl]
        dech    x11
        add     x12, x0, x10
        fadd    z0.s, z1.s, z0.s
        fadd    z1.s, z3.s, z2.s
        st1b    { z0.b }, p0, [x0, x10]
        addvl   x10, x10, #2
        str     z1, [x12, #1, mul vl]
```

By adjusting the target features to prefer fixed over scalable if the
cost is equal we get the following vectorized loop.

```
        ldp     q0, q3, [x11, #-16]
        subs    x13, x13, #8
        ldp     q1, q2, [x10, #-16]
        add     x10, x10, #32
        add     x11, x11, #32
        fadd    v0.4s, v1.4s, v0.4s
        fadd    v1.4s, v2.4s, v3.4s
        stp     q0, q1, [x12, #-16]
        add     x12, x12, #32
```

Which is more efficient.
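For reference, the behavior can be reproduced directly through the loop
vectorizer; the RUN lines in the new test below take the form (with
`input.ll` as a placeholder name for IR like the loop above):

```
opt < input.ll -mtriple=aarch64-none-elf -mcpu=cortex-a510 -mattr=+sve -passes=loop-vectorize -S
```

plus a matching `-mcpu=cortex-a520` variant.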
---
 llvm/lib/Target/AArch64/AArch64Processors.td |   2 +
 .../AArch64/sve-fixed-width-inorder-core.ll  | 170 ++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll

diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 67d3ff685e6f1..c37dd025d80aa 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -723,6 +723,7 @@ def ProcessorFeatures {
                                  FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2,
                                  FeatureComplxNum, FeatureCRC, FeatureDotProd,
                                  FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE,
+                                 FeatureUseFixedOverScalableIfEqualCost,
                                  FeatureRAS, FeatureRCPC, FeatureRDM];
  list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                 FeatureMTE, FeatureETE, FeatureSVEBitPerm,
@@ -732,6 +733,7 @@ def ProcessorFeatures {
                                 FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC,
                                 FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS,
                                 FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM,
+                                FeatureUseFixedOverScalableIfEqualCost,
                                 FeatureDotProd];
  list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                   FeatureMTE, FeatureETE, FeatureSVEBitPerm,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
new file mode 100644
index 0000000000000..19d0cc0650167
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a510 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA510
+; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a520 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA520
+
+define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) {
+; CHECK-CA510-LABEL: define void @sve_add(
+; CHECK-CA510-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-CA510-NEXT: [[ENTRY:.*:]]
+; CHECK-CA510-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-CA510-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-CA510-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-CA510-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-CA510-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK-CA510: [[FOR_BODY_PREHEADER]]:
+; CHECK-CA510-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-CA510-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-CA510: [[VECTOR_MEMCHECK]]:
+; CHECK-CA510-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]]
+; CHECK-CA510-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; CHECK-CA510-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]]
+; CHECK-CA510-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
+; CHECK-CA510-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; CHECK-CA510-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-CA510: [[VECTOR_PH]]:
+; CHECK-CA510-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-CA510-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-CA510-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-CA510: [[VECTOR_BODY]]:
+; CHECK-CA510-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-CA510-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-CA510-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]]
+; CHECK-CA510-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
+; CHECK-CA510-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
+; CHECK-CA510-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-CA510-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-CA510-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]]
+; CHECK-CA510-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0
+; CHECK-CA510-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4
+; CHECK-CA510-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-CA510-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
+; CHECK-CA510-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]]
+; CHECK-CA510-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]]
+; CHECK-CA510-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]]
+; CHECK-CA510-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0
+; CHECK-CA510-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4
+; CHECK-CA510-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
+; CHECK-CA510-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4
+; CHECK-CA510-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-CA510-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-CA510-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-CA510: [[MIDDLE_BLOCK]]:
+; CHECK-CA510-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-CA510-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-CA510: [[SCALAR_PH]]:
+; CHECK-CA510-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-CA510-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-CA510: [[FOR_BODY]]:
+; CHECK-CA510-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-CA510-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-CA510-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-CA510-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-CA510-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-CA510-NEXT: [[ADD:%.*]] = fadd fast float [[TMP16]], [[TMP15]]
+; CHECK-CA510-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-CA510-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-CA510-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-CA510-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-CA510-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-CA510: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-CA510-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK-CA510: [[FOR_COND_CLEANUP]]:
+; CHECK-CA510-NEXT: ret void
+;
+; CHECK-CA520-LABEL: define void @sve_add(
+; CHECK-CA520-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-CA520-NEXT: [[ENTRY:.*:]]
+; CHECK-CA520-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-CA520-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-CA520-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-CA520-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-CA520-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK-CA520: [[FOR_BODY_PREHEADER]]:
+; CHECK-CA520-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-CA520-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-CA520: [[VECTOR_MEMCHECK]]:
+; CHECK-CA520-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]]
+; CHECK-CA520-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; CHECK-CA520-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]]
+; CHECK-CA520-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
+; CHECK-CA520-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; CHECK-CA520-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-CA520: [[VECTOR_PH]]:
+; CHECK-CA520-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-CA520-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-CA520-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-CA520: [[VECTOR_BODY]]:
+; CHECK-CA520-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-CA520-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
+; CHECK-CA520-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]]
+; CHECK-CA520-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0
+; CHECK-CA520-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4
+; CHECK-CA520-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
+; CHECK-CA520-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-CA520-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]]
+; CHECK-CA520-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0
+; CHECK-CA520-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4
+; CHECK-CA520-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-CA520-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
+; CHECK-CA520-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]]
+; CHECK-CA520-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]]
+; CHECK-CA520-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]]
+; CHECK-CA520-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0
+; CHECK-CA520-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4
+; CHECK-CA520-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4
+; CHECK-CA520-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4
+; CHECK-CA520-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-CA520-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-CA520-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-CA520: [[MIDDLE_BLOCK]]:
+; CHECK-CA520-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-CA520-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-CA520: [[SCALAR_PH]]:
+; CHECK-CA520-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-CA520-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK-CA520: [[FOR_BODY]]:
+; CHECK-CA520-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-CA520-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-CA520-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-CA520-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-CA520-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-CA520-NEXT: [[ADD:%.*]] = fadd fast float [[TMP16]], [[TMP15]]
+; CHECK-CA520-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-CA520-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-CA520-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-CA520-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-CA520-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-CA520: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-CA520-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK-CA520: [[FOR_COND_CLEANUP]]:
+; CHECK-CA520-NEXT: ret void
+;
+entry:
+  %cmp9.not = icmp eq i64 %n, 0
+  br i1 %cmp9.not, label %for.cond.cleanup, label %for.body
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %indvars.iv
+  %0 = load float, ptr %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds nuw float, ptr %b, i64 %indvars.iv
+  %1 = load float, ptr %arrayidx2, align 4
+  %add = fadd fast float %1, %0
+  %arrayidx4 = getelementptr inbounds nuw float, ptr %dst, i64 %indvars.iv
+  store float %add, ptr %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+;.
+; CHECK-CA510: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-CA510: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-CA510: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-CA510: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
+; CHECK-CA520: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-CA520: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-CA520: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-CA520: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.

From 85fd83ed49604d1046e937afa76f3ad802e28822 Mon Sep 17 00:00:00 2001
From: Asher Mancinelli
Date: Fri, 4 Apr 2025 06:13:30 -0700
Subject: [PATCH 0640/1029] [flang][nfc] Use llvm memmove intrinsic over
 regular call (#134294)

Follow up to #134170. We should be using the LLVM intrinsics instead of
plain fir.calls when we can. Existing code creates a declaration for
the llvm intrinsic and a regular fir.call, which makes it hard for
consumers of the IR to find all the intrinsic calls.
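Concretely, the test updates below reduce to this before/after shape for
the generated character copy (condensed from the CHECK-line changes;
`%to`, `%from`, `%bytes`, and `%false` are placeholder SSA names):

```mlir
// Before: a declared func.func for the intrinsic plus an ordinary call.
fir.call @llvm.memmove.p0.p0.i64(%to, %from, %bytes, %false) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()

// After: the LLVM dialect intrinsic op, easy for IR consumers to find.
"llvm.intr.memmove"(%to, %from, %bytes) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
```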
---
 .../Optimizer/Builder/LowLevelIntrinsics.h    |   3 -
 flang/lib/Optimizer/Builder/Character.cpp     |  14 +-
 .../Optimizer/Builder/LowLevelIntrinsics.cpp  |  10 -
 flang/test/HLFIR/assign-codegen.fir           |   9 +-
 flang/test/HLFIR/associate-codegen.fir        |   7 +-
 .../HLFIR/char_extremum-bufferization.fir     |  45 ++-
 flang/test/HLFIR/concat-bufferization.fir     | 193 ++++++-----
 flang/test/Lower/host-associated.f90          | 311 +++++++++---------
 flang/test/Lower/optional-value-caller.f90    |  12 +-
 flang/test/Lower/pointer-references.f90       |  12 +-
 flang/test/Lower/statement-function.f90       |   6 +-
 flang/test/Lower/structure-constructors.f90   |  20 +-
 12 files changed, 315 insertions(+), 327 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h b/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h
index be106f7ea33b7..45499906e39f8 100644
--- a/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h
+++ b/flang/include/flang/Optimizer/Builder/LowLevelIntrinsics.h
@@ -24,9 +24,6 @@ class FirOpBuilder;
 
 namespace fir::factory {
 
-/// Get the LLVM intrinsic for `memmove`. Use the 64 bit version.
-mlir::func::FuncOp getLlvmMemmove(FirOpBuilder &builder);
-
 /// Get the LLVM intrinsic for `memset`. Use the 64 bit version.
 mlir::func::FuncOp getLlvmMemset(FirOpBuilder &builder);
 
diff --git a/flang/lib/Optimizer/Builder/Character.cpp b/flang/lib/Optimizer/Builder/Character.cpp
index b7a7453efdb39..844630996ccb2 100644
--- a/flang/lib/Optimizer/Builder/Character.cpp
+++ b/flang/lib/Optimizer/Builder/Character.cpp
@@ -15,6 +15,7 @@
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "llvm/Support/Debug.h"
 #include <optional>
 
@@ -335,13 +336,12 @@ void fir::factory::CharacterExprHelper::createCopy(
   auto castCount = builder.createConvert(loc, i64Ty, count);
   auto totalBytes =
       builder.create<mlir::arith::MulIOp>(loc, kindBytes, castCount);
-  auto notVolatile = builder.createBool(loc, false);
-  auto memmv = getLlvmMemmove(builder);
-  auto argTys = memmv.getFunctionType().getInputs();
-  auto toPtr = builder.createConvert(loc, argTys[0], toBuff);
-  auto fromPtr = builder.createConvert(loc, argTys[1], fromBuff);
-  builder.create<fir::CallOp>(
-      loc, memmv, mlir::ValueRange{toPtr, fromPtr, totalBytes, notVolatile});
+  auto llvmPointerType =
+      mlir::LLVM::LLVMPointerType::get(builder.getContext());
+  auto toPtr = builder.createConvert(loc, llvmPointerType, toBuff);
+  auto fromPtr = builder.createConvert(loc, llvmPointerType, fromBuff);
+  builder.create<mlir::LLVM::MemmoveOp>(loc, toPtr, fromPtr, totalBytes,
+                                        /*isVolatile=*/false);
   return;
 }
 
diff --git a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp
index e8547cf2b1e1b..73fd8fa0c2b61 100644
--- a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp
+++ b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp
@@ -21,16 +21,6 @@
 #include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 
-mlir::func::FuncOp fir::factory::getLlvmMemmove(fir::FirOpBuilder &builder) {
-  auto ptrTy = builder.getRefType(builder.getIntegerType(8));
-  llvm::SmallVector<mlir::Type> args = {ptrTy, ptrTy, builder.getI64Type(),
-                                        builder.getI1Type()};
-  auto memmoveTy =
-      mlir::FunctionType::get(builder.getContext(), args, std::nullopt);
-  return builder.createFunction(builder.getUnknownLoc(),
-                                "llvm.memmove.p0.p0.i64", memmoveTy);
-}
-
 mlir::func::FuncOp fir::factory::getLlvmMemset(fir::FirOpBuilder &builder) {
   auto ptrTy = builder.getRefType(builder.getIntegerType(8));
   llvm::SmallVector<mlir::Type> args = {ptrTy, ptrTy, builder.getI64Type(),
diff --git a/flang/test/HLFIR/assign-codegen.fir b/flang/test/HLFIR/assign-codegen.fir
index 7e03aa0bd464d..b2fe4fb9ed130 100644
--- a/flang/test/HLFIR/assign-codegen.fir
+++ b/flang/test/HLFIR/assign-codegen.fir
@@ -104,10 +104,9 @@ func.func @scalar_character(%arg0: !fir.boxchar<1>, %arg1: !fir.boxchar<1>) {
 // CHECK: %[[VAL_10:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_9]] : (index) -> i64
 // CHECK: %[[VAL_12:.*]] = arith.muli %[[VAL_10]], %[[VAL_11]] : i64
-// CHECK: %[[VAL_13:.*]] = arith.constant false
-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_14]], %[[VAL_15]], %[[VAL_12]], %[[VAL_13]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_3]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_14]], %[[VAL_15]], %[[VAL_12]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[VAL_16:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_17:.*]] = arith.subi %[[VAL_2]]#1, %[[VAL_16]] : index
 // CHECK: %[[VAL_18:.*]] = arith.constant 32 : i8
@@ -480,4 +479,4 @@ func.func @test_scalar_opt_char_box(%arg0: !fir.ref<!fir.char<1,?>>, %arg1: !fi
 // CHECK: fir.result %[[VAL_8]], %[[VAL_9]] : !fir.ref<!fir.char<1,?>>, index
 // CHECK: }
 // ...
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(
+// CHECK: "llvm.intr.memmove"
diff --git a/flang/test/HLFIR/associate-codegen.fir b/flang/test/HLFIR/associate-codegen.fir
index d2ab142744072..f7ee4fa83c9f2 100644
--- a/flang/test/HLFIR/associate-codegen.fir
+++ b/flang/test/HLFIR/associate-codegen.fir
@@ -80,10 +80,9 @@ func.func @associate_char(%arg0: !fir.boxchar<1> ) {
 // CHECK: %[[VAL_9:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_2]]#1 : (index) -> i64
 // CHECK: %[[VAL_11:.*]] = arith.muli %[[VAL_9]], %[[VAL_10]] : i64
-// CHECK: %[[VAL_12:.*]] = arith.constant false
-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_8]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_13]], %[[VAL_14]], %[[VAL_11]], %[[VAL_12]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_8]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_13]], %[[VAL_14]], %[[VAL_11]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[VAL_15:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_7]], %[[VAL_15]] : index
 // CHECK: fir.do_loop %[[VAL_17:.*]] = %[[VAL_2]]#1 to %[[VAL_16]] step %[[VAL_15]] {
diff --git a/flang/test/HLFIR/char_extremum-bufferization.fir b/flang/test/HLFIR/char_extremum-bufferization.fir
index 4ce1471ae3271..5a7b2e3193f63 100644
--- a/flang/test/HLFIR/char_extremum-bufferization.fir
+++ b/flang/test/HLFIR/char_extremum-bufferization.fir
@@ -39,10 +39,9 @@ func.func @_QPmax1(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.b
 // CHECK: %[[C1_I64:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (index) -> i64
 // CHECK: %[[VAL_20:.*]] = arith.muli %[[C1_I64]], %[[VAL_19]] : i64
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_21]], %[[VAL_22]], %[[VAL_20]], %[[FALSE]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_21]], %[[VAL_22]], %[[VAL_20]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[C1:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_7]], %[[C1]] : index
 // CHECK: %[[C32_I8:.*]] = arith.constant 32 : i8
@@ -76,7 +75,8 @@ func.func @_QPmin1(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.b
   return
 }
 
-// CHECK: func.func @_QPmin1(%[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c2"}, %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c3"}) {
+// CHECK-LABEL: func.func @_QPmin1
+// CHECK-SAME: (%[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c1"}, %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "c2"}, %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "c3"}) {
 // CHECK: %[[VAL_0:.*]]:2 = fir.unboxchar %[[ARG0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
 // CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]]#0 typeparams %[[VAL_0]]#1 {uniq_name = "_QFmin1Ec1"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
 // CHECK: %[[VAL_2:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
@@ -100,10 +100,9 @@ func.func @_QPmin1(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.b
 // CHECK: %[[C1_I64:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (index) -> i64
 // CHECK: %[[VAL_20:.*]] = arith.muli %[[C1_I64]], %[[VAL_19]] : i64
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_21]], %[[VAL_22]], %[[VAL_20]], %[[FALSE]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_21]], %[[VAL_22]], %[[VAL_20]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[C1:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_7]], %[[C1]] : index
 // CHECK: %[[C32_I8:.*]] = arith.constant 32 : i8
@@ -195,10 +194,9 @@ func.func @_QPmax2(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.b
 // CHECK: %[[C1_I64:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (index) -> i64
 // CHECK: %[[VAL_30:.*]] = arith.muli %[[C1_I64]], %[[VAL_29]] : i64
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_24]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_31]], %[[VAL_32]], %[[VAL_30]], %[[FALSE]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_24]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_31]], %[[VAL_32]], %[[VAL_30]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[C1_3:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_15]], %[[C1_3]] : index
 // CHECK: %[[C32_I8:.*]] = arith.constant 32 : i8
@@ -293,10 +291,9 @@ func.func @_QPmin2(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.b
 // CHECK: %[[C1_I64:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (index) -> i64
 // CHECK: %[[VAL_30:.*]] = arith.muli %[[C1_I64]], %[[VAL_29]] : i64
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_24]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_31]], %[[VAL_32]], %[[VAL_30]], %[[FALSE]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_24]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_31]], %[[VAL_32]], %[[VAL_30]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[C1_3:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_15]], %[[C1_3]] : index
 // CHECK: %[[C32_I8:.*]] = arith.constant 32 : i8
@@ -372,10 +369,9 @@ func.func @_QPmax3(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.b
 // CHECK: %[[C1_I64:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (index) -> i64
 // CHECK: %[[VAL_32:.*]] = arith.muli %[[C1_I64]], %[[VAL_31]] : i64
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_28]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_33]], %[[VAL_34]], %[[VAL_32]], %[[FALSE]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_28]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_33]], %[[VAL_34]], %[[VAL_32]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[C1:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_35:.*]] = arith.subi %[[VAL_19]], %[[C1]] : index
 // CHECK: %[[C32_I8:.*]] = arith.constant 32 : i8
@@ -448,10 +444,9 @@ func.func @_QPmin3(%arg0: !fir.boxchar<1> {fir.bindc_name = "c1"}, %arg1: !fir.b
 // CHECK: %[[C1_I64:.*]] = arith.constant 1 : i64
 // CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (index) -> i64
 // CHECK: %[[VAL_32:.*]] = arith.muli %[[C1_I64]], %[[VAL_31]] : i64
-// CHECK: %[[FALSE:.*]] = arith.constant false
-// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_28]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_33]], %[[VAL_34]], %[[VAL_32]], %[[FALSE]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
+// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_28]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_26]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_33]], %[[VAL_34]], %[[VAL_32]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
 // CHECK: %[[C1:.*]] = arith.constant 1 : index
 // CHECK: %[[VAL_35:.*]] = arith.subi %[[VAL_19]], %[[C1]] : index
 // CHECK: %[[C32_I8:.*]] = arith.constant 32 : i8
diff --git a/flang/test/HLFIR/concat-bufferization.fir b/flang/test/HLFIR/concat-bufferization.fir
index a68cc3c1ed2ca..5609a57134d30 100644
--- a/flang/test/HLFIR/concat-bufferization.fir
+++ b/flang/test/HLFIR/concat-bufferization.fir
@@ -16,38 +16,43 @@ func.func @concat(%arg0: !fir.boxchar<1>, %arg1: !fir.boxchar<1>, %arg2: !fir.bo
   return
 }
 // CHECK-LABEL: func.func @concat(
-// CHECK-SAME: %[[VAL_0:[^:]*]]: !fir.boxchar<1>,
-// CHECK-SAME: %[[VAL_1:[^:]*]]: !fir.boxchar<1>,
-// CHECK-SAME: %[[VAL_2:[^:]*]]: !fir.boxchar<1>) {
-// CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 typeparams %[[VAL_3]]#1 {uniq_name = "c1"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 {uniq_name = "c2"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-// CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]]#0 typeparams %[[VAL_7]]#1 {uniq_name = "c3"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_7]]#1 : index
-// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_7]]#1 : index
-// CHECK: %[[VAL_11:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_10]] : index) {bindc_name = ".chrtmp"}
-// CHECK: %[[VAL_12:.*]] = arith.constant 1 : i64
-// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64
-// CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_13]] : i64
-// CHECK: %[[VAL_15:.*]] = arith.constant false
-// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_11]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
-// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_16]], %[[VAL_17]], %[[VAL_14]], %[[VAL_15]]) : (!fir.ref<i8>, !fir.ref<i8>, i64, i1) -> ()
-// CHECK: %[[VAL_18:.*]] = arith.constant 1 : index
-// CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_10]], %[[VAL_18]] : index
-// CHECK: fir.do_loop %[[VAL_20:.*]] = %[[VAL_5]]#1 to %[[VAL_19]] step %[[VAL_18]] {
-// CHECK: %[[VAL_21:.*]] = arith.subi %[[VAL_20]], %[[VAL_5]]#1 : index
-// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_8]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-// CHECK: %[[VAL_23:.*]] = fir.coordinate_of %[[VAL_22]], %[[VAL_21]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-// CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref<!fir.char<1>>
-// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_11]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
-// CHECK: %[[VAL_26:.*]] = fir.coordinate_of %[[VAL_25]], %[[VAL_20]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
-// CHECK: fir.store %[[VAL_24]] to %[[VAL_26]] : !fir.ref<!fir.char<1>>
-// CHECK: }
-// CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_11]] typeparams %[[VAL_10]] {uniq_name = "tmp"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-// CHECK: hlfir.assign %[[VAL_27]]#0 to %[[VAL_4]]#0 : !fir.boxchar<1>, !fir.boxchar<1>
+// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.boxchar<1>,
+// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.boxchar<1>,
+// CHECK-SAME: %[[VAL_2:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.boxchar<1>) {
+// CHECK: %[[VAL_3:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]]#0 typeparams %[[VAL_3]]#1 {uniq_name = "c1"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 {uniq_name = "c2"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]]#0 typeparams %[[VAL_7]]#1 {uniq_name = "c3"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_7]]#1 : index
+// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_5]]#1, %[[VAL_7]]#1 : index
+// CHECK: %[[VAL_11:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_10]] : index) {bindc_name = ".chrtmp"}
+// CHECK: %[[VAL_12:.*]] = arith.constant 1 : i64
+// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_5]]#1 : (index) -> i64
+// CHECK: %[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_13]] : i64
+// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_11]] : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_6]]#1 : (!fir.ref<!fir.char<1,?>>) -> !llvm.ptr
+// CHECK: "llvm.intr.memmove"(%[[VAL_15]], %[[VAL_16]], %[[VAL_14]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
+// CHECK: %[[VAL_17:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_10]], %[[VAL_17]] : index
+// CHECK: fir.do_loop %[[VAL_19:.*]] = %[[VAL_5]]#1 to %[[VAL_18]] step %[[VAL_17]] {
+// CHECK: %[[VAL_20:.*]] = arith.subi %[[VAL_19]], %[[VAL_5]]#1 : index
+// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_8]]#1 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
+// CHECK: %[[VAL_22:.*]] = fir.coordinate_of %[[VAL_21]], %[[VAL_20]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref<!fir.char<1>>
+// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_11]] : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.array<?x!fir.char<1>>>
+// CHECK: %[[VAL_25:.*]] = fir.coordinate_of %[[VAL_24]], %[[VAL_19]] : (!fir.ref<!fir.array<?x!fir.char<1>>>, index) -> !fir.ref<!fir.char<1>>
+// CHECK: fir.store %[[VAL_23]] to %[[VAL_25]] : !fir.ref<!fir.char<1>>
+// CHECK: }
+// CHECK: %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_11]] typeparams %[[VAL_10]] {uniq_name = "tmp"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+// CHECK: %[[VAL_27:.*]] = arith.constant false
+// CHECK: %[[VAL_28:.*]] = fir.undefined tuple<!fir.boxchar<1>, i1>
+// CHECK: %[[VAL_29:.*]] = fir.insert_value %[[VAL_28]], %[[VAL_27]], [1 : index] : (tuple<!fir.boxchar<1>, i1>, i1) -> tuple<!fir.boxchar<1>, i1>
+// CHECK: %[[VAL_30:.*]] = fir.insert_value %[[VAL_29]], %[[VAL_26]]#0, [0 : index] : (tuple<!fir.boxchar<1>, i1>, !fir.boxchar<1>) -> tuple<!fir.boxchar<1>, i1>
+// CHECK: hlfir.assign %[[VAL_26]]#0 to %[[VAL_4]]#0 : !fir.boxchar<1>, !fir.boxchar<1>
+// CHECK: return
+// CHECK: }
 
 func.func @concat_chained(%arg0: !fir.boxchar<1>, %arg1: !fir.boxchar<1>, %arg2: !fir.boxchar<1>, %arg3: !fir.boxchar<1>) {
@@ -66,62 +71,70 @@ func.func @concat_chained(%arg0: !fir.boxchar<1>, %arg1: !fir.boxchar<1>, %arg2:
   hlfir.assign %11 to %1#0 : !hlfir.expr<!fir.char<1,?>>, !fir.boxchar<1>
   return
 }
-// CHECK-LABEL: func.func @concat_chained(
-// CHECK-SAME: %[[VAL_0:[^:]*]]: !fir.boxchar<1>,
-// CHECK-SAME: %[[VAL_1:[^:]*]]: !fir.boxchar<1>,
-// CHECK-SAME: %[[VAL_2:[^:]*]]: !fir.boxchar<1>,
-// CHECK-SAME: %[[VAL_3:[^:]*]]: !fir.boxchar<1>) {
-// CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 {uniq_name = "c1"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
-// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
-// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare
%[[VAL_6]]#0 typeparams %[[VAL_6]]#1 {uniq_name = "c2"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: %[[VAL_8:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) -// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]]#0 typeparams %[[VAL_8]]#1 {uniq_name = "c3"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: %[[VAL_10:.*]]:2 = fir.unboxchar %[[VAL_3]] : (!fir.boxchar<1>) -> (!fir.ref>, index) -// CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]]#0 typeparams %[[VAL_10]]#1 {uniq_name = "c4"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_8]]#1 : index -// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_8]]#1 : index -// CHECK: %[[VAL_14:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_13]] : index) {bindc_name = ".chrtmp"} -// CHECK: %[[VAL_15:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 -// CHECK: %[[VAL_17:.*]] = arith.muli %[[VAL_15]], %[[VAL_16]] : i64 -// CHECK: %[[VAL_18:.*]] = arith.constant false -// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.ref>) -> !fir.ref -// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_19]], %[[VAL_20]], %[[VAL_17]], %[[VAL_18]]) : (!fir.ref, !fir.ref, i64, i1) -> () -// CHECK: %[[VAL_21:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_13]], %[[VAL_21]] : index -// CHECK: fir.do_loop %[[VAL_23:.*]] = %[[VAL_6]]#1 to %[[VAL_22]] step %[[VAL_21]] { -// CHECK: %[[VAL_24:.*]] = arith.subi %[[VAL_23]], %[[VAL_6]]#1 : index -// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_9]]#1 : (!fir.ref>) -> !fir.ref>> -// CHECK: %[[VAL_26:.*]] = fir.coordinate_of %[[VAL_25]], %[[VAL_24]] : (!fir.ref>>, index) -> !fir.ref> -// CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_26]] : !fir.ref> -// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !fir.ref>> -// CHECK: %[[VAL_29:.*]] = fir.coordinate_of %[[VAL_28]], %[[VAL_23]] : (!fir.ref>>, index) -> !fir.ref> -// CHECK: fir.store %[[VAL_27]] to %[[VAL_29]] : !fir.ref> -// CHECK: } -// CHECK: %[[VAL_30:.*]]:2 = hlfir.declare %[[VAL_14]] typeparams %[[VAL_13]] {uniq_name = "tmp"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: %[[VAL_31:.*]] = arith.addi %[[VAL_12]], %[[VAL_10]]#1 : index -// CHECK: %[[VAL_32:.*]] = arith.addi %[[VAL_13]], %[[VAL_10]]#1 : index -// CHECK: %[[VAL_33:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_32]] : index) {bindc_name = ".chrtmp"} -// CHECK: %[[VAL_34:.*]] = arith.constant 1 : i64 -// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_13]] : (index) -> i64 -// CHECK: %[[VAL_36:.*]] = arith.muli %[[VAL_34]], %[[VAL_35]] : i64 -// CHECK: %[[VAL_37:.*]] = arith.constant false -// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_33]] : (!fir.ref>) -> !fir.ref -// CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_30]]#1 : (!fir.ref>) -> !fir.ref -// CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_38]], %[[VAL_39]], %[[VAL_36]], %[[VAL_37]]) : (!fir.ref, !fir.ref, i64, i1) -> () -// CHECK: %[[VAL_40:.*]] = arith.constant 1 : index -// CHECK: %[[VAL_41:.*]] = arith.subi %[[VAL_32]], %[[VAL_40]] : index -// CHECK: fir.do_loop %[[VAL_42:.*]] = %[[VAL_13]] to %[[VAL_41]] step %[[VAL_40]] { -// CHECK: %[[VAL_43:.*]] = arith.subi %[[VAL_42]], %[[VAL_13]] : index -// CHECK: %[[VAL_44:.*]] = fir.convert %[[VAL_11]]#1 : (!fir.ref>) -> !fir.ref>> -// CHECK: %[[VAL_45:.*]] = fir.coordinate_of %[[VAL_44]], %[[VAL_43]] : 
(!fir.ref>>, index) -> !fir.ref> -// CHECK: %[[VAL_46:.*]] = fir.load %[[VAL_45]] : !fir.ref> -// CHECK: %[[VAL_47:.*]] = fir.convert %[[VAL_33]] : (!fir.ref>) -> !fir.ref>> -// CHECK: %[[VAL_48:.*]] = fir.coordinate_of %[[VAL_47]], %[[VAL_42]] : (!fir.ref>>, index) -> !fir.ref> -// CHECK: fir.store %[[VAL_46]] to %[[VAL_48]] : !fir.ref> -// CHECK: } -// CHECK: %[[VAL_49:.*]]:2 = hlfir.declare %[[VAL_33]] typeparams %[[VAL_32]] {uniq_name = "tmp"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) -// CHECK: hlfir.assign %[[VAL_49]]#0 to %[[VAL_5]]#0 : !fir.boxchar<1>, !fir.boxchar<1> +// CHECK-LABEL: func.func @concat_chained( +// CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.boxchar<1>, +// CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.boxchar<1>, +// CHECK-SAME: %[[VAL_2:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.boxchar<1>, +// CHECK-SAME: %[[VAL_3:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.boxchar<1>) { +// CHECK: %[[VAL_4:.*]]:2 = fir.unboxchar %[[VAL_0]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]]#0 typeparams %[[VAL_4]]#1 {uniq_name = "c1"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[VAL_1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 {uniq_name = "c2"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_8:.*]]:2 = fir.unboxchar %[[VAL_2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]]#0 typeparams %[[VAL_8]]#1 {uniq_name = "c3"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_10:.*]]:2 = fir.unboxchar %[[VAL_3]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]]#0 typeparams %[[VAL_10]]#1 {uniq_name = "c4"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_8]]#1 : index +// CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_6]]#1, %[[VAL_8]]#1 : index +// CHECK: %[[VAL_14:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_13]] : index) {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_15:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_6]]#1 : (index) -> i64 +// CHECK: %[[VAL_17:.*]] = arith.muli %[[VAL_15]], %[[VAL_16]] : i64 +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !llvm.ptr +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.ref>) -> !llvm.ptr +// CHECK: "llvm.intr.memmove"(%[[VAL_18]], %[[VAL_19]], %[[VAL_17]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +// CHECK: %[[VAL_20:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_21:.*]] = arith.subi %[[VAL_13]], %[[VAL_20]] : index +// CHECK: fir.do_loop %[[VAL_22:.*]] = %[[VAL_6]]#1 to %[[VAL_21]] step %[[VAL_20]] { +// CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_22]], %[[VAL_6]]#1 : index +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_9]]#1 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_25:.*]] = fir.coordinate_of %[[VAL_24]], %[[VAL_23]] : (!fir.ref>>, index) -> !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_25]] : !fir.ref> +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_28:.*]] = fir.coordinate_of %[[VAL_27]], %[[VAL_22]] : (!fir.ref>>, index) -> !fir.ref> +// CHECK: fir.store %[[VAL_26]] to %[[VAL_28]] : !fir.ref> +// CHECK: } +// CHECK: %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_14]] typeparams 
%[[VAL_13]] {uniq_name = "tmp"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_30:.*]] = arith.constant false +// CHECK: %[[VAL_31:.*]] = fir.undefined tuple, i1> +// CHECK: %[[VAL_32:.*]] = fir.insert_value %[[VAL_31]], %[[VAL_30]], [1 : index] : (tuple, i1>, i1) -> tuple, i1> +// CHECK: %[[VAL_33:.*]] = fir.insert_value %[[VAL_32]], %[[VAL_29]]#0, [0 : index] : (tuple, i1>, !fir.boxchar<1>) -> tuple, i1> +// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_12]], %[[VAL_10]]#1 : index +// CHECK: %[[VAL_35:.*]] = arith.addi %[[VAL_13]], %[[VAL_10]]#1 : index +// CHECK: %[[VAL_36:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_35]] : index) {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_37:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_13]] : (index) -> i64 +// CHECK: %[[VAL_39:.*]] = arith.muli %[[VAL_37]], %[[VAL_38]] : i64 +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_36]] : (!fir.ref>) -> !llvm.ptr +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_29]]#1 : (!fir.ref>) -> !llvm.ptr +// CHECK: "llvm.intr.memmove"(%[[VAL_40]], %[[VAL_41]], %[[VAL_39]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +// CHECK: %[[VAL_42:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_43:.*]] = arith.subi %[[VAL_35]], %[[VAL_42]] : index +// CHECK: fir.do_loop %[[VAL_44:.*]] = %[[VAL_13]] to %[[VAL_43]] step %[[VAL_42]] { +// CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_44]], %[[VAL_13]] : index +// CHECK: %[[VAL_46:.*]] = fir.convert %[[VAL_11]]#1 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_47:.*]] = fir.coordinate_of %[[VAL_46]], %[[VAL_45]] : (!fir.ref>>, index) -> !fir.ref> +// CHECK: %[[VAL_48:.*]] = fir.load %[[VAL_47]] : !fir.ref> +// CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_36]] : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_50:.*]] = fir.coordinate_of %[[VAL_49]], %[[VAL_44]] : (!fir.ref>>, index) -> !fir.ref> +// CHECK: fir.store %[[VAL_48]] to %[[VAL_50]] : !fir.ref> +// CHECK: } +// CHECK: %[[VAL_51:.*]]:2 = hlfir.declare %[[VAL_36]] typeparams %[[VAL_35]] {uniq_name = "tmp"} : (!fir.ref>, index) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_52:.*]] = arith.constant false +// CHECK: %[[VAL_53:.*]] = fir.undefined tuple, i1> +// CHECK: %[[VAL_54:.*]] = fir.insert_value %[[VAL_53]], %[[VAL_52]], [1 : index] : (tuple, i1>, i1) -> tuple, i1> +// CHECK: %[[VAL_55:.*]] = fir.insert_value %[[VAL_54]], %[[VAL_51]]#0, [0 : index] : (tuple, i1>, !fir.boxchar<1>) -> tuple, i1> +// CHECK: hlfir.assign %[[VAL_51]]#0 to %[[VAL_5]]#0 : !fir.boxchar<1>, !fir.boxchar<1> +// CHECK: return +// CHECK: } diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90 index 33acdff1bb74c..d5392411e0c56 100644 --- a/flang/test/Lower/host-associated.f90 +++ b/flang/test/Lower/host-associated.f90 @@ -471,163 +471,160 @@ subroutine test_proc_dummy_other(proc) call proc(4) end subroutine test_proc_dummy_other -! CHECK-LABEL: func @_QPtest_proc_dummy_char() { -! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 10 : index -! CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : i32 -! CHECK-DAG: %[[VAL_2:.*]] = arith.constant 9 : index -! CHECK-DAG: %[[VAL_3:.*]] = arith.constant false -! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index -! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 32 : i8 -! CHECK-DAG: %[[VAL_6:.*]] = arith.constant 6 : i32 -! CHECK-DAG: %[[VAL_8:.*]] = arith.constant 10 : i64 -! CHECK-DAG: %[[VAL_9:.*]] = arith.constant 40 : index -! CHECK-DAG: %[[VAL_10:.*]] = arith.constant 0 : index -! 
CHECK: %[[VAL_11:.*]] = fir.alloca !fir.char<1,40> {bindc_name = ".result"} -! CHECK: %[[VAL_12:.*]] = fir.alloca !fir.char<1,10> {bindc_name = "message", uniq_name = "_QFtest_proc_dummy_charEmessage"} -! CHECK: %[[VAL_13:.*]] = fir.alloca tuple> -! CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_13]], %[[VAL_1]] : (!fir.ref>>, i32) -> !fir.ref> -! CHECK: %[[VAL_16:.*]] = fir.emboxchar %[[VAL_12]], %[[VAL_0]] : (!fir.ref>, index) -> !fir.boxchar<1> -! CHECK: fir.store %[[VAL_16]] to %[[VAL_14]] : !fir.ref> -! CHECK: %[[VAL_17:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref> -! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_2]] : (index) -> i64 -! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_12]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_17]] : (!fir.ref>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_19]], %[[VAL_20]], %[[VAL_18]], %[[VAL_3]]) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () -! CHECK: %[[VAL_21:.*]] = fir.undefined !fir.char<1> -! CHECK: %[[VAL_22:.*]] = fir.insert_value %[[VAL_21]], %[[VAL_5]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1> -! CHECK: br ^bb1(%[[VAL_2]], %[[VAL_4]] : index, index) -! CHECK: ^bb1(%[[VAL_23:.*]]: index, %[[VAL_24:.*]]: index): -! CHECK: %[[VAL_25:.*]] = arith.cmpi sgt, %[[VAL_24]], %[[VAL_10]] : index -! CHECK: cond_br %[[VAL_25]], ^bb2, ^bb3 -! CHECK: ^bb2: -! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_12]] : (!fir.ref>) -> !fir.ref>> -! CHECK: %[[VAL_27:.*]] = fir.coordinate_of %[[VAL_26]], %[[VAL_23]] : (!fir.ref>>, index) -> !fir.ref> -! CHECK: fir.store %[[VAL_22]] to %[[VAL_27]] : !fir.ref> -! CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_23]], %[[VAL_4]] : index -! CHECK: %[[VAL_29:.*]] = arith.subi %[[VAL_24]], %[[VAL_4]] : index -! CHECK: br ^bb1(%[[VAL_28]], %[[VAL_29]] : index, index) -! CHECK: ^bb3: -! CHECK: %[[VAL_30:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref>) -> !fir.ref -! CHECK: %[[VAL_32:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[VAL_6]], %[[VAL_31]], %{{.*}}) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref -! CHECK: %[[VAL_33:.*]] = fir.address_of(@_QFtest_proc_dummy_charPgen_message) : (!fir.ref>, index, !fir.ref>>) -> !fir.boxchar<1> -! CHECK: %[[VAL_34:.*]] = fir.emboxproc %[[VAL_33]], %[[VAL_13]] : ((!fir.ref>, index, !fir.ref>>) -> !fir.boxchar<1>, !fir.ref>>) -> !fir.boxproc<() -> ()> -! CHECK: %[[VAL_35:.*]] = fir.undefined tuple ()>, i64> -! CHECK: %[[VAL_36:.*]] = fir.insert_value %[[VAL_35]], %[[VAL_34]], [0 : index] : (tuple ()>, i64>, !fir.boxproc<() -> ()>) -> tuple ()>, i64> -! CHECK: %[[VAL_37:.*]] = fir.insert_value %[[VAL_36]], %[[VAL_8]], [1 : index] : (tuple ()>, i64>, i64) -> tuple ()>, i64> -! CHECK: %[[VAL_38:.*]] = llvm.intr.stacksave : !llvm.ptr -! CHECK: %[[VAL_39:.*]] = fir.call @_QPget_message(%[[VAL_11]], %[[VAL_9]], %[[VAL_37]]) {{.*}}: (!fir.ref>, index, tuple ()>, i64>) -> !fir.boxchar<1> -! CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_11]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_9]] : (index) -> i64 -! CHECK: %[[VAL_42:.*]] = fir.call @_FortranAioOutputAscii(%[[VAL_32]], %[[VAL_40]], %[[VAL_41]]) {{.*}}: (!fir.ref, !fir.ref, i64) -> i1 -! CHECK: llvm.intr.stackrestore %[[VAL_38]] : !llvm.ptr -! CHECK: %[[VAL_43:.*]] = fir.call @_FortranAioEndIoStatement(%[[VAL_32]]) {{.*}}: (!fir.ref) -> i32 -! CHECK: return -! CHECK: } - -! CHECK-LABEL: func private @_QFtest_proc_dummy_charPgen_message( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, -! CHECK-SAME: %[[VAL_1:.*]]: index, -! 
CHECK-SAME: %[[VAL_2:.*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { -! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i32 -! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index -! CHECK-DAG: %[[VAL_5:.*]] = arith.constant false -! CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -! CHECK-DAG: %[[VAL_7:.*]] = arith.constant 32 : i8 -! CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index -! CHECK: %[[VAL_9:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_3]] : (!fir.ref>>, i32) -> !fir.ref> -! CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_9]] : !fir.ref> -! CHECK: %[[VAL_11:.*]]:2 = fir.unboxchar %[[VAL_10]] : (!fir.boxchar<1>) -> (!fir.ref>, index) -! CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_11]]#1, %[[VAL_4]] : index -! CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_4]], %[[VAL_11]]#1 : index -! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (index) -> i64 -! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_11]]#0 : (!fir.ref>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_16]], %[[VAL_17]], %[[VAL_15]], %[[VAL_5]]) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () -! CHECK: %[[VAL_18:.*]] = fir.undefined !fir.char<1> -! CHECK: %[[VAL_19:.*]] = fir.insert_value %[[VAL_18]], %[[VAL_7]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1> -! CHECK: %[[VAL_20:.*]] = arith.subi %[[VAL_4]], %[[VAL_14]] : index -! CHECK: br ^bb1(%[[VAL_14]], %[[VAL_20]] : index, index) -! CHECK: ^bb1(%[[VAL_21:.*]]: index, %[[VAL_22:.*]]: index): -! CHECK: %[[VAL_23:.*]] = arith.cmpi sgt, %[[VAL_22]], %[[VAL_8]] : index -! CHECK: cond_br %[[VAL_23]], ^bb2, ^bb3 -! CHECK: ^bb2: -! CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref>> -! CHECK: %[[VAL_25:.*]] = fir.coordinate_of %[[VAL_24]], %[[VAL_21]] : (!fir.ref>>, index) -> !fir.ref> -! CHECK: fir.store %[[VAL_19]] to %[[VAL_25]] : !fir.ref> -! CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_21]], %[[VAL_6]] : index -! CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_22]], %[[VAL_6]] : index -! CHECK: br ^bb1(%[[VAL_26]], %[[VAL_27]] : index, index) -! CHECK: ^bb3: -! CHECK: %[[VAL_28:.*]] = fir.emboxchar %[[VAL_0]], %[[VAL_4]] : (!fir.ref>, index) -> !fir.boxchar<1> -! CHECK: return %[[VAL_28]] : !fir.boxchar<1> -! CHECK: } - -! CHECK-LABEL: func @_QPget_message( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, -! CHECK-SAME: %[[VAL_1:.*]]: index, -! CHECK-SAME: %[[VAL_2:.*]]: tuple ()>, i64> {fir.char_proc}) -> !fir.boxchar<1> { -! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 40 : index -! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 12 : index -! CHECK-DAG: %[[VAL_5:.*]] = arith.constant false -! CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -! CHECK-DAG: %[[VAL_7:.*]] = arith.constant 32 : i8 -! CHECK-DAG: %[[VAL_8:.*]] = arith.constant 0 : index -! CHECK: %[[VAL_10:.*]] = fir.address_of(@_QQclX{{.*}}) : !fir.ref> -! CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_2]], [0 : index] : (tuple ()>, i64>) -> !fir.boxproc<() -> ()> -! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.boxproc<() -> ()>) -> (() -> ()) -! CHECK: %[[VAL_13:.*]] = fir.extract_value %[[VAL_2]], [1 : index] : (tuple ()>, i64>) -> i64 -! CHECK: %[[VAL_14:.*]] = llvm.intr.stacksave : !llvm.ptr -! CHECK: %[[VAL_15:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_13]] : i64) {bindc_name = ".result"} -! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_12]] : (() -> ()) -> ((!fir.ref>, index) -> !fir.boxchar<1>) -! 
CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_13]] : (i64) -> index -! CHECK: %[[VAL_18:.*]] = fir.call %[[VAL_16]](%[[VAL_15]], %[[VAL_17]]) {{.*}}: (!fir.ref>, index) -> !fir.boxchar<1> -! CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_17]], %[[VAL_4]] : index -! CHECK: %[[VAL_20:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_19]] : index) {bindc_name = ".chrtmp"} -! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_4]] : (index) -> i64 -! CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_22]], %[[VAL_23]], %[[VAL_21]], %[[VAL_5]]) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () -! CHECK: br ^bb1(%[[VAL_4]], %[[VAL_17]] : index, index) -! CHECK: ^bb1(%[[VAL_24:.*]]: index, %[[VAL_25:.*]]: index): -! CHECK: %[[VAL_26:.*]] = arith.cmpi sgt, %[[VAL_25]], %[[VAL_8]] : index -! CHECK: cond_br %[[VAL_26]], ^bb2, ^bb3 -! CHECK: ^bb2: -! CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_24]], %[[VAL_4]] : index -! CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_15]] : (!fir.ref>) -> !fir.ref>> -! CHECK: %[[VAL_29:.*]] = fir.coordinate_of %[[VAL_28]], %[[VAL_27]] : (!fir.ref>>, index) -> !fir.ref> -! CHECK: %[[VAL_30:.*]] = fir.load %[[VAL_29]] : !fir.ref> -! CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>) -> !fir.ref>> -! CHECK: %[[VAL_32:.*]] = fir.coordinate_of %[[VAL_31]], %[[VAL_24]] : (!fir.ref>>, index) -> !fir.ref> -! CHECK: fir.store %[[VAL_30]] to %[[VAL_32]] : !fir.ref> -! CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_24]], %[[VAL_6]] : index -! CHECK: %[[VAL_34:.*]] = arith.subi %[[VAL_25]], %[[VAL_6]] : index -! CHECK: br ^bb1(%[[VAL_33]], %[[VAL_34]] : index, index) -! CHECK: ^bb3: -! CHECK: %[[VAL_35:.*]] = arith.cmpi sgt, %[[VAL_19]], %[[VAL_3]] : index -! CHECK: %[[VAL_36:.*]] = arith.select %[[VAL_35]], %[[VAL_3]], %[[VAL_19]] : index -! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (index) -> i64 -! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_38]], %[[VAL_22]], %[[VAL_37]], %[[VAL_5]]) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () -! CHECK: %[[VAL_39:.*]] = fir.undefined !fir.char<1> -! CHECK: %[[VAL_40:.*]] = fir.insert_value %[[VAL_39]], %[[VAL_7]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1> -! CHECK: %[[VAL_41:.*]] = arith.subi %[[VAL_3]], %[[VAL_36]] : index -! CHECK: br ^bb4(%[[VAL_36]], %[[VAL_41]] : index, index) -! CHECK: ^bb4(%[[VAL_42:.*]]: index, %[[VAL_43:.*]]: index): -! CHECK: %[[VAL_44:.*]] = arith.cmpi sgt, %[[VAL_43]], %[[VAL_8]] : index -! CHECK: cond_br %[[VAL_44]], ^bb5, ^bb6 -! CHECK: ^bb5: -! CHECK: %[[VAL_45:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref>> -! CHECK: %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_45]], %[[VAL_42]] : (!fir.ref>>, index) -> !fir.ref> -! CHECK: fir.store %[[VAL_40]] to %[[VAL_46]] : !fir.ref> -! CHECK: %[[VAL_47:.*]] = arith.addi %[[VAL_42]], %[[VAL_6]] : index -! CHECK: %[[VAL_48:.*]] = arith.subi %[[VAL_43]], %[[VAL_6]] : index -! CHECK: br ^bb4(%[[VAL_47]], %[[VAL_48]] : index, index) -! CHECK: ^bb6: -! CHECK: llvm.intr.stackrestore %[[VAL_14]] : !llvm.ptr -! CHECK: %[[VAL_49:.*]] = fir.emboxchar %[[VAL_0]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<1> -! CHECK: return %[[VAL_49]] : !fir.boxchar<1> -! CHECK: } +! CHECK-LABEL: func.func @_QPtest_proc_dummy_char() { +! CHECK: %[[VAL_0:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_1:.*]] = arith.constant 40 : index +! 
CHECK: %[[VAL_2:.*]] = arith.constant 10 : i64 +! CHECK: %[[VAL_4:.*]] = arith.constant 6 : i32 +! CHECK: %[[VAL_5:.*]] = arith.constant 32 : i8 +! CHECK: %[[VAL_6:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_7:.*]] = arith.constant 9 : index +! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_9:.*]] = arith.constant 10 : index +! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.char<1,40> {bindc_name = ".result"} +! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.char<1,10> {bindc_name = "message", uniq_name = "_QFtest_proc_dummy_charEmessage"} +! CHECK: %[[VAL_12:.*]] = fir.alloca tuple> +! CHECK: %[[VAL_13:.*]] = fir.coordinate_of %[[VAL_12]], %[[VAL_8]] : (!fir.ref>>, i32) -> !fir.ref> +! CHECK: %[[VAL_14:.*]] = fir.emboxchar %[[VAL_11]], %[[VAL_9]] : (!fir.ref>, index) -> !fir.boxchar<1> +! CHECK: fir.store %[[VAL_14]] to %[[VAL_13]] : !fir.ref> +! CHECK: %[[VAL_15:.*]] = fir.address_of(@{{.*}}) : !fir.ref> +! CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_7]] : (index) -> i64 +! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_11]] : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_15]] : (!fir.ref>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[VAL_17]], %[[VAL_18]], %[[VAL_16]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +! CHECK: %[[VAL_19:.*]] = fir.undefined !fir.char<1> +! CHECK: %[[VAL_20:.*]] = fir.insert_value %[[VAL_19]], %[[VAL_5]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1> +! CHECK: cf.br ^bb1(%[[VAL_7]], %[[VAL_6]] : index, index) +! CHECK: ^bb1(%[[VAL_21:.*]]: index, %[[VAL_22:.*]]: index): +! CHECK: %[[VAL_23:.*]] = arith.cmpi sgt, %[[VAL_22]], %[[VAL_0]] : index +! CHECK: cf.cond_br %[[VAL_23]], ^bb2, ^bb3 +! CHECK: ^bb2: +! CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_11]] : (!fir.ref>) -> !fir.ref>> +! CHECK: %[[VAL_25:.*]] = fir.coordinate_of %[[VAL_24]], %[[VAL_21]] : (!fir.ref>>, index) -> !fir.ref> +! CHECK: fir.store %[[VAL_20]] to %[[VAL_25]] : !fir.ref> +! CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_21]], %[[VAL_6]] : index +! CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_22]], %[[VAL_6]] : index +! CHECK: cf.br ^bb1(%[[VAL_26]], %[[VAL_27]] : index, index) +! CHECK: ^bb3: +! CHECK: %[[VAL_28:.*]] = fir.address_of(@{{.*}}) : !fir.ref> +! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (!fir.ref>) -> !fir.ref +! CHECK: %[[VAL_30:.*]] = fir.call @_FortranAioBeginExternalListOutput +! CHECK: %[[VAL_31:.*]] = fir.address_of(@_QFtest_proc_dummy_charPgen_message) : (!fir.ref>, index, !fir.ref>>) -> !fir.boxchar<1> +! CHECK: %[[VAL_32:.*]] = fir.emboxproc %[[VAL_31]], %[[VAL_12]] : ((!fir.ref>, index, !fir.ref>>) -> !fir.boxchar<1>, !fir.ref>>) -> !fir.boxproc<() -> ()> +! CHECK: %[[VAL_33:.*]] = fir.undefined tuple ()>, i64> +! CHECK: %[[VAL_34:.*]] = fir.insert_value %[[VAL_33]], %[[VAL_32]], [0 : index] : (tuple ()>, i64>, !fir.boxproc<() -> ()>) -> tuple ()>, i64> +! CHECK: %[[VAL_35:.*]] = fir.insert_value %[[VAL_34]], %[[VAL_2]], [1 : index] : (tuple ()>, i64>, i64) -> tuple ()>, i64> +! CHECK: %[[VAL_36:.*]] = llvm.intr.stacksave : !llvm.ptr +! CHECK: %[[VAL_37:.*]] = fir.call @_QPget_message(%[[VAL_10]], %[[VAL_1]], %[[VAL_35]]) +! CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref +! CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_1]] : (index) -> i64 +! CHECK: %[[VAL_40:.*]] = fir.call @_FortranAioOutputAscii +! CHECK: llvm.intr.stackrestore %[[VAL_36]] : !llvm.ptr +! CHECK: %[[VAL_41:.*]] = fir.call @_FortranAioEndIoStatement +! CHECK: return +! CHECK: } + +! 
CHECK-LABEL: func.func private @_QFtest_proc_dummy_charPgen_message( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref>, +! CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: index, +! CHECK-SAME: %[[VAL_2:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {{.*}} { +! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_4:.*]] = arith.constant 32 : i8 +! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_6:.*]] = arith.constant 10 : index +! CHECK: %[[VAL_7:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_8:.*]] = fir.coordinate_of %[[VAL_2]], %[[VAL_7]] : (!fir.ref>>, i32) -> !fir.ref> +! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_8]] : !fir.ref> +! CHECK: %[[VAL_10:.*]]:2 = fir.unboxchar %[[VAL_9]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +! CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]]#1, %[[VAL_6]] : index +! CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_6]], %[[VAL_10]]#1 : index +! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 +! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_10]]#0 : (!fir.ref>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[VAL_14]], %[[VAL_15]], %[[VAL_13]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +! CHECK: %[[VAL_16:.*]] = fir.undefined !fir.char<1> +! CHECK: %[[VAL_17:.*]] = fir.insert_value %[[VAL_16]], %[[VAL_4]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1> +! CHECK: %[[VAL_18:.*]] = arith.subi %[[VAL_6]], %[[VAL_12]] : index +! CHECK: cf.br ^bb1(%[[VAL_12]], %[[VAL_18]] : index, index) +! CHECK: ^bb1(%[[VAL_19:.*]]: index, %[[VAL_20:.*]]: index): +! CHECK: %[[VAL_21:.*]] = arith.cmpi sgt, %[[VAL_20]], %[[VAL_3]] : index +! CHECK: cf.cond_br %[[VAL_21]], ^bb2, ^bb3 +! CHECK: ^bb2: +! CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref>> +! CHECK: %[[VAL_23:.*]] = fir.coordinate_of %[[VAL_22]], %[[VAL_19]] : (!fir.ref>>, index) -> !fir.ref> +! CHECK: fir.store %[[VAL_17]] to %[[VAL_23]] : !fir.ref> +! CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_19]], %[[VAL_5]] : index +! CHECK: %[[VAL_25:.*]] = arith.subi %[[VAL_20]], %[[VAL_5]] : index +! CHECK: cf.br ^bb1(%[[VAL_24]], %[[VAL_25]] : index, index) +! CHECK: ^bb3: +! CHECK: %[[VAL_26:.*]] = fir.emboxchar %[[VAL_0]], %[[VAL_6]] : (!fir.ref>, index) -> !fir.boxchar<1> +! CHECK: return %[[VAL_26]] : !fir.boxchar<1> +! CHECK: } + +! CHECK-LABEL: func.func @_QPget_message( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref>, +! CHECK-SAME: %[[VAL_1:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: index, +! CHECK-SAME: %[[VAL_2:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: tuple ()>, i64> {fir.char_proc}) -> !fir.boxchar<1> { +! CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_4:.*]] = arith.constant 32 : i8 +! CHECK: %[[VAL_5:.*]] = arith.constant 1 : index +! CHECK: %[[VAL_6:.*]] = arith.constant 12 : index +! CHECK: %[[VAL_7:.*]] = arith.constant 40 : index +! CHECK: %[[VAL_8:.*]] = fir.address_of(@{{.*}}) : !fir.ref> +! CHECK: %[[VAL_9:.*]] = fir.extract_value %[[VAL_2]], [0 : index] : (tuple ()>, i64>) -> !fir.boxproc<() -> ()> +! CHECK: %[[VAL_10:.*]] = fir.box_addr %[[VAL_9]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_11:.*]] = fir.extract_value %[[VAL_2]], [1 : index] : (tuple ()>, i64>) -> i64 +! CHECK: %[[VAL_12:.*]] = llvm.intr.stacksave : !llvm.ptr +! 
CHECK: %[[VAL_13:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_11]] : i64) {bindc_name = ".result"} +! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_10]] : (() -> ()) -> ((!fir.ref>, index) -> !fir.boxchar<1>) +! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_11]] : (i64) -> index +! CHECK: %[[VAL_16:.*]] = fir.call %[[VAL_14]](%[[VAL_13]], %[[VAL_15]]) fastmath : (!fir.ref>, index) -> !fir.boxchar<1> +! CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_6]] : index +! CHECK: %[[VAL_18:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_17]] : index) {bindc_name = ".chrtmp"} +! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_6]] : (index) -> i64 +! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_18]] : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_8]] : (!fir.ref>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[VAL_20]], %[[VAL_21]], %[[VAL_19]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +! CHECK: cf.br ^bb1(%[[VAL_6]], %[[VAL_15]] : index, index) +! CHECK: ^bb1(%[[VAL_22:.*]]: index, %[[VAL_23:.*]]: index): +! CHECK: %[[VAL_24:.*]] = arith.cmpi sgt, %[[VAL_23]], %[[VAL_3]] : index +! CHECK: cf.cond_br %[[VAL_24]], ^bb2, ^bb3 +! CHECK: ^bb2: +! CHECK: %[[VAL_25:.*]] = arith.subi %[[VAL_22]], %[[VAL_6]] : index +! CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_13]] : (!fir.ref>) -> !fir.ref>> +! CHECK: %[[VAL_27:.*]] = fir.coordinate_of %[[VAL_26]], %[[VAL_25]] : (!fir.ref>>, index) -> !fir.ref> +! CHECK: %[[VAL_28:.*]] = fir.load %[[VAL_27]] : !fir.ref> +! CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_18]] : (!fir.ref>) -> !fir.ref>> +! CHECK: %[[VAL_30:.*]] = fir.coordinate_of %[[VAL_29]], %[[VAL_22]] : (!fir.ref>>, index) -> !fir.ref> +! CHECK: fir.store %[[VAL_28]] to %[[VAL_30]] : !fir.ref> +! CHECK: %[[VAL_31:.*]] = arith.addi %[[VAL_22]], %[[VAL_5]] : index +! CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_23]], %[[VAL_5]] : index +! CHECK: cf.br ^bb1(%[[VAL_31]], %[[VAL_32]] : index, index) +! CHECK: ^bb3: +! CHECK: %[[VAL_33:.*]] = arith.cmpi sgt, %[[VAL_17]], %[[VAL_7]] : index +! CHECK: %[[VAL_34:.*]] = arith.select %[[VAL_33]], %[[VAL_7]], %[[VAL_17]] : index +! CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_34]] : (index) -> i64 +! CHECK: %[[VAL_36:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[VAL_36]], %[[VAL_20]], %[[VAL_35]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () +! CHECK: %[[VAL_37:.*]] = fir.undefined !fir.char<1> +! CHECK: %[[VAL_38:.*]] = fir.insert_value %[[VAL_37]], %[[VAL_4]], [0 : index] : (!fir.char<1>, i8) -> !fir.char<1> +! CHECK: %[[VAL_39:.*]] = arith.subi %[[VAL_7]], %[[VAL_34]] : index +! CHECK: cf.br ^bb4(%[[VAL_34]], %[[VAL_39]] : index, index) +! CHECK: ^bb4(%[[VAL_40:.*]]: index, %[[VAL_41:.*]]: index): +! CHECK: %[[VAL_42:.*]] = arith.cmpi sgt, %[[VAL_41]], %[[VAL_3]] : index +! CHECK: cf.cond_br %[[VAL_42]], ^bb5, ^bb6 +! CHECK: ^bb5: +! CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref>> +! CHECK: %[[VAL_44:.*]] = fir.coordinate_of %[[VAL_43]], %[[VAL_40]] : (!fir.ref>>, index) -> !fir.ref> +! CHECK: fir.store %[[VAL_38]] to %[[VAL_44]] : !fir.ref> +! CHECK: %[[VAL_45:.*]] = arith.addi %[[VAL_40]], %[[VAL_5]] : index +! CHECK: %[[VAL_46:.*]] = arith.subi %[[VAL_41]], %[[VAL_5]] : index +! CHECK: cf.br ^bb4(%[[VAL_45]], %[[VAL_46]] : index, index) +! CHECK: ^bb6: +! CHECK: llvm.intr.stackrestore %[[VAL_12]] : !llvm.ptr +! CHECK: %[[VAL_47:.*]] = fir.emboxchar %[[VAL_0]], %[[VAL_7]] : (!fir.ref>, index) -> !fir.boxchar<1> +! CHECK: return %[[VAL_47]] : !fir.boxchar<1> +! 
CHECK: } subroutine test_proc_dummy_char character(40) get_message diff --git a/flang/test/Lower/optional-value-caller.f90 b/flang/test/Lower/optional-value-caller.f90 index d3ad5cf85e6b9..8c3a90ba0e4ff 100644 --- a/flang/test/Lower/optional-value-caller.f90 +++ b/flang/test/Lower/optional-value-caller.f90 @@ -365,9 +365,9 @@ subroutine test_char(c) ! CHECK: %[[VAL_4:.*]] = arith.select %[[VAL_2]], %[[VAL_1]]#1, %[[VAL_3]] : index ! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_4]] : index) {adapt.valuebyref} ! CHECK: %[[VAL_6:.*]] = fir.if %[[VAL_2]] -> (!fir.ref>) { -! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_13]], %[[VAL_14]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () +! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_1]]#0 : (!fir.ref>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[VAL_13]], %[[VAL_14]], %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: fir.result %[[VAL_5]] : !fir.ref> ! CHECK: } else { ! CHECK: %[[VAL_24:.*]] = fir.absent !fir.ref> @@ -394,9 +394,9 @@ subroutine test_char_ptr(c) ! CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_5]], %[[VAL_7]], %[[VAL_9]] : index ! CHECK: %[[VAL_11:.*]] = fir.alloca !fir.char<1,?>(%[[VAL_10]] : index) {adapt.valuebyref} ! CHECK: %[[VAL_12:.*]] = fir.if %[[VAL_5]] -> (!fir.ref>) { -! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_11]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_8]] : (!fir.ptr>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_19]], %[[VAL_20]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () +! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_11]] : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_8]] : (!fir.ptr>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[VAL_19]], %[[VAL_20]], %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: fir.result %[[VAL_11]] : !fir.ref> ! CHECK: } else { ! CHECK: %[[VAL_30:.*]] = fir.absent !fir.ref> diff --git a/flang/test/Lower/pointer-references.f90 b/flang/test/Lower/pointer-references.f90 index 02394e7ec76b0..cac06dc432895 100644 --- a/flang/test/Lower/pointer-references.f90 +++ b/flang/test/Lower/pointer-references.f90 @@ -32,17 +32,17 @@ subroutine char_ptr(p) ! CHECK-DAG: %[[one:.*]] = arith.constant 1 ! CHECK-DAG: %[[size:.*]] = fir.convert %{{.*}} : (index) -> i64 ! CHECK: %[[count:.*]] = arith.muli %[[one]], %[[size]] : i64 - ! CHECK: %[[dst:.*]] = fir.convert %[[addr]] : (!fir.ptr>) -> !fir.ref - ! CHECK: %[[src:.*]] = fir.convert %[[str]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[dst]], %[[src]], %{{[0-9]+}}, %false) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: %[[dst:.*]] = fir.convert %[[addr]] : (!fir.ptr>) -> !llvm.ptr + ! CHECK: %[[src:.*]] = fir.convert %[[str]] : (!fir.ref>) -> !llvm.ptr + ! CHECK: "llvm.intr.memmove"(%[[dst]], %[[src]], %{{[0-9]+}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () p = "hello world!" ! CHECK: %[[boxload2:.*]] = fir.load %[[arg0]] ! CHECK: %[[addr2:.*]] = fir.box_addr %[[boxload2]] ! CHECK: %[[count:.*]] = arith.muli %{{.*}}, %{{.*}} : i64 - ! CHECK: %[[dst:.*]] = fir.convert %{{.*}} : (!fir.ref>) -> !fir.ref - ! CHECK: %[[src:.*]] = fir.convert %[[addr2]] : (!fir.ptr>) -> !fir.ref - ! 
CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[dst]], %[[src]], %[[count]], %{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: %[[dst:.*]] = fir.convert %{{.*}} : (!fir.ref>) -> !llvm.ptr + ! CHECK: %[[src:.*]] = fir.convert %[[addr2]] : (!fir.ptr>) -> !llvm.ptr + ! CHECK: "llvm.intr.memmove"(%[[dst]], %[[src]], %[[count]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () x = p end subroutine diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90 index 96d39f9ce0d23..cfec06c35baa8 100644 --- a/flang/test/Lower/statement-function.f90 +++ b/flang/test/Lower/statement-function.f90 @@ -170,9 +170,9 @@ subroutine truncate_arg ! CHECK: %[[c1:.*]] = arith.constant 1 : i64 ! CHECK: %[[select_i64:.*]] = fir.convert %[[select]] : (index) -> i64 ! CHECK: %[[length:.*]] = arith.muli %[[c1]], %[[select_i64]] : i64 -! CHECK: %[[cast_temp_i8:.*]] = fir.convert %[[temp]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[cast_arg_i8:.*]] = fir.convert %[[cast_arg]] : (!fir.ref>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[cast_temp_i8]], %[[cast_arg_i8]], %[[length]], %{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () +! CHECK: %[[cast_temp_i8:.*]] = fir.convert %[[temp]] : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[cast_arg_i8:.*]] = fir.convert %[[cast_arg]] : (!fir.ref>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[cast_temp_i8]], %[[cast_arg_i8]], %[[length]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: %[[c1_i64:.*]] = arith.constant 1 : i64 ! CHECK: %[[ub:.*]] = arith.subi %[[c10]], %[[c1_i64]] : i64 ! CHECK: %[[ub_index:.*]] = fir.convert %[[ub]] : (i64) -> index diff --git a/flang/test/Lower/structure-constructors.f90 b/flang/test/Lower/structure-constructors.f90 index 171c8eb631f6e..5641a370e0ae4 100644 --- a/flang/test/Lower/structure-constructors.f90 +++ b/flang/test/Lower/structure-constructors.f90 @@ -48,9 +48,9 @@ subroutine test_char_scalar(x) ! CHECK: %[[ccoor:.*]] = fir.coordinate_of %[[tmp]], c : (!fir.ref}>>) -> !fir.ref> ! CHECK: %[[cst:.*]] = fir.address_of(@_QQ{{.*}}) : !fir.ref> - ! CHECK-DAG: %[[ccast:.*]] = fir.convert %[[ccoor]] : (!fir.ref>) -> !fir.ref - ! CHECK-DAG: %[[cstcast:.*]] = fir.convert %[[cst]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[ccast]], %[[cstcast]], %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK-DAG: %[[ccast:.*]] = fir.convert %[[ccoor]] : (!fir.ref>) -> !llvm.ptr + ! CHECK-DAG: %[[cstcast:.*]] = fir.convert %[[cst]] : (!fir.ref>) -> !llvm.ptr + ! CHECK: "llvm.intr.memmove"(%[[ccast]], %[[cstcast]], %{{.*}}) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () real :: x call print_char_scalar(t_char_scalar(x=x, c="abc")) end subroutine @@ -105,10 +105,9 @@ subroutine test_char_array(x, c1) ! CHECK: %[[VAL_26:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (index) -> i64 ! CHECK: %[[VAL_28:.*]] = arith.muli %[[VAL_26]], %[[VAL_27]] : i64 - ! CHECK: %[[VAL_29:.*]] = arith.constant false - ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_24]] : (!fir.ref>) -> !fir.ref - ! CHECK: %[[VAL_31:.*]] = fir.convert %[[char_temp]] : (!fir.ref>) -> !fir.ref - ! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_30]], %[[VAL_31]], %[[VAL_28]], %[[VAL_29]]) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () + ! CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_24]] : (!fir.ref>) -> !llvm.ptr + ! CHECK: %[[VAL_31:.*]] = fir.convert %[[char_temp]] : (!fir.ref>) -> !llvm.ptr + ! 
CHECK: "llvm.intr.memmove"(%[[VAL_30]], %[[VAL_31]], %[[VAL_28]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: %[[VAL_32:.*]] = fir.array_amend %[[VAL_22]], %[[VAL_24]] : (!fir.array<5x!fir.char<1,3>>, !fir.ref>) -> !fir.array<5x!fir.char<1,3>> ! CHECK: fir.result %[[VAL_32]] : !fir.array<5x!fir.char<1,3>> ! CHECK: } @@ -287,10 +286,9 @@ subroutine test_parent_component1() ! CHECK: %[[VAL_14:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_13]] : (index) -> i64 ! CHECK: %[[VAL_16:.*]] = arith.muli %[[VAL_14]], %[[VAL_15]] : i64 -! CHECK: %[[VAL_17:.*]] = arith.constant false -! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_12]] : (!fir.ref>) -> !fir.ref -! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !fir.ref -! CHECK: fir.call @llvm.memmove.p0.p0.i64(%[[VAL_18]], %[[VAL_19]], %[[VAL_16]], %[[VAL_17]]) {{.*}}: (!fir.ref, !fir.ref, i64, i1) -> () +! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_12]] : (!fir.ref>) -> !llvm.ptr +! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_10]] : (!fir.ref>) -> !llvm.ptr +! CHECK: "llvm.intr.memmove"(%[[VAL_18]], %[[VAL_19]], %[[VAL_16]]) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () ! CHECK: %[[VAL_21:.*]] = fir.coordinate_of %[[VAL_0]], mask : (!fir.ref,mask:!fir.logical<4>}>>) -> !fir.ref> ! CHECK: %[[VAL_22:.*]] = arith.constant true ! CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i1) -> !fir.logical<4> From 6966b4f4a57be101150fc714f17e8d534db8e79d Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 4 Apr 2025 06:23:17 -0700 Subject: [PATCH 0641/1029] [mlir][arith] Remove func patterns from `populateArithWideIntEmulationPatterns` (#134316) This function should populate only patterns that are related to wide integer operation emulation. --- mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp index 22babe8123617..d5d1559c658ff 100644 --- a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp @@ -1208,6 +1208,12 @@ struct EmulateWideIntPass final RewritePatternSet patterns(ctx); arith::populateArithWideIntEmulationPatterns(typeConverter, patterns); + // Populate `func.*` conversion patterns. + populateFunctionOpInterfaceTypeConversionPattern( + patterns, typeConverter); + populateCallOpTypeConversionPattern(patterns, typeConverter); + populateReturnOpTypeConversionPattern(patterns, typeConverter); + if (failed(applyPartialConversion(op, target, std::move(patterns)))) signalPassFailure(); } @@ -1281,12 +1287,6 @@ arith::WideIntEmulationConverter::WideIntEmulationConverter( void arith::populateArithWideIntEmulationPatterns( const WideIntEmulationConverter &typeConverter, RewritePatternSet &patterns) { - // Populate `func.*` conversion patterns. - populateFunctionOpInterfaceTypeConversionPattern(patterns, - typeConverter); - populateCallOpTypeConversionPattern(patterns, typeConverter); - populateReturnOpTypeConversionPattern(patterns, typeConverter); - // Populate `arith.*` conversion patterns. patterns.add< // Misc ops. From 77cfa38dcbe100ac26b98309bbf7cf346a75c2e5 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Fri, 4 Apr 2025 22:26:46 +0900 Subject: [PATCH 0642/1029] [DirectX][TTI] Sort switch statements. 
NFC (#134379) --- .../Target/DirectX/DirectXTargetTransformInfo.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index ba656dc737140..765e1977041b9 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -38,18 +38,18 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( Intrinsic::ID ID) const { switch (ID) { + case Intrinsic::dx_asdouble: + case Intrinsic::dx_firstbitlow: + case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_frac: case Intrinsic::dx_rsqrt: + case Intrinsic::dx_splitdouble: + case Intrinsic::dx_wave_readlane: case Intrinsic::dx_wave_reduce_max: - case Intrinsic::dx_wave_reduce_umax: case Intrinsic::dx_wave_reduce_sum: + case Intrinsic::dx_wave_reduce_umax: case Intrinsic::dx_wave_reduce_usum: - case Intrinsic::dx_wave_readlane: - case Intrinsic::dx_asdouble: - case Intrinsic::dx_splitdouble: - case Intrinsic::dx_firstbituhigh: - case Intrinsic::dx_firstbitshigh: - case Intrinsic::dx_firstbitlow: return true; default: return false; From fb9deab74e5dc9a9227732fcd95c1aadacf86d44 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 4 Apr 2025 09:27:27 -0400 Subject: [PATCH 0643/1029] Add additional test coverage for WG14 N3042 This addresses a post-commit request for some additional tests --- clang/test/C/C23/n3042.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/test/C/C23/n3042.c b/clang/test/C/C23/n3042.c index fdcb48eb1322a..56208419fb6d3 100644 --- a/clang/test/C/C23/n3042.c +++ b/clang/test/C/C23/n3042.c @@ -83,6 +83,9 @@ void test() { (float)null_val; // expected-error {{cannot cast an object of type 'nullptr_t' to 'float'}} (float)nullptr; // expected-error {{cannot cast an object of type 'nullptr_t' to 'float'}} (nullptr_t)(int *)12; // expected-error {{cannot cast an object of type 'int *' to 'nullptr_t'}} + (nullptr_t)"testing"; // expected-error {{cannot cast an object of type 'char *' to 'nullptr_t'}} + (nullptr_t)1.0f; // expected-error {{cannot cast an object of type 'float' to 'nullptr_t'}} + (nullptr_t)'a'; // expected-error {{cannot cast an object of type 'int' to 'nullptr_t'}} (void)null_val; // ok (void)nullptr; // ok From fd6260f13bfecfb13537e184f4e8365cf35492fd Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 4 Apr 2025 14:34:08 +0100 Subject: [PATCH 0644/1029] [EquivClasses] Shorten members_{begin,end} idiom (#134373) Introduce members() iterator-helper to shorten the members_{begin,end} idiom. A previous attempt of this patch was #130319, which had to be reverted due to unit-test failures when attempting to call members() on the end iterator. In this patch, members() accepts either an ECValue or an ElemTy, which is more intuitive and doesn't suffer from the same issue. 
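For illustration only (not part of this patch): a minimal sketch of the
migration at a typical call site, assuming an EquivalenceClasses<int> named
EC, an iterator E over its classes, and a placeholder use() function:

  // Before: the explicit member_begin/member_end iterator-pair idiom.
  for (auto MI = EC.member_begin(*E), ME = EC.member_end(); MI != ME; ++MI)
    use(*MI);

  // After: range-based iteration over the new members() helper, which
  // accepts either an ECValue (as here, via *E) or an element value
  // directly, e.g. EC.members(1).
  for (int M : EC.members(*E))
    use(M);

Taking the ECValue or element directly, rather than an iterator, sidesteps
the end-iterator hazard that forced the revert of the earlier attempt in
#130319.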
--- llvm/include/llvm/ADT/EquivalenceClasses.h | 9 +++++++++ llvm/lib/Analysis/LoopAccessAnalysis.cpp | 5 ++--- llvm/lib/Analysis/VectorUtils.cpp | 6 +++--- llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp | 5 ++--- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 13 ++++++------- llvm/lib/Transforms/Scalar/Float2Int.cpp | 10 ++++------ llvm/unittests/ADT/EquivalenceClassesTest.cpp | 13 +++++++++++++ 7 files changed, 39 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index 906971baf74af..ad1f385cd9414 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -16,6 +16,7 @@ #define LLVM_ADT_EQUIVALENCECLASSES_H #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include #include #include @@ -184,6 +185,14 @@ class EquivalenceClasses { return member_iterator(nullptr); } + iterator_range members(const ECValue &ECV) const { + return make_range(member_begin(ECV), member_end()); + } + + iterator_range members(const ElemTy &V) const { + return make_range(findLeader(V), member_end()); + } + /// Returns true if \p V is contained an equivalence class. bool contains(const ElemTy &V) const { return TheMapping.find(V) != TheMapping.end(); diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 47ff31b9a0525..a37ed5c706bdb 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -526,9 +526,8 @@ void RuntimePointerChecking::groupChecks( // iteration order within an equivalence class member is only dependent on // the order in which unions and insertions are performed on the // equivalence class, the iteration order is deterministic. - for (auto MI = DepCands.findLeader(Access), ME = DepCands.member_end(); - MI != ME; ++MI) { - auto PointerI = PositionMap.find(MI->getPointer()); + for (auto M : DepCands.members(Access)) { + auto PointerI = PositionMap.find(M.getPointer()); assert(PointerI != PositionMap.end() && "pointer in equivalence class not found in PositionMap"); for (unsigned Pointer : PointerI->second) { diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 663b961da848d..46f588f4c6705 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -847,7 +847,7 @@ llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, if (!E->isLeader()) continue; uint64_t LeaderDemandedBits = 0; - for (Value *M : make_range(ECs.member_begin(*E), ECs.member_end())) + for (Value *M : ECs.members(*E)) LeaderDemandedBits |= DBits[M]; uint64_t MinBW = llvm::bit_width(LeaderDemandedBits); @@ -859,7 +859,7 @@ llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, // indvars. // If we are required to shrink a PHI, abandon this entire equivalence class. 
bool Abort = false; - for (Value *M : make_range(ECs.member_begin(*E), ECs.member_end())) + for (Value *M : ECs.members(*E)) if (isa(M) && MinBW < M->getType()->getScalarSizeInBits()) { Abort = true; break; @@ -867,7 +867,7 @@ llvm::computeMinimumValueSizes(ArrayRef Blocks, DemandedBits &DB, if (Abort) continue; - for (Value *M : make_range(ECs.member_begin(*E), ECs.member_end())) { + for (Value *M : ECs.members(*E)) { auto *MI = dyn_cast(M); if (!MI) continue; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp index 32472201cf9c2..dd3bec774ec67 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSplitModule.cpp @@ -1021,9 +1021,8 @@ void RecursiveSearchSplitting::setupWorkList() { continue; BitVector Cluster = SG.createNodesBitVector(); - for (auto MI = NodeEC.member_begin(*Node); MI != NodeEC.member_end(); - ++MI) { - const SplitGraph::Node &N = SG.getNode(*MI); + for (unsigned M : NodeEC.members(*Node)) { + const SplitGraph::Node &N = SG.getNode(M); if (N.isGraphEntryPoint()) N.getDependencies(Cluster); } diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index fcd8918f1d9d7..7cf7d74acfcfa 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -2349,14 +2349,13 @@ bool LowerTypeTestsModule::lower() { std::vector TypeIds; std::vector Globals; std::vector ICallBranchFunnels; - for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(*C); - MI != GlobalClasses.member_end(); ++MI) { - if (isa(*MI)) - TypeIds.push_back(cast(*MI)); - else if (isa(*MI)) - Globals.push_back(cast(*MI)); + for (auto M : GlobalClasses.members(*C)) { + if (isa(M)) + TypeIds.push_back(cast(M)); + else if (isa(M)) + Globals.push_back(cast(M)); else - ICallBranchFunnels.push_back(cast(*MI)); + ICallBranchFunnels.push_back(cast(M)); } // Order type identifiers by unique ID for determinism. This ordering is diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp index 927877b3135e5..14686ce8c2ab6 100644 --- a/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -320,10 +320,8 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) { Type *ConvertedToTy = nullptr; // For every member of the partition, union all the ranges together. 
-    for (auto MI = ECs.member_begin(*E), ME = ECs.member_end(); MI != ME;
-         ++MI) {
-      Instruction *I = *MI;
-      auto SeenI = SeenInsts.find(I);
+    for (Instruction *I : ECs.members(*E)) {
+      auto *SeenI = SeenInsts.find(I);
       if (SeenI == SeenInsts.end())
         continue;
@@ -391,8 +389,8 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
       }
     }

-    for (auto MI = ECs.member_begin(*E), ME = ECs.member_end(); MI != ME; ++MI)
-      convert(*MI, Ty);
+    for (Instruction *I : ECs.members(*E))
+      convert(I, Ty);
     MadeChange = true;
   }
diff --git a/llvm/unittests/ADT/EquivalenceClassesTest.cpp b/llvm/unittests/ADT/EquivalenceClassesTest.cpp
index bfb7c8d185fc8..2f9c441cde5c7 100644
--- a/llvm/unittests/ADT/EquivalenceClassesTest.cpp
+++ b/llvm/unittests/ADT/EquivalenceClassesTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/ADT/EquivalenceClasses.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"

 using namespace llvm;
@@ -75,6 +76,18 @@ TEST(EquivalenceClassesTest, TwoSets) {
       EXPECT_FALSE(EqClasses.isEquivalent(i, j));
 }

+TEST(EquivalenceClassesTest, MembersIterator) {
+  EquivalenceClasses<int> EC;
+  EC.unionSets(1, 2);
+  EC.insert(4);
+  EC.insert(5);
+  EC.unionSets(5, 1);
+  EXPECT_EQ(EC.getNumClasses(), 2u);
+
+  EXPECT_THAT(EC.members(4), testing::ElementsAre(4));
+  EXPECT_THAT(EC.members(1), testing::ElementsAre(5, 1, 2));
+}
+
 // Type-parameterized tests: Run the same test cases with different element
 // types.
 template <typename T> class ParameterizedTest : public testing::Test {};

From ecd4c0857b69e2c3f592d805bafde8e9f6a19005 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Fri, 4 Apr 2025 21:34:45 +0800
Subject: [PATCH 0645/1029] [Verifier] Require that dbg.declare variable is a ptr (#134355)

As far as I understand, the first operand of dbg_declare should be a
pointer (inside a metadata wrapper). However, using a non-pointer is
currently not rejected, and we have some tests that use non-pointer
types. As far as I can tell, these tests were either meant to use
dbg_value or are just incorrect hand-crafted tests.

Ran into this while trying to `fix` #134008.
---
 llvm/docs/SourceLevelDebugging.rst             |  8 ++++----
 llvm/lib/IR/Verifier.cpp                       |  8 ++++++--
 llvm/test/CodeGen/AArch64/fast-isel-dbg.ll     |  4 ++--
 .../test/CodeGen/AArch64/selectiondag-order.ll |  4 ++--
 llvm/test/CodeGen/MIR/X86/diexpr-win32.mir     |  2 +-
 llvm/test/CodeGen/X86/selectiondag-order.ll    |  4 ++--
 llvm/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll |  4 ++--
 llvm/test/Transforms/Coroutines/coro-debug.ll  | 18 +++++++++---------
 .../Transforms/LoopVectorize/discriminator.ll  |  4 ++--
 9 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst
index c1a95efd2d8bc..b3007756a8d07 100644
--- a/llvm/docs/SourceLevelDebugging.rst
+++ b/llvm/docs/SourceLevelDebugging.rst
@@ -208,10 +208,10 @@ comma-separated arguments in parentheses, as with a `call`.
   #dbg_declare([Value|MDNode], DILocalVariable, DIExpression, DILocation)

 This record provides information about a local element (e.g., variable).
-The first argument is an SSA value corresponding to a variable address, and is
-typically a static alloca in the function entry block. The second argument is a
-`local variable `_ containing a description of
-the variable. The third argument is a `complex expression
+The first argument is an SSA ``ptr`` value corresponding to a variable address,
+and is typically a static alloca in the function entry block. The second
+argument is a `local variable `_ containing a
+description of the variable. The third argument is a `complex expression
 `_. The fourth argument is a `source location
 `_. A ``#dbg_declare`` record describes the *address*
 of a source variable.
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 7c6cd414554e3..7423e746dfa9a 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6666,10 +6666,14 @@ void Verifier::visit(DbgVariableRecord &DVR) {
   CheckDI(MD && (isa<ValueAsMetadata>(MD) || isa<DIArgList>(MD) ||
                  (isa<MDNode>(MD) && !cast<MDNode>(MD)->getNumOperands())),
           "invalid #dbg record address/value", &DVR, MD);
-  if (auto *VAM = dyn_cast<ValueAsMetadata>(MD))
+  if (auto *VAM = dyn_cast<ValueAsMetadata>(MD)) {
     visitValueAsMetadata(*VAM, F);
-  else if (auto *AL = dyn_cast<DIArgList>(MD))
+    if (DVR.isDbgDeclare())
+      CheckDI(VAM->getValue()->getType()->isPointerTy(),
+              "location of #dbg_declare must be a pointer", &DVR, MD);
+  } else if (auto *AL = dyn_cast<DIArgList>(MD)) {
     visitDIArgList(*AL, F);
+  }

   CheckDI(isa_and_nonnull<DILocalVariable>(DVR.getRawVariable()),
           "invalid #dbg record variable", &DVR, DVR.getRawVariable());
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-dbg.ll b/llvm/test/CodeGen/AArch64/fast-isel-dbg.ll
index d17c14747b942..ac6fa71e9a9e4 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-dbg.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-dbg.ll
@@ -6,8 +6,8 @@ target triple="aarch64--"

 ; CHECK-LABEL: name: func
 ; CHECK: DBG_VALUE
-define void @func(i32 %a) !dbg !4 {
-  call void @llvm.dbg.declare(metadata i32 %a, metadata !5, metadata !DIExpression()), !dbg !7
+define void @func(ptr %a) !dbg !4 {
+  call void @llvm.dbg.declare(metadata ptr %a, metadata !5, metadata !DIExpression()), !dbg !7
   ret void
 }

diff --git a/llvm/test/CodeGen/AArch64/selectiondag-order.ll b/llvm/test/CodeGen/AArch64/selectiondag-order.ll
index fb40653723fec..32534fa79a34a 100644
--- a/llvm/test/CodeGen/AArch64/selectiondag-order.ll
+++ b/llvm/test/CodeGen/AArch64/selectiondag-order.ll
@@ -53,10 +53,10 @@ end: ; preds = %body

 ; AARCH64-CHECK: BB1_1:

-define i64 @simulateWithDbgDeclare(<2 x i32> %a) local_unnamed_addr {
+define i64 @simulateWithDbgDeclare(<2 x i32> %a, ptr %ptr) local_unnamed_addr {
 entry:
   %rand = tail call i64 @lrand48() #3
-  tail call void @llvm.dbg.declare(metadata i64 %rand, metadata !6, metadata !7), !dbg !8
+  tail call void @llvm.dbg.declare(metadata ptr %ptr, metadata !6, metadata !7), !dbg !8
   br label %body

 body: ; preds = %body, %entry
diff --git a/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir b/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir
index b1bcf24f8c5f4..d8d76758a08a0 100644
--- a/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir
+++ b/llvm/test/CodeGen/MIR/X86/diexpr-win32.mir
@@ -82,7 +82,7 @@ entry:
     %0 = bitcast ptr %s to ptr
     %bytes = load i32, ptr %0, !dbg !34
-    call void @llvm.dbg.declare(metadata i32 %bytes, metadata !35, metadata !28), !dbg !34
+    call void @llvm.dbg.value(metadata i32 %bytes, metadata !35, metadata !28), !dbg !34
     %1 = add i32 %bytes, %acc, !dbg !36
     ret i32 %1, !dbg !36
   }
diff --git a/llvm/test/CodeGen/X86/selectiondag-order.ll b/llvm/test/CodeGen/X86/selectiondag-order.ll
index 163e2cb90b2fe..63ab06e660550 100644
--- a/llvm/test/CodeGen/X86/selectiondag-order.ll
+++ b/llvm/test/CodeGen/X86/selectiondag-order.ll
@@ -53,10 +53,10 @@ end: ; preds = %body
 ; X86-CHECK: callq lrand48
 ; X86-CHECK: movq %rax, %rbx

-define i64 @simulateWithDbgDeclare(<2 x i32> %a) local_unnamed_addr {
+define i64 @simulateWithDbgDeclare(<2 x i32> %a, ptr %ptr) local_unnamed_addr {
 entry:
   %rand = tail call i64 @lrand48() #3
-  tail call void
@llvm.dbg.declare(metadata i64 %rand, metadata !6, metadata !7), !dbg !8 + tail call void @llvm.dbg.declare(metadata ptr %ptr, metadata !6, metadata !7), !dbg !8 br label %body body: ; preds = %body, %entry diff --git a/llvm/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll b/llvm/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll index 94b527a445d3a..35b7b044abb55 100644 --- a/llvm/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll +++ b/llvm/test/DebugInfo/ARM/lowerbdgdeclare_vla.ll @@ -19,9 +19,9 @@ target triple = "thumbv7-apple-ios8.0.0" ; Function Attrs: nounwind optsize readnone define void @run(float %r) #0 !dbg !4 { entry: - tail call void @llvm.dbg.declare(metadata float %r, metadata !11, metadata !DIExpression()), !dbg !22 + tail call void @llvm.dbg.value(metadata float %r, metadata !11, metadata !DIExpression()), !dbg !22 %conv = fptosi float %r to i32, !dbg !23 - tail call void @llvm.dbg.declare(metadata i32 %conv, metadata !12, metadata !DIExpression()), !dbg !23 + tail call void @llvm.dbg.value(metadata i32 %conv, metadata !12, metadata !DIExpression()), !dbg !23 %vla = alloca float, i32 %conv, align 4, !dbg !24 tail call void @llvm.dbg.declare(metadata ptr %vla, metadata !14, metadata !DIExpression(DW_OP_deref)), !dbg !24 ; The VLA alloca should be described by a dbg.declare: diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll index 3a4eeddbe6198..17a0b80c5b5e5 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug.ll @@ -29,8 +29,8 @@ sw.bb: ; preds = %entry %direct = load i32, ptr %x.addr, align 4, !dbg !14 %gep = getelementptr inbounds [16 x i8], ptr undef, i32 %direct, !dbg !14 call void @llvm.dbg.declare(metadata ptr %gep, metadata !27, metadata !13), !dbg !14 - call void @llvm.dbg.declare(metadata i32 %conv, metadata !26, metadata !13), !dbg !14 - call void @llvm.dbg.declare(metadata i32 %direct, metadata !25, metadata !13), !dbg !14 + call void @llvm.dbg.value(metadata i32 %conv, metadata !26, metadata !13), !dbg !14 + call void @llvm.dbg.value(metadata i32 %direct, metadata !25, metadata !13), !dbg !14 call void @llvm.dbg.declare(metadata ptr %x.addr, metadata !12, metadata !13), !dbg !14 call void @llvm.dbg.declare(metadata ptr %coro_hdl, metadata !15, metadata !13), !dbg !16 call void @llvm.dbg.declare(metadata ptr %late_local, metadata !29, metadata !13), !dbg !16 @@ -66,7 +66,7 @@ coro_Cleanup: ; preds = %sw.epilog, %sw.bb1 %5 = load ptr, ptr %coro_hdl, align 8, !dbg !24 %6 = call ptr @llvm.coro.free(token %0, ptr %5), !dbg !24 call void @free(ptr %6), !dbg !24 - call void @llvm.dbg.declare(metadata i32 %asm_res, metadata !32, metadata !13), !dbg !16 + call void @llvm.dbg.value(metadata i32 %asm_res, metadata !32, metadata !13), !dbg !16 br label %coro_Suspend, !dbg !24 coro_Suspend: ; preds = %coro_Cleanup, %sw.default @@ -176,14 +176,14 @@ attributes #7 = { noduplicate } ; CHECK: %[[DBG_PTR:.*]] = alloca ptr ; CHECK: #dbg_declare(ptr %[[DBG_PTR]], ![[RESUME_COROHDL:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, ; CHECK: #dbg_declare(ptr %[[DBG_PTR]], ![[RESUME_X:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, [[EXPR_TAIL:.*]]) -; CHECK: #dbg_declare(ptr %[[DBG_PTR]], ![[RESUME_DIRECT:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, [[EXPR_TAIL]]) ; CHECK: store ptr {{.*}}, ptr %[[DBG_PTR]] ; CHECK-NOT: alloca ptr -; CHECK: #dbg_declare(i8 0, ![[RESUME_CONST:[0-9]+]], !DIExpression(DW_OP_LLVM_convert, 8, DW_ATE_signed, DW_OP_LLVM_convert, 32, 
DW_ATE_signed),
+; CHECK: call void @coro.devirt.trigger(ptr null)
+; CHECK: #dbg_value(i8 0, ![[RESUME_CONST:[0-9]+]], !DIExpression(DW_OP_LLVM_convert, 8, DW_ATE_signed, DW_OP_LLVM_convert, 32, DW_ATE_signed),
+; CHECK: #dbg_value(ptr %[[DBG_PTR]], ![[RESUME_DIRECT:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref),
 ; Note that keeping the undef value here could be acceptable, too.
 ; CHECK-NOT: #dbg_declare(ptr undef, !{{[0-9]+}}, !DIExpression(),
-; CHECK: call void @coro.devirt.trigger(ptr null)
-; CHECK: #dbg_value(ptr {{.*}}, ![[RESUME_DIRECT_VALUE:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref),
+; CHECK: #dbg_value(ptr %[[DBG_PTR]], ![[RESUME_DIRECT_VALUE:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, {{[0-9]+}}, DW_OP_deref),
 ; Check that the dbg.declare intrinsic of invoke instruction is handled correctly.
 ; CHECK: %[[ALLOCATED_STORAGE:.+]] = invoke ptr @allocate()
 ; CHECK-NEXT: to label %[[NORMAL_DEST:.+]] unwind
@@ -193,7 +193,7 @@ attributes #7 = { noduplicate }
 ; CHECK-NEXT: to label %[[DEFAULT_DEST:.+]] [label
 ; CHECK: [[DEFAULT_DEST]]:
 ; CHECK-NOT: {{.*}}:
-; CHECK: #dbg_declare(i32 %[[CALLBR_RES]]
+; CHECK: #dbg_value(i32 %[[CALLBR_RES]]
 ; CHECK: define internal fastcc void @f.destroy(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[DESTROY:[0-9]+]]
 ; CHECK: define internal fastcc void @f.cleanup(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[CLEANUP:[0-9]+]]
@@ -202,8 +202,8 @@ attributes #7 = { noduplicate }
 ; CHECK: ![[RESUME]] = distinct !DISubprogram(name: "f", linkageName: "flink"
 ; CHECK: ![[RESUME_COROHDL]] = !DILocalVariable(name: "coro_hdl", scope: ![[RESUME]]
 ; CHECK: ![[RESUME_X]] = !DILocalVariable(name: "x", arg: 1, scope: ![[RESUME]]
-; CHECK: ![[RESUME_DIRECT]] = !DILocalVariable(name: "direct_mem", scope: ![[RESUME]]
 ; CHECK: ![[RESUME_CONST]] = !DILocalVariable(name: "direct_const", scope: ![[RESUME]]
+; CHECK: ![[RESUME_DIRECT]] = !DILocalVariable(name: "direct_mem", scope: ![[RESUME]]
 ; CHECK: ![[RESUME_DIRECT_VALUE]] = !DILocalVariable(name: "direct_value", scope: ![[RESUME]]

 ; CHECK: ![[DESTROY]] = distinct !DISubprogram(name: "f", linkageName: "flink"
diff --git a/llvm/test/Transforms/LoopVectorize/discriminator.ll b/llvm/test/Transforms/LoopVectorize/discriminator.ll
index b66a70b9768c4..fe71b6bd9e765 100644
--- a/llvm/test/Transforms/LoopVectorize/discriminator.ll
+++ b/llvm/test/Transforms/LoopVectorize/discriminator.ll
@@ -32,8 +32,8 @@ define void @_Z3foov() local_unnamed_addr #0 !dbg !6 {
   %7 = load i32, ptr %6, align 4, !dbg !17, !tbaa !15
   %8 = add nsw i32 %7, %5, !dbg !17
 ;PSEUDO_PROBE-COUNT-5: call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1), !dbg ![[#PROBE:]]
-;DBG_VALUE: #dbg_declare{{.*}} ![[DBG:[0-9]*]]
-  call void @llvm.dbg.declare(metadata i32 %8, metadata !22, metadata !DIExpression()), !dbg !17
+;DBG_VALUE: #dbg_value{{.*}} ![[DBG:[0-9]*]]
+  call void @llvm.dbg.value(metadata i32 %8, metadata !22, metadata !DIExpression()), !dbg !17
   store i32 %8, ptr %6, align 4, !dbg !17, !tbaa !15
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !18
   %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !19

From 846000c0052000034d424c744739b30b70c8c8fc Mon Sep 17 00:00:00 2001
From: Nashe Mncube
Date: Fri, 4 Apr 2025 14:36:38 +0100
Subject: [PATCH 0646/1029] Revert "[AArch64][SVE] Use FeatureUseFixedOverScalableIfEqualCost for A510 and A520" (#134382)

Reverts llvm/llvm-project#132246
--- llvm/lib/Target/AArch64/AArch64Processors.td | 2 - .../AArch64/sve-fixed-width-inorder-core.ll | 170 ------------------ 2 files changed, 172 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index c37dd025d80aa..67d3ff685e6f1 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -723,7 +723,6 @@ def ProcessorFeatures { FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2, FeatureComplxNum, FeatureCRC, FeatureDotProd, FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE, - FeatureUseFixedOverScalableIfEqualCost, FeatureRAS, FeatureRCPC, FeatureRDM]; list A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, @@ -733,7 +732,6 @@ def ProcessorFeatures { FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC, FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS, FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM, - FeatureUseFixedOverScalableIfEqualCost, FeatureDotProd]; list A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM, FeatureMTE, FeatureETE, FeatureSVEBitPerm, diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll deleted file mode 100644 index 19d0cc0650167..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll +++ /dev/null @@ -1,170 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a510 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA510 -; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a520 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA520 - -define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { -; CHECK-CA510-LABEL: define void @sve_add( -; CHECK-CA510-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-CA510-NEXT: [[ENTRY:.*:]] -; CHECK-CA510-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-CA510-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-CA510-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 -; CHECK-CA510-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0 -; CHECK-CA510-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] -; CHECK-CA510: [[FOR_BODY_PREHEADER]]: -; CHECK-CA510-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 -; CHECK-CA510-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] -; CHECK-CA510: [[VECTOR_MEMCHECK]]: -; CHECK-CA510-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]] -; CHECK-CA510-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 -; CHECK-CA510-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] -; CHECK-CA510-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 -; CHECK-CA510-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-CA510-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; CHECK-CA510: [[VECTOR_PH]]: -; CHECK-CA510-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 -; CHECK-CA510-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-CA510-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK-CA510: [[VECTOR_BODY]]: -; CHECK-CA510-NEXT: [[INDEX:%.*]] = phi i64 [ 
0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-CA510-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-CA510-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]] -; CHECK-CA510-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0 -; CHECK-CA510-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4 -; CHECK-CA510-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 -; CHECK-CA510-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 -; CHECK-CA510-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]] -; CHECK-CA510-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0 -; CHECK-CA510-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4 -; CHECK-CA510-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 -; CHECK-CA510-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 -; CHECK-CA510-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]] -; CHECK-CA510-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]] -; CHECK-CA510-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]] -; CHECK-CA510-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0 -; CHECK-CA510-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4 -; CHECK-CA510-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4 -; CHECK-CA510-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4 -; CHECK-CA510-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-CA510-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-CA510-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-CA510: [[MIDDLE_BLOCK]]: -; CHECK-CA510-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-CA510-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; CHECK-CA510: [[SCALAR_PH]]: -; CHECK-CA510-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] -; CHECK-CA510-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-CA510: [[FOR_BODY]]: -; CHECK-CA510-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-CA510-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-CA510-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-CA510-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-CA510-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-CA510-NEXT: [[ADD:%.*]] = fadd fast float [[TMP16]], [[TMP15]] -; CHECK-CA510-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]] -; CHECK-CA510-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-CA510-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-CA510-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-CA510-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK-CA510: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-CA510-NEXT: br label %[[FOR_COND_CLEANUP]] -; CHECK-CA510: [[FOR_COND_CLEANUP]]: -; CHECK-CA510-NEXT: ret void -; -; CHECK-CA520-LABEL: define void 
@sve_add( -; CHECK-CA520-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-CA520-NEXT: [[ENTRY:.*:]] -; CHECK-CA520-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-CA520-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-CA520-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 -; CHECK-CA520-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0 -; CHECK-CA520-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] -; CHECK-CA520: [[FOR_BODY_PREHEADER]]: -; CHECK-CA520-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 -; CHECK-CA520-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] -; CHECK-CA520: [[VECTOR_MEMCHECK]]: -; CHECK-CA520-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]] -; CHECK-CA520-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 -; CHECK-CA520-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] -; CHECK-CA520-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 -; CHECK-CA520-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-CA520-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; CHECK-CA520: [[VECTOR_PH]]: -; CHECK-CA520-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 -; CHECK-CA520-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-CA520-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK-CA520: [[VECTOR_BODY]]: -; CHECK-CA520-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-CA520-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; CHECK-CA520-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]] -; CHECK-CA520-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0 -; CHECK-CA520-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4 -; CHECK-CA520-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 -; CHECK-CA520-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 -; CHECK-CA520-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]] -; CHECK-CA520-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0 -; CHECK-CA520-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4 -; CHECK-CA520-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 -; CHECK-CA520-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 -; CHECK-CA520-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]] -; CHECK-CA520-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]] -; CHECK-CA520-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]] -; CHECK-CA520-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0 -; CHECK-CA520-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4 -; CHECK-CA520-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4 -; CHECK-CA520-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4 -; CHECK-CA520-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-CA520-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-CA520-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-CA520: [[MIDDLE_BLOCK]]: -; CHECK-CA520-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-CA520-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] -; CHECK-CA520: [[SCALAR_PH]]: -; 
CHECK-CA520-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] -; CHECK-CA520-NEXT: br label %[[FOR_BODY:.*]] -; CHECK-CA520: [[FOR_BODY]]: -; CHECK-CA520-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-CA520-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] -; CHECK-CA520-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-CA520-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]] -; CHECK-CA520-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 -; CHECK-CA520-NEXT: [[ADD:%.*]] = fadd fast float [[TMP16]], [[TMP15]] -; CHECK-CA520-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]] -; CHECK-CA520-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-CA520-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-CA520-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; CHECK-CA520-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK-CA520: [[FOR_COND_CLEANUP_LOOPEXIT]]: -; CHECK-CA520-NEXT: br label %[[FOR_COND_CLEANUP]] -; CHECK-CA520: [[FOR_COND_CLEANUP]]: -; CHECK-CA520-NEXT: ret void -; -entry: - %cmp9.not = icmp eq i64 %n, 0 - br i1 %cmp9.not, label %for.cond.cleanup, label %for.body -for.body: ; preds = %for.body.preheader, %for.body - %indvars.iv = phi i64 [ 0, %entry], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %indvars.iv - %0 = load float, ptr %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds nuw float, ptr %b, i64 %indvars.iv - %1 = load float, ptr %arrayidx2, align 4 - %add = fadd fast float %1, %0 - %arrayidx4 = getelementptr inbounds nuw float, ptr %dst, i64 %indvars.iv - store float %add, ptr %arrayidx4, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %n - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - ret void -} -;. -; CHECK-CA510: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-CA510: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-CA510: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-CA510: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} -;. -; CHECK-CA520: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; CHECK-CA520: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK-CA520: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK-CA520: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} -;. From 7d4ea771c4817bdb042ebc6a2aa35f877785edcf Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 3 Apr 2025 16:59:21 +0200 Subject: [PATCH 0647/1029] [SDAG] Use index type size for offset accumulation This is a precondition of the API. Not testable with in-tree targets. Fixes https://github.com/llvm/llvm-project/issues/134008. 
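A minimal sketch of the precondition, assuming a hypothetical datalayout
whose pointers are stored as 64 bits but indexed with 32 bits; `Address`
stands in for the dbg.declare location operand handled in the function
changed below:

    LLVMContext Ctx;
    DataLayout DL("p:64:64:64:32"); // made-up layout: 64-bit storage, 32-bit index
    Type *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/0);
    uint64_t StoreBits = DL.getTypeSizeInBits(PtrTy).getFixedValue(); // 64
    unsigned IndexBits = DL.getIndexTypeSizeInBits(PtrTy);            // 32
    // stripAndAccumulateInBoundsConstantOffsets expects the accumulator to be
    // exactly as wide as the pointer's index type, so it must be seeded with
    // IndexBits (the fix below), not StoreBits (the old code).
    APInt Offset(IndexBits, 0);
    Value *Base = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);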
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 63ee2d78cfa1b..1287d6ed4a764 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1564,7 +1564,7 @@ static bool processDbgDeclare(FunctionLoweringInfo &FuncInfo, // Look through casts and constant offset GEPs. These mostly come from // inalloca. - APInt Offset(DL.getTypeSizeInBits(Address->getType()), 0); + APInt Offset(DL.getIndexTypeSizeInBits(Address->getType()), 0); Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); // Check if the variable is a static alloca or a byval or inalloca From d4002b43f517fea0292bf71dccaa3d0f6dd798b9 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 4 Apr 2025 14:32:48 +0000 Subject: [PATCH 0648/1029] [lldb] Skip Expression NonZeroFrame test on Windows It is failing on our Windows on Arm bot: https://lab.llvm.org/buildbot/#/builders/141/builds/7605 Will investigate later. --- .../expr-from-non-zero-frame/TestExprFromNonZeroFrame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py index 623c5b87f14c7..bc3f0459bd649 100644 --- a/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py +++ b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py @@ -7,6 +7,8 @@ class ExprFromNonZeroFrame(TestBase): NO_DEBUG_INFO_TESTCASE = True + # Expression fails to evaluate on Windows, for unknown reasons. + @skipIfWindows def test(self): """ Tests that we can use SBFrame::EvaluateExpression on a frame From 5fbd0658a02bd91cf36419f2f425732a06315432 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 4 Apr 2025 15:44:26 +0100 Subject: [PATCH 0649/1029] [VPlan] Add initial CFG simplification, removing BranchOnCond true. (#106748) Add an initial CFG simplification transform, which removes the dead edges for blocks terminated with BranchOnCond true. At the moment, this removes the edge between middle block and scalar preheader when folding the tail. 
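Schematically, for a tail-folded loop (block and value names follow the tests
updated below; a sketch of the effect, not an exact dump):

    ; before: the middle block ends in a branch on a known-true condition
    middle.block:
      br i1 true, label %exit, label %scalar.ph
    scalar.ph:
      %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ]

    ; after: the dead edge and the corresponding phi operand are removed
    middle.block:
      br label %exit
    scalar.ph:
      %bc.resume.val = phi i64 [ 0, %entry ]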
PR: https://github.com/llvm/llvm-project/pull/106748 --- .../Transforms/Vectorize/LoopVectorize.cpp | 13 +- llvm/lib/Transforms/Vectorize/VPlan.h | 26 +++- .../Transforms/Vectorize/VPlanTransforms.cpp | 47 +++++++ .../AArch64/clamped-trip-count.ll | 14 +- .../AArch64/conditional-branches-cost.ll | 41 +++--- .../AArch64/divs-with-scalable-vfs.ll | 8 +- .../first-order-recurrence-fold-tail.ll | 7 +- .../AArch64/induction-costs-sve.ll | 29 ++-- .../AArch64/low_trip_count_predicates.ll | 5 +- .../AArch64/masked-call-scalarize.ll | 14 +- .../LoopVectorize/AArch64/masked-call.ll | 75 ---------- .../LoopVectorize/AArch64/optsize_minsize.ll | 20 +-- .../partial-reduce-dot-product-epilogue.ll | 2 +- .../partial-reduce-dot-product-neon.ll | 6 +- .../AArch64/partial-reduce-dot-product.ll | 6 +- .../LoopVectorize/AArch64/pr73894.ll | 6 +- .../AArch64/reduction-recurrence-costs-sve.ll | 24 ++-- .../AArch64/scalable-strict-fadd.ll | 45 +++--- .../LoopVectorize/AArch64/store-costs-sve.ll | 8 +- .../sve-interleaved-masked-accesses.ll | 8 +- .../AArch64/sve-tail-folding-forced.ll | 2 +- .../AArch64/sve-tail-folding-optsize.ll | 2 +- .../sve-tail-folding-overflow-checks.ll | 4 +- .../AArch64/sve-tail-folding-reductions.ll | 36 ++--- .../AArch64/sve-tail-folding-unroll.ll | 4 +- .../LoopVectorize/AArch64/sve-tail-folding.ll | 21 ++- .../LoopVectorize/AArch64/sve2-histcnt.ll | 2 +- .../AArch64/tail-folding-styles.ll | 16 +-- ...eave-to-widen-memory-remove-loop-region.ll | 4 +- .../ARM/mve-gather-scatter-tailpred.ll | 16 +-- .../ARM/mve-hoist-runtime-checks.ll | 4 +- .../ARM/mve-reduction-predselect.ll | 16 +-- .../LoopVectorize/ARM/mve-reduction-types.ll | 54 ++++---- .../LoopVectorize/ARM/optsize_minsize.ll | 8 +- .../LoopVectorize/RISCV/inloop-reduction.ll | 24 ++-- .../LoopVectorize/RISCV/low-trip-count.ll | 4 +- .../Transforms/LoopVectorize/RISCV/pr88802.ll | 4 +- .../LoopVectorize/RISCV/scalable-tailfold.ll | 26 ++-- .../truncate-to-minimal-bitwidth-cost.ll | 5 +- .../truncate-to-minimal-bitwidth-evl-crash.ll | 4 +- .../RISCV/type-info-cache-evl-crash.ll | 4 +- .../LoopVectorize/RISCV/uniform-load-store.ll | 56 ++++---- ...-force-tail-with-evl-bin-unary-ops-args.ll | 72 +++++----- ...ize-force-tail-with-evl-call-intrinsics.ll | 36 ++--- ...ize-force-tail-with-evl-cast-intrinsics.ll | 40 +++--- ...rize-force-tail-with-evl-cond-reduction.ll | 24 ++-- .../vectorize-force-tail-with-evl-div.ll | 16 +-- ...ce-tail-with-evl-fixed-order-recurrence.ll | 54 ++------ ...ze-force-tail-with-evl-inloop-reduction.ll | 108 +++++++-------- ...ectorize-force-tail-with-evl-interleave.ll | 4 +- ...-force-tail-with-evl-intermediate-store.ll | 12 +- .../vectorize-force-tail-with-evl-iv32.ll | 4 +- ...e-force-tail-with-evl-known-no-overflow.ll | 12 +- ...ze-force-tail-with-evl-masked-loadstore.ll | 4 +- ...e-force-tail-with-evl-ordered-reduction.ll | 6 +- ...vectorize-force-tail-with-evl-reduction.ll | 108 +++++++-------- ...-force-tail-with-evl-reverse-load-store.ll | 25 ++-- ...e-force-tail-with-evl-safe-dep-distance.ll | 12 +- ...orize-force-tail-with-evl-uniform-store.ll | 5 +- .../RISCV/vectorize-vp-intrinsics.ll | 4 +- ...an-vp-intrinsics-fixed-order-recurrence.ll | 5 +- .../RISCV/vplan-vp-intrinsics-reduction.ll | 28 +--- .../SystemZ/force-target-instruction-cost.ll | 4 +- .../LoopVectorize/SystemZ/pr47665.ll | 4 +- .../predicated-first-order-recurrence.ll | 7 +- .../LoopVectorize/X86/cost-model.ll | 6 +- .../X86/divs-with-tail-folding.ll | 8 +- ...bounds-flags-for-reverse-vector-pointer.ll | 8 +- 
 .../LoopVectorize/X86/induction-costs.ll      |  4 +-
 .../Transforms/LoopVectorize/X86/optsize.ll   | 28 ++--
 .../Transforms/LoopVectorize/X86/pr81872.ll   | 15 +-
 .../X86/scev-checks-unprofitable.ll           |  4 +-
 .../LoopVectorize/X86/small-size.ll           | 12 +-
 .../LoopVectorize/X86/tail_loop_folding.ll    | 14 +-
 .../X86/vect.omp.force.small-tc.ll            |  4 +-
 .../X86/vectorize-force-tail-with-evl.ll      |  4 +-
 .../X86/vectorize-interleaved-accesses-gap.ll |  4 +-
 llvm/test/Transforms/LoopVectorize/as_cast.ll |  2 +-
 .../LoopVectorize/dead_instructions.ll        |  8 +-
 .../dont-fold-tail-for-divisible-TC.ll        |  4 +-
 ...-order-recurrence-sink-replicate-region.ll | 95 +------------
 .../LoopVectorize/first-order-recurrence.ll   | 130 ++++++++----------
 .../Transforms/LoopVectorize/loop-form.ll     |  4 +-
 .../LoopVectorize/memdep-fold-tail.ll         |  4 +-
 llvm/test/Transforms/LoopVectorize/optsize.ll | 12 +-
 .../pr45679-fold-tail-by-masking.ll           | 24 ++--
 .../pr46525-expander-insertpoint.ll           |  5 +-
 .../pr51614-fold-tail-by-masking.ll           |  6 +-
 .../LoopVectorize/reduction-inloop-pred.ll    | 30 ++--
 .../LoopVectorize/reduction-inloop-uf4.ll     |  4 +-
 .../LoopVectorize/reduction-predselect.ll     | 20 +--
 .../LoopVectorize/select-reduction.ll         | 13 +-
 ...e-reduction-results-in-tail-folded-loop.ll |  7 +-
 .../strict-fadd-interleave-only.ll            | 36 ++---
 .../tail-folding-alloca-in-loop.ll            |  4 +-
 .../tail-folding-counting-down.ll             |  2 +-
 ...folding-optimize-vector-induction-width.ll | 36 ++---
 .../LoopVectorize/tail-folding-switch.ll      |  4 +-
 .../tail-folding-vectorization-factor-1.ll    |  9 +-
 .../vector-loop-backedge-elimination.ll       | 30 ++--
 .../vplan-sink-scalars-and-merge.ll           | 20 +--
 101 files changed, 862 insertions(+), 1072 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 65cce5e7d194d..cc6fd790bc437 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2486,12 +2486,13 @@ void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
     PreVectorPH->swapSuccessors();

   // We just connected a new block to the scalar preheader. Update all
-  // ResumePhis by adding an incoming value for it.
+  // ResumePhis by adding an incoming value for it, replicating the last value.
   for (VPRecipeBase &R : *cast<VPBasicBlock>(ScalarPH)) {
     auto *ResumePhi = dyn_cast<VPInstruction>(&R);
     if (!ResumePhi || ResumePhi->getOpcode() != VPInstruction::ResumePhi)
       continue;
-    ResumePhi->addOperand(ResumePhi->getOperand(1));
+    ResumePhi->addOperand(
+        ResumePhi->getOperand(ResumePhi->getNumOperands() - 1));
   }
 }

@@ -2660,7 +2661,10 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopScalarPreHeader =
       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                  LI, nullptr, Twine(Prefix) + "scalar.ph");
-  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
+  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
+  // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
+  // preheader may be unreachable at this point. Instead it is replaced in
+  // createVectorizedLoopSkeleton.
 }

 /// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2756,6 +2760,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // faster.
   emitMemRuntimeChecks(LoopScalarPreHeader);

+  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
   return LoopVectorPreHeader;
 }

@@ -7909,6 +7914,7 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
   // Generate the induction variable.
   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);

+  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
   return LoopVectorPreHeader;
 }

@@ -8057,6 +8063,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
     Phi->removeIncomingValue(EPI.MemSafetyCheck);
   }

+  replaceVPBBWithIRVPBB(Plan.getScalarPreheader(), LoopScalarPreHeader);
   return LoopVectorPreHeader;
 }

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 50baf220a1002..ebdc09feeb06e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3529,12 +3529,28 @@ class VPlan {
   /// Returns the 'middle' block of the plan, that is the block that selects
   /// whether to execute the scalar tail loop or the exit block from the loop
-  /// latch.
-  const VPBasicBlock *getMiddleBlock() const {
-    return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
-  }
+  /// latch. If there is an early exit from the vector loop, the middle block
+  /// conceptually has the early exit block as a third successor, split across
+  /// 2 VPBBs. In that case, the second VPBB selects whether to execute the
+  /// scalar tail loop or the exit block. If the scalar tail loop or exit block
+  /// are known to always execute, the middle block may branch directly to that
+  /// block. This function cannot be called once the vector loop region has
+  /// been removed.
   VPBasicBlock *getMiddleBlock() {
-    return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front());
+    VPRegionBlock *LoopRegion = getVectorLoopRegion();
+    assert(
+        LoopRegion &&
+        "cannot call the function after vector loop region has been removed");
+    auto *RegionSucc = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
+    if (RegionSucc->getSingleSuccessor() ||
+        is_contained(RegionSucc->getSuccessors(), getScalarPreheader()))
+      return RegionSucc;
+    // There is an early exit. The successor of RegionSucc is the middle block.
+    return cast<VPBasicBlock>(RegionSucc->getSuccessors()[1]);
+  }
+
+  const VPBasicBlock *getMiddleBlock() const {
+    return const_cast<VPlan *>(this)->getMiddleBlock();
   }

   /// Return the VPBasicBlock for the preheader of the scalar loop.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index be3b3d19a3a11..098e35fbe5bbb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1682,6 +1682,52 @@ void VPlanTransforms::truncateToMinimalBitwidths(
          "some entries in MinBWs haven't been processed");
 }

+/// Remove BranchOnCond recipes with true conditions together with removing
+/// dead edges to their successors.
+static void removeBranchOnCondTrue(VPlan &Plan) {
+  using namespace llvm::VPlanPatternMatch;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getEntry()))) {
+    if (VPBB->getNumSuccessors() != 2 ||
+        !match(&VPBB->back(), m_BranchOnCond(m_True())))
+      continue;
+
+    VPBasicBlock *RemovedSucc = cast<VPBasicBlock>(VPBB->getSuccessors()[1]);
+    const auto &Preds = RemovedSucc->getPredecessors();
+    assert(count(Preds, VPBB) == 1 &&
+           "There must be a single edge between VPBB and its successor");
+    unsigned DeadIdx = std::distance(Preds.begin(), find(Preds, VPBB));
+
+    // Values coming from VPBB into ResumePhi recipes of RemovedSucc are
+    // removed from these recipes.
+    for (VPRecipeBase &R : make_early_inc_range(*RemovedSucc)) {
+      assert((!isa<VPIRInstruction>(&R) ||
+              !isa<PHINode>(cast<VPIRInstruction>(&R)->getInstruction())) &&
+             !isa<VPHeaderPHIRecipe>(&R) &&
+             "Cannot update VPIRInstructions wrapping phis or header phis yet");
+      auto *VPI = dyn_cast<VPInstruction>(&R);
+      if (!VPI || VPI->getOpcode() != VPInstruction::ResumePhi)
+        break;
+      VPBuilder B(VPI);
+      SmallVector<VPValue *> NewOperands;
+      // Create new operand list, with the dead incoming value filtered out.
+      for (const auto &[Idx, Op] : enumerate(VPI->operands())) {
+        if (Idx == DeadIdx)
+          continue;
+        NewOperands.push_back(Op);
+      }
+      VPI->replaceAllUsesWith(B.createNaryOp(VPInstruction::ResumePhi,
+                                             NewOperands, VPI->getDebugLoc(),
+                                             VPI->getName()));
+      VPI->eraseFromParent();
+    }
+    // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
+    // automatically on VPlan destruction if it becomes unreachable.
+    VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
+    VPBB->back().eraseFromParent();
+  }
+}
+
 void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(removeRedundantCanonicalIVs, Plan);
   runPass(removeRedundantInductionCasts, Plan);
@@ -1691,6 +1737,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
   runPass(legalizeAndOptimizeInductions, Plan);
   runPass(removeRedundantExpandSCEVRecipes, Plan);
   runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType());
+  runPass(removeBranchOnCondTrue, Plan);
   runPass(removeDeadRecipes, Plan);

   runPass(createAndOptimizeReplicateRegions, Plan);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 249b8412cb6cb..3a4c1c0cc7ada 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -15,7 +15,6 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
@@ -41,10 +40,10 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label
[[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[DST]], [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -100,7 +99,6 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -126,10 +124,10 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[DST]], [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 2327eeacc10d2..4435c31e3b189 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -457,17 +457,17 @@ define void @latch_branch_cost(ptr %dst) { ; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104 ; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 104, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[GEP:%.*]] = getelementptr i8, 
ptr [[DST]], i64 [[IV]] ; PRED-NEXT: store i8 0, ptr [[GEP]], align 1 -; PRED-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[IV]], 1 -; PRED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100 -; PRED-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; PRED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; PRED-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100 +; PRED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; PRED: [[EXIT]]: ; PRED-NEXT: ret void ; @@ -713,9 +713,6 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 8 -; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] -; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2 ; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 ; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]] @@ -741,10 +738,10 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 ; PRED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[DST]], %[[ENTRY]] ] -; PRED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: ; PRED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] @@ -869,9 +866,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; DEFAULT-NEXT: br label %[[EXIT:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[LOOP:.*]] ; DEFAULT: [[LOOP]]: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -971,9 +968,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { ; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; PRED-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -1408,9 +1405,9 @@ define void 
@test_conditional_interleave_group (ptr noalias %src.1, ptr noalias ; PRED-NEXT: [[TMP85:%.*]] = extractelement <8 x i1> [[TMP84]], i32 0 ; PRED-NEXT: br i1 [[TMP85]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label %[[LOOP_HEADER:.*]] ; PRED: [[LOOP_HEADER]]: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] @@ -1521,9 +1518,9 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { ; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; DEFAULT-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; DEFAULT-NEXT: br label %[[EXIT:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[LOOP_HEADER:.*]] ; DEFAULT: [[LOOP_HEADER]]: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] @@ -1584,9 +1581,9 @@ define void @redundant_branch_and_tail_folding(ptr %dst, i1 %c) optsize { ; PRED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; PRED-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; PRED-NEXT: br label %[[LOOP_HEADER:.*]] ; PRED: [[LOOP_HEADER]]: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 2c7656be74e2e..d59607711b5bf 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -158,9 +158,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP37:%.*]] = extractelement [[TMP36]], i32 0 ; CHECK-NEXT: br i1 [[TMP37]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] @@ -284,9 +284,9 @@ define void 
@udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { ; CHECK-NEXT: [[TMP48:%.*]] = extractelement [[TMP47]], i32 0 ; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll index 86a9af6fd5a3c..30e454d6e3b13 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/first-order-recurrence-fold-tail.ll @@ -70,11 +70,10 @@ define i32 @test_phi_iterator_invalidation(ptr %A, ptr noalias %B) { ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP30]], i32 0 ; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP24]], i32 3 -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1004, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index 08fea4bfc9b2e..a4dc329c061e3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -142,9 +142,9 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; PRED-NEXT: [[TMP29:%.*]] = extractelement [[TMP28]], i32 0 ; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -313,9 +313,9 @@ define void @iv_trunc(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0 ; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: 
-; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label %[[FOR_BODY:.*]] ; PRED: [[FOR_BODY]]: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -448,7 +448,6 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 ; PRED-NEXT: [[TMP14:%.*]] = sub i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP15:%.*]] = icmp ugt i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i64 [[TMP14]], i64 0 @@ -506,10 +505,10 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP35]], i32 0 ; PRED-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] -; PRED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: ; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] @@ -643,7 +642,6 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 ; PRED-NEXT: [[TMP13:%.*]] = sub i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP14:%.*]] = icmp ugt i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[TMP13]], i64 0 @@ -701,10 +699,10 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 { ; PRED-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP34]], i32 0 ; PRED-NEXT: br i1 [[TMP35]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] -; PRED-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: ; PRED-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] @@ 
-812,7 +810,6 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 ; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1 -; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32 ; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] @@ -842,10 +839,10 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; PRED-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] -; PRED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: ; PRED-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index c60f25d8d61e3..641564560fc26 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -426,7 +426,6 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef % ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]]) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer @@ -446,9 +445,9 @@ define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef % ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[TMP16]], i32 0 ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[WHILE_END_LOOPEXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[WHILE_BODY:.*]] ; CHECK: [[WHILE_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll index 121a6ed53309e..aa144c25055cc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll @@ -54,14 +54,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; ; TFCOMMON-LABEL: @test_widen_exp_v2( ; TFCOMMON-NEXT: entry: -; TFCOMMON-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1 -; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 -; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFCOMMON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2 -; TFCOMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2 +; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0:%.*]], 1 +; TFCOMMON-NEXT: [[TMP1:%.*]] = sub i64 [[N_RND_UP]], 2 +; TFCOMMON-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N_RND_UP]], 2 ; TFCOMMON-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 -; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[TMP0]]) +; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 0, i64 [[N_RND_UP]]) ; TFCOMMON-NEXT: br label [[LOOP:%.*]] ; TFCOMMON: vector.body: ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] @@ -100,9 +97,6 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-LABEL: @test_widen_exp_v2( ; TFA_INTERLEAVE-NEXT: entry: ; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0:%.*]], 1 -; TFA_INTERLEAVE-NEXT: [[N_RND_UP1:%.*]] = add i64 [[N_RND_UP]], 3 -; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP1]], 4 -; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP1]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[N_RND_UP]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N_RND_UP]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index 36c3a4a2b4e43..585c2df08f7d6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -52,12 +52,6 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-LABEL: define void @test_widen( ; TFCOMMON-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; TFCOMMON-NEXT: [[ENTRY:.*]]: -; TFCOMMON-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFCOMMON-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFCOMMON-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFCOMMON-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFCOMMON-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) @@ -81,12 +75,6 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-LABEL: define void @test_widen( ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0:[0-9]+]] { ; TFA_INTERLEAVE-NEXT: 
[[ENTRY:.*]]: -; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() @@ -195,12 +183,6 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-LABEL: define void @test_if_then( ; TFCOMMON-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] { ; TFCOMMON-NEXT: [[ENTRY:.*]]: -; TFCOMMON-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFCOMMON-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFCOMMON-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFCOMMON-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFCOMMON-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) @@ -227,12 +209,6 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-LABEL: define void @test_if_then( ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] { ; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]: -; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() @@ -363,12 +339,6 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-LABEL: define void @test_widen_if_then_else( ; TFCOMMON-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] { ; TFCOMMON-NEXT: [[ENTRY:.*]]: -; TFCOMMON-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFCOMMON-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFCOMMON-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFCOMMON-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFCOMMON-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) @@ -398,12 +368,6 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-LABEL: define void @test_widen_if_then_else( ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] { ; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]: -; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; TFA_INTERLEAVE-NEXT: 
[[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() @@ -659,12 +623,6 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFALWAYS-LABEL: define void @test_widen_optmask( ; TFALWAYS-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] { ; TFALWAYS-NEXT: [[ENTRY:.*]]: -; TFALWAYS-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFALWAYS-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFALWAYS-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFALWAYS-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFALWAYS-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) @@ -688,12 +646,6 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-LABEL: define void @test_widen_optmask( ; TFFALLBACK-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] { ; TFFALLBACK-NEXT: [[ENTRY:.*]]: -; TFFALLBACK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFFALLBACK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFFALLBACK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) @@ -717,12 +669,6 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-LABEL: define void @test_widen_optmask( ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]]) #[[ATTR0]] { ; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]: -; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() @@ -836,12 +782,6 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFALWAYS-LABEL: define double @test_widen_fmuladd_and_call( ; TFALWAYS-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], double [[M:%.*]]) #[[ATTR0]] { ; TFALWAYS-NEXT: [[ENTRY:.*]]: -; TFALWAYS-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFALWAYS-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFALWAYS-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFALWAYS-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = 
urem i64 [[N_RND_UP]], [[TMP1]] -; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFALWAYS-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) @@ -872,12 +812,6 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFFALLBACK-LABEL: define double @test_widen_fmuladd_and_call( ; TFFALLBACK-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], double [[M:%.*]]) #[[ATTR0]] { ; TFFALLBACK-NEXT: [[ENTRY:.*]]: -; TFFALLBACK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 -; TFFALLBACK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFFALLBACK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) @@ -908,12 +842,6 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFA_INTERLEAVE-LABEL: define double @test_widen_fmuladd_and_call( ; TFA_INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], double [[M:%.*]]) #[[ATTR0]] { ; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]: -; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] -; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() @@ -1061,9 +989,6 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-SAME: ptr noalias [[P2:%.*]], ptr noalias [[P:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { ; TFA_INTERLEAVE-NEXT: [[ENTRY:.*]]: ; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 1 -; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 -; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 2 ; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP0]], 2 ; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index be6faaa539632..3239d2c2e9388 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -403,9 +403,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT-NEXT: br label 
%[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: ; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -598,9 +598,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n) ; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; OPTSIZE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: -; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: ; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -726,9 +726,9 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[TMP24]], i32 0 ; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -802,9 +802,9 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[TMP24]], i32 0 ; OPTSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; OPTSIZE: [[MIDDLE_BLOCK]]: -; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: ; OPTSIZE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] @@ -878,9 +878,9 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: [[TMP25:%.*]] = extractelement <vscale x 16 x i1> [[TMP24]], i32 0 ; MINSIZE-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; MINSIZE: [[MIDDLE_BLOCK]]: -; MINSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]] +; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; MINSIZE: [[SCALAR_PH]]: -; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] ; MINSIZE: [[FOR_BODY]]: ; 
MINSIZE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index b5755ebd35931..0e5e785a94636 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -561,7 +561,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 83226a2074315..a8476dbddb3c2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -1295,7 +1295,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_predicated( @@ -1630,7 +1630,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; ; CHECK-MAXBW-LABEL: define i32 @dotp_predicated( @@ -1965,7 +1965,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-MAXBW-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] ; CHECK-MAXBW: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index bcdbb4d4dfbf7..542008f34131f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -1557,7 +1557,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; 
CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) -; CHECK-INTERLEAVE1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_predicated( @@ -1602,7 +1602,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) -; CHECK-INTERLEAVED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; ; CHECK-MAXBW-LABEL: define i32 @dotp_predicated( @@ -1647,7 +1647,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-MAXBW-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] ; CHECK-MAXBW: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll index 4cfb9b45ef56b..d9a3a71141540 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr73894.ll @@ -53,10 +53,10 @@ define i32 @pr70988(ptr %src, i32 %n) { ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP17]], i32 [[TMP18]]) -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDUC:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDUC_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 51a08d3b0a3bf..969bb413f9c50 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -254,20 +254,16 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2) ; PRED-NEXT: br i1 [[TMP43]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP41]]) -; PRED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vscale.i32() -; PRED-NEXT: [[TMP46:%.*]] = mul i32 [[TMP45]], 4 -; PRED-NEXT: [[TMP47:%.*]] = sub i32 [[TMP46]], 1 -; PRED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i32> [[TMP29]], i32 [[TMP47]] -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label 
%[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP28]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; PRED-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP44]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: -; PRED-NEXT: [[TMP48:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP53:%.*]], %[[LOOP]] ] -; PRED-NEXT: [[SCALAR_RECUR10:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[TMP48]], %[[LOOP]] ] +; PRED-NEXT: [[TMP45:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP53:%.*]], %[[LOOP]] ] +; PRED-NEXT: [[SCALAR_RECUR10:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[TMP45]], %[[LOOP]] ] ; PRED-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[SUM_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_2:%.*]], %[[LOOP]] ] ; PRED-NEXT: [[TMP52:%.*]] = add i64 [[Y]], 1 @@ -498,10 +494,10 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; PRED-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: ; PRED-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.nxv8i16( [[TMP16]]) -; PRED-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; PRED-NEXT: br label %[[EXIT:.*]] ; PRED: [[SCALAR_PH]]: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP19]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; PRED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; PRED-NEXT: br label %[[LOOP:.*]] ; PRED: [[LOOP]]: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index 6c31182d5d20d..8ba3debd46c8e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -156,10 +156,10 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: -; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] ; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -420,10 +420,10 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP46:%.*]] = extractelement [[TMP45]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: -; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] ; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP35]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -643,7 +643,6 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2 ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4 ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = sub i64 [[TMP2]], [[TMP10]] @@ -673,11 +672,11 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali ; CHECK-ORDERED-TF-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: -; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] ; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[A2]], [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[A1]], [[ENTRY]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX2:%.*]] = phi float [ [[A1]], [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ] @@ -925,10 +924,10 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu ; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: -; CHECK-ORDERED-TF-NEXT: 
br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] ; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -1161,10 +1160,10 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no ; CHECK-ORDERED-TF-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: -; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] ; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -1644,10 +1643,10 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = extractelement [[TMP60]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: -; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] ; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1972,10 +1971,10 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = extractelement [[TMP60]], i32 0 ; CHECK-ORDERED-TF-NEXT: br i1 [[TMP61]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-ORDERED-TF: middle.block: -; CHECK-ORDERED-TF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-ORDERED-TF-NEXT: br label [[FOR_END:%.*]] ; CHECK-ORDERED-TF: scalar.ph: -; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] 
] -; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-ORDERED-TF-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-ORDERED-TF-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-ORDERED-TF: for.body: ; CHECK-ORDERED-TF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 7b8b379ce3806..9b6a1686eee6e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -112,9 +112,9 @@ define void @cost_store_i8(ptr %dst) #0 { ; PRED-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 ; PRED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; PRED-NEXT: br label [[EXIT:%.*]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -260,9 +260,9 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 ; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; PRED-NEXT: br label [[EXIT:%.*]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 9d6b691f3ed31..d8681bef80417 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -116,7 +116,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: scalar.ph: ; entry: @@ -247,7 +247,7 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; 
PREDICATED_TAIL_FOLDING: middle.block: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: scalar.ph: ; entry: @@ -382,7 +382,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: scalar.ph: ; entry: @@ -563,7 +563,7 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: -; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll index 25403599977cb..19aea78176a8e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -75,7 +75,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] ; CHECK: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll index 63bb485e7f085..5a4e0efb36cb9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll @@ -36,7 +36,7 @@ define void @trip1025_i64(ptr noalias nocapture noundef %dst, ptr noalias nocapt ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 2 x i1> [[TMP14]], i32 0 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index b39c47cc7906d..a22f065415307 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -32,7 +32,7 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label 
[[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: scalar.ph: ; entry: @@ -89,7 +89,7 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll index b174bf6a9dd1d..f4639203d1036 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-reductions.ll @@ -43,10 +43,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK: while.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -97,10 +97,10 @@ define i32 @add_reduction_i32(ptr %ptr, i64 %n) #0 { ; CHECK-IN-LOOP-NEXT: [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0 ; CHECK-IN-LOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-IN-LOOP: middle.block: -; CHECK-IN-LOOP-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] ; CHECK-IN-LOOP: scalar.ph: -; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]] ; CHECK-IN-LOOP: while.body: ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] @@ -168,10 +168,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 
[[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ]
 ; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK: while.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -221,10 +221,10 @@ define float @add_reduction_f32(ptr %ptr, i64 %n) #0 {
 ; CHECK-IN-LOOP-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0
 ; CHECK-IN-LOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK-IN-LOOP: middle.block:
-; CHECK-IN-LOOP-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-IN-LOOP-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK-IN-LOOP: scalar.ph:
-; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ]
 ; CHECK-IN-LOOP-NEXT: br label [[WHILE_BODY:%.*]]
 ; CHECK-IN-LOOP: while.body:
 ; CHECK-IN-LOOP-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -298,10 +298,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32( [[TMP20]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP25]], [[MIDDLE_BLOCK]] ], [ 7, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
@@ -364,10 +364,10 @@ define i32 @cond_xor_reduction(ptr noalias %a, ptr noalias %cond, i64 %N) #0 {
 ; CHECK-IN-LOOP-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0
 ; CHECK-IN-LOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK-IN-LOOP: middle.block:
-; CHECK-IN-LOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-IN-LOOP-NEXT: br label [[FOR_END:%.*]]
 ; CHECK-IN-LOOP: scalar.ph:
-; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP19]], [[MIDDLE_BLOCK]] ], [ 7, [[ENTRY]] ]
+; CHECK-IN-LOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-IN-LOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ]
 ; CHECK-IN-LOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK-IN-LOOP: for.body:
 ; CHECK-IN-LOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index ce761913ea0fc..977115ce5321a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -78,7 +78,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP76:%.*]] = extractelement [[TMP72]], i32 0
 ; CHECK-NEXT: br i1 [[TMP76]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
@@ -193,7 +193,7 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT: [[TMP98:%.*]] = extractelement [[TMP94]], i32 0
 ; CHECK-NEXT: br i1 [[TMP98]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index aab4f33f87c0f..397eee28bda02 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -39,7 +39,7 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP13]], i32 0
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
@@ -86,7 +86,7 @@ define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
@@ -141,7 +141,7 @@ define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0
 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
@@ -179,7 +179,6 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
@@ -208,7 +207,7 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP22:%.*]] = extractelement [[TMP21]], i32 0
 ; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
@@ -266,7 +265,7 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
 ; CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0
 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
@@ -326,7 +325,7 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0
 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ;
 
@@ -393,7 +392,7 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement [[TMP18]], i32 0
 ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ;
 
@@ -461,7 +460,7 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP13]], i32 0
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ;
 
@@ -520,7 +519,7 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0
 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
@@ -583,7 +582,7 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement [[TMP17]], i32 0
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
index dd1f77582e0be..56cea996f3d80 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll
@@ -568,7 +568,7 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic
 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0
 ; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_EXIT:%.*]]
 ; CHECK: scalar.ph:
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index 07b2ed9185db2..71d03afa6b6f1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -81,9 +81,9 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
 ; DATA-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA: middle.block:
-; DATA-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; DATA: scalar.ph:
-; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DATA-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; DATA-NEXT: br label [[WHILE_BODY:%.*]]
 ; DATA: while.body:
 ; DATA-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -133,9 +133,9 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_NO_LANEMASK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]]
 ; DATA_NO_LANEMASK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA_NO_LANEMASK: middle.block:
-; DATA_NO_LANEMASK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA_NO_LANEMASK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; DATA_NO_LANEMASK: scalar.ph:
-; DATA_NO_LANEMASK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DATA_NO_LANEMASK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; DATA_NO_LANEMASK-NEXT: br label [[WHILE_BODY:%.*]]
 ; DATA_NO_LANEMASK: while.body:
 ; DATA_NO_LANEMASK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -180,9 +180,9 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0
 ; DATA_AND_CONTROL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA_AND_CONTROL: middle.block:
-; DATA_AND_CONTROL-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA_AND_CONTROL-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; DATA_AND_CONTROL: scalar.ph:
-; DATA_AND_CONTROL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DATA_AND_CONTROL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; DATA_AND_CONTROL-NEXT: br label [[WHILE_BODY:%.*]]
 ; DATA_AND_CONTROL: while.body:
 ; DATA_AND_CONTROL-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -228,9 +228,9 @@ define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; DATA_AND_CONTROL_NO_RT_CHECK: middle.block:
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]]
 ; DATA_AND_CONTROL_NO_RT_CHECK: scalar.ph:
-; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: br label [[WHILE_BODY:%.*]]
 ; DATA_AND_CONTROL_NO_RT_CHECK: while.body:
 ; DATA_AND_CONTROL_NO_RT_CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
index 82118200ade1c..25e1e2d4cab1e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll
@@ -113,9 +113,9 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
 ; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF4: [[MIDDLE_BLOCK]]:
-; VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF4-NEXT: br label %[[EXIT:.*]]
 ; VF4: [[SCALAR_PH]]:
-; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; VF4-NEXT: br label %[[LOOP:.*]]
 ; VF4: [[LOOP]]:
 ; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
index 5c3ce532093ef..9522c7e7b61a8 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-gather-scatter-tailpred.ll
@@ -27,9 +27,9 @@ define void @test_stride1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -218,9 +218,9 @@ define void @test_stride3_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -281,9 +281,9 @@ define void @test_stride4_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -345,9 +345,9 @@ define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocaptur
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_023:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll
index 8b9586cbb1cc9..68f7e53884bc0 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-hoist-runtime-checks.ll
@@ -79,9 +79,9 @@ define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture nou
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[INNER_LOOP_EXIT]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[OUTER_LOOP]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
 ; CHECK: inner.loop:
 ; CHECK-NEXT: [[J_021_US:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC_US:%.*]], [[INNER_LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
index af43b747c0043..6d42dee565c87 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -22,7 +22,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) {
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
 ; CHECK: .lr.ph:
@@ -76,7 +76,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
 ; CHECK: .lr.ph:
@@ -129,7 +129,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
 ; CHECK: .lr.ph:
@@ -181,7 +181,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
@@ -233,7 +233,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
@@ -285,7 +285,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
@@ -337,7 +337,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
@@ -389,7 +389,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
index eea27ffad0b90..2078a10d04ce7 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-types.ll
@@ -36,10 +36,10 @@ define i32 @mla_i32(ptr noalias nocapture readonly %A, ptr noalias nocapture rea
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -118,10 +118,10 @@ define i32 @mla_i8(ptr noalias nocapture readonly %A, ptr noalias nocapture read
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -194,10 +194,10 @@ define i32 @add_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -260,10 +260,10 @@ define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -326,10 +326,10 @@ define i32 @and_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ -1, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -392,10 +392,10 @@ define i32 @or_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -458,10 +458,10 @@ define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -524,10 +524,10 @@ define float @fadd_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -590,10 +590,10 @@ define float @fmul_f32(ptr nocapture readonly %x, i32 %n) #0 {
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
index 8ba859bfc980e..1e91fc9c7c56d 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll
@@ -399,9 +399,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; DEFAULT: [[MIDDLE_BLOCK]]:
-; DEFAULT-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
 ; DEFAULT: [[SCALAR_PH]]:
-; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]]
 ; DEFAULT: [[FOR_BODY]]:
 ; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -594,9 +594,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
 ; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; OPTSIZE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; OPTSIZE: [[MIDDLE_BLOCK]]:
-; OPTSIZE-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
+; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]]
 ; OPTSIZE: [[SCALAR_PH]]:
-; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]]
 ; OPTSIZE: [[FOR_BODY]]:
 ; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index d9090efe2d3a0..edf89a0fa7d7f 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -149,10 +149,10 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL-OUTLOOP: middle.block:
 ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP10]])
-; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; IF-EVL-OUTLOOP: scalar.ph:
-; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
 ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; IF-EVL-OUTLOOP: for.body:
 ; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -204,10 +204,10 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
 ; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
 ; IF-EVL-INLOOP: scalar.ph:
-; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ]
 ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; IF-EVL-INLOOP: for.body:
 ; IF-EVL-INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -380,10 +380,10 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL-OUTLOOP: middle.block:
 ; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32( [[TMP15]])
-; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]]
 ; IF-EVL-OUTLOOP: scalar.ph:
-; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ]
 ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; IF-EVL-OUTLOOP: for.body:
 ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -433,10 +433,10 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
 ; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL-INLOOP: middle.block:
-; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
 ; IF-EVL-INLOOP: scalar.ph:
-; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ]
 ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; IF-EVL-INLOOP: for.body:
 ; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
index 38b4b1ac28777..0f2b6b30ae0e1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -148,9 +148,9 @@ define void @trip8_i8(ptr noalias nocapture noundef %dst, ptr noalias nocapture
 ; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP13]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
index bcb71db5c3b7a..9cf7bc9fe07d6 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll
@@ -141,9 +141,9 @@ define void @test(ptr %p, i64 %a, i8 %b) {
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
 ; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[FOR_COND]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT1:%.*]], label [[SCALAR_PH1]]
+; CHECK-NEXT: br label [[EXIT1:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_COND1:%.*]]
 ; CHECK: for.cond:
 ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH1]] ], [ [[ADD:%.*]], [[FOR_BODY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
index 7aff21767e3d4..c6bcd7201777a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-tailfold.ll
@@ -34,9 +34,9 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -97,9 +97,9 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -160,10 +160,10 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[TMP12]])
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -225,9 +225,9 @@ define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -281,9 +281,9 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -373,9 +373,9 @@ define void @vector_add_trip1024(ptr noalias nocapture %a, i64 %v, i64 %n) {
 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[FOR_END:%.*]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
index dc2e99d33c377..61706fe4d55c0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll
@@ -167,7 +167,6 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP10]], 4
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[X]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i32 [[N_VEC]] to i8
 ; CHECK-NEXT: [[TMP7:%.*]] = trunc [[BROADCAST_SPLAT]] to
 ; CHECK-NEXT: [[TMP8:%.*]] = or splat (i1 true), [[TMP7]]
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, ptr [[DST]], i64 0
@@ -188,9 +187,9 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 {
 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP6]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
 ; CHECK: [[LOOP_HEADER]]:
 ; CHECK-NEXT: [[F_039:%.*]] = phi i8 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP_LATCH:.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
index 656ae1cc03a80..d343da1110986 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll
@@ -39,9 +39,9 @@ define void @truncate_to_minimal_bitwidths_widen_cast_recipe(ptr %src) {
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
 ; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
index d3cb418c4380b..b0a1300c62c55 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll
@@ -59,9 +59,9 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count)
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
 ; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
index 56b989b6ac4fa..d8713bdda689a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll
@@ -115,9 +115,9 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-SCALABLE: [[FOR_BODY]]:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -149,9 +149,9 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6
 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
 ; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-FIXEDLEN: [[FOR_BODY]]:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -457,9 +457,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-SCALABLE: [[FOR_BODY]]:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
@@ -502,9 +502,9 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca
 ; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
 ; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-FIXEDLEN: [[FOR_BODY]]:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
@@ -652,9 +652,9 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-SCALABLE: [[FOR_BODY]]:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -686,9 +686,9 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt
 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
 ; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-FIXEDLEN: [[FOR_BODY]]:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -823,9 +823,9 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-SCALABLE: [[FOR_BODY]]:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -857,9 +857,9 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i
 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
 ; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-FIXEDLEN: [[FOR_BODY]]:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -1020,9 +1020,9 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-SCALABLE: [[FOR_BODY]]:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -1081,9 +1081,9 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias
 ; TF-FIXEDLEN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
 ; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-FIXEDLEN: [[FOR_BODY]]:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -1259,9 +1259,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-SCALABLE: [[FOR_BODY]]:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
@@ -1304,9 +1304,9 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc
 ; TF-FIXEDLEN-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
 ; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-FIXEDLEN: [[FOR_BODY]]:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
@@ -1452,9 +1452,9 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; TF-SCALABLE-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TF-SCALABLE: [[MIDDLE_BLOCK]]:
-; TF-SCALABLE-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-SCALABLE-NEXT: br label %[[FOR_END:.*]]
 ; TF-SCALABLE: [[SCALAR_PH]]:
-; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-SCALABLE-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-SCALABLE: [[FOR_BODY]]:
 ; TF-SCALABLE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
@@ -1486,9 +1486,9 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap
 ; TF-FIXEDLEN-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1028
 ; TF-FIXEDLEN-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; TF-FIXEDLEN: [[MIDDLE_BLOCK]]:
-; TF-FIXEDLEN-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]]
+; TF-FIXEDLEN-NEXT: br label %[[FOR_END:.*]]
 ; TF-FIXEDLEN: [[SCALAR_PH]]:
-; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1028, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; TF-FIXEDLEN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
 ; TF-FIXEDLEN-NEXT: br label %[[FOR_BODY:.*]]
 ; TF-FIXEDLEN: [[FOR_BODY]]:
 ; TF-FIXEDLEN-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
index 241f16bd1e7bf..d77b2bf9ca722 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-bin-unary-ops-args.ll
@@ -51,9 +51,9 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -144,9 +144,9 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -237,9 +237,9 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -330,9 +330,9 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -423,9 +423,9 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -516,9 +516,9 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -609,9 +609,9 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -702,9 +702,9 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -795,9 +795,9 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -888,9 +888,9 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -981,9 +981,9 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; IF-EVL-NEXT: br label %[[LOOP:.*]]
 ; IF-EVL: [[LOOP]]:
 ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
@@ -1074,9 +1074,9 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) {
 ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; IF-EVL: [[MIDDLE_BLOCK]]:
-; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]]
 ; IF-EVL: [[SCALAR_PH]]:
-; IF-EVL-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1167,9 +1167,9 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1263,9 +1263,9 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1357,9 +1357,9 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1451,9 +1451,9 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, 
%[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1545,9 +1545,9 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -1692,9 +1692,9 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[FINISH_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[LEN:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll index 36659d7e30666..1e623828b4f1b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-call-intrinsics.ll @@ -64,9 +64,9 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -178,9 +178,9 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label 
%[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -292,9 +292,9 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -406,9 +406,9 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -512,9 +512,9 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -611,9 +611,9 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label 
%[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -713,9 +713,9 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -821,9 +821,9 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -927,9 +927,9 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll index 04c9fac961a7b..b816751f04884 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cast-intrinsics.ll @@ -56,9 +56,9 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, 
%[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -155,9 +155,9 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -254,9 +254,9 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -353,9 +353,9 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -452,9 +452,9 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -551,9 +551,9 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br 
i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -650,9 +650,9 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -749,9 +749,9 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -848,9 +848,9 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -947,9 +947,9 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], 
[ 0, %[[VECTOR_MEMCHECK]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 354cc91c6f01f..f1ce35aa8d9a7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -61,10 +61,10 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP20]]) -; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-OUTLOOP: scalar.ph: -; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -118,10 +118,10 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL-INLOOP: middle.block: -; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-INLOOP: scalar.ph: -; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-INLOOP: for.body: ; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -309,10 +309,10 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PREDPHI]]) -; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-OUTLOOP: scalar.ph: -; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] 
] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-OUTLOOP: for.body: ; IF-EVL-OUTLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] @@ -369,10 +369,10 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL-INLOOP: middle.block: -; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-INLOOP: scalar.ph: -; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-INLOOP: for.body: ; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll index adc37e5797187..f23754b764822 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll @@ -46,9 +46,9 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -142,9 +142,9 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -237,9 +237,9 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; 
IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -332,9 +332,9 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ] ; IF-EVL-NEXT: br label %[[LOOP:.*]] ; IF-EVL: [[LOOP]]: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll index cf7b67fd9e7b5..bbb14ecc3b12a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-fixed-order-recurrence.ll @@ -56,14 +56,10 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: [[TMP21:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 4 -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i32 [[TMP22]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP23]] -; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 33, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] @@ -207,24 +203,16 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: [[TMP25:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], 4 -; IF-EVL-NEXT: [[TMP27:%.*]] = sub i32 [[TMP26]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP27]] -; IF-EVL-NEXT: 
[[TMP28:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 4 -; IF-EVL-NEXT: [[TMP30:%.*]] = sub i32 [[TMP29]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT3:%.*]] = extractelement [[TMP19]], i32 [[TMP30]] -; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT4:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT3]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT3:%.*]] = phi i32 [ 22, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP31:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT4]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT3]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP31]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR1]], [[FOR2]] @@ -384,30 +372,18 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: -; IF-EVL-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4 -; IF-EVL-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[VP_OP_LOAD]], i32 [[TMP31]] -; IF-EVL-NEXT: [[TMP32:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP33:%.*]] = mul i32 [[TMP32]], 4 -; IF-EVL-NEXT: [[TMP34:%.*]] = sub i32 [[TMP33]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT6:%.*]] = extractelement [[TMP22]], i32 [[TMP34]] -; IF-EVL-NEXT: [[TMP35:%.*]] = call i32 @llvm.vscale.i32() -; IF-EVL-NEXT: [[TMP36:%.*]] = mul i32 [[TMP35]], 4 -; IF-EVL-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], 1 -; IF-EVL-NEXT: [[VECTOR_RECUR_EXTRACT7:%.*]] = extractelement [[TMP23]], i32 [[TMP37]] -; IF-EVL-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[SCALAR_PH]]: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 33, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT7:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT6]], %[[MIDDLE_BLOCK]] ], [ 22, %[[ENTRY]] ] -; IF-EVL-NEXT: [[SCALAR_RECUR_INIT8:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT7]], %[[MIDDLE_BLOCK]] ], [ 11, %[[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 33, %[[ENTRY]] ] +; IF-EVL-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi i32 [ 22, %[[ENTRY]] ] +; 
IF-EVL-NEXT: [[SCALAR_RECUR_INIT6:%.*]] = phi i32 [ 11, %[[ENTRY]] ] ; IF-EVL-NEXT: br label %[[FOR_BODY:.*]] ; IF-EVL: [[FOR_BODY]]: ; IF-EVL-NEXT: [[INDVARS:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[FOR1:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TMP38:%.*]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT7]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] -; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT8]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR2:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT5]], %[[SCALAR_PH]] ], [ [[FOR1]], %[[FOR_BODY]] ] +; IF-EVL-NEXT: [[FOR3:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT6]], %[[SCALAR_PH]] ], [ [[FOR2]], %[[FOR_BODY]] ] ; IF-EVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDVARS]] ; IF-EVL-NEXT: [[TMP38]] = load i32, ptr [[ARRAYIDX]], align 4 ; IF-EVL-NEXT: [[ADD:%.*]] = add nsw i32 [[FOR2]], [[FOR3]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll index 60ab87e4442f4..1fb84dbc79c49 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll @@ -46,10 +46,10 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -156,10 +156,10 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[MUL]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -271,10 +271,10 @@ define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; 
IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -386,10 +386,10 @@ define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -501,10 +501,10 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -616,10 +616,10 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -734,10 +734,10 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP16:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -852,10 +852,10 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -970,10 +970,10 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_MINMAX]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1088,10 +1088,10 @@ define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1198,10 +1198,10 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP7:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[MUL]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1314,10 +1314,10 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1434,10 +1434,10 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[RDX_MINMAX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1549,10 +1549,10 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] 
] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1661,10 +1661,10 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1781,10 +1781,10 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1910,10 +1910,10 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -2035,10 +2035,10 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP16]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = freeze i1 [[TMP19]] ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 [[INV:%.*]], i32 [[START:%.*]] -; IF-EVL-NEXT: br i1 true, label
[[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll index dc51fac268e05..c616fc1b11b93 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll @@ -65,9 +65,9 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) { ; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll index 2b1d06bd8121a..f18e48dc22457 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-intermediate-store.ll @@ -68,10 +68,10 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] -; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-OUTLOOP: scalar.ph: -; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY1]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL-OUTLOOP: for.body: ; IF-EVL-OUTLOOP-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -130,10 +130,10 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start,
ptr %addr) ; IF-EVL-INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-INLOOP: middle.block: ; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] -; IF-EVL-INLOOP-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-INLOOP: scalar.ph: -; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] -; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP22]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] +; IF-EVL-INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; IF-EVL-INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[START]], [[VECTOR_MEMCHECK]] ] ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL-INLOOP: for.body: ; IF-EVL-INLOOP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll index f4abc7e209dd3..fa35025ecacd7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-iv32.ll @@ -43,9 +43,9 @@ define void @iv32(ptr noalias %a, ptr noalias %b, i32 %N) { ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[IV_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY1:%.*]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll index 303da8e0f7117..a02ef31dc9b1b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-known-no-overflow.ll @@ -46,9 +46,9 @@ define void @trip_count_max_1024(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -121,9 +121,9 @@ define void 
@overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -192,9 +192,9 @@ define void @no_overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll index 73c14f86e2782..815d4cbc10fe4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-masked-loadstore.ll @@ -47,9 +47,9 @@ define void @masked_loadstore(ptr noalias %a, ptr noalias %b, i64 %n) { ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll index 8b6427a0b75dd..5e16b398266d5 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-ordered-reduction.ll @@ -45,10 +45,10 @@ define float @fadd(ptr noalias nocapture readonly %a, i64 %n) { ; IF-EVL-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label 
[[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll index 9271aa6424199..a9a9bf7294cca 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll @@ -46,10 +46,10 @@ define i32 @add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -158,10 +158,10 @@ define i32 @mul(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> [[TMP6]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -275,10 +275,10 @@ define i32 @or(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +;
IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -393,10 +393,10 @@ define i32 @and(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -511,10 +511,10 @@ define i32 @xor(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -631,10 +631,10 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP15]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -756,10 +756,10 @@ define i32 @smax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[TMP15]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]],
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -881,10 +881,10 @@ define i32 @umin(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> [[TMP15]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1006,10 +1006,10 @@ define i32 @umax(ptr %a, i64 %n, i32 %start) { ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[TMP15]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1129,10 +1129,10 @@ define float @fadd(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP17:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP14]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1241,10 +1241,10 @@ define float @fmul(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT:
[[TMP8:%.*]] = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 1.000000e+00, <8 x float> [[TMP6]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1360,10 +1360,10 @@ define float @fmin(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[TMP15]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1485,10 +1485,10 @@ define float @fmax(ptr %a, i64 %n, float %start) #0 { ; IF-EVL-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[TMP15]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP18]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1602,10 +1602,10 @@ define float @fminimum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ;
IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1714,10 +1714,10 @@ define float @fmaximum(ptr %a, i64 %n, float %start) { ; IF-EVL-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> [[TMP5]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY1]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY1:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY1]] ] ; IF-EVL-NEXT: br label [[FOR_BODY1:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] @@ -1835,10 +1835,10 @@ define float @fmuladd(ptr %a, ptr %b, i64 %n, float %start) { ; IF-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; IF-EVL: middle.block: ; IF-EVL-NEXT: [[TMP20:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP17]]) -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -1964,10 +1964,10 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]]) ; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP19]], i32 [[INV:%.*]], i32 [[START:%.*]] -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -2089,10 +2089,10 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) { ; IF-EVL-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP15]]) ; IF-EVL-NEXT: [[TMP19:%.*]] = freeze i1 [[TMP18]] ; IF-EVL-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP19]], i32 [[INV:%.*]], i32 [[START:%.*]] -; IF-EVL-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_END:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]],
[[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 695af0d241159..7bdb67b8a0fba 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -22,15 +22,13 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] -; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], -1 ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 @@ -54,10 +52,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[LOOPEND:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] @@ -122,15 +120,13 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4 -; IF-EVL-NEXT: [[IND_END:%.*]] = sub i64 [[STARTVAL:%.*]], [[N_VEC]] -; IF-EVL-NEXT: [[IND_END1:%.*]] = trunc i64 [[N_VEC]] to i32 ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: 
vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1024, [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[STARTVAL:%.*]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[OFFSET_IDX3:%.*]] = trunc i64 [[EVL_BASED_IV]] to i32 ; IF-EVL-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], -1 ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[OFFSET_IDX3]] @@ -161,10 +157,10 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[LOOPEND:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[LOOPEND:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[STARTVAL]], [[ENTRY:%.*]] ] -; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[ADD_PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_INC:%.*]] ] @@ -255,7 +251,6 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 -; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[N_VEC]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -295,9 +290,9 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 1024, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll index 01465f6d614d1..fb2ec8c61d745 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-safe-dep-distance.ll @@ -44,9 +44,9 @@ define void @test(ptr %p) { ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -359,9 +359,9 @@ define void @trivial_due_max_vscale(ptr %p) { ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -443,9 +443,9 @@ define void @no_high_lmul_or_interleave(ptr %p) { ; IF-EVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll index c7c8ee4326a8b..5d4552f0cddb0 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll @@ -27,7 +27,6 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[SPEC_SELECT]], [[N_VEC]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -50,9 +49,9 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) { ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[SPEC_SELECT]], %[[ENTRY]] ] +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll index 2e953735f5413..99e4d7539b932 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-vp-intrinsics.ll @@ -48,9 +48,9 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll index e4c8586bded73..ed25e1e03906b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-fixed-order-recurrence.ll @@ -48,9 +48,8 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: Successor(s): middle.block ; IF-EVL-EMPTY: ; IF-EVL: middle.block: -; IF-EVL-NEXT: EMIT vp<[[RESUME_EXTRACT:%.+]]> = extract-from-end ir<[[LD]]>, ir<1> -; IF-EVL-NEXT: EMIT branch-on-cond ir<true> -; IF-EVL-NEXT: Successor(s): ir-bb<for.end>, scalar.ph +; IF-EVL-NEXT: Successor(s): ir-bb<for.end> + ; IF-EVL: Cost of 0 for VF vscale x 4: FIRST-ORDER-RECURRENCE-PHI ir<[[FOR_PHI]]> = phi ir<33>, ir<[[LD]]> ; IF-EVL: Cost of 4 for VF vscale x 4: WIDEN-INTRINSIC vp<[[SPLICE]]> = call llvm.experimental.vp.splice(ir<[[FOR_PHI]]>, ir<[[LD]]>, ir<-1>, ir<true>, vp<[[PREV_EVL]]>, vp<[[EVL]]>) entry: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index c7d3946b707e1..6546fccb14fc2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -63,19 +63,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: middle.block: ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, vp<[[RDX_SELECT]]> ; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> -; IF-EVL-OUTLOOP-NEXT: EMIT branch-on-cond ir<true> -; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: scalar.ph: -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> -; IF-EVL-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> -; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.body> -; IF-EVL-OUTLOOP-EMPTY: -; IF-EVL-OUTLOOP-NEXT: ir-bb<for.body>: -;
IF-EVL-OUTLOOP-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph) -; IF-EVL-OUTLOOP-NEXT: IR %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] -; IF-EVL-OUTLOOP: IR %exitcond.not = icmp eq i64 %iv.next, %n -; IF-EVL-OUTLOOP-NEXT: No successors +; IF-EVL-OUTLOOP-NEXT: Successor(s): ir-bb<for.end> ; IF-EVL-OUTLOOP-EMPTY: ; IF-EVL-OUTLOOP-NEXT: ir-bb<for.end>: ; IF-EVL-OUTLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]> from middle.block) @@ -115,19 +103,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: middle.block: ; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX:%.+]]> = compute-reduction-result ir<[[RDX_PHI]]>, ir<[[ADD]]> ; IF-EVL-INLOOP-NEXT: EMIT vp<[[RDX_EX:%.+]]> = extract-from-end vp<[[RDX]]>, ir<1> -; IF-EVL-INLOOP-NEXT: EMIT branch-on-cond ir<true> -; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end>, scalar.ph -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: scalar.ph: -; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_RESUME:%.+]]> = resume-phi vp<[[VTC]]>, ir<0> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> -; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.body> -; IF-EVL-INLOOP-EMPTY: -; IF-EVL-INLOOP-NEXT: ir-bb<for.body>: -; IF-EVL-INLOOP-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<[[IV_RESUME]]> from scalar.ph) -; IF-EVL-INLOOP-NEXT: IR %rdx = phi i32 [ %start, %entry ], [ %add, %for.body ] -; IF-EVL-INLOOP: IR %exitcond.not = icmp eq i64 %iv.next, %n -; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-NEXT: Successor(s): ir-bb<for.end> ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: ir-bb<for.end>: ; IF-EVL-INLOOP-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RDX_EX]]> from middle.block) diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll index 385f7621801b4..082e3266e7c8f 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/force-target-instruction-cost.ll @@ -37,9 +37,9 @@ define void @test_scalar_steps_target_instruction_cost(ptr %dst) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 30, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll index 2de0f7e4d4016..bc0ccfb45c057 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/pr47665.ll @@ -117,9 +117,9 @@ define void @test(ptr %p, i40 %a) { ; CHECK: pred.store.continue30: ; CHECK-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT:
[[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll index 3ea4f1aea51d1..e0fc73f669946 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/predicated-first-order-recurrence.ll @@ -63,11 +63,10 @@ define void @func_21() { ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1 -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 6, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LV:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index c39dd6ffe5f01..a1c727f62ba7a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -1051,10 +1051,10 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[VEC_PHI]]) -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 3, [[PH]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 3, [[PH]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll index 55cceb2bf00ed..fd53a4ce95e9a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/divs-with-tail-folding.ll @@ -43,9 +43,9 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) { ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] 
], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -158,9 +158,9 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll index ca9547a38dd33..7bcf8a7bcdb81 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -41,10 +41,10 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP12]]) -; CHECK-NEXT: br i1 true, label [[FOR_END36:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -2, [[MIDDLE_BLOCK]] ], [ 10, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY20:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC35:%.*]] ] @@ -63,7 +63,7 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[SUM_1]] = or i32 [[REM27_PN]], [[SUM_01]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: [[CMP19_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[FOR_END36]], label [[FOR_BODY20]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP19_NOT]], label [[EXIT]], label [[FOR_BODY20]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_INC35]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[CMP41:%.*]] = icmp eq i32 [[SUM_1_LCSSA]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 68fb58604406a..907b3135ea8c5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -631,9 +631,9 @@ define void @wide_iv_trunc(ptr %dst, i64 %N) { ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; 
CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll index 7ce1be31f08ac..074aeb81a0414 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -31,9 +31,9 @@ define i32 @foo_optsize() #0 { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] @@ -69,9 +69,9 @@ define i32 @foo_optsize() #0 { ; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 ; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTOVF: middle.block: -; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AUTOVF-NEXT: br label [[FOR_END:%.*]] ; AUTOVF: scalar.ph: -; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] ; AUTOVF: for.body: ; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] @@ -129,9 +129,9 @@ define i32 @foo_minsize() #1 { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] @@ -167,9 +167,9 @@ define i32 @foo_minsize() #1 { ; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 224 ; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTOVF: middle.block: -; AUTOVF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; AUTOVF-NEXT: br label [[FOR_END:%.*]] ; AUTOVF: scalar.ph: -; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 224, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; AUTOVF-NEXT: br 
label [[FOR_BODY:%.*]] ; AUTOVF: for.body: ; AUTOVF-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] @@ -363,8 +363,6 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 64 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[N_VEC]], -72 -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <64 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT3]], <64 x i32> poison, <64 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -382,9 +380,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[START]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] @@ -409,8 +407,6 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; AUTOVF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8 ; AUTOVF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; AUTOVF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP3]], 1 -; AUTOVF-NEXT: [[TMP4:%.*]] = mul i32 [[N_VEC]], -72 -; AUTOVF-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i32 [[TMP4]] ; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; AUTOVF-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT3]], <8 x i32> poison, <8 x i32> zeroinitializer ; AUTOVF-NEXT: br label [[VECTOR_BODY:%.*]] @@ -428,9 +424,9 @@ define void @tail_folded_store_avx512(ptr %start, ptr %end) #3 { ; AUTOVF-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; AUTOVF-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; AUTOVF: middle.block: -; AUTOVF-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; AUTOVF-NEXT: br label [[EXIT:%.*]] ; AUTOVF: scalar.ph: -; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ] +; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[START]], [[ENTRY:%.*]] ] ; AUTOVF-NEXT: br label [[LOOP:%.*]] ; AUTOVF: loop: ; AUTOVF-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll index bb59a00365215..08adfdd4793eb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr81872.ll @@ -39,15 +39,15 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 { ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 ; CHECK-NEXT: br i1 [[TMP9]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[BB6:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br label [[BB6:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 87, [[MIDDLE_BLOCK]] ], [ 99, [[BB5:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 99, [[BB5:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1 ; CHECK-NEXT: [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0 -; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF5:![0-9]+]] ; CHECK: bb18: ; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-NEXT: [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]] @@ -56,7 +56,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 { ; CHECK: loop.latch: ; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1 ; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90 -; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF7:![0-9]+]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: bb6: ; CHECK-NEXT: ret void ; @@ -100,8 +100,7 @@ attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" } ; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]} ; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 1, i32 3} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 1, i32 1} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]], [[META3]]} +; CHECK: [[PROF5]] = !{!"branch_weights", i32 1, i32 1} +; CHECK: [[PROF6]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]} ;. 
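The test updates above and below all follow one pattern: once the vectorizer proves the vector loop covers the entire trip count (for example under tail folding), the middle block's exit branch is constant-folded to an unconditional branch and the scalar preheader loses its middle-block predecessor. A minimal before/after sketch of the IR shape being checked (block and value names are illustrative, lifted from the tests in this patch):

  ; before: middle block conditionally falls back to the scalar loop
  middle.block:
    br i1 true, label %exit, label %scalar.ph
  scalar.ph:
    %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ]

  ; after: the false edge is folded away, so the resume phi keeps only
  ; its %entry incoming value
  middle.block:
    br label %exit
  scalar.ph:
    %bc.resume.val = phi i64 [ 0, %entry ]

Because the %middle.block edge is gone, values that only fed it (such as %ind.end or the recurrence extracts) become dead and are deleted, and the trailing !prof/!llvm.loop metadata references are renumbered down accordingly, which is why the PROF/LOOP indices shift in the hunks that follow.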
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll index 4d4b9e5052b3b..6a7dc51700192 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scev-checks-unprofitable.ll @@ -37,9 +37,9 @@ define void @value_defined_in_loop1_used_for_trip_counts(i32 %start, i1 %c, ptr ; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> zeroinitializer, ptr [[TMP2]], i32 1, <16 x i1> [[TMP0]]) ; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT_1_LOOPEXIT1:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT_1_LOOPEXIT1:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_3_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[LOOP_3_PREHEADER]] ] ; CHECK-NEXT: br label %[[LOOP_3:.*]] ; CHECK: [[LOOP_2_PREHEADER]]: ; CHECK-NEXT: br label %[[LOOP_2:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index 54eff6a23b6e5..42a982bf833a4 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -122,7 +122,7 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_PREHEADER_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_PREHEADER_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH5:%.*]] ; CHECK: ..preheader_crit_edge: @@ -204,13 +204,13 @@ define void @example2(i32 %n, i32 %x) optsize { ; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT29]], [[N_VEC12]] ; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK28:%.*]], label [[VECTOR_BODY13]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block28: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]] -; CHECK: scalar.ph7: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: scalar.ph7: +; CHECK-NEXT: br label [[DOTLR_PH1:%.*]] ; CHECK: .lr.ph5: ; CHECK-NEXT: br i1 poison, label [[DOT_PREHEADER_CRIT_EDGE]], label [[DOTLR_PH5]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[DOTLR_PH]], label [[DOTLR_PH1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: ._crit_edge.loopexit: ; CHECK-NEXT: br label [[DOT_CRIT_EDGE]] ; CHECK: ._crit_edge: @@ -326,7 +326,7 @@ define void @example3(i32 %n, ptr noalias nocapture %p, ptr noalias nocapture %q ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -512,7 +512,7 @@ define void @example23c(ptr noalias nocapture %src, ptr noalias nocapture %dst) ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[TMP26:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[TMP26:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[TMP25:%.*]] ; CHECK: 25: diff --git a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll index 9b6016458572f..c38410091d7bc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll @@ -31,9 +31,9 @@ define dso_local void @tail_folding_enabled(ptr noalias nocapture %A, ptr noalia ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -98,9 +98,9 @@ define dso_local void @tail_folding_disabled(ptr noalias nocapture %A, ptr noali ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -183,10 +183,10 @@ define i32 @reduction_i32(ptr nocapture readonly %A, ptr nocapture readonly %B, ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP11]]) -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index f7fde01f3314f..d7a3b3262d01d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -147,9 +147,9 @@ define void @vectorized1(ptr noalias nocapture %A, ptr noalias nocapture readonl ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24 ; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: 
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll index 4f68bb883a260..27321e7ad4657 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll @@ -41,9 +41,9 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IF-EVL-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; IF-EVL: middle.block: -; IF-EVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[FOR_BODY:%.*]] ; IF-EVL: for.body: ; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll index 7bd70628793c8..7b1c60bf507ce 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll @@ -74,9 +74,9 @@ define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 { ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10008 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10008, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/as_cast.ll b/llvm/test/Transforms/LoopVectorize/as_cast.ll index 58a8c3d078f02..67aacefebd555 100644 --- a/llvm/test/Transforms/LoopVectorize/as_cast.ll +++ b/llvm/test/Transforms/LoopVectorize/as_cast.ll @@ -29,7 +29,7 @@ loop: ; check that we branch to the exit block ; CHECK: middle.block: -; CHECK: br i1 true, label %exit, label %scalar.ph +; CHECK: br label %exit exit: ret void diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll index 9d84712fb94b2..653baf838c59c 100644 --- a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll @@ -94,11 +94,11 @@ define void @pr47390(ptr %a) { ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 ; 
CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 7, %[[MIDDLE_BLOCK]] ], [ -1, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 9, %[[MIDDLE_BLOCK]] ], [ 1, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ -1, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i32 [ 1, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll index b9b0ab877b112..aa8299bb040eb 100644 --- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll +++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll @@ -200,9 +200,9 @@ define dso_local void @cannotProveAlignedTC(ptr noalias nocapture %A, i32 %p, i3 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index a333c9c89af25..794ff99a68672 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -76,20 +76,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%conv>, ir<1> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph) -; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb ; CHECK-NEXT: No successors @@ -167,20 +154,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; 
CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph) -; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb ; CHECK-NEXT: No successors @@ -240,23 +214,8 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RES:%.+]]> = compute-reduction-result ir<%and.red>, vp<[[SEL]]> -; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph) -; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK-NEXT: IR %and.red = phi i32 [ 1234, %entry ], [ %and.red.next, %loop ] -; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 -; CHECK-NEXT: No successors +; CHECK-NEXT: EMIT vp<[[RED_EX:%.+]]> = extract-from-end vp<[[RED_RES]]>, ir<1> +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb ; CHECK-NEXT: IR %res = phi i32 [ %and.red.next, %loop ] (extra operand: vp<[[RED_EX]]> from middle.block) @@ -358,20 +317,7 @@ define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%conv>, ir<1> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %0 = phi i32 [ 0, %entry ], [ %conv, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph) -; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK: IR %ec = icmp eq i32 %iv.next, 20001 -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb ; CHECK-NEXT: No successors @@ -456,20 +402,7 @@ define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; 
CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%recur.next>, ir<1> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[VEC_TC]]>, ir<0> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph) -; CHECK-NEXT: IR %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK: IR %C = icmp sgt i32 %iv.next, %recur.next -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb ; CHECK-NEXT: No successors @@ -509,7 +442,6 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: Successor(s): vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: -; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<2> + vp<[[VEC_TC]]> * ir<1> ; CHECK-NEXT: Successor(s): vector loop ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { @@ -548,20 +480,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%l>, ir<1> -; CHECK-NEXT: EMIT branch-on-cond ir -; CHECK-NEXT: Successor(s): ir-bb, scalar.ph -; CHECK-EMPTY: -; CHECK-NEXT: scalar.ph -; CHECK-NEXT: EMIT vp<[[RESUME_IV:%.*]]> = resume-phi vp<[[END]]>, ir<2> -; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> -; CHECK-NEXT: Successor(s): ir-bb -; CHECK-EMPTY: -; CHECK-NEXT: ir-bb: -; CHECK-NEXT: IR %iv = phi i64 [ 2, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME_IV]]> from scalar.ph) -; CHECK-NEXT: IR %.pn = phi i32 [ 0, %entry ], [ %l, %loop ] (extra operand: vp<[[RESUME_1_P]]> from scalar.ph) -; CHECK: IR %ec = icmp ugt i64 %iv, 3 -; CHECK-NEXT: No successors +; CHECK-NEXT: Successor(s): ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index f51fc9b7b4ff7..c622315a47178 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -2663,7 +2663,6 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 -; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] @@ -2763,12 +2762,11 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]] ; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP43]], i32 3 -; 
UNROLL-NO-IC-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29:![0-9]+]] +; UNROLL-NO-IC-NEXT: br label [[BB1:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP51]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[Y]], [[BB:%.*]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ] ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-IC: bb1: ; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ] @@ -2781,7 +2779,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-IC-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30:![0-9]+]], !llvm.loop [[LOOP31:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @sink_into_replication_region( ; UNROLL-NO-VF-NEXT: bb: @@ -2794,7 +2792,6 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; UNROLL-NO-VF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 -; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ] @@ -2828,11 +2825,11 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] -; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29:![0-9]+]] +; UNROLL-NO-VF-NEXT: br label [[BB1:%.*]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[Y]], [[BB:%.*]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB]] ] +; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ] ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-VF: bb1: ; UNROLL-NO-VF-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] @@ -2845,7 +2842,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; UNROLL-NO-VF-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30:![0-9]+]], !llvm.loop 
[[LOOP31:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region( ; SINK-AFTER-NEXT: bb: @@ -2858,7 +2855,6 @@ define i32 @sink_into_replication_region(i32 %y) { ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 ; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; SINK-AFTER-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 -; SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] ; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; SINK-AFTER-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] @@ -2915,12 +2911,11 @@ define i32 @sink_into_replication_region(i32 %y) { ; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 -; SINK-AFTER-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29:![0-9]+]] +; SINK-AFTER-NEXT: br label [[BB1:%.*]] ; SINK-AFTER: scalar.ph: -; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ] -; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP27]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] +; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[Y]], [[BB:%.*]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB]] ] +; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ] ; SINK-AFTER-NEXT: br label [[BB2:%.*]] ; SINK-AFTER: bb1: ; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] @@ -2933,7 +2928,7 @@ define i32 @sink_into_replication_region(i32 %y) { ; SINK-AFTER-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30:![0-9]+]], !llvm.loop [[LOOP31:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]] ; bb: br label %bb2 @@ -2967,7 +2962,6 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8 ; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; UNROLL-NO-IC-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 -; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] @@ -3126,17 +3120,16 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> 
[[STEP_ADD]], splat (i32 4) ; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP32:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP73]], [[TMP72]] ; UNROLL-NO-IC-NEXT: [[TMP75:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP43]], i32 3 -; UNROLL-NO-IC-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29]] +; UNROLL-NO-IC-NEXT: br label [[BB1:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP75]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[Y]], [[BB:%.*]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[BB]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB]] ] +; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ] ; UNROLL-NO-IC-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-IC: bb1: ; UNROLL-NO-IC-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP75]], [[MIDDLE_BLOCK]] ] @@ -3153,7 +3146,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-IC-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; ; UNROLL-NO-VF-LABEL: @sink_into_replication_region_multiple( ; UNROLL-NO-VF-NEXT: bb: @@ -3166,7 +3159,6 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 ; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; UNROLL-NO-VF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 -; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] @@ -3210,15 +3202,15 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI1]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP32:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP17]], 
[[TMP16]] -; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29]] +; UNROLL-NO-VF-NEXT: br label [[BB1:%.*]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ] -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP9]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[Y]], [[BB:%.*]] ] +; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[BB]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB]] ] +; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ] ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]] ; UNROLL-NO-VF: bb1: ; UNROLL-NO-VF-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] @@ -3235,7 +3227,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30]], !llvm.loop [[LOOP33:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region_multiple( ; SINK-AFTER-NEXT: bb: @@ -3248,7 +3240,6 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; SINK-AFTER-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 ; SINK-AFTER-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; SINK-AFTER-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 -; SINK-AFTER-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] ; SINK-AFTER-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 ; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] @@ -3333,16 +3324,15 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; SINK-AFTER-NEXT: [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP32:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF27]], !llvm.loop [[LOOP31:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[TMP39:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP37]]) -; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3 -; SINK-AFTER-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]], !prof [[PROF29]] +; SINK-AFTER-NEXT: br label [[BB1:%.*]] ; SINK-AFTER: scalar.ph: -; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB:%.*]] ] -; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] -; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] 
] -; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP39]], [[MIDDLE_BLOCK]] ], [ 0, [[BB]] ] +; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[Y]], [[BB:%.*]] ] +; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[BB]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB]] ] +; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ] ; SINK-AFTER-NEXT: br label [[BB2:%.*]] ; SINK-AFTER: bb1: ; SINK-AFTER-NEXT: [[VAR:%.*]] = phi i32 [ [[VAR6:%.*]], [[BB2]] ], [ [[TMP39]], [[MIDDLE_BLOCK]] ] @@ -3359,7 +3349,7 @@ define i32 @sink_into_replication_region_multiple(ptr %x, i32 %y) { ; SINK-AFTER-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; SINK-AFTER-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; SINK-AFTER-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF30]], !llvm.loop [[LOOP33:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF29]], !llvm.loop [[LOOP32:![0-9]+]] ; bb: br label %bb2 @@ -3408,7 +3398,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 @@ -3430,7 +3420,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-IC-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; UNROLL-NO-IC-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; UNROLL-NO-IC-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; UNROLL-NO-IC-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; UNROLL-NO-IC: for.end: ; UNROLL-NO-IC-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-IC-NEXT: ret i32 [[FOR_LCSSA]] @@ -3457,7 +3447,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP9]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3477,7 +3467,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-VF-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; UNROLL-NO-VF-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop 
[[LOOP34:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[FOR_LCSSA]] @@ -3501,7 +3491,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 @@ -3523,7 +3513,7 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; SINK-AFTER-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; SINK-AFTER-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; SINK-AFTER-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP35:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] ; SINK-AFTER: for.end: ; SINK-AFTER-NEXT: [[FOR_LCSSA:%.*]] = phi i32 [ [[FOR]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ] ; SINK-AFTER-NEXT: ret i32 [[FOR_LCSSA]] @@ -3569,7 +3559,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], splat (i16 4) ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024 -; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 ; UNROLL-NO-IC-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3584,7 +3574,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-IC-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-IC-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP37:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-IC: for.end: ; UNROLL-NO-IC-NEXT: ret void ; @@ -3603,7 +3593,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-VF-NEXT: [[TMP2]] = add i16 [[TMP1]], 5 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028 -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: @@ -3617,7 +3607,7 @@ define void @unused_recurrence(ptr %a) { ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-VF-NEXT: 
[[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP37:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -3635,7 +3625,7 @@ define void @unused_recurrence(ptr %a) { ; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], splat (i16 4) ; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1028 -; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3 ; SINK-AFTER-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3650,7 +3640,7 @@ define void @unused_recurrence(ptr %a) { ; SINK-AFTER-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; SINK-AFTER-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 ; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i16 [[IV]], 1000 -; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP37:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP36:![0-9]+]] ; SINK-AFTER: for.end: ; SINK-AFTER-NEXT: ret void ; @@ -3678,21 +3668,20 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-IC: vector.ph: ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; UNROLL-NO-IC-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; UNROLL-NO-IC: middle.block: -; UNROLL-NO-IC-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: ; UNROLL-NO-IC-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[ADD]] = add i64 [[PHI]], 1 -; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 +; UNROLL-NO-IC-NEXT: [[LOAD]] = load i32, ptr [[SRC:%.*]], align 4 ; UNROLL-NO-IC-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP37:![0-9]+]] ; UNROLL-NO-IC: exit: ; UNROLL-NO-IC-NEXT: ret i32 0 ; @@ -3716,7 +3705,7 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; UNROLL-NO-VF-NEXT: [[ADD]] = add i64 [[PHI]], 1 ; UNROLL-NO-VF-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-VF-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] +; 
UNROLL-NO-VF-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP37:![0-9]+]] ; UNROLL-NO-VF: exit: ; UNROLL-NO-VF-NEXT: ret i32 0 ; @@ -3726,21 +3715,20 @@ define i32 @recurence_uniform_load(ptr %src, ptr noalias %dst) { ; SINK-AFTER: vector.ph: ; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]] ; SINK-AFTER: vector.body: -; SINK-AFTER-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 ; SINK-AFTER-NEXT: br label [[MIDDLE_BLOCK:%.*]] ; SINK-AFTER: middle.block: -; SINK-AFTER-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; SINK-AFTER-NEXT: br label [[EXIT:%.*]] ; SINK-AFTER: scalar.ph: -; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[TMP0]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] +; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; SINK-AFTER-NEXT: br label [[LOOP:%.*]] ; SINK-AFTER: loop: ; SINK-AFTER-NEXT: [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] ; SINK-AFTER-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[LOAD:%.*]], [[LOOP]] ] ; SINK-AFTER-NEXT: [[ADD]] = add i64 [[PHI]], 1 -; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC]], align 4 +; SINK-AFTER-NEXT: [[LOAD]] = load i32, ptr [[SRC:%.*]], align 4 ; SINK-AFTER-NEXT: [[ICMP:%.*]] = icmp ult i64 [[PHI]], 1 -; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP38:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[ICMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP37:![0-9]+]] ; SINK-AFTER: exit: ; SINK-AFTER-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll index 6aacbd3f98a31..914b9ad4a9e5a 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -80,9 +80,9 @@ define void @bottom_tested(ptr %p, i32 %n) { ; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TAILFOLD: middle.block: -; TAILFOLD-NEXT: br i1 true, label [[IF_END:%.*]], label [[SCALAR_PH]] +; TAILFOLD-NEXT: br label [[IF_END:%.*]] ; TAILFOLD: scalar.ph: -; TAILFOLD-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; TAILFOLD-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; TAILFOLD-NEXT: br label [[FOR_COND:%.*]] ; TAILFOLD: for.cond: ; TAILFOLD-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_COND]] ] diff --git a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll index d1ad7e3f4fc0d..c9066f22c5592 100644 --- a/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/memdep-fold-tail.ll @@ -69,9 +69,9 @@ define void @maxvf3() { ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, 
[[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index e557c76b6fdf0..c7149b0845981 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -698,9 +698,9 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1026, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -747,9 +747,9 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; PGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 ; PGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; PGSO: [[MIDDLE_BLOCK]]: -; PGSO-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; PGSO-NEXT: br label %[[FOR_END:.*]] ; PGSO: [[SCALAR_PH]]: -; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1026, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; PGSO-NEXT: br label %[[FOR_BODY:.*]] ; PGSO: [[FOR_BODY]]: ; PGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] @@ -796,9 +796,9 @@ define void @stride1(ptr noalias %B, i32 %BStride) optsize { ; NPGSO-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026 ; NPGSO-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; NPGSO: [[MIDDLE_BLOCK]]: -; NPGSO-NEXT: br i1 true, label %[[FOR_END:.*]], label %[[SCALAR_PH]] +; NPGSO-NEXT: br label %[[FOR_END:.*]] ; NPGSO: [[SCALAR_PH]]: -; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1026, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; NPGSO-NEXT: br label %[[FOR_BODY:.*]] ; NPGSO: [[FOR_BODY]]: ; NPGSO-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll index b207cca03c90f..0b6b789c1dcea 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -57,9 +57,9 @@ define void @pr45679(ptr %A) optsize { ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: 
br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] @@ -119,9 +119,9 @@ define void @pr45679(ptr %A) optsize { ; VF2UF2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; VF2UF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF2UF2: middle.block: -; VF2UF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VF2UF2-NEXT: br label [[EXIT:%.*]] ; VF2UF2: scalar.ph: -; VF2UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VF2UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; VF2UF2-NEXT: br label [[LOOP:%.*]] ; VF2UF2: loop: ; VF2UF2-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] @@ -176,9 +176,9 @@ define void @pr45679(ptr %A) optsize { ; VF1UF4-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; VF1UF4-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF1UF4: middle.block: -; VF1UF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VF1UF4-NEXT: br label [[EXIT:%.*]] ; VF1UF4: scalar.ph: -; VF1UF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VF1UF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; VF1UF4-NEXT: br label [[LOOP:%.*]] ; VF1UF4: loop: ; VF1UF4-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] @@ -256,9 +256,9 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -323,9 +323,9 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; VF2UF2-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; VF2UF2-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF2UF2: middle.block: -; VF2UF2-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; VF2UF2-NEXT: br label [[FOR_END:%.*]] ; VF2UF2: scalar.ph: -; VF2UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VF2UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; VF2UF2-NEXT: br label [[FOR_BODY:%.*]] ; VF2UF2: for.body: ; VF2UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -385,9 +385,9 @@ define void @load_variant(ptr noalias %a, ptr noalias %b) { ; VF1UF4-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; VF1UF4-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF1UF4: middle.block: -; VF1UF4-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; VF1UF4-NEXT: br label [[FOR_END:%.*]] ; VF1UF4: scalar.ph: -; VF1UF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VF1UF4-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; VF1UF4-NEXT: br label [[FOR_BODY:%.*]] ; VF1UF4: for.body: ; VF1UF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll index 1e1b03b6cd8cf..d4a6aed472832 100644 --- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll +++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll @@ -23,7 +23,6 @@ define void @test(i16 %x, i64 %y, ptr %ptr) { ; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], 1 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[INC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -32,9 +31,9 @@ define void @test(i16 %x, i64 %y, ptr %ptr) { ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll index 705152662be24..77794dcb9369d 100644 --- a/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll +++ b/llvm/test/Transforms/LoopVectorize/pr51614-fold-tail-by-masking.ll @@ -61,10 +61,10 @@ define dso_local i16 @reverse_interleave_load_fold_mask() optsize { ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP26]]) -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -1, [[MIDDLE_BLOCK]] ], [ 41, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP28]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 41, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IVMINUS1:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index c76057a18bf3c..55a0aa3900029 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -61,7 +61,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -171,7 +171,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -265,7 +265,7 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -376,7 +376,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -489,7 +489,7 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -599,7 +599,7 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -707,7 +707,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -813,7 +813,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label 
[[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -919,7 +919,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -1025,7 +1025,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -1133,7 +1133,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -1222,7 +1222,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -1309,7 +1309,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -1495,7 +1495,7 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP30]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP33:%.*]] = trunc <4 x i32> [[TMP32]] to <4 x i8> ; CHECK-NEXT: [[TMP34:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP33]]) -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -1590,7 +1590,7 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP29]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[TMP32:%.*]] = trunc <4 x i32> [[TMP31]] to <4 x i8> ; CHECK-NEXT: [[TMP33:%.*]] = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> [[TMP32]]) -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: diff --git 
a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index 279e0be3c8312..7f81a672478bb 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -350,7 +350,7 @@ define i32 @predicated(ptr noalias nocapture %A) { ; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP104]], [[TMP101]] ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add i32 [[TMP107]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX38:%.*]] = add i32 [[TMP110]], [[BIN_RDX37]] -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -589,7 +589,7 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP112]], [[TMP109]] ; CHECK-NEXT: [[BIN_RDX39:%.*]] = mul i32 [[TMP115]], [[BIN_RDX]] ; CHECK-NEXT: [[BIN_RDX40:%.*]] = mul i32 [[TMP118]], [[BIN_RDX39]] -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll index 3d40707a5e97e..51d14e696d3ff 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -61,7 +61,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -164,7 +164,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP43]]) -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -270,7 +270,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP42]]) -; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[DOT_CRIT_EDGE:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: @@ -375,7 +375,7 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP42]]) -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] ; CHECK: for.body: @@ -480,7 +480,7 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP42]]) -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -585,7 +585,7 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP42]]) -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -690,7 +690,7 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP42]]) -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -795,7 +795,7 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP42]]) -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -883,7 +883,7 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP25]]) -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: @@ -969,7 +969,7 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP25]]) -; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll index 16cc05da17a63..cfc9bb25a9208 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll @@ -18,7 +18,6 @@ define i32 @test(i64 %N, i32 %x) { ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] 
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[EXTRA_ITER]], 1 -; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[EXTRA_ITER]], [[N_VEC]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -37,10 +36,10 @@ define i32 @test(i64 %N, i32 %x) { ; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]]) -; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] @@ -93,10 +92,10 @@ define i32 @pr66895_tail_fold_reduction_exit_inst_gets_simplified(i32 %n) { ; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_PHI]]) -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -4, [[MIDDLE_BLOCK]] ], [ 12, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 12, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll index 57bc7b8337249..bf86cbd601f44 100644 --- a/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/store-reduction-results-in-tail-folded-loop.ll @@ -20,7 +20,6 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_START]], [[N_VEC]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -42,10 +41,10 @@ define void @pr75298_store_reduction_value_in_folded_loop(i64 %iv.start) optsize ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP4]]) ; 
CHECK-NEXT: store i32 [[TMP6]], ptr @a, align 4 -; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[IV_START]], [[PH]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IV_START]], [[PH]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PH]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll index a293225968d52..eefa3da97a4bc 100644 --- a/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll +++ b/llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll @@ -23,10 +23,10 @@ define float @pr70988() { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 1022 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1022, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ] @@ -58,10 +58,10 @@ define float @pr70988() { ; CHECK-ALM-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT3]], 1022 ; CHECK-ALM-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-ALM: middle.block: -; CHECK-ALM-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ALM-NEXT: br label [[EXIT:%.*]] ; CHECK-ALM: scalar.ph: -; CHECK-ALM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1022, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ALM-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ALM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] +; CHECK-ALM-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-ALM-NEXT: br label [[LOOP:%.*]] ; CHECK-ALM: loop: ; CHECK-ALM-NEXT: [[INDEX:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[LOOP]] ] @@ -127,10 +127,10 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) { ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, 
[[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NARROW:%.*]], [[LOOP]] ] @@ -179,10 +179,10 @@ define float @pr72720reduction_using_active_lane_mask(ptr %src) { ; CHECK-ALM-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 ; CHECK-ALM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-ALM: middle.block: -; CHECK-ALM-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ALM-NEXT: br label [[EXIT:%.*]] ; CHECK-ALM: scalar.ph: -; CHECK-ALM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ALM-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ALM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] +; CHECK-ALM-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-ALM-NEXT: br label [[LOOP:%.*]] ; CHECK-ALM: loop: ; CHECK-ALM-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NARROW:%.*]], [[LOOP]] ] @@ -237,10 +237,10 @@ define float @fadd_reduction_with_live_in(float %inc) { ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1002 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1002, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -273,10 +273,10 @@ define float @fadd_reduction_with_live_in(float %inc) { ; CHECK-ALM-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1002 ; CHECK-ALM-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-ALM: middle.block: -; CHECK-ALM-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-ALM-NEXT: br label [[EXIT:%.*]] ; CHECK-ALM: scalar.ph: -; CHECK-ALM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1002, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-ALM-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-ALM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] +; CHECK-ALM-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-ALM-NEXT: br label [[LOOP:%.*]] ; CHECK-ALM: loop: ; CHECK-ALM-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll index 3a54244a41017..3cf8b3f4bf2b7 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll @@ -53,9 +53,9 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 204 ; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: 
[[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 204, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll index a757314ec7a46..d2d99827d5f35 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: vector.body: ; CHECK-LABEL: middle.block: -; CHECK-NEXT: br i1 true, +; CHECK-NEXT: br label %while.end.loopexit target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll index ff089c7401d53..adc4c8e2f2d8e 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-optimize-vector-induction-width.ll @@ -33,9 +33,9 @@ define void @canonical_small_tc_i8(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -94,9 +94,9 @@ define void @canonical_upper_limit_i8(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 ; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -155,9 +155,9 @@ define void @canonical_lower_limit_i16(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 258 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 258, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; 
CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -216,9 +216,9 @@ define void @canonical_upper_limit_i16(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536 ; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65536, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -277,9 +277,9 @@ define void @canonical_lower_limit_i32(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65538 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 65538, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -338,9 +338,9 @@ define void @canonical_upper_limit_i32(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296 ; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967296, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -399,9 +399,9 @@ define void @canonical_lower_limit_i64(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP8]], 4294967298 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4294967298, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -460,9 +460,9 @@ define void @canonical_upper_limit_i64(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP8]], 0 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -521,9 +521,9 @@ define void @canonical_lower_limit_i128(ptr nocapture noundef writeonly %p) { ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i256 [[TMP8]], 18446744073709551618 ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[END:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i256 [ 18446744073709551618, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i256 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i256 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll index a73958cb30543..222c1eeb6e443 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-switch.ll @@ -54,9 +54,9 @@ define void @tail_fold_switch(ptr %dst, i32 %0) { ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8 ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 8, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll index 455d1bc96e226..caa5969bbc365 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -53,9 +53,9 @@ define void @VF1-VPlanExe(ptr %dst) { ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -88,7 +88,6 @@ define void @VF1-VPWidenCanonicalIVRecipeExe(ptr %ptr1) { ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds double, ptr [[PTR1:%.*]], i64 15 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 128 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = 
phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] @@ -132,9 +131,9 @@ define void @VF1-VPWidenCanonicalIVRecipeExe(ptr %ptr1) { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR1]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[PTR1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll index 665bbd9f82eff..476ba3d5d1f35 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination.ll @@ -163,7 +163,6 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 ; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 -; VF8UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] ; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF8UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -227,9 +226,9 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF1: [[PRED_STORE_CONTINUE14]]: ; VF8UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF1: [[MIDDLE_BLOCK]]: -; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF8UF1-NEXT: br label %[[EXIT:.*]] ; VF8UF1: [[SCALAR_PH]]: -; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[ENTRY]] ] ; VF8UF1-NEXT: br label %[[LOOP:.*]] ; VF8UF1: [[LOOP]]: ; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -251,7 +250,6 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 -; VF8UF2-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] ; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF8UF2-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] @@ -372,9 +370,9 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF8UF2: [[PRED_STORE_CONTINUE30]]: ; VF8UF2-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF2: [[MIDDLE_BLOCK]]: -; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF8UF2-NEXT: br label %[[EXIT:.*]] ; VF8UF2: [[SCALAR_PH]]: -; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; VF8UF2-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[ENTRY]] ] ; VF8UF2-NEXT: br label %[[LOOP:.*]] ; VF8UF2: [[LOOP]]: ; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -396,7 +394,6 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF16UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 -; VF16UF1-NEXT: [[TMP1:%.*]] = add i64 2, [[N_VEC]] ; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF16UF1-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -516,9 +513,9 @@ define void @remove_loop_region_with_replicate_recipe(ptr %dst, i64 range(i64 5, ; VF16UF1: [[PRED_STORE_CONTINUE30]]: ; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF16UF1: [[MIDDLE_BLOCK]]: -; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF16UF1-NEXT: br label %[[EXIT:.*]] ; VF16UF1: [[SCALAR_PH]]: -; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 2, %[[ENTRY]] ] +; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2, %[[ENTRY]] ] ; VF16UF1-NEXT: br label %[[LOOP:.*]] ; VF16UF1: [[LOOP]]: ; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -725,7 +722,6 @@ define void @scev_expand_step(i64 %x, ptr %dst) { ; VF8UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 8 ; VF8UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1 -; VF8UF1-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]] ; VF8UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF8UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -813,9 +809,9 @@ define void @scev_expand_step(i64 %x, ptr %dst) { ; VF8UF1: [[PRED_STORE_CONTINUE14]]: ; VF8UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]] ; VF8UF1: [[MIDDLE_BLOCK]]: -; VF8UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; VF8UF1-NEXT: br label %[[EXIT:.*]] ; VF8UF1: [[SCALAR_PH]]: -; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF8UF1-NEXT: br label %[[LOOP:.*]] ; VF8UF1: [[LOOP]]: ; VF8UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] @@ -842,7 +838,6 @@ define void @scev_expand_step(i64 %x, ptr %dst) { ; VF8UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16 ; VF8UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; VF8UF2-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1 -; VF8UF2-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]] ; VF8UF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; VF8UF2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1011,9 +1006,9 @@ define void @scev_expand_step(i64 %x, ptr %dst) { ; VF8UF2: [[PRED_STORE_CONTINUE30]]: ; VF8UF2-NEXT: br label 
%[[MIDDLE_BLOCK:.*]]
; VF8UF2: [[MIDDLE_BLOCK]]:
-; VF8UF2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF8UF2-NEXT: br label %[[EXIT:.*]]
; VF8UF2: [[SCALAR_PH]]:
-; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF8UF2-NEXT: br label %[[LOOP:.*]]
; VF8UF2: [[LOOP]]:
; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -1040,7 +1035,6 @@ define void @scev_expand_step(i64 %x, ptr %dst) {
; VF16UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; VF16UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; VF16UF1-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
-; VF16UF1-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], [[STEP]]
; VF16UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; VF16UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -1208,9 +1202,9 @@ define void @scev_expand_step(i64 %x, ptr %dst) {
; VF16UF1: [[PRED_STORE_CONTINUE30]]:
; VF16UF1-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; VF16UF1: [[MIDDLE_BLOCK]]:
-; VF16UF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; VF16UF1-NEXT: br label %[[EXIT:.*]]
; VF16UF1: [[SCALAR_PH]]:
-; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; VF16UF1-NEXT: br label %[[LOOP:.*]]
; VF16UF1: [[LOOP]]:
; VF16UF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 6d0701c4a149b..cf9991d68fce6 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -262,7 +262,6 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) {
; CHECK-NEXT: Successor(s): vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<21> + vp<[[VEC_TC]]> * ir<1>
; CHECK-NEXT: CLONE ir<%gep.A.uniform> = getelementptr inbounds ir<%A>, ir<0>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
@@ -1049,7 +1048,6 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
; CHECK-NEXT: Successor(s): vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
-; CHECK-NEXT: vp<[[END:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: vector loop: {
@@ -1086,23 +1084,7 @@ define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr n
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT branch-on-cond ir<true>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph
-; CHECK-EMPTY:
-; CHECK-NEXT: scalar.ph
-; CHECK-NEXT: EMIT vp<[[RESUME:%.+]]> = resume-phi vp<[[END]]>, ir<%n>
-; CHECK-NEXT: Successor(s): ir-bb<loop>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<loop>:
-; CHECK-NEXT: IR %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ] (extra operand: vp<[[RESUME]]> from scalar.ph)
-; CHECK-NEXT: IR %iv.next = add nsw i32 %iv, -1
-; CHECK-NEXT: IR %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
-; CHECK-NEXT: IR %l = load i32, ptr %gep.src, align 16
-; CHECK-NEXT: IR %dead_gep = getelementptr inbounds i32, ptr %dst, i64 1
-; CHECK-NEXT: IR %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
-; CHECK-NEXT: IR store i32 %l, ptr %gep.dst, align 16
-; CHECK-NEXT: IR %ec = icmp eq i32 %iv.next, 0
-; CHECK-NEXT: No successors
+; CHECK-NEXT: Successor(s): ir-bb<exit>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>
; CHECK-NEXT: No successors

From aca270877fc607a5558ff0a0f104fd5b6bb8fc62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?=
Date: Fri, 4 Apr 2025 16:27:25 +0100
Subject: [PATCH 0650/1029] [SLP] Use named structs in vectorizeStores() (NFC) (#132781)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a mostly straightforward replacement of the previous
`std::pair<unsigned, std::set<std::pair<unsigned, int>>>` data structure
used in `SLPVectorizerPass::vectorizeStores()` with slightly more readable
alternatives. I had done that change in my local tree to help me better
understand the code. It’s not very invasive, so I thought I’d create a PR
for it.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 118 ++++++++++--------
 1 file changed, 69 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c384b11bbc1a5..f799c46ab2875 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -20322,6 +20322,38 @@ static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
   return Dev * 96 / (Mean * Mean) == 0;
 }
 
+namespace {
+
+/// A group of stores that we'll try to bundle together using vector ops.
+/// They are ordered using the signed distance of their address operand to the
+/// address of this group's BaseInstr.
+struct RelatedStoreInsts {
+  RelatedStoreInsts(unsigned BaseInstrIdx) { reset(BaseInstrIdx); }
+  void reset(unsigned NewBaseInstr) {
+    BaseInstrIdx = NewBaseInstr;
+    Instrs.clear();
+    insertOrLookup(NewBaseInstr, 0);
+  }
+
+  /// Tries to insert \p InstrIdx as the store with a pointer distance of
+  /// \p PtrDist.
+  /// Does nothing if there is already a store with that \p PtrDist.
+  /// \returns The previously associated Instruction index, or std::nullopt
+  std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int PtrDist) {
+    auto [It, Inserted] = Instrs.emplace(PtrDist, InstrIdx);
+    return Inserted ? std::nullopt : std::optional(It->second);
+  }
+
+  /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
+  unsigned BaseInstrIdx;
+
+  /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
+  using DistToInstMap = std::map<int, unsigned>;
+  DistToInstMap Instrs;
+};
+
+} // end anonymous namespace
+
 bool SLPVectorizerPass::vectorizeStores(
     ArrayRef<StoreInst *> Stores, BoUpSLP &R,
     DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
@@ -20331,31 +20363,22 @@ bool SLPVectorizerPass::vectorizeStores(
   BoUpSLP::ValueSet VectorizedStores;
   bool Changed = false;
 
-  struct StoreDistCompare {
-    bool operator()(const std::pair<unsigned, int> &Op1,
-                    const std::pair<unsigned, int> &Op2) const {
-      return Op1.second < Op2.second;
-    }
-  };
-  // A set of pairs (index of store in Stores array ref, Distance of the store
-  // address relative to base store address in units).
-  using StoreIndexToDistSet =
-      std::set<std::pair<unsigned, int>, StoreDistCompare>;
-  auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
+  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
     int PrevDist = -1;
     BoUpSLP::ValueList Operands;
     // Collect the chain into a list.
-    for (auto [Idx, Data] : enumerate(Set)) {
-      if (Operands.empty() || Data.second - PrevDist == 1) {
-        Operands.push_back(Stores[Data.first]);
-        PrevDist = Data.second;
-        if (Idx != Set.size() - 1)
+    for (auto [Idx, Data] : enumerate(StoreSeq)) {
+      auto &[Dist, InstIdx] = Data;
+      if (Operands.empty() || Dist - PrevDist == 1) {
+        Operands.push_back(Stores[InstIdx]);
+        PrevDist = Dist;
+        if (Idx != StoreSeq.size() - 1)
           continue;
       }
-      auto E = make_scope_exit([&, &DataVar = Data]() {
+      auto E = make_scope_exit([&, &Dist = Dist, &InstIdx = InstIdx]() {
         Operands.clear();
-        Operands.push_back(Stores[DataVar.first]);
-        PrevDist = DataVar.second;
+        Operands.push_back(Stores[InstIdx]);
+        PrevDist = Dist;
       });
 
       if (Operands.size() <= 1 ||
@@ -20622,7 +20645,8 @@ bool SLPVectorizerPass::vectorizeStores(
   // Need to store the index of the very first store separately, since the set
   // may be reordered after the insertion and the first store may be moved. This
   // container allows to reduce number of calls of getPointersDiff() function.
-  SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
+  SmallVector<RelatedStoreInsts> SortedStores;
+
   // Inserts the specified store SI with the given index Idx to the set of the
   // stores. If the store with the same distance is found already - stop
   // insertion, try to vectorize already found stores. If some stores from this
@@ -20656,56 +20680,52 @@ bool SLPVectorizerPass::vectorizeStores(
   // dependencies and no need to waste compile time to try to vectorize them.
   // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
   auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
-    for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
+    for (RelatedStoreInsts &StoreSeq : SortedStores) {
       std::optional<int> Diff = getPointersDiff(
-          Stores[Set.first]->getValueOperand()->getType(),
-          Stores[Set.first]->getPointerOperand(),
+          Stores[StoreSeq.BaseInstrIdx]->getValueOperand()->getType(),
+          Stores[StoreSeq.BaseInstrIdx]->getPointerOperand(),
           SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
           /*StrictCheck=*/true);
       if (!Diff)
         continue;
-      auto It = Set.second.find(std::make_pair(Idx, *Diff));
-      if (It == Set.second.end()) {
-        Set.second.emplace(Idx, *Diff);
+      std::optional<unsigned> PrevInst =
+          StoreSeq.insertOrLookup(/*InstrIdx=*/Idx, /*PtrDist=*/*Diff);
+      if (!PrevInst) {
+        // No store was associated to that distance. Keep collecting.
         return;
       }
       // Try to vectorize the first found set to avoid duplicate analysis.
-      TryToVectorize(Set.second);
-      unsigned ItIdx = It->first;
-      int ItDist = It->second;
-      StoreIndexToDistSet PrevSet;
-      copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
-              [&](const std::pair<unsigned, int> &Pair) {
-                return Pair.first > ItIdx;
+      TryToVectorize(StoreSeq.Instrs);
+      RelatedStoreInsts::DistToInstMap PrevSet;
+      copy_if(StoreSeq.Instrs, std::inserter(PrevSet, PrevSet.end()),
+              [&](const std::pair<int, unsigned> &DistAndIdx) {
+                return DistAndIdx.second > *PrevInst;
               });
-      Set.second.clear();
-      Set.first = Idx;
-      Set.second.emplace(Idx, 0);
+      StoreSeq.reset(Idx);
       // Insert stores that followed previous match to try to vectorize them
      // with this store.
-      unsigned StartIdx = ItIdx + 1;
+      unsigned StartIdx = *PrevInst + 1;
       SmallBitVector UsedStores(Idx - StartIdx);
       // Distances to previously found dup store (or this store, since they
       // store to the same addresses).
       SmallVector<int> Dists(Idx - StartIdx, 0);
-      for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
+      for (auto [PtrDist, InstIdx] : reverse(PrevSet)) {
         // Do not try to vectorize sequences, we already tried.
-        if (VectorizedStores.contains(Stores[Pair.first]))
+        if (VectorizedStores.contains(Stores[InstIdx]))
           break;
-        unsigned BI = Pair.first - StartIdx;
+        unsigned BI = InstIdx - StartIdx;
         UsedStores.set(BI);
-        Dists[BI] = Pair.second - ItDist;
+        Dists[BI] = PtrDist - *Diff;
       }
       for (unsigned I = StartIdx; I < Idx; ++I) {
         unsigned BI = I - StartIdx;
         if (UsedStores.test(BI))
-          Set.second.emplace(I, Dists[BI]);
+          StoreSeq.insertOrLookup(I, Dists[BI]);
       }
       return;
     }
-    auto &Res = SortedStores.emplace_back();
-    Res.first = Idx;
-    Res.second.emplace(Idx, 0);
+    // We did not find a comparable store, start a new sequence.
+    SortedStores.emplace_back(Idx);
   };
   Type *PrevValTy = nullptr;
   for (auto [I, SI] : enumerate(Stores)) {
@@ -20715,8 +20735,8 @@ bool SLPVectorizerPass::vectorizeStores(
       PrevValTy = SI->getValueOperand()->getType();
     // Check that we do not try to vectorize stores of different types.
     if (PrevValTy != SI->getValueOperand()->getType()) {
-      for (auto &Set : SortedStores)
-        TryToVectorize(Set.second);
+      for (RelatedStoreInsts &StoreSeq : SortedStores)
+        TryToVectorize(StoreSeq.Instrs);
       SortedStores.clear();
       PrevValTy = SI->getValueOperand()->getType();
     }
@@ -20724,8 +20744,8 @@
   }
 
   // Final vectorization attempt.
-  for (auto &Set : SortedStores)
-    TryToVectorize(Set.second);
+  for (RelatedStoreInsts &StoreSeq : SortedStores)
+    TryToVectorize(StoreSeq.Instrs);
 
   return Changed;
 }

From c8bde44cfcc75a8389f1a72917e0aadc125f5e22 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Fri, 4 Apr 2025 08:40:51 -0700
Subject: [PATCH 0651/1029] [flang] Implement FSEEK and FTELL (#133003)

Add function and subroutine forms of FSEEK and FTELL as intrinsic
procedures. Accept common aliases from legacy compilers as well.

A separate patch to llvm-test-suite will enable tests for these
procedures once this patch has merged.

Depends on https://github.com/llvm/llvm-project/pull/132423; CI builds
will likely fail until that patch is merged and this PR is rebased.
---
 flang-rt/lib/runtime/extensions.cpp           | 30 ++++++++
 flang-rt/lib/runtime/unit.cpp                 | 45 +++++++++--
 flang-rt/lib/runtime/unit.h                   | 14 +++-
 flang/docs/Intrinsics.md                      | 38 ++++++++++
 .../flang/Optimizer/Builder/IntrinsicCall.h   |  4 +
 .../Optimizer/Builder/Runtime/Intrinsics.h    |  5 ++
 flang/include/flang/Runtime/extensions.h      |  5 ++
 flang/lib/Evaluate/intrinsics.cpp             | 32 ++++++--
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 74 +++++++++++++++++++
 .../Optimizer/Builder/Runtime/Intrinsics.cpp  | 24 ++++++
 10 files changed, 255 insertions(+), 16 deletions(-)

diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp
index a73279e445797..6b553ff97e5ab 100644
--- a/flang-rt/lib/runtime/extensions.cpp
+++ b/flang-rt/lib/runtime/extensions.cpp
@@ -10,12 +10,14 @@
 // extensions that will eventually be implemented in Fortran.
#include "flang/Runtime/extensions.h" +#include "unit.h" #include "flang-rt/runtime/descriptor.h" #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" #include "flang/Runtime/command.h" #include "flang/Runtime/entry-names.h" #include "flang/Runtime/io-api.h" +#include "flang/Runtime/iostat-consts.h" #include #include #include @@ -275,5 +277,33 @@ void RTNAME(Perror)(const char *str) { perror(str); } // GNU extension function TIME() std::int64_t RTNAME(time)() { return time(nullptr); } +// Extension procedures related to I/O + +namespace io { +std::int32_t RTNAME(Fseek)(int unitNumber, std::int64_t zeroBasedPos, + int whence, const char *sourceFileName, int lineNumber) { + if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { + Terminator terminator{sourceFileName, lineNumber}; + IoErrorHandler handler{terminator}; + if (unit->Fseek( + zeroBasedPos, static_cast(whence), handler)) { + return IostatOk; + } else { + return IostatCannotReposition; + } + } else { + return IostatBadUnitNumber; + } +} + +std::int64_t RTNAME(Ftell)(int unitNumber) { + if (ExternalFileUnit * unit{ExternalFileUnit::LookUp(unitNumber)}) { + return unit->InquirePos() - 1; // zero-based result + } else { + return -1; + } +} +} // namespace io + } // namespace Fortran::runtime } // extern "C" diff --git a/flang-rt/lib/runtime/unit.cpp b/flang-rt/lib/runtime/unit.cpp index 43501aeb48458..199287d7237fd 100644 --- a/flang-rt/lib/runtime/unit.cpp +++ b/flang-rt/lib/runtime/unit.cpp @@ -441,14 +441,14 @@ void ExternalFileUnit::Rewind(IoErrorHandler &handler) { "REWIND(UNIT=%d) on non-sequential file", unitNumber()); } else { DoImpliedEndfile(handler); - SetPosition(0, handler); + SetPosition(0); currentRecordNumber = 1; leftTabLimit.reset(); anyWriteSinceLastPositioning_ = false; } } -void ExternalFileUnit::SetPosition(std::int64_t pos, IoErrorHandler &handler) { +void ExternalFileUnit::SetPosition(std::int64_t pos) { frameOffsetInFile_ = pos; recordOffsetInFrame_ = 0; if (access == Access::Direct) { @@ -457,6 +457,18 @@ void ExternalFileUnit::SetPosition(std::int64_t pos, IoErrorHandler &handler) { BeginRecord(); } +void ExternalFileUnit::Sought(std::int64_t zeroBasedPos) { + SetPosition(zeroBasedPos); + if (zeroBasedPos == 0) { + currentRecordNumber = 1; + } else { + // We no longer know which record we're in. Set currentRecordNumber to + // a large value from whence we can both advance and backspace. + currentRecordNumber = std::numeric_limits::max() / 2; + endfileRecordNumber.reset(); + } +} + bool ExternalFileUnit::SetStreamPos( std::int64_t oneBasedPos, IoErrorHandler &handler) { if (access != Access::Stream) { @@ -474,14 +486,31 @@ bool ExternalFileUnit::SetStreamPos( frameOffsetInFile_ + recordOffsetInFrame_) { DoImpliedEndfile(handler); } - SetPosition(oneBasedPos - 1, handler); - // We no longer know which record we're in. Set currentRecordNumber to - // a large value from whence we can both advance and backspace. 
- currentRecordNumber = std::numeric_limits::max() / 2; - endfileRecordNumber.reset(); + Sought(oneBasedPos - 1); return true; } +// GNU FSEEK extension +RT_API_ATTRS bool ExternalFileUnit::Fseek(std::int64_t zeroBasedPos, + enum FseekWhence whence, IoErrorHandler &handler) { + if (whence == FseekEnd) { + Flush(handler); // updates knownSize_ + if (auto size{knownSize()}) { + zeroBasedPos += *size; + } else { + return false; + } + } else if (whence == FseekCurrent) { + zeroBasedPos += InquirePos() - 1; + } + if (zeroBasedPos >= 0) { + Sought(zeroBasedPos); + return true; + } else { + return false; + } +} + bool ExternalFileUnit::SetDirectRec( std::int64_t oneBasedRec, IoErrorHandler &handler) { if (access != Access::Direct) { @@ -498,7 +527,7 @@ bool ExternalFileUnit::SetDirectRec( return false; } currentRecordNumber = oneBasedRec; - SetPosition((oneBasedRec - 1) * *openRecl, handler); + SetPosition((oneBasedRec - 1) * *openRecl); return true; } diff --git a/flang-rt/lib/runtime/unit.h b/flang-rt/lib/runtime/unit.h index bb3d3650da34b..86e5639f1250e 100644 --- a/flang-rt/lib/runtime/unit.h +++ b/flang-rt/lib/runtime/unit.h @@ -33,6 +33,12 @@ class UnitMap; class ChildIo; class ExternalFileUnit; +enum FseekWhence { + FseekSet = 0, + FseekCurrent = 1, + FseekEnd = 2, +}; + RT_OFFLOAD_VAR_GROUP_BEGIN // Predefined file units. extern RT_VAR_ATTRS ExternalFileUnit *defaultInput; // unit 5 @@ -176,8 +182,9 @@ class ExternalFileUnit : public ConnectionState, RT_API_ATTRS void Endfile(IoErrorHandler &); RT_API_ATTRS void Rewind(IoErrorHandler &); RT_API_ATTRS void EndIoStatement(); - RT_API_ATTRS bool SetStreamPos( - std::int64_t, IoErrorHandler &); // one-based, for POS= + RT_API_ATTRS bool SetStreamPos(std::int64_t oneBasedPos, IoErrorHandler &); + RT_API_ATTRS bool Fseek( + std::int64_t zeroBasedPos, enum FseekWhence, IoErrorHandler &); RT_API_ATTRS bool SetDirectRec( std::int64_t, IoErrorHandler &); // one-based, for REC= RT_API_ATTRS std::int64_t InquirePos() const { @@ -196,7 +203,8 @@ class ExternalFileUnit : public ConnectionState, static RT_API_ATTRS UnitMap &CreateUnitMap(); static RT_API_ATTRS UnitMap &GetUnitMap(); RT_API_ATTRS const char *FrameNextInput(IoErrorHandler &, std::size_t); - RT_API_ATTRS void SetPosition(std::int64_t, IoErrorHandler &); // zero-based + RT_API_ATTRS void SetPosition(std::int64_t zeroBasedPos); + RT_API_ATTRS void Sought(std::int64_t zeroBasedPos); RT_API_ATTRS void BeginSequentialVariableUnformattedInputRecord( IoErrorHandler &); RT_API_ATTRS void BeginVariableFormattedInputRecord(IoErrorHandler &); diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 8b675c33b09d1..ecf6fbeabd654 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -1197,6 +1197,44 @@ program chdir_func end program chdir_func ``` +### Non-Standard Intrinsics: FSEEK and FTELL + +#### Description +`FSEEK(UNIT, OFFSET, WHENCE)` Sets position in file opened as `UNIT`, returns status. + +`CALL FSEEK(UNIT, OFFSET, WHENCE[, STATUS])` Sets position, returns any error in `STATUS` if present. + +`FTELL(UNIT)` Returns current absolute byte offset. + +`CALL FTELL(UNIT, OFFSET)` Set `OFFSET` to current byte offset in file. + +These intrinsic procedures are available as both functions and subroutines, +but both forms cannot be used in the same scope. + +These arguments must all be integers. +The value returned from the function form of `FTELL` is `INTEGER(8)`. 
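+
+For illustration only, a minimal sketch of typical use before the argument
+table below (the unit number, file name, and offset values are arbitrary):
+
+```fortran
+program demo_fseek_ftell
+  integer :: status
+  integer(8) :: pos
+  open(10, file='data.bin', access='stream', form='unformatted')
+  call fseek(10, 4_8, 0, status) ! WHENCE=0: seek to absolute byte offset 4
+  pos = ftell(10)                ! pos is 4 on success
+  close(10)
+end program
+```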
+
+| | |
+|------------|-------------------------------------------------|
+| `UNIT`     | An open unit number                             |
+| `OFFSET`   | A byte offset; set to -1 by `FTELL` on error    |
+| `WHENCE`   | 0: `OFFSET` is an absolute position             |
+|            | 1: `OFFSET` is relative to the current position |
+|            | 2: `OFFSET` is relative to the end of the file  |
+| `STATUS`   | Set to a nonzero value if an error occurs       |
+
+The aliases `FSEEK64`, `FSEEKO64`, `FSEEKI8`, `FTELL64`, `FTELLO64`, and
+`FTELLI8` are also accepted for further compatibility.
+
+Avoid using these intrinsics in new code when the standard `ACCESS="STREAM"`
+feature meets your needs.
+
+#### Usage and Info
+
+- **Standard:** Extensions to GNU, Intel, and SUN (at least)
+- **Class:** Subroutine, function
+
 ### Non-Standard Intrinsics: IERRNO
 
 #### Description
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 29cde05480173..00b7b696eb4f9 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -266,6 +266,10 @@ struct IntrinsicLibrary {
   mlir::Value genFraction(mlir::Type resultType,
                           mlir::ArrayRef<mlir::Value> args);
   void genFree(mlir::ArrayRef<fir::ExtendedValue> args);
+  fir::ExtendedValue genFseek(std::optional<mlir::Type>,
+                              mlir::ArrayRef<fir::ExtendedValue> args);
+  fir::ExtendedValue genFtell(std::optional<mlir::Type>,
+                              mlir::ArrayRef<fir::ExtendedValue> args);
   fir::ExtendedValue genGetCwd(std::optional<mlir::Type> resultType,
                                llvm::ArrayRef<fir::ExtendedValue> args);
   void genGetCommand(mlir::ArrayRef<fir::ExtendedValue> args);
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h
index 2e5adf6bd0ab7..9ca4b2baeaa65 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h
@@ -49,6 +49,11 @@ void genEtime(fir::FirOpBuilder &builder, mlir::Location loc,
 
 void genFree(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value ptr);
 
+mlir::Value genFseek(fir::FirOpBuilder &builder, mlir::Location loc,
+                     mlir::Value unit, mlir::Value offset, mlir::Value whence);
+mlir::Value genFtell(fir::FirOpBuilder &builder, mlir::Location loc,
+                     mlir::Value unit);
+
 mlir::Value genGetUID(fir::FirOpBuilder &, mlir::Location);
 mlir::Value genGetGID(fir::FirOpBuilder &, mlir::Location);
 
diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h
index 47ef4c12ef73a..db2245875e85a 100644
--- a/flang/include/flang/Runtime/extensions.h
+++ b/flang/include/flang/Runtime/extensions.h
@@ -38,6 +38,11 @@ void FORTRAN_PROCEDURE_NAME(fdate)(char *string, std::int64_t length);
 
 void RTNAME(Free)(std::intptr_t ptr);
 
+// Common extensions FSEEK & FTELL, variously named
+std::int32_t RTNAME(Fseek)(int unit, std::int64_t zeroBasedPos, int whence,
+    const char *sourceFileName, int lineNumber);
+std::int64_t RTNAME(Ftell)(int unit);
+
 // GNU Fortran 77 compatibility function IARGC.
std::int32_t FORTRAN_PROCEDURE_NAME(iargc)(); diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index e4f82b7fddb02..ed90b4bc097dd 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -545,6 +545,12 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ KINDInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"floor", {{"a", AnyReal}, DefaultingKIND}, KINDInt}, {"fraction", {{"x", SameReal}}, SameReal}, + {"fseek", + {{"unit", AnyInt, Rank::scalar}, {"offset", AnyInt, Rank::scalar}, + {"whence", AnyInt, Rank::scalar}}, + DefaultInt, Rank::scalar}, + {"ftell", {{"unit", AnyInt, Rank::scalar}}, + TypePattern{IntType, KindCode::exactKind, 8}, Rank::scalar}, {"gamma", {{"x", SameReal}}, SameReal}, {"get_team", {{"level", DefaultInt, Rank::scalar, Optionality::optional}}, TeamType, Rank::scalar, IntrinsicClass::transformationalFunction}, @@ -1083,11 +1089,16 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ // LOC, probably others // TODO: Optionally warn on operand promotion extension -// Aliases for a few generic intrinsic functions for legacy -// compatibility and builtins. +// Aliases for a few generic procedures for legacy compatibility and builtins. static const std::pair genericAlias[]{ {"and", "iand"}, {"getenv", "get_environment_variable"}, + {"fseek64", "fseek"}, + {"fseeko64", "fseek"}, // SUN + {"fseeki8", "fseek"}, // Intel + {"ftell64", "ftell"}, + {"ftello64", "ftell"}, // SUN + {"ftelli8", "ftell"}, // Intel {"imag", "aimag"}, {"lshift", "shiftl"}, {"or", "ior"}, @@ -1524,6 +1535,17 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"exit", {{"status", DefaultInt, Rank::scalar, Optionality::optional}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"free", {{"ptr", Addressable}}, {}}, + {"fseek", + {{"unit", AnyInt, Rank::scalar}, {"offset", AnyInt, Rank::scalar}, + {"whence", AnyInt, Rank::scalar}, + {"status", AnyInt, Rank::scalar, Optionality::optional, + common::Intent::InOut}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"ftell", + {{"unit", AnyInt, Rank::scalar}, + {"offset", AnyInt, Rank::scalar, Optionality::required, + common::Intent::Out}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"get_command", {{"command", DefaultChar, Rank::scalar, Optionality::optional, common::Intent::Out}, @@ -2811,9 +2833,9 @@ bool IntrinsicProcTable::Implementation::IsDualIntrinsic( const std::string &name) const { // Collection for some intrinsics with function and subroutine form, // in order to pass the semantic check. 
- static const std::string dualIntrinsic[]{{"chdir"s}, {"etime"s}, {"getcwd"s}, - {"hostnm"s}, {"rename"s}, {"second"s}, {"system"s}, {"unlink"s}}; - + static const std::string dualIntrinsic[]{{"chdir"}, {"etime"}, {"fseek"}, + {"ftell"}, {"getcwd"}, {"hostnm"}, {"rename"}, {"second"}, {"system"}, + {"unlink"}}; return llvm::is_contained(dualIntrinsic, name); } diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 2df9349269a69..0ca636bc091ec 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -462,6 +462,17 @@ static constexpr IntrinsicHandler handlers[]{ {"floor", &I::genFloor}, {"fraction", &I::genFraction}, {"free", &I::genFree}, + {"fseek", + &I::genFseek, + {{{"unit", asValue}, + {"offset", asValue}, + {"whence", asValue}, + {"status", asAddr, handleDynamicOptional}}}, + /*isElemental=*/false}, + {"ftell", + &I::genFtell, + {{{"unit", asValue}, {"offset", asAddr}}}, + /*isElemental=*/false}, {"get_command", &I::genGetCommand, {{{"command", asBox, handleDynamicOptional}, @@ -4139,6 +4150,69 @@ void IntrinsicLibrary::genFree(llvm::ArrayRef args) { fir::runtime::genFree(builder, loc, fir::getBase(args[0])); } +// FSEEK +fir::ExtendedValue +IntrinsicLibrary::genFseek(std::optional resultType, + llvm::ArrayRef args) { + assert((args.size() == 4 && !resultType.has_value()) || + (args.size() == 3 && resultType.has_value())); + mlir::Value unit = fir::getBase(args[0]); + mlir::Value offset = fir::getBase(args[1]); + mlir::Value whence = fir::getBase(args[2]); + if (!unit) + fir::emitFatalError(loc, "expected UNIT argument"); + if (!offset) + fir::emitFatalError(loc, "expected OFFSET argument"); + if (!whence) + fir::emitFatalError(loc, "expected WHENCE argument"); + mlir::Value statusValue = + fir::runtime::genFseek(builder, loc, unit, offset, whence); + if (resultType.has_value()) { // function + return builder.createConvert(loc, *resultType, statusValue); + } else { // subroutine + const fir::ExtendedValue &statusVar = args[3]; + if (!isStaticallyAbsent(statusVar)) { + mlir::Value statusAddr = fir::getBase(statusVar); + mlir::Value statusIsPresentAtRuntime = + builder.genIsNotNullAddr(loc, statusAddr); + builder.genIfThen(loc, statusIsPresentAtRuntime) + .genThen([&]() { + builder.createStoreWithConvert(loc, statusValue, statusAddr); + }) + .end(); + } + return {}; + } +} + +// FTELL +fir::ExtendedValue +IntrinsicLibrary::genFtell(std::optional resultType, + llvm::ArrayRef args) { + assert((args.size() == 2 && !resultType.has_value()) || + (args.size() == 1 && resultType.has_value())); + mlir::Value unit = fir::getBase(args[0]); + if (!unit) + fir::emitFatalError(loc, "expected UNIT argument"); + mlir::Value offsetValue = fir::runtime::genFtell(builder, loc, unit); + if (resultType.has_value()) { // function + return offsetValue; + } else { // subroutine + const fir::ExtendedValue &offsetVar = args[1]; + if (!isStaticallyAbsent(offsetVar)) { + mlir::Value offsetAddr = fir::getBase(offsetVar); + mlir::Value offsetIsPresentAtRuntime = + builder.genIsNotNullAddr(loc, offsetAddr); + builder.genIfThen(loc, offsetIsPresentAtRuntime) + .genThen([&]() { + builder.createStoreWithConvert(loc, offsetValue, offsetAddr); + }) + .end(); + } + return {}; + } +} + // GETCWD fir::ExtendedValue IntrinsicLibrary::genGetCwd(std::optional resultType, diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index 
3aad0625042a2..773d6408079cc 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
@@ -128,6 +128,29 @@ void fir::runtime::genFree(fir::FirOpBuilder &builder, mlir::Location loc,
       builder.createConvert(loc, intPtrTy, ptr));
 }
 
+mlir::Value fir::runtime::genFseek(fir::FirOpBuilder &builder,
+                                   mlir::Location loc, mlir::Value unit,
+                                   mlir::Value offset, mlir::Value whence) {
+  auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Fseek)>(loc, builder);
+  mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType();
+  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+  mlir::Value sourceLine =
+      fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(2));
+  llvm::SmallVector<mlir::Value> args =
+      fir::runtime::createArguments(builder, loc, runtimeFuncTy, unit, offset,
+                                    whence, sourceFile, sourceLine);
+  return builder.create<fir::CallOp>(loc, runtimeFunc, args).getResult(0);
+}
+
+mlir::Value fir::runtime::genFtell(fir::FirOpBuilder &builder,
+                                   mlir::Location loc, mlir::Value unit) {
+  auto runtimeFunc = fir::runtime::getRuntimeFunc<mkRTKey(Ftell)>(loc, builder);
+  mlir::FunctionType runtimeFuncTy = runtimeFunc.getFunctionType();
+  llvm::SmallVector<mlir::Value> args =
+      fir::runtime::createArguments(builder, loc, runtimeFuncTy, unit);
+  return builder.create<fir::CallOp>(loc, runtimeFunc, args).getResult(0);
+}
+
 mlir::Value fir::runtime::genGetGID(fir::FirOpBuilder &builder,
                                     mlir::Location loc) {
   auto runtimeFunc =

From 3674a5f18e8d34cc597e0bd81f38fab3731139f5 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Fri, 4 Apr 2025 08:41:32 -0700
Subject: [PATCH 0652/1029] [flang] Permit unused USE association of
 subprogram name (#134009)

A function or subroutine can tolerate a use-associated entity of the same
name appearing in its scope, so long as that name is never actually used.
This is similar to the case of a name being imported from multiple distinct
modules, and it is implemented with the same representation. It's not clear
whether this is conforming behavior or a common extension.
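
For illustration, a minimal sketch of the pattern that is now accepted
(module and procedure names here are arbitrary):

    module m
    contains
      subroutine foo
      end subroutine
    end module
    subroutine foo
      use m  ! now only a portability warning, unless 'foo' is referenced
    end subroutine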
--- .../include/flang/Support/Fortran-features.h | 2 +- flang/lib/Semantics/resolve-names.cpp | 23 +++++++++++++++--- flang/lib/Semantics/tools.cpp | 24 +++++++++++-------- flang/lib/Support/Fortran-features.cpp | 1 + flang/test/Semantics/resolve18.f90 | 8 +++++-- 5 files changed, 42 insertions(+), 16 deletions(-) diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h index 356623c643e46..335273100d70e 100644 --- a/flang/include/flang/Support/Fortran-features.h +++ b/flang/include/flang/Support/Fortran-features.h @@ -75,7 +75,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, VectorSubscriptFinalization, UndefinedFunctionResult, UselessIomsg, MismatchingDummyProcedure, SubscriptedEmptyArray, UnsignedLiteralTruncation, CompatibleDeclarationsFromDistinctModules, - NullActualForDefaultIntentAllocatable) + NullActualForDefaultIntentAllocatable, UseAssociationIntoSameNameSubprogram) using LanguageFeatures = EnumSet; using UsageWarnings = EnumSet; diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 11c0ecc9e8410..50ca58ea01429 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -719,6 +719,7 @@ class ScopeHandler : public ImplicitRulesVisitor { void NotePossibleBadForwardRef(const parser::Name &); std::optional HadForwardRef(const Symbol &) const; bool CheckPossibleBadForwardRef(const Symbol &); + bool ConvertToUseError(Symbol &, const SourceName &, const Symbol &used); bool inSpecificationPart_{false}; bool deferImplicitTyping_{false}; @@ -3335,7 +3336,7 @@ ModuleVisitor::SymbolRename ModuleVisitor::AddUse( // symbol must be either a Use or a Generic formed by merging two uses. // Convert it to a UseError with this additional location. -static bool ConvertToUseError( +bool ScopeHandler::ConvertToUseError( Symbol &symbol, const SourceName &location, const Symbol &used) { if (auto *ued{symbol.detailsIf()}) { ued->add_occurrence(location, used); @@ -3353,9 +3354,25 @@ static bool ConvertToUseError( symbol.set_details( UseErrorDetails{*useDetails}.add_occurrence(location, used)); return true; - } else { - return false; } + if (const auto *hostAssocDetails{symbol.detailsIf()}; + hostAssocDetails && hostAssocDetails->symbol().has() && + &symbol.owner() == &currScope() && + &hostAssocDetails->symbol() == currScope().symbol()) { + // Handle USE-association of procedure FOO into function/subroutine FOO, + // replacing its place-holding HostAssocDetails symbol. 
+ context().Warn(common::UsageWarning::UseAssociationIntoSameNameSubprogram, + location, + "'%s' is use-associated into a subprogram of the same name"_port_en_US, + used.name()); + SourceName created{context().GetTempName(currScope())}; + Symbol &tmpUse{MakeSymbol(created, Attrs(), UseDetails{location, used})}; + UseErrorDetails useError{tmpUse.get()}; + useError.add_occurrence(location, hostAssocDetails->symbol()); + symbol.set_details(std::move(useError)); + return true; + } + return false; } // Two ultimate symbols are distinct, but they have the same name and come diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 6867777bbcdc0..08d260555f37e 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -1733,16 +1733,20 @@ bool HadUseError( at, "Reference to '%s' is ambiguous"_err_en_US, symbol->name())}; for (const auto &[location, sym] : details->occurrences()) { const Symbol &ultimate{sym->GetUltimate()}; - auto &attachment{ - msg.Attach(location, "'%s' was use-associated from module '%s'"_en_US, - at, sym->owner().GetName().value())}; - if (&*sym != &ultimate) { - // For incompatible definitions where one comes from a hermetic - // module file's incorporated dependences and the other from another - // module of the same name. - attachment.Attach(ultimate.name(), - "ultimately from '%s' in module '%s'"_en_US, ultimate.name(), - ultimate.owner().GetName().value()); + if (sym->owner().IsModule()) { + auto &attachment{msg.Attach(location, + "'%s' was use-associated from module '%s'"_en_US, at, + sym->owner().GetName().value())}; + if (&*sym != &ultimate) { + // For incompatible definitions where one comes from a hermetic + // module file's incorporated dependences and the other from another + // module of the same name. 
+ attachment.Attach(ultimate.name(), + "ultimately from '%s' in module '%s'"_en_US, ultimate.name(), + ultimate.owner().GetName().value()); + } + } else { + msg.Attach(sym->name(), "declared here"_en_US); } } context.SetError(*symbol); diff --git a/flang/lib/Support/Fortran-features.cpp b/flang/lib/Support/Fortran-features.cpp index 4bc92f3924ef6..4f1af27231301 100644 --- a/flang/lib/Support/Fortran-features.cpp +++ b/flang/lib/Support/Fortran-features.cpp @@ -85,6 +85,7 @@ LanguageFeatureControl::LanguageFeatureControl() { warnUsage_.set(UsageWarning::UselessIomsg); warnUsage_.set(UsageWarning::UnsignedLiteralTruncation); warnUsage_.set(UsageWarning::NullActualForDefaultIntentAllocatable); + warnUsage_.set(UsageWarning::UseAssociationIntoSameNameSubprogram); // New warnings, on by default warnLanguage_.set(LanguageFeature::SavedLocalInSpecExpr); warnLanguage_.set(LanguageFeature::NullActualForAllocatable); diff --git a/flang/test/Semantics/resolve18.f90 b/flang/test/Semantics/resolve18.f90 index 467fceb58657e..fef526908bbf9 100644 --- a/flang/test/Semantics/resolve18.f90 +++ b/flang/test/Semantics/resolve18.f90 @@ -22,13 +22,17 @@ subroutine s(i) end module subroutine foo - !ERROR: Cannot use-associate 'foo'; it is already declared in this scope + !PORTABILITY: 'foo' is use-associated into a subprogram of the same name use m1 + !ERROR: Reference to 'foo' is ambiguous + call foo end subroutine bar - !ERROR: Cannot use-associate 'bar'; it is already declared in this scope + !PORTABILITY: 'foo' is use-associated into a subprogram of the same name use m1, bar => foo + !ERROR: Reference to 'bar' is ambiguous + call bar end !OK to use-associate a type with the same name as a generic From 262b3f7615b9a4dd660eb39afade73c24777e66a Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 4 Apr 2025 08:42:38 -0700 Subject: [PATCH 0653/1029] [flang] Remove runtime dependence on C++ support for types (#134164) Fortran::runtime::Descriptor::BytesFor() only works for Fortran intrinsic types for which a C++ type counterpart exists, so it crashes on some types that are legitimate Fortran types like REAL(2). Move some logic from Evaluate into a new header in flang/Common, then use it to avoid this needless dependence on C++. 
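
As a rough worked illustration (not part of the patch itself), the sizes
that the shared logic computes for a few kinds:

    real(2)     -> 2 bytes    ! IEEE half precision
    real(3)     -> 2 bytes    ! bfloat16, truncated 32-bit format
    real(10)    -> 16 bytes   ! x87 80-bit, stored in 16-byte containers
    complex(10) -> 32 bytes   ! twice the size of its real kind
    integer(8)  -> 8 bytes    ! non-floating kinds are just the kind value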
--- flang-rt/lib/runtime/descriptor.cpp | 15 ++--- flang/include/flang/Common/Fortran-consts.h | 2 +- flang/include/flang/Common/real.h | 1 - flang/include/flang/Common/type-kinds.h | 63 +++++++++++++++++++++ flang/include/flang/Evaluate/target.h | 3 +- flang/include/flang/Evaluate/type.h | 29 ++-------- flang/lib/Evaluate/target.cpp | 19 ++----- flang/lib/Evaluate/tools.cpp | 3 +- flang/lib/Evaluate/type.cpp | 5 +- flang/lib/Lower/ConvertType.cpp | 9 +-- flang/lib/Semantics/expression.cpp | 4 +- flang/lib/Semantics/type.cpp | 3 +- 12 files changed, 94 insertions(+), 62 deletions(-) create mode 100644 flang/include/flang/Common/type-kinds.h diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp index a1f4b044bddd7..495e25e96aded 100644 --- a/flang-rt/lib/runtime/descriptor.cpp +++ b/flang-rt/lib/runtime/descriptor.cpp @@ -13,8 +13,8 @@ #include "flang-rt/runtime/derived.h" #include "flang-rt/runtime/stat.h" #include "flang-rt/runtime/terminator.h" -#include "flang-rt/runtime/tools.h" #include "flang-rt/runtime/type-info.h" +#include "flang/Common/type-kinds.h" #include #include #include @@ -61,18 +61,11 @@ RT_API_ATTRS void Descriptor::Establish(TypeCode t, std::size_t elementBytes, } } -namespace { -template struct TypeSizeGetter { - constexpr RT_API_ATTRS std::size_t operator()() const { - CppTypeFor arr[2]; - return sizeof arr / 2; - } -}; -} // namespace - RT_API_ATTRS std::size_t Descriptor::BytesFor(TypeCategory category, int kind) { Terminator terminator{__FILE__, __LINE__}; - return ApplyType(category, kind, terminator); + int bytes{common::TypeSizeInBytes(category, kind)}; + RUNTIME_CHECK(terminator, bytes > 0); + return bytes; } RT_API_ATTRS void Descriptor::Establish(TypeCategory c, int kind, void *p, diff --git a/flang/include/flang/Common/Fortran-consts.h b/flang/include/flang/Common/Fortran-consts.h index 3ce5b6ac7b686..74ef1c85d2c86 100644 --- a/flang/include/flang/Common/Fortran-consts.h +++ b/flang/include/flang/Common/Fortran-consts.h @@ -9,7 +9,7 @@ #ifndef FORTRAN_COMMON_FORTRAN_CONSTS_H_ #define FORTRAN_COMMON_FORTRAN_CONSTS_H_ -#include "enum-set.h" +#include "enum-class.h" #include namespace Fortran::common { diff --git a/flang/include/flang/Common/real.h b/flang/include/flang/Common/real.h index b47ba46581db6..785cde3236bf4 100644 --- a/flang/include/flang/Common/real.h +++ b/flang/include/flang/Common/real.h @@ -13,7 +13,6 @@ // The various representations are distinguished by their binary precisions // (number of explicit significand bits and any implicit MSB in the fraction). -#include "api-attrs.h" #include namespace Fortran::common { diff --git a/flang/include/flang/Common/type-kinds.h b/flang/include/flang/Common/type-kinds.h new file mode 100644 index 0000000000000..4e5c4f69fcc67 --- /dev/null +++ b/flang/include/flang/Common/type-kinds.h @@ -0,0 +1,63 @@ +//===-- include/flang/Common/type-kinds.h -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_COMMON_TYPE_KINDS_H_ +#define FORTRAN_COMMON_TYPE_KINDS_H_ + +#include "Fortran-consts.h" +#include "real.h" +#include + +namespace Fortran::common { + +static constexpr int maxKind{16}; + +// A predicate that is true when a kind value is a kind that could possibly +// be supported for an intrinsic type category on some target instruction +// set architecture. +static constexpr bool IsValidKindOfIntrinsicType( + TypeCategory category, std::int64_t kind) { + switch (category) { + case TypeCategory::Integer: + case TypeCategory::Unsigned: + return kind == 1 || kind == 2 || kind == 4 || kind == 8 || kind == 16; + case TypeCategory::Real: + case TypeCategory::Complex: + return kind == 2 || kind == 3 || kind == 4 || kind == 8 || kind == 10 || + kind == 16; + case TypeCategory::Character: + return kind == 1 || kind == 2 || kind == 4; + case TypeCategory::Logical: + return kind == 1 || kind == 2 || kind == 4 || kind == 8; + default: + return false; + } +} + +static constexpr int TypeSizeInBytes(TypeCategory category, std::int64_t kind) { + if (IsValidKindOfIntrinsicType(category, kind)) { + if (category == TypeCategory::Real || category == TypeCategory::Complex) { + int precision{PrecisionOfRealKind(kind)}; + int bits{BitsForBinaryPrecision(precision)}; + if (bits == 80) { // x87 is stored in 16-byte containers + bits = 128; + } + if (category == TypeCategory::Complex) { + bits *= 2; + } + return bits >> 3; + } else { + return kind; + } + } else { + return -1; + } +} + +} // namespace Fortran::common +#endif // FORTRAN_COMMON_TYPE_KINDS_H_ diff --git a/flang/include/flang/Evaluate/target.h b/flang/include/flang/Evaluate/target.h index 7b1593ca270db..cc6172b492b3c 100644 --- a/flang/include/flang/Evaluate/target.h +++ b/flang/include/flang/Evaluate/target.h @@ -15,6 +15,7 @@ #include "flang/Common/enum-class.h" #include "flang/Common/enum-set.h" #include "flang/Common/target-rounding.h" +#include "flang/Common/type-kinds.h" #include "flang/Evaluate/common.h" #include "flang/Support/Fortran.h" #include @@ -131,7 +132,7 @@ class TargetCharacteristics { const IeeeFeatures &ieeeFeatures() const { return ieeeFeatures_; } private: - static constexpr int maxKind{16}; + static constexpr int maxKind{common::maxKind}; std::uint8_t byteSize_[common::TypeCategory_enumSize][maxKind + 1]{}; std::uint8_t align_[common::TypeCategory_enumSize][maxKind + 1]{}; bool isBigEndian_{false}; diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h index cfb162f040e8a..f3bba7790e1a2 100644 --- a/flang/include/flang/Evaluate/type.h +++ b/flang/include/flang/Evaluate/type.h @@ -25,6 +25,7 @@ #include "flang/Common/idioms.h" #include "flang/Common/real.h" #include "flang/Common/template.h" +#include "flang/Common/type-kinds.h" #include "flang/Support/Fortran-features.h" #include "flang/Support/Fortran.h" #include @@ -62,28 +63,6 @@ using LogicalResult = Type; using LargestReal = Type; using Ascii = Type; -// A predicate that is true when a kind value is a kind that could possibly -// be supported for an intrinsic type category on some target instruction -// set architecture. 
-static constexpr bool IsValidKindOfIntrinsicType( - TypeCategory category, std::int64_t kind) { - switch (category) { - case TypeCategory::Integer: - case TypeCategory::Unsigned: - return kind == 1 || kind == 2 || kind == 4 || kind == 8 || kind == 16; - case TypeCategory::Real: - case TypeCategory::Complex: - return kind == 2 || kind == 3 || kind == 4 || kind == 8 || kind == 10 || - kind == 16; - case TypeCategory::Character: - return kind == 1 || kind == 2 || kind == 4; - case TypeCategory::Logical: - return kind == 1 || kind == 2 || kind == 4 || kind == 8; - default: - return false; - } -} - // DynamicType is meant to be suitable for use as the result type for // GetType() functions and member functions; consequently, it must be // capable of being used in a constexpr context. So it does *not* @@ -95,7 +74,7 @@ static constexpr bool IsValidKindOfIntrinsicType( class DynamicType { public: constexpr DynamicType(TypeCategory cat, int k) : category_{cat}, kind_{k} { - CHECK(IsValidKindOfIntrinsicType(category_, kind_)); + CHECK(common::IsValidKindOfIntrinsicType(category_, kind_)); } DynamicType(int charKind, const semantics::ParamValue &len); // When a known length is presented, resolve it to its effective @@ -103,7 +82,7 @@ class DynamicType { constexpr DynamicType(int k, std::int64_t len) : category_{TypeCategory::Character}, kind_{k}, knownLength_{ len >= 0 ? len : 0} { - CHECK(IsValidKindOfIntrinsicType(category_, kind_)); + CHECK(common::IsValidKindOfIntrinsicType(category_, kind_)); } explicit constexpr DynamicType( const semantics::DerivedTypeSpec &dt, bool poly = false) @@ -360,7 +339,7 @@ using IndirectSubscriptIntegerExpr = // category that could possibly be supported on any target. template using CategoryKindTuple = - std::conditional_t>, std::tuple<>>; template diff --git a/flang/lib/Evaluate/target.cpp b/flang/lib/Evaluate/target.cpp index ba768f38c0ba4..c443278148304 100644 --- a/flang/lib/Evaluate/target.cpp +++ b/flang/lib/Evaluate/target.cpp @@ -8,6 +8,7 @@ #include "flang/Evaluate/target.h" #include "flang/Common/template.h" +#include "flang/Common/type-kinds.h" #include "flang/Evaluate/common.h" #include "flang/Evaluate/type.h" @@ -19,21 +20,11 @@ TargetCharacteristics::TargetCharacteristics() { auto enableCategoryKinds{[this](TypeCategory category) { for (int kind{1}; kind <= maxKind; ++kind) { if (CanSupportType(category, kind)) { - auto byteSize{static_cast(kind)}; - if (category == TypeCategory::Real || - category == TypeCategory::Complex) { - if (kind == 3) { - // non-IEEE 16-bit format (truncated 32-bit) - byteSize = 2; - } else if (kind == 10) { - // x87 floating-point - // Follow gcc precedent for "long double" - byteSize = 16; - } - } + auto byteSize{ + static_cast(common::TypeSizeInBytes(category, kind))}; std::size_t align{byteSize}; if (category == TypeCategory::Complex) { - byteSize = 2 * byteSize; + align /= 2; } EnableType(category, kind, byteSize, align); } @@ -53,7 +44,7 @@ TargetCharacteristics::TargetCharacteristics() { bool TargetCharacteristics::CanSupportType( TypeCategory category, std::int64_t kind) { - return IsValidKindOfIntrinsicType(category, kind); + return common::IsValidKindOfIntrinsicType(category, kind); } bool TargetCharacteristics::EnableType(common::TypeCategory category, diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index fcd6860917247..702711e3cff53 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -8,6 +8,7 @@ #include "flang/Evaluate/tools.h" #include "flang/Common/idioms.h" 
+#include "flang/Common/type-kinds.h" #include "flang/Evaluate/characteristics.h" #include "flang/Evaluate/traverse.h" #include "flang/Parser/message.h" @@ -1349,7 +1350,7 @@ template static std::optional> DataConstantConversionHelper( FoldingContext &context, const DynamicType &toType, const Expr &expr) { - if (!IsValidKindOfIntrinsicType(FROM, toType.kind())) { + if (!common::IsValidKindOfIntrinsicType(FROM, toType.kind())) { return std::nullopt; } DynamicType sizedType{FROM, toType.kind()}; diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp index c8f75f91ed9c6..5b5f3c2cd0cf0 100644 --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -8,6 +8,7 @@ #include "flang/Evaluate/type.h" #include "flang/Common/idioms.h" +#include "flang/Common/type-kinds.h" #include "flang/Evaluate/expression.h" #include "flang/Evaluate/fold.h" #include "flang/Evaluate/target.h" @@ -118,7 +119,7 @@ namespace Fortran::evaluate { DynamicType::DynamicType(int k, const semantics::ParamValue &pv) : category_{TypeCategory::Character}, kind_{k} { - CHECK(IsValidKindOfIntrinsicType(category_, kind_)); + CHECK(common::IsValidKindOfIntrinsicType(category_, kind_)); if (auto n{ToInt64(pv.GetExplicit())}) { knownLength_ = *n > 0 ? *n : 0; } else { @@ -660,7 +661,7 @@ std::optional DynamicType::From( if (const auto *intrinsic{type.AsIntrinsic()}) { if (auto kind{ToInt64(intrinsic->kind())}) { TypeCategory category{intrinsic->category()}; - if (IsValidKindOfIntrinsicType(category, *kind)) { + if (common::IsValidKindOfIntrinsicType(category, *kind)) { if (category == TypeCategory::Character) { const auto &charType{type.characterTypeSpec()}; return DynamicType{static_cast(*kind), charType.length()}; diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp index 2fab520e6c475..d45f9e7c0bf1b 100644 --- a/flang/lib/Lower/ConvertType.cpp +++ b/flang/lib/Lower/ConvertType.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "flang/Lower/ConvertType.h" +#include "flang/Common/type-kinds.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/CallInterface.h" #include "flang/Lower/ConvertVariable.h" @@ -32,7 +33,7 @@ using Fortran::common::VectorElementCategory; //===--------------------------------------------------------------------===// static mlir::Type genRealType(mlir::MLIRContext *context, int kind) { - if (Fortran::evaluate::IsValidKindOfIntrinsicType( + if (Fortran::common::IsValidKindOfIntrinsicType( Fortran::common::TypeCategory::Real, kind)) { switch (kind) { case 2: @@ -59,7 +60,7 @@ int getIntegerBits() { } static mlir::Type genIntegerType(mlir::MLIRContext *context, int kind, bool isUnsigned = false) { - if (Fortran::evaluate::IsValidKindOfIntrinsicType( + if (Fortran::common::IsValidKindOfIntrinsicType( Fortran::common::TypeCategory::Integer, kind)) { mlir::IntegerType::SignednessSemantics signedness = (isUnsigned ? 
mlir::IntegerType::SignednessSemantics::Unsigned @@ -82,7 +83,7 @@ static mlir::Type genIntegerType(mlir::MLIRContext *context, int kind, } static mlir::Type genLogicalType(mlir::MLIRContext *context, int KIND) { - if (Fortran::evaluate::IsValidKindOfIntrinsicType( + if (Fortran::common::IsValidKindOfIntrinsicType( Fortran::common::TypeCategory::Logical, KIND)) return fir::LogicalType::get(context, KIND); return {}; @@ -91,7 +92,7 @@ static mlir::Type genLogicalType(mlir::MLIRContext *context, int KIND) { static mlir::Type genCharacterType( mlir::MLIRContext *context, int KIND, Fortran::lower::LenParameterTy len = fir::CharacterType::unknownLen()) { - if (Fortran::evaluate::IsValidKindOfIntrinsicType( + if (Fortran::common::IsValidKindOfIntrinsicType( Fortran::common::TypeCategory::Character, KIND)) return fir::CharacterType::get(context, KIND, len); return {}; diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index f2b9702d7c5a0..e139bda7e4950 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -12,6 +12,7 @@ #include "resolve-names-utils.h" #include "resolve-names.h" #include "flang/Common/idioms.h" +#include "flang/Common/type-kinds.h" #include "flang/Evaluate/common.h" #include "flang/Evaluate/fold.h" #include "flang/Evaluate/tools.h" @@ -1058,7 +1059,8 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::Name &n) { if (const semantics::IntrinsicTypeSpec * intrinType{typeSpec->AsIntrinsic()}) { if (auto k{ToInt64(Fold(semantics::KindExpr{intrinType->kind()}))}; - k && IsValidKindOfIntrinsicType(TypeCategory::Integer, *k)) { + k && + common::IsValidKindOfIntrinsicType(TypeCategory::Integer, *k)) { kind = *k; } } diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp index c5a75c4d619c5..964a37e1c822b 100644 --- a/flang/lib/Semantics/type.cpp +++ b/flang/lib/Semantics/type.cpp @@ -9,6 +9,7 @@ #include "flang/Semantics/type.h" #include "check-declarations.h" #include "compute-offsets.h" +#include "flang/Common/type-kinds.h" #include "flang/Evaluate/fold.h" #include "flang/Evaluate/tools.h" #include "flang/Evaluate/type.h" @@ -125,7 +126,7 @@ void DerivedTypeSpec::EvaluateParameters(SemanticsContext &context) { auto restorer{foldingContext.WithPDTInstance(*this)}; auto folded{Fold(foldingContext, KindExpr{intrinType->kind()})}; if (auto k{evaluate::ToInt64(folded)}; k && - evaluate::IsValidKindOfIntrinsicType(TypeCategory::Integer, *k)) { + common::IsValidKindOfIntrinsicType(TypeCategory::Integer, *k)) { parameterKind = static_cast(*k); } else { messages.Say( From ade9d1f8101256398a2b664eba5aac18c6a3bfce Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 4 Apr 2025 08:43:02 -0700 Subject: [PATCH 0654/1029] [flang][runtime] Remove bad runtime assertion (#134176) The RUNTIME_CHECK in question doesn't allow for the possibility that an allocatable or pointer component could be processed by defined I/O. Remove it in favor of a dynamic allocation check. 
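
A hedged sketch of the kind of case the old assertion could trip on during
defined I/O (type and component names here are arbitrary):

    type t
      real, allocatable :: a(:)  ! descriptor component, genre not Automatic
    end type
    type(t) :: x
    write(*, *) x  ! component I/O must now tolerate an unallocated 'a'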
---
 flang-rt/lib/runtime/descriptor-io.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h
index dd399164325cb..eb60f106c9203 100644
--- a/flang-rt/lib/runtime/descriptor-io.h
+++ b/flang-rt/lib/runtime/descriptor-io.h
@@ -263,10 +263,8 @@ static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io,
       // Component is itself a descriptor
       char *pointer{
           origDescriptor.Element<char>(origSubscripts) + component.offset()};
-      RUNTIME_CHECK(
-          terminator, component.genre() == typeInfo::Component::Genre::Automatic);
       const Descriptor &compDesc{*reinterpret_cast<const Descriptor *>(pointer)};
-      return DescriptorIO<DIR>
(io, compDesc, table);
+      return compDesc.IsAllocated() && DescriptorIO<DIR>(io, compDesc, table);
     }
 #else
     terminator.Crash("not yet implemented: component IO");

From efd7caac2e60209fd9358a24f038c91afe6a4a0a Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Fri, 4 Apr 2025 08:43:25 -0700
Subject: [PATCH 0655/1029] [flang] IEEE_SUPPORT_FLAG(..., LOCAL) in
 specification expression (#134270)

The optional second argument to IEEE_SUPPORT_FLAG (and related functions
from the intrinsic IEEE_ARITHMETIC module) is needed only for its type, not
its value. Restrictions on local objects as arguments to function references
in specification expressions shouldn't apply to it. Define a new attribute
for dummy data object characteristics to distinguish such arguments, set it
for the appropriate intrinsic function references, and test it during
specification expression validation.
---
 .../include/flang/Evaluate/characteristics.h  |  2 +-
 flang/lib/Evaluate/check-expression.cpp       | 91 +++++++++++-------
 flang/lib/Evaluate/intrinsics.cpp             | 92 +++++++++++++------
 flang/test/Evaluate/errors01.f90              |  8 ++
 4 files changed, 135 insertions(+), 58 deletions(-)

diff --git a/flang/include/flang/Evaluate/characteristics.h b/flang/include/flang/Evaluate/characteristics.h
index 2fecb44fc0082..6d29b57889681 100644
--- a/flang/include/flang/Evaluate/characteristics.h
+++ b/flang/include/flang/Evaluate/characteristics.h
@@ -220,7 +220,7 @@ class TypeAndShape {
 // 15.3.2.2
 struct DummyDataObject {
   ENUM_CLASS(Attr, Optional, Allocatable, Asynchronous, Contiguous, Value,
-      Volatile, Pointer, Target, DeducedFromActual)
+      Volatile, Pointer, Target, DeducedFromActual, OnlyIntrinsicInquiry)
   using Attrs = common::EnumSet<Attr, Attr_enumSize>;
   static bool IdenticalSignificantAttrs(const Attrs &x, const Attrs &y) {
     return (x - Attr::DeducedFromActual) == (y - Attr::DeducedFromActual);
   }
diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp
index 3d338b04e64bb..4d272795ff9bd 100644
--- a/flang/lib/Evaluate/check-expression.cpp
+++ b/flang/lib/Evaluate/check-expression.cpp
@@ -133,13 +133,23 @@ bool IsConstantExprHelper<INVARIANT>::operator()(
     auto shape{GetShape(call.arguments()[0]->UnwrapExpr())};
     return shape && IsConstantExprShape(*shape);
   } else if (proc.IsPure()) {
+    std::size_t j{0};
     for (const auto &arg : call.arguments()) {
-      if (!arg) {
+      if (const auto *dataDummy{j < proc.dummyArguments.size()
+                  ? std::get_if<characteristics::DummyDataObject>(
+                        &proc.dummyArguments[j].u)
+                  : nullptr};
+          dataDummy &&
+          dataDummy->attrs.test(
+              characteristics::DummyDataObject::Attr::OnlyIntrinsicInquiry)) {
+        // The value of the argument doesn't matter
+      } else if (!arg) {
        return false;
      } else if (const auto *expr{arg->UnwrapExpr()};
          !expr || !(*this)(*expr)) {
        return false;
      }
+      ++j;
     }
     return true;
   }
@@ -647,7 +657,6 @@ class CheckSpecificationExprHelper
   }
 
   Result operator()(const ProcedureRef &x) const {
-    bool inInquiry{false};
     if (const auto *symbol{x.proc().GetSymbol()}) {
       const Symbol &ultimate{symbol->GetUltimate()};
       if (!semantics::IsPureProcedure(ultimate)) {
@@ -679,10 +688,12 @@ class CheckSpecificationExprHelper
       }
       // References to internal functions are caught in expression semantics.
       // TODO: other checks for standard module procedures
+      auto restorer{common::ScopedSet(inInquiry_, false)};
+      return (*this)(x.arguments());
     } else { // intrinsic
       const SpecificIntrinsic &intrin{DEREF(x.proc().GetSpecificIntrinsic())};
-      inInquiry = context_.intrinsics().GetIntrinsicClass(intrin.name) ==
-          IntrinsicClass::inquiryFunction;
+      bool inInquiry{context_.intrinsics().GetIntrinsicClass(intrin.name) ==
+          IntrinsicClass::inquiryFunction};
       if (scope_.IsDerivedType()) { // C750, C754
         if ((context_.intrinsics().IsIntrinsic(intrin.name) &&
                 badIntrinsicsForComponents_.find(intrin.name) !=
@@ -709,37 +720,55 @@ class CheckSpecificationExprHelper
       if (intrin.name == "present") {
         return std::nullopt; // always ok
       }
-      // Catch CHARACTER(:), ALLOCATABLE :: X; CHARACTER(LEN(X)) :: Y
-      if (inInquiry && x.arguments().size() >= 1) {
-        if (const auto &arg{x.arguments().at(0)}) {
-          if (auto dataRef{ExtractDataRef(*arg, true, true)}) {
-            if (intrin.name == "allocated" || intrin.name == "associated" ||
-                intrin.name == "is_contiguous") { // ok
-            } else if (intrin.name == "len" &&
-                IsPermissibleInquiry(dataRef->GetFirstSymbol(),
-                    dataRef->GetLastSymbol(),
-                    DescriptorInquiry::Field::Len)) { // ok
-            } else if (intrin.name == "lbound" &&
-                IsPermissibleInquiry(dataRef->GetFirstSymbol(),
-                    dataRef->GetLastSymbol(),
-                    DescriptorInquiry::Field::LowerBound)) { // ok
-            } else if ((intrin.name == "shape" || intrin.name == "size" ||
-                           intrin.name == "sizeof" ||
-                           intrin.name == "storage_size" ||
-                           intrin.name == "ubound") &&
-                IsPermissibleInquiry(dataRef->GetFirstSymbol(),
-                    dataRef->GetLastSymbol(),
-                    DescriptorInquiry::Field::Extent)) { // ok
-            } else {
-              return "non-constant inquiry function '"s + intrin.name +
-                  "' not allowed for local object";
+      const auto &proc{intrin.characteristics.value()};
+      std::size_t j{0};
+      for (const auto &arg : x.arguments()) {
+        bool checkArg{true};
+        if (const auto *dataDummy{j < proc.dummyArguments.size()
+                    ? std::get_if<characteristics::DummyDataObject>(
+                          &proc.dummyArguments[j].u)
+                    : nullptr}) {
+          if (dataDummy->attrs.test(characteristics::DummyDataObject::Attr::
+                      OnlyIntrinsicInquiry)) {
+            checkArg = false; // value unused, e.g. IEEE_SUPPORT_FLAG(,,,
X) + } + } + if (arg && checkArg) { + // Catch CHARACTER(:), ALLOCATABLE :: X; CHARACTER(LEN(X)) :: Y + if (inInquiry) { + if (auto dataRef{ExtractDataRef(*arg, true, true)}) { + if (intrin.name == "allocated" || intrin.name == "associated" || + intrin.name == "is_contiguous") { // ok + } else if (intrin.name == "len" && + IsPermissibleInquiry(dataRef->GetFirstSymbol(), + dataRef->GetLastSymbol(), + DescriptorInquiry::Field::Len)) { // ok + } else if (intrin.name == "lbound" && + IsPermissibleInquiry(dataRef->GetFirstSymbol(), + dataRef->GetLastSymbol(), + DescriptorInquiry::Field::LowerBound)) { // ok + } else if ((intrin.name == "shape" || intrin.name == "size" || + intrin.name == "sizeof" || + intrin.name == "storage_size" || + intrin.name == "ubound") && + IsPermissibleInquiry(dataRef->GetFirstSymbol(), + dataRef->GetLastSymbol(), + DescriptorInquiry::Field::Extent)) { // ok + } else { + return "non-constant inquiry function '"s + intrin.name + + "' not allowed for local object"; + } } } + auto restorer{common::ScopedSet(inInquiry_, inInquiry)}; + if (auto err{(*this)(*arg)}) { + return err; + } } + ++j; } + return std::nullopt; } - auto restorer{common::ScopedSet(inInquiry_, inInquiry)}; - return (*this)(x.arguments()); } private: diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index ed90b4bc097dd..997a745466dea 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -256,7 +256,8 @@ ENUM_CLASS(ArgFlag, none, defaultsToSameKind, // for MatchingDefaultKIND defaultsToSizeKind, // for SizeDefaultKIND defaultsToDefaultForResult, // for DefaultingKIND - notAssumedSize) + notAssumedSize, + onlyConstantInquiry) // e.g., PRECISION(X) struct IntrinsicDummyArgument { const char *keyword{nullptr}; @@ -398,7 +399,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ DefaultLogical}, {"bit_size", {{"i", SameIntOrUnsigned, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, SameInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"ble", {{"i", AnyIntOrUnsigned, Rank::elementalOrBOZ}, @@ -439,7 +441,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"digits", {{"x", AnyIntUnsignedOrReal, Rank::anyOrAssumedRank, Optionality::required, common::Intent::In, - {ArgFlag::canBeMoldNull}}}, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"dim", {{"x", OperandIntOrReal}, {"y", OperandIntOrReal}}, OperandIntOrReal}, @@ -485,7 +487,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ IntrinsicClass::transformationalFunction}, {"epsilon", {{"x", SameReal, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, SameReal, Rank::scalar, IntrinsicClass::inquiryFunction}, {"erf", {{"x", SameReal}}, SameReal}, {"erfc", {{"x", SameReal}}, SameReal}, @@ -568,7 +571,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"huge", {{"x", SameIntUnsignedOrReal, Rank::anyOrAssumedRank, Optionality::required, common::Intent::In, - {ArgFlag::canBeMoldNull}}}, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, SameIntUnsignedOrReal, Rank::scalar, IntrinsicClass::inquiryFunction}, {"hypot", {{"x", OperandReal}, {"y", OperandReal}}, OperandReal}, {"iachar", {{"c", AnyChar}, 
DefaultingKIND}, KINDInt}, @@ -656,7 +659,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"jzext", {{"i", AnyInt}}, DefaultInt}, {"kind", {{"x", AnyIntrinsic, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::elemental, IntrinsicClass::inquiryFunction}, {"lbound", {{"array", AnyData, Rank::anyOrAssumedRank}, RequiredDIM, @@ -730,7 +734,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ SameCharNoLen}, {"maxexponent", {{"x", AnyReal, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"maxloc", {{"array", AnyRelatable, Rank::array}, RequiredDIM, OptionalMASK, @@ -775,7 +780,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ SameCharNoLen}, {"minexponent", {{"x", AnyReal, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"minloc", {{"array", AnyRelatable, Rank::array}, RequiredDIM, OptionalMASK, @@ -804,7 +810,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"nearest", {{"x", SameReal}, {"s", AnyReal}}, SameReal}, {"new_line", {{"a", SameCharNoLen, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, SameCharNoLen, Rank::scalar, IntrinsicClass::inquiryFunction}, {"nint", {{"a", AnyReal}, DefaultingKIND}, KINDInt}, {"norm2", {{"x", SameReal, Rank::array}, RequiredDIM}, SameReal, @@ -844,21 +851,25 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ SameNumeric, Rank::scalar, IntrinsicClass::transformationalFunction}, {"precision", {{"x", AnyFloating, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"present", {{"a", Addressable, Rank::anyOrAssumedRank}}, DefaultLogical, Rank::scalar, IntrinsicClass::inquiryFunction}, {"radix", {{"x", AnyIntOrReal, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"range", {{"x", AnyNumeric, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"rank", {{"a", AnyData, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, {ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"real", {{"a", SameComplex, Rank::elemental}}, SameReal}, // 16.9.160(4)(ii) @@ -987,7 +998,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ IntrinsicClass::transformationalFunction}, {"tiny", {{"x", SameReal, Rank::anyOrAssumedRank, Optionality::required, - common::Intent::In, 
{ArgFlag::canBeMoldNull}}}, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, SameReal, Rank::scalar, IntrinsicClass::inquiryFunction}, {"trailz", {{"i", AnyInt}}, DefaultInt}, {"transfer", @@ -1044,35 +1056,59 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"__builtin_ieee_next_up", {{"x", SameReal}}, SameReal}, {"__builtin_ieee_real", {{"a", AnyIntOrReal}, DefaultingKIND}, KINDReal}, {"__builtin_ieee_support_datatype", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_denormal", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_divide", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_flag", {{"flag", IeeeFlagType, Rank::scalar}, - {"x", AnyReal, Rank::known, Optionality::optional}}, + {"x", AnyReal, Rank::known, Optionality::optional, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultLogical}, {"__builtin_ieee_support_halting", {{"flag", IeeeFlagType, Rank::scalar}}, DefaultLogical}, {"__builtin_ieee_support_inf", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_io", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_nan", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_rounding", {{"round_value", IeeeRoundType, Rank::scalar}, - {"x", AnyReal, Rank::known, Optionality::optional}}, + {"x", AnyReal, Rank::known, Optionality::optional, + common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, DefaultLogical}, {"__builtin_ieee_support_sqrt", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_standard", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_subnormal", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + {{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_ieee_support_underflow_control", - {{"x", AnyReal, Rank::known, Optionality::optional}}, DefaultLogical}, + 
{{"x", AnyReal, Rank::known, Optionality::optional, common::Intent::In, + {ArgFlag::canBeMoldNull, ArgFlag::onlyConstantInquiry}}}, + DefaultLogical}, {"__builtin_numeric_storage_size", {}, DefaultInt}, }; @@ -2671,6 +2707,10 @@ std::optional IntrinsicInterface::Match( std::get_if( &dc->u)}) { dummyObject->type.set_corank(0); + if (d.flags.test(ArgFlag::onlyConstantInquiry)) { + dummyObject->attrs.set( + characteristics::DummyDataObject::Attr::OnlyIntrinsicInquiry); + } } dummyArgs.emplace_back(std::move(*dc)); if (d.typePattern.kindCode == KindCode::same && !sameDummyArg) { diff --git a/flang/test/Evaluate/errors01.f90 b/flang/test/Evaluate/errors01.f90 index 283c246393dcd..b20922237f240 100644 --- a/flang/test/Evaluate/errors01.f90 +++ b/flang/test/Evaluate/errors01.f90 @@ -167,6 +167,14 @@ subroutine s14(n) !CHECK: error: IBITS() must have POS+LEN (>=33) no greater than 32 print *, ibits(0, 33, n) end + subroutine s15 + use ieee_arithmetic, only: ieee_flag_type, ieee_underflow, ieee_support_flag + type(ieee_flag_type) :: f1 = ieee_underflow, f2 + !CHECK: portability: specification expression refers to local object 'f1' (initialized and saved) + integer ok(merge(kind(1),-1,ieee_support_flag(f1, x))) + !CHECK: error: Invalid specification expression: reference to local entity 'f2' + integer bad(merge(kind(1),-1,ieee_support_flag(f2, x))) + end subroutine warnings use ieee_arithmetic, only: ieee_scalb real, parameter :: ok1 = scale(0.0, 99999) ! 0.0 From 507ce46b6fb0271c82df1352e82d592c0cbf9aaf Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 4 Apr 2025 08:43:56 -0700 Subject: [PATCH 0656/1029] [flang][preprocessor] Directive continuation must skip empty macros (#134149) When a compiler directive continuation line starts with keyword macro names that have empty expansions, skip them. --- flang/lib/Parser/prescan.cpp | 2 +- flang/test/Preprocessing/directive-contin-with-pp.F90 | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 0df1e3e291923..809b728c47ffe 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -1325,7 +1325,7 @@ const char *Prescanner::FreeFormContinuationLine(bool ampersand) { if (p >= limit_) { return nullptr; } - p = SkipWhiteSpace(p); + p = SkipWhiteSpaceIncludingEmptyMacros(p); if (*p == '!') { ++p; if (InCompilerDirective()) { diff --git a/flang/test/Preprocessing/directive-contin-with-pp.F90 b/flang/test/Preprocessing/directive-contin-with-pp.F90 index 6e84c2bde52f9..f1cfafe0bdf14 100644 --- a/flang/test/Preprocessing/directive-contin-with-pp.F90 +++ b/flang/test/Preprocessing/directive-contin-with-pp.F90 @@ -8,6 +8,7 @@ #define COMMENT ! 
#define OMP_START !$omp #define OMP_CONT !$omp& +#define EMPTY module m contains @@ -50,6 +51,11 @@ subroutine s1(x1, x2, x3, x4, x5, x6, x7) OMP_CONT reduction(+:x) do j3 = 1, n end do + +EMPTY !$omp parallel & +EMPTY !$omp do + do j4 = 1, n + end do end COMMENT & @@ -79,6 +85,9 @@ subroutine s3 !CHECK: !$OMP PARALLEL DO REDUCTION(+: x) !CHECK: DO j3=1_4,n !CHECK: END DO +!CHECK: !$OMP PARALLEL DO +!CHECK: DO j4=1_4,n +!CHECK: END DO !CHECK: END SUBROUTINE !CHECK: SUBROUTINE s2 !CHECK: END SUBROUTINE From 1bef59c9db07840609c919fa95127decbfc3f55d Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 4 Apr 2025 08:44:22 -0700 Subject: [PATCH 0657/1029] [flang][preprocessor] Further macro replacement of continued identifiers (#134302) The preprocessor can perform macro replacement within identifiers when they are split up with Fortran line continuation, but is failing to do macro replacement on a continued identifier when none of its parts are replaced. --- flang/lib/Parser/prescan.cpp | 57 ++++++++++++++++++------------ flang/test/Preprocessing/pp047.F | 25 +++++++++++++ flang/test/Preprocessing/pp135.F90 | 25 +++++++++++++ 3 files changed, 85 insertions(+), 22 deletions(-) create mode 100644 flang/test/Preprocessing/pp047.F create mode 100644 flang/test/Preprocessing/pp135.F90 diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 809b728c47ffe..755cb18cb8caf 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -749,35 +749,48 @@ bool Prescanner::NextToken(TokenSequence &tokens) { } preventHollerith_ = false; } else if (IsLegalInIdentifier(*at_)) { - int parts{1}; - const char *afterLast{nullptr}; + std::size_t parts{1}; + bool anyDefined{false}; + bool hadContinuation{false}; + // Subtlety: When an identifier is split across continuation lines, + // its parts are kept as distinct pp-tokens if that macro replacement + // should operate on them independently. This trick accommodates the + // historic practice of using line continuation for token pasting after + // replacement. + // In free form, the macro to be replaced must have been preceded + // by '&' and followed by either '&' or, if last, the end of a line. + // call & call foo& call foo& + // &MACRO& OR &MACRO& OR &MACRO + // &foo(...) &(...) do { EmitChar(tokens, *at_); ++at_, ++column_; - afterLast = at_; - if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) { + hadContinuation = SkipToNextSignificantCharacter(); + if (hadContinuation && IsLegalIdentifierStart(*at_)) { + // Continued identifier tokens.CloseToken(); ++parts; + if (!anyDefined && + (parts > 2 || inFixedForm_ || + (start > start_ && start[-1] == '&')) && + preprocessor_.IsNameDefined( + tokens.TokenAt(tokens.SizeInTokens() - 1))) { + anyDefined = true; + } } } while (IsLegalInIdentifier(*at_)); - if (parts >= 3) { - // Subtlety: When an identifier is split across three or more continuation - // lines (or two continuation lines, immediately preceded or followed - // by '&' free form continuation line markers, its parts are kept as - // distinct pp-tokens so that macro replacement operates on them - // independently. This trick accommodates the historic practice of - // using line continuation for token pasting after replacement. 
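    // Condensed sketch of the rule the new code above implements (names taken
    // from the hunk; control flow simplified, not the verbatim NextToken()):
    // the parts of a continued identifier stay separate pp-tokens only while
    // some part is itself a defined macro; otherwise they are fused back into
    // one token so the combined identifier can be macro-replaced as a whole:
    //
    //   if (!anyDefined) {
    //     while (parts-- > 1)
    //       tokens.ReopenLastToken(); // e.g. "F"+"O"+"O" -> "FOO" -> BAR
    //   }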
- } else if (parts == 2) { - if (afterLast && afterLast < limit_) { - afterLast = SkipWhiteSpace(afterLast); - } - if ((start > start_ && start[-1] == '&') || - (afterLast && afterLast < limit_ && - (*afterLast == '&' || *afterLast == '\n'))) { - // call & call foo& call foo& - // &MACRO& OR &MACRO& OR &MACRO - // &foo(...) &(...) - } else { + if (!anyDefined && parts > 1) { + tokens.CloseToken(); + char after{*SkipWhiteSpace(at_)}; + anyDefined = (hadContinuation || after == '\n' || after == '&') && + preprocessor_.IsNameDefined( + tokens.TokenAt(tokens.SizeInTokens() - 1)); + tokens.ReopenLastToken(); + } + if (!anyDefined) { + // If no part was a defined macro, combine the parts into one so that + // the combination itself can be subject to macro replacement. + while (parts-- > 1) { tokens.ReopenLastToken(); } } diff --git a/flang/test/Preprocessing/pp047.F b/flang/test/Preprocessing/pp047.F new file mode 100644 index 0000000000000..1d4f9f848e58a --- /dev/null +++ b/flang/test/Preprocessing/pp047.F @@ -0,0 +1,25 @@ +! RUN: %flang -E %s 2>&1 | FileCheck %s +#define FOO BAR +#define FO BA +#define OO AR +! CHECK: print *,BAR, 1 + print *, + +FOO + +, 1 + print *, +! CHECK: print *,FAR, 2 + +F + +OO + +, 2 +! CHECK: print *,BAO, 3 + print *, + +FO + +O + +, 3 +! CHECK: print *,BAR, 4 + print *, + +F + +O + +O + +, 4 + end diff --git a/flang/test/Preprocessing/pp135.F90 b/flang/test/Preprocessing/pp135.F90 new file mode 100644 index 0000000000000..2905a8cec5d93 --- /dev/null +++ b/flang/test/Preprocessing/pp135.F90 @@ -0,0 +1,25 @@ +! RUN: %flang -E %s 2>&1 | FileCheck %s +#define FOO BAR +#define FO BA +#define OO AR +! CHECK: print *, BAR, 1 +print *, & + &FOO& + &, 1 +! CHECK: print *, FAR, 2 +print *, & + &F& + &OO& + &, 2 +! CHECK: print *, BAO, 3 +print *, & + &FO& + &O& + &, 3 +! CHECK: print *, BAR, 4 +print *, & + &F& + &O& + &O& + &, 4 +end From 4090910a695efcba4b484e9f8ad2b564e9a4e7ed Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 4 Apr 2025 11:48:46 -0400 Subject: [PATCH 0658/1029] [libc++] Guard additional headers with _LIBCPP_HAS_LOCALIZATION (#131921) There were some remaining headers that were not guarded with _LIBCPP_HAS_LOCALIZATION, leading to errors when trying to use modules on platforms that don't support localization (since all the headers get pulled in when building the 'std' module). This patch brings these headers in line with what we do for every other header that depends on localization. This patch also requires including from <__configuration/platform.h> in order to define _NEWLIB_VERSION. In the long term, we should use a better approach for doing that, such as defining a macro in the __config_site header. --- libcxx/include/__configuration/platform.h | 7 + libcxx/include/__locale | 177 ++++++----- libcxx/include/__locale_dir/locale_base_api.h | 116 +++---- libcxx/include/fstream | 55 ++-- libcxx/include/regex | 298 +++++++++--------- libcxx/include/strstream | 55 ++-- .../configs/armv7m-picolibc-libc++.cfg.in | 4 - .../test/libcxx/system_reserved_names.gen.py | 6 + 8 files changed, 376 insertions(+), 342 deletions(-) diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h index 8d0f8f63f5213..f3c199dee172b 100644 --- a/libcxx/include/__configuration/platform.h +++ b/libcxx/include/__configuration/platform.h @@ -42,6 +42,13 @@ # endif #endif +// This is required in order for _NEWLIB_VERSION to be defined in places where we use it. 
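    // For orientation, the guard shape this patch applies to the headers
    // below (<__locale>, the locale base API, <fstream>, <regex>,
    // <strstream>), reduced to a skeleton; a sketch of the pattern, not any
    // one header verbatim:
    //
    //   #include <__config>
    //
    //   #if _LIBCPP_HAS_LOCALIZATION
    //   #  include <__locale>
    //   // ... declarations that depend on locale support ...
    //   #endif // _LIBCPP_HAS_LOCALIZATION
    //
    // so that platforms without localization (where building the 'std'
    // module pulls in every header) never see these declarations.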
+// TODO: We shouldn't be including arbitrarily-named headers from libc++ since this can break valid +// user code. Move code paths that need _NEWLIB_VERSION to another customization mechanism. +#if __has_include() +# include +#endif + #ifndef __BYTE_ORDER__ # error \ "Your compiler doesn't seem to define __BYTE_ORDER__, which is required by libc++ to know the endianness of your target platform" diff --git a/libcxx/include/__locale b/libcxx/include/__locale index 5ae3228989749..47323046fab38 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -11,32 +11,35 @@ #define _LIBCPP___LOCALE #include <__config> -#include <__locale_dir/locale_base_api.h> -#include <__memory/addressof.h> -#include <__memory/shared_count.h> -#include <__mutex/once_flag.h> -#include <__type_traits/make_unsigned.h> -#include <__utility/no_destroy.h> -#include <__utility/private_constructor_tag.h> -#include -#include -#include -#include -#include + +#if _LIBCPP_HAS_LOCALIZATION + +# include <__locale_dir/locale_base_api.h> +# include <__memory/addressof.h> +# include <__memory/shared_count.h> +# include <__mutex/once_flag.h> +# include <__type_traits/make_unsigned.h> +# include <__utility/no_destroy.h> +# include <__utility/private_constructor_tag.h> +# include +# include +# include +# include +# include // Some platforms require more includes than others. Keep the includes on all plaforms for now. -#include -#include +# include +# include -#if _LIBCPP_HAS_WIDE_CHARACTERS -# include -#else -# include <__std_mbstate_t.h> -#endif +# if _LIBCPP_HAS_WIDE_CHARACTERS +# include +# else +# include <__std_mbstate_t.h> +# endif -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_BEGIN_NAMESPACE_STD @@ -86,9 +89,9 @@ public: // locale operations: string name() const; bool operator==(const locale&) const; -#if _LIBCPP_STD_VER <= 17 +# if _LIBCPP_STD_VER <= 17 _LIBCPP_HIDE_FROM_ABI bool operator!=(const locale& __y) const { return !(*this == __y); } -#endif +# endif template _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool operator()(const basic_string<_CharT, _Traits, _Allocator>&, const basic_string<_CharT, _Traits, _Allocator>&) const; @@ -238,9 +241,9 @@ long collate<_CharT>::do_hash(const char_type* __lo, const char_type* __hi) cons } extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate; -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate; -#endif +# endif // template class collate_byname; @@ -265,7 +268,7 @@ protected: string_type do_transform(const char_type* __lo, const char_type* __hi) const override; }; -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template <> class _LIBCPP_EXPORTED_FROM_ABI collate_byname : public collate { __locale::__locale_t __l_; @@ -284,7 +287,7 @@ protected: const char_type* __lo1, const char_type* __hi1, const char_type* __lo2, const char_type* __hi2) const override; string_type do_transform(const char_type* __lo, const char_type* __hi) const override; }; -#endif +# endif template bool locale::operator()(const basic_string<_CharT, _Traits, _Allocator>& __x, @@ -297,7 +300,7 @@ bool locale::operator()(const basic_string<_CharT, _Traits, _Allocator>& __x, class _LIBCPP_EXPORTED_FROM_ABI ctype_base { public: -#if defined(_LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE) +# if defined(_LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE) typedef unsigned long mask; 
static const mask space = 1 << 0; static const mask print = 1 << 1; @@ -309,14 +312,14 @@ public: static const mask punct = 1 << 7; static const mask xdigit = 1 << 8; static const mask blank = 1 << 9; -# if defined(__BIONIC__) +# if defined(__BIONIC__) // Historically this was a part of regex_traits rather than ctype_base. The // historical value of the constant is preserved for ABI compatibility. static const mask __regex_word = 0x8000; -# else +# else static const mask __regex_word = 1 << 10; -# endif // defined(__BIONIC__) -#elif defined(__GLIBC__) +# endif // defined(__BIONIC__) +# elif defined(__GLIBC__) typedef unsigned short mask; static const mask space = _ISspace; static const mask print = _ISprint; @@ -328,12 +331,12 @@ public: static const mask punct = _ISpunct; static const mask xdigit = _ISxdigit; static const mask blank = _ISblank; -# if defined(__mips__) || (BYTE_ORDER == BIG_ENDIAN) +# if defined(__mips__) || (BYTE_ORDER == BIG_ENDIAN) static const mask __regex_word = static_cast(_ISbit(15)); -# else +# else static const mask __regex_word = 0x80; -# endif -#elif defined(_LIBCPP_MSVCRT_LIKE) +# endif +# elif defined(_LIBCPP_MSVCRT_LIKE) typedef unsigned short mask; static const mask space = _SPACE; static const mask print = _BLANK | _PUNCT | _ALPHA | _DIGIT; @@ -346,16 +349,16 @@ public: static const mask xdigit = _HEX; static const mask blank = _BLANK; static const mask __regex_word = 0x4000; // 0x8000 and 0x0100 and 0x00ff are used -# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT -# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA -#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) -# ifdef __APPLE__ +# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT +# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA +# elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) +# ifdef __APPLE__ typedef uint32_t mask; -# elif defined(__FreeBSD__) +# elif defined(__FreeBSD__) typedef unsigned long mask; -# elif defined(__NetBSD__) +# elif defined(__NetBSD__) typedef unsigned short mask; -# endif +# endif static const mask space = _CTYPE_S; static const mask print = _CTYPE_R; static const mask cntrl = _CTYPE_C; @@ -366,16 +369,16 @@ public: static const mask punct = _CTYPE_P; static const mask xdigit = _CTYPE_X; -# if defined(__NetBSD__) +# if defined(__NetBSD__) static const mask blank = _CTYPE_BL; // NetBSD defines classes up to 0x2000 // see sys/ctype_bits.h, _CTYPE_Q static const mask __regex_word = 0x8000; -# else +# else static const mask blank = _CTYPE_B; static const mask __regex_word = 0x80; -# endif -#elif defined(_AIX) +# endif +# elif defined(_AIX) typedef unsigned int mask; static const mask space = _ISSPACE; static const mask print = _ISPRINT; @@ -388,7 +391,7 @@ public: static const mask xdigit = _ISXDIGIT; static const mask blank = _ISBLANK; static const mask __regex_word = 0x8000; -#elif defined(_NEWLIB_VERSION) +# elif defined(_NEWLIB_VERSION) // Same type as Newlib's _ctype_ array in newlib/libc/include/ctype.h. typedef char mask; // In case char is signed, static_cast is needed to avoid warning on @@ -405,11 +408,11 @@ public: static const mask blank = static_cast(_B); // mask is already fully saturated, use a different type in regex_type_traits. 
static const unsigned short __regex_word = 0x100; -# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT -# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA -# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_XDIGIT -#elif defined(__MVS__) -# if defined(__NATIVE_ASCII_F) +# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT +# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA +# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_XDIGIT +# elif defined(__MVS__) +# if defined(__NATIVE_ASCII_F) typedef unsigned int mask; static const mask space = _ISSPACE_A; static const mask print = _ISPRINT_A; @@ -421,7 +424,7 @@ public: static const mask punct = _ISPUNCT_A; static const mask xdigit = _ISXDIGIT_A; static const mask blank = _ISBLANK_A; -# else +# else typedef unsigned short mask; static const mask space = __ISSPACE; static const mask print = __ISPRINT; @@ -433,11 +436,11 @@ public: static const mask punct = __ISPUNCT; static const mask xdigit = __ISXDIGIT; static const mask blank = __ISBLANK; -# endif +# endif static const mask __regex_word = 0x8000; -#else -# error unknown rune table for this platform -- do you mean to define _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE? -#endif +# else +# error unknown rune table for this platform -- do you mean to define _LIBCPP_PROVIDES_DEFAULT_RUNE_TABLE? +# endif static const mask alnum = alpha | digit; static const mask graph = alnum | punct; @@ -451,7 +454,7 @@ public: template class _LIBCPP_TEMPLATE_VIS ctype; -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template <> class _LIBCPP_EXPORTED_FROM_ABI ctype : public locale::facet, public ctype_base { public: @@ -516,7 +519,7 @@ protected: virtual const char_type* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __dest) const; }; -#endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_HAS_WIDE_CHARACTERS inline _LIBCPP_HIDE_FROM_ABI bool __libcpp_isascii(int __c) { return (__c & ~0x7F) == 0; } @@ -581,25 +584,25 @@ public: static locale::id id; -#ifdef _CACHED_RUNES +# ifdef _CACHED_RUNES static const size_t table_size = _CACHED_RUNES; -#else +# else static const size_t table_size = 256; // FIXME: Don't hardcode this. 
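    // Context for the cached-table design here: classification via
    // ctype<char> is a single mask lookup into the table returned by the
    // table() accessor just below, which is why table_size and the table's
    // lifetime are pinned down so carefully. A standard-API usage sketch:
    //
    //   const std::ctype<char>& ct =
    //       std::use_facet<std::ctype<char>>(std::locale::classic());
    //   bool b = ct.is(std::ctype_base::alpha, 'x'); // one table lookup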
-#endif +# endif _LIBCPP_HIDE_FROM_ABI const mask* table() const _NOEXCEPT { return __tab_; } static const mask* classic_table() _NOEXCEPT; -#if defined(__GLIBC__) || defined(__EMSCRIPTEN__) +# if defined(__GLIBC__) || defined(__EMSCRIPTEN__) static const int* __classic_upper_table() _NOEXCEPT; static const int* __classic_lower_table() _NOEXCEPT; -#endif -#if defined(__NetBSD__) +# endif +# if defined(__NetBSD__) static const short* __classic_upper_table() _NOEXCEPT; static const short* __classic_lower_table() _NOEXCEPT; -#endif -#if defined(__MVS__) +# endif +# if defined(__MVS__) static const unsigned short* __classic_upper_table() _NOEXCEPT; static const unsigned short* __classic_lower_table() _NOEXCEPT; -#endif +# endif protected: ~ctype() override; @@ -634,7 +637,7 @@ protected: const char_type* do_tolower(char_type* __low, const char_type* __high) const override; }; -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template <> class _LIBCPP_EXPORTED_FROM_ABI ctype_byname : public ctype { __locale::__locale_t __l_; @@ -659,7 +662,7 @@ protected: const char_type* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __dest) const override; }; -#endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_HAS_WIDE_CHARACTERS template inline _LIBCPP_HIDE_FROM_ABI bool isspace(_CharT __c, const locale& __loc) { @@ -825,7 +828,7 @@ protected: // template <> class codecvt -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template <> class _LIBCPP_EXPORTED_FROM_ABI codecvt : public locale::facet, public codecvt_base { __locale::__locale_t __l_; @@ -904,7 +907,7 @@ protected: virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const; virtual int do_max_length() const _NOEXCEPT; }; -#endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_HAS_WIDE_CHARACTERS // template <> class codecvt // deprecated in C++20 @@ -986,7 +989,7 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -#if _LIBCPP_HAS_CHAR8_T +# if _LIBCPP_HAS_CHAR8_T // template <> class codecvt // C++20 @@ -1067,7 +1070,7 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -#endif +# endif // template <> class codecvt // deprecated in C++20 @@ -1149,7 +1152,7 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -#if _LIBCPP_HAS_CHAR8_T +# if _LIBCPP_HAS_CHAR8_T // template <> class codecvt // C++20 @@ -1230,7 +1233,7 @@ protected: virtual int do_max_length() const _NOEXCEPT; }; -#endif +# endif // template class codecvt_byname @@ -1252,17 +1255,17 @@ codecvt_byname<_InternT, _ExternT, _StateT>::~codecvt_byname() {} _LIBCPP_SUPPRESS_DEPRECATED_POP extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; -#endif +# endif extern template class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // deprecated in C++20 extern template class _LIBCPP_DEPRECATED_IN_CXX20 _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // deprecated in C++20 -#if _LIBCPP_HAS_CHAR8_T +# if _LIBCPP_HAS_CHAR8_T extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // C++20 extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname; // C++20 -#endif +# endif template struct __narrow_to_utf8 { @@ -1442,7 +1445,7 @@ protected: string __grouping_; }; -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template <> 
class _LIBCPP_EXPORTED_FROM_ABI numpunct : public locale::facet { public: @@ -1471,7 +1474,7 @@ protected: char_type __thousands_sep_; string __grouping_; }; -#endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_HAS_WIDE_CHARACTERS // template class numpunct_byname @@ -1494,7 +1497,7 @@ private: void __init(const char*); }; -#if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template <> class _LIBCPP_EXPORTED_FROM_ABI numpunct_byname : public numpunct { public: @@ -1510,8 +1513,10 @@ protected: private: void __init(const char*); }; -#endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_HAS_WIDE_CHARACTERS _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_HAS_LOCALIZATION + #endif // _LIBCPP___LOCALE diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index 92916944227d7..bbc30b1cfe03f 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -111,59 +111,61 @@ // int __sscanf(const char*, __locale_t, const char*, ...); // required by the headers // } -#if defined(__APPLE__) -# include <__locale_dir/support/apple.h> -#elif defined(__FreeBSD__) -# include <__locale_dir/support/freebsd.h> -#elif defined(_LIBCPP_MSVCRT_LIKE) -# include <__locale_dir/support/windows.h> -#elif defined(__Fuchsia__) -# include <__locale_dir/support/fuchsia.h> -#elif defined(__linux__) -# include <__locale_dir/support/linux.h> -#else +#if _LIBCPP_HAS_LOCALIZATION + +# if defined(__APPLE__) +# include <__locale_dir/support/apple.h> +# elif defined(__FreeBSD__) +# include <__locale_dir/support/freebsd.h> +# elif defined(_LIBCPP_MSVCRT_LIKE) +# include <__locale_dir/support/windows.h> +# elif defined(__Fuchsia__) +# include <__locale_dir/support/fuchsia.h> +# elif defined(__linux__) +# include <__locale_dir/support/linux.h> +# else // TODO: This is a temporary definition to bridge between the old way we defined the locale base API // (by providing global non-reserved names) and the new API. As we move individual platforms // towards the new way of defining the locale base API, this should disappear since each platform // will define those directly. 
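    // Condensed from the contract described at the top of this header: every
    // platform support header (or, for now, the fallback below) must provide
    // these std::__locale primitives; signatures abridged from the
    // definitions that follow in this file:
    //
    //   __locale_t __newlocale(int __category_mask, const char* __name,
    //                          __locale_t __loc);
    //   void       __freelocale(__locale_t __loc);
    //   int        __isdigit(int __ch, __locale_t __loc);
    //   int        __toupper(int __ch, __locale_t __loc);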
-# if defined(_AIX) || defined(__MVS__) -# include <__locale_dir/locale_base_api/ibm.h> -# elif defined(__ANDROID__) -# include <__locale_dir/locale_base_api/android.h> -# elif defined(__OpenBSD__) -# include <__locale_dir/locale_base_api/openbsd.h> -# elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC -# include <__locale_dir/locale_base_api/musl.h> -# endif - -# include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h> - -# include <__cstddef/size_t.h> -# include <__utility/forward.h> -# include -# include -# include -# if _LIBCPP_HAS_WIDE_CHARACTERS -# include -# endif +# if defined(_AIX) || defined(__MVS__) +# include <__locale_dir/locale_base_api/ibm.h> +# elif defined(__ANDROID__) +# include <__locale_dir/locale_base_api/android.h> +# elif defined(__OpenBSD__) +# include <__locale_dir/locale_base_api/openbsd.h> +# elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC +# include <__locale_dir/locale_base_api/musl.h> +# endif + +# include <__locale_dir/locale_base_api/bsd_locale_fallbacks.h> + +# include <__cstddef/size_t.h> +# include <__utility/forward.h> +# include +# include +# include +# if _LIBCPP_HAS_WIDE_CHARACTERS +# include +# endif _LIBCPP_BEGIN_NAMESPACE_STD namespace __locale { // // Locale management // -# define _LIBCPP_COLLATE_MASK LC_COLLATE_MASK -# define _LIBCPP_CTYPE_MASK LC_CTYPE_MASK -# define _LIBCPP_MONETARY_MASK LC_MONETARY_MASK -# define _LIBCPP_NUMERIC_MASK LC_NUMERIC_MASK -# define _LIBCPP_TIME_MASK LC_TIME_MASK -# define _LIBCPP_MESSAGES_MASK LC_MESSAGES_MASK -# define _LIBCPP_ALL_MASK LC_ALL_MASK -# define _LIBCPP_LC_ALL LC_ALL +# define _LIBCPP_COLLATE_MASK LC_COLLATE_MASK +# define _LIBCPP_CTYPE_MASK LC_CTYPE_MASK +# define _LIBCPP_MONETARY_MASK LC_MONETARY_MASK +# define _LIBCPP_NUMERIC_MASK LC_NUMERIC_MASK +# define _LIBCPP_TIME_MASK LC_TIME_MASK +# define _LIBCPP_MESSAGES_MASK LC_MESSAGES_MASK +# define _LIBCPP_ALL_MASK LC_ALL_MASK +# define _LIBCPP_LC_ALL LC_ALL using __locale_t _LIBCPP_NODEBUG = locale_t; -# if defined(_LIBCPP_BUILDING_LIBRARY) +# if defined(_LIBCPP_BUILDING_LIBRARY) using __lconv_t _LIBCPP_NODEBUG = lconv; inline _LIBCPP_HIDE_FROM_ABI __locale_t __newlocale(int __category_mask, const char* __name, __locale_t __loc) { @@ -177,7 +179,7 @@ inline _LIBCPP_HIDE_FROM_ABI char* __setlocale(int __category, char const* __loc inline _LIBCPP_HIDE_FROM_ABI void __freelocale(__locale_t __loc) { freelocale(__loc); } inline _LIBCPP_HIDE_FROM_ABI __lconv_t* __localeconv(__locale_t& __loc) { return __libcpp_localeconv_l(__loc); } -# endif // _LIBCPP_BUILDING_LIBRARY +# endif // _LIBCPP_BUILDING_LIBRARY // // Strtonum functions @@ -206,15 +208,15 @@ __strtoull(const char* __nptr, char** __endptr, int __base, __locale_t __loc) { // // Character manipulation functions // -# if defined(_LIBCPP_BUILDING_LIBRARY) +# if defined(_LIBCPP_BUILDING_LIBRARY) inline _LIBCPP_HIDE_FROM_ABI int __islower(int __ch, __locale_t __loc) { return islower_l(__ch, __loc); } inline _LIBCPP_HIDE_FROM_ABI int __isupper(int __ch, __locale_t __loc) { return isupper_l(__ch, __loc); } -# endif +# endif inline _LIBCPP_HIDE_FROM_ABI int __isdigit(int __ch, __locale_t __loc) { return isdigit_l(__ch, __loc); } inline _LIBCPP_HIDE_FROM_ABI int __isxdigit(int __ch, __locale_t __loc) { return isxdigit_l(__ch, __loc); } -# if defined(_LIBCPP_BUILDING_LIBRARY) +# if defined(_LIBCPP_BUILDING_LIBRARY) inline _LIBCPP_HIDE_FROM_ABI int __strcoll(const char* __s1, const char* __s2, __locale_t __loc) { return strcoll_l(__s1, __s2, __loc); } @@ -224,7 +226,7 @@ inline _LIBCPP_HIDE_FROM_ABI size_t 
__strxfrm(char* __dest, const char* __src, s inline _LIBCPP_HIDE_FROM_ABI int __toupper(int __ch, __locale_t __loc) { return toupper_l(__ch, __loc); } inline _LIBCPP_HIDE_FROM_ABI int __tolower(int __ch, __locale_t __loc) { return tolower_l(__ch, __loc); } -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS inline _LIBCPP_HIDE_FROM_ABI int __wcscoll(const wchar_t* __s1, const wchar_t* __s2, __locale_t __loc) { return wcscoll_l(__s1, __s2, __loc); } @@ -246,7 +248,7 @@ inline _LIBCPP_HIDE_FROM_ABI int __iswpunct(wint_t __ch, __locale_t __loc) { ret inline _LIBCPP_HIDE_FROM_ABI int __iswxdigit(wint_t __ch, __locale_t __loc) { return iswxdigit_l(__ch, __loc); } inline _LIBCPP_HIDE_FROM_ABI wint_t __towupper(wint_t __ch, __locale_t __loc) { return towupper_l(__ch, __loc); } inline _LIBCPP_HIDE_FROM_ABI wint_t __towlower(wint_t __ch, __locale_t __loc) { return towlower_l(__ch, __loc); } -# endif +# endif inline _LIBCPP_HIDE_FROM_ABI size_t __strftime(char* __s, size_t __max, const char* __format, const tm* __tm, __locale_t __loc) { @@ -259,7 +261,7 @@ __strftime(char* __s, size_t __max, const char* __format, const tm* __tm, __loca inline _LIBCPP_HIDE_FROM_ABI decltype(__libcpp_mb_cur_max_l(__locale_t())) __mb_len_max(__locale_t __loc) { return __libcpp_mb_cur_max_l(__loc); } -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS inline _LIBCPP_HIDE_FROM_ABI wint_t __btowc(int __ch, __locale_t __loc) { return __libcpp_btowc_l(__ch, __loc); } inline _LIBCPP_HIDE_FROM_ABI int __wctob(wint_t __ch, __locale_t __loc) { return __libcpp_wctob_l(__ch, __loc); } inline _LIBCPP_HIDE_FROM_ABI size_t @@ -287,17 +289,17 @@ inline _LIBCPP_HIDE_FROM_ABI size_t __mbsrtowcs(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, __locale_t __loc) { return __libcpp_mbsrtowcs_l(__dest, __src, __len, __ps, __loc); } -# endif // _LIBCPP_HAS_WIDE_CHARACTERS -# endif // _LIBCPP_BUILDING_LIBRARY +# endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_BUILDING_LIBRARY _LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wgcc-compat") _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wformat-nonliteral") // GCC doesn't support [[gnu::format]] on variadic templates -# ifdef _LIBCPP_COMPILER_CLANG_BASED -# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) _LIBCPP_ATTRIBUTE_FORMAT(__VA_ARGS__) -# else -# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) /* nothing */ -# endif +# ifdef _LIBCPP_COMPILER_CLANG_BASED +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) _LIBCPP_ATTRIBUTE_FORMAT(__VA_ARGS__) +# else +# define _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(...) 
/* nothing */ +# endif template _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __snprintf( @@ -315,11 +317,13 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __s return std::__libcpp_sscanf_l(__s, __loc, __format, std::forward<_Args>(__args)...); } _LIBCPP_DIAGNOSTIC_POP -# undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT +# undef _LIBCPP_VARIADIC_ATTRIBUTE_FORMAT } // namespace __locale _LIBCPP_END_NAMESPACE_STD -#endif // Compatibility definition of locale base APIs +# endif // Compatibility definition of locale base APIs + +#endif // _LIBCPP_HAS_LOCALIZATION #endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_H diff --git a/libcxx/include/fstream b/libcxx/include/fstream index 43e3741897aa1..6c975ec23bf48 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -189,35 +189,36 @@ typedef basic_fstream wfstream; #if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) # include <__cxx03/fstream> #else -# include <__algorithm/max.h> -# include <__assert> # include <__config> -# include <__filesystem/path.h> -# include <__fwd/fstream.h> -# include <__locale> -# include <__memory/addressof.h> -# include <__memory/unique_ptr.h> -# include <__ostream/basic_ostream.h> -# include <__type_traits/enable_if.h> -# include <__type_traits/is_same.h> -# include <__utility/move.h> -# include <__utility/swap.h> -# include <__utility/unreachable.h> -# include -# include -# include -# include -# include - -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif - -_LIBCPP_PUSH_MACROS -# include <__undef_macros> # if _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION +# include <__algorithm/max.h> +# include <__assert> +# include <__filesystem/path.h> +# include <__fwd/fstream.h> +# include <__locale> +# include <__memory/addressof.h> +# include <__memory/unique_ptr.h> +# include <__ostream/basic_ostream.h> +# include <__type_traits/enable_if.h> +# include <__type_traits/is_same.h> +# include <__utility/move.h> +# include <__utility/swap.h> +# include <__utility/unreachable.h> +# include +# include +# include +# include +# include + +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif + +_LIBCPP_PUSH_MACROS +# include <__undef_macros> + _LIBCPP_BEGIN_NAMESPACE_STD # if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_WIN32API) @@ -1570,10 +1571,10 @@ extern template class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS basic_filebuf; _LIBCPP_END_NAMESPACE_STD -# endif // _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION - _LIBCPP_POP_MACROS +# endif // _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION + # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/regex b/libcxx/include/regex index 96229c6a6ee42..067f904d4e699 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -792,49 +792,52 @@ typedef regex_token_iterator wsregex_token_iterator; #if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) # include <__cxx03/regex> #else -# include <__algorithm/find.h> -# include <__algorithm/search.h> -# include <__assert> # include <__config> -# include <__iterator/back_insert_iterator.h> -# include <__iterator/default_sentinel.h> -# include <__iterator/wrap_iter.h> -# include <__locale> -# include <__memory/addressof.h> -# include <__memory/shared_ptr.h> -# include <__memory_resource/polymorphic_allocator.h> -# include <__type_traits/is_swappable.h> -# include <__utility/move.h> -# include 
<__utility/pair.h> -# include <__utility/swap.h> -# include <__verbose_abort> -# include -# include -# include -# include -# include + +# if _LIBCPP_HAS_LOCALIZATION + +# include <__algorithm/find.h> +# include <__algorithm/search.h> +# include <__assert> +# include <__iterator/back_insert_iterator.h> +# include <__iterator/default_sentinel.h> +# include <__iterator/wrap_iter.h> +# include <__locale> +# include <__memory/addressof.h> +# include <__memory/shared_ptr.h> +# include <__memory_resource/polymorphic_allocator.h> +# include <__type_traits/is_swappable.h> +# include <__utility/move.h> +# include <__utility/pair.h> +# include <__utility/swap.h> +# include <__verbose_abort> +# include +# include +# include +# include +# include // standard-mandated includes // [iterator.range] -# include <__iterator/access.h> -# include <__iterator/data.h> -# include <__iterator/empty.h> -# include <__iterator/reverse_access.h> -# include <__iterator/size.h> +# include <__iterator/access.h> +# include <__iterator/data.h> +# include <__iterator/empty.h> +# include <__iterator/reverse_access.h> +# include <__iterator/size.h> // [re.syn] -# include -# include +# include +# include -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__undef_macros> -# define _LIBCPP_REGEX_COMPLEXITY_FACTOR 4096 +# define _LIBCPP_REGEX_COMPLEXITY_FACTOR 4096 _LIBCPP_BEGIN_NAMESPACE_STD @@ -847,11 +850,11 @@ enum syntax_option_type { nosubs = 1 << 1, optimize = 1 << 2, collate = 1 << 3, -# ifdef _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO +# ifdef _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO ECMAScript = 1 << 9, -# else +# else ECMAScript = 0, -# endif +# endif basic = 1 << 4, extended = 1 << 5, awk = 1 << 6, @@ -862,11 +865,11 @@ enum syntax_option_type { }; _LIBCPP_HIDE_FROM_ABI inline _LIBCPP_CONSTEXPR syntax_option_type __get_grammar(syntax_option_type __g) { -# ifdef _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO +# ifdef _LIBCPP_ABI_REGEX_CONSTANTS_NONZERO return static_cast(__g & 0x3F0); -# else +# else return static_cast(__g & 0x1F0); -# endif +# endif } inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR syntax_option_type operator~(syntax_option_type __x) { @@ -988,11 +991,11 @@ public: template [[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_regex_error() { -# if _LIBCPP_HAS_EXCEPTIONS +# if _LIBCPP_HAS_EXCEPTIONS throw regex_error(_Ev); -# else +# else _LIBCPP_VERBOSE_ABORT("regex_error was thrown in -fno-exceptions mode"); -# endif +# endif } template @@ -1001,7 +1004,7 @@ public: typedef _CharT char_type; typedef basic_string string_type; typedef locale locale_type; -# if defined(__BIONIC__) || defined(_NEWLIB_VERSION) +# if defined(__BIONIC__) || defined(_NEWLIB_VERSION) // Originally bionic's ctype_base used its own ctype masks because the // builtin ctype implementation wasn't in libc++ yet. Bionic's ctype mask // was only 8 bits wide and already saturated, so it used a wider type here @@ -1016,9 +1019,9 @@ public: // often used for space constrained environments, so it makes sense not to // duplicate the ctype table. 
typedef uint16_t char_class_type; -# else +# else typedef ctype_base::mask char_class_type; -# endif +# endif static const char_class_type __regex_word = ctype_base::__regex_word; @@ -1058,30 +1061,30 @@ private: template string_type __transform_primary(_ForwardIterator __f, _ForwardIterator __l, char) const; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template string_type __transform_primary(_ForwardIterator __f, _ForwardIterator __l, wchar_t) const; -# endif +# endif template string_type __lookup_collatename(_ForwardIterator __f, _ForwardIterator __l, char) const; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template string_type __lookup_collatename(_ForwardIterator __f, _ForwardIterator __l, wchar_t) const; -# endif +# endif template char_class_type __lookup_classname(_ForwardIterator __f, _ForwardIterator __l, bool __icase, char) const; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template char_class_type __lookup_classname(_ForwardIterator __f, _ForwardIterator __l, bool __icase, wchar_t) const; -# endif +# endif static int __regex_traits_value(unsigned char __ch, int __radix); _LIBCPP_HIDE_FROM_ABI int __regex_traits_value(char __ch, int __radix) const { return __regex_traits_value(static_cast(__ch), __radix); } -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS _LIBCPP_HIDE_FROM_ABI int __regex_traits_value(wchar_t __ch, int __radix) const; -# endif +# endif }; template @@ -1140,7 +1143,7 @@ regex_traits<_CharT>::__transform_primary(_ForwardIterator __f, _ForwardIterator return __d; } -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template template typename regex_traits<_CharT>::string_type @@ -1159,7 +1162,7 @@ regex_traits<_CharT>::__transform_primary(_ForwardIterator __f, _ForwardIterator } return __d; } -# endif +# endif // lookup_collatename is very FreeBSD-specific @@ -1184,7 +1187,7 @@ regex_traits<_CharT>::__lookup_collatename(_ForwardIterator __f, _ForwardIterato return __r; } -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template template typename regex_traits<_CharT>::string_type @@ -1212,7 +1215,7 @@ regex_traits<_CharT>::__lookup_collatename(_ForwardIterator __f, _ForwardIterato } return __r; } -# endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_HAS_WIDE_CHARACTERS // lookup_classname @@ -1227,7 +1230,7 @@ regex_traits<_CharT>::__lookup_classname(_ForwardIterator __f, _ForwardIterator return std::__get_classname(__s.c_str(), __icase); } -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template template typename regex_traits<_CharT>::char_class_type @@ -1243,7 +1246,7 @@ regex_traits<_CharT>::__lookup_classname(_ForwardIterator __f, _ForwardIterator } return __get_classname(__n.c_str(), __icase); } -# endif // _LIBCPP_HAS_WIDE_CHARACTERS +# endif // _LIBCPP_HAS_WIDE_CHARACTERS template bool regex_traits<_CharT>::isctype(char_type __c, char_class_type __m) const { @@ -1254,28 +1257,28 @@ bool regex_traits<_CharT>::isctype(char_type __c, char_class_type __m) const { inline _LIBCPP_HIDE_FROM_ABI bool __is_07(unsigned char __c) { return (__c & 0xF8u) == -# if defined(__MVS__) && !defined(__NATIVE_ASCII_F) +# if defined(__MVS__) && !defined(__NATIVE_ASCII_F) 0xF0; -# else +# else 0x30; -# endif +# endif } inline _LIBCPP_HIDE_FROM_ABI bool __is_89(unsigned char __c) { return (__c & 0xFEu) == -# if defined(__MVS__) && !defined(__NATIVE_ASCII_F) +# if defined(__MVS__) && !defined(__NATIVE_ASCII_F) 0xF8; -# else +# else 0x38; 
-# endif +# endif } inline _LIBCPP_HIDE_FROM_ABI unsigned char __to_lower(unsigned char __c) { -# if defined(__MVS__) && !defined(__NATIVE_ASCII_F) +# if defined(__MVS__) && !defined(__NATIVE_ASCII_F) return __c & 0xBF; -# else +# else return __c | 0x20; -# endif +# endif } template @@ -1294,12 +1297,12 @@ int regex_traits<_CharT>::__regex_traits_value(unsigned char __ch, int __radix) return -1; } -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template inline int regex_traits<_CharT>::__regex_traits_value(wchar_t __ch, int __radix) const { return __regex_traits_value(static_cast(__ct_->narrow(__ch, char_type())), __radix); } -# endif +# endif template class __node; @@ -1942,10 +1945,10 @@ public: template <> _LIBCPP_EXPORTED_FROM_ABI void __match_any_but_newline::__exec(__state&) const; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS template <> _LIBCPP_EXPORTED_FROM_ABI void __match_any_but_newline::__exec(__state&) const; -# endif +# endif // __match_char @@ -2177,7 +2180,8 @@ void __bracket_expression<_CharT, _Traits>::__exec(__state& __s) const { } } if (!__equivalences_.empty()) { - string_type __s2 = __traits_.transform_primary(std::addressof(__ch2.first), std::addressof(__ch2.first) + 2); + string_type __s2 = + __traits_.transform_primary(std::addressof(__ch2.first), std::addressof(__ch2.first) + 2); for (size_t __i = 0; __i < __equivalences_.size(); ++__i) { if (__s2 == __equivalences_[__i]) { __found = true; @@ -2225,7 +2229,8 @@ void __bracket_expression<_CharT, _Traits>::__exec(__state& __s) const { } } if (!__ranges_.empty()) { - string_type __s2 = __collate_ ? __traits_.transform(std::addressof(__ch), std::addressof(__ch) + 1) : string_type(1, __ch); + string_type __s2 = + __collate_ ? __traits_.transform(std::addressof(__ch), std::addressof(__ch) + 1) : string_type(1, __ch); for (size_t __i = 0; __i < __ranges_.size(); ++__i) { if (__ranges_[__i].first <= __s2 && __s2 <= __ranges_[__i].second) { __found = true; @@ -2266,9 +2271,9 @@ template > class _LIBCPP_TEMPLATE_VIS basic_regex; typedef basic_regex regex; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS typedef basic_regex wregex; -# endif +# endif template class _LIBCPP_TEMPLATE_VIS _LIBCPP_PREFERRED_NAME(regex) @@ -2339,21 +2344,21 @@ public: : __flags_(__f), __marked_count_(0), __loop_count_(0), __open_count_(0), __end_(nullptr) { __init(__first, __last); } -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI basic_regex(initializer_list __il, flag_type __f = regex_constants::ECMAScript) : __flags_(__f), __marked_count_(0), __loop_count_(0), __open_count_(0), __end_(nullptr) { __init(__il.begin(), __il.end()); } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG // ~basic_regex() = default; // basic_regex& operator=(const basic_regex&) = default; // basic_regex& operator=(basic_regex&&) = default; _LIBCPP_HIDE_FROM_ABI basic_regex& operator=(const value_type* __p) { return assign(__p); } -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI basic_regex& operator=(initializer_list __il) { return assign(__il); } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG template _LIBCPP_HIDE_FROM_ABI basic_regex& operator=(const basic_string& __p) { return assign(__p); @@ -2361,9 +2366,9 @@ public: // assign: _LIBCPP_HIDE_FROM_ABI basic_regex& assign(const basic_regex& __that) { return *this = __that; } -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI basic_regex& 
assign(basic_regex&& __that) _NOEXCEPT { return *this = std::move(__that); } -# endif +# endif _LIBCPP_HIDE_FROM_ABI basic_regex& assign(const value_type* __p, flag_type __f = regex_constants::ECMAScript) { return assign(__p, __p + __traits_.length(__p), __f); } @@ -2400,14 +2405,14 @@ public: return assign(basic_regex(__first, __last, __f)); } -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI basic_regex& assign(initializer_list __il, flag_type __f = regex_constants::ECMAScript) { return assign(__il.begin(), __il.end(), __f); } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG // const operations: _LIBCPP_HIDE_FROM_ABI unsigned mark_count() const { return __marked_count_; } @@ -2648,11 +2653,11 @@ private: friend class __lookahead; }; -# if _LIBCPP_STD_VER >= 17 +# if _LIBCPP_STD_VER >= 17 template ::value, int> = 0> basic_regex(_ForwardIterator, _ForwardIterator, regex_constants::syntax_option_type = regex_constants::ECMAScript) -> basic_regex::value_type>; -# endif +# endif template const regex_constants::syntax_option_type basic_regex<_CharT, _Traits>::icase; @@ -4185,10 +4190,10 @@ void basic_regex<_CharT, _Traits>::__push_lookahead(const basic_regex& __exp, bo typedef sub_match csub_match; typedef sub_match ssub_match; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS typedef sub_match wcsub_match; typedef sub_match wssub_match; -# endif +# endif template class _LIBCPP_TEMPLATE_VIS _LIBCPP_PREFERRED_NAME(csub_match) @@ -4228,7 +4233,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator==(const sub_match<_BiIter>& __x, cons return __x.compare(__y) == 0; } -# if _LIBCPP_STD_VER >= 20 +# if _LIBCPP_STD_VER >= 20 template using __sub_match_cat _LIBCPP_NODEBUG = compare_three_way_result_t::value_type>>; @@ -4237,7 +4242,7 @@ template _LIBCPP_HIDE_FROM_ABI auto operator<=>(const sub_match<_BiIter>& __x, const sub_match<_BiIter>& __y) { return static_cast<__sub_match_cat<_BiIter>>(__x.compare(__y) <=> 0); } -# else // _LIBCPP_STD_VER >= 20 +# else // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const sub_match<_BiIter>& __x, const sub_match<_BiIter>& __y) { return !(__x == __y); @@ -4304,7 +4309,7 @@ operator<=(const basic_string::value_type, _ST const sub_match<_BiIter>& __y) { return !(__y < __x); } -# endif // _LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI bool @@ -4313,7 +4318,7 @@ operator==(const sub_match<_BiIter>& __x, return __x.compare(typename sub_match<_BiIter>::string_type(__y.data(), __y.size())) == 0; } -# if _LIBCPP_STD_VER >= 20 +# if _LIBCPP_STD_VER >= 20 template _LIBCPP_HIDE_FROM_ABI auto operator<=>(const sub_match<_BiIter>& __x, @@ -4321,7 +4326,7 @@ operator<=>(const sub_match<_BiIter>& __x, return static_cast<__sub_match_cat<_BiIter>>( __x.compare(typename sub_match<_BiIter>::string_type(__y.data(), __y.size())) <=> 0); } -# else // _LIBCPP_STD_VER >= 20 +# else // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const sub_match<_BiIter>& __x, @@ -4392,7 +4397,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(typename iterator_traits<_BiIter>::value_type const* __x, const sub_match<_BiIter>& __y) { return !(__y < __x); } -# endif // _LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI bool @@ -4400,13 +4405,13 @@ operator==(const sub_match<_BiIter>& __x, typename iterator_traits<_BiIter>::val return __x.compare(__y) == 0; } -# if _LIBCPP_STD_VER >= 20 +# if 
_LIBCPP_STD_VER >= 20 template _LIBCPP_HIDE_FROM_ABI auto operator<=>(const sub_match<_BiIter>& __x, typename iterator_traits<_BiIter>::value_type const* __y) { return static_cast<__sub_match_cat<_BiIter>>(__x.compare(__y) <=> 0); } -# else // _LIBCPP_STD_VER >= 20 +# else // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const sub_match<_BiIter>& __x, typename iterator_traits<_BiIter>::value_type const* __y) { @@ -4474,7 +4479,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(typename iterator_traits<_BiIter>::value_type const& __x, const sub_match<_BiIter>& __y) { return !(__y < __x); } -# endif // _LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI bool @@ -4483,14 +4488,14 @@ operator==(const sub_match<_BiIter>& __x, typename iterator_traits<_BiIter>::val return __x.compare(string_type(1, __y)) == 0; } -# if _LIBCPP_STD_VER >= 20 +# if _LIBCPP_STD_VER >= 20 template _LIBCPP_HIDE_FROM_ABI auto operator<=>(const sub_match<_BiIter>& __x, typename iterator_traits<_BiIter>::value_type const& __y) { using string_type = basic_string::value_type>; return static_cast<__sub_match_cat<_BiIter>>(__x.compare(string_type(1, __y)) <=> 0); } -# else // _LIBCPP_STD_VER >= 20 +# else // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const sub_match<_BiIter>& __x, typename iterator_traits<_BiIter>::value_type const& __y) { @@ -4521,7 +4526,7 @@ inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const sub_match<_BiIter>& __x, typename iterator_traits<_BiIter>::value_type const& __y) { return !(__y < __x); } -# endif // _LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 template inline _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _ST>& @@ -4531,10 +4536,10 @@ operator<<(basic_ostream<_CharT, _ST>& __os, const sub_match<_BiIter>& __m) { typedef match_results cmatch; typedef match_results smatch; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS typedef match_results wcmatch; typedef match_results wsmatch; -# endif +# endif template class _LIBCPP_TEMPLATE_VIS _LIBCPP_PREFERRED_NAME(cmatch) _LIBCPP_IF_WIDE_CHARACTERS(_LIBCPP_PREFERRED_NAME(wcmatch)) @@ -4564,12 +4569,12 @@ public: typedef basic_string string_type; // construct/copy/destroy: -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG match_results() : match_results(allocator_type()) {} explicit match_results(const allocator_type& __a); -# else +# else explicit match_results(const allocator_type& __a = allocator_type()); -# endif +# endif // match_results(const match_results&) = default; // match_results& operator=(const match_results&) = default; @@ -4819,13 +4824,13 @@ _LIBCPP_HIDE_FROM_ABI bool operator==(const match_results<_BidirectionalIterator return __x.__matches_ == __y.__matches_ && __x.__prefix_ == __y.__prefix_ && __x.__suffix_ == __y.__suffix_; } -# if _LIBCPP_STD_VER < 20 +# if _LIBCPP_STD_VER < 20 template inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const match_results<_BidirectionalIterator, _Allocator>& __x, const match_results<_BidirectionalIterator, _Allocator>& __y) { return !(__x == __y); } -# endif +# endif template inline _LIBCPP_HIDE_FROM_ABI void @@ -5237,13 +5242,13 @@ regex_search(const basic_string<_CharT, _ST, _SA>& __s, return __r; } -# if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 template bool regex_search(const basic_string<_Cp, _ST, _SA>&& __s, match_results::const_iterator, _Ap>&, const basic_regex<_Cp, _Tp>& __e, regex_constants::match_flag_type __flags = regex_constants::match_default) = 
delete; -# endif +# endif // regex_match @@ -5292,14 +5297,14 @@ regex_match(const basic_string<_CharT, _ST, _SA>& __s, return std::regex_match(__s.begin(), __s.end(), __m, __e, __flags); } -# if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 template inline _LIBCPP_HIDE_FROM_ABI bool regex_match(const basic_string<_CharT, _ST, _SA>&& __s, match_results::const_iterator, _Allocator>& __m, const basic_regex<_CharT, _Traits>& __e, regex_constants::match_flag_type __flags = regex_constants::match_default) = delete; -# endif +# endif template inline _LIBCPP_HIDE_FROM_ABI bool @@ -5326,10 +5331,10 @@ class _LIBCPP_TEMPLATE_VIS regex_iterator; typedef regex_iterator cregex_iterator; typedef regex_iterator sregex_iterator; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS typedef regex_iterator wcregex_iterator; typedef regex_iterator wsregex_iterator; -# endif +# endif template class _LIBCPP_TEMPLATE_VIS _LIBCPP_PREFERRED_NAME(cregex_iterator) @@ -5342,9 +5347,9 @@ public: typedef const value_type* pointer; typedef const value_type& reference; typedef forward_iterator_tag iterator_category; -# if _LIBCPP_STD_VER >= 20 +# if _LIBCPP_STD_VER >= 20 typedef input_iterator_tag iterator_concept; -# endif +# endif private: _BidirectionalIterator __begin_; @@ -5359,20 +5364,20 @@ public: _BidirectionalIterator __b, const regex_type& __re, regex_constants::match_flag_type __m = regex_constants::match_default); -# if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 regex_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b, const regex_type&& __re, regex_constants::match_flag_type __m = regex_constants::match_default) = delete; -# endif +# endif _LIBCPP_HIDE_FROM_ABI bool operator==(const regex_iterator& __x) const; -# if _LIBCPP_STD_VER >= 20 +# if _LIBCPP_STD_VER >= 20 _LIBCPP_HIDE_FROM_ABI bool operator==(default_sentinel_t) const { return *this == regex_iterator(); } -# endif -# if _LIBCPP_STD_VER < 20 +# endif +# if _LIBCPP_STD_VER < 20 _LIBCPP_HIDE_FROM_ABI bool operator!=(const regex_iterator& __x) const { return !(*this == __x); } -# endif +# endif _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __match_; } _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return std::addressof(__match_); } @@ -5456,10 +5461,10 @@ class _LIBCPP_TEMPLATE_VIS regex_token_iterator; typedef regex_token_iterator cregex_token_iterator; typedef regex_token_iterator sregex_token_iterator; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS typedef regex_token_iterator wcregex_token_iterator; typedef regex_token_iterator wsregex_token_iterator; -# endif +# endif template class _LIBCPP_TEMPLATE_VIS _LIBCPP_PREFERRED_NAME(cregex_token_iterator) @@ -5473,9 +5478,9 @@ public: typedef const value_type* pointer; typedef const value_type& reference; typedef forward_iterator_tag iterator_category; -# if _LIBCPP_STD_VER >= 20 +# if _LIBCPP_STD_VER >= 20 typedef input_iterator_tag iterator_concept; -# endif +# endif private: typedef regex_iterator<_BidirectionalIterator, _CharT, _Traits> _Position; @@ -5493,67 +5498,67 @@ public: const regex_type& __re, int __submatch = 0, regex_constants::match_flag_type __m = regex_constants::match_default); -# if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 regex_token_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b, const regex_type&& __re, int __submatch = 0, regex_constants::match_flag_type __m = regex_constants::match_default) = delete; -# endif +# endif regex_token_iterator(_BidirectionalIterator __a, 
_BidirectionalIterator __b, const regex_type& __re, const vector& __submatches, regex_constants::match_flag_type __m = regex_constants::match_default); -# if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 regex_token_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b, const regex_type&& __re, const vector& __submatches, regex_constants::match_flag_type __m = regex_constants::match_default) = delete; -# endif +# endif -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG regex_token_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b, const regex_type& __re, initializer_list __submatches, regex_constants::match_flag_type __m = regex_constants::match_default); -# if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 regex_token_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b, const regex_type&& __re, initializer_list __submatches, regex_constants::match_flag_type __m = regex_constants::match_default) = delete; -# endif -# endif // _LIBCPP_CXX03_LANG +# endif +# endif // _LIBCPP_CXX03_LANG template regex_token_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b, const regex_type& __re, const int (&__submatches)[_Np], regex_constants::match_flag_type __m = regex_constants::match_default); -# if _LIBCPP_STD_VER >= 14 +# if _LIBCPP_STD_VER >= 14 template regex_token_iterator(_BidirectionalIterator __a, _BidirectionalIterator __b, const regex_type&& __re, const int (&__submatches)[_Np], regex_constants::match_flag_type __m = regex_constants::match_default) = delete; -# endif +# endif regex_token_iterator(const regex_token_iterator&); regex_token_iterator& operator=(const regex_token_iterator&); _LIBCPP_HIDE_FROM_ABI bool operator==(const regex_token_iterator& __x) const; -# if _LIBCPP_STD_VER >= 20 +# if _LIBCPP_STD_VER >= 20 _LIBCPP_HIDE_FROM_ABI bool operator==(default_sentinel_t) const { return *this == regex_token_iterator(); } -# endif -# if _LIBCPP_STD_VER < 20 +# endif +# if _LIBCPP_STD_VER < 20 _LIBCPP_HIDE_FROM_ABI bool operator!=(const regex_token_iterator& __x) const { return !(*this == __x); } -# endif +# endif _LIBCPP_HIDE_FROM_ABI const value_type& operator*() const { return *__result_; } _LIBCPP_HIDE_FROM_ABI const value_type* operator->() const { return __result_; } @@ -5615,7 +5620,7 @@ regex_token_iterator<_BidirectionalIterator, _CharT, _Traits>::regex_token_itera __init(__a, __b); } -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG template regex_token_iterator<_BidirectionalIterator, _CharT, _Traits>::regex_token_iterator( @@ -5628,7 +5633,7 @@ regex_token_iterator<_BidirectionalIterator, _CharT, _Traits>::regex_token_itera __init(__a, __b); } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG template template @@ -5678,7 +5683,8 @@ template bool regex_token_iterator<_BidirectionalIterator, _CharT, _Traits>::operator==(const regex_token_iterator& __x) const { if (__result_ == nullptr && __x.__result_ == nullptr) return true; - if (__result_ == std::addressof(__suffix_) && __x.__result_ == std::addressof(__x.__suffix_) && __suffix_ == __x.__suffix_) + if (__result_ == std::addressof(__suffix_) && __x.__result_ == std::addressof(__x.__suffix_) && + __suffix_ == __x.__suffix_) return true; if (__result_ == nullptr || __x.__result_ == nullptr) return false; @@ -5803,7 +5809,7 @@ regex_replace(const _CharT* __s, _LIBCPP_END_NAMESPACE_STD -# if _LIBCPP_STD_VER >= 17 +# if _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD namespace pmr { template @@ -5813,16 +5819,18 @@ using match_results 
_LIBCPP_AVAILABILITY_PMR = using cmatch _LIBCPP_AVAILABILITY_PMR = match_results; using smatch _LIBCPP_AVAILABILITY_PMR = match_results; -# if _LIBCPP_HAS_WIDE_CHARACTERS +# if _LIBCPP_HAS_WIDE_CHARACTERS using wcmatch _LIBCPP_AVAILABILITY_PMR = match_results; using wsmatch _LIBCPP_AVAILABILITY_PMR = match_results; -# endif +# endif } // namespace pmr _LIBCPP_END_NAMESPACE_STD -# endif +# endif _LIBCPP_POP_MACROS +# endif // _LIBCPP_HAS_LOCALIZATION + # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include # include diff --git a/libcxx/include/strstream b/libcxx/include/strstream index 90d56694e7a6c..1a17f8389c078 100644 --- a/libcxx/include/strstream +++ b/libcxx/include/strstream @@ -133,30 +133,33 @@ private: # include <__cxx03/strstream> #else # include <__config> -# include <__ostream/basic_ostream.h> -# include -# include -# include -# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -# endif +# if _LIBCPP_HAS_LOCALIZATION -# if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY) +# include <__ostream/basic_ostream.h> +# include +# include +# include + +# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +# endif + +# if _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY) _LIBCPP_PUSH_MACROS -# include <__undef_macros> +# include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD class _LIBCPP_DEPRECATED _LIBCPP_EXPORTED_FROM_ABI strstreambuf : public streambuf { public: -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI strstreambuf() : strstreambuf(0) {} explicit strstreambuf(streamsize __alsize); -# else +# else explicit strstreambuf(streamsize __alsize = 0); -# endif +# endif strstreambuf(void* (*__palloc)(size_t), void (*__pfree)(void*)); strstreambuf(char* __gnext, streamsize __n, char* __pbeg = nullptr); strstreambuf(const char* __gnext, streamsize __n); @@ -166,10 +169,10 @@ public: strstreambuf(unsigned char* __gnext, streamsize __n, unsigned char* __pbeg = nullptr); strstreambuf(const unsigned char* __gnext, streamsize __n); -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI strstreambuf(strstreambuf&& __rhs); _LIBCPP_HIDE_FROM_ABI strstreambuf& operator=(strstreambuf&& __rhs); -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG ~strstreambuf() override; @@ -203,7 +206,7 @@ private: void __init(char* __gnext, streamsize __n, char* __pbeg); }; -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG inline _LIBCPP_HIDE_FROM_ABI strstreambuf::strstreambuf(strstreambuf&& __rhs) : streambuf(__rhs), @@ -232,7 +235,7 @@ inline _LIBCPP_HIDE_FROM_ABI strstreambuf& strstreambuf::operator=(strstreambuf& return *this; } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG class _LIBCPP_DEPRECATED _LIBCPP_EXPORTED_FROM_ABI istrstream : public istream { public: @@ -241,7 +244,7 @@ public: _LIBCPP_HIDE_FROM_ABI istrstream(const char* __s, streamsize __n) : istream(&__sb_), __sb_(__s, __n) {} _LIBCPP_HIDE_FROM_ABI istrstream(char* __s, streamsize __n) : istream(&__sb_), __sb_(__s, __n) {} -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI istrstream(istrstream&& __rhs) // extension : istream(std::move(static_cast(__rhs))), __sb_(std::move(__rhs.__sb_)) { istream::set_rdbuf(&__sb_); @@ -252,7 +255,7 @@ public: istream::operator=(std::move(__rhs)); return *this; } -# endif // _LIBCPP_CXX03_LANG +# 
endif // _LIBCPP_CXX03_LANG ~istrstream() override; @@ -274,7 +277,7 @@ public: _LIBCPP_HIDE_FROM_ABI ostrstream(char* __s, int __n, ios_base::openmode __mode = ios_base::out) : ostream(&__sb_), __sb_(__s, __n, __s + (__mode & ios::app ? std::strlen(__s) : 0)) {} -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI ostrstream(ostrstream&& __rhs) // extension : ostream(std::move(static_cast(__rhs))), __sb_(std::move(__rhs.__sb_)) { ostream::set_rdbuf(&__sb_); @@ -285,7 +288,7 @@ public: ostream::operator=(std::move(__rhs)); return *this; } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG ~ostrstream() override; @@ -316,7 +319,7 @@ public: _LIBCPP_HIDE_FROM_ABI strstream(char* __s, int __n, ios_base::openmode __mode = ios_base::in | ios_base::out) : iostream(&__sb_), __sb_(__s, __n, __s + (__mode & ios::app ? std::strlen(__s) : 0)) {} -# ifndef _LIBCPP_CXX03_LANG +# ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI strstream(strstream&& __rhs) // extension : iostream(std::move(static_cast(__rhs))), __sb_(std::move(__rhs.__sb_)) { iostream::set_rdbuf(&__sb_); @@ -327,7 +330,7 @@ public: iostream::operator=(std::move(__rhs)); return *this; } -# endif // _LIBCPP_CXX03_LANG +# endif // _LIBCPP_CXX03_LANG ~strstream() override; @@ -350,7 +353,11 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -# endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || defined(_LIBCPP_BUILDING_LIBRARY) -#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) +# endif // _LIBCPP_STD_VER < 26 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_STRSTREAM) || + // defined(_LIBCPP_BUILDING_LIBRARY) + +# endif // _LIBCPP_HAS_LOCALIZATION + +#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_STRSTREAM diff --git a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in index 7aedfde89916c..9bff5021494ef 100644 --- a/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in +++ b/libcxx/test/configs/armv7m-picolibc-libc++.cfg.in @@ -11,10 +11,6 @@ config.substitutions.append(('%{compile_flags}', # "large atomic operation may incur significant performance penalty; the # access size (4 bytes) exceeds the max lock-free size (0 bytes)" ' -Wno-atomic-alignment' - - # Various libc++ headers check for the definition of _NEWLIB_VERSION - # which for picolibc is defined in picolibc.h. - ' -include picolibc.h' )) config.substitutions.append(('%{link_flags}', '-nostdlib -nostdlib++ -L %{lib-dir} -lc++ -lc++abi' diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py index 304c803b76c3d..a4f2928eda332 100644 --- a/libcxx/test/libcxx/system_reserved_names.gen.py +++ b/libcxx/test/libcxx/system_reserved_names.gen.py @@ -11,6 +11,7 @@ # provided macros (in other words, ensure that we push/pop correctly everywhere). # RUN: %{python} %s %{libcxx-dir}/utils +# END. import sys @@ -28,6 +29,11 @@ {lit_header_restrictions.get(header, '')} {lit_header_undeprecations.get(header, '')} +// UNSUPPORTED: FROZEN-CXX03-HEADERS-FIXME + +// This is required to detect the platform we're building for below. +#include <__config> + #define SYSTEM_RESERVED_NAME This name should not be used in libc++ // libc++ does not use single-letter names as a matter of principle. 
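A note on the push/pop convention this generator exercises (a minimal sketch in plain C++ using the standard #pragma, not the actual _LIBCPP_PUSH_MACROS machinery; the names here are hypothetical): libc++ headers must keep working when user code defines arbitrary non-reserved macros before inclusion, which the headers handle by saving the user's macro, undefining it, and restoring it afterwards.

    // Sketch: shield a header's use of the identifier 'min' from a
    // user-defined macro of the same name.
    #pragma push_macro("min")
    #undef min

    template <class T>
    const T& header_min(const T& a, const T& b) { return b < a ? b : a; }

    #pragma pop_macro("min") // the user's 'min' macro is visible again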
From 07161a3fb16f07f4001de43e17d0cd487841ef98 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 3 Apr 2025 23:07:53 -0700 Subject: [PATCH 0659/1029] [RISCV] Return NoMatch if register list does not start with a curly brace. This way we emit the error message that explains the full syntax for a register list. parseZcmpStackAdj had to be modified to not assume the previous operand had been successfully parsed as a register list. --- .../lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 15 +++++++++++---- llvm/test/MC/RISCV/rv32xqccmp-invalid.s | 5 +++++ llvm/test/MC/RISCV/rv32zcmp-invalid.s | 3 +++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d90d1dda07081..2fdee13a734f6 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2576,10 +2576,12 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, // must include `fp`/`s0` in the list: // Rlist: {ra, s0[-sN]} // XRlist: {x1, x8[-x9][, x18[-xN]]} - SMLoc S = getLoc(); - if (parseToken(AsmToken::LCurly, "register list must start with '{'")) - return ParseStatus::Failure; + if (getTok().isNot(AsmToken::LCurly)) + return ParseStatus::NoMatch; + + SMLoc S = getLoc(); + Lex(); bool IsRVE = isRVE(); @@ -2674,7 +2676,12 @@ ParseStatus RISCVAsmParser::parseZcmpStackAdj(OperandVector &Operands, return ParseStatus::NoMatch; int64_t StackAdjustment = getTok().getIntVal(); - unsigned RlistVal = static_cast(Operands[1].get())->Rlist.Val; + + auto *RListOp = static_cast(Operands.back().get()); + if (!RListOp->isRlist()) + return ParseStatus::NoMatch; + + unsigned RlistVal = RListOp->Rlist.Val; assert(RlistVal != RISCVZC::INVALID_RLIST); unsigned StackAdjBase = RISCVZC::getStackAdjBase(RlistVal, isRV64()); diff --git a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s index 5bfc2e3498bef..ece3513120392 100644 --- a/llvm/test/MC/RISCV/rv32xqccmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32xqccmp-invalid.s @@ -37,3 +37,8 @@ qc.cm.pop {ra, s0-s1}, -40 # CHECK-ERROR: :[[@LINE+1]]:14: error: register list must include 's0' or 'x8' qc.cm.pushfp {ra}, -16 +# CHECK-ERROR: :[[@LINE+1]]:12: error: operand must be {ra [, s0[-sN]]} or {x1 [, x8[-x9][, x18[-xN]]]} +qc.cm.push x1, -16 + +# CHECK-ERROR: :[[@LINE+1]]:14: error: operand must be {ra, s0[-sN]} or {x1, x8[-x9][, x18[-xN]]} +qc.cm.pushfp x1, -16 diff --git a/llvm/test/MC/RISCV/rv32zcmp-invalid.s b/llvm/test/MC/RISCV/rv32zcmp-invalid.s index c41cc35a8f8ee..b4261f865fae7 100644 --- a/llvm/test/MC/RISCV/rv32zcmp-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmp-invalid.s @@ -45,3 +45,6 @@ cm.pop {ra, x8-x9, x18-x17}, -40 # CHECK-ERROR: :[[@LINE+1]]:16: error: invalid register cm.pop {ra, x8-f8, x18-x17}, -40 + +# CHECK-ERROR: :[[@LINE+1]]:9: error: operand must be {ra [, s0[-sN]]} or {x1 [, x8[-x9][, x18[-xN]]]} +cm.push x1, -16 From 0d3f5ec0da064d2314098644e78d29d3c84e179c Mon Sep 17 00:00:00 2001 From: Evan Wilde Date: Fri, 4 Apr 2025 09:02:24 -0700 Subject: [PATCH 0660/1029] [compiler-rt][CMake] Pass all flags to _Float16 try-compile (#133952) The try-compile mechanism requires that `CMAKE_REQUIRED_FLAGS` is a space-separated string instead of a list of flags. 
The original code expanded `BUILTIN_FLAGS` into `CMAKE_REQUIRED_FLAGS` as a space-separated string and then would overwrite `CMAKE_REQUIRED_FLAGS` with `TARGET_${arch}_CFLAGS` prepended to the unexpanded `BUILTIN_CFLAGS_${arch}`. This resulted in the first two arguments being passed into the try-compile invocation, but dropping the other arguments listed in `BUILTIN_CFLAGS_${arch}`. This patch appends `TARGET_${arch}_CFLAGS` and `BUILTIN_CFLAGS_${arch}` to `CMAKE_REQUIRED_FLAGS` before expanding CMAKE_REQUIRED_FLAGS as a space-separated string. This passes any pre-set required flags, in addition to all of the builtin and target flags to the Float16 detection. --- compiler-rt/lib/builtins/CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 626b21e30ed6b..5d78b5a780428 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -854,15 +854,17 @@ else () cmake_push_check_state() # TODO: we should probably make most of the checks in builtin-config depend on the target flags. set(BUILTIN_CFLAGS_${arch} ${BUILTIN_CFLAGS}) - # CMAKE_REQUIRED_FLAGS must be a space separated string but unlike TARGET_${arch}_CFLAGS, - # BUILTIN_CFLAGS_${arch} is a CMake list, so we have to join it to create a valid command line. - list(JOIN BUILTIN_CFLAGS " " CMAKE_REQUIRED_FLAGS) - set(CMAKE_REQUIRED_FLAGS "${TARGET_${arch}_CFLAGS} ${BUILTIN_CFLAGS_${arch}}") + # CMAKE_REQUIRED_FLAGS must be a space separated string + # Join BUILTIN_CFLAGS_${arch} and TARGET_${arch}_CFLAGS as a + # space-separated string. + list(APPEND CMAKE_REQUIRED_FLAGS + ${BUILTIN_CFLAGS_${arch}} + ${TARGET_${arch}_CFLAGS}) + list(JOIN CMAKE_REQUIRED_FLAGS " " CMAKE_REQUIRED_FLAGS) message(STATUS "Performing additional configure checks with target flags: ${CMAKE_REQUIRED_FLAGS}") # For ARM archs, exclude any VFP builtins if VFP is not supported if (${arch} MATCHES "^(arm|armhf|armv7|armv7s|armv7k|armv7m|armv7em|armv8m.main|armv8.1m.main)$") - string(REPLACE ";" " " _TARGET_${arch}_CFLAGS "${TARGET_${arch}_CFLAGS}") - check_compile_definition(__ARM_FP "${CMAKE_C_FLAGS} ${_TARGET_${arch}_CFLAGS}" COMPILER_RT_HAS_${arch}_VFP) + check_compile_definition(__ARM_FP "${CMAKE_C_FLAGS}" COMPILER_RT_HAS_${arch}_VFP) if(NOT COMPILER_RT_HAS_${arch}_VFP) list(REMOVE_ITEM ${arch}_SOURCES ${arm_Thumb1_VFPv2_DP_SOURCES} ${arm_Thumb1_VFPv2_SP_SOURCES} ${arm_Thumb1_SjLj_EH_SOURCES}) else() From e8d50097849081168f0285418ce8a36733eb7154 Mon Sep 17 00:00:00 2001 From: Kevin Gleason Date: Fri, 4 Apr 2025 11:29:51 -0500 Subject: [PATCH 0661/1029] [mlir] Fix parsing of empty complex tensors (#134322) After https://github.com/llvm/llvm-project/pull/133220 we had some empty complex literals (`tensor<0xcomplex>`) failing to parse. This was largely due to the ambiguity between `shape.empty()` meaning splat (`dense<1>`) or empty literal (`dense<>`). Used type's numel to disambiguate during verification. 
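To make the ambiguity concrete, a hedged sketch of the disambiguation rule in plain C++ (a hypothetical helper, not the actual MLIR parser API): an empty parsed element list denotes a splat only when the tensor type actually holds elements, so an empty literal over a zero-element shape must skip the splat validation.

    #include <cstddef>

    // Sketch: decide whether an empty parsed element list should be
    // validated as a splat. An empty list over a zero-element tensor
    // (an empty literal) is not a splat.
    bool isSplatLiteral(bool parsedShapeIsEmpty, std::size_t typeNumElements) {
      return parsedShapeIsEmpty && typeNumElements != 0;
    }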
---
 mlir/lib/AsmParser/AttributeParser.cpp | 6 ++++--
 mlir/test/IR/parser.mlir               | 4 ++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/AsmParser/AttributeParser.cpp b/mlir/lib/AsmParser/AttributeParser.cpp
index 93a24dee29ad2..2474e88373e04 100644
--- a/mlir/lib/AsmParser/AttributeParser.cpp
+++ b/mlir/lib/AsmParser/AttributeParser.cpp
@@ -566,8 +566,10 @@ DenseElementsAttr TensorLiteralParser::getAttr(SMLoc loc, ShapedType type) {
   if (ComplexType complexTy = dyn_cast<ComplexType>(eltType)) {
     eltType = complexTy.getElementType();
     isComplex = true;
-    // Complex types have 2 elements.
-    if (shape.empty() && storage.size() != 2) {
+    // Complex types have N*2 elements or a complex splat.
+    // An empty shape may mean a splat or an empty literal; only validate splats.
+    bool isSplat = shape.empty() && type.getNumElements() != 0;
+    if (isSplat && storage.size() != 2) {
       p.emitError(loc) << "parsed " << storage.size()
                        << " elements, but type (" << complexTy
                        << ") expected 2 elements";
       return nullptr;
diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir
index cace1fefa43d6..8b192ff11d573 100644
--- a/mlir/test/IR/parser.mlir
+++ b/mlir/test/IR/parser.mlir
@@ -730,6 +730,10 @@ func.func @densetensorattr() -> () {
   "complex_attr"(){bar = dense<(1.000000e+00,0.000000e+00)> : tensor>} : () -> ()
   // CHECK: dense<[(1.000000e+00,0.000000e+00), (2.000000e+00,2.000000e+00)]> : tensor<2xcomplex>
   "complex_attr"(){bar = dense<[(1.000000e+00,0.000000e+00), (2.000000e+00,2.000000e+00)]> : tensor<2xcomplex>} : () -> ()
+  // CHECK: dense<> : tensor<0xcomplex>
+  "complex_attr"(){bar = dense<> : tensor<0xcomplex>} : () -> ()
+  // CHECK: dense<> : tensor<2x0xcomplex>
+  "complex_attr"(){bar = dense<> : tensor<2x0xcomplex>} : () -> ()
   return
 }

From 3d24046b3306cbc682aa1f17426169a942d8931a Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis
Date: Fri, 4 Apr 2025 17:31:14 +0100
Subject: [PATCH 0662/1029] [BOLT] Skip out-of-range pending relocations
 (#116964)

When a pending relocation is created, it is also marked as optional or
not. It can be optional when such a relocation is added as part of an
optimization (e.g., `scanExternalRefs`).

When BOLT tries to `flushPendingRelocations`, it safely skips any
optional relocations that cannot be encoded due to being out of range.
A prerequisite is that the `-force-patch` flag is used; otherwise, BOLT
bails out with a relevant message.

Background: BOLT, as part of scanExternalRefs, identifies external
references from calls and creates some pending relocations for them.
Those, when flushed, update references to point to the optimized
functions. This optimization can be disabled using `--no-scan`.

BOLT can assert if any of these pending relocations cannot be encoded.
This patch does not disable this optimization but instead selectively
applies it, given that a pending relocation is optional and
`-force-patch` was enabled.
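For context on the encoding check this patch introduces, a minimal standalone sketch (plain C++, not the BOLT API; the function name is hypothetical): AArch64 direct calls and jumps (`R_AARCH64_CALL26`/`R_AARCH64_JUMP26`) carry a 26-bit immediate scaled by 4, so a target is reachable only within a signed 28-bit byte displacement of the PC, roughly +/-128 MiB.

    #include <cstdint>

    // Sketch: can a direct AArch64 call/branch at 'pc' reach 'target'?
    // The byte displacement must fit in a signed 28-bit value (a 26-bit
    // instruction field shifted left by two).
    bool fitsInCall26(uint64_t target, uint64_t pc) {
      int64_t disp = static_cast<int64_t>(target - pc);
      return disp >= -(int64_t{1} << 27) && disp < (int64_t{1} << 27);
    }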
---
 bolt/include/bolt/Core/Relocation.h   |  3 ++
 bolt/lib/Core/BinaryFunction.cpp      |  2 +
 bolt/lib/Core/BinarySection.cpp       | 27 +++++++++++-
 bolt/lib/Core/Relocation.cpp          | 33 ++++++++++++++
 bolt/unittests/Core/BinaryContext.cpp | 62 +++++++++++++++++++++++++++
 bolt/unittests/Core/CMakeLists.txt    |  1 +
 6 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/bolt/include/bolt/Core/Relocation.h b/bolt/include/bolt/Core/Relocation.h
index 78e94cd63829d..3fcf69d79dba2 100644
--- a/bolt/include/bolt/Core/Relocation.h
+++ b/bolt/include/bolt/Core/Relocation.h
@@ -86,6 +86,9 @@ class Relocation {
   /// Adjust value depending on relocation type (make it PC relative or not).
   static uint64_t encodeValue(uint32_t Type, uint64_t Value, uint64_t PC);
 
+  /// Return true if there are enough bits to encode the relocation value.
+  static bool canEncodeValue(uint32_t Type, uint64_t Value, uint64_t PC);
+
   /// Extract current relocated value from binary contents. This is used for
   /// RISC architectures where values are encoded in specific bits depending
   /// on the relocation value. For X86, we limit to sign extending the value
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index d1b293ada5fdc..c4f4d234b30c0 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -1795,6 +1795,8 @@ bool BinaryFunction::scanExternalRefs() {
     // Create relocation for every fixup.
     for (const MCFixup &Fixup : Fixups) {
       std::optional<Relocation> Rel = BC.MIB->createRelocation(Fixup, *BC.MAB);
       if (!Rel) {
         Success = false;
         continue;
       }
+      // Can be skipped in case of overflow during relocation value encoding.
+      Rel->setOptional();
diff --git a/bolt/lib/Core/BinarySection.cpp b/bolt/lib/Core/BinarySection.cpp
index b16e0a4333aa2..e5def7547a187 100644
--- a/bolt/lib/Core/BinarySection.cpp
+++ b/bolt/lib/Core/BinarySection.cpp
@@ -12,6 +12,7 @@
 #include "bolt/Core/BinarySection.h"
 #include "bolt/Core/BinaryContext.h"
+#include "bolt/Utils/CommandLineOpts.h"
 #include "bolt/Utils/Utils.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
@@ -22,8 +23,8 @@ using namespace llvm;
 using namespace bolt;
 
 namespace opts {
-extern cl::opt<bool> PrintRelocations;
 extern cl::opt<bool> HotData;
+extern cl::opt<bool> PrintRelocations;
 } // namespace opts
 
 uint64_t BinarySection::Count = 0;
@@ -174,11 +175,30 @@ void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS,
     OS.pwrite(Patch.Bytes.data(), Patch.Bytes.size(),
               SectionFileOffset + Patch.Offset);
 
+  uint64_t SkippedPendingRelocations = 0;
   for (Relocation &Reloc : PendingRelocations) {
     uint64_t Value = Reloc.Addend;
     if (Reloc.Symbol)
       Value += Resolver(Reloc.Symbol);
+
+    // Safely skip any optional pending relocation that cannot be encoded.
+    if (Reloc.isOptional() &&
+        !Relocation::canEncodeValue(Reloc.Type, Value,
+                                    SectionAddress + Reloc.Offset)) {
+
+      // A successful run of 'scanExternalRefs' means that all pending
+      // relocations are flushed. Otherwise, PatchEntries should run.
+      if (!opts::ForcePatch) {
+        BC.errs()
+            << "BOLT-ERROR: cannot encode relocation for symbol "
+            << Reloc.Symbol->getName()
+            << " as it is out-of-range. 
To proceed must use -force-patch\n"; + exit(1); + } + + ++SkippedPendingRelocations; + continue; + } Value = Relocation::encodeValue(Reloc.Type, Value, SectionAddress + Reloc.Offset); @@ -197,6 +217,11 @@ void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS, } clearList(PendingRelocations); + + if (SkippedPendingRelocations > 0 && opts::Verbosity >= 1) { + BC.outs() << "BOLT-INFO: skipped " << SkippedPendingRelocations + << " out-of-range optional relocations\n"; + } } BinarySection::~BinarySection() { updateContents(nullptr, 0); } diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp index 1a142c7d9716c..4696a1f1f0402 100644 --- a/bolt/lib/Core/Relocation.cpp +++ b/bolt/lib/Core/Relocation.cpp @@ -271,6 +271,16 @@ static uint64_t encodeValueX86(uint32_t Type, uint64_t Value, uint64_t PC) { return Value; } +static bool canEncodeValueAArch64(uint32_t Type, uint64_t Value, uint64_t PC) { + switch (Type) { + default: + llvm_unreachable("unsupported relocation"); + case ELF::R_AARCH64_CALL26: + case ELF::R_AARCH64_JUMP26: + return isInt<28>(Value - PC); + } +} + static uint64_t encodeValueAArch64(uint32_t Type, uint64_t Value, uint64_t PC) { switch (Type) { default: @@ -303,6 +313,16 @@ static uint64_t encodeValueAArch64(uint32_t Type, uint64_t Value, uint64_t PC) { return Value; } +static uint64_t canEncodeValueRISCV(uint32_t Type, uint64_t Value, + uint64_t PC) { + switch (Type) { + default: + llvm_unreachable("unsupported relocation"); + case ELF::R_RISCV_64: + return true; + } +} + static uint64_t encodeValueRISCV(uint32_t Type, uint64_t Value, uint64_t PC) { switch (Type) { default: @@ -739,6 +759,19 @@ uint64_t Relocation::encodeValue(uint32_t Type, uint64_t Value, uint64_t PC) { } } +bool Relocation::canEncodeValue(uint32_t Type, uint64_t Value, uint64_t PC) { + switch (Arch) { + default: + llvm_unreachable("Unsupported architecture"); + case Triple::aarch64: + return canEncodeValueAArch64(Type, Value, PC); + case Triple::riscv64: + return canEncodeValueRISCV(Type, Value, PC); + case Triple::x86_64: + return true; + } +} + uint64_t Relocation::extractValue(uint32_t Type, uint64_t Contents, uint64_t PC) { switch (Arch) { diff --git a/bolt/unittests/Core/BinaryContext.cpp b/bolt/unittests/Core/BinaryContext.cpp index 09d16966334da..377517adf03db 100644 --- a/bolt/unittests/Core/BinaryContext.cpp +++ b/bolt/unittests/Core/BinaryContext.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Core/BinaryContext.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/Support/TargetSelect.h" @@ -161,6 +162,67 @@ TEST_P(BinaryContextTester, FlushPendingRelocJUMP26) { << "Wrong forward branch value\n"; } +TEST_P(BinaryContextTester, + FlushOptionalOutOfRangePendingRelocCALL26_ForcePatchOff) { + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + + // Tests that flushPendingRelocations exits if any pending relocation is out + // of range and PatchEntries hasn't run. Pending relocations are added by + // scanExternalRefs, so this ensures that either all scanExternalRefs + // relocations were flushed or PatchEntries ran. 
+ + BinarySection &BS = BC->registerOrUpdateSection( + ".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + // Create symbol 'Func0x4' + MCSymbol *RelSymbol = BC->getOrCreateGlobalSymbol(4, "Func"); + ASSERT_TRUE(RelSymbol); + Relocation Reloc{8, RelSymbol, ELF::R_AARCH64_CALL26, 0, 0}; + Reloc.setOptional(); + BS.addPendingRelocation(Reloc); + + SmallVector Vect; + raw_svector_ostream OS(Vect); + + // Resolve relocation symbol to a high value so encoding will be out of range. + EXPECT_EXIT(BS.flushPendingRelocations( + OS, [&](const MCSymbol *S) { return 0x800000F; }), + ::testing::ExitedWithCode(1), + "BOLT-ERROR: cannot encode relocation for symbol Func0x4 as it is" + " out-of-range. To proceed must use -force-patch"); +} + +TEST_P(BinaryContextTester, + FlushOptionalOutOfRangePendingRelocCALL26_ForcePatchOn) { + if (GetParam() != Triple::aarch64) + GTEST_SKIP(); + + // Tests that flushPendingRelocations can skip flushing any optional pending + // relocations that cannot be encoded, given that PatchEntries runs. + opts::ForcePatch = true; + + opts::Verbosity = 1; + testing::internal::CaptureStdout(); + + BinarySection &BS = BC->registerOrUpdateSection( + ".text", ELF::SHT_PROGBITS, ELF::SHF_EXECINSTR | ELF::SHF_ALLOC); + MCSymbol *RelSymbol = BC->getOrCreateGlobalSymbol(4, "Func"); + ASSERT_TRUE(RelSymbol); + Relocation Reloc{8, RelSymbol, ELF::R_AARCH64_CALL26, 0, 0}; + Reloc.setOptional(); + BS.addPendingRelocation(Reloc); + + SmallVector Vect; + raw_svector_ostream OS(Vect); + + // Resolve relocation symbol to a high value so encoding will be out of range. + BS.flushPendingRelocations(OS, [&](const MCSymbol *S) { return 0x800000F; }); + outs().flush(); + std::string CapturedStdOut = testing::internal::GetCapturedStdout(); + EXPECT_EQ(CapturedStdOut, + "BOLT-INFO: skipped 1 out-of-range optional relocations\n"); +} + #endif TEST_P(BinaryContextTester, BaseAddress) { diff --git a/bolt/unittests/Core/CMakeLists.txt b/bolt/unittests/Core/CMakeLists.txt index 8ac88b701ea05..54e8ea10cda12 100644 --- a/bolt/unittests/Core/CMakeLists.txt +++ b/bolt/unittests/Core/CMakeLists.txt @@ -19,6 +19,7 @@ target_link_libraries(CoreTests LLVMBOLTCore LLVMBOLTRewrite LLVMBOLTProfile + LLVMBOLTUtils LLVMTestingSupport ) From 5acab1bd15004e0ab7af60d2c4919c189bd38520 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 4 Apr 2025 09:37:15 -0700 Subject: [PATCH 0663/1029] [mlir][SPIRV] `IfOpConversion`: Compute result types earlier (#134380) Compute the result types and bail out before modifying any IR. That is more efficient when type conversion failed, because no modifications must be rolled back. Note: This is in preparation of the One-Shot Dialect Conversion refactoring. --- mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp index 31d8cd2206148..baac1b374b126 100644 --- a/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp +++ b/mlir/lib/Conversion/SCFToSPIRV/SCFToSPIRV.cpp @@ -225,6 +225,18 @@ struct IfOpConversion : SCFToSPIRVPattern { // subsequently converges. auto loc = ifOp.getLoc(); + // Compute return types. 
+    SmallVector<Type> returnTypes;
+    for (auto result : ifOp.getResults()) {
+      auto convertedType = typeConverter.convertType(result.getType());
+      if (!convertedType)
+        return rewriter.notifyMatchFailure(
+            loc,
+            llvm::formatv("failed to convert type '{0}'", result.getType()));
+
+      returnTypes.push_back(convertedType);
+    }
+
     // Create `spirv.selection` operation, selection header block and merge
     // block.
     auto selectionOp =
@@ -261,16 +273,6 @@ struct IfOpConversion : SCFToSPIRVPattern {
                                      thenBlock, ArrayRef<Value>(), elseBlock,
                                      ArrayRef<Value>());
 
-    SmallVector<Type> returnTypes;
-    for (auto result : ifOp.getResults()) {
-      auto convertedType = typeConverter.convertType(result.getType());
-      if (!convertedType)
-        return rewriter.notifyMatchFailure(
-            loc,
-            llvm::formatv("failed to convert type '{0}'", result.getType()));
-
-      returnTypes.push_back(convertedType);
-    }
     replaceSCFOutputValue(ifOp, selectionOp, rewriter, scfToSPIRVContext,
                           returnTypes);
     return success();

From f9193f3b18f08547e2f92b5e354a44655bfc1b94 Mon Sep 17 00:00:00 2001
From: Snehasish Kumar
Date: Fri, 4 Apr 2025 10:37:25 -0600
Subject: [PATCH 0664/1029] [DebugInfo] Preserve line and column number when
 merging debug info. (#129960)

This patch introduces a new option `-pick-merged-source-locations` to
preserve an arbitrary but deterministic version of debug information
when DILocations are merged. This is intended to be used in production
environments from which sample-based profiles are derived, such as
AutoFDO and MemProf.

With this patch we have seen a 0.2% improvement on an internal workload
at Google when generating AutoFDO profiles. It also significantly helps
MemProf by preserving debug info for merged call instructions used in
the contextual profile.

---------

Co-authored-by: Krzysztof Pszeniczny
---
 llvm/docs/HowToUpdateDebugInfo.rst            | 14 +++-
 llvm/docs/SourceLevelDebugging.rst            | 13 ++++
 llvm/lib/IR/DebugInfoMetadata.cpp             | 21 +++++
 .../DebugInfo/pick-merged-source-locations.ll | 77 +++++++++++++++++++
 4 files changed, 121 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/DebugInfo/pick-merged-source-locations.ll

diff --git a/llvm/docs/HowToUpdateDebugInfo.rst b/llvm/docs/HowToUpdateDebugInfo.rst
index d8c300f2f3a70..3088f59c1066a 100644
--- a/llvm/docs/HowToUpdateDebugInfo.rst
+++ b/llvm/docs/HowToUpdateDebugInfo.rst
@@ -9,7 +9,8 @@ Introduction
 ============
 
 Certain kinds of code transformations can inadvertently result in a loss of
-debug info, or worse, make debug info misrepresent the state of a program.
+debug info, or worse, make debug info misrepresent the state of a program. Debug
+info availability is also essential for SamplePGO.
 
 This document specifies how to correctly update debug info in various kinds of
 code transformations, and offers suggestions for how to create targeted debug
@@ -89,9 +90,14 @@ has a location with an accurate scope attached, and b) to prevent misleading
 single-stepping (or breakpoint) behavior. Often, merged instructions are
 memory accesses which can trap: having an accurate scope attached greatly
 assists in crash triage by identifying the (possibly inlined) function where the bad
-memory access occurred. This rule is also meant to assist SamplePGO by banning
-scenarios in which a sample of a block containing a merged instruction is
-misattributed to a block containing one of the instructions-to-be-merged.
+memory access occurred.
+
+To maintain distinct source locations for SamplePGO, it is often beneficial to
+retain an arbitrary but deterministic location instead of discarding line and
+column information as part of merging. In particular, loss of location
+information for calls inhibits optimizations such as indirect call promotion.
+This behavior can be optionally enabled until support for accurately
+representing merged instructions in the line table is implemented.
 
 Examples of transformations that should follow this rule include:
diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst
index b3007756a8d07..8a11dcf5254a9 100644
--- a/llvm/docs/SourceLevelDebugging.rst
+++ b/llvm/docs/SourceLevelDebugging.rst
@@ -55,6 +55,8 @@ the stored debug information into source-language specific information. As
 such, a debugger must be aware of the source-language, and is thus tied to a
 specific language or family of languages.
 
+.. _intro_consumers:
+
 Debug information consumers
 ---------------------------
 
@@ -71,6 +73,17 @@ as Visual Studio and WinDBG. LLVM's debug information format is mostly derived
 from and inspired by DWARF, but it is feasible to translate into other target
 debug info formats such as STABS.
 
+SamplePGO (also known as `AutoFDO `_)
+is a variant of profile-guided optimization that uses hardware sampling based
+profilers to collect branch frequency data with low overhead in production
+environments. It relies on debug information to associate profile information
+to LLVM IR, which is then used to guide optimization heuristics. Maintaining
+deterministic and distinct source locations is necessary to maximize the
+accuracy of mapping hardware sample counts to LLVM IR. For example, DWARF
+`discriminators `_ allow
+SamplePGO to distinguish between multiple paths of execution which map to the
+same source line.
+
 It would also be reasonable to use debug information to feed profiling tools
 for analysis of generated code, or, tools for reconstructing the original
 source from generated code.
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index f8c24d896df32..12aba7d2bd123 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
 #include
 #include
 
@@ -34,6 +35,12 @@ cl::opt<bool> EnableFSDiscriminator(
     cl::desc("Enable adding flow sensitive discriminators"));
 } // namespace llvm
 
+// When true, preserves the line and column number by picking one of the
+// merged locations in a deterministic manner to assist sample-based PGO.
+static cl::opt<bool> PickMergedSourceLocations(
+    "pick-merged-source-locations", cl::init(false), cl::Hidden,
+    cl::desc("Preserve line and column number when merging locations."));
+
 uint32_t DIType::getAlignInBits() const {
   return (getTag() == dwarf::DW_TAG_LLVM_ptrauth_type ? 0 : SubclassData32);
 }
@@ -125,6 +132,20 @@ DILocation *DILocation::getMergedLocation(DILocation *LocA, DILocation *LocB) {
   if (LocA == LocB)
     return LocA;
 
+  // For some use cases (SamplePGO), it is important to retain distinct source
+  // locations. When this flag is set, we choose arbitrarily between A and B,
+  // rather than computing a merged location using line 0, which is typically
+  // not useful for PGO. 
+ if (PickMergedSourceLocations) { + auto A = std::make_tuple(LocA->getLine(), LocA->getColumn(), + LocA->getDiscriminator(), LocA->getFilename(), + LocA->getDirectory()); + auto B = std::make_tuple(LocB->getLine(), LocB->getColumn(), + LocB->getDiscriminator(), LocB->getFilename(), + LocB->getDirectory()); + return A < B ? LocA : LocB; + } + LLVMContext &C = LocA->getContext(); using LocVec = SmallVector; diff --git a/llvm/test/DebugInfo/pick-merged-source-locations.ll b/llvm/test/DebugInfo/pick-merged-source-locations.ll new file mode 100644 index 0000000000000..2a9387e039232 --- /dev/null +++ b/llvm/test/DebugInfo/pick-merged-source-locations.ll @@ -0,0 +1,77 @@ +;; This test verifies that we assign a deterministic location for merged +;; instructions when -pick-merged-source-locations is enabled. We use the +;; simplifycfg pass to test this behaviour since it was a common source of +;; merged instructions, however we intend this to apply to all users of the +;; getMergedLocation API. + +;; Run simplifycfg and check that only 1 call to bar remains and it's debug +;; location has a valid line number (lexicographically smallest). +; RUN: opt %s -passes=simplifycfg -hoist-common-insts -pick-merged-source-locations -S | FileCheck %s --check-prefix=ENABLED +; ENABLED: call i32 @bar{{.*!dbg !}}[[TAG:[0-9]+]] +; ENABLED-NOT: call i32 @bar +; ENABLED: ![[TAG]] = !DILocation(line: 9, column: 16, scope: !9) + +;; Run simplifycfg without the pass to ensure that we don't spuriously start +;; passing the test if simplifycfg behaviour changes. +; RUN: opt %s -passes=simplifycfg -hoist-common-insts -pick-merged-source-locations=false -S | FileCheck %s --check-prefix=DISABLED +; DISABLED: call i32 @bar{{.*!dbg !}}[[TAG:[0-9]+]] +; DISABLED-NOT: call i32 @bar +; DISABLED: ![[TAG]] = !DILocation(line: 0, scope: !9) + +; ModuleID = '../llvm/test/DebugInfo/Inputs/debug-info-merge-call.c' +source_filename = "../llvm/test/DebugInfo/Inputs/debug-info-merge-call.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local i32 @test(i32 %n) !dbg !9 { +entry: + %call = call i32 @foo(i32 %n), !dbg !12 + %cmp1 = icmp sgt i32 %n, 100, !dbg !13 + br i1 %cmp1, label %if.then, label %if.else, !dbg !13 + +if.then: ; preds = %entry + %call2 = call i32 @bar(i32 %n), !dbg !14 + %add = add nsw i32 %call2, %call, !dbg !15 + br label %if.end, !dbg !16 + +if.else: ; preds = %entry + %call4 = call i32 @bar(i32 %n), !dbg !17 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %r.0 = phi i32 [ %add, %if.then ], [ %call4, %if.else ], !dbg !18 + ret i32 %r.0, !dbg !19 +} + +declare !dbg !20 i32 @foo(i32) + +declare !dbg !21 i32 @bar(i32) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 21.0.0git (git@github.com:snehasish/llvm-project.git 6ce41db6b0275d060d6e60f88b96a1657024345c)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "../llvm/test/DebugInfo/Inputs/debug-info-merge-call.c", directory: "/usr/local/google/home/snehasishk/working/llvm-project/build-assert", checksumkind: CSK_MD5, checksum: "ac1be6c40dad11691922d600f9d55c55") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = 
!{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{!"clang version 21.0.0git (git@github.com:snehasish/llvm-project.git 6ce41db6b0275d060d6e60f88b96a1657024345c)"} +!9 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 5, type: !10, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!10 = !DISubroutineType(types: !11) +!11 = !{} +!12 = !DILocation(line: 7, column: 13, scope: !9) +!13 = !DILocation(line: 8, column: 8, scope: !9) +!14 = !DILocation(line: 9, column: 16, scope: !9) +!15 = !DILocation(line: 9, column: 14, scope: !9) +!16 = !DILocation(line: 10, column: 3, scope: !9) +!17 = !DILocation(line: 11, column: 10, scope: !9) +!18 = !DILocation(line: 0, scope: !9) +!19 = !DILocation(line: 13, column: 3, scope: !9) +!20 = !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!21 = !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !10, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + From 70a1445e401f6d7b531933ab157fba98ec84f69f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 4 Apr 2025 09:30:55 -0700 Subject: [PATCH 0665/1029] [RISCV] Prefer RegList over Rlist in assembler. NFC This makes it more obvious what the R means. I've kept rlist in place that refer to the encoding. --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 60 +++++++++---------- .../RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 10 ++-- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 6 +- .../RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 4 +- .../RISCV/MCTargetDesc/RISCVInstPrinter.h | 4 +- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 4 +- llvm/lib/Target/RISCV/RISCVInstrInfoXqccmp.td | 18 +++--- llvm/lib/Target/RISCV/RISCVInstrInfoZc.td | 16 ++--- 8 files changed, 61 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 2fdee13a734f6..bc725ea939aec 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -215,10 +215,10 @@ class RISCVAsmParser : public MCTargetAsmParser { ParseStatus parseGPRPair(OperandVector &Operands, bool IsRV64Inst); ParseStatus parseFRMArg(OperandVector &Operands); ParseStatus parseFenceArg(OperandVector &Operands); - ParseStatus parseReglist(OperandVector &Operands) { + ParseStatus parseRegList(OperandVector &Operands) { return parseRegListCommon(Operands, /*MustIncludeS0=*/false); } - ParseStatus parseReglistS0(OperandVector &Operands) { + ParseStatus parseRegListS0(OperandVector &Operands) { return parseRegListCommon(Operands, /*MustIncludeS0=*/true); } ParseStatus parseRegListCommon(OperandVector &Operands, bool MustIncludeS0); @@ -349,7 +349,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { VType, FRM, Fence, - Rlist, + RegList, Spimm, RegReg, } Kind; @@ -388,8 +388,8 @@ struct RISCVOperand final : public MCParsedAsmOperand { unsigned Val; }; - struct RlistOp { - unsigned Val; + struct RegListOp { + unsigned Encoding; }; struct SpimmOp { @@ -411,7 +411,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { VTypeOp VType; FRMOp FRM; FenceOp Fence; - RlistOp Rlist; + RegListOp RegList; SpimmOp Spimm; RegRegOp RegReg; }; @@ -448,8 +448,8 @@ struct RISCVOperand final : public MCParsedAsmOperand { case KindTy::Fence: Fence = o.Fence; break; - case KindTy::Rlist: - Rlist = o.Rlist; + case 
KindTy::RegList: + RegList = o.RegList; break; case KindTy::Spimm: Spimm = o.Spimm; @@ -482,9 +482,9 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isMem() const override { return false; } bool isSystemRegister() const { return Kind == KindTy::SystemRegister; } bool isRegReg() const { return Kind == KindTy::RegReg; } - bool isRlist() const { return Kind == KindTy::Rlist; } - bool isRlistS0() const { - return Kind == KindTy::Rlist && Rlist.Val != RISCVZC::RA; + bool isRegList() const { return Kind == KindTy::RegList; } + bool isRegListS0() const { + return Kind == KindTy::RegList && RegList.Encoding != RISCVZC::RA; } bool isSpimm() const { return Kind == KindTy::Spimm; } @@ -1009,9 +1009,9 @@ struct RISCVOperand final : public MCParsedAsmOperand { OS << getFence(); OS << '>'; break; - case KindTy::Rlist: - OS << "'; break; case KindTy::Spimm: @@ -1098,10 +1098,10 @@ struct RISCVOperand final : public MCParsedAsmOperand { return Op; } - static std::unique_ptr createRlist(unsigned RlistEncode, + static std::unique_ptr createRegList(unsigned RlistEncode, SMLoc S) { - auto Op = std::make_unique(KindTy::Rlist); - Op->Rlist.Val = RlistEncode; + auto Op = std::make_unique(KindTy::RegList); + Op->RegList.Encoding = RlistEncode; Op->StartLoc = S; return Op; } @@ -1183,9 +1183,9 @@ struct RISCVOperand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(Imm)); } - void addRlistOperands(MCInst &Inst, unsigned N) const { + void addRegListOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(Rlist.Val)); + Inst.addOperand(MCOperand::createImm(RegList.Encoding)); } void addRegRegOperands(MCInst &Inst, unsigned N) const { @@ -2569,13 +2569,13 @@ ParseStatus RISCVAsmParser::parseRegReg(OperandVector &Operands) { ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, bool MustIncludeS0) { - // Rlist: {ra [, s0[-sN]]} - // XRlist: {x1 [, x8[-x9][, x18[-xN]]]} + // RegList: {ra [, s0[-sN]]} + // XRegList: {x1 [, x8[-x9][, x18[-xN]]]} // When MustIncludeS0 = true (not the default) (used for `qc.cm.pushfp`) which // must include `fp`/`s0` in the list: - // Rlist: {ra, s0[-sN]} - // XRlist: {x1, x8[-x9][, x18[-xN]]} + // RegList: {ra, s0[-sN]} + // XRegList: {x1, x8[-x9][, x18[-xN]]} if (getTok().isNot(AsmToken::LCurly)) return ParseStatus::NoMatch; @@ -2656,13 +2656,13 @@ ParseStatus RISCVAsmParser::parseRegListCommon(OperandVector &Operands, return Error(S, "invalid register list, {ra, s0-s10} or {x1, x8-x9, " "x18-x26} is not supported"); - auto Encode = RISCVZC::encodeRlist(RegEnd, IsRVE); + auto Encode = RISCVZC::encodeRegList(RegEnd, IsRVE); assert(Encode != RISCVZC::INVALID_RLIST); if (MustIncludeS0 && Encode == RISCVZC::RA) return Error(S, "register list must include 's0' or 'x8'"); - Operands.push_back(RISCVOperand::createRlist(Encode, S)); + Operands.push_back(RISCVOperand::createRegList(Encode, S)); return ParseStatus::Success; } @@ -2677,14 +2677,14 @@ ParseStatus RISCVAsmParser::parseZcmpStackAdj(OperandVector &Operands, int64_t StackAdjustment = getTok().getIntVal(); - auto *RListOp = static_cast(Operands.back().get()); - if (!RListOp->isRlist()) + auto *RegListOp = static_cast(Operands.back().get()); + if (!RegListOp->isRegList()) return ParseStatus::NoMatch; - unsigned RlistVal = RListOp->Rlist.Val; + unsigned RlistEncode = RegListOp->RegList.Encoding; - assert(RlistVal != RISCVZC::INVALID_RLIST); - unsigned StackAdjBase = RISCVZC::getStackAdjBase(RlistVal, isRV64()); 
+ assert(RlistEncode != RISCVZC::INVALID_RLIST); + unsigned StackAdjBase = RISCVZC::getStackAdjBase(RlistEncode, isRV64()); if (Negative != ExpectNegative || StackAdjustment % 16 != 0 || StackAdjustment < StackAdjBase || (StackAdjustment - StackAdjBase) > 48) { int64_t Lower = StackAdjBase; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index d5f08ac05f82b..7e199af98cb03 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -240,14 +240,14 @@ float RISCVLoadFPImm::getFPImm(unsigned Imm) { return bit_cast(I); } -void RISCVZC::printRlist(unsigned SlistEncode, raw_ostream &OS) { +void RISCVZC::printRegList(unsigned RlistEncode, raw_ostream &OS) { OS << "{ra"; - if (SlistEncode > RISCVZC::RA) { + if (RlistEncode > RISCVZC::RA) { OS << ", s0"; - if (SlistEncode == RISCVZC::RA_S0_S11) + if (RlistEncode == RISCVZC::RA_S0_S11) OS << "-s11"; - else if (SlistEncode > RISCVZC::RA_S0 && SlistEncode <= RISCVZC::RA_S0_S11) - OS << "-s" << (SlistEncode - RISCVZC::RA_S0); + else if (RlistEncode > RISCVZC::RA_S0 && RlistEncode <= RISCVZC::RA_S0_S11) + OS << "-s" << (RlistEncode - RISCVZC::RA_S0); } OS << "}"; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index adccd1e6c5002..f4d18dec054c1 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -609,7 +609,7 @@ enum RLISTENCODE { INVALID_RLIST, }; -inline unsigned encodeRlist(MCRegister EndReg, bool IsRVE = false) { +inline unsigned encodeRegList(MCRegister EndReg, bool IsRVE = false) { assert((!IsRVE || EndReg <= RISCV::X9) && "Invalid Rlist for RV32E"); switch (EndReg) { case RISCV::X1: @@ -641,7 +641,7 @@ inline unsigned encodeRlist(MCRegister EndReg, bool IsRVE = false) { } } -inline static unsigned encodeRlistNumRegs(unsigned NumRegs) { +inline static unsigned encodeRegListNumRegs(unsigned NumRegs) { assert(NumRegs > 0 && NumRegs < 14 && NumRegs != 12 && "Unexpected number of registers"); if (NumRegs == 13) @@ -662,7 +662,7 @@ inline static unsigned getStackAdjBase(unsigned RlistVal, bool IsRV64) { return alignTo(NumRegs * RegSize, 16); } -void printRlist(unsigned SlistEncode, raw_ostream &OS); +void printRegList(unsigned RlistEncode, raw_ostream &OS); } // namespace RISCVZC namespace RISCVVInversePseudosTable { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index cd2322cc5b26d..8a384020820ff 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -222,8 +222,8 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, // Print a Zcmp RList. If we are printing architectural register names rather // than ABI register names, we need to print "{x1, x8-x9, x18-x27}" for all // registers. Otherwise, we print "{ra, s0-s11}". 
-void RISCVInstPrinter::printRlist(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { +void RISCVInstPrinter::printRegList(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); O << "{"; printRegName(O, RISCV::X1); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h index 6d4928ee64ec9..7463088d1bebf 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h @@ -50,8 +50,8 @@ class RISCVInstPrinter : public MCInstPrinter { raw_ostream &O); void printVMaskReg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printRlist(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, - raw_ostream &O); + void printRegList(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); void printStackAdj(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O, bool Negate = false); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 7a68f2878880e..41051e46f1bb1 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -1989,7 +1989,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( if (PushedRegNum > 0) { // Use encoded number to represent registers to spill. unsigned Opcode = getPushOpcode(RVFI->getPushPopKind(*MF), hasFP(*MF)); - unsigned RegEnc = RISCVZC::encodeRlistNumRegs(PushedRegNum); + unsigned RegEnc = RISCVZC::encodeRegListNumRegs(PushedRegNum); MachineInstrBuilder PushBuilder = BuildMI(MBB, MI, DL, TII.get(Opcode)) .setMIFlag(MachineInstr::FrameSetup); @@ -2151,7 +2151,7 @@ bool RISCVFrameLowering::restoreCalleeSavedRegisters( unsigned PushedRegNum = RVFI->getRVPushRegs(); if (PushedRegNum > 0) { unsigned Opcode = getPopOpcode(RVFI->getPushPopKind(*MF)); - unsigned RegEnc = RISCVZC::encodeRlistNumRegs(PushedRegNum); + unsigned RegEnc = RISCVZC::encodeRegListNumRegs(PushedRegNum); MachineInstrBuilder PopBuilder = BuildMI(MBB, MI, DL, TII.get(Opcode)) .setMIFlag(MachineInstr::FrameDestroy); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqccmp.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqccmp.td index bee937f91f46c..228b75e33c080 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqccmp.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqccmp.td @@ -25,17 +25,17 @@ // Operand and SDNode transformation definitions. 
//===----------------------------------------------------------------------===// -def RlistS0AsmOperand : AsmOperandClass { - let Name = "RlistS0"; - let ParserMethod = "parseReglistS0"; - let RenderMethod = "addRlistOperands"; - let DiagnosticType = "InvalidRlistS0"; +def RegListS0AsmOperand : AsmOperandClass { + let Name = "RegListS0"; + let ParserMethod = "parseRegListS0"; + let RenderMethod = "addRegListOperands"; + let DiagnosticType = "InvalidRegListS0"; let DiagnosticString = "operand must be {ra, s0[-sN]} or {x1, x8[-x9][, x18[-xN]]}"; } -def rlist_s0 : RISCVOp { - let ParserMatchClass = RlistS0AsmOperand; - let PrintMethod = "printRlist"; +def reglist_s0 : RISCVOp { + let ParserMatchClass = RegListS0AsmOperand; + let PrintMethod = "printRegList"; let DecoderMethod = "decodeXqccmpRlistS0"; let EncoderMethod = "getRlistS0OpValue"; let MCOperandPredicate = [{ @@ -59,7 +59,7 @@ def rlist_s0 : RISCVOp { class RVInstXqccmpCPPPFP funct5, string opcodestr, DAGOperand immtype = stackadj> - : RVInst16<(outs), (ins rlist_s0:$rlist, immtype:$stackadj), + : RVInst16<(outs), (ins reglist_s0:$rlist, immtype:$stackadj), opcodestr, "$rlist, $stackadj", [], InstFormatOther> { bits<4> rlist; bits<16> stackadj; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td index 3f90714cdbe88..bcda5331d845f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td @@ -35,10 +35,10 @@ def uimm8ge32 : RISCVOp { let OperandType = "OPERAND_UIMM8_GE32"; } -def RlistAsmOperand : AsmOperandClass { - let Name = "Rlist"; - let ParserMethod = "parseReglist"; - let DiagnosticType = "InvalidRlist"; +def RegListAsmOperand : AsmOperandClass { + let Name = "RegList"; + let ParserMethod = "parseRegList"; + let DiagnosticType = "InvalidRegList"; let DiagnosticString = "operand must be {ra [, s0[-sN]]} or {x1 [, x8[-x9][, x18[-xN]]]}"; } @@ -58,9 +58,9 @@ def NegStackAdjAsmOperand : AsmOperandClass { let RenderMethod = "addSpimmOperands"; } -def rlist : RISCVOp { - let ParserMatchClass = RlistAsmOperand; - let PrintMethod = "printRlist"; +def reglist : RISCVOp { + let ParserMatchClass = RegListAsmOperand; + let PrintMethod = "printRegList"; let DecoderMethod = "decodeZcmpRlist"; let EncoderMethod = "getRlistOpValue"; let MCOperandPredicate = [{ @@ -155,7 +155,7 @@ class RVZcArith_r funct5, string OpcodeStr> : class RVInstZcCPPP funct5, string opcodestr, DAGOperand immtype = stackadj> - : RVInst16<(outs), (ins rlist:$rlist, immtype:$stackadj), + : RVInst16<(outs), (ins reglist:$rlist, immtype:$stackadj), opcodestr, "$rlist, $stackadj", [], InstFormatOther> { bits<4> rlist; bits<16> stackadj; From 5942f0269e62021620e871bfe2e671f4f0f2d932 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 4 Apr 2025 09:49:57 -0700 Subject: [PATCH 0666/1029] [flang] Preserve compiler directives in -E output (#133959) No longer require -fopenmp or -fopenacc with -E, unless specific version number options are also required for predefined macros. This means that most source can be preprocessed with -E and then later compiled with -fopenmp, -fopenacc, or neither. This means that OpenMP conditional compilation lines (!$) are also passed through to -E output. The tricky part of this patch was dealing with the fact that those conditional lines can also contain regular Fortran line continuation, and that now has to be deferred when !$ lines are interspersed. 
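The resulting policy can be summarized with a hedged sketch (a hypothetical helper in plain C++, not the real prescanner interface): when only preprocessing, conditional-compilation sentinels such as `!$` are kept as directives regardless of which feature flags are active, deferring their interpretation to the later compile.

    // Sketch: should a conditional-compilation line (e.g. "!$") be kept
    // as a directive? In -E (preprocess-only) mode the answer is always
    // yes, so the output compiles later with or without -fopenmp.
    bool keepConditionalLineAsDirective(bool preprocessingOnly, bool featureEnabled) {
      return preprocessingOnly || featureEnabled;
    }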
--- flang/lib/Parser/parsing.cpp | 68 +++++-- flang/lib/Parser/prescan.cpp | 175 ++++++++++-------- flang/lib/Parser/token-sequence.cpp | 1 + .../compiler-directive-continuation.f90 | 39 ++-- flang/test/Parser/OpenMP/sentinels.f | 10 +- .../continuation-in-conditional-compilation.f | 5 +- flang/test/Preprocessing/bug126459.F90 | 6 +- flang/test/Preprocessing/line-in-contin.F90 | 2 +- flang/test/Preprocessing/pp132.f90 | 4 +- .../test/Preprocessing/preprocessed-dirs.F90 | 2 +- 10 files changed, 188 insertions(+), 124 deletions(-) diff --git a/flang/lib/Parser/parsing.cpp b/flang/lib/Parser/parsing.cpp index 8fcac7b3cacb1..5f486cbf8e4c8 100644 --- a/flang/lib/Parser/parsing.cpp +++ b/flang/lib/Parser/parsing.cpp @@ -79,16 +79,24 @@ const SourceFile *Parsing::Prescan(const std::string &path, Options options) { .set_expandIncludeLines(!options.prescanAndReformat || options.expandIncludeLinesInPreprocessedOutput) .AddCompilerDirectiveSentinel("dir$"); - if (options.features.IsEnabled(LanguageFeature::OpenACC)) { + bool noneOfTheAbove{!options.features.IsEnabled(LanguageFeature::OpenACC) && + !options.features.IsEnabled(LanguageFeature::OpenMP) && + !options.features.IsEnabled(LanguageFeature::CUDA)}; + if (options.features.IsEnabled(LanguageFeature::OpenACC) || + (options.prescanAndReformat && noneOfTheAbove)) { prescanner.AddCompilerDirectiveSentinel("$acc"); } - if (options.features.IsEnabled(LanguageFeature::OpenMP)) { + if (options.features.IsEnabled(LanguageFeature::OpenMP) || + (options.prescanAndReformat && noneOfTheAbove)) { prescanner.AddCompilerDirectiveSentinel("$omp"); prescanner.AddCompilerDirectiveSentinel("$"); // OMP conditional line } - if (options.features.IsEnabled(LanguageFeature::CUDA)) { + if (options.features.IsEnabled(LanguageFeature::CUDA) || + (options.prescanAndReformat && noneOfTheAbove)) { prescanner.AddCompilerDirectiveSentinel("$cuf"); prescanner.AddCompilerDirectiveSentinel("@cuf"); + } + if (options.features.IsEnabled(LanguageFeature::CUDA)) { preprocessor_.Define("_CUDA", "1"); } ProvenanceRange range{allSources.AddIncludedFile( @@ -119,11 +127,13 @@ void Parsing::EmitPreprocessedSource( int sourceLine{0}; int column{1}; bool inDirective{false}; + bool ompConditionalLine{false}; bool inContinuation{false}; bool lineWasBlankBefore{true}; const AllSources &allSources{allCooked().allSources()}; - // All directives that flang support are known to have a length of 3 chars - constexpr int directiveNameLength{3}; + // All directives that flang supports are known to have a length of 4 chars, + // except for OpenMP conditional compilation lines (!$). + constexpr int directiveNameLength{4}; // We need to know the current directive in order to provide correct // continuation for the directive std::string directive; @@ -133,6 +143,7 @@ void Parsing::EmitPreprocessedSource( out << '\n'; // TODO: DOS CR-LF line ending if necessary column = 1; inDirective = false; + ompConditionalLine = false; inContinuation = false; lineWasBlankBefore = true; ++sourceLine; @@ -153,16 +164,21 @@ void Parsing::EmitPreprocessedSource( return ch; }}; + bool inDirectiveSentinel{false}; if (ch == '!' && lineWasBlankBefore) { // Other comment markers (C, *, D) in original fixed form source // input card column 1 will have been deleted or normalized to !, // which signifies a comment (directive) in both source forms. 
inDirective = true; - } - bool inDirectiveSentinel{ - inDirective && directive.size() < directiveNameLength}; - if (inDirectiveSentinel && IsLetter(ch)) { - directive += getOriginalChar(ch); + inDirectiveSentinel = true; + } else if (inDirective && !ompConditionalLine && + directive.size() < directiveNameLength) { + if (IsLetter(ch) || ch == '$' || ch == '@') { + directive += getOriginalChar(ch); + inDirectiveSentinel = true; + } else if (directive == "$"s) { + ompConditionalLine = true; + } } std::optional position{provenance @@ -199,9 +215,16 @@ void Parsing::EmitPreprocessedSource( // column limit override option. // OpenMP and OpenACC directives' continuations should have the // corresponding sentinel at the next line. - const auto continuation{ - inDirective ? "&\n!$" + directive + "&" : "&\n &"s}; - out << continuation; + out << "&\n"; + if (inDirective) { + if (ompConditionalLine) { + out << "!$ &"; + } else { + out << '!' << directive << '&'; + } + } else { + out << " &"; + } column = 7; // start of fixed form source field ++sourceLine; inContinuation = true; @@ -212,11 +235,20 @@ void Parsing::EmitPreprocessedSource( out << ' '; } } - if (!inContinuation && !inDirectiveSentinel && position && - position->column <= 72 && ch != ' ') { - // Preserve original indentation - for (; column < position->column; ++column) { - out << ' '; + if (ch != ' ') { + if (ompConditionalLine) { + // Only digits can stay in the label field + if (!(ch >= '0' && ch <= '9')) { + for (; column < 7; ++column) { + out << ' '; + } + } + } else if (!inContinuation && !inDirectiveSentinel && position && + position->column <= 72) { + // Preserve original indentation + for (; column < position->column; ++column) { + out << ' '; + } } } out << getOriginalChar(ch); diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 755cb18cb8caf..31fdadeddef53 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -150,16 +150,18 @@ void Prescanner::Statement() { CHECK(*at_ == '!'); } std::optional condOffset; + bool isFFOpenMPCondCompilation{false}; if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') { // OpenMP conditional compilation line. condOffset = 2; + isFFOpenMPCondCompilation = inFixedForm_; } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' && directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' && directiveSentinel_[4] == '\0') { // CUDA conditional compilation line. condOffset = 5; } - if (condOffset) { + if (condOffset && !preprocessingOnly_) { at_ += *condOffset, column_ += *condOffset; if (auto payload{IsIncludeLine(at_)}) { FortranInclude(at_ + *payload); @@ -171,6 +173,8 @@ void Prescanner::Statement() { } } else { // Compiler directive. Emit normalized sentinel, squash following spaces. + // Conditional compilation lines (!$) take this path in -E mode too + // so that -fopenmp only has to appear on the later compilation. 
EmitChar(tokens, '!'); ++at_, ++column_; for (const char *sp{directiveSentinel_}; *sp != '\0'; @@ -178,10 +182,23 @@ void Prescanner::Statement() { EmitChar(tokens, *sp); } if (IsSpaceOrTab(at_)) { - EmitChar(tokens, ' '); while (int n{IsSpaceOrTab(at_)}) { + if (isFFOpenMPCondCompilation) { + EmitChar(tokens, ' '); + } at_ += n, ++column_; } + if (isFFOpenMPCondCompilation && column_ == 6) { + if (*at_ == '0') { + EmitChar(tokens, ' '); + } else { + tokens.CloseToken(); + EmitChar(tokens, '&'); + } + ++at_, ++column_; + } else { + EmitChar(tokens, ' '); + } } tokens.CloseToken(); } @@ -330,7 +347,7 @@ void Prescanner::Statement() { void Prescanner::CheckAndEmitLine( TokenSequence &tokens, Provenance newlineProvenance) { tokens.CheckBadFortranCharacters( - messages_, *this, disableSourceContinuation_); + messages_, *this, disableSourceContinuation_ || preprocessingOnly_); // Parenthesis nesting check does not apply while any #include is // active, nor on the lines before and after a top-level #include, // nor before or after conditional source. @@ -1260,10 +1277,12 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { } tabInCurrentLine_ = false; char col1{*nextLine_}; - if (IsFixedFormCommentChar(col1)) { - int j{1}; - if (InCompilerDirective()) { - // Must be a continued compiler directive. + if (InCompilerDirective()) { + if (preprocessingOnly_ && directiveSentinel_[0] == '$' && + directiveSentinel_[1] == '\0') { + // in -E mode, don't treat "!$ &" as a continuation + } else if (IsFixedFormCommentChar(col1)) { + int j{1}; for (; j < 5; ++j) { char ch{directiveSentinel_[j - 1]}; if (ch == '\0') { @@ -1273,28 +1292,19 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { return nullptr; } } - } else if (features_.IsEnabled(LanguageFeature::OpenMP)) { - // Fixed Source Form Conditional Compilation Sentinels. - if (nextLine_[1] != '$') { - return nullptr; - } - j++; - } else { - return nullptr; - } - for (; j < 5; ++j) { - if (nextLine_[j] != ' ') { - return nullptr; + for (; j < 5; ++j) { + if (nextLine_[j] != ' ') { + return nullptr; + } } - } - const char *col6{nextLine_ + 5}; - if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) { - if (mightNeedSpace && !IsSpace(nextLine_ + 6)) { - insertASpace_ = true; + const char *col6{nextLine_ + 5}; + if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) { + if (mightNeedSpace && !IsSpace(nextLine_ + 6)) { + insertASpace_ = true; + } + return nextLine_ + 6; } - return nextLine_ + 6; } - return nullptr; } else { // Normal case: not in a compiler directive. 
if (col1 == '&' && @@ -1334,59 +1344,60 @@ const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) { } const char *Prescanner::FreeFormContinuationLine(bool ampersand) { - const char *p{nextLine_}; + const char *lineStart{nextLine_}; + const char *p{lineStart}; if (p >= limit_) { return nullptr; } p = SkipWhiteSpaceIncludingEmptyMacros(p); - if (*p == '!') { - ++p; - if (InCompilerDirective()) { + if (InCompilerDirective()) { + if (preprocessingOnly_ && directiveSentinel_[0] == '$' && + directiveSentinel_[1] == '\0') { + // in -E mode, don't treat !$ as a continuation + } else if (*p++ == '!') { for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) { if (*s != ToLowerCaseLetter(*p)) { - return nullptr; + return nullptr; // not the same directive class } } - } else if (features_.IsEnabled(LanguageFeature::OpenMP) && *p == '$') { - ++p; - } else { - return nullptr; - } - p = SkipWhiteSpace(p); - if (*p == '&') { - if (!ampersand) { - insertASpace_ = true; + p = SkipWhiteSpace(p); + if (*p == '&') { + if (!ampersand) { + insertASpace_ = true; + } + return p + 1; + } else if (ampersand) { + return p; } - return p + 1; - } else if (ampersand) { - return p; - } else { - return nullptr; } - } else { - if (*p == '&') { - return p + 1; - } else if (*p == '!' || *p == '\n' || *p == '#') { - return nullptr; - } else if (ampersand || IsImplicitContinuation()) { - if (continuationInCharLiteral_) { - // 'a'& -> 'a''b' == "a'b" - // 'b' - if (features_.ShouldWarn( - common::LanguageFeature::MiscSourceExtensions)) { - Say(common::LanguageFeature::MiscSourceExtensions, - GetProvenanceRange(p, p + 1), - "Character literal continuation line should have been preceded by '&'"_port_en_US); - } - } else if (p > nextLine_) { - --p; - } else { - insertASpace_ = true; + return nullptr; + } + if (p[0] == '!' && p[1] == '$' && !preprocessingOnly_ && + features_.IsEnabled(LanguageFeature::OpenMP)) { + // !$ conditional line can be a continuation + p = lineStart = SkipWhiteSpace(p + 2); + } + if (*p == '&') { + return p + 1; + } else if (*p == '!' || *p == '\n' || *p == '#') { + return nullptr; + } else if (ampersand || IsImplicitContinuation()) { + if (continuationInCharLiteral_) { + // 'a'& -> 'a''b' == "a'b" + // 'b' + if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) { + Say(common::LanguageFeature::MiscSourceExtensions, + GetProvenanceRange(p, p + 1), + "Character literal continuation line should have been preceded by '&'"_port_en_US); } - return p; + } else if (p > lineStart) { + --p; } else { - return nullptr; + insertASpace_ = true; } + return p; + } else { + return nullptr; } } @@ -1419,6 +1430,8 @@ bool Prescanner::FreeFormContinuation() { } else if (*p == '!') { // & ! comment - ok } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) { return false; // allow & at end of a macro argument + } else if (ampersand && preprocessingOnly_ && !parenthesisNesting_) { + return false; // allow & at start of line, maybe after !$ } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) { Say(LanguageFeature::CruftAfterAmpersand, GetProvenance(p), "missing ! 
before comment after &"_warn_en_US); @@ -1481,35 +1494,37 @@ Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const { } *sp++ = ToLowerCaseLetter(*p); } + if (sp == sentinel) { + return std::nullopt; + } + *sp = '\0'; // A fixed form OpenMP conditional compilation sentinel must satisfy the // following criteria, for initial lines: // - Columns 3 through 5 must have only white space or numbers. // - Column 6 must be space or zero. - if (column == 3 && sentinel[0] == '$') { - const char *q{p}; - for (int col{3}; col < 6; ++col, ++q) { - if (!IsSpaceOrTab(q) && !IsDecimalDigit(*q)) { + bool isOpenMPConditional{sp == &sentinel[1] && sentinel[0] == '$'}; + bool hadDigit{false}; + if (isOpenMPConditional) { + for (; column < 6; ++column, ++p) { + if (IsDecimalDigit(*p)) { + hadDigit = true; + } else if (!IsSpaceOrTab(p)) { return std::nullopt; } } - if (*q != ' ' && *q != '0') { - return std::nullopt; - } } if (column == 6) { if (*p == '0') { ++p; } else if (int n{IsSpaceOrTab(p)}) { p += n; + } else if (isOpenMPConditional && preprocessingOnly_ && !hadDigit) { + // In -E mode, "!$ &" is treated as a directive } else { // This is a Continuation line, not an initial directive line. return std::nullopt; } } - if (sp == sentinel) { - return std::nullopt; - } - *sp = '\0'; if (const char *ss{IsCompilerDirectiveSentinel( sentinel, static_cast(sp - sentinel))}) { return { @@ -1575,7 +1590,8 @@ std::optional> Prescanner::IsCompilerDirectiveSentinel(const char *p) const { char sentinel[8]; for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) { - if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) { + if (int n{IsSpaceOrTab(p)}; + n || !(IsLetter(*p) || *p == '$' || *p == '@')) { if (j > 0) { sentinel[j] = '\0'; p = SkipWhiteSpaceIncludingEmptyMacros(p + n); @@ -1668,7 +1684,8 @@ void Prescanner::SourceFormChange(std::string &&dir) { bool Prescanner::CompilerDirectiveContinuation( TokenSequence &tokens, const char *origSentinel) { if (inFixedForm_ || tokens.empty() || - tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") { + tokens.TokenAt(tokens.SizeInTokens() - 1) != "&" || + (preprocessingOnly_ && !parenthesisNesting_)) { return false; } LineClassification followingLine{ClassifyLine(nextLine_)}; diff --git a/flang/lib/Parser/token-sequence.cpp b/flang/lib/Parser/token-sequence.cpp index cdbe89b1eb441..fb1ea5965f338 100644 --- a/flang/lib/Parser/token-sequence.cpp +++ b/flang/lib/Parser/token-sequence.cpp @@ -318,6 +318,7 @@ llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const { o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString() << "'\n"; } + provenances_.Dump(o << "provenances_:\n"); return o; } diff --git a/flang/test/Parser/OpenMP/compiler-directive-continuation.f90 b/flang/test/Parser/OpenMP/compiler-directive-continuation.f90 index 87e4a72c54294..7ace109272302 100644 --- a/flang/test/Parser/OpenMP/compiler-directive-continuation.f90 +++ b/flang/test/Parser/OpenMP/compiler-directive-continuation.f90 @@ -1,12 +1,18 @@ -! RUN: %flang_fc1 -fopenmp -E %s 2>&1 | FileCheck %s --check-prefix=CHECK-OMP -! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s --strict-whitespace --check-prefix=CHECK-E +! RUN: %flang_fc1 -fopenmp -fdebug-unparse %s 2>&1 | FileCheck %s --check-prefix=CHECK-OMP +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-OMP - -! Test in mixed way, i.e., combination of Fortran free source form +! Test in mixed way, i.e., combination of Fortran free source form ! 
and free source form with conditional compilation sentinel. ! CHECK-LABEL: subroutine mixed_form1() -! CHECK-OMP: i = 1 +100+ 1000+ 10 + 1 +1000000000 + 1000000 -! CHECK: i = 1 + 10 + 10000 + 1000000 +! CHECK-E:{{^}} i = 1 & +! CHECK-E:{{^}}!$ +100& +! CHECK-E:{{^}}!$ &+ 1000& +! CHECK-E:{{^}} &+ 10 + 1& +! CHECK-E:{{^}}!$ & +100000& +! CHECK-E:{{^}} &0000 + 1000000 +! CHECK-OMP: i=1001001112_4 +! CHECK-NO-OMP: i=1010011_4 subroutine mixed_form1() i = 1 & !$+100& @@ -14,13 +20,13 @@ subroutine mixed_form1() &+ 10 + 1& !$& +100000& &0000 + 1000000 -end subroutine - +end subroutine ! Testing continuation lines in only Fortran Free form Source ! CHECK-LABEL: subroutine mixed_form2() -! CHECK-OMP: i = 1 +10 +100 + 1000 + 10000 -! CHECK: i = 1 +10 +100 + 1000 + 10000 +! CHECK-E:{{^}} i = 1 +10 +100 + 1000 + 10000 +! CHECK-OMP: i=11111_4 +! CHECK-NO-OMP: i=11111_4 subroutine mixed_form2() i = 1 & +10 & @@ -29,16 +35,21 @@ subroutine mixed_form2() + 10000 end subroutine - ! Testing continuation line in only free source form conditional compilation sentinel. ! CHECK-LABEL: subroutine mixed_form3() -! CHECK-OMP: i=0 -! CHECK-OMP: i = 1 +10 +100+1000 +! CHECK-E:{{^}}!$ i=0 +! CHECK-E:{{^}}!$ i = 1 & +! CHECK-E:{{^}}!$ & +10 & +! CHECK-E:{{^}}!$ &+100& +! CHECK-E:{{^}}!$ +1000 +! CHECK-OMP: i=0_4 +! CHECK-OMP: i=1111_4 +! CHECK-NO-OMP-NOT: i=0_4 subroutine mixed_form3() !$ i=0 !$ i = 1 & !$ & +10 & !$&+100& - !$ +1000 + !$ +1000 end subroutine diff --git a/flang/test/Parser/OpenMP/sentinels.f b/flang/test/Parser/OpenMP/sentinels.f index f41ff13bcdd34..299b83e2abba8 100644 --- a/flang/test/Parser/OpenMP/sentinels.f +++ b/flang/test/Parser/OpenMP/sentinels.f @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -fopenmp -E %s | FileCheck %s +! RUN: %flang_fc1 -E %s | FileCheck %s ! CHECK: program main ! CHECK: interface ! CHECK: subroutine sub(a, b) @@ -60,13 +60,13 @@ subroutine sub(a, b) c$ +& , "comment" ! Test valid chars in initial and continuation lines. -! CHECK: "msg2" -! CHECK-SAME: "msg3" +! CHECK: !$ 20 PRINT *, "msg2" +! CHECK: !$ & , "msg3" c$ 20 PRINT *, "msg2" c$ & , "msg3" -! CHECK: "msg4" -! CHECK-SAME: "msg5" +! CHECK: !$ PRINT *, "msg4", +! CHECK: !$ & "msg5" c$ 0PRINT *, "msg4", c$ + "msg5" end diff --git a/flang/test/Parser/continuation-in-conditional-compilation.f b/flang/test/Parser/continuation-in-conditional-compilation.f index 987112301e335..35525b4fda582 100644 --- a/flang/test/Parser/continuation-in-conditional-compilation.f +++ b/flang/test/Parser/continuation-in-conditional-compilation.f @@ -1,6 +1,7 @@ -! RUN: %flang_fc1 -fopenmp -fopenacc -E %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s program main -! CHECK: k01=1+ 1 +! CHECK: k01=1+ +! CHECK: !$ & 1 k01=1+ !$ & 1 diff --git a/flang/test/Preprocessing/bug126459.F90 b/flang/test/Preprocessing/bug126459.F90 index fae8a07659f72..b0aa58630adc3 100644 --- a/flang/test/Preprocessing/bug126459.F90 +++ b/flang/test/Preprocessing/bug126459.F90 @@ -1,5 +1,5 @@ -! RUN: %flang -E -fopenmp %s 2>&1 | FileCheck %s -!CHECK: NDIR=0 +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s 2>&1 | FileCheck %s +!CHECK: ndir=0 #define BLANKMACRO -BLANKMACRO !$ NDIR=0 +BLANKMACRO !$ ndir=0 end diff --git a/flang/test/Preprocessing/line-in-contin.F90 b/flang/test/Preprocessing/line-in-contin.F90 index 28efbd02d3ae8..4dc65d5614b30 100644 --- a/flang/test/Preprocessing/line-in-contin.F90 +++ b/flang/test/Preprocessing/line-in-contin.F90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -fopenmp -E %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s ! 
CHECK: call foo(0.)
 ! CHECK: call foo(1.)
 ! CHECK: call foo(2.)
diff --git a/flang/test/Preprocessing/pp132.f90 b/flang/test/Preprocessing/pp132.f90
index 76ffa3e21c137..a0fdf46e7066f 100644
--- a/flang/test/Preprocessing/pp132.f90
+++ b/flang/test/Preprocessing/pp132.f90
@@ -1,8 +1,10 @@
-! RUN: %flang -E -fopenmp -fopenacc %s 2>&1 | FileCheck --strict-whitespace %s
+! RUN: %flang -E %s 2>&1 | FileCheck --strict-whitespace %s
 ! CHECK: {{^}}!$OMP parallel default(shared) private(super_very_long_name_for_the_va&
 ! CHECK-NEXT: {{^}}!$OMP&riable)
+! CHECK: {{^}}!$omp end parallel
 ! CHECK: {{^}}!$acc data copyin(super_very_long_name_for_the_variable, another_super&
 ! CHECK-NEXT: {{^}}!$acc&_wordy_variable_to_test)
+! CHECK: {{^}}!$acc end data
 ! CHECK: {{^}}!$OMP something something
 ! Test correct continuations in compiler directives and left-alignment of sentinels
 subroutine foo
diff --git a/flang/test/Preprocessing/preprocessed-dirs.F90 b/flang/test/Preprocessing/preprocessed-dirs.F90
index 26253b62ff22c..f4a5731506062 100644
--- a/flang/test/Preprocessing/preprocessed-dirs.F90
+++ b/flang/test/Preprocessing/preprocessed-dirs.F90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -E -fopenacc %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -E %s 2>&1 | FileCheck %s
 !CHECK: subroutine r4(x) Z real :: x Z !$acc routine Z print *, x Z end
 #define SUB(s, t) subroutine s(x) Z\
 t :: x Z\

From 232525f06942adb3b9977632e38dcd5f08c0642d Mon Sep 17 00:00:00 2001
From: Felipe de Azevedo Piovezan
Date: Fri, 4 Apr 2025 10:05:44 -0700
Subject: [PATCH 0667/1029] [lldb] Clear thread-creation breakpoints in
 ProcessGDBRemote::Clear (#134397)

Currently, these breakpoints are being accumulated every time a new
process is created (e.g. through a `run`). Depending on the
circumstances, the old breakpoints are even left enabled, interfering
with subsequent processes. This is addressed by removing the
breakpoints in ProcessGDBRemote::Clear.

Note that these breakpoints are more of a PlatformDarwin thing, so in
the future we should look into moving them there.
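For reference, a minimal sketch of the shape of the fix (it mirrors the
hunk below; m_thread_create_bp_sp and m_target_wp are the members named
in the diff, and the enclosing class context is elided):

  void ProcessGDBRemote::Clear() {
    m_thread_list_real.Clear();
    m_thread_list.Clear();
    // Drop the thread-creation breakpoint so a relaunched process does
    // not inherit a stale (and possibly still enabled) breakpoint.
    if (m_thread_create_bp_sp)
      if (TargetSP target_sp = m_target_wp.lock())
        target_sp->RemoveBreakpointByID(m_thread_create_bp_sp->GetID());
  }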
--- .../Process/gdb-remote/ProcessGDBRemote.cpp | 3 +++ .../TestBreakpointsThreadInit.py | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 68360788c96e6..d7e8c2ce7944e 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -3410,6 +3410,9 @@ Status ProcessGDBRemote::DisableWatchpoint(WatchpointSP wp_sp, bool notify) { void ProcessGDBRemote::Clear() { m_thread_list_real.Clear(); m_thread_list.Clear(); + if (m_thread_create_bp_sp) + if (TargetSP target_sp = m_target_wp.lock()) + target_sp->RemoveBreakpointByID(m_thread_create_bp_sp->GetID()); } Status ProcessGDBRemote::DoSignal(int signo) { diff --git a/lldb/test/API/macosx/thread_start_bps/TestBreakpointsThreadInit.py b/lldb/test/API/macosx/thread_start_bps/TestBreakpointsThreadInit.py index 1c6fd4f91c73e..bf667f6f7d336 100644 --- a/lldb/test/API/macosx/thread_start_bps/TestBreakpointsThreadInit.py +++ b/lldb/test/API/macosx/thread_start_bps/TestBreakpointsThreadInit.py @@ -35,3 +35,23 @@ def test_internal_bps_resolved(self): for bp in bps: num_resolved += bp.GetNumResolvedLocations() self.assertGreater(num_resolved, 0) + + @skipUnlessDarwin + def test_internal_bps_deleted_on_relaunch(self): + self.build() + + source_file = lldb.SBFileSpec("main.c") + target, process, thread, bkpt = lldbutil.run_to_source_breakpoint( + self, "initial hello", source_file + ) + + self.runCmd("break list --internal") + output = self.res.GetOutput() + self.assertEqual(output.count("thread-creation"), 1) + + process.Kill() + self.runCmd("run", RUN_SUCCEEDED) + + self.runCmd("break list --internal") + output = self.res.GetOutput() + self.assertEqual(output.count("thread-creation"), 1) From 90cf2e31abdee050b5811155c86605935046b07e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 4 Apr 2025 10:09:39 -0700 Subject: [PATCH 0668/1029] Revert "[SLP]Initial support for (masked)loads + compress and (masked)interleaved" This reverts commit daab7d08078bb7cd37c66b78a56f4773e6b12fba to fix a crash reported in https://github.com/llvm/llvm-project/issues/134411. 
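For context, the reverted feature replaced scattered scalar loads with a
single wide (masked) load followed by a compressing shuffle, falling back
to an interleaved load when the gaps were strided. Below is a standalone
sketch of the mask construction (illustrative only, not the LLVM API
code; buildCompressMaskSketch and the plain-int offset representation are
assumptions of this sketch):

  #include <cstddef>
  #include <optional>
  #include <vector>

  // Positions[i] is load i's element distance from the first pointer
  // (so Positions[0] == 0). Lane i of the compressed result reads element
  // Positions[i] of the wide load; if the positions form a constant
  // stride, the group can instead be lowered as an interleaved load.
  std::optional<int>
  buildCompressMaskSketch(const std::vector<int> &Positions,
                          std::vector<int> &CompressMask) {
    CompressMask = Positions; // shuffle mask over the wide load
    std::optional<int> Stride;
    for (std::size_t I = 1; I < Positions.size(); ++I) {
      if (I == 1)
        Stride = Positions[1];
      else if (Stride && Positions[I] != *Stride * static_cast<int>(I))
        Stride.reset();
    }
    return Stride; // engaged value means strided (interleavable) access
  }

For example, offsets {0, 4, 8, 12} give stride 4 and mask {0, 4, 8, 12},
matching the <i32 0, i32 4, i32 8, i32 12> compressing shuffles that the
test updates below remove along with the feature.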
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 358 ++---------------- .../X86/entries-shuffled-diff-sizes.ll | 17 +- .../X86/gep-nodes-with-non-gep-inst.ll | 22 +- .../Transforms/SLPVectorizer/X86/pr47623.ll | 16 +- .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 162 +++++--- .../Transforms/SLPVectorizer/X86/pr47629.ll | 162 +++++--- .../X86/remark_gather-load-redux-cost.ll | 11 +- .../X86/reorder-possible-strided-node.ll | 52 ++- .../X86/reorder-reused-masked-gather.ll | 12 +- .../X86/reorder-reused-masked-gather2.ll | 11 +- .../X86/scatter-vectorize-reused-pointer.ll | 12 +- .../Transforms/SLPVectorizer/X86/sin-sqrt.ll | 8 +- .../SLPVectorizer/X86/split-load8_2-unord.ll | 11 +- .../X86/split-load8_2_unord_geps.ll | 11 +- 14 files changed, 348 insertions(+), 517 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f799c46ab2875..5a4715e083969 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -38,7 +38,6 @@ #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVDescriptors.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryLocation.h" @@ -1381,8 +1380,7 @@ class BoUpSLP { Gather, Vectorize, ScatterVectorize, - StridedVectorize, - CompressVectorize + StridedVectorize }; using ValueList = SmallVector; @@ -3380,7 +3378,6 @@ class BoUpSLP { Vectorize, ///< The node is regularly vectorized. ScatterVectorize, ///< Masked scatter/gather node. StridedVectorize, ///< Strided loads (and stores) - CompressVectorize, ///< (Masked) load with compress. NeedToGather, ///< Gather/buildvector node. CombinedVectorize, ///< Vectorized node, combined with its user into more ///< complex node like select/cmp to minmax, mul/add to @@ -3607,9 +3604,6 @@ class BoUpSLP { case StridedVectorize: dbgs() << "StridedVectorize\n"; break; - case CompressVectorize: - dbgs() << "CompressVectorize\n"; - break; case NeedToGather: dbgs() << "NeedToGather\n"; break; @@ -4825,8 +4819,7 @@ template <> struct DOTGraphTraits : public DefaultDOTGraphTraits { if (Entry->isGather()) return "color=red"; if (Entry->State == TreeEntry::ScatterVectorize || - Entry->State == TreeEntry::StridedVectorize || - Entry->State == TreeEntry::CompressVectorize) + Entry->State == TreeEntry::StridedVectorize) return "color=blue"; return ""; } @@ -5426,157 +5419,6 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec, return Builder.CreateShuffleVector(Vec, Mask); } -/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered -/// with \p Order. -/// \return true if the mask represents strided access, false - otherwise. -static bool buildCompressMask(ArrayRef PointerOps, - ArrayRef Order, Type *ScalarTy, - const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl &CompressMask) { - const unsigned Sz = PointerOps.size(); - CompressMask.assign(Sz, PoisonMaskElem); - // The first element always set. - CompressMask[0] = 0; - // Check if the mask represents strided access. - std::optional Stride = 0; - Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()]; - for (unsigned I : seq(1, Sz)) { - Value *Ptr = Order.empty() ? 
PointerOps[I] : PointerOps[Order[I]]; - unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); - CompressMask[I] = Pos; - if (!Stride) - continue; - if (*Stride == 0) { - *Stride = Pos; - continue; - } - if (Pos != *Stride * I) - Stride.reset(); - } - return Stride.has_value(); -} - -/// Checks if the \p VL can be transformed to a (masked)load + compress or -/// (masked) interleaved load. -static bool isMaskedLoadCompress( - ArrayRef VL, ArrayRef PointerOps, - ArrayRef Order, const TargetTransformInfo &TTI, - const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, - const DominatorTree &DT, const TargetLibraryInfo &TLI, - const function_ref AreAllUsersVectorized, bool &IsMasked, - unsigned &InterleaveFactor, SmallVectorImpl &CompressMask, - VectorType *&LoadVecTy) { - InterleaveFactor = 0; - Type *ScalarTy = VL.front()->getType(); - const unsigned Sz = VL.size(); - auto *VecTy = getWidenedType(ScalarTy, Sz); - constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // Check external uses. - for (const auto [I, V] : enumerate(VL)) { - if (AreAllUsersVectorized(V)) - continue; - InstructionCost ExtractCost = - TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I); - InstructionCost ScalarCost = - TTI.getInstructionCost(cast(V), CostKind); - if (ExtractCost <= ScalarCost) - return false; - } - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; - } - std::optional Diff = - getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE); - if (!Diff) - return false; - const unsigned MaxRegSize = - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue(); - // Check for very large distances between elements. - if (*Diff / Sz >= MaxRegSize / 8) - return false; - Align CommonAlignment = computeCommonAlignment(VL); - LoadVecTy = getWidenedType(ScalarTy, *Diff + 1); - auto *LI = cast(Order.empty() ? VL.front() : VL[Order.front()]); - IsMasked = !isSafeToLoadUnconditionally( - Ptr0, LoadVecTy, CommonAlignment, DL, - cast(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT, - &TLI); - if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace())) - return false; - // TODO: perform the analysis of each scalar load for better - // safe-load-unconditionally analysis. - bool IsStrided = - buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask); - assert(CompressMask.size() >= 2 && "At least two elements are required"); - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, PointerOps, PointerOps.front(), - Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy); - // The cost of scalar loads. 
- InstructionCost ScalarLoadsCost = - std::accumulate(VL.begin(), VL.end(), InstructionCost(), - [&](InstructionCost C, Value *V) { - return C + TTI.getInstructionCost(cast(V), - CostKind); - }) + - ScalarGEPCost; - APInt DemandedElts = APInt::getAllOnes(Sz); - InstructionCost GatherCost = - getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts, - /*Insert=*/true, - /*Extract=*/false, CostKind) + - ScalarLoadsCost; - InstructionCost LoadCost = 0; - if (IsMasked) { - LoadCost = - TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); - } else { - CommonAlignment = LI->getAlign(); - LoadCost = - TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); - } - SmallVector Mask; - if (!Order.empty()) - inversePermutation(Order, Mask); - if (IsStrided) { - // Check for potential segmented(interleaved) loads. - if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1], - CommonAlignment, - LI->getPointerAddressSpace())) { - InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost( - Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt, - CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked); - if (!Mask.empty()) - InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, - VecTy, Mask, CostKind); - if (InterleavedCost < GatherCost) { - InterleaveFactor = CompressMask[1]; - return true; - } - } - } - if (!Order.empty()) { - SmallVector NewMask(Sz, PoisonMaskElem); - for (unsigned I : seq(Sz)) { - NewMask[I] = CompressMask[Mask[I]]; - } - CompressMask.swap(NewMask); - } - InstructionCost CompressCost = ::getShuffleCost( - TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind); - InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost; - return TotalVecCost < GatherCost; -} - BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, @@ -5648,6 +5490,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, // Check that the sorted loads are consecutive. if (static_cast(*Diff) == Sz - 1) return LoadsState::Vectorize; + if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || + TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) + return LoadsState::Gather; // Simple check if not a strided access - clear order. bool IsPossibleStrided = *Diff % (Sz - 1) == 0; // Try to generate strided load node if: @@ -5703,22 +5548,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, } } } - bool IsMasked; - unsigned InterleaveFactor; - SmallVector CompressMask; - VectorType *LoadVecTy; - if (isMaskedLoadCompress( - VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI, - [&](Value *V) { - return areAllUsersVectorized(cast(V), - UserIgnoreList); - }, - IsMasked, InterleaveFactor, CompressMask, LoadVecTy)) - return LoadsState::CompressVectorize; } - if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || - TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return LoadsState::Gather; // Correctly identify compare the cost of loads + shuffles rather than // strided/masked gather loads. Returns true if vectorized + shuffles // representation is better than just gather. @@ -5811,8 +5641,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, } // If need the reorder - consider as high-cost masked gather for now. 
if ((LS == LoadsState::Vectorize || - LS == LoadsState::StridedVectorize || - LS == LoadsState::CompressVectorize) && + LS == LoadsState::StridedVectorize) && !Order.empty() && !isReverseOrder(Order)) LS = LoadsState::ScatterVectorize; States.push_back(LS); @@ -5877,14 +5706,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, CommonAlignment, CostKind) + VectorGEPCost; break; - case LoadsState::CompressVectorize: - VecLdCost += TTI.getMaskedMemoryOpCost( - Instruction::Load, SubVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind) + - VectorGEPCost + - ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy, - {}, CostKind); - break; case LoadsState::ScatterVectorize: VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, LI0->getPointerOperand(), @@ -6258,8 +6079,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, return std::nullopt; if (TE.State == TreeEntry::SplitVectorize || ((TE.State == TreeEntry::Vectorize || - TE.State == TreeEntry::StridedVectorize || - TE.State == TreeEntry::CompressVectorize) && + TE.State == TreeEntry::StridedVectorize) && (isa(TE.getMainOp()) || (TopToBottom && isa(TE.getMainOp()))))) { assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) && @@ -6446,8 +6266,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, OrdersType CurrentOrder; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), CurrentOrder, PointerOps); - if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize || - Res == LoadsState::CompressVectorize) + if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize) return std::move(CurrentOrder); } // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars @@ -6687,8 +6506,7 @@ void BoUpSLP::reorderTopToBottom() { VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::SplitVectorize || - TE->State == TreeEntry::CompressVectorize) || + TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.try_emplace(TE.get(), *CurrentOrder); if (TE->State == TreeEntry::Vectorize && @@ -6862,8 +6680,7 @@ void BoUpSLP::reorderTopToBottom() { if ((TE->State == TreeEntry::SplitVectorize && TE->ReuseShuffleIndices.empty()) || ((TE->State == TreeEntry::Vectorize || - TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::CompressVectorize) && + TE->State == TreeEntry::StridedVectorize) && (isa(TE->getMainOp()) || (SLPReVec && isa(TE->getMainOp()))))) { @@ -6911,7 +6728,6 @@ bool BoUpSLP::canReorderOperands( return OpData.first == I && (OpData.second->State == TreeEntry::Vectorize || OpData.second->State == TreeEntry::StridedVectorize || - OpData.second->State == TreeEntry::CompressVectorize || OpData.second->State == TreeEntry::SplitVectorize); })) continue; @@ -6926,7 +6742,6 @@ bool BoUpSLP::canReorderOperands( // node, just reorder reuses mask. 
if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize && TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) GatherOps.push_back(TE); @@ -6937,7 +6752,6 @@ bool BoUpSLP::canReorderOperands( [&Gather, UserTE, I](TreeEntry *TE) { assert(TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize && "Only non-vectorized nodes are expected."); if (TE->UserTreeIndex.UserTE == UserTE && @@ -6974,7 +6788,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { for (const std::unique_ptr &TE : VectorizableTree) { if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize) NonVectorized.push_back(TE.get()); if (std::optional CurrentOrder = @@ -6982,7 +6795,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Queue.push(TE.get()); if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::CompressVectorize || TE->State == TreeEntry::SplitVectorize) || !TE->ReuseShuffleIndices.empty()) GathersToOrders.insert(TE.get()); @@ -7011,7 +6823,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { for (TreeEntry *TE : OrderedOps) { if (!(TE->State == TreeEntry::Vectorize || TE->State == TreeEntry::StridedVectorize || - TE->State == TreeEntry::CompressVectorize || TE->State == TreeEntry::SplitVectorize || (TE->isGather() && GathersToOrders.contains(TE))) || !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() || @@ -7306,7 +7117,6 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { // Gathers are processed separately. if (TE->State != TreeEntry::Vectorize && TE->State != TreeEntry::StridedVectorize && - TE->State != TreeEntry::CompressVectorize && TE->State != TreeEntry::SplitVectorize && (TE->State != TreeEntry::ScatterVectorize || TE->ReorderIndices.empty())) @@ -7339,8 +7149,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { Data.first->reorderOperands(Mask); if (!isa(Data.first->getMainOp()) || Data.first->isAltShuffle() || - Data.first->State == TreeEntry::StridedVectorize || - Data.first->State == TreeEntry::CompressVectorize) { + Data.first->State == TreeEntry::StridedVectorize) { reorderScalars(Data.first->Scalars, Mask); reorderOrder(Data.first->ReorderIndices, MaskOrder, /*BottomOrder=*/true); @@ -8118,16 +7927,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( }); if (It == Slice.end()) return false; - const TreeEntry &TE = - *VectorizableTree[std::get<0>(P)]; - ArrayRef VL = TE.Scalars; - OrdersType Order; - SmallVector PointerOps; - LoadsState State = canVectorizeLoads( - VL, VL.front(), Order, PointerOps); - if (State == LoadsState::ScatterVectorize || - State == LoadsState::CompressVectorize) - return false; + ArrayRef VL = + VectorizableTree[std::get<0>(P)]->Scalars; ConsecutiveNodesSize += VL.size(); unsigned Start = std::distance(Slice.begin(), It); unsigned Sz = Slice.size() - Start; @@ -8592,44 +8393,23 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. 
- auto IsGatheredNode = [&]() { - if (!GatheredLoadsEntriesFirst) - return false; - return all_of(VL, [&](Value *V) { - if (isa(V)) - return true; - return any_of(getTreeEntries(V), [&](const TreeEntry *TE) { - return TE->Idx >= *GatheredLoadsEntriesFirst; - }); - }); - }; switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) { case LoadsState::Vectorize: return TreeEntry::Vectorize; - case LoadsState::CompressVectorize: - if (!IsGraphTransformMode && !VectorizableTree.empty()) { - // Delay slow vectorized nodes for better vectorization attempts. - LoadEntriesToVectorize.insert(VectorizableTree.size()); - return TreeEntry::NeedToGather; - } - return IsGatheredNode() ? TreeEntry::NeedToGather - : TreeEntry::CompressVectorize; case LoadsState::ScatterVectorize: if (!IsGraphTransformMode && !VectorizableTree.empty()) { // Delay slow vectorized nodes for better vectorization attempts. LoadEntriesToVectorize.insert(VectorizableTree.size()); return TreeEntry::NeedToGather; } - return IsGatheredNode() ? TreeEntry::NeedToGather - : TreeEntry::ScatterVectorize; + return TreeEntry::ScatterVectorize; case LoadsState::StridedVectorize: if (!IsGraphTransformMode && VectorizableTree.size() > 1) { // Delay slow vectorized nodes for better vectorization attempts. LoadEntriesToVectorize.insert(VectorizableTree.size()); return TreeEntry::NeedToGather; } - return IsGatheredNode() ? TreeEntry::NeedToGather - : TreeEntry::StridedVectorize; + return TreeEntry::StridedVectorize; case LoadsState::Gather: #ifndef NDEBUG Type *ScalarTy = VL0->getType(); @@ -9730,15 +9510,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, << "SLP: added a new TreeEntry (jumbled LoadInst).\n"; TE->dump()); break; - case TreeEntry::CompressVectorize: - // Vectorizing non-consecutive loads with (masked)load + compress. - TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndices, CurrentOrder); - LLVM_DEBUG( - dbgs() - << "SLP: added a new TreeEntry (masked LoadInst + compress).\n"; - TE->dump()); - break; case TreeEntry::StridedVectorize: // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, @@ -12270,8 +12041,6 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { if (TE.State == TreeEntry::ScatterVectorize || TE.State == TreeEntry::StridedVectorize) return TTI::CastContextHint::GatherScatter; - if (TE.State == TreeEntry::CompressVectorize) - return TTI::CastContextHint::Masked; if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) { if (TE.ReorderIndices.empty()) @@ -12365,8 +12134,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::StridedVectorize || - E->State == TreeEntry::CompressVectorize) && + E->State == TreeEntry::StridedVectorize) && "Unhandled state"); assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || @@ -12457,10 +12225,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // Negative value means vectorizing is profitable. 
auto GetGEPCostDiff = [=](ArrayRef Ptrs, Value *BasePtr) { assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize || - E->State == TreeEntry::CompressVectorize) && - "Entry state expected to be Vectorize, StridedVectorize or " - "MaskedLoadCompressVectorize here."); + E->State == TreeEntry::StridedVectorize) && + "Entry state expected to be Vectorize or StridedVectorize here."); InstructionCost ScalarCost = 0; InstructionCost VecCost = 0; std::tie(ScalarCost, VecCost) = getGEPCosts( @@ -12923,45 +12689,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, /*VariableMask=*/false, CommonAlignment, CostKind); break; } - case TreeEntry::CompressVectorize: { - SmallVector PointerOps(VL.size()); - for (auto [I, V] : enumerate(VL)) - PointerOps[I] = cast(V)->getPointerOperand(); - bool IsMasked; - unsigned InterleaveFactor; - SmallVector CompressMask; - VectorType *LoadVecTy; - [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress( - VL, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, *TLI, - [](Value *) { return true; }, IsMasked, InterleaveFactor, - CompressMask, LoadVecTy); - assert(IsVectorized && "Expected to be vectorized"); - Align CommonAlignment; - if (IsMasked) - CommonAlignment = computeCommonAlignment(VL); - else - CommonAlignment = LI0->getAlign(); - if (InterleaveFactor) { - VecLdCost = TTI->getInterleavedMemoryOpCost( - Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt, - CommonAlignment, LI0->getPointerAddressSpace(), CostKind); - } else if (IsMasked) { - VecLdCost = TTI->getMaskedMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind); - // TODO: include this cost into CommonCost. - VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - LoadVecTy, CompressMask, CostKind); - } else { - VecLdCost = TTI->getMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); - // TODO: include this cost into CommonCost. 
- VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - LoadVecTy, CompressMask, CostKind); - } - break; - } case TreeEntry::ScatterVectorize: { Align CommonAlignment = computeCommonAlignment(UniqueValues.getArrayRef()); @@ -13251,7 +12978,6 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { if (VectorizableTree.size() == 1 && (VectorizableTree[0]->State == TreeEntry::Vectorize || VectorizableTree[0]->State == TreeEntry::StridedVectorize || - VectorizableTree[0]->State == TreeEntry::CompressVectorize || (ForReduction && AreVectorizableGathers(VectorizableTree[0].get(), VectorizableTree[0]->Scalars.size()) && @@ -13275,8 +13001,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { if (VectorizableTree[0]->isGather() || (VectorizableTree[1]->isGather() && VectorizableTree[0]->State != TreeEntry::ScatterVectorize && - VectorizableTree[0]->State != TreeEntry::StridedVectorize && - VectorizableTree[0]->State != TreeEntry::CompressVectorize)) + VectorizableTree[0]->State != TreeEntry::StridedVectorize)) return false; return true; @@ -17458,40 +17183,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); - } else if (E->State == TreeEntry::CompressVectorize) { - SmallVector PointerOps(E->Scalars.size()); - for (auto [I, V] : enumerate(E->Scalars)) - PointerOps[I] = cast(V)->getPointerOperand(); - bool IsMasked; - unsigned InterleaveFactor; - SmallVector CompressMask; - VectorType *LoadVecTy; - [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress( - E->Scalars, PointerOps, std::nullopt, *TTI, *DL, *SE, *AC, *DT, - *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor, - CompressMask, LoadVecTy); - assert(IsVectorized && "Expected to be vectorized"); - Align CommonAlignment; - if (IsMasked) - CommonAlignment = computeCommonAlignment(E->Scalars); - else - CommonAlignment = LI->getAlign(); - if (IsMasked) { - SmallVector MaskValues( - getNumElements(LoadVecTy) / getNumElements(LI->getType()), - ConstantInt::getFalse(VecTy->getContext())); - for (int I : CompressMask) - MaskValues[I] = ConstantInt::getTrue(VecTy->getContext()); - Constant *MaskValue = ConstantVector::get(MaskValues); - NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment, - MaskValue); - } else { - NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment); - } - NewLI = ::propagateMetadata(NewLI, E->Scalars); - // TODO: include this cost into CommonCost. - NewLI = - cast(Builder.CreateShuffleVector(NewLI, CompressMask)); } else if (E->State == TreeEntry::StridedVectorize) { Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); @@ -17561,9 +17252,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Align CommonAlignment = computeCommonAlignment(E->Scalars); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } - Value *V = E->State == TreeEntry::CompressVectorize - ? 
NewLI - : ::propagateMetadata(NewLI, E->Scalars); + Value *V = ::propagateMetadata(NewLI, E->Scalars); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -18165,14 +17854,11 @@ Value *BoUpSLP::vectorizeTree( ArrayRef UseEntries = getTreeEntries(U); return !UseEntries.empty() && (E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize || - E->State == TreeEntry::CompressVectorize) && + E->State == TreeEntry::StridedVectorize) && any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) { return (UseEntry->State == TreeEntry::Vectorize || UseEntry->State == - TreeEntry::StridedVectorize || - UseEntry->State == - TreeEntry::CompressVectorize) && + TreeEntry::StridedVectorize) && doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index b99a1c2d83394..aa9195f8c48ce 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,16 +15,19 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16 +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP8]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP12]], <8 x float> [[TMP13]], i64 8) +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP15]], <4 x float> [[TMP7]], i64 0) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP16]], <2 x float> [[TMP9]], i64 6) ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> -; CHECK-NEXT: store <16 
x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 +; CHECK-NEXT: store <16 x float> [[TMP18]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 ; CHECK-NEXT: ret void ; alloca_0: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll index 80ba7a40fb193..12263b065d89c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll @@ -9,9 +9,17 @@ define void @test() { ; CHECK-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[COND_IN_V]], align 8 +; CHECK-NEXT: [[BV:%.*]] = icmp eq i64 [[V]], 0 +; CHECK-NEXT: [[IN_1:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 4 +; CHECK-NEXT: [[V_1:%.*]] = load i64, ptr [[IN_1]], align 8 +; CHECK-NEXT: [[BV_1:%.*]] = icmp eq i64 [[V_1]], 0 +; CHECK-NEXT: [[IN_2:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 8 +; CHECK-NEXT: [[V_2:%.*]] = load i64, ptr [[IN_2]], align 8 +; CHECK-NEXT: [[BV_2:%.*]] = icmp eq i64 [[V_2]], 0 +; CHECK-NEXT: [[IN_3:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 12 +; CHECK-NEXT: [[V_3:%.*]] = load i64, ptr [[IN_3]], align 8 +; CHECK-NEXT: [[BV_3:%.*]] = icmp eq i64 [[V_3]], 0 ; CHECK-NEXT: ret void ; ; CHECK-SLP-THRESHOLD-LABEL: define void @test @@ -20,9 +28,11 @@ define void @test() { ; CHECK-SLP-THRESHOLD-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-SLP-THRESHOLD-NEXT: br label [[BB:%.*]] ; CHECK-SLP-THRESHOLD: bb: -; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) -; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> -; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer +; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0 +; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> +; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) +; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer ; CHECK-SLP-THRESHOLD-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index a9c0eb3f9f2b9..f249394c91788 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -24,16 +24,20 @@ define void @foo() { ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( -; AVX-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX-NEXT: 
[[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo( -; AVX512-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 +; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 +; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 +; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> ; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index a0e52c13ec621..925c334cb5f20 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -164,20 +164,36 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> 
@llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -274,30 +290,49 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 -; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; 
AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -412,30 +447,49 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 -; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], 
i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 +; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 +; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 +; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: 
ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -633,21 +687,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; 
AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 6c5638819dcea..dc1ba4ec7e7ab 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -164,20 +164,36 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> -; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; 
AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -274,30 +290,49 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 -; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 +; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 +; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 +; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = 
insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], +; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -412,30 +447,49 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 -; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds 
nuw i8, ptr [[T1:%.*]], i64 44 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 +; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 +; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 +; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 +; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 +; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 +; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; 
AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> +; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -633,21 +687,25 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> +; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> -; AVX512VL-NEXT: store <8 x 
float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index bbb1b87fc3dfa..0807a1bd4cdea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -5,9 +5,10 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OFF0_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[OFF0_1]], i32 8, <15 x i1> , <15 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]] @@ -21,9 +22,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-10' + ; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '5' + ; YAML-NEXT: - TreeSize: '8' entry: %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 %idx0 = load i32, ptr %off0.1, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 02058b1fe8578..5bd954e741d43 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,17 +5,16 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 
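; ---------------------------------------------------------------------------
; Editor's aside (illustrative only, not part of the patch): the check-line
; changes in the pr47629 tests above contrast two ways SLP can materialize
; the same strided gather of i32 elements. The offsets mirror the visible
; tests (byte offsets 0, 44, 16, 60, 72, 36, 24, 84, i.e. i32 indices
; 0, 11, 4, 15, 18, 9, 6, 21); %base is a hypothetical stand-in for the
; tests' incoming pointer, and %mask for the elided constant mask.
;
;   ; Old lowering: one wide masked load spanning all touched elements,
;   ; then a shuffle that compacts the eight live lanes.
;   %wide = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr %base, i32 4,
;               <22 x i1> %mask, <22 x i32> poison)
;   %vals = shufflevector <22 x i32> %wide, <22 x i32> poison,
;               <8 x i32> <i32 0, i32 11, i32 4, i32 15, i32 18, i32 9, i32 6, i32 21>
;
;   ; New lowering (the AVX512F/AVX512VL checks): splat the base pointer,
;   ; index each lane, and issue a single masked gather with an all-true
;   ; mask. The AVX2 checks instead scalarize into loads + insertelement.
;   %p0  = insertelement <8 x ptr> poison, ptr %base, i64 0
;   %ps  = shufflevector <8 x ptr> %p0, <8 x ptr> poison, <8 x i32> zeroinitializer
;   %gep = getelementptr i32, <8 x ptr> %ps,
;              <8 x i64> <i64 0, i64 11, i64 4, i64 15, i64 18, i64 9, i64 6, i64 21>
;   %vals2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %gep, i32 4,
;               <8 x i1> splat (i1 true), <8 x i32> poison)
;
; The YAML remark update nearby (Cost '-10' -> '-1', TreeSize '5' -> '8')
; appears to reflect the same cost-model shift: the gather tree is larger
; and is no longer credited with the cheaper masked-load-plus-shuffle form.
; ---------------------------------------------------------------------------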
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP6]], <2 x i32> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> @@ -65,17 +64,16 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP12]], <2 x i32> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2 ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer @@ -127,17 +125,16 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 -; 
CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> @@ -187,17 +184,16 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index 73b6c80730935..c7c67d31f9ded 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -3,12 +3,14 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x ptr> [[TMP2]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x ptr> [[TMP3]], <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> , <16 x float> poison) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> -; 
CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> , <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 92d5506977aeb..c114c5dee78e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -8,11 +8,14 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> , <6 x float> poison) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index d487e3616956c..1294a87ff6967 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,12 +5,16 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG:%.*]], i32 8, <5 x i1> , <5 x i64> poison) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x i64> [[TMP1]], <5 x i64> 
poison, <4 x i32> +; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG]], i32 8, <5 x i1> , <5 x i64> poison) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <5 x i64> [[TMP3]], <5 x i64> poison, <4 x i32> +; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index b4996eb58b47e..e1e80d96d416d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -19,11 +19,11 @@ define void @test() { ; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]]) ; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) ; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) -; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP13]], double [[SIN3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 3f684e414c8ba..202ec9633712f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -8,10 +8,15 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 +; CHECK-NEXT: 
[[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr [[ARRAYIDX20]], i32 4, <12 x i1> , <12 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index fdc0bc0e00eb8..8fe7d15b69cb1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,15 +4,16 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[ADDR]], i32 8, <15 x i1> , <15 x i32> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) -; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[GEP2]], i32 8, <15 x i1> , <15 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <15 x i32> [[TMP11]], <15 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] From 1688c3062a56b4fca1f8ad28f2865df0ed8ca940 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Fri, 4 Apr 2025 
10:11:14 -0700 Subject: [PATCH 0669/1029] [clang] Do not share ownership of `PreprocessorOptions` (#133467) This PR makes it so that `CompilerInvocation` is the sole owner of the `PreprocessorOptions` instance. --- .../ExpandModularHeadersPPCallbacks.cpp | 15 +++---- clang-tools-extra/clangd/ModulesBuilder.cpp | 5 ++- .../clang/Frontend/CompilerInvocation.h | 3 -- clang/include/clang/Lex/Preprocessor.h | 14 +++--- clang/lib/Frontend/ASTUnit.cpp | 2 +- clang/lib/Frontend/CompilerInstance.cpp | 2 +- clang/lib/Lex/PPDirectives.cpp | 20 ++++----- clang/lib/Lex/PPLexerChange.cpp | 2 +- clang/lib/Lex/Preprocessor.cpp | 28 ++++++------ .../Analysis/MacroExpansionContextTest.cpp | 7 ++- clang/unittests/Basic/SourceManagerTest.cpp | 35 +++++++-------- clang/unittests/Lex/LexerTest.cpp | 4 +- clang/unittests/Lex/ModuleDeclStateTest.cpp | 10 ++--- clang/unittests/Lex/PPCallbacksTest.cpp | 43 +++++++++---------- .../Lex/PPConditionalDirectiveRecordTest.cpp | 4 +- .../Lex/PPDependencyDirectivesTest.cpp | 4 +- .../unittests/Lex/PPMemoryAllocationsTest.cpp | 4 +- 17 files changed, 94 insertions(+), 108 deletions(-) diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp index a15850cb63542..03a3e8404e069 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp @@ -89,15 +89,14 @@ ExpandModularHeadersPPCallbacks::ExpandModularHeadersPPCallbacks( HeaderInfo = std::make_unique(HSOpts, Sources, Diags, LangOpts, &Compiler.getTarget()); - auto PO = std::make_shared(); - *PO = Compiler.getPreprocessorOpts(); - - PP = std::make_unique(PO, Diags, LangOpts, Sources, - *HeaderInfo, ModuleLoader, - /*IILookup=*/nullptr, - /*OwnsHeaderSearch=*/false); + PP = std::make_unique(Compiler.getPreprocessorOpts(), + Diags, LangOpts, Sources, + *HeaderInfo, ModuleLoader, + /*IILookup=*/nullptr, + /*OwnsHeaderSearch=*/false); PP->Initialize(Compiler.getTarget(), Compiler.getAuxTarget()); - InitializePreprocessor(*PP, *PO, Compiler.getPCHContainerReader(), + InitializePreprocessor(*PP, Compiler.getPreprocessorOpts(), + Compiler.getPCHContainerReader(), Compiler.getFrontendOpts(), Compiler.getCodeGenOpts()); ApplyHeaderSearchOptions(*HeaderInfo, HSOpts, LangOpts, Compiler.getTarget().getTriple()); diff --git a/clang-tools-extra/clangd/ModulesBuilder.cpp b/clang-tools-extra/clangd/ModulesBuilder.cpp index 03c5f5e1b5993..c1878f91b5e16 100644 --- a/clang-tools-extra/clangd/ModulesBuilder.cpp +++ b/clang-tools-extra/clangd/ModulesBuilder.cpp @@ -202,9 +202,10 @@ bool IsModuleFileUpToDate(PathRef ModuleFilePath, HeaderSearch HeaderInfo(HSOpts, SourceMgr, *Diags, LangOpts, /*Target=*/nullptr); + PreprocessorOptions PPOpts; TrivialModuleLoader ModuleLoader; - Preprocessor PP(std::make_shared(), *Diags, LangOpts, - SourceMgr, HeaderInfo, ModuleLoader); + Preprocessor PP(PPOpts, *Diags, LangOpts, SourceMgr, HeaderInfo, + ModuleLoader); IntrusiveRefCntPtr ModCache = createCrossProcessModuleCache(); PCHContainerOperations PCHOperations; diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h index 1e4d2da86c2be..f71d27813b2a1 100644 --- a/clang/include/clang/Frontend/CompilerInvocation.h +++ b/clang/include/clang/Frontend/CompilerInvocation.h @@ -272,9 +272,6 @@ class CompilerInvocation : public CompilerInvocationBase { std::shared_ptr getHeaderSearchOptsPtr() { return HSOpts; } - std::shared_ptr 
getPreprocessorOptsPtr() { - return PPOpts; - } std::shared_ptr getLangOptsPtr() { return LangOpts; } /// @} diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 4fdc4e0439125..24bb524783e93 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -140,7 +140,7 @@ class Preprocessor { friend class VariadicMacroScopeGuard; llvm::unique_function OnToken; - std::shared_ptr PPOpts; + const PreprocessorOptions &PPOpts; DiagnosticsEngine *Diags; const LangOptions &LangOpts; const TargetInfo *Target = nullptr; @@ -1165,10 +1165,9 @@ class Preprocessor { void updateOutOfDateIdentifier(const IdentifierInfo &II) const; public: - Preprocessor(std::shared_ptr PPOpts, - DiagnosticsEngine &diags, const LangOptions &LangOpts, - SourceManager &SM, HeaderSearch &Headers, - ModuleLoader &TheModuleLoader, + Preprocessor(const PreprocessorOptions &PPOpts, DiagnosticsEngine &diags, + const LangOptions &LangOpts, SourceManager &SM, + HeaderSearch &Headers, ModuleLoader &TheModuleLoader, IdentifierInfoLookup *IILookup = nullptr, bool OwnsHeaderSearch = false, TranslationUnitKind TUKind = TU_Complete); @@ -1195,9 +1194,8 @@ class Preprocessor { /// Cleanup after model file parsing void FinalizeForModelFile(); - /// Retrieve the preprocessor options used to initialize this - /// preprocessor. - const PreprocessorOptions &getPreprocessorOpts() const { return *PPOpts; } + /// Retrieve the preprocessor options used to initialize this preprocessor. + const PreprocessorOptions &getPreprocessorOpts() const { return PPOpts; } DiagnosticsEngine &getDiagnostics() const { return *Diags; } void setDiagnostics(DiagnosticsEngine &D) { Diags = &D; } diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index 0a5f1cfd1a264..04ddc93415507 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -844,7 +844,7 @@ std::unique_ptr ASTUnit::LoadFromASTFile( HeaderSearch &HeaderInfo = *AST->HeaderInfo; AST->PP = std::make_shared( - AST->PPOpts, AST->getDiagnostics(), *AST->LangOpts, + *AST->PPOpts, AST->getDiagnostics(), *AST->LangOpts, AST->getSourceManager(), HeaderInfo, AST->ModuleLoader, /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 91093d3ccb84c..9cab17ae70eeb 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -452,7 +452,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { HeaderSearch *HeaderInfo = new HeaderSearch(getHeaderSearchOpts(), getSourceManager(), getDiagnostics(), getLangOpts(), &getTarget()); - PP = std::make_shared(Invocation->getPreprocessorOptsPtr(), + PP = std::make_shared(Invocation->getPreprocessorOpts(), getDiagnostics(), getLangOpts(), getSourceManager(), *HeaderInfo, *this, /*IdentifierInfoLookup=*/nullptr, diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index a29b73f97ab7e..0b53524e23641 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -1154,7 +1154,7 @@ Preprocessor::LookupEmbedFile(StringRef Filename, bool isAngled, bool OpenFile, } } - for (const auto &Entry : PPOpts->EmbedEntries) { + for (const auto &Entry : PPOpts.EmbedEntries) { LookupPath.clear(); SeparateComponents(LookupPath, Entry, Filename, false); llvm::Expected ShouldBeEntry = FM.getFileRef( @@ -2341,7 +2341,7 @@ Preprocessor::ImportAction 
Preprocessor::HandleHeaderIncludeOrImport( enum { Enter, Import, Skip, IncludeLimitReached } Action = Enter; - if (PPOpts->SingleFileParseMode) + if (PPOpts.SingleFileParseMode) Action = IncludeLimitReached; // If we've reached the max allowed include depth, it is usually due to an @@ -3420,11 +3420,11 @@ void Preprocessor::HandleIfdefDirective(Token &Result, Callbacks->Ifdef(DirectiveTok.getLocation(), MacroNameTok, MD); } - bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks && + bool RetainExcludedCB = PPOpts.RetainExcludedConditionalBlocks && getSourceManager().isInMainFile(DirectiveTok.getLocation()); // Should we include the stuff contained by this directive? - if (PPOpts->SingleFileParseMode && !MI) { + if (PPOpts.SingleFileParseMode && !MI) { // In 'single-file-parse mode' undefined identifiers trigger parsing of all // the directive blocks. CurPPLexer->pushConditionalLevel(DirectiveTok.getLocation(), @@ -3475,11 +3475,11 @@ void Preprocessor::HandleIfDirective(Token &IfToken, IfToken.getLocation(), DER.ExprRange, (ConditionalTrue ? PPCallbacks::CVK_True : PPCallbacks::CVK_False)); - bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks && + bool RetainExcludedCB = PPOpts.RetainExcludedConditionalBlocks && getSourceManager().isInMainFile(IfToken.getLocation()); // Should we include the stuff contained by this directive? - if (PPOpts->SingleFileParseMode && DER.IncludedUndefinedIds) { + if (PPOpts.SingleFileParseMode && DER.IncludedUndefinedIds) { // In 'single-file-parse mode' undefined identifiers trigger parsing of all // the directive blocks. CurPPLexer->pushConditionalLevel(IfToken.getLocation(), /*wasskip*/false, @@ -3546,10 +3546,10 @@ void Preprocessor::HandleElseDirective(Token &Result, const Token &HashToken) { if (Callbacks) Callbacks->Else(Result.getLocation(), CI.IfLoc); - bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks && + bool RetainExcludedCB = PPOpts.RetainExcludedConditionalBlocks && getSourceManager().isInMainFile(Result.getLocation()); - if ((PPOpts->SingleFileParseMode && !CI.FoundNonSkip) || RetainExcludedCB) { + if ((PPOpts.SingleFileParseMode && !CI.FoundNonSkip) || RetainExcludedCB) { // In 'single-file-parse mode' undefined identifiers trigger parsing of all // the directive blocks. CurPPLexer->pushConditionalLevel(CI.IfLoc, /*wasskip*/false, @@ -3626,10 +3626,10 @@ void Preprocessor::HandleElifFamilyDirective(Token &ElifToken, } } - bool RetainExcludedCB = PPOpts->RetainExcludedConditionalBlocks && + bool RetainExcludedCB = PPOpts.RetainExcludedConditionalBlocks && getSourceManager().isInMainFile(ElifToken.getLocation()); - if ((PPOpts->SingleFileParseMode && !CI.FoundNonSkip) || RetainExcludedCB) { + if ((PPOpts.SingleFileParseMode && !CI.FoundNonSkip) || RetainExcludedCB) { // In 'single-file-parse mode' undefined identifiers trigger parsing of all // the directive blocks. CurPPLexer->pushConditionalLevel(ElifToken.getLocation(), /*wasskip*/false, diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp index e1dcc5499170e..a373a52506a24 100644 --- a/clang/lib/Lex/PPLexerChange.cpp +++ b/clang/lib/Lex/PPLexerChange.cpp @@ -561,7 +561,7 @@ bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) { if (creatingPCHWithThroughHeader() && !LeavingPCHThroughHeader) { // Reached the end of the compilation without finding the through header. 
Diag(CurLexer->getFileLoc(), diag::err_pp_through_header_not_seen) - << PPOpts->PCHThroughHeader << 0; + << PPOpts.PCHThroughHeader << 0; } if (!isIncrementalProcessingEnabled()) diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index ff99575dc611b..c25a3efd899e0 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -77,13 +77,13 @@ LLVM_INSTANTIATE_REGISTRY(PragmaHandlerRegistry) ExternalPreprocessorSource::~ExternalPreprocessorSource() = default; -Preprocessor::Preprocessor(std::shared_ptr PPOpts, +Preprocessor::Preprocessor(const PreprocessorOptions &PPOpts, DiagnosticsEngine &diags, const LangOptions &opts, SourceManager &SM, HeaderSearch &Headers, ModuleLoader &TheModuleLoader, IdentifierInfoLookup *IILookup, bool OwnsHeaders, TranslationUnitKind TUKind) - : PPOpts(std::move(PPOpts)), Diags(&diags), LangOpts(opts), + : PPOpts(PPOpts), Diags(&diags), LangOpts(opts), FileMgr(Headers.getFileMgr()), SourceMgr(SM), ScratchBuf(new ScratchBuffer(SourceMgr)), HeaderInfo(Headers), TheModuleLoader(TheModuleLoader), ExternalSource(nullptr), @@ -156,11 +156,11 @@ Preprocessor::Preprocessor(std::shared_ptr PPOpts, SkippingUntilPragmaHdrStop = true; // If using a PCH with a through header, start skipping tokens. - if (!this->PPOpts->PCHThroughHeader.empty() && - !this->PPOpts->ImplicitPCHInclude.empty()) + if (!this->PPOpts.PCHThroughHeader.empty() && + !this->PPOpts.ImplicitPCHInclude.empty()) SkippingUntilPCHThroughHeader = true; - if (this->PPOpts->GeneratePreamble) + if (this->PPOpts.GeneratePreamble) PreambleConditionalStack.startRecording(); MaxTokens = LangOpts.MaxTokens; @@ -577,18 +577,18 @@ void Preprocessor::EnterMainSourceFile() { // Start parsing the predefines. EnterSourceFile(FID, nullptr, SourceLocation()); - if (!PPOpts->PCHThroughHeader.empty()) { + if (!PPOpts.PCHThroughHeader.empty()) { // Lookup and save the FileID for the through header. If it isn't found // in the search path, it's a fatal error. 
OptionalFileEntryRef File = LookupFile( - SourceLocation(), PPOpts->PCHThroughHeader, + SourceLocation(), PPOpts.PCHThroughHeader, /*isAngled=*/false, /*FromDir=*/nullptr, /*FromFile=*/nullptr, /*CurDir=*/nullptr, /*SearchPath=*/nullptr, /*RelativePath=*/nullptr, /*SuggestedModule=*/nullptr, /*IsMapped=*/nullptr, /*IsFrameworkFound=*/nullptr); if (!File) { Diag(SourceLocation(), diag::err_pp_through_header_not_found) - << PPOpts->PCHThroughHeader; + << PPOpts.PCHThroughHeader; return; } setPCHThroughHeaderFileID( @@ -614,21 +614,21 @@ bool Preprocessor::isPCHThroughHeader(const FileEntry *FE) { } bool Preprocessor::creatingPCHWithThroughHeader() { - return TUKind == TU_Prefix && !PPOpts->PCHThroughHeader.empty() && + return TUKind == TU_Prefix && !PPOpts.PCHThroughHeader.empty() && PCHThroughHeaderFileID.isValid(); } bool Preprocessor::usingPCHWithThroughHeader() { - return TUKind != TU_Prefix && !PPOpts->PCHThroughHeader.empty() && + return TUKind != TU_Prefix && !PPOpts.PCHThroughHeader.empty() && PCHThroughHeaderFileID.isValid(); } bool Preprocessor::creatingPCHWithPragmaHdrStop() { - return TUKind == TU_Prefix && PPOpts->PCHWithHdrStop; + return TUKind == TU_Prefix && PPOpts.PCHWithHdrStop; } bool Preprocessor::usingPCHWithPragmaHdrStop() { - return TUKind != TU_Prefix && PPOpts->PCHWithHdrStop; + return TUKind != TU_Prefix && PPOpts.PCHWithHdrStop; } /// Skip tokens until after the #include of the through header or @@ -657,8 +657,8 @@ void Preprocessor::SkipTokensWhileUsingPCH() { if (ReachedMainFileEOF) { if (UsingPCHThroughHeader) Diag(SourceLocation(), diag::err_pp_through_header_not_seen) - << PPOpts->PCHThroughHeader << 1; - else if (!PPOpts->PCHWithHdrStopCreate) + << PPOpts.PCHThroughHeader << 1; + else if (!PPOpts.PCHWithHdrStopCreate) Diag(SourceLocation(), diag::err_pp_pragma_hdrstop_not_seen); } } diff --git a/clang/unittests/Analysis/MacroExpansionContextTest.cpp b/clang/unittests/Analysis/MacroExpansionContextTest.cpp index 48db9d46180ab..19074d7dcfdd4 100644 --- a/clang/unittests/Analysis/MacroExpansionContextTest.cpp +++ b/clang/unittests/Analysis/MacroExpansionContextTest.cpp @@ -60,11 +60,10 @@ class MacroExpansionContextTest : public ::testing::Test { SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); HeaderSearchOptions HSOpts; TrivialModuleLoader ModLoader; + PreprocessorOptions PPOpts; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); auto Ctx = std::make_unique(LangOpts); diff --git a/clang/unittests/Basic/SourceManagerTest.cpp b/clang/unittests/Basic/SourceManagerTest.cpp index 1f2dba6fcc5d8..201c3f9a68d1d 100644 --- a/clang/unittests/Basic/SourceManagerTest.cpp +++ b/clang/unittests/Basic/SourceManagerTest.cpp @@ -136,12 +136,11 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnit) { SourceMgr.setMainFileID(mainFileID); HeaderSearchOptions HSOpts; + PreprocessorOptions PPOpts; TrivialModuleLoader ModLoader; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, &*Target); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup =*/nullptr, 
/*OwnsHeaderSearch =*/false); PP.Initialize(*Target); PP.EnterMainSourceFile(); @@ -186,12 +185,11 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnitWithTokenSplit) { SourceMgr.createFileID(llvm::MemoryBuffer::getMemBuffer(main))); HeaderSearchOptions HSOpts; + PreprocessorOptions PPOpts; TrivialModuleLoader ModLoader; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, &*Target); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); PP.EnterMainSourceFile(); llvm::SmallString<8> Scratch; @@ -462,11 +460,10 @@ TEST_F(SourceManagerTest, ResetsIncludeLocMap) { auto ParseFile = [&] { TrivialModuleLoader ModLoader; HeaderSearchOptions HSOpts; + PreprocessorOptions PPOpts; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, &*Target); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); PP.EnterMainSourceFile(); PP.LexTokensUntilEOF(); @@ -538,13 +535,12 @@ TEST_F(SourceManagerTest, getMacroArgExpandedLocation) { SourceMgr.overrideFileContents(headerFile, std::move(HeaderBuf)); HeaderSearchOptions HSOpts; + PreprocessorOptions PPOpts; TrivialModuleLoader ModLoader; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, &*Target); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); // Ensure we can get expanded locations in presence of implicit includes. // These are different than normal includes since predefines buffer doesn't // have a valid insertion location. 
@@ -657,12 +653,11 @@ TEST_F(SourceManagerTest, isBeforeInTranslationUnitWithMacroInInclude) { SourceMgr.overrideFileContents(headerFile, std::move(HeaderBuf)); HeaderSearchOptions HSOpts; + PreprocessorOptions PPOpts; TrivialModuleLoader ModLoader; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, &*Target); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); std::vector Macros; diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp index 96a07173e4bcc..3bbc571ee5307 100644 --- a/clang/unittests/Lex/LexerTest.cpp +++ b/clang/unittests/Lex/LexerTest.cpp @@ -59,9 +59,9 @@ class LexerTest : public ::testing::Test { HeaderSearchOptions HSOpts; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); + PreprocessorOptions PPOpts; std::unique_ptr PP = std::make_unique( - std::make_shared(), Diags, LangOpts, SourceMgr, - HeaderInfo, ModLoader, + PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP->Initialize(*Target); diff --git a/clang/unittests/Lex/ModuleDeclStateTest.cpp b/clang/unittests/Lex/ModuleDeclStateTest.cpp index 16e726b9699fd..5b2719d36de85 100644 --- a/clang/unittests/Lex/ModuleDeclStateTest.cpp +++ b/clang/unittests/Lex/ModuleDeclStateTest.cpp @@ -77,11 +77,10 @@ class ModuleDeclStateTest : public ::testing::Test { HeaderInfo.emplace(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); - return std::make_unique( - std::make_shared(), Diags, LangOpts, SourceMgr, - *HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + return std::make_unique(PPOpts, Diags, LangOpts, SourceMgr, + *HeaderInfo, ModLoader, + /*IILookup=*/nullptr, + /*OwnsHeaderSearch=*/false); } void preprocess(Preprocessor &PP, std::unique_ptr C) { @@ -103,6 +102,7 @@ class ModuleDeclStateTest : public ::testing::Test { TrivialModuleLoader ModLoader; HeaderSearchOptions HSOpts; std::optional HeaderInfo; + PreprocessorOptions PPOpts; }; TEST_F(ModuleDeclStateTest, NamedModuleInterface) { diff --git a/clang/unittests/Lex/PPCallbacksTest.cpp b/clang/unittests/Lex/PPCallbacksTest.cpp index 735ff11bf92de..5b1e25ce9bdd5 100644 --- a/clang/unittests/Lex/PPCallbacksTest.cpp +++ b/clang/unittests/Lex/PPCallbacksTest.cpp @@ -199,10 +199,9 @@ class PPCallbacksTest : public ::testing::Test { HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); AddFakeHeader(HeaderInfo, HeaderPath, SystemHeader); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + PreprocessorOptions PPOpts; + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); return InclusionDirectiveCallback(PP)->FilenameRange; } @@ -218,10 +217,9 @@ class PPCallbacksTest : public ::testing::Test { HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); AddFakeHeader(HeaderInfo, HeaderPath, SystemHeader); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + PreprocessorOptions PPOpts; + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, 
/*OwnsHeaderSearch=*/false); return InclusionDirectiveCallback(PP)->FileType; } @@ -246,10 +244,9 @@ class PPCallbacksTest : public ::testing::Test { llvm::MemoryBuffer::getMemBuffer(SourceText); SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + PreprocessorOptions PPOpts; + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); auto *Callbacks = new CondDirectiveCallbacks; PP.addPPCallbacks(std::unique_ptr(Callbacks)); @@ -269,12 +266,12 @@ class PPCallbacksTest : public ::testing::Test { HeaderSearchOptions HSOpts; TrivialModuleLoader ModLoader; + PreprocessorOptions PPOpts; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, /*IILookup=*/nullptr, - /*OwnsHeaderSearch=*/false); + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, + /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); auto *Callbacks = new PragmaMarkCallbacks; @@ -298,13 +295,14 @@ class PPCallbacksTest : public ::testing::Test { HeaderSearchOptions HSOpts; TrivialModuleLoader ModLoader; + PreprocessorOptions PPOpts; + HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, OpenCLLangOpts, Target.get()); - Preprocessor PP(std::make_shared(), Diags, - OpenCLLangOpts, SourceMgr, HeaderInfo, ModLoader, - /*IILookup =*/nullptr, - /*OwnsHeaderSearch =*/false); + Preprocessor PP(PPOpts, Diags, OpenCLLangOpts, SourceMgr, HeaderInfo, + ModLoader, /*IILookup=*/nullptr, + /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); // parser actually sets correct pragma handlers for preprocessor @@ -436,14 +434,13 @@ TEST_F(PPCallbacksTest, FileNotFoundSkipped) { HeaderSearchOptions HSOpts; TrivialModuleLoader ModLoader; + PreprocessorOptions PPOpts; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); DiagnosticConsumer *DiagConsumer = new DiagnosticConsumer; DiagnosticsEngine FileNotFoundDiags(DiagID, DiagOpts.get(), DiagConsumer); - Preprocessor PP(std::make_shared(), FileNotFoundDiags, - LangOpts, SourceMgr, HeaderInfo, ModLoader, - /*IILookup=*/nullptr, - /*OwnsHeaderSearch=*/false); + Preprocessor PP(PPOpts, FileNotFoundDiags, LangOpts, SourceMgr, HeaderInfo, + ModLoader, /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); PP.Initialize(*Target); class FileNotFoundCallbacks : public PPCallbacks { diff --git a/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp b/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp index 112321f7a8d54..349490aa7eef5 100644 --- a/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp +++ b/clang/unittests/Lex/PPConditionalDirectiveRecordTest.cpp @@ -76,8 +76,8 @@ TEST_F(PPConditionalDirectiveRecordTest, PPRecAPI) { HeaderSearchOptions HSOpts; TrivialModuleLoader ModLoader; HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); - Preprocessor PP(std::make_shared(), Diags, LangOpts, - SourceMgr, HeaderInfo, ModLoader, + PreprocessorOptions PPOpts; + Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader, /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false); PP.Initialize(*Target); diff --git a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp 
b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp
index 8f925bde7920c..03f1432d990cb 100644
--- a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp
+++ b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp
@@ -116,8 +116,8 @@ TEST_F(PPDependencyDirectivesTest, MacroGuard) {
     return llvm::ArrayRef(DepDirectivesObjects.back()->Directives);
   };
 
-  auto PPOpts = std::make_shared<PreprocessorOptions>();
-  PPOpts->DependencyDirectivesForFile = [&](FileEntryRef File)
+  PreprocessorOptions PPOpts;
+  PPOpts.DependencyDirectivesForFile = [&](FileEntryRef File)
       -> std::optional<ArrayRef<dependency_directives_scan::Directive>> {
     return getDependencyDirectives(File);
   };
diff --git a/clang/unittests/Lex/PPMemoryAllocationsTest.cpp b/clang/unittests/Lex/PPMemoryAllocationsTest.cpp
index b1c9a5ba8188e..ecbd7f47a2c88 100644
--- a/clang/unittests/Lex/PPMemoryAllocationsTest.cpp
+++ b/clang/unittests/Lex/PPMemoryAllocationsTest.cpp
@@ -68,8 +68,8 @@ TEST_F(PPMemoryAllocationsTest, PPMacroDefinesAllocations) {
   HeaderSearchOptions HSOpts;
   TrivialModuleLoader ModLoader;
   HeaderSearch HeaderInfo(HSOpts, SourceMgr, Diags, LangOpts, Target.get());
-  Preprocessor PP(std::make_shared<PreprocessorOptions>(), Diags, LangOpts,
-                  SourceMgr, HeaderInfo, ModLoader,
+  PreprocessorOptions PPOpts;
+  Preprocessor PP(PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader,
                   /*IILookup =*/nullptr,
                   /*OwnsHeaderSearch =*/false);
   PP.Initialize(*Target);

From ea0869ccb9a80e235f6f47eaa47fb1d5888d660a Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Fri, 4 Apr 2025 10:20:44 -0700
Subject: [PATCH 0670/1029] [clang][parse] Fix build of ParseHLSLRootSignatureTest.cpp

Fallout from PR #133467.
---
 clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp
index acdf455a5d6aa..19d5b267f310a 100644
--- a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp
+++ b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp
@@ -84,11 +84,9 @@ class ParseHLSLRootSignatureTest : public ::testing::Test {
     HeaderSearchOptions SearchOpts;
     HeaderSearch HeaderInfo(SearchOpts, SourceMgr, Diags, LangOpts,
                             Target.get());
-    std::unique_ptr<Preprocessor> PP = std::make_unique<Preprocessor>(
-        std::make_shared<PreprocessorOptions>(), Diags, LangOpts, SourceMgr,
-        HeaderInfo, ModLoader,
-        /*IILookup =*/nullptr,
-        /*OwnsHeaderSearch =*/false);
+    auto PP = std::make_unique<Preprocessor>(
+        PPOpts, Diags, LangOpts, SourceMgr, HeaderInfo, ModLoader,
+        /*IILookup =*/nullptr, /*OwnsHeaderSearch =*/false);
     PP->Initialize(*Target);
     PP->EnterMainSourceFile();
     return PP;
@@ -101,6 +99,7 @@ class ParseHLSLRootSignatureTest : public ::testing::Test {
   DiagnosticsEngine Diags;
   SourceManager SourceMgr;
   LangOptions LangOpts;
+  PreprocessorOptions PPOpts;
   std::shared_ptr<TargetOptions> TargetOpts;
   IntrusiveRefCntPtr<TargetInfo> Target;
 };

From cde90e68f8123e7abef3f9e18d79980aa19f460a Mon Sep 17 00:00:00 2001
From: Jan Svoboda
Date: Fri, 4 Apr 2025 10:21:33 -0700
Subject: [PATCH 0671/1029] [clang][deps] Respect `Lexer::cutOffLexing()` (#134404)

This is crucial when recovering from fatal loader errors. Without it, the
`Lexer` keeps yielding more tokens and the compiler may access invalid
`ASTReader` state.
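For context, `cutOffLexing()` works by pointing the lexer at the end of its
buffer, so every token loop has to re-check that position before yielding
another token. A self-contained toy model of the failure mode (illustrative
only; `MiniLexer` is a made-up stand-in whose member names mirror clang's
`Lexer`, not the real class — the actual one-line guard is in the diff below):

```cpp
#include <cassert>

// Minimal model (hypothetical): cutOffLexing() signals "stop lexing" by
// exhausting the buffer; a token loop that skips the end-of-buffer check
// would keep handing out stale tokens after a fatal loader error.
struct MiniLexer {
  const char *BufferPtr, *BufferEnd;
  void cutOffLexing() { BufferPtr = BufferEnd; } // park at end of buffer
  bool lexToken() {
    if (BufferPtr == BufferEnd) // the guard this patch adds to the
      return false;             // dependency-directive path: emit EOF
    ++BufferPtr;                // pretend to lex one token
    return true;
  }
};

int main() {
  char Buf[4] = {};
  MiniLexer L{Buf, Buf + 4};
  L.cutOffLexing();       // fatal loader error recovery path
  assert(!L.lexToken());  // must not produce any more tokens
}
```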
rdar://133388373
---
 clang/lib/Lex/Lexer.cpp                            |  3 ++
 .../modules-relocated-mm-macro.c                   | 42 +++++++++++++++++++
 2 files changed, 45 insertions(+)
 create mode 100644 clang/test/ClangScanDeps/modules-relocated-mm-macro.c

diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 3128627490e28..93200458f04b4 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -4543,6 +4543,9 @@ bool Lexer::LexDependencyDirectiveToken(Token &Result) {
 
   using namespace dependency_directives_scan;
 
+  if (BufferPtr == BufferEnd)
+    return LexEndOfFile(Result, BufferPtr);
+
   while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
     if (DepDirectives.front().Kind == pp_eof)
       return LexEndOfFile(Result, BufferEnd);
diff --git a/clang/test/ClangScanDeps/modules-relocated-mm-macro.c b/clang/test/ClangScanDeps/modules-relocated-mm-macro.c
new file mode 100644
index 0000000000000..17f479d9e0046
--- /dev/null
+++ b/clang/test/ClangScanDeps/modules-relocated-mm-macro.c
@@ -0,0 +1,42 @@
+// This test checks that we don't crash when we load two conflicting PCM files
+// and instead emit the appropriate diagnostics.
+
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// RUN: mkdir %t/frameworks1
+
+// RUN: clang-scan-deps -format experimental-full -o %t/deps1.json -- \
+// RUN:   %clang -fmodules -fmodules-cache-path=%t/cache \
+// RUN:     -F %t/frameworks1 -F %t/frameworks2 \
+// RUN:     -c %t/tu1.m -o %t/tu1.o
+
+// RUN: cp -r %t/frameworks2/A.framework %t/frameworks1
+
+// RUN: not clang-scan-deps -format experimental-full -o %t/deps2.json -- \
+// RUN:   %clang -fmodules -fmodules-cache-path=%t/cache \
+// RUN:     -F %t/frameworks1 -F %t/frameworks2 \
+// RUN:     -c %t/tu2.m -o %t/tu2.o \
+// RUN:   2>&1 | FileCheck %s
+
+// CHECK: fatal error: module 'A' is defined in both '{{.*}}.pcm' and '{{.*}}.pcm'
+
+//--- frameworks2/A.framework/Modules/module.modulemap
+framework module A { header "A.h" }
+//--- frameworks2/A.framework/Headers/A.h
+#define MACRO_A 1
+
+//--- frameworks2/B.framework/Modules/module.modulemap
+framework module B { header "B.h" }
+//--- frameworks2/B.framework/Headers/B.h
+#include <A/A.h>
+
+//--- tu1.m
+#include <A/A.h>
+
+//--- tu2.m
+#include <A/A.h>
+#include <B/B.h> // This results in a conflict and a fatal loader error.
+
+#if MACRO_A // This crashes with lexer that does not respect `cutOffLexing()`.
+#endif

From 6ee5e694bffd97e69fe5459fc8f94b79b91c88fb Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Fri, 4 Apr 2025 13:21:49 -0400
Subject: [PATCH 0672/1029] [gn] port 10c6ebc4271 (-gen-clang-diags-compat-ids)

---
 .../clang/include/clang/Basic/BUILD.gn        | 22 ++++++++++++++++---
 .../gn/secondary/clang/lib/Basic/BUILD.gn     |  1 +
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
index 7db37467cfc8b..65dd10e7570e8 100644
--- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn
@@ -49,16 +49,27 @@ foreach(diag_group, diag_groups) {
     ]
     td_file = "Diagnostic.td"
   }
+
+  clang_tablegen("Diagnostic${diag_group}CompatIDs") {
+    args = [
+      "-gen-clang-diags-compat-ids",
+      "-clang-component=${diag_group}",
+    ]
+    td_file = "Diagnostic.td"
+  }
 }
 
 group("diags_tablegen") {
-  # DiagnosticGroups and DiagnosticIndexName are intentionally not part of this
-  # group. Much of clang depends on the DiagKinds.inc files transitively,
-  # but almost nothing needs DiagnosticGroups.inc or DiagnosticIndexName.inc.
+ # DiagnosticGroups, DiagnosticIndexName, DiagnosticAllCompatIDs are + # intentionally not part of this group. Much of clang depends on the + # DiagKinds.inc files transitively, but almost nothing needs + # DiagnosticGroups.inc, DiagnosticIndexName.inc, or + # DiagnosticAllCompatIDs.inc. public_deps = [] foreach(diag_group, diag_groups) { public_deps += [ ":Diagnostic${diag_group}Kinds", ":Diagnostic${diag_group}Enums", + ":Diagnostic${diag_group}CompatIDs", ] } } @@ -73,6 +84,11 @@ clang_tablegen("DiagnosticIndexName") { td_file = "Diagnostic.td" } +clang_tablegen("DiagnosticAllCompatIDs") { + args = [ "-gen-clang-diags-compat-ids" ] + td_file = "Diagnostic.td" +} + # Attributes clang_tablegen("AttrList") { diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index f73f151878809..c8f4cd8c35b98 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -54,6 +54,7 @@ static_library("Basic") { ":write_vcsversion", "//clang/include/clang/Basic:AttrHasAttributeImpl", "//clang/include/clang/Basic:CXX11AttributeInfo", + "//clang/include/clang/Basic:DiagnosticAllCompatIDs", "//clang/include/clang/Basic:arm_fp16", "//clang/include/clang/Basic:arm_neon", "//clang/include/clang/Config", From 8f6551935a3896dfab0cc50a085e560d118d4eb3 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 4 Apr 2025 13:32:28 -0400 Subject: [PATCH 0673/1029] [gn] Add a missing dependency Needed after 6ee5e694bff --- .../utils/gn/secondary/clang-tools-extra/clangd/support/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clangd/support/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clangd/support/BUILD.gn index 3cdaa5301d105..217775c225781 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clangd/support/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clangd/support/BUILD.gn @@ -4,6 +4,7 @@ static_library("support") { output_name = "clangdSupport" configs += [ "//llvm/utils/gn/build:clang_code" ] deps = [ + "//clang/lib/Basic", "//llvm/lib/Support", "//llvm/utils/gn/build/libs/atomic", ] From d9bf39085213013fe8a6e3199c0eddfb040d8ad3 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 4 Apr 2025 13:52:06 -0400 Subject: [PATCH 0674/1029] [gn] port 4a4d41e723a ...and add missing TargetsToBuild dep. 
--- .../utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn index 224a1d8bb684f..e20ad5a42b857 100644 --- a/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/clang-sycl-linker/BUILD.gn @@ -9,15 +9,17 @@ executable("clang-sycl-linker") { deps = [ ":SYCLLinkOpts", "//clang/lib/Basic", + "//llvm/lib/Analysis", "//llvm/lib/BinaryFormat", "//llvm/lib/Bitcode/Writer", "//llvm/lib/IR", "//llvm/lib/IRReader", "//llvm/lib/Linker", + "//llvm/lib/MC", "//llvm/lib/Option", "//llvm/lib/Object", "//llvm/lib/Support", - "//llvm/lib/TargetParser", + "//llvm/lib/Target:TargetsToBuild", ] sources = [ "ClangSYCLLinker.cpp" ] } From b6b025797245a5e5416b522df041252e3c4ff868 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 4 Apr 2025 13:53:45 -0400 Subject: [PATCH 0675/1029] Fix the signature for __builtin___clear_cache (#134376) The signature was changed from void(char *, char *) to void(void *, void *) to match GCC's signature for the same builtin. Fixes #47833 --- clang/docs/ReleaseNotes.rst | 4 ++++ clang/include/clang/Basic/Builtins.td | 2 +- clang/test/Sema/clear_cache.c | 12 ++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 clang/test/Sema/clear_cache.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c521b56a98606..77252e3a98235 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -354,6 +354,10 @@ Bug Fixes to Compiler Builtins - The behvaiour of ``__add_pointer`` and ``__remove_pointer`` for Objective-C++'s ``id`` and interfaces has been fixed. +- The signature for ``__builtin___clear_cache`` was changed from + ``void(char *, char *)`` to ``void(void *, void *)`` to match GCC's signature + for the same builtin. (#GH47833) + Bug Fixes to Attribute Support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed crash when a parameter to the ``clang::annotate`` attribute evaluates to ``void``. See #GH119125 diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index c7ca607e4b3d2..2e077176ac7e9 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -920,7 +920,7 @@ def FrameAddress : Builtin { def ClearCache : Builtin { let Spellings = ["__builtin___clear_cache"]; let Attributes = [NoThrow]; - let Prototype = "void(char*, char*)"; + let Prototype = "void(void*, void*)"; } def BuiltinSetjmp : Builtin { diff --git a/clang/test/Sema/clear_cache.c b/clang/test/Sema/clear_cache.c new file mode 100644 index 0000000000000..e6a3421309967 --- /dev/null +++ b/clang/test/Sema/clear_cache.c @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// Ensure that __builtin___clear_cache has the expected signature. Clang used +// to have a signature accepting char * while GCC had a signature accepting +// void * that was documented incorrectly. 
+void test(void) { + int n = 0; + __builtin___clear_cache(&n, &n + 1); // Ok + + __builtin___clear_cache((const void *)&n, (const void *)(&n + 1)); // expected-warning 2 {{passing 'const void *' to parameter of type 'void *' discards qualifiers}} +} + From 30f2e92c6968a1348a76e7ba169259bb345b784c Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 4 Apr 2025 10:58:08 -0700 Subject: [PATCH 0676/1029] [clang] [sanitizer] predict trap checks succeed (#134310) Trap checks fail at most once (when the program crashes). --- clang/lib/CodeGen/CGExpr.cpp | 7 ++-- clang/test/CodeGen/allow-ubsan-check.c | 24 +++++++------ .../test/CodeGen/bounds-checking-debuginfo.c | 7 ++-- clang/test/CodeGen/cfi-check-fail.c | 6 ++-- clang/test/CodeGen/cfi-check-fail2.c | 10 +++--- clang/test/CodeGen/ubsan-trap-merge.c | 36 +++++++++++-------- 6 files changed, 52 insertions(+), 38 deletions(-) diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 0d7e5a2091146..7fe2d9582178e 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -3967,16 +3967,19 @@ void CodeGenFunction::EmitTrapCheck(llvm::Value *Checked, NoMerge = NoMerge || !CGM.getCodeGenOpts().OptimizationLevel || (CurCodeDecl && CurCodeDecl->hasAttr()); + llvm::MDBuilder MDHelper(getLLVMContext()); if (TrapBB && !NoMerge) { auto Call = TrapBB->begin(); assert(isa(Call) && "Expected call in trap BB"); Call->applyMergedLocation(Call->getDebugLoc(), Builder.getCurrentDebugLocation()); - Builder.CreateCondBr(Checked, Cont, TrapBB); + Builder.CreateCondBr(Checked, Cont, TrapBB, + MDHelper.createLikelyBranchWeights()); } else { TrapBB = createBasicBlock("trap"); - Builder.CreateCondBr(Checked, Cont, TrapBB); + Builder.CreateCondBr(Checked, Cont, TrapBB, + MDHelper.createLikelyBranchWeights()); EmitBlock(TrapBB); llvm::CallInst *TrapCall = diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index c116604288546..e225fb63f08eb 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -49,7 +49,7 @@ // TR-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true, !nosanitize [[META2]] // TR-NEXT: [[TMP8:%.*]] = or i1 [[OR]], [[TMP7]], !nosanitize [[META2]] // TR-NEXT: [[TMP9:%.*]] = and i1 [[TMP5]], [[TMP8]], !nosanitize [[META2]] -// TR-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[TRAP:.*]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[TMP9]], label %[[CONT:.*]], label %[[TRAP:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // TR: [[TRAP]]: // TR-NEXT: tail call void @llvm.ubsantrap(i8 3) #[[ATTR5:[0-9]+]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] @@ -107,12 +107,12 @@ int div(int x, int y) { // TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] // TR-NEXT: [[DOTNOT1:%.*]] = and i1 [[TMP0]], [[TMP1]] -// TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[DOTNOT1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF4:![0-9]+]], !nosanitize [[META2]] // TR: [[TRAP]]: // TR-NEXT: tail call void @llvm.ubsantrap(i8 22) #[[ATTR5]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] // TR: [[CONT]]: -// TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA3:![0-9]+]] +// TR-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA5:![0-9]+]] // TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 
@null( @@ -159,7 +159,7 @@ int null(int* x) { // TR-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] // TR-NEXT: [[TMP2:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 41), !nosanitize [[META2]] // TR-NEXT: [[DOTDEMORGAN:%.*]] = and i1 [[TMP1]], [[TMP2]] -// TR-NEXT: br i1 [[DOTDEMORGAN]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TR-NEXT: br i1 [[DOTDEMORGAN]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF4]], !nosanitize [[META2]] // TR: [[TRAP]]: // TR-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR5]], !nosanitize [[META2]] // TR-NEXT: unreachable, !nosanitize [[META2]] @@ -224,7 +224,7 @@ void use(double*); // TR-NEXT: br i1 [[TMP3]], label %[[TRAP:.*]], label %[[BB4:.*]] // TR: [[BB4]]: // TR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[VLA]], i64 [[IDXPROM]] -// TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA7:![0-9]+]] +// TR-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA9:![0-9]+]] // TR-NEXT: ret double [[TMP5]] // TR: [[TRAP]]: // TR-NEXT: call void @llvm.ubsantrap(i8 71) #[[ATTR5]], !nosanitize [[META2]] @@ -267,12 +267,14 @@ double lbounds(int b, int i) { // CHECK: [[META10]] = !{!"double", [[META7]], i64 0} //. // TR: [[META2]] = !{} -// TR: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} -// TR: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} -// TR: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// TR: [[META6]] = !{!"Simple C/C++ TBAA"} -// TR: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} -// TR: [[META8]] = !{!"double", [[META5]], i64 0} +// TR: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} +// TR: [[PROF4]] = !{!"branch_weights", i32 1, i32 1048575} +// TR: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +// TR: [[META6]] = !{!"int", [[META7:![0-9]+]], i64 0} +// TR: [[META7]] = !{!"omnipotent char", [[META8:![0-9]+]], i64 0} +// TR: [[META8]] = !{!"Simple C/C++ TBAA"} +// TR: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// TR: [[META10]] = !{!"double", [[META7]], i64 0} //. // REC: [[META2]] = !{} // REC: [[PROF3]] = !{!"branch_weights", i32 1048575, i32 1} diff --git a/clang/test/CodeGen/bounds-checking-debuginfo.c b/clang/test/CodeGen/bounds-checking-debuginfo.c index 61c7af6e7c5b8..4f5ba2b76eeeb 100644 --- a/clang/test/CodeGen/bounds-checking-debuginfo.c +++ b/clang/test/CodeGen/bounds-checking-debuginfo.c @@ -23,7 +23,7 @@ void d(double*); // CHECK-TRAP-NEXT: [[CALL:%.*]] = call i32 (...) 
@f(), !dbg [[DBG22:![0-9]+]] // CHECK-TRAP-NEXT: [[TMP0:%.*]] = sext i32 [[CALL]] to i64, !dbg [[DBG23:![0-9]+]], !nosanitize [[META10:![0-9]+]] // CHECK-TRAP-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 10, !dbg [[DBG23]], !nosanitize [[META10]] -// CHECK-TRAP-NEXT: br i1 [[TMP1]], label %[[CONT:.*]], label %[[TRAP:.*]], !dbg [[DBG23]], !nosanitize [[META10]] +// CHECK-TRAP-NEXT: br i1 [[TMP1]], label %[[CONT:.*]], label %[[TRAP:.*]], !dbg [[DBG23]], !prof [[PROF27:![0-9]+]], !nosanitize [[META10]] // CHECK-TRAP: [[TRAP]]: // CHECK-TRAP-NEXT: call void @llvm.ubsantrap(i8 18) #[[ATTR3:[0-9]+]], !dbg [[DBG23]], !nosanitize [[META10]] // CHECK-TRAP-NEXT: unreachable, !dbg [[DBG23]], !nosanitize [[META10]] @@ -31,7 +31,7 @@ void d(double*); // CHECK-TRAP-NEXT: [[IDXPROM:%.*]] = sext i32 [[CALL]] to i64, !dbg [[DBG26:![0-9]+]] // CHECK-TRAP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x double], ptr [[A]], i64 0, i64 [[IDXPROM]], !dbg [[DBG26]] // CHECK-TRAP-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !dbg [[DBG26]] -// CHECK-TRAP-NEXT: ret double [[TMP2]], !dbg [[DBG27:![0-9]+]] +// CHECK-TRAP-NEXT: ret double [[TMP2]], !dbg [[DBG28:![0-9]+]] // // CHECK-NOTRAP-LABEL: define dso_local double @f1( // CHECK-NOTRAP-SAME: i32 noundef [[B:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG4:![0-9]+]] { @@ -92,7 +92,8 @@ double f1(int b, int i) { // CHECK-TRAP: [[META24]] = distinct !DISubprogram(name: "__ubsan_check_array_bounds", scope: [[META5]], file: [[META5]], type: [[META25:![0-9]+]], flags: DIFlagArtificial, spFlags: DISPFlagDefinition, unit: [[META0]]) // CHECK-TRAP: [[META25]] = !DISubroutineType(types: null) // CHECK-TRAP: [[DBG26]] = !DILocation(line: 66, column: 10, scope: [[DBG4]]) -// CHECK-TRAP: [[DBG27]] = !DILocation(line: 66, column: 3, scope: [[DBG4]]) +// CHECK-TRAP: [[PROF27]] = !{!"branch_weights", i32 1048575, i32 1} +// CHECK-TRAP: [[DBG28]] = !DILocation(line: 66, column: 3, scope: [[DBG4]]) //. 
// CHECK-NOTRAP: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) // CHECK-NOTRAP: [[META1]] = !DIFile(filename: "", directory: {{.*}}) diff --git a/clang/test/CodeGen/cfi-check-fail.c b/clang/test/CodeGen/cfi-check-fail.c index 15f6c77abf2b2..d4262b20adc3f 100644 --- a/clang/test/CodeGen/cfi-check-fail.c +++ b/clang/test/CodeGen/cfi-check-fail.c @@ -13,7 +13,7 @@ void caller(void (*f)(void)) { // CHECK: %[[DATA:.*]] = load ptr, ptr %[[ALLOCA0]], align 8 // CHECK: %[[ADDR:.*]] = load ptr, ptr %[[ALLOCA1]], align 8 // CHECK: %[[ICMP_NOT_NULL:.*]] = icmp ne ptr %[[DATA]], null -// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]], +// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]], !prof // CHECK: [[TRAP]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) @@ -35,7 +35,7 @@ void caller(void (*f)(void)) { // CHECK: [[CONT1]]: // CHECK: %[[NOT_1:.*]] = icmp ne i8 %[[KIND]], 1 -// CHECK: br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !nosanitize +// CHECK: br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !prof !{{[0-9]+}}, !nosanitize // CHECK: [[HANDLE1]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) @@ -63,7 +63,7 @@ void caller(void (*f)(void)) { // CHECK: [[CONT4]]: // CHECK: %[[NOT_4:.*]] = icmp ne i8 %[[KIND]], 4 -// CHECK: br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !nosanitize +// CHECK: br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !prof !{{[0-9]+}}, !nosanitize // CHECK: [[HANDLE4]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) diff --git a/clang/test/CodeGen/cfi-check-fail2.c b/clang/test/CodeGen/cfi-check-fail2.c index d904ee41f607b..ae0ba78085894 100644 --- a/clang/test/CodeGen/cfi-check-fail2.c +++ b/clang/test/CodeGen/cfi-check-fail2.c @@ -19,7 +19,7 @@ void caller(void (*f)(void)) { // CHECK: %[[DATA:.*]] = load ptr, ptr %[[ALLOCA0]], align 8 // CHECK: %[[ADDR:.*]] = load ptr, ptr %[[ALLOCA1]], align 8 // CHECK: %[[ICMP_NOT_NULL:.*]] = icmp ne ptr %[[DATA]], null -// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]], +// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]], !prof // CHECK: [[TRAP]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) @@ -41,7 +41,7 @@ void caller(void (*f)(void)) { // CHECK: [[CONT1]]: // CHECK: %[[NOT_1:.*]] = icmp ne i8 %[[KIND]], 1 -// CHECK: br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !nosanitize +// CHECK: br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !prof // CHECK: [[HANDLE1]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) @@ -49,7 +49,7 @@ void caller(void (*f)(void)) { // CHECK: [[CONT2]]: // CHECK: %[[NOT_2:.*]] = icmp ne i8 %[[KIND]], 2 -// CHECK: br i1 %[[NOT_2]], label %[[CONT3:.*]], label %[[HANDLE2:.*]], !nosanitize +// CHECK: br i1 %[[NOT_2]], label %[[CONT3:.*]], label %[[HANDLE2:.*]], !prof // CHECK: [[HANDLE2]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) @@ -57,7 +57,7 @@ void caller(void (*f)(void)) { // CHECK: [[CONT3]]: // CHECK: %[[NOT_3:.*]] = icmp ne i8 %[[KIND]], 3 -// CHECK: br i1 %[[NOT_3]], label %[[CONT4:.*]], label %[[HANDLE3:.*]], !nosanitize +// CHECK: br i1 %[[NOT_3]], label %[[CONT4:.*]], label %[[HANDLE3:.*]], !prof // CHECK: [[HANDLE3]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) @@ -65,7 +65,7 @@ void caller(void (*f)(void)) { // CHECK: [[CONT4]]: // CHECK: 
%[[NOT_4:.*]] = icmp ne i8 %[[KIND]], 4 -// CHECK: br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !nosanitize +// CHECK: br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !prof // CHECK: [[HANDLE4]]: // CHECK-NEXT: call void @llvm.ubsantrap(i8 2) diff --git a/clang/test/CodeGen/ubsan-trap-merge.c b/clang/test/CodeGen/ubsan-trap-merge.c index 486aa55f5b811..b06420950d941 100644 --- a/clang/test/CodeGen/ubsan-trap-merge.c +++ b/clang/test/CodeGen/ubsan-trap-merge.c @@ -64,7 +64,7 @@ // TRAP-NOMERGE-NEXT: [[ENTRY:.*:]] // TRAP-NOMERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 125), !nosanitize [[META2:![0-9]+]] // TRAP-NOMERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // TRAP-NOMERGE: [[TRAP]]: // TRAP-NOMERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -104,7 +104,7 @@ // TRAP-MERGE-NEXT: [[ENTRY:.*:]] // TRAP-MERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 125), !nosanitize [[META2:![0-9]+]] // TRAP-MERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // TRAP-MERGE: [[TRAP]]: // TRAP-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4:[0-9]+]], !nosanitize [[META2]] // TRAP-MERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -187,7 +187,7 @@ int f(int x) { // TRAP-NOMERGE-NEXT: [[ENTRY:.*:]] // TRAP-NOMERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 127), !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-NOMERGE: [[TRAP]]: // TRAP-NOMERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -227,7 +227,7 @@ int f(int x) { // TRAP-MERGE-NEXT: [[ENTRY:.*:]] // TRAP-MERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 127), !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-MERGE: [[TRAP]]: // TRAP-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-MERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -317,14 +317,14 @@ int g(int x) { // TRAP-NOMERGE-NEXT: [[ENTRY:.*:]] // TRAP-NOMERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 127), !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize 
[[META2]] -// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-NOMERGE: [[TRAP]]: // TRAP-NOMERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: unreachable, !nosanitize [[META2]] // TRAP-NOMERGE: [[CONT]]: // TRAP-NOMERGE-NEXT: [[TMP2:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[Y]], i32 129), !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// TRAP-NOMERGE-NEXT: br i1 [[TMP3]], label %[[TRAP1:.*]], label %[[CONT2:.*]], !nosanitize [[META2]] +// TRAP-NOMERGE-NEXT: br i1 [[TMP3]], label %[[TRAP1:.*]], label %[[CONT2:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-NOMERGE: [[TRAP1]]: // TRAP-NOMERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -385,14 +385,14 @@ int g(int x) { // TRAP-MERGE-NEXT: [[ENTRY:.*:]] // TRAP-MERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 127), !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-MERGE: [[TRAP]]: // TRAP-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-MERGE-NEXT: unreachable, !nosanitize [[META2]] // TRAP-MERGE: [[CONT]]: // TRAP-MERGE-NEXT: [[TMP2:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[Y]], i32 129), !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// TRAP-MERGE-NEXT: br i1 [[TMP3]], label %[[TRAP]], label %[[CONT1:.*]], !nosanitize [[META2]] +// TRAP-MERGE-NEXT: br i1 [[TMP3]], label %[[TRAP]], label %[[CONT1:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-MERGE: [[CONT1]]: // TRAP-MERGE-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP0]], 0, !nosanitize [[META2]] @@ -546,14 +546,14 @@ int h(int x, int y) { // TRAP-NOMERGE-NEXT: [[ENTRY:.*:]] // TRAP-NOMERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 125), !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP_I:.*]], label %[[F_EXIT:.*]], !nosanitize [[META2]] +// TRAP-NOMERGE-NEXT: br i1 [[TMP1]], label %[[TRAP_I:.*]], label %[[F_EXIT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-NOMERGE: [[TRAP_I]]: // TRAP-NOMERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: unreachable, !nosanitize [[META2]] // TRAP-NOMERGE: [[F_EXIT]]: // TRAP-NOMERGE-NEXT: [[TMP2:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[Y]], i32 127), !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// TRAP-NOMERGE-NEXT: br i1 [[TMP3]], label %[[TRAP_I2:.*]], label %[[G_EXIT:.*]], !nosanitize [[META2]] +// TRAP-NOMERGE-NEXT: br i1 [[TMP3]], label 
%[[TRAP_I2:.*]], label %[[G_EXIT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-NOMERGE: [[TRAP_I2]]: // TRAP-NOMERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -562,7 +562,7 @@ int h(int x, int y) { // TRAP-NOMERGE-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: [[TMP6:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]]), !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: [[TMP7:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1, !nosanitize [[META2]] -// TRAP-NOMERGE-NEXT: br i1 [[TMP7]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-NOMERGE-NEXT: br i1 [[TMP7]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-NOMERGE: [[TRAP]]: // TRAP-NOMERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-NOMERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -637,14 +637,14 @@ int h(int x, int y) { // TRAP-MERGE-NEXT: [[ENTRY:.*:]] // TRAP-MERGE-NEXT: [[TMP0:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X]], i32 125), !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP1:%.*]] = extractvalue { i32, i1 } [[TMP0]], 1, !nosanitize [[META2]] -// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP_I:.*]], label %[[F_EXIT:.*]], !nosanitize [[META2]] +// TRAP-MERGE-NEXT: br i1 [[TMP1]], label %[[TRAP_I:.*]], label %[[F_EXIT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-MERGE: [[TRAP_I]]: // TRAP-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-MERGE-NEXT: unreachable, !nosanitize [[META2]] // TRAP-MERGE: [[F_EXIT]]: // TRAP-MERGE-NEXT: [[TMP2:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[Y]], i32 127), !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP3:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1, !nosanitize [[META2]] -// TRAP-MERGE-NEXT: br i1 [[TMP3]], label %[[TRAP_I2:.*]], label %[[G_EXIT:.*]], !nosanitize [[META2]] +// TRAP-MERGE-NEXT: br i1 [[TMP3]], label %[[TRAP_I2:.*]], label %[[G_EXIT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-MERGE: [[TRAP_I2]]: // TRAP-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-MERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -653,7 +653,7 @@ int h(int x, int y) { // TRAP-MERGE-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP2]], 0, !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP6:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[TMP4]], i32 [[TMP5]]), !nosanitize [[META2]] // TRAP-MERGE-NEXT: [[TMP7:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1, !nosanitize [[META2]] -// TRAP-MERGE-NEXT: br i1 [[TMP7]], label %[[TRAP:.*]], label %[[CONT:.*]], !nosanitize [[META2]] +// TRAP-MERGE-NEXT: br i1 [[TMP7]], label %[[TRAP:.*]], label %[[CONT:.*]], !prof [[PROF3]], !nosanitize [[META2]] // TRAP-MERGE: [[TRAP]]: // TRAP-MERGE-NEXT: tail call void @llvm.ubsantrap(i8 0) #[[ATTR4]], !nosanitize [[META2]] // TRAP-MERGE-NEXT: unreachable, !nosanitize [[META2]] @@ -734,3 +734,11 @@ int m(int x, int y) { // TRAP-NOMERGE: attributes #[[ATTR4]] = { nomerge noreturn nounwind } // HANDLER-NOMERGE: attributes #[[ATTR4]] = { nomerge noreturn nounwind } // MINRT-NOMERGE: attributes #[[ATTR4]] = { nomerge noreturn nounwind } + +// TRAP-MERGE: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} +// HANDLER-MERGE: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} +// MINRT-MERGE: 
[[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} + +// TRAP-NOMERGE: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} +// HANDLER-NOMERGE: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} +// MINRT-NOMERGE: [[PROF3]] = !{!"branch_weights", i32 1, i32 1048575} From d62d15e298ce323cb933f4949b42fe46dcb01b77 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 4 Apr 2025 19:05:59 +0100 Subject: [PATCH 0677/1029] [RISCV] Undo unprofitable zext of icmp combine (#134306) InstCombine will combine this zext of an icmp where the source has a single bit set to a lshr plus trunc (`InstCombinerImpl::transformZExtICmp`): ```llvm define @f( %x) { %1 = and %x, splat (i64 8) %2 = icmp ne %1, splat (i64 0) %3 = zext %2 to ret %3 } ``` ```llvm define @reverse_zexticmp_i64( %x) { %1 = trunc %x to %2 = lshr %1, splat (i8 2) %3 = and %2, splat (i8 1) ret %3 } ``` In a loop, this ends up being unprofitable for RISC-V because the codegen now goes from: ```asm f: # @f .cfi_startproc # %bb.0: vsetvli a0, zero, e64, m1, ta, ma vand.vi v8, v8, 8 vmsne.vi v0, v8, 0 vsetvli zero, zero, e8, mf8, ta, ma vmv.v.i v8, 0 vmerge.vim v8, v8, 1, v0 ret ``` To a series of narrowing vnsrl.wis: ```asm f: # @f .cfi_startproc # %bb.0: vsetvli a0, zero, e64, m1, ta, ma vand.vi v8, v8, 8 vsetvli zero, zero, e32, mf2, ta, ma vnsrl.wi v8, v8, 3 vsetvli zero, zero, e16, mf4, ta, ma vnsrl.wi v8, v8, 0 vsetvli zero, zero, e8, mf8, ta, ma vnsrl.wi v8, v8, 0 ret ``` In the original form, the vmv.v.i is loop invariant and is hoisted out, and the vmerge.vim usually gets folded away into a masked instruction, so you usually just end up with a vsetvli + vmsne.vi. The truncate requires multiple instructions and introduces a vtype toggle for each one, and is measurably slower on the BPI-F3. This reverses the transform in RISCVISelLowering for truncations greater than twice the bitwidth, i.e. it keeps single vnsrl.wis. Fixes #132245 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 67 ++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll | 86 +++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8c409adedc2df..ad44ee755698a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -15040,6 +15040,70 @@ static SDValue performTRUNCATECombine(SDNode *N, SelectionDAG &DAG, return combineTruncSelectToSMaxUSat(N, DAG); } +// InstCombinerImpl::transformZExtICmp will narrow a zext of an icmp with a +// truncation. But RVV doesn't have truncation instructions for more than twice +// the bitwidth. +// +// E.g. 
trunc %x to will generate: +// +// vsetvli a0, zero, e32, m2, ta, ma +// vnsrl.wi v12, v8, 0 +// vsetvli zero, zero, e16, m1, ta, ma +// vnsrl.wi v8, v12, 0 +// vsetvli zero, zero, e8, mf2, ta, ma +// vnsrl.wi v8, v8, 0 +// +// So reverse the combine so we generate an vmseq/vmsne again: +// +// and (lshr (trunc X), ShAmt), 1 +// --> +// zext (icmp ne (and X, (1 << ShAmt)), 0) +// +// and (lshr (not (trunc X)), ShAmt), 1 +// --> +// zext (icmp eq (and X, (1 << ShAmt)), 0) +static SDValue reverseZExtICmpCombine(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + using namespace SDPatternMatch; + SDLoc DL(N); + + if (!Subtarget.hasVInstructions()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + + APInt ShAmt; + SDValue Inner; + if (!sd_match(N, m_And(m_OneUse(m_Srl(m_Value(Inner), m_ConstInt(ShAmt))), + m_One()))) + return SDValue(); + + SDValue X; + bool IsNot; + if (sd_match(Inner, m_Not(m_Trunc(m_Value(X))))) + IsNot = true; + else if (sd_match(Inner, m_Trunc(m_Value(X)))) + IsNot = false; + else + return SDValue(); + + EVT WideVT = X.getValueType(); + if (VT.getScalarSizeInBits() >= WideVT.getScalarSizeInBits() / 2) + return SDValue(); + + SDValue Res = + DAG.getNode(ISD::AND, DL, WideVT, X, + DAG.getConstant(1 << ShAmt.getZExtValue(), DL, WideVT)); + Res = DAG.getSetCC(DL, + EVT::getVectorVT(*DAG.getContext(), MVT::i1, + WideVT.getVectorElementCount()), + Res, DAG.getConstant(0, DL, WideVT), + IsNot ? ISD::SETEQ : ISD::SETNE); + return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res); +} + // Combines two comparison operation and logic operation to one selection // operation(min, max) and logic operation. Returns new constructed Node if // conditions for optimization are satisfied. @@ -15067,6 +15131,9 @@ static SDValue performANDCombine(SDNode *N, return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, And); } + if (SDValue V = reverseZExtICmpCombine(N, DAG, Subtarget)) + return V; + if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) diff --git a/llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll b/llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll new file mode 100644 index 0000000000000..e5043281a27dd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/zext-icmp.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s + +; Test that we reverse InstCombinerImpl::transformZExtICmp when unprofitable + +define @reverse_zexticmp_i16( %x) { +; CHECK-LABEL: reverse_zexticmp_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: ret + %1 = trunc %x to + %2 = lshr %1, splat (i8 2) + %3 = and %2, splat (i8 1) + ret %3 +} + +define @reverse_zexticmp_i32( %x) { +; CHECK-LABEL: reverse_zexticmp_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 4 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %1 = trunc %x to + %2 = lshr %1, splat (i8 2) + %3 = and %2, splat (i8 1) + ret %3 +} + +define @reverse_zexticmp_neg_i32( %x) { +; CHECK-LABEL: reverse_zexticmp_neg_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 4 +; CHECK-NEXT: vmseq.vi v0, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %1 = trunc %x to + %2 = xor %1, splat (i8 -1) + %3 = lshr %2, splat (i8 2) + %4 = and %3, splat (i8 1) + ret %4 +} + +define @reverse_zexticmp_i64( %x) { +; CHECK-LABEL: reverse_zexticmp_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 4 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %1 = trunc %x to + %2 = lshr %1, splat (i8 2) + %3 = and %2, splat (i8 1) + ret %3 +} + +define @reverse_zexticmp_neg_i64( %x) { +; CHECK-LABEL: reverse_zexticmp_neg_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 4 +; CHECK-NEXT: vmseq.vi v0, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: ret + %1 = trunc %x to + %2 = xor %1, splat (i8 -1) + %3 = lshr %2, splat (i8 2) + %4 = and %3, splat (i8 1) + ret %4 +} + From cd2f85a24b55336c96de56276b54d1196fd55fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 4 Apr 2025 11:06:10 -0700 Subject: [PATCH 0678/1029] [mlir][NVVM] Add ops for vote all and any sync (#134309) Add operations for `nvvm.vote.all.sync` and `nvvm.vote.any.sync` intrinsics similar to `nvvm.vote.ballot.sync`. --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 3 +- flang/test/Lower/CUDA/cuda-device-proc.cuf | 2 +- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 48 ++++++++++++++++--- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 41 +++++----------- .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 15 ++++++ mlir/test/Dialect/LLVMIR/nvvm.mlir | 10 +++- mlir/test/Target/LLVMIR/nvvmir.mlir | 8 +++- 7 files changed, 87 insertions(+), 40 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 0ca636bc091ec..702a55a49c953 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -6616,7 +6616,8 @@ IntrinsicLibrary::genVoteBallotSync(mlir::Type resultType, mlir::Value arg1 = builder.create(loc, builder.getI1Type(), args[1]); return builder - .create(loc, resultType, args[0], arg1) + .create(loc, resultType, args[0], arg1, + mlir::NVVM::VoteSyncKind::ballot) .getResult(); } diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index a7f9038761b51..7d6d920dfb2e8 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -303,7 +303,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtestvote() ! CHECK: fir.call @llvm.nvvm.vote.all.sync ! CHECK: fir.call @llvm.nvvm.vote.any.sync -! CHECK: %{{.*}} = nvvm.vote.ballot.sync %{{.*}}, %{{.*}} : i32 +! CHECK: %{{.*}} = nvvm.vote.sync ballot %{{.*}}, %{{.*}} -> i32 ! CHECK-DAG: func.func private @__ldca_i4x4_(!fir.ref>, !fir.ref>) ! 
CHECK-DAG: func.func private @__ldcg_i4x4_(!fir.ref>, !fir.ref>) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 8a54804b220a1..0a6e66919f021 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -808,15 +808,49 @@ def NVVM_ShflOp : let hasVerifier = 1; } -def NVVM_VoteBallotOp : - NVVM_Op<"vote.ballot.sync">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_Type:$mask, LLVM_Type:$pred)> { +def VoteSyncKindAny : I32EnumAttrCase<"any", 0>; +def VoteSyncKindAll : I32EnumAttrCase<"all", 1>; +def VoteSyncKindBallot : I32EnumAttrCase<"ballot", 2>; +def VoteSyncKindUni : I32EnumAttrCase<"uni", 3>; + +def VoteSyncKind : I32EnumAttr<"VoteSyncKind", "NVVM vote sync kind", + [VoteSyncKindAny, VoteSyncKindAll, + VoteSyncKindBallot, VoteSyncKindUni]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} + +def VoteSyncKindAttr : EnumAttr; + +def NVVM_VoteSyncOp + : NVVM_Op<"vote.sync">, + Results<(outs AnyTypeOf<[I32, I1]>:$res)>, + Arguments<(ins I32:$mask, I1:$pred, VoteSyncKindAttr:$kind)> { + let summary = "Vote across thread group"; + let description = [{ + The `vote.sync` op will cause executing thread to wait until all non-exited + threads corresponding to membermask have executed `vote.sync` with the same + qualifiers and same membermask value before resuming execution. + + The vote operation kinds are: + - `any`: True if source predicate is True for some thread in membermask. + - `all`: True if source predicate is True for all non-exited threads in + membermask. + - `uni`: True if source predicate has the same value in all non-exited + threads in membermask. + - `ballot`: In the ballot form, the destination result is a 32 bit integer. + In this form, the predicate from each thread in membermask are copied into + the corresponding bit position of the result, where the bit position + corresponds to the thread’s lane id. 
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-vote-sync) + }]; string llvmBuilder = [{ - $res = createIntrinsicCall(builder, - llvm::Intrinsic::nvvm_vote_ballot_sync, {$mask, $pred}); + auto intId = getVoteSyncIntrinsicId($kind); + $res = createIntrinsicCall(builder, intId, {$mask, $pred}); }]; - let hasCustomAssemblyFormat = 1; + let assemblyFormat = "$kind $mask `,` $pred attr-dict `->` type($res)"; + let hasVerifier = 1; } def NVVM_SyncWarpOp : diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 556114f4370b3..09bff6101edd3 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -48,34 +48,6 @@ using namespace NVVM; #include "mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc" #include "mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc" -//===----------------------------------------------------------------------===// -// Printing/parsing for NVVM ops -//===----------------------------------------------------------------------===// - -static void printNVVMIntrinsicOp(OpAsmPrinter &p, Operation *op) { - p << " " << op->getOperands(); - if (op->getNumResults() > 0) - p << " : " << op->getResultTypes(); -} - -// ::= `llvm.nvvm.vote.ballot.sync %mask, %pred` : result_type -ParseResult VoteBallotOp::parse(OpAsmParser &parser, OperationState &result) { - MLIRContext *context = parser.getContext(); - auto int32Ty = IntegerType::get(context, 32); - auto int1Ty = IntegerType::get(context, 1); - - SmallVector ops; - Type type; - return failure(parser.parseOperandList(ops) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(type) || - parser.addTypeToList(type, result.types) || - parser.resolveOperands(ops, {int32Ty, int1Ty}, - parser.getNameLoc(), result.operands)); -} - -void VoteBallotOp::print(OpAsmPrinter &p) { printNVVMIntrinsicOp(p, *this); } - //===----------------------------------------------------------------------===// // Verifier methods //===----------------------------------------------------------------------===// @@ -1160,6 +1132,19 @@ LogicalResult NVVM::MatchSyncOp::verify() { return success(); } +LogicalResult NVVM::VoteSyncOp::verify() { + if (getKind() == NVVM::VoteSyncKind::ballot) { + if (!getType().isInteger(32)) { + return emitOpError("vote.sync 'ballot' returns an i32"); + } + } else { + if (!getType().isInteger(1)) { + return emitOpError("vote.sync 'any', 'all' and 'uni' returns an i1"); + } + } + return success(); +} + //===----------------------------------------------------------------------===// // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index 9d14ff09ab434..beff90237562d 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -121,6 +121,21 @@ static llvm::Intrinsic::ID getMatchSyncIntrinsicId(Type valType, } } +static llvm::Intrinsic::ID getVoteSyncIntrinsicId(NVVM::VoteSyncKind kind) { + switch (kind) { + case NVVM::VoteSyncKind::any: + return llvm::Intrinsic::nvvm_vote_any_sync; + case NVVM::VoteSyncKind::all: + return llvm::Intrinsic::nvvm_vote_all_sync; + case NVVM::VoteSyncKind::ballot: + return 
+  case NVVM::VoteSyncKind::uni:
+    return llvm::Intrinsic::nvvm_vote_uni_sync;
+  default:
+    llvm_unreachable("unsupported vote kind");
+  }
+}
+
 /// Return the intrinsic ID associated with ldmatrix for the given parameters.
 static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout,
                                                   int32_t num) {
diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir
index 18bf39424f0bf..d3915492c38a0 100644
--- a/mlir/test/Dialect/LLVMIR/nvvm.mlir
+++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir
@@ -129,8 +129,14 @@ func.func @nvvm_shfl_pred(
 // CHECK-LABEL: @nvvm_vote(
 func.func @nvvm_vote(%arg0 : i32, %arg1 : i1) -> i32 {
-  // CHECK: nvvm.vote.ballot.sync %{{.*}}, %{{.*}} : i32
-  %0 = nvvm.vote.ballot.sync %arg0, %arg1 : i32
+  // CHECK: nvvm.vote.sync ballot %{{.*}}, %{{.*}} -> i32
+  %0 = nvvm.vote.sync ballot %arg0, %arg1 -> i32
+  // CHECK: nvvm.vote.sync all %{{.*}}, %{{.*}} -> i1
+  %1 = nvvm.vote.sync all %arg0, %arg1 -> i1
+  // CHECK: nvvm.vote.sync any %{{.*}}, %{{.*}} -> i1
+  %2 = nvvm.vote.sync any %arg0, %arg1 -> i1
+  // CHECK: nvvm.vote.sync uni %{{.*}}, %{{.*}} -> i1
+  %3 = nvvm.vote.sync uni %arg0, %arg1 -> i1
   llvm.return %0 : i32
 }
 
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index c3ec88db1d694..3a0713f2feee8 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -255,7 +255,13 @@ llvm.func @nvvm_shfl_pred(
 // CHECK-LABEL: @nvvm_vote
 llvm.func @nvvm_vote(%0 : i32, %1 : i1) -> i32 {
   // CHECK: call i32 @llvm.nvvm.vote.ballot.sync(i32 %{{.*}}, i1 %{{.*}})
-  %3 = nvvm.vote.ballot.sync %0, %1 : i32
+  %3 = nvvm.vote.sync ballot %0, %1 -> i32
+  // CHECK: call i1 @llvm.nvvm.vote.all.sync(i32 %{{.*}}, i1 %{{.*}})
+  %4 = nvvm.vote.sync all %0, %1 -> i1
+  // CHECK: call i1 @llvm.nvvm.vote.any.sync(i32 %{{.*}}, i1 %{{.*}})
+  %5 = nvvm.vote.sync any %0, %1 -> i1
+  // CHECK: call i1 @llvm.nvvm.vote.uni.sync(i32 %{{.*}}, i1 %{{.*}})
+  %6 = nvvm.vote.sync uni %0, %1 -> i1
   llvm.return %3 : i32
 }
 
From 19aec007351394b552855120e6840537d8780e9d Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Thu, 3 Apr 2025 13:21:22 -0400
Subject: [PATCH 0679/1029] [SLP]Initial support for (masked)loads + compress
 and (masked)interleaved loads

Added initial support for (masked)loads + compress and (masked)interleaved
loads.
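For example (an illustrative sketch, mirroring the updated
gep-nodes-with-non-gep-inst.ll test below; %base stands in for the actual
pointer): four i64 loads at element offsets 0, 4, 8 and 12 from a common
base can now be emitted as a single wide masked load plus a compressing
shuffle instead of being gathered:

  %wide = call <13 x i64> @llvm.masked.load.v13i64.p0(
              ptr %base, i32 8,
              <13 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true,
                         i1 false, i1 false, i1 false, i1 true, i1 false,
                         i1 false, i1 false, i1 true>,
              <13 x i64> poison)
  %v = shufflevector <13 x i64> %wide, <13 x i64> poison,
                     <4 x i32> <i32 0, i32 4, i32 8, i32 12>
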
Reviewers: RKSimon, hiraditya

Reviewed By: RKSimon

Pull Request: https://github.com/llvm/llvm-project/pull/132099
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 378 ++++++++++++++++--
 .../X86/entries-shuffled-diff-sizes.ll        |  17 +-
 .../X86/gep-nodes-with-non-gep-inst.ll        |  22 +-
 .../Transforms/SLPVectorizer/X86/pr47623.ll   |  16 +-
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 162 +++-----
 .../Transforms/SLPVectorizer/X86/pr47629.ll   | 162 +++-----
 .../X86/remark_gather-load-redux-cost.ll      |  11 +-
 .../X86/reorder-possible-strided-node.ll      |  52 +--
 .../X86/reorder-reused-masked-gather.ll       |  11 +-
 .../X86/reorder-reused-masked-gather2.ll      |  10 +-
 .../X86/reordered-masked-loads.ll             |  65 +++
 .../X86/scatter-vectorize-reused-pointer.ll   |  12 +-
 .../Transforms/SLPVectorizer/X86/sin-sqrt.ll  |   8 +-
 .../SLPVectorizer/X86/split-load8_2-unord.ll  |  10 +-
 .../X86/split-load8_2_unord_geps.ll           |  11 +-
 15 files changed, 596 insertions(+), 351 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 5a4715e083969..31c684e16f051 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Analysis/DemandedBits.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -1380,7 +1381,8 @@ class BoUpSLP {
     Gather,
     Vectorize,
     ScatterVectorize,
-    StridedVectorize
+    StridedVectorize,
+    CompressVectorize
   };
 
   using ValueList = SmallVector<Value *, 8>;
@@ -3378,6 +3380,7 @@ class BoUpSLP {
       Vectorize,         ///< The node is regularly vectorized.
       ScatterVectorize,  ///< Masked scatter/gather node.
       StridedVectorize,  ///< Strided loads (and stores)
+      CompressVectorize, ///< (Masked) load with compress.
       NeedToGather,      ///< Gather/buildvector node.
       CombinedVectorize, ///< Vectorized node, combined with its user into more
                          ///< complex node like select/cmp to minmax, mul/add to
@@ -3604,6 +3607,9 @@ class BoUpSLP {
       case StridedVectorize:
         dbgs() << "StridedVectorize\n";
         break;
+      case CompressVectorize:
+        dbgs() << "CompressVectorize\n";
+        break;
       case NeedToGather:
         dbgs() << "NeedToGather\n";
         break;
@@ -4819,7 +4825,8 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
     if (Entry->isGather())
       return "color=red";
     if (Entry->State == TreeEntry::ScatterVectorize ||
-        Entry->State == TreeEntry::StridedVectorize)
+        Entry->State == TreeEntry::StridedVectorize ||
+        Entry->State == TreeEntry::CompressVectorize)
       return "color=blue";
     return "";
   }
@@ -5419,6 +5426,157 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
   return Builder.CreateShuffleVector(Vec, Mask);
 }
 
+/// Builds a compress-like mask for shuffles for the given \p PointerOps,
+/// ordered with \p Order.
+/// \return true if the mask represents strided access, false otherwise.
+static bool buildCompressMask(ArrayRef<Value *> PointerOps,
+                              ArrayRef<unsigned> Order, Type *ScalarTy,
+                              const DataLayout &DL, ScalarEvolution &SE,
+                              SmallVectorImpl<int> &CompressMask) {
+  const unsigned Sz = PointerOps.size();
+  CompressMask.assign(Sz, PoisonMaskElem);
+  // The first element is always set.
+  CompressMask[0] = 0;
+  // Check if the mask represents strided access.
+  std::optional<unsigned> Stride = 0;
+  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+  for (unsigned I : seq<unsigned>(1, Sz)) {
+    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+    unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+    CompressMask[I] = Pos;
+    if (!Stride)
+      continue;
+    if (*Stride == 0) {
+      *Stride = Pos;
+      continue;
+    }
+    if (Pos != *Stride * I)
+      Stride.reset();
+  }
+  return Stride.has_value();
+}
+
+/// Checks if the \p VL can be transformed to a (masked)load + compress or
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+    const DominatorTree &DT, const TargetLibraryInfo &TLI,
+    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+    VectorType *&LoadVecTy) {
+  InterleaveFactor = 0;
+  Type *ScalarTy = VL.front()->getType();
+  const unsigned Sz = VL.size();
+  auto *VecTy = getWidenedType(ScalarTy, Sz);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // Check external uses.
+  for (const auto [I, V] : enumerate(VL)) {
+    if (AreAllUsersVectorized(V))
+      continue;
+    InstructionCost ExtractCost =
+        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+    InstructionCost ScalarCost =
+        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+    if (ExtractCost <= ScalarCost)
+      return false;
+  }
+  Value *Ptr0;
+  Value *PtrN;
+  if (Order.empty()) {
+    Ptr0 = PointerOps.front();
+    PtrN = PointerOps.back();
+  } else {
+    Ptr0 = PointerOps[Order.front()];
+    PtrN = PointerOps[Order.back()];
+  }
+  std::optional<int> Diff =
+      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+  if (!Diff)
+    return false;
+  const unsigned MaxRegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+  // Check for very large distances between elements.
+  if (*Diff / Sz >= MaxRegSize / 8)
+    return false;
+  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+  IsMasked = !isSafeToLoadUnconditionally(
+      Ptr0, LoadVecTy, CommonAlignment, DL,
+      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+      &TLI);
+  if (IsMasked && !TTI.isLegalMaskedLoad(LoadVecTy, CommonAlignment,
+                                         LI->getPointerAddressSpace()))
+    return false;
+  // TODO: perform the analysis of each scalar load for better
+  // safe-load-unconditionally analysis.
+  bool IsStrided =
+      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+  assert(CompressMask.size() >= 2 && "At least two elements are required");
+  auto [ScalarGEPCost, VectorGEPCost] =
+      getGEPCosts(TTI, PointerOps, PointerOps.front(),
+                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+  // The cost of scalar loads.
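+  // (Together with the insert/scalarization overhead below this forms the
+  // gather baseline that the vectorized load variants must beat.)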
+  InstructionCost ScalarLoadsCost =
+      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+                      [&](InstructionCost C, Value *V) {
+                        return C + TTI.getInstructionCost(cast<Instruction>(V),
+                                                          CostKind);
+                      }) +
+      ScalarGEPCost;
+  APInt DemandedElts = APInt::getAllOnes(Sz);
+  InstructionCost GatherCost =
+      getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true,
+                               /*Extract=*/false, CostKind) +
+      ScalarLoadsCost;
+  InstructionCost LoadCost = 0;
+  if (IsMasked) {
+    LoadCost =
+        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                                  LI->getPointerAddressSpace(), CostKind);
+  } else {
+    CommonAlignment = LI->getAlign();
+    LoadCost =
+        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                            LI->getPointerAddressSpace(), CostKind);
+  }
+  SmallVector<int> Mask;
+  if (!Order.empty())
+    inversePermutation(Order, Mask);
+  if (IsStrided) {
+    // Check for potential segmented(interleaved) loads.
+    if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
+                                         CommonAlignment,
+                                         LI->getPointerAddressSpace())) {
+      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+          Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
+          CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
+      if (!Mask.empty())
+        InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+                                            VecTy, Mask, CostKind);
+      if (InterleavedCost < GatherCost) {
+        InterleaveFactor = CompressMask[1];
+        return true;
+      }
+    }
+  }
+  if (!Order.empty()) {
+    SmallVector<int> NewMask(Sz, PoisonMaskElem);
+    for (unsigned I : seq<unsigned>(Sz)) {
+      NewMask[I] = CompressMask[Mask[I]];
+    }
+    CompressMask.swap(NewMask);
+  }
+  InstructionCost CompressCost = ::getShuffleCost(
+      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
+  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
+  return TotalVecCost < GatherCost;
+}
+
 BoUpSLP::LoadsState
 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                            SmallVectorImpl<unsigned> &Order,
@@ -5490,9 +5648,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   // Check that the sorted loads are consecutive.
   if (static_cast<unsigned>(*Diff) == Sz - 1)
     return LoadsState::Vectorize;
-  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
-      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
-    return LoadsState::Gather;
   // Simple check if not a strided access - clear order.
   bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
   // Try to generate strided load node if:
@@ -5548,7 +5703,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
         }
       }
     }
+    bool IsMasked;
+    unsigned InterleaveFactor;
+    SmallVector<int> CompressMask;
+    VectorType *LoadVecTy;
+    if (isMaskedLoadCompress(
+            VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+            [&](Value *V) {
+              return areAllUsersVectorized(cast<Instruction>(V),
+                                           UserIgnoreList);
+            },
+            IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+      return LoadsState::CompressVectorize;
   }
+  if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
+      TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
+    return LoadsState::Gather;
   // Correctly identify and compare the cost of loads + shuffles rather than
   // strided/masked gather loads. Returns true if the vectorized + shuffles
   // representation is better than just gather.
@@ -5641,7 +5811,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
   }
   // If the reorder is needed, consider it a high-cost masked gather for now.
   if ((LS == LoadsState::Vectorize ||
-       LS == LoadsState::StridedVectorize) &&
+       LS == LoadsState::StridedVectorize ||
+       LS == LoadsState::CompressVectorize) &&
       !Order.empty() && !isReverseOrder(Order))
     LS = LoadsState::ScatterVectorize;
   States.push_back(LS);
@@ -5706,6 +5877,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                                 CommonAlignment, CostKind) +
                  VectorGEPCost;
           break;
+        case LoadsState::CompressVectorize:
+          VecLdCost += TTI.getMaskedMemoryOpCost(
+                           Instruction::Load, SubVecTy, CommonAlignment,
+                           LI0->getPointerAddressSpace(), CostKind) +
+                       VectorGEPCost +
+                       ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy,
+                                        {}, CostKind);
+          break;
         case LoadsState::ScatterVectorize:
           VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
                                                   LI0->getPointerOperand(),
@@ -6079,7 +6258,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
       return std::nullopt;
   if (TE.State == TreeEntry::SplitVectorize ||
       ((TE.State == TreeEntry::Vectorize ||
-        TE.State == TreeEntry::StridedVectorize) &&
+        TE.State == TreeEntry::StridedVectorize ||
+        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
@@ -6266,7 +6446,8 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
     OrdersType CurrentOrder;
     LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
                                        CurrentOrder, PointerOps);
-    if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
+    if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
+        Res == LoadsState::CompressVectorize)
       return std::move(CurrentOrder);
   }
   // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
@@ -6506,7 +6687,8 @@ void BoUpSLP::reorderTopToBottom() {
       VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
-            TE->State == TreeEntry::SplitVectorize) ||
+            TE->State == TreeEntry::SplitVectorize ||
+            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
@@ -6680,7 +6862,8 @@ void BoUpSLP::reorderTopToBottom() {
         if ((TE->State == TreeEntry::SplitVectorize &&
              TE->ReuseShuffleIndices.empty()) ||
            ((TE->State == TreeEntry::Vectorize ||
-              TE->State == TreeEntry::StridedVectorize) &&
+              TE->State == TreeEntry::StridedVectorize ||
+              TE->State == TreeEntry::CompressVectorize) &&
             (isa<LoadInst, ExtractElementInst, ExtractValueInst>(
                  TE->getMainOp()) ||
              (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp()))))) {
@@ -6728,6 +6911,7 @@ bool BoUpSLP::canReorderOperands(
               return OpData.first == I &&
                      (OpData.second->State == TreeEntry::Vectorize ||
                       OpData.second->State == TreeEntry::StridedVectorize ||
+                      OpData.second->State == TreeEntry::CompressVectorize ||
                       OpData.second->State == TreeEntry::SplitVectorize);
             }))
       continue;
@@ -6742,6 +6926,7 @@ bool BoUpSLP::canReorderOperands(
         // node, just reorder reuses mask.
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
+       TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize &&
        TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
      GatherOps.push_back(TE);
@@ -6752,6 +6937,7 @@ bool BoUpSLP::canReorderOperands(
                  [&Gather, UserTE, I](TreeEntry *TE) {
                    assert(TE->State != TreeEntry::Vectorize &&
                           TE->State != TreeEntry::StridedVectorize &&
+                          TE->State != TreeEntry::CompressVectorize &&
                           TE->State != TreeEntry::SplitVectorize &&
                           "Only non-vectorized nodes are expected.");
                    if (TE->UserTreeIndex.UserTE == UserTE &&
@@ -6788,6 +6974,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
     if (TE->State != TreeEntry::Vectorize &&
         TE->State != TreeEntry::StridedVectorize &&
+        TE->State != TreeEntry::CompressVectorize &&
         TE->State != TreeEntry::SplitVectorize)
       NonVectorized.push_back(TE.get());
     if (std::optional<OrdersType> CurrentOrder =
             getReorderingData(*TE, /*TopToBottom=*/false))
       Queue.push(TE.get());
     if (!(TE->State == TreeEntry::Vectorize ||
           TE->State == TreeEntry::StridedVectorize ||
+          TE->State == TreeEntry::CompressVectorize ||
           TE->State == TreeEntry::SplitVectorize) ||
         !TE->ReuseShuffleIndices.empty())
       GathersToOrders.insert(TE.get());
@@ -6823,6 +7011,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
     for (TreeEntry *TE : OrderedOps) {
       if (!(TE->State == TreeEntry::Vectorize ||
             TE->State == TreeEntry::StridedVectorize ||
+            TE->State == TreeEntry::CompressVectorize ||
             TE->State == TreeEntry::SplitVectorize ||
             (TE->isGather() && GathersToOrders.contains(TE))) ||
           !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
@@ -7117,6 +7306,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       // Gathers are processed separately.
       if (TE->State != TreeEntry::Vectorize &&
           TE->State != TreeEntry::StridedVectorize &&
+          TE->State != TreeEntry::CompressVectorize &&
           TE->State != TreeEntry::SplitVectorize &&
           (TE->State != TreeEntry::ScatterVectorize ||
            TE->ReorderIndices.empty()))
@@ -7149,7 +7339,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
       Data.first->reorderOperands(Mask);
       if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
           Data.first->isAltShuffle() ||
-          Data.first->State == TreeEntry::StridedVectorize) {
+          Data.first->State == TreeEntry::StridedVectorize ||
+          Data.first->State == TreeEntry::CompressVectorize) {
         reorderScalars(Data.first->Scalars, Mask);
         reorderOrder(Data.first->ReorderIndices, MaskOrder,
                      /*BottomOrder=*/true);
@@ -7927,8 +8118,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads(
                 });
             if (It == Slice.end())
               return false;
-            ArrayRef<Value *> VL =
-                VectorizableTree[std::get<0>(P)]->Scalars;
+            const TreeEntry &TE =
+                *VectorizableTree[std::get<0>(P)];
+            ArrayRef<Value *> VL = TE.Scalars;
+            OrdersType Order;
+            SmallVector<Value *> PointerOps;
+            LoadsState State = canVectorizeLoads(
+                VL, VL.front(), Order, PointerOps);
+            if (State == LoadsState::ScatterVectorize ||
+                State == LoadsState::CompressVectorize)
+              return false;
             ConsecutiveNodesSize += VL.size();
             unsigned Start = std::distance(Slice.begin(), It);
             unsigned Sz = Slice.size() - Start;
@@ -8393,23 +8592,44 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
   // treats loading/storing it as an i8 struct. If we vectorize loads/stores
   // from such a struct, we read/write packed bits disagreeing with the
   // unvectorized version.
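+  // Loads that already belong to a gathered-loads tree entry (an entry with
+  // index at or past GatheredLoadsEntriesFirst) stay gathers; they are not
+  // re-vectorized as compress/strided/scatter nodes below.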
+  auto IsGatheredNode = [&]() {
+    if (!GatheredLoadsEntriesFirst)
+      return false;
+    return all_of(VL, [&](Value *V) {
+      if (isa<PoisonValue>(V))
+        return true;
+      return any_of(getTreeEntries(V), [&](const TreeEntry *TE) {
+        return TE->Idx >= *GatheredLoadsEntriesFirst;
+      });
+    });
+  };
   switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
   case LoadsState::Vectorize:
     return TreeEntry::Vectorize;
+  case LoadsState::CompressVectorize:
+    if (!IsGraphTransformMode && !VectorizableTree.empty()) {
+      // Delay slow vectorized nodes for better vectorization attempts.
+      LoadEntriesToVectorize.insert(VectorizableTree.size());
+      return TreeEntry::NeedToGather;
+    }
+    return IsGatheredNode() ? TreeEntry::NeedToGather
+                            : TreeEntry::CompressVectorize;
   case LoadsState::ScatterVectorize:
     if (!IsGraphTransformMode && !VectorizableTree.empty()) {
       // Delay slow vectorized nodes for better vectorization attempts.
       LoadEntriesToVectorize.insert(VectorizableTree.size());
       return TreeEntry::NeedToGather;
     }
-    return TreeEntry::ScatterVectorize;
+    return IsGatheredNode() ? TreeEntry::NeedToGather
+                            : TreeEntry::ScatterVectorize;
   case LoadsState::StridedVectorize:
     if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
       // Delay slow vectorized nodes for better vectorization attempts.
       LoadEntriesToVectorize.insert(VectorizableTree.size());
       return TreeEntry::NeedToGather;
     }
-    return TreeEntry::StridedVectorize;
+    return IsGatheredNode() ? TreeEntry::NeedToGather
+                            : TreeEntry::StridedVectorize;
   case LoadsState::Gather:
 #ifndef NDEBUG
     Type *ScalarTy = VL0->getType();
@@ -9510,6 +9730,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                      << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                  TE->dump());
       break;
+    case TreeEntry::CompressVectorize:
+      // Vectorizing non-consecutive loads with (masked)load + compress.
+      TE = newTreeEntry(VL, TreeEntry::CompressVectorize, Bundle, S,
+                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
+      LLVM_DEBUG(
+          dbgs()
+              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
+          TE->dump());
+      break;
     case TreeEntry::StridedVectorize:
       // Vectorizing non-consecutive loads with `llvm.masked.gather`.
       TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
@@ -12041,6 +12270,8 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
   if (TE.State == TreeEntry::ScatterVectorize ||
       TE.State == TreeEntry::StridedVectorize)
     return TTI::CastContextHint::GatherScatter;
+  if (TE.State == TreeEntry::CompressVectorize)
+    return TTI::CastContextHint::Masked;
   if (TE.State == TreeEntry::Vectorize &&
       TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
     if (TE.ReorderIndices.empty())
@@ -12115,8 +12346,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   }
   InstructionCost CommonCost = 0;
   SmallVector<int> Mask;
-  if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
-      !isReverseOrder(E->ReorderIndices))) {
+  if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
+      (E->State != TreeEntry::StridedVectorize ||
+       !isReverseOrder(E->ReorderIndices))) {
     SmallVector<int> NewMask;
     if (E->getOpcode() == Instruction::Store) {
       // For stores the order is actually a mask.
@@ -12134,7 +12366,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
         ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
   assert((E->State == TreeEntry::Vectorize ||
           E->State == TreeEntry::ScatterVectorize ||
-          E->State == TreeEntry::StridedVectorize) &&
+          E->State == TreeEntry::StridedVectorize ||
+          E->State == TreeEntry::CompressVectorize) &&
         "Unhandled state");
   assert(E->getOpcode() &&
         ((allSameType(VL) && allSameBlock(VL)) ||
@@ -12225,8 +12458,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // Negative value means vectorizing is profitable.
   auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
     assert((E->State == TreeEntry::Vectorize ||
-            E->State == TreeEntry::StridedVectorize) &&
-           "Entry state expected to be Vectorize or StridedVectorize here.");
+            E->State == TreeEntry::StridedVectorize ||
+            E->State == TreeEntry::CompressVectorize) &&
+           "Entry state expected to be Vectorize, StridedVectorize or "
+           "MaskedLoadCompressVectorize here.");
     InstructionCost ScalarCost = 0;
     InstructionCost VecCost = 0;
     std::tie(ScalarCost, VecCost) = getGEPCosts(
@@ -12689,6 +12924,51 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
                                      /*VariableMask=*/false, CommonAlignment,
                                      CostKind);
       break;
     }
+    case TreeEntry::CompressVectorize: {
+      bool IsMasked;
+      unsigned InterleaveFactor;
+      SmallVector<int> CompressMask;
+      VectorType *LoadVecTy;
+      SmallVector<Value *> Scalars(VL.begin(), VL.end());
+      if (!E->ReorderIndices.empty()) {
+        SmallVector<int> Mask(E->ReorderIndices.begin(),
+                              E->ReorderIndices.end());
+        reorderScalars(Scalars, Mask);
+      }
+      SmallVector<Value *> PointerOps(Scalars.size());
+      for (auto [I, V] : enumerate(Scalars))
+        PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+      [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
+          Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
+          *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
+          CompressMask, LoadVecTy);
+      assert(IsVectorized && "Expected to be vectorized");
+      Align CommonAlignment;
+      if (IsMasked)
+        CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+      else
+        CommonAlignment = LI0->getAlign();
+      if (InterleaveFactor) {
+        VecLdCost = TTI->getInterleavedMemoryOpCost(
+            Instruction::Load, LoadVecTy, InterleaveFactor, std::nullopt,
+            CommonAlignment, LI0->getPointerAddressSpace(), CostKind);
+      } else if (IsMasked) {
+        VecLdCost = TTI->getMaskedMemoryOpCost(
+            Instruction::Load, LoadVecTy, CommonAlignment,
+            LI0->getPointerAddressSpace(), CostKind);
+        // TODO: include this cost into CommonCost.
+        VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+                                      LoadVecTy, CompressMask, CostKind);
+      } else {
+        VecLdCost = TTI->getMemoryOpCost(
+            Instruction::Load, LoadVecTy, CommonAlignment,
+            LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
+        // TODO: include this cost into CommonCost.
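+        // Cost of the shuffle that compresses the loaded wide vector down to
+        // the requested lanes.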
+        VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
+                                      LoadVecTy, CompressMask, CostKind);
+      }
+      break;
+    }
     case TreeEntry::ScatterVectorize: {
       Align CommonAlignment =
           computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
@@ -12978,6 +13258,7 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   if (VectorizableTree.size() == 1 &&
       (VectorizableTree[0]->State == TreeEntry::Vectorize ||
        VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
+       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
@@ -13001,7 +13282,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
   if (VectorizableTree[0]->isGather() ||
       (VectorizableTree[1]->isGather() &&
        VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
-       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
+       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
+       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
     return false;
 
   return true;
@@ -16658,7 +16940,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
         ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                  E->ReorderIndices.size());
     ShuffleBuilder.add(V, Mask);
-  } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
+  } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
+             E->State == TreeEntry::CompressVectorize) {
     ShuffleBuilder.addOrdered(V, {});
   } else {
     ShuffleBuilder.addOrdered(V, E->ReorderIndices);
@@ -17183,6 +17466,46 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       Value *PO = LI->getPointerOperand();
       if (E->State == TreeEntry::Vectorize) {
         NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
+      } else if (E->State == TreeEntry::CompressVectorize) {
+        bool IsMasked;
+        unsigned InterleaveFactor;
+        SmallVector<int> CompressMask;
+        VectorType *LoadVecTy;
+        SmallVector<Value *> Scalars(E->Scalars.begin(), E->Scalars.end());
+        if (!E->ReorderIndices.empty()) {
+          SmallVector<int> Mask(E->ReorderIndices.begin(),
+                                E->ReorderIndices.end());
+          reorderScalars(Scalars, Mask);
+        }
+        SmallVector<Value *> PointerOps(Scalars.size());
+        for (auto [I, V] : enumerate(Scalars))
+          PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
+            Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
+            *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
+            CompressMask, LoadVecTy);
+        assert(IsVectorized && "Expected to be vectorized");
+        Align CommonAlignment;
+        if (IsMasked)
+          CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
+        else
+          CommonAlignment = LI->getAlign();
+        if (IsMasked) {
+          SmallVector<Constant *> MaskValues(
+              getNumElements(LoadVecTy) / getNumElements(LI->getType()),
+              ConstantInt::getFalse(VecTy->getContext()));
+          for (int I : CompressMask)
+            MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
+          Constant *MaskValue = ConstantVector::get(MaskValues);
+          NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
+                                           MaskValue);
+        } else {
+          NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
+        }
+        NewLI = ::propagateMetadata(NewLI, E->Scalars);
+        // TODO: include this cost into CommonCost.
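+        // Select the originally requested lanes out of the wide (masked) load.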
+ NewLI = + cast(Builder.CreateShuffleVector(NewLI, CompressMask)); } else if (E->State == TreeEntry::StridedVectorize) { Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); @@ -17252,7 +17575,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Align CommonAlignment = computeCommonAlignment(E->Scalars); NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } - Value *V = ::propagateMetadata(NewLI, E->Scalars); + Value *V = E->State == TreeEntry::CompressVectorize + ? NewLI + : ::propagateMetadata(NewLI, E->Scalars); V = FinalShuffle(V, E); E->VectorizedValue = V; @@ -17854,11 +18179,14 @@ Value *BoUpSLP::vectorizeTree( ArrayRef UseEntries = getTreeEntries(U); return !UseEntries.empty() && (E->State == TreeEntry::Vectorize || - E->State == TreeEntry::StridedVectorize) && + E->State == TreeEntry::StridedVectorize || + E->State == TreeEntry::CompressVectorize) && any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) { return (UseEntry->State == TreeEntry::Vectorize || UseEntry->State == - TreeEntry::StridedVectorize) && + TreeEntry::StridedVectorize || + UseEntry->State == + TreeEntry::CompressVectorize) && doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll index aa9195f8c48ce..b99a1c2d83394 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entries-shuffled-diff-sizes.ll @@ -15,19 +15,16 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc ninf nsz arcp contract afn float [[GEPLOAD1612]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP4]], [[TMP0]] ; CHECK-NEXT: store <16 x float> [[TMP6]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2928), align 16 -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1288), align 16 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1296), align 16 -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1304), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1272), align 16 ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 1620), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP12]], <8 x float> [[TMP13]], i64 8) -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP15]], <4 x float> [[TMP7]], i64 0) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> 
@llvm.vector.insert.v16f32.v2f32(<16 x float> [[TMP16]], <2 x float> [[TMP9]], i64 6) +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul reassoc ninf nsz arcp contract afn <16 x float> [[TMP14]], [[TMP17]] -; CHECK-NEXT: store <16 x float> [[TMP18]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP18]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: store <16 x float> [[TMP15]], ptr getelementptr ([16000 x i8], ptr @GLOB, i64 0, i64 2992), align 16 ; CHECK-NEXT: ret void ; alloca_0: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll index 12263b065d89c..80ba7a40fb193 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep-nodes-with-non-gep-inst.ll @@ -9,17 +9,9 @@ define void @test() { ; CHECK-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[COND_IN_V]], align 8 -; CHECK-NEXT: [[BV:%.*]] = icmp eq i64 [[V]], 0 -; CHECK-NEXT: [[IN_1:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 4 -; CHECK-NEXT: [[V_1:%.*]] = load i64, ptr [[IN_1]], align 8 -; CHECK-NEXT: [[BV_1:%.*]] = icmp eq i64 [[V_1]], 0 -; CHECK-NEXT: [[IN_2:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 8 -; CHECK-NEXT: [[V_2:%.*]] = load i64, ptr [[IN_2]], align 8 -; CHECK-NEXT: [[BV_2:%.*]] = icmp eq i64 [[V_2]], 0 -; CHECK-NEXT: [[IN_3:%.*]] = getelementptr i64, ptr [[COND_IN_V]], i64 12 -; CHECK-NEXT: [[V_3:%.*]] = load i64, ptr [[IN_3]], align 8 -; CHECK-NEXT: [[BV_3:%.*]] = icmp eq i64 [[V_3]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret void ; ; CHECK-SLP-THRESHOLD-LABEL: define void @test @@ -28,11 +20,9 @@ define void @test() { ; CHECK-SLP-THRESHOLD-NEXT: [[COND_IN_V:%.*]] = select i1 false, ptr null, ptr null ; CHECK-SLP-THRESHOLD-NEXT: br label [[BB:%.*]] ; CHECK-SLP-THRESHOLD: bb: -; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[COND_IN_V]], i32 0 -; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = getelementptr i64, <4 x ptr> [[TMP1]], <4 x i64> -; CHECK-SLP-THRESHOLD-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> [[TMP2]], i32 8, <4 x i1> splat (i1 true), <4 x i64> poison) -; CHECK-SLP-THRESHOLD-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], zeroinitializer +; CHECK-SLP-THRESHOLD-NEXT: [[TMP0:%.*]] = call <13 x i64> @llvm.masked.load.v13i64.p0(ptr [[COND_IN_V]], i32 8, <13 x i1> , <13 x i64> poison) +; CHECK-SLP-THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <13 x i64> [[TMP0]], <13 x i64> poison, <4 x i32> +; CHECK-SLP-THRESHOLD-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer ; CHECK-SLP-THRESHOLD-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll index f249394c91788..a9c0eb3f9f2b9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47623.ll @@ -24,20 +24,16 @@ define void @foo() { ; SSE-NEXT: ret void ; ; AVX-LABEL: @foo( -; AVX-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 -; AVX-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; AVX-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX-NEXT: ret void ; ; AVX512-LABEL: @foo( -; AVX512-NEXT: [[TMP1:%.*]] = load i32, ptr @b, align 16 -; AVX512-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr inbounds nuw (i8, ptr @b, i64 8), align 8 -; AVX512-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP1]], i64 0 -; AVX512-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i64 1 -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr @b, align 16 +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <2 x i32> +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; AVX512-NEXT: store <8 x i32> [[TMP5]], ptr @a, align 16 ; AVX512-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 925c334cb5f20..a0e52c13ec621 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa 
[[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = 
insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 -; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 +; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: 
[[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 -; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 +; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: store 
<8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x 
i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index dc1ba4ec7e7ab..6c5638819dcea 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -164,36 +164,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX512F-LABEL: @gather_load_2( ; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512F-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512F-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512F-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( ; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 4 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, 
ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 -; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 12 -; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 -; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 -; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], -; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = call <10 x i32> @llvm.masked.load.v10i32.p0(ptr nonnull [[TMP3]], i32 4, <10 x i1> , <10 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = shufflevector <10 x i32> [[TMP4]], <10 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -290,49 +274,30 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 60 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 72 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 36 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 84 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], 
i32 [[TMP15]], i64 6 -; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24 +; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_3( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512F-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP6]], -; AVX512VL-NEXT: store <8 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load i32, ptr %1, align 4, !tbaa !2 @@ -447,49 +412,30 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 44 -; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 16 -; 
AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 60 -; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 72 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 36 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 24 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[T1]], i64 84 -; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T19:%.*]] = load i32, ptr [[T18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, ptr [[T22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, ptr [[T26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, ptr [[T30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0 -; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2 -; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3 -; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5 -; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6 -; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24 +; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> , <16 x i32> poison), !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> ; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], ; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_4( -; AVX512F-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512F-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512F-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512F-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512F-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; 
AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <8 x ptr> poison, ptr [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr i32, <8 x ptr> [[TMP2]], <8 x i64> -; AVX512VL-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP3]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], -; AVX512VL-NEXT: store <8 x i32> [[TMP5]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> , <22 x i32> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %t5 = getelementptr inbounds i32, ptr %t0, i64 1 @@ -687,25 +633,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512F-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512F-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512F-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <8 x ptr> [[TMP4]], <8 x i64> -; AVX512VL-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP6]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison), !tbaa [[TBAA0]] +; 
AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] -; AVX512VL-NEXT: store <8 x float> [[TMP9]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> poison, <8 x i32> +; AVX512VL-NEXT: store <8 x float> [[TMP10]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = load float, ptr %1, align 4, !tbaa !2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index 0807a1bd4cdea..bbb1b87fc3dfa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -5,10 +5,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[OFF0_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[OFF0_1]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <8 x ptr> [[TMP5]], <8 x i32> [[TMP3]] @@ -22,9 +21,9 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-1' + ; YAML-NEXT: - Cost: '-10' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '8' + ; YAML-NEXT: - TreeSize: '5' entry: %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 %idx0 = load i32, ptr %off0.1, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 5bd954e741d43..02058b1fe8578 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,16 +5,17 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: 
[[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP6]], <2 x i32> [[TMP10]], i64 0) ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> @@ -64,16 +65,17 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP12]], <2 x i32> [[TMP10]], i64 0) ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer @@ -125,16 +127,17 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] 
= getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> @@ -184,16 +187,17 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr null, i32 4, <8 x i1> , <8 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP11]], <2 x i32> [[TMP3]], i64 0) ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index c7c67d31f9ded..9369a5962e643 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -3,14 +3,11 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x ptr> [[TMP2]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr 
float, <8 x ptr> [[TMP3]], <8 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> splat (i1 true), <8 x float> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> , <16 x float> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: store <16 x float> [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index c114c5dee78e9..63dbf3ce78c32 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -8,14 +8,10 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> , <6 x float> poison) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll new file mode 100644 index 0000000000000..843d1cf46ffcc --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-unknown -mcpu=znver2 < %s | FileCheck %s + +%struct.ae = type { %struct.g } +%struct.g = type { [11 x double] } + +define void @test() { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[M1]], i64 8 +; CHECK-NEXT: [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16 +; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40 +; CHECK-NEXT: [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <7 x double>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <7 x double> [[TMP4]], <7 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double> +; CHECK-NEXT: store <4 x double> [[TMP11]], ptr [[ARRAYIDX_I4]], align 8 +; CHECK-NEXT: ret void +; +entry: + %m1 = alloca %struct.ae, align 8 + %0 = getelementptr i8, ptr %m1, i64 8 + %1 = load double, ptr %0, align 8 + %arrayidx.i1 = getelementptr i8, ptr %m1, i64 24 + %2 = load double, ptr %arrayidx.i1, align 8 + %add.i2 = fadd double %1, %2 + %conv.i3 = fptosi double %add.i2 to i32 + %conv2.i3 = sitofp i32 %conv.i3 to double + %3 = load double, ptr %m1, align 8 + %arrayidx.i1.i = getelementptr i8, ptr %m1, i64 48 + %4 = load double, ptr %arrayidx.i1.i, align 8 + %add.i1 = fadd double %3, %4 + %conv.i2 = fptosi double %add.i1 to i32 + %conv2.i2 = sitofp i32 %conv.i2 to double + %arrayidx.i4 = getelementptr i8, ptr null, i64 16 + store double %conv2.i2, ptr %arrayidx.i4, align 8 + %5 = getelementptr i8, ptr null, i64 24 + store double %conv2.i3, ptr %5, align 8 + %arrayidx.i5 = getelementptr i8, ptr %m1, i64 32 + %6 = load double, ptr %arrayidx.i5, align 8 + %add.i1.i.i = fadd double %6, %6 + %conv.i1.i = fptosi double %add.i1.i.i to i32 + %conv2.i1.i = sitofp i32 %conv.i1.i to double + %arrayidx.i1.i.i = getelementptr i8, ptr null, i64 32 + store double %conv2.i1.i, ptr %arrayidx.i1.i.i, align 8 + %arrayidx.i5.i = getelementptr i8, ptr %m1, i64 40 + %7 = load double, ptr %arrayidx.i5.i, align 8 + %arrayidx.i.i.i.i2.i1 = getelementptr i8, ptr %m1, i64 56 + %8 = load double, ptr %arrayidx.i.i.i.i2.i1, align 8 + %add.i1.i.i.i = fadd double %7, %8 + %conv.i1.i.i = fptosi double %add.i1.i.i.i to i32 + %conv2.i1.i.i = sitofp i32 %conv.i1.i.i to double + %arrayidx.i.i.i3.i1 = getelementptr i8, ptr null, i64 40 + store double %conv2.i1.i.i, ptr %arrayidx.i.i.i3.i1, align 8 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index 1294a87ff6967..d487e3616956c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,16 +5,12 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG:%.*]], i32 8, <5 x i1> , <5 x i64> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x i64> [[TMP1]], <5 x i64> poison, <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: -; CHECK-NEXT: [[ARG_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <5 x i64> @llvm.masked.load.v5i64.p0(ptr [[ARG]], i32 8, <5 x i1> , <5 x i64> poison) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <5 x i64> [[TMP3]], <5 x i64> poison, <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: ; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index e1e80d96d416d..b4996eb58b47e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -19,11 +19,11 @@ define void @test() { ; CHECK-NEXT: [[SIN1:%.*]] = call fast double @llvm.sin.f64(double [[A3]]) ; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) ; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[SIN1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP13]], double [[SIN3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 202ec9633712f..4cf2f99e60aeb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -8,15 +8,9 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: entry: ; 
CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP1:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr [[ARRAYIDX20]], i32 4, <12 x i1> , <12 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i32> [[TMP1]], <12 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] ; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index 8fe7d15b69cb1..fdc0bc0e00eb8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,16 +4,15 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i32, ptr [[ADDR:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[ADDR]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <15 x i32> [[TMP0]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> splat (i1 true), <8 x i32> poison) +; CHECK-NEXT: [[TMP11:%.*]] = call <15 x i32> @llvm.masked.load.v15i32.p0(ptr [[GEP2]], i32 8, <15 x i1> , <15 x i32> poison) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <15 x i32> [[TMP11]], <15 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x 
i32> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> splat (i1 true), <8 x i32> poison) ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] From bf1d27889b5831ed3e072a9223bdac098bf71af3 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Fri, 4 Apr 2025 11:25:24 -0700 Subject: [PATCH 0680/1029] [WebKit checkers] Treat Objective-C message send return value as safe (#133605) Objective-C selectors are supposed to return autoreleased object. Treat these return values as safe. --- .../Checkers/WebKit/RawPtrRefCallArgsChecker.cpp | 8 ++++++++ .../Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp | 8 ++++++++ .../Analysis/Checkers/WebKit/objc-mock-types.h | 5 +++++ .../Checkers/WebKit/unretained-call-args-arc.mm | 9 +++++++++ .../Checkers/WebKit/unretained-call-args.mm | 9 +++++++++ .../Checkers/WebKit/unretained-local-vars.mm | 15 +++++++++++++-- 6 files changed, 52 insertions(+), 2 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp index ce8f0df697b06..13088920cfa19 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp @@ -47,6 +47,7 @@ class RawPtrRefCallArgsChecker virtual std::optional isUnsafePtr(QualType) const = 0; virtual bool isSafePtr(const CXXRecordDecl *Record) const = 0; virtual bool isSafePtrType(const QualType type) const = 0; + virtual bool isSafeExpr(const Expr *) const { return false; } virtual const char *ptrKind() const = 0; void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, @@ -233,6 +234,8 @@ class RawPtrRefCallArgsChecker return true; if (EFA.isACallToEnsureFn(ArgOrigin)) return true; + if (isSafeExpr(ArgOrigin)) + return true; return false; }); } @@ -469,6 +472,11 @@ class UnretainedCallArgsChecker final : public RawPtrRefCallArgsChecker { return isRetainPtrType(type); } + bool isSafeExpr(const Expr *E) const final { + return ento::cocoa::isCocoaObjectRef(E->getType()) && + isa(E); + } + const char *ptrKind() const final { return "unretained"; } }; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp index d413e33a490c5..9975d1a91b681 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp @@ -179,6 +179,7 @@ class RawPtrRefLocalVarsChecker virtual std::optional isUnsafePtr(const QualType T) const = 0; virtual bool isSafePtr(const CXXRecordDecl *) const = 0; virtual bool isSafePtrType(const QualType) const = 0; + virtual bool isSafeExpr(const Expr *) const { return false; } virtual const char *ptrKind() const = 0; void checkASTDecl(const TranslationUnitDecl *TUD, AnalysisManager &MGR, @@ -300,6 +301,9 @@ class RawPtrRefLocalVarsChecker if (EFA.isACallToEnsureFn(InitArgOrigin)) return true; + if (isSafeExpr(InitArgOrigin)) + return true; + if (auto *Ref = llvm::dyn_cast(InitArgOrigin)) { if (auto *MaybeGuardian = dyn_cast_or_null(Ref->getFoundDecl())) { @@ -426,6 +430,10 @@ class UnretainedLocalVarsChecker final : public RawPtrRefLocalVarsChecker { bool isSafePtrType(const QualType type) const final { return isRetainPtrType(type); } + bool isSafeExpr(const Expr *E) const final { + return ento::cocoa::isCocoaObjectRef(E->getType()) && + isa(E); + } 
const char *ptrKind() const final { return "unretained"; } }; diff --git a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h index ef46a7c0a2925..059a203e3b0d1 100644 --- a/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h +++ b/clang/test/Analysis/Checkers/WebKit/objc-mock-types.h @@ -71,6 +71,7 @@ __attribute__((objc_root_class)) + (Class) superclass; - (instancetype) init; - (instancetype)retain; +- (instancetype)autorelease; - (void)release; - (BOOL)isKindOfClass:(Class)aClass; @end @@ -221,6 +222,10 @@ template struct RetainPtr { operator PtrType() const { return t; } operator bool() const { return t; } +#if !__has_feature(objc_arc) + PtrType autorelease() { [[clang::suppress]] return [t autorelease]; } +#endif + private: CFTypeRef toCFTypeRef(id ptr) { return (__bridge CFTypeRef)ptr; } CFTypeRef toCFTypeRef(const void* ptr) { return (CFTypeRef)ptr; } diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm index eb4735da60a05..f1f4d912663aa 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args-arc.mm @@ -18,6 +18,7 @@ void foo() { @interface AnotherObj : NSObject - (void)foo:(SomeObj *)obj; +- (SomeObj *)getSomeObj; @end @implementation AnotherObj @@ -27,4 +28,12 @@ - (void)foo:(SomeObj*)obj { CFArrayAppendValue(provide_cf(), nullptr); // expected-warning@-1{{Call argument for parameter 'theArray' is unretained and unsafe [alpha.webkit.UnretainedCallArgsChecker]}} } + +- (SomeObj *)getSomeObj { + return provide(); +} + +- (void)doWorkOnSomeObj { + [[self getSomeObj] doWork]; +} @end diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm index 55e795ee9a598..dd21864300387 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm @@ -405,6 +405,7 @@ void idcf(CFTypeRef obj) { @interface TestObject : NSObject - (void)doWork:(NSString *)msg, ...; - (void)doWorkOnSelf; +- (SomeObj *)getSomeObj; @end @implementation TestObject @@ -421,4 +422,12 @@ - (void)doWorkOnSelf { [self doWork:@"hello", RetainPtr { provide() }.get(), RetainPtr { provide_cf() }.get()]; } +- (SomeObj *)getSomeObj { + return RetainPtr(provide()).autorelease(); +} + +- (void)doWorkOnSomeObj { + [[self getSomeObj] doWork]; +} + @end diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm index 0a3d9e54fa024..a71a80ea3d647 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars.mm @@ -14,8 +14,9 @@ void bar(SomeObj *) {} } // namespace raw_ptr namespace pointer { +SomeObj *provide(); void foo_ref() { - SomeObj *bar = [[SomeObj alloc] init]; + SomeObj *bar = provide(); // expected-warning@-1{{Local variable 'bar' is unretained and unsafe [alpha.webkit.UnretainedLocalVarsChecker]}} [bar doWork]; } @@ -387,6 +388,7 @@ unsigned ccf(CFTypeRef obj) { } // ptr_conversion bool doMoreWorkOpaque(OtherObj*); +SomeObj* provide(); @implementation OtherObj - (instancetype)init { @@ -397,4 +399,13 @@ - (instancetype)init { - (void)doMoreWork:(OtherObj *)other { doMoreWorkOpaque(other); } -@end \ No newline at end of file + +- (SomeObj*)getSomeObj { + return 
RetainPtr(provide()).autorelease(); +} + +- (void)storeSomeObj { + auto *obj = [self getSomeObj]; + [obj doWork]; +} +@end From 1d88cfcbb881a921f2d7676e108fe420d9f23ad9 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 4 Apr 2025 18:29:21 +0000 Subject: [PATCH 0681/1029] [gn build] Port 46e2c07fa28b --- llvm/utils/gn/secondary/lldb/source/ValueObject/BUILD.gn | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/lldb/source/ValueObject/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/ValueObject/BUILD.gn index 5d36cf0e1e053..21b06af6f09c1 100644 --- a/llvm/utils/gn/secondary/lldb/source/ValueObject/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/ValueObject/BUILD.gn @@ -24,7 +24,10 @@ static_library("ValueObject") { ] include_dirs = [ ".." ] sources = [ + "DILAST.cpp", + "DILEval.cpp", "DILLexer.cpp", + "DILParser.cpp", "ValueObject.cpp", "ValueObjectCast.cpp", "ValueObjectChild.cpp", From 22921540cd3d46d75f746386cd6ce2cb1217bf95 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 4 Apr 2025 18:29:22 +0000 Subject: [PATCH 0682/1029] [gn build] Port 7d3dfc862d28 --- .../gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn index 96db1ab43e76e..858dc6070cc63 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/JITLink/BUILD.gn @@ -41,6 +41,9 @@ static_library("JITLink") { "MachOLinkGraphBuilder.cpp", "MachO_arm64.cpp", "MachO_x86_64.cpp", + "XCOFF.cpp", + "XCOFFLinkGraphBuilder.cpp", + "XCOFF_ppc64.cpp", "aarch32.cpp", "aarch64.cpp", "i386.cpp", From e2092a430aadd731c44003d334dd292263336e81 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 4 Apr 2025 18:29:23 +0000 Subject: [PATCH 0683/1029] [gn build] Port da69eb75cbc6 --- llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn index b7234bc995340..63bf7268db306 100644 --- a/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/ASTMatchers/BUILD.gn @@ -10,5 +10,6 @@ static_library("ASTMatchers") { "ASTMatchFinder.cpp", "ASTMatchersInternal.cpp", "GtestMatchers.cpp", + "LowLevelHelpers.cpp", ] } From e4cbb7780bdef33bdedb1d66488586e07d3764a8 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 4 Apr 2025 11:42:21 -0700 Subject: [PATCH 0684/1029] [BOLT][AArch64] Fix symbolization of unoptimized TLS access (#134332) TLS relocations may not have a valid BOLT symbol associated with them. While symbolizing the operand, we were checking for the symbol value, and since there was no symbol the check resulted in a crash. Handle TLS case while performing operand symbolization on AArch64. 
--- bolt/lib/Rewrite/RewriteInstance.cpp | 7 ++-- .../Target/AArch64/AArch64MCSymbolizer.cpp | 35 +++++++++++++------ bolt/test/AArch64/tls.c | 8 +++++ 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 3217dd4324bc7..23faa92642d01 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2827,9 +2827,10 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection, if (SymbolAddress == 0) ReferencedSymbol = BC->registerNameAtAddress(SymbolName, 0, 0, 0); - LLVM_DEBUG(dbgs() << "BOLT-DEBUG: forcing relocation against symbol " - << ReferencedSymbol->getName() << " with addend " - << Addend << '\n'); + LLVM_DEBUG( + dbgs() << "BOLT-DEBUG: forcing relocation against symbol " + << (ReferencedSymbol ? ReferencedSymbol->getName() : "") + << " with addend " << Addend << '\n'); } else if (ReferencedBF) { ReferencedSymbol = ReferencedBF->getSymbol(); uint64_t RefFunctionOffset = 0; diff --git a/bolt/lib/Target/AArch64/AArch64MCSymbolizer.cpp b/bolt/lib/Target/AArch64/AArch64MCSymbolizer.cpp index 772328f84c97a..28b4fc0705eaf 100644 --- a/bolt/lib/Target/AArch64/AArch64MCSymbolizer.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCSymbolizer.cpp @@ -119,26 +119,39 @@ AArch64MCSymbolizer::adjustRelocation(const Relocation &Rel, // The ADRP+LDR sequence was converted into ADRP+ADD. We are looking at the // second instruction and have to use the relocation type for ADD. AdjustedRel.Type = ELF::R_AARCH64_ADD_ABS_LO12_NC; - } else { - // For instructions that reference GOT, ignore the referenced symbol and - // use value at the relocation site. FixRelaxationPass will look at - // instruction pairs and will perform necessary adjustments. + return AdjustedRel; + } + + // ADRP is a special case since the linker can leave the instruction opcode + // intact and modify only the operand. We are doing our best to detect when + // such conversion has happened without looking at the next instruction. + // + // If we detect that a page referenced by the ADRP cannot belong to GOT, and + // that it matches the symbol from the relocation, then we can be certain + // that the linker converted the GOT reference into the local one. Otherwise, + // we leave the disambiguation resolution to FixRelaxationPass. + // + // Note that ADRP relaxation described above cannot happen for TLS relocation. + // Since TLS relocations may not even have a valid symbol (not supported by + // BOLT), we explicitly exclude them from the check. + if (BC.MIB->isADRP(Inst) && Rel.Addend == 0 && !Relocation::isTLS(Rel.Type)) { ErrorOr SymbolValue = BC.getSymbolValue(*Rel.Symbol); assert(SymbolValue && "Symbol value should be set"); const uint64_t SymbolPageAddr = *SymbolValue & ~0xfffULL; - // Check if defined symbol and GOT are on the same page. If they are not, - // disambiguate the operand. - if (BC.MIB->isADRP(Inst) && Rel.Addend == 0 && - SymbolPageAddr == Rel.Value && + if (SymbolPageAddr == Rel.Value && !isPageAddressValidForGOT(SymbolPageAddr)) { AdjustedRel.Type = ELF::R_AARCH64_ADR_PREL_PG_HI21; - } else { - AdjustedRel.Symbol = BC.registerNameAtAddress("__BOLT_got_zero", 0, 0, 0); - AdjustedRel.Addend = Rel.Value; + return AdjustedRel; } } + // For instructions that reference GOT, ignore the referenced symbol and + // use value at the relocation site. FixRelaxationPass will look at + // instruction pairs and will perform necessary adjustments. 
+ AdjustedRel.Symbol = BC.registerNameAtAddress("__BOLT_got_zero", 0, 0, 0); + AdjustedRel.Addend = Rel.Value; + return AdjustedRel; } diff --git a/bolt/test/AArch64/tls.c b/bolt/test/AArch64/tls.c index 3aa33777114ad..b531811f679ff 100644 --- a/bolt/test/AArch64/tls.c +++ b/bolt/test/AArch64/tls.c @@ -34,3 +34,11 @@ int main() { // RUN: -target aarch64-linux -fuse-ld=lld \ // RUN: -nostdlib // RUN: llvm-bolt %t_pie.exe -o %t.bolt + +// RUN: %clang %cflags -fPIC -shared %s -o %t.so -Wl,-q -fuse-ld=lld +// RUN: llvm-objdump -d -r --disassemble-symbols=main %t.so | FileCheck %s +// RUN: llvm-bolt %t.so -o %t.bolt.so + +// Verify that unoptimized TLS access was generated for shared object. +// CHECK: adrp x0 +// CHECK-NEXT: R_AARCH64_TLSDESC_ADR_PAGE21 tbssstruct From 50f0b30cffa72129a3179bd9ac83692114f6b19f Mon Sep 17 00:00:00 2001 From: Sirraide Date: Fri, 4 Apr 2025 20:54:13 +0200 Subject: [PATCH 0685/1029] [Clang] [Sema] Allow static assertions in the first part of a `for` loop in C (#134415) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No release note for this one because the one added by #129737 already mentions ‘non-variable declarations’. Fixes #56471. --- clang/lib/Parse/ParseStmt.cpp | 8 +++++++- clang/test/C/C11/n1330.c | 10 ++++------ clang/test/Sema/for.c | 5 +++++ clang/test/SemaCXX/for-static-assert.cpp | 7 +++++++ 4 files changed, 23 insertions(+), 7 deletions(-) create mode 100644 clang/test/SemaCXX/for-static-assert.cpp diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 150b2879fc94f..e8ec140fbe3e5 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -2142,7 +2142,13 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) { } DeclGroupPtrTy DG; SourceLocation DeclStart = Tok.getLocation(), DeclEnd; - if (Tok.is(tok::kw_using)) { + if (!getLangOpts().CPlusPlus && + Tok.isOneOf(tok::kw_static_assert, tok::kw__Static_assert)) { + ProhibitAttributes(attrs); + Decl *D = ParseStaticAssertDeclaration(DeclEnd); + DG = Actions.ConvertDeclToDeclGroup(D); + FirstPart = Actions.ActOnDeclStmt(DG, DeclStart, Tok.getLocation()); + } else if (Tok.is(tok::kw_using)) { DG = ParseAliasDeclarationInInitStatement(DeclaratorContext::ForInit, attrs); FirstPart = Actions.ActOnDeclStmt(DG, DeclStart, Tok.getLocation()); diff --git a/clang/test/C/C11/n1330.c b/clang/test/C/C11/n1330.c index 153f1994192dd..f8de2c1557421 100644 --- a/clang/test/C/C11/n1330.c +++ b/clang/test/C/C11/n1330.c @@ -43,14 +43,12 @@ void test(void) { _Static_assert(1, "this works"); _Static_assert(0, "this fails"); // expected-error {{static assertion failed: this fails}} - // The use of a _Static_assert in a for loop declaration is prohibited per - // 6.8.5p3 requiring the declaration to only declare identifiers for objects + // While the use of a _Static_assert in a for loop declaration is prohibited per + // 6.8.5p3 (requiring the declaration to only declare identifiers for objects // having auto or register storage class; a static assertion does not declare - // an identifier nor an object. - // FIXME: this diagnostic is pretty terrible. + // an identifier nor an object), we permit it as an extension. 
int i = 0; - for (_Static_assert(1, "this should not compile"); i < 10; ++i) // expected-error {{expected identifier or '('}} \ - expected-error {{expected ';' in 'for' statement specifier}} + for (_Static_assert(1, "this should compile"); i < 10; ++i) ; // Ensure that only an integer constant expression can be used as the diff --git a/clang/test/Sema/for.c b/clang/test/Sema/for.c index 33aaf7a074ad3..e16169aac0c4c 100644 --- a/clang/test/Sema/for.c +++ b/clang/test/Sema/for.c @@ -24,3 +24,8 @@ void b10(void) { for (typedef struct { int i; } (*s)(struct { int j; });;); } /* void b11 (void) { for (static _Thread_local struct { int i; } s;s.i;); } /* c11-warning {{declaration of non-local variable in 'for' loop is a C23 extension}} c23-warning {{declaration of non-local variable in 'for' loop is incompatible with C standards before C23}} */ #endif + +void b12(void) { + for(_Static_assert(1, "");;) {} /* c11-warning {{non-variable declaration in 'for' loop is a C23 extension}} + c23-warning {{non-variable declaration in 'for' loop is incompatible with C standards before C23}} */ +} diff --git a/clang/test/SemaCXX/for-static-assert.cpp b/clang/test/SemaCXX/for-static-assert.cpp new file mode 100644 index 0000000000000..f08044324e13b --- /dev/null +++ b/clang/test/SemaCXX/for-static-assert.cpp @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// C permits a 'static_assert' in the first part of a 'for' loop +// whereas C++ does not. +void f() { + for(static_assert(true);;) {} // expected-error {{expected expression}} +} From a97f73405f8e074263a0ed2dd2b8c87c014f46d9 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 4 Apr 2025 14:56:26 -0400 Subject: [PATCH 0686/1029] [libc++] Fix deployment targets that were incorrectly bumped (#134278) When I introduced the various `_LIBCPP_INTRODUCED_IN_LLVM_XY_ATTRIBUTE` macros in 182f5e9b2f03, I tried to correlate them to the right OS versions, but it seems that I made a few mistakes. This wasn't caught in the CI because we don't test back-deployment that far. 
rdar://148405946 --- libcxx/include/__configuration/availability.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libcxx/include/__configuration/availability.h b/libcxx/include/__configuration/availability.h index 7e2ad79378ccf..80c632646900b 100644 --- a/libcxx/include/__configuration/availability.h +++ b/libcxx/include/__configuration/availability.h @@ -171,10 +171,10 @@ __attribute__((availability(driverkit, strict, introduced = 23.0))) // LLVM 15 -# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130400) || \ - (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160500) || \ - (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160500) || \ - (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90500) || \ +# if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 130300) || \ + (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 160300) || \ + (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 160300) || \ + (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 90300) || \ (defined(__ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_BRIDGE_OS_VERSION_MIN_REQUIRED__ < 70500) || \ (defined(__ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_DRIVERKIT_VERSION_MIN_REQUIRED__ < 220400) # define _LIBCPP_INTRODUCED_IN_LLVM_15 0 @@ -182,10 +182,10 @@ # define _LIBCPP_INTRODUCED_IN_LLVM_15 1 # endif # define _LIBCPP_INTRODUCED_IN_LLVM_15_ATTRIBUTE \ - __attribute__((availability(macos, strict, introduced = 13.4))) \ - __attribute__((availability(ios, strict, introduced = 16.5))) \ - __attribute__((availability(tvos, strict, introduced = 16.5))) \ - __attribute__((availability(watchos, strict, introduced = 9.5))) \ + __attribute__((availability(macos, strict, introduced = 13.3))) \ + __attribute__((availability(ios, strict, introduced = 16.3))) \ + __attribute__((availability(tvos, strict, introduced = 16.3))) \ + __attribute__((availability(watchos, strict, introduced = 9.3))) \ __attribute__((availability(bridgeos, strict, introduced = 7.5))) \ __attribute__((availability(driverkit, strict, introduced = 22.4))) From a2d983cffba87f9f35ededf7a2d6515d3698216e Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 4 Apr 2025 15:01:36 -0400 Subject: [PATCH 0687/1029] Improve diagnostic wording for invalid callback attribute uses (#134423) We were previously telling the user how many arguments were passed to the attribute rather than saying how many arguments were expected to be passed to the callback function. This rewords the diagnostic to hopefully be a bit more clear. 
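For illustration (function and parameter names here are made up; the diagnostic text is the one exercised by the updated tests below), a mismatch between the callee's arity and the number of parameter indices now names the callee type and both counts:

    /* Callee takes one argument, but the attribute supplies two parameter indices. */
    __attribute__((callback(1, 2, 2))) void register_cb(void (*cb)(int), int data);
    /* before: 'callback' attribute requires exactly 2 arguments                    */
    /* after:  'callback' attribute references function of type 'void (int)' which
               expects 1 argument but attribute specifies 2 parameter index
               arguments                                                            */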
Fixes #47451
---
 clang/docs/ReleaseNotes.rst | 4 ++
 .../clang/Basic/DiagnosticSemaKinds.td | 4 ++
 clang/lib/Sema/SemaDeclAttr.cpp | 13 ++----
 clang/test/Sema/attr-callback-broken.c | 40 +++++++++----------
 4 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 77252e3a98235..5217e04b5e83f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -367,6 +367,10 @@ Bug Fixes to Attribute Support
   or ``__attribute__((malloc(deallocator, ptr-index)))``
   (`#51607 <https://github.com/llvm/llvm-project/issues/51607>`_).
 
+- Corrected the diagnostic for the ``callback`` attribute when passing too many
+  or too few attribute argument indices for the specified callback function.
+  (#GH47451)
+
 Bug Fixes to C++ Support
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 52dc477039129..dc98ceadd23ca 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3195,6 +3195,10 @@ def err_attribute_wrong_number_arguments : Error<
 def err_attribute_wrong_number_arguments_for : Error <
   "%0 attribute references function %1, which %plural{0:takes no arguments|1:takes one argument|"
   ":takes exactly %2 arguments}2">;
+def err_callback_attribute_wrong_arg_count : Error<
+  "'callback' attribute references function of type %0 which expects %1 "
+  "%plural{1:argument|:arguments}1 but attribute specifies %2 parameter index "
+  "%plural{1:argument|:arguments}2">;
 def err_attribute_bounds_for_function : Error<
   "%0 attribute references parameter %1, but the function %2 has only %3 parameters">;
 def err_attribute_no_member_function : Error<
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 0b844b44930b9..d76afe9d6464d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -4145,15 +4145,10 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     return;
   }
 
-  if (CalleeFnProtoType->getNumParams() > EncodingIndices.size() - 1) {
-    S.Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments)
-        << AL << (unsigned)(EncodingIndices.size() - 1);
-    return;
-  }
-
-  if (CalleeFnProtoType->getNumParams() < EncodingIndices.size() - 1) {
-    S.Diag(AL.getLoc(), diag::err_attribute_wrong_number_arguments)
-        << AL << (unsigned)(EncodingIndices.size() - 1);
+  if (CalleeFnProtoType->getNumParams() != EncodingIndices.size() - 1) {
+    S.Diag(AL.getLoc(), diag::err_callback_attribute_wrong_arg_count)
+        << QualType{CalleeFnProtoType, 0} << CalleeFnProtoType->getNumParams()
+        << (unsigned)(EncodingIndices.size() - 1);
     return;
   }
 
diff --git a/clang/test/Sema/attr-callback-broken.c b/clang/test/Sema/attr-callback-broken.c
index b9e5f45f405ef..ec6ec04ae0e67 100644
--- a/clang/test/Sema/attr-callback-broken.c
+++ b/clang/test/Sema/attr-callback-broken.c
@@ -2,13 +2,13 @@
 
 __attribute__((callback())) void no_callee(void (*callback)(void)); // expected-error {{'callback' attribute specifies no callback callee}}
 
-__attribute__((callback(1, 1))) void too_many_args_1(void (*callback)(void)) {} // expected-error {{'callback' attribute takes one argument}}
-__attribute__((callback(1, -1))) void too_many_args_2(double (*callback)(void)); // expected-error {{'callback' attribute takes one argument}}
-__attribute__((callback(1, 2, 2))) void too_many_args_3(void (*callback)(int), int); // expected-error {{'callback' attribute requires exactly 2 arguments}}
+__attribute__((callback(1,
1))) void too_many_args_1(void (*callback)(void)) {} // expected-error-re {{'callback' attribute references function of type 'void ({{(void)?}})' which expects 0 arguments but attribute specifies 1 parameter index argument}} +__attribute__((callback(1, -1))) void too_many_args_2(double (*callback)(void)); // expected-error-re {{'callback' attribute references function of type 'double ({{(void)?}})' which expects 0 arguments but attribute specifies 1 parameter index argument}} +__attribute__((callback(1, 2, 2))) void too_many_args_3(void (*callback)(int), int); // expected-error {{'callback' attribute references function of type 'void (int)' which expects 1 argument but attribute specifies 2 parameter index arguments}} -__attribute__((callback(1, 2))) void too_few_args_1(void (*callback)(int, int), int); // expected-error {{'callback' attribute takes one argument}} -__attribute__((callback(1))) void too_few_args_2(int (*callback)(int)); // expected-error {{'callback' attribute takes no arguments}} -__attribute__((callback(1, -1))) void too_few_args_3(void (*callback)(int, int)) {} // expected-error {{'callback' attribute takes one argument}} +__attribute__((callback(1, 2))) void too_few_args_1(void (*callback)(int, int), int); // expected-error {{'callback' attribute references function of type 'void (int, int)' which expects 2 arguments but attribute specifies 1 parameter index argument}} +__attribute__((callback(1))) void too_few_args_2(int (*callback)(int)); // expected-error {{'callback' attribute references function of type 'int (int)' which expects 1 argument but attribute specifies 0 parameter index arguments}} +__attribute__((callback(1, -1))) void too_few_args_3(void (*callback)(int, int)) {} // expected-error {{'callback' attribute references function of type 'void (int, int)' which expects 2 arguments but attribute specifies 1 parameter index argument}} __attribute__((callback(-1))) void oob_args_1(void (*callback)(void)); // expected-error {{'callback' attribute specifies invalid callback callee}} __attribute__((callback(2))) void oob_args_2(int *(*callback)(void)) {} // expected-error {{'callback' attribute parameter 1 is out of bounds}} @@ -33,22 +33,22 @@ __attribute__((callback(1, 0))) void no_this_2(void *(*callback)(int, void *)); __attribute__((callback(1, -1))) void vararg_cb_1(void (*callback)(int, ...)) {} // expected-error {{'callback' attribute callee may not be variadic}} __attribute__((callback(1, 1))) void vararg_cb_2(void (*callback)(int, ...), int a); // expected-error {{'callback' attribute callee may not be variadic}} -__attribute__((callback(1, -1, 1, 2, 3, 4, -1))) void varargs_1(void (*callback)(int, ...), int a, float b, double c) {} // expected-error {{'callback' attribute requires exactly 6 arguments}} -__attribute__((callback(1, -1, 4, 2, 3, 4, -1))) void varargs_2(void (*callback)(void *, double, int, ...), int a, float b, double c); // expected-error {{'callback' attribute requires exactly 6 arguments}} +__attribute__((callback(1, -1, 1, 2, 3, 4, -1))) void varargs_1(void (*callback)(int, ...), int a, float b, double c) {} // expected-error {{'callback' attribute references function of type 'void (int, ...)' which expects 1 argument but attribute specifies 6 parameter index arguments}} +__attribute__((callback(1, -1, 4, 2, 3, 4, -1))) void varargs_2(void (*callback)(void *, double, int, ...), int a, float b, double c); // expected-error {{'callback' attribute references function of type 'void (void *, double, int, ...)' which expects 3 arguments but 
attribute specifies 6 parameter index arguments}} -__attribute__((callback(1, -1, 1))) void self_arg_1(void (*callback)(int, ...)) {} // expected-error {{'callback' attribute requires exactly 2 arguments}} -__attribute__((callback(1, -1, 1, -1, -1, 1))) void self_arg_2(void (*callback)(int, ...)); // expected-error {{'callback' attribute requires exactly 5 arguments}} +__attribute__((callback(1, -1, 1))) void self_arg_1(void (*callback)(int, ...)) {} // expected-error {{'callback' attribute references function of type 'void (int, ...)' which expects 1 argument but attribute specifies 2 parameter index arguments}} +__attribute__((callback(1, -1, 1, -1, -1, 1))) void self_arg_2(void (*callback)(int, ...)); // expected-error {{'callback' attribute references function of type 'void (int, ...)' which expects 1 argument but attribute specifies 5 parameter index arguments}} __attribute__((callback(cb))) void unknown_name1(void (*callback)(void)) {} // expected-error {{'callback' attribute argument 'cb' is not a known function parameter}} __attribute__((callback(cb, ab))) void unknown_name2(void (*cb)(int), int a) {} // expected-error {{'callback' attribute argument 'ab' is not a known function parameter}} -__attribute__((callback(callback, 1))) void too_many_args_1b(void (*callback)(void)) {} // expected-error {{'callback' attribute takes one argument}} -__attribute__((callback(callback, __))) void too_many_args_2b(double (*callback)(void)); // expected-error {{'callback' attribute takes one argument}} -__attribute__((callback(callback, 2, 2))) void too_many_args_3b(void (*callback)(int), int); // expected-error {{'callback' attribute requires exactly 2 arguments}} +__attribute__((callback(callback, 1))) void too_many_args_1b(void (*callback)(void)) {} // expected-error-re {{'callback' attribute references function of type 'void ({{(void)?}})' which expects 0 arguments but attribute specifies 1 parameter index argument}} +__attribute__((callback(callback, __))) void too_many_args_2b(double (*callback)(void)); // expected-error-re {{'callback' attribute references function of type 'double ({{(void)?}})' which expects 0 arguments but attribute specifies 1 parameter index argument}} +__attribute__((callback(callback, 2, 2))) void too_many_args_3b(void (*callback)(int), int); // expected-error {{'callback' attribute references function of type 'void (int)' which expects 1 argument but attribute specifies 2 parameter index arguments}} -__attribute__((callback(callback, a))) void too_few_args_1b(void (*callback)(int, int), int a); // expected-error {{'callback' attribute takes one argument}} -__attribute__((callback(callback))) void too_few_args_2b(int (*callback)(int)); // expected-error {{'callback' attribute takes no arguments}} -__attribute__((callback(callback, __))) void too_few_args_3b(void (*callback)(int, int)) {} // expected-error {{'callback' attribute takes one argument}} +__attribute__((callback(callback, a))) void too_few_args_1b(void (*callback)(int, int), int a); // expected-error {{'callback' attribute references function of type 'void (int, int)' which expects 2 arguments but attribute specifies 1 parameter index argument}} +__attribute__((callback(callback))) void too_few_args_2b(int (*callback)(int)); // expected-error {{'callback' attribute references function of type 'int (int)' which expects 1 argument but attribute specifies 0 parameter index arguments}} +__attribute__((callback(callback, __))) void too_few_args_3b(void (*callback)(int, int)) {} // expected-error {{'callback' 
attribute references function of type 'void (int, int)' which expects 2 arguments but attribute specifies 1 parameter index argument}}
 
 __attribute__((callback(__))) void oob_args_1b(void (*callback)(void)); // expected-error {{'callback' attribute specifies invalid callback callee}}
 
@@ -68,8 +68,8 @@ __attribute__((callback(1, this))) void no_this_2b(void *(*callback)(int, void *
 __attribute__((callback(callback, __))) void vararg_cb_1b(void (*callback)(int, ...)) {} // expected-error {{'callback' attribute callee may not be variadic}}
 __attribute__((callback(1, a))) void vararg_cb_2b(void (*callback)(int, ...), int a); // expected-error {{'callback' attribute callee may not be variadic}}
 
-__attribute__((callback(callback, __, callback, a, b, c, __))) void varargs_1b(void (*callback)(int, ...), int a, float b, double c) {} // expected-error {{'callback' attribute requires exactly 6 arguments}}
-__attribute__((callback(1, __, c, a, b, c, -1))) void varargs_2b(void (*callback)(void *, double, int, ...), int a, float b, double c); // expected-error {{'callback' attribute requires exactly 6 arguments}}
+__attribute__((callback(callback, __, callback, a, b, c, __))) void varargs_1b(void (*callback)(int, ...), int a, float b, double c) {} // expected-error {{'callback' attribute references function of type 'void (int, ...)' which expects 1 argument but attribute specifies 6 parameter index arguments}}
+__attribute__((callback(1, __, c, a, b, c, -1))) void varargs_2b(void (*callback)(void *, double, int, ...), int a, float b, double c); // expected-error {{'callback' attribute references function of type 'void (void *, double, int, ...)' which expects 3 arguments but attribute specifies 6 parameter index arguments}}
 
-__attribute__((callback(1, __, callback))) void self_arg_1b(void (*callback)(int, ...)) {} // expected-error {{'callback' attribute requires exactly 2 arguments}}
-__attribute__((callback(callback, __, callback, __, __, callback))) void self_arg_2b(void (*callback)(int, ...)); // expected-error {{'callback' attribute requires exactly 5 arguments}}
+__attribute__((callback(1, __, callback))) void self_arg_1b(void (*callback)(int, ...)) {} // expected-error {{'callback' attribute references function of type 'void (int, ...)' which expects 1 argument but attribute specifies 2 parameter index arguments}}
+__attribute__((callback(callback, __, callback, __, __, callback))) void self_arg_2b(void (*callback)(int, ...)); // expected-error {{'callback' attribute references function of type 'void (int, ...)' which expects 1 argument but attribute specifies 5 parameter index arguments}}

From d8fd665960634bd27bf72f06925314312087a3fe Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Fri, 4 Apr 2025 12:04:20 -0700
Subject: [PATCH 0688/1029] [alpha.webkit.ForwardDeclChecker] Ignore forward
 declared struct. (#133804)

There are some system libraries, such as sqlite3, which forward declare a
struct and then use a pointer to that forward-declared type in various APIs.
Ignore these types in ForwardDeclChecker like other pointer types.
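A minimal sketch of the pattern being tolerated (open_db/close_db mirror the
declarations added to the test below):

    // In a system header: a type that is only ever forward declared.
    typedef struct sqlite3 sqlite3;

    // System APIs traffic in pointers to the incomplete type.
    sqlite3 *open_db();
    void close_db(sqlite3 *);

    void use() {
      auto *db = open_db(); // no longer flagged by ForwardDeclChecker
      close_db(db);
    }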
--- .../Checkers/WebKit/ForwardDeclChecker.cpp | 12 ++++++------ .../Analysis/Checkers/WebKit/forward-decl-checker.mm | 4 ++++ .../Analysis/Checkers/WebKit/mock-system-header.h | 2 ++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp index a524593b0119b..2c63224df129a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ForwardDeclChecker.cpp @@ -108,17 +108,16 @@ class ForwardDeclChecker : public Checker> { RTC.visitTypedef(TD); auto QT = TD->getUnderlyingType().getCanonicalType(); if (BR->getSourceManager().isInSystemHeader(TD->getBeginLoc())) { - if (auto *Type = QT.getTypePtrOrNull(); Type && QT->isPointerType()) + if (auto *Type = QT.getTypePtrOrNull()) SystemTypes.insert(Type); } } bool isUnknownType(QualType QT) const { - auto *Type = QT.getTypePtrOrNull(); - if (!Type) - return false; auto *CanonicalType = QT.getCanonicalType().getTypePtrOrNull(); - auto PointeeQT = Type->getPointeeType(); + if (!CanonicalType) + return false; + auto PointeeQT = CanonicalType->getPointeeType(); auto *PointeeType = PointeeQT.getTypePtrOrNull(); if (!PointeeType) return false; @@ -128,7 +127,8 @@ class ForwardDeclChecker : public Checker> { auto Name = R->getName(); return !R->hasDefinition() && !RTC.isUnretained(QT) && !SystemTypes.contains(CanonicalType) && - !Name.starts_with("Opaque") && Name != "_NSZone"; + !SystemTypes.contains(PointeeType) && !Name.starts_with("Opaque") && + Name != "_NSZone"; } void visitRecordDecl(const RecordDecl *RD, const Decl *DeclWithIssue) const { diff --git a/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm b/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm index 151cbe2affa92..64100d60c4867 100644 --- a/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm +++ b/clang/test/Analysis/Checkers/WebKit/forward-decl-checker.mm @@ -25,6 +25,8 @@ Obj* provide_obj_ptr(); void receive_obj_ptr(Obj* p = nullptr); +sqlite3* open_db(); +void close_db(sqlite3*); Obj* ptr(Obj* arg) { receive_obj_ptr(provide_obj_ptr()); @@ -34,6 +36,8 @@ receive_obj_ptr(arg); receive_obj_ptr(nullptr); receive_obj_ptr(); + auto* db = open_db(); + close_db(db); return obj; } diff --git a/clang/test/Analysis/Checkers/WebKit/mock-system-header.h b/clang/test/Analysis/Checkers/WebKit/mock-system-header.h index 450fb24687343..1e44de8eb62ad 100644 --- a/clang/test/Analysis/Checkers/WebKit/mock-system-header.h +++ b/clang/test/Analysis/Checkers/WebKit/mock-system-header.h @@ -16,6 +16,8 @@ struct MemberVariable { T* obj { nullptr }; }; +typedef struct sqlite3 sqlite3; + typedef unsigned char uint8_t; enum os_log_type_t : uint8_t { From c22586a9d18dd0f066e5660102f7de15fd239e2c Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Fri, 4 Apr 2025 15:19:36 -0400 Subject: [PATCH 0689/1029] [PowerPC] Update altivec.h to use __inline__ for c89 compatibility (#134430) --- clang/lib/Headers/altivec.h | 161 ++++++++++++++-------------- clang/test/Headers/altivec-header.c | 1 + 2 files changed, 83 insertions(+), 79 deletions(-) diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h index 8da65055012f1..71d8d3c0c0771 100644 --- a/clang/lib/Headers/altivec.h +++ b/clang/lib/Headers/altivec.h @@ -17525,70 +17525,73 @@ vec_bperm(vector unsigned long long __a, vector unsigned char __b) { /* vec_reve */ -static inline __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) 
{ +static __inline__ __ATTRS_o_ai vector bool char vec_reve(vector bool char __a) { return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector signed char vec_reve(vector signed char __a) { +static __inline__ __ATTRS_o_ai vector signed char +vec_reve(vector signed char __a) { return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector unsigned char +static __inline__ __ATTRS_o_ai vector unsigned char vec_reve(vector unsigned char __a) { return __builtin_shufflevector(__a, __a, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) { +static __inline__ __ATTRS_o_ai vector bool int vec_reve(vector bool int __a) { return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector signed int vec_reve(vector signed int __a) { +static __inline__ __ATTRS_o_ai vector signed int +vec_reve(vector signed int __a) { return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector unsigned int +static __inline__ __ATTRS_o_ai vector unsigned int vec_reve(vector unsigned int __a) { return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector bool short vec_reve(vector bool short __a) { +static __inline__ __ATTRS_o_ai vector bool short +vec_reve(vector bool short __a) { return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector signed short +static __inline__ __ATTRS_o_ai vector signed short vec_reve(vector signed short __a) { return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector unsigned short +static __inline__ __ATTRS_o_ai vector unsigned short vec_reve(vector unsigned short __a) { return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); } -static inline __ATTRS_o_ai vector float vec_reve(vector float __a) { +static __inline__ __ATTRS_o_ai vector float vec_reve(vector float __a) { return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); } #ifdef __VSX__ -static inline __ATTRS_o_ai vector bool long long +static __inline__ __ATTRS_o_ai vector bool long long vec_reve(vector bool long long __a) { return __builtin_shufflevector(__a, __a, 1, 0); } -static inline __ATTRS_o_ai vector signed long long +static __inline__ __ATTRS_o_ai vector signed long long vec_reve(vector signed long long __a) { return __builtin_shufflevector(__a, __a, 1, 0); } -static inline __ATTRS_o_ai vector unsigned long long +static __inline__ __ATTRS_o_ai vector unsigned long long vec_reve(vector unsigned long long __a) { return __builtin_shufflevector(__a, __a, 1, 0); } -static inline __ATTRS_o_ai vector double vec_reve(vector double __a) { +static __inline__ __ATTRS_o_ai vector double vec_reve(vector double __a) { return __builtin_shufflevector(__a, __a, 1, 0); } #endif @@ -17721,42 +17724,42 @@ typedef vector signed int unaligned_vec_sint __attribute__((aligned(1))); typedef vector unsigned int unaligned_vec_uint __attribute__((aligned(1))); typedef vector float unaligned_vec_float __attribute__((aligned(1))); -static inline __ATTRS_o_ai vector signed char vec_xl(ptrdiff_t __offset, - const signed char *__ptr) { +static __inline__ __ATTRS_o_ai vector signed char +vec_xl(ptrdiff_t __offset, const signed char *__ptr) { return *(unaligned_vec_schar *)(__ptr + __offset); } -static inline __ATTRS_o_ai vector unsigned char +static __inline__ 
__ATTRS_o_ai vector unsigned char vec_xl(ptrdiff_t __offset, const unsigned char *__ptr) { return *(unaligned_vec_uchar*)(__ptr + __offset); } -static inline __ATTRS_o_ai vector signed short +static __inline__ __ATTRS_o_ai vector signed short vec_xl(ptrdiff_t __offset, const signed short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sshort *)__addr; } -static inline __ATTRS_o_ai vector unsigned short +static __inline__ __ATTRS_o_ai vector unsigned short vec_xl(ptrdiff_t __offset, const unsigned short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ushort *)__addr; } -static inline __ATTRS_o_ai vector signed int vec_xl(ptrdiff_t __offset, - const signed int *__ptr) { +static __inline__ __ATTRS_o_ai vector signed int +vec_xl(ptrdiff_t __offset, const signed int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sint *)__addr; } -static inline __ATTRS_o_ai vector unsigned int +static __inline__ __ATTRS_o_ai vector unsigned int vec_xl(ptrdiff_t __offset, const unsigned int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_uint *)__addr; } -static inline __ATTRS_o_ai vector float vec_xl(ptrdiff_t __offset, - const float *__ptr) { +static __inline__ __ATTRS_o_ai vector float vec_xl(ptrdiff_t __offset, + const float *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_float *)__addr; } @@ -17766,20 +17769,20 @@ typedef vector signed long long unaligned_vec_sll __attribute__((aligned(1))); typedef vector unsigned long long unaligned_vec_ull __attribute__((aligned(1))); typedef vector double unaligned_vec_double __attribute__((aligned(1))); -static inline __ATTRS_o_ai vector signed long long +static __inline__ __ATTRS_o_ai vector signed long long vec_xl(ptrdiff_t __offset, const signed long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_sll *)__addr; } -static inline __ATTRS_o_ai vector unsigned long long +static __inline__ __ATTRS_o_ai vector unsigned long long vec_xl(ptrdiff_t __offset, const unsigned long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ull *)__addr; } -static inline __ATTRS_o_ai vector double vec_xl(ptrdiff_t __offset, - const double *__ptr) { +static __inline__ __ATTRS_o_ai vector double vec_xl(ptrdiff_t __offset, + const double *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_double *)__addr; } @@ -17790,13 +17793,13 @@ static inline __ATTRS_o_ai vector double vec_xl(ptrdiff_t __offset, typedef vector signed __int128 unaligned_vec_si128 __attribute__((aligned(1))); typedef vector unsigned __int128 unaligned_vec_ui128 __attribute__((aligned(1))); -static inline __ATTRS_o_ai vector signed __int128 +static __inline__ __ATTRS_o_ai vector signed __int128 vec_xl(ptrdiff_t __offset, const signed __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_si128 *)__addr; } -static inline __ATTRS_o_ai vector unsigned __int128 +static __inline__ __ATTRS_o_ai vector unsigned __int128 vec_xl(ptrdiff_t __offset, const unsigned __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; return *(unaligned_vec_ui128 *)__addr; @@ -17991,64 +17994,64 @@ vec_load_splats(unsigned long long __offset, const float *__ptr) { #define vec_xstd2 vec_xst #define vec_xstw4 vec_xst -static inline __ATTRS_o_ai void +static __inline__ 
__ATTRS_o_ai void vec_xst(vector signed char __vec, ptrdiff_t __offset, signed char *__ptr) { *(unaligned_vec_schar *)(__ptr + __offset) = __vec; } -static inline __ATTRS_o_ai void +static __inline__ __ATTRS_o_ai void vec_xst(vector unsigned char __vec, ptrdiff_t __offset, unsigned char *__ptr) { *(unaligned_vec_uchar *)(__ptr + __offset) = __vec; } -static inline __ATTRS_o_ai void +static __inline__ __ATTRS_o_ai void vec_xst(vector signed short __vec, ptrdiff_t __offset, signed short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_sshort *)__addr = __vec; } -static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec, - ptrdiff_t __offset, - unsigned short *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst(vector unsigned short __vec, + ptrdiff_t __offset, + unsigned short *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_ushort *)__addr = __vec; } -static inline __ATTRS_o_ai void vec_xst(vector signed int __vec, - ptrdiff_t __offset, signed int *__ptr) { +static __inline__ __ATTRS_o_ai void +vec_xst(vector signed int __vec, ptrdiff_t __offset, signed int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_sint *)__addr = __vec; } -static inline __ATTRS_o_ai void +static __inline__ __ATTRS_o_ai void vec_xst(vector unsigned int __vec, ptrdiff_t __offset, unsigned int *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_uint *)__addr = __vec; } -static inline __ATTRS_o_ai void vec_xst(vector float __vec, ptrdiff_t __offset, - float *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst(vector float __vec, + ptrdiff_t __offset, float *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_float *)__addr = __vec; } #ifdef __VSX__ -static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec, - ptrdiff_t __offset, - signed long long *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst(vector signed long long __vec, + ptrdiff_t __offset, + signed long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_sll *)__addr = __vec; } -static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec, - ptrdiff_t __offset, - unsigned long long *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst(vector unsigned long long __vec, + ptrdiff_t __offset, + unsigned long long *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_ull *)__addr = __vec; } -static inline __ATTRS_o_ai void vec_xst(vector double __vec, ptrdiff_t __offset, - double *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst(vector double __vec, + ptrdiff_t __offset, double *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_double *)__addr = __vec; } @@ -18056,16 +18059,16 @@ static inline __ATTRS_o_ai void vec_xst(vector double __vec, ptrdiff_t __offset, #if defined(__POWER8_VECTOR__) && defined(__powerpc64__) && \ defined(__SIZEOF_INT128__) -static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec, - ptrdiff_t __offset, - signed __int128 *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst(vector signed __int128 __vec, + ptrdiff_t __offset, + signed __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_si128 *)__addr = __vec; } -static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, - ptrdiff_t __offset, - unsigned __int128 *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst(vector unsigned __int128 
__vec, + ptrdiff_t __offset, + unsigned __int128 *__ptr) { signed char *__addr = (signed char *)__ptr + __offset; *(unaligned_vec_ui128 *)__addr = __vec; } @@ -18075,51 +18078,51 @@ static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec, #if defined(__POWER10_VECTOR__) && defined(__VSX__) && \ defined(__SIZEOF_INT128__) -static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, - ptrdiff_t __offset, - signed char *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + ptrdiff_t __offset, + signed char *__ptr) { *(__ptr + __offset) = (signed char)__vec[0]; } -static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, - ptrdiff_t __offset, - unsigned char *__ptr) { +static __inline__ __ATTRS_o_ai void +vec_xst_trunc(vector unsigned __int128 __vec, ptrdiff_t __offset, + unsigned char *__ptr) { *(__ptr + __offset) = (unsigned char)__vec[0]; } -static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, - ptrdiff_t __offset, - signed short *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + ptrdiff_t __offset, + signed short *__ptr) { *(__ptr + __offset) = (signed short)__vec[0]; } -static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, - ptrdiff_t __offset, - unsigned short *__ptr) { +static __inline__ __ATTRS_o_ai void +vec_xst_trunc(vector unsigned __int128 __vec, ptrdiff_t __offset, + unsigned short *__ptr) { *(__ptr + __offset) = (unsigned short)__vec[0]; } -static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, - ptrdiff_t __offset, - signed int *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + ptrdiff_t __offset, + signed int *__ptr) { *(__ptr + __offset) = (signed int)__vec[0]; } -static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, - ptrdiff_t __offset, - unsigned int *__ptr) { +static __inline__ __ATTRS_o_ai void +vec_xst_trunc(vector unsigned __int128 __vec, ptrdiff_t __offset, + unsigned int *__ptr) { *(__ptr + __offset) = (unsigned int)__vec[0]; } -static inline __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, - ptrdiff_t __offset, - signed long long *__ptr) { +static __inline__ __ATTRS_o_ai void vec_xst_trunc(vector signed __int128 __vec, + ptrdiff_t __offset, + signed long long *__ptr) { *(__ptr + __offset) = (signed long long)__vec[0]; } -static inline __ATTRS_o_ai void vec_xst_trunc(vector unsigned __int128 __vec, - ptrdiff_t __offset, - unsigned long long *__ptr) { +static __inline__ __ATTRS_o_ai void +vec_xst_trunc(vector unsigned __int128 __vec, ptrdiff_t __offset, + unsigned long long *__ptr) { *(__ptr + __offset) = (unsigned long long)__vec[0]; } #endif diff --git a/clang/test/Headers/altivec-header.c b/clang/test/Headers/altivec-header.c index 00e5f444de7cc..67aee4b270ca6 100644 --- a/clang/test/Headers/altivec-header.c +++ b/clang/test/Headers/altivec-header.c @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -triple powerpc64-unknown-unknown -target-feature +altivec -ffreestanding -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple powerpc64-unknown-unknown -target-feature +altivec -ffreestanding -emit-llvm -flax-vector-conversions=none -o - %s | FileCheck %s +// RUN: %clang_cc1 -std=c89 -triple powerpc64-unknown-unknown -target-feature +altivec -ffreestanding -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple powerpc64-unknown-unknown -target-feature +altivec -ffreestanding -emit-llvm -x c++ -o - 
%s | FileCheck %s
 
 #include <altivec.h>

From ebb0e6cb2555186fa0417ecb54925aa659346771 Mon Sep 17 00:00:00 2001
From: Zahira Ammarguellat
Date: Fri, 4 Apr 2025 12:21:47 -0700
Subject: [PATCH 0690/1029] [NFC] Remove dead code detected by code sanitizer.
 (#134385)

The code sanitizer is failing with this error: `Execution cannot reach this
statement.` The execution path would already have returned at line 928 when
`Lil && Ril` is true, so the block removed here was dead code.
---
 .../clang-tidy/misc/RedundantExpressionCheck.cpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp
index c249244b1a1b2..8e5a528bc5d3e 100644
--- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.cpp
@@ -927,18 +927,6 @@ static bool areExprsSameMacroOrLiteral(const BinaryOperator *BinOp,
   if (Lil && Ril)
     return Lil->getValue() == Ril->getValue();
 
-  const auto *LStrl = dyn_cast<StringLiteral>(Lhs);
-  const auto *RStrl = dyn_cast<StringLiteral>(Rhs);
-  if (Lil && Ril) {
-    const llvm::StringRef L = Lexer::getSourceText(
-        CharSourceRange::getTokenRange(LStrl->getBeginLoc()), SM,
-        Context->getLangOpts(), 0);
-    const llvm::StringRef R = Lexer::getSourceText(
-        CharSourceRange::getTokenRange(RStrl->getBeginLoc()), SM,
-        Context->getLangOpts(), 0);
-    return L.compare(R) == 0;
-  }
-
   const auto *Lbl = dyn_cast<CXXBoolLiteralExpr>(Lhs);
   const auto *Rbl = dyn_cast<CXXBoolLiteralExpr>(Rhs);
   if (Lbl && Rbl)

From 6263de90df7f58c8b98475024d5eef102e10a372 Mon Sep 17 00:00:00 2001
From: erichkeane
Date: Thu, 3 Apr 2025 09:09:41 -0700
Subject: [PATCH 0691/1029] [OpenACC] Implement 'modifier-list' sema/AST

OpenACC 3.3-NEXT changes the way the tags on the copy, copyin, copyout, and
create clauses are specified: it adds a few extra modifiers and permits them
as a list. This patch encodes these as a bitmask enum so they can be stored
succinctly while still being diagnosed reasonably.
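For example, each clause below carries a modifier bitmask (a sketch based on
the clause rules in this patch, under which 'readonly' is only valid on
copyin and 'zero' only on copyout and create):

    void f(int *a, int *b, int *c, int *d) {
    #pragma acc parallel copy(always, alwaysin: a) copyin(readonly: b) copyout(zero: c) create(zero: d)
      ;
    }

Unknown or duplicated modifiers are diagnosed while parsing the list;
modifiers that are known but not valid for a particular clause are diagnosed
during Sema.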
--- clang/include/clang/AST/OpenACCClause.h | 52 +++++---- .../clang/Basic/DiagnosticParseKinds.td | 3 + .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/include/clang/Basic/OpenACCKinds.h | 71 ++++++++++++ clang/include/clang/Parse/Parser.h | 2 + clang/include/clang/Sema/SemaOpenACC.h | 51 +++++---- clang/lib/AST/OpenACCClause.cpp | 35 +++--- clang/lib/AST/TextNodeDumper.cpp | 27 +++-- clang/lib/Parse/ParseOpenACC.cpp | 94 ++++++++++++---- clang/lib/Sema/SemaOpenACC.cpp | 1 - clang/lib/Sema/SemaOpenACCClause.cpp | 106 ++++++++++++++---- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 48 ++++---- clang/lib/Sema/TreeTransform.h | 46 ++++---- clang/lib/Serialization/ASTReader.cpp | 15 +-- clang/lib/Serialization/ASTWriter.cpp | 7 +- .../ast-print-openacc-combined-construct.cpp | 12 +- .../ast-print-openacc-compute-construct.cpp | 12 +- .../AST/ast-print-openacc-data-construct.cpp | 16 +-- .../ast-print-openacc-declare-construct.cpp | 8 +- clang/test/ParserOpenACC/parse-clauses.c | 74 +++++++++--- .../combined-construct-copy-ast.cpp | 14 +-- .../combined-construct-copy-clause.c | 15 +++ .../combined-construct-copyin-ast.cpp | 16 +-- .../combined-construct-copyin-clause.c | 18 ++- .../combined-construct-copyout-ast.cpp | 16 +-- .../combined-construct-copyout-clause.c | 18 ++- .../combined-construct-create-ast.cpp | 6 +- .../combined-construct-create-clause.c | 26 ++++- .../compute-construct-copy-clause.c | 15 +++ .../compute-construct-copyin-clause.c | 17 ++- .../compute-construct-copyout-clause.c | 17 ++- .../compute-construct-create-clause.c | 25 ++++- .../compute-construct-varlist-ast.cpp | 62 +++++----- .../SemaOpenACC/data-construct-copy-ast.cpp | 16 +-- .../SemaOpenACC/data-construct-copy-clause.c | 16 +++ .../SemaOpenACC/data-construct-copyin-ast.cpp | 22 ++-- .../data-construct-copyin-clause.c | 23 +++- .../data-construct-copyout-ast.cpp | 22 ++-- .../data-construct-copyout-clause.c | 24 +++- .../SemaOpenACC/data-construct-create-ast.cpp | 12 +- .../data-construct-create-clause.c | 35 +++++- .../SemaOpenACC/declare-construct-ast.cpp | 20 ++-- clang/test/SemaOpenACC/declare-construct.cpp | 45 ++++++++ 43 files changed, 866 insertions(+), 316 deletions(-) diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 4c5fe03a34361..b3a5746af7cb0 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -1087,11 +1087,13 @@ class OpenACCCopyClause final : public OpenACCClauseWithVarList, private llvm::TrailingObjects { friend TrailingObjects; + OpenACCModifierKind Modifiers; OpenACCCopyClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc, - SourceLocation LParenLoc, ArrayRef VarList, - SourceLocation EndLoc) - : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc) { + SourceLocation LParenLoc, OpenACCModifierKind Mods, + ArrayRef VarList, SourceLocation EndLoc) + : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc), + Modifiers(Mods) { assert((Spelling == OpenACCClauseKind::Copy || Spelling == OpenACCClauseKind::PCopy || Spelling == OpenACCClauseKind::PresentOrCopy) && @@ -1110,20 +1112,23 @@ class OpenACCCopyClause final static OpenACCCopyClause * Create(const ASTContext &C, OpenACCClauseKind Spelling, SourceLocation BeginLoc, SourceLocation LParenLoc, - ArrayRef VarList, SourceLocation EndLoc); + OpenACCModifierKind Mods, ArrayRef VarList, + SourceLocation EndLoc); + + OpenACCModifierKind getModifierList() const { return Modifiers; } }; class OpenACCCopyInClause final : 
public OpenACCClauseWithVarList, private llvm::TrailingObjects { friend TrailingObjects; - bool IsReadOnly; + OpenACCModifierKind Modifiers; OpenACCCopyInClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc, - SourceLocation LParenLoc, bool IsReadOnly, + SourceLocation LParenLoc, OpenACCModifierKind Mods, ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc), - IsReadOnly(IsReadOnly) { + Modifiers(Mods) { assert((Spelling == OpenACCClauseKind::CopyIn || Spelling == OpenACCClauseKind::PCopyIn || Spelling == OpenACCClauseKind::PresentOrCopyIn) && @@ -1139,24 +1144,25 @@ class OpenACCCopyInClause final C->getClauseKind() == OpenACCClauseKind::PCopyIn || C->getClauseKind() == OpenACCClauseKind::PresentOrCopyIn; } - bool isReadOnly() const { return IsReadOnly; } + OpenACCModifierKind getModifierList() const { return Modifiers; } static OpenACCCopyInClause * Create(const ASTContext &C, OpenACCClauseKind Spelling, - SourceLocation BeginLoc, SourceLocation LParenLoc, bool IsReadOnly, - ArrayRef VarList, SourceLocation EndLoc); + SourceLocation BeginLoc, SourceLocation LParenLoc, + OpenACCModifierKind Mods, ArrayRef VarList, + SourceLocation EndLoc); }; class OpenACCCopyOutClause final : public OpenACCClauseWithVarList, private llvm::TrailingObjects { friend TrailingObjects; - bool IsZero; + OpenACCModifierKind Modifiers; OpenACCCopyOutClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc, - SourceLocation LParenLoc, bool IsZero, + SourceLocation LParenLoc, OpenACCModifierKind Mods, ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc), - IsZero(IsZero) { + Modifiers(Mods) { assert((Spelling == OpenACCClauseKind::CopyOut || Spelling == OpenACCClauseKind::PCopyOut || Spelling == OpenACCClauseKind::PresentOrCopyOut) && @@ -1172,24 +1178,25 @@ class OpenACCCopyOutClause final C->getClauseKind() == OpenACCClauseKind::PCopyOut || C->getClauseKind() == OpenACCClauseKind::PresentOrCopyOut; } - bool isZero() const { return IsZero; } + OpenACCModifierKind getModifierList() const { return Modifiers; } static OpenACCCopyOutClause * Create(const ASTContext &C, OpenACCClauseKind Spelling, - SourceLocation BeginLoc, SourceLocation LParenLoc, bool IsZero, - ArrayRef VarList, SourceLocation EndLoc); + SourceLocation BeginLoc, SourceLocation LParenLoc, + OpenACCModifierKind Mods, ArrayRef VarList, + SourceLocation EndLoc); }; class OpenACCCreateClause final : public OpenACCClauseWithVarList, private llvm::TrailingObjects { friend TrailingObjects; - bool IsZero; + OpenACCModifierKind Modifiers; OpenACCCreateClause(OpenACCClauseKind Spelling, SourceLocation BeginLoc, - SourceLocation LParenLoc, bool IsZero, + SourceLocation LParenLoc, OpenACCModifierKind Mods, ArrayRef VarList, SourceLocation EndLoc) : OpenACCClauseWithVarList(Spelling, BeginLoc, LParenLoc, EndLoc), - IsZero(IsZero) { + Modifiers(Mods) { assert((Spelling == OpenACCClauseKind::Create || Spelling == OpenACCClauseKind::PCreate || Spelling == OpenACCClauseKind::PresentOrCreate) && @@ -1205,11 +1212,12 @@ class OpenACCCreateClause final C->getClauseKind() == OpenACCClauseKind::PCreate || C->getClauseKind() == OpenACCClauseKind::PresentOrCreate; } - bool isZero() const { return IsZero; } + OpenACCModifierKind getModifierList() const { return Modifiers; } static OpenACCCreateClause * Create(const ASTContext &C, OpenACCClauseKind Spelling, - SourceLocation BeginLoc, SourceLocation LParenLoc, bool IsZero, - ArrayRef VarList, SourceLocation 
EndLoc); + SourceLocation BeginLoc, SourceLocation LParenLoc, + OpenACCModifierKind Mods, ArrayRef VarList, + SourceLocation EndLoc); }; class OpenACCReductionClause final diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 954f538e15026..f46e7fed28794 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1458,6 +1458,9 @@ def err_acc_invalid_reduction_operator : Error<"invalid reduction operator, expected '+', '*', 'max', 'min', " "'&', '|', '^', '&&', or '||'">; def err_acc_incorrect_bind_arg : Error<"expected identifier or string literal">; +def err_acc_modifier + : Error<"%enum_select{%Unknown{unknown}|%Duplicate{duplicate}}" + "0 modifier %1 in OpenACC modifier-list on '%2' clause">; // OpenMP support. def warn_pragma_omp_ignored : Warning< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dc98ceadd23ca..393bfecf9a36b 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13055,6 +13055,8 @@ def warn_acc_confusing_routine_name InGroup>; def err_acc_decl_for_routine : Error<"expected function or lambda declaration for 'routine' construct">; +def err_acc_invalid_modifier + : Error<"OpenACC '%0' modifier not valid on '%1' clause">; // AMDGCN builtins diagnostics def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h index c2d7732123ef2..652831e23a758 100644 --- a/clang/include/clang/Basic/OpenACCKinds.h +++ b/clang/include/clang/Basic/OpenACCKinds.h @@ -15,10 +15,13 @@ #define LLVM_CLANG_BASIC_OPENACCKINDS_H #include "clang/Basic/Diagnostic.h" +#include "llvm/ADT/BitmaskEnum.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" namespace clang { +LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); + // Represents the Construct/Directive kind of a pragma directive. Note the // OpenACC standard is inconsistent between calling these Construct vs // Directive, but we're calling it a Directive to be consistent with OpenMP. @@ -619,6 +622,74 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &Out, OpenACCGangKind Op) { return printOpenACCGangKind(Out, Op); } + +// Represents the 'modifier' of a 'modifier-list', as applied to copy, copyin, +// copyout, and create. 
Implemented as a 'bitmask' +enum class OpenACCModifierKind : uint8_t { + Invalid = 0, + Always = 1 << 0, + AlwaysIn = 1 << 1, + AlwaysOut = 1 << 2, + Readonly = 1 << 3, + Zero = 1 << 4, + LLVM_MARK_AS_BITMASK_ENUM(Zero) +}; + +inline bool isOpenACCModifierBitSet(OpenACCModifierKind List, + OpenACCModifierKind Bit) { + return (List & Bit) != OpenACCModifierKind::Invalid; +} + +template +inline StreamTy &printOpenACCModifierKind(StreamTy &Out, + OpenACCModifierKind Mods) { + if (Mods == OpenACCModifierKind::Invalid) + return Out << ""; + + bool First = true; + + if (isOpenACCModifierBitSet(Mods, OpenACCModifierKind::Always)) { + Out << "always"; + First = false; + } + + if (isOpenACCModifierBitSet(Mods, OpenACCModifierKind::AlwaysIn)) { + if (!First) + Out << ", "; + Out << "alwaysin"; + First = false; + } + + if (isOpenACCModifierBitSet(Mods, OpenACCModifierKind::AlwaysOut)) { + if (!First) + Out << ", "; + Out << "alwaysout"; + First = false; + } + + if (isOpenACCModifierBitSet(Mods, OpenACCModifierKind::Readonly)) { + if (!First) + Out << ", "; + Out << "readonly"; + First = false; + } + + if (isOpenACCModifierBitSet(Mods, OpenACCModifierKind::Zero)) { + if (!First) + Out << ", "; + Out << "zero"; + First = false; + } + return Out; +} +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out, + OpenACCModifierKind Op) { + return printOpenACCModifierKind(Out, Op); +} +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &Out, + OpenACCModifierKind Op) { + return printOpenACCModifierKind(Out, Op); +} } // namespace clang #endif // LLVM_CLANG_BASIC_OPENACCKINDS_H diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 5770692c42f13..53da6269a3b11 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3769,6 +3769,8 @@ class Parser : public CodeCompletionHandler { ExprResult ParseOpenACCIDExpression(); /// Parses the variable list for the `cache` construct. OpenACCCacheParseInfo ParseOpenACCCacheVarList(); + /// Parses the 'modifier-list' for copy, copyin, copyout, create. + OpenACCModifierKind tryParseModifierList(OpenACCClauseKind CK); using OpenACCVarParseResult = std::pair; /// Parses a single variable in a variable list for OpenACC. 
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 18d92e62d71a7..4c3a13a3b044f 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -238,8 +238,7 @@ class SemaOpenACC : public SemaBase { struct VarListDetails { SmallVector VarList; - bool IsReadOnly; - bool IsZero; + OpenACCModifierKind ModifierKind; }; struct WaitDetails { @@ -451,12 +450,10 @@ class SemaOpenACC : public SemaBase { return const_cast(this)->getVarList(); } - bool isReadOnly() const { - return std::get(Details).IsReadOnly; + OpenACCModifierKind getModifierList() const { + return std::get(Details).ModifierKind; } - bool isZero() const { return std::get(Details).IsZero; } - bool isForce() const { assert(ClauseKind == OpenACCClauseKind::Collapse && "Only 'collapse' has a force tag"); @@ -552,8 +549,8 @@ class SemaOpenACC : public SemaBase { Details = GangDetails{std::move(GKs), std::move(IntExprs)}; } - void setVarListDetails(ArrayRef VarList, bool IsReadOnly, - bool IsZero) { + void setVarListDetails(ArrayRef VarList, + OpenACCModifierKind ModKind) { assert((ClauseKind == OpenACCClauseKind::Private || ClauseKind == OpenACCClauseKind::NoCreate || ClauseKind == OpenACCClauseKind::Present || @@ -582,23 +579,25 @@ class SemaOpenACC : public SemaBase { DirKind == OpenACCDirectiveKind::Update) || ClauseKind == OpenACCClauseKind::FirstPrivate) && "Parsed clause kind does not have a var-list"); - assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn || + assert((ModKind == OpenACCModifierKind::Invalid || + ClauseKind == OpenACCClauseKind::Copy || + ClauseKind == OpenACCClauseKind::PCopy || + ClauseKind == OpenACCClauseKind::PresentOrCopy || + ClauseKind == OpenACCClauseKind::CopyIn || ClauseKind == OpenACCClauseKind::PCopyIn || - ClauseKind == OpenACCClauseKind::PresentOrCopyIn) && - "readonly: tag only valid on copyin"); - assert((!IsZero || ClauseKind == OpenACCClauseKind::CopyOut || + ClauseKind == OpenACCClauseKind::PresentOrCopyIn || + ClauseKind == OpenACCClauseKind::CopyOut || ClauseKind == OpenACCClauseKind::PCopyOut || ClauseKind == OpenACCClauseKind::PresentOrCopyOut || ClauseKind == OpenACCClauseKind::Create || ClauseKind == OpenACCClauseKind::PCreate || ClauseKind == OpenACCClauseKind::PresentOrCreate) && - "zero: tag only valid on copyout/create"); - Details = - VarListDetails{{VarList.begin(), VarList.end()}, IsReadOnly, IsZero}; + "Modifier Kind only valid on copy, copyin, copyout, create"); + Details = VarListDetails{{VarList.begin(), VarList.end()}, ModKind}; } - void setVarListDetails(llvm::SmallVector &&VarList, bool IsReadOnly, - bool IsZero) { + void setVarListDetails(llvm::SmallVector &&VarList, + OpenACCModifierKind ModKind) { assert((ClauseKind == OpenACCClauseKind::Private || ClauseKind == OpenACCClauseKind::NoCreate || ClauseKind == OpenACCClauseKind::Present || @@ -627,18 +626,21 @@ class SemaOpenACC : public SemaBase { DirKind == OpenACCDirectiveKind::Update) || ClauseKind == OpenACCClauseKind::FirstPrivate) && "Parsed clause kind does not have a var-list"); - assert((!IsReadOnly || ClauseKind == OpenACCClauseKind::CopyIn || + assert((ModKind == OpenACCModifierKind::Invalid || + ClauseKind == OpenACCClauseKind::Copy || + ClauseKind == OpenACCClauseKind::PCopy || + ClauseKind == OpenACCClauseKind::PresentOrCopy || + ClauseKind == OpenACCClauseKind::CopyIn || ClauseKind == OpenACCClauseKind::PCopyIn || - ClauseKind == OpenACCClauseKind::PresentOrCopyIn) && - "readonly: tag only valid on copyin"); - 
assert((!IsZero || ClauseKind == OpenACCClauseKind::CopyOut || + ClauseKind == OpenACCClauseKind::PresentOrCopyIn || + ClauseKind == OpenACCClauseKind::CopyOut || ClauseKind == OpenACCClauseKind::PCopyOut || ClauseKind == OpenACCClauseKind::PresentOrCopyOut || ClauseKind == OpenACCClauseKind::Create || ClauseKind == OpenACCClauseKind::PCreate || ClauseKind == OpenACCClauseKind::PresentOrCreate) && - "zero: tag only valid on copyout/create"); - Details = VarListDetails{std::move(VarList), IsReadOnly, IsZero}; + "Modifier Kind only valid on copy, copyin, copyout, create"); + Details = VarListDetails{std::move(VarList), ModKind}; } void setReductionDetails(OpenACCReductionOperator Op, @@ -826,7 +828,8 @@ class SemaOpenACC : public SemaBase { // Checking for the arguments specific to the declare-clause that need to be // checked during both phases of template translation. - bool CheckDeclareClause(SemaOpenACC::OpenACCParsedClause &Clause); + bool CheckDeclareClause(SemaOpenACC::OpenACCParsedClause &Clause, + OpenACCModifierKind Mods); ExprResult ActOnRoutineName(Expr *RoutineName); diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index ab76e6dffa0ff..d7cbb51335359 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -434,11 +434,12 @@ OpenACCDeviceClause *OpenACCDeviceClause::Create(const ASTContext &C, OpenACCCopyClause * OpenACCCopyClause::Create(const ASTContext &C, OpenACCClauseKind Spelling, SourceLocation BeginLoc, SourceLocation LParenLoc, - ArrayRef VarList, SourceLocation EndLoc) { + OpenACCModifierKind Mods, ArrayRef VarList, + SourceLocation EndLoc) { void *Mem = C.Allocate(OpenACCCopyClause::totalSizeToAlloc(VarList.size())); return new (Mem) - OpenACCCopyClause(Spelling, BeginLoc, LParenLoc, VarList, EndLoc); + OpenACCCopyClause(Spelling, BeginLoc, LParenLoc, Mods, VarList, EndLoc); } OpenACCLinkClause *OpenACCLinkClause::Create(const ASTContext &C, @@ -463,34 +464,34 @@ OpenACCDeviceResidentClause *OpenACCDeviceResidentClause::Create( OpenACCCopyInClause * OpenACCCopyInClause::Create(const ASTContext &C, OpenACCClauseKind Spelling, SourceLocation BeginLoc, SourceLocation LParenLoc, - bool IsReadOnly, ArrayRef VarList, + OpenACCModifierKind Mods, ArrayRef VarList, SourceLocation EndLoc) { void *Mem = C.Allocate(OpenACCCopyInClause::totalSizeToAlloc(VarList.size())); - return new (Mem) OpenACCCopyInClause(Spelling, BeginLoc, LParenLoc, - IsReadOnly, VarList, EndLoc); + return new (Mem) + OpenACCCopyInClause(Spelling, BeginLoc, LParenLoc, Mods, VarList, EndLoc); } OpenACCCopyOutClause * OpenACCCopyOutClause::Create(const ASTContext &C, OpenACCClauseKind Spelling, SourceLocation BeginLoc, SourceLocation LParenLoc, - bool IsZero, ArrayRef VarList, + OpenACCModifierKind Mods, ArrayRef VarList, SourceLocation EndLoc) { void *Mem = C.Allocate( OpenACCCopyOutClause::totalSizeToAlloc(VarList.size())); - return new (Mem) OpenACCCopyOutClause(Spelling, BeginLoc, LParenLoc, IsZero, + return new (Mem) OpenACCCopyOutClause(Spelling, BeginLoc, LParenLoc, Mods, VarList, EndLoc); } OpenACCCreateClause * OpenACCCreateClause::Create(const ASTContext &C, OpenACCClauseKind Spelling, SourceLocation BeginLoc, SourceLocation LParenLoc, - bool IsZero, ArrayRef VarList, + OpenACCModifierKind Mods, ArrayRef VarList, SourceLocation EndLoc) { void *Mem = C.Allocate(OpenACCCreateClause::totalSizeToAlloc(VarList.size())); - return new (Mem) OpenACCCreateClause(Spelling, BeginLoc, LParenLoc, IsZero, - VarList, EndLoc); + return new (Mem) + 
OpenACCCreateClause(Spelling, BeginLoc, LParenLoc, Mods, VarList, EndLoc); } OpenACCDeviceTypeClause *OpenACCDeviceTypeClause::Create( @@ -808,6 +809,8 @@ void OpenACCClausePrinter::VisitDeviceClause(const OpenACCDeviceClause &C) { void OpenACCClausePrinter::VisitCopyClause(const OpenACCCopyClause &C) { OS << C.getClauseKind() << '('; + if (C.getModifierList() != OpenACCModifierKind::Invalid) + OS << C.getModifierList() << ": "; llvm::interleaveComma(C.getVarList(), OS, [&](const Expr *E) { printExpr(E); }); OS << ")"; @@ -830,8 +833,8 @@ void OpenACCClausePrinter::VisitDeviceResidentClause( void OpenACCClausePrinter::VisitCopyInClause(const OpenACCCopyInClause &C) { OS << C.getClauseKind() << '('; - if (C.isReadOnly()) - OS << "readonly: "; + if (C.getModifierList() != OpenACCModifierKind::Invalid) + OS << C.getModifierList() << ": "; llvm::interleaveComma(C.getVarList(), OS, [&](const Expr *E) { printExpr(E); }); OS << ")"; @@ -839,8 +842,8 @@ void OpenACCClausePrinter::VisitCopyInClause(const OpenACCCopyInClause &C) { void OpenACCClausePrinter::VisitCopyOutClause(const OpenACCCopyOutClause &C) { OS << C.getClauseKind() << '('; - if (C.isZero()) - OS << "zero: "; + if (C.getModifierList() != OpenACCModifierKind::Invalid) + OS << C.getModifierList() << ": "; llvm::interleaveComma(C.getVarList(), OS, [&](const Expr *E) { printExpr(E); }); OS << ")"; @@ -848,8 +851,8 @@ void OpenACCClausePrinter::VisitCopyOutClause(const OpenACCCopyOutClause &C) { void OpenACCClausePrinter::VisitCreateClause(const OpenACCCreateClause &C) { OS << C.getClauseKind() << '('; - if (C.isZero()) - OS << "zero: "; + if (C.getModifierList() != OpenACCModifierKind::Invalid) + OS << C.getModifierList() << ": "; llvm::interleaveComma(C.getVarList(), OS, [&](const Expr *E) { printExpr(E); }); OS << ")"; diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 05f1953aa473c..be8d609974d81 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -405,9 +405,6 @@ void TextNodeDumper::Visit(const OpenACCClause *C) { case OpenACCClauseKind::Async: case OpenACCClauseKind::Auto: case OpenACCClauseKind::Attach: - case OpenACCClauseKind::Copy: - case OpenACCClauseKind::PCopy: - case OpenACCClauseKind::PresentOrCopy: case OpenACCClauseKind::Host: case OpenACCClauseKind::If: case OpenACCClauseKind::IfPresent: @@ -457,26 +454,38 @@ void TextNodeDumper::Visit(const OpenACCClause *C) { OS << ": force"; break; + case OpenACCClauseKind::Copy: + case OpenACCClauseKind::PCopy: + case OpenACCClauseKind::PresentOrCopy: + OS << " clause"; + if (cast(C)->getModifierList() != + OpenACCModifierKind::Invalid) + OS << " modifiers: " << cast(C)->getModifierList(); + break; case OpenACCClauseKind::CopyIn: case OpenACCClauseKind::PCopyIn: case OpenACCClauseKind::PresentOrCopyIn: OS << " clause"; - if (cast(C)->isReadOnly()) - OS << " : readonly"; + if (cast(C)->getModifierList() != + OpenACCModifierKind::Invalid) + OS << " modifiers: " << cast(C)->getModifierList(); break; case OpenACCClauseKind::CopyOut: case OpenACCClauseKind::PCopyOut: case OpenACCClauseKind::PresentOrCopyOut: OS << " clause"; - if (cast(C)->isZero()) - OS << " : zero"; + if (cast(C)->getModifierList() != + OpenACCModifierKind::Invalid) + OS << " modifiers: " + << cast(C)->getModifierList(); break; case OpenACCClauseKind::Create: case OpenACCClauseKind::PCreate: case OpenACCClauseKind::PresentOrCreate: OS << " clause"; - if (cast(C)->isZero()) - OS << " : zero"; + if (cast(C)->getModifierList() != + 
OpenACCModifierKind::Invalid)
+      OS << " modifiers: " << cast<OpenACCCreateClause>(C)->getModifierList();
     break;
   case OpenACCClauseKind::Wait:
     OS << " clause";
diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp
index 4f4ae362983d0..64916995907c5 100644
--- a/clang/lib/Parse/ParseOpenACC.cpp
+++ b/clang/lib/Parse/ParseOpenACC.cpp
@@ -662,6 +662,70 @@ ExprResult Parser::ParseOpenACCConditionExpr() {
   return R.isInvalid() ? ExprError() : R.get().second;
 }
 
+// Tries to parse the 'modifier-list' for a 'copy', 'copyin', 'copyout', or
+// 'create' clause.
+OpenACCModifierKind Parser::tryParseModifierList(OpenACCClauseKind CK) {
+  // Use tentative parsing to decide whether this is a comma-delimited list of
+  // identifiers ending in a colon, so we can do an actual parse with
+  // diagnostics.
+  {
+    RevertingTentativeParsingAction TPA{*this};
+    // Capture any identifier-comma pairs.
+    while (isTokenIdentifierOrKeyword(*this, getCurToken()) &&
+           NextToken().is(tok::comma)) {
+      ConsumeToken();
+      ConsumeToken();
+    }
+
+    if (!isTokenIdentifierOrKeyword(*this, getCurToken()) ||
+        !NextToken().is(tok::colon)) {
+      // No modifiers, as this isn't a valid modifier-list.
+      return OpenACCModifierKind::Invalid;
+    }
+  }
+
+  auto GetModKind = [](Token T) {
+    return StringSwitch<OpenACCModifierKind>(T.getIdentifierInfo()->getName())
+        .Case("always", OpenACCModifierKind::Always)
+        .Case("alwaysin", OpenACCModifierKind::AlwaysIn)
+        .Case("alwaysout", OpenACCModifierKind::AlwaysOut)
+        .Case("readonly", OpenACCModifierKind::Readonly)
+        .Case("zero", OpenACCModifierKind::Zero)
+        .Default(OpenACCModifierKind::Invalid);
+  };
+
+  OpenACCModifierKind CurModList = OpenACCModifierKind::Invalid;
+  auto ConsumeModKind = [&]() {
+    Token IdentToken = getCurToken();
+    OpenACCModifierKind NewKind = GetModKind(IdentToken);
+
+    if (NewKind == OpenACCModifierKind::Invalid)
+      Diag(IdentToken.getLocation(), diag::err_acc_modifier)
+          << diag::ACCModifier::Unknown << IdentToken.getIdentifierInfo() << CK;
+    else if ((NewKind & CurModList) != OpenACCModifierKind::Invalid)
+      Diag(IdentToken.getLocation(), diag::err_acc_modifier)
+          << diag::ACCModifier::Duplicate << IdentToken.getIdentifierInfo()
+          << CK;
+    else
+      CurModList |= NewKind;
+
+    // Consumes the identifier.
+    ConsumeToken();
+    // Consumes the comma or colon.
+    ConsumeToken();
+  };
+
+  // Inspect all but the last item. We inspected enough to know that our
+  // current token is the identifier-like thing, so just check for the comma.
+  while (NextToken().is(tok::comma))
+    ConsumeModKind();
+
+  // Above we confirmed that we should be on the last item; consume it.
+  ConsumeModKind();
+
+  return CurModList;
+}
+
 // OpenACC 3.3, section 1.7:
 // To simplify the specification and convey appropriate constraint information,
 // a pqr-list is a comma-separated list of pqr items.
The one exception is a @@ -981,26 +1045,21 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( break; } + case OpenACCClauseKind::Copy: + case OpenACCClauseKind::PCopy: + case OpenACCClauseKind::PresentOrCopy: case OpenACCClauseKind::CopyIn: case OpenACCClauseKind::PCopyIn: - case OpenACCClauseKind::PresentOrCopyIn: { - bool IsReadOnly = tryParseAndConsumeSpecialTokenKind( - *this, OpenACCSpecialTokenKind::ReadOnly, ClauseKind); - ParsedClause.setVarListDetails(ParseOpenACCVarList(DirKind, ClauseKind), - IsReadOnly, - /*IsZero=*/false); - break; - } - case OpenACCClauseKind::Create: - case OpenACCClauseKind::PCreate: - case OpenACCClauseKind::PresentOrCreate: + case OpenACCClauseKind::PresentOrCopyIn: case OpenACCClauseKind::CopyOut: case OpenACCClauseKind::PCopyOut: - case OpenACCClauseKind::PresentOrCopyOut: { - bool IsZero = tryParseAndConsumeSpecialTokenKind( - *this, OpenACCSpecialTokenKind::Zero, ClauseKind); + case OpenACCClauseKind::PresentOrCopyOut: + case OpenACCClauseKind::Create: + case OpenACCClauseKind::PCreate: + case OpenACCClauseKind::PresentOrCreate: { + OpenACCModifierKind ModList = tryParseModifierList(ClauseKind); ParsedClause.setVarListDetails(ParseOpenACCVarList(DirKind, ClauseKind), - /*IsReadOnly=*/false, IsZero); + ModList); break; } case OpenACCClauseKind::Reduction: { @@ -1026,15 +1085,12 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( case OpenACCClauseKind::Detach: case OpenACCClauseKind::DevicePtr: case OpenACCClauseKind::UseDevice: - case OpenACCClauseKind::Copy: - case OpenACCClauseKind::PCopy: - case OpenACCClauseKind::PresentOrCopy: case OpenACCClauseKind::FirstPrivate: case OpenACCClauseKind::NoCreate: case OpenACCClauseKind::Present: case OpenACCClauseKind::Private: ParsedClause.setVarListDetails(ParseOpenACCVarList(DirKind, ClauseKind), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); break; case OpenACCClauseKind::Collapse: { bool HasForce = tryParseAndConsumeSpecialTokenKind( diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 819fb0853e8f5..c80f5f848f60b 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2070,7 +2070,6 @@ void SemaOpenACC::CheckRoutineDecl(SourceLocation DirLoc, return; } - // TODO ERICH: Check bind here. 
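
A note on the parser addition above: tryParseModifierList deliberately runs in two phases. The reverting tentative parse only answers "do the next tokens look like identifier (, identifier)* followed by a colon?"; nothing is committed and no diagnostics fire. Only once that shape is confirmed does the second pass consume tokens for real, mapping each identifier to a bit, diagnosing unknown and duplicate modifiers, and OR-ing the rest together. The sketch below reproduces that decide-then-diagnose pattern over a plain token vector; Token, RevertingTentativeParsingAction, and the diag:: plumbing are Clang-internal, so everything here is a simplified stand-in rather than the actual Parser API.

  #include <iostream>
  #include <string>
  #include <vector>

  // Phase 1: look ahead without consuming anything. Mirrors the tentative
  // parse: skip "ident ," pairs, then require "ident :".
  static bool looksLikeModifierList(const std::vector<std::string> &Toks) {
    size_t I = 0;
    while (I + 1 < Toks.size() && Toks[I + 1] == ",")
      I += 2;
    return I + 1 < Toks.size() && Toks[I + 1] == ":";
  }

  static unsigned toBit(const std::string &S) {
    if (S == "always")    return 1u << 0;
    if (S == "alwaysin")  return 1u << 1;
    if (S == "alwaysout") return 1u << 2;
    if (S == "readonly")  return 1u << 3;
    if (S == "zero")      return 1u << 4;
    return 0; // unknown modifier
  }

  // Phase 2: consume for real, diagnosing unknown/duplicate modifiers while
  // accumulating the mask, the same shape as ConsumeModKind above.
  static unsigned parseModifierList(const std::vector<std::string> &Toks) {
    unsigned Mods = 0;
    for (size_t I = 0; I + 1 < Toks.size(); I += 2) {
      unsigned Bit = toBit(Toks[I]);
      if (Bit == 0)
        std::cerr << "unknown modifier '" << Toks[I] << "'\n";
      else if (Mods & Bit)
        std::cerr << "duplicate modifier '" << Toks[I] << "'\n";
      else
        Mods |= Bit;
      if (Toks[I + 1] == ":")
        break; // the colon ends the modifier-list
    }
    return Mods;
  }

  int main() {
    std::vector<std::string> Toks = {"always", ",", "readonly", ",", "always", ":"};
    if (looksLikeModifierList(Toks))
      std::cout << "mask = " << parseModifierList(Toks) << "\n"; // 9, one duplicate diagnosed
  }
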
   auto BindItr = llvm::find_if(Clauses, llvm::IsaPred<OpenACCBindClause>);
 
   for (auto *A : NextParsedFDecl->attrs()) {
     // OpenACC 3.3 2.15:
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index a98b6712014cd..7d10c50d404d2 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -683,6 +683,59 @@ class SemaOpenACCClauseVisitor {
     return false;
   }
 
+  OpenACCModifierKind
+  CheckModifierList(SemaOpenACC::OpenACCParsedClause &Clause,
+                    OpenACCModifierKind Mods) {
+    auto CheckSingle = [=](OpenACCModifierKind CurMods,
+                           OpenACCModifierKind ValidKinds,
+                           OpenACCModifierKind Bit) {
+      if (!isOpenACCModifierBitSet(CurMods, Bit) ||
+          isOpenACCModifierBitSet(ValidKinds, Bit))
+        return CurMods;
+
+      SemaRef.Diag(Clause.getLParenLoc(), diag::err_acc_invalid_modifier)
+          << Bit << Clause.getClauseKind();
+
+      return CurMods ^ Bit;
+    };
+    auto Check = [&](OpenACCModifierKind ValidKinds) {
+      if ((Mods | ValidKinds) == ValidKinds)
+        return Mods;
+
+      Mods = CheckSingle(Mods, ValidKinds, OpenACCModifierKind::Always);
+      Mods = CheckSingle(Mods, ValidKinds, OpenACCModifierKind::AlwaysIn);
+      Mods = CheckSingle(Mods, ValidKinds, OpenACCModifierKind::AlwaysOut);
+      Mods = CheckSingle(Mods, ValidKinds, OpenACCModifierKind::Readonly);
+      Mods = CheckSingle(Mods, ValidKinds, OpenACCModifierKind::Zero);
+      return Mods;
+    };
+
+    switch (Clause.getClauseKind()) {
+    default:
+      llvm_unreachable("Only for copy, copyin, copyout, create");
+    case OpenACCClauseKind::Copy:
+    case OpenACCClauseKind::PCopy:
+    case OpenACCClauseKind::PresentOrCopy:
+      return Check(OpenACCModifierKind::Always | OpenACCModifierKind::AlwaysIn |
+                   OpenACCModifierKind::AlwaysOut);
+    case OpenACCClauseKind::CopyIn:
+    case OpenACCClauseKind::PCopyIn:
+    case OpenACCClauseKind::PresentOrCopyIn:
+      return Check(OpenACCModifierKind::Always | OpenACCModifierKind::AlwaysIn |
+                   OpenACCModifierKind::Readonly);
+    case OpenACCClauseKind::CopyOut:
+    case OpenACCClauseKind::PCopyOut:
+    case OpenACCClauseKind::PresentOrCopyOut:
+      return Check(OpenACCModifierKind::Always | OpenACCModifierKind::AlwaysIn |
+                   OpenACCModifierKind::Zero);
+    case OpenACCClauseKind::Create:
+    case OpenACCClauseKind::PCreate:
+    case OpenACCClauseKind::PresentOrCreate:
+      return Check(OpenACCModifierKind::Zero);
+    }
+    llvm_unreachable("didn't return from switch above?");
+  }
+
 public:
   SemaOpenACCClauseVisitor(SemaOpenACC &S,
                            ArrayRef<const OpenACCClause *> ExistingClauses)
@@ -1070,7 +1123,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitPresentClause(
 
   // 'declare' has some restrictions that need to be enforced separately, so
   // check it here.
-  if (SemaRef.CheckDeclareClause(Clause))
+  if (SemaRef.CheckDeclareClause(Clause, OpenACCModifierKind::Invalid))
     return nullptr;
 
   return OpenACCPresentClause::Create(Ctx, Clause.getBeginLoc(),
@@ -1106,25 +1159,28 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyClause(
   // really isn't anything to do here. GCC does some duplicate-finding, though
   // it isn't apparent in the standard where this is justified.
 
+  OpenACCModifierKind NewMods =
+      CheckModifierList(Clause, Clause.getModifierList());
+
   // 'declare' has some restrictions that need to be enforced separately, so
   // check it here.
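
The bit tricks in CheckModifierList above are worth spelling out. (Mods | ValidKinds) == ValidKinds holds exactly when Mods sets no bit outside ValidKinds, which is the everything-is-fine fast path; otherwise each candidate bit is re-checked individually, and CurMods ^ Bit strips a rejected bit, which is safe because CheckSingle only reaches the XOR after confirming the bit is set. A small demonstration with plain unsigned flags (the bit values are illustrative, consistent with the sketch after the parser hunk):

  #include <cassert>

  int main() {
    const unsigned Always = 1, AlwaysIn = 2, Readonly = 8, Zero = 16;
    const unsigned ValidForCopyIn = Always | AlwaysIn | Readonly;

    unsigned Mods = Always | Readonly;
    assert((Mods | ValidForCopyIn) == ValidForCopyIn); // subset: fast path, nothing to diagnose

    Mods |= Zero;                                      // 'zero' is not valid on 'copyin'
    assert((Mods | ValidForCopyIn) != ValidForCopyIn); // slow path: check bit by bit
    Mods ^= Zero;                                      // bit known set, so XOR clears it
    assert(Mods == (Always | Readonly));               // returned mask is valid again
  }
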
- if (SemaRef.CheckDeclareClause(Clause)) + if (SemaRef.CheckDeclareClause(Clause, NewMods)) return nullptr; return OpenACCCopyClause::Create( Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); + Clause.getModifierList(), Clause.getVarList(), Clause.getEndLoc()); } OpenACCClause *SemaOpenACCClauseVisitor::VisitLinkClause( SemaOpenACC::OpenACCParsedClause &Clause) { // 'declare' has some restrictions that need to be enforced separately, so // check it here. - if (SemaRef.CheckDeclareClause(Clause)) + if (SemaRef.CheckDeclareClause(Clause, OpenACCModifierKind::Invalid)) return nullptr; Clause.setVarListDetails(SemaRef.CheckLinkClauseVarList(Clause.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); return OpenACCLinkClause::Create(Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), @@ -1135,7 +1191,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceResidentClause( SemaOpenACC::OpenACCParsedClause &Clause) { // 'declare' has some restrictions that need to be enforced separately, so // check it here. - if (SemaRef.CheckDeclareClause(Clause)) + if (SemaRef.CheckDeclareClause(Clause, OpenACCModifierKind::Invalid)) return nullptr; return OpenACCDeviceResidentClause::Create( @@ -1149,14 +1205,17 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyInClause( // really isn't anything to do here. GCC does some duplicate-finding, though // it isn't apparent in the standard where this is justified. + OpenACCModifierKind NewMods = + CheckModifierList(Clause, Clause.getModifierList()); + // 'declare' has some restrictions that need to be enforced separately, so // check it here. - if (SemaRef.CheckDeclareClause(Clause)) + if (SemaRef.CheckDeclareClause(Clause, NewMods)) return nullptr; return OpenACCCopyInClause::Create( Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.isReadOnly(), Clause.getVarList(), Clause.getEndLoc()); + Clause.getModifierList(), Clause.getVarList(), Clause.getEndLoc()); } OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyOutClause( @@ -1165,14 +1224,17 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyOutClause( // really isn't anything to do here. GCC does some duplicate-finding, though // it isn't apparent in the standard where this is justified. + OpenACCModifierKind NewMods = + CheckModifierList(Clause, Clause.getModifierList()); + // 'declare' has some restrictions that need to be enforced separately, so // check it here. - if (SemaRef.CheckDeclareClause(Clause)) + if (SemaRef.CheckDeclareClause(Clause, NewMods)) return nullptr; return OpenACCCopyOutClause::Create( Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); + Clause.getModifierList(), Clause.getVarList(), Clause.getEndLoc()); } OpenACCClause *SemaOpenACCClauseVisitor::VisitCreateClause( @@ -1181,14 +1243,17 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitCreateClause( // really isn't anything to do here. GCC does some duplicate-finding, though // it isn't apparent in the standard where this is justified. + OpenACCModifierKind NewMods = + CheckModifierList(Clause, Clause.getModifierList()); + // 'declare' has some restrictions that need to be enforced separately, so // check it here. 
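
All of this code combines OpenACCModifierKind values with |, &, ^ and |=, so the scoped enum has to behave like a flag set. LLVM has a standard helper for declaring such bitmask enums (LLVM_MARK_AS_BITMASK_ENUM in llvm/ADT/BitmaskEnum.h); the hand-rolled equivalent below is only a generic sketch of what such a declaration provides, with illustrative names and bit values rather than the actual ones:

  #include <cstdint>
  #include <type_traits>

  enum class ModifierKind : uint8_t {
    Invalid   = 0,
    Always    = 1 << 0,
    AlwaysIn  = 1 << 1,
    AlwaysOut = 1 << 2,
    Readonly  = 1 << 3,
    Zero      = 1 << 4,
  };

  // Bitwise operators so the scoped enum composes like a flag set.
  constexpr ModifierKind operator|(ModifierKind L, ModifierKind R) {
    using U = std::underlying_type_t<ModifierKind>;
    return static_cast<ModifierKind>(static_cast<U>(L) | static_cast<U>(R));
  }
  constexpr ModifierKind operator&(ModifierKind L, ModifierKind R) {
    using U = std::underlying_type_t<ModifierKind>;
    return static_cast<ModifierKind>(static_cast<U>(L) & static_cast<U>(R));
  }
  constexpr ModifierKind operator^(ModifierKind L, ModifierKind R) {
    using U = std::underlying_type_t<ModifierKind>;
    return static_cast<ModifierKind>(static_cast<U>(L) ^ static_cast<U>(R));
  }
  inline ModifierKind &operator|=(ModifierKind &L, ModifierKind R) {
    return L = L | R;
  }

  // A helper like isOpenACCModifierBitSet reduces to this shape.
  constexpr bool isBitSet(ModifierKind Mods, ModifierKind Bit) {
    return (Mods & Bit) != ModifierKind::Invalid;
  }

  static_assert(isBitSet(ModifierKind::Always | ModifierKind::Zero,
                         ModifierKind::Zero), "");
  static_assert(!isBitSet(ModifierKind::Always, ModifierKind::Readonly), "");
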
- if (SemaRef.CheckDeclareClause(Clause)) + if (SemaRef.CheckDeclareClause(Clause, NewMods)) return nullptr; return OpenACCCreateClause::Create( Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); + Clause.getModifierList(), Clause.getVarList(), Clause.getEndLoc()); } OpenACCClause *SemaOpenACCClauseVisitor::VisitAttachClause( @@ -1199,8 +1264,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitAttachClause( llvm::erase_if(VarList, [&](Expr *E) { return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::Attach, E); }); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); + Clause.setVarListDetails(VarList, OpenACCModifierKind::Invalid); return OpenACCAttachClause::Create(Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), Clause.getEndLoc()); @@ -1214,8 +1278,7 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDetachClause( llvm::erase_if(VarList, [&](Expr *E) { return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::Detach, E); }); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); + Clause.setVarListDetails(VarList, OpenACCModifierKind::Invalid); return OpenACCDetachClause::Create(Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), Clause.getEndLoc()); @@ -1248,12 +1311,11 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause( llvm::erase_if(VarList, [&](Expr *E) { return SemaRef.CheckVarIsPointerType(OpenACCClauseKind::DevicePtr, E); }); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); + Clause.setVarListDetails(VarList, OpenACCModifierKind::Invalid); // 'declare' has some restrictions that need to be enforced separately, so // check it here. 
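
Collecting the Check calls from the four visitors in one place: in this patch, copy accepts always/alwaysin/alwaysout, copyin accepts always/alwaysin/readonly, copyout accepts always/alwaysin/zero, and create accepts only zero; the test updates further down exercise exactly these sets, including rejecting alwaysout on copyout. A compile-time sketch of that table (names and bit values illustrative, as before):

  #include <cstdint>

  enum Mod : uint8_t {
    Always = 1, AlwaysIn = 2, AlwaysOut = 4, Readonly = 8, Zero = 16,
  };

  enum class Clause { Copy, CopyIn, CopyOut, Create };

  // Allowed-modifier mask per clause, matching CheckModifierList above.
  constexpr uint8_t allowedMods(Clause C) {
    switch (C) {
    case Clause::Copy:    return Always | AlwaysIn | AlwaysOut;
    case Clause::CopyIn:  return Always | AlwaysIn | Readonly;
    case Clause::CopyOut: return Always | AlwaysIn | Zero;
    case Clause::Create:  return Zero;
    }
    return 0;
  }

  constexpr bool isValid(Clause C, uint8_t Mods) {
    return (Mods | allowedMods(C)) == allowedMods(C);
  }

  static_assert(isValid(Clause::CopyIn, Always | Readonly), "");
  static_assert(!isValid(Clause::Create, Always | Zero), ""); // 'always' invalid on create
  static_assert(!isValid(Clause::CopyOut, AlwaysOut), "");    // rejected in this patch
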
- if (SemaRef.CheckDeclareClause(Clause)) + if (SemaRef.CheckDeclareClause(Clause, OpenACCModifierKind::Invalid)) return nullptr; return OpenACCDevicePtrClause::Create( @@ -2396,7 +2458,8 @@ SemaOpenACC::CheckLinkClauseVarList(ArrayRef VarExprs) { return NewVarList; } -bool SemaOpenACC::CheckDeclareClause(SemaOpenACC::OpenACCParsedClause &Clause) { +bool SemaOpenACC::CheckDeclareClause(SemaOpenACC::OpenACCParsedClause &Clause, + OpenACCModifierKind Mods) { if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Declare) return false; @@ -2487,7 +2550,6 @@ bool SemaOpenACC::CheckDeclareClause(SemaOpenACC::OpenACCParsedClause &Clause) { FilteredVarList.push_back(VarExpr); } - Clause.setVarListDetails(FilteredVarList, Clause.isReadOnly(), - Clause.isZero()); + Clause.setVarListDetails(FilteredVarList, Mods); return false; } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 2d6f2ca67af8a..e0f7ccc4674d8 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -1186,22 +1186,24 @@ void OpenACCDeclClauseInstantiator::VisitVectorClause( void OpenACCDeclClauseInstantiator::VisitCopyClause( const OpenACCCopyClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + C.getModifierList()); + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, C.getModifierList())) return; NewClause = OpenACCCopyClause::Create( SemaRef.getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.getVarList(), ParsedClause.getEndLoc()); + ParsedClause.getModifierList(), ParsedClause.getVarList(), + ParsedClause.getEndLoc()); } void OpenACCDeclClauseInstantiator::VisitLinkClause( const OpenACCLinkClause &C) { ParsedClause.setVarListDetails( SemaRef.OpenACC().CheckLinkClauseVarList(VisitVarList(C.getVarList())), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, + OpenACCModifierKind::Invalid)) return; NewClause = OpenACCLinkClause::Create( @@ -1213,8 +1215,9 @@ void OpenACCDeclClauseInstantiator::VisitLinkClause( void OpenACCDeclClauseInstantiator::VisitDeviceResidentClause( const OpenACCDeviceResidentClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + OpenACCModifierKind::Invalid); + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, + OpenACCModifierKind::Invalid)) return; NewClause = OpenACCDeviceResidentClause::Create( SemaRef.getASTContext(), ParsedClause.getBeginLoc(), @@ -1224,48 +1227,49 @@ void OpenACCDeclClauseInstantiator::VisitDeviceResidentClause( void OpenACCDeclClauseInstantiator::VisitCopyInClause( const OpenACCCopyInClause &C) { - ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), C.isReadOnly(), - /*IsZero=*/false); + ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), + C.getModifierList()); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, C.getModifierList())) return; NewClause = OpenACCCopyInClause::Create( SemaRef.getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.isReadOnly(), 
ParsedClause.getVarList(), + ParsedClause.getModifierList(), ParsedClause.getVarList(), ParsedClause.getEndLoc()); } void OpenACCDeclClauseInstantiator::VisitCopyOutClause( const OpenACCCopyOutClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, C.isZero()); + C.getModifierList()); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, C.getModifierList())) return; NewClause = OpenACCCopyOutClause::Create( SemaRef.getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.isZero(), ParsedClause.getVarList(), + ParsedClause.getModifierList(), ParsedClause.getVarList(), ParsedClause.getEndLoc()); } void OpenACCDeclClauseInstantiator::VisitCreateClause( const OpenACCCreateClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, C.isZero()); + C.getModifierList()); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, C.getModifierList())) return; NewClause = OpenACCCreateClause::Create( SemaRef.getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.isZero(), ParsedClause.getVarList(), + ParsedClause.getModifierList(), ParsedClause.getVarList(), ParsedClause.getEndLoc()); } void OpenACCDeclClauseInstantiator::VisitPresentClause( const OpenACCPresentClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + OpenACCModifierKind::Invalid); + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, + OpenACCModifierKind::Invalid)) return; NewClause = OpenACCPresentClause::Create( SemaRef.getASTContext(), ParsedClause.getBeginLoc(), @@ -1282,9 +1286,9 @@ void OpenACCDeclClauseInstantiator::VisitDevicePtrClause( OpenACCClauseKind::DevicePtr, E); }), VarList.end()); - ParsedClause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); - if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause)) + ParsedClause.setVarListDetails(VarList, OpenACCModifierKind::Invalid); + if (SemaRef.OpenACC().CheckDeclareClause(ParsedClause, + OpenACCModifierKind::Invalid)) return; NewClause = OpenACCDevicePtrClause::Create( SemaRef.getASTContext(), ParsedClause.getBeginLoc(), diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 237c5a9ef501b..b9bf748a2e98e 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -11760,7 +11760,7 @@ void OpenACCClauseTransform::VisitSelfClause( } ParsedClause.setVarListDetails(InstantiatedVarList, - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCSelfClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), @@ -11818,7 +11818,7 @@ template void OpenACCClauseTransform::VisitPrivateClause( const OpenACCPrivateClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCPrivateClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), @@ -11830,7 +11830,7 @@ template void OpenACCClauseTransform::VisitHostClause( const OpenACCHostClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = 
OpenACCHostClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), @@ -11842,7 +11842,7 @@ template void OpenACCClauseTransform::VisitDeviceClause( const OpenACCDeviceClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCDeviceClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), @@ -11854,7 +11854,7 @@ template void OpenACCClauseTransform::VisitFirstPrivateClause( const OpenACCFirstPrivateClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCFirstPrivateClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), @@ -11866,7 +11866,7 @@ template void OpenACCClauseTransform::VisitNoCreateClause( const OpenACCNoCreateClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCNoCreateClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), @@ -11878,7 +11878,7 @@ template void OpenACCClauseTransform::VisitPresentClause( const OpenACCPresentClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCPresentClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), @@ -11890,12 +11890,13 @@ template void OpenACCClauseTransform::VisitCopyClause( const OpenACCCopyClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + C.getModifierList()); NewClause = OpenACCCopyClause::Create( Self.getSema().getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.getVarList(), ParsedClause.getEndLoc()); + ParsedClause.getModifierList(), ParsedClause.getVarList(), + ParsedClause.getEndLoc()); } template @@ -11923,13 +11924,13 @@ void OpenACCClauseTransform::VisitBindClause( template void OpenACCClauseTransform::VisitCopyInClause( const OpenACCCopyInClause &C) { - ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), C.isReadOnly(), - /*IsZero=*/false); + ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), + C.getModifierList()); NewClause = OpenACCCopyInClause::Create( Self.getSema().getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.isReadOnly(), ParsedClause.getVarList(), + ParsedClause.getModifierList(), ParsedClause.getVarList(), ParsedClause.getEndLoc()); } @@ -11937,12 +11938,12 @@ template void OpenACCClauseTransform::VisitCopyOutClause( const OpenACCCopyOutClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, C.isZero()); + C.getModifierList()); NewClause = OpenACCCopyOutClause::Create( Self.getSema().getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.isZero(), ParsedClause.getVarList(), + ParsedClause.getModifierList(), ParsedClause.getVarList(), ParsedClause.getEndLoc()); } @@ -11950,12 +11951,12 @@ template void OpenACCClauseTransform::VisitCreateClause( const OpenACCCreateClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, C.isZero()); + C.getModifierList()); NewClause = OpenACCCreateClause::Create( 
Self.getSema().getASTContext(), ParsedClause.getClauseKind(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), - ParsedClause.isZero(), ParsedClause.getVarList(), + ParsedClause.getModifierList(), ParsedClause.getVarList(), ParsedClause.getEndLoc()); } template @@ -11969,8 +11970,7 @@ void OpenACCClauseTransform::VisitAttachClause( OpenACCClauseKind::Attach, E); }); - ParsedClause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); + ParsedClause.setVarListDetails(VarList, OpenACCModifierKind::Invalid); NewClause = OpenACCAttachClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), ParsedClause.getVarList(), @@ -11991,8 +11991,7 @@ void OpenACCClauseTransform::VisitDetachClause( }), VarList.end()); - ParsedClause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); + ParsedClause.setVarListDetails(VarList, OpenACCModifierKind::Invalid); NewClause = OpenACCDetachClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), ParsedClause.getVarList(), @@ -12003,7 +12002,7 @@ template void OpenACCClauseTransform::VisitDeleteClause( const OpenACCDeleteClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCDeleteClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), ParsedClause.getVarList(), @@ -12014,7 +12013,7 @@ template void OpenACCClauseTransform::VisitUseDeviceClause( const OpenACCUseDeviceClause &C) { ParsedClause.setVarListDetails(VisitVarList(C.getVarList()), - /*IsReadOnly=*/false, /*IsZero=*/false); + OpenACCModifierKind::Invalid); NewClause = OpenACCUseDeviceClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), ParsedClause.getVarList(), @@ -12032,8 +12031,7 @@ void OpenACCClauseTransform::VisitDevicePtrClause( OpenACCClauseKind::DevicePtr, E); }); - ParsedClause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); + ParsedClause.setVarListDetails(VarList, OpenACCModifierKind::Invalid); NewClause = OpenACCDevicePtrClause::Create( Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), ParsedClause.getVarList(), diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 38697eb835134..d8d77e7f55232 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12715,36 +12715,37 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { case OpenACCClauseKind::PresentOrCopy: case OpenACCClauseKind::Copy: { SourceLocation LParenLoc = readSourceLocation(); + OpenACCModifierKind ModList = readEnum(); llvm::SmallVector VarList = readOpenACCVarList(); return OpenACCCopyClause::Create(getContext(), ClauseKind, BeginLoc, - LParenLoc, VarList, EndLoc); + LParenLoc, ModList, VarList, EndLoc); } case OpenACCClauseKind::CopyIn: case OpenACCClauseKind::PCopyIn: case OpenACCClauseKind::PresentOrCopyIn: { SourceLocation LParenLoc = readSourceLocation(); - bool IsReadOnly = readBool(); + OpenACCModifierKind ModList = readEnum(); llvm::SmallVector VarList = readOpenACCVarList(); return OpenACCCopyInClause::Create(getContext(), ClauseKind, BeginLoc, - LParenLoc, IsReadOnly, VarList, EndLoc); + LParenLoc, ModList, VarList, EndLoc); } case OpenACCClauseKind::CopyOut: case OpenACCClauseKind::PCopyOut: case 
OpenACCClauseKind::PresentOrCopyOut: { SourceLocation LParenLoc = readSourceLocation(); - bool IsZero = readBool(); + OpenACCModifierKind ModList = readEnum(); llvm::SmallVector VarList = readOpenACCVarList(); return OpenACCCopyOutClause::Create(getContext(), ClauseKind, BeginLoc, - LParenLoc, IsZero, VarList, EndLoc); + LParenLoc, ModList, VarList, EndLoc); } case OpenACCClauseKind::Create: case OpenACCClauseKind::PCreate: case OpenACCClauseKind::PresentOrCreate: { SourceLocation LParenLoc = readSourceLocation(); - bool IsZero = readBool(); + OpenACCModifierKind ModList = readEnum(); llvm::SmallVector VarList = readOpenACCVarList(); return OpenACCCreateClause::Create(getContext(), ClauseKind, BeginLoc, - LParenLoc, IsZero, VarList, EndLoc); + LParenLoc, ModList, VarList, EndLoc); } case OpenACCClauseKind::Async: { SourceLocation LParenLoc = readSourceLocation(); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index f27be5fb4c76c..a48c05061626a 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -8715,6 +8715,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::PresentOrCopy: { const auto *CC = cast(C); writeSourceLocation(CC->getLParenLoc()); + writeEnum(CC->getModifierList()); writeOpenACCVarList(CC); return; } @@ -8723,7 +8724,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::PresentOrCopyIn: { const auto *CIC = cast(C); writeSourceLocation(CIC->getLParenLoc()); - writeBool(CIC->isReadOnly()); + writeEnum(CIC->getModifierList()); writeOpenACCVarList(CIC); return; } @@ -8732,7 +8733,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::PresentOrCopyOut: { const auto *COC = cast(C); writeSourceLocation(COC->getLParenLoc()); - writeBool(COC->isZero()); + writeEnum(COC->getModifierList()); writeOpenACCVarList(COC); return; } @@ -8741,7 +8742,7 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::PresentOrCreate: { const auto *CC = cast(C); writeSourceLocation(CC->getLParenLoc()); - writeBool(CC->isZero()); + writeEnum(CC->getModifierList()); writeOpenACCVarList(CC); return; } diff --git a/clang/test/AST/ast-print-openacc-combined-construct.cpp b/clang/test/AST/ast-print-openacc-combined-construct.cpp index b5afc1515aa18..be959fb2b6117 100644 --- a/clang/test/AST/ast-print-openacc-combined-construct.cpp +++ b/clang/test/AST/ast-print-openacc-combined-construct.cpp @@ -171,16 +171,16 @@ void foo() { #pragma acc parallel loop no_create(i, array[1], array, array[1:2]) present(i, array[1], array, array[1:2]) for(int i = 0;i<5;++i); -// CHECK: #pragma acc parallel loop copy(i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(i, array[1], array, array[1:2]) -#pragma acc parallel loop copy(i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(i, array[1], array, array[1:2]) +// CHECK: #pragma acc parallel loop copy(alwaysin: i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(i, array[1], array, array[1:2]) +#pragma acc parallel loop copy(alwaysin: i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(i, array[1], array, array[1:2]) for(int i = 0;i<5;++i); -// CHECK: #pragma acc parallel loop copyin(i, array[1], array, array[1:2]) pcopyin(readonly: i, array[1], array, array[1:2]) present_or_copyin(i, array[1], 
array, array[1:2]) -#pragma acc parallel loop copyin(i, array[1], array, array[1:2]) pcopyin(readonly:i, array[1], array, array[1:2]) present_or_copyin(i, array[1], array, array[1:2]) +// CHECK: #pragma acc parallel loop copyin(i, array[1], array, array[1:2]) pcopyin(readonly: i, array[1], array, array[1:2]) present_or_copyin(always, alwaysin: i, array[1], array, array[1:2]) +#pragma acc parallel loop copyin(i, array[1], array, array[1:2]) pcopyin(readonly:i, array[1], array, array[1:2]) present_or_copyin(always, alwaysin: i, array[1], array, array[1:2]) for(int i = 0;i<5;++i); -// CHECK: #pragma acc parallel loop copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(i, array[1], array, array[1:2]) -#pragma acc parallel loop copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(i, array[1], array, array[1:2]) +// CHECK: #pragma acc parallel loop copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(always, alwaysin: i, array[1], array, array[1:2]) +#pragma acc parallel loop copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(always, alwaysin: i, array[1], array, array[1:2]) for(int i = 0;i<5;++i); // CHECK: #pragma acc parallel loop create(i, array[1], array, array[1:2]) pcreate(zero: i, array[1], array, array[1:2]) present_or_create(i, array[1], array, array[1:2]) diff --git a/clang/test/AST/ast-print-openacc-compute-construct.cpp b/clang/test/AST/ast-print-openacc-compute-construct.cpp index 9516bfd843000..1fbb81a220aab 100644 --- a/clang/test/AST/ast-print-openacc-compute-construct.cpp +++ b/clang/test/AST/ast-print-openacc-compute-construct.cpp @@ -56,12 +56,16 @@ void foo() { #pragma acc parallel no_create(i, array[1], array, array[1:2]) present(i, array[1], array, array[1:2]) while(true); -// CHECK: #pragma acc parallel copyin(i, array[1], array, array[1:2]) pcopyin(readonly: i, array[1], array, array[1:2]) present_or_copyin(i, array[1], array, array[1:2]) -#pragma acc parallel copyin(i, array[1], array, array[1:2]) pcopyin(readonly:i, array[1], array, array[1:2]) present_or_copyin(i, array[1], array, array[1:2]) +// CHECK: #pragma acc parallel copy(i, array[1], array, array[1:2]) pcopy(alwaysin: i, array[1], array, array[1:2]) present_or_copy(always, alwaysout: i, array[1], array, array[1:2]) +#pragma acc parallel copy(i, array[1], array, array[1:2]) pcopy(alwaysin:i, array[1], array, array[1:2]) present_or_copy(always, alwaysout: i, array[1], array, array[1:2]) while(true); -// CHECK: #pragma acc parallel copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(i, array[1], array, array[1:2]) -#pragma acc parallel copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(i, array[1], array, array[1:2]) +// CHECK: #pragma acc parallel copyin(i, array[1], array, array[1:2]) pcopyin(readonly: i, array[1], array, array[1:2]) present_or_copyin(always, readonly: i, array[1], array, array[1:2]) +#pragma acc parallel copyin(i, array[1], array, array[1:2]) pcopyin(readonly:i, array[1], array, array[1:2]) present_or_copyin(readonly, always: i, array[1], array, array[1:2]) + while(true); + +// CHECK: #pragma acc parallel copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(always, zero: i, array[1], array, array[1:2]) +#pragma acc parallel 
copyout(i, array[1], array, array[1:2]) pcopyout(zero: i, array[1], array, array[1:2]) present_or_copyout(always, zero: i, array[1], array, array[1:2]) while(true); // CHECK: #pragma acc parallel create(i, array[1], array, array[1:2]) pcreate(zero: i, array[1], array, array[1:2]) present_or_create(i, array[1], array, array[1:2]) diff --git a/clang/test/AST/ast-print-openacc-data-construct.cpp b/clang/test/AST/ast-print-openacc-data-construct.cpp index 6d6f54cb45ada..b7d2428fb605a 100644 --- a/clang/test/AST/ast-print-openacc-data-construct.cpp +++ b/clang/test/AST/ast-print-openacc-data-construct.cpp @@ -17,8 +17,8 @@ void foo() { // CHECK: #pragma acc enter data copyin(Var) #pragma acc enter data copyin(Var) ; -// CHECK: #pragma acc exit data copyout(Var) -#pragma acc exit data copyout(Var) +// CHECK: #pragma acc exit data copyout(always, zero: Var) +#pragma acc exit data copyout(zero, always: Var) ; // CHECK: #pragma acc host_data use_device(Var) #pragma acc host_data use_device(Var) @@ -44,8 +44,8 @@ void foo() { // CHECK: #pragma acc data default(none) async(i) #pragma acc data default(none) async(i) ; -// CHECK: #pragma acc enter data copyin(i) async(i) -#pragma acc enter data copyin(i) async(i) +// CHECK: #pragma acc enter data copyin(always: i) async(i) +#pragma acc enter data copyin(always: i) async(i) // CHECK: #pragma acc exit data copyout(i) async #pragma acc exit data copyout(i) async @@ -56,8 +56,8 @@ void foo() { // CHECK: #pragma acc enter data copyin(Var) wait #pragma acc enter data copyin(Var) wait -// CHECK: #pragma acc exit data copyout(Var) wait(*iPtr, i) -#pragma acc exit data copyout(Var) wait(*iPtr, i) +// CHECK: #pragma acc exit data copyout(always, zero: Var) wait(*iPtr, i) +#pragma acc exit data copyout(always, zero: Var) wait(*iPtr, i) // CHECK: #pragma acc data default(none) wait(queues: *iPtr, i) #pragma acc data default(none) wait(queues:*iPtr, i) @@ -88,8 +88,8 @@ void foo() { #pragma acc data present(i, array[1], array, array[1:2]) ; -// CHECK: #pragma acc data default(none) copy(i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(i, array[1], array, array[1:2]) -#pragma acc data default(none) copy(i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(i, array[1], array, array[1:2]) +// CHECK: #pragma acc data default(none) copy(i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(alwaysin, alwaysout: i, array[1], array, array[1:2]) +#pragma acc data default(none) copy(i, array[1], array, array[1:2]) pcopy(i, array[1], array, array[1:2]) present_or_copy(alwaysin, alwaysout: i, array[1], array, array[1:2]) ; // CHECK: #pragma acc enter data copyin(i, array[1], array, array[1:2]) pcopyin(readonly: i, array[1], array, array[1:2]) present_or_copyin(i, array[1], array, array[1:2]) diff --git a/clang/test/AST/ast-print-openacc-declare-construct.cpp b/clang/test/AST/ast-print-openacc-declare-construct.cpp index fce4afc6aedae..2a61b08c5500b 100644 --- a/clang/test/AST/ast-print-openacc-declare-construct.cpp +++ b/clang/test/AST/ast-print-openacc-declare-construct.cpp @@ -5,8 +5,8 @@ int GlobalArray[5]; int GlobalArray2[5]; // CHECK: #pragma acc declare deviceptr(Global) copyin(GlobalArray) #pragma acc declare deviceptr(Global), copyin(GlobalArray) -// CHECK: #pragma acc declare create(Global2, GlobalArray2) -#pragma acc declare create(Global2, GlobalArray2) +// CHECK: #pragma acc declare create(zero: Global2, GlobalArray2) +#pragma acc declare create(zero: Global2, 
GlobalArray2) namespace NS { int NSVar; @@ -18,8 +18,8 @@ int NSArray[5]; struct Struct { static const int StaticMem = 5; static const int StaticMemArray[5]; -// CHECK: #pragma acc declare copyin(StaticMem, StaticMemArray) -#pragma acc declare copyin(StaticMem, StaticMemArray) +// CHECK: #pragma acc declare copyin(always, alwaysin: StaticMem, StaticMemArray) +#pragma acc declare copyin(always, alwaysin: StaticMem, StaticMemArray) void MemFunc1(int Arg) { int Local; diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index e31b7492dab2c..2319b1abb9e83 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 %s -verify -fopenacc -std=c99 -// RUNX: %clang_cc1 %s -verify -fopenacc -// RUNX: %clang_cc1 %s -verify -fopenacc -x c++ +// RUN: %clang_cc1 %s -verify=expected,c -fopenacc -std=c99 +// RUN: %clang_cc1 %s -verify=expected,c -fopenacc +// RUN: %clang_cc1 %s -verify=expected,cpp -fopenacc -x c++ void func() { @@ -422,7 +422,8 @@ void VarListClauses() { #pragma acc serial copy(HasMem.MemArr[:]), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{expected expression}} + // cpp-error@+2{{expected unqualified-id}} + // c-error@+1{{expected expression}} #pragma acc serial copy(HasMem.MemArr[::]), self for(int i = 0; i < 5;++i) {} @@ -443,6 +444,21 @@ void VarListClauses() { #pragma acc serial present_or_copy(HasMem.MemArr[3:]) for(int i = 0; i < 5;++i) {} + // expected-error@+2{{unknown modifier 'foo' in OpenACC modifier-list on 'copy' clause}} + // expected-error@+1{{unknown modifier 'bar' in OpenACC modifier-list on 'copy' clause}} +#pragma acc parallel copy(foo, bar: HasMem.MemArr[3:]) self + for(int i = 0; i < 5;++i) {} + + // expected-error@+1{{duplicate modifier 'always' in OpenACC modifier-list on 'copy' clause}} +#pragma acc parallel copy(always, alwaysin, always: HasMem.MemArr[3:]) self + for(int i = 0; i < 5;++i) {} + + // expected-error@+3{{use of undeclared identifier 'always'}} + // expected-error@+2{{use of undeclared identifier 'alwaysin'}} + // expected-error@+1{{use of undeclared identifier 'always'}} +#pragma acc parallel copy(always, alwaysin, always, HasMem.MemArr[3:]) self + for(int i = 0; i < 5;++i) {} + // expected-error@+2 2{{OpenACC variable in 'use_device' clause is not a valid variable name or array name}} // expected-error@+1{{expected ','}} #pragma acc host_data use_device(s.array[s.value] s.array[s.value :5] ), if_present @@ -580,15 +596,15 @@ void VarListClauses() { #pragma acc serial copyout(zero s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'readonly' on 'copyout' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} #pragma acc serial copyout(readonly:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'invalid' on 'copyout' clause}} + // expected-error@+1{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyout' clause}} #pragma acc serial copyout(invalid:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'invalid' on 'copyout' clause}} + // expected-error@+1{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyout' clause}} #pragma acc serial copyout(invalid:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} @@ -597,6 +613,15 @@ void VarListClauses() { #pragma acc serial copyout(invalid s.array[s.value : 
5], s.value), self for(int i = 0; i < 5;++i) {} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyout' clause}} + // expected-error@+1{{unknown modifier 'bar' in OpenACC modifier-list on 'copyout' clause}} +#pragma acc serial copyout(invalid, bar: s.array[s.value : 5], s.value), self + for(int i = 0; i < 5;++i) {} + + // expected-error@+1{{duplicate modifier 'zero' in OpenACC modifier-list on 'copyout' clause}} +#pragma acc serial copyout(zero, zero, always: s.array[s.value : 5], s.value), self + for(int i = 0; i < 5;++i) {} + // expected-error@+1{{expected ','}} #pragma acc serial create(s.array[s.value] s.array[s.value :5] ), self for(int i = 0; i < 5;++i) {} @@ -607,6 +632,20 @@ void VarListClauses() { #pragma acc serial create(zero:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc serial create(always, zero:s.array[s.value : 5], s.value), self + for(int i = 0; i < 5;++i) {} + + // expected-error@+2{{duplicate modifier 'always' in OpenACC modifier-list on 'create' clause}} + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc serial create(always, always, zero:s.array[s.value : 5], s.value), self + for(int i = 0; i < 5;++i) {} + + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'create' clause}} + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc serial create(always, invalid, zero:s.array[s.value : 5], s.value), self + for(int i = 0; i < 5;++i) {} + // expected-warning@+1{{OpenACC clause name 'pcreate' is a deprecated clause name and is now an alias for 'create'}} #pragma acc serial pcreate(s.array[s.value : 5], s.value) for(int i = 0; i < 5;++i) {} @@ -623,15 +662,15 @@ void VarListClauses() { #pragma acc serial create(zero s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'readonly' on 'create' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} #pragma acc serial create(readonly:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'invalid' on 'create' clause}} + // expected-error@+1{{unknown modifier 'invalid' in OpenACC modifier-list on 'create' clause}} #pragma acc serial create(invalid:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'invalid' on 'create' clause}} + // expected-error@+1{{unknown modifier 'invalid' in OpenACC modifier-list on 'create' clause}} #pragma acc serial create(invalid:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} @@ -666,15 +705,15 @@ void VarListClauses() { #pragma acc serial copyin(readonly s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'zero' on 'copyin' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} #pragma acc serial copyin(zero :s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'invalid' on 'copyin' clause}} + // expected-error@+1{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyin' clause}} #pragma acc serial copyin(invalid:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} - // expected-error@+1{{invalid tag 'invalid' on 'copyin' clause}} + // expected-error@+1{{unknown modifier 'invalid' in OpenACC modifier-list 
on 'copyin' clause}} #pragma acc serial copyin(invalid:s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} @@ -682,6 +721,15 @@ void VarListClauses() { // expected-error@+1{{expected ','}} #pragma acc serial copyin(invalid s.array[s.value : 5], s.value), self for(int i = 0; i < 5;++i) {} + + // expected-error@+2{{unknown modifier 'foo' in OpenACC modifier-list on 'copyin' clause}} + // expected-error@+1{{unknown modifier 'bar' in OpenACC modifier-list on 'copyin' clause}} +#pragma acc serial copyin(foo, bar: s.array[s.value : 5], s.value), self + for(int i = 0; i < 5;++i) {} + + // expected-error@+1{{duplicate modifier 'readonly' in OpenACC modifier-list on 'copyin' clause}} +#pragma acc serial copyin(always, readonly, readonly: s.array[s.value : 5], s.value), self + for(int i = 0; i < 5;++i) {} } void ReductionClauseParsing() { diff --git a/clang/test/SemaOpenACC/combined-construct-copy-ast.cpp b/clang/test/SemaOpenACC/combined-construct-copy-ast.cpp index a696badb0847c..75e5d0283889e 100644 --- a/clang/test/SemaOpenACC/combined-construct-copy-ast.cpp +++ b/clang/test/SemaOpenACC/combined-construct-copy-ast.cpp @@ -14,18 +14,18 @@ void NormalUses(float *PointerParam) { // CHECK: ParmVarDecl // CHECK-NEXT: CompoundStmt -#pragma acc parallel loop copy(GlobalArray) pcopy(PointerParam[Global]) present_or_copy(Global) +#pragma acc parallel loop copy(always:GlobalArray) pcopy(alwaysin:PointerParam[Global]) present_or_copy(alwaysout: Global) for (unsigned i = 0; i < 5; ++i); // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop - // CHECK-NEXT: copy clause + // CHECK-NEXT: copy clause modifiers: always // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopy clause + // CHECK-NEXT: pcopy clause modifiers: alwaysin // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copy clause + // CHECK-NEXT: present_or_copy clause modifiers: alwaysout // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: ForStmt // CHECK:NullStmt @@ -42,12 +42,12 @@ void TemplUses(T t, U u) { // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U' // CHECK-NEXT: CompoundStmt -#pragma acc parallel loop copy(t) pcopy(NTTP, u) present_or_copy(u[0:t]) +#pragma acc parallel loop copy(t) pcopy(always, alwaysin, alwaysout: NTTP, u) present_or_copy(u[0:t]) for (unsigned i = 0; i < 5; ++i); // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop // CHECK-NEXT: copy clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopy clause + // CHECK-NEXT: pcopy clause modifiers: always, alwaysin, alwaysout // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_copy clause @@ -75,7 +75,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop // CHECK-NEXT: copy clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopy clause + // CHECK-NEXT: pcopy clause modifiers: always, alwaysin, alwaysout // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 
0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/combined-construct-copy-clause.c b/clang/test/SemaOpenACC/combined-construct-copy-clause.c index c588fd8d94987..07c412c621ff1 100644 --- a/clang/test/SemaOpenACC/combined-construct-copy-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-copy-clause.c @@ -70,3 +70,18 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_copy(LocalInt) for(int i = 5; i < 10;++i); } +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'readonly' modifier not valid on 'copy' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc parallel loop copy(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copy' clause}} +#pragma acc serial loop copy(readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc kernels loop copy(zero: V1) + for(int i = 5; i < 10;++i); +#pragma acc parallel loop copy(always, alwaysin, alwaysout: V1) + for(int i = 5; i < 10;++i); +} diff --git a/clang/test/SemaOpenACC/combined-construct-copyin-ast.cpp b/clang/test/SemaOpenACC/combined-construct-copyin-ast.cpp index e7ac8238a7db6..f7b5cb841b966 100644 --- a/clang/test/SemaOpenACC/combined-construct-copyin-ast.cpp +++ b/clang/test/SemaOpenACC/combined-construct-copyin-ast.cpp @@ -14,18 +14,18 @@ void NormalUses(float *PointerParam) { // CHECK: ParmVarDecl // CHECK-NEXT: CompoundStmt -#pragma acc parallel loop copyin(GlobalArray) pcopyin(readonly:PointerParam[Global]) present_or_copyin(Global) +#pragma acc parallel loop copyin(GlobalArray) pcopyin(readonly:PointerParam[Global]) present_or_copyin(always, alwaysin: Global) for (unsigned i = 0; i < 5; ++i); // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copyin clause + // CHECK-NEXT: present_or_copyin clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: ForStmt // CHECK: NullStmt @@ -42,12 +42,12 @@ void TemplUses(T t, U u) { // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U' // CHECK-NEXT: CompoundStmt -#pragma acc parallel loop copyin(t) pcopyin(readonly: NTTP, u) present_or_copyin(u[0:t]) +#pragma acc parallel loop copyin(always, readonly, alwaysin: t) pcopyin(readonly: NTTP, u) present_or_copyin(u[0:t]) for (unsigned i = 0; i < 5; ++i); // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop - // CHECK-NEXT: copyin clause + // CHECK-NEXT: copyin clause modifiers: always, alwaysin, readonly // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: 
DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_copyin clause @@ -73,9 +73,9 @@ void TemplUses(T t, U u) { // #pragma acc parallel copyin(t) pcopyin(readonly: NTTP, u) present_or_copyin(u[0:t]) // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop - // CHECK-NEXT: copyin clause + // CHECK-NEXT: copyin clause modifiers: always, alwaysin, readonly // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/combined-construct-copyin-clause.c b/clang/test/SemaOpenACC/combined-construct-copyin-clause.c index 4cb635ddb2470..b4a6eafdb9ebd 100644 --- a/clang/test/SemaOpenACC/combined-construct-copyin-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-copyin-clause.c @@ -61,7 +61,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel loop copyin((float)ArrayParam[2]) for(int i = 0; i < 5; ++i); - // expected-error@+2{{invalid tag 'invalid' on 'copyin' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyin' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel loop copyin(invalid:(float)ArrayParam[2]) for(int i = 0; i < 5; ++i); @@ -76,3 +76,19 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_copyin(LocalInt) for(int i = 5; i < 10;++i); } + +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc parallel loop copyin(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} +#pragma acc serial loop copyin(alwaysout: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc kernels loop copyin(zero: V1) + for(int i = 5; i < 10;++i); +#pragma acc parallel loop copyin(always, alwaysin, readonly: V1) + for(int i = 5; i < 10;++i); +} diff --git a/clang/test/SemaOpenACC/combined-construct-copyout-ast.cpp b/clang/test/SemaOpenACC/combined-construct-copyout-ast.cpp index 02eda413dd53f..ec4451e9df7d3 100644 --- a/clang/test/SemaOpenACC/combined-construct-copyout-ast.cpp +++ b/clang/test/SemaOpenACC/combined-construct-copyout-ast.cpp @@ -14,18 +14,18 @@ void NormalUses(float *PointerParam) { // CHECK: ParmVarDecl // CHECK-NEXT: CompoundStmt -#pragma acc parallel loop copyout(GlobalArray) pcopyout(zero:PointerParam[Global]) present_or_copyout(Global) +#pragma acc parallel loop copyout(GlobalArray) pcopyout(zero:PointerParam[Global]) present_or_copyout(always, alwaysin: Global) for (unsigned i = 0; i < 5; ++i); // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop // CHECK-NEXT: copyout clause // CHECK-NEXT: 
DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copyout clause + // CHECK-NEXT: present_or_copyout clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: For // CHECK: NullStmt @@ -42,12 +42,12 @@ void TemplUses(T t, U u) { // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U' // CHECK-NEXT: CompoundStmt -#pragma acc parallel loop copyout(t) pcopyout(zero: NTTP, u) present_or_copyout(u[0:t]) +#pragma acc parallel loop copyout(always, alwaysin: t) pcopyout(zero: NTTP, u) present_or_copyout(u[0:t]) for (unsigned i = 0; i < 5; ++i); // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop - // CHECK-NEXT: copyout clause + // CHECK-NEXT: copyout clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_copyout clause @@ -73,9 +73,9 @@ void TemplUses(T t, U u) { // #pragma acc parallel copyout(t) pcopyout(zero: NTTP, u) present_or_copyout(u[0:t]) // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop - // CHECK-NEXT: copyout clause + // CHECK-NEXT: copyout clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/combined-construct-copyout-clause.c b/clang/test/SemaOpenACC/combined-construct-copyout-clause.c index c43f9592062fa..6621adb5c6124 100644 --- a/clang/test/SemaOpenACC/combined-construct-copyout-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-copyout-clause.c @@ -61,7 +61,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel loop copyout((float)ArrayParam[2]) for(int i = 0; i < 5; ++i); - // expected-error@+2{{invalid tag 'invalid' on 'copyout' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyout' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel loop copyout(invalid:(float)ArrayParam[2]) for(int i = 0; i < 5; ++i); @@ -76,3 +76,19 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_copyout(LocalInt) for(int i = 0; i < 6;++i); } +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'alwaysout' 
modifier not valid on 'copyout' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc parallel loop copyout(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 0; i < 6;++i); + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} +#pragma acc serial loop copyout(alwaysout: V1) + for(int i = 0; i < 6;++i); + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc kernels loop copyout(readonly: V1) + for(int i = 0; i < 6;++i); +#pragma acc parallel loop copyout(always, alwaysin, zero: V1) + for(int i = 0; i < 6;++i); +} + diff --git a/clang/test/SemaOpenACC/combined-construct-create-ast.cpp b/clang/test/SemaOpenACC/combined-construct-create-ast.cpp index 16268eda4e5d2..888654e690c0b 100644 --- a/clang/test/SemaOpenACC/combined-construct-create-ast.cpp +++ b/clang/test/SemaOpenACC/combined-construct-create-ast.cpp @@ -19,7 +19,7 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' @@ -47,7 +47,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_create clause @@ -75,7 +75,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/combined-construct-create-clause.c b/clang/test/SemaOpenACC/combined-construct-create-clause.c index c17e6921a7da2..bf7dfe83a0511 100644 --- a/clang/test/SemaOpenACC/combined-construct-create-clause.c +++ b/clang/test/SemaOpenACC/combined-construct-create-clause.c @@ -61,7 +61,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel loop create((float)ArrayParam[2]) for(int i = 0; i < 5; ++i); - // expected-error@+2{{invalid tag 'invalid' on 'create' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'create' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel loop create(invalid:(float)ArrayParam[2]) for(int i = 0; i < 5; ++i); @@ -76,3 
+76,27 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_create(LocalInt) for(int i = 5; i < 10;++i); } + +void ModList() { + int V1; + // expected-error@+4{{OpenACC 'always' modifier not valid on 'create' clause}} + // expected-error@+3{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc parallel loop create(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc serial loop create(always: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} +#pragma acc kernels loop create(alwaysin: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} +#pragma acc parallel loop create(alwaysout: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc serial loop create(readonly: V1) + for(int i = 5; i < 10;++i); +#pragma acc kernels loop create(zero: V1) + for(int i = 5; i < 10;++i); +} diff --git a/clang/test/SemaOpenACC/compute-construct-copy-clause.c b/clang/test/SemaOpenACC/compute-construct-copy-clause.c index c4a9963ef4c7c..e83bdab64c246 100644 --- a/clang/test/SemaOpenACC/compute-construct-copy-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-copy-clause.c @@ -70,3 +70,18 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_copy(LocalInt) for(int i = 5; i < 10;++i); } +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'readonly' modifier not valid on 'copy' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc parallel copy(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copy' clause}} +#pragma acc serial copy(readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc kernels copy(zero: V1) + for(int i = 5; i < 10;++i); +#pragma acc parallel copy(always, alwaysin, alwaysout: V1) + for(int i = 5; i < 10;++i); +} diff --git a/clang/test/SemaOpenACC/compute-construct-copyin-clause.c b/clang/test/SemaOpenACC/compute-construct-copyin-clause.c index 84b5c29f3bf67..eaa8a604df32a 100644 --- a/clang/test/SemaOpenACC/compute-construct-copyin-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-copyin-clause.c @@ -61,7 +61,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel copyin((float)ArrayParam[2]) while(1); - // expected-error@+2{{invalid tag 'invalid' on 'copyin' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyin' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel copyin(invalid:(float)ArrayParam[2]) while(1); @@ -76,3 +76,18 @@ void uses(int IntParam, short 
*PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_copyin(LocalInt) for(int i = 5; i < 10;++i); } +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc parallel copyin(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} +#pragma acc serial copyin(alwaysout: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc kernels copyin(zero: V1) + for(int i = 5; i < 10;++i); +#pragma acc parallel copyin(always, alwaysin, readonly: V1) + for(int i = 5; i < 10;++i); +} diff --git a/clang/test/SemaOpenACC/compute-construct-copyout-clause.c b/clang/test/SemaOpenACC/compute-construct-copyout-clause.c index da64be291494e..f1ea21d0824cc 100644 --- a/clang/test/SemaOpenACC/compute-construct-copyout-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-copyout-clause.c @@ -61,7 +61,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel copyout((float)ArrayParam[2]) while(1); - // expected-error@+2{{invalid tag 'invalid' on 'copyout' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyout' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel copyout(invalid:(float)ArrayParam[2]) while(1); @@ -76,3 +76,18 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_copyout(LocalInt) for(int i = 0; i < 6;++i); } +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc parallel copyout(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 0; i < 6;++i); + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} +#pragma acc serial copyout(alwaysout: V1) + for(int i = 0; i < 6;++i); + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc kernels copyout(readonly: V1) + for(int i = 0; i < 6;++i); +#pragma acc parallel copyout(always, alwaysin, zero: V1) + for(int i = 0; i < 6;++i); +} diff --git a/clang/test/SemaOpenACC/compute-construct-create-clause.c b/clang/test/SemaOpenACC/compute-construct-create-clause.c index d54a82a4e7dfa..926c5b88a5115 100644 --- a/clang/test/SemaOpenACC/compute-construct-create-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-create-clause.c @@ -62,7 +62,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel create((float)ArrayParam[2]) while(1); - // expected-error@+2{{invalid tag 'invalid' on 'create' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'create' clause}} // expected-error@+1{{OpenACC variable is not a valid 
variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc parallel create(invalid:(float)ArrayParam[2]) while(1); @@ -77,3 +77,26 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc loop present_or_create(LocalInt) for(int i = 5; i < 10;++i); } +void ModList() { + int V1; + // expected-error@+4{{OpenACC 'always' modifier not valid on 'create' clause}} + // expected-error@+3{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc parallel create(always, alwaysin, alwaysout, zero, readonly: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc serial create(always: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} +#pragma acc kernels create(alwaysin: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} +#pragma acc parallel create(alwaysout: V1) + for(int i = 5; i < 10;++i); + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc serial create(readonly: V1) + for(int i = 5; i < 10;++i); +#pragma acc kernels loop create(zero: V1) + for(int i = 5; i < 10;++i); +} diff --git a/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp index 1bfd4e8af6481..df01c59349665 100644 --- a/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp +++ b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp @@ -99,52 +99,52 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -#pragma acc parallel copy(GlobalArray) pcopy(PointerParam[Global]) present_or_copy(Global) +#pragma acc parallel copy(GlobalArray) pcopy(always: PointerParam[Global]) present_or_copy(alwaysin, alwaysout: Global) while(true); // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel - // CHECK-NEXT: copy clause + // CHECK-NEXT: copy clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopy clause + // CHECK-NEXT: pcopy clause modifiers: always // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copy clause + // CHECK-NEXT: present_or_copy clause modifiers: alwaysin, alwaysout // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: WhileStmt // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -#pragma acc parallel copyin(GlobalArray) pcopyin(readonly: PointerParam[Global]) present_or_copyin(Global) +#pragma acc parallel copyin(GlobalArray) pcopyin(readonly: PointerParam[Global]) present_or_copyin(always, alwaysin: Global) while(true); // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // 
CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copyin clause + // CHECK-NEXT: present_or_copyin clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: WhileStmt // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -#pragma acc parallel copyout(GlobalArray) pcopyout(zero:PointerParam[Global]) present_or_copyout(Global) +#pragma acc parallel copyout(GlobalArray) pcopyout(zero:PointerParam[Global]) present_or_copyout(always, alwaysin: Global) while(true); // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copyout clause + // CHECK-NEXT: present_or_copyout clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: WhileStmt // CHECK-NEXT: CXXBoolLiteralExpr @@ -155,7 +155,7 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' @@ -329,15 +329,15 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -#pragma acc parallel copy(t) pcopy(NTTP, u) present_or_copy(u[0:t]) +#pragma acc parallel copy(t) pcopy(always: NTTP, u) present_or_copy(alwaysin, alwaysout: u[0:t]) while(true); // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copy clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopy clause + // CHECK-NEXT: pcopy clause modifiers: always // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' - // CHECK-NEXT: present_or_copy clause + // CHECK-NEXT: present_or_copy clause modifiers: alwaysin, alwaysout // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0 @@ -346,15 +346,15 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -#pragma acc parallel copyin(t) pcopyin(readonly:NTTP, u) present_or_copyin(u[0:t]) +#pragma acc parallel copyin(t) pcopyin(readonly:NTTP, u) present_or_copyin(always, alwaysin: u[0:t]) while(true); // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // 
CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' - // CHECK-NEXT: present_or_copyin clause + // CHECK-NEXT: present_or_copyin clause modifiers: always, alwaysin // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0 @@ -363,15 +363,15 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -#pragma acc parallel copyout(t) pcopyout(zero:NTTP, u) present_or_copyout(u[0:t]) +#pragma acc parallel copyout(t) pcopyout(zero:NTTP, u) present_or_copyout(always, alwaysin: u[0:t]) while(true); // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' - // CHECK-NEXT: present_or_copyout clause + // CHECK-NEXT: present_or_copyout clause modifiers: always, alwaysin // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0 @@ -385,7 +385,7 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_create clause @@ -529,16 +529,16 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -//#pragma acc parallel copy(t) pcopy(NTTP, u) copy_or_present(u[0:t]) +//#pragma acc parallel copy(t) pcopy(always: NTTP, u) present_or_copy(alwaysin, alwaysout: u[0:t]) // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copy clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopy clause + // CHECK-NEXT: pcopy clause modifiers: always // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' - // CHECK-NEXT: present_or_copy clause + // CHECK-NEXT: present_or_copy clause modifiers: alwaysin, alwaysout // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' @@ -549,16 +549,16 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -//#pragma acc parallel copyin(t) pcopyin(readonly:NTTP, u) present_or_copyin(u[0:t]) +//#pragma acc parallel copyin(t) pcopyin(readonly:NTTP, u) present_or_copyin(always, alwaysin: u[0:t]) // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: 
SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' - // CHECK-NEXT: present_or_copyin clause + // CHECK-NEXT: present_or_copyin clause modifiers: always, alwaysin // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' @@ -569,16 +569,16 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt -//#pragma acc parallel copyout(t) pcopyout(zero:NTTP, u) present_or_copyout(u[0:t]) +// #pragma acc parallel copyout(t) pcopyout(zero:NTTP, u) present_or_copyout(always, alwaysin: u[0:t]) // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' - // CHECK-NEXT: present_or_copyout clause + // CHECK-NEXT: present_or_copyout clause modifiers: always, alwaysin // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' @@ -593,7 +593,7 @@ void TemplUses(T t, U u, T*PointerParam) { // CHECK-NEXT: OpenACCComputeConstruct{{.*}} parallel // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/data-construct-copy-ast.cpp b/clang/test/SemaOpenACC/data-construct-copy-ast.cpp index de067f00a2b29..8c0a12f244257 100644 --- a/clang/test/SemaOpenACC/data-construct-copy-ast.cpp +++ b/clang/test/SemaOpenACC/data-construct-copy-ast.cpp @@ -14,18 +14,18 @@ void NormalUses(float *PointerParam) { // CHECK: FunctionDecl{{.*}}NormalUses // CHECK: ParmVarDecl // CHECK-NEXT: CompoundStmt -#pragma acc data copy(GlobalArray) pcopy(PointerParam[Global]) present_or_copy(Global) +#pragma acc data copy(GlobalArray) pcopy(always, alwaysin: PointerParam[Global]) present_or_copy(alwaysout: Global) ; // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: copy clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopy clause + // CHECK-NEXT: pcopy clause modifiers: always, alwaysin // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copy clause + // CHECK-NEXT: present_or_copy 
clause modifiers: alwaysout // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: NullStmt } @@ -40,15 +40,15 @@ void TemplUses(T t, U u) { // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U' // CHECK-NEXT: CompoundStmt -#pragma acc data copy(t) pcopy(NTTP, u) present_or_copy(u[0:t]) +#pragma acc data copy(always: t) pcopy(NTTP, u) present_or_copy(alwaysin, alwaysout: u[0:t]) ; // CHECK-NEXT: OpenACCDataConstruct{{.*}} data - // CHECK-NEXT: copy clause + // CHECK-NEXT: copy clause modifiers: always // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' // CHECK-NEXT: pcopy clause // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' - // CHECK-NEXT: present_or_copy clause + // CHECK-NEXT: present_or_copy clause modifiers: alwaysin, alwaysout // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0 @@ -69,14 +69,14 @@ void TemplUses(T t, U u) { // CHECK-NEXT: CompoundStmt // CHECK-NEXT: OpenACCDataConstruct{{.*}} data - // CHECK-NEXT: copy clause + // CHECK-NEXT: copy clause modifiers: always // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' // CHECK-NEXT: pcopy clause // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' - // CHECK-NEXT: present_or_copy clause + // CHECK-NEXT: present_or_copy clause modifiers: alwaysin, alwaysout // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' diff --git a/clang/test/SemaOpenACC/data-construct-copy-clause.c b/clang/test/SemaOpenACC/data-construct-copy-clause.c index 882a0bc87e003..0b2b0534073ed 100644 --- a/clang/test/SemaOpenACC/data-construct-copy-clause.c +++ b/clang/test/SemaOpenACC/data-construct-copy-clause.c @@ -67,3 +67,19 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc host_data present_or_copy(LocalInt) ; } +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'readonly' modifier not valid on 'copy' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc data copy(always, alwaysin, alwaysout, zero, readonly: V1) + ; + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copy' clause}} +#pragma acc data copy(readonly: V1) + ; + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc data copy(zero: V1) + ; +#pragma acc data copy(always, alwaysin, alwaysout: V1) +; +} + diff --git a/clang/test/SemaOpenACC/data-construct-copyin-ast.cpp b/clang/test/SemaOpenACC/data-construct-copyin-ast.cpp index fd21d60c84431..a3c8e7a36f9f8 100644 --- a/clang/test/SemaOpenACC/data-construct-copyin-ast.cpp +++ b/clang/test/SemaOpenACC/data-construct-copyin-ast.cpp @@ -19,7 +19,7 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' 
lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' @@ -29,17 +29,17 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: NullStmt -#pragma acc enter data copyin(GlobalArray) pcopyin(readonly:PointerParam[Global]) present_or_copyin(Global) +#pragma acc enter data copyin(GlobalArray) pcopyin(readonly:PointerParam[Global]) present_or_copyin(readonly, always: Global) // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copyin clause + // CHECK-NEXT: present_or_copyin clause modifiers: always, readonly // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' } @@ -54,15 +54,15 @@ void TemplUses(T t, U u) { // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U' // CHECK-NEXT: CompoundStmt -#pragma acc data copyin(t) pcopyin(readonly: NTTP, u) present_or_copyin(u[0:t]) +#pragma acc data copyin(t) pcopyin(readonly: NTTP, u) present_or_copyin(readonly, always: u[0:t]) ; // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' - // CHECK-NEXT: present_or_copyin clause + // CHECK-NEXT: present_or_copyin clause modifiers: always, readonly // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0 @@ -73,7 +73,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_copyin clause @@ -98,12 +98,12 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' - // CHECK-NEXT: present_or_copyin clause + // CHECK-NEXT: present_or_copyin clause modifiers: always, readonly // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' // CHECK-NEXT: 
DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' @@ -115,7 +115,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyin clause : readonly + // CHECK-NEXT: pcopyin clause modifiers: readonly // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/data-construct-copyin-clause.c b/clang/test/SemaOpenACC/data-construct-copyin-clause.c index 370cc7000f8d8..edc3f0a2e91fe 100644 --- a/clang/test/SemaOpenACC/data-construct-copyin-clause.c +++ b/clang/test/SemaOpenACC/data-construct-copyin-clause.c @@ -58,7 +58,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc data copyin((float)ArrayParam[2]) ; - // expected-error@+2{{invalid tag 'invalid' on 'copyin' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyin' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc data copyin(invalid:(float)ArrayParam[2]) ; @@ -71,3 +71,24 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc host_data pcopyin(LocalInt) ; } + +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc data copyin(always, alwaysin, alwaysout, zero, readonly: V1) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} +#pragma acc data copyin(alwaysout: V1) + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc data copyin(zero: V1) +#pragma acc data copyin(always, alwaysin, readonly: V1) + + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc enter data copyin(always, alwaysin, alwaysout, zero, readonly: V1) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} +#pragma acc enter data copyin(alwaysout: V1) + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc enter data copyin(zero: V1) +#pragma acc enter data copyin(always, alwaysin, readonly: V1) +} diff --git a/clang/test/SemaOpenACC/data-construct-copyout-ast.cpp b/clang/test/SemaOpenACC/data-construct-copyout-ast.cpp index 38e6e7b476fe5..069ced7de83d8 100644 --- a/clang/test/SemaOpenACC/data-construct-copyout-ast.cpp +++ b/clang/test/SemaOpenACC/data-construct-copyout-ast.cpp @@ -19,7 +19,7 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // 
CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' @@ -29,17 +29,17 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' // CHECK-NEXT: NullStmt -#pragma acc exit data copyout(GlobalArray) pcopyout(zero:PointerParam[Global]) present_or_copyout(Global) +#pragma acc exit data copyout(GlobalArray) pcopyout(zero:PointerParam[Global]) present_or_copyout(always, zero: Global) // CHECK-NEXT: OpenACCExitDataConstruct{{.*}} exit data // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' - // CHECK-NEXT: present_or_copyout clause + // CHECK-NEXT: present_or_copyout clause modifiers: always, zero // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int' } @@ -54,15 +54,15 @@ void TemplUses(T t, U u) { // CHECK-NEXT: ParmVarDecl{{.*}} referenced u 'U' // CHECK-NEXT: CompoundStmt -#pragma acc data copyout(t) pcopyout(zero: NTTP, u) present_or_copyout(u[0:t]) +#pragma acc data copyout(t) pcopyout(zero: NTTP, u) present_or_copyout(alwaysin: u[0:t]) ; // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' - // CHECK-NEXT: present_or_copyout clause + // CHECK-NEXT: present_or_copyout clause modifiers: alwaysin // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: IntegerLiteral{{.*}} 'int' 0 @@ -73,7 +73,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCExitDataConstruct{{.*}} exit data // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_copyout clause @@ -98,12 +98,12 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' - // CHECK-NEXT: present_or_copyout clause + // CHECK-NEXT: present_or_copyout clause modifiers: alwaysin // CHECK-NEXT: ArraySectionExpr // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int *' // CHECK-NEXT: DeclRefExpr{{.*}}'int *' lvalue ParmVar{{.*}} 'u' 'int *' @@ -115,7 +115,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: 
OpenACCExitDataConstruct{{.*}} exit data // CHECK-NEXT: copyout clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcopyout clause : zero + // CHECK-NEXT: pcopyout clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/data-construct-copyout-clause.c b/clang/test/SemaOpenACC/data-construct-copyout-clause.c index 0f9d5f2ad5c83..8d137e093db0e 100644 --- a/clang/test/SemaOpenACC/data-construct-copyout-clause.c +++ b/clang/test/SemaOpenACC/data-construct-copyout-clause.c @@ -58,7 +58,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc data copyout((float)ArrayParam[2]) ; - // expected-error@+2{{invalid tag 'invalid' on 'copyout' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'copyout' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc data copyout(invalid:(float)ArrayParam[2]) ; @@ -71,3 +71,25 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo #pragma acc host_data pcopyout(LocalInt) ; } + +void ModList() { + int V1; + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc data copyout(always, alwaysin, alwaysout, zero, readonly: V1) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} +#pragma acc data copyout(alwaysout: V1) + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc data copyout(readonly: V1) +#pragma acc data copyout(always, alwaysin, zero: V1) + + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc exit data copyout(always, alwaysin, alwaysout, zero, readonly: V1) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} +#pragma acc exit data copyout(alwaysout: V1) + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc exit data copyout(readonly: V1) +#pragma acc exit data copyout(always, alwaysin, zero: V1) +} + diff --git a/clang/test/SemaOpenACC/data-construct-create-ast.cpp b/clang/test/SemaOpenACC/data-construct-create-ast.cpp index 623d44aac43f9..a147125aff29b 100644 --- a/clang/test/SemaOpenACC/data-construct-create-ast.cpp +++ b/clang/test/SemaOpenACC/data-construct-create-ast.cpp @@ -19,7 +19,7 @@ void NormalUses(float *PointerParam) { // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' @@ -33,7 +33,7 
@@ void NormalUses(float *PointerParam) { // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *' @@ -59,7 +59,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_create clause @@ -73,7 +73,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &' // CHECK-NEXT: DeclRefExpr{{.*}}'U' lvalue ParmVar{{.*}} 'u' 'U' // CHECK-NEXT: present_or_create clause @@ -98,7 +98,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCDataConstruct{{.*}} data // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' @@ -115,7 +115,7 @@ void TemplUses(T t, U u) { // CHECK-NEXT: OpenACCEnterDataConstruct{{.*}} enter data // CHECK-NEXT: create clause // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int' - // CHECK-NEXT: pcreate clause : zero + // CHECK-NEXT: pcreate clause modifiers: zero // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int' diff --git a/clang/test/SemaOpenACC/data-construct-create-clause.c b/clang/test/SemaOpenACC/data-construct-create-clause.c index 4972bdca4b85d..e49d53b17ee82 100644 --- a/clang/test/SemaOpenACC/data-construct-create-clause.c +++ b/clang/test/SemaOpenACC/data-construct-create-clause.c @@ -58,7 +58,7 @@ void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete Compo // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc data create((float)ArrayParam[2]) ; - // expected-error@+2{{invalid tag 'invalid' on 'create' clause}} + // expected-error@+2{{unknown modifier 'invalid' in OpenACC modifier-list on 'create' clause}} // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} #pragma acc data create(invalid:(float)ArrayParam[2]) ; @@ -71,3 +71,36 @@ void uses(int IntParam, short *PointerParam, 
float ArrayParam[5], Complete Compo #pragma acc host_data pcreate(LocalInt) ; } + +void ModList() { + int V1; + // expected-error@+4{{OpenACC 'always' modifier not valid on 'create' clause}} + // expected-error@+3{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc data create(always, alwaysin, alwaysout, zero, readonly: V1) + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc data create(always: V1) + // expected-error@+1{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} +#pragma acc data create(alwaysin: V1) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} +#pragma acc data create(alwaysout: V1) + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc data create(readonly: V1) +#pragma acc data create(zero: V1) + + // expected-error@+4{{OpenACC 'always' modifier not valid on 'create' clause}} + // expected-error@+3{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc enter data create(always, alwaysin, alwaysout, zero, readonly: V1) + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc enter data create(always: V1) + // expected-error@+1{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} +#pragma acc enter data create(alwaysin: V1) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} +#pragma acc enter data create(alwaysout: V1) + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc enter data create(readonly: V1) +#pragma acc enter data create(zero: V1) +} diff --git a/clang/test/SemaOpenACC/declare-construct-ast.cpp b/clang/test/SemaOpenACC/declare-construct-ast.cpp index ab49c9e42ea4e..54b40ff6e93de 100644 --- a/clang/test/SemaOpenACC/declare-construct-ast.cpp +++ b/clang/test/SemaOpenACC/declare-construct-ast.cpp @@ -11,11 +11,11 @@ int *Global; // CHECK: VarDecl{{.*}}Global 'int *' int GlobalArray[5]; // CHECK-NEXT: VarDecl{{.*}}GlobalArray 'int[5]' -#pragma acc declare deviceptr(Global), copyin(GlobalArray) +#pragma acc declare deviceptr(Global), copyin(readonly, always: GlobalArray) // CHECK-NEXT: OpenACCDeclareDecl // CHECK-NEXT: deviceptr clause // CHECK-NEXT: DeclRefExpr{{.*}}'Global' 'int *' -// CHECK-NEXT: copyin clause +// CHECK-NEXT: copyin clause modifiers: always, readonly // CHECK-NEXT: DeclRefExpr{{.*}}'GlobalArray' 'int[5]' int *Global1; @@ -162,12 +162,12 @@ struct DependentStruct { static constexpr T StaticMemArray2[5] = {}; // CHECK-NEXT: VarDecl{{.*}} StaticMemArray2 'const T[5]' // CHECK-NEXT: InitListExpr{{.*}}'void' -#pragma acc declare copyin(StaticMem, StaticMemArray) create(StaticMem2, StaticMemArray2) +#pragma acc declare copyin(StaticMem, StaticMemArray) create(zero: StaticMem2, StaticMemArray2) // CHECK-NEXT: OpenACCDeclareDecl // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'StaticMem' 'const T' // CHECK-NEXT: DeclRefExpr{{.*}}'StaticMemArray' 'const T[5]' - // CHECK-NEXT: create clause + // CHECK-NEXT: create clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'StaticMem2' 'const T' // CHECK-NEXT: 
DeclRefExpr{{.*}}'StaticMemArray2' 'const T[5]' @@ -189,14 +189,14 @@ struct DependentStruct { U LocalArray2[5]; // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl{{.*}} LocalArray2 'U[5]' -#pragma acc declare copy(Arg, Local, LocalArray) copyout(Arg2, Local2, LocalArray2) +#pragma acc declare copy(always, alwaysin: Arg, Local, LocalArray) copyout(zero: Arg2, Local2, LocalArray2) // CHECK-NEXT: DeclStmt // CHECK-NEXT: OpenACCDeclareDecl - // CHECK-NEXT: copy clause + // CHECK-NEXT: copy clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'Arg' 'U' // CHECK-NEXT: DeclRefExpr{{.*}}'Local' 'T' // CHECK-NEXT: DeclRefExpr{{.*}}'LocalArray' 'U[5]' - // CHECK-NEXT: copyout clause + // CHECK-NEXT: copyout clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'Arg2' 'U' // CHECK-NEXT: DeclRefExpr{{.*}}'Local2' 'T' // CHECK-NEXT: DeclRefExpr{{.*}}'LocalArray2' 'U[5]' @@ -251,7 +251,7 @@ struct DependentStruct { // CHECK-NEXT: copyin clause // CHECK-NEXT: DeclRefExpr{{.*}}'StaticMem' 'const int' // CHECK-NEXT: DeclRefExpr{{.*}}'StaticMemArray' 'const int[5]' -// CHECK-NEXT: create clause +// CHECK-NEXT: create clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'StaticMem2' 'const int' // CHECK-NEXT: DeclRefExpr{{.*}}'StaticMemArray2' 'const int[5]' @@ -279,11 +279,11 @@ struct DependentStruct { // CHECK-NEXT: DeclStmt // CHECK-NEXT: OpenACCDeclareDecl -// CHECK-NEXT: copy clause +// CHECK-NEXT: copy clause modifiers: always, alwaysin // CHECK-NEXT: DeclRefExpr{{.*}}'Arg' 'float' // CHECK-NEXT: DeclRefExpr{{.*}}'Local' 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'LocalArray' 'float[5]' -// CHECK-NEXT: copyout clause +// CHECK-NEXT: copyout clause modifiers: zero // CHECK-NEXT: DeclRefExpr{{.*}}'Arg2' 'float' // CHECK-NEXT: DeclRefExpr{{.*}}'Local2' 'int' // CHECK-NEXT: DeclRefExpr{{.*}}'LocalArray2' 'float[5]' diff --git a/clang/test/SemaOpenACC/declare-construct.cpp b/clang/test/SemaOpenACC/declare-construct.cpp index a1fed096635fa..25038b5bf242c 100644 --- a/clang/test/SemaOpenACC/declare-construct.cpp +++ b/clang/test/SemaOpenACC/declare-construct.cpp @@ -306,3 +306,48 @@ struct Struct2 { } }; +void ModList() { + int V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, + V11, V12, V13, V14, V15, V16, V17, V18; + // expected-error@+2{{OpenACC 'readonly' modifier not valid on 'copy' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc declare copy(always, alwaysin, alwaysout, zero, readonly: V1) + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copy' clause}} +#pragma acc declare copy(readonly: V2) + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copy' clause}} +#pragma acc declare copy(zero: V3) +#pragma acc declare copy(always, alwaysin, alwaysout: V4) + + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc declare copyin(always, alwaysin, alwaysout, zero, readonly: V5) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyin' clause}} +#pragma acc declare copyin(alwaysout: V6) + // expected-error@+1{{OpenACC 'zero' modifier not valid on 'copyin' clause}} +#pragma acc declare copyin(zero: V7) +#pragma acc declare copyin(always, alwaysin, readonly: V8) + + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc declare copyout(always, alwaysin, alwaysout, zero, 
readonly: V9) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'copyout' clause}} +#pragma acc declare copyout(alwaysout: V10) + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'copyout' clause}} +#pragma acc declare copyout(readonly: V11) +#pragma acc declare copyout(always, alwaysin, zero: V12) + + // expected-error@+4{{OpenACC 'always' modifier not valid on 'create' clause}} + // expected-error@+3{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} + // expected-error@+2{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc declare create(always, alwaysin, alwaysout, zero, readonly: V13) + // expected-error@+1{{OpenACC 'always' modifier not valid on 'create' clause}} +#pragma acc declare create(always: V14) + // expected-error@+1{{OpenACC 'alwaysin' modifier not valid on 'create' clause}} +#pragma acc declare create(alwaysin: V15) + // expected-error@+1{{OpenACC 'alwaysout' modifier not valid on 'create' clause}} +#pragma acc declare create(alwaysout: V16) + // expected-error@+1{{OpenACC 'readonly' modifier not valid on 'create' clause}} +#pragma acc declare create(readonly: V17) +#pragma acc declare create(zero: V18) +} From 9069ba183d0ad56a1e7b9710d9198686bc9b888b Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Fri, 4 Apr 2025 12:11:26 -0700 Subject: [PATCH 0692/1029] [RISCV] Rename Spimm to StackAdj in most places. NFC Spimm in the spec refers to the 2-bit encoded value. All of the code uses the 0, 16, 32, or 48 adjustment value. Also remove decodeZcmpSpimm, as it's identical to the default behavior when no custom DecoderMethod is given. --- .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 32 +++++++++---------- .../RISCV/Disassembler/RISCVDisassembler.cpp | 10 ++---- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 2 +- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 14 ++++---- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 4 +-- llvm/lib/Target/RISCV/RISCVInstrInfoZc.td | 14 ++++---- 6 files changed, 34 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index bc725ea939aec..6c246176a05e8 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -350,7 +350,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { FRM, Fence, RegList, - Spimm, + StackAdj, RegReg, } Kind; @@ -392,7 +392,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { unsigned Encoding; }; - struct SpimmOp { + struct StackAdjOp { unsigned Val; }; @@ -412,7 +412,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { FRMOp FRM; FenceOp Fence; RegListOp RegList; - SpimmOp Spimm; + StackAdjOp StackAdj; RegRegOp RegReg; }; @@ -451,8 +451,8 @@ struct RISCVOperand final : public MCParsedAsmOperand { case KindTy::RegList: RegList = o.RegList; break; - case KindTy::Spimm: - Spimm = o.Spimm; + case KindTy::StackAdj: + StackAdj = o.StackAdj; break; case KindTy::RegReg: RegReg = o.RegReg; break; @@ -486,7 +486,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isRegListS0() const { return Kind == KindTy::RegList && RegList.Encoding != RISCVZC::RA; } - bool isSpimm() const { return Kind == KindTy::Spimm; } + bool isStackAdj() const { return Kind == KindTy::StackAdj; } bool isGPR() const { return Kind == KindTy::Register && @@ -1014,9 +1014,9 @@ struct RISCVOperand final : public MCParsedAsmOperand { 
RISCVZC::printRegList(RegList.Encoding, OS); OS << '>'; break; - case KindTy::Spimm: - OS << "<Spimm: " << Spimm.Val << '>'; + case KindTy::StackAdj: + OS << "<StackAdj: " << StackAdj.Val << '>'; break; case KindTy::RegReg: @@ -1116,9 +1116,9 @@ struct RISCVOperand final : public MCParsedAsmOperand { return Op; } - static std::unique_ptr<RISCVOperand> createSpimm(unsigned Spimm, SMLoc S) { - auto Op = std::make_unique<RISCVOperand>(KindTy::Spimm); - Op->Spimm.Val = Spimm; + static std::unique_ptr<RISCVOperand> createStackAdj(unsigned StackAdj, SMLoc S) { + auto Op = std::make_unique<RISCVOperand>(KindTy::StackAdj); + Op->StackAdj.Val = StackAdj; Op->StartLoc = S; return Op; } @@ -1194,9 +1194,9 @@ struct RISCVOperand final : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createReg(RegReg.Reg2)); } - void addSpimmOperands(MCInst &Inst, unsigned N) const { + void addStackAdjOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createImm(Spimm.Val)); + Inst.addOperand(MCOperand::createImm(StackAdj.Val)); } void addFRMArgOperands(MCInst &Inst, unsigned N) const { @@ -2699,8 +2699,8 @@ ParseStatus RISCVAsmParser::parseZcmpStackAdj(OperandVector &Operands, "be a multiple of 16 bytes in the range"); } - unsigned Spimm = (StackAdjustment - StackAdjBase) / 16; - Operands.push_back(RISCVOperand::createSpimm(Spimm << 4, S)); + unsigned StackAdj = (StackAdjustment - StackAdjBase); + Operands.push_back(RISCVOperand::createStackAdj(StackAdj, S)); Lex(); return ParseStatus::Success; } diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 099490173bf08..716299ab896d1 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -527,8 +527,8 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus decodeZcmpSpimm(MCInst &Inst, uint32_t Imm, - uint64_t Address, const void *Decoder); +static DecodeStatus decodeZcmpStackAdj(MCInst &Inst, uint32_t Imm, + uint64_t Address, const void *Decoder); static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, uint64_t Address, @@ -668,12 +668,6 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, return MCDisassembler::Success; } -static DecodeStatus decodeZcmpSpimm(MCInst &Inst, uint32_t Imm, - uint64_t Address, const void *Decoder) { - Inst.addOperand(MCOperand::createImm(Imm)); - return MCDisassembler::Success; -} - // Add implied SP operand for C.*SP compressed instructions. The SP operand // isn't explicitly encoded in the instruction. void RISCVDisassembler::addSPOperands(MCInst &MI) const { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index f4d18dec054c1..d7af9d79c4cde 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -352,7 +352,7 @@ enum OperandType : unsigned { OPERAND_RVKRNUM_2_14, OPERAND_RLIST, OPERAND_RLIST_S0, - OPERAND_SPIMM, + OPERAND_STACKADJ, // Operand is a 3-bit rounding mode, '111' indicates FRM register. // Represents 'frm' argument passing to floating-point operations. 
OPERAND_FRMARG, diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 41051e46f1bb1..c7b2b781422d1 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -962,10 +962,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // stack space. Align the stack size down to a multiple of 16. This is // needed for RVE. // FIXME: Can we increase the stack size to a multiple of 16 instead? - uint64_t Spimm = + uint64_t StackAdj = std::min(alignDown(StackSize, 16), static_cast<uint64_t>(48)); - FirstFrameSetup->getOperand(1).setImm(Spimm); - StackSize -= Spimm; + FirstFrameSetup->getOperand(1).setImm(StackAdj); + StackSize -= StackAdj; unsigned CFIIndex = MF.addFrameInst( MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize - StackSize)); @@ -1278,10 +1278,10 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, // space. Align the stack size down to a multiple of 16. This is needed for // RVE. // FIXME: Can we increase the stack size to a multiple of 16 instead? - uint64_t Spimm = + uint64_t StackAdj = std::min(alignDown(StackSize, 16), static_cast<uint64_t>(48)); - MBBI->getOperand(1).setImm(Spimm); - StackSize -= Spimm; + MBBI->getOperand(1).setImm(StackAdj); + StackSize -= StackAdj; if (StackSize != 0) deallocateStack(MF, MBB, MBBI, DL, StackSize, @@ -1984,7 +1984,7 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( MBB.addLiveIn(Reg); // TODO: Handle QCI Interrupt + Push/Pop } else if (RVFI->isPushable(*MF)) { - // Emit CM.PUSH with base SPimm & evaluate Push stack + // Emit CM.PUSH with base StackAdj & evaluate Push stack unsigned PushedRegNum = RVFI->getRVPushRegs(); if (PushedRegNum > 0) { // Use encoded number to represent registers to spill. 
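[For illustration only, not part of the patch: a minimal sketch of the spimm/stack-adjustment relationship the commit message describes, using hypothetical helper names. Zcmp's spimm is the 2-bit encoded field; the sources now pass around the decoded byte adjustment (0, 16, 32, or 48) that sits on top of the register-list base, which is exactly what the new OPERAND_STACKADJ check in the RISCVInstrInfo.cpp hunk that follows validates.]

#include <cassert>
#include <cstdint>

// Decoded extra stack adjustment from the 2-bit encoded spimm field.
constexpr uint32_t stackAdjFromSpimm(uint32_t Spimm) {
  assert(Spimm <= 3 && "spimm is a 2-bit field");
  return Spimm * 16; // 0, 16, 32, or 48 bytes
}

// Inverse mapping; mirrors the OPERAND_STACKADJ validity check
// (0 <= Imm <= 48 and Imm % 16 == 0).
constexpr uint32_t spimmFromStackAdj(uint32_t StackAdj) {
  assert(StackAdj <= 48 && StackAdj % 16 == 0 && "not a valid adjustment");
  return StackAdj / 16;
}

static_assert(stackAdjFromSpimm(3) == 48, "spimm=3 encodes +48 bytes");
static_assert(spimmFromStackAdj(16) == 1, "+16 bytes encodes as spimm=1");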
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 355bcb775cb35..44894365b6d41 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -2702,8 +2702,8 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
     case RISCVOp::OPERAND_RLIST_S0:
       Ok = Imm >= RISCVZC::RA_S0 && Imm <= RISCVZC::RA_S0_S11;
       break;
-    case RISCVOp::OPERAND_SPIMM:
-      Ok = (Imm & 0xf) == 0;
+    case RISCVOp::OPERAND_STACKADJ:
+      Ok = Imm >= 0 && Imm <= 48 && Imm % 16 == 0;
       break;
     case RISCVOp::OPERAND_FRMARG:
      Ok = RISCVFPRndMode::isValidRoundingMode(Imm);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index bcda5331d845f..f1734405fae63 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -46,16 +46,16 @@ def StackAdjAsmOperand : AsmOperandClass {
   let Name = "StackAdj";
   let ParserMethod = "parseZcmpStackAdj";
   let DiagnosticType = "InvalidStackAdj";
-  let PredicateMethod = "isSpimm";
-  let RenderMethod = "addSpimmOperands";
+  let PredicateMethod = "isStackAdj";
+  let RenderMethod = "addStackAdjOperands";
 }

 def NegStackAdjAsmOperand : AsmOperandClass {
   let Name = "NegStackAdj";
   let ParserMethod = "parseZcmpNegStackAdj";
   let DiagnosticType = "InvalidStackAdj";
-  let PredicateMethod = "isSpimm";
-  let RenderMethod = "addSpimmOperands";
+  let PredicateMethod = "isStackAdj";
+  let RenderMethod = "addStackAdjOperands";
 }

 def reglist : RISCVOp {
@@ -77,8 +77,7 @@ def reglist : RISCVOp {
 def stackadj : RISCVOp {
   let ParserMatchClass = StackAdjAsmOperand;
   let PrintMethod = "printStackAdj";
-  let DecoderMethod = "decodeZcmpSpimm";
-  let OperandType = "OPERAND_SPIMM";
+  let OperandType = "OPERAND_STACKADJ";
   let MCOperandPredicate = [{
     int64_t Imm;
     if (!MCOp.evaluateAsConstantImm(Imm))
@@ -90,8 +89,7 @@ def stackadj : RISCVOp {
 def negstackadj : RISCVOp {
   let ParserMatchClass = NegStackAdjAsmOperand;
   let PrintMethod = "printNegStackAdj";
-  let DecoderMethod = "decodeZcmpSpimm";
-  let OperandType = "OPERAND_SPIMM";
+  let OperandType = "OPERAND_STACKADJ";
   let MCOperandPredicate = [{
     int64_t Imm;
     if (!MCOp.evaluateAsConstantImm(Imm))

From 03604a784011bec2292f900b118d825f34f8cf89 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Fri, 4 Apr 2025 12:53:10 -0700
Subject: [PATCH 0693/1029] [lldb] Make lldbassert fire only once per instance (#134343)

The `lldbassert` macro in LLDB behaves like a regular `assert` when
assertions are enabled, and otherwise prints a pretty backtrace and
prompts the user to file a bug. By default, this is emitted as a
diagnostic event, but vendors can provide their own behavior, for
example pre-populating a bug report.

Recently, we ran into an issue where an `lldbassert` (in the Swift
language plugin) would fire excessively, to the point that it was
interfering with the usability of the debugger. Once an `lldbassert`
has fired, there's no point in bothering the user over and over again
for the same problem.

This PR solves the problem by introducing a static `std::once_flag` in
the macro. This way, every `lldbassert` will fire at most once per lldb
instance.
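A minimal standalone sketch of the pattern this commit adopts (the
`SOFT_ASSERT` and `ReportOnce` names are invented for illustration, not
LLDB's API): because a function-local `static std::once_flag` lives inside
each macro expansion, every textual call site gets its own flag, and
`std::call_once` guarantees the report runs at most once per process even
if the same site fails repeatedly.

```cpp
#include <cstdio>
#include <mutex>

// Reports the failed expression once per flag; later calls are no-ops.
static void ReportOnce(const char *ExprText, const char *File, unsigned Line,
                       std::once_flag &Flag) {
  std::call_once(Flag, [&] {
    std::fprintf(stderr, "soft assertion failed: (%s) at %s:%u\n", ExprText,
                 File, Line);
  });
}

// Each expansion declares its own static flag, so call sites fire
// independently of one another, but each at most once.
#define SOFT_ASSERT(x)                                                         \
  do {                                                                         \
    static std::once_flag Flag;                                                \
    if (!static_cast<bool>(x))                                                 \
      ReportOnce(#x, __FILE__, __LINE__, Flag);                                \
  } while (0)

int main() {
  for (int I = 0; I < 3; ++I)
    SOFT_ASSERT(I > 100); // prints a single report despite three failures
}
```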
rdar://148520448
---
 lldb/include/lldb/Utility/LLDBAssert.h | 11 +++++---
 lldb/source/Utility/LLDBAssert.cpp     | 37 +++++++++++++++-----------
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/lldb/include/lldb/Utility/LLDBAssert.h b/lldb/include/lldb/Utility/LLDBAssert.h
index 21dbdb3b3202d..cee30b81402ca 100644
--- a/lldb/include/lldb/Utility/LLDBAssert.h
+++ b/lldb/include/lldb/Utility/LLDBAssert.h
@@ -10,6 +10,7 @@
 #define LLDB_UTILITY_LLDBASSERT_H

 #include "llvm/ADT/StringRef.h"
+#include <mutex>

 #ifndef NDEBUG
 #define lldbassert(x) assert(x)
@@ -19,8 +20,11 @@
 // __FILE__ but only renders the last path component (the filename) instead of
 // an invocation dependent full path to that file.
 #define lldbassert(x)                                                          \
-  lldb_private::_lldb_assert(static_cast<bool>(x), #x, __FUNCTION__,           \
-                             __FILE_NAME__, __LINE__)
+  do {                                                                         \
+    static std::once_flag _once_flag;                                          \
+    lldb_private::_lldb_assert(static_cast<bool>(x), #x, __FUNCTION__,         \
+                               __FILE_NAME__, __LINE__, _once_flag);           \
+  } while (0)
 #else
 #define lldbassert(x)                                                          \
   lldb_private::_lldb_assert(static_cast<bool>(x), #x, __FUNCTION__, __FILE__, \
@@ -33,7 +37,8 @@
 /// Don't use _lldb_assert directly. Use the lldbassert macro instead so that
 /// LLDB asserts become regular asserts in NDEBUG builds.
 void _lldb_assert(bool expression, const char *expr_text, const char *func,
-                  const char *file, unsigned int line);
+                  const char *file, unsigned int line,
+                  std::once_flag &once_flag);

 /// The default LLDB assert callback, which prints to stderr.
 typedef void (*LLDBAssertCallback)(llvm::StringRef message,
diff --git a/lldb/source/Utility/LLDBAssert.cpp b/lldb/source/Utility/LLDBAssert.cpp
index b84c581ccf822..611ad43cd071b 100644
--- a/lldb/source/Utility/LLDBAssert.cpp
+++ b/lldb/source/Utility/LLDBAssert.cpp
@@ -11,6 +11,7 @@
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
+#include <mutex>

 #if LLVM_SUPPORT_XCODE_SIGNPOSTS
 #include <os/log.h>
@@ -33,29 +34,33 @@
 static std::atomic<LLDBAssertCallback> g_lldb_assert_callback =
     &DefaultAssertCallback;

 void _lldb_assert(bool expression, const char *expr_text, const char *func,
-                  const char *file, unsigned int line) {
+                  const char *file, unsigned int line,
+                  std::once_flag &once_flag) {
   if (LLVM_LIKELY(expression))
     return;

+  std::call_once(once_flag, [&]() {
 #if LLVM_SUPPORT_XCODE_SIGNPOSTS
-  if (__builtin_available(macos 10.12, iOS 10, tvOS 10, watchOS 3, *)) {
-    os_log_fault(OS_LOG_DEFAULT,
-                 "Assertion failed: (%s), function %s, file %s, line %u\n",
-                 expr_text, func, file, line);
-  }
+    if (__builtin_available(macos 10.12, iOS 10, tvOS 10, watchOS 3, *)) {
+      os_log_fault(OS_LOG_DEFAULT,
+                   "Assertion failed: (%s), function %s, file %s, line %u\n",
+                   expr_text, func, file, line);
+    }
 #endif

-  std::string buffer;
-  llvm::raw_string_ostream backtrace(buffer);
-  llvm::sys::PrintStackTrace(backtrace);
+    std::string buffer;
+    llvm::raw_string_ostream backtrace(buffer);
+    llvm::sys::PrintStackTrace(backtrace);

-  (*g_lldb_assert_callback.load())(
-      llvm::formatv("Assertion failed: ({0}), function {1}, file {2}, line {3}",
-                    expr_text, func, file, line)
-          .str(),
-      buffer,
-      "Please file a bug report against lldb and include the backtrace, the "
-      "version and as many details as possible.");
+    (*g_lldb_assert_callback.load())(
+        llvm::formatv(
+            "Assertion failed: ({0}), function {1}, file {2}, line {3}",
+            expr_text, func, file, line)
+            .str(),
+        buffer,
+        "Please file a bug report against lldb and include the backtrace, the "
+        "version and as many details as possible.");
+
}); } void SetLLDBAssertCallback(LLDBAssertCallback callback) { From 4c182df633bcd7fd7f0634b2cf6bcab91c3674ec Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Fri, 4 Apr 2025 12:53:46 -0700 Subject: [PATCH 0694/1029] [libc] Fix suseconds_t definition and utimes_test (#134326) The main issue was that the kernel expected `suseconds_t` to be 64 bits but ours was 32. This caused inconsistent failures since all valid `suseconds_t` values are less than 1000000 (1 million), and some configurations caused `struct timeval` to be padded to 128 bits. Also: forgot to use TEST_FILE instead of FILE_PATH in some places. --- libc/include/llvm-libc-types/suseconds_t.h | 7 ++++++- libc/test/src/sys/time/CMakeLists.txt | 2 ++ libc/test/src/sys/time/utimes_test.cpp | 22 ++++++++++++++-------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/libc/include/llvm-libc-types/suseconds_t.h b/libc/include/llvm-libc-types/suseconds_t.h index 32ecc9f537d00..8e926e8401f5c 100644 --- a/libc/include/llvm-libc-types/suseconds_t.h +++ b/libc/include/llvm-libc-types/suseconds_t.h @@ -9,6 +9,11 @@ #ifndef LLVM_LIBC_TYPES_SUSECONDS_T_H #define LLVM_LIBC_TYPES_SUSECONDS_T_H -typedef __INT32_TYPE__ suseconds_t; +// Per posix: suseconds_t shall be a signed integer type capable of storing +// values at least in the range [-1, 1000000]. [...] the widths of [other +// types...] and suseconds_t are no greater than the width of type long. + +// The kernel expects 64 bit suseconds_t at least on x86_64. +typedef long suseconds_t; #endif // LLVM_LIBC_TYPES_SUSECONDS_T_H diff --git a/libc/test/src/sys/time/CMakeLists.txt b/libc/test/src/sys/time/CMakeLists.txt index c092d33e43d85..72a65eec00937 100644 --- a/libc/test/src/sys/time/CMakeLists.txt +++ b/libc/test/src/sys/time/CMakeLists.txt @@ -8,6 +8,7 @@ add_libc_unittest( utimes_test.cpp DEPENDS libc.hdr.fcntl_macros + libc.hdr.sys_stat_macros libc.src.errno.errno libc.src.fcntl.open libc.src.sys.time.utimes @@ -16,4 +17,5 @@ add_libc_unittest( libc.src.unistd.write libc.src.stdio.remove libc.src.sys.stat.stat + libc.test.UnitTest.ErrnoCheckingTest ) diff --git a/libc/test/src/sys/time/utimes_test.cpp b/libc/test/src/sys/time/utimes_test.cpp index 69607ba928e1e..66e69a1b2a700 100644 --- a/libc/test/src/sys/time/utimes_test.cpp +++ b/libc/test/src/sys/time/utimes_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "hdr/fcntl_macros.h" +#include "hdr/sys_stat_macros.h" #include "hdr/types/struct_timeval.h" #include "src/errno/libc_errno.h" #include "src/fcntl/open.h" @@ -14,17 +15,22 @@ #include "src/sys/stat/stat.h" #include "src/sys/time/utimes.h" #include "src/unistd/close.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +using LlvmLibcUtimesTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + // SUCCESS: Takes a file and successfully updates // its last access and modified times. 
-TEST(LlvmLibcUtimesTest, ChangeTimesSpecific) { +TEST_F(LlvmLibcUtimesTest, ChangeTimesSpecific) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILE_PATH = "utimes_pass.test"; auto TEST_FILE = libc_make_test_file_path(FILE_PATH); - int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT); + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); ASSERT_GT(fd, 0); ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); @@ -36,11 +42,11 @@ TEST(LlvmLibcUtimesTest, ChangeTimesSpecific) { times[1].tv_usec = 23456; // ensure utimes succeeds - ASSERT_THAT(LIBC_NAMESPACE::utimes(FILE_PATH, times), Succeeds(0)); + ASSERT_THAT(LIBC_NAMESPACE::utimes(TEST_FILE, times), Succeeds(0)); // verify the times values against stat of the TEST_FILE struct stat statbuf; - ASSERT_EQ(LIBC_NAMESPACE::stat(FILE_PATH, &statbuf), 0); + ASSERT_EQ(LIBC_NAMESPACE::stat(TEST_FILE, &statbuf), 0); // seconds ASSERT_EQ(statbuf.st_atim.tv_sec, times[0].tv_sec); @@ -57,13 +63,13 @@ TEST(LlvmLibcUtimesTest, ChangeTimesSpecific) { // FAILURE: Invalid values in the timeval struct // to check that utimes rejects it. -TEST(LlvmLibcUtimesTest, InvalidMicroseconds) { +TEST_F(LlvmLibcUtimesTest, InvalidMicroseconds) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILE_PATH = "utimes_fail.test"; auto TEST_FILE = libc_make_test_file_path(FILE_PATH); - int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT); + int fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); ASSERT_GT(fd, 0); ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); @@ -76,7 +82,7 @@ TEST(LlvmLibcUtimesTest, InvalidMicroseconds) { times[1].tv_usec = 1000000; // invalid // ensure utimes fails - ASSERT_THAT(LIBC_NAMESPACE::utimes(FILE_PATH, times), Fails(EINVAL)); + ASSERT_THAT(LIBC_NAMESPACE::utimes(TEST_FILE, times), Fails(EINVAL)); // check for failure on // the other possible bad values @@ -87,6 +93,6 @@ TEST(LlvmLibcUtimesTest, InvalidMicroseconds) { times[1].tv_usec = 1000; // ensure utimes fails once more - ASSERT_THAT(LIBC_NAMESPACE::utimes(FILE_PATH, times), Fails(EINVAL)); + ASSERT_THAT(LIBC_NAMESPACE::utimes(TEST_FILE, times), Fails(EINVAL)); ASSERT_THAT(LIBC_NAMESPACE::remove(TEST_FILE), Succeeds(0)); } From b518242156f7c432249b1ca203a915b4f9906959 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Fri, 4 Apr 2025 15:56:30 -0400 Subject: [PATCH 0695/1029] [PowerPC] Fix instruction name for dmr insert (#134301) --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 4 +- llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td | 12 +-- llvm/lib/Target/PowerPC/PPCInstrMMA.td | 4 +- llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp | 2 +- .../test/CodeGen/PowerPC/dmf-outer-product.ll | 32 +++---- llvm/test/CodeGen/PowerPC/dmr-enable.ll | 24 +++--- .../test/CodeGen/PowerPC/mmaplus-acc-spill.ll | 8 +- .../CodeGen/PowerPC/mmaplus-intrinsics.ll | 84 +++++++++---------- llvm/test/CodeGen/PowerPC/v1024ls.ll | 8 +- .../PowerPC/ppc-encoding-ISAFuture.txt | 8 +- .../PowerPC/ppc64le-encoding-ISAFuture.txt | 8 +- llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s | 24 +++--- 12 files changed, 109 insertions(+), 109 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 7f4ddae5db463..1f75425752a78 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -11814,11 +11814,11 @@ SDValue 
PPCTargetLowering::LowerDMFVectorLoad(SDValue Op, } SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains); - SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTFDMR512, dl, MVT::v512i1, Loads[0], + SDValue Lo(DAG.getMachineNode(PPC::DMXXINSTDMR512, dl, MVT::v512i1, Loads[0], Loads[1]), 0); SDValue LoSub = DAG.getTargetConstant(PPC::sub_wacc_lo, dl, MVT::i32); - SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTFDMR512_HI, dl, MVT::v512i1, + SDValue Hi(DAG.getMachineNode(PPC::DMXXINSTDMR512_HI, dl, MVT::v512i1, Loads[2], Loads[3]), 0); SDValue HiSub = DAG.getTargetConstant(PPC::sub_wacc_hi, dl, MVT::i32); diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td index d4f0e222b457c..15215f7dc5faa 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td @@ -174,15 +174,15 @@ let Predicates = [IsISAFuture] in { let P = 1; } - def DMXXINSTFDMR512 : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT), + def DMXXINSTDMR512 : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc:$AT), (ins vsrprc:$XAp, vsrprc:$XBp), - "dmxxinstfdmr512 $AT, $XAp, $XBp, 0", []> { + "dmxxinstdmr512 $AT, $XAp, $XBp, 0", []> { let P = 0; } - def DMXXINSTFDMR512_HI : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT), + def DMXXINSTDMR512_HI : XX3Form_AT3_XABp5_P1<60, 234, (outs wacc_hi:$AT), (ins vsrprc:$XAp, vsrprc:$XBp), - "dmxxinstfdmr512 $AT, $XAp, $XBp, 1", []> { + "dmxxinstdmr512 $AT, $XAp, $XBp, 1", []> { let P = 1; } @@ -190,9 +190,9 @@ let Predicates = [IsISAFuture] in { (ins dmrrowp:$AT, u2imm:$P), "dmxxextfdmr256 $XBp, $AT, $P", []>; - def DMXXINSTFDMR256 : XX2Form_AT3_XBp5_P2<60, 485, (outs dmrrowp:$AT), + def DMXXINSTDMR256 : XX2Form_AT3_XBp5_P2<60, 485, (outs dmrrowp:$AT), (ins vsrprc:$XBp, u2imm:$P), - "dmxxinstfdmr256 $AT, $XBp, $P", []>; + "dmxxinstdmr256 $AT, $XBp, $P", []>; def DMMR : XForm_ATB3<31, 6, 177, (outs dmr:$AT), (ins dmr:$AB), "dmmr $AT, $AB", diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index d94ad125d258d..23b951871d5f4 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -1089,9 +1089,9 @@ let Predicates = [MMA, IsNotISAFuture] in { let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (PPCAccBuild v4i32:$vs1, v4i32:$vs0, v4i32:$vs3, v4i32:$vs2)), - (DMXXINSTFDMR512 ConcatsMMA.VecsToVecPair0, ConcatsMMA.VecsToVecPair1)>; + (DMXXINSTDMR512 ConcatsMMA.VecsToVecPair0, ConcatsMMA.VecsToVecPair1)>; def : Pat<(v512i1 (int_ppc_mma_assemble_acc v16i8:$vs1, v16i8:$vs0, v16i8:$vs3, v16i8:$vs2)), - (DMXXINSTFDMR512 ConcatsMMA.VecsToVecPair0, ConcatsMMA.VecsToVecPair1)>; + (DMXXINSTDMR512 ConcatsMMA.VecsToVecPair0, ConcatsMMA.VecsToVecPair1)>; def : Pat<(v512i1 immAllZerosV), (XXSETACCZW)>; } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2177dba1e5762..e1d9db0e3daa7 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -1448,7 +1448,7 @@ void PPCRegisterInfo::lowerWACCRestore(MachineBasicBlock::iterator II, FrameIndex, IsLittleEndian ? 
0 : 32); // Kill VSRpReg0, VSRpReg1 (killedRegState::Killed) - BuildMI(MBB, II, DL, TII.get(PPC::DMXXINSTFDMR512), DestReg) + BuildMI(MBB, II, DL, TII.get(PPC::DMXXINSTDMR512), DestReg) .addReg(VSRpReg0, RegState::Kill) .addReg(VSRpReg1, RegState::Kill); diff --git a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll index cba52567c900d..ec43776509b44 100644 --- a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll @@ -51,10 +51,10 @@ define void @test_dmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvp vsp34, 0(r3) ; CHECK-NEXT: lxvp vsp36, 32(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r3) ; CHECK-NEXT: lxvp vsp36, 96(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: lxv v2, 16(r4) ; CHECK-NEXT: lxv vs0, 0(r5) ; CHECK-NEXT: lxv v3, 0(r4) @@ -71,10 +71,10 @@ define void @test_dmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) { ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxvp vsp34, 96(r3) ; CHECK-BE-NEXT: lxvp vsp36, 64(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r3) ; CHECK-BE-NEXT: lxvp vsp36, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: lxv v2, 0(r4) ; CHECK-BE-NEXT: lxv vs0, 0(r5) ; CHECK-BE-NEXT: lxv v3, 16(r4) @@ -102,10 +102,10 @@ define void @test_dmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvp vsp34, 0(r3) ; CHECK-NEXT: lxvp vsp36, 32(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r3) ; CHECK-NEXT: lxvp vsp36, 96(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: lxv v2, 16(r4) ; CHECK-NEXT: lxv vs0, 0(r5) ; CHECK-NEXT: lxv v3, 0(r4) @@ -122,10 +122,10 @@ define void @test_dmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) { ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxvp vsp34, 96(r3) ; CHECK-BE-NEXT: lxvp vsp36, 64(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r3) ; CHECK-BE-NEXT: lxvp vsp36, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: lxv v2, 0(r4) ; CHECK-BE-NEXT: lxv vs0, 0(r5) ; CHECK-BE-NEXT: lxv v3, 16(r4) @@ -153,10 +153,10 @@ define void @test_pmdmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvp vsp34, 0(r3) ; CHECK-NEXT: lxvp vsp36, 32(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r3) ; CHECK-NEXT: lxvp vsp36, 96(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: lxv v2, 16(r4) ; CHECK-NEXT: lxv vs0, 0(r5) ; CHECK-NEXT: lxv v3, 0(r4) @@ -173,10 +173,10 @@ define void @test_pmdmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) { ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxvp vsp34, 96(r3) 
; CHECK-BE-NEXT: lxvp vsp36, 64(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r3) ; CHECK-BE-NEXT: lxvp vsp36, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: lxv v2, 0(r4) ; CHECK-BE-NEXT: lxv vs0, 0(r5) ; CHECK-BE-NEXT: lxv v3, 16(r4) @@ -242,10 +242,10 @@ define dso_local void @test_pmdmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr % ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvp vsp34, 0(r3) ; CHECK-NEXT: lxvp vsp36, 32(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r3) ; CHECK-NEXT: lxvp vsp36, 96(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: lxv v2, 16(r4) ; CHECK-NEXT: lxv vs0, 0(r5) ; CHECK-NEXT: lxv v3, 0(r4) @@ -262,10 +262,10 @@ define dso_local void @test_pmdmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr % ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxvp vsp34, 96(r3) ; CHECK-BE-NEXT: lxvp vsp36, 64(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r3) ; CHECK-BE-NEXT: lxvp vsp36, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: lxv v2, 0(r4) ; CHECK-BE-NEXT: lxv vs0, 0(r5) ; CHECK-BE-NEXT: lxv v3, 16(r4) diff --git a/llvm/test/CodeGen/PowerPC/dmr-enable.ll b/llvm/test/CodeGen/PowerPC/dmr-enable.ll index 91b0e94cd2716..a6c99a751e2c5 100644 --- a/llvm/test/CodeGen/PowerPC/dmr-enable.ll +++ b/llvm/test/CodeGen/PowerPC/dmr-enable.ll @@ -39,10 +39,10 @@ define void @tdmmr(ptr nocapture readonly %vp1, ptr nocapture %resp) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvp vsp34, 0(r3) ; CHECK-NEXT: lxvp vsp36, 32(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r3) ; CHECK-NEXT: lxvp vsp36, 96(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmmr dmr0, dmr0 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxvp vsp34, 96(r4) @@ -56,10 +56,10 @@ define void @tdmmr(ptr nocapture readonly %vp1, ptr nocapture %resp) { ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxvp vsp34, 96(r3) ; CHECK-BE-NEXT: lxvp vsp36, 64(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r3) ; CHECK-BE-NEXT: lxvp vsp36, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: dmmr dmr0, dmr0 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 ; CHECK-BE-NEXT: stxvp vsp36, 96(r4) @@ -80,16 +80,16 @@ define void @tdmxor(ptr nocapture readonly %vp1, ptr %vp2, ptr nocapture %resp) ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvp vsp34, 0(r3) ; CHECK-NEXT: lxvp vsp36, 32(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r3) ; CHECK-NEXT: lxvp vsp36, 96(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: lxvp vsp34, 0(r4) ; 
CHECK-NEXT: lxvp vsp36, 32(r4) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi1, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r4) ; CHECK-NEXT: lxvp vsp36, 96(r4) -; CHECK-NEXT: dmxxinstfdmr512 wacc1, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc1, vsp36, vsp34, 0 ; CHECK-NEXT: dmxor dmr0, dmr1 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxvp vsp34, 96(r5) @@ -103,16 +103,16 @@ define void @tdmxor(ptr nocapture readonly %vp1, ptr %vp2, ptr nocapture %resp) ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxvp vsp34, 96(r3) ; CHECK-BE-NEXT: lxvp vsp36, 64(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r3) ; CHECK-BE-NEXT: lxvp vsp36, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: lxvp vsp34, 96(r4) ; CHECK-BE-NEXT: lxvp vsp36, 64(r4) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi1, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi1, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r4) ; CHECK-BE-NEXT: lxvp vsp36, 0(r4) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc1, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc1, vsp36, vsp34, 0 ; CHECK-BE-NEXT: dmxor dmr0, dmr1 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 ; CHECK-BE-NEXT: stxvp vsp36, 96(r5) diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll index c83ad73e00eda..c2c8a42c402a2 100644 --- a/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mmaplus-acc-spill.ll @@ -35,7 +35,7 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-NEXT: vmr v28, v2 ; CHECK-NEXT: std r30, 160(r1) # 8-byte Folded Spill ; CHECK-NEXT: ld r30, 272(r1) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 ; CHECK-NEXT: xvf16ger2pp wacc0, v2, v4 ; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 ; CHECK-NEXT: stxvp vsp36, 64(r1) @@ -43,7 +43,7 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-NEXT: bl foo@notoc ; CHECK-NEXT: lxvp vsp34, 64(r1) ; CHECK-NEXT: lxvp vsp36, 32(r1) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-NEXT: xvf16ger2pp wacc0, v28, v30 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxv v4, 48(r30) @@ -82,7 +82,7 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: vmr v28, v2 ; CHECK-BE-NEXT: std r30, 240(r1) # 8-byte Folded Spill ; CHECK-BE-NEXT: ld r30, 368(r1) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 ; CHECK-BE-NEXT: xvf16ger2pp wacc0, v2, v4 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 ; CHECK-BE-NEXT: stxvp vsp36, 112(r1) @@ -91,7 +91,7 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: nop ; CHECK-BE-NEXT: lxvp vsp34, 112(r1) ; CHECK-BE-NEXT: lxvp vsp36, 144(r1) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-BE-NEXT: xvf16ger2pp wacc0, v28, v30 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-BE-NEXT: stxv v5, 48(r30) diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll 
b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll index 31631d3f92d8f..41e702c94339d 100644 --- a/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mmaplus-intrinsics.ll @@ -29,7 +29,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-LABEL: ass_acc: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmr v3, v2 -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxv v4, 48(r3) ; CHECK-NEXT: stxv v5, 32(r3) @@ -40,7 +40,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-LABEL: ass_acc: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: vmr v3, v2 -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-BE-NEXT: stxv v5, 48(r3) ; CHECK-BE-NEXT: stxv v4, 32(r3) @@ -54,7 +54,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-O0-NEXT: # implicit-def: $vsrp17 ; CHECK-O0-NEXT: vmr v3, v4 ; CHECK-O0-NEXT: vmr v2, v4 -; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-O0-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 ; CHECK-O0-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r3) @@ -72,7 +72,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-O0-BE-NEXT: # implicit-def: $vsrp17 ; CHECK-O0-BE-NEXT: vmr v3, v4 ; CHECK-O0-BE-NEXT: vmr v2, v4 -; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-O0-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 ; CHECK-O0-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-BE-NEXT: xxlor vs0, v5, v5 ; CHECK-O0-BE-NEXT: stxv vs0, 48(r3) @@ -87,7 +87,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-AIX64-LABEL: ass_acc: ; CHECK-AIX64: # %bb.0: # %entry ; CHECK-AIX64-NEXT: vmr 3, 2 -; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 34, 34, 0 +; CHECK-AIX64-NEXT: dmxxinstdmr512 0, 34, 34, 0 ; CHECK-AIX64-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX64-NEXT: stxv 5, 48(3) ; CHECK-AIX64-NEXT: stxv 4, 32(3) @@ -98,7 +98,7 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-AIX32-LABEL: ass_acc: ; CHECK-AIX32: # %bb.0: # %entry ; CHECK-AIX32-NEXT: vmr 3, 2 -; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 34, 34, 0 +; CHECK-AIX32-NEXT: dmxxinstdmr512 0, 34, 34, 0 ; CHECK-AIX32-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX32-NEXT: stxv 5, 48(3) ; CHECK-AIX32-NEXT: stxv 4, 32(3) @@ -119,7 +119,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v5, 32(r3) ; CHECK-NEXT: lxv v2, 16(r3) ; CHECK-NEXT: lxv v4, 48(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxv v4, 48(r7) ; CHECK-NEXT: stxv v5, 32(r7) @@ -133,7 +133,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: lxv v5, 16(r3) ; CHECK-BE-NEXT: lxv v2, 32(r3) ; CHECK-BE-NEXT: lxv v4, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-BE-NEXT: stxv v5, 48(r7) ; CHECK-BE-NEXT: stxv v4, 32(r7) @@ -153,7 +153,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v3, vs0, vs0 ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v2, vs0, vs0 -; 
CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: stxv vs0, 48(r7) @@ -177,7 +177,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-BE-NEXT: xxlor v3, vs0, vs0 ; CHECK-O0-BE-NEXT: lxv vs0, 0(r3) ; CHECK-O0-BE-NEXT: xxlor v2, vs0, vs0 -; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-BE-NEXT: xxlor vs0, v5, v5 ; CHECK-O0-BE-NEXT: stxv vs0, 48(r7) @@ -195,7 +195,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX64-NEXT: lxv 5, 16(3) ; CHECK-AIX64-NEXT: lxv 2, 32(3) ; CHECK-AIX64-NEXT: lxv 4, 0(3) -; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 36, 34, 0 +; CHECK-AIX64-NEXT: dmxxinstdmr512 0, 36, 34, 0 ; CHECK-AIX64-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX64-NEXT: stxv 5, 48(5) ; CHECK-AIX64-NEXT: stxv 4, 32(5) @@ -209,7 +209,7 @@ define void @ld_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX32-NEXT: lxv 5, 16(3) ; CHECK-AIX32-NEXT: lxv 2, 32(3) ; CHECK-AIX32-NEXT: lxv 4, 0(3) -; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 36, 34, 0 +; CHECK-AIX32-NEXT: dmxxinstdmr512 0, 36, 34, 0 ; CHECK-AIX32-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX32-NEXT: stxv 5, 48(5) ; CHECK-AIX32-NEXT: stxv 4, 32(5) @@ -233,7 +233,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v1, 32(r3) ; CHECK-NEXT: lxv v4, 16(r3) ; CHECK-NEXT: lxv v0, 48(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxv v4, 48(r7) @@ -248,7 +248,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: lxv v1, 16(r3) ; CHECK-BE-NEXT: lxv v4, 32(r3) ; CHECK-BE-NEXT: lxv v0, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-BE-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-BE-NEXT: stxv v5, 48(r7) @@ -269,7 +269,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v5, vs0, vs0 ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v4, vs0, vs0 -; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 +; CHECK-O0-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-O0-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 @@ -294,7 +294,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-BE-NEXT: xxlor v5, vs0, vs0 ; CHECK-O0-BE-NEXT: lxv vs0, 0(r3) ; CHECK-O0-BE-NEXT: xxlor v4, vs0, vs0 -; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 +; CHECK-O0-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-BE-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-O0-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-BE-NEXT: xxlor vs0, v5, v5 @@ -313,7 +313,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX64-NEXT: lxv 1, 16(3) ; CHECK-AIX64-NEXT: lxv 4, 32(3) ; CHECK-AIX64-NEXT: lxv 0, 0(3) -; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 32, 36, 0 +; CHECK-AIX64-NEXT: 
dmxxinstdmr512 0, 32, 36, 0 ; CHECK-AIX64-NEXT: xvi4ger8pp 0, 2, 2 ; CHECK-AIX64-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX64-NEXT: stxv 5, 48(5) @@ -328,7 +328,7 @@ define void @ld_op_st_xxmtacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX32-NEXT: lxv 1, 16(3) ; CHECK-AIX32-NEXT: lxv 4, 32(3) ; CHECK-AIX32-NEXT: lxv 0, 0(3) -; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 32, 36, 0 +; CHECK-AIX32-NEXT: dmxxinstdmr512 0, 32, 36, 0 ; CHECK-AIX32-NEXT: xvi4ger8pp 0, 2, 2 ; CHECK-AIX32-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX32-NEXT: stxv 5, 48(5) @@ -354,7 +354,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v5, 32(r3) ; CHECK-NEXT: lxv v2, 16(r3) ; CHECK-NEXT: lxv v4, 48(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxv v4, 48(r3) ; CHECK-NEXT: stxv v5, 32(r3) @@ -372,7 +372,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: lxv v5, 16(r3) ; CHECK-BE-NEXT: lxv v2, 32(r3) ; CHECK-BE-NEXT: lxv v4, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-BE-NEXT: stxv v5, 48(r3) ; CHECK-BE-NEXT: stxv v4, 32(r3) @@ -396,7 +396,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v3, vs0, vs0 ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v2, vs0, vs0 -; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-NEXT: xxlor vs3, v4, v4 ; CHECK-O0-NEXT: stxv vs3, 48(r3) @@ -424,7 +424,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-BE-NEXT: xxlor v3, vs0, vs0 ; CHECK-O0-BE-NEXT: lxv vs0, 0(r3) ; CHECK-O0-BE-NEXT: xxlor v2, vs0, vs0 -; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-O0-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 ; CHECK-O0-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-BE-NEXT: xxlor vs3, v5, v5 ; CHECK-O0-BE-NEXT: stxv vs3, 48(r3) @@ -446,7 +446,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX64-NEXT: lxv 5, 16(3) ; CHECK-AIX64-NEXT: lxv 2, 32(3) ; CHECK-AIX64-NEXT: lxv 4, 0(3) -; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 36, 34, 0 +; CHECK-AIX64-NEXT: dmxxinstdmr512 0, 36, 34, 0 ; CHECK-AIX64-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX64-NEXT: stxv 5, 48(3) ; CHECK-AIX64-NEXT: stxv 4, 32(3) @@ -464,7 +464,7 @@ define void @ld_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX32-NEXT: lxv 5, 16(3) ; CHECK-AIX32-NEXT: lxv 2, 32(3) ; CHECK-AIX32-NEXT: lxv 4, 0(3) -; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 36, 34, 0 +; CHECK-AIX32-NEXT: dmxxinstdmr512 0, 36, 34, 0 ; CHECK-AIX32-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX32-NEXT: stxv 5, 48(3) ; CHECK-AIX32-NEXT: stxv 4, 32(3) @@ -493,7 +493,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-NEXT: lxv v1, 32(r3) ; CHECK-NEXT: lxv v4, 16(r3) ; CHECK-NEXT: lxv v0, 48(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxv v4, 48(r7) @@ -508,7 +508,7 @@ define void 
@ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: lxv v1, 16(r3) ; CHECK-BE-NEXT: lxv v4, 32(r3) ; CHECK-BE-NEXT: lxv v0, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-BE-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-BE-NEXT: stxv v5, 48(r7) @@ -529,7 +529,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-NEXT: xxlor v5, vs0, vs0 ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v4, vs0, vs0 -; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 +; CHECK-O0-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-O0-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 @@ -554,7 +554,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-O0-BE-NEXT: xxlor v5, vs0, vs0 ; CHECK-O0-BE-NEXT: lxv vs0, 0(r3) ; CHECK-O0-BE-NEXT: xxlor v4, vs0, vs0 -; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp32, 0 +; CHECK-O0-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp32, 0 ; CHECK-O0-BE-NEXT: xvi4ger8pp wacc0, v2, v2 ; CHECK-O0-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-O0-BE-NEXT: xxlor vs0, v5, v5 @@ -573,7 +573,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX64-NEXT: lxv 1, 16(3) ; CHECK-AIX64-NEXT: lxv 4, 32(3) ; CHECK-AIX64-NEXT: lxv 0, 0(3) -; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 32, 36, 0 +; CHECK-AIX64-NEXT: dmxxinstdmr512 0, 32, 36, 0 ; CHECK-AIX64-NEXT: xvi4ger8pp 0, 2, 2 ; CHECK-AIX64-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX64-NEXT: stxv 5, 48(5) @@ -588,7 +588,7 @@ define void @ld_op_st_xxmfacc(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-AIX32-NEXT: lxv 1, 16(3) ; CHECK-AIX32-NEXT: lxv 4, 32(3) ; CHECK-AIX32-NEXT: lxv 0, 0(3) -; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 32, 36, 0 +; CHECK-AIX32-NEXT: dmxxinstdmr512 0, 32, 36, 0 ; CHECK-AIX32-NEXT: xvi4ger8pp 0, 2, 2 ; CHECK-AIX32-NEXT: dmxxextfdmr512 34, 36, 0, 0 ; CHECK-AIX32-NEXT: stxv 5, 48(5) @@ -615,7 +615,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-NEXT: vmr v4, v3 ; CHECK-NEXT: vmr v3, v2 ; CHECK-NEXT: vmr v5, v2 -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 ; CHECK-NEXT: xvf64gerpp wacc0, vsp34, v2 ; CHECK-NEXT: xvf64gerpp wacc0, vsp36, v4 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 @@ -634,7 +634,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-BE-NEXT: vmr v4, v3 ; CHECK-BE-NEXT: vmr v3, v2 ; CHECK-BE-NEXT: vmr v5, v2 -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 ; CHECK-BE-NEXT: xvf64gerpp wacc0, vsp34, v2 ; CHECK-BE-NEXT: xvf64gerpp wacc0, vsp36, v4 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 @@ -662,7 +662,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-O0-NEXT: xxlor v7, vs0, vs0 ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v6, vs0, vs0 -; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp38, vsp40, 0 +; CHECK-O0-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp40, 0 ; CHECK-O0-NEXT: xxlor vs0, v5, v5 ; CHECK-O0-NEXT: xvf64gerpp wacc0, vsp32, vs0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 @@ -696,7 +696,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; 
CHECK-O0-BE-NEXT: xxlor v7, vs0, vs0 ; CHECK-O0-BE-NEXT: lxv vs0, 0(r3) ; CHECK-O0-BE-NEXT: xxlor v6, vs0, vs0 -; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp38, vsp40, 0 +; CHECK-O0-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp40, 0 ; CHECK-O0-BE-NEXT: xxlor vs0, v5, v5 ; CHECK-O0-BE-NEXT: xvf64gerpp wacc0, vsp32, vs0 ; CHECK-O0-BE-NEXT: xxlor vs0, v4, v4 @@ -721,7 +721,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-AIX64-NEXT: vmr 4, 3 ; CHECK-AIX64-NEXT: vmr 3, 2 ; CHECK-AIX64-NEXT: vmr 5, 2 -; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 38, 32, 0 +; CHECK-AIX64-NEXT: dmxxinstdmr512 0, 38, 32, 0 ; CHECK-AIX64-NEXT: xvf64gerpp 0, 34, 2 ; CHECK-AIX64-NEXT: xvf64gerpp 0, 36, 4 ; CHECK-AIX64-NEXT: dmxxextfdmr512 34, 36, 0, 0 @@ -740,7 +740,7 @@ define void @cmplx_xxmacc(ptr %ptr1, ptr %ptr2, <16 x i8> %vc1, <16 x i8> %vc2) ; CHECK-AIX32-NEXT: vmr 4, 3 ; CHECK-AIX32-NEXT: vmr 3, 2 ; CHECK-AIX32-NEXT: vmr 5, 2 -; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 38, 32, 0 +; CHECK-AIX32-NEXT: dmxxinstdmr512 0, 38, 32, 0 ; CHECK-AIX32-NEXT: xvf64gerpp 0, 34, 2 ; CHECK-AIX32-NEXT: xvf64gerpp 0, 36, 4 ; CHECK-AIX32-NEXT: dmxxextfdmr512 34, 36, 0, 0 @@ -1049,7 +1049,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-NEXT: lxv v1, 32(r3) ; CHECK-NEXT: lxv v4, 16(r3) ; CHECK-NEXT: lxv v0, 48(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-NEXT: plxvp vsp36, 8(r4), 0 ; CHECK-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 @@ -1065,7 +1065,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: lxv v1, 16(r3) ; CHECK-BE-NEXT: lxv v4, 32(r3) ; CHECK-BE-NEXT: lxv v0, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 ; CHECK-BE-NEXT: plxvp vsp36, 8(r4), 0 ; CHECK-BE-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 @@ -1088,7 +1088,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-O0-NEXT: xxlor v3, vs0, vs0 ; CHECK-O0-NEXT: lxv vs0, 48(r3) ; CHECK-O0-NEXT: xxlor v2, vs0, vs0 -; CHECK-O0-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp32, 0 +; CHECK-O0-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp32, 0 ; CHECK-O0-NEXT: plxvp vsp34, 8(r4), 0 ; CHECK-O0-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-NEXT: pmxvf64gernn wacc0, vsp34, vs0, 0, 0 @@ -1116,7 +1116,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-O0-BE-NEXT: xxlor v3, vs0, vs0 ; CHECK-O0-BE-NEXT: lxv vs0, 0(r3) ; CHECK-O0-BE-NEXT: xxlor v2, vs0, vs0 -; CHECK-O0-BE-NEXT: dmxxinstfdmr512 wacc0, vsp34, vsp32, 0 +; CHECK-O0-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp32, 0 ; CHECK-O0-BE-NEXT: plxvp vsp34, 8(r4), 0 ; CHECK-O0-BE-NEXT: xxlor vs0, v4, v4 ; CHECK-O0-BE-NEXT: pmxvf64gernn wacc0, vsp34, vs0, 0, 0 @@ -1137,7 +1137,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-AIX64-NEXT: lxv 1, 16(3) ; CHECK-AIX64-NEXT: lxv 4, 32(3) ; CHECK-AIX64-NEXT: lxv 0, 0(3) -; CHECK-AIX64-NEXT: dmxxinstfdmr512 0, 32, 36, 0 +; CHECK-AIX64-NEXT: dmxxinstdmr512 0, 32, 36, 0 ; CHECK-AIX64-NEXT: plxvp 36, 8(4), 0 ; CHECK-AIX64-NEXT: pmxvf64gernn 0, 36, 2, 0, 0 ; CHECK-AIX64-NEXT: dmxxextfdmr512 34, 36, 0, 0 @@ -1153,7 +1153,7 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-AIX32-NEXT: 
lxv 1, 16(3) ; CHECK-AIX32-NEXT: lxv 4, 32(3) ; CHECK-AIX32-NEXT: lxv 0, 0(3) -; CHECK-AIX32-NEXT: dmxxinstfdmr512 0, 32, 36, 0 +; CHECK-AIX32-NEXT: dmxxinstdmr512 0, 32, 36, 0 ; CHECK-AIX32-NEXT: plxvp 36, 8(4), 0 ; CHECK-AIX32-NEXT: pmxvf64gernn 0, 36, 2, 0, 0 ; CHECK-AIX32-NEXT: dmxxextfdmr512 34, 36, 0, 0 diff --git a/llvm/test/CodeGen/PowerPC/v1024ls.ll b/llvm/test/CodeGen/PowerPC/v1024ls.ll index 5ec22cbc832ba..a454241a33f3e 100644 --- a/llvm/test/CodeGen/PowerPC/v1024ls.ll +++ b/llvm/test/CodeGen/PowerPC/v1024ls.ll @@ -11,10 +11,10 @@ define void @v1024ls(ptr nocapture readonly %vqp, ptr nocapture %resp) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxvp vsp34, 0(r3) ; CHECK-NEXT: lxvp vsp36, 32(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-NEXT: lxvp vsp34, 64(r3) ; CHECK-NEXT: lxvp vsp36, 96(r3) -; CHECK-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 ; CHECK-NEXT: stxvp vsp34, 96(r4) ; CHECK-NEXT: stxvp vsp36, 64(r4) @@ -27,10 +27,10 @@ define void @v1024ls(ptr nocapture readonly %vqp, ptr nocapture %resp) { ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxvp vsp34, 96(r3) ; CHECK-BE-NEXT: lxvp vsp36, 64(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc_hi0, vsp36, vsp34, 1 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1 ; CHECK-BE-NEXT: lxvp vsp34, 32(r3) ; CHECK-BE-NEXT: lxvp vsp36, 0(r3) -; CHECK-BE-NEXT: dmxxinstfdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 ; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1 ; CHECK-BE-NEXT: stxvp vsp36, 96(r4) ; CHECK-BE-NEXT: stxvp vsp34, 64(r4) diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index 5fa01371188cd..06fdb3bd25641 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -19,16 +19,16 @@ #CHECK: dmxxextfdmr256 8, 3, 3 0xf1 0x81 0x4f 0x90 -#CHECK: dmxxinstfdmr512 1, 2, 34, 0 +#CHECK: dmxxinstdmr512 1, 2, 34, 0 0xf0 0x82 0x17 0x52 -#CHECK: dmxxinstfdmr512 1, 2, 34, 1 +#CHECK: dmxxinstdmr512 1, 2, 34, 1 0xf0 0x83 0x17 0x52 -#CHECK: dmxxinstfdmr256 3, 8, 0 +#CHECK: dmxxinstdmr256 3, 8, 0 0xf1 0x80 0x47 0x94 -#CHECK: dmxxinstfdmr256 3, 8, 3 +#CHECK: dmxxinstdmr256 3, 8, 3 0xf1 0x81 0x4f 0x94 #CHECK: dmsetdmrz 3 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index 72660f97c3757..c2e32a792d39d 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -13,16 +13,16 @@ #CHECK: dmxxextfdmr256 8, 3, 3 0x90 0x4f 0x81 0xf1 -#CHECK: dmxxinstfdmr512 1, 2, 34, 0 +#CHECK: dmxxinstdmr512 1, 2, 34, 0 0x52 0x17 0x82 0xf0 -#CHECK: dmxxinstfdmr512 1, 2, 34, 1 +#CHECK: dmxxinstdmr512 1, 2, 34, 1 0x52 0x17 0x83 0xf0 -#CHECK: dmxxinstfdmr256 3, 8, 0 +#CHECK: dmxxinstdmr256 3, 8, 0 0x94 0x47 0x80 0xf1 -#CHECK: dmxxinstfdmr256 3, 8, 3 +#CHECK: dmxxinstdmr256 3, 8, 3 0x94 0x4f 0x81 0xf1 #CHECK: dmsetdmrz 3 diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index f8d3f7741e52b..bd7cfc4dfebfb 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -21,21 +21,21 
@@ # CHECK-LE: dmxxextfdmr256 8, 3, 3 # encoding: [0x90,0x4f,0x81,0xf1] dmxxextfdmr256 8, 3, 3 -# CHECK-BE: dmxxinstfdmr512 1, 2, 34, 0 # encoding: [0xf0,0x82,0x17,0x52] -# CHECK-LE: dmxxinstfdmr512 1, 2, 34, 0 # encoding: [0x52,0x17,0x82,0xf0] - dmxxinstfdmr512 1, 2, 34, 0 +# CHECK-BE: dmxxinstdmr512 1, 2, 34, 0 # encoding: [0xf0,0x82,0x17,0x52] +# CHECK-LE: dmxxinstdmr512 1, 2, 34, 0 # encoding: [0x52,0x17,0x82,0xf0] + dmxxinstdmr512 1, 2, 34, 0 -# CHECK-BE: dmxxinstfdmr512 1, 2, 34, 1 # encoding: [0xf0,0x83,0x17,0x52] -# CHECK-LE: dmxxinstfdmr512 1, 2, 34, 1 # encoding: [0x52,0x17,0x83,0xf0] - dmxxinstfdmr512 1, 2, 34, 1 +# CHECK-BE: dmxxinstdmr512 1, 2, 34, 1 # encoding: [0xf0,0x83,0x17,0x52] +# CHECK-LE: dmxxinstdmr512 1, 2, 34, 1 # encoding: [0x52,0x17,0x83,0xf0] + dmxxinstdmr512 1, 2, 34, 1 -# CHECK-BE: dmxxinstfdmr256 3, 8, 0 # encoding: [0xf1,0x80,0x47,0x94] -# CHECK-LE: dmxxinstfdmr256 3, 8, 0 # encoding: [0x94,0x47,0x80,0xf1] - dmxxinstfdmr256 3, 8, 0 +# CHECK-BE: dmxxinstdmr256 3, 8, 0 # encoding: [0xf1,0x80,0x47,0x94] +# CHECK-LE: dmxxinstdmr256 3, 8, 0 # encoding: [0x94,0x47,0x80,0xf1] + dmxxinstdmr256 3, 8, 0 -# CHECK-BE: dmxxinstfdmr256 3, 8, 3 # encoding: [0xf1,0x81,0x4f,0x94] -# CHECK-LE: dmxxinstfdmr256 3, 8, 3 # encoding: [0x94,0x4f,0x81,0xf1] - dmxxinstfdmr256 3, 8, 3 +# CHECK-BE: dmxxinstdmr256 3, 8, 3 # encoding: [0xf1,0x81,0x4f,0x94] +# CHECK-LE: dmxxinstdmr256 3, 8, 3 # encoding: [0x94,0x4f,0x81,0xf1] + dmxxinstdmr256 3, 8, 3 # CHECK-BE: dmsetdmrz 3 # encoding: [0x7d,0x82,0x01,0x62] # CHECK-LE: dmsetdmrz 3 # encoding: [0x62,0x01,0x82,0x7d] From 3a859b11e3ca758043b88693fdf990d361a02ef1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 4 Apr 2025 21:14:36 +0100 Subject: [PATCH 0696/1029] [VPlan] Set and use debug location for VPScalarIVStepsRecipe. This adds missing debug location for VPscalarIVStepsRecipe. The location of the corresponding phi is used. --- .../Transforms/Vectorize/LoopVectorizationPlanner.h | 4 ++-- llvm/lib/Transforms/Vectorize/VPlan.h | 13 ++++++++----- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 2 +- llvm/test/Transforms/LoopVectorize/debugloc.ll | 6 +++--- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 9e0c55c7bcce2..12d615d9adbcc 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -263,10 +263,10 @@ class VPBuilder { VPScalarIVStepsRecipe * createScalarIVSteps(Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step, - VPValue *VF) { + VPValue *VF, DebugLoc DL) { return tryInsertInstruction(new VPScalarIVStepsRecipe( IV, Step, VF, InductionOpcode, - FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags())); + FPBinOp ? 
FPBinOp->getFastMathFlags() : FastMathFlags(), DL));
   }

   //===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ebdc09feeb06e..a98d0ecb9a33b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3076,25 +3076,28 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
 public:
   VPScalarIVStepsRecipe(VPValue *IV, VPValue *Step, VPValue *VF,
-                        Instruction::BinaryOps Opcode, FastMathFlags FMFs)
+                        Instruction::BinaryOps Opcode, FastMathFlags FMFs,
+                        DebugLoc DL)
       : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC,
-                            ArrayRef<VPValue *>({IV, Step, VF}), FMFs),
+                            ArrayRef<VPValue *>({IV, Step, VF}), FMFs, DL),
         InductionOpcode(Opcode) {}

   VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
-                        VPValue *Step, VPValue *VF)
+                        VPValue *Step, VPValue *VF, DebugLoc DL = {})
       : VPScalarIVStepsRecipe(
             IV, Step, VF, IndDesc.getInductionOpcode(),
             dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp())
                 ? IndDesc.getInductionBinOp()->getFastMathFlags()
-                : FastMathFlags()) {}
+                : FastMathFlags(),
+            DL) {}

   ~VPScalarIVStepsRecipe() override = default;

   VPScalarIVStepsRecipe *clone() override {
     return new VPScalarIVStepsRecipe(
         getOperand(0), getOperand(1), getOperand(2), InductionOpcode,
-        hasFastMathFlags() ? getFastMathFlags() : FastMathFlags());
+        hasFastMathFlags() ? getFastMathFlags() : FastMathFlags(),
+        getDebugLoc());
   }

   /// Return true if this VPScalarIVStepsRecipe corresponds to part 0. Note that
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a117d82e64ef7..7b5c6b6f6f76e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2029,6 +2029,7 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
 #endif

 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
+  State.setDebugLocFrom(getDebugLoc());
   // Fast-math-flags propagate from the original induction instruction.
IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); if (hasFastMathFlags()) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 098e35fbe5bbb..9815dfd31374b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -576,7 +576,7 @@ createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL); } return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step, - &Plan.getVF()); + &Plan.getVF(), DL); } static SmallVector collectUsersRecursively(VPValue *V) { diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll index 03c3dcf3ec39f..c31f438feae6e 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll @@ -175,9 +175,8 @@ define void @test_scalar_steps(ptr nocapture %a, ptr noalias %b, i64 %size) !dbg ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 -; CHECK-NOT: !dbg -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0, !dbg [[LOC8:!.+]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2, !dbg [[LOC8]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP6]] @@ -215,6 +214,7 @@ exit: ; CHECK: [[LOC5]] = !DILocation(line: 320 ; CHECK: [[LOC6]] = !DILocation(line: 430 ; CHECK: [[LOC7]] = !DILocation(line: 540 +; CHECK: [[LOC8]] = !DILocation(line: 650 declare void @llvm.dbg.declare(metadata, metadata, metadata) From 61af05fe82c6989351c08de8d9eac4dc51f4ef79 Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Fri, 4 Apr 2025 16:26:08 -0400 Subject: [PATCH 0697/1029] [flang] Add runtime and lowering implementation for extended intrinsic PUTENV (#134412) Implement extended intrinsic PUTENV, both function and subroutine forms. Add PUTENV documentation to flang/docs/Intrinsics.md. Add functional and semantic unit tests. 
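Before the diff, a small self-contained sketch of the "name=value"
classification rule this commit implements (hypothetical names; the real
runtime entry point is `RTNAME(PutEnv)` in flang-rt, shown in the patch
below): an input with no `=` is rejected with `EINVAL`, a trailing `=`
requests deletion of the variable, and anything else sets name to value.

```cpp
#include <algorithm>
#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <string>

// Returns 0 on success or an errno value, mirroring the STATUS convention
// of the PUTENV intrinsic; fills in the parsed name/value and delete flag.
static int ClassifyPutEnv(const char *Str, std::size_t Len, std::string &Name,
                          std::string &Value, bool &IsDelete) {
  const char *End = Str + Len;
  const char *Sep = std::find(Str, End, '='); // first '=' splits name/value
  if (Sep == End)
    return EINVAL;             // no separator: invalid input string
  Name.assign(Str, Sep);
  IsDelete = (Sep + 1 == End); // "name=" form deletes the variable
  if (!IsDelete)
    Value.assign(Sep + 1, End);
  return 0;
}

int main() {
  std::string Name, Value;
  bool IsDelete = false;
  const char Input[] = "my_var=my_value";
  if (ClassifyPutEnv(Input, sizeof(Input) - 1, Name, Value, IsDelete) == 0)
    std::printf("name=%s delete=%d value=%s\n", Name.c_str(),
                static_cast<int>(IsDelete), Value.c_str());
  return 0;
}
```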
---
 .../include/flang-rt/runtime/environment.h    |  7 ++
 flang-rt/lib/runtime/command.cpp              | 50 +++++++++++++++
 flang-rt/lib/runtime/environment.cpp          | 64 +++++++++++++++++++
 flang/docs/Intrinsics.md                      | 39 ++++++++++-
 .../flang/Optimizer/Builder/IntrinsicCall.h   |  2 +
 .../flang/Optimizer/Builder/Runtime/Command.h |  5 ++
 flang/include/flang/Runtime/command.h         |  4 ++
 flang/lib/Evaluate/intrinsics.cpp             | 12 +++-
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 37 +++++++++++
 .../lib/Optimizer/Builder/Runtime/Command.cpp | 14 ++++
 flang/test/Lower/Intrinsics/putenv-func.f90   | 24 +++++++
 flang/test/Lower/Intrinsics/putenv-sub.f90    | 54 ++++++++++++++
 flang/test/Semantics/putenv.f90               | 42 ++++++++++++
 13 files changed, 350 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/Lower/Intrinsics/putenv-func.f90
 create mode 100644 flang/test/Lower/Intrinsics/putenv-sub.f90
 create mode 100644 flang/test/Semantics/putenv.f90

diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
index ca6c2a7d44484..16258b3bbba9b 100644
--- a/flang-rt/include/flang-rt/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -45,6 +45,13 @@ struct ExecutionEnvironment {
   const char *GetEnv(
       const char *name, std::size_t name_length, const Terminator &terminator);

+  std::int32_t SetEnv(const char *name, std::size_t name_length,
+      const char *value, std::size_t value_length,
+      const Terminator &terminator);
+
+  std::int32_t UnsetEnv(
+      const char *name, std::size_t name_length, const Terminator &terminator);
+
   int argc{0};
   const char **argv{nullptr};
   char **envp{nullptr};
diff --git a/flang-rt/lib/runtime/command.cpp b/flang-rt/lib/runtime/command.cpp
index b69143bf458ba..a4e8e31ad0274 100644
--- a/flang-rt/lib/runtime/command.cpp
+++ b/flang-rt/lib/runtime/command.cpp
@@ -309,6 +309,55 @@ std::int32_t RTNAME(Hostnm)(
   return status;
 }

+std::int32_t RTNAME(PutEnv)(
+    const char *str, size_t str_length, const char *sourceFile, int line) {
+  Terminator terminator{sourceFile, line};
+
+  RUNTIME_CHECK(terminator, str && str_length);
+
+  // Note: don't trim the input string, because the user should be able
+  // to set the value to all spaces if necessary.
+
+  // While Fortran's putenv() extended intrinsic semantics loosely follow
+  // Linux C library putenv(), don't actually use putenv() on Linux, because
+  // it takes the passed string pointer and incorporates it into the
+  // environment without copy. To make this safe, one would have to copy
+  // the passed string into some allocated memory, but then there's no good
+  // way to deallocate it. Instead, use the implementation from
+  // ExecutionEnvironment, which does the right thing for both Windows and
+  // Linux.
+
+  std::int32_t status{0};
+
+  // Split the input string into name and value substrings. Note:
+  // if input string is in "name=value" form, then we set variable "name" with
+  // value "value". If the input string is in "name=" form, then we delete
+  // the variable "name".
+
+  const char *str_end = str + str_length;
+  const char *str_sep = std::find(str, str_end, '=');
+  if (str_sep == str_end) {
+    // No separator, invalid input string
+    status = EINVAL;
+  } else if ((str_sep + 1) == str_end) {
+    // "name=" form, which means we need to delete this variable
+    status = executionEnvironment.UnsetEnv(str, str_sep - str, terminator);
+  } else {
+    // Example: consider str "abc=defg", str_length = 8
+    //
+    // addr:      05 06 07 08 09 10 11 12 13
+    // str@addr:   a  b  c  =  d  e  f  g ??
+ // + // str = 5, str_end = 13, str_sep = 8, name length: str_sep - str = 3 + // value ptr: str_sep + 1 = 9, value length: 4 + // + status = executionEnvironment.SetEnv( + str, str_sep - str, str_sep + 1, str_end - str_sep - 1, terminator); + } + + return status; +} + std::int32_t RTNAME(Unlink)( const char *str, size_t strLength, const char *sourceFile, int line) { Terminator terminator{sourceFile, line}; @@ -324,4 +373,5 @@ std::int32_t RTNAME(Unlink)( return status; } + } // namespace Fortran::runtime diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index cf2c65dd4fac0..1d5304254ed0e 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -181,4 +181,68 @@ const char *ExecutionEnvironment::GetEnv( return std::getenv(cStyleName.get()); } + +std::int32_t ExecutionEnvironment::SetEnv(const char *name, + std::size_t name_length, const char *value, std::size_t value_length, + const Terminator &terminator) { + + RUNTIME_CHECK(terminator, name && name_length && value && value_length); + + OwningPtr cStyleName{ + SaveDefaultCharacter(name, name_length, terminator)}; + RUNTIME_CHECK(terminator, cStyleName); + + OwningPtr cStyleValue{ + SaveDefaultCharacter(value, value_length, terminator)}; + RUNTIME_CHECK(terminator, cStyleValue); + + std::int32_t status{0}; + +#ifdef _WIN32 + + status = _putenv_s(cStyleName.get(), cStyleValue.get()); + +#else + + constexpr int overwrite = 1; + status = setenv(cStyleName.get(), cStyleValue.get(), overwrite); + +#endif + + if (status != 0) { + status = errno; + } + + return status; +} + +std::int32_t ExecutionEnvironment::UnsetEnv( + const char *name, std::size_t name_length, const Terminator &terminator) { + + RUNTIME_CHECK(terminator, name && name_length); + + OwningPtr cStyleName{ + SaveDefaultCharacter(name, name_length, terminator)}; + RUNTIME_CHECK(terminator, cStyleName); + + std::int32_t status{0}; + +#ifdef _WIN32 + + // Passing empty string as value will unset the variable + status = _putenv_s(cStyleName.get(), ""); + +#else + + status = unsetenv(cStyleName.get()); + +#endif + + if (status != 0) { + status = errno; + } + + return status; +} + } // namespace Fortran::runtime diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index ecf6fbeabd654..0118f8eb7d913 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -1040,6 +1040,41 @@ PROGRAM example_hostnm END PROGRAM ``` +### Non-Standard Intrinsics: PUTENV + +#### Description +`PUTENV(STR [, STATUS])` sets or deletes environment variable. + +This intrinsic is provided in both subroutine and function forms; however, only +one form can be used in any given program unit. + +| ARGUMENT | INTENT | TYPE | KIND | Description | +|----------|--------|-------------|---------|---------------------------------| +| `STR` | `IN` | `CHARACTER` | default | String in the form "name=value" (see below) | +| `STATUS` | `OUT` | `INTEGER` | default | Optional. Returns 0 on success, C's `errno` on failure. | + +#### Usage and Info + +- **Standard:** extension +- **Class:** Subroutine, function +- **Syntax:** `CALL PUTENV(STR [, STATUS])`, `STATUS = PUTENV(STR)` + +The passed string can be in the form "name=value" to set environment variable "name" to value "value". It can also be of the form "name=" to delete environment variable "name". + +The environment variables set by PUTENV can be read by GET_ENVIRONMENT_VARIABLE. + +#### Example +```Fortran + integer :: status + + ! 
Set variable my_var to value my_value + putenv("my_var=my_value", status) + + ! Delete variable my_var + putenv("my_var=") + end +``` + ### Non-standard Intrinsics: RENAME `RENAME(OLD, NEW[, STATUS])` renames/moves a file on the filesystem. @@ -1094,7 +1129,7 @@ function form. ### Non-Standard Intrinsics: TIME #### Description -`TIME()` returns the current time of the system as a INTEGER(8). +`TIME()` returns the current time of the system as a INTEGER(8). #### Usage and Info @@ -1269,7 +1304,7 @@ by `ISIZE`. `COMPAR` function takes the addresses of element `A` and `B` and must return: - a negative value if `A` < `B` - zero if `A` == `B` -- a positive value otherwise. +- a positive value otherwise. #### Usage and Info diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 00b7b696eb4f9..68617d6e37d7c 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -382,6 +382,8 @@ struct IntrinsicLibrary { mlir::Value genPoppar(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genPresent(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genProduct(mlir::Type, llvm::ArrayRef); + fir::ExtendedValue genPutenv(std::optional, + llvm::ArrayRef); void genRandomInit(llvm::ArrayRef); void genRandomNumber(llvm::ArrayRef); void genRandomSeed(llvm::ArrayRef); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Command.h b/flang/include/flang/Optimizer/Builder/Runtime/Command.h index 5880a703ed92e..fe19f24d951fd 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Command.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Command.h @@ -68,6 +68,11 @@ mlir::Value genHostnm(fir::FirOpBuilder &builder, mlir::Location loc, void genPerror(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value string); +/// Generate a call to the runtime function which implements the PUTENV +/// intrinsic. +mlir::Value genPutEnv(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value str, mlir::Value strLength); + /// Generate a call to the Unlink runtime function which implements /// the UNLINK intrinsic. 
mlir::Value genUnlink(fir::FirOpBuilder &builder, mlir::Location loc, diff --git a/flang/include/flang/Runtime/command.h b/flang/include/flang/Runtime/command.h index 16854c981ca23..19b486094da17 100644 --- a/flang/include/flang/Runtime/command.h +++ b/flang/include/flang/Runtime/command.h @@ -64,11 +64,15 @@ std::int32_t RTNAME(GetCwd)( std::int32_t RTNAME(Hostnm)( const Descriptor &res, const char *sourceFile, int line); +std::int32_t RTNAME(PutEnv)( + const char *str, size_t str_length, const char *sourceFile, int line); + // Calls unlink() std::int32_t RTNAME(Unlink)( const char *path, size_t pathLength, const char *sourceFile, int line); } // extern "C" + } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_COMMAND_H_ diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 997a745466dea..709f2e6c85bb2 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -856,6 +856,8 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ DefaultInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"present", {{"a", Addressable, Rank::anyOrAssumedRank}}, DefaultLogical, Rank::scalar, IntrinsicClass::inquiryFunction}, + {"putenv", {{"str", DefaultChar, Rank::scalar}}, DefaultInt, Rank::scalar, + IntrinsicClass::transformationalFunction}, {"radix", {{"x", AnyIntOrReal, Rank::anyOrAssumedRank, Optionality::required, common::Intent::In, @@ -1639,6 +1641,12 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {}, Rank::elemental, IntrinsicClass::pureSubroutine}, {"perror", {{"string", DefaultChar, Rank::scalar}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"putenv", + {{"str", DefaultChar, Rank::scalar, Optionality::required, + common::Intent::In}, + {"status", DefaultInt, Rank::scalar, Optionality::optional, + common::Intent::Out}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"mvbits", {{"from", SameIntOrUnsigned}, {"frompos", AnyInt}, {"len", AnyInt}, {"to", SameIntOrUnsigned, Rank::elemental, Optionality::required, @@ -2874,8 +2882,8 @@ bool IntrinsicProcTable::Implementation::IsDualIntrinsic( // Collection for some intrinsics with function and subroutine form, // in order to pass the semantic check. 
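// Editor's note (illustration, not part of this patch): having "putenv" in
// the dual list below is what lets both spellings of the intrinsic pass
// semantic checking, e.g.
//   status = putenv("name=value")       ! function reference
//   call putenv("name=value", status)   ! subroutine call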
 static const std::string dualIntrinsic[]{{"chdir"}, {"etime"}, {"fseek"},
- {"ftell"}, {"getcwd"}, {"hostnm"}, {"rename"}, {"second"}, {"system"},
- {"unlink"}};
+ {"ftell"}, {"getcwd"}, {"hostnm"}, {"putenv"}, {"rename"}, {"second"},
+ {"system"}, {"unlink"}};
 return llvm::is_contained(dualIntrinsic, name);
 }

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 702a55a49c953..93c00b6b28140 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -793,6 +793,10 @@ static constexpr IntrinsicHandler handlers[]{
 {"dim", asValue},
 {"mask", asBox, handleDynamicOptional}}},
 /*isElemental=*/false},
+ {"putenv",
+ &I::genPutenv,
+ {{{"str", asAddr}, {"status", asAddr, handleDynamicOptional}}},
+ /*isElemental=*/false},
 {"random_init",
 &I::genRandomInit,
 {{{"repeatable", asValue}, {"image_distinct", asValue}}},
@@ -7329,6 +7333,39 @@ IntrinsicLibrary::genProduct(mlir::Type resultType,
 "PRODUCT", resultType, args);
 }

+// PUTENV
+fir::ExtendedValue
+IntrinsicLibrary::genPutenv(std::optional resultType,
+ llvm::ArrayRef args) {
+ assert((resultType.has_value() && args.size() == 1) ||
+ (!resultType.has_value() && args.size() >= 1 && args.size() <= 2));
+
+ mlir::Value str = fir::getBase(args[0]);
+ mlir::Value strLength = fir::getLen(args[0]);
+ mlir::Value statusValue =
+ fir::runtime::genPutEnv(builder, loc, str, strLength);
+
+ if (resultType.has_value()) {
+ // Function form, return status.
+ return builder.createConvert(loc, *resultType, statusValue);
+ }
+
+ // Subroutine form, store status and return none.
+ const fir::ExtendedValue &status = args[1];
+ if (!isStaticallyAbsent(status)) {
+ mlir::Value statusAddr = fir::getBase(status);
+ mlir::Value statusIsPresentAtRuntime =
+ builder.genIsNotNullAddr(loc, statusAddr);
+ builder.genIfThen(loc, statusIsPresentAtRuntime)
+ .genThen([&]() {
+ builder.createStoreWithConvert(loc, statusValue, statusAddr);
+ })
+ .end();
+ }
+
+ return {};
+}
+
 // RANDOM_INIT
 void IntrinsicLibrary::genRandomInit(llvm::ArrayRef args) {
 assert(args.size() == 2);
diff --git a/flang/lib/Optimizer/Builder/Runtime/Command.cpp b/flang/lib/Optimizer/Builder/Runtime/Command.cpp
index 27ea5961837e6..35aa529a9a727 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Command.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Command.cpp
@@ -126,6 +126,20 @@ void fir::runtime::genPerror(fir::FirOpBuilder &builder, mlir::Location loc,
 builder.create(loc, runtimeFunc, args);
 }

+mlir::Value fir::runtime::genPutEnv(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Value str,
+ mlir::Value strLength) {
+ mlir::func::FuncOp func =
+ fir::runtime::getRuntimeFunc(loc, builder);
+ auto runtimeFuncTy = func.getFunctionType();
+ mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+ mlir::Value sourceLine =
+ fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(1));
+ llvm::SmallVector args = fir::runtime::createArguments(
+ builder, loc, runtimeFuncTy, str, strLength, sourceFile, sourceLine);
+ return builder.create(loc, func, args).getResult(0);
+}
+
 mlir::Value fir::runtime::genUnlink(fir::FirOpBuilder &builder,
 mlir::Location loc, mlir::Value path,
 mlir::Value pathLength) {
diff --git a/flang/test/Lower/Intrinsics/putenv-func.f90 b/flang/test/Lower/Intrinsics/putenv-func.f90
new file mode 100644
index 0000000000000..9b28282a0b787
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/putenv-func.f90
@@ -0,0 +1,24 @@
+!RUN: %flang_fc1 -emit-hlfir
%s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPputenv_test +!CHECK-SAME: %[[dummyStr:.*]]: !fir.boxchar<1> {fir.bindc_name = "str"}) -> i32 { +integer function putenv_test(str) +CHARACTER(len=255) :: str + +!CHECK-DAG: %[[func_result:.*]] = fir.alloca i32 {bindc_name = "putenv_test", uniq_name = "_QFputenv_testEputenv_test"} +!CHECK-DAG: %[[func_result_decl:.*]]:{{.*}} = hlfir.declare %[[func_result]] {uniq_name = "_QFputenv_testEputenv_test"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK-DAG: %[[src_str_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref +!CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 +!CHECK-DAG: %[[str:.*]] = fir.convert {{.*}} (!fir.ref>) -> !fir.ref +!CHECK-DAG: %[[str_len:.*]] = fir.convert {{.*}} : (index) -> i64 +!CHECK-DAG: %[[src_str:.*]] = fir.convert %[[src_str_addr]] : (!fir.ref) -> !fir.ref +!CHECK-DAG: %[[line:.*]] = fir.convert %[[line_value]] : (i64) -> i32 +!CHECK: %[[putenv_result:.*]] = fir.call @_FortranAPutEnv(%[[str]], %[[str_len]], %[[src_str]], %[[line]]) +!CHECK-SAME: -> i32 + +! Check _FortranAPutEnv result code handling +!CHECK-DAG: hlfir.assign %[[putenv_result]] to %[[func_result_decl]]#0 : i32, !fir.ref +!CHECK-DAG: %[[load_result:.*]] = fir.load %[[func_result_decl]]#0 : !fir.ref +!CHECK: return %[[load_result]] : i32 +putenv_test = putenv(str) +end function putenv_test diff --git a/flang/test/Lower/Intrinsics/putenv-sub.f90 b/flang/test/Lower/Intrinsics/putenv-sub.f90 new file mode 100644 index 0000000000000..285dbc6fddb19 --- /dev/null +++ b/flang/test/Lower/Intrinsics/putenv-sub.f90 @@ -0,0 +1,54 @@ +!RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPstr_only +!CHECK-SAME: %[[dummyStr:.*]]: !fir.boxchar<1> {fir.bindc_name = "str"}) { +subroutine str_only(str) + CHARACTER(len=*) :: str + !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope + !CHECK-DAG: %[[unbox_str:.*]]:2 = fir.unboxchar %[[dummyStr]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] {uniq_name = "_QFstr_onlyEstr"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + !CHECK-DAG: %[[src_str_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref> + !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 + !CHECK-DAG: %[[str:.*]] = fir.convert %[[str_decl]]#1 : (!fir.ref>) -> !fir.ref + !CHECK-DAG: %[[str_len:.*]] = fir.convert %[[unbox_str]]#1 : (index) -> i64 + !CHECK-DAG: %[[src_str:.*]] = fir.convert %[[src_str_addr]] : (!fir.ref) -> !fir.ref + !CHECK-DAG: %[[line:.*]] = fir.convert %[[line_value]] : (i64) -> i32 + !CHECK: fir.call @_FortranAPutEnv(%[[str]], %[[str_len]], %[[src_str]], %[[line]]) + !CHECK-SAME: : (!fir.ref, i64, !fir.ref, i32) + !CHECK-SAME: -> i32 + call putenv(str) + !CHECK: return +end subroutine str_only + !CHECK: } + + !CHECK-LABEL: func.func @_QPall_arguments + !CHECK-SAME: %[[dummyStr:.*]]: !fir.boxchar<1> {fir.bindc_name = "str"} + !CHECK-SAME: %[[dummyStat:.*]]: !fir.ref {fir.bindc_name = "status"} + !CHECK-SAME: ) { +subroutine all_arguments(str, status) + CHARACTER(len=*) :: str + INTEGER :: status + !CHECK-DAG: %[[scope:.*]] = fir.dummy_scope : !fir.dscope + !CHECK-DAG: %[[unbox_str:.*]]:2 = fir.unboxchar %[[dummyStr]] : (!fir.boxchar<1>) -> (!fir.ref>, index) + !CHECK-DAG: %[[str_decl:.*]]:2 = hlfir.declare %[[unbox_str]]#0 typeparams %[[unbox_str]]#1 dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstr"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, 
!fir.ref>) + !CHECK-DAG: %[[status_decl:.*]]:2 = hlfir.declare %[[dummyStat]] dummy_scope %[[scope]] {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + !CHECK-DAG: %[[src_str_addr:.*]] = fir.address_of(@_{{.*}}) : !fir.ref> + !CHECK-DAG: %[[line_value:.*]] = arith.constant {{.*}} : i64 + !CHECK-DAG: %[[str:.*]] = fir.convert %[[str_decl]]#1 : (!fir.ref>) -> !fir.ref + !CHECK-DAG: %[[str_len:.*]] = fir.convert %[[unbox_str]]#1 : (index) -> i64 + !CHECK-DAG: %[[src_str:.*]] = fir.convert %[[src_str_addr]] : (!fir.ref) -> !fir.ref + !CHECK-DAG: %[[line:.*]] = fir.convert %[[line_value]] : (i64) -> i32 + !CHECK: %[[putenv_result:.*]] = fir.call @_FortranAPutEnv(%[[str]], %[[str_len]], %[[src_str]], %[[line]]) + !CHECK-SAME: : (!fir.ref, i64, !fir.ref, i32) + !CHECK-SAME: -> i32 + + !CHECK-DAG: %[[status_i64:.*]] = fir.convert %[[status_decl]]#0 : (!fir.ref) -> i64 + !CHECK-DAG: %[[c_null:.*]] = arith.constant 0 : i64 + !CHECK-DAG: %[[cmp_result:.*]] = arith.cmpi ne, %[[status_i64]], %[[c_null]] : i64 + !CHECK: fir.if %[[cmp_result]] { + !CHECK-NEXT: fir.store %[[putenv_result]] to %[[status_decl]]#0 : !fir.ref + !CHECK-NEXT: } + call putenv(str, status) + !CHECK: return +end subroutine all_arguments + !CHECK: } diff --git a/flang/test/Semantics/putenv.f90 b/flang/test/Semantics/putenv.f90 new file mode 100644 index 0000000000000..8ec98f01ec7a4 --- /dev/null +++ b/flang/test/Semantics/putenv.f90 @@ -0,0 +1,42 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic +! Tests for the putenv intrinsics. + +subroutine bad_kind_error(str, status) + CHARACTER(len=255) :: str + INTEGER(2) :: status + !ERROR: Actual argument for 'status=' has bad type or kind 'INTEGER(2)' + call putenv(str, status) +end subroutine bad_kind_error + +subroutine bad_args_error() + !ERROR: missing mandatory 'str=' argument + call putenv() +end subroutine bad_args_error + +subroutine bad_function(str) + CHARACTER(len=255) :: str + INTEGER :: status + call putenv(str, status) + !ERROR: Cannot call subroutine 'putenv' like a function + status = putenv(str) +end subroutine bad_function + +subroutine bad_sub(str) + CHARACTER(len=255) :: str + INTEGER :: status + status = putenv(str) + !ERROR: Cannot call function 'putenv' like a subroutine + call putenv(str, status) +end subroutine bad_sub + +subroutine good_subroutine(str, status) + CHARACTER(len=255) :: str + INTEGER :: status + call putenv(str, status) +end subroutine good_subroutine + +subroutine good_function(str, status) + CHARACTER(len=255) :: str + INTEGER :: status + status = putenv(str) +end subroutine good_function From d341b632a1a07362dfb51f916baeedcbc8945e0f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 4 Apr 2025 13:27:33 -0700 Subject: [PATCH 0698/1029] [RISCV] Remove unused function declaration. 
NFC --- llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index 716299ab896d1..b705d03b5d1fc 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -527,9 +527,6 @@ static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus decodeZcmpStackAdj(MCInst &Inst, uint32_t Imm, - uint64_t Address, const void *Decoder); - static DecodeStatus decodeCSSPushPopchk(MCInst &Inst, uint32_t Insn, uint64_t Address, const MCDisassembler *Decoder); From 12cf6d3b9e0b4983d609576b15dc37ceb7d1d8d7 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 4 Apr 2025 13:37:12 -0700 Subject: [PATCH 0699/1029] [lldb] Fix missing semicolon in lldbassert macro --- lldb/include/lldb/Utility/LLDBAssert.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/include/lldb/Utility/LLDBAssert.h b/lldb/include/lldb/Utility/LLDBAssert.h index cee30b81402ca..cc0146be25998 100644 --- a/lldb/include/lldb/Utility/LLDBAssert.h +++ b/lldb/include/lldb/Utility/LLDBAssert.h @@ -23,7 +23,7 @@ do { \ static std::once_flag _once_flag; \ lldb_private::_lldb_assert(static_cast(x), #x, __FUNCTION__, \ - __FILE_NAME__, __LINE__, _once_flag) \ + __FILE_NAME__, __LINE__, _once_flag); \ } while (0) #else #define lldbassert(x) \ From bbaf0877fa61ea5cadc6ded794c3184081790c66 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 4 Apr 2025 13:34:01 -0700 Subject: [PATCH 0700/1029] [RISCV] Assert on all invalid inputs to getStackAdjBase and printRegList. NFC --- llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp | 2 ++ llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 4 ++-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp index 7e199af98cb03..3b3460c308d7e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.cpp @@ -241,6 +241,8 @@ float RISCVLoadFPImm::getFPImm(unsigned Imm) { } void RISCVZC::printRegList(unsigned RlistEncode, raw_ostream &OS) { + assert(RlistEncode >= RLISTENCODE::RA && + RlistEncode <= RLISTENCODE::RA_S0_S11 && "Invalid Rlist"); OS << "{ra"; if (RlistEncode > RISCVZC::RA) { OS << ", s0"; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index d7af9d79c4cde..506c638c83a72 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -651,8 +651,8 @@ inline static unsigned encodeRegListNumRegs(unsigned NumRegs) { } inline static unsigned getStackAdjBase(unsigned RlistVal, bool IsRV64) { - assert(RlistVal != RLISTENCODE::INVALID_RLIST && - "{ra, s0-s10} is not supported, s11 must be included."); + assert(RlistVal >= RLISTENCODE::RA && RlistVal <= RLISTENCODE::RA_S0_S11 && + "Invalid Rlist"); unsigned NumRegs = (RlistVal - RLISTENCODE::RA) + 1; // s10 and s11 are saved together. 
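  // Editor's worked example (not part of this patch; figures from my reading
  // of the Zcmp spec): the base stack adjustment is the register save area
  // rounded up to a 16-byte multiple, e.g.
  //   {ra, s0-s2}  ->  4 registers -> 16 bytes on RV32,  32 bytes on RV64
  //   {ra, s0-s11} -> 13 registers -> 64 bytes on RV32, 112 bytes on RV64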
if (RlistVal == RLISTENCODE::RA_S0_S11) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 8a384020820ff..83ecf805489c1 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -225,6 +225,10 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, void RISCVInstPrinter::printRegList(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNo).getImm(); + + assert(Imm >= RISCVZC::RLISTENCODE::RA && + Imm <= RISCVZC::RLISTENCODE::RA_S0_S11 && "Invalid Rlist"); + O << "{"; printRegName(O, RISCV::X1); @@ -281,7 +285,6 @@ void RISCVInstPrinter::printStackAdj(const MCInst *MI, unsigned OpNo, bool IsRV64 = STI.hasFeature(RISCV::Feature64Bit); int64_t StackAdj = 0; auto RlistVal = MI->getOperand(0).getImm(); - assert(RlistVal != 16 && "Incorrect rlist."); auto Base = RISCVZC::getStackAdjBase(RlistVal, IsRV64); StackAdj = Imm + Base; assert((StackAdj >= Base && StackAdj <= Base + 48) && From 428fc2c8875eca42b4803fe100791270ec971e4d Mon Sep 17 00:00:00 2001 From: Finn Plummer Date: Fri, 4 Apr 2025 13:43:45 -0700 Subject: [PATCH 0701/1029] [NFC][HLSL][RootSignature] Make the Lexer adhere to naming conventions (#134136) - when developing the RootSignatureLexer library, we are creating new files so we should set the standard to adhere to the coding conventions for function naming - this was missed in the initial review but caught in the review of the parser pr [here](https://github.com/llvm/llvm-project/pull/133302#discussion_r2017632092) Co-authored-by: Finn Plummer --- .../include/clang/Lex/LexHLSLRootSignature.h | 12 ++++---- .../clang/Parse/ParseHLSLRootSignature.h | 2 +- clang/lib/Lex/LexHLSLRootSignature.cpp | 30 +++++++++---------- clang/lib/Parse/ParseHLSLRootSignature.cpp | 2 +- .../Lex/LexHLSLRootSignatureTest.cpp | 26 ++++++++-------- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/clang/include/clang/Lex/LexHLSLRootSignature.h b/clang/include/clang/Lex/LexHLSLRootSignature.h index 4dc80ff546aa0..9275e0d75840b 100644 --- a/clang/include/clang/Lex/LexHLSLRootSignature.h +++ b/clang/include/clang/Lex/LexHLSLRootSignature.h @@ -61,13 +61,13 @@ class RootSignatureLexer { : Buffer(Signature), SourceLoc(SourceLoc) {} /// Consumes and returns the next token. - RootSignatureToken ConsumeToken(); + RootSignatureToken consumeToken(); /// Returns the token that proceeds CurToken - RootSignatureToken PeekNextToken(); + RootSignatureToken peekNextToken(); - bool EndOfBuffer() { - AdvanceBuffer(Buffer.take_while(isspace).size()); + bool isEndOfBuffer() { + advanceBuffer(Buffer.take_while(isspace).size()); return Buffer.empty(); } @@ -82,11 +82,11 @@ class RootSignatureLexer { clang::SourceLocation SourceLoc; /// Consumes the buffer and returns the lexed token. - RootSignatureToken LexToken(); + RootSignatureToken lexToken(); /// Advance the buffer by the specified number of characters. /// Updates the SourceLocation appropriately. 
- void AdvanceBuffer(unsigned NumCharacters = 1) { + void advanceBuffer(unsigned NumCharacters = 1) { Buffer = Buffer.drop_front(NumCharacters); SourceLoc = SourceLoc.getLocWithOffset(NumCharacters); } diff --git a/clang/include/clang/Parse/ParseHLSLRootSignature.h b/clang/include/clang/Parse/ParseHLSLRootSignature.h index 18cc2c6692551..a8dd6b02501ae 100644 --- a/clang/include/clang/Parse/ParseHLSLRootSignature.h +++ b/clang/include/clang/Parse/ParseHLSLRootSignature.h @@ -70,7 +70,7 @@ class RootSignatureParser { bool parseDescriptorTableClause(); /// Invoke the Lexer to consume a token and update CurToken with the result - void consumeNextToken() { CurToken = Lexer.ConsumeToken(); } + void consumeNextToken() { CurToken = Lexer.consumeToken(); } /// Return true if the next token one of the expected kinds bool peekExpectedToken(RootSignatureToken::Kind Expected); diff --git a/clang/lib/Lex/LexHLSLRootSignature.cpp b/clang/lib/Lex/LexHLSLRootSignature.cpp index b065d9855ddac..41ee572cf094a 100644 --- a/clang/lib/Lex/LexHLSLRootSignature.cpp +++ b/clang/lib/Lex/LexHLSLRootSignature.cpp @@ -15,16 +15,16 @@ using TokenKind = RootSignatureToken::Kind; // Lexer Definitions -static bool IsNumberChar(char C) { +static bool isNumberChar(char C) { // TODO(#126565): extend for float support exponents return isdigit(C); // integer support } -RootSignatureToken RootSignatureLexer::LexToken() { +RootSignatureToken RootSignatureLexer::lexToken() { // Discard any leading whitespace - AdvanceBuffer(Buffer.take_while(isspace).size()); + advanceBuffer(Buffer.take_while(isspace).size()); - if (EndOfBuffer()) + if (isEndOfBuffer()) return RootSignatureToken(TokenKind::end_of_stream, SourceLoc); // Record where this token is in the text for usage in parser diagnostics @@ -37,7 +37,7 @@ RootSignatureToken RootSignatureLexer::LexToken() { #define PUNCTUATOR(X, Y) \ case Y: { \ Result.TokKind = TokenKind::pu_##X; \ - AdvanceBuffer(); \ + advanceBuffer(); \ return Result; \ } #include "clang/Lex/HLSLRootSignatureTokenKinds.def" @@ -48,8 +48,8 @@ RootSignatureToken RootSignatureLexer::LexToken() { // Integer literal if (isdigit(C)) { Result.TokKind = TokenKind::int_literal; - Result.NumSpelling = Buffer.take_while(IsNumberChar); - AdvanceBuffer(Result.NumSpelling.size()); + Result.NumSpelling = Buffer.take_while(isNumberChar); + advanceBuffer(Result.NumSpelling.size()); return Result; } @@ -82,11 +82,11 @@ RootSignatureToken RootSignatureLexer::LexToken() { llvm_unreachable("Switch for an expected token was not provided"); } - AdvanceBuffer(); + advanceBuffer(); // Lex the integer literal - Result.NumSpelling = Buffer.take_while(IsNumberChar); - AdvanceBuffer(Result.NumSpelling.size()); + Result.NumSpelling = Buffer.take_while(isNumberChar); + advanceBuffer(Result.NumSpelling.size()); return Result; } @@ -103,26 +103,26 @@ RootSignatureToken RootSignatureLexer::LexToken() { // Then attempt to retreive a string from it Result.TokKind = Switch.Default(TokenKind::invalid); - AdvanceBuffer(TokSpelling.size()); + advanceBuffer(TokSpelling.size()); return Result; } -RootSignatureToken RootSignatureLexer::ConsumeToken() { +RootSignatureToken RootSignatureLexer::consumeToken() { // If we previously peeked then just return the previous value over if (NextToken && NextToken->TokKind != TokenKind::end_of_stream) { RootSignatureToken Result = *NextToken; NextToken = std::nullopt; return Result; } - return LexToken(); + return lexToken(); } -RootSignatureToken RootSignatureLexer::PeekNextToken() { +RootSignatureToken 
RootSignatureLexer::peekNextToken() { // Already peeked from the current token if (NextToken) return *NextToken; - NextToken = LexToken(); + NextToken = lexToken(); return *NextToken; } diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp index 93a9689ebdf72..3513ef454f750 100644 --- a/clang/lib/Parse/ParseHLSLRootSignature.cpp +++ b/clang/lib/Parse/ParseHLSLRootSignature.cpp @@ -125,7 +125,7 @@ bool RootSignatureParser::peekExpectedToken(TokenKind Expected) { } bool RootSignatureParser::peekExpectedToken(ArrayRef AnyExpected) { - RootSignatureToken Result = Lexer.PeekNextToken(); + RootSignatureToken Result = Lexer.peekNextToken(); return llvm::is_contained(AnyExpected, Result.TokKind); } diff --git a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp index 36bd201df1287..46f00450adb62 100644 --- a/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp +++ b/clang/unittests/Lex/LexHLSLRootSignatureTest.cpp @@ -19,7 +19,7 @@ class LexHLSLRootSignatureTest : public ::testing::Test { protected: LexHLSLRootSignatureTest() {} - void CheckTokens(hlsl::RootSignatureLexer &Lexer, + void checkTokens(hlsl::RootSignatureLexer &Lexer, SmallVector &Computed, SmallVector &Expected) { for (unsigned I = 0, E = Expected.size(); I != E; ++I) { @@ -27,13 +27,13 @@ class LexHLSLRootSignatureTest : public ::testing::Test { if (Expected[I] == TokenKind::invalid || Expected[I] == TokenKind::end_of_stream) continue; - hlsl::RootSignatureToken Result = Lexer.ConsumeToken(); + hlsl::RootSignatureToken Result = Lexer.consumeToken(); ASSERT_EQ(Result.TokKind, Expected[I]); Computed.push_back(Result); } - hlsl::RootSignatureToken EndOfStream = Lexer.ConsumeToken(); + hlsl::RootSignatureToken EndOfStream = Lexer.consumeToken(); ASSERT_EQ(EndOfStream.TokKind, TokenKind::end_of_stream); - ASSERT_TRUE(Lexer.EndOfBuffer()); + ASSERT_TRUE(Lexer.isEndOfBuffer()); } }; @@ -55,7 +55,7 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexNumbersTest) { TokenKind::pu_plus, TokenKind::int_literal, TokenKind::pu_plus, TokenKind::int_literal, }; - CheckTokens(Lexer, Tokens, Expected); + checkTokens(Lexer, Tokens, Expected); // Sample negative: int component hlsl::RootSignatureToken IntToken = Tokens[1]; @@ -119,7 +119,7 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexAllTokensTest) { #include "clang/Lex/HLSLRootSignatureTokenKinds.def" }; - CheckTokens(Lexer, Tokens, Expected); + checkTokens(Lexer, Tokens, Expected); } TEST_F(LexHLSLRootSignatureTest, ValidCaseInsensitiveKeywordsTest) { @@ -149,7 +149,7 @@ TEST_F(LexHLSLRootSignatureTest, ValidCaseInsensitiveKeywordsTest) { TokenKind::kw_offset, }; - CheckTokens(Lexer, Tokens, Expected); + checkTokens(Lexer, Tokens, Expected); } TEST_F(LexHLSLRootSignatureTest, ValidLexPeekTest) { @@ -161,26 +161,26 @@ TEST_F(LexHLSLRootSignatureTest, ValidLexPeekTest) { hlsl::RootSignatureLexer Lexer(Source, TokLoc); // Test basic peek - hlsl::RootSignatureToken Res = Lexer.PeekNextToken(); + hlsl::RootSignatureToken Res = Lexer.peekNextToken(); ASSERT_EQ(Res.TokKind, TokenKind::pu_r_paren); // Ensure it doesn't peek past one element - Res = Lexer.PeekNextToken(); + Res = Lexer.peekNextToken(); ASSERT_EQ(Res.TokKind, TokenKind::pu_r_paren); - Res = Lexer.ConsumeToken(); + Res = Lexer.consumeToken(); ASSERT_EQ(Res.TokKind, TokenKind::pu_r_paren); // Invoke after reseting the NextToken - Res = Lexer.PeekNextToken(); + Res = Lexer.peekNextToken(); ASSERT_EQ(Res.TokKind, TokenKind::int_literal); // Ensure we can still 
consume the second token - Res = Lexer.ConsumeToken(); + Res = Lexer.consumeToken(); ASSERT_EQ(Res.TokKind, TokenKind::int_literal); // Ensure end of stream token - Res = Lexer.PeekNextToken(); + Res = Lexer.peekNextToken(); ASSERT_EQ(Res.TokKind, TokenKind::end_of_stream); } From 24dfcc0c024f9ab8ba61c0994513f57e882961fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 4 Apr 2025 13:45:03 -0700 Subject: [PATCH 0702/1029] [flang][cuda] Use the nvvm.vote.sync op for all and any (#134433) NVVM operations are now available for all and any as well. Use the op and clean up the generation function to handle all the 3 vote sync kinds. --- .../flang/Optimizer/Builder/IntrinsicCall.h | 6 +- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 55 +++++-------------- flang/test/Lower/CUDA/cuda-device-proc.cuf | 4 +- 3 files changed, 19 insertions(+), 46 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 68617d6e37d7c..17052113859e1 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -19,6 +19,7 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include @@ -450,9 +451,8 @@ struct IntrinsicLibrary { llvm::ArrayRef args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef); - mlir::Value genVoteAllSync(mlir::Type, llvm::ArrayRef); - mlir::Value genVoteAnySync(mlir::Type, llvm::ArrayRef); - mlir::Value genVoteBallotSync(mlir::Type, llvm::ArrayRef); + template + mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. 
There may be an additional KIND arguments that diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 93c00b6b28140..0248586344ad9 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -48,7 +48,6 @@ #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" @@ -262,7 +261,7 @@ static constexpr IntrinsicHandler handlers[]{ {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, {"all_sync", - &I::genVoteAllSync, + &I::genVoteSync, {{{"mask", asValue}, {"pred", asValue}}}, /*isElemental=*/false}, {"allocated", @@ -275,7 +274,7 @@ static constexpr IntrinsicHandler handlers[]{ {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, {"any_sync", - &I::genVoteAnySync, + &I::genVoteSync, {{{"mask", asValue}, {"pred", asValue}}}, /*isElemental=*/false}, {"asind", &I::genAsind}, @@ -341,7 +340,7 @@ static constexpr IntrinsicHandler handlers[]{ {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, {"ballot_sync", - &I::genVoteBallotSync, + &I::genVoteSync, {{{"mask", asValue}, {"pred", asValue}}}, /*isElemental=*/false}, {"bessel_jn", @@ -6583,46 +6582,20 @@ IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, return value; } -static mlir::Value genVoteSync(fir::FirOpBuilder &builder, mlir::Location loc, - llvm::StringRef funcName, mlir::Type resTy, - llvm::ArrayRef args) { - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32Ty = builder.getI32Type(); - mlir::Type i1Ty = builder.getI1Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {i32Ty, i1Ty}, {resTy}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - llvm::SmallVector filteredArgs; - return builder.create(loc, funcOp, args).getResult(0); -} - -// ALL_SYNC -mlir::Value IntrinsicLibrary::genVoteAllSync(mlir::Type resultType, - llvm::ArrayRef args) { - assert(args.size() == 2); - return genVoteSync(builder, loc, "llvm.nvvm.vote.all.sync", - builder.getI1Type(), args); -} - -// ANY_SYNC -mlir::Value IntrinsicLibrary::genVoteAnySync(mlir::Type resultType, - llvm::ArrayRef args) { - assert(args.size() == 2); - return genVoteSync(builder, loc, "llvm.nvvm.vote.any.sync", - builder.getI1Type(), args); -} - -// BALLOT_SYNC -mlir::Value -IntrinsicLibrary::genVoteBallotSync(mlir::Type resultType, - llvm::ArrayRef args) { +// ALL_SYNC, ANY_SYNC, BALLOT_SYNC +template +mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, + llvm::ArrayRef args) { assert(args.size() == 2); mlir::Value arg1 = builder.create(loc, builder.getI1Type(), args[1]); - return builder - .create(loc, resultType, args[0], arg1, - mlir::NVVM::VoteSyncKind::ballot) - .getResult(); + mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot + ? 
builder.getI32Type() + : builder.getI1Type(); + auto voteRes = + builder.create(loc, resTy, args[0], arg1, kind) + .getResult(); + return builder.create(loc, resultType, voteRes); } // MATCH_ANY_SYNC diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 7d6d920dfb2e8..8f5e6dd36da4e 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -301,8 +301,8 @@ attributes(device) subroutine testVote() end subroutine ! CHECK-LABEL: func.func @_QPtestvote() -! CHECK: fir.call @llvm.nvvm.vote.all.sync -! CHECK: fir.call @llvm.nvvm.vote.any.sync +! CHECK: %{{.*}} = nvvm.vote.sync all %{{.*}}, %{{.*}} -> i1 +! CHECK: %{{.*}} = nvvm.vote.sync any %{{.*}}, %{{.*}} -> i1 ! CHECK: %{{.*}} = nvvm.vote.sync ballot %{{.*}}, %{{.*}} -> i32 ! CHECK-DAG: func.func private @__ldca_i4x4_(!fir.ref>, !fir.ref>) From 19e0233eb844e653a3108de411366bd0165cf3ec Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 4 Apr 2025 13:59:33 -0700 Subject: [PATCH 0703/1029] [RISCV] Make decodeXqccmpRlistS0 defer to decodeZcmpRlist after checking for S0 being included. NFC This reduces code duplication. --- llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b705d03b5d1fc..366291b53bebb 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -658,11 +658,9 @@ static DecodeStatus decodeZcmpRlist(MCInst &Inst, uint32_t Imm, static DecodeStatus decodeXqccmpRlistS0(MCInst &Inst, uint32_t Imm, uint64_t Address, const MCDisassembler *Decoder) { - bool IsRVE = Decoder->getSubtargetInfo().hasFeature(RISCV::FeatureStdExtE); - if (Imm < RISCVZC::RA_S0 || (IsRVE && Imm >= RISCVZC::RA_S0_S2)) + if (Imm < RISCVZC::RA_S0) return MCDisassembler::Fail; - Inst.addOperand(MCOperand::createImm(Imm)); - return MCDisassembler::Success; + return decodeZcmpRlist(Inst, Imm, Address, Decoder); } // Add implied SP operand for C.*SP compressed instructions. The SP operand From 18ff8df9583743f4e4ac2a74e55a28e35df958fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 4 Apr 2025 14:38:01 -0700 Subject: [PATCH 0704/1029] [flang][cuda] Register managed variables with double descriptor (#134444) Allocatable or pointer module variables with the CUDA managed attribute are defined with a double descriptor. One on the host and one on the device. Only the data pointed to by the descriptor will be allocated in managed memory. Allow the registration of any allocatable or pointer module variables like device or constant. 
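To make the "double descriptor" wording concrete, here is a hedged CUDA C++ sketch of the layout this commit relies on. The names (`Descriptor`, `hostDesc`, `devDesc`, `allocateManaged`) are illustrative stand-ins for the real flang-rt structures; the point is only that the two descriptors live in ordinary host and device memory while the payload alone is managed.

```cpp
// Illustrative host/device descriptor pair for a managed allocatable;
// all names here are assumptions, not flang-rt's actual types.
#include <cstddef>
#include <cuda_runtime.h>

struct Descriptor {     // stand-in for a Fortran descriptor ("box")
  void *baseAddr;       // where the payload lives
  std::size_t byteSize; // real descriptors also carry rank, type, bounds, ...
};

Descriptor hostDesc;           // host global, filled in by the runtime
__device__ Descriptor devDesc; // device mirror, registered by name

cudaError_t allocateManaged(std::size_t bytes) {
  // Only the data is placed in managed memory; both descriptors point at it.
  cudaError_t err = cudaMallocManaged(&hostDesc.baseAddr, bytes);
  if (err != cudaSuccess)
    return err;
  hostDesc.byteSize = bytes;
  // Keep the device-side descriptor in sync with the host-side one.
  return cudaMemcpyToSymbol(devDesc, &hostDesc, sizeof(Descriptor));
}
```

Registration (the `_FortranACUFRegisterVariable` call checked in the test below) is what associates the host global with its device mirror by name so the runtime can perform that synchronization.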
--- .../Optimizer/Transforms/CUFAddConstructor.cpp | 9 ++++++--- flang/test/Fir/CUDA/cuda-constructor-2.f90 | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp index ad39640235e91..064f0f363f699 100644 --- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp @@ -105,10 +105,15 @@ struct CUFAddConstructor if (!attr) continue; + if (attr.getValue() == cuf::DataAttribute::Managed && + !mlir::isa(globalOp.getType())) + TODO(loc, "registration of non-allocatable managed variables"); + mlir::func::FuncOp func; switch (attr.getValue()) { case cuf::DataAttribute::Device: - case cuf::DataAttribute::Constant: { + case cuf::DataAttribute::Constant: + case cuf::DataAttribute::Managed: { func = fir::runtime::getRuntimeFunc( loc, builder); auto fTy = func.getFunctionType(); @@ -141,8 +146,6 @@ struct CUFAddConstructor builder, loc, fTy, registeredMod, addr, gblName, sizeVal)}; builder.create(loc, func, args); } break; - case cuf::DataAttribute::Managed: - TODO(loc, "registration of managed variables"); default: break; } diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90 index 89fc99b736f4f..62118bb2eed2e 100644 --- a/flang/test/Fir/CUDA/cuda-constructor-2.f90 +++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90 @@ -60,3 +60,21 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, i } } } + +// ----- + +module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, "dlti.stack_alignment" = 128 : i64, "dlti.endianness" = "little">, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git 3372303188df0f7f8ac26e7ab610cf8b0f716d42)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + fir.global @_QMmEa00 {data_attr = #cuf.cuda} : !fir.box>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap> + %1 = fir.shape %c0, %c0, %c0, %c0, %c0 : (index, index, index, index, index) -> !fir.shape<5> + %2 = fir.embox %0(%1) {allocator_idx = 3 : i32} : (!fir.heap>, !fir.shape<5>) -> !fir.box>> + fir.has_value %2 : !fir.box>> + } + gpu.module @cuda_device_mod { + } +} + +// CHECK: llvm.func internal @__cudaFortranConstructor() +// CHECK: fir.address_of(@_QMmEa00) +// CHECK: fir.call @_FortranACUFRegisterVariable From 412f7fa31607489dc400321968a70e114463b374 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Fri, 4 Apr 2025 14:39:57 -0700 Subject: [PATCH 0705/1029] [clang] Bump DIAG_SIZE_PARSE as we're hitting the limit downstream as of 6263de90df7f58c8b98475024d5eef102e10a372. 
--- clang/include/clang/Basic/DiagnosticIDs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index f2bd19f9b6e8a..f936d4fb7a403 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -39,7 +39,7 @@ namespace clang { DIAG_SIZE_FRONTEND = 200, DIAG_SIZE_SERIALIZATION = 120, DIAG_SIZE_LEX = 500, - DIAG_SIZE_PARSE = 700, + DIAG_SIZE_PARSE = 800, DIAG_SIZE_AST = 300, DIAG_SIZE_COMMENT = 100, DIAG_SIZE_CROSSTU = 100, From ad39049ec48edcb2ad4024c80f1cadfc9f0e4cb0 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 4 Apr 2025 15:39:24 -0700 Subject: [PATCH 0706/1029] [DAGCombiner] Attempt to fold 'add' nodes to funnel-shift or rotate (#125612) Almost all of the rotate idioms that are valid for an 'or' are also valid when the halves are combined with an 'add'. Further, many of these cases are not handled by common bits tracking meaning that the 'add' is not converted to a 'disjoint or'. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 116 +++++++++--------- llvm/test/CodeGen/AMDGPU/rotate-add.ll | 30 +---- llvm/test/CodeGen/ARM/rotate-add.ll | 7 +- llvm/test/CodeGen/NVPTX/rotate-add.ll | 42 +++---- llvm/test/CodeGen/X86/rotate-add.ll | 55 ++------- 5 files changed, 92 insertions(+), 158 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index dc5c5f38e3bd8..8136f1794775e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -649,14 +649,15 @@ namespace { bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, - SDValue InnerPos, SDValue InnerNeg, bool HasPos, - unsigned PosOpcode, unsigned NegOpcode, - const SDLoc &DL); + SDValue InnerPos, SDValue InnerNeg, bool FromAdd, + bool HasPos, unsigned PosOpcode, + unsigned NegOpcode, const SDLoc &DL); SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg, - SDValue InnerPos, SDValue InnerNeg, bool HasPos, - unsigned PosOpcode, unsigned NegOpcode, - const SDLoc &DL); - SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); + SDValue InnerPos, SDValue InnerNeg, bool FromAdd, + bool HasPos, unsigned PosOpcode, + unsigned NegOpcode, const SDLoc &DL); + SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL, + bool FromAdd); SDValue MatchLoadCombine(SDNode *N); SDValue mergeTruncStores(StoreSDNode *N); SDValue reduceLoadWidth(SDNode *N); @@ -2986,6 +2987,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) { if (SDValue V = foldAddSubOfSignBit(N, DL, DAG)) return V; + if (SDValue V = MatchRotate(N0, N1, SDLoc(N), /*FromAdd=*/true)) + return V; + // Try to match AVGFLOOR fixedwidth pattern if (SDValue V = foldAddToAvg(N, DL)) return V; @@ -8175,7 +8179,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) { return V; // See if this is some rotate idiom. - if (SDValue Rot = MatchRotate(N0, N1, DL)) + if (SDValue Rot = MatchRotate(N0, N1, DL, /*FromAdd=*/false)) return Rot; if (SDValue Load = MatchLoadCombine(N)) @@ -8364,7 +8368,7 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift, // The IsRotate flag should be set when the LHS of both shifts is the same. // Otherwise if matching a general funnel shift, it should be clear. 
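// Editor's illustration (not from this patch): an 'add' can stand in for the
// 'or' in these idioms because when the two shift amounts cover the full bit
// width, the shifted halves occupy disjoint bits and the addition can never
// carry. For an i8 value x = 0b10110011 and y = 3:
//   (x << 3)       = 0b10011000
//   (x >> (8 - 3)) = 0b00000101
//   'or' and 'add' both give 0b10011101, which is rotl(x, 3).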
static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, - SelectionDAG &DAG, bool IsRotate) { + SelectionDAG &DAG, bool IsRotate, bool FromAdd) { const auto &TLI = DAG.getTargetLoweringInfo(); // If EltSize is a power of 2 then: // @@ -8403,7 +8407,7 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, // NOTE: We can only do this when matching operations which won't modify the // least Log2(EltSize) significant bits and not a general funnel shift. unsigned MaskLoBits = 0; - if (IsRotate && isPowerOf2_64(EltSize)) { + if (IsRotate && !FromAdd && isPowerOf2_64(EltSize)) { unsigned Bits = Log2_64(EltSize); unsigned NegBits = Neg.getScalarValueSizeInBits(); if (NegBits >= Bits) { @@ -8486,22 +8490,21 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, // Neg with outer conversions stripped away. SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, - SDValue InnerNeg, bool HasPos, - unsigned PosOpcode, unsigned NegOpcode, - const SDLoc &DL) { - // fold (or (shl x, (*ext y)), - // (srl x, (*ext (sub 32, y)))) -> + SDValue InnerNeg, bool FromAdd, + bool HasPos, unsigned PosOpcode, + unsigned NegOpcode, const SDLoc &DL) { + // fold (or/add (shl x, (*ext y)), + // (srl x, (*ext (sub 32, y)))) -> // (rotl x, y) or (rotr x, (sub 32, y)) // - // fold (or (shl x, (*ext (sub 32, y))), - // (srl x, (*ext y))) -> + // fold (or/add (shl x, (*ext (sub 32, y))), + // (srl x, (*ext y))) -> // (rotr x, y) or (rotl x, (sub 32, y)) EVT VT = Shifted.getValueType(); if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG, - /*IsRotate*/ true)) { + /*IsRotate*/ true, FromAdd)) return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, HasPos ? Pos : Neg); - } return SDValue(); } @@ -8514,30 +8517,30 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, // TODO: Merge with MatchRotatePosNeg. SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg, SDValue InnerPos, - SDValue InnerNeg, bool HasPos, - unsigned PosOpcode, unsigned NegOpcode, - const SDLoc &DL) { + SDValue InnerNeg, bool FromAdd, + bool HasPos, unsigned PosOpcode, + unsigned NegOpcode, const SDLoc &DL) { EVT VT = N0.getValueType(); unsigned EltBits = VT.getScalarSizeInBits(); - // fold (or (shl x0, (*ext y)), - // (srl x1, (*ext (sub 32, y)))) -> + // fold (or/add (shl x0, (*ext y)), + // (srl x1, (*ext (sub 32, y)))) -> // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y)) // - // fold (or (shl x0, (*ext (sub 32, y))), - // (srl x1, (*ext y))) -> + // fold (or/add (shl x0, (*ext (sub 32, y))), + // (srl x1, (*ext y))) -> // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y)) - if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) { + if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1, + FromAdd)) return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1, HasPos ? Pos : Neg); - } // Matching the shift+xor cases, we can't easily use the xor'd shift amount // so for now just use the PosOpcode case if its legal. // TODO: When can we use the NegOpcode case? 
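// Editor's note (illustrative, not from this patch): for a power-of-2 width
// BW, (xor y, BW-1) == (BW-1) - y whenever y is in [0, BW-1], so the extra
// pre-shift by 1 makes the total right shift 1 + (BW-1) - y == BW - y while
// never issuing an (undefined) shift by BW itself; e.g. with BW = 32 and
// y = 0, x1 is shifted right by 1 and then 31, leaving the zero contribution
// that fshl(x0, x1, 0) == x0 requires.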
if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) { SDValue X; - // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31))) + // fold (or/add (shl x0, y), (srl (srl x1, 1), (xor y, 31))) // -> (fshl x0, x1, y) if (sd_match(N1, m_Srl(m_Value(X), m_One())) && sd_match(InnerNeg, @@ -8546,7 +8549,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, return DAG.getNode(ISD::FSHL, DL, VT, N0, X, Pos); } - // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y)) + // fold (or/add (shl (shl x0, 1), (xor y, 31)), (srl x1, y)) // -> (fshr x0, x1, y) if (sd_match(N0, m_Shl(m_Value(X), m_One())) && sd_match(InnerPos, @@ -8555,7 +8558,7 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, return DAG.getNode(ISD::FSHR, DL, VT, X, N1, Neg); } - // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y)) + // fold (or/add (shl (add x0, x0), (xor y, 31)), (srl x1, y)) // -> (fshr x0, x1, y) // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization? if (sd_match(N0, m_Add(m_Value(X), m_Deferred(X))) && @@ -8569,11 +8572,12 @@ SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, return SDValue(); } -// MatchRotate - Handle an 'or' of two operands. If this is one of the many -// idioms for rotate, and if the target supports rotation instructions, generate -// a rot[lr]. This also matches funnel shift patterns, similar to rotation but -// with different shifted sources. -SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { +// MatchRotate - Handle an 'or' or 'add' of two operands. If this is one of the +// many idioms for rotate, and if the target supports rotation instructions, +// generate a rot[lr]. This also matches funnel shift patterns, similar to +// rotation but with different shifted sources. +SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL, + bool FromAdd) { EVT VT = LHS.getValueType(); // The target must have at least one rotate/funnel flavor. @@ -8600,9 +8604,9 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE && LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) { assert(LHS.getValueType() == RHS.getValueType()); - if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { + if (SDValue Rot = + MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL, FromAdd)) return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot); - } } // Match "(X shl/srl V1) & V2" where V2 may not be present. @@ -8736,10 +8740,10 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { return SDValue(); // Requires funnel shift support. 
} - // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) - // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) - // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) - // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) + // fold (or/add (shl x, C1), (srl x, C2)) -> (rotl x, C1) + // fold (or/add (shl x, C1), (srl x, C2)) -> (rotr x, C2) + // fold (or/add (shl x, C1), (srl y, C2)) -> (fshl x, y, C1) + // fold (or/add (shl x, C1), (srl y, C2)) -> (fshr x, y, C2) // iff C1+C2 == EltSizeInBits if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { SDValue Res; @@ -8782,29 +8786,25 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { } if (IsRotate && (HasROTL || HasROTR)) { - SDValue TryL = - MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, - RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL); - if (TryL) + if (SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, + LExtOp0, RExtOp0, FromAdd, HasROTL, + ISD::ROTL, ISD::ROTR, DL)) return TryL; - SDValue TryR = - MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, - LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL); - if (TryR) + if (SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, + RExtOp0, LExtOp0, FromAdd, HasROTR, + ISD::ROTR, ISD::ROTL, DL)) return TryR; } - SDValue TryL = - MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt, - LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL); - if (TryL) + if (SDValue TryL = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, + RHSShiftAmt, LExtOp0, RExtOp0, FromAdd, + HasFSHL, ISD::FSHL, ISD::FSHR, DL)) return TryL; - SDValue TryR = - MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt, - RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL); - if (TryR) + if (SDValue TryR = MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, + LHSShiftAmt, RExtOp0, LExtOp0, FromAdd, + HasFSHR, ISD::FSHR, ISD::FSHL, DL)) return TryR; return SDValue(); diff --git a/llvm/test/CodeGen/AMDGPU/rotate-add.ll b/llvm/test/CodeGen/AMDGPU/rotate-add.ll index faf89f41bdf86..53a49c9a21e2c 100644 --- a/llvm/test/CodeGen/AMDGPU/rotate-add.ll +++ b/llvm/test/CodeGen/AMDGPU/rotate-add.ll @@ -44,19 +44,15 @@ define i32 @test_rotl_var(i32 %x, i32 %y) { ; SI-LABEL: test_rotl_var: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 ; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 -; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_rotl_var: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, v1, v0 ; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 -; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] %shl = shl i32 %x, %y %sub = sub i32 32, %y @@ -69,19 +65,13 @@ define i32 @test_rotr_var(i32 %x, i32 %y) { ; SI-LABEL: test_rotr_var: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, v1, v0 -; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_rotr_var: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v2, v1, v0 -; 
VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] %shr = lshr i32 %x, %y %sub = sub i32 32, %y @@ -174,21 +164,13 @@ define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { ; SI-LABEL: test_fshr_special_case: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_xor_b32_e32 v2, 31, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_fshr_special_case: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: v_xor_b32_e32 v2, 31, v2 -; VI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] %shl = lshr i32 %x1, %y %srli = shl i32 %x0, 1 diff --git a/llvm/test/CodeGen/ARM/rotate-add.ll b/llvm/test/CodeGen/ARM/rotate-add.ll index 9325e8b062dda..fd3055e5e2725 100644 --- a/llvm/test/CodeGen/ARM/rotate-add.ll +++ b/llvm/test/CodeGen/ARM/rotate-add.ll @@ -29,9 +29,8 @@ define i32 @test_simple_rotr(i32 %x) { define i32 @test_rotl_var(i32 %x, i32 %y) { ; CHECK-LABEL: test_rotl_var: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsl r2, r0, r1 ; CHECK-NEXT: rsb r1, r1, #32 -; CHECK-NEXT: add r0, r2, r0, lsr r1 +; CHECK-NEXT: ror r0, r0, r1 ; CHECK-NEXT: bx lr %shl = shl i32 %x, %y %sub = sub i32 32, %y @@ -43,9 +42,7 @@ define i32 @test_rotl_var(i32 %x, i32 %y) { define i32 @test_rotr_var(i32 %x, i32 %y) { ; CHECK-LABEL: test_rotr_var: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsr r2, r0, r1 -; CHECK-NEXT: rsb r1, r1, #32 -; CHECK-NEXT: add r0, r2, r0, lsl r1 +; CHECK-NEXT: ror r0, r0, r1 ; CHECK-NEXT: bx lr %shr = lshr i32 %x, %y %sub = sub i32 32, %y diff --git a/llvm/test/CodeGen/NVPTX/rotate-add.ll b/llvm/test/CodeGen/NVPTX/rotate-add.ll index c79a95958eca2..820e8000a5657 100644 --- a/llvm/test/CodeGen/NVPTX/rotate-add.ll +++ b/llvm/test/CodeGen/NVPTX/rotate-add.ll @@ -39,16 +39,13 @@ define i32 @test_simple_rotr(i32 %x) { define i32 @test_rotl_var(i32 %x, i32 %y) { ; CHECK-LABEL: test_rotl_var( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_param_0]; ; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_param_1]; -; CHECK-NEXT: shl.b32 %r3, %r1, %r2; -; CHECK-NEXT: sub.s32 %r4, 32, %r2; -; CHECK-NEXT: shr.u32 %r5, %r1, %r4; -; CHECK-NEXT: add.s32 %r6, %r3, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %shl = shl i32 %x, %y %sub = sub i32 32, %y @@ -60,16 +57,13 @@ define i32 @test_rotl_var(i32 %x, i32 %y) { define i32 @test_rotr_var(i32 %x, i32 %y) { ; CHECK-LABEL: test_rotr_var( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_param_0]; ; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_param_1]; -; CHECK-NEXT: shr.u32 %r3, %r1, %r2; -; CHECK-NEXT: sub.s32 %r4, 32, %r2; -; CHECK-NEXT: shl.b32 %r5, %r1, %r4; -; CHECK-NEXT: add.s32 %r6, %r3, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; 
CHECK-NEXT: shf.r.wrap.b32 %r3, %r1, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %shr = lshr i32 %x, %y %sub = sub i32 32, %y @@ -127,18 +121,14 @@ define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) { define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { ; CHECK-LABEL: test_fshl_special_case( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_special_case_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_2]; -; CHECK-NEXT: shl.b32 %r3, %r1, %r2; -; CHECK-NEXT: ld.param.u32 %r4, [test_fshl_special_case_param_1]; -; CHECK-NEXT: shr.u32 %r5, %r4, 1; -; CHECK-NEXT: xor.b32 %r6, %r2, 31; -; CHECK-NEXT: shr.u32 %r7, %r5, %r6; -; CHECK-NEXT: add.s32 %r8, %r3, %r7; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_1]; +; CHECK-NEXT: ld.param.u32 %r3, [test_fshl_special_case_param_2]; +; CHECK-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %shl = shl i32 %x0, %y %srli = lshr i32 %x1, 1 @@ -151,18 +141,14 @@ define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { ; CHECK-LABEL: test_fshr_special_case( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_fshr_special_case_param_0]; ; CHECK-NEXT: ld.param.u32 %r2, [test_fshr_special_case_param_1]; ; CHECK-NEXT: ld.param.u32 %r3, [test_fshr_special_case_param_2]; -; CHECK-NEXT: shr.u32 %r4, %r2, %r3; -; CHECK-NEXT: shl.b32 %r5, %r1, 1; -; CHECK-NEXT: xor.b32 %r6, %r3, 31; -; CHECK-NEXT: shl.b32 %r7, %r5, %r6; -; CHECK-NEXT: add.s32 %r8, %r4, %r7; -; CHECK-NEXT: st.param.b32 [func_retval0], %r8; +; CHECK-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; %shl = lshr i32 %x1, %y %srli = shl i32 %x0, 1 diff --git a/llvm/test/CodeGen/X86/rotate-add.ll b/llvm/test/CodeGen/X86/rotate-add.ll index 6e19fc20abf04..c705505bbbf2a 100644 --- a/llvm/test/CodeGen/X86/rotate-add.ll +++ b/llvm/test/CodeGen/X86/rotate-add.ll @@ -43,22 +43,15 @@ define i32 @test_rotl_var(i32 %x, i32 %y) { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: negb %cl -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: addl %edx, %eax +; X86-NEXT: roll %cl, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_rotl_var: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll %cl, %eax -; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %edi -; X64-NEXT: addl %edi, %eax +; X64-NEXT: roll %cl, %eax ; X64-NEXT: retq %shl = shl i32 %x, %y %sub = sub i32 32, %y @@ -72,22 +65,15 @@ define i32 @test_rotr_var(i32 %x, i32 %y) { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: negb %cl -; X86-NEXT: shll %cl, %eax -; X86-NEXT: addl %edx, %eax +; X86-NEXT: rorl %cl, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_rotr_var: ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: shrl %cl, %eax -; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: 
shll %cl, %edi -; X64-NEXT: addl %edi, %eax +; X64-NEXT: rorl %cl, %eax ; X64-NEXT: retq %shr = lshr i32 %x, %y %sub = sub i32 32, %y @@ -159,27 +145,18 @@ define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) { define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { ; X86-LABEL: test_fshl_special_case: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: shrl %eax -; X86-NEXT: notb %cl -; X86-NEXT: shrl %cl, %eax -; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_fshl_special_case: ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx -; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: shll %cl, %edi -; X64-NEXT: shrl %esi -; X64-NEXT: notb %cl +; X64-NEXT: movl %edi, %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shrl %cl, %esi -; X64-NEXT: leal (%rsi,%rdi), %eax +; X64-NEXT: shldl %cl, %esi, %eax ; X64-NEXT: retq %shl = shl i32 %x0, %y %srli = lshr i32 %x1, 1 @@ -192,26 +169,18 @@ define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { ; X86-LABEL: test_fshr_special_case: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: shrl %cl, %edx -; X86-NEXT: addl %eax, %eax -; X86-NEXT: notb %cl -; X86-NEXT: shll %cl, %eax -; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shrdl %cl, %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: test_fshr_special_case: ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx -; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: shrl %cl, %esi -; X64-NEXT: leal (%rdi,%rdi), %eax -; X64-NEXT: notb %cl +; X64-NEXT: movl %esi, %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shll %cl, %eax -; X64-NEXT: addl %esi, %eax +; X64-NEXT: shrdl %cl, %edi, %eax ; X64-NEXT: retq %shl = lshr i32 %x1, %y %srli = shl i32 %x0, 1 From 64b060f129fc580cb3a9dce8b4456d496e6cdcd6 Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Fri, 4 Apr 2025 15:41:05 -0700 Subject: [PATCH 0707/1029] [mlir][tosa] Update URLs to TOSA specification (#134449) - The existing URLs are no longer valid, updated to the current one Signed-off-by: Jerry Ge --- mlir/docs/Dialects/TOSA.md | 2 +- mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td | 2 +- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 2 +- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/docs/Dialects/TOSA.md b/mlir/docs/Dialects/TOSA.md index 273a71aeed200..15a2b459f8b91 100644 --- a/mlir/docs/Dialects/TOSA.md +++ b/mlir/docs/Dialects/TOSA.md @@ -5,7 +5,7 @@ ## Rationale The MLIR TOSA dialect implements the [TOSA -specification](https://developer.mlplatform.org/w/tosa/). This document +specification](https://www.mlplatform.org/tosa/tosa_spec.html). This document describes the decision process for how TOSA expresses operators in high level dialects. 
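(An aside on the rotate-add DAGCombiner patch that closed above, before the TOSA diffs continue: the combine is sound because when C1 + C2 equals the element size in bits, the shl and srl results occupy disjoint bit ranges, so add can never produce a carry and behaves exactly like or. A minimal C++ sketch of the identity — hand-written for illustration, not code from the patch, and assuming 0 < n < 32 so both shift amounts stay in range:

    #include <cstdint>

    // (x << n) has zeros in its low n bits, and (x >> (32 - n)) fits entirely
    // within those n bits, so the '+' below cannot carry and is equivalent
    // to '|'. Both expressions compute rotl(x, n).
    uint32_t rotl_via_add(uint32_t x, uint32_t n) {
      return (x << n) + (x >> (32u - n));
    }
    uint32_t rotl_via_or(uint32_t x, uint32_t n) {
      return (x << n) | (x >> (32u - n));
    }

This is why the updated tests collapse the shift/sub/add sequences into a single rotate instruction such as roll, ror, v_alignbit_b32, or shf.l.wrap.b32.)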
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index 3a6d3d178ff37..fd8ff039d1d0c 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -72,7 +72,7 @@ class Tosa_Attr traits = []>
 // feed numerical precision parameters to the functional implementation of TOSA
 // operators.
 // The functional behavior is defined in the TOSA specification maintained at
-// https://developer.mlplatform.org/w/tosa/. TOSA leverages MLIR's built in
+// https://www.mlplatform.org/tosa/tosa_spec.html. TOSA leverages MLIR's built in
 // quantization support: https://mlir.llvm.org/docs/Quantization/, and supports
 // uniform quantization. Depending on datatype, asymmetric and symmetric
 // quantization are supported. The types themselves are described in
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index cda75da57f1ad..741de84cc5840 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This file defines the operation set for the TOSA dialect as defined in
-// the TOSA specfication (https://developer.mlplatform.org/w/tosa/).
+// the TOSA specification (https://www.mlplatform.org/tosa/tosa_spec.html).
 //
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 8ae67a25498ad..c8e9ad8bd3346 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -8,7 +8,7 @@
 //
 // \file
 // This file implements the TOSA Specification:
-// https://developer.mlplatform.org/w/tosa/
+// https://www.mlplatform.org/tosa/tosa_spec.html
 //
 //===----------------------------------------------------------------------===//

From e3369a8dc9130ef261a092d866e8ba4f8242aa26 Mon Sep 17 00:00:00 2001
From: Ashley Coleman
Date: Fri, 4 Apr 2025 16:51:35 -0600
Subject: [PATCH 0708/1029] [NFC][HLSL] Rename ResourceBinding Types (#134165)

Non-functional change as first step in https://github.com/llvm/wg-hlsl/pull/207

Removes `Binding` from "Resource Instance" types
---
 llvm/include/llvm/Analysis/DXILResource.h     |  64 +++++-----
 llvm/include/llvm/InitializePasses.h          |   2 +-
 llvm/include/llvm/LinkAllPasses.h             |   2 +-
 llvm/lib/Analysis/Analysis.cpp                |   2 +-
 llvm/lib/Analysis/DXILResource.cpp            | 105 ++++++++-------
 llvm/lib/Passes/PassRegistry.def              |   5 +-
 .../lib/Target/DirectX/DXContainerGlobals.cpp |  34 +++--
 llvm/lib/Target/DirectX/DXILOpLowering.cpp    |  38 +++---
 llvm/lib/Target/DirectX/DXILPrepare.cpp       |   2 +-
 llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp |  46 +++----
 .../Target/DirectX/DXILTranslateMetadata.cpp  |  34 ++---
 .../DXILResource/buffer-frombinding.ll        |   2 +-
 llvm/unittests/Analysis/DXILResourceTest.cpp  | 120 +++++++++---------
 .../DirectX/UniqueResourceFromUseTests.cpp    |  18 +--
 14 files changed, 230 insertions(+), 244 deletions(-)

diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index d399457e16916..ff7961c9ad51c 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -326,7 +326,7 @@ class ResourceTypeInfo {

 //===----------------------------------------------------------------------===//

-class ResourceBindingInfo {
+class ResourceInfo {
 public:
   struct ResourceBinding {
uint32_t RecordID;
@@ -353,9 +353,9 @@ class ResourceBindingInfo {
   GlobalVariable *Symbol = nullptr;

 public:
-  ResourceBindingInfo(uint32_t RecordID, uint32_t Space, uint32_t LowerBound,
-                      uint32_t Size, TargetExtType *HandleTy,
-                      GlobalVariable *Symbol = nullptr)
+  ResourceInfo(uint32_t RecordID, uint32_t Space, uint32_t LowerBound,
+               uint32_t Size, TargetExtType *HandleTy,
+               GlobalVariable *Symbol = nullptr)
       : Binding{RecordID, Space, LowerBound, Size}, HandleTy(HandleTy),
         Symbol(Symbol) {}

@@ -372,14 +372,12 @@ class ResourceBindingInfo {
   std::pair getAnnotateProps(Module &M,
                              dxil::ResourceTypeInfo &RTI) const;

-  bool operator==(const ResourceBindingInfo &RHS) const {
+  bool operator==(const ResourceInfo &RHS) const {
     return std::tie(Binding, HandleTy, Symbol) ==
            std::tie(RHS.Binding, RHS.HandleTy, RHS.Symbol);
   }
-  bool operator!=(const ResourceBindingInfo &RHS) const {
-    return !(*this == RHS);
-  }
-  bool operator<(const ResourceBindingInfo &RHS) const {
+  bool operator!=(const ResourceInfo &RHS) const { return !(*this == RHS); }
+  bool operator<(const ResourceInfo &RHS) const {
     return Binding < RHS.Binding;
   }

@@ -440,8 +438,8 @@ ModulePass *createDXILResourceTypeWrapperPassPass();

 //===----------------------------------------------------------------------===//

-class DXILBindingMap {
-  SmallVector Infos;
+class DXILResourceMap {
+  SmallVector Infos;
   DenseMap CallMap;
   unsigned FirstUAV = 0;
   unsigned FirstCBuffer = 0;
@@ -451,8 +449,8 @@ class DXILBindingMap {
   void populate(Module &M, DXILResourceTypeMap &DRTM);

 public:
-  using iterator = SmallVector::iterator;
-  using const_iterator = SmallVector::const_iterator;
+  using iterator = SmallVector::iterator;
+  using const_iterator = SmallVector::const_iterator;

   iterator begin() { return Infos.begin(); }
   const_iterator begin() const { return Infos.begin(); }
@@ -466,12 +464,12 @@ class DXILBindingMap {
     return Pos == CallMap.end() ? Infos.end() : (Infos.begin() + Pos->second);
   }

-  /// Resolves a resource handle into a vector of ResourceBindingInfos that
+  /// Resolves a resource handle into a vector of ResourceInfos that
   /// represent the possible unique creations of the handle. Certain cases are
   /// ambiguous so multiple creation instructions may be returned. The resulting
-  /// ResourceBindingInfo can be used to depuplicate unique handles that
+  /// ResourceInfo can be used to deduplicate unique handles that
   /// reference the same resource
-  SmallVector findByUse(const Value *Key) const;
+  SmallVector findByUse(const Value *Key) const;

   const_iterator find(const CallInst *Key) const {
     auto Pos = CallMap.find(Key);
@@ -521,48 +519,46 @@ class DXILBindingMap {
   void print(raw_ostream &OS, DXILResourceTypeMap &DRTM,
              const DataLayout &DL) const;

-  friend class DXILResourceBindingAnalysis;
-  friend class DXILResourceBindingWrapperPass;
+  friend class DXILResourceAnalysis;
+  friend class DXILResourceWrapperPass;
 };

-class DXILResourceBindingAnalysis
-    : public AnalysisInfoMixin {
-  friend AnalysisInfoMixin;
+class DXILResourceAnalysis : public AnalysisInfoMixin {
+  friend AnalysisInfoMixin;
   static AnalysisKey Key;

 public:
-  using Result = DXILBindingMap;
+  using Result = DXILResourceMap;

   /// Gather resource info for the module \c M.
-  DXILBindingMap run(Module &M, ModuleAnalysisManager &AM);
+  DXILResourceMap run(Module &M, ModuleAnalysisManager &AM);
 };

-/// Printer pass for the \c DXILResourceBindingAnalysis results.
-class DXILResourceBindingPrinterPass
-    : public PassInfoMixin {
+/// Printer pass for the \c DXILResourceAnalysis results.
+class DXILResourcePrinterPass : public PassInfoMixin { raw_ostream &OS; public: - explicit DXILResourceBindingPrinterPass(raw_ostream &OS) : OS(OS) {} + explicit DXILResourcePrinterPass(raw_ostream &OS) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); static bool isRequired() { return true; } }; -class DXILResourceBindingWrapperPass : public ModulePass { - std::unique_ptr Map; +class DXILResourceWrapperPass : public ModulePass { + std::unique_ptr Map; DXILResourceTypeMap *DRTM; public: static char ID; // Class identification, replacement for typeinfo - DXILResourceBindingWrapperPass(); - ~DXILResourceBindingWrapperPass() override; + DXILResourceWrapperPass(); + ~DXILResourceWrapperPass() override; - const DXILBindingMap &getBindingMap() const { return *Map; } - DXILBindingMap &getBindingMap() { return *Map; } + const DXILResourceMap &getBindingMap() const { return *Map; } + DXILResourceMap &getBindingMap() { return *Map; } void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnModule(Module &M) override; @@ -572,7 +568,7 @@ class DXILResourceBindingWrapperPass : public ModulePass { void dump() const; }; -ModulePass *createDXILResourceBindingWrapperPassPass(); +ModulePass *createDXILResourceWrapperPassPass(); } // namespace llvm diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index fb27867176788..c6110aa298893 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -84,7 +84,7 @@ void initializeDAHPass(PassRegistry &); void initializeDCELegacyPassPass(PassRegistry &); void initializeDXILMetadataAnalysisWrapperPassPass(PassRegistry &); void initializeDXILMetadataAnalysisWrapperPrinterPass(PassRegistry &); -void initializeDXILResourceBindingWrapperPassPass(PassRegistry &); +void initializeDXILResourceWrapperPassPass(PassRegistry &); void initializeDXILResourceTypeWrapperPassPass(PassRegistry &); void initializeDeadMachineInstructionElimPass(PassRegistry &); void initializeDebugifyMachineModulePass(PassRegistry &); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index ac1970334de0c..5965be676ea69 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -70,7 +70,7 @@ struct ForcePassLinking { (void)llvm::createCallGraphViewerPass(); (void)llvm::createCFGSimplificationPass(); (void)llvm::createStructurizeCFGPass(); - (void)llvm::createDXILResourceBindingWrapperPassPass(); + (void)llvm::createDXILResourceWrapperPassPass(); (void)llvm::createDXILResourceTypeWrapperPassPass(); (void)llvm::createDeadArgEliminationPass(); (void)llvm::createDeadCodeEliminationPass(); diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index bc2b8a57f83a7..5cd5115bc4cd4 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -25,7 +25,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeCallGraphDOTPrinterPass(Registry); initializeCallGraphViewerPass(Registry); initializeCycleInfoWrapperPassPass(Registry); - initializeDXILResourceBindingWrapperPassPass(Registry); + initializeDXILResourceWrapperPassPass(Registry); initializeDXILResourceTypeWrapperPassPass(Registry); initializeDependenceAnalysisWrapperPassPass(Registry); initializeDominanceFrontierWrapperPassPass(Registry); diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 22afb4cba6f26..b4bd43206b6e4 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ 
b/llvm/lib/Analysis/DXILResource.cpp @@ -531,8 +531,8 @@ void ResourceTypeInfo::print(raw_ostream &OS, const DataLayout &DL) const { } } -GlobalVariable *ResourceBindingInfo::createSymbol(Module &M, StructType *Ty, - StringRef Name) { +GlobalVariable *ResourceInfo::createSymbol(Module &M, StructType *Ty, + StringRef Name) { assert(!Symbol && "Symbol has already been created"); Symbol = new GlobalVariable(M, Ty, /*isConstant=*/true, GlobalValue::ExternalLinkage, @@ -540,8 +540,8 @@ GlobalVariable *ResourceBindingInfo::createSymbol(Module &M, StructType *Ty, return Symbol; } -MDTuple *ResourceBindingInfo::getAsMetadata(Module &M, - dxil::ResourceTypeInfo &RTI) const { +MDTuple *ResourceInfo::getAsMetadata(Module &M, + dxil::ResourceTypeInfo &RTI) const { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -610,8 +610,7 @@ MDTuple *ResourceBindingInfo::getAsMetadata(Module &M, } std::pair -ResourceBindingInfo::getAnnotateProps(Module &M, - dxil::ResourceTypeInfo &RTI) const { +ResourceInfo::getAnnotateProps(Module &M, dxil::ResourceTypeInfo &RTI) const { const DataLayout &DL = M.getDataLayout(); uint32_t ResourceKind = llvm::to_underlying(RTI.getResourceKind()); @@ -658,8 +657,8 @@ ResourceBindingInfo::getAnnotateProps(Module &M, return {Word0, Word1}; } -void ResourceBindingInfo::print(raw_ostream &OS, dxil::ResourceTypeInfo &RTI, - const DataLayout &DL) const { +void ResourceInfo::print(raw_ostream &OS, dxil::ResourceTypeInfo &RTI, + const DataLayout &DL) const { if (Symbol) { OS << " Symbol: "; Symbol->printAsOperand(OS); @@ -686,9 +685,8 @@ bool DXILResourceTypeMap::invalidate(Module &M, const PreservedAnalyses &PA, //===----------------------------------------------------------------------===// -void DXILBindingMap::populate(Module &M, DXILResourceTypeMap &DRTM) { - SmallVector> - CIToInfos; +void DXILResourceMap::populate(Module &M, DXILResourceTypeMap &DRTM) { + SmallVector> CIToInfos; for (Function &F : M.functions()) { if (!F.isDeclaration()) @@ -711,10 +709,10 @@ void DXILBindingMap::populate(Module &M, DXILResourceTypeMap &DRTM) { cast(CI->getArgOperand(1))->getZExtValue(); uint32_t Size = cast(CI->getArgOperand(2))->getZExtValue(); - ResourceBindingInfo RBI = ResourceBindingInfo{ - /*RecordID=*/0, Space, LowerBound, Size, HandleTy}; + ResourceInfo RI = + ResourceInfo{/*RecordID=*/0, Space, LowerBound, Size, HandleTy}; - CIToInfos.emplace_back(CI, RBI, RTI); + CIToInfos.emplace_back(CI, RI, RTI); } break; @@ -723,18 +721,18 @@ void DXILBindingMap::populate(Module &M, DXILResourceTypeMap &DRTM) { } llvm::stable_sort(CIToInfos, [](auto &LHS, auto &RHS) { - const auto &[LCI, LRBI, LRTI] = LHS; - const auto &[RCI, RRBI, RRTI] = RHS; + const auto &[LCI, LRI, LRTI] = LHS; + const auto &[RCI, RRI, RRTI] = RHS; // Sort by resource class first for grouping purposes, and then by the // binding and type so we can remove duplicates. 
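    // (Illustrative note, not from the original patch: this ordering makes
    // handles with identical binding and type adjacent, which is what lets
    // the loop below deduplicate them with a single comparison against
    // Infos.back().)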
ResourceClass LRC = LRTI.getResourceClass(); ResourceClass RRC = RRTI.getResourceClass(); - return std::tie(LRC, LRBI, LRTI) < std::tie(RRC, RRBI, RRTI); + return std::tie(LRC, LRI, LRTI) < std::tie(RRC, RRI, RRTI); }); - for (auto [CI, RBI, RTI] : CIToInfos) { - if (Infos.empty() || RBI != Infos.back()) - Infos.push_back(RBI); + for (auto [CI, RI, RTI] : CIToInfos) { + if (Infos.empty() || RI != Infos.back()) + Infos.push_back(RI); CallMap[CI] = Infos.size() - 1; } @@ -743,8 +741,8 @@ void DXILBindingMap::populate(Module &M, DXILResourceTypeMap &DRTM) { FirstUAV = FirstCBuffer = FirstSampler = Size; uint32_t NextID = 0; for (unsigned I = 0, E = Size; I != E; ++I) { - ResourceBindingInfo &RBI = Infos[I]; - ResourceTypeInfo &RTI = DRTM[RBI.getHandleTy()]; + ResourceInfo &RI = Infos[I]; + ResourceTypeInfo &RTI = DRTM[RI.getHandleTy()]; if (RTI.isUAV() && FirstUAV == Size) { FirstUAV = I; NextID = 0; @@ -762,16 +760,16 @@ void DXILBindingMap::populate(Module &M, DXILResourceTypeMap &DRTM) { FirstUAV = std::min({FirstUAV, FirstCBuffer}); // Adjust the resource binding to use the next ID. - RBI.setBindingID(NextID++); + RI.setBindingID(NextID++); } } -void DXILBindingMap::print(raw_ostream &OS, DXILResourceTypeMap &DRTM, - const DataLayout &DL) const { +void DXILResourceMap::print(raw_ostream &OS, DXILResourceTypeMap &DRTM, + const DataLayout &DL) const { for (unsigned I = 0, E = Infos.size(); I != E; ++I) { OS << "Binding " << I << ":\n"; - const dxil::ResourceBindingInfo &RBI = Infos[I]; - RBI.print(OS, DRTM[RBI.getHandleTy()], DL); + const dxil::ResourceInfo &RI = Infos[I]; + RI.print(OS, DRTM[RI.getHandleTy()], DL); OS << "\n"; } @@ -782,10 +780,10 @@ void DXILBindingMap::print(raw_ostream &OS, DXILResourceTypeMap &DRTM, } } -SmallVector -DXILBindingMap::findByUse(const Value *Key) const { +SmallVector +DXILResourceMap::findByUse(const Value *Key) const { if (const PHINode *Phi = dyn_cast(Key)) { - SmallVector Children; + SmallVector Children; for (const Value *V : Phi->operands()) { Children.append(findByUse(V)); } @@ -810,7 +808,7 @@ DXILBindingMap::findByUse(const Value *Key) const { // Check if any of the parameters are the resource we are following. If so // keep searching. 
If none of them are return an empty list const Type *UseType = CI->getType(); - SmallVector Children; + SmallVector Children; for (const Value *V : CI->args()) { if (V->getType() != UseType) continue; @@ -824,22 +822,22 @@ DXILBindingMap::findByUse(const Value *Key) const { //===----------------------------------------------------------------------===// AnalysisKey DXILResourceTypeAnalysis::Key; -AnalysisKey DXILResourceBindingAnalysis::Key; +AnalysisKey DXILResourceAnalysis::Key; -DXILBindingMap DXILResourceBindingAnalysis::run(Module &M, - ModuleAnalysisManager &AM) { - DXILBindingMap Data; +DXILResourceMap DXILResourceAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { + DXILResourceMap Data; DXILResourceTypeMap &DRTM = AM.getResult(M); Data.populate(M, DRTM); return Data; } -PreservedAnalyses -DXILResourceBindingPrinterPass::run(Module &M, ModuleAnalysisManager &AM) { - DXILBindingMap &DBM = AM.getResult(M); +PreservedAnalyses DXILResourcePrinterPass::run(Module &M, + ModuleAnalysisManager &AM) { + DXILResourceMap &DRM = AM.getResult(M); DXILResourceTypeMap &DRTM = AM.getResult(M); - DBM.print(OS, DRTM, M.getDataLayout()); + DRM.print(OS, DRTM, M.getDataLayout()); return PreservedAnalyses::all(); } @@ -857,21 +855,19 @@ ModulePass *llvm::createDXILResourceTypeWrapperPassPass() { return new DXILResourceTypeWrapperPass(); } -DXILResourceBindingWrapperPass::DXILResourceBindingWrapperPass() - : ModulePass(ID) { - initializeDXILResourceBindingWrapperPassPass( - *PassRegistry::getPassRegistry()); +DXILResourceWrapperPass::DXILResourceWrapperPass() : ModulePass(ID) { + initializeDXILResourceWrapperPassPass(*PassRegistry::getPassRegistry()); } -DXILResourceBindingWrapperPass::~DXILResourceBindingWrapperPass() = default; +DXILResourceWrapperPass::~DXILResourceWrapperPass() = default; -void DXILResourceBindingWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { +void DXILResourceWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredTransitive(); AU.setPreservesAll(); } -bool DXILResourceBindingWrapperPass::runOnModule(Module &M) { - Map.reset(new DXILBindingMap()); +bool DXILResourceWrapperPass::runOnModule(Module &M) { + Map.reset(new DXILResourceMap()); DRTM = &getAnalysis().getResourceTypeMap(); Map->populate(M, *DRTM); @@ -879,10 +875,9 @@ bool DXILResourceBindingWrapperPass::runOnModule(Module &M) { return false; } -void DXILResourceBindingWrapperPass::releaseMemory() { Map.reset(); } +void DXILResourceWrapperPass::releaseMemory() { Map.reset(); } -void DXILResourceBindingWrapperPass::print(raw_ostream &OS, - const Module *M) const { +void DXILResourceWrapperPass::print(raw_ostream &OS, const Module *M) const { if (!Map) { OS << "No resource map has been built!\n"; return; @@ -892,13 +887,13 @@ void DXILResourceBindingWrapperPass::print(raw_ostream &OS, #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD -void DXILResourceBindingWrapperPass::dump() const { print(dbgs(), nullptr); } +void DXILResourceWrapperPass::dump() const { print(dbgs(), nullptr); } #endif -INITIALIZE_PASS(DXILResourceBindingWrapperPass, "dxil-resource-binding", +INITIALIZE_PASS(DXILResourceWrapperPass, "dxil-resources", "DXIL Resource Binding Analysis", false, true) -char DXILResourceBindingWrapperPass::ID = 0; +char DXILResourceWrapperPass::ID = 0; -ModulePass *llvm::createDXILResourceBindingWrapperPassPass() { - return new DXILResourceBindingWrapperPass(); +ModulePass *llvm::createDXILResourceWrapperPassPass() { + return new DXILResourceWrapperPass(); } diff --git 
a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 510a505995304..0def3304343eb 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -22,7 +22,7 @@ MODULE_ANALYSIS("callgraph", CallGraphAnalysis()) MODULE_ANALYSIS("collector-metadata", CollectorMetadataAnalysis()) MODULE_ANALYSIS("ctx-prof-analysis", CtxProfAnalysis()) MODULE_ANALYSIS("dxil-metadata", DXILMetadataAnalysis()) -MODULE_ANALYSIS("dxil-resource-binding", DXILResourceBindingAnalysis()) +MODULE_ANALYSIS("dxil-resources", DXILResourceAnalysis()) MODULE_ANALYSIS("dxil-resource-type", DXILResourceTypeAnalysis()) MODULE_ANALYSIS("inline-advisor", InlineAdvisorAnalysis()) MODULE_ANALYSIS("ir-similarity", IRSimilarityAnalysis()) @@ -128,8 +128,7 @@ MODULE_PASS("print-must-be-executed-contexts", MODULE_PASS("print-profile-summary", ProfileSummaryPrinterPass(errs())) MODULE_PASS("print-stack-safety", StackSafetyGlobalPrinterPass(errs())) MODULE_PASS("print", DXILMetadataAnalysisPrinterPass(errs())) -MODULE_PASS("print", - DXILResourceBindingPrinterPass(errs())) +MODULE_PASS("print", DXILResourcePrinterPass(errs())) MODULE_PASS("print", InlineAdvisorAnalysisPrinterPass(errs())) MODULE_PASS("print", ModuleDebugInfoPrinterPass(errs())) MODULE_PASS("print", PhysicalRegisterUsageInfoPrinterPass(errs())) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index c7a130a1f9c8a..5279847419a81 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -67,7 +67,7 @@ class DXContainerGlobals : public llvm::ModulePass { AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); } }; @@ -181,13 +181,13 @@ void DXContainerGlobals::addRootSignature(Module &M, } void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { - const DXILBindingMap &DBM = - getAnalysis().getBindingMap(); + const DXILResourceMap &DRM = + getAnalysis().getBindingMap(); DXILResourceTypeMap &DRTM = getAnalysis().getResourceTypeMap(); auto MakeBinding = - [](const dxil::ResourceBindingInfo::ResourceBinding &Binding, + [](const dxil::ResourceInfo::ResourceBinding &Binding, const dxbc::PSV::ResourceType Type, const dxil::ResourceKind Kind, const dxbc::PSV::ResourceFlags Flags = dxbc::PSV::ResourceFlags()) { dxbc::PSV::v2::ResourceBindInfo BindInfo; @@ -200,24 +200,21 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { return BindInfo; }; - for (const dxil::ResourceBindingInfo &RBI : DBM.cbuffers()) { - const dxil::ResourceBindingInfo::ResourceBinding &Binding = - RBI.getBinding(); + for (const dxil::ResourceInfo &RI : DRM.cbuffers()) { + const dxil::ResourceInfo::ResourceBinding &Binding = RI.getBinding(); PSV.Resources.push_back(MakeBinding(Binding, dxbc::PSV::ResourceType::CBV, dxil::ResourceKind::CBuffer)); } - for (const dxil::ResourceBindingInfo &RBI : DBM.samplers()) { - const dxil::ResourceBindingInfo::ResourceBinding &Binding = - RBI.getBinding(); + for (const dxil::ResourceInfo &RI : DRM.samplers()) { + const dxil::ResourceInfo::ResourceBinding &Binding = RI.getBinding(); PSV.Resources.push_back(MakeBinding(Binding, dxbc::PSV::ResourceType::Sampler, dxil::ResourceKind::Sampler)); } - for (const dxil::ResourceBindingInfo &RBI : DBM.srvs()) { - const dxil::ResourceBindingInfo::ResourceBinding &Binding = - RBI.getBinding(); + for (const dxil::ResourceInfo &RI : DRM.srvs()) { + const dxil::ResourceInfo::ResourceBinding 
&Binding = RI.getBinding(); - dxil::ResourceTypeInfo &TypeInfo = DRTM[RBI.getHandleTy()]; + dxil::ResourceTypeInfo &TypeInfo = DRTM[RI.getHandleTy()]; dxbc::PSV::ResourceType ResType; if (TypeInfo.isStruct()) ResType = dxbc::PSV::ResourceType::SRVStructured; @@ -229,11 +226,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { PSV.Resources.push_back( MakeBinding(Binding, ResType, TypeInfo.getResourceKind())); } - for (const dxil::ResourceBindingInfo &RBI : DBM.uavs()) { - const dxil::ResourceBindingInfo::ResourceBinding &Binding = - RBI.getBinding(); + for (const dxil::ResourceInfo &RI : DRM.uavs()) { + const dxil::ResourceInfo::ResourceBinding &Binding = RI.getBinding(); - dxil::ResourceTypeInfo &TypeInfo = DRTM[RBI.getHandleTy()]; + dxil::ResourceTypeInfo &TypeInfo = DRTM[RI.getHandleTy()]; dxbc::PSV::ResourceType ResType; if (TypeInfo.getUAV().HasCounter) ResType = dxbc::PSV::ResourceType::UAVStructuredWithCounter; @@ -302,7 +298,7 @@ INITIALIZE_PASS_BEGIN(DXContainerGlobals, "dxil-globals", INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_END(DXContainerGlobals, "dxil-globals", "DXContainer Global Emitter", false, true) diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 3dcd3d8fd244a..41a9426998826 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -83,13 +83,13 @@ namespace { class OpLowerer { Module &M; DXILOpBuilder OpBuilder; - DXILBindingMap &DBM; + DXILResourceMap &DRM; DXILResourceTypeMap &DRTM; SmallVector CleanupCasts; public: - OpLowerer(Module &M, DXILBindingMap &DBM, DXILResourceTypeMap &DRTM) - : M(M), OpBuilder(M), DBM(DBM), DRTM(DRTM) {} + OpLowerer(Module &M, DXILResourceMap &DRM, DXILResourceTypeMap &DRTM) + : M(M), OpBuilder(M), DRM(DRM), DRTM(DRTM) {} /// Replace every call to \c F using \c ReplaceCall, and then erase \c F. If /// there is an error replacing a call, we emit a diagnostic and return true. @@ -277,9 +277,9 @@ class OpLowerer { return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); - auto *It = DBM.find(CI); - assert(It != DBM.end() && "Resource not in map?"); - dxil::ResourceBindingInfo &RI = *It; + auto *It = DRM.find(CI); + assert(It != DRM.end() && "Resource not in map?"); + dxil::ResourceInfo &RI = *It; const auto &Binding = RI.getBinding(); dxil::ResourceClass RC = DRTM[RI.getHandleTy()].getResourceClass(); @@ -315,9 +315,9 @@ class OpLowerer { return replaceFunction(F, [&](CallInst *CI) -> Error { IRB.SetInsertPoint(CI); - auto *It = DBM.find(CI); - assert(It != DBM.end() && "Resource not in map?"); - dxil::ResourceBindingInfo &RI = *It; + auto *It = DRM.find(CI); + assert(It != DRM.end() && "Resource not in map?"); + dxil::ResourceInfo &RI = *It; const auto &Binding = RI.getBinding(); dxil::ResourceTypeInfo &RTI = DRTM[RI.getHandleTy()]; @@ -366,7 +366,7 @@ class OpLowerer { /// Lower `dx.resource.handlefrombinding` intrinsics depending on the shader /// model and taking into account binding information from - /// DXILResourceBindingAnalysis. + /// DXILResourceAnalysis. 
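  /// (Sketch for orientation, not from the original patch: on DXIL 1.6 and
  /// newer this lowers a call such as
  ///   %h = call ... @llvm.dx.resource.handlefrombinding(...)
  /// to dx.op.createHandleFromBinding plus dx.op.annotateHandle, using the
  /// binding entry resolved through DRM.find(CI) as in the hunks above;
  /// older shader models fall back to dx.op.createHandle.)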
bool lowerHandleFromBinding(Function &F) { const Triple &TT = M.getTargetTriple(); if (TT.getDXILVersion() < VersionTuple(1, 6)) @@ -867,14 +867,14 @@ class OpLowerer { } // namespace PreservedAnalyses DXILOpLowering::run(Module &M, ModuleAnalysisManager &MAM) { - DXILBindingMap &DBM = MAM.getResult(M); + DXILResourceMap &DRM = MAM.getResult(M); DXILResourceTypeMap &DRTM = MAM.getResult(M); - bool MadeChanges = OpLowerer(M, DBM, DRTM).lowerIntrinsics(); + bool MadeChanges = OpLowerer(M, DRM, DRTM).lowerIntrinsics(); if (!MadeChanges) return PreservedAnalyses::all(); PreservedAnalyses PA; - PA.preserve(); + PA.preserve(); PA.preserve(); PA.preserve(); return PA; @@ -884,12 +884,12 @@ namespace { class DXILOpLoweringLegacy : public ModulePass { public: bool runOnModule(Module &M) override { - DXILBindingMap &DBM = - getAnalysis().getBindingMap(); + DXILResourceMap &DRM = + getAnalysis().getBindingMap(); DXILResourceTypeMap &DRTM = getAnalysis().getResourceTypeMap(); - return OpLowerer(M, DBM, DRTM).lowerIntrinsics(); + return OpLowerer(M, DRM, DRTM).lowerIntrinsics(); } StringRef getPassName() const override { return "DXIL Op Lowering"; } DXILOpLoweringLegacy() : ModulePass(ID) {} @@ -897,8 +897,8 @@ class DXILOpLoweringLegacy : public ModulePass { static char ID; // Pass identification. void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); } @@ -909,7 +909,7 @@ char DXILOpLoweringLegacy::ID = 0; INITIALIZE_PASS_BEGIN(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering", false, false) INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_END(DXILOpLoweringLegacy, DEBUG_TYPE, "DXIL Op Lowering", false, false) diff --git a/llvm/lib/Target/DirectX/DXILPrepare.cpp b/llvm/lib/Target/DirectX/DXILPrepare.cpp index 0014cc9e1f67c..de97de209048b 100644 --- a/llvm/lib/Target/DirectX/DXILPrepare.cpp +++ b/llvm/lib/Target/DirectX/DXILPrepare.cpp @@ -248,7 +248,7 @@ class DXILPrepareModule : public ModulePass { AU.addRequired(); AU.addPreserved(); AU.addPreserved(); - AU.addPreserved(); + AU.addPreserved(); } static char ID; // Pass identification. 
}; diff --git a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp index 4d1832f44bb63..cf3fb34bba437 100644 --- a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp +++ b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp @@ -174,12 +174,12 @@ struct FormatResourceDimension }; struct FormatBindingID - : public llvm::FormatAdapter { + : public llvm::FormatAdapter { dxil::ResourceClass RC; - explicit FormatBindingID(const dxil::ResourceBindingInfo &RBI, + explicit FormatBindingID(const dxil::ResourceInfo &RI, const dxil::ResourceTypeInfo &RTI) - : llvm::FormatAdapter(RBI), + : llvm::FormatAdapter(RI), RC(RTI.getResourceClass()) {} void format(llvm::raw_ostream &OS, StringRef Style) override { @@ -188,12 +188,12 @@ struct FormatBindingID }; struct FormatBindingLocation - : public llvm::FormatAdapter { + : public llvm::FormatAdapter { dxil::ResourceClass RC; - explicit FormatBindingLocation(const dxil::ResourceBindingInfo &RBI, + explicit FormatBindingLocation(const dxil::ResourceInfo &RI, const dxil::ResourceTypeInfo &RTI) - : llvm::FormatAdapter(RBI), + : llvm::FormatAdapter(RI), RC(RTI.getResourceClass()) {} void format(llvm::raw_ostream &OS, StringRef Style) override { @@ -205,9 +205,9 @@ struct FormatBindingLocation }; struct FormatBindingSize - : public llvm::FormatAdapter { - explicit FormatBindingSize(const dxil::ResourceBindingInfo &RI) - : llvm::FormatAdapter(RI) {} + : public llvm::FormatAdapter { + explicit FormatBindingSize(const dxil::ResourceInfo &RI) + : llvm::FormatAdapter(RI) {} void format(llvm::raw_ostream &OS, StringRef Style) override { uint32_t Size = Item.getBinding().Size; @@ -220,7 +220,7 @@ struct FormatBindingSize } // namespace -static void prettyPrintResources(raw_ostream &OS, const DXILBindingMap &DBM, +static void prettyPrintResources(raw_ostream &OS, const DXILResourceMap &DRM, DXILResourceTypeMap &DRTM) { // Column widths are arbitrary but match the widths DXC uses. OS << ";\n; Resource Bindings:\n;\n"; @@ -231,17 +231,17 @@ static void prettyPrintResources(raw_ostream &OS, const DXILBindingMap &DBM, "", "", "", "", ""); // TODO: Do we want to sort these by binding or something like that? 
- for (const dxil::ResourceBindingInfo &RBI : DBM) { - const dxil::ResourceTypeInfo &RTI = DRTM[RBI.getHandleTy()]; + for (const dxil::ResourceInfo &RI : DRM) { + const dxil::ResourceTypeInfo &RTI = DRTM[RI.getHandleTy()]; dxil::ResourceClass RC = RTI.getResourceClass(); - StringRef Name(RBI.getName()); + StringRef Name(RI.getName()); StringRef Type(getRCName(RC)); StringRef Format(getFormatName(RTI)); FormatResourceDimension Dim(RTI); - FormatBindingID ID(RBI, RTI); - FormatBindingLocation Bind(RBI, RTI); - FormatBindingSize Count(RBI); + FormatBindingID ID(RI, RTI); + FormatBindingLocation Bind(RI, RTI); + FormatBindingSize Count(RI); OS << formatv("; {0,-30} {1,10} {2,7} {3,11} {4,7} {5,14} {6,9}\n", Name, Type, Format, Dim, ID, Bind, Count); } @@ -250,9 +250,9 @@ static void prettyPrintResources(raw_ostream &OS, const DXILBindingMap &DBM, PreservedAnalyses DXILPrettyPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) { - const DXILBindingMap &DBM = MAM.getResult(M); + const DXILResourceMap &DRM = MAM.getResult(M); DXILResourceTypeMap &DRTM = MAM.getResult(M); - prettyPrintResources(OS, DBM, DRTM); + prettyPrintResources(OS, DRM, DRTM); return PreservedAnalyses::all(); } @@ -278,7 +278,7 @@ class DXILPrettyPrinterLegacy : public llvm::ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); AU.addRequired(); - AU.addRequired(); + AU.addRequired(); } }; } // namespace @@ -287,16 +287,16 @@ char DXILPrettyPrinterLegacy::ID = 0; INITIALIZE_PASS_BEGIN(DXILPrettyPrinterLegacy, "dxil-pretty-printer", "DXIL Metadata Pretty Printer", true, true) INITIALIZE_PASS_DEPENDENCY(DXILResourceTypeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_END(DXILPrettyPrinterLegacy, "dxil-pretty-printer", "DXIL Metadata Pretty Printer", true, true) bool DXILPrettyPrinterLegacy::runOnModule(Module &M) { - const DXILBindingMap &DBM = - getAnalysis().getBindingMap(); + const DXILResourceMap &DRM = + getAnalysis().getBindingMap(); DXILResourceTypeMap &DRTM = getAnalysis().getResourceTypeMap(); - prettyPrintResources(OS, DBM, DRTM); + prettyPrintResources(OS, DRM, DRTM); return false; } diff --git a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp index ed510c5856bf6..e177fcb0520e8 100644 --- a/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp +++ b/llvm/lib/Target/DirectX/DXILTranslateMetadata.cpp @@ -72,22 +72,22 @@ enum class EntryPropsTag { } // namespace -static NamedMDNode *emitResourceMetadata(Module &M, DXILBindingMap &DBM, +static NamedMDNode *emitResourceMetadata(Module &M, DXILResourceMap &DRM, DXILResourceTypeMap &DRTM) { LLVMContext &Context = M.getContext(); - for (ResourceBindingInfo &RI : DBM) + for (ResourceInfo &RI : DRM) if (!RI.hasSymbol()) RI.createSymbol(M, DRTM[RI.getHandleTy()].createElementStruct()); SmallVector SRVs, UAVs, CBufs, Smps; - for (const ResourceBindingInfo &RI : DBM.srvs()) + for (const ResourceInfo &RI : DRM.srvs()) SRVs.push_back(RI.getAsMetadata(M, DRTM[RI.getHandleTy()])); - for (const ResourceBindingInfo &RI : DBM.uavs()) + for (const ResourceInfo &RI : DRM.uavs()) UAVs.push_back(RI.getAsMetadata(M, DRTM[RI.getHandleTy()])); - for (const ResourceBindingInfo &RI : DBM.cbuffers()) + for (const ResourceInfo &RI : DRM.cbuffers()) CBufs.push_back(RI.getAsMetadata(M, DRTM[RI.getHandleTy()])); - for (const ResourceBindingInfo &RI : DBM.samplers()) + for (const ResourceInfo &RI : DRM.samplers()) 
Smps.push_back(RI.getAsMetadata(M, DRTM[RI.getHandleTy()])); Metadata *SRVMD = SRVs.empty() ? nullptr : MDNode::get(Context, SRVs); @@ -95,7 +95,7 @@ static NamedMDNode *emitResourceMetadata(Module &M, DXILBindingMap &DBM, Metadata *CBufMD = CBufs.empty() ? nullptr : MDNode::get(Context, CBufs); Metadata *SmpMD = Smps.empty() ? nullptr : MDNode::get(Context, Smps); - if (DBM.empty()) + if (DRM.empty()) return nullptr; NamedMDNode *ResourceMD = M.getOrInsertNamedMetadata("dx.resources"); @@ -318,7 +318,7 @@ static void translateBranchMetadata(Module &M) { } } -static void translateMetadata(Module &M, DXILBindingMap &DBM, +static void translateMetadata(Module &M, DXILResourceMap &DRM, DXILResourceTypeMap &DRTM, const ModuleShaderFlags &ShaderFlags, const ModuleMetadataInfo &MMDI) { @@ -329,7 +329,7 @@ static void translateMetadata(Module &M, DXILBindingMap &DBM, emitValidatorVersionMD(M, MMDI); emitShaderModelVersionMD(M, MMDI); emitDXILVersionTupleMD(M, MMDI); - NamedMDNode *NamedResourceMD = emitResourceMetadata(M, DBM, DRTM); + NamedMDNode *NamedResourceMD = emitResourceMetadata(M, DRM, DRTM); auto *ResourceMD = (NamedResourceMD != nullptr) ? NamedResourceMD->getOperand(0) : nullptr; // FIXME: Add support to construct Signatures @@ -381,12 +381,12 @@ static void translateMetadata(Module &M, DXILBindingMap &DBM, PreservedAnalyses DXILTranslateMetadata::run(Module &M, ModuleAnalysisManager &MAM) { - DXILBindingMap &DBM = MAM.getResult(M); + DXILResourceMap &DRM = MAM.getResult(M); DXILResourceTypeMap &DRTM = MAM.getResult(M); const ModuleShaderFlags &ShaderFlags = MAM.getResult(M); const dxil::ModuleMetadataInfo MMDI = MAM.getResult(M); - translateMetadata(M, DBM, DRTM, ShaderFlags, MMDI); + translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); translateBranchMetadata(M); return PreservedAnalyses::all(); @@ -402,17 +402,17 @@ class DXILTranslateMetadataLegacy : public ModulePass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addPreserved(); + AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); } bool runOnModule(Module &M) override { - DXILBindingMap &DBM = - getAnalysis().getBindingMap(); + DXILResourceMap &DRM = + getAnalysis().getBindingMap(); DXILResourceTypeMap &DRTM = getAnalysis().getResourceTypeMap(); const ModuleShaderFlags &ShaderFlags = @@ -420,7 +420,7 @@ class DXILTranslateMetadataLegacy : public ModulePass { dxil::ModuleMetadataInfo MMDI = getAnalysis().getModuleMetadata(); - translateMetadata(M, DBM, DRTM, ShaderFlags, MMDI); + translateMetadata(M, DRM, DRTM, ShaderFlags, MMDI); translateBranchMetadata(M); return true; } @@ -436,7 +436,7 @@ ModulePass *llvm::createDXILTranslateMetadataLegacyPass() { INITIALIZE_PASS_BEGIN(DXILTranslateMetadataLegacy, "dxil-translate-metadata", "DXIL Translate Metadata", false, false) -INITIALIZE_PASS_DEPENDENCY(DXILResourceBindingWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DXILResourceWrapperPass) INITIALIZE_PASS_DEPENDENCY(ShaderFlagsAnalysisWrapper) INITIALIZE_PASS_DEPENDENCY(DXILMetadataAnalysisWrapperPass) INITIALIZE_PASS_END(DXILTranslateMetadataLegacy, "dxil-translate-metadata", diff --git a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll index a416124221dcb..9b4d7722b72ac 100644 --- a/llvm/test/Analysis/DXILResource/buffer-frombinding.ll +++ b/llvm/test/Analysis/DXILResource/buffer-frombinding.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -disable-output -passes="print" < %s 
2>&1 | FileCheck %s +; RUN: opt -S -disable-output -passes="print" < %s 2>&1 | FileCheck %s @G = external constant <4 x float>, align 4 diff --git a/llvm/unittests/Analysis/DXILResourceTest.cpp b/llvm/unittests/Analysis/DXILResourceTest.cpp index b0f6b631a0339..66549067a354c 100644 --- a/llvm/unittests/Analysis/DXILResourceTest.cpp +++ b/llvm/unittests/Analysis/DXILResourceTest.cpp @@ -93,14 +93,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getResourceClass(), ResourceClass::SRV); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::RawBuffer); - ResourceBindingInfo RBI( + ResourceInfo RI( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GlobalVariable *GV = RBI.createSymbol(M, RTI.createElementStruct(), "Buffer"); - std::pair Props = RBI.getAnnotateProps(M, RTI); + GlobalVariable *GV = RI.createSymbol(M, RTI.createElementStruct(), "Buffer"); + std::pair Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000000bU); EXPECT_EQ(Props.second, 0U); - MDTuple *MD = RBI.getAsMetadata(M, RTI); + MDTuple *MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "Buffer", 0, 0, 1, 11, 0, nullptr)); // RWByteAddressBuffer BufferOut : register(u3, space2); @@ -112,14 +112,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getUAV().IsROV, false); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::RawBuffer); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/1, /*Space=*/2, /*LowerBound=*/3, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "BufferOut"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "BufferOut"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000100bU); EXPECT_EQ(Props.second, 0U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(1, GV, "BufferOut", 2, 3, 1, 11, false, false, false, nullptr)); @@ -135,14 +135,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getStruct(DL).AlignLog2, Log2(Align(8))); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::StructuredBuffer); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "Buffer0"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "Buffer0"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000030cU); EXPECT_EQ(Props.second, 0x00000010U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "Buffer0", 0, 0, 1, 12, 0, TestMD.get(1, 16))); @@ -155,14 +155,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getStruct(DL).AlignLog2, 0u); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::StructuredBuffer); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/1, /*Space=*/0, /*LowerBound=*/1, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "Buffer1"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "Buffer1"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000000cU); EXPECT_EQ(Props.second, 0x0000000cU); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(1, GV, "Buffer1", 0, 1, 1, 12, 0, TestMD.get(1, 12))); @@ -177,14 +177,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { 
EXPECT_EQ(RTI.getTyped().ElementCount, 4u); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::Texture2D); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/2, /*Space=*/0, /*LowerBound=*/2, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "ColorMapTexture"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "ColorMapTexture"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x00000002U); EXPECT_EQ(Props.second, 0x00000409U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(2, GV, "ColorMapTexture", 0, 2, 1, 2, 0, TestMD.get(0, 9))); @@ -201,14 +201,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getMultiSampleCount(), 8u); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::Texture2DMS); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "DepthBuffer"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "DepthBuffer"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x00000003U); EXPECT_EQ(Props.second, 0x00080109U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ( MD, TestMD.get(0, GV, "DepthBuffer", 0, 0, 1, 3, 8, TestMD.get(0, 9))); @@ -222,14 +222,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getFeedbackType(), SamplerFeedbackType::MinMip); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::FeedbackTexture2D); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "feedbackMinMip"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "feedbackMinMip"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x00001011U); EXPECT_EQ(Props.second, 0U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "feedbackMinMip", 0, 0, 1, 17, false, false, false, TestMD.get(2, 0))); @@ -243,14 +243,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getFeedbackType(), SamplerFeedbackType::MipRegionUsed); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::FeedbackTexture2DArray); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "feedbackMipRegion"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "feedbackMipRegion"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x00001012U); EXPECT_EQ(Props.second, 0x00000001U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "feedbackMipRegion", 0, 0, 1, 18, false, false, false, TestMD.get(2, 1))); @@ -268,14 +268,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getUAV().IsROV, false); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::Texture2D); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/2, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "OutputTexture"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "OutputTexture"); + Props = RI.getAnnotateProps(M, RTI); 
EXPECT_EQ(Props.first, 0x00005002U); EXPECT_EQ(Props.second, 0x00000204U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "OutputTexture", 2, 0, 1, 2, true, false, false, TestMD.get(0, 4))); @@ -292,14 +292,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getTyped().ElementCount, 4u); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::TypedBuffer); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "ROB"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "ROB"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000300aU); EXPECT_EQ(Props.second, 0x00000409U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "ROB", 0, 0, 1, 10, false, false, true, TestMD.get(0, 9))); @@ -319,14 +319,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getStruct(DL).AlignLog2, Log2(Align(4))); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::StructuredBuffer); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/2, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "g_OutputBuffer"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "g_OutputBuffer"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000920cU); EXPECT_EQ(Props.second, 0x00000014U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "g_OutputBuffer", 0, 2, 1, 12, false, true, false, TestMD.get(1, 20))); @@ -346,14 +346,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getMultiSampleCount(), 8u); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::Texture2DMSArray); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "g_rw_t2dmsa"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "g_rw_t2dmsa"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x00001008U); EXPECT_EQ(Props.second, 0x00080105U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "g_rw_t2dmsa", 0, 0, 1, 8, false, false, false, TestMD.get(0, 5))); @@ -366,14 +366,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getCBufferSize(DL), 32u); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::CBuffer); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), ""); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), ""); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000000dU); EXPECT_EQ(Props.second, 0x00000020U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "", 0, 0, 1, 32, nullptr)); // SamplerState ColorMapSampler : register(s0); @@ -384,14 +384,14 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getSamplerType(), dxil::SamplerType::Default); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::Sampler); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, 
/*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "ColorMapSampler"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "ColorMapSampler"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000000eU); EXPECT_EQ(Props.second, 0U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "ColorMapSampler", 0, 0, 1, 0, nullptr)); RTI = ResourceTypeInfo(llvm::TargetExtType::get( @@ -401,13 +401,13 @@ TEST(DXILResource, AnnotationsAndMetadata) { EXPECT_EQ(RTI.getSamplerType(), dxil::SamplerType::Comparison); EXPECT_EQ(RTI.getResourceKind(), ResourceKind::Sampler); - RBI = ResourceBindingInfo( + RI = ResourceInfo( /*RecordID=*/0, /*Space=*/0, /*LowerBound=*/0, /*Size=*/1, RTI.getHandleTy()); - GV = RBI.createSymbol(M, RTI.createElementStruct(), "CmpSampler"); - Props = RBI.getAnnotateProps(M, RTI); + GV = RI.createSymbol(M, RTI.createElementStruct(), "CmpSampler"); + Props = RI.getAnnotateProps(M, RTI); EXPECT_EQ(Props.first, 0x0000800eU); EXPECT_EQ(Props.second, 0U); - MD = RBI.getAsMetadata(M, RTI); + MD = RI.getAsMetadata(M, RTI); EXPECT_MDEQ(MD, TestMD.get(0, GV, "CmpSampler", 0, 0, 1, 1, nullptr)); } diff --git a/llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp b/llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp index f272381c0c250..675a3dc19b912 100644 --- a/llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp +++ b/llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp @@ -34,7 +34,7 @@ class UniqueResourceFromUseTest : public testing::Test { PB = new PassBuilder(); PB->registerModuleAnalyses(*MAM); MAM->registerPass([&] { return DXILResourceTypeAnalysis(); }); - MAM->registerPass([&] { return DXILResourceBindingAnalysis(); }); + MAM->registerPass([&] { return DXILResourceAnalysis(); }); } virtual void TearDown() { @@ -62,7 +62,7 @@ declare void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) auto M = parseAssemblyString(Assembly, Error, Context); ASSERT_TRUE(M) << "Bad assembly?"; - const DXILBindingMap &DBM = MAM->getResult(*M); + const DXILResourceMap &DRM = MAM->getResult(*M); for (const Function &F : M->functions()) { if (F.getName() != "a.func") { continue; @@ -73,7 +73,7 @@ declare void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) for (const User *U : F.users()) { const CallInst *CI = cast(U); const Value *Handle = CI->getArgOperand(0); - const auto Bindings = DBM.findByUse(Handle); + const auto Bindings = DRM.findByUse(Handle); ASSERT_EQ(Bindings.size(), 1u) << "Handle should resolve into one resource"; @@ -112,7 +112,7 @@ declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", flo auto M = parseAssemblyString(Assembly, Error, Context); ASSERT_TRUE(M) << "Bad assembly?"; - const DXILBindingMap &DBM = MAM->getResult(*M); + const DXILResourceMap &DRM = MAM->getResult(*M); for (const Function &F : M->functions()) { if (F.getName() != "a.func") { continue; @@ -123,7 +123,7 @@ declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", flo for (const User *U : F.users()) { const CallInst *CI = cast(U); const Value *Handle = CI->getArgOperand(0); - const auto Bindings = DBM.findByUse(Handle); + const auto Bindings = DRM.findByUse(Handle); ASSERT_EQ(Bindings.size(), 1u) << "Handle should resolve into one resource"; @@ -165,7 +165,7 @@ declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", flo auto M = 
parseAssemblyString(Assembly, Error, Context); ASSERT_TRUE(M) << "Bad assembly?"; - const DXILBindingMap &DBM = MAM->getResult(*M); + const DXILResourceMap &DRM = MAM->getResult(*M); for (const Function &F : M->functions()) { if (F.getName() != "a.func") { continue; @@ -176,7 +176,7 @@ declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", flo for (const User *U : F.users()) { const CallInst *CI = cast(U); const Value *Handle = CI->getArgOperand(0); - const auto Bindings = DBM.findByUse(Handle); + const auto Bindings = DRM.findByUse(Handle); ASSERT_EQ(Bindings.size(), 4u) << "Handle should resolve into four resources"; @@ -245,7 +245,7 @@ declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", flo auto M = parseAssemblyString(Assembly, Error, Context); ASSERT_TRUE(M) << "Bad assembly?"; - const DXILBindingMap &DBM = MAM->getResult(*M); + const DXILResourceMap &DRM = MAM->getResult(*M); for (const Function &F : M->functions()) { if (F.getName() != "a.func") { continue; @@ -256,7 +256,7 @@ declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", flo for (const User *U : F.users()) { const CallInst *CI = cast(U); const Value *Handle = CI->getArgOperand(0); - const auto Bindings = DBM.findByUse(Handle); + const auto Bindings = DRM.findByUse(Handle); ASSERT_EQ(Bindings.size(), 2u) << "Handle should resolve into four resources"; From bdff739c7e79933aa7b3233da1bdadceeb7e03e6 Mon Sep 17 00:00:00 2001 From: Austin Schuh Date: Fri, 4 Apr 2025 15:52:44 -0700 Subject: [PATCH 0709/1029] cuda clang: Clean up test dependency for CUDA surfaces (#134459) https://github.com/llvm/llvm-project/pull/132883 added support for cuda surfaces but reached into clang/test/Headers/ from clang/test/CodeGen/ to grab the minimal cuda.h. Duplicate that file instead based on comments in the review, to fix remote test runs. Signed-off-by: Austin Schuh --- clang/test/CodeGen/include/cuda.h | 194 +++++++++++++++++++++++ clang/test/CodeGen/nvptx-surface.cu | 2 +- clang/test/Headers/Inputs/include/cuda.h | 4 +- 3 files changed, 198 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/include/cuda.h diff --git a/clang/test/CodeGen/include/cuda.h b/clang/test/CodeGen/include/cuda.h new file mode 100644 index 0000000000000..58202442e1f8c --- /dev/null +++ b/clang/test/CodeGen/include/cuda.h @@ -0,0 +1,194 @@ +/* Minimal declarations for CUDA support. Testing purposes only. + * This should stay in sync with clang/test/Headers/Inputs/include/cuda.h + */ +#pragma once + +// Make this file work with nvcc, for testing compatibility. + +#ifndef __NVCC__ +#define __constant__ __attribute__((constant)) +#define __device__ __attribute__((device)) +#define __global__ __attribute__((global)) +#define __host__ __attribute__((host)) +#define __shared__ __attribute__((shared)) +#define __managed__ __attribute__((managed)) +#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__))) + +struct dim3 { + unsigned x, y, z; + __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} +}; + +// Host- and device-side placement new overloads. 
+void *operator new(__SIZE_TYPE__, void *p) { return p; } +void *operator new[](__SIZE_TYPE__, void *p) { return p; } +__device__ void *operator new(__SIZE_TYPE__, void *p) { return p; } +__device__ void *operator new[](__SIZE_TYPE__, void *p) { return p; } + +#define CUDA_VERSION 10100 + +struct char1 { + char x; + __host__ __device__ char1(char x = 0) : x(x) {} +}; +struct char2 { + char x, y; + __host__ __device__ char2(char x = 0, char y = 0) : x(x), y(y) {} +}; +struct char4 { + char x, y, z, w; + __host__ __device__ char4(char x = 0, char y = 0, char z = 0, char w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct uchar1 { + unsigned char x; + __host__ __device__ uchar1(unsigned char x = 0) : x(x) {} +}; +struct uchar2 { + unsigned char x, y; + __host__ __device__ uchar2(unsigned char x = 0, unsigned char y = 0) : x(x), y(y) {} +}; +struct uchar4 { + unsigned char x, y, z, w; + __host__ __device__ uchar4(unsigned char x = 0, unsigned char y = 0, unsigned char z = 0, unsigned char w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct short1 { + short x; + __host__ __device__ short1(short x = 0) : x(x) {} +}; +struct short2 { + short x, y; + __host__ __device__ short2(short x = 0, short y = 0) : x(x), y(y) {} +}; +struct short4 { + short x, y, z, w; + __host__ __device__ short4(short x = 0, short y = 0, short z = 0, short w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct ushort1 { + unsigned short x; + __host__ __device__ ushort1(unsigned short x = 0) : x(x) {} +}; +struct ushort2 { + unsigned short x, y; + __host__ __device__ ushort2(unsigned short x = 0, unsigned short y = 0) : x(x), y(y) {} +}; +struct ushort4 { + unsigned short x, y, z, w; + __host__ __device__ ushort4(unsigned short x = 0, unsigned short y = 0, unsigned short z = 0, unsigned short w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct int1 { + int x; + __host__ __device__ int1(int x = 0) : x(x) {} +}; +struct int2 { + int x, y; + __host__ __device__ int2(int x = 0, int y = 0) : x(x), y(y) {} +}; +struct int4 { + int x, y, z, w; + __host__ __device__ int4(int x = 0, int y = 0, int z = 0, int w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct uint1 { + unsigned x; + __host__ __device__ uint1(unsigned x = 0) : x(x) {} +}; +struct uint2 { + unsigned x, y; + __host__ __device__ uint2(unsigned x = 0, unsigned y = 0) : x(x), y(y) {} +}; +struct uint3 { + unsigned x, y, z; + __host__ __device__ uint3(unsigned x = 0, unsigned y = 0, unsigned z = 0) : x(x), y(y), z(z) {} +}; +struct uint4 { + unsigned x, y, z, w; + __host__ __device__ uint4(unsigned x = 0, unsigned y = 0, unsigned z = 0, unsigned w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct longlong1 { + long long x; + __host__ __device__ longlong1(long long x = 0) : x(x) {} +}; +struct longlong2 { + long long x, y; + __host__ __device__ longlong2(long long x = 0, long long y = 0) : x(x), y(y) {} +}; +struct longlong4 { + long long x, y, z, w; + __host__ __device__ longlong4(long long x = 0, long long y = 0, long long z = 0, long long w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct ulonglong1 { + unsigned long long x; + __host__ __device__ ulonglong1(unsigned long long x = 0) : x(x) {} +}; +struct ulonglong2 { + unsigned long long x, y; + __host__ __device__ ulonglong2(unsigned long long x = 0, unsigned long long y = 0) : x(x), y(y) {} +}; +struct ulonglong4 { + unsigned long long x, y, z, w; + __host__ __device__ ulonglong4(unsigned long long x = 0, unsigned long long y = 0, unsigned long long z = 0, unsigned long long w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct float1 { + float 
x; + __host__ __device__ float1(float x = 0) : x(x) {} +}; +struct float2 { + float x, y; + __host__ __device__ float2(float x = 0, float y = 0) : x(x), y(y) {} +}; +struct float4 { + float x, y, z, w; + __host__ __device__ float4(float x = 0, float y = 0, float z = 0, float w = 0) : x(x), y(y), z(z), w(w) {} +}; + +struct double1 { + double x; + __host__ __device__ double1(double x = 0) : x(x) {} +}; +struct double2 { + double x, y; + __host__ __device__ double2(double x = 0, double y = 0) : x(x), y(y) {} +}; +struct double4 { + double x, y, z, w; + __host__ __device__ double4(double x = 0, double y = 0, double z = 0, double w = 0) : x(x), y(y), z(z), w(w) {} +}; + +typedef unsigned long long cudaTextureObject_t; +typedef unsigned long long cudaSurfaceObject_t; + +enum cudaTextureReadMode { + cudaReadModeNormalizedFloat, + cudaReadModeElementType +}; + +enum cudaSurfaceBoundaryMode { + cudaBoundaryModeZero, + cudaBoundaryModeClamp, + cudaBoundaryModeTrap +}; + +enum { + cudaTextureType1D, + cudaTextureType2D, + cudaTextureType3D, + cudaTextureTypeCubemap, + cudaTextureType1DLayered, + cudaTextureType2DLayered, + cudaTextureTypeCubemapLayered +}; + +struct textureReference {}; +template +struct __attribute__((device_builtin_texture_type)) texture + : public textureReference {}; + +#endif // !__NVCC__ diff --git a/clang/test/CodeGen/nvptx-surface.cu b/clang/test/CodeGen/nvptx-surface.cu index 7c42e5d118153..cf1fe76893a17 100644 --- a/clang/test/CodeGen/nvptx-surface.cu +++ b/clang/test/CodeGen/nvptx-surface.cu @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -O3 -o - %s -emit-llvm | FileCheck %s // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -O3 -o - %s -emit-llvm | FileCheck %s -#include "../Headers/Inputs/include/cuda.h" +#include "include/cuda.h" #include "__clang_cuda_texture_intrinsics.h" diff --git a/clang/test/Headers/Inputs/include/cuda.h b/clang/test/Headers/Inputs/include/cuda.h index 40a00b5af295a..4e8a2f46d27b3 100644 --- a/clang/test/Headers/Inputs/include/cuda.h +++ b/clang/test/Headers/Inputs/include/cuda.h @@ -1,4 +1,6 @@ -/* Minimal declarations for CUDA support. Testing purposes only. */ +/* Minimal declarations for CUDA support. Testing purposes only. + * This should stay in sync with clang/test/CodeGen/include/cuda.h + */ #pragma once // Make this file work with nvcc, for testing compatibility. From b0d0636026cdd2d1088d60c169a7a3a3371c0e66 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Fri, 4 Apr 2025 15:53:11 -0700 Subject: [PATCH 0710/1029] [CIR] Upstream support for break and continue statements (#134181) This adds ClangIR support for break and continue statements in loops. Because only loops are currently implemented upstream in CIR, only breaks in loops are supported here, but this same code will work (with minor changes to the verification and cfg flattening) when switch statements are upstreamed. 
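For illustration only (not part of the change), the intended mapping is roughly the following sketch; the exact CIR and flattened LLVM output are pinned down by the loop.cpp tests below:

```cpp
// Sketch: how the new terminators behave for a C/C++ loop.
void f(int n) {
  for (;;) {
    if (n == 0)
      break;    // emitted as cir.break; FlattenCFG rewrites it into a
                // branch to the loop's exit block
    --n;
    continue;   // emitted as cir.continue; FlattenCFG rewrites it into a
                // branch to the step region (or the condition if no step)
  }
}
```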
--- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 10 ++ clang/include/clang/CIR/Dialect/IR/CIROps.td | 44 ++++++- clang/include/clang/CIR/MissingFeatures.h | 2 - clang/lib/CIR/CodeGen/CIRGenFunction.h | 2 + clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 35 ++++- clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 21 +++ .../lib/CIR/Dialect/Transforms/FlattenCFG.cpp | 21 +-- clang/test/CIR/CodeGen/loop.cpp | 122 ++++++++++++++++++ 8 files changed, 232 insertions(+), 25 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 51939e3af833d..c74150e9127c4 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -146,6 +146,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return create(loc, condBuilder, bodyBuilder, stepBuilder); } + /// Create a break operation. + cir::BreakOp createBreak(mlir::Location loc) { + return create(loc); + } + + /// Create a continue operation. + cir::ContinueOp createContinue(mlir::Location loc) { + return create(loc); + } + mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) { auto valueAttr = mlir::IntegerAttr::get( mlir::IntegerType::get(type.getContext(), 64), value); diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 562493888e10c..609e60ca74b49 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -518,7 +518,7 @@ def ConditionOp : CIR_Op<"condition", [ `cir.bool` operand and, depending on its value, may branch to different regions: - - When in the `cond` region of a `cir.loop`, it continues the loop + - When in the `cond` region of a loop, it continues the loop if true, or exits it if false. - When in the `ready` region of a `cir.await`, it branches to the `resume` region when true, and to the `suspend` region when false. @@ -526,12 +526,12 @@ def ConditionOp : CIR_Op<"condition", [ Example: ```mlir - cir.loop for(cond : { - cir.condition(%arg0) // Branches to `step` region or exits. - }, step : { - [...] - }) { - [...] + cir.for cond { + cir.condition(%val) // Branches to `step` region or exits. + } body { + cir.yield + } step { + cir.yield } cir.await(user, ready : { @@ -610,6 +610,36 @@ def YieldOp : CIR_Op<"yield", [ReturnLike, Terminator, ]; } +//===----------------------------------------------------------------------===// +// BreakOp +//===----------------------------------------------------------------------===// + +def BreakOp : CIR_Op<"break", [Terminator]> { + let summary = "C/C++ `break` statement equivalent"; + let description = [{ + The `cir.break` operation is used to cease the execution of the current loop + or switch operation and transfer control to the parent operation. It is only + allowed within a breakable operations (loops and switches). + }]; + let assemblyFormat = "attr-dict"; + let hasVerifier = 1; +} + +//===----------------------------------------------------------------------===// +// ContinueOp +//===----------------------------------------------------------------------===// + +def ContinueOp : CIR_Op<"continue", [Terminator]> { + let summary = "C/C++ `continue` statement equivalent"; + let description = [{ + The `cir.continue` operation is used to end execution of the current + iteration of a loop and resume execution beginning at the next iteration. + It is only allowed within loop regions. 
+ }]; + let assemblyFormat = "attr-dict"; + let hasVerifier = 1; +} + //===----------------------------------------------------------------------===// // ScopeOp //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 21a1d99c7c218..86fdaf1ddaf51 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -122,12 +122,10 @@ struct MissingFeatures { // Future CIR operations static bool awaitOp() { return false; } - static bool breakOp() { return false; } static bool callOp() { return false; } static bool complexCreateOp() { return false; } static bool complexImagOp() { return false; } static bool complexRealOp() { return false; } - static bool continueOp() { return false; } static bool ifOp() { return false; } static bool labelOp() { return false; } static bool ptrDiffOp() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 4889c3ce4ca9c..1bedbe28ae625 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -399,6 +399,8 @@ class CIRGenFunction : public CIRGenTypeCache { LValue emitBinaryOperatorLValue(const BinaryOperator *e); + mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s); + mlir::LogicalResult emitContinueStmt(const clang::ContinueStmt &s); mlir::LogicalResult emitDoStmt(const clang::DoStmt &s); /// Emit an expression as an initializer for an object (variable, field, etc.) diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index b5c1f0ae2a7ef..00d33e7feddff 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -56,6 +56,12 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, return mlir::success(); switch (s->getStmtClass()) { + case Stmt::BreakStmtClass: + case Stmt::CompoundStmtClass: + case Stmt::ContinueStmtClass: + case Stmt::DeclStmtClass: + case Stmt::ReturnStmtClass: + llvm_unreachable("should have emitted these statements as simple"); #define STMT(Type, Base) #define ABSTRACT_STMT(Op) @@ -88,13 +94,9 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, case Stmt::SEHFinallyStmtClass: case Stmt::MSDependentExistsStmtClass: case Stmt::NullStmtClass: - case Stmt::CompoundStmtClass: - case Stmt::DeclStmtClass: case Stmt::LabelStmtClass: case Stmt::AttributedStmtClass: case Stmt::GotoStmtClass: - case Stmt::BreakStmtClass: - case Stmt::ContinueStmtClass: case Stmt::DefaultStmtClass: case Stmt::CaseStmtClass: case Stmt::SEHLeaveStmtClass: @@ -106,7 +108,6 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, case Stmt::CXXTryStmtClass: case Stmt::CXXForRangeStmtClass: case Stmt::IndirectGotoStmtClass: - case Stmt::ReturnStmtClass: case Stmt::GCCAsmStmtClass: case Stmt::MSAsmStmtClass: case Stmt::OMPParallelDirectiveClass: @@ -219,7 +220,6 @@ mlir::LogicalResult CIRGenFunction::emitSimpleStmt(const Stmt *s, bool useCurrentScope) { switch (s->getStmtClass()) { default: - // Only compound and return statements are supported right now. 
return mlir::failure(); case Stmt::DeclStmtClass: return emitDeclStmt(cast(*s)); @@ -229,6 +229,10 @@ mlir::LogicalResult CIRGenFunction::emitSimpleStmt(const Stmt *s, else emitCompoundStmt(cast(*s)); break; + case Stmt::ContinueStmtClass: + return emitContinueStmt(cast(*s)); + case Stmt::BreakStmtClass: + return emitBreakStmt(cast(*s)); case Stmt::ReturnStmtClass: return emitReturnStmt(cast(*s)); } @@ -316,6 +320,25 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { return mlir::success(); } +mlir::LogicalResult +CIRGenFunction::emitContinueStmt(const clang::ContinueStmt &s) { + builder.createContinue(getLoc(s.getContinueLoc())); + + // Insert the new block to continue codegen after the continue statement. + builder.createBlock(builder.getBlock()->getParent()); + + return mlir::success(); +} + +mlir::LogicalResult CIRGenFunction::emitBreakStmt(const clang::BreakStmt &s) { + builder.createBreak(getLoc(s.getBreakLoc())); + + // Insert the new block to continue codegen after the break statement. + builder.createBlock(builder.getBlock()->getParent()); + + return mlir::success(); +} + mlir::LogicalResult CIRGenFunction::emitForStmt(const ForStmt &s) { cir::ForOp forOp; diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 143ed5544375f..798aca602c4e3 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -151,6 +151,17 @@ void cir::AllocaOp::build(mlir::OpBuilder &odsBuilder, odsState.addTypes(addr); } +//===----------------------------------------------------------------------===// +// BreakOp +//===----------------------------------------------------------------------===// + +LogicalResult cir::BreakOp::verify() { + assert(!cir::MissingFeatures::switchOp()); + if (!getOperation()->getParentOfType()) + return emitOpError("must be within a loop"); + return success(); +} + //===----------------------------------------------------------------------===// // ConditionOp //===----------------------------------------------------------------------===// @@ -241,6 +252,16 @@ OpFoldResult cir::ConstantOp::fold(FoldAdaptor /*adaptor*/) { return getValue(); } +//===----------------------------------------------------------------------===// +// ContinueOp +//===----------------------------------------------------------------------===// + +LogicalResult cir::ContinueOp::verify() { + if (!getOperation()->getParentOfType()) + return emitOpError("must be within a loop"); + return success(); +} + //===----------------------------------------------------------------------===// // CastOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp index 52f4b2241505d..b49c5ffd35c00 100644 --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp @@ -148,23 +148,24 @@ class CIRLoopOpInterfaceFlattening // driver to customize the order that operations are visited. // Lower continue statements. + mlir::Block *dest = (step ? step : cond); op.walkBodySkippingNestedLoops([&](mlir::Operation *op) { - // When continue ops are supported, there will be a check for them here - // and a call to lowerTerminator(). The call to `advance()` handles the - // case where this is not a continue op. 
- assert(!cir::MissingFeatures::continueOp()); - return mlir::WalkResult::advance(); + if (!isa(op)) + return mlir::WalkResult::advance(); + + lowerTerminator(op, dest, rewriter); + return mlir::WalkResult::skip(); }); // Lower break statements. assert(!cir::MissingFeatures::switchOp()); walkRegionSkipping( op.getBody(), [&](mlir::Operation *op) { - // When break ops are supported, there will be a check for them here - // and a call to lowerTerminator(). The call to `advance()` handles - // the case where this is not a break op. - assert(!cir::MissingFeatures::breakOp()); - return mlir::WalkResult::advance(); + if (!isa(op)) + return mlir::WalkResult::advance(); + + lowerTerminator(op, exit, rewriter); + return mlir::WalkResult::skip(); }); // Lower optional body region yield. diff --git a/clang/test/CIR/CodeGen/loop.cpp b/clang/test/CIR/CodeGen/loop.cpp index a950460e8838d..46fa66e2fc7aa 100644 --- a/clang/test/CIR/CodeGen/loop.cpp +++ b/clang/test/CIR/CodeGen/loop.cpp @@ -265,3 +265,125 @@ void test_empty_while_true() { // OGCG: br label %[[WHILE_BODY:.*]] // OGCG: [[WHILE_BODY]]: // OGCG: ret void + +void unreachable_after_continue() { + for (;;) { + continue; + int x = 1; + } +} + +// CIR: cir.func @unreachable_after_continue +// CIR: cir.scope { +// CIR: cir.for : cond { +// CIR: %[[TRUE:.*]] = cir.const #true +// CIR: cir.condition(%[[TRUE]]) +// CIR: } body { +// CIR: cir.scope { +// CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] {alignment = 4 : i64} +// CIR: cir.continue +// CIR: ^bb1: // no predecessors +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store %[[ONE]], %[[X]] : !s32i, !cir.ptr +// CIR: cir.yield +// CIR: } +// CIR: cir.yield +// CIR: } step { +// CIR: cir.yield +// CIR: } +// CIR: } +// CIR: cir.return +// CIR: } + +// LLVM: define void @unreachable_after_continue() +// LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 +// LLVM: br label %[[LABEL1:.*]] +// LLVM: [[LABEL1]]: +// LLVM: br label %[[LABEL2:.*]] +// LLVM: [[LABEL2]]: +// LLVM: br i1 true, label %[[LABEL3:.*]], label %[[LABEL8:.*]] +// LLVM: [[LABEL3]]: +// LLVM: br label %[[LABEL4:.*]] +// LLVM: [[LABEL4]]: +// LLVM: br label %[[LABEL7:.*]] +// LLVM: [[LABEL5:.*]]: +// LLVM-SAME: ; No predecessors! 
+// LLVM: store i32 1, ptr %[[X]], align 4 +// LLVM: br label %[[LABEL6:.*]] +// LLVM: [[LABEL6]]: +// LLVM: br label %[[LABEL7:.*]] +// LLVM: [[LABEL7]]: +// LLVM: br label %[[LABEL2]] +// LLVM: [[LABEL8]]: +// LLVM: br label %[[LABEL9:]] +// LLVM: [[LABEL9]]: +// LLVM: ret void + +// OGCG: define{{.*}} void @_Z26unreachable_after_continuev() +// OGCG: entry: +// OGCG: %[[X:.*]] = alloca i32, align 4 +// OGCG: br label %[[FOR_COND:.*]] +// OGCG: [[FOR_COND]]: +// OGCG: br label %[[FOR_COND]] + +void unreachable_after_break() { + for (;;) { + break; + int x = 1; + } +} + +// CIR: cir.func @unreachable_after_break +// CIR: cir.scope { +// CIR: cir.for : cond { +// CIR: %[[TRUE:.*]] = cir.const #true +// CIR: cir.condition(%[[TRUE]]) +// CIR: } body { +// CIR: cir.scope { +// CIR: %[[X:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] {alignment = 4 : i64} +// CIR: cir.break +// CIR: ^bb1: // no predecessors +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store %[[ONE]], %[[X]] : !s32i, !cir.ptr +// CIR: cir.yield +// CIR: } +// CIR: cir.yield +// CIR: } step { +// CIR: cir.yield +// CIR: } +// CIR: } +// CIR: cir.return +// CIR: } + +// LLVM: define void @unreachable_after_break() +// LLVM: %[[X:.*]] = alloca i32, i64 1, align 4 +// LLVM: br label %[[LABEL1:.*]] +// LLVM: [[LABEL1]]: +// LLVM: br label %[[LABEL2:.*]] +// LLVM: [[LABEL2]]: +// LLVM: br i1 true, label %[[LABEL3:.*]], label %[[LABEL8:.*]] +// LLVM: [[LABEL3]]: +// LLVM: br label %[[LABEL4:.*]] +// LLVM: [[LABEL4]]: +// LLVM: br label %[[LABEL8]] +// LLVM: [[LABEL5:.*]]: +// LLVM-SAME: ; No predecessors! +// LLVM: store i32 1, ptr %[[X]], align 4 +// LLVM: br label %[[LABEL6:.*]] +// LLVM: [[LABEL6]]: +// LLVM: br label %[[LABEL7:.*]] +// LLVM: [[LABEL7]]: +// LLVM: br label %[[LABEL2]] +// LLVM: [[LABEL8]]: +// LLVM: br label %[[LABEL9:]] +// LLVM: [[LABEL9]]: +// LLVM: ret void + +// OGCG: define{{.*}} void @_Z23unreachable_after_breakv() +// OGCG: entry: +// OGCG: %[[X:.*]] = alloca i32, align 4 +// OGCG: br label %[[FOR_COND:.*]] +// OGCG: [[FOR_COND]]: +// OGCG: br label %[[FOR_END:.*]] +// OGCG: [[FOR_END]]: +// OGCG: ret void From 6f34d03b3132a8286630f8496aa7dce9605e677b Mon Sep 17 00:00:00 2001 From: Un1q32 Date: Fri, 4 Apr 2025 19:02:39 -0400 Subject: [PATCH 0711/1029] Remove iOS 5 check for tailcalls on ARM (#133354) Fixes #102053 The check was added in 8decdc472f308b13d7fb7fd50c3919db086c0417, and at the time iOS 5 was the latest iOS version, before that commit tail calls were disabled for all ARMv7 targets. Testing a build of wasm3 with the patch on a device running iOS 3.0 shows a noticeable performance improvement and no issues. 
--- llvm/lib/Target/ARM/ARMSubtarget.cpp | 3 --- llvm/test/CodeGen/ARM/2010-11-29-PrologueBug.ll | 4 ++-- llvm/test/CodeGen/ARM/ldm.ll | 8 ++++---- llvm/test/CodeGen/ARM/zextload_demandedbits.ll | 3 +-- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 893084785e6f0..759070c6f08da 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -226,9 +226,6 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { SupportsTailCall = !isThumb1Only() || hasV8MBaselineOps(); - if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0)) - SupportsTailCall = false; - switch (IT) { case DefaultIT: RestrictIT = false; diff --git a/llvm/test/CodeGen/ARM/2010-11-29-PrologueBug.ll b/llvm/test/CodeGen/ARM/2010-11-29-PrologueBug.ll index ff1e769600d38..b3b6c77f9bb8f 100644 --- a/llvm/test/CodeGen/ARM/2010-11-29-PrologueBug.ll +++ b/llvm/test/CodeGen/ARM/2010-11-29-PrologueBug.ll @@ -9,8 +9,8 @@ entry: ; CHECK: mov r7, sp ; CHECK: bl _foo ; CHECK: bl _foo -; CHECK: bl _foo -; CHECK: pop {r7, pc} +; CHECK: pop +; CHECK: b %0 = tail call ptr @foo(ptr %x) nounwind %1 = tail call ptr @foo(ptr %0) nounwind diff --git a/llvm/test/CodeGen/ARM/ldm.ll b/llvm/test/CodeGen/ARM/ldm.ll index 2f7486020890d..2d2fc578cf106 100644 --- a/llvm/test/CodeGen/ARM/ldm.ll +++ b/llvm/test/CodeGen/ARM/ldm.ll @@ -5,9 +5,9 @@ define i32 @t1() { ; CHECK-LABEL: t1: -; CHECK: pop +; CHECK: ldrd ; V4T-LABEL: t1: -; V4T: pop +; V4T: ldm %tmp = load i32, ptr @X ; [#uses=1] %tmp3 = load i32, ptr getelementptr ([0 x i32], ptr @X, i32 0, i32 1) ; [#uses=1] %tmp4 = tail call i32 @f1( i32 %tmp, i32 %tmp3 ) ; [#uses=1] @@ -16,9 +16,9 @@ define i32 @t1() { define i32 @t2() { ; CHECK-LABEL: t2: -; CHECK: pop +; CHECK: ldm ; V4T-LABEL: t2: -; V4T: pop +; V4T: ldm %tmp = load i32, ptr getelementptr ([0 x i32], ptr @X, i32 0, i32 2) ; [#uses=1] %tmp3 = load i32, ptr getelementptr ([0 x i32], ptr @X, i32 0, i32 3) ; [#uses=1] %tmp5 = load i32, ptr getelementptr ([0 x i32], ptr @X, i32 0, i32 4) ; [#uses=1] diff --git a/llvm/test/CodeGen/ARM/zextload_demandedbits.ll b/llvm/test/CodeGen/ARM/zextload_demandedbits.ll index 8519d30b7dabf..fe6febdf6e90a 100644 --- a/llvm/test/CodeGen/ARM/zextload_demandedbits.ll +++ b/llvm/test/CodeGen/ARM/zextload_demandedbits.ll @@ -10,8 +10,7 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64- ; CHECK: quux ; CHECK: lsl ; CHECK: asr -; CHECK: bl -; CHECK: pop +; CHECK: b define void @quux(ptr %arg) { bb: %tmp1 = getelementptr inbounds %struct.eggs, ptr %arg, i32 0, i32 1 From 5271dead61dca30f4a6db0f0df8da00f8987449e Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 4 Apr 2025 16:33:40 -0700 Subject: [PATCH 0712/1029] [lldb] Add a {ObjectFile,SymbolFile}::GetObjectName method (#133370) Add ObjectFile::GetObjectName and SymbolFile::GetObjectName to retrieve the name of the object file, including the `.a` for static libraries. We currently do something similar in CommandObjectTarget, but the code for dumping this is a lot more involved than what's being offered by the new method. We have options to print he full path, the base name, and the directoy of the path and trim it to a specific width. This is motivated by #133211, where Greg pointed out that the old code would print the static archive (the .a file) rather than the actual object file inside of it. 
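As a sketch of the intended behavior (the file and member names below are made up for illustration):

```cpp
// The new method combines the file name with the archive member name:
//   file "libfoo.a" with object name "bar.o"  ->  "libfoo.a(bar.o)"
//   plain object file "bar.o" (no member)     ->  "bar.o"
std::string Describe(lldb_private::SymbolFile &sym_file) {
  return sym_file.GetObjectName(); // forwards to the owning ObjectFile
}
```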
--- lldb/include/lldb/Symbol/ObjectFile.h | 1 + lldb/include/lldb/Symbol/SymbolFile.h | 2 ++ .../Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp | 7 +------ lldb/source/Symbol/ObjectFile.cpp | 9 +++++++++ lldb/source/Symbol/SymbolFile.cpp | 6 ++++++ 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h index 874926da2ceb7..cfcca04a76de8 100644 --- a/lldb/include/lldb/Symbol/ObjectFile.h +++ b/lldb/include/lldb/Symbol/ObjectFile.h @@ -748,6 +748,7 @@ class ObjectFile : public std::enable_shared_from_this, static lldb::DataBufferSP MapFileData(const FileSpec &file, uint64_t Size, uint64_t Offset); + std::string GetObjectName() const; protected: // Member variables. diff --git a/lldb/include/lldb/Symbol/SymbolFile.h b/lldb/include/lldb/Symbol/SymbolFile.h index dd056035d546e..f35d3ee9f22ae 100644 --- a/lldb/include/lldb/Symbol/SymbolFile.h +++ b/lldb/include/lldb/Symbol/SymbolFile.h @@ -491,6 +491,8 @@ class SymbolFile : public PluginInterface { return args; } + std::string GetObjectName() const; + protected: void AssertModuleLock(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index ce351274b4576..961c212e2e6dc 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -727,12 +727,7 @@ void SymbolFileDWARFDebugMap::ForEachSymbolFile( Progress::kDefaultHighFrequencyReportTime); for (uint32_t oso_idx = 0; oso_idx < num_oso_idxs; ++oso_idx) { if (SymbolFileDWARF *oso_dwarf = GetSymbolFileByOSOIndex(oso_idx)) { - progress.Increment(oso_idx, oso_dwarf->GetObjectFile() - ? oso_dwarf->GetObjectFile() - ->GetFileSpec() - .GetFilename() - .GetString() - : ""); + progress.Increment(oso_idx, oso_dwarf->GetObjectName()); if (closure(*oso_dwarf) == IterationAction::Stop) return; } diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index 264acad050e35..2f2c59d6af620 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -775,6 +775,15 @@ uint32_t ObjectFile::GetCacheHash() { return *m_cache_hash; } +std::string ObjectFile::GetObjectName() const { + if (ModuleSP module_sp = GetModule()) + if (ConstString object_name = module_sp->GetObjectName()) + return llvm::formatv("{0}({1})", GetFileSpec().GetFilename().GetString(), + object_name.GetString()) + .str(); + return GetFileSpec().GetFilename().GetString(); +} + namespace llvm { namespace json { diff --git a/lldb/source/Symbol/SymbolFile.cpp b/lldb/source/Symbol/SymbolFile.cpp index 16ed98d7840f7..94e32b55572dd 100644 --- a/lldb/source/Symbol/SymbolFile.cpp +++ b/lldb/source/Symbol/SymbolFile.cpp @@ -259,3 +259,9 @@ void SymbolFileCommon::Dump(Stream &s) { if (Symtab *symtab = GetSymtab()) symtab->Dump(&s, nullptr, eSortOrderNone); } + +std::string SymbolFile::GetObjectName() const { + if (const ObjectFile *object_file = GetObjectFile()) + return object_file->GetObjectName(); + return ""; +} From 74a78028932f5e26c93dc6fd7efab65b97e6824c Mon Sep 17 00:00:00 2001 From: alx32 <103613512+alx32@users.noreply.github.com> Date: Fri, 4 Apr 2025 16:37:35 -0700 Subject: [PATCH 0713/1029] [lld-macho] Fix code section ordering in output binary (#134010) In `OutputSegment.cpp`, we need to ensure a specific order for certain sections. 
The current sorting logic incorrectly prioritizes code sections over explicitly defined section orders. This is problematic because the `__objc_stubs` section is both a code section *and* has a specific ordering requirement. The current logic would incorrectly prioritize its code section status, causing it to be sorted *before* the `__stubs` section. This incorrect ordering breaks the branch extension algorithm, ultimately leading to linker failures due to relocation errors. We also modify the `lld/test/MachO/arm64-objc-stubs.s` test to ensure correct section ordering. --- lld/MachO/OutputSegment.cpp | 32 +++++++++++++++++++++---------- lld/test/MachO/arm64-objc-stubs.s | 28 +++++++++++++++++---------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/lld/MachO/OutputSegment.cpp b/lld/MachO/OutputSegment.cpp index c320af3fb3177..185444673ae47 100644 --- a/lld/MachO/OutputSegment.cpp +++ b/lld/MachO/OutputSegment.cpp @@ -103,20 +103,32 @@ static int sectionOrder(OutputSection *osec) { // be the section that determines whether we need thunks or not. if (osec->name == section_names::text) return -6; + + // Prioritize specific section ordering based on our knowledge. This ensures + // that certain sections are placed in a particular order, even if they + // are also categorized as code sections. This explicit ordering takes + // precedence over the general code section ordering. + int knownPriority = + StringSwitch(osec->name) + .Case(section_names::stubs, -4) + .Case(section_names::stubHelper, -3) + .Case(section_names::objcStubs, -2) + .Case(section_names::initOffsets, -1) + .Case(section_names::unwindInfo, + std::numeric_limits::max() - 1) + .Case(section_names::ehFrame, std::numeric_limits::max()) + .Default(0); + + if (knownPriority != 0) + return knownPriority; + // Ensure all code sections are contiguous with `__text` for thunk // calculations. 
- if (sections::isCodeSection(osec->name, segment_names::text, osec->flags) && - osec->name != section_names::stubHelper) { + if (sections::isCodeSection(osec->name, segment_names::text, osec->flags)) { return -5; } - return StringSwitch(osec->name) - .Case(section_names::stubs, -4) - .Case(section_names::stubHelper, -3) - .Case(section_names::objcStubs, -2) - .Case(section_names::initOffsets, -1) - .Case(section_names::unwindInfo, std::numeric_limits::max() - 1) - .Case(section_names::ehFrame, std::numeric_limits::max()) - .Default(osec->inputOrder); + + return osec->inputOrder; } else if (segname == segment_names::data || segname == segment_names::dataConst) { // For each thread spawned, dyld will initialize its TLVs by copying the diff --git a/lld/test/MachO/arm64-objc-stubs.s b/lld/test/MachO/arm64-objc-stubs.s index 1b8ebff924300..da25b1292faa6 100644 --- a/lld/test/MachO/arm64-objc-stubs.s +++ b/lld/test/MachO/arm64-objc-stubs.s @@ -1,22 +1,23 @@ # REQUIRES: aarch64 # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t.o -# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o +# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -U _external_func # RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s -# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -dead_strip +# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -dead_strip -U _external_func # RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s -# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -objc_stubs_fast +# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -objc_stubs_fast -U _external_func # RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s # RUN: llvm-otool -l %t.out | FileCheck %s --check-prefix=FASTALIGN -# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -objc_stubs_small +# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -objc_stubs_small -U _external_func # RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s --check-prefix=SMALL # RUN: llvm-otool -l %t.out | FileCheck %s --check-prefix=SMALLALIGN +# RUN: llvm-objdump --section-headers %t.out | FileCheck %s --check-prefix=SECTIONS # CHECK: Contents of (__TEXT,__objc_stubs) section # CHECK-NEXT: _objc_msgSend$foo: # CHECK-NEXT: adrp x1, 8 ; 0x100008000 -# CHECK-NEXT: ldr x1, [x1, #0x10] +# CHECK-NEXT: ldr x1, [x1, #0x18] # CHECK-NEXT: adrp x16, 4 ; 0x100004000 # CHECK-NEXT: ldr x16, [x16] # CHECK-NEXT: br x16 @@ -26,7 +27,7 @@ # CHECK-NEXT: _objc_msgSend$length: # CHECK-NEXT: adrp x1, 8 ; 0x100008000 -# CHECK-NEXT: ldr x1, [x1, #0x18] +# CHECK-NEXT: ldr x1, [x1, #0x20] # CHECK-NEXT: adrp x16, 4 ; 0x100004000 # CHECK-NEXT: ldr x16, [x16] # CHECK-NEXT: br x16 @@ -44,13 +45,13 @@ # FASTALIGN-NEXT: align 2^5 (32) # SMALL: _objc_msgSend$foo: -# SMALL-NEXT: adrp x1, 4 ; 0x100004000 -# SMALL-NEXT: ldr x1, [x1, #0x10] +# SMALL-NEXT: adrp x1, 8 ; 0x100008000 +# SMALL-NEXT: ldr x1, [x1, #0x18] # SMALL-NEXT: b # SMALL-NEXT: _objc_msgSend$length: -# SMALL-NEXT: adrp x1, 4 ; 0x100004000 -# SMALL-NEXT: ldr x1, [x1, #0x18] +# SMALL-NEXT: adrp x1, 8 ; 0x100008000 +# SMALL-NEXT: ldr x1, [x1, #0x20] # SMALL-NEXT: b # SMALLALIGN: sectname __objc_stubs @@ -60,6 +61,12 @@ # SMALLALIGN-NEXT: offset # SMALLALIGN-NEXT: align 2^2 (4) +## Check correct section ordering +# SECTIONS: Sections: +# SECTIONS: __text +# SECTIONS: __stubs +# SECTIONS: __objc_stubs + .section __TEXT,__objc_methname,cstring_literals lselref1: .asciz "foo" @@ -81,4 +88,5 @@ _main: bl _objc_msgSend$length bl _objc_msgSend$foo bl _objc_msgSend$foo + bl _external_func ret From 
7001993880066a40783b960aa3f236a57d09e061 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 4 Apr 2025 16:39:49 -0700 Subject: [PATCH 0714/1029] [flang] Temporary include variant.h in enum-class.h. (#134460) I am having problems building Fortran runtime for CUDA after #134164. I need more time to investigate it, but in the meantime including variant.h (or any header that eventually includes a libcudacxx header) resolves the issue. --- flang/include/flang/Common/enum-class.h | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/include/flang/Common/enum-class.h b/flang/include/flang/Common/enum-class.h index 41575d45091a8..8c254c8bc6a70 100644 --- a/flang/include/flang/Common/enum-class.h +++ b/flang/include/flang/Common/enum-class.h @@ -17,6 +17,7 @@ #ifndef FORTRAN_COMMON_ENUM_CLASS_H_ #define FORTRAN_COMMON_ENUM_CLASS_H_ +#include "flang/Common/variant.h" #include #include From e8b52acca2376aac90ba8e2927e52ddd5253bcbb Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Fri, 4 Apr 2025 16:55:43 -0700 Subject: [PATCH 0715/1029] [libc][NFC] replace NULL with nullptr (#134464) Simple cleanup --- libc/src/sys/time/linux/utimes.cpp | 6 +++--- libc/test/src/time/ctime_test.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libc/src/sys/time/linux/utimes.cpp b/libc/src/sys/time/linux/utimes.cpp index 1cc5a8344e91a..e6e3d073a81a4 100644 --- a/libc/src/sys/time/linux/utimes.cpp +++ b/libc/src/sys/time/linux/utimes.cpp @@ -30,11 +30,11 @@ LLVM_LIBC_FUNCTION(int, utimes, #elif defined(SYS_utimensat) // the utimensat syscall requires a timespec struct, not timeval. struct timespec ts[2]; - struct timespec *ts_ptr = nullptr; // default value if times is NULL + struct timespec *ts_ptr = nullptr; // default value if times is nullptr // convert the microsec values in timeval struct times // to nanosecond values in timespec struct ts - if (times != NULL) { + if (times != nullptr) { // ensure consistent values if ((times[0].tv_usec < 0 || times[1].tv_usec < 0) || @@ -54,7 +54,7 @@ LLVM_LIBC_FUNCTION(int, utimes, ts_ptr = ts; } - // If times was NULL, ts_ptr remains NULL, which utimensat interprets + // If times was nullptr, ts_ptr remains nullptr, which utimensat interprets // as setting times to the current time. // utimensat syscall. diff --git a/libc/test/src/time/ctime_test.cpp b/libc/test/src/time/ctime_test.cpp index 7ec71bb1e4ed1..6f1168f0b6685 100644 --- a/libc/test/src/time/ctime_test.cpp +++ b/libc/test/src/time/ctime_test.cpp @@ -11,10 +11,10 @@ #include "test/UnitTest/Test.h" #include "test/src/time/TmHelper.h" -TEST(LlvmLibcCtime, NULL) { +TEST(LlvmLibcCtime, nullptr) { char *result; - result = LIBC_NAMESPACE::ctime(NULL); - ASSERT_STREQ(NULL, result); + result = LIBC_NAMESPACE::ctime(nullptr); + ASSERT_STREQ(nullptr, result); } TEST(LlvmLibcCtime, ValidUnixTimestamp0) { @@ -38,5 +38,5 @@ TEST(LlvmLibcCtime, InvalidArgument) { char *result; t = 2147483648; result = LIBC_NAMESPACE::ctime(&t); - ASSERT_STREQ(NULL, result); + ASSERT_STREQ(nullptr, result); } From 78905ce6cbd3fa8f8b467e7cad0e9a093c1b1c44 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Fri, 4 Apr 2025 17:36:14 -0700 Subject: [PATCH 0716/1029] [CIR] Upstream support for logical not operations (#133966) When unary operation support was initially upstreamed, the cir.cast operation hadn't been upstreamed yet, so logical not wasn't included. Since casts have now been added, this change adds support for logical not. 
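For illustration, a sketch of the lowering pattern (the details are in VisitUnaryLNot and the unary.cpp tests below):

```cpp
// `!x` is emitted as: cast the operand to cir.bool (int_to_bool,
// float_to_bool, or ptr_to_bool as appropriate), apply
// cir.unary(not, ...), then cast the result back to the destination
// type (e.g. bool_to_int when the result is used as an int).
int logical_not(int a, float f, int *p) {
  return !a + !f + !p; // each operand goes through its own to-bool cast
}
```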
--- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 5 ++ clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 31 ++++++- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 9 +- clang/test/CIR/CodeGen/unary.cpp | 90 +++++++++++++++++++ 4 files changed, 132 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index c74150e9127c4..c1e93fe790c08 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -121,6 +121,11 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return cir::BoolAttr::get(getContext(), getBoolTy(), state); } + mlir::Value createNot(mlir::Value value) { + return create(value.getLoc(), value.getType(), + cir::UnaryOpKind::Not, value); + } + /// Create a do-while operation. cir::DoWhileOp createDoWhile( mlir::Location loc, diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 6289a8f1d2ed7..3863d21487531 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -488,6 +488,8 @@ class ScalarExprEmitter : public StmtVisitor { return emitUnaryOp(e, cir::UnaryOpKind::Not, op); } + mlir::Value VisitUnaryLNot(const UnaryOperator *e); + /// Emit a conversion from the specified type to the specified destination /// type, both of which are CIR scalar types. /// TODO: do we need ScalarConversionOpts here? Should be done in another @@ -1315,7 +1317,7 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { "fixed point casts"); return {}; } - cgf.getCIRGenModule().errorNYI(subExpr->getSourceRange(), "fp options"); + assert(!cir::MissingFeatures::cgFPOptionsRAII()); return emitScalarConversion(Visit(subExpr), subExpr->getType(), destTy, ce->getExprLoc()); } @@ -1353,6 +1355,33 @@ mlir::Value CIRGenFunction::emitScalarConversion(mlir::Value src, .emitScalarConversion(src, srcTy, dstTy, loc); } +mlir::Value ScalarExprEmitter::VisitUnaryLNot(const UnaryOperator *e) { + // Perform vector logical not on comparison with zero vector. + if (e->getType()->isVectorType() && + e->getType()->castAs()->getVectorKind() == + VectorKind::Generic) { + assert(!cir::MissingFeatures::vectorType()); + cgf.cgm.errorNYI(e->getSourceRange(), "vector logical not"); + return {}; + } + + // Compare operand to zero. + mlir::Value boolVal = cgf.evaluateExprAsBool(e->getSubExpr()); + + // Invert value. + boolVal = builder.createNot(boolVal); + + // ZExt result to the expr type. + mlir::Type dstTy = cgf.convertType(e->getType()); + if (mlir::isa(dstTy)) + return builder.createBoolToInt(boolVal, dstTy); + if (mlir::isa(dstTy)) + return boolVal; + + cgf.cgm.errorNYI("destination type for logical-not unary operator is NYI"); + return {}; +} + /// Return the size or alignment of the type of argument of the sizeof /// expression as an integer. 
mlir::Value ScalarExprEmitter::VisitUnaryExprOrTypeTraitExpr( diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 1c455039269b9..81b80e2e4eafb 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -501,9 +501,14 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite( assert(!MissingFeatures::cxxABI()); assert(!MissingFeatures::dataMemberType()); break; - case cir::CastKind::ptr_to_bool: - assert(!cir::MissingFeatures::opCmp()); + case cir::CastKind::ptr_to_bool: { + mlir::Value llvmSrcVal = adaptor.getOperands().front(); + mlir::Value zeroPtr = rewriter.create( + castOp.getLoc(), llvmSrcVal.getType()); + rewriter.replaceOpWithNewOp( + castOp, mlir::LLVM::ICmpPredicate::ne, llvmSrcVal, zeroPtr); break; + } case cir::CastKind::address_space: { mlir::Type dstTy = castOp.getType(); mlir::Value llvmSrcVal = adaptor.getOperands().front(); diff --git a/clang/test/CIR/CodeGen/unary.cpp b/clang/test/CIR/CodeGen/unary.cpp index a6405e653a07c..b0a315a18f8fb 100644 --- a/clang/test/CIR/CodeGen/unary.cpp +++ b/clang/test/CIR/CodeGen/unary.cpp @@ -466,3 +466,93 @@ _Float16 fp16UMinus(_Float16 f) { // OGCG: %[[PROMOTED:.*]] = fpext half %[[F_LOAD]] to float // OGCG: %[[RESULT:.*]] = fneg float %[[PROMOTED]] // OGCG: %[[UNPROMOTED:.*]] = fptrunc float %[[RESULT]] to half + +void test_logical_not() { + int a = 5; + a = !a; + bool b = false; + b = !b; + float c = 2.0f; + c = !c; + int *p = 0; + b = !p; + double d = 3.0; + b = !d; +} + +// CHECK: cir.func @test_logical_not() +// CHECK: %[[A:.*]] = cir.load %[[A_ADDR:.*]] : !cir.ptr, !s32i +// CHECK: %[[A_BOOL:.*]] = cir.cast(int_to_bool, %[[A]] : !s32i), !cir.bool +// CHECK: %[[A_NOT:.*]] = cir.unary(not, %[[A_BOOL]]) : !cir.bool, !cir.bool +// CHECK: %[[A_CAST:.*]] = cir.cast(bool_to_int, %[[A_NOT]] : !cir.bool), !s32i +// CHECK: cir.store %[[A_CAST]], %[[A_ADDR]] : !s32i, !cir.ptr +// CHECK: %[[B:.*]] = cir.load %[[B_ADDR:.*]] : !cir.ptr, !cir.bool +// CHECK: %[[B_NOT:.*]] = cir.unary(not, %[[B]]) : !cir.bool, !cir.bool +// CHECK: cir.store %[[B_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr +// CHECK: %[[C:.*]] = cir.load %[[C_ADDR:.*]] : !cir.ptr, !cir.float +// CHECK: %[[C_BOOL:.*]] = cir.cast(float_to_bool, %[[C]] : !cir.float), !cir.bool +// CHECK: %[[C_NOT:.*]] = cir.unary(not, %[[C_BOOL]]) : !cir.bool, !cir.bool +// CHECK: %[[C_CAST:.*]] = cir.cast(bool_to_float, %[[C_NOT]] : !cir.bool), !cir.float +// CHECK: cir.store %[[C_CAST]], %[[C_ADDR]] : !cir.float, !cir.ptr +// CHECK: %[[P:.*]] = cir.load %[[P_ADDR:.*]] : !cir.ptr>, !cir.ptr +// CHECK: %[[P_BOOL:.*]] = cir.cast(ptr_to_bool, %[[P]] : !cir.ptr), !cir.bool +// CHECK: %[[P_NOT:.*]] = cir.unary(not, %[[P_BOOL]]) : !cir.bool, !cir.bool +// CHECK: cir.store %[[P_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr +// CHECK: %[[D:.*]] = cir.load %[[D_ADDR:.*]] : !cir.ptr, !cir.double +// CHECK: %[[D_BOOL:.*]] = cir.cast(float_to_bool, %[[D]] : !cir.double), !cir.bool +// CHECK: %[[D_NOT:.*]] = cir.unary(not, %[[D_BOOL]]) : !cir.bool, !cir.bool +// CHECK: cir.store %[[D_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr + +// LLVM: define void @test_logical_not() +// LLVM: %[[A:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 +// LLVM: %[[A_BOOL:.*]] = icmp ne i32 %[[A]], 0 +// LLVM: %[[A_NOT:.*]] = xor i1 %[[A_BOOL]], true +// LLVM: %[[A_CAST:.*]] = zext i1 %[[A_NOT]] to i32 +// LLVM: store i32 %[[A_CAST]], ptr %[[A_ADDR]], align 4 +// LLVM: %[[B:.*]] = load i8, ptr 
%[[B_ADDR:.*]], align 1 +// LLVM: %[[B_BOOL:.*]] = trunc i8 %[[B]] to i1 +// LLVM: %[[B_NOT:.*]] = xor i1 %[[B_BOOL]], true +// LLVM: %[[B_CAST:.*]] = zext i1 %[[B_NOT]] to i8 +// LLVM: store i8 %[[B_CAST]], ptr %[[B_ADDR]], align 1 +// LLVM: %[[C:.*]] = load float, ptr %[[C_ADDR:.*]], align 4 +// LLVM: %[[C_BOOL:.*]] = fcmp une float %[[C]], 0.000000e+00 +// LLVM: %[[C_NOT:.*]] = xor i1 %[[C_BOOL]], true +// LLVM: %[[C_CAST:.*]] = uitofp i1 %[[C_NOT]] to float +// LLVM: store float %[[C_CAST]], ptr %[[C_ADDR]], align 4 +// LLVM: %[[P:.*]] = load ptr, ptr %[[P_ADDR:.*]], align 8 +// LLVM: %[[P_BOOL:.*]] = icmp ne ptr %[[P]], null +// LLVM: %[[P_NOT:.*]] = xor i1 %[[P_BOOL]], true +// LLVM: %[[P_CAST:.*]] = zext i1 %[[P_NOT]] to i8 +// LLVM: store i8 %[[P_CAST]], ptr %[[B_ADDR]], align 1 +// LLVM: %[[D:.*]] = load double, ptr %[[D_ADDR:.*]], align 8 +// LLVM: %[[D_BOOL:.*]] = fcmp une double %[[D]], 0.000000e+00 +// LLVM: %[[D_NOT:.*]] = xor i1 %[[D_BOOL]], true +// LLVM: %[[D_CAST:.*]] = zext i1 %[[D_NOT]] to i8 +// LLVM: store i8 %[[D_CAST]], ptr %[[B_ADDR]], align 1 + +// OGCG: define{{.*}} void @_Z16test_logical_notv() +// OGCG: %[[A:.*]] = load i32, ptr %[[A_ADDR:.*]], align 4 +// OGCG: %[[A_BOOL:.*]] = icmp ne i32 %[[A]], 0 +// OGCG: %[[A_NOT:.*]] = xor i1 %[[A_BOOL]], true +// OGCG: %[[A_CAST:.*]] = zext i1 %[[A_NOT]] to i32 +// OGCG: store i32 %[[A_CAST]], ptr %[[A_ADDR]], align 4 +// OGCG: %[[B:.*]] = load i8, ptr %[[B_ADDR:.*]], align 1 +// OGCG: %[[B_BOOL:.*]] = trunc i8 %[[B]] to i1 +// OGCG: %[[B_NOT:.*]] = xor i1 %[[B_BOOL]], true +// OGCG: %[[B_CAST:.*]] = zext i1 %[[B_NOT]] to i8 +// OGCG: store i8 %[[B_CAST]], ptr %[[B_ADDR]], align 1 +// OGCG: %[[C:.*]] = load float, ptr %[[C_ADDR:.*]], align 4 +// OGCG: %[[C_BOOL:.*]] = fcmp une float %[[C]], 0.000000e+00 +// OGCG: %[[C_NOT:.*]] = xor i1 %[[C_BOOL]], true +// OGCG: %[[C_CAST:.*]] = uitofp i1 %[[C_NOT]] to float +// OGCG: store float %[[C_CAST]], ptr %[[C_ADDR]], align 4 +// OGCG: %[[P:.*]] = load ptr, ptr %[[P_ADDR:.*]], align 8 +// OGCG: %[[P_BOOL:.*]] = icmp ne ptr %[[P]], null +// OGCG: %[[P_NOT:.*]] = xor i1 %[[P_BOOL]], true +// OGCG: %[[P_CAST:.*]] = zext i1 %[[P_NOT]] to i8 +// OGCG: store i8 %[[P_CAST]], ptr %[[B_ADDR]], align 1 +// OGCG: %[[D:.*]] = load double, ptr %[[D_ADDR:.*]], align 8 +// OGCG: %[[D_BOOL:.*]] = fcmp une double %[[D]], 0.000000e+00 +// OGCG: %[[D_NOT:.*]] = xor i1 %[[D_BOOL]], true +// OGCG: %[[D_CAST:.*]] = zext i1 %[[D_NOT]] to i8 +// OGCG: store i8 %[[D_CAST]], ptr %[[B_ADDR]], align 1 From 6272e1f37e0710b51d38cb98b905a3f2ffea7966 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Fri, 4 Apr 2025 18:43:31 -0700 Subject: [PATCH 0717/1029] [lldb] Make `RegisterContextThreadMemory` thread safe (#134469) The UpdateRegisterContext method can be called from multiple threads. 
--- .../Plugins/Process/Utility/RegisterContextThreadMemory.cpp | 2 ++ .../Plugins/Process/Utility/RegisterContextThreadMemory.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.cpp index 75438550ce914..29927e3b5e4ed 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.cpp @@ -25,6 +25,8 @@ RegisterContextThreadMemory::RegisterContextThreadMemory( RegisterContextThreadMemory::~RegisterContextThreadMemory() = default; void RegisterContextThreadMemory::UpdateRegisterContext() { + std::lock_guard lock(m_update_register_ctx_lock); + ThreadSP thread_sp(m_thread_wp.lock()); if (thread_sp) { ProcessSP process_sp(thread_sp->GetProcess()); diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.h b/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.h index 23f675508cf38..1df32bbc1f057 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.h +++ b/lldb/source/Plugins/Process/Utility/RegisterContextThreadMemory.h @@ -99,6 +99,8 @@ class RegisterContextThreadMemory : public lldb_private::RegisterContext { RegisterContextThreadMemory(const RegisterContextThreadMemory &) = delete; const RegisterContextThreadMemory & operator=(const RegisterContextThreadMemory &) = delete; + + std::mutex m_update_register_ctx_lock; }; } // namespace lldb_private From 1f72fa29ecb4b283f449c8bf931fcaf0fa1069ee Mon Sep 17 00:00:00 2001 From: weiwei chen Date: Fri, 4 Apr 2025 22:44:07 -0400 Subject: [PATCH 0718/1029] [X86Backend][M68KBackend] Make Ctx in X86MCInstLower (M68KInstLower) the same as AsmPrinter.OutContext (#133352) In `X86MCInstLower::LowerMachineOperand`, a new `MCSymbol` can be created in `GetSymbolFromOperand(MO)` where `MO.getType()` is `MachineOperand::MO_ExternalSymbol` ``` case MachineOperand::MO_ExternalSymbol: return LowerSymbolOperand(MO, GetSymbolFromOperand(MO)); ``` at https://github.com/llvm/llvm-project/blob/725a7b664b92cd2e884806de5a08900b43d43cce/llvm/lib/Target/X86/X86MCInstLower.cpp#L196 However, this newly created symbol will not be marked properly with its `IsExternal` field since `Ctx.getOrCreateSymbol(Name)` doesn't know if the newly created `MCSymbol` is for `MachineOperand::MO_ExternalSymbol`. Looking at other backends, for example `Arch64MCInstLower` is doing for handling `MC_ExternalSymbol` https://github.com/llvm/llvm-project/blob/14c36db16fc090ef494ff6d8207562c414b40e30/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp#L366-L367 https://github.com/llvm/llvm-project/blob/14c36db16fc090ef494ff6d8207562c414b40e30/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp#L145-L148 It creates/gets the MCSymbol from `AsmPrinter.OutContext` instead of from `Ctx`. Moreover, `Ctx` for `AArch64MCLower` is the same as `AsmPrinter.OutContext`. https://github.com/llvm/llvm-project/blob/8e7d6baf0e013408be932758b4a5334c14a34086/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp#L100. This applies to almost all the other backends except X86 and M68k. 
``` $git grep "MCInstLowering(" lib/Target/AArch64/AArch64AsmPrinter.cpp:100: : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this), lib/Target/AMDGPU/AMDGPUMCInstLower.cpp:223: AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); lib/Target/AMDGPU/AMDGPUMCInstLower.cpp:257: AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); lib/Target/AMDGPU/R600MCInstLower.cpp:52: R600MCInstLower MCInstLowering(OutContext, STI, *this); lib/Target/ARC/ARCAsmPrinter.cpp:41: MCInstLowering(&OutContext, *this) {} lib/Target/AVR/AVRAsmPrinter.cpp:196: AVRMCInstLower MCInstLowering(OutContext, *this); lib/Target/BPF/BPFAsmPrinter.cpp:144: BPFMCInstLower MCInstLowering(OutContext, *this); lib/Target/CSKY/CSKYAsmPrinter.cpp:41: : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this) {} lib/Target/Lanai/LanaiAsmPrinter.cpp:147: LanaiMCInstLower MCInstLowering(OutContext, *this); lib/Target/Lanai/LanaiAsmPrinter.cpp:184: LanaiMCInstLower MCInstLowering(OutContext, *this); lib/Target/MSP430/MSP430AsmPrinter.cpp:149: MSP430MCInstLower MCInstLowering(OutContext, *this); lib/Target/Mips/MipsAsmPrinter.h:126: : AsmPrinter(TM, std::move(Streamer)), MCInstLowering(*this) {} lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp:695: WebAssemblyMCInstLower MCInstLowering(OutContext, *this); lib/Target/X86/X86MCInstLower.cpp:2200: X86MCInstLower MCInstLowering(*MF, *this); ``` This patch makes `X86MCInstLower` and `M68KInstLower` to have their `Ctx` from `AsmPrinter.OutContext` instead of getting it from `MF.getContext()` to be consistent with all the other backends. I think since normal use case (probably anything other than our un-conventional case) only handles one llvm module all the way through in the codegen pipeline till the end of code emission (AsmPrint), `AsmPrinter.OutContext` is the same as MachineFunction's MCContext, so this change is an NFC. ---- This fixes an error while running the generated code in ORC JIT for our use case with [MCLinker](https://youtu.be/yuSBEXkjfEA?si=HjgjkxJ9hLfnSvBj&t=813) (see more details below): https://github.com/llvm/llvm-project/pull/133291#issuecomment-2759200983 We (Mojo) are trying to do a MC level linking so that we break llvm module into multiple submodules to compile and codegen in parallel (technically into *.o files with symbol linkage type change), but instead of archive all of them into one `.a` file, we want to fix the symbol linkage type and still produce one *.o file. The parallel codegen pipeline generates the codegen data structures in their own `MCContext` (which is `Ctx` here). So if function `f` and `g` got split into different submodules, they will have different `Ctx`. And when we try to create an external symbol with the same name for each of them with `Ctx.getOrCreate(SymName)`, we will get two different `MCSymbol*` because `f` and `g`'s `MCContext` are different and they can't see each other. This is unfortunately not what we want for external symbols. Using `AsmPrinter.OutContext` helps, since it is shared, if we try to get or create the `MCSymbol` there, we'll be able to deduplicate. 
--- llvm/lib/Target/M68k/M68kMCInstLower.cpp | 2 +- llvm/lib/Target/X86/X86MCInstLower.cpp | 4 +- llvm/unittests/CodeGen/CMakeLists.txt | 1 + llvm/unittests/CodeGen/X86MCInstLowerTest.cpp | 174 ++++++++++++++++++ 4 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 llvm/unittests/CodeGen/X86MCInstLowerTest.cpp diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/llvm/lib/Target/M68k/M68kMCInstLower.cpp index 957c0f9d3da82..8698fc0de4710 100644 --- a/llvm/lib/Target/M68k/M68kMCInstLower.cpp +++ b/llvm/lib/Target/M68k/M68kMCInstLower.cpp @@ -33,7 +33,7 @@ using namespace llvm; #define DEBUG_TYPE "m68k-mc-inst-lower" M68kMCInstLower::M68kMCInstLower(MachineFunction &MF, M68kAsmPrinter &AP) - : Ctx(MF.getContext()), MF(MF), TM(MF.getTarget()), MAI(*TM.getMCAsmInfo()), + : Ctx(AP.OutContext), MF(MF), TM(MF.getTarget()), MAI(*TM.getMCAsmInfo()), AsmPrinter(AP) {} MCSymbol * diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 52332c46851c2..d9945bdf2db60 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -142,8 +142,8 @@ void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { X86MCInstLower::X86MCInstLower(const MachineFunction &mf, X86AsmPrinter &asmprinter) - : Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()), - AsmPrinter(asmprinter) {} + : Ctx(asmprinter.OutContext), MF(mf), TM(mf.getTarget()), + MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {} MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { return AsmPrinter.MMI->getObjFileInfo(); diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index 4f580e7539f4d..a972dc32c40a2 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_unittest(CodeGenTests TargetOptionsTest.cpp TestAsmPrinter.cpp MLRegAllocDevelopmentFeatures.cpp + X86MCInstLowerTest.cpp ) add_subdirectory(GlobalISel) diff --git a/llvm/unittests/CodeGen/X86MCInstLowerTest.cpp b/llvm/unittests/CodeGen/X86MCInstLowerTest.cpp new file mode 100644 index 0000000000000..f5a59b16b4487 --- /dev/null +++ b/llvm/unittests/CodeGen/X86MCInstLowerTest.cpp @@ -0,0 +1,174 @@ +//===- llvm/unittest/CodeGen/X86MCInstLowerTest.cpp +//-------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "TestAsmPrinter.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Target/TargetMachine.h" +#include "gtest/gtest.h" + +namespace llvm { + +class X86MCInstLowerTest : public testing::Test { +protected: + static void SetUpTestCase() { + InitializeAllTargetMCs(); + InitializeAllTargetInfos(); + InitializeAllTargets(); + InitializeAllAsmPrinters(); + } + + // Function to setup codegen pipeline and returns the AsmPrinter. 
+ AsmPrinter *addPassesToEmitFile(llvm::legacy::PassManagerBase &PM, + llvm::raw_pwrite_stream &Out, + llvm::CodeGenFileType FileType, + llvm::MachineModuleInfoWrapperPass *MMIWP) { + TargetPassConfig *PassConfig = TM->createPassConfig(PM); + + PassConfig->setDisableVerify(true); + PM.add(PassConfig); + PM.add(MMIWP); + + if (PassConfig->addISelPasses()) + return nullptr; + + PassConfig->addMachinePasses(); + PassConfig->setInitialized(); + + MC.reset(new MCContext(TM->getTargetTriple(), TM->getMCAsmInfo(), + TM->getMCRegisterInfo(), TM->getMCSubtargetInfo())); + MC->setObjectFileInfo(TM->getObjFileLowering()); + TM->getObjFileLowering()->Initialize(*MC, *TM); + MC->setObjectFileInfo(TM->getObjFileLowering()); + + // Use a new MCContext for AsmPrinter for testing. + // AsmPrinter.OutContext will be different from + // MachineFunction's MCContext in MMIWP. + Expected> MCStreamerOrErr = + TM->createMCStreamer(Out, nullptr, FileType, *MC); + + if (auto Err = MCStreamerOrErr.takeError()) + return nullptr; + + AsmPrinter *Printer = + TM->getTarget().createAsmPrinter(*TM, std::move(*MCStreamerOrErr)); + + if (!Printer) + return nullptr; + + PM.add(Printer); + + return Printer; + } + + void SetUp() override { + // Module to compile. + const char *FooStr = R""""( + @G = external global i32 + + define i32 @foo() { + %1 = load i32, i32* @G; load the global variable + %2 = call i32 @f() + %3 = mul i32 %1, %2 + ret i32 %3 + } + + declare i32 @f() #0 + )""""; + StringRef AssemblyF(FooStr); + + // Get target triple for X86_64 + Triple TargetTriple("x86_64--"); + std::string Error; + const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); + // Skip the test if target is not built. + if (!T) + GTEST_SKIP(); + + // Get TargetMachine. + // Use Reloc::Model::PIC_ and CodeModel::Model::Large + // to get GOT during codegen as MO_ExternalSymbol. + TargetOptions Options; + TM = std::unique_ptr(T->createTargetMachine( + TargetTriple, "", "", Options, Reloc::Model::PIC_, + CodeModel::Model::Large, CodeGenOptLevel::Default)); + if (!TM) + GTEST_SKIP(); + + SMDiagnostic SMError; + + // Parse the module. + M = parseAssemblyString(AssemblyF, SMError, Context); + if (!M) + report_fatal_error(SMError.getMessage()); + M->setDataLayout(TM->createDataLayout()); + + // Get llvm::Function from M + Foo = M->getFunction("foo"); + if (!Foo) + report_fatal_error("foo?"); + + // Prepare the MCContext for codegen M. + // MachineFunction for Foo will have this MCContext. + MCFoo.reset(new MCContext(TargetTriple, TM->getMCAsmInfo(), + TM->getMCRegisterInfo(), + TM->getMCSubtargetInfo())); + MCFoo->setObjectFileInfo(TM->getObjFileLowering()); + TM->getObjFileLowering()->Initialize(*MCFoo, *TM); + MCFoo->setObjectFileInfo(TM->getObjFileLowering()); + } + + LLVMContext Context; + std::unique_ptr TM; + std::unique_ptr M; + + std::unique_ptr MC; + std::unique_ptr MCFoo; + + Function *Foo; + std::unique_ptr MFFoo; +}; + +TEST_F(X86MCInstLowerTest, moExternalSymbol_MCSYMBOL) { + + MachineModuleInfoWrapperPass *MMIWP = + new MachineModuleInfoWrapperPass(TM.get(), &*MCFoo); + + legacy::PassManager PassMgrF; + SmallString<1024> Buf; + llvm::raw_svector_ostream OS(Buf); + AsmPrinter *Printer = + addPassesToEmitFile(PassMgrF, OS, CodeGenFileType::AssemblyFile, MMIWP); + PassMgrF.run(*M); + + // Check GOT MCSymbol is from Printer.OutContext. + MCSymbol *GOTPrinterPtr = + Printer->OutContext.lookupSymbol("_GLOBAL_OFFSET_TABLE_"); + + // Check GOT MCSymbol is NOT from MachineFunction's MCContext. 
+ MCSymbol *GOTMFCtxPtr = + MMIWP->getMMI().getMachineFunction(*Foo)->getContext().lookupSymbol( + "_GLOBAL_OFFSET_TABLE_"); + + EXPECT_NE(GOTPrinterPtr, nullptr); + EXPECT_EQ(GOTMFCtxPtr, nullptr); +} + +} // end namespace llvm From d59594d697c4b1f0f44a4e9ee296543a88575178 Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Sat, 5 Apr 2025 08:23:54 +0530 Subject: [PATCH 0719/1029] [RISCV] Relax out of range Xqcibi conditional branches (#134336) If .L1 is not within +-4KiB range, convert qc.(e.)bge a0, 10, .L1 to qc.(e.)blt a0, 10, 8(10) j .L1 This is similar to what is done for the RISCV conditional branches. --- .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 36 +++++ .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 97 +++++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 33 +++++ .../MC/RISCV/xqcibi-long-conditional-jump.s | 134 ++++++++++++++++++ llvm/test/MC/RISCV/xqcibi-relocations.s | 15 +- 5 files changed, 307 insertions(+), 8 deletions(-) create mode 100644 llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 10a26554ed672..ac0f7421664c5 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -198,6 +198,30 @@ static unsigned getRelaxedOpcode(unsigned Op) { return RISCV::PseudoLongBLTU; case RISCV::BGEU: return RISCV::PseudoLongBGEU; + case RISCV::QC_BEQI: + return RISCV::PseudoLongQC_BEQI; + case RISCV::QC_BNEI: + return RISCV::PseudoLongQC_BNEI; + case RISCV::QC_BLTI: + return RISCV::PseudoLongQC_BLTI; + case RISCV::QC_BGEI: + return RISCV::PseudoLongQC_BGEI; + case RISCV::QC_BLTUI: + return RISCV::PseudoLongQC_BLTUI; + case RISCV::QC_BGEUI: + return RISCV::PseudoLongQC_BGEUI; + case RISCV::QC_E_BEQI: + return RISCV::PseudoLongQC_E_BEQI; + case RISCV::QC_E_BNEI: + return RISCV::PseudoLongQC_E_BNEI; + case RISCV::QC_E_BLTI: + return RISCV::PseudoLongQC_E_BLTI; + case RISCV::QC_E_BGEI: + return RISCV::PseudoLongQC_E_BGEI; + case RISCV::QC_E_BLTUI: + return RISCV::PseudoLongQC_E_BLTUI; + case RISCV::QC_E_BGEUI: + return RISCV::PseudoLongQC_E_BGEUI; } } @@ -224,6 +248,18 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, case RISCV::BGE: case RISCV::BLTU: case RISCV::BGEU: + case RISCV::QC_BEQI: + case RISCV::QC_BNEI: + case RISCV::QC_BLTI: + case RISCV::QC_BGEI: + case RISCV::QC_BLTUI: + case RISCV::QC_BGEUI: + case RISCV::QC_E_BEQI: + case RISCV::QC_E_BNEI: + case RISCV::QC_E_BLTI: + case RISCV::QC_E_BGEI: + case RISCV::QC_E_BLTUI: + case RISCV::QC_E_BGEUI: Res.setOpcode(getRelaxedOpcode(Inst.getOpcode())); Res.addOperand(Inst.getOperand(0)); Res.addOperand(Inst.getOperand(1)); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index fc98859314680..37a2ac336d20c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -68,6 +68,10 @@ class RISCVMCCodeEmitter : public MCCodeEmitter { SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + void expandQCLongCondBrImm(const MCInst &MI, SmallVectorImpl &CB, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI, unsigned Size) const; + /// TableGen'erated function for getting the binary encoding for an /// instruction. 
uint64_t getBinaryCodeForInstr(const MCInst &MI, @@ -240,6 +244,30 @@ static unsigned getInvertedBranchOp(unsigned BrOp) { return RISCV::BGEU; case RISCV::PseudoLongBGEU: return RISCV::BLTU; + case RISCV::PseudoLongQC_BEQI: + return RISCV::QC_BNEI; + case RISCV::PseudoLongQC_BNEI: + return RISCV::QC_BEQI; + case RISCV::PseudoLongQC_BLTI: + return RISCV::QC_BGEI; + case RISCV::PseudoLongQC_BGEI: + return RISCV::QC_BLTI; + case RISCV::PseudoLongQC_BLTUI: + return RISCV::QC_BGEUI; + case RISCV::PseudoLongQC_BGEUI: + return RISCV::QC_BLTUI; + case RISCV::PseudoLongQC_E_BEQI: + return RISCV::QC_E_BNEI; + case RISCV::PseudoLongQC_E_BNEI: + return RISCV::QC_E_BEQI; + case RISCV::PseudoLongQC_E_BLTI: + return RISCV::QC_E_BGEI; + case RISCV::PseudoLongQC_E_BGEI: + return RISCV::QC_E_BLTI; + case RISCV::PseudoLongQC_E_BLTUI: + return RISCV::QC_E_BGEUI; + case RISCV::PseudoLongQC_E_BGEUI: + return RISCV::QC_E_BLTUI; } } @@ -305,6 +333,57 @@ void RISCVMCCodeEmitter::expandLongCondBr(const MCInst &MI, } } +// Expand PseudoLongQC_(E_)Bxxx to an inverted conditional branch and an +// unconditional jump. +void RISCVMCCodeEmitter::expandQCLongCondBrImm(const MCInst &MI, + SmallVectorImpl &CB, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI, + unsigned Size) const { + MCRegister SrcReg1 = MI.getOperand(0).getReg(); + auto BrImm = MI.getOperand(1).getImm(); + MCOperand SrcSymbol = MI.getOperand(2); + unsigned Opcode = MI.getOpcode(); + uint32_t Offset; + unsigned InvOpc = getInvertedBranchOp(Opcode); + // Emit inverted conditional branch with offset: + // 8 (QC.BXXX(4) + JAL(4)) + // or + // 10 (QC.E.BXXX(6) + JAL(4)). + if (Size == 4) { + MCInst TmpBr = + MCInstBuilder(InvOpc).addReg(SrcReg1).addImm(BrImm).addImm(8); + uint32_t BrBinary = getBinaryCodeForInstr(TmpBr, Fixups, STI); + support::endian::write(CB, BrBinary, llvm::endianness::little); + } else { + MCInst TmpBr = + MCInstBuilder(InvOpc).addReg(SrcReg1).addImm(BrImm).addImm(10); + uint64_t BrBinary = + getBinaryCodeForInstr(TmpBr, Fixups, STI) & 0xffff'ffff'ffffu; + SmallVector Encoding; + support::endian::write(Encoding, BrBinary, llvm::endianness::little); + assert(Encoding[6] == 0 && Encoding[7] == 0 && + "Unexpected encoding for 48-bit instruction"); + Encoding.truncate(6); + CB.append(Encoding); + } + Offset = Size; + // Save the number fixups. + size_t FixupStartIndex = Fixups.size(); + // Emit an unconditional jump to the destination. + MCInst TmpJ = + MCInstBuilder(RISCV::JAL).addReg(RISCV::X0).addOperand(SrcSymbol); + uint32_t JBinary = getBinaryCodeForInstr(TmpJ, Fixups, STI); + support::endian::write(CB, JBinary, llvm::endianness::little); + // Drop any fixup added so we can add the correct one. 
+ Fixups.resize(FixupStartIndex); + if (SrcSymbol.isExpr()) { + Fixups.push_back(MCFixup::create(Offset, SrcSymbol.getExpr(), + MCFixupKind(RISCV::fixup_riscv_jal), + MI.getLoc())); + } +} + void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, SmallVectorImpl &CB, SmallVectorImpl &Fixups, @@ -339,6 +418,24 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, expandLongCondBr(MI, CB, Fixups, STI); MCNumEmitted += 2; return; + case RISCV::PseudoLongQC_BEQI: + case RISCV::PseudoLongQC_BNEI: + case RISCV::PseudoLongQC_BLTI: + case RISCV::PseudoLongQC_BGEI: + case RISCV::PseudoLongQC_BLTUI: + case RISCV::PseudoLongQC_BGEUI: + expandQCLongCondBrImm(MI, CB, Fixups, STI, 4); + MCNumEmitted += 2; + return; + case RISCV::PseudoLongQC_E_BEQI: + case RISCV::PseudoLongQC_E_BNEI: + case RISCV::PseudoLongQC_E_BLTI: + case RISCV::PseudoLongQC_E_BGEI: + case RISCV::PseudoLongQC_E_BLTUI: + case RISCV::PseudoLongQC_E_BGEUI: + expandQCLongCondBrImm(MI, CB, Fixups, STI, 6); + MCNumEmitted += 2; + return; case RISCV::PseudoTLSDESCCall: expandTLSDESCCall(MI, CB, Fixups, STI); MCNumEmitted += 1; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 64d6f6d8f8bbf..f762c4943f630 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -1152,6 +1152,39 @@ let EmitPriority = 0 in { } // EmitPriority = 0 } // Predicates = [HasVendorXqcilo, IsRV32] +//===----------------------------------------------------------------------===// +// Pseudo-instructions +//===----------------------------------------------------------------------===// + +class LongBcciPseudo + : Pseudo<(outs), (ins GPR:$rs1, InTyImm:$imm, simm21_lsb0_jal:$imm20), []> +{ + let Size = size; + let isBarrier = 1; + let isBranch = 1; + let hasSideEffects = 0; + let mayStore = 0; + let mayLoad = 0; + let isAsmParserOnly = 1; + let hasNoSchedulingInfo = 1; +} + +// This will be expanded into either: +// QC.BXXX(4 bytes) + JAL(4 bytes) +// or +// QC.E.BXXX(6 bytes) + JAL(4 bytes) +def PseudoLongQC_BEQI : LongBcciPseudo; +def PseudoLongQC_BNEI : LongBcciPseudo; +def PseudoLongQC_BLTI : LongBcciPseudo; +def PseudoLongQC_BGEI : LongBcciPseudo; +def PseudoLongQC_BLTUI : LongBcciPseudo; +def PseudoLongQC_BGEUI : LongBcciPseudo; +def PseudoLongQC_E_BEQI : LongBcciPseudo; +def PseudoLongQC_E_BNEI : LongBcciPseudo; +def PseudoLongQC_E_BLTI : LongBcciPseudo; +def PseudoLongQC_E_BGEI : LongBcciPseudo; +def PseudoLongQC_E_BLTUI : LongBcciPseudo; +def PseudoLongQC_E_BGEUI : LongBcciPseudo; //===----------------------------------------------------------------------===// // Code Gen Patterns diff --git a/llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s b/llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s new file mode 100644 index 0000000000000..788fddaa94463 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcibi-long-conditional-jump.s @@ -0,0 +1,134 @@ +# RUN: llvm-mc -filetype=obj --mattr=+experimental-xqcibi -triple=riscv32 %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcibi -d -M no-aliases - \ +# RUN: | FileCheck --check-prefix=CHECK-INST %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+relax,+experimental-xqcibi %s \ +# RUN: | llvm-objdump --mattr=+experimental-xqcibi -dr -M no-aliases - \ +# RUN: | FileCheck --check-prefix=CHECK-INST-RELAX %s + + .text + .type test,@function + +test: + +# CHECK-INST: qc.beqi a0, 0xa, 0x8 +# CHECK-INST-NEXT: jal zero, 0x1458 +# CHECK-INST-RELAX: qc.beqi a0, 0xa, 0x8 +# CHECK-INST-RELAX-NEXT: jal zero, 
{{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L1 + qc.bnei a0, 10, .L1 +.fill 1300, 4, 0 +.L1: + ret + +# CHECK-INST: qc.bnei a0, 0x6, 0x1462 +# CHECK-INST-NEXT: jal zero, 0x28b2 +# CHECK-INST-RELAX: qc.bnei a0, 0x6, 0x1462 +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L2 + qc.beqi a0, 6, .L2 +.fill 1300, 4, 0 +.L2: + ret + +# CHECK-INST: qc.bgei a0, 0xd, 0x28bc +# CHECK-INST-NEXT: jal zero, 0x3d0c +# CHECK-INST-RELAX: qc.bgei a0, 0xd, 0x28bc +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L3 + qc.blti a0, 13, .L3 +.fill 1300, 4, 0 +.L3: + ret + +# CHECK-INST: qc.blti a0, 0x1, 0x3d16 +# CHECK-INST-NEXT: jal zero, 0x5166 +# CHECK-INST-RELAX: qc.blti a0, 0x1, 0x3d16 +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L4 + qc.bgei a0, 1, .L4 +.fill 1300, 4, 0 +.L4: + ret + +# CHECK-INST: qc.bgeui a0, 0x5, 0x5170 +# CHECK-INST-NEXT: jal zero, 0x65c0 +# CHECK-INST-RELAX: qc.bgeui a0, 0x5, 0x5170 +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L5 + qc.bltui a0, 5, .L5 +.fill 1300, 4, 0 +.L5: + ret + +# CHECK-INST: qc.bltui a0, 0xc, 0x65ca +# CHECK-INST-NEXT: jal zero, 0x7a1a +# CHECK-INST-RELAX: qc.bltui a0, 0xc, 0x65ca +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L6 + qc.bgeui a0, 12, .L6 +.fill 1300, 4, 0 +.L6: + ret + +# CHECK-INST: qc.e.beqi a0, 0x1, 0x7a26 +# CHECK-INST-NEXT: jal zero, 0x8e76 +# CHECK-INST-RELAX: qc.e.beqi a0, 0x1, 0x7a26 +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L7 + qc.e.bnei a0, 1, .L7 +.fill 1300, 4, 0 +.L7: + ret + +# CHECK-INST: qc.e.bnei a0, 0x2, 0x8e82 +# CHECK-INST-NEXT: jal zero, 0xa2d2 +# CHECK-INST-RELAX: qc.e.bnei a0, 0x2, 0x8e82 +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L8 + qc.e.beqi a0, 2, .L8 +.fill 1300, 4, 0 +.L8: + ret + +# CHECK-INST: qc.e.bgei a0, 0x3, 0xa2de +# CHECK-INST-NEXT: jal zero, 0xb72e +# CHECK-INST-RELAX: qc.e.bgei a0, 0x3, 0xa2de +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L9 + qc.e.blti a0, 3, .L9 +.fill 1300, 4, 0 +.L9: + ret + +# CHECK-INST: qc.e.blti a0, 0x4, 0xb73a +# CHECK-INST-NEXT: jal zero, 0xcb8a +# CHECK-INST-RELAX: qc.e.blti a0, 0x4, 0xb73a +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L10 + qc.e.bgei a0, 4, .L10 +.fill 1300, 4, 0 +.L10: + ret + +# CHECK-INST: qc.e.bgeui a0, 0x5, 0xcb96 +# CHECK-INST-NEXT: jal zero, 0xdfe6 +# CHECK-INST-RELAX: qc.e.bgeui a0, 0x5, 0xcb96 +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L11 + qc.e.bltui a0, 5, .L11 +.fill 1300, 4, 0 +.L11: + ret + +# CHECK-INST: qc.e.bltui a0, 0x6, 0xdff2 +# CHECK-INST-NEXT: jal zero, 0xf442 +# CHECK-INST-RELAX: qc.e.bltui a0, 0x6, 0xdff2 +# CHECK-INST-RELAX-NEXT: jal zero, {{.*}} +# CHECK-INST-RELAX-NEXT: R_RISCV_JAL .L12 + qc.e.bgeui a0, 6, .L12 +.fill 1300, 4, 0 +.L12: + ret + +.Lfunc_end0: + .size test, .Lfunc_end0-test diff --git a/llvm/test/MC/RISCV/xqcibi-relocations.s b/llvm/test/MC/RISCV/xqcibi-relocations.s index 4899e5f1eac46..7028e8a737c86 100644 --- a/llvm/test/MC/RISCV/xqcibi-relocations.s +++ b/llvm/test/MC/RISCV/xqcibi-relocations.s @@ -10,23 +10,22 @@ .text -# Check that branch to an undefined symbol is handled -# FIXME: This should be relaxed to an inverse branch and jump +# Since foo is undefined, this will be relaxed to (qc.beqi + jal) qc.bnei x6, 10, foo -# RELOC: R_RISCV_BRANCH foo 0x0 +# 
RELOC: R_RISCV_JAL foo 0x0 # INSTR: qc.bnei t1, 10, foo # FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_branch -# FIXME: This should be relaxed to an inverse branch and jump +# Since foo is undefined, this will be relaxed to (qc.e.bltui + jal) qc.e.bgeui x8, 12, foo -# RELOC: R_RISCV_CUSTOM193 foo 0x0 +# RELOC: R_RISCV_JAL foo 0x0 # INSTR: qc.e.bgeui s0, 12, foo # FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_qc_e_branch -# Check that a label in a different section is handled similar to an undefined symbol -# FIXME: This should be relaxed to an inverse branch and jump +# Check that a label in a different section is handled similar to an undefined +# symbol and gets relaxed to (qc.e.bgeui + jal) qc.e.bltui x4, 9, .bar -# RELOC: R_RISCV_CUSTOM193 .bar 0x0 +# RELOC: R_RISCV_JAL .bar 0x0 # INSTR: qc.e.bltui tp, 9, .bar # FIXUP: fixup A - offset: 0, value: .bar, kind: fixup_riscv_qc_e_branch From 71884b63a413c7803fce8ec7bf2857938765f4e2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 4 Apr 2025 19:57:24 -0700 Subject: [PATCH 0720/1029] [AArch64] Transition from MCSymbolRefExpr::VariantKind constants Shift ELF `@plt` and `@gotpcrel` references in data directives, as well as Mach-O `@specifier` notations, to use `AArch64MCExpr::Specifier` constants. This is a follow-up to #132595. COFF-specific specifiers are not moved yet. In addition, partition @-specifiers into COFF, ELF, and Mach-O, so that mix-and-match is rejected at parse time. ELF and Mach-O specifiers are distinct, with `None` being the only shared value. For Mach-O-specific specifiers, we adopt the `M_xxx` naming convention. Pull Request: https://github.com/llvm/llvm-project/pull/133214 --- .../CodeGen/TargetLoweringObjectFileImpl.h | 2 +- llvm/include/llvm/MC/MCAsmInfo.h | 5 +- llvm/include/llvm/MC/MCExpr.h | 8 -- .../lib/Target/AArch64/AArch64MCInstLower.cpp | 17 +++-- .../AArch64/AArch64TargetObjectFile.cpp | 9 ++- .../Target/AArch64/AArch64TargetObjectFile.h | 5 -- .../AArch64/AsmParser/AArch64AsmParser.cpp | 73 ++++++++++--------- .../AArch64ExternalSymbolizer.cpp | 25 ++++--- .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 21 +++--- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 41 +++++++---- .../AArch64/MCTargetDesc/AArch64MCExpr.h | 22 ++++++ .../MCTargetDesc/AArch64MachObjectWriter.cpp | 24 +++--- .../AArch64WinCOFFObjectWriter.cpp | 4 +- llvm/test/MC/AArch64/coff-relocations.s | 7 ++ .../MC/AArch64/data-directive-specifier.s | 12 ++- 15 files changed, 157 insertions(+), 118 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h index 8b0e5798d1b61..af8d73b4fa064 100644 --- a/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h +++ b/llvm/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h @@ -37,7 +37,7 @@ class TargetLoweringObjectFileELF : public TargetLoweringObjectFile { SmallPtrSet Used; protected: - uint8_t PLTRelativeSpecifier = 0; + uint16_t PLTRelativeSpecifier = 0; public: ~TargetLoweringObjectFileELF() override = default; diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 6714abac5c726..ec27a47e36820 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -65,10 +65,13 @@ class MCAsmInfo { /// quote, e.g., `'A`. }; - struct VariantKindDesc { + // This describes a @ style relocation specifier (expr@specifier) supported by + // AsmParser::parsePrimaryExpr. 
+ struct AtSpecifier { uint32_t Kind; StringRef Name; }; + using VariantKindDesc = AtSpecifier; protected: //===------------------------------------------------------------------===// diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 12830ee648ae0..782f7ea8957d9 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -198,14 +198,6 @@ class MCSymbolRefExpr : public MCExpr { VK_GOT, VK_GOTPCREL, - VK_PLT, - VK_TLVP, // Mach-O thread local variable relocations - VK_TLVPPAGE, - VK_TLVPPAGEOFF, - VK_PAGE, - VK_PAGEOFF, - VK_GOTPAGE, - VK_GOTPAGEOFF, VK_SECREL, VK_WEAKREF, // The link between the symbols in .weakref foo, bar diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 6a02a75ddbb4d..165b7d8ad6330 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -151,31 +151,32 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandMachO(const MachineOperand &MO, MCSymbol *Sym) const { // FIXME: We would like an efficient form for this, so we don't have to do a // lot of extra uniquing. - MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + auto Spec = AArch64MCExpr::None; if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) { if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_GOTPAGE; + Spec = AArch64MCExpr::M_GOTPAGE; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; + Spec = AArch64MCExpr::M_GOTPAGEOFF; else llvm_unreachable("Unexpected target flags with MO_GOT on GV operand"); } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_TLVPPAGE; + Spec = AArch64MCExpr::M_TLVPPAGE; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; + Spec = AArch64MCExpr::M_TLVPPAGEOFF; else llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); } else { if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_PAGE; + Spec = AArch64MCExpr::M_PAGE; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_PAGEOFF; + Spec = AArch64MCExpr::M_PAGEOFF; } - const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx); + // TODO: Migrate to AArch64MCExpr::create like ELF. + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Spec, Ctx); if (!MO.isJTI() && MO.getOffset()) Expr = MCBinaryExpr::createAdd( Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index b662e75741d38..b9ba2a41877ec 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -25,6 +25,9 @@ using namespace dwarf; void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); + PLTRelativeSpecifier = AArch64MCExpr::VK_PLT; + SupportIndirectSymViaGOTPCRel = true; + // AARCH64 ELF ABI does not define static relocation type for TLS offset // within a module. Do not generate AT_location for TLS variables. 
SupportDebugThreadLocalLocation = false; @@ -58,7 +61,7 @@ const MCExpr *AArch64_ELFTargetObjectFile::getIndirectSymViaGOTPCRel( int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const { int64_t FinalOffset = Offset + MV.getConstant(); const MCExpr *Res = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext()); + MCSymbolRefExpr::create(Sym, AArch64MCExpr::VK_GOTPCREL, getContext()); const MCExpr *Off = MCConstantExpr::create(FinalOffset, getContext()); return MCBinaryExpr::createAdd(Res, Off, getContext()); } @@ -77,7 +80,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { const MCSymbol *Sym = TM.getSymbol(GV); const MCExpr *Res = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbolRefExpr::create(Sym, AArch64MCExpr::M_GOT, getContext()); MCSymbol *PCSym = getContext().createTempSymbol(); Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); @@ -102,7 +105,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( // On ARM64 Darwin, we can reference symbols with foo@GOT-., which // is an indirect pc-relative reference. const MCExpr *Res = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbolRefExpr::create(Sym, AArch64MCExpr::M_GOT, getContext()); MCSymbol *PCSym = getContext().createTempSymbol(); Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h index de79acd229873..6b3381452c70b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -20,11 +20,6 @@ class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { void Initialize(MCContext &Ctx, const TargetMachine &TM) override; public: - AArch64_ELFTargetObjectFile() { - PLTRelativeSpecifier = MCSymbolRefExpr::VK_PLT; - SupportIndirectSymViaGOTPCRel = true; - } - const MCExpr *getIndirectSymViaGOTPCRel(const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV, int64_t Offset, diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 28b4cbb5efed8..38710e9344687 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -339,7 +339,7 @@ class AArch64AsmParser : public MCTargetAsmParser { static bool classifySymbolRef(const MCExpr *Expr, AArch64MCExpr::Specifier &ELFSpec, - MCSymbolRefExpr::VariantKind &DarwinRefKind, + AArch64MCExpr::Specifier &DarwinSpec, int64_t &Addend); }; @@ -889,16 +889,16 @@ class AArch64Operand : public MCParsedAsmOperand { bool isSymbolicUImm12Offset(const MCExpr *Expr) const { AArch64MCExpr::Specifier ELFSpec; - MCSymbolRefExpr::VariantKind DarwinRefKind; + AArch64MCExpr::Specifier DarwinSpec; int64_t Addend; - if (!AArch64AsmParser::classifySymbolRef(Expr, ELFSpec, DarwinRefKind, + if (!AArch64AsmParser::classifySymbolRef(Expr, ELFSpec, DarwinSpec, Addend)) { // If we don't understand the expression, assume the best and // let the fixup and relocation code deal with it. 
return true; } - if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + if (DarwinSpec == AArch64MCExpr::M_PAGEOFF || llvm::is_contained( {AArch64MCExpr::VK_LO12, AArch64MCExpr::VK_GOT_LO12, AArch64MCExpr::VK_GOT_AUTH_LO12, AArch64MCExpr::VK_DTPREL_LO12, @@ -912,8 +912,8 @@ class AArch64Operand : public MCParsedAsmOperand { // size when converted, so there is no "out of range" condition when using // @pageoff. return true; - } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { + } else if (DarwinSpec == AArch64MCExpr::M_GOTPAGEOFF || + DarwinSpec == AArch64MCExpr::M_TLVPPAGEOFF) { // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. return Addend == 0; } @@ -1006,13 +1006,13 @@ class AArch64Operand : public MCParsedAsmOperand { } AArch64MCExpr::Specifier ELFSpec; - MCSymbolRefExpr::VariantKind DarwinRefKind; + AArch64MCExpr::Specifier DarwinSpec; int64_t Addend; - if (AArch64AsmParser::classifySymbolRef(Expr, ELFSpec, DarwinRefKind, + if (AArch64AsmParser::classifySymbolRef(Expr, ELFSpec, DarwinSpec, Addend)) { - return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF || - (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) || + return DarwinSpec == AArch64MCExpr::M_PAGEOFF || + DarwinSpec == AArch64MCExpr::M_TLVPPAGEOFF || + (DarwinSpec == AArch64MCExpr::M_GOTPAGEOFF && Addend == 0) || llvm::is_contained( {AArch64MCExpr::VK_LO12, AArch64MCExpr::VK_GOT_AUTH_LO12, AArch64MCExpr::VK_DTPREL_HI12, AArch64MCExpr::VK_DTPREL_LO12, @@ -1120,13 +1120,13 @@ class AArch64Operand : public MCParsedAsmOperand { return false; AArch64MCExpr::Specifier ELFSpec; - MCSymbolRefExpr::VariantKind DarwinRefKind; + AArch64MCExpr::Specifier DarwinSpec; int64_t Addend; - if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFSpec, DarwinRefKind, + if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFSpec, DarwinSpec, Addend)) { return false; } - if (DarwinRefKind != MCSymbolRefExpr::VK_None) + if (DarwinSpec != AArch64MCExpr::None) return false; return llvm::is_contained(AllowedModifiers, ELFSpec); @@ -3297,22 +3297,22 @@ ParseStatus AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { return ParseStatus::Failure; AArch64MCExpr::Specifier ELFSpec; - MCSymbolRefExpr::VariantKind DarwinRefKind; + AArch64MCExpr::Specifier DarwinSpec; int64_t Addend; - if (classifySymbolRef(Expr, ELFSpec, DarwinRefKind, Addend)) { - if (DarwinRefKind == MCSymbolRefExpr::VK_None && + if (classifySymbolRef(Expr, ELFSpec, DarwinSpec, Addend)) { + if (DarwinSpec == AArch64MCExpr::None && ELFSpec == AArch64MCExpr::VK_INVALID) { // No modifier was specified at all; this is the syntax for an ELF basic // ADRP relocation (unfortunately). 
Expr = AArch64MCExpr::create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext()); - } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && + } else if ((DarwinSpec == AArch64MCExpr::M_GOTPAGE || + DarwinSpec == AArch64MCExpr::M_TLVPPAGE) && Addend != 0) { return Error(S, "gotpage label reference not allowed an addend"); - } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && - DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && - DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && + } else if (DarwinSpec != AArch64MCExpr::M_PAGE && + DarwinSpec != AArch64MCExpr::M_GOTPAGE && + DarwinSpec != AArch64MCExpr::M_TLVPPAGE && ELFSpec != AArch64MCExpr::VK_ABS_PAGE_NC && ELFSpec != AArch64MCExpr::VK_GOT_PAGE && ELFSpec != AArch64MCExpr::VK_GOT_AUTH_PAGE && @@ -3351,10 +3351,10 @@ ParseStatus AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { return ParseStatus::Failure; AArch64MCExpr::Specifier ELFSpec; - MCSymbolRefExpr::VariantKind DarwinRefKind; + AArch64MCExpr::Specifier DarwinSpec; int64_t Addend; - if (classifySymbolRef(Expr, ELFSpec, DarwinRefKind, Addend)) { - if (DarwinRefKind == MCSymbolRefExpr::VK_None && + if (classifySymbolRef(Expr, ELFSpec, DarwinSpec, Addend)) { + if (DarwinSpec == AArch64MCExpr::None && ELFSpec == AArch64MCExpr::VK_INVALID) { // No modifier was specified at all; this is the syntax for an ELF basic // ADR relocation (unfortunately). @@ -5817,13 +5817,13 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc, if (Inst.getOperand(2).isExpr()) { const MCExpr *Expr = Inst.getOperand(2).getExpr(); AArch64MCExpr::Specifier ELFSpec; - MCSymbolRefExpr::VariantKind DarwinRefKind; + AArch64MCExpr::Specifier DarwinSpec; int64_t Addend; - if (classifySymbolRef(Expr, ELFSpec, DarwinRefKind, Addend)) { + if (classifySymbolRef(Expr, ELFSpec, DarwinSpec, Addend)) { // Only allow these with ADDXri. - if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && + if ((DarwinSpec == AArch64MCExpr::M_PAGEOFF || + DarwinSpec == AArch64MCExpr::M_TLVPPAGEOFF) && Inst.getOpcode() == AArch64::ADDXri) return false; @@ -8192,11 +8192,12 @@ bool AArch64AsmParser::parseAuthExpr(const MCExpr *&Res, SMLoc &EndLoc) { return false; } -bool AArch64AsmParser::classifySymbolRef( - const MCExpr *Expr, AArch64MCExpr::Specifier &ELFSpec, - MCSymbolRefExpr::VariantKind &DarwinRefKind, int64_t &Addend) { +bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::Specifier &ELFSpec, + AArch64MCExpr::Specifier &DarwinSpec, + int64_t &Addend) { ELFSpec = AArch64MCExpr::VK_INVALID; - DarwinRefKind = MCSymbolRefExpr::VK_None; + DarwinSpec = AArch64MCExpr::None; Addend = 0; if (const AArch64MCExpr *AE = dyn_cast(Expr)) { @@ -8207,7 +8208,7 @@ bool AArch64AsmParser::classifySymbolRef( const MCSymbolRefExpr *SE = dyn_cast(Expr); if (SE) { // It's a simple symbol reference with no addend. - DarwinRefKind = SE->getKind(); + DarwinSpec = AArch64MCExpr::Specifier(SE->getKind()); return true; } @@ -8223,13 +8224,13 @@ bool AArch64AsmParser::classifySymbolRef( return false; if (Res.getSymA()) - DarwinRefKind = Res.getSymA()->getKind(); + DarwinSpec = AArch64MCExpr::Specifier(Res.getSymA()->getKind()); Addend = Res.getConstant(); // It's some symbol reference + a constant addend, but really // shouldn't use both Darwin and ELF syntax. 
return ELFSpec == AArch64MCExpr::VK_INVALID || - DarwinRefKind == MCSymbolRefExpr::VK_None; + DarwinSpec == AArch64MCExpr::None; } /// Force static initialization. diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 09d706f0a303b..8b65f856b2a67 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AArch64ExternalSymbolizer.h" +#include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -19,23 +20,23 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-disassembler" -static MCSymbolRefExpr::VariantKind -getVariant(uint64_t LLVMDisassembler_VariantKind) { +static AArch64MCExpr::Specifier +getMachOSpecifier(uint64_t LLVMDisassembler_VariantKind) { switch (LLVMDisassembler_VariantKind) { case LLVMDisassembler_VariantKind_None: - return MCSymbolRefExpr::VK_None; + return AArch64MCExpr::None; case LLVMDisassembler_VariantKind_ARM64_PAGE: - return MCSymbolRefExpr::VK_PAGE; + return AArch64MCExpr::M_PAGE; case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: - return MCSymbolRefExpr::VK_PAGEOFF; + return AArch64MCExpr::M_PAGEOFF; case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: - return MCSymbolRefExpr::VK_GOTPAGE; + return AArch64MCExpr::M_GOTPAGE; case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: - return MCSymbolRefExpr::VK_GOTPAGEOFF; + return AArch64MCExpr::M_GOTPAGEOFF; case LLVMDisassembler_VariantKind_ARM64_TLVP: - return MCSymbolRefExpr::VK_TLVPPAGE; + return AArch64MCExpr::M_TLVPPAGE; case LLVMDisassembler_VariantKind_ARM64_TLVOFF: - return MCSymbolRefExpr::VK_TLVPPAGEOFF; + return AArch64MCExpr::M_TLVPPAGEOFF; default: llvm_unreachable("bad LLVMDisassembler_VariantKind"); } @@ -170,9 +171,9 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( if (SymbolicOp.AddSymbol.Name) { StringRef Name(SymbolicOp.AddSymbol.Name); MCSymbol *Sym = Ctx.getOrCreateSymbol(Name); - MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); - if (Variant != MCSymbolRefExpr::VK_None) - Add = MCSymbolRefExpr::create(Sym, Variant, Ctx); + auto Spec = getMachOSpecifier(SymbolicOp.VariantKind); + if (Spec != AArch64MCExpr::None) + Add = MCSymbolRefExpr::create(Sym, Spec, Ctx); else Add = MCSymbolRefExpr::create(Sym, Ctx); } else { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index a509edf160d32..fa72cbf032cdf 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -117,13 +117,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, bool IsNC = AArch64MCExpr::isNotChecked(RefKind); assert((!Target.getSymA() || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_PLT || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOTPCREL) && - "Should only be expression-level modifiers here"); - - assert((!Target.getSymB() || - Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) && + getSpecifier(Target.getSymA()) == AArch64MCExpr::None || + getSpecifier(Target.getSymA()) == AArch64MCExpr::VK_PLT || + getSpecifier(Target.getSymA()) == 
AArch64MCExpr::VK_GOTPCREL) && "Should only be expression-level modifiers here"); switch (SymLoc) { @@ -147,7 +143,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case FK_Data_2: return R_CLS(PREL16); case FK_Data_4: { - return Target.getAccessVariant() == MCSymbolRefExpr::VK_PLT + return AArch64MCExpr::Specifier(Target.getAccessVariant()) == + AArch64MCExpr::VK_PLT ? R_CLS(PLT32) : R_CLS(PREL32); } @@ -258,8 +255,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case FK_Data_2: return R_CLS(ABS16); case FK_Data_4: - return (!IsILP32 && - Target.getAccessVariant() == MCSymbolRefExpr::VK_GOTPCREL) + return (!IsILP32 && AArch64MCExpr::Specifier(Target.getAccessVariant()) == + AArch64MCExpr::VK_GOTPCREL) ? ELF::R_AARCH64_GOTPCREL32 : R_CLS(ABS32); case FK_Data_8: { @@ -554,8 +551,8 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val, if ((Val.getRefKind() & AArch64MCExpr::VK_GOT) == AArch64MCExpr::VK_GOT) return true; - return is_contained({MCSymbolRefExpr::VK_GOTPCREL, MCSymbolRefExpr::VK_PLT}, - Val.getAccessVariant()); + return is_contained({AArch64MCExpr::VK_GOTPCREL, AArch64MCExpr::VK_PLT}, + AArch64MCExpr::Specifier(Val.getAccessVariant())); } std::unique_ptr diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 9ff53631a995e..8cffd9ce557db 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "AArch64MCAsmInfo.h" +#include "MCTargetDesc/AArch64MCExpr.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCStreamer.h" @@ -30,19 +31,27 @@ static cl::opt AsmWriterVariant( cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); -const MCAsmInfo::VariantKindDesc variantKindDescs[] = { +const MCAsmInfo::AtSpecifier COFFAtSpecifiers[] = { {MCSymbolRefExpr::VK_COFF_IMGREL32, "IMGREL"}, - {MCSymbolRefExpr::VK_GOT, "GOT"}, - {MCSymbolRefExpr::VK_GOTPAGE, "GOTPAGE"}, - {MCSymbolRefExpr::VK_GOTPAGEOFF, "GOTPAGEOFF"}, - {MCSymbolRefExpr::VK_GOTPCREL, "GOTPCREL"}, - {MCSymbolRefExpr::VK_PAGE, "PAGE"}, - {MCSymbolRefExpr::VK_PAGEOFF, "PAGEOFF"}, - {MCSymbolRefExpr::VK_PLT, "PLT"}, - {MCSymbolRefExpr::VK_TLVP, "TLVP"}, - {MCSymbolRefExpr::VK_TLVPPAGE, "TLVPPAGE"}, - {MCSymbolRefExpr::VK_TLVPPAGEOFF, "TLVPPAGEOFF"}, {MCSymbolRefExpr::VK_WEAKREF, "WEAKREF"}, + {AArch64MCExpr::M_PAGEOFF, "PAGEOFF"}, +}; + +const MCAsmInfo::AtSpecifier ELFAtSpecifiers[] = { + {AArch64MCExpr::VK_GOT, "GOT"}, + {AArch64MCExpr::VK_GOTPCREL, "GOTPCREL"}, + {AArch64MCExpr::VK_PLT, "PLT"}, +}; + +const MCAsmInfo::AtSpecifier MachOAtSpecifiers[] = { + {AArch64MCExpr::M_GOT, "GOT"}, + {AArch64MCExpr::M_GOTPAGE, "GOTPAGE"}, + {AArch64MCExpr::M_GOTPAGEOFF, "GOTPAGEOFF"}, + {AArch64MCExpr::M_PAGE, "PAGE"}, + {AArch64MCExpr::M_PAGEOFF, "PAGEOFF"}, + {AArch64MCExpr::M_TLVP, "TLVP"}, + {AArch64MCExpr::M_TLVPPAGE, "TLVPPAGE"}, + {AArch64MCExpr::M_TLVPPAGEOFF, "TLVPPAGEOFF"}, }; AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) { @@ -64,7 +73,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) { ExceptionsType = ExceptionHandling::DwarfCFI; - initializeVariantKinds(variantKindDescs); + initializeVariantKinds(MachOAtSpecifiers); } const MCExpr 
*AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( @@ -75,7 +84,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( // version. MCContext &Context = Streamer.getContext(); const MCExpr *Res = - MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context); + MCSymbolRefExpr::create(Sym, AArch64MCExpr::M_GOT, Context); MCSymbol *PCSym = Context.createTempSymbol(); Streamer.emitLabel(PCSym); const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context); @@ -115,7 +124,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { HasIdentDirective = true; - initializeVariantKinds(variantKindDescs); + initializeVariantKinds(ELFAtSpecifiers); } AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() { @@ -134,7 +143,7 @@ AArch64MCAsmInfoMicrosoftCOFF::AArch64MCAsmInfoMicrosoftCOFF() { ExceptionsType = ExceptionHandling::WinEH; WinEHEncodingType = WinEH::EncodingType::Itanium; - initializeVariantKinds(variantKindDescs); + initializeVariantKinds(COFFAtSpecifiers); } AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { @@ -153,5 +162,5 @@ AArch64MCAsmInfoGNUCOFF::AArch64MCAsmInfoGNUCOFF() { ExceptionsType = ExceptionHandling::WinEH; WinEHEncodingType = WinEH::EncodingType::Itanium; - initializeVariantKinds(variantKindDescs); + initializeVariantKinds(COFFAtSpecifiers); } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 50abaa9861538..3128f9d10a4bc 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -25,6 +25,7 @@ class AArch64MCExpr : public MCTargetExpr { public: enum Specifier : uint16_t { // clang-format off + None = 0, // Symbol locations specifying (roughly speaking) what calculation should be // performed to construct the final address for the relocated // symbol. E.g. direct, via the GOT, ... @@ -120,6 +121,20 @@ class AArch64MCExpr : public MCTargetExpr { VK_SECREL_LO12 = VK_SECREL | VK_PAGEOFF, VK_SECREL_HI12 = VK_SECREL | VK_HI12, + // ELF relocation specifiers in data directives: + VK_PLT = 0x400, + VK_GOTPCREL, + + // Mach-O @ relocation specifiers: + M_GOT, + M_GOTPAGE, + M_GOTPAGEOFF, + M_PAGE, + M_PAGEOFF, + M_TLVP, + M_TLVPPAGE, + M_TLVPPAGEOFF, + VK_INVALID = 0xfff // clang-format on }; @@ -214,6 +229,13 @@ class AArch64AuthMCExpr final : public AArch64MCExpr { return E->getSpecifier() == VK_AUTH || E->getSpecifier() == VK_AUTHADDR; } }; + +// Getter for the legacy representation that encodes the relocation specifier in +// MCSymbolRefExpr::SubclassData. 
+static inline AArch64MCExpr::Specifier +getSpecifier(const MCSymbolRefExpr *SRE) { + return AArch64MCExpr::Specifier(SRE->getKind()); +} } // end namespace llvm #endif diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 3deee6548f279..c3a6174131806 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -66,12 +66,12 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( return true; case FK_Data_4: Log2Size = Log2_32(4); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + if (getSpecifier(Sym) == AArch64MCExpr::M_GOT) RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); return true; case FK_Data_8: Log2Size = Log2_32(8); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + if (getSpecifier(Sym) == AArch64MCExpr::M_GOT) RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); return true; case AArch64::fixup_aarch64_add_imm12: @@ -81,34 +81,34 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( case AArch64::fixup_aarch64_ldst_imm12_scale8: case AArch64::fixup_aarch64_ldst_imm12_scale16: Log2Size = Log2_32(4); - switch (Sym->getKind()) { + switch (AArch64MCExpr::Specifier(getSpecifier(Sym))) { default: return false; - case MCSymbolRefExpr::VK_PAGEOFF: + case AArch64MCExpr::M_PAGEOFF: RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); return true; - case MCSymbolRefExpr::VK_GOTPAGEOFF: + case AArch64MCExpr::M_GOTPAGEOFF: RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); return true; - case MCSymbolRefExpr::VK_TLVPPAGEOFF: + case AArch64MCExpr::M_TLVPPAGEOFF: RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); return true; } case AArch64::fixup_aarch64_pcrel_adrp_imm21: Log2Size = Log2_32(4); // This encompasses the relocation for the whole 21-bit value. - switch (Sym->getKind()) { + switch (getSpecifier(Sym)) { default: Asm.getContext().reportError(Fixup.getLoc(), "ADR/ADRP relocations must be GOT relative"); return false; - case MCSymbolRefExpr::VK_PAGE: + case AArch64MCExpr::M_PAGE: RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); return true; - case MCSymbolRefExpr::VK_GOTPAGE: + case AArch64MCExpr::M_GOTPAGE: RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); return true; - case MCSymbolRefExpr::VK_TLVPPAGE: + case AArch64MCExpr::M_TLVPPAGE: RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); return true; } @@ -221,7 +221,7 @@ void AArch64MachObjectWriter::recordRelocation( // Check for "_foo@got - .", which comes through here as: // Ltmp0: // ... _foo@got - Ltmp0 - if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && + if (getSpecifier(Target.getSymA()) == AArch64MCExpr::M_GOT && Asm.getSymbolOffset(*B) == Asm.getFragmentOffset(*Fragment) + Fixup.getOffset()) { // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. @@ -232,7 +232,7 @@ void AArch64MachObjectWriter::recordRelocation( MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; - } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None) { + } else if (getSpecifier(Target.getSymA()) != AArch64MCExpr::None) { // Otherwise, neither symbol can be modified. 
Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 4213edafa9a0f..d679f5f621e0a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -61,8 +61,8 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType( FixupKind = FK_PCRel_4; } - auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None - : Target.getSymA()->getKind(); + auto Modifier = + Target.isAbsolute() ? AArch64MCExpr::None : Target.getSymA()->getKind(); const MCExpr *Expr = Fixup.getValue(); if (const AArch64MCExpr *A64E = dyn_cast(Expr)) { diff --git a/llvm/test/MC/AArch64/coff-relocations.s b/llvm/test/MC/AArch64/coff-relocations.s index 2370fd9fb4365..a8a466c8b5818 100644 --- a/llvm/test/MC/AArch64/coff-relocations.s +++ b/llvm/test/MC/AArch64/coff-relocations.s @@ -7,6 +7,8 @@ // RUN: llvm-objdump -s %t.obj | FileCheck %s --check-prefix=DATA // RUN: llvm-objdump -s %t-ec.obj | FileCheck %s --check-prefix=DATA +# RUN: not llvm-mc -triple=aarch64-windows -filetype=obj %s --defsym ERR=1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: + // IMAGE_REL_ARM64_ADDR32 .Linfo_foo: .asciz "foo" @@ -121,3 +123,8 @@ tbz x0, #0, target // DATA: Contents of section .rdata: // DATA-NEXT: 0000 30000000 08000000 + +.ifdef ERR +# ERR: [[#@LINE+1]]:12: error: invalid variant 'plt' +.long func@plt +.endif diff --git a/llvm/test/MC/AArch64/data-directive-specifier.s b/llvm/test/MC/AArch64/data-directive-specifier.s index 3a8665126097a..c4ca5d760b41c 100644 --- a/llvm/test/MC/AArch64/data-directive-specifier.s +++ b/llvm/test/MC/AArch64/data-directive-specifier.s @@ -1,5 +1,6 @@ # RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-readobj -r - | FileCheck %s # RUN: not llvm-mc -triple=aarch64 -filetype=obj %s --defsym ERR=1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: +# RUN: not llvm-mc -triple=aarch64 -filetype=obj %s --defsym OBJERR=1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=OBJERR --implicit-check-not=error: .globl g g: @@ -31,14 +32,21 @@ data1: .word extern@gotpcrel+4 .word extern@GOTPCREL-5 +## Test parse-time errors .ifdef ERR -# ERR: [[#@LINE+1]]:7: error: symbol 'und' can not be undefined in a subtraction expression +# ERR: [[#@LINE+1]]:14: error: invalid variant 'pageoff' +.word extern@pageoff +.endif + +## Test assemble-time errors +.ifdef OBJERR +# OBJERR: [[#@LINE+1]]:7: error: symbol 'und' can not be undefined in a subtraction expression .word extern@plt - und .quad g@plt - . .word extern@gotpcrel - . 
-# ERR: [[#@LINE+1]]:7: error: symbol 'und' can not be undefined in a subtraction expression +# OBJERR: [[#@LINE+1]]:7: error: symbol 'und' can not be undefined in a subtraction expression .word extern@gotpcrel - und .endif From 1d7bd3bc5c4f085dfa4443ea3fbab46356ab0a8e Mon Sep 17 00:00:00 2001 From: jobhdez Date: Fri, 4 Apr 2025 20:51:56 -0700 Subject: [PATCH 0721/1029] [libc] Remove extra parenthesis in sin.cpp comments (#134477) --- libc/src/math/generic/sin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/math/generic/sin.cpp b/libc/src/math/generic/sin.cpp index 4a58dcf4b173f..ba370d4bea4da 100644 --- a/libc/src/math/generic/sin.cpp +++ b/libc/src/math/generic/sin.cpp @@ -158,7 +158,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { Float128 sin_k_f128 = get_sin_k(k); Float128 cos_k_f128 = get_sin_k(k + 64); - // sin(x) = sin((k * pi/128 + u) + // sin(x) = sin(k * pi/128 + u) // = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128) Float128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u), fputil::quick_mul(cos_k_f128, sin_u)); From b6a96183015e99ec554f7a6b8c35a0e679b8ba39 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 4 Apr 2025 21:04:05 -0700 Subject: [PATCH 0722/1029] [MCParser] Rename confusing variable names https://reviews.llvm.org/D24047 added `IsAtStartOfStatement` to MCAsmLexer, while its subclass AsmLexer had a variable of the same name. The assignment in `UnLex` is unnecessary, which is now removed. 60b403e75cd25a0c76aaaf4e6b176923acf49443 (2019) named the result `parseStatement` `Parsed`. `HasError` is a clearer name. --- llvm/include/llvm/MC/MCParser/AsmLexer.h | 7 +++---- llvm/lib/MC/MCParser/AsmParser.cpp | 7 +++---- llvm/lib/MC/MCParser/MasmParser.cpp | 19 +++++++++---------- .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 2 -- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/MC/MCParser/AsmLexer.h b/llvm/include/llvm/MC/MCParser/AsmLexer.h index bf4aa09df613f..ba07837bcddf3 100644 --- a/llvm/include/llvm/MC/MCParser/AsmLexer.h +++ b/llvm/include/llvm/MC/MCParser/AsmLexer.h @@ -53,7 +53,7 @@ class AsmLexer { const MCAsmInfo &MAI; bool IsAtStartOfLine = true; - bool AtStartOfStatement = true; + bool JustConsumedEOL = true; bool IsPeeking = false; bool EndStatementAtEOF = true; @@ -91,7 +91,7 @@ class AsmLexer { const AsmToken &Lex() { assert(!CurTok.empty()); // Mark if we parsing out a EndOfStatement. - AtStartOfStatement = CurTok.front().getKind() == AsmToken::EndOfStatement; + JustConsumedEOL = CurTok.front().getKind() == AsmToken::EndOfStatement; CurTok.erase(CurTok.begin()); // LexToken may generate multiple tokens via UnLex but will always return // the first one. Place returned value at head of CurTok vector. @@ -103,11 +103,10 @@ class AsmLexer { } void UnLex(AsmToken const &Token) { - AtStartOfStatement = false; CurTok.insert(CurTok.begin(), Token); } - bool isAtStartOfStatement() { return AtStartOfStatement; } + bool justConsumedEOL() { return JustConsumedEOL; } StringRef LexUntilEndOfStatement(); diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 71f2bdbdf0b16..d743c73ffcf10 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -981,20 +981,19 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { // While we have input, parse each statement. 
while (Lexer.isNot(AsmToken::Eof)) { ParseStatementInfo Info(&AsmStrRewrites); - bool Parsed = parseStatement(Info, nullptr); + bool HasError = parseStatement(Info, nullptr); // If we have a Lexer Error we are on an Error Token. Load in Lexer Error // for printing ErrMsg via Lex() only if no (presumably better) parser error // exists. - if (Parsed && !hasPendingError() && Lexer.getTok().is(AsmToken::Error)) { + if (HasError && !hasPendingError() && Lexer.getTok().is(AsmToken::Error)) Lex(); - } // parseStatement returned true so may need to emit an error. printPendingErrors(); // Skipping to the next line if needed. - if (Parsed && !getLexer().isAtStartOfStatement()) + if (HasError && !getLexer().justConsumedEOL()) eatToEndOfStatement(); } diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 64a97ecf68277..874bf187d5242 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -1100,6 +1100,7 @@ bool MasmParser::expandMacros() { const AsmToken &MasmParser::Lex(ExpandKind ExpandNextToken) { if (Lexer.getTok().is(AsmToken::Error)) Error(Lexer.getErrLoc(), Lexer.getErr()); + bool StartOfStatement = false; // if it's a end of statement with a comment in it if (getTok().is(AsmToken::EndOfStatement)) { @@ -1107,10 +1108,10 @@ const AsmToken &MasmParser::Lex(ExpandKind ExpandNextToken) { if (!getTok().getString().empty() && getTok().getString().front() != '\n' && getTok().getString().front() != '\r' && MAI.preserveAsmComments()) Out.addExplicitComment(Twine(getTok().getString())); + StartOfStatement = true; } const AsmToken *tok = &Lexer.Lex(); - bool StartOfStatement = Lexer.isAtStartOfStatement(); while (ExpandNextToken == ExpandMacros && tok->is(AsmToken::Identifier)) { if (StartOfStatement) { @@ -1203,20 +1204,19 @@ bool MasmParser::Run(bool NoInitialTextSection, bool NoFinalize) { Lex(); ParseStatementInfo Info(&AsmStrRewrites); - bool Parsed = parseStatement(Info, nullptr); + bool HasError = parseStatement(Info, nullptr); // If we have a Lexer Error we are on an Error Token. Load in Lexer Error // for printing ErrMsg via Lex() only if no (presumably better) parser error // exists. - if (Parsed && !hasPendingError() && Lexer.getTok().is(AsmToken::Error)) { + if (HasError && !hasPendingError() && Lexer.getTok().is(AsmToken::Error)) Lex(); - } // parseStatement returned true so may need to emit an error. printPendingErrors(); // Skipping to the next line if needed. - if (Parsed && !getLexer().isAtStartOfStatement()) + if (HasError && !getLexer().justConsumedEOL()) eatToEndOfStatement(); } @@ -2802,9 +2802,9 @@ bool MasmParser::handleMacroInvocation(const MCAsmMacro *M, SMLoc NameLoc) { SmallVector AsmStrRewrites; while (Lexer.isNot(AsmToken::Eof)) { ParseStatementInfo Info(&AsmStrRewrites); - bool Parsed = parseStatement(Info, nullptr); + bool HasError = parseStatement(Info, nullptr); - if (!Parsed && Info.ExitValue) { + if (!HasError && Info.ExitValue) { ExitValue = std::move(*Info.ExitValue); break; } @@ -2812,15 +2812,14 @@ bool MasmParser::handleMacroInvocation(const MCAsmMacro *M, SMLoc NameLoc) { // If we have a Lexer Error we are on an Error Token. Load in Lexer Error // for printing ErrMsg via Lex() only if no (presumably better) parser error // exists. - if (Parsed && !hasPendingError() && Lexer.getTok().is(AsmToken::Error)) { + if (HasError && !hasPendingError() && Lexer.getTok().is(AsmToken::Error)) Lex(); - } // parseStatement returned true so may need to emit an error. 
     printPendingErrors();
 
     // Skipping to the next line if needed.
-    if (Parsed && !getLexer().isAtStartOfStatement())
+    if (HasError && !getLexer().justConsumedEOL())
       eatToEndOfStatement();
   }
 
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index cd38ac85dac7a..8221679f1969c 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1097,8 +1097,6 @@ class X86AsmParser : public MCTargetAsmParser {
              bool MatchingInlineAsm = false) {
     MCAsmParser &Parser = getParser();
     if (MatchingInlineAsm) {
-      if (!getLexer().isAtStartOfStatement())
-        Parser.eatToEndOfStatement();
       return false;
     }
     return Parser.Error(L, Msg, Range);

From a07b37475ccbf6b718fdf64fd6f0756ea1958852 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 4 Apr 2025 21:15:42 -0700
Subject: [PATCH 0723/1029] [LoongArch] Simplify evaluateAsRelocatableImpl

Similar to RISCV
---
 llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
index ca3e401d542da..dceb62dcf4604 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp
@@ -51,7 +51,7 @@ bool LoongArchMCExpr::evaluateAsRelocatableImpl(
   Res.setSpecifier(specifier);
   // Custom fixup types are not valid with symbol difference expressions.
-  return Res.getSymB() ? specifier == VK_None : true;
+  return !Res.getSubSym();
 }
 
 void LoongArchMCExpr::visitUsedExpr(MCStreamer &Streamer) const {

From f47034cbe5c02b748742c733cf453b3b907687e5 Mon Sep 17 00:00:00 2001
From: Changpeng Fang
Date: Fri, 4 Apr 2025 21:19:43 -0700
Subject: [PATCH 0724/1029] AMDGPU: Add round-to-odd rounding during f64 to
 bf16 conversion (#133995)

f64 -> bf16 conversion can be lowered to f64 -> f32 followed by
f32 -> bf16:
  v_cvt_f32_f64_e32 v0, v[0:1]
  v_cvt_pk_bf16_f32 v0, v0, s0

Both conversion instructions do round-to-even rounding, so we have a
double rounding issue that may generate incorrect results in some data
ranges. We need to add round-to-odd rounding to the f64 -> f32 step to
avoid the double rounding.

NOTE: we have the same issue with the f64 -> f16 conversion.
Will add round-to-odd rounding for it in a separate patch, which fixes SWDEV-523856 --------- Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 36 ++++++---- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 5 -- llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 72 ++++++++++++++++++-- 3 files changed, 89 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a583a5cb990e7..21f8c7cfeec1f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -911,8 +911,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::i1, Promote); if (Subtarget->hasBF16ConversionInsts()) { - setOperationAction(ISD::FP_ROUND, MVT::v2bf16, Legal); - setOperationAction(ISD::FP_ROUND, MVT::bf16, Legal); + setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } @@ -6888,23 +6887,34 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, } SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f16 && - "Do not know how to custom lower FP_ROUND for non-f16 type"); - SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); - if (SrcVT != MVT::f64) - return Op; - - // TODO: Handle strictfp - if (Op.getOpcode() != ISD::FP_ROUND) + if (SrcVT.getScalarType() != MVT::f64) return Op; + EVT DstVT = Op.getValueType(); SDLoc DL(Op); + if (DstVT == MVT::f16) { + // TODO: Handle strictfp + if (Op.getOpcode() != ISD::FP_ROUND) + return Op; + + SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); + return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); + } + + assert(DstVT.getScalarType() == MVT::bf16 && + "custom lower FP_ROUND for f16 or bf16"); + assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal"); - SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); - return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); + // Round-inexact-to-odd f64 to f32, then do the final rounding using the + // hardware f32 -> bf16 instruction. + EVT F32VT = SrcVT.isVector() ? 
SrcVT.changeVectorElementType(MVT::f32) : + MVT::f32; + SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG); + return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod, + DAG.getTargetConstant(0, DL, MVT::i32)); } SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 9feb5df2f9203..8686a85620a17 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1443,16 +1443,11 @@ let SubtargetPredicate = HasBF16ConversionInsts in { } def : GCNPat<(v2bf16 (bf16_fpround v2f32:$src)), (V_CVT_PK_BF16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>; - def : GCNPat<(v2bf16 (bf16_fpround v2f64:$src)), - (V_CVT_PK_BF16_F32_e64 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub0_sub1)), - 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub2_sub3)))>; def : GCNPat<(v2bf16 (build_vector (bf16 (bf16_fpround (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), (bf16 (bf16_fpround (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))), (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>; def : GCNPat<(bf16 (bf16_fpround (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), (V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>; - def : GCNPat<(bf16 (bf16_fpround (f64 (VOP3Mods f64:$src0, i32:$src0_modifiers)))), - (V_CVT_PK_BF16_F32_e64 0, (f32 (V_CVT_F32_F64_e64 $src0_modifiers, $src0)), 0, (f32 (IMPLICIT_DEF)))>; } class Cvt_Scale_Sr_F32ToBF16F16_Pat : GCNPat< diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 4c01e583713a7..3be911ab9e7f4 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -153,9 +153,34 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; ; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v: ; GFX-950: ; %bb.0: -; GFX-950-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] -; GFX-950-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX-950-NEXT: v_mov_b32_e32 v4, v3 +; GFX-950-NEXT: v_and_b32_e32 v3, 0x7fffffff, v4 +; GFX-950-NEXT: v_mov_b32_e32 v5, v1 +; GFX-950-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v1 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], v[2:3], v[6:7] +; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[6:7] +; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v8 +; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v2, v1, v2 +; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX-950-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX-950-NEXT: s_brev_b32 s4, 1 +; GFX-950-NEXT: v_and_or_b32 v4, v4, s4, v1 +; GFX-950-NEXT: v_and_b32_e32 v1, 0x7fffffff, v5 +; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v6 +; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], v[0:1], v[2:3] +; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3] +; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0 +; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX-950-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4 ; GFX-950-NEXT: ; return to shader part epilog %res = fptrunc <2 x double> %src to <2 
x bfloat> %cast = bitcast <2 x bfloat> %res to float @@ -347,7 +372,18 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { ; ; GFX-950-LABEL: fptrunc_f64_to_bf16: ; GFX-950: ; %bb.0: ; %entry -; GFX-950-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX-950-NEXT: s_brev_b32 s0, 1 +; GFX-950-NEXT: v_and_or_b32 v0, v1, s0, v0 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX-950-NEXT: flat_store_short v[2:3], v0 ; GFX-950-NEXT: s_endpgm @@ -385,7 +421,19 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { ; ; GFX-950-LABEL: fptrunc_f64_to_bf16_neg: ; GFX-950: ; %bb.0: ; %entry -; GFX-950-NEXT: v_cvt_f32_f64_e64 v0, -v[0:1] +; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: s_brev_b32 s4, 1 +; GFX-950-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX-950-NEXT: v_and_or_b32 v0, v6, s4, v0 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX-950-NEXT: flat_store_short v[2:3], v0 ; GFX-950-NEXT: s_endpgm @@ -424,7 +472,19 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { ; ; GFX-950-LABEL: fptrunc_f64_to_bf16_abs: ; GFX-950: ; %bb.0: ; %entry -; GFX-950-NEXT: v_cvt_f32_f64_e64 v0, |v[0:1]| +; GFX-950-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0 +; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc +; GFX-950-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX-950-NEXT: s_brev_b32 s0, 1 +; GFX-950-NEXT: v_and_or_b32 v0, v6, s0, v0 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX-950-NEXT: flat_store_short v[2:3], v0 ; GFX-950-NEXT: s_endpgm From a1935fd3809772c06f9a09fa151181642ae92b20 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Fri, 4 Apr 2025 22:10:19 -0700 Subject: [PATCH 0725/1029] [clang] Remove unused SourceManager.cpp includes, NFC (trying out clangd) --- clang/lib/Basic/SourceManager.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b1f2180c1d462..a78ffc1e90ebe 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -24,15 +24,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Allocator.h" -#include "llvm/Support/AutoConvert.h" #include "llvm/Support/Capacity.h" #include 
"llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #include #include From f3e6473df46fd920e09e06e57a5549eb8e3a8bd3 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 4 Apr 2025 22:17:21 -0700 Subject: [PATCH 0726/1029] MCValue: reduce getSymB uses The MCValue::SymB MCSymbolRefExpr member might be replaced with a MCSymbol in the future. Reduce direct access. --- llvm/lib/MC/MCAssembler.cpp | 8 +++----- llvm/lib/MC/MCValue.cpp | 5 +++-- .../Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 3 ++- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 2 +- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 3 ++- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 835fa8af4cf8f..2c9cd0e5b626e 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -201,11 +201,9 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, if (Sym.isDefined()) Value += getSymbolOffset(Sym); } - if (const MCSymbolRefExpr *B = Target.getSymB()) { - const MCSymbol &Sym = B->getSymbol(); - if (Sym.isDefined()) - Value -= getSymbolOffset(Sym); - } + if (const MCSymbol *Sub = Target.getSubSym()) + if (Sub->isDefined()) + Value -= getSymbolOffset(*Sub); bool ShouldAlignPC = FixupFlags & MCFixupKindInfo::FKF_IsAlignedDownTo32Bits; assert((ShouldAlignPC ? IsPCRel : true) && diff --git a/llvm/lib/MC/MCValue.cpp b/llvm/lib/MC/MCValue.cpp index a90ba4eaa5f7c..8b2edc9ac57ec 100644 --- a/llvm/lib/MC/MCValue.cpp +++ b/llvm/lib/MC/MCValue.cpp @@ -9,6 +9,7 @@ #include "llvm/MC/MCValue.h" #include "llvm/Config/llvm-config.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSymbol.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -28,9 +29,9 @@ void MCValue::print(raw_ostream &OS) const { OS << *getSymA(); - if (getSymB()) { + if (auto *B = getSubSym()) { OS << " - "; - OS << *getSymB(); + B->print(OS, nullptr); } if (getConstant()) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index c83a18746e060..f22b208f8dffc 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -498,7 +498,8 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, llvm_unreachable("unsupported fixup size"); } MCValue A = MCValue::get(Target.getSymA(), nullptr, Target.getConstant()); - MCValue B = MCValue::get(Target.getSymB()); + MCValue B = MCValue::get( + MCSymbolRefExpr::create(Target.getSubSym(), Asm.getContext())); auto FA = MCFixup::create(Fixup.getOffset(), nullptr, std::get<0>(FK)); auto FB = MCFixup::create(Fixup.getOffset(), nullptr, std::get<1>(FK)); auto &Assembler = const_cast(Asm); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 6c246176a05e8..a8922c3f9f2e8 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2806,7 +2806,7 @@ bool RISCVAsmParser::isSymbolDiff(const MCExpr *Expr) { MCValue Res; if (Expr->evaluateAsRelocatable(Res, nullptr)) { return Res.getRefKind() == 
RISCVMCExpr::VK_None && Res.getSymA() && - Res.getSymB(); + Res.getSubSym(); } return false; } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index ac0f7421664c5..f208618814142 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -657,7 +657,8 @@ bool RISCVAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, llvm_unreachable("unsupported fixup size"); } MCValue A = MCValue::get(Target.getSymA(), nullptr, Target.getConstant()); - MCValue B = MCValue::get(Target.getSymB()); + MCValue B = MCValue::get( + MCSymbolRefExpr::create(Target.getSubSym(), Asm.getContext())); auto FA = MCFixup::create( Fixup.getOffset(), nullptr, static_cast(FirstLiteralRelocationKind + TA)); From 44923d8631fb28b4de54d4210762f256c3894cef Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 5 Apr 2025 06:46:16 +0000 Subject: [PATCH 0727/1029] [Github] Bump Github Runner Version in Agent Containers The Github runner version got bumped recently and it would be good to keep this up to date. Also debugging an issue where Github ARC is failing to create new pods and trying to see if it might be related to outdated versions. --- .../workflows/containers/github-action-ci-windows/Dockerfile | 2 +- .github/workflows/containers/github-action-ci/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile index 9a1fab694c9df..a46e55f2b09d9 100644 --- a/.github/workflows/containers/github-action-ci-windows/Dockerfile +++ b/.github/workflows/containers/github-action-ci-windows/Dockerfile @@ -108,7 +108,7 @@ RUN choco install -y handle RUN pip3 install pywin32 buildbot-worker==2.8.4 -ARG RUNNER_VERSION=2.322.0 +ARG RUNNER_VERSION=2.323.0 ENV RUNNER_VERSION=$RUNNER_VERSION RUN powershell -Command \ diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index 514b86ced7a87..e1e9827737adb 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -95,7 +95,7 @@ WORKDIR /home/gha FROM ci-container as ci-container-agent -ENV GITHUB_RUNNER_VERSION=2.322.0 +ENV GITHUB_RUNNER_VERSION=2.323.0 RUN mkdir actions-runner && \ cd actions-runner && \ From fb96d5171ee4cba7b2cb9cceea755aa4bd51b3c4 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 5 Apr 2025 07:24:36 +0000 Subject: [PATCH 0728/1029] Reapply "[CI] Fix Monolithic Linux Build in Ubuntu 24.04 (#133628)" This reverts commit d72be157823d41e7eaf457cc37ea99c07431a25c. Now that the container version got bumped, we need to reland this. 
--- .ci/monolithic-linux.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 4b6e56b4a4eda..ec7a85bc5f15f 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -53,9 +53,9 @@ targets="${2}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" echo "--- cmake" -pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt -pip install -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt -pip install -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt +pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt +pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt +pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ From 70a20757e8fff0a56cd7f95c2854b0fa68e7089c Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sat, 5 Apr 2025 10:45:23 +0300 Subject: [PATCH 0729/1029] [clang] Update C++ DR status page --- clang/www/cxx_dr_status.html | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 138f12facf0ad..eeb1d9d74bf00 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -17242,7 +17242,7 @@

C++ defect report implementation status
     2898
-    open
+    tentatively ready
     Clarify implicit conversion sequence from cv T to T
     Not resolved

@@ -17750,7 +17750,7 @@ C++ defect report implementation status

     2982
-    open
+    tentatively ready
     Deduction in type-constraints
     Not resolved

@@ -17780,7 +17780,7 @@ C++ defect report implementation status

     2987
-    open
+    tentatively ready
     Remove dilapidated wording from static_cast
     Not resolved

@@ -17945,6 +17945,36 @@ C++ defect report implementation status

     open
     Comma-delimited vs. comma-separated output for #embed
     Not resolved
+
+    3015
+    open
+    Handling of header-names for #include and #embed
+    Not resolved
+
+    3016
+    open
+    Satisfying the syntactic requirements of #include and #embed
+    Not resolved
+
+    3017
+    open
+    Commas in controlling expression of conditional inclusion
+    Not resolved
+
+    3018
+    open
+    Validity of defined in __has_embed
+    Not resolved
+
+    3019
+    open
+    Restrictions on character sequences in header-names
+    Not resolved

From a64191053136078761a72fe800feedb8bcc70d31 Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Sat, 5 Apr 2025 18:01:36 +0900 Subject: [PATCH 0730/1029] [clang][CGObjC] Remove unused ExternalProtocolPtrTy (NFC) (#133870) This function was previously used to get a type to the protocol that was used to bitcast the initializer of GenerateProtocol. This bitcast has later been removed (thanks to opaque pointers), but the member was left behind. History: - 020de3254acc3 used ExternalProtocolPtrTy - 34ee69b4ce662 removes the bitcast Also technically part of #123569 --- clang/lib/CodeGen/CGObjCMac.cpp | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 98f988dfecf84..1f11347b81411 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -184,25 +184,6 @@ class ObjCCommonTypesHelper { /// SelectorPtrTy - LLVM type for selector handles (typeof(SEL)) llvm::PointerType *SelectorPtrTy; -private: - /// ProtocolPtrTy - LLVM type for external protocol handles - /// (typeof(Protocol)) - llvm::Type *ExternalProtocolPtrTy; - -public: - llvm::Type *getExternalProtocolPtrTy() { - if (!ExternalProtocolPtrTy) { - // FIXME: It would be nice to unify this with the opaque type, so that the - // IR comes out a bit cleaner. - CodeGen::CodeGenTypes &Types = CGM.getTypes(); - ASTContext &Ctx = CGM.getContext(); - llvm::Type *T = Types.ConvertType(Ctx.getObjCProtoType()); - ExternalProtocolPtrTy = llvm::PointerType::getUnqual(T); - } - - return ExternalProtocolPtrTy; - } - // SuperCTy - clang type for struct objc_super. QualType SuperCTy; // SuperPtrCTy - clang type for struct objc_super *. @@ -5636,8 +5617,7 @@ CGObjCNonFragileABIMac::CGObjCNonFragileABIMac(CodeGen::CodeGenModule &cgm) /* *** */ ObjCCommonTypesHelper::ObjCCommonTypesHelper(CodeGen::CodeGenModule &cgm) - : VMContext(cgm.getLLVMContext()), CGM(cgm), - ExternalProtocolPtrTy(nullptr) { + : VMContext(cgm.getLLVMContext()), CGM(cgm) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); unsigned ProgramAS = CGM.getDataLayout().getProgramAddressSpace(); From 75bbf768a4ae3d9f6cd45ab7448acf3792d94d82 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sat, 5 Apr 2025 10:10:44 +0100 Subject: [PATCH 0731/1029] [compiler-rt][rtsan] Linux's eventfd interception.
(#132836) --- compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp | 9 +++++++++ .../lib/rtsan/tests/rtsan_test_interceptors_posix.cpp | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 9d1a689a5a070..aed43fcd92249 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -1359,6 +1359,12 @@ INTERCEPTOR(int, timerfd_gettime, int fd, struct itimerspec *val) { __rtsan_notify_intercepted_call("timerfd_gettime"); return REAL(timerfd_gettime)(fd, val); } + +/* eventfd wrappers calls SYS_eventfd2 down the line */ +INTERCEPTOR(int, eventfd, unsigned int count, int flags) { + __rtsan_notify_intercepted_call("eventfd"); + return REAL(eventfd)(count, flags); +} #define RTSAN_MAYBE_INTERCEPT_INOTIFY_INIT INTERCEPT_FUNCTION(inotify_init) #define RTSAN_MAYBE_INTERCEPT_INOTIFY_INIT1 INTERCEPT_FUNCTION(inotify_init1) #define RTSAN_MAYBE_INTERCEPT_INOTIFY_ADD_WATCH \ @@ -1370,6 +1376,7 @@ INTERCEPTOR(int, timerfd_gettime, int fd, struct itimerspec *val) { INTERCEPT_FUNCTION(timerfd_settime) #define RTSAN_MAYBE_INTERCEPT_TIMERFD_GETTIME \ INTERCEPT_FUNCTION(timerfd_gettime) +#define RTSAN_MAYBE_INTERCEPT_EVENTFD INTERCEPT_FUNCTION(eventfd) #else #define RTSAN_MAYBE_INTERCEPT_INOTIFY_INIT #define RTSAN_MAYBE_INTERCEPT_INOTIFY_INIT1 @@ -1378,6 +1385,7 @@ INTERCEPTOR(int, timerfd_gettime, int fd, struct itimerspec *val) { #define RTSAN_MAYBE_INTERCEPT_TIMERFD_CREATE #define RTSAN_MAYBE_INTERCEPT_TIMERFD_SETTIME #define RTSAN_MAYBE_INTERCEPT_TIMERFD_GETTIME +#define RTSAN_MAYBE_INTERCEPT_EVENTFD #endif INTERCEPTOR(int, pipe, int pipefd[2]) { @@ -1644,6 +1652,7 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_TIMERFD_CREATE; RTSAN_MAYBE_INTERCEPT_TIMERFD_SETTIME; RTSAN_MAYBE_INTERCEPT_TIMERFD_GETTIME; + RTSAN_MAYBE_INTERCEPT_EVENTFD; INTERCEPT_FUNCTION(pipe); INTERCEPT_FUNCTION(mkfifo); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index f12df9ea90855..3a43f4f9c434c 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -44,6 +44,7 @@ #include #include #if SANITIZER_LINUX +#include #include #include #endif @@ -1677,6 +1678,12 @@ TEST(TestRtsanInterceptors, TimerfdGettimeDiesWhenRealtime) { ExpectRealtimeDeath(Func, "timerfd_gettime"); ExpectNonRealtimeSurvival(Func); } + +TEST(TestRtsanInterceptors, EventfdDiesWhenRealtime) { + auto Func = []() { eventfd(EFD_CLOEXEC, 0); }; + ExpectRealtimeDeath(Func, "eventfd"); + ExpectNonRealtimeSurvival(Func); +} #endif TEST(TestRtsanInterceptors, MkfifoDiesWhenRealtime) { From 6c9e098b397fa8802b701a3a4ac827476b33568e Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sat, 5 Apr 2025 10:30:45 +0100 Subject: [PATCH 0732/1029] [compiler-rt][rtsan] symlink/symlinkat interception. 
(#134168) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 13 +++++++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 16 ++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index aed43fcd92249..e8cea21ddf9aa 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -322,6 +322,17 @@ INTERCEPTOR(int, ftruncate64, int fd, off64_t length) { #define RTSAN_MAYBE_INTERCEPT_FTRUNCATE64 #endif +INTERCEPTOR(int, symlink, const char *target, const char *linkpath) { + __rtsan_notify_intercepted_call("symlink"); + return REAL(symlink)(target, linkpath); +} + +INTERCEPTOR(int, symlinkat, const char *target, int newdirfd, + const char *linkpath) { + __rtsan_notify_intercepted_call("symlinkat"); + return REAL(symlinkat)(target, newdirfd, linkpath); +} + // Streams INTERCEPTOR(FILE *, fopen, const char *path, const char *mode) { @@ -1523,6 +1534,8 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_READLINKAT; INTERCEPT_FUNCTION(unlink); INTERCEPT_FUNCTION(unlinkat); + INTERCEPT_FUNCTION(symlink); + INTERCEPT_FUNCTION(symlinkat); INTERCEPT_FUNCTION(truncate); INTERCEPT_FUNCTION(ftruncate); RTSAN_MAYBE_INTERCEPT_TRUNCATE64; diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index 3a43f4f9c434c..048da5858d665 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -897,6 +897,22 @@ TEST_F(RtsanOpenedFileTest, FtruncateDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +TEST_F(RtsanOpenedFileTest, SymlinkDiesWhenRealtime) { + auto Func = [&]() { + symlink("/tmp/rtsan_symlink_test", GetTemporaryFilePath()); + }; + ExpectRealtimeDeath(Func, "symlink"); + ExpectNonRealtimeSurvival(Func); +} + +TEST_F(RtsanOpenedFileTest, SymlinkatDiesWhenRealtime) { + auto Func = [&]() { + symlinkat("/tmp/rtsan_symlinkat_test", AT_FDCWD, GetTemporaryFilePath()); + }; + ExpectRealtimeDeath(Func, "symlinkat"); + ExpectNonRealtimeSurvival(Func); +} + TEST_F(RtsanFileTest, FcloseDiesWhenRealtime) { FILE *f = fopen(GetTemporaryFilePath(), "w"); EXPECT_THAT(f, Ne(nullptr)); From 008e3a0b3d08a863f777d67e067eed75f0c08402 Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sat, 5 Apr 2025 10:48:46 +0200 Subject: [PATCH 0733/1029] [ValueTracking] Test for trunc nuw cond in assume. 
(NFC) --- llvm/test/Transforms/InstCombine/assume.ll | 26 ++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index 17473a225ff95..1f8fa1716b2aa 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -1009,6 +1009,32 @@ define i1 @not_cond_use(i8 %x) { ret i1 %rval } +define i1 @assume_trunc_nuw_eq_one(i8 %x) { +; CHECK-LABEL: @assume_trunc_nuw_eq_one( +; CHECK-NEXT: [[A:%.*]] = trunc nuw i8 [[X:%.*]] to i1 +; CHECK-NEXT: call void @llvm.assume(i1 [[A]]) +; CHECK-NEXT: [[Q:%.*]] = icmp eq i8 [[X]], 1 +; CHECK-NEXT: ret i1 [[Q]] +; + %a = trunc nuw i8 %x to i1 + call void @llvm.assume(i1 %a) + %q = icmp eq i8 %x, 1 + ret i1 %q +} + +define i1 @neg_assume_trunc_eq_one(i8 %x) { +; CHECK-LABEL: @neg_assume_trunc_eq_one( +; CHECK-NEXT: [[A:%.*]] = trunc i8 [[X:%.*]] to i1 +; CHECK-NEXT: call void @llvm.assume(i1 [[A]]) +; CHECK-NEXT: [[Q:%.*]] = icmp eq i8 [[X]], 1 +; CHECK-NEXT: ret i1 [[Q]] +; + %a = trunc i8 %x to i1 + call void @llvm.assume(i1 %a) + %q = icmp eq i8 %x, 1 + ret i1 %q +} + declare void @use(i1) declare void @llvm.dbg.value(metadata, metadata, metadata) From 16573315d986568641e536c37c84fcc5dd161bf6 Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sat, 5 Apr 2025 12:07:13 +0200 Subject: [PATCH 0734/1029] [InstCombine] Test for fold of and of icmps with operands in the range(0,2) (NFC) proof https://alive2.llvm.org/ce/z/xeazCu this is a regression found in https://github.com/llvm/llvm-project/pull/128861 This fold is done when icmp eq/ne x, 1/0 is folded to trunc nuw x iff x is in the range(0,2) --- .../Transforms/InstCombine/and-or-icmps.ll | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll index 9651858a0caef..8824ae48417b0 100644 --- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll +++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll @@ -3409,3 +3409,31 @@ define i1 @and_ugt_to_mask_off_by_one(i8 %x) { %and2 = and i1 %cmp, %cmp2 ret i1 %and2 } + +; TODO: shall fold to trunc nuw i8 (and %x, %y) to i1. +define i1 @and_icmp_ne_with_binary_range_operands(i8 range(i8 0, 2) %x, i8 range(i8 0, 2) %y) { +; CHECK-LABEL: @and_icmp_ne_with_binary_range_operands( +; CHECK-NEXT: [[ICMP1:%.*]] = icmp ne i8 [[X:%.*]], 0 +; CHECK-NEXT: [[ICMP2:%.*]] = icmp ne i8 [[Y:%.*]], 0 +; CHECK-NEXT: [[RET:%.*]] = and i1 [[ICMP1]], [[ICMP2]] +; CHECK-NEXT: ret i1 [[RET]] +; + %icmp1 = icmp ne i8 %x, 0 + %icmp2 = icmp ne i8 %y, 0 + %ret = and i1 %icmp1, %icmp2 + ret i1 %ret +} + +; TODO: shall fold to trunc nuw i8 (and %x, %y) to i1. +define i1 @and_icmp_eq_with_binary_range_operands(i8 range(i8 0, 2) %x, i8 range(i8 0, 2) %y) { +; CHECK-LABEL: @and_icmp_eq_with_binary_range_operands( +; CHECK-NEXT: [[ICMP1:%.*]] = icmp ne i8 [[X:%.*]], 0 +; CHECK-NEXT: [[ICMP2:%.*]] = icmp ne i8 [[Y:%.*]], 0 +; CHECK-NEXT: [[RET:%.*]] = and i1 [[ICMP1]], [[ICMP2]] +; CHECK-NEXT: ret i1 [[RET]] +; + %icmp1 = icmp eq i8 %x, 1 + %icmp2 = icmp eq i8 %y, 1 + %ret = and i1 %icmp1, %icmp2 + ret i1 %ret +} From cd54cb062bba9c90a8f3723bf66caa7effbcf259 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Sat, 5 Apr 2025 19:23:56 +0800 Subject: [PATCH 0735/1029] [X86][AMX] Add missing __inline__ for AMXCOMPLEX intrinsics, NFCI (#134484) Found by #64779. 
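As background (an illustration added here, not text from the patch): clang's x86 intrinsic headers conventionally declare every wrapper as static __inline__ together with an always_inline attribute. static alone gives the wrapper internal linkage, while the inline keyword keeps uncalled wrappers from drawing -Wunused-function warnings in translation units that include the header, and matches every neighboring intrinsic in these headers. A minimal sketch of the convention, with invented names standing in for __DEFAULT_FN_ATTRS_COMPLEX and the __tile_* wrappers:

// Sketch of the intrinsic-header convention only (clang/GCC extensions).
#define __MY_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

static __inline__ int __MY_FN_ATTRS __my_wrapper(int __a, int __b) {
  return __a + __b; // a real header would invoke the _tile_* builtin here
}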
--- clang/lib/Headers/amxcomplexintrin.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/clang/lib/Headers/amxcomplexintrin.h b/clang/lib/Headers/amxcomplexintrin.h index 84ef972fcadf0..87ee8f3919c23 100644 --- a/clang/lib/Headers/amxcomplexintrin.h +++ b/clang/lib/Headers/amxcomplexintrin.h @@ -135,9 +135,8 @@ _tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k, /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_COMPLEX -static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { +static __inline__ void __DEFAULT_FN_ATTRS_COMPLEX +__tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, __tile1024i src1) { dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, src0.tile, src1.tile); } @@ -158,9 +157,8 @@ static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_COMPLEX -static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { +static __inline__ void __DEFAULT_FN_ATTRS_COMPLEX +__tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0, __tile1024i src1) { dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col, dst->tile, src0.tile, src1.tile); } From 13799998c06984f808ff687e7866441a3135fd18 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 5 Apr 2025 12:24:39 +0100 Subject: [PATCH 0736/1029] [EquivalenceClasses] Use DenseMap instead of std::set. (NFC) (#134264) Replace the std::set with DenseMap, which removes the requirement for an ordering predicate. This also requires allocating the ECValue objects separately. This patch uses a BumpPtrAllocator. Follow-up to https://github.com/llvm/llvm-project/pull/134075. Compile-time impact is mostly neutral or slightly positive: https://llvm-compile-time-tracker.com/compare.php?from=ee4e8197fa67dd1ed6e9470e00708e7feeaacd97&to=242e6a8e42889eebfc0bb5d433a4de7dd9e224a7&stat=instructions:u --- llvm/include/llvm/ADT/EquivalenceClasses.h | 66 +++++++------------ .../Transforms/Vectorize/VectorCombine.cpp | 1 + llvm/unittests/ADT/EquivalenceClassesTest.cpp | 26 -------- .../Transforms/OneShotAnalysis.h | 17 ++--- 4 files changed, 29 insertions(+), 81 deletions(-) diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index ad1f385cd9414..e0a7af9421c35 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -15,13 +15,14 @@ #ifndef LLVM_ADT_EQUIVALENCECLASSES_H #define LLVM_ADT_EQUIVALENCECLASSES_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Allocator.h" #include #include #include #include -#include namespace llvm { @@ -33,8 +34,7 @@ namespace llvm { /// /// This implementation is an efficient implementation that only stores one copy /// of the element being indexed per entry in the set, and allows any arbitrary -/// type to be indexed (as long as it can be ordered with operator< or a -/// comparator is provided). +/// type to be indexed (as long as it implements DenseMapInfo).
/// /// Here is a simple example using integers: /// @@ -58,18 +58,17 @@ namespace llvm { /// 4 /// 5 1 2 /// -template > -class EquivalenceClasses { +template class EquivalenceClasses { /// ECValue - The EquivalenceClasses data structure is just a set of these. /// Each of these represents a relation for a value. First it stores the - /// value itself, which provides the ordering that the set queries. Next, it - /// provides a "next pointer", which is used to enumerate all of the elements - /// in the unioned set. Finally, it defines either a "end of list pointer" or - /// "leader pointer" depending on whether the value itself is a leader. A - /// "leader pointer" points to the node that is the leader for this element, - /// if the node is not a leader. A "end of list pointer" points to the last - /// node in the list of members of this list. Whether or not a node is a - /// leader is determined by a bit stolen from one of the pointers. + /// value itself. Next, it provides a "next pointer", which is used to + /// enumerate all of the elements in the unioned set. Finally, it defines + /// either a "end of list pointer" or "leader pointer" depending on whether + /// the value itself is a leader. A "leader pointer" points to the node that + /// is the leader for this element, if the node is not a leader. A "end of + /// list pointer" points to the last node in the list of members of this list. + /// Whether or not a node is a leader is determined by a bit stolen from one + /// of the pointers. class ECValue { friend class EquivalenceClasses; @@ -113,36 +112,15 @@ class EquivalenceClasses { } }; - /// A wrapper of the comparator, to be passed to the set. - struct ECValueComparator { - using is_transparent = void; - - ECValueComparator() : compare(Compare()) {} - - bool operator()(const ECValue &lhs, const ECValue &rhs) const { - return compare(lhs.Data, rhs.Data); - } - - template - bool operator()(const T &lhs, const ECValue &rhs) const { - return compare(lhs, rhs.Data); - } - - template - bool operator()(const ECValue &lhs, const T &rhs) const { - return compare(lhs.Data, rhs); - } - - const Compare compare; - }; - /// TheMapping - This implicitly provides a mapping from ElemTy values to the /// ECValues, it just keeps the key as part of the value. - std::set TheMapping; + DenseMap TheMapping; /// List of all members, used to provide a determinstic iteration order. SmallVector Members; + mutable BumpPtrAllocator ECValueAllocator; + public: EquivalenceClasses() = default; EquivalenceClasses(const EquivalenceClasses &RHS) { @@ -232,10 +210,14 @@ class EquivalenceClasses { /// insert - Insert a new value into the union/find set, ignoring the request /// if the value already exists. 
const ECValue &insert(const ElemTy &Data) { - auto I = TheMapping.insert(ECValue(Data)); - if (I.second) - Members.push_back(&*I.first); - return *I.first; + auto I = TheMapping.insert({Data, nullptr}); + if (!I.second) + return *I.first->second; + + auto *ECV = new (ECValueAllocator) ECValue(Data); + I.first->second = ECV; + Members.push_back(ECV); + return *ECV; } /// findLeader - Given a value in the set, return a member iterator for the @@ -246,7 +228,7 @@ class EquivalenceClasses { auto I = TheMapping.find(V); if (I == TheMapping.end()) return member_iterator(nullptr); - return findLeader(*I); + return findLeader(*I->second); } member_iterator findLeader(const ECValue &ECV) const { return member_iterator(ECV.getLeader()); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 4bfe41a5ed00d..ffb82bd5baf4e 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -33,6 +33,7 @@ #include "llvm/Transforms/Utils/LoopUtils.h" #include #include +#include #define DEBUG_TYPE "vector-combine" #include "llvm/Transforms/Utils/InstructionWorklist.h" diff --git a/llvm/unittests/ADT/EquivalenceClassesTest.cpp b/llvm/unittests/ADT/EquivalenceClassesTest.cpp index 2f9c441cde5c7..ff243f51102fb 100644 --- a/llvm/unittests/ADT/EquivalenceClassesTest.cpp +++ b/llvm/unittests/ADT/EquivalenceClassesTest.cpp @@ -109,30 +109,4 @@ TYPED_TEST_P(ParameterizedTest, MultipleSets) { EXPECT_FALSE(EqClasses.isEquivalent(i, j)); } -namespace { -// A dummy struct for testing EquivalenceClasses with a comparator. -struct TestStruct { - TestStruct(int value) : value(value) {} - - bool operator==(const TestStruct &other) const { - return value == other.value; - } - - int value; -}; -// Comparator to be used in test case. -struct TestStructComparator { - bool operator()(const TestStruct &lhs, const TestStruct &rhs) const { - return lhs.value < rhs.value; - } -}; -} // namespace - -REGISTER_TYPED_TEST_SUITE_P(ParameterizedTest, MultipleSets); -using ParamTypes = - testing::Types, - EquivalenceClasses>; -INSTANTIATE_TYPED_TEST_SUITE_P(EquivalenceClassesTest, ParameterizedTest, - ParamTypes, ); - } // llvm diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h index bd23a19f74728..673027f76190d 100644 --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h @@ -224,17 +224,8 @@ class OneShotAnalysisState : public AnalysisState { } private: - /// llvm::EquivalenceClasses wants comparable elements. This comparator uses - /// pointer comparison on the defining op. This is a poor man's comparison - /// but it's not like UnionFind needs ordering anyway. - struct ValueComparator { - bool operator()(const Value &lhs, const Value &rhs) const { - return lhs.getImpl() < rhs.getImpl(); - } - }; - - using EquivalenceClassRangeType = llvm::iterator_range< - llvm::EquivalenceClasses::member_iterator>; + using EquivalenceClassRangeType = + llvm::iterator_range::member_iterator>; /// Check that aliasInfo for `v` exists and return a reference to it. EquivalenceClassRangeType getAliases(Value v) const; @@ -249,7 +240,7 @@ class OneShotAnalysisState : public AnalysisState { /// value may alias with one of multiple other values. The concrete aliasing /// value may not even be known at compile time. 
All such values are /// considered to be aliases. - llvm::EquivalenceClasses aliasInfo; + llvm::EquivalenceClasses aliasInfo; /// Auxiliary structure to store all the equivalent buffer classes. Equivalent /// buffer information is "must be" conservative: Only if two values are @@ -257,7 +248,7 @@ class OneShotAnalysisState : public AnalysisState { /// possible that, in the presence of branches, it cannot be determined /// statically if two values are equivalent. In that case, the values are /// considered to be not equivalent. - llvm::EquivalenceClasses equivalentInfo; + llvm::EquivalenceClasses equivalentInfo; // Bufferization statistics. int64_t statNumTensorOutOfPlace = 0; From 475cbf0ad6e72f33e5ba5890a1c6e84e39a19e83 Mon Sep 17 00:00:00 2001 From: James E T Smith Date: Sat, 5 Apr 2025 07:46:11 -0400 Subject: [PATCH 0737/1029] [libc++] Implement ranges::iota (#68494) # Overview As a disclaimer, this is my first PR to LLVM and while I've tried to ensure I've followed the LLVM and libc++ contributing guidelines, there's probably a good chance I missed something. If I have, just let me know and I'll try to correct it as soon as I can. This PR implements `std::ranges::iota` and `std::ranges::out_value_result` outlined in [P2440r1](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p2440r1.html). As outlined in the paper above, I've: - Implemented `out_value_result` and added to `` - Added `out_value_result`, `iota_result`, and two overloads of `iota` to `std::ranges` in `` - Updated the version macro `__cpp_lib_ranges_iota` in `` I've also added tests for `ranges::iota` and `ranges::out_value_result`. Lastly, I added those structs to the appropriate module files. Partially implements #105184 EDIT: Forgot to mention in the original post, thanks to @hawkinsw for taking a look at a preliminary version of this PR! # TODOs - [x] Updating the range [status doc](https://github.com/jamesETsmith/llvm-project/blob/main/libcxx/docs/Status/RangesMajorFeatures.csv) - [x] Ensure all comments from https://reviews.llvm.org/D121436 are addressed here - [X] EDIT (I'll do this in a separate PR). 
~~I'm open to implementing the rest of P2440r1 (`ranges::shift_left` and `ranges::shift_right`) if that's ok, I just wanted to get feedback on `ranges::iota` first~~ - [x] I've been having trouble building the modules locally and want to make sure that's working properly Closes: #134060 --- libcxx/docs/FeatureTestMacroTable.rst | 2 +- libcxx/docs/Status/Cxx23.rst | 2 +- libcxx/docs/Status/Cxx23Papers.csv | 2 +- libcxx/include/CMakeLists.txt | 2 + libcxx/include/__algorithm/out_value_result.h | 56 +++++ libcxx/include/__numeric/ranges_iota.h | 65 ++++++ libcxx/include/algorithm | 4 + libcxx/include/module.modulemap | 2 + libcxx/include/numeric | 1 + libcxx/include/version | 2 +- libcxx/modules/std/algorithm.inc | 4 +- libcxx/modules/std/numeric.inc | 8 +- .../no_unique_address.compile.pass.cpp | 5 +- .../out_value_result.pass.cpp | 141 ++++++++++++ ...result_alias_declarations.compile.pass.cpp | 6 +- .../ranges_robust_against_dangling.pass.cpp | 9 + ...es_robust_against_proxy_iterators.pass.cpp | 22 +- .../numeric.version.compile.pass.cpp | 32 +-- .../version.version.compile.pass.cpp | 32 +-- .../numeric.iota/ranges.iota.pass.cpp | 215 ++++++++++++++++++ libcxx/test/support/test_iterators.h | 40 +++- .../generate_feature_test_macro_components.py | 1 - 22 files changed, 585 insertions(+), 68 deletions(-) create mode 100644 libcxx/include/__algorithm/out_value_result.h create mode 100644 libcxx/include/__numeric/ranges_iota.h create mode 100644 libcxx/test/std/algorithms/algorithms.results/out_value_result.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.iota/ranges.iota.pass.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index bbcc76b52f0a9..9015ccb18dddf 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -374,7 +374,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_ranges_find_last`` ``202207L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_ranges_iota`` *unimplemented* + ``__cpp_lib_ranges_iota`` ``202202L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_ranges_join_with`` *unimplemented* ---------------------------------------------------------- ----------------- diff --git a/libcxx/docs/Status/Cxx23.rst b/libcxx/docs/Status/Cxx23.rst index 3a922adcb0235..0f9d4bf0d1af8 100644 --- a/libcxx/docs/Status/Cxx23.rst +++ b/libcxx/docs/Status/Cxx23.rst @@ -40,4 +40,4 @@ Library Working Group Issues Status .. csv-table:: :file: Cxx23Issues.csv :header-rows: 1 - :widths: auto + :widths: auto \ No newline at end of file diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 4e45debd419ef..923d8bf9341d0 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -46,7 +46,7 @@ "`P2255R2 `__","A type trait to detect reference binding to temporary","2022-02 (Virtual)","|Partial|","","Implemented the type traits only." 
"`P2273R3 `__","Making ``std::unique_ptr`` constexpr","2022-02 (Virtual)","|Complete|","16","" "`P2387R3 `__","Pipe support for user-defined range adaptors","2022-02 (Virtual)","|Complete|","19","" -"`P2440R1 `__","``ranges::iota``, ``ranges::shift_left`` and ``ranges::shift_right``","2022-02 (Virtual)","","","" +"`P2440R1 `__","``ranges::iota``, ``ranges::shift_left`` and ``ranges::shift_right``","2022-02 (Virtual)","|Partial|","","Only ``ranges::iota`` is implemented." "`P2441R2 `__","``views::join_with``","2022-02 (Virtual)","|In Progress|","","" "`P2442R1 `__","Windowing range adaptors: ``views::chunk`` and ``views::slide``","2022-02 (Virtual)","","","" "`P2443R1 `__","``views::chunk_by``","2022-02 (Virtual)","|Complete|","18","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index a021b9bb44d67..7b09beb74b173 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -64,6 +64,7 @@ set(files __algorithm/next_permutation.h __algorithm/none_of.h __algorithm/nth_element.h + __algorithm/out_value_result.h __algorithm/partial_sort.h __algorithm/partial_sort_copy.h __algorithm/partition.h @@ -615,6 +616,7 @@ set(files __numeric/midpoint.h __numeric/partial_sum.h __numeric/pstl.h + __numeric/ranges_iota.h __numeric/reduce.h __numeric/saturation_arithmetic.h __numeric/transform_exclusive_scan.h diff --git a/libcxx/include/__algorithm/out_value_result.h b/libcxx/include/__algorithm/out_value_result.h new file mode 100644 index 0000000000000..9e1e0e07286d4 --- /dev/null +++ b/libcxx/include/__algorithm/out_value_result.h @@ -0,0 +1,56 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_OUT_VALUE_RESULT_H +#define _LIBCPP___ALGORITHM_OUT_VALUE_RESULT_H + +#include <__concepts/convertible_to.h> +#include <__config> +#include <__utility/move.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 23 + +namespace ranges { + +template +struct out_value_result { + _LIBCPP_NO_UNIQUE_ADDRESS _OutIter1 out; + _LIBCPP_NO_UNIQUE_ADDRESS _ValType1 value; + + template + requires convertible_to && convertible_to + _LIBCPP_HIDE_FROM_ABI constexpr operator out_value_result<_OutIter2, _ValType2>() const& { + return {out, value}; + } + + template + requires convertible_to<_OutIter1, _OutIter2> && convertible_to<_ValType1, _ValType2> + _LIBCPP_HIDE_FROM_ABI constexpr operator out_value_result<_OutIter2, _ValType2>() && { + return {std::move(out), std::move(value)}; + } +}; + +} // namespace ranges + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___ALGORITHM_OUT_VALUE_RESULT_H diff --git a/libcxx/include/__numeric/ranges_iota.h b/libcxx/include/__numeric/ranges_iota.h new file mode 100644 index 0000000000000..bf53dd85fecef --- /dev/null +++ b/libcxx/include/__numeric/ranges_iota.h @@ -0,0 +1,65 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___NUMERIC_RANGES_IOTA_H +#define _LIBCPP___NUMERIC_RANGES_IOTA_H + +#include <__algorithm/out_value_result.h> +#include <__config> +#include <__iterator/concepts.h> +#include <__ranges/access.h> +#include <__ranges/concepts.h> +#include <__ranges/dangling.h> +#include <__utility/as_const.h> +#include <__utility/move.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 23 +namespace ranges { +template +using iota_result = ranges::out_value_result<_Out, _Tp>; + +struct __iota_fn { +public: + template _Sent, weakly_incrementable _Tp> + requires indirectly_writable<_Out, const _Tp&> + _LIBCPP_HIDE_FROM_ABI static constexpr iota_result<_Out, _Tp> operator()(_Out __first, _Sent __last, _Tp __value) { + while (__first != __last) { + *__first = std::as_const(__value); + ++__first; + ++__value; + } + return {std::move(__first), std::move(__value)}; + } + + template _Range> + _LIBCPP_HIDE_FROM_ABI static constexpr iota_result, _Tp> + operator()(_Range&& __r, _Tp __value) { + return operator()(ranges::begin(__r), ranges::end(__r), std::move(__value)); + } +}; + +inline constexpr auto iota = __iota_fn{}; +} // namespace ranges + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___NUMERIC_RANGES_IOTA_H diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm index 6ba903ad3ce1e..bf67d3363a595 100644 --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -45,6 +45,9 @@ namespace ranges { template struct in_value_result; // since C++23 + template + struct out_value_result; // since C++23 + template S, class Proj = identity, indirect_strict_weak_order> Comp = ranges::less> // since C++20 constexpr I min_element(I first, S last, Comp comp = {}, Proj proj = {}); @@ -1936,6 +1939,7 @@ template # include <__algorithm/in_out_result.h> # include <__algorithm/lexicographical_compare_three_way.h> # include <__algorithm/min_max_result.h> +# include <__algorithm/out_value_result.h> # include <__algorithm/ranges_adjacent_find.h> # include <__algorithm/ranges_all_of.h> # include <__algorithm/ranges_any_of.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 0ce42fc4d3633..324931b1bb078 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -490,6 +490,7 @@ module std [system] { module next_permutation { header "__algorithm/next_permutation.h" } module none_of { header "__algorithm/none_of.h" } module nth_element { header "__algorithm/nth_element.h" } + module out_value_result { header "__algorithm/out_value_result.h" } module partial_sort_copy { header "__algorithm/partial_sort_copy.h" } module partial_sort { header "__algorithm/partial_sort.h" } module partition_copy { header "__algorithm/partition_copy.h" } @@ -1727,6 +1728,7 @@ module std [system] { module partial_sum { header "__numeric/partial_sum.h" } module pstl { header "__numeric/pstl.h" } module reduce { header "__numeric/reduce.h" } + module ranges_iota { header "__numeric/ranges_iota.h" } module saturation_arithmetic { header "__numeric/saturation_arithmetic.h" } module transform_exclusive_scan { header "__numeric/transform_exclusive_scan.h" } module 
transform_inclusive_scan { header "__numeric/transform_inclusive_scan.h" } diff --git a/libcxx/include/numeric b/libcxx/include/numeric index 2f2b86136fb98..48c330fcb009c 100644 --- a/libcxx/include/numeric +++ b/libcxx/include/numeric @@ -172,6 +172,7 @@ constexpr T saturate_cast(U x) noexcept; // freestanding, Sin # include <__numeric/gcd_lcm.h> # include <__numeric/inclusive_scan.h> # include <__numeric/pstl.h> +# include <__numeric/ranges_iota.h> # include <__numeric/reduce.h> # include <__numeric/transform_exclusive_scan.h> # include <__numeric/transform_inclusive_scan.h> diff --git a/libcxx/include/version b/libcxx/include/version index 83ae11dabd2bc..49102716c3605 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -513,7 +513,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_ranges_chunk_by 202202L # define __cpp_lib_ranges_contains 202207L # define __cpp_lib_ranges_find_last 202207L -// # define __cpp_lib_ranges_iota 202202L +# define __cpp_lib_ranges_iota 202202L // # define __cpp_lib_ranges_join_with 202202L # define __cpp_lib_ranges_repeat 202207L // # define __cpp_lib_ranges_slide 202202L diff --git a/libcxx/modules/std/algorithm.inc b/libcxx/modules/std/algorithm.inc index 3c2139cd64ee4..95c05f01e5562 100644 --- a/libcxx/modules/std/algorithm.inc +++ b/libcxx/modules/std/algorithm.inc @@ -20,7 +20,9 @@ export namespace std { using std::ranges::in_value_result; #endif using std::ranges::min_max_result; - // using std::ranges::out_value_result; +#if _LIBCPP_STD_VER >= 23 + using std::ranges::out_value_result; +#endif } // namespace ranges // [alg.nonmodifying], non-modifying sequence operations diff --git a/libcxx/modules/std/numeric.inc b/libcxx/modules/std/numeric.inc index 3bc7b23168158..5a549552081d2 100644 --- a/libcxx/modules/std/numeric.inc +++ b/libcxx/modules/std/numeric.inc @@ -42,8 +42,12 @@ export namespace std { using std::iota; namespace ranges { - // using std::ranges::iota_result; - // using std::ranges::iota; + +#if _LIBCPP_STD_VER >= 23 + using std::ranges::iota; + using std::ranges::iota_result; +#endif // _LIBCPP_STD_VER >= 23 + } // namespace ranges // [numeric.ops.gcd], greatest common divisor diff --git a/libcxx/test/std/algorithms/algorithms.results/no_unique_address.compile.pass.cpp b/libcxx/test/std/algorithms/algorithms.results/no_unique_address.compile.pass.cpp index 34dbd64a49ae8..70da332c56e2f 100644 --- a/libcxx/test/std/algorithms/algorithms.results/no_unique_address.compile.pass.cpp +++ b/libcxx/test/std/algorithms/algorithms.results/no_unique_address.compile.pass.cpp @@ -53,7 +53,10 @@ static_assert(sizeof(std::ranges::in_out_out_result) == 2); static_assert(sizeof(std::ranges::in_out_out_result) == sizeof(int)); static_assert(sizeof(std::ranges::in_out_out_result) == 3); -#if TEST_STD_VER >= 23 +#if _LIBCPP_STD_VER >= 23 +static_assert(sizeof(std::ranges::out_value_result) == sizeof(int)); +static_assert(sizeof(std::ranges::out_value_result) == sizeof(int)); +static_assert(sizeof(std::ranges::out_value_result) == 2); static_assert(sizeof(std::ranges::in_value_result) == sizeof(int)); static_assert(sizeof(std::ranges::in_value_result) == sizeof(int)); diff --git a/libcxx/test/std/algorithms/algorithms.results/out_value_result.pass.cpp b/libcxx/test/std/algorithms/algorithms.results/out_value_result.pass.cpp new file mode 100644 index 0000000000000..7bdbb7f60e9a2 --- /dev/null +++ b/libcxx/test/std/algorithms/algorithms.results/out_value_result.pass.cpp @@ -0,0 +1,141 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// template +// struct out_value_result; + +#include +#include +#include + +#include "MoveOnly.h" + +using std::ranges::out_value_result; + +// +// Helper structs +// + +// only explicit construction +struct IterTypeExplicit { + explicit IterTypeExplicit(int*); +}; + +// implicit construction +struct IterTypeImplicit { + IterTypeImplicit(int*); +}; + +struct IterTypeImplicitRef { + IterTypeImplicitRef(int&); +}; + +struct NotConvertible {}; + +template +struct ConvertibleFrom { + constexpr ConvertibleFrom(T c) : content{c} {} + T content; +}; + +// Standard layout classes can't have virtual functions +struct NonStandardLayoutTypeBase { + virtual ~NonStandardLayoutTypeBase(); +}; +struct NonStandardLayoutType : public NonStandardLayoutTypeBase {}; + +// +constexpr bool test_constraints() { + // requires convertible_to && convertible_to + static_assert(std::is_constructible_v, out_value_result>); + + // test failure when implicit conversion isn't allowed + static_assert(!std::is_constructible_v, out_value_result>); + + // test success when implicit conversion is allowed, checking combinations of value, reference, and const + static_assert(std::is_constructible_v, out_value_result>); + static_assert(std::is_constructible_v, out_value_result const>); + static_assert(std::is_constructible_v, out_value_result&>); + static_assert(std::is_constructible_v, out_value_result const&>); + + static_assert(!std::is_constructible_v, out_value_result&>); + + // has to be convertible via const& + static_assert(std::is_convertible_v&, out_value_result>); + static_assert(std::is_convertible_v&, out_value_result>); + static_assert(std::is_convertible_v&&, out_value_result>); + static_assert(std::is_convertible_v&&, out_value_result>); + + // should be move constructible + static_assert(std::is_move_constructible_v>); + static_assert(std::is_move_constructible_v>); + + // conversions should not work if there is no conversion + static_assert(!std::is_convertible_v, out_value_result>); + static_assert(!std::is_convertible_v, out_value_result>); + + // check standard layout + static_assert(std::is_standard_layout_v>); + static_assert(!std::is_standard_layout_v>); + + return true; +} + +// Test results +constexpr bool test() { + { + // Check that conversion operator works + out_value_result res{10, 1}; + assert(res.out == 10); + assert(res.value == 1); + out_value_result, ConvertibleFrom> res2 = res; + assert(res2.out.content == 10); + assert(res2.value.content == 1); + } + { + // Check that out_value_result isn't overconstrained w.r.t. 
move/copy constructors + out_value_result res{MoveOnly{}, 10}; + assert(res.out.get() == 1); + assert(res.value == 10); + auto res2 = std::move(res); + assert(res.out.get() == 0); + assert(res.value == 10); + assert(res2.out.get() == 1); + assert(res2.value == 10); + } + { + // Check structured binding + auto [out, val] = out_value_result{1, 2}; + assert(out == 1); + assert(val == 2); + } + { + // Check default construction + out_value_result res; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + } + { + // Check aggregate initiazliation + out_value_result res = {1, 2}; + assert(res.out == 1); + assert(res.value == 2); + } + + return true; +} + +int main(int, char**) { + test_constraints(); + static_assert(test_constraints()); + test(); + static_assert(test()); + return 0; +} diff --git a/libcxx/test/std/algorithms/ranges_result_alias_declarations.compile.pass.cpp b/libcxx/test/std/algorithms/ranges_result_alias_declarations.compile.pass.cpp index 6940b23cfca1e..dcf25099dbc47 100644 --- a/libcxx/test/std/algorithms/ranges_result_alias_declarations.compile.pass.cpp +++ b/libcxx/test/std/algorithms/ranges_result_alias_declarations.compile.pass.cpp @@ -13,6 +13,7 @@ // ensure that all result alias declarations are defined #include +#include #include #include @@ -62,9 +63,6 @@ static_assert(std::is_same_v, next_permutation_result> static_assert(std::is_same_v, prev_permutation_result>); #if TEST_STD_VER >= 23 - static_assert(std::is_same_v, fold_left_with_iter_result>); - -// static_assert(std::is_same_v, iota_result>); - +static_assert(std::is_same_v, iota_result>); #endif // TEST_STD_VER diff --git a/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp b/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp index 2691269ef1b28..e0a6aaaf03aa0 100644 --- a/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp +++ b/libcxx/test/std/algorithms/ranges_robust_against_dangling.pass.cpp @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include "test_iterators.h" +#include "test_macros.h" struct NonBorrowedRange { using Iter = int*; @@ -78,6 +80,9 @@ constexpr bool test_all() { using std::ranges::move_result; using std::ranges::move_backward_result; using std::ranges::next_permutation_result; +#if TEST_STD_VER >= 23 + using std::ranges::out_value_result; +#endif using std::ranges::partial_sort_copy_result; using std::ranges::partition_copy_result; using std::ranges::prev_permutation_result; @@ -217,6 +222,10 @@ constexpr bool test_all() { dangling_1st>(std::ranges::prev_permutation, in); dangling_1st>(std::ranges::next_permutation, in); +#if TEST_STD_VER >= 23 + dangling_1st>(std::ranges::iota, in, x); +#endif + return true; } diff --git a/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp b/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp index bebaeb01f22e9..9d4b0d608518a 100644 --- a/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp +++ b/libcxx/test/std/algorithms/ranges_robust_against_proxy_iterators.pass.cpp @@ -14,6 +14,7 @@ // a customization point) rather than plain `swap` (which might not work with certain valid iterators). #include +#include #include #include @@ -28,22 +29,22 @@ #include "test_macros.h" // (in, ...) -template -constexpr void test(Func&& func, Input& in, Args&& ...args) { +template +constexpr void test(Func&& func, Input& in, Args&&... 
args) { (void)func(in.begin(), in.end(), std::forward(args)...); (void)func(in, std::forward(args)...); } // (in1, in2, ...) -template -constexpr void test(Func&& func, Range1& r1, Range2& r2, Args&& ...args) { +template +constexpr void test(Func&& func, Range1& r1, Range2& r2, Args&&... args) { (void)func(r1.begin(), r1.end(), r2.begin(), r2.end(), std::forward(args)...); (void)func(r1, r2, std::forward(args)...); } // (in, mid, ...) -template -constexpr void test_mid(Func&& func, Input& in, std::ranges::iterator_t mid, Args&& ...args) { +template +constexpr void test_mid(Func&& func, Input& in, std::ranges::iterator_t mid, Args&&... args) { (void)func(in.begin(), mid, in.end(), std::forward(args)...); (void)func(in, mid, std::forward(args)...); } @@ -68,9 +69,9 @@ constexpr void run_tests() { Proxy x{num}; int count = 1; - auto unary_pred = [](const Proxy&) { return true; }; + auto unary_pred = [](const Proxy&) { return true; }; auto binary_func = [](const Proxy&, const Proxy&) -> Proxy { return Proxy(T()); }; - auto gen = [] { return Proxy(T{42}); }; + auto gen = [] { return Proxy(T{42}); }; test(std::ranges::any_of, in, unary_pred); test(std::ranges::all_of, in, unary_pred); @@ -106,6 +107,11 @@ constexpr void run_tests() { test(std::ranges::search, in, in2); test(std::ranges::search_n, in, count, x); test(std::ranges::find_end, in, in2); +#if TEST_STD_VER >= 23 + if constexpr (std::weakly_incrementable) { + test(std::ranges::iota, in, x); + } +#endif test(std::ranges::is_partitioned, in, unary_pred); test(std::ranges::is_sorted, in); test(std::ranges::is_sorted_until, in); diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp index d132b7c7b9c4f..0743c6c71840c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp @@ -197,17 +197,11 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should be defined in c++23" -# endif -# if __cpp_lib_ranges_iota != 202202L -# error "__cpp_lib_ranges_iota should have the value 202202L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_ranges_iota +# error "__cpp_lib_ranges_iota should be defined in c++23" +# endif +# if __cpp_lib_ranges_iota != 202202L +# error "__cpp_lib_ranges_iota should have the value 202202L in c++23" # endif # ifdef __cpp_lib_saturation_arithmetic @@ -250,17 +244,11 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should be defined in c++26" -# endif -# if __cpp_lib_ranges_iota != 202202L -# error "__cpp_lib_ranges_iota should have the value 202202L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_ranges_iota +# error "__cpp_lib_ranges_iota should be defined in c++26" +# endif +# if __cpp_lib_ranges_iota != 202202L +# error "__cpp_lib_ranges_iota should have the value 202202L in c++26" # endif # ifndef __cpp_lib_saturation_arithmetic diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index e5a657207923b..07e96e53f2e93 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -5771,17 +5771,11 @@ # error "__cpp_lib_ranges_find_last should have the value 202207L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should be defined in c++23" -# endif -# if __cpp_lib_ranges_iota != 202202L -# error "__cpp_lib_ranges_iota should have the value 202202L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_ranges_iota +# error "__cpp_lib_ranges_iota should be defined in c++23" +# endif +# if __cpp_lib_ranges_iota != 202202L +# error "__cpp_lib_ranges_iota should have the value 202202L in c++23" # endif # if !defined(_LIBCPP_VERSION) @@ -7664,17 +7658,11 @@ # error "__cpp_lib_ranges_find_last should have the value 202207L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should be defined in c++26" -# endif -# if __cpp_lib_ranges_iota != 202202L -# error "__cpp_lib_ranges_iota should have the value 202202L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_ranges_iota -# error "__cpp_lib_ranges_iota should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_ranges_iota +# error "__cpp_lib_ranges_iota should be defined in c++26" +# endif +# if __cpp_lib_ranges_iota != 202202L +# error "__cpp_lib_ranges_iota should have the value 202202L in c++26" # endif # if !defined(_LIBCPP_VERSION) diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.iota/ranges.iota.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.iota/ranges.iota.pass.cpp new file mode 100644 index 0000000000000..9fa50f1326f1d --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.iota/ranges.iota.pass.cpp @@ -0,0 +1,215 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Testing std::ranges::iota + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include +#include + +#include "almost_satisfies_types.h" +#include "test_iterators.h" +#include "test_macros.h" + +// +// Testing constraints +// + +// Concepts to check different overloads of std::ranges::iota +template +concept HasIotaIter = requires(Iter&& iter, Sent&& sent, Value&& val) { + std::ranges::iota(std::forward(iter), std::forward(sent), std::forward(val)); +}; + +template +concept HasIotaRange = + requires(Range&& range, Value&& val) { std::ranges::iota(std::forward(range), std::forward(val)); }; + +// Test constraints of the iterator/sentinel overload +// ================================================== +static_assert(HasIotaIter); + +// !input_or_output_iterator +static_assert(!HasIotaIter); + +// !sentinel_for +static_assert(!HasIotaIter); +static_assert(!HasIotaIter); + +// !weakly_incrementable +static_assert(!HasIotaIter); + +// !indirectly writable +static_assert(!HasIotaIter); + +// Test constraints for the range overload +// ======================================= +static_assert(HasIotaRange, int>); + +// !weakly_incrementable +static_assert(!HasIotaRange, WeaklyIncrementableNotMovable>); + +// !ranges::output_range +static_assert(!HasIotaRange, OutputIteratorNotIndirectlyWritable>); + +// +// Testing results +// + +struct DangerousCopyAssign { + int val; + using difference_type = int; + + constexpr explicit DangerousCopyAssign(int v) : val(v) {} + + // Needed in postfix + constexpr DangerousCopyAssign(DangerousCopyAssign const& other) { this->val = other.val; } + + /* + This class has a "mischievous" non-const overload of copy-assignment + operator that modifies the object being assigned from. `ranges::iota` + should not be invoking this overload thanks to the `std::as_const` in its + implementation. If for some reason it does invoke it, there will be a compiler + error. 
+ */ + constexpr DangerousCopyAssign& operator=(DangerousCopyAssign& a) = delete; + + // safe copy assignment std::as_const inside ranges::iota should ensure this + // overload gets called + constexpr DangerousCopyAssign& operator=(DangerousCopyAssign const& a) { + this->val = a.val; + return *this; + } + + constexpr bool operator==(DangerousCopyAssign const& rhs) { return this->val == rhs.val; } + + // prefix + constexpr DangerousCopyAssign& operator++() { + ++(this->val); + return *this; + } + + // postfix + constexpr DangerousCopyAssign operator++(int) { + auto tmp = *this; + ++this->val; + return tmp; + } +}; + +template +constexpr void test_result(std::array input, int starting_value, std::array const expected) { + { // (iterator, sentinel) overload + auto in_begin = Iter(input.data()); + auto in_end = Sent(Iter(input.data() + input.size())); + std::same_as> decltype(auto) result = + std::ranges::iota(std::move(in_begin), std::move(in_end), starting_value); + assert(result.out == in_end); + assert(result.value == starting_value + static_cast(N)); + assert(std::ranges::equal(input, expected)); + } + + { // (range) overload + // in the range overload adds the additional constraint that it must be an output range + // so skip this for the input iterators we test + auto in_begin = Iter(input.data()); + auto in_end = Sent(Iter(input.data() + input.size())); + auto range = std::ranges::subrange(std::move(in_begin), std::move(in_end)); + + std::same_as> decltype(auto) result = + std::ranges::iota(range, starting_value); + assert(result.out == in_end); + assert(result.value == starting_value + static_cast(N)); + assert(std::ranges::equal(input, expected)); + } +} + +template > +constexpr void test_results() { + // Empty + test_result({}, 0, {}); + // 1-element sequence + test_result({1}, 0, {0}); + // Longer sequence + test_result({1, 2, 3, 4, 5}, 0, {0, 1, 2, 3, 4}); +} + +constexpr void test_user_defined_type() { + // Simple non-fundamental type + struct UserDefinedType { + int val; + using difference_type = int; + + constexpr explicit UserDefinedType(int v) : val(v) {} + constexpr UserDefinedType(UserDefinedType const& other) { this->val = other.val; } + constexpr UserDefinedType& operator=(UserDefinedType const& a) { + this->val = a.val; + return *this; + } + + // prefix + constexpr UserDefinedType& operator++() { + ++(this->val); + return *this; + } + + // postfix + constexpr UserDefinedType operator++(int) { + auto tmp = *this; + ++this->val; + return tmp; + } + }; + + // Setup + using A = UserDefinedType; + std::array a = {A{0}, A{0}, A{0}, A{0}, A{0}}; + std::array expected = {A{0}, A{1}, A{2}, A{3}, A{4}}; + + // Fill with values + std::ranges::iota(a, A{0}); + auto proj_val = [](UserDefinedType const& el) { return el.val; }; + + // Check + assert(std::ranges::equal(a, expected, std::ranges::equal_to{}, proj_val, proj_val)); +} + +constexpr void test_dangerous_copy_assign() { + using A = DangerousCopyAssign; + + // If the dangerous non-const copy assignment is called, the final values in + // aa should increment by 2 rather than 1. 
+ std::array aa = {A{0}, A{0}, A{0}}; + std::array expected = {A{0}, A{1}, A{2}}; + std::ranges::iota(aa, A{0}); + auto proj_val = [](DangerousCopyAssign const& el) { return el.val; }; + assert(std::ranges::equal(aa, expected, std::ranges::equal_to{}, proj_val, proj_val)); +} + +constexpr bool test_results() { + // Tests on fundamental types + types::for_each(types::cpp17_input_iterator_list{}, [] { test_results< Iter>(); }); + test_results>(); + test_results>(); + test_results>(); + + // Tests on non-fundamental types + test_user_defined_type(); + test_dangerous_copy_assign(); + return true; +} + +int main(int, char**) { + test_results(); + static_assert(test_results()); + return 0; +} diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h index ead8a3e8f87d2..702b82b9e15a7 100644 --- a/libcxx/test/support/test_iterators.h +++ b/libcxx/test/support/test_iterators.h @@ -1177,6 +1177,23 @@ rvalue_iterator(T*) -> rvalue_iterator; static_assert(std::random_access_iterator>); +// The ProxyDiffTBase allows us to conditionally specify Proxy::difference_type +// which we need in certain situations. For example when we want +// std::weakly_incrementable> to be true. +template +struct ProxyDiffTBase { + // Add default `operator<=>` so that the derived type, Proxy, can also use the default `operator<=>` + friend constexpr auto operator<=>(const ProxyDiffTBase&, const ProxyDiffTBase&) = default; +}; + +template + requires requires { std::iter_difference_t{}; } +struct ProxyDiffTBase { + using difference_type = std::iter_difference_t; + // Add default `operator<=>` so that the derived type, Proxy, can also use the default `operator<=>` + friend constexpr auto operator<=>(const ProxyDiffTBase&, const ProxyDiffTBase&) = default; +}; + // Proxy // ====================================================================== // Proxy that can wrap a value or a reference. It simulates C++23's tuple @@ -1187,6 +1204,7 @@ static_assert(std::random_access_iterator>); // This class is useful for testing that if algorithms support proxy iterator // properly, i.e. calling ranges::iter_swap and ranges::iter_move instead of // plain swap and std::move. + template struct Proxy; @@ -1197,7 +1215,7 @@ template inline constexpr bool IsProxy> = true; template -struct Proxy { +struct Proxy : ProxyDiffTBase { T data; constexpr T& getData() & { return data; } @@ -1248,7 +1266,7 @@ struct Proxy { // Compare operators are defined for the convenience of the tests friend constexpr bool operator==(const Proxy&, const Proxy&) - requires (std::equality_comparable && !std::is_reference_v) + requires(std::equality_comparable && !std::is_reference_v) = default; // Helps compare e.g. `Proxy` and `Proxy`. Note that the default equality comparison operator is deleted @@ -1260,7 +1278,7 @@ struct Proxy { } friend constexpr auto operator<=>(const Proxy&, const Proxy&) - requires (std::three_way_comparable && !std::is_reference_v) + requires(std::three_way_comparable && !std::is_reference_v) = default; // Helps compare e.g. `Proxy` and `Proxy`. 
Note that the default 3-way comparison operator is deleted when @@ -1270,6 +1288,22 @@ struct Proxy { requires std::three_way_comparable_with, std::decay_t> { return lhs.data <=> rhs.data; } + + // Needed to allow certain types to be weakly_incrementable + constexpr Proxy& operator++() + requires(std::weakly_incrementable>) + { + ++data; + return *this; + } + + constexpr Proxy operator++(int) + requires(std::incrementable>) + { + Proxy tmp = *this; + operator++(); + return tmp; + } }; // This is to make ProxyIterator model `std::indirectly_readable` diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 53252d5e2d673..febfb0f739e2c 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1081,7 +1081,6 @@ def add_version_header(tc): "name": "__cpp_lib_ranges_iota", "values": {"c++23": 202202}, "headers": ["numeric"], - "unimplemented": True, }, { "name": "__cpp_lib_ranges_join_with", From 68ce637872a2e1a1dedbb9caa6eacf15ec5c7b99 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sat, 5 Apr 2025 06:20:25 -0700 Subject: [PATCH 0738/1029] [mlir][Transforms][NFC] Dialect Conversion: Replace after legalizing constants (#134384) When folding an op during a conversion, first try to legalize all generated constants, then replace the original operation. This is slightly more efficient because fewer rewrites must be rolled back in case a generated constant could not be legalized. Note: This is in preparation of the One-Shot Dialect Conversion refactoring. --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 4554801b3a388..566dab7f30adb 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2109,9 +2109,6 @@ OperationLegalizer::legalizeWithFold(Operation *op, if (replacementValues.empty()) return legalize(op, rewriter); - // Insert a replacement for 'op' with the folded replacement values. - rewriter.replaceOp(op, replacementValues); - // Recursively legalize any new constant operations. for (unsigned i = curState.numRewrites, e = rewriterImpl.rewrites.size(); i != e; ++i) { @@ -2128,6 +2125,9 @@ OperationLegalizer::legalizeWithFold(Operation *op, } } + // Insert a replacement for 'op' with the folded replacement values. + rewriter.replaceOp(op, replacementValues); + LLVM_DEBUG(logSuccess(rewriterImpl.logger, "")); return success(); } From fcead25550bb727215919ecf2dfad17765223c19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 5 Apr 2025 16:53:05 +0300 Subject: [PATCH 0739/1029] [lldb] Fix building with GCC without asserts This case was missed in 03604a784011bec2292f900b118d825f34f8cf89. 
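The fix below mirrors the asserts-enabled branch of the macro: wrapping the expansion in do { ... } while (0) keeps it a single statement, and a function-local static std::once_flag gives each lldbassert call site its own rate limit, so a failing assertion is reported once per site rather than on every hit. A minimal self-contained sketch of that pattern, with illustrative names rather than the real lldb_private API:

#include <cstdio>
#include <mutex>

// Stand-in for lldb_private::_lldb_assert: report the first failure seen
// through this particular flag, then stay silent for that call site.
inline void assert_once(bool passed, const char *expr, const char *func,
                        const char *file, int line, std::once_flag &flag) {
  if (!passed)
    std::call_once(flag, [&] {
      std::fprintf(stderr, "%s:%d: %s: assertion '%s' failed\n", file, line,
                   func, expr);
    });
}

#define CHECK_ONCE(x)                                                         \
  do {                                                                        \
    static std::once_flag check_once_flag; /* one flag per expansion site */  \
    assert_once(static_cast<bool>(x), #x, __func__, __FILE__, __LINE__,       \
                check_once_flag);                                             \
  } while (0)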
--- lldb/include/lldb/Utility/LLDBAssert.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lldb/include/lldb/Utility/LLDBAssert.h b/lldb/include/lldb/Utility/LLDBAssert.h index cc0146be25998..e76dac46b94f4 100644 --- a/lldb/include/lldb/Utility/LLDBAssert.h +++ b/lldb/include/lldb/Utility/LLDBAssert.h @@ -27,8 +27,11 @@ } while (0) #else #define lldbassert(x) \ - lldb_private::_lldb_assert(static_cast(x), #x, __FUNCTION__, __FILE__, \ - __LINE__) + do { \ + static std::once_flag _once_flag; \ + lldb_private::_lldb_assert(static_cast(x), #x, __FUNCTION__, \ + __FILE__, __LINE__, _once_flag); \ + } while (0) #endif #endif From 13faa819168e568404bee320ca7db9bc386f081f Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Sat, 5 Apr 2025 16:49:05 +0200 Subject: [PATCH 0740/1029] [TypePromotion] Do not zero-extend getelementptr indexes since signed A miscompilation issue has been addressed with improved handling. Fixes: https://github.com/llvm/llvm-project/issues/133928. Alive2: https://alive2.llvm.org/ce/z/gcMNvS. --- llvm/lib/CodeGen/TypePromotion.cpp | 6 +-- .../test/CodeGen/AArch64/typepromotion-gep.ll | 41 +++++++++++++++++++ .../CodeGen/AArch64/typepromotion-phisret.ll | 22 +++++----- .../Transforms/TypePromotion/ARM/phis-ret.ll | 18 ++++---- .../Transforms/TypePromotion/ARM/pointers.ll | 28 ++++++------- 5 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/typepromotion-gep.ll diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp index b1f99094daa4a..e9fa78eabff7c 100644 --- a/llvm/lib/CodeGen/TypePromotion.cpp +++ b/llvm/lib/CodeGen/TypePromotion.cpp @@ -806,10 +806,10 @@ bool TypePromotionImpl::TryToPromote(Value *V, unsigned PromotedWidth, if (CurrentVisited.count(V)) return true; - // Ignore GEPs because they don't need promoting and the constant indices - // will prevent the transformation. + // Skip promoting GEPs as their indices should have already been + // canonicalized to pointer width. 
if (isa(V)) - return true; + return false; if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) { LLVM_DEBUG(dbgs() << "IR Promotion: Can't handle: " << *V << "\n"); diff --git a/llvm/test/CodeGen/AArch64/typepromotion-gep.ll b/llvm/test/CodeGen/AArch64/typepromotion-gep.ll new file mode 100644 index 0000000000000..760a913e46b31 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/typepromotion-gep.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -p typepromotion %s -o - | FileCheck %s + +target triple = "arm64-apple-macosx15.0.0" + +define i4 @gep_offset_signedness(ptr %ptr, i8 %offset, i1 %cond) { +; CHECK-LABEL: define i4 @gep_offset_signedness( +; CHECK-SAME: ptr [[PTR:%.*]], i8 [[OFFSET:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[UNUSED_TRUNC:%.*]] = trunc i8 [[OFFSET]] to i4 +; CHECK-NEXT: [[PTR_IDX:%.*]] = getelementptr i8, ptr [[PTR]], i8 [[OFFSET]] +; CHECK-NEXT: [[COND_2:%.*]] = icmp uge ptr [[PTR_IDX]], [[PTR]] +; CHECK-NEXT: br i1 [[COND_2]], label %[[RETURN:.*]], label %[[ELSE:.*]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RET_VAL:%.*]] = phi i4 [ 0, %[[ELSE_RET:.*]] ], [ 1, %[[ENTRY]] ], [ 0, %[[ELSE]] ] +; CHECK-NEXT: ret i4 [[RET_VAL]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 0, 0 +; CHECK-NEXT: [[COND_3:%.*]] = icmp ult i8 [[ADD]], [[OFFSET]] +; CHECK-NEXT: br i1 [[COND]], label %[[RETURN]], label %[[ELSE_RET]] +; CHECK: [[ELSE_RET]]: +; CHECK-NEXT: br label %[[RETURN]] +; +entry: + %unused_trunc = trunc i8 %offset to i4 + %ptr.idx = getelementptr i8, ptr %ptr, i8 %offset + %cond.2 = icmp uge ptr %ptr.idx, %ptr + br i1 %cond.2, label %return, label %else + +return: ; preds = %else.ret, %else, %entry + %ret.val = phi i4 [ 0, %else.ret ], [ 1, %entry ], [ 0, %else ] + ret i4 %ret.val + +else: ; preds = %entry + %add = add nuw i8 0, 0 + %cond.3 = icmp ult i8 %add, %offset + br i1 %cond, label %return, label %else.ret + +else.ret: ; preds = %else + br label %return +} diff --git a/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll b/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll index d60578b7bafe4..a9164312b99cc 100644 --- a/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll +++ b/llvm/test/CodeGen/AArch64/typepromotion-phisret.ll @@ -237,25 +237,27 @@ define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, ptr %res) { define i16 @signext_bitcast_phi_select(i16 signext %start, ptr %in) { ; CHECK-LABEL: signext_bitcast_phi_select: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: and w9, w0, #0xffff -; CHECK-NEXT: cmp w8, w9, sxth +; CHECK-NEXT: mov w9, #-1 // =0xffffffff +; CHECK-NEXT: mov w10, #32768 // =0x8000 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: cmp w9, w0, sxth ; CHECK-NEXT: b.lt .LBB6_3 ; CHECK-NEXT: .LBB6_1: // %if.then ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrh w0, [x1, w9, sxtw #1] -; CHECK-NEXT: cmp w0, w9 +; CHECK-NEXT: sxth x8, w0 +; CHECK-NEXT: ldrh w8, [x1, x8, lsl #1] +; CHECK-NEXT: cmp w8, w0, uxth ; CHECK-NEXT: b.eq .LBB6_4 ; CHECK-NEXT: // %bb.2: // %if.else ; CHECK-NEXT: // in Loop: Header=BB6_1 Depth=1 -; CHECK-NEXT: lsr w10, w9, #15 -; CHECK-NEXT: eor w10, w10, #0x1 -; CHECK-NEXT: add w9, w10, w9 -; CHECK-NEXT: cmp w8, w9, sxth +; CHECK-NEXT: bic w8, w10, w0 +; CHECK-NEXT: add w0, w0, w8, lsr #15 +; CHECK-NEXT: cmp w9, w0, sxth ; CHECK-NEXT: b.ge .LBB6_1 ; CHECK-NEXT: .LBB6_3: -; CHECK-NEXT: mov w0, wzr +; 
CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: .LBB6_4: // %exit +; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret entry: %const = bitcast i16 -1 to i16 diff --git a/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll b/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll index 6f41742e66e53..d3ba82460a18a 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/phis-ret.ll @@ -293,28 +293,24 @@ define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, ptr %res) { define i16 @signext_bitcast_phi_select(i16 signext %start, ptr %in) { ; CHECK-LABEL: @signext_bitcast_phi_select( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[START:%.*]] to i32 ; CHECK-NEXT: [[CONST:%.*]] = bitcast i16 -1 to i16 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[SELECT:%.*]], [[IF_ELSE:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[IDX]] to i16 +; CHECK-NEXT: [[TMP1:%.*]] = phi i16 [ [[SELECT:%.*]], [[IF_ELSE:%.*]] ], [ [[START:%.*]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i16 [[TMP1]], [[CONST]] ; CHECK-NEXT: br i1 [[CMP_I]], label [[EXIT:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[IDX_NEXT:%.*]] = getelementptr i16, ptr [[IN:%.*]], i32 [[IDX]] +; CHECK-NEXT: [[IDX_NEXT:%.*]] = getelementptr i16, ptr [[IN:%.*]], i16 [[TMP1]] ; CHECK-NEXT: [[LD:%.*]] = load i16, ptr [[IDX_NEXT]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[LD]] to i32 -; CHECK-NEXT: [[CMP1_I:%.*]] = icmp eq i32 [[TMP2]], [[IDX]] +; CHECK-NEXT: [[CMP1_I:%.*]] = icmp eq i16 [[LD]], [[TMP1]] ; CHECK-NEXT: br i1 [[CMP1_I]], label [[EXIT]], label [[IF_ELSE]] ; CHECK: if.else: -; CHECK-NEXT: [[LOBIT:%.*]] = lshr i32 [[IDX]], 15 -; CHECK-NEXT: [[LOBIT_NOT:%.*]] = xor i32 [[LOBIT]], 1 -; CHECK-NEXT: [[SELECT]] = add nuw i32 [[LOBIT_NOT]], [[IDX]] +; CHECK-NEXT: [[LOBIT:%.*]] = lshr i16 [[TMP1]], 15 +; CHECK-NEXT: [[LOBIT_NOT:%.*]] = xor i16 [[LOBIT]], 1 +; CHECK-NEXT: [[SELECT]] = add nuw i16 [[LOBIT_NOT]], [[TMP1]] ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: exit: -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[RES]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = phi i16 [ [[LD]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] ; CHECK-NEXT: ret i16 [[TMP3]] ; entry: diff --git a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll index 362b4ec73401c..187fc8b7267b8 100644 --- a/llvm/test/Transforms/TypePromotion/ARM/pointers.ll +++ b/llvm/test/Transforms/TypePromotion/ARM/pointers.ll @@ -4,20 +4,18 @@ define void @phi_pointers(ptr %a, ptr %b, i8 zeroext %M, i8 zeroext %N) { ; CHECK-LABEL: @phi_pointers( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[M:%.*]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[N:%.*]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP0]], 1 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[ADD]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[M:%.*]], 1 +; CHECK-NEXT: [[AND:%.*]] = and i8 [[ADD]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[ADD]], [[N:%.*]] ; CHECK-NEXT: [[BASE:%.*]] = select i1 [[CMP]], ptr [[A:%.*]], ptr [[B:%.*]] ; CHECK-NEXT: [[OTHER:%.*]] = select i1 [[CMP]], ptr [[B]], ptr [[B]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[BASE]], [[ENTRY:%.*]] ], [ [[GEP:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IDX:%.*]] = phi i32 
[ [[AND]], [[ENTRY]] ], [ [[INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[AND]], [[ENTRY]] ], [ [[INC:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[LOAD:%.*]] = load i16, ptr [[PTR]], align 2
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IDX]], 1
-; CHECK-NEXT: [[GEP]] = getelementptr inbounds i16, ptr [[PTR]], i32 [[INC]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i8 [[IDX]], 1
+; CHECK-NEXT: [[GEP]] = getelementptr inbounds i16, ptr [[PTR]], i8 [[INC]]
; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[GEP]], [[OTHER]]
; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
; CHECK: exit:
@@ -47,11 +45,9 @@ exit:
define void @phi_pointers_null(ptr %a, ptr %b, i8 zeroext %M, i8 zeroext %N) {
; CHECK-LABEL: @phi_pointers_null(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[M:%.*]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[N:%.*]] to i32
-; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[TMP0]], 1
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[ADD]], [[TMP1]]
+; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[M:%.*]], 1
+; CHECK-NEXT: [[AND:%.*]] = and i8 [[ADD]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[ADD]], [[N:%.*]]
; CHECK-NEXT: [[BASE:%.*]] = select i1 [[CMP]], ptr [[A:%.*]], ptr [[B:%.*]]
; CHECK-NEXT: [[OTHER:%.*]] = select i1 [[CMP]], ptr [[B]], ptr [[B]]
; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq ptr [[BASE]], [[OTHER]]
@@ -60,13 +56,13 @@ define void @phi_pointers_null(ptr %a, ptr %b, i8 zeroext %M, i8 zeroext %N) {
; CHECK-NEXT: br label [[LOOP]]
; CHECK: loop:
; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[BASE]], [[ENTRY:%.*]] ], [ null, [[FAIL]] ], [ [[GEP:%.*]], [[IF_THEN:%.*]] ]
-; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[AND]], [[ENTRY]] ], [ 0, [[FAIL]] ], [ [[INC:%.*]], [[IF_THEN]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i8 [ [[AND]], [[ENTRY]] ], [ 0, [[FAIL]] ], [ [[INC:%.*]], [[IF_THEN]] ]
; CHECK-NEXT: [[UNDEF:%.*]] = icmp eq ptr [[PTR]], undef
; CHECK-NEXT: br i1 [[UNDEF]], label [[EXIT:%.*]], label [[IF_THEN]]
; CHECK: if.then:
; CHECK-NEXT: [[LOAD:%.*]] = load i16, ptr [[PTR]], align 2
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IDX]], 1
-; CHECK-NEXT: [[GEP]] = getelementptr inbounds i16, ptr [[PTR]], i32 [[INC]]
+; CHECK-NEXT: [[INC]] = add nuw nsw i8 [[IDX]], 1
+; CHECK-NEXT: [[GEP]] = getelementptr inbounds i16, ptr [[PTR]], i8 [[INC]]
; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[GEP]], [[OTHER]]
; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]]
; CHECK: exit:
From c07ab9e2ab0f288c10af172d11a3936b89b952fb Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 5 Apr 2025 16:22:47 +0100
Subject: [PATCH 0741/1029] [VPlan] Set debug location for recipes in
 VPBB::executeRecipes.

Set the debug location for each recipe before executing the recipe,
instead of setting the debug location ad hoc during individual recipe
execution. This simplifies the code and ensures that all recipes respect
the recipe's debug location. There are some minor test changes where we
previously reused an earlier debug location.
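The hoisting is sound because IRBuilder carries a current debug location that it stamps onto every instruction it creates, so setting it once before a recipe's execute() covers all IR that recipe emits. A small sketch of that IRBuilder behavior (a standalone demo function, not code from this patch):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

// Every instruction built after SetCurrentDebugLocation(DL) carries DL,
// with no per-instruction bookkeeping needed.
llvm::Function *emitDemo(llvm::Module &M, llvm::DebugLoc DL) {
  llvm::LLVMContext &Ctx = M.getContext();
  auto *Int32Ty = llvm::Type::getInt32Ty(Ctx);
  auto *FTy = llvm::FunctionType::get(Int32Ty, {Int32Ty}, /*isVarArg=*/false);
  auto *F =
      llvm::Function::Create(FTy, llvm::Function::ExternalLinkage, "demo", M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", F));
  B.SetCurrentDebugLocation(DL); // set once; CreateAdd/CreateRet inherit it
  B.CreateRet(B.CreateAdd(F->getArg(0), B.getInt32(1)));
  return F;
}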
--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 4 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 23 ----- .../LoopVectorize/dbg-outer-loop-vect.ll | 87 ++++++++++--------- .../test/Transforms/LoopVectorize/debugloc.ll | 6 +- .../LoopVectorize/if-pred-non-void.ll | 4 +- 5 files changed, 52 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1e2f70e5c103e..85fd34d79be42 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -552,8 +552,10 @@ void VPBasicBlock::executeRecipes(VPTransformState *State, BasicBlock *BB) { State->CFG.PrevVPBB = this; - for (VPRecipeBase &Recipe : Recipes) + for (VPRecipeBase &Recipe : Recipes) { + State->setDebugLocFrom(Recipe.getDebugLoc()); Recipe.execute(*State); + } LLVM_DEBUG(dbgs() << "LV: filled BB:" << *BB); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 7b5c6b6f6f76e..ea56b4fa3c833 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -327,7 +327,6 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, } void VPPartialReductionRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; assert(getOpcode() == Instruction::Add && @@ -882,7 +881,6 @@ void VPInstruction::execute(VPTransformState &State) { "Recipe not a FPMathOp but has fast-math flags?"); if (hasFastMathFlags()) State.Builder.setFastMathFlags(getFastMathFlags()); - State.setDebugLocFrom(getDebugLoc()); bool GeneratesPerFirstLaneOnly = canGenerateScalarForFirstLane() && (vputils::onlyFirstLaneUsed(this) || isVectorToScalar() || isSingleScalar()); @@ -1163,7 +1161,6 @@ void VPIRPhi::print(raw_ostream &O, const Twine &Indent, void VPWidenCallRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); - State.setDebugLocFrom(getDebugLoc()); FunctionType *VFTy = Variant->getFunctionType(); // Add return type if intrinsic is overloaded on it. @@ -1232,7 +1229,6 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { assert(State.VF.isVector() && "not widening"); - State.setDebugLocFrom(getDebugLoc()); SmallVector TysForDecl; // Add return type if intrinsic is overloaded on it. @@ -1355,7 +1351,6 @@ void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPHistogramRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); IRBuilderBase &Builder = State.Builder; Value *Address = State.get(getOperand(0)); @@ -1456,8 +1451,6 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenSelectRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); - // The condition can be loop invariant but still defined inside the // loop. This means that we can't just use the original 'cond' value. // We have to take the 'vectorized' value and pick the first lane. 
@@ -1569,7 +1562,6 @@ void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const { #endif void VPWidenRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; switch (Opcode) { case Instruction::Call: @@ -1750,7 +1742,6 @@ void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPWidenCastRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); auto &Builder = State.Builder; /// Vectorize casts. assert(State.VF.isVector() && "Not vectorizing?"); @@ -2029,7 +2020,6 @@ void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent, #endif void VPScalarIVStepsRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); // Fast-math-flags propagate from the original induction instruction. IRBuilder<>::FastMathFlagGuard FMFG(State.Builder); if (hasFastMathFlags()) @@ -2223,7 +2213,6 @@ static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, void VPVectorEndPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); unsigned CurrentPart = getUnrollPart(*this); Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true, CurrentPart, Builder); @@ -2259,7 +2248,6 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, void VPVectorPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); unsigned CurrentPart = getUnrollPart(*this); Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, CurrentPart, Builder); @@ -2285,7 +2273,6 @@ void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, void VPBlendRecipe::execute(VPTransformState &State) { assert(isNormalized() && "Expected blend to be normalized!"); - State.setDebugLocFrom(getDebugLoc()); // We know that all PHIs in non-header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. @@ -2367,7 +2354,6 @@ void VPReductionRecipe::execute(VPTransformState &State) { // Propagate the fast-math flags carried by the underlying instruction. 
IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); State.Builder.setFastMathFlags(getFastMathFlags()); - State.setDebugLocFrom(getDebugLoc()); Value *NewVecOp = State.get(getVecOp()); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, State.VF.isScalar()); @@ -2570,7 +2556,6 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, #endif Value *VPScalarCastRecipe ::generate(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); assert(vputils::onlyFirstLaneUsed(this) && "Codegen only implemented for first lane."); switch (Opcode) { @@ -2602,7 +2587,6 @@ void VPScalarCastRecipe ::print(raw_ostream &O, const Twine &Indent, #endif void VPBranchOnMaskRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); assert(State.Lane && "Branch on Mask works only on single instance."); VPValue *BlockInMask = getOperand(0); @@ -2628,7 +2612,6 @@ InstructionCost VPBranchOnMaskRecipe::computeCost(ElementCount VF, } void VPPredInstPHIRecipe::execute(VPTransformState &State) { - State.setDebugLocFrom(getDebugLoc()); assert(State.Lane && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = cast(State.get(getOperand(0), *State.Lane)); @@ -2734,7 +2717,6 @@ void VPWidenLoadRecipe::execute(VPTransformState &State) { bool CreateGather = !isConsecutive(); auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); Value *Mask = nullptr; if (auto *VPMask = getMask()) { // Mask reversal is only needed for non-all-one (null) masks, as reverse @@ -2793,7 +2775,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { bool CreateGather = !isConsecutive(); auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); CallInst *NewLI; Value *EVL = State.get(getEVL(), VPLane(0)); Value *Addr = State.get(getAddr(), !CreateGather); @@ -2868,7 +2849,6 @@ void VPWidenStoreRecipe::execute(VPTransformState &State) { const Align Alignment = getLoadStoreAlignment(&Ingredient); auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); Value *Mask = nullptr; if (auto *VPMask = getMask()) { @@ -2914,7 +2894,6 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { const Align Alignment = getLoadStoreAlignment(&Ingredient); auto &Builder = State.Builder; - State.setDebugLocFrom(getDebugLoc()); CallInst *NewSI = nullptr; Value *StoredVal = State.get(StoredValue); @@ -3717,7 +3696,6 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) { assert(EnableVPlanNativePath && "Non-native vplans are not expected to have VPWidenPHIRecipes."); - State.setDebugLocFrom(getDebugLoc()); Value *Op0 = State.get(getOperand(0)); Type *VecTy = Op0->getType(); Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name); @@ -3743,7 +3721,6 @@ void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { PHINode *Phi = State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask"); Phi->addIncoming(StartMask, VectorPH); - Phi->setDebugLoc(getDebugLoc()); State.set(this, Phi); } diff --git a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll index 15510060e0c6c..e6b8946f10723 100644 --- a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll @@ -15,23 +15,23 @@ define void @foo(ptr %h) !dbg !4 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_COND_CLEANUP32:%.*]] ] ; CHECK-NEXT: br label [[FOR_COND5_PREHEADER1:%.*]], !dbg 
[[DBG21]] ; CHECK: for.cond5.preheader1: -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR_COND5_PREHEADER1]] ], !dbg [[DBG34:![0-9]+]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR_COND5_PREHEADER1]] ], !dbg [[DBG22:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[H]], <4 x i64> [[VEC_PHI]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> zeroinitializer, <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG22:![0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 1, !dbg [[DBG22]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 1), <4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG22]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 2, !dbg [[DBG22]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 2), <4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG22]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 3, !dbg [[DBG22]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 3), <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG22]] -; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1), !dbg [[DBG24:![0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 5), !dbg [[DBG25:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0, !dbg [[DBG26:![0-9]+]] -; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]], !dbg [[DBG26]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> zeroinitializer, <4 x ptr> [[TMP0]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 1, !dbg [[DBG25:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 1), <4 x ptr> [[TMP1]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 2, !dbg [[DBG25]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 2), <4 x ptr> [[TMP2]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x ptr> [[TMP0]], i64 3, !dbg [[DBG25]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> splat (i32 3), <4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true)), !dbg [[DBG23]] +; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], splat (i64 1), !dbg [[DBG26:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], splat (i64 5), !dbg [[DBG27:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0, !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]], !dbg [[DBG28]] ; CHECK: vector.latch: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG21]] ; CHECK: scalar.ph: @@ -40,27 +40,27 @@ define void @foo(ptr %h) !dbg !4 { ; CHECK: 
for.cond1.preheader: ; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC13:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] ; CHECK-NEXT: #dbg_value(i64 [[I_023]], [[META11]], !DIExpression(), [[META20]]) -; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]], !dbg [[DBG26]] +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]], !dbg [[DBG28]] ; CHECK: for.cond5.preheader: -; CHECK-NEXT: [[L_022:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INC10:%.*]], [[FOR_COND5_PREHEADER]] ], !dbg [[DBG34]] +; CHECK-NEXT: [[L_022:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INC10:%.*]], [[FOR_COND5_PREHEADER]] ], !dbg [[DBG22]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[H]], i64 [[L_022]] -; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !dbg [[DBG22]] -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr i32, ptr [[TMP10]], i64 1, !dbg [[DBG31:![0-9]+]] -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX_1]], align 4, !dbg [[DBG22]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr i32, ptr [[TMP10]], i64 2, !dbg [[DBG31]] -; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX_2]], align 4, !dbg [[DBG22]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr i32, ptr [[TMP10]], i64 3, !dbg [[DBG31]] -; CHECK-NEXT: store i32 3, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG22]] -; CHECK-NEXT: [[INC10]] = add nuw nsw i64 [[L_022]], 1, !dbg [[DBG24]] -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC10]], 5, !dbg [[DBG25]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_COND5_PREHEADER]], !dbg [[DBG26]] +; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4, !dbg [[DBG23]] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr i32, ptr [[TMP10]], i64 1, !dbg [[DBG25]] +; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX_1]], align 4, !dbg [[DBG23]] +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr i32, ptr [[TMP10]], i64 2, !dbg [[DBG25]] +; CHECK-NEXT: store i32 2, ptr [[ARRAYIDX_2]], align 4, !dbg [[DBG23]] +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr i32, ptr [[TMP10]], i64 3, !dbg [[DBG25]] +; CHECK-NEXT: store i32 3, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG23]] +; CHECK-NEXT: [[INC10]] = add nuw nsw i64 [[L_022]], 1, !dbg [[DBG26]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC10]], 5, !dbg [[DBG27]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_COND5_PREHEADER]], !dbg [[DBG28]] ; CHECK: for.cond.cleanup3: -; CHECK-NEXT: [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG32:![0-9]+]] +; CHECK-NEXT: [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG33:![0-9]+]] ; CHECK-NEXT: #dbg_value(i64 [[INC13]], [[META11]], !DIExpression(), [[META20]]) -; CHECK-NEXT: [[EXITCOND24_NOT:%.*]] = icmp eq i64 [[INC13]], 23, !dbg [[DBG33:![0-9]+]] -; CHECK-NEXT: br i1 [[EXITCOND24_NOT]], label [[EXIT]], label [[FOR_COND1_PREHEADER]], !dbg [[DBG21]], !llvm.loop [[LOOP34:![0-9]+]] +; CHECK-NEXT: [[EXITCOND24_NOT:%.*]] = icmp eq i64 [[INC13]], 23, !dbg [[DBG34:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND24_NOT]], label [[EXIT]], label [[FOR_COND1_PREHEADER]], !dbg [[DBG21]], !llvm.loop [[LOOP35:![0-9]+]] ; CHECK: exit: -; CHECK-NEXT: ret void, !dbg [[DBG35:![0-9]+]] +; CHECK-NEXT: ret void, !dbg [[DBG36:![0-9]+]] ; entry: call void @llvm.dbg.value(metadata i64 0, metadata !11, metadata !DIExpression()), !dbg !20 @@ -156,18 +156,19 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; CHECK: [[META19]] = distinct !DILexicalBlock(scope: [[META15]], file: [[META1]], line: 11, column: 5) ; CHECK: [[META20]] = !DILocation(line: 0, scope: [[META12]]) 
; CHECK: [[DBG21]] = !DILocation(line: 10, column: 3, scope: [[META12]]) -; CHECK: [[DBG22]] = !DILocation(line: 13, column: 11, scope: [[META23:![0-9]+]]) -; CHECK: [[META23]] = distinct !DILexicalBlock(scope: [[META18]], file: [[META1]], line: 12, column: 7) -; CHECK: [[DBG24]] = !DILocation(line: 11, column: 32, scope: [[META19]]) -; CHECK: [[DBG25]] = !DILocation(line: 11, column: 26, scope: [[META19]]) -; CHECK: [[DBG26]] = !DILocation(line: 11, column: 5, scope: [[META15]]) -; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[DBG21]], [[META28:![0-9]+]], [[META29:![0-9]+]], [[META30:![0-9]+]]} -; CHECK: [[META28]] = !DILocation(line: 13, column: 13, scope: [[META12]]) -; CHECK: [[META29]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META30]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[DBG31]] = !DILocation(line: 13, column: 2, scope: [[META23]]) -; CHECK: [[DBG32]] = !DILocation(line: 10, column: 30, scope: [[META16]]) -; CHECK: [[DBG33]] = !DILocation(line: 10, column: 24, scope: [[META16]]) -; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[DBG21]], [[META28]], [[META29]]} -; CHECK: [[DBG35]] = !DILocation(line: 14, column: 1, scope: [[DBG4]]) +; CHECK: [[DBG22]] = !DILocation(line: 10, column: 5, scope: [[META12]]) +; CHECK: [[DBG23]] = !DILocation(line: 13, column: 11, scope: [[META24:![0-9]+]]) +; CHECK: [[META24]] = distinct !DILexicalBlock(scope: [[META18]], file: [[META1]], line: 12, column: 7) +; CHECK: [[DBG25]] = !DILocation(line: 13, column: 2, scope: [[META24]]) +; CHECK: [[DBG26]] = !DILocation(line: 11, column: 32, scope: [[META19]]) +; CHECK: [[DBG27]] = !DILocation(line: 11, column: 26, scope: [[META19]]) +; CHECK: [[DBG28]] = !DILocation(line: 11, column: 5, scope: [[META15]]) +; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[DBG21]], [[META30:![0-9]+]], [[META31:![0-9]+]], [[META32:![0-9]+]]} +; CHECK: [[META30]] = !DILocation(line: 13, column: 13, scope: [[META12]]) +; CHECK: [[META31]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META32]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[DBG33]] = !DILocation(line: 10, column: 30, scope: [[META16]]) +; CHECK: [[DBG34]] = !DILocation(line: 10, column: 24, scope: [[META16]]) +; CHECK: [[LOOP35]] = distinct !{[[LOOP35]], [[DBG21]], [[META30]], [[META31]]} +; CHECK: [[DBG36]] = !DILocation(line: 14, column: 1, scope: [[DBG4]]) ;. 
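The large rename-only churn above follows from how these autogenerated checks capture metadata ids: a [[NAME:![0-9]+]] occurrence binds NAME to whatever !N id FileCheck matches there, and every later bare [[NAME]] must equal that binding, so inserting one new DILocation shifts every subsequent definition line. A contrived two-line illustration of the capture-then-reuse pattern (not taken from the test):

; CHECK: store i32 0, ptr %p, align 4, !dbg [[LOC:![0-9]+]]
; CHECK: store i32 1, ptr %q, align 4, !dbg [[LOC]]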
diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll index c31f438feae6e..8fe355c6d567d 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll @@ -67,8 +67,8 @@ define i32 @test_debug_loc_on_branch_in_loop(ptr noalias %src, ptr noalias %dst) ; CHECK-NEXT: br i1 [[EXT]], label %pred.store.if, label %pred.store.continue, !dbg [[LOC3]] ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: -; CHECK-NEXT: [[GEP:%.+]] = getelementptr inbounds i32, ptr %dst, i64 {{.+}}, !dbg [[LOC3]] -; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4, !dbg [[LOC3]] +; CHECK-NEXT: [[GEP:%.+]] = getelementptr inbounds i32, ptr %dst, i64 {{.+}} +; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-NEXT: br label %pred.store.continue, !dbg [[LOC3]] ; CHECK-EMPTY: ; @@ -107,7 +107,7 @@ define i32 @test_different_debug_loc_on_replicate_recipe(ptr noalias %src, ptr n ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: ; CHECK-NEXT: [[GEP:%.+]] = getelementptr inbounds i32, ptr %dst, i64 {{.+}}, !dbg [[LOC5:!.+]] -; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4, !dbg [[LOC5]] +; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-NEXT: br label %pred.store.continue, !dbg [[LOC4]] ; CHECK-EMPTY: ; diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll index fa03c30b8752d..d717a3feed3ea 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -636,8 +636,8 @@ define void @pr30172(ptr nocapture %asd, ptr nocapture %bsd) !dbg !5 {; ; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP5]], 100 ; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = xor i1 [[TMP12]], true, !dbg [[DBG34:![0-9]+]] ; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP4]], 200, !dbg [[DBG34]] -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP5]], 200, !dbg [[DBG34]] +; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = icmp sge i32 [[TMP4]], 200 +; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp sge i32 [[TMP5]], 200 ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i1 [[TMP15]], i1 false, !dbg [[DBG35:![0-9]+]] ; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = select i1 [[TMP14]], i1 [[TMP16]], i1 false, !dbg [[DBG35]] ; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP12]] From 6ac5cbdd29b630259432fc5b478cd2ddf78a17cd Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Sat, 5 Apr 2025 09:41:50 -0700 Subject: [PATCH 0742/1029] [NFC][LLVM] Eliminate duplicate code in INITIALIZE_PASS macros (#134457) - Refactor INITIALIZE_PASS and INITIALIZE_PASS_WITH_OPTIONS macros to eliminate some code duplication. 
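The deduplication works because INITIALIZE_PASS_BEGIN opens the registration function and INITIALIZE_PASS_END closes it, so the formerly hand-expanded one-shot macros can simply compose the two halves. A stripped-down model of the same layering, using made-up names rather than the real PassSupport.h machinery:

#include <cstdio>

// BEGIN opens the registration function's body; END closes it. The one-shot
// macro is then just BEGIN followed by END, with nothing duplicated.
#define REGISTER_BEGIN(name) void register_##name() { std::puts("begin " #name);
#define REGISTER_END(name)   std::puts("end " #name); }

#define REGISTER(name)                                                        \
  REGISTER_BEGIN(name)                                                        \
  REGISTER_END(name)

REGISTER(demo) // expands to: void register_demo() { ... }

int main() { register_demo(); }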
--- llvm/include/llvm/PassSupport.h | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/PassSupport.h b/llvm/include/llvm/PassSupport.h index d3ba32f22efe1..57210b2488b53 100644 --- a/llvm/include/llvm/PassSupport.h +++ b/llvm/include/llvm/PassSupport.h @@ -35,20 +35,6 @@ namespace llvm { class Pass; -#define INITIALIZE_PASS(passName, arg, name, cfg, analysis) \ - static void *initialize##passName##PassOnce(PassRegistry &Registry) { \ - PassInfo *PI = new PassInfo( \ - name, arg, &passName::ID, \ - PassInfo::NormalCtor_t(callDefaultCtor), cfg, analysis); \ - Registry.registerPass(*PI, true); \ - return PI; \ - } \ - static llvm::once_flag Initialize##passName##PassFlag; \ - void llvm::initialize##passName##Pass(PassRegistry &Registry) { \ - llvm::call_once(Initialize##passName##PassFlag, \ - initialize##passName##PassOnce, std::ref(Registry)); \ - } - #define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis) \ static void *initialize##passName##PassOnce(PassRegistry &Registry) { @@ -67,15 +53,18 @@ class Pass; initialize##passName##PassOnce, std::ref(Registry)); \ } -#define INITIALIZE_PASS_WITH_OPTIONS(PassName, Arg, Name, Cfg, Analysis) \ - INITIALIZE_PASS_BEGIN(PassName, Arg, Name, Cfg, Analysis) \ - PassName::registerOptions(); \ - INITIALIZE_PASS_END(PassName, Arg, Name, Cfg, Analysis) +#define INITIALIZE_PASS(passName, arg, name, cfg, analysis) \ + INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis) \ + INITIALIZE_PASS_END(passName, arg, name, cfg, analysis) #define INITIALIZE_PASS_WITH_OPTIONS_BEGIN(PassName, Arg, Name, Cfg, Analysis) \ INITIALIZE_PASS_BEGIN(PassName, Arg, Name, Cfg, Analysis) \ PassName::registerOptions(); +#define INITIALIZE_PASS_WITH_OPTIONS(PassName, Arg, Name, Cfg, Analysis) \ + INITIALIZE_PASS_WITH_OPTIONS_BEGIN(PassName, Arg, Name, Cfg, Analysis) \ + INITIALIZE_PASS_END(PassName, Arg, Name, Cfg, Analysis) + template < class PassName, std::enable_if_t{}, bool> = true> From 33246f79e87a0e629ae776d1811a1175a3f10065 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 11:16:12 -0700 Subject: [PATCH 0743/1029] [MC] Rework evaluateSymbolicAdd to eliminate MCValue::SymB reliance Reworked evaluateSymbolicAdd and isSymbolRefDifferenceFullyResolved to remove their reliance on MCValue::SymB, which previously used the MCSymbolRefExpr member when folding two symbolic expressions. This dependency prevented replacing MCValue::SymB with a MCSymbol. By refactoring, we enable this replacement, which is a more significant improvement. Note that this change eliminates the rare "unsupported subtraction of qualified symbol" diagnostic, resulting in a minor loss of information. However, the benefit of enabling MCValue::SymB replacement with MCSymbol outweighs this slight regression. 
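For readers following the rework: a relocatable MCValue is at most AddSym - SubSym + Constant, and the sum of two such values stays representable only when, after cancelling an added symbol in one operand against the same subtracted symbol in the other, at most one added and one subtracted symbol remain. For example, (A - B + 4) + (B - C + 6) folds to A - C + 10. A deliberately simplified stand-in for that representability check (toy types, and it skips the layout-dependent checks the real attemptToFoldSymbolOffsetDifference performs before cancelling):

#include <cstdint>
#include <optional>
#include <string>

struct RelocValue {
  std::string AddSym; // empty string means "no symbol"
  std::string SubSym;
  int64_t Cst = 0;
};

// Fold L + R: cancel matching add/sub symbol pairs across the operands, then
// fail if two added or two subtracted symbols would survive.
std::optional<RelocValue> addValues(RelocValue L, RelocValue R) {
  if (!L.AddSym.empty() && L.AddSym == R.SubSym) {
    L.AddSym.clear();
    R.SubSym.clear();
  }
  if (!R.AddSym.empty() && R.AddSym == L.SubSym) {
    R.AddSym.clear();
    L.SubSym.clear();
  }
  if ((!L.AddSym.empty() && !R.AddSym.empty()) ||
      (!L.SubSym.empty() && !R.SubSym.empty()))
    return std::nullopt; // not expressible as A - B + C
  return RelocValue{L.AddSym.empty() ? R.AddSym : L.AddSym,
                    L.SubSym.empty() ? R.SubSym : L.SubSym, L.Cst + R.Cst};
}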
--- llvm/include/llvm/MC/MCExpr.h | 3 + llvm/include/llvm/MC/MCObjectWriter.h | 3 +- llvm/include/llvm/MC/MCValue.h | 12 +- llvm/lib/MC/MCAssembler.cpp | 7 - llvm/lib/MC/MCExpr.cpp | 143 +++++++++--------- llvm/lib/MC/MCObjectWriter.cpp | 14 +- llvm/test/MC/AArch64/elf-reloc-ptrauth.s | 10 +- .../AArch64/label-arithmetic-diags-darwin.s | 8 +- llvm/test/MC/ELF/bad-expr.s | 4 +- llvm/test/MC/X86/macho-reloc-errors-x86_64.s | 2 +- 10 files changed, 93 insertions(+), 113 deletions(-) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 782f7ea8957d9..3127a93d32581 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -259,6 +259,9 @@ class MCSymbolRefExpr : public MCExpr { VariantKind getKind() const { return (VariantKind)(getSubclassData() & VariantKindMask); } + uint16_t getSpecifier() const { + return (getSubclassData() & VariantKindMask); + } bool hasSubsectionsViaSymbols() const { return (getSubclassData() & HasSubsectionsViaSymbolsBit) != 0; diff --git a/llvm/include/llvm/MC/MCObjectWriter.h b/llvm/include/llvm/MC/MCObjectWriter.h index 81ba6ffd5d44e..da919c941e507 100644 --- a/llvm/include/llvm/MC/MCObjectWriter.h +++ b/llvm/include/llvm/MC/MCObjectWriter.h @@ -83,8 +83,7 @@ class MCObjectWriter { /// Clients are not required to answer precisely and may conservatively return /// false, even when a difference is fully resolved. bool isSymbolRefDifferenceFullyResolved(const MCAssembler &Asm, - const MCSymbolRefExpr *A, - const MCSymbolRefExpr *B, + const MCSymbol &A, const MCSymbol &B, bool InSet) const; virtual bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm, diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h index 4b789412325e4..d291c4cb5aff0 100644 --- a/llvm/include/llvm/MC/MCValue.h +++ b/llvm/include/llvm/MC/MCValue.h @@ -36,15 +36,17 @@ class raw_ostream; class MCValue { const MCSymbolRefExpr *SymA = nullptr, *SymB = nullptr; int64_t Cst = 0; - uint32_t RefKind = 0; + uint32_t Specifier = 0; public: + friend class MCExpr; MCValue() = default; int64_t getConstant() const { return Cst; } const MCSymbolRefExpr *getSymA() const { return SymA; } const MCSymbolRefExpr *getSymB() const { return SymB; } - uint32_t getRefKind() const { return RefKind; } - void setSpecifier(uint32_t S) { RefKind = S; } + uint32_t getRefKind() const { return Specifier; } + uint32_t getSpecifier() const { return Specifier; } + void setSpecifier(uint32_t S) { Specifier = S; } const MCSymbol *getAddSym() const { return SymA ? 
&SymA->getSymbol() : nullptr; @@ -71,7 +73,7 @@ class MCValue { R.Cst = Val; R.SymA = SymA; R.SymB = SymB; - R.RefKind = RefKind; + R.Specifier = RefKind; return R; } @@ -80,7 +82,7 @@ class MCValue { R.Cst = Val; R.SymA = nullptr; R.SymB = nullptr; - R.RefKind = 0; + R.Specifier = 0; return R; } diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 2c9cd0e5b626e..962b8e006dc0b 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -159,13 +159,6 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, Ctx.reportError(Fixup.getLoc(), "expected relocatable expression"); return true; } - if (const MCSymbolRefExpr *RefB = Target.getSymB()) { - if (RefB->getKind() != MCSymbolRefExpr::VK_None) { - Ctx.reportError(Fixup.getLoc(), - "unsupported subtraction of qualified symbol"); - return true; - } - } unsigned FixupFlags = getBackend().getFixupKindInfo(Fixup.getKind()).Flags; if (FixupFlags & MCFixupKindInfo::FKF_IsTarget) diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 773df74291064..c5500ef9cf34d 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -295,19 +295,18 @@ bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, } /// Helper method for \see EvaluateSymbolAdd(). -static void AttemptToFoldSymbolOffsetDifference( - const MCAssembler *Asm, const SectionAddrMap *Addrs, bool InSet, - const MCSymbolRefExpr *&A, const MCSymbolRefExpr *&B, int64_t &Addend) { +static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, + const SectionAddrMap *Addrs, + bool InSet, const MCSymbol *&A, + const MCSymbol *&B, + int64_t &Addend) { if (!A || !B) return; - const MCSymbol &SA = A->getSymbol(); - const MCSymbol &SB = B->getSymbol(); - + const MCSymbol &SA = *A, &SB = *B; if (SA.isUndefined() || SB.isUndefined()) return; - - if (!Asm->getWriter().isSymbolRefDifferenceFullyResolved(*Asm, A, B, InSet)) + if (!Asm->getWriter().isSymbolRefDifferenceFullyResolved(*Asm, SA, SB, InSet)) return; auto FinalizeFolding = [&]() { @@ -345,8 +344,7 @@ static void AttemptToFoldSymbolOffsetDifference( } // Eagerly evaluate when layout is finalized. - Addend += Asm->getSymbolOffset(A->getSymbol()) - - Asm->getSymbolOffset(B->getSymbol()); + Addend += Asm->getSymbolOffset(SA) - Asm->getSymbolOffset(SB); if (Addrs && (&SecA != &SecB)) Addend += (Addrs->lookup(&SecA) - Addrs->lookup(&SecB)); @@ -420,65 +418,52 @@ static void AttemptToFoldSymbolOffsetDifference( } } -/// Evaluate the result of an add between (conceptually) two MCValues. -/// -/// This routine conceptually attempts to construct an MCValue: -/// Result = (Result_A - Result_B + Result_Cst) -/// from two MCValue's LHS and RHS where -/// Result = LHS + RHS -/// and -/// Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst). -/// -/// This routine attempts to aggressively fold the operands such that the result -/// is representable in an MCValue, but may not always succeed. -/// -/// \returns True on success, false if the result is not representable in an -/// MCValue. - -/// NOTE: It is really important to have both the Asm and Layout arguments. -/// They might look redundant, but this function can be used before layout -/// is done (see the object streamer for example) and having the Asm argument -/// lets us avoid relaxations early. +// Evaluate the sum of two relocatable expressions. +// +// Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst). 
+// +// This routine attempts to aggressively fold the operands such that the result +// is representable in an MCValue, but may not always succeed. +// +// LHS_A and RHS_A might have relocation specifiers while LHS_B and RHS_B +// cannot have specifiers. +// +// \returns True on success, false if the result is not representable in an +// MCValue. + +// NOTE: This function can be used before layout is done (see the object +// streamer for example) and having the Asm argument lets us avoid relaxations +// early. static bool evaluateSymbolicAdd(const MCAssembler *Asm, const SectionAddrMap *Addrs, bool InSet, - const MCValue &LHS, const MCValue &RHS, + const MCValue &LHS, + const MCSymbolRefExpr *RhsAdd, + const MCSymbolRefExpr *RhsSub, int64_t RHS_Cst, MCValue &Res) { - // FIXME: This routine (and other evaluation parts) are *incredibly* sloppy - // about dealing with modifiers. This will ultimately bite us, one day. - const MCSymbolRefExpr *LHS_A = LHS.getSymA(); - const MCSymbolRefExpr *LHS_B = LHS.getSymB(); + const MCSymbol *LHS_A = LHS.getAddSym(); + const MCSymbol *LHS_B = LHS.getSubSym(); int64_t LHS_Cst = LHS.getConstant(); - const MCSymbolRefExpr *RHS_A = RHS.getSymA(); - const MCSymbolRefExpr *RHS_B = RHS.getSymB(); - int64_t RHS_Cst = RHS.getConstant(); - - if (LHS.getRefKind() != RHS.getRefKind()) - return false; + const MCSymbol *RHS_A = RhsAdd ? &RhsAdd->getSymbol() : nullptr; + const MCSymbol *RHS_B = RhsSub ? &RhsSub->getSymbol() : nullptr; // Fold the result constant immediately. int64_t Result_Cst = LHS_Cst + RHS_Cst; // If we have a layout, we can fold resolved differences. if (Asm) { - // First, fold out any differences which are fully resolved. By - // reassociating terms in + // While LHS_A-LHS_B and RHS_A-RHS_B from recursive calls have already been + // folded, reassociating terms in // Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst). - // we have the four possible differences: - // (LHS_A - LHS_B), - // (LHS_A - RHS_B), - // (RHS_A - LHS_B), - // (RHS_A - RHS_B). - // Since we are attempting to be as aggressive as possible about folding, we - // attempt to evaluate each possible alternative. - AttemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, LHS_A, LHS_B, - Result_Cst); - AttemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, LHS_A, RHS_B, - Result_Cst); - AttemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, RHS_A, LHS_B, - Result_Cst); - AttemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, RHS_A, RHS_B, - Result_Cst); + // might bring more opportunities. + if (LHS_A && RHS_B && !LHS.getSymA()->getSpecifier()) { + attemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, LHS_A, RHS_B, + Result_Cst); + } + if (RHS_A && LHS_B && !RhsAdd->getSpecifier()) { + attemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, RHS_A, LHS_B, + Result_Cst); + } } // We can't represent the addition or subtraction of two symbols. @@ -487,9 +472,10 @@ static bool evaluateSymbolicAdd(const MCAssembler *Asm, // At this point, we have at most one additive symbol and one subtractive // symbol -- find them. - const MCSymbolRefExpr *A = LHS_A ? LHS_A : RHS_A; - const MCSymbolRefExpr *B = LHS_B ? LHS_B : RHS_B; - + auto *A = LHS_A ? LHS.getSymA() : RHS_A ? RhsAdd : nullptr; + auto *B = LHS_B ? LHS.getSymB() : RHS_B ? 
RhsSub : nullptr; + if (B && B->getKind() != MCSymbolRefExpr::VK_None) + return false; Res = MCValue::get(A, B, Result_Cst); return true; } @@ -641,31 +627,40 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, // We only support a few operations on non-constant expressions, handle // those first. + auto Op = ABE->getOpcode(); + int64_t LHS = LHSValue.getConstant(), RHS = RHSValue.getConstant(); if (!LHSValue.isAbsolute() || !RHSValue.isAbsolute()) { - switch (ABE->getOpcode()) { + switch (Op) { default: return false; - case MCBinaryExpr::Sub: - // Negate RHS and add. - // The cast avoids undefined behavior if the constant is INT64_MIN. - return evaluateSymbolicAdd( - Asm, Addrs, InSet, LHSValue, - MCValue::get(RHSValue.getSymB(), RHSValue.getSymA(), - -(uint64_t)RHSValue.getConstant(), - RHSValue.getRefKind()), - Res); - case MCBinaryExpr::Add: - return evaluateSymbolicAdd(Asm, Addrs, InSet, LHSValue, RHSValue, Res); + case MCBinaryExpr::Sub: + // TODO: Prevent folding for AArch64 @AUTH operands. + if (LHSValue.getSpecifier() || RHSValue.getSpecifier()) + return false; + if (Op == MCBinaryExpr::Sub) { + std::swap(RHSValue.SymA, RHSValue.SymB); + RHSValue.Cst = -(uint64_t)RHSValue.Cst; + } + if (RHSValue.isAbsolute()) { + LHSValue.Cst += RHSValue.Cst; + Res = LHSValue; + return true; + } + if (LHSValue.isAbsolute()) { + RHSValue.Cst += LHSValue.Cst; + Res = RHSValue; + return true; + } + return evaluateSymbolicAdd(Asm, Addrs, InSet, LHSValue, RHSValue.SymA, + RHSValue.SymB, RHSValue.Cst, Res); } } // FIXME: We need target hooks for the evaluation. It may be limited in // width, and gas defines the result of comparisons differently from // Apple as. - int64_t LHS = LHSValue.getConstant(), RHS = RHSValue.getConstant(); int64_t Result = 0; - auto Op = ABE->getOpcode(); switch (Op) { case MCBinaryExpr::AShr: Result = LHS >> RHS; break; case MCBinaryExpr::Add: Result = LHS + RHS; break; diff --git a/llvm/lib/MC/MCObjectWriter.cpp b/llvm/lib/MC/MCObjectWriter.cpp index 7183acc3865ef..6d5010dcae0a0 100644 --- a/llvm/lib/MC/MCObjectWriter.cpp +++ b/llvm/lib/MC/MCObjectWriter.cpp @@ -27,16 +27,10 @@ void MCObjectWriter::reset() { CGProfile.clear(); } -bool MCObjectWriter::isSymbolRefDifferenceFullyResolved( - const MCAssembler &Asm, const MCSymbolRefExpr *A, const MCSymbolRefExpr *B, - bool InSet) const { - // Modified symbol references cannot be resolved. 
- if (A->getKind() != MCSymbolRefExpr::VK_None || - B->getKind() != MCSymbolRefExpr::VK_None) - return false; - - const MCSymbol &SA = A->getSymbol(); - const MCSymbol &SB = B->getSymbol(); +bool MCObjectWriter::isSymbolRefDifferenceFullyResolved(const MCAssembler &Asm, + const MCSymbol &SA, + const MCSymbol &SB, + bool InSet) const { assert(!SA.isUndefined() && !SB.isUndefined()); return isSymbolRefDifferenceFullyResolvedImpl(Asm, SA, *SB.getFragment(), InSet, /*IsPCRel=*/false); diff --git a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s index bed85bcc5798b..9fe78a4e4e822 100644 --- a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s +++ b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s @@ -41,8 +41,6 @@ // RELOC-NEXT: 70 00000000 10000000 // ^^^^ discriminator // ^^ 0 no addr diversity 0 reserved 00 ia key 0000 reserved -// RELOC-NEXT: 80 04000000 00000000 -// Folded to constant 4 bytes difference between _g9 and _g8 .section .helper .local "_g 6" @@ -91,10 +89,6 @@ _g9: .quad ("_g 7" + 7)@AUTH(ia,16) .quad 0 -// ASM: .xword _g9@AUTH(ia,42)-_g8@AUTH(ia,42) -.quad _g9@AUTH(ia,42) - _g8@AUTH(ia,42) -.quad 0 - .ifdef ASMONLY // ASM: .xword _g10@AUTH(ia,42)+1 @@ -190,4 +184,8 @@ _g9: // ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression .quad _g9@AUTH(ia,42) - _g8 +// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression +.quad _g9@AUTH(ia,42) - _g8@AUTH(ia,42) +.quad 0 + .endif // ERROBJ diff --git a/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s b/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s index e32db7c125bb4..3e51e487e1288 100644 --- a/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s +++ b/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s @@ -14,13 +14,9 @@ Lend: // CHECK-NEXT: ^ add w0, w1, #(Lend - var@TLVPPAGEOFF) + // CHECK: [[#@LINE-1]]:3: error: expected relocatable expression cmp w0, #(Lend - var@TLVPPAGEOFF) - // CHECK: error: unsupported subtraction of qualified symbol - // CHECK-NEXT: add w0, w1, #(Lend - var@TLVPPAGEOFF) - // CHECK-NEXT: ^ - // CHECK: error: unsupported subtraction of qualified symbol - // CHECK-NEXT: cmp w0, #(Lend - var@TLVPPAGEOFF) - // CHECK-NEXT: ^ + // CHECK: [[#@LINE-1]]:3: error: expected relocatable expression add w0, w1, #(Lstart - Lend) cmp w0, #(Lstart - Lend) diff --git a/llvm/test/MC/ELF/bad-expr.s b/llvm/test/MC/ELF/bad-expr.s index 59809700d4a11..2346f5493b98a 100644 --- a/llvm/test/MC/ELF/bad-expr.s +++ b/llvm/test/MC/ELF/bad-expr.s @@ -7,6 +7,6 @@ x: // CHECK: [[@LINE+1]]:{{[0-9]+}}: error: symbol '__executable_start' can not be undefined in a subtraction expression .quad x-__executable_start -// CHECK: [[@LINE+1]]:{{[0-9]+}}: error: unsupported subtraction of qualified symbol - .long bar - foo@got +// CHECK: [[@LINE+1]]:7: error: expected relocatable expression +.long bar - foo@got foo: diff --git a/llvm/test/MC/X86/macho-reloc-errors-x86_64.s b/llvm/test/MC/X86/macho-reloc-errors-x86_64.s index 4498d03338f39..daf84a7dbdc4f 100644 --- a/llvm/test/MC/X86/macho-reloc-errors-x86_64.s +++ b/llvm/test/MC/X86/macho-reloc-errors-x86_64.s @@ -10,7 +10,7 @@ mov %rax, thing@TLVP // CHECK-ERROR: 3:9: error: 32-bit absolute addressing is not supported in 64-bit mode -// CHECK-ERROR: 4:9: error: unsupported subtraction of qualified symbol +// CHECK-ERROR: 4:9: error: expected relocatable expression // CHECK-ERROR: 5:9: error: unsupported pc-relative relocation of difference // CHECK-ERROR: 6:9: error: unsupported relocation with identical base // CHECK-ERROR: 7:9: error: unsupported 
relocation with subtraction expression, symbol 'thing' can not be undefined in a subtraction expression From cadfaa83ff1f7d9f983b23a756f25884a3d6314c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 11:45:52 -0700 Subject: [PATCH 0744/1029] [AVR,CSKY] Migrate away from MCValue::getSymB The MCValue::SymB MCSymbolRefExpr member might be replaced with a MCSymbol in the future. Reduce direct access. --- llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 3 ++- llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index e187a825a5268..b6d71f27c1272 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -85,12 +85,13 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result, MCSymbolRefExpr::VariantKind Modifier = Sym->getKind(); if (Modifier != MCSymbolRefExpr::VK_None) return false; + assert(!Value.getSubSym()); if (specifier == VK_PM) { Modifier = MCSymbolRefExpr::VariantKind(AVRMCExpr::VK_PM); } Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context); - Result = MCValue::get(Sym, Value.getSymB(), Value.getConstant()); + Result = MCValue::get(Sym, nullptr, Value.getConstant()); } return true; diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp index 338ac63d88241..696cd1daa035c 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCExpr.cpp @@ -72,5 +72,5 @@ bool CSKYMCExpr::evaluateAsRelocatableImpl(MCValue &Res, if (!getSubExpr()->evaluateAsRelocatable(Res, Asm)) return false; Res.setSpecifier(specifier); - return !Res.getSymB(); + return !Res.getSubSym(); } From b4f7a2ab578957657336a598220fce6fc00f56b5 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Sat, 5 Apr 2025 14:53:18 -0400 Subject: [PATCH 0745/1029] [libc++] Bump OS version for macOS backdeployment CI jobs (#131883) In 0547e573c555, I introduced backdeployment testing on macOS using Github-provided builders. This was done by basically building libc++ on a slightly older macOS (like macOS 13) and then running against the system library on that machine. However, that created a dependency that libc++ must keep working on macOS 13, which doesn't support the latest-released Xcode. This patch solves that problem by moving the deployment testing to a newer version of macOS which supports the latest-released version of Xcode. Sadly, that also reduces the backdeployment coverage we have since we're not actually testing on older OSes, but is necessary to satisfy the documented libc++ support policy. In the future, we could improve the situation by providing a Lit configuration that allows compiling (but not running) all the tests, building the tests on a supported macOS, and then shipping those tests on an older backdeployment target in order to run them against the system library. Since that requires significant engineering, this isn't done at this time. 
--- .github/workflows/libcxx-build-and-test.yaml | 14 ++++++++++++-- .../string.capacity/allocation_size.pass.cpp | 5 ----- .../fstreams/filebuf.virtuals/setbuf.pass.cpp | 5 +++-- .../facet.num.get.members/get_double.pass.cpp | 4 ++++ .../facet.num.get.members/get_float.pass.cpp | 4 ++++ .../facet.num.get.members/get_long_double.pass.cpp | 4 ++++ .../basic.string/string.capacity/max_size.pass.cpp | 6 ------ .../string.capacity/over_max_size.pass.cpp | 6 ++++++ 8 files changed, 33 insertions(+), 15 deletions(-) diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 9b60bc81285f8..326c6f288750a 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -197,10 +197,20 @@ jobs: os: macos-15 - config: apple-configuration os: macos-15 + # TODO: These jobs are intended to test back-deployment (building against ToT libc++ but running against an + # older system-provided libc++.dylib). Doing this properly would require building the test suite on a + # recent macOS using a recent Clang (hence recent Xcode), and then running the actual test suite on an + # older mac. We could do that by e.g. sharing artifacts between the two jobs. + # + # However, our Lit configuration currently doesn't provide a good way to do that in a batch, so our only + # alternative is to actually build on the same host that we're going to run on. Sadly, that doesn't work + # since older macOSes don't support newer Xcodes. For now, we run the "backdeployment" jobs on recent + # macOS versions as a way to avoid rotting that configuration, but it doesn't provide a lot of additional + # coverage. - config: apple-system - os: macos-13 + os: macos-15 - config: apple-system-hardened - os: macos-13 + os: macos-15 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp index 6f127e1b62b02..77da29225957b 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp @@ -8,11 +8,6 @@ // -// This test demonstrates the smaller allocation sizes when the alignment -// requirements of std::string are dropped from 16 to 8. -// -// XFAIL: using-built-library-before-llvm-19 - #include #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp index 10435dc482367..3e09ab4d0f22a 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/setbuf.pass.cpp @@ -10,8 +10,9 @@ // basic_streambuf* setbuf(char_type* s, streamsize n) override; -// In C++23 and later, this test requires support for P2467R1 in the dylib (a3f17ba3febbd546f2342ffc780ac93b694fdc8d) -// XFAIL: (!c++03 && !c++11 && !c++14 && !c++17 && !c++20) && using-built-library-before-llvm-18 +// This test requires the fix to https://github.com/llvm/llvm-project/issues/60509 in the dylib, +// which landed in 5afb937d8a30445642ccaf33866ee4cdd0713222. 
+// XFAIL: using-built-library-before-llvm-19 #include #include diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp index 1708e94b682c4..a388c0b15a840 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_double.pass.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of +// FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. +// XFAIL: using-built-library-before-llvm-19 + // // class num_get diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp index 8268b5419eb3e..596d81cbc8c91 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_float.pass.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of +// FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. +// XFAIL: using-built-library-before-llvm-19 + // // class num_get diff --git a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp index f3569ed6e5d89..8a9fd41501626 100644 --- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp +++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long_double.pass.cpp @@ -6,6 +6,10 @@ // //===----------------------------------------------------------------------===// +// The fix for LWG2381 (https://github.com/llvm/llvm-project/pull/77948) changed behavior of +// FP parsing. This requires 3e15c97fa3812993bdc319827a5c6d867b765ae8 in the dylib. +// XFAIL: using-built-library-before-llvm-19 + // // class num_get diff --git a/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp index f68f9e9d5fe29..ac660d8fe9941 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/max_size.pass.cpp @@ -10,12 +10,6 @@ // XFAIL: FROZEN-CXX03-HEADERS-FIXME -// After changing the alignment of the allocated pointer from 16 to 8, the exception -// thrown is no longer `bad_alloc` but instead length_error on systems using new -// headers but a dylib that doesn't contain 04ce0ba. 
-// -// XFAIL: using-built-library-before-llvm-19 - // // size_type max_size() const; // constexpr since C++20 diff --git a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp index b919551c9f880..5eb3240699a81 100644 --- a/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.capacity/over_max_size.pass.cpp @@ -8,6 +8,12 @@ // UNSUPPORTED: no-exceptions +// After changing the alignment of the allocated pointer from 16 to 8, the exception +// thrown is no longer `bad_alloc` but instead length_error on systems using new +// headers but a dylib that doesn't contain 04ce0ba. +// +// XFAIL: using-built-library-before-llvm-19 + // // size_type max_size() const; // constexpr since C++20 From 5e8f43811acfd72ac5da4df2a5436b27ad1eeab4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 12:03:59 -0700 Subject: [PATCH 0746/1029] MCValue: Make getSymB private and improve documentation --- llvm/include/llvm/MC/MCExpr.h | 6 ++++++ llvm/include/llvm/MC/MCValue.h | 31 ++++++++++++++++--------------- llvm/lib/MC/MCExpr.cpp | 12 ++++++------ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 3127a93d32581..b0e347d690f0e 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -26,6 +26,7 @@ class MCSymbol; class MCValue; class raw_ostream; class StringRef; +class MCSymbolRefExpr; using SectionAddrMap = DenseMap; @@ -130,6 +131,11 @@ class MCExpr { MCFragment *findAssociatedFragment() const; /// @} + + static bool evaluateSymbolicAdd(const MCAssembler *, const SectionAddrMap *, + bool, const MCValue &, + const MCSymbolRefExpr *, + const MCSymbolRefExpr *, int64_t, MCValue &); }; inline raw_ostream &operator<<(raw_ostream &OS, const MCExpr &E) { diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h index d291c4cb5aff0..6e37fb56ab27f 100644 --- a/llvm/include/llvm/MC/MCValue.h +++ b/llvm/include/llvm/MC/MCValue.h @@ -19,31 +19,32 @@ namespace llvm { class raw_ostream; -/// This represents an "assembler immediate". -/// -/// In its most general form, this can hold ":Kind:(SymbolA - SymbolB + -/// imm64)". Not all targets supports relocations of this general form, but we -/// need to represent this anyway. -/// -/// In general both SymbolA and SymbolB will also have a modifier -/// analogous to the top-level Kind. Current targets are not expected -/// to make use of both though. The choice comes down to whether -/// relocation modifiers apply to the closest symbol or the whole -/// expression. -/// -/// Note that this class must remain a simple POD value class, because we need -/// it to live in unions etc. +// Represents a relocatable expression in its most general form: +// relocation_specifier(SymA - SymB + imm64). +// +// Not all targets support SymB. For PC-relative relocations, a specifier is +// typically used instead of setting SymB to DOT. +// +// Some targets encode the relocation specifier within SymA using +// MCSymbolRefExpr::SubclassData and access it via getAccessVariant(), though +// this method is now deprecated. +// +// This class must remain a simple POD value class, as it needs to reside in +// unions and similar structures. class MCValue { const MCSymbolRefExpr *SymA = nullptr, *SymB = nullptr; int64_t Cst = 0; uint32_t Specifier = 0; + // SymB cannot have a specifier. 
Use getSubSym instead. + const MCSymbolRefExpr *getSymB() const { return SymB; } + public: + friend class MCAssembler; friend class MCExpr; MCValue() = default; int64_t getConstant() const { return Cst; } const MCSymbolRefExpr *getSymA() const { return SymA; } - const MCSymbolRefExpr *getSymB() const { return SymB; } uint32_t getRefKind() const { return Specifier; } uint32_t getSpecifier() const { return Specifier; } void setSpecifier(uint32_t S) { Specifier = S; } diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index c5500ef9cf34d..11a5a739b4a06 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -434,12 +434,12 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // NOTE: This function can be used before layout is done (see the object // streamer for example) and having the Asm argument lets us avoid relaxations // early. -static bool evaluateSymbolicAdd(const MCAssembler *Asm, - const SectionAddrMap *Addrs, bool InSet, - const MCValue &LHS, - const MCSymbolRefExpr *RhsAdd, - const MCSymbolRefExpr *RhsSub, int64_t RHS_Cst, - MCValue &Res) { +bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, + const SectionAddrMap *Addrs, bool InSet, + const MCValue &LHS, + const MCSymbolRefExpr *RhsAdd, + const MCSymbolRefExpr *RhsSub, int64_t RHS_Cst, + MCValue &Res) { const MCSymbol *LHS_A = LHS.getAddSym(); const MCSymbol *LHS_B = LHS.getSubSym(); int64_t LHS_Cst = LHS.getConstant(); From ed2b82fb8b066f8c269be9e403ee20d86b5ead8a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 12:12:11 -0700 Subject: [PATCH 0747/1029] MCValue: Change getAccessVariant to return uint16_t Some targets encode the relocation specifier within SymA using MCSymbolRefExpr::SubclassData. They will cast the specifier to *MCExpr::Specifier. Migrate away from the confusing MCSymbolRefExpr::VariantKind. Note: getAccessVariant is a deprecated method to get the relocation specifier. --- llvm/include/llvm/MC/MCValue.h | 4 +++- llvm/lib/MC/MCValue.cpp | 6 +++--- .../MCTargetDesc/WebAssemblyWasmObjectWriter.cpp | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h index 6e37fb56ab27f..cb89e245f2887 100644 --- a/llvm/include/llvm/MC/MCValue.h +++ b/llvm/include/llvm/MC/MCValue.h @@ -65,7 +65,9 @@ class MCValue { /// Print the value to stderr. void dump() const; - MCSymbolRefExpr::VariantKind getAccessVariant() const; + // Get the relocation specifier from SymA. This is a workaround for targets + // that do not use MCValue::Specifier. 
+ uint16_t getAccessVariant() const; static MCValue get(const MCSymbolRefExpr *SymA, const MCSymbolRefExpr *SymB = nullptr, diff --git a/llvm/lib/MC/MCValue.cpp b/llvm/lib/MC/MCValue.cpp index 8b2edc9ac57ec..b7ac3f247ecf9 100644 --- a/llvm/lib/MC/MCValue.cpp +++ b/llvm/lib/MC/MCValue.cpp @@ -44,10 +44,10 @@ LLVM_DUMP_METHOD void MCValue::dump() const { } #endif -MCSymbolRefExpr::VariantKind MCValue::getAccessVariant() const { +uint16_t MCValue::getAccessVariant() const { const MCSymbolRefExpr *A = getSymA(); if (!A) - return MCSymbolRefExpr::VK_None; + return 0; - return A->getKind(); + return A->getSpecifier(); } diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 7a5ebe342fccf..941422891832d 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -69,7 +69,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType( assert(RefA); auto& SymA = cast(RefA->getSymbol()); - MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant(); + auto Modifier = Target.getAccessVariant(); switch (Modifier) { case MCSymbolRefExpr::VK_GOT: From 582b1b2ac9de696debe6041aa500141c2fef5aa3 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sat, 5 Apr 2025 20:04:04 +0000 Subject: [PATCH 0748/1029] [CI] Use env variable to enable pip breaking system packages This patch uses an env variable instead of the --break-system-packages flag. This enables the heterogenous configuration between the old and new premerge systems as the old premerge container does not recognize the --break-system-packages flag. An env variable will work on new premerge and have no impact on old premerge. --- .ci/monolithic-linux.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index ec7a85bc5f15f..13c7a93c364db 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -53,9 +53,10 @@ targets="${2}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" echo "--- cmake" -pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt -pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt -pip install --break-system-packages -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt +export PIP_BREAK_SYSTEM_PACKAGES=1 +pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt +pip install -q -r "${MONOREPO_ROOT}"/lldb/test/requirements.txt +pip install -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_ENABLE_PROJECTS="${projects}" \ -G Ninja \ From 52eb11f925ddeba4e1b3840fd636ee87387f3ada Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:08:19 -0700 Subject: [PATCH 0749/1029] [MC] Replace getSpecifier(Target.getSymA()) with Target.getSymSpecifier() Add MCValue::getSymSpecifier as a workaround for targets that encode the relocation specifier on SymA. This function asserts that SymA is not null. 
--- llvm/include/llvm/MC/MCValue.h | 2 ++ .../AArch64/AsmParser/AArch64AsmParser.cpp | 2 +- .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 6 +++--- .../MCTargetDesc/AArch64MachObjectWriter.cpp | 4 ++-- .../AArch64WinCOFFObjectWriter.cpp | 2 +- .../MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 7 ++----- .../MCTargetDesc/PPCXCOFFObjectWriter.cpp | 2 +- .../MCTargetDesc/RISCVELFObjectWriter.cpp | 2 +- .../MCTargetDesc/SystemZELFObjectWriter.cpp | 2 +- .../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 3 +-- .../X86/MCTargetDesc/X86ELFObjectWriter.cpp | 2 +- .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 19 +++++++++---------- .../MCTargetDesc/X86WinCOFFObjectWriter.cpp | 11 +++++------ 13 files changed, 30 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h index cb89e245f2887..9ef77703356a4 100644 --- a/llvm/include/llvm/MC/MCValue.h +++ b/llvm/include/llvm/MC/MCValue.h @@ -67,6 +67,8 @@ class MCValue { // Get the relocation specifier from SymA. This is a workaround for targets // that do not use MCValue::Specifier. + uint16_t getSymSpecifier() const { return SymA->getSpecifier(); } + // Get the relocation specifier from SymA, or 0 when SymA is null. uint16_t getAccessVariant() const; static MCValue get(const MCSymbolRefExpr *SymA, diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 38710e9344687..8b8c5a22b829c 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -8224,7 +8224,7 @@ bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, return false; if (Res.getSymA()) - DarwinSpec = AArch64MCExpr::Specifier(Res.getSymA()->getKind()); + DarwinSpec = AArch64MCExpr::Specifier(Res.getSymSpecifier()); Addend = Res.getConstant(); // It's some symbol reference + a constant addend, but really diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index fa72cbf032cdf..4763fdf8deb05 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -117,9 +117,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, bool IsNC = AArch64MCExpr::isNotChecked(RefKind); assert((!Target.getSymA() || - getSpecifier(Target.getSymA()) == AArch64MCExpr::None || - getSpecifier(Target.getSymA()) == AArch64MCExpr::VK_PLT || - getSpecifier(Target.getSymA()) == AArch64MCExpr::VK_GOTPCREL) && + Target.getSymSpecifier() == AArch64MCExpr::None || + Target.getSymSpecifier() == AArch64MCExpr::VK_PLT || + Target.getSymSpecifier() == AArch64MCExpr::VK_GOTPCREL) && "Should only be expression-level modifiers here"); switch (SymLoc) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index c3a6174131806..cca337a1a66dd 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -221,7 +221,7 @@ void AArch64MachObjectWriter::recordRelocation( // Check for "_foo@got - .", which comes through here as: // Ltmp0: // ... 
_foo@got - Ltmp0 - if (getSpecifier(Target.getSymA()) == AArch64MCExpr::M_GOT && + if (Target.getSymSpecifier() == AArch64MCExpr::M_GOT && Asm.getSymbolOffset(*B) == Asm.getFragmentOffset(*Fragment) + Fixup.getOffset()) { // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. @@ -232,7 +232,7 @@ void AArch64MachObjectWriter::recordRelocation( MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; - } else if (getSpecifier(Target.getSymA()) != AArch64MCExpr::None) { + } else if (Target.getSymSpecifier() != AArch64MCExpr::None) { // Otherwise, neither symbol can be modified. Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index d679f5f621e0a..f580b37252e80 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -62,7 +62,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType( } auto Modifier = - Target.isAbsolute() ? AArch64MCExpr::None : Target.getSymA()->getKind(); + Target.isAbsolute() ? AArch64MCExpr::None : Target.getSymSpecifier(); const MCExpr *Expr = Fixup.getValue(); if (const AArch64MCExpr *A64E = dyn_cast(Expr)) { diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index 0a605df41891e..da8762c855fb2 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -44,10 +44,7 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { - MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() - ? MCSymbolRefExpr::VK_None - : Target.getSymA()->getKind(); - + auto Spec = Target.getAddSym() ? Target.getSymSpecifier() : 0; unsigned FixupKind = Fixup.getKind(); if (IsCrossSection) { if (FixupKind != FK_Data_4) { @@ -64,7 +61,7 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx, return COFF::IMAGE_REL_ARM_ABSOLUTE; } case FK_Data_4: - switch (Modifier) { + switch (Spec) { case MCSymbolRefExpr::VK_COFF_IMGREL32: return COFF::IMAGE_REL_ARM_ADDR32NB; case MCSymbolRefExpr::VK_SECREL: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index 97be62ff5da06..f36f25559365c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -41,7 +41,7 @@ llvm::createPPCXCOFFObjectWriter(bool Is64Bit) { std::pair PPCXCOFFObjectWriter::getRelocTypeAndSignSize( const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { const auto Specifier = - Target.isAbsolute() ? PPCMCExpr::VK_None : getSpecifier(Target.getSymA()); + Target.isAbsolute() ? PPCMCExpr::VK_None : Target.getSymSpecifier(); // People from AIX OS team says AIX link editor does not care about // the sign bit in the relocation entry "most" of the time. 
// The system assembler seems to set the sign bit on relocation entry diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 2e73ba54ae21b..77a55eceaf68e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -51,7 +51,7 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, const MCFixup &Fixup, bool IsPCRel) const { assert((!Target.getSymA() || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && + Target.getSymSpecifier() == MCSymbolRefExpr::VK_None) && "sym@specifier should have been rejected"); const MCExpr *Expr = Fixup.getValue(); // Determine the type of the relocation diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index cfec53bd5dec9..e5c51b0bdba8f 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -220,7 +220,7 @@ unsigned SystemZELFObjectWriter::getRelocType(MCContext &Ctx, bool SystemZELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, const MCSymbol &Sym, unsigned Type) const { - switch (getSpecifier(V.getSymA())) { + switch (V.getSymSpecifier()) { case SystemZMCExpr::VK_GOT: case SystemZMCExpr::VK_PLT: return true; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index d698c917d4382..65997840cbe8d 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -744,8 +744,7 @@ bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm, if (Fixup.getKind() == FK_Data_1) { MCValue Target; if (Fixup.getValue()->evaluateAsRelocatable(Target, &Asm) && - Target.getSymA() && - getSpecifier(Target.getSymA()) == X86MCExpr::VK_ABS8) + Target.getSymA() && Target.getSymSpecifier() == X86MCExpr::VK_ABS8) return false; } return true; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 94dc110726f0d..205392a2c96be 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -391,7 +391,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, bool X86ELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, const MCSymbol &Sym, unsigned Type) const { - switch (getSpecifier(V.getSymA())) { + switch (V.getSymSpecifier()) { case X86MCExpr::VK_GOT: case X86MCExpr::VK_PLT: case X86MCExpr::VK_GOTPCREL: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 7e195e78ce087..c76ef4e936409 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -151,7 +151,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( const MCSymbol *B_Base = Writer->getAtom(*B); // Neither symbol can be modified. 
- if (getSpecifier(Target.getSymA()) != X86MCExpr::VK_None) { + if (Target.getSymSpecifier()) { Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); return; @@ -266,7 +266,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( return; } - auto Specifier = getSpecifier(Target.getSymA()); + auto Specifier = Target.getSymSpecifier(); if (IsPCRel) { if (IsRIPRel) { if (Specifier == X86MCExpr::VK_GOTPCREL) { @@ -279,7 +279,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_GOT; } else if (Specifier == X86MCExpr::VK_TLVP) { Type = MachO::X86_64_RELOC_TLV; - } else if (Specifier != X86MCExpr::VK_None) { + } else if (Specifier) { Asm.getContext().reportError( Fixup.getLoc(), "unsupported symbol modifier in relocation"); return; @@ -307,7 +307,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( } } } else { - if (Specifier != X86MCExpr::VK_None) { + if (Specifier) { Asm.getContext().reportError( Fixup.getLoc(), "unsupported symbol modifier in branch relocation"); @@ -330,7 +330,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( Asm.getContext().reportError( Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); return; - } else if (Specifier != X86MCExpr::VK_None) { + } else if (Specifier) { Asm.getContext().reportError( Fixup.getLoc(), "unsupported symbol modifier in relocation"); return; @@ -460,8 +460,8 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - const MCSymbolRefExpr *SymA = Target.getSymA(); - assert(getSpecifier(SymA) == X86MCExpr::VK_TLVP && !is64Bit() && + const MCSymbol *SymA = Target.getAddSym(); + assert(Target.getSymSpecifier() == X86MCExpr::VK_TLVP && !is64Bit() && "Should only be called with a 32-bit TLVP relocation!"); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); @@ -489,7 +489,7 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, MRE.r_word0 = Value; MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (MachO::GENERIC_RELOC_TLV << 28); - Writer->addRelocation(&SymA->getSymbol(), Fragment->getParent(), MRE); + Writer->addRelocation(SymA, Fragment->getParent(), MRE); } void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, @@ -502,8 +502,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); // If this is a 32-bit TLVP reloc it's handled a bit differently. - if (Target.getSymA() && - getSpecifier(Target.getSymA()) == X86MCExpr::VK_TLVP) { + if (Target.getSymA() && Target.getSymSpecifier() == X86MCExpr::VK_TLVP) { recordTLVPRelocation(Writer, Asm, Fragment, Fixup, Target, FixedValue); return; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 7b8adfb292c00..30076978401be 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -59,8 +59,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, } } - auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None - : Target.getSymA()->getKind(); + auto Spec = Target.getAddSym() ? 
Target.getSymSpecifier() : 0; if (Is64Bit) { switch (FixupKind) { case FK_PCRel_4: @@ -76,9 +75,9 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, case FK_Data_4: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: - if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + if (Spec == MCSymbolRefExpr::VK_COFF_IMGREL32) return COFF::IMAGE_REL_AMD64_ADDR32NB; - if (Modifier == MCSymbolRefExpr::VK_SECREL) + if (Spec == MCSymbolRefExpr::VK_SECREL) return COFF::IMAGE_REL_AMD64_SECREL; return COFF::IMAGE_REL_AMD64_ADDR32; case FK_Data_8: @@ -100,9 +99,9 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, case FK_Data_4: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: - if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32) + if (Spec == MCSymbolRefExpr::VK_COFF_IMGREL32) return COFF::IMAGE_REL_I386_DIR32NB; - if (Modifier == MCSymbolRefExpr::VK_SECREL) + if (Spec == MCSymbolRefExpr::VK_SECREL) return COFF::IMAGE_REL_I386_SECREL; return COFF::IMAGE_REL_I386_DIR32; case FK_SecRel_2: From 2fd6f8fb5e3a52e901276d97c285b8de66742985 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 5 Apr 2025 21:10:25 +0100 Subject: [PATCH 0750/1029] [LV] Don't add blocks to loop in GeneratedRTChecks (NFC). Blocks will get added to parent loops as needed during VPlan execution. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index cc6fd790bc437..585caaffa63da 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2082,9 +2082,6 @@ class GeneratedRTChecks { auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); - // Create new preheader for vector loop. - if (OuterLoop) - OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); SCEVCheckBlock->getTerminator()->eraseFromParent(); SCEVCheckBlock->moveBefore(LoopVectorPreHeader); @@ -2122,9 +2119,6 @@ class GeneratedRTChecks { DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); MemCheckBlock->moveBefore(LoopVectorPreHeader); - if (OuterLoop) - OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); - BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); if (AddBranchWeights) { From 7ccdc3d5ca648c09bbeb86f5063f7b0ee3e9b5e2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:16:24 -0700 Subject: [PATCH 0751/1029] [MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. 
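The mechanical rewrite applied throughout, shown schematically; use() stands
in for the surrounding writer logic:

  // Before: reach the symbol through the MCSymbolRefExpr wrapper.
  if (const MCSymbolRefExpr *RefA = Target.getSymA())
    use(RefA->getSymbol());

  // After: query the additive symbol directly; null when absent.
  if (const MCSymbol *A = Target.getAddSym())
    use(*A);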
--- llvm/lib/MC/ELFObjectWriter.cpp | 11 +++++------ llvm/lib/MC/MCAssembler.cpp | 18 +++++++----------- llvm/lib/MC/MCExpr.cpp | 9 ++++----- llvm/lib/MC/MachObjectWriter.cpp | 6 +++--- llvm/lib/MC/XCOFFObjectWriter.cpp | 2 +- .../MCTargetDesc/AArch64MachObjectWriter.cpp | 13 ++++++------- .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 6 +++--- .../Target/Mips/AsmParser/MipsAsmParser.cpp | 7 +++---- .../Sparc/MCTargetDesc/SparcAsmBackend.cpp | 2 +- .../X86/MCTargetDesc/X86ELFObjectWriter.cpp | 4 ++-- .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 14 +++++--------- 11 files changed, 40 insertions(+), 52 deletions(-) diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index f3445daf73ac1..23d5517920e7b 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -1251,10 +1251,9 @@ bool ELFObjectWriter::shouldRelocateWithSymbol(const MCAssembler &Asm, const MCSymbolELF *Sym, uint64_t C, unsigned Type) const { - const MCSymbolRefExpr *RefA = Val.getSymA(); // A PCRel relocation to an absolute value has no symbol (or section). We // represent that with a relocation to a null section. - if (!RefA) + if (!Val.getAddSym()) return false; // An undefined symbol is not in any section, so the relocation has to point @@ -1379,8 +1378,8 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, if (auto *RefB = Target.getSubSym()) { // When there is no relocation specifier, a linker relaxation target may // emit ADD/SUB relocations for A-B+C. - if (Target.getSymA() && Backend.handleAddSubRelocations( - Asm, *Fragment, Fixup, Target, FixedValue)) + if (Target.getAddSym() && Backend.handleAddSubRelocations( + Asm, *Fragment, Fixup, Target, FixedValue)) return; const auto &SymB = cast(*RefB); @@ -1405,8 +1404,8 @@ void ELFObjectWriter::recordRelocation(MCAssembler &Asm, } // We either rejected the fixup or folded B into C at this point. - const MCSymbolRefExpr *RefA = Target.getSymA(); - const auto *SymA = RefA ? cast(&RefA->getSymbol()) : nullptr; + const auto *RefA = Target.getAddSym(); + const auto *SymA = RefA ? cast(RefA) : nullptr; bool ViaWeakRef = false; if (SymA && SymA->isVariable()) { diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 962b8e006dc0b..39ff41e89f1e2 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -125,15 +125,11 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const { if (V.getSubSym() || V.getRefKind() != MCSymbolRefExpr::VK_None) return false; - const MCSymbolRefExpr *Ref = V.getSymA(); - if (!Ref) + auto *Sym = V.getAddSym(); + if (!Sym || V.getSymSpecifier()) return false; - if (Ref->getKind() != MCSymbolRefExpr::VK_None) - return false; - - const MCSymbol &Sym = Ref->getSymbol(); - if (!isThumbFunc(&Sym)) + if (!isThumbFunc(Sym)) return false; ThumbFuncs.insert(Symbol); // Cache it. @@ -460,14 +456,14 @@ static bool getSymbolOffsetImpl(const MCAssembler &Asm, const MCSymbol &S, uint64_t Offset = Target.getConstant(); - const MCSymbolRefExpr *A = Target.getSymA(); + const MCSymbol *A = Target.getAddSym(); if (A) { uint64_t ValA; // FIXME: On most platforms, `Target`'s component symbols are labels from // having been simplified during evaluation, but on Mach-O they can be // variables due to PR19203. This, and the line below for `B` can be // restored to call `getLabelOffset` when PR19203 is fixed. 
- if (!getSymbolOffsetImpl(Asm, A->getSymbol(), ReportError, ValA)) + if (!getSymbolOffsetImpl(Asm, *A, ReportError, ValA)) return false; Offset += ValA; } @@ -516,11 +512,11 @@ const MCSymbol *MCAssembler::getBaseSymbol(const MCSymbol &Symbol) const { return nullptr; } - const MCSymbolRefExpr *A = Value.getSymA(); + const MCSymbol *A = Value.getAddSym(); if (!A) return nullptr; - const MCSymbol &ASym = A->getSymbol(); + const MCSymbol &ASym = *A; if (ASym.isCommon()) { getContext().reportError(Expr->getLoc(), "Common symbol '" + ASym.getName() + diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 11a5a739b4a06..1a7c4590c1219 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -536,15 +536,14 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, if (Res.getRefKind() != MCSymbolRefExpr::VK_None || !Res.getSymA() || Res.getSubSym() || Res.getConstant()) return false; - Res = - MCValue::get(MCSymbolRefExpr::create(&Res.getSymA()->getSymbol(), - Kind, Asm->getContext()), - Res.getSymB(), Res.getConstant(), Res.getRefKind()); + Res = MCValue::get( + MCSymbolRefExpr::create(Res.getAddSym(), Kind, Asm->getContext()), + Res.getSymB(), Res.getConstant(), Res.getRefKind()); } if (!IsMachO) return true; - const MCSymbolRefExpr *A = Res.getSymA(); + auto *A = Res.getAddSym(); auto *B = Res.getSubSym(); // FIXME: This is small hack. Given // a = b + 4 diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 31758214e5ab6..dc1d8e003b34f 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -109,9 +109,9 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S, S.getName() + "'"); // Verify that any used symbols are defined. - if (Target.getSymA() && Target.getSymA()->getSymbol().isUndefined()) + if (Target.getSymA() && Target.getAddSym()->isUndefined()) report_fatal_error("unable to evaluate offset to undefined symbol '" + - Target.getSymA()->getSymbol().getName() + "'"); + Target.getAddSym()->getName() + "'"); if (Target.getSubSym() && Target.getSubSym()->isUndefined()) report_fatal_error("unable to evaluate offset to undefined symbol '" + Target.getSubSym()->getName() + "'"); @@ -507,7 +507,7 @@ void MachObjectWriter::writeLinkerOptionsLoadCommand( static bool isFixupTargetValid(const MCValue &Target) { // Target is (LHS - RHS + cst). 
// We don't support the form where LHS is null: -RHS + cst - if (!Target.getSymA() && Target.getSubSym()) + if (!Target.getAddSym() && Target.getSubSym()) return false; return true; } diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index cc150649f1139..0834568f6dfe6 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -686,7 +686,7 @@ void XCOFFWriter::recordRelocation(MCAssembler &Asm, const MCFragment *Fragment, return SectionMap[ContainingSect]->Address + Asm.getSymbolOffset(*Sym); }; - const MCSymbol *const SymA = &Target.getSymA()->getSymbol(); + const MCSymbol *const SymA = Target.getAddSym(); MCAsmBackend &Backend = Asm.getBackend(); bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags & diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index cca337a1a66dd..bc967e3e8a6e8 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -176,11 +176,10 @@ void AArch64MachObjectWriter::recordRelocation( // assembler local symbols. If we got here, that's not what we have, // so complain loudly. if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { - Asm.getContext().reportError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); + Asm.getContext().reportError( + Fixup.getLoc(), "conditional branch requires assembler-local" + " label. '" + + Target.getAddSym()->getName() + "' is external."); return; } @@ -214,7 +213,7 @@ void AArch64MachObjectWriter::recordRelocation( // something similar? } } else if (auto *B = Target.getSubSym()) { // A - B + constant - const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbol *A = Target.getAddSym(); const MCSymbol *A_Base = Writer->getAtom(*A); const MCSymbol *B_Base = Writer->getAtom(*B); @@ -293,7 +292,7 @@ void AArch64MachObjectWriter::recordRelocation( RelSymbol = B_Base; Type = MachO::ARM64_RELOC_SUBTRACTOR; } else { // A + constant - const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + const MCSymbol *Symbol = Target.getAddSym(); const MCSectionMachO &Section = static_cast(*Fragment->getParent()); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index e7348326a69cf..353e4fa5b2a9d 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -149,7 +149,7 @@ void ARMMachObjectWriter::recordARMScatteredHalfRelocation( unsigned Type = MachO::ARM_RELOC_HALF; // See . - const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbol *A = Target.getAddSym(); if (!A->getFragment()) { Asm.getContext().reportError(Fixup.getLoc(), @@ -257,7 +257,7 @@ void ARMMachObjectWriter::recordARMScatteredRelocation( unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); // See . - const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbol *A = Target.getAddSym(); if (!A->getFragment()) { Asm.getContext().reportError(Fixup.getLoc(), @@ -388,7 +388,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, // Get the symbol data, if any. 
const MCSymbol *A = nullptr; if (Target.getSymA()) - A = &Target.getSymA()->getSymbol(); + A = Target.getAddSym(); // FIXME: For other platforms, we need to use scattered relocations for // internal relocations with offsets. If this is an internal relocation with diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 8c6fe0b77d234..15ada7fc608f0 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2945,15 +2945,14 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool IsPtr64 = ABI.ArePtrs64bit(); bool IsLocalSym = - Res.getSymA()->getSymbol().isInSection() || - Res.getSymA()->getSymbol().isTemporary() || - (Res.getSymA()->getSymbol().isELF() && + Res.getAddSym()->isInSection() || Res.getAddSym()->isTemporary() || + (Res.getAddSym()->isELF() && cast(Res.getSymA()->getSymbol()).getBinding() == ELF::STB_LOCAL); // For O32, "$"-prefixed symbols are recognized as temporary while // .L-prefixed symbols are not (PrivateGlobalPrefix is "$"). Recognize ".L" // manually. - if (ABI.IsO32() && Res.getSymA()->getSymbol().getName().starts_with(".L")) + if (ABI.IsO32() && Res.getAddSym()->getName().starts_with(".L")) IsLocalSym = true; bool UseXGOT = STI->hasFeature(Mips::FeatureXGOT) && !IsLocalSym; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index 131ef44407407..e689c534b7058 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -277,7 +277,7 @@ namespace { default: return false; case Sparc::fixup_sparc_wplt30: - if (Target.getSymA()->getSymbol().isTemporary()) + if (Target.getAddSym()->isTemporary()) return false; [[fallthrough]]; case Sparc::fixup_sparc_tls_gd_hi22: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 205392a2c96be..ca10f4716ba8c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -353,8 +353,8 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, case X86MCExpr::VK_TLSLDM: case X86MCExpr::VK_TPOFF: case X86MCExpr::VK_DTPOFF: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *S = Target.getAddSym()) + cast(S)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index c76ef4e936409..6871b8adeebbd 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -140,7 +140,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_BRANCH; } } else if (Target.getSubSym()) { // A - B + constant - const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbol *A = Target.getAddSym(); if (A->isTemporary()) A = &Writer->findAliasedSymbol(*A); const MCSymbol *A_Base = Writer->getAtom(*A); @@ -212,7 +212,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( Index = B->getFragment()->getParent()->getOrdinal() + 1; Type = MachO::X86_64_RELOC_SUBTRACTOR; } else { - const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + const MCSymbol *Symbol = Target.getAddSym(); if (Symbol->isTemporary() && Value) { const MCSection &Sec = 
Symbol->getSection(); if (!MCAsmInfoDarwin::isSectionAtomizableBySymbols(Sec)) @@ -370,7 +370,7 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, unsigned Type = MachO::GENERIC_RELOC_VANILLA; // See . - const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbol *A = Target.getAddSym(); if (!A->getFragment()) { Asm.getContext().reportError( @@ -500,9 +500,10 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, uint64_t &FixedValue) { unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); + const MCSymbol *A = Target.getAddSym(); // If this is a 32-bit TLVP reloc it's handled a bit differently. - if (Target.getSymA() && Target.getSymSpecifier() == X86MCExpr::VK_TLVP) { + if (A && Target.getSymSpecifier() == X86MCExpr::VK_TLVP) { recordTLVPRelocation(Writer, Asm, Fragment, Fixup, Target, FixedValue); return; } @@ -516,11 +517,6 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, return; } - // Get the symbol data, if any. - const MCSymbol *A = nullptr; - if (Target.getSymA()) - A = &Target.getSymA()->getSymbol(); - // If this is an internal relocation with an offset, it also needs a scattered // relocation entry. uint32_t Offset = Target.getConstant(); From 086af836889436baffc71c743c7c8259bad8ed60 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:23:13 -0700 Subject: [PATCH 0752/1029] [MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. --- llvm/lib/MC/MCAssembler.cpp | 9 +++------ llvm/lib/MC/MCExpr.cpp | 2 +- llvm/lib/MC/MCObjectStreamer.cpp | 6 ++---- llvm/lib/MC/WasmObjectWriter.cpp | 5 ++--- llvm/lib/MC/WinCOFFObjectWriter.cpp | 4 ++-- 5 files changed, 10 insertions(+), 16 deletions(-) diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 39ff41e89f1e2..685fd1cbdea4d 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -164,14 +164,11 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, bool IsPCRel = FixupFlags & MCFixupKindInfo::FKF_IsPCRel; bool IsResolved = false; if (IsPCRel) { - if (Target.getSubSym()) { - IsResolved = false; - } else if (!Target.getSymA()) { + if (Target.getSubSym() || !Target.getAddSym()) { IsResolved = false; } else { - const MCSymbolRefExpr *A = Target.getSymA(); - const MCSymbol &SA = A->getSymbol(); - if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) { + auto &SA = *Target.getAddSym(); + if (Target.getSymSpecifier() || SA.isUndefined()) { IsResolved = false; } else { IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) || diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 1a7c4590c1219..b921b55950772 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -578,7 +578,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, break; case MCUnaryExpr::Minus: /// -(a - b + const) ==> (b - a - const) - if (Value.getSymA() && !Value.getSubSym()) + if (Value.getAddSym() && !Value.getSubSym()) return false; // The cast avoids undefined behavior if the constant is INT64_MIN. 
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index b6ee894f93c49..86232c1712dc5 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -676,8 +676,7 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, return std::make_pair(false, std::string(".reloc offset is not representable")); - const MCSymbolRefExpr &SRE = cast(*OffsetVal.getSymA()); - const MCSymbol &Symbol = SRE.getSymbol(); + const MCSymbol &Symbol = *OffsetVal.getAddSym(); if (Symbol.isDefined()) { uint32_t SymbolOffset = 0; std::optional> Error = @@ -693,8 +692,7 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, } PendingFixups.emplace_back( - &SRE.getSymbol(), DF, - MCFixup::create(OffsetVal.getConstant(), Expr, Kind, Loc)); + &Symbol, DF, MCFixup::create(OffsetVal.getConstant(), Expr, Kind, Loc)); return std::nullopt; } diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 9d5a290f70cad..b35ca704c519a 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -519,8 +519,7 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, } // We either rejected the fixup or folded B into C at this point. - const MCSymbolRefExpr *RefA = Target.getSymA(); - const auto *SymA = cast(&RefA->getSymbol()); + const auto *SymA = cast(Target.getAddSym()); // The .init_array isn't translated as data, so don't do relocations in it. if (FixupSection.getName().starts_with(".init_array")) { @@ -607,7 +606,7 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, SymA->setUsedInReloc(); } - switch (RefA->getKind()) { + switch (Target.getSymSpecifier()) { case MCSymbolRefExpr::VK_GOT: case MCSymbolRefExpr::VK_WASM_GOT_TLS: SymA->setUsedInGOT(); diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp index ccf8a2d34c1b5..b7d05b864bb39 100644 --- a/llvm/lib/MC/WinCOFFObjectWriter.cpp +++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp @@ -837,9 +837,9 @@ void WinCOFFWriter::recordRelocation(MCAssembler &Asm, const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue) { - assert(Target.getSymA() && "Relocation must reference a symbol!"); + assert(Target.getAddSym() && "Relocation must reference a symbol!"); - const MCSymbol &A = Target.getSymA()->getSymbol(); + const MCSymbol &A = *Target.getAddSym(); if (!A.isRegistered()) { Asm.getContext().reportError(Fixup.getLoc(), Twine("symbol '") + A.getName() + From b1cd3cb3f42881a84ebc3da1dfae59637281d73c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:34:24 -0700 Subject: [PATCH 0753/1029] [MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. 
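For reviewers scanning the mechanical rewrite, the shape of the change is the same everywhere. The sketch below distills it from the MCAssembler::evaluateFixup hunk below; addendOffset is a hypothetical free-standing helper, not code added by this patch:

  #include "llvm/MC/MCAssembler.h"
  #include "llvm/MC/MCValue.h"
  using namespace llvm;

  // Accumulate the addend-symbol offset of a relocatable MCValue.
  static uint64_t addendOffset(const MCAssembler &Asm, const MCValue &Target) {
    uint64_t Value = Target.getConstant();
    // Before: const MCSymbolRefExpr *A = Target.getSymA();
    //         if (A && A->getSymbol().isDefined())
    //           Value += Asm.getSymbolOffset(A->getSymbol());
    // After: the MCSymbol comes straight from the MCValue.
    if (const MCSymbol *Add = Target.getAddSym())
      if (Add->isDefined())
        Value += Asm.getSymbolOffset(*Add);
    return Value;
  }
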
--- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 5 ++--- llvm/lib/MC/MCAssembler.cpp | 11 +++++------ llvm/lib/MC/MCMachOStreamer.cpp | 5 ++--- llvm/lib/MC/MCObjectStreamer.cpp | 10 +++++----- llvm/lib/MC/MCValue.cpp | 8 +++----- llvm/lib/MC/MachObjectWriter.cpp | 6 +++--- .../LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 2 +- llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 3 +-- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 2 +- 9 files changed, 23 insertions(+), 29 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 0deaf94502b11..d245cd75745d0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -3883,12 +3883,11 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME, MCValue MV; if (!(*ME)->evaluateAsRelocatable(MV, nullptr) || MV.isAbsolute()) return; - const MCSymbolRefExpr *SymA = MV.getSymA(); - if (!SymA) + const MCSymbol *GOTEquivSym = MV.getAddSym(); + if (!GOTEquivSym) return; // Check that GOT equivalent symbol is cached. - const MCSymbol *GOTEquivSym = &SymA->getSymbol(); if (!AP.GlobalGOTEquivs.count(GOTEquivSym)) return; diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 685fd1cbdea4d..5e3081251e40a 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -182,10 +182,9 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, Value = Target.getConstant(); - if (const MCSymbolRefExpr *A = Target.getSymA()) { - const MCSymbol &Sym = A->getSymbol(); - if (Sym.isDefined()) - Value += getSymbolOffset(Sym); + if (const MCSymbol *Add = Target.getAddSym()) { + if (Add->isDefined()) + Value += getSymbolOffset(*Add); } if (const MCSymbol *Sub = Target.getSubSym()) if (Sub->isDefined()) @@ -289,9 +288,9 @@ uint64_t MCAssembler::computeFragmentSize(const MCFragment &F) const { uint64_t FragmentOffset = getFragmentOffset(OF); int64_t TargetLocation = Value.getConstant(); - if (const MCSymbolRefExpr *A = Value.getSymA()) { + if (const auto *SA = Value.getAddSym()) { uint64_t Val; - if (!getSymbolOffset(A->getSymbol(), Val)) { + if (!getSymbolOffset(*SA, Val)) { getContext().reportError(OF.getLoc(), "expected absolute expression"); return 0; } diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp index f246c0daafb9a..9b7152f561175 100644 --- a/llvm/lib/MC/MCMachOStreamer.cpp +++ b/llvm/lib/MC/MCMachOStreamer.cpp @@ -182,10 +182,9 @@ void MCMachOStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { MCValue Res; if (Value->evaluateAsRelocatable(Res, nullptr)) { - if (const MCSymbolRefExpr *SymAExpr = Res.getSymA()) { - const MCSymbol &SymA = SymAExpr->getSymbol(); + if (const auto *SymA = Res.getAddSym()) { if (!Res.getSubSym() && - (SymA.getName().empty() || Res.getConstant() != 0)) + (SymA->getName().empty() || Res.getConstant() != 0)) cast(Symbol)->setAltEntry(); } } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 86232c1712dc5..e228418fea987 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -610,25 +610,25 @@ getOffsetAndDataFragment(const MCSymbol &Symbol, uint32_t &RelocOffset, std::string(".reloc symbol offset is not " "representable")); - const MCSymbolRefExpr &SRE = cast(*OffsetVal.getSymA()); - if (!SRE.getSymbol().isDefined()) + const MCSymbol &SA = *OffsetVal.getAddSym(); + if (!SA.isDefined()) return std::make_pair(false, std::string("symbol 
used in the .reloc offset is " "not defined")); - if (SRE.getSymbol().isVariable()) + if (SA.isVariable()) return std::make_pair(false, std::string("symbol used in the .reloc offset is " "variable")); - MCFragment *Fragment = SRE.getSymbol().getFragment(); + MCFragment *Fragment = SA.getFragment(); // FIXME Support symbols with no DF. For example: // .reloc .data, ENUM_VALUE, if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) return std::make_pair(false, std::string("symbol in offset has no data " "fragment")); - RelocOffset = SRE.getSymbol().getOffset() + OffsetVal.getConstant(); + RelocOffset = SA.getOffset() + OffsetVal.getConstant(); DF = cast(Fragment); } else { RelocOffset = Symbol.getOffset(); diff --git a/llvm/lib/MC/MCValue.cpp b/llvm/lib/MC/MCValue.cpp index b7ac3f247ecf9..913fe83ab94eb 100644 --- a/llvm/lib/MC/MCValue.cpp +++ b/llvm/lib/MC/MCValue.cpp @@ -27,7 +27,7 @@ void MCValue::print(raw_ostream &OS) const { if (getRefKind()) OS << ':' << getRefKind() << ':'; - OS << *getSymA(); + SymA->print(OS, nullptr); if (auto *B = getSubSym()) { OS << " - "; @@ -45,9 +45,7 @@ LLVM_DUMP_METHOD void MCValue::dump() const { #endif uint16_t MCValue::getAccessVariant() const { - const MCSymbolRefExpr *A = getSymA(); - if (!A) + if (!SymA) return 0; - - return A->getSpecifier(); + return SymA->getSpecifier(); } diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index dc1d8e003b34f..9e6e3f5e28e54 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -109,7 +109,7 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S, S.getName() + "'"); // Verify that any used symbols are defined. - if (Target.getSymA() && Target.getAddSym()->isUndefined()) + if (Target.getAddSym() && Target.getAddSym()->isUndefined()) report_fatal_error("unable to evaluate offset to undefined symbol '" + Target.getAddSym()->getName() + "'"); if (Target.getSubSym() && Target.getSubSym()->isUndefined()) @@ -117,8 +117,8 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S, Target.getSubSym()->getName() + "'"); uint64_t Address = Target.getConstant(); - if (Target.getSymA()) - Address += getSymbolAddress(Target.getSymA()->getSymbol(), Asm); + if (Target.getAddSym()) + Address += getSymbolAddress(*Target.getAddSym(), Asm); if (Target.getSubSym()) Address += getSymbolAddress(*Target.getSubSym(), Asm); return Address; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index f22b208f8dffc..adeb7455ad616 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -456,7 +456,7 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, "relocatable SymA-SymB cannot have relocation specifier"); std::pair FK; uint64_t FixedValueA, FixedValueB; - const MCSymbol &SA = Target.getSymA()->getSymbol(); + const MCSymbol &SA = *Target.getAddSym(); const MCSymbol &SB = *Target.getSubSym(); bool force = !SA.isInSection() || !SB.isInSection(); diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 15ada7fc608f0..8d9ae1e20006a 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2947,8 +2947,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, bool IsLocalSym = Res.getAddSym()->isInSection() || 
Res.getAddSym()->isTemporary() || (Res.getAddSym()->isELF() && - cast(Res.getSymA()->getSymbol()).getBinding() == - ELF::STB_LOCAL); + cast(Res.getAddSym())->getBinding() == ELF::STB_LOCAL); // For O32, "$"-prefixed symbols are recognized as temporary while // .L-prefixed symbols are not (PrivateGlobalPrefix is "$"). Recognize ".L" // manually. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 65997840cbe8d..af827a42c48eb 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -744,7 +744,7 @@ bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm, if (Fixup.getKind() == FK_Data_1) { MCValue Target; if (Fixup.getValue()->evaluateAsRelocatable(Target, &Asm) && - Target.getSymA() && Target.getSymSpecifier() == X86MCExpr::VK_ABS8) + Target.getAddSym() && Target.getSymSpecifier() == X86MCExpr::VK_ABS8) return false; } return true; From d71ee7d23048ca64d14a7536927a006867cea39a Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 5 Apr 2025 13:35:45 -0700 Subject: [PATCH 0754/1029] [clang-format] Set C11 instead of C17 for LK_C (#134472) Fix #134453 --- clang/lib/Format/Format.cpp | 2 +- clang/lib/Format/FormatToken.cpp | 2 +- clang/lib/Format/TokenAnnotator.cpp | 3 +-- clang/lib/Format/TokenAnnotator.h | 4 +--- clang/lib/Format/UnwrappedLineParser.cpp | 4 +--- clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++++ 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index b74a8631efe0f..226d39f635676 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -4010,7 +4010,7 @@ LangOptions getFormattingLangOpts(const FormatStyle &Style) { switch (Style.Language) { case FormatStyle::LK_C: - LangOpts.C17 = 1; + LangOpts.C11 = 1; break; case FormatStyle::LK_Cpp: case FormatStyle::LK_ObjC: diff --git a/clang/lib/Format/FormatToken.cpp b/clang/lib/Format/FormatToken.cpp index 7752139142430..1d49d787f9cc9 100644 --- a/clang/lib/Format/FormatToken.cpp +++ b/clang/lib/Format/FormatToken.cpp @@ -44,7 +44,7 @@ static SmallVector CppNonKeywordTypes = { bool FormatToken::isTypeName(const LangOptions &LangOpts) const { if (is(TT_TypeName) || Tok.isSimpleTypeSpecifier(LangOpts)) return true; - return (LangOpts.CXXOperatorNames || LangOpts.C17) && is(tok::identifier) && + return (LangOpts.CXXOperatorNames || LangOpts.C11) && is(tok::identifier) && std::binary_search(CppNonKeywordTypes.begin(), CppNonKeywordTypes.end(), TokenText); } diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index dfb59e8d6f420..bd54470dcba37 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -129,7 +129,6 @@ class AnnotatingParser { : Style(Style), Line(Line), CurrentToken(Line.First), AutoFound(false), IsCpp(Style.isCpp()), LangOpts(getFormattingLangOpts(Style)), Keywords(Keywords), Scopes(Scopes), TemplateDeclarationDepth(0) { - assert(IsCpp == (LangOpts.CXXOperatorNames || LangOpts.C17)); Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/false)); resetTokenMetadata(); } @@ -3847,7 +3846,7 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts, }; const auto *Next = Current.Next; - const bool IsCpp = LangOpts.CXXOperatorNames || LangOpts.C17; + const bool IsCpp = LangOpts.CXXOperatorNames || LangOpts.C11; // Find parentheses of parameter list. 
if (Current.is(tok::kw_operator)) { diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h index c0c13941ef4f7..e4b94431e68b4 100644 --- a/clang/lib/Format/TokenAnnotator.h +++ b/clang/lib/Format/TokenAnnotator.h @@ -224,9 +224,7 @@ class TokenAnnotator { public: TokenAnnotator(const FormatStyle &Style, const AdditionalKeywords &Keywords) : Style(Style), IsCpp(Style.isCpp()), - LangOpts(getFormattingLangOpts(Style)), Keywords(Keywords) { - assert(IsCpp == (LangOpts.CXXOperatorNames || LangOpts.C17)); - } + LangOpts(getFormattingLangOpts(Style)), Keywords(Keywords) {} /// Adapts the indent levels of comment lines to the indent of the /// subsequent line. diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 213b706807b2a..9641da1577ded 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -167,9 +167,7 @@ UnwrappedLineParser::UnwrappedLineParser( ? IG_Rejected : IG_Inited), IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn), - Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) { - assert(IsCpp == (LangOpts.CXXOperatorNames || LangOpts.C17)); -} + Macros(Style.Macros, SourceMgr, Style, Allocator, IdentTable) {} void UnwrappedLineParser::reset() { PPBranchLevel = -1; diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 7e0af1c7b4c36..38dc10a08f640 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3935,6 +3935,12 @@ TEST_F(TokenAnnotatorTest, UserDefinedConversionFunction) { EXPECT_TOKEN(Tokens[5], tok::l_paren, TT_FunctionDeclarationLParen); } +TEST_F(TokenAnnotatorTest, UTF8StringLiteral) { + auto Tokens = annotate("return u8\"foo\";", getLLVMStyle(FormatStyle::LK_C)); + ASSERT_EQ(Tokens.size(), 4u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::utf8_string_literal, TT_Unknown); +} + } // namespace } // namespace format } // namespace clang From 0c84d71eda538b5ac73811f241d4a0555ff49099 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:40:04 -0700 Subject: [PATCH 0755/1029] [MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. 
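The pattern being rewritten is identical in every ELF object writer touched below; consolidated, it looks like the sketch here, where FooMCExpr::VK_TPREL stands in for each target's TLS specifiers and markTLS is a hypothetical wrapper:

  #include "llvm/BinaryFormat/ELF.h"
  #include "llvm/MC/MCSymbolELF.h"
  #include "llvm/MC/MCValue.h"
  using namespace llvm;

  static void markTLS(const MCValue &Target) {
    // Before: if (auto *S = Target.getSymA())
    //           cast<MCSymbolELF>(S->getSymbol()).setType(ELF::STT_TLS);
    if (auto *SA = Target.getAddSym())
      cast<MCSymbolELF>(SA)->setType(ELF::STT_TLS);
  }
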
--- .../Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp | 4 ++-- .../Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp | 4 ++-- .../LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp | 6 +++--- llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 6 ++---- llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 4 ++-- .../Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp | 4 ++-- 14 files changed, 29 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 4763fdf8deb05..ee230a41242f6 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -128,8 +128,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case AArch64MCExpr::VK_TPREL: case AArch64MCExpr::VK_TLSDESC: case AArch64MCExpr::VK_TLSDESC_AUTH: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index d88d3bdd39378..f9c8f5f820e6f 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -103,8 +103,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case ARMMCExpr::VK_TLSLDM_FDPIC: case ARMMCExpr::VK_TLSLDO: case ARMMCExpr::VK_TPOFF: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 7c0c23a86de8d..73ea49cc0ee92 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -51,8 +51,8 @@ unsigned BPFELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, case FK_Data_8: return ELF::R_BPF_64_ABS64; case FK_Data_4: - if (const MCSymbolRefExpr *A = Target.getSymA()) { - const MCSymbol &Sym = A->getSymbol(); + if (const auto *A = Target.getAddSym()) { + const MCSymbol &Sym = *A; if (Sym.isDefined()) { MCSection &Section = Sym.getSection(); diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp index 6adeb53731fde..af02631d051be 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp @@ -50,8 +50,8 @@ unsigned CSKYELFObjectWriter::getRelocType(MCContext &Ctx, case CSKYMCExpr::VK_TLSGD: case CSKYMCExpr::VK_TLSLDM: case CSKYMCExpr::VK_TLSLDO: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = 
Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index b9e0ea5960f31..039e4c981890d 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -51,8 +51,8 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx, case HexagonMCExpr::VK_IE: case HexagonMCExpr::VK_IE_GOT: case HexagonMCExpr::VK_TPREL: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7af832647f657..ca1eb665132e7 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -64,8 +64,8 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, case LoongArchMCExpr::VK_TLS_LD_PCREL20_S2: case LoongArchMCExpr::VK_TLS_GD_PCREL20_S2: case LoongArchMCExpr::VK_TLS_DESC_PCREL20_S2: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp index 4a08591629e35..192bc5f4ae602 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp @@ -74,8 +74,8 @@ unsigned M68kELFObjectWriter::getRelocType(MCContext &Ctx, case M68kMCExpr::VK_TLSLD: case M68kMCExpr::VK_TLSLDM: case M68kMCExpr::VK_TPOFF: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 526190263fbb8..c7801476a2b61 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -171,8 +171,8 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, case MipsMCExpr::MEK_GOTTPREL: case MipsMCExpr::MEK_TPREL_HI: case MipsMCExpr::MEK_TPREL_LO: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 1964f1b70957d..845c590478e5c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -100,8 +100,8 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, case PPCMCExpr::VK_TPREL_HIGHEST: case PPCMCExpr::VK_TPREL_HIGHESTA: case PPCMCExpr::VK_TPREL_LO: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp 
b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 77a55eceaf68e..586440f407d71 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -50,7 +50,7 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - assert((!Target.getSymA() || + assert((!Target.getAddSym() || Target.getSymSpecifier() == MCSymbolRefExpr::VK_None) && "sym@specifier should have been rejected"); const MCExpr *Expr = Fixup.getValue(); @@ -65,8 +65,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, case RISCVMCExpr::VK_TLS_GOT_HI: case RISCVMCExpr::VK_TLS_GD_HI: case RISCVMCExpr::VK_TLSDESC_HI: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; case RISCVMCExpr::VK_PLTPCREL: case RISCVMCExpr::VK_GOTPCREL: diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index 73c24aa8a60bb..a48dca7be0d28 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -48,11 +48,9 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const { if (!getSubExpr()->evaluateAsRelocatable(AUIPCLoc, nullptr)) return nullptr; - const MCSymbolRefExpr *AUIPCSRE = AUIPCLoc.getSymA(); - if (!AUIPCSRE) + const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym(); + if (!AUIPCSymbol) return nullptr; - - const MCSymbol *AUIPCSymbol = &AUIPCSRE->getSymbol(); const auto *DF = dyn_cast_or_null(AUIPCSymbol->getFragment()); if (!DF) diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 2a39d991e2985..2cb7fa8233949 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -63,8 +63,8 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, case SparcMCExpr::VK_TLS_IE_ADD: case SparcMCExpr::VK_TLS_LE_HIX22: case SparcMCExpr::VK_TLS_LE_LOX10: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index e5c51b0bdba8f..d6e0e15bec358 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -166,8 +166,8 @@ unsigned SystemZELFObjectWriter::getRelocType(MCContext &Ctx, case SystemZMCExpr::VK_TLSLD: case SystemZMCExpr::VK_TLSLDM: case SystemZMCExpr::VK_DTPOFF: - if (auto *S = Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index 5f2e6160075ca..727ed56aac26f 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -45,8 +45,8 @@ unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, case VEMCExpr::VK_TLS_GD_LO32: case VEMCExpr::VK_TPOFF_HI32: case VEMCExpr::VK_TPOFF_LO32: - if (auto *S = 
Target.getSymA()) - cast(S->getSymbol()).setType(ELF::STT_TLS); + if (auto *SA = Target.getAddSym()) + cast(SA)->setType(ELF::STT_TLS); break; default: break; From 46a2f4174a051f29a09dbc3844df763571c67309 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sat, 5 Apr 2025 21:47:14 +0100 Subject: [PATCH 0756/1029] Revert "[LV] Don't add blocks to loop in GeneratedRTChecks (NFC)." This reverts commit 2fd6f8fb5e3a52e901276d97c285b8de66742985. This missed a possible case, causing buildbot failures. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 585caaffa63da..cc6fd790bc437 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2082,6 +2082,9 @@ class GeneratedRTChecks { auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock); + // Create new preheader for vector loop. + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI); SCEVCheckBlock->getTerminator()->eraseFromParent(); SCEVCheckBlock->moveBefore(LoopVectorPreHeader); @@ -2119,6 +2122,9 @@ class GeneratedRTChecks { DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); MemCheckBlock->moveBefore(LoopVectorPreHeader); + if (OuterLoop) + OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI); + BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond); if (AddBranchWeights) { From 7cf8a6201a6eb549b8d41214afa2694e0c1e344c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:51:45 -0700 Subject: [PATCH 0757/1029] [AArch64,MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. --- .../AArch64/AsmParser/AArch64AsmParser.cpp | 4 ++-- .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 2 +- .../MCTargetDesc/AArch64MachObjectWriter.cpp | 19 ++++++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 8b8c5a22b829c..c3baec41cbedf 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -8220,10 +8220,10 @@ bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, // Treat expressions with an ELFSpec (like ":abs_g1:3", or // ":abs_g1:x" where x is constant) as symbolic even if there is no symbol. 
- if (!Res.getSymA() && ELFSpec == AArch64MCExpr::VK_INVALID) + if (!Res.getAddSym() && ELFSpec == AArch64MCExpr::VK_INVALID) return false; - if (Res.getSymA()) + if (Res.getAddSym()) DarwinSpec = AArch64MCExpr::Specifier(Res.getSymSpecifier()); Addend = Res.getConstant(); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index ee230a41242f6..b03c55cafdcdf 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -116,7 +116,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, AArch64MCExpr::Specifier SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); bool IsNC = AArch64MCExpr::isNotChecked(RefKind); - assert((!Target.getSymA() || + assert((!Target.getAddSym() || Target.getSymSpecifier() == AArch64MCExpr::None || Target.getSymSpecifier() == AArch64MCExpr::VK_PLT || Target.getSymSpecifier() == AArch64MCExpr::VK_GOTPCREL) && diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index bc967e3e8a6e8..6292203ce8401 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -34,8 +34,8 @@ namespace { class AArch64MachObjectWriter : public MCMachObjectTargetWriter { bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, - const MCSymbolRefExpr *Sym, - unsigned &Log2Size, const MCAssembler &Asm); + AArch64MCExpr::Specifier Spec, + unsigned &Log2Size, const MCAssembler &Asm); public: AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype, bool IsILP32) @@ -49,7 +49,7 @@ class AArch64MachObjectWriter : public MCMachObjectTargetWriter { } // end anonymous namespace bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( - const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, + const MCFixup &Fixup, unsigned &RelocType, AArch64MCExpr::Specifier Spec, unsigned &Log2Size, const MCAssembler &Asm) { RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); Log2Size = ~0U; @@ -66,12 +66,12 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( return true; case FK_Data_4: Log2Size = Log2_32(4); - if (getSpecifier(Sym) == AArch64MCExpr::M_GOT) + if (Spec == AArch64MCExpr::M_GOT) RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); return true; case FK_Data_8: Log2Size = Log2_32(8); - if (getSpecifier(Sym) == AArch64MCExpr::M_GOT) + if (Spec == AArch64MCExpr::M_GOT) RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); return true; case AArch64::fixup_aarch64_add_imm12: @@ -81,7 +81,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( case AArch64::fixup_aarch64_ldst_imm12_scale8: case AArch64::fixup_aarch64_ldst_imm12_scale16: Log2Size = Log2_32(4); - switch (AArch64MCExpr::Specifier(getSpecifier(Sym))) { + switch (Spec) { default: return false; case AArch64MCExpr::M_PAGEOFF: @@ -97,7 +97,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( case AArch64::fixup_aarch64_pcrel_adrp_imm21: Log2Size = Log2_32(4); // This encompasses the relocation for the whole 21-bit value. 
- switch (getSpecifier(Sym)) { + switch (Spec) { default: Asm.getContext().reportError(Fixup.getLoc(), "ADR/ADRP relocations must be GOT relative"); @@ -191,8 +191,9 @@ void AArch64MachObjectWriter::recordRelocation( return; } - if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { + if (!getAArch64FixupKindMachOInfo( + Fixup, Type, AArch64MCExpr::Specifier(Target.getSymSpecifier()), + Log2Size, Asm)) { Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); return; }
From 70f5632cadb82b1be813d7ba688b80e22df634e2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:53:38 -0700 Subject: [PATCH 0758/1029] [PowerPC,MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. --- llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 3063e1e380fd6..5d6999dbcf1b6 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -169,18 +169,18 @@ class PPCAsmBackend : public MCAsmBackend { // If the target symbol has a local entry point we must not attempt // to resolve the fixup directly. Emit a relocation and leave // resolution of the final target address to the linker. - if (const MCSymbolRefExpr *A = Target.getSymA()) { - if (const auto *S = dyn_cast<MCSymbolELF>(&A->getSymbol())) { + if (const auto *A = Target.getAddSym()) { + if (const auto *S = dyn_cast<MCSymbolELF>(A)) { // The "other" values are stored in the last 6 bits of the second // byte. The traditional defines for STO values assume the full byte // and thus the shift to pack it. unsigned Other = S->getOther() << 2; if ((Other & ELF::STO_PPC64_LOCAL_MASK) != 0) return true; - } else if (const auto *S = dyn_cast<MCSymbolXCOFF>(&A->getSymbol())) { + } else if (const auto *S = dyn_cast<MCSymbolXCOFF>(A)) { return !Target.isAbsolute() && S->isExternal() && S->getStorageClass() == XCOFF::C_WEAKEXT; - } + } } return false; }
From aeec94500a5dbd576e5d2d16895fe00fa0b1e154 Mon Sep 17 00:00:00 2001 From: junfengd-nv Date: Sat, 5 Apr 2025 13:56:55 -0700 Subject: [PATCH 0759/1029] [mlir][inliner] Add doClone and canHandleMultipleBlocks callbacks to Inliner Config (#131226) The current inliner disables inlining when the caller is in a region with the SingleBlock trait while the callee function contains multiple blocks. The SingleBlock trait is used in operations such as do/while loops, for example fir.do_loop, fir.iterate_while, and fir.if. Typically, calls within loops are good candidates for inlining. However, functions with multiple blocks are also common; for example, any function with "if () then return" will result in multiple blocks in MLIR. This change gives a customized inliner the flexibility to handle such cases. doClone: clones instructions and other information from the callee function into the caller function. canHandleMultipleBlocks: checks if functions with multiple blocks can be inlined into a region with the SingleBlock trait. The default behavior of the inliner remains unchanged.
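A rough usage sketch (not code in this patch): a downstream compiler opts in by installing both hooks. The body below repeats the default callback from Inliner.h, and the single-block rewrite a real client performs afterwards is only indicated by the trailing comment; makeConfig is a hypothetical helper:

  #include "mlir/Transforms/Inliner.h"
  using namespace mlir;

  InlinerConfig makeConfig() {
    InlinerConfig config;
    config.setCanHandleMultipleBlocks();
    config.setCloneCallback([](OpBuilder &builder, Region *src,
                               Block *inlineBlock, Block *postInsertBlock,
                               IRMapping &mapper,
                               bool shouldCloneInlinedRegion) {
      // Clone or splice the callee blocks after the split point, exactly
      // as the default callback does. builder is available for clients
      // that need to create replacement operations.
      Region *insertRegion = inlineBlock->getParent();
      if (shouldCloneInlinedRegion)
        src->cloneInto(insertRegion, postInsertBlock->getIterator(), mapper);
      else
        insertRegion->getBlocks().splice(postInsertBlock->getIterator(),
                                         src->getBlocks(), src->begin(),
                                         src->end());
      // With canHandleMultipleBlocks set, a real client would now fold the
      // inlined blocks into a single-block form, e.g. by wrapping them in
      // an scf.execute_region as TestInliningCallback.cpp demonstrates.
    });
    return config;
  }
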
--------- Co-authored-by: jeanPerier Co-authored-by: Mehdi Amini --- mlir/include/mlir/Transforms/Inliner.h | 36 +++++ mlir/include/mlir/Transforms/InliningUtils.h | 61 ++++--- mlir/lib/Transforms/Utils/Inliner.cpp | 31 ++-- mlir/lib/Transforms/Utils/InliningUtils.cpp | 116 +++++++------- .../Transforms/test-inlining-callback.mlir | 24 +++ mlir/test/lib/Transforms/CMakeLists.txt | 1 + mlir/test/lib/Transforms/TestInlining.cpp | 14 +- .../lib/Transforms/TestInliningCallback.cpp | 151 ++++++++++++++++++ mlir/tools/mlir-opt/mlir-opt.cpp | 2 + 9 files changed, 333 insertions(+), 103 deletions(-) create mode 100644 mlir/test/Transforms/test-inlining-callback.mlir create mode 100644 mlir/test/lib/Transforms/TestInliningCallback.cpp diff --git a/mlir/include/mlir/Transforms/Inliner.h b/mlir/include/mlir/Transforms/Inliner.h index ec77319d6ac88..506b4455af646 100644 --- a/mlir/include/mlir/Transforms/Inliner.h +++ b/mlir/include/mlir/Transforms/Inliner.h @@ -27,6 +27,11 @@ class InlinerConfig { public: using DefaultPipelineTy = std::function; using OpPipelinesTy = llvm::StringMap; + using CloneCallbackSigTy = void(OpBuilder &builder, Region *src, + Block *inlineBlock, Block *postInsertBlock, + IRMapping &mapper, + bool shouldCloneInlinedRegion); + using CloneCallbackTy = std::function; InlinerConfig() = default; InlinerConfig(DefaultPipelineTy defaultPipeline, @@ -39,6 +44,9 @@ class InlinerConfig { } const OpPipelinesTy &getOpPipelines() const { return opPipelines; } unsigned getMaxInliningIterations() const { return maxInliningIterations; } + const CloneCallbackTy &getCloneCallback() const { return cloneCallback; } + bool getCanHandleMultipleBlocks() const { return canHandleMultipleBlocks; } + void setDefaultPipeline(DefaultPipelineTy pipeline) { defaultPipeline = std::move(pipeline); } @@ -46,6 +54,12 @@ class InlinerConfig { opPipelines = std::move(pipelines); } void setMaxInliningIterations(unsigned max) { maxInliningIterations = max; } + void setCloneCallback(CloneCallbackTy callback) { + cloneCallback = std::move(callback); + } + void setCanHandleMultipleBlocks(bool value = true) { + canHandleMultipleBlocks = value; + } private: /// An optional function that constructs an optimization pipeline for @@ -60,6 +74,28 @@ class InlinerConfig { /// For SCC-based inlining algorithms, specifies maximum number of iterations /// when inlining within an SCC. unsigned maxInliningIterations{0}; + /// Callback for cloning operations during inlining + CloneCallbackTy cloneCallback = [](OpBuilder &builder, Region *src, + Block *inlineBlock, Block *postInsertBlock, + IRMapping &mapper, + bool shouldCloneInlinedRegion) { + // Check to see if the region is being cloned, or moved inline. In + // either case, move the new blocks after the 'insertBlock' to improve + // IR readability. + Region *insertRegion = inlineBlock->getParent(); + if (shouldCloneInlinedRegion) + src->cloneInto(insertRegion, postInsertBlock->getIterator(), mapper); + else + insertRegion->getBlocks().splice(postInsertBlock->getIterator(), + src->getBlocks(), src->begin(), + src->end()); + }; + /// Determine if the inliner can inline a function containing multiple + /// blocks into a region that requires a single block. By default, it is + /// not allowed. If it is true, cloneCallback should perform the extra + /// transformation. 
see the example in + /// mlir/test/lib/Transforms/TestInliningCallback.cpp + bool canHandleMultipleBlocks{false}; }; /// This is an implementation of the inliner diff --git a/mlir/include/mlir/Transforms/InliningUtils.h b/mlir/include/mlir/Transforms/InliningUtils.h index becfe9b047ef4..552030983d724 100644 --- a/mlir/include/mlir/Transforms/InliningUtils.h +++ b/mlir/include/mlir/Transforms/InliningUtils.h @@ -18,6 +18,7 @@ #include "mlir/IR/Location.h" #include "mlir/IR/Region.h" #include "mlir/IR/ValueRange.h" +#include "mlir/Transforms/Inliner.h" #include namespace mlir { @@ -253,33 +254,39 @@ class InlinerInterface /// provided, will be used to update the inlined operations' location /// information. 'shouldCloneInlinedRegion' corresponds to whether the source /// region should be cloned into the 'inlinePoint' or spliced directly. -LogicalResult inlineRegion(InlinerInterface &interface, Region *src, - Operation *inlinePoint, IRMapping &mapper, - ValueRange resultsToReplace, - TypeRange regionResultTypes, - std::optional inlineLoc = std::nullopt, - bool shouldCloneInlinedRegion = true); -LogicalResult inlineRegion(InlinerInterface &interface, Region *src, - Block *inlineBlock, Block::iterator inlinePoint, - IRMapping &mapper, ValueRange resultsToReplace, - TypeRange regionResultTypes, - std::optional inlineLoc = std::nullopt, - bool shouldCloneInlinedRegion = true); +LogicalResult +inlineRegion(InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Operation *inlinePoint, IRMapping &mapper, + ValueRange resultsToReplace, TypeRange regionResultTypes, + std::optional inlineLoc = std::nullopt, + bool shouldCloneInlinedRegion = true); +LogicalResult +inlineRegion(InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + IRMapping &mapper, ValueRange resultsToReplace, + TypeRange regionResultTypes, + std::optional inlineLoc = std::nullopt, + bool shouldCloneInlinedRegion = true); /// This function is an overload of the above 'inlineRegion' that allows for /// providing the set of operands ('inlinedOperands') that should be used /// in-favor of the region arguments when inlining. -LogicalResult inlineRegion(InlinerInterface &interface, Region *src, - Operation *inlinePoint, ValueRange inlinedOperands, - ValueRange resultsToReplace, - std::optional inlineLoc = std::nullopt, - bool shouldCloneInlinedRegion = true); -LogicalResult inlineRegion(InlinerInterface &interface, Region *src, - Block *inlineBlock, Block::iterator inlinePoint, - ValueRange inlinedOperands, - ValueRange resultsToReplace, - std::optional inlineLoc = std::nullopt, - bool shouldCloneInlinedRegion = true); +LogicalResult +inlineRegion(InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Operation *inlinePoint, ValueRange inlinedOperands, + ValueRange resultsToReplace, + std::optional inlineLoc = std::nullopt, + bool shouldCloneInlinedRegion = true); +LogicalResult +inlineRegion(InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + ValueRange inlinedOperands, ValueRange resultsToReplace, + std::optional inlineLoc = std::nullopt, + bool shouldCloneInlinedRegion = true); /// This function inlines a given region, 'src', of a callable operation, /// 'callable', into the location defined by the given call operation. This @@ -287,9 +294,11 @@ LogicalResult inlineRegion(InlinerInterface &interface, Region *src, /// failure, no changes are made to the module. 
'shouldCloneInlinedRegion' /// corresponds to whether the source region should be cloned into the 'call' or /// spliced directly. -LogicalResult inlineCall(InlinerInterface &interface, CallOpInterface call, - CallableOpInterface callable, Region *src, - bool shouldCloneInlinedRegion = true); +LogicalResult +inlineCall(InlinerInterface &interface, + function_ref cloneCallback, + CallOpInterface call, CallableOpInterface callable, Region *src, + bool shouldCloneInlinedRegion = true); } // namespace mlir diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp index f511504594cfa..54b5c788a3526 100644 --- a/mlir/lib/Transforms/Utils/Inliner.cpp +++ b/mlir/lib/Transforms/Utils/Inliner.cpp @@ -652,7 +652,7 @@ Inliner::Impl::inlineCallsInSCC(InlinerInterfaceImpl &inlinerIface, bool inlineInPlace = useList.hasOneUseAndDiscardable(it.targetNode); LogicalResult inlineResult = - inlineCall(inlinerIface, call, + inlineCall(inlinerIface, inliner.config.getCloneCallback(), call, cast(targetRegion->getParentOp()), targetRegion, /*shouldCloneInlinedRegion=*/!inlineInPlace); if (failed(inlineResult)) { @@ -730,19 +730,22 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) { // Don't allow inlining if the callee has multiple blocks (unstructured // control flow) but we cannot be sure that the caller region supports that. - bool calleeHasMultipleBlocks = - llvm::hasNItemsOrMore(*callableRegion, /*N=*/2); - // If both parent ops have the same type, it is safe to inline. Otherwise, - // decide based on whether the op has the SingleBlock trait or not. - // Note: This check does currently not account for SizedRegion/MaxSizedRegion. - auto callerRegionSupportsMultipleBlocks = [&]() { - return callableRegion->getParentOp()->getName() == - resolvedCall.call->getParentOp()->getName() || - !resolvedCall.call->getParentOp() - ->mightHaveTrait(); - }; - if (calleeHasMultipleBlocks && !callerRegionSupportsMultipleBlocks()) - return false; + if (!inliner.config.getCanHandleMultipleBlocks()) { + bool calleeHasMultipleBlocks = + llvm::hasNItemsOrMore(*callableRegion, /*N=*/2); + // If both parent ops have the same type, it is safe to inline. Otherwise, + // decide based on whether the op has the SingleBlock trait or not. + // Note: This check does currently not account for + // SizedRegion/MaxSizedRegion. 
+ auto callerRegionSupportsMultipleBlocks = [&]() { + return callableRegion->getParentOp()->getName() == + resolvedCall.call->getParentOp()->getName() || + !resolvedCall.call->getParentOp() + ->mightHaveTrait(); + }; + if (calleeHasMultipleBlocks && !callerRegionSupportsMultipleBlocks()) + return false; + } if (!inliner.isProfitableToInline(resolvedCall)) return false; diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp index e113389b26ae7..3dd95d2845715 100644 --- a/mlir/lib/Transforms/Utils/InliningUtils.cpp +++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Transforms/InliningUtils.h" +#include "mlir/Transforms/Inliner.h" #include "mlir/IR/Builders.h" #include "mlir/IR/IRMapping.h" @@ -266,10 +267,11 @@ static void handleResultImpl(InlinerInterface &interface, OpBuilder &builder, } static LogicalResult -inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock, - Block::iterator inlinePoint, IRMapping &mapper, - ValueRange resultsToReplace, TypeRange regionResultTypes, - std::optional inlineLoc, +inlineRegionImpl(InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + IRMapping &mapper, ValueRange resultsToReplace, + TypeRange regionResultTypes, std::optional inlineLoc, bool shouldCloneInlinedRegion, CallOpInterface call = {}) { assert(resultsToReplace.size() == regionResultTypes.size()); // We expect the region to have at least one block. @@ -296,16 +298,10 @@ inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock, if (call && callable) handleArgumentImpl(interface, builder, call, callable, mapper); - // Check to see if the region is being cloned, or moved inline. In either - // case, move the new blocks after the 'insertBlock' to improve IR - // readability. + // Clone the callee's source into the caller. Block *postInsertBlock = inlineBlock->splitBlock(inlinePoint); - if (shouldCloneInlinedRegion) - src->cloneInto(insertRegion, postInsertBlock->getIterator(), mapper); - else - insertRegion->getBlocks().splice(postInsertBlock->getIterator(), - src->getBlocks(), src->begin(), - src->end()); + cloneCallback(builder, src, inlineBlock, postInsertBlock, mapper, + shouldCloneInlinedRegion); // Get the range of newly inserted blocks. auto newBlocks = llvm::make_range(std::next(inlineBlock->getIterator()), @@ -374,9 +370,11 @@ inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock, } static LogicalResult -inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock, - Block::iterator inlinePoint, ValueRange inlinedOperands, - ValueRange resultsToReplace, std::optional inlineLoc, +inlineRegionImpl(InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + ValueRange inlinedOperands, ValueRange resultsToReplace, + std::optional inlineLoc, bool shouldCloneInlinedRegion, CallOpInterface call = {}) { // We expect the region to have at least one block. if (src->empty()) @@ -398,53 +396,54 @@ inlineRegionImpl(InlinerInterface &interface, Region *src, Block *inlineBlock, } // Call into the main region inliner function. 
- return inlineRegionImpl(interface, src, inlineBlock, inlinePoint, mapper, - resultsToReplace, resultsToReplace.getTypes(), - inlineLoc, shouldCloneInlinedRegion, call); + return inlineRegionImpl(interface, cloneCallback, src, inlineBlock, + inlinePoint, mapper, resultsToReplace, + resultsToReplace.getTypes(), inlineLoc, + shouldCloneInlinedRegion, call); } -LogicalResult mlir::inlineRegion(InlinerInterface &interface, Region *src, - Operation *inlinePoint, IRMapping &mapper, - ValueRange resultsToReplace, - TypeRange regionResultTypes, - std::optional inlineLoc, - bool shouldCloneInlinedRegion) { - return inlineRegion(interface, src, inlinePoint->getBlock(), +LogicalResult mlir::inlineRegion( + InlinerInterface &interface, + function_ref cloneCallback, Region *src, + Operation *inlinePoint, IRMapping &mapper, ValueRange resultsToReplace, + TypeRange regionResultTypes, std::optional inlineLoc, + bool shouldCloneInlinedRegion) { + return inlineRegion(interface, cloneCallback, src, inlinePoint->getBlock(), ++inlinePoint->getIterator(), mapper, resultsToReplace, regionResultTypes, inlineLoc, shouldCloneInlinedRegion); } -LogicalResult mlir::inlineRegion(InlinerInterface &interface, Region *src, - Block *inlineBlock, - Block::iterator inlinePoint, IRMapping &mapper, - ValueRange resultsToReplace, - TypeRange regionResultTypes, - std::optional inlineLoc, - bool shouldCloneInlinedRegion) { - return inlineRegionImpl(interface, src, inlineBlock, inlinePoint, mapper, - resultsToReplace, regionResultTypes, inlineLoc, - shouldCloneInlinedRegion); + +LogicalResult mlir::inlineRegion( + InlinerInterface &interface, + function_ref cloneCallback, Region *src, + Block *inlineBlock, Block::iterator inlinePoint, IRMapping &mapper, + ValueRange resultsToReplace, TypeRange regionResultTypes, + std::optional inlineLoc, bool shouldCloneInlinedRegion) { + return inlineRegionImpl( + interface, cloneCallback, src, inlineBlock, inlinePoint, mapper, + resultsToReplace, regionResultTypes, inlineLoc, shouldCloneInlinedRegion); } -LogicalResult mlir::inlineRegion(InlinerInterface &interface, Region *src, - Operation *inlinePoint, - ValueRange inlinedOperands, - ValueRange resultsToReplace, - std::optional inlineLoc, - bool shouldCloneInlinedRegion) { - return inlineRegion(interface, src, inlinePoint->getBlock(), +LogicalResult mlir::inlineRegion( + InlinerInterface &interface, + function_ref cloneCallback, Region *src, + Operation *inlinePoint, ValueRange inlinedOperands, + ValueRange resultsToReplace, std::optional inlineLoc, + bool shouldCloneInlinedRegion) { + return inlineRegion(interface, cloneCallback, src, inlinePoint->getBlock(), ++inlinePoint->getIterator(), inlinedOperands, resultsToReplace, inlineLoc, shouldCloneInlinedRegion); } -LogicalResult mlir::inlineRegion(InlinerInterface &interface, Region *src, - Block *inlineBlock, - Block::iterator inlinePoint, - ValueRange inlinedOperands, - ValueRange resultsToReplace, - std::optional inlineLoc, - bool shouldCloneInlinedRegion) { - return inlineRegionImpl(interface, src, inlineBlock, inlinePoint, - inlinedOperands, resultsToReplace, inlineLoc, - shouldCloneInlinedRegion); + +LogicalResult mlir::inlineRegion( + InlinerInterface &interface, + function_ref cloneCallback, Region *src, + Block *inlineBlock, Block::iterator inlinePoint, ValueRange inlinedOperands, + ValueRange resultsToReplace, std::optional inlineLoc, + bool shouldCloneInlinedRegion) { + return inlineRegionImpl(interface, cloneCallback, src, inlineBlock, + inlinePoint, inlinedOperands, 
resultsToReplace, + inlineLoc, shouldCloneInlinedRegion); } /// Utility function used to generate a cast operation from the given interface, @@ -475,10 +474,11 @@ static Value materializeConversion(const DialectInlinerInterface *interface, /// failure, no changes are made to the module. 'shouldCloneInlinedRegion' /// corresponds to whether the source region should be cloned into the 'call' or /// spliced directly. -LogicalResult mlir::inlineCall(InlinerInterface &interface, - CallOpInterface call, - CallableOpInterface callable, Region *src, - bool shouldCloneInlinedRegion) { +LogicalResult +mlir::inlineCall(InlinerInterface &interface, + function_ref cloneCallback, + CallOpInterface call, CallableOpInterface callable, + Region *src, bool shouldCloneInlinedRegion) { // We expect the region to have at least one block. if (src->empty()) return failure(); @@ -552,7 +552,7 @@ LogicalResult mlir::inlineCall(InlinerInterface &interface, return cleanupState(); // Attempt to inline the call. - if (failed(inlineRegionImpl(interface, src, call->getBlock(), + if (failed(inlineRegionImpl(interface, cloneCallback, src, call->getBlock(), ++call->getIterator(), mapper, callResults, callableResultTypes, call.getLoc(), shouldCloneInlinedRegion, call))) diff --git a/mlir/test/Transforms/test-inlining-callback.mlir b/mlir/test/Transforms/test-inlining-callback.mlir new file mode 100644 index 0000000000000..c012c31e7e490 --- /dev/null +++ b/mlir/test/Transforms/test-inlining-callback.mlir @@ -0,0 +1,24 @@ +// RUN: mlir-opt -allow-unregistered-dialect %s -test-inline-callback | FileCheck %s + +// Test inlining with multiple blocks and scf.execute_region transformation +// CHECK-LABEL: func @test_inline_multiple_blocks +func.func @test_inline_multiple_blocks(%arg0: i32) -> i32 { + // CHECK: %[[RES:.*]] = scf.execute_region -> i32 + // CHECK-NEXT: %[[ADD1:.*]] = arith.addi %arg0, %arg0 + // CHECK-NEXT: cf.br ^bb1(%[[ADD1]] : i32) + // CHECK: ^bb1(%[[ARG:.*]]: i32): + // CHECK-NEXT: %[[ADD2:.*]] = arith.addi %[[ARG]], %[[ARG]] + // CHECK-NEXT: scf.yield %[[ADD2]] + // CHECK: return %[[RES]] + %fn = "test.functional_region_op"() ({ + ^bb0(%a : i32): + %b = arith.addi %a, %a : i32 + cf.br ^bb1(%b: i32) + ^bb1(%c: i32): + %d = arith.addi %c, %c : i32 + "test.return"(%d) : (i32) -> () + }) : () -> ((i32) -> i32) + + %0 = call_indirect %fn(%arg0) : (i32) -> i32 + return %0 : i32 +} diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index c053fd4b20473..76041cd6cd791 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -29,6 +29,7 @@ add_mlir_library(MLIRTestTransforms TestConstantFold.cpp TestControlFlowSink.cpp TestInlining.cpp + TestInliningCallback.cpp TestMakeIsolatedFromAbove.cpp TestTransformsOps.cpp ${MLIRTestTransformsPDLSrc} diff --git a/mlir/test/lib/Transforms/TestInlining.cpp b/mlir/test/lib/Transforms/TestInlining.cpp index 223cc78dd1e21..ae904a92a5d68 100644 --- a/mlir/test/lib/Transforms/TestInlining.cpp +++ b/mlir/test/lib/Transforms/TestInlining.cpp @@ -18,6 +18,7 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/IRMapping.h" #include "mlir/Pass/Pass.h" +#include "mlir/Transforms/Inliner.h" #include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/StringSet.h" @@ -25,8 +26,9 @@ using namespace mlir; using namespace test; namespace { -struct Inliner : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(Inliner) +struct InlinerTest + : public PassWrapper> { + 
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InlinerTest) StringRef getArgument() const final { return "test-inline"; } StringRef getDescription() const final { @@ -34,6 +36,8 @@ struct Inliner : public PassWrapper> { } void runOnOperation() override { + InlinerConfig config; + auto function = getOperation(); // Collect each of the direct function calls within the module. @@ -54,8 +58,8 @@ struct Inliner : public PassWrapper> { // Inline the functional region operation, but only clone the internal // region if there is more than one use. if (failed(inlineRegion( - interface, &callee.getBody(), caller, caller.getArgOperands(), - caller.getResults(), caller.getLoc(), + interface, config.getCloneCallback(), &callee.getBody(), caller, + caller.getArgOperands(), caller.getResults(), caller.getLoc(), /*shouldCloneInlinedRegion=*/!callee.getResult().hasOneUse()))) continue; @@ -71,6 +75,6 @@ struct Inliner : public PassWrapper> { namespace mlir { namespace test { -void registerInliner() { PassRegistration(); } +void registerInliner() { PassRegistration(); } } // namespace test } // namespace mlir diff --git a/mlir/test/lib/Transforms/TestInliningCallback.cpp b/mlir/test/lib/Transforms/TestInliningCallback.cpp new file mode 100644 index 0000000000000..012d62b7b1b42 --- /dev/null +++ b/mlir/test/lib/Transforms/TestInliningCallback.cpp @@ -0,0 +1,151 @@ +//===- TestInliningCallback.cpp - Pass to inline calls in the test dialect +//--------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a pass to test inlining callbacks including +// canHandleMultipleBlocks and doClone. 
+//===----------------------------------------------------------------------===// + +#include "TestDialect.h" +#include "TestOps.h" +#include "mlir/Analysis/CallGraph.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/Inliner.h" +#include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/StringSet.h" + +using namespace mlir; +using namespace test; + +namespace { +struct InlinerCallback + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(InlinerCallback) + + StringRef getArgument() const final { return "test-inline-callback"; } + StringRef getDescription() const final { + return "Test inlining region calls with call back functions"; + } + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + static LogicalResult runPipelineHelper(Pass &pass, OpPassManager &pipeline, + Operation *op) { + return mlir::cast(pass).runPipeline(pipeline, op); + } + + // Customize the implementation of Inliner::doClone + // Wrap the callee into scf.execute_region operation + static void testDoClone(OpBuilder &builder, Region *src, Block *inlineBlock, + Block *postInsertBlock, IRMapping &mapper, + bool shouldCloneInlinedRegion) { + // Create a new scf.execute_region operation + mlir::Operation &call = inlineBlock->back(); + builder.setInsertionPointAfter(&call); + + auto executeRegionOp = builder.create( + call.getLoc(), call.getResultTypes()); + mlir::Region ®ion = executeRegionOp.getRegion(); + + // Move the inlined blocks into the region + src->cloneInto(®ion, mapper); + + // Split block before scf operation. + Block *continueBlock = + inlineBlock->splitBlock(executeRegionOp.getOperation()); + + // Replace all test.return with scf.yield + for (mlir::Block &block : region) { + + for (mlir::Operation &op : llvm::make_early_inc_range(block)) { + if (test::TestReturnOp returnOp = + llvm::dyn_cast(&op)) { + mlir::OpBuilder returnBuilder(returnOp); + returnBuilder.create(returnOp.getLoc(), + returnOp.getOperands()); + returnOp.erase(); + } + } + } + + // Add test.return after scf.execute_region + builder.setInsertionPointAfter(executeRegionOp); + builder.create(executeRegionOp.getLoc(), + executeRegionOp.getResults()); + } + + void runOnOperation() override { + InlinerConfig config; + CallGraph &cg = getAnalysis(); + + func::FuncOp function = getOperation(); + + // By default, assume that any inlining is profitable. + auto profitabilityCb = [&](const mlir::Inliner::ResolvedCall &call) { + return true; + }; + + // Set the clone callback in the config + config.setCloneCallback([](OpBuilder &builder, Region *src, + Block *inlineBlock, Block *postInsertBlock, + IRMapping &mapper, + bool shouldCloneInlinedRegion) { + return testDoClone(builder, src, inlineBlock, postInsertBlock, mapper, + shouldCloneInlinedRegion); + }); + + // Set canHandleMultipleBlocks to true in the config + config.setCanHandleMultipleBlocks(); + + // Get an instance of the inliner. + Inliner inliner(function, cg, *this, getAnalysisManager(), + runPipelineHelper, config, profitabilityCb); + + // Collect each of the direct function calls within the module. + SmallVector callers; + function.walk( + [&](func::CallIndirectOp caller) { callers.push_back(caller); }); + + // Build the inliner interface. + InlinerInterface interface(&getContext()); + + // Try to inline each of the call operations. 
+ for (auto caller : callers) { + auto callee = dyn_cast_or_null( + caller.getCallee().getDefiningOp()); + if (!callee) + continue; + + // Inline the functional region operation, but only clone the internal + // region if there is more than one use. + if (failed(inlineRegion( + interface, config.getCloneCallback(), &callee.getBody(), caller, + caller.getArgOperands(), caller.getResults(), caller.getLoc(), + /*shouldCloneInlinedRegion=*/!callee.getResult().hasOneUse()))) + continue; + + // If the inlining was successful then erase the call and callee if + // possible. + caller.erase(); + if (callee.use_empty()) + callee.erase(); + } + } +}; +} // namespace + +namespace mlir { +namespace test { +void registerInlinerCallback() { PassRegistration(); } +} // namespace test +} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index d06ff8070e7cf..ca4706e96787f 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -73,6 +73,7 @@ void registerCommutativityUtils(); void registerConvertCallOpPass(); void registerConvertFuncOpPass(); void registerInliner(); +void registerInlinerCallback(); void registerMemRefBoundCheck(); void registerPatternsTestPass(); void registerSimpleParametricTilingPass(); @@ -215,6 +216,7 @@ void registerTestPasses() { mlir::test::registerConvertCallOpPass(); mlir::test::registerConvertFuncOpPass(); mlir::test::registerInliner(); + mlir::test::registerInlinerCallback(); mlir::test::registerMemRefBoundCheck(); mlir::test::registerPatternsTestPass(); mlir::test::registerSimpleParametricTilingPass(); From 7e62715e0cd433ed97749549c6582c4e1aa689a3 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 13:58:16 -0700 Subject: [PATCH 0760/1029] [RISCV,LoongArch,MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. Creating a MCSymbolRefExpr in *AsmBackend::handleAddSubRelocations is not efficient, but it is temporary and will be replaced when MCValue no longer uses MCSymbolRefExpr. 
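As an illustrative sketch of that temporary bridge (`makeAddend` and `Ctx` are invented names, not code from this patch):

```cpp
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCValue.h"
using namespace llvm;

// Old form: MCValue::get(Target.getSymA(), nullptr, Target.getConstant()).
// New form: MCValue hands out MCSymbol, so a temporary MCSymbolRefExpr
// wraps it until MCValue stops using MCSymbolRefExpr internally.
static MCValue makeAddend(const MCValue &Target, MCContext &Ctx) {
  return MCValue::get(MCSymbolRefExpr::create(Target.getAddSym(), Ctx),
                      nullptr, Target.getConstant());
}
```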
--- .../Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 4 +++- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 2 +- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index adeb7455ad616..c39482094db12 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -497,7 +497,9 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, default: llvm_unreachable("unsupported fixup size"); } - MCValue A = MCValue::get(Target.getSymA(), nullptr, Target.getConstant()); + MCValue A = MCValue::get( + MCSymbolRefExpr::create(Target.getAddSym(), Asm.getContext()), nullptr, + Target.getConstant()); MCValue B = MCValue::get( MCSymbolRefExpr::create(Target.getSubSym(), Asm.getContext())); auto FA = MCFixup::create(Fixup.getOffset(), nullptr, std::get<0>(FK)); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index a8922c3f9f2e8..d23ddb918e7f9 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2805,7 +2805,7 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr, bool RISCVAsmParser::isSymbolDiff(const MCExpr *Expr) { MCValue Res; if (Expr->evaluateAsRelocatable(Res, nullptr)) { - return Res.getRefKind() == RISCVMCExpr::VK_None && Res.getSymA() && + return Res.getRefKind() == RISCVMCExpr::VK_None && Res.getAddSym() && Res.getSubSym(); } return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index f208618814142..d245b15a39210 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -599,7 +599,7 @@ bool RISCVAsmBackend::evaluateTargetFixup(const MCAssembler &Asm, } } - if (!AUIPCTarget.getSymA()) + if (!AUIPCTarget.getAddSym()) return false; const MCSymbolELF &SA = cast(*AUIPCTarget.getAddSym()); @@ -656,7 +656,9 @@ bool RISCVAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, default: llvm_unreachable("unsupported fixup size"); } - MCValue A = MCValue::get(Target.getSymA(), nullptr, Target.getConstant()); + MCValue A = MCValue::get( + MCSymbolRefExpr::create(Target.getAddSym(), Asm.getContext()), nullptr, + Target.getConstant()); MCValue B = MCValue::get( MCSymbolRefExpr::create(Target.getSubSym(), Asm.getContext())); auto FA = MCFixup::create( From acca419685b0a288e0b06786e6e7bc27bb087578 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 14:01:05 -0700 Subject: [PATCH 0761/1029] [WebAssembly,MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. 
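For illustration, a reduced sketch of the rewritten accessor pattern (the helper name is invented; the real logic is the getRelocType switch below):

```cpp
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/MCValue.h"
using namespace llvm;

// The symbol now comes straight from MCValue, and the specifier is read
// via getSymSpecifier() instead of getAccessVariant().
static bool isTableBranchRel(const MCValue &Target) {
  auto &SymA = cast<MCSymbolWasm>(*Target.getAddSym());
  return SymA.isFunction() &&
         Target.getSymSpecifier() == MCSymbolRefExpr::VK_WASM_TBREL;
}
```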
--- .../WebAssemblyWasmObjectWriter.cpp | 56 +++++++++---------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 941422891832d..7fcd2ec0f64db 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -65,36 +65,32 @@ static const MCSection *getTargetSection(const MCExpr *Expr) { unsigned WebAssemblyWasmObjectWriter::getRelocType( const MCValue &Target, const MCFixup &Fixup, const MCSectionWasm &FixupSection, bool IsLocRel) const { - const MCSymbolRefExpr *RefA = Target.getSymA(); - assert(RefA); - auto& SymA = cast(RefA->getSymbol()); - - auto Modifier = Target.getAccessVariant(); - - switch (Modifier) { - case MCSymbolRefExpr::VK_GOT: - case MCSymbolRefExpr::VK_WASM_GOT_TLS: - return wasm::R_WASM_GLOBAL_INDEX_LEB; - case MCSymbolRefExpr::VK_WASM_TBREL: - assert(SymA.isFunction()); - return is64Bit() ? wasm::R_WASM_TABLE_INDEX_REL_SLEB64 - : wasm::R_WASM_TABLE_INDEX_REL_SLEB; - case MCSymbolRefExpr::VK_WASM_TLSREL: - return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_TLS_SLEB64 - : wasm::R_WASM_MEMORY_ADDR_TLS_SLEB; - case MCSymbolRefExpr::VK_WASM_MBREL: - assert(SymA.isData()); - return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_REL_SLEB64 - : wasm::R_WASM_MEMORY_ADDR_REL_SLEB; - case MCSymbolRefExpr::VK_WASM_TYPEINDEX: - return wasm::R_WASM_TYPE_INDEX_LEB; - case MCSymbolRefExpr::VK_None: - break; - case MCSymbolRefExpr::VK_WASM_FUNCINDEX: - return wasm::R_WASM_FUNCTION_INDEX_I32; - default: - report_fatal_error("unknown VariantKind"); - break; + auto &SymA = cast(*Target.getAddSym()); + auto Spec = Target.getSymSpecifier(); + switch (Spec) { + case MCSymbolRefExpr::VK_GOT: + case MCSymbolRefExpr::VK_WASM_GOT_TLS: + return wasm::R_WASM_GLOBAL_INDEX_LEB; + case MCSymbolRefExpr::VK_WASM_TBREL: + assert(SymA.isFunction()); + return is64Bit() ? wasm::R_WASM_TABLE_INDEX_REL_SLEB64 + : wasm::R_WASM_TABLE_INDEX_REL_SLEB; + case MCSymbolRefExpr::VK_WASM_TLSREL: + return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_TLS_SLEB64 + : wasm::R_WASM_MEMORY_ADDR_TLS_SLEB; + case MCSymbolRefExpr::VK_WASM_MBREL: + assert(SymA.isData()); + return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_REL_SLEB64 + : wasm::R_WASM_MEMORY_ADDR_REL_SLEB; + case MCSymbolRefExpr::VK_WASM_TYPEINDEX: + return wasm::R_WASM_TYPE_INDEX_LEB; + case MCSymbolRefExpr::VK_None: + break; + case MCSymbolRefExpr::VK_WASM_FUNCINDEX: + return wasm::R_WASM_FUNCTION_INDEX_I32; + default: + report_fatal_error("unknown VariantKind"); + break; } switch (unsigned(Fixup.getKind())) { From 0431fea88ac9a057d1c8751da7f9506cf51f54c5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 14:04:00 -0700 Subject: [PATCH 0762/1029] [AMDGPU,MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. 
--- .../AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 50531af627e4a..bf27688e3b221 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -38,11 +38,11 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - if (const auto *SymA = Target.getSymA()) { + if (const auto *SymA = Target.getAddSym()) { // SCRATCH_RSRC_DWORD[01] is a special global variable that represents // the scratch buffer. - if (SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD0" || - SymA->getSymbol().getName() == "SCRATCH_RSRC_DWORD1") + if (SymA->getName() == "SCRATCH_RSRC_DWORD0" || + SymA->getName() == "SCRATCH_RSRC_DWORD1") return ELF::R_AMDGPU_ABS32_LO; } @@ -82,12 +82,12 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, } if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) { - const auto *SymA = Target.getSymA(); + const auto *SymA = Target.getAddSym(); assert(SymA); - if (SymA->getSymbol().isUndefined()) { - Ctx.reportError(Fixup.getLoc(), Twine("undefined label '") + - SymA->getSymbol().getName() + "'"); + if (SymA->isUndefined()) { + Ctx.reportError(Fixup.getLoc(), + Twine("undefined label '") + SymA->getName() + "'"); return ELF::R_AMDGPU_NONE; } return ELF::R_AMDGPU_REL16; From d9a767cfc802aca491f5f25fb386679ab1100621 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 14:23:01 -0700 Subject: [PATCH 0763/1029] [Mips,MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. Create a MipsMCExpr::create overload that takes MCSymbol as an argument. We use the order preferred by other targets. --- llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 11 ++++++----- llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp | 7 ++++++- llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h | 8 +++++--- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 8d9ae1e20006a..53c9174725ab8 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -3009,7 +3009,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, const MCExpr *CallHiExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_HI16, SymExpr, getContext()); const MCExpr *CallLoExpr = MipsMCExpr::create( - MipsMCExpr::MEK_GOT_LO16, Res.getSymA(), getContext()); + Res.getAddSym(), MipsMCExpr::MEK_GOT_LO16, getContext()); TOut.emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(CallHiExpr), IDLoc, STI); @@ -3040,7 +3040,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr, // The daddiu's marked with a '>' may be omitted if they are redundant. If // this happens then the last instruction must use $rd as the result // register. 
- GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT_DISP, Res.getSymA(),
+ GotExpr = MipsMCExpr::create(Res.getAddSym(), MipsMCExpr::MEK_GOT_DISP,
 getContext());
 if (Res.getConstant() != 0) {
 // Symbols fully resolve with just the %got_disp(symbol) but we
@@ -3075,7 +3075,7 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
 // External symbols fully resolve the symbol with just the %got(symbol)
 // but we must still account for any offset to the symbol for
 // expressions like symbol+8.
- GotExpr = MipsMCExpr::create(MipsMCExpr::MEK_GOT, Res.getSymA(),
+ GotExpr = MipsMCExpr::create(Res.getAddSym(), MipsMCExpr::MEK_GOT,
 getContext());
 if (Res.getConstant() != 0)
 LoExpr = MCConstantExpr::create(Res.getConstant(), getContext());
@@ -3771,8 +3771,9 @@ void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
 return;
 }
- loadAndAddSymbolAddress(Res.getSymA(), TmpReg, BaseReg,
- !ABI.ArePtrs64bit(), IDLoc, Out, STI);
+ loadAndAddSymbolAddress(
+ MCSymbolRefExpr::create(Res.getAddSym(), getContext()), TmpReg,
+ BaseReg, !ABI.ArePtrs64bit(), IDLoc, Out, STI);
 emitInstWithOffset(MCOperand::createImm(int16_t(Res.getConstant())));
 } else {
 // FIXME: Implement 64-bit case.
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 39dc329d80222..cecda2c729d62 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -26,7 +26,12 @@ using namespace llvm;
 const MipsMCExpr *MipsMCExpr::create(MipsMCExpr::Specifier S,
 const MCExpr *Expr, MCContext &Ctx) {
- return new (Ctx) MipsMCExpr(S, Expr);
+ return new (Ctx) MipsMCExpr(Expr, S);
+}
+
+const MipsMCExpr *MipsMCExpr::create(const MCSymbol *Sym, Specifier S,
+ MCContext &Ctx) {
+ return new (Ctx) MipsMCExpr(MCSymbolRefExpr::create(Sym, Ctx), S);
 }
 const MipsMCExpr *MipsMCExpr::createGpOff(MipsMCExpr::Specifier S,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 13dad8d8a0472..288ca5c277538 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -47,15 +47,17 @@ class MipsMCExpr : public MCTargetExpr {
 };
 private:
- const Specifier specifier;
 const MCExpr *Expr;
+ const Specifier specifier;
- explicit MipsMCExpr(Specifier S, const MCExpr *Expr)
- : specifier(S), Expr(Expr) {}
+ explicit MipsMCExpr(const MCExpr *Expr, Specifier S)
+ : Expr(Expr), specifier(S) {}
 public:
 static const MipsMCExpr *create(Specifier S, const MCExpr *Expr,
 MCContext &Ctx);
+ static const MipsMCExpr *create(const MCSymbol *Sym, Specifier S,
+ MCContext &Ctx);
 static const MipsMCExpr *createGpOff(Specifier S, const MCExpr *Expr,
 MCContext &Ctx);
From 590d2a3ca5fde93aff1ef1611f1361180adb1813 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 5 Apr 2025 14:25:05 -0700
Subject: [PATCH 0764/1029] [AVR,MC] Replace getSymA()->getSymbol() with
 getAddSym. NFC

We will replace the MCSymbolRefExpr member in MCValue with MCSymbol.
This change reduces dependence on MCSymbolRefExpr.
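A rough sketch of the re-wrapping idiom the AVR change below uses (helper name invented; per the in-tree TODO, attaching specifiers to MCSymbolRefExpr is itself slated for removal):

```cpp
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCValue.h"
using namespace llvm;

// Rebuild a specifier-carrying MCSymbolRefExpr from an evaluated MCValue.
static const MCSymbolRefExpr *rewrap(const MCValue &Value,
                                     MCSymbolRefExpr::VariantKind Spec,
                                     MCContext &Ctx) {
  return MCSymbolRefExpr::create(Value.getAddSym(), Spec, Ctx);
}
```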
--- llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index b6d71f27c1272..c9957c2881111 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -81,16 +81,16 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result, return false; MCContext &Context = Asm->getContext(); - const MCSymbolRefExpr *Sym = Value.getSymA(); - MCSymbolRefExpr::VariantKind Modifier = Sym->getKind(); - if (Modifier != MCSymbolRefExpr::VK_None) + const MCSymbolRefExpr *Sym = nullptr; + auto Spec = AVRMCExpr::VK_None; + if (Value.getSymSpecifier() != MCSymbolRefExpr::VK_None) return false; assert(!Value.getSubSym()); - if (specifier == VK_PM) { - Modifier = MCSymbolRefExpr::VariantKind(AVRMCExpr::VK_PM); - } + if (specifier == VK_PM) + Spec = AVRMCExpr::VK_PM; - Sym = MCSymbolRefExpr::create(&Sym->getSymbol(), Modifier, Context); + // TODO: don't attach specifier to MCSymbolRefExpr. + Sym = MCSymbolRefExpr::create(Value.getAddSym(), Spec, Context); Result = MCValue::get(Sym, nullptr, Value.getConstant()); } From f8f9be32411d63fa65832306cbc9207ef91446af Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 14:41:47 -0700 Subject: [PATCH 0765/1029] [ARM,MC] Replace getSymA()->getSymbol() with getAddSym. NFC We will replace the MCSymbolRefExpr member in MCValue with MCSymbol. This change reduces dependence on MCSymbolRefExpr. --- .../Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 17 ++++++++--------- .../ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 4 ++-- .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 4 +--- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 9cf8584d82337..afe4be54fa843 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -460,9 +460,9 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, // Other relocation types don't want this bit though (branches couldn't encode // it if it *was* present, and no other relocations exist) and it can // interfere with checking valid expressions. - if (const MCSymbolRefExpr *A = Target.getSymA()) { - if (A->hasSubsectionsViaSymbols() && Asm.isThumbFunc(&A->getSymbol()) && - A->getSymbol().isExternal() && + bool IsMachO = Asm.getContext().getObjectFileType() == MCContext::IsMachO; + if (const auto *SA = Target.getAddSym()) { + if (IsMachO && Asm.isThumbFunc(SA) && SA->isExternal() && (Kind == FK_Data_4 || Kind == ARM::fixup_arm_movw_lo16 || Kind == ARM::fixup_arm_movt_hi16 || Kind == ARM::fixup_t2_movw_lo16 || Kind == ARM::fixup_t2_movt_hi16)) @@ -958,8 +958,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, const MCSubtargetInfo *STI) { - const MCSymbolRefExpr *A = Target.getSymA(); - const MCSymbol *Sym = A ? 
&A->getSymbol() : nullptr; + const MCSymbol *Sym = Target.getAddSym(); const unsigned FixupKind = Fixup.getKind(); if (FixupKind == ARM::fixup_arm_thumb_bl) { assert(Sym && "How did we resolve this?"); @@ -989,10 +988,10 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm, // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination // symbol's thumb-ness to get interworking right. - if (A && (FixupKind == ARM::fixup_arm_thumb_blx || - FixupKind == ARM::fixup_arm_blx || - FixupKind == ARM::fixup_arm_uncondbl || - FixupKind == ARM::fixup_arm_condbl)) + if (Sym && (FixupKind == ARM::fixup_arm_thumb_blx || + FixupKind == ARM::fixup_arm_blx || + FixupKind == ARM::fixup_arm_uncondbl || + FixupKind == ARM::fixup_arm_condbl)) return true; return false; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index f9c8f5f820e6f..42838bb83781c 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -122,10 +122,10 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, "invalid fixup for 4-byte pc-relative data relocation"); return ELF::R_ARM_NONE; case ARMMCExpr::VK_None: { - if (const MCSymbolRefExpr *SymRef = Target.getSymA()) { + if (const auto *SA = Target.getAddSym()) { // For GNU AS compatibility expressions such as // _GLOBAL_OFFSET_TABLE_ - label emit a R_ARM_BASE_PREL relocation. - if (SymRef->getSymbol().getName() == "_GLOBAL_OFFSET_TABLE_") + if (SA->getName() == "_GLOBAL_OFFSET_TABLE_") return ELF::R_ARM_BASE_PREL; } return ELF::R_ARM_REL32; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 353e4fa5b2a9d..03ad4149f766b 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -386,9 +386,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, } // Get the symbol data, if any. - const MCSymbol *A = nullptr; - if (Target.getSymA()) - A = Target.getAddSym(); + const MCSymbol *A = Target.getAddSym(); // FIXME: For other platforms, we need to use scattered relocations for // internal relocations with offsets. If this is an internal relocation with From 7833107993e7df63301c655e0b4c49b2f625fa5e Mon Sep 17 00:00:00 2001 From: Alan Date: Sat, 5 Apr 2025 18:21:38 -0400 Subject: [PATCH 0766/1029] [OCaml] Make OCaml MetadataKind type consistent with C API (#134507) Fixes breakage of OCaml API introduced by commit 6894734. --- llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml | 1 + llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli | 1 + llvm/include/llvm-c/DebugInfo.h | 2 +- llvm/test/CMakeLists.txt | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml b/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml index b5c8128c4c090..3e9a82962d99a 100644 --- a/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml +++ b/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.ml @@ -144,6 +144,7 @@ module MetadataKind = struct | DIArgListMetadataKind | DIAssignIDMetadataKind | DISubrangeTypeMetadataKind + | DIFixedPointTypeMetadataKind end (** The amount of debug information to emit. 
*) diff --git a/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli b/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli index 8a36a2b7d81b1..d759b53642755 100644 --- a/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli +++ b/llvm/bindings/ocaml/debuginfo/llvm_debuginfo.mli @@ -148,6 +148,7 @@ module MetadataKind : sig | DIArgListMetadataKind | DIAssignIDMetadataKind | DISubrangeTypeMetadataKind + | DIFixedPointTypeMetadataKind end (** The amount of debug information to emit. *) diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h index 9fbe31d2629bd..11e0b9b4c81e8 100644 --- a/llvm/include/llvm-c/DebugInfo.h +++ b/llvm/include/llvm-c/DebugInfo.h @@ -173,7 +173,6 @@ enum { LLVMDISubrangeMetadataKind, LLVMDIEnumeratorMetadataKind, LLVMDIBasicTypeMetadataKind, - LLVMDIFixedPointTypeMetadataKind, LLVMDIDerivedTypeMetadataKind, LLVMDICompositeTypeMetadataKind, LLVMDISubroutineTypeMetadataKind, @@ -199,6 +198,7 @@ enum { LLVMDIArgListMetadataKind, LLVMDIAssignIDMetadataKind, LLVMDISubrangeTypeMetadataKind, + LLVMDIFixedPointTypeMetadataKind, }; typedef unsigned LLVMMetadataKind; diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index d984193875fa2..a67e2b85d9b53 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -212,6 +212,7 @@ if(TARGET ocaml_llvm) ocaml_llvm_analysis ocaml_llvm_bitreader ocaml_llvm_bitwriter + ocaml_llvm_debuginfo ocaml_llvm_executionengine ocaml_llvm_irreader ocaml_llvm_linker From 55ff96abfa08ec94b0f8f4ebe187a3232e9d92b7 Mon Sep 17 00:00:00 2001 From: weiwei chen Date: Sat, 5 Apr 2025 19:41:52 -0400 Subject: [PATCH 0767/1029] [X86][Test] Reorder PassMgrF and OS. (#134481) Reordering `OS` and `PassMgrF` should fix the asan failure that's caused by OS being destroyed before `PassMgrF` deletes the AsmPrinter. As shown in[ this asan run ](https://lab.llvm.org/buildbot/#/builders/52/builds/7340/steps/12/logs/stdio) ``` This frame has 15 object(s): [32, 48) 'PassMgrF' (line 154) [64, 1112) 'Buf' (line 155) [1248, 1304) 'OS' (line 156) <== Memory access at offset 1280 is inside this variable ``` which indicates an ordering problem. This should help to fix all the sanitizer failures caused by the test `X86MCInstLowerTest.cpp` that's introduced by [this PR](https://github.com/llvm/llvm-project/pull/133352#issuecomment-2780173791). 
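The underlying C++ rule can be reproduced outside LLVM. A self-contained sketch (all names invented, not LLVM code): locals are destroyed in reverse declaration order, so an object that touches another object during destruction must be declared after it.

```cpp
#include <cstdio>

struct Buffer {
  char Data[16] = {};
  ~Buffer() { std::puts("Buffer destroyed"); }
};

struct Writer {
  Buffer &B;
  explicit Writer(Buffer &Buf) : B(Buf) {}
  // Touches B while being destroyed, so B must outlive the Writer.
  ~Writer() {
    B.Data[0] = 'x';
    std::puts("Writer flushed into Buffer");
  }
};

int main() {
  Buffer Buf;    // declared first, destroyed last
  Writer W(Buf); // destroyed first, while Buf is still alive
  return 0;
}
```

Declaring `PassMgrF` after `OS` gives the test the same LIFO guarantee: the pass manager (and the AsmPrinter it owns) is torn down while the output stream is still valid.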
--- llvm/unittests/CodeGen/X86MCInstLowerTest.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/unittests/CodeGen/X86MCInstLowerTest.cpp b/llvm/unittests/CodeGen/X86MCInstLowerTest.cpp index f5a59b16b4487..72cc79a8dfd8c 100644 --- a/llvm/unittests/CodeGen/X86MCInstLowerTest.cpp +++ b/llvm/unittests/CodeGen/X86MCInstLowerTest.cpp @@ -151,9 +151,10 @@ TEST_F(X86MCInstLowerTest, moExternalSymbol_MCSYMBOL) { MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(TM.get(), &*MCFoo); - legacy::PassManager PassMgrF; SmallString<1024> Buf; llvm::raw_svector_ostream OS(Buf); + legacy::PassManager PassMgrF; + AsmPrinter *Printer = addPassesToEmitFile(PassMgrF, OS, CodeGenFileType::AssemblyFile, MMIWP); PassMgrF.run(*M); From aaaeb86acea77c5bcdb60011ce6aaaf4ebca4081 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 5 Apr 2025 17:03:55 -0700 Subject: [PATCH 0768/1029] [clang-format] Merge inline short functions for BS_Whitesmiths (#134473) Fix #126747 --- clang/lib/Format/UnwrappedLineFormatter.cpp | 9 +++++++-- clang/unittests/Format/FormatTest.cpp | 7 +++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 000a5105ca407..62759d7945f7b 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -316,8 +316,13 @@ class LineJoiner { const AnnotatedLine *Line = nullptr; for (auto J = I - 1; J >= AnnotatedLines.begin(); --J) { assert(*J); - if (!(*J)->InPPDirective && !(*J)->isComment() && - (*J)->Level < TheLine->Level) { + if ((*J)->InPPDirective || (*J)->isComment() || + (*J)->Level > TheLine->Level) { + continue; + } + if ((*J)->Level < TheLine->Level || + (Style.BreakBeforeBraces == FormatStyle::BS_Whitesmiths && + (*J)->First->is(tok::l_brace))) { Line = *J; break; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 4dfa135120605..69c9ee1d1dcb2 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -15142,6 +15142,13 @@ TEST_F(FormatTest, PullInlineOnlyFunctionDefinitionsIntoSingleLine) { "}", MergeInlineOnly); + MergeInlineOnly.BreakBeforeBraces = FormatStyle::BS_Whitesmiths; + verifyFormat("class Foo\n" + " {\n" + " void f() { foo(); }\n" + " };", + MergeInlineOnly); + // Also verify behavior when BraceWrapping.AfterFunction = true MergeInlineOnly.BreakBeforeBraces = FormatStyle::BS_Custom; MergeInlineOnly.BraceWrapping.AfterFunction = true; From 94821ce45fe93aa78cc5ea03cd9deac91b7af127 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 17:16:54 -0700 Subject: [PATCH 0769/1029] MCValue: Store SymA specifier at Specifier The relocation specifier should be accessed via MCValue::Specifier. However, some targets encode the relocation specifier within SymA using MCSymbolRefExpr::SubclassData and access it via getAccessVariant(), though this method is now deprecated. This change stores the SymA specifier at Specifier as well, unifying the two code paths. * CSKY: GOT- and PLT- relocations now suppress the STT_SECTION conversion. * AArch64: https://reviews.llvm.org/D156505 added `getRefkind` check to prevent folding. This is a hack and is now removed. 
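In sketch form (the helper is invented), the intended end state is that a single accessor observes the specifier regardless of where a target originally stored it:

```cpp
#include "llvm/MC/MCValue.h"
using namespace llvm;

// MCValue::get() now copies a SymA-encoded specifier into
// MCValue::Specifier, so reading it no longer depends on whether the
// target used MCValue::Specifier or MCSymbolRefExpr::SubclassData.
static uint32_t relocSpecifier(const MCValue &Target) {
  return Target.getSpecifier();
}
```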
---
 llvm/include/llvm/MC/MCExpr.h | 3 ++-
 llvm/include/llvm/MC/MCValue.h | 21 +++++++++++++------
 llvm/lib/MC/MCAssembler.cpp | 2 +-
 llvm/lib/MC/MCExpr.cpp | 19 ++++++++++-------
 llvm/lib/MC/MCValue.cpp | 6 ------
 .../MCTargetDesc/AArch64AsmBackend.cpp | 7 ++++++-
 .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 6 ------
 .../MCTargetDesc/PPCELFObjectWriter.cpp | 15 ++++---------
 .../MCTargetDesc/RISCVELFObjectWriter.cpp | 3 ---
 llvm/test/MC/AArch64/elf-reloc-ptrauth.s | 19 +++++++----------
 llvm/test/MC/CSKY/relocation-specifier.s | 4 ++--
 11 files changed, 49 insertions(+), 56 deletions(-)

diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index b0e347d690f0e..0d7a961c364db 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -135,7 +135,8 @@ class MCExpr {
 static bool evaluateSymbolicAdd(const MCAssembler *, const SectionAddrMap *,
 bool, const MCValue &,
 const MCSymbolRefExpr *,
- const MCSymbolRefExpr *, int64_t, MCValue &);
+ const MCSymbolRefExpr *, int64_t, uint32_t,
+ MCValue &);
 };
 inline raw_ostream &operator<<(raw_ostream &OS, const MCExpr &E) {
diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h
index 9ef77703356a4..fbe32ae5d59fe 100644
--- a/llvm/include/llvm/MC/MCValue.h
+++ b/llvm/include/llvm/MC/MCValue.h
@@ -36,6 +36,10 @@ class MCValue {
 int64_t Cst = 0;
 uint32_t Specifier = 0;
+ // SymA has a relocation specifier. This is a workaround for targets
+ // that encode specifiers within MCSymbolRefExpr::SubclassData.
+ bool SymSpecifier = false;
+
 // SymB cannot have a specifier. Use getSubSym instead.
 const MCSymbolRefExpr *getSymB() const { return SymB; }
@@ -67,18 +71,23 @@ class MCValue {
 // Get the relocation specifier from SymA. This is a workaround for targets
 // that do not use MCValue::Specifier.
- uint16_t getSymSpecifier() const { return SymA->getSpecifier(); }
- // Get the relocation specifier from SymA, or 0 when SymA is null.
- uint16_t getAccessVariant() const; + uint16_t getSymSpecifier() const { return Specifier; } + uint16_t getAccessVariant() const { return Specifier; } static MCValue get(const MCSymbolRefExpr *SymA, - const MCSymbolRefExpr *SymB = nullptr, - int64_t Val = 0, uint32_t RefKind = 0) { + const MCSymbolRefExpr *SymB = nullptr, int64_t Val = 0, + uint32_t Specifier = 0) { MCValue R; R.Cst = Val; R.SymA = SymA; R.SymB = SymB; - R.Specifier = RefKind; + R.Specifier = Specifier; + assert(!(Specifier && SymA && SymA->getSpecifier()) && + "Specifier cannot be used with legacy SymSpecifier"); + if (!Specifier && SymA && SymA->getSpecifier()) { + R.Specifier = SymA->getSpecifier(); + R.SymSpecifier = true; + } return R; } diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 5e3081251e40a..5739097cc2620 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -168,7 +168,7 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, IsResolved = false; } else { auto &SA = *Target.getAddSym(); - if (Target.getSymSpecifier() || SA.isUndefined()) { + if (Target.SymSpecifier || SA.isUndefined()) { IsResolved = false; } else { IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) || diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index b921b55950772..e3d4c5c27fe69 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -439,7 +439,7 @@ bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, const MCValue &LHS, const MCSymbolRefExpr *RhsAdd, const MCSymbolRefExpr *RhsSub, int64_t RHS_Cst, - MCValue &Res) { + uint32_t RhsSpec, MCValue &Res) { const MCSymbol *LHS_A = LHS.getAddSym(); const MCSymbol *LHS_B = LHS.getSubSym(); int64_t LHS_Cst = LHS.getConstant(); @@ -451,16 +451,16 @@ bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, int64_t Result_Cst = LHS_Cst + RHS_Cst; // If we have a layout, we can fold resolved differences. - if (Asm) { + if (Asm && !LHS.getSpecifier() && !RhsSpec) { // While LHS_A-LHS_B and RHS_A-RHS_B from recursive calls have already been // folded, reassociating terms in // Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst). // might bring more opportunities. - if (LHS_A && RHS_B && !LHS.getSymA()->getSpecifier()) { + if (LHS_A && RHS_B) { attemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, LHS_A, RHS_B, Result_Cst); } - if (RHS_A && LHS_B && !RhsAdd->getSpecifier()) { + if (RHS_A && LHS_B) { attemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, RHS_A, LHS_B, Result_Cst); } @@ -476,7 +476,12 @@ bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, auto *B = LHS_B ? LHS.getSymB() : RHS_B ? RhsSub : nullptr; if (B && B->getKind() != MCSymbolRefExpr::VK_None) return false; + auto Spec = LHS.getSpecifier(); + if (!Spec) + Spec = RhsSpec; Res = MCValue::get(A, B, Result_Cst); + Res.Specifier = Spec; + Res.SymSpecifier = 0; return true; } @@ -634,9 +639,6 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, return false; case MCBinaryExpr::Add: case MCBinaryExpr::Sub: - // TODO: Prevent folding for AArch64 @AUTH operands. 
- if (LHSValue.getSpecifier() || RHSValue.getSpecifier()) - return false; if (Op == MCBinaryExpr::Sub) { std::swap(RHSValue.SymA, RHSValue.SymB); RHSValue.Cst = -(uint64_t)RHSValue.Cst; @@ -652,7 +654,8 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, return true; } return evaluateSymbolicAdd(Asm, Addrs, InSet, LHSValue, RHSValue.SymA, - RHSValue.SymB, RHSValue.Cst, Res); + RHSValue.SymB, RHSValue.Cst, + RHSValue.SymSpecifier, Res); } } diff --git a/llvm/lib/MC/MCValue.cpp b/llvm/lib/MC/MCValue.cpp index 913fe83ab94eb..77deb0b4ab671 100644 --- a/llvm/lib/MC/MCValue.cpp +++ b/llvm/lib/MC/MCValue.cpp @@ -43,9 +43,3 @@ LLVM_DUMP_METHOD void MCValue::dump() const { print(dbgs()); } #endif - -uint16_t MCValue::getAccessVariant() const { - if (!SymA) - return 0; - return SymA->getSpecifier(); -} diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index c5accb5e3b51b..3335d9d6f009c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -426,8 +426,13 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, AArch64MCExpr::Specifier SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); if (SymLoc == AArch64AuthMCExpr::VK_AUTH || SymLoc == AArch64AuthMCExpr::VK_AUTHADDR) { + const auto *Expr = dyn_cast(Fixup.getValue()); + if (!Expr) { + Asm.getContext().reportError(Fixup.getValue()->getLoc(), + "expected relocatable expression"); + return; + } assert(Value == 0); - const auto *Expr = cast(Fixup.getValue()); Value = (uint64_t(Expr->getDiscriminator()) << 32) | (uint64_t(Expr->getKey()) << 60) | (uint64_t(Expr->hasAddressDiversity()) << 63); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index b03c55cafdcdf..c3c4c64cad5b0 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -116,12 +116,6 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, AArch64MCExpr::Specifier SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); bool IsNC = AArch64MCExpr::isNotChecked(RefKind); - assert((!Target.getAddSym() || - Target.getSymSpecifier() == AArch64MCExpr::None || - Target.getSymSpecifier() == AArch64MCExpr::VK_PLT || - Target.getSymSpecifier() == AArch64MCExpr::VK_GOTPCREL) && - "Should only be expression-level modifiers here"); - switch (SymLoc) { case AArch64MCExpr::VK_DTPREL: case AArch64MCExpr::VK_GOTTPREL: diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 845c590478e5c..e33961a973d1e 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -196,9 +196,9 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_ break; case PPC::fixup_ppc_half16: - switch (RefKind) { + switch (Modifier) { default: - break; + llvm_unreachable("Unsupported specifier"); case PPCMCExpr::VK_LO: return ELF::R_PPC_ADDR16_LO; case PPCMCExpr::VK_HI: @@ -217,9 +217,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, return ELF::R_PPC64_ADDR16_HIGHEST; case PPCMCExpr::VK_HIGHESTA: return ELF::R_PPC64_ADDR16_HIGHESTA; - } - switch (Modifier) { - 
default: llvm_unreachable("Unsupported Modifier"); + case PPCMCExpr::VK_None: Type = ELF::R_PPC_ADDR16; break; @@ -373,17 +371,12 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, break; case PPC::fixup_ppc_half16ds: case PPC::fixup_ppc_half16dq: - switch (RefKind) { + switch (Modifier) { default: Ctx.reportError(Fixup.getLoc(), "invalid VariantKind"); return ELF::R_PPC64_NONE; - case PPCMCExpr::VK_None: - break; case PPCMCExpr::VK_LO: return ELF::R_PPC64_ADDR16_LO_DS; - } - switch (Modifier) { - default: llvm_unreachable("Unsupported Modifier"); case PPCMCExpr::VK_None: Type = ELF::R_PPC64_ADDR16_DS; break; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 586440f407d71..1662b8068084c 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -50,9 +50,6 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - assert((!Target.getAddSym() || - Target.getSymSpecifier() == MCSymbolRefExpr::VK_None) && - "sym@specifier should have been rejected"); const MCExpr *Expr = Fixup.getValue(); // Determine the type of the relocation unsigned Kind = Fixup.getTargetKind(); diff --git a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s index 9fe78a4e4e822..6f47e2d614d19 100644 --- a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s +++ b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s @@ -164,15 +164,6 @@ _g9: // RUN: not llvm-mc -triple=aarch64 -filetype=obj --defsym=ERROBJ=1 %s -o /dev/null 2>&1 | \ // RUN: FileCheck %s --check-prefix=ERROBJ -// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression -.quad sym@AUTH(ia,42) + 1 - -// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression -.quad 1 + sym@AUTH(ia,42) - -// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression -.quad 1 + sym@AUTH(ia,42) + 1 - // ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression .quad sym@AUTH(ia,42) + sym@AUTH(ia,42) @@ -181,11 +172,17 @@ _g9: // distance remains the same. Leave it in such state as for now since it // makes code simpler: subtraction of a non-AUTH symbol and of a constant // are handled identically. 
-// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression
+// ERROBJ: :[[#@LINE+1]]:7: error: Cannot represent a difference across sections
 .quad _g9@AUTH(ia,42) - _g8
-// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression
+// ERROBJ: :[[#@LINE+1]]:7: error: Cannot represent a difference across sections
 .quad _g9@AUTH(ia,42) - _g8@AUTH(ia,42)
 .quad 0
+// ERROBJ: :[[#@LINE+1]]:23: error: expected relocatable expression
+.quad sym@AUTH(ia,42) + 1
+
+// ERROBJ: :[[#@LINE+1]]:9: error: expected relocatable expression
+.quad 1 + sym@AUTH(ia,42)
+
 .endif // ERROBJ
diff --git a/llvm/test/MC/CSKY/relocation-specifier.s b/llvm/test/MC/CSKY/relocation-specifier.s
index 759d6df5e545c..e305743ddbf39 100644
--- a/llvm/test/MC/CSKY/relocation-specifier.s
+++ b/llvm/test/MC/CSKY/relocation-specifier.s
@@ -2,8 +2,8 @@
 # RUN: llvm-readelf -rs %t | FileCheck %s --check-prefix=READELF
 
 # READELF: '.rela.data'
-# READELF: R_CKCORE_GOT32 00000000 .data + 0
-# READELF: R_CKCORE_PLT32 00000000 .data + 0
+# READELF: R_CKCORE_GOT32 00000000 local + 0
+# READELF: R_CKCORE_PLT32 00000000 local + 0
 
 # READELF: TLS GLOBAL DEFAULT UND gd
 # READELF: TLS GLOBAL DEFAULT UND ld
From 38c3ad36be1facbe6db2dede7e93c0f12fb4e1dc Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 5 Apr 2025 19:01:16 -0700
Subject: [PATCH 0770/1029] Define MCAsmBackend::shouldForceRelocation

Return true if the MCValue has a specifier. When a relocation specifier
is specified, GNU Assembler will generate a relocation unless the
specifier can be optimized due to target-specific reasons (e.g. PPC `@l`
`@ha`).

This reduces targets' reliance on a MCAssembler::evaluateFixup hack
`if (Target.SymSpecifier || SA.isUndefined()) {`, previously
`if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) {`

llvm/test/MC/SystemZ/fixups.s is known to rely on this hack.
---
 llvm/include/llvm/MC/MCAsmBackend.h | 11 ++++-------
 llvm/lib/MC/MCAsmBackend.cpp | 6 ++++++
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/MC/MCAsmBackend.h b/llvm/include/llvm/MC/MCAsmBackend.h
index 69ba400fac879..5953de30c2eb2 100644
--- a/llvm/include/llvm/MC/MCAsmBackend.h
+++ b/llvm/include/llvm/MC/MCAsmBackend.h
@@ -89,13 +89,10 @@ class MCAsmBackend {
 /// Get information on a fixup kind.
 virtual const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const;
- /// Hook to check if a relocation is needed for some target specific reason.
- virtual bool shouldForceRelocation(const MCAssembler &Asm,
- const MCFixup &Fixup,
- const MCValue &Target,
- const MCSubtargetInfo *STI) {
- return false;
- }
+ // Hook to check if a relocation is needed. The default implementation tests
+ // whether the MCValue has a relocation specifier.
+ virtual bool shouldForceRelocation(const MCAssembler &, const MCFixup &,
+ const MCValue &, const MCSubtargetInfo *);
 /// Hook to check if extra nop bytes must be inserted for alignment directive.
/// For some targets this may be necessary in order to support linker
diff --git a/llvm/lib/MC/MCAsmBackend.cpp b/llvm/lib/MC/MCAsmBackend.cpp
index b5a0766988e14..23cc134f65b52 100644
--- a/llvm/lib/MC/MCAsmBackend.cpp
+++ b/llvm/lib/MC/MCAsmBackend.cpp
@@ -109,6 +109,12 @@ const MCFixupKindInfo &MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
 return Builtins[Kind];
 }
 
+bool MCAsmBackend::shouldForceRelocation(const MCAssembler &, const MCFixup &,
+ const MCValue &Target,
+ const MCSubtargetInfo *) {
+ return Target.getSpecifier();
+}
+
 bool MCAsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm,
 const MCFixup &Fixup,
 bool Resolved, uint64_t Value,
From 4182d2dcb5ecbfc34d41a6cd11810cd36844eddb Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 5 Apr 2025 20:04:07 -0700
Subject: [PATCH 0771/1029] [ARM,PowerPC] shouldForceRelocation: check
 MCValue::Specifier

Follow-up to 38c3ad36be1facbe6db2dede7e93c0f12fb4e1dc

Removes reliance on a MCAssembler::evaluateFixup hack
`if (Target.SymSpecifier || SA.isUndefined()) {` (previously
`if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) {`)

Target.SymSpecifier will soon go away when MCValue replaces
MCSymbolRefExpr members with MCSymbol and removes the temporary
`SymSpecifier` workaround.
---
 llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 2 +-
 llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index afe4be54fa843..bed15bdc274ba 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -993,7 +993,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
 FixupKind == ARM::fixup_arm_uncondbl ||
 FixupKind == ARM::fixup_arm_condbl))
 return true;
- return false;
+ return Target.getSpecifier();
 }
 
 /// getFixupKindNumBytes - The number of bytes the fixup may change.
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5d6999dbcf1b6..34a1424f08486 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -159,6 +159,10 @@ class PPCAsmBackend : public MCAsmBackend {
 bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
 const MCValue &Target,
 const MCSubtargetInfo *STI) override {
+ // If there is a @ specifier, unless it is optimized out (e.g. constant @l),
+ // force a relocation.
+ if (Target.getSpecifier())
+ return true;
 MCFixupKind Kind = Fixup.getKind();
 switch ((unsigned)Kind) {
 default:
From d5893fc2a7e1191afdb4940469ec9371a319b114 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 5 Apr 2025 21:02:08 -0700
Subject: [PATCH 0772/1029] MCValue: Replace MCSymbolRefExpr members with
 MCSymbol

Commit 0999cbd0b9ed8aa893cce10d681dec6d54b200ad (2014) introduced
`MCValue::RefKind` for AArch64 ELF as a clean approach to encode the
relocation specifier. Following numerous migration commits, direct
references to getSymA and getSymB have been eliminated. This allows us
to seamlessly update SymA and SymB, replacing MCSymbolRefExpr with
MCSymbol.
Removing reliance on a MCAssembler::evaluateFixup hack
(`if (Target.SymSpecifier || SA.isUndefined()) {`, previously
`if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) {`)
requires 38c3ad36be1facbe6db2dede7e93c0f12fb4e1dc and
4182d2dcb5ecbfc34d41a6cd11810cd36844eddb.

Revert the temporary RISCV/LoongArch workaround
(7e62715e0cd433ed97749549c6582c4e1aa689a3) during migration.

MCAssembler::evaluateFixup needs an extra `!Add->isAbsolute()` case to
support `movq abs@GOTPCREL(%rip), %rax; abs = 42` in
llvm/test/MC/ELF/relocation-alias.s
(ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl asserts if
called on an absolute symbol).
---
 llvm/include/llvm/MC/MCExpr.h | 7 +--
 llvm/include/llvm/MC/MCValue.h | 34 +++++-----------
 llvm/lib/MC/MCAssembler.cpp | 37 +++++----------
 llvm/lib/MC/MCExpr.cpp | 47 +++++++++----------
 .../lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 6 +--
 .../MCTargetDesc/LoongArchAsmBackend.cpp | 9 ++--
 .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 7 +--
 llvm/test/MC/AArch64/elf-reloc-ptrauth.s | 2 +-
 8 files changed, 52 insertions(+), 97 deletions(-)

diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 0d7a961c364db..6311a134cf5a7 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -133,9 +133,7 @@ class MCExpr {
 /// @}
 static bool evaluateSymbolicAdd(const MCAssembler *, const SectionAddrMap *,
- bool, const MCValue &,
- const MCSymbolRefExpr *,
- const MCSymbolRefExpr *, int64_t, uint32_t,
+ bool, const MCValue &, const MCValue &,
 MCValue &);
 };
 inline raw_ostream &operator<<(raw_ostream &OS, const MCExpr &E) {
@@ -263,6 +261,9 @@ class MCSymbolRefExpr : public MCExpr {
 const MCSymbol &getSymbol() const { return *Symbol; }
+ // Some targets encode the relocation specifier within SymA using
+ // MCSymbolRefExpr::SubclassData and access it via getAccessVariant(), though
+ // this method is now deprecated.
 VariantKind getKind() const {
 return (VariantKind)(getSubclassData() & VariantKindMask);
 }
diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h
index fbe32ae5d59fe..74f68e33e8763 100644
--- a/llvm/include/llvm/MC/MCValue.h
+++ b/llvm/include/llvm/MC/MCValue.h
@@ -25,40 +25,24 @@ class raw_ostream;
 // Not all targets support SymB. For PC-relative relocations, a specifier is
 // typically used instead of setting SymB to DOT.
 //
-// Some targets encode the relocation specifier within SymA using
-// MCSymbolRefExpr::SubclassData and access it via getAccessVariant(), though
-// this method is now deprecated.
-//
 // This class must remain a simple POD value class, as it needs to reside in
 // unions and similar structures.
 class MCValue {
- const MCSymbolRefExpr *SymA = nullptr, *SymB = nullptr;
+ const MCSymbol *SymA = nullptr, *SymB = nullptr;
 int64_t Cst = 0;
 uint32_t Specifier = 0;
- // SymA has a relocation specifier. This is a workaround for targets
- // that encode specifiers within MCSymbolRefExpr::SubclassData.
- bool SymSpecifier = false;
-
- // SymB cannot have a specifier. Use getSubSym instead.
- const MCSymbolRefExpr *getSymB() const { return SymB; }
-
 public:
 friend class MCAssembler;
 friend class MCExpr;
 MCValue() = default;
 int64_t getConstant() const { return Cst; }
- const MCSymbolRefExpr *getSymA() const { return SymA; }
 uint32_t getRefKind() const { return Specifier; }
 uint32_t getSpecifier() const { return Specifier; }
 void setSpecifier(uint32_t S) { Specifier = S; }
- const MCSymbol *getAddSym() const {
- return SymA ? &SymA->getSymbol() : nullptr;
- }
- const MCSymbol *getSubSym() const {
- return SymB ?
&SymB->getSymbol() : nullptr; - } + const MCSymbol *getAddSym() const { return SymA; } + const MCSymbol *getSubSym() const { return SymB; } /// Is this an absolute (as opposed to relocatable) value. bool isAbsolute() const { return !SymA && !SymB; } @@ -72,22 +56,16 @@ class MCValue { // Get the relocation specifier from SymA. This is a workaround for targets // that do not use MCValue::Specifier. uint16_t getSymSpecifier() const { return Specifier; } + // Get the relocation specifier from SymA, or 0 when SymA is null. uint16_t getAccessVariant() const { return Specifier; } - static MCValue get(const MCSymbolRefExpr *SymA, - const MCSymbolRefExpr *SymB = nullptr, int64_t Val = 0, - uint32_t Specifier = 0) { + static MCValue get(const MCSymbol *SymA, const MCSymbol *SymB = nullptr, + int64_t Val = 0, uint32_t Specifier = 0) { MCValue R; R.Cst = Val; R.SymA = SymA; R.SymB = SymB; R.Specifier = Specifier; - assert(!(Specifier && SymA && SymA->getSpecifier()) && - "Specifier cannot be used with legacy SymSpecifier"); - if (!Specifier && SymA && SymA->getSpecifier()) { - R.Specifier = SymA->getSpecifier(); - R.SymSpecifier = true; - } return R; } diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 5739097cc2620..0bfb32115fe66 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -161,34 +161,23 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, return getBackend().evaluateTargetFixup(*this, Fixup, DF, Target, STI, Value, WasForced); + const MCSymbol *Add = Target.getAddSym(); + const MCSymbol *Sub = Target.getSubSym(); bool IsPCRel = FixupFlags & MCFixupKindInfo::FKF_IsPCRel; bool IsResolved = false; - if (IsPCRel) { - if (Target.getSubSym() || !Target.getAddSym()) { - IsResolved = false; - } else { - auto &SA = *Target.getAddSym(); - if (Target.SymSpecifier || SA.isUndefined()) { - IsResolved = false; - } else { - IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) || - getWriter().isSymbolRefDifferenceFullyResolvedImpl( - *this, SA, *DF, false, true); - } - } - } else { + if (!IsPCRel) { IsResolved = Target.isAbsolute(); + } else if (Add && !Sub && !Add->isUndefined() && !Add->isAbsolute()) { + IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) || + getWriter().isSymbolRefDifferenceFullyResolvedImpl( + *this, *Add, *DF, false, true); } Value = Target.getConstant(); - - if (const MCSymbol *Add = Target.getAddSym()) { - if (Add->isDefined()) - Value += getSymbolOffset(*Add); - } - if (const MCSymbol *Sub = Target.getSubSym()) - if (Sub->isDefined()) - Value -= getSymbolOffset(*Sub); + if (Add && Add->isDefined()) + Value += getSymbolOffset(*Add); + if (Sub && Sub->isDefined()) + Value -= getSymbolOffset(*Sub); bool ShouldAlignPC = FixupFlags & MCFixupKindInfo::FKF_IsAlignedDownTo32Bits; assert((ShouldAlignPC ? IsPCRel : true) && @@ -208,8 +197,8 @@ bool MCAssembler::evaluateFixup(const MCFixup &Fixup, const MCFragment *DF, // kind. AVR needs the fixup value to bypass the assembly time overflow with a // relocation. 
if (IsResolved) { - auto TargetVal = MCValue::get(Target.getSymA(), Target.getSymB(), Value, - Target.getRefKind()); + auto TargetVal = Target; + TargetVal.Cst = Value; if (Fixup.getKind() >= FirstLiteralRelocationKind || getBackend().shouldForceRelocation(*this, Fixup, TargetVal, STI)) { IsResolved = false; diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index e3d4c5c27fe69..c856ef5f97203 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -436,22 +436,21 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // early. bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, const SectionAddrMap *Addrs, bool InSet, - const MCValue &LHS, - const MCSymbolRefExpr *RhsAdd, - const MCSymbolRefExpr *RhsSub, int64_t RHS_Cst, - uint32_t RhsSpec, MCValue &Res) { + const MCValue &LHS, const MCValue &RHS, + MCValue &Res) { const MCSymbol *LHS_A = LHS.getAddSym(); const MCSymbol *LHS_B = LHS.getSubSym(); int64_t LHS_Cst = LHS.getConstant(); - const MCSymbol *RHS_A = RhsAdd ? &RhsAdd->getSymbol() : nullptr; - const MCSymbol *RHS_B = RhsSub ? &RhsSub->getSymbol() : nullptr; + const MCSymbol *RHS_A = RHS.getAddSym(); + const MCSymbol *RHS_B = RHS.getSubSym(); + int64_t RHS_Cst = RHS.getConstant(); // Fold the result constant immediately. int64_t Result_Cst = LHS_Cst + RHS_Cst; // If we have a layout, we can fold resolved differences. - if (Asm && !LHS.getSpecifier() && !RhsSpec) { + if (Asm && !LHS.getSpecifier() && !RHS.getSpecifier()) { // While LHS_A-LHS_B and RHS_A-RHS_B from recursive calls have already been // folded, reassociating terms in // Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst). @@ -472,16 +471,12 @@ bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, // At this point, we have at most one additive symbol and one subtractive // symbol -- find them. - auto *A = LHS_A ? LHS.getSymA() : RHS_A ? RhsAdd : nullptr; - auto *B = LHS_B ? LHS.getSymB() : RHS_B ? RhsSub : nullptr; - if (B && B->getKind() != MCSymbolRefExpr::VK_None) - return false; + auto *A = LHS_A ? LHS_A : RHS_A; + auto *B = LHS_B ? LHS_B : RHS_B; auto Spec = LHS.getSpecifier(); if (!Spec) - Spec = RhsSpec; - Res = MCValue::get(A, B, Result_Cst); - Res.Specifier = Spec; - Res.SymSpecifier = 0; + Spec = RHS.getSpecifier(); + Res = MCValue::get(A, B, Result_Cst, Spec); return true; } @@ -532,18 +527,16 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, InSet || IsMachO)) { if (Kind != MCSymbolRefExpr::VK_None) { if (Res.isAbsolute()) { - Res = MCValue::get(SRE, nullptr, 0); + Res = MCValue::get(&Sym, nullptr, 0, Kind); return true; } // If the reference has a variant kind, we can only handle expressions // which evaluate exactly to a single unadorned symbol. Attach the // original VariantKind to SymA of the result. 
- if (Res.getRefKind() != MCSymbolRefExpr::VK_None || !Res.getSymA() || - Res.getSubSym() || Res.getConstant()) + if (Res.getRefKind() != MCSymbolRefExpr::VK_None || + !Res.getAddSym() || Res.getSubSym() || Res.getConstant()) return false; - Res = MCValue::get( - MCSymbolRefExpr::create(Res.getAddSym(), Kind, Asm->getContext()), - Res.getSymB(), Res.getConstant(), Res.getRefKind()); + Res.Specifier = Kind; } if (!IsMachO) return true; @@ -565,7 +558,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, } } - Res = MCValue::get(SRE, nullptr, 0); + Res = MCValue::get(&Sym, nullptr, 0, Kind); return true; } @@ -587,7 +580,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, return false; // The cast avoids undefined behavior if the constant is INT64_MIN. - Res = MCValue::get(Value.getSymB(), Value.getSymA(), + Res = MCValue::get(Value.getSubSym(), Value.getAddSym(), -(uint64_t)Value.getConstant()); break; case MCUnaryExpr::Not: @@ -653,9 +646,11 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, Res = RHSValue; return true; } - return evaluateSymbolicAdd(Asm, Addrs, InSet, LHSValue, RHSValue.SymA, - RHSValue.SymB, RHSValue.Cst, - RHSValue.SymSpecifier, Res); + if (LHSValue.SymB && LHSValue.Specifier) + return false; + if (RHSValue.SymB && RHSValue.Specifier) + return false; + return evaluateSymbolicAdd(Asm, Addrs, InSet, LHSValue, RHSValue, Res); } } diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index c9957c2881111..6e0415fa91264 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -80,8 +80,6 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result, if (!Asm || !Asm->hasLayout()) return false; - MCContext &Context = Asm->getContext(); - const MCSymbolRefExpr *Sym = nullptr; auto Spec = AVRMCExpr::VK_None; if (Value.getSymSpecifier() != MCSymbolRefExpr::VK_None) return false; @@ -90,8 +88,8 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result, Spec = AVRMCExpr::VK_PM; // TODO: don't attach specifier to MCSymbolRefExpr. 
- Sym = MCSymbolRefExpr::create(Value.getAddSym(), Spec, Context); - Result = MCValue::get(Sym, nullptr, Value.getConstant()); + Result = + MCValue::get(Value.getAddSym(), nullptr, Value.getConstant(), Spec); } return true; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index c39482094db12..e74c8af2a850c 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -236,7 +236,7 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm, MCSym = MCSymbolRefExpr::create(Sym, Ctx); getSecToAlignSym()[Sec] = MCSym; } - return MCValue::get(MCSym, nullptr, + return MCValue::get(&MCSym->getSymbol(), nullptr, MaxBytesToEmit << 8 | Log2(AF.getAlignment())); }; @@ -497,11 +497,8 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, default: llvm_unreachable("unsupported fixup size"); } - MCValue A = MCValue::get( - MCSymbolRefExpr::create(Target.getAddSym(), Asm.getContext()), nullptr, - Target.getConstant()); - MCValue B = MCValue::get( - MCSymbolRefExpr::create(Target.getSubSym(), Asm.getContext())); + MCValue A = MCValue::get(Target.getAddSym(), nullptr, Target.getConstant()); + MCValue B = MCValue::get(Target.getSubSym()); auto FA = MCFixup::create(Fixup.getOffset(), nullptr, std::get<0>(FK)); auto FB = MCFixup::create(Fixup.getOffset(), nullptr, std::get<1>(FK)); auto &Assembler = const_cast(Asm); diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index d245b15a39210..652dd6586492d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -656,11 +656,8 @@ bool RISCVAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, default: llvm_unreachable("unsupported fixup size"); } - MCValue A = MCValue::get( - MCSymbolRefExpr::create(Target.getAddSym(), Asm.getContext()), nullptr, - Target.getConstant()); - MCValue B = MCValue::get( - MCSymbolRefExpr::create(Target.getSubSym(), Asm.getContext())); + MCValue A = MCValue::get(Target.getAddSym(), nullptr, Target.getConstant()); + MCValue B = MCValue::get(Target.getSubSym()); auto FA = MCFixup::create( Fixup.getOffset(), nullptr, static_cast(FirstLiteralRelocationKind + TA)); diff --git a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s index 6f47e2d614d19..0b66811458da5 100644 --- a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s +++ b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s @@ -175,7 +175,7 @@ _g9: // ERROBJ: :[[#@LINE+1]]:7: error: Cannot represent a difference across sections .quad _g9@AUTH(ia,42) - _g8 -// ERROBJ: :[[#@LINE+1]]:7: error: Cannot represent a difference across sections +// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression .quad _g9@AUTH(ia,42) - _g8@AUTH(ia,42) .quad 0 From 803fbdd1faa813303cda3d93b3364eca2344ab6a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 22:13:55 -0700 Subject: [PATCH 0773/1029] [PowerPC] Report proper error for invalid relocation specifier Generalize the test from https://reviews.llvm.org/D83255 Replace getAccessVariant with MCValue::getSpecifier Simplify code after MCValue improvement 94821ce45fe93aa78cc5ea03cd9deac91b7af127 --- .../MCTargetDesc/PPCELFObjectWriter.cpp | 63 +++++++++---------- llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s | 7 --- 
 .../MC/PowerPC/relocation-specifier-err.s     | 14 +++++
 3 files changed, 45 insertions(+), 39 deletions(-)
 delete mode 100644 llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s
 create mode 100644 llvm/test/MC/PowerPC/relocation-specifier-err.s

diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index e33961a973d1e..d7ff92f64bf4f 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -38,25 +38,15 @@ PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
                               Is64Bit ? ELF::EM_PPC64 : ELF::EM_PPC,
                               /*HasRelocationAddend*/ true) {}
 
-static PPCMCExpr::Specifier getAccessVariant(const MCValue &Target,
-                                             const MCFixup &Fixup) {
-  const MCExpr *Expr = Fixup.getValue();
-
-  if (Expr->getKind() != MCExpr::Target)
-    return PPCMCExpr::Specifier(Target.getAccessVariant());
-  return cast<PPCMCExpr>(Expr)->getSpecifier();
-}
-
 unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
                                           const MCFixup &Fixup,
                                           bool IsPCRel) const {
   MCFixupKind Kind = Fixup.getKind();
   if (Kind >= FirstLiteralRelocationKind)
     return Kind - FirstLiteralRelocationKind;
-  auto RefKind = static_cast<PPCMCExpr::Specifier>(Target.getRefKind());
-  auto Modifier = getAccessVariant(Target, Fixup);
-
-  switch (PPCMCExpr::Specifier(Modifier)) {
+  SMLoc Loc = Fixup.getValue()->getLoc();
+  auto Spec = static_cast<PPCMCExpr::Specifier>(Target.getSpecifier());
+  switch (Spec) {
   case PPCMCExpr::VK_DTPMOD:
   case PPCMCExpr::VK_DTPREL:
   case PPCMCExpr::VK_DTPREL_HA:
@@ -108,7 +98,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
   }
 
   // determine the type of the relocation
-  unsigned Type;
+  unsigned Type = 0;
   if (IsPCRel) {
     switch (Fixup.getTargetKind()) {
     default:
@@ -116,8 +106,10 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
     case PPC::fixup_ppc_br24:
     case PPC::fixup_ppc_br24abs:
     case PPC::fixup_ppc_br24_notoc:
-      switch (Modifier) {
-      default: llvm_unreachable("Unsupported Modifier");
+      switch (Spec) {
+      default:
+        Ctx.reportError(Loc, "unsupported relocation type");
+        break;
       case PPCMCExpr::VK_None:
         Type = ELF::R_PPC_REL24;
         break;
@@ -137,9 +129,9 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
       Type = ELF::R_PPC_REL14;
       break;
     case PPC::fixup_ppc_half16:
-      switch (RefKind) {
+      switch (Spec) {
       default:
-        Ctx.reportError(Fixup.getLoc(), "invalid VariantKind");
+        Ctx.reportError(Loc, "unsupported relocation type");
         return ELF::R_PPC_NONE;
       case PPCMCExpr::VK_None:
         return ELF::R_PPC_REL16;
@@ -157,9 +149,10 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
       errs() << '\n';
       report_fatal_error("Invalid PC-relative half16ds relocation");
     case PPC::fixup_ppc_pcrel34:
-      switch (Modifier) {
+      switch (Spec) {
       default:
-        llvm_unreachable("Unsupported Modifier for fixup_ppc_pcrel34");
+        Ctx.reportError(Loc, "unsupported relocation type");
+        break;
       case PPCMCExpr::VK_PCREL:
         Type = ELF::R_PPC64_PCREL34;
         break;
@@ -196,9 +189,10 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
       Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_
       break;
     case PPC::fixup_ppc_half16:
-      switch (Modifier) {
+      switch (Spec) {
       default:
-        llvm_unreachable("Unsupported specifier");
+        Ctx.reportError(Loc, "unsupported relocation type");
+        break;
       case PPCMCExpr::VK_LO:
         return ELF::R_PPC_ADDR16_LO;
       case PPCMCExpr::VK_HI:
@@ -371,10 +365,10 @@ unsigned
PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, break; case PPC::fixup_ppc_half16ds: case PPC::fixup_ppc_half16dq: - switch (Modifier) { + switch (Spec) { default: - Ctx.reportError(Fixup.getLoc(), "invalid VariantKind"); - return ELF::R_PPC64_NONE; + Ctx.reportError(Loc, "unsupported relocation type"); + break; case PPCMCExpr::VK_LO: return ELF::R_PPC64_ADDR16_LO_DS; case PPCMCExpr::VK_None: @@ -419,8 +413,10 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case PPC::fixup_ppc_nofixup: - switch (Modifier) { - default: llvm_unreachable("Unsupported Modifier"); + switch (Spec) { + default: + Ctx.reportError(Loc, "unsupported relocation type"); + break; case PPCMCExpr::VK_TLSGD: if (is64Bit()) Type = ELF::R_PPC64_TLSGD; @@ -445,9 +441,10 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case PPC::fixup_ppc_imm34: - switch (Modifier) { + switch (Spec) { default: - report_fatal_error("Unsupported Modifier for fixup_ppc_imm34."); + Ctx.reportError(Loc, "unsupported relocation type"); + break; case PPCMCExpr::VK_DTPREL: Type = ELF::R_PPC64_DTPREL34; break; @@ -457,8 +454,10 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case FK_Data_8: - switch (Modifier) { - default: llvm_unreachable("Unsupported Modifier"); + switch (Spec) { + default: + Ctx.reportError(Loc, "unsupported relocation type"); + break; case PPCMCExpr::VK_TOCBASE: Type = ELF::R_PPC64_TOC; break; @@ -477,7 +476,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, } break; case FK_Data_4: - switch (Modifier) { + switch (Spec) { case PPCMCExpr::VK_DTPREL: Type = ELF::R_PPC_DTPREL32; break; diff --git a/llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s b/llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s deleted file mode 100644 index 0d2c879380e0a..0000000000000 --- a/llvm/test/MC/PowerPC/ppc64-errors-emit-obj.s +++ /dev/null @@ -1,7 +0,0 @@ -# RUN: not --crash llvm-mc -triple powerpc64-- --filetype=obj < %s 2> %t -# RUN: FileCheck < %t %s -# RUN: not --crash llvm-mc -triple powerpc64le-- --filetype=obj < %s 2> %t -# RUN: FileCheck < %t %s - -# CHECK: Unsupported Modifier for fixup_ppc_imm34. -paddi 3, 13, symbol@toc, 0 diff --git a/llvm/test/MC/PowerPC/relocation-specifier-err.s b/llvm/test/MC/PowerPC/relocation-specifier-err.s new file mode 100644 index 0000000000000..835fde7519ace --- /dev/null +++ b/llvm/test/MC/PowerPC/relocation-specifier-err.s @@ -0,0 +1,14 @@ +# RUN: not llvm-mc -triple powerpc64 --filetype=obj %s -o %t 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple powerpc64le --filetype=obj %s -o %t 2>&1 | FileCheck %s + +# CHECK: [[#@LINE+1]]:4: error: unsupported relocation type +bl foo@toc + +# CHECK: [[#@LINE+1]]:12: error: unsupported relocation type +addi 3, 3, foo@plt + +# CHECK: [[#@LINE+1]]:14: error: unsupported relocation type +paddi 3, 13, foo@toc, 0 + +# CHECK: [[#@LINE+1]]:7: error: unsupported relocation type +.quad foo@toc From e5923936109ce4ce7be2c8fb3372b14d33c385d9 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 23:44:57 -0700 Subject: [PATCH 0774/1029] MCValue: Replace getSymSpecifier with getSpecifier Commit 52eb11f925ddeba4e1b3840fd636ee87387f3ada temporarily introduced getSymSpecifier to prepare for "MCValue: Replace MCSymbolRefExpr members with MCSymbol" (d5893fc2a7e1191afdb4940469ec9371a319b114). The refactoring is now complete. 
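The caller-side change is mechanical; a minimal sketch (the `usesGOT`
helper is illustrative, not code from this patch):

    // Before: read the specifier through the temporary bridge accessor.
    bool usesGOT(const llvm::MCValue &Target) {
      return Target.getSymSpecifier() == llvm::MCSymbolRefExpr::VK_GOT;
    }

    // After: MCValue::getSpecifier is the single remaining accessor.
    bool usesGOT(const llvm::MCValue &Target) {
      return Target.getSpecifier() == llvm::MCSymbolRefExpr::VK_GOT;
    }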
--- llvm/include/llvm/MC/MCValue.h | 2 -- llvm/lib/MC/MCAssembler.cpp | 2 +- llvm/lib/MC/WasmObjectWriter.cpp | 2 +- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 2 +- .../AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 6 +++--- .../AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 5 ++--- .../Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp | 2 +- llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp | 2 +- .../Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp | 3 +-- .../SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp | 2 +- .../MCTargetDesc/WebAssemblyWasmObjectWriter.cpp | 2 +- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 2 +- llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 2 +- llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 8 ++++---- .../Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 2 +- 15 files changed, 20 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h index 74f68e33e8763..05cf82a26eab4 100644 --- a/llvm/include/llvm/MC/MCValue.h +++ b/llvm/include/llvm/MC/MCValue.h @@ -55,8 +55,6 @@ class MCValue { // Get the relocation specifier from SymA. This is a workaround for targets // that do not use MCValue::Specifier. - uint16_t getSymSpecifier() const { return Specifier; } - // Get the relocation specifier from SymA, or 0 when SymA is null. uint16_t getAccessVariant() const { return Specifier; } static MCValue get(const MCSymbol *SymA, const MCSymbol *SymB = nullptr, diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 0bfb32115fe66..bf78a84d8340f 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -126,7 +126,7 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const { return false; auto *Sym = V.getAddSym(); - if (!Sym || V.getSymSpecifier()) + if (!Sym || V.getSpecifier()) return false; if (!isThumbFunc(Sym)) diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index b35ca704c519a..302ba6ee5a548 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -606,7 +606,7 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm, SymA->setUsedInReloc(); } - switch (Target.getSymSpecifier()) { + switch (Target.getSpecifier()) { case MCSymbolRefExpr::VK_GOT: case MCSymbolRefExpr::VK_WASM_GOT_TLS: SymA->setUsedInGOT(); diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index c3baec41cbedf..dd67a312cc2cd 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -8224,7 +8224,7 @@ bool AArch64AsmParser::classifySymbolRef(const MCExpr *Expr, return false; if (Res.getAddSym()) - DarwinSpec = AArch64MCExpr::Specifier(Res.getSymSpecifier()); + DarwinSpec = AArch64MCExpr::Specifier(Res.getSpecifier()); Addend = Res.getConstant(); // It's some symbol reference + a constant addend, but really diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 6292203ce8401..fc0989678d2c1 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -192,7 +192,7 @@ void AArch64MachObjectWriter::recordRelocation( } if (!getAArch64FixupKindMachOInfo( - Fixup, Type, AArch64MCExpr::Specifier(Target.getSymSpecifier()), + Fixup, Type, 
AArch64MCExpr::Specifier(Target.getSpecifier()), Log2Size, Asm)) { Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); return; @@ -221,7 +221,7 @@ void AArch64MachObjectWriter::recordRelocation( // Check for "_foo@got - .", which comes through here as: // Ltmp0: // ... _foo@got - Ltmp0 - if (Target.getSymSpecifier() == AArch64MCExpr::M_GOT && + if (Target.getSpecifier() == AArch64MCExpr::M_GOT && Asm.getSymbolOffset(*B) == Asm.getFragmentOffset(*Fragment) + Fixup.getOffset()) { // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. @@ -232,7 +232,7 @@ void AArch64MachObjectWriter::recordRelocation( MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28); Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; - } else if (Target.getSymSpecifier() != AArch64MCExpr::None) { + } else if (Target.getSpecifier() != AArch64MCExpr::None) { // Otherwise, neither symbol can be modified. Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index f580b37252e80..1e01fe25ede80 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -61,8 +61,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType( FixupKind = FK_PCRel_4; } - auto Modifier = - Target.isAbsolute() ? AArch64MCExpr::None : Target.getSymSpecifier(); + auto Spec = Target.getSpecifier(); const MCExpr *Expr = Fixup.getValue(); if (const AArch64MCExpr *A64E = dyn_cast(Expr)) { @@ -98,7 +97,7 @@ unsigned AArch64WinCOFFObjectWriter::getRelocType( return COFF::IMAGE_REL_ARM64_REL32; case FK_Data_4: - switch (Modifier) { + switch (Spec) { default: return COFF::IMAGE_REL_ARM64_ADDR32; case MCSymbolRefExpr::VK_COFF_IMGREL32: diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp index da8762c855fb2..bfcd5e8a9b3be 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp @@ -44,7 +44,7 @@ unsigned ARMWinCOFFObjectWriter::getRelocType(MCContext &Ctx, const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const { - auto Spec = Target.getAddSym() ? 
Target.getSymSpecifier() : 0; + auto Spec = Target.getSpecifier(); unsigned FixupKind = Fixup.getKind(); if (IsCrossSection) { if (FixupKind != FK_Data_4) { diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp index 6e0415fa91264..01bd8973170cd 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCExpr.cpp @@ -81,7 +81,7 @@ bool AVRMCExpr::evaluateAsRelocatableImpl(MCValue &Result, return false; auto Spec = AVRMCExpr::VK_None; - if (Value.getSymSpecifier() != MCSymbolRefExpr::VK_None) + if (Value.getSpecifier() != MCSymbolRefExpr::VK_None) return false; assert(!Value.getSubSym()); if (specifier == VK_PM) diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp index f36f25559365c..1b5fe08bea49d 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp @@ -40,8 +40,7 @@ llvm::createPPCXCOFFObjectWriter(bool Is64Bit) { std::pair PPCXCOFFObjectWriter::getRelocTypeAndSignSize( const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - const auto Specifier = - Target.isAbsolute() ? PPCMCExpr::VK_None : Target.getSymSpecifier(); + const auto Specifier = Target.getSpecifier(); // People from AIX OS team says AIX link editor does not care about // the sign bit in the relocation entry "most" of the time. // The system assembler seems to set the sign bit on relocation entry diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index d6e0e15bec358..b676015ae9596 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -220,7 +220,7 @@ unsigned SystemZELFObjectWriter::getRelocType(MCContext &Ctx, bool SystemZELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, const MCSymbol &Sym, unsigned Type) const { - switch (V.getSymSpecifier()) { + switch (V.getSpecifier()) { case SystemZMCExpr::VK_GOT: case SystemZMCExpr::VK_PLT: return true; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp index 7fcd2ec0f64db..8c074e1eaaee3 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp @@ -66,7 +66,7 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType( const MCValue &Target, const MCFixup &Fixup, const MCSectionWasm &FixupSection, bool IsLocRel) const { auto &SymA = cast(*Target.getAddSym()); - auto Spec = Target.getSymSpecifier(); + auto Spec = Target.getSpecifier(); switch (Spec) { case MCSymbolRefExpr::VK_GOT: case MCSymbolRefExpr::VK_WASM_GOT_TLS: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index af827a42c48eb..767818107de8d 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -744,7 +744,7 @@ bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCAssembler &Asm, if (Fixup.getKind() == FK_Data_1) { MCValue Target; if (Fixup.getValue()->evaluateAsRelocatable(Target, &Asm) && - Target.getAddSym() && Target.getSymSpecifier() == X86MCExpr::VK_ABS8) + Target.getAddSym() && Target.getSpecifier() 
== X86MCExpr::VK_ABS8) return false; } return true; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index ca10f4716ba8c..c0cf92300c380 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -391,7 +391,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, bool X86ELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, const MCSymbol &Sym, unsigned Type) const { - switch (V.getSymSpecifier()) { + switch (V.getSpecifier()) { case X86MCExpr::VK_GOT: case X86MCExpr::VK_PLT: case X86MCExpr::VK_GOTPCREL: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 6871b8adeebbd..f2055d29ce7e0 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -151,7 +151,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( const MCSymbol *B_Base = Writer->getAtom(*B); // Neither symbol can be modified. - if (Target.getSymSpecifier()) { + if (Target.getSpecifier()) { Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of modified symbol"); return; @@ -266,7 +266,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( return; } - auto Specifier = Target.getSymSpecifier(); + auto Specifier = Target.getSpecifier(); if (IsPCRel) { if (IsRIPRel) { if (Specifier == X86MCExpr::VK_GOTPCREL) { @@ -461,7 +461,7 @@ void X86MachObjectWriter::recordTLVPRelocation(MachObjectWriter *Writer, MCValue Target, uint64_t &FixedValue) { const MCSymbol *SymA = Target.getAddSym(); - assert(Target.getSymSpecifier() == X86MCExpr::VK_TLVP && !is64Bit() && + assert(Target.getSpecifier() == X86MCExpr::VK_TLVP && !is64Bit() && "Should only be called with a 32-bit TLVP relocation!"); unsigned Log2Size = getFixupKindLog2Size(Fixup.getKind()); @@ -503,7 +503,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, const MCSymbol *A = Target.getAddSym(); // If this is a 32-bit TLVP reloc it's handled a bit differently. - if (A && Target.getSymSpecifier() == X86MCExpr::VK_TLVP) { + if (A && Target.getSpecifier() == X86MCExpr::VK_TLVP) { recordTLVPRelocation(Writer, Asm, Fragment, Fixup, Target, FixedValue); return; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 30076978401be..41d33d6d4860c 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -59,7 +59,7 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, } } - auto Spec = Target.getAddSym() ? Target.getSymSpecifier() : 0; + auto Spec = Target.getSpecifier(); if (Is64Bit) { switch (FixupKind) { case FK_PCRel_4: From 8fa5b6cc0293d806e36b90d4116e5925fa5d7f2e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 5 Apr 2025 23:55:34 -0700 Subject: [PATCH 0775/1029] MCValue: Replace getAccessVariant with getSpecifier Commit 52eb11f925ddeba4e1b3840fd636ee87387f3ada temporarily introduced getSymSpecifier to prepare for "MCValue: Replace MCSymbolRefExpr members with MCSymbol" (d5893fc2a7e1191afdb4940469ec9371a319b114). The refactoring is now complete. 
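Since getAccessVariant and getSpecifier read the same MCValue field, each
call site is a pure rename plus the target's own enum wrap; the pattern in
the hunks below looks like this (sketch):

    // Before
    auto Specifier = X86MCExpr::Specifier(Target.getAccessVariant());
    // After
    auto Specifier = X86MCExpr::Specifier(Target.getSpecifier());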
--- llvm/include/llvm/MC/MCExpr.h | 4 ++-- llvm/include/llvm/MC/MCValue.h | 4 ---- .../Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 6 +++--- .../Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 2 +- llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 2 +- llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp | 2 +- llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp | 2 +- llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp | 2 +- .../Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp | 2 +- llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp | 2 +- .../Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp | 2 +- llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 2 +- 12 files changed, 14 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h index 6311a134cf5a7..8516f45e07fea 100644 --- a/llvm/include/llvm/MC/MCExpr.h +++ b/llvm/include/llvm/MC/MCExpr.h @@ -262,8 +262,8 @@ class MCSymbolRefExpr : public MCExpr { const MCSymbol &getSymbol() const { return *Symbol; } // Some targets encode the relocation specifier within SymA using - // MCSymbolRefExpr::SubclassData and access it via getAccessVariant(), though - // this method is now deprecated. + // MCSymbolRefExpr::SubclassData, which is copied to MCValue::Specifier, + // though this method is now deprecated. VariantKind getKind() const { return (VariantKind)(getSubclassData() & VariantKindMask); } diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h index 05cf82a26eab4..67793a1759e60 100644 --- a/llvm/include/llvm/MC/MCValue.h +++ b/llvm/include/llvm/MC/MCValue.h @@ -53,10 +53,6 @@ class MCValue { /// Print the value to stderr. void dump() const; - // Get the relocation specifier from SymA. This is a workaround for targets - // that do not use MCValue::Specifier. - uint16_t getAccessVariant() const { return Specifier; } - static MCValue get(const MCSymbol *SymA, const MCSymbol *SymB = nullptr, int64_t Val = 0, uint32_t Specifier = 0) { MCValue R; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index c3c4c64cad5b0..37d233ac2446c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -137,7 +137,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case FK_Data_2: return R_CLS(PREL16); case FK_Data_4: { - return AArch64MCExpr::Specifier(Target.getAccessVariant()) == + return AArch64MCExpr::Specifier(Target.getSpecifier()) == AArch64MCExpr::VK_PLT ? R_CLS(PLT32) : R_CLS(PREL32); @@ -249,7 +249,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, case FK_Data_2: return R_CLS(ABS16); case FK_Data_4: - return (!IsILP32 && AArch64MCExpr::Specifier(Target.getAccessVariant()) == + return (!IsILP32 && AArch64MCExpr::Specifier(Target.getSpecifier()) == AArch64MCExpr::VK_GOTPCREL) ? 
ELF::R_AARCH64_GOTPCREL32 : R_CLS(ABS32); @@ -546,7 +546,7 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val, if ((Val.getRefKind() & AArch64MCExpr::VK_GOT) == AArch64MCExpr::VK_GOT) return true; return is_contained({AArch64MCExpr::VK_GOTPCREL, AArch64MCExpr::VK_PLT}, - AArch64MCExpr::Specifier(Val.getAccessVariant())); + AArch64MCExpr::Specifier(Val.getSpecifier())); } std::unique_ptr diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index bf27688e3b221..b016e37e91ba8 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -46,7 +46,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_AMDGPU_ABS32_LO; } - switch (AMDGPUMCExpr::Specifier(Target.getAccessVariant())) { + switch (AMDGPUMCExpr::Specifier(Target.getSpecifier())) { default: break; case AMDGPUMCExpr::S_GOTPCREL: diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 42838bb83781c..e60510b01eeed 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -82,7 +82,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, unsigned Kind = Fixup.getTargetKind(); if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; - uint8_t Specifier = Target.getAccessVariant(); + uint8_t Specifier = Target.getSpecifier(); auto CheckFDPIC = [&](uint32_t Type) { if (getOSABI() != ELF::ELFOSABI_ARM_FDPIC) Ctx.reportError(Fixup.getLoc(), diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp index 7e50d5c2e4a00..3555a66ae4dd9 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRELFObjectWriter.cpp @@ -40,7 +40,7 @@ unsigned AVRELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const unsigned Kind = Fixup.getTargetKind(); if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; - auto Modifier = AVRMCExpr::Specifier(Target.getAccessVariant()); + auto Modifier = AVRMCExpr::Specifier(Target.getSpecifier()); switch ((unsigned)Fixup.getKind()) { case FK_Data_1: switch (Modifier) { diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp index af02631d051be..c7ece33777927 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp @@ -42,7 +42,7 @@ unsigned CSKYELFObjectWriter::getRelocType(MCContext &Ctx, const MCExpr *Expr = Fixup.getValue(); // Determine the type of the relocation unsigned Kind = Fixup.getTargetKind(); - uint8_t Modifier = Target.getAccessVariant(); + uint8_t Modifier = Target.getSpecifier(); switch (Target.getRefKind()) { case CSKYMCExpr::VK_TLSIE: diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index 750cca63c24d9..67ac69214be50 100644 --- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -1249,7 +1249,7 @@ bool HexagonAsmParser::parseInstruction(OperandVector &Operands) { MCValue Value; if (Expr->evaluateAsRelocatable(Value, nullptr)) { if 
(!Value.isAbsolute()) { - switch (HexagonMCExpr::VariantKind(Value.getAccessVariant())) { + switch (HexagonMCExpr::VariantKind(Value.getSpecifier())) { case HexagonMCExpr::VK_TPREL: case HexagonMCExpr::VK_DTPREL: // Don't lazy extend these expression variants diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index 039e4c981890d..b7342d3675417 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -42,7 +42,7 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx, MCValue const &Target, MCFixup const &Fixup, bool IsPCRel) const { - auto Variant = HexagonMCExpr::VariantKind(Target.getAccessVariant()); + auto Variant = HexagonMCExpr::VariantKind(Target.getSpecifier()); switch (Variant) { case HexagonMCExpr::VK_GD_GOT: case HexagonMCExpr::VK_LD_GOT: diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp index 192bc5f4ae602..200d499753718 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp @@ -65,7 +65,7 @@ unsigned M68kELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - auto Specifier = M68kMCExpr::Specifier(Target.getAccessVariant()); + auto Specifier = M68kMCExpr::Specifier(Target.getSpecifier()); unsigned Kind = Fixup.getKind(); M68kRelType Type = getType(Kind, Specifier, IsPCRel); switch (Specifier) { diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp index b676015ae9596..729cfd99b7871 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZELFObjectWriter.cpp @@ -158,7 +158,7 @@ unsigned SystemZELFObjectWriter::getRelocType(MCContext &Ctx, unsigned Kind = Fixup.getKind(); if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; - auto Specifier = SystemZMCExpr::Specifier(Target.getAccessVariant()); + auto Specifier = SystemZMCExpr::Specifier(Target.getSpecifier()); switch (Specifier) { case SystemZMCExpr::VK_INDNTPOFF: case SystemZMCExpr::VK_NTPOFF: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index c0cf92300c380..66e84d08c97e5 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -340,7 +340,7 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, MCFixupKind Kind = Fixup.getKind(); if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; - auto Specifier = X86MCExpr::Specifier(Target.getAccessVariant()); + auto Specifier = X86MCExpr::Specifier(Target.getSpecifier()); switch (Specifier) { case X86MCExpr::VK_GOTTPOFF: case X86MCExpr::VK_INDNTPOFF: From 7cb66ff4648a15741a1908658dfef5cb3d4a9199 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 6 Apr 2025 00:03:48 -0700 Subject: [PATCH 0776/1029] [PowerPC] Use reportError Report a proper error and fix de1dc9c98f9ce74d38aceb44e00d258370d1bb34 --- .../lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp | 5 ++--- llvm/test/MC/PowerPC/pr24686.s | 7 ------- llvm/test/MC/PowerPC/relocation-specifier-err.s | 3 +++ 3 files changed, 5 
insertions(+), 10 deletions(-)
 delete mode 100644 llvm/test/MC/PowerPC/pr24686.s

diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index d7ff92f64bf4f..50b59b336bb5b 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -145,9 +145,8 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
       break;
     case PPC::fixup_ppc_half16ds:
     case PPC::fixup_ppc_half16dq:
-      Target.print(errs());
-      errs() << '\n';
-      report_fatal_error("Invalid PC-relative half16ds relocation");
+      Ctx.reportError(Loc, "unsupported relocation type");
+      break;
     case PPC::fixup_ppc_pcrel34:
       switch (Spec) {
       default:
diff --git a/llvm/test/MC/PowerPC/pr24686.s b/llvm/test/MC/PowerPC/pr24686.s
deleted file mode 100644
index 35a379c697eaf..0000000000000
--- a/llvm/test/MC/PowerPC/pr24686.s
+++ /dev/null
@@ -1,7 +0,0 @@
-# RUN: not --crash llvm-mc -triple=powerpc64le-unknown-linux-gnu -filetype=obj %s \
-# RUN:   2>&1 | FileCheck %s
-
-_stext:
-ld %r5, p_end - _stext(%r5)
-
-# CHECK: LLVM ERROR: Invalid PC-relative half16ds relocation
diff --git a/llvm/test/MC/PowerPC/relocation-specifier-err.s b/llvm/test/MC/PowerPC/relocation-specifier-err.s
index 835fde7519ace..e7210794cdd58 100644
--- a/llvm/test/MC/PowerPC/relocation-specifier-err.s
+++ b/llvm/test/MC/PowerPC/relocation-specifier-err.s
@@ -10,5 +10,8 @@ addi 3, 3, foo@plt
 # CHECK: [[#@LINE+1]]:14: error: unsupported relocation type
 paddi 3, 13, foo@toc, 0
 
+# CHECK: [[#@LINE+1]]:15: error: unsupported relocation type
+ld %r5, p_end - .(%r5)
+
 # CHECK: [[#@LINE+1]]:7: error: unsupported relocation type
 .quad foo@toc

From e7dc05ebcf50a620bc2307207bfd27620c6ca648 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 6 Apr 2025 00:09:29 -0700
Subject: [PATCH 0777/1029] MCValue: Make print private

This is a debug-only feature (the relocation specifier is
target-specific and cannot be printed without backend support) and not
supposed to be used externally. PowerPC inappropriately used it (removed
by 7cb66ff4648a15741a1908658dfef5cb3d4a9199).
---
 llvm/include/llvm/MC/MCValue.h | 11 +++++------
 llvm/lib/MC/MCValue.cpp        |  4 ++--
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h
index 67793a1759e60..fd6da9955e3e2 100644
--- a/llvm/include/llvm/MC/MCValue.h
+++ b/llvm/include/llvm/MC/MCValue.h
@@ -32,6 +32,11 @@ class MCValue {
   int64_t Cst = 0;
   uint32_t Specifier = 0;
 
+  void print(raw_ostream &OS) const;
+
+  /// Print the value to stderr.
+  void dump() const;
+
 public:
   friend class MCAssembler;
   friend class MCExpr;
@@ -47,12 +52,6 @@ class MCValue {
   /// Is this an absolute (as opposed to relocatable) value.
   bool isAbsolute() const { return !SymA && !SymB; }
 
-  /// Print the value to the stream \p OS.
-  void print(raw_ostream &OS) const;
-
-  /// Print the value to stderr.
-  void dump() const;
-
   static MCValue get(const MCSymbol *SymA, const MCSymbol *SymB = nullptr,
                      int64_t Val = 0, uint32_t Specifier = 0) {
     MCValue R;
diff --git a/llvm/lib/MC/MCValue.cpp b/llvm/lib/MC/MCValue.cpp
index 77deb0b4ab671..04bd65b8aefed 100644
--- a/llvm/lib/MC/MCValue.cpp
+++ b/llvm/lib/MC/MCValue.cpp
@@ -24,8 +24,8 @@ void MCValue::print(raw_ostream &OS) const {
 
   // FIXME: prints as a number, which isn't ideal. But the meaning will be
   // target-specific anyway.
- if (getRefKind()) - OS << ':' << getRefKind() << ':'; + if (getSpecifier()) + OS << ':' << getSpecifier() << ':'; SymA->print(OS, nullptr); From c0b4a8edfe2349b912890951a49a32b6a27747af Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 6 Apr 2025 00:12:44 -0700 Subject: [PATCH 0778/1029] MCValue: Replace getRefKind with getSpecifier --- llvm/include/llvm/MC/MCValue.h | 1 - llvm/lib/MC/MCAssembler.cpp | 2 +- llvm/lib/MC/MCExpr.cpp | 4 ++-- llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 6 +++--- .../Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 4 ++-- .../Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 4 ++-- llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp | 4 ++-- llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp | 2 +- .../Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp | 2 +- .../LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp | 2 +- llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp | 2 +- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 4 ++-- llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 2 +- llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp | 2 +- llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp | 2 +- llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp | 2 +- 16 files changed, 22 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/MC/MCValue.h b/llvm/include/llvm/MC/MCValue.h index fd6da9955e3e2..abed825ac0c59 100644 --- a/llvm/include/llvm/MC/MCValue.h +++ b/llvm/include/llvm/MC/MCValue.h @@ -42,7 +42,6 @@ class MCValue { friend class MCExpr; MCValue() = default; int64_t getConstant() const { return Cst; } - uint32_t getRefKind() const { return Specifier; } uint32_t getSpecifier() const { return Specifier; } void setSpecifier(uint32_t S) { Specifier = S; } diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index bf78a84d8340f..934bdb40d530d 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -122,7 +122,7 @@ bool MCAssembler::isThumbFunc(const MCSymbol *Symbol) const { if (!Expr->evaluateAsRelocatable(V, nullptr)) return false; - if (V.getSubSym() || V.getRefKind() != MCSymbolRefExpr::VK_None) + if (V.getSubSym() || V.getSpecifier() != MCSymbolRefExpr::VK_None) return false; auto *Sym = V.getAddSym(); diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index c856ef5f97203..5293fa58c0381 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -291,7 +291,7 @@ bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, // Value with RefKind (e.g. %hi(0xdeadbeef) in MIPS) is not considered // absolute (the value is unknown at parse time), even if it might be resolved // by evaluateFixup. - return IsRelocatable && Value.isAbsolute() && Value.getRefKind() == 0; + return IsRelocatable && Value.isAbsolute() && Value.getSpecifier() == 0; } /// Helper method for \see EvaluateSymbolAdd(). @@ -533,7 +533,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, // If the reference has a variant kind, we can only handle expressions // which evaluate exactly to a single unadorned symbol. Attach the // original VariantKind to SymA of the result. 
- if (Res.getRefKind() != MCSymbolRefExpr::VK_None || + if (Res.getSpecifier() != MCSymbolRefExpr::VK_None || !Res.getAddSym() || Res.getSubSym() || Res.getConstant()) return false; Res.Specifier = Kind; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 3335d9d6f009c..b43de2dba92a2 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -222,7 +222,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, return Value >> 4; case AArch64::fixup_aarch64_movw: { AArch64MCExpr::Specifier RefKind = - static_cast(Target.getRefKind()); + static_cast(Target.getSpecifier()); if (AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_ABS && AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_SABS) { if (!RefKind) { @@ -422,7 +422,7 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, bool IsResolved, const MCSubtargetInfo *STI) const { if (Fixup.getTargetKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) { - auto RefKind = static_cast(Target.getRefKind()); + auto RefKind = static_cast(Target.getSpecifier()); AArch64MCExpr::Specifier SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); if (SymLoc == AArch64AuthMCExpr::VK_AUTH || SymLoc == AArch64AuthMCExpr::VK_AUTHADDR) { @@ -480,7 +480,7 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, // FIXME: getFixupKindInfo() and getFixupKindNumBytes() could be fixed to // handle this more cleanly. This may affect the output of -show-mc-encoding. AArch64MCExpr::Specifier RefKind = - static_cast(Target.getRefKind()); + static_cast(Target.getSpecifier()); if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS || (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) { // If the immediate is negative, generate MOVN else MOVZ. 
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 37d233ac2446c..c0c0e791b0eb9 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -112,7 +112,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; AArch64MCExpr::Specifier RefKind = - static_cast(Target.getRefKind()); + static_cast(Target.getSpecifier()); AArch64MCExpr::Specifier SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); bool IsNC = AArch64MCExpr::isNotChecked(RefKind); @@ -543,7 +543,7 @@ bool AArch64ELFObjectWriter::needsRelocateWithSymbol(const MCValue &Val, if (Val.getAddSym() && cast(Val.getAddSym())->isMemtag()) return true; - if ((Val.getRefKind() & AArch64MCExpr::VK_GOT) == AArch64MCExpr::VK_GOT) + if ((Val.getSpecifier() & AArch64MCExpr::VK_GOT) == AArch64MCExpr::VK_GOT) return true; return is_contained({AArch64MCExpr::VK_GOTPCREL, AArch64MCExpr::VK_PLT}, AArch64MCExpr::Specifier(Val.getSpecifier())); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index fc0989678d2c1..d2c7e7871ae82 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -391,8 +391,8 @@ void AArch64MachObjectWriter::recordRelocation( Value = 0; } - if (Target.getRefKind() == AArch64MCExpr::VK_AUTH || - Target.getRefKind() == AArch64MCExpr::VK_AUTHADDR) { + if (Target.getSpecifier() == AArch64MCExpr::VK_AUTH || + Target.getSpecifier() == AArch64MCExpr::VK_AUTHADDR) { auto *Expr = cast(Fixup.getValue()); assert(Type == MachO::ARM64_RELOC_UNSIGNED); diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp index c7ece33777927..d424399ce6bc9 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp @@ -44,7 +44,7 @@ unsigned CSKYELFObjectWriter::getRelocType(MCContext &Ctx, unsigned Kind = Fixup.getTargetKind(); uint8_t Modifier = Target.getSpecifier(); - switch (Target.getRefKind()) { + switch (Target.getSpecifier()) { case CSKYMCExpr::VK_TLSIE: case CSKYMCExpr::VK_TLSLE: case CSKYMCExpr::VK_TLSGD: @@ -170,7 +170,7 @@ unsigned CSKYELFObjectWriter::getRelocType(MCContext &Ctx, bool CSKYELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, const MCSymbol &, unsigned Type) const { - switch (V.getRefKind()) { + switch (V.getSpecifier()) { case CSKYMCExpr::VK_PLT: case CSKYMCExpr::VK_GOT: return true; diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 4c6512d1284b8..fe83e8f600d8c 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -707,7 +707,7 @@ bool LoongArchAsmParser::classifySymbolRef(const MCExpr *Expr, MCValue Res; if (Expr->evaluateAsRelocatable(Res, nullptr)) - return Res.getRefKind() == LoongArchMCExpr::VK_None; + return Res.getSpecifier() == LoongArchMCExpr::VK_None; return false; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index e74c8af2a850c..b6a98b3ff9aeb 
100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -452,7 +452,7 @@ bool LoongArchAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, uint64_t &FixedValue) const { - assert(Target.getRefKind() == 0 && + assert(Target.getSpecifier() == 0 && "relocatable SymA-SymB cannot have relocation specifier"); std::pair FK; uint64_t FixedValueA, FixedValueB; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index ca1eb665132e7..2e2a503d5304f 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -50,7 +50,7 @@ unsigned LoongArchELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - switch (Target.getRefKind()) { + switch (Target.getSpecifier()) { case LoongArchMCExpr::VK_TLS_LE_HI20: case LoongArchMCExpr::VK_TLS_IE_PC_HI20: case LoongArchMCExpr::VK_TLS_IE_HI20: diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index c7801476a2b61..4af619e18eb47 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -162,7 +162,7 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx, if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; - switch (Target.getRefKind()) { + switch (Target.getSpecifier()) { case MipsMCExpr::MEK_DTPREL: case MipsMCExpr::MEK_DTPREL_HI: case MipsMCExpr::MEK_DTPREL_LO: diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d23ddb918e7f9..dba78fef0bad8 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2798,14 +2798,14 @@ bool RISCVAsmParser::classifySymbolRef(const MCExpr *Expr, MCValue Res; if (Expr->evaluateAsRelocatable(Res, nullptr)) - return Res.getRefKind() == RISCVMCExpr::VK_None; + return Res.getSpecifier() == RISCVMCExpr::VK_None; return false; } bool RISCVAsmParser::isSymbolDiff(const MCExpr *Expr) { MCValue Res; if (Expr->evaluateAsRelocatable(Res, nullptr)) { - return Res.getRefKind() == RISCVMCExpr::VK_None && Res.getAddSym() && + return Res.getSpecifier() == RISCVMCExpr::VK_None && Res.getAddSym() && Res.getSubSym(); } return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 652dd6586492d..6641116db9a19 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -628,7 +628,7 @@ bool RISCVAsmBackend::handleAddSubRelocations(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, uint64_t &FixedValue) const { - assert(Target.getRefKind() == 0 && + assert(Target.getSpecifier() == 0 && "relocatable SymA-SymB cannot have relocation specifier"); uint64_t FixedValueA, FixedValueB; unsigned TA = 0, TB = 0; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 1662b8068084c..5fdf8e23d1214 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ 
b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -56,7 +56,7 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; - auto Spec = RISCVMCExpr::Specifier(Target.getRefKind()); + auto Spec = RISCVMCExpr::Specifier(Target.getSpecifier()); switch (Spec) { case RISCVMCExpr::VK_TPREL_HI: case RISCVMCExpr::VK_TLS_GOT_HI: diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 2cb7fa8233949..74ef4870a12d3 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -46,7 +46,7 @@ unsigned SparcELFObjectWriter::getRelocType(MCContext &Ctx, if (Kind >= FirstLiteralRelocationKind) return Kind - FirstLiteralRelocationKind; - switch (Target.getRefKind()) { + switch (Target.getSpecifier()) { case SparcMCExpr::VK_TLS_GD_HI22: case SparcMCExpr::VK_TLS_GD_LO10: case SparcMCExpr::VK_TLS_GD_ADD: diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index 727ed56aac26f..5d0d18d86d3f8 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -40,7 +40,7 @@ class VEELFObjectWriter : public MCELFObjectTargetWriter { unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const { - switch (Target.getRefKind()) { + switch (Target.getSpecifier()) { case VEMCExpr::VK_TLS_GD_HI32: case VEMCExpr::VK_TLS_GD_LO32: case VEMCExpr::VK_TPOFF_HI32: From 53e1c8b118792b0177bff69d806c9e00034ceb1c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 6 Apr 2025 00:38:08 -0700 Subject: [PATCH 0779/1029] AArch64MCExpr: Remove unused getSpecifier --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 3128f9d10a4bc..3f10c69869c27 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -229,13 +229,6 @@ class AArch64AuthMCExpr final : public AArch64MCExpr { return E->getSpecifier() == VK_AUTH || E->getSpecifier() == VK_AUTHADDR; } }; - -// Getter for the legacy representation that encodes the relocation specifier in -// MCSymbolRefExpr::SubclassData. -static inline AArch64MCExpr::Specifier -getSpecifier(const MCSymbolRefExpr *SRE) { - return AArch64MCExpr::Specifier(SRE->getKind()); -} } // end namespace llvm #endif From 8d71a2a905508f32ed201920ef7ac15a3ad8e4e6 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 6 Apr 2025 09:48:52 +0100 Subject: [PATCH 0780/1029] [VPlan] Use ExitBlocks to check in VPlan::isExitBlock (NFC). Exit blocks of the VPlan are now held in ExitBlocks. Use it to check whether a block is an exit block. Otherwise we currently misclassify the scalar loop header as an exit block, as it is not explicitly connected to the exit blocks. NFC at the moment, as the helper is currently never queried with the scalar header, but that will change in the future.
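The one-line diff below replaces a shape-based test with a membership test. The reasoning matters: the scalar loop header is also a successor-less VPIRBasicBlock, so the old predicate matched it even though it is not an exit block, while the explicit ExitBlocks list cannot. A minimal self-contained sketch of the two predicates, using hypothetical stand-in types (VPBlockSketch, VPlanSketch) rather than the real VPlan classes:

#include <algorithm>
#include <vector>

struct VPBlockSketch {
  bool WrapsIRBlock = false; // stand-in for isa<VPIRBasicBlock>(VPBB)
  unsigned NumSuccessors = 0;
};

struct VPlanSketch {
  std::vector<const VPBlockSketch *> ExitBlocks; // explicitly tracked exits

  // Old predicate: any successor-less IR block counts as an exit, so the
  // scalar loop header (not yet wired to its successors) matches as well.
  static bool isExitBlockOld(const VPBlockSketch *B) {
    return B->WrapsIRBlock && B->NumSuccessors == 0;
  }

  // New predicate: only blocks recorded in ExitBlocks qualify; this mirrors
  // is_contained(ExitBlocks, VPBB) in the patch.
  bool isExitBlockNew(const VPBlockSketch *B) const {
    return std::find(ExitBlocks.begin(), ExitBlocks.end(), B) != ExitBlocks.end();
  }
};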
--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 85fd34d79be42..a6e807b616ce5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -921,7 +921,7 @@ VPIRBasicBlock *VPlan::getExitBlock(BasicBlock *IRBB) const { } bool VPlan::isExitBlock(VPBlockBase *VPBB) { - return isa(VPBB) && VPBB->getNumSuccessors() == 0; + return is_contained(ExitBlocks, VPBB); } /// Generate the code inside the preheader and body of the vectorized loop. From 7013b51548c0bd2c7e5564735c44506909a2f8dc Mon Sep 17 00:00:00 2001 From: Hui Date: Sun, 6 Apr 2025 10:50:55 +0100 Subject: [PATCH 0781/1029] [libc++] Implement `std::flat_multiset` (#128363) fixes https://github.com/llvm/llvm-project/issues/105193 --- libcxx/docs/FeatureTestMacroTable.rst | 2 +- libcxx/docs/ReleaseNotes/21.rst | 2 +- libcxx/docs/Status/Cxx23Papers.csv | 2 +- libcxx/include/CMakeLists.txt | 2 + libcxx/include/__flat_set/flat_multiset.h | 792 ++++++++++++++++++ libcxx/include/__flat_set/flat_set.h | 41 +- libcxx/include/__flat_set/utils.h | 78 ++ libcxx/include/flat_set | 17 + libcxx/include/module.modulemap | 3 + libcxx/include/version | 2 +- libcxx/modules/std/flat_set.inc | 4 +- .../assert.sorted_unique.pass.cpp | 131 +++ .../flat.multiset/iterator.compile.pass.cpp | 42 + .../flat_multiset.nodiscard.verify.cpp | 20 + .../container.adaptors/flat.map/helpers.h | 172 +--- .../flat.multimap/helpers.h | 172 +--- .../flat.multiset.capacity/empty.pass.cpp | 52 ++ .../flat.multiset.capacity/max_size.pass.cpp | 68 ++ .../flat.multiset.capacity/size.pass.cpp | 71 ++ .../flat.multiset.cons/alloc.pass.cpp | 63 ++ .../assign_initializer_list.pass.cpp | 68 ++ .../flat.multiset.cons/compare.pass.cpp | 85 ++ .../flat.multiset.cons/containers.pass.cpp | 162 ++++ .../flat.multiset.cons/copy.pass.cpp | 70 ++ .../flat.multiset.cons/copy_alloc.pass.cpp | 66 ++ .../flat.multiset.cons/copy_assign.pass.cpp | 110 +++ .../deduct.compile.pass.cpp | 43 + .../flat.multiset.cons/deduct.pass.cpp | 410 +++++++++ .../flat.multiset.cons/deduct_pmr.pass.cpp | 104 +++ .../flat.multiset.cons/default.pass.cpp | 96 +++ .../flat.multiset.cons/dtor_noexcept.pass.cpp | 61 ++ .../initializer_list.pass.cpp | 157 ++++ .../flat.multiset.cons/iter_iter.pass.cpp | 141 ++++ .../flat.multiset.cons/move.pass.cpp | 188 +++++ .../flat.multiset.cons/move_alloc.pass.cpp | 79 ++ .../flat.multiset.cons/move_assign.pass.cpp | 239 ++++++ .../flat.multiset.cons/pmr.pass.cpp | 326 +++++++ .../flat.multiset.cons/range.pass.cpp | 176 ++++ .../sorted_container.pass.cpp | 147 ++++ .../sorted_initializer_list.pass.cpp | 158 ++++ .../sorted_iter_iter.pass.cpp | 160 ++++ .../flat.multiset.erasure/erase_if.pass.cpp | 113 +++ .../erase_if_exceptions.pass.cpp | 132 +++ .../flat.multiset.iterators/iterator.pass.cpp | 98 +++ .../iterator_comparison.pass.cpp | 158 ++++ ...rator_concept_conformance.compile.pass.cpp | 77 ++ ...range_concept_conformance.compile.pass.cpp | 52 ++ .../reverse_iterator.pass.cpp | 92 ++ .../flat.multiset.modifiers/clear.pass.cpp | 74 ++ .../flat.multiset.modifiers/emplace.pass.cpp | 136 +++ .../emplace_hint.pass.cpp | 241 ++++++ .../erase_iter.pass.cpp | 114 +++ .../erase_iter_iter.pass.cpp | 98 +++ .../erase_key.pass.cpp | 100 +++ .../erase_key_transparent.pass.cpp | 165 ++++ .../flat.multiset.modifiers/extract.pass.cpp | 102 +++ .../insert_cv.pass.cpp | 85 ++ .../insert_initializer_list.pass.cpp | 
90 ++ .../insert_iter_cv.pass.cpp | 86 ++ .../insert_iter_iter.pass.cpp | 94 +++ .../insert_iter_rv.pass.cpp | 88 ++ .../insert_range.pass.cpp | 100 +++ .../insert_rv.pass.cpp | 92 ++ .../insert_sorted_initializer_list.pass.cpp | 67 ++ .../insert_sorted_iter_iter.pass.cpp | 82 ++ .../flat.multiset.modifiers/replace.pass.cpp | 88 ++ .../swap_exception.pass.cpp | 61 ++ .../swap_free.pass.cpp | 98 +++ .../swap_member.pass.cpp | 96 +++ .../flat.multiset.observers/comp.pass.cpp | 76 ++ .../contains.pass.cpp | 80 ++ .../contains_transparent.pass.cpp | 91 ++ .../flat.multiset.operations/count.pass.cpp | 80 ++ .../count_transparent.pass.cpp | 90 ++ .../equal_range.pass.cpp | 88 ++ .../equal_range_transparent.pass.cpp | 122 +++ .../flat.multiset.operations/find.pass.cpp | 64 ++ .../find_transparent.pass.cpp | 110 +++ .../lower_bound.pass.cpp | 80 ++ .../lower_bound_transparent.pass.cpp | 116 +++ .../upper_bound.pass.cpp | 81 ++ .../upper_bound_transparent.pass.cpp | 114 +++ .../flat.multiset/helpers.h | 119 +++ .../flat.multiset/incomplete_type.pass.cpp | 36 + .../flat.multiset/op_compare.pass.cpp | 105 +++ .../flat.multiset/types.compile.pass.cpp | 94 +++ .../container.adaptors/flat.set/helpers.h | 185 +--- .../container.adaptors/flat_helpers.h | 184 ++++ .../flat_set.version.compile.pass.cpp | 32 +- .../version.version.compile.pass.cpp | 32 +- .../generate_feature_test_macro_components.py | 1 - libcxx/utils/libcxx/test/modules.py | 2 +- 92 files changed, 9029 insertions(+), 614 deletions(-) create mode 100644 libcxx/include/__flat_set/flat_multiset.h create mode 100644 libcxx/include/__flat_set/utils.h create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.multiset/assert.sorted_unique.pass.cpp create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.multiset/iterator.compile.pass.cpp create mode 100644 libcxx/test/libcxx/diagnostics/flat_multiset.nodiscard.verify.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct_pmr.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/pmr.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_concept_conformance.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/range_concept_conformance.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_exception.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/incomplete_type.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multiset/types.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat_helpers.h diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 9015ccb18dddf..bf25b34ba9159 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -332,7 +332,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_flat_map`` ``202207L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_flat_set`` *unimplemented* + ``__cpp_lib_flat_set`` ``202207L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_format_ranges`` ``202207L`` ---------------------------------------------------------- ----------------- diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 7af109ddc8657..1f479d19e4772 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -43,8 +43,8 @@ Implemented Papers - P1361R2: Integration of chrono with text formatting (`Github `__) - P2255R2: A type trait to detect reference binding to temporary (implemented the type traits only) (`Github `__) - P2562R1: ``constexpr`` Stable Sorting (`Github `__) -- P1222R4: A Standard ``flat_set`` is partially implemented and ``flat_set`` is provided (`Github `__) - P0472R3: Put std::monostate in (`Github `__) +- P1222R4: A Standard ``flat_set`` (`Github `__) Improvements and New Features ----------------------------- diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 923d8bf9341d0..c26363bcda796 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -54,7 +54,7 @@ "`P0009R18 `__","mdspan: A Non-Owning Multidimensional Array Reference","2022-07 (Virtual)","|Complete|","18","" "`P0429R9 `__","A Standard ``flat_map``","2022-07 (Virtual)","|Complete|","20","" "`P1169R4 `__","``static operator()``","2022-07 (Virtual)","|Complete|","16","" -"`P1222R4 `__","A Standard ``flat_set``","2022-07 (Virtual)","|In progress|","","" +"`P1222R4 `__","A Standard ``flat_set``","2022-07 (Virtual)","|Complete|","21","" "`P1223R5 `__","``ranges::find_last()``, ``ranges::find_last_if()``, and ``ranges::find_last_if_not()``","2022-07 (Virtual)","|Complete|","19","" "`P1467R9 `__","Extended ``floating-point`` types and standard names","2022-07 (Virtual)","","","" "`P1642R11 `__","Freestanding ``[utilities]``, ``[ranges]``, and ``[iterators]``","2022-07 (Virtual)","","","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 7b09beb74b173..0429c90b01a76 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -370,8 +370,10 @@ set(files __flat_map/sorted_equivalent.h __flat_map/sorted_unique.h __flat_map/utils.h + __flat_set/flat_multiset.h __flat_set/flat_set.h __flat_set/ra_iterator.h + __flat_set/utils.h __format/buffer.h __format/concepts.h __format/container_adaptor.h diff --git a/libcxx/include/__flat_set/flat_multiset.h b/libcxx/include/__flat_set/flat_multiset.h new file mode 100644 index 0000000000000..0fed377b25e5a --- /dev/null +++ b/libcxx/include/__flat_set/flat_multiset.h @@ -0,0 +1,792 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___FLAT_MAP_FLAT_MULTISET_H +#define _LIBCPP___FLAT_MAP_FLAT_MULTISET_H + +#include <__algorithm/equal_range.h> +#include <__algorithm/lexicographical_compare_three_way.h> +#include <__algorithm/lower_bound.h> +#include <__algorithm/min.h> +#include <__algorithm/ranges_equal.h> +#include <__algorithm/ranges_inplace_merge.h> +#include <__algorithm/ranges_is_sorted.h> +#include <__algorithm/ranges_sort.h> +#include <__algorithm/ranges_unique.h> +#include <__algorithm/remove_if.h> +#include <__algorithm/upper_bound.h> +#include <__assert> +#include <__compare/synth_three_way.h> +#include <__concepts/convertible_to.h> +#include <__concepts/swappable.h> +#include <__config> +#include <__cstddef/byte.h> +#include <__cstddef/ptrdiff_t.h> +#include <__flat_map/key_value_iterator.h> +#include <__flat_map/sorted_equivalent.h> +#include <__flat_set/ra_iterator.h> +#include <__flat_set/utils.h> +#include <__functional/invoke.h> +#include <__functional/is_transparent.h> +#include <__functional/operations.h> +#include <__fwd/vector.h> +#include <__iterator/concepts.h> +#include <__iterator/distance.h> +#include <__iterator/iterator_traits.h> +#include <__iterator/prev.h> +#include <__iterator/ranges_iterator_traits.h> +#include <__iterator/reverse_iterator.h> +#include <__memory/allocator_traits.h> +#include <__memory/uses_allocator.h> +#include <__memory/uses_allocator_construction.h> +#include <__ranges/access.h> +#include <__ranges/concepts.h> +#include <__ranges/container_compatible_range.h> +#include <__ranges/drop_view.h> +#include <__ranges/from_range.h> +#include <__ranges/ref_view.h> +#include <__ranges/size.h> +#include <__ranges/subrange.h> +#include <__ranges/zip_view.h> +#include <__type_traits/conjunction.h> +#include <__type_traits/container_traits.h> +#include <__type_traits/invoke.h> +#include <__type_traits/is_allocator.h> +#include <__type_traits/is_nothrow_constructible.h> +#include <__type_traits/is_same.h> +#include <__type_traits/maybe_const.h> +#include <__utility/as_const.h> +#include <__utility/exception_guard.h> +#include <__utility/move.h> +#include <__utility/pair.h> +#include <__utility/scope_guard.h> +#include <__vector/vector.h> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +#if _LIBCPP_STD_VER >= 23 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template , class _KeyContainer = vector<_Key>> +class flat_multiset { + template + friend class flat_multiset; + + friend __flat_set_utils; + + static_assert(is_same_v<_Key, typename _KeyContainer::value_type>); + static_assert(!is_same_v<_KeyContainer, std::vector>, "vector is not a sequence container"); + +public: + // types + using key_type = _Key; + using value_type = _Key; + using key_compare = __type_identity_t<_Compare>; + using value_compare = _Compare; + using reference = value_type&; + using const_reference = const value_type&; + using size_type = typename _KeyContainer::size_type; + using difference_type = typename _KeyContainer::difference_type; + using iterator = __ra_iterator; + using const_iterator = iterator; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + using container_type = _KeyContainer; + +public: + // [flat.multiset.cons], constructors 
+ _LIBCPP_HIDE_FROM_ABI flat_multiset() noexcept(is_nothrow_default_constructible_v<_KeyContainer> && + is_nothrow_default_constructible_v<_Compare>) + : __keys_(), __compare_() {} + + _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset&) = default; + + // The copy/move constructors are not specified in the spec, which means they should be defaulted. + // However, the move constructor can potentially leave a moved-from object in an inconsistent + // state if an exception is thrown. + _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other) noexcept( + is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_Compare>) +# if _LIBCPP_HAS_EXCEPTIONS + try +# endif // _LIBCPP_HAS_EXCEPTIONS + : __keys_(std::move(__other.__keys_)), __compare_(std::move(__other.__compare_)) { + __other.clear(); +# if _LIBCPP_HAS_EXCEPTIONS + } catch (...) { + __other.clear(); + // gcc does not like the `throw` keyword in a conditionally noexcept function + if constexpr (!(is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_Compare>)) { + throw; + } +# endif // _LIBCPP_HAS_EXCEPTIONS + } + + _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const key_compare& __comp) : __keys_(), __compare_(__comp) {} + + _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(container_type __keys, const key_compare& __comp = key_compare()) + : __keys_(std::move(__keys)), __compare_(__comp) { + ranges::sort(__keys_, __compare_); + } + + _LIBCPP_HIDE_FROM_ABI + flat_multiset(sorted_equivalent_t, container_type __keys, const key_compare& __comp = key_compare()) + : __keys_(std::move(__keys)), __compare_(__comp) { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI + flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) + : __keys_(), __compare_(__comp) { + insert(__first, __last); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI flat_multiset( + sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) + : __keys_(__first, __last), __compare_(__comp) { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t __fr, _Range&& __rg) + : flat_multiset(__fr, std::forward<_Range>(__rg), key_compare()) {} + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multiset(__comp) { + insert_range(std::forward<_Range>(__rg)); + } + + _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list __il, const key_compare& __comp = key_compare()) + : flat_multiset(__il.begin(), __il.end(), __comp) {} + + _LIBCPP_HIDE_FROM_ABI + flat_multiset(sorted_equivalent_t, initializer_list __il, const key_compare& __comp = key_compare()) + : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp) {} + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI explicit flat_multiset(const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc)), __compare_() {} + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(const key_compare& __comp, const _Allocator& __alloc) + : 
__keys_(std::make_obj_using_allocator(__alloc)), __compare_(__comp) {} + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(const container_type& __keys, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_() { + ranges::sort(__keys_, __compare_); + } + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI + flat_multiset(const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_(__comp) { + ranges::sort(__keys_, __compare_); + } + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, const container_type& __keys, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_() { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); + } + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI + flat_multiset(sorted_equivalent_t, const container_type& __keys, const key_compare& __comp, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc, __keys)), __compare_(__comp) { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); + } + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(const flat_multiset& __other, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc, __other.__keys_)), + __compare_(__other.__compare_) {} + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(flat_multiset&& __other, const _Allocator& __alloc) +# if _LIBCPP_HAS_EXCEPTIONS + try +# endif // _LIBCPP_HAS_EXCEPTIONS + : __keys_(std::make_obj_using_allocator(__alloc, std::move(__other.__keys_))), + __compare_(std::move(__other.__compare_)) { + __other.clear(); +# if _LIBCPP_HAS_EXCEPTIONS + } catch (...) 
{ + __other.clear(); + throw; +# endif // _LIBCPP_HAS_EXCEPTIONS + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) + _LIBCPP_HIDE_FROM_ABI flat_multiset(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc)), __compare_() { + insert(__first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) + _LIBCPP_HIDE_FROM_ABI + flat_multiset(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc)), __compare_(__comp) { + insert(__first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) + _LIBCPP_HIDE_FROM_ABI + flat_multiset(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc, __first, __last)), __compare_() { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && uses_allocator::value) + _LIBCPP_HIDE_FROM_ABI + flat_multiset(sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc, __first, __last)), __compare_(__comp) { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys_, __compare_), "Key container is not sorted"); + } + + template <_ContainerCompatibleRange _Range, class _Allocator> + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc)), __compare_() { + insert_range(std::forward<_Range>(__rg)); + } + + template <_ContainerCompatibleRange _Range, class _Allocator> + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + : __keys_(std::make_obj_using_allocator(__alloc)), __compare_(__comp) { + insert_range(std::forward<_Range>(__rg)); + } + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(initializer_list __il, const _Allocator& __alloc) + : flat_multiset(__il.begin(), __il.end(), __alloc) {} + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI + flat_multiset(initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) + : flat_multiset(__il.begin(), __il.end(), __comp, __alloc) {} + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) + : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} + + template + requires uses_allocator::value + _LIBCPP_HIDE_FROM_ABI flat_multiset( + sorted_equivalent_t, initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) + : flat_multiset(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} + + _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(initializer_list __il) { + clear(); + insert(__il); + return *this; + } + + // copy/move assignment are not specified in the spec (defaulted) + // but move assignment can potentially leave moved from object in an inconsistent + // state if an exception is thrown + _LIBCPP_HIDE_FROM_ABI 
flat_multiset& operator=(const flat_multiset&) = default; + + _LIBCPP_HIDE_FROM_ABI flat_multiset& operator=(flat_multiset&& __other) noexcept( + is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_Compare>) { + auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); + auto __clear_self_guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + __keys_ = std::move(__other.__keys_); + __compare_ = std::move(__other.__compare_); + __clear_self_guard.__complete(); + return *this; + } + + // iterators + _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { return iterator(std::as_const(__keys_).begin()); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { return const_iterator(__keys_.begin()); } + _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { return iterator(std::as_const(__keys_).end()); } + _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { return const_iterator(__keys_.end()); } + + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + + // capacity + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __keys_.empty(); } + _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __keys_.size(); } + _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { return __keys_.max_size(); } + + // [flat.multiset.modifiers], modifiers + template + requires is_constructible_v + _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + if constexpr (sizeof...(__args) == 1 && (is_same_v, _Key> && ...)) { + return __emplace(std::forward<_Args>(__args)...); + } else { + return __emplace(_Key(std::forward<_Args>(__args)...)); + } + } + + template + requires is_constructible_v + _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... 
__args) { + if constexpr (sizeof...(__args) == 1 && (is_same_v, _Key> && ...)) { + return __emplace_hint(std::move(__hint), std::forward<_Args>(__args)...); + } else { + return __emplace_hint(std::move(__hint), _Key(std::forward<_Args>(__args)...)); + } + } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + + _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + return emplace_hint(__hint, __x); + } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + return emplace_hint(__hint, std::move(__x)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { + __reserve(__last - __first); + } + __append_sort_merge(std::move(__first), std::move(__last)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { + __reserve(__last - __first); + } + + __append_sort_merge(std::move(__first), std::move(__last)); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + if constexpr (ranges::sized_range<_Range>) { + __reserve(ranges::size(__range)); + } + + __append_sort_merge(std::forward<_Range>(__range)); + } + + _LIBCPP_HIDE_FROM_ABI void insert(initializer_list __il) { insert(__il.begin(), __il.end()); } + + _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list __il) { + insert(sorted_equivalent, __il.begin(), __il.end()); + } + + _LIBCPP_HIDE_FROM_ABI container_type extract() && { + auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); + auto __ret = std::move(__keys_); + return __ret; + } + + _LIBCPP_HIDE_FROM_ABI void replace(container_type&& __keys) { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(ranges::is_sorted(__keys, __compare_), "Key container is not sorted"); + auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + __keys_ = std::move(__keys); + __guard.__complete(); + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + auto __key_iter = __keys_.erase(__position.__base()); + __on_failure.__complete(); + return iterator(__key_iter); + } + + // The following overload is the same as the iterator overload + // iterator erase(const_iterator __position); + + _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + auto [__first, __last] = equal_range(__x); + auto __res = __last - __first; + erase(__first, __last); + return __res; + } + + template + requires(__is_transparent_v<_Compare> && !is_convertible_v<_Kp &&, iterator> && + !is_convertible_v<_Kp &&, const_iterator>) + _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + auto [__first, __last] = equal_range(__x); + auto __res = __last - __first; + erase(__first, __last); + return __res; + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + auto __key_it = __keys_.erase(__first.__base(), 
__last.__base()); + __on_failure.__complete(); + return iterator(std::move(__key_it)); + } + + _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __y) noexcept { + // warning: The spec has unconditional noexcept, which means that + // if any of the following functions throw an exception, + // std::terminate will be called + // This is discussed in P3567, which hasn't been voted on yet. + ranges::swap(__compare_, __y.__compare_); + ranges::swap(__keys_, __y.__keys_); + } + + _LIBCPP_HIDE_FROM_ABI void clear() noexcept { __keys_.clear(); } + + // observers + _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return __compare_; } + + // map operations + _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + + _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + return __find_impl(*this, __x); + } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + return __find_impl(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + auto [__first, __last] = equal_range(__x); + return __last - __first; + } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + auto [__first, __last] = equal_range(__x); + return __last - __first; + } + + _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + return find(__x) != end(); + } + + _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { + const auto& __keys = __keys_; + return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); + } + + _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); + } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + const auto& __keys = __keys_; + return iterator(std::lower_bound(__keys.begin(), __keys.end(), __x, __compare_)); + } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + return const_iterator(std::lower_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); + } + + _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { + const auto& __keys = __keys_; + return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); + } + + _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const { + return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); + } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + const auto& __keys = __keys_; + return iterator(std::upper_bound(__keys.begin(), __keys.end(), __x, __compare_)); + } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + return const_iterator(std::upper_bound(__keys_.begin(), __keys_.end(), __x, __compare_)); + } + + _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) { + 
return __equal_range_impl(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) const { + return __equal_range_impl(*this, __x); + } + + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) { + return __equal_range_impl(*this, __x); + } + template + requires __is_transparent_v<_Compare> + _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) const { + return __equal_range_impl(*this, __x); + } + + friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multiset& __x, const flat_multiset& __y) { + return ranges::equal(__x, __y); + } + + friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multiset& __x, const flat_multiset& __y) { + return std::lexicographical_compare_three_way( + __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + } + + friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multiset& __x, flat_multiset& __y) noexcept { __x.swap(__y); } + +private: + template + _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_Args&&... __args) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + size_type __old_size = size(); + __flat_set_utils::__append(*this, std::forward<_Args>(__args)...); + if constexpr (!_WasSorted) { + ranges::sort(__keys_.begin() + __old_size, __keys_.end(), __compare_); + } else { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( + ranges::is_sorted(__keys_ | ranges::views::drop(__old_size)), "Key container is not sorted"); + } + ranges::inplace_merge(__keys_.begin(), __keys_.begin() + __old_size, __keys_.end(), __compare_); + __on_failure.__complete(); + } + + template + _LIBCPP_HIDE_FROM_ABI iterator __emplace(_Kp&& __key) { + auto __it = upper_bound(__key); + return __flat_set_utils::__emplace_exact_pos(*this, __it, std::forward<_Kp>(__key)); + } + + template + _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { + auto __prev_larger = __hint != cbegin() && __compare_(__key, *std::prev(__hint)); + auto __next_smaller = __hint != cend() && __compare_(*__hint, __key); + + if (!__prev_larger && !__next_smaller) [[likely]] { + // hint correct, just use exact hint iterator + } else if (__prev_larger && !__next_smaller) { + // the hint position is more to the right than the key should have been. + // we want to emplace the element to a position as right as possible + // e.g. Insert new element "2" in the following range + // 1, 1, 2, 2, 2, 3, 4, 6 + // ^ + // | + // hint + // We want to insert "2" after the last existing "2" + __hint = std::upper_bound(begin(), __hint, __key, __compare_); + } else { + _LIBCPP_ASSERT_INTERNAL(!__prev_larger && __next_smaller, "this means that the multiset is not sorted"); + + // the hint position is more to the left than the key should have been. 
+ // we want to emplace the element to a position as left as possible + // 1, 1, 2, 2, 2, 3, 4, 6 + // ^ + // | + // hint + // We want to insert "2" before the first existing "2" + __hint = std::lower_bound(__hint, end(), __key, __compare_); + } + return __flat_set_utils::__emplace_exact_pos(*this, __hint, std::forward<_Kp>(__key)); + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + auto __it = __self.lower_bound(__key); + auto __last = __self.end(); + if (__it == __last || __self.__compare_(__key, *__it)) { + return __last; + } + return __it; + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + using __iter = _If>, const_iterator, iterator>; + auto [__key_first, __key_last] = + std::equal_range(__self.__keys_.begin(), __self.__keys_.end(), __key, __self.__compare_); + return std::make_pair(__iter(__key_first), __iter(__key_last)); + } + + _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) { + if constexpr (requires { __keys_.reserve(__size); }) { + __keys_.reserve(__size); + } + } + + template + friend typename flat_multiset<_Key2, _Compare2, _KeyContainer2>::size_type + erase_if(flat_multiset<_Key2, _Compare2, _KeyContainer2>&, _Predicate); + + _KeyContainer __keys_; + _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; + + struct __key_equiv { + _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); + } + key_compare __comp_; + }; +}; + +template > + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + is_invocable_v) +flat_multiset(_KeyContainer, _Compare = _Compare()) + -> flat_multiset; + +template + requires(uses_allocator_v<_KeyContainer, _Allocator> && !__is_allocator<_KeyContainer>::value) +flat_multiset(_KeyContainer, _Allocator) + -> flat_multiset, _KeyContainer>; + +template + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + uses_allocator_v<_KeyContainer, _Allocator> && + is_invocable_v) +flat_multiset(_KeyContainer, _Compare, _Allocator) + -> flat_multiset; + +template > + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + is_invocable_v) +flat_multiset(sorted_equivalent_t, _KeyContainer, _Compare = _Compare()) + -> flat_multiset; + +template + requires(uses_allocator_v<_KeyContainer, _Allocator> && !__is_allocator<_KeyContainer>::value) +flat_multiset(sorted_equivalent_t, _KeyContainer, _Allocator) + -> flat_multiset, _KeyContainer>; + +template + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + uses_allocator_v<_KeyContainer, _Allocator> && + is_invocable_v) +flat_multiset(sorted_equivalent_t, _KeyContainer, _Compare, _Allocator) + -> flat_multiset; + +template >> + requires(__has_input_iterator_category<_InputIterator>::value && !__is_allocator<_Compare>::value) +flat_multiset(_InputIterator, _InputIterator, _Compare = _Compare()) + -> flat_multiset<__iter_value_type<_InputIterator>, _Compare>; + +template >> + requires(__has_input_iterator_category<_InputIterator>::value && !__is_allocator<_Compare>::value) +flat_multiset(sorted_equivalent_t, _InputIterator, _InputIterator, _Compare = _Compare()) + -> flat_multiset<__iter_value_type<_InputIterator>, _Compare>; + +template >, + class _Allocator = allocator>, + class 
= __enable_if_t::value && __is_allocator<_Allocator>::value>> +flat_multiset(from_range_t, _Range&&, _Compare = _Compare(), _Allocator = _Allocator()) -> flat_multiset< + ranges::range_value_t<_Range>, + _Compare, + vector, __allocator_traits_rebind_t<_Allocator, ranges::range_value_t<_Range>>>>; + +template ::value>> +flat_multiset(from_range_t, _Range&&, _Allocator) -> flat_multiset< + ranges::range_value_t<_Range>, + less>, + vector, __allocator_traits_rebind_t<_Allocator, ranges::range_value_t<_Range>>>>; + +template > + requires(!__is_allocator<_Compare>::value) +flat_multiset(initializer_list<_Key>, _Compare = _Compare()) -> flat_multiset<_Key, _Compare>; + +template > + requires(!__is_allocator<_Compare>::value) +flat_multiset(sorted_equivalent_t, initializer_list<_Key>, _Compare = _Compare()) -> flat_multiset<_Key, _Compare>; + +template +struct uses_allocator, _Allocator> + : bool_constant > {}; + +template +_LIBCPP_HIDE_FROM_ABI typename flat_multiset<_Key, _Compare, _KeyContainer>::size_type +erase_if(flat_multiset<_Key, _Compare, _KeyContainer>& __flat_multiset, _Predicate __pred) { + auto __guard = std::__make_exception_guard([&] { __flat_multiset.clear(); }); + auto __it = + std::remove_if(__flat_multiset.__keys_.begin(), __flat_multiset.__keys_.end(), [&](const auto& __e) -> bool { + return static_cast(__pred(__e)); + }); + auto __res = __flat_multiset.__keys_.end() - __it; + __flat_multiset.__keys_.erase(__it, __flat_multiset.__keys_.end()); + __guard.__complete(); + return __res; +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___FLAT_MAP_FLAT_MULTISET_H diff --git a/libcxx/include/__flat_set/flat_set.h b/libcxx/include/__flat_set/flat_set.h index dca06c7236e73..a87496bb9916e 100644 --- a/libcxx/include/__flat_set/flat_set.h +++ b/libcxx/include/__flat_set/flat_set.h @@ -28,6 +28,7 @@ #include <__cstddef/ptrdiff_t.h> #include <__flat_map/sorted_unique.h> #include <__flat_set/ra_iterator.h> +#include <__flat_set/utils.h> #include <__functional/invoke.h> #include <__functional/is_transparent.h> #include <__functional/operations.h> @@ -82,6 +83,8 @@ class flat_set { template friend class flat_set; + friend __flat_set_utils; + static_assert(is_same_v<_Key, typename _KeyContainer::value_type>); static_assert(!is_same_v<_KeyContainer, std::vector>, "vector is not a sequence container"); @@ -619,31 +622,11 @@ class flat_set { __keys_.erase(__dup_start, __keys_.end()); } - template - _LIBCPP_HIDE_FROM_ABI void __append(_InputIterator __first, _InputIterator __last) { - __keys_.insert(__keys_.end(), std::move(__first), std::move(__last)); - } - - template - _LIBCPP_HIDE_FROM_ABI void __append(_Range&& __rng) { - if constexpr (requires { __keys_.insert_range(__keys_.end(), std::forward<_Range>(__rng)); }) { - // C++23 Sequence Container should have insert_range member function - // Note that not all Sequence Containers provide append_range. - __keys_.insert_range(__keys_.end(), std::forward<_Range>(__rng)); - } else if constexpr (ranges::common_range<_Range>) { - __keys_.insert(__keys_.end(), ranges::begin(__rng), ranges::end(__rng)); - } else { - for (auto&& __x : __rng) { - __keys_.insert(__keys_.end(), std::forward(__x)); - } - } - } - template _LIBCPP_HIDE_FROM_ABI void __append_sort_merge_unique(_Args&&... 
__args) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); size_type __old_size = size(); - __append(std::forward<_Args>(__args)...); + __flat_set_utils::__append(*this, std::forward<_Args>(__args)...); if (size() != __old_size) { if constexpr (!_WasSorted) { ranges::sort(__keys_.begin() + __old_size, __keys_.end(), __compare_); @@ -680,23 +663,11 @@ class flat_set { return std::make_pair(__iter(__it), __iter(std::next(__it))); } - template - _LIBCPP_HIDE_FROM_ABI iterator __emplace_exact_pos(const_iterator __it, _KeyArg&& __key) { - auto __on_failure = std::__make_exception_guard([&]() noexcept { - if constexpr (!__container_traits<_KeyContainer>::__emplacement_has_strong_exception_safety_guarantee) { - clear() /* noexcept */; - } - }); - auto __key_it = __keys_.emplace(__it.__base(), std::forward<_KeyArg>(__key)); - __on_failure.__complete(); - return iterator(std::move(__key_it)); - } - template _LIBCPP_HIDE_FROM_ABI pair __emplace(_Kp&& __key) { auto __it = lower_bound(__key); if (__it == end() || __compare_(__key, *__it)) { - return pair(__emplace_exact_pos(__it, std::forward<_Kp>(__key)), true); + return pair(__flat_set_utils::__emplace_exact_pos(*this, __it, std::forward<_Kp>(__key)), true); } else { return pair(std::move(__it), false); } @@ -717,7 +688,7 @@ class flat_set { _LIBCPP_HIDE_FROM_ABI iterator __emplace_hint(const_iterator __hint, _Kp&& __key) { if (__is_hint_correct(__hint, __key)) { if (__hint == cend() || __compare_(__key, *__hint)) { - return __emplace_exact_pos(__hint, std::forward<_Kp>(__key)); + return __flat_set_utils::__emplace_exact_pos(*this, __hint, std::forward<_Kp>(__key)); } else { // we already have an equal key return __hint; diff --git a/libcxx/include/__flat_set/utils.h b/libcxx/include/__flat_set/utils.h new file mode 100644 index 0000000000000..ed3b4c48580fb --- /dev/null +++ b/libcxx/include/__flat_set/utils.h @@ -0,0 +1,78 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___FLAT_SET_UTILS_H +#define _LIBCPP___FLAT_SET_UTILS_H + +#include <__config> +#include <__ranges/access.h> +#include <__ranges/concepts.h> +#include <__type_traits/container_traits.h> +#include <__type_traits/decay.h> +#include <__utility/exception_guard.h> +#include <__utility/forward.h> +#include <__utility/move.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +#if _LIBCPP_STD_VER >= 23 + +_LIBCPP_BEGIN_NAMESPACE_STD + +// These utilities are defined in a class instead of a namespace so that this class can be befriended more easily. +struct __flat_set_utils { + // Emplace a key into a flat_{multi}set, at the exact position that + // __it point to, assuming that the key is not already present in the set. + // When an exception is thrown during the emplacement, the function will clear the set if the container does not + // have strong exception safety guarantee on emplacement. 
diff --git a/libcxx/include/__flat_set/utils.h b/libcxx/include/__flat_set/utils.h
new file mode 100644
index 0000000000000..ed3b4c48580fb
--- /dev/null
+++ b/libcxx/include/__flat_set/utils.h
@@ -0,0 +1,78 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//

+#ifndef _LIBCPP___FLAT_SET_UTILS_H
+#define _LIBCPP___FLAT_SET_UTILS_H
+
+#include <__config>
+#include <__ranges/access.h>
+#include <__ranges/concepts.h>
+#include <__type_traits/container_traits.h>
+#include <__type_traits/decay.h>
+#include <__utility/exception_guard.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if _LIBCPP_STD_VER >= 23
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// These utilities are defined in a class instead of a namespace so that this class can be befriended more easily.
+struct __flat_set_utils {
+  // Emplace a key into a flat_{multi}set, at the exact position that
+  // __it points to, assuming that the key is not already present in the set.
+  // When an exception is thrown during the emplacement, the function clears the set if the container
+  // does not provide the strong exception-safety guarantee on emplacement.
+  template <class _Set, class _Iter, class _KeyArg>
+  _LIBCPP_HIDE_FROM_ABI static auto __emplace_exact_pos(_Set& __set, _Iter&& __iter, _KeyArg&& __key) {
+    using _KeyContainer = typename decay_t<_Set>::container_type;
+    auto __on_failure = std::__make_exception_guard([&]() noexcept {
+      if constexpr (!__container_traits<_KeyContainer>::__emplacement_has_strong_exception_safety_guarantee) {
+        __set.clear() /* noexcept */;
+      }
+    });
+    auto __key_it = __set.__keys_.emplace(__iter.__base(), std::forward<_KeyArg>(__key));
+    __on_failure.__complete();
+    return typename decay_t<_Set>::iterator(std::move(__key_it));
+  }
+
+  template <class _Set, class _InputIterator>
+  _LIBCPP_HIDE_FROM_ABI static void __append(_Set& __set, _InputIterator __first, _InputIterator __last) {
+    __set.__keys_.insert(__set.__keys_.end(), std::move(__first), std::move(__last));
+  }
+
+  template <class _Set, class _Range>
+  _LIBCPP_HIDE_FROM_ABI static void __append(_Set& __set, _Range&& __rng) {
+    if constexpr (requires { __set.__keys_.insert_range(__set.__keys_.end(), std::forward<_Range>(__rng)); }) {
+      // A C++23 sequence container should have an insert_range member function.
+      // Note that not all sequence containers provide append_range.
+      __set.__keys_.insert_range(__set.__keys_.end(), std::forward<_Range>(__rng));
+    } else if constexpr (ranges::common_range<_Range>) {
+      __set.__keys_.insert(__set.__keys_.end(), ranges::begin(__rng), ranges::end(__rng));
+    } else {
+      for (auto&& __x : __rng) {
+        __set.__keys_.insert(__set.__keys_.end(), std::forward<decltype(__x)>(__x));
+      }
+    }
+  }
+};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___FLAT_SET_UTILS_H
diff --git a/libcxx/include/flat_set b/libcxx/include/flat_set
index d03645fafafdb..ebbb3a0247f3e 100644
--- a/libcxx/include/flat_set
+++ b/libcxx/include/flat_set
@@ -31,6 +31,21 @@ namespace std {
   template<class Key, class Compare, class KeyContainer, class Predicate>
     typename flat_set<Key, Compare, KeyContainer>::size_type
       erase_if(flat_set<Key, Compare, KeyContainer>& c, Predicate pred);
+
+  // [flat.multiset], class template flat_multiset
+  template<class Key, class Compare = less<Key>, class KeyContainer = vector<Key>>
+    class flat_multiset;
+
+  struct sorted_equivalent_t { explicit sorted_equivalent_t() = default; };
+  inline constexpr sorted_equivalent_t sorted_equivalent{};
+
+  template<class Key, class Compare, class KeyContainer, class Allocator>
+    struct uses_allocator<flat_multiset<Key, Compare, KeyContainer>, Allocator>;
+
+  // [flat.multiset.erasure], erasure for flat_multiset
+  template<class Key, class Compare, class KeyContainer, class Predicate>
+    typename flat_multiset<Key, Compare, KeyContainer>::size_type
+      erase_if(flat_multiset<Key, Compare, KeyContainer>& c, Predicate pred);
 }
 */
 
@@ -40,7 +55,9 @@ namespace std {
 # include <__config>
 
 # if _LIBCPP_STD_VER >= 23
+#  include <__flat_map/sorted_equivalent.h>
 #  include <__flat_map/sorted_unique.h>
+#  include <__flat_set/flat_multiset.h>
 #  include <__flat_set/flat_set.h>
 # endif
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 324931b1bb078..87164c74c9d99 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1305,13 +1305,16 @@ module std [system] {
   module flat_set {
     module flat_set {
       header "__flat_set/flat_set.h"
+      header "__flat_set/flat_multiset.h"
       export std.vector.vector
       export std.vector.fwd
     }
     module ra_iterator { header "__flat_set/ra_iterator.h" }
+    module utils { header "__flat_set/utils.h" }
 
     header "flat_set"
     export std.flat_map.sorted_unique
+    export std.flat_map.sorted_equivalent
     export *
   }
diff --git a/libcxx/include/version b/libcxx/include/version
index 49102716c3605..0a19ff9ad8187 100644
--- a/libcxx/include/version
+++ b/libcxx/include/version
@@ -486,7 +486,7 @@ __cpp_lib_void_t 201411L
 # define __cpp_lib_containers_ranges 202202L
 # define __cpp_lib_expected 202211L
 # define __cpp_lib_flat_map 202207L
-// # define __cpp_lib_flat_set 202207L
+# define __cpp_lib_flat_set 202207L
 # define __cpp_lib_format_ranges 202207L
 // # define __cpp_lib_formatters 202302L
 # define __cpp_lib_forward_like 202207L
diff --git a/libcxx/modules/std/flat_set.inc b/libcxx/modules/std/flat_set.inc
index 3f2c6e09a0ebe..51f39b75458b9 100644
--- a/libcxx/modules/std/flat_set.inc
+++ b/libcxx/modules/std/flat_set.inc
@@ -19,13 +19,11 @@ export namespace std {
   // [flat.set.erasure], erasure for flat_set
   using std::erase_if;
 
-#endif // _LIBCPP_STD_VER >= 23
-#if 0
   // [flat.multiset], class template flat_multiset
   using std::flat_multiset;
 
   using std::sorted_equivalent;
   using std::sorted_equivalent_t;
-#endif
+#endif // _LIBCPP_STD_VER >= 23
 } // namespace std
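With the feature-test macro now defined and the `std` module exporting `flat_multiset`, `sorted_equivalent`, and `sorted_equivalent_t`, the API is user-visible. The hardened-mode test added next drives the `sorted_equivalent` precondition; for orientation, here is a small user-level sketch of the contract being asserted (standard C++23 API; the values are illustrative):

```cpp
#include <cassert>
#include <flat_set>

int main() {
  // OK: {2, 3, 3, 4} really is sorted (duplicates allowed), so the container
  // skips re-sorting, and the debug-hardened precondition check passes.
  std::flat_multiset<int> good(std::sorted_equivalent, {2, 3, 3, 4});
  assert(good.count(3) == 2);

  // Precondition violation: {4, 2, 3} is not sorted. Under libc++'s debug
  // hardening this aborts with "Key container is not sorted"; in other
  // build modes the behavior is undefined.
  // std::flat_multiset<int> bad(std::sorted_equivalent, {4, 2, 3});

  // Without the tag, the container sorts the input and keeps duplicates.
  std::flat_multiset<int> sorted_for_you({4, 2, 3, 3});
  assert(*sorted_for_you.begin() == 2);
  return 0;
}
```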
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/assert.sorted_unique.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/assert.sorted_unique.pass.cpp
new file mode 100644
index 0000000000000..54b07baaff27a
--- /dev/null
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/assert.sorted_unique.pass.cpp
@@ -0,0 +1,131 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: libcpp-hardening-mode=none
+// REQUIRES: libcpp-hardening-mode=debug
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// <flat_set>
+
+// flat_multiset(container_type __key_cont, const key_compare& __comp = key_compare())
+// flat_multiset(const container_type& __key_cont, const _Allocator& __alloc)
+// flat_multiset(const container_type& __key_cont, const key_compare& __comp, const _Allocator& __alloc)
+// void replace(container_type&& __key_cont)
+//
+
+#include <flat_set>
+#include <functional>
+#include <initializer_list>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  using M = std::flat_multiset<int>;
+
+  TEST_LIBCPP_ASSERT_FAILURE(([] { M m(std::sorted_equivalent, {4, 2, 3}); }()), "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] { M m(std::sorted_equivalent, {4, 2, 3}, std::less<int>{}); }()), "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> keys{4, 2, 3};
+        const std::allocator<int> alloc{};
+        M m(std::sorted_equivalent, keys, alloc);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> keys{4, 2, 3};
+        const std::allocator<int> alloc{};
+        const std::less<int> comp{};
+        M m(std::sorted_equivalent, keys, comp, alloc);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> v{4, 2, 3};
+        const std::less<int> comp{};
+        M m(std::sorted_equivalent, v.begin(), v.end(), comp);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> v{4, 2, 3};
+        const std::less<int> comp{};
+        const std::allocator<int> alloc{};
+        M m(std::sorted_equivalent, v.begin(), v.end(), comp, alloc);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> v{4, 2, 3};
+        const std::allocator<int> alloc{};
+        M m(std::sorted_equivalent, v.begin(), v.end(), alloc);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        std::initializer_list<int> v{4, 2, 3};
+        const std::less<int> comp{};
+        M m(std::sorted_equivalent, v, comp);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        std::initializer_list<int> v{4, 2, 3};
+        const std::less<int> comp{};
+        const std::allocator<int> alloc{};
+        M m(std::sorted_equivalent, v, comp, alloc);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        std::initializer_list<int> v{4, 2, 3};
+        const std::allocator<int> alloc{};
+        M m(std::sorted_equivalent, v, alloc);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> v{4, 2, 3};
+        M m;
+        m.insert(std::sorted_equivalent, v.begin(), v.end());
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        std::initializer_list<int> v{4, 2, 3};
+        M m;
+        m.insert(std::sorted_equivalent, v);
+      }()),
+      "Key container is not sorted");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        std::vector<int> keys{2, 1, 3};
+        M m;
+        m.replace(std::move(keys));
+      }()),
+      "Key container is not sorted");
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/iterator.compile.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/iterator.compile.pass.cpp
new file mode 100644
index 0000000000000..0954e42e52001
--- /dev/null
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multiset/iterator.compile.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// test that iterators from different types of flat_multiset are not compatible
+
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <type_traits>
+
+using Iter1 = std::flat_multiset<int>::iterator;
+using Iter2 = std::flat_multiset<double>::iterator;
+using Iter3 = std::flat_multiset<int, std::greater<int>>::iterator;
+using Iter4 = std::flat_multiset<int, std::less<int>, std::deque<int>>::iterator;
+
+static_assert(std::is_convertible_v<Iter1, Iter1>);
+static_assert(!std::is_convertible_v<Iter1, Iter2>);
+static_assert(!std::is_convertible_v<Iter1, Iter3>);
+static_assert(!std::is_convertible_v<Iter1, Iter4>);
+
+static_assert(!std::is_convertible_v<Iter2, Iter1>);
+static_assert(std::is_convertible_v<Iter2, Iter2>);
+static_assert(!std::is_convertible_v<Iter2, Iter3>);
+static_assert(!std::is_convertible_v<Iter2, Iter4>);
+
+static_assert(!std::is_convertible_v<Iter3, Iter1>);
+static_assert(!std::is_convertible_v<Iter3, Iter2>);
+static_assert(std::is_convertible_v<Iter3, Iter3>);
+static_assert(!std::is_convertible_v<Iter3, Iter4>);
+
+static_assert(!std::is_convertible_v<Iter4, Iter1>);
+static_assert(!std::is_convertible_v<Iter4, Iter2>);
+static_assert(!std::is_convertible_v<Iter4, Iter3>);
+static_assert(std::is_convertible_v<Iter4, Iter4>);
diff --git a/libcxx/test/libcxx/diagnostics/flat_multiset.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/flat_multiset.nodiscard.verify.cpp
new file mode 100644
index 0000000000000..a271a293e94e7
--- /dev/null
+++ b/libcxx/test/libcxx/diagnostics/flat_multiset.nodiscard.verify.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// [[nodiscard]] bool empty() const noexcept; + +#include + +void f() { + std::flat_multiset c; + c.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h index 8dbb85a6c0acf..b6b8fa061c840 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h +++ b/libcxx/test/std/containers/container.adaptors/flat.map/helpers.h @@ -15,6 +15,7 @@ #include #include +#include "../flat_helpers.h" #include "test_allocator.h" #include "test_macros.h" @@ -30,150 +31,6 @@ void check_invariant(const std::flat_map& m) { assert(std::adjacent_find(keys.begin(), keys.end(), key_equal) == keys.end()); } -struct StartsWith { - explicit StartsWith(char ch) : lower_(1, ch), upper_(1, ch + 1) {} - StartsWith(const StartsWith&) = delete; - void operator=(const StartsWith&) = delete; - struct Less { - using is_transparent = void; - bool operator()(const std::string& a, const std::string& b) const { return a < b; } - bool operator()(const StartsWith& a, const std::string& b) const { return a.upper_ <= b; } - bool operator()(const std::string& a, const StartsWith& b) const { return a < b.lower_; } - bool operator()(const StartsWith&, const StartsWith&) const { - assert(false); // should not be called - return false; - } - }; - -private: - std::string lower_; - std::string upper_; -}; - -template -struct CopyOnlyVector : std::vector { - using std::vector::vector; - - CopyOnlyVector(const CopyOnlyVector&) = default; - CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {} - CopyOnlyVector(CopyOnlyVector&& other, std::vector::allocator_type alloc) : CopyOnlyVector(other, alloc) {} - - CopyOnlyVector& operator=(const CopyOnlyVector&) = default; - CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(other); } -}; - -template -struct Transparent { - T t; - - operator T() const - requires ConvertibleToT - { - return t; - } -}; - -template -using ConvertibleTransparent = Transparent; - -template -using NonConvertibleTransparent = Transparent; - -struct TransparentComparator { - using is_transparent = void; - - bool* transparent_used = nullptr; - TransparentComparator() = default; - TransparentComparator(bool& used) : transparent_used(&used) {} - - template - bool operator()(const T& t, const Transparent& transparent) const { - if (transparent_used != nullptr) { - *transparent_used = true; - } - return t < transparent.t; - } - - template - bool operator()(const Transparent& transparent, const T& t) const { - if (transparent_used != nullptr) { - *transparent_used = true; - } - return transparent.t < t; - } - - template - bool operator()(const T& t1, const T& t2) const { - return t1 < t2; - } -}; - -struct NonTransparentComparator { - template - bool operator()(const T&, const Transparent&) const; - - template - bool operator()(const Transparent&, const T&) const; - - template - bool operator()(const T&, const T&) const; -}; - -struct NoDefaultCtr { - NoDefaultCtr() = delete; -}; - -#ifndef TEST_HAS_NO_EXCEPTIONS -template -struct EmplaceUnsafeContainer : std::vector { - using std::vector::vector; - - template - auto emplace(Args&&... 
args) -> decltype(std::declval>().emplace(std::forward(args)...)) { - if (this->size() > 1) { - auto it1 = this->begin(); - auto it2 = it1 + 1; - // messing up the container - std::iter_swap(it1, it2); - } - - throw 42; - } - - template - auto insert(Args&&... args) -> decltype(std::declval>().insert(std::forward(args)...)) { - if (this->size() > 1) { - auto it1 = this->begin(); - auto it2 = it1 + 1; - // messing up the container - std::iter_swap(it1, it2); - } - - throw 42; - } -}; - -template -struct ThrowOnEraseContainer : std::vector { - using std::vector::vector; - - template - auto erase(Args&&... args) -> decltype(std::declval>().erase(std::forward(args)...)) { - throw 42; - } -}; - -template -struct ThrowOnMoveContainer : std::vector { - using std::vector::vector; - - ThrowOnMoveContainer(ThrowOnMoveContainer&&) { throw 42; } - - ThrowOnMoveContainer& operator=(ThrowOnMoveContainer&&) { throw 42; } -}; - -#endif - template void test_emplace_exception_guarantee([[maybe_unused]] F&& emplace_function) { #ifndef TEST_HAS_NO_EXCEPTIONS @@ -363,32 +220,5 @@ void test_erase_exception_guarantee([[maybe_unused]] F&& erase_function) { } #endif } -class Moveable { - int int_; - double double_; - -public: - Moveable() : int_(0), double_(0) {} - Moveable(int i, double d) : int_(i), double_(d) {} - Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) { - x.int_ = -1; - x.double_ = -1; - } - Moveable& operator=(Moveable&& x) { - int_ = x.int_; - x.int_ = -1; - double_ = x.double_; - x.double_ = -1; - return *this; - } - - Moveable(const Moveable&) = delete; - Moveable& operator=(const Moveable&) = delete; - bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; } - bool operator<(const Moveable& x) const { return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_); } - - int get() const { return int_; } - bool moved() const { return int_ == -1; } -}; #endif // SUPPORT_FLAT_MAP_HELPERS_H diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h index 252e2454d497c..68d7f67a6669f 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h @@ -15,6 +15,7 @@ #include #include +#include "../flat_helpers.h" #include "test_allocator.h" #include "test_macros.h" @@ -25,150 +26,6 @@ void check_invariant(const std::flat_multimap& m) { assert(std::is_sorted(keys.begin(), keys.end(), m.key_comp())); } -struct StartsWith { - explicit StartsWith(char ch) : lower_(1, ch), upper_(1, ch + 1) {} - StartsWith(const StartsWith&) = delete; - void operator=(const StartsWith&) = delete; - struct Less { - using is_transparent = void; - bool operator()(const std::string& a, const std::string& b) const { return a < b; } - bool operator()(const StartsWith& a, const std::string& b) const { return a.upper_ <= b; } - bool operator()(const std::string& a, const StartsWith& b) const { return a < b.lower_; } - bool operator()(const StartsWith&, const StartsWith&) const { - assert(false); // should not be called - return false; - } - }; - -private: - std::string lower_; - std::string upper_; -}; - -template -struct CopyOnlyVector : std::vector { - using std::vector::vector; - - CopyOnlyVector(const CopyOnlyVector&) = default; - CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {} - CopyOnlyVector(CopyOnlyVector&& other, std::vector::allocator_type alloc) : CopyOnlyVector(other, 
alloc) {} - - CopyOnlyVector& operator=(const CopyOnlyVector&) = default; - CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(other); } -}; - -template -struct Transparent { - T t; - - operator T() const - requires ConvertibleToT - { - return t; - } -}; - -template -using ConvertibleTransparent = Transparent; - -template -using NonConvertibleTransparent = Transparent; - -struct TransparentComparator { - using is_transparent = void; - - bool* transparent_used = nullptr; - TransparentComparator() = default; - TransparentComparator(bool& used) : transparent_used(&used) {} - - template - bool operator()(const T& t, const Transparent& transparent) const { - if (transparent_used != nullptr) { - *transparent_used = true; - } - return t < transparent.t; - } - - template - bool operator()(const Transparent& transparent, const T& t) const { - if (transparent_used != nullptr) { - *transparent_used = true; - } - return transparent.t < t; - } - - template - bool operator()(const T& t1, const T& t2) const { - return t1 < t2; - } -}; - -struct NonTransparentComparator { - template - bool operator()(const T&, const Transparent&) const; - - template - bool operator()(const Transparent&, const T&) const; - - template - bool operator()(const T&, const T&) const; -}; - -struct NoDefaultCtr { - NoDefaultCtr() = delete; -}; - -#ifndef TEST_HAS_NO_EXCEPTIONS -template -struct EmplaceUnsafeContainer : std::vector { - using std::vector::vector; - - template - auto emplace(Args&&... args) -> decltype(std::declval>().emplace(std::forward(args)...)) { - if (this->size() > 1) { - auto it1 = this->begin(); - auto it2 = it1 + 1; - // messing up the container - std::iter_swap(it1, it2); - } - - throw 42; - } - - template - auto insert(Args&&... args) -> decltype(std::declval>().insert(std::forward(args)...)) { - if (this->size() > 1) { - auto it1 = this->begin(); - auto it2 = it1 + 1; - // messing up the container - std::iter_swap(it1, it2); - } - - throw 42; - } -}; - -template -struct ThrowOnEraseContainer : std::vector { - using std::vector::vector; - - template - auto erase(Args&&... 
args) -> decltype(std::declval>().erase(std::forward(args)...)) { - throw 42; - } -}; - -template -struct ThrowOnMoveContainer : std::vector { - using std::vector::vector; - - ThrowOnMoveContainer(ThrowOnMoveContainer&&) { throw 42; } - - ThrowOnMoveContainer& operator=(ThrowOnMoveContainer&&) { throw 42; } -}; - -#endif - template void test_emplace_exception_guarantee([[maybe_unused]] F&& emplace_function) { #ifndef TEST_HAS_NO_EXCEPTIONS @@ -358,32 +215,5 @@ void test_erase_exception_guarantee([[maybe_unused]] F&& erase_function) { } #endif } -class Moveable { - int int_; - double double_; - -public: - Moveable() : int_(0), double_(0) {} - Moveable(int i, double d) : int_(i), double_(d) {} - Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) { - x.int_ = -1; - x.double_ = -1; - } - Moveable& operator=(Moveable&& x) { - int_ = x.int_; - x.int_ = -1; - double_ = x.double_; - x.double_ = -1; - return *this; - } - - Moveable(const Moveable&) = delete; - Moveable& operator=(const Moveable&) = delete; - bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; } - bool operator<(const Moveable& x) const { return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_); } - - int get() const { return int_; } - bool moved() const { return int_ == -1; } -}; #endif // SUPPORT_FLAT_MULTIMAP_HELPERS_H diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp new file mode 100644 index 0000000000000..52f77438df2ce --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/empty.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// [[nodiscard]] bool empty() const noexcept; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + M m; + ASSERT_SAME_TYPE(decltype(m.empty()), bool); + ASSERT_NOEXCEPT(m.empty()); + assert(m.empty()); + assert(std::as_const(m).empty()); + m = {1}; + assert(!m.empty()); + m.clear(); + assert(m.empty()); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp new file mode 100644 index 0000000000000..4e3d1414b28af --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/max_size.pass.cpp @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// size_type max_size() const noexcept; + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_allocator.h" +#include "test_macros.h" + +void test() { + { + using A1 = limited_allocator; + using C = std::flat_multiset, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= 10); + LIBCPP_ASSERT(c.max_size() == 10); + } + { + using A = limited_allocator; + using C = std::flat_multiset, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C::size_type max_dist = static_cast(std::numeric_limits::max()); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= max_dist); + LIBCPP_ASSERT(c.max_size() == max_dist); + } + { + typedef std::flat_multiset C; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C::size_type max_dist = static_cast(std::numeric_limits::max()); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= max_dist); + assert(c.max_size() <= alloc_max_size(std::allocator())); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp new file mode 100644 index 0000000000000..4aff08b8127b6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.capacity/size.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// size_type size() const noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using M = std::flat_multiset, KeyContainer>; + using S = typename M::size_type; + { + const M m = {1, 1, 4, 5, 5}; + ASSERT_SAME_TYPE(decltype(m.size()), S); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 5); + } + { + const M m = {1}; + ASSERT_SAME_TYPE(decltype(m.size()), S); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 1); + } + { + const M m; + ASSERT_SAME_TYPE(decltype(m.size()), S); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 0); + } + { + M m; + S s = 500000; + for (std::size_t i = 0u; i < s; ++i) { + m.emplace(i); + m.emplace(i); + } + ASSERT_SAME_TYPE(decltype(m.size()), S); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 2 * s); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp new file mode 100644 index 0000000000000..4fffcb304d20a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/alloc.pass.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// explicit flat_multiset(const Allocator& a); + +#include +#include +#include +#include + +#include "test_macros.h" +#include "test_allocator.h" +#include "../../../test_compare.h" + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // explicit + using M = std::flat_multiset, std::vector>>; + + static_assert(std::is_constructible_v>); + static_assert(!std::is_convertible_v, M>); + } + { + using A = test_allocator; + using M = std::flat_multiset, std::vector>>; + M m(A(0, 5)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(std::move(m).extract().get_allocator().get_id() == 5); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp new file mode 100644 index 0000000000000..ae81ab044932d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/assign_initializer_list.pass.cpp @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset& operator=(initializer_list il); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + { + M m = {8, 10}; + assert(m.size() == 2); + m = {3, 1, 2, 2, 3, 4, 3, 5, 6, 5}; + int expected[] = {1, 2, 2, 3, 3, 3, 4, 5, 5, 6}; + assert(std::ranges::equal(m, expected)); + } + { + M m = {10, 8}; + assert(m.size() == 2); + m = {3}; + double expected[] = {3}; + assert(std::ranges::equal(m, expected)); + } + { + // was empty + M m; + assert(m.size() == 0); + m = {3, 1, 2, 2, 3, 4, 3, 5, 6, 5}; + int expected[] = {1, 2, 2, 3, 3, 3, 4, 5, 5, 6}; + assert(std::ranges::equal(m, expected)); + } +} + +void test() { + test>(); + test>(); + test>(); + test>(); + test>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp new file mode 100644 index 0000000000000..6b68589e6814f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/compare.pass.cpp @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// explicit flat_multiset(const key_compare& comp); +// template +// flat_multiset(const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + using C = test_less; + auto m = std::flat_multiset(C(3)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(3)); + } + { + // The one-argument ctor is explicit. 
+ using C = test_less; + static_assert(std::is_constructible_v, C>); + static_assert(!std::is_convertible_v>); + + static_assert(std::is_constructible_v, std::less>); + static_assert(!std::is_convertible_v, std::flat_multiset>); + } + { + using C = test_less; + using A1 = test_allocator; + auto m = std::flat_multiset>(C(4), A1(5)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(4)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // explicit(false) + using C = test_less; + using A1 = test_allocator; + std::flat_multiset> m = {C(4), A1(5)}; + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(4)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp new file mode 100644 index 0000000000000..78eac420a8f22 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/containers.pass.cpp @@ -0,0 +1,162 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// explicit flat_multiset(container_type key_cont, const key_compare& comp = key_compare()); +// template +// flat_multiset(const container_type& key_cont, const Allocator& a); +// template +// flat_multiset(const container_type& key_cont, const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include + +#include "min_allocator.h" +#include "MoveOnly.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +template +void conversion_test(T); + +template +concept ImplicitlyConstructible = requires(Args&&... 
args) { conversion_test({std::forward(args)...}); }; + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // flat_multiset(container_type) + using M = std::flat_multiset; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks); + int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; + assert(std::ranges::equal(m, expected)); + + // explicit(false) + static_assert(std::is_constructible_v&>); + static_assert(!ImplicitlyConstructible&>); + + m = M(std::move(ks)); + assert(ks.empty()); // it was moved-from + assert(std::ranges::equal(m, expected)); + } + { + // flat_multiset(container_type) + // move-only + int expected[] = {3, 3, 2, 1}; + using Ks = std::deque>; + using M = std::flat_multiset, Ks>; + Ks ks; + ks.push_back(1); + ks.push_back(3); + ks.push_back(3); + ks.push_back(2); + auto m = M(std::move(ks)); + assert(ks.empty()); // it was moved-from + assert(std::ranges::equal(m, expected, std::equal_to<>())); + } + { + // flat_multiset(container_type) + // container's allocators are used + using A = test_allocator; + using M = std::flat_multiset, std::deque>; + auto ks = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto m = M(std::move(ks)); + assert(ks.empty()); // it was moved-from + assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3})); + auto keys = std::move(m).extract(); + assert(keys.get_allocator() == A(5)); + } + { + // flat_multiset(container_type, key_compare) + using C = test_less; + using M = std::flat_multiset; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks, C(4)); + assert(std::ranges::equal(m, std::vector{1, 1, 1, 2, 2, 2, 3, 3, 3})); + assert(m.key_comp() == C(4)); + + // explicit + static_assert(std::is_constructible_v&, const C&>); + static_assert(!ImplicitlyConstructible&, const C&>); + } + { + // flat_multiset(container_type , const Allocator&) + using A = test_allocator; + using M = std::flat_multiset, std::deque>; + auto ks = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto m = M(ks, A(4)); // replaces the allocators + assert(!ks.empty()); // it was an lvalue above + assert((m == M{1, 1, 1, 2, 2, 2, 3, 3, 3})); + auto keys = M(m).extract(); + assert(keys.get_allocator() == A(4)); + + // explicit(false) + static_assert(ImplicitlyConstructible&, const A&>); + M m2 = {ks, A(4)}; // implicit ctor + assert(!ks.empty()); // it was an lvalue above + assert(m2 == m); + auto keys2 = std::move(m).extract(); + assert(keys2.get_allocator() == A(4)); + } + { + // flat_multiset(container_type , const Allocator&) + using C = test_less; + using A = test_allocator; + using M = std::flat_multiset>; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + auto m = M(ks, C(4), A(5)); + assert(std::ranges::equal(m, std::vector{1, 1, 1, 2, 2, 2, 3, 3, 3})); + assert(m.key_comp() == C(4)); + auto m_copy = m; + auto keys = std::move(m_copy).extract(); + assert(keys.get_allocator() == A(5)); + + // explicit(false) + 
static_assert(ImplicitlyConstructible&, const A&>); + M m2 = {ks, C(4), A(5)}; + assert(m2 == m); + assert(m2.key_comp() == C(4)); + keys = std::move(m2).extract(); + assert(keys.get_allocator() == A(5)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp new file mode 100644 index 0000000000000..b4f7220e1bac7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset(const flat_multiset& m); + +#include +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +void test() { + { + using C = test_less; + std::vector> ks({1, 3, 5, 3, 1}, test_allocator(6)); + const int expected[] = {1, 1, 3, 3, 5}; + using M = std::flat_multiset; + auto mo = M(ks, C(5)); + auto m = mo; + + assert(m.key_comp() == C(5)); + assert(std::ranges::equal(m, expected)); + auto keys = std::move(m).extract(); + assert(keys.get_allocator() == test_allocator(6)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(std::ranges::equal(mo, expected)); + auto keys2 = std::move(mo).extract(); + assert(keys2.get_allocator() == test_allocator(6)); + } + { + using C = test_less; + using Ks = std::vector>; + auto ks = Ks({1, 3, 5, 3, 1}, other_allocator(6)); + const int expected[] = {1, 1, 3, 3, 5}; + using M = std::flat_multiset; + auto mo = M(Ks(ks, other_allocator(6)), C(5)); + auto m = mo; + + assert(m.key_comp() == C(5)); + assert(std::ranges::equal(m, expected)); + auto keys = std::move(m).extract(); + assert(keys.get_allocator() == other_allocator(-2)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(std::ranges::equal(mo, expected)); + auto keys2 = std::move(mo).extract(); + assert(keys2.get_allocator() == other_allocator(6)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp new file mode 100644 index 0000000000000..ec8ad824ea14b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_alloc.pass.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset(const flat_multiset&, const allocator_type&); + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + using C = test_less; + std::vector> ks({1, 3, 5, 5}, test_allocator(6)); + using M = std::flat_multiset; + auto mo = M(ks, C(5)); + auto m = M(mo, test_allocator(3)); + + assert(m.key_comp() == C(5)); + assert(std::ranges::equal(m, ks)); + auto keys = std::move(m).extract(); + assert(keys.get_allocator() == test_allocator(3)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(std::ranges::equal(mo, ks)); + auto keys2 = std::move(mo).extract(); + assert(keys2.get_allocator() == test_allocator(6)); + } +} +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp new file mode 100644 index 0000000000000..2b6176ac915a7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/copy_assign.pass.cpp @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset& operator=(const flat_multiset& m); + +#include +#include +#include +#include + +#include "operator_hijacker.h" +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +void test() { + { + // test_allocator is not propagated + using C = test_less; + std::vector> ks({1, 3, 5, 5}, test_allocator(6)); + using M = std::flat_multiset; + auto mo = M(ks, C(5)); + auto m = M({{3, 4, 5, 4}}, C(3), test_allocator(2)); + m = mo; + + assert(m.key_comp() == C(5)); + assert(std::ranges::equal(m, ks)); + auto keys = std::move(m).extract(); + assert(keys.get_allocator() == test_allocator(2)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(std::ranges::equal(mo, ks)); + auto keys2 = std::move(mo).extract(); + assert(keys2.get_allocator() == test_allocator(6)); + } + { + // other_allocator is propagated + using C = test_less; + using Ks = std::vector>; + auto ks = Ks({1, 3, 5, 3}, other_allocator(6)); + const int expected[] = {1, 3, 3, 5}; + using M = std::flat_multiset; + auto mo = M(Ks(ks, other_allocator(6)), C(5)); + auto m = M({3, 4, 5}, C(3), other_allocator(2)); + m = mo; + + assert(m.key_comp() == C(5)); + assert(std::ranges::equal(m, expected)); + auto keys = std::move(m).extract(); + assert(keys.get_allocator() == other_allocator(6)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(std::ranges::equal(mo, expected)); + auto keys2 = std::move(mo).extract(); + assert(keys2.get_allocator() == other_allocator(6)); + } + { + // comparator is copied and invariant is preserved + using M = std::flat_multiset>; + M mo = M({1, 2}, std::less()); + M m = M({1, 2}, std::greater()); + assert(m.key_comp()(2, 1) == true); + assert(m != mo); + m = mo; + assert(m.key_comp()(2, 1) == false); + assert(m == mo); + } + { + // self-assignment + using M = std::flat_multiset; + M m = {{1, 2}}; + m = std::as_const(m); + assert((m == M{{1, 2}})); + } + { + // was empty + using M = std::flat_multiset; + M m; + assert(m.size() == 0); + m = {3, 1, 2, 2, 3, 4, 3, 5, 6, 5}; + int expected[] = {1, 2, 2, 3, 3, 3, 4, 5, 5, 6}; + assert(std::ranges::equal(m, expected)); + } + { + // Validate whether the container can be copy-assigned (move-assigned, swapped) + // with an ADL-hijacking operator& + std::flat_multiset so; + std::flat_multiset s; + s = so; + s = std::move(so); + swap(s, so); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.compile.pass.cpp new file mode 100644 index 0000000000000..f26c90bacdda7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.compile.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// Test CTAD on cases where deduction should fail. 
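Concretely, "deduction should fail" refers to standard CTAD behavior: the deduction guides can only derive `Key` from something that carries an element type (a container, an iterator pair, an `initializer_list`), never from a comparator or allocator alone. Before the `CanDeductFlatMultiSet` checks that follow, here is a user-level sketch of the same distinction (illustrative variable names; the commented-out lines are the ill-formed cases this test pins down):

```cpp
#include <flat_set>
#include <functional>
#include <memory>
#include <vector>

int main() {
  std::vector<int> v{3, 1, 2};

  std::flat_multiset s1(v);                  // OK: Key deduced as int from the container
  std::flat_multiset s2(v.begin(), v.end()); // OK: Key deduced from the iterators' value type
  std::flat_multiset s3{1, 2, 2};            // OK: Key deduced from the initializer_list

  // ill-formed: nothing to deduce Key from
  // std::flat_multiset s4;
  // std::flat_multiset s5(std::less<int>{});
  // std::flat_multiset s6(std::allocator<int>{});

  (void)s1;
  (void)s2;
  (void)s3;
  return 0;
}
```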
+ +#include +#include +#include +#include +#include + +struct NotAnAllocator { + friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } +}; + +template +concept CanDeductFlatMultiSet = requires { std::flat_multiset(std::declval()...); }; + +static_assert(CanDeductFlatMultiSet>); + +// cannot deduce Key and T from nothing +static_assert(!CanDeductFlatMultiSet<>); + +// cannot deduce Key and T from just (Compare) +static_assert(!CanDeductFlatMultiSet>); + +// cannot deduce Key and T from just (Compare, Allocator) +static_assert(!CanDeductFlatMultiSet, std::allocator>); + +// cannot deduce Key and T from just (Allocator) +static_assert(!CanDeductFlatMultiSet>); + +// cannot convert from some arbitrary unrelated type +static_assert(!CanDeductFlatMultiSet); diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.pass.cpp new file mode 100644 index 0000000000000..7f611776e85c3 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct.pass.cpp @@ -0,0 +1,410 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deduction_guides_sfinae_checks.h" +#include "test_allocator.h" + +void test() { + { + // Deduction guide generated from + // flat_multiset(const flat_multiset&) + std::flat_multiset source = {1, 2, 2}; + std::flat_multiset s(source); + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } + { + // Deduction guide generated from + // flat_multiset(const flat_multiset&) + // braces instead of parens + std::flat_multiset> source = {1, 2, 2}; + std::flat_multiset s{source}; + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } + { + // Deduction guide generated from + // flat_set(const flat_set&, const Allocator&) + std::flat_multiset> source = {1, 2, 2}; + std::flat_multiset s(source, std::allocator()); + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } + { + std::deque> ks({1, 2, 1, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> sorted_ks({1, 1, 2, 3, INT_MAX}, test_allocator(0, 42)); + int expected[] = {1, 1, 2, 3, INT_MAX}; + { + // template> + // flat_multiset(KeyContainer, Compare = Compare()) + // -> flat_multiset; + std::flat_multiset s(ks); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 42); + } + { + // template> + // flat_multiset(sorted_equivalent_t, KeyContainer, Compare = Compare()) + // -> flat_multiset; + std::flat_multiset s(std::sorted_equivalent, sorted_ks); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 42); + } + { + // template + // flat_multiset(KeyContainer, Allocator) + // -> flat_multiset, KeyContainer>; + std::flat_multiset s(ks, 
test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 44); + } + { + // template + // flat_multiset(sorted_equivalent_t, KeyContainer, Allocator) + // -> flat_multiset, KeyContainer>; + std::flat_multiset s(std::sorted_equivalent, sorted_ks, test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 44); + } + } + { + std::deque> ks({1, 2, 1, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> sorted_ks({INT_MAX, 3, 2, 1, 1}, test_allocator(0, 42)); + int expected[] = {INT_MAX, 3, 2, 1, 1}; + { + // template> + // flat_multiset(KeyContainer, Compare = Compare()) + // -> flat_multiset; + std::flat_multiset s(ks, std::greater()); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 42); + } + { + // template> + // flat_multiset(sorted_equivalent_t, KeyContainer, Compare = Compare()) + // -> flat_multiset; + + std::flat_multiset s(std::sorted_equivalent, sorted_ks, std::greater()); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 42); + } + { + // template + // flat_multiset(KeyContainer, Compare, Allocator) + // -> flat_multiset; + std::flat_multiset s(ks, std::greater(), test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 44); + } + { + // template + // flat_multiset(sorted_equivalent_t, KeyContainer, Compare, Allocator) + // -> flat_multiset; + std::flat_multiset s(std::sorted_equivalent, sorted_ks, std::greater(), test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multiset, decltype(ks)>); + assert(std::ranges::equal(s, expected)); + assert(std::move(s).extract().get_allocator().get_id() == 44); + } + } + + { + int arr[] = {1, 2, 1, INT_MAX, 3}; + int sorted_arr[] = {1, 1, 2, 3, INT_MAX}; + const int arrc[] = {1, 2, 1, INT_MAX, 3}; + const int sorted_arrc[] = {1, 1, 2, 3, INT_MAX}; + { + // template>> + // flat_multiset(InputIterator, InputIterator, Compare = Compare()) + // -> flat_multiset, Compare>; + std::flat_multiset m(std::begin(arr), std::end(arr)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multiset); + assert(std::ranges::equal(m, sorted_arr)); + } + { + // template>> + // flat_multiset(InputIterator, InputIterator, Compare = Compare()) + // -> flat_multiset, Compare>; + // const + std::flat_multiset m(std::begin(arrc), std::end(arrc)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multiset); + assert(std::ranges::equal(m, sorted_arr)); + } + { + // template>> + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, Compare = Compare()) + // -> flat_multiset, Compare>; + std::flat_multiset m(std::sorted_equivalent, std::begin(sorted_arr), std::end(sorted_arr)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multiset); + assert(std::ranges::equal(m, sorted_arr)); + } + { + // template>> + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, Compare = Compare()) + // -> flat_multiset, Compare>; + // const + std::flat_multiset m(std::sorted_equivalent, std::begin(sorted_arrc), 
std::end(sorted_arrc));
+
+    ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int>);
+    assert(std::ranges::equal(m, sorted_arr));
+  }
+  {
+    // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+    //   flat_multiset(InputIterator, InputIterator, Compare = Compare())
+    //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+    // flat_multiset iterator
+    std::flat_multiset<short> mo;
+    std::flat_multiset m(mo.begin(), mo.end());
+    ASSERT_SAME_TYPE(decltype(m), decltype(mo));
+  }
+  {
+    // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+    //   flat_multiset(InputIterator, InputIterator, Compare = Compare())
+    //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+    // flat_multiset const_iterator
+    std::flat_multiset<short> mo;
+    std::flat_multiset m(mo.cbegin(), mo.cend());
+    ASSERT_SAME_TYPE(decltype(m), decltype(mo));
+  }
+  {
+    // This does not deduce to flat_multiset(InputIterator, InputIterator)
+    // But deduces to flat_multiset(initializer_list<int*>)
+    int source[3] = {1, 2, 3};
+    std::flat_multiset s = {source, source + 3};
+    ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int*>);
+    assert(s.size() == 2);
+  }
+  {
+    // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator)
+    // braces
+    int source[3] = {1, 2, 3};
+    std::flat_multiset s{std::sorted_equivalent, source, source + 3};
+    static_assert(std::is_same_v<decltype(s), std::flat_multiset<int>>);
+    assert(s.size() == 3);
+  }
+  }
+
+  {
+    int arr[]                 = {1, 2, 1, INT_MAX, 3};
+    int sorted_arr[]          = {INT_MAX, 3, 2, 1, 1};
+    const int arrc[]          = {1, 2, 1, INT_MAX, 3};
+    const int sorted_arrc[]   = {INT_MAX, 3, 2, 1, 1};
+    using C = std::greater<int>;
+    {
+      // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+      //   flat_multiset(InputIterator, InputIterator, Compare = Compare())
+      //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+      std::flat_multiset m(std::begin(arr), std::end(arr), C());
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+    {
+      // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+      //   flat_multiset(InputIterator, InputIterator, Compare = Compare())
+      //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+      // const
+      std::flat_multiset m(std::begin(arrc), std::end(arrc), C());
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+    {
+      // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+      //   flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, Compare = Compare())
+      //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+      std::flat_multiset m(std::sorted_equivalent, std::begin(sorted_arr), std::end(sorted_arr), C());
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+    {
+      // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+      //   flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, Compare = Compare())
+      //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+      // const
+      std::flat_multiset m(std::sorted_equivalent, std::begin(sorted_arrc), std::end(sorted_arrc), C());
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+    {
+      // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+      //   flat_multiset(InputIterator, InputIterator, Compare = Compare())
+      //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+      // flat_multiset iterator
+      std::flat_multiset<int, C> mo;
+      std::flat_multiset m(mo.begin(), mo.end(), C());
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+    }
+    {
+      // template<class InputIterator, class Compare = less<iter-value-type<InputIterator>>>
+      //   flat_multiset(InputIterator, InputIterator, Compare = Compare())
+      //     -> flat_multiset<iter-value-type<InputIterator>, Compare>;
+      // flat_multiset const_iterator
+      std::flat_multiset<int, C> mo;
+      std::flat_multiset m(mo.cbegin(), mo.cend(), C());
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+    }
+  }
+  {
+    const int sorted_arr[] = {1, 1, 2, 3, INT_MAX};
+    {
+      // template<class Key, class Compare = less<Key>>
+      //   flat_multiset(initializer_list<Key>, Compare = Compare())
+      //     -> flat_multiset<Key, Compare>;
+      std::flat_multiset m{1, 2, 1, INT_MAX, 3};
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+    {
+      // template<class Key, class Compare = less<Key>>
+      //   flat_multiset(sorted_equivalent_t, initializer_list<Key>, Compare = Compare())
+      //     -> flat_multiset<Key, Compare>;
+      std::flat_multiset m(std::sorted_equivalent, {1, 1, 2, 3, INT_MAX});
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+    {
+      // One element with brace was treated as initializer_list
+      std::flat_multiset s = {1};
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int>);
+      assert(s.size() == 1);
+    }
+    {
+      // Two elements with brace was treated as initializer_list
+      using M = std::flat_multiset<int>;
+      M m;
+      std::flat_multiset s{m, m}; // flat_multiset(initializer_list<M>)
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<M>);
+      assert(s.size() == 2);
+    }
+  }
+  {
+    const int sorted_arr[] = {INT_MAX, 3, 2, 1, 1};
+    using C = std::greater<int>;
+    {
+      // template<class Key, class Compare = less<Key>>
+      //   flat_multiset(initializer_list<Key>, Compare = Compare())
+      //     -> flat_multiset<Key, Compare>;
+      std::flat_multiset m({1, 2, 1, INT_MAX, 3}, C());
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+    {
+      // template<class Key, class Compare = less<Key>>
+      //   flat_multiset(sorted_equivalent_t, initializer_list<Key>, Compare = Compare())
+      //     -> flat_multiset<Key, Compare>;
+      std::flat_multiset m(std::sorted_equivalent, {INT_MAX, 3, 2, 1, 1}, C());
+
+      ASSERT_SAME_TYPE(decltype(m), std::flat_multiset<int, C>);
+      assert(std::ranges::equal(m, sorted_arr));
+    }
+  }
+  {
+    std::list<int> r = {1, 2, 1, INT_MAX, 3};
+    const int expected[] = {1, 1, 2, 3, INT_MAX};
+    {
+      // template<ranges::input_range R, class Compare = less<ranges::range_value_t<R>>,
+      //          class Allocator = allocator<ranges::range_value_t<R>>>
+      //   flat_multiset(from_range_t, R&&, Compare = Compare(), Allocator = Allocator())
+      //     -> flat_multiset<ranges::range_value_t<R>, Compare,
+      //                      vector<ranges::range_value_t<R>,
+      //                             alloc-rebind<Allocator, ranges::range_value_t<R>>>>;
+      std::flat_multiset s(std::from_range, r);
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int, std::less<int>>);
+      assert(std::ranges::equal(s, expected));
+    }
+    {
+      // template<ranges::input_range R, class Allocator>
+      //   flat_multiset(from_range_t, R&&, Allocator)
+      //     -> flat_multiset<ranges::range_value_t<R>, less<ranges::range_value_t<R>>,
+      //                      vector<ranges::range_value_t<R>,
+      //                             alloc-rebind<Allocator, ranges::range_value_t<R>>>>;
+      std::flat_multiset s(std::from_range, r, test_allocator<long>(0, 42));
+      ASSERT_SAME_TYPE(decltype(s),
+                       std::flat_multiset<int, std::less<int>, std::vector<int, test_allocator<int>>>);
+      assert(std::ranges::equal(s, expected));
+      assert(std::move(s).extract().get_allocator().get_id() == 42);
+    }
+  }
+
+  {
+    // with comparator
+    std::list<int> r = {1, 2, 1, INT_MAX, 3};
+    const int expected[] = {INT_MAX, 3, 2, 1, 1};
+    {
+      // template<ranges::input_range R, class Compare = less<ranges::range_value_t<R>>,
+      //          class Allocator = allocator<ranges::range_value_t<R>>>
+      //   flat_multiset(from_range_t, R&&, Compare = Compare(), Allocator = Allocator())
+      //     -> flat_multiset<ranges::range_value_t<R>, Compare,
+      //                      vector<ranges::range_value_t<R>,
+      //                             alloc-rebind<Allocator, ranges::range_value_t<R>>>>;
+      std::flat_multiset s(std::from_range, r, std::greater<int>());
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int, std::greater<int>>);
+      assert(std::ranges::equal(s, expected));
+    }
+    {
+      // template<ranges::input_range R, class Compare = less<ranges::range_value_t<R>>,
+      //          class Allocator = allocator<ranges::range_value_t<R>>>
+      //   flat_multiset(from_range_t, R&&, Compare = Compare(), Allocator = Allocator())
+      //     -> flat_multiset<ranges::range_value_t<R>, Compare,
+      //                      vector<ranges::range_value_t<R>,
+      //                             alloc-rebind<Allocator, ranges::range_value_t<R>>>>;
+      std::flat_multiset s(std::from_range, r, std::greater<int>(), test_allocator<long>(0, 42));
+      ASSERT_SAME_TYPE(decltype(s),
+                       std::flat_multiset<int, std::greater<int>, std::vector<int, test_allocator<int>>>);
+      assert(std::ranges::equal(s, expected));
+      assert(std::move(s).extract().get_allocator().get_id() == 42);
+    }
+  }
+
+  AssociativeContainerDeductionGuidesSfinaeAway<std::flat_multiset, std::flat_multiset<int>>();
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct_pmr.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct_pmr.pass.cpp
new file mode 100644
index 0000000000000..367dc55e34410
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/deduct_pmr.pass.cpp
@@ -0,0 +1,104 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: availability-pmr-missing
+
+// <flat_set>
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <initializer_list>
+#include <list>
+#include <memory_resource>
+#include <ranges>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "test_allocator.h"
+
+using P  = std::pair<int, long>;
+using PC = std::pair<const int, long>;
+
+int main(int, char**) {
+  {
+    std::deque<int, test_allocator<int>> ks({1, 2, 1, INT_MAX, 3}, test_allocator<int>(0, 42));
+    std::deque<int, test_allocator<int>> sorted_ks({1, 1, 2, 3, INT_MAX}, test_allocator<int>(0, 42));
+    const int expected[] = {1, 1, 2, 3, INT_MAX};
+    {
+      // template<class KeyContainer, class Allocator>
+      //   flat_multiset(KeyContainer, Allocator)
+      //     -> flat_multiset<typename KeyContainer::value_type,
+      //                      less<typename KeyContainer::value_type>, KeyContainer>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::monotonic_buffer_resource mr2;
+      std::pmr::deque<int> pks(ks.begin(), ks.end(), &mr);
+      std::flat_multiset s(std::move(pks), &mr2);
+
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int, std::less<int>, std::pmr::deque<int>>);
+      assert(std::ranges::equal(s, expected));
+      auto keys = std::move(s).extract();
+      assert(keys.get_allocator().resource() == &mr2);
+    }
+    {
+      // template<class KeyContainer, class Allocator>
+      //   flat_multiset(sorted_equivalent_t, KeyContainer, Allocator)
+      //     -> flat_multiset<typename KeyContainer::value_type,
+      //                      less<typename KeyContainer::value_type>, KeyContainer>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::monotonic_buffer_resource mr2;
+      std::pmr::deque<int> pks(sorted_ks.begin(), sorted_ks.end(), &mr);
+      std::flat_multiset s(std::sorted_equivalent, std::move(pks), &mr2);
+
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int, std::less<int>, std::pmr::deque<int>>);
+      assert(std::ranges::equal(s, expected));
+      auto keys = std::move(s).extract();
+      assert(keys.get_allocator().resource() == &mr2);
+    }
+  }
+  {
+    std::deque<int, test_allocator<int>> ks({1, 2, 1, INT_MAX, 3}, test_allocator<int>(0, 42));
+    std::deque<int, test_allocator<int>> sorted_ks({INT_MAX, 3, 2, 1, 1}, test_allocator<int>(0, 42));
+    const int expected[] = {INT_MAX, 3, 2, 1, 1};
+    {
+      // template<class KeyContainer, class Compare, class Allocator>
+      //   flat_multiset(KeyContainer, Compare, Allocator)
+      //     -> flat_multiset<typename KeyContainer::value_type, Compare, KeyContainer>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::monotonic_buffer_resource mr2;
+      std::pmr::deque<int> pks(ks.begin(), ks.end(), &mr);
+      std::flat_multiset s(std::move(pks), std::greater<int>(), &mr2);
+
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int, std::greater<int>, std::pmr::deque<int>>);
+      assert(std::ranges::equal(s, expected));
+      auto keys = std::move(s).extract();
+      assert(keys.get_allocator().resource() == &mr2);
+    }
+    {
+      // template<class KeyContainer, class Compare, class Allocator>
+      //   flat_multiset(sorted_equivalent_t, KeyContainer, Compare, Allocator)
+      //     -> flat_multiset<typename KeyContainer::value_type, Compare, KeyContainer>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::monotonic_buffer_resource mr2;
+      std::pmr::deque<int> pks(sorted_ks.begin(), sorted_ks.end(), &mr);
+      std::flat_multiset s(std::sorted_equivalent, std::move(pks), std::greater<int>(), &mr2);
+
+      ASSERT_SAME_TYPE(decltype(s), std::flat_multiset<int, std::greater<int>, std::pmr::deque<int>>);
+      assert(std::ranges::equal(s, expected));
+      auto keys = std::move(s).extract();
+      assert(keys.get_allocator().resource() == &mr2);
+    }
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
new file mode 100644
index 0000000000000..16f90322cd31a
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/default.pass.cpp
@@ -0,0 +1,96 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// flat_multiset();
+
+#include <cassert>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <type_traits>
+#include <vector>
+
+#include "min_allocator.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+struct DefaultCtableComp {
+  explicit DefaultCtableComp() { default_constructed_ = true; }
+  bool operator()(int, int) const { return false; }
+  bool default_constructed_ = false;
+};
+
+struct ThrowingCtorComp {
+  ThrowingCtorComp() noexcept(false) {}
+  bool operator()(const auto&, const auto&) const { return false; }
+};
+
+void test() {
+  {
+    std::flat_multiset<int> m;
+    assert(m.empty());
+  }
+  {
+    // explicit(false)
+    std::flat_multiset<int> m = {};
+    assert(m.empty());
+  }
+  {
+    std::flat_multiset<int, DefaultCtableComp, std::deque<int, min_allocator<int>>> m;
+    assert(m.empty());
+    assert(m.begin() == m.end());
+    assert(m.key_comp().default_constructed_);
+  }
+  {
+    using A1 = explicit_allocator<int>;
+    {
+      std::flat_multiset<int, DefaultCtableComp, std::vector<int, A1>> m;
+      assert(m.empty());
+      assert(m.key_comp().default_constructed_);
+    }
+    {
+      A1 a1;
+      std::flat_multiset<int, DefaultCtableComp, std::vector<int, A1>> m(a1);
+      assert(m.empty());
+      assert(m.key_comp().default_constructed_);
+    }
+  }
+#if defined(_LIBCPP_VERSION)
+  {
+    using C = std::flat_multiset<MoveOnly>;
+    static_assert(std::is_nothrow_default_constructible_v<C>);
+    C c;
+  }
+  {
+    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, test_allocator<MoveOnly>>>;
+    static_assert(std::is_nothrow_default_constructible_v<C>);
+    C c;
+  }
+#endif // _LIBCPP_VERSION
+  {
+    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, std::vector<MoveOnly, other_allocator<MoveOnly>>>;
+    static_assert(!std::is_nothrow_default_constructible_v<C>);
+    C c;
+  }
+  {
+    using C = std::flat_multiset<int, ThrowingCtorComp>;
+    static_assert(!std::is_nothrow_default_constructible_v<C>);
+    C c;
+  }
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
new file mode 100644
index 0000000000000..f852f2f85572c
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/dtor_noexcept.pass.cpp
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// ~flat_multiset();
+
+#include <cassert>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <vector>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+struct ThrowingDtorComp {
+  bool operator()(const auto&, const auto&) const;
+  ~ThrowingDtorComp() noexcept(false) {}
+};
+
+void test() {
+  {
+    using C = std::flat_multiset<MoveOnly>;
+    static_assert(std::is_nothrow_destructible_v<C>);
+    C c;
+  }
+  {
+    using V = std::vector<MoveOnly, test_allocator<MoveOnly>>;
+    using C = std::flat_multiset<MoveOnly, std::less<MoveOnly>, V>;
+    static_assert(std::is_nothrow_destructible_v<C>);
+    C c;
+  }
+  {
+    using V = std::deque<MoveOnly, other_allocator<MoveOnly>>;
+    using C = std::flat_multiset<MoveOnly, std::greater<MoveOnly>, V>;
+    static_assert(std::is_nothrow_destructible_v<C>);
+    C c;
+  }
+#if defined(_LIBCPP_VERSION)
+  {
+    using C = std::flat_multiset<MoveOnly, ThrowingDtorComp>;
+    static_assert(!std::is_nothrow_destructible_v<C>);
+    C c;
+  }
+#endif // _LIBCPP_VERSION
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
new file mode 100644
index 0000000000000..10638d75bbd14
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/initializer_list.pass.cpp
@@ -0,0 +1,157 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// flat_multiset(initializer_list<value_type> il, const key_compare& comp = key_compare());
+// template<class Alloc>
+//   flat_multiset(initializer_list<value_type> il, const Alloc& a);
+// template<class Alloc>
+//   flat_multiset(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
+
+#include <algorithm>
+#include <cassert>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <initializer_list>
+#include <ranges>
+#include <vector>
+
+#include "test_macros.h"
+#include "min_allocator.h"
+#include "test_allocator.h"
+
+#include "../../../test_compare.h"
+
+struct DefaultCtableComp {
+  explicit DefaultCtableComp() { default_constructed_ = true; }
+  bool operator()(int, int) const { return false; }
+  bool default_constructed_ = false;
+};
+
+void test() {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<container_type, Alloc> is true.
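    //
    // Aside (editorial sketch, not from the patch): uses_allocator_v<C, A> is
    // satisfied when A is convertible to C::allocator_type. The A1 and A2
    // aliases below are deliberately not convertible to each other, which is
    // what produces the matching positive and negative static_asserts.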
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + using IL = std::initializer_list; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + + { + // initializer_list needs to match exactly + using M = std::flat_multiset; + using C = typename M::key_compare; + static_assert(std::is_constructible_v>); + static_assert(std::is_constructible_v, C>); + static_assert(std::is_constructible_v, C, std::allocator>); + static_assert(std::is_constructible_v, std::allocator>); + static_assert(!std::is_constructible_v>); + static_assert(!std::is_constructible_v, C>); + static_assert(!std::is_constructible_v, C, std::allocator>); + static_assert(!std::is_constructible_v, std::allocator>); + static_assert(!std::is_constructible_v>); + static_assert(!std::is_constructible_v, C>); + static_assert(!std::is_constructible_v, C, std::allocator>); + static_assert(!std::is_constructible_v, std::allocator>); + } + + int expected[] = {1, 2, 2, 3, 3, 5}; + { + // flat_multiset(initializer_list); + using M = std::flat_multiset; + std::initializer_list il = {5, 2, 2, 3, 1, 3}; + M m(il); + assert(std::ranges::equal(m, expected)); + } + { + // flat_multiset(initializer_list); + // explicit(false) + using M = std::flat_multiset; + M m = {5, 2, 2, 3, 1, 3}; + assert(std::ranges::equal(m, expected)); + } + { + // flat_multiset(initializer_list); + using M = std::flat_multiset, std::deque>>; + M m = {5, 2, 2, 3, 1, 3}; + assert(std::ranges::equal(m, expected | std::views::reverse)); + } + { + using A = explicit_allocator; + { + // flat_multiset(initializer_list); + // different comparator + using M = std::flat_multiset>; + M m = {1, 2, 3}; + assert(m.size() == 3); + LIBCPP_ASSERT(*m.begin() == 1); + assert(m.key_comp().default_constructed_); + } + { + // flat_multiset(initializer_list, const Allocator&); + using M = std::flat_multiset, std::deque>; + A a; + M m({5, 2, 2, 3, 1, 3}, a); + assert(std::ranges::equal(m, expected | std::views::reverse)); + } + } + { + // flat_multiset(initializer_list, const key_compare&); + using C = test_less; + using M = std::flat_multiset; + auto m = M({5, 2, 2, 3, 1, 3}, C(10)); + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(10)); + + // explicit(false) + M m2 = {{5, 2, 2, 1, 3, 3}, C(10)}; + assert(m2 == m); + assert(m2.key_comp() == C(10)); + } + { + // flat_multiset(initializer_list, const key_compare&); + // Sorting uses the comparator that was passed in + using M = std::flat_multiset, std::deque>>; + auto m = M({5, 2, 2, 1, 3, 3}, std::greater()); + assert(std::ranges::equal(m, expected | std::views::reverse)); + assert(m.key_comp()(2, 1) == true); + } + { + // flat_multiset(initializer_list il, const key_compare& comp, const Alloc& a); + using A = explicit_allocator; + using M = std::flat_multiset, std::deque>; + A a; + M m({5, 2, 2, 3, 1, 3}, {}, a); + assert(std::ranges::equal(m, expected | std::views::reverse)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp 
b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp new file mode 100644 index 0000000000000..da9aef3dc36cd --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/iter_iter.pass.cpp @@ -0,0 +1,141 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// flat_multiset(InputIterator first, InputIterator last, const key_compare& comp = key_compare()); +// template +// flat_multiset(InputIterator first, InputIterator last, const Allocator& a); +// template +// flat_multiset(InputIterator first, InputIterator last, const key_compare& comp, const Allocator& a); + +#include +#include +#include +#include +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + using Iter1 = typename M1::iterator; + using Iter2 = typename M2::iterator; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + + int ar[] = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; + { + // flat_multiset(InputIterator , InputIterator) + // cpp17_input_iterator + using M = std::flat_multiset; + auto m = M(cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)); + assert(std::ranges::equal(m, expected)); + + // explicit(false) + M m2 = {cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)}; + assert(m2 == m); + } + { + // flat_multiset(InputIterator , InputIterator) + // greater + using M = std::flat_multiset, std::deque>>; + auto m = M(cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)); + assert(std::ranges::equal(m, expected | std::views::reverse)); + } + { + // flat_multiset(InputIterator , InputIterator) + // Test when the operands are of array type (also contiguous iterator type) + using M = std::flat_multiset, std::vector>>; + auto m = M(ar, ar); + assert(m.empty()); + } + { + // flat_multiset(InputIterator , InputIterator, const key_compare&) + using C = test_less; + using M = std::flat_multiset>; + auto m = M(ar, ar + 9, C(3)); + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + + // explicit(false) + M m2 = {ar, ar + 9, C(3)}; + assert(m2 == m); + assert(m2.key_comp() == C(3)); + } + { + // flat_multiset(InputIterator , InputIterator, const Allocator&) + using A1 = test_allocator; + using M = std::flat_multiset, std::vector>; + auto m = M(ar, ar + 9, A1(5)); + assert(std::ranges::equal(m, expected)); + 
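    {
      // Aside (editorial sketch, not from the patch): flat_multiset has no
      // get_allocator() member; the stored allocator is only observable by
      // moving the underlying container out via the &&-qualified extract(),
      // which also leaves the adaptor empty. The same construction as above,
      // spelled out with an invented local name:
      M probe(ar, ar + 9, A1(5));
      auto keys = std::move(probe).extract(); // the underlying vector
      assert(keys.get_allocator() == A1(5));
      assert(probe.empty()); // extract() empties the source
    }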
assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(InputIterator , InputIterator, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using M = std::flat_multiset, std::vector>; + M m = {ar, ar + 9, A1(5)}; // implicit ctor + assert(std::ranges::equal(m, expected)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&) + using C = test_less; + using A1 = test_allocator; + using M = std::flat_multiset>; + auto m = M(ar, ar + 9, C(3), A1(5)); + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(InputIterator , InputIterator, const key_compare&, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using M = std::flat_multiset, std::deque>; + M m = {ar, ar + 9, {}, A1(5)}; // implicit ctor + assert(std::ranges::equal(m, expected)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp new file mode 100644 index 0000000000000..825ad75cc8f4c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move.pass.cpp @@ -0,0 +1,188 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset(flat_multiset&&); + +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" +#include "min_allocator.h" + +void test() { + { + using C = test_less; + using A = test_allocator; + using M = std::flat_multiset>; + M mo = M({1, 2, 1, 3}, C(5), A(7)); + M m = std::move(mo); + assert((m == M{1, 1, 2, 3})); + assert(m.key_comp() == C(5)); + assert(std::move(m).extract().get_allocator() == A(7)); + + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(std::move(mo).extract().get_allocator().get_id() == test_alloc_base::moved_value); + } + { + using C = test_less; + using A = min_allocator; + using M = std::flat_multiset>; + M mo = M({1, 2, 1, 3}, C(5), A()); + M m = std::move(mo); + assert((m == M{1, 1, 2, 3})); + assert(m.key_comp() == C(5)); + assert(std::move(m).extract().get_allocator() == A()); + + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(std::move(mo).extract().get_allocator() == A()); + } + { + // A moved-from flat_multiset maintains its class invariant in the presence of moved-from comparators. 
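    // Aside (editorial sketch, not from the patch): std::less<int> is an
    // empty, stateless comparator, so even a moved-from copy of it still
    // orders ints correctly; the guarantee exercised here is that *both*
    // objects stay internally sorted after the move, as the asserts below
    // verify.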
+ using M = std::flat_multiset>; + M mo = M({1, 2, 1, 3}, std::less()); + M m = std::move(mo); + assert(m.size() == 4); + assert(std::is_sorted(m.begin(), m.end(), m.value_comp())); + assert(m.key_comp()(1, 2) == true); + + assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp())); + LIBCPP_ASSERT(m.key_comp()(1, 2) == true); + LIBCPP_ASSERT(mo.empty()); + mo.insert({1, 1, 2, 3}); // insert has no preconditions + assert(m == mo); + } + { + // moved-from object maintains invariant if the underlying container does not clear after move + using M = std::flat_multiset, CopyOnlyVector>; + M m1 = M({1, 2, 1, 3}); + M m2 = std::move(m1); + assert(m2.size() == 4); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m1.size() == 0); + } +} + +template +struct ThrowingMoveAllocator { + using value_type = T; + explicit ThrowingMoveAllocator() = default; + ThrowingMoveAllocator(const ThrowingMoveAllocator&) = default; + ThrowingMoveAllocator(ThrowingMoveAllocator&&) noexcept(false) {} + T* allocate(std::ptrdiff_t n) { return std::allocator().allocate(n); } + void deallocate(T* p, std::ptrdiff_t n) { return std::allocator().deallocate(p, n); } + friend bool operator==(ThrowingMoveAllocator, ThrowingMoveAllocator) = default; +}; + +struct ThrowingMoveComp { + ThrowingMoveComp() = default; + ThrowingMoveComp(const ThrowingMoveComp&) noexcept(true) {} + ThrowingMoveComp(ThrowingMoveComp&&) noexcept(false) {} + bool operator()(const auto&, const auto&) const { return false; } +}; + +struct MoveSensitiveComp { + MoveSensitiveComp() noexcept(false) = default; + MoveSensitiveComp(const MoveSensitiveComp&) noexcept = default; + MoveSensitiveComp(MoveSensitiveComp&& rhs) { rhs.is_moved_from_ = true; } + MoveSensitiveComp& operator=(const MoveSensitiveComp&) noexcept(false) = default; + MoveSensitiveComp& operator=(MoveSensitiveComp&& rhs) { + rhs.is_moved_from_ = true; + return *this; + } + bool operator()(const auto&, const auto&) const { return false; } + bool is_moved_from_ = false; +}; + +void test_move_noexcept() { + { + using C = std::flat_multiset; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_constructible_v); + C c; + C d = std::move(c); + } + { + using C = std::flat_multiset, std::deque>>; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_constructible_v); + C c; + C d = std::move(c); + } +#if _LIBCPP_VERSION + { + // Container fails to be nothrow-move-constructible; this relies on libc++'s support for non-nothrow-copyable allocators + using C = std::flat_multiset, std::deque>>; + static_assert(!std::is_nothrow_move_constructible_v>>); + static_assert(!std::is_nothrow_move_constructible_v); + C c; + C d = std::move(c); + } +#endif // _LIBCPP_VERSION + { + // Comparator fails to be nothrow-move-constructible + using C = std::flat_multiset; + static_assert(!std::is_nothrow_move_constructible_v); + C c; + C d = std::move(c); + } +} + +#if !defined(TEST_HAS_NO_EXCEPTIONS) +static int countdown = 0; + +struct EvilContainer : std::vector { + EvilContainer() = default; + EvilContainer(EvilContainer&& rhs) { + // Throw on move-construction. + if (--countdown == 0) { + rhs.insert(rhs.end(), 0); + rhs.insert(rhs.end(), 0); + throw 42; + } + } +}; + +void test_move_exception() { + { + using M = std::flat_multiset, EvilContainer>; + M mo = {1, 2, 3}; + countdown = 1; + try { + M m = std::move(mo); + assert(false); // not reached + } catch (int x) { + assert(x == 42); + } + // The source flat_multiset maintains its class invariant. 
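      // Aside (editorial sketch, not from the patch): the invariant in
      // question is that the underlying storage is still sorted with respect
      // to value_comp(), so the object can be destroyed, cleared, or reused
      // after the failed move. Spelled out locally, that is:
      assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp()));
      // The original helper-based checks follow: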
+ check_invariant(mo); + LIBCPP_ASSERT(mo.empty()); + } +} +#endif // !defined(TEST_HAS_NO_EXCEPTIONS) + +int main(int, char**) { + test(); + test_move_noexcept(); +#if !defined(TEST_HAS_NO_EXCEPTIONS) + test_move_exception(); +#endif // !defined(TEST_HAS_NO_EXCEPTIONS) + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp new file mode 100644 index 0000000000000..ee8258e5ac846 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_alloc.pass.cpp @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset(flat_multiset&&, const allocator_type&); + +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + int expected[] = {1, 1, 2, 2, 3}; + using C = test_less; + using A = test_allocator; + using M = std::flat_multiset>; + auto mo = M(expected, expected + 5, C(5), A(7)); + auto m = M(std::move(mo), A(3)); + + assert(m.key_comp() == C(5)); + assert(m.size() == 5); + auto keys = std::move(m).extract(); + assert(keys.get_allocator() == A(3)); + assert(std::ranges::equal(keys, expected)); + + // The original flat_multiset is moved-from. + assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp())); + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(std::move(mo).extract().get_allocator() == A(7)); + } + { + // moved-from object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multiset, CopyOnlyVector>; + M m1 = M({1, 2, 2, 1, 3}); + M m2(std::move(m1), std::allocator{}); + assert(m2.size() == 5); + assert(std::ranges::is_sorted(m1)); + LIBCPP_ASSERT(m1.empty()); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp new file mode 100644 index 0000000000000..96e046e38668f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/move_assign.pass.cpp @@ -0,0 +1,239 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset& operator=(flat_multiset&&); + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "MoveOnly.h" +#include "../helpers.h" +#include "../../../test_compare.h" +#include "test_allocator.h" +#include "min_allocator.h" + +struct MoveNegates { + int value_ = 0; + MoveNegates() = default; + MoveNegates(int v) : value_(v) {} + MoveNegates(MoveNegates&& rhs) : value_(rhs.value_) { rhs.value_ = -rhs.value_; } + MoveNegates& operator=(MoveNegates&& rhs) { + value_ = rhs.value_; + rhs.value_ = -rhs.value_; + return *this; + } + ~MoveNegates() = default; + auto operator<=>(const MoveNegates&) const = default; +}; + +struct MoveClears { + int value_ = 0; + MoveClears() = default; + MoveClears(int v) : value_(v) {} + MoveClears(MoveClears&& rhs) : value_(rhs.value_) { rhs.value_ = 0; } + MoveClears& operator=(MoveClears&& rhs) { + value_ = rhs.value_; + rhs.value_ = 0; + return *this; + } + ~MoveClears() = default; + auto operator<=>(const MoveClears&) const = default; +}; + +#if !defined(TEST_HAS_NO_EXCEPTIONS) +struct MoveAssignThrows : std::vector { + using std::vector::vector; + MoveAssignThrows& operator=(MoveAssignThrows&& other) { + push_back(0); + push_back(0); + other.push_back(0); + other.push_back(0); + throw 42; + } +}; +#endif // TEST_HAS_NO_EXCEPTIONS + +void test_move_assign_clears() { + // Preserves the class invariant for the moved-from flat_multiset. + { + const int expected[] = {1, 1, 2, 3, 4, 5, 6, 7, 8}; + using M = std::flat_multiset>; + M m = M(expected, expected + 9); + M m2 = M(expected, expected + 4); + + m2 = std::move(m); + + assert(std::equal(m2.begin(), m2.end(), expected, expected + 9)); + LIBCPP_ASSERT(m.empty()); + check_invariant(m); + m.insert(1); + m.insert(2); + assert(m.contains(1)); + assert(m.find(2) != m.end()); + } + { + const int expected[] = {1, 1, 2, 3, 4, 5, 6, 7, 8}; + using M = std::flat_multiset>; + M m = M(expected, expected + 9); + M m2 = M(expected, expected + 4); + + m2 = std::move(m); + + assert(std::equal(m2.begin(), m2.end(), expected, expected + 9)); + LIBCPP_ASSERT(m.empty()); + check_invariant(m); + m.insert(1); + m.insert(2); + assert(m.contains(1)); + assert(m.find(2) != m.end()); + } + { + // moved-from object maintains invariant if the underlying container does not clear after move + using M = std::flat_multiset, CopyOnlyVector>; + M m1 = M({1, 1, 2, 3}); + M m2 = M({1, 2}); + m2 = std::move(m1); + assert(m2.size() == 4); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + } +#if !defined(TEST_HAS_NO_EXCEPTIONS) + { + using M = std::flat_multiset, MoveAssignThrows>; + M m1 = {1, 1, 2, 3}; + M m2 = {1, 1, 2}; + try { + m2 = std::move(m1); + assert(false); + } catch (int e) { + assert(e == 42); + } + check_invariant(m1); + check_invariant(m2); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m2.empty()); + } +#endif // TEST_HAS_NO_EXCEPTIONS +} + +struct MoveSensitiveComp { + MoveSensitiveComp() noexcept(false) = default; + MoveSensitiveComp(const MoveSensitiveComp&) noexcept(false) = default; + MoveSensitiveComp(MoveSensitiveComp&& rhs) { rhs.is_moved_from_ = true; } + MoveSensitiveComp& operator=(const MoveSensitiveComp&) noexcept = default; + MoveSensitiveComp& operator=(MoveSensitiveComp&& rhs) { + rhs.is_moved_from_ = true; + return *this; + } + bool 
operator()(const auto&, const auto&) const { return false; } + bool is_moved_from_ = false; +}; + +struct MoveThrowsComp { + MoveThrowsComp(MoveThrowsComp&&) noexcept(false); + MoveThrowsComp(const MoveThrowsComp&) noexcept(true); + MoveThrowsComp& operator=(MoveThrowsComp&&) noexcept(false); + MoveThrowsComp& operator=(const MoveThrowsComp&) noexcept(true); + bool operator()(const auto&, const auto&) const; +}; + +void test_move_assign_no_except() { + // This tests a conforming extension + + { + using C = std::flat_multiset; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + using C = std::flat_multiset, std::vector>>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + using C = std::flat_multiset, std::vector>>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + using C = std::flat_multiset, std::vector>>; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + using C = std::flat_multiset, std::vector>>; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + // Test with a comparator that throws on move-assignment. + using C = std::flat_multiset; + LIBCPP_STATIC_ASSERT(!std::is_nothrow_move_assignable_v); + } + { + // Test with a container that throws on move-assignment. + using C = std::flat_multiset, std::pmr::vector>; + static_assert(!std::is_nothrow_move_assignable_v); + } +} + +void test() { + { + using C = test_less; + using A1 = test_allocator; + using M = std::flat_multiset>; + M mo = M({1, 1, 2, 3}, C(5), A1(7)); + M m = M({}, C(3), A1(7)); + std::same_as decltype(auto) r = m = std::move(mo); + assert(&r == &m); + assert((m == M{1, 1, 2, 3})); + assert(m.key_comp() == C(5)); + auto ks = std::move(m).extract(); + assert(ks.get_allocator() == A1(7)); + assert(mo.empty()); + } + { + using C = test_less; + using A1 = other_allocator; + using M = std::flat_multiset>; + M mo = M({4, 4, 5}, C(5), A1(7)); + M m = M({1, 1, 2, 3, 4}, C(3), A1(7)); + std::same_as decltype(auto) r = m = std::move(mo); + assert(&r == &m); + assert((m == M{4, 4, 5})); + assert(m.key_comp() == C(5)); + auto ks = std::move(m).extract(); + assert(ks.get_allocator() == A1(7)); + assert(mo.empty()); + } + { + using A = min_allocator; + using M = std::flat_multiset, std::vector>; + M mo = M({5, 3, 4, 3}, A()); + M m = M({4, 1, 3, 2, 1}, A()); + std::same_as decltype(auto) r = m = std::move(mo); + assert(&r == &m); + assert((m == M{5, 4, 3, 3})); + auto ks = std::move(m).extract(); + assert(ks.get_allocator() == A()); + assert(mo.empty()); + } +} + +int main(int, char**) { + test(); + test_move_assign_clears(); + test_move_assign_no_except(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/pmr.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/pmr.pass.cpp new file mode 100644 index 0000000000000..381aafb00d4fa --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/pmr.pass.cpp @@ -0,0 +1,326 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: availability-pmr-missing + +// + +// Test various constructors with pmr + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_iterators.h" +#include "test_macros.h" +#include "test_allocator.h" +#include "../helpers.h" +#include "../../../test_compare.h" + +void test() { + { + // flat_multiset(const Allocator& a); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::polymorphic_allocator pa = &mr; + auto m1 = M(pa); + assert(m1.empty()); + assert(std::move(m1).extract().get_allocator() == pa); + auto m2 = M(&mr); + assert(m2.empty()); + assert(std::move(m2).extract().get_allocator() == pa); + } + { + // flat_multiset(const key_compare& comp, const Alloc& a); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + vm.emplace_back(std::greater()); + assert(vm[0] == M{}); + assert(vm[0].key_comp()(2, 1) == true); + assert(vm[0].value_comp()(2, 1) == true); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(const container_type& key_cont, const Allocator& a); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + std::pmr::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + assert(ks.get_allocator().resource() != &mr); + vm.emplace_back(ks); + assert(ks.size() == 9); // ks' value is unchanged, since it was an lvalue above + assert((vm[0] == M{1, 1, 1, 2, 2, 2, 3, 3, 3})); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(const flat_multiset&, const allocator_type&); + using C = test_less; + using M = std::flat_multiset>; + std::pmr::monotonic_buffer_resource mr1; + std::pmr::monotonic_buffer_resource mr2; + M mo = M({1, 2, 3}, C(5), &mr1); + M m = {mo, &mr2}; // also test the implicitness of this constructor + + assert(m.key_comp() == C(5)); + auto keys = std::move(m).extract(); + assert((keys == std::pmr::vector{1, 2, 3})); + assert(keys.get_allocator().resource() == &mr2); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + auto keys2 = std::move(mo).extract(); + assert((keys2 == std::pmr::vector{1, 2, 3})); + assert(keys2.get_allocator().resource() == &mr1); + } + { + // flat_multiset(const flat_multiset&, const allocator_type&); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::vector vs; + M m = {1, 2, 3}; + vs.push_back(m); + assert(vs[0] == m); + } + { + // flat_multiset& operator=(const flat_multiset& m); + // pmr allocator is not propagated + using M = std::flat_multiset, std::pmr::deque>; + std::pmr::monotonic_buffer_resource mr1; + std::pmr::monotonic_buffer_resource mr2; + M mo = M({1, 2, 3}, &mr1); + M m = M({4, 5}, &mr2); + m = mo; + assert((m == M{1, 2, 3})); + assert(std::move(m).extract().get_allocator().resource() == &mr2); + + // mo is unchanged + assert((mo == M{1, 2, 3})); + assert(std::move(mo).extract().get_allocator().resource() == &mr1); + } + { + // flat_multiset(const flat_multiset& m); + using C = test_less; + std::pmr::monotonic_buffer_resource mr; + using M = std::flat_multiset>; + auto mo = M({1, 2, 3}, C(5), &mr); + auto m = mo; + + assert(m.key_comp() == C(5)); + assert((m == M{1, 2, 3})); + auto ks = 
std::move(m).extract(); + assert(ks.get_allocator().resource() == std::pmr::get_default_resource()); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert((mo == M{1, 2, 3})); + auto kso = std::move(mo).extract(); + assert(kso.get_allocator().resource() == &mr); + } + { + // flat_multiset(initializer_list il, const Alloc& a); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + std::initializer_list il = {3, 1, 4, 1, 5}; + vm.emplace_back(il); + assert((vm[0] == M{1, 1, 3, 4, 5})); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(initializer_list il, const key_compare& comp, const Alloc& a); + using C = test_less; + using M = std::flat_multiset>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + std::initializer_list il = {3, 1, 4, 1, 5}; + vm.emplace_back(il, C(5)); + assert((vm[0] == M{1, 1, 3, 4, 5})); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + assert(vm[0].key_comp() == C(5)); + } + { + // flat_multiset(InputIterator first, InputIterator last, const Allocator& a); + int ar[] = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; + { + // cpp17 iterator + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + vm.emplace_back(cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)); + assert(std::ranges::equal(vm[0], expected)); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + vm.emplace_back(ar, ar); + assert(vm[0].empty()); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + } + { + // flat_multiset(flat_multiset&&, const allocator_type&); + int expected[] = {1, 1, 2, 3}; + using C = test_less; + using M = std::flat_multiset>; + std::pmr::monotonic_buffer_resource mr1; + std::pmr::monotonic_buffer_resource mr2; + M mo = M({1, 3, 1, 2}, C(5), &mr1); + M m = {std::move(mo), &mr2}; // also test the implicitness of this constructor + + assert(m.key_comp() == C(5)); + assert(m.size() == 4); + assert(std::ranges::equal(m, expected)); + assert(std::move(m).extract().get_allocator().resource() == &mr2); + + // The original flat_multiset is moved-from. 
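    // Aside (editorial sketch, not from the patch): because the source and the
    // target use different memory resources (&mr1 vs &mr2), this
    // allocator-extended move cannot steal the source's buffer; elements are
    // moved one by one into storage from mr2 while mo keeps its original
    // resource, which is what the following checks observe.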
+ assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp())); + assert(mo.key_comp() == C(5)); + assert(std::move(mo).extract().get_allocator().resource() == &mr1); + } + { + // flat_multiset(flat_multiset&&, const allocator_type&); + using M = std::flat_multiset, std::pmr::deque>; + std::pmr::vector vs; + M m = {1, 3, 1, 2}; + vs.push_back(std::move(m)); + assert((std::move(vs[0]).extract() == std::pmr::deque{1, 1, 2, 3})); + } + { + // flat_multiset& operator=(flat_multiset&&); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr1; + std::pmr::monotonic_buffer_resource mr2; + M mo = + M({"short", "very long string that definitely won't fit in the SSO buffer and therefore becomes empty on move"}, + &mr1); + M m = M({"don't care"}, &mr2); + m = std::move(mo); + assert(m.size() == 2); + check_invariant(m); + assert(m.begin()->get_allocator().resource() == &mr2); + + check_invariant(mo); + mo.insert("foo"); + assert(mo.begin()->get_allocator().resource() == &mr1); + } + { + // flat_multiset(from_range_t, R&&, const Alloc&); + int ar[] = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; + { + // input_range + using M = std::flat_multiset, std::pmr::vector>; + using Iter = cpp20_input_iterator; + using Sent = sentinel_wrapper; + using R = std::ranges::subrange; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + vm.emplace_back(std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))); + assert(std::ranges::equal(vm[0], expected)); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + using M = std::flat_multiset, std::pmr::vector>; + using R = std::ranges::subrange; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + vm.emplace_back(std::from_range, R(ar, ar)); + assert(vm[0].empty()); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + } + { + // flat_multiset(sorted_equivalent_t, const container_type& key_cont, const Alloc& a); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + std::pmr::vector ks = {1, 1, 2, 4, 10}; + vm.emplace_back(std::sorted_equivalent, ks); + assert(!ks.empty()); // it was an lvalue above + assert((vm[0] == M{1, 1, 2, 4, 10})); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(sorted_equivalent_t, const container_type& key_cont,const Alloc& a); + using M = std::flat_multiset, std::pmr::vector>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + std::pmr::vector ks({1, 1, 2, 4, 10}, &mr); + vm.emplace_back(std::sorted_equivalent, ks); + assert((vm[0] == M{1, 1, 2, 4, 10})); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(sorted_equivalent_t, initializer_list il, const Alloc& a); + // cpp_17 + using C = test_less; + using M = std::flat_multiset>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + int ar[] = {1, 1, 2, 4, 5}; + vm.emplace_back( + std::sorted_equivalent, cpp17_input_iterator(ar), cpp17_input_iterator(ar + 5), C(3)); + assert((vm[0] == M{1, 1, 2, 4, 5})); + assert(vm[0].key_comp() == C(3)); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(sorted_equivalent_t, initializer_list il, const Alloc& a); + using C = test_less; + using M = std::flat_multiset>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + int ar[1] = {42}; + 
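    // Aside (editorial sketch, not from the patch): building the sets through
    // vm.emplace_back(...) is deliberate -- std::pmr::vector constructs its
    // elements via uses-allocator construction, passing its own memory
    // resource (&mr) through to the flat_multiset being created. That is how
    // the extract()ed containers end up on &mr even though no constructor
    // argument names it.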
vm.emplace_back(std::sorted_equivalent, ar, ar, C(4)); + assert(vm[0] == M{}); + assert(vm[0].key_comp() == C(4)); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(InputIterator first, InputIterator last, const Alloc& a); + // cpp_17 + using C = test_less; + using M = std::flat_multiset>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + int ar[] = {1, 1, 2, 4, 5}; + vm.emplace_back( + std::sorted_equivalent, cpp17_input_iterator(ar), cpp17_input_iterator(ar + 5), C(3)); + assert((vm[0] == M{1, 1, 2, 4, 5})); + assert(vm[0].key_comp() == C(3)); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } + { + // flat_multiset(InputIterator first, InputIterator last, const Alloc& a); + using C = test_less; + using M = std::flat_multiset>; + std::pmr::monotonic_buffer_resource mr; + std::pmr::vector vm(&mr); + int ar[1] = {42}; + vm.emplace_back(std::sorted_equivalent, ar, ar, C(4)); + assert(vm[0] == M{}); + assert(vm[0].key_comp() == C(4)); + assert(std::move(vm[0]).extract().get_allocator().resource() == &mr); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp new file mode 100644 index 0000000000000..76485b47ec5ea --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/range.pass.cpp @@ -0,0 +1,176 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template R> +// flat_multiset(from_range_t, R&&) +// template R> +// flat_multiset(from_range_t, R&&, const key_compare&) +// template R, class Alloc> +// flat_multiset(from_range_t, R&&, const Alloc&); +// template R, class Alloc> +// flat_multiset(from_range_t, R&&, const key_compare&, const Alloc&); + +#include +#include +#include +#include +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +// test constraint container-compatible-range + +template +using RangeOf = std::ranges::subrange; +using Set = std::flat_multiset; + +static_assert(std::is_constructible_v>); +static_assert(std::is_constructible_v>); +static_assert(!std::is_constructible_v>>); + +static_assert(std::is_constructible_v, std::less>); +static_assert(std::is_constructible_v, std::less>); +static_assert(!std::is_constructible_v>, std::less>); + +static_assert(std::is_constructible_v, std::allocator>); +static_assert(std::is_constructible_v, std::allocator>); +static_assert(!std::is_constructible_v>, std::allocator>); + +static_assert(std::is_constructible_v, std::less, std::allocator>); +static_assert(std::is_constructible_v, std::less, std::allocator>); +static_assert( + !std:: + is_constructible_v>, std::less, std::allocator>); + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true. 
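    // Aside (editorial sketch, not from the patch): this file exercises two
    // separate constraint mechanisms -- the container-compatible-range
    // requirement checked by the file-scope static_asserts above (the range's
    // reference type must convert to int, hence RangeOf<int*> is rejected),
    // and the uses_allocator requirement restated here for the
    // allocator-taking overloads.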
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + + int ar[] = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + int expected[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; + { + // flat_multiset(from_range_t, R&&) + // input_range && !common + using M = std::flat_multiset; + using Iter = cpp20_input_iterator; + using Sent = sentinel_wrapper; + using R = std::ranges::subrange; + auto m = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))); + assert(std::ranges::equal(m, expected)); + + // explicit(false) + M m2 = {std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))}; + assert(m2 == m); + } + { + // flat_multiset(from_range_t, R&&) + // greater + using M = std::flat_multiset, std::deque>>; + using Iter = cpp20_input_iterator; + using Sent = sentinel_wrapper; + using R = std::ranges::subrange; + auto m = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))); + assert(std::ranges::equal(m, std::deque>{3, 3, 3, 2, 2, 2, 1, 1, 1})); + } + { + // flat_multiset(from_range_t, R&&) + // contiguous range + using M = std::flat_multiset; + using R = std::ranges::subrange; + auto m = M(std::from_range, R(ar, ar + 9)); + assert(std::ranges::equal(m, expected)); + } + { + // flat_multiset(from_range_t, R&&, const key_compare&) + using C = test_less; + using M = std::flat_multiset>; + using R = std::ranges::subrange; + auto m = M(std::from_range, R(ar, ar + 9), C(3)); + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + + // explicit(false) + M m2 = {std::from_range, R(ar, ar + 9), C(3)}; + assert(m2 == m); + assert(m2.key_comp() == C(3)); + } + { + // flat_multiset(from_range_t, R&&, const Allocator&) + using A1 = test_allocator; + using M = std::flat_multiset, std::vector>; + using R = std::ranges::subrange; + auto m = M(std::from_range, R(ar, ar + 9), A1(5)); + assert(std::ranges::equal(m, expected)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(from_range_t, R&&, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using M = std::flat_multiset, std::deque>; + using R = std::ranges::subrange; + M m = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor + assert(std::ranges::equal(m, expected)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&) + using C = test_less; + using A1 = test_allocator; + using M = std::flat_multiset>; + using R = std::ranges::subrange; + auto m = M(std::from_range, R(ar, ar + 9), C(3), A1(5)); + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(from_range_t, R&&, const key_compare&, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using M = std::flat_multiset, std::deque>; + using R = std::ranges::subrange; + M m = {std::from_range, R(ar, ar + 9), {}, A1(5)}; // implicit ctor + assert(std::ranges::equal(m, expected)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } +} + +int main(int, char**) { + test(); 
+ + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp new file mode 100644 index 0000000000000..76759be7da8e3 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_container.pass.cpp @@ -0,0 +1,147 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset(sorted_equivalent_t, container_type key_cont, const key_compare& comp = key_compare()); +// +// template +// flat_multiset(sorted_equivalent_t, const container_type& key_cont, const Alloc& a); +// template +// flat_multiset(sorted_equivalent_t, const container_type& key_cont, +// const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include + +#include "min_allocator.h" +#include "MoveOnly.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // flat_multiset(sorted_equivalent_t, container_type) + using M = std::flat_multiset; + std::vector ks = {1, 2, 2, 4, 10}; + auto ks2 = ks; + + auto m = M(std::sorted_equivalent, ks); + assert((m == M{1, 2, 2, 4, 10})); + m = M(std::sorted_equivalent, std::move(ks)); + assert(ks.empty()); // it was moved-from + assert((m == M{1, 2, 2, 4, 10})); + + // explicit(false) + M m2 = {std::sorted_equivalent, std::move(ks2)}; + assert(m == m2); + } + { + // flat_multiset(sorted_equivalent_t, container_type) + // non-default container, comparator and allocator type + using Ks = std::deque>; + using M = std::flat_multiset, Ks>; + Ks ks = {10, 4, 4, 2, 1}; + auto m = M(std::sorted_equivalent, ks); + assert((m == M{1, 2, 4, 4, 10})); + m = M(std::sorted_equivalent, std::move(ks)); + assert(ks.empty()); // it was moved-from + assert((m == M{1, 2, 4, 4, 10})); + } + { + // flat_multiset(sorted_equivalent_t, container_type) + // allocator copied into the containers + using A = test_allocator; + using M = std::flat_multiset, std::deque>; + auto ks = std::deque({1, 2, 2, 4, 10}, A(4)); + auto m = M(std::sorted_equivalent, std::move(ks)); + assert(ks.empty()); // it was moved-from + assert((m == M{1, 2, 2, 4, 10})); + assert(std::move(m).extract().get_allocator() == A(4)); + } + { + // flat_multiset(sorted_equivalent_t, container_type , key_compare) + using C = test_less; + using M = std::flat_multiset; + std::vector 
ks = {1, 2, 2, 4, 10}; + + auto m = M(std::sorted_equivalent, ks, C(4)); + assert((m == M{1, 2, 2, 4, 10})); + assert(m.key_comp() == C(4)); + + // explicit(false) + M m2 = {std::sorted_equivalent, ks, C(4)}; + assert(m2 == m); + assert(m2.key_comp() == C(4)); + } + { + // flat_multiset(sorted_equivalent_t, container_type , key_compare, const Allocator&) + using C = test_less; + using A = test_allocator; + using M = std::flat_multiset>; + std::vector ks = {1, 2, 2, 4, 10}; + auto m = M(std::sorted_equivalent, ks, C(4), A(5)); + assert((m == M{1, 2, 2, 4, 10})); + assert(m.key_comp() == C(4)); + assert(M(m).extract().get_allocator() == A(5)); + + // explicit(false) + M m2 = {ks, C(4), A(5)}; + assert(m2 == m); + assert(m2.key_comp() == C(4)); + assert(std::move(m2).extract().get_allocator() == A(5)); + } + { + // flat_multiset(sorted_equivalent_t, container_type , const Allocator&) + using A = test_allocator; + using M = std::flat_multiset, std::deque>; + auto ks = std::deque({1, 2, 2, 4, 10}, A(4)); + auto m = M(std::sorted_equivalent, ks, A(6)); // replaces the allocators + assert(!ks.empty()); // it was an lvalue above + assert((m == M{1, 2, 2, 4, 10})); + assert(M(m).extract().get_allocator() == A(6)); + + // explicit(false) + M m2 = {std::sorted_equivalent, ks, A(6)}; + assert(m2 == m); + assert(std::move(m2).extract().get_allocator() == A(6)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp new file mode 100644 index 0000000000000..955662dd233ef --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_initializer_list.pass.cpp @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// flat_multiset(sorted_equivalent_t s, initializer_list il, +// const key_compare& comp = key_compare()) +// template +// flat_multiset(sorted_equivalent_t, initializer_list il, const Alloc& a); +// template +// flat_multiset(sorted_equivalent_t, initializer_list il, +// const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +template +std::initializer_list il = {1, 2, 4, 4, 5}; + +void test() { + const auto il1 = il; + const auto il2 = il; + + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true. 
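    // Aside (editorial sketch, not from the patch): every list used below is
    // already sorted ({1, 2, 4, 4, 5}); that is a hard precondition of the
    // sorted_equivalent constructors, which skip sorting entirely and simply
    // adopt the sequence. Passing an unsorted list here would be undefined
    // behavior.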
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + using IL = std::initializer_list; + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // initializer_list needs to match exactly + using M = std::flat_multiset; + using C = typename M::key_compare; + static_assert(std::is_constructible_v>); + static_assert(std::is_constructible_v, C>); + static_assert( + std::is_constructible_v, C, std::allocator>); + static_assert( + std::is_constructible_v, std::allocator>); + static_assert(!std::is_constructible_v>); + static_assert(!std::is_constructible_v, C>); + static_assert( + !std:: + is_constructible_v, C, std::allocator>); + static_assert( + !std::is_constructible_v, std::allocator>); + static_assert(!std::is_constructible_v>); + static_assert(!std::is_constructible_v, C>); + static_assert( + !std:: + is_constructible_v, C, std::allocator>); + static_assert( + !std::is_constructible_v, std::allocator>); + } + + { + // flat_multiset(sorted_equivalent_t, initializer_list); + using M = std::flat_multiset; + auto m = M(std::sorted_equivalent, il1); + auto expected = M{1, 2, 4, 4, 5}; + assert(m == expected); + + // explicit(false) + M m2 = {std::sorted_equivalent, il1}; + assert(m2 == m); + } + { + // flat_multiset(sorted_equivalent_t, initializer_list, const key_compare&); + using M = std::flat_multiset>; + auto m = M(std::sorted_equivalent, il1, std::less()); + assert(m == M({1, 2, 4, 4, 5}, std::less<>())); + assert(m.key_comp()(1, 2) == true); + + // explicit(false) + M m2 = {std::sorted_equivalent, il1, std::less()}; + assert(m2 == m); + } + { + // flat_multiset(sorted_equivalent_t, initializer_list, const key_compare&); + // greater + using M = std::flat_multiset, std::deque>>; + std::initializer_list il4{5, 4, 4, 2, 1}; + auto m = M(std::sorted_equivalent, il4, std::greater()); + assert((m == M{5, 4, 4, 2, 1})); + } + { + // flat_multiset(sorted_equivalent_t, initializer_list, const Allocator&) + using A1 = test_allocator; + using M = std::flat_multiset, std::deque>; + auto m = M(std::sorted_equivalent, il2, A1(5)); + auto expected = M{1, 2, 4, 4, 5}; + assert(m == expected); + assert(M(m).extract().get_allocator() == A1(5)); + + // explicit(false) + M m2 = {std::sorted_equivalent, il2, A1(5)}; + assert(m2 == m); + assert(std::move(m2).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(sorted_equivalent_t, initializer_list, const key_compare&, const Allocator&); + using C = test_less; + using A1 = test_allocator; + using M = std::flat_multiset>; + auto m = M(std::sorted_equivalent, il2, C(3), A1(5)); + assert((m == M{1, 2, 4, 4, 5})); + assert(m.key_comp() == C(3)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(sorted_equivalent_t, initializer_list, const key_compare&, const Allocator&); + // explicit(false) + using A1 = test_allocator; + using M = std::flat_multiset, std::deque>; + M m = {std::sorted_equivalent, il2, {}, A1(5)}; // implicit ctor + assert((m == M{1, 2, 4, 4, 5})); + assert(std::move(m).extract().get_allocator() == A1(5)); + } +} + +int main(int, char**) { + test(); + + return 
0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp new file mode 100644 index 0000000000000..9ebe45d71d667 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.cons/sorted_iter_iter.pass.cpp @@ -0,0 +1,160 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// flat_multiset(sorted_equivalent_t, InputIterator first, InputIterator last, const key_compare& comp = key_compare()); +// template +// flat_multiset(sorted_equivalent_t, InputIterator first, InputIterator last, const Alloc& a); +// template +// flat_multiset(sorted_equivalent_t, InputIterator first, InputIterator last, const key_compare& comp, const Allocator& a); + +#include +#include +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +void test() { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multiset; + using M2 = std::flat_multiset; + using Iter1 = typename M1::iterator; + using Iter2 = typename M2::iterator; + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator); + // cpp17_input_iterator + using M = std::flat_multiset; + int ar[] = {1, 2, 2, 4, 5}; + auto m = M(std::sorted_equivalent, cpp17_input_iterator(ar), cpp17_input_iterator(ar + 5)); + auto expected = M{1, 2, 2, 4, 5}; + assert(m == expected); + + // explicit(false) + M m2 = {std::sorted_equivalent, cpp17_input_iterator(ar), cpp17_input_iterator(ar + 5)}; + assert(m2 == m); + } + { + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator); + // contiguous iterator + using C = test_less; + using M = std::flat_multiset>>; + int ar[] = {1, 2, 4, 4, 5}; + auto m = M(std::sorted_equivalent, ar, ar + 5); + auto expected = M{1, 2, 4, 4, 5}; + assert(m == expected); + } + { + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&); + // cpp_17_input_iterator + using M = std::flat_multiset>; + int ar[] = {1, 2, 4, 4, 5}; + auto m = M(std::sorted_equivalent, + cpp17_input_iterator(ar), + cpp17_input_iterator(ar + 5), + std::less()); + assert(m == M({1, 2, 4, 4, 5}, std::less<>())); + assert(m.key_comp()(1, 2) == true); + + // explicit(false) + M m2 = {std::sorted_equivalent, + cpp17_input_iterator(ar), + cpp17_input_iterator(ar + 5), + std::less()}; + assert(m2 == m); + } + { + // 
flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&); + // greater + using M = std::flat_multiset, std::deque>>; + int ar[] = {5, 4, 4, 2, 1}; + auto m = M(std::sorted_equivalent, + cpp17_input_iterator(ar), + cpp17_input_iterator(ar + 5), + std::greater()); + assert((m == M{5, 4, 4, 2, 1})); + } + { + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&); + // contiguous iterator + using C = test_less; + using M = std::flat_multiset>>; + int ar[1] = {42}; + auto m = M(std::sorted_equivalent, ar, ar, C(5)); + assert(m.empty()); + assert(m.key_comp() == C(5)); + } + { + // flat_multiset(sorted_equivalent_t, InputIterator , InputIterator, const Allocator&) + using A1 = test_allocator; + using M = std::flat_multiset, std::vector>; + int ar[] = {1, 2, 4, 4, 5}; + auto m = M(std::sorted_equivalent, ar, ar + 5, A1(5)); + auto expected = M{1, 2, 4, 4, 5}; + assert(m == expected); + assert(M(m).extract().get_allocator() == A1(5)); + + // explicit(false) + M m2 = {std::sorted_equivalent, ar, ar + 5, A1(5)}; + assert(m2 == m); + assert(std::move(m2).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&); + using C = test_less; + using A1 = test_allocator; + using M = std::flat_multiset>; + int ar[] = {1, 2, 4, 4, 5}; + auto m = M(std::sorted_equivalent, ar, ar + 5, C(3), A1(5)); + assert((m == M{1, 2, 4, 4, 5})); + assert(m.key_comp() == C(3)); + assert(std::move(m).extract().get_allocator() == A1(5)); + } + { + // flat_multiset(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&); + // explicit(false) + using A1 = test_allocator; + using M = std::flat_multiset, std::deque>; + int ar[] = {1, 2, 4, 4, 5}; + M m = {std::sorted_equivalent, ar, ar + 5, {}, A1(5)}; // implicit ctor + assert((m == M{1, 2, 4, 4, 5})); + assert(std::move(m).extract().get_allocator() == A1(5)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp new file mode 100644 index 0000000000000..21f3c918dec0d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if.pass.cpp @@ -0,0 +1,113 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// typename flat_multiset::size_type +// erase_if(flat_multiset& c, Predicate pred); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "test_allocator.h" +#include "min_allocator.h" + +// Verify that `flat_multiset` (like `set`) does NOT support std::erase. 
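+// Illustrative aside (not part of the original test): uniform erasure for
+// flat_multiset is spelled std::erase_if and returns the number of elements
+// removed. A minimal sketch:
+//
+//   #include <cassert>
+//   #include <flat_set>
+//   int main() {
+//     std::flat_multiset<int> s{1, 1, 2, 3};
+//     auto n = std::erase_if(s, [](int x) { return x == 1; });
+//     assert(n == 2 && (s == std::flat_multiset<int>{2, 3}));
+//   }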
+//
+template <class S>
+concept HasStdErase = requires(S& s, typename S::value_type x) { std::erase(s, x); };
+static_assert(HasStdErase<std::vector<int>>);
+static_assert(!HasStdErase<std::flat_multiset<int>>);
+
+template <class M>
+M make(std::initializer_list<int> vals) {
+  M ret;
+  for (int v : vals)
+    ret.emplace(v);
+  return ret;
+}
+
+template <class M, class Pred>
+void test0(
+    std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+  M s = make<M>(vals);
+  ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
+  assert(expected_erased_count == std::erase_if(s, p));
+  assert(s == make<M>(expected));
+}
+
+struct NotBool {
+  bool b;
+  explicit operator bool() const { return b; }
+};
+
+template <class S>
+void test_one() {
+  // Test all the plausible signatures for this predicate.
+  auto is1 = [](typename S::const_reference v) { return v == 1; };
+  auto is2 = [](typename S::value_type v) { return v == 2; };
+  auto is3 = [](const typename S::value_type& v) { return v == 3; };
+  auto is4 = [](auto v) { return v == 4; };
+  auto True = [](const auto&) { return true; };
+  auto False = [](auto&&) { return false; };
+  auto nonBoolIs1 = [](const auto& v) { return NotBool{v == 1}; };
+
+  test0<S>({}, is1, {}, 0);
+
+  test0<S>({1}, is1, {}, 1);
+  test0<S>({1, 1, 1}, is1, {}, 3);
+  test0<S>({1}, is2, {1}, 0);
+  test0<S>({1, 1, 1}, is2, {1, 1, 1}, 0);
+
+  test0<S>({1, 2}, is1, {2}, 1);
+  test0<S>({1, 1, 1, 2, 2}, is1, {2, 2}, 3);
+  test0<S>({1, 2}, is2, {1}, 1);
+  test0<S>({1, 1, 1, 2, 2}, is2, {1, 1, 1}, 2);
+  test0<S>({1, 2}, is3, {1, 2}, 0);
+  test0<S>({1, 1, 1, 2, 2}, is3, {1, 1, 1, 2, 2}, 0);
+
+  test0<S>({1, 2, 3}, is1, {2, 3}, 1);
+  test0<S>({1, 1, 2, 2, 3, 3}, is1, {2, 2, 3, 3}, 2);
+  test0<S>({1, 2, 3}, is2, {1, 3}, 1);
+  test0<S>({1, 1, 2, 2, 3, 3}, is2, {1, 1, 3, 3}, 2);
+  test0<S>({1, 2, 3}, is3, {1, 2}, 1);
+  test0<S>({1, 1, 2, 2, 3, 3}, is3, {1, 1, 2, 2}, 2);
+  test0<S>({1, 2, 3}, is4, {1, 2, 3}, 0);
+  test0<S>({1, 1, 2, 2, 3, 3}, is4, {1, 1, 2, 2, 3, 3}, 0);
+
+  test0<S>({1, 2, 3}, True, {}, 3);
+  test0<S>({1, 2, 2, 3, 3, 3}, True, {}, 6);
+  test0<S>({1, 2, 3}, False, {1, 2, 3}, 0);
+  test0<S>({1, 2, 2, 3, 3, 3}, False, {1, 2, 2, 3, 3, 3}, 0);
+
+  test0<S>({1, 2, 3}, nonBoolIs1, {2, 3}, 1);
+  test0<S>({1, 1, 2, 2, 3}, nonBoolIs1, {2, 2, 3}, 2);
+}
+
+void test() {
+  test_one<std::flat_multiset<int>>();
+  test_one<std::flat_multiset<int, std::less<int>, std::vector<int, min_allocator<int>>>>();
+  test_one<std::flat_multiset<int, std::greater<int>, std::vector<int, test_allocator<int>>>>();
+  test_one<std::flat_multiset<int, std::less<int>, std::deque<int, min_allocator<int>>>>();
+  test_one<std::flat_multiset<int, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+  test_one<std::flat_multiset<long>>();
+  test_one<std::flat_multiset<double>>();
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if_exceptions.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if_exceptions.pass.cpp
new file mode 100644
index 0000000000000..64dc110006a5a
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.erasure/erase_if_exceptions.pass.cpp
@@ -0,0 +1,132 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: no-exceptions
+
+// <flat_set>
+
+// template <class Key, class Compare, class KeyContainer, class Predicate>
+//   typename flat_multiset<Key, Compare, KeyContainer>::size_type
+//   erase_if(flat_multiset<Key, Compare, KeyContainer>& c, Predicate pred);
+// If any member function in [flat.set.defn] exits via an exception, the invariant is restored.
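+// ("Restored" means the container is back in a valid state that satisfies the
+// sorted-sequence invariant, though its contents are unspecified and may be
+// empty; the tests below sweep every possible throw point with a counting
+// comparator/assignment and re-check the invariant after each throw.)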
+// (This is not a member function, but let's respect the invariant anyway.) + +#include +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" + +struct Counter { + int c1, c2, throws; + void tick() { + c1 -= 1; + if (c1 == 0) { + c1 = c2; + throws += 1; + throw 42; + } + } +}; +Counter g_counter = {0, 0, 0}; + +struct ThrowingAssignment { + ThrowingAssignment(int i) : i_(i) {} + ThrowingAssignment(const ThrowingAssignment&) = default; + ThrowingAssignment& operator=(const ThrowingAssignment& rhs) { + g_counter.tick(); + i_ = rhs.i_; + g_counter.tick(); + return *this; + } + operator int() const { return i_; } + int i_; +}; + +struct ThrowingComparator { + bool operator()(const ThrowingAssignment& a, const ThrowingAssignment& b) const { + g_counter.tick(); + return a.i_ < b.i_; + } +}; + +struct ErasurePredicate { + bool operator()(const auto& x) const { return (3 <= x && x <= 5); } +}; + +void test() { + { + using M = std::flat_multiset; + for (int first_throw = 1; first_throw < 99; ++first_throw) { + for (int second_throw = 1; second_throw < 99; ++second_throw) { + g_counter = {0, 0, 0}; + M m = M({1, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8}); + try { + g_counter = {first_throw, second_throw, 0}; + auto n = std::erase_if(m, ErasurePredicate()); + assert(n == 5); + // If it didn't throw at all, we're done. + g_counter = {0, 0, 0}; + assert((m == M{1, 1, 2, 6, 7, 8})); + first_throw = 99; // "done" + break; + } catch (int ex) { + assert(ex == 42); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + if (g_counter.throws == 1) { + // We reached the first throw but not the second throw. + break; + } + } + } + } + } + + { + using M = std::flat_multiset>; + for (int first_throw = 1; first_throw < 99; ++first_throw) { + for (int second_throw = 1; second_throw < 99; ++second_throw) { + g_counter = {0, 0, 0}; + std::deque container = {5, 6, 7, 8}; + container.insert(container.begin(), {1, 2, 3, 4}); + M m = M(std::move(container)); + try { + g_counter = {first_throw, second_throw, 0}; + auto n = std::erase_if(m, ErasurePredicate()); + assert(n == 3); + // If it didn't throw at all, we're done. + g_counter = {0, 0, 0}; + assert((m == M{1, 2, 6, 7, 8})); + first_throw = 99; // "done" + break; + } catch (int ex) { + assert(ex == 42); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + if (g_counter.throws == 1) { + // We reached the first throw but not the second throw. + break; + } + } + } + } + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp new file mode 100644 index 0000000000000..809f03df47977 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator.pass.cpp @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator begin() noexcept; +// const_iterator begin() const noexcept +// iterator end() noexcept; +// const_iterator end() const noexcept; +// +// const_iterator cbegin() const noexcept; +// const_iterator cend() const noexcept; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + + M m = {1, 2, 3, 4, 1, 4, 2, 3, 1}; + int expected[] = {1, 1, 1, 2, 2, 3, 3, 4, 4}; + const M& cm = m; + ASSERT_SAME_TYPE(decltype(m.begin()), typename M::iterator); + ASSERT_SAME_TYPE(decltype(m.cbegin()), typename M::const_iterator); + ASSERT_SAME_TYPE(decltype(cm.begin()), typename M::const_iterator); + ASSERT_SAME_TYPE(decltype(m.end()), typename M::iterator); + ASSERT_SAME_TYPE(decltype(m.cend()), typename M::const_iterator); + ASSERT_SAME_TYPE(decltype(cm.end()), typename M::const_iterator); + static_assert(noexcept(m.begin())); + static_assert(noexcept(cm.begin())); + static_assert(noexcept(m.cbegin())); + static_assert(noexcept(m.end())); + static_assert(noexcept(cm.end())); + static_assert(noexcept(m.cend())); + assert(m.size() == 9); + assert(std::distance(m.begin(), m.end()) == 9); + assert(std::distance(cm.begin(), cm.end()) == 9); + assert(std::distance(m.cbegin(), m.cend()) == 9); + typename M::iterator i; // default-construct + i = m.begin(); // move-assignment + typename M::const_iterator k = i; // converting constructor + assert(i == k); // comparison + for (int j = 0; j < 9; ++j, ++i) { // pre-increment + assert(*i == expected[j]); // operator* + } + assert(i == m.end()); + for (int j = 8; j >= 0; --j) { + --i; // pre-decrement + assert((*i) == expected[j]); + } + assert(i == m.begin()); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + { + // N3644 testing + using C = std::flat_multiset; + C::iterator ii1{}, ii2{}; + C::iterator ii4 = ii1; + C::const_iterator cii{}; + assert(ii1 == ii2); + assert(ii1 == ii4); + assert(!(ii1 != ii2)); + + assert((ii1 == cii)); + assert((cii == ii1)); + assert(!(ii1 != cii)); + assert(!(cii != ii1)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp new file mode 100644 index 0000000000000..d26e3446072ef --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_comparison.pass.cpp @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multiset iterators should be C++20 random access iterators + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using KI = typename KeyContainer::iterator; + using I = M::iterator; + using CI = M::const_iterator; + using RI = M::reverse_iterator; + using CRI = M::const_reverse_iterator; + + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + + M m = {1, 1, 3, 4}; + + I i1 = m.begin(); + I i2 = m.begin() + 1; + + assert(i1 == i1); + assert(!(i1 != i1)); + assert(i1 != i2); + assert(!(i1 == i2)); + assert(i1 < i2); + assert(!(i1 < i1)); + assert(i1 <= i1); + assert(i1 <= i2); + assert(!(i2 <= i1)); + assert(i2 > i1); + assert(!(i2 > i2)); + assert(i2 >= i1); + assert(i2 >= i2); + assert(!(i1 >= i2)); + + CI ci1 = m.cbegin(); + CI ci2 = m.cbegin() + 1; + assert(ci1 == ci1); + assert(!(ci1 != ci1)); + assert(ci1 != ci2); + assert(!(ci1 == ci2)); + assert(ci1 < ci2); + assert(!(ci1 < ci1)); + assert(ci1 <= ci1); + assert(ci1 <= ci2); + assert(!(ci2 <= ci1)); + assert(ci2 > ci1); + assert(!(ci2 > ci2)); + assert(ci2 >= ci1); + assert(ci2 >= ci2); + assert(!(ci1 >= ci2)); + + RI ri1 = m.rbegin(); + RI ri2 = m.rbegin() + 1; + assert(ri1 == ri1); + assert(!(ri1 != ri1)); + assert(ri1 != ri2); + assert(!(ri1 == ri2)); + assert(ri1 < ri2); + assert(!(ri1 < ri1)); + assert(ri1 <= ri1); + assert(ri1 <= ri2); + assert(!(ri2 <= ri1)); + assert(ri2 > ri1); + assert(!(ri2 > ri2)); + assert(ri2 >= ri1); + assert(ri2 >= ri2); + assert(!(ri1 >= ri2)); + + CRI cri1 = m.crbegin(); + CRI cri2 = m.crbegin() + 1; + assert(cri1 == cri1); + assert(!(cri1 != cri1)); + assert(cri1 != cri2); + assert(!(cri1 == cri2)); + assert(cri1 < cri2); + assert(!(cri1 < cri1)); + assert(cri1 <= cri1); + assert(cri1 <= cri2); + assert(!(cri2 <= cri1)); + assert(cri2 > cri1); + assert(!(cri2 > cri2)); + assert(cri2 >= cri1); + assert(cri2 >= cri2); + assert(!(cri1 >= cri2)); + + if constexpr (std::three_way_comparable) { + static_assert(std::three_way_comparable); // ...of course the wrapped iterators still support <=>. 
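+    // (Note: flat_multiset's iterators mirror the underlying KeyContainer's
+    // iterators, so operator<=> is available exactly when the wrapped iterator
+    // models three_way_comparable; for std::vector and std::deque the
+    // comparison category is std::strong_ordering, as asserted below.)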
+ static_assert(std::three_way_comparable); + static_assert(std::three_way_comparable); + static_assert(std::three_way_comparable); + static_assert(std::same_as I()), std::strong_ordering>); + static_assert(std::same_as CI()), std::strong_ordering>); + static_assert(std::same_as CI()), std::strong_ordering>); + static_assert(std::same_as RI()), std::strong_ordering>); + static_assert(std::same_as CRI()), std::strong_ordering>); + static_assert(std::same_as CRI()), std::strong_ordering>); + + assert(i1 <=> i1 == std::strong_ordering::equivalent); + assert(i1 <=> i2 == std::strong_ordering::less); + assert(i2 <=> i1 == std::strong_ordering::greater); + + assert(ci1 <=> ci1 == std::strong_ordering::equivalent); + assert(ci1 <=> ci2 == std::strong_ordering::less); + assert(ci2 <=> ci1 == std::strong_ordering::greater); + + assert(ri1 <=> ri1 == std::strong_ordering::equivalent); + assert(ri1 <=> ri2 == std::strong_ordering::less); + assert(ri2 <=> ri1 == std::strong_ordering::greater); + + assert(cri1 <=> cri1 == std::strong_ordering::equivalent); + assert(cri1 <=> cri2 == std::strong_ordering::less); + assert(cri2 <=> cri1 == std::strong_ordering::greater); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_concept_conformance.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_concept_conformance.compile.pass.cpp new file mode 100644 index 0000000000000..0745b8f2433bc --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/iterator_concept_conformance.compile.pass.cpp @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator, const_iterator, reverse_iterator, const_reverse_iterator + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using C = std::flat_multiset, KeyContainer>; + using I = C::iterator; + using CI = C::const_iterator; + using RI = C::reverse_iterator; + using CRI = C::const_reverse_iterator; + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::indirectly_movable_storable); + static_assert(std::indirectly_movable_storable); + static_assert(std::indirectly_movable_storable); + static_assert(std::indirectly_movable_storable); + + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); +} + +void test() { + test>(); + test>(); + test>(); + test>>(); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/range_concept_conformance.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/range_concept_conformance.compile.pass.cpp new file mode 100644 index 0000000000000..ccb7b94e0b3f5 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/range_concept_conformance.compile.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include +#include +#include +#include +#include "MinSequenceContainer.h" +#include "min_allocator.h" + +template +void test() { + { + using Key = typename KeyContainer::value_type; + using C = std::flat_multiset, KeyContainer>; + + static_assert(std::same_as, typename C::iterator>); + static_assert(std::ranges::random_access_range); + static_assert(std::ranges::common_range); + static_assert(std::ranges::input_range); + static_assert(!std::ranges::view); + static_assert(std::ranges::sized_range); + static_assert(!std::ranges::borrowed_range); + static_assert(std::ranges::viewable_range); + + static_assert(std::same_as, typename C::const_iterator>); + static_assert(std::ranges::random_access_range); + static_assert(std::ranges::common_range); + static_assert(std::ranges::input_range); + static_assert(!std::ranges::view); + static_assert(std::ranges::sized_range); + static_assert(!std::ranges::borrowed_range); + static_assert(!std::ranges::viewable_range); + } +} + +void test() { + test>(); + test>(); + test>(); + test>>(); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp new file mode 100644 index 0000000000000..9d443ef8784e2 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.iterators/reverse_iterator.pass.cpp @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// reverse_iterator rbegin() noexcept; +// const_reverse_iterator rbegin() const noexcept; +// reverse_iterator rend() noexcept; +// const_reverse_iterator rend() const noexcept; +// +// const_reverse_iterator crbegin() const noexcept; +// const_reverse_iterator crend() const noexcept; + +#include +#include +#include +#include +#include +#include + +#include + +#include "test_macros.h" +#include + +void test() { + { + using M = std::flat_multiset, std::deque>; + M m = {1, 1, 2, 2, 3, 4}; + int expected[] = {1, 1, 2, 2, 3, 4}; + const M& cm = m; + ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator); + static_assert(noexcept(m.rbegin())); + static_assert(noexcept(cm.rbegin())); + static_assert(noexcept(m.crbegin())); + static_assert(noexcept(m.rend())); + static_assert(noexcept(cm.rend())); + static_assert(noexcept(m.crend())); + assert(m.size() == 6); + assert(std::distance(m.rbegin(), m.rend()) == 6); + assert(std::distance(cm.rbegin(), cm.rend()) == 6); + assert(std::distance(m.crbegin(), m.crend()) == 6); + assert(std::distance(cm.crbegin(), cm.crend()) == 6); + M::reverse_iterator i; // default-construct + ASSERT_SAME_TYPE(decltype(*i), const int&); + i = m.rbegin(); // move-assignment + M::const_reverse_iterator k = i; // converting constructor + assert(i == k); // comparison + for (int j = 5; j >= 0; --j, ++i) { // pre-increment + assert(*i == expected[j]); + } + assert(i == m.rend()); + for (int j = 0; j <= 5; ++j) { + --i; // pre-decrement + assert(*i == expected[j]); + } + assert(i == m.rbegin()); + } + { + // N3644 testing + using C = std::flat_multiset; + C::reverse_iterator ii1{}, ii2{}; + C::reverse_iterator ii4 = ii1; + C::const_reverse_iterator cii{}; + assert(ii1 == ii2); + assert(ii1 == ii4); + assert(!(ii1 != ii2)); + + assert((ii1 == cii)); + assert((cii == ii1)); + assert(!(ii1 != cii)); + assert(!(cii != ii1)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp new file mode 100644 index 0000000000000..4d01ece7ed6a6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/clear.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multiset + +// void clear() noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// test noexcept + +template +concept NoExceptClear = requires(T t) { + { t.clear() } noexcept; +}; + +static_assert(NoExceptClear>); +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert(NoExceptClear, ThrowOnMoveContainer>>); +#endif + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + { + M m = {1, 1, 3, 5, 2, 3, 4, 5}; + assert(m.size() == 8); + ASSERT_NOEXCEPT(m.clear()); + ASSERT_SAME_TYPE(decltype(m.clear()), void); + m.clear(); + assert(m.size() == 0); + } + { + // was empty + M m; + assert(m.size() == 0); + m.clear(); + assert(m.size() == 0); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp new file mode 100644 index 0000000000000..3ef13964c905e --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace.pass.cpp @@ -0,0 +1,136 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// iterator emplace(Args&&... 
args); + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "../../../Emplaceable.h" +#include "DefaultOnly.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using R = typename M::iterator; + { + // was empty + M m; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*r == 2); + } + { + // key does not exist and inserted at the begin + M m = {3, 3, 3, 7}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2)); + assert(r == m.begin()); + assert(m.size() == 5); + assert(*r == 2); + } + { + // key does not exist and inserted in the middle + M m = {1, 1, 3, 4}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(*r == 2); + } + { + // key does not exist and inserted at the end + M m = {1, 1}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(*r == 2); + } + { + // key already exists and original at the begin + M m = {2, 2, 5, 6}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(*r == 2); + } + { + // key already exists and original in the middle + M m = {0, 2, 2, 4}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2)); + assert(r == m.begin() + 3); + assert(m.size() == 5); + assert(*r == 2); + } + { + // key already exists and original at the end + M m = {0, 1, 2}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2)); + assert(r == m.begin() + 3); + assert(m.size() == 4); + assert(*r == 2); + } +} + +template +void test_emplaceable() { + using M = std::flat_multiset, KeyContainer>; + using R = typename M::iterator; + + M m; + ASSERT_SAME_TYPE(decltype(m.emplace()), R); + R r = m.emplace(2, 0.0); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*r == Emplaceable(2, 0.0)); + r = m.emplace(1, 3.5); + assert(r == m.begin()); + assert(m.size() == 2); + assert(*r == Emplaceable(1, 3.5)); + r = m.emplace(1, 3.5); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(*r == Emplaceable(1, 3.5)); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + test_emplaceable>(); + test_emplaceable>(); + test_emplaceable>(); + test_emplaceable>>(); +} + +void test_exception() { + auto emplace_func = [](auto& m, auto key_arg) { m.emplace(key_arg); }; + test_emplace_exception_guarantee(emplace_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp new file mode 100644 index 0000000000000..41a2e9c4ce115 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/emplace_hint.pass.cpp @@ -0,0 +1,241 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// iterator emplace_hint(const_iterator position, Args&&... args); + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../../../Emplaceable.h" +#include "DefaultOnly.h" +#include "min_allocator.h" +#include "../helpers.h" + +struct CompareTensDigit { + bool operator()(auto lhs, auto rhs) const { return (lhs / 10) < (rhs / 10); } +}; + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using R = M::iterator; + + { + // was empty + M m; + std::same_as decltype(auto) r = m.emplace_hint(m.end(), typename M::value_type(2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*r == 2); + } + { + // hints correct and no duplicates + M m = {0, 1, 3}; + auto hint = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 4); + assert(*r == 2); + } + { + // hints correct at the begin + M m = {3, 4}; + auto hint = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin()); + assert(m.size() == 3); + assert(*r == 2); + } + { + // hints correct in the middle + M m = {0, 1, 3, 4}; + auto hint = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(*r == 2); + } + { + // hints correct at the end + M m = {0, 1}; + auto hint = m.end(); + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(*r == 2); + } + { + // hints correct but key already exists + M m = {0, 1, 2, 3, 4}; + auto hint = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 6); + assert(*r == 2); + } + { + // hint correct and at the first duplicate + using M2 = std::flat_multiset; + using R2 = M2::iterator; + M2 m{0, 10, 20, 25, 30}; + auto hint = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(21)); + assert(r == m.begin() + 2); + assert(m.size() == 6); + assert(*r == 21); + } + { + // hint correct and in-between duplicates + using M2 = std::flat_multiset; + using R2 = M2::iterator; + M2 m{0, 10, 20, 21, 22, 30}; + auto hint = m.begin() + 4; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(23)); + assert(r == m.begin() + 4); + assert(m.size() == 7); + assert(*r == 23); + assert(*std::next(r) == 22); + } + { + // hint correct and after duplicates + using M2 = std::flat_multiset; + using R2 = M2::iterator; + M2 m{0, 10, 20, 21, 22, 30}; + auto hint = m.begin() + 5; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(23)); + assert(r == m.begin() + 5); + assert(m.size() == 7); + assert(*r == 23); + assert(*std::next(r) == 30); + } + { + // hints incorrect and no duplicates + M m = {0, 1, 3}; + auto hint = m.begin() + 1; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 4); + assert(*r == 2); + } + { + // hints incorrectly at the begin + M m = {1, 4}; + 
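+    // (Note: with a wrong hint the insertion is equivalent to a plain emplace:
+    // the element still ends up in its sorted position, so inserting 2 with a
+    // hint of begin() lands at begin() + 1, right after the 1.)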
auto hint = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(*r == 2); + } + { + // hints incorrectly in the middle + M m = {0, 1, 3, 4}; + auto hint = m.begin() + 1; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(*r == 2); + } + { + // hints incorrectly at the end + M m = {0, 3}; + auto hint = m.end(); + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(*r == 2); + } + { + // hints incorrect and key already exists + M m = {0, 1, 2, 3, 4}; + auto hint = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(2)); + assert(r == m.begin() + 2); + assert(m.size() == 6); + assert(*r == 2); + } + { + // hint incorrect and before the first duplicate + using M2 = std::flat_multiset; + using R2 = M2::iterator; + M2 m{0, 10, 20, 21, 22, 30}; + auto hint = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(23)); + assert(r == m.begin() + 2); + assert(m.size() == 7); + assert(*r == 23); + assert(*std::next(r) == 20); + } + { + // hint incorrect and after the last duplicate + using M2 = std::flat_multiset; + using R2 = M2::iterator; + M2 m{0, 10, 20, 21, 22, 30, 40}; + auto hint = m.begin() + 6; + std::same_as decltype(auto) r = m.emplace_hint(hint, typename M::value_type(23)); + assert(r == m.begin() + 5); + assert(m.size() == 8); + assert(*r == 23); + assert(*std::next(r) == 30); + } +} + +template +void test_emplaceable() { + using M = std::flat_multiset, KeyContainer>; + using R = M::iterator; + + M m; + ASSERT_SAME_TYPE(decltype(m.emplace_hint(m.cbegin())), R); + R r = m.emplace_hint(m.end(), 2, 0.0); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*m.begin() == Emplaceable(2, 0.0)); + r = m.emplace_hint(m.end(), 1, 3.5); + assert(r == m.begin()); + assert(m.size() == 2); + assert(*m.begin() == Emplaceable(1, 3.5)); + r = m.emplace_hint(m.end(), 1, 3.5); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(*r == Emplaceable(1, 3.5)); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + test_emplaceable>(); + test_emplaceable>(); + test_emplaceable>(); + test_emplaceable>>(); +} + +void test_exception() { + auto emplace_func = [](auto& m, auto key_arg) { m.emplace_hint(m.begin(), key_arg); }; + test_emplace_exception_guarantee(emplace_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp new file mode 100644 index 0000000000000..8418efa67bb23 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter.pass.cpp @@ -0,0 +1,114 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator erase(iterator position); +// iterator erase(const_iterator position); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using I = M::iterator; + + int ar[] = { + 1, + 3, + 3, + 3, + 5, + 5, + 7, + 8, + }; + M m(ar, ar + sizeof(ar) / sizeof(ar[0])); + + auto make = [](std::initializer_list il) { + M m2; + for (int i : il) { + m2.emplace(i); + } + return m2; + }; + assert(m.size() == 8); + assert(m == make({1, 3, 3, 3, 5, 5, 7, 8})); + std::same_as decltype(auto) i1 = m.erase(std::next(m.cbegin(), 3)); + assert(m.size() == 7); + assert(i1 == std::next(m.begin(), 3)); + assert(m == make({1, 3, 3, 5, 5, 7, 8})); + + std::same_as decltype(auto) i2 = m.erase(std::next(m.begin(), 0)); + assert(m.size() == 6); + assert(i2 == m.begin()); + assert(m == make({3, 3, 5, 5, 7, 8})); + + std::same_as decltype(auto) i3 = m.erase(std::next(m.cbegin(), 5)); + assert(m.size() == 5); + assert(i3 == m.end()); + assert(m == make({3, 3, 5, 5, 7})); + + std::same_as decltype(auto) i4 = m.erase(std::next(m.begin(), 1)); + assert(m.size() == 4); + assert(i4 == std::next(m.begin())); + assert(m == make({3, 5, 5, 7})); + + std::same_as decltype(auto) i5 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 3); + assert(i5 == std::next(m.begin(), 2)); + assert(m == make({3, 5, 7})); + + std::same_as decltype(auto) i6 = m.erase(std::next(m.begin(), 2)); + assert(m.size() == 2); + assert(i6 == std::next(m.begin(), 2)); + assert(m == make({3, 5})); + + std::same_as decltype(auto) i7 = m.erase(std::next(m.cbegin(), 0)); + assert(m.size() == 1); + assert(i7 == std::next(m.begin(), 0)); + assert(m == make({5})); + + std::same_as decltype(auto) i8 = m.erase(m.begin()); + assert(m.size() == 0); + assert(i8 == m.begin()); + assert(i8 == m.end()); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto erase_function = [](auto& m, auto) { m.erase(m.begin() + 2); }; + test_erase_exception_guarantee(erase_function); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp new file mode 100644 index 0000000000000..2d54fef17b6c0 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_iter_iter.pass.cpp @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator erase(const_iterator first, const_iterator last); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using I = M::iterator; + + auto make = [](std::initializer_list il) { + M m; + for (int i : il) { + m.emplace(i); + } + return m; + }; + + int ar[] = { + 1, + 1, + 3, + 3, + 5, + 6, + 6, + 8, + }; + M m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.size() == 8); + std::same_as decltype(auto) i1 = m.erase(m.cbegin(), m.cbegin()); + assert(m.size() == 8); + assert(i1 == m.begin()); + assert(m == make({1, 1, 3, 3, 5, 6, 6, 8})); + + std::same_as decltype(auto) i2 = m.erase(m.cbegin(), std::next(m.cbegin(), 2)); + assert(m.size() == 6); + assert(i2 == m.begin()); + assert(m == make({3, 3, 5, 6, 6, 8})); + + std::same_as decltype(auto) i3 = m.erase(std::next(m.cbegin(), 2), std::next(m.cbegin(), 6)); + assert(m.size() == 2); + assert(i3 == std::next(m.begin(), 2)); + assert(m == make({3, 3})); + + std::same_as decltype(auto) i4 = m.erase(m.cbegin(), m.cend()); + assert(m.size() == 0); + assert(i4 == m.begin()); + assert(i4 == m.end()); + + // was empty + std::same_as decltype(auto) i5 = m.erase(m.cbegin(), m.cend()); + assert(m.size() == 0); + assert(i5 == m.begin()); + assert(i5 == m.end()); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto erase_function = [](auto& m, auto) { m.erase(m.begin(), m.begin() + 2); }; + test_erase_exception_guarantee(erase_function); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp new file mode 100644 index 0000000000000..8175afa5b626e --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key.pass.cpp @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// size_type erase(const key_type& k); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template > +void test_one() { + using M = std::flat_multiset; + + auto make = [](std::initializer_list il) { + M m; + for (int i : il) { + m.emplace(i); + } + return m; + }; + M m = make({1, 1, 3, 5, 5, 5, 7, 8}); + ASSERT_SAME_TYPE(decltype(m.erase(9)), typename M::size_type); + auto n = m.erase(9); + assert(n == 0); + assert(m == make({1, 1, 3, 5, 5, 5, 7, 8})); + n = m.erase(4); + assert(n == 0); + assert(m == make({1, 1, 3, 5, 5, 5, 7, 8})); + n = m.erase(1); + assert(n == 2); + assert(m == make({3, 5, 5, 5, 7, 8})); + n = m.erase(8); + assert(n == 1); + assert(m == make({3, 5, 5, 5, 7})); + n = m.erase(3); + assert(n == 1); + assert(m == make({5, 5, 5, 7})); + n = m.erase(4); + assert(n == 0); + assert(m == make({5, 5, 5, 7})); + n = m.erase(6); + assert(n == 0); + assert(m == make({5, 5, 5, 7})); + n = m.erase(7); + assert(n == 1); + assert(m == make({5, 5, 5})); + n = m.erase(2); + assert(n == 0); + assert(m == make({5, 5, 5})); + n = m.erase(5); + assert(n == 3); + assert(m.empty()); + // was empty + n = m.erase(5); + assert(n == 0); + assert(m.empty()); +} + +void test() { + test_one>(); + test_one, std::greater<>>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto erase_function = [](auto& m, auto key_arg) { + using Set = std::decay_t; + using Key = typename Set::key_type; + const Key key{key_arg}; + m.erase(key); + }; + test_erase_exception_guarantee(erase_function); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp new file mode 100644 index 0000000000000..a8765495d91d4 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/erase_key_transparent.pass.cpp @@ -0,0 +1,165 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// size_type erase(K&& k); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
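+// Illustrative aside (not part of the original test): with a transparent
+// comparator such as std::less<>, erase can take any type comparable with the
+// keys, so no key_type temporary needs to be materialized. A minimal sketch:
+//
+//   #include <cassert>
+//   #include <flat_set>
+//   #include <functional>
+//   #include <string>
+//   int main() {
+//     std::flat_multiset<std::string, std::less<>> s{"a", "b", "b"};
+//     auto n = s.erase("b");  // compares const char* against the keys directly
+//     assert(n == 2);
+//   }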
+template +concept CanErase = requires(M m, Transparent k) { m.erase(k); }; +using TransparentSet = std::flat_multiset; +using NonTransparentSet = std::flat_multiset; +static_assert(CanErase); +static_assert(!CanErase); +static_assert(!CanErase); +static_assert(!CanErase); + +template +struct HeterogeneousKey { + explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {} + operator It() && { return it_; } + auto operator<=>(Key key) const { return key_ <=> key; } + friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) { + assert(false); + return false; + } + Key key_; + It it_; +}; + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + + M m = {1, 2, 3, 3, 4}; + ASSERT_SAME_TYPE(decltype(m.erase(9)), typename M::size_type); + auto n = m.erase(3); // erase(K&&) [with K=int] + assert(n == 2); + assert((m == M{1, 2, 4})); + typename M::key_type lvalue = 2; + n = m.erase(lvalue); // erase(K&&) [with K=int&] + assert(n == 1); + assert((m == M{1, 4})); + const typename M::key_type const_lvalue = 1; + n = m.erase(const_lvalue); // erase(const key_type&) + assert(n == 1); + assert((m == M{4})); +} + +template +void test_transparent_comparator() { + using M = std::flat_multiset; + { + M m = {"alpha", "beta", "beta", "epsilon", "epsilon", "epsilon", "eta", "eta", "gamma"}; + ASSERT_SAME_TYPE(decltype(m.erase(Transparent{"abc"})), typename M::size_type); + + auto n = m.erase(Transparent{"epsilon"}); + assert(n == 3); + + M expected = {"alpha", "beta", "beta", "eta", "eta", "gamma"}; + assert(m == expected); + + auto n2 = m.erase(Transparent{"aaa"}); + assert(n2 == 0); + assert(m == expected); + } + { + // was empty + M m; + auto n = m.erase(Transparent{"epsilon"}); + assert(n == 0); + assert(m.empty()); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + test_transparent_comparator>(); + test_transparent_comparator>(); + test_transparent_comparator>(); + test_transparent_comparator>>(); + + { + // P2077's HeterogeneousKey example + using M = std::flat_multiset>; + M m = {1, 2, 3, 4, 5, 6, 7, 8}; + auto h1 = HeterogeneousKey(8, m.begin()); + std::same_as auto n = m.erase(h1); // lvalue is not convertible to It; erase(K&&) is the best match + assert(n == 1); + assert((m == M{1, 2, 3, 4, 5, 6, 7})); + std::same_as auto it = m.erase(std::move(h1)); // rvalue is convertible to It; erase(K&&) drops out + assert(it == m.begin()); + assert((m == M{2, 3, 4, 5, 6, 7})); + } + { + using M = std::flat_multiset>; + M m = {1, 2, 3, 4, 5, 6, 7, 8}; + auto h1 = HeterogeneousKey(8, m.begin()); + std::same_as auto n = m.erase(h1); // lvalue is not convertible to It; erase(K&&) is the best match + assert(n == 1); + assert((m == M{1, 2, 3, 4, 5, 6, 7})); + std::same_as auto it = m.erase(std::move(h1)); // rvalue is convertible to It; erase(K&&) drops out + assert(it == m.begin()); + assert((m == M{2, 3, 4, 5, 6, 7})); + } + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multiset m(std::sorted_equivalent, {1, 2, 3}, c); + assert(!transparent_used); + auto n = m.erase(Transparent{3}); + assert(n == 1); + assert(transparent_used); + } + { + // std::string and C string literal + using M = std::flat_multiset>; + M m = {"alpha", "beta", "beta", "epsilon", "eta", "gamma"}; + auto n = m.erase("beta"); + assert(n == 2); + assert((m == M{"alpha", "epsilon", "eta", "gamma"})); + } +} + +void test_exception() { + auto erase_transparent = [](auto& m, auto 
key_arg) { + using Set = std::decay_t; + using Key = typename Set::key_type; + m.erase(Transparent{key_arg}); + }; + test_erase_exception_guarantee(erase_transparent); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp new file mode 100644 index 0000000000000..8a66431396916 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/extract.pass.cpp @@ -0,0 +1,102 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// container_type extract() &&; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +concept CanExtract = requires(T&& t) { std::forward(t).extract(); }; + +static_assert(CanExtract&&>); +static_assert(!CanExtract&>); +static_assert(!CanExtract const&>); +static_assert(!CanExtract const&&>); + +template +void test_one() { + using M = std::flat_multiset, KeyContainer>; + { + M m = M({1, 1, 3}); + + std::same_as auto keys = std::move(m).extract(); + + auto expected_keys = {1, 1, 3}; + assert(std::ranges::equal(keys, expected_keys)); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + } + { + // was empty + M m; + assert(m.empty()); + auto keys = std::move(m).extract(); + assert(keys.empty()); + LIBCPP_ASSERT(m.empty()); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + { + // extracted object maintains invariant if the underlying container does not clear after move + using M = std::flat_multiset, CopyOnlyVector>; + M m = M({1, 1, 3}); + std::same_as auto keys = std::move(m).extract(); + assert(keys.size() == 3); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + } +} + +void test_exception() { + { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = ThrowOnMoveContainer; + using M = std::flat_multiset; + + M m; + m.emplace(1); + m.emplace(2); + try { + auto c = std::move(m).extract(); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we try to erase the key after value emplacement failure. + // and after erasure failure, we clear the flat_multiset + LIBCPP_ASSERT(m.size() == 0); + } +#endif + } +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp new file mode 100644 index 0000000000000..eeb1bdd26ca16 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_cv.pass.cpp @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator insert(const value_type& v); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../helpers.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using R = typename M::iterator; + using VT = typename M::value_type; + M m; + + const VT v1(2); + std::same_as decltype(auto) r = m.insert(v1); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*r == 2); + + const VT v2(1); + r = m.insert(v2); + assert(r == m.begin()); + assert(m.size() == 2); + assert(*r == 1); + + const VT v3(3); + r = m.insert(v3); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(*r == 3); + + const VT v4(3); + r = m.insert(v4); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(*r == 3); + + const VT v5(1); + r = m.insert(v5); + assert(r == m.begin() + 1); + assert(m.size() == 5); + assert(*r == 1); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, auto key_arg) { + using value_type = typename std::decay_t::value_type; + const value_type p(key_arg); + m.insert(p); + }; + test_emplace_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp new file mode 100644 index 0000000000000..9c56d3bfb750b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_initializer_list.pass.cpp @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// void insert(initializer_list il); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + + { + M m = {1, 1, 1, 3, 3, 3}; + m.insert({ + 4, + 4, + 4, + 1, + 1, + 1, + 2, + 2, + 2, + }); + assert(m.size() == 15); + + KeyContainer expected{1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4}; + assert(std::ranges::equal(m, expected)); + } + { + // was empty + M m; + m.insert({ + 4, + 4, + 4, + 1, + 1, + 1, + 2, + 2, + 2, + }); + assert(m.size() == 9); + KeyContainer expected{1, 1, 1, 2, 2, 2, 4, 4, 4}; + assert(std::ranges::equal(m, expected)); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, const auto& newValues) { + using FlatSet = std::decay_t; + using value_type = typename FlatSet::value_type; + std::initializer_list il = {newValues[0]}; + m.insert(il); + }; + test_insert_range_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp new file mode 100644 index 0000000000000..61f00f5138118 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_cv.pass.cpp @@ -0,0 +1,86 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator insert(const_iterator position, const value_type& v); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../helpers.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using R = typename M::iterator; + using VT = typename M::value_type; + + M m; + const VT v1(2); + std::same_as decltype(auto) r = m.insert(m.end(), v1); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*r == 2); + + const VT v2(1); + r = m.insert(m.end(), v2); + assert(r == m.begin()); + assert(m.size() == 2); + assert(*r == 1); + + const VT v3(3); + r = m.insert(m.end(), v3); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(*r == 3); + + const VT v4(3); + r = m.insert(m.end(), v4); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(*r == 3); + + const VT v5(1); + r = m.insert(m.begin() + 2, v5); + assert(r == m.begin() + 1); + assert(m.size() == 5); + assert(*r == 1); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, auto key_arg) { + using FlatSet = std::decay_t; + using value_type = typename FlatSet::value_type; + const value_type p(key_arg); + m.insert(m.begin(), p); + }; + test_emplace_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp new file mode 100644 index 0000000000000..3505e097cca69 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_iter.pass.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// void insert(InputIterator first, InputIterator last); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint InputIterator +template +concept CanInsert = requires(M m, Args&&... 
args) { m.insert(std::forward(args)...); }; + +using Set = std::flat_multiset; + +static_assert(CanInsert); +static_assert(CanInsert, cpp17_input_iterator>); +static_assert(!CanInsert); +static_assert(!CanInsert, cpp20_input_iterator>); + +template +void test_one() { + using M = std::flat_multiset, KeyContainer>; + + int ar1[] = { + 2, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 3, + }; + int ar2[] = { + 4, + 4, + 4, + 1, + 1, + 1, + 0, + 0, + 0, + }; + + M m; + m.insert(cpp17_input_iterator(ar1), cpp17_input_iterator(ar1 + sizeof(ar1) / sizeof(ar1[0]))); + assert(m.size() == 9); + M expected{1, 1, 1, 2, 2, 2, 3, 3, 3}; + assert(m == expected); + + m.insert(cpp17_input_iterator(ar2), cpp17_input_iterator(ar2 + sizeof(ar2) / sizeof(ar2[0]))); + assert(m.size() == 18); + M expected2{0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4}; + assert(m == expected2); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, const auto& newValues) { m.insert(newValues.begin(), newValues.end()); }; + test_insert_range_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp new file mode 100644 index 0000000000000..9976c04c9973a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_iter_rv.pass.cpp @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// +// iterator insert(const_iterator position, value_type&&); + +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "../helpers.h" +#include "test_macros.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + using V = Key; + using R = typename M::iterator; + M m; + std::same_as decltype(auto) r = m.insert(m.end(), V(2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*r == V(2)); + + r = m.insert(m.end(), V(1)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(*r == V(1)); + + r = m.insert(m.end(), V(3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(*r == V(3)); + + r = m.insert(m.end(), V(3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(*r == V(3)); + + r = m.insert(m.begin(), V(2)); + assert(r == m.begin() + 1); + assert(m.size() == 5); + assert(*r == V(2)); + + r = m.insert(m.begin() + 2, V(1)); + assert(r == m.begin() + 1); + assert(m.size() == 6); + assert(*r == V(1)); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, auto key_arg) { + using FlatSet = std::decay_t; + using value_type = typename FlatSet::value_type; + value_type p(key_arg); + m.insert(m.begin(), std::move(p)); + }; + test_emplace_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp new file mode 100644 index 0000000000000..566be3921bf77 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_range.pass.cpp @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template R> +// void insert_range(R&& rg); + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "MoveOnly.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint container-compatible-range +template +concept CanInsertRange = requires(M m, R&& r) { m.insert_range(std::forward(r)); }; + +using Set = std::flat_multiset; + +static_assert(CanInsertRange>); +static_assert(CanInsertRange>); +static_assert(!CanInsertRange*>>); +static_assert(!CanInsertRange*>>); + +template +void test_one() { + using Key = typename KeyContainer::value_type; + + { + using M = std::flat_multiset, KeyContainer>; + using It = forward_iterator; + M m = {10, 10, 8, 5, 2, 1, 1}; + int ar[] = {3, 1, 4, 1, 5, 9}; + std::ranges::subrange r = {It(ar), It(ar + 6)}; + static_assert(std::ranges::common_range); + m.insert_range(r); + assert((m == M{1, 1, 1, 1, 2, 3, 4, 5, 5, 8, 9, 10, 10})); + } + { + using M = std::flat_multiset, KeyContainer>; + using It = cpp20_input_iterator; + M m = {10, 10, 8, 5, 2, 1, 1}; + int ar[] = {3, 1, 4, 1, 5, 9}; + std::ranges::subrange r = {It(ar), sentinel_wrapper(It(ar + 6))}; + static_assert(!std::ranges::common_range); + m.insert_range(r); + assert((m == M{1, 1, 1, 1, 2, 3, 4, 5, 5, 8, 9, 10, 10})); + } + { + // was empty + using M = std::flat_multiset, KeyContainer>; + M m; + int ar[] = {3, 1, 4, 1, 5, 9}; + m.insert_range(ar); + assert((m == M{1, 1, 3, 4, 5, 9})); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + { + // Items are forwarded correctly from the input range. + MoveOnly a[] = {3, 1, 4, 1, 5}; + std::flat_multiset m; + m.insert_range(a | std::views::as_rvalue); + MoveOnly expected[] = {1, 1, 3, 4, 5}; + assert(std::ranges::equal(m, expected)); + } +} + +void test_exception() { + auto insert_func = [](auto& m, const auto& newValues) { m.insert_range(newValues); }; + test_insert_range_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp new file mode 100644 index 0000000000000..9328c42fb0cda --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_rv.pass.cpp @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multiset + +// iterator insert(value_type&& v); + +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset; + using R = typename M::iterator; + using V = typename M::value_type; + + M m; + std::same_as decltype(auto) r = m.insert(V(2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(*r == V(2)); + + r = m.insert(V(1)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(*r == V(1)); + + r = m.insert(V(3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(*r == V(3)); + + r = m.insert(V(3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(*r == V(3)); + + r = m.insert(V(2)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(*r == V(2)); + + r = m.insert(V(1)); + assert(r == m.begin() + 1); + assert(m.size() == 6); + assert(*r == V(1)); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, auto key_arg) { + using FlatSet = std::decay_t; + using value_type = typename FlatSet::value_type; + value_type p(key_arg); + m.insert(std::move(p)); + }; + test_emplace_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp new file mode 100644 index 0000000000000..11af199c3d1ee --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_initializer_list.pass.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// void insert(sorted_equivalent_t, initializer_list il); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + { + M m = {1, 1, 1, 3, 3, 3}; + m.insert(std::sorted_equivalent, {0, 1, 1, 2, 2, 4}); + assert(m.size() == 12); + M expected = {0, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4}; + assert(m == expected); + } + { + // empty + M m; + m.insert(std::sorted_equivalent, {0, 1, 1, 2, 2, 4}); + M expected = {0, 1, 1, 2, 2, 4}; + assert(m == expected); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, const auto& newValues) { + using FlatSet = std::decay_t; + using value_type = typename FlatSet::value_type; + std::initializer_list il = {newValues[0]}; + m.insert(std::sorted_equivalent, il); + }; + test_insert_range_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp new file mode 100644 index 0000000000000..07b62d04e0ebc --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/insert_sorted_iter_iter.pass.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// void insert(sorted_equivalent_t, InputIterator first, InputIterator last); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint InputIterator +template +concept CanInsert = requires(M m, Args&&... 
args) { m.insert(std::forward(args)...); }; + +using Set = std::flat_multiset; + +static_assert(CanInsert); +static_assert(CanInsert, cpp17_input_iterator>); +static_assert(!CanInsert); +static_assert(!CanInsert, cpp20_input_iterator>); + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + + int ar1[] = {1, 2, 2, 3}; + + int ar2[] = {0, 2, 4, 4}; + + M m; + m.insert(std::sorted_equivalent, + cpp17_input_iterator(ar1), + cpp17_input_iterator(ar1 + sizeof(ar1) / sizeof(ar1[0]))); + assert(m.size() == 4); + M expected{1, 2, 2, 3}; + assert(m == expected); + + m.insert(std::sorted_equivalent, + cpp17_input_iterator(ar2), + cpp17_input_iterator(ar2 + sizeof(ar2) / sizeof(ar2[0]))); + assert(m.size() == 8); + M expected2{0, 1, 2, 2, 2, 3, 4, 4}; + assert(m == expected2); +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { + auto insert_func = [](auto& m, const auto& newValues) { + m.insert(std::sorted_equivalent, newValues.begin(), newValues.end()); + }; + test_insert_range_exception_guarantee(insert_func); +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp new file mode 100644 index 0000000000000..5fe61389d72a1 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/replace.pass.cpp @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// void replace(container_type&& key_cont); + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +concept CanReplace = requires(T t, Args&&... 
args) { t.replace(std::forward(args)...); }; + +using Set = std::flat_multiset; +static_assert(CanReplace>); +static_assert(!CanReplace&>); + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + { + // was empty + M m; + KeyContainer new_keys = {7, 7, 8}; + auto expected_keys = new_keys; + m.replace(std::move(new_keys)); + assert(m.size() == 3); + assert(std::ranges::equal(m, expected_keys)); + } + { + M m = M({1, 1, 2, 2, 3}); + KeyContainer new_keys = {7, 7, 8, 8}; + auto expected_keys = new_keys; + m.replace(std::move(new_keys)); + assert(m.size() == 4); + assert(std::ranges::equal(m, expected_keys)); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +void test_exception() { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = ThrowOnMoveContainer; + using M = std::flat_multiset; + + M m; + m.emplace(1); + m.emplace(2); + try { + KeyContainer new_keys{3, 4}; + m.replace(std::move(new_keys)); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear the set + LIBCPP_ASSERT(m.size() == 0); + } +#endif +} + +int main(int, char**) { + test(); + test_exception(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_exception.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_exception.pass.cpp new file mode 100644 index 0000000000000..705ee88994872 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_exception.pass.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// `check_assertion.h` requires Unix headers and regex support. 
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: no-localization
+// UNSUPPORTED: no-exceptions
+
+// <flat_set>
+
+// void swap(flat_multiset& y) noexcept;
+// friend void swap(flat_multiset& x, flat_multiset& y) noexcept
+
+// Test that std::terminate is called if any exception is thrown during swap
+
+#include <cassert>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <vector>
+
+#include "test_macros.h"
+#include "../helpers.h"
+#include "check_assertion.h"
+
+template <class F>
+void test_swap_exception_guarantee([[maybe_unused]] F&& swap_function) {
+  {
+    // key swap throws
+    using KeyContainer = ThrowOnMoveContainer<int>;
+    using M            = std::flat_multiset<int, std::ranges::less, KeyContainer>;
+
+    M m1, m2;
+    m1.emplace(1);
+    m1.emplace(1);
+    m2.emplace(1);
+    m2.emplace(4);
+    // swap is noexcept
+    EXPECT_STD_TERMINATE([&] { swap_function(m1, m2); });
+  }
+}
+
+int main(int, char**) {
+  {
+    auto swap_func = [](auto& m1, auto& m2) { swap(m1, m2); };
+    test_swap_exception_guarantee(swap_func);
+  }
+
+  {
+    auto swap_func = [](auto& m1, auto& m2) { m1.swap(m2); };
+    test_swap_exception_guarantee(swap_func);
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
new file mode 100644
index 0000000000000..2e3ed02c3c00e
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_free.pass.cpp
@@ -0,0 +1,98 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// friend void swap(flat_multiset& x, flat_multiset& y) noexcept
+
+#include <cassert>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <vector>
+
+#include "MinSequenceContainer.h"
+#include "MoveOnly.h"
+#include "min_allocator.h"
+#include "test_macros.h"
+#include "../helpers.h"
+
+// test noexcept
+
+template <class T>
+concept NoExceptAdlSwap = requires(T t1, T t2) {
+  { swap(t1, t2) } noexcept;
+};
+
+static_assert(NoExceptAdlSwap<std::flat_multiset<int>>);
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+static_assert(NoExceptAdlSwap<std::flat_multiset<int, std::less<int>, ThrowOnMoveContainer<int>>>);
+#endif
+
+template <class KeyContainer>
+void test_one() {
+  using Key = typename KeyContainer::value_type;
+  using M   = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+
+  {
+    M m1;
+    M m2;
+    M m1_save = m1;
+    M m2_save = m2;
+    swap(m1, m2);
+    assert(m1 == m2_save);
+    assert(m2 == m1_save);
+  }
+  {
+    int ar2[] = {5, 5, 7, 8, 8, 10, 11, 12};
+    M m1;
+    M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0]));
+    M m1_save = m1;
+    M m2_save = m2;
+    swap(m1, m2);
+    assert(m1 == m2_save);
+    assert(m2 == m1_save);
+  }
+  {
+    int ar1[] = {1, 1, 3, 4};
+    M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0]));
+    M m2;
+    M m1_save = m1;
+    M m2_save = m2;
+    swap(m1, m2);
+    assert(m1 == m2_save);
+    assert(m2 == m1_save);
+  }
+  {
+    int ar1[] = {1, 1, 3, 4};
+    int ar2[] = {5, 5, 7, 8, 9, 10, 11, 12};
+    M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0]));
+    M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0]));
+    M m1_save = m1;
+    M m2_save = m2;
+    swap(m1, m2);
+    assert(m1 == m2_save);
+    assert(m2 == m1_save);
+  }
+}
+
+void test() {
+  test_one<std::vector<int>>();
+  test_one<std::deque<int>>();
+  test_one<MinSequenceContainer<int>>();
+  test_one<std::vector<int, min_allocator<int>>>();
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git
a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp new file mode 100644 index 0000000000000..1d0d9152d1c1f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.modifiers/swap_member.pass.cpp @@ -0,0 +1,96 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// void swap(flat_multiset& y) noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +// test noexcept + +template +concept NoExceptMemberSwap = requires(T t1, T t2) { + { t1.swap(t2) } noexcept; +}; + +static_assert(NoExceptMemberSwap>); +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert(NoExceptMemberSwap, ThrowOnMoveContainer>>); +#endif + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + { + M m1; + M m2; + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + int ar2[] = {5, 5, 7, 7, 9, 10, 11, 12}; + M m1; + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + int ar1[] = {1, 1, 3, 4}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2; + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + int ar1[] = {1, 1, 3, 4}; + int ar2[] = {5, 5, 7, 8, 9, 10, 11, 12}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp new file mode 100644 index 0000000000000..4ca64516e242f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.observers/comp.pass.cpp @@ -0,0 +1,76 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// key_compare key_comp() const; +// value_compare value_comp() const; + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +void test() { + { + using M = std::flat_multiset; + using Comp = std::less; // the default + M m = {}; + ASSERT_SAME_TYPE(M::key_compare, Comp); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), Comp); + Comp kc = m.key_comp(); + assert(kc(1, 2)); + assert(!kc(2, 1)); + auto vc = m.value_comp(); + assert(vc(1, 2)); + assert(!vc(2, 1)); + } + { + using Comp = std::function; + using M = std::flat_multiset; + Comp comp = std::greater(); + M m({}, comp); + ASSERT_SAME_TYPE(M::key_compare, Comp); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), Comp); + Comp kc = m.key_comp(); + assert(!kc(1, 2)); + assert(kc(2, 1)); + auto vc = m.value_comp(); + assert(!vc(1, 2)); + assert(vc(2, 1)); + } + { + using Comp = std::less<>; + using M = std::flat_multiset; + M m = {}; + ASSERT_SAME_TYPE(M::key_compare, Comp); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), Comp); + Comp kc = m.key_comp(); + assert(kc(1, 2)); + assert(!kc(2, 1)); + auto vc = m.value_comp(); + auto a = std::make_pair(1, 2); + ASSERT_SAME_TYPE(decltype(vc(a, a)), bool); + assert(vc(1, 2)); + assert(!vc(2, 1)); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp new file mode 100644 index 0000000000000..00fda6c2edd88 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains.pass.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// bool contains(const key_type& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + { + using M = std::flat_multiset, KeyContainer>; + M m = {1, 2, 2, 2, 4, 4, 5, 8}; + assert(!m.contains(0)); + assert(m.contains(1)); + assert(m.contains(2)); + assert(!m.contains(3)); + assert(m.contains(4)); + assert(m.contains(5)); + assert(!m.contains(6)); + assert(!m.contains(7)); + assert(std::as_const(m).contains(8)); + assert(!std::as_const(m).contains(9)); + m.clear(); + assert(!m.contains(1)); + } + { + using M = std::flat_multiset, KeyContainer>; + M m = {1, 2, 2, 4, 4, 5, 5, 8}; + assert(!m.contains(0)); + assert(m.contains(1)); + assert(m.contains(2)); + assert(!m.contains(3)); + assert(m.contains(4)); + assert(m.contains(5)); + assert(!m.contains(6)); + assert(!m.contains(7)); + assert(std::as_const(m).contains(8)); + assert(!std::as_const(m).contains(9)); + m.clear(); + assert(!m.contains(1)); + } + { + // empty + using M = std::flat_multiset, KeyContainer>; + M m; + assert(!m.contains(0)); + assert(!m.contains(1)); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp new file mode 100644 index 0000000000000..abee2b1bb12f9 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/contains_transparent.pass.cpp @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template bool contains(const K& x) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
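+//
+// For reference, a minimal sketch of a comparator that satisfies this
+// constraint; the nested is_transparent typedef is what enables the template
+// overload below. `SvLess` is an illustrative name, not a helper from this
+// test suite, so the sketch is kept in comments:
+//
+//   struct SvLess {
+//     using is_transparent = void;
+//     bool operator()(std::string_view a, std::string_view b) const { return a < b; }
+//   };
+//   std::flat_multiset<std::string, SvLess> s = {"beta", "beta"};
+//   assert(s.contains(std::string_view{"beta"})); // no std::string temporary is created
+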
+template +concept CanContains = requires(M m, Transparent k) { m.contains(k); }; +using TransparentSet = std::flat_multiset; +using NonTransparentSet = std::flat_multiset; +static_assert(CanContains); +static_assert(CanContains); +static_assert(!CanContains); +static_assert(!CanContains); + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset; + + { + M m = {"alpha", "beta", "beta", "epsilon", "eta", "eta", "gamma"}; + ASSERT_SAME_TYPE(decltype(m.contains(Transparent{"abc"})), bool); + ASSERT_SAME_TYPE(decltype(std::as_const(m).contains(Transparent{"b"})), bool); + assert(m.contains(Transparent{"alpha"}) == true); + assert(m.contains(Transparent{"beta"}) == true); + assert(m.contains(Transparent{"epsilon"}) == true); + assert(m.contains(Transparent{"eta"}) == true); + assert(m.contains(Transparent{"gamma"}) == true); + assert(m.contains(Transparent{"al"}) == false); + assert(m.contains(Transparent{""}) == false); + assert(m.contains(Transparent{"g"}) == false); + } + { + // empty + M m; + assert(m.contains(Transparent{"gamma"}) == false); + assert(m.contains(Transparent{"al"}) == false); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multiset m(std::sorted_equivalent, {1, 1, 2, 2, 3}, c); + assert(!transparent_used); + auto b = m.contains(Transparent{3}); + assert(b); + assert(transparent_used); + } + { + // std::string and C string literal + using M = std::flat_multiset>; + M m = {"alpha", "beta", "beta", "epsilon", "eta", "gamma"}; + assert(m.contains("beta")); + assert(!m.contains("charlie")); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp new file mode 100644 index 0000000000000..1752dab0e0e3a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count.pass.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// size_type count(const key_type& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using S = typename KeyContainer::size_type; + + { + using M = std::flat_multiset, KeyContainer>; + M m = {1, 2, 2, 2, 2, 4, 4, 5, 8, 8, 8}; + ASSERT_SAME_TYPE(decltype(m.count(0)), S); + assert(m.count(0) == 0); + assert(m.count(1) == 1); + assert(m.count(2) == 4); + assert(m.count(3) == 0); + assert(m.count(4) == 2); + assert(m.count(5) == 1); + assert(m.count(6) == 0); + assert(m.count(7) == 0); + assert(std::as_const(m).count(8) == 3); + assert(std::as_const(m).count(9) == 0); + } + { + using M = std::flat_multiset, KeyContainer>; + M m = {1, 2, 4, 4, 4, 4, 5, 5, 8}; + ASSERT_SAME_TYPE(decltype(m.count(0)), S); + assert(m.count(0) == 0); + assert(m.count(1) == 1); + assert(m.count(2) == 1); + assert(m.count(3) == 0); + assert(m.count(4) == 4); + assert(m.count(5) == 2); + assert(m.count(6) == 0); + assert(m.count(7) == 0); + assert(std::as_const(m).count(8) == 1); + assert(std::as_const(m).count(9) == 0); + } + { + // empty + using M = std::flat_multiset, KeyContainer>; + M m; + assert(m.count(0) == 0); + assert(m.count(1) == 0); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp new file mode 100644 index 0000000000000..a9160aebb7517 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/count_transparent.pass.cpp @@ -0,0 +1,90 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template size_type count(const K& x) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
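+//
+// A hedged sketch of what a caller gains from this overload; `SvLess` is an
+// illustrative transparent comparator, not part of this test suite:
+//
+//   struct SvLess {
+//     using is_transparent = void;
+//     bool operator()(std::string_view a, std::string_view b) const { return a < b; }
+//   };
+//   std::flat_multiset<std::string, SvLess> s = {"beta", "beta", "eta"};
+//   assert(s.count(std::string_view{"beta"}) == 2); // heterogeneous lookup, no key conversion
+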
+template +concept CanCount = requires(M m, Transparent k) { m.count(k); }; +using TransparentSet = std::flat_multiset; +using NonTransparentSet = std::flat_multiset; +static_assert(CanCount); +static_assert(CanCount); +static_assert(!CanCount); +static_assert(!CanCount); + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset; + { + M m = {"alpha", "beta", "beta", "beta", "epsilon", "eta", "eta", "gamma"}; + ASSERT_SAME_TYPE(decltype(m.count(Transparent{"abc"})), typename M::size_type); + ASSERT_SAME_TYPE(decltype(std::as_const(m).count(Transparent{"b"})), typename M::size_type); + assert(m.count(Transparent{"alpha"}) == 1); + assert(m.count(Transparent{"beta"}) == 3); + assert(m.count(Transparent{"epsilon"}) == 1); + assert(m.count(Transparent{"eta"}) == 2); + assert(m.count(Transparent{"gamma"}) == 1); + assert(m.count(Transparent{"al"}) == 0); + assert(m.count(Transparent{""}) == 0); + assert(m.count(Transparent{"g"}) == 0); + } + { + // empty + M m; + assert(m.count(Transparent{"alpha"}) == 0); + assert(m.count(Transparent{"beta"}) == 0); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multiset m(std::sorted_equivalent, {1, 2, 2, 2, 3, 3, 3, 3}, c); + assert(!transparent_used); + auto n = m.count(Transparent{3}); + assert(n == 4); + assert(transparent_used); + } + { + // std::string and C string literal + using M = std::flat_multiset>; + M m = {"alpha", "beta", "beta", "epsilon", "eta", "gamma"}; + auto n = m.count("beta"); + assert(n == 2); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp new file mode 100644 index 0000000000000..54ae27e9ba19c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range.pass.cpp @@ -0,0 +1,88 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// pair equal_range(const key_type& k); +// pair equal_range(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + { + using M = std::flat_multiset, KeyContainer>; + using R = std::pair; + using CR = std::pair; + M m = {1, 2, 2, 4, 4, 5, 5, 5, 8}; + ASSERT_SAME_TYPE(decltype(m.equal_range(0)), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(0)), CR); + auto begin = m.begin(); + assert(m.equal_range(0) == std::pair(begin, begin)); + assert(m.equal_range(1) == std::pair(begin, begin + 1)); + assert(m.equal_range(2) == std::pair(begin + 1, begin + 3)); + assert(m.equal_range(3) == std::pair(begin + 3, begin + 3)); + assert(m.equal_range(4) == std::pair(begin + 3, begin + 5)); + assert(m.equal_range(5) == std::pair(begin + 5, begin + 8)); + assert(m.equal_range(6) == std::pair(begin + 8, begin + 8)); + assert(m.equal_range(7) == std::pair(begin + 8, begin + 8)); + assert(std::as_const(m).equal_range(8) == std::pair(m.cbegin() + 8, m.cbegin() + 9)); + assert(std::as_const(m).equal_range(9) == std::pair(m.cbegin() + 9, m.cbegin() + 9)); + } + + { + using M = std::flat_multiset, KeyContainer>; + using R = std::pair; + using CR = std::pair; + M m = {1, 1, 1, 2, 4, 5, 8, 8}; + ASSERT_SAME_TYPE(decltype(m.equal_range(0)), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(0)), CR); + auto begin = m.begin(); + assert(m.equal_range(0) == std::pair(begin + 8, begin + 8)); + assert(m.equal_range(1) == std::pair(begin + 5, begin + 8)); + assert(m.equal_range(2) == std::pair(begin + 4, begin + 5)); + assert(m.equal_range(3) == std::pair(begin + 4, begin + 4)); + assert(m.equal_range(4) == std::pair(begin + 3, begin + 4)); + assert(m.equal_range(5) == std::pair(begin + 2, begin + 3)); + assert(m.equal_range(6) == std::pair(begin + 2, begin + 2)); + assert(m.equal_range(7) == std::pair(begin + 2, begin + 2)); + assert(std::as_const(m).equal_range(8) == std::pair(m.cbegin(), m.cbegin() + 2)); + assert(std::as_const(m).equal_range(9) == std::pair(m.cbegin(), m.cbegin())); + } + { + // empty + using M = std::flat_multiset, KeyContainer>; + M m; + auto end = m.end(); + assert(m.equal_range(0) == std::pair(end, end)); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp new file mode 100644 index 0000000000000..ae16ec1127f31 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/equal_range_transparent.pass.cpp @@ -0,0 +1,122 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template pair equal_range(const K& x); +// template pair equal_range(const K& x) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanEqualRange = requires(M m, Transparent k) { m.equal_range(k); }; +using TransparentSet = std::flat_multiset; +using NonTransparentSet = std::flat_multiset; +static_assert(CanEqualRange); +static_assert(CanEqualRange); +static_assert(!CanEqualRange); +static_assert(!CanEqualRange); + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset; + + using R = std::pair; + using CR = std::pair; + + auto test_found = [](auto&& set, const std::string& expected_key, long expected_offset, long expected_length) { + auto [first, last] = set.equal_range(Transparent{expected_key}); + assert(last - first == expected_length); + assert(first - set.begin() == expected_offset); + for (auto it = first; it != last; ++it) { + assert(*it == expected_key); + } + }; + + auto test_not_found = [](auto&& set, const std::string& expected_key, long expected_offset) { + auto [first, last] = set.equal_range(Transparent{expected_key}); + assert(first == last); + assert(first - set.begin() == expected_offset); + }; + { + M m = {"alpha", "beta", "beta", "beta", "epsilon", "eta", "eta", "eta", "eta", "gamma", "gamma"}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.equal_range(Transparent{"abc"})), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(Transparent{"b"})), CR); + + test_found(m, "alpha", 0, 1); + test_found(m, "beta", 1, 3); + test_found(m, "epsilon", 4, 1); + test_found(m, "eta", 5, 4); + test_found(m, "gamma", 9, 2); + test_found(cm, "alpha", 0, 1); + test_found(cm, "beta", 1, 3); + test_found(cm, "epsilon", 4, 1); + test_found(cm, "eta", 5, 4); + test_found(cm, "gamma", 9, 2); + + test_not_found(m, "charlie", 4); + test_not_found(m, "aaa", 0); + test_not_found(m, "zzz", 11); + test_not_found(cm, "charlie", 4); + test_not_found(cm, "aaa", 0); + test_not_found(cm, "zzz", 11); + } + { + // empty + M m; + const auto& cm = m; + test_not_found(m, "aaa", 0); + test_not_found(cm, "charlie", 0); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multiset m(std::sorted_equivalent, {1, 2, 3, 3, 3}, c); + assert(!transparent_used); + auto p = m.equal_range(Transparent{3}); + assert(p.first != p.second); + assert(transparent_used); + } + { + // std::string and C string literal + using M = std::flat_multiset>; + M m = {"alpha", "beta", "beta", "epsilon", "eta", "gamma"}; + auto [first, last] = m.equal_range("beta"); + assert(first == m.begin() + 1); + assert(last == m.begin() + 3); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp new file mode 100644 index 0000000000000..49386a6f77fae --- /dev/null +++ 
b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator find(const key_type& k); +// const_iterator find(const key_type& k) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset, KeyContainer>; + { + M m = {1, 1, 2, 4, 4, 4, 4, 5, 5, 8}; + ASSERT_SAME_TYPE(decltype(m.find(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).find(0)), typename M::const_iterator); + assert(m.find(0) == m.end()); + assert(m.find(1) == m.begin()); + assert(m.find(2) == m.begin() + 2); + assert(m.find(3) == m.end()); + assert(m.find(4) == m.begin() + 3); + assert(m.find(5) == m.begin() + 7); + assert(m.find(6) == m.end()); + assert(m.find(7) == m.end()); + assert(std::as_const(m).find(8) == m.begin() + 9); + assert(std::as_const(m).find(9) == m.end()); + } + { + // empty + M m; + assert(m.find(0) == m.end()); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp new file mode 100644 index 0000000000000..9d0b75c7b52bc --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/find_transparent.pass.cpp @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template iterator find(const K& x); +// template const_iterator find(const K& x) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
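+//
+// A short illustrative sketch (assumed comparator `SvLess`, not a suite
+// helper): with a transparent comparator, the template find overload returns
+// an iterator to the first of several equivalent keys without materializing
+// a key_type:
+//
+//   struct SvLess {
+//     using is_transparent = void;
+//     bool operator()(std::string_view a, std::string_view b) const { return a < b; }
+//   };
+//   std::flat_multiset<std::string, SvLess> s = {"alpha", "beta", "beta"};
+//   assert(s.find(std::string_view{"beta"}) == s.begin() + 1); // first "beta"
+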
+template +concept CanFind = requires(M m, Transparent k) { m.find(k); }; +using TransparentSet = std::flat_multiset; +using NonTransparentSet = std::flat_multiset; +static_assert(CanFind); +static_assert(CanFind); +static_assert(!CanFind); +static_assert(!CanFind); + +template +void test_one() { + using Key = typename KeyContainer::value_type; + using M = std::flat_multiset; + + { + M m = {"alpha", "alpha", "alpha", "beta", "epsilon", "epsilon", "eta", "gamma", "gamma"}; + + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.find(Transparent{"abc"})), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).find(Transparent{"b"})), typename M::const_iterator); + + auto test_find = [](auto&& set, const std::string& expected_key, long expected_offset) { + auto iter = set.find(Transparent{expected_key}); + assert(iter - set.begin() == expected_offset); + }; + + test_find(m, "alpha", 0); + test_find(m, "beta", 3); + test_find(m, "epsilon", 4); + test_find(m, "eta", 6); + test_find(m, "gamma", 7); + test_find(m, "charlie", 9); + test_find(m, "aaa", 9); + test_find(m, "zzz", 9); + test_find(cm, "alpha", 0); + test_find(cm, "beta", 3); + test_find(cm, "epsilon", 4); + test_find(cm, "eta", 6); + test_find(cm, "gamma", 7); + test_find(cm, "charlie", 9); + test_find(cm, "aaa", 9); + test_find(cm, "zzz", 9); + } + { + // empty + M m; + auto iter = m.find(Transparent{"a"}); + assert(iter == m.end()); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multiset m(std::sorted_equivalent, {1, 2, 2, 2, 3, 3}, c); + assert(!transparent_used); + auto it = m.find(Transparent{3}); + assert(it != m.end()); + assert(transparent_used); + } + { + // std::string and C string literal + using M = std::flat_multiset>; + M m = {"alpha", "beta", "beta", "epsilon", "eta", "gamma"}; + auto it = m.find("beta"); + assert(it == m.begin() + 1); + auto it2 = m.find("charlie"); + assert(it2 == m.end()); + } +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp new file mode 100644 index 0000000000000..ba41b822fda74 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound.pass.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// iterator lower_bound(const key_type& k); +// const_iterator lower_bound(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test_one() { + using Key = typename KeyContainer::value_type; + { + using M = std::flat_multiset, KeyContainer>; + M m = {1, 2, 2, 2, 4, 4, 5, 8, 8}; + ASSERT_SAME_TYPE(decltype(m.lower_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(0)), typename M::const_iterator); + assert(m.lower_bound(0) == m.begin()); + assert(m.lower_bound(1) == m.begin()); + assert(m.lower_bound(2) == m.begin() + 1); + assert(m.lower_bound(3) == m.begin() + 4); + assert(m.lower_bound(4) == m.begin() + 4); + assert(m.lower_bound(5) == m.begin() + 6); + assert(m.lower_bound(6) == m.begin() + 7); + assert(m.lower_bound(7) == m.begin() + 7); + assert(std::as_const(m).lower_bound(8) == m.begin() + 7); + assert(std::as_const(m).lower_bound(9) == m.end()); + } + { + using M = std::flat_multiset, KeyContainer>; + M m = {1, 1, 1, 2, 2, 4, 5, 5, 5, 8}; + ASSERT_SAME_TYPE(decltype(m.lower_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(0)), typename M::const_iterator); + assert(m.lower_bound(0) == m.end()); + assert(m.lower_bound(1) == m.begin() + 7); + assert(m.lower_bound(2) == m.begin() + 5); + assert(m.lower_bound(3) == m.begin() + 5); + assert(m.lower_bound(4) == m.begin() + 4); + assert(m.lower_bound(5) == m.begin() + 1); + assert(m.lower_bound(6) == m.begin() + 1); + assert(m.lower_bound(7) == m.begin() + 1); + assert(std::as_const(m).lower_bound(8) == m.begin()); + assert(std::as_const(m).lower_bound(9) == m.begin()); + } + { + // empty + using M = std::flat_multiset, KeyContainer>; + M m; + assert(m.lower_bound(0) == m.end()); + } +} + +void test() { + test_one>(); + test_one>(); + test_one>(); + test_one>>(); +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp new file mode 100644 index 0000000000000..c03fb27a7c27e --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/lower_bound_transparent.pass.cpp @@ -0,0 +1,116 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template iterator lower_bound(const K& x); +// template const_iterator lower_bound(const K& x) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
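+//
+// Hedged sketch (illustrative `SvLess`, not a suite helper): the template
+// lower_bound overload returns the first element that is not ordered before
+// the probe value:
+//
+//   struct SvLess {
+//     using is_transparent = void;
+//     bool operator()(std::string_view a, std::string_view b) const { return a < b; }
+//   };
+//   std::flat_multiset<std::string, SvLess> s = {"alpha", "beta", "beta"};
+//   assert(s.lower_bound(std::string_view{"b"}) == s.begin() + 1); // before both "beta"s
+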
+template <class M>
+concept CanLowerBound = requires(M m, Transparent<int> k) { m.lower_bound(k); };
+using TransparentSet = std::flat_multiset<int, TransparentComparator>;
+using NonTransparentSet = std::flat_multiset<int, NonTransparentComparator>;
+static_assert(CanLowerBound<TransparentSet>);
+static_assert(CanLowerBound<const TransparentSet>);
+static_assert(!CanLowerBound<NonTransparentSet>);
+static_assert(!CanLowerBound<const NonTransparentSet>);
+
+template <class KeyContainer>
+void test_one() {
+  using Key = typename KeyContainer::value_type;
+  using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
+
+  {
+    M m = {"alpha", "alpha", "beta", "beta", "beta", "epsilon", "eta", "eta", "eta", "eta", "gamma"};
+    const auto& cm = m;
+    ASSERT_SAME_TYPE(decltype(m.lower_bound(Transparent<std::string>{"abc"})), typename M::iterator);
+    ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(Transparent<std::string>{"b"})), typename M::const_iterator);
+
+    auto test_lower_bound = [&](auto&& set, const std::string& expected_key, long expected_offset) {
+      auto iter = set.lower_bound(Transparent<std::string>{expected_key});
+      assert(iter - set.begin() == expected_offset);
+    };
+
+    test_lower_bound(m, "abc", 0);
+    test_lower_bound(m, "alpha", 0);
+    test_lower_bound(m, "beta", 2);
+    test_lower_bound(m, "bets", 5);
+    test_lower_bound(m, "charlie", 5);
+    test_lower_bound(m, "echo", 5);
+    test_lower_bound(m, "epsilon", 5);
+    test_lower_bound(m, "eta", 6);
+    test_lower_bound(m, "gamma", 10);
+    test_lower_bound(m, "golf", 11);
+    test_lower_bound(m, "zzz", 11);
+
+    test_lower_bound(cm, "abc", 0);
+    test_lower_bound(cm, "alpha", 0);
+    test_lower_bound(cm, "beta", 2);
+    test_lower_bound(cm, "bets", 5);
+    test_lower_bound(cm, "charlie", 5);
+    test_lower_bound(cm, "echo", 5);
+    test_lower_bound(cm, "epsilon", 5);
+    test_lower_bound(cm, "eta", 6);
+    test_lower_bound(cm, "gamma", 10);
+    test_lower_bound(cm, "golf", 11);
+    test_lower_bound(cm, "zzz", 11);
+  }
+  {
+    // empty
+    M m;
+    auto iter = m.lower_bound(Transparent<std::string>{"a"});
+    assert(iter == m.end());
+  }
+}
+
+void test() {
+  test_one<std::vector<std::string>>();
+  test_one<std::deque<std::string>>();
+  test_one<MinSequenceContainer<std::string>>();
+  test_one<std::vector<std::string, min_allocator<std::string>>>();
+
+  {
+    bool transparent_used = false;
+    TransparentComparator c(transparent_used);
+    std::flat_multiset<int, TransparentComparator> m(std::sorted_equivalent, {1, 2, 2, 3, 3}, c);
+    assert(!transparent_used);
+    auto it = m.lower_bound(Transparent<int>{3});
+    assert(it != m.end());
+    assert(transparent_used);
+  }
+  {
+    // std::string and C string literal
+    using M = std::flat_multiset<std::string, std::less<>>;
+    M m = {"alpha", "beta", "beta", "epsilon", "eta", "gamma"};
+    auto it = m.lower_bound("beta");
+    assert(it == m.begin() + 1);
+    auto it2 = m.lower_bound("charlie");
+    assert(it2 == m.begin() + 3);
+  }
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
new file mode 100644
index 0000000000000..7828f0500c8b9
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound.pass.cpp
@@ -0,0 +1,81 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// iterator upper_bound(const key_type& k);
+// const_iterator upper_bound(const key_type& k) const;
+
+#include <cassert>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <utility>
+
+#include "MinSequenceContainer.h"
+#include "test_macros.h"
+#include "min_allocator.h"
+
+template <class KeyContainer>
+void test_one() {
+  using Key = typename KeyContainer::value_type;
+  {
+    using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+    M m = {1, 1, 2, 2, 4, 4, 5, 5, 8, 8};
+    ASSERT_SAME_TYPE(decltype(m.upper_bound(0)), typename M::iterator);
+    ASSERT_SAME_TYPE(decltype(std::as_const(m).upper_bound(0)), typename M::const_iterator);
+    assert(m.upper_bound(0) == m.begin());
+    assert(m.upper_bound(1) == m.begin() + 2);
+    assert(m.upper_bound(2) == m.begin() + 4);
+    assert(m.upper_bound(3) == m.begin() + 4);
+    assert(m.upper_bound(4) == m.begin() + 6);
+    assert(m.upper_bound(5) == m.begin() + 8);
+    assert(m.upper_bound(6) == m.begin() + 8);
+    assert(std::as_const(m).upper_bound(7) == m.begin() + 8);
+    assert(std::as_const(m).upper_bound(8) == m.end());
+    assert(std::as_const(m).upper_bound(9) == m.end());
+  }
+
+  {
+    using M = std::flat_multiset<Key, std::greater<Key>, KeyContainer>;
+    M m = {1, 1, 1, 1, 2, 2, 4, 4, 4, 5, 5, 5, 5, 5, 8, 8};
+    ASSERT_SAME_TYPE(decltype(m.upper_bound(0)), typename M::iterator);
+    ASSERT_SAME_TYPE(decltype(std::as_const(m).upper_bound(0)), typename M::const_iterator);
+    assert(m.upper_bound(0) == m.end());
+    assert(m.upper_bound(1) == m.end());
+    assert(m.upper_bound(2) == m.begin() + 12);
+    assert(m.upper_bound(3) == m.begin() + 10);
+    assert(m.upper_bound(4) == m.begin() + 10);
+    assert(m.upper_bound(5) == m.begin() + 7);
+    assert(m.upper_bound(6) == m.begin() + 2);
+    assert(m.upper_bound(7) == m.begin() + 2);
+    assert(std::as_const(m).upper_bound(8) == m.begin() + 2);
+    assert(std::as_const(m).upper_bound(9) == m.begin());
+  }
+  {
+    // empty
+    using M = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+    M m;
+    assert(m.upper_bound(0) == m.end());
+  }
+}
+
+void test() {
+  test_one<std::vector<int>>();
+  test_one<std::deque<int>>();
+  test_one<MinSequenceContainer<int>>();
+  test_one<std::vector<int, min_allocator<int>>>();
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
new file mode 100644
index 0000000000000..de517fd7e520a
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/flat.multiset.operations/upper_bound_transparent.pass.cpp
@@ -0,0 +1,114 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// template<class K> iterator upper_bound(const K& x);
+// template<class K> const_iterator upper_bound(const K& x) const;
+
+#include <cassert>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <string>
+#include <utility>
+
+#include "MinSequenceContainer.h"
+#include "../helpers.h"
+#include "test_macros.h"
+#include "min_allocator.h"
+
+// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type.
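+//
+// A sketch of how the two bounds relate on a multiset (s here is a
+// hypothetical std::flat_multiset<int>, not part of the test below):
+// for s = {1, 2, 2, 3},
+//   s.lower_bound(2) == s.begin() + 1 and s.upper_bound(2) == s.begin() + 3,
+// so [lower_bound(2), upper_bound(2)) spans exactly the run of equivalent
+// keys; the string offsets exercised below check the same property.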
+template <class M>
+concept CanUpperBound = requires(M m, Transparent<int> k) { m.upper_bound(k); };
+using TransparentSet = std::flat_multiset<int, TransparentComparator>;
+using NonTransparentSet = std::flat_multiset<int, NonTransparentComparator>;
+static_assert(CanUpperBound<TransparentSet>);
+static_assert(CanUpperBound<const TransparentSet>);
+static_assert(!CanUpperBound<NonTransparentSet>);
+static_assert(!CanUpperBound<const NonTransparentSet>);
+
+template <class KeyContainer>
+void test_one() {
+  using Key = typename KeyContainer::value_type;
+  using M = std::flat_multiset<Key, TransparentComparator, KeyContainer>;
+
+  {
+    M m = {"alpha", "alpha", "beta", "epsilon", "epsilon", "epsilon", "eta", "gamma"};
+    const auto& cm = m;
+    ASSERT_SAME_TYPE(decltype(m.upper_bound(Transparent<std::string>{"abc"})), typename M::iterator);
+    ASSERT_SAME_TYPE(decltype(std::as_const(m).upper_bound(Transparent<std::string>{"b"})), typename M::const_iterator);
+
+    auto test_upper_bound = [&](auto&& set, const std::string& expected_key, long expected_offset) {
+      auto iter = set.upper_bound(Transparent<std::string>{expected_key});
+      assert(iter - set.begin() == expected_offset);
+    };
+
+    test_upper_bound(m, "abc", 0);
+    test_upper_bound(m, "alpha", 2);
+    test_upper_bound(m, "beta", 3);
+    test_upper_bound(m, "bets", 3);
+    test_upper_bound(m, "charlie", 3);
+    test_upper_bound(m, "echo", 3);
+    test_upper_bound(m, "epsilon", 6);
+    test_upper_bound(m, "eta", 7);
+    test_upper_bound(m, "gamma", 8);
+    test_upper_bound(m, "golf", 8);
+    test_upper_bound(m, "zzz", 8);
+
+    test_upper_bound(cm, "abc", 0);
+    test_upper_bound(cm, "alpha", 2);
+    test_upper_bound(cm, "beta", 3);
+    test_upper_bound(cm, "bets", 3);
+    test_upper_bound(cm, "charlie", 3);
+    test_upper_bound(cm, "echo", 3);
+    test_upper_bound(cm, "epsilon", 6);
+    test_upper_bound(cm, "eta", 7);
+    test_upper_bound(cm, "gamma", 8);
+    test_upper_bound(cm, "golf", 8);
+    test_upper_bound(cm, "zzz", 8);
+  }
+  {
+    // empty
+    M m;
+    auto iter = m.upper_bound(Transparent<std::string>{"a"});
+    assert(iter == m.end());
+  }
+}
+
+void test() {
+  test_one<std::vector<std::string>>();
+  test_one<std::deque<std::string>>();
+  test_one<MinSequenceContainer<std::string>>();
+  test_one<std::vector<std::string, min_allocator<std::string>>>();
+
+  {
+    bool transparent_used = false;
+    TransparentComparator c(transparent_used);
+    std::flat_multiset<int, TransparentComparator> m(std::sorted_equivalent, {1, 1, 1, 2, 3}, c);
+    assert(!transparent_used);
+    auto it = m.upper_bound(Transparent<int>{2});
+    assert(it != m.end());
+    assert(transparent_used);
+  }
+  {
+    // std::string and C string literal
+    using M = std::flat_multiset<std::string, std::less<>>;
+    M m = {"alpha", "beta", "beta", "epsilon", "eta", "gamma"};
+    auto it = m.upper_bound("beta");
+    assert(it == m.begin() + 3);
+  }
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
new file mode 100644
index 0000000000000..e7ed8a091d3be
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/helpers.h
@@ -0,0 +1,119 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUPPORT_FLAT_MULTISET_HELPERS_H
+#define SUPPORT_FLAT_MULTISET_HELPERS_H
+
+#include <algorithm>
+#include <cassert>
+#include <flat_set>
+#include <string>
+#include <vector>
+
+#include "../flat_helpers.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class... Args>
+void check_invariant(const std::flat_multiset<Args...>& m) {
+  assert(std::is_sorted(m.begin(), m.end(), m.key_comp()));
+}
+
+template <class F>
+void test_emplace_exception_guarantee([[maybe_unused]] F&& emplace_function) {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  using C = TransparentComparator;
+  {
+    // Throw on emplace the key, and underlying has strong exception guarantee
+    using KeyContainer = std::vector<int, test_allocator<int>>;
+    using M = std::flat_multiset<int, C, KeyContainer>;
+
+    LIBCPP_STATIC_ASSERT(std::__container_traits<KeyContainer>::__emplacement_has_strong_exception_safety_guarantee);
+
+    test_allocator_statistics stats;
+
+    KeyContainer a({1, 1, 2, 2, 3, 4}, test_allocator<int>{&stats});
+    [[maybe_unused]] auto expected_keys = a;
+    M m(std::sorted_equivalent, std::move(a));
+
+    stats.throw_after = 1;
+    try {
+      emplace_function(m, 0);
+      assert(false);
+    } catch (const std::bad_alloc&) {
+      check_invariant(m);
+      // In libc++, the flat_multiset is unchanged
+      LIBCPP_ASSERT(m.size() == 6);
+      LIBCPP_ASSERT(std::ranges::equal(m, expected_keys));
+    }
+  }
+  {
+    // Throw on emplace the key, and underlying has no strong exception guarantee
+    using KeyContainer = EmplaceUnsafeContainer<int>;
+    using M = std::flat_multiset<int, C, KeyContainer>;
+
+    LIBCPP_STATIC_ASSERT(!std::__container_traits<KeyContainer>::__emplacement_has_strong_exception_safety_guarantee);
+    KeyContainer a = {1, 1, 2, 2, 3, 4};
+    M m(std::sorted_equivalent, std::move(a));
+    try {
+      emplace_function(m, 0);
+      assert(false);
+    } catch (int) {
+      check_invariant(m);
+      // In libc++, the flat_multiset is cleared
+      LIBCPP_ASSERT(m.size() == 0);
+    }
+  }
+#endif
+}
+
+template <class F>
+void test_insert_range_exception_guarantee([[maybe_unused]] F&& insert_function) {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  using KeyContainer = EmplaceUnsafeContainer<int>;
+  using M = std::flat_multiset<int, TransparentComparator, KeyContainer>;
+  test_allocator_statistics stats;
+  KeyContainer a{1, 2, 3, 4};
+  M m(std::sorted_equivalent, std::move(a));
+
+  std::vector<int> newValues = {0, 1, 5, 6, 7, 8};
+  stats.throw_after = 1;
+  try {
+    insert_function(m, newValues);
+    assert(false);
+  } catch (int) {
+    check_invariant(m);
+    // In libc++, we clear if anything goes wrong when inserting a range
+    LIBCPP_ASSERT(m.size() == 0);
+  }
+#endif
+}
+
+template <class F>
+void test_erase_exception_guarantee([[maybe_unused]] F&& erase_function) {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  {
+    // key erase throws
+    using KeyContainer = ThrowOnEraseContainer<int>;
+    using M = std::flat_multiset<int, TransparentComparator, KeyContainer>;
+
+    KeyContainer a{1, 2, 3, 4};
+    M m(std::sorted_equivalent, std::move(a));
+    try {
+      erase_function(m, 3);
+      assert(false);
+    } catch (int) {
+      check_invariant(m);
+      // In libc++, we clear if anything goes wrong when erasing
+      LIBCPP_ASSERT(m.size() == 0);
+    }
+  }
+#endif
+}
+
+#endif // SUPPORT_FLAT_MULTISET_HELPERS_H
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/incomplete_type.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/incomplete_type.pass.cpp
new file mode 100644
index 0000000000000..88aa8f5993efa
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/incomplete_type.pass.cpp
@@ -0,0 +1,36 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// Check that std::flat_multiset and its iterators can be instantiated with an incomplete
+// type.
+
+#include <flat_set>
+#include <vector>
+
+struct A {
+  using Set = std::flat_multiset<A>;
+  int data;
+  Set m;
+  Set::iterator it;
+  Set::const_iterator cit;
+};
+
+// Implement the operator< required in order to instantiate flat_multiset<A>
+bool operator<(A const& L, A const& R) { return L.data < R.data; }
+
+void test() { A a; }
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
new file mode 100644
index 0000000000000..94f0f2b34abcc
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/op_compare.pass.cpp
@@ -0,0 +1,105 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// friend bool operator==(const flat_multiset& x, const flat_multiset& y);
+// friend synth-three-way-result<value_type>
+//   operator<=>(const flat_multiset& x, const flat_multiset& y);
+
+#include <algorithm>
+#include <cassert>
+#include <compare>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <limits>
+#include <vector>
+
+#include "MinSequenceContainer.h"
+#include "test_macros.h"
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_comparisons.h"
+#include "test_container_comparisons.h"
+
+template <class KeyContainer>
+void test_one() {
+  using Key = typename KeyContainer::value_type;
+
+  {
+    using C = std::flat_multiset<Key, std::less<Key>, KeyContainer>;
+    C s1, s2;
+    ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::strong_ordering);
+    AssertComparisonsReturnBool<C>();
+    assert(testComparisons(C{1, 1, 2}, C{1, 1, 3}, false, true));
+    assert(testComparisons(C{1, 1}, C{1, 1}, true, false));
+    assert(testComparisons(C{1, 10}, C{2, 2}, false, true));
+    assert(testComparisons(C{}, C{1}, false, true));
+    assert(testComparisons(C{2}, C{1, 1, 1, 1, 1}, false, false));
+  }
+  {
+    // Comparisons use value_type's native operators, not the comparator
+    using C = std::flat_multiset<Key, std::greater<Key>, KeyContainer>;
+    C s1 = {1, 1};
+    C s2 = {2, 2};
+    ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::strong_ordering);
+    AssertComparisonsReturnBool<C>();
+    assert(testComparisons(s1, s2, false, true));
+    s2 = {1, 1};
+    assert(testComparisons(s1, s2, true, false));
+    s2 = {1, 2};
+    assert(testComparisons(s1, s2, false, true));
+    s1 = {0, 1, 2};
+    assert(testComparisons(s1, s2, false, false));
+    s2 = {0, 1, 3};
+    assert(testComparisons(s1, s2, false, true));
+  }
+}
+
+void test() {
+  test_one<std::vector<int>>();
+  test_one<std::deque<int>>();
+  test_one<MinSequenceContainer<int>>();
+  test_one<std::vector<int, min_allocator<int>>>();
+
+  {
+    using C = std::flat_multiset<double>;
+    C s1 = {1};
+    C s2 = C(std::sorted_equivalent, {std::numeric_limits<double>::quiet_NaN()});
+    ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering);
+    AssertComparisonsReturnBool<C>();
+    assert(testComparisonsComplete(s1, s2, false, false, false));
+  }
+  {
+    // Comparisons use value_type's native operators, not the comparator
+    struct StrongComp {
+      bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; }
+    };
+    using C = std::flat_multiset<double, StrongComp>;
+    C s1 = {1};
+    C s2 = {std::numeric_limits<double>::quiet_NaN(), std::numeric_limits<double>::quiet_NaN()};
+    ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering);
+    AssertComparisonsReturnBool<C>();
+    assert(testComparisonsComplete(s1, s2, false, false, false));
+    s1 = {1, std::numeric_limits<double>::quiet_NaN(), 1};
+    s2 = {1, std::numeric_limits<double>::quiet_NaN(), 1};
+    assert(std::lexicographical_compare_three_way(s1.begin(), s1.end(), s2.begin(), s2.end(), std::strong_order) ==
+           std::strong_ordering::equal);
+    assert(s1 != s2);
+    assert((s1 <=> s2) == std::partial_ordering::unordered);
+  }
+}
+
+int main(int, char**) {
+  test();
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multiset/types.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multiset/types.compile.pass.cpp
new file mode 100644
index 0000000000000..f035487c9e578
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multiset/types.compile.pass.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_set>
+
+// using key_type = Key;
+// using value_type = Key;
+// using key_compare = Compare;
+// using value_compare = Compare;
+// using reference = value_type&;
+// using const_reference = const value_type&;
+// using size_type = typename KeyContainer::size_type;
+// using difference_type = typename KeyContainer::difference_type;
+// using iterator = implementation-defined; // see [container.requirements]
+// using const_iterator = implementation-defined; // see [container.requirements]
+// using reverse_iterator = std::reverse_iterator<iterator>;
+// using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+// using container_type = KeyContainer;
+
+#include <concepts>
+#include <deque>
+#include <flat_set>
+#include <functional>
+#include <iterator>
+#include <string>
+#include <vector>
+#include "min_allocator.h"
+
+void test() {
+  {
+    using M = std::flat_multiset<int>;
+    static_assert(std::is_same_v<typename M::key_type, int>);
+    static_assert(std::is_same_v<typename M::value_type, int>);
+    static_assert(std::is_same_v<typename M::key_compare, std::less<int>>);
+    static_assert(std::is_same_v<typename M::value_compare, std::less<int>>);
+    static_assert(std::is_same_v<typename M::reference, int&>);
+    static_assert(std::is_same_v<typename M::const_reference, const int&>);
+    static_assert(std::is_same_v<typename M::size_type, size_t>);
+    static_assert(std::is_same_v<typename M::difference_type, ptrdiff_t>);
+    static_assert(requires { typename M::iterator; });
+    static_assert(requires { typename M::const_iterator; });
+    static_assert(std::is_same_v<typename M::reverse_iterator, std::reverse_iterator<typename M::iterator>>);
+    static_assert(
+        std::is_same_v<typename M::const_reverse_iterator, std::reverse_iterator<typename M::const_iterator>>);
+    static_assert(std::is_same_v<typename M::container_type, std::vector<int>>);
+    static_assert(requires { typename M::value_compare; });
+  }
+
+  {
+    struct A {};
+    struct Compare {
+      bool operator()(const std::string&, const std::string&) const;
+    };
+    using M = std::flat_multiset<std::string, Compare, std::deque<std::string>>;
+    static_assert(std::is_same_v<typename M::key_type, std::string>);
+    static_assert(std::is_same_v<typename M::value_type, std::string>);
+    static_assert(std::is_same_v<typename M::key_compare, Compare>);
+    static_assert(std::is_same_v<typename M::value_compare, Compare>);
+    static_assert(std::is_same_v<typename M::reference, std::string&>);
+    static_assert(std::is_same_v<typename M::const_reference, const std::string&>);
+    static_assert(std::is_same_v<typename M::size_type, size_t>);
+    static_assert(std::is_same_v<typename M::difference_type, ptrdiff_t>);
+    static_assert(requires { typename M::iterator; });
+    static_assert(requires { typename M::const_iterator; });
+    static_assert(std::is_same_v<typename M::reverse_iterator, std::reverse_iterator<typename M::iterator>>);
+    static_assert(
+        std::is_same_v<typename M::const_reverse_iterator, std::reverse_iterator<typename M::const_iterator>>);
+    static_assert(std::is_same_v<typename M::container_type, std::deque<std::string>>);
+  }
+  {
+    using C = std::flat_multiset<short, std::greater<short>, std::deque<short, min_allocator<short>>>;
+    static_assert(std::is_same_v<C::key_type, short>);
+    static_assert(std::is_same_v<C::value_type, short>);
+    static_assert(std::is_same_v<C::key_compare, std::greater<short>>);
+    static_assert(std::is_same_v<C::value_compare, std::greater<short>>);
+    static_assert(std::is_same_v<C::reference, short&>);
+    static_assert(std::is_same_v<C::const_reference, const short&>);
+    static_assert(std::random_access_iterator<C::iterator>);
+    static_assert(std::random_access_iterator<C::const_iterator>);
+    static_assert(std::random_access_iterator<C::reverse_iterator>);
+    static_assert(std::random_access_iterator<C::const_reverse_iterator>);
+    static_assert(std::is_same_v<C::reverse_iterator, std::reverse_iterator<C::iterator>>);
+    static_assert(std::is_same_v<C::const_reverse_iterator, std::reverse_iterator<C::const_iterator>>);
+    // size_type is invariably size_t
+    static_assert(std::is_same_v<C::size_type, size_t>);
+    static_assert(std::is_same_v<C::difference_type, ptrdiff_t>);
+    static_assert(std::is_same_v<C::container_type, std::deque<short, min_allocator<short>>>);
+  }
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.set/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.set/helpers.h
index 6aed4b1cf131d..4cc720311cf01 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.set/helpers.h
+++ b/libcxx/test/std/containers/container.adaptors/flat.set/helpers.h
@@ -15,6 +15,7 @@
 #include <string>
 #include <vector>
 
+#include "../flat_helpers.h"
 #include "test_allocator.h"
 #include "test_macros.h"
 
@@ -28,163 +29,6 @@ void check_invariant(const std::flat_set<Args...>& m) {
   assert(std::adjacent_find(m.begin(), m.end(), key_equal) == m.end());
 }
 
-struct StartsWith {
-  explicit StartsWith(char ch) : lower_(1, ch), upper_(1, ch + 1) {}
-  StartsWith(const StartsWith&) = delete;
-  void operator=(const StartsWith&) = delete;
-  struct Less {
-    using is_transparent = void;
-    bool operator()(const std::string& a, const std::string& b) const { return a < b; }
-    bool operator()(const StartsWith& a, const std::string& b) const { return a.upper_ <= b; }
-    bool operator()(const std::string& a, const StartsWith& b) const { return a < b.lower_; }
-    bool operator()(const StartsWith&, const StartsWith&) const {
-      assert(false); // should not be called
-      return false;
-    }
-  };
-
-private:
-  std::string lower_;
-  std::string upper_;
-};
-
-template <class T>
-struct CopyOnlyVector : std::vector<T> {
-  using std::vector<T>::vector;
-
-  CopyOnlyVector(const CopyOnlyVector&) = default;
-  CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {}
-  CopyOnlyVector(CopyOnlyVector&& other, std::vector<T>::allocator_type alloc) : CopyOnlyVector(other, alloc) {}
-
-  CopyOnlyVector& operator=(const CopyOnlyVector&) = default;
-  CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(std::as_const(other)); }
-};
-
-template <class T, bool ConvertibleToT = false>
-struct Transparent {
-  T t;
-
-  explicit operator T() const
-    requires ConvertibleToT
-  {
-    return t;
-  }
-};
-
-template <class T>
-using ExplicitlyConvertibleTransparent = Transparent<T, true>;
-
-template <class T>
-using NonConvertibleTransparent = Transparent<T, false>;
-
-struct TransparentComparator {
-  using is_transparent = void;
-
-  bool* transparent_used = nullptr;
-  TransparentComparator() = default;
-  TransparentComparator(bool& used) : transparent_used(&used) {}
-
-  template <class T, bool Convertible>
-  bool operator()(const T& t, const Transparent<T, Convertible>& transparent) const {
-    if (transparent_used != nullptr) {
-      *transparent_used = true;
-    }
-    return t < transparent.t;
-  }
-
-  template <class T, bool Convertible>
-  bool operator()(const Transparent<T, Convertible>& transparent, const T& t) const {
-    if (transparent_used != nullptr) {
-      *transparent_used = true;
-    }
-    return transparent.t < t;
-  }
-
-  template <class T>
-  bool operator()(const T& t1, const T& t2) const {
-    return t1 < t2;
-  }
-};
-
-struct NonTransparentComparator {
-  template <class T, bool Convertible>
-  bool operator()(const T&, const Transparent<T, Convertible>&) const;
-
-  template <class T, bool Convertible>
-  bool operator()(const Transparent<T, Convertible>&, const T&) const;
-
-  template <class T>
-  bool operator()(const T&, const T&) const;
-};
-
-struct NoDefaultCtr {
-  NoDefaultCtr() = delete;
-};
-
-#ifndef TEST_HAS_NO_EXCEPTIONS
-template <class T>
-struct EmplaceUnsafeContainer : std::vector<T> {
-  using std::vector<T>::vector;
-
-  template <class... Args>
-  auto emplace(Args&&... args) -> decltype(std::declval<std::vector<T>>().emplace(std::forward<Args>(args)...)) {
-    if (this->size() > 1) {
-      auto it1 = this->begin();
-      auto it2 = it1 + 1;
-      // messing up the container
-      std::iter_swap(it1, it2);
-    }
-
-    throw 42;
-  }
-
-  template <class... Args>
-  auto insert(Args&&... args) -> decltype(std::declval<std::vector<T>>().insert(std::forward<Args>(args)...)) {
-    if (this->size() > 1) {
-      auto it1 = this->begin();
-      auto it2 = it1 + 1;
-      // messing up the container
-      std::iter_swap(it1, it2);
-    }
-
-    throw 42;
-  }
-
-  template <class... Args>
-  auto insert_range(Args&&... args)
-      -> decltype(std::declval<std::vector<T>>().insert_range(std::forward<Args>(args)...)) {
-    if (this->size() > 1) {
-      auto it1 = this->begin();
-      auto it2 = it1 + 1;
-      // messing up the container
-      std::iter_swap(it1, it2);
-    }
-
-    throw 42;
-  }
-};
-
-template <class T>
-struct ThrowOnEraseContainer : std::vector<T> {
-  using std::vector<T>::vector;
-
-  template <class... Args>
-  auto erase(Args&&... args) -> decltype(std::declval<std::vector<T>>().erase(std::forward<Args>(args)...)) {
-    throw 42;
-  }
-};
-
-template <class T>
-struct ThrowOnMoveContainer : std::vector<T> {
-  using std::vector<T>::vector;
-
-  ThrowOnMoveContainer(ThrowOnMoveContainer&&) { throw 42; }
-
-  ThrowOnMoveContainer& operator=(ThrowOnMoveContainer&&) { throw 42; }
-};
-
-#endif
-
 template <class F>
 void test_emplace_exception_guarantee([[maybe_unused]] F&& emplace_function) {
 #ifndef TEST_HAS_NO_EXCEPTIONS
@@ -276,32 +120,5 @@ void test_erase_exception_guarantee([[maybe_unused]] F&& erase_function) {
   }
 #endif
 }
-class Moveable {
-  int int_;
-  double double_;
-
-public:
-  Moveable() : int_(0), double_(0) {}
-  Moveable(int i, double d) : int_(i), double_(d) {}
-  Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) {
-    x.int_ = -1;
-    x.double_ = -1;
-  }
-  Moveable& operator=(Moveable&& x) {
-    int_ = x.int_;
-    x.int_ = -1;
-    double_ = x.double_;
-    x.double_ = -1;
-    return *this;
-  }
-
-  Moveable(const Moveable&) = delete;
-  Moveable& operator=(const Moveable&) = delete;
-  bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; }
-  bool operator<(const Moveable& x) const { return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_); }
-
-  int get() const { return int_; }
-  bool moved() const { return int_ == -1; }
-};
 
 #endif // TEST_STD_CONTAINERS_CONTAINER_ADAPTORS_FLAT_SET_HELPERS_H
diff --git a/libcxx/test/std/containers/container.adaptors/flat_helpers.h b/libcxx/test/std/containers/container.adaptors/flat_helpers.h
new file mode 100644
index 0000000000000..9cd408ef960a9
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat_helpers.h
@@ -0,0 +1,184 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_STD_CONTAINERS_CONTAINER_ADAPTORS_FLAT_HELPERS_H
+#define TEST_STD_CONTAINERS_CONTAINER_ADAPTORS_FLAT_HELPERS_H
+
+#include <vector>
+
+#include "test_macros.h"
+
+template <class T>
+struct CopyOnlyVector : std::vector<T> {
+  using std::vector<T>::vector;
+
+  CopyOnlyVector(const CopyOnlyVector&) = default;
+  CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {}
+  CopyOnlyVector(CopyOnlyVector&& other, std::vector<T>::allocator_type alloc) : CopyOnlyVector(other, alloc) {}
+
+  CopyOnlyVector& operator=(const CopyOnlyVector&) = default;
+  CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(std::as_const(other)); }
+};
+
+template <class T, bool ConvertibleToT = false>
+struct Transparent {
+  T t;
+
+  explicit operator T() const
+    requires ConvertibleToT
+  {
+    return t;
+  }
+};
+
+template <class T>
+using ConvertibleTransparent = Transparent<T, true>;
+
+template <class T>
+using ExplicitlyConvertibleTransparent = Transparent<T, true>;
+
+template <class T>
+using NonConvertibleTransparent = Transparent<T, false>;
+
+struct TransparentComparator {
+  using is_transparent = void;
+
+  bool* transparent_used = nullptr;
+  TransparentComparator() = default;
+  TransparentComparator(bool& used) : transparent_used(&used) {}
+
+  template <class T, bool Convertible>
+  bool operator()(const T& t, const Transparent<T, Convertible>& transparent) const {
+    if (transparent_used != nullptr) {
+      *transparent_used = true;
+    }
+    return t < transparent.t;
+  }
+
+  template <class T, bool Convertible>
+  bool operator()(const Transparent<T, Convertible>& transparent, const T& t) const {
+    if (transparent_used != nullptr) {
+      *transparent_used = true;
+    }
+    return transparent.t < t;
+  }
+
+  template <class T>
+  bool operator()(const T& t1, const T& t2) const {
+    return t1 < t2;
+  }
+};
+
+struct NonTransparentComparator {
+  template <class T, bool Convertible>
+  bool operator()(const T&, const Transparent<T, Convertible>&) const;
+
+  template <class T, bool Convertible>
+  bool operator()(const Transparent<T, Convertible>&, const T&) const;
+
+  template <class T>
+  bool operator()(const T&, const T&) const;
+};
+
+struct NoDefaultCtr {
+  NoDefaultCtr() = delete;
+};
+
+class Moveable {
+  int int_;
+  double double_;
+
+public:
+  Moveable() : int_(0), double_(0) {}
+  Moveable(int i, double d) : int_(i), double_(d) {}
+  Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) {
+    x.int_ = -1;
+    x.double_ = -1;
+  }
+  Moveable& operator=(Moveable&& x) {
+    int_ = x.int_;
+    x.int_ = -1;
+    double_ = x.double_;
+    x.double_ = -1;
+    return *this;
+  }
+
+  Moveable(const Moveable&) = delete;
+  Moveable& operator=(const Moveable&) = delete;
+  bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; }
+  bool operator<(const Moveable& x) const { return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_); }
+
+  int get() const { return int_; }
+  bool moved() const { return int_ == -1; }
+};
+
+#ifndef TEST_HAS_NO_EXCEPTIONS
+template <class T>
+struct EmplaceUnsafeContainer : std::vector<T> {
+  using std::vector<T>::vector;
+
+  template <class... Args>
+  auto emplace(Args&&... args) -> decltype(std::declval<std::vector<T>>().emplace(std::forward<Args>(args)...)) {
+    if (this->size() > 1) {
+      auto it1 = this->begin();
+      auto it2 = it1 + 1;
+      // messing up the container
+      std::iter_swap(it1, it2);
+    }
+
+    throw 42;
+  }
+
+  template <class... Args>
+  auto insert(Args&&... args) -> decltype(std::declval<std::vector<T>>().insert(std::forward<Args>(args)...)) {
+    if (this->size() > 1) {
+      auto it1 = this->begin();
+      auto it2 = it1 + 1;
+      // messing up the container
+      std::iter_swap(it1, it2);
+    }
+
+    throw 42;
+  }
+
+  template <class... Args>
+  auto insert_range(Args&&... args)
+      -> decltype(std::declval<std::vector<T>>().insert_range(std::forward<Args>(args)...)) {
+    if (this->size() > 1) {
+      auto it1 = this->begin();
+      auto it2 = it1 + 1;
+      // messing up the container
+      std::iter_swap(it1, it2);
+    }
+
+    throw 42;
+  }
+};
+
+template <class T>
+struct ThrowOnEraseContainer : std::vector<T> {
+  using std::vector<T>::vector;
+
+  template <class... Args>
+  auto erase(Args&&... args) -> decltype(std::declval<std::vector<T>>().erase(std::forward<Args>(args)...)) {
+    throw 42;
+  }
+};
+
+template <class T>
+struct ThrowOnMoveContainer : std::vector<T> {
+  using std::vector<T>::vector;
+
+  ThrowOnMoveContainer(ThrowOnMoveContainer&&) { throw 42; }
+
+  ThrowOnMoveContainer& operator=(ThrowOnMoveContainer&&) { throw 42; }
+};
+
+#endif // TEST_HAS_NO_EXCEPTIONS
+
+#endif // TEST_STD_CONTAINERS_CONTAINER_ADAPTORS_FLAT_HELPERS_H
\ No newline at end of file
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
index f9d0b0a6b4e4f..1d96845288b29 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp
@@ -48,32 +48,20 @@
 
 #elif TEST_STD_VER == 23
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should be defined in c++23"
-#   endif
-#   if __cpp_lib_flat_set != 202207L
-#     error "__cpp_lib_flat_set should have the value 202207L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_flat_set
+#   error "__cpp_lib_flat_set should be defined in c++23"
+# endif
+# if __cpp_lib_flat_set != 202207L
+#   error "__cpp_lib_flat_set should have the value 202207L in c++23"
 # endif
 
 #elif TEST_STD_VER > 23
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should be defined in c++26"
-#   endif
-#   if __cpp_lib_flat_set != 202207L
-#     error "__cpp_lib_flat_set should have the value 202207L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_flat_set
+#   error "__cpp_lib_flat_set should be defined in c++26"
+# endif
+# if __cpp_lib_flat_set != 202207L
+#   error "__cpp_lib_flat_set should have the value 202207L in c++26"
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index 07e96e53f2e93..9b318f2deabc4 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -5128,17 +5128,11 @@
 #   error "__cpp_lib_flat_map should have the value 202207L in c++23"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should be defined in c++23"
-#   endif
-#   if __cpp_lib_flat_set != 202207L
-#     error "__cpp_lib_flat_set should have the value 202207L in c++23"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_flat_set
+#   error "__cpp_lib_flat_set should be defined in c++23"
+# endif
+# if __cpp_lib_flat_set != 202207L
+#   error "__cpp_lib_flat_set should have the value 202207L in c++23"
 # endif
 
 # if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
@@ -6835,17 +6829,11 @@
 #   error "__cpp_lib_flat_map should have the value 202207L in c++26"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-#   ifndef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should be defined in c++26"
-#   endif
-#   if __cpp_lib_flat_set != 202207L
-#     error "__cpp_lib_flat_set should have the value 202207L in c++26"
-#   endif
-# else // _LIBCPP_VERSION
-#   ifdef __cpp_lib_flat_set
-#     error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!"
-#   endif
+# ifndef __cpp_lib_flat_set
+#   error "__cpp_lib_flat_set should be defined in c++26"
+# endif
+# if __cpp_lib_flat_set != 202207L
+#   error "__cpp_lib_flat_set should have the value 202207L in c++26"
 # endif
 
 # if !defined(_LIBCPP_VERSION) || _LIBCPP_AVAILABILITY_HAS_TO_CHARS_FLOATING_POINT
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index febfb0f739e2c..6a9c7b4ea586c 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -520,7 +520,6 @@ def add_version_header(tc):
             "name": "__cpp_lib_flat_set",
             "values": {"c++23": 202207},
             "headers": ["flat_set"],
-            "unimplemented": True,
         },
         {
             "name": "__cpp_lib_format",
diff --git a/libcxx/utils/libcxx/test/modules.py b/libcxx/utils/libcxx/test/modules.py
index 4c4cd273be3c3..bd4fbe78c1cdc 100644
--- a/libcxx/utils/libcxx/test/modules.py
+++ b/libcxx/utils/libcxx/test/modules.py
@@ -93,7 +93,7 @@
 ExtraHeader["functional"] = "v1/__compare/compare_three_way.h$"
 
 # <flat_set> reuses some functionality defined inside <flat_map>
-ExtraHeader["flat_set"] = "v1/__flat_map/sorted_unique.h$"
+ExtraHeader["flat_set"] = "v1/__flat_map/sorted_.+.h$"
 
 # Some C compatibility headers define std::size_t, which is in <__cstddef/size_t.h>
 for header in ("cstdio", "cstdlib", "cstring", "ctime", "cuchar", "cwchar"):

From 0a1742708ddc3a2b31d65479aaad143b5f7562b2 Mon Sep 17 00:00:00 2001
From: Sergei Barannikov
Date: Sun, 6 Apr 2025 13:14:37 +0300
Subject: [PATCH 0782/1029] [SelectionDAG] Wire up -gen-sdnode-info TableGen
 backend (#125358)

This patch introduces SelectionDAGGenTargetInfo and SDNodeInfo classes,
which provide methods for accessing the generated SDNode descriptions.

Pull Request: https://github.com/llvm/llvm-project/pull/125358
Draft PR: https://github.com/llvm/llvm-project/pull/119709
RFC: https://discourse.llvm.org/t/rfc-tablegen-erating-sdnode-descriptions
---
 llvm/include/llvm/CodeGen/SDNodeInfo.h        | 115 ++++++++++++++++
 llvm/include/llvm/CodeGen/SelectionDAG.h      |   3 +
 .../llvm/CodeGen/SelectionDAGTargetInfo.h     |  48 +++++++
 llvm/include/llvm/CodeGen/TargetLowering.h    |   5 -
 llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt  |   1 +
 llvm/lib/CodeGen/SelectionDAG/SDNodeInfo.cpp  | 128 ++++++++++++++++++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |   8 +-
 .../SelectionDAG/SelectionDAGDumper.cpp       |   4 +
 .../SelectionDAG/SelectionDAGTargetInfo.cpp   |   2 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |  80 -----------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   4 -
 .../AArch64/AArch64SelectionDAGInfo.cpp       |  79 +++++++++++
 .../Target/AArch64/AArch64SelectionDAGInfo.h  |   3 +
 13 files changed, 387 insertions(+), 93 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/SDNodeInfo.h
 create mode 100644 llvm/lib/CodeGen/SelectionDAG/SDNodeInfo.cpp

diff --git a/llvm/include/llvm/CodeGen/SDNodeInfo.h b/llvm/include/llvm/CodeGen/SDNodeInfo.h
new file mode 100644
index 0000000000000..3992db31638b8
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/SDNodeInfo.h
@@ -0,0 +1,115 @@
+//==------------------------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SDNODEINFO_H
+#define LLVM_CODEGEN_SDNODEINFO_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringTable.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGenTypes/MachineValueType.h"
+
+namespace llvm {
+
+class SDNode;
+class SelectionDAG;
+
+enum SDNP {
+  SDNPHasChain,
+  SDNPOutGlue,
+  SDNPInGlue,
+  SDNPOptInGlue,
+  SDNPMemOperand,
+  SDNPVariadic,
+};
+
+enum SDTC : uint8_t {
+  SDTCisVT,
+  SDTCisPtrTy,
+  SDTCisInt,
+  SDTCisFP,
+  SDTCisVec,
+  SDTCisSameAs,
+  SDTCisVTSmallerThanOp,
+  SDTCisOpSmallerThanOp,
+  SDTCisEltOfVec,
+  SDTCisSubVecOfVec,
+  SDTCVecEltisVT,
+  SDTCisSameNumEltsAs,
+  SDTCisSameSizeAs,
+};
+
+enum SDNF {
+  SDNFIsStrictFP,
+};
+
+struct SDTypeConstraint {
+  SDTC Kind;
+  uint8_t OpNo;
+  uint8_t OtherOpNo;
+  MVT::SimpleValueType VT;
+};
+
+struct SDNodeDesc {
+  uint16_t NumResults;
+  int16_t NumOperands;
+  uint32_t Properties;
+  uint32_t Flags;
+  uint32_t TSFlags;
+  unsigned NameOffset;
+  unsigned ConstraintOffset;
+  unsigned ConstraintCount;
+
+  bool hasProperty(SDNP Property) const { return Properties & (1 << Property); }
+
+  bool hasFlag(SDNF Flag) const { return Flags & (1 << Flag); }
+};
+
+class SDNodeInfo final {
+  unsigned NumOpcodes;
+  const SDNodeDesc *Descs;
+  StringTable Names;
+  const SDTypeConstraint *Constraints;
+
+public:
+  constexpr SDNodeInfo(unsigned NumOpcodes, const SDNodeDesc *Descs,
+                       StringTable Names, const SDTypeConstraint *Constraints)
+      : NumOpcodes(NumOpcodes), Descs(Descs), Names(Names),
+        Constraints(Constraints) {}
+
+  /// Returns true if there is a generated description for a node with the
+  /// given target-specific opcode.
+  bool hasDesc(unsigned Opcode) const {
+    assert(Opcode >= ISD::BUILTIN_OP_END && "Expected target-specific opcode");
+    return Opcode < ISD::BUILTIN_OP_END + NumOpcodes;
+  }
+
+  /// Returns the description of a node with the given opcode.
+  const SDNodeDesc &getDesc(unsigned Opcode) const {
+    assert(hasDesc(Opcode));
+    return Descs[Opcode - ISD::BUILTIN_OP_END];
+  }
+
+  /// Returns operand constraints for a node with the given opcode.
+  ArrayRef<SDTypeConstraint> getConstraints(unsigned Opcode) const {
+    const SDNodeDesc &Desc = getDesc(Opcode);
+    return ArrayRef(&Constraints[Desc.ConstraintOffset], Desc.ConstraintCount);
+  }
+
+  /// Returns the name of the given target-specific opcode, suitable for
+  /// debug printing.
+  StringRef getName(unsigned Opcode) const {
+    return Names[getDesc(Opcode).NameOffset];
+  }
+
+  void verifyNode(const SelectionDAG &DAG, const SDNode *N) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_SDNODEINFO_H
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 15a2370e5d8b8..d06e2b19fa0b5 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2445,6 +2445,9 @@ class SelectionDAG {
                                   const SDLoc &DLoc);
 
 private:
+#ifndef NDEBUG
+  void verifyNode(SDNode *N) const;
+#endif
   void InsertNode(SDNode *N);
   bool RemoveNodeFromCSEMaps(SDNode *N);
   void AddModifiedNodeToCSEMaps(SDNode *N);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
index ef5ae5dba58de..463f0ec350d9c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -16,6 +16,7 @@
 #define LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H
 
 #include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/SDNodeInfo.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/Support/CodeGen.h"
 #include <utility>
@@ -35,6 +36,12 @@ class SelectionDAGTargetInfo {
   SelectionDAGTargetInfo &operator=(const SelectionDAGTargetInfo &) = delete;
   virtual ~SelectionDAGTargetInfo();
 
+  /// Returns the name of the given target-specific opcode, suitable for
+  /// debug printing.
+  virtual const char *getTargetNodeName(unsigned Opcode) const {
+    return nullptr;
+  }
+
   /// Returns true if a node with the given target-specific opcode has
   /// a memory operand. Nodes with such opcodes can only be created with
   /// `SelectionDAG::getMemIntrinsicNode`.
@@ -48,6 +55,10 @@ class SelectionDAGTargetInfo {
   /// may raise a floating-point exception.
   virtual bool mayRaiseFPException(unsigned Opcode) const;
 
+  /// Checks that the given target-specific node is valid. Aborts if it is not.
+  virtual void verifyTargetNode(const SelectionDAG &DAG,
+                                const SDNode *N) const {}
+
   /// Emit target-specific code that performs a memcpy.
   /// This can be used by targets to provide code sequences for cases
   /// that don't fit the target's parameters for simple loads/stores and can be
@@ -176,6 +187,43 @@ class SelectionDAGTargetInfo {
   }
 };
 
+/// Proxy class that targets should inherit from if they wish to use
+/// the generated node descriptions.
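+///
+/// A derived class is expected to look roughly like the following sketch,
+/// where FooGenSDNodeInfo stands in for the generated table a TableGen
+/// backend would emit for a hypothetical target named Foo:
+///
+///   class FooSelectionDAGInfo : public SelectionDAGGenTargetInfo {
+///   public:
+///     FooSelectionDAGInfo() : SelectionDAGGenTargetInfo(FooGenSDNodeInfo) {}
+///   };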
+class SelectionDAGGenTargetInfo : public SelectionDAGTargetInfo {
+protected:
+  const SDNodeInfo &GenNodeInfo;
+
+  explicit SelectionDAGGenTargetInfo(const SDNodeInfo &GenNodeInfo)
+      : GenNodeInfo(GenNodeInfo) {}
+
+public:
+  ~SelectionDAGGenTargetInfo() override;
+
+  const char *getTargetNodeName(unsigned Opcode) const override {
+    assert(GenNodeInfo.hasDesc(Opcode) &&
+           "The name should be provided by the derived class");
+    return GenNodeInfo.getName(Opcode).data();
+  }
+
+  bool isTargetMemoryOpcode(unsigned Opcode) const override {
+    if (GenNodeInfo.hasDesc(Opcode))
+      return GenNodeInfo.getDesc(Opcode).hasProperty(SDNPMemOperand);
+    return false;
+  }
+
+  bool isTargetStrictFPOpcode(unsigned Opcode) const override {
+    if (GenNodeInfo.hasDesc(Opcode))
+      return GenNodeInfo.getDesc(Opcode).hasFlag(SDNFIsStrictFP);
+    return false;
+  }
+
+  void verifyTargetNode(const SelectionDAG &DAG,
+                        const SDNode *N) const override {
+    if (GenNodeInfo.hasDesc(N->getOpcode()))
+      GenNodeInfo.verifyNode(DAG, N);
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 29bf1d467ae0e..16066226f1896 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4989,11 +4989,6 @@ class TargetLowering : public TargetLoweringBase {
   bool verifyReturnAddressArgumentIsConstant(SDValue Op,
                                              SelectionDAG &DAG) const;
 
-#ifndef NDEBUG
-  /// Check the given SDNode.  Aborts if it is invalid.
-  virtual void verifyTargetSDNode(const SDNode *N) const {};
-#endif
-
   //===--------------------------------------------------------------------===//
   // Inline Asm Support hooks
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt b/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
index cbfbfa3a321bc..93a742a19aa79 100644
--- a/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMSelectionDAG
   LegalizeVectorOps.cpp
   LegalizeVectorTypes.cpp
   ResourcePriorityQueue.cpp
+  SDNodeInfo.cpp
  ScheduleDAGFast.cpp
  ScheduleDAGRRList.cpp
  ScheduleDAGSDNodes.cpp
diff --git a/llvm/lib/CodeGen/SelectionDAG/SDNodeInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/SDNodeInfo.cpp
new file mode 100644
index 0000000000000..e3f6c98a9a90a
--- /dev/null
+++ b/llvm/lib/CodeGen/SelectionDAG/SDNodeInfo.cpp
@@ -0,0 +1,128 @@
+//==------------------------------------------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SDNodeInfo.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+using namespace llvm;
+
+static void reportNodeError(const SelectionDAG &DAG, const SDNode *N,
+                            const Twine &Msg) {
+  std::string S;
+  raw_string_ostream SS(S);
+  SS << "invalid node: " << Msg << '\n';
+  N->printrWithDepth(SS, &DAG, 2);
+  report_fatal_error(StringRef(S));
+}
+
+static void checkResultType(const SelectionDAG &DAG, const SDNode *N,
+                            unsigned ResIdx, EVT ExpectedVT) {
+  EVT ActualVT = N->getValueType(ResIdx);
+  if (ActualVT != ExpectedVT)
+    reportNodeError(
+        DAG, N,
+        "result #" + Twine(ResIdx) + " has invalid type; expected " +
+            ExpectedVT.getEVTString() + ", got " + ActualVT.getEVTString());
+}
+
+static void checkOperandType(const SelectionDAG &DAG, const SDNode *N,
+                             unsigned OpIdx, EVT ExpectedVT) {
+  EVT ActualVT = N->getOperand(OpIdx).getValueType();
+  if (ActualVT != ExpectedVT)
+    reportNodeError(
+        DAG, N,
+        "operand #" + Twine(OpIdx) + " has invalid type; expected " +
+            ExpectedVT.getEVTString() + ", got " + ActualVT.getEVTString());
+}
+
+void SDNodeInfo::verifyNode(const SelectionDAG &DAG, const SDNode *N) const {
+  const SDNodeDesc &Desc = getDesc(N->getOpcode());
+  bool HasChain = Desc.hasProperty(SDNPHasChain);
+  bool HasOutGlue = Desc.hasProperty(SDNPOutGlue);
+  bool HasInGlue = Desc.hasProperty(SDNPInGlue);
+  bool HasOptInGlue = Desc.hasProperty(SDNPOptInGlue);
+  bool IsVariadic = Desc.hasProperty(SDNPVariadic);
+
+  unsigned ActualNumResults = N->getNumValues();
+  unsigned ExpectedNumResults = Desc.NumResults + HasChain + HasOutGlue;
+
+  if (ActualNumResults != ExpectedNumResults)
+    reportNodeError(DAG, N,
+                    "invalid number of results; expected " +
+                        Twine(ExpectedNumResults) + ", got " +
+                        Twine(ActualNumResults));
+
+  // Chain result comes after all normal results.
+  if (HasChain) {
+    unsigned ChainResIdx = Desc.NumResults;
+    checkResultType(DAG, N, ChainResIdx, MVT::Other);
+  }
+
+  // Glue result comes last.
+  if (HasOutGlue) {
+    unsigned GlueResIdx = Desc.NumResults + HasChain;
+    checkResultType(DAG, N, GlueResIdx, MVT::Glue);
+  }
+
+  // In the most general case, the operands of a node go in the following order:
+  // chain, fix#0, ..., fix#M-1, var#0, ... var#N-1, glue
+  // If the number of operands is < 0, M can be any;
+  // If the node has SDNPVariadic property, N can be any.
+  bool HasOptionalOperands = Desc.NumOperands < 0 || IsVariadic;
+
+  unsigned ActualNumOperands = N->getNumOperands();
+  unsigned ExpectedMinNumOperands =
+      (Desc.NumOperands >= 0 ? Desc.NumOperands : 0) + HasChain + HasInGlue;
+
+  // Check the lower bound.
+  if (ActualNumOperands < ExpectedMinNumOperands) {
+    StringRef How = HasOptionalOperands ? "at least " : "";
+    reportNodeError(DAG, N,
+                    "invalid number of operands; expected " + How +
+                        Twine(ExpectedMinNumOperands) + ", got " +
+                        Twine(ActualNumOperands));
+  }
+
+  // Check the upper bound. We can only do this if the number of fixed operands
+  // is known and there are no variadic operands.
+  if (Desc.NumOperands >= 0 && !IsVariadic) {
+    // Account for optional input glue.
+    unsigned ExpectedMaxNumOperands = ExpectedMinNumOperands + HasOptInGlue;
+    if (ActualNumOperands > ExpectedMaxNumOperands) {
+      StringRef How = HasOptInGlue ? "at most " : "";
"at most " : ""; + reportNodeError(DAG, N, + "invalid number of operands; expected " + How + + Twine(ExpectedMaxNumOperands) + ", got " + + Twine(ActualNumOperands)); + } + } + + // Chain operand comes first. + if (HasChain) + checkOperandType(DAG, N, 0, MVT::Other); + + // Glue operand comes last. + if (HasInGlue) + checkOperandType(DAG, N, ActualNumOperands - 1, MVT::Glue); + if (HasOptInGlue && ActualNumOperands >= 1 && + N->getOperand(ActualNumOperands - 1).getValueType() == MVT::Glue) + HasInGlue = true; + + // Check variadic operands. These should be Register or RegisterMask. + if (IsVariadic && Desc.NumOperands >= 0) { + unsigned VarOpStart = HasChain + Desc.NumOperands; + unsigned VarOpEnd = ActualNumOperands - HasInGlue; + for (unsigned OpIdx = VarOpStart; OpIdx != VarOpEnd; ++OpIdx) { + unsigned OpOpcode = N->getOperand(OpIdx).getOpcode(); + if (OpOpcode != ISD::Register && OpOpcode != ISD::RegisterMask) + reportNodeError(DAG, N, + "variadic operand #" + Twine(OpIdx) + + " must be Register or RegisterMask"); + } + } +} diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 69548d0462318..f2777bbf247b0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1146,11 +1146,11 @@ void SelectionDAG::DeallocateNode(SDNode *N) { #ifndef NDEBUG /// VerifySDNode - Check the given SDNode. Aborts if it is invalid. -static void VerifySDNode(SDNode *N, const TargetLowering *TLI) { +void SelectionDAG::verifyNode(SDNode *N) const { switch (N->getOpcode()) { default: - if (N->getOpcode() > ISD::BUILTIN_OP_END) - TLI->verifyTargetSDNode(N); + if (N->isTargetOpcode()) + getSelectionDAGInfo().verifyTargetNode(*this, N); break; case ISD::BUILD_PAIR: { EVT VT = N->getValueType(0); @@ -1194,7 +1194,7 @@ void SelectionDAG::InsertNode(SDNode *N) { AllNodes.push_back(N); #ifndef NDEBUG N->PersistentId = NextPersistentId++; - VerifySDNode(N, TLI); + verifyNode(N); #endif for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next) DUL->NodeInserted(N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 64ecff8d71f98..958c070ed50de 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" @@ -67,6 +68,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "<>"; } if (G) { + const SelectionDAGTargetInfo &TSI = G->getSelectionDAGInfo(); + if (const char *Name = TSI.getTargetNodeName(getOpcode())) + return Name; const TargetLowering &TLI = G->getTargetLoweringInfo(); const char *Name = TLI.getTargetNodeName(getOpcode()); if (Name) return Name; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp index 0f3b36658f10a..f4422a15bf9df 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp @@ -16,6 +16,8 @@ using namespace llvm; SelectionDAGTargetInfo::~SelectionDAGTargetInfo() = default; +SelectionDAGGenTargetInfo::~SelectionDAGGenTargetInfo() = default; 
+
 bool SelectionDAGTargetInfo::mayRaiseFPException(unsigned Opcode) const {
   // FIXME: All target memory opcodes are currently automatically considered
   // to possibly raise FP exceptions. See rev. 63336795.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a1ba3922996a1..0534d2d546325 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30223,83 +30223,3 @@ bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
 
   return TargetLowering::isTypeDesirableForOp(Opc, VT);
 }
-
-#ifndef NDEBUG
-void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
-  switch (N->getOpcode()) {
-  default:
-    break;
-  case AArch64ISD::SADDWT:
-  case AArch64ISD::SADDWB:
-  case AArch64ISD::UADDWT:
-  case AArch64ISD::UADDWB: {
-    assert(N->getNumValues() == 1 && "Expected one result!");
-    assert(N->getNumOperands() == 2 && "Expected two operands!");
-    EVT VT = N->getValueType(0);
-    EVT Op0VT = N->getOperand(0).getValueType();
-    EVT Op1VT = N->getOperand(1).getValueType();
-    assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
-           VT.isInteger() && Op0VT.isInteger() && Op1VT.isInteger() &&
-           "Expected integer vectors!");
-    assert(VT == Op0VT &&
-           "Expected result and first input to have the same type!");
-    assert(Op0VT.getSizeInBits() == Op1VT.getSizeInBits() &&
-           "Expected vectors of equal size!");
-    assert(Op0VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount() &&
-           "Expected result vector and first input vector to have half the "
-           "lanes of the second input vector!");
-    break;
-  }
-  case AArch64ISD::SUNPKLO:
-  case AArch64ISD::SUNPKHI:
-  case AArch64ISD::UUNPKLO:
-  case AArch64ISD::UUNPKHI: {
-    assert(N->getNumValues() == 1 && "Expected one result!");
-    assert(N->getNumOperands() == 1 && "Expected one operand!");
-    EVT VT = N->getValueType(0);
-    EVT OpVT = N->getOperand(0).getValueType();
-    assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
-           VT.isInteger() && "Expected integer vectors!");
-    assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
-           "Expected vectors of equal size!");
-    assert(OpVT.getVectorElementCount() == VT.getVectorElementCount() * 2 &&
-           "Expected result vector with half the lanes of its input!");
-    break;
-  }
-  case AArch64ISD::TRN1:
-  case AArch64ISD::TRN2:
-  case AArch64ISD::UZP1:
-  case AArch64ISD::UZP2:
-  case AArch64ISD::ZIP1:
-  case AArch64ISD::ZIP2: {
-    assert(N->getNumValues() == 1 && "Expected one result!");
-    assert(N->getNumOperands() == 2 && "Expected two operands!");
-    EVT VT = N->getValueType(0);
-    EVT Op0VT = N->getOperand(0).getValueType();
-    EVT Op1VT = N->getOperand(1).getValueType();
-    assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
-           "Expected vectors!");
-    assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
-    break;
-  }
-  case AArch64ISD::RSHRNB_I: {
-    assert(N->getNumValues() == 1 && "Expected one result!");
-    assert(N->getNumOperands() == 2 && "Expected two operands!");
-    EVT VT = N->getValueType(0);
-    EVT Op0VT = N->getOperand(0).getValueType();
-    EVT Op1VT = N->getOperand(1).getValueType();
-    assert(VT.isVector() && VT.isInteger() &&
-           "Expected integer vector result type!");
-    assert(Op0VT.isVector() && Op0VT.isInteger() &&
-           "Expected first operand to be an integer vector!");
-    assert(VT.getSizeInBits() == Op0VT.getSizeInBits() &&
-           "Expected vectors of equal size!");
-    assert(VT.getVectorElementCount() == Op0VT.getVectorElementCount() * 2 &&
-           "Expected input vector with half the lanes of its result!");
-    assert(Op1VT == MVT::i32 && isa<ConstantSDNode>(N->getOperand(1)) &&
-           "Expected second operand to be a constant i32!");
-    break;
-  }
-  }
-}
-#endif
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ba275e18fa126..0d51ef2be8631 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1038,10 +1038,6 @@ class AArch64TargetLowering : public TargetLowering {
   /// True if stack clash protection is enabled for this functions.
   bool hasInlineStackProbe(const MachineFunction &MF) const override;
 
-#ifndef NDEBUG
-  void verifyTargetSDNode(const SDNode *N) const override;
-#endif
-
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 17adda15d9fc8..da2cd1ada3653 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -33,6 +33,85 @@ bool AArch64SelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const {
          Opcode <= AArch64ISD::LAST_STRICTFP_OPCODE;
 }
 
+void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
+                                               const SDNode *N) const {
+  switch (N->getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::SADDWT:
+  case AArch64ISD::SADDWB:
+  case AArch64ISD::UADDWT:
+  case AArch64ISD::UADDWB: {
+    assert(N->getNumValues() == 1 && "Expected one result!");
+    assert(N->getNumOperands() == 2 && "Expected two operands!");
+    EVT VT = N->getValueType(0);
+    EVT Op0VT = N->getOperand(0).getValueType();
+    EVT Op1VT = N->getOperand(1).getValueType();
+    assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
+           VT.isInteger() && Op0VT.isInteger() && Op1VT.isInteger() &&
+           "Expected integer vectors!");
+    assert(VT == Op0VT &&
+           "Expected result and first input to have the same type!");
+    assert(Op0VT.getSizeInBits() == Op1VT.getSizeInBits() &&
+           "Expected vectors of equal size!");
+    assert(Op0VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount() &&
+           "Expected result vector and first input vector to have half the "
+           "lanes of the second input vector!");
+    break;
+  }
+  case AArch64ISD::SUNPKLO:
+  case AArch64ISD::SUNPKHI:
+  case AArch64ISD::UUNPKLO:
+  case AArch64ISD::UUNPKHI: {
+    assert(N->getNumValues() == 1 && "Expected one result!");
+    assert(N->getNumOperands() == 1 && "Expected one operand!");
+    EVT VT = N->getValueType(0);
+    EVT OpVT = N->getOperand(0).getValueType();
+    assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
+           VT.isInteger() && "Expected integer vectors!");
+    assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
+           "Expected vectors of equal size!");
+    assert(OpVT.getVectorElementCount() == VT.getVectorElementCount() * 2 &&
+           "Expected result vector with half the lanes of its input!");
+    break;
+  }
+  case AArch64ISD::TRN1:
+  case AArch64ISD::TRN2:
+  case AArch64ISD::UZP1:
+  case AArch64ISD::UZP2:
+  case AArch64ISD::ZIP1:
+  case AArch64ISD::ZIP2: {
+    assert(N->getNumValues() == 1 && "Expected one result!");
+    assert(N->getNumOperands() == 2 && "Expected two operands!");
+    EVT VT = N->getValueType(0);
+    EVT Op0VT = N->getOperand(0).getValueType();
+    EVT Op1VT = N->getOperand(1).getValueType();
+    assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
+           "Expected vectors!");
+    assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
== Op1VT && "Expected matching vectors!"); + break; + } + case AArch64ISD::RSHRNB_I: { + assert(N->getNumValues() == 1 && "Expected one result!"); + assert(N->getNumOperands() == 2 && "Expected two operands!"); + EVT VT = N->getValueType(0); + EVT Op0VT = N->getOperand(0).getValueType(); + EVT Op1VT = N->getOperand(1).getValueType(); + assert(VT.isVector() && VT.isInteger() && + "Expected integer vector result type!"); + assert(Op0VT.isVector() && Op0VT.isInteger() && + "Expected first operand to be an integer vector!"); + assert(VT.getSizeInBits() == Op0VT.getSizeInBits() && + "Expected vectors of equal size!"); + assert(VT.getVectorElementCount() == Op0VT.getVectorElementCount() * 2 && + "Expected input vector with half the lanes of its result!"); + assert(Op1VT == MVT::i32 && isa(N->getOperand(1)) && + "Expected second operand to be a constant i32!"); + break; + } + } +} + SDValue AArch64SelectionDAGInfo::EmitMOPS(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue SrcOrValue, diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 7efe49c720655..9c11833b3f67e 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -23,6 +23,9 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { bool isTargetStrictFPOpcode(unsigned Opcode) const override; + void verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const override; + SDValue EmitMOPS(unsigned Opcode, SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue SrcOrValue, SDValue Size, Align Alignment, bool isVolatile, From 3e08dcd767a217fa91580704a378b37167e20f74 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 6 Apr 2025 13:00:14 +0200 Subject: [PATCH 0783/1029] [mlir][inliner] Move callback types from InlinerConfig -> InlinerInterface. NFC. The proper layering here is that Inliner depends on InlinerUtils, and not the other way round. Maybe it's time to give InliningUtils a less terrible file name. 
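As a rough illustration of the new spelling (not part of this change;
configureInliner is a hypothetical helper), a client would now name the
callback type via InlinerInterface and hand it to InlinerConfig. The body
below simply mirrors the default callback that InlinerConfig installs:

    #include "mlir/Transforms/Inliner.h"
    #include "mlir/Transforms/InliningUtils.h"

    using namespace mlir;

    // Hypothetical helper: install a custom clone callback, spelled via
    // the type's new home on InlinerInterface.
    static void configureInliner(InlinerConfig &config) {
      InlinerInterface::CloneCallbackTy callback =
          [](OpBuilder &builder, Region *src, Block *inlineBlock,
             Block *postInsertBlock, IRMapping &mapper,
             bool shouldCloneInlinedRegion) {
            // Clone or splice the source blocks after postInsertBlock,
            // exactly like the default callback.
            Region *insertRegion = inlineBlock->getParent();
            if (shouldCloneInlinedRegion)
              src->cloneInto(insertRegion, postInsertBlock->getIterator(),
                             mapper);
            else
              insertRegion->getBlocks().splice(postInsertBlock->getIterator(),
                                               src->getBlocks(), src->begin(),
                                               src->end());
          };
      config.setCloneCallback(std::move(callback));
    }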
--- mlir/include/mlir/Transforms/Inliner.h | 42 ++++++------ mlir/include/mlir/Transforms/InliningUtils.h | 17 +++-- mlir/lib/Transforms/Utils/InliningUtils.cpp | 65 +++++++++---------- .../lib/Transforms/TestInliningCallback.cpp | 3 +- 4 files changed, 64 insertions(+), 63 deletions(-) diff --git a/mlir/include/mlir/Transforms/Inliner.h b/mlir/include/mlir/Transforms/Inliner.h index 506b4455af646..0d3d3d1a3f9f2 100644 --- a/mlir/include/mlir/Transforms/Inliner.h +++ b/mlir/include/mlir/Transforms/Inliner.h @@ -17,6 +17,7 @@ #include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Pass/AnalysisManager.h" #include "mlir/Pass/PassManager.h" +#include "mlir/Transforms/InliningUtils.h" #include "llvm/ADT/StringMap.h" namespace mlir { @@ -27,11 +28,6 @@ class InlinerConfig { public: using DefaultPipelineTy = std::function; using OpPipelinesTy = llvm::StringMap; - using CloneCallbackSigTy = void(OpBuilder &builder, Region *src, - Block *inlineBlock, Block *postInsertBlock, - IRMapping &mapper, - bool shouldCloneInlinedRegion); - using CloneCallbackTy = std::function; InlinerConfig() = default; InlinerConfig(DefaultPipelineTy defaultPipeline, @@ -44,7 +40,9 @@ class InlinerConfig { } const OpPipelinesTy &getOpPipelines() const { return opPipelines; } unsigned getMaxInliningIterations() const { return maxInliningIterations; } - const CloneCallbackTy &getCloneCallback() const { return cloneCallback; } + const InlinerInterface::CloneCallbackTy &getCloneCallback() const { + return cloneCallback; + } bool getCanHandleMultipleBlocks() const { return canHandleMultipleBlocks; } void setDefaultPipeline(DefaultPipelineTy pipeline) { @@ -54,7 +52,7 @@ class InlinerConfig { opPipelines = std::move(pipelines); } void setMaxInliningIterations(unsigned max) { maxInliningIterations = max; } - void setCloneCallback(CloneCallbackTy callback) { + void setCloneCallback(InlinerInterface::CloneCallbackTy callback) { cloneCallback = std::move(callback); } void setCanHandleMultipleBlocks(bool value = true) { @@ -75,21 +73,21 @@ class InlinerConfig { /// when inlining within an SCC. unsigned maxInliningIterations{0}; /// Callback for cloning operations during inlining - CloneCallbackTy cloneCallback = [](OpBuilder &builder, Region *src, - Block *inlineBlock, Block *postInsertBlock, - IRMapping &mapper, - bool shouldCloneInlinedRegion) { - // Check to see if the region is being cloned, or moved inline. In - // either case, move the new blocks after the 'insertBlock' to improve - // IR readability. - Region *insertRegion = inlineBlock->getParent(); - if (shouldCloneInlinedRegion) - src->cloneInto(insertRegion, postInsertBlock->getIterator(), mapper); - else - insertRegion->getBlocks().splice(postInsertBlock->getIterator(), - src->getBlocks(), src->begin(), - src->end()); - }; + InlinerInterface::CloneCallbackTy cloneCallback = + [](OpBuilder &builder, Region *src, Block *inlineBlock, + Block *postInsertBlock, IRMapping &mapper, + bool shouldCloneInlinedRegion) { + // Check to see if the region is being cloned, or moved inline. In + // either case, move the new blocks after the 'insertBlock' to improve + // IR readability. 
+ Region *insertRegion = inlineBlock->getParent(); + if (shouldCloneInlinedRegion) + src->cloneInto(insertRegion, postInsertBlock->getIterator(), mapper); + else + insertRegion->getBlocks().splice(postInsertBlock->getIterator(), + src->getBlocks(), src->begin(), + src->end()); + }; /// Determine if the inliner can inline a function containing multiple /// blocks into a region that requires a single block. By default, it is /// not allowed. If it is true, cloneCallback should perform the extra diff --git a/mlir/include/mlir/Transforms/InliningUtils.h b/mlir/include/mlir/Transforms/InliningUtils.h index 552030983d724..ed6413d8cd44c 100644 --- a/mlir/include/mlir/Transforms/InliningUtils.h +++ b/mlir/include/mlir/Transforms/InliningUtils.h @@ -18,7 +18,6 @@ #include "mlir/IR/Location.h" #include "mlir/IR/Region.h" #include "mlir/IR/ValueRange.h" -#include "mlir/Transforms/Inliner.h" #include namespace mlir { @@ -192,6 +191,12 @@ class DialectInlinerInterface class InlinerInterface : public DialectInterfaceCollection { public: + using CloneCallbackSigTy = void(OpBuilder &builder, Region *src, + Block *inlineBlock, Block *postInsertBlock, + IRMapping &mapper, + bool shouldCloneInlinedRegion); + using CloneCallbackTy = std::function; + using Base::Base; /// Process a set of blocks that have been inlined. This callback is invoked @@ -256,14 +261,14 @@ class InlinerInterface /// region should be cloned into the 'inlinePoint' or spliced directly. LogicalResult inlineRegion(InlinerInterface &interface, - function_ref cloneCallback, + function_ref cloneCallback, Region *src, Operation *inlinePoint, IRMapping &mapper, ValueRange resultsToReplace, TypeRange regionResultTypes, std::optional inlineLoc = std::nullopt, bool shouldCloneInlinedRegion = true); LogicalResult inlineRegion(InlinerInterface &interface, - function_ref cloneCallback, + function_ref cloneCallback, Region *src, Block *inlineBlock, Block::iterator inlinePoint, IRMapping &mapper, ValueRange resultsToReplace, TypeRange regionResultTypes, @@ -275,14 +280,14 @@ inlineRegion(InlinerInterface &interface, /// in-favor of the region arguments when inlining. LogicalResult inlineRegion(InlinerInterface &interface, - function_ref cloneCallback, + function_ref cloneCallback, Region *src, Operation *inlinePoint, ValueRange inlinedOperands, ValueRange resultsToReplace, std::optional inlineLoc = std::nullopt, bool shouldCloneInlinedRegion = true); LogicalResult inlineRegion(InlinerInterface &interface, - function_ref cloneCallback, + function_ref cloneCallback, Region *src, Block *inlineBlock, Block::iterator inlinePoint, ValueRange inlinedOperands, ValueRange resultsToReplace, std::optional inlineLoc = std::nullopt, @@ -296,7 +301,7 @@ inlineRegion(InlinerInterface &interface, /// spliced directly. 
LogicalResult inlineCall(InlinerInterface &interface, - function_ref cloneCallback, + function_ref cloneCallback, CallOpInterface call, CallableOpInterface callable, Region *src, bool shouldCloneInlinedRegion = true); diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp index 3dd95d2845715..f654e962d631f 100644 --- a/mlir/lib/Transforms/Utils/InliningUtils.cpp +++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Transforms/InliningUtils.h" -#include "mlir/Transforms/Inliner.h" #include "mlir/IR/Builders.h" #include "mlir/IR/IRMapping.h" @@ -266,13 +265,13 @@ static void handleResultImpl(InlinerInterface &interface, OpBuilder &builder, } } -static LogicalResult -inlineRegionImpl(InlinerInterface &interface, - function_ref cloneCallback, - Region *src, Block *inlineBlock, Block::iterator inlinePoint, - IRMapping &mapper, ValueRange resultsToReplace, - TypeRange regionResultTypes, std::optional inlineLoc, - bool shouldCloneInlinedRegion, CallOpInterface call = {}) { +static LogicalResult inlineRegionImpl( + InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + IRMapping &mapper, ValueRange resultsToReplace, TypeRange regionResultTypes, + std::optional inlineLoc, bool shouldCloneInlinedRegion, + CallOpInterface call = {}) { assert(resultsToReplace.size() == regionResultTypes.size()); // We expect the region to have at least one block. if (src->empty()) @@ -369,13 +368,13 @@ inlineRegionImpl(InlinerInterface &interface, return success(); } -static LogicalResult -inlineRegionImpl(InlinerInterface &interface, - function_ref cloneCallback, - Region *src, Block *inlineBlock, Block::iterator inlinePoint, - ValueRange inlinedOperands, ValueRange resultsToReplace, - std::optional inlineLoc, - bool shouldCloneInlinedRegion, CallOpInterface call = {}) { +static LogicalResult inlineRegionImpl( + InlinerInterface &interface, + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + ValueRange inlinedOperands, ValueRange resultsToReplace, + std::optional inlineLoc, bool shouldCloneInlinedRegion, + CallOpInterface call = {}) { // We expect the region to have at least one block. 
if (src->empty()) return failure(); @@ -404,10 +403,10 @@ inlineRegionImpl(InlinerInterface &interface, LogicalResult mlir::inlineRegion( InlinerInterface &interface, - function_ref cloneCallback, Region *src, - Operation *inlinePoint, IRMapping &mapper, ValueRange resultsToReplace, - TypeRange regionResultTypes, std::optional inlineLoc, - bool shouldCloneInlinedRegion) { + function_ref cloneCallback, + Region *src, Operation *inlinePoint, IRMapping &mapper, + ValueRange resultsToReplace, TypeRange regionResultTypes, + std::optional inlineLoc, bool shouldCloneInlinedRegion) { return inlineRegion(interface, cloneCallback, src, inlinePoint->getBlock(), ++inlinePoint->getIterator(), mapper, resultsToReplace, regionResultTypes, inlineLoc, shouldCloneInlinedRegion); @@ -415,9 +414,9 @@ LogicalResult mlir::inlineRegion( LogicalResult mlir::inlineRegion( InlinerInterface &interface, - function_ref cloneCallback, Region *src, - Block *inlineBlock, Block::iterator inlinePoint, IRMapping &mapper, - ValueRange resultsToReplace, TypeRange regionResultTypes, + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + IRMapping &mapper, ValueRange resultsToReplace, TypeRange regionResultTypes, std::optional inlineLoc, bool shouldCloneInlinedRegion) { return inlineRegionImpl( interface, cloneCallback, src, inlineBlock, inlinePoint, mapper, @@ -426,8 +425,8 @@ LogicalResult mlir::inlineRegion( LogicalResult mlir::inlineRegion( InlinerInterface &interface, - function_ref cloneCallback, Region *src, - Operation *inlinePoint, ValueRange inlinedOperands, + function_ref cloneCallback, + Region *src, Operation *inlinePoint, ValueRange inlinedOperands, ValueRange resultsToReplace, std::optional inlineLoc, bool shouldCloneInlinedRegion) { return inlineRegion(interface, cloneCallback, src, inlinePoint->getBlock(), @@ -437,10 +436,10 @@ LogicalResult mlir::inlineRegion( LogicalResult mlir::inlineRegion( InlinerInterface &interface, - function_ref cloneCallback, Region *src, - Block *inlineBlock, Block::iterator inlinePoint, ValueRange inlinedOperands, - ValueRange resultsToReplace, std::optional inlineLoc, - bool shouldCloneInlinedRegion) { + function_ref cloneCallback, + Region *src, Block *inlineBlock, Block::iterator inlinePoint, + ValueRange inlinedOperands, ValueRange resultsToReplace, + std::optional inlineLoc, bool shouldCloneInlinedRegion) { return inlineRegionImpl(interface, cloneCallback, src, inlineBlock, inlinePoint, inlinedOperands, resultsToReplace, inlineLoc, shouldCloneInlinedRegion); @@ -474,11 +473,11 @@ static Value materializeConversion(const DialectInlinerInterface *interface, /// failure, no changes are made to the module. 'shouldCloneInlinedRegion' /// corresponds to whether the source region should be cloned into the 'call' or /// spliced directly. -LogicalResult -mlir::inlineCall(InlinerInterface &interface, - function_ref cloneCallback, - CallOpInterface call, CallableOpInterface callable, - Region *src, bool shouldCloneInlinedRegion) { +LogicalResult mlir::inlineCall( + InlinerInterface &interface, + function_ref cloneCallback, + CallOpInterface call, CallableOpInterface callable, Region *src, + bool shouldCloneInlinedRegion) { // We expect the region to have at least one block. 
  if (src->empty())
    return failure();
diff --git a/mlir/test/lib/Transforms/TestInliningCallback.cpp b/mlir/test/lib/Transforms/TestInliningCallback.cpp
index 012d62b7b1b42..c518f3f4ac85f 100644
--- a/mlir/test/lib/Transforms/TestInliningCallback.cpp
+++ b/mlir/test/lib/Transforms/TestInliningCallback.cpp
@@ -61,8 +61,7 @@ struct InlinerCallback
       src->cloneInto(&region, mapper);
 
       // Split block before scf operation.
-      Block *continueBlock =
-          inlineBlock->splitBlock(executeRegionOp.getOperation());
+      inlineBlock->splitBlock(executeRegionOp.getOperation());
 
       // Replace all test.return with scf.yield
       for (mlir::Block &block : region) {

From 283a78a088bc669f31d8a3567265b1a3ab129487 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sun, 6 Apr 2025 12:18:08 +0100
Subject: [PATCH 0784/1029] Reapply "[LV] Don't add blocks to loop in
 GeneratedRTChecks (NFC)."

This reverts commit 46a2f4174a051f29a09dbc3844df763571c67309.

Recommits 2fd6f8fb5e3a with corresponding VPlan change to ensure
LoopInfo is updated for all blocks during VPlan execution if needed.

---
 .../Transforms/Vectorize/LoopVectorize.cpp |  6 -----
 llvm/lib/Transforms/Vectorize/VPlan.cpp    | 24 ++++++++++---------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cc6fd790bc437..585caaffa63da 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2082,9 +2082,6 @@ class GeneratedRTChecks {
     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
-    // Create new preheader for vector loop.
-    if (OuterLoop)
-      OuterLoop->addBasicBlockToLoop(SCEVCheckBlock, *LI);
 
     SCEVCheckBlock->getTerminator()->eraseFromParent();
     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
@@ -2122,9 +2119,6 @@ class GeneratedRTChecks {
     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
     MemCheckBlock->moveBefore(LoopVectorPreHeader);
 
-    if (OuterLoop)
-      OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
-
     BranchInst &BI =
         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
     if (AddBranchWeights) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index a6e807b616ce5..bc3957f573d82 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -432,6 +432,19 @@ BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) {
 void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
   auto &CFG = State.CFG;
   BasicBlock *NewBB = CFG.VPBB2IRBB[this];
+
+  // Register NewBB in its loop. In innermost loops its the same for all
+  // BB's.
+  Loop *ParentLoop = State.CurrentParentLoop;
+  // If this block has a sole successor that is an exit block then it needs
+  // adding to the same parent loop as the exit block.
+  VPBlockBase *SuccVPBB = getSingleSuccessor();
+  if (SuccVPBB && State.Plan->isExitBlock(SuccVPBB))
+    ParentLoop =
+        State.LI->getLoopFor(cast<VPIRBasicBlock>(SuccVPBB)->getIRBasicBlock());
+  if (ParentLoop && !State.LI->getLoopFor(NewBB))
+    ParentLoop->addBasicBlockToLoop(NewBB, *State.LI);
+
   // Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock(); @@ -517,17 +530,6 @@ void VPBasicBlock::execute(VPTransformState *State) { State->Builder.SetInsertPoint(NewBB); // Temporarily terminate with unreachable until CFG is rewired. UnreachableInst *Terminator = State->Builder.CreateUnreachable(); - // Register NewBB in its loop. In innermost loops its the same for all - // BB's. - Loop *ParentLoop = State->CurrentParentLoop; - // If this block has a sole successor that is an exit block then it needs - // adding to the same parent loop as the exit block. - VPBlockBase *SuccVPBB = getSingleSuccessor(); - if (SuccVPBB && State->Plan->isExitBlock(SuccVPBB)) - ParentLoop = State->LI->getLoopFor( - cast(SuccVPBB)->getIRBasicBlock()); - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewBB, *State->LI); State->Builder.SetInsertPoint(Terminator); State->CFG.PrevBB = NewBB; From ba3fa39b63d7185f7d067f35a39c2fea40ee8861 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 6 Apr 2025 12:20:30 +0100 Subject: [PATCH 0785/1029] [EarlyCSE] Re-generate checks for intrinsics.ll. --- .../Transforms/EarlyCSE/AArch64/intrinsics.ll | 159 ++++++++++++++++-- 1 file changed, 144 insertions(+), 15 deletions(-) diff --git a/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll b/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll index 69a503176aedf..94b17510bb95d 100644 --- a/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll +++ b/llvm/test/Transforms/EarlyCSE/AArch64/intrinsics.ll @@ -1,11 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse -earlycse-debug-hash | FileCheck %s ; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes='early-cse' | FileCheck %s define <4 x i32> @test_cse(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) { +; CHECK-LABEL: define <4 x i32> @test_cse( +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0 +; CHECK-NEXT: [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] +; CHECK-NEXT: [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP2]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], 1 +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]]) +; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]]) +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: 
ret <4 x i32> [[RES_0]] +; entry: ; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE. -; CHECK-LABEL: @test_cse -; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0 %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0 %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1 br label %for.cond @@ -34,11 +56,32 @@ for.end: ; preds = %for.cond } define <4 x i32> @test_cse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) { +; CHECK-LABEL: define <4 x i32> @test_cse2( +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0 +; CHECK-NEXT: [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] +; CHECK-NEXT: [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8> +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], ptr [[A]]) +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <4 x i32>, <4 x i32> } poison, <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertvalue { <4 x i32>, <4 x i32> } [[TMP2]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], 1 +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]]) +; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]]) +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret <4 x i32> [[RES_0]] +; entry: ; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE. 
-; CHECK-LABEL: @test_cse2 -; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %3, <4 x i32> %3, ptr %0) -; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, ptr %a) %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0 %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1 br label %for.cond @@ -68,11 +111,26 @@ for.end: ; preds = %for.cond } define <4 x i32> @test_cse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 { +; CHECK-LABEL: define <4 x i32> @test_cse3( +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] +; CHECK-NEXT: [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]]) +; CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0 +; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD2_FCA_0_EXTRACT]], <4 x i32> [[VLD2_FCA_0_EXTRACT]]) +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret <4 x i32> [[RES_0]] +; entry: ; Check that the first @llvm.aarch64.neon.ld2 is optimized away by Early CSE. -; CHECK-LABEL: @test_cse3 -; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0 -; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0 %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0 %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1 br label %for.cond @@ -100,11 +158,33 @@ for.end: ; preds = %for.cond define <4 x i32> @test_nocse(ptr %a, ptr %b, [2 x <4 x i32>] %s.coerce, i32 %n) { +; CHECK-LABEL: define <4 x i32> @test_nocse( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0 +; CHECK-NEXT: [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] +; CHECK-NEXT: [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8> +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]]) +; CHECK-NEXT: store i32 0, ptr [[B]], align 4 +; CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]]) +; CHECK-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0 +; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD2_FCA_0_EXTRACT]], <4 x 
i32> [[VLD2_FCA_0_EXTRACT]]) +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret <4 x i32> [[RES_0]] +; entry: ; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized ; away by Early CSE. -; CHECK-LABEL: @test_nocse -; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0 %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0 %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1 br label %for.cond @@ -134,11 +214,33 @@ for.end: ; preds = %for.cond } define <4 x i32> @test_nocse2(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) { +; CHECK-LABEL: define <4 x i32> @test_nocse2( +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0 +; CHECK-NEXT: [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] +; CHECK-NEXT: [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8> +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_1_EXTRACT]], ptr [[A]]) +; CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]]) +; CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0 +; CHECK-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2 +; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD3_FCA_0_EXTRACT]], <4 x i32> [[VLD3_FCA_2_EXTRACT]]) +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret <4 x i32> [[RES_0]] +; entry: ; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due ; to mismatch between st2 and ld3. 
-; CHECK-LABEL: @test_nocse2 -; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0 %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0 %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1 br label %for.cond @@ -167,12 +269,33 @@ for.end: ; preds = %for.cond } define <4 x i32> @test_nocse3(ptr %a, [2 x <4 x i32>] %s.coerce, i32 %n) { +; CHECK-LABEL: define <4 x i32> @test_nocse3( +; CHECK-SAME: ptr [[A:%.*]], [2 x <4 x i32>] [[S_COERCE:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[S_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 0 +; CHECK-NEXT: [[S_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[S_COERCE]], 1 +; CHECK-NEXT: br label %[[FOR_COND:.*]] +; CHECK: [[FOR_COND]]: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[INC:%.*]], %[[FOR_BODY:.*]] ] +; CHECK-NEXT: [[RES_0:%.*]] = phi <4 x i32> [ undef, %[[ENTRY]] ], [ [[CALL:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_0_EXTRACT]] to <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[S_COERCE_FCA_1_EXTRACT]] to <16 x i8> +; CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[S_COERCE_FCA_1_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], ptr [[A]]) +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[S_COERCE_FCA_0_EXTRACT]], <4 x i32> [[S_COERCE_FCA_0_EXTRACT]], ptr [[A]]) +; CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]]) +; CHECK-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0 +; CHECK-NEXT: [[CALL]] = call <4 x i32> @vaddq_s32(<4 x i32> [[VLD3_FCA_0_EXTRACT]], <4 x i32> [[VLD3_FCA_0_EXTRACT]]) +; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label %[[FOR_COND]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret <4 x i32> [[RES_0]] +; entry: ; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to ; mismatch between st2 and st3. -; CHECK-LABEL: @test_nocse3 -; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0 -; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0 %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0 %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1 br label %for.cond @@ -214,6 +337,12 @@ declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr) declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr) define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) { +; CHECK-LABEL: define internal fastcc <4 x i32> @vaddq_s32( +; CHECK-SAME: <4 x i32> [[__P0:%.*]], <4 x i32> [[__P1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[__P0]], [[__P1]] +; CHECK-NEXT: ret <4 x i32> [[ADD]] +; entry: %add = add <4 x i32> %__p0, %__p1 ret <4 x i32> %add From 0defd832eb7c0618a67556e6fcbd32dd19e88b97 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 6 Apr 2025 14:55:29 +0200 Subject: [PATCH 0786/1029] [AArch64] Avoid unused variable warnings in release builds This used to be under !NDEBUG before 0a1742708ddc3a2b31d65479aaad143b5f7562b2, so just put that back. The code only consists of assertions. 
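For reference, a minimal standalone sketch of the warning class this avoids
(illustrative, not from this patch): with -DNDEBUG, assert() expands to
nothing, so a local computed only for an assertion becomes unused in release
builds and trips -Wunused-variable unless the computation is compiled out too:

    #include <cassert>

    void checkWidth(unsigned Bits) {
    #ifndef NDEBUG
      // Only read by the assertion; without the guard this variable would
      // be unused (and warn) in release builds where assert() is a no-op.
      unsigned Bytes = Bits / 8;
      assert(Bytes * 8 == Bits && "Expected a whole number of bytes!");
    #endif
    }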
--- llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index da2cd1ada3653..2273e1c0ffa6e 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -35,6 +35,7 @@ bool AArch64SelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, const SDNode *N) const { +#ifndef NDEBUG switch (N->getOpcode()) { default: break; @@ -110,6 +111,7 @@ void AArch64SelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, break; } } +#endif } SDValue AArch64SelectionDAGInfo::EmitMOPS(unsigned Opcode, SelectionDAG &DAG, From 449e2f5d66d9d99f58611a8778d688e14d051ca0 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 6 Apr 2025 14:35:21 +0100 Subject: [PATCH 0787/1029] [LV] Remove more DT updates from legacy code path (NFCI). Remove some legacy DT updates. Those should already be handled when updating the DT during VPlan execution. --- .../Transforms/Vectorize/LoopVectorize.cpp | 26 ++++--------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 585caaffa63da..cf7804e19e722 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2088,9 +2088,6 @@ class GeneratedRTChecks { Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, SCEVCheckBlock); - DT->addNewBlock(SCEVCheckBlock, Pred); - DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); - BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond); if (AddBranchWeights) @@ -2115,8 +2112,6 @@ class GeneratedRTChecks { Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader, MemCheckBlock); - DT->addNewBlock(MemCheckBlock, Pred); - DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); MemCheckBlock->moveBefore(LoopVectorPreHeader); BranchInst &BI = @@ -2556,13 +2551,9 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { } // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); - - assert(DT->properlyDominates(DT->getNode(TCCheckBlock), - DT->getNode(Bypass)->getIDom()) && - "TC check is expected to dominate Bypass"); + LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), + static_cast(nullptr), LI, + nullptr, "vector.ph"); BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters); @@ -7957,13 +7948,10 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, // Create new preheader for vector loop. 
LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), - DT, LI, nullptr, "vector.ph"); + static_cast(nullptr), LI, + nullptr, "vector.ph"); if (ForEpilogue) { - assert(DT->properlyDominates(DT->getNode(TCCheckBlock), - DT->getNode(Bypass)->getIDom()) && - "TC check is expected to dominate Bypass"); - LoopBypassBlocks.push_back(TCCheckBlock); // Save the trip count so we don't have to regenerate it in the @@ -8067,10 +8055,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( assert(EPI.TripCount && "Expected trip count to have been saved in the first pass."); - assert( - (!isa(EPI.TripCount) || - DT->dominates(cast(EPI.TripCount)->getParent(), Insert)) && - "saved trip count does not dominate insertion point."); Value *TC = EPI.TripCount; IRBuilder<> Builder(Insert->getTerminator()); Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); From 48441cb8a2fa3b3f9502ba4ba1242746615841cb Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 6 Apr 2025 17:36:08 +0100 Subject: [PATCH 0788/1029] [Matrix] Properly set Changed status when optimizing transposes. Currently Changed is not updated properly when transposes are optimized, causing missing analysis invalidation. Update optimizeTransposes to indicate if changes have been made. --- .../Scalar/LowerMatrixIntrinsics.cpp | 27 +++++++++++++------ .../analysis-invalidation.ll | 17 ++++++++++++ 2 files changed, 36 insertions(+), 8 deletions(-) create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/analysis-invalidation.ll diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 8a30a3e8d22e2..ab16ec77be105 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -792,7 +792,8 @@ class LowerMatrixIntrinsics { /// This creates and erases instructions as needed, and returns the newly /// created instruction while updating the iterator to avoid invalidation. If /// this returns nullptr, no new instruction was created. - Instruction *sinkTranspose(Instruction &I, BasicBlock::reverse_iterator &II) { + Instruction *sinkTranspose(Instruction &I, BasicBlock::reverse_iterator &II, + bool &Changed) { BasicBlock &BB = *I.getParent(); IRBuilder<> IB(&I); MatrixBuilder Builder(IB); @@ -809,6 +810,7 @@ class LowerMatrixIntrinsics { updateShapeAndReplaceAllUsesWith(I, TATA); eraseFromParentAndMove(&I, II, BB); eraseFromParentAndMove(TA, II, BB); + Changed = true; return nullptr; } @@ -816,6 +818,7 @@ class LowerMatrixIntrinsics { if (isSplat(TA)) { updateShapeAndReplaceAllUsesWith(I, TA); eraseFromParentAndMove(&I, II, BB); + Changed = true; return nullptr; } @@ -834,6 +837,7 @@ class LowerMatrixIntrinsics { updateShapeAndReplaceAllUsesWith(I, NewInst); eraseFromParentAndMove(&I, II, BB); eraseFromParentAndMove(TA, II, BB); + Changed = true; return NewInst; } @@ -859,6 +863,7 @@ class LowerMatrixIntrinsics { updateShapeAndReplaceAllUsesWith(I, NewInst); eraseFromParentAndMove(&I, II, BB); eraseFromParentAndMove(TA, II, BB); + Changed = true; return NewInst; } @@ -880,13 +885,14 @@ class LowerMatrixIntrinsics { updateShapeAndReplaceAllUsesWith(I, NewInst); eraseFromParentAndMove(&I, II, BB); eraseFromParentAndMove(TA, II, BB); + Changed = true; return NewInst; } return nullptr; } - void liftTranspose(Instruction &I) { + bool liftTranspose(Instruction &I) { // Erase dead Instructions after lifting transposes from binops. 
auto CleanupBinOp = [this](Instruction &T, Value *A, Value *B) { if (T.use_empty()) @@ -914,6 +920,7 @@ class LowerMatrixIntrinsics { R->getZExtValue()); updateShapeAndReplaceAllUsesWith(I, NewInst); CleanupBinOp(I, A, B); + return true; } // A^t + B ^t -> (A + B)^t. Pick rows and columns from first transpose. If // the shape of the second transpose is different, there's a shape conflict @@ -940,11 +947,14 @@ class LowerMatrixIntrinsics { ShapeMap[AddI] && "Shape of updated addition doesn't match cached shape."); } + return true; } + return false; } /// Try moving transposes in order to fold them away or into multiplies. - void optimizeTransposes() { + bool optimizeTransposes() { + bool Changed = false; // First sink all transposes inside matmuls and adds, hoping that we end up // with NN, NT or TN variants. for (BasicBlock &BB : reverse(Func)) { @@ -952,7 +962,7 @@ class LowerMatrixIntrinsics { Instruction &I = *II; // We may remove II. By default continue on the next/prev instruction. ++II; - if (Instruction *NewInst = sinkTranspose(I, II)) + if (Instruction *NewInst = sinkTranspose(I, II, Changed)) II = std::next(BasicBlock::reverse_iterator(NewInst)); } } @@ -961,9 +971,10 @@ class LowerMatrixIntrinsics { // to fold into consuming multiply or add. for (BasicBlock &BB : Func) { for (Instruction &I : llvm::make_early_inc_range(BB)) { - liftTranspose(I); + Changed |= liftTranspose(I); } } + return Changed; } bool Visit() { @@ -1006,15 +1017,15 @@ class LowerMatrixIntrinsics { WorkList = propagateShapeBackward(WorkList); } + bool Changed = false; if (!isMinimal()) { - optimizeTransposes(); + Changed |= optimizeTransposes(); if (PrintAfterTransposeOpt) { dbgs() << "Dump after matrix transpose optimization:\n"; Func.print(dbgs()); } } - bool Changed = false; SmallVector MaybeFusableInsts; SmallVector MatrixInsts; SmallVector LifetimeEnds; @@ -1043,7 +1054,7 @@ class LowerMatrixIntrinsics { if (!FusedInsts.contains(CI)) LowerMatrixMultiplyFused(CI, FusedInsts, LifetimeEnds); - Changed = !FusedInsts.empty(); + Changed |= !FusedInsts.empty(); // Fourth, lower remaining instructions with shape information. 
     for (Instruction *Inst : MatrixInsts) {
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/analysis-invalidation.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/analysis-invalidation.ll
new file mode 100644
index 0000000000000..a747328a71e7a
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/analysis-invalidation.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p lower-matrix-intrinsics -verify-analysis-invalidation -S %s | FileCheck %s
+
+define <3 x float> @splat_transpose(<3 x float> %in) {
+; CHECK-LABEL: define <3 x float> @splat_transpose(
+; CHECK-SAME: <3 x float> [[IN:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <3 x float> [[IN]], <3 x float> zeroinitializer, <3 x i32> zeroinitializer
+; CHECK-NEXT:    ret <3 x float> [[SPLAT]]
+;
+entry:
+  %splat = shufflevector <3 x float> %in, <3 x float> zeroinitializer, <3 x i32> zeroinitializer
+  %r = tail call <3 x float> @llvm.matrix.transpose.v3f32(<3 x float> %splat, i32 3, i32 1)
+  ret <3 x float> %r
+}
+
+declare <3 x float> @llvm.matrix.transpose.v3f32(<3 x float>, i32 immarg, i32 immarg)

From eb70253fcbe57c7cb3c309c06b94b05f2eab314d Mon Sep 17 00:00:00 2001
From: Sergei Barannikov
Date: Sun, 6 Apr 2025 20:08:52 +0300
Subject: [PATCH 0789/1029] [LangRef/DataLayout] Spell out requirements for
 alignment values (#104705)

For 'p' the added wording matches the implementation.

For 'i', 'f', 'v' the implementation also allows 0 for `<pref>`
component, making 'i16:16:0' valid, for example. 'Fi0', 'Fn0' and 'S0'
are also currently accepted. This is likely unintentional. There are no
tests in the codebase that rely on this behavior, so the added wording
prohibits zero alignments for these specifications.

For 'a', the implementation currently allows 0 for both `<abi>` and
`<pref>` components. The added wording prohibits specifying zero for
`<pref>` with the same justification as above. Zero `<abi>` is used in
tests, and the example at the end of the section suggests that this is
valid, so that's left unchanged.

This effectively prohibits zero alignments everywhere except for the
`<abi>` component of aggregate specification.

---
 llvm/docs/LangRef.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index d242c945816cc..d462609fa0c52 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3109,8 +3109,7 @@ as follows:
 ``S<size>``
     Specifies the natural alignment of the stack in bits. Alignment
     promotion of stack variables is limited to the natural stack
-    alignment to avoid dynamic stack realignment. The stack alignment
-    must be a multiple of 8-bits. If omitted, the natural stack
+    alignment to avoid dynamic stack realignment. If omitted, the natural stack
    alignment defaults to "unspecified", which does not prevent any
    alignment promotions.
 ``P<address space>``
     Defaults to the default address space of 0.
 ``p[n]:<size>:<abi>[:<pref>][:<idx>]``
     This specifies the *size* of a pointer and its ``<abi>`` and
-    ``<pref>``\erred alignments for address space ``n``. ``<pref>`` is optional
-    and defaults to ``<abi>``. The fourth parameter ``<idx>`` is the size of the
+    ``<pref>``\erred alignments for address space ``n``.
+    The fourth parameter ``<idx>`` is the size of the
     index that used for address calculation, which must be less than or equal
     to the pointer size. If not specified, the default index size is equal to
     the pointer size. All sizes
@@ -3147,23 +3146,21 @@ as follows:
 ``i<size>:<abi>[:<pref>]``
     This specifies the alignment for an integer type of a given bit
     ``<size>``. The value of ``<size>`` must be in the range [1,2^24).
-    ``<pref>`` is optional and defaults to ``<abi>``.
     For ``i8``, the ``<abi>`` value must equal 8, that is, ``i8`` must
     be naturally aligned.
 ``v<size>:<abi>[:<pref>]``
     This specifies the alignment for a vector type of a given bit
     ``<size>``. The value of ``<size>`` must be in the range [1,2^24).
-    ``<pref>`` is optional and defaults to ``<abi>``.
 ``f<size>:<abi>[:<pref>]``
     This specifies the alignment for a floating-point type of a given bit
     ``<size>``. Only values of ``<size>`` that are supported by the target
     will work. 32 (float) and 64 (double) are supported on all targets; 80
     or 128 (different flavors of long double) are also supported on some
     targets. The value of ``<size>`` must be in the range [1,2^24).
-    ``<pref>`` is optional and defaults to ``<abi>``.
 ``a:<abi>[:<pref>]``
     This specifies the alignment for an object of aggregate type.
-    ``<pref>`` is optional and defaults to ``<abi>``.
+    In addition to the usual requirements for alignment values,
+    the value of ``<abi>`` can also be zero, which means one byte alignment.
 ``F<type><abi>``
     This specifies the alignment for function pointers.
     The options for ``<type>`` are:
@@ -3202,6 +3199,9 @@ as follows:
     as :ref:`Non-Integral Pointer Type <nointptrtype>` s. The ``0``
     address space cannot be specified as non-integral.
 
+Unless explicitly stated otherwise, on every specification that specifies
+an alignment, the value of the alignment must be in the range [1,2^16)
+and must be a power of two times the width of a byte.
 On every specification that takes a ``<abi>:<pref>``, specifying the
 ``<pref>`` alignment is optional. If omitted, the preceding ``:`` should be
 omitted too and ``<pref>`` will be equal to ``<abi>``.

From 976de53f1008a9e06c94a99fdd59d0f523feaef7 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 6 Apr 2025 13:07:40 -0700
Subject: [PATCH 0790/1029] [MC,MachO] Replace SectionAddrMap workaround with
 cleaner variable handling

Mach-O's ARM and X86 writers use MCExpr's `SectionAddrMap *Addrs`
argument to compute label differences, which was a bit of a hack.
The AArch64MachObjectWriter does this better by using `getSymbolAddress`
in its `recordRelocation` function.

This commit:

1. Moves the `SectionAddrMap` logic into the Mach-O code, removing the
   workaround.
2. Fixes a bug in `MachObjectWriter::getSymbolAddress` where it failed
   to subtract the `SymB` value. This bug has been present since commit
   b200f93125eb019d69c220fa447faea4f5d4eb8a (2011).
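To make fix (2) concrete, a sketch of the corrected arithmetic (restating the
diff below, with an assumed layout purely for illustration): a variable symbol
defined as A - B + C must evaluate to addr(A) - addr(B) + C. With A at 0x100,
B at 0x40, and C = 8, the result is 0xC8; the old '+=' produced 0x148 instead.

    // Inside MachObjectWriter::getSymbolAddress, for a variable symbol
    // whose value evaluates to A - B + C:
    uint64_t Address = Target.getConstant();                   // + C
    if (Target.getAddSym())
      Address += getSymbolAddress(*Target.getAddSym(), Asm);   // + addr(A)
    if (Target.getSubSym())
      Address -= getSymbolAddress(*Target.getSubSym(), Asm);   // was '+=' before the fix
    return Address;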
--- llvm/lib/MC/MachObjectWriter.cpp | 2 +- .../ARM/MCTargetDesc/ARMMachObjectWriter.cpp | 14 +++++++-- .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 29 +++++++++---------- 3 files changed, 25 insertions(+), 20 deletions(-) diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp index 9e6e3f5e28e54..7fac6c78bc1a8 100644 --- a/llvm/lib/MC/MachObjectWriter.cpp +++ b/llvm/lib/MC/MachObjectWriter.cpp @@ -120,7 +120,7 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbol &S, if (Target.getAddSym()) Address += getSymbolAddress(*Target.getAddSym(), Asm); if (Target.getSubSym()) - Address += getSymbolAddress(*Target.getSubSym(), Asm); + Address -= getSymbolAddress(*Target.getSubSym(), Asm); return Address; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 03ad4149f766b..009607eb357f1 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -416,9 +416,17 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, } else { // Resolve constant variables. if (A->isVariable()) { - int64_t Res; - if (A->getVariableValue()->evaluateAsAbsolute( - Res, Asm, Writer->getSectionAddressMap())) { + MCValue Val; + bool Relocatable = + A->getVariableValue()->evaluateAsRelocatable(Val, &Asm); + int64_t Res = Val.getConstant(); + bool isAbs = Val.isAbsolute(); + if (Relocatable && Val.getAddSym() && Val.getSubSym()) { + Res += Writer->getSymbolAddress(*Val.getAddSym(), Asm) - + Writer->getSymbolAddress(*Val.getSubSym(), Asm); + isAbs = true; + } + if (isAbs) { FixedValue = Res; return; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index f2055d29ce7e0..10df56a05c4e4 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -246,19 +246,8 @@ void X86MachObjectWriter::RecordX86_64Relocation( if (IsPCRel) Value -= FixupAddress + (1 << Log2Size); } else if (Symbol->isVariable()) { - const MCExpr *Value = Symbol->getVariableValue(); - int64_t Res; - bool isAbs = - Value->evaluateAsAbsolute(Res, Asm, Writer->getSectionAddressMap()); - if (isAbs) { - FixedValue = Res; - return; - } else { - Asm.getContext().reportError(Fixup.getLoc(), - "unsupported relocation of variable '" + - Symbol->getName() + "'"); - return; - } + FixedValue = Writer->getSymbolAddress(*Symbol, Asm); + return; } else { Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of undefined symbol '" + @@ -548,9 +537,17 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, // Resolve constant variables. 
     if (A->isVariable()) {
-      int64_t Res;
-      if (A->getVariableValue()->evaluateAsAbsolute(
-              Res, Asm, Writer->getSectionAddressMap())) {
+      MCValue Val;
+      bool Relocatable =
+          A->getVariableValue()->evaluateAsRelocatable(Val, &Asm);
+      int64_t Res = Val.getConstant();
+      bool isAbs = Val.isAbsolute();
+      if (Relocatable && Val.getAddSym() && Val.getSubSym()) {
+        Res += Writer->getSymbolAddress(*Val.getAddSym(), Asm) -
+               Writer->getSymbolAddress(*Val.getSubSym(), Asm);
+        isAbs = true;
+      }
+      if (isAbs) {
         FixedValue = Res;
         return;
       }

From b90a92687f399df5afe3e1a2493b0d9c6295ac8c Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 6 Apr 2025 13:15:33 -0700
Subject: [PATCH 0791/1029] MCExpr: Remove unused SectionAddrMap workaround

Mach-O's ARM and X86 writers use MCExpr's `SectionAddrMap *Addrs`
argument to compute label differences, which was a bit of a hack.
The hack has been cleaned up by commit
1b7759de8e6979dda2d949b1ba1c742922e5c366.

---
 llvm/include/llvm/MC/MCExpr.h             | 13 ++----
 llvm/include/llvm/MC/MCMachObjectWriter.h |  3 +-
 llvm/lib/MC/MCExpr.cpp                    | 51 ++++++++---------
 3 files changed, 22 insertions(+), 45 deletions(-)

diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 8516f45e07fea..3d9113e92485d 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -28,8 +28,6 @@ class raw_ostream;
 class StringRef;
 class MCSymbolRefExpr;
 
-using SectionAddrMap = DenseMap<const MCSection *, uint64_t>;
-
 /// Base class for the full range of assembler expressions which are
 /// needed for parsing.
 class MCExpr {
@@ -54,7 +52,7 @@ class MCExpr {
   SMLoc Loc;
 
   bool evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
-                          const SectionAddrMap *Addrs, bool InSet) const;
+                          bool InSet) const;
 
 protected:
   explicit MCExpr(ExprKind Kind, SMLoc Loc, unsigned SubclassData = 0)
@@ -64,7 +62,7 @@ class MCExpr {
   }
 
   bool evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm,
-                                 const SectionAddrMap *Addrs, bool InSet) const;
+                                 bool InSet) const;
 
   unsigned getSubclassData() const { return SubclassData; }
 
@@ -98,8 +96,6 @@ class MCExpr {
   ///
   /// \param Res - The absolute value, if evaluation succeeds.
   /// \return - True on success.
-  bool evaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm,
-                          const SectionAddrMap &Addrs) const;
   bool evaluateAsAbsolute(int64_t &Res) const;
   bool evaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const;
   bool evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm) const;
@@ -132,9 +128,8 @@ class MCExpr {
 
   /// @}
 
-  static bool evaluateSymbolicAdd(const MCAssembler *, const SectionAddrMap *,
-                                  bool, const MCValue &, const MCValue &,
-                                  MCValue &);
+  static bool evaluateSymbolicAdd(const MCAssembler *, bool, const MCValue &,
+                                  const MCValue &, MCValue &);
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, const MCExpr &E) {
diff --git a/llvm/include/llvm/MC/MCMachObjectWriter.h b/llvm/include/llvm/MC/MCMachObjectWriter.h
index 77652c7ff53a3..77f305dad27f8 100644
--- a/llvm/include/llvm/MC/MCMachObjectWriter.h
+++ b/llvm/include/llvm/MC/MCMachObjectWriter.h
@@ -141,7 +141,7 @@ class MachObjectWriter final : public MCObjectWriter {
 
   std::vector<DataRegionData> DataRegions;
 
-  SectionAddrMap SectionAddress;
+  DenseMap<const MCSection *, uint64_t> SectionAddress;
 
   // List of sections in layout order. Virtual sections are after non-virtual
   // sections.
@@ -203,7 +203,6 @@ class MachObjectWriter final : public MCObjectWriter { const llvm::SmallVectorImpl &getSectionOrder() const { return SectionOrder; } - SectionAddrMap &getSectionAddressMap() { return SectionAddress; } MCLOHContainer &getLOHContainer() { return LOHContainer; } uint64_t getSectionAddress(const MCSection *Sec) const { diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 5293fa58c0381..9fbe395e294b8 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -254,30 +254,23 @@ void MCTargetExpr::anchor() {} /* *** */ bool MCExpr::evaluateAsAbsolute(int64_t &Res) const { - return evaluateAsAbsolute(Res, nullptr, nullptr, false); -} - -bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm, - const SectionAddrMap &Addrs) const { - // Setting InSet causes us to absolutize differences across sections and that - // is what the MachO writer uses Addrs for. - return evaluateAsAbsolute(Res, &Asm, &Addrs, true); + return evaluateAsAbsolute(Res, nullptr, false); } bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const { - return evaluateAsAbsolute(Res, &Asm, nullptr, false); + return evaluateAsAbsolute(Res, &Asm, false); } bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm) const { - return evaluateAsAbsolute(Res, Asm, nullptr, false); + return evaluateAsAbsolute(Res, Asm, false); } bool MCExpr::evaluateKnownAbsolute(int64_t &Res, const MCAssembler &Asm) const { - return evaluateAsAbsolute(Res, &Asm, nullptr, true); + return evaluateAsAbsolute(Res, &Asm, true); } bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, - const SectionAddrMap *Addrs, bool InSet) const { + bool InSet) const { MCValue Value; // Fast path constants. @@ -286,7 +279,7 @@ bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, return true; } - bool IsRelocatable = evaluateAsRelocatableImpl(Value, Asm, Addrs, InSet); + bool IsRelocatable = evaluateAsRelocatableImpl(Value, Asm, InSet); Res = Value.getConstant(); // Value with RefKind (e.g. %hi(0xdeadbeef) in MIPS) is not considered // absolute (the value is unknown at parse time), even if it might be resolved @@ -296,7 +289,6 @@ bool MCExpr::evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm, /// Helper method for \see EvaluateSymbolAdd(). static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, - const SectionAddrMap *Addrs, bool InSet, const MCSymbol *&A, const MCSymbol *&B, int64_t &Addend) { @@ -324,7 +316,7 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, const MCFragment *FB = SB.getFragment(); const MCSection &SecA = *FA->getParent(); const MCSection &SecB = *FB->getParent(); - if ((&SecA != &SecB) && !Addrs) + if (&SecA != &SecB) return; // When layout is available, we can generally compute the difference using the @@ -345,9 +337,6 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // Eagerly evaluate when layout is finalized. Addend += Asm->getSymbolOffset(SA) - Asm->getSymbolOffset(SB); - if (Addrs && (&SecA != &SecB)) - Addend += (Addrs->lookup(&SecA) - Addrs->lookup(&SecB)); - FinalizeFolding(); } else { // When layout is not finalized, our ability to resolve differences between @@ -434,8 +423,7 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm, // NOTE: This function can be used before layout is done (see the object // streamer for example) and having the Asm argument lets us avoid relaxations // early. 
-bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, - const SectionAddrMap *Addrs, bool InSet, +bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, bool InSet, const MCValue &LHS, const MCValue &RHS, MCValue &Res) { const MCSymbol *LHS_A = LHS.getAddSym(); @@ -456,12 +444,10 @@ bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, // Result = (LHS_A - LHS_B + LHS_Cst) + (RHS_A - RHS_B + RHS_Cst). // might bring more opportunities. if (LHS_A && RHS_B) { - attemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, LHS_A, RHS_B, - Result_Cst); + attemptToFoldSymbolOffsetDifference(Asm, InSet, LHS_A, RHS_B, Result_Cst); } if (RHS_A && LHS_B) { - attemptToFoldSymbolOffsetDifference(Asm, Addrs, InSet, RHS_A, LHS_B, - Result_Cst); + attemptToFoldSymbolOffsetDifference(Asm, InSet, RHS_A, LHS_B, Result_Cst); } } @@ -481,10 +467,10 @@ bool MCExpr::evaluateSymbolicAdd(const MCAssembler *Asm, } bool MCExpr::evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const { - return evaluateAsRelocatableImpl(Res, Asm, nullptr, false); + return evaluateAsRelocatableImpl(Res, Asm, false); } bool MCExpr::evaluateAsValue(MCValue &Res, const MCAssembler &Asm) const { - return evaluateAsRelocatableImpl(Res, &Asm, nullptr, true); + return evaluateAsRelocatableImpl(Res, &Asm, true); } static bool canExpand(const MCSymbol &Sym, bool InSet) { if (Sym.isWeakExternal()) @@ -503,7 +489,6 @@ static bool canExpand(const MCSymbol &Sym, bool InSet) { } bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, - const SectionAddrMap *Addrs, bool InSet) const { ++stats::MCExprEvaluate; switch (getKind()) { @@ -523,7 +508,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, if (Sym.isVariable() && (Kind == MCSymbolRefExpr::VK_None || Layout) && canExpand(Sym, InSet)) { bool IsMachO = SRE->hasSubsectionsViaSymbols(); - if (Sym.getVariableValue()->evaluateAsRelocatableImpl(Res, Asm, Addrs, + if (Sym.getVariableValue()->evaluateAsRelocatableImpl(Res, Asm, InSet || IsMachO)) { if (Kind != MCSymbolRefExpr::VK_None) { if (Res.isAbsolute()) { @@ -566,7 +551,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, const MCUnaryExpr *AUE = cast(this); MCValue Value; - if (!AUE->getSubExpr()->evaluateAsRelocatableImpl(Value, Asm, Addrs, InSet)) + if (!AUE->getSubExpr()->evaluateAsRelocatableImpl(Value, Asm, InSet)) return false; switch (AUE->getOpcode()) { case MCUnaryExpr::LNot: @@ -600,10 +585,8 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, const MCBinaryExpr *ABE = cast(this); MCValue LHSValue, RHSValue; - if (!ABE->getLHS()->evaluateAsRelocatableImpl(LHSValue, Asm, Addrs, - InSet) || - !ABE->getRHS()->evaluateAsRelocatableImpl(RHSValue, Asm, Addrs, - InSet)) { + if (!ABE->getLHS()->evaluateAsRelocatableImpl(LHSValue, Asm, InSet) || + !ABE->getRHS()->evaluateAsRelocatableImpl(RHSValue, Asm, InSet)) { // Check if both are Target Expressions, see if we can compare them. 
if (const MCTargetExpr *L = dyn_cast(ABE->getLHS())) { if (const MCTargetExpr *R = dyn_cast(ABE->getRHS())) { @@ -650,7 +633,7 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, return false; if (RHSValue.SymB && RHSValue.Specifier) return false; - return evaluateSymbolicAdd(Asm, Addrs, InSet, LHSValue, RHSValue, Res); + return evaluateSymbolicAdd(Asm, InSet, LHSValue, RHSValue, Res); } } From 12a377ed71ca1116c3f8f848ec8c7a24b116c180 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 6 Apr 2025 21:17:43 +0100 Subject: [PATCH 0792/1029] [LV] Add test for mis-compile when narrowing interleave groups. Add test case showing mis-compile due to unrolling vector-pointer recipes after 6b98134. --- ...arrow-interleave-to-widen-memory-unroll.ll | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index e57a5758265f0..a728e26a8df31 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -66,3 +66,85 @@ loop: exit: ret void } + +define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { +; CHECK-LABEL: define void @test_2xi64_with_wide_load( +; CHECK-SAME: ptr noalias [[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <2 x i64> [[TMP10]], ptr [[TMP8]], align 8 +; CHECK-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP9]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
+; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP13]]
+; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = or disjoint i64 [[TMP13]], 1
+; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP14]]
+; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 100
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i64, ptr %factor, i64 %iv
+  %l.factor = load i64, ptr %arrayidx, align 8
+  %1 = shl nsw i64 %iv, 1
+  %data.0 = getelementptr inbounds i64, ptr %data, i64 %1
+  %l.0 = load i64, ptr %data.0, align 8
+  %mul.0 = mul i64 %l.factor, %l.0
+  store i64 %mul.0, ptr %data.0, align 8
+  %3 = or disjoint i64 %1, 1
+  %data.1 = getelementptr inbounds i64, ptr %data, i64 %3
+  %l.1 = load i64, ptr %data.1, align 8
+  %mul.1 = mul i64 %l.factor, %l.1
+  store i64 %mul.1, ptr %data.1, align 8
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 100
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}

From 768ccf69f3febe962e0d63dc87fbee31e59547a7 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 6 Apr 2025 14:11:01 -0700
Subject: [PATCH 0793/1029] MCSymbolRefExpr: Remove HasSubsectionsViaSymbolsBit

This information is only needed at assembly time and we can get it with
Asm->getContext().getAsmInfo()->hasSubsectionsViaSymbols().
---
 llvm/include/llvm/MC/MCExpr.h | 26 ++------------------------
 llvm/lib/MC/MCExpr.cpp        |  7 +++----
 2 files changed, 5 insertions(+), 28 deletions(-)

diff --git a/llvm/include/llvm/MC/MCExpr.h b/llvm/include/llvm/MC/MCExpr.h
index 3d9113e92485d..f6377aa492c58 100644
--- a/llvm/include/llvm/MC/MCExpr.h
+++ b/llvm/include/llvm/MC/MCExpr.h
@@ -217,20 +217,6 @@ class MCSymbolRefExpr : public MCExpr {
   /// The symbol being referenced.
   const MCSymbol *Symbol;

-  // Subclass data stores VariantKind in bits 0..15 and HasSubsectionsViaSymbols
-  // in bit 16.
-  static const unsigned VariantKindBits = 16;
-  static const unsigned VariantKindMask = (1 << VariantKindBits) - 1;
-
-  // FIXME: Remove this bit.
- static const unsigned HasSubsectionsViaSymbolsBit = 1 << VariantKindBits; - - static unsigned encodeSubclassData(VariantKind Kind, - bool HasSubsectionsViaSymbols) { - return (unsigned)Kind | - (HasSubsectionsViaSymbols ? HasSubsectionsViaSymbolsBit : 0); - } - explicit MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind, const MCAsmInfo *MAI, SMLoc Loc = SMLoc()); @@ -259,16 +245,8 @@ class MCSymbolRefExpr : public MCExpr { // Some targets encode the relocation specifier within SymA using // MCSymbolRefExpr::SubclassData, which is copied to MCValue::Specifier, // though this method is now deprecated. - VariantKind getKind() const { - return (VariantKind)(getSubclassData() & VariantKindMask); - } - uint16_t getSpecifier() const { - return (getSubclassData() & VariantKindMask); - } - - bool hasSubsectionsViaSymbols() const { - return (getSubclassData() & HasSubsectionsViaSymbolsBit) != 0; - } + VariantKind getKind() const { return VariantKind(getSubclassData()); } + uint16_t getSpecifier() const { return getSubclassData(); } /// @} diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp index 9fbe395e294b8..4c159feea48f8 100644 --- a/llvm/lib/MC/MCExpr.cpp +++ b/llvm/lib/MC/MCExpr.cpp @@ -235,9 +235,7 @@ const MCConstantExpr *MCConstantExpr::create(int64_t Value, MCContext &Ctx, MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind, const MCAsmInfo *MAI, SMLoc Loc) - : MCExpr(MCExpr::SymbolRef, Loc, - encodeSubclassData(Kind, MAI->hasSubsectionsViaSymbols())), - Symbol(Symbol) { + : MCExpr(MCExpr::SymbolRef, Loc, Kind), Symbol(Symbol) { assert(Symbol); } @@ -507,7 +505,8 @@ bool MCExpr::evaluateAsRelocatableImpl(MCValue &Res, const MCAssembler *Asm, // Evaluate recursively if this is a variable. if (Sym.isVariable() && (Kind == MCSymbolRefExpr::VK_None || Layout) && canExpand(Sym, InSet)) { - bool IsMachO = SRE->hasSubsectionsViaSymbols(); + bool IsMachO = + Asm && Asm->getContext().getAsmInfo()->hasSubsectionsViaSymbols(); if (Sym.getVariableValue()->evaluateAsRelocatableImpl(Res, Asm, InSet || IsMachO)) { if (Kind != MCSymbolRefExpr::VK_None) { From 464286ba633b30bf0bfa9971ec0d61778ccc7561 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 6 Apr 2025 22:14:14 +0100 Subject: [PATCH 0794/1029] [VPlan] Don't narrow interleave groups if there are vector pointers. Do not narrow interleave groups if there are VectorPointer recipes and the plan was unrolled. The recipe implicitly uses VF from VPTransformState. --- .../Transforms/Vectorize/VPlanTransforms.cpp | 8 +++++ ...arrow-interleave-to-widen-memory-unroll.ll | 30 +++++++++++-------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9815dfd31374b..818ed9f5dee23 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2590,6 +2590,14 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF, if (R.mayWriteToMemory() && !InterleaveR) return; + // Do not narrow interleave groups if there are VectorPointer recipes and + // the plan was unrolled. The recipe implicitly uses VF from + // VPTransformState. + // TODO: Remove restriction once the VF for the VectorPointer offset is + // modeled explicitly as operand. + if (isa(&R) && Plan.getUF() > 1) + return; + // All other ops are allowed, but we reject uses that cannot be converted // when checking all allowed consumers (store interleave groups) below. 
if (!InterleaveR) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index a728e26a8df31..3a7b448366bda 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -76,27 +76,33 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = load <2 x i64>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[WIDE_VEC]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <4 x i64> [[WIDE_VEC3]], <4 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <2 x i64> [[TMP10]], ptr [[TMP8]], align 8 -; CHECK-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT]], [[STRIDED_VEC2]] +; CHECK-NEXT: [[TMP16:%.*]] = mul <2 x i64> [[BROADCAST_SPLAT3]], [[STRIDED_VEC5]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP10]], <2 x i64> [[TMP15]], <4 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP17]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: 
store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP16]], <4 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC6:%.*]] = shufflevector <4 x i64> [[TMP18]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: store <4 x i64> [[INTERLEAVED_VEC6]], ptr [[TMP9]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: From 146ad71bc71a9cbecccea307bbd157ec910ae82a Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Mon, 7 Apr 2025 08:30:33 +0900 Subject: [PATCH 0795/1029] [IR] Deprecate PointerType::get/getUnqual pointee type overload (#134517) Deprecates the methods and schedules them for removal in the future as the overloads taking LLVMContext are preferred, as the pointee type has no meaning in opaque pointers. From what my clangd can tell, there are no usages left in the monorepo Part of #123569 --- llvm/include/llvm/IR/DerivedTypes.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 60606d34c32c3..ba5c41ff033f5 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -685,6 +685,8 @@ class PointerType : public Type { /// This constructs a pointer to an object of the specified type in a numbered /// address space. + [[deprecated("PointerType::get with pointee type is pending removal. Use " + "Context overload.")]] static PointerType *get(Type *ElementType, unsigned AddressSpace); /// This constructs an opaque pointer to an object in a numbered address /// space. @@ -692,6 +694,8 @@ class PointerType : public Type { /// This constructs a pointer to an object of the specified type in the /// default address space (address space zero). + [[deprecated("PointerType::getUnqual with pointee type is pending removal. " + "Use Context overload.")]] static PointerType *getUnqual(Type *ElementType) { return PointerType::get(ElementType, 0); } From f2987f255ae99cc43cd0da61ac42150af71b0ad6 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Mon, 7 Apr 2025 08:31:02 +0800 Subject: [PATCH 0796/1029] [X86][AVX10] Make warning message more informative, NFCI (#134528) --- clang/lib/Driver/ToolChains/Arch/X86.cpp | 8 ++++++-- clang/test/Driver/x86-target-features.c | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index 429b041c9c513..e6ac3a9e4b350 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -252,7 +252,9 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, D.Diag(diag::warn_drv_deprecated_arg) << Name << 1 << Name.drop_back(4); else if (Width == "256") D.Diag(diag::warn_drv_deprecated_custom) - << Name << "because AVX10/256 is not supported and will be removed"; + << Name + << "no alternative argument provided because " + "AVX10/256 is not supported and will be removed"; else assert((Width == "256" || Width == "512") && "Invalid vector length."); Features.push_back(Args.MakeArgString((IsNegative ? 
"-" : "+") + Name)); @@ -286,7 +288,9 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, if (A->getOption().matches(options::OPT_mevex512) || A->getOption().matches(options::OPT_mno_evex512)) D.Diag(diag::warn_drv_deprecated_custom) - << Name << "because AVX10/256 is not supported and will be removed"; + << Name + << "no alternative argument provided because " + "AVX10/256 is not supported and will be removed"; if (A->getOption().matches(options::OPT_mapx_features_EQ) || A->getOption().matches(options::OPT_mno_apx_features_EQ)) { diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 6416a34898e78..9c18098c87026 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -411,8 +411,8 @@ // RUN: %clang --target=i386 -mavx10.2-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_512,WARN-AVX10-512 %s // RUN: %clang --target=i386 -mavx10.2-256 -mavx10.1-512 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_256,AVX10_1_512 %s // RUN: %clang --target=i386 -mavx10.2-512 -mavx10.1-256 %s -### -o %t.o 2>&1 | FileCheck -check-prefixes=AVX10_2_512,AVX10_1_256 %s -// WARN-EVEX512: warning: argument '{{.*}}evex512' is deprecated, because AVX10/256 is not supported and will be removed [-Wdeprecated] -// WARN-AVX10-256: warning: argument 'avx10.{{.*}}-256' is deprecated, because AVX10/256 is not supported and will be removed [-Wdeprecated] +// WARN-EVEX512: warning: argument '{{.*}}evex512' is deprecated, no alternative argument provided because AVX10/256 is not supported and will be removed [-Wdeprecated] +// WARN-AVX10-256: warning: argument 'avx10.{{.*}}-256' is deprecated, no alternative argument provided because AVX10/256 is not supported and will be removed [-Wdeprecated] // WARN-AVX10-512: warning: argument 'avx10.{{.*}}-512' is deprecated, use 'avx10.{{.*}}' instead [-Wdeprecated] // EVEX512: "-target-feature" "+evex512" // NO-EVEX512: "-target-feature" "-evex512" From aef000dd43b8ee2397e71f877217a9ecbd23c608 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Sun, 6 Apr 2025 21:57:20 -0300 Subject: [PATCH 0797/1029] [clang] fix serialization of SubstNonTypeTemplateParmExpr (#134560) This fixes a couple of mistakes introduced when merging https://github.com/llvm/llvm-project/pull/132748 Fixes msan failure reported here: https://github.com/llvm/llvm-project/pull/132748#issuecomment-2781105225 --- clang/lib/AST/ASTContext.cpp | 2 +- clang/lib/Serialization/ASTReaderStmt.cpp | 1 + clang/lib/Serialization/ASTWriterStmt.cpp | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 1b6b3d06ddc1e..320fd4e2f3077 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -7011,7 +7011,7 @@ TemplateName ASTContext::getCanonicalTemplateName(TemplateName Name, getCanonicalTemplateArgument(subst->getArgumentPack()); return getSubstTemplateTemplateParmPack( canonArgPack, subst->getAssociatedDecl()->getCanonicalDecl(), - subst->getFinal(), subst->getIndex()); + subst->getIndex(), subst->getFinal()); } case TemplateName::DeducedTemplate: { assert(IgnoreDeduced == false); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index d26152f3780ed..22fe54b526433 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2229,6 +2229,7 @@ void ASTStmtReader::VisitSubstNonTypeTemplateParmExpr( 
E->PackIndex = Record.readInt(); else E->PackIndex = 0; + E->Final = CurrentUnpackingBits->getNextBit(); E->SubstNonTypeTemplateParmExprBits.NameLoc = readSourceLocation(); E->Replacement = Record.readSubExpr(); } diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 23bb5ff22efaf..d0a0f843c7542 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2229,6 +2229,7 @@ void ASTStmtWriter::VisitSubstNonTypeTemplateParmExpr( CurrentPackingBits.addBit((bool)E->getPackIndex()); if (auto PackIndex = E->getPackIndex()) Record.push_back(*PackIndex + 1); + CurrentPackingBits.addBit(E->getFinal()); Record.AddSourceLocation(E->getNameLoc()); Record.AddStmt(E->getReplacement()); From c9497a22ef9f9802401cf0cf3352dcbd4f14bcbd Mon Sep 17 00:00:00 2001 From: Alan Date: Sun, 6 Apr 2025 21:26:44 -0400 Subject: [PATCH 0798/1029] [OCaml] Fix test with invalid usage of #dbg_declare (#134508) Even though #dbg_declare can only describe pointers, one of the OCaml tests tried to add a #dbg_declare to an i32 argument. The change introduced in ecd4c08 caught this incorrect usage. --- llvm/test/Bindings/OCaml/debuginfo.ml | 29 +++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/llvm/test/Bindings/OCaml/debuginfo.ml b/llvm/test/Bindings/OCaml/debuginfo.ml index f95800dfcb025..6ebc7c35879a4 100644 --- a/llvm/test/Bindings/OCaml/debuginfo.ml +++ b/llvm/test/Bindings/OCaml/debuginfo.ml @@ -112,7 +112,18 @@ let test_get_function m dibuilder file_di m_di = stdout_metadata int_ty_di; (* CHECK: [[INT32_PTR:<0x[0-9a-f]*>]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) *) - let param_types = [| null_metadata; int_ty_di |] in + let int_ptr_ty_di = + Llvm_debuginfo.dibuild_create_pointer_type dibuilder + ~pointee_ty:int_ty_di + ~size_in_bits:32 + ~align_in_bits:32 + ~address_space:0 + ~name:"ptrint" + in + stdout_metadata int_ptr_ty_di; + (* CHECK: [[PTRINT32_PTR:<0x[0-9a-f]*>]] = !DIDerivedType(tag: DW_TAG_pointer_type, name: "ptrint", baseType: [[INT32_PTR]], size: 32, align: 32, dwarfAddressSpace: 0) + *) + let param_types = [| null_metadata; int_ty_di; int_ptr_ty_di |] in let fty_di = Llvm_debuginfo.dibuild_create_subroutine_type dibuilder ~file:file_di ~param_types flags_zero @@ -126,7 +137,7 @@ let test_get_function m dibuilder file_di m_di = Llvm_debuginfo.dibuild_get_or_create_type_array dibuilder ~data:param_types in stdout_metadata fty_di_args; - (* CHECK: [[FARGS_PTR:<0x[0-9a-f]*>]] = !{null, [[INT32_PTR]]} + (* CHECK: [[FARGS_PTR:<0x[0-9a-f]*>]] = !{null, [[INT32_PTR]], [[PTRINT32_PTR]]} *) stdout_metadata fty_di; (* CHECK: [[SBRTNTY_PTR:<0x[0-9a-f]*>]] = !DISubroutineType(types: [[FARGS_PTR]]) @@ -134,7 +145,8 @@ let test_get_function m dibuilder file_di m_di = (* Let's create the LLVM-IR function now. 
*) let name = "tfun" in let fty = - Llvm.function_type (Llvm.void_type context) [| Llvm.i32_type context |] + Llvm.function_type (Llvm.void_type context) + [| Llvm.i32_type context; Llvm.pointer_type context |] in let f = Llvm.define_function name fty m in let f_di = @@ -160,11 +172,12 @@ let test_bbinstr fty f f_di file_di dibuilder = group "basic_block and instructions tests"; (* Create this pattern: * if (arg0 != 0) { - * foo(arg0); + * foo(arg0, arg1); * } * return; *) let arg0 = (Llvm.params f).(0) in + let arg1 = (Llvm.params f).(1) in let builder = Llvm.builder_at_end context (Llvm.entry_block f) in let zero = Llvm.const_int (Llvm.i32_type context) 0 in let cmpi = Llvm.build_icmp Llvm.Icmp.Ne zero arg0 "cmpi" builder in @@ -185,7 +198,7 @@ let test_bbinstr fty f f_di file_di dibuilder = | Some file_of_f_di', Some file_of_scope' -> file_of_f_di' = file_di && file_of_scope' = file_di | _ -> false ); - let foocall = Llvm.build_call fty foodecl [| arg0 |] "" builder in + let foocall = Llvm.build_call fty foodecl [| arg0; arg1 |] "" builder in let foocall_loc = Llvm_debuginfo.dibuild_create_debug_location context ~line:10 ~column:12 ~scope @@ -290,17 +303,17 @@ let test_variables f dibuilder file_di fun_di = let () = Printf.printf "%s\n" (Llvm.string_of_lldbgrecord vdi) in (* CHECK: dbg_declare(ptr %my_alloca, ![[#]], !DIExpression(), ![[#]]) *) - let arg0 = (Llvm.params f).(0) in + let arg1 = (Llvm.params f).(1) in let arg_var = Llvm_debuginfo.dibuild_create_parameter_variable dibuilder ~scope:fun_di ~name:"my_arg" ~argno:1 ~file:file_di ~line:10 ~ty ~always_preserve:false flags_zero in - let argdi = Llvm_debuginfo.dibuild_insert_declare_before dibuilder ~storage:arg0 + let argdi = Llvm_debuginfo.dibuild_insert_declare_before dibuilder ~storage:arg1 ~var_info:arg_var ~expr:(Llvm_debuginfo.dibuild_expression dibuilder [||]) ~location ~instr:entry_term in let () = Printf.printf "%s\n" (Llvm.string_of_lldbgrecord argdi) in - (* CHECK: dbg_declare(i32 %0, ![[#]], !DIExpression(), ![[#]]) + (* CHECK: dbg_declare(ptr %1, ![[#]], !DIExpression(), ![[#]]) *) () From da6e2454fff3fbc86861e31b60f18d3467354375 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Mon, 7 Apr 2025 04:57:55 +0300 Subject: [PATCH 0799/1029] [clang-tidy] Improve `bugprone-capturing-this-in-member-variable` check: add support of `bind` functions. (#132635) Improve `bugprone-capturing-this-in-member-variable` check: Added support of `bind`-like functions that capture and store `this` pointer in class member. Closes https://github.com/llvm/llvm-project/issues/131220. 
--- .../CapturingThisInMemberVariableCheck.cpp | 56 +++++++++++++----- .../CapturingThisInMemberVariableCheck.h | 1 + clang-tools-extra/docs/ReleaseNotes.rst | 5 +- .../capturing-this-in-member-variable.rst | 7 +++ .../capturing-this-in-member-variable.cpp | 58 ++++++++++++++++++- 5 files changed, 109 insertions(+), 18 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp index add0576a42c33..1bfe384258ddb 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.cpp @@ -64,16 +64,23 @@ AST_MATCHER(CXXRecordDecl, correctHandleCaptureThisLambda) { constexpr const char *DefaultFunctionWrapperTypes = "::std::function;::std::move_only_function;::boost::function"; +constexpr const char *DefaultBindFunctions = + "::std::bind;::boost::bind;::std::bind_front;::std::bind_back;" + "::boost::compat::bind_front;::boost::compat::bind_back"; CapturingThisInMemberVariableCheck::CapturingThisInMemberVariableCheck( StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), FunctionWrapperTypes(utils::options::parseStringList( - Options.get("FunctionWrapperTypes", DefaultFunctionWrapperTypes))) {} + Options.get("FunctionWrapperTypes", DefaultFunctionWrapperTypes))), + BindFunctions(utils::options::parseStringList( + Options.get("BindFunctions", DefaultBindFunctions))) {} void CapturingThisInMemberVariableCheck::storeOptions( ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "FunctionWrapperTypes", utils::options::serializeStringList(FunctionWrapperTypes)); + Options.store(Opts, "BindFunctions", + utils::options::serializeStringList(BindFunctions)); } void CapturingThisInMemberVariableCheck::registerMatchers(MatchFinder *Finder) { @@ -87,33 +94,52 @@ void CapturingThisInMemberVariableCheck::registerMatchers(MatchFinder *Finder) { // [self = this] capturesVar(varDecl(hasInitializer(cxxThisExpr()))))); auto IsLambdaCapturingThis = - lambdaExpr(hasAnyCapture(CaptureThis.bind("capture"))).bind("lambda"); - auto IsInitWithLambda = - anyOf(IsLambdaCapturingThis, - cxxConstructExpr(hasArgument(0, IsLambdaCapturingThis))); + lambdaExpr(hasAnyCapture(CaptureThis)).bind("lambda"); + + auto IsBindCapturingThis = + callExpr( + callee(functionDecl(matchers::matchesAnyListedName(BindFunctions)) + .bind("callee")), + hasAnyArgument(cxxThisExpr())) + .bind("bind"); + + auto IsInitWithLambdaOrBind = + anyOf(IsLambdaCapturingThis, IsBindCapturingThis, + cxxConstructExpr(hasArgument( + 0, anyOf(IsLambdaCapturingThis, IsBindCapturingThis)))); + Finder->addMatcher( cxxRecordDecl( anyOf(has(cxxConstructorDecl( unless(isCopyConstructor()), unless(isMoveConstructor()), hasAnyConstructorInitializer(cxxCtorInitializer( isMemberInitializer(), forField(IsStdFunctionField), - withInitializer(IsInitWithLambda))))), + withInitializer(IsInitWithLambdaOrBind))))), has(fieldDecl(IsStdFunctionField, - hasInClassInitializer(IsInitWithLambda)))), + hasInClassInitializer(IsInitWithLambdaOrBind)))), unless(correctHandleCaptureThisLambda())), this); } - void CapturingThisInMemberVariableCheck::check( const MatchFinder::MatchResult &Result) { - const auto *Capture = Result.Nodes.getNodeAs("capture"); - const auto *Lambda = Result.Nodes.getNodeAs("lambda"); + if (const auto *Lambda = Result.Nodes.getNodeAs("lambda")) { + diag(Lambda->getBeginLoc(), + "'this' captured by a lambda and stored 
in a class member variable; " + "disable implicit class copying/moving to prevent potential " + "use-after-free"); + } else if (const auto *Bind = Result.Nodes.getNodeAs("bind")) { + const auto *Callee = Result.Nodes.getNodeAs("callee"); + assert(Callee); + diag(Bind->getBeginLoc(), + "'this' captured by a '%0' call and stored in a class member " + "variable; disable implicit class copying/moving to prevent potential " + "use-after-free") + << Callee->getQualifiedNameAsString(); + } + const auto *Field = Result.Nodes.getNodeAs("field"); - diag(Lambda->getBeginLoc(), - "'this' captured by a lambda and stored in a class member variable; " - "disable implicit class copying/moving to prevent potential " - "use-after-free") - << Capture->getLocation(); + assert(Field); + diag(Field->getLocation(), "class member of type '%0' that stores captured 'this'", DiagnosticIDs::Note) diff --git a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h index fe0b0aa10f108..934f99cd35797 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h @@ -37,6 +37,7 @@ class CapturingThisInMemberVariableCheck : public ClangTidyCheck { private: ///< store the function wrapper types const std::vector FunctionWrapperTypes; + const std::vector BindFunctions; }; } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6c1f05009df98..fefb085409b44 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -103,8 +103,9 @@ New checks - New :doc:`bugprone-capturing-this-in-member-variable ` check. - Finds lambda captures that capture the ``this`` pointer and store it as class - members without handle the copy and move constructors and the assignments. + Finds lambda captures and ``bind`` function calls that capture the ``this`` + pointer and store it as class members without handle the copy and move + constructors and the assignments. - New :doc:`bugprone-unintended-char-ostream-output ` check. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst index b09d7d5fce959..dfc2ca1bbc7dd 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/capturing-this-in-member-variable.rst @@ -40,3 +40,10 @@ Options A semicolon-separated list of names of types. Used to specify function wrapper that can hold lambda expressions. Default is `::std::function;::std::move_only_function;::boost::function`. + +.. option:: BindFunctions + + A semicolon-separated list of fully qualified names of functions that can + capture ``this`` pointer. + Default is `::std::bind;::boost::bind;::std::bind_front;::std::bind_back; + ::boost::compat::bind_front;::boost::compat::bind_back`. 
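The way to keep such a capture while satisfying the check is the one its
diagnostic suggests: disable implicit copying and moving so the stored
`this` can never be duplicated. A standalone sketch follows, with a
hypothetical class name, modeled on the `DeletedCopyMoveWithBind` case in
the test below:

  #include <functional>

  struct Session {
    Session() : OnTimeout(std::bind(&Session::Close, this)) {}
    // With all four copy/move operations deleted, no second object can
    // ever hold this callback, so the check stays quiet.
    Session(const Session &) = delete;
    Session(Session &&) = delete;
    Session &operator=(const Session &) = delete;
    Session &operator=(Session &&) = delete;
    void Close() {}
    std::function<void()> OnTimeout;
  };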
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/capturing-this-in-member-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/capturing-this-in-member-variable.cpp index f5ebebfe4b058..4c90a8aa8944a 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/capturing-this-in-member-variable.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/capturing-this-in-member-variable.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-capturing-this-in-member-variable %t -- -config="{CheckOptions: {bugprone-capturing-this-in-member-variable.FunctionWrapperTypes: '::std::function;::Fn'}}" -- +// RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-capturing-this-in-member-variable %t -- -config="{CheckOptions: {bugprone-capturing-this-in-member-variable.FunctionWrapperTypes: '::std::function;::Fn', bugprone-capturing-this-in-member-variable.BindFunctions: '::std::bind;::Bind'}}" -- namespace std { @@ -12,12 +12,22 @@ class function { template function(F &&); }; +template +function bind(F&&, Args&&...) { + return {}; +} + } // namespace std struct Fn { template Fn(F &&); }; +template +std::function Bind(F&&, Args&&...) { + return {}; +} + struct BasicConstructor { BasicConstructor() : Captured([this]() { static_cast(this); }) {} // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: 'this' captured by a lambda and stored in a class member variable; @@ -208,3 +218,49 @@ struct CustomFunctionWrapper { Fn Captured; // CHECK-MESSAGES: :[[@LINE-1]]:6: note: class member of type 'Fn' that stores captured 'this' }; + +struct BindConstructor { + BindConstructor() : Captured(std::bind(&BindConstructor::method, this)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:32: warning: 'this' captured by a 'std::bind' call and stored in a class member variable; + void method() {} + std::function Captured; + // CHECK-MESSAGES: :[[@LINE-1]]:25: note: class member of type 'std::function' that stores captured 'this' +}; + +struct BindField1 { + void method() {} + std::function Captured = std::bind(&BindField1::method, this); + // CHECK-MESSAGES: :[[@LINE-1]]:36: warning: 'this' captured by a 'std::bind' call and stored in a class member variable; + // CHECK-MESSAGES: :[[@LINE-2]]:25: note: class member of type 'std::function' that stores captured 'this' +}; + +struct BindField2 { + void method() {} + std::function Captured{std::bind(&BindField2::method, this)}; + // CHECK-MESSAGES: :[[@LINE-1]]:34: warning: 'this' captured by a 'std::bind' call and stored in a class member variable; + // CHECK-MESSAGES: :[[@LINE-2]]:25: note: class member of type 'std::function' that stores captured 'this' +}; + +struct BindCustom { + BindCustom() : Captured(Bind(&BindCustom::method, this)) {} + // CHECK-MESSAGES: :[[@LINE-1]]:27: warning: 'this' captured by a 'Bind' call and stored in a class member variable; + void method() {} + std::function Captured; + // CHECK-MESSAGES: :[[@LINE-1]]:25: note: class member of type 'std::function' that stores captured 'this' +}; + +struct BindNotCapturingThis { + void method(int) {} + BindNotCapturingThis(int V) : Captured(std::bind(&BindNotCapturingThis::method, V)) {} + std::function Captured; +}; + +struct DeletedCopyMoveWithBind { + DeletedCopyMoveWithBind() : Captured(std::bind(&DeletedCopyMoveWithBind::method, this)) {} + DeletedCopyMoveWithBind(DeletedCopyMoveWithBind const&) = delete; + DeletedCopyMoveWithBind(DeletedCopyMoveWithBind &&) = delete; + DeletedCopyMoveWithBind& operator=(DeletedCopyMoveWithBind const&) = delete; + 
DeletedCopyMoveWithBind& operator=(DeletedCopyMoveWithBind &&) = delete; + void method() {} + std::function Captured; +}; From 8f0d8d28ccd8a1ced82a744679c5152f90e80c77 Mon Sep 17 00:00:00 2001 From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com> Date: Sun, 6 Apr 2025 19:31:09 -0700 Subject: [PATCH 0800/1029] Delete duplicated hlfir.declare op of induction variables of do concurrent when inside cuf kernel directive. (#134467) Delete duplicated creation of hlfir.declare op of do concurrent induction variables when inside cuf kernel directive. Obtain the correct hlfir.declare op generated from bindSymbol, and add it to ivValues. --- flang/lib/Lower/Bridge.cpp | 8 +++----- flang/test/Lower/CUDA/cuda-doconc.cuf | 11 +++++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 65edf1cea8761..b4d1197822a43 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -3209,13 +3209,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->restoreInsertionPoint(insPt); } - // Create the hlfir.declare operation using the symbol's name - auto declareOp = builder->create( - loc, ivValue, toStringRef(name.symbol->name())); - ivValue = declareOp.getResult(0); - // Bind the symbol to the declared variable bindSymbol(*name.symbol, ivValue); + Fortran::lower::SymbolBox hsb = localSymbols.lookupSymbol(*name.symbol); + fir::ExtendedValue extIvValue = symBoxToExtendedValue(hsb); + ivValue = fir::getBase(extIvValue); ivValues.push_back(ivValue); ivTypes.push_back(idxTy); ivLocs.push_back(loc); diff --git a/flang/test/Lower/CUDA/cuda-doconc.cuf b/flang/test/Lower/CUDA/cuda-doconc.cuf index e240b1adc206a..ebdc0de91cf38 100644 --- a/flang/test/Lower/CUDA/cuda-doconc.cuf +++ b/flang/test/Lower/CUDA/cuda-doconc.cuf @@ -15,8 +15,9 @@ subroutine doconc1 end ! CHECK: func.func @_QPdoconc1() { -! CHECK: %[[DECL:.*]]:2 = hlfir.declare %{{.*}}#0 {uniq_name = "_QFdoconc1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: cuf.kernel<<<*, *>>> +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFdoconc1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: cuf.kernel<<<*, *>>> (%arg0 : index) +! CHECK: fir.store %arg0 to %[[DECL]]#0 : !fir.ref ! CHECK: %{{.*}} = fir.load %[[DECL]]#0 : !fir.ref subroutine doconc2 @@ -32,8 +33,10 @@ subroutine doconc2 end ! CHECK: func.func @_QPdoconc2() { -! CHECK: %[[DECLI:.*]]:2 = hlfir.declare %{{.*}}#0 {uniq_name = "_QFdoconc2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: %[[DECLJ:.*]]:2 = hlfir.declare %{{.*}}#0 {uniq_name = "_QFdoconc2Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLI:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFdoconc2Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[DECLJ:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFdoconc2Ej"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: cuf.kernel<<<*, *>>> (%arg0 : index, %arg1 : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index) { +! CHECK: fir.store %arg0 to %[[DECLI]]#0 : !fir.ref +! CHECK: fir.store %arg1 to %[[DECLJ]]#0 : !fir.ref ! CHECK: %{{.*}} = fir.load %[[DECLI]]#0 : !fir.ref ! 
CHECK: %{{.*}} = fir.load %[[DECLJ]]#0 : !fir.ref From 6ce0fd7f74502a75120bef43f12f56e3a5d80dfd Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Sun, 6 Apr 2025 23:42:27 -0300 Subject: [PATCH 0801/1029] [clang] NFC: clean trailing whitespaces in clang/test/CXX/drs/cwg15xx.cpp --- clang/test/CXX/drs/cwg15xx.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/clang/test/CXX/drs/cwg15xx.cpp b/clang/test/CXX/drs/cwg15xx.cpp index 30ec63999ca28..8e3e49dc42ce0 100644 --- a/clang/test/CXX/drs/cwg15xx.cpp +++ b/clang/test/CXX/drs/cwg15xx.cpp @@ -38,7 +38,7 @@ namespace cwg1512 { // cwg1512: 4 template void composite_pointer_type_is_ord() { composite_pointer_type_is_base(); - typedef __typeof(val() < val()) cmp; // #cwg1512-lt + typedef __typeof(val() < val()) cmp; // #cwg1512-lt // since-cxx17-warning@#cwg1512-lt {{ordered comparison of function pointers ('int (*)() noexcept' and 'int (*)()')}} // since-cxx17-note@#cwg1512-noexcept-1st {{in instantiation of function template specialization 'cwg1512::composite_pointer_type_is_ord' requested here}} // since-cxx17-warning@#cwg1512-lt {{ordered comparison of function pointers ('int (*)()' and 'int (*)() noexcept')}} @@ -332,7 +332,7 @@ namespace cwg1550 { // cwg1550: 3.4 namespace cwg1558 { // cwg1558: 12 #if __cplusplus >= 201103L template using first_of = T; - template first_of f(int); // #cwg1558-f + template first_of f(int); // #cwg1558-f template void f(...) = delete; // #cwg1558-f-deleted struct X { typedef void type; }; @@ -639,7 +639,7 @@ namespace cwg1591 { //cwg1591. Deducing array bound and element type from initi #if __cplusplus >= 201103L template int h(T const(&)[N]); int X = h({1,2,3}); // T deduced to int, N deduced to 3 - + template int j(T const(&)[3]); int Y = j({42}); // T deduced to int, array bound not considered @@ -655,12 +655,12 @@ namespace cwg1591 { //cwg1591. Deducing array bound and element type from initi template int n(T const(&)[N], T); int X1 = n({{1},{2},{3}},Aggr()); // OK, T is Aggr, N is 3 - - + + namespace check_multi_dim_arrays { template int ***f(const T (&a)[N][M][O]); // #cwg1591-f-3 template int **f(const T (&a)[N][M]); // #cwg1591-f-2 - + template int *f(const T (&a)[N]); // #cwg1591-f-1 int ***p3 = f({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12} } }); int ***p33 = f({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12, 13} } }); @@ -675,7 +675,7 @@ namespace cwg1591 { //cwg1591. Deducing array bound and element type from initi namespace check_multi_dim_arrays_rref { template int ***g(T (&&a)[N][M][O]); // #cwg1591-g-3 template int **g(T (&&a)[N][M]); // #cwg1591-g-2 - + template int *g(T (&&a)[N]); // #cwg1591-g-1 int ***p3 = g({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12} } }); int ***p33 = g({ { {1,2}, {3, 4} }, { {5,6}, {7, 8} }, { {9,10}, {11, 12, 13} } }); @@ -687,7 +687,7 @@ namespace cwg1591 { //cwg1591. Deducing array bound and element type from initi int **p22 = g({ {1,2}, {3, 4} }); int *p1 = g({1, 2, 3}); } - + namespace check_arrays_of_init_list { template float *h(const std::initializer_list (&)[N]); template double *h(const T(&)[N]); @@ -695,7 +695,7 @@ namespace cwg1591 { //cwg1591. Deducing array bound and element type from initi float *fp = h({{1}, {1, 2}, {1, 2, 3}}); } namespace core_reflector_28543 { - + template int *i(T (&&)[N]); // #1 template char *i(std::initializer_list &&); // #2 template int **i(T (&&)[N][M]); // #3 #cwg1591-i-2 @@ -704,13 +704,13 @@ namespace cwg1591 { //cwg1591. 
Deducing array bound and element type from initi template short *i(T (&&)[2]); // #5 template using Arr = T[]; - + char *pc = i({1, 2, 3}); // OK prefer #2 via 13.3.3.2 [over.ics.rank] - char *pc2 = i({1, 2}); // #2 also + char *pc2 = i({1, 2}); // #2 also int *pi = i(Arr{1, 2, 3}); // OK prefer #1 void *pv1 = i({ {1, 2, 3}, {4, 5, 6} }); // ambiguous btw 3 & 4 - // since-cxx11-error@-1 {{call to 'i' is ambiguous}} + // since-cxx11-error@-1 {{call to 'i' is ambiguous}} // since-cxx11-note@#cwg1591-i-2 {{candidate function [with T = int, N = 2, M = 3]}} // since-cxx11-note@#cwg1591-i-1 {{candidate function [with T = int, N = 2]}} char **pcc = i({ {1}, {2, 3} }); // OK #4 From 0d68bad78a68874c31cdb337f97a0c4336b1125b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 08:17:34 +0700 Subject: [PATCH 0802/1029] IR: Fix typo in unreachable message --- llvm/lib/IR/Value.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index b5a69b9ecdde4..2c41101ef56ff 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -228,7 +228,7 @@ void Value::dropDroppableUse(Use &U) { return; } - llvm_unreachable("unkown droppable use"); + llvm_unreachable("unknown droppable use"); } bool Value::isUsedInBasicBlock(const BasicBlock *BB) const { From e90d40afaff3124408d79ae8ef96ae021ff4f8cf Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 12:28:59 +0700 Subject: [PATCH 0803/1029] NaryReassociate: Remove redundant run lines These only differed in quoting the passes argument or not. There is further redundancy in some of these tests, but they split the invocation across multiple opt runs --- llvm/test/Transforms/NaryReassociate/nary-add.ll | 1 - llvm/test/Transforms/NaryReassociate/nary-mul.ll | 1 - llvm/test/Transforms/NaryReassociate/nary-req.ll | 1 - llvm/test/Transforms/NaryReassociate/nary-smax.ll | 1 - llvm/test/Transforms/NaryReassociate/nary-smin.ll | 1 - llvm/test/Transforms/NaryReassociate/nary-umax.ll | 1 - llvm/test/Transforms/NaryReassociate/nary-umin.ll | 1 - llvm/test/Transforms/NaryReassociate/pr24301.ll | 1 - 8 files changed, 8 deletions(-) diff --git a/llvm/test/Transforms/NaryReassociate/nary-add.ll b/llvm/test/Transforms/NaryReassociate/nary-add.ll index c3975918427de..e765d05aaca36 100644 --- a/llvm/test/Transforms/NaryReassociate/nary-add.ll +++ b/llvm/test/Transforms/NaryReassociate/nary-add.ll @@ -1,5 +1,4 @@ ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/Transforms/NaryReassociate/nary-mul.ll b/llvm/test/Transforms/NaryReassociate/nary-mul.ll index ed8dee339ddce..fe8f13aff8869 100644 --- a/llvm/test/Transforms/NaryReassociate/nary-mul.ll +++ b/llvm/test/Transforms/NaryReassociate/nary-mul.ll @@ -1,5 +1,4 @@ ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | FileCheck %s target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" diff --git a/llvm/test/Transforms/NaryReassociate/nary-req.ll b/llvm/test/Transforms/NaryReassociate/nary-req.ll index 3efc9b7ccbcff..054afd7d44e00 100644 --- a/llvm/test/Transforms/NaryReassociate/nary-req.ll +++ b/llvm/test/Transforms/NaryReassociate/nary-req.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | 
FileCheck %s declare i32 @llvm.smax.i32(i32 %a, i32 %b) declare i64 @llvm.umin.i64(i64, i64) diff --git a/llvm/test/Transforms/NaryReassociate/nary-smax.ll b/llvm/test/Transforms/NaryReassociate/nary-smax.ll index 425cd4f3a7570..43e93c7d2a08f 100644 --- a/llvm/test/Transforms/NaryReassociate/nary-smax.ll +++ b/llvm/test/Transforms/NaryReassociate/nary-smax.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | FileCheck %s declare i32 @llvm.smax.i32(i32 %a, i32 %b) diff --git a/llvm/test/Transforms/NaryReassociate/nary-smin.ll b/llvm/test/Transforms/NaryReassociate/nary-smin.ll index 0eada42b1ba45..a8e9fe8247c7a 100644 --- a/llvm/test/Transforms/NaryReassociate/nary-smin.ll +++ b/llvm/test/Transforms/NaryReassociate/nary-smin.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | FileCheck %s declare i32 @llvm.smin.i32(i32 %a, i32 %b) diff --git a/llvm/test/Transforms/NaryReassociate/nary-umax.ll b/llvm/test/Transforms/NaryReassociate/nary-umax.ll index 015f12d26f92f..b391fc4ba3ebb 100644 --- a/llvm/test/Transforms/NaryReassociate/nary-umax.ll +++ b/llvm/test/Transforms/NaryReassociate/nary-umax.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | FileCheck %s declare i32 @llvm.umax.i32(i32 %a, i32 %b) diff --git a/llvm/test/Transforms/NaryReassociate/nary-umin.ll b/llvm/test/Transforms/NaryReassociate/nary-umin.ll index 3d738609322e1..58fdcc3f47a5a 100644 --- a/llvm/test/Transforms/NaryReassociate/nary-umin.ll +++ b/llvm/test/Transforms/NaryReassociate/nary-umin.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | FileCheck %s declare i32 @llvm.umin.i32(i32 %a, i32 %b) diff --git a/llvm/test/Transforms/NaryReassociate/pr24301.ll b/llvm/test/Transforms/NaryReassociate/pr24301.ll index 84f88412c946f..8dad4d6121c65 100644 --- a/llvm/test/Transforms/NaryReassociate/pr24301.ll +++ b/llvm/test/Transforms/NaryReassociate/pr24301.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=nary-reassociate -S | FileCheck %s -; RUN: opt < %s -passes='nary-reassociate' -S | FileCheck %s define i32 @foo(i32 %t4) { ; CHECK-LABEL: @foo( From d9ccfd7568337c4e38211eabd87818fb39573004 Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Mon, 7 Apr 2025 08:12:46 +0200 Subject: [PATCH 0804/1029] [mlir][llvm] Respect call noinline attr in inliner (#134493) This commit extends the LLVM dialect inliner interface to respect the call op's noinline attribute. This is a follow-up to https://github.com/llvm/llvm-project/pull/133726 which added the noinline attribute to the LLVM dialect call op. 
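As a usage note: with this change, a transformation can exclude a single
call site from inlining by setting the attribute on the llvm.call op. In
the sketch below, `setNoInline` is assumed to be the tablegen-generated
setter paired with the `getNoInline()` accessor the patch itself uses;
treat it as illustrative rather than a verified API reference:

  #include "mlir/Dialect/LLVMIR/LLVMDialect.h"

  // Mark one llvm.call so LLVMInlinerInterface::isLegalToInline bails out.
  static void keepCallOutOfLine(mlir::LLVM::CallOp call) {
    call.setNoInline(true); // assumed generated setter for `no_inline`
  }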
--- .../lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp | 7 ++++++- mlir/test/Dialect/LLVMIR/inlining.mlir | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp index aab8d037cd8d2..1edf7fd070b27 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp @@ -664,11 +664,16 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { bool isLegalToInline(Operation *call, Operation *callable, bool wouldBeCloned) const final { - if (!isa(call)) { + auto callOp = dyn_cast(call); + if (!callOp) { LLVM_DEBUG(llvm::dbgs() << "Cannot inline: call is not an '" << LLVM::CallOp::getOperationName() << "' op\n"); return false; } + if (callOp.getNoInline()) { + LLVM_DEBUG(llvm::dbgs() << "Cannot inline: call is marked no_inline\n"); + return false; + } auto funcOp = dyn_cast(callable); if (!funcOp) { LLVM_DEBUG(llvm::dbgs() diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index eb249a4771753..136d0f85d509a 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -95,7 +95,7 @@ llvm.func @foo() -> (i32) attributes { no_inline } { llvm.return %0 : i32 } -llvm.func @bar() -> (i32) attributes { no_inline } { +llvm.func @bar() -> (i32) { %0 = llvm.mlir.constant(1 : i32) : i32 llvm.return %0 : i32 } @@ -106,7 +106,7 @@ llvm.func @callee_with_multiple_blocks(%cond: i1) -> (i32) { %0 = llvm.call @foo() : () -> (i32) llvm.br ^bb3(%0: i32) ^bb2: - %1 = llvm.call @bar() : () -> (i32) + %1 = llvm.call @bar() { no_inline } : () -> (i32) llvm.br ^bb3(%1: i32) ^bb3(%arg: i32): llvm.return %arg : i32 From f280d60c9839120618da353ab71004be33c4fa53 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 7 Apr 2025 00:12:43 -0700 Subject: [PATCH 0805/1029] [CSKY] Simplify shouldForceRelocation with MCValue::Specifier --- llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index fb4a6f9b41d01..ea7968f01ee4a 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "csky-asmbackend" @@ -263,17 +264,11 @@ bool CSKYAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, const MCSubtargetInfo * /*STI*/) { - if (Fixup.getKind() >= FirstLiteralRelocationKind) + if (Target.getSpecifier()) return true; switch (Fixup.getTargetKind()) { default: break; - case CSKY::fixup_csky_got32: - case CSKY::fixup_csky_got_imm18_scale4: - case CSKY::fixup_csky_gotoff: - case CSKY::fixup_csky_gotpc: - case CSKY::fixup_csky_plt32: - case CSKY::fixup_csky_plt_imm18_scale4: case CSKY::fixup_csky_doffset_imm18: case CSKY::fixup_csky_doffset_imm18_scale2: case CSKY::fixup_csky_doffset_imm18_scale4: From 87a4215ed154e867683b10c8d7fe1dbc79d81abb Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 7 Apr 2025 15:18:47 +0800 Subject: [PATCH 0806/1029] [Clang] Always verify LLVM IR inputs (#134396) We get a lot of issues that basically boil down to "I 
passed malformed LLVM IR to clang and it crashed". Clang does not perform IR verification by default in (non-assertion-enabled) release builds, and that's sensible for IR that Clang itself produces, which is expected to always be valid. However, if people pass in their own handwritten IR, we should report if it is malformed, instead of crashing. We should also report it in a way that does not produce a crash trace and ask for a bug report, as currently happens in assertions-enabled builds. This aligns the behavior with how opt/llc work. --- clang/include/clang/Basic/DiagnosticFrontendKinds.td | 2 ++ clang/lib/CodeGen/CodeGenAction.cpp | 12 +++++++++++- clang/test/CodeGen/invalid_llvm_ir.ll | 12 ++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGen/invalid_llvm_ir.ll diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index 5f64b1cbfac87..6c72775197823 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -379,6 +379,8 @@ def err_ast_action_on_llvm_ir : Error< "cannot apply AST actions to LLVM IR file '%0'">, DefaultFatal; +def err_invalid_llvm_ir : Error<"invalid LLVM IR input: %0">; + def err_os_unsupport_riscv_fmv : Error< "function multiversioning is currently only supported on Linux">; diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 4321efd49af36..1f5eb427b566f 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -39,6 +39,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" #include "llvm/LTO/LTOBackend.h" #include "llvm/Linker/Linker.h" @@ -1048,8 +1049,17 @@ CodeGenAction::loadModule(MemoryBufferRef MBRef) { // Handle textual IR and bitcode file with one single module. llvm::SMDiagnostic Err; - if (std::unique_ptr M = parseIR(MBRef, Err, *VMContext)) + if (std::unique_ptr M = parseIR(MBRef, Err, *VMContext)) { + // For LLVM IR files, always verify the input and report the error in a way + // that does not ask people to report an issue for it. + std::string VerifierErr; + raw_string_ostream VerifierErrStream(VerifierErr); + if (llvm::verifyModule(*M, &VerifierErrStream)) { + CI.getDiagnostics().Report(diag::err_invalid_llvm_ir) << VerifierErr; + return {}; + } return M; + } // If MBRef is a bitcode with multiple modules (e.g., -fsplit-lto-unit // output), place the extra modules (actually only one, a regular LTO module) diff --git a/clang/test/CodeGen/invalid_llvm_ir.ll b/clang/test/CodeGen/invalid_llvm_ir.ll new file mode 100644 index 0000000000000..97a6802bc105e --- /dev/null +++ b/clang/test/CodeGen/invalid_llvm_ir.ll @@ -0,0 +1,12 @@ +; RUN: not %clang %s 2>&1 | FileCheck %s +; RUN: llvm-as -disable-verify < %s > %t.bc +; RUN: not %clang %t.bc 2>&1 | FileCheck %s + +; CHECK: error: invalid LLVM IR input: PHINode should have one entry for each predecessor of its parent basic block! 
+; CHECK-NEXT: %phi = phi i32 [ 0, %entry ] + +define void @test() { +entry: + %phi = phi i32 [ 0, %entry ] + ret void +} From 31ef7acf12e7f5011a813dcfd08b821ec44865f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Mon, 7 Apr 2025 09:46:03 +0200 Subject: [PATCH 0807/1029] [clang][analyzer] Fix a possible crash in CastSizeChecker (#134387) --- .../Checkers/CastSizeChecker.cpp | 2 ++ clang/test/Analysis/castsize.c | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 clang/test/Analysis/castsize.c diff --git a/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp index 2cff97a591b8c..0b52c9bd8ac2a 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CastSizeChecker.cpp @@ -62,6 +62,8 @@ static bool evenFlexibleArraySize(ASTContext &Ctx, CharUnits RegionSize, assert(Last && "empty structs should already be handled"); const Type *ElemType = Last->getType()->getArrayElementTypeNoTypeQual(); + if (!ElemType) + return false; CharUnits FlexSize; if (const ConstantArrayType *ArrayTy = Ctx.getAsConstantArrayType(Last->getType())) { diff --git a/clang/test/Analysis/castsize.c b/clang/test/Analysis/castsize.c new file mode 100644 index 0000000000000..81aa60c0414cd --- /dev/null +++ b/clang/test/Analysis/castsize.c @@ -0,0 +1,26 @@ +// RUN: %clang_analyze_cc1 -verify %s \ +// RUN: -analyzer-checker=core,unix.Malloc,alpha.core.CastSize + +typedef typeof(sizeof(int)) size_t; +void *malloc(size_t); + +struct s1 { + int a; + char x[]; +}; + +struct s2 { + int a[100]; + char x[]; +}; + +union u { + struct s1 a; + struct s2 b; +}; + +static union u *test() { + union u *req; + req = malloc(5); // expected-warning{{Cast a region whose size is not a multiple of the destination type size}} + return req; +} From 7b3b4a5b1b5f8c3ae6855c92cdbe783c804408ea Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 14:59:34 +0700 Subject: [PATCH 0808/1029] IR: Use poison in dropDroppableUse (#134576) --- llvm/lib/IR/Value.cpp | 2 +- .../Transforms/Mem2Reg/ignore-droppable.ll | 18 +++++++++--------- llvm/test/Transforms/SROA/assume.ll | 2 +- llvm/test/Transforms/SROA/ignore-droppable.ll | 18 +++++++++--------- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 2c41101ef56ff..bb8d69288b923 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -221,7 +221,7 @@ void Value::dropDroppableUse(Use &U) { if (OpNo == 0) U.set(ConstantInt::getTrue(Assume->getContext())); else { - U.set(UndefValue::get(U.get()->getType())); + U.set(PoisonValue::get(U.get()->getType())); CallInst::BundleOpInfo &BOI = Assume->getBundleOpInfoForOperand(OpNo); BOI.Tag = Assume->getContext().pImpl->getOrInsertBundleTag("ignore"); } diff --git a/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll index b63c09f03ffdf..e9f40b56400f4 100644 --- a/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll +++ b/llvm/test/Transforms/Mem2Reg/ignore-droppable.ll @@ -7,8 +7,8 @@ declare void @llvm.lifetime.end.p0(i64 %size, ptr nocapture %ptr) define void @positive_assume_uses(ptr %arg) { ; CHECK-LABEL: @positive_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(ptr [[ARG:%.*]]), "ignore"(ptr undef, i64 2) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef, i64 8), "nonnull"(ptr [[ARG]]) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ 
"nonnull"(ptr [[ARG:%.*]]), "ignore"(ptr poison, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison, i64 8), "nonnull"(ptr [[ARG]]) ] ; CHECK-NEXT: ret void ; %A = alloca i32 @@ -35,8 +35,8 @@ define void @negative_assume_condition_use() { define void @positive_multiple_assume_uses() { ; CHECK-LABEL: @positive_multiple_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef, i64 8), "ignore"(ptr undef, i64 16) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison, i64 8), "ignore"(ptr poison, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2) ] ; CHECK-NEXT: ret void ; %A = alloca {i8, i16} @@ -48,8 +48,8 @@ define void @positive_multiple_assume_uses() { define void @positive_gep_assume_uses() { ; CHECK-LABEL: @positive_gep_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef, i64 8), "ignore"(ptr undef, i64 16) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison, i64 8), "ignore"(ptr poison, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2) ] ; CHECK-NEXT: ret void ; %A = alloca {i8, i16} @@ -64,9 +64,9 @@ define void @positive_gep_assume_uses() { define void @positive_mixed_assume_uses() { ; CHECK-LABEL: @positive_mixed_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 8), "ignore"(ptr undef, i64 16) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2), "ignore"(ptr undef) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2), "ignore"(ptr undef) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 8), "ignore"(ptr poison, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2), "ignore"(ptr poison) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2), "ignore"(ptr poison) ] ; CHECK-NEXT: ret void ; %A = alloca i8 diff --git a/llvm/test/Transforms/SROA/assume.ll b/llvm/test/Transforms/SROA/assume.ll index 50bddc833a9ad..80522eecf5e8a 100644 --- a/llvm/test/Transforms/SROA/assume.ll +++ b/llvm/test/Transforms/SROA/assume.ll @@ -14,7 +14,7 @@ target triple = "x86_64-unknown-linux-gnu" define void @_ZN2bg2baIiEC2ES_(i64 %v) { ; CHECK-LABEL: @_ZN2bg2baIiEC2ES_( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison) ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SROA/ignore-droppable.ll b/llvm/test/Transforms/SROA/ignore-droppable.ll index 2d090a5f01fe2..0b9a036b243b4 100644 --- a/llvm/test/Transforms/SROA/ignore-droppable.ll +++ b/llvm/test/Transforms/SROA/ignore-droppable.ll @@ -8,8 +8,8 @@ declare void @llvm.lifetime.end.p0(i64 %size, ptr nocapture %ptr) define void @positive_assume_uses(ptr %arg) { ; CHECK-LABEL: @positive_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(ptr [[ARG:%.*]]), "ignore"(ptr undef, i64 2) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef, i64 8), "nonnull"(ptr [[ARG]]) ] +; 
CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(ptr [[ARG:%.*]]), "ignore"(ptr poison, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison, i64 8), "nonnull"(ptr [[ARG]]) ] ; CHECK-NEXT: ret void ; %A = alloca i32 @@ -36,8 +36,8 @@ define void @negative_assume_condition_use() { define void @positive_multiple_assume_uses() { ; CHECK-LABEL: @positive_multiple_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef, i64 8), "ignore"(ptr undef, i64 16) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison, i64 8), "ignore"(ptr poison, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2) ] ; CHECK-NEXT: ret void ; %A = alloca {i8, i16} @@ -49,8 +49,8 @@ define void @positive_multiple_assume_uses() { define void @positive_gep_assume_uses() { ; CHECK-LABEL: @positive_gep_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef, i64 8), "ignore"(ptr undef, i64 16) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison, i64 8), "ignore"(ptr poison, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2) ] ; CHECK-NEXT: ret void ; %A = alloca {i8, i16} @@ -65,9 +65,9 @@ define void @positive_gep_assume_uses() { define void @positive_mixed_assume_uses() { ; CHECK-LABEL: @positive_mixed_assume_uses( -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 8), "ignore"(ptr undef, i64 16) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2), "ignore"(ptr undef) ] -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr undef), "ignore"(ptr undef, i64 2), "ignore"(ptr undef) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 8), "ignore"(ptr poison, i64 16) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2), "ignore"(ptr poison) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "ignore"(ptr poison), "ignore"(ptr poison, i64 2), "ignore"(ptr poison) ] ; CHECK-NEXT: ret void ; %A = alloca i8 From 4a5ff3ec21d0c6476d0da93b8550ba93560a5cbe Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 15:00:41 +0700 Subject: [PATCH 0809/1029] Value: Remove redundant removeFromList in dropDroppableUse (#134580) --- llvm/lib/IR/Value.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index bb8d69288b923..6c52ced5f73b2 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -215,7 +215,6 @@ void Value::dropDroppableUsesIn(User &Usr) { } void Value::dropDroppableUse(Use &U) { - U.removeFromList(); if (auto *Assume = dyn_cast(U.getUser())) { unsigned OpNo = U.getOperandNo(); if (OpNo == 0) From 2f8b486f979f4b89929a447f516fd1da9a659834 Mon Sep 17 00:00:00 2001 From: Robert Imschweiler Date: Mon, 7 Apr 2025 10:01:36 +0200 Subject: [PATCH 0810/1029] [IR][JumpThreading] Fix infinite recursion on compare self-reference (#129501) In unreachable code, constant PHI nodes may appear and be replaced by their single value. As a result, instructions may become self-referencing. 
This commit adds checks to avoid going into infinite recursion when handling self-referencing compare instructions in `evaluateOnPredecessorEdge()`. This LLVM defect was identified via the AMD Fuzzing project. --- .../llvm/Transforms/Scalar/JumpThreading.h | 5 + llvm/lib/Transforms/Scalar/JumpThreading.cpp | 24 +++- .../JumpThreading/unreachable-loops.ll | 117 ++++++++++++++++++ 3 files changed, 142 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h index 84292c716a0a9..182cab02e640c 100644 --- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h +++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h @@ -208,6 +208,11 @@ class JumpThreadingPass : public PassInfoMixin { /// if 'HasProfile' is true creates new instance through /// FunctionAnalysisManager, otherwise nullptr. BlockFrequencyInfo *getOrCreateBFI(bool Force = false); + + // Internal overload of evaluateOnPredecessorEdge(). + Constant *evaluateOnPredecessorEdge(BasicBlock *BB, BasicBlock *PredPredBB, + Value *cond, const DataLayout &DL, + SmallPtrSet &Visited); }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 18d5f201413c8..3548412001ac6 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -1494,6 +1495,17 @@ Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB, BasicBlock *PredPredBB, Value *V, const DataLayout &DL) { + SmallPtrSet Visited; + return evaluateOnPredecessorEdge(BB, PredPredBB, V, DL, Visited); +} + +Constant *JumpThreadingPass::evaluateOnPredecessorEdge( + BasicBlock *BB, BasicBlock *PredPredBB, Value *V, const DataLayout &DL, + SmallPtrSet &Visited) { + if (!Visited.insert(V).second) + return nullptr; + auto _ = make_scope_exit([&Visited, V]() { Visited.erase(V); }); + BasicBlock *PredBB = BB->getSinglePredecessor(); assert(PredBB && "Expected a single predecessor"); @@ -1515,12 +1527,16 @@ Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB, } // If we have a CmpInst, try to fold it for each incoming edge into PredBB. + // Note that during the execution of the pass, phi nodes may become constant + // and may be removed, which can lead to self-referencing instructions in + // code that becomes unreachable. Consequently, we need to handle those + // instructions in unreachable code and check before going into recursion. 
if (CmpInst *CondCmp = dyn_cast(V)) { if (CondCmp->getParent() == BB) { - Constant *Op0 = - evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0), DL); - Constant *Op1 = - evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1), DL); + Constant *Op0 = evaluateOnPredecessorEdge( + BB, PredPredBB, CondCmp->getOperand(0), DL, Visited); + Constant *Op1 = evaluateOnPredecessorEdge( + BB, PredPredBB, CondCmp->getOperand(1), DL, Visited); if (Op0 && Op1) { return ConstantFoldCompareInstOperands(CondCmp->getPredicate(), Op0, Op1, DL); diff --git a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll index d8bd3f389aae8..79c5e9217312d 100644 --- a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll +++ b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll @@ -180,4 +180,121 @@ cleanup2343.loopexit4: ; preds = %cleanup1491 unreachable } +; This segfaults due to recursion in %C4. Reason: %L6 is identified to be a +; "partially redundant load" and is replaced by a PHI node. The PHI node is then +; simplified to be constant and is removed. This leads to %L6 being replaced by +; %C4, which makes %C4 invalid since it uses %L6. +; The test case has been generated by the AMD Fuzzing project and simplified +; manually and by llvm-reduce. + +define i32 @constant_phi_leads_to_self_reference(ptr %ptr) { +; CHECK-LABEL: @constant_phi_leads_to_self_reference( +; CHECK-NEXT: [[A9:%.*]] = alloca i1, align 1 +; CHECK-NEXT: br label [[F6:%.*]] +; CHECK: T3: +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: BB5: +; CHECK-NEXT: [[L10:%.*]] = load i1, ptr [[A9]], align 1 +; CHECK-NEXT: br i1 [[L10]], label [[BB6:%.*]], label [[F6]] +; CHECK: BB6: +; CHECK-NEXT: [[LGV3:%.*]] = load i1, ptr [[PTR:%.*]], align 1 +; CHECK-NEXT: [[C4:%.*]] = icmp sle i1 [[C4]], true +; CHECK-NEXT: store i1 [[C4]], ptr [[PTR]], align 1 +; CHECK-NEXT: br i1 [[C4]], label [[F6]], label [[T3:%.*]] +; CHECK: F6: +; CHECK-NEXT: ret i32 0 +; CHECK: F7: +; CHECK-NEXT: br label [[BB5]] +; + %A9 = alloca i1, align 1 + br i1 false, label %BB4, label %F6 + +BB4: ; preds = %0 + br i1 false, label %F6, label %F1 + +F1: ; preds = %BB4 + br i1 false, label %T4, label %T3 + +T3: ; preds = %T4, %BB6, %F1 + %L6 = load i1, ptr %ptr, align 1 + br label %BB5 + +BB5: ; preds = %F7, %T3 + %L10 = load i1, ptr %A9, align 1 + br i1 %L10, label %BB6, label %F6 + +BB6: ; preds = %BB5 + %LGV3 = load i1, ptr %ptr, align 1 + %C4 = icmp sle i1 %L6, true + store i1 %C4, ptr %ptr, align 1 + br i1 %L6, label %F6, label %T3 + +T4: ; preds = %F1 + br label %T3 + +F6: ; preds = %BB6, %BB5, %BB4, %0 + ret i32 0 + +F7: ; No predecessors! + br label %BB5 +} + +; Same as above, but with multiple icmps referencing the same PHI node. 
+ +define i32 @recursive_icmp_mult(ptr %ptr) { +; CHECK-LABEL: @recursive_icmp_mult( +; CHECK-NEXT: [[A9:%.*]] = alloca i1, align 1 +; CHECK-NEXT: br label [[F6:%.*]] +; CHECK: T3: +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: BB5: +; CHECK-NEXT: [[L10:%.*]] = load i1, ptr [[A9]], align 1 +; CHECK-NEXT: br i1 [[L10]], label [[BB6:%.*]], label [[F6]] +; CHECK: BB6: +; CHECK-NEXT: [[LGV3:%.*]] = load i1, ptr [[PTR:%.*]], align 1 +; CHECK-NEXT: [[C4:%.*]] = icmp sle i1 [[C6:%.*]], true +; CHECK-NEXT: [[C5:%.*]] = icmp sle i1 [[C6]], false +; CHECK-NEXT: [[C6]] = icmp sle i1 [[C4]], [[C5]] +; CHECK-NEXT: store i1 [[C6]], ptr [[PTR]], align 1 +; CHECK-NEXT: br i1 [[C6]], label [[F6]], label [[T3:%.*]] +; CHECK: F6: +; CHECK-NEXT: ret i32 0 +; CHECK: F7: +; CHECK-NEXT: br label [[BB5]] +; + %A9 = alloca i1, align 1 + br i1 false, label %BB4, label %F6 + +BB4: ; preds = %0 + br i1 false, label %F6, label %F1 + +F1: ; preds = %BB4 + br i1 false, label %T4, label %T3 + +T3: ; preds = %T4, %BB6, %F1 + %L6 = load i1, ptr %ptr, align 1 + br label %BB5 + +BB5: ; preds = %F7, %T3 + %L10 = load i1, ptr %A9, align 1 + br i1 %L10, label %BB6, label %F6 + +BB6: ; preds = %BB5 + %LGV3 = load i1, ptr %ptr, align 1 + %C4 = icmp sle i1 %L6, true + %C5 = icmp sle i1 %L6, false + %C6 = icmp sle i1 %C4, %C5 + store i1 %C6, ptr %ptr, align 1 + br i1 %L6, label %F6, label %T3 + +T4: ; preds = %F1 + br label %T3 + +F6: ; preds = %BB6, %BB5, %BB4, %0 + ret i32 0 + +F7: ; No predecessors! + br label %BB5 +} + !0 = !{!"branch_weights", i32 2146410443, i32 1073205} From 04bb8ecb05ae4dc2a0407503678e0cef79c1d46c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 15:20:23 +0700 Subject: [PATCH 0811/1029] AMDGPU: Disable sincos fold for constant inputs (#134579) --- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 5 +++++ .../AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll | 8 +++----- .../CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll | 11 +++++------ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index cf8b416d23e50..b65b4d67b3f8c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -1361,6 +1361,11 @@ bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN; Value *CArgVal = FPOp->getOperand(0); + + // TODO: Constant fold the call + if (isa(CArgVal)) + return false; + CallInst *CI = cast(FPOp); Function *F = B.GetInsertBlock()->getParent(); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll index ad7e913e6d22c..c8f45fe11390c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.defined.ll @@ -125,12 +125,10 @@ define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writ ; CHECK-LABEL: define void @sincos_f32_value_is_same_constantfp ; CHECK-SAME: (ptr addrspace(1) writeonly captures(none) [[SIN_OUT:%.*]], ptr addrspace(1) writeonly captures(none) [[COS_OUT:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float 4.200000e+01, ptr addrspace(5) [[__SINCOS_]]) -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 -; CHECK-NEXT: store float 
[[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float 4.200000e+01) +; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01) -; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll index da5686972a86b..03b7d1646eb86 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -745,13 +745,12 @@ entry: define void @sincos_f32_value_is_same_constantfp(ptr addrspace(1) nocapture writeonly %sin_out, ptr addrspace(1) nocapture writeonly %cos_out) { ; CHECK-LABEL: define void @sincos_f32_value_is_same_constantfp -; CHECK-SAME: (ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[SIN_OUT:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] { +; CHECK-SAME: (ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[SIN_OUT:%.*]], ptr addrspace(1) writeonly captures(none) initializes((0, 4)) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR2]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float 4.200000e+01, ptr addrspace(5) [[__SINCOS_]]) -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 -; CHECK-NEXT: store float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 -; CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[COS_OUT]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call contract float @_Z3sinf(float 4.200000e+01) +; CHECK-NEXT: store float [[CALL]], ptr addrspace(1) [[SIN_OUT]], align 4 +; CHECK-NEXT: [[CALL1:%.*]] = tail call contract float @_Z3cosf(float 4.200000e+01) +; CHECK-NEXT: store float [[CALL1]], ptr addrspace(1) [[COS_OUT]], align 4 ; CHECK-NEXT: ret void ; entry: From 4a425a4966d6421674d4300e32b0eb57ebade65a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 15:25:03 +0700 Subject: [PATCH 0812/1029] NaryReassociate: Check pattern before user scan (#134587) --- llvm/lib/Transforms/Scalar/NaryReassociate.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index 3b3f32a0ea591..ec17443b37143 100644 --- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -611,15 +611,15 @@ Value *NaryReassociatePass::tryReassociateMinOrMax(Instruction *I, Value *A = nullptr, *B = nullptr; MaxMinT m_MaxMin(m_Value(A), m_Value(B)); + if (!match(LHS, m_MaxMin)) + return nullptr; + if (LHS->hasNUsesOrMore(3) || // The optimization is profitable only if LHS can be removed in the end. // In other words LHS should be used (directly or indirectly) by I only. 
- llvm::any_of(LHS->users(), - [&](auto *U) { - return U != I && - !(U->hasOneUser() && *U->users().begin() == I); - }) || - !match(LHS, m_MaxMin)) + llvm::any_of(LHS->users(), [&](auto *U) { + return U != I && !(U->hasOneUser() && *U->users().begin() == I); + })) return nullptr; auto tryCombination = [&](Value *A, const SCEV *AExpr, Value *B, From 44e32fb80272b77186b42c7583dd0ed8ad668af4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Mon, 7 Apr 2025 10:44:37 +0200 Subject: [PATCH 0813/1029] [Clang][OpenCL] Fix wait_for_event argument address space with -fdeclare-opencl-builtins (#134598) The pointer argument for `wait_for_event(int, event_t*)` should take the default address space: generic if available, otherwise private. Before this patch it would always be generic with `-fdeclare-opencl-builtins`. This was inconsistent with the behavior when opencl-c.h is included. --- clang/lib/Sema/OpenCLBuiltins.td | 18 +++++++++++++++--- .../CodeGenOpenCL/fdeclare-opencl-builtins.cl | 9 +++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td index 4da61429fcce7..528b700a275e0 100644 --- a/clang/lib/Sema/OpenCLBuiltins.td +++ b/clang/lib/Sema/OpenCLBuiltins.td @@ -958,13 +958,25 @@ foreach name = ["async_work_group_strided_copy"] in { def : Builtin, PointerType, GlobalAS>, Size, Size, Event]>; def : Builtin, PointerType, LocalAS>, Size, Size, Event]>; } -foreach name = ["wait_group_events"] in { - def : Builtin]>; -} foreach name = ["prefetch"] in { def : Builtin, GlobalAS>, Size]>; } +// The wait_group_events is declared with an argument of type event_t*. +// The address-space of the pointer parameter is different if the generic address space is available. +multiclass BuiltinWithDefaultPointerArg { + foreach name = ["wait_group_events"] in { + def : Builtin]>; + } +} + +let Extension = FuncExtOpenCLCNamedAddressSpaceBuiltins in { + defm : BuiltinWithDefaultPointerArg; +} +let Extension = FuncExtOpenCLCGenericAddressSpace in { + defm : BuiltinWithDefaultPointerArg; +} + //-------------------------------------------------------------------- // OpenCL v2.0 s6.13.11 - Atomics Functions. // Functions that use memory_order and cl_mem_fence_flags enums are not diff --git a/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl b/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl index ac3bff9dbde27..8bd1db5d06819 100644 --- a/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl +++ b/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl @@ -48,6 +48,15 @@ void test_generic_optionality(float a, float *b) { float res = fract(a, b); } +// Test that the correct builtin is called depending on the generic address +// space feature availability. 
If not available, the __private version is called.
+// CHECK-LABEL: @test_wait_group_events
+// CHECK-GAS: call spir_func void @_Z17wait_group_eventsiPU3AS49ocl_event
+// CHECK-NOGAS: call spir_func void @_Z17wait_group_eventsiP9ocl_event
+void test_wait_group_events(int i, event_t *e) {
+  wait_group_events(i, e);
+}
+
 // CHECK: attributes [[ATTR_CONST]] =
 // CHECK-SAME: memory(none)
 // CHECK: attributes [[ATTR_PURE]] =

From f20cb3f8d2d4e97760a1a589290224a33ec60473 Mon Sep 17 00:00:00 2001
From: Mike
Date: Mon, 7 Apr 2025 11:54:44 +0300
Subject: [PATCH 0814/1029] [mlir][bufferization] Drop the assumption for
 alloc result index (#134503)

Relax the assumption that an alloc op always has its allocation at
`getResult(0)`, allowing the `optimize-allocation-liveness` pass to be
used for custom ops with more than one result. Ops with multiple
allocations are not handled here yet.
---
 .../Transforms/OptimizeAllocationLiveness.cpp | 30 +++++++++++++++++--
 .../optimize-allocation-liveness.mlir         | 25 ++++++++++++++++
 mlir/test/lib/Dialect/Test/TestOps.td         | 12 ++++++++
 3 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp
index 5178d4a62f374..e17b39cd7e371 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OptimizeAllocationLiveness.cpp
@@ -17,7 +17,10 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 
 #define DEBUG_TYPE "optimize-allocation-liveness"
 #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
@@ -88,6 +91,19 @@ static bool hasMemoryAllocEffect(MemoryEffectOpInterface memEffectOp) {
   return false;
 }
 
+/// Extracts OpResults with Allocate effects from the given op.
+static SmallVector<OpResult>
+collectAllocations(MemoryEffectOpInterface allocOp) {
+  SmallVector<MemoryEffects::EffectInstance> effects;
+  allocOp.getEffects(effects);
+  SmallVector<OpResult> allocResults;
+  for (const MemoryEffects::EffectInstance &it : effects)
+    if (isa<MemoryEffects::Allocate>(it.getEffect()))
+      if (auto val = it.getValue(); val && val.getDefiningOp() == allocOp)
+        allocResults.push_back(cast<OpResult>(val));
+  return allocResults;
+}
+
 struct OptimizeAllocationLiveness
     : public bufferization::impl::OptimizeAllocationLivenessPassBase<
          OptimizeAllocationLiveness> {
@@ -109,7 +125,15 @@ struct OptimizeAllocationLiveness
       auto allocOp = memEffectOp;
       LDBG("Checking alloc op: " << allocOp);
 
-      auto deallocOp = findUserWithFreeSideEffect(allocOp->getResult(0));
+      SmallVector<OpResult> allocationResults = collectAllocations(allocOp);
+      // Multiple allocations from a single op are not considered here yet.
+      if (allocationResults.size() != 1)
+        return WalkResult::advance();
+
+      OpResult allocResult = allocationResults[0];
+      LDBG("On allocation result: " << allocResult);
+
+      auto *deallocOp = findUserWithFreeSideEffect(allocResult);
       if (!deallocOp || (deallocOp->getBlock() != allocOp->getBlock())) {
        // The pass handles allocations that have a single dealloc op in the
        // same block. We also should not hoist the dealloc op out of
@@ -119,9 +143,9 @@ struct OptimizeAllocationLiveness
 
       Operation *lastUser = nullptr;
       const BufferViewFlowAnalysis::ValueSetT &deps =
-          analysis.resolve(allocOp->getResult(0));
+          analysis.resolve(allocResult);
       for (auto dep : llvm::make_early_inc_range(deps)) {
-        for (auto user : dep.getUsers()) {
+        for (auto *user : dep.getUsers()) {
          // We are looking for a non dealloc op user.
          // check if user is the dealloc op itself.
          if (user == deallocOp)
diff --git a/mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir b/mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir
index 5f5a0ce54e2c1..63d33e3a88bed 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/optimize-allocation-liveness.mlir
@@ -209,3 +209,28 @@ func.func private @test_conditional_deallocation() -> memref<32xf32, 1> {
 
   return %3 : memref<32xf32, 1>
 }
+
+// -----
+// CHECK-LABEL: func.func private @test_alloc_with_multiple_results() {
+// CHECK: %[[ID1:.+]], %[[ALLOC1:.+]] = test.alloc_with_multiple_results : index, memref<64xf32>
+// CHECK: memref.expand_shape %[[ALLOC1]]
+// CHECK: memref.dealloc %[[ALLOC1]] : memref<64xf32>
+// CHECK: %[[ID2:.+]], %[[ALLOC2:.+]] = test.alloc_with_multiple_results : index, memref<64xf32>
+// CHECK: memref.expand_shape %[[ALLOC2]]
+// CHECK: memref.dealloc %[[ALLOC2]] : memref<64xf32>
+// CHECK: return
+// CHECK: }
+
+// This test checks that allocations with multiple results, where the
+// allocated buffer is at a non-zero result position, are accepted.
+func.func private @test_alloc_with_multiple_results() -> () {
+  %id1, %alloc1 = test.alloc_with_multiple_results : index, memref<64xf32>
+  %expand_shape1 = memref.expand_shape %alloc1 [[0, 1]] output_shape [8, 8] : memref<64xf32> into memref<8x8xf32>
+
+  %id2, %alloc2 = test.alloc_with_multiple_results : index, memref<64xf32>
+  %expand_shape2 = memref.expand_shape %alloc2 [[0, 1]] output_shape [8, 8] : memref<64xf32> into memref<8x8xf32>
+
+  memref.dealloc %alloc1 : memref<64xf32>
+  memref.dealloc %alloc2 : memref<64xf32>
+  return
+}
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index d8024145e711f..31be00ace1384 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -3441,4 +3441,16 @@ def TestMultiSlotAlloca : TEST_Op<"multi_slot_alloca",
   let assemblyFormat = "attr-dict `:` functional-type(operands, results)";
 }
 
+//===----------------------------------------------------------------------===//
+// Test allocation Ops
+//===----------------------------------------------------------------------===//
+
+def TestAllocWithMultipleResults : TEST_Op<"alloc_with_multiple_results"> {
+  let results = (outs Index:$index,
+                      Res:$memref);
+  let assemblyFormat = [{
+    attr-dict `:` type($index) `,` type($memref)
+  }];
+}
+
 #endif // TEST_OPS

From be6ccc98f38227db02164f17bfaf0ac86d800e4a Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Mon, 7 Apr 2025 09:55:52 +0100
Subject: [PATCH 0815/1029] [VPlan] Split out VPBlendRecipe simplifications
 from simplifyRecipes. NFC (#134073)

This is split off from #133977

VPBlendRecipe normalisation is sensitive to the number of users a mask has,
so it should probably be run after the masks have been simplified as much
as possible.

Note this could be run after removeDeadRecipes, but that causes test diffs
and some regressions, so it is left to a later patch.
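To see what the normalisation buys, it helps to spell out the semantics of
a normalized blend: the first incoming value acts as the start value, and
every later (value, mask) pair folds in as a select. A minimal scalar
sketch of just that semantics (illustrative C++ only, not the VPlan API;
all names here are invented):

```c++
#include <cassert>
#include <cstdint>
#include <vector>

struct Incoming {
  uint32_t Value;
  bool Mask; // scalar stand-in for one lane of a vector mask
};

// Evaluate a normalized blend: Ops[0].Value is the start value; each later
// incoming value overrides the accumulator when its mask is set, i.e.
// Acc = select(Ops[I].Mask, Ops[I].Value, Acc).
uint32_t evalNormalizedBlend(const std::vector<Incoming> &Ops) {
  assert(!Ops.empty() && "a blend needs at least one incoming value");
  uint32_t Acc = Ops[0].Value; // the start value needs no mask
  for (size_t I = 1; I < Ops.size(); ++I)
    Acc = Ops[I].Mask ? Ops[I].Value : Acc;
  return Acc;
}

int main() {
  // In a well-formed blend exactly one mask is live per lane, so the
  // masked value wins regardless of where it sits in the list.
  std::vector<Incoming> Ops = {{10, false}, {20, true}, {30, false}};
  assert(evalNormalizedBlend(Ops) == 20);
  return 0;
}
```

This also shows why the start index is chosen the way it is in the code
below: the first value's mask is never consulted, so picking a start value
whose mask has no other users lets that mask be dead-coded.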
--- .../Transforms/Vectorize/VPlanTransforms.cpp | 148 ++++++++++-------- 1 file changed, 80 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 818ed9f5dee23..67a55aa67c978 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -926,74 +926,6 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) { static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { using namespace llvm::VPlanPatternMatch; - if (auto *Blend = dyn_cast(&R)) { - // Try to remove redundant blend recipes. - SmallPtrSet UniqueValues; - if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) - UniqueValues.insert(Blend->getIncomingValue(0)); - for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) - if (!match(Blend->getMask(I), m_False())) - UniqueValues.insert(Blend->getIncomingValue(I)); - - if (UniqueValues.size() == 1) { - Blend->replaceAllUsesWith(*UniqueValues.begin()); - Blend->eraseFromParent(); - return; - } - - if (Blend->isNormalized()) - return; - - // Normalize the blend so its first incoming value is used as the initial - // value with the others blended into it. - - unsigned StartIndex = 0; - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - // If a value's mask is used only by the blend then is can be deadcoded. - // TODO: Find the most expensive mask that can be deadcoded, or a mask - // that's used by multiple blends where it can be removed from them all. - VPValue *Mask = Blend->getMask(I); - if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { - StartIndex = I; - break; - } - } - - SmallVector OperandsWithMask; - OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); - - for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { - if (I == StartIndex) - continue; - OperandsWithMask.push_back(Blend->getIncomingValue(I)); - OperandsWithMask.push_back(Blend->getMask(I)); - } - - auto *NewBlend = new VPBlendRecipe( - cast(Blend->getUnderlyingValue()), OperandsWithMask); - NewBlend->insertBefore(&R); - - VPValue *DeadMask = Blend->getMask(StartIndex); - Blend->replaceAllUsesWith(NewBlend); - Blend->eraseFromParent(); - recursivelyDeleteDeadRecipes(DeadMask); - - /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. - VPValue *NewMask; - if (NewBlend->getNumOperands() == 3 && - match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) { - VPValue *Inc0 = NewBlend->getOperand(0); - VPValue *Inc1 = NewBlend->getOperand(1); - VPValue *OldMask = NewBlend->getOperand(2); - NewBlend->setOperand(0, Inc1); - NewBlend->setOperand(1, Inc0); - NewBlend->setOperand(2, NewMask); - if (OldMask->getNumUsers() == 0) - cast(OldMask)->eraseFromParent(); - } - return; - } - // VPScalarIVSteps can only be simplified after unrolling. VPScalarIVSteps for // part 0 can be replaced by their start value, if only the first lane is // demanded. @@ -1092,6 +1024,85 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { } } +/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes +/// to make sure the masks are simplified. 
+static void simplifyBlends(VPlan &Plan) { + using namespace llvm::VPlanPatternMatch; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *Blend = dyn_cast(&R); + if (!Blend) + continue; + + // Try to remove redundant blend recipes. + SmallPtrSet UniqueValues; + if (Blend->isNormalized() || !match(Blend->getMask(0), m_False())) + UniqueValues.insert(Blend->getIncomingValue(0)); + for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I) + if (!match(Blend->getMask(I), m_False())) + UniqueValues.insert(Blend->getIncomingValue(I)); + + if (UniqueValues.size() == 1) { + Blend->replaceAllUsesWith(*UniqueValues.begin()); + Blend->eraseFromParent(); + continue; + } + + if (Blend->isNormalized()) + continue; + + // Normalize the blend so its first incoming value is used as the initial + // value with the others blended into it. + + unsigned StartIndex = 0; + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + // If a value's mask is used only by the blend then is can be deadcoded. + // TODO: Find the most expensive mask that can be deadcoded, or a mask + // that's used by multiple blends where it can be removed from them all. + VPValue *Mask = Blend->getMask(I); + if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) { + StartIndex = I; + break; + } + } + + SmallVector OperandsWithMask; + OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex)); + + for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) { + if (I == StartIndex) + continue; + OperandsWithMask.push_back(Blend->getIncomingValue(I)); + OperandsWithMask.push_back(Blend->getMask(I)); + } + + auto *NewBlend = new VPBlendRecipe( + cast(Blend->getUnderlyingValue()), OperandsWithMask); + NewBlend->insertBefore(&R); + + VPValue *DeadMask = Blend->getMask(StartIndex); + Blend->replaceAllUsesWith(NewBlend); + Blend->eraseFromParent(); + recursivelyDeleteDeadRecipes(DeadMask); + + /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask. + VPValue *NewMask; + if (NewBlend->getNumOperands() == 3 && + match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) { + VPValue *Inc0 = NewBlend->getOperand(0); + VPValue *Inc1 = NewBlend->getOperand(1); + VPValue *OldMask = NewBlend->getOperand(2); + NewBlend->setOperand(0, Inc1); + NewBlend->setOperand(1, Inc0); + NewBlend->setOperand(2, NewMask); + if (OldMask->getNumUsers() == 0) + cast(OldMask)->eraseFromParent(); + } + } + } +} + /// Optimize the width of vector induction variables in \p Plan based on a known /// constant Trip Count, \p BestVF and \p BestUF. static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, @@ -1733,6 +1744,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { runPass(removeRedundantInductionCasts, Plan); runPass(simplifyRecipes, Plan, *Plan.getCanonicalIV()->getScalarType()); + runPass(simplifyBlends, Plan); runPass(removeDeadRecipes, Plan); runPass(legalizeAndOptimizeInductions, Plan); runPass(removeRedundantExpandSCEVRecipes, Plan); From 387a8859cfea9e6f8282f14f21064d9ec562e66a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Apr 2025 10:29:16 +0100 Subject: [PATCH 0816/1029] Fix MSVC "32-bit shift implicitly converted to 64 bits" warning. NFCI. 
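MSVC's warning here is C4334: the shift is computed in 32-bit arithmetic
and only widened to 64 bits afterwards, so for shift amounts of 32 or more
the interesting bits are already gone (and the narrow shift is undefined
behavior). A distilled reproduction, with hypothetical function names
rather than code from this patch:

```c++
#include <cstdint>

uint64_t laneMaskBad(unsigned ShAmt) {
  // C4334: the shift happens in 32 bits, then the result is widened.
  // Undefined behavior once ShAmt >= 32.
  return 1 << ShAmt;
}

uint64_t laneMaskGood(unsigned ShAmt) {
  // The 1ULL operand makes the whole shift happen in 64 bits.
  return 1ULL << ShAmt;
}
```

The one-line change below applies the same `1ULL` fix to the constant
built in reverseZExtICmpCombine.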
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ad44ee755698a..2a1dd2b2def17 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -15095,7 +15095,7 @@ static SDValue reverseZExtICmpCombine(SDNode *N, SelectionDAG &DAG, SDValue Res = DAG.getNode(ISD::AND, DL, WideVT, X, - DAG.getConstant(1 << ShAmt.getZExtValue(), DL, WideVT)); + DAG.getConstant(1ULL << ShAmt.getZExtValue(), DL, WideVT)); Res = DAG.getSetCC(DL, EVT::getVectorVT(*DAG.getContext(), MVT::i1, WideVT.getVectorElementCount()), From 679b2f714a3a3cbf487480127a4cc6ed296c7fab Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Apr 2025 10:30:59 +0100 Subject: [PATCH 0817/1029] Fix MSVC "not all control paths return a value" warning. NFCI. --- llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp index c2dae5e3e5443..243ee37886026 100644 --- a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp @@ -143,6 +143,7 @@ static llvm::StringRef getStorageClassString(XCOFF::StorageClass SC) { case XCOFF::StorageClass::C_TCSYM: return "C_TCSYM (Reserved)"; } + llvm_unreachable("Unknown XCOFF::StorageClass enum"); } Error XCOFFLinkGraphBuilder::processSections() { From ec400277c6810915a501fa901e9ba58ab6ade831 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Apr 2025 10:31:38 +0100 Subject: [PATCH 0818/1029] Fix MSVC "not all control paths return a value" warning. NFCI. 
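The warning here is C4715 ("not all control paths return a value"): MSVC
does not treat a switch that covers every enumerator as exhaustive, so a
function whose body ends in such a switch looks as if it can fall off the
end. The conventional LLVM remedy, used by this patch and the next one, is
an `llvm_unreachable` after the switch. A distilled sketch with made-up
names (it relies on LLVM's ErrorHandling header):

```c++
#include "llvm/Support/ErrorHandling.h"

enum class Kind { A, B };

static const char *name(Kind K) {
  switch (K) {
  case Kind::A:
    return "A";
  case Kind::B:
    return "B";
  }
  // Reachable only with a corrupt enum value; this both documents the
  // invariant and silences MSVC's C4715.
  llvm_unreachable("Unknown Kind enum");
}
```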
--- clang/lib/Driver/ToolChains/Darwin.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 63e9fbfd4304c..4735dc3ad30ee 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1932,6 +1932,7 @@ struct DarwinPlatform { case DarwinPlatformKind::XROS: return llvm::Triple::XROS; } + llvm_unreachable("Unknown DarwinPlatformKind enum"); } SourceKind Kind; From 65c7ea713e0b411a707b0ccac374bda9f30234ea Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 16:52:11 +0700 Subject: [PATCH 0819/1029] SLPVectorizer: Avoid looking at uselists of constants (#134578) --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 31c684e16f051..94c0289807245 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6273,7 +6273,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, SmallVector UserBVHead(TE.Scalars.size()); for (auto [I, V] : zip(UserBVHead, TE.Scalars)) { - if (!V->hasNUsesOrMore(1)) + if (isa(V) || !V->hasNUsesOrMore(1)) continue; auto *II = dyn_cast(*V->user_begin()); if (!II) @@ -13433,7 +13433,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { allSameBlock(VectorizableTree.front()->Scalars)); if (any_of(VectorizableTree, [&](const std::unique_ptr &TE) { return TE->isGather() && all_of(TE->Scalars, [&](Value *V) { - return isa(V) || + return isa(V) || (IsAllowedSingleBVNode && !V->hasNUsesOrMore(UsesLimit) && any_of(V->users(), IsaPred)); @@ -19459,7 +19459,7 @@ bool BoUpSLP::collectValuesToDemote( return FinalAnalysis(); if (any_of(E.Scalars, [&](Value *V) { - return !all_of(V->users(), [=](User *U) { + return !isa(V) && !all_of(V->users(), [=](User *U) { return isVectorized(U) || (E.Idx == 0 && UserIgnoreList && UserIgnoreList->contains(U)) || From 431c8dd0736c61176831750783d1195b9aa1a308 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 7 Apr 2025 03:03:04 -0700 Subject: [PATCH 0820/1029] [mlir][IR] Add support for UnknownLoc to `verify-diagnostics` (#134421) Diagnostics at unknown locations can now be verified with `-verify-diagnostics`. Example: ``` // expected-error@unknown {{something went wrong}} ``` Also clean up some MemRefToLLVM conversion tests that had to redirect all errors to stdout in order to FileCheck them. All of those tests can now be stored in a single `invalid.mlir`. That was not possible before. --- mlir/include/mlir/IR/Diagnostics.h | 4 +- mlir/lib/IR/Diagnostics.cpp | 66 +++++++++++-------- .../Conversion/MemRefToLLVM/invalid-uint.mlir | 8 --- .../test/Conversion/MemRefToLLVM/invalid.mlir | 34 ++++++++-- .../Conversion/MemRefToLLVM/issue-70160.mlir | 15 ----- 5 files changed, 70 insertions(+), 57 deletions(-) delete mode 100644 mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir delete mode 100644 mlir/test/Conversion/MemRefToLLVM/issue-70160.mlir diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h index 36c433c63b26d..59bed71e4db88 100644 --- a/mlir/include/mlir/IR/Diagnostics.h +++ b/mlir/include/mlir/IR/Diagnostics.h @@ -640,8 +640,8 @@ class SourceMgrDiagnosticVerifierHandler : public SourceMgrDiagnosticHandler { /// Process a single diagnostic. 
void process(Diagnostic &diag); - /// Process a FileLineColLoc diagnostic. - void process(FileLineColLoc loc, StringRef msg, DiagnosticSeverity kind); + /// Process a LocationAttr diagnostic. + void process(LocationAttr loc, StringRef msg, DiagnosticSeverity kind); std::unique_ptr impl; }; diff --git a/mlir/lib/IR/Diagnostics.cpp b/mlir/lib/IR/Diagnostics.cpp index 19b32120f5890..b699e396f6577 100644 --- a/mlir/lib/IR/Diagnostics.cpp +++ b/mlir/lib/IR/Diagnostics.cpp @@ -678,10 +678,13 @@ struct SourceMgrDiagnosticVerifierHandlerImpl { /// A list of expected diagnostics for each buffer of the source manager. llvm::StringMap> expectedDiagsPerFile; + /// A list of expected diagnostics with unknown locations. + SmallVector expectedUnknownLocDiags; + /// Regex to match the expected diagnostics format. llvm::Regex expected = llvm::Regex("expected-(error|note|remark|warning)(-re)? " - "*(@([+-][0-9]+|above|below))? *{{(.*)}}$"); + "*(@([+-][0-9]+|above|below|unknown))? *{{(.*)}}$"); }; } // namespace detail } // namespace mlir @@ -774,6 +777,11 @@ SourceMgrDiagnosticVerifierHandlerImpl::computeExpectedDiags( record.lineNo += offset; else record.lineNo -= offset; + } else if (offsetMatch.consume_front("unknown")) { + // This is matching unknown locations. + record.fileLoc = SMLoc(); + expectedUnknownLocDiags.emplace_back(std::move(record)); + continue; } else if (offsetMatch.consume_front("above")) { // If the designator applies 'above' we add it to the last non // designator line. @@ -828,43 +836,45 @@ SourceMgrDiagnosticVerifierHandler::~SourceMgrDiagnosticVerifierHandler() { /// verified correctly, failure otherwise. LogicalResult SourceMgrDiagnosticVerifierHandler::verify() { // Verify that all expected errors were seen. - for (auto &expectedDiagsPair : impl->expectedDiagsPerFile) { - for (auto &err : expectedDiagsPair.second) { - if (err.matched) - continue; + auto checkExpectedDiags = [&](ExpectedDiag &err) { + if (!err.matched) impl->status = err.emitError(os, mgr, "expected " + getDiagKindStr(err.kind) + " \"" + err.substring + "\" was not produced"); - } - } + }; + for (auto &expectedDiagsPair : impl->expectedDiagsPerFile) + for (auto &err : expectedDiagsPair.second) + checkExpectedDiags(err); + for (auto &err : impl->expectedUnknownLocDiags) + checkExpectedDiags(err); impl->expectedDiagsPerFile.clear(); return impl->status; } /// Process a single diagnostic. void SourceMgrDiagnosticVerifierHandler::process(Diagnostic &diag) { - auto kind = diag.getSeverity(); - - // Process a FileLineColLoc. - if (auto fileLoc = diag.getLocation()->findInstanceOf()) - return process(fileLoc, diag.str(), kind); - - emitDiagnostic(diag.getLocation(), - "unexpected " + getDiagKindStr(kind) + ": " + diag.str(), - DiagnosticSeverity::Error); - impl->status = failure(); + return process(diag.getLocation(), diag.str(), diag.getSeverity()); } -/// Process a FileLineColLoc diagnostic. -void SourceMgrDiagnosticVerifierHandler::process(FileLineColLoc loc, +/// Process a diagnostic at a certain location. +void SourceMgrDiagnosticVerifierHandler::process(LocationAttr loc, StringRef msg, DiagnosticSeverity kind) { - // Get the expected diagnostics for this file. - auto diags = impl->getExpectedDiags(loc.getFilename()); - if (!diags) { - diags = impl->computeExpectedDiags(os, mgr, - getBufferForFile(loc.getFilename())); + FileLineColLoc fileLoc = loc.findInstanceOf(); + MutableArrayRef diags; + + if (fileLoc) { + // Get the expected diagnostics for this file. 
+ if (auto maybeDiags = impl->getExpectedDiags(fileLoc.getFilename())) { + diags = *maybeDiags; + } else { + diags = impl->computeExpectedDiags( + os, mgr, getBufferForFile(fileLoc.getFilename())); + } + } else { + // Get all expected diagnostics at unknown locations. + diags = impl->expectedUnknownLocDiags; } // Search for a matching expected diagnostic. @@ -872,9 +882,11 @@ void SourceMgrDiagnosticVerifierHandler::process(FileLineColLoc loc, ExpectedDiag *nearMiss = nullptr; // If this was an expected error, remember that we saw it and return. - unsigned line = loc.getLine(); - for (auto &e : *diags) { - if (line == e.lineNo && e.match(msg)) { + for (auto &e : diags) { + // File line must match (unless it's an unknown location). + if (fileLoc && fileLoc.getLine() != e.lineNo) + continue; + if (e.match(msg)) { if (e.kind == kind) { e.matched = true; return; diff --git a/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir b/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir deleted file mode 100644 index 7e94677ebbdd7..0000000000000 --- a/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: mlir-opt %s -finalize-memref-to-llvm -verify-diagnostics - -// CHECK-LABEL: @invalid_int_conversion -func.func @invalid_int_conversion() { - // expected-error@+1 {{conversion of memref memory space 1 : ui64 to integer address space failed. Consider adding memory space conversions.}} - %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xf32, 1 : ui64> - return -} diff --git a/mlir/test/Conversion/MemRefToLLVM/invalid.mlir b/mlir/test/Conversion/MemRefToLLVM/invalid.mlir index 31bfa7a44a133..61c67005a08fc 100644 --- a/mlir/test/Conversion/MemRefToLLVM/invalid.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/invalid.mlir @@ -1,15 +1,15 @@ -// RUN: mlir-opt %s -finalize-memref-to-llvm 2>&1 | FileCheck %s -// Since the error is at an unknown location, we use FileCheck instead of -// -verify-diagnostics here +// RUN: mlir-opt %s -finalize-memref-to-llvm -split-input-file -verify-diagnostics | FileCheck %s -// CHECK: redefinition of reserved function 'malloc' of different type '!llvm.func' is prohibited +// expected-error@+1{{redefinition of reserved function 'malloc' of different type '!llvm.func' is prohibited}} llvm.func @malloc(i64) func.func @redef_reserved() { %alloc = memref.alloc() : memref<1024x64xf32, 1> llvm.return } -// CHECK: conversion of memref memory space "foo" to integer address space failed. Consider adding memory space conversions. +// ----- + +// expected-error@unknown{{conversion of memref memory space "foo" to integer address space failed. Consider adding memory space conversions.}} // CHECK-LABEL: @bad_address_space func.func @bad_address_space(%a: memref<2xindex, "foo">) { %c0 = arith.constant 0 : index @@ -17,3 +17,27 @@ func.func @bad_address_space(%a: memref<2xindex, "foo">) { memref.store %c0, %a[%c0] : memref<2xindex, "foo"> return } + +// ----- + +// CHECK-LABEL: @invalid_int_conversion +func.func @invalid_int_conversion() { + // expected-error@+1 {{conversion of memref memory space 1 : ui64 to integer address space failed. Consider adding memory space conversions.}} + %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xf32, 1 : ui64> + return +} + +// ----- + +// expected-error@unknown{{conversion of memref memory space #gpu.address_space to integer address space failed. 
Consider adding memory space conversions}} +// CHECK-LABEL: @issue_70160 +func.func @issue_70160() { + // expected-error@+1{{conversion of memref memory space #gpu.address_space to integer address space failed. Consider adding memory space conversions}} + %alloc = memref.alloc() : memref<1x32x33xi32, #gpu.address_space> + %alloc1 = memref.alloc() : memref + %c0 = arith.constant 0 : index + // CHECK: memref.load + %0 = memref.load %alloc[%c0, %c0, %c0] : memref<1x32x33xi32, #gpu.address_space> + memref.store %0, %alloc1[] : memref + func.return +} diff --git a/mlir/test/Conversion/MemRefToLLVM/issue-70160.mlir b/mlir/test/Conversion/MemRefToLLVM/issue-70160.mlir deleted file mode 100644 index 6970e5f413984..0000000000000 --- a/mlir/test/Conversion/MemRefToLLVM/issue-70160.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: mlir-opt %s -finalize-memref-to-llvm 2>&1 | FileCheck %s -// Since the error is at an unknown location, we use FileCheck instead of -// -verify-diagnostics here - -// CHECK: conversion of memref memory space #gpu.address_space to integer address space failed. Consider adding memory space conversions -// CHECK-LABEL: @issue_70160 -func.func @issue_70160() { - %alloc = memref.alloc() : memref<1x32x33xi32, #gpu.address_space> - %alloc1 = memref.alloc() : memref - %c0 = arith.constant 0 : index - // CHECK: memref.load - %0 = memref.load %alloc[%c0, %c0, %c0] : memref<1x32x33xi32, #gpu.address_space> - memref.store %0, %alloc1[] : memref - func.return -} From 5748ddbab4883420ea23d2319006d814c4bfbda4 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Mon, 7 Apr 2025 18:03:26 +0800 Subject: [PATCH 0821/1029] [SLP] NFC. Add a comment to introduce the alternate instruction. (#134572) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 94c0289807245..e2031df810573 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -817,7 +817,28 @@ namespace { /// Main data required for vectorization of instructions. class InstructionsState { - /// The main/alternate instruction. MainOp is also VL0. + /// MainOp and AltOp are primarily determined by getSameOpcode. Currently, + /// only BinaryOperator, CastInst, and CmpInst support alternate instructions + /// (i.e., AltOp is not equal to MainOp; this can be checked using + /// isAltShuffle). + /// A rare exception is TrySplitNode, where the InstructionsState is derived + /// from getMainAltOpsNoStateVL. + /// For those InstructionsState that use alternate instructions, the resulting + /// vectorized output ultimately comes from a shufflevector. For example, + /// given a vector list (VL): + /// VL[0] = add i32 a, e + /// VL[1] = sub i32 b, f + /// VL[2] = add i32 c, g + /// VL[3] = sub i32 d, h + /// The vectorized result would be: + /// intermediated_0 = add <4 x i32> , + /// intermediated_1 = sub <4 x i32> , + /// result = shufflevector <4 x i32> intermediated_0, + /// <4 x i32> intermediated_1, + /// <4 x i32> + /// Since shufflevector is used in the final result, when calculating the cost + /// (getEntryCost), we must account for the usage of shufflevector in + /// GetVectorCost. 
Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; From 3654621e8796508cc00b6f08241ab951af416fa1 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik Date: Mon, 7 Apr 2025 03:08:38 -0700 Subject: [PATCH 0822/1029] [X86][NFC] Use `Triple &` to avoid copy (#134532) --- llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index ad94e306f339e..3b5869c19f2d3 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -631,7 +631,7 @@ std::vector> X86MCInstrAnalysis::findPltEntries(uint64_t PltSectionVA, ArrayRef PltContents, const MCSubtargetInfo &STI) const { - const auto TargetTriple = STI.getTargetTriple(); + const Triple &TargetTriple = STI.getTargetTriple(); switch (TargetTriple.getArch()) { case Triple::x86: return findX86PltEntries(PltSectionVA, PltContents); From 0fc7aec349394d4713bd88fb5f0319e39b96f187 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Mon, 7 Apr 2025 13:13:11 +0300 Subject: [PATCH 0823/1029] [BOLT] Gadget scanner: detect address materialization and arithmetic (#132540) In addition to authenticated pointers, consider the contents of a register safe if it was * written by PC-relative address computation * updated by an arithmetic instruction whose input address is safe --- bolt/include/bolt/Core/MCPlusBuilder.h | 35 +++ bolt/lib/Passes/PAuthGadgetScanner.cpp | 87 +++++-- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 37 +++ .../AArch64/gs-pacret-autiasp.s | 15 -- .../gs-pauth-address-materialization.s | 244 ++++++++++++++++++ 5 files changed, 380 insertions(+), 38 deletions(-) create mode 100644 bolt/test/binary-analysis/AArch64/gs-pauth-address-materialization.s diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index fa942accbea4e..b5ad219cfc796 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -587,6 +587,41 @@ class MCPlusBuilder { return getNoRegister(); } + /// Returns the register containing an address safely materialized by `Inst` + /// under the Pointer Authentication threat model. + /// + /// Returns the register `Inst` writes to if: + /// 1. the register is a materialized address, and + /// 2. the register has been materialized safely, i.e. cannot be attacker- + /// controlled, under the Pointer Authentication threat model. + /// + /// If the instruction does not write to any register satisfying the above + /// two conditions, NoRegister is returned. + /// + /// The Pointer Authentication threat model assumes an attacker is able to + /// modify any writable memory, but not executable code (due to W^X). + virtual MCPhysReg + getMaterializedAddressRegForPtrAuth(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return getNoRegister(); + } + + /// Analyzes if this instruction can safely perform address arithmetics + /// under Pointer Authentication threat model. + /// + /// If an (OutReg, InReg) pair is returned, then after Inst is executed, + /// OutReg is as trusted as InReg is. + /// + /// The arithmetic instruction is considered safe if OutReg is not attacker- + /// controlled, provided InReg and executable code are not. Please note that + /// registers other than InReg as well as the contents of memory which is + /// writable by the process should be considered attacker-controlled. 
+ virtual std::optional> + analyzeAddressArithmeticsForPtrAuth(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return std::make_pair(getNoRegister(), getNoRegister()); + } + virtual bool isTerminator(const MCInst &Inst) const; virtual bool isNoop(const MCInst &Inst) const { diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index a3b320c545734..00846247fdc21 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -335,6 +335,49 @@ class PacRetAnalysis }); } + BitVector getClobberedRegs(const MCInst &Point) const { + BitVector Clobbered(NumRegs, false); + // Assume a call can clobber all registers, including callee-saved + // registers. There's a good chance that callee-saved registers will be + // saved on the stack at some point during execution of the callee. + // Therefore they should also be considered as potentially modified by an + // attacker/written to. + // Also, not all functions may respect the AAPCS ABI rules about + // caller/callee-saved registers. + if (BC.MIB->isCall(Point)) + Clobbered.set(); + else + BC.MIB->getClobberedRegs(Point, Clobbered); + return Clobbered; + } + + // Returns all registers that can be treated as if they are written by an + // authentication instruction. + SmallVector getRegsMadeSafeToDeref(const MCInst &Point, + const State &Cur) const { + SmallVector Regs; + const MCPhysReg NoReg = BC.MIB->getNoRegister(); + + // A signed pointer can be authenticated, or + ErrorOr AutReg = BC.MIB->getAuthenticatedReg(Point); + if (AutReg && *AutReg != NoReg) + Regs.push_back(*AutReg); + + // ... a safe address can be materialized, or + MCPhysReg NewAddrReg = BC.MIB->getMaterializedAddressRegForPtrAuth(Point); + if (NewAddrReg != NoReg) + Regs.push_back(NewAddrReg); + + // ... an address can be updated in a safe manner, producing the result + // which is as trusted as the input address. + if (auto DstAndSrc = BC.MIB->analyzeAddressArithmeticsForPtrAuth(Point)) { + if (Cur.SafeToDerefRegs[DstAndSrc->second]) + Regs.push_back(DstAndSrc->first); + } + + return Regs; + } + State computeNext(const MCInst &Point, const State &Cur) { PacStatePrinter P(BC); LLVM_DEBUG({ @@ -355,19 +398,16 @@ class PacRetAnalysis return State(); } + // First, compute various properties of the instruction, taking the state + // before its execution into account, if necessary. + + BitVector Clobbered = getClobberedRegs(Point); + SmallVector NewSafeToDerefRegs = + getRegsMadeSafeToDeref(Point, Cur); + + // Then, compute the state after this instruction is executed. State Next = Cur; - BitVector Clobbered(NumRegs, false); - // Assume a call can clobber all registers, including callee-saved - // registers. There's a good chance that callee-saved registers will be - // saved on the stack at some point during execution of the callee. - // Therefore they should also be considered as potentially modified by an - // attacker/written to. - // Also, not all functions may respect the AAPCS ABI rules about - // caller/callee-saved registers. 
- if (BC.MIB->isCall(Point)) - Clobbered.set(); - else - BC.MIB->getClobberedRegs(Point, Clobbered); + Next.SafeToDerefRegs.reset(Clobbered); // Keep track of this instruction if it writes to any of the registers we // need to track that for: @@ -375,17 +415,18 @@ class PacRetAnalysis if (Clobbered[Reg]) lastWritingInsts(Next, Reg) = {&Point}; - ErrorOr AutReg = BC.MIB->getAuthenticatedReg(Point); - if (AutReg && *AutReg != BC.MIB->getNoRegister()) { - // The sub-registers of *AutReg are also trusted now, but not its - // super-registers (as they retain untrusted register units). - BitVector AuthenticatedSubregs = - BC.MIB->getAliases(*AutReg, /*OnlySmaller=*/true); - for (MCPhysReg Reg : AuthenticatedSubregs.set_bits()) { - Next.SafeToDerefRegs.set(Reg); - if (RegsToTrackInstsFor.isTracked(Reg)) - lastWritingInsts(Next, Reg).clear(); - } + // After accounting for clobbered registers in general, override the state + // according to authentication and other *special cases* of clobbering. + + // The sub-registers are also safe-to-dereference now, but not their + // super-registers (as they retain untrusted register units). + BitVector NewSafeSubregs(NumRegs); + for (MCPhysReg SafeReg : NewSafeToDerefRegs) + NewSafeSubregs |= BC.MIB->getAliases(SafeReg, /*OnlySmaller=*/true); + for (MCPhysReg Reg : NewSafeSubregs.set_bits()) { + Next.SafeToDerefRegs.set(Reg); + if (RegsToTrackInstsFor.isTracked(Reg)) + lastWritingInsts(Next, Reg).clear(); } LLVM_DEBUG({ diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index b50a37abeda48..0d1908f91e514 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -304,6 +304,43 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } } + MCPhysReg + getMaterializedAddressRegForPtrAuth(const MCInst &Inst) const override { + switch (Inst.getOpcode()) { + case AArch64::ADR: + case AArch64::ADRP: + // These instructions produce an address value based on the information + // encoded into the instruction itself (which should reside in a read-only + // code memory) and the value of PC register (that is, the location of + // this instruction), so the produced value is not attacker-controlled. + return Inst.getOperand(0).getReg(); + default: + return getNoRegister(); + } + } + + std::optional> + analyzeAddressArithmeticsForPtrAuth(const MCInst &Inst) const override { + switch (Inst.getOpcode()) { + default: + return std::nullopt; + case AArch64::ADDXri: + case AArch64::SUBXri: + // The immediate addend is encoded into the instruction itself, so it is + // not attacker-controlled under Pointer Authentication threat model. + return std::make_pair(Inst.getOperand(0).getReg(), + Inst.getOperand(1).getReg()); + case AArch64::ORRXrs: + // "mov Xd, Xm" is equivalent to "orr Xd, XZR, Xm, lsl #0" + if (Inst.getOperand(1).getReg() != AArch64::XZR || + Inst.getOperand(3).getImm() != 0) + return std::nullopt; + + return std::make_pair(Inst.getOperand(0).getReg(), + Inst.getOperand(2).getReg()); + } + } + bool isADRP(const MCInst &Inst) const override { return Inst.getOpcode() == AArch64::ADRP; } diff --git a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s index 01b7cec3272e6..d506ec13f4895 100644 --- a/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s +++ b/bolt/test/binary-analysis/AArch64/gs-pacret-autiasp.s @@ -141,24 +141,9 @@ f_nonx30_ret_ok: stp x29, x30, [sp, #-16]! 
mov x29, sp bl g - add x0, x0, #3 ldp x29, x30, [sp], #16 - // FIXME: Should the scanner understand that an authenticated register (below x30, - // after the autiasp instruction), is OK to be moved to another register - // and then that register being used to return? - // This respects that pac-ret hardening intent, but the scanner currently - // will produce a false positive for this. - // Is it worthwhile to make the scanner more complex for this case? - // So far, scanning many millions of instructions across a linux distro, - // I haven't encountered such an example. - // The ".if 0" block below tests this case and currently fails. -.if 0 autiasp mov x16, x30 -.else - mov x16, x30 - autia x16, sp -.endif // CHECK-NOT: function f_nonx30_ret_ok ret x16 .size f_nonx30_ret_ok, .-f_nonx30_ret_ok diff --git a/bolt/test/binary-analysis/AArch64/gs-pauth-address-materialization.s b/bolt/test/binary-analysis/AArch64/gs-pauth-address-materialization.s new file mode 100644 index 0000000000000..b4dd53a5e3c8d --- /dev/null +++ b/bolt/test/binary-analysis/AArch64/gs-pauth-address-materialization.s @@ -0,0 +1,244 @@ +// -Wl,--no-relax prevents converting ADRP+ADD pairs into NOP+ADR. +// RUN: %clang %cflags -march=armv8.3-a -Wl,--no-relax %s -o %t.exe +// RUN: llvm-bolt-binary-analysis --scanners=pauth %t.exe 2>&1 | FileCheck %s + +// Test various patterns that should or should not be considered safe +// materialization of PC-relative addresses. +// +// Note that while "instructions that write to the affected registers" +// section of the report is still technically correct, it does not necessarily +// mention the instructions that are used incorrectly. +// +// FIXME: Switch to PAC* instructions instead of indirect tail call for testing +// if a register is considered safe when detection of signing oracles is +// implemented, as it is more traditional usage of PC-relative constants. +// Moreover, using PAC instructions would improve test robustness, as +// handling of *calls* can be influenced by what BOLT classifies as a +// tail call, for example. + + .text + +// Define a function that is reachable by ADR instruction. + .type sym,@function +sym: + ret + .size sym, .-sym + + .globl good_adr + .type good_adr,@function +good_adr: +// CHECK-NOT: good_adr + adr x0, sym + br x0 + .size good_adr, .-good_adr + + .globl good_adrp + .type good_adrp,@function +good_adrp: +// CHECK-NOT: good_adrp + adrp x0, sym + br x0 + .size good_adrp, .-good_adrp + + .globl good_adrp_add + .type good_adrp_add,@function +good_adrp_add: +// CHECK-NOT: good_adrp_add + adrp x0, sym + add x0, x0, :lo12:sym + br x0 + .size good_adrp_add, .-good_adrp_add + + .globl good_adrp_add_with_const_offset + .type good_adrp_add_with_const_offset,@function +good_adrp_add_with_const_offset: +// CHECK-NOT: good_adrp_add_with_const_offset + adrp x0, sym + add x0, x0, :lo12:sym + add x0, x0, #8 + br x0 + .size good_adrp_add_with_const_offset, .-good_adrp_add_with_const_offset + + .globl bad_adrp_with_nonconst_offset + .type bad_adrp_with_nonconst_offset,@function +bad_adrp_with_nonconst_offset: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_adrp_with_nonconst_offset, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # TAILCALL +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. 
{{[0-9a-f]+}}: add x0, x0, x1 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: adrp x0, #{{.*}} +// CHECK-NEXT: {{[0-9a-f]+}}: add x0, x0, x1 +// CHECK-NEXT: {{[0-9a-f]+}}: br x0 # TAILCALL + adrp x0, sym + add x0, x0, x1 + br x0 + .size bad_adrp_with_nonconst_offset, .-bad_adrp_with_nonconst_offset + + .globl bad_split_adrp + .type bad_split_adrp,@function +bad_split_adrp: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_split_adrp, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # UNKNOWN CONTROL FLOW +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: add x0, x0, #0x{{[0-9a-f]+}} +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: add x0, x0, #0x{{[0-9a-f]+}} +// CHECK-NEXT: {{[0-9a-f]+}}: br x0 # UNKNOWN CONTROL FLOW + cbz x2, 1f + adrp x0, sym +1: + add x0, x0, :lo12:sym + br x0 + .size bad_split_adrp, .-bad_split_adrp + +// Materialization of absolute addresses is not handled, as it is not expected +// to be used by real-world code, but can be supported if needed. + + .globl bad_immediate_constant + .type bad_immediate_constant,@function +bad_immediate_constant: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_immediate_constant, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # TAILCALL +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: mov x0, #{{.*}} +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: mov x0, #{{.*}} +// CHECK-NEXT: {{[0-9a-f]+}}: br x0 # TAILCALL + movz x0, #1234 + br x0 + .size bad_immediate_constant, .-bad_immediate_constant + +// Any ADR or ADRP instruction followed by any number of increments/decrements +// by constant is considered safe. + + .globl good_adr_with_add + .type good_adr_with_add,@function +good_adr_with_add: +// CHECK-NOT: good_adr_with_add + adr x0, sym + add x0, x0, :lo12:sym + br x0 + .size good_adr_with_add, .-good_adr_with_add + + .globl good_adrp_with_add_non_consecutive + .type good_adrp_with_add_non_consecutive,@function +good_adrp_with_add_non_consecutive: +// CHECK-NOT: good_adrp_with_add_non_consecutive + adrp x0, sym + mul x1, x2, x3 + add x0, x0, :lo12:sym + br x0 + .size good_adrp_with_add_non_consecutive, .-good_adrp_with_add_non_consecutive + + .globl good_many_offsets + .type good_many_offsets,@function +good_many_offsets: +// CHECK-NOT: good_many_offsets + adrp x0, sym + add x1, x0, #8 + add x2, x1, :lo12:sym + br x2 + .size good_many_offsets, .-good_many_offsets + + .globl good_negative_offset + .type good_negative_offset,@function +good_negative_offset: +// CHECK-NOT: good_negative_offset + adr x0, sym + sub x1, x0, #8 + br x1 + .size good_negative_offset, .-good_negative_offset + +// MOV Xd, Xm (which is an alias of ORR Xd, XZR, Xm) is handled as part of +// support for address arithmetics, but ORR in general is not. +// This restriction may be relaxed in the future. 
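+// Concretely, only "orr Xd, xzr, Xm, lsl #0" (the encoding of the
+// "mov Xd, Xm" alias) propagates safety; any other first source register or a
+// non-zero shift amount is rejected, as the bad_orr_not_xzr and
+// bad_orr_not_lsl0 tests below check.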
+ + .globl good_mov_reg + .type good_mov_reg,@function +good_mov_reg: +// CHECK-NOT: good_mov_reg + adrp x0, sym + mov x1, x0 + orr x2, xzr, x1 // the same as "mov x2, x1" + br x2 + .size good_mov_reg, .-good_mov_reg + + .globl bad_orr_not_xzr + .type bad_orr_not_xzr,@function +bad_orr_not_xzr: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_orr_not_xzr, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x2 # TAILCALL +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: orr x2, x1, x0 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: adrp x0, #{{(0x)?[0-9a-f]+}} +// CHECK-NEXT: {{[0-9a-f]+}}: mov x1, #0 +// CHECK-NEXT: {{[0-9a-f]+}}: orr x2, x1, x0 +// CHECK-NEXT: {{[0-9a-f]+}}: br x2 # TAILCALL + adrp x0, sym + // The generic case of "orr Xd, Xn, Xm" is not allowed so far, + // even if Xn is known to be safe + movz x1, #0 + orr x2, x1, x0 + br x2 + .size bad_orr_not_xzr, .-bad_orr_not_xzr + + .globl bad_orr_not_lsl0 + .type bad_orr_not_lsl0,@function +bad_orr_not_lsl0: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_orr_not_lsl0, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x2 # TAILCALL +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: orr x2, xzr, x0, lsl #1 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: adrp x0, #{{(0x)?[0-9a-f]+}} +// CHECK-NEXT: {{[0-9a-f]+}}: orr x2, xzr, x0, lsl #1 +// CHECK-NEXT: {{[0-9a-f]+}}: br x2 # TAILCALL + adrp x0, sym + // Currently, the only allowed form of "orr" is that used by "mov Xd, Xn" alias. + // This can be relaxed in the future. + orr x2, xzr, x0, lsl #1 + br x2 + .size bad_orr_not_lsl0, .-bad_orr_not_lsl0 + +// Check that the input register operands of `add`/`mov` is correct. + + .globl bad_add_input_reg + .type bad_add_input_reg,@function +bad_add_input_reg: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_add_input_reg, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # TAILCALL +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. {{[0-9a-f]+}}: add x0, x1, #0x{{[0-9a-f]+}} +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: adrp x0, #{{(0x)?[0-9a-f]+}} +// CHECK-NEXT: {{[0-9a-f]+}}: add x0, x1, #0x{{[0-9a-f]+}} +// CHECK-NEXT: {{[0-9a-f]+}}: br x0 # TAILCALL + adrp x0, sym + add x0, x1, :lo12:sym + br x0 + .size bad_add_input_reg, .-bad_add_input_reg + + .globl bad_mov_input_reg + .type bad_mov_input_reg,@function +bad_mov_input_reg: +// CHECK-LABEL: GS-PAUTH: non-protected call found in function bad_mov_input_reg, basic block {{[^,]+}}, at address +// CHECK-NEXT: The instruction is {{[0-9a-f]+}}: br x0 # TAILCALL +// CHECK-NEXT: The 1 instructions that write to the affected registers after any authentication are: +// CHECK-NEXT: 1. 
{{[0-9a-f]+}}: mov x0, x1 +// CHECK-NEXT: This happens in the following basic block: +// CHECK-NEXT: {{[0-9a-f]+}}: adrp x0, #{{(0x)?[0-9a-f]+}} +// CHECK-NEXT: {{[0-9a-f]+}}: mov x0, x1 +// CHECK-NEXT: {{[0-9a-f]+}}: br x0 # TAILCALL + adrp x0, sym + mov x0, x1 + br x0 + .size bad_mov_input_reg, .-bad_mov_input_reg + + .globl main + .type main,@function +main: + mov x0, 0 + ret + .size main, .-main From c9157d4692bb47b1fa6f053fd780ff47415590d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Mon, 7 Apr 2025 11:36:55 +0100 Subject: [PATCH 0824/1029] [llvm][docs] Reorder sections in GitHub.rst (#134212) Reorder sections in GitHub.rst so that "Branches" and "Stacked Pull Requests" appear after the more general section on pull requests. This improves the conceptual flow for readers new to the process: New order: * Introduction * Before your first PR * Pull Requests * Approvals * Landing your change * Branches * Stacked Pull Requests * ... Previous order: * Introduction * Before your first PR * Branches * Stacked Pull Requests * Pull Requests * Approvals * Landing your change * ... This change only reorders existing text - no content edits. --- llvm/docs/GitHub.rst | 201 ++++++++++++++++++++++--------------------- 1 file changed, 102 insertions(+), 99 deletions(-) diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst index a235fa51a6473..7b614b6ca5d45 100644 --- a/llvm/docs/GitHub.rst +++ b/llvm/docs/GitHub.rst @@ -4,6 +4,9 @@ LLVM GitHub User Guide ====================== +.. contents:: + :local: + Introduction ============ The LLVM Project uses `GitHub `_ for @@ -21,105 +24,6 @@ Before your first PR Please ensure that you have set a valid email address in your GitHub account, see :ref:`github-email-address`. -.. _github_branches: - -Branches -======== - -It is possible to create branches that starts with `users//`, however this is -intended to be able to support "stacked" pull-request. Do not create any branches in the -llvm/llvm-project repository otherwise, please use a fork (see below). User branches that -aren't associated with a pull-request **will be deleted**. - -Stacked Pull Requests -===================== - -To separate related changes or to break down a larger PR into smaller, reviewable -pieces, use "stacked pull requests" — this helps make the review process -smoother. - -.. note:: - The LLVM Project monorepo on GitHub is configured to always use "Squash and - Merge" as the pull request merge option. As a result, each PR results in - exactly one commit being merged into the project. - - This means that stacked pull requests are the only available option for - landing a series of related changes. In contrast, submitting a PR with - multiple commits and merging them as-is (without squashing) is not supported - in LLVM. - -While GitHub does not natively support stacked pull requests, there are several -common alternatives. - -To illustrate, assume that you are working on two branches in your fork of the -``llvm/llvm-project`` repository, and you want to eventually merge both into -``main``: - -- `feature_1`, which contains commit `feature_commit_1` -- `feature_2`, which contains commit `feature_commit_2` and depends on - `feature_1` (so it also includes `feature_commit_1`) - -Your options are as follows: - -#. Two PRs with a dependency note - - Create PR_1 for `feature_1` and PR_2 for `feature_2`. In PR_2, include a - note in the PR summary indicating that it depends on PR_1 (e.g., - “Depends on #PR_1”). 
- - To make review easier, make it clear which commits are part of the base PR - and which are new, e.g. "The first N commits are from the base PR". This - helps reviewers focus only on the incremental changes. - -#. Use user branches in ``llvm/llvm-project`` - - Create user branches in the main repository, as described - :ref:`above`. Then: - - - Open a pull request from `users//feature_1` → `main` - - Open another from `users//feature_2` → `users//feature_1` - - This approach allows GitHub to display clean, incremental diffs for each PR - in the stack, making it much easier for reviewers to see what has changed at - each step. Once `feature_1` is merged, you can rebase and re-target - `feature_2` to `main`. - -#. Use a stacked PR tool - - Use tools like SPR or Graphite (described below) to automate managing - stacked PRs. These tools are also based on using user branches - in ``llvm/llvm-project``. - -.. note:: - When not using user branches, GitHub will not display proper diffs for - subsequent PRs in a stack. Instead, it will show a combined diff that - includes all commits from earlier PRs. - - As described in the first option above, in such cases it is the PR author’s - responsibility to clearly indicate which commits are relevant to the - current PR. For example: “The first N commits are from the base PR.” - - You can avoid this issue by using user branches directly in the - ``llvm/llvm-project`` repository. - - -Using Graphite for stacked Pull Requests ----------------------------------------- - -`Graphite `_ is a stacked pull request tool supported -by the LLVM repo (the other being `reviewable.io `_). - -Graphite will want to create branches under ``llvm/llvm-project`` rather than your -private fork, so the guidance above, about branch naming, is critical, otherwise -``gt submit`` (i.e. publish your PRs for review) will fail. - -Use ``gt config`` then ``Branch naming settings`` and ``Set a prefix for branch names``. -Include the last ``/``. - -If you didn't do the above and Graphite created non-prefixed branches, a simple way to -unblock is to rename (``git -m ``), and then checkout the branch -and ``gt track``. - Pull Requests ============= The LLVM project is using GitHub Pull Requests for Code Reviews. This document @@ -310,6 +214,105 @@ commonly used first: request will understand that you're rebasing just your patches, and display this result correctly with a note that a force push did occur. +.. _github_branches: + +Branches +======== + +It is possible to create branches that start with `users//`, however this is +intended to be able to support "stacked" pull-request. Do not create any branches in the +llvm/llvm-project repository otherwise, please use a fork (see above). User branches that +aren't associated with a pull-request **will be deleted**. + +Stacked Pull Requests +===================== + +To separate related changes or to break down a larger PR into smaller, reviewable +pieces, use "stacked pull requests" — this helps make the review process +smoother. + +.. note:: + The LLVM Project monorepo on GitHub is configured to always use "Squash and + Merge" as the pull request merge option. As a result, each PR results in + exactly one commit being merged into the project. + + This means that stacked pull requests are the only available option for + landing a series of related changes. In contrast, submitting a PR with + multiple commits and merging them as-is (without squashing) is not supported + in LLVM. 
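+
+For instance, with the user-branch approach described below, the first branch
+of a stack can be published with a plain push refspec (illustrative only;
+substitute your own username and branch names)::
+
+   git push origin feature_1:users/<username>/feature_1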
+ +While GitHub does not natively support stacked pull requests, there are several +common alternatives. + +To illustrate, assume that you are working on two branches in your fork of the +``llvm/llvm-project`` repository, and you want to eventually merge both into +``main``: + +- `feature_1`, which contains commit `feature_commit_1` +- `feature_2`, which contains commit `feature_commit_2` and depends on + `feature_1` (so it also includes `feature_commit_1`) + +Your options are as follows: + +#. Two PRs with a dependency note + + Create PR_1 for `feature_1` and PR_2 for `feature_2`. In PR_2, include a + note in the PR summary indicating that it depends on PR_1 (e.g., + “Depends on #PR_1”). + + To make review easier, make it clear which commits are part of the base PR + and which are new, e.g. "The first N commits are from the base PR". This + helps reviewers focus only on the incremental changes. + +#. Use user branches in ``llvm/llvm-project`` + + Create user branches in the main repository, as described + :ref:`above`. Then: + + - Open a pull request from `users//feature_1` → `main` + - Open another from `users//feature_2` → `users//feature_1` + + This approach allows GitHub to display clean, incremental diffs for each PR + in the stack, making it much easier for reviewers to see what has changed at + each step. Once `feature_1` is merged, you can rebase and re-target + `feature_2` to `main`. + +#. Use a stacked PR tool + + Use tools like SPR or Graphite (described below) to automate managing + stacked PRs. These tools are also based on using user branches + in ``llvm/llvm-project``. + +.. note:: + When not using user branches, GitHub will not display proper diffs for + subsequent PRs in a stack. Instead, it will show a combined diff that + includes all commits from earlier PRs. + + As described in the first option above, in such cases it is the PR author’s + responsibility to clearly indicate which commits are relevant to the + current PR. For example: “The first N commits are from the base PR.” + + You can avoid this issue by using user branches directly in the + ``llvm/llvm-project`` repository. + + +Using Graphite for stacked Pull Requests +---------------------------------------- + +`Graphite `_ is a stacked pull request tool supported +by the LLVM repo (the other being `reviewable.io `_). + +Graphite will want to create branches under ``llvm/llvm-project`` rather than your +private fork, so the guidance above, about branch naming, is critical, otherwise +``gt submit`` (i.e. publish your PRs for review) will fail. + +Use ``gt config`` then ``Branch naming settings`` and ``Set a prefix for branch names``. +Include the last ``/``. + +If you didn't do the above and Graphite created non-prefixed branches, a simple way to +unblock is to rename (``git -m ``), and then checkout the branch +and ``gt track``. + Pre-merge Continuous Integration (CI) ------------------------------------- From 7b007c092d665bcb3f00ff937e04b20e6ec32c55 Mon Sep 17 00:00:00 2001 From: Jack Frankland Date: Mon, 7 Apr 2025 11:37:26 +0100 Subject: [PATCH 0825/1029] [mlir][tosa-to-linalg] Add acc_type lowering Support (#134267) Add support for lowering of convolution operations where the `acc_type` attribute differs from the result type of the operation. The only case of this in for convolutions in the TOSA-v1.0 specification is an fp16 convolution which internally uses an fp32 accumulator; all other operations have accumulator types that match their output/result types. 
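As a sketch of the resulting IR (shapes taken from the new lit tests below),
an fp16 conv2d with `acc_type = f32` now accumulates in f32 and is narrowed
back to f16 afterwards:

  %conv = linalg.conv_2d_nhwc_fhwc {...} ins(...)
            outs(... : tensor<1x45x40x28xf32>) -> tensor<1x45x40x28xf32>
  %res = tosa.cast %conv : (tensor<1x45x40x28xf32>) -> tensor<1x45x40x28xf16>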
Add lit tests for the fp16 convolution with fp32 accumulator operators described above. Signed-off-by: Jack Frankland --- .../TosaToLinalg/TosaToLinalgNamed.cpp | 50 ++++++++++++++----- .../TosaToLinalg/tosa-to-linalg-named.mlir | 40 +++++++++++++++ 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp index fc1cad2423450..86f5e9baf4a94 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -119,10 +119,11 @@ static AffineMap getBroadcastingMap(PatternRewriter &rewriter, Value source, } // Broadcast the source value to all the outer dimensions of the result value. -// If required, the element type is expanded using an arith.extsi operation. -static mlir::Value linalgBroadcastAndMaybeExtSI(PatternRewriter &rewriter, - Location loc, Value source, - Value result) { +// If required, the element type is expanded using an arith.extsi or arith.extf +// operation as appropriate. +static mlir::Value linalgBroadcastAndMaybeExt(PatternRewriter &rewriter, + Location loc, Value source, + Value result) { ShapedType resultTy = cast(result.getType()); const int64_t resultRank = resultTy.getRank(); // Creating maps for the input and output of the broacast-like generic op. @@ -135,11 +136,16 @@ static mlir::Value linalgBroadcastAndMaybeExtSI(PatternRewriter &rewriter, .create( loc, resultTy, ValueRange({source}), result, indexingMaps, getNParallelLoopsAttrs(resultTy.getRank()), - [](OpBuilder &builder, Location loc, ValueRange args) { + [&resultTy](OpBuilder &builder, Location loc, ValueRange args) { Value biasVal = args[0]; Type resType = args[1].getType(); if (resType != biasVal.getType()) { - biasVal = builder.create(loc, resType, biasVal); + biasVal = + resultTy.getElementType().isFloat() + ? builder.create(loc, resType, biasVal) + .getResult() + : builder.create(loc, resType, biasVal) + .getResult(); } builder.create(loc, biasVal); }) @@ -253,12 +259,14 @@ class ConvConverter : public OpConversionPattern { ShapedType resultTy = cast(op->getResult(0).getType()); Type inputETy = inputTy.getElementType(); - Type resultETy = resultTy.getElementType(); DenseI64ArrayAttr padAttr = op.getPadAttr(); DenseI64ArrayAttr strideTosaAttr = op.getStrideAttr(); DenseI64ArrayAttr dilationTosaAttr = op.getDilationAttr(); + Type accETy = op.getAccType(); + Type accTy = RankedTensorType::get(resultTy.getShape(), accETy); + // Get and verify zero points. FailureOr maybeIZp = op.getInputZeroPoint(); if (failed(maybeIZp)) @@ -385,10 +393,10 @@ class ConvConverter : public OpConversionPattern { auto dilationAttr = rewriter.getI64TensorAttr(dilation); Value biasEmptyTensor = rewriter.create( - loc, resultTy.getShape(), resultETy, filteredDims); + loc, resultTy.getShape(), accETy, filteredDims); Value broadcastBias = - linalgBroadcastAndMaybeExtSI(rewriter, loc, bias, biasEmptyTensor); + linalgBroadcastAndMaybeExt(rewriter, loc, bias, biasEmptyTensor); if (hasZp) { auto iZp = rewriter.getI32IntegerAttr(inputZpVal); @@ -410,10 +418,15 @@ class ConvConverter : public OpConversionPattern { Value conv = rewriter .create( - loc, resultTy, ValueRange{input, weight}, + loc, accTy, ValueRange{input, weight}, ValueRange{broadcastBias}, strideAttr, dilationAttr) ->getResult(0); + // We may need to truncate back to the result type if the accumulator was + // wider than the result. 
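+    // A tosa.cast performs the narrowing; for floats it is expected to lower
+    // to an elementwise truncation (e.g. arith.truncf) later in the pipeline.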
+ if (resultTy != accTy) + conv = rewriter.create(loc, resultTy, conv); + rewriter.replaceOp(op, conv); return success(); } @@ -444,6 +457,8 @@ class DepthwiseConvConverter auto strideTosaAttr = cast(op->getAttr("stride")); auto dilationTosaAttr = cast(op->getAttr("dilation")); + Type accETy = op.getAccType(); + if (!weightTy.hasStaticShape() || !biasTy.hasStaticShape()) return rewriter.notifyMatchFailure( op, "tosa.depthwise_conv ops require static shapes"); @@ -516,11 +531,11 @@ class DepthwiseConvConverter ShapedType linalgConvTy = RankedTensorType::get({resultShape[0], resultShape[1], resultShape[2], weightShape[2], weightShape[3]}, - resultETy); + accETy); - auto resultZeroAttr = rewriter.getZeroAttr(resultETy); + auto resultZeroAttr = rewriter.getZeroAttr(accETy); Value emptyTensor = rewriter.create( - loc, linalgConvTy.getShape(), resultETy, filteredDims); + loc, linalgConvTy.getShape(), accETy, filteredDims); Value zero = rewriter.create(loc, resultZeroAttr); Value zeroTensor = rewriter .create(loc, ValueRange{zero}, @@ -543,6 +558,15 @@ class DepthwiseConvConverter ValueRange{zeroTensor}, strideAttr, dilationAttr) .getResult(0); + // We may need to truncate back to the result type if the accumulator was + // wider than the result. + if (accETy != resultETy) + conv = rewriter.create( + loc, + RankedTensorType::get(cast(conv.getType()).getShape(), + resultETy), + conv); + SmallVector reassociationMap; createDepthwiseConvCollapseMap(resultRank, reassociationMap, rewriter); Value convReshape = rewriter.create( diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir index 19c12ba3edbd4..242772fe5cdcf 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -658,6 +658,20 @@ func.func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1x // ----- +// CHECK-LABEL: @conv2d_f16_f32_acc +func.func @conv2d_f16_f32_acc(%input: tensor<1x49x42x27xf16>, %weights: tensor<28x3x3x27xf16>, %bias: tensor<28xf16>) -> () { + %input_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> + %weight_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> + // CHECK: linalg.generic {{{.*}}} ins(%{{.*}} : tensor<28xf16>) outs(%{{.*}} : tensor<1x45x40x28xf32>) + // CHECK: arith.extf %{{.*}} : f16 to f32 + // CHECK: %[[CONV:.*]] = linalg.conv_2d_nhwc_fhwc {{{.*}}} ins(%{{.*}}, %{{.*}} : tensor<1x49x42x27xf16>, tensor<28x3x3x27xf16>) outs(%{{.*}} : tensor<1x45x40x28xf32>) -> tensor<1x45x40x28xf32> + // CHECK: tosa.cast %[[CONV]] : (tensor<1x45x40x28xf32>) -> tensor<1x45x40x28xf16> + %0 = tosa.conv2d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<1x49x42x27xf16>, tensor<28x3x3x27xf16>, tensor<28xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x45x40x28xf16> + return +} + +// ----- + // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d3)> // CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> @@ -848,6 +862,18 @@ func.func @depthwise_int_conv_zero_zp(%arg0 : tensor<1x7x5x3xi8>, %arg1 : tensor // ----- +// CHECK-LABEL: @depthwise_conv2d_f16_f32_acc +func.func @depthwise_conv2d_f16_f32_acc(%arg0 : tensor<1x7x5x3xf16>, %arg1 : tensor<3x1x3x11xf16>, %arg2 : tensor<33xf16>) -> () { + %input_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> + %weight_zp = 
"tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> + // CHECK: %[[CONV:.*]] = linalg.depthwise_conv_2d_nhwc_hwcm {{{.*}}} ins(%{{.*}}, %{{.*}} : tensor<1x7x5x3xf16>, tensor<3x1x3x11xf16>) outs(%{{.*}} : tensor<1x5x5x3x11xf32>) -> tensor<1x5x5x3x11xf32> + // CHECK: tosa.cast %[[CONV]] : (tensor<1x5x5x3x11xf32>) -> tensor<1x5x5x3x11xf16> + %2 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = f32, pad = array, stride = array, dilation = array } : (tensor<1x7x5x3xf16>, tensor<3x1x3x11xf16>, tensor<33xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x5x5x33xf16> + return +} + +// ----- + // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d4)> // CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> @@ -918,6 +944,20 @@ func.func @conv3d_i8(%input: tensor<1x49x48x47x27xi8>, %weights: tensor<28x3x4x5 // ----- +// CHECK-LABEL: @conv3d_f16_f32_acc +func.func @conv3d_f16_f32_acc(%input: tensor<1x49x48x47x27xf16>, %weights: tensor<28x3x4x5x27xf16>, %bias: tensor<28xf16>) -> () { + %input_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> + %weight_zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf16>}> : () -> tensor<1xf16> + // CHECK: linalg.generic {{{.*}}} ins(%{{.*}} : tensor<28xf16>) outs(%{{.*}} : tensor<1x47x45x43x28xf32>) + // CHECK: arith.extf %{{.*}} : f16 to f32 + // CHECK: %[[CONV:.*]] = linalg.conv_3d_ndhwc_dhwcf {{{.*}}} ins(%{{.*}}, %{{.*}} : tensor<1x49x48x47x27xf16>, tensor<3x4x5x27x28xf16>) outs(%{{.*}} : tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf32> + // CHECK: tosa.cast %[[CONV]] : (tensor<1x47x45x43x28xf32>) -> tensor<1x47x45x43x28xf16> + %0 = tosa.conv3d %input, %weights, %bias, %input_zp, %weight_zp {acc_type = f32, pad = array, stride = array, dilation = array} : (tensor<1x49x48x47x27xf16>, tensor<28x3x4x5x27xf16>, tensor<28xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x47x45x43x28xf16> + return +} + +// ----- + // CHECK-LABEL: @test_transpose // CHECK-SAME: (%[[ARG0:.+]]: tensor<1x2x3xi32>) func.func @test_transpose(%arg0: tensor<1x2x3xi32>) -> () { From 2c107238d54dde577de96455e6d29c0ff1b2b953 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Mon, 7 Apr 2025 13:37:34 +0300 Subject: [PATCH 0826/1029] [BOLT] Make DataflowAnalysis::getStateBefore() const (NFC) (#133308) --- bolt/include/bolt/Passes/DataflowAnalysis.h | 11 +++++++---- bolt/lib/Passes/PAuthGadgetScanner.cpp | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bolt/include/bolt/Passes/DataflowAnalysis.h b/bolt/include/bolt/Passes/DataflowAnalysis.h index 2afaa6d3043a6..f6ca39cf6f860 100644 --- a/bolt/include/bolt/Passes/DataflowAnalysis.h +++ b/bolt/include/bolt/Passes/DataflowAnalysis.h @@ -292,14 +292,17 @@ class DataflowAnalysis { /// Relies on a ptr map to fetch the previous instruction and then retrieve /// state. WARNING: Watch out for invalidated pointers. 
Do not use this /// function if you invalidated pointers after the analysis has been completed - ErrorOr getStateBefore(const MCInst &Point) { - return getStateAt(PrevPoint[&Point]); + ErrorOr getStateBefore(const MCInst &Point) const { + auto It = PrevPoint.find(&Point); + if (It == PrevPoint.end()) + return make_error_code(std::errc::result_out_of_range); + return getStateAt(It->getSecond()); } - ErrorOr getStateBefore(ProgramPoint Point) { + ErrorOr getStateBefore(ProgramPoint Point) const { if (Point.isBB()) return getStateAt(*Point.getBB()); - return getStateAt(PrevPoint[Point.getInst()]); + return getStateBefore(*Point.getInst()); } /// Remove any state annotations left by this analysis diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 00846247fdc21..df9e87bd4e999 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -443,7 +443,7 @@ class PacRetAnalysis public: std::vector getLastClobberingInsts(const MCInst &Inst, BinaryFunction &BF, - const ArrayRef UsedDirtyRegs) { + const ArrayRef UsedDirtyRegs) const { if (RegsToTrackInstsFor.empty()) return {}; auto MaybeState = getStateBefore(Inst); From 409df9f74c04710c1b5ca9a4d33521a590ab3bf0 Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Mon, 7 Apr 2025 19:10:16 +0800 Subject: [PATCH 0827/1029] [TTI][LV] Change the prototype of preferInLoopReduction. nfc (#132698) This patch changes the preferInLoopReduction function to take a RecurKind instead of an unsigned Opcode. This makes it possible to distinguish non-arithmetic reductions such as min/max, AnyOf, and FindLastIV, and also helps unify IAnyOf with FAnyOf and IFindLastIV with FFindLastIV. Related patch #118393 #131830 --- llvm/include/llvm/Analysis/TargetTransformInfo.h | 12 +++++++----- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h | 2 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 4 ++-- .../Target/AArch64/AArch64TargetTransformInfo.cpp | 1 - llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 6 +++--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 2 +- llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h | 1 - llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++--- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 4835c66a7a3bc..2efca0d1d754f 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/IR/FMF.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" @@ -1777,8 +1778,9 @@ class TargetTransformInfo { /// vectorization, false - otherwise. bool preferAlternateOpcodeVectorization() const; - /// \returns True if the target prefers reductions in loop. - bool preferInLoopReduction(unsigned Opcode, Type *Ty) const; + /// \returns True if the target prefers reductions of \p Kind to be performed + /// in the loop. + bool preferInLoopReduction(RecurKind Kind, Type *Ty) const; /// \returns True if the target prefers reductions select kept in the loop /// when tail folding. i.e. 
@@ -2330,7 +2332,7 @@ class TargetTransformInfo::Concept { unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; virtual bool preferFixedOverScalableIfEqualCost() const = 0; - virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty) const = 0; + virtual bool preferInLoopReduction(RecurKind Kind, Type *Ty) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const = 0; virtual bool preferAlternateOpcodeVectorization() const = 0; @@ -3143,8 +3145,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool preferFixedOverScalableIfEqualCost() const override { return Impl.preferFixedOverScalableIfEqualCost(); } - bool preferInLoopReduction(unsigned Opcode, Type *Ty) const override { - return Impl.preferInLoopReduction(Opcode, Ty); + bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override { + return Impl.preferInLoopReduction(Kind, Ty); } bool preferAlternateOpcodeVectorization() const override { return Impl.preferAlternateOpcodeVectorization(); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 261d5eacc91b0..3fe0a9101fdee 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1010,7 +1010,7 @@ class TargetTransformInfoImplBase { bool preferFixedOverScalableIfEqualCost() const { return false; } - bool preferInLoopReduction(unsigned Opcode, Type *Ty) const { return false; } + bool preferInLoopReduction(RecurKind Kind, Type *Ty) const { return false; } bool preferAlternateOpcodeVectorization() const { return true; } bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const { diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e3212135e9b19..4fea4e5711f5a 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1380,9 +1380,9 @@ bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const { return TTIImpl->preferFixedOverScalableIfEqualCost(); } -bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, +bool TargetTransformInfo::preferInLoopReduction(RecurKind Kind, Type *Ty) const { - return TTIImpl->preferInLoopReduction(Opcode, Ty); + return TTIImpl->preferInLoopReduction(Kind, Ty); } bool TargetTransformInfo::preferAlternateOpcodeVectorization() const { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 77be41b78bc7f..417af74f712e7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -12,7 +12,6 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 1b134bbe5ff6a..2f9c262511ae4 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2693,13 +2693,13 @@ void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } -bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty) const { +bool 
ARMTTIImpl::preferInLoopReduction(RecurKind Kind, Type *Ty) const { if (!ST->hasMVEIntegerOps()) return false; unsigned ScalarBits = Ty->getScalarSizeInBits(); - switch (Opcode) { - case Instruction::Add: + switch (Kind) { + case RecurKind::Add: return ScalarBits <= 64; default: return false; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index ca5129c997fb0..2b144f1628038 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -224,7 +224,7 @@ class ARMTTIImpl : public BasicTTIImplBase { ArrayRef Args = {}, const Instruction *CxtI = nullptr); - bool preferInLoopReduction(unsigned Opcode, Type *Ty) const; + bool preferInLoopReduction(RecurKind Kind, Type *Ty) const; bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty) const; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 1c5524748b605..c61dd1507f168 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -18,7 +18,6 @@ #include "RISCVSubtarget.h" #include "RISCVTargetMachine.h" -#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index cf7804e19e722..5df1061691a67 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4853,7 +4853,7 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars().find(PN)->second; if (PreferInLoopReductions || useOrderedReductions(RdxDesc) || - TTI.preferInLoopReduction(RdxDesc.getOpcode(), + TTI.preferInLoopReduction(RdxDesc.getRecurrenceKind(), RdxDesc.getRecurrenceType())) continue; T = RdxDesc.getRecurrenceType(); @@ -7020,9 +7020,9 @@ void LoopVectorizationCostModel::collectInLoopReductions() { // If the target would prefer this reduction to happen "in-loop", then we // want to record it as such. - unsigned Opcode = RdxDesc.getOpcode(); + RecurKind Kind = RdxDesc.getRecurrenceKind(); if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) && - !TTI.preferInLoopReduction(Opcode, Phi->getType())) + !TTI.preferInLoopReduction(Kind, Phi->getType())) continue; // Check that we can correctly put the reductions into the loop, by From b9c876dd9a3334a26f1cc975b23a91821895aa16 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Mon, 7 Apr 2025 12:10:43 +0100 Subject: [PATCH 0828/1029] [flang][test] fix sporadically failing test (#134608) The test is checking output from MLIR debug prints. MLIR passes can be executed in parallel, for example a pass on func.func might schedule different func.func operations in different threads. This led to intermittent test failures where debug output from different threads became mixed up. Fix by disabling mlir multithreading for this test. --- flang/test/Transforms/DoConcurrent/loop_nest_test.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 index 0d21b31519728..32bed61fe69e4 100644 --- a/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 +++ b/flang/test/Transforms/DoConcurrent/loop_nest_test.f90 @@ -3,7 +3,7 @@ ! REQUIRES: asserts ! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host \ -! RUN: -mmlir -debug %s -o - 2> %t.log || true +! RUN: -mmlir -debug -mmlir -mlir-disable-threading %s -o - 2> %t.log || true ! RUN: FileCheck %s < %t.log From b9ec68431b45a9859517aacac684e7290f1679f2 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Mon, 7 Apr 2025 07:12:33 -0400 Subject: [PATCH 0829/1029] Correctly diagnose incomplete arrays with static storage in C (#134374) A file scope declaration without an initializer which is neither extern nor thread_local is a tentative definition. If the declaration of an identifier for an object is a tentative definition and has internal linkage, the declared type shall not be an incomplete type. Clang was previously failing to diagnose this in -pedantic mode. Fixes #50661 --------- Co-authored-by: Mariya Podchishchaeva --- clang/docs/ReleaseNotes.rst | 3 +++ clang/include/clang/Basic/DiagnosticSemaKinds.td | 5 +++-- clang/lib/Sema/SemaDecl.cpp | 6 ++++-- clang/test/C/drs/dr0xx.c | 4 +++- clang/test/Sema/incomplete-decl.c | 5 ++--- clang/test/Sema/tentative-decls.c | 4 ++-- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5217e04b5e83f..4d7a09e89ae42 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -319,6 +319,9 @@ Improvements to Clang's diagnostics - ``-Wc++98-compat`` no longer diagnoses use of ``__auto_type`` or ``decltype(auto)`` as though it was the extension for ``auto``. (#GH47900) +- Now correctly diagnose a tentative definition of an array with static + storage duration in pedantic mode in C. (#GH50661) + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 393bfecf9a36b..c2b01833a5c46 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7350,8 +7350,9 @@ def err_typecheck_pointer_arith_void_type : Error< "arithmetic on%select{ a|}0 pointer%select{|s}0 to void">; def err_typecheck_decl_incomplete_type : Error< "variable has incomplete type %0">; -def ext_typecheck_decl_incomplete_type : ExtWarn< - "tentative definition of variable with internal linkage has incomplete non-array type %0">, +def ext_typecheck_decl_incomplete_type : Extension< + "tentative definition of variable with internal linkage has incomplete " + "%select{non-array|array}0 type %1">, InGroup>; def err_tentative_def_incomplete_type : Error< "tentative definition has type %0 that is never completed">; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index d630f9bd409fd..540f5f23fe89a 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -14246,7 +14246,8 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { Var->getLocation(), ArrayT->getElementType(), diag::err_array_incomplete_or_sizeless_type)) Var->setInvalidDecl(); - } else if (Var->getStorageClass() == SC_Static) { + } + if (Var->getStorageClass() == SC_Static) { // C99 6.9.2p3: If the declaration of an identifier for an object is // a tentative definition and has internal linkage (C99 6.2.2p3), the // declared type shall not be an incomplete type. @@ -14258,7 +14259,8 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) { // NOTE: to avoid multiple warnings, only check the first declaration. 
if (Var->isFirstDecl()) RequireCompleteType(Var->getLocation(), Type, - diag::ext_typecheck_decl_incomplete_type); + diag::ext_typecheck_decl_incomplete_type, + Type->isArrayType()); } } diff --git a/clang/test/C/drs/dr0xx.c b/clang/test/C/drs/dr0xx.c index 252dc9329c4ca..5fe023deaece9 100644 --- a/clang/test/C/drs/dr0xx.c +++ b/clang/test/C/drs/dr0xx.c @@ -139,7 +139,9 @@ int dr010_c = sizeof(dr010_t); /* expected-error {{invalid application of 'sizeo * Note: DR034 has a question resolved by DR011 and another question where the * result is UB. */ -static int dr011_a[]; /* expected-warning {{tentative array definition assumed to have one element}} */ +static int dr011_a[]; /* expected-warning {{tentative array definition assumed to have one element}} + expected-warning {{tentative definition of variable with internal linkage has incomplete array type 'int[]'}} + */ void dr011(void) { extern int i[]; { diff --git a/clang/test/Sema/incomplete-decl.c b/clang/test/Sema/incomplete-decl.c index bf2890bba9911..f8f234c6b7828 100644 --- a/clang/test/Sema/incomplete-decl.c +++ b/clang/test/Sema/incomplete-decl.c @@ -3,7 +3,7 @@ -struct foo; // c-note 5 {{forward declaration of 'struct foo'}} \ +struct foo; // c-note 4 {{forward declaration of 'struct foo'}} \ cxx-note 3 {{forward declaration of 'foo'}} void b; // expected-error {{variable has incomplete type 'void'}} @@ -11,8 +11,7 @@ struct foo f; // c-error {{tentative definition has type 'struct foo' that is ne cxx-error {{variable has incomplete type 'struct foo'}} static void c; // expected-error {{variable has incomplete type 'void'}} -static struct foo g; // c-warning {{tentative definition of variable with internal linkage has incomplete non-array type 'struct foo'}} \ - c-error {{tentative definition has type 'struct foo' that is never completed}} \ +static struct foo g; // c-error {{tentative definition has type 'struct foo' that is never completed}} \ cxx-error {{variable has incomplete type 'struct foo'}} extern void d; // cxx-error {{variable has incomplete type 'void'}} diff --git a/clang/test/Sema/tentative-decls.c b/clang/test/Sema/tentative-decls.c index 713e65f3d9b35..94d21bdbf94da 100644 --- a/clang/test/Sema/tentative-decls.c +++ b/clang/test/Sema/tentative-decls.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 %s -fsyntax-only -Wprivate-extern -verify +// RUN: %clang_cc1 %s -fsyntax-only -Wprivate-extern -pedantic -verify // PR3310 struct a x1; // expected-note 2{{forward declaration of 'struct a'}} @@ -58,7 +58,7 @@ void func2(void) extern double *p; } -static int a0[]; +static int a0[]; // expected-warning {{tentative definition of variable with internal linkage has incomplete array type 'int[]'}} static int b0; static int a0[] = { 4 }; From 96de8435b90e5164da2d38ee34594e86069360e7 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Mon, 7 Apr 2025 13:13:02 +0200 Subject: [PATCH 0830/1029] [mlir][NVVM] Fix default label unreachable warning in `getVoteSyncIntrinsicId` (#134600) Fixes the following warning after the changes in https://github.com/llvm/llvm-project/pull/134309: ``` llvm-project/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp:134:3: warning: default label in switch which covers all enumeration values [-Wcovered-switch-default] default: ^ 1 warning generated. 
``` --- .../lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index beff90237562d..7dfe320cff2ab 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -131,9 +131,8 @@ static llvm::Intrinsic::ID getVoteSyncIntrinsicId(NVVM::VoteSyncKind kind) { return llvm::Intrinsic::nvvm_vote_ballot_sync; case NVVM::VoteSyncKind::uni: return llvm::Intrinsic::nvvm_vote_uni_sync; - default: - llvm_unreachable("unsupported vote kind"); } + llvm_unreachable("unsupported vote kind"); } /// Return the intrinsic ID associated with ldmatrix for the given paramters. From 6c7c8b4776f51dc66138f71333e6ada0c33ca5e2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Apr 2025 12:23:08 +0100 Subject: [PATCH 0831/1029] [X86] LowerINTRINSIC_WO_CHAIN - cleanup and clang-format VP2INTERSECT handling. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a4381b99dbae0..d720aadb2e81d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -27094,19 +27094,15 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::x86_avx512_vp2intersect_d_512: case Intrinsic::x86_avx512_vp2intersect_d_256: case Intrinsic::x86_avx512_vp2intersect_d_128: { + SDLoc DL(Op); MVT MaskVT = Op.getSimpleValueType(); - SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); - SDLoc DL(Op); - - SDValue Operation = - DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, - Op->getOperand(1), Op->getOperand(2)); - - SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, - MaskVT, Operation); - SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, - MaskVT, Operation); + SDValue Operation = DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, + Op.getOperand(1), Op.getOperand(2)); + SDValue Result0 = + DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, MaskVT, Operation); + SDValue Result1 = + DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, MaskVT, Operation); return DAG.getMergeValues({Result0, Result1}, DL); } case Intrinsic::x86_mmx_pslli_w: From c9280ba25aef6cd47e973a439da23c7b52bddb56 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 7 Apr 2025 12:27:43 +0100 Subject: [PATCH 0832/1029] [AMDGPU] Simplify emitAtomicRMWLegalRemark. NFC. (#134614) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 21f8c7cfeec1f..356040da95672 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16723,8 +16723,8 @@ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) { static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) { LLVMContext &Ctx = RMW->getContext(); - StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or(""); - StringRef MemScope = SS.empty() ? 
StringRef("system") : SS; + StringRef MemScope = + Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system"); return OptimizationRemark(DEBUG_TYPE, "Passed", RMW) << "Hardware instruction generated for atomic " From bd84d66700b23132eecea71fb81d6d7378473937 Mon Sep 17 00:00:00 2001 From: Zhaoxin Yang Date: Mon, 7 Apr 2025 19:58:48 +0800 Subject: [PATCH 0833/1029] [lld][LoongArch] Convert TLS IE to LE in the normal or medium code model (#123680) Original code sequence: * pcalau12i $a0, %ie_pc_hi20(sym) * ld.d $a0, $a0, %ie_pc_lo12(sym) The code sequence converted is as follows: * lu12i.w $a0, %le_hi20(sym) # le_hi20 != 0, otherwise NOP * ori $a0, src, %le_lo12(sym) # le_hi20 != 0, src = $a0, # otherwise, src = $zero TODO: When relaxation is enabled, redundant NOP can be removed. This will be implemented in a future patch. Note: In the normal or medium code model, original code sequence with relocations allow interleaving, because converted code sequence calculates the absolute offset. However, in extreme code model, to identify the current code model, the first four instructions with relocations must appear consecutively. --- lld/ELF/Arch/LoongArch.cpp | 85 +++++++++++++++++++++++++++ lld/ELF/Relocations.cpp | 17 +++++- lld/test/ELF/loongarch-relax-tls-ie.s | 70 ++++++++++++++++++++++ lld/test/ELF/loongarch-tls-ie.s | 30 ++++------ 4 files changed, 183 insertions(+), 19 deletions(-) create mode 100644 lld/test/ELF/loongarch-relax-tls-ie.s diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index 4edc625b05cb0..86f1778112a32 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -39,6 +39,7 @@ class LoongArch final : public TargetInfo { void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; bool relaxOnce(int pass) const override; + void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; void finalizeRelax(int passes) const override; }; } // end anonymous namespace @@ -53,6 +54,8 @@ enum Op { ADDI_W = 0x02800000, ADDI_D = 0x02c00000, ANDI = 0x03400000, + ORI = 0x03800000, + LU12I_W = 0x14000000, PCADDI = 0x18000000, PCADDU12I = 0x1c000000, LD_W = 0x28800000, @@ -1002,6 +1005,88 @@ static bool relax(Ctx &ctx, InputSection &sec) { return changed; } +// Convert TLS IE to LE in the normal or medium code model. +// Original code sequence: +// * pcalau12i $a0, %ie_pc_hi20(sym) +// * ld.d $a0, $a0, %ie_pc_lo12(sym) +// +// The code sequence converted is as follows: +// * lu12i.w $a0, %le_hi20(sym) # le_hi20 != 0, otherwise NOP +// * ori $a0, src, %le_lo12(sym) # le_hi20 != 0, src = $a0, +// # otherwise, src = $zero +// +// When relaxation enables, redundant NOPs can be removed. 
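+//
+// For example (see the accompanying lit test), a symbol with tprel offset
+// 0xfff lowers to "nop; ori $a0, $zero, 0xfff", while an offset of 0x1000
+// lowers to "lu12i.w $a0, 1; ori $a0, $a0, 0".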
+static void tlsIeToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { + assert(isInt<32>(val) && + "val exceeds the range of medium code model in tlsIeToLe"); + + bool isUInt12 = isUInt<12>(val); + const uint32_t currInsn = read32le(loc); + switch (rel.type) { + case R_LARCH_TLS_IE_PC_HI20: + if (isUInt12) + write32le(loc, insn(ANDI, R_ZERO, R_ZERO, 0)); // nop + else + write32le(loc, insn(LU12I_W, getD5(currInsn), extractBits(val, 31, 12), + 0)); // lu12i.w $a0, %le_hi20 + break; + case R_LARCH_TLS_IE_PC_LO12: + if (isUInt12) + write32le(loc, insn(ORI, getD5(currInsn), R_ZERO, + val)); // ori $a0, $zero, %le_lo12 + else + write32le(loc, insn(ORI, getD5(currInsn), getJ5(currInsn), + lo12(val))); // ori $a0, $a0, %le_lo12 + break; + } +} + +void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { + const unsigned bits = ctx.arg.is64 ? 64 : 32; + uint64_t secAddr = sec.getOutputSection()->addr; + if (auto *s = dyn_cast(&sec)) + secAddr += s->outSecOff; + else if (auto *ehIn = dyn_cast(&sec)) + secAddr += ehIn->getParent()->outSecOff; + bool isExtreme = false; + const MutableArrayRef relocs = sec.relocs(); + for (size_t i = 0, size = relocs.size(); i != size; ++i) { + Relocation &rel = relocs[i]; + uint8_t *loc = buf + rel.offset; + uint64_t val = SignExtend64( + sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset), bits); + + switch (rel.expr) { + case R_RELAX_HINT: + continue; + case R_RELAX_TLS_IE_TO_LE: + if (rel.type == R_LARCH_TLS_IE_PC_HI20) { + // LoongArch does not support IE to LE optimization in the extreme code + // model. In this case, the relocs are as follows: + // + // * i -- R_LARCH_TLS_IE_PC_HI20 + // * i+1 -- R_LARCH_TLS_IE_PC_LO12 + // * i+2 -- R_LARCH_TLS_IE64_PC_LO20 + // * i+3 -- R_LARCH_TLS_IE64_PC_HI12 + isExtreme = + (i + 2 < size && relocs[i + 2].type == R_LARCH_TLS_IE64_PC_LO20); + } + if (isExtreme) { + rel.expr = getRelExpr(rel.type, *rel.sym, loc); + val = SignExtend64(sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset), + bits); + relocateNoSym(loc, rel.type, val); + } else { + tlsIeToLe(loc, rel, val); + } + continue; + default: + break; + } + relocate(loc, rel, val); + } +} + // When relaxing just R_LARCH_ALIGN, relocDeltas is usually changed only once in // the absence of a linker script. For call and load/store R_LARCH_RELAX, code // shrinkage may reduce displacement and make more relocations eligible for diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index df8d2a6a5f988..81de664fd1c23 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1376,6 +1376,11 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, return 1; } + // LoongArch supports IE to LE optimization in non-extreme code model. + bool execOptimizeInLoongArch = + ctx.arg.emachine == EM_LOONGARCH && + (type == R_LARCH_TLS_IE_PC_HI20 || type == R_LARCH_TLS_IE_PC_LO12); + // ARM, Hexagon, LoongArch and RISC-V do not support GD/LD to IE/LE // optimizations. // RISC-V supports TLSDESC to IE/LE optimizations. @@ -1383,7 +1388,8 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, // optimization as well. 
bool execOptimize = !ctx.arg.shared && ctx.arg.emachine != EM_ARM && - ctx.arg.emachine != EM_HEXAGON && ctx.arg.emachine != EM_LOONGARCH && + ctx.arg.emachine != EM_HEXAGON && + (ctx.arg.emachine != EM_LOONGARCH || execOptimizeInLoongArch) && !(isRISCV && expr != R_TLSDESC_PC && expr != R_TLSDESC_CALL) && !sec->file->ppc64DisableTLSRelax; @@ -1477,6 +1483,15 @@ unsigned RelocationScanner::handleTlsRelocation(RelExpr expr, RelType type, return 1; } + // LoongArch TLS GD/LD relocs reuse the RE_LOONGARCH_GOT, in which + // NEEDS_TLSIE shouldn't set. So we check independently. + if (ctx.arg.emachine == EM_LOONGARCH && expr == RE_LOONGARCH_GOT && + execOptimize && isLocalInExecutable) { + ctx.hasTlsIe.store(true, std::memory_order_relaxed); + sec->addReloc({R_RELAX_TLS_IE_TO_LE, type, offset, addend, &sym}); + return 1; + } + return 0; } diff --git a/lld/test/ELF/loongarch-relax-tls-ie.s b/lld/test/ELF/loongarch-relax-tls-ie.s new file mode 100644 index 0000000000000..82e609d005aff --- /dev/null +++ b/lld/test/ELF/loongarch-relax-tls-ie.s @@ -0,0 +1,70 @@ +# REQUIRES: loongarch +## Test LA64 IE -> LE in various cases. + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s -o %t.o + +## FIXME: IE relaxation has not yet been implemented. +## --relax/--no-relax has the same result. Also check --emit-relocs. +# RUN: ld.lld --emit-relocs %t.o -o %t +# RUN: llvm-readelf -x .got %t 2>&1 | FileCheck --check-prefix=LE-GOT %s +# RUN: llvm-objdump -dr --no-show-raw-insn %t | FileCheck --check-prefixes=LE %s + +# RUN: ld.lld --emit-relocs --no-relax %t.o -o %t.norelax +# RUN: llvm-readelf -x .got %t.norelax 2>&1 | FileCheck --check-prefix=LE-GOT %s +# RUN: llvm-objdump -dr --no-show-raw-insn %t.norelax | FileCheck --check-prefixes=LE %s + +# LE-GOT: could not find section '.got' + +# a@tprel = st_value(a) = 0xfff +# b@tprel = st_value(a) = 0x1000 +# LE: 20158: nop +# LE-NEXT: R_LARCH_TLS_IE_PC_HI20 a +# LE-NEXT: R_LARCH_RELAX *ABS* +# LE-NEXT: ori $a0, $zero, 4095 +# LE-NEXT: R_LARCH_TLS_IE_PC_LO12 a +# LE-NEXT: R_LARCH_RELAX *ABS* +# LE-NEXT: add.d $a0, $a0, $tp +# LE-NEXT: 20164: lu12i.w $a1, 1 +# LE-NEXT: R_LARCH_TLS_IE_PC_HI20 b +# LE-NEXT: ori $a1, $a1, 0 +# LE-NEXT: R_LARCH_TLS_IE_PC_LO12 b +# LE-NEXT: add.d $a1, $a1, $tp +# LE-NEXT: 20170: nop +# LE-NEXT: R_LARCH_TLS_IE_PC_HI20 a +# LE-NEXT: R_LARCH_RELAX *ABS* +# LE-NEXT: lu12i.w $a3, 1 +# LE-NEXT: R_LARCH_TLS_IE_PC_HI20 b +# LE-NEXT: R_LARCH_RELAX *ABS* +# LE-NEXT: ori $a2, $zero, 4095 +# LE-NEXT: R_LARCH_TLS_IE_PC_LO12 a +# LE-NEXT: ori $a3, $a3, 0 +# LE-NEXT: R_LARCH_TLS_IE_PC_LO12 b +# LE-NEXT: add.d $a2, $a2, $tp +# LE-NEXT: add.d $a3, $a3, $tp + +la.tls.ie $a0, a # relax +add.d $a0, $a0, $tp + +# PCALAU12I does not have R_LARCH_RELAX. No relaxation. +pcalau12i $a1, %ie_pc_hi20(b) +ld.d $a1, $a1, %ie_pc_lo12(b) +add.d $a1, $a1, $tp + +# Test instructions are interleaved. +# PCALAU12I has an R_LARCH_RELAX. We perform relaxation. +pcalau12i $a2, %ie_pc_hi20(a) +.reloc .-4, R_LARCH_RELAX, 0 +pcalau12i $a3, %ie_pc_hi20(b) +.reloc .-4, R_LARCH_RELAX, 0 +ld.d $a2, $a2, %ie_pc_lo12(a) +ld.d $a3, $a3, %ie_pc_lo12(b) +add.d $a2, $a2, $tp +add.d $a3, $a3, $tp + +.section .tbss,"awT",@nobits +.globl a +.zero 0xfff ## Place a at 0xfff, LE needs only one ins. +a: +.zero 1 ## Place b at 0x1000, LE needs two ins. 
+b: +.zero 4 diff --git a/lld/test/ELF/loongarch-tls-ie.s b/lld/test/ELF/loongarch-tls-ie.s index 78c207991b4e6..ddfd9c976cb9b 100644 --- a/lld/test/ELF/loongarch-tls-ie.s +++ b/lld/test/ELF/loongarch-tls-ie.s @@ -12,7 +12,7 @@ ## LA32 IE -> LE # RUN: ld.lld %t/32.o -o %t/32 # RUN: llvm-readelf -r %t/32 | FileCheck --check-prefix=NOREL %s -# RUN: llvm-readelf -x .got %t/32 | FileCheck --check-prefix=LE32-GOT %s +# RUN: llvm-readelf -x .got %t/32 2>&1 | FileCheck --check-prefix=LE32-GOT %s # RUN: llvm-objdump -d --no-show-raw-insn %t/32 | FileCheck --check-prefixes=LE32 %s ## LA64 IE @@ -23,7 +23,7 @@ ## LA64 IE -> LE # RUN: ld.lld %t/64.o -o %t/64 # RUN: llvm-readelf -r %t/64 | FileCheck --check-prefix=NOREL %s -# RUN: llvm-readelf -x .got %t/64 | FileCheck --check-prefix=LE64-GOT %s +# RUN: llvm-readelf -x .got %t/64 2>&1 | FileCheck --check-prefix=LE64-GOT %s # RUN: llvm-objdump -d --no-show-raw-insn %t/64 | FileCheck --check-prefixes=LE64 %s # IE32-REL: FLAGS STATIC_TLS @@ -62,29 +62,23 @@ # a@tprel = st_value(a) = 0x8 # b@tprel = st_value(a) = 0xc -# LE32-GOT: section '.got': -# LE32-GOT-NEXT: 0x0003012c 08000000 0c000000 -# LE64-GOT: section '.got': -# LE64-GOT-NEXT: 0x000301e0 08000000 00000000 0c000000 00000000 +# LE32-GOT: could not find section '.got' +# LE64-GOT: could not find section '.got' ## LA32: -## &.got[0] - . = 0x3012c - 0x20114: 0x10 pages, page offset 0x12c -## &.got[1] - . = 0x30130 - 0x20120: 0x10 pages, page offset 0x130 -# LE32: 20114: pcalau12i $a4, 16 -# LE32-NEXT: ld.w $a4, $a4, 300 +# LE32: 200d4: nop +# LE32-NEXT: ori $a4, $zero, 8 # LE32-NEXT: add.w $a4, $a4, $tp -# LE32-NEXT: 20120: pcalau12i $a5, 16 -# LE32-NEXT: ld.w $a5, $a5, 304 +# LE32-NEXT: 200e0: nop +# LE32-NEXT: ori $a5, $zero, 12 # LE32-NEXT: add.w $a5, $a5, $tp ## LA64: -## &.got[0] - . = 0x301e0 - 0x201c8: 0x10 pages, page offset 0x1e0 -## &.got[1] - . = 0x301e8 - 0x201d4: 0x10 pages, page offset 0x1e8 -# LE64: 201c8: pcalau12i $a4, 16 -# LE64-NEXT: ld.d $a4, $a4, 480 +# LE64: 20158: nop +# LE64-NEXT: ori $a4, $zero, 8 # LE64-NEXT: add.d $a4, $a4, $tp -# LE64-NEXT: 201d4: pcalau12i $a5, 16 -# LE64-NEXT: ld.d $a5, $a5, 488 +# LE64-NEXT: 20164: nop +# LE64-NEXT: ori $a5, $zero, 12 # LE64-NEXT: add.d $a5, $a5, $tp #--- 32.s From 37deb0959311bba0d1d51ee9b9d24d1ea400f6d4 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Mon, 7 Apr 2025 17:36:28 +0530 Subject: [PATCH 0834/1029] [MLIR][Affine] Fix signatures of normalize memref utilities (#134466) These methods were passing derived op types by pointers, which deviates from the style. While on this, fix obsolete comments on those methods. --- mlir/include/mlir/Dialect/Affine/Utils.h | 6 ++-- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 35 +++++++++---------- .../MemRef/Transforms/NormalizeMemRefs.cpp | 4 +-- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h index ff1900bc8f2eb..3b4bb34105581 100644 --- a/mlir/include/mlir/Dialect/Affine/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Utils.h @@ -247,11 +247,11 @@ LogicalResult replaceAllMemRefUsesWith(Value oldMemRef, Value newMemRef, /// and updates all its indexing uses. Returns failure if any of its uses /// escape (while leaving the IR in a valid state). 
template -LogicalResult normalizeMemRef(AllocLikeOp *op); +LogicalResult normalizeMemRef(AllocLikeOp op); extern template LogicalResult -normalizeMemRef(memref::AllocaOp *op); +normalizeMemRef(memref::AllocaOp op); extern template LogicalResult -normalizeMemRef(memref::AllocOp *op); +normalizeMemRef(memref::AllocOp op); /// Normalizes `memrefType` so that the affine layout map of the memref is /// transformed to an identity map with a new shape being computed for the diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index 2723cff6900d0..2925aa918cb1c 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -1741,7 +1741,7 @@ static AffineExpr createDimSizeExprForTiledLayout(AffineExpr oldMapOutput, template static void createNewDynamicSizes(MemRefType oldMemRefType, MemRefType newMemRefType, AffineMap map, - AllocLikeOp *allocOp, OpBuilder b, + AllocLikeOp allocOp, OpBuilder b, SmallVectorImpl &newDynamicSizes) { // Create new input for AffineApplyOp. SmallVector inAffineApply; @@ -1750,13 +1750,13 @@ static void createNewDynamicSizes(MemRefType oldMemRefType, for (unsigned d = 0; d < oldMemRefType.getRank(); ++d) { if (oldMemRefShape[d] < 0) { // Use dynamicSizes of allocOp for dynamic dimension. - inAffineApply.emplace_back(allocOp->getDynamicSizes()[dynIdx]); + inAffineApply.emplace_back(allocOp.getDynamicSizes()[dynIdx]); dynIdx++; } else { // Create ConstantOp for static dimension. auto constantAttr = b.getIntegerAttr(b.getIndexType(), oldMemRefShape[d]); inAffineApply.emplace_back( - b.create(allocOp->getLoc(), constantAttr)); + b.create(allocOp.getLoc(), constantAttr)); } } @@ -1780,18 +1780,17 @@ static void createNewDynamicSizes(MemRefType oldMemRefType, AffineMap newMap = AffineMap::get(map.getNumInputs(), map.getNumSymbols(), newMapOutput); Value affineApp = - b.create(allocOp->getLoc(), newMap, inAffineApply); + b.create(allocOp.getLoc(), newMap, inAffineApply); newDynamicSizes.emplace_back(affineApp); } newDimIdx++; } } -// TODO: Currently works for static memrefs with a single layout map. template -LogicalResult mlir::affine::normalizeMemRef(AllocLikeOp *allocOp) { - MemRefType memrefType = allocOp->getType(); - OpBuilder b(*allocOp); +LogicalResult mlir::affine::normalizeMemRef(AllocLikeOp allocOp) { + MemRefType memrefType = allocOp.getType(); + OpBuilder b(allocOp); // Fetch a new memref type after normalizing the old memref to have an // identity map layout. @@ -1801,9 +1800,9 @@ LogicalResult mlir::affine::normalizeMemRef(AllocLikeOp *allocOp) { // transformed to an identity map. return failure(); - Value oldMemRef = allocOp->getResult(); + Value oldMemRef = allocOp.getResult(); - SmallVector symbolOperands(allocOp->getSymbolOperands()); + SmallVector symbolOperands(allocOp.getSymbolOperands()); AffineMap layoutMap = memrefType.getLayout().getAffineMap(); AllocLikeOp newAlloc; // Check if `layoutMap` is a tiled layout. Only single layout map is @@ -1811,17 +1810,17 @@ LogicalResult mlir::affine::normalizeMemRef(AllocLikeOp *allocOp) { SmallVector> tileSizePos; (void)getTileSizePos(layoutMap, tileSizePos); if (newMemRefType.getNumDynamicDims() > 0 && !tileSizePos.empty()) { - MemRefType oldMemRefType = cast(oldMemRef.getType()); + auto oldMemRefType = cast(oldMemRef.getType()); SmallVector newDynamicSizes; createNewDynamicSizes(oldMemRefType, newMemRefType, layoutMap, allocOp, b, newDynamicSizes); // Add the new dynamic sizes in new AllocOp. 
newAlloc = - b.create(allocOp->getLoc(), newMemRefType, newDynamicSizes, - allocOp->getAlignmentAttr()); + b.create(allocOp.getLoc(), newMemRefType, newDynamicSizes, + allocOp.getAlignmentAttr()); } else { - newAlloc = b.create(allocOp->getLoc(), newMemRefType, - allocOp->getAlignmentAttr()); + newAlloc = b.create(allocOp.getLoc(), newMemRefType, + allocOp.getAlignmentAttr()); } // Replace all uses of the old memref. if (failed(replaceAllMemRefUsesWith(oldMemRef, /*newMemRef=*/newAlloc, @@ -1842,14 +1841,14 @@ LogicalResult mlir::affine::normalizeMemRef(AllocLikeOp *allocOp) { return hasSingleEffect(op, oldMemRef); })); oldMemRef.replaceAllUsesWith(newAlloc); - allocOp->erase(); + allocOp.erase(); return success(); } template LogicalResult -mlir::affine::normalizeMemRef(memref::AllocaOp *op); +mlir::affine::normalizeMemRef(memref::AllocaOp op); template LogicalResult -mlir::affine::normalizeMemRef(memref::AllocOp *op); +mlir::affine::normalizeMemRef(memref::AllocOp op); MemRefType mlir::affine::normalizeMemRefType(MemRefType memrefType) { unsigned rank = memrefType.getRank(); diff --git a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp index 08b853fe65b85..95fed04a7864e 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/NormalizeMemRefs.cpp @@ -356,12 +356,12 @@ void NormalizeMemRefs::normalizeFuncOpMemRefs(func::FuncOp funcOp, SmallVector allocOps; funcOp.walk([&](memref::AllocOp op) { allocOps.push_back(op); }); for (memref::AllocOp allocOp : allocOps) - (void)normalizeMemRef(&allocOp); + (void)normalizeMemRef(allocOp); SmallVector allocaOps; funcOp.walk([&](memref::AllocaOp op) { allocaOps.push_back(op); }); for (memref::AllocaOp allocaOp : allocaOps) - (void)normalizeMemRef(&allocaOp); + (void)normalizeMemRef(allocaOp); // We use this OpBuilder to create new memref layout later. OpBuilder b(funcOp); From 4d1e4eff5f6c6c62781926b0d31ad155364ffe62 Mon Sep 17 00:00:00 2001 From: Mats Jun Larsen Date: Mon, 7 Apr 2025 21:09:00 +0900 Subject: [PATCH 0835/1029] [IR] Avoid call to deprecated PointerType::get (NFC) (#134609) Should keep MSVC quiet as noticed by @rksimon in #134517. Assertions have been copied over from PointerType::get in order to not silently change invariants with this call. --- llvm/include/llvm/IR/DerivedTypes.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index ba5c41ff033f5..d0dffa9de616a 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -697,7 +697,10 @@ class PointerType : public Type { [[deprecated("PointerType::getUnqual with pointee type is pending removal. 
" "Use Context overload.")]] static PointerType *getUnqual(Type *ElementType) { - return PointerType::get(ElementType, 0); + assert(ElementType && "Can't get a pointer to type!"); + assert(isValidElementType(ElementType) && + "Invalid type for pointer element!"); + return PointerType::getUnqual(ElementType->getContext()); } /// This constructs an opaque pointer to an object in the From 9fe6f6a0d430b872750354c20f3e4a651bd1f135 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Mon, 7 Apr 2025 14:23:22 +0200 Subject: [PATCH 0836/1029] [bazel] Change `gentbl_cc_library(tbl_outs)` to dicts (#134349) Follow up from https://github.com/llvm/llvm-project/pull/134271 --- .../llvm-project-overlay/clang/BUILD.bazel | 473 +- .../llvm-project-overlay/lldb/BUILD.bazel | 64 +- .../lldb/source/Plugins/BUILD.bazel | 173 +- .../llvm-project-overlay/llvm/BUILD.bazel | 339 +- .../llvm/unittests/BUILD.bazel | 19 +- .../llvm-project-overlay/mlir/BUILD.bazel | 5112 +++++------------ .../mlir/examples/toy/Ch2/BUILD.bazel | 24 +- .../mlir/examples/toy/Ch3/BUILD.bazel | 31 +- .../mlir/examples/toy/Ch4/BUILD.bazel | 45 +- .../mlir/examples/toy/Ch5/BUILD.bazel | 45 +- .../mlir/examples/toy/Ch6/BUILD.bazel | 45 +- .../mlir/examples/toy/Ch7/BUILD.bazel | 45 +- .../mlir/test/BUILD.bazel | 223 +- .../mlir/unittests/BUILD.bazel | 21 +- 14 files changed, 1931 insertions(+), 4728 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 7e0c18e69e316..90637864498cf 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -111,10 +111,7 @@ td_library( gentbl_cc_library( name = "basic_arm_neon_inc_gen", - tbl_outs = [( - ["-gen-arm-neon-sema"], - "include/clang/Basic/arm_neon.inc", - )], + tbl_outs = {"include/clang/Basic/arm_neon.inc": ["-gen-arm-neon-sema"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_neon.td", deps = [":ArmTdFiles"], @@ -122,10 +119,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_fp16_inc_gen", - tbl_outs = [( - ["-gen-arm-neon-sema"], - "include/clang/Basic/arm_fp16.inc", - )], + tbl_outs = {"include/clang/Basic/arm_fp16.inc": ["-gen-arm-neon-sema"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_fp16.td", deps = [":ArmTdFiles"], @@ -133,10 +127,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_mve_aliases_gen", - tbl_outs = [( - ["-gen-arm-mve-builtin-aliases"], - "include/clang/Basic/arm_mve_builtin_aliases.inc", - )], + tbl_outs = {"include/clang/Basic/arm_mve_builtin_aliases.inc": ["-gen-arm-mve-builtin-aliases"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_mve.td", deps = [":ArmTdFiles"], @@ -144,10 +135,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sve_builtins_gen", - tbl_outs = [( - ["-gen-arm-sve-builtins"], - "include/clang/Basic/arm_sve_builtins.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sve_builtins.inc": ["-gen-arm-sve-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sve.td", deps = [":ArmTdFiles"], @@ -155,10 +143,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sve_builtin_cg_gen", - tbl_outs = [( - ["-gen-arm-sve-builtin-codegen"], - "include/clang/Basic/arm_sve_builtin_cg.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sve_builtin_cg.inc": ["-gen-arm-sve-builtin-codegen"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sve.td", deps = [":ArmTdFiles"], @@ -166,10 +151,7 @@ gentbl_cc_library( 
gentbl_cc_library( name = "basic_arm_sve_immcheck_types_gen", - tbl_outs = [( - ["-gen-arm-immcheck-types"], - "include/clang/Basic/arm_immcheck_types.inc", - )], + tbl_outs = {"include/clang/Basic/arm_immcheck_types.inc": ["-gen-arm-immcheck-types"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sve.td", deps = [":ArmTdFiles"], @@ -177,10 +159,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sve_typeflags_gen", - tbl_outs = [( - ["-gen-arm-sve-typeflags"], - "include/clang/Basic/arm_sve_typeflags.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sve_typeflags.inc": ["-gen-arm-sve-typeflags"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sve.td", deps = [":ArmTdFiles"], @@ -188,10 +167,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sve_sema_rangechecks_gen", - tbl_outs = [( - ["-gen-arm-sve-sema-rangechecks"], - "include/clang/Basic/arm_sve_sema_rangechecks.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sve_sema_rangechecks.inc": ["-gen-arm-sve-sema-rangechecks"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sve.td", deps = [":ArmTdFiles"], @@ -199,10 +175,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sve_streaming_attrs_gen", - tbl_outs = [( - ["-gen-arm-sve-streaming-attrs"], - "include/clang/Basic/arm_sve_streaming_attrs.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sve_streaming_attrs.inc": ["-gen-arm-sve-streaming-attrs"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sve.td", deps = [":ArmTdFiles"], @@ -210,10 +183,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sme_builtins_gen", - tbl_outs = [( - ["-gen-arm-sme-builtins"], - "include/clang/Basic/arm_sme_builtins.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sme_builtins.inc": ["-gen-arm-sme-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sme.td", deps = [":ArmTdFiles"], @@ -221,10 +191,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sme_builtin_cg_gen", - tbl_outs = [( - ["-gen-arm-sme-builtin-codegen"], - "include/clang/Basic/arm_sme_builtin_cg.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sme_builtin_cg.inc": ["-gen-arm-sme-builtin-codegen"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sme.td", deps = [":ArmTdFiles"], @@ -232,10 +199,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sme_builtins_za_state_gen", - tbl_outs = [( - ["-gen-arm-sme-builtin-za-state"], - "include/clang/Basic/arm_sme_builtins_za_state.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sme_builtins_za_state.inc": ["-gen-arm-sme-builtin-za-state"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sme.td", deps = [":ArmTdFiles"], @@ -243,10 +207,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sme_sema_rangechecks_gen", - tbl_outs = [( - ["-gen-arm-sme-sema-rangechecks"], - "include/clang/Basic/arm_sme_sema_rangechecks.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sme_sema_rangechecks.inc": ["-gen-arm-sme-sema-rangechecks"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sme.td", deps = [":ArmTdFiles"], @@ -254,10 +215,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_sme_streaming_attrs_gen", - tbl_outs = [( - ["-gen-arm-sme-streaming-attrs"], - "include/clang/Basic/arm_sme_streaming_attrs.inc", - )], + tbl_outs = {"include/clang/Basic/arm_sme_streaming_attrs.inc": ["-gen-arm-sme-streaming-attrs"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sme.td", deps = [":ArmTdFiles"], @@ 
-265,10 +223,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_mve_cg_gen", - tbl_outs = [( - ["-gen-arm-mve-builtin-codegen"], - "include/clang/Basic/arm_mve_builtin_cg.inc", - )], + tbl_outs = {"include/clang/Basic/arm_mve_builtin_cg.inc": ["-gen-arm-mve-builtin-codegen"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_mve.td", deps = [":ArmTdFiles"], @@ -276,10 +231,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_mve_inc_gen", - tbl_outs = [( - ["-gen-arm-mve-builtin-def"], - "include/clang/Basic/arm_mve_builtins.inc", - )], + tbl_outs = {"include/clang/Basic/arm_mve_builtins.inc": ["-gen-arm-mve-builtin-def"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_mve.td", deps = [":ArmTdFiles"], @@ -287,10 +239,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_mve_sema_gen", - tbl_outs = [( - ["-gen-arm-mve-builtin-sema"], - "include/clang/Basic/arm_mve_builtin_sema.inc", - )], + tbl_outs = {"include/clang/Basic/arm_mve_builtin_sema.inc": ["-gen-arm-mve-builtin-sema"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_mve.td", deps = [":ArmTdFiles"], @@ -316,10 +265,7 @@ td_library( gentbl_cc_library( name = "basic_builtins_bpf_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/BuiltinsBPF.inc", - )], + tbl_outs = {"include/clang/Basic/BuiltinsBPF.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinsBPF.td", deps = [":BuiltinsBaseTdFiles"], @@ -327,10 +273,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtins_hexagon_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/BuiltinsHexagon.inc", - )], + tbl_outs = {"include/clang/Basic/BuiltinsHexagon.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinsHexagon.td", deps = [":BuiltinsBaseTdFiles"], @@ -338,10 +281,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtins_nvptx_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/BuiltinsNVPTX.inc", - )], + tbl_outs = {"include/clang/Basic/BuiltinsNVPTX.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinsNVPTX.td", deps = [":BuiltinsBaseTdFiles"], @@ -349,10 +289,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtins_spirv_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/BuiltinsSPIRV.inc", - )], + tbl_outs = {"include/clang/Basic/BuiltinsSPIRV.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinsSPIRV.td", deps = [":BuiltinsBaseTdFiles"], @@ -360,10 +297,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtins_riscv_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/BuiltinsRISCV.inc", - )], + tbl_outs = {"include/clang/Basic/BuiltinsRISCV.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinsRISCV.td", deps = [ @@ -374,10 +308,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtins_x86_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/BuiltinsX86.inc", - )], + tbl_outs = {"include/clang/Basic/BuiltinsX86.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinsX86.td", deps = [ @@ -388,10 +319,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtins_x86_64_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/BuiltinsX86_64.inc", - )], + tbl_outs = 
{"include/clang/Basic/BuiltinsX86_64.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinsX86_64.td", deps = [ @@ -402,10 +330,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtins_gen", - tbl_outs = [( - ["-gen-clang-builtins"], - "include/clang/Basic/Builtins.inc", - )], + tbl_outs = {"include/clang/Basic/Builtins.inc": ["-gen-clang-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/Builtins.td", deps = [":BuiltinsBaseTdFiles"], @@ -413,10 +338,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_builtin_templates_gen", - tbl_outs = [( - ["-gen-clang-builtin-templates"], - "include/clang/Basic/BuiltinTemplates.inc", - )], + tbl_outs = {"include/clang/Basic/BuiltinTemplates.inc": ["-gen-clang-builtin-templates"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/BuiltinTemplates.td", deps = [ @@ -434,10 +356,7 @@ td_library( gentbl_cc_library( name = "basic_riscv_vector_builtins_gen", - tbl_outs = [( - ["-gen-riscv-vector-builtins"], - "include/clang/Basic/riscv_vector_builtins.inc", - )], + tbl_outs = {"include/clang/Basic/riscv_vector_builtins.inc": ["-gen-riscv-vector-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/riscv_vector.td", deps = [":RiscvTdFiles"], @@ -445,10 +364,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_riscv_vector_builtin_cg_gen", - tbl_outs = [( - ["-gen-riscv-vector-builtin-codegen"], - "include/clang/Basic/riscv_vector_builtin_cg.inc", - )], + tbl_outs = {"include/clang/Basic/riscv_vector_builtin_cg.inc": ["-gen-riscv-vector-builtin-codegen"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/riscv_vector.td", deps = [":RiscvTdFiles"], @@ -456,10 +372,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_riscv_vector_builtin_sema_gen", - tbl_outs = [( - ["-gen-riscv-vector-builtin-sema"], - "include/clang/Basic/riscv_vector_builtin_sema.inc", - )], + tbl_outs = {"include/clang/Basic/riscv_vector_builtin_sema.inc": ["-gen-riscv-vector-builtin-sema"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/riscv_vector.td", deps = [":RiscvTdFiles"], @@ -467,10 +380,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_riscv_sifive_vector_builtins_gen", - tbl_outs = [( - ["-gen-riscv-sifive-vector-builtins"], - "include/clang/Basic/riscv_sifive_vector_builtins.inc", - )], + tbl_outs = {"include/clang/Basic/riscv_sifive_vector_builtins.inc": ["-gen-riscv-sifive-vector-builtins"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/riscv_sifive_vector.td", deps = [":RiscvTdFiles"], @@ -478,10 +388,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_riscv_sifive_vector_builtin_cg_gen", - tbl_outs = [( - ["-gen-riscv-sifive-vector-builtin-codegen"], - "include/clang/Basic/riscv_sifive_vector_builtin_cg.inc", - )], + tbl_outs = {"include/clang/Basic/riscv_sifive_vector_builtin_cg.inc": ["-gen-riscv-sifive-vector-builtin-codegen"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/riscv_sifive_vector.td", deps = [":RiscvTdFiles"], @@ -489,10 +396,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_riscv_sifive_vector_builtin_sema_gen", - tbl_outs = [( - ["-gen-riscv-sifive-vector-builtin-sema"], - "include/clang/Basic/riscv_sifive_vector_builtin_sema.inc", - )], + tbl_outs = {"include/clang/Basic/riscv_sifive_vector_builtin_sema.inc": ["-gen-riscv-sifive-vector-builtin-sema"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/riscv_sifive_vector.td", deps = [":RiscvTdFiles"], @@ -500,10 +404,7 @@ 
gentbl_cc_library( gentbl_cc_library( name = "basic_arm_cde_gen", - tbl_outs = [( - ["-gen-arm-cde-builtin-def"], - "include/clang/Basic/arm_cde_builtins.inc", - )], + tbl_outs = {"include/clang/Basic/arm_cde_builtins.inc": ["-gen-arm-cde-builtin-def"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_cde.td", deps = [":ArmTdFiles"], @@ -511,10 +412,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_cde_aliases_gen", - tbl_outs = [( - ["-gen-arm-cde-builtin-aliases"], - "include/clang/Basic/arm_cde_builtin_aliases.inc", - )], + tbl_outs = {"include/clang/Basic/arm_cde_builtin_aliases.inc": ["-gen-arm-cde-builtin-aliases"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_cde.td", deps = [":ArmTdFiles"], @@ -522,10 +420,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_cde_cg_gen", - tbl_outs = [( - ["-gen-arm-cde-builtin-codegen"], - "include/clang/Basic/arm_cde_builtin_cg.inc", - )], + tbl_outs = {"include/clang/Basic/arm_cde_builtin_cg.inc": ["-gen-arm-cde-builtin-codegen"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_cde.td", deps = [":ArmTdFiles"], @@ -533,10 +428,7 @@ gentbl_cc_library( gentbl_cc_library( name = "basic_arm_cde_sema_gen", - tbl_outs = [( - ["-gen-arm-cde-builtin-sema"], - "include/clang/Basic/arm_cde_builtin_sema.inc", - )], + tbl_outs = {"include/clang/Basic/arm_cde_builtin_sema.inc": ["-gen-arm-cde-builtin-sema"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_cde.td", deps = [":ArmTdFiles"], @@ -567,32 +459,14 @@ td_library( gentbl_cc_library( name = "basic_attr_gen", - tbl_outs = [ - ( - ["-gen-clang-attr-has-attribute-impl"], - "include/clang/Basic/AttrHasAttributeImpl.inc", - ), - ( - ["-gen-clang-attr-list"], - "include/clang/Basic/AttrList.inc", - ), - ( - ["-gen-clang-attr-parsed-attr-list"], - "include/clang/Basic/AttrParsedAttrList.inc", - ), - ( - ["-gen-clang-attr-subject-match-rule-list"], - "include/clang/Basic/AttrSubMatchRulesList.inc", - ), - ( - ["-gen-clang-regular-keyword-attr-info"], - "include/clang/Basic/RegularKeywordAttrInfo.inc", - ), - ( - ["-gen-cxx11-attribute-info"], - "include/clang/Basic/CXX11AttributeInfo.inc", - ), - ], + tbl_outs = { + "include/clang/Basic/AttrHasAttributeImpl.inc": ["-gen-clang-attr-has-attribute-impl"], + "include/clang/Basic/AttrList.inc": ["-gen-clang-attr-list"], + "include/clang/Basic/AttrParsedAttrList.inc": ["-gen-clang-attr-parsed-attr-list"], + "include/clang/Basic/AttrSubMatchRulesList.inc": ["-gen-clang-attr-subject-match-rule-list"], + "include/clang/Basic/RegularKeywordAttrInfo.inc": ["-gen-clang-regular-keyword-attr-info"], + "include/clang/Basic/CXX11AttributeInfo.inc": ["-gen-cxx11-attribute-info"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/Attr.td", deps = [":BasicCoreTdFiles"], @@ -601,10 +475,7 @@ gentbl_cc_library( gentbl_cc_library( name = "libsema_openclbuiltins_inc_gen", strip_include_prefix = "lib/Sema", - tbl_outs = [( - ["-gen-clang-opencl-builtins"], - "lib/Sema/OpenCLBuiltins.inc", - )], + tbl_outs = {"lib/Sema/OpenCLBuiltins.inc": ["-gen-clang-opencl-builtins"]}, tblgen = ":clang-tblgen", td_file = "lib/Sema/OpenCLBuiltins.td", ) @@ -791,28 +662,13 @@ cc_library( gentbl_cc_library( name = "ast_attr_gen", - tbl_outs = [ - ( - ["-gen-clang-attr-ast-visitor"], - "include/clang/AST/AttrVisitor.inc", - ), - ( - ["-gen-clang-attr-classes"], - "include/clang/AST/Attrs.inc", - ), - ( - ["-gen-clang-attr-text-node-dump"], - "include/clang/AST/AttrTextNodeDump.inc", - ), - ( - 
["-gen-clang-attr-node-traverse"], - "include/clang/AST/AttrNodeTraverse.inc", - ), - ( - ["-gen-clang-attr-impl"], - "include/clang/AST/AttrImpl.inc", - ), - ], + tbl_outs = { + "include/clang/AST/AttrVisitor.inc": ["-gen-clang-attr-ast-visitor"], + "include/clang/AST/Attrs.inc": ["-gen-clang-attr-classes"], + "include/clang/AST/AttrTextNodeDump.inc": ["-gen-clang-attr-text-node-dump"], + "include/clang/AST/AttrNodeTraverse.inc": ["-gen-clang-attr-node-traverse"], + "include/clang/AST/AttrImpl.inc": ["-gen-clang-attr-impl"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/Attr.td", deps = [":BasicCoreTdFiles"], @@ -821,12 +677,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ast_attr_doc_table_gen", strip_include_prefix = "lib/AST", - tbl_outs = [ - ( - ["-gen-clang-attr-doc-table"], - "lib/AST/AttrDocTable.inc", - ), - ], + tbl_outs = {"lib/AST/AttrDocTable.inc": ["-gen-clang-attr-doc-table"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/Attr.td", deps = [":BasicCoreTdFiles"], @@ -834,10 +685,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ast_decl_nodes_gen", - tbl_outs = [( - ["-gen-clang-decl-nodes"], - "include/clang/AST/DeclNodes.inc", - )], + tbl_outs = {"include/clang/AST/DeclNodes.inc": ["-gen-clang-decl-nodes"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/DeclNodes.td", deps = [":ASTNodeTdFiles"], @@ -845,10 +693,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ast_stmt_nodes_gen", - tbl_outs = [( - ["-gen-clang-stmt-nodes"], - "include/clang/AST/StmtNodes.inc", - )], + tbl_outs = {"include/clang/AST/StmtNodes.inc": ["-gen-clang-stmt-nodes"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/StmtNodes.td", deps = [":ASTNodeTdFiles"], @@ -856,10 +701,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ast_comment_nodes_gen", - tbl_outs = [( - ["-gen-clang-comment-nodes"], - "include/clang/AST/CommentNodes.inc", - )], + tbl_outs = {"include/clang/AST/CommentNodes.inc": ["-gen-clang-comment-nodes"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/CommentNodes.td", deps = [":ASTNodeTdFiles"], @@ -867,56 +709,38 @@ gentbl_cc_library( gentbl_cc_library( name = "ast_comment_command_info_gen", - tbl_outs = [ - ( - ["-gen-clang-comment-command-info"], - "include/clang/AST/CommentCommandInfo.inc", - ), - ( - ["-gen-clang-comment-command-list"], - "include/clang/AST/CommentCommandList.inc", - ), - ], + tbl_outs = { + "include/clang/AST/CommentCommandInfo.inc": ["-gen-clang-comment-command-info"], + "include/clang/AST/CommentCommandList.inc": ["-gen-clang-comment-command-list"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/AST/CommentCommands.td", ) gentbl_cc_library( name = "ast_comment_html_tags_gen", - tbl_outs = [( - ["-gen-clang-comment-html-tags"], - "include/clang/AST/CommentHTMLTags.inc", - )], + tbl_outs = {"include/clang/AST/CommentHTMLTags.inc": ["-gen-clang-comment-html-tags"]}, tblgen = ":clang-tblgen", td_file = "include/clang/AST/CommentHTMLTags.td", ) gentbl_cc_library( name = "ast_comment_html_tags_properties_gen", - tbl_outs = [( - ["-gen-clang-comment-html-tags-properties"], - "include/clang/AST/CommentHTMLTagsProperties.inc", - )], + tbl_outs = {"include/clang/AST/CommentHTMLTagsProperties.inc": ["-gen-clang-comment-html-tags-properties"]}, tblgen = ":clang-tblgen", td_file = "include/clang/AST/CommentHTMLTags.td", ) gentbl_cc_library( name = "ast_comment_html_named_character_references_gen", - tbl_outs = [( - ["-gen-clang-comment-html-named-character-references"], - 
"include/clang/AST/CommentHTMLNamedCharacterReferences.inc", - )], + tbl_outs = {"include/clang/AST/CommentHTMLNamedCharacterReferences.inc": ["-gen-clang-comment-html-named-character-references"]}, tblgen = ":clang-tblgen", td_file = "include/clang/AST/CommentHTMLNamedCharacterReferences.td", ) gentbl_cc_library( name = "ast_stmt_data_collectors_gen", - tbl_outs = [( - ["-gen-clang-data-collectors"], - "include/clang/AST/StmtDataCollectors.inc", - )], + tbl_outs = {"include/clang/AST/StmtDataCollectors.inc": ["-gen-clang-data-collectors"]}, tblgen = ":clang-tblgen", td_file = "include/clang/AST/StmtDataCollectors.td", ) @@ -924,42 +748,27 @@ gentbl_cc_library( gentbl_cc_library( name = "ast_bytecode_opcodes_gen", strip_include_prefix = "lib/AST/ByteCode", - tbl_outs = [( - ["-gen-clang-opcodes"], - "lib/AST/ByteCode/Opcodes.inc", - )], + tbl_outs = {"lib/AST/ByteCode/Opcodes.inc": ["-gen-clang-opcodes"]}, tblgen = ":clang-tblgen", td_file = "lib/AST/ByteCode/Opcodes.td", ) gentbl_cc_library( name = "ast_properties_base_gen", - tbl_outs = [ - ( - ["-gen-clang-basic-reader"], - "include/clang/AST/AbstractBasicReader.inc", - ), - ( - ["-gen-clang-basic-writer"], - "include/clang/AST/AbstractBasicWriter.inc", - ), - ], + tbl_outs = { + "include/clang/AST/AbstractBasicReader.inc": ["-gen-clang-basic-reader"], + "include/clang/AST/AbstractBasicWriter.inc": ["-gen-clang-basic-writer"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/AST/PropertiesBase.td", ) gentbl_cc_library( name = "ast_type_properties_gen", - tbl_outs = [ - ( - ["-gen-clang-type-reader"], - "include/clang/AST/AbstractTypeReader.inc", - ), - ( - ["-gen-clang-type-writer"], - "include/clang/AST/AbstractTypeWriter.inc", - ), - ], + tbl_outs = { + "include/clang/AST/AbstractTypeReader.inc": ["-gen-clang-type-reader"], + "include/clang/AST/AbstractTypeWriter.inc": ["-gen-clang-type-writer"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/AST/TypeProperties.td", td_srcs = [ @@ -971,10 +780,7 @@ gentbl_cc_library( gentbl_cc_library( name = "type_nodes_gen", - tbl_outs = [( - ["-gen-clang-type-nodes"], - "include/clang/AST/TypeNodes.inc", - )], + tbl_outs = {"include/clang/AST/TypeNodes.inc": ["-gen-clang-type-nodes"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/TypeNodes.td", deps = [":ASTNodeTdFiles"], @@ -1124,24 +930,12 @@ cc_library( gentbl_cc_library( name = "sema_attr_gen", - tbl_outs = [ - ( - ["-gen-clang-attr-parsed-attr-impl"], - "include/clang/Sema/AttrParsedAttrImpl.inc", - ), - ( - ["-gen-clang-attr-parsed-attr-kinds"], - "include/clang/Sema/AttrParsedAttrKinds.inc", - ), - ( - ["-gen-clang-attr-spelling-index"], - "include/clang/Sema/AttrSpellingListIndex.inc", - ), - ( - ["-gen-clang-attr-template-instantiate"], - "include/clang/Sema/AttrTemplateInstantiate.inc", - ), - ], + tbl_outs = { + "include/clang/Sema/AttrParsedAttrImpl.inc": ["-gen-clang-attr-parsed-attr-impl"], + "include/clang/Sema/AttrParsedAttrKinds.inc": ["-gen-clang-attr-parsed-attr-kinds"], + "include/clang/Sema/AttrSpellingListIndex.inc": ["-gen-clang-attr-spelling-index"], + "include/clang/Sema/AttrTemplateInstantiate.inc": ["-gen-clang-attr-template-instantiate"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/Attr.td", deps = [":BasicCoreTdFiles"], @@ -1204,16 +998,10 @@ cc_library( gentbl_cc_library( name = "parse_attr_gen", - tbl_outs = [ - ( - ["-gen-clang-attr-parser-string-switches"], - "include/clang/Parse/AttrParserStringSwitches.inc", - ), - ( - 
["-gen-clang-attr-subject-match-rules-parser-string-switches"], - "include/clang/Parse/AttrSubMatchRulesParserStringSwitches.inc", - ), - ], + tbl_outs = { + "include/clang/Parse/AttrParserStringSwitches.inc": ["-gen-clang-attr-parser-string-switches"], + "include/clang/Parse/AttrSubMatchRulesParserStringSwitches.inc": ["-gen-clang-attr-subject-match-rules-parser-string-switches"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/Attr.td", deps = [":BasicCoreTdFiles"], @@ -1405,16 +1193,10 @@ td_library( gentbl_cc_library( name = "tooling_syntax_gen", - tbl_outs = [ - ( - ["-gen-clang-syntax-node-list"], - "include/clang/Tooling/Syntax/Nodes.inc", - ), - ( - ["-gen-clang-syntax-node-classes"], - "include/clang/Tooling/Syntax/NodeClasses.inc", - ), - ], + tbl_outs = { + "include/clang/Tooling/Syntax/Nodes.inc": ["-gen-clang-syntax-node-list"], + "include/clang/Tooling/Syntax/NodeClasses.inc": ["-gen-clang-syntax-node-classes"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/Tooling/Syntax/Nodes.td", deps = [":ToolingSyntaxTdFiles"], @@ -1584,10 +1366,7 @@ td_library( gentbl_cc_library( name = "static_analyzer_checkers_gen", - tbl_outs = [( - ["-gen-clang-sa-checkers"], - "include/clang/StaticAnalyzer/Checkers/Checkers.inc", - )], + tbl_outs = {"include/clang/StaticAnalyzer/Checkers/Checkers.inc": ["-gen-clang-sa-checkers"]}, tblgen = ":clang-tblgen", td_file = "include/clang/StaticAnalyzer/Checkers/Checkers.td", deps = [":CheckerBaseTdFiles"], @@ -1621,10 +1400,7 @@ cc_library( gentbl_cc_library( name = "driver_options_inc_gen", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "include/clang/Driver/Options.inc", - )], + tbl_outs = {"include/clang/Driver/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", td_file = "include/clang/Driver/Options.td", deps = ["//llvm:OptParserTdFiles"], @@ -1685,10 +1461,7 @@ cc_library( gentbl_cc_library( name = "headers_arm_neon_gen", - tbl_outs = [( - ["-gen-arm-neon"], - "lib/Headers/arm_neon.h", - )], + tbl_outs = {"lib/Headers/arm_neon.h": ["-gen-arm-neon"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_neon.td", deps = [":ArmTdFiles"], @@ -1696,10 +1469,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_arm_fp16_gen", - tbl_outs = [( - ["-gen-arm-fp16"], - "lib/Headers/arm_fp16.h", - )], + tbl_outs = {"lib/Headers/arm_fp16.h": ["-gen-arm-fp16"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_fp16.td", deps = [":ArmTdFiles"], @@ -1707,10 +1477,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_arm_mve_gen", - tbl_outs = [( - ["-gen-arm-mve-header"], - "lib/Headers/arm_mve.h", - )], + tbl_outs = {"lib/Headers/arm_mve.h": ["-gen-arm-mve-header"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_mve.td", deps = [":ArmTdFiles"], @@ -1718,10 +1485,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_arm_cde_gen", - tbl_outs = [( - ["-gen-arm-cde-header"], - "lib/Headers/arm_cde.h", - )], + tbl_outs = {"lib/Headers/arm_cde.h": ["-gen-arm-cde-header"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_cde.td", deps = [":ArmTdFiles"], @@ -1729,10 +1493,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_arm_sve_gen", - tbl_outs = [( - ["-gen-arm-sve-header"], - "lib/Headers/arm_sve.h", - )], + tbl_outs = {"lib/Headers/arm_sve.h": ["-gen-arm-sve-header"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sve.td", deps = [":ArmTdFiles"], @@ -1740,10 +1501,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_arm_bf16_gen", - 
tbl_outs = [( - ["-gen-arm-bf16"], - "lib/Headers/arm_bf16.h", - )], + tbl_outs = {"lib/Headers/arm_bf16.h": ["-gen-arm-bf16"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_bf16.td", deps = [":ArmTdFiles"], @@ -1751,10 +1509,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_arm_sme_gen", - tbl_outs = [( - ["-gen-arm-sme-header"], - "lib/Headers/arm_sme.h", - )], + tbl_outs = {"lib/Headers/arm_sme.h": ["-gen-arm-sme-header"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_sme.td", deps = [":ArmTdFiles"], @@ -1762,10 +1517,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_arm_vector_type_gen", - tbl_outs = [( - ["-gen-arm-vector-type"], - "lib/Headers/arm_vector_types.h", - )], + tbl_outs = {"lib/Headers/arm_vector_types.h": ["-gen-arm-vector-type"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/arm_neon.td", deps = [":ArmTdFiles"], @@ -1773,10 +1525,7 @@ gentbl_cc_library( gentbl_cc_library( name = "headers_riscv_vector", - tbl_outs = [( - ["-gen-riscv-vector-header"], - "lib/Headers/riscv_vector.h", - )], + tbl_outs = {"lib/Headers/riscv_vector.h": ["-gen-riscv-vector-header"]}, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/riscv_vector.td", deps = [":RiscvTdFiles"], @@ -2031,16 +1780,10 @@ cc_library( gentbl_cc_library( name = "serialization_attr_gen", - tbl_outs = [ - ( - ["-gen-clang-attr-pch-read"], - "include/clang/Serialization/AttrPCHRead.inc", - ), - ( - ["-gen-clang-attr-pch-write"], - "include/clang/Serialization/AttrPCHWrite.inc", - ), - ], + tbl_outs = { + "include/clang/Serialization/AttrPCHRead.inc": ["-gen-clang-attr-pch-read"], + "include/clang/Serialization/AttrPCHWrite.inc": ["-gen-clang-attr-pch-write"], + }, tblgen = ":clang-tblgen", td_file = "include/clang/Basic/Attr.td", deps = [":BasicCoreTdFiles"], @@ -2455,10 +2198,7 @@ cc_binary( gentbl_cc_library( name = "linker_wrapper_opts_gen", strip_include_prefix = "tools/clang-linker-wrapper", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/clang-linker-wrapper/LinkerWrapperOpts.inc", - )], + tbl_outs = {"tools/clang-linker-wrapper/LinkerWrapperOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", td_file = "tools/clang-linker-wrapper/LinkerWrapperOpts.td", deps = ["//llvm:OptParserTdFiles"], @@ -2554,10 +2294,7 @@ cc_binary( gentbl_cc_library( name = "ScanDepsTableGen", strip_include_prefix = "tools/clang-scan-deps", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/clang-scan-deps/Opts.inc", - )], + tbl_outs = {"tools/clang-scan-deps/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", td_file = "tools/clang-scan-deps/Opts.td", deps = ["//llvm:OptParserTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index 0f59718782b01..06a79d0045e16 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -331,16 +331,10 @@ cc_library( gentbl_cc_library( name = "InterpreterProperties", strip_include_prefix = "source/Interpreter", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "source/Interpreter/InterpreterProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "source/Interpreter/InterpreterPropertiesEnum.inc", - ), - ], + tbl_outs = { + "source/Interpreter/InterpreterProperties.inc": ["-gen-lldb-property-defs"], + "source/Interpreter/InterpreterPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = ":lldb-tblgen", td_file = 
"source/Interpreter/InterpreterProperties.td", deps = [":CoreTdFiles"], @@ -406,12 +400,7 @@ td_library( gentbl_cc_library( name = "CommandOptions", strip_include_prefix = "source/Commands", - tbl_outs = [ - ( - ["-gen-lldb-option-defs"], - "source/Commands/CommandOptions.inc", - ), - ], + tbl_outs = {"source/Commands/CommandOptions.inc": ["-gen-lldb-option-defs"]}, tblgen = ":lldb-tblgen", td_file = "source/Commands/Options.td", deps = [":CommandsTdFiles"], @@ -595,16 +584,10 @@ td_library( gentbl_cc_library( name = "CoreProperties", strip_include_prefix = "source/Core", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "source/Core/CoreProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "source/Core/CorePropertiesEnum.inc", - ), - ], + tbl_outs = { + "source/Core/CoreProperties.inc": ["-gen-lldb-property-defs"], + "source/Core/CorePropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = ":lldb-tblgen", td_file = "source/Core/CoreProperties.td", deps = [":CoreTdFiles"], @@ -676,16 +659,10 @@ cc_library( gentbl_cc_library( name = "TargetProperties", strip_include_prefix = "source/Target", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "source/Target/TargetProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "source/Target/TargetPropertiesEnum.inc", - ), - ], + tbl_outs = { + "source/Target/TargetProperties.inc": ["-gen-lldb-property-defs"], + "source/Target/TargetPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = ":lldb-tblgen", td_file = "source/Target/TargetProperties.td", deps = [":CoreTdFiles"], @@ -832,10 +809,7 @@ cc_import( gentbl_cc_library( name = "lldb_options_inc_gen", strip_include_prefix = ".", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "Options.inc", - )], + tbl_outs = {"Options.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", td_file = "tools/driver/Options.td", deps = ["//llvm:OptParserTdFiles"], @@ -981,10 +955,7 @@ cc_binary( gentbl_cc_library( name = "lldb_server_opts_gen", strip_include_prefix = ".", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "LLGSOptions.inc", - )], + tbl_outs = {"LLGSOptions.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", td_file = "tools/lldb-server/LLGSOptions.td", deps = ["//llvm:OptParserTdFiles"], @@ -1051,10 +1022,7 @@ expand_template( gentbl_cc_library( name = "lldb_dap_opts_gen", strip_include_prefix = "tools/lldb-dap", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/lldb-dap/Options.inc", - )], + tbl_outs = {"tools/lldb-dap/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", td_file = "tools/lldb-dap/Options.td", deps = ["//llvm:OptParserTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 7d0a4112ac312..a4b51463a9241 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -215,16 +215,10 @@ cc_library( gentbl_cc_library( name = "PlatformMacOSXProperties", strip_include_prefix = "Platform/MacOSX", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "Platform/MacOSX/PlatformMacOSXProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "Platform/MacOSX/PlatformMacOSXPropertiesEnum.inc", - ), - ], + tbl_outs = { + "Platform/MacOSX/PlatformMacOSXProperties.inc": ["-gen-lldb-property-defs"], + "Platform/MacOSX/PlatformMacOSXPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", 
td_file = "Platform/MacOSX/PlatformMacOSXProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -301,16 +295,10 @@ cc_library( gentbl_cc_library( name = "SymbolFileDWARFProperties", strip_include_prefix = "SymbolFile/DWARF", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "SymbolFile/DWARF/SymbolFileDWARFProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "SymbolFile/DWARF/SymbolFileDWARFPropertiesEnum.inc", - ), - ], + tbl_outs = { + "SymbolFile/DWARF/SymbolFileDWARFProperties.inc": ["-gen-lldb-property-defs"], + "SymbolFile/DWARF/SymbolFileDWARFPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "SymbolFile/DWARF/SymbolFileDWARFProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -454,16 +442,10 @@ cc_library( gentbl_cc_library( name = "ProcessGDBRemoteProperties", strip_include_prefix = "Process/gdb-remote", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "Process/gdb-remote/ProcessGDBRemoteProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "Process/gdb-remote/ProcessGDBRemotePropertiesEnum.inc", - ), - ], + tbl_outs = { + "Process/gdb-remote/ProcessGDBRemoteProperties.inc": ["-gen-lldb-property-defs"], + "Process/gdb-remote/ProcessGDBRemotePropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "Process/gdb-remote/ProcessGDBRemoteProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -548,16 +530,10 @@ cc_library( gentbl_cc_library( name = "StructuredDataDarwinLogProperties", strip_include_prefix = "StructuredData/DarwinLog", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "StructuredData/DarwinLog/StructuredDataDarwinLogProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "StructuredData/DarwinLog/StructuredDataDarwinLogPropertiesEnum.inc", - ), - ], + tbl_outs = { + "StructuredData/DarwinLog/StructuredDataDarwinLogProperties.inc": ["-gen-lldb-property-defs"], + "StructuredData/DarwinLog/StructuredDataDarwinLogPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "StructuredData/DarwinLog/StructuredDataDarwinLogProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -617,16 +593,10 @@ cc_library( gentbl_cc_library( name = "PlatformQemuUserProperties", strip_include_prefix = "Platform/QemuUser", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "Platform/QemuUser/PlatformQemuUserProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "Platform/QemuUser/PlatformQemuUserPropertiesEnum.inc", - ), - ], + tbl_outs = { + "Platform/QemuUser/PlatformQemuUserProperties.inc": ["-gen-lldb-property-defs"], + "Platform/QemuUser/PlatformQemuUserPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "Platform/QemuUser/PlatformQemuUserProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -690,16 +660,10 @@ cc_library( gentbl_cc_library( name = "PlatformAndroidProperties", strip_include_prefix = "Platform/Android", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "Platform/Android/PlatformAndroidProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "Platform/Android/PlatformAndroidPropertiesEnum.inc", - ), - ], + tbl_outs = { + "Platform/Android/PlatformAndroidProperties.inc": ["-gen-lldb-property-defs"], + "Platform/Android/PlatformAndroidPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "Platform/Android/PlatformAndroidProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -902,10 +866,7 @@ cc_library( gentbl_cc_library( 
name = "TraceExporterCTFOptions", strip_include_prefix = "TraceExporter/ctf", - tbl_outs = [( - ["-gen-lldb-option-defs"], - "TraceExporter/ctf/TraceExporterCTFCommandOptions.inc", - )], + tbl_outs = {"TraceExporter/ctf/TraceExporterCTFCommandOptions.inc": ["-gen-lldb-option-defs"]}, tblgen = "//lldb:lldb-tblgen", td_file = "TraceExporter/ctf/TraceExporterCTFOptions.td", deps = [ @@ -1228,16 +1189,10 @@ cc_library( gentbl_cc_library( name = "DynamicLoaderMacOSXDYLDProperties", strip_include_prefix = "DynamicLoader/MacOSX-DYLD", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinPropertiesEnum.inc", - ), - ], + tbl_outs = { + "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.inc": ["-gen-lldb-property-defs"], + "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -1270,16 +1225,10 @@ cc_library( gentbl_cc_library( name = "DynamicLoaderDarwinKernelProperties", strip_include_prefix = "DynamicLoader/Darwin-Kernel", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelPropertiesEnum.inc", - ), - ], + tbl_outs = { + "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelProperties.inc": ["-gen-lldb-property-defs"], + "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernelProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -1692,16 +1641,10 @@ cc_library( gentbl_cc_library( name = "JITLoaderGDBProperties", strip_include_prefix = "JITLoader/GDB", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "JITLoader/GDB/JITLoaderGDBProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "JITLoader/GDB/JITLoaderGDBPropertiesEnum.inc", - ), - ], + tbl_outs = { + "JITLoader/GDB/JITLoaderGDBProperties.inc": ["-gen-lldb-property-defs"], + "JITLoader/GDB/JITLoaderGDBPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "JITLoader/GDB/JITLoaderGDBProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -1748,16 +1691,10 @@ cc_library( gentbl_cc_library( name = "SymbolLocatorDebuginfodProperties", strip_include_prefix = "SymbolLocator/Debuginfod", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodPropertiesEnum.inc", - ), - ], + tbl_outs = { + "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.inc": ["-gen-lldb-property-defs"], + "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -2014,16 +1951,10 @@ cc_library( gentbl_cc_library( name = "ObjectFilePECOFFProperties", strip_include_prefix = "ObjectFile/PECOFF", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "ObjectFile/PECOFF/ObjectFilePECOFFProperties.inc", - ), - ( - 
["-gen-lldb-property-enum-defs"], - "ObjectFile/PECOFF/ObjectFilePECOFFPropertiesEnum.inc", - ), - ], + tbl_outs = { + "ObjectFile/PECOFF/ObjectFilePECOFFProperties.inc": ["-gen-lldb-property-defs"], + "ObjectFile/PECOFF/ObjectFilePECOFFPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "ObjectFile/PECOFF/ObjectFilePECOFFProperties.td", deps = ["//lldb:CoreTdFiles"], @@ -2225,16 +2156,10 @@ cc_library( gentbl_cc_library( name = "ProcessKDPProperties", strip_include_prefix = "Process/MacOSX-Kernel", - tbl_outs = [ - ( - ["-gen-lldb-property-defs"], - "Process/MacOSX-Kernel/ProcessKDPProperties.inc", - ), - ( - ["-gen-lldb-property-enum-defs"], - "Process/MacOSX-Kernel/ProcessKDPPropertiesEnum.inc", - ), - ], + tbl_outs = { + "Process/MacOSX-Kernel/ProcessKDPProperties.inc": ["-gen-lldb-property-defs"], + "Process/MacOSX-Kernel/ProcessKDPPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], + }, tblgen = "//lldb:lldb-tblgen", td_file = "Process/MacOSX-Kernel/ProcessKDPProperties.td", deps = ["//lldb:CoreTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 44fb4357c1e1f..7431ff306b4d7 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -740,10 +740,7 @@ cc_binary( gentbl_cc_library( name = "intrinsic_enums_gen", - tbl_outs = [( - ["-gen-intrinsic-enums"], - "include/llvm/IR/IntrinsicEnums.inc", - )], + tbl_outs = {"include/llvm/IR/IntrinsicEnums.inc": ["-gen-intrinsic-enums"]}, tblgen = ":llvm-min-tblgen", td_file = "include/llvm/IR/Intrinsics.td", deps = [":CommonTargetTdFiles"], @@ -751,10 +748,7 @@ gentbl_cc_library( gentbl_cc_library( name = "intrinsics_impl_gen", - tbl_outs = [( - ["-gen-intrinsic-impl"], - "include/llvm/IR/IntrinsicImpl.inc", - )], + tbl_outs = {"include/llvm/IR/IntrinsicImpl.inc": ["-gen-intrinsic-impl"]}, tblgen = ":llvm-min-tblgen", td_file = "include/llvm/IR/Intrinsics.td", deps = [":CommonTargetTdFiles"], @@ -762,10 +756,7 @@ gentbl_cc_library( gentbl_cc_library( name = "vt_gen", - tbl_outs = [( - ["-gen-vt"], - "include/llvm/CodeGen/GenVT.inc", - )], + tbl_outs = {"include/llvm/CodeGen/GenVT.inc": ["-gen-vt"]}, tblgen = ":llvm-min-tblgen", td_file = "include/llvm/CodeGen/ValueTypes.td", ) @@ -851,13 +842,10 @@ llvm_target_intrinsics_list = [ gentbl_cc_library( name = "intrinsic_" + target["name"] + "_gen", includes = ["include"], - tbl_outs = [( - [ - "-gen-intrinsic-enums", - "-intrinsic-prefix=" + target["intrinsic_prefix"], - ], - "include/llvm/IR/Intrinsics" + target["name"] + ".h", - )], + tbl_outs = {"include/llvm/IR/Intrinsics" + target["name"] + ".h": [ + "-gen-intrinsic-enums", + "-intrinsic-prefix=" + target["intrinsic_prefix"], + ]}, tblgen = ":llvm-min-tblgen", td_file = "include/llvm/IR/Intrinsics.td", deps = [ @@ -869,10 +857,7 @@ llvm_target_intrinsics_list = [ gentbl_cc_library( name = "attributes_gen", - tbl_outs = [( - ["-gen-attrs"], - "include/llvm/IR/Attributes.inc", - )], + tbl_outs = {"include/llvm/IR/Attributes.inc": ["-gen-attrs"]}, tblgen = ":llvm-min-tblgen", td_file = "include/llvm/IR/Attributes.td", ) @@ -1309,10 +1294,7 @@ td_library( gentbl_cc_library( name = "ARMTargetParserDefGen", - tbl_outs = [( - ["-gen-arm-target-def"], - "include/llvm/TargetParser/ARMTargetParserDef.inc", - )], + tbl_outs = {"include/llvm/TargetParser/ARMTargetParserDef.inc": ["-gen-arm-target-def"]}, tblgen = ":llvm-min-tblgen", td_file = "lib/Target/ARM/ARM.td", 
deps = [ @@ -1330,10 +1312,7 @@ td_library( gentbl_cc_library( name = "AArch64TargetParserDefGen", - tbl_outs = [( - ["-gen-arm-target-def"], - "include/llvm/TargetParser/AArch64TargetParserDef.inc", - )], + tbl_outs = {"include/llvm/TargetParser/AArch64TargetParserDef.inc": ["-gen-arm-target-def"]}, tblgen = ":llvm-min-tblgen", td_file = "lib/Target/AArch64/AArch64.td", deps = [ @@ -1351,10 +1330,7 @@ td_library( gentbl_cc_library( name = "RISCVTargetParserDefGen", - tbl_outs = [( - ["-gen-riscv-target-def"], - "include/llvm/TargetParser/RISCVTargetParserDef.inc", - )], + tbl_outs = {"include/llvm/TargetParser/RISCVTargetParserDef.inc": ["-gen-riscv-target-def"]}, tblgen = ":llvm-min-tblgen", td_file = "lib/Target/RISCV/RISCV.td", deps = [ @@ -1443,10 +1419,7 @@ td_library( gentbl_cc_library( name = "InstCombineTableGen", strip_include_prefix = "lib/Target/AMDGPU", - tbl_outs = [( - ["-gen-searchable-tables"], - "lib/Target/AMDGPU/InstCombineTables.inc", - )], + tbl_outs = {"lib/Target/AMDGPU/InstCombineTables.inc": ["-gen-searchable-tables"]}, tblgen = ":llvm-tblgen", td_file = "lib/Target/AMDGPU/InstCombineTables.td", deps = [ @@ -3138,33 +3111,21 @@ cc_library( gentbl_cc_library( name = "amdgpu_isel_target_gen", strip_include_prefix = "lib/Target/AMDGPU", - tbl_outs = [ - ( - ["-gen-global-isel"], - "lib/Target/AMDGPU/AMDGPUGenGlobalISel.inc", - ), - ( - [ - "-gen-global-isel-combiner", - "-combiners=AMDGPUPreLegalizerCombiner", - ], - "lib/Target/AMDGPU/AMDGPUGenPreLegalizeGICombiner.inc", - ), - ( - [ - "-gen-global-isel-combiner", - "-combiners=AMDGPUPostLegalizerCombiner", - ], - "lib/Target/AMDGPU/AMDGPUGenPostLegalizeGICombiner.inc", - ), - ( - [ - "-gen-global-isel-combiner", - "-combiners=AMDGPURegBankCombiner", - ], - "lib/Target/AMDGPU/AMDGPUGenRegBankGICombiner.inc", - ), - ], + tbl_outs = { + "lib/Target/AMDGPU/AMDGPUGenGlobalISel.inc": ["-gen-global-isel"], + "lib/Target/AMDGPU/AMDGPUGenPreLegalizeGICombiner.inc": [ + "-gen-global-isel-combiner", + "-combiners=AMDGPUPreLegalizerCombiner", + ], + "lib/Target/AMDGPU/AMDGPUGenPostLegalizeGICombiner.inc": [ + "-gen-global-isel-combiner", + "-combiners=AMDGPUPostLegalizerCombiner", + ], + "lib/Target/AMDGPU/AMDGPUGenRegBankGICombiner.inc": [ + "-gen-global-isel-combiner", + "-combiners=AMDGPURegBankCombiner", + ], + }, tblgen = ":llvm-tblgen", td_file = "lib/Target/AMDGPU/AMDGPUGISel.td", deps = [ @@ -3176,40 +3137,16 @@ gentbl_cc_library( gentbl_cc_library( name = "r600_target_gen", strip_include_prefix = "lib/Target/AMDGPU", - tbl_outs = [ - ( - ["-gen-asm-writer"], - "lib/Target/AMDGPU/R600GenAsmWriter.inc", - ), - ( - ["-gen-callingconv"], - "lib/Target/AMDGPU/R600GenCallingConv.inc", - ), - ( - ["-gen-dag-isel"], - "lib/Target/AMDGPU/R600GenDAGISel.inc", - ), - ( - ["-gen-dfa-packetizer"], - "lib/Target/AMDGPU/R600GenDFAPacketizer.inc", - ), - ( - ["-gen-instr-info"], - "lib/Target/AMDGPU/R600GenInstrInfo.inc", - ), - ( - ["-gen-emitter"], - "lib/Target/AMDGPU/R600GenMCCodeEmitter.inc", - ), - ( - ["-gen-register-info"], - "lib/Target/AMDGPU/R600GenRegisterInfo.inc", - ), - ( - ["-gen-subtarget"], - "lib/Target/AMDGPU/R600GenSubtargetInfo.inc", - ), - ], + tbl_outs = { + "lib/Target/AMDGPU/R600GenAsmWriter.inc": ["-gen-asm-writer"], + "lib/Target/AMDGPU/R600GenCallingConv.inc": ["-gen-callingconv"], + "lib/Target/AMDGPU/R600GenDAGISel.inc": ["-gen-dag-isel"], + "lib/Target/AMDGPU/R600GenDFAPacketizer.inc": ["-gen-dfa-packetizer"], + "lib/Target/AMDGPU/R600GenInstrInfo.inc": ["-gen-instr-info"], + 
"lib/Target/AMDGPU/R600GenMCCodeEmitter.inc": ["-gen-emitter"], + "lib/Target/AMDGPU/R600GenRegisterInfo.inc": ["-gen-register-info"], + "lib/Target/AMDGPU/R600GenSubtargetInfo.inc": ["-gen-subtarget"], + }, tblgen = ":llvm-tblgen", td_file = "lib/Target/AMDGPU/R600.td", deps = [ @@ -3221,33 +3158,21 @@ gentbl_cc_library( gentbl_cc_library( name = "riscv_isel_target_gen", strip_include_prefix = "lib/Target/RISCV", - tbl_outs = [ - ( - ["-gen-global-isel"], - "lib/Target/RISCV/RISCVGenGlobalISel.inc", - ), - ( - [ - "-gen-global-isel-combiner", - "-combiners=RISCVO0PreLegalizerCombiner", - ], - "lib/Target/RISCV/RISCVGenO0PreLegalizeGICombiner.inc", - ), - ( - [ - "-gen-global-isel-combiner", - "-combiners=RISCVPostLegalizerCombiner", - ], - "lib/Target/RISCV/RISCVGenPostLegalizeGICombiner.inc", - ), - ( - [ - "-gen-global-isel-combiner", - "-combiners=RISCVPreLegalizerCombiner", - ], - "lib/Target/RISCV/RISCVGenPreLegalizeGICombiner.inc", - ), - ], + tbl_outs = { + "lib/Target/RISCV/RISCVGenGlobalISel.inc": ["-gen-global-isel"], + "lib/Target/RISCV/RISCVGenO0PreLegalizeGICombiner.inc": [ + "-gen-global-isel-combiner", + "-combiners=RISCVO0PreLegalizerCombiner", + ], + "lib/Target/RISCV/RISCVGenPostLegalizeGICombiner.inc": [ + "-gen-global-isel-combiner", + "-combiners=RISCVPostLegalizerCombiner", + ], + "lib/Target/RISCV/RISCVGenPreLegalizeGICombiner.inc": [ + "-gen-global-isel-combiner", + "-combiners=RISCVPreLegalizerCombiner", + ], + }, tblgen = ":llvm-tblgen", td_file = "lib/Target/RISCV/RISCVGISel.td", deps = [ @@ -3692,10 +3617,7 @@ cc_library( gentbl_cc_library( name = "JITLinkTableGen", strip_include_prefix = "lib/ExecutionEngine/JITLink", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "lib/ExecutionEngine/JITLink/COFFOptions.inc", - )], + tbl_outs = {"lib/ExecutionEngine/JITLink/COFFOptions.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "lib/ExecutionEngine/JITLink/COFFOptions.td", deps = [":OptParserTdFiles"], @@ -3935,10 +3857,7 @@ cc_library( gentbl_cc_library( name = "DllOptionsTableGen", strip_include_prefix = "lib/ToolDrivers/llvm-dlltool", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "lib/ToolDrivers/llvm-dlltool/Options.inc", - )], + tbl_outs = {"lib/ToolDrivers/llvm-dlltool/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "lib/ToolDrivers/llvm-dlltool/Options.td", deps = [":OptParserTdFiles"], @@ -3961,10 +3880,7 @@ cc_library( gentbl_cc_library( name = "LibOptionsTableGen", strip_include_prefix = "lib/ToolDrivers/llvm-lib", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "lib/ToolDrivers/llvm-lib/Options.inc", - )], + tbl_outs = {"lib/ToolDrivers/llvm-lib/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "lib/ToolDrivers/llvm-lib/Options.td", deps = [":OptParserTdFiles"], @@ -4188,10 +4104,7 @@ cc_library( gentbl_cc_library( name = "DsymutilTableGen", strip_include_prefix = "tools/dsymutil", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/dsymutil/Options.inc", - )], + tbl_outs = {"tools/dsymutil/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/dsymutil/Options.td", deps = [":OptParserTdFiles"], @@ -4400,10 +4313,7 @@ cc_binary( gentbl_cc_library( name = "CGDataOptsTableGen", strip_include_prefix = "tools/llvm-cgdata", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-cgdata/Opts.inc", - )], + tbl_outs = {"tools/llvm-cgdata/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-cgdata/Opts.td", deps = [":OptParserTdFiles"], 
@@ -4477,10 +4387,7 @@ cc_binary( gentbl_cc_library( name = "CvtResTableGen", strip_include_prefix = "tools/llvm-cvtres", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-cvtres/Opts.inc", - )], + tbl_outs = {"tools/llvm-cvtres/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-cvtres/Opts.td", deps = [":OptParserTdFiles"], @@ -4535,10 +4442,7 @@ cc_binary( gentbl_cc_library( name = "CxxfiltOptsTableGen", strip_include_prefix = "tools/llvm-cxxfilt", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-cxxfilt/Opts.inc", - )], + tbl_outs = {"tools/llvm-cxxfilt/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-cxxfilt/Opts.td", deps = [":OptParserTdFiles"], @@ -4582,10 +4486,7 @@ cc_binary( gentbl_cc_library( name = "DebugInfodFindOptsTableGen", strip_include_prefix = "tools/llvm-debuginfod-find", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-debuginfod-find/Opts.inc", - )], + tbl_outs = {"tools/llvm-debuginfod-find/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-debuginfod-find/Opts.td", deps = [":OptParserTdFiles"], @@ -4652,10 +4553,7 @@ cc_binary( gentbl_cc_library( name = "DwarfutilOptionsTableGen", strip_include_prefix = "tools/llvm-dwarfutil", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-dwarfutil/Options.inc", - )], + tbl_outs = {"tools/llvm-dwarfutil/Options.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-dwarfutil/Options.td", deps = [":OptParserTdFiles"], @@ -4690,10 +4588,7 @@ cc_binary( gentbl_cc_library( name = "DwpOptionsTableGen", strip_include_prefix = "tools/llvm-dwp", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-dwp/Opts.inc", - )], + tbl_outs = {"tools/llvm-dwp/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-dwp/Opts.td", deps = [":OptParserTdFiles"], @@ -4764,10 +4659,7 @@ cc_binary( gentbl_cc_library( name = "GSYMUtilOptionsTableGen", strip_include_prefix = "tools/llvm-gsymutil", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-gsymutil/Opts.inc", - )], + tbl_outs = {"tools/llvm-gsymutil/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-gsymutil/Opts.td", deps = [":OptParserTdFiles"], @@ -4801,10 +4693,7 @@ llvm_driver_cc_binary( gentbl_cc_library( name = "IfsOptionsTableGen", strip_include_prefix = "tools/llvm-ifs", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-ifs/Opts.inc", - )], + tbl_outs = {"tools/llvm-ifs/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-ifs/Opts.td", deps = [":OptParserTdFiles"], @@ -4884,10 +4773,7 @@ cc_binary( gentbl_cc_library( name = "LibtoolDarwinOptionsTableGen", strip_include_prefix = "tools/llvm-libtool-darwin", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-libtool-darwin/Opts.inc", - )], + tbl_outs = {"tools/llvm-libtool-darwin/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-libtool-darwin/Opts.td", deps = [":OptParserTdFiles"], @@ -4945,10 +4831,7 @@ cc_binary( gentbl_cc_library( name = "LipoOptsTableGen", strip_include_prefix = "tools/llvm-lipo", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-lipo/LipoOpts.inc", - )], + tbl_outs = {"tools/llvm-lipo/LipoOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-lipo/LipoOpts.td", deps = [":OptParserTdFiles"], @@ -5064,10 +4947,7 @@ cc_binary( gentbl_cc_library( name = "MlTableGen", 
strip_include_prefix = "tools/llvm-ml", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-ml/Opts.inc", - )], + tbl_outs = {"tools/llvm-ml/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-ml/Opts.td", deps = [":OptParserTdFiles"], @@ -5124,10 +5004,7 @@ cc_binary( gentbl_cc_library( name = "MtTableGen", strip_include_prefix = "tools/llvm-mt", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-mt/Opts.inc", - )], + tbl_outs = {"tools/llvm-mt/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-mt/Opts.td", deps = [":OptParserTdFiles"], @@ -5155,10 +5032,7 @@ llvm_driver_cc_binary( gentbl_cc_library( name = "NmOptsTableGen", strip_include_prefix = "tools/llvm-nm", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-nm/Opts.inc", - )], + tbl_outs = {"tools/llvm-nm/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-nm/Opts.td", deps = [":OptParserTdFiles"], @@ -5198,10 +5072,7 @@ td_library( gentbl_cc_library( name = "llvm-objcopy-opts", strip_include_prefix = "tools/llvm-objcopy", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-objcopy/ObjcopyOpts.inc", - )], + tbl_outs = {"tools/llvm-objcopy/ObjcopyOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-objcopy/ObjcopyOpts.td", deps = [ @@ -5213,10 +5084,7 @@ gentbl_cc_library( gentbl_cc_library( name = "llvm-installnametool-opts", strip_include_prefix = "tools/llvm-objcopy", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-objcopy/InstallNameToolOpts.inc", - )], + tbl_outs = {"tools/llvm-objcopy/InstallNameToolOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-objcopy/InstallNameToolOpts.td", deps = [ @@ -5228,10 +5096,7 @@ gentbl_cc_library( gentbl_cc_library( name = "llvm-strip-opts", strip_include_prefix = "tools/llvm-objcopy", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-objcopy/StripOpts.inc", - )], + tbl_outs = {"tools/llvm-objcopy/StripOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-objcopy/StripOpts.td", deps = [ @@ -5243,10 +5108,7 @@ gentbl_cc_library( gentbl_cc_library( name = "llvm-bitcode-strip-opts", strip_include_prefix = "tools/llvm-objcopy", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-objcopy/BitcodeStripOpts.inc", - )], + tbl_outs = {"tools/llvm-objcopy/BitcodeStripOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-objcopy/BitcodeStripOpts.td", deps = [ @@ -5353,10 +5215,7 @@ llvm_driver_cc_binary( gentbl_cc_library( name = "ObjdumpOptsTableGen", strip_include_prefix = "tools/llvm-objdump", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-objdump/ObjdumpOpts.inc", - )], + tbl_outs = {"tools/llvm-objdump/ObjdumpOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-objdump/ObjdumpOpts.td", deps = [":OptParserTdFiles"], @@ -5370,10 +5229,7 @@ binary_alias( gentbl_cc_library( name = "OtoolOptsTableGen", strip_include_prefix = "tools/llvm-objdump", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-objdump/OtoolOpts.inc", - )], + tbl_outs = {"tools/llvm-objdump/OtoolOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-objdump/OtoolOpts.td", deps = [":OptParserTdFiles"], @@ -5462,10 +5318,7 @@ cc_binary( gentbl_cc_library( name = "RcTableGen", strip_include_prefix = "tools/llvm-rc", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-rc/Opts.inc", 
- )], + tbl_outs = {"tools/llvm-rc/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-rc/Opts.td", deps = [":OptParserTdFiles"], @@ -5474,10 +5327,7 @@ gentbl_cc_library( gentbl_cc_library( name = "WindresTableGen", strip_include_prefix = "tools/llvm-rc", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-rc/WindresOpts.inc", - )], + tbl_outs = {"tools/llvm-rc/WindresOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-rc/WindresOpts.td", deps = [":OptParserTdFiles"], @@ -5522,10 +5372,7 @@ binary_alias( gentbl_cc_library( name = "ReadobjOptsTableGen", strip_include_prefix = "tools/llvm-readobj", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-readobj/Opts.inc", - )], + tbl_outs = {"tools/llvm-readobj/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-readobj/Opts.td", deps = [":OptParserTdFiles"], @@ -5625,10 +5472,7 @@ cc_binary( gentbl_cc_library( name = "SizeOptsTableGen", strip_include_prefix = "tools/llvm-size", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-size/Opts.inc", - )], + tbl_outs = {"tools/llvm-size/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-size/Opts.td", deps = [":OptParserTdFiles"], @@ -5677,10 +5521,7 @@ cc_binary( gentbl_cc_library( name = "StringsOptsTableGen", strip_include_prefix = "tools/llvm-strings", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-strings/Opts.inc", - )], + tbl_outs = {"tools/llvm-strings/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-strings/Opts.td", deps = [":OptParserTdFiles"], @@ -5704,10 +5545,7 @@ cc_binary( gentbl_cc_library( name = "SymbolizerOptsTableGen", strip_include_prefix = "tools/llvm-symbolizer", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-symbolizer/Opts.inc", - )], + tbl_outs = {"tools/llvm-symbolizer/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-symbolizer/Opts.td", deps = [":OptParserTdFiles"], @@ -5818,10 +5656,7 @@ cc_binary( gentbl_cc_library( name = "SancovOptsTableGen", strip_include_prefix = "tools/sancov", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/sancov/Opts.inc", - )], + tbl_outs = {"tools/sancov/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/sancov/Opts.td", deps = [":OptParserTdFiles"], @@ -6212,10 +6047,7 @@ cc_binary( gentbl_cc_library( name = "ReadTAPIOptsTableGen", strip_include_prefix = "tools/llvm-readtapi", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-readtapi/TapiOpts.inc", - )], + tbl_outs = {"tools/llvm-readtapi/TapiOpts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-readtapi/TapiOpts.td", deps = [":OptParserTdFiles"], @@ -6244,10 +6076,7 @@ cc_binary( gentbl_cc_library( name = "TLICheckerOptsTableGen", strip_include_prefix = "tools/llvm-tli-checker", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "tools/llvm-tli-checker/Opts.inc", - )], + tbl_outs = {"tools/llvm-tli-checker/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = ":llvm-tblgen", td_file = "tools/llvm-tli-checker/Opts.td", deps = [":OptParserTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel index 561f2b8f408f0..3b778fc90fe1a 100644 --- a/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/unittests/BUILD.bazel @@ -563,10 +563,7 @@ cc_test( 
gentbl_cc_library( name = "option_tests_gen", strip_include_prefix = "Option", - tbl_outs = [( - ["-gen-opt-parser-defs"], - "Option/Opts.inc", - )], + tbl_outs = {"Option/Opts.inc": ["-gen-opt-parser-defs"]}, tblgen = "//llvm:llvm-tblgen", td_file = "Option/Opts.td", deps = ["//llvm:OptParserTdFiles"], @@ -575,16 +572,10 @@ gentbl_cc_library( gentbl_cc_library( name = "automata_gen", strip_include_prefix = "TableGen", - tbl_outs = [ - ( - ["-gen-automata"], - "TableGen/AutomataAutomata.inc", - ), - ( - ["-gen-searchable-tables"], - "TableGen/AutomataTables.inc", - ), - ], + tbl_outs = { + "TableGen/AutomataAutomata.inc": ["-gen-automata"], + "TableGen/AutomataTables.inc": ["-gen-searchable-tables"], + }, tblgen = "//llvm:llvm-tblgen", td_file = "TableGen/Automata.td", deps = ["//llvm:CommonTargetTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index e0f072d769f31..141986392917e 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -73,16 +73,10 @@ exports_files(glob(["include/**/*.td"])) [ gentbl_cc_library( name = name + "IncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/IR/" + name + ".h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/IR/" + name + ".cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/" + name + ".h.inc": ["-gen-op-interface-decls"], + "include/mlir/IR/" + name + ".cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/" + name + ".td", deps = [":OpBaseTdFiles"], @@ -95,32 +89,14 @@ exports_files(glob(["include/**/*.td"])) gentbl_cc_library( name = "OpAsmInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-attr-interface-decls"], - "include/mlir/IR/OpAsmAttrInterface.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "include/mlir/IR/OpAsmAttrInterface.cpp.inc", - ), - ( - ["-gen-op-interface-decls"], - "include/mlir/IR/OpAsmOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/IR/OpAsmOpInterface.cpp.inc", - ), - ( - ["-gen-type-interface-decls"], - "include/mlir/IR/OpAsmTypeInterface.h.inc", - ), - ( - ["-gen-type-interface-defs"], - "include/mlir/IR/OpAsmTypeInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/OpAsmAttrInterface.h.inc": ["-gen-attr-interface-decls"], + "include/mlir/IR/OpAsmAttrInterface.cpp.inc": ["-gen-attr-interface-defs"], + "include/mlir/IR/OpAsmOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/IR/OpAsmOpInterface.cpp.inc": ["-gen-op-interface-defs"], + "include/mlir/IR/OpAsmTypeInterface.h.inc": ["-gen-type-interface-decls"], + "include/mlir/IR/OpAsmTypeInterface.cpp.inc": ["-gen-type-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/OpAsmInterface.td", deps = [":OpBaseTdFiles"], @@ -170,16 +146,10 @@ td_library( gentbl_cc_library( name = "BuiltinDialectIncGen", - tbl_outs = [ - ( - ["-gen-dialect-decls"], - "include/mlir/IR/BuiltinDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/IR/BuiltinDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/BuiltinDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/IR/BuiltinDialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinDialect.td", deps = [":BuiltinDialectTdFiles"], @@ -187,15 +157,10 @@ gentbl_cc_library( gentbl_cc_library( name = "BuiltinDialectBytecodeGen", - tbl_outs = [ - ( - [ - "-gen-bytecode", - "-bytecode-dialect=Builtin", - ], 
- "include/mlir/IR/BuiltinDialectBytecode.cpp.inc", - ), - ], + tbl_outs = {"include/mlir/IR/BuiltinDialectBytecode.cpp.inc": [ + "-gen-bytecode", + "-bytecode-dialect=Builtin", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinDialectBytecode.td", deps = [":BuiltinDialectTdFiles"], @@ -203,16 +168,10 @@ gentbl_cc_library( gentbl_cc_library( name = "BuiltinAttributesIncGen", - tbl_outs = [ - ( - ["--gen-attrdef-decls"], - "include/mlir/IR/BuiltinAttributes.h.inc", - ), - ( - ["--gen-attrdef-defs"], - "include/mlir/IR/BuiltinAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/BuiltinAttributes.h.inc": ["--gen-attrdef-decls"], + "include/mlir/IR/BuiltinAttributes.cpp.inc": ["--gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinAttributes.td", deps = [":BuiltinDialectTdFiles"], @@ -220,16 +179,10 @@ gentbl_cc_library( gentbl_cc_library( name = "BuiltinAttributeInterfacesIncGen", - tbl_outs = [ - ( - ["--gen-attr-interface-decls"], - "include/mlir/IR/BuiltinAttributeInterfaces.h.inc", - ), - ( - ["--gen-attr-interface-defs"], - "include/mlir/IR/BuiltinAttributeInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/BuiltinAttributeInterfaces.h.inc": ["--gen-attr-interface-decls"], + "include/mlir/IR/BuiltinAttributeInterfaces.cpp.inc": ["--gen-attr-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinAttributeInterfaces.td", deps = [":BuiltinDialectTdFiles"], @@ -237,16 +190,10 @@ gentbl_cc_library( gentbl_cc_library( name = "BuiltinLocationAttributesIncGen", - tbl_outs = [ - ( - ["--gen-attrdef-decls"], - "include/mlir/IR/BuiltinLocationAttributes.h.inc", - ), - ( - ["--gen-attrdef-defs"], - "include/mlir/IR/BuiltinLocationAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/BuiltinLocationAttributes.h.inc": ["--gen-attrdef-decls"], + "include/mlir/IR/BuiltinLocationAttributes.cpp.inc": ["--gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinLocationAttributes.td", deps = [":BuiltinDialectTdFiles"], @@ -254,16 +201,10 @@ gentbl_cc_library( gentbl_cc_library( name = "BuiltinOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/IR/BuiltinOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/IR/BuiltinOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/BuiltinOps.h.inc": ["-gen-op-decls"], + "include/mlir/IR/BuiltinOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinOps.td", deps = [ @@ -274,24 +215,12 @@ gentbl_cc_library( gentbl_cc_library( name = "BuiltinTypesIncGen", - tbl_outs = [ - ( - ["--gen-typedef-decls"], - "include/mlir/IR/BuiltinTypes.h.inc", - ), - ( - ["--gen-typedef-defs"], - "include/mlir/IR/BuiltinTypes.cpp.inc", - ), - ( - ["-gen-type-constraint-decls"], - "include/mlir/IR/BuiltinTypeConstraints.h.inc", - ), - ( - ["-gen-type-constraint-defs"], - "include/mlir/IR/BuiltinTypeConstraints.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/BuiltinTypes.h.inc": ["--gen-typedef-decls"], + "include/mlir/IR/BuiltinTypes.cpp.inc": ["--gen-typedef-defs"], + "include/mlir/IR/BuiltinTypeConstraints.h.inc": ["-gen-type-constraint-decls"], + "include/mlir/IR/BuiltinTypeConstraints.cpp.inc": ["-gen-type-constraint-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinTypes.td", deps = [":BuiltinDialectTdFiles"], @@ -299,16 +228,10 @@ gentbl_cc_library( gentbl_cc_library( name = "BuiltinTypeInterfacesIncGen", - tbl_outs = [ - ( - ["--gen-type-interface-decls"], - 
"include/mlir/IR/BuiltinTypeInterfaces.h.inc", - ), - ( - ["--gen-type-interface-defs"], - "include/mlir/IR/BuiltinTypeInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/IR/BuiltinTypeInterfaces.h.inc": ["--gen-type-interface-decls"], + "include/mlir/IR/BuiltinTypeInterfaces.cpp.inc": ["--gen-type-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/IR/BuiltinTypeInterfaces.td", deps = [ @@ -328,16 +251,10 @@ td_library( gentbl_cc_library( name = "FunctionInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/FunctionInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/FunctionInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/FunctionInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/FunctionInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/FunctionInterfaces.td", deps = [ @@ -375,16 +292,10 @@ td_library( gentbl_cc_library( name = "RelayoutOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/IR/RelayoutOpInterface.td", deps = [ @@ -1488,30 +1399,18 @@ td_library( gentbl_cc_library( name = "AffineOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Affine/IR/AffineOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Affine/IR/AffineOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=affine", - ], - "include/mlir/Dialect/Affine/IR/AffineOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=affine", - ], - "include/mlir/Dialect/Affine/IR/AffineOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Affine/IR/AffineOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Affine/IR/AffineOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Affine/IR/AffineOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=affine", + ], + "include/mlir/Dialect/Affine/IR/AffineOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=affine", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Affine/IR/AffineOps.td", deps = [":AffineOpsTdFiles"], @@ -1519,16 +1418,10 @@ gentbl_cc_library( gentbl_cc_library( name = "AffineMemoryOpInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td", deps = [":AffineOpsTdFiles"], @@ -1547,16 +1440,10 @@ td_library( gentbl_cc_library( name = "AffineTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - 
"include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td", deps = [ @@ -1600,56 +1487,29 @@ td_library( gentbl_cc_library( name = "AMDGPUIncGen", - tbl_outs = [ - ( - [ - "-gen-attrdef-decls", - "-dialect=amdgpu", - ], - "include/mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-dialect=amdgpu", - ], - "include/mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=amdgpu", - ], - "include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=amdgpu", - ], - "include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/AMDGPU/IR/AMDGPU.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/AMDGPU/IR/AMDGPU.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.h.inc": [ + "-gen-attrdef-decls", + "-dialect=amdgpu", + ], + "include/mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-dialect=amdgpu", + ], + "include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=amdgpu", + ], + "include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=amdgpu", + ], + "include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/AMDGPU/IR/AMDGPU.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc": ["-gen-op-defs"], + "g3doc/Dialects/AMDGPU/IR/AMDGPU.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/AMDGPU/IR/AMDGPU.td", deps = [":AMDGPUTdFiles"], @@ -1679,15 +1539,10 @@ cc_library( gentbl_cc_library( name = "AMDGPUPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=AMDGPU", - ], - "include/mlir/Dialect/AMDGPU/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/AMDGPU/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=AMDGPU", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/AMDGPU/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -1757,16 +1612,10 @@ td_library( gentbl_cc_library( name = "EmitCAttributesIncGen", - tbl_outs = [ - ( - ["--gen-attrdef-decls"], - "include/mlir/Dialect/EmitC/IR/EmitCAttributes.h.inc", - ), - ( - ["--gen-attrdef-defs"], - "include/mlir/Dialect/EmitC/IR/EmitCAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/EmitC/IR/EmitCAttributes.h.inc": ["--gen-attrdef-decls"], + "include/mlir/Dialect/EmitC/IR/EmitCAttributes.cpp.inc": ["--gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/EmitC/IR/EmitCAttributes.td", deps = [":EmitCTdFiles"], @@ -1774,46 +1623,22 @@ gentbl_cc_library( gentbl_cc_library( name = "EmitCOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=emitc", - ], - "include/mlir/Dialect/EmitC/IR/EmitCDialect.h.inc", - ), - ( - [ - 
"-gen-dialect-defs", - "-dialect=emitc", - ], - "include/mlir/Dialect/EmitC/IR/EmitCDialect.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/EmitC/IR/EmitCEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/EmitC/IR/EmitCEnums.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/EmitC/IR/EmitC.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/EmitC/IR/EmitC.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/EmitC/IR/EmitCTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/EmitC/IR/EmitCTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/EmitC/IR/EmitCDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=emitc", + ], + "include/mlir/Dialect/EmitC/IR/EmitCDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=emitc", + ], + "include/mlir/Dialect/EmitC/IR/EmitCEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/EmitC/IR/EmitCEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/EmitC/IR/EmitC.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/EmitC/IR/EmitC.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/EmitC/IR/EmitCTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/EmitC/IR/EmitCTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/EmitC/IR/EmitC.td", deps = [":EmitCTdFiles"], @@ -1821,15 +1646,10 @@ gentbl_cc_library( gentbl_cc_library( name = "EmitCPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=EmitC", - ], - "include/mlir/Dialect/EmitC/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/EmitC/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=EmitC", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/EmitC/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -1876,32 +1696,14 @@ td_library( gentbl_cc_library( name = "AsyncOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Async/IR/AsyncOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Async/IR/AsyncOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/Async/IR/AsyncOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/Async/IR/AsyncOpsDialect.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/Async/IR/AsyncOpsTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/Async/IR/AsyncOpsTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Async/IR/AsyncOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Async/IR/AsyncOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Async/IR/AsyncOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/Async/IR/AsyncOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "include/mlir/Dialect/Async/IR/AsyncOpsTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/Async/IR/AsyncOpsTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Async/IR/AsyncOps.td", deps = [":AsyncOpsTdFiles"], @@ -1909,29 +1711,20 @@ gentbl_cc_library( gentbl_cc_library( name = "AsyncPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Async", - ], - "include/mlir/Dialect/Async/Passes.h.inc", - ), - ( - [ - "-gen-pass-capi-header", - "--prefix=Async", - ], - "include/mlir/Dialect/Async/Passes.capi.h.inc", - ), - ( - [ - "-gen-pass-capi-impl", - "--prefix=Async", - ], - "include/mlir/Dialect/Async/Passes.capi.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Async/Passes.h.inc": [ + 
"-gen-pass-decls", + "-name=Async", + ], + "include/mlir/Dialect/Async/Passes.capi.h.inc": [ + "-gen-pass-capi-header", + "--prefix=Async", + ], + "include/mlir/Dialect/Async/Passes.capi.cpp.inc": [ + "-gen-pass-capi-impl", + "--prefix=Async", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Async/Passes.td", deps = [":PassBaseTdFiles"], @@ -1954,34 +1747,19 @@ td_library( gentbl_cc_library( name = "ArmNeonIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=arm_neon", - ], - "include/mlir/Dialect/ArmNeon/ArmNeonDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=arm_neon", - ], - "include/mlir/Dialect/ArmNeon/ArmNeonDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/ArmNeon/ArmNeon.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/ArmNeon/ArmNeon.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/ArmNeon/ArmNeon.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ArmNeon/ArmNeonDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=arm_neon", + ], + "include/mlir/Dialect/ArmNeon/ArmNeonDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=arm_neon", + ], + "include/mlir/Dialect/ArmNeon/ArmNeon.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/ArmNeon/ArmNeon.cpp.inc": ["-gen-op-defs"], + "g3doc/Dialects/ArmNeon/ArmNeon.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmNeon/ArmNeon.td", deps = [":ArmNeonTdFiles"], @@ -2021,12 +1799,7 @@ cc_library( gentbl_cc_library( name = "ArmNeonConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/ArmNeon/ArmNeonConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/ArmNeon/ArmNeonConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmNeon/ArmNeon.td", deps = [":ArmNeonTdFiles"], @@ -2072,23 +1845,14 @@ td_library( gentbl_cc_library( name = "ArmSMETransformsPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=ArmSME", - ], - "include/mlir/Dialect/ArmSME/Transforms/Passes.h.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/ArmSME/Transforms/PassesEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/ArmSME/Transforms/PassesEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ArmSME/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=ArmSME", + ], + "include/mlir/Dialect/ArmSME/Transforms/PassesEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/ArmSME/Transforms/PassesEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSME/Transforms/Passes.td", deps = [ @@ -2099,38 +1863,20 @@ gentbl_cc_library( gentbl_cc_library( name = "ArmSMEIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/ArmSME/IR/ArmSME.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/ArmSME/IR/ArmSME.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/ArmSME/IR/ArmSMETypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/ArmSME/IR/ArmSMETypes.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=arm_sme", - ], - "include/mlir/Dialect/ArmSME/IR/ArmSMEDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=arm_sme", - ], - "include/mlir/Dialect/ArmSME/IR/ArmSMEDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ArmSME/IR/ArmSME.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/ArmSME/IR/ArmSME.cpp.inc": ["-gen-op-defs"], + 
"include/mlir/Dialect/ArmSME/IR/ArmSMETypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/ArmSME/IR/ArmSMETypes.cpp.inc": ["-gen-typedef-defs"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=arm_sme", + ], + "include/mlir/Dialect/ArmSME/IR/ArmSMEDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=arm_sme", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSME/IR/ArmSME.td", deps = [":ArmSMETdFiles"], @@ -2138,38 +1884,20 @@ gentbl_cc_library( gentbl_cc_library( name = "ArmSMEOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEEnums.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=arm_sme", - ], - "include/mlir/Dialect/ArmSME/IR/ArmSMEAttrDefs.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=arm_sme", - ], - "include/mlir/Dialect/ArmSME/IR/ArmSMEAttrDefs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEAttrDefs.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=arm_sme", + ], + "include/mlir/Dialect/ArmSME/IR/ArmSMEAttrDefs.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=arm_sme", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td", deps = [":ArmSMETdFiles"], @@ -2177,12 +1905,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ArmSMEConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEOpsConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/ArmSME/IR/ArmSMEOpsConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td", deps = [":ArmSMETdFiles"], @@ -2190,16 +1913,10 @@ gentbl_cc_library( gentbl_cc_library( name = "ArmSMEOpInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ArmSME/IR/ArmSMEOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td", deps = [":ArmSMETdFiles"], @@ -2222,20 +1939,11 @@ cc_library( gentbl_cc_library( name = "ArmSMEIntrinsicOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.cpp.inc", - ), - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicConversions.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicConversions.inc": 
["-gen-llvmir-conversions"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td", deps = [":ArmSMETdFiles"], @@ -2368,38 +2076,20 @@ td_library( gentbl_cc_library( name = "ArmSVEIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/ArmSVE/IR/ArmSVE.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/ArmSVE/IR/ArmSVE.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/ArmSVE/IR/ArmSVETypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/ArmSVE/IR/ArmSVETypes.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=arm_sve", - ], - "include/mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=arm_sve", - ], - "include/mlir/Dialect/ArmSVE/IR/ArmSVEDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ArmSVE/IR/ArmSVE.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/ArmSVE/IR/ArmSVE.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/ArmSVE/IR/ArmSVETypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/ArmSVE/IR/ArmSVETypes.cpp.inc": ["-gen-typedef-defs"], + "include/mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=arm_sve", + ], + "include/mlir/Dialect/ArmSVE/IR/ArmSVEDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=arm_sve", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSVE/IR/ArmSVE.td", deps = [":ArmSVETdFiles"], @@ -2423,15 +2113,10 @@ cc_library( gentbl_cc_library( name = "ArmSVEPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=ArmSVE", - ], - "include/mlir/Dialect/ArmSVE/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/ArmSVE/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=ArmSVE", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSVE/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -2459,12 +2144,7 @@ cc_library( gentbl_cc_library( name = "ArmSVEConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/ArmSVE/IR/ArmSVEConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/ArmSVE/IR/ArmSVEConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ArmSVE/IR/ArmSVE.td", deps = [":ArmSVETdFiles"], @@ -2487,48 +2167,27 @@ td_library( gentbl_cc_library( name = "AMXIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=amx", - ], - "include/mlir/Dialect/AMX/AMXDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=amx", - ], - "include/mlir/Dialect/AMX/AMXDialect.cpp.inc", - ), - ( - [ - "-gen-typedef-decls", - "-typedefs-dialect=amx", - ], - "include/mlir/Dialect/AMX/AMXTypes.h.inc", - ), - ( - [ - "-gen-typedef-defs", - "-typedefs-dialect=amx", - ], - "include/mlir/Dialect/AMX/AMXTypes.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/AMX/AMX.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/AMX/AMX.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/AMX/AMX.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/AMX/AMXDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=amx", + ], + "include/mlir/Dialect/AMX/AMXDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=amx", + ], + "include/mlir/Dialect/AMX/AMXTypes.h.inc": [ + "-gen-typedef-decls", + "-typedefs-dialect=amx", + ], + "include/mlir/Dialect/AMX/AMXTypes.cpp.inc": [ + "-gen-typedef-defs", + "-typedefs-dialect=amx", + ], + "include/mlir/Dialect/AMX/AMX.h.inc": ["-gen-op-decls"], + 
"include/mlir/Dialect/AMX/AMX.cpp.inc": ["-gen-op-defs"], + "g3doc/Dialects/AMX/AMX.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/AMX/AMX.td", deps = [":AMXTdFiles"], @@ -2565,12 +2224,7 @@ cc_library( gentbl_cc_library( name = "AMXConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/AMX/AMXConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/AMX/AMXConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/AMX/AMX.td", deps = [":AMXTdFiles"], @@ -2593,34 +2247,19 @@ td_library( gentbl_cc_library( name = "X86VectorIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=x86vector", - ], - "include/mlir/Dialect/X86Vector/X86VectorDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=x86vector", - ], - "include/mlir/Dialect/X86Vector/X86VectorDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/X86Vector/X86Vector.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/X86Vector/X86Vector.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/X86Vector/X86Vector.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/X86Vector/X86VectorDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=x86vector", + ], + "include/mlir/Dialect/X86Vector/X86VectorDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=x86vector", + ], + "include/mlir/Dialect/X86Vector/X86Vector.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/X86Vector/X86Vector.cpp.inc": ["-gen-op-defs"], + "g3doc/Dialects/X86Vector/X86Vector.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/X86Vector/X86Vector.td", deps = [":X86VectorTdFiles"], @@ -2660,12 +2299,7 @@ cc_library( gentbl_cc_library( name = "X86VectorConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/X86Vector/X86VectorConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/X86Vector/X86VectorConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/X86Vector/X86Vector.td", deps = [":X86VectorTdFiles"], @@ -2695,38 +2329,20 @@ td_library( gentbl_cc_library( name = "IRDLIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=irdl", - ], - "include/mlir/Dialect/IRDL/IR/IRDLDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=irdl", - ], - "include/mlir/Dialect/IRDL/IR/IRDLDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/IRDL/IR/IRDL.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/IRDL/IR/IRDL.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/IRDL/IR/IRDLTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/IRDL/IR/IRDLTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/IRDL/IR/IRDLDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=irdl", + ], + "include/mlir/Dialect/IRDL/IR/IRDLDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=irdl", + ], + "include/mlir/Dialect/IRDL/IR/IRDL.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/IRDL/IR/IRDL.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/IRDL/IR/IRDLTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/IRDL/IR/IRDLTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/IRDL/IR/IRDLOps.td", deps = [":IRDLTdFiles"], @@ -2734,16 +2350,10 @@ gentbl_cc_library( gentbl_cc_library( name = "IRDLInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], 
- "include/mlir/Dialect/IRDL/IR/IRDLInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/IRDL/IR/IRDLInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/IRDL/IR/IRDLInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/IRDL/IR/IRDLInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/IRDL/IR/IRDLInterfaces.td", deps = [":IRDLTdFiles"], @@ -2751,16 +2361,10 @@ gentbl_cc_library( gentbl_cc_library( name = "IRDLAttributesIncGen", - tbl_outs = [ - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/IRDL/IR/IRDLAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/IRDL/IR/IRDLAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/IRDL/IR/IRDLAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/IRDL/IR/IRDLAttributes.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/IRDL/IR/IRDLAttributes.td", deps = [":IRDLTdFiles"], @@ -2768,16 +2372,10 @@ gentbl_cc_library( gentbl_cc_library( name = "IRDLEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/IRDL/IR/IRDLEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/IRDL/IR/IRDLEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/IRDL/IR/IRDLEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/IRDL/IR/IRDLEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/IRDL/IR/IRDLAttributes.td", deps = [":IRDLTdFiles"], @@ -2785,16 +2383,10 @@ gentbl_cc_library( gentbl_cc_library( name = "IRDLOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/IRDL/IR/IRDLOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/IRDL/IR/IRDLOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/IRDL/IR/IRDLOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/IRDL/IR/IRDLOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/IRDL/IR/IRDLOps.td", deps = [":IRDLTdFiles"], @@ -2802,16 +2394,10 @@ gentbl_cc_library( gentbl_cc_library( name = "IRDLTypesIncGen", - tbl_outs = [ - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/IRDL/IR/IRDLTypesGen.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/IRDL/IR/IRDLTypesGen.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/IRDL/IR/IRDLTypesGen.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/IRDL/IR/IRDLTypesGen.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/IRDL/IR/IRDLTypes.td", deps = [":IRDLTdFiles"], @@ -2877,24 +2463,12 @@ td_library( gentbl_cc_library( name = "SCFIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/SCF/IR/SCFOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/SCF/IR/SCFOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/SCF/IR/SCFOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/SCF/IR/SCFOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SCF/IR/SCFOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/SCF/IR/SCFOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/SCF/IR/SCFOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/SCF/IR/SCFOpsDialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SCF/IR/SCFOps.td", deps = [":SCFTdFiles"], @@ -2902,24 +2476,12 @@ gentbl_cc_library( 
gentbl_cc_library( name = "SCFDeviceMappingInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-attr-interface-decls"], - "include/mlir/Dialect/SCF/IR/DeviceMappingAttrInterface.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "include/mlir/Dialect/SCF/IR/DeviceMappingAttrInterface.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/SCF/IR/DeviceMappingAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/SCF/IR/DeviceMappingAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SCF/IR/DeviceMappingAttrInterface.h.inc": ["-gen-attr-interface-decls"], + "include/mlir/Dialect/SCF/IR/DeviceMappingAttrInterface.cpp.inc": ["-gen-attr-interface-defs"], + "include/mlir/Dialect/SCF/IR/DeviceMappingAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/SCF/IR/DeviceMappingAttributes.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td", deps = [":SCFTdFiles"], @@ -2927,15 +2489,10 @@ gentbl_cc_library( gentbl_cc_library( name = "SCFPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=SCF", - ], - "include/mlir/Dialect/SCF/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/SCF/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=SCF", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SCF/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -2995,16 +2552,10 @@ td_library( gentbl_cc_library( name = "SCFTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td", deps = [ @@ -3059,24 +2610,12 @@ td_library( gentbl_cc_library( name = "SparseTensorAttrDefsIncGen", - tbl_outs = [ - ( - ["--gen-attrdef-decls"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.h.inc", - ), - ( - ["--gen-attrdef-defs"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.cpp.inc", - ), - ( - ["--gen-enum-decls"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.h.inc", - ), - ( - ["--gen-enum-defs"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.h.inc": ["--gen-attrdef-decls"], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.cpp.inc": ["--gen-attrdef-defs"], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.h.inc": ["--gen-enum-decls"], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.cpp.inc": ["--gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.td", deps = [":SparseTensorTdFiles"], @@ -3084,34 +2623,19 @@ gentbl_cc_library( gentbl_cc_library( name = "SparseTensorOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=sparse_tensor", - ], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=sparse_tensor", - ], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorOpsDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - 
"include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/SparseTensor/SparseTensor.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SparseTensor/IR/SparseTensorOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=sparse_tensor", + ], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=sparse_tensor", + ], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.cpp.inc": ["-gen-op-defs"], + "g3doc/Dialects/SparseTensor/SparseTensor.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td", deps = [":SparseTensorTdFiles"], @@ -3119,16 +2643,10 @@ gentbl_cc_library( gentbl_cc_library( name = "SparseTensorTypesIncGen", - tbl_outs = [ - ( - ["--gen-typedef-decls"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.h.inc", - ), - ( - ["--gen-typedef-defs"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.h.inc": ["--gen-typedef-decls"], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.cpp.inc": ["--gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td", deps = [":SparseTensorTdFiles"], @@ -3136,29 +2654,20 @@ gentbl_cc_library( gentbl_cc_library( name = "SparseTensorPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=SparseTensor", - ], - "include/mlir/Dialect/SparseTensor/Transforms/Passes.h.inc", - ), - ( - [ - "-gen-pass-capi-header", - "--prefix=SparseTensor", - ], - "include/mlir/Dialect/SparseTensor/Transforms/Passes.capi.h.inc", - ), - ( - [ - "-gen-pass-capi-impl", - "--prefix=SparseTensor", - ], - "include/mlir/Dialect/SparseTensor/Transforms/Passes.capi.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SparseTensor/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=SparseTensor", + ], + "include/mlir/Dialect/SparseTensor/Transforms/Passes.capi.h.inc": [ + "-gen-pass-capi-header", + "--prefix=SparseTensor", + ], + "include/mlir/Dialect/SparseTensor/Transforms/Passes.capi.cpp.inc": [ + "-gen-pass-capi-impl", + "--prefix=SparseTensor", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SparseTensor/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -3166,16 +2675,10 @@ gentbl_cc_library( gentbl_cc_library( name = "SparseTensorInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SparseTensor/IR/SparseTensorInterfaces.td", deps = [":SparseTensorTdFiles"], @@ -3194,16 +2697,10 @@ td_library( gentbl_cc_library( name = "SparseTensorTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - 
"include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.td", deps = [ @@ -3395,78 +2892,48 @@ td_library( gentbl_cc_library( name = "MeshIncGen", - tbl_outs = [ - ( - [ - "-gen-op-decls", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshOps.h.inc", - ), - ( - [ - "-gen-op-defs", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshDialect.cpp.inc", - ), - ( - [ - "-gen-enum-decls", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshEnums.h.inc", - ), - ( - [ - "-gen-enum-defs", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshEnums.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshAttributes.cpp.inc", - ), - ( - [ - "-gen-typedef-decls", - "-typedefs-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshTypes.h.inc", - ), - ( - [ - "-gen-typedef-defs", - "-typedefs-dialect=mesh", - ], - "include/mlir/Dialect/Mesh/IR/MeshTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Mesh/IR/MeshOps.h.inc": [ + "-gen-op-decls", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshOps.cpp.inc": [ + "-gen-op-defs", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshEnums.h.inc": [ + "-gen-enum-decls", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshEnums.cpp.inc": [ + "-gen-enum-defs", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshAttributes.h.inc": [ + "-gen-attrdef-decls", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshTypes.h.inc": [ + "-gen-typedef-decls", + "-typedefs-dialect=mesh", + ], + "include/mlir/Dialect/Mesh/IR/MeshTypes.cpp.inc": [ + "-gen-typedef-defs", + "-typedefs-dialect=mesh", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Mesh/IR/MeshOps.td", deps = [ @@ -3477,16 +2944,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MeshShardingInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Mesh/Interfaces/ShardingInterface.td", deps = [":OpBaseTdFiles"], @@ -3537,15 +2998,10 @@ cc_library( gentbl_cc_library( name = "MeshTransformsPassIncGen", - tbl_outs 
= [ - ( - [ - "-gen-pass-decls", - "-name=Mesh", - ], - "include/mlir/Dialect/Mesh/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Mesh/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Mesh", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Mesh/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -3647,58 +3103,25 @@ td_library( gentbl_cc_library( name = "NVGPUIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=nvgpu", - ], - "include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=nvgpu", - ], - "include/mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/NVGPU/IR/NVGPUOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/NVGPU/NVGPU.md", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=nvgpu", + ], + "include/mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=nvgpu", + ], + "include/mlir/Dialect/NVGPU/IR/NVGPUOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/NVGPU/IR/NVGPUOps.cpp.inc": ["-gen-op-defs"], + "g3doc/Dialects/NVGPU/NVGPU.md": ["-gen-op-doc"], + "include/mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/NVGPU/IR/NVGPUTypeDefs.cpp.inc": ["-gen-typedef-defs"], + "include/mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/NVGPU/IR/NVGPUOps.td", deps = [ @@ -3710,15 +3133,10 @@ gentbl_cc_library( gentbl_cc_library( name = "NVGPUPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=NVGPU", - ], - "include/mlir/Dialect/NVGPU/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/NVGPU/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=NVGPU", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/NVGPU/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -3789,16 +3207,10 @@ td_library( gentbl_cc_library( name = "NVGPUTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td", deps = [ @@ -3863,62 +3275,35 @@ td_library( gentbl_cc_library( name = "XeGPUIncGen", - 
tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=xegpu", - ], - "include/mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=xegpu", - ], - "include/mlir/Dialect/XeGPU/IR/XeGPUDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/XeGPU/IR/XeGPU.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/XeGPU/XeGPU.md", - ), - ( - [ - "-gen-typedef-decls", - "-typedefs-dialect=xegpu", - ], - "include/mlir/Dialect/XeGPU/IR/XeGPUTypes.h.inc", - ), - ( - [ - "-gen-typedef-defs", - "-typedefs-dialect=xegpu", - ], - "include/mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=xegpu", - ], - "include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=xegpu", - ], - "include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=xegpu", + ], + "include/mlir/Dialect/XeGPU/IR/XeGPUDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=xegpu", + ], + "include/mlir/Dialect/XeGPU/IR/XeGPU.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/XeGPU/IR/XeGPU.cpp.inc": ["-gen-op-defs"], + "g3doc/Dialects/XeGPU/XeGPU.md": ["-gen-op-doc"], + "include/mlir/Dialect/XeGPU/IR/XeGPUTypes.h.inc": [ + "-gen-typedef-decls", + "-typedefs-dialect=xegpu", + ], + "include/mlir/Dialect/XeGPU/IR/XeGPUTypes.cpp.inc": [ + "-gen-typedef-defs", + "-typedefs-dialect=xegpu", + ], + "include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=xegpu", + ], + "include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=xegpu", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/XeGPU/IR/XeGPU.td", deps = [ @@ -3946,16 +3331,10 @@ td_library( # duplicate declarations with the Arith enums. 
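Every hunk in this file makes the same mechanical change: the `tbl_outs` attribute of `gentbl_cc_library` moves from a list of (tblgen-flags, output-file) tuples to a dict keyed by output file, with the flags as the value. Below is a minimal before/after sketch of the shape of that change, using a hypothetical "FooOpsIncGen" target and Foo dialect paths rather than any target actually touched by this patch:

# Before: tbl_outs as a list of (flags, output-file) tuples.
gentbl_cc_library(
    name = "FooOpsIncGen",
    tbl_outs = [
        (
            ["-gen-op-decls"],
            "include/mlir/Dialect/Foo/IR/FooOps.h.inc",
        ),
        (
            ["-gen-op-defs"],
            "include/mlir/Dialect/Foo/IR/FooOps.cpp.inc",
        ),
    ],
    tblgen = ":mlir-tblgen",
    td_file = "include/mlir/Dialect/Foo/IR/FooOps.td",
    deps = [":OpBaseTdFiles"],
)

# After: tbl_outs as a dict mapping each output file to the tblgen flags
# that produce it; each entry still yields one generated .inc file.
gentbl_cc_library(
    name = "FooOpsIncGen",
    tbl_outs = {
        "include/mlir/Dialect/Foo/IR/FooOps.h.inc": ["-gen-op-decls"],
        "include/mlir/Dialect/Foo/IR/FooOps.cpp.inc": ["-gen-op-defs"],
    },
    tblgen = ":mlir-tblgen",
    td_file = "include/mlir/Dialect/Foo/IR/FooOps.td",
    deps = [":OpBaseTdFiles"],
)

The remaining hunks apply this same rewrite target by target. Multi-element flag lists (for example "-gen-dialect-decls" paired with "-dialect=xegpu") carry over unchanged as multi-element dict values, as in the XeGPU hunks above, and single-output targets such as the pass-declaration generators collapse to a one-entry dict.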
gentbl_cc_library( name = "XeGPUEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/XeGPU/IR/XeGPUEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td", deps = [":XeGPUAttrTdFiles"], @@ -3989,15 +3368,10 @@ cc_library( gentbl_cc_library( name = "XeGPUPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=XeGPU", - ], - "include/mlir/Dialect/XeGPU/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/XeGPU/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=XeGPU", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/XeGPU/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -4051,32 +3425,14 @@ td_library( gentbl_cc_library( name = "FuncIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Func/IR/FuncOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Func/IR/FuncOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/Func/IR/FuncOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/Func/IR/FuncOpsDialect.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Func/IR/FuncOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Func/IR/FuncOpsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Func/IR/FuncOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Func/IR/FuncOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Func/IR/FuncOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/Func/IR/FuncOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "include/mlir/Dialect/Func/IR/FuncOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Func/IR/FuncOpsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Func/IR/FuncOps.td", deps = [":FuncTdFiles"], @@ -4109,16 +3465,10 @@ td_library( gentbl_cc_library( name = "DialectUtilsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Utils/DialectUtilsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Utils/DialectUtilsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Utils/DialectUtilsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Utils/DialectUtilsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Utils/StructuredOpsUtils.td", deps = [":DialectUtilsTdFiles"], @@ -4313,15 +3663,10 @@ cc_library( gentbl_cc_library( name = "AffinePassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Affine", - ], - "include/mlir/Dialect/Affine/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Affine/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Affine", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Affine/Passes.td", deps = [":PassBaseTdFiles"], @@ -4366,29 +3711,20 @@ cc_library( gentbl_cc_library( name = "ConversionPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Conversion", - ], - "include/mlir/Conversion/Passes.h.inc", - ), - ( - [ - "-gen-pass-capi-header", - "--prefix=Conversion", - ], - "include/mlir/Conversion/Passes.capi.h.inc", - ), - ( - [ - "-gen-pass-capi-impl", - "--prefix=Conversion", - ], - "include/mlir/Conversion/Passes.capi.cpp.inc", - 
), - ], + tbl_outs = { + "include/mlir/Conversion/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Conversion", + ], + "include/mlir/Conversion/Passes.capi.h.inc": [ + "-gen-pass-capi-header", + "--prefix=Conversion", + ], + "include/mlir/Conversion/Passes.capi.cpp.inc": [ + "-gen-pass-capi-impl", + "--prefix=Conversion", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Conversion/Passes.td", deps = [ @@ -4739,32 +4075,14 @@ td_library( gentbl_cc_library( name = "ShapeOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Shape/IR/ShapeOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Shape/IR/ShapeOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/Shape/IR/ShapeOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/Shape/IR/ShapeOpsDialect.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/Shape/IR/ShapeOpsTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/Shape/IR/ShapeOpsTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Shape/IR/ShapeOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Shape/IR/ShapeOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Shape/IR/ShapeOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/Shape/IR/ShapeOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "include/mlir/Dialect/Shape/IR/ShapeOpsTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/Shape/IR/ShapeOpsTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Shape/IR/ShapeOps.td", deps = [":ShapeOpsTdFiles"], @@ -4773,12 +4091,7 @@ gentbl_cc_library( gentbl_cc_library( name = "MLIRShapeCanonicalizationIncGen", strip_include_prefix = "include/mlir/Dialect/Shape/IR", - tbl_outs = [ - ( - ["-gen-rewriters"], - "include/mlir/Dialect/Shape/IR/ShapeCanonicalization.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Shape/IR/ShapeCanonicalization.inc": ["-gen-rewriters"]}, tblgen = ":mlir-tblgen", td_file = "lib/Dialect/Shape/IR/ShapeCanonicalization.td", deps = [ @@ -4818,12 +4131,7 @@ cc_library( gentbl_cc_library( name = "ShapeToStandardGen", strip_include_prefix = "lib/Conversion/ShapeToStandard", - tbl_outs = [ - ( - ["-gen-rewriters"], - "lib/Conversion/ShapeToStandard/ShapeToStandard.cpp.inc", - ), - ], + tbl_outs = {"lib/Conversion/ShapeToStandard/ShapeToStandard.cpp.inc": ["-gen-rewriters"]}, tblgen = ":mlir-tblgen", td_file = "lib/Conversion/ShapeToStandard/ShapeToStandard.td", deps = [":ShapeOpsTdFiles"], @@ -4854,13 +4162,10 @@ cc_library( gentbl_cc_library( name = "ShapeTransformsPassIncGen", - tbl_outs = [( - [ - "-gen-pass-decls", - "-name=Shape", - ], - "include/mlir/Dialect/Shape/Transforms/Passes.h.inc", - )], + tbl_outs = {"include/mlir/Dialect/Shape/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Shape", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Shape/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -4913,32 +4218,14 @@ td_library( gentbl_cc_library( name = "ControlFlowOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsDialect.cpp.inc", - ), - ( - ["-gen-enum-decls"], - 
"include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/ControlFlow/IR/ControlFlowOpsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/ControlFlow/IR/ControlFlowOps.td", deps = [ @@ -5048,16 +4335,10 @@ td_library( gentbl_cc_library( name = "FuncTransformOpsIncGen", strip_include_prefix = "include", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Func/TransformOps/FuncTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Func/TransformOps/FuncTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Func/TransformOps/FuncTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Func/TransformOps/FuncTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td", deps = [ @@ -5140,13 +4421,10 @@ cc_library( gentbl_cc_library( name = "FuncTransformsPassIncGen", - tbl_outs = [( - [ - "-gen-pass-decls", - "-name=Func", - ], - "include/mlir/Dialect/Func/Transforms/Passes.h.inc", - )], + tbl_outs = {"include/mlir/Dialect/Func/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Func", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Func/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -5250,16 +4528,10 @@ td_library( gentbl_cc_library( name = "VectorEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Vector/Transforms/VectorTransformsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Vector/Transforms/VectorTransformsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Vector/Transforms/VectorTransformsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Vector/Transforms/VectorTransformsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/Transforms/VectorTransformsBase.td", deps = [ @@ -5269,15 +4541,10 @@ gentbl_cc_library( gentbl_cc_library( name = "VectorPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Vector", - ], - "include/mlir/Dialect/Vector/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Vector/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Vector", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -5556,32 +4823,14 @@ cc_library( gentbl_cc_library( name = "LLVMDialectInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/LLVMIR/LLVMInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/LLVMIR/LLVMInterfaces.cpp.inc", - ), - ( - ["-gen-attr-interface-decls"], - "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.cpp.inc", - ), - ( - ["-gen-type-interface-decls"], - 
"include/mlir/Dialect/LLVMIR/LLVMTypeInterfaces.h.inc", - ), - ( - ["-gen-type-interface-defs"], - "include/mlir/Dialect/LLVMIR/LLVMTypeInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/LLVMInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/LLVMIR/LLVMInterfaces.cpp.inc": ["-gen-op-interface-defs"], + "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.h.inc": ["-gen-attr-interface-decls"], + "include/mlir/Dialect/LLVMIR/LLVMAttrInterfaces.cpp.inc": ["-gen-attr-interface-defs"], + "include/mlir/Dialect/LLVMIR/LLVMTypeInterfaces.h.inc": ["-gen-type-interface-decls"], + "include/mlir/Dialect/LLVMIR/LLVMTypeInterfaces.cpp.inc": ["-gen-type-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/LLVMInterfaces.td", deps = [":LLVMOpsTdFiles"], @@ -5652,15 +4901,10 @@ cc_library( gentbl_cc_library( name = "LLVMPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=LLVM", - ], - "include/mlir/Dialect/LLVMIR/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/LLVMIR/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=LLVM", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -5769,16 +5013,10 @@ td_library( gentbl_cc_library( name = "GPUDeviceMapperEnumsGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/GPU/TransformOps/GPUDeviceMapperEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/GPU/TransformOps/GPUDeviceMapperEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/GPU/TransformOps/GPUDeviceMapperEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/GPU/TransformOps/GPUDeviceMapperEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td", deps = [ @@ -5789,16 +5027,10 @@ gentbl_cc_library( gentbl_cc_library( name = "GPUCompilationAttrInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-attr-interface-decls"], - "include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.h.inc": ["-gen-attr-interface-decls"], + "include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.cpp.inc": ["-gen-attr-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td", deps = [ @@ -5809,16 +5041,10 @@ gentbl_cc_library( gentbl_cc_library( name = "GPUBaseIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/GPU/IR/GPUOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/GPU/IR/GPUOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/IR/GPUBase.td", deps = [":OpBaseTdFiles"], @@ -5826,46 +5052,22 @@ gentbl_cc_library( gentbl_cc_library( name = "GPUOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=gpu", - ], - "include/mlir/Dialect/GPU/IR/GPUOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=gpu", - ], - "include/mlir/Dialect/GPU/IR/GPUOpsDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/GPU/IR/GPUOps.h.inc", - ), - ( - 
["-gen-op-defs"], - "include/mlir/Dialect/GPU/IR/GPUOps.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/GPU/IR/GPUOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/GPU/IR/GPUOpsAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/GPU/IR/GPUOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=gpu", + ], + "include/mlir/Dialect/GPU/IR/GPUOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=gpu", + ], + "include/mlir/Dialect/GPU/IR/GPUOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/GPU/IR/GPUOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/GPU/IR/GPUOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/GPU/IR/GPUOpsAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/IR/GPUOps.td", deps = [ @@ -5909,29 +5111,20 @@ cc_library( gentbl_cc_library( name = "GPUPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=GPU", - ], - "include/mlir/Dialect/GPU/Transforms/Passes.h.inc", - ), - ( - [ - "-gen-pass-capi-header", - "--prefix=GPU", - ], - "include/mlir/Dialect/GPU/Transforms/Passes.capi.h.inc", - ), - ( - [ - "-gen-pass-capi-impl", - "--prefix=GPU", - ], - "include/mlir/Dialect/GPU/Transforms/Passes.capi.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/GPU/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=GPU", + ], + "include/mlir/Dialect/GPU/Transforms/Passes.capi.h.inc": [ + "-gen-pass-capi-header", + "--prefix=GPU", + ], + "include/mlir/Dialect/GPU/Transforms/Passes.capi.cpp.inc": [ + "-gen-pass-capi-impl", + "--prefix=GPU", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -6059,16 +5252,10 @@ td_library( gentbl_cc_library( name = "GPUTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td", deps = [ @@ -6156,12 +5343,7 @@ cc_library( gentbl_cc_library( name = "GPUToNVVMGen", strip_include_prefix = "lib/Conversion/GPUToNVVM", - tbl_outs = [ - ( - ["-gen-rewriters"], - "lib/Conversion/GPUToNVVM/GPUToNVVM.cpp.inc", - ), - ], + tbl_outs = {"lib/Conversion/GPUToNVVM/GPUToNVVM.cpp.inc": ["-gen-rewriters"]}, tblgen = ":mlir-tblgen", td_file = "lib/Conversion/GPUToNVVM/GPUToNVVM.td", deps = [ @@ -6308,12 +5490,7 @@ cc_library( gentbl_cc_library( name = "GPUToROCDLTGen", strip_include_prefix = "lib/Conversion/GPUToROCDL", - tbl_outs = [ - ( - ["-gen-rewriters"], - "lib/Conversion/GPUToROCDL/GPUToROCDL.cpp.inc", - ), - ], + tbl_outs = {"lib/Conversion/GPUToROCDL/GPUToROCDL.cpp.inc": ["-gen-rewriters"]}, tblgen = ":mlir-tblgen", td_file = "lib/Conversion/GPUToROCDL/GPUToROCDL.td", deps = [ @@ -6490,46 +5667,22 @@ cc_library( gentbl_cc_library( name = "LLVMOpsIncGen", - tbl_outs = 
[ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/LLVMIR/LLVMOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/LLVMIR/LLVMOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/LLVMIR/LLVMOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/LLVMIR/LLVMOpsDialect.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/LLVMIR/LLVMOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/LLVMIR/LLVMOpsEnums.cpp.inc", - ), - ( - [ - "--gen-attrdef-decls", - "-attrdefs-dialect=llvm", - ], - "include/mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.h.inc", - ), - ( - [ - "--gen-attrdef-defs", - "-attrdefs-dialect=llvm", - ], - "include/mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/LLVMOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/LLVMIR/LLVMOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/LLVMIR/LLVMOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/LLVMIR/LLVMOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "include/mlir/Dialect/LLVMIR/LLVMOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/LLVMIR/LLVMOpsEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.h.inc": [ + "--gen-attrdef-decls", + "-attrdefs-dialect=llvm", + ], + "include/mlir/Dialect/LLVMIR/LLVMOpsAttrDefs.cpp.inc": [ + "--gen-attrdef-defs", + "-attrdefs-dialect=llvm", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/LLVMOps.td", deps = [":LLVMOpsTdFiles"], @@ -6537,22 +5690,16 @@ gentbl_cc_library( gentbl_cc_library( name = "LLVMTypesIncGen", - tbl_outs = [ - ( - [ - "-gen-typedef-decls", - "-typedefs-dialect=llvm", - ], - "include/mlir/Dialect/LLVMIR/LLVMTypes.h.inc", - ), - ( - [ - "-gen-typedef-defs", - "-typedefs-dialect=llvm", - ], - "include/mlir/Dialect/LLVMIR/LLVMTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/LLVMTypes.h.inc": [ + "-gen-typedef-decls", + "-typedefs-dialect=llvm", + ], + "include/mlir/Dialect/LLVMIR/LLVMTypes.cpp.inc": [ + "-gen-typedef-defs", + "-typedefs-dialect=llvm", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/LLVMTypes.td", deps = [":LLVMOpsTdFiles"], @@ -6560,16 +5707,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LLVMIntrinsicOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td", deps = [":LLVMOpsTdFiles"], @@ -6577,24 +5718,12 @@ gentbl_cc_library( gentbl_cc_library( name = "LLVMConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/LLVMConversions.inc", - ), - ( - ["-gen-enum-to-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc", - ), - ( - ["-gen-enum-from-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/LLVMConversionEnumsFromLLVM.inc", - ), - ( - ["-gen-op-from-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/LLVMOpFromLLVMIRConversions.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/LLVMConversions.inc": ["-gen-llvmir-conversions"], + "include/mlir/Dialect/LLVMIR/LLVMConversionEnumsToLLVM.inc": 
["-gen-enum-to-llvmir-conversions"], + "include/mlir/Dialect/LLVMIR/LLVMConversionEnumsFromLLVM.inc": ["-gen-enum-from-llvmir-conversions"], + "include/mlir/Dialect/LLVMIR/LLVMOpFromLLVMIRConversions.inc": ["-gen-op-from-llvmir-conversions"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/LLVMOps.td", deps = [":LLVMOpsTdFiles"], @@ -6602,20 +5731,11 @@ gentbl_cc_library( gentbl_cc_library( name = "LLVMIntrinsicConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/LLVMIntrinsicConversions.inc", - ), - ( - ["-gen-intr-from-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc", - ), - ( - ["-gen-convertible-llvmir-intrinsics"], - "include/mlir/Dialect/LLVMIR/LLVMConvertibleLLVMIRIntrinsics.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/LLVMIntrinsicConversions.inc": ["-gen-llvmir-conversions"], + "include/mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc": ["-gen-intr-from-llvmir-conversions"], + "include/mlir/Dialect/LLVMIR/LLVMConvertibleLLVMIRIntrinsics.inc": ["-gen-convertible-llvmir-intrinsics"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td", deps = [":LLVMOpsTdFiles"], @@ -6721,20 +5841,14 @@ td_library( gentbl_cc_library( name = "BasicPtxBuilderIntGen", - tbl_outs = [ - ( - [ - "-gen-op-interface-decls", - ], - "include/mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.h.inc", - ), - ( - [ - "-gen-op-interface-defs", - ], - "include/mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.h.inc": [ + "-gen-op-interface-decls", + ], + "include/mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.cpp.inc": [ + "-gen-op-interface-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td", deps = [ @@ -6746,52 +5860,28 @@ gentbl_cc_library( gentbl_cc_library( name = "NVVMOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/LLVMIR/NVVMOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/LLVMIR/NVVMOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=nvvm", - ], - "include/mlir/Dialect/LLVMIR/NVVMOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=nvvm", - ], - "include/mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/LLVMIR/NVVMOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=nvvm", - ], - "include/mlir/Dialect/LLVMIR/NVVMOpsAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=nvvm", - ], - "include/mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/NVVMOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/LLVMIR/NVVMOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/LLVMIR/NVVMOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=nvvm", + ], + "include/mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=nvvm", + ], + "include/mlir/Dialect/LLVMIR/NVVMOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/LLVMIR/NVVMOpsAttributes.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=nvvm", + ], + "include/mlir/Dialect/LLVMIR/NVVMOpsAttributes.cpp.inc": [ + "-gen-attrdef-defs", + 
"-attrdefs-dialect=nvvm", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/NVVMOps.td", deps = [":NVVMOpsTdFiles"], @@ -6799,12 +5889,7 @@ gentbl_cc_library( gentbl_cc_library( name = "NVVMConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/NVVMConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/LLVMIR/NVVMConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/NVVMOps.td", deps = [":NVVMOpsTdFiles"], @@ -6812,12 +5897,7 @@ gentbl_cc_library( gentbl_cc_library( name = "NVVMFromLLVMIRConversionsIncGen", - tbl_outs = [ - ( - ["-gen-intr-from-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc": ["-gen-intr-from-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/NVVMOps.td", deps = [":NVVMOpsTdFiles"], @@ -6825,12 +5905,7 @@ gentbl_cc_library( gentbl_cc_library( name = "NVVMConvertibleLLVMIRIntrinsicsIncGen", - tbl_outs = [ - ( - ["-gen-convertible-llvmir-intrinsics"], - "include/mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/LLVMIR/NVVMConvertibleLLVMIRIntrinsics.inc": ["-gen-convertible-llvmir-intrinsics"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/NVVMOps.td", deps = [":NVVMOpsTdFiles"], @@ -6914,44 +5989,26 @@ td_library( gentbl_cc_library( name = "ROCDLOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/LLVMIR/ROCDLOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/LLVMIR/ROCDLOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=rocdl", - ], - "include/mlir/Dialect/LLVMIR/ROCDLOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=rocdl", - ], - "include/mlir/Dialect/LLVMIR/ROCDLOpsDialect.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=rocdl", - ], - "include/mlir/Dialect/LLVMIR/ROCDLOpsAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=rocdl", - ], - "include/mlir/Dialect/LLVMIR/ROCDLOpsAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/ROCDLOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/LLVMIR/ROCDLOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/LLVMIR/ROCDLOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=rocdl", + ], + "include/mlir/Dialect/LLVMIR/ROCDLOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=rocdl", + ], + "include/mlir/Dialect/LLVMIR/ROCDLOpsAttributes.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=rocdl", + ], + "include/mlir/Dialect/LLVMIR/ROCDLOpsAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=rocdl", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/ROCDLOps.td", deps = [":ROCDLOpsTdFiles"], @@ -6959,12 +6016,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ROCDLConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/ROCDLConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/LLVMIR/ROCDLConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/ROCDLOps.td", deps = [":ROCDLOpsTdFiles"], @@ -7005,24 +6057,12 @@ td_library( gentbl_cc_library( name = "PDLOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/PDL/IR/PDLOps.h.inc", - ), - ( - ["-gen-op-defs"], - 
"include/mlir/Dialect/PDL/IR/PDLOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/PDL/IR/PDLOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/PDL/IR/PDLOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/PDL/IR/PDLOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/PDL/IR/PDLOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/PDL/IR/PDLOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/PDL/IR/PDLOpsDialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/PDL/IR/PDLOps.td", deps = [":PDLDialectTdFiles"], @@ -7030,16 +6070,10 @@ gentbl_cc_library( gentbl_cc_library( name = "PDLTypesIncGen", - tbl_outs = [ - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/PDL/IR/PDLOpsTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/PDL/IR/PDLOpsTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/PDL/IR/PDLOpsTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/PDL/IR/PDLOpsTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/PDL/IR/PDLTypes.td", deps = [":PDLDialectTdFiles"], @@ -7079,30 +6113,18 @@ td_library( gentbl_cc_library( name = "PDLInterpOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=pdl_interp", - ], - "include/mlir/Dialect/PDLInterp/IR/PDLInterpOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=pdl_interp", - ], - "include/mlir/Dialect/PDLInterp/IR/PDLInterpOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/PDLInterp/IR/PDLInterpOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=pdl_interp", + ], + "include/mlir/Dialect/PDLInterp/IR/PDLInterpOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=pdl_interp", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/PDLInterp/IR/PDLInterpOps.td", deps = [":PDLInterpOpsTdFiles"], @@ -7143,48 +6165,27 @@ td_library( gentbl_cc_library( name = "PolynomialIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Polynomial/IR/Polynomial.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Polynomial/IR/Polynomial.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialDialect.cpp.inc", - ), - ( - [ - "--gen-typedef-decls", - "-typedefs-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialTypes.h.inc", - ), - ( - [ - "--gen-typedef-defs", - "-typedefs-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialTypes.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/Polynomial/Polynomial.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Polynomial/IR/Polynomial.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Polynomial/IR/Polynomial.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Polynomial/IR/PolynomialDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=polynomial", + ], + "include/mlir/Dialect/Polynomial/IR/PolynomialDialect.cpp.inc": [ + 
"-gen-dialect-defs", + "-dialect=polynomial", + ], + "include/mlir/Dialect/Polynomial/IR/PolynomialTypes.h.inc": [ + "--gen-typedef-decls", + "-typedefs-dialect=polynomial", + ], + "include/mlir/Dialect/Polynomial/IR/PolynomialTypes.cpp.inc": [ + "--gen-typedef-defs", + "-typedefs-dialect=polynomial", + ], + "g3doc/Dialects/Polynomial/Polynomial.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Polynomial/IR/Polynomial.td", deps = [":PolynomialTdFiles"], @@ -7192,22 +6193,16 @@ gentbl_cc_library( gentbl_cc_library( name = "PolynomialAttributesIncGen", - tbl_outs = [ - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=polynomial", - ], - "include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=polynomial", + ], + "include/mlir/Dialect/Polynomial/IR/PolynomialAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=polynomial", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Polynomial/IR/Polynomial.td", deps = [":PolynomialTdFiles"], @@ -7216,12 +6211,7 @@ gentbl_cc_library( gentbl_cc_library( name = "PolynomialCanonicalizationIncGen", strip_include_prefix = "include/mlir/Dialect/Polynomial/IR", - tbl_outs = [ - ( - ["-gen-rewriters"], - "include/mlir/Dialect/Polynomial/IR/PolynomialCanonicalization.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Polynomial/IR/PolynomialCanonicalization.inc": ["-gen-rewriters"]}, tblgen = ":mlir-tblgen", td_file = "lib/Dialect/Polynomial/IR/PolynomialCanonicalization.td", deps = [ @@ -7250,24 +6240,12 @@ td_library( gentbl_cc_library( name = "PtrDialectIncGen", - tbl_outs = [ - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/Ptr/IR/PtrOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/Ptr/IR/PtrOpsDialect.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/Ptr/IR/PtrOpsTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/Ptr/IR/PtrOpsTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Ptr/IR/PtrOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/Ptr/IR/PtrOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "include/mlir/Dialect/Ptr/IR/PtrOpsTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/Ptr/IR/PtrOpsTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Ptr/IR/PtrDialect.td", deps = [ @@ -7277,16 +6255,10 @@ gentbl_cc_library( gentbl_cc_library( name = "PtrOpsEnumsGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Ptr/IR/PtrOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Ptr/IR/PtrOpsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Ptr/IR/PtrOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Ptr/IR/PtrOpsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Ptr/IR/PtrEnums.td", deps = [ @@ -7296,24 +6268,12 @@ gentbl_cc_library( gentbl_cc_library( name = "PtrMemorySpaceInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.cpp.inc", - ), - ( - ["-gen-attr-interface-decls"], - 
"include/mlir/Dialect/Ptr/IR/MemorySpaceAttrInterfaces.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "include/mlir/Dialect/Ptr/IR/MemorySpaceAttrInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.cpp.inc": ["-gen-op-interface-defs"], + "include/mlir/Dialect/Ptr/IR/MemorySpaceAttrInterfaces.h.inc": ["-gen-attr-interface-decls"], + "include/mlir/Dialect/Ptr/IR/MemorySpaceAttrInterfaces.cpp.inc": ["-gen-attr-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Ptr/IR/MemorySpaceInterfaces.td", deps = [ @@ -7323,16 +6283,10 @@ gentbl_cc_library( gentbl_cc_library( name = "PtrOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Ptr/IR/PtrOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Ptr/IR/PtrOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Ptr/IR/PtrOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Ptr/IR/PtrOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Ptr/IR/PtrOps.td", deps = [ @@ -7342,16 +6296,10 @@ gentbl_cc_library( gentbl_cc_library( name = "PtrAttrsIncGen", - tbl_outs = [ - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/Ptr/IR/PtrOpsAttrs.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/Ptr/IR/PtrOpsAttrs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Ptr/IR/PtrOpsAttrs.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/Ptr/IR/PtrOpsAttrs.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Ptr/IR/PtrAttrDefs.td", deps = [ @@ -7409,17 +6357,11 @@ td_library( gentbl_cc_library( name = "SPIRVImageInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/SPIRV/Interfaces/SPIRVImageInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/SPIRV/Interfaces/SPIRVImageInterfaces.cpp.inc", - ), - ], - tblgen = ":mlir-tblgen", + tbl_outs = { + "include/mlir/Dialect/SPIRV/Interfaces/SPIRVImageInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/SPIRV/Interfaces/SPIRVImageInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, + tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SPIRV/Interfaces/SPIRVImageInterfaces.td", deps = [ ":SPIRVImageInterfacesTdFiles", @@ -7441,48 +6383,18 @@ cc_library( gentbl_cc_library( name = "SPIRVOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/SPIRV/IR/SPIRVOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/SPIRV/IR/SPIRVOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/SPIRV/IR/SPIRVOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/SPIRV/IR/SPIRVOpsDialect.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/SPIRV/SPIRVOps.md", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/SPIRV/IR/SPIRVEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/SPIRV/IR/SPIRVEnums.cpp.inc", - ), - ( - ["-gen-spirv-enum-avail-decls"], - "include/mlir/Dialect/SPIRV/IR/SPIRVEnumAvailability.h.inc", - ), - ( - ["-gen-spirv-enum-avail-defs"], - "include/mlir/Dialect/SPIRV/IR/SPIRVEnumAvailability.cpp.inc", - ), - ( - ["-gen-spirv-capability-implication"], - "include/mlir/Dialect/SPIRV/IR/SPIRVCapabilityImplication.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SPIRV/IR/SPIRVOps.h.inc": ["-gen-op-decls"], + 
"include/mlir/Dialect/SPIRV/IR/SPIRVOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/SPIRV/IR/SPIRVOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/SPIRV/IR/SPIRVOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "g3doc/Dialects/SPIRV/SPIRVOps.md": ["-gen-op-doc"], + "include/mlir/Dialect/SPIRV/IR/SPIRVEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/SPIRV/IR/SPIRVEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/SPIRV/IR/SPIRVEnumAvailability.h.inc": ["-gen-spirv-enum-avail-decls"], + "include/mlir/Dialect/SPIRV/IR/SPIRVEnumAvailability.cpp.inc": ["-gen-spirv-enum-avail-defs"], + "include/mlir/Dialect/SPIRV/IR/SPIRVCapabilityImplication.inc": ["-gen-spirv-capability-implication"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SPIRV/IR/SPIRVOps.td", deps = [":SPIRVOpsTdFiles"], @@ -7490,16 +6402,10 @@ gentbl_cc_library( gentbl_cc_library( name = "SPIRVAttributesIncGen", - tbl_outs = [ - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SPIRV/IR/SPIRVAttributes.td", deps = [":SPIRVOpsTdFiles"], @@ -7508,12 +6414,7 @@ gentbl_cc_library( gentbl_cc_library( name = "SPIRVCanonicalizationIncGen", strip_include_prefix = "lib/Dialect/SPIRV/IR", - tbl_outs = [ - ( - ["-gen-rewriters"], - "lib/Dialect/SPIRV/IR/SPIRVCanonicalization.inc", - ), - ], + tbl_outs = {"lib/Dialect/SPIRV/IR/SPIRVCanonicalization.inc": ["-gen-rewriters"]}, tblgen = ":mlir-tblgen", td_file = "lib/Dialect/SPIRV/IR/SPIRVCanonicalization.td", deps = [":SPIRVOpsTdFiles"], @@ -7521,20 +6422,11 @@ gentbl_cc_library( gentbl_cc_library( name = "SPIRVAvailabilityIncGen", - tbl_outs = [ - ( - ["-gen-avail-interface-decls"], - "include/mlir/Dialect/SPIRV/IR/SPIRVAvailability.h.inc", - ), - ( - ["-gen-avail-interface-defs"], - "include/mlir/Dialect/SPIRV/IR/SPIRVAvailability.cpp.inc", - ), - ( - ["-gen-spirv-avail-impls"], - "include/mlir/Dialect/SPIRV/IR/SPIRVOpAvailabilityImpl.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/SPIRV/IR/SPIRVAvailability.h.inc": ["-gen-avail-interface-decls"], + "include/mlir/Dialect/SPIRV/IR/SPIRVAvailability.cpp.inc": ["-gen-avail-interface-defs"], + "include/mlir/Dialect/SPIRV/IR/SPIRVOpAvailabilityImpl.inc": ["-gen-spirv-avail-impls"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SPIRV/IR/SPIRVOps.td", deps = [":SPIRVOpsTdFiles"], @@ -7542,12 +6434,7 @@ gentbl_cc_library( gentbl_cc_library( name = "SPIRVAttrUtilsGen", - tbl_outs = [ - ( - ["-gen-spirv-attr-utils"], - "include/mlir/Dialect/SPIRV/IR/SPIRVAttrUtils.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/SPIRV/IR/SPIRVAttrUtils.inc": ["-gen-spirv-attr-utils"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SPIRV/IR/SPIRVBase.td", deps = [":SPIRVOpsTdFiles"], @@ -7555,12 +6442,7 @@ gentbl_cc_library( gentbl_cc_library( name = "SPIRVSerializationGen", - tbl_outs = [ - ( - ["-gen-spirv-serialization"], - "include/mlir/Dialect/SPIRV/IR/SPIRVSerialization.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/SPIRV/IR/SPIRVSerialization.inc": ["-gen-spirv-serialization"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SPIRV/IR/SPIRVOps.td", deps = [":SPIRVOpsTdFiles"], @@ -7614,15 
+6496,10 @@ cc_library( gentbl_cc_library( name = "SPIRVPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=SPIRV", - ], - "include/mlir/Dialect/SPIRV/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/SPIRV/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=SPIRV", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/SPIRV/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -7968,30 +6845,18 @@ td_library( gentbl_cc_library( name = "TensorOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=tensor", - ], - "include/mlir/Dialect/Tensor/IR/TensorOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=tensor", - ], - "include/mlir/Dialect/Tensor/IR/TensorOpsDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Tensor/IR/TensorOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Tensor/IR/TensorOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Tensor/IR/TensorOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=tensor", + ], + "include/mlir/Dialect/Tensor/IR/TensorOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=tensor", + ], + "include/mlir/Dialect/Tensor/IR/TensorOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Tensor/IR/TensorOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tensor/IR/TensorOps.td", deps = [":TensorOpsTdFiles"], @@ -8114,15 +6979,10 @@ cc_library( gentbl_cc_library( name = "TensorPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Tensor", - ], - "include/mlir/Dialect/Tensor/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Tensor/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Tensor", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tensor/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -8180,16 +7040,10 @@ td_library( gentbl_cc_library( name = "TensorTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td", deps = [ @@ -8277,16 +7131,10 @@ cc_library( gentbl_cc_library( name = "DerivedAttributeOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/DerivedAttributeOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/DerivedAttributeOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/DerivedAttributeOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/DerivedAttributeOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/DerivedAttributeOpInterface.td", deps = [":DerivedAttributeOpInterfaceTdFiles"], @@ -8305,16 +7153,10 @@ cc_library( gentbl_cc_library( name = "DestinationStyleOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/DestinationStyleOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/DestinationStyleOpInterface.cpp.inc", - ), - ], + tbl_outs = { + 
"include/mlir/Interfaces/DestinationStyleOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/DestinationStyleOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/DestinationStyleOpInterface.td", deps = [":DestinationStyleOpInterfaceTdFiles"], @@ -8334,16 +7176,10 @@ cc_library( gentbl_cc_library( name = "InferIntRangeInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/InferIntRangeInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/InferIntRangeInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/InferIntRangeInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/InferIntRangeInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/InferIntRangeInterface.td", deps = [":InferIntRangeInterfaceTdFiles"], @@ -8368,32 +7204,14 @@ td_library( gentbl_cc_library( name = "DataLayoutInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-attr-interface-decls"], - "include/mlir/Interfaces/DataLayoutAttrInterface.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "include/mlir/Interfaces/DataLayoutAttrInterface.cpp.inc", - ), - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/DataLayoutOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/DataLayoutOpInterface.cpp.inc", - ), - ( - ["-gen-type-interface-decls"], - "include/mlir/Interfaces/DataLayoutTypeInterface.h.inc", - ), - ( - ["-gen-type-interface-defs"], - "include/mlir/Interfaces/DataLayoutTypeInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/DataLayoutAttrInterface.h.inc": ["-gen-attr-interface-decls"], + "include/mlir/Interfaces/DataLayoutAttrInterface.cpp.inc": ["-gen-attr-interface-defs"], + "include/mlir/Interfaces/DataLayoutOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/DataLayoutOpInterface.cpp.inc": ["-gen-op-interface-defs"], + "include/mlir/Interfaces/DataLayoutTypeInterface.h.inc": ["-gen-type-interface-decls"], + "include/mlir/Interfaces/DataLayoutTypeInterface.cpp.inc": ["-gen-type-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/DataLayoutInterfaces.td", deps = [":OpBaseTdFiles"], @@ -8401,16 +7219,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LoopLikeInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/LoopLikeInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/LoopLikeInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/LoopLikeInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/LoopLikeInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/LoopLikeInterface.td", deps = [":LoopLikeInterfaceTdFiles"], @@ -8418,24 +7230,12 @@ gentbl_cc_library( gentbl_cc_library( name = "MemorySlotInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/MemorySlotOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/MemorySlotOpInterfaces.cpp.inc", - ), - ( - ["-gen-type-interface-decls"], - "include/mlir/Interfaces/MemorySlotTypeInterfaces.h.inc", - ), - ( - ["-gen-type-interface-defs"], - "include/mlir/Interfaces/MemorySlotTypeInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/MemorySlotOpInterfaces.h.inc": 
["-gen-op-interface-decls"], + "include/mlir/Interfaces/MemorySlotOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + "include/mlir/Interfaces/MemorySlotTypeInterfaces.h.inc": ["-gen-type-interface-decls"], + "include/mlir/Interfaces/MemorySlotTypeInterfaces.cpp.inc": ["-gen-type-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/MemorySlotInterfaces.td", deps = [":MemorySlotInterfacesTdFiles"], @@ -8443,16 +7243,10 @@ gentbl_cc_library( gentbl_cc_library( name = "ShapedOpInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/ShapedOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/ShapedOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/ShapedOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/ShapedOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/ShapedOpInterfaces.td", deps = [":ShapedOpInterfacesTdFiles"], @@ -8460,16 +7254,10 @@ gentbl_cc_library( gentbl_cc_library( name = "ParallelCombiningOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/ParallelCombiningOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/ParallelCombiningOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/ParallelCombiningOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/ParallelCombiningOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/ParallelCombiningOpInterface.td", deps = [":ParallelCombiningOpInterfaceTdFiles"], @@ -8477,16 +7265,10 @@ gentbl_cc_library( gentbl_cc_library( name = "RuntimeVerifiableOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/RuntimeVerifiableOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/RuntimeVerifiableOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/RuntimeVerifiableOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/RuntimeVerifiableOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/RuntimeVerifiableOpInterface.td", deps = [":RuntimeVerifiableOpInterfaceTdFiles"], @@ -8494,16 +7276,10 @@ gentbl_cc_library( gentbl_cc_library( name = "VectorInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/VectorInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/VectorInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/VectorInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/VectorInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/VectorInterfaces.td", deps = [":VectorInterfacesTdFiles"], @@ -8511,16 +7287,10 @@ gentbl_cc_library( gentbl_cc_library( name = "ViewLikeInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/ViewLikeInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/ViewLikeInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/ViewLikeInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/ViewLikeInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = 
"include/mlir/Interfaces/ViewLikeInterface.td", deps = [":ViewLikeInterfaceTdFiles"], @@ -8528,16 +7298,10 @@ gentbl_cc_library( gentbl_cc_library( name = "CopyOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/CopyOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/CopyOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/CopyOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/CopyOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/CopyOpInterface.td", deps = [":CopyOpInterfaceTdFiles"], @@ -8545,29 +7309,20 @@ gentbl_cc_library( gentbl_cc_library( name = "TransformsPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Transforms", - ], - "include/mlir/Transforms/Passes.h.inc", - ), - ( - [ - "-gen-pass-capi-header", - "--prefix=Transforms", - ], - "include/mlir/Transforms/Transforms.capi.h.inc", - ), - ( - [ - "-gen-pass-capi-impl", - "--prefix=Transforms", - ], - "include/mlir/Transforms/Transforms.capi.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Transforms", + ], + "include/mlir/Transforms/Transforms.capi.h.inc": [ + "-gen-pass-capi-header", + "--prefix=Transforms", + ], + "include/mlir/Transforms/Transforms.capi.cpp.inc": [ + "-gen-pass-capi-impl", + "--prefix=Transforms", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Transforms/Passes.td", deps = [ @@ -8767,24 +7522,12 @@ cc_library( gentbl_cc_library( name = "ToLLVMInterfaceIncGen", - tbl_outs = [ - ( - ["--gen-attr-interface-decls"], - "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.h.inc", - ), - ( - ["--gen-attr-interface-defs"], - "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.cpp.inc", - ), - ( - ["--gen-op-interface-decls"], - "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.h.inc", - ), - ( - ["--gen-op-interface-defs"], - "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.h.inc": ["--gen-attr-interface-decls"], + "include/mlir/Conversion/ConvertToLLVM/ToLLVMAttrInterface.cpp.inc": ["--gen-attr-interface-defs"], + "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.h.inc": ["--gen-op-interface-decls"], + "include/mlir/Conversion/ConvertToLLVM/ToLLVMOpInterface.cpp.inc": ["--gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Conversion/ConvertToLLVM/ToLLVMInterface.td", deps = [":UBDialectTdFiles"], @@ -9185,16 +7928,10 @@ cc_library( gentbl_cc_library( name = "BytecodeOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Bytecode/BytecodeOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Bytecode/BytecodeOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Bytecode/BytecodeOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Bytecode/BytecodeOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Bytecode/BytecodeOpInterface.td", deps = [":BytecodeOpInterfaceTdFiles"], @@ -9219,16 +7956,10 @@ cc_library( gentbl_cc_library( name = "CallOpInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/CallInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/CallInterfaces.cpp.inc", - ), - ], + tbl_outs = { 
+ "include/mlir/Interfaces/CallInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/CallInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/CallInterfaces.td", deps = [":CallInterfacesTdFiles"], @@ -9248,16 +7979,10 @@ cc_library( gentbl_cc_library( name = "CastInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/CastInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/CastInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/CastInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/CastInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/CastInterfaces.td", deps = [":CastInterfacesTdFiles"], @@ -9276,16 +8001,10 @@ cc_library( gentbl_cc_library( name = "ControlFlowInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/ControlFlowInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/ControlFlowInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/ControlFlowInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/ControlFlowInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/ControlFlowInterfaces.td", deps = [":ControlFlowInterfacesTdFiles"], @@ -9305,16 +8024,10 @@ cc_library( gentbl_cc_library( name = "InferTypeOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/InferTypeOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/InferTypeOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/InferTypeOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/InferTypeOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/InferTypeOpInterface.td", deps = [":InferTypeOpInterfaceTdFiles"], @@ -9335,16 +8048,10 @@ cc_library( gentbl_cc_library( name = "SideEffectInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/SideEffectInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/SideEffectInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/SideEffectInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/SideEffectInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/SideEffectInterfaces.td", deps = [":SideEffectInterfacesTdFiles"], @@ -9364,16 +8071,10 @@ cc_library( gentbl_cc_library( name = "TilingInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/TilingInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/TilingInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/TilingInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/TilingInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/TilingInterface.td", deps = [":TilingInterfaceTdFiles"], @@ -10564,16 +9265,10 @@ td_library( gentbl_cc_library( name = "AtomicInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.h.inc", - ), - ( - 
["-gen-op-interface-defs"], - "include/mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td", deps = [ @@ -10600,15 +9295,10 @@ cc_library( gentbl_cc_library( name = "AccCommonGen", includes = ["/llvm/include"], - tbl_outs = [ - ( - [ - "-gen-directive-decl", - "-directives-dialect=OpenACC", - ], - "include/mlir/Dialect/OpenACC/AccCommon.td", - ), - ], + tbl_outs = {"include/mlir/Dialect/OpenACC/AccCommon.td": [ + "-gen-directive-decl", + "-directives-dialect=OpenACC", + ]}, tblgen = ":mlir-tblgen", td_file = "//llvm:include/llvm/Frontend/OpenACC/ACC.td", deps = ["//llvm:AccTdFiles"], @@ -10636,16 +9326,10 @@ td_library( gentbl_cc_library( name = "OpenACCOpsInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td", deps = [":OpenAccOpsTdFiles"], @@ -10653,16 +9337,10 @@ gentbl_cc_library( gentbl_cc_library( name = "OpenACCMPOpsInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenACCMPCommon/Interfaces/OpenACCMPOpsInterfaces.td", deps = [":OpenAccOpsTdFiles"], @@ -10670,56 +9348,29 @@ gentbl_cc_library( gentbl_cc_library( name = "OpenACCOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=acc", - ], - "include/mlir/Dialect/OpenACC/OpenACCOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=acc", - ], - "include/mlir/Dialect/OpenACC/OpenACCOpsDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/OpenACC/OpenACCOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/OpenACC/OpenACCOps.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/OpenACC/OpenACCOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/OpenACC/OpenACCOpsEnums.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=acc", - ], - "include/mlir/Dialect/OpenACC/OpenACCOpsAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=acc", - ], - "include/mlir/Dialect/OpenACC/OpenACCOpsAttributes.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/OpenACC/OpenACCOps.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenACC/OpenACCOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=acc", + ], + "include/mlir/Dialect/OpenACC/OpenACCOpsDialect.cpp.inc": [ + 
"-gen-dialect-defs", + "-dialect=acc", + ], + "include/mlir/Dialect/OpenACC/OpenACCOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/OpenACC/OpenACCOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/OpenACC/OpenACCOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/OpenACC/OpenACCOpsEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/OpenACC/OpenACCOpsAttributes.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=acc", + ], + "include/mlir/Dialect/OpenACC/OpenACCOpsAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=acc", + ], + "g3doc/Dialects/OpenACC/OpenACCOps.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenACC/OpenACCOps.td", deps = [ @@ -10732,22 +9383,16 @@ gentbl_cc_library( gentbl_cc_library( name = "OpenACCTypesIncGen", - tbl_outs = [ - ( - [ - "--gen-typedef-decls", - "-typedefs-dialect=acc", - ], - "include/mlir/Dialect/OpenACC/OpenACCOpsTypes.h.inc", - ), - ( - [ - "--gen-typedef-defs", - "-typedefs-dialect=acc", - ], - "include/mlir/Dialect/OpenACC/OpenACCOpsTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenACC/OpenACCOpsTypes.h.inc": [ + "--gen-typedef-decls", + "-typedefs-dialect=acc", + ], + "include/mlir/Dialect/OpenACC/OpenACCOpsTypes.cpp.inc": [ + "--gen-typedef-defs", + "-typedefs-dialect=acc", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td", deps = [":OpenAccOpsTdFiles"], @@ -10755,16 +9400,10 @@ gentbl_cc_library( gentbl_cc_library( name = "OpenACCTypeInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-type-interface-decls"], - "include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.h.inc", - ), - ( - ["-gen-type-interface-defs"], - "include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.h.inc": ["-gen-type-interface-decls"], + "include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.cpp.inc": ["-gen-type-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenACC/OpenACCTypeInterfaces.td", deps = [":OpenAccOpsTdFiles"], @@ -10806,15 +9445,10 @@ cc_library( gentbl_cc_library( name = "OpenACCPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=OpenACC", - ], - "include/mlir/Dialect/OpenACC/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/OpenACC/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=OpenACC", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenACC/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -10845,15 +9479,10 @@ cc_library( gentbl_cc_library( name = "OmpCommonTdGen", includes = ["/llvm/include"], - tbl_outs = [ - ( - [ - "-gen-directive-decl", - "-directives-dialect=OpenMP", - ], - "include/mlir/Dialect/OpenMP/OmpCommon.td", - ), - ], + tbl_outs = {"include/mlir/Dialect/OpenMP/OmpCommon.td": [ + "-gen-directive-decl", + "-directives-dialect=OpenMP", + ]}, tblgen = ":mlir-tblgen", td_file = "//llvm:include/llvm/Frontend/OpenMP/OMP.td", deps = [ @@ -10886,74 +9515,38 @@ td_library( gentbl_cc_library( name = "OpenMPOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-typedef-decls", - "-typedefs-dialect=omp", - ], - "include/mlir/Dialect/OpenMP/OpenMPOpsTypes.h.inc", - ), - ( - [ - "-gen-typedef-defs", - "-typedefs-dialect=omp", - ], - "include/mlir/Dialect/OpenMP/OpenMPOpsTypes.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/OpenMP/OpenMPOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/OpenMP/OpenMPOps.cpp.inc", - ), - ( - 
["-gen-enum-decls"], - "include/mlir/Dialect/OpenMP/OpenMPOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/OpenMP/OpenMPOpsEnums.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=omp", - ], - "include/mlir/Dialect/OpenMP/OpenMPOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=omp", - ], - "include/mlir/Dialect/OpenMP/OpenMPOpsDialect.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=omp", - ], - "include/mlir/Dialect/OpenMP/OpenMPOpsAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=omp", - ], - "include/mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/OpenMP/OpenMPOps.md", - ), - ( - ["-gen-openmp-clause-ops"], - "include/mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenMP/OpenMPOpsTypes.h.inc": [ + "-gen-typedef-decls", + "-typedefs-dialect=omp", + ], + "include/mlir/Dialect/OpenMP/OpenMPOpsTypes.cpp.inc": [ + "-gen-typedef-defs", + "-typedefs-dialect=omp", + ], + "include/mlir/Dialect/OpenMP/OpenMPOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/OpenMP/OpenMPOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/OpenMP/OpenMPOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/OpenMP/OpenMPOpsEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/OpenMP/OpenMPOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=omp", + ], + "include/mlir/Dialect/OpenMP/OpenMPOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=omp", + ], + "include/mlir/Dialect/OpenMP/OpenMPOpsAttributes.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=omp", + ], + "include/mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=omp", + ], + "g3doc/Dialects/OpenMP/OpenMPOps.md": ["-gen-op-doc"], + "include/mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc": ["-gen-openmp-clause-ops"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenMP/OpenMPOps.td", deps = [":OpenMPOpsTdFiles"], @@ -10961,16 +9554,10 @@ gentbl_cc_library( gentbl_cc_library( name = "OpenMPTypeInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-type-interface-decls"], - "include/mlir/Dialect/OpenMP/OpenMPTypeInterfaces.h.inc", - ), - ( - ["-gen-type-interface-defs"], - "include/mlir/Dialect/OpenMP/OpenMPTypeInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenMP/OpenMPTypeInterfaces.h.inc": ["-gen-type-interface-decls"], + "include/mlir/Dialect/OpenMP/OpenMPTypeInterfaces.cpp.inc": ["-gen-type-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenMP/OpenMPTypeInterfaces.td", deps = [":OpenMPOpsTdFiles"], @@ -10978,16 +9565,10 @@ gentbl_cc_library( gentbl_cc_library( name = "OpenMPInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td", deps = [":OpenMPOpsTdFiles"], @@ -11085,28 +9666,13 @@ td_library( gentbl_cc_library( name = "QuantOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Quant/IR/QuantOps.h.inc", - ), - ( - ["-gen-op-defs"], - 
"include/mlir/Dialect/Quant/IR/QuantOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/Quant/IR/QuantOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/Quant/IR/QuantOpsDialect.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/QuantOps/QuantOps.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Quant/IR/QuantOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Quant/IR/QuantOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Quant/IR/QuantOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/Quant/IR/QuantOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "g3doc/Dialects/QuantOps/QuantOps.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Quant/IR/QuantOps.td", deps = [":QuantizationOpsTdFiles"], @@ -11114,15 +9680,10 @@ gentbl_cc_library( gentbl_cc_library( name = "QuantDialectBytecodeGen", - tbl_outs = [ - ( - [ - "-gen-bytecode", - "-bytecode-dialect=Quant", - ], - "include/mlir/Dialect/Quant/IR/QuantDialectBytecode.cpp.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Quant/IR/QuantDialectBytecode.cpp.inc": [ + "-gen-bytecode", + "-bytecode-dialect=Quant", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Quant/IR/QuantDialectBytecode.td", deps = [ @@ -11181,38 +9742,20 @@ td_library( gentbl_cc_library( name = "IndexOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Index/IR/IndexOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Index/IR/IndexOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=index", - ], - "include/mlir/Dialect/Index/IR/IndexOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=index", - ], - "include/mlir/Dialect/Index/IR/IndexOpsDialect.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/Index/IR/IndexOpsAttrDefs.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/Index/IR/IndexOpsAttrDefs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Index/IR/IndexOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Index/IR/IndexOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Index/IR/IndexOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=index", + ], + "include/mlir/Dialect/Index/IR/IndexOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=index", + ], + "include/mlir/Dialect/Index/IR/IndexOpsAttrDefs.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/Index/IR/IndexOpsAttrDefs.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Index/IR/IndexOps.td", deps = [":IndexOpsTdFiles"], @@ -11220,30 +9763,18 @@ gentbl_cc_library( gentbl_cc_library( name = "IndexEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Index/IR/IndexEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Index/IR/IndexEnums.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=index", - ], - "include/mlir/Dialect/Index/IR/IndexAttrs.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=index", - ], - "include/mlir/Dialect/Index/IR/IndexAttrs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Index/IR/IndexEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Index/IR/IndexEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/Index/IR/IndexAttrs.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=index", + ], + "include/mlir/Dialect/Index/IR/IndexAttrs.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=index", + ], + }, tblgen = 
":mlir-tblgen", td_file = "include/mlir/Dialect/Index/IR/IndexEnums.td", deps = [":IndexOpsTdFiles"], @@ -11346,38 +9877,20 @@ td_library( gentbl_cc_library( name = "LinalgOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Linalg/IR/LinalgOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Linalg/IR/LinalgOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=linalg", - ], - "include/mlir/Dialect/Linalg/IR/LinalgOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=linalg", - ], - "include/mlir/Dialect/Linalg/IR/LinalgOpsDialect.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/Linalg/IR/LinalgOpsAttrDefs.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/Linalg/IR/LinalgOpsAttrDefs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/IR/LinalgOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Linalg/IR/LinalgOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=linalg", + ], + "include/mlir/Dialect/Linalg/IR/LinalgOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=linalg", + ], + "include/mlir/Dialect/Linalg/IR/LinalgOpsAttrDefs.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgOpsAttrDefs.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/IR/LinalgOps.td", deps = [":LinalgOpsTdFiles"], @@ -11385,16 +9898,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LinalgRelayoutOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td", deps = [":LinalgOpsTdFiles"], @@ -11402,16 +9909,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LinalgEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgOpsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/IR/LinalgEnums.td", deps = [":LinalgOpsTdFiles"], @@ -11419,16 +9920,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LinalgMatchOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Linalg/TransformOps/LinalgMatchOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/TransformOps/LinalgMatchOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/TransformOps/LinalgMatchOps.td", deps = [ @@ -11441,16 +9936,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LinalgTransformEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - 
"include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOpsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOpsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformEnums.td", deps = [ @@ -11461,16 +9950,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LinalgTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td", deps = [ @@ -11515,16 +9998,10 @@ td_library( gentbl_cc_library( name = "LinalgStructuredOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td", deps = [":LinalgStructuredOpsTdFiles"], @@ -11532,16 +10009,10 @@ gentbl_cc_library( gentbl_cc_library( name = "LinalgInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Linalg/IR/LinalgInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Linalg/IR/LinalgInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td", deps = [":LinalgStructuredOpsTdFiles"], @@ -11560,16 +10031,10 @@ td_library( gentbl_cc_library( name = "BufferizableOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td", deps = [ @@ -11590,16 +10055,10 @@ td_library( gentbl_cc_library( name = "BufferDeallocationOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.cpp.inc": 
["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/IR/BufferDeallocationOpInterface.td", deps = [ @@ -11620,16 +10079,10 @@ td_library( gentbl_cc_library( name = "BufferViewFlowOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Bufferization/IR/BufferViewFlowOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Bufferization/IR/BufferViewFlowOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/BufferViewFlowOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Bufferization/IR/BufferViewFlowOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/IR/BufferViewFlowOpInterface.td", deps = [ @@ -11650,16 +10103,10 @@ td_library( gentbl_cc_library( name = "SubsetOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/SubsetOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/SubsetOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/SubsetOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/SubsetOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/SubsetOpInterface.td", deps = [ @@ -11692,12 +10139,7 @@ td_library( gentbl_cc_library( name = "LinalgDocIncGen", - tbl_outs = [ - ( - ["-gen-op-doc"], - "g3doc/Dialects/Linalg/LinalgOps.md", - ), - ], + tbl_outs = {"g3doc/Dialects/Linalg/LinalgOps.md": ["-gen-op-doc"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/IR/LinalgDoc.td", deps = [":LinalgDocTdFiles"], @@ -11849,29 +10291,20 @@ cc_library( gentbl_cc_library( name = "LinalgPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Linalg", - ], - "include/mlir/Dialect/Linalg/Passes.h.inc", - ), - ( - [ - "-gen-pass-capi-header", - "--prefix=Linalg", - ], - "include/mlir/Dialect/Linalg/Passes.capi.h.inc", - ), - ( - [ - "-gen-pass-capi-impl", - "--prefix=Linalg", - ], - "include/mlir/Dialect/Linalg/Passes.capi.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Linalg/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Linalg", + ], + "include/mlir/Dialect/Linalg/Passes.capi.h.inc": [ + "-gen-pass-capi-header", + "--prefix=Linalg", + ], + "include/mlir/Dialect/Linalg/Passes.capi.cpp.inc": [ + "-gen-pass-capi-impl", + "--prefix=Linalg", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Linalg/Passes.td", deps = [":PassBaseTdFiles"], @@ -11985,16 +10418,10 @@ td_library( gentbl_cc_library( name = "ValueBoundsOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Interfaces/ValueBoundsOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Interfaces/ValueBoundsOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Interfaces/ValueBoundsOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/ValueBoundsOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Interfaces/ValueBoundsOpInterface.td", deps = [ @@ -12097,16 +10524,10 @@ td_library( gentbl_cc_library( name = "MaskableOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Vector/Interfaces/MaskableOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - 
"include/mlir/Dialect/Vector/Interfaces/MaskableOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Vector/Interfaces/MaskableOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Vector/Interfaces/MaskableOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/Interfaces/MaskableOpInterface.td", deps = [":MaskableOpInterfaceTdFiles"], @@ -12114,16 +10535,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MaskingOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Vector/Interfaces/MaskingOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Vector/Interfaces/MaskingOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Vector/Interfaces/MaskingOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Vector/Interfaces/MaskingOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/Interfaces/MaskingOpInterface.td", deps = [":MaskingOpInterfaceTdFiles"], @@ -12131,26 +10546,17 @@ gentbl_cc_library( gentbl_cc_library( name = "VectorDialectIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=vector", - ], - "include/mlir/Dialect/Vector/IR/VectorDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=vector", - ], - "include/mlir/Dialect/Vector/IR/VectorDialect.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/Vector/Vector.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Vector/IR/VectorDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=vector", + ], + "include/mlir/Dialect/Vector/IR/VectorDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=vector", + ], + "g3doc/Dialects/Vector/Vector.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/IR/Vector.td", deps = [ @@ -12161,16 +10567,10 @@ gentbl_cc_library( gentbl_cc_library( name = "VectorOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Vector/IR/VectorOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Vector/IR/VectorOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Vector/IR/VectorOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Vector/IR/VectorOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/IR/VectorOps.td", deps = [ @@ -12181,24 +10581,12 @@ gentbl_cc_library( gentbl_cc_library( name = "VectorAttributesIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Vector/IR/VectorEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Vector/IR/VectorEnums.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/Vector/IR/VectorAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/Vector/IR/VectorAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Vector/IR/VectorEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Vector/IR/VectorEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/Vector/IR/VectorAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/Vector/IR/VectorAttributes.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/IR/VectorAttributes.td", deps = [ @@ -12209,16 +10597,10 @@ gentbl_cc_library( gentbl_cc_library( name = "VectorTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - 
"include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td", deps = [ @@ -12381,44 +10763,17 @@ td_library( gentbl_cc_library( name = "TosaDialectIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Tosa/IR/TosaOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Tosa/IR/TosaOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/Tosa/IR/TosaOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/Tosa/IR/TosaOpsDialect.cpp.inc", - ), - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/Tosa/IR/TosaOpsTypesBase.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/Tosa/IR/TosaOpsTypesBase.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/Tosa/IR/TosaAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/Tosa/IR/TosaAttributes.cpp.inc", - ), - ( - ["-gen-op-doc"], - "g3doc/Dialects/Tosa/TosaOps.md", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Tosa/IR/TosaOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Tosa/IR/TosaOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Tosa/IR/TosaOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/Tosa/IR/TosaOpsDialect.cpp.inc": ["-gen-dialect-defs"], + "include/mlir/Dialect/Tosa/IR/TosaOpsTypesBase.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/Tosa/IR/TosaOpsTypesBase.cpp.inc": ["-gen-typedef-defs"], + "include/mlir/Dialect/Tosa/IR/TosaAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/Tosa/IR/TosaAttributes.cpp.inc": ["-gen-attrdef-defs"], + "g3doc/Dialects/Tosa/TosaOps.md": ["-gen-op-doc"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tosa/IR/TosaOps.td", deps = [":TosaDialectTdFiles"], @@ -12426,16 +10781,10 @@ gentbl_cc_library( gentbl_cc_library( name = "TosaEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Tosa/IR/TosaEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Tosa/IR/TosaEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Tosa/IR/TosaEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Tosa/IR/TosaEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tosa/IR/TosaOpBase.td", deps = [":TosaDialectTdFiles"], @@ -12443,20 +10792,11 @@ gentbl_cc_library( gentbl_cc_library( name = "TosaAvailabilityIncGen", - tbl_outs = [ - ( - ["-gen-avail-interface-decls"], - "include/mlir/Dialect/Tosa/IR/TosaAvailability.h.inc", - ), - ( - ["-gen-avail-interface-defs"], - "include/mlir/Dialect/Tosa/IR/TosaAvailability.cpp.inc", - ), - ( - ["-gen-tosa-avail-impls"], - "include/mlir/Dialect/Tosa/IR/TosaOpAvailabilityImpl.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Tosa/IR/TosaAvailability.h.inc": ["-gen-avail-interface-decls"], + "include/mlir/Dialect/Tosa/IR/TosaAvailability.cpp.inc": ["-gen-avail-interface-defs"], + "include/mlir/Dialect/Tosa/IR/TosaOpAvailabilityImpl.inc": ["-gen-tosa-avail-impls"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tosa/IR/TosaOps.td", deps = [":TosaDialectTdFiles"], @@ -12465,15 +10805,10 @@ 
gentbl_cc_library( gentbl_cc_library( name = "TosaDialectBytecodeGen", strip_include_prefix = "include", - tbl_outs = [ - ( - [ - "-gen-bytecode", - "-bytecode-dialect=Tosa", - ], - "include/mlir/Dialect/Tosa/IR/TosaDialectBytecode.cpp.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Tosa/IR/TosaDialectBytecode.cpp.inc": [ + "-gen-bytecode", + "-bytecode-dialect=Tosa", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tosa/IR/TosaDialectBytecode.td", deps = [ @@ -12483,16 +10818,10 @@ gentbl_cc_library( gentbl_cc_library( name = "TosaInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Tosa/IR/TosaInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Tosa/IR/TosaInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Tosa/IR/TosaInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Tosa/IR/TosaInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tosa/IR/TosaInterfaces.td", deps = [":TosaDialectTdFiles"], @@ -12500,29 +10829,20 @@ gentbl_cc_library( gentbl_cc_library( name = "TosaPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=TosaOpt", - ], - "include/mlir/Dialect/Tosa/Transforms/Passes.h.inc", - ), - ( - [ - "-gen-enum-decls", - "-name=TosaOpt", - ], - "include/mlir/Dialect/Tosa/Transforms/PassesEnums.h.inc", - ), - ( - [ - "-gen-enum-defs", - "-name=TosaOpt", - ], - "include/mlir/Dialect/Tosa/Transforms/PassesEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Tosa/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=TosaOpt", + ], + "include/mlir/Dialect/Tosa/Transforms/PassesEnums.h.inc": [ + "-gen-enum-decls", + "-name=TosaOpt", + ], + "include/mlir/Dialect/Tosa/Transforms/PassesEnums.cpp.inc": [ + "-gen-enum-defs", + "-name=TosaOpt", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Tosa/Transforms/Passes.td", deps = [ @@ -12726,20 +11046,14 @@ td_library( gentbl_cc_library( name = "TransformDialectEnumsIncGen", - tbl_outs = [ - ( - [ - "-gen-enum-decls", - ], - "include/mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc", - ), - ( - [ - "-gen-enum-defs", - ], - "include/mlir/Dialect/Transform/IR/TransformDialectEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc": [ + "-gen-enum-decls", + ], + "include/mlir/Dialect/Transform/IR/TransformDialectEnums.cpp.inc": [ + "-gen-enum-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/IR/TransformAttrs.td", deps = [":TransformDialectTdFiles"], @@ -12747,20 +11061,14 @@ gentbl_cc_library( gentbl_cc_library( name = "TransformDialectMatchInterfacesIncGen", - tbl_outs = [ - ( - [ - "-gen-op-interface-decls", - ], - "include/mlir/Dialect/Transform/Interfaces/MatchInterfaces.h.inc", - ), - ( - [ - "-gen-op-interface-defs", - ], - "include/mlir/Dialect/Transform/Interfaces/MatchInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/Interfaces/MatchInterfaces.h.inc": [ + "-gen-op-interface-decls", + ], + "include/mlir/Dialect/Transform/Interfaces/MatchInterfaces.cpp.inc": [ + "-gen-op-interface-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/Interfaces/MatchInterfaces.td", deps = [ @@ -12771,32 +11079,20 @@ gentbl_cc_library( gentbl_cc_library( name = "TransformDialectInterfacesIncGen", - tbl_outs = [ - ( - [ - "-gen-op-interface-decls", - ], - 
"include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h.inc", - ), - ( - [ - "-gen-op-interface-defs", - ], - "include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.cpp.inc", - ), - ( - [ - "-gen-type-interface-decls", - ], - "include/mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.h.inc", - ), - ( - [ - "-gen-type-interface-defs", - ], - "include/mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h.inc": [ + "-gen-op-interface-decls", + ], + "include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.cpp.inc": [ + "-gen-op-interface-defs", + ], + "include/mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.h.inc": [ + "-gen-type-interface-decls", + ], + "include/mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.cpp.inc": [ + "-gen-type-interface-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.td", deps = [":TransformDialectTdFiles"], @@ -12804,20 +11100,14 @@ gentbl_cc_library( gentbl_cc_library( name = "TransformDialectIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - ], - "include/mlir/Dialect/Transform/IR/TransformDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - ], - "include/mlir/Dialect/Transform/IR/TransformDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/IR/TransformDialect.h.inc": [ + "-gen-dialect-decls", + ], + "include/mlir/Dialect/Transform/IR/TransformDialect.cpp.inc": [ + "-gen-dialect-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/IR/TransformDialect.td", deps = [":TransformDialectTdFiles"], @@ -12825,16 +11115,10 @@ gentbl_cc_library( gentbl_cc_library( name = "TransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Transform/IR/TransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Transform/IR/TransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/IR/TransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Transform/IR/TransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/IR/TransformOps.td", deps = [ @@ -12846,16 +11130,10 @@ gentbl_cc_library( gentbl_cc_library( name = "TransformTypesIncGen", - tbl_outs = [ - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/Transform/IR/TransformTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/Transform/IR/TransformTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/IR/TransformTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/Transform/IR/TransformTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/IR/TransformTypes.td", deps = [":TransformDialectTdFiles"], @@ -12923,20 +11201,14 @@ td_library( gentbl_cc_library( name = "TransformPDLExtensionOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-op-decls", - ], - "include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h.inc", - ), - ( - [ - "-gen-op-defs", - ], - "include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h.inc": [ + "-gen-op-decls", + ], + "include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.cpp.inc": [ + "-gen-op-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = 
"include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.td", deps = [":TransformPDLExtensionTdFiles"], @@ -12962,20 +11234,14 @@ cc_library( gentbl_cc_library( name = "TransformIRDLExtensionOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-op-decls", - ], - "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h.inc", - ), - ( - [ - "-gen-op-defs", - ], - "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.h.inc": [ + "-gen-op-decls", + ], + "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.cpp.inc": [ + "-gen-op-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/IRDLExtension/IRDLExtensionOps.td", deps = [":TransformPDLExtensionTdFiles"], @@ -13009,20 +11275,14 @@ td_library( gentbl_cc_library( name = "TransformDebugExtensionOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-op-decls", - ], - "include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h.inc", - ), - ( - [ - "-gen-op-defs", - ], - "include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h.inc": [ + "-gen-op-decls", + ], + "include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.cpp.inc": [ + "-gen-op-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td", deps = [":TransformDebugExtensionTdFiles"], @@ -13052,20 +11312,14 @@ td_library( gentbl_cc_library( name = "TransformLoopExtensionOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-op-decls", - ], - "include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h.inc", - ), - ( - [ - "-gen-op-defs", - ], - "include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h.inc": [ + "-gen-op-decls", + ], + "include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.cpp.inc": [ + "-gen-op-defs", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.td", deps = [":TransformLoopExtensionTdFiles"], @@ -13101,15 +11355,10 @@ td_library( gentbl_cc_library( name = "TransformDialectTransformsIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Transform", - ], - "include/mlir/Dialect/Transform/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Transform/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Transform", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/Transforms/Passes.td", deps = [":TransformDialectTransformsTdFiles"], @@ -13166,24 +11415,12 @@ td_library( gentbl_cc_library( name = "ComplexAttributesIncGen", - tbl_outs = [ - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/Complex/IR/ComplexAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/Complex/IR/ComplexAttributes.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Complex/IR/ComplexEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Complex/IR/ComplexEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Complex/IR/ComplexAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/Complex/IR/ComplexAttributes.cpp.inc": ["-gen-attrdef-defs"], + "include/mlir/Dialect/Complex/IR/ComplexEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Complex/IR/ComplexEnums.cpp.inc": ["-gen-enum-defs"], 
+ }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Complex/IR/ComplexAttributes.td", deps = [":ComplexOpsTdFiles"], @@ -13191,22 +11428,16 @@ gentbl_cc_library( gentbl_cc_library( name = "ComplexBaseIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=complex", - ], - "include/mlir/Dialect/Complex/IR/ComplexOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=complex", - ], - "include/mlir/Dialect/Complex/IR/ComplexOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Complex/IR/ComplexOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=complex", + ], + "include/mlir/Dialect/Complex/IR/ComplexOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=complex", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Complex/IR/ComplexBase.td", deps = [":ComplexOpsTdFiles"], @@ -13214,16 +11445,10 @@ gentbl_cc_library( gentbl_cc_library( name = "ComplexOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Complex/IR/ComplexOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Complex/IR/ComplexOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Complex/IR/ComplexOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Complex/IR/ComplexOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Complex/IR/ComplexOps.td", deps = [":ComplexOpsTdFiles"], @@ -13395,30 +11620,18 @@ td_library( gentbl_cc_library( name = "ArithBaseIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=arith", - ], - "include/mlir/Dialect/Arith/IR/ArithOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=arith", - ], - "include/mlir/Dialect/Arith/IR/ArithOpsDialect.cpp.inc", - ), - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Arith/IR/ArithOpsEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Arith/IR/ArithOpsEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Arith/IR/ArithOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=arith", + ], + "include/mlir/Dialect/Arith/IR/ArithOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=arith", + ], + "include/mlir/Dialect/Arith/IR/ArithOpsEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Arith/IR/ArithOpsEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Arith/IR/ArithBase.td", deps = [":ArithOpsTdFiles"], @@ -13426,30 +11639,18 @@ gentbl_cc_library( gentbl_cc_library( name = "ArithOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Arith/IR/ArithOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Arith/IR/ArithOps.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=arith", - ], - "include/mlir/Dialect/Arith/IR/ArithOpsAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=arith", - ], - "include/mlir/Dialect/Arith/IR/ArithOpsAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Arith/IR/ArithOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Arith/IR/ArithOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/Arith/IR/ArithOpsAttributes.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=arith", + ], + "include/mlir/Dialect/Arith/IR/ArithOpsAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=arith", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Arith/IR/ArithOps.td", deps = [ @@ -13459,16 +11660,10 @@ gentbl_cc_library( gentbl_cc_library( name = "ArithOpsInterfacesIncGen", 
- tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Arith/IR/ArithOpsInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Arith/IR/ArithOpsInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Arith/IR/ArithOpsInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Arith/IR/ArithOpsInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Arith/IR/ArithOpsInterfaces.td", deps = [ @@ -13479,12 +11674,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ArithCanonicalizationIncGen", strip_include_prefix = "include/mlir/Dialect/Arith/IR", - tbl_outs = [ - ( - ["-gen-rewriters"], - "include/mlir/Dialect/Arith/IR/ArithCanonicalization.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Arith/IR/ArithCanonicalization.inc": ["-gen-rewriters"]}, tblgen = ":mlir-tblgen", td_file = "lib/Dialect/Arith/IR/ArithCanonicalization.td", deps = [ @@ -13539,15 +11729,10 @@ cc_library( gentbl_cc_library( name = "ArithPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Arith", - ], - "include/mlir/Dialect/Arith/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Arith/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Arith", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Arith/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -13621,22 +11806,16 @@ td_library( gentbl_cc_library( name = "MathBaseIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=math", - ], - "include/mlir/Dialect/Math/IR/MathOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=math", - ], - "include/mlir/Dialect/Math/IR/MathOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Math/IR/MathOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=math", + ], + "include/mlir/Dialect/Math/IR/MathOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=math", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Math/IR/MathBase.td", deps = [":MathOpsTdFiles"], @@ -13644,16 +11823,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MathOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Math/IR/MathOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Math/IR/MathOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Math/IR/MathOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Math/IR/MathOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Math/IR/MathOps.td", deps = [":MathOpsTdFiles"], @@ -13661,15 +11834,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MathPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Math", - ], - "include/mlir/Dialect/Math/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Math/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Math", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Math/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -13771,22 +11939,16 @@ td_library( gentbl_cc_library( name = "MemRefBaseIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=memref", - ], - "include/mlir/Dialect/MemRef/IR/MemRefOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=memref", - ], - "include/mlir/Dialect/MemRef/IR/MemRefOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MemRef/IR/MemRefOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=memref", + ], + 
"include/mlir/Dialect/MemRef/IR/MemRefOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=memref", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MemRef/IR/MemRefBase.td", deps = [":MemRefOpsTdFiles"], @@ -13794,16 +11956,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MemRefOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/MemRef/IR/MemRefOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/MemRef/IR/MemRefOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MemRef/IR/MemRefOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/MemRef/IR/MemRefOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MemRef/IR/MemRefOps.td", deps = [ @@ -13876,15 +12032,10 @@ cc_library( gentbl_cc_library( name = "MemRefPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=MemRef", - ], - "include/mlir/Dialect/MemRef/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/MemRef/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=MemRef", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MemRef/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -13945,16 +12096,10 @@ td_library( gentbl_cc_library( name = "MemRefTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td", deps = [ @@ -14016,24 +12161,12 @@ td_library( gentbl_cc_library( name = "MLProgramOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/MLProgram/IR/MLProgramOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/MLProgram/IR/MLProgramOps.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/MLProgram/IR/MLProgramOpsDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/MLProgram/IR/MLProgramOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MLProgram/IR/MLProgramOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/MLProgram/IR/MLProgramOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/MLProgram/IR/MLProgramOpsDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/MLProgram/IR/MLProgramOpsDialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MLProgram/IR/MLProgramOps.td", deps = [":MLProgramOpsTdFiles"], @@ -14041,16 +12174,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MLProgramAttributesIncGen", - tbl_outs = [ - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/MLProgram/IR/MLProgramAttributes.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/MLProgram/IR/MLProgramAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MLProgram/IR/MLProgramAttributes.h.inc": ["-gen-attrdef-decls"], + "include/mlir/Dialect/MLProgram/IR/MLProgramAttributes.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MLProgram/IR/MLProgramAttributes.td", deps = [":MLProgramOpsTdFiles"], @@ -14058,15 +12185,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MLProgramPassIncGen", - tbl_outs = [ - ( - 
[ - "-gen-pass-decls", - "-name=MLProgram", - ], - "include/mlir/Dialect/MLProgram/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/MLProgram/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=MLProgram", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MLProgram/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -14074,16 +12196,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MLProgramTypesIncGen", - tbl_outs = [ - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/MLProgram/IR/MLProgramTypes.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/MLProgram/IR/MLProgramTypes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MLProgram/IR/MLProgramTypes.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/MLProgram/IR/MLProgramTypes.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MLProgram/IR/MLProgramTypes.td", deps = [":MLProgramOpsTdFiles"], @@ -14150,24 +12266,12 @@ td_library( gentbl_cc_library( name = "MPIIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/MPI/IR/MPI.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/MPI/IR/MPI.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/mlir/Dialect/MPI/IR/MPIDialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/mlir/Dialect/MPI/IR/MPIDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MPI/IR/MPI.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/MPI/IR/MPI.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/MPI/IR/MPIDialect.h.inc": ["-gen-dialect-decls"], + "include/mlir/Dialect/MPI/IR/MPIDialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MPI/IR/MPI.td", deps = [":MPITdFiles"], @@ -14175,16 +12279,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MPIOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/MPI/IR/MPIOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/MPI/IR/MPIOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MPI/IR/MPIOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/MPI/IR/MPIOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MPI/IR/MPIOps.td", deps = [":MPITdFiles"], @@ -14192,16 +12290,10 @@ gentbl_cc_library( gentbl_cc_library( name = "MPITypesIncGen", - tbl_outs = [ - ( - ["-gen-typedef-decls"], - "include/mlir/Dialect/MPI/IR/MPITypesGen.h.inc", - ), - ( - ["-gen-typedef-defs"], - "include/mlir/Dialect/MPI/IR/MPITypesGen.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MPI/IR/MPITypesGen.h.inc": ["-gen-typedef-decls"], + "include/mlir/Dialect/MPI/IR/MPITypesGen.cpp.inc": ["-gen-typedef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MPI/IR/MPITypes.td", deps = [":MPITdFiles"], @@ -14209,24 +12301,12 @@ gentbl_cc_library( gentbl_cc_library( name = "MPIAttrsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/MPI/IR/MPIEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/MPI/IR/MPIEnums.cpp.inc", - ), - ( - ["-gen-attrdef-decls"], - "include/mlir/Dialect/MPI/IR/MPIAttrDefs.h.inc", - ), - ( - ["-gen-attrdef-defs"], - "include/mlir/Dialect/MPI/IR/MPIAttrDefs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/MPI/IR/MPIEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/MPI/IR/MPIEnums.cpp.inc": ["-gen-enum-defs"], + "include/mlir/Dialect/MPI/IR/MPIAttrDefs.h.inc": ["-gen-attrdef-decls"], + 
"include/mlir/Dialect/MPI/IR/MPIAttrDefs.cpp.inc": ["-gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/MPI/IR/MPI.td", deps = [":MPITdFiles"], @@ -14279,16 +12359,10 @@ td_library( gentbl_cc_library( name = "AllocationOpInterfaceIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/mlir/Dialect/Bufferization/IR/AllocationOpInterface.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/mlir/Dialect/Bufferization/IR/AllocationOpInterface.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/AllocationOpInterface.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Dialect/Bufferization/IR/AllocationOpInterface.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/IR/AllocationOpInterface.td", deps = [":AllocationOpInterfaceTdFiles"], @@ -14328,22 +12402,16 @@ td_library( gentbl_cc_library( name = "BufferizationBaseIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=bufferization", - ], - "include/mlir/Dialect/Bufferization/IR/BufferizationOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=bufferization", - ], - "include/mlir/Dialect/Bufferization/IR/BufferizationOpsDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/BufferizationOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=bufferization", + ], + "include/mlir/Dialect/Bufferization/IR/BufferizationOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=bufferization", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/IR/BufferizationBase.td", deps = [":BufferizationOpsTdFiles"], @@ -14362,16 +12430,10 @@ td_library( gentbl_cc_library( name = "BufferizationEnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "include/mlir/Dialect/Bufferization/IR/BufferizationEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "include/mlir/Dialect/Bufferization/IR/BufferizationEnums.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/BufferizationEnums.h.inc": ["-gen-enum-decls"], + "include/mlir/Dialect/Bufferization/IR/BufferizationEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/IR/BufferizationEnums.td", deps = [":BufferizationEnumsTdFiles"], @@ -14391,16 +12453,10 @@ td_library( gentbl_cc_library( name = "BufferizationTransformOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td", deps = [ @@ -14436,16 +12492,10 @@ cc_library( gentbl_cc_library( name = "BufferizationOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/Bufferization/IR/BufferizationOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/Bufferization/IR/BufferizationOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/Bufferization/IR/BufferizationOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/Bufferization/IR/BufferizationOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = 
":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/IR/BufferizationOps.td", deps = [ @@ -14521,15 +12571,10 @@ cc_library( gentbl_cc_library( name = "BufferizationPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Bufferization", - ], - "include/mlir/Dialect/Bufferization/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Bufferization/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Bufferization", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Bufferization/Transforms/Passes.td", deps = [":PassBaseTdFiles"], @@ -14627,22 +12672,16 @@ td_library( gentbl_cc_library( name = "DLTIBaseIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=dlti", - ], - "include/mlir/Dialect/DLTI/DLTIDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=dlti", - ], - "include/mlir/Dialect/DLTI/DLTIDialect.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/DLTI/DLTIDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=dlti", + ], + "include/mlir/Dialect/DLTI/DLTIDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=dlti", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/DLTI/DLTIBase.td", deps = [":OpBaseTdFiles"], @@ -14650,22 +12689,16 @@ gentbl_cc_library( gentbl_cc_library( name = "DLTIAttrsIncGen", - tbl_outs = [ - ( - [ - "-gen-attrdef-decls", - "-dialect=dlti", - ], - "include/mlir/Dialect/DLTI/DLTIAttrs.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-dialect=dlti", - ], - "include/mlir/Dialect/DLTI/DLTIAttrs.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/DLTI/DLTIAttrs.h.inc": [ + "-gen-attrdef-decls", + "-dialect=dlti", + ], + "include/mlir/Dialect/DLTI/DLTIAttrs.cpp.inc": [ + "-gen-attrdef-defs", + "-dialect=dlti", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/DLTI/DLTIAttrs.td", deps = [ @@ -14699,22 +12732,16 @@ td_library( gentbl_cc_library( name = "DLTITransformOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-op-decls", - "-dialect=dlti", - ], - "include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.h.inc", - ), - ( - [ - "-gen-op-defs", - "-dialect=dlti", - ], - "include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.h.inc": [ + "-gen-op-decls", + "-dialect=dlti", + ], + "include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.cpp.inc": [ + "-gen-op-defs", + "-dialect=dlti", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/DLTI/TransformOps/DLTITransformOps.td", deps = [ @@ -14739,15 +12766,10 @@ cc_library( gentbl_cc_library( name = "ReducerIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Reducer", - ], - "include/mlir/Reducer/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Reducer/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Reducer", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Reducer/Passes.td", deps = [ @@ -14982,16 +13004,10 @@ td_library( gentbl_cc_library( name = "UBOpsInterfacesIncGen", - tbl_outs = [ - ( - ["--gen-attr-interface-decls"], - "include/mlir/Dialect/UB/IR/UBOpsInterfaces.h.inc", - ), - ( - ["--gen-attr-interface-defs"], - "include/mlir/Dialect/UB/IR/UBOpsInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/UB/IR/UBOpsInterfaces.h.inc": ["--gen-attr-interface-decls"], + "include/mlir/Dialect/UB/IR/UBOpsInterfaces.cpp.inc": ["--gen-attr-interface-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/UB/IR/UBOpsInterfaces.td", deps = [":UBDialectTdFiles"], 
@@ -14999,38 +13015,20 @@ gentbl_cc_library( gentbl_cc_library( name = "UBOpsIncGen", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=ub", - ], - "include/mlir/Dialect/UB/IR/UBOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=ub", - ], - "include/mlir/Dialect/UB/IR/UBOpsDialect.cpp.inc", - ), - ( - ["-gen-op-decls"], - "include/mlir/Dialect/UB/IR/UBOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/UB/IR/UBOps.cpp.inc", - ), - ( - ["--gen-attrdef-decls"], - "include/mlir/Dialect/UB/IR/UBOpsAttributes.h.inc", - ), - ( - ["--gen-attrdef-defs"], - "include/mlir/Dialect/UB/IR/UBOpsAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/UB/IR/UBOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=ub", + ], + "include/mlir/Dialect/UB/IR/UBOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=ub", + ], + "include/mlir/Dialect/UB/IR/UBOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/UB/IR/UBOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/UB/IR/UBOpsAttributes.h.inc": ["--gen-attrdef-decls"], + "include/mlir/Dialect/UB/IR/UBOpsAttributes.cpp.inc": ["--gen-attrdef-defs"], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/UB/IR/UBOps.td", deps = [":UBDialectTdFiles"], @@ -15123,44 +13121,26 @@ td_library( gentbl_cc_library( name = "VCIXOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/mlir/Dialect/LLVMIR/VCIXOps.h.inc", - ), - ( - ["-gen-op-defs"], - "include/mlir/Dialect/LLVMIR/VCIXOps.cpp.inc", - ), - ( - [ - "-gen-dialect-decls", - "-dialect=vcix", - ], - "include/mlir/Dialect/LLVMIR/VCIXOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=vcix", - ], - "include/mlir/Dialect/LLVMIR/VCIXOpsDialect.cpp.inc", - ), - ( - [ - "-gen-attrdef-decls", - "-attrdefs-dialect=vcix", - ], - "include/mlir/Dialect/LLVMIR/VCIXOpsAttributes.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "-attrdefs-dialect=vcix", - ], - "include/mlir/Dialect/LLVMIR/VCIXOpsAttributes.cpp.inc", - ), - ], + tbl_outs = { + "include/mlir/Dialect/LLVMIR/VCIXOps.h.inc": ["-gen-op-decls"], + "include/mlir/Dialect/LLVMIR/VCIXOps.cpp.inc": ["-gen-op-defs"], + "include/mlir/Dialect/LLVMIR/VCIXOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=vcix", + ], + "include/mlir/Dialect/LLVMIR/VCIXOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=vcix", + ], + "include/mlir/Dialect/LLVMIR/VCIXOpsAttributes.h.inc": [ + "-gen-attrdef-decls", + "-attrdefs-dialect=vcix", + ], + "include/mlir/Dialect/LLVMIR/VCIXOpsAttributes.cpp.inc": [ + "-gen-attrdef-defs", + "-attrdefs-dialect=vcix", + ], + }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/VCIXOps.td", deps = [":VCIXTdFiles"], @@ -15183,12 +13163,7 @@ cc_library( gentbl_cc_library( name = "VCIXConversionIncGen", - tbl_outs = [ - ( - ["-gen-llvmir-conversions"], - "include/mlir/Dialect/LLVMIR/VCIXConversions.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/LLVMIR/VCIXConversions.inc": ["-gen-llvmir-conversions"]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/LLVMIR/VCIXOps.td", deps = [":VCIXTdFiles"], @@ -15196,15 +13171,10 @@ gentbl_cc_library( gentbl_cc_library( name = "QuantPassIncGen", - tbl_outs = [ - ( - [ - "-gen-pass-decls", - "-name=Quant", - ], - "include/mlir/Dialect/Quant/Transforms/Passes.h.inc", - ), - ], + tbl_outs = {"include/mlir/Dialect/Quant/Transforms/Passes.h.inc": [ + "-gen-pass-decls", + "-name=Quant", + ]}, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Quant/Transforms/Passes.td", deps = [":PassBaseTdFiles"], 
diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch2/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch2/BUILD.bazel index 321b8d7cb232e..0fdda77771438 100644 --- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch2/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch2/BUILD.bazel @@ -22,24 +22,12 @@ td_library( gentbl_cc_library( name = "ToyOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/toy/Ops.h.inc", - ), - ( - ["-gen-op-defs"], - "include/toy/Ops.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/toy/Dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/toy/Dialect.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/Ops.h.inc": ["-gen-op-decls"], + "include/toy/Ops.cpp.inc": ["-gen-op-defs"], + "include/toy/Dialect.h.inc": ["-gen-dialect-decls"], + "include/toy/Dialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/Ops.td", deps = [":ToyOpsTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch3/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch3/BUILD.bazel index b776bedab46f9..62e1d68cf6860 100644 --- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch3/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch3/BUILD.bazel @@ -22,24 +22,12 @@ td_library( gentbl_cc_library( name = "ToyOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/toy/Ops.h.inc", - ), - ( - ["-gen-op-defs"], - "include/toy/Ops.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/toy/Dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/toy/Dialect.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/Ops.h.inc": ["-gen-op-decls"], + "include/toy/Ops.cpp.inc": ["-gen-op-defs"], + "include/toy/Dialect.h.inc": ["-gen-dialect-decls"], + "include/toy/Dialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/Ops.td", deps = [":ToyOpsTdFiles"], @@ -48,12 +36,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyCombineIncGen", strip_include_prefix = "mlir", - tbl_outs = [ - ( - ["-gen-rewriters"], - "mlir/ToyCombine.inc", - ), - ], + tbl_outs = {"mlir/ToyCombine.inc": ["-gen-rewriters"]}, tblgen = "//mlir:mlir-tblgen", td_file = "mlir/ToyCombine.td", deps = [":ToyOpsTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch4/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch4/BUILD.bazel index 68639df2aa2ca..3ac6ddca29818 100644 --- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch4/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch4/BUILD.bazel @@ -25,16 +25,10 @@ td_library( gentbl_cc_library( name = "ToyInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/toy/ShapeInferenceOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/toy/ShapeInferenceOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/ShapeInferenceOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/toy/ShapeInferenceOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/ShapeInferenceInterface.td", deps = [":ToyOpsTdFiles"], @@ -42,24 +36,12 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/toy/Ops.h.inc", - ), - ( - ["-gen-op-defs"], - "include/toy/Ops.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/toy/Dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - 
"include/toy/Dialect.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/Ops.h.inc": ["-gen-op-decls"], + "include/toy/Ops.cpp.inc": ["-gen-op-defs"], + "include/toy/Dialect.h.inc": ["-gen-dialect-decls"], + "include/toy/Dialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/Ops.td", deps = [":ToyOpsTdFiles"], @@ -68,12 +50,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyCombineIncGen", strip_include_prefix = "mlir", - tbl_outs = [ - ( - ["-gen-rewriters"], - "mlir/ToyCombine.inc", - ), - ], + tbl_outs = {"mlir/ToyCombine.inc": ["-gen-rewriters"]}, tblgen = "//mlir:mlir-tblgen", td_file = "mlir/ToyCombine.td", deps = [":ToyOpsTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel index 9ce23b5d97547..34b7487bdf69c 100644 --- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel @@ -25,16 +25,10 @@ td_library( gentbl_cc_library( name = "ToyInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/toy/ShapeInferenceOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/toy/ShapeInferenceOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/ShapeInferenceOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/toy/ShapeInferenceOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/ShapeInferenceInterface.td", deps = [":ToyOpsTdFiles"], @@ -42,24 +36,12 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/toy/Ops.h.inc", - ), - ( - ["-gen-op-defs"], - "include/toy/Ops.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/toy/Dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/toy/Dialect.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/Ops.h.inc": ["-gen-op-decls"], + "include/toy/Ops.cpp.inc": ["-gen-op-defs"], + "include/toy/Dialect.h.inc": ["-gen-dialect-decls"], + "include/toy/Dialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/Ops.td", deps = [":ToyOpsTdFiles"], @@ -68,12 +50,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyCombineIncGen", strip_include_prefix = "mlir", - tbl_outs = [ - ( - ["-gen-rewriters"], - "mlir/ToyCombine.inc", - ), - ], + tbl_outs = {"mlir/ToyCombine.inc": ["-gen-rewriters"]}, tblgen = "//mlir:mlir-tblgen", td_file = "mlir/ToyCombine.td", deps = [":ToyOpsTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel index 86925aa0662c7..ea1ef88151891 100644 --- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel @@ -25,16 +25,10 @@ td_library( gentbl_cc_library( name = "ToyInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/toy/ShapeInferenceOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/toy/ShapeInferenceOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/ShapeInferenceOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/toy/ShapeInferenceOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/ShapeInferenceInterface.td", deps = [":ToyOpsTdFiles"], @@ -42,24 +36,12 @@ gentbl_cc_library( 
gentbl_cc_library( name = "ToyOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/toy/Ops.h.inc", - ), - ( - ["-gen-op-defs"], - "include/toy/Ops.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/toy/Dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/toy/Dialect.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/Ops.h.inc": ["-gen-op-decls"], + "include/toy/Ops.cpp.inc": ["-gen-op-defs"], + "include/toy/Dialect.h.inc": ["-gen-dialect-decls"], + "include/toy/Dialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/Ops.td", deps = [":ToyOpsTdFiles"], @@ -68,12 +50,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyCombineIncGen", strip_include_prefix = "mlir", - tbl_outs = [ - ( - ["-gen-rewriters"], - "mlir/ToyCombine.inc", - ), - ], + tbl_outs = {"mlir/ToyCombine.inc": ["-gen-rewriters"]}, tblgen = "//mlir:mlir-tblgen", td_file = "mlir/ToyCombine.td", deps = [":ToyOpsTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel index bf9ab79529b8d..6e67715468c05 100644 --- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel @@ -25,16 +25,10 @@ td_library( gentbl_cc_library( name = "ToyInterfacesIncGen", - tbl_outs = [ - ( - ["-gen-op-interface-decls"], - "include/toy/ShapeInferenceOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "include/toy/ShapeInferenceOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/ShapeInferenceOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/toy/ShapeInferenceOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/ShapeInferenceInterface.td", deps = [":ToyOpsTdFiles"], @@ -42,24 +36,12 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyOpsIncGen", - tbl_outs = [ - ( - ["-gen-op-decls"], - "include/toy/Ops.h.inc", - ), - ( - ["-gen-op-defs"], - "include/toy/Ops.cpp.inc", - ), - ( - ["-gen-dialect-decls"], - "include/toy/Dialect.h.inc", - ), - ( - ["-gen-dialect-defs"], - "include/toy/Dialect.cpp.inc", - ), - ], + tbl_outs = { + "include/toy/Ops.h.inc": ["-gen-op-decls"], + "include/toy/Ops.cpp.inc": ["-gen-op-defs"], + "include/toy/Dialect.h.inc": ["-gen-dialect-decls"], + "include/toy/Dialect.cpp.inc": ["-gen-dialect-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "include/toy/Ops.td", deps = [":ToyOpsTdFiles"], @@ -68,12 +50,7 @@ gentbl_cc_library( gentbl_cc_library( name = "ToyCombineIncGen", strip_include_prefix = "mlir", - tbl_outs = [ - ( - ["-gen-rewriters"], - "mlir/ToyCombine.inc", - ), - ], + tbl_outs = {"mlir/ToyCombine.inc": ["-gen-rewriters"]}, tblgen = "//mlir:mlir-tblgen", td_file = "mlir/ToyCombine.td", deps = [":ToyOpsTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index f29cc028c52c0..611fbecd215cd 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -132,16 +132,10 @@ td_library( gentbl_cc_library( name = "TestOpsSyntaxIncGen", strip_include_prefix = "lib/Dialect/Test", - tbl_outs = [ - ( - ["-gen-op-decls"], - "lib/Dialect/Test/TestOpsSyntax.h.inc", - ), - ( - ["-gen-op-defs"], - "lib/Dialect/Test/TestOpsSyntax.cpp.inc", - ), - ], + tbl_outs = { + "lib/Dialect/Test/TestOpsSyntax.h.inc": ["-gen-op-decls"], + 
"lib/Dialect/Test/TestOpsSyntax.cpp.inc": ["-gen-op-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Dialect/Test/TestOpsSyntax.td", test = True, @@ -153,26 +147,17 @@ gentbl_cc_library( gentbl_cc_library( name = "TestOpsIncGen", strip_include_prefix = "lib/Dialect/Test", - tbl_outs = [ - ( - [ - "-gen-dialect-decls", - "-dialect=test", - ], - "lib/Dialect/Test/TestOpsDialect.h.inc", - ), - ( - [ - "-gen-dialect-defs", - "-dialect=test", - ], - "lib/Dialect/Test/TestOpsDialect.cpp.inc", - ), - ( - ["-gen-rewriters"], - "lib/Dialect/Test/TestPatterns.inc", - ), - ], + tbl_outs = { + "lib/Dialect/Test/TestOpsDialect.h.inc": [ + "-gen-dialect-decls", + "-dialect=test", + ], + "lib/Dialect/Test/TestOpsDialect.cpp.inc": [ + "-gen-dialect-defs", + "-dialect=test", + ], + "lib/Dialect/Test/TestPatterns.inc": ["-gen-rewriters"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Dialect/Test/TestOps.td", test = True, @@ -184,32 +169,14 @@ gentbl_cc_library( gentbl_cc_library( name = "TestInterfacesIncGen", strip_include_prefix = "lib/Dialect/Test", - tbl_outs = [ - ( - ["-gen-attr-interface-decls"], - "lib/Dialect/Test/TestAttrInterfaces.h.inc", - ), - ( - ["-gen-attr-interface-defs"], - "lib/Dialect/Test/TestAttrInterfaces.cpp.inc", - ), - ( - ["-gen-type-interface-decls"], - "lib/Dialect/Test/TestTypeInterfaces.h.inc", - ), - ( - ["-gen-type-interface-defs"], - "lib/Dialect/Test/TestTypeInterfaces.cpp.inc", - ), - ( - ["-gen-op-interface-decls"], - "lib/Dialect/Test/TestOpInterfaces.h.inc", - ), - ( - ["-gen-op-interface-defs"], - "lib/Dialect/Test/TestOpInterfaces.cpp.inc", - ), - ], + tbl_outs = { + "lib/Dialect/Test/TestAttrInterfaces.h.inc": ["-gen-attr-interface-decls"], + "lib/Dialect/Test/TestAttrInterfaces.cpp.inc": ["-gen-attr-interface-defs"], + "lib/Dialect/Test/TestTypeInterfaces.h.inc": ["-gen-type-interface-decls"], + "lib/Dialect/Test/TestTypeInterfaces.cpp.inc": ["-gen-type-interface-defs"], + "lib/Dialect/Test/TestOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "lib/Dialect/Test/TestOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Dialect/Test/TestInterfaces.td", test = True, @@ -222,22 +189,16 @@ gentbl_cc_library( gentbl_cc_library( name = "TestAttrDefsIncGen", strip_include_prefix = "lib/Dialect/Test", - tbl_outs = [ - ( - [ - "-gen-attrdef-decls", - "--attrdefs-dialect=test", - ], - "lib/Dialect/Test/TestAttrDefs.h.inc", - ), - ( - [ - "-gen-attrdef-defs", - "--attrdefs-dialect=test", - ], - "lib/Dialect/Test/TestAttrDefs.cpp.inc", - ), - ], + tbl_outs = { + "lib/Dialect/Test/TestAttrDefs.h.inc": [ + "-gen-attrdef-decls", + "--attrdefs-dialect=test", + ], + "lib/Dialect/Test/TestAttrDefs.cpp.inc": [ + "-gen-attrdef-defs", + "--attrdefs-dialect=test", + ], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Dialect/Test/TestOps.td", test = True, @@ -249,16 +210,10 @@ gentbl_cc_library( gentbl_cc_library( name = "TestEnumDefsIncGen", strip_include_prefix = "lib/Dialect/Test", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "lib/Dialect/Test/TestOpEnums.h.inc", - ), - ( - ["-gen-enum-defs"], - "lib/Dialect/Test/TestOpEnums.cpp.inc", - ), - ], + tbl_outs = { + "lib/Dialect/Test/TestOpEnums.h.inc": ["-gen-enum-decls"], + "lib/Dialect/Test/TestOpEnums.cpp.inc": ["-gen-enum-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Dialect/Test/TestEnumDefs.td", test = True, @@ -270,22 +225,16 @@ gentbl_cc_library( gentbl_cc_library( name = "TestTypeDefsIncGen", strip_include_prefix = "lib/Dialect/Test", - tbl_outs = [ - 
( - [ - "-gen-typedef-decls", - "--typedefs-dialect=test", - ], - "lib/Dialect/Test/TestTypeDefs.h.inc", - ), - ( - [ - "-gen-typedef-defs", - "--typedefs-dialect=test", - ], - "lib/Dialect/Test/TestTypeDefs.cpp.inc", - ), - ], + tbl_outs = { + "lib/Dialect/Test/TestTypeDefs.h.inc": [ + "-gen-typedef-decls", + "--typedefs-dialect=test", + ], + "lib/Dialect/Test/TestTypeDefs.cpp.inc": [ + "-gen-typedef-defs", + "--typedefs-dialect=test", + ], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Dialect/Test/TestTypeDefs.td", test = True, @@ -316,30 +265,18 @@ td_library( gentbl_cc_library( name = "TestTransformDialectExtensionIncGen", strip_include_prefix = "lib/Dialect/Transform", - tbl_outs = [ - ( - ["-gen-op-decls"], - "lib/Dialect/Transform/TestTransformDialectExtension.h.inc", - ), - ( - ["-gen-op-defs"], - "lib/Dialect/Transform/TestTransformDialectExtension.cpp.inc", - ), - ( - [ - "-gen-typedef-decls", - "-typedefs-dialect=transform", - ], - "lib/Dialect/Transform/TestTransformDialectExtensionTypes.h.inc", - ), - ( - [ - "-gen-typedef-defs", - "-typedefs-dialect=transform", - ], - "lib/Dialect/Transform/TestTransformDialectExtensionTypes.cpp.inc", - ), - ], + tbl_outs = { + "lib/Dialect/Transform/TestTransformDialectExtension.h.inc": ["-gen-op-decls"], + "lib/Dialect/Transform/TestTransformDialectExtension.cpp.inc": ["-gen-op-defs"], + "lib/Dialect/Transform/TestTransformDialectExtensionTypes.h.inc": [ + "-gen-typedef-decls", + "-typedefs-dialect=transform", + ], + "lib/Dialect/Transform/TestTransformDialectExtensionTypes.cpp.inc": [ + "-gen-typedef-defs", + "-typedefs-dialect=transform", + ], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Dialect/Transform/TestTransformDialectExtension.td", test = True, @@ -496,16 +433,10 @@ gentbl_cc_library( name = "TestTilingInterfaceTransformOpsIncGen", includes = ["lib/Interfaces/TilingInterface"], strip_include_prefix = "lib", - tbl_outs = [ - ( - ["-gen-op-decls"], - "lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.h.inc", - ), - ( - ["-gen-op-defs"], - "lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp.inc", - ), - ], + tbl_outs = { + "lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.h.inc": ["-gen-op-decls"], + "lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td", deps = [ @@ -622,16 +553,10 @@ gentbl_cc_library( name = "TestTransformsOpsIncGen", includes = ["lib/Dialect/Test"], strip_include_prefix = "lib", - tbl_outs = [ - ( - ["-gen-op-decls"], - "lib/TestTransformsOps.h.inc", - ), - ( - ["-gen-op-defs"], - "lib/TestTransformsOps.cpp.inc", - ), - ], + tbl_outs = { + "lib/TestTransformsOps.h.inc": ["-gen-op-decls"], + "lib/TestTransformsOps.cpp.inc": ["-gen-op-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "lib/Transforms/TestTransformsOps.td", deps = [ @@ -1200,12 +1125,7 @@ gentbl_cc_library( name = "TestPDLLPatternsIncGen", includes = ["lib/Dialect/Test"], strip_include_prefix = "lib", - tbl_outs = [ - ( - ["-x=cpp"], - "lib/Tools/PDLL/TestPDLLPatterns.h.inc", - ), - ], + tbl_outs = {"lib/Tools/PDLL/TestPDLLPatterns.h.inc": ["-x=cpp"]}, tblgen = "//mlir:mlir-pdll", td_file = "lib/Tools/PDLL/TestPDLL.pdll", deps = [ @@ -1218,12 +1138,7 @@ gentbl_cc_library( name = "TestDialectConversionPDLLPatternsIncGen", includes = ["lib/Dialect/Test"], strip_include_prefix = "lib", - tbl_outs = [ - ( - ["-x=cpp"], - 
"lib/TestDialectConversionPDLLPatterns.h.inc", - ), - ], + tbl_outs = {"lib/TestDialectConversionPDLLPatterns.h.inc": ["-x=cpp"]}, tblgen = "//mlir:mlir-pdll", td_file = "lib/Transforms/TestDialectConversion.pdll", deps = [ diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel index a55c6f50118dc..4539341fe9acd 100644 --- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel @@ -264,16 +264,10 @@ cc_test( gentbl_cc_library( name = "EnumsIncGen", - tbl_outs = [ - ( - ["-gen-enum-decls"], - "TableGen/EnumsGenTest.h.inc", - ), - ( - ["-gen-enum-defs"], - "TableGen/EnumsGenTest.cpp.inc", - ), - ], + tbl_outs = { + "TableGen/EnumsGenTest.h.inc": ["-gen-enum-decls"], + "TableGen/EnumsGenTest.cpp.inc": ["-gen-enum-defs"], + }, tblgen = "//mlir:mlir-tblgen", td_file = "TableGen/enums.td", deps = [ @@ -283,12 +277,7 @@ gentbl_cc_library( gentbl_cc_library( name = "PassIncGen", - tbl_outs = [ - ( - ["-gen-pass-decls"], - "TableGen/PassGenTest.h.inc", - ), - ], + tbl_outs = {"TableGen/PassGenTest.h.inc": ["-gen-pass-decls"]}, tblgen = "//mlir:mlir-tblgen", td_file = "TableGen/passes.td", deps = [ From e2fe78797fa39e22ebb3c65383d1bd30490535e5 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 7 Apr 2025 13:32:36 +0100 Subject: [PATCH 0837/1029] [Clang] Use "syncscope" instead of "synchscope". NFC. (#134616) This matches the spelling of the keyword in LLVM IR. --- clang/include/clang/AST/Expr.h | 2 +- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/include/clang/Basic/SyncScope.h | 38 +++++++++---------- clang/lib/CodeGen/CGAtomic.cpp | 8 ++-- clang/lib/Sema/SemaChecking.cpp | 2 +- clang/test/SemaOpenCL/atomic-ops.cl | 2 +- 6 files changed, 27 insertions(+), 27 deletions(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 08e34fdf2aa2f..dedbff5944af8 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -6757,7 +6757,7 @@ class PseudoObjectExpr final /// and corresponding __opencl_atomic_* for OpenCL 2.0. /// All of these instructions take one primary pointer, at least one memory /// order. The instructions for which getScopeModel returns non-null value -/// take one synch scope. +/// take one sync scope. class AtomicExpr : public Expr { public: enum AtomicOp { diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index c2b01833a5c46..1ad09aba60935 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9076,7 +9076,7 @@ def err_atomic_op_needs_atomic_int : Error< def warn_atomic_op_has_invalid_memory_order : Warning< "%select{|success |failure }0memory order argument to atomic operation is invalid">, InGroup>; -def err_atomic_op_has_invalid_synch_scope : Error< +def err_atomic_op_has_invalid_sync_scope : Error< "synchronization scope argument to atomic operation is invalid">; def warn_atomic_implicit_seq_cst : Warning< "implicit use of sequentially-consistent atomic may incur stronger memory barriers than necessary">, diff --git a/clang/include/clang/Basic/SyncScope.h b/clang/include/clang/Basic/SyncScope.h index 45beff41afa11..5a8d2a7dd02e5 100644 --- a/clang/include/clang/Basic/SyncScope.h +++ b/clang/include/clang/Basic/SyncScope.h @@ -21,17 +21,17 @@ namespace clang { -/// Defines synch scope values used internally by clang. 
+/// Defines sync scope values used internally by clang. /// /// The enum values start from 0 and are contiguous. They are mainly used for -/// enumerating all supported synch scope values and mapping them to LLVM -/// synch scopes. Their numerical values may be different from the corresponding -/// synch scope enums used in source languages. +/// enumerating all supported sync scope values and mapping them to LLVM +/// sync scopes. Their numerical values may be different from the corresponding +/// sync scope enums used in source languages. /// -/// In atomic builtin and expressions, language-specific synch scope enums are +/// In atomic builtin and expressions, language-specific sync scope enums are /// used. Currently only OpenCL memory scope enums are supported and assumed /// to be used by all languages. However, in the future, other languages may -/// define their own set of synch scope enums. The language-specific synch scope +/// define their own set of sync scope enums. The language-specific sync scope /// values are represented by class AtomicScopeModel and its derived classes. /// /// To add a new enum value: @@ -88,31 +88,31 @@ inline llvm::StringRef getAsString(SyncScope S) { case SyncScope::OpenCLSubGroup: return "opencl_subgroup"; } - llvm_unreachable("Invalid synch scope"); + llvm_unreachable("Invalid sync scope"); } /// Defines the kind of atomic scope models. enum class AtomicScopeModelKind { None, OpenCL, HIP, Generic }; -/// Defines the interface for synch scope model. +/// Defines the interface for sync scope model. class AtomicScopeModel { public: virtual ~AtomicScopeModel() {} - /// Maps language specific synch scope values to internal + /// Maps language specific sync scope values to internal /// SyncScope enum. virtual SyncScope map(unsigned S) const = 0; - /// Check if the compile-time constant synch scope value + /// Check if the compile-time constant sync scope value /// is valid. virtual bool isValid(unsigned S) const = 0; - /// Get all possible synch scope values that might be + /// Get all possible sync scope values that might be /// encountered at runtime for the current language. virtual ArrayRef<unsigned> getRuntimeValues() const = 0; /// If atomic builtin function is called with invalid - /// synch scope value at runtime, it will fall back to a valid - /// synch scope value returned by this function. + /// sync scope value at runtime, it will fall back to a valid + /// sync scope value returned by this function. virtual unsigned getFallBackValue() const = 0; /// Create an atomic scope model by AtomicScopeModelKind. @@ -120,7 +120,7 @@ class AtomicScopeModel { static std::unique_ptr<AtomicScopeModel> create(AtomicScopeModelKind K); }; -/// Defines the synch scope model for OpenCL. +/// Defines the sync scope model for OpenCL.
class AtomicScopeOpenCLModel : public AtomicScopeModel { public: /// The enum values match the pre-defined macros @@ -147,7 +147,7 @@ class AtomicScopeOpenCLModel : public AtomicScopeModel { case SubGroup: return SyncScope::OpenCLSubGroup; } - llvm_unreachable("Invalid language synch scope value"); + llvm_unreachable("Invalid language sync scope value"); } bool isValid(unsigned S) const override { @@ -156,7 +156,7 @@ class AtomicScopeOpenCLModel : public AtomicScopeModel { } ArrayRef<unsigned> getRuntimeValues() const override { - static_assert(Last == SubGroup, "Does not include all synch scopes"); + static_assert(Last == SubGroup, "Does not include all sync scopes"); static const unsigned Scopes[] = { static_cast<unsigned>(WorkGroup), static_cast<unsigned>(Device), static_cast<unsigned>(AllSVMDevices), static_cast<unsigned>(SubGroup)}; @@ -168,7 +168,7 @@ } }; -/// Defines the synch scope model for HIP. +/// Defines the sync scope model for HIP. class AtomicScopeHIPModel : public AtomicScopeModel { public: /// The enum values match the pre-defined macros @@ -198,7 +198,7 @@ class AtomicScopeHIPModel : public AtomicScopeModel { case System: return SyncScope::HIPSystem; } - llvm_unreachable("Invalid language synch scope value"); + llvm_unreachable("Invalid language sync scope value"); } bool isValid(unsigned S) const override { @@ -207,7 +207,7 @@ class AtomicScopeHIPModel : public AtomicScopeModel { ArrayRef<unsigned> getRuntimeValues() const override { - static_assert(Last == System, "Does not include all synch scopes"); + static_assert(Last == System, "Does not include all sync scopes"); static const unsigned Scopes[] = { static_cast<unsigned>(SingleThread), static_cast<unsigned>(Wavefront), static_cast<unsigned>(Workgroup), static_cast<unsigned>(Agent), diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index 3adb2a7ad207f..672e82f8dcc3e 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -781,8 +781,8 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest, llvm::Value *Scope) { auto ScopeModel = Expr->getScopeModel(); - // LLVM atomic instructions always have synch scope. If clang atomic - // expression has no scope operand, use default LLVM synch scope. + // LLVM atomic instructions always have sync scope. If clang atomic + // expression has no scope operand, use default LLVM sync scope. if (!ScopeModel) { llvm::SyncScope::ID SS; if (CGF.getLangOpts().OpenCL) @@ -821,8 +821,8 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *Expr, Address Dest, CGF.createBasicBlock("atomic.scope.continue", CGF.CurFn); auto *SC = Builder.CreateIntCast(Scope, Builder.getInt32Ty(), false); - // If unsupported synch scope is encountered at run time, assume a fallback - // synch scope value. + // If unsupported sync scope is encountered at run time, assume a fallback + // sync scope value.
auto FallBack = ScopeModel->getFallBackValue(); llvm::SwitchInst *SI = Builder.CreateSwitch(SC, BB[FallBack]); for (auto S : Scopes) { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 5a4fa97366809..c21475ee69d9e 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -4229,7 +4229,7 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange, if (std::optional Result = Scope->getIntegerConstantExpr(Context)) { if (!ScopeModel->isValid(Result->getZExtValue())) - Diag(Scope->getBeginLoc(), diag::err_atomic_op_has_invalid_synch_scope) + Diag(Scope->getBeginLoc(), diag::err_atomic_op_has_invalid_sync_scope) << Scope->getSourceRange(); } SubExprs.push_back(Scope); diff --git a/clang/test/SemaOpenCL/atomic-ops.cl b/clang/test/SemaOpenCL/atomic-ops.cl index 209de22ecdf57..7a273546db772 100644 --- a/clang/test/SemaOpenCL/atomic-ops.cl +++ b/clang/test/SemaOpenCL/atomic-ops.cl @@ -159,7 +159,7 @@ void memory_checks(atomic_int *Ap, int *p, int val) { (void)__opencl_atomic_compare_exchange_weak(Ap, p, val, memory_order_seq_cst, memory_order_relaxed, memory_scope_work_group); } -void synchscope_checks(atomic_int *Ap, int scope) { +void syncscope_checks(atomic_int *Ap, int scope) { (void)__opencl_atomic_load(Ap, memory_order_relaxed, memory_scope_work_item); // expected-error{{synchronization scope argument to atomic operation is invalid}} (void)__opencl_atomic_load(Ap, memory_order_relaxed, memory_scope_work_group); (void)__opencl_atomic_load(Ap, memory_order_relaxed, memory_scope_device); From 4509bc12994ade02eda03642fa5bf5f68691f768 Mon Sep 17 00:00:00 2001 From: JaydeepChauhan14 Date: Mon, 7 Apr 2025 18:02:56 +0530 Subject: [PATCH 0838/1029] [X86][GlobalISel] Enable POWI function with libcall mapping (#134369) --- .../lib/Target/X86/GISel/X86LegalizerInfo.cpp | 8 ++-- llvm/test/CodeGen/X86/powi.ll | 43 ++++++++++++------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index b474d6a3f6356..ba9fa254a477a 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -99,10 +99,10 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .widenScalarToNextPow2(0, /*Min=*/8) .clampScalar(0, s8, sMaxScalar); - getActionDefinitionsBuilder({G_LROUND, G_LLROUND, G_FCOS, G_FCOSH, G_FACOS, - G_FSIN, G_FSINH, G_FASIN, G_FTAN, G_FTANH, - G_FATAN, G_FATAN2, G_FPOW, G_FEXP, G_FEXP2, - G_FEXP10, G_FLOG, G_FLOG2, G_FLOG10}) + getActionDefinitionsBuilder({G_LROUND, G_LLROUND, G_FCOS, G_FCOSH, G_FACOS, + G_FSIN, G_FSINH, G_FASIN, G_FTAN, G_FTANH, + G_FATAN, G_FATAN2, G_FPOW, G_FEXP, G_FEXP2, + G_FEXP10, G_FLOG, G_FLOG2, G_FLOG10, G_FPOWI}) .libcall(); getActionDefinitionsBuilder(G_FSQRT) diff --git a/llvm/test/CodeGen/X86/powi.ll b/llvm/test/CodeGen/X86/powi.ll index e4c691bfbd2e5..4420d0499a5d0 100644 --- a/llvm/test/CodeGen/X86/powi.ll +++ b/llvm/test/CodeGen/X86/powi.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FAST-X86 ; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=SDAG-X86 -; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86 +; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel 
-global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -fast-isel | FileCheck %s --check-prefixes=FAST-X64 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=SDAG-X64 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X64 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 define float @test_powi_f32_i32(float %Val, i32 %x) nounwind { ; FAST-X86-LABEL: test_powi_f32_i32: @@ -32,10 +32,10 @@ define float @test_powi_f32_i32(float %Val, i32 %x) nounwind { ; GISEL-X86-LABEL: test_powi_f32_i32: ; GISEL-X86: # %bb.0: ; GISEL-X86-NEXT: subl $12, %esp -; GISEL-X86-NEXT: flds {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; GISEL-X86-NEXT: fstps (%esp) +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl %eax, (%esp) +; GISEL-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: calll __powisf2 ; GISEL-X86-NEXT: addl $12, %esp ; GISEL-X86-NEXT: retl @@ -53,7 +53,10 @@ define float @test_powi_f32_i32(float %Val, i32 %x) nounwind { ; ; GISEL-X64-LABEL: test_powi_f32_i32: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp __powisf2@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq __powisf2 +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq %res = call float @llvm.powi.f32.i32(float %Val, i32 %x) ret float %res } @@ -83,13 +86,20 @@ define double @test_powi_f64_i32(double %Val, i32 %x) nounwind { ; ; GISEL-X86-LABEL: test_powi_f64_i32: ; GISEL-X86: # %bb.0: -; GISEL-X86-NEXT: subl $12, %esp -; GISEL-X86-NEXT: fldl {{[0-9]+}}(%esp) -; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; GISEL-X86-NEXT: fstpl (%esp) +; GISEL-X86-NEXT: pushl %esi +; GISEL-X86-NEXT: subl $24, %esp +; GISEL-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: movl 4(%eax), %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; GISEL-X86-NEXT: xorl %esi, %esi +; GISEL-X86-NEXT: addl %esp, %esi +; GISEL-X86-NEXT: movl %ecx, (%esp) +; GISEL-X86-NEXT: movl %eax, 4(%esi) +; GISEL-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: calll __powidf2 -; GISEL-X86-NEXT: addl $12, %esp +; GISEL-X86-NEXT: addl $24, %esp +; GISEL-X86-NEXT: popl %esi ; GISEL-X86-NEXT: retl ; ; FAST-X64-LABEL: test_powi_f64_i32: @@ -105,7 +115,10 @@ define double @test_powi_f64_i32(double %Val, i32 %x) nounwind { ; ; GISEL-X64-LABEL: test_powi_f64_i32: ; GISEL-X64: # %bb.0: -; GISEL-X64-NEXT: jmp __powidf2@PLT # TAILCALL +; GISEL-X64-NEXT: pushq %rax +; GISEL-X64-NEXT: callq __powidf2 +; GISEL-X64-NEXT: popq %rax +; GISEL-X64-NEXT: retq %res = call double @llvm.powi.f64.i32(double %Val, i32 %x) ret double %res } @@ -138,8 +151,8 @@ define x86_fp80 @test_powi_f80_i32(x86_fp80 %Val, i32 %x) nounwind { ; GISEL-X86-NEXT: subl $28, %esp ; GISEL-X86-NEXT: fldt {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: fstpt (%esp) +; GISEL-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; GISEL-X86-NEXT: calll __powixf2 ; GISEL-X86-NEXT: addl $28, %esp ; GISEL-X86-NEXT: retl @@ -167,7 +180,7 @@ define x86_fp80 @test_powi_f80_i32(x86_fp80 %Val, i32 %x) nounwind { ; GISEL-X64-NEXT: subq $24, %rsp ; GISEL-X64-NEXT: fldt {{[0-9]+}}(%rsp) ; GISEL-X64-NEXT: 
fstpt (%rsp) -; GISEL-X64-NEXT: callq __powixf2@PLT +; GISEL-X64-NEXT: callq __powixf2 ; GISEL-X64-NEXT: addq $24, %rsp ; GISEL-X64-NEXT: retq %res = call x86_fp80 @llvm.powi.f80.i32(x86_fp80 %Val, i32 %x) From bafa2f4442bcee26f05c22369d41646d5c8befb9 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Mon, 7 Apr 2025 05:38:18 -0700 Subject: [PATCH 0839/1029] [mlir][memref] Check memory space before lowering alloc ops (#134427) Check the memory space before lowering allocation ops, instead of starting the lowering and then rolling back the pattern when the memory space was found to be incompatible with LLVM. Note: This is in preparation for the One-Shot Dialect Conversion refactoring. Note: `isConvertibleAndHasIdentityMaps` now also checks the memory space. --- mlir/include/mlir/Conversion/LLVMCommon/Pattern.h | 4 ++-- mlir/lib/Conversion/LLVMCommon/Pattern.cpp | 4 ++-- mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp | 7 +------ mlir/test/Conversion/MemRefToLLVM/invalid.mlir | 3 +-- 4 files changed, 6 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h index c65f7d7217be5..66c8731ec2bf4 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -75,8 +75,8 @@ class ConvertToLLVMPattern : public ConversionPattern { ValueRange indices, ConversionPatternRewriter &rewriter) const; - /// Returns if the given memref has identity maps and the element type is - /// convertible to LLVM. + /// Returns if the given memref type is convertible to LLVM and has an + /// identity layout map. bool isConvertibleAndHasIdentityMaps(MemRefType type) const; /// Returns the type of a pointer to an element of the memref. diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp index 71b68619cc793..32bfd72475569 100644 --- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -99,9 +99,9 @@ Value ConvertToLLVMPattern::getStridedElementPtr( // only support memrefs with identity maps. bool ConvertToLLVMPattern::isConvertibleAndHasIdentityMaps( MemRefType type) const { - if (!typeConverter->convertType(type.getElementType())) + if (!type.getLayout().isIdentity()) return false; - return type.getLayout().isIdentity(); + return static_cast<bool>(typeConverter->convertType(type)); } Type ConvertToLLVMPattern::getElementPtrType(MemRefType type) const { diff --git a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp index c5b2e83df93dc..bad209a4ddecf 100644 --- a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp @@ -73,12 +73,7 @@ std::tuple<Value, Value> AllocationOpLLVMLowering::allocateBufferManuallyAlign( MemRefType memRefType = getMemRefResultType(op); // Allocate the underlying buffer. Type elementPtrType = this->getElementPtrType(memRefType); - if (!elementPtrType) { - emitError(loc, "conversion of memref memory space ") - << memRefType.getMemorySpace() - << " to integer address space " "failed.
Consider adding memory space conversions.";
-  }
+  assert(elementPtrType && "could not compute element ptr type");
   FailureOr<LLVM::LLVMFuncOp> allocFuncOp = getNotalignedAllocFn(
       getTypeConverter(), op->getParentWithTrait<OpTrait::SymbolTable>(),
       getIndexType());

diff --git a/mlir/test/Conversion/MemRefToLLVM/invalid.mlir b/mlir/test/Conversion/MemRefToLLVM/invalid.mlir
index 61c67005a08fc..0d04bba96bcdb 100644
--- a/mlir/test/Conversion/MemRefToLLVM/invalid.mlir
+++ b/mlir/test/Conversion/MemRefToLLVM/invalid.mlir
@@ -22,7 +22,7 @@ func.func @bad_address_space(%a: memref<2xindex, "foo">) {
 // CHECK-LABEL: @invalid_int_conversion
 func.func @invalid_int_conversion() {
-  // expected-error@+1 {{conversion of memref memory space 1 : ui64 to integer address space failed. Consider adding memory space conversions.}}
+  // expected-error@unknown{{conversion of memref memory space 1 : ui64 to integer address space failed. Consider adding memory space conversions.}}
   %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xf32, 1 : ui64>
   return
 }
@@ -32,7 +32,6 @@ func.func @invalid_int_conversion() {
 // expected-error@unknown{{conversion of memref memory space #gpu.address_space<workgroup> to integer address space failed. Consider adding memory space conversions}}
 // CHECK-LABEL: @issue_70160
 func.func @issue_70160() {
-  // expected-error@+1{{conversion of memref memory space #gpu.address_space<workgroup> to integer address space failed. Consider adding memory space conversions}}
   %alloc = memref.alloc() : memref<1x32x33xi32, #gpu.address_space<workgroup>>
   %alloc1 = memref.alloc() : memref<i32>
   %c0 = arith.constant 0 : index

From 2f6bc47a18d9a97635b76520f0e33391aa72ba68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Mon, 7 Apr 2025 13:56:54 +0100
Subject: [PATCH 0840/1029] [mlir][vector] Standardise `valueToStore` Naming
 Across Vector Ops (NFC) (#134206)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change standardises the naming convention for the argument
representing the value to store in various vector operations.
Specifically, it ensures that all vector ops that store a value
(whether into memory, a tensor, or another vector) use `valueToStore`
for the corresponding argument name.

Updated operations:

* `vector.transfer_write`, `vector.insert`, `vector.scalable_insert`,
  `vector.insert_strided_slice`.

For reference, here are operations that already use `valueToStore`:

* `vector.store`, `vector.scatter`, `vector.compressstore`,
  `vector.maskedstore`.

This change is non-functional (NFC) and does not affect the
functionality of these operations.
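For downstream code, the visible effect is limited to the names of the
generated C++ accessors. A minimal sketch of the migration (the
`insertOp` and `writeOp` variables here are illustrative, standing for
arbitrary `vector.insert` and `vector.transfer_write` ops):

```
// Previously generated from $source / $vector:
//   Value v = insertOp.getSource();
//   Type ty = insertOp.getSourceType();
//   writeOp.getVectorMutable().assign(newValue);
// Generated after this change:
Value v = insertOp.getValueToStore();
Type ty = insertOp.getValueToStoreType();
writeOp.getValueToStoreMutable().assign(newValue);
```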
Implements #131602 --- .../mlir/Dialect/Vector/IR/VectorOps.td | 43 +++++++------ .../mlir/Interfaces/VectorInterfaces.td | 14 ++-- .../VectorToArmSME/VectorToArmSME.cpp | 2 +- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 7 +- .../Conversion/VectorToSCF/VectorToSCF.cpp | 2 +- .../VectorToSPIRV/VectorToSPIRV.cpp | 11 ++-- .../ArmSME/Transforms/VectorLegalization.cpp | 4 +- .../Dialect/Linalg/Transforms/Hoisting.cpp | 2 +- .../Linalg/Transforms/Vectorization.cpp | 4 +- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 64 +++++++++++++------ .../Transforms/SubsetOpInterfaceImpl.cpp | 2 +- .../Vector/Transforms/VectorDistribute.cpp | 20 +++--- .../Transforms/VectorDropLeadUnitDim.cpp | 8 +-- ...sertExtractStridedSliceRewritePatterns.cpp | 10 +-- .../Vector/Transforms/VectorLinearize.cpp | 6 +- .../Vector/Transforms/VectorTransforms.cpp | 6 +- 16 files changed, 119 insertions(+), 86 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 134472cefbf4e..7fc56b1aa4e7e 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -907,7 +907,7 @@ def Vector_InsertOp : }]; let arguments = (ins - AnyType:$source, + AnyType:$valueToStore, AnyVectorOfAnyRank:$dest, Variadic:$dynamic_position, DenseI64ArrayAttr:$static_position @@ -916,15 +916,15 @@ def Vector_InsertOp : let builders = [ // Builder to insert a scalar/rank-0 vector into a rank-0 vector. - OpBuilder<(ins "Value":$source, "Value":$dest)>, - OpBuilder<(ins "Value":$source, "Value":$dest, "int64_t":$position)>, - OpBuilder<(ins "Value":$source, "Value":$dest, "OpFoldResult":$position)>, - OpBuilder<(ins "Value":$source, "Value":$dest, "ArrayRef":$position)>, - OpBuilder<(ins "Value":$source, "Value":$dest, "ArrayRef":$position)>, + OpBuilder<(ins "Value":$valueToStore, "Value":$dest)>, + OpBuilder<(ins "Value":$valueToStore, "Value":$dest, "int64_t":$position)>, + OpBuilder<(ins "Value":$valueToStore, "Value":$dest, "OpFoldResult":$position)>, + OpBuilder<(ins "Value":$valueToStore, "Value":$dest, "ArrayRef":$position)>, + OpBuilder<(ins "Value":$valueToStore, "Value":$dest, "ArrayRef":$position)>, ]; let extraClassDeclaration = extraPoisonClassDeclaration # [{ - Type getSourceType() { return getSource().getType(); } + Type getValueToStoreType() { return getValueToStore().getType(); } VectorType getDestVectorType() { return ::llvm::cast(getDest().getType()); } @@ -946,8 +946,8 @@ def Vector_InsertOp : }]; let assemblyFormat = [{ - $source `,` $dest custom($dynamic_position, $static_position) - attr-dict `:` type($source) `into` type($dest) + $valueToStore `,` $dest custom($dynamic_position, $static_position) + attr-dict `:` type($valueToStore) `into` type($dest) }]; let hasCanonicalizer = 1; @@ -957,13 +957,13 @@ def Vector_InsertOp : def Vector_ScalableInsertOp : Vector_Op<"scalable.insert", [Pure, - AllElementTypesMatch<["source", "dest"]>, + AllElementTypesMatch<["valueToStore", "dest"]>, AllTypesMatch<["dest", "res"]>, PredOpTrait<"position is a multiple of the source length.", CPred< "(getPos() % getSourceVectorType().getNumElements()) == 0" >>]>, - Arguments<(ins VectorOfRank<[1]>:$source, + Arguments<(ins VectorOfRank<[1]>:$valueToStore, ScalableVectorOfRank<[1]>:$dest, I64Attr:$pos)>, Results<(outs ScalableVectorOfRank<[1]>:$res)> { @@ -999,12 +999,12 @@ def Vector_ScalableInsertOp : }]; let assemblyFormat = [{ - $source `,` $dest `[` $pos `]` attr-dict `:` type($source) `into` type($dest) + $valueToStore `,` 
$dest `[` $pos `]` attr-dict `:` type($valueToStore) `into` type($dest) }]; let extraClassDeclaration = extraPoisonClassDeclaration # [{ VectorType getSourceVectorType() { - return ::llvm::cast(getSource().getType()); + return ::llvm::cast(getValueToStore().getType()); } VectorType getDestVectorType() { return ::llvm::cast(getDest().getType()); @@ -1068,20 +1068,20 @@ def Vector_InsertStridedSliceOp : PredOpTrait<"operand #0 and result have same element type", TCresVTEtIsSameAsOpBase<0, 0>>, AllTypesMatch<["dest", "res"]>]>, - Arguments<(ins AnyVectorOfNonZeroRank:$source, AnyVectorOfNonZeroRank:$dest, I64ArrayAttr:$offsets, + Arguments<(ins AnyVectorOfNonZeroRank:$valueToStore, AnyVectorOfNonZeroRank:$dest, I64ArrayAttr:$offsets, I64ArrayAttr:$strides)>, Results<(outs AnyVectorOfNonZeroRank:$res)> { let summary = "strided_slice operation"; let description = [{ - Takes a k-D source vector, an n-D destination vector (n >= k), n-sized + Takes a k-D valueToStore vector, an n-D destination vector (n >= k), n-sized `offsets` integer array attribute, a k-sized `strides` integer array attribute - and inserts the k-D source vector as a strided subvector at the proper offset + and inserts the k-D valueToStore vector as a strided subvector at the proper offset into the n-D destination vector. At the moment strides must contain only 1s. Returns an n-D vector that is a copy of the n-D destination vector in which - the last k-D dimensions contain the k-D source vector elements strided at + the last k-D dimensions contain the k-D valueToStore vector elements strided at the proper location as specified by the offsets. Example: @@ -1094,16 +1094,17 @@ def Vector_InsertStridedSliceOp : }]; let assemblyFormat = [{ - $source `,` $dest attr-dict `:` type($source) `into` type($dest) + $valueToStore `,` $dest attr-dict `:` type($valueToStore) `into` type($dest) }]; let builders = [ - OpBuilder<(ins "Value":$source, "Value":$dest, + OpBuilder<(ins "Value":$valueToStore, "Value":$dest, "ArrayRef":$offsets, "ArrayRef":$strides)> ]; let extraClassDeclaration = [{ + // TODO: Rename VectorType getSourceVectorType() { - return ::llvm::cast(getSource().getType()); + return ::llvm::cast(getValueToStore().getType()); } VectorType getDestVectorType() { return ::llvm::cast(getDest().getType()); @@ -1520,7 +1521,7 @@ def Vector_TransferWriteOp : AttrSizedOperandSegments, DestinationStyleOpInterface ]>, - Arguments<(ins AnyVectorOfAnyRank:$vector, + Arguments<(ins AnyVectorOfAnyRank:$valueToStore, AnyShaped:$source, Variadic:$indices, AffineMapAttr:$permutation_map, diff --git a/mlir/include/mlir/Interfaces/VectorInterfaces.td b/mlir/include/mlir/Interfaces/VectorInterfaces.td index be939bad14b7b..8ea9d925b3790 100644 --- a/mlir/include/mlir/Interfaces/VectorInterfaces.td +++ b/mlir/include/mlir/Interfaces/VectorInterfaces.td @@ -124,6 +124,14 @@ def VectorTransferOpInterface : OpInterface<"VectorTransferOpInterface"> { /*methodName=*/"getVector", /*args=*/(ins) >, + InterfaceMethod< + /*desc=*/[{ + Return the type of the vector that this operation operates on. 
+ }], + /*retTy=*/"::mlir::VectorType", + /*methodName=*/"getVectorType", + /*args=*/(ins) + >, InterfaceMethod< /*desc=*/[{ Return the indices that specify the starting offsets into the source @@ -133,6 +141,7 @@ def VectorTransferOpInterface : OpInterface<"VectorTransferOpInterface"> { /*methodName=*/"getIndices", /*args=*/(ins) >, + InterfaceMethod< /*desc=*/[{ Return the permutation map that describes the mapping of vector @@ -202,11 +211,6 @@ def VectorTransferOpInterface : OpInterface<"VectorTransferOpInterface"> { return $_op.getPermutationMap().getNumResults(); } - /// Return the type of the vector that this operation operates on. - ::mlir::VectorType getVectorType() { - return ::llvm::cast<::mlir::VectorType>($_op.getVector().getType()); - } - /// Return "true" if at least one of the vector dimensions is a broadcasted /// dimension. bool hasBroadcastDim() { diff --git a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp index 4be0fffe8b728..58b85bc0ea6ac 100644 --- a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp +++ b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp @@ -579,7 +579,7 @@ struct VectorInsertToArmSMELowering auto loc = insertOp.getLoc(); auto position = insertOp.getMixedPosition(); - Value source = insertOp.getSource(); + Value source = insertOp.getValueToStore(); // Overwrite entire vector with value. Should be handled by folder, but // just to be safe. diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 213f7375b8d13..847e7e2beebe9 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1257,7 +1257,7 @@ class VectorInsertOpConversion // We are going to mutate this 1D vector until it is either the final // result (in the non-aggregate case) or the value that needs to be // inserted into the aggregate result. - Value sourceAggregate = adaptor.getSource(); + Value sourceAggregate = adaptor.getValueToStore(); if (insertIntoInnermostDim) { // Scalar-into-1D-vector case, so we know we will have to create a // InsertElementOp. The question is into what destination. @@ -1279,7 +1279,8 @@ class VectorInsertOpConversion } // Insert the scalar into the 1D vector. 
sourceAggregate = rewriter.create( - loc, sourceAggregate.getType(), sourceAggregate, adaptor.getSource(), + loc, sourceAggregate.getType(), sourceAggregate, + adaptor.getValueToStore(), getAsLLVMValue(rewriter, loc, positionOfScalarWithin1DVector)); } @@ -1305,7 +1306,7 @@ struct VectorScalableInsertOpLowering matchAndRewrite(vector::ScalableInsertOp insOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp( - insOp, adaptor.getDest(), adaptor.getSource(), adaptor.getPos()); + insOp, adaptor.getDest(), adaptor.getValueToStore(), adaptor.getPos()); return success(); } }; diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp index 95db831185590..b9b598c02b4a2 100644 --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -661,7 +661,7 @@ struct PrepareTransferWriteConversion buffers.dataBuffer); auto loadedVec = rewriter.create(loc, buffers.dataBuffer); rewriter.modifyOpInPlace(xferOp, [&]() { - xferOp.getVectorMutable().assign(loadedVec); + xferOp.getValueToStoreMutable().assign(loadedVec); xferOp->setAttr(kPassLabel, rewriter.getUnitAttr()); }); diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp index bca77ba68fbd1..de2af69eba9ec 100644 --- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp +++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp @@ -287,16 +287,16 @@ struct VectorInsertOpConvert final LogicalResult matchAndRewrite(vector::InsertOp insertOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - if (isa(insertOp.getSourceType())) + if (isa(insertOp.getValueToStoreType())) return rewriter.notifyMatchFailure(insertOp, "unsupported vector source"); if (!getTypeConverter()->convertType(insertOp.getDestVectorType())) return rewriter.notifyMatchFailure(insertOp, "unsupported dest vector type"); // Special case for inserting scalar values into size-1 vectors. 
- if (insertOp.getSourceType().isIntOrFloat() && + if (insertOp.getValueToStoreType().isIntOrFloat() && insertOp.getDestVectorType().getNumElements() == 1) { - rewriter.replaceOp(insertOp, adaptor.getSource()); + rewriter.replaceOp(insertOp, adaptor.getValueToStore()); return success(); } @@ -307,14 +307,15 @@ struct VectorInsertOpConvert final insertOp, "Static use of poison index handled elsewhere (folded to poison)"); rewriter.replaceOpWithNewOp( - insertOp, adaptor.getSource(), adaptor.getDest(), id.value()); + insertOp, adaptor.getValueToStore(), adaptor.getDest(), id.value()); } else { Value sanitizedIndex = sanitizeDynamicIndex( rewriter, insertOp.getLoc(), adaptor.getDynamicPosition()[0], vector::InsertOp::kPoisonIndex, insertOp.getDestVectorType().getNumElements()); rewriter.replaceOpWithNewOp( - insertOp, insertOp.getDest(), adaptor.getSource(), sanitizedIndex); + insertOp, insertOp.getDest(), adaptor.getValueToStore(), + sanitizedIndex); } return success(); } diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp index dec3dca988ae9..62a148d2b7e62 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp @@ -357,7 +357,7 @@ struct LegalizeTransferWriteOpsByDecomposition auto loc = writeOp.getLoc(); auto smeTileType = getSMETileTypeForElement(vectorType.getElementType()); - auto inputSMETiles = adaptor.getVector(); + auto inputSMETiles = adaptor.getValueToStore(); Value destTensorOrMemref = writeOp.getSource(); for (auto [index, smeTile] : llvm::enumerate(decomposeToSMETiles( @@ -464,7 +464,7 @@ struct LegalizeMultiTileTransferWriteAsStoreLoop rewriter.setInsertionPointToStart(storeLoop.getBody()); // For each sub-tile of the multi-tile `vectorType`. 
- auto inputSMETiles = adaptor.getVector(); + auto inputSMETiles = adaptor.getValueToStore(); auto tileSliceIndex = storeLoop.getInductionVar(); for (auto [index, smeTile] : llvm::enumerate( decomposeToSMETiles(rewriter, vectorType, smeTileType))) { diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp index acfd9683f01f4..20e4e3cee7ed4 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp @@ -371,7 +371,7 @@ void mlir::linalg::hoistRedundantVectorTransfers(Operation *root, if (failed(maybeNewLoop)) return WalkResult::interrupt(); - transferWrite.getVectorMutable().assign( + transferWrite.getValueToStoreMutable().assign( maybeNewLoop->getOperation()->getResults().back()); changed = true; // Need to interrupt and restart because erasing the loop messes up diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 8c8b1b85ef5a3..5afe378463d13 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -3177,8 +3177,8 @@ LogicalResult LinalgCopyVTWForwardingPattern::matchAndRewrite( rewriter.create( xferOp.getLoc(), vector, out, xferOp.getIndices(), xferOp.getPermutationMapAttr(), xferOp.getMask(), - rewriter.getBoolArrayAttr( - SmallVector(vector.getType().getRank(), false))); + rewriter.getBoolArrayAttr(SmallVector( + dyn_cast(vector.getType()).getRank(), false))); rewriter.eraseOp(copyOp); rewriter.eraseOp(xferOp); diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 5a3983699d5a3..98d98f067de14 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -1555,7 +1555,7 @@ ExtractFromInsertTransposeChainState::handleInsertOpWithMatchingPos( if (insertedPos != llvm::ArrayRef(extractPosition).take_front(extractedRank)) return failure(); // Case 2.a. early-exit fold. - res = nextInsertOp.getSource(); + res = nextInsertOp.getValueToStore(); // Case 2.b. if internal transposition is present, canFold will be false. return success(canFold()); } @@ -1579,7 +1579,7 @@ ExtractFromInsertTransposeChainState::handleInsertOpWithPrefixPos(Value &res) { extractPosition.begin() + insertedPos.size()); extractedRank = extractPosition.size() - sentinels.size(); // Case 3.a. early-exit fold (break and delegate to post-while path). - res = nextInsertOp.getSource(); + res = nextInsertOp.getValueToStore(); // Case 3.b. if internal transposition is present, canFold will be false. return success(); } @@ -1936,7 +1936,7 @@ static Value foldExtractStridedOpFromInsertChain(ExtractOp extractOp) { insertRankDiff)) return Value(); } - extractOp.getVectorMutable().assign(insertOp.getSource()); + extractOp.getVectorMutable().assign(insertOp.getValueToStore()); // OpBuilder is only used as a helper to build an I64ArrayAttr. 
OpBuilder b(extractOp.getContext()); extractOp.setStaticPosition(offsetDiffs); @@ -2958,7 +2958,7 @@ LogicalResult InsertOp::verify() { if (position.size() > static_cast(destVectorType.getRank())) return emitOpError( "expected position attribute of rank no greater than dest vector rank"); - auto srcVectorType = llvm::dyn_cast(getSourceType()); + auto srcVectorType = llvm::dyn_cast(getValueToStoreType()); if (srcVectorType && (static_cast(srcVectorType.getRank()) + position.size() != static_cast(destVectorType.getRank()))) @@ -2994,12 +2994,13 @@ class InsertToBroadcast final : public OpRewritePattern { LogicalResult matchAndRewrite(InsertOp insertOp, PatternRewriter &rewriter) const override { - auto srcVecType = llvm::dyn_cast(insertOp.getSourceType()); + auto srcVecType = + llvm::dyn_cast(insertOp.getValueToStoreType()); if (!srcVecType || insertOp.getDestVectorType().getNumElements() != srcVecType.getNumElements()) return failure(); rewriter.replaceOpWithNewOp( - insertOp, insertOp.getDestVectorType(), insertOp.getSource()); + insertOp, insertOp.getDestVectorType(), insertOp.getValueToStore()); return success(); } }; @@ -3011,7 +3012,7 @@ class InsertSplatToSplat final : public OpRewritePattern { LogicalResult matchAndRewrite(InsertOp op, PatternRewriter &rewriter) const override { - auto srcSplat = op.getSource().getDefiningOp(); + auto srcSplat = op.getValueToStore().getDefiningOp(); auto dstSplat = op.getDest().getDefiningOp(); if (!srcSplat || !dstSplat) @@ -3100,17 +3101,17 @@ OpFoldResult vector::InsertOp::fold(FoldAdaptor adaptor) { // Fold "vector.insert %v, %dest [] : vector<2x2xf32> from vector<2x2xf32>" to // %v. Note: Do not fold "vector.insert %v, %dest [] : f32 into vector" // (type mismatch). - if (getNumIndices() == 0 && getSourceType() == getType()) - return getSource(); - SmallVector operands = {getSource(), getDest()}; + if (getNumIndices() == 0 && getValueToStoreType() == getType()) + return getValueToStore(); + SmallVector operands = {getValueToStore(), getDest()}; if (auto val = extractInsertFoldConstantOp(*this, adaptor, operands)) return val; if (auto res = foldPoisonIndexInsertExtractOp( getContext(), adaptor.getStaticPosition(), kPoisonIndex)) return res; - if (auto res = foldDenseElementsAttrDestInsertOp(*this, adaptor.getSource(), - adaptor.getDest(), - vectorSizeFoldThreshold)) { + if (auto res = foldDenseElementsAttrDestInsertOp( + *this, adaptor.getValueToStore(), adaptor.getDest(), + vectorSizeFoldThreshold)) { return res; } @@ -3291,7 +3292,7 @@ class FoldInsertStridedSliceSplat final LogicalResult matchAndRewrite(InsertStridedSliceOp insertStridedSliceOp, PatternRewriter &rewriter) const override { auto srcSplatOp = - insertStridedSliceOp.getSource().getDefiningOp(); + insertStridedSliceOp.getValueToStore().getDefiningOp(); auto destSplatOp = insertStridedSliceOp.getDest().getDefiningOp(); @@ -3316,7 +3317,7 @@ class FoldInsertStridedSliceOfExtract final LogicalResult matchAndRewrite(InsertStridedSliceOp insertStridedSliceOp, PatternRewriter &rewriter) const override { auto extractStridedSliceOp = - insertStridedSliceOp.getSource() + insertStridedSliceOp.getValueToStore() .getDefiningOp(); if (!extractStridedSliceOp) @@ -3365,7 +3366,7 @@ class InsertStridedSliceConstantFolder final !destVector.hasOneUse()) return failure(); - TypedValue sourceValue = op.getSource(); + TypedValue sourceValue = op.getValueToStore(); Attribute sourceCst; if (!matchPattern(sourceValue, m_Constant(&sourceCst))) return failure(); @@ -3425,7 +3426,7 @@ void 
vector::InsertStridedSliceOp::getCanonicalizationPatterns( OpFoldResult InsertStridedSliceOp::fold(FoldAdaptor adaptor) { if (getSourceVectorType() == getDestVectorType()) - return getSource(); + return getValueToStore(); return {}; } @@ -3691,7 +3692,7 @@ foldExtractStridedOpFromInsertChain(ExtractStridedSliceOp op) { } // The extract element chunk is a subset of the insert element. if (!disjoint && !patialoverlap) { - op.setOperand(insertOp.getSource()); + op.setOperand(insertOp.getValueToStore()); // OpBuilder is only used as a helper to build an I64ArrayAttr. OpBuilder b(op.getContext()); op.setOffsetsAttr(b.getI64ArrayAttr(offsetDiffs)); @@ -4349,6 +4350,13 @@ Type TransferReadOp::getExpectedMaskType() { return inferTransferOpMaskType(getVectorType(), getPermutationMap()); } +//===----------------------------------------------------------------------===// +// TransferReadOp: VectorTransferOpInterface methods. +//===----------------------------------------------------------------------===// +VectorType TransferReadOp::getVectorType() { + return cast(getVector().getType()); +} + template static bool isInBounds(TransferOp op, int64_t resultIdx, int64_t indicesIdx) { // TODO: support more aggressive createOrFold on: @@ -4739,7 +4747,9 @@ LogicalResult TransferWriteOp::verify() { [&](Twine t) { return emitOpError(t); }); } -// MaskableOpInterface methods. +//===----------------------------------------------------------------------===// +// TransferWriteOp: MaskableOpInterface methods. +//===----------------------------------------------------------------------===// /// Returns the mask type expected by this operation. Mostly used for /// verification purposes. @@ -4747,6 +4757,17 @@ Type TransferWriteOp::getExpectedMaskType() { return inferTransferOpMaskType(getVectorType(), getPermutationMap()); } +//===----------------------------------------------------------------------===// +// TransferWriteOp: VectorTransferOpInterface methods. +//===----------------------------------------------------------------------===// +Value TransferWriteOp::getVector() { return getOperand(0); } +VectorType TransferWriteOp::getVectorType() { + return cast(getValueToStore().getType()); +} + +//===----------------------------------------------------------------------===// +// TransferWriteOp: fold methods. +//===----------------------------------------------------------------------===// /// Fold: /// ``` /// %t1 = ... @@ -4863,6 +4884,9 @@ LogicalResult TransferWriteOp::fold(FoldAdaptor adaptor, return memref::foldMemRefCast(*this); } +//===----------------------------------------------------------------------===// +// TransferWriteOp: other methods. 
+//===----------------------------------------------------------------------===// std::optional> TransferWriteOp::getShapeForUnroll() { return llvm::to_vector<4>(getVectorType().getShape()); } @@ -4871,7 +4895,7 @@ void TransferWriteOp::getEffects( SmallVectorImpl> &effects) { if (llvm::isa(getShapedType())) - effects.emplace_back(MemoryEffects::Write::get(), &getSourceMutable(), + effects.emplace_back(MemoryEffects::Write::get(), &getValueToStoreMutable(), SideEffects::DefaultResource::get()); } diff --git a/mlir/lib/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.cpp b/mlir/lib/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.cpp index b450d5b78a466..e8e178fe75962 100644 --- a/mlir/lib/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.cpp @@ -45,7 +45,7 @@ struct TransferWriteOpSubsetInsertionOpInterface : public SubsetInsertionOpInterface::ExternalModel< TransferWriteOpSubsetInsertionOpInterface, vector::TransferWriteOp> { OpOperand &getSourceOperand(Operation *op) const { - return cast(op).getVectorMutable(); + return cast(op).getValueToStoreMutable(); } OpOperand &getDestinationOperand(Operation *op) const { diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index e214257de2cdf..19f408ad1b570 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -496,7 +496,8 @@ struct WarpOpTransferWrite : public WarpDistributionPattern { rewriter.setInsertionPointToStart(&body); auto newWriteOp = cast(rewriter.clone(*writeOp.getOperation())); - newWriteOp.getVectorMutable().assign(newWarpOp.getResult(newRetIndices[0])); + newWriteOp.getValueToStoreMutable().assign( + newWarpOp.getResult(newRetIndices[0])); rewriter.eraseOp(writeOp); rewriter.create(newWarpOp.getLoc()); return success(); @@ -559,7 +560,8 @@ struct WarpOpTransferWrite : public WarpDistributionPattern { auto newWriteOp = cast(rewriter.clone(*writeOp.getOperation())); rewriter.eraseOp(writeOp); - newWriteOp.getVectorMutable().assign(newWarpOp.getResult(newRetIndices[0])); + newWriteOp.getValueToStoreMutable().assign( + newWarpOp.getResult(newRetIndices[0])); if (maybeMaskType) newWriteOp.getMaskMutable().assign(newWarpOp.getResult(newRetIndices[1])); return newWriteOp; @@ -1299,9 +1301,9 @@ struct WarpOpInsertScalar : public WarpDistributionPattern { // Yield destination vector, source scalar and position from warp op. SmallVector additionalResults{insertOp.getDest(), - insertOp.getSource()}; - SmallVector additionalResultTypes{distrType, - insertOp.getSource().getType()}; + insertOp.getValueToStore()}; + SmallVector additionalResultTypes{ + distrType, insertOp.getValueToStore().getType()}; additionalResults.append(SmallVector(insertOp.getDynamicPosition())); additionalResultTypes.append( SmallVector(insertOp.getDynamicPosition().getTypes())); @@ -1393,8 +1395,8 @@ struct WarpOpInsert : public WarpDistributionPattern { // out of the warp op. 
SmallVector newRetIndices; WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, {insertOp.getSource(), insertOp.getDest()}, - {insertOp.getSourceType(), insertOp.getDestVectorType()}, + rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()}, + {insertOp.getValueToStoreType(), insertOp.getDestVectorType()}, newRetIndices); rewriter.setInsertionPointAfter(newWarpOp); Value distributedSrc = newWarpOp->getResult(newRetIndices[0]); @@ -1422,7 +1424,7 @@ struct WarpOpInsert : public WarpDistributionPattern { assert(distrDestDim != -1 && "could not find distributed dimension"); // Compute the distributed source vector type. - VectorType srcVecType = cast(insertOp.getSourceType()); + VectorType srcVecType = cast(insertOp.getValueToStoreType()); SmallVector distrSrcShape(srcVecType.getShape()); // E.g.: vector.insert %s, %d [2] : vector<96xf32> into vector<128x96xf32> // Case 1: distrDestDim = 1 (dim of size 96). In that case, each lane will @@ -1439,7 +1441,7 @@ struct WarpOpInsert : public WarpDistributionPattern { // Yield source and dest vectors from warp op. SmallVector newRetIndices; WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, {insertOp.getSource(), insertOp.getDest()}, + rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()}, {distrSrcType, distrDestType}, newRetIndices); rewriter.setInsertionPointAfter(newWarpOp); Value distributedSrc = newWarpOp->getResult(newRetIndices[0]); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp index b53aa997c9014..fda3baf3aa390 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -122,7 +122,7 @@ struct CastAwayInsertStridedSliceLeadingOneDim Location loc = insertOp.getLoc(); Value newSrcVector = rewriter.create( - loc, insertOp.getSource(), splatZero(srcDropCount)); + loc, insertOp.getValueToStore(), splatZero(srcDropCount)); Value newDstVector = rewriter.create( loc, insertOp.getDest(), splatZero(dstDropCount)); @@ -148,7 +148,7 @@ struct CastAwayInsertLeadingOneDim : public OpRewritePattern { LogicalResult matchAndRewrite(vector::InsertOp insertOp, PatternRewriter &rewriter) const override { - Type oldSrcType = insertOp.getSourceType(); + Type oldSrcType = insertOp.getValueToStoreType(); Type newSrcType = oldSrcType; int64_t oldSrcRank = 0, newSrcRank = 0; if (auto type = dyn_cast(oldSrcType)) { @@ -168,10 +168,10 @@ struct CastAwayInsertLeadingOneDim : public OpRewritePattern { // Trim leading one dimensions from both operands. 
Location loc = insertOp.getLoc(); - Value newSrcVector = insertOp.getSource(); + Value newSrcVector = insertOp.getValueToStore(); if (oldSrcRank != 0) { newSrcVector = rewriter.create( - loc, insertOp.getSource(), splatZero(srcDropCount)); + loc, insertOp.getValueToStore(), splatZero(srcDropCount)); } Value newDstVector = rewriter.create( loc, insertOp.getDest(), splatZero(dstDropCount)); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp index 82a985c9e5824..d834a99076834 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorInsertExtractStridedSliceRewritePatterns.cpp @@ -61,7 +61,7 @@ class DecomposeDifferentRankInsertStridedSlice // A different pattern will kick in for InsertStridedSlice with matching // ranks. auto stridedSliceInnerOp = rewriter.create( - loc, op.getSource(), extracted, + loc, op.getValueToStore(), extracted, getI64SubArray(op.getOffsets(), /*dropFront=*/rankDiff), getI64SubArray(op.getStrides(), /*dropFront=*/0)); @@ -111,7 +111,7 @@ class ConvertSameRankInsertStridedSliceIntoShuffle return failure(); if (srcType == dstType) { - rewriter.replaceOp(op, op.getSource()); + rewriter.replaceOp(op, op.getValueToStore()); return success(); } @@ -131,8 +131,8 @@ class ConvertSameRankInsertStridedSliceIntoShuffle SmallVector offsets(nDest, 0); for (int64_t i = 0; i < nSrc; ++i) offsets[i] = i; - Value scaledSource = rewriter.create(loc, op.getSource(), - op.getSource(), offsets); + Value scaledSource = rewriter.create( + loc, op.getValueToStore(), op.getValueToStore(), offsets); // 2. Create a mask where we take the value from scaledSource of dest // depending on the offset. @@ -156,7 +156,7 @@ class ConvertSameRankInsertStridedSliceIntoShuffle off += stride, ++idx) { // 1. extract the proper subvector (or element) from source Value extractedSource = - rewriter.create(loc, op.getSource(), idx); + rewriter.create(loc, op.getValueToStore(), idx); if (isa(extractedSource.getType())) { // 2. If we have a vector, extract the proper subvector from destination // Otherwise we are at the element level and no need to recurse. 
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp index 9dccc005322eb..a009aa03aaf64 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorLinearize.cpp @@ -439,7 +439,7 @@ struct LinearizeVectorInsert final return rewriter.notifyMatchFailure(insertOp, "scalable vectors are not supported."); - if (!isLessThanOrEqualTargetBitWidth(insertOp.getSourceType(), + if (!isLessThanOrEqualTargetBitWidth(insertOp.getValueToStoreType(), targetVectorBitWidth)) return rewriter.notifyMatchFailure( insertOp, "Can't flatten since targetBitWidth < OpSize"); @@ -448,7 +448,7 @@ struct LinearizeVectorInsert final if (insertOp.hasDynamicPosition()) return rewriter.notifyMatchFailure(insertOp, "dynamic position is not supported."); - auto srcTy = insertOp.getSourceType(); + auto srcTy = insertOp.getValueToStoreType(); auto srcAsVec = dyn_cast(srcTy); uint64_t srcSize = 0; if (srcAsVec) { @@ -484,7 +484,7 @@ struct LinearizeVectorInsert final // [offset+srcNumElements, end) rewriter.replaceOpWithNewOp( - insertOp, dstTy, adaptor.getDest(), adaptor.getSource(), indices); + insertOp, dstTy, adaptor.getDest(), adaptor.getValueToStore(), indices); return success(); } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index b6fac80d871e6..d50d5fe96f49a 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -748,7 +748,7 @@ struct BubbleUpBitCastForInsert : public OpRewritePattern { return failure(); // Only vector sources are supported for now. - auto insertSrcType = dyn_cast(insertOp.getSourceType()); + auto insertSrcType = dyn_cast(insertOp.getValueToStoreType()); if (!insertSrcType) return failure(); @@ -759,7 +759,7 @@ struct BubbleUpBitCastForInsert : public OpRewritePattern { VectorType newCastSrcType = VectorType::get(srcDims, castDstType.getElementType()); auto newCastSrcOp = rewriter.create( - bitcastOp.getLoc(), newCastSrcType, insertOp.getSource()); + bitcastOp.getLoc(), newCastSrcType, insertOp.getValueToStore()); SmallVector dstDims(insertOp.getDestVectorType().getShape()); dstDims.back() = @@ -850,7 +850,7 @@ struct BubbleUpBitCastForStridedSliceInsert VectorType::get(srcDims, castDstType.getElementType()); auto newCastSrcOp = rewriter.create( - bitcastOp.getLoc(), newCastSrcType, insertOp.getSource()); + bitcastOp.getLoc(), newCastSrcType, insertOp.getValueToStore()); SmallVector dstDims = llvm::to_vector<4>(insertOp.getDestVectorType().getShape()); From 382962b4a848f66cddbd7d04f6d613fe93a3f125 Mon Sep 17 00:00:00 2001 From: Krisztian Rugasi Date: Fri, 4 Apr 2025 12:35:34 +0200 Subject: [PATCH 0841/1029] [GlobalISel] Fix dangling reference in CombinerHelper::matchCombineExtractedVectorLoad --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index bab0c1596ca40..fed1dc53f9bb8 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1411,9 +1411,8 @@ bool CombinerHelper::matchCombineExtractedVectorLoad( LegalityQuery::MemDesc MMDesc(*NewMMO); - LegalityQuery Q = {TargetOpcode::G_LOAD, {VecEltTy, PtrTy}, {MMDesc}}; - - if (!isLegalOrBeforeLegalizer(Q)) + if (!isLegalOrBeforeLegalizer( + 
{TargetOpcode::G_LOAD, {VecEltTy, PtrTy}, {MMDesc}}))
     return false;
 
   // Load must be allowed and fast on the target.

From 67dd2019aca117798bbb0eb9e0dc5560c06864fc Mon Sep 17 00:00:00 2001
From: Nashe Mncube
Date: Mon, 7 Apr 2025 14:09:43 +0100
Subject: [PATCH 0842/1029] Recommit [AArch64][SVE] Use
 FeatureUseFixedOverScalableIfEqualCost for A510/A520 (#134606)

Recommit of #132246, which failed on buildbots because the test it
introduced needed updates.

Inefficient SVE codegen occurs on at least two in-order cores, those
being Cortex-A510 and Cortex-A520. For example, a simple vector add

```
void foo(float *a, float *b, float *dst, unsigned n) {
  for (unsigned i = 0; i < n; ++i)
    dst[i] = a[i] + b[i];
}
```

vectorizes the inner loop into the following interleaved sequence of
instructions.

```
add x12, x1, x10
ld1b { z0.b }, p0/z, [x1, x10]
add x13, x2, x10
ld1b { z1.b }, p0/z, [x2, x10]
ldr z2, [x12, #1, mul vl]
ldr z3, [x13, #1, mul vl]
dech x11
add x12, x0, x10
fadd z0.s, z1.s, z0.s
fadd z1.s, z3.s, z2.s
st1b { z0.b }, p0, [x0, x10]
addvl x10, x10, #2
str z1, [x12, #1, mul vl]
```

By adjusting the target features to prefer fixed over scalable if the
cost is equal, we get the following vectorized loop.

```
ldp q0, q3, [x11, #-16]
subs x13, x13, #8
ldp q1, q2, [x10, #-16]
add x10, x10, #32
add x11, x11, #32
fadd v0.4s, v1.4s, v0.4s
fadd v1.4s, v2.4s, v3.4s
stp q0, q1, [x12, #-16]
add x12, x12, #32
```

This is more efficient.
---
 llvm/lib/Target/AArch64/AArch64Processors.td  |   2 +
 .../AArch64/sve-fixed-width-inorder-core.ll   | 168 ++++++++++++++++++
 2 files changed, 170 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll

diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 67d3ff685e6f1..c37dd025d80aa 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -723,6 +723,7 @@ def ProcessorFeatures {
                  FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2,
                  FeatureComplxNum, FeatureCRC, FeatureDotProd,
                  FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE,
+                 FeatureUseFixedOverScalableIfEqualCost,
                  FeatureRAS, FeatureRCPC, FeatureRDM];
  list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                  FeatureMTE, FeatureETE, FeatureSVEBitPerm,
                  FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum,
                  FeatureCRC, FeatureFPARMv8, FeatureFullFP16,
                  FeatureMatMulInt8, FeatureJS, FeatureNEON, FeatureLSE,
                  FeatureRAS, FeatureRCPC, FeatureRDM,
+                 FeatureUseFixedOverScalableIfEqualCost,
                  FeatureDotProd];
  list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                  FeatureMTE, FeatureETE, FeatureSVEBitPerm,

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
new file mode 100644
index 0000000000000..51e24924cae7a
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a510 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA510
+; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a520 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA520
+
+define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) {
+; CHECK-CA510-LABEL: define void @sve_add(
+; 
CHECK-CA510-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-CA510-NEXT: [[ENTRY:.*:]] +; CHECK-CA510-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-CA510-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-CA510-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-CA510-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-CA510-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK-CA510: [[FOR_BODY_PREHEADER]]: +; CHECK-CA510-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-CA510-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-CA510: [[VECTOR_MEMCHECK]]: +; CHECK-CA510-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]] +; CHECK-CA510-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-CA510-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] +; CHECK-CA510-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-CA510-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-CA510-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-CA510: [[VECTOR_PH]]: +; CHECK-CA510-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-CA510-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-CA510-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-CA510: [[VECTOR_BODY]]: +; CHECK-CA510-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-CA510-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]] +; CHECK-CA510-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0 +; CHECK-CA510-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4 +; CHECK-CA510-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-CA510-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-CA510-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]] +; CHECK-CA510-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0 +; CHECK-CA510-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4 +; CHECK-CA510-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 +; CHECK-CA510-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 +; CHECK-CA510-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]] +; CHECK-CA510-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]] +; CHECK-CA510-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]] +; CHECK-CA510-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0 +; CHECK-CA510-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4 +; CHECK-CA510-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4 +; CHECK-CA510-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4 +; CHECK-CA510-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], 8 +; CHECK-CA510-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-CA510-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-CA510: [[MIDDLE_BLOCK]]: +; CHECK-CA510-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-CA510-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-CA510: [[SCALAR_PH]]: +; CHECK-CA510-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-CA510-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-CA510: [[FOR_BODY]]: +; CHECK-CA510-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-CA510-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-CA510-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-CA510-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-CA510-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-CA510-NEXT: [[ADD:%.*]] = fadd fast float [[TMP16]], [[TMP15]] +; CHECK-CA510-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-CA510-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 +; CHECK-CA510-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-CA510-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-CA510-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-CA510: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-CA510-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-CA510: [[FOR_COND_CLEANUP]]: +; CHECK-CA510-NEXT: ret void +; +; CHECK-CA520-LABEL: define void @sve_add( +; CHECK-CA520-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-CA520-NEXT: [[ENTRY:.*:]] +; CHECK-CA520-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-CA520-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-CA520-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-CA520-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-CA520-NEXT: br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]] +; CHECK-CA520: [[FOR_BODY_PREHEADER]]: +; CHECK-CA520-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-CA520-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-CA520: [[VECTOR_MEMCHECK]]: +; CHECK-CA520-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]] +; CHECK-CA520-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 +; CHECK-CA520-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] +; CHECK-CA520-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 +; CHECK-CA520-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-CA520-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-CA520: [[VECTOR_PH]]: +; CHECK-CA520-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-CA520-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-CA520-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-CA520: [[VECTOR_BODY]]: +; CHECK-CA520-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-CA520-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[TMP2]] +; CHECK-CA520-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 0 +; CHECK-CA520-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP3]], i32 4 +; CHECK-CA520-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 +; CHECK-CA520-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 +; CHECK-CA520-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[TMP2]] +; CHECK-CA520-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 0 +; 
CHECK-CA520-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[TMP6]], i32 4 +; CHECK-CA520-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP7]], align 4 +; CHECK-CA520-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP8]], align 4 +; CHECK-CA520-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]] +; CHECK-CA520-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]] +; CHECK-CA520-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[TMP2]] +; CHECK-CA520-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 0 +; CHECK-CA520-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4 +; CHECK-CA520-NEXT: store <4 x float> [[TMP9]], ptr [[TMP12]], align 4 +; CHECK-CA520-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4 +; CHECK-CA520-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], 8 +; CHECK-CA520-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-CA520-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-CA520: [[MIDDLE_BLOCK]]: +; CHECK-CA520-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-CA520-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-CA520: [[SCALAR_PH]]: +; CHECK-CA520-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ] +; CHECK-CA520-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-CA520: [[FOR_BODY]]: +; CHECK-CA520-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-CA520-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-CA520-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-CA520-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-CA520-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-CA520-NEXT: [[ADD:%.*]] = fadd fast float [[TMP16]], [[TMP15]] +; CHECK-CA520-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-CA520-NEXT: store float [[ADD]], ptr [[ARRAYIDX4]], align 4 +; CHECK-CA520-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-CA520-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-CA520-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-CA520: [[FOR_COND_CLEANUP_LOOPEXIT]]: +; CHECK-CA520-NEXT: br label %[[FOR_COND_CLEANUP]] +; CHECK-CA520: [[FOR_COND_CLEANUP]]: +; CHECK-CA520-NEXT: ret void +; +entry: + %cmp9.not = icmp eq i64 %n, 0 + br i1 %cmp9.not, label %for.cond.cleanup, label %for.body +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %entry], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds nuw float, ptr %a, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds nuw float, ptr %b, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4 + %add = fadd fast float %1, %0 + %arrayidx4 = getelementptr inbounds nuw float, ptr %dst, i64 %indvars.iv + store float %add, ptr %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body 
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+;.
+; CHECK-CA510: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-CA510: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-CA510: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-CA510: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
+; CHECK-CA520: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-CA520: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-CA520: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-CA520: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.

From 771b94fa834b71cfcb0c16b875f656c7facdcb2f Mon Sep 17 00:00:00 2001
From: Farzon Lotfi
Date: Mon, 7 Apr 2025 09:26:36 -0400
Subject: [PATCH 0843/1029] [NFC][SPIRV] remove unimplemented function define
 for `selectCross` (#134478)

- `selectCross` looks to be a function that had its implementation and
  usage removed, but its declaration somehow stuck around.
- This change removes the stale declaration.
---
 llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 946a295c2df25..662c38ca4ed28 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -183,8 +183,6 @@ class SPIRVInstructionSelector : public InstructionSelector {
   bool selectCmp(Register ResVReg, const SPIRVType *ResType,
                  unsigned comparisonOpcode, MachineInstr &I) const;
 
-  bool selectCross(Register ResVReg, const SPIRVType *ResType,
-                   MachineInstr &I) const;
   bool selectDiscard(Register ResVReg, const SPIRVType *ResType,
                      MachineInstr &I) const;
 

From 0ab2061c4fd0feac8b142ff76ed823534bebc634 Mon Sep 17 00:00:00 2001
From: Pedro Lobo
Date: Mon, 7 Apr 2025 14:29:55 +0100
Subject: [PATCH 0844/1029] [WebAssembly] Represent trap instructions as
 `poison` (#134553)

The WebAssemblyLowerRefTypesIntPtrConv pass currently uses `undef` to
represent trap instructions. These can instead be represented by the
`poison` value.
---
 .../WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp        | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
index c61aa5eff4a70..be500de67e320 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp
@@ -68,8 +68,7 @@ bool WebAssemblyLowerRefTypesIntPtrConv::runOnFunction(Function &F) {
         !(ITP && WebAssembly::isWebAssemblyReferenceType(ITP->getDestTy())))
       continue;
 
-    UndefValue *U = UndefValue::get(I->getType());
-    I->replaceAllUsesWith(U);
+    I->replaceAllUsesWith(PoisonValue::get(I->getType()));
 
     Function *TrapIntrin =
         Intrinsic::getOrInsertDeclaration(F.getParent(), Intrinsic::debugtrap);

From b2711e1526f9384be4905f516a5503427e0bafd7 Mon Sep 17 00:00:00 2001
From: Asher Mancinelli
Date: Mon, 7 Apr 2025 06:51:02 -0700
Subject: [PATCH 0845/1029] [flang][nfc] Support volatile on ref, box, and
 class types (#134386)

Part one of merging #132486. Add support for representing volatility
in the type system for reference, box, and class types. Don't do
anything with volatile just yet; only support and test the
representation and utility functions.
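As a rough sketch of how the new pieces are meant to compose (the
helper names below come from this patch; the surrounding setup,
including `ctx`, is illustrative only):

```
// Build a volatile reference type and round-trip its volatility.
mlir::Type eleTy = mlir::IntegerType::get(ctx, 32);
mlir::Type volRef = fir::ReferenceType::get(eleTy, /*isVolatile=*/true);
assert(fir::isa_volatile_type(volRef));
// updateTypeWithVolatility re-creates ref/box/class types with the
// requested volatility and returns other types unchanged.
mlir::Type plainRef =
    fir::updateTypeWithVolatility(volRef, /*isVolatile=*/false);
assert(!fir::isa_volatile_type(plainRef));
```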
The naming convention is a little goofy - `fir::isa_volatile_type` and `fir::updateTypeWithVolatility` use different capitalization, but I put them near similar functions and tried to match the surrounding conventions and [the docs](https://github.com/llvm/llvm-project/blob/main/flang/docs/C%2B%2Bstyle.md#naming) best I could. --- .../flang/Optimizer/Builder/FIRBuilder.h | 2 +- .../include/flang/Optimizer/Dialect/FIRType.h | 8 ++ .../flang/Optimizer/Dialect/FIRTypes.td | 29 +++-- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 4 +- flang/lib/Optimizer/Dialect/FIRType.cpp | 108 ++++++++++++++++-- flang/test/Fir/invalid-types.fir | 29 ++++- flang/unittests/Optimizer/FIRTypesTest.cpp | 36 ++++++ 7 files changed, 192 insertions(+), 24 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h index 1583cfb3f5b51..ddd4ef7114a63 100644 --- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h +++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h @@ -150,7 +150,7 @@ class FirOpBuilder : public mlir::OpBuilder, public mlir::OpBuilder::Listener { mlir::Block *getAllocaBlock(); /// Safely create a reference type to the type `eleTy`. - mlir::Type getRefType(mlir::Type eleTy); + mlir::Type getRefType(mlir::Type eleTy, bool isVolatile = false); /// Create a sequence of `eleTy` with `rank` dimensions of unknown size. mlir::Type getVarLenSeqTy(mlir::Type eleTy, unsigned rank = 1); diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h index 76e0aa352bcd9..0dbff258aea86 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRType.h +++ b/flang/include/flang/Optimizer/Dialect/FIRType.h @@ -221,6 +221,10 @@ inline bool isa_char_string(mlir::Type t) { /// (since they may hold one), and are not considered to be unknown size. bool isa_unknown_size_box(mlir::Type t); +/// Returns true iff `t` is a type capable of representing volatility and has +/// the volatile attribute set. +bool isa_volatile_type(mlir::Type t); + /// Returns true iff `t` is a fir.char type and has an unknown length. inline bool characterWithDynamicLen(mlir::Type t) { if (auto charTy = mlir::dyn_cast(t)) @@ -474,6 +478,10 @@ inline mlir::Type updateTypeForUnlimitedPolymorphic(mlir::Type ty) { return ty; } +/// Re-create the given type with the given volatility, if this is a type +/// that can represent volatility. +mlir::Type updateTypeWithVolatility(mlir::Type type, bool isVolatile); + /// Replace the element type of \p type by \p newElementType, preserving /// all other layers of the type (fir.ref/ptr/heap/array/box/class). /// If \p turnBoxIntoClass and the input is a fir.box, it will be turned into diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index fd5bbbe44751f..84b3932ea75f6 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -77,24 +77,24 @@ def fir_BoxType : FIR_Type<"Box", "box", [], "BaseBoxType"> { to) whether the entity is an array, its size, or what type it has. 
}]; - let parameters = (ins "mlir::Type":$eleTy); + let parameters = (ins "mlir::Type":$eleTy, "bool":$isVolatile); let skipDefaultBuilders = 1; let builders = [ TypeBuilderWithInferredContext<(ins - "mlir::Type":$eleTy), [{ - return Base::get(eleTy.getContext(), eleTy); + "mlir::Type":$eleTy, CArg<"bool", "false">:$isVolatile), [{ + return Base::get(eleTy.getContext(), eleTy, isVolatile); }]>, ]; let extraClassDeclaration = [{ mlir::Type getElementType() const { return getEleTy(); } + bool isVolatile() const { return getIsVolatile(); } }]; let genVerifyDecl = 1; - - let assemblyFormat = "`<` $eleTy `>`"; + let hasCustomAssemblyFormat = 1; } def fir_CharacterType : FIR_Type<"Character", "char"> { @@ -146,16 +146,20 @@ def fir_ClassType : FIR_Type<"Class", "class", [], "BaseBoxType"> { is equivalent to a fir.box type with a dynamic type. }]; - let parameters = (ins "mlir::Type":$eleTy); + let parameters = (ins "mlir::Type":$eleTy, "bool":$isVolatile); let builders = [ - TypeBuilderWithInferredContext<(ins "mlir::Type":$eleTy), [{ - return $_get(eleTy.getContext(), eleTy); + TypeBuilderWithInferredContext<(ins "mlir::Type":$eleTy, CArg<"bool", "false">:$isVolatile), [{ + return $_get(eleTy.getContext(), eleTy, isVolatile); }]> ]; + let extraClassDeclaration = [{ + bool isVolatile() const { return getIsVolatile(); } + }]; + let genVerifyDecl = 1; - let assemblyFormat = "`<` $eleTy `>`"; + let hasCustomAssemblyFormat = 1; } def fir_FieldType : FIR_Type<"Field", "field"> { @@ -363,18 +367,19 @@ def fir_ReferenceType : FIR_Type<"Reference", "ref"> { The type of a reference to an entity in memory. }]; - let parameters = (ins "mlir::Type":$eleTy); + let parameters = (ins "mlir::Type":$eleTy, "bool":$isVolatile); let skipDefaultBuilders = 1; let builders = [ - TypeBuilderWithInferredContext<(ins "mlir::Type":$elementType), [{ - return Base::get(elementType.getContext(), elementType); + TypeBuilderWithInferredContext<(ins "mlir::Type":$elementType, CArg<"bool", "false">:$isVolatile), [{ + return Base::get(elementType.getContext(), elementType, isVolatile); }]>, ]; let extraClassDeclaration = [{ mlir::Type getElementType() const { return getEleTy(); } + bool isVolatile() const { return getIsVolatile(); } }]; let genVerifyDecl = 1; diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index fdc155ef2ef18..7fc30ca125a87 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -105,9 +105,9 @@ fir::FirOpBuilder::getNamedGlobal(mlir::ModuleOp modOp, return modOp.lookupSymbol(name); } -mlir::Type fir::FirOpBuilder::getRefType(mlir::Type eleTy) { +mlir::Type fir::FirOpBuilder::getRefType(mlir::Type eleTy, bool isVolatile) { assert(!mlir::isa(eleTy) && "cannot be a reference type"); - return fir::ReferenceType::get(eleTy); + return fir::ReferenceType::get(eleTy, isVolatile); } mlir::Type fir::FirOpBuilder::getVarLenSeqTy(mlir::Type eleTy, unsigned rank) { diff --git a/flang/lib/Optimizer/Dialect/FIRType.cpp b/flang/lib/Optimizer/Dialect/FIRType.cpp index f3f969ba401e5..1df0ea93b759f 100644 --- a/flang/lib/Optimizer/Dialect/FIRType.cpp +++ b/flang/lib/Optimizer/Dialect/FIRType.cpp @@ -32,6 +32,21 @@ using namespace fir; namespace { +static llvm::StringRef getVolatileKeyword() { return "volatile"; } + +static mlir::ParseResult parseOptionalCommaAndKeyword(mlir::AsmParser &parser, + mlir::StringRef keyword, + bool &parsedKeyword) { + if (!parser.parseOptionalComma()) { + if (parser.parseKeyword(keyword)) + return 
mlir::failure(); + parsedKeyword = true; + return mlir::success(); + } + parsedKeyword = false; + return mlir::success(); +} + template TYPE parseIntSingleton(mlir::AsmParser &parser) { int kind = 0; @@ -215,6 +230,19 @@ mlir::Type getDerivedType(mlir::Type ty) { .Default([](mlir::Type t) { return t; }); } +mlir::Type updateTypeWithVolatility(mlir::Type type, bool isVolatile) { + // If we already have the volatility we asked for, return the type unchanged. + if (fir::isa_volatile_type(type) == isVolatile) + return type; + return mlir::TypeSwitch(type) + .Case( + [&](auto ty) -> mlir::Type { + using TYPE = decltype(ty); + return TYPE::get(ty.getEleTy(), isVolatile); + }) + .Default([&](mlir::Type t) -> mlir::Type { return t; }); +} + mlir::Type dyn_cast_ptrEleTy(mlir::Type t) { return llvm::TypeSwitch(t) .Case(t) + .Case( + [](auto t) { return t.isVolatile(); }) + .Default([](mlir::Type) { return false; }); +} + //===----------------------------------------------------------------------===// // BoxProcType //===----------------------------------------------------------------------===// @@ -738,9 +773,31 @@ static bool cannotBePointerOrHeapElementType(mlir::Type eleTy) { // BoxType //===----------------------------------------------------------------------===// +// `box` `<` type (`, volatile` $volatile^)? `>` +mlir::Type fir::BoxType::parse(mlir::AsmParser &parser) { + mlir::Type eleTy; + auto location = parser.getCurrentLocation(); + auto *context = parser.getContext(); + bool isVolatile = false; + if (parser.parseLess() || parser.parseType(eleTy)) + return {}; + if (parseOptionalCommaAndKeyword(parser, getVolatileKeyword(), isVolatile)) + return {}; + if (parser.parseGreater()) + return {}; + return parser.getChecked(location, context, eleTy, isVolatile); +} + +void fir::BoxType::print(mlir::AsmPrinter &printer) const { + printer << "<" << getEleTy(); + if (isVolatile()) + printer << ", " << getVolatileKeyword(); + printer << '>'; +} + llvm::LogicalResult fir::BoxType::verify(llvm::function_ref emitError, - mlir::Type eleTy) { + mlir::Type eleTy, bool isVolatile) { if (mlir::isa(eleTy)) return emitError() << "invalid element type\n"; // TODO @@ -807,9 +864,32 @@ void fir::CharacterType::print(mlir::AsmPrinter &printer) const { // ClassType //===----------------------------------------------------------------------===// +// `class` `<` type (`, volatile` $volatile^)? `>` +mlir::Type fir::ClassType::parse(mlir::AsmParser &parser) { + mlir::Type eleTy; + auto location = parser.getCurrentLocation(); + auto *context = parser.getContext(); + bool isVolatile = false; + if (parser.parseLess() || parser.parseType(eleTy)) + return {}; + if (parseOptionalCommaAndKeyword(parser, getVolatileKeyword(), isVolatile)) + return {}; + if (parser.parseGreater()) + return {}; + return parser.getChecked(location, context, eleTy, + isVolatile); +} + +void fir::ClassType::print(mlir::AsmPrinter &printer) const { + printer << "<" << getEleTy(); + if (isVolatile()) + printer << ", " << getVolatileKeyword(); + printer << '>'; +} + llvm::LogicalResult fir::ClassType::verify(llvm::function_ref emitError, - mlir::Type eleTy) { + mlir::Type eleTy, bool isVolatile) { if (mlir::isa` +// `ref` `<` type (`, volatile` $volatile^)? 
`>` mlir::Type fir::ReferenceType::parse(mlir::AsmParser &parser) { - return parseTypeSingleton(parser); + auto location = parser.getCurrentLocation(); + auto *context = parser.getContext(); + mlir::Type eleTy; + bool isVolatile = false; + if (parser.parseLess() || parser.parseType(eleTy)) + return {}; + if (parseOptionalCommaAndKeyword(parser, getVolatileKeyword(), isVolatile)) + return {}; + if (parser.parseGreater()) + return {}; + return parser.getChecked(location, context, eleTy, + isVolatile); } void fir::ReferenceType::print(mlir::AsmPrinter &printer) const { - printer << "<" << getEleTy() << '>'; + printer << "<" << getEleTy(); + if (isVolatile()) + printer << ", " << getVolatileKeyword(); + printer << '>'; } llvm::LogicalResult fir::ReferenceType::verify( - llvm::function_ref emitError, - mlir::Type eleTy) { + llvm::function_ref emitError, mlir::Type eleTy, + bool isVolatile) { if (mlir::isa(eleTy)) return emitError() << "cannot build a reference to type: " << eleTy << '\n'; diff --git a/flang/test/Fir/invalid-types.fir b/flang/test/Fir/invalid-types.fir index f4505097086ad..a3dc9242c4eb3 100644 --- a/flang/test/Fir/invalid-types.fir +++ b/flang/test/Fir/invalid-types.fir @@ -6,8 +6,7 @@ func.func private @box3() -> !fir.boxproc<> // ----- -// expected-error@+2 {{expected non-function type}} -// expected-error@+1 {{failed to parse fir_BoxType parameter 'eleTy' which is to be a `mlir::Type`}} +// expected-error@+1 {{expected non-function type}} func.func private @box1() -> !fir.box<> // ----- @@ -105,6 +104,11 @@ func.func private @mem3() -> !fir.ref<> // ----- +// expected-error@+1 {{expected non-function type}} +func.func private @mem3() -> !fir.ref<, volatile> + +// ----- + // expected-error@+1 {{expected ':'}} func.func private @arr1() -> !fir.array<*> @@ -162,3 +166,24 @@ func.func private @upe() -> !fir.class> // expected-error@+1 {{invalid element type}} func.func private @upe() -> !fir.box> + +// ----- + +// expected-error@+1 {{invalid element type}} +func.func private @upe() -> !fir.box, volatile> + +// ----- + +// expected-error@+1 {{invalid element type}} +func.func private @upe() -> !fir.class> + +// ----- + +// expected-error@+1 {{invalid element type}} +func.func private @upe() -> !fir.class, volatile> + +// ----- + +// expected-error@+1 {{expected non-function type}} +func.func private @upe() -> !fir.class<, volatile> + diff --git a/flang/unittests/Optimizer/FIRTypesTest.cpp b/flang/unittests/Optimizer/FIRTypesTest.cpp index b3151b4aa7efb..28d5eb7ead25f 100644 --- a/flang/unittests/Optimizer/FIRTypesTest.cpp +++ b/flang/unittests/Optimizer/FIRTypesTest.cpp @@ -316,3 +316,39 @@ TEST_F(FIRTypesTest, getTypeAsString) { EXPECT_EQ("boxchar_c8xU", fir::getTypeAsString(fir::BoxCharType::get(&context, 1), *kindMap)); } + +TEST_F(FIRTypesTest, isVolatileType) { + mlir::Type i32 = mlir::IntegerType::get(&context, 32); + + mlir::Type i32NonVolatileRef = fir::ReferenceType::get(i32); + mlir::Type i32NonVolatileBox = fir::BoxType::get(i32); + mlir::Type i32NonVolatileClass = fir::ClassType::get(i32); + + // Ensure the default value is false + EXPECT_EQ(i32NonVolatileRef, fir::ReferenceType::get(i32, false)); + EXPECT_EQ(i32NonVolatileBox, fir::BoxType::get(i32, false)); + EXPECT_EQ(i32NonVolatileClass, fir::ClassType::get(i32, false)); + + EXPECT_FALSE(fir::isa_volatile_type(i32)); + EXPECT_FALSE(fir::isa_volatile_type(i32NonVolatileRef)); + EXPECT_FALSE(fir::isa_volatile_type(i32NonVolatileBox)); + EXPECT_FALSE(fir::isa_volatile_type(i32NonVolatileClass)); + + // Should return 
the same type if it's not capable of representing volatility.
+  EXPECT_EQ(i32, fir::updateTypeWithVolatility(i32, true));
+
+  mlir::Type i32VolatileRef =
+      fir::updateTypeWithVolatility(i32NonVolatileRef, true);
+  mlir::Type i32VolatileBox =
+      fir::updateTypeWithVolatility(i32NonVolatileBox, true);
+  mlir::Type i32VolatileClass =
+      fir::updateTypeWithVolatility(i32NonVolatileClass, true);
+
+  EXPECT_TRUE(fir::isa_volatile_type(i32VolatileRef));
+  EXPECT_TRUE(fir::isa_volatile_type(i32VolatileBox));
+  EXPECT_TRUE(fir::isa_volatile_type(i32VolatileClass));
+
+  EXPECT_EQ(i32VolatileRef, fir::ReferenceType::get(i32, true));
+  EXPECT_EQ(i32VolatileBox, fir::BoxType::get(i32, true));
+  EXPECT_EQ(i32VolatileClass, fir::ClassType::get(i32, true));
+}

From 0d71d9ab28d626e85a0085a50ff634f76a950ce0 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi
Date: Mon, 7 Apr 2025 09:53:07 -0400
Subject: [PATCH 0846/1029] Reland [Clang][Cmake] fix libtool duplicate member
 name warnings (#133850)

fixes https://github.com/llvm/llvm-project/issues/133199

As of the third commit, the linker issue with missing references in
`Targets/DirectX.cpp` found in
https://github.com/llvm/llvm-project/pull/133776 was fixed by moving
`HLSLBufferLayoutBuilder.cpp` to `clang/lib/CodeGen/Targets/`. It also
fixes the circular-reference issue found in
https://github.com/llvm/llvm-project/pull/133619 for all
`-DBUILD_SHARED_LIBS=ON` builds by removing `target_link_libraries`
from the subdirectory CMake files.

Testing for AMDGPU offload was done via `cmake -B ../llvm_amdgpu -S
llvm -GNinja -C offload/cmake/caches/Offload.cmake
-DCMAKE_BUILD_TYPE=Release`.

PR https://github.com/llvm/llvm-project/pull/132252 created a second
file that shared a .cpp base name in clang/lib/CodeGen/CMakeLists.txt.
For example, there were two AMDGPU.cpp files, one in TargetBuiltins and
the other in Targets. Even though these were in different directories,
libtool warns that it might not distinguish them because they share the
same base name.

There are two potential fixes. The easy fix is to rename one of them
and keep one CMake file. That solution, though, doesn't future-proof
this problem in the event of a third same-named .cpp, and it seems
teams want to just use the target name
(https://github.com/llvm/llvm-project/pull/132252#issuecomment-2758178483).
The alternative fix, which this PR went with, is to separate the CMake
files into their own subdirectories as static libs.
--- clang/lib/CodeGen/CMakeLists.txt | 49 +++++-------------- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 2 +- .../lib/CodeGen/TargetBuiltins/CMakeLists.txt | 14 ++++++ clang/lib/CodeGen/Targets/CMakeLists.txt | 31 ++++++++++++ .../{ => Targets}/HLSLBufferLayoutBuilder.cpp | 0 5 files changed, 58 insertions(+), 38 deletions(-) create mode 100644 clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt create mode 100644 clang/lib/CodeGen/Targets/CMakeLists.txt rename clang/lib/CodeGen/{ => Targets}/HLSLBufferLayoutBuilder.cpp (100%) diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index ebe2fbd7db295..7c627fc1cdb57 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -107,7 +107,6 @@ add_clang_library(clangCodeGen ConstantInitBuilder.cpp CoverageMappingGen.cpp ItaniumCXXABI.cpp - HLSLBufferLayoutBuilder.cpp LinkInModulesPass.cpp MacroPPCallbacks.cpp MicrosoftCXXABI.cpp @@ -116,43 +115,7 @@ add_clang_library(clangCodeGen PatternInit.cpp SanitizerMetadata.cpp SwiftCallingConv.cpp - TargetBuiltins/ARM.cpp - TargetBuiltins/AMDGPU.cpp - TargetBuiltins/Hexagon.cpp - TargetBuiltins/NVPTX.cpp - TargetBuiltins/PPC.cpp - TargetBuiltins/RISCV.cpp - TargetBuiltins/SPIR.cpp - TargetBuiltins/SystemZ.cpp - TargetBuiltins/WebAssembly.cpp - TargetBuiltins/X86.cpp TargetInfo.cpp - Targets/AArch64.cpp - Targets/AMDGPU.cpp - Targets/ARC.cpp - Targets/ARM.cpp - Targets/AVR.cpp - Targets/BPF.cpp - Targets/CSKY.cpp - Targets/DirectX.cpp - Targets/Hexagon.cpp - Targets/Lanai.cpp - Targets/LoongArch.cpp - Targets/M68k.cpp - Targets/MSP430.cpp - Targets/Mips.cpp - Targets/NVPTX.cpp - Targets/PNaCl.cpp - Targets/PPC.cpp - Targets/RISCV.cpp - Targets/SPIR.cpp - Targets/Sparc.cpp - Targets/SystemZ.cpp - Targets/TCE.cpp - Targets/VE.cpp - Targets/WebAssembly.cpp - Targets/X86.cpp - Targets/XCore.cpp VarBypassDetector.cpp DEPENDS @@ -170,4 +133,16 @@ add_clang_library(clangCodeGen clangFrontend clangLex clangSerialization + clangCodeGenTargetBuiltins + clangCodeGenTargets ) + + target_include_directories(clangCodeGen + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/TargetBuiltins + ${CMAKE_CURRENT_SOURCE_DIR}/Targets + ) + + add_subdirectory(TargetBuiltins) + add_subdirectory(Targets) diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index b56b739094ff3..577fee05d4af6 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1,4 +1,4 @@ -//===------- AMDCPU.cpp - Emit LLVM Code for builtins ---------------------===// +//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt b/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt new file mode 100644 index 0000000000000..76be68a11d02a --- /dev/null +++ b/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt @@ -0,0 +1,14 @@ +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) 
+
+add_clang_library(clangCodeGenTargetBuiltins STATIC
+  ARM.cpp
+  AMDGPU.cpp
+  Hexagon.cpp
+  NVPTX.cpp
+  PPC.cpp
+  RISCV.cpp
+  SPIR.cpp
+  SystemZ.cpp
+  WebAssembly.cpp
+  X86.cpp
+)
diff --git a/clang/lib/CodeGen/Targets/CMakeLists.txt b/clang/lib/CodeGen/Targets/CMakeLists.txt
new file mode 100644
index 0000000000000..6cf4167e2cda2
--- /dev/null
+++ b/clang/lib/CodeGen/Targets/CMakeLists.txt
@@ -0,0 +1,31 @@
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+add_clang_library(clangCodeGenTargets STATIC
+  AArch64.cpp
+  AMDGPU.cpp
+  ARC.cpp
+  ARM.cpp
+  AVR.cpp
+  BPF.cpp
+  CSKY.cpp
+  DirectX.cpp
+  HLSLBufferLayoutBuilder.cpp
+  Hexagon.cpp
+  Lanai.cpp
+  LoongArch.cpp
+  M68k.cpp
+  MSP430.cpp
+  Mips.cpp
+  NVPTX.cpp
+  PNaCl.cpp
+  PPC.cpp
+  RISCV.cpp
+  SPIR.cpp
+  Sparc.cpp
+  SystemZ.cpp
+  TCE.cpp
+  VE.cpp
+  WebAssembly.cpp
+  X86.cpp
+  XCore.cpp
+)
diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/Targets/HLSLBufferLayoutBuilder.cpp
similarity index 100%
rename from clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
rename to clang/lib/CodeGen/Targets/HLSLBufferLayoutBuilder.cpp

From cff65657d9896cc04ca96907dd8e789403432a28 Mon Sep 17 00:00:00 2001
From: Igor Wodiany
Date: Mon, 7 Apr 2025 15:00:33 +0100
Subject: [PATCH 0847/1029] [mlir][spirv] Fix incorrect argument erasure in
 deserializer (#134610)

The current implementation iterates over and modifies the list of
arguments at the same time. Depending on the number of arguments, this
will trigger an assert: `assert(index < arguments.size())`. This change
replaces the loop with a range-based erasure.
---
 mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
index d471d9a8e3d6c..25749ec598f00 100644
--- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp
@@ -2077,8 +2077,7 @@ LogicalResult ControlFlowStructurizer::structurize() {
     // block arguments from the original merge block.
     for (unsigned i = 0, e = outsideUses.size(); i != e; ++i)
       outsideUses[i].replaceAllUsesWith(selectionOp.getResult(i));
-    for (unsigned i = 0, e = mergeBlock->getNumArguments(); i != e; ++i)
-      mergeBlock->eraseArgument(i);
+    mergeBlock->eraseArguments(0, mergeBlock->getNumArguments());
   }
 
   // Check that whether some op in the to-be-erased blocks still has uses.
Those From 82103dfae9b9ee03f8935f1c8987c7eaecbe9359 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Mon, 7 Apr 2025 10:00:53 -0400 Subject: [PATCH 0848/1029] Revert "Reland [Clang][Cmake] fix libtool duplicate member name warnings" (#134656) Reverts llvm/llvm-project#133850 --- clang/lib/CodeGen/CMakeLists.txt | 49 ++++++++++++++----- .../{Targets => }/HLSLBufferLayoutBuilder.cpp | 0 clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 2 +- .../lib/CodeGen/TargetBuiltins/CMakeLists.txt | 14 ------ clang/lib/CodeGen/Targets/CMakeLists.txt | 31 ------------ 5 files changed, 38 insertions(+), 58 deletions(-) rename clang/lib/CodeGen/{Targets => }/HLSLBufferLayoutBuilder.cpp (100%) delete mode 100644 clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt delete mode 100644 clang/lib/CodeGen/Targets/CMakeLists.txt diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index 7c627fc1cdb57..ebe2fbd7db295 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -107,6 +107,7 @@ add_clang_library(clangCodeGen ConstantInitBuilder.cpp CoverageMappingGen.cpp ItaniumCXXABI.cpp + HLSLBufferLayoutBuilder.cpp LinkInModulesPass.cpp MacroPPCallbacks.cpp MicrosoftCXXABI.cpp @@ -115,7 +116,43 @@ add_clang_library(clangCodeGen PatternInit.cpp SanitizerMetadata.cpp SwiftCallingConv.cpp + TargetBuiltins/ARM.cpp + TargetBuiltins/AMDGPU.cpp + TargetBuiltins/Hexagon.cpp + TargetBuiltins/NVPTX.cpp + TargetBuiltins/PPC.cpp + TargetBuiltins/RISCV.cpp + TargetBuiltins/SPIR.cpp + TargetBuiltins/SystemZ.cpp + TargetBuiltins/WebAssembly.cpp + TargetBuiltins/X86.cpp TargetInfo.cpp + Targets/AArch64.cpp + Targets/AMDGPU.cpp + Targets/ARC.cpp + Targets/ARM.cpp + Targets/AVR.cpp + Targets/BPF.cpp + Targets/CSKY.cpp + Targets/DirectX.cpp + Targets/Hexagon.cpp + Targets/Lanai.cpp + Targets/LoongArch.cpp + Targets/M68k.cpp + Targets/MSP430.cpp + Targets/Mips.cpp + Targets/NVPTX.cpp + Targets/PNaCl.cpp + Targets/PPC.cpp + Targets/RISCV.cpp + Targets/SPIR.cpp + Targets/Sparc.cpp + Targets/SystemZ.cpp + Targets/TCE.cpp + Targets/VE.cpp + Targets/WebAssembly.cpp + Targets/X86.cpp + Targets/XCore.cpp VarBypassDetector.cpp DEPENDS @@ -133,16 +170,4 @@ add_clang_library(clangCodeGen clangFrontend clangLex clangSerialization - clangCodeGenTargetBuiltins - clangCodeGenTargets ) - - target_include_directories(clangCodeGen - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/TargetBuiltins - ${CMAKE_CURRENT_SOURCE_DIR}/Targets - ) - - add_subdirectory(TargetBuiltins) - add_subdirectory(Targets) diff --git a/clang/lib/CodeGen/Targets/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp similarity index 100% rename from clang/lib/CodeGen/Targets/HLSLBufferLayoutBuilder.cpp rename to clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 577fee05d4af6..b56b739094ff3 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1,4 +1,4 @@ -//===------- AMDGPU.cpp - Emit LLVM Code for builtins ---------------------===// +//===------- AMDCPU.cpp - Emit LLVM Code for builtins ---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt b/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt deleted file mode 100644 index 76be68a11d02a..0000000000000 --- a/clang/lib/CodeGen/TargetBuiltins/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) - -add_clang_library(clangCodeGenTargetBuiltins STATIC - ARM.cpp - AMDGPU.cpp - Hexagon.cpp - NVPTX.cpp - PPC.cpp - RISCV.cpp - SPIR.cpp - SystemZ.cpp - WebAssembly.cpp - X86.cpp -) diff --git a/clang/lib/CodeGen/Targets/CMakeLists.txt b/clang/lib/CodeGen/Targets/CMakeLists.txt deleted file mode 100644 index 6cf4167e2cda2..0000000000000 --- a/clang/lib/CodeGen/Targets/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) - -add_clang_library(clangCodeGenTargets STATIC - AArch64.cpp - AMDGPU.cpp - ARC.cpp - ARM.cpp - AVR.cpp - BPF.cpp - CSKY.cpp - DirectX.cpp - HLSLBufferLayoutBuilder.cpp - Hexagon.cpp - Lanai.cpp - LoongArch.cpp - M68k.cpp - MSP430.cpp - Mips.cpp - NVPTX.cpp - PNaCl.cpp - PPC.cpp - RISCV.cpp - SPIR.cpp - Sparc.cpp - SystemZ.cpp - TCE.cpp - VE.cpp - WebAssembly.cpp - X86.cpp - XCore.cpp -) From 8fddef8483dc9eb569580ffd13695b8f54d3c058 Mon Sep 17 00:00:00 2001 From: zhijian lin Date: Mon, 7 Apr 2025 10:03:05 -0400 Subject: [PATCH 0849/1029] [SelectionDAG] Introducing a new ISD::POISON SDNode to represent the poison value in the IR. (#125883) A new ISD::POISON SDNode is introduced to represent the `poison value` in the IR, replacing the previous use of ISD::UNDEF. --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 3 ++ llvm/include/llvm/CodeGen/SelectionDAG.h | 3 ++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 6 ++- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 14 +++++++ .../SelectionDAG/LegalizeFloatTypes.cpp | 1 + .../SelectionDAG/LegalizeIntegerTypes.cpp | 2 + .../SelectionDAG/LegalizeVectorTypes.cpp | 3 ++ .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 ++ .../SelectionDAG/SelectionDAGBuilder.cpp | 2 +- .../SelectionDAG/SelectionDAGDumper.cpp | 1 + .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 1 + .../AArch64/vector-insert-dag-combines.ll | 38 +++++++++---------- llvm/test/CodeGen/X86/half.ll | 24 ++++++------ llvm/test/CodeGen/X86/pr119158.ll | 7 ++-- 15 files changed, 73 insertions(+), 38 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 59f31f8443947..ad8a95a353b56 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -217,6 +217,9 @@ enum NodeType { /// UNDEF - An undefined node. UNDEF, + /// POISON - A poison node. + POISON, + /// FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or /// is evaluated to UNDEF), or returns VAL otherwise. Note that each /// read of UNDEF can yield different value, but FREEZE(UNDEF) cannot. diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index d06e2b19fa0b5..63423463eeee2 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1130,6 +1130,9 @@ class SelectionDAG { return getNode(ISD::UNDEF, SDLoc(), VT); } + /// Return a POISON node. POISON does not have a useful SDLoc. + SDValue getPOISON(EVT VT) { return getNode(ISD::POISON, SDLoc(), VT); } + /// Return a node that represents the runtime scaling 'MulImm * RuntimeVL'. 
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold = true); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 2283f99202e2f..b62cf08693f63 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -692,8 +692,10 @@ END_TWO_BYTE_PACK() /// \ISD namespace). bool isTargetOpcode() const { return NodeType >= ISD::BUILTIN_OP_END; } - /// Return true if the type of the node type undefined. - bool isUndef() const { return NodeType == ISD::UNDEF; } + /// Returns true if the node type is UNDEF or POISON. + bool isUndef() const { + return NodeType == ISD::UNDEF || NodeType == ISD::POISON; + } /// Test if this node is a memory intrinsic (with valid pointer information). bool isMemIntrinsic() const { return SDNodeBits.IsMemIntrinsic; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8136f1794775e..38376de5783ae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16287,7 +16287,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // Finally, recreate the node, it's operands were updated to use // frozen operands, so we just need to use it's "original" operands. SmallVector Ops(N0->ops()); - // Special-handle ISD::UNDEF, each single one of them can be it's own thing. + // TODO: ISD::UNDEF and ISD::POISON should get separate handling, but best + // leave for a future patch. for (SDValue &Op : Ops) { if (Op.isUndef()) Op = DAG.getFreeze(Op); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 754c29b6ba868..b8af281e1c24b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -986,6 +986,19 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { TargetLowering::LegalizeAction Action = TargetLowering::Legal; bool SimpleFinishLegalizing = true; switch (Node->getOpcode()) { + // TODO: Currently, POISON is being lowered to UNDEF here. However, there is + // an open concern that this transformation may not be ideal, as targets + // should ideally handle POISON directly. Changing this behavior would require + // adding support for POISON in TableGen, which is a large change. + // Additionally, many existing test cases rely on the current behavior (e.g., + // llvm/test/CodeGen/PowerPC/vec_shuffle.ll). A broader discussion and + // incremental changes might be needed to properly + // support POISON without breaking existing targets and tests. 
+ case ISD::POISON: { + SDValue UndefNode = DAG.getUNDEF(Node->getValueType(0)); + ReplaceNode(Node, UndefNode.getNode()); + break; + } case ISD::INTRINSIC_W_CHAIN: case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: @@ -3169,6 +3182,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { for (unsigned i = 0; i < Node->getNumValues(); i++) Results.push_back(Node->getOperand(i)); break; + case ISD::POISON: case ISD::UNDEF: { EVT VT = Node->getValueType(0); if (VT.isInteger()) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 01751dfe9eb62..5ed83060e150e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2845,6 +2845,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break; + case ISD::POISON: case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; case ISD::VECREDUCE_FADD: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 204b323d7084a..f944104a0e9d6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -118,6 +118,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::VP_SRL: Res = PromoteIntRes_SRL(N); break; case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break; + case ISD::POISON: case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break; case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break; case ISD::VSCALE: Res = PromoteIntRes_VSCALE(N); break; @@ -2932,6 +2933,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; + case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::FREEZE: SplitRes_FREEZE(N, Lo, Hi); break; case ISD::SETCC: ExpandIntRes_SETCC(N, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 9d42ec2fdf859..f934d8b37561e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -71,6 +71,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break; case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break; case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break; + case ISD::POISON: case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break; case ISD::IS_FPCLASS: R = ScalarizeVecRes_IS_FPCLASS(N); break; @@ -1137,6 +1138,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_MERGE: case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; + case ISD::POISON: case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break; @@ -4592,6 +4594,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case 
ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break; case ISD::VP_SETCC: case ISD::SETCC: Res = WidenVecRes_SETCC(N); break; + case ISD::POISON: case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break; case ISD::VECTOR_SHUFFLE: Res = WidenVecRes_VECTOR_SHUFFLE(cast(N)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index f2777bbf247b0..d6dcb3f15ae7c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5387,6 +5387,9 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, case ISD::CopyFromReg: return true; + case ISD::POISON: + return false; + case ISD::UNDEF: return PoisonOnly; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 89793c30f3710..8cae34d06c8ba 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1817,7 +1817,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { return DAG.getConstantFP(*CFP, getCurSDLoc(), VT); if (isa(C) && !V->getType()->isAggregateType()) - return DAG.getUNDEF(VT); + return isa(C) ? DAG.getPOISON(VT) : DAG.getUNDEF(VT); if (const ConstantExpr *CE = dyn_cast(C)) { visit(CE->getOpcode(), *CE); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 958c070ed50de..8fcec6c6cd7c6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -189,6 +189,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::CopyToReg: return "CopyToReg"; case ISD::CopyFromReg: return "CopyFromReg"; case ISD::UNDEF: return "undef"; + case ISD::POISON: return "poison"; case ISD::VSCALE: return "vscale"; case ISD::MERGE_VALUES: return "merge_values"; case ISD::INLINEASM: return "inlineasm"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 1287d6ed4a764..bfbfdbc7e3ca2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -3276,6 +3276,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::WRITE_REGISTER: Select_WRITE_REGISTER(NodeToMatch); return; + case ISD::POISON: case ISD::UNDEF: Select_UNDEF(NodeToMatch); return; diff --git a/llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll b/llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll index 0e05a63ef86de..5207d5cbf21a2 100644 --- a/llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll +++ b/llvm/test/CodeGen/AArch64/vector-insert-dag-combines.ll @@ -12,7 +12,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0 ; CHECK: t4: v4i8 = extract_subvector t2, Constant:i64<0> -; CHECK: t6: v16i8 = insert_subvector undef:v16i8, t4, Constant:i64<0> +; CHECK: t6: v16i8 = insert_subvector poison:v16i8, t4, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:v16i8 $q0, t6 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:v16i8 $q0, t8:1 @@ -20,7 +20,7 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK: SelectionDAG has 9 nodes: ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0 -; CHECK: t10: v16i8 = insert_subvector undef:v16i8, t2, Constant:i64<0> +; CHECK: t10: v16i8 = 
insert_subvector poison:v16i8, t2, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:v16i8 $q0, t10 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:v16i8 $q0, t8:1 @@ -35,7 +35,7 @@ define <16 x i8> @insert_small_fixed_into_big_fixed(<8 x i8> %a) #0 { ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0 ; CHECK: t4: v4i8 = extract_subvector t2, Constant:i64<0> -; CHECK: t6: nxv16i8 = insert_subvector undef:nxv16i8, t4, Constant:i64<0> +; CHECK: t6: nxv16i8 = insert_subvector poison:nxv16i8, t4, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t6 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1 @@ -43,7 +43,7 @@ define <16 x i8> @insert_small_fixed_into_big_fixed(<8 x i8> %a) #0 { ; CHECK: SelectionDAG has 9 nodes: ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v8i8,ch = CopyFromReg t0, Register:v8i8 %0 -; CHECK: t10: nxv16i8 = insert_subvector undef:nxv16i8, t2, Constant:i64<0> +; CHECK: t10: nxv16i8 = insert_subvector poison:nxv16i8, t2, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t10 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1 @@ -59,7 +59,7 @@ define @insert_small_fixed_into_big_scalable(<8 x i8> %a) #0 ; CHECK: t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0 ; CHECK: t3: nxv8i8 = truncate t2 ; CHECK: t5: v4i8 = extract_subvector t3, Constant:i64<0> -; CHECK: t7: v16i8 = insert_subvector undef:v16i8, t5, Constant:i64<0> +; CHECK: t7: v16i8 = insert_subvector poison:v16i8, t5, Constant:i64<0> ; CHECK: t9: ch,glue = CopyToReg t0, Register:v16i8 $q0, t7 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:v16i8 $q0, t9:1 @@ -69,7 +69,7 @@ define @insert_small_fixed_into_big_scalable(<8 x i8> %a) #0 ; CHECK: t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0 ; CHECK: t3: nxv8i8 = truncate t2 ; CHECK: t5: v4i8 = extract_subvector t3, Constant:i64<0> -; CHECK: t7: v16i8 = insert_subvector undef:v16i8, t5, Constant:i64<0> +; CHECK: t7: v16i8 = insert_subvector poison:v16i8, t5, Constant:i64<0> ; CHECK: t9: ch,glue = CopyToReg t0, Register:v16i8 $q0, t7 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:v16i8 $q0, t9:1 @@ -86,7 +86,7 @@ define <16 x i8> @insert_small_scalable_into_big_fixed( %a) #0 ; CHECK: t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0 ; CHECK: t3: nxv8i8 = truncate t2 ; CHECK: t5: v4i8 = extract_subvector t3, Constant:i64<0> -; CHECK: t7: nxv16i8 = insert_subvector undef:nxv16i8, t5, Constant:i64<0> +; CHECK: t7: nxv16i8 = insert_subvector poison:nxv16i8, t5, Constant:i64<0> ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t7 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1 @@ -95,7 +95,7 @@ define <16 x i8> @insert_small_scalable_into_big_fixed( %a) #0 ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: nxv8i16,ch = CopyFromReg t0, Register:nxv8i16 %0 ; CHECK: t3: nxv8i8 = truncate t2 -; CHECK: t11: nxv16i8 = insert_subvector undef:nxv16i8, t3, Constant:i64<0> +; CHECK: t11: nxv16i8 = insert_subvector poison:nxv16i8, t3, Constant:i64<0> ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t11 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1 @@ -111,7 +111,7 @@ define @insert_small_scalable_into_big_scalable_1( -; CHECK: t7: nxv16i8 = insert_subvector undef:nxv16i8, t5, Constant:i64<0> +; CHECK: t7: nxv16i8 = insert_subvector poison:nxv16i8, t5, Constant:i64<0> ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t7 ; CHECK: t10: ch = 
AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1 @@ -120,7 +120,7 @@ define @insert_small_scalable_into_big_scalable_1( +; CHECK: t11: nxv16i8 = insert_subvector poison:nxv16i8, t3, Constant:i64<0> ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t11 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv16i8 $z0, t9:1 @@ -135,7 +135,7 @@ define @insert_small_scalable_into_big_scalable_2( -; CHECK: t6: v8i8 = insert_subvector undef:v8i8, t4, Constant:i64<0> +; CHECK: t6: v8i8 = insert_subvector poison:v8i8, t4, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:v8i8 $d0, t6 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:v8i8 $d0, t8:1 @@ -158,7 +158,7 @@ define <8 x i8> @extract_small_fixed_from_big_fixed(<16 x i8> %a) #0 { ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0 ; CHECK: t4: v4i8 = extract_subvector t2, Constant:i64<0> -; CHECK: t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0> +; CHECK: t6: nxv8i8 = insert_subvector poison:nxv8i8, t4, Constant:i64<0> ; CHECK: t7: nxv8i16 = any_extend t6 ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1 @@ -168,7 +168,7 @@ define <8 x i8> @extract_small_fixed_from_big_fixed(<16 x i8> %a) #0 { ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0 ; CHECK: t4: v4i8 = extract_subvector t2, Constant:i64<0> -; CHECK: t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0> +; CHECK: t6: nxv8i8 = insert_subvector poison:nxv8i8, t4, Constant:i64<0> ; CHECK: t7: nxv8i16 = any_extend t6 ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1 @@ -185,7 +185,7 @@ define @extract_small_scalable_from_big_fixed(<16 x i8> %a) #0 ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0 ; CHECK: t4: v4i8 = extract_subvector t2, Constant:i64<0> -; CHECK: t6: v8i8 = insert_subvector undef:v8i8, t4, Constant:i64<0> +; CHECK: t6: v8i8 = insert_subvector poison:v8i8, t4, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:v8i8 $d0, t6 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:v8i8 $d0, t8:1 @@ -208,7 +208,7 @@ define <8 x i8> @extract_small_fixed_from_big_scalable( %a) #0 ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: nxv16i8,ch = CopyFromReg t0, Register:nxv16i8 %0 ; CHECK: t4: v4i8 = extract_subvector t2, Constant:i64<0> -; CHECK: t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0> +; CHECK: t6: nxv8i8 = insert_subvector poison:nxv8i8, t4, Constant:i64<0> ; CHECK: t7: nxv8i16 = any_extend t6 ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1 @@ -233,7 +233,7 @@ define @extract_small_scalable_from_big_scalable_1( -; CHECK: t6: nxv8i8 = insert_subvector undef:nxv8i8, t4, Constant:i64<0> +; CHECK: t6: nxv8i8 = insert_subvector poison:nxv8i8, t4, Constant:i64<0> ; CHECK: t7: nxv8i16 = any_extend t6 ; CHECK: t9: ch,glue = CopyToReg t0, Register:nxv8i16 $z0, t7 ; CHECK: t10: ch = AArch64ISD::RET_GLUE t9, Register:nxv8i16 $z0, t9:1 @@ -258,7 +258,7 @@ define @extract_small_scalable_from_big_scalable_2( -; CHECK: t6: v16i8 = insert_subvector undef:v16i8, t4, Constant:i64<0> +; CHECK: t6: v16i8 = insert_subvector poison:v16i8, t4, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:v16i8 $q0, t6 ; CHECK: t9: ch = 
AArch64ISD::RET_GLUE t8, Register:v16i8 $q0, t8:1 @@ -285,7 +285,7 @@ define <16 x i8> @extract_fixed_from_scalable( %a) #0 { ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0 ; CHECK: t4: v4i8 = extract_subvector t2, Constant:i64<0> -; CHECK: t6: nxv16i8 = insert_subvector undef:nxv16i8, t4, Constant:i64<0> +; CHECK: t6: nxv16i8 = insert_subvector poison:nxv16i8, t4, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t6 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1 @@ -293,7 +293,7 @@ define <16 x i8> @extract_fixed_from_scalable( %a) #0 { ; CHECK: SelectionDAG has 9 nodes: ; CHECK: t0: ch,glue = EntryToken ; CHECK: t2: v16i8,ch = CopyFromReg t0, Register:v16i8 %0 -; CHECK: t10: nxv16i8 = insert_subvector undef:nxv16i8, t2, Constant:i64<0> +; CHECK: t10: nxv16i8 = insert_subvector poison:nxv16i8, t2, Constant:i64<0> ; CHECK: t8: ch,glue = CopyToReg t0, Register:nxv16i8 $z0, t10 ; CHECK: t9: ch = AArch64ISD::RET_GLUE t8, Register:nxv16i8 $z0, t8:1 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 1b98886ba24e7..a64238170cef9 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1991,8 +1991,8 @@ define void @pr63114() { ; CHECK-LIBCALL-LABEL: pr63114: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: movdqu (%rax), %xmm4 -; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,3,4,5,6,7] -; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,7,7] +; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0 ; CHECK-LIBCALL-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] @@ -2001,8 +2001,8 @@ define void @pr63114() { ; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm0 -; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,7] -; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; CHECK-LIBCALL-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,3,3,4,5,6,7] +; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,0,2,1] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm6 ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm6 ; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm6 @@ -2020,8 +2020,8 @@ define void @pr63114() { ; CHECK-LIBCALL-NEXT: por %xmm5, %xmm7 ; CHECK-LIBCALL-NEXT: movdqu %xmm7, 0 ; CHECK-LIBCALL-NEXT: movdqu %xmm4, 32 -; CHECK-LIBCALL-NEXT: movdqu %xmm6, 48 -; CHECK-LIBCALL-NEXT: movdqu %xmm0, 16 +; CHECK-LIBCALL-NEXT: movdqu %xmm6, 16 +; CHECK-LIBCALL-NEXT: movdqu %xmm0, 48 ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: pr63114: @@ -2056,8 +2056,8 @@ define void @pr63114() { ; CHECK-I686-LABEL: pr63114: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: movdqu (%eax), %xmm6 -; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,3,4,5,6,7] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] +; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,7,7] +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-I686-NEXT: pand %xmm1, %xmm0 ; CHECK-I686-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] @@ -2066,8 +2066,8 @@ define void @pr63114() { ; CHECK-I686-NEXT: pand %xmm3, %xmm0 ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60] ; 
CHECK-I686-NEXT: por %xmm4, %xmm0 -; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,7] -; CHECK-I686-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; CHECK-I686-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[0,1,3,3,4,5,6,7] +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,2,1] ; CHECK-I686-NEXT: pand %xmm1, %xmm5 ; CHECK-I686-NEXT: por %xmm2, %xmm5 ; CHECK-I686-NEXT: pand %xmm3, %xmm5 @@ -2085,8 +2085,8 @@ define void @pr63114() { ; CHECK-I686-NEXT: por %xmm4, %xmm7 ; CHECK-I686-NEXT: movdqu %xmm7, 0 ; CHECK-I686-NEXT: movdqu %xmm6, 32 -; CHECK-I686-NEXT: movdqu %xmm5, 48 -; CHECK-I686-NEXT: movdqu %xmm0, 16 +; CHECK-I686-NEXT: movdqu %xmm5, 16 +; CHECK-I686-NEXT: movdqu %xmm0, 48 ; CHECK-I686-NEXT: retl %1 = load <24 x half>, ptr poison, align 2 %2 = shufflevector <24 x half> %1, <24 x half> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/pr119158.ll b/llvm/test/CodeGen/X86/pr119158.ll index ca31df802c913..4a1da30ca6c25 100644 --- a/llvm/test/CodeGen/X86/pr119158.ll +++ b/llvm/test/CodeGen/X86/pr119158.ll @@ -5,9 +5,10 @@ define dso_local void @foo() #1 { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %newFuncRoot ; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [64,64,64,64,64,64,64,64] -; CHECK-NEXT: vpdpwssd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; CHECK-NEXT: vpsrld $7, %ymm1, %ymm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,0,18,0,18,0,18,0,18,0,18,0,18,0,18,0] +; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 +; CHECK-NEXT: vpsrld $7, %ymm0, %ymm0 ; CHECK-NEXT: vpackusdw %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; CHECK-NEXT: vmovdqu %ymm0, (%rax) From 28a391848cc58400a3103730138d46c75871c867 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 18:21:04 +0700 Subject: [PATCH 0850/1029] Bitcode: Convert test to opaque pointers --- llvm/test/Bitcode/thinlto-unused-type-tests.ll | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/Bitcode/thinlto-unused-type-tests.ll b/llvm/test/Bitcode/thinlto-unused-type-tests.ll index c2c5dd2f9a301..a5fc4b0c55754 100644 --- a/llvm/test/Bitcode/thinlto-unused-type-tests.ll +++ b/llvm/test/Bitcode/thinlto-unused-type-tests.ll @@ -3,11 +3,11 @@ ; CHECK-NOT: Date: Tue, 8 Apr 2025 00:07:15 +0900 Subject: [PATCH 0851/1029] [HLSL] Include SPIRV in LLVM_TARGETS_TO_BUILD in the HLSL cmake cache (#133287) Since SPIRV is no longer an experimental target this wasn't actually enabling it any more. --- clang/cmake/caches/HLSL.cmake | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/clang/cmake/caches/HLSL.cmake b/clang/cmake/caches/HLSL.cmake index ed813f60c9c69..24afcbbda2a40 100644 --- a/clang/cmake/caches/HLSL.cmake +++ b/clang/cmake/caches/HLSL.cmake @@ -1,13 +1,10 @@ # Including the native target is important because some of LLVM's tests fail if # you don't. -set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") +set(LLVM_TARGETS_TO_BUILD "Native;SPIRV" CACHE STRING "") -# Include the DirectX target for DXIL code generation, eventually we'll include -# SPIR-V here too. -set(LLVM_EXPERIMENTAL_TARGETS_TO_BUILD "DirectX;SPIRV" CACHE STRING "") +# Include the DirectX target for DXIL code generation. 
+set(LLVM_EXPERIMENTAL_TARGETS_TO_BUILD "DirectX" CACHE STRING "")
 
-# HLSL support is currently limted to clang, eventually it will expand to
-# clang-tools-extra too.
 set(LLVM_ENABLE_PROJECTS "clang;clang-tools-extra" CACHE STRING "")
 
 set(CLANG_ENABLE_HLSL On CACHE BOOL "")

From 7fa388d77b61685a1e7d359ceb6f73c60f1ade83 Mon Sep 17 00:00:00 2001
From: Michael Klemm
Date: Mon, 7 Apr 2025 17:20:17 +0200
Subject: [PATCH 0852/1029] [Flang][OpenMP] Fix bug with default(none) and
 host-assoc threadprivate variable (#134122)

Compilation was failing when a host-associated `threadprivate` variable
was used in a parallel region with `default(none)` in an internal
subroutine, because the compiler did not properly determine that the
variable was predetermined `threadprivate` and thus should not have
been reported as missing a DSA.
---
 flang/lib/Semantics/resolve-directives.cpp    |  2 +-
 .../threadprivate-host-association-3.f90      | 44 +++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/OpenMP/threadprivate-host-association-3.f90

diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index a5b3391859500..d75b4ea13d35f 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2301,7 +2301,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) {
     if (symbol != found) {
       name.symbol = found; // adjust the symbol within region
     } else if (GetContext().defaultDSA == Symbol::Flag::OmpNone &&
-        !symbol->test(Symbol::Flag::OmpThreadprivate) &&
+        !symbol->GetUltimate().test(Symbol::Flag::OmpThreadprivate) &&
         // Exclude indices of sequential loops that are privatised in
         // the scope of the parallel region, and not in this scope.
         // TODO: check whether this should be caught in IsObjectWithDSA
diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90
new file mode 100644
index 0000000000000..22ee51f82bc0f
--- /dev/null
+++ b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90
@@ -0,0 +1,44 @@
+! This test checks lowering of OpenMP Threadprivate Directive.
+! Test for threadprivate variable in host association.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+!CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"}
+!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref
+!CHECK: %[[TP_A:.*]] = omp.threadprivate %[[A_ADDR]] : !fir.ref -> !fir.ref
+!CHECK: %[[TP_A_DECL:.*]]:2 = hlfir.declare %[[TP_A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: fir.call @_QFPsub() fastmath : () -> ()
+!CHECK: return
+!CHECK: }
+!CHECK: func.func private @_QFPsub() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} {
+!CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"}
+!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref
+!CHECK: %[[TP_A:.*]] = omp.threadprivate %[[A_ADDR]] : !fir.ref -> !fir.ref
+!CHECK: %[[TP_A_DECL:.*]]:2 = hlfir.declare %[[TP_A]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+!CHECK: %[[PAR_TP_A:.*]] = omp.threadprivate %[[A_ADDR]] : !fir.ref -> !fir.ref
+!CHECK: %[[PAR_TP_A_DECL:.*]]:2 = hlfir.declare %[[PAR_TP_A]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %{{.*}} = fir.load %[[PAR_TP_A_DECL]]#0 : !fir.ref
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: return
+!CHECK: }
+!CHECK: fir.global internal @_QFEa : i32 {
+!CHECK: %[[A:.*]] = fir.undefined i32
+!CHECK: fir.has_value %[[A]] : i32
+!CHECK: }
+
+program main
+  integer :: a
+  !$omp threadprivate(a)
+  call sub()
+contains
+  subroutine sub()
+    !$omp parallel default(none)
+      print *, a
+    !$omp end parallel
+  end
+end

From 954ccee5d53032f1cdea23368e11922edc20615d Mon Sep 17 00:00:00 2001
From: Matheus Izvekov
Date: Mon, 7 Apr 2025 12:30:51 -0300
Subject: [PATCH 0853/1029] [clang] fix partial ordering of NTTP packs
 (#134461)

This fixes partial ordering of pack expansions of NTTPs, by proceeding
with the check using the pattern of the NTTP through the rules of the
non-pack case.

This also unifies almost all of the different versions of
FinishTemplateArgumentDeduction (except the function template case).
This makes sure they all follow the rules consistently, instantiating
the parameters and comparing those with the argument.
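For illustration, here is a hypothetical example (not taken from the PR or the linked issue; all names are made up) of the kind of partial ordering with NTTP packs that this change governs:

```cpp
template <int... Ns> struct List {};

template <class T> struct Impl;                            // primary template
template <int... Ns> struct Impl<List<Ns...>> {            // #1
  static constexpr int which = 1;
};
template <int N, int... Ns> struct Impl<List<N, Ns...>> {  // #2
  static constexpr int which = 2;
};

// Both partial specializations match; deducing #1 from #2 succeeds, while
// the reverse cannot deduce N, so #2 is more specialized and must win.
static_assert(Impl<List<1, 2, 3>>::which == 2, "expected #2");
```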
Fixes #132562 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/ExprCXX.h | 6 +- clang/include/clang/Sema/Sema.h | 17 +- clang/lib/AST/ASTContext.cpp | 3 +- clang/lib/AST/ASTImporter.cpp | 7 +- clang/lib/Sema/SemaOverload.cpp | 8 +- clang/lib/Sema/SemaTemplate.cpp | 272 ++++++++++------- clang/lib/Sema/SemaTemplateDeduction.cpp | 285 +++++++----------- clang/lib/Sema/SemaTemplateInstantiate.cpp | 8 +- clang/lib/Sema/SemaTemplateVariadic.cpp | 3 +- clang/lib/Sema/TreeTransform.h | 3 +- .../test/Import/pack-expansion-expr/test.cpp | 2 +- clang/test/SemaTemplate/attributes.cpp | 22 +- clang/test/SemaTemplate/partial-order.cpp | 41 ++- clang/test/SemaTemplate/temp_arg_nontype.cpp | 16 +- 15 files changed, 357 insertions(+), 337 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4d7a09e89ae42..f8f4dfbafb4f8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -396,6 +396,7 @@ Bug Fixes to C++ Support - Improved fix for an issue with pack expansions of type constraints, where this now also works if the constraint has non-type or template template parameters. (#GH131798) +- Fixes to partial ordering of non-type template parameter packs. (#GH132562) - Fix crash when evaluating the trailing requires clause of generic lambdas which are part of a pack expansion. - Fixes matching of nested template template parameters. (#GH130362) diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index ac78d2faefe42..c613ce162a6a4 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4209,10 +4209,10 @@ class PackExpansionExpr : public Expr { Stmt *Pattern; public: - PackExpansionExpr(QualType T, Expr *Pattern, SourceLocation EllipsisLoc, + PackExpansionExpr(Expr *Pattern, SourceLocation EllipsisLoc, UnsignedOrNone NumExpansions) - : Expr(PackExpansionExprClass, T, Pattern->getValueKind(), - Pattern->getObjectKind()), + : Expr(PackExpansionExprClass, Pattern->getType(), + Pattern->getValueKind(), Pattern->getObjectKind()), EllipsisLoc(EllipsisLoc), NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Pattern(Pattern) { diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 6bf1caf6bdd18..a0ac46033b4a1 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10127,13 +10127,14 @@ class Sema final : public SemaBase { /// Contexts in which a converted constant expression is required. enum CCEKind { - CCEK_CaseValue, ///< Expression in a case label. - CCEK_Enumerator, ///< Enumerator value with fixed underlying type. - CCEK_TemplateArg, ///< Value of a non-type template parameter. - CCEK_InjectedTTP, ///< Injected parameter of a template template parameter. - CCEK_ArrayBound, ///< Array bound in array declarator or new-expression. - CCEK_ExplicitBool, ///< Condition in an explicit(bool) specifier. - CCEK_Noexcept, ///< Condition in a noexcept(bool) specifier. + CCEK_CaseValue, ///< Expression in a case label. + CCEK_Enumerator, ///< Enumerator value with fixed underlying type. + CCEK_TemplateArg, ///< Value of a non-type template parameter. + CCEK_TempArgStrict, ///< As above, but applies strict template checking + ///< rules. + CCEK_ArrayBound, ///< Array bound in array declarator or new-expression. + CCEK_ExplicitBool, ///< Condition in an explicit(bool) specifier. + CCEK_Noexcept, ///< Condition in a noexcept(bool) specifier. CCEK_StaticAssertMessageSize, ///< Call to size() in a static assert ///< message. 
CCEK_StaticAssertMessageData, ///< Call to data() in a static assert @@ -11895,7 +11896,7 @@ class Sema final : public SemaBase { QualType InstantiatedParamType, Expr *Arg, TemplateArgument &SugaredConverted, TemplateArgument &CanonicalConverted, - bool MatchingTTP, + bool StrictCheck, CheckTemplateArgumentKind CTAK); /// Check a template argument against its corresponding diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 320fd4e2f3077..0fe941e063d49 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5852,8 +5852,7 @@ TemplateArgument ASTContext::getInjectedTemplateArg(NamedDecl *Param) const { T, VK, NTTP->getLocation()); if (NTTP->isParameterPack()) - E = new (*this) - PackExpansionExpr(DependentTy, E, NTTP->getLocation(), std::nullopt); + E = new (*this) PackExpansionExpr(E, NTTP->getLocation(), std::nullopt); Arg = TemplateArgument(E); } else { auto *TTP = cast(Param); diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 8c91cce22f78e..f4b977d1d14b3 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -8273,14 +8273,13 @@ ASTNodeImporter::VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E) { ExpectedStmt ASTNodeImporter::VisitPackExpansionExpr(PackExpansionExpr *E) { Error Err = Error::success(); - auto ToType = importChecked(Err, E->getType()); - auto ToPattern = importChecked(Err, E->getPattern()); + auto *ToPattern = importChecked(Err, E->getPattern()); auto ToEllipsisLoc = importChecked(Err, E->getEllipsisLoc()); if (Err) return std::move(Err); - return new (Importer.getToContext()) PackExpansionExpr( - ToType, ToPattern, ToEllipsisLoc, E->getNumExpansions()); + return new (Importer.getToContext()) + PackExpansionExpr(ToPattern, ToEllipsisLoc, E->getNumExpansions()); } ExpectedStmt ASTNodeImporter::VisitSizeOfPackExpr(SizeOfPackExpr *E) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 0564557738170..f46ef2c7f5bd6 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -6201,7 +6201,7 @@ static ExprResult BuildConvertedConstantExpression(Sema &S, Expr *From, Sema::CCEKind CCE, NamedDecl *Dest, APValue &PreNarrowingValue) { - assert((S.getLangOpts().CPlusPlus11 || CCE == Sema::CCEK_InjectedTTP) && + assert((S.getLangOpts().CPlusPlus11 || CCE == Sema::CCEK_TempArgStrict) && "converted constant expression outside C++11 or TTP matching"); if (checkPlaceholderForOverload(S, From)) @@ -6272,7 +6272,7 @@ static ExprResult BuildConvertedConstantExpression(Sema &S, Expr *From, // class type. ExprResult Result; bool IsTemplateArgument = - CCE == Sema::CCEK_TemplateArg || CCE == Sema::CCEK_InjectedTTP; + CCE == Sema::CCEK_TemplateArg || CCE == Sema::CCEK_TempArgStrict; if (T->isRecordType()) { assert(IsTemplateArgument && "unexpected class type converted constant expr"); @@ -6325,7 +6325,7 @@ static ExprResult BuildConvertedConstantExpression(Sema &S, Expr *From, // value-dependent so we can't tell whether it's actually narrowing. // For matching the parameters of a TTP, the conversion is ill-formed // if it may narrow. 
- if (CCE != Sema::CCEK_InjectedTTP) + if (CCE != Sema::CCEK_TempArgStrict) break; [[fallthrough]]; case NK_Type_Narrowing: @@ -6400,7 +6400,7 @@ Sema::EvaluateConvertedConstantExpression(Expr *E, QualType T, APValue &Value, Expr::EvalResult Eval; Eval.Diag = &Notes; - assert(CCE != Sema::CCEK_InjectedTTP && "unnexpected CCE Kind"); + assert(CCE != Sema::CCEK_TempArgStrict && "unnexpected CCE Kind"); ConstantExprKind Kind; if (CCE == Sema::CCEK_TemplateArg && T->isRecordType()) diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 153f44f8ec67a..3d7fa38272e6a 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -5263,9 +5263,9 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param, TemplateArgumentLoc &ArgLoc, auto checkExpr = [&](Expr *E) -> Expr * { TemplateArgument SugaredResult, CanonicalResult; unsigned CurSFINAEErrors = NumSFINAEErrors; - ExprResult Res = - CheckTemplateArgument(NTTP, NTTPType, E, SugaredResult, - CanonicalResult, CTAI.MatchingTTP, CTAK); + ExprResult Res = CheckTemplateArgument( + NTTP, NTTPType, E, SugaredResult, CanonicalResult, + /*StrictCheck=*/CTAI.MatchingTTP || CTAI.PartialOrdering, CTAK); // If the current template argument causes an error, give up now. if (Res.isInvalid() || CurSFINAEErrors < NumSFINAEErrors) return nullptr; @@ -5344,9 +5344,9 @@ bool Sema::CheckTemplateArgument(NamedDecl *Param, TemplateArgumentLoc &ArgLoc, } TemplateArgument SugaredResult, CanonicalResult; - E = CheckTemplateArgument(NTTP, NTTPType, E.get(), SugaredResult, - CanonicalResult, /*PartialOrderingTTP=*/false, - CTAK_Specified); + E = CheckTemplateArgument( + NTTP, NTTPType, E.get(), SugaredResult, CanonicalResult, + /*StrictCheck=*/CTAI.PartialOrdering, CTAK_Specified); if (E.isInvalid()) return true; @@ -6757,9 +6757,21 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, QualType ParamType, Expr *Arg, TemplateArgument &SugaredConverted, TemplateArgument &CanonicalConverted, - bool PartialOrderingTTP, + bool StrictCheck, CheckTemplateArgumentKind CTAK) { SourceLocation StartLoc = Arg->getBeginLoc(); + auto *ArgPE = dyn_cast(Arg); + Expr *DeductionArg = ArgPE ? ArgPE->getPattern() : Arg; + auto setDeductionArg = [&](Expr *NewDeductionArg) { + DeductionArg = NewDeductionArg; + if (ArgPE) { + // Recreate a pack expansion if we unwrapped one. + Arg = new (Context) PackExpansionExpr( + DeductionArg, ArgPE->getEllipsisLoc(), ArgPE->getNumExpansions()); + } else { + Arg = DeductionArg; + } + }; // If the parameter type somehow involves auto, deduce the type now. DeducedType *DeducedT = ParamType->getContainedDeducedType(); @@ -6769,7 +6781,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // FIXME: The language rules don't say what happens in this case. // FIXME: We get an opaque dependent type out of decltype(auto) if the // expression is merely instantiation-dependent; is this enough? - if (Arg->isTypeDependent()) { + if (DeductionArg->isTypeDependent()) { auto *AT = dyn_cast(DeducedT); if (AT && AT->isDecltypeAuto()) { SugaredConverted = TemplateArgument(Arg); @@ -6782,9 +6794,6 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // When checking a deduced template argument, deduce from its type even if // the type is dependent, in order to check the types of non-type template // arguments line up properly in partial ordering. 
- Expr *DeductionArg = Arg; - if (auto *PE = dyn_cast(DeductionArg)) - DeductionArg = PE->getPattern(); TypeSourceInfo *TSI = Context.getTrivialTypeSourceInfo(ParamType, Param->getLocation()); if (isa(DeducedT)) { @@ -6837,64 +6846,55 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, assert(!ParamType.hasQualifiers() && "non-type template parameter type cannot be qualified"); + // If either the parameter has a dependent type or the argument is + // type-dependent, there's nothing we can check now. + if (ParamType->isDependentType() || DeductionArg->isTypeDependent()) { + // Force the argument to the type of the parameter to maintain invariants. + ExprResult E = ImpCastExprToType( + DeductionArg, ParamType.getNonLValueExprType(Context), CK_Dependent, + ParamType->isLValueReferenceType() ? VK_LValue + : ParamType->isRValueReferenceType() ? VK_XValue + : VK_PRValue); + if (E.isInvalid()) + return ExprError(); + setDeductionArg(E.get()); + SugaredConverted = TemplateArgument(Arg); + CanonicalConverted = TemplateArgument( + Context.getCanonicalTemplateArgument(SugaredConverted)); + return Arg; + } + // FIXME: When Param is a reference, should we check that Arg is an lvalue? - if (CTAK == CTAK_Deduced && + if (CTAK == CTAK_Deduced && !StrictCheck && (ParamType->isReferenceType() ? !Context.hasSameType(ParamType.getNonReferenceType(), - Arg->getType()) - : !Context.hasSameUnqualifiedType(ParamType, Arg->getType()))) { - // FIXME: If either type is dependent, we skip the check. This isn't - // correct, since during deduction we're supposed to have replaced each - // template parameter with some unique (non-dependent) placeholder. - // FIXME: If the argument type contains 'auto', we carry on and fail the - // type check in order to force specific types to be more specialized than - // 'auto'. It's not clear how partial ordering with 'auto' is supposed to - // work. Similarly for CTAD, when comparing 'A' against 'A'. - if ((ParamType->isDependentType() || Arg->isTypeDependent()) && - !Arg->getType()->getContainedDeducedType()) { - SugaredConverted = TemplateArgument(Arg); - CanonicalConverted = TemplateArgument( - Context.getCanonicalTemplateArgument(SugaredConverted)); - return Arg; - } + DeductionArg->getType()) + : !Context.hasSameUnqualifiedType(ParamType, + DeductionArg->getType()))) { // FIXME: This attempts to implement C++ [temp.deduct.type]p17. Per DR1770, // we should actually be checking the type of the template argument in P, // not the type of the template argument deduced from A, against the // template parameter type. Diag(StartLoc, diag::err_deduced_non_type_template_arg_type_mismatch) - << Arg->getType() - << ParamType.getUnqualifiedType(); + << Arg->getType() << ParamType.getUnqualifiedType(); NoteTemplateParameterLocation(*Param); return ExprError(); } - // If either the parameter has a dependent type or the argument is - // type-dependent, there's nothing we can check now. - if (ParamType->isDependentType() || Arg->isTypeDependent()) { - // Force the argument to the type of the parameter to maintain invariants. - auto *PE = dyn_cast(Arg); - if (PE) - Arg = PE->getPattern(); - ExprResult E = ImpCastExprToType( - Arg, ParamType.getNonLValueExprType(Context), CK_Dependent, - ParamType->isLValueReferenceType() ? VK_LValue - : ParamType->isRValueReferenceType() ? VK_XValue - : VK_PRValue); - if (E.isInvalid()) - return ExprError(); - if (PE) { - // Recreate a pack expansion if we unwrapped one. 
- E = new (Context) - PackExpansionExpr(E.get()->getType(), E.get(), PE->getEllipsisLoc(), - PE->getNumExpansions()); - } - SugaredConverted = TemplateArgument(E.get()); + // If the argument is a pack expansion, we don't know how many times it would + // expand. If we continue checking the argument, this will make the template + // definition ill-formed if it would be ill-formed for any number of + // expansions during instantiation time. When partial ordering or matching + // template template parameters, this is exactly what we want. Otherwise, the + // normal template rules apply: we accept the template if it would be valid + // for any number of expansions (i.e. none). + if (ArgPE && !StrictCheck) { + SugaredConverted = TemplateArgument(Arg); CanonicalConverted = TemplateArgument( Context.getCanonicalTemplateArgument(SugaredConverted)); - return E; + return Arg; } - QualType CanonParamType = Context.getCanonicalType(ParamType); // Avoid making a copy when initializing a template parameter of class type // from a template parameter object of the same type. This is going beyond // the standard, but is required for soundness: in @@ -6903,15 +6903,15 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // // Similarly, don't inject a call to a copy constructor when initializing // from a template parameter of the same type. - Expr *InnerArg = Arg->IgnoreParenImpCasts(); + Expr *InnerArg = DeductionArg->IgnoreParenImpCasts(); if (ParamType->isRecordType() && isa(InnerArg) && Context.hasSameUnqualifiedType(ParamType, InnerArg->getType())) { NamedDecl *ND = cast(InnerArg)->getDecl(); if (auto *TPO = dyn_cast(ND)) { SugaredConverted = TemplateArgument(TPO, ParamType); - CanonicalConverted = - TemplateArgument(TPO->getCanonicalDecl(), CanonParamType); + CanonicalConverted = TemplateArgument(TPO->getCanonicalDecl(), + ParamType.getCanonicalType()); return Arg; } if (isa(ND)) { @@ -6928,10 +6928,10 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, *this, Sema::ExpressionEvaluationContext::ConstantEvaluated); bool IsConvertedConstantExpression = true; - if (isa(Arg) || ParamType->isRecordType()) { + if (isa(DeductionArg) || ParamType->isRecordType()) { InitializationKind Kind = InitializationKind::CreateForInit( - Arg->getBeginLoc(), /*DirectInit=*/false, Arg); - Expr *Inits[1] = {Arg}; + StartLoc, /*DirectInit=*/false, DeductionArg); + Expr *Inits[1] = {DeductionArg}; InitializedEntity Entity = InitializedEntity::InitializeTemplateParameter(ParamType, Param); InitializationSequence InitSeq(*this, Entity, Kind, Inits); @@ -6941,14 +6941,15 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, Result = ActOnConstantExpression(Result.get()); if (Result.isInvalid() || !Result.get()) return ExprError(); - Arg = ActOnFinishFullExpr(Result.get(), Arg->getBeginLoc(), - /*DiscardedValue=*/false, - /*IsConstexpr=*/true, /*IsTemplateArgument=*/true) - .get(); + setDeductionArg(ActOnFinishFullExpr(Result.get(), Arg->getBeginLoc(), + /*DiscardedValue=*/false, + /*IsConstexpr=*/true, + /*IsTemplateArgument=*/true) + .get()); IsConvertedConstantExpression = false; } - if (getLangOpts().CPlusPlus17 || PartialOrderingTTP) { + if (getLangOpts().CPlusPlus17 || StrictCheck) { // C++17 [temp.arg.nontype]p1: // A template-argument for a non-type template parameter shall be // a converted constant expression of the type of the template-parameter. 
@@ -6956,24 +6957,25 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, ExprResult ArgResult; if (IsConvertedConstantExpression) { ArgResult = BuildConvertedConstantExpression( - Arg, ParamType, - PartialOrderingTTP ? CCEK_InjectedTTP : CCEK_TemplateArg, Param); + DeductionArg, ParamType, + StrictCheck ? CCEK_TempArgStrict : CCEK_TemplateArg, Param); assert(!ArgResult.isUnset()); if (ArgResult.isInvalid()) { NoteTemplateParameterLocation(*Param); return ExprError(); } } else { - ArgResult = Arg; + ArgResult = DeductionArg; } // For a value-dependent argument, CheckConvertedConstantExpression is // permitted (and expected) to be unable to determine a value. if (ArgResult.get()->isValueDependent()) { - SugaredConverted = TemplateArgument(ArgResult.get()); + setDeductionArg(ArgResult.get()); + SugaredConverted = TemplateArgument(Arg); CanonicalConverted = Context.getCanonicalTemplateArgument(SugaredConverted); - return ArgResult; + return Arg; } APValue PreNarrowingValue; @@ -6982,6 +6984,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, false, PreNarrowingValue); if (ArgResult.isInvalid()) return ExprError(); + setDeductionArg(ArgResult.get()); if (Value.isLValue()) { APValue::LValueBase Base = Value.getLValueBase(); @@ -7006,10 +7009,17 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, VD->getType()->isArrayType() && Value.getLValuePath()[0].getAsArrayIndex() == 0 && !Value.isLValueOnePastTheEnd() && ParamType->isPointerType()) { - SugaredConverted = TemplateArgument(VD, ParamType); - CanonicalConverted = TemplateArgument( - cast(VD->getCanonicalDecl()), CanonParamType); - return ArgResult.get(); + if (ArgPE) { + SugaredConverted = TemplateArgument(Arg); + CanonicalConverted = + Context.getCanonicalTemplateArgument(SugaredConverted); + } else { + SugaredConverted = TemplateArgument(VD, ParamType); + CanonicalConverted = + TemplateArgument(cast(VD->getCanonicalDecl()), + ParamType.getCanonicalType()); + } + return Arg; } // -- a subobject [until C++20] @@ -7030,9 +7040,16 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, if (Value.isAddrLabelDiff()) return Diag(StartLoc, diag::err_non_type_template_arg_addr_label_diff); - SugaredConverted = TemplateArgument(Context, ParamType, Value); - CanonicalConverted = TemplateArgument(Context, CanonParamType, Value); - return ArgResult.get(); + if (ArgPE) { + SugaredConverted = TemplateArgument(Arg); + CanonicalConverted = + Context.getCanonicalTemplateArgument(SugaredConverted); + } else { + SugaredConverted = TemplateArgument(Context, ParamType, Value); + CanonicalConverted = + TemplateArgument(Context, ParamType.getCanonicalType(), Value); + } + return Arg; } // C++ [temp.arg.nontype]p5: @@ -7061,18 +7078,18 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // type, a converted constant expression of the type of the // template-parameter; or llvm::APSInt Value; - ExprResult ArgResult = - CheckConvertedConstantExpression(Arg, ParamType, Value, - CCEK_TemplateArg); + ExprResult ArgResult = CheckConvertedConstantExpression( + DeductionArg, ParamType, Value, CCEK_TemplateArg); if (ArgResult.isInvalid()) return ExprError(); + setDeductionArg(ArgResult.get()); // We can't check arbitrary value-dependent arguments. 
- if (ArgResult.get()->isValueDependent()) { - SugaredConverted = TemplateArgument(ArgResult.get()); + if (DeductionArg->isValueDependent()) { + SugaredConverted = TemplateArgument(Arg); CanonicalConverted = Context.getCanonicalTemplateArgument(SugaredConverted); - return ArgResult; + return Arg; } // Widen the argument value to sizeof(parameter type). This is almost @@ -7085,18 +7102,24 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, ? Context.getIntWidth(IntegerType) : Context.getTypeSize(IntegerType)); - SugaredConverted = TemplateArgument(Context, Value, ParamType); - CanonicalConverted = - TemplateArgument(Context, Value, Context.getCanonicalType(ParamType)); - return ArgResult; + if (ArgPE) { + SugaredConverted = TemplateArgument(Arg); + CanonicalConverted = + Context.getCanonicalTemplateArgument(SugaredConverted); + } else { + SugaredConverted = TemplateArgument(Context, Value, ParamType); + CanonicalConverted = TemplateArgument( + Context, Value, Context.getCanonicalType(ParamType)); + } + return Arg; } ExprResult ArgResult = DefaultLvalueConversion(Arg); if (ArgResult.isInvalid()) return ExprError(); - Arg = ArgResult.get(); + DeductionArg = ArgResult.get(); - QualType ArgType = Arg->getType(); + QualType ArgType = DeductionArg->getType(); // C++ [temp.arg.nontype]p1: // A template-argument for a non-type, non-template @@ -7107,11 +7130,11 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // -- the name of a non-type template-parameter; or llvm::APSInt Value; if (!ArgType->isIntegralOrEnumerationType()) { - Diag(Arg->getBeginLoc(), diag::err_template_arg_not_integral_or_enumeral) - << ArgType << Arg->getSourceRange(); + Diag(StartLoc, diag::err_template_arg_not_integral_or_enumeral) + << ArgType << DeductionArg->getSourceRange(); NoteTemplateParameterLocation(*Param); return ExprError(); - } else if (!Arg->isValueDependent()) { + } else if (!DeductionArg->isValueDependent()) { class TmplArgICEDiagnoser : public VerifyICEDiagnoser { QualType T; @@ -7124,8 +7147,10 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, } } Diagnoser(ArgType); - Arg = VerifyIntegerConstantExpression(Arg, &Value, Diagnoser).get(); - if (!Arg) + DeductionArg = + VerifyIntegerConstantExpression(DeductionArg, &Value, Diagnoser) + .get(); + if (!DeductionArg) return ExprError(); } @@ -7138,23 +7163,28 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // Okay: no conversion necessary } else if (ParamType->isBooleanType()) { // This is an integral-to-boolean conversion. - Arg = ImpCastExprToType(Arg, ParamType, CK_IntegralToBoolean).get(); + DeductionArg = + ImpCastExprToType(DeductionArg, ParamType, CK_IntegralToBoolean) + .get(); } else if (IsIntegralPromotion(Arg, ArgType, ParamType) || !ParamType->isEnumeralType()) { // This is an integral promotion or conversion. - Arg = ImpCastExprToType(Arg, ParamType, CK_IntegralCast).get(); + DeductionArg = + ImpCastExprToType(DeductionArg, ParamType, CK_IntegralCast).get(); } else { // We can't perform this conversion. - Diag(Arg->getBeginLoc(), diag::err_template_arg_not_convertible) - << Arg->getType() << ParamType << Arg->getSourceRange(); + Diag(StartLoc, diag::err_template_arg_not_convertible) + << DeductionArg->getType() << ParamType + << DeductionArg->getSourceRange(); NoteTemplateParameterLocation(*Param); return ExprError(); } + setDeductionArg(DeductionArg); // Add the value of this argument to the list of converted // arguments. 
We use the bitwidth and signedness of the template // parameter. - if (Arg->isValueDependent()) { + if (DeductionArg->isValueDependent()) { // The argument is value-dependent. Create a new // TemplateArgument with the converted expression. SugaredConverted = TemplateArgument(Arg); @@ -7212,14 +7242,20 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, } } - QualType T = ParamType->isEnumeralType() ? ParamType : IntegerType; - SugaredConverted = TemplateArgument(Context, Value, T); - CanonicalConverted = - TemplateArgument(Context, Value, Context.getCanonicalType(T)); + if (ArgPE) { + SugaredConverted = TemplateArgument(Arg); + CanonicalConverted = + Context.getCanonicalTemplateArgument(SugaredConverted); + } else { + QualType T = ParamType->isEnumeralType() ? ParamType : IntegerType; + SugaredConverted = TemplateArgument(Context, Value, T); + CanonicalConverted = + TemplateArgument(Context, Value, Context.getCanonicalType(T)); + } return Arg; } - QualType ArgType = Arg->getType(); + QualType ArgType = DeductionArg->getType(); DeclAccessPair FoundResult; // temporary for ResolveOverloadedFunction // Handle pointer-to-function, reference-to-function, and @@ -7246,7 +7282,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, ParamType->castAs()->getPointeeType() ->isFunctionType())) { - if (Arg->getType() == Context.OverloadTy) { + if (DeductionArg->getType() == Context.OverloadTy) { if (FunctionDecl *Fn = ResolveAddressOfOverloadedFunction(Arg, ParamType, true, FoundResult)) { @@ -7256,11 +7292,12 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, ExprResult Res = FixOverloadedFunctionReference(Arg, FoundResult, Fn); if (Res.isInvalid()) return ExprError(); - Arg = Res.get(); + DeductionArg = Res.get(); ArgType = Arg->getType(); } else return ExprError(); } + setDeductionArg(DeductionArg); if (!ParamType->isMemberPointerType()) { if (CheckTemplateArgumentAddressOfObjectOrFunction( @@ -7276,6 +7313,8 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, return Arg; } + setDeductionArg(DeductionArg); + if (ParamType->isPointerType()) { // -- for a non-type template-parameter of type pointer to // object, qualification conversions (4.4) and the @@ -7284,6 +7323,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, assert(ParamType->getPointeeType()->isIncompleteOrObjectType() && "Only object pointers allowed here"); + // FIXME: Deal with pack expansions here. if (CheckTemplateArgumentAddressOfObjectOrFunction( *this, Param, ParamType, Arg, SugaredConverted, CanonicalConverted)) return ExprError(); @@ -7300,6 +7340,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, assert(ParamRefType->getPointeeType()->isIncompleteOrObjectType() && "Only object references allowed here"); + // FIXME: Deal with pack expansions here. if (Arg->getType() == Context.OverloadTy) { if (FunctionDecl *Fn = ResolveAddressOfOverloadedFunction(Arg, ParamRefType->getPointeeType(), @@ -7324,17 +7365,18 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // Deal with parameters of type std::nullptr_t. 
if (ParamType->isNullPtrType()) { - if (Arg->isTypeDependent() || Arg->isValueDependent()) { + if (DeductionArg->isTypeDependent() || DeductionArg->isValueDependent()) { SugaredConverted = TemplateArgument(Arg); CanonicalConverted = Context.getCanonicalTemplateArgument(SugaredConverted); return Arg; } - switch (isNullPointerValueTemplateArgument(*this, Param, ParamType, Arg)) { + switch (isNullPointerValueTemplateArgument(*this, Param, ParamType, + DeductionArg)) { case NPV_NotNullPointer: Diag(Arg->getExprLoc(), diag::err_template_arg_not_convertible) - << Arg->getType() << ParamType; + << DeductionArg->getType() << ParamType; NoteTemplateParameterLocation(*Param); return ExprError(); @@ -7343,10 +7385,17 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, case NPV_NullPointer: Diag(Arg->getExprLoc(), diag::warn_cxx98_compat_template_arg_null); - SugaredConverted = TemplateArgument(ParamType, - /*isNullPtr=*/true); - CanonicalConverted = TemplateArgument(Context.getCanonicalType(ParamType), + if (ArgPE) { + SugaredConverted = TemplateArgument(Arg); + CanonicalConverted = + Context.getCanonicalTemplateArgument(SugaredConverted); + } else { + SugaredConverted = TemplateArgument(ParamType, /*isNullPtr=*/true); + CanonicalConverted = + TemplateArgument(Context.getCanonicalType(ParamType), + /*isNullPtr=*/true); + } return Arg; } } @@ -7355,6 +7404,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, // member, qualification conversions (4.4) are applied. assert(ParamType->isMemberPointerType() && "Only pointers to members remain"); + // FIXME: Deal with pack expansions here. if (CheckTemplateArgumentPointerToMember( *this, Param, ParamType, Arg, SugaredConverted, CanonicalConverted)) return ExprError(); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 170b9f05002b1..200168087136b 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2826,16 +2826,8 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( } /// Determine whether two template arguments are the same. -static bool isSameTemplateArg(ASTContext &Context, - TemplateArgument X, - const TemplateArgument &Y, - bool PartialOrdering, - bool PackExpansionMatchesPack = false) { - // If we're checking deduced arguments (X) against original arguments (Y), - // we will have flattened packs to non-expansions in X. 
- if (PackExpansionMatchesPack && X.isPackExpansion() && !Y.isPackExpansion()) - X = X.getPackExpansionPattern(); - +static bool isSameTemplateArg(ASTContext &Context, const TemplateArgument &X, + const TemplateArgument &Y) { if (X.getKind() != Y.getKind()) return false; @@ -2875,28 +2867,12 @@ static bool isSameTemplateArg(ASTContext &Context, case TemplateArgument::Pack: { unsigned PackIterationSize = X.pack_size(); - if (X.pack_size() != Y.pack_size()) { - if (!PartialOrdering) - return false; - - // C++0x [temp.deduct.type]p9: - // During partial ordering, if Ai was originally a pack expansion: - // - if P does not contain a template argument corresponding to Ai - // then Ai is ignored; - bool XHasMoreArg = X.pack_size() > Y.pack_size(); - if (!(XHasMoreArg && X.pack_elements().back().isPackExpansion()) && - !(!XHasMoreArg && Y.pack_elements().back().isPackExpansion())) - return false; - - if (XHasMoreArg) - PackIterationSize = Y.pack_size(); - } - + if (X.pack_size() != Y.pack_size()) + return false; ArrayRef XP = X.pack_elements(); ArrayRef YP = Y.pack_elements(); for (unsigned i = 0; i < PackIterationSize; ++i) - if (!isSameTemplateArg(Context, XP[i], YP[i], PartialOrdering, - PackExpansionMatchesPack)) + if (!isSameTemplateArg(Context, XP[i], YP[i])) return false; return true; } @@ -3074,22 +3050,16 @@ ConvertDeducedTemplateArgument(Sema &S, NamedDecl *Param, return ConvertArg(Arg, 0); } -// FIXME: This should not be a template, but -// ClassTemplatePartialSpecializationDecl sadly does not derive from -// TemplateDecl. /// \param IsIncomplete When used, we only consider template parameters that /// were deduced, disregarding any default arguments. After the function /// finishes, the object pointed at will contain a value indicating if the /// conversion was actually incomplete. 
-template static TemplateDeductionResult ConvertDeducedTemplateArguments( - Sema &S, TemplateDeclT *Template, bool IsDeduced, - SmallVectorImpl &Deduced, + Sema &S, NamedDecl *Template, TemplateParameterList *TemplateParams, + bool IsDeduced, SmallVectorImpl &Deduced, TemplateDeductionInfo &Info, Sema::CheckTemplateArgumentInfo &CTAI, LocalInstantiationScope *CurrentInstantiationScope, unsigned NumAlreadyConverted, bool *IsIncomplete) { - TemplateParameterList *TemplateParams = Template->getTemplateParameters(); - for (unsigned I = 0, N = TemplateParams->size(); I != N; ++I) { NamedDecl *Param = TemplateParams->getParam(I); @@ -3234,34 +3204,30 @@ template<> struct IsPartialSpecialization { static constexpr bool value = true; }; -template -static bool DeducedArgsNeedReplacement(TemplateDeclT *Template) { - return false; -} -template <> -bool DeducedArgsNeedReplacement( - VarTemplatePartialSpecializationDecl *Spec) { - return !Spec->isClassScopeExplicitSpecialization(); -} -template <> -bool DeducedArgsNeedReplacement( - ClassTemplatePartialSpecializationDecl *Spec) { - return !Spec->isClassScopeExplicitSpecialization(); -} -template static TemplateDeductionResult -CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, +CheckDeducedArgumentConstraints(Sema &S, NamedDecl *Template, ArrayRef SugaredDeducedArgs, ArrayRef CanonicalDeducedArgs, TemplateDeductionInfo &Info) { llvm::SmallVector AssociatedConstraints; - Template->getAssociatedConstraints(AssociatedConstraints); + bool DeducedArgsNeedReplacement = false; + if (auto *TD = dyn_cast(Template)) { + TD->getAssociatedConstraints(AssociatedConstraints); + DeducedArgsNeedReplacement = !TD->isClassScopeExplicitSpecialization(); + } else if (auto *TD = + dyn_cast(Template)) { + TD->getAssociatedConstraints(AssociatedConstraints); + DeducedArgsNeedReplacement = !TD->isClassScopeExplicitSpecialization(); + } else { + cast(Template)->getAssociatedConstraints( + AssociatedConstraints); + } std::optional> Innermost; // If we don't need to replace the deduced template arguments, // we can add them immediately as the inner-most argument list. - if (!DeducedArgsNeedReplacement(Template)) + if (!DeducedArgsNeedReplacement) Innermost = CanonicalDeducedArgs; MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( @@ -3288,73 +3254,60 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, return TemplateDeductionResult::Success; } -/// Complete template argument deduction for a partial specialization. -template -static std::enable_if_t::value, - TemplateDeductionResult> -FinishTemplateArgumentDeduction( - Sema &S, T *Partial, bool IsPartialOrdering, - ArrayRef TemplateArgs, +/// Complete template argument deduction. +static TemplateDeductionResult FinishTemplateArgumentDeduction( + Sema &S, NamedDecl *Entity, TemplateParameterList *EntityTPL, + TemplateDecl *Template, bool PartialOrdering, + ArrayRef Ps, ArrayRef As, SmallVectorImpl &Deduced, - TemplateDeductionInfo &Info) { + TemplateDeductionInfo &Info, bool CopyDeducedArgs) { // Unevaluated SFINAE context. EnterExpressionEvaluationContext Unevaluated( S, Sema::ExpressionEvaluationContext::Unevaluated); - Sema::SFINAETrap Trap(S); - Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(Partial)); + Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(Entity)); // C++ [temp.deduct.type]p2: // [...] or if any template argument remains neither deduced nor // explicitly specified, template argument deduction fails. 
- Sema::CheckTemplateArgumentInfo CTAI(IsPartialOrdering); + Sema::CheckTemplateArgumentInfo CTAI(PartialOrdering); if (auto Result = ConvertDeducedTemplateArguments( - S, Partial, IsPartialOrdering, Deduced, Info, CTAI, - /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, - /*IsIncomplete=*/nullptr); + S, Entity, EntityTPL, /*IsDeduced=*/PartialOrdering, Deduced, Info, + CTAI, + /*CurrentInstantiationScope=*/nullptr, + /*NumAlreadyConverted=*/0U, /*IsIncomplete=*/nullptr); Result != TemplateDeductionResult::Success) return Result; - // Form the template argument list from the deduced template arguments. - TemplateArgumentList *SugaredDeducedArgumentList = - TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted); - TemplateArgumentList *CanonicalDeducedArgumentList = - TemplateArgumentList::CreateCopy(S.Context, CTAI.CanonicalConverted); + if (CopyDeducedArgs) { + // Form the template argument list from the deduced template arguments. + TemplateArgumentList *SugaredDeducedArgumentList = + TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted); + TemplateArgumentList *CanonicalDeducedArgumentList = + TemplateArgumentList::CreateCopy(S.Context, CTAI.CanonicalConverted); + Info.reset(SugaredDeducedArgumentList, CanonicalDeducedArgumentList); + } - Info.reset(SugaredDeducedArgumentList, CanonicalDeducedArgumentList); + TemplateParameterList *TPL = Template->getTemplateParameters(); + TemplateArgumentListInfo InstArgs(TPL->getLAngleLoc(), TPL->getRAngleLoc()); + MultiLevelTemplateArgumentList MLTAL(Entity, CTAI.SugaredConverted, + /*Final=*/true); + MLTAL.addOuterRetainedLevels(TPL->getDepth()); - // Substitute the deduced template arguments into the template - // arguments of the class template partial specialization, and - // verify that the instantiated template arguments are both valid - // and are equivalent to the template arguments originally provided - // to the class template. - LocalInstantiationScope InstScope(S); - auto *Template = Partial->getSpecializedTemplate(); - const ASTTemplateArgumentListInfo *PartialTemplArgInfo = - Partial->getTemplateArgsAsWritten(); - - TemplateArgumentListInfo InstArgs(PartialTemplArgInfo->LAngleLoc, - PartialTemplArgInfo->RAngleLoc); - - if (S.SubstTemplateArguments( - PartialTemplArgInfo->arguments(), - MultiLevelTemplateArgumentList(Partial, CTAI.SugaredConverted, - /*Final=*/true), - InstArgs)) { + if (S.SubstTemplateArguments(Ps, MLTAL, InstArgs)) { unsigned ArgIdx = InstArgs.size(), ParamIdx = ArgIdx; - if (ParamIdx >= Partial->getTemplateParameters()->size()) - ParamIdx = Partial->getTemplateParameters()->size() - 1; + if (ParamIdx >= TPL->size()) + ParamIdx = TPL->size() - 1; - Decl *Param = const_cast( - Partial->getTemplateParameters()->getParam(ParamIdx)); + Decl *Param = const_cast(TPL->getParam(ParamIdx)); Info.Param = makeTemplateParameter(Param); - Info.FirstArg = (*PartialTemplArgInfo)[ArgIdx].getArgument(); + Info.FirstArg = Ps[ArgIdx].getArgument(); return TemplateDeductionResult::SubstitutionFailure; } bool ConstraintsNotSatisfied; Sema::CheckTemplateArgumentInfo InstCTAI; - if (S.CheckTemplateArgumentList(Template, Partial->getLocation(), InstArgs, + if (S.CheckTemplateArgumentList(Template, Template->getLocation(), InstArgs, /*DefaultArgs=*/{}, false, InstCTAI, /*UpdateArgsWithConversions=*/true, &ConstraintsNotSatisfied)) @@ -3362,59 +3315,9 @@ FinishTemplateArgumentDeduction( ? 
TemplateDeductionResult::ConstraintsNotSatisfied : TemplateDeductionResult::SubstitutionFailure; - TemplateParameterList *TemplateParams = Template->getTemplateParameters(); - for (unsigned I = 0, E = TemplateParams->size(); I != E; ++I) { - TemplateArgument InstArg = InstCTAI.SugaredConverted.data()[I]; - if (!isSameTemplateArg(S.Context, TemplateArgs[I], InstArg, - IsPartialOrdering)) { - Info.Param = makeTemplateParameter(TemplateParams->getParam(I)); - Info.FirstArg = TemplateArgs[I]; - Info.SecondArg = InstArg; - return TemplateDeductionResult::NonDeducedMismatch; - } - } - - if (Trap.hasErrorOccurred()) - return TemplateDeductionResult::SubstitutionFailure; - - if (!IsPartialOrdering) { - if (auto Result = CheckDeducedArgumentConstraints( - S, Partial, CTAI.SugaredConverted, CTAI.CanonicalConverted, Info); - Result != TemplateDeductionResult::Success) - return Result; - } - - return TemplateDeductionResult::Success; -} - -/// Complete template argument deduction for a class or variable template, -/// when partial ordering against a partial specialization. -// FIXME: Factor out duplication with partial specialization version above. -static TemplateDeductionResult FinishTemplateArgumentDeduction( - Sema &S, TemplateDecl *Template, bool PartialOrdering, - ArrayRef TemplateArgs, - SmallVectorImpl &Deduced, - TemplateDeductionInfo &Info) { - // Unevaluated SFINAE context. - EnterExpressionEvaluationContext Unevaluated( - S, Sema::ExpressionEvaluationContext::Unevaluated); - - Sema::ContextRAII SavedContext(S, getAsDeclContextOrEnclosing(Template)); - - // C++ [temp.deduct.type]p2: - // [...] or if any template argument remains neither deduced nor - // explicitly specified, template argument deduction fails. - Sema::CheckTemplateArgumentInfo CTAI(PartialOrdering); - if (auto Result = ConvertDeducedTemplateArguments( - S, Template, /*IsDeduced=*/PartialOrdering, Deduced, Info, CTAI, - /*CurrentInstantiationScope=*/nullptr, - /*NumAlreadyConverted=*/0U, /*IsIncomplete=*/nullptr); - Result != TemplateDeductionResult::Success) - return Result; - // Check that we produced the correct argument list. - SmallVector, 4> PsStack{TemplateArgs}, - AsStack{CTAI.CanonicalConverted}; + SmallVector, 4> PsStack{InstCTAI.SugaredConverted}, + AsStack{As}; for (;;) { auto take = [](SmallVectorImpl> &Stack) -> std::tuple &, TemplateArgument> { @@ -3443,13 +3346,11 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( break; TemplateArgument PP = P.isPackExpansion() ? P.getPackExpansionPattern() : P, PA = A.isPackExpansion() ? A.getPackExpansionPattern() : A; - if (!isSameTemplateArg(S.Context, PP, PA, /*PartialOrdering=*/false)) { + if (!isSameTemplateArg(S.Context, PP, PA)) { if (!P.isPackExpansion() && !A.isPackExpansion()) { - Info.Param = - makeTemplateParameter(Template->getTemplateParameters()->getParam( - (AsStack.empty() ? CTAI.CanonicalConverted.end() - : AsStack.front().begin()) - - 1 - CTAI.CanonicalConverted.begin())); + Info.Param = makeTemplateParameter(TPL->getParam( + (AsStack.empty() ? 
As.end() : AsStack.back().begin()) - + As.begin())); Info.FirstArg = P; Info.SecondArg = A; return TemplateDeductionResult::NonDeducedMismatch; @@ -3471,13 +3372,28 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( if (!PartialOrdering) { if (auto Result = CheckDeducedArgumentConstraints( - S, Template, CTAI.SugaredConverted, CTAI.CanonicalConverted, Info); + S, Entity, CTAI.SugaredConverted, CTAI.CanonicalConverted, Info); Result != TemplateDeductionResult::Success) return Result; } return TemplateDeductionResult::Success; } +static TemplateDeductionResult FinishTemplateArgumentDeduction( + Sema &S, NamedDecl *Entity, TemplateParameterList *EntityTPL, + TemplateDecl *Template, bool PartialOrdering, ArrayRef Ps, + ArrayRef As, + SmallVectorImpl &Deduced, + TemplateDeductionInfo &Info, bool CopyDeducedArgs) { + TemplateParameterList *TPL = Template->getTemplateParameters(); + SmallVector PsLoc(Ps.size()); + for (unsigned I = 0, N = Ps.size(); I != N; ++I) + PsLoc[I] = S.getTrivialTemplateArgumentLoc(Ps[I], QualType(), + TPL->getParam(I)->getLocation()); + return FinishTemplateArgumentDeduction(S, Entity, EntityTPL, Template, + PartialOrdering, PsLoc, As, Deduced, + Info, CopyDeducedArgs); +} /// Complete template argument deduction for DeduceTemplateArgumentsFromType. /// FIXME: this is mostly duplicated with the above two versions. Deduplicate @@ -3497,7 +3413,8 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( // explicitly specified, template argument deduction fails. Sema::CheckTemplateArgumentInfo CTAI; if (auto Result = ConvertDeducedTemplateArguments( - S, TD, /*IsDeduced=*/false, Deduced, Info, CTAI, + S, TD, TD->getTemplateParameters(), /*IsDeduced=*/false, Deduced, + Info, CTAI, /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, /*IsIncomplete=*/nullptr); Result != TemplateDeductionResult::Success) @@ -3553,9 +3470,12 @@ DeduceTemplateArguments(Sema &S, T *Partial, TemplateDeductionResult Result; S.runWithSufficientStackSpace(Info.getLocation(), [&] { - Result = ::FinishTemplateArgumentDeduction(S, Partial, - /*IsPartialOrdering=*/false, - TemplateArgs, Deduced, Info); + Result = ::FinishTemplateArgumentDeduction( + S, Partial, Partial->getTemplateParameters(), + Partial->getSpecializedTemplate(), + /*IsPartialOrdering=*/false, + Partial->getTemplateArgsAsWritten()->arguments(), TemplateArgs, Deduced, + Info, /*CopyDeducedArgs=*/true); }); if (Result != TemplateDeductionResult::Success) @@ -4062,9 +3982,9 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( bool IsIncomplete = false; CheckTemplateArgumentInfo CTAI(PartialOrdering); if (auto Result = ConvertDeducedTemplateArguments( - *this, FunctionTemplate, /*IsDeduced=*/true, Deduced, Info, CTAI, - CurrentInstantiationScope, NumExplicitlySpecified, - PartialOverloading ? &IsIncomplete : nullptr); + *this, FunctionTemplate, FunctionTemplate->getTemplateParameters(), + /*IsDeduced=*/true, Deduced, Info, CTAI, CurrentInstantiationScope, + NumExplicitlySpecified, PartialOverloading ? 
&IsIncomplete : nullptr); Result != TemplateDeductionResult::Success) return Result; @@ -5677,7 +5597,8 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( bool IsIncomplete = false; Sema::CheckTemplateArgumentInfo CTAI(/*PartialOrdering=*/true); if (auto Result = ConvertDeducedTemplateArguments( - S, FTD, /*IsDeduced=*/true, Deduced, Info, CTAI, + S, FTD, FTD->getTemplateParameters(), /*IsDeduced=*/true, Deduced, + Info, CTAI, /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, &IsIncomplete); Result != TemplateDeductionResult::Success) @@ -6243,16 +6164,19 @@ FunctionDecl *Sema::getMoreConstrainedFunction(FunctionDecl *FD1, return AtLeastAsConstrained1 ? FD1 : FD2; } -/// Determine whether one partial specialization, P1, is at least as +/// Determine whether one template specialization, P1, is at least as /// specialized than another, P2. /// /// \tparam TemplateLikeDecl The kind of P2, which must be a /// TemplateDecl or {Class,Var}TemplatePartialSpecializationDecl. /// \param T1 The injected-class-name of P1 (faked for a variable template). /// \param T2 The injected-class-name of P2 (faked for a variable template). -template +/// \param Template The primary template of P2, in case it is a partial +/// specialization, the same as P2 otherwise. +template static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2, TemplateLikeDecl *P2, + TemplateDecl *Template, TemplateDeductionInfo &Info) { // C++ [temp.class.order]p1: // For two class template partial specializations, the first is at least as @@ -6295,15 +6219,18 @@ static bool isAtLeastAsSpecializedAs(Sema &S, QualType T1, QualType T2, if (Inst.isInvalid()) return false; - const auto *TST1 = cast(T1); + ArrayRef + Ps = cast(T2)->template_arguments(), + As = cast(T1)->template_arguments(); Sema::SFINAETrap Trap(S); TemplateDeductionResult Result; S.runWithSufficientStackSpace(Info.getLocation(), [&] { Result = ::FinishTemplateArgumentDeduction( - S, P2, /*IsPartialOrdering=*/true, TST1->template_arguments(), Deduced, - Info); + S, P2, P2->getTemplateParameters(), Template, + /*IsPartialOrdering=*/true, Ps, As, Deduced, Info, + /*CopyDeducedArgs=*/false); }); if (Result != TemplateDeductionResult::Success) @@ -6407,11 +6334,18 @@ getMoreSpecialized(Sema &S, QualType T1, QualType T2, TemplateLikeDecl *P1, constexpr bool IsMoreSpecialThanPrimaryCheck = !std::is_same_v; - bool Better1 = isAtLeastAsSpecializedAs(S, T1, T2, P2, Info); + TemplateDecl *P2T; + if constexpr (IsMoreSpecialThanPrimaryCheck) + P2T = P2; + else + P2T = P2->getSpecializedTemplate(); + + bool Better1 = isAtLeastAsSpecializedAs(S, T1, T2, P2, P2T, Info); if (IsMoreSpecialThanPrimaryCheck && !Better1) return nullptr; - bool Better2 = isAtLeastAsSpecializedAs(S, T2, T1, P1, Info); + bool Better2 = isAtLeastAsSpecializedAs(S, T2, T1, P1, + P1->getSpecializedTemplate(), Info); if (IsMoreSpecialThanPrimaryCheck && !Better2) return P1; @@ -6666,8 +6600,9 @@ bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateDeductionResult TDK; runWithSufficientStackSpace(Info.getLocation(), [&] { - TDK = ::FinishTemplateArgumentDeduction(*this, AArg, PartialOrdering, PArgs, - Deduced, Info); + TDK = ::FinishTemplateArgumentDeduction( + *this, AArg, AArg->getTemplateParameters(), AArg, PartialOrdering, + AArgs, PArgs, Deduced, Info, /*CopyDeducedArgs=*/false); }); switch (TDK) { case TemplateDeductionResult::Success: diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 
dd493a083d86d..d2408a94ad0ab 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -2383,10 +2383,10 @@ TemplateInstantiator::TransformSubstNonTypeTemplateParmExpr( // The call to CheckTemplateArgument here produces the ImpCast. TemplateArgument SugaredConverted, CanonicalConverted; if (SemaRef - .CheckTemplateArgument( - E->getParameter(), SubstType, SubstReplacement.get(), - SugaredConverted, CanonicalConverted, - /*PartialOrderingTTP=*/false, Sema::CTAK_Specified) + .CheckTemplateArgument(E->getParameter(), SubstType, + SubstReplacement.get(), SugaredConverted, + CanonicalConverted, + /*StrictCheck=*/false, Sema::CTAK_Specified) .isInvalid()) return true; return transformNonTypeTemplateParmRef( diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index ef0e6ee23e942..3d4a245eb8bd5 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -745,8 +745,7 @@ ExprResult Sema::CheckPackExpansion(Expr *Pattern, SourceLocation EllipsisLoc, } // Create the pack expansion expression and source-location information. - return new (Context) - PackExpansionExpr(Context.DependentTy, Pattern, EllipsisLoc, NumExpansions); + return new (Context) PackExpansionExpr(Pattern, EllipsisLoc, NumExpansions); } bool Sema::CheckParameterPacksForExpansion( diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index b9bf748a2e98e..57fcc4b3b3682 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -16126,8 +16126,7 @@ TreeTransform::TransformSizeOfPackExpr(SizeOfPackExpr *E) { if (DRE.isInvalid()) return ExprError(); ArgStorage = new (getSema().Context) - PackExpansionExpr(getSema().Context.DependentTy, DRE.get(), - E->getPackLoc(), std::nullopt); + PackExpansionExpr(DRE.get(), E->getPackLoc(), std::nullopt); } PackArgs = ArgStorage; } diff --git a/clang/test/Import/pack-expansion-expr/test.cpp b/clang/test/Import/pack-expansion-expr/test.cpp index 6866c41cfbcf4..3aa108b2ab897 100644 --- a/clang/test/Import/pack-expansion-expr/test.cpp +++ b/clang/test/Import/pack-expansion-expr/test.cpp @@ -1,7 +1,7 @@ // RUN: clang-import-test -dump-ast -import %S/Inputs/F.cpp -expression %s | FileCheck %s // CHECK: PackExpansionExpr -// CHECK-SAME: '' +// CHECK-SAME: 'T' // CHECK-NEXT: DeclRefExpr // CHECK-SAME: 'T' // CHECK-SAME: ParmVar diff --git a/clang/test/SemaTemplate/attributes.cpp b/clang/test/SemaTemplate/attributes.cpp index dea19d09745ca..020cfd291a502 100644 --- a/clang/test/SemaTemplate/attributes.cpp +++ b/clang/test/SemaTemplate/attributes.cpp @@ -80,7 +80,7 @@ void UseStmtAnnotations() { HasStmtAnnotations(); } // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... 
Is // CHECK-NEXT: FunctionDecl {{.*}} HasPackAnnotations 'void ()' // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_BAZ" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: FunctionDecl {{.*}} used HasPackAnnotations 'void ()' // CHECK-NEXT: TemplateArgument{{.*}} pack @@ -111,7 +111,7 @@ void UsePackAnnotations() { HasPackAnnotations<1, 2, 3>(); } // CHECK-NEXT: FunctionDecl {{.*}} HasStmtPackAnnotations 'void ()' // CHECK: AttributedStmt {{.*}} // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_QUUX" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK: FunctionDecl {{.*}} used HasStmtPackAnnotations 'void ()' // CHECK-NEXT: TemplateArgument{{.*}} pack @@ -152,7 +152,7 @@ void UseOnlyPackAnnotations() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_FOZ" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct AnnotatedPackTemplateStruct // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct AnnotatedPackTemplateStruct definition @@ -285,7 +285,7 @@ void UseOnlyPackAnnotations() { // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... Is // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_BOO" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct AnnotatedPackTemplateStruct // CHECK-NEXT: ClassTemplatePartialSpecializationDecl {{.*}} struct AnnotatedPackTemplateStruct definition @@ -428,7 +428,7 @@ void UseAnnotatedPackTemplateStructSpecializations() { // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... Is // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_BIR" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct InvalidAnnotatedPackTemplateStruct // CHECK-NEXT: ClassTemplatePartialSpecializationDecl {{.*}} struct InvalidAnnotatedPackTemplateStruct definition @@ -478,7 +478,7 @@ void UseInvalidAnnotatedPackTemplateStruct() { // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... Is // CHECK-NEXT: FunctionDecl {{.*}} RedeclaredAnnotatedFunc 'void ()' // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_FAR" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: FunctionDecl {{.*}} used RedeclaredAnnotatedFunc 'void ()' // CHECK-NEXT: TemplateArgument{{.*}} pack @@ -517,20 +517,20 @@ void UseInvalidAnnotatedPackTemplateStruct() { // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'int' depth 0 index 0 ... 
Is // CHECK-NEXT: FunctionDecl {{.*}} prev {{.*}} RedeclaredAnnotatedFunc 'void ()' // CHECK-NEXT: AnnotateAttr {{.*}} Inherited "ANNOTATE_FAR" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_BOZ" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: Function {{.*}} 'RedeclaredAnnotatedFunc' 'void ()' // CHECK-NEXT: FunctionTemplateDecl {{.*}} prev {{.*}} RedeclaredAnnotatedFunc // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} 'int' depth 0 index 0 ... Is // CHECK-NEXT: FunctionDecl {{.*}} prev {{.*}} RedeclaredAnnotatedFunc 'void ()' // CHECK-NEXT: AnnotateAttr {{.*}} Inherited "ANNOTATE_FAR" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: AnnotateAttr {{.*}} Inherited "ANNOTATE_BOZ" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: AnnotateAttr {{.*}} "ANNOTATE_FIZ" // CHECK-NEXT: ConstantExpr {{.*}} 'int' @@ -545,7 +545,7 @@ void UseInvalidAnnotatedPackTemplateStruct() { // CHECK-NEXT: FunctionDecl {{.*}} prev {{.*}} RedeclaredAnnotatedFunc 'void ()' // CHECK-NEXT: CompoundStmt // CHECK-NEXT: AnnotateAttr {{.*}} Inherited "ANNOTATE_FAR" -// CHECK-NEXT: PackExpansionExpr {{.*}} '' +// CHECK-NEXT: PackExpansionExpr {{.*}} 'int' // CHECK-NEXT: DeclRefExpr {{.*}} 'int' NonTypeTemplateParm {{.*}} 'Is' 'int' // CHECK-NEXT: AnnotateAttr {{.*}} Inherited "ANNOTATE_FIZ" // CHECK-NEXT: ConstantExpr {{.*}} 'int' diff --git a/clang/test/SemaTemplate/partial-order.cpp b/clang/test/SemaTemplate/partial-order.cpp index 0a151de390236..db2624d1766bc 100644 --- a/clang/test/SemaTemplate/partial-order.cpp +++ b/clang/test/SemaTemplate/partial-order.cpp @@ -1,6 +1,4 @@ -// RUN: %clang_cc1 -std=c++1z %s -verify - -// expected-no-diagnostics +// RUN: %clang_cc1 -std=c++26 %s -verify namespace hana_enable_if_idiom { template struct A {}; @@ -12,3 +10,40 @@ namespace hana_enable_if_idiom { }; B b; } + +namespace GH132562 { + struct I { + int v = 0; + }; + + namespace t1 { + template struct A; + template + requires ((X.v == 0) ||...) 
+ struct A; + } // namespace t1 + namespace t2 { + template struct A; // expected-note {{template is declared here}} + template struct A; + // expected-error@-1 {{is not more specialized than the primary template}} + // expected-note@-2 {{no viable conversion from 'int' to 'I'}} + + template struct B; // expected-note {{template is declared here}} + template struct B; + // expected-error@-1 {{is not more specialized than the primary template}} + // expected-note@-2 {{value of type 'const I' is not implicitly convertible to 'int'}} + } // namespace t2 + namespace t3 { + struct J { + int v = 0; + constexpr J(int v) : v(v) {} + }; + template struct A; + template struct A; + + template struct B; // expected-note {{template is declared here}} + template struct B; + // expected-error@-1 {{is not more specialized than the primary template}} + // expected-note@-2 {{value of type 'const J' is not implicitly convertible to 'int'}} + } // namespace t3 +} // namespace GH132562 diff --git a/clang/test/SemaTemplate/temp_arg_nontype.cpp b/clang/test/SemaTemplate/temp_arg_nontype.cpp index e989e45efb687..9363e748c7028 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype.cpp @@ -389,10 +389,13 @@ namespace PR17696 { namespace partial_order_different_types { template struct A; // expected-note@-1 {{template is declared here}} - template struct A<0, N, T, U, V> {}; - template struct A<0, N, T, U, V>; + template struct A<0, N, T, U, V> {}; // #P1 + template struct A<0, N, T, U, V>; // #P2 // expected-error@-1 {{class template partial specialization is not more specialized than the primary template}} A<0, 0, int, int, 0> a; + // expected-error@-1 {{ambiguous partial specializations}} + // expected-note@#P1 {{partial specialization matches}} + // expected-note@#P2 {{partial specialization matches}} } namespace partial_order_references { @@ -412,19 +415,18 @@ namespace partial_order_references { template struct B; // expected-note 2{{template}} template struct B<0, R> {}; // expected-error@-1 {{not more specialized than the primary}} - // expected-note@-2 {{'const int' vs 'int &'}} + // expected-note@-2 {{value of type 'const int' is not implicitly convertible to 'int &'}} B<0, N> b; // expected-error {{undefined}} - template struct C; // expected-note 2{{template}} + template struct C; // expected-note {{template}} + // This partial specialization is more specialized than the primary template. 
template struct C<0, R> {}; - // expected-error@-1 {{not more specialized than the primary}} - // expected-note@-2 {{'int' vs 'const int &'}} C<0, N> c; // expected-error {{undefined}} template struct D; // expected-note 2{{template}} template struct D<0, N> {}; // expected-error@-1 {{not more specialized than the primary}} - // expected-note@-2 {{'int' vs 'const int &'}} + // expected-note@-2 {{conversion from 'int' to 'const int &'}} extern const int K = 5; D<0, K> d; // expected-error {{undefined}} } From e2885772f05ddf9d81c54c5489801108838ca053 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 7 Apr 2025 10:30:38 -0500 Subject: [PATCH 0854/1029] [libc] Fix function that wasn't updated in wrapper headers --- clang/lib/Headers/llvm_libc_wrappers/stdlib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h index 69afdf4a6897e..1da22abd0bc48 100644 --- a/clang/lib/Headers/llvm_libc_wrappers/stdlib.h +++ b/clang/lib/Headers/llvm_libc_wrappers/stdlib.h @@ -25,7 +25,7 @@ // The LLVM C library uses these named types so we forward declare them. typedef void (*__atexithandler_t)(void); -typedef int (*__bsearchcompare_t)(const void *, const void *); +typedef int (*__search_compare_t)(const void *, const void *); typedef int (*__qsortcompare_t)(const void *, const void *); typedef int (*__qsortrcompare_t)(const void *, const void *, void *); From 4b90f24db81fb4378d9f4816f31e16195d8adb0f Mon Sep 17 00:00:00 2001 From: Julian Lettner Date: Mon, 7 Apr 2025 08:33:27 -0700 Subject: [PATCH 0855/1029] [LLDB] Add integration test for libsanitizers trace collection (#134323) Add integration test for libsanitizers trace collection (`SanitizersAllocationTraces=all`). 
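For reference, the interactive flow the new test automates looks roughly
like this (hypothetical transcript; the commands are taken from the test,
and <line> is a placeholder for the breakpoint line):

    (lldb) env SanitizersAllocationTraces=all   # enable allocation traces
    (lldb) breakpoint set -f main.c -l <line>   # after the free, before the bug
    (lldb) run
    (lldb) memory history 'pointer'             # allocation/deallocation backtraces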
rdar://144244084 --- lldb/test/API/functionalities/asan/Makefile | 11 +- .../functionalities/asan/TestMemoryHistory.py | 104 +++++++++++------- .../functionalities/asan/TestReportData.py | 12 +- 3 files changed, 75 insertions(+), 52 deletions(-) diff --git a/lldb/test/API/functionalities/asan/Makefile b/lldb/test/API/functionalities/asan/Makefile index d66696fed7078..eae5ca3e4626c 100644 --- a/lldb/test/API/functionalities/asan/Makefile +++ b/lldb/test/API/functionalities/asan/Makefile @@ -1,8 +1,11 @@ C_SOURCES := main.c -asan: CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info -asan: all +compiler_rt-asan: CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info +compiler_rt-asan: all -libsanitizers: CFLAGS_EXTRAS := -fsanitize=address -fsanitize-stable-abi -g -gcolumn-info -libsanitizers: all +libsanitizers-asan: CFLAGS_EXTRAS := -fsanitize=address -fsanitize-stable-abi -g -gcolumn-info +libsanitizers-asan: all + +libsanitizers-traces: CFLAGS_EXTRAS := -g -gcolumn-info +libsanitizers-traces: all include Makefile.rules diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py index b04182a543719..1568140b355dc 100644 --- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py +++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py @@ -10,22 +10,28 @@ from lldbsuite.test import lldbutil from lldbsuite.test_event.build_exception import BuildError -class AsanTestCase(TestBase): +class MemoryHistoryTestCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @expectedFailureNetBSD @skipUnlessAddressSanitizer - def test(self): - self.build(make_targets=["asan"]) - self.asan_tests() + def test_compiler_rt_asan(self): + self.build(make_targets=["compiler_rt-asan"]) + self.compiler_rt_asan_tests() - @skipIf(oslist=no_match(["macosx"])) - @skipIf(bugnumber="rdar://144997976") + @skipUnlessDarwin + @skipIf(bugnumber="rdar://109913184&143590169") def test_libsanitizers_asan(self): try: - self.build(make_targets=["libsanitizers"]) + self.build(make_targets=["libsanitizers-asan"]) except BuildError as e: self.skipTest("failed to build with libsanitizers") - self.libsanitizer_tests() + self.libsanitizers_asan_tests() + + @skipUnlessDarwin + @skipIf(macos_version=["<", "15.5"]) + def test_libsanitizers_traces(self): + self.build(make_targets=["libsanitizers-traces"]) + self.libsanitizers_traces_tests() def setUp(self): # Call super's setUp(). 
@@ -36,35 +42,60 @@ def setUp(self): self.line_breakpoint = line_number("main.c", "// break line") # Test line numbers: rdar://126237493 - def libsanitizer_tests(self): - target = self.createTestTarget() - - self.runCmd( - "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" - ) - - self.runCmd("run") - - # In libsanitizers, memory history is not supported until a report has been generated - self.expect( - "thread list", - "Process should be stopped due to ASan report", - substrs=["stopped", "stop reason = Use of deallocated memory"], - ) - - # test the 'memory history' command + # for libsanitizers and remove `skip_line_numbers` parameter + def check_traces(self, skip_line_numbers=False): self.expect( "memory history 'pointer'", substrs=[ "Memory deallocated by Thread", "a.out`f2", - "main.c", + "main.c" if skip_line_numbers else f"main.c:{self.line_free}", "Memory allocated by Thread", "a.out`f1", - "main.c", + "main.c" if skip_line_numbers else f"main.c:{self.line_malloc}", ], ) + # Set breakpoint: after free, but before bug + def set_breakpoint(self, target): + bkpt = target.BreakpointCreateByLocation("main.c", self.line_breakpoint) + self.assertGreater(bkpt.GetNumLocations(), 0, "Set the breakpoint successfully") + + def run_to_breakpoint(self, target): + self.set_breakpoint(target) + self.runCmd("run") + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + def libsanitizers_traces_tests(self): + target = self.createTestTarget() + + self.runCmd("env SanitizersAllocationTraces=all") + + self.run_to_breakpoint(target) + self.check_traces(skip_line_numbers=True) + + def libsanitizers_asan_tests(self): + target = self.createTestTarget() + + self.runCmd("env SanitizersAddress=1 MallocSanitizerZone=1") + + self.run_to_breakpoint(target) + self.check_traces(skip_line_numbers=True) + + self.runCmd("continue") + + # Stop on report + self.expect( + "thread list", + "Process should be stopped due to ASan report", + substrs=["stopped", "stop reason = Use of deallocated memory"], + ) + self.check_traces(skip_line_numbers=True) + # do the same using SB API process = self.dbg.GetSelectedTarget().process val = ( @@ -97,12 +128,12 @@ def libsanitizer_tests(self): "main.c", ) - def asan_tests(self): + def compiler_rt_asan_tests(self): target = self.createTestTarget() self.registerSanitizerLibrariesWithTarget(target) - self.runCmd("breakpoint set -f main.c -l %d" % self.line_breakpoint) + self.set_breakpoint(target) # "memory history" command should not work without a process self.expect( @@ -135,18 +166,7 @@ def asan_tests(self): substrs=["1 match found"], ) - # test the 'memory history' command - self.expect( - "memory history 'pointer'", - substrs=[ - "Memory deallocated by Thread", - "a.out`f2", - "main.c:%d" % self.line_free, - "Memory allocated by Thread", - "a.out`f1", - "main.c:%d" % self.line_malloc, - ], - ) + self.check_traces() # do the same using SB API process = self.dbg.GetSelectedTarget().process @@ -198,6 +218,8 @@ def asan_tests(self): substrs=["stopped", "stop reason = Use of deallocated memory"], ) + self.check_traces() + # make sure the 'memory history' command still works even when we're # generating a report now self.expect( diff --git a/lldb/test/API/functionalities/asan/TestReportData.py b/lldb/test/API/functionalities/asan/TestReportData.py index fabc985d0ed44..dd6834a01b80c 100644 --- a/lldb/test/API/functionalities/asan/TestReportData.py +++ 
b/lldb/test/API/functionalities/asan/TestReportData.py
@@ -16,14 +16,14 @@ class AsanTestReportDataCase(TestBase):
     @skipUnlessAddressSanitizer
     @skipIf(archs=["i386"], bugnumber="llvm.org/PR36710")
     def test(self):
-        self.build(make_targets=["asan"])
+        self.build(make_targets=["compiler_rt-asan"])
         self.asan_tests()

-    @skipIf(oslist=no_match(["macosx"]))
-    @skipIf(bugnumber="rdar://144997976")
+    @skipUnlessDarwin
+    @skipIf(bugnumber="rdar://109913184&143590169")
     def test_libsanitizers_asan(self):
         try:
-            self.build(make_targets=["libsanitizers"])
+            self.build(make_targets=["libsanitizers-asan"])
         except BuildError as e:
             self.skipTest("failed to build with libsanitizers")
         self.asan_tests(libsanitizers=True)
@@ -42,9 +42,7 @@ def asan_tests(self, libsanitizers=False):
         target = self.createTestTarget()

         if libsanitizers:
-            self.runCmd(
-                "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0"
-            )
+            self.runCmd("env SanitizersAddress=1 MallocSanitizerZone=1")
         else:
             self.registerSanitizerLibrariesWithTarget(target)

From c75eac7c0347b2df88773983bef1e72cb53ad35d Mon Sep 17 00:00:00 2001
From: Alexandre Ganea
Date: Mon, 7 Apr 2025 11:34:24 -0400
Subject: [PATCH 0856/1029] [LLD][COFF] Don't dllimport from static libraries (#134443)

This reverts commit 6a1bdd9 and re-instates the behavior that matches what
MSVC link.exe does, that is, erroring out when trying to dllimport a symbol
from a static library. A hint is now displayed on stdout, mentioning that we
should rather dllimport the symbol from an import library.

Fixes https://github.com/llvm/llvm-project/issues/131807
---
 lld/COFF/Driver.cpp                           |  6 ++--
 lld/COFF/SymbolTable.cpp                      | 20 ++++++-----
 lld/COFF/SymbolTable.h                        |  5 +--
 .../COFF/imports-static-lib-indirect.test     | 26 +++++++++++++++
 lld/test/COFF/imports-static-lib.test         | 33 +++++++++++++++++++
 lld/test/COFF/undefined_lazy.test             | 26 ---------------
 6 files changed, 74 insertions(+), 42 deletions(-)
 create mode 100644 lld/test/COFF/imports-static-lib-indirect.test
 create mode 100644 lld/test/COFF/imports-static-lib.test
 delete mode 100644 lld/test/COFF/undefined_lazy.test

diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 9bbab524b1f9a..7aa13bdce488e 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -2663,10 +2663,8 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     createECExportThunks();

   // Resolve remaining undefined symbols and warn about imported locals.
-  ctx.forEachSymtab([&](SymbolTable &symtab) {
-    while (symtab.resolveRemainingUndefines())
-      run();
-  });
+  ctx.forEachSymtab(
+      [&](SymbolTable &symtab) { symtab.resolveRemainingUndefines(); });
   if (errorCount())
     return;

diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index 16afcc64316e3..8fb0ee4e890d6 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -231,6 +231,17 @@ void SymbolTable::reportUndefinedSymbol(const UndefinedDiag &undefDiag) {
   }
   if (numDisplayedRefs < numRefs)
     diag << "\n>>> referenced " << numRefs - numDisplayedRefs << " more times";
+
+  // Hints
+  StringRef name = undefDiag.sym->getName();
+  if (name.consume_front("__imp_")) {
+    Symbol *imp = find(name);
+    if (imp && imp->isLazy()) {
+      diag << "\nNOTE: a relevant symbol '" << imp->getName()
+           << "' is available in " << toString(imp->getFile())
+           << " but cannot be used because it is not an import library.";
+    }
+  }
 }

 void SymbolTable::loadMinGWSymbols() {
@@ -432,11 +443,10 @@ void SymbolTable::reportUnresolvable() {
   reportProblemSymbols(undefs, /*localImports=*/nullptr, true);
 }

-bool SymbolTable::resolveRemainingUndefines() {
+void SymbolTable::resolveRemainingUndefines() {
   llvm::TimeTraceScope timeScope("Resolve remaining undefined symbols");
   SmallPtrSet<Symbol *, 8> undefs;
   DenseMap<Symbol *, Symbol *> localImports;
-  bool foundLazy = false;

   for (auto &i : symMap) {
     Symbol *sym = i.second;
@@ -481,11 +491,6 @@ bool SymbolTable::resolveRemainingUndefines() {
         imp = findLocalSym(*mangledName);
       }
     }
-    if (imp && imp->isLazy()) {
-      forceLazy(imp);
-      foundLazy = true;
-      continue;
-    }
     if (imp && isa<Defined>(imp)) {
       auto *d = cast<Defined>(imp);
       replaceSymbol<DefinedLocalImport>(sym, ctx, name, d);
@@ -513,7 +518,6 @@ bool SymbolTable::resolveRemainingUndefines() {
   reportProblemSymbols(
       undefs,
       ctx.config.warnLocallyDefinedImported ? &localImports : nullptr, false);
-  return foundLazy;
 }

 std::pair<Symbol *, bool> SymbolTable::insert(StringRef name) {
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index da19bf2800ecf..15e2644a6f519 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -67,10 +67,7 @@ class SymbolTable {
   // Try to resolve any undefined symbols and update the symbol table
   // accordingly, then print an error message for any remaining undefined
   // symbols and warn about imported local symbols.
-  // Returns whether more files might need to be linked in to resolve lazy
-  // symbols, in which case the caller is expected to call the function again
-  // after linking those files.
-  bool resolveRemainingUndefines();
+  void resolveRemainingUndefines();

   // Load lazy objects that are needed for MinGW automatic import and for
   // doing stdcall fixups.
diff --git a/lld/test/COFF/imports-static-lib-indirect.test b/lld/test/COFF/imports-static-lib-indirect.test
new file mode 100644
index 0000000000000..beda0d7a31afd
--- /dev/null
+++ b/lld/test/COFF/imports-static-lib-indirect.test
@@ -0,0 +1,26 @@
+# REQUIRES: x86
+
+# Pulling in both a dllimport symbol and a static symbol should only warn.
+# RUN: split-file %s %t.dir
+# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/other.s -o %t.other.obj
+# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/main.s -o %t.main.obj
+# RUN: llvm-lib %t.other.obj -out:%t.other.lib
+# RUN: lld-link %t.other.lib %t.main.obj -out:%t.dll -dll 2>&1 | FileCheck %s
+
+CHECK: warning: {{.*}} locally defined symbol imported: foo {{.*}} [LNK4217]
+
+#--- other.s
+.text
+.globl other
+.globl foo
+other:
+  ret
+foo:
+  ret
+#--- main.s
+.text
+.global _DllMainCRTStartup
+_DllMainCRTStartup:
+  call *other(%rip)
+  call *__imp_foo(%rip)
+  ret
diff --git a/lld/test/COFF/imports-static-lib.test b/lld/test/COFF/imports-static-lib.test
new file mode 100644
index 0000000000000..8e9525dab5284
--- /dev/null
+++ b/lld/test/COFF/imports-static-lib.test
@@ -0,0 +1,33 @@
+# REQUIRES: x86
+
+# Ensure that we don't import dllimport symbols from static (non-import) libraries
+# RUN: split-file %s %t.dir
+# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/foo.s -o %t.foo.obj
+# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/main.s -o %t.main.obj
+# RUN: llvm-lib %t.foo.obj -out:%t.foo.lib
+# RUN: not lld-link %t.foo.lib %t.main.obj -out:%t.dll -dll 2>&1 | FileCheck %s
+
+CHECK: error: undefined symbol: __declspec(dllimport) foo
+CHECK: NOTE: a relevant symbol 'foo' is available in {{.*}}.foo.lib but cannot be used because it is not an import library.
+
+# Now do the same thing, but import the symbol from an import library.
+# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/foo_dll_main.s -o %t.foo_dll_main.obj
+# RUN: lld-link /out:%t.dll /dll %t.foo.obj %t.foo_dll_main.obj /export:foo /implib:%t.foo.imp.lib
+# RUN: lld-link %t.main.obj %t.foo.imp.lib -out:%t.exe -dll
+
+#--- foo.s
+.text
+.globl foo
+foo:
+  ret
+#--- foo_dll_main.s
+.text
+.global _DllMainCRTStartup
+_DllMainCRTStartup:
+  ret
+#--- main.s
+.text
+.global _DllMainCRTStartup
+_DllMainCRTStartup:
+  call *__imp_foo(%rip)
+  ret
diff --git a/lld/test/COFF/undefined_lazy.test b/lld/test/COFF/undefined_lazy.test
deleted file mode 100644
index ed5cd358b5cd9..0000000000000
--- a/lld/test/COFF/undefined_lazy.test
+++ /dev/null
@@ -1,26 +0,0 @@
-# REQUIRES: x86
-
-# RUN: split-file %s %t.dir
-# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/foo.s -o %t.foo.obj
-# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/bar.s -o %t.bar.obj
-# RUN: llvm-mc --filetype=obj -triple=x86_64-windows-msvc %t.dir/qux.s -o %t.qux.obj
-# RUN: llvm-lib %t.foo.obj -out:%t.foo.lib
-# RUN: llvm-lib %t.bar.obj -out:%t.bar.lib
-# RUN: lld-link %t.foo.lib %t.bar.lib %t.qux.obj -out:%t.dll -dll
-#
-#--- foo.s
-.text
-.globl foo
-foo:
-  call bar
-#--- bar.s
-.text
-.globl bar
-bar:
-  ret
-#--- qux.s
-.text
-.global _DllMainCRTStartup
-_DllMainCRTStartup:
-  call *__imp_foo(%rip)
-  ret
From b09daa4b2314342ed9084d7d85ccd3294fd68021 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Mon, 7 Apr 2025 16:39:04 +0100
Subject: [PATCH 0857/1029] [llvm][docs] 2 small fixes to GitHub guide (#134620)

1. `arc` is long gone, no need to mention it.
2. Say exactly where user branches can be made.
---
 llvm/docs/GitHub.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst
index 7b614b6ca5d45..c33027daf6447 100644
--- a/llvm/docs/GitHub.rst
+++ b/llvm/docs/GitHub.rst
@@ -42,8 +42,7 @@ GitHub Tools
 You can interact with GitHub in several ways: via git command line tools,
 the web browser, `GitHub Desktop <https://desktop.github.com/>`_, or the
 `GitHub CLI <https://cli.github.com/>`_. This guide will cover the git command line
-tools and the GitHub CLI. The GitHub CLI (`gh`) will be most like the `arc` workflow and
-recommended.
+tools and the GitHub CLI.

 Creating Pull Requests
 ----------------------
@@ -219,10 +218,11 @@ commonly used first:

 Branches
 ========
-It is possible to create branches that start with `users/<username>/`, however this is
-intended to be able to support "stacked" pull-request. Do not create any branches in the
-llvm/llvm-project repository otherwise, please use a fork (see above). User branches that
-aren't associated with a pull-request **will be deleted**.
+It is possible to create branches in `llvm/llvm-project/` that start with
+`users/<username>/`, however this is intended to be able to support "stacked"
+pull-request. Do not create any branches in the `llvm/llvm-project` repository
+otherwise, please use a fork (see above). User branches that aren't
+associated with a pull-request **will be deleted**.

 Stacked Pull Requests
 =====================

From 268c065eab06b81a0d7256ac62c0865b3781e236 Mon Sep 17 00:00:00 2001
From: Paul Kirth
Date: Mon, 7 Apr 2025 08:41:49 -0700
Subject: [PATCH 0858/1029] [fatlto] Add coroutine passes when using FatLTO with ThinLTO (#134434)

When coroutines are used w/ both -ffat-lto-objects and -flto=thin, the
coroutine passes are not added to the optimization pipelines. Ensure they are
added before ModuleOptimization to generate a working ELF object.

Fixes #134409.
---
 clang/test/CodeGenCoroutines/pr134409.cpp | 43 +++++++++++++++++++++++
 llvm/lib/Passes/PassBuilderPipelines.cpp  | 13 +++++++
 2 files changed, 56 insertions(+)
 create mode 100644 clang/test/CodeGenCoroutines/pr134409.cpp

diff --git a/clang/test/CodeGenCoroutines/pr134409.cpp b/clang/test/CodeGenCoroutines/pr134409.cpp
new file mode 100644
index 0000000000000..142962d44ede4
--- /dev/null
+++ b/clang/test/CodeGenCoroutines/pr134409.cpp
@@ -0,0 +1,43 @@
+// An end-to-end test to make sure coroutine passes are added for thinlto.
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++23 -ffat-lto-objects -flto=thin -emit-llvm %s -O3 -o - \
+// RUN: | FileCheck %s
+
+#include "Inputs/coroutine.h"
+
+class BasicCoroutine {
+public:
+  struct Promise {
+    BasicCoroutine get_return_object() { return BasicCoroutine {}; }
+
+    void unhandled_exception() noexcept { }
+
+    void return_void() noexcept { }
+
+    std::suspend_never initial_suspend() noexcept { return {}; }
+    std::suspend_never final_suspend() noexcept { return {}; }
+  };
+  using promise_type = Promise;
+};
+
+// COM: match the embedded module, so we don't match something in it by accident.
+// CHECK: @llvm.embedded.object = {{.*}} +// CHECK: @llvm.compiler.used = {{.*}} + +BasicCoroutine coro() { +// CHECK: define {{.*}} void @_Z4corov() {{.*}} { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// CHECK-NEXT: } + co_return; +} + +int main() { +// CHECK: define {{.*}} i32 @main() {{.*}} { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @_Z4corov() +// CHECK-NEXT: ret i32 0 +// CHECK-NEXT: } + coro(); +} + diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index a18b36ba40754..f222dbede7da7 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1692,6 +1692,19 @@ PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO, if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) MPM.addPass(buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr)); else { + // ModuleSimplification does not run the coroutine passes for + // ThinLTOPreLink, so we need the coroutine passes to run for ThinLTO + // builds, otherwise they will miscompile. + if (ThinLTO) { + // TODO: replace w/ buildCoroWrapper() when it takes phase and level into + // consideration. + CGSCCPassManager CGPM; + CGPM.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); + CGPM.addPass(CoroAnnotationElidePass()); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + MPM.addPass(CoroCleanupPass()); + } + // otherwise, just use module optimization MPM.addPass( buildModuleOptimizationPipeline(Level, ThinOrFullLTOPhase::None)); From 7fe6e70e7c19a0f5309f1eb1b84f9094189ebff7 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 08:54:44 -0700 Subject: [PATCH 0859/1029] [NFC][LLVM][NVPTX] Cleanup pass initialization for NVPTX (#134311) - Move all pass initialization function calls to NVPTX target initialization and out of individual pass constructors. - Move all pass initialization function declaration to NVPTX.h. 
- https://github.com/llvm/llvm-project/issues/111767 --- llvm/lib/Target/NVPTX/NVPTX.h | 18 +++++++++++++++ llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp | 4 +--- llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h | 2 -- llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp | 5 +---- llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h | 2 +- .../NVPTX/NVPTXAssignValidGlobalNames.cpp | 4 ---- llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp | 5 +---- .../Target/NVPTX/NVPTXCtorDtorLowering.cpp | 1 - llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h | 3 --- llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp | 8 +------ .../lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 5 +---- llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp | 4 ---- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 4 ---- .../Target/NVPTX/NVPTXLowerUnreachable.cpp | 4 ---- llvm/lib/Target/NVPTX/NVPTXPeephole.cpp | 8 +------ .../lib/Target/NVPTX/NVPTXProxyRegErasure.cpp | 8 +------ llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 22 +------------------ llvm/lib/Target/NVPTX/NVVMIntrRange.cpp | 7 +----- llvm/lib/Target/NVPTX/NVVMReflect.cpp | 8 +------ 19 files changed, 29 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 20a5bf46dc06b..ff983e52179af 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -55,6 +55,24 @@ MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); MachineFunctionPass *createNVPTXForwardParamsPass(); +void initializeGenericToNVVMLegacyPassPass(PassRegistry &); +void initializeNVPTXAllocaHoistingPass(PassRegistry &); +void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &); +void initializeNVPTXAtomicLowerPass(PassRegistry &); +void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); +void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); +void initializeNVPTXLowerAllocaPass(PassRegistry &); +void initializeNVPTXLowerUnreachablePass(PassRegistry &); +void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); +void initializeNVPTXLowerArgsLegacyPassPass(PassRegistry &); +void initializeNVPTXProxyRegErasurePass(PassRegistry &); +void initializeNVPTXForwardParamsPassPass(PassRegistry &); +void initializeNVVMIntrRangePass(PassRegistry &); +void initializeNVVMReflectPass(PassRegistry &); +void initializeNVPTXAAWrapperPassPass(PassRegistry &); +void initializeNVPTXExternalAAWrapperPass(PassRegistry &); +void initializeNVPTXPeepholePass(PassRegistry &); + struct NVVMIntrRangePass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp index 1f770893828e2..b910ccab21bf3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp @@ -45,9 +45,7 @@ ImmutablePass *llvm::createNVPTXExternalAAWrapperPass() { return new NVPTXExternalAAWrapper(); } -NVPTXAAWrapperPass::NVPTXAAWrapperPass() : ImmutablePass(ID) { - initializeNVPTXAAWrapperPassPass(*PassRegistry::getPassRegistry()); -} +NVPTXAAWrapperPass::NVPTXAAWrapperPass() : ImmutablePass(ID) {} void NVPTXAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h index cfbf5dee3ec50..a82c3aaa72423 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.h @@ -98,9 +98,7 @@ class NVPTXExternalAAWrapper : public 
ExternalAAWrapperPass { }; ImmutablePass *createNVPTXAAWrapperPass(); -void initializeNVPTXAAWrapperPassPass(PassRegistry &); ImmutablePass *createNVPTXExternalAAWrapperPass(); -void initializeNVPTXExternalAAWrapperPass(PassRegistry &); } // end namespace llvm diff --git a/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp index f676496453f9f..8a8e423a2eb71 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "NVPTXAllocaHoisting.h" +#include "NVPTX.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -57,10 +58,6 @@ bool NVPTXAllocaHoisting::runOnFunction(Function &function) { char NVPTXAllocaHoisting::ID = 0; -namespace llvm { -void initializeNVPTXAllocaHoistingPass(PassRegistry &); -} - INITIALIZE_PASS( NVPTXAllocaHoisting, "alloca-hoisting", "Hoisting alloca instructions in non-entry blocks to the entry block", diff --git a/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h index d7de8e3a2f46a..a75a83fbb061c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h +++ b/llvm/lib/Target/NVPTX/NVPTXAllocaHoisting.h @@ -16,7 +16,7 @@ namespace llvm { class FunctionPass; -extern FunctionPass *createAllocaHoisting(); +FunctionPass *createAllocaHoisting(); } // end namespace llvm #endif diff --git a/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp index 724ef7fe98322..15417a15f389b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp @@ -38,10 +38,6 @@ class NVPTXAssignValidGlobalNames : public ModulePass { char NVPTXAssignValidGlobalNames::ID = 0; -namespace llvm { -void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &); -} - INITIALIZE_PASS(NVPTXAssignValidGlobalNames, "nvptx-assign-valid-global-names", "Assign valid PTX names to globals", false, false) diff --git a/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp b/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp index 5a3b110cf278d..918daf6c04ecd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAtomicLower.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "NVPTXAtomicLower.h" +#include "NVPTX.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" @@ -55,10 +56,6 @@ bool NVPTXAtomicLower::runOnFunction(Function &F) { char NVPTXAtomicLower::ID = 0; -namespace llvm { -void initializeNVPTXAtomicLowerPass(PassRegistry &); -} - INITIALIZE_PASS(NVPTXAtomicLower, "nvptx-atomic-lower", "Lower atomics of local memory to simple load/stores", false, false) diff --git a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp index b10e0b14118a1..bb8cec05f4d84 100644 --- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.cpp @@ -256,7 +256,6 @@ PreservedAnalyses NVPTXCtorDtorLoweringPass::run(Module &M, } char NVPTXCtorDtorLoweringLegacy::ID = 0; -char &llvm::NVPTXCtorDtorLoweringLegacyPassID = NVPTXCtorDtorLoweringLegacy::ID; INITIALIZE_PASS(NVPTXCtorDtorLoweringLegacy, DEBUG_TYPE, "Lower ctors and dtors for NVPTX", false, false) diff --git 
a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h index c03fe97f1a782..df58ddd3e78fc 100644 --- a/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXCtorDtorLowering.h @@ -15,9 +15,6 @@ namespace llvm { class Module; class PassRegistry; -extern char &NVPTXCtorDtorLoweringLegacyPassID; -extern void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); - /// Lower llvm.global_ctors and llvm.global_dtors to special kernels. class NVPTXCtorDtorLoweringPass : public PassInfoMixin { diff --git a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp index 47d44b985363d..f5063a80b8a15 100644 --- a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp @@ -136,16 +136,10 @@ static bool forwardDeviceParams(MachineFunction &MF) { /// Pass (Manager) Boilerplate /// ---------------------------------------------------------------------------- -namespace llvm { -void initializeNVPTXForwardParamsPassPass(PassRegistry &); -} // namespace llvm - namespace { struct NVPTXForwardParamsPass : public MachineFunctionPass { static char ID; - NVPTXForwardParamsPass() : MachineFunctionPass(ID) { - initializeNVPTXForwardParamsPassPass(*PassRegistry::getPassRegistry()); - } + NVPTXForwardParamsPass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0f0e7c2f8227e..ac6f4061b9f1f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "NVPTXLowerAggrCopies.h" +#include "NVPTX.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Constants.h" @@ -137,10 +138,6 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { } // namespace -namespace llvm { -void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); -} - INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies", "Lower aggregate copies, and llvm.mem* intrinsics into loops", false, false) diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 3b44a86013f04..88bc000f39bf7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -33,10 +33,6 @@ using namespace llvm; -namespace llvm { -void initializeNVPTXLowerAllocaPass(PassRegistry &); -} - namespace { class NVPTXLowerAlloca : public FunctionPass { bool runOnFunction(Function &F) override; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index a683726facd0c..6452fa05947dd 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -161,10 +161,6 @@ using namespace llvm; -namespace llvm { -void initializeNVPTXLowerArgsLegacyPassPass(PassRegistry &); -} - namespace { class NVPTXLowerArgsLegacyPass : public FunctionPass { bool runOnFunction(Function &F) override; diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp index a289d35f9b3f1..00a12bf818897 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerUnreachable.cpp @@ -78,10 +78,6 @@ using namespace llvm; -namespace llvm { -void 
initializeNVPTXLowerUnreachablePass(PassRegistry &); -} - namespace { class NVPTXLowerUnreachable : public FunctionPass { StringRef getPassName() const override; diff --git a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp index f2f547da88c7c..e9b0aaeca4964 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -44,17 +44,11 @@ using namespace llvm; #define DEBUG_TYPE "nvptx-peephole" -namespace llvm { -void initializeNVPTXPeepholePass(PassRegistry &); -} - namespace { struct NVPTXPeephole : public MachineFunctionPass { public: static char ID; - NVPTXPeephole() : MachineFunctionPass(ID) { - initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry()); - } + NVPTXPeephole() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp index 8dfa2ebe80c3a..2253afa2806c3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp @@ -24,17 +24,11 @@ using namespace llvm; -namespace llvm { -void initializeNVPTXProxyRegErasurePass(PassRegistry &); -} - namespace { struct NVPTXProxyRegErasure : public MachineFunctionPass { static char ID; - NVPTXProxyRegErasure() : MachineFunctionPass(ID) { - initializeNVPTXProxyRegErasurePass(*PassRegistry::getPassRegistry()); - } + NVPTXProxyRegErasure() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 5bb168704bad0..8a25256ea1e4a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -87,27 +87,6 @@ static cl::opt EarlyByValArgsCopy( cl::desc("Create a copy of byval function arguments early."), cl::init(false), cl::Hidden); -namespace llvm { - -void initializeGenericToNVVMLegacyPassPass(PassRegistry &); -void initializeNVPTXAllocaHoistingPass(PassRegistry &); -void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry &); -void initializeNVPTXAtomicLowerPass(PassRegistry &); -void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); -void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); -void initializeNVPTXLowerAllocaPass(PassRegistry &); -void initializeNVPTXLowerUnreachablePass(PassRegistry &); -void initializeNVPTXCtorDtorLoweringLegacyPass(PassRegistry &); -void initializeNVPTXLowerArgsLegacyPassPass(PassRegistry &); -void initializeNVPTXProxyRegErasurePass(PassRegistry &); -void initializeNVPTXForwardParamsPassPass(PassRegistry &); -void initializeNVVMIntrRangePass(PassRegistry &); -void initializeNVVMReflectPass(PassRegistry &); -void initializeNVPTXAAWrapperPassPass(PassRegistry &); -void initializeNVPTXExternalAAWrapperPass(PassRegistry &); - -} // end namespace llvm - extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { // Register the target. 
RegisterTargetMachine X(getTheNVPTXTarget32()); @@ -132,6 +111,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { initializeNVPTXDAGToDAGISelLegacyPass(PR); initializeNVPTXAAWrapperPassPass(PR); initializeNVPTXExternalAAWrapperPass(PR); + initializeNVPTXPeepholePass(PR); } static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp index 8286e9661f202..91b8e470e055e 100644 --- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -25,16 +25,11 @@ using namespace llvm; #define DEBUG_TYPE "nvvm-intr-range" -namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); } - namespace { class NVVMIntrRange : public FunctionPass { public: static char ID; - NVVMIntrRange() : FunctionPass(ID) { - - initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); - } + NVVMIntrRange() : FunctionPass(ID) {} bool runOnFunction(Function &) override; }; diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 20b8bef1899b4..2809ec2303f99 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -47,19 +47,13 @@ using namespace llvm; #define DEBUG_TYPE "nvptx-reflect" -namespace llvm { -void initializeNVVMReflectPass(PassRegistry &); -} - namespace { class NVVMReflect : public FunctionPass { public: static char ID; unsigned int SmVersion; NVVMReflect() : NVVMReflect(0) {} - explicit NVVMReflect(unsigned int Sm) : FunctionPass(ID), SmVersion(Sm) { - initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); - } + explicit NVVMReflect(unsigned int Sm) : FunctionPass(ID), SmVersion(Sm) {} bool runOnFunction(Function &) override; }; From 16c84c4475b909d2de455a44139643c03fe3fe25 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi Date: Mon, 7 Apr 2025 12:06:57 -0400 Subject: [PATCH 0860/1029] [DirectX] Add target builtins (#134439) - fixes #132303 - Moves dot2add from a language builtin to a target builtin. 
- Sets the scaffolding for Sema checks for DX builtins - Setup DirectX backend as able to have target builtins - Adds a DX TargetBuiltins emitter in `clang/lib/CodeGen/TargetBuiltins/DirectX.cpp` --- .github/new-prs-labeler.yml | 6 +++ clang/include/clang/Basic/Builtins.td | 6 --- clang/include/clang/Basic/BuiltinsDirectX.td | 15 ++++++++ clang/include/clang/Basic/CMakeLists.txt | 4 ++ clang/include/clang/Basic/TargetBuiltins.h | 11 ++++++ clang/include/clang/Sema/Sema.h | 7 ++++ clang/include/clang/Sema/SemaDirectX.h | 28 ++++++++++++++ clang/lib/Basic/Targets/DirectX.cpp | 20 ++++++++++ clang/lib/Basic/Targets/DirectX.h | 4 +- clang/lib/CodeGen/CGBuiltin.cpp | 2 + clang/lib/CodeGen/CGHLSLBuiltins.cpp | 12 ------ clang/lib/CodeGen/CMakeLists.txt | 1 + clang/lib/CodeGen/CodeGenFunction.h | 1 + clang/lib/CodeGen/TargetBuiltins/DirectX.cpp | 37 +++++++++++++++++++ .../lib/Headers/hlsl/hlsl_intrinsic_helpers.h | 4 +- clang/lib/Sema/CMakeLists.txt | 1 + clang/lib/Sema/Sema.cpp | 2 + clang/lib/Sema/SemaChecking.cpp | 3 ++ clang/lib/Sema/SemaDirectX.cpp | 23 ++++++++++++ clang/test/CodeGenDirectX/Builtins/dot2add.c | 23 ++++++++++++ 20 files changed, 187 insertions(+), 23 deletions(-) create mode 100644 clang/include/clang/Basic/BuiltinsDirectX.td create mode 100644 clang/include/clang/Sema/SemaDirectX.h create mode 100644 clang/lib/CodeGen/TargetBuiltins/DirectX.cpp create mode 100644 clang/lib/Sema/SemaDirectX.cpp create mode 100644 clang/test/CodeGenDirectX/Builtins/dot2add.c diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index b93cdff8af345..c0c61748010d0 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -660,6 +660,12 @@ backend:DirectX: - '**/*dxil*/**' - '**/*DXContainer*' - '**/*DXContainer*/**' + - clang/lib/Sema/SemaDirectX.cpp + - clang/include/clang/Sema/SemaDirectX.h + - clang/include/clang/Basic/BuiltinsDirectX.td + - clang/lib/CodeGen/TargetBuiltins/DirectX.cpp + - clang/test/CodeGenDirectX/** + - clang/test/SemaDirectX/** backend:SPIR-V: - clang/lib/Driver/ToolChains/SPIRV.* diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 2e077176ac7e9..868e5b92acdc9 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4891,12 +4891,6 @@ def HLSLDotProduct : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } -def HLSLDot2Add : LangBuiltin<"HLSL_LANG"> { - let Spellings = ["__builtin_hlsl_dot2add"]; - let Attributes = [NoThrow, Const]; - let Prototype = "float(_ExtVector<2, _Float16>, _ExtVector<2, _Float16>, float)"; -} - def HLSLDot4AddI8Packed : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_dot4add_i8packed"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/BuiltinsDirectX.td b/clang/include/clang/Basic/BuiltinsDirectX.td new file mode 100644 index 0000000000000..444532ab2874a --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsDirectX.td @@ -0,0 +1,15 @@ +//===--- BuiltinsDirectX.td - DirectX Builtin function database -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +include "clang/Basic/BuiltinsBase.td" + +def DxDot2Add : Builtin { + let Spellings = ["__builtin_dx_dot2add"]; + let Attributes = [NoThrow, Const]; + let Prototype = "float(_ExtVector<2, _Float16>, _ExtVector<2, _Float16>, float)"; +} diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 6be6d063c20b4..4d5e1eaa3facb 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -82,6 +82,10 @@ clang_tablegen(BuiltinsBPF.inc -gen-clang-builtins SOURCE BuiltinsBPF.td TARGET ClangBuiltinsBPF) +clang_tablegen(BuiltinsDirectX.inc -gen-clang-builtins + SOURCE BuiltinsDirectX.td + TARGET ClangBuiltinsDirectX) + clang_tablegen(BuiltinsHexagon.inc -gen-clang-builtins SOURCE BuiltinsHexagon.td TARGET ClangBuiltinsHexagon) diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index c1ba65064f159..4e490d87ee8d6 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -141,6 +141,17 @@ namespace clang { }; } + /// DirectX builtins + namespace DirectX { + enum { + LastTIBuiltin = clang::Builtin::FirstTSBuiltin - 1, +#define GET_BUILTIN_ENUMERATORS +#include "clang/Basic/BuiltinsDirectX.inc" +#undef GET_BUILTIN_ENUMERATORS + LastTSBuiltin + }; + } // namespace DirectX + /// SPIRV builtins namespace SPIRV { enum { diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a0ac46033b4a1..7bd77d33a1f3d 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -160,6 +160,7 @@ class SemaAVR; class SemaBPF; class SemaCodeCompletion; class SemaCUDA; +class SemaDirectX; class SemaHLSL; class SemaHexagon; class SemaLoongArch; @@ -1074,6 +1075,11 @@ class Sema final : public SemaBase { return *CUDAPtr; } + SemaDirectX &DirectX() { + assert(DirectXPtr); + return *DirectXPtr; + } + SemaHLSL &HLSL() { assert(HLSLPtr); return *HLSLPtr; @@ -1212,6 +1218,7 @@ class Sema final : public SemaBase { std::unique_ptr BPFPtr; std::unique_ptr CodeCompletionPtr; std::unique_ptr CUDAPtr; + std::unique_ptr DirectXPtr; std::unique_ptr HLSLPtr; std::unique_ptr HexagonPtr; std::unique_ptr LoongArchPtr; diff --git a/clang/include/clang/Sema/SemaDirectX.h b/clang/include/clang/Sema/SemaDirectX.h new file mode 100644 index 0000000000000..0d78fb4a06f42 --- /dev/null +++ b/clang/include/clang/Sema/SemaDirectX.h @@ -0,0 +1,28 @@ +//===----- SemaDirectX.h ----- Semantic Analysis for DirectX constructs----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares semantic analysis for DirectX constructs. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_SEMA_SEMADIRECTX_H +#define LLVM_CLANG_SEMA_SEMADIRECTX_H + +#include "clang/AST/ASTFwd.h" +#include "clang/Sema/SemaBase.h" + +namespace clang { +class SemaDirectX : public SemaBase { +public: + SemaDirectX(Sema &S); + + bool CheckDirectXBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); +}; +} // namespace clang + +#endif // LLVM_CLANG_SEMA_SEMADIRECTX_H diff --git a/clang/lib/Basic/Targets/DirectX.cpp b/clang/lib/Basic/Targets/DirectX.cpp index 0dd27e6e93b33..ef8998dab0840 100644 --- a/clang/lib/Basic/Targets/DirectX.cpp +++ b/clang/lib/Basic/Targets/DirectX.cpp @@ -12,11 +12,31 @@ #include "DirectX.h" #include "Targets.h" +#include "clang/Basic/TargetBuiltins.h" using namespace clang; using namespace clang::targets; +static constexpr int NumBuiltins = + clang::DirectX::LastTSBuiltin - Builtin::FirstTSBuiltin; + +#define GET_BUILTIN_STR_TABLE +#include "clang/Basic/BuiltinsDirectX.inc" +#undef GET_BUILTIN_STR_TABLE + +static constexpr Builtin::Info BuiltinInfos[] = { +#define GET_BUILTIN_INFOS +#include "clang/Basic/BuiltinsDirectX.inc" +#undef GET_BUILTIN_INFOS +}; +static_assert(std::size(BuiltinInfos) == NumBuiltins); + void DirectXTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { DefineStd(Builder, "DIRECTX", Opts); } + +llvm::SmallVector +DirectXTargetInfo::getTargetBuiltins() const { + return {{&BuiltinStrings, BuiltinInfos}}; +} diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h index 6e3ddad626341..0e88a37e32493 100644 --- a/clang/lib/Basic/Targets/DirectX.h +++ b/clang/lib/Basic/Targets/DirectX.h @@ -73,9 +73,7 @@ class LLVM_LIBRARY_VISIBILITY DirectXTargetInfo : public TargetInfo { return Feature == "directx"; } - llvm::SmallVector getTargetBuiltins() const override { - return {}; - } + llvm::SmallVector getTargetBuiltins() const override; std::string_view getClobbers() const override { return ""; } diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 310addebd50e9..fe55dfffc1cbe 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -70,6 +70,8 @@ static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF, case llvm::Triple::bpfeb: case llvm::Triple::bpfel: return CGF->EmitBPFBuiltinExpr(BuiltinID, E); + case llvm::Triple::dxil: + return CGF->EmitDirectXBuiltinExpr(BuiltinID, E); case llvm::Triple::x86: case llvm::Triple::x86_64: return CGF->EmitX86BuiltinExpr(BuiltinID, E); diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 27d1c69439944..99c62808c323d 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -380,18 +380,6 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, getDotProductIntrinsic(CGM.getHLSLRuntime(), VecTy0->getElementType()), ArrayRef{Op0, Op1}, nullptr, "hlsl.dot"); } - case Builtin::BI__builtin_hlsl_dot2add: { - assert(CGM.getTarget().getTriple().getArch() == llvm::Triple::dxil && - "Intrinsic dot2add is only allowed for dxil architecture"); - Value *A = EmitScalarExpr(E->getArg(0)); - Value *B = EmitScalarExpr(E->getArg(1)); - Value *C = EmitScalarExpr(E->getArg(2)); - - Intrinsic::ID ID = llvm ::Intrinsic::dx_dot2add; - return Builder.CreateIntrinsic( - /*ReturnType=*/C->getType(), ID, ArrayRef{A, B, C}, nullptr, - "dx.dot2add"); - } case Builtin::BI__builtin_hlsl_dot4add_i8packed: { Value *A = 
EmitScalarExpr(E->getArg(0)); Value *B = EmitScalarExpr(E->getArg(1)); diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt index ebe2fbd7db295..dc5b2a35583b4 100644 --- a/clang/lib/CodeGen/CMakeLists.txt +++ b/clang/lib/CodeGen/CMakeLists.txt @@ -118,6 +118,7 @@ add_clang_library(clangCodeGen SwiftCallingConv.cpp TargetBuiltins/ARM.cpp TargetBuiltins/AMDGPU.cpp + TargetBuiltins/DirectX.cpp TargetBuiltins/Hexagon.cpp TargetBuiltins/NVPTX.cpp TargetBuiltins/PPC.cpp diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index af9798b30fbcf..34dee6df9dcfc 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4809,6 +4809,7 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitAMDGPUBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitHLSLBuiltinExpr(unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue); + llvm::Value *EmitDirectXBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitSPIRVBuiltinExpr(unsigned BuiltinID, const CallExpr *E); llvm::Value *EmitScalarOrConstFoldImmArg(unsigned ICEArguments, unsigned Idx, const CallExpr *E); diff --git a/clang/lib/CodeGen/TargetBuiltins/DirectX.cpp b/clang/lib/CodeGen/TargetBuiltins/DirectX.cpp new file mode 100644 index 0000000000000..202601e257036 --- /dev/null +++ b/clang/lib/CodeGen/TargetBuiltins/DirectX.cpp @@ -0,0 +1,37 @@ +//===--------- DirectX.cpp - Emit LLVM Code for builtins ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit Builtin calls as LLVM code. 
+// +//===----------------------------------------------------------------------===// + +#include "CGHLSLRuntime.h" +#include "CodeGenFunction.h" +#include "clang/Basic/TargetBuiltins.h" +#include "llvm/IR/Intrinsics.h" + +using namespace clang; +using namespace CodeGen; +using namespace llvm; + +Value *CodeGenFunction::EmitDirectXBuiltinExpr(unsigned BuiltinID, + const CallExpr *E) { + switch (BuiltinID) { + case DirectX::BI__builtin_dx_dot2add: { + Value *A = EmitScalarExpr(E->getArg(0)); + Value *B = EmitScalarExpr(E->getArg(1)); + Value *C = EmitScalarExpr(E->getArg(2)); + + Intrinsic::ID ID = llvm ::Intrinsic::dx_dot2add; + return Builder.CreateIntrinsic( + /*ReturnType=*/C->getType(), ID, ArrayRef{A, B, C}, nullptr, + "dx.dot2add"); + } + } + return nullptr; +} diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h index 3c15f2b38d80f..a8f025b1b5f5f 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h @@ -46,8 +46,8 @@ distance_vec_impl(vector X, vector Y) { } constexpr float dot2add_impl(half2 a, half2 b, float c) { -#if defined(__DIRECTX__) - return __builtin_hlsl_dot2add(a, b, c); +#if (__has_builtin(__builtin_dx_dot2add)) + return __builtin_dx_dot2add(a, b, c); #else return dot(a, b) + c; #endif diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt index d3fe80f659f69..cc7921fc32254 100644 --- a/clang/lib/Sema/CMakeLists.txt +++ b/clang/lib/Sema/CMakeLists.txt @@ -47,6 +47,7 @@ add_clang_library(clangSema SemaConsumer.cpp SemaCoroutine.cpp SemaCUDA.cpp + SemaDirectX.cpp SemaDecl.cpp SemaDeclAttr.cpp SemaDeclCXX.cpp diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 64f5633f380ec..32d7744be9229 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -47,6 +47,7 @@ #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaCodeCompletion.h" #include "clang/Sema/SemaConsumer.h" +#include "clang/Sema/SemaDirectX.h" #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaHexagon.h" #include "clang/Sema/SemaLoongArch.h" @@ -226,6 +227,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, CodeCompletionPtr( std::make_unique(*this, CodeCompleter)), CUDAPtr(std::make_unique(*this)), + DirectXPtr(std::make_unique(*this)), HLSLPtr(std::make_unique(*this)), HexagonPtr(std::make_unique(*this)), LoongArchPtr(std::make_unique(*this)), diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index c21475ee69d9e..bffd0dd461d3d 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -61,6 +61,7 @@ #include "clang/Sema/SemaAMDGPU.h" #include "clang/Sema/SemaARM.h" #include "clang/Sema/SemaBPF.h" +#include "clang/Sema/SemaDirectX.h" #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaHexagon.h" #include "clang/Sema/SemaLoongArch.h" @@ -1930,6 +1931,8 @@ bool Sema::CheckTSBuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case llvm::Triple::bpfeb: case llvm::Triple::bpfel: return BPF().CheckBPFBuiltinFunctionCall(BuiltinID, TheCall); + case llvm::Triple::dxil: + return DirectX().CheckDirectXBuiltinFunctionCall(BuiltinID, TheCall); case llvm::Triple::hexagon: return Hexagon().CheckHexagonBuiltinFunctionCall(BuiltinID, TheCall); case llvm::Triple::mips: diff --git a/clang/lib/Sema/SemaDirectX.cpp b/clang/lib/Sema/SemaDirectX.cpp new file mode 100644 index 0000000000000..eaac24cdc710e --- /dev/null +++ b/clang/lib/Sema/SemaDirectX.cpp @@ -0,0 
+1,23 @@ +//===- SemaDirectX.cpp - Semantic Analysis for DirectX constructs----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This implements Semantic Analysis for DirectX constructs. +//===----------------------------------------------------------------------===// + +#include "clang/Sema/SemaDirectX.h" +#include "clang/Basic/TargetBuiltins.h" +#include "clang/Sema/Sema.h" + +namespace clang { + +SemaDirectX::SemaDirectX(Sema &S) : SemaBase(S) {} + +bool SemaDirectX::CheckDirectXBuiltinFunctionCall(unsigned BuiltinID, + CallExpr *TheCall) { + return false; +} +} // namespace clang diff --git a/clang/test/CodeGenDirectX/Builtins/dot2add.c b/clang/test/CodeGenDirectX/Builtins/dot2add.c new file mode 100644 index 0000000000000..181f61fea1544 --- /dev/null +++ b/clang/test/CodeGenDirectX/Builtins/dot2add.c @@ -0,0 +1,23 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -emit-llvm -o - | FileCheck %s + +typedef _Float16 half; +typedef half half2 __attribute__((ext_vector_type(2))); + +// CHECK-LABEL: define float @test_dot2add( +// CHECK-SAME: <2 x half> noundef [[X:%.*]], <2 x half> noundef [[Y:%.*]], float noundef [[Z:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X_ADDR:%.*]] = alloca <2 x half>, align 4 +// CHECK-NEXT: [[Y_ADDR:%.*]] = alloca <2 x half>, align 4 +// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: store <2 x half> [[X]], ptr [[X_ADDR]], align 4 +// CHECK-NEXT: store <2 x half> [[Y]], ptr [[Y_ADDR]], align 4 +// CHECK-NEXT: store float [[Z]], ptr [[Z_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x half>, ptr [[X_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[Y_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[Z_ADDR]], align 4 +// CHECK-NEXT: [[DX_DOT2ADD:%.*]] = call float @llvm.dx.dot2add.v2f16(<2 x half> [[TMP0]], <2 x half> [[TMP1]], float [[TMP2]]) +// CHECK-NEXT: ret float [[DX_DOT2ADD]] +// +float test_dot2add(half2 X, half2 Y, float Z) { return __builtin_dx_dot2add(X, Y, Z); } From efce8f1ce680b86af48c49edda2c3cbb370a6ba0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Apr 2025 17:09:18 +0100 Subject: [PATCH 0861/1029] [X86] combineX86ShufflesRecursively - use enumerate to iterate over shuffle operands. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d720aadb2e81d..56b0f721383f1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41117,8 +41117,7 @@ static SDValue combineX86ShufflesRecursively( // Peek through vector widenings and set out of bounds mask indices to undef. // TODO: Can resolveTargetShuffleInputsAndMask do some of this? 
-  for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
-    SDValue &Op = Ops[I];
+  for (auto [I, Op] : enumerate(Ops)) {
     if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
         isNullConstant(Op.getOperand(2))) {
       Op = Op.getOperand(1);

From 0a35dd7e99596b1a0316bb81bee3c698adaf0a05 Mon Sep 17 00:00:00 2001
From: Linux User
Date: Mon, 7 Apr 2025 16:12:50 +0000
Subject: [PATCH 0862/1029] [llvm-ar] --help: fix unquoted angle bracket (#101364)

Changes the argument in llvm-ar help message from `-M [<mri-script]` to
`-M [<mri-script>]`.
---
 llvm/tools/llvm-ar/llvm-ar.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp
--- a/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/llvm/tools/llvm-ar/llvm-ar.cpp
          << "USAGE: " + ToolName + " [options] [-]<operation>[modifiers] [relpos] "
             "[count] <archive> [files]\n"
-         << "       " + ToolName + " -M [<mri-script]\n\n";
+         << "       " + ToolName + " -M [<mri-script>]\n\n";

From: Mark de Wever
Date: Mon, 7 Apr 2025 18:17:11 +0200
Subject: [PATCH 0863/1029] [libc++][ci] Removes C++2b selection option. (#88557)

Since Clang 16 is no longer supported, all compilers support C++23.
---
 libcxx/utils/libcxx/test/params.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 58ace0ba96e35..fc34009d0a551 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -83,12 +83,6 @@ def getStdFlag(cfg, std):
     if hasCompileFlag(cfg, "-std=" + std):
         return "-std=" + std

-    # TODO(LLVM-19) Remove the fallbacks needed for Clang 16.
-    fallbacks = {
-        "c++23": "c++2b",
-    }
-    if std in fallbacks and hasCompileFlag(cfg, "-std=" + fallbacks[std]):
-        return "-std=" + fallbacks[std]
     return None

From 01ec74dfd0db307a3b67cc67448269231cd2e83c Mon Sep 17 00:00:00 2001
From: Leandro Lupori
Date: Mon, 7 Apr 2025 13:18:07 -0300
Subject: [PATCH 0864/1029] [flang][OpenMP] Fix copyprivate of procedure pointers (#134292)

Just modify the assert to consider fir::BoxProcType as valid.
No other changes are needed.

Fixes #131549
---
 flang/lib/Lower/OpenMP/ClauseProcessor.cpp |  5 +--
 flang/test/Lower/OpenMP/copyprivate3.f90   | 42 ++++++++++++++++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/copyprivate3.f90

diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 12ac6b3285575..46febd33f0ce8 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -695,9 +695,10 @@ void TypeInfo::typeScan(mlir::Type ty) {
   } else if (auto pty = mlir::dyn_cast(ty)) {
     typeScan(pty.getEleTy());
   } else {
-    // The scan ends when reaching any built-in or record type.
+    // The scan ends when reaching any built-in, record or boxproc type.
     assert(ty.isIntOrIndexOrFloat() || mlir::isa(ty) ||
-           mlir::isa(ty) || mlir::isa(ty));
+           mlir::isa(ty) || mlir::isa(ty) ||
+           mlir::isa<fir::BoxProcType>(ty));
   }
 }

diff --git a/flang/test/Lower/OpenMP/copyprivate3.f90 b/flang/test/Lower/OpenMP/copyprivate3.f90
new file mode 100644
index 0000000000000..13926e45f1948
--- /dev/null
+++ b/flang/test/Lower/OpenMP/copyprivate3.f90
@@ -0,0 +1,42 @@
+! Test lowering of COPYPRIVATE with procedure pointers.
RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +!CHICK-SAME: %arg0: [[TYPE:!fir.ref i32>>>]], + +!CHECK-LABEL: func.func private @_copy_boxproc_i32_args( +!CHECK-SAME: %arg0: [[TYPE:!fir.ref i32>>]], +!CHECK-SAME: %arg1: [[TYPE]]) +!CHECK: %[[DST:.*]]:2 = hlfir.declare %arg0 {{.*}} : ([[TYPE]]) -> ([[TYPE]], [[TYPE]]) +!CHECK: %[[SRC:.*]]:2 = hlfir.declare %arg1 {{.*}} : ([[TYPE]]) -> ([[TYPE]], [[TYPE]]) +!CHECK: %[[TEMP:.*]] = fir.load %[[SRC]]#0 : [[TYPE]] +!CHECK: fir.store %[[TEMP]] to %[[DST]]#0 : [[TYPE]] +!CHECK: return + +!CHECK-LABEL: func @_QPtest_proc_ptr +!CHECK: omp.parallel +!CHECK: omp.single copyprivate(%{{.*}}#0 -> @_copy_boxproc_i32_args : !fir.ref i32>>) +subroutine test_proc_ptr() + interface + function sub1() bind(c) result(ret) + use, intrinsic :: iso_c_binding + integer(c_int) :: ret + end function sub1 + end interface + + procedure(sub1), pointer, save, bind(c) :: ffunptr + !$omp threadprivate(ffunptr) + + !$omp parallel + ffunptr => sub1 + !$omp single + ffunptr => sub1 + !$omp end single copyprivate(ffunptr) + if (ffunptr() /= 1) print *, 'err' + !$omp end parallel +end subroutine + +function sub1() bind(c) result(ret) + use, intrinsic::iso_c_binding + integer(c_int) :: ret + ret = 1 +end function sub1 From a406fb8f5a6a65144e5c2564c17ac9ee29da43ed Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Mon, 7 Apr 2025 18:19:58 +0200 Subject: [PATCH 0865/1029] [libc++] Clang-tidy operator& hijacker. (#128366) Guards against introducing new places where operator& depends on a template type. --- libcxx/test/libcxx/clang_tidy.gen.py | 4 ++ .../tools/clang_tidy_checks/CMakeLists.txt | 1 + .../tools/clang_tidy_checks/libcpp_module.cpp | 3 + .../robust_against_operator_ampersand.cpp | 56 +++++++++++++++++++ .../robust_against_operator_ampersand.hpp | 18 ++++++ 5 files changed, 82 insertions(+) create mode 100644 libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.cpp create mode 100644 libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.hpp diff --git a/libcxx/test/libcxx/clang_tidy.gen.py b/libcxx/test/libcxx/clang_tidy.gen.py index f1135749febe4..dbab2875e3126 100644 --- a/libcxx/test/libcxx/clang_tidy.gen.py +++ b/libcxx/test/libcxx/clang_tidy.gen.py @@ -6,6 +6,7 @@ # # ===----------------------------------------------------------------------===## + # Run our custom libc++ clang-tidy checks on all public headers. # RUN: %{python} %s %{libcxx-dir}/utils @@ -23,6 +24,9 @@ // REQUIRES: has-clang-tidy +// The frozen headers should not be updated to the latest libc++ style, so don't test. +// UNSUPPORTED: FROZEN-CXX03-HEADERS-FIXME + // The GCC compiler flags are not always compatible with clang-tidy. 
// UNSUPPORTED: gcc
diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 5797a32974820..521a60c0fc498 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -96,6 +96,7 @@ set(SOURCES
     nodebug_on_aliases.cpp
     proper_version_checks.cpp
     robust_against_adl.cpp
+    robust_against_operator_ampersand.cpp
    uglify_attributes.cpp
 
     libcpp_module.cpp
diff --git a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
index 32a33dddad632..2cf39e2b626f8 100644
--- a/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
+++ b/libcxx/test/tools/clang_tidy_checks/libcpp_module.cpp
@@ -16,6 +16,7 @@
 #include "nodebug_on_aliases.hpp"
 #include "proper_version_checks.hpp"
 #include "robust_against_adl.hpp"
+#include "robust_against_operator_ampersand.hpp"
 #include "uglify_attributes.hpp"
 
 namespace {
@@ -29,6 +30,8 @@ class LibcxxTestModule : public clang::tidy::ClangTidyModule {
     check_factories.registerCheck<libcpp::nodebug_on_aliases>("libcpp-nodebug-on-aliases");
     check_factories.registerCheck<libcpp::proper_version_checks>("libcpp-cpp-version-check");
     check_factories.registerCheck<libcpp::robust_against_adl>("libcpp-robust-against-adl");
+    check_factories.registerCheck<libcpp::robust_against_operator_ampersand>(
+        "libcpp-robust-against-operator-ampersand");
     check_factories.registerCheck<libcpp::uglify_attributes>("libcpp-uglify-attributes");
   }
 };
diff --git a/libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.cpp b/libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.cpp
new file mode 100644
index 0000000000000..a608954aa2cc7
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-tidy/ClangTidyCheck.h"
+#include "clang-tidy/ClangTidyModuleRegistry.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Tooling/FixIt.h"
+
+#include "robust_against_operator_ampersand.hpp"
+
+// This clang-tidy check ensures that we don't use operator& on dependent
+// types. If the type is user-supplied, it may call the type's operator&.
+// Instead use std::addressof.
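+//
+// A hedged illustration (added in this write-up, not part of the original
+// patch): for a dependent expression `t` of type T, `&t` may invoke a
+// user-provided T::operator&, while std::addressof(t) always yields the
+// object's address:
+//
+//   template <class T>
+//   void f(T& t) {
+//     T* p1 = &t;                 // may call a hijacking T::operator&
+//     T* p2 = std::addressof(t);  // always the real address
+//   }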
+
+// This is part of libc++'s policy
+// https://libcxx.llvm.org/CodingGuidelines.html#don-t-use-argument-dependent-lookup-unless-required-by-the-standard
+
+// TODO(LLVM-21) Remove dependentScopeDeclRefExpr
+// dependentScopeDeclRefExpr requires Clang 20; this uses the same definition as Clang.
+#if defined(__clang_major__) && __clang_major__ < 20
+namespace clang::ast_matchers {
+const internal::VariadicDynCastAllOfMatcher<Stmt, DependentScopeDeclRefExpr> dependentScopeDeclRefExpr;
+} // namespace clang::ast_matchers
+#endif
+
+namespace libcpp {
+robust_against_operator_ampersand::robust_against_operator_ampersand(
+    llvm::StringRef name, clang::tidy::ClangTidyContext* context)
+    : clang::tidy::ClangTidyCheck(name, context) {}
+
+void robust_against_operator_ampersand::registerMatchers(clang::ast_matchers::MatchFinder* finder) {
+  using namespace clang::ast_matchers;
+  finder->addMatcher(
+      cxxOperatorCallExpr(allOf(hasOperatorName("&"), argumentCountIs(1), isTypeDependent()),
+                          unless(hasUnaryOperand(dependentScopeDeclRefExpr())))
+          .bind("match"),
+      this);
+}
+
+void robust_against_operator_ampersand::check(const clang::ast_matchers::MatchFinder::MatchResult& result) {
+  if (const auto* call = result.Nodes.getNodeAs< clang::CXXOperatorCallExpr >("match"); call != nullptr) {
+    diag(call->getBeginLoc(), "Guard against user provided operator& for dependent types.")
+        << clang::FixItHint::CreateReplacement(
+               call->getSourceRange(),
+               (llvm::Twine(
+                    "std::addressof(" + clang::tooling::fixit::getText(*call->getArg(0), *result.Context) + ")"))
+                   .str());
+  }
+}
+
+} // namespace libcpp
diff --git a/libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.hpp b/libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.hpp
new file mode 100644
index 0000000000000..5cdc0baca5c23
--- /dev/null
+++ b/libcxx/test/tools/clang_tidy_checks/robust_against_operator_ampersand.hpp
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang-tidy/ClangTidyCheck.h"
+
+namespace libcpp {
+class robust_against_operator_ampersand : public clang::tidy::ClangTidyCheck {
+public:
+  robust_against_operator_ampersand(llvm::StringRef, clang::tidy::ClangTidyContext*);
+  void registerMatchers(clang::ast_matchers::MatchFinder*) override;
+  void check(const clang::ast_matchers::MatchFinder::MatchResult&) override;
+};
+} // namespace libcpp

From f7cc213d58df01fc630500c5c5ebe48b5577653d Mon Sep 17 00:00:00 2001
From: Lee Wei
Date: Mon, 7 Apr 2025 10:22:47 -0600
Subject: [PATCH 0866/1029] [ConstraintSystem] Update comments (#127351)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It took me some time to fully understand the implementation of
Fourier–Motzkin elimination in the Constraint System, so I added an
example in the comments. Hopefully future developers can understand the
algorithm more easily with the example.
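To make the combination step concrete, here is a small standalone sketch
(a hypothetical helper written for this note, not code from the patch);
each row {c0, c1, ..., cn} encodes the constraint c0 >= c1*x1 + ... + cn*xn:

    #include <cstdint>
    #include <vector>

    // Combine a lower-bound row (negative last coefficient) with an
    // upper-bound row (positive last coefficient) so that the last
    // tracked variable cancels to 0.
    std::vector<int64_t> combineFM(const std::vector<int64_t> &Lower,
                                   const std::vector<int64_t> &Upper) {
      std::vector<int64_t> N(Lower.size());
      for (size_t I = 0; I < Lower.size(); ++I)
        N[I] = Upper[I] * -Lower.back() + Lower[I] * Upper.back();
      return N; // last entry becomes 0: the variable is eliminated
    }

With Lower = {0, 1, -2} and Upper = {3, 2, 3}, this yields {6, 7, 0},
i.e. 6 >= 7 * x, matching the worked example in the comments below.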
---
 llvm/lib/Analysis/ConstraintSystem.cpp        | 26 ++++++++++++++++++-
 .../Scalar/ConstraintElimination.cpp          |  2 +-
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp
index 7216a0219080f..6457b6b425c6c 100644
--- a/llvm/lib/Analysis/ConstraintSystem.cpp
+++ b/llvm/lib/Analysis/ConstraintSystem.cpp
@@ -8,10 +8,10 @@
 
 #include "llvm/Analysis/ConstraintSystem.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 
 #include <string>
 
@@ -52,6 +52,12 @@ bool ConstraintSystem::eliminateUsingFM() {
   for (unsigned R1 = 0; R1 < NumRemainingConstraints; R1++) {
     // FIXME do not use copy
     for (unsigned R2 = R1 + 1; R2 < NumRemainingConstraints; R2++) {
+      // Example of constraints stored as {Constant, Coeff_x, Coeff_y}:
+      // R1: 0 >= 1 * x + (-2) * y => { 0, 1, -2 }
+      // R2: 3 >= 2 * x + 3 * y => { 3, 2, 3 }
+      // LastIdx = 2 (tracking the coefficient of y)
+      // UpperLast: 3
+      // LowerLast: -2
       int64_t UpperLast = getLastCoefficient(RemainingRows[R2], LastIdx);
       int64_t LowerLast = getLastCoefficient(RemainingRows[R1], LastIdx);
       assert(
@@ -73,10 +79,13 @@ bool ConstraintSystem::eliminateUsingFM() {
       unsigned IdxLower = 0;
       auto &LowerRow = RemainingRows[LowerR];
       auto &UpperRow = RemainingRows[UpperR];
+      // Update the constant and coefficients of both constraints.
+      // Stop once every coefficient has been updated, or bail out on
+      // overflow.
       while (true) {
         if (IdxUpper >= UpperRow.size() || IdxLower >= LowerRow.size())
           break;
         int64_t M1, M2, N;
+        // Start at index 0 and update every coefficient.
         int64_t UpperV = 0;
         int64_t LowerV = 0;
         uint16_t CurrentId = std::numeric_limits<uint16_t>::max();
@@ -101,8 +110,23 @@ bool ConstraintSystem::eliminateUsingFM() {
         if (MulOverflow(LowerV, UpperLast, M2))
           return false;
 
+        // This algorithm is a variant of sparse Gaussian elimination.
+        //
+        // The new coefficient for CurrentId is
+        // N = UpperV * (-1) * LowerLast + LowerV * UpperLast
+        //
+        // UpperRow: { 3, 2, 3 }, LowerLast: -2
+        // LowerRow: { 0, 1, -2 }, UpperLast: 3
+        //
+        // After multiplication:
+        // UpperRow: { 6, 4, 6 }
+        // LowerRow: { 0, 3, -6 }
+        //
+        // Addition eliminates y:
+        // N: { 6, 7, 0 } => 6 >= 7 * x
         if (AddOverflow(M1, M2, N))
           return false;
+        // Skip variables that are completely eliminated.
         if (N == 0)
           continue;
         NR.emplace_back(N, CurrentId);
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 267eb319a5616..456f5086309cf 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -238,7 +238,7 @@ struct ConstraintTy {
   unsigned empty() const { return Coefficients.empty(); }
 
   /// Returns true if all preconditions for this list of constraints are
-  /// satisfied given \p CS and the corresponding \p Value2Index mapping.
+  /// satisfied given \p Info.
   bool isValid(const ConstraintInfo &Info) const;
 
   bool isEq() const { return IsEq; }

From f413772b318c2da4dfc488df81cb3c458606301f Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 7 Apr 2025 07:52:32 -0700
Subject: [PATCH 0867/1029] [SLP] Fix last instruction selection for vectorized
 last instruction in SplitVectorize nodes

If the last instruction in the SplitVectorize node is vectorized and
scheduled as part of some bundles, the SplitVectorize node might be
placed in the wrong order, leading to a compiler crash.

We need to check whether the vectorized node has a vector value and
place the SplitVectorize node after the vector instruction to prevent a
compiler crash.

Fixes issue reported in https://github.com/llvm/llvm-project/pull/133091#issuecomment-2782826805
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 12 ++-
 .../X86/split-node-last-inst-vectorized.ll    | 99 +++++++++++++++++++
 2 files changed, 109 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e2031df810573..e9ba944924837 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -15412,12 +15412,20 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
 
   if (E->State == TreeEntry::SplitVectorize) {
     Res = FindLastInst();
+    if (ArrayRef<TreeEntry *> Entries = getTreeEntries(Res); !Entries.empty()) {
+      for (auto *E : Entries) {
+        auto *I = dyn_cast_or_null<Instruction>(E->VectorizedValue);
+        if (!I)
+          I = &getLastInstructionInBundle(E);
+        if (Res->comesBefore(I))
+          Res = I;
+      }
+    }
     return *Res;
   }
 
   // Set insertpoint for gathered loads to the very first load.
-  if (E->State != TreeEntry::SplitVectorize &&
-      GatheredLoadsEntriesFirst.has_value() &&
+  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load) {
     Res = FindFirstInst();
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll
new file mode 100644
index 0000000000000..c3da2aad4c869
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(ptr %0, <8 x i8> %1) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], <8 x i8> [[TMP1:%.*]]) {
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13436
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13536
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13437
+; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i8>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP12:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP11]], <8 x i8> [[TMP10]], i64 8)
+; CHECK-NEXT:    [[TMP13:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[TMP7]], i64 0)
+; CHECK-NEXT:    [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP15]], <8 x i8> [[TMP14]], i64 8)
+; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i8> [[TMP16]], [[TMP12]]
+; CHECK-NEXT:    store <16 x i8> [[TMP17]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    ret void
+;
+  %3
= load i8, ptr %0, align 2 + %4 = getelementptr i8, ptr %0, i64 13442 + %5 = load i8, ptr %4, align 2 + %6 = or i8 %5, %3 + %7 = getelementptr i8, ptr %0, i64 13550 + store i8 %6, ptr %7, align 2 + %8 = extractelement <8 x i8> %1, i64 0 + %9 = or i8 %5, %8 + %10 = getelementptr i8, ptr %0, i64 13542 + store i8 %9, ptr %10, align 2 + %11 = getelementptr i8, ptr %0, i64 13438 + %12 = load i8, ptr %11, align 2 + %13 = or i8 %12, %3 + %14 = getelementptr i8, ptr %0, i64 13546 + store i8 %13, ptr %14, align 2 + %15 = extractelement <8 x i8> %1, i64 2 + %16 = or i8 %12, %15 + %17 = getelementptr i8, ptr %0, i64 13538 + store i8 %16, ptr %17, align 2 + %18 = getelementptr i8, ptr %0, i64 13440 + %19 = load i8, ptr %18, align 4 + %20 = or i8 %19, %3 + %21 = getelementptr i8, ptr %0, i64 13548 + store i8 %20, ptr %21, align 4 + %22 = extractelement <8 x i8> %1, i64 4 + %23 = or i8 %19, %22 + %24 = getelementptr i8, ptr %0, i64 13540 + store i8 %23, ptr %24, align 4 + %25 = getelementptr i8, ptr %0, i64 13436 + %26 = load i8, ptr %25, align 4 + %27 = getelementptr i8, ptr %0, i64 13444 + %28 = load i8, ptr %27, align 4 + %29 = or i8 %28, %26 + %30 = getelementptr i8, ptr %0, i64 13544 + store i8 %29, ptr %30, align 4 + %31 = or i8 %26, %8 + %32 = getelementptr i8, ptr %0, i64 13536 + store i8 %31, ptr %32, align 4 + %33 = getelementptr i8, ptr %0, i64 13443 + %34 = load i8, ptr %33, align 1 + %35 = or i8 %34, %3 + %36 = getelementptr i8, ptr %0, i64 13551 + store i8 %35, ptr %36, align 1 + %37 = extractelement <8 x i8> %1, i64 7 + %38 = or i8 %34, %37 + %39 = getelementptr i8, ptr %0, i64 13543 + store i8 %38, ptr %39, align 1 + %40 = getelementptr i8, ptr %0, i64 13439 + %41 = load i8, ptr %40, align 1 + %42 = or i8 %41, %3 + %43 = getelementptr i8, ptr %0, i64 13547 + store i8 %42, ptr %43, align 1 + %44 = extractelement <8 x i8> %1, i64 3 + %45 = or i8 %41, %44 + %46 = getelementptr i8, ptr %0, i64 13539 + store i8 %45, ptr %46, align 1 + %47 = getelementptr i8, ptr %0, i64 13441 + %48 = load i8, ptr %47, align 1 + %49 = or i8 %48, %3 + %50 = getelementptr i8, ptr %0, i64 13549 + store i8 %49, ptr %50, align 1 + %51 = extractelement <8 x i8> %1, i64 5 + %52 = or i8 %48, %51 + %53 = getelementptr i8, ptr %0, i64 13541 + store i8 %52, ptr %53, align 1 + %54 = getelementptr i8, ptr %0, i64 13437 + %55 = load i8, ptr %54, align 1 + %56 = or i8 %55, %3 + %57 = getelementptr i8, ptr %0, i64 13545 + store i8 %56, ptr %57, align 1 + %58 = or i8 %55, %8 + %59 = getelementptr i8, ptr %0, i64 13537 + store i8 %58, ptr %59, align 1 + ret void +} From 600eeed51f538adc5f43c8223a57608e73aba31f Mon Sep 17 00:00:00 2001 From: Henry Jiang Date: Mon, 7 Apr 2025 12:32:02 -0400 Subject: [PATCH 0868/1029] [JITLink][NFC] Guard functions used only for debug for `XCOFFLinkGraphBuilder` (#134413) --- llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp index 243ee37886026..13099295eddeb 100644 --- a/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/XCOFFLinkGraphBuilder.cpp @@ -40,6 +40,7 @@ XCOFFLinkGraphBuilder::XCOFFLinkGraphBuilder( std::string(Obj.getFileName()), std::move(SSP), std::move(TT), std::move(Features), std::move(GetEdgeKindName))) {} +#ifndef NDEBUG static llvm::StringRef getStorageClassString(XCOFF::StorageClass SC) { switch (SC) { case XCOFF::StorageClass::C_FILE: @@ -145,6 
+146,7 @@ static llvm::StringRef getStorageClassString(XCOFF::StorageClass SC) { } llvm_unreachable("Unknown XCOFF::StorageClass enum"); } +#endif Error XCOFFLinkGraphBuilder::processSections() { LLVM_DEBUG(dbgs() << " Creating graph sections...\n"); @@ -204,6 +206,7 @@ getXCOFFSymbolContainingSymbolRef(const object::XCOFFObjectFile &Obj, return object::XCOFFSymbolRef(DRI, &Obj); } +#ifndef NDEBUG static void printSymbolEntry(raw_ostream &OS, const object::XCOFFObjectFile &Obj, const object::XCOFFSymbolRef &Sym) { @@ -232,6 +235,7 @@ static void printSymbolEntry(raw_ostream &OS, } OS << "\n"; } +#endif Error XCOFFLinkGraphBuilder::processCsectsAndSymbols() { LLVM_DEBUG(dbgs() << " Creating graph blocks and symbols...\n"); From 112af8796451e19b43d7a0788039a27b08802974 Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Mon, 7 Apr 2025 09:43:46 -0700 Subject: [PATCH 0869/1029] [mlir][tosa] Add parenthesis for the weight padding calculation (#134420) Minor change. Signed-off-by: Jerry Ge --- mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp index 50d2202f68a87..ea6ac981b53cc 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaDecomposeTransposeConv.cpp @@ -129,7 +129,7 @@ class TransposeConvStridedConverter weightPadding[3] = (weightHeight % stride[0]) ? (stride[0] - weightHeight % stride[0]) : 0; weightPadding[5] = - weightWidth % stride[1] ? stride[1] - weightWidth % stride[1] : 0; + (weightWidth % stride[1]) ? (stride[1] - weightWidth % stride[1]) : 0; Value weightPaddingVal = getTosaConstShape(rewriter, op->getLoc(), weightPadding); From 1c8291ffc4d08ca6be915b880d163bfcf91b0c48 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Mon, 7 Apr 2025 23:45:19 +0700 Subject: [PATCH 0870/1029] RISCV: Convert test to opaque pointers --- llvm/test/CodeGen/RISCV/xcvmem.ll | 176 +++++++++++++++--------------- 1 file changed, 88 insertions(+), 88 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/xcvmem.ll b/llvm/test/CodeGen/RISCV/xcvmem.ll index 037e49b9b0df7..6dbebf4fb9588 100644 --- a/llvm/test/CodeGen/RISCV/xcvmem.ll +++ b/llvm/test/CodeGen/RISCV/xcvmem.ll @@ -2,294 +2,294 @@ ; RUN: llc -O3 -mtriple=riscv32 -mattr=+xcvmem -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK -define <2 x i32> @lb_ri_inc(i8* %a) { +define <2 x i32> @lb_ri_inc(ptr %a) { ; CHECK-LABEL: lb_ri_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lb a1, (a0), 42 ; CHECK-NEXT: ret - %1 = load i8, i8* %a + %1 = load i8, ptr %a %2 = sext i8 %1 to i32 - %3 = getelementptr i8, i8* %a, i32 42 - %4 = ptrtoint i8* %3 to i32 + %3 = getelementptr i8, ptr %a, i32 42 + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x i32> %6 } -define <2 x i32> @lb_rr_inc(i8* %a, i32 %b) { +define <2 x i32> @lb_rr_inc(ptr %a, i32 %b) { ; CHECK-LABEL: lb_rr_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lb a1, (a0), a1 ; CHECK-NEXT: ret - %1 = load i8, i8* %a + %1 = load i8, ptr %a %2 = sext i8 %1 to i32 - %3 = getelementptr i8, i8* %a, i32 %b - %4 = ptrtoint i8* %3 to i32 + %3 = getelementptr i8, ptr %a, i32 %b + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x i32> %6 } -define i32 @lb_rr(i8* %a, i32 %b) { +define 
i32 @lb_rr(ptr %a, i32 %b) { ; CHECK-LABEL: lb_rr: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lb a0, a1(a0) ; CHECK-NEXT: ret - %1 = getelementptr i8, i8* %a, i32 %b - %2 = load i8, i8* %1 + %1 = getelementptr i8, ptr %a, i32 %b + %2 = load i8, ptr %1 %3 = sext i8 %2 to i32 ret i32 %3 } -define <2 x i32> @lbu_ri_inc(i8* %a) { +define <2 x i32> @lbu_ri_inc(ptr %a) { ; CHECK-LABEL: lbu_ri_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lbu a1, (a0), 42 ; CHECK-NEXT: ret - %1 = load i8, i8* %a + %1 = load i8, ptr %a %2 = zext i8 %1 to i32 - %3 = getelementptr i8, i8* %a, i32 42 - %4 = ptrtoint i8* %3 to i32 + %3 = getelementptr i8, ptr %a, i32 42 + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x i32> %6 } -define <2 x i32> @lbu_rr_inc(i8* %a, i32 %b) { +define <2 x i32> @lbu_rr_inc(ptr %a, i32 %b) { ; CHECK-LABEL: lbu_rr_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lbu a1, (a0), a1 ; CHECK-NEXT: ret - %1 = load i8, i8* %a + %1 = load i8, ptr %a %2 = zext i8 %1 to i32 - %3 = getelementptr i8, i8* %a, i32 %b - %4 = ptrtoint i8* %3 to i32 + %3 = getelementptr i8, ptr %a, i32 %b + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x i32> %6 } -define i32 @lbu_rr(i8* %a, i32 %b) { +define i32 @lbu_rr(ptr %a, i32 %b) { ; CHECK-LABEL: lbu_rr: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lbu a0, a1(a0) ; CHECK-NEXT: ret - %1 = getelementptr i8, i8* %a, i32 %b - %2 = load i8, i8* %1 + %1 = getelementptr i8, ptr %a, i32 %b + %2 = load i8, ptr %1 %3 = zext i8 %2 to i32 ret i32 %3 } -define <2 x i32> @lh_ri_inc(i16* %a) { +define <2 x i32> @lh_ri_inc(ptr %a) { ; CHECK-LABEL: lh_ri_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lh a1, (a0), 84 ; CHECK-NEXT: ret - %1 = load i16, i16* %a + %1 = load i16, ptr %a %2 = sext i16 %1 to i32 - %3 = getelementptr i16, i16* %a, i32 42 - %4 = ptrtoint i16* %3 to i32 + %3 = getelementptr i16, ptr %a, i32 42 + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x i32> %6 } -define <2 x i32> @lh_rr_inc(i16* %a, i32 %b) { +define <2 x i32> @lh_rr_inc(ptr %a, i32 %b) { ; CHECK-LABEL: lh_rr_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: cv.lh a1, (a0), a1 ; CHECK-NEXT: ret - %1 = load i16, i16* %a + %1 = load i16, ptr %a %2 = sext i16 %1 to i32 - %3 = getelementptr i16, i16* %a, i32 %b - %4 = ptrtoint i16* %3 to i32 + %3 = getelementptr i16, ptr %a, i32 %b + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x i32> %6 } -define i32 @lh_rr(i16* %a, i32 %b) { +define i32 @lh_rr(ptr %a, i32 %b) { ; CHECK-LABEL: lh_rr: ; CHECK: # %bb.0: ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: cv.lh a0, a1(a0) ; CHECK-NEXT: ret - %1 = getelementptr i16, i16* %a, i32 %b - %2 = load i16, i16* %1 + %1 = getelementptr i16, ptr %a, i32 %b + %2 = load i16, ptr %1 %3 = sext i16 %2 to i32 ret i32 %3 } -define <2 x i32> @lhu_ri_inc(i16* %a) { +define <2 x i32> @lhu_ri_inc(ptr %a) { ; CHECK-LABEL: lhu_ri_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lhu a1, (a0), 84 ; CHECK-NEXT: ret - %1 = load i16, i16* %a + %1 = load i16, ptr %a %2 = zext i16 %1 to i32 - %3 = getelementptr i16, i16* %a, i32 42 - %4 = ptrtoint i16* %3 to i32 + %3 = getelementptr i16, ptr %a, i32 42 + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x 
i32> %6 } -define <2 x i32> @lhu_rr_inc(i16* %a, i32 %b) { +define <2 x i32> @lhu_rr_inc(ptr %a, i32 %b) { ; CHECK-LABEL: lhu_rr_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: cv.lhu a1, (a0), a1 ; CHECK-NEXT: ret - %1 = load i16, i16* %a + %1 = load i16, ptr %a %2 = zext i16 %1 to i32 - %3 = getelementptr i16, i16* %a, i32 %b - %4 = ptrtoint i16* %3 to i32 + %3 = getelementptr i16, ptr %a, i32 %b + %4 = ptrtoint ptr %3 to i32 %5 = insertelement <2 x i32> undef, i32 %4, i32 0 %6 = insertelement <2 x i32> %5, i32 %2, i32 1 ret <2 x i32> %6 } -define i32 @lhu_rr(i16* %a, i32 %b) { +define i32 @lhu_rr(ptr %a, i32 %b) { ; CHECK-LABEL: lhu_rr: ; CHECK: # %bb.0: ; CHECK-NEXT: slli a1, a1, 1 ; CHECK-NEXT: cv.lhu a0, a1(a0) ; CHECK-NEXT: ret - %1 = getelementptr i16, i16* %a, i32 %b - %2 = load i16, i16* %1 + %1 = getelementptr i16, ptr %a, i32 %b + %2 = load i16, ptr %1 %3 = zext i16 %2 to i32 ret i32 %3 } -define <2 x i32> @lw_ri_inc(i32* %a) { +define <2 x i32> @lw_ri_inc(ptr %a) { ; CHECK-LABEL: lw_ri_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.lw a1, (a0), 168 ; CHECK-NEXT: ret - %1 = load i32, i32* %a - %2 = getelementptr i32, i32* %a, i32 42 - %3 = ptrtoint i32* %2 to i32 + %1 = load i32, ptr %a + %2 = getelementptr i32, ptr %a, i32 42 + %3 = ptrtoint ptr %2 to i32 %4 = insertelement <2 x i32> undef, i32 %3, i32 0 %5 = insertelement <2 x i32> %4, i32 %1, i32 1 ret <2 x i32> %5 } -define <2 x i32> @lw_rr_inc(i32* %a, i32 %b) { +define <2 x i32> @lw_rr_inc(ptr %a, i32 %b) { ; CHECK-LABEL: lw_rr_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: cv.lw a1, (a0), a1 ; CHECK-NEXT: ret - %1 = load i32, i32* %a - %2 = getelementptr i32, i32* %a, i32 %b - %3 = ptrtoint i32* %2 to i32 + %1 = load i32, ptr %a + %2 = getelementptr i32, ptr %a, i32 %b + %3 = ptrtoint ptr %2 to i32 %4 = insertelement <2 x i32> undef, i32 %3, i32 0 %5 = insertelement <2 x i32> %4, i32 %1, i32 1 ret <2 x i32> %5 } -define i32 @lw_rr(i32* %a, i32 %b) { +define i32 @lw_rr(ptr %a, i32 %b) { ; CHECK-LABEL: lw_rr: ; CHECK: # %bb.0: ; CHECK-NEXT: slli a1, a1, 2 ; CHECK-NEXT: cv.lw a0, a1(a0) ; CHECK-NEXT: ret - %1 = getelementptr i32, i32* %a, i32 %b - %2 = load i32, i32* %1 + %1 = getelementptr i32, ptr %a, i32 %b + %2 = load i32, ptr %1 ret i32 %2 } -define i8* @sb_ri_inc(i8* %a, i8 %b) { +define ptr @sb_ri_inc(ptr %a, i8 %b) { ; CHECK-LABEL: sb_ri_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.sb a1, (a0), 42 ; CHECK-NEXT: ret - store i8 %b, i8* %a - %1 = getelementptr i8, i8* %a, i32 42 - ret i8* %1 + store i8 %b, ptr %a + %1 = getelementptr i8, ptr %a, i32 42 + ret ptr %1 } -define i8* @sb_rr_inc(i8* %a, i8 %b, i32 %c) { +define ptr @sb_rr_inc(ptr %a, i8 %b, i32 %c) { ; CHECK-LABEL: sb_rr_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.sb a1, (a0), a2 ; CHECK-NEXT: ret - store i8 %b, i8* %a - %1 = getelementptr i8, i8* %a, i32 %c - ret i8* %1 + store i8 %b, ptr %a + %1 = getelementptr i8, ptr %a, i32 %c + ret ptr %1 } -define void @sb_rr(i8* %a, i8 %b, i32 %c) { +define void @sb_rr(ptr %a, i8 %b, i32 %c) { ; CHECK-LABEL: sb_rr: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.sb a1, a2(a0) ; CHECK-NEXT: ret - %1 = getelementptr i8, i8* %a, i32 %c - store i8 %b, i8* %1 + %1 = getelementptr i8, ptr %a, i32 %c + store i8 %b, ptr %1 ret void } -define i16* @sh_ri_inc(i16* %a, i16 %b) { +define ptr @sh_ri_inc(ptr %a, i16 %b) { ; CHECK-LABEL: sh_ri_inc: ; CHECK: # %bb.0: ; CHECK-NEXT: cv.sh a1, (a0), 84 ; CHECK-NEXT: ret - store i16 %b, i16* %a - %1 = getelementptr i16, i16* %a, i32 42 - ret i16* %1 + store i16 %b, ptr %a + %1 = 
getelementptr i16, ptr %a, i32 42
+  ret ptr %1
 }
 
-define i16* @sh_rr_inc(i16* %a, i16 %b, i32 %c) {
+define ptr @sh_rr_inc(ptr %a, i16 %b, i32 %c) {
 ; CHECK-LABEL: sh_rr_inc:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    cv.sh a1, (a0), a2
 ; CHECK-NEXT:    ret
-  store i16 %b, i16* %a
-  %1 = getelementptr i16, i16* %a, i32 %c
-  ret i16* %1
+  store i16 %b, ptr %a
+  %1 = getelementptr i16, ptr %a, i32 %c
+  ret ptr %1
 }
 
-define void @sh_rr(i16* %a, i16 %b, i32 %c) {
+define void @sh_rr(ptr %a, i16 %b, i32 %c) {
 ; CHECK-LABEL: sh_rr:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    slli a2, a2, 1
 ; CHECK-NEXT:    cv.sh a1, a2(a0)
 ; CHECK-NEXT:    ret
-  %1 = getelementptr i16, i16* %a, i32 %c
-  store i16 %b, i16* %1
+  %1 = getelementptr i16, ptr %a, i32 %c
+  store i16 %b, ptr %1
   ret void
 }
 
-define i32* @sw_ri_inc(i32* %a, i32 %b) {
+define ptr @sw_ri_inc(ptr %a, i32 %b) {
 ; CHECK-LABEL: sw_ri_inc:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    cv.sw a1, (a0), 168
 ; CHECK-NEXT:    ret
-  store i32 %b, i32* %a
-  %1 = getelementptr i32, i32* %a, i32 42
-  ret i32* %1
+  store i32 %b, ptr %a
+  %1 = getelementptr i32, ptr %a, i32 42
+  ret ptr %1
 }
 
-define i32* @sw_rr_inc(i32* %a, i32 %b, i32 %c) {
+define ptr @sw_rr_inc(ptr %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: sw_rr_inc:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    cv.sw a1, (a0), a2
 ; CHECK-NEXT:    ret
-  store i32 %b, i32* %a
-  %1 = getelementptr i32, i32* %a, i32 %c
-  ret i32* %1
+  store i32 %b, ptr %a
+  %1 = getelementptr i32, ptr %a, i32 %c
+  ret ptr %1
 }
 
-define void @sw_rr(i32* %a, i32 %b, i32 %c) {
+define void @sw_rr(ptr %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: sw_rr:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    slli a2, a2, 2
 ; CHECK-NEXT:    cv.sw a1, a2(a0)
 ; CHECK-NEXT:    ret
-  %1 = getelementptr i32, i32* %a, i32 %c
-  store i32 %b, i32* %1
+  %1 = getelementptr i32, ptr %a, i32 %c
+  store i32 %b, ptr %1
   ret void
 }

From cb9afe53bf6c7ceb0bdf3b579c185cbff15a3b38 Mon Sep 17 00:00:00 2001
From: Tai Ly
Date: Mon, 7 Apr 2025 11:47:07 -0500
Subject: [PATCH 0871/1029] [mlir][tosa] Fix validation pass assert (#134445)

This fixes a validation pass assert when processing ops with quantized
element types. The failure case is added to invalid.mlir.

The fix is to re-order the validation checks so that only ops with
int/float operands and results pass the first stage of the validation
pass, so that the remaining checks do not need to handle quantized data
types.

Signed-off-by: Tai Ly
---
 .../Tosa/Transforms/TosaValidation.cpp        | 19 ++++++++++---------
 mlir/test/Dialect/Tosa/invalid.mlir           |  9 +++++++++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index 3ec7354562d23..28e562c813eb3 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -1018,15 +1018,8 @@ void TosaValidation::runOnOperation() {
    if (op->getDialect() != tosaDialect)
      return;

-  // Profile-Extension based validation should be performed at the beginning.
-  if (strictOpSpecAlignment &&
-      failed(profileComp.checkProfile(op, targetEnv)))
-    return signalPassFailure();
-
-  if (strictOpSpecAlignment &&
-      failed(profileComp.checkExtension(op, targetEnv)))
-    return signalPassFailure();
-
+  // Perform the valid element type check at the beginning to
+  // protect the rest of the code against quantized element types.
   for (Value operand : op->getOperands()) {
     auto elementTy = getElementTypeOrSelf(operand);
     if (!isValidElementType(elementTy)) {
@@ -1044,6 +1037,14 @@ void TosaValidation::runOnOperation() {
     }
   }

+  if (strictOpSpecAlignment &&
+      failed(profileComp.checkProfile(op, targetEnv)))
+    return signalPassFailure();
+
+  if (strictOpSpecAlignment &&
+      failed(profileComp.checkExtension(op, targetEnv)))
+    return signalPassFailure();
+
   if (!allowInvalidOpDatatypeCombinations &&
       failed(profileComp.checkInvalid(op))) {
     op->emitOpError("illegal: operand/result data types not supported");
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index 8cf6d4b154792..12b2379a592c3 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -253,6 +253,15 @@ func.func @test_conv2d_quant_any_acc(%arg0: tensor<1x4x4x4x!quant.any<i8<-8:7>>
   return %0 : tensor<1x4x4x8x!quant.any<i8<-8:7>>>
 }
 
+// -----
+// CHECK-LABEL: conv2d_quant_any
+func.func @test_conv2d_quant_any(%arg0: tensor<1x4x4x4x!quant.any<i8<-8:7>>>, %arg1: tensor<8x1x1x4x!quant.any<i8<-8:7>>>, %arg2: tensor<8x!quant.any<i8<-8:7>>>) -> tensor<1x4x4x8x!quant.any<i8<-8:7>>> {
+  %zp = "tosa.const" () { values = dense<0> : tensor<1xi8> } : () -> tensor<1xi8>
+  // expected-error@+1 {{'tosa.conv2d' op is not profile-aligned: element type '!quant.any<i8<-8:7>>'}}
+  %0 = tosa.conv2d %arg0, %arg1, %arg2, %zp, %zp {acc_type = i32, dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 1, 1>, local_bound = true} : (tensor<1x4x4x4x!quant.any<i8<-8:7>>>, tensor<8x1x1x4x!quant.any<i8<-8:7>>>, tensor<8x!quant.any<i8<-8:7>>>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x4x8x!quant.any<i8<-8:7>>>
+  return %0 : tensor<1x4x4x8x!quant.any<i8<-8:7>>>
+}
+
 // -----
 
 func.func @test_concat(%arg0 : tensor<2x1xf32>, %arg1 : tensor<2x2xf32>) -> tensor<?x?xf32> {

From fbc8335311b27d73fb685d5ebfb702f7acf134d2 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg
Date: Mon, 7 Apr 2025 12:47:16 -0400
Subject: [PATCH 0872/1029] [MLIR][OpenMP] Add codegen for teams reductions
 (#133310)

This patch adds the lowering of teams reductions from the omp dialect to
LLVM-IR. Some minor cleanup was done in clang to remove an unused
parameter.
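For orientation, a hedged source-level illustration (written for this note,
not taken from the patch) of the kind of construct whose device lowering
this enables:

    // Assumed host code; any OpenMP offload-capable toolchain.
    int sum = 0;
    #pragma omp target teams distribute reduction(+ : sum) map(tofrom : sum)
    for (int i = 1; i <= 10000; ++i)
      sum += i;

Each team combines its partial sums, and the per-team results are merged
through the __kmpc_nvptx_teams_reduce_nowait_v2 runtime entry point that the
new tests below check for.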
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |   3 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  10 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 173 ++++++++++-----
 .../Frontend/OpenMPIRBuilderTest.cpp          |   1 +
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 204 ++++++++++++++++--
 .../omptarget-teams-distribute-reduction.mlir |  75 +++++++
 .../LLVMIR/omptarget-teams-reduction.mlir     |  79 +++++++
 .../openmp-teams-distribute-reduction.mlir    |  71 ++++++
 .../Target/LLVMIR/openmp-teams-reduction.mlir |  79 +++++++
 mlir/test/Target/LLVMIR/openmp-todo.mlir      |  28 ---
 .../basic-target-parallel-reduction.f90       |  27 +++
 .../basic-target-teams-parallel-reduction.f90 |  27 +++
 12 files changed, 666 insertions(+), 111 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
 create mode 100644 mlir/test/Target/LLVMIR/openmp-teams-distribute-reduction.mlir
 create mode 100644 mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir
 create mode 100644 offload/test/offloading/fortran/basic-target-parallel-reduction.f90
 create mode 100644 offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index e86fbffe63252..f697c13f4c522 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1660,7 +1660,6 @@ void CGOpenMPRuntimeGPU::emitReduction(
     return;
 
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
-  bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
 
   ASTContext &C = CGM.getContext();
@@ -1757,7 +1756,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
   llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
       cantFail(OMPBuilder.createReductionsGPU(
          OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
-          DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
+          llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
          CGF.getTarget().getGridValue(),
          C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
  CGF.Builder.restoreIP(AfterIP);
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index ec013d1822439..6b104708bdb0d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1907,8 +1907,6 @@ class OpenMPIRBuilder {
   ///                           nowait.
   /// \param IsTeamsReduction   Optional flag set if it is a teams
   ///                           reduction.
-  /// \param HasDistribute      Optional flag set if it is a
-  ///                           distribute reduction.
   /// \param GridValue          Optional GPU grid value.
   /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
   ///                           used for teams reduction.
@@ -1917,7 +1915,6 @@ class OpenMPIRBuilder {
       const LocationDescription &Loc, InsertPointTy AllocaIP,
       InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
       bool IsNoWait = false, bool IsTeamsReduction = false,
-      bool HasDistribute = false,
       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
       std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
      Value *SrcLocInfo = nullptr);
@@ -1985,11 +1982,14 @@ class OpenMPIRBuilder {
   /// \param IsNoWait A flag set if the reduction is marked as nowait.
   /// \param IsByRef A flag set if the reduction is using reference
   /// or direct value.
+  /// \param IsTeamsReduction   Optional flag set if it is a teams
+  ///                           reduction.
   InsertPointOrErrorTy
   createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP,
                    ArrayRef<ReductionInfo> ReductionInfos,
                    ArrayRef<bool> IsByRef,
-                   bool IsNoWait = false);
+                   bool IsNoWait = false,
+                   bool IsTeamsReduction = false);
 
   ///}
 
@@ -2273,6 +2273,8 @@ class OpenMPIRBuilder {
     int32_t MinTeams = 1;
     SmallVector<int32_t, 3> MaxThreads = {-1};
     int32_t MinThreads = 1;
+    int32_t ReductionDataSize = 0;
+    int32_t ReductionBufferLength = 0;
   };
 
   /// Container to pass LLVM IR runtime values or constants related to the
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 68b1fa42934ad..d59a144d3cf99 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3495,9 +3495,9 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
-    bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
-    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
-    unsigned ReductionBufNum, Value *SrcLocInfo) {
+    bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
+    std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
+    Value *SrcLocInfo) {
   if (!updateToLocation(Loc))
     return InsertPointTy();
   Builder.restoreIP(CodeGenIP);
@@ -3514,6 +3514,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
   if (ReductionInfos.size() == 0)
     return Builder.saveIP();
 
+  BasicBlock *ContinuationBlock = nullptr;
+  if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
+    // Copied code from createReductions
+    BasicBlock *InsertBlock = Loc.IP.getBlock();
+    ContinuationBlock =
+        InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
+    InsertBlock->getTerminator()->eraseFromParent();
+    Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+  }
+
   Function *CurFunc = Builder.GetInsertBlock()->getParent();
   AttributeList FuncAttrs;
   AttrBuilder AttrBldr(Ctx);
@@ -3669,11 +3679,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
             ReductionFunc;
       });
     } else {
-      assert(false && "Unhandled ReductionGenCBKind");
+      Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
+      Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
+      Value *Reduced;
+      InsertPointOrErrorTy AfterIP =
+          RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
+      if (!AfterIP)
+        return AfterIP.takeError();
+      Builder.CreateStore(Reduced, LHS, false);
     }
   }
   emitBlock(ExitBB, CurFunc);
-
+  if (ContinuationBlock) {
+    Builder.CreateBr(ContinuationBlock);
+    Builder.SetInsertPoint(ContinuationBlock);
+  }
   Config.setEmitLLVMUsed();
 
   return Builder.saveIP();
@@ -3688,27 +3708,95 @@ static Function *getFreshReductionFunc(Module &M) {
                           ".omp.reduction.func", &M);
 }
 
-OpenMPIRBuilder::InsertPointOrErrorTy
-OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
-                                  InsertPointTy AllocaIP,
-                                  ArrayRef<ReductionInfo> ReductionInfos,
-                                  ArrayRef<bool> IsByRef, bool IsNoWait) {
-  assert(ReductionInfos.size() == IsByRef.size());
-  for (const ReductionInfo &RI : ReductionInfos) {
-    (void)RI;
-    assert(RI.Variable && "expected non-null variable");
-    assert(RI.PrivateVariable && "expected non-null private variable");
-    assert(RI.ReductionGen && "expected non-null reduction generator callback");
-    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
-           "expected variables and their private equivalents to have the same
"type");
-    assert(RI.Variable->getType()->isPointerTy() &&
-           "expected variables to be pointers");
+static Error populateReductionFunction(
+    Function *ReductionFunc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
+  Module *Module = ReductionFunc->getParent();
+  BasicBlock *ReductionFuncBlock =
+      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+  Builder.SetInsertPoint(ReductionFuncBlock);
+  Value *LHSArrayPtr = nullptr;
+  Value *RHSArrayPtr = nullptr;
+  if (IsGPU) {
+    // Need to alloca memory here and deal with the pointers before getting
+    // LHS/RHS pointers out
+    //
+    Argument *Arg0 = ReductionFunc->getArg(0);
+    Argument *Arg1 = ReductionFunc->getArg(1);
+    Type *Arg0Type = Arg0->getType();
+    Type *Arg1Type = Arg1->getType();
+
+    Value *LHSAlloca =
+        Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+    Value *RHSAlloca =
+        Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+    Value *LHSAddrCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
+    Value *RHSAddrCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
+    Builder.CreateStore(Arg0, LHSAddrCast);
+    Builder.CreateStore(Arg1, RHSAddrCast);
+    LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+    RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
+  } else {
+    LHSArrayPtr = ReductionFunc->getArg(0);
+    RHSArrayPtr = ReductionFunc->getArg(1);
   }
 
+  unsigned NumReductions = ReductionInfos.size();
+  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, LHSArrayPtr, 0, En.index());
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSI8Ptr, RI.Variable->getType());
+    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, RHSArrayPtr, 0, En.index());
+    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSI8Ptr, RI.PrivateVariable->getType());
+    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+    Value *Reduced;
+    OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+        RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
+    if (!AfterIP)
+      return AfterIP.takeError();
+
+    Builder.restoreIP(*AfterIP);
+    // TODO: Consider flagging an error.
+    if (!Builder.GetInsertBlock())
+      return Error::success();
+
+    // store is inside of the reduction region when using by-ref
+    if (!IsByRef[En.index()])
+      Builder.CreateStore(Reduced, LHSPtr);
+  }
+  Builder.CreateRetVoid();
+  return Error::success();
+}
+
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
+    bool IsNoWait, bool IsTeamsReduction) {
+  assert(ReductionInfos.size() == IsByRef.size());
+  if (Config.isGPU())
+    return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
+                               IsNoWait, IsTeamsReduction);
+
+  checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
+
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
+  if (ReductionInfos.size() == 0)
+    return Builder.saveIP();
+
   BasicBlock *InsertBlock = Loc.IP.getBlock();
   BasicBlock *ContinuationBlock =
       InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -3832,38 +3920,13 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
   // Populate the outlined reduction function using the elementwise reduction
   // function. Partial values are extracted from the type-erased array of
   // pointers to private variables.
-  BasicBlock *ReductionFuncBlock =
-      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
-  Builder.SetInsertPoint(ReductionFuncBlock);
-  Value *LHSArrayPtr = ReductionFunc->getArg(0);
-  Value *RHSArrayPtr = ReductionFunc->getArg(1);
+  Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
+                                        IsByRef, /*isGPU=*/false);
+  if (Err)
+    return Err;
 
-  for (auto En : enumerate(ReductionInfos)) {
-    const ReductionInfo &RI = En.value();
-    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedArrayTy, LHSArrayPtr, 0, En.index());
-    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
-    Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
-    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
-    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedArrayTy, RHSArrayPtr, 0, En.index());
-    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
-    Value *RHSPtr =
-        Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
-    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
-    Value *Reduced;
-    InsertPointOrErrorTy AfterIP =
-        RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
-    if (!AfterIP)
-      return AfterIP.takeError();
-    Builder.restoreIP(*AfterIP);
-    if (!Builder.GetInsertBlock())
-      return InsertPointTy();
-    // store is inside of the reduction region when using by-ref
-    if (!IsByRef[En.index()])
-      Builder.CreateStore(Reduced, LHSPtr);
-  }
-  Builder.CreateRetVoid();
+  if (!Builder.GetInsertBlock())
+    return InsertPointTy();
 
   Builder.SetInsertPoint(ContinuationBlock);
   return Builder.saveIP();
@@ -6239,8 +6302,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
   Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
   Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
   Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
-  Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
-  Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
+  Constant *ReductionDataSize =
+      ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
+  Constant *ReductionBufferLength =
+      ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
 
   Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_init);
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 27c0e0bf80255..2d3d318be7ff1 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -2354,6 +2354,7 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkshareLoopTarget) {
       "256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8");
   OpenMPIRBuilder OMPBuilder(*M);
   OMPBuilder.Config.IsTargetDevice = true;
+  OMPBuilder.Config.setIsGPU(false);
   OMPBuilder.initialize();
   IRBuilder<> Builder(BB);
   OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 4d610d6e2656d..8d1cc9b10a950 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -265,7 +265,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       .Case([&](omp::TeamsOp op) {
         checkAllocate(op, result);
         checkPrivate(op, result);
-        checkReduction(op, result);
       })
       .Case([&](omp::TaskOp op) {
         checkAllocate(op, result);
@@ -1018,19 +1017,31 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
       // variable allocated in the inlined region)
       llvm::Value *var = builder.CreateAlloca(
          moduleTranslation.convertType(reductionDecls[i].getType()));
-      deferredStores.emplace_back(phis[0], var);
 
-      privateReductionVariables[i] = var;
-      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
-      reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
+      llvm::Type *ptrTy = builder.getPtrTy();
+      llvm::Value *castVar =
+          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+      llvm::Value *castPhi =
+          builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy);
+
+      deferredStores.emplace_back(castPhi, castVar);
+
+      privateReductionVariables[i] = castVar;
+      moduleTranslation.mapValue(reductionArgs[i], castPhi);
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], castPhi);
     } else {
       assert(allocRegion.empty() &&
             "allocaction is implicit for by-val reduction");
       llvm::Value *var = builder.CreateAlloca(
          moduleTranslation.convertType(reductionDecls[i].getType()));
-      moduleTranslation.mapValue(reductionArgs[i], var);
-      privateReductionVariables[i] = var;
-      reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
+
+      llvm::Type *ptrTy = builder.getPtrTy();
+      llvm::Value *castVar =
+          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+
+      moduleTranslation.mapValue(reductionArgs[i], castVar);
+      privateReductionVariables[i] = castVar;
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], castVar);
     }
   }
 
@@ -1250,18 +1261,20 @@ static LogicalResult createReductionsAndCleanup(
     LLVM::ModuleTranslation &moduleTranslation,
     llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
    SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
-    ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef) {
+    ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef,
+    bool isNowait = false, bool isTeamsReduction = false) {
   // Process the reductions if required.
   if (op.getNumReductionVars() == 0)
     return success();
 
+  SmallVector<OwningReductionGen> owningReductionGens;
+  SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
+  SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
   // Create the reduction generators. We need to own them here because
   // ReductionInfo only accepts references to the generators.
-  SmallVector<OwningReductionGen> owningReductionGens;
-  SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
-  SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
   collectReductionInfo(op, builder, moduleTranslation, reductionDecls,
                        owningReductionGens, owningAtomicReductionGens,
                        privateReductionVariables, reductionInfos);
@@ -1273,7 +1286,7 @@ static LogicalResult createReductionsAndCleanup(
   builder.SetInsertPoint(tempTerminator);
   llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint =
       ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
-                                   isByRef, op.getNowait());
+                                   isByRef, isNowait, isTeamsReduction);
 
   if (failed(handleError(contInsertPoint, *op)))
     return failure();
@@ -1666,9 +1679,9 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.restoreIP(*afterIP);
 
   // Process the reductions if required.
-  return createReductionsAndCleanup(sectionsOp, builder, moduleTranslation,
-                                    allocaIP, reductionDecls,
-                                    privateReductionVariables, isByRef);
+  return createReductionsAndCleanup(
+      sectionsOp, builder, moduleTranslation, allocaIP, reductionDecls,
+      privateReductionVariables, isByRef, sectionsOp.getNowait());
 }
 
 /// Converts an OpenMP single construct into LLVM IR using OpenMPIRBuilder.
@@ -1714,6 +1727,42 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
   return success();
 }
 
+static bool teamsReductionContainedInDistribute(omp::TeamsOp teamsOp) {
+  auto iface =
+      llvm::cast<omp::BlockArgOpenMPOpInterface>(teamsOp.getOperation());
+  // Check that all uses of the reduction block arg have the same distribute op
+  // parent.
+  llvm::SmallVector<Operation *> debugUses;
+  Operation *distOp = nullptr;
+  for (auto ra : iface.getReductionBlockArgs())
+    for (auto &use : ra.getUses()) {
+      auto *useOp = use.getOwner();
+      // Ignore debug uses.
+      if (mlir::isa<LLVM::DbgDeclareOp, LLVM::DbgValueOp>(useOp)) {
+        debugUses.push_back(useOp);
+        continue;
+      }
+
+      auto currentDistOp = useOp->getParentOfType<omp::DistributeOp>();
+      // Use is not inside a distribute op - return false
+      if (!currentDistOp)
+        return false;
+      // Multiple distribute operations - return false
+      Operation *currentOp = currentDistOp.getOperation();
+      if (distOp && (distOp != currentOp))
+        return false;
+
+      distOp = currentOp;
+    }
+
+  // If we are going to use distribute reduction then remove any debug uses of
+  // the reduction parameters in teamsOp. Otherwise they will be left without
+  // any mapped value in moduleTranslation and will eventually error out.
+  for (auto use : debugUses)
+    use->erase();
+
+  return true;
+}
+
 // Convert an OpenMP Teams construct to LLVM IR using OpenMPIRBuilder
 static LogicalResult
 convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
@@ -1722,6 +1771,34 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
   if (failed(checkImplementationStatus(*op)))
     return failure();
 
+  DenseMap<Value, llvm::Value *> reductionVariableMap;
+
+  unsigned numReductionVars = op.getNumReductionVars();
+  SmallVector<omp::DeclareReductionOp> reductionDecls;
+  SmallVector<llvm::Value *> privateReductionVariables(numReductionVars);
+  llvm::ArrayRef<bool> isByRef;
+  llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+      findAllocaInsertPoint(builder, moduleTranslation);
+
+  // Only do teams reduction if there is no distribute op that captures the
+  // reduction instead.
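+  // When a nested distribute op captures the reduction, convertOmpDistribute
+  // materializes it instead (see doDistributeReduction below); emitting it
+  // here as well would apply the reduction twice.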
+ bool doTeamsReduction = !teamsReductionContainedInDistribute(op); + if (doTeamsReduction) { + isByRef = getIsByRef(op.getReductionByref()); + + assert(isByRef.size() == op.getNumReductionVars()); + + MutableArrayRef reductionArgs = + llvm::cast(*op).getReductionBlockArgs(); + + collectReductionDecls(op, reductionDecls); + + if (failed(allocAndInitializeReductionVars( + op, reductionArgs, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, reductionVariableMap, + isByRef))) + return failure(); + } + auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) { LLVM::ModuleTranslation::SaveStack frame( moduleTranslation, allocaIP); @@ -1756,6 +1833,13 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder, return failure(); builder.restoreIP(*afterIP); + if (doTeamsReduction) { + // Process the reductions if required. + return createReductionsAndCleanup( + op, builder, moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, isByRef, + /*isNoWait*/ false, /*isTeamsReduction*/ true); + } return success(); } @@ -2273,9 +2357,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, return failure(); // Process the reductions if required. - if (failed(createReductionsAndCleanup(wsloopOp, builder, moduleTranslation, - allocaIP, reductionDecls, - privateReductionVariables, isByRef))) + if (failed(createReductionsAndCleanup( + wsloopOp, builder, moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, isByRef, wsloopOp.getNowait(), + /*isTeamsReduction=*/false))) return failure(); return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(), @@ -2377,8 +2462,9 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, builder.SetInsertPoint(tempTerminator); llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint = - ompBuilder->createReductions(builder.saveIP(), allocaIP, - reductionInfos, isByRef, false); + ompBuilder->createReductions( + builder.saveIP(), allocaIP, reductionInfos, isByRef, + /*IsNoWait=*/false, /*IsTeamsReduction=*/false); if (!contInsertPoint) return contInsertPoint.takeError(); @@ -4161,6 +4247,37 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, if (failed(checkImplementationStatus(opInst))) return failure(); + /// Process teams op reduction in distribute if the reduction is contained in + /// the distribute op. + omp::TeamsOp teamsOp = opInst.getParentOfType(); + bool doDistributeReduction = + teamsOp ? teamsReductionContainedInDistribute(teamsOp) : false; + + DenseMap reductionVariableMap; + unsigned numReductionVars = teamsOp ? 
teamsOp.getNumReductionVars() : 0; + SmallVector reductionDecls; + SmallVector privateReductionVariables(numReductionVars); + llvm::ArrayRef isByRef; + + if (doDistributeReduction) { + isByRef = getIsByRef(teamsOp.getReductionByref()); + assert(isByRef.size() == teamsOp.getNumReductionVars()); + + collectReductionDecls(teamsOp, reductionDecls); + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = + findAllocaInsertPoint(builder, moduleTranslation); + + MutableArrayRef reductionArgs = + llvm::cast(*teamsOp) + .getReductionBlockArgs(); + + if (failed(allocAndInitializeReductionVars( + teamsOp, reductionArgs, builder, moduleTranslation, allocaIP, + reductionDecls, privateReductionVariables, reductionVariableMap, + isByRef))) + return failure(); + } + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) -> llvm::Error { @@ -4244,6 +4361,14 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder, return failure(); builder.restoreIP(*afterIP); + + if (doDistributeReduction) { + // Process the reductions if required. + return createReductionsAndCleanup( + teamsOp, builder, moduleTranslation, allocaIP, reductionDecls, + privateReductionVariables, isByRef, + /*isNoWait*/ false, /*isTeamsReduction*/ true); + } return success(); } @@ -4554,6 +4679,25 @@ static std::optional extractConstInteger(Value value) { return std::nullopt; } +static uint64_t getTypeByteSize(mlir::Type type, const DataLayout &dl) { + uint64_t sizeInBits = dl.getTypeSizeInBits(type); + uint64_t sizeInBytes = sizeInBits / 8; + return sizeInBytes; +} + +template +static uint64_t getReductionDataSize(OpTy &op) { + if (op.getNumReductionVars() > 0) { + assert(op.getNumReductionVars() == 1 && + "Only 1 reduction variable currently supported"); + mlir::Type reductionVarTy = op.getReductionVars()[0].getType(); + Operation *opp = op.getOperation(); + DataLayout dl = DataLayout(opp->getParentOfType()); + return getTypeByteSize(reductionVarTy, dl); + } + return 0; +} + /// Populate default `MinTeams`, `MaxTeams` and `MaxThreads` to their default /// values as stated by the corresponding clauses, if constant. /// @@ -4563,7 +4707,7 @@ static std::optional extractConstInteger(Value value) { static void initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &attrs, - bool isTargetDevice) { + bool isTargetDevice, bool isGPU) { // TODO: Handle constant 'if' clauses. Value numThreads, numTeamsLower, numTeamsUpper, threadLimit; @@ -4645,6 +4789,14 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, (maxThreadsVal >= 0 && maxThreadsVal < combinedMaxThreadsVal)) combinedMaxThreadsVal = maxThreadsVal; + // Calculate reduction data size, limited to single reduction variable for + // now. + int32_t reductionDataSize = 0; + if (isGPU && capturedOp) { + if (auto teamsOp = castOrGetParentOfType(capturedOp)) + reductionDataSize = getReductionDataSize(teamsOp); + } + // Update kernel bounds structure for the `OpenMPIRBuilder` to use. omp::TargetRegionFlags kernelFlags = targetOp.getKernelExecFlags(capturedOp); assert( @@ -4661,6 +4813,11 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, attrs.MaxTeams.front() = maxTeamsVal; attrs.MinThreads = 1; attrs.MaxThreads.front() = combinedMaxThreadsVal; + attrs.ReductionDataSize = reductionDataSize; + // TODO: Allow modified buffer length similar to + // fopenmp-cuda-teams-reduction-recs-num flag in clang. 
+  if (attrs.ReductionDataSize != 0)
+    attrs.ReductionBufferLength = 1024;
 }
 
 /// Gather LLVM runtime values for all clauses evaluated in the host that are
@@ -4741,6 +4898,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
   bool isTargetDevice = ompBuilder->Config.isTargetDevice();
+  bool isGPU = ompBuilder->Config.isGPU();
   auto parentFn = opInst.getParentOfType<LLVM::LLVMFuncOp>();
   auto argIface = cast<omp::BlockArgOpenMPOpInterface>(opInst);
@@ -4943,7 +5101,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs;
   Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp();
   initTargetDefaultAttrs(targetOp, targetCapturedOp, defaultAttrs,
-                         isTargetDevice);
+                         isTargetDevice, isGPU);
 
   // Collect host-evaluated values needed to properly launch the kernel from the
   // host.
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
new file mode 100644
index 0000000000000..af8fe7aacc336
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
@@ -0,0 +1,75 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Only check the overall shape of the code and the presence of relevant
+// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
+  omp.private {type = private} @_QFsimple_target_teams_only_reductionEindex__private_i32 : i32
+  omp.declare_reduction @add_reduction_i32 : i32 init {
+  ^bb0(%arg0: i32):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    omp.yield(%0 : i32)
+  } combiner {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %0 = llvm.add %arg0, %arg1 : i32
+    omp.yield(%0 : i32)
+  }
+  llvm.func @simple_target_teams_only_reduction_() attributes {fir.internal_name = "_QPsimple_target_teams_only_reduction", frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>, target_cpu = "gfx1030", target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx10-3-insts", "+gfx10-insts", "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize32"]>} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "sum"} : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x i32 {bindc_name = "index_"} : (i64) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %6 = llvm.mlir.constant(0 : i32) : i32
+    %7 = llvm.mlir.constant(1 : i64) : i64
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    llvm.store %6, %2 : i32, !llvm.ptr
+    %9 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "sum"}
+    %10 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "index_"}
+    omp.target map_entries(%9 ->
%arg0, %10 -> %arg1 : !llvm.ptr, !llvm.ptr) { + %11 = llvm.mlir.constant(10000 : i32) : i32 + %12 = llvm.mlir.constant(1 : i32) : i32 + omp.teams reduction(@add_reduction_i32 %arg0 -> %arg2 : !llvm.ptr) { + omp.distribute private(@_QFsimple_target_teams_only_reductionEindex__private_i32 %arg1 -> %arg3 : !llvm.ptr) { + omp.loop_nest (%arg4) : i32 = (%12) to (%11) inclusive step (%12) { + llvm.store %arg4, %arg3 : i32, !llvm.ptr + %13 = llvm.load %arg2 : !llvm.ptr -> i32 + %14 = llvm.load %arg3 : !llvm.ptr -> i32 + %15 = llvm.add %13, %14 : i32 + llvm.store %15, %arg2 : i32, !llvm.ptr + omp.yield + } + } + omp.terminator + } + omp.terminator + } + llvm.return + } +} + +// CHECK: call i32 @__kmpc_target_init +// CHECK: call void @[[OUTLINED:__omp_offloading_[A-Za-z0-9_.]*]] +// CHECK: define internal void @[[OUTLINED]] +// CHECK: %[[MASTER:.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2 +// CHECK: icmp eq i32 %[[MASTER]], 1 +// CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]] + +// CHECK: call void @__kmpc_barrier + +// CHECK: [[THEN]]: +// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]] +// CHECK-NEXT: store i32 %[[FINAL_RESULT]] + + +// CHECK: call void @__kmpc_distribute_static_loop_4u +// CHECK-SAME: [[OUTLINED2:__omp_offloading_[A-Za-z0-9_.]*]] + +// CHECK: define internal void @[[OUTLINED2]] +// CHECK: %[[TEAM_RHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[TEAM_LHS:[A-Za-z0-9_.]*]] = load i32 +// CHECK-NEXT: %[[TEAM_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[TEAM_RHS]], %[[TEAM_LHS]] +// CHECK-NEXT: store i32 %[[TEAM_RESULT]] diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir new file mode 100644 index 0000000000000..edfb2839d6604 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir @@ -0,0 +1,79 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Only check the overall shape of the code and the presence of relevant +// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level. 
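+//
+// Here the reduction is attached to omp.teams directly inside omp.target,
+// with no omp.distribute, so it is expected to lower through the GPU
+// runtime entry point __kmpc_nvptx_teams_reduce_nowait_v2 checked below.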
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
+  omp.declare_reduction @add_reduction_i32 : i32 init {
+  ^bb0(%arg0: i32):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    omp.yield(%0 : i32)
+  } combiner {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %0 = llvm.add %arg0, %arg1 : i32
+    omp.yield(%0 : i32)
+  }
+  llvm.func @simple_target_teams_only_reduction_() attributes {fir.internal_name = "_QPsimple_target_teams_only_reduction", frame_pointer = #llvm.framePointerKind<all>, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>, target_cpu = "gfx1030", target_features = #llvm.target_features<["+16-bit-insts", "+ci-insts", "+dl-insts", "+dot1-insts", "+dot10-insts", "+dot2-insts", "+dot5-insts", "+dot6-insts", "+dot7-insts", "+dpp", "+gfx10-3-insts", "+gfx10-insts", "+gfx8-insts", "+gfx9-insts", "+gws", "+image-insts", "+s-memrealtime", "+s-memtime-inst", "+wavefrontsize32"]>} {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x i32 {bindc_name = "sum"} : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %3 x i32 {bindc_name = "index_"} : (i64) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %6 = llvm.mlir.constant(0 : i32) : i32
+    %7 = llvm.mlir.constant(1 : i64) : i64
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    llvm.store %6, %2 : i32, !llvm.ptr
+    %9 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "sum"}
+    %10 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "index_"}
+    omp.target map_entries(%9 -> %arg0, %10 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+      %11 = llvm.mlir.constant(0 : index) : i64
+      %12 = llvm.mlir.constant(10000 : index) : i64
+      %13 = llvm.mlir.constant(1 : index) : i64
+      omp.teams reduction(@add_reduction_i32 %arg0 -> %arg2 : !llvm.ptr) {
+        %14 = llvm.trunc %13 : i64 to i32
+        llvm.br ^bb1(%14, %12 : i32, i64)
+      ^bb1(%15: i32, %16: i64):  // 2 preds: ^bb0, ^bb2
+        %17 = llvm.icmp "sgt" %16, %11 : i64
+        llvm.cond_br %17, ^bb2, ^bb3
+      ^bb2:  // pred: ^bb1
+        llvm.store %15, %arg1 : i32, !llvm.ptr
+        %18 = llvm.load %arg2 : !llvm.ptr -> i32
+        %19 = llvm.load %arg1 : !llvm.ptr -> i32
+        %20 = llvm.add %18, %19 : i32
+        llvm.store %20, %arg2 : i32, !llvm.ptr
+        %21 = llvm.load %arg1 : !llvm.ptr -> i32
+        %22 = llvm.add %21, %14 overflow<nsw> : i32
+        %23 = llvm.sub %16, %13 : i64
+        llvm.br ^bb1(%22, %23 : i32, i64)
+      ^bb3:  // pred: ^bb1
+        llvm.store %15, %arg1 : i32, !llvm.ptr
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: call i32 @__kmpc_target_init
+// CHECK: call void @[[OUTLINED:__omp_offloading_[A-Za-z0-9_.]*]]
+// CHECK: %[[MASTER:.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2
+// CHECK: icmp eq i32 %[[MASTER]], 1
+// CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
+// CHECK: [[THEN]]:
+// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
+// CHECK-NEXT: store i32 %[[FINAL_RESULT]]
+
+// CHECK: call void @__kmpc_barrier
+// CHECK: call void @__kmpc_target_deinit
+
+// CHECK: define internal void @[[OUTLINED]]
+// Skip to the loop
+// CHECK: br i1
+// CHECK: %[[TEAM_RHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[TEAM_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[TEAM_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[TEAM_RHS]], %[[TEAM_LHS]]
+// CHECK-NEXT: store i32 %[[TEAM_RESULT]]
diff --git a/mlir/test/Target/LLVMIR/openmp-teams-distribute-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-teams-distribute-reduction.mlir
new file mode 100644
index 0000000000000..9e033f6a4da3c
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-teams-distribute-reduction.mlir
@@ -0,0 +1,71 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Only check the overall shape of the code and the presence of relevant
+// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level.
+
+omp.private {type = private} @_QFsimple_teams_reductionEindex__private_i32 : i32
+omp.declare_reduction @add_reduction_i32 : i32 init {
+^bb0(%arg0: i32):
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  omp.yield(%0 : i32)
+} combiner {
+^bb0(%arg0: i32, %arg1: i32):
+  %0 = llvm.add %arg0, %arg1 : i32
+  omp.yield(%0 : i32)
+}
+llvm.func @simple_teams_reduction_() attributes {fir.internal_name = "_QPsimple_teams_reduction", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "sum"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i64) : i64
+  %3 = llvm.alloca %2 x i32 {bindc_name = "index_"} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(10000 : i32) : i32
+  %5 = llvm.mlir.constant(1 : i32) : i32
+  %6 = llvm.mlir.constant(0 : i32) : i32
+  %7 = llvm.mlir.constant(1 : i64) : i64
+  %8 = llvm.mlir.constant(1 : i64) : i64
+  llvm.store %6, %1 : i32, !llvm.ptr
+  omp.teams reduction(@add_reduction_i32 %1 -> %arg0 : !llvm.ptr) {
+    omp.distribute private(@_QFsimple_teams_reductionEindex__private_i32 %3 -> %arg1 : !llvm.ptr) {
+      omp.loop_nest (%arg2) : i32 = (%5) to (%4) inclusive step (%5) {
+        llvm.store %arg2, %arg1 : i32, !llvm.ptr
+        %9 = llvm.load %arg0 : !llvm.ptr -> i32
+        %10 = llvm.load %arg1 : !llvm.ptr -> i32
+        %11 = llvm.add %9, %10 : i32
+        llvm.store %11, %arg0 : i32, !llvm.ptr
+        omp.yield
+      }
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+// Call to outlined function
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE:.+]] = alloca i32
+// CHECK: store i32 0, ptr %[[PRIVATE]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic version not generated
+// CHECK: unreachable
+
+// Non atomic version
+// CHECK: call void @__kmpc_end_reduce
+
+// Finalize
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: call void @__kmpc_barrier
+
+// Reduction function.
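+// (REDFUNC is expected to be the combiner of @add_reduction_i32 above,
+// i.e. a plain i32 add of the partial sums.)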
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32
diff --git a/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir
new file mode 100644
index 0000000000000..800a833cf5601
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir
@@ -0,0 +1,79 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Only check the overall shape of the code and the presence of relevant
+// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level.
+
+omp.declare_reduction @add_reduction_i32 : i32 init {
+^bb0(%arg0: i32):
+  %0 = llvm.mlir.constant(0 : i32) : i32
+  omp.yield(%0 : i32)
+} combiner {
+^bb0(%arg0: i32, %arg1: i32):
+  %0 = llvm.add %arg0, %arg1 : i32
+  omp.yield(%0 : i32)
+}
+llvm.func @simple_teams_only_reduction_() attributes {fir.internal_name = "_QPsimple_teams_only_reduction", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "sum"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i64) : i64
+  %3 = llvm.alloca %2 x i32 {bindc_name = "index_"} : (i64) -> !llvm.ptr
+  %4 = llvm.mlir.constant(0 : index) : i64
+  %5 = llvm.mlir.constant(10000 : index) : i64
+  %6 = llvm.mlir.constant(1 : index) : i64
+  %7 = llvm.mlir.constant(0 : i32) : i32
+  %8 = llvm.mlir.constant(1 : i64) : i64
+  %9 = llvm.mlir.constant(1 : i64) : i64
+  llvm.store %7, %1 : i32, !llvm.ptr
+  omp.teams reduction(@add_reduction_i32 %1 -> %arg0 : !llvm.ptr) {
+    %10 = llvm.trunc %6 : i64 to i32
+    llvm.br ^bb1(%10, %5 : i32, i64)
+  ^bb1(%11: i32, %12: i64):  // 2 preds: ^bb0, ^bb2
+    %13 = llvm.icmp "sgt" %12, %4 : i64
+    llvm.cond_br %13, ^bb2, ^bb3
+  ^bb2:  // pred: ^bb1
+    llvm.store %11, %3 : i32, !llvm.ptr
+    %14 = llvm.load %arg0 : !llvm.ptr -> i32
+    %15 = llvm.load %3 : !llvm.ptr -> i32
+    %16 = llvm.add %14, %15 : i32
+    llvm.store %16, %arg0 : i32, !llvm.ptr
+    %17 = llvm.load %3 : !llvm.ptr -> i32
+    %18 = llvm.add %17, %10 overflow<nsw> : i32
+    %19 = llvm.sub %12, %6 : i64
+    llvm.br ^bb1(%18, %19 : i32, i64)
+  ^bb3:  // pred: ^bb1
+    llvm.store %11, %3 : i32, !llvm.ptr
+    omp.terminator
+  }
+  llvm.return
+}
+
+// Allocate reduction array
+// CHECK: %[[REDARRAY:[A-Za-z_.][A-Za-z0-9_.]*]] = alloca [1 x ptr], align 8
+// Call to outlined function
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+// Outlined function.
+
+// Private reduction variable and its initialization.
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// Check that the reduction array is passed in.
+// CHECK-SAME: %[[REDARRAY]]
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// CHECK: [[FINALIZE:.+]]:
+// CHECK: call void @__kmpc_barrier
+
+// Non atomic version
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE]]
+
+// Atomic version not generated
+// CHECK: unreachable
+
+// CHECK: define internal void @[[OUTLINED]]
+
+// Reduction function.
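+// (As in the distribute test, REDFUNC should combine the per-team copies of
+// "sum" with the i32 add from the @add_reduction_i32 combiner.)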
+// CHECK: define internal void @[[REDFUNC]] +// CHECK: add i32 diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index af31f8bab73ac..7eafe396082e4 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -536,34 +536,6 @@ llvm.func @teams_private(%x : !llvm.ptr) { // ----- -omp.declare_reduction @add_f32 : f32 -init { -^bb0(%arg: f32): - %0 = llvm.mlir.constant(0.0 : f32) : f32 - omp.yield (%0 : f32) -} -combiner { -^bb1(%arg0: f32, %arg1: f32): - %1 = llvm.fadd %arg0, %arg1 : f32 - omp.yield (%1 : f32) -} -atomic { -^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr): - %2 = llvm.load %arg3 : !llvm.ptr -> f32 - llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32 - omp.yield -} -llvm.func @teams_reduction(%x : !llvm.ptr) { - // expected-error@below {{not yet implemented: Unhandled clause reduction in omp.teams operation}} - // expected-error@below {{LLVM Translation failed for operation: omp.teams}} - omp.teams reduction(@add_f32 %x -> %prv : !llvm.ptr) { - omp.terminator - } - llvm.return -} - -// ----- - llvm.func @wsloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) { // expected-error@below {{not yet implemented: Unhandled clause allocate in omp.wsloop operation}} // expected-error@below {{LLVM Translation failed for operation: omp.wsloop}} diff --git a/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 new file mode 100644 index 0000000000000..cb84bcd3462cf --- /dev/null +++ b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 @@ -0,0 +1,27 @@ +! Basic offloading test with a target region +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +program main + use omp_lib + integer :: error = 0 + integer :: i + integer :: sum = 0 + + !$omp target parallel do reduction(+:sum) + do i = 1, 100 + sum = sum + i + end do + !$omp end target parallel do + + if (sum /= 5050) then + error = 1 + endif + + print *,"number of errors: ", error + +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: number of errors: 0 diff --git a/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 new file mode 100644 index 0000000000000..fab4950452478 --- /dev/null +++ b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 @@ -0,0 +1,27 @@ +! Basic offloading test with a target region +! REQUIRES: flang, amdgpu + +! RUN: %libomptarget-compile-fortran-generic +! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +program main + use omp_lib + integer :: error = 0 + integer :: i + integer :: sum = 0 + + !$omp target teams distribute parallel do reduction(+:sum) + do i = 1, 1000 + sum = sum + i + end do + !$omp end target teams distribute parallel do + + if (sum /= 500500) then + error = 1 + endif + + print *,"number of errors: ", error + +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} +! CHECK: number of errors: 0 From 4701afaeaf2aba604a6160cde04634be88e21022 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 7 Apr 2025 17:51:46 +0100 Subject: [PATCH 0873/1029] [X86] combineX86ShufflesRecursively - merge loops peeking through shuffle operands. NFC. 
Merge loops to peek through free insert_subvector / bitcasts / extract_subvector.

To keep this NFC I haven't reordered the peek-throughs - this will be done in a
future patch to help with the #133947 regressions.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 56b0f721383f1..bac5684733e60 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41115,9 +41115,9 @@ static SDValue combineX86ShufflesRecursively(
     }
   }
 
-  // Peek through vector widenings and set out of bounds mask indices to undef.
-  // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
   for (auto [I, Op] : enumerate(Ops)) {
+    // Peek through vector widenings + set out of bounds mask indices to undef.
+    // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
     if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
         isNullConstant(Op.getOperand(2))) {
       Op = Op.getOperand(1);
@@ -41130,10 +41130,8 @@ static SDValue combineX86ShufflesRecursively(
           M = SM_SentinelUndef;
       }
     }
-  }
 
-  // Peek through any free bitcasts/extract_subvector nodes back to root size.
-  for (SDValue &Op : Ops){
+    // Peek through any free bitcasts/extract_subvector nodes back to root size.
     SDValue BC = Op;
     if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse())
       BC = peekThroughOneUseBitcasts(BC);

From 783201b184572a07efe2dc6b6b9110873421cf11 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 7 Apr 2025 23:59:53 +0700
Subject: [PATCH 0874/1029] Attributor: Don't follow uses of ConstantData
 (#134573)

These should not really have uselists, and it's not worth the compile
time of looking at all uses of trivial constants.

The main observable change is that it no longer adds align attributes
on constant null uses, but those are not useful anyway.

Some of these cases should potentially be more aggressive and not look
at any Constant users.
---
 llvm/lib/Transforms/IPO/Attributor.cpp        |  3 +++
 .../Transforms/IPO/AttributorAttributes.cpp   |  9 ++++++++-
 .../Attributor/IPConstantProp/pthreads.ll     | 16 +++++++--------
 llvm/test/Transforms/Attributor/allocator.ll  | 12 +++++------
 llvm/test/Transforms/Attributor/callbacks.ll  | 20 +++++++++----------
 llvm/test/Transforms/Attributor/issue87856.ll |  4 ++--
 .../Transforms/Attributor/memory_locations.ll |  4 ++--
 llvm/test/Transforms/Attributor/noalias.ll    |  4 ++--
 .../test/Transforms/Attributor/nocapture-1.ll |  6 +++---
 llvm/test/Transforms/Attributor/noundef.ll    |  4 ++--
 .../Attributor/undefined_behavior.ll          | 16 +++++++--------
 .../Attributor/value-simplify-local-remote.ll |  2 +-
 .../Transforms/Attributor/value-simplify.ll   | 10 +++++-----
 13 files changed, 60 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index fe789c550a1bf..4d65354455379 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -1769,6 +1769,9 @@ bool Attributor::checkForAllUses(
     if (!CB(*this, &QueryingAA))
       return false;
 
+  if (isa<ConstantData>(V))
+    return false;
+
   // Check the trivial case first as it catches void values.
   if (V.use_empty())
     return true;
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 0b39c8061b594..a477c90bb4f45 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -643,6 +643,10 @@ static void followUsesInContext(AAType &AA, Attributor &A,
 template <class AAType, typename StateType = typename AAType::StateType>
 static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
                              Instruction &CtxI) {
+  const Value &Val = AA.getIRPosition().getAssociatedValue();
+  if (isa<ConstantData>(Val))
+    return;
+
   MustBeExecutedContextExplorer *Explorer =
       A.getInfoCache().getMustBeExecutedContextExplorer();
   if (!Explorer)
@@ -650,7 +654,7 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
 
   // Container for (transitive) uses of the associated value.
   SetVector<const Use *> Uses;
-  for (const Use &U : AA.getIRPosition().getAssociatedValue().uses())
+  for (const Use &U : Val.uses())
     Uses.insert(&U);
 
   followUsesInContext(AA, A, *Explorer, &CtxI, Uses, S);
@@ -5283,6 +5287,9 @@ struct AAAlignImpl : AAAlign {
 
     // Check for users that allow alignment annotations.
     Value &AssociatedValue = getAssociatedValue();
+    if (isa<ConstantData>(AssociatedValue))
+      return ChangeStatus::UNCHANGED;
+
     for (const Use &U : AssociatedValue.uses()) {
       if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
         if (SI->getPointerOperand() == &AssociatedValue)
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
index 5accfc9b5ce6c..502751147f884 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
@@ -37,10 +37,10 @@ define dso_local i32 @main() {
 ; TUNIT-NEXT:    [[ALLOC1:%.*]] = alloca i8, align 8
 ; TUNIT-NEXT:    [[ALLOC2:%.*]] = alloca i8, align 8
 ; TUNIT-NEXT:    [[THREAD:%.*]] = alloca i64, align 8
-; TUNIT-NEXT:    [[CALL:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @foo, ptr nofree readnone align 4294967296 undef)
-; TUNIT-NEXT:    [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @bar, ptr noalias nofree nonnull readnone align 8 captures(none) dereferenceable(8) undef)
-; TUNIT-NEXT:    [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @baz, ptr noalias nofree noundef nonnull readnone align 8 captures(none) dereferenceable(1) [[ALLOC1]])
-; TUNIT-NEXT:    [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @buz, ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[ALLOC2]])
+; TUNIT-NEXT:    [[CALL:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @foo, ptr nofree readnone undef)
+; TUNIT-NEXT:    [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @bar, ptr noalias nofree nonnull readnone align 8 captures(none) dereferenceable(8) undef)
+; TUNIT-NEXT:    [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @baz, ptr noalias nofree noundef nonnull readnone
align 8 captures(none) dereferenceable(1) [[ALLOC1]]) +; TUNIT-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @buz, ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[ALLOC2]]) ; TUNIT-NEXT: ret i32 0 ; ; CGSCC-LABEL: define {{[^@]+}}@main() { @@ -48,10 +48,10 @@ define dso_local i32 @main() { ; CGSCC-NEXT: [[ALLOC1:%.*]] = alloca i8, align 8 ; CGSCC-NEXT: [[ALLOC2:%.*]] = alloca i8, align 8 ; CGSCC-NEXT: [[THREAD:%.*]] = alloca i64, align 8 -; CGSCC-NEXT: [[CALL:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @foo, ptr nofree noundef readnone align 4294967296 null) -; CGSCC-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @bar, ptr noalias nofree noundef nonnull readnone align 8 captures(none) dereferenceable(8) @GlobalVPtr) -; CGSCC-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @baz, ptr noalias nofree noundef nonnull readnone align 8 captures(none) dereferenceable(1) [[ALLOC1]]) -; CGSCC-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @buz, ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(1) [[ALLOC2]]) +; CGSCC-NEXT: [[CALL:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @foo, ptr nofree noundef readnone null) +; CGSCC-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @bar, ptr noalias nofree noundef nonnull readnone align 8 captures(none) dereferenceable(8) @GlobalVPtr) +; CGSCC-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @baz, ptr noalias nofree noundef nonnull readnone align 8 captures(none) dereferenceable(1) [[ALLOC1]]) +; CGSCC-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @buz, ptr noalias nofree noundef nonnull readnone align 8 dereferenceable(1) [[ALLOC2]]) ; CGSCC-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/Attributor/allocator.ll b/llvm/test/Transforms/Attributor/allocator.ll index 4f74a59176cbb..b6c7e4ccc0218 100644 --- a/llvm/test/Transforms/Attributor/allocator.ll +++ b/llvm/test/Transforms/Attributor/allocator.ll @@ -425,21 +425,21 @@ define dso_local void @pthread_test(){ ; TUNIT-LABEL: define dso_local void @pthread_test() { ; TUNIT-NEXT: [[ARG1:%.*]] = alloca i8, align 8 ; TUNIT-NEXT: [[THREAD:%.*]] = alloca i64, align 8 -; TUNIT-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @pthread_allocation_should_remain_same, ptr noundef nonnull align 8 dereferenceable(1) [[ARG1]]) +; TUNIT-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @pthread_allocation_should_remain_same, ptr noundef nonnull 
align 8 dereferenceable(1) [[ARG1]]) ; TUNIT-NEXT: [[F1:%.*]] = alloca i8, i32 4, align 4 -; TUNIT-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @pthread_allocation_should_be_reduced, ptr noalias nofree nonnull readnone align 4 captures(none) dereferenceable(12) undef) +; TUNIT-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @pthread_allocation_should_be_reduced, ptr noalias nofree nonnull readnone align 4 captures(none) dereferenceable(12) undef) ; TUNIT-NEXT: [[F2:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 -; TUNIT-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @pthread_check_captured_pointer, ptr noundef nonnull align 4 dereferenceable(12) [[F2]]) +; TUNIT-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @pthread_check_captured_pointer, ptr noundef nonnull align 4 dereferenceable(12) [[F2]]) ; TUNIT-NEXT: ret void ; ; CGSCC-LABEL: define dso_local void @pthread_test() { ; CGSCC-NEXT: [[ARG1:%.*]] = alloca i8, align 8 ; CGSCC-NEXT: [[THREAD:%.*]] = alloca i64, align 8 -; CGSCC-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @pthread_allocation_should_remain_same, ptr noundef nonnull align 8 dereferenceable(1) [[ARG1]]) +; CGSCC-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @pthread_allocation_should_remain_same, ptr noundef nonnull align 8 dereferenceable(1) [[ARG1]]) ; CGSCC-NEXT: [[F:%.*]] = alloca [[STRUCT_FOO:%.*]], align 4 -; CGSCC-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @pthread_allocation_should_be_reduced, ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(12) [[F]]) +; CGSCC-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @pthread_allocation_should_be_reduced, ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(12) [[F]]) ; CGSCC-NEXT: [[F2:%.*]] = alloca [[STRUCT_FOO]], align 4 -; CGSCC-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef align 4294967296 null, ptr noundef nonnull @pthread_check_captured_pointer, ptr noundef nonnull align 4 dereferenceable(12) [[F2]]) +; CGSCC-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(ptr noundef nonnull align 8 dereferenceable(8) [[THREAD]], ptr noundef null, ptr noundef nonnull @pthread_check_captured_pointer, ptr noundef nonnull align 4 dereferenceable(12) [[F2]]) ; CGSCC-NEXT: ret void ; %arg1 = alloca i8, align 8 diff --git a/llvm/test/Transforms/Attributor/callbacks.ll b/llvm/test/Transforms/Attributor/callbacks.ll index 9e1db3f56cbed..80a0b2befbbee 100644 --- a/llvm/test/Transforms/Attributor/callbacks.ll +++ b/llvm/test/Transforms/Attributor/callbacks.ll @@ -22,7 +22,7 @@ define void @t0_caller(ptr %a) { ; TUNIT-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; TUNIT-NEXT: store 
i32 42, ptr [[B]], align 32 ; TUNIT-NEXT: store ptr [[B]], ptr [[C]], align 64 -; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t0_callback_broker(ptr noundef align 4294967296 null, ptr noundef nonnull align 128 dereferenceable(4) [[PTR]], ptr noundef nonnull @t0_callback_callee, ptr align 256 [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t0_callback_broker(ptr noundef null, ptr noundef nonnull align 128 dereferenceable(4) [[PTR]], ptr noundef nonnull @t0_callback_callee, ptr align 256 [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; TUNIT-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@t0_caller @@ -33,7 +33,7 @@ define void @t0_caller(ptr %a) { ; CGSCC-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; CGSCC-NEXT: store i32 42, ptr [[B]], align 32 ; CGSCC-NEXT: store ptr [[B]], ptr [[C]], align 64 -; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t0_callback_broker(ptr noundef align 4294967296 null, ptr noundef nonnull align 128 dereferenceable(4) [[PTR]], ptr noundef nonnull @t0_callback_callee, ptr align 256 [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t0_callback_broker(ptr noundef null, ptr noundef nonnull align 128 dereferenceable(4) [[PTR]], ptr noundef nonnull @t0_callback_callee, ptr align 256 [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; CGSCC-NEXT: ret void ; entry: @@ -95,7 +95,7 @@ define void @t1_caller(ptr noalias %a) { ; TUNIT-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; TUNIT-NEXT: store i32 42, ptr [[B]], align 32 ; TUNIT-NEXT: store ptr [[B]], ptr [[C]], align 64 -; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t1_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t1_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t1_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t1_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; TUNIT-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@t1_caller @@ -106,7 +106,7 @@ define void @t1_caller(ptr noalias %a) { ; CGSCC-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; CGSCC-NEXT: store i32 42, ptr [[B]], align 32 ; CGSCC-NEXT: store ptr [[B]], ptr [[C]], align 64 -; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t1_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t1_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) 
@t1_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t1_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; CGSCC-NEXT: ret void ; entry: @@ -168,7 +168,7 @@ define void @t2_caller(ptr noalias %a) { ; TUNIT-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; TUNIT-NEXT: store i32 42, ptr [[B]], align 32 ; TUNIT-NEXT: store ptr [[B]], ptr [[C]], align 64 -; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t2_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t2_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t2_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t2_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; TUNIT-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@t2_caller @@ -179,7 +179,7 @@ define void @t2_caller(ptr noalias %a) { ; CGSCC-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; CGSCC-NEXT: store i32 42, ptr [[B]], align 32 ; CGSCC-NEXT: store ptr [[B]], ptr [[C]], align 64 -; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t2_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t2_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t2_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t2_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; CGSCC-NEXT: ret void ; entry: @@ -241,8 +241,8 @@ define void @t3_caller(ptr noalias %a) { ; TUNIT-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; TUNIT-NEXT: store i32 42, ptr [[B]], align 32 ; TUNIT-NEXT: store ptr [[B]], ptr [[C]], align 64 -; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t3_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) -; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t3_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) 
@t3_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; TUNIT-NEXT: call void (ptr, ptr, ptr, ...) @t3_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 undef, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; TUNIT-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@t3_caller @@ -253,8 +253,8 @@ define void @t3_caller(ptr noalias %a) { ; CGSCC-NEXT: [[PTR:%.*]] = alloca i32, align 128 ; CGSCC-NEXT: store i32 42, ptr [[B]], align 32 ; CGSCC-NEXT: store ptr [[B]], ptr [[C]], align 64 -; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t3_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) -; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t3_callback_broker(ptr noundef align 4294967296 null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) @t3_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) +; CGSCC-NEXT: call void (ptr, ptr, ptr, ...) 
@t3_callback_broker(ptr noundef null, ptr noalias noundef nonnull align 128 captures(none) dereferenceable(4) [[PTR]], ptr noundef nonnull captures(none) @t3_callback_callee, ptr align 256 captures(none) [[A]], i64 noundef 99, ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(8) [[C]]) ; CGSCC-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/Attributor/issue87856.ll b/llvm/test/Transforms/Attributor/issue87856.ll index aa7072b335b40..4990ef909dfaa 100644 --- a/llvm/test/Transforms/Attributor/issue87856.ll +++ b/llvm/test/Transforms/Attributor/issue87856.ll @@ -4,7 +4,7 @@ define void @null_ptr_is_valid_call_with_null() #0 { ; CHECK-LABEL: define void @null_ptr_is_valid_call_with_null( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: call void @store_as0(ptr nofree noundef writeonly align 4294967296 null) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: call void @store_as0(ptr nofree noundef writeonly null) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: ret void ; call void @store_as0(ptr null) @@ -34,7 +34,7 @@ define void @store_as0(ptr %0) { define void @call_store_as1() { ; CHECK-LABEL: define void @call_store_as1( ; CHECK-SAME: ) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: call void @store_as1(ptr addrspace(1) nofree noundef writeonly align 4294967296 captures(none) null) #[[ATTR4]] +; CHECK-NEXT: call void @store_as1(ptr addrspace(1) nofree noundef writeonly captures(none) null) #[[ATTR4]] ; CHECK-NEXT: ret void ; call void @store_as1(ptr addrspace(1) null) diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll b/llvm/test/Transforms/Attributor/memory_locations.ll index 22cdf299bfc3e..9c27fcaacc749 100644 --- a/llvm/test/Transforms/Attributor/memory_locations.ll +++ b/llvm/test/Transforms/Attributor/memory_locations.ll @@ -368,7 +368,7 @@ define void @callerC2() { } define void @callerD1() { ; CHECK-LABEL: define {{[^@]+}}@callerD1() { -; CHECK-NEXT: [[UNKNOWN:%.*]] = call ptr @argmem_only(ptr noundef align 4294967296 null) +; CHECK-NEXT: [[UNKNOWN:%.*]] = call ptr @argmem_only(ptr noundef null) ; CHECK-NEXT: store i8 0, ptr [[UNKNOWN]], align 1 ; CHECK-NEXT: ret void ; @@ -378,7 +378,7 @@ define void @callerD1() { } define void @callerD2() { ; CHECK-LABEL: define {{[^@]+}}@callerD2() { -; CHECK-NEXT: [[UNKNOWN:%.*]] = call ptr @inaccesible_argmem_only_decl(ptr noundef align 4294967296 null) +; CHECK-NEXT: [[UNKNOWN:%.*]] = call ptr @inaccesible_argmem_only_decl(ptr noundef null) ; CHECK-NEXT: store i8 0, ptr [[UNKNOWN]], align 1 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/Attributor/noalias.ll b/llvm/test/Transforms/Attributor/noalias.ll index c22be2297ff25..46d9f77660253 100644 --- a/llvm/test/Transforms/Attributor/noalias.ll +++ b/llvm/test/Transforms/Attributor/noalias.ll @@ -293,13 +293,13 @@ define internal void @test9a(ptr %a, ptr %b) { ; TUNIT: Function Attrs: memory(readwrite, argmem: none) ; TUNIT-LABEL: define {{[^@]+}}@test9a ; TUNIT-SAME: () #[[ATTR4:[0-9]+]] { -; TUNIT-NEXT: call void @use_i8(ptr noundef align 4294967296 null) +; TUNIT-NEXT: call void @use_i8(ptr noundef null) ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: memory(readwrite, argmem: none) ; CGSCC-LABEL: define {{[^@]+}}@test9a ; CGSCC-SAME: () #[[ATTR5:[0-9]+]] { -; CGSCC-NEXT: call void @use_i8(ptr noundef align 4294967296 null) +; CGSCC-NEXT: call void @use_i8(ptr noundef null) ; CGSCC-NEXT: ret void ; call void @use_i8(ptr null) diff --git a/llvm/test/Transforms/Attributor/nocapture-1.ll b/llvm/test/Transforms/Attributor/nocapture-1.ll index 
b9d2aaf972b23..c6097b3868d16 100644 --- a/llvm/test/Transforms/Attributor/nocapture-1.ll +++ b/llvm/test/Transforms/Attributor/nocapture-1.ll @@ -478,7 +478,7 @@ define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { ; TUNIT-SAME: (ptr nofree readnone captures(none) [[X4_2:%.*]], ptr nofree readnone returned "no-capture-maybe-returned" [[Y4_2:%.*]], ptr nofree readnone captures(none) [[Z4_2:%.*]], i1 noundef [[C:%.*]]) #[[ATTR7]] { ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: -; TUNIT-NEXT: call void @test4_1(ptr nofree noundef readnone align 4294967296 null, i1 noundef [[C]]) #[[ATTR7]] +; TUNIT-NEXT: call void @test4_1(ptr nofree noundef readnone null, i1 noundef [[C]]) #[[ATTR7]] ; TUNIT-NEXT: store ptr null, ptr @g, align 8 ; TUNIT-NEXT: br label [[F]] ; TUNIT: f: @@ -489,7 +489,7 @@ define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { ; CGSCC-SAME: (ptr nofree readnone captures(none) [[X4_2:%.*]], ptr nofree readnone returned "no-capture-maybe-returned" [[Y4_2:%.*]], ptr nofree readnone captures(none) [[Z4_2:%.*]], i1 noundef [[C:%.*]]) #[[ATTR10]] { ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: -; CGSCC-NEXT: call void @test4_1(ptr nofree noundef readnone align 4294967296 null, i1 noundef [[C]]) #[[ATTR10]] +; CGSCC-NEXT: call void @test4_1(ptr nofree noundef readnone null, i1 noundef [[C]]) #[[ATTR10]] ; CGSCC-NEXT: store ptr null, ptr @g, align 8 ; CGSCC-NEXT: br label [[F]] ; CGSCC: f: @@ -771,7 +771,7 @@ declare void @unknown(ptr) define void @test_callsite() { ; CHECK-LABEL: define {{[^@]+}}@test_callsite() { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @unknown(ptr noundef align 4294967296 null) +; CHECK-NEXT: call void @unknown(ptr noundef null) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/Attributor/noundef.ll b/llvm/test/Transforms/Attributor/noundef.ll index 1855a93449a44..e71118f6e7a28 100644 --- a/llvm/test/Transforms/Attributor/noundef.ll +++ b/llvm/test/Transforms/Attributor/noundef.ll @@ -49,11 +49,11 @@ define internal void @argument_dead_callback_callee(ptr %c) { define void @callback_caller() { ; TUNIT-LABEL: define {{[^@]+}}@callback_caller() { -; TUNIT-NEXT: call void @callback_broker(ptr noundef nonnull @argument_dead_callback_callee, ptr nofree readnone align 4294967296 undef) +; TUNIT-NEXT: call void @callback_broker(ptr noundef nonnull @argument_dead_callback_callee, ptr nofree readnone undef) ; TUNIT-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@callback_caller() { -; CGSCC-NEXT: call void @callback_broker(ptr noundef nonnull @argument_dead_callback_callee, ptr nofree noundef readnone align 4294967296 null) +; CGSCC-NEXT: call void @callback_broker(ptr noundef nonnull @argument_dead_callback_callee, ptr nofree noundef readnone null) ; CGSCC-NEXT: ret void ; call void @callback_broker(ptr @argument_dead_callback_callee, ptr null) diff --git a/llvm/test/Transforms/Attributor/undefined_behavior.ll b/llvm/test/Transforms/Attributor/undefined_behavior.ll index 9839bd5521329..7c81910028658 100644 --- a/llvm/test/Transforms/Attributor/undefined_behavior.ll +++ b/llvm/test/Transforms/Attributor/undefined_behavior.ll @@ -103,13 +103,13 @@ define void @store_wholly_unreachable_volatile() { ; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(none) ; TUNIT-LABEL: define {{[^@]+}}@store_wholly_unreachable_volatile ; TUNIT-SAME: () #[[ATTR2:[0-9]+]] { -; TUNIT-NEXT: store volatile i32 5, ptr null, align 4294967296 +; TUNIT-NEXT: store volatile i32 5, 
ptr null, align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@store_wholly_unreachable_volatile ; CGSCC-SAME: () #[[ATTR3:[0-9]+]] { -; CGSCC-NEXT: store volatile i32 5, ptr null, align 4294967296 +; CGSCC-NEXT: store volatile i32 5, ptr null, align 4 ; CGSCC-NEXT: ret void ; store volatile i32 5, ptr null @@ -138,13 +138,13 @@ define void @store_null_pointer_is_defined() null_pointer_is_valid { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(write) ; TUNIT-LABEL: define {{[^@]+}}@store_null_pointer_is_defined ; TUNIT-SAME: () #[[ATTR3:[0-9]+]] { -; TUNIT-NEXT: store i32 5, ptr null, align 4294967296 +; TUNIT-NEXT: store i32 5, ptr null, align 4 ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(write) ; CGSCC-LABEL: define {{[^@]+}}@store_null_pointer_is_defined ; CGSCC-SAME: () #[[ATTR4:[0-9]+]] { -; CGSCC-NEXT: store i32 5, ptr null, align 4294967296 +; CGSCC-NEXT: store i32 5, ptr null, align 4 ; CGSCC-NEXT: ret void ; store i32 5, ptr null @@ -771,7 +771,7 @@ define void @arg_nonnull_violation3_1(i1 %c) { ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: ; TUNIT-NEXT: call void @arg_nonnull_12(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) #[[ATTR7:[0-9]+]] -; TUNIT-NEXT: call void @arg_nonnull_12(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly align 4294967296 null) #[[ATTR7]] +; TUNIT-NEXT: call void @arg_nonnull_12(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly null) #[[ATTR7]] ; TUNIT-NEXT: unreachable ; TUNIT: f: ; TUNIT-NEXT: unreachable @@ -785,7 +785,7 @@ define void @arg_nonnull_violation3_1(i1 %c) { ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: ; CGSCC-NEXT: call void @arg_nonnull_12(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) #[[ATTR12:[0-9]+]] -; CGSCC-NEXT: call void @arg_nonnull_12(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly align 4294967296 null) #[[ATTR12]] +; CGSCC-NEXT: call void @arg_nonnull_12(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly null) #[[ATTR12]] ; CGSCC-NEXT: unreachable ; CGSCC: f: ; CGSCC-NEXT: unreachable @@ -818,7 +818,7 @@ define void @arg_nonnull_violation3_2(i1 %c) { ; TUNIT-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; TUNIT: t: ; TUNIT-NEXT: call void @arg_nonnull_12_noundef_2(ptr nofree noundef nonnull writeonly align 4 
captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) #[[ATTR7]] -; TUNIT-NEXT: call void @arg_nonnull_12_noundef_2(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly align 4294967296 null) #[[ATTR7]] +; TUNIT-NEXT: call void @arg_nonnull_12_noundef_2(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly null) #[[ATTR7]] ; TUNIT-NEXT: unreachable ; TUNIT: f: ; TUNIT-NEXT: unreachable @@ -832,7 +832,7 @@ define void @arg_nonnull_violation3_2(i1 %c) { ; CGSCC-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CGSCC: t: ; CGSCC-NEXT: call void @arg_nonnull_12_noundef_2(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) #[[ATTR12]] -; CGSCC-NEXT: call void @arg_nonnull_12_noundef_2(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly align 4294967296 null) #[[ATTR12]] +; CGSCC-NEXT: call void @arg_nonnull_12_noundef_2(ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef nonnull writeonly align 4 captures(none) dereferenceable(4) [[PTR]], ptr nofree noundef writeonly null) #[[ATTR12]] ; CGSCC-NEXT: unreachable ; CGSCC: f: ; CGSCC-NEXT: unreachable diff --git a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll index e349d927611d9..374d5ba7ff52b 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-local-remote.ll @@ -310,7 +310,7 @@ entry: define weak_odr void @t3() { ; CHECK-LABEL: define {{[^@]+}}@t3() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr noundef align 4294967296 null) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr noundef null) ; CHECK-NEXT: br label [[USER_CODE_ENTRY:%.*]] ; CHECK: user_code.entry: ; CHECK-NEXT: br label [[FOR_COND:%.*]] diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll index 59160c0834980..87ad0e5be0231 100644 --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -502,14 +502,14 @@ define ptr @complicated_args_preallocated() { ; TUNIT-LABEL: define {{[^@]+}}@complicated_args_preallocated ; TUNIT-SAME: () #[[ATTR3:[0-9]+]] { ; TUNIT-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 noundef 1) #[[ATTR10:[0-9]+]] -; TUNIT-NEXT: [[CALL:%.*]] = call noundef nonnull align 4294967296 dereferenceable(4) ptr @test_preallocated(ptr nofree noundef writeonly preallocated(i32) align 4294967296 null) #[[ATTR9]] [ "preallocated"(token [[C]]) ] +; TUNIT-NEXT: [[CALL:%.*]] = call noundef nonnull align 4294967296 dereferenceable(4) ptr @test_preallocated(ptr nofree noundef writeonly 
preallocated(i32) null) #[[ATTR9]] [ "preallocated"(token [[C]]) ] ; TUNIT-NEXT: ret ptr [[CALL]] ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn ; CGSCC-LABEL: define {{[^@]+}}@complicated_args_preallocated ; CGSCC-SAME: () #[[ATTR4:[0-9]+]] { ; CGSCC-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 noundef 1) #[[ATTR13]] -; CGSCC-NEXT: [[CALL:%.*]] = call ptr @test_preallocated(ptr nofree noundef writeonly preallocated(i32) align 4294967296 null) #[[ATTR14:[0-9]+]] [ "preallocated"(token [[C]]) ] +; CGSCC-NEXT: [[CALL:%.*]] = call ptr @test_preallocated(ptr nofree noundef writeonly preallocated(i32) null) #[[ATTR14:[0-9]+]] [ "preallocated"(token [[C]]) ] ; CGSCC-NEXT: unreachable ; %c = call token @llvm.call.preallocated.setup(i32 1) @@ -541,13 +541,13 @@ define void @complicated_args_sret(ptr %b) { ; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; TUNIT-LABEL: define {{[^@]+}}@complicated_args_sret ; TUNIT-SAME: (ptr nofree writeonly captures(none) [[B:%.*]]) #[[ATTR4]] { -; TUNIT-NEXT: call void @test_sret(ptr nofree noundef writeonly sret([[STRUCT_X:%.*]]) align 4294967296 null, ptr nofree noundef writeonly align 8 captures(none) [[B]]) #[[ATTR11:[0-9]+]] +; TUNIT-NEXT: call void @test_sret(ptr nofree noundef writeonly sret([[STRUCT_X:%.*]]) null, ptr nofree noundef writeonly align 8 captures(none) [[B]]) #[[ATTR11:[0-9]+]] ; TUNIT-NEXT: ret void ; ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write) ; CGSCC-LABEL: define {{[^@]+}}@complicated_args_sret ; CGSCC-SAME: (ptr nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[B:%.*]]) #[[ATTR6:[0-9]+]] { -; CGSCC-NEXT: call void @test_sret(ptr nofree noundef writeonly sret([[STRUCT_X:%.*]]) align 4294967296 dereferenceable_or_null(8) null, ptr nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[B]]) #[[ATTR15:[0-9]+]] +; CGSCC-NEXT: call void @test_sret(ptr nofree noundef writeonly sret([[STRUCT_X:%.*]]) dereferenceable_or_null(8) null, ptr nofree noundef nonnull writeonly align 8 captures(none) dereferenceable(8) [[B]]) #[[ATTR15:[0-9]+]] ; CGSCC-NEXT: ret void ; call void @test_sret(ptr sret(%struct.X) null, ptr %b) @@ -571,7 +571,7 @@ define ptr @complicated_args_nest() { ; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) ; CGSCC-LABEL: define {{[^@]+}}@complicated_args_nest ; CGSCC-SAME: () #[[ATTR3]] { -; CGSCC-NEXT: [[CALL:%.*]] = call noalias noundef align 4294967296 ptr @test_nest(ptr nofree noundef readnone align 4294967296 null) #[[ATTR12]] +; CGSCC-NEXT: [[CALL:%.*]] = call noalias noundef align 4294967296 ptr @test_nest(ptr nofree noundef readnone null) #[[ATTR12]] ; CGSCC-NEXT: ret ptr [[CALL]] ; %call = call ptr @test_nest(ptr null) From 23c27f3efcdda730b365698ade5fd0c1c283f2e7 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 10:24:27 -0700 Subject: [PATCH 0875/1029] [NFC][LLVM][AArch64] Cleanup pass initialization for AArch64 (#134315) - Remove calls to pass initialization from pass constructors. 
- https://github.com/llvm/llvm-project/issues/111767 --- .../Target/AArch64/AArch64A53Fix835769.cpp | 4 +- .../AArch64/AArch64A57FPLoadBalancing.cpp | 4 +- .../AArch64/AArch64AdvSIMDScalarPass.cpp | 4 +- .../AArch64/AArch64Arm64ECCallLowering.cpp | 4 +- .../AArch64/AArch64CompressJumpTables.cpp | 4 +- .../Target/AArch64/AArch64CondBrTuning.cpp | 4 +- .../AArch64/AArch64ConditionOptimizer.cpp | 4 +- .../AArch64/AArch64ConditionalCompares.cpp | 4 +- .../AArch64DeadRegisterDefinitionsPass.cpp | 5 +- .../AArch64/AArch64ExpandPseudoInsts.cpp | 4 +- .../AArch64/AArch64LoadStoreOptimizer.cpp | 4 +- .../AArch64LowerHomogeneousPrologEpilog.cpp | 5 +- .../Target/AArch64/AArch64MIPeepholeOpt.cpp | 4 +- .../AArch64/AArch64PostCoalescerPass.cpp | 4 +- .../Target/AArch64/AArch64PromoteConstant.cpp | 4 +- .../AArch64RedundantCopyElimination.cpp | 5 +- .../Target/AArch64/AArch64SIMDInstrOpt.cpp | 4 +- .../AArch64/AArch64SpeculationHardening.cpp | 4 +- .../Target/AArch64/AArch64StackTagging.cpp | 4 +- .../AArch64/AArch64StackTaggingPreRA.cpp | 4 +- .../AArch64/AArch64StorePairSuppress.cpp | 4 +- .../Target/AArch64/AArch64TargetMachine.cpp | 82 ++++++++++--------- .../GISel/AArch64O0PreLegalizerCombiner.cpp | 2 - .../GISel/AArch64PostLegalizerCombiner.cpp | 2 - .../GISel/AArch64PostLegalizerLowering.cpp | 2 - .../GISel/AArch64PostSelectOptimize.cpp | 7 +- .../GISel/AArch64PreLegalizerCombiner.cpp | 2 - 27 files changed, 64 insertions(+), 120 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index b3a7c737097f0..2760355ae6107 100644 --- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -81,9 +81,7 @@ class AArch64A53Fix835769 : public MachineFunctionPass { public: static char ID; - explicit AArch64A53Fix835769() : MachineFunctionPass(ID) { - initializeAArch64A53Fix835769Pass(*PassRegistry::getPassRegistry()); - } + explicit AArch64A53Fix835769() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &F) override; diff --git a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 218cded84d76b..87bc925c6dc16 100644 --- a/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -112,9 +112,7 @@ class AArch64A57FPLoadBalancing : public MachineFunctionPass { public: static char ID; - explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) { - initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry()); - } + explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &F) override; diff --git a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 9e31243cd696c..08a6fa2ea8db0 100644 --- a/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -82,9 +82,7 @@ class AArch64AdvSIMDScalar : public MachineFunctionPass { public: static char ID; // Pass identification, replacement for typeid. 
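// A minimal sketch of the resulting pattern (hypothetical `ExamplePass`, not
// part of this patch): the constructor stays empty, the INITIALIZE_PASS
// machinery generates the registration hook, and that hook is invoked once,
// centrally, from LLVMInitializeAArch64Target() rather than from every
// constructor call:
//
//   class ExamplePass : public MachineFunctionPass {
//   public:
//     static char ID;
//     ExamplePass() : MachineFunctionPass(ID) {} // no PassRegistry call here
//     bool runOnMachineFunction(MachineFunction &MF) override;
//   };
//   char ExamplePass::ID = 0;
//   INITIALIZE_PASS(ExamplePass, "aarch64-example", "Example pass", false, false)
//
//   // In LLVMInitializeAArch64Target():
//   //   initializeExamplePassPass(*PassRegistry::getPassRegistry());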
- explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) { - initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry()); - } + explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &F) override; diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 9553a44fb317e..11e2c940548e2 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -63,9 +63,7 @@ struct ThunkArgInfo { class AArch64Arm64ECCallLowering : public ModulePass { public: static char ID; - AArch64Arm64ECCallLowering() : ModulePass(ID) { - initializeAArch64Arm64ECCallLoweringPass(*PassRegistry::getPassRegistry()); - } + AArch64Arm64ECCallLowering() : ModulePass(ID) {} Function *buildExitThunk(FunctionType *FnTy, AttributeList Attrs); Function *buildEntryThunk(Function *F); diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp index ecab42b89ec30..6621a1f2fc764 100644 --- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp +++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp @@ -47,9 +47,7 @@ class AArch64CompressJumpTables : public MachineFunctionPass { public: static char ID; - AArch64CompressJumpTables() : MachineFunctionPass(ID) { - initializeAArch64CompressJumpTablesPass(*PassRegistry::getPassRegistry()); - } + AArch64CompressJumpTables() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp index a091ab45c7737..96d7ce08a02e2 100644 --- a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -52,9 +52,7 @@ class AArch64CondBrTuning : public MachineFunctionPass { public: static char ID; - AArch64CondBrTuning() : MachineFunctionPass(ID) { - initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry()); - } + AArch64CondBrTuning() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; } diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index 68243258a68f5..4c9f8c2723493 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -103,9 +103,7 @@ class AArch64ConditionOptimizer : public MachineFunctionPass { static char ID; - AArch64ConditionOptimizer() : MachineFunctionPass(ID) { - initializeAArch64ConditionOptimizerPass(*PassRegistry::getPassRegistry()); - } + AArch64ConditionOptimizer() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override; MachineInstr *findSuitableCompare(MachineBasicBlock *MBB); diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 0301032e84977..9b59ee6483cd9 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -771,9 +771,7 @@ class AArch64ConditionalCompares : public MachineFunctionPass { public: static char ID; - AArch64ConditionalCompares() : MachineFunctionPass(ID) { - 
initializeAArch64ConditionalComparesPass(*PassRegistry::getPassRegistry()); - } + AArch64ConditionalCompares() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 37222bf34426b..71284b0574e57 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -40,10 +40,7 @@ class AArch64DeadRegisterDefinitions : public MachineFunctionPass { void processMachineBasicBlock(MachineBasicBlock &MBB); public: static char ID; // Pass identification, replacement for typeid. - AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) { - initializeAArch64DeadRegisterDefinitionsPass( - *PassRegistry::getPassRegistry()); - } + AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &F) override; diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 55a441b7d22b6..5e491bba786fa 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -50,9 +50,7 @@ class AArch64ExpandPseudo : public MachineFunctionPass { static char ID; - AArch64ExpandPseudo() : MachineFunctionPass(ID) { - initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); - } + AArch64ExpandPseudo() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &Fn) override; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index cd976790ebb6f..06e633effe874 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -124,9 +124,7 @@ using LdStPairFlags = struct LdStPairFlags { struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; - AArch64LoadStoreOpt() : MachineFunctionPass(ID) { - initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); - } + AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} AliasAnalysis *AA; const AArch64InstrInfo *TII; diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp index b5911ac09cc18..7cdcd5416cfc1 100644 --- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp +++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp @@ -73,10 +73,7 @@ class AArch64LowerHomogeneousPrologEpilog : public ModulePass { public: static char ID; - AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) { - initializeAArch64LowerHomogeneousPrologEpilogPass( - *PassRegistry::getPassRegistry()); - } + AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp index 36a7becbc76d3..54347b610c507 100644 --- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp @@ -84,9 +84,7 @@ namespace { struct AArch64MIPeepholeOpt : public MachineFunctionPass { static char ID; - AArch64MIPeepholeOpt() : MachineFunctionPass(ID) { - 
initializeAArch64MIPeepholeOptPass(*PassRegistry::getPassRegistry()); - } + AArch64MIPeepholeOpt() : MachineFunctionPass(ID) {} const AArch64InstrInfo *TII; const AArch64RegisterInfo *TRI; diff --git a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp index c399de0c56e34..cdf2822f3ed9d 100644 --- a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp @@ -21,9 +21,7 @@ namespace { struct AArch64PostCoalescer : public MachineFunctionPass { static char ID; - AArch64PostCoalescer() : MachineFunctionPass(ID) { - initializeAArch64PostCoalescerPass(*PassRegistry::getPassRegistry()); - } + AArch64PostCoalescer() : MachineFunctionPass(ID) {} LiveIntervals *LIS; MachineRegisterInfo *MRI; diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index 0e0b23ea41639..8edf1d0f9296b 100644 --- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -108,9 +108,7 @@ class AArch64PromoteConstant : public ModulePass { static char ID; - AArch64PromoteConstant() : ModulePass(ID) { - initializeAArch64PromoteConstantPass(*PassRegistry::getPassRegistry()); - } + AArch64PromoteConstant() : ModulePass(ID) {} StringRef getPassName() const override { return "AArch64 Promote Constant"; } diff --git a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index 69fc13883f6b8..9c1ab06e1c1c0 100644 --- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -78,10 +78,7 @@ class AArch64RedundantCopyElimination : public MachineFunctionPass { public: static char ID; - AArch64RedundantCopyElimination() : MachineFunctionPass(ID) { - initializeAArch64RedundantCopyEliminationPass( - *PassRegistry::getPassRegistry()); - } + AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {} struct RegImm { MCPhysReg Reg; diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp index 5e89a531f7e86..b3159b444e5b7 100644 --- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp +++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp @@ -150,9 +150,7 @@ struct AArch64SIMDInstrOpt : public MachineFunctionPass { // The maximum of N is curently 10 and it is for ST4 case. 
static const unsigned MaxNumRepl = 10; - AArch64SIMDInstrOpt() : MachineFunctionPass(ID) { - initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry()); - } + AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {} /// Based only on latency of instructions, determine if it is cost efficient /// to replace the instruction InstDesc by the instructions stored in the diff --git a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp index 9aa8102aeab21..96707f20cd751 100644 --- a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -126,9 +126,7 @@ class AArch64SpeculationHardening : public MachineFunctionPass { static char ID; - AArch64SpeculationHardening() : MachineFunctionPass(ID) { - initializeAArch64SpeculationHardeningPass(*PassRegistry::getPassRegistry()); - } + AArch64SpeculationHardening() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &Fn) override; diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index 54327b3f15944..0c0b512e3b6ce 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -309,9 +309,7 @@ class AArch64StackTagging : public FunctionPass { : FunctionPass(ID), MergeInit(ClMergeInit.getNumOccurrences() ? ClMergeInit : !IsOptNone), UseStackSafety(ClUseStackSafety.getNumOccurrences() ? ClUseStackSafety - : !IsOptNone) { - initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry()); - } + : !IsOptNone) {} void tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size); diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp index 558f20848babd..7f0b48dd4a198 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -62,9 +62,7 @@ class AArch64StackTaggingPreRA : public MachineFunctionPass { public: static char ID; - AArch64StackTaggingPreRA() : MachineFunctionPass(ID) { - initializeAArch64StackTaggingPreRAPass(*PassRegistry::getPassRegistry()); - } + AArch64StackTaggingPreRA() : MachineFunctionPass(ID) {} bool mayUseUncheckedLoadStore(); void uncheckUsesOf(unsigned TaggedReg, int FI); diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index d8c8b17565abb..c9e729025c709 100644 --- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -38,9 +38,7 @@ class AArch64StorePairSuppress : public MachineFunctionPass { public: static char ID; - AArch64StorePairSuppress() : MachineFunctionPass(ID) { - initializeAArch64StorePairSuppressPass(*PassRegistry::getPassRegistry()); - } + AArch64StorePairSuppress() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return STPSUPPRESS_PASS_NAME; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index d85952ba5d93a..6d8d9a703df62 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -230,45 +230,46 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { RegisterTargetMachine Z(getTheARM64Target()); RegisterTargetMachine W(getTheARM64_32Target()); RegisterTargetMachine V(getTheAArch64_32Target()); - auto PR = 
PassRegistry::getPassRegistry(); - initializeGlobalISel(*PR); - initializeAArch64A53Fix835769Pass(*PR); - initializeAArch64A57FPLoadBalancingPass(*PR); - initializeAArch64AdvSIMDScalarPass(*PR); - initializeAArch64BranchTargetsPass(*PR); - initializeAArch64CollectLOHPass(*PR); - initializeAArch64CompressJumpTablesPass(*PR); - initializeAArch64ConditionalComparesPass(*PR); - initializeAArch64ConditionOptimizerPass(*PR); - initializeAArch64DeadRegisterDefinitionsPass(*PR); - initializeAArch64ExpandPseudoPass(*PR); - initializeAArch64LoadStoreOptPass(*PR); - initializeAArch64MIPeepholeOptPass(*PR); - initializeAArch64SIMDInstrOptPass(*PR); - initializeAArch64O0PreLegalizerCombinerPass(*PR); - initializeAArch64PreLegalizerCombinerPass(*PR); - initializeAArch64PointerAuthPass(*PR); - initializeAArch64PostCoalescerPass(*PR); - initializeAArch64PostLegalizerCombinerPass(*PR); - initializeAArch64PostLegalizerLoweringPass(*PR); - initializeAArch64PostSelectOptimizePass(*PR); - initializeAArch64PromoteConstantPass(*PR); - initializeAArch64RedundantCopyEliminationPass(*PR); - initializeAArch64StorePairSuppressPass(*PR); - initializeFalkorHWPFFixPass(*PR); - initializeFalkorMarkStridedAccessesLegacyPass(*PR); - initializeLDTLSCleanupPass(*PR); - initializeKCFIPass(*PR); - initializeSMEABIPass(*PR); - initializeSMEPeepholeOptPass(*PR); - initializeSVEIntrinsicOptsPass(*PR); - initializeAArch64SpeculationHardeningPass(*PR); - initializeAArch64SLSHardeningPass(*PR); - initializeAArch64StackTaggingPass(*PR); - initializeAArch64StackTaggingPreRAPass(*PR); - initializeAArch64LowerHomogeneousPrologEpilogPass(*PR); - initializeAArch64DAGToDAGISelLegacyPass(*PR); - initializeAArch64CondBrTuningPass(*PR); + auto &PR = *PassRegistry::getPassRegistry(); + initializeGlobalISel(PR); + initializeAArch64A53Fix835769Pass(PR); + initializeAArch64A57FPLoadBalancingPass(PR); + initializeAArch64AdvSIMDScalarPass(PR); + initializeAArch64BranchTargetsPass(PR); + initializeAArch64CollectLOHPass(PR); + initializeAArch64CompressJumpTablesPass(PR); + initializeAArch64ConditionalComparesPass(PR); + initializeAArch64ConditionOptimizerPass(PR); + initializeAArch64DeadRegisterDefinitionsPass(PR); + initializeAArch64ExpandPseudoPass(PR); + initializeAArch64LoadStoreOptPass(PR); + initializeAArch64MIPeepholeOptPass(PR); + initializeAArch64SIMDInstrOptPass(PR); + initializeAArch64O0PreLegalizerCombinerPass(PR); + initializeAArch64PreLegalizerCombinerPass(PR); + initializeAArch64PointerAuthPass(PR); + initializeAArch64PostCoalescerPass(PR); + initializeAArch64PostLegalizerCombinerPass(PR); + initializeAArch64PostLegalizerLoweringPass(PR); + initializeAArch64PostSelectOptimizePass(PR); + initializeAArch64PromoteConstantPass(PR); + initializeAArch64RedundantCopyEliminationPass(PR); + initializeAArch64StorePairSuppressPass(PR); + initializeFalkorHWPFFixPass(PR); + initializeFalkorMarkStridedAccessesLegacyPass(PR); + initializeLDTLSCleanupPass(PR); + initializeKCFIPass(PR); + initializeSMEABIPass(PR); + initializeSMEPeepholeOptPass(PR); + initializeSVEIntrinsicOptsPass(PR); + initializeAArch64SpeculationHardeningPass(PR); + initializeAArch64SLSHardeningPass(PR); + initializeAArch64StackTaggingPass(PR); + initializeAArch64StackTaggingPreRAPass(PR); + initializeAArch64LowerHomogeneousPrologEpilogPass(PR); + initializeAArch64DAGToDAGISelLegacyPass(PR); + initializeAArch64CondBrTuningPass(PR); + initializeAArch64Arm64ECCallLoweringPass(PR); } void AArch64TargetMachine::reset() { SubtargetMap.clear(); } @@ -333,8 +334,9 @@ 
getEffectiveAArch64CodeModel(const Triple &TT, *CM != CodeModel::Large) { report_fatal_error( "Only small, tiny and large code models are allowed on AArch64"); - } else if (*CM == CodeModel::Tiny && !TT.isOSBinFormatELF()) + } else if (*CM == CodeModel::Tiny && !TT.isOSBinFormatELF()) { report_fatal_error("tiny code model is only supported on ELF"); + } return *CM; } // The default MCJIT memory managers make no guarantees about where they can diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp index 4289066234420..460902c67fe35 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp @@ -142,8 +142,6 @@ void AArch64O0PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AArch64O0PreLegalizerCombiner::AArch64O0PreLegalizerCombiner() : MachineFunctionPass(ID) { - initializeAArch64O0PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp index d4a14f8756304..96569f77bc224 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp @@ -533,8 +533,6 @@ void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { - initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index bd50bc6652391..dea08d98f524f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -1322,8 +1322,6 @@ void AArch64PostLegalizerLowering::getAnalysisUsage(AnalysisUsage &AU) const { AArch64PostLegalizerLowering::AArch64PostLegalizerLowering() : MachineFunctionPass(ID) { - initializeAArch64PostLegalizerLoweringPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp index 7d07fe147208b..913a8870565d9 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostSelectOptimize.cpp @@ -33,7 +33,7 @@ class AArch64PostSelectOptimize : public MachineFunctionPass { public: static char ID; - AArch64PostSelectOptimize(); + AArch64PostSelectOptimize() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "AArch64 Post Select Optimizer"; @@ -59,11 +59,6 @@ void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -AArch64PostSelectOptimize::AArch64PostSelectOptimize() - : MachineFunctionPass(ID) { - initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry()); -} - unsigned getNonFlagSettingVariant(unsigned Opc) { switch (Opc) { default: diff --git 
a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 2c559d4beb5d1..416386555dc0e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -831,8 +831,6 @@ void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner() : MachineFunctionPass(ID) { - initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } From c1c0d551bae4e4f4ed13fcd9264834a2713fe44b Mon Sep 17 00:00:00 2001 From: vdonaldson <37090318+vdonaldson@users.noreply.github.com> Date: Mon, 7 Apr 2025 13:24:48 -0400 Subject: [PATCH 0876/1029] [flang] Non-type-bound defined IO lowering for an array of derived type (#134667) Update Non-type-bound IO lowering to call OutputDerivedType for an array of derived type (rather than OutputDescriptor). --- flang/lib/Lower/IO.cpp | 19 +++++++++++++++---- flang/test/Lower/io-derived-type.f90 | 11 +++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp index 07c9e6a1726bf..13d612354da84 100644 --- a/flang/lib/Lower/IO.cpp +++ b/flang/lib/Lower/IO.cpp @@ -609,11 +609,22 @@ static void genNamelistIO(Fortran::lower::AbstractConverter &converter, ok = builder.create(loc, funcOp, args).getResult(0); } +/// Is \p type a derived type or an array of derived type? +static bool containsDerivedType(mlir::Type type) { + mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(type)); + if (mlir::isa(argTy)) + return true; + if (auto seqTy = mlir::dyn_cast(argTy)) + if (mlir::isa(seqTy.getEleTy())) + return true; + return false; +} + /// Get the output function to call for a value of the given type. 
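/// For illustration (hypothetical FIR type names, not taken from this patch):
/// with containsDerivedType() above, a scalar `!fir.box<!fir.type<_QFTt>>`
/// and an array `!fir.box<!fir.array<5x!fir.type<_QFTt>>>` are both
/// dispatched to the derived-type IO runtime entry points selected below,
/// instead of only the scalar case as before.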
static mlir::func::FuncOp getOutputFunc(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Type type, bool isFormatted) { - if (mlir::isa(fir::unwrapPassByRefType(type))) + if (containsDerivedType(type)) return fir::runtime::getIORuntimeFunc(loc, builder); if (!isFormatted) @@ -710,7 +721,7 @@ static void genOutputItemList( if (mlir::isa(argType)) { mlir::Value box = fir::getBase(converter.genExprBox(loc, *expr, stmtCtx)); outputFuncArgs.push_back(builder.createConvert(loc, argType, box)); - if (mlir::isa(fir::unwrapPassByRefType(itemTy))) + if (containsDerivedType(itemTy)) outputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter)); } else if (helper.isCharacterScalar(itemTy)) { fir::ExtendedValue exv = converter.genExprAddr(loc, expr, stmtCtx); @@ -745,7 +756,7 @@ static void genOutputItemList( static mlir::func::FuncOp getInputFunc(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Type type, bool isFormatted) { - if (mlir::isa(fir::unwrapPassByRefType(type))) + if (containsDerivedType(type)) return fir::runtime::getIORuntimeFunc(loc, builder); if (!isFormatted) @@ -817,7 +828,7 @@ createIoRuntimeCallForItem(Fortran::lower::AbstractConverter &converter, auto boxTy = mlir::dyn_cast(box.getType()); assert(boxTy && "must be previously emboxed"); inputFuncArgs.push_back(builder.createConvert(loc, argType, box)); - if (mlir::isa(fir::unwrapPassByRefType(boxTy))) + if (containsDerivedType(boxTy)) inputFuncArgs.push_back(getNonTbpDefinedIoTableAddr(converter)); } else { mlir::Value itemAddr = fir::getBase(item); diff --git a/flang/test/Lower/io-derived-type.f90 b/flang/test/Lower/io-derived-type.f90 index ecbbc22d24b1e..316a2cdb5b14f 100644 --- a/flang/test/Lower/io-derived-type.f90 +++ b/flang/test/Lower/io-derived-type.f90 @@ -101,6 +101,7 @@ program p use m character*3 ccc(4) namelist /nnn/ jjj, ccc + type(t) :: y(5) ! CHECK: fir.call @_QMmPtest1 call test1 @@ -115,6 +116,16 @@ program p ! CHECK: %[[V_100:[0-9]+]] = fir.convert %[[V_99]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref ! CHECK: %[[V_101:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_98]], %[[V_100]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'main, should call wft: ', t(4) + + ! CHECK: %[[V_33:[0-9]+]] = fir.shape %c2{{.*}} : (index) -> !fir.shape<1> + ! CHECK: %[[V_34:[0-9]+]] = hlfir.designate %7#0 (%c2{{.*}}:%c3{{.*}}:%c1{{.*}}) shape %[[V_33]] : (!fir.ref>>, index, index, index, !fir.shape<1>) -> !fir.ref>> + ! CHECK: %[[V_35:[0-9]+]] = fir.shape %c2{{.*}} : (index) -> !fir.shape<1> + ! CHECK: %[[V_36:[0-9]+]] = fir.embox %[[V_34]](%[[V_35]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> + ! CHECK: %[[V_37:[0-9]+]] = fir.convert %[[V_36]] : (!fir.box>>) -> !fir.box + ! CHECK: %[[V_38:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> + ! CHECK: %[[V_39:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_40:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_37]], %[[V_39]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 + print *, y(2:3) end ! 
CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i1>> From 1043f5cb0b117bd63369debe80a2d446126cc079 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 7 Apr 2025 13:29:01 -0400 Subject: [PATCH 0877/1029] [gn] port 16c84c4475b9 --- llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn | 4 ++++ llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 1 + llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn | 1 + llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn | 1 + 4 files changed, 7 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index 65dd10e7570e8..dd3b18a0918af 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -129,6 +129,10 @@ clang_tablegen("BuiltinsBPF") { args = [ "-gen-clang-builtins" ] } +clang_tablegen("BuiltinsDirectX") { + args = [ "-gen-clang-builtins" ] +} + clang_tablegen("BuiltinsHexagon") { args = [ "-gen-clang-builtins" ] } diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index c8f4cd8c35b98..b0d82797ec801 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -26,6 +26,7 @@ static_library("Basic") { "//clang/include/clang/Basic:AttrSubMatchRulesList", "//clang/include/clang/Basic:Builtins", "//clang/include/clang/Basic:BuiltinsBPF", + "//clang/include/clang/Basic:BuiltinsDirectX", "//clang/include/clang/Basic:BuiltinsHexagon", "//clang/include/clang/Basic:BuiltinsNVPTX", "//clang/include/clang/Basic:BuiltinsRISCV", diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn index 35dd4ec7afca1..85ad5303a9246 100644 --- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn @@ -106,6 +106,7 @@ static_library("CodeGen") { "SwiftCallingConv.cpp", "TargetBuiltins/AMDGPU.cpp", "TargetBuiltins/ARM.cpp", + "TargetBuiltins/DirectX.cpp", "TargetBuiltins/Hexagon.cpp", "TargetBuiltins/NVPTX.cpp", "TargetBuiltins/PPC.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn index 2d3277fcb6c13..256bab6a9e36a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Sema/BUILD.gn @@ -75,6 +75,7 @@ static_library("Sema") { "SemaDeclAttr.cpp", "SemaDeclCXX.cpp", "SemaDeclObjC.cpp", + "SemaDirectX.cpp", "SemaExceptionSpec.cpp", "SemaExpr.cpp", "SemaExprCXX.cpp", From 4607d39c7eded3ff6d425cbc502e30349078365c Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 7 Apr 2025 10:36:38 -0700 Subject: [PATCH 0878/1029] [bazel] Fix build (#134697) Fixes fallback from #134439 --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 90637864498cf..c4fec647e1773 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -287,6 +287,14 @@ gentbl_cc_library( deps = [":BuiltinsBaseTdFiles"], ) +gentbl_cc_library( + name = "basic_builtins_directx_gen", + tbl_outs = {"include/clang/Basic/BuiltinsDirectX.inc": ["-gen-clang-builtins"]}, + tblgen = ":clang-tblgen", + td_file = 
"include/clang/Basic/BuiltinsDirectX.td", + deps = [":BuiltinsBaseTdFiles"], +) + gentbl_cc_library( name = "basic_builtins_spirv_gen", tbl_outs = {"include/clang/Basic/BuiltinsSPIRV.inc": ["-gen-clang-builtins"]}, @@ -611,6 +619,7 @@ cc_library( ":basic_attr_gen", ":basic_builtin_templates_gen", ":basic_builtins_bpf_gen", + ":basic_builtins_directx_gen", ":basic_builtins_gen", ":basic_builtins_hexagon_gen", ":basic_builtins_nvptx_gen", From 21d912121c9f41385b165a736be787527f5bd7c2 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Mon, 7 Apr 2025 11:11:31 -0700 Subject: [PATCH 0879/1029] [lldb][debugserver] Fix an off-by-one error in watchpoint identification (#134314) debugserver takes the address of a watchpoint exception and calculates which watchpoint was responsible for it. There was an off-by-one error in the range calculation which causes two watchpoints on consecutive ranges to not correctly identify hits to the second watchpoint. The result is that lldb wouldn't show the second watchpoint as ever being hit. rdar://145107575 --- .../consecutive-watchpoints/Makefile | 3 + .../TestConsecutiveWatchpoints.py | 87 +++++++++++++++++++ .../watchpoint/consecutive-watchpoints/main.c | 22 +++++ .../debugserver/source/DNBBreakpoint.cpp | 2 +- 4 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile create mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py create mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py new file mode 100644 index 0000000000000..229172e6ce0aa --- /dev/null +++ b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py @@ -0,0 +1,87 @@ +""" +Watch contiguous memory regions with separate watchpoints, check that lldb +correctly detect which watchpoint was hit for each one. +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class ConsecutiveWatchpointsTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def continue_and_report_stop_reason(self, process, iter_str): + process.Continue() + self.assertIn( + process.GetState(), [lldb.eStateStopped, lldb.eStateExited], iter_str + ) + thread = process.GetSelectedThread() + return thread.GetStopReason() + + # debugserver only gained the ability to watch larger regions + # with this patch. 
+ def test_large_watchpoint(self): + """Test watchpoint that covers a large region of memory.""" + self.build() + self.main_source_file = lldb.SBFileSpec("main.c") + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "break here", self.main_source_file + ) + + frame = thread.GetFrameAtIndex(0) + + field2_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field2") + .Watch(True, False, True) + ) + field3_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field3") + .Watch(True, False, True) + ) + field4_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field4") + .Watch(True, False, True) + ) + field5_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field5") + .Watch(True, False, True) + ) + + self.assertTrue(field2_wp.IsValid()) + self.assertTrue(field3_wp.IsValid()) + self.assertTrue(field4_wp.IsValid()) + self.assertTrue(field5_wp.IsValid()) + + reason = self.continue_and_report_stop_reason(process, "continue to field2 wp") + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field2_wp.GetID()) + + reason = self.continue_and_report_stop_reason(process, "continue to field3 wp") + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field3_wp.GetID()) + + reason = self.continue_and_report_stop_reason(process, "continue to field4 wp") + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field4_wp.GetID()) + + reason = self.continue_and_report_stop_reason(process, "continue to field5 wp") + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field5_wp.GetID()) diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c new file mode 100644 index 0000000000000..c0a3530be9f5e --- /dev/null +++ b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c @@ -0,0 +1,22 @@ +#include +struct fields { + uint32_t field1; + uint32_t field2; // offset +4 + uint16_t field3; // offset +8 + uint16_t field4; // offset +10 + uint16_t field5; // offset +12 + uint16_t field6; // offset +14 +}; + +int main() { + struct fields var = {0, 0, 0, 0, 0, 0}; + + var.field1 = 5; // break here + var.field2 = 6; + var.field3 = 7; + var.field4 = 8; + var.field5 = 9; + var.field6 = 10; + + return var.field1 + var.field2 + var.field3; +} diff --git a/lldb/tools/debugserver/source/DNBBreakpoint.cpp b/lldb/tools/debugserver/source/DNBBreakpoint.cpp index f63ecf24222bd..e41bf9b4fd905 100644 --- a/lldb/tools/debugserver/source/DNBBreakpoint.cpp +++ b/lldb/tools/debugserver/source/DNBBreakpoint.cpp @@ -98,7 +98,7 @@ DNBBreakpointList::FindNearestWatchpoint(nub_addr_t addr) const { if (pos.second.IsEnabled()) { nub_addr_t start_addr = pos.second.Address(); nub_addr_t end_addr = start_addr + pos.second.ByteSize(); - if (addr >= start_addr && addr <= end_addr) + if (addr >= start_addr && addr < end_addr) return &pos.second; } } From 369c7739d0853b7931410037843d5a63f50bc0a1 Mon Sep 17 00:00:00 2001 From: 
Jason Molenda Date: Mon, 7 Apr 2025 11:21:58 -0700 Subject: [PATCH 0880/1029] Revert "[lldb][debugserver] Fix an off-by-one error in watchpoint identification (#134314)" This reverts commit 21d912121c9f41385b165a736be787527f5bd7c2. Failure on the aarch64 ubuntu bot when setting the 4th watchpoint; may be a hardware limitation on that bot. I thought creating four watchpoints would be generally safe, but I don't need to do that for my test, will re-land without it. --- .../consecutive-watchpoints/Makefile | 3 - .../TestConsecutiveWatchpoints.py | 87 ------------------- .../watchpoint/consecutive-watchpoints/main.c | 22 ----- .../debugserver/source/DNBBreakpoint.cpp | 2 +- 4 files changed, 1 insertion(+), 113 deletions(-) delete mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile delete mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py delete mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile deleted file mode 100644 index 10495940055b6..0000000000000 --- a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -C_SOURCES := main.c - -include Makefile.rules diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py deleted file mode 100644 index 229172e6ce0aa..0000000000000 --- a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Watch contiguous memory regions with separate watchpoints, check that lldb -correctly detect which watchpoint was hit for each one. -""" - -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class ConsecutiveWatchpointsTestCase(TestBase): - NO_DEBUG_INFO_TESTCASE = True - - def continue_and_report_stop_reason(self, process, iter_str): - process.Continue() - self.assertIn( - process.GetState(), [lldb.eStateStopped, lldb.eStateExited], iter_str - ) - thread = process.GetSelectedThread() - return thread.GetStopReason() - - # debugserver only gained the ability to watch larger regions - # with this patch. 
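# A plausible reading of the bot failure (an assumption, not verified): many
# AArch64 implementations provide only four hardware watchpoint slots, so a
# test that sets four watchpoints can exhaust the budget and fail to insert
# the last one on such machines.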
- def test_large_watchpoint(self): - """Test watchpoint that covers a large region of memory.""" - self.build() - self.main_source_file = lldb.SBFileSpec("main.c") - (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( - self, "break here", self.main_source_file - ) - - frame = thread.GetFrameAtIndex(0) - - field2_wp = ( - frame.locals["var"][0] - .GetChildMemberWithName("field2") - .Watch(True, False, True) - ) - field3_wp = ( - frame.locals["var"][0] - .GetChildMemberWithName("field3") - .Watch(True, False, True) - ) - field4_wp = ( - frame.locals["var"][0] - .GetChildMemberWithName("field4") - .Watch(True, False, True) - ) - field5_wp = ( - frame.locals["var"][0] - .GetChildMemberWithName("field5") - .Watch(True, False, True) - ) - - self.assertTrue(field2_wp.IsValid()) - self.assertTrue(field3_wp.IsValid()) - self.assertTrue(field4_wp.IsValid()) - self.assertTrue(field5_wp.IsValid()) - - reason = self.continue_and_report_stop_reason(process, "continue to field2 wp") - self.assertEqual(reason, lldb.eStopReasonWatchpoint) - stop_reason_watchpoint_id = ( - process.GetSelectedThread().GetStopReasonDataAtIndex(0) - ) - self.assertEqual(stop_reason_watchpoint_id, field2_wp.GetID()) - - reason = self.continue_and_report_stop_reason(process, "continue to field3 wp") - self.assertEqual(reason, lldb.eStopReasonWatchpoint) - stop_reason_watchpoint_id = ( - process.GetSelectedThread().GetStopReasonDataAtIndex(0) - ) - self.assertEqual(stop_reason_watchpoint_id, field3_wp.GetID()) - - reason = self.continue_and_report_stop_reason(process, "continue to field4 wp") - self.assertEqual(reason, lldb.eStopReasonWatchpoint) - stop_reason_watchpoint_id = ( - process.GetSelectedThread().GetStopReasonDataAtIndex(0) - ) - self.assertEqual(stop_reason_watchpoint_id, field4_wp.GetID()) - - reason = self.continue_and_report_stop_reason(process, "continue to field5 wp") - self.assertEqual(reason, lldb.eStopReasonWatchpoint) - stop_reason_watchpoint_id = ( - process.GetSelectedThread().GetStopReasonDataAtIndex(0) - ) - self.assertEqual(stop_reason_watchpoint_id, field5_wp.GetID()) diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c deleted file mode 100644 index c0a3530be9f5e..0000000000000 --- a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c +++ /dev/null @@ -1,22 +0,0 @@ -#include -struct fields { - uint32_t field1; - uint32_t field2; // offset +4 - uint16_t field3; // offset +8 - uint16_t field4; // offset +10 - uint16_t field5; // offset +12 - uint16_t field6; // offset +14 -}; - -int main() { - struct fields var = {0, 0, 0, 0, 0, 0}; - - var.field1 = 5; // break here - var.field2 = 6; - var.field3 = 7; - var.field4 = 8; - var.field5 = 9; - var.field6 = 10; - - return var.field1 + var.field2 + var.field3; -} diff --git a/lldb/tools/debugserver/source/DNBBreakpoint.cpp b/lldb/tools/debugserver/source/DNBBreakpoint.cpp index e41bf9b4fd905..f63ecf24222bd 100644 --- a/lldb/tools/debugserver/source/DNBBreakpoint.cpp +++ b/lldb/tools/debugserver/source/DNBBreakpoint.cpp @@ -98,7 +98,7 @@ DNBBreakpointList::FindNearestWatchpoint(nub_addr_t addr) const { if (pos.second.IsEnabled()) { nub_addr_t start_addr = pos.second.Address(); nub_addr_t end_addr = start_addr + pos.second.ByteSize(); - if (addr >= start_addr && addr < end_addr) + if (addr >= start_addr && addr <= end_addr) return &pos.second; } } From 529e9127f0d61f177f2c62ade531d876cf89bf59 Mon Sep 17 00:00:00 2001 
From: YLChenZ Date: Tue, 8 Apr 2025 02:42:57 +0800 Subject: [PATCH 0881/1029] [clang][doc]: Merge entries with duplicate content. (#134089) Before the patch: ![docb4](https://github.com/user-attachments/assets/6db1000f-d555-48b8-8a19-85c41b043fd8) after the patch: ![doc-after](https://github.com/user-attachments/assets/1cff64b6-db2e-48d8-b0a9-a403fd61f8af) Fixes #133706 --- clang/utils/TableGen/ClangAttrEmitter.cpp | 82 ++++++++++++++++++++--- 1 file changed, 74 insertions(+), 8 deletions(-) diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 06f6b073240ba..07844c8f0967b 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -5144,6 +5144,14 @@ class SpellingList { Spellings[(size_t)Kind].push_back(Name); } + + void merge(const SpellingList &Other) { + for (size_t Kind = 0; Kind < NumSpellingKinds; ++Kind) { + Spellings[Kind].insert(Spellings[Kind].end(), + Other.Spellings[Kind].begin(), + Other.Spellings[Kind].end()); + } + } }; class DocumentationData { @@ -5301,31 +5309,89 @@ void EmitClangAttrDocs(const RecordKeeper &Records, raw_ostream &OS) { return L->getValueAsString("Name") < R->getValueAsString("Name"); } }; - std::map, CategoryLess> - SplitDocs; + + std::map, CategoryLess> + MergedDocs; + + std::vector UndocumentedDocs; + const Record *UndocumentedCategory = nullptr; + + // Collect documentation data, grouping by category and heading. for (const auto *A : Records.getAllDerivedDefinitions("Attr")) { const Record &Attr = *A; std::vector Docs = Attr.getValueAsListOfDefs("Documentation"); + for (const auto *D : Docs) { const Record &Doc = *D; const Record *Category = Doc.getValueAsDef("Category"); // If the category is "InternalOnly", then there cannot be any other // documentation categories (otherwise, the attribute would be // emitted into the docs). - const StringRef Cat = Category->getValueAsString("Name"); - bool InternalOnly = Cat == "InternalOnly"; - if (InternalOnly && Docs.size() > 1) + StringRef Cat = Category->getValueAsString("Name"); + if (Cat == "InternalOnly" && Docs.size() > 1) PrintFatalError(Doc.getLoc(), "Attribute is \"InternalOnly\", but has multiple " "documentation categories"); - if (!InternalOnly) - SplitDocs[Category].push_back(DocumentationData( - Doc, Attr, GetAttributeHeadingAndSpellings(Doc, Attr, Cat))); + if (Cat == "InternalOnly") + continue; + + // Track the Undocumented category Record for later grouping + if (Cat == "Undocumented" && !UndocumentedCategory) + UndocumentedCategory = Category; + + // Generate Heading and Spellings. + auto HeadingAndSpellings = + GetAttributeHeadingAndSpellings(Doc, Attr, Cat); + + // Handle Undocumented category separately - no content merging + if (Cat == "Undocumented" && UndocumentedCategory) { + UndocumentedDocs.push_back( + DocumentationData(Doc, Attr, HeadingAndSpellings)); + continue; + } + + auto &CategoryDocs = MergedDocs[Category]; + + std::string key = Doc.getValueAsString("Content").str(); + uint32_t keyHash = llvm::hash_value(key); + + // If the content already exists, merge the documentation. 
+ auto It = CategoryDocs.find(keyHash); + if (It != CategoryDocs.end()) { + // Merge heading + if (It->second.Heading != HeadingAndSpellings.first) + It->second.Heading += ", " + HeadingAndSpellings.first; + // Merge spellings + It->second.SupportedSpellings.merge(HeadingAndSpellings.second); + // Merge content + It->second.Documentation = &Doc; // Update reference + } else { + // Create new entry for unique content + CategoryDocs.emplace(keyHash, + DocumentationData(Doc, Attr, HeadingAndSpellings)); + } } } + std::map, CategoryLess> + SplitDocs; + + for (auto &CategoryPair : MergedDocs) { + + std::vector MD; + for (auto &DocPair : CategoryPair.second) + MD.push_back(std::move(DocPair.second)); + + SplitDocs.emplace(CategoryPair.first, MD); + } + + // Append Undocumented category entries + if (!UndocumentedDocs.empty() && UndocumentedCategory) { + SplitDocs.emplace(UndocumentedCategory, UndocumentedDocs); + } + // Having split the attributes out based on what documentation goes where, // we can begin to generate sections of documentation. for (auto &I : SplitDocs) { From 10bef367a5643bc41d0172b02e080645c68f821a Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 7 Apr 2025 20:47:21 +0200 Subject: [PATCH 0882/1029] [RISCV][test] Fix a comment typo (#134242) --- llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll index c30945a75461a..2d48f2b49822b 100644 --- a/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll +++ b/llvm/test/CodeGen/RISCV/zbb-cmp-combine.ll @@ -6,7 +6,7 @@ ; Tests aimed to check optimization which combines ; two comparison operations and logic operation into ; one select(min/max) operation and one comparison -; operaion. +; operation. ; 4 patterns below will be converted to umin+less. define i1 @ulo(i64 %c, i64 %a, i64 %b) { From 8f5a3ec649a3b46093c8a8140bbd91f21236fde6 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 7 Apr 2025 15:15:12 -0400 Subject: [PATCH 0883/1029] [sanitizer_common] Use HW_NCPUONLINE sysctl on NetBSD in GetNumberOfCPUs() (#134704) --- compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index e5e79d4e0521c..ffd240974454e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -840,7 +840,11 @@ u32 GetNumberOfCPUs() { int req[2]; uptr len = sizeof(ncpu); req[0] = CTL_HW; +# ifdef HW_NCPUONLINE + req[1] = HW_NCPUONLINE; +# else req[1] = HW_NCPU; +# endif CHECK_EQ(internal_sysctl(req, 2, &ncpu, &len, NULL, 0), 0); return ncpu; # elif SANITIZER_SOLARIS From 76fee8f4edf10fb39f0323171c060e97ec0005b0 Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Mon, 7 Apr 2025 16:22:51 -0300 Subject: [PATCH 0884/1029] [flang][OpenMP][NFC] Don't use special chars in error messages (#134686) Some error messages were using a special char for `fi`, in the word `specified`, probably due to a typo. 
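(A likely reconstruction, stated as an assumption: the literals contained the single code point U+FB01, the "fi" ligature, in place of the ASCII letters "f" and "i", i.e. `speci\uFB01ed`; the two spellings render alike in many fonts but are different byte sequences.)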
This caused an error on Windows: #134625 --- flang/lib/Semantics/check-omp-structure.cpp | 8 ++++---- flang/test/Semantics/OpenMP/interop-construct.f90 | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 757d2316e6b53..717982f66027c 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -5762,7 +5762,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPInteropConstruct &x) { const auto *objectSymbol{name->symbol}; if (llvm::is_contained(objectSymbolList, objectSymbol)) { context_.Say(GetContext().directiveSource, - "Each interop-var may be specified for at most one action-clause of each INTEROP construct."_err_en_US); + "Each interop-var may be specified for at most one action-clause of each INTEROP construct."_err_en_US); } else { objectSymbolList.insert(objectSymbol); } @@ -5777,7 +5777,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPInteropConstruct &x) { const auto *objectSymbol{name->symbol}; if (llvm::is_contained(objectSymbolList, objectSymbol)) { context_.Say(GetContext().directiveSource, - "Each interop-var may be specified for at most one action-clause of each INTEROP construct."_err_en_US); + "Each interop-var may be specified for at most one action-clause of each INTEROP construct."_err_en_US); } else { objectSymbolList.insert(objectSymbol); } @@ -5789,7 +5789,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPInteropConstruct &x) { const auto *objectSymbol{name->symbol}; if (llvm::is_contained(objectSymbolList, objectSymbol)) { context_.Say(GetContext().directiveSource, - "Each interop-var may be specified for at most one action-clause of each INTEROP construct."_err_en_US); + "Each interop-var may be specified for at most one action-clause of each INTEROP construct."_err_en_US); } else { objectSymbolList.insert(objectSymbol); } @@ -5800,7 +5800,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPInteropConstruct &x) { } if (targetCount > 1 || targetSyncCount > 1) { context_.Say(GetContext().directiveSource, - "Each interop-type may be specified at most once."_err_en_US); + "Each interop-type may be specified at most once."_err_en_US); } if (isDependClauseOccured && !targetSyncCount) { context_.Say(GetContext().directiveSource, diff --git a/flang/test/Semantics/OpenMP/interop-construct.f90 b/flang/test/Semantics/OpenMP/interop-construct.f90 index dfcd55c4adabe..bc32bd80c22a6 100644 --- a/flang/test/Semantics/OpenMP/interop-construct.f90 +++ b/flang/test/Semantics/OpenMP/interop-construct.f90 @@ -8,7 +8,7 @@ SUBROUTINE test_interop_01() USE omp_lib INTEGER(OMP_INTEROP_KIND) :: obj - !ERROR: Each interop-var may be specified for at most one action-clause of each INTEROP construct. + !ERROR: Each interop-var may be specified for at most one action-clause of each INTEROP construct. !$OMP INTEROP INIT(TARGETSYNC,TARGET: obj) USE(obj) PRINT *, 'pass' END SUBROUTINE test_interop_01 @@ -16,7 +16,7 @@ END SUBROUTINE test_interop_01 SUBROUTINE test_interop_02() USE omp_lib INTEGER(OMP_INTEROP_KIND) :: obj - !ERROR: Each interop-type may be specified at most once. + !ERROR: Each interop-type may be specified at most once. 
!$OMP INTEROP INIT(TARGETSYNC,TARGET,TARGETSYNC: obj) PRINT *, 'pass' END SUBROUTINE test_interop_02 From 27c099bc84d1432752b3828f1ba95de24db76c5e Mon Sep 17 00:00:00 2001 From: Drew Lewis Date: Mon, 7 Apr 2025 15:23:10 -0400 Subject: [PATCH 0885/1029] Move CodeGen cuda.h to Inputs from include (#134706) Mirrors the behavior of CodeGenCUDA directory and the location of other .h files in CodeGen. --- clang/test/CodeGen/{include => Inputs}/cuda.h | 0 clang/test/CodeGen/nvptx-surface.cu | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename clang/test/CodeGen/{include => Inputs}/cuda.h (100%) diff --git a/clang/test/CodeGen/include/cuda.h b/clang/test/CodeGen/Inputs/cuda.h similarity index 100% rename from clang/test/CodeGen/include/cuda.h rename to clang/test/CodeGen/Inputs/cuda.h diff --git a/clang/test/CodeGen/nvptx-surface.cu b/clang/test/CodeGen/nvptx-surface.cu index cf1fe76893a17..56995f2c0da80 100644 --- a/clang/test/CodeGen/nvptx-surface.cu +++ b/clang/test/CodeGen/nvptx-surface.cu @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -O3 -o - %s -emit-llvm | FileCheck %s // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -O3 -o - %s -emit-llvm | FileCheck %s -#include "include/cuda.h" +#include "Inputs/cuda.h" #include "__clang_cuda_texture_intrinsics.h" From 9965f3d337bba09fbf2497a78123957fdaee0ffa Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 7 Apr 2025 15:25:45 -0400 Subject: [PATCH 0886/1029] [libc++] Improve diagnostic when violating `std::atomic` trivially copyable mandates (#131754) When attempting to instantiate `std::atomic` with a non trivially copyable type, one gets errors from instantiating internals before the actual static assertion that check the template parameter type requirements. The `verify` test for it had a `// ADDITIONAL_COMPILE_FLAGS: -Xclang -verify-ignore-unexpected=error` directive to work around this issue. The changes I propose enable us to drop that directive. As I understand it, the `verify` test was misplaced so I moved it to `test/{std -> libcxx}/atomics`. 
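A minimal sketch of user code that exercises the improved diagnostic (the `NonTrivial` type mirrors the verify test below; any error text beyond the static_assert message is compiler-dependent):

```c++
#include <atomic>

struct NonTrivial {
  explicit NonTrivial(int i) : i_(i) {}
  NonTrivial(const NonTrivial &rhs) : i_(rhs.i_) {} // user-provided copy ctor
  int i_;                                           // => not trivially copyable
};

void f() {
  NonTrivial x(42);
  // With the mandate hoisted into the base class template argument, this now
  // fails up front with the single static_assert diagnostic:
  //   "std::atomic requires that 'T' be a trivially copyable type"
  std::atomic<NonTrivial> a(x);
}
```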
(I ran into this while working on #121414 in which we would add another static assertion in `__check_atomic_mandates`) --- libcxx/include/__atomic/atomic.h | 9 ++++++++- libcxx/include/__atomic/support.h | 3 --- .../trivially_copyable.verify.cpp | 15 ++++++--------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h index c65f9afe4f390..eead49dde6192 100644 --- a/libcxx/include/__atomic/atomic.h +++ b/libcxx/include/__atomic/atomic.h @@ -23,6 +23,7 @@ #include <__type_traits/is_integral.h> #include <__type_traits/is_nothrow_constructible.h> #include <__type_traits/is_same.h> +#include <__type_traits/is_trivially_copyable.h> #include <__type_traits/remove_const.h> #include <__type_traits/remove_pointer.h> #include <__type_traits/remove_volatile.h> @@ -230,8 +231,14 @@ struct __atomic_waitable_traits<__atomic_base<_Tp, _IsIntegral> > { } }; +template +struct __check_atomic_mandates { + using type _LIBCPP_NODEBUG = _Tp; + static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'T' be a trivially copyable type"); +}; + template -struct atomic : public __atomic_base<_Tp> { +struct atomic : public __atomic_base::type> { using __base _LIBCPP_NODEBUG = __atomic_base<_Tp>; #if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__atomic/support.h b/libcxx/include/__atomic/support.h index 4b555ab483ca0..99d0f6aa543ca 100644 --- a/libcxx/include/__atomic/support.h +++ b/libcxx/include/__atomic/support.h @@ -10,7 +10,6 @@ #define _LIBCPP___ATOMIC_SUPPORT_H #include <__config> -#include <__type_traits/is_trivially_copyable.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -113,8 +112,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD template > struct __cxx_atomic_impl : public _Base { - static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'T' be a trivially copyable type"); - _LIBCPP_HIDE_FROM_ABI __cxx_atomic_impl() _NOEXCEPT = default; _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __cxx_atomic_impl(_Tp __value) _NOEXCEPT : _Base(__value) {} }; diff --git a/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp b/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp index 0955707cdcf38..452e65d569230 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp @@ -6,26 +6,23 @@ // //===----------------------------------------------------------------------===// +// XFAIL: FROZEN-CXX03-HEADERS-FIXME + // // template // struct atomic; -// This test checks that we static_assert inside std::atomic when T -// is not trivially copyable, however Clang will sometimes emit additional -// errors while trying to instantiate the rest of std::atomic. -// We silence those to make the test more robust. 
diff --git a/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp b/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp
index 0955707cdcf38..452e65d569230 100644
--- a/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp
+++ b/libcxx/test/std/atomics/atomics.types.generic/trivially_copyable.verify.cpp
@@ -6,26 +6,23 @@
 //
 //===----------------------------------------------------------------------===//

+// XFAIL: FROZEN-CXX03-HEADERS-FIXME
+
 // <atomic>

 // template <class T>
 // struct atomic;

-// This test checks that we static_assert inside std::atomic<T> when T
-// is not trivially copyable, however Clang will sometimes emit additional
-// errors while trying to instantiate the rest of std::atomic<T>.
-// We silence those to make the test more robust.
-// ADDITIONAL_COMPILE_FLAGS: -Xclang -verify-ignore-unexpected=error
-
 #include <atomic>

 struct NotTriviallyCopyable {
-  explicit NotTriviallyCopyable(int i) : i_(i) { }
-  NotTriviallyCopyable(const NotTriviallyCopyable &rhs) : i_(rhs.i_) { }
+  explicit NotTriviallyCopyable(int i) : i_(i) {}
+  NotTriviallyCopyable(const NotTriviallyCopyable& rhs) : i_(rhs.i_) {}
   int i_;
 };

 void f() {
   NotTriviallyCopyable x(42);
-  std::atomic<NotTriviallyCopyable> a(x); // expected-error@*:* {{std::atomic<T> requires that 'T' be a trivially copyable type}}
+  std::atomic<NotTriviallyCopyable> a(
+      x); // expected-error@*:* {{std::atomic<T> requires that 'T' be a trivially copyable type}}
 }

From 3479c574667eec375cc4ed91e0d63aa2d408b62c Mon Sep 17 00:00:00 2001
From: Lei Huang
Date: Mon, 7 Apr 2025 15:50:21 -0400
Subject: [PATCH 0887/1029] PowerPC32:PIC: Update bl to bcl to fix a branch
 predictor mis-predict issue (#134140)

Update `bl` to `bcl 20, 31, .+4` for 32-bit PIC code generation so the link
stack is not corrupted; corruption there causes mis-predicts in the branch
predictor.

Fixes: https://github.com/llvm/llvm-project/issues/128644
---
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp    |  4 +--
 llvm/test/CodeGen/PowerPC/mcm-5.ll           |  4 +--
 llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll   | 29 ++++++++++++++++++++
 llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll |  2 +-
 llvm/test/CodeGen/PowerPC/ppc32-pic.ll       |  2 +-
 5 files changed, 35 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index ba6653e10bdc5..f07331bf6c6b5 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -967,9 +967,9 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     //   L1$pb:
     MCSymbol *PICBase = MF->getPICBaseSymbol();

-    // Emit the 'bl'.
+    // Emit 'bcl 20,31,.+4' so the link stack is not corrupted.
     EmitToStreamer(*OutStreamer,
-                   MCInstBuilder(PPC::BL)
+                   MCInstBuilder(PPC::BCLalways)
                        // FIXME: We would like an efficient form for this, so we
                        // don't have to do a lot of extra uniquing.
                        .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
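A note on the idiom for readers unfamiliar with it: BO=20 encodes "branch
always", and a branch-and-link whose target is the very next instruction is
recognized by most PowerPC cores as a get-PC sequence rather than a real call,
so it does not push the hardware link stack that pairs bl with blr for return
prediction. A hedged sketch of the same sequence as GCC-style inline assembly
(hypothetical helper, not from the patch):

// Hypothetical illustration of the get-PC idiom: the branch-and-link to the
// immediately following instruction deposits the current address in LR,
// which mflr then copies into a general-purpose register.
static inline void *current_pc(void) {
  void *pc;
  asm volatile("bcl 20, 31, 1f\n"
               "1:\n\t"
               "mflr %0"
               : "=r"(pc)
               :
               : "lr");
  return pc;
}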
diff --git a/llvm/test/CodeGen/PowerPC/mcm-5.ll b/llvm/test/CodeGen/PowerPC/mcm-5.ll
index b88f405211b05..f9629b5710d72 100644
--- a/llvm/test/CodeGen/PowerPC/mcm-5.ll
+++ b/llvm/test/CodeGen/PowerPC/mcm-5.ll
@@ -51,7 +51,7 @@ sw.epilog:                                        ; preds = %sw.bb3, %sw.default
   ret i32 %5
 }
 ; CHECK-LABEL: test_jump_table:
-; CHECK-NOT: bl .L0$pb
+; CHECK-NOT: bcl 20, 31, .L0$pb
 ; CHECK: addis [[REG1:[0-9]+]], 2, .LC[[TOCNUM:[0-9]+]]@toc@ha
 ; CHECK: ld [[REG2:[0-9]+]], .LC[[TOCNUM]]@toc@l([[REG1]])
@@ -64,7 +64,7 @@ sw.epilog:                                        ; preds = %sw.bb3, %sw.default
 ; CHECK-NEXT: .long .LBB0_{{[0-9]+}}-.LJTI0_0

 ; LARGE-LABEL: test_jump_table:
-; LARGE: bl .L0$pb
+; LARGE: bcl 20, 31, .L0$pb
 ; LARGE-NEXT: .L0$pb:
 ; LARGE: mflr [[REGBASE:[0-9]+]]
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll
new file mode 100644
index 0000000000000..1e938b135fbfd
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic-bcl.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=powerpc -relocation-model=pic | \
+; RUN:   FileCheck -check-prefixes=SMALL %s
+
+@val = global i8 0, align 1
+
+define zeroext i8 @testbcl() nounwind {
+; SMALL-LABEL: testbcl:
+; SMALL:       # %bb.0: # %entry
+; SMALL-NEXT:    mflr 0
+; SMALL-NEXT:    stwu 1, -16(1)
+; SMALL-NEXT:    stw 30, 8(1)
+; SMALL-NEXT:    stw 0, 20(1)
+; SMALL-NEXT:    bcl 20, 31, .L0$pb
+; SMALL-NEXT:  .L0$pb:
+; SMALL-NEXT:    mflr 30
+; SMALL-NEXT:    lwz 3, .L0$poff-.L0$pb(30)
+; SMALL-NEXT:    add 30, 3, 30
+; SMALL-NEXT:    lwz 3, .LC0-.LTOC(30)
+; SMALL-NEXT:    lbz 3, 0(3)
+; SMALL-NEXT:    lwz 0, 20(1)
+; SMALL-NEXT:    lwz 30, 8(1)
+; SMALL-NEXT:    addi 1, 1, 16
+; SMALL-NEXT:    mtlr 0
+; SMALL-NEXT:    blr
+entry:
+  %0 = load i8, ptr @val, align 1
+  ret i8 %0
+}
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
index 2f0b92964c13b..7be1a80b7af43 100644
--- a/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic-large.ll
@@ -55,7 +55,7 @@ entry:
 ; LARGE-BSS-NEXT: foo:
 ; LARGE-BSS:        stwu 1, -32(1)
 ; LARGE-BSS:        stw 30, 24(1)
-; LARGE-BSS:        bl [[PB]]
+; LARGE-BSS:        bcl 20, 31, [[PB]]
 ; LARGE-BSS-NEXT: [[PB]]:
 ; LARGE-BSS:        mflr 30
 ; LARGE-BSS:        lwz [[REG:[0-9]+]], [[POFF]]-[[PB]](30)
diff --git a/llvm/test/CodeGen/PowerPC/ppc32-pic.ll b/llvm/test/CodeGen/PowerPC/ppc32-pic.ll
index aed994144940c..f7d8df9133306 100644
--- a/llvm/test/CodeGen/PowerPC/ppc32-pic.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc32-pic.ll
@@ -19,7 +19,7 @@ entry:
 ; SMALL:            stwu 1, -32(1)
 ; SMALL:            stw 30, 24(1)
 ; SMALL-BSS:        bl _GLOBAL_OFFSET_TABLE_@local-4
-; SMALL-SECURE:     bl .L0$pb
+; SMALL-SECURE:     bcl 20, 31, .L0$pb
 ; SMALL:            mflr 30
 ; SMALL-SECURE:     addis 30, 30, _GLOBAL_OFFSET_TABLE_-.L0$pb@ha
 ; SMALL-SECURE:     addi 30, 30, _GLOBAL_OFFSET_TABLE_-.L0$pb@l

From 19dbde0e81e98758ae9d37ef027bc8d9bbd673df Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Mon, 7 Apr 2025 12:56:16 -0700
Subject: [PATCH 0888/1029] [NFC][LLVM] Cleanup pass initialization for
 ARM/AVR/Lanai/X86/XCore (#134400)

- Remove pass initialization from pass constructors.
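(For readers unfamiliar with the pattern being removed, a sketch with an
illustrative pass name; FooLegacyPass and LLVMInitializeFooTarget are
hypothetical, but the shape matches the diffs below: self-registration moves
out of every constructor call and into a single target-initializer call.)

// Sketch of the cleanup, under the assumption that the pass is registered
// with INITIALIZE_PASS, which generates initializeFooLegacyPassPass().
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/PassRegistry.h"

namespace llvm {
void initializeFooLegacyPassPass(PassRegistry &); // generated by INITIALIZE_PASS
} // namespace llvm

struct FooLegacyPass : llvm::MachineFunctionPass {
  static char ID;
  // Before this cleanup the constructor also called
  // initializeFooLegacyPassPass(*llvm::PassRegistry::getPassRegistry());
  FooLegacyPass() : MachineFunctionPass(ID) {}
  bool runOnMachineFunction(llvm::MachineFunction &) override { return false; }
};
char FooLegacyPass::ID = 0;

// After: registration happens exactly once, when the target is initialized,
// alongside the target's other pass registrations.
extern "C" void LLVMInitializeFooTarget() {
  llvm::initializeFooLegacyPassPass(*llvm::PassRegistry::getPassRegistry());
}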
- https://github.com/llvm/llvm-project/issues/111767 --- llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp | 4 +--- llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp | 11 ++++------- llvm/lib/Target/Lanai/Lanai.h | 1 + llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp | 9 ++------- llvm/lib/Target/Lanai/LanaiTargetMachine.cpp | 5 +---- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 5 +---- llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp | 4 +--- llvm/lib/Target/XCore/XCoreTargetMachine.cpp | 1 + 8 files changed, 12 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp b/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp index 5ff602364933c..7d18242d8c16a 100644 --- a/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp +++ b/llvm/lib/Target/ARM/ARMFixCortexA57AES1742098Pass.cpp @@ -70,9 +70,7 @@ namespace { class ARMFixCortexA57AES1742098 : public MachineFunctionPass { public: static char ID; - explicit ARMFixCortexA57AES1742098() : MachineFunctionPass(ID) { - initializeARMFixCortexA57AES1742098Pass(*PassRegistry::getPassRegistry()); - } + explicit ARMFixCortexA57AES1742098() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &F) override; diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp index f257ccea6c50a..440d852fa4bc8 100644 --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -34,9 +34,7 @@ class AVRExpandPseudo : public MachineFunctionPass { public: static char ID; - AVRExpandPseudo() : MachineFunctionPass(ID) { - initializeAVRExpandPseudoPass(*PassRegistry::getPassRegistry()); - } + AVRExpandPseudo() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -2643,8 +2641,7 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) { INITIALIZE_PASS(AVRExpandPseudo, "avr-expand-pseudo", AVR_EXPAND_PSEUDO_NAME, false, false) -namespace llvm { - -FunctionPass *createAVRExpandPseudoPass() { return new AVRExpandPseudo(); } -} // end of namespace llvm +FunctionPass *llvm::createAVRExpandPseudoPass() { + return new AVRExpandPseudo(); +} diff --git a/llvm/lib/Target/Lanai/Lanai.h b/llvm/lib/Target/Lanai/Lanai.h index 72a7efc58062d..1ef4462fed648 100644 --- a/llvm/lib/Target/Lanai/Lanai.h +++ b/llvm/lib/Target/Lanai/Lanai.h @@ -38,6 +38,7 @@ FunctionPass *createLanaiMemAluCombinerPass(); FunctionPass *createLanaiSetflagAluCombinerPass(); void initializeLanaiDAGToDAGISelLegacyPass(PassRegistry &); +void initializeLanaiMemAluCombinerPass(PassRegistry &); } // namespace llvm diff --git a/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp index 2442d7ee923f1..9fd1ff60587c1 100644 --- a/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp +++ b/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp @@ -22,6 +22,7 @@ // in the same machine basic block into one machine instruction. 
//===----------------------------------------------------------------------===// +#include "Lanai.h" #include "LanaiAluCode.h" #include "LanaiTargetMachine.h" #include "llvm/ADT/Statistic.h" @@ -44,10 +45,6 @@ static llvm::cl::opt DisableMemAluCombiner( llvm::cl::desc("Do not combine ALU and memory operators"), llvm::cl::Hidden); -namespace llvm { -void initializeLanaiMemAluCombinerPass(PassRegistry &); -} // namespace llvm - namespace { typedef MachineBasicBlock::iterator MbbIterator; typedef MachineFunction::iterator MfIterator; @@ -55,9 +52,7 @@ typedef MachineFunction::iterator MfIterator; class LanaiMemAluCombiner : public MachineFunctionPass { public: static char ID; - explicit LanaiMemAluCombiner() : MachineFunctionPass(ID) { - initializeLanaiMemAluCombinerPass(*PassRegistry::getPassRegistry()); - } + explicit LanaiMemAluCombiner() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "Lanai load / store optimization pass"; diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp index 05420b9f3c6e6..f5e83286b7052 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp +++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.cpp @@ -26,16 +26,13 @@ using namespace llvm; -namespace llvm { -void initializeLanaiMemAluCombinerPass(PassRegistry &); -} // namespace llvm - extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLanaiTarget() { // Register the target. RegisterTargetMachine registered_target( getTheLanaiTarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializeLanaiDAGToDAGISelLegacyPass(PR); + initializeLanaiMemAluCombinerPass(PR); } static std::string computeDataLayout() { diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 0a187ee42e3f8..7f3393910da2c 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -631,10 +631,7 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: static char ID; - X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) { - initializeX86LowerAMXIntrinsicsLegacyPassPass( - *PassRegistry::getPassRegistry()); - } + X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { if (!X86ScalarizeAMX) diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 3870e80f9559b..9e373021a826a 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -41,9 +41,7 @@ namespace { struct XCoreLowerThreadLocal : public ModulePass { static char ID; - XCoreLowerThreadLocal() : ModulePass(ID) { - initializeXCoreLowerThreadLocalPass(*PassRegistry::getPassRegistry()); - } + XCoreLowerThreadLocal() : ModulePass(ID) {} bool lowerGlobal(GlobalVariable *GV); diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp index a04f5b9e662e3..fc93ab5c500e0 100644 --- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp @@ -106,6 +106,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeXCoreTarget() { RegisterTargetMachine X(getTheXCoreTarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); initializeXCoreDAGToDAGISelLegacyPass(PR); + initializeXCoreLowerThreadLocalPass(PR); } TargetTransformInfo From 9fdac840ec4901a6e3c71249a136cbecc4a9921a Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Mon, 7 Apr 2025 15:56:58 
-0400 Subject: [PATCH 0889/1029] [sanitizer_common] Fix building with NetBSD 10.99.12 or newer (#134708) https://github.com/NetBSD/src/commit/16543c49052c820334cffc5c69b2afde18f02458 __lwp_getprivate_fast() was moved to a new arch-specific header file. Closes: #125566 Co-authored-by: Thomas Klausner --- .../lib/sanitizer_common/sanitizer_linux_libcdep.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp index ffd240974454e..9cc9da3d88c40 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp @@ -29,7 +29,13 @@ # include "sanitizer_solaris.h" # if SANITIZER_NETBSD -# define _RTLD_SOURCE // for __lwp_gettcb_fast() / __lwp_getprivate_fast() +# // for __lwp_gettcb_fast() / __lwp_getprivate_fast() +# include +# if defined(__NetBSD_Version__) && (__NetBSD_Version__ >= 1099001200) +# include +# else +# define _RTLD_SOURCE +# endif # endif # include // for dlsym() From 7d4cddadf2e9e59cde65592a1ea80576a1c1b0fc Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 7 Apr 2025 20:01:29 +0000 Subject: [PATCH 0890/1029] [gn build] Port 0a1742708ddc --- llvm/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn index e3a08dfa885b8..950ebeaa03987 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/SelectionDAG/BUILD.gn @@ -23,6 +23,7 @@ static_library("SelectionDAG") { "LegalizeVectorOps.cpp", "LegalizeVectorTypes.cpp", "ResourcePriorityQueue.cpp", + "SDNodeInfo.cpp", "ScheduleDAGFast.cpp", "ScheduleDAGRRList.cpp", "ScheduleDAGSDNodes.cpp", From 06bf7a99fd21c1c450d81d2dc8d3e529042181be Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 7 Apr 2025 20:01:30 +0000 Subject: [PATCH 0891/1029] [gn build] Port 1f72fa29ecb4 --- llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn index 2fbc127199609..456c4f97c7f25 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn @@ -46,6 +46,7 @@ unittest("CodeGenTests") { "TargetOptionsTest.cpp", "TestAsmPrinter.cpp", "TypeTraitsTest.cpp", + "X86MCInstLowerTest.cpp", ] has_custom_main = true } From e1f6e40b282aef17e35de62f14e7b57b487fe2bc Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 7 Apr 2025 20:01:31 +0000 Subject: [PATCH 0892/1029] [gn build] Port 475cbf0ad6e7 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index b7c5ae08dd741..9e9e102468548 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -132,6 +132,7 @@ if (current_toolchain == default_toolchain) { "__algorithm/next_permutation.h", "__algorithm/none_of.h", "__algorithm/nth_element.h", + "__algorithm/out_value_result.h", "__algorithm/partial_sort.h", "__algorithm/partial_sort_copy.h", "__algorithm/partition.h", @@ -1701,6 +1702,7 @@ if (current_toolchain == 
default_toolchain) { "__numeric/midpoint.h", "__numeric/partial_sum.h", "__numeric/pstl.h", + "__numeric/ranges_iota.h", "__numeric/reduce.h", "__numeric/saturation_arithmetic.h", "__numeric/transform_exclusive_scan.h", From 78f624a0d41e9a304ab6f4ac713417b88c667173 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 7 Apr 2025 20:01:32 +0000 Subject: [PATCH 0893/1029] [gn build] Port 7013b51548c0 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 9e9e102468548..4f79e2c612b93 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1456,8 +1456,10 @@ if (current_toolchain == default_toolchain) { "__flat_map/sorted_equivalent.h", "__flat_map/sorted_unique.h", "__flat_map/utils.h", + "__flat_set/flat_multiset.h", "__flat_set/flat_set.h", "__flat_set/ra_iterator.h", + "__flat_set/utils.h", "__format/buffer.h", "__format/concepts.h", "__format/container_adaptor.h", From 7aedebac8cb473555aa8a2928ac3851b0142921e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 7 Apr 2025 21:17:41 +0100 Subject: [PATCH 0894/1029] [VPlan] Populate ExitBlocks when cloning VPlan (NFC). Update VPlan::duplicate to add cloned exit blocks to ExitBlocks. Currently there are no uses of the exit blocks after cloning so this is NFC at the moment. --- llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index bc3957f573d82..4688eef194b32 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1216,6 +1216,13 @@ VPlan *VPlan::duplicate() { NewPlan->CreatedBlocks.push_back(this->CreatedBlocks[I]); CreatedBlocks.truncate(NumBlocksBeforeCloning); + // Update ExitBlocks of the new plan. + for (VPBlockBase *VPB : NewPlan->CreatedBlocks) { + if (VPB->getNumSuccessors() == 0 && isa(VPB) && + VPB != NewScalarHeader) + NewPlan->ExitBlocks.push_back(cast(VPB)); + } + return NewPlan; } From d4c16424cf295e7edb7ecd5f8831ff195f7a8fa7 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Mon, 7 Apr 2025 13:18:53 -0700 Subject: [PATCH 0895/1029] [LLVM][Demangle] Fix MS Demangler to be stricter about wide string literals (#134483) Static analysis detected that Demangler::demangleStringLiteral had a potential overflow if not checking StringByteSize properly. Added check to ensure that for wide string it is always even and that there were the byte count did not mismatch the actual size of the literal. Fixes: https://github.com/llvm/llvm-project/issues/129970 --- llvm/docs/ReleaseNotes.md | 2 ++ llvm/lib/Demangle/MicrosoftDemangle.cpp | 5 +++++ llvm/test/Demangle/invalid-manglings.test | 24 +++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 58cf71b947083..526d6b4002bba 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -70,6 +70,8 @@ Changes to LLVM infrastructure * Removed support for target intrinsics being defined in the target directories themselves (i.e., the `TargetIntrinsicInfo` class). 
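As a reading aid for the VPlan change above (patch 0894), here is the added
loop again as a sketch with explicit template arguments; VPIRBasicBlock is an
assumption inferred from the isa/cast pair in that hunk, so treat this as a
reconstruction rather than the authoritative diff:

// After cloning, every cloned block with no successors, other than the new
// scalar header, is an exit block of the clone and must be re-registered so
// later uses of ExitBlocks see the duplicated plan's own blocks.
for (VPBlockBase *VPB : NewPlan->CreatedBlocks) {
  if (VPB->getNumSuccessors() == 0 && isa<VPIRBasicBlock>(VPB) &&
      VPB != NewScalarHeader)
    NewPlan->ExitBlocks.push_back(cast<VPIRBasicBlock>(VPB));
}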
+* Fix Microsoft demangling of string literals to be stricter + (#GH129970)) Changes to building LLVM ------------------------ diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp index 6be8b0fe73996..8d5f6b21e2e76 100644 --- a/llvm/lib/Demangle/MicrosoftDemangle.cpp +++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp @@ -1374,6 +1374,11 @@ Demangler::demangleStringLiteral(std::string_view &MangledName) { Result->IsTruncated = true; while (!consumeFront(MangledName, '@')) { + // For a wide string StringByteSize has to have an even length. + if (StringByteSize % 2 != 0) + goto StringLiteralError; + if (StringByteSize == 0) + goto StringLiteralError; if (MangledName.size() < 2) goto StringLiteralError; wchar_t W = demangleWcharLiteral(MangledName); diff --git a/llvm/test/Demangle/invalid-manglings.test b/llvm/test/Demangle/invalid-manglings.test index b77288488b2db..5d80d2d33e970 100644 --- a/llvm/test/Demangle/invalid-manglings.test +++ b/llvm/test/Demangle/invalid-manglings.test @@ -379,3 +379,27 @@ ; CHECK-EMPTY: ; CHECK-NEXT: .?AUBase@@@8 ; CHECK-NEXT: error: Invalid mangled name + +; Begin GH129970 + +??_C@_12EEHFKJGG@?$AAt?$AAe?$AAx@ +; CHECK-EMPTY: +; CHECK-NEXT: ??_C@_12EEHFKJGG@?$AAt?$AAe?$AAx@ +; CHECK-NEXT: error: Invalid mangled name + +??_C@_16EEHFKJGG@?$AAt?$AAe?$AAx@ +; CHECK-EMPTY: +; CHECK-NEXT: ??_C@_16EEHFKJGG@?$AAt?$AAe?$AAx@ +; CHECK-NEXT: error: Invalid mangled name + +??_C@_18EEHFKJGG@?$AAt?$AAe?$AAx@ +; CHECK-EMPTY: +; CHECK-NEXT: ??_C@_18EEHFKJGG@?$AAt?$AAe?$AAx@ +; CHECK-NEXT: error: Invalid mangled name + +??_C@_15EEHFKJGG@?$AAt?$AAe?$AAx?$AAx@ +; CHECK-EMPTY: +; CHECK-NEXT: ??_C@_15EEHFKJGG@?$AAt?$AAe?$AAx?$AAx@ +; CHECK-NEXT: error: Invalid mangled name + +; End GH129970 From 922260722795471870ef793f3187e9a37b145661 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 7 Apr 2025 13:22:52 -0700 Subject: [PATCH 0896/1029] gn build: Add check-builtins target. Tested on aarch64 Linux and x86_64 Linux. 
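To make the demangler fix above concrete, a standalone sketch of the new
guard (a hypothetical helper, not the demangler's actual code): a wide string
literal's payload must be a non-zero, even number of bytes, because wchar_t
units are two bytes wide in the Microsoft mangling.

#include <cstddef>

// Returns true only for byte counts that can describe a wide string literal.
static bool isPlausibleWideLiteralSize(std::size_t stringByteSize) {
  return stringByteSize != 0 && stringByteSize % 2 == 0;
}

Validating this before decoding each character means a malformed mangled name
is rejected up front instead of the demangler reading past the end of its
input buffer.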
Reviewers: aeubanks, nico Reviewed By: aeubanks Pull Request: https://github.com/llvm/llvm-project/pull/134482 --- .../gn/build/sync_source_lists_from_cmake.py | 6 + llvm/utils/gn/secondary/BUILD.gn | 1 + .../compiler-rt/lib/builtins/BUILD.gn | 523 +---------------- .../compiler-rt/lib/builtins/sources.gni | 529 ++++++++++++++++++ .../compiler-rt/test/builtins/BUILD.gn | 87 +++ 5 files changed, 626 insertions(+), 520 deletions(-) create mode 100644 llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni create mode 100644 llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn diff --git a/llvm/utils/gn/build/sync_source_lists_from_cmake.py b/llvm/utils/gn/build/sync_source_lists_from_cmake.py index a8d922acbd1dd..4cd30aa410864 100755 --- a/llvm/utils/gn/build/sync_source_lists_from_cmake.py +++ b/llvm/utils/gn/build/sync_source_lists_from_cmake.py @@ -108,6 +108,12 @@ def get_sources(source_re, text): gn_cpp = get_sources(gn_cpp_re, open(gn_file).read()) gn_cpp |= get_sources(gn_cpp_re2, open(gn_file).read()) + + sources_file = os.path.join(os.path.dirname(gn_file), "sources.gni") + if os.path.exists(sources_file): + gn_cpp |= get_sources(gn_cpp_re, open(sources_file).read()) + gn_cpp |= get_sources(gn_cpp_re2, open(sources_file).read()) + cmake_cpp = get_sources(cmake_cpp_re, open(cmake_file).read()) if gn_cpp == cmake_cpp: diff --git a/llvm/utils/gn/secondary/BUILD.gn b/llvm/utils/gn/secondary/BUILD.gn index 83093f33fcaf2..a0d9cd641f8d0 100644 --- a/llvm/utils/gn/secondary/BUILD.gn +++ b/llvm/utils/gn/secondary/BUILD.gn @@ -10,6 +10,7 @@ group("default") { "//compiler-rt", "//compiler-rt/include", "//compiler-rt/lib/scudo", + "//compiler-rt/test/builtins", "//lld/test", "//lldb/test", "//llvm/test", diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn index d1048259bcd44..44f5fdc20837c 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn @@ -1,18 +1,9 @@ +import("//compiler-rt/lib/builtins/sources.gni") import("//compiler-rt/target.gni") import("//llvm/utils/gn/build/buildflags.gni") -declare_args() { - # Skip the atomic builtin (should normally be provided by a shared library). - compiler_rt_exclude_atomic_builtin = true -} - lse_targets = [] -if (current_cpu == "x86" || current_cpu == "x64") { - # long double is not 80 bits on Android or MSVC. 
- long_double_is_80_bits = current_os != "android" && current_os != "win" -} - if (current_cpu == "arm64") { foreach(pat, [ @@ -89,516 +80,8 @@ static_library("builtins") { cflags_c = [ "-std=c11" ] } - sources = [ - "absvdi2.c", - "absvsi2.c", - "absvti2.c", - "adddf3.c", - "addsf3.c", - "addvdi3.c", - "addvsi3.c", - "addvti3.c", - "apple_versioning.c", - "ashldi3.c", - "ashlti3.c", - "ashrdi3.c", - "ashrti3.c", - "bswapdi2.c", - "bswapsi2.c", - "clzdi2.c", - "clzsi2.c", - "clzti2.c", - "cmpdi2.c", - "cmpti2.c", - "comparedf2.c", - "comparesf2.c", - "ctzdi2.c", - "ctzsi2.c", - "ctzti2.c", - "divdc3.c", - "divdf3.c", - "divdi3.c", - "divmoddi4.c", - "divmodsi4.c", - "divmodti4.c", - "divsc3.c", - "divsf3.c", - "divsi3.c", - "divti3.c", - "extendhfsf2.c", - "extendsfdf2.c", - "ffsdi2.c", - "ffssi2.c", - "ffsti2.c", - "fixdfdi.c", - "fixdfsi.c", - "fixdfti.c", - "fixsfdi.c", - "fixsfsi.c", - "fixsfti.c", - "fixunsdfdi.c", - "fixunsdfsi.c", - "fixunsdfti.c", - "fixunssfdi.c", - "fixunssfsi.c", - "fixunssfti.c", - "floatdidf.c", - "floatdisf.c", - "floatsidf.c", - "floatsisf.c", - "floattidf.c", - "floattisf.c", - "floatundidf.c", - "floatundisf.c", - "floatunsidf.c", - "floatunsisf.c", - "floatuntidf.c", - "floatuntisf.c", - "fp_mode.c", - "int_util.c", - "lshrdi3.c", - "lshrti3.c", - "moddi3.c", - "modsi3.c", - "modti3.c", - "muldc3.c", - "muldf3.c", - "muldi3.c", - "mulodi4.c", - "mulosi4.c", - "muloti4.c", - "mulsc3.c", - "mulsf3.c", - "multi3.c", - "mulvdi3.c", - "mulvsi3.c", - "mulvti3.c", - "negdf2.c", - "negdi2.c", - "negsf2.c", - "negti2.c", - "negvdi2.c", - "negvsi2.c", - "negvti2.c", - "os_version_check.c", - "paritydi2.c", - "paritysi2.c", - "parityti2.c", - "popcountdi2.c", - "popcountsi2.c", - "popcountti2.c", - "powidf2.c", - "powisf2.c", - "subdf3.c", - "subsf3.c", - "subvdi3.c", - "subvsi3.c", - "subvti3.c", - "trampoline_setup.c", - "truncdfhf2.c", - "truncdfsf2.c", - "truncsfhf2.c", - "ucmpdi2.c", - "ucmpti2.c", - "udivdi3.c", - "udivmoddi4.c", - "udivmodsi4.c", - "udivmodti4.c", - "udivsi3.c", - "udivti3.c", - "umoddi3.c", - "umodsi3.c", - "umodti3.c", - - # This depends on unwind.h which is present in Clang headers. We should - # reconsider this if we ever decide to support building builtins with - # other compilers. 
- "gcc_personality_v0.c", - ] - - if (current_os != "fuchsia") { - sources += [ "clear_cache.c" ] - } - - if (current_os != "fuchsia" && current_os != "baremetal") { - sources += [ - "emutls.c", - "enable_execute_stack.c", - "eprintf.c", - ] - } - - if (current_os == "mac" || current_os == "ios") { - sources += [ - "atomic_flag_clear.c", - "atomic_flag_clear_explicit.c", - "atomic_flag_test_and_set.c", - "atomic_flag_test_and_set_explicit.c", - "atomic_signal_fence.c", - "atomic_thread_fence.c", - ] - } - - if ((current_cpu == "x64" && current_os != "win") || current_cpu == "arm64") { - # GENERIC_TF_SOURCES - sources += [ - "addtf3.c", - "comparetf2.c", - "divtc3.c", - "divtf3.c", - "extenddftf2.c", - "extendhftf2.c", - "extendsftf2.c", - "fixtfdi.c", - "fixtfsi.c", - "fixtfti.c", - "fixunstfdi.c", - "fixunstfsi.c", - "fixunstfti.c", - "floatditf.c", - "floatsitf.c", - "floattitf.c", - "floatunditf.c", - "floatunsitf.c", - "floatuntitf.c", - "multc3.c", - "multf3.c", - "powitf2.c", - "subtf3.c", - "trunctfdf2.c", - "trunctfhf2.c", - "trunctfsf2.c", - ] - } - - if (current_cpu == "x86" || current_cpu == "x64") { - sources -= [ "fp_mode.c" ] - sources += [ - "cpu_model/x86.c", - "extendbfsf2.c", - "i386/fp_mode.c", - "truncdfbf2.c", - "truncsfbf2.c", - "trunctfbf2.c", - "truncxfbf2.c", - ] - if (long_double_is_80_bits) { - sources += [ - "divxc3.c", - "extendhfxf2.c", - "extendxftf2.c", - "fixunsxfdi.c", - "fixunsxfsi.c", - "fixunsxfti.c", - "fixxfdi.c", - "fixxfti.c", - "floatdixf.c", - "floattixf.c", - "floatundixf.c", - "floatuntixf.c", - "mulxc3.c", - "powixf2.c", - "trunctfxf2.c", - "truncxfhf2.c", - ] - } - } - if (current_cpu == "x86") { - sources -= [ - "ashldi3.c", - "ashrdi3.c", - "divdi3.c", - "floatdidf.c", - "floatdisf.c", - "floatundidf.c", - "floatundisf.c", - "lshrdi3.c", - "moddi3.c", - "muldi3.c", - "udivdi3.c", - "umoddi3.c", - ] - sources += [ - "i386/ashldi3.S", - "i386/ashrdi3.S", - "i386/divdi3.S", - "i386/floatdidf.S", - "i386/floatdisf.S", - "i386/floatundidf.S", - "i386/floatundisf.S", - "i386/lshrdi3.S", - "i386/moddi3.S", - "i386/muldi3.S", - "i386/udivdi3.S", - "i386/umoddi3.S", - ] - if (long_double_is_80_bits) { - sources -= [ - "floatdixf.c", - "floatundixf.c", - ] - sources += [ - "i386/floatdixf.S", - "i386/floatundixf.S", - ] - } - if (current_os == "win") { - sources += [ "i386/chkstk.S" ] - } - } else if (current_cpu == "x64") { - sources -= [ - "floatdidf.c", - "floatdisf.c", - "floatundidf.c", - "floatundisf.c", - ] - sources += [ - "x86_64/floatdidf.c", - "x86_64/floatdisf.c", - "x86_64/floatundidf.S", - "x86_64/floatundisf.S", - ] - if (long_double_is_80_bits) { - sources -= [ - "floatdixf.c", - "floatundixf.c", - ] - sources += [ - "x86_64/floatdixf.c", - "x86_64/floatundixf.S", - ] - } - if (current_os == "win") { - sources += [ "x86_64/chkstk.S" ] - } - } - - if (current_cpu == "arm") { - if (current_os != "mingw") { - sources -= [ - "bswapdi2.c", - "bswapsi2.c", - "clzdi2.c", - "clzsi2.c", - "comparesf2.c", - "divmodsi4.c", - "divsi3.c", - "fp_mode.c", - "modsi3.c", - "udivmodsi4.c", - "udivsi3.c", - "umodsi3.c", - ] - sources += [ - "arm/aeabi_cdcmp.S", - "arm/aeabi_cdcmpeq_check_nan.c", - "arm/aeabi_cfcmp.S", - "arm/aeabi_cfcmpeq_check_nan.c", - "arm/aeabi_dcmp.S", - "arm/aeabi_div0.c", - "arm/aeabi_drsub.c", - "arm/aeabi_fcmp.S", - "arm/aeabi_frsub.c", - "arm/aeabi_idivmod.S", - "arm/aeabi_ldivmod.S", - "arm/aeabi_memcmp.S", - "arm/aeabi_memcpy.S", - "arm/aeabi_memmove.S", - "arm/aeabi_memset.S", - "arm/aeabi_uidivmod.S", - 
"arm/aeabi_uldivmod.S", - "arm/bswapdi2.S", - "arm/bswapsi2.S", - "arm/clzdi2.S", - "arm/clzsi2.S", - "arm/comparesf2.S", - "arm/divmodsi4.S", - "arm/divsi3.S", - "arm/fp_mode.c", - "arm/modsi3.S", - "arm/switch16.S", - "arm/switch32.S", - "arm/switch8.S", - "arm/switchu8.S", - "arm/sync_fetch_and_add_4.S", - "arm/sync_fetch_and_add_8.S", - "arm/sync_fetch_and_and_4.S", - "arm/sync_fetch_and_and_8.S", - "arm/sync_fetch_and_max_4.S", - "arm/sync_fetch_and_max_8.S", - "arm/sync_fetch_and_min_4.S", - "arm/sync_fetch_and_min_8.S", - "arm/sync_fetch_and_nand_4.S", - "arm/sync_fetch_and_nand_8.S", - "arm/sync_fetch_and_or_4.S", - "arm/sync_fetch_and_or_8.S", - "arm/sync_fetch_and_sub_4.S", - "arm/sync_fetch_and_sub_8.S", - "arm/sync_fetch_and_umax_4.S", - "arm/sync_fetch_and_umax_8.S", - "arm/sync_fetch_and_umin_4.S", - "arm/sync_fetch_and_umin_8.S", - "arm/sync_fetch_and_xor_4.S", - "arm/sync_fetch_and_xor_8.S", - "arm/sync_synchronize.S", - "arm/udivmodsi4.S", - "arm/udivsi3.S", - "arm/umodsi3.S", - ] - - if (current_os == "android") { - sources += [ - "arm/adddf3vfp.S", - "arm/addsf3vfp.S", - "arm/divdf3vfp.S", - "arm/divsf3vfp.S", - "arm/eqdf2vfp.S", - "arm/eqsf2vfp.S", - "arm/extendsfdf2vfp.S", - "arm/fixdfsivfp.S", - "arm/fixsfsivfp.S", - "arm/fixunsdfsivfp.S", - "arm/fixunssfsivfp.S", - "arm/floatsidfvfp.S", - "arm/floatsisfvfp.S", - "arm/floatunssidfvfp.S", - "arm/floatunssisfvfp.S", - "arm/gedf2vfp.S", - "arm/gesf2vfp.S", - "arm/gtdf2vfp.S", - "arm/gtsf2vfp.S", - "arm/ledf2vfp.S", - "arm/lesf2vfp.S", - "arm/ltdf2vfp.S", - "arm/ltsf2vfp.S", - "arm/muldf3vfp.S", - "arm/mulsf3vfp.S", - "arm/nedf2vfp.S", - "arm/negdf2vfp.S", - "arm/negsf2vfp.S", - "arm/nesf2vfp.S", - "arm/restore_vfp_d8_d15_regs.S", - "arm/save_vfp_d8_d15_regs.S", - "arm/subdf3vfp.S", - "arm/subsf3vfp.S", - "arm/truncdfsf2vfp.S", - "arm/unorddf2vfp.S", - "arm/unordsf2vfp.S", - ] - } - } else { - sources += [ - "arm/aeabi_idivmod.S", - "arm/aeabi_ldivmod.S", - "arm/aeabi_uidivmod.S", - "arm/aeabi_uldivmod.S", - "arm/chkstk.S", - ] - } - } - - if (current_cpu == "arm64") { - sources -= [ "fp_mode.c" ] - sources += [ - "aarch64/fp_mode.c", - "cpu_model/aarch64.c", - ] - if (current_os == "mingw") { - sources += [ "aarch64/chkstk.S" ] - } - } - - if (current_cpu == "avr") { - sources += [ - "avr/divmodhi4.S", - "avr/divmodqi4.S", - "avr/exit.S", - "avr/mulhi3.S", - "avr/mulqi3.S", - "avr/udivmodhi4.S", - "avr/udivmodqi4.S", - ] - } - - if (current_cpu == "hexagon") { - sources += [ - "hexagon/common_entry_exit_abi1.S", - "hexagon/common_entry_exit_abi2.S", - "hexagon/common_entry_exit_legacy.S", - "hexagon/dfaddsub.S", - "hexagon/dfdiv.S", - "hexagon/dffma.S", - "hexagon/dfminmax.S", - "hexagon/dfmul.S", - "hexagon/dfsqrt.S", - "hexagon/divdi3.S", - "hexagon/divsi3.S", - "hexagon/fastmath2_dlib_asm.S", - "hexagon/fastmath2_ldlib_asm.S", - "hexagon/fastmath_dlib_asm.S", - "hexagon/memcpy_forward_vp4cp4n2.S", - "hexagon/memcpy_likely_aligned.S", - "hexagon/moddi3.S", - "hexagon/modsi3.S", - "hexagon/sfdiv_opt.S", - "hexagon/sfsqrt_opt.S", - "hexagon/udivdi3.S", - "hexagon/udivmoddi4.S", - "hexagon/udivmodsi4.S", - "hexagon/udivsi3.S", - "hexagon/umoddi3.S", - "hexagon/umodsi3.S", - ] - } - if (current_cpu == "loongarch" || current_cpu == "loongarch64") { - sources -= [ "fp_mode.c" ] - sources += [ "loongarch/fp_mode.c" ] - } - - if (current_cpu == "ppc64") { - sources += [ - "ppc/divtc3.c", - "ppc/fixtfdi.c", - "ppc/fixtfti.c", - "ppc/fixunstfdi.c", - "ppc/fixunstfti.c", - "ppc/floatditf.c", - "ppc/floattitf.c", - 
"ppc/floatunditf.c", - "ppc/gcc_qadd.c", - "ppc/gcc_qdiv.c", - "ppc/gcc_qmul.c", - "ppc/gcc_qsub.c", - "ppc/multc3.c", - ] - } - - if (current_cpu == "riscv" || current_cpu == "riscv64") { - sources += [ - "cpu_model/riscv.c", - "riscv/fp_mode.c", - "riscv/restore.S", - "riscv/save.S", - ] - } - if (current_cpu == "riscv") { - sources += [ "riscv/mulsi3.S" ] - } - - if (current_cpu == "riscv64") { - sources += [ "riscv/muldi3.S" ] - } - - if (current_cpu == "ve") { - sources += [ - "ve/grow_stack.S", - "ve/grow_stack_align.S", - ] - } - - if (!compiler_rt_exclude_atomic_builtin) { - sources += [ "atomic.c" ] - } + defines = builtins_defines + sources = builtins_sources deps = lse_targets } diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni new file mode 100644 index 0000000000000..9e8e52c977a17 --- /dev/null +++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni @@ -0,0 +1,529 @@ +declare_args() { + # Skip the atomic builtin (should normally be provided by a shared library). + compiler_rt_exclude_atomic_builtin = true +} + +if (current_cpu == "x86" || current_cpu == "x64") { + # long double is not 80 bits on Android or MSVC. + long_double_is_80_bits = current_os != "android" && current_os != "win" +} + +builtins_defines = [] + +# This is based on who sets HasFloat16 to true in clang/lib/Basic/Targets. +if (current_cpu == "aarch64" || current_cpu == "arm" || + current_cpu == "hexagon" || current_cpu == "riscv" || + current_cpu == "riscv64" || current_cpu == "x64") { + builtins_defines += [ "COMPILER_RT_HAS_FLOAT16" ] +} + +builtins_sources = [ + "absvdi2.c", + "absvsi2.c", + "absvti2.c", + "adddf3.c", + "addsf3.c", + "addvdi3.c", + "addvsi3.c", + "addvti3.c", + "apple_versioning.c", + "ashldi3.c", + "ashlti3.c", + "ashrdi3.c", + "ashrti3.c", + "bswapdi2.c", + "bswapsi2.c", + "clzdi2.c", + "clzsi2.c", + "clzti2.c", + "cmpdi2.c", + "cmpti2.c", + "comparedf2.c", + "comparesf2.c", + "ctzdi2.c", + "ctzsi2.c", + "ctzti2.c", + "divdc3.c", + "divdf3.c", + "divdi3.c", + "divmoddi4.c", + "divmodsi4.c", + "divmodti4.c", + "divsc3.c", + "divsf3.c", + "divsi3.c", + "divti3.c", + "extendhfsf2.c", + "extendsfdf2.c", + "ffsdi2.c", + "ffssi2.c", + "ffsti2.c", + "fixdfdi.c", + "fixdfsi.c", + "fixdfti.c", + "fixsfdi.c", + "fixsfsi.c", + "fixsfti.c", + "fixunsdfdi.c", + "fixunsdfsi.c", + "fixunsdfti.c", + "fixunssfdi.c", + "fixunssfsi.c", + "fixunssfti.c", + "floatdidf.c", + "floatdisf.c", + "floatsidf.c", + "floatsisf.c", + "floattidf.c", + "floattisf.c", + "floatundidf.c", + "floatundisf.c", + "floatunsidf.c", + "floatunsisf.c", + "floatuntidf.c", + "floatuntisf.c", + "fp_mode.c", + "int_util.c", + "lshrdi3.c", + "lshrti3.c", + "moddi3.c", + "modsi3.c", + "modti3.c", + "muldc3.c", + "muldf3.c", + "muldi3.c", + "mulodi4.c", + "mulosi4.c", + "muloti4.c", + "mulsc3.c", + "mulsf3.c", + "multi3.c", + "mulvdi3.c", + "mulvsi3.c", + "mulvti3.c", + "negdf2.c", + "negdi2.c", + "negsf2.c", + "negti2.c", + "negvdi2.c", + "negvsi2.c", + "negvti2.c", + "os_version_check.c", + "paritydi2.c", + "paritysi2.c", + "parityti2.c", + "popcountdi2.c", + "popcountsi2.c", + "popcountti2.c", + "powidf2.c", + "powisf2.c", + "subdf3.c", + "subsf3.c", + "subvdi3.c", + "subvsi3.c", + "subvti3.c", + "trampoline_setup.c", + "truncdfhf2.c", + "truncdfsf2.c", + "truncsfhf2.c", + "ucmpdi2.c", + "ucmpti2.c", + "udivdi3.c", + "udivmoddi4.c", + "udivmodsi4.c", + "udivmodti4.c", + "udivsi3.c", + "udivti3.c", + "umoddi3.c", + "umodsi3.c", + 
"umodti3.c", + + # This depends on unwind.h which is present in Clang headers. We should + # reconsider this if we ever decide to support building builtins with + # other compilers. + "gcc_personality_v0.c", +] + +if (current_os != "fuchsia") { + builtins_sources += [ "clear_cache.c" ] +} + +if (current_os != "fuchsia" && current_os != "baremetal") { + builtins_sources += [ + "emutls.c", + "enable_execute_stack.c", + "eprintf.c", + ] +} + +if (current_os == "mac" || current_os == "ios") { + builtins_sources += [ + "atomic_flag_clear.c", + "atomic_flag_clear_explicit.c", + "atomic_flag_test_and_set.c", + "atomic_flag_test_and_set_explicit.c", + "atomic_signal_fence.c", + "atomic_thread_fence.c", + ] +} + +if ((current_cpu == "x64" && current_os != "win") || current_cpu == "arm64") { + # GENERIC_TF_SOURCES + builtins_sources += [ + "addtf3.c", + "comparetf2.c", + "divtc3.c", + "divtf3.c", + "extenddftf2.c", + "extendhftf2.c", + "extendsftf2.c", + "fixtfdi.c", + "fixtfsi.c", + "fixtfti.c", + "fixunstfdi.c", + "fixunstfsi.c", + "fixunstfti.c", + "floatditf.c", + "floatsitf.c", + "floattitf.c", + "floatunditf.c", + "floatunsitf.c", + "floatuntitf.c", + "multc3.c", + "multf3.c", + "powitf2.c", + "subtf3.c", + "trunctfdf2.c", + "trunctfhf2.c", + "trunctfsf2.c", + ] +} + +if (current_cpu == "x86" || current_cpu == "x64") { + builtins_sources -= [ "fp_mode.c" ] + builtins_sources += [ + "cpu_model/x86.c", + "extendbfsf2.c", + "i386/fp_mode.c", + "truncdfbf2.c", + "truncsfbf2.c", + "trunctfbf2.c", + "truncxfbf2.c", + ] + if (long_double_is_80_bits) { + builtins_sources += [ + "divxc3.c", + "extendhfxf2.c", + "extendxftf2.c", + "fixunsxfdi.c", + "fixunsxfsi.c", + "fixunsxfti.c", + "fixxfdi.c", + "fixxfti.c", + "floatdixf.c", + "floattixf.c", + "floatundixf.c", + "floatuntixf.c", + "mulxc3.c", + "powixf2.c", + "trunctfxf2.c", + "truncxfhf2.c", + ] + } +} +if (current_cpu == "x86") { + builtins_sources -= [ + "ashldi3.c", + "ashrdi3.c", + "divdi3.c", + "floatdidf.c", + "floatdisf.c", + "floatundidf.c", + "floatundisf.c", + "lshrdi3.c", + "moddi3.c", + "muldi3.c", + "udivdi3.c", + "umoddi3.c", + ] + builtins_sources += [ + "i386/ashldi3.S", + "i386/ashrdi3.S", + "i386/divdi3.S", + "i386/floatdidf.S", + "i386/floatdisf.S", + "i386/floatundidf.S", + "i386/floatundisf.S", + "i386/lshrdi3.S", + "i386/moddi3.S", + "i386/muldi3.S", + "i386/udivdi3.S", + "i386/umoddi3.S", + ] + if (long_double_is_80_bits) { + builtins_sources -= [ + "floatdixf.c", + "floatundixf.c", + ] + builtins_sources += [ + "i386/floatdixf.S", + "i386/floatundixf.S", + ] + } + if (current_os == "win") { + builtins_sources += [ "i386/chkstk.S" ] + } +} else if (current_cpu == "x64") { + builtins_sources -= [ + "floatdidf.c", + "floatdisf.c", + "floatundidf.c", + "floatundisf.c", + ] + builtins_sources += [ + "x86_64/floatdidf.c", + "x86_64/floatdisf.c", + "x86_64/floatundidf.S", + "x86_64/floatundisf.S", + ] + if (long_double_is_80_bits) { + builtins_sources -= [ + "floatdixf.c", + "floatundixf.c", + ] + builtins_sources += [ + "x86_64/floatdixf.c", + "x86_64/floatundixf.S", + ] + } + if (current_os == "win") { + builtins_sources += [ "x86_64/chkstk.S" ] + } +} + +if (current_cpu == "arm") { + if (current_os != "mingw") { + builtins_sources -= [ + "bswapdi2.c", + "bswapsi2.c", + "clzdi2.c", + "clzsi2.c", + "comparesf2.c", + "divmodsi4.c", + "divsi3.c", + "fp_mode.c", + "modsi3.c", + "udivmodsi4.c", + "udivsi3.c", + "umodsi3.c", + ] + builtins_sources += [ + "arm/aeabi_cdcmp.S", + "arm/aeabi_cdcmpeq_check_nan.c", + "arm/aeabi_cfcmp.S", + 
"arm/aeabi_cfcmpeq_check_nan.c", + "arm/aeabi_dcmp.S", + "arm/aeabi_div0.c", + "arm/aeabi_drsub.c", + "arm/aeabi_fcmp.S", + "arm/aeabi_frsub.c", + "arm/aeabi_idivmod.S", + "arm/aeabi_ldivmod.S", + "arm/aeabi_memcmp.S", + "arm/aeabi_memcpy.S", + "arm/aeabi_memmove.S", + "arm/aeabi_memset.S", + "arm/aeabi_uidivmod.S", + "arm/aeabi_uldivmod.S", + "arm/bswapdi2.S", + "arm/bswapsi2.S", + "arm/clzdi2.S", + "arm/clzsi2.S", + "arm/comparesf2.S", + "arm/divmodsi4.S", + "arm/divsi3.S", + "arm/fp_mode.c", + "arm/modsi3.S", + "arm/switch16.S", + "arm/switch32.S", + "arm/switch8.S", + "arm/switchu8.S", + "arm/sync_fetch_and_add_4.S", + "arm/sync_fetch_and_add_8.S", + "arm/sync_fetch_and_and_4.S", + "arm/sync_fetch_and_and_8.S", + "arm/sync_fetch_and_max_4.S", + "arm/sync_fetch_and_max_8.S", + "arm/sync_fetch_and_min_4.S", + "arm/sync_fetch_and_min_8.S", + "arm/sync_fetch_and_nand_4.S", + "arm/sync_fetch_and_nand_8.S", + "arm/sync_fetch_and_or_4.S", + "arm/sync_fetch_and_or_8.S", + "arm/sync_fetch_and_sub_4.S", + "arm/sync_fetch_and_sub_8.S", + "arm/sync_fetch_and_umax_4.S", + "arm/sync_fetch_and_umax_8.S", + "arm/sync_fetch_and_umin_4.S", + "arm/sync_fetch_and_umin_8.S", + "arm/sync_fetch_and_xor_4.S", + "arm/sync_fetch_and_xor_8.S", + "arm/sync_synchronize.S", + "arm/udivmodsi4.S", + "arm/udivsi3.S", + "arm/umodsi3.S", + ] + + if (current_os == "android") { + builtins_sources += [ + "arm/adddf3vfp.S", + "arm/addsf3vfp.S", + "arm/divdf3vfp.S", + "arm/divsf3vfp.S", + "arm/eqdf2vfp.S", + "arm/eqsf2vfp.S", + "arm/extendsfdf2vfp.S", + "arm/fixdfsivfp.S", + "arm/fixsfsivfp.S", + "arm/fixunsdfsivfp.S", + "arm/fixunssfsivfp.S", + "arm/floatsidfvfp.S", + "arm/floatsisfvfp.S", + "arm/floatunssidfvfp.S", + "arm/floatunssisfvfp.S", + "arm/gedf2vfp.S", + "arm/gesf2vfp.S", + "arm/gtdf2vfp.S", + "arm/gtsf2vfp.S", + "arm/ledf2vfp.S", + "arm/lesf2vfp.S", + "arm/ltdf2vfp.S", + "arm/ltsf2vfp.S", + "arm/muldf3vfp.S", + "arm/mulsf3vfp.S", + "arm/nedf2vfp.S", + "arm/negdf2vfp.S", + "arm/negsf2vfp.S", + "arm/nesf2vfp.S", + "arm/restore_vfp_d8_d15_regs.S", + "arm/save_vfp_d8_d15_regs.S", + "arm/subdf3vfp.S", + "arm/subsf3vfp.S", + "arm/truncdfsf2vfp.S", + "arm/unorddf2vfp.S", + "arm/unordsf2vfp.S", + ] + } + } else { + builtins_sources += [ + "arm/aeabi_idivmod.S", + "arm/aeabi_ldivmod.S", + "arm/aeabi_uidivmod.S", + "arm/aeabi_uldivmod.S", + "arm/chkstk.S", + ] + } +} + +if (current_cpu == "arm64") { + builtins_sources -= [ "fp_mode.c" ] + builtins_sources += [ + "aarch64/fp_mode.c", + "cpu_model/aarch64.c", + ] + if (current_os == "mingw") { + builtins_sources += [ "aarch64/chkstk.S" ] + } +} + +if (current_cpu == "avr") { + builtins_sources += [ + "avr/divmodhi4.S", + "avr/divmodqi4.S", + "avr/exit.S", + "avr/mulhi3.S", + "avr/mulqi3.S", + "avr/udivmodhi4.S", + "avr/udivmodqi4.S", + ] +} + +if (current_cpu == "hexagon") { + builtins_sources += [ + "hexagon/common_entry_exit_abi1.S", + "hexagon/common_entry_exit_abi2.S", + "hexagon/common_entry_exit_legacy.S", + "hexagon/dfaddsub.S", + "hexagon/dfdiv.S", + "hexagon/dffma.S", + "hexagon/dfminmax.S", + "hexagon/dfmul.S", + "hexagon/dfsqrt.S", + "hexagon/divdi3.S", + "hexagon/divsi3.S", + "hexagon/fastmath2_dlib_asm.S", + "hexagon/fastmath2_ldlib_asm.S", + "hexagon/fastmath_dlib_asm.S", + "hexagon/memcpy_forward_vp4cp4n2.S", + "hexagon/memcpy_likely_aligned.S", + "hexagon/moddi3.S", + "hexagon/modsi3.S", + "hexagon/sfdiv_opt.S", + "hexagon/sfsqrt_opt.S", + "hexagon/udivdi3.S", + "hexagon/udivmoddi4.S", + "hexagon/udivmodsi4.S", + "hexagon/udivsi3.S", + "hexagon/umoddi3.S", + 
"hexagon/umodsi3.S", + ] +} +if (current_cpu == "loongarch" || current_cpu == "loongarch64") { + builtins_sources -= [ "fp_mode.c" ] + builtins_sources += [ "loongarch/fp_mode.c" ] +} + +if (current_cpu == "ppc64") { + builtins_sources += [ + "ppc/divtc3.c", + "ppc/fixtfdi.c", + "ppc/fixtfti.c", + "ppc/fixunstfdi.c", + "ppc/fixunstfti.c", + "ppc/floatditf.c", + "ppc/floattitf.c", + "ppc/floatunditf.c", + "ppc/gcc_qadd.c", + "ppc/gcc_qdiv.c", + "ppc/gcc_qmul.c", + "ppc/gcc_qsub.c", + "ppc/multc3.c", + ] +} + +if (current_cpu == "riscv" || current_cpu == "riscv64") { + builtins_sources += [ + "cpu_model/riscv.c", + "riscv/fp_mode.c", + "riscv/restore.S", + "riscv/save.S", + ] +} +if (current_cpu == "riscv") { + builtins_sources += [ "riscv/mulsi3.S" ] +} + +if (current_cpu == "riscv64") { + builtins_sources += [ "riscv/muldi3.S" ] +} + +if (current_cpu == "ve") { + builtins_sources += [ + "ve/grow_stack.S", + "ve/grow_stack_align.S", + ] +} + +if (!compiler_rt_exclude_atomic_builtin) { + builtins_sources += [ "atomic.c" ] +} diff --git a/llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn new file mode 100644 index 0000000000000..87848075a804e --- /dev/null +++ b/llvm/utils/gn/secondary/compiler-rt/test/builtins/BUILD.gn @@ -0,0 +1,87 @@ +import("//compiler-rt/lib/builtins/sources.gni") +import("//compiler-rt/target.gni") +import("//compiler-rt/test/test.gni") +import("//llvm/lib/Target/targets.gni") +import("//llvm/utils/gn/build/toolchain/compiler.gni") +import("//llvm/utils/gn/build/write_cmake_config.gni") +import("//llvm/version.gni") + +write_cmake_config("builtins_mode_cfg") { + input = "Unit/lit.site.cfg.py.in" + output = + "$target_gen_dir/${crt_current_target_arch}BuiltinsConfig/lit.site.cfg.py" + values = [ + "BUILTINS_TEST_CONFIG_SUFFIX=$crt_current_target_suffix", + "BUILTINS_LIT_SOURCE_DIR=" + rebase_path("."), + "BUILTINS_TEST_TARGET_ARCH=$crt_current_target_arch", + "SANITIZER_TEST_CXX_LIBNAME=libc++", + "COMPILER_RT_BUILD_CRT_PYBOOL=False", + "MSVC_PYBOOL=False", + "BUILTINS_IS_MSVC_PYBOOL=False", + + "COMPILER_RT_BINARY_DIR=" + rebase_path("$root_gen_dir/compiler-rt"), + "LIT_SITE_CFG_IN_HEADER=## Autogenerated from $input, do not edit", + ] + + features = "" + foreach(source, builtins_sources) { + if (features != "") { + features += ";" + } + features += "librt_has_" + get_path_info(source, "name") + } + values += [ "BUILTINS_LIT_SOURCE_FEATURES=$features" ] + + cflags = "" + foreach(define, builtins_defines) { + cflags += " -D$define" + } + values += [ "BUILTINS_TEST_TARGET_CFLAGS=$cflags" ] +} + +if (current_toolchain != host_toolchain) { + group("builtins_toolchain") { + deps = [ + ":builtins_mode_cfg", + "//compiler-rt/include($host_toolchain)", + "//compiler-rt/lib/builtins", + "//compiler-rt/test:lit_common_configured", + ] + } +} + +supported_toolchains = [] +if (host_os == "linux") { + supported_toolchains += [ "//llvm/utils/gn/build/toolchain:stage2_unix" ] +} + +group("builtins") { + deps = [] + foreach(toolchain, supported_toolchains) { + deps += [ ":builtins_toolchain($toolchain)" ] + } +} + +if (supported_toolchains != []) { + action("check-builtins") { + script = "$root_build_dir/bin/llvm-lit" + args = [ "-sv" ] + foreach(toolchain, supported_toolchains) { + test_dir = rebase_path( + get_label_info(":lit_site_cfg($toolchain)", "target_gen_dir"), + root_build_dir) + args += [ test_dir + "/${crt_current_target_arch}BuiltinsConfig" ] + } + outputs = [ "$target_gen_dir/run-lit" ] # Non-existing, 
so that ninja runs + # it each time. + + # Since check-builtins is always dirty, //:default doesn't depend on it so + # that it's not part of the default ninja target. Hence, check-builtins + # shouldn't have any deps except :builtins. so that the default target is + # sure to build all the deps. + deps = [ ":builtins" ] + testonly = true + + pool = "//:console" + } +} From 15750a0ab2356bea9544b70a72edce421060086e Mon Sep 17 00:00:00 2001 From: Nicolas van Kempen Date: Mon, 7 Apr 2025 16:34:07 -0400 Subject: [PATCH 0897/1029] [clang-tidy] Use --match-full-lines instead of --strict-whitespace in check_clang_tidy (#133756) See Discourse post here: https://discourse.llvm.org/t/rfc-using-match-full-lines-in-clang-tidy-tests/85553 I've added `--match-partial-fixes` to all tests that were failing, unless I noticed the fix was quick and trivial. --- clang-tools-extra/docs/ReleaseNotes.rst | 6 +++ .../test/clang-tidy/check_clang_tidy.py | 16 +++++-- .../checkers/abseil/duration-addition.cpp | 2 +- .../checkers/abseil/duration-comparison.cpp | 2 +- .../abseil/duration-conversion-cast.cpp | 2 +- .../abseil/duration-factory-float.cpp | 2 +- .../abseil/duration-factory-scale.cpp | 2 +- .../checkers/abseil/duration-subtraction.cpp | 2 +- .../duration-unnecessary-conversion.cpp | 2 +- .../abseil/redundant-strcat-calls.cpp | 2 +- .../checkers/abseil/time-comparison.cpp | 2 +- .../checkers/abseil/time-subtraction.cpp | 2 +- .../abseil/upgrade-duration-conversions.cpp | 2 +- .../checkers/altera/struct-pack-align.cpp | 2 +- .../checkers/android/cloexec-memfd-create.cpp | 2 +- .../checkers/android/cloexec-open.cpp | 2 +- .../checkers/android/cloexec-socket.cpp | 2 +- .../incorrect-enable-shared-from-this.cpp | 2 +- .../bugprone/move-forwarding-reference.cpp | 2 +- ...rminated-result-in-initialization-strlen.c | 2 +- ...null-terminated-result-memcpy-safe-cxx.cpp | 2 +- .../not-null-terminated-result-strlen.c | 2 +- .../not-null-terminated-result-wcslen.cpp | 2 +- .../checkers/bugprone/posix-return.cpp | 2 +- .../checkers/bugprone/standalone-empty.cpp | 44 +++++++++---------- .../checkers/bugprone/stringview-nullptr.cpp | 2 +- .../bugprone/suspicious-string-compare.cpp | 2 +- .../checkers/bugprone/swapped-arguments.cpp | 2 +- .../prefer-member-initializer.cpp | 2 +- .../pro-bounds-constant-array-index.cpp | 2 +- .../pro-type-member-init-use-assignment.cpp | 2 +- .../pro-type-member-init.cpp | 2 +- .../virtual-class-destructor.cpp | 2 +- .../google/build-explicit-make-pair.cpp | 2 +- .../checkers/google/objc-avoid-nsobject-new.m | 2 +- .../google/upgrade-googletest-case.cpp | 2 +- ...prefer-isa-or-dyn-cast-in-conditionals.cpp | 2 +- .../llvm/prefer-register-over-unsigned.cpp | 2 +- .../llvm/prefer-register-over-unsigned2.cpp | 4 +- .../llvm/prefer-register-over-unsigned3.cpp | 6 +-- .../clang-tidy/checkers/llvm/twine-local.cpp | 2 +- .../const-correctness-pointer-as-pointers.cpp | 2 +- .../const-correctness-pointer-as-values.cpp | 2 +- .../misc/const-correctness-templates.cpp | 2 +- .../const-correctness-values-before-cxx23.cpp | 2 +- .../misc/const-correctness-values.cpp | 2 +- .../checkers/misc/definitions-in-headers.hpp | 2 +- .../checkers/misc/unused-parameters.cpp | 2 +- .../checkers/modernize/avoid-bind.cpp | 2 +- .../modernize/concat-nested-namespaces.cpp | 4 +- .../checkers/modernize/loop-convert-basic.cpp | 2 +- .../modernize/loop-convert-camelback.cpp | 6 +-- .../checkers/modernize/loop-convert-const.cpp | 2 +- .../checkers/modernize/loop-convert-extra.cpp | 2 +- 
.../modernize/loop-convert-lowercase.cpp | 8 ++-- .../loop-convert-rewritten-binop.cpp | 4 +- .../modernize/loop-convert-uppercase.cpp | 8 ++-- .../checkers/modernize/make-shared.cpp | 2 +- .../checkers/modernize/make-unique.cpp | 2 +- .../min-max-use-initializer-list.cpp | 2 +- .../checkers/modernize/pass-by-value.cpp | 2 +- .../checkers/modernize/redundant-void-arg.cpp | 2 +- .../checkers/modernize/replace-auto-ptr.cpp | 2 +- .../checkers/modernize/type-traits.cpp | 2 +- .../modernize/use-auto-cast-remove-stars.cpp | 2 +- .../checkers/modernize/use-auto-cast.cpp | 2 +- .../modernize/use-auto-for-pointer.cpp | 4 +- .../checkers/modernize/use-auto-iterator.cpp | 2 +- .../use-auto-min-type-name-length.cpp | 8 ++-- .../modernize/use-equals-default-copy.cpp | 2 +- .../checkers/modernize/use-equals-default.cpp | 2 +- .../use-integer-sign-comparison-qt.cpp | 5 ++- .../modernize/use-integer-sign-comparison.cpp | 3 +- .../checkers/modernize/use-nullptr.cpp | 2 +- .../checkers/modernize/use-override.cpp | 2 +- .../modernize/use-starts-ends-with.cpp | 14 +++--- .../checkers/modernize/use-std-format-fmt.cpp | 2 +- .../checkers/modernize/use-std-format.cpp | 4 +- .../checkers/modernize/use-std-print.cpp | 4 +- .../checkers/modernize/use-using.cpp | 2 +- .../performance/faster-string-find.cpp | 4 +- .../checkers/performance/for-range-copy.cpp | 2 +- .../noexcept-move-constructor-fix.cpp | 2 +- .../unnecessary-copy-initialization.cpp | 2 +- .../unnecessary-value-param-delayed.cpp | 2 +- .../performance/unnecessary-value-param.cpp | 2 +- .../readability/braces-around-statements.cpp | 2 +- .../readability/const-return-type.cpp | 2 +- .../readability/container-size-empty.cpp | 2 +- .../readability/implicit-bool-conversion.c | 2 +- .../readability/implicit-bool-conversion.cpp | 2 +- .../readability/math-missing-parentheses.cpp | 2 +- .../simplify-boolean-expr-members.cpp | 2 +- .../readability/simplify-boolean-expr.cpp | 2 +- ...ase-literal-suffix-integer-custom-list.cpp | 2 +- ...ate-conflicted-fixes-of-alias-checkers.cpp | 2 +- .../duplicate-fixes-of-alias-checkers.cpp | 2 +- 97 files changed, 164 insertions(+), 146 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index fefb085409b44..a8ae35c7f744e 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -91,6 +91,12 @@ Improvements to clang-query Improvements to clang-tidy -------------------------- +- Changed the :program:`check_clang_tidy.py` tool to use FileCheck's + ``--match-full-lines`` instead of ``strict-whitespace`` for ``CHECK-FIXES`` + clauses. Added a ``--match-partial-fixes`` option to keep previous behavior on + specific tests. This may break tests for users with custom out-of-tree checks + who use :program:`check_clang_tidy.py` as-is. + - Improved :program:`clang-tidy-diff.py` script. Add the `-warnings-as-errors` argument to treat warnings as errors. 
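To make the behavioral difference concrete, a hypothetical CHECK-FIXES pair
(not taken from any real test): with FileCheck's --match-full-lines, a
CHECK-FIXES pattern must match the entire fixed line, whereas the previous
--strict-whitespace behavior accepted a substring, which is what the new
--match-partial-fixes escape hatch preserves.

// Suppose a check rewrites the next line to:
//   auto widget = makeWidget(); widget.run();

// Passes only under the old (partial) matching, since it is a prefix:
// CHECK-FIXES: auto widget = makeWidget();

// Required form under --match-full-lines, covering the whole fixed line:
// CHECK-FIXES: auto widget = makeWidget(); widget.run();

Full-line matching catches fixes that silently append or reorder code after
the matched prefix, which partial matching could never detect.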
diff --git a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py index 93c49566a90e3..774bc970ef284 100755 --- a/clang-tools-extra/test/clang-tidy/check_clang_tidy.py +++ b/clang-tools-extra/test/clang-tidy/check_clang_tidy.py @@ -105,6 +105,7 @@ def __init__(self, args: argparse.Namespace, extra_args: List[str]) -> None: self.fixes = MessagePrefix("CHECK-FIXES") self.messages = MessagePrefix("CHECK-MESSAGES") self.notes = MessagePrefix("CHECK-NOTES") + self.match_partial_fixes = args.match_partial_fixes file_name_with_extension = self.assume_file_name or self.input_file_name _, extension = os.path.splitext(file_name_with_extension) @@ -248,10 +249,14 @@ def check_fixes(self) -> None: try_run( [ "FileCheck", - "-input-file=" + self.temp_file_name, + "--input-file=" + self.temp_file_name, self.input_file_name, - "-check-prefixes=" + ",".join(self.fixes.prefixes), - "-strict-whitespace", + "--check-prefixes=" + ",".join(self.fixes.prefixes), + ( + "--match-full-lines" + if not self.match_partial_fixes + else "--strict-whitespace" # Keeping past behavior. + ), ] ) @@ -372,6 +377,11 @@ def parse_arguments() -> Tuple[argparse.Namespace, List[str]]: default=["c++11-or-later"], help="Passed to clang. Special -or-later values are expanded.", ) + parser.add_argument( + "--match-partial-fixes", + action="store_true", + help="allow partial line matches for fixes", + ) return parser.parse_known_args() diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-addition.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-addition.cpp index 33cfc58fef3c6..562b513d784e6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-addition.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-addition.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-duration-addition %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-duration-addition %t -- -- -I%S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-comparison.cpp index 9fa422bec05ab..6110dfded6bac 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-comparison.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-comparison.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-duration-comparison %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-duration-comparison %t -- -- -I%S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-conversion-cast.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-conversion-cast.cpp index 0c2a9d791f1d1..368b9d63e0ec7 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-conversion-cast.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-conversion-cast.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-duration-conversion-cast %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-duration-conversion-cast %t -- -- -I%S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-float.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-float.cpp index 2649d2b90d8e6..2f38dbfe9778d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-float.cpp +++ 
b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-float.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-duration-factory-float %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-duration-factory-float %t -- -- -I%S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-scale.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-scale.cpp index 04c361328f5da..dd5f808f5a4c3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-scale.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-factory-scale.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-duration-factory-scale %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-duration-factory-scale %t -- -- -I%S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-subtraction.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-subtraction.cpp index bd6f3172d7779..167258e32599d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-subtraction.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-subtraction.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-duration-subtraction %t -- -- -I %S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-duration-subtraction %t -- -- -I %S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-unnecessary-conversion.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-unnecessary-conversion.cpp index 9730f6b29b1f9..f4c69c5adc440 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-unnecessary-conversion.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/duration-unnecessary-conversion.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11-or-later %s abseil-duration-unnecessary-conversion %t -- -- -I %S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes -std=c++11-or-later %s abseil-duration-unnecessary-conversion %t -- -- -I %S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/redundant-strcat-calls.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/redundant-strcat-calls.cpp index dbd354b132e2f..b5e866c3043fd 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/redundant-strcat-calls.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/redundant-strcat-calls.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-redundant-strcat-calls %t -- -- -isystem %clang_tidy_headers +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-redundant-strcat-calls %t -- -- -isystem %clang_tidy_headers #include namespace absl { diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/time-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/time-comparison.cpp index ab03020c3c778..4de43ec56436e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/time-comparison.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/time-comparison.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s abseil-time-comparison %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s abseil-time-comparison %t -- -- -I%S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/time-subtraction.cpp 
b/clang-tools-extra/test/clang-tidy/checkers/abseil/time-subtraction.cpp index 43d1feea1ec19..82014e8f46a5f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/time-subtraction.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/time-subtraction.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11-or-later %s abseil-time-subtraction %t -- -- -I %S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes -std=c++11-or-later %s abseil-time-subtraction %t -- -- -I %S/Inputs #include "absl/time/time.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/upgrade-duration-conversions.cpp b/clang-tools-extra/test/clang-tidy/checkers/abseil/upgrade-duration-conversions.cpp index 32e65a63eb1c5..b5dfb4f4d73e8 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/abseil/upgrade-duration-conversions.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/upgrade-duration-conversions.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11-or-later %s abseil-upgrade-duration-conversions %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes -std=c++11-or-later %s abseil-upgrade-duration-conversions %t -- -- -I%S/Inputs using int64_t = long long; diff --git a/clang-tools-extra/test/clang-tidy/checkers/altera/struct-pack-align.cpp b/clang-tools-extra/test/clang-tidy/checkers/altera/struct-pack-align.cpp index 472372ffe35c1..9aaca68b363a1 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/altera/struct-pack-align.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/altera/struct-pack-align.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s altera-struct-pack-align %t -- -header-filter=.* +// RUN: %check_clang_tidy --match-partial-fixes %s altera-struct-pack-align %t -- -header-filter=.* // Struct needs both alignment and packing struct error { diff --git a/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-memfd-create.cpp b/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-memfd-create.cpp index a8dafd5e887a5..b2c299b46d0a3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-memfd-create.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-memfd-create.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s android-cloexec-memfd-create %t +// RUN: %check_clang_tidy --match-partial-fixes %s android-cloexec-memfd-create %t #define MFD_ALLOW_SEALING 1 #define __O_CLOEXEC 3 diff --git a/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-open.cpp b/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-open.cpp index 4ef1f400dad18..651e469721284 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-open.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-open.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s android-cloexec-open %t +// RUN: %check_clang_tidy --match-partial-fixes %s android-cloexec-open %t #define O_RDWR 1 #define O_EXCL 2 diff --git a/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-socket.cpp b/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-socket.cpp index 25f332d313871..d4d58640f0eea 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-socket.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/android/cloexec-socket.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s android-cloexec-socket %t +// RUN: %check_clang_tidy --match-partial-fixes %s android-cloexec-socket %t #define SOCK_STREAM 1 #define SOCK_DGRAM 2 diff --git 
a/clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp index d9048ef359281..82b6ea84e6ff7 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/incorrect-enable-shared-from-this.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11-or-later %s bugprone-incorrect-enable-shared-from-this %t +// RUN: %check_clang_tidy --match-partial-fixes -std=c++11-or-later %s bugprone-incorrect-enable-shared-from-this %t // NOLINTBEGIN namespace std { diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/move-forwarding-reference.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/move-forwarding-reference.cpp index 68eeb126b5dfa..66cd6baa4382d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/move-forwarding-reference.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/move-forwarding-reference.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++14-or-later %s bugprone-move-forwarding-reference %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy --match-partial-fixes -std=c++14-or-later %s bugprone-move-forwarding-reference %t -- -- -fno-delayed-template-parsing namespace std { template struct remove_reference; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-in-initialization-strlen.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-in-initialization-strlen.c index 6e83804b45c60..a383958fbb906 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-in-initialization-strlen.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-in-initialization-strlen.c @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-not-null-terminated-result %t -- \ // RUN: -- -std=c11 -I %S/Inputs/not-null-terminated-result #include "not-null-terminated-result-c.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-memcpy-safe-cxx.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-memcpy-safe-cxx.cpp index 97a7f268d469b..8124b3bfa2268 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-memcpy-safe-cxx.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-memcpy-safe-cxx.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-not-null-terminated-result %t -- \ // RUN: -- -std=c++11 -I %S/Inputs/not-null-terminated-result #include "not-null-terminated-result-cxx.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c index 6a907689921d0..4970af83bf4b6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-strlen.c @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-not-null-terminated-result %t -- 
\ // RUN: -- -std=c11 -I %S/Inputs/not-null-terminated-result // FIXME: Something wrong with the APInt un/signed conversion on Windows: diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp index 7eb9330b36a26..06e2db9d6e0d6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/not-null-terminated-result-wcslen.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-not-null-terminated-result %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-not-null-terminated-result %t -- \ // RUN: -- -std=c++11 -I %S/Inputs/not-null-terminated-result // FIXME: Something wrong with the APInt un/signed conversion on Windows: diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp index 76d447a71d68b..8db05362069f7 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/posix-return.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-posix-return %t +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-posix-return %t #define NULL nullptr #define ZERO 0 diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/standalone-empty.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/standalone-empty.cpp index 53c651879f84b..bbb48d06ed924 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/standalone-empty.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/standalone-empty.cpp @@ -176,14 +176,14 @@ bool test_member_empty() { std::vector_with_clear v; v.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { std::vector_with_int_empty v; v.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { @@ -214,14 +214,14 @@ bool test_member_empty() { absl::string_with_clear s; s.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} s.clear();{{$}} + // CHECK-FIXES: s.clear(); } { absl::string_with_int_empty s; s.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} s.clear();{{$}} + // CHECK-FIXES: s.clear(); } { @@ -302,11 +302,11 @@ bool test_qualified_empty() { absl::string_with_clear v; std::empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'std::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); absl::empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'absl::empty'; did you mean 'clear()'? 
[bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); test::empty(v); // no-warning @@ -361,21 +361,21 @@ bool test_unqualified_empty() { std::vector_with_void_empty v; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'std::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { std::vector_with_clear v; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'std::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { std::vector_with_int_empty v; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'std::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { @@ -400,21 +400,21 @@ bool test_unqualified_empty() { absl::string_with_void_empty s; empty(s); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'absl::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} s.clear();{{$}} + // CHECK-FIXES: s.clear(); } { absl::string_with_clear s; empty(s); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'absl::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} s.clear();{{$}} + // CHECK-FIXES: s.clear(); } { absl::string_with_int_empty s; empty(s); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'absl::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} s.clear();{{$}} + // CHECK-FIXES: s.clear(); } { @@ -441,7 +441,7 @@ bool test_unqualified_empty() { using std::empty; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'std::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { @@ -456,7 +456,7 @@ bool test_unqualified_empty() { using absl::empty; empty(s); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'absl::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} s.clear();{{$}} + // CHECK-FIXES: s.clear(); } { @@ -637,14 +637,14 @@ bool test_clear_in_base_class() { base::vector v; v.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { base::vector_non_dependent v; v.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { @@ -663,14 +663,14 @@ bool test_clear_in_base_class() { base::vector v; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'base::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { base::vector_non_dependent v; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'base::empty'; did you mean 'clear()'? 
[bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { @@ -775,14 +775,14 @@ bool test_clear_with_qualifiers() { qualifiers::vector_with_volatile_clear v; v.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { volatile qualifiers::vector_with_volatile_clear v; v.empty(); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'empty()'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { @@ -795,14 +795,14 @@ bool test_clear_with_qualifiers() { qualifiers::vector_with_volatile_clear v; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'qualifiers::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { volatile qualifiers::vector_with_volatile_clear v; empty(v); // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: ignoring the result of 'qualifiers::empty'; did you mean 'clear()'? [bugprone-standalone-empty] - // CHECK-FIXES: {{^ }} v.clear();{{$}} + // CHECK-FIXES: v.clear(); } { diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp index ff5b256e71781..129a841b88eaa 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-stringview-nullptr -std=c++17 %t +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-stringview-nullptr -std=c++17 %t namespace std { diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp index c14b094f3fca3..d670fa9dec70f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-suspicious-string-compare %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-suspicious-string-compare %t -- \ // RUN: -config='{CheckOptions: \ // RUN: {bugprone-suspicious-string-compare.WarnOnImplicitComparison: true, \ // RUN: bugprone-suspicious-string-compare.WarnOnLogicalNotComparison: true}}' \ diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/swapped-arguments.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/swapped-arguments.cpp index b2ad08be907a7..3d21396bf04eb 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/swapped-arguments.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/swapped-arguments.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-swapped-arguments %t +// RUN: %check_clang_tidy --match-partial-fixes %s bugprone-swapped-arguments %t void F(int, double); diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp index 3432b1c84a9a5..7d6164946fc3d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp +++ 
b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-prefer-member-initializer %t -- -- -fcxx-exceptions +// RUN: %check_clang_tidy --match-partial-fixes %s cppcoreguidelines-prefer-member-initializer %t -- -- -fcxx-exceptions extern void __assert_fail (__const char *__assertion, __const char *__file, unsigned int __line, __const char *__function) diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-bounds-constant-array-index.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-bounds-constant-array-index.cpp index e8a4bc40a2e4e..6fd52276c2ff1 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-bounds-constant-array-index.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-bounds-constant-array-index.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-pro-bounds-constant-array-index %t +// RUN: %check_clang_tidy --match-partial-fixes %s cppcoreguidelines-pro-bounds-constant-array-index %t typedef __SIZE_TYPE__ size_t; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init-use-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init-use-assignment.cpp index f1f64543c112a..c15d444bd0a66 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init-use-assignment.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init-use-assignment.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-pro-type-member-init %t -- -config="{CheckOptions: {cppcoreguidelines-pro-type-member-init.UseAssignment: true}}" -- -fsigned-char +// RUN: %check_clang_tidy --match-partial-fixes %s cppcoreguidelines-pro-type-member-init %t -- -config="{CheckOptions: {cppcoreguidelines-pro-type-member-init.UseAssignment: true}}" -- -fsigned-char struct T { int i; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.cpp index d999b84cae03e..8896732110583 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-member-init.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11,c++14,c++17 %s cppcoreguidelines-pro-type-member-init %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy --match-partial-fixes -std=c++11,c++14,c++17 %s cppcoreguidelines-pro-type-member-init %t -- -- -fno-delayed-template-parsing // FIXME: Fix the checker to work in C++20 mode. 
struct PositiveFieldBeforeConstructor { diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/virtual-class-destructor.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/virtual-class-destructor.cpp index c599fc0d17c13..44d0251e354a9 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/virtual-class-destructor.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/virtual-class-destructor.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-virtual-class-destructor %t -- --fix-notes +// RUN: %check_clang_tidy --match-partial-fixes %s cppcoreguidelines-virtual-class-destructor %t -- --fix-notes // CHECK-MESSAGES: :[[@LINE+4]]:8: warning: destructor of 'PrivateVirtualBaseStruct' is private and prevents using the type [cppcoreguidelines-virtual-class-destructor] // CHECK-MESSAGES: :[[@LINE+3]]:8: note: make it public and virtual diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/build-explicit-make-pair.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/build-explicit-make-pair.cpp index 6dcd357f51131..94d546278eb93 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/build-explicit-make-pair.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/build-explicit-make-pair.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s google-build-explicit-make-pair %t +// RUN: %check_clang_tidy --match-partial-fixes %s google-build-explicit-make-pair %t namespace std { template diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/objc-avoid-nsobject-new.m b/clang-tools-extra/test/clang-tidy/checkers/google/objc-avoid-nsobject-new.m index f8b1d20f4f6da..f62af8f4c28fe 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/objc-avoid-nsobject-new.m +++ b/clang-tools-extra/test/clang-tidy/checkers/google/objc-avoid-nsobject-new.m @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s google-objc-avoid-nsobject-new %t +// RUN: %check_clang_tidy --match-partial-fixes %s google-objc-avoid-nsobject-new %t @interface NSObject + (instancetype)new; diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp index 39ff9b7f39634..cf24d2d533240 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s google-upgrade-googletest-case %t -- -- -I%S/Inputs +// RUN: %check_clang_tidy --match-partial-fixes %s google-upgrade-googletest-case %t -- -- -I%S/Inputs // RUN: %check_clang_tidy -check-suffix=NOSUITE %s google-upgrade-googletest-case %t -- -- -DNOSUITE -I%S/Inputs/gtest/nosuite #include "gtest/gtest.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-isa-or-dyn-cast-in-conditionals.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-isa-or-dyn-cast-in-conditionals.cpp index 48652d521baa7..88e4b643004fc 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-isa-or-dyn-cast-in-conditionals.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-isa-or-dyn-cast-in-conditionals.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s llvm-prefer-isa-or-dyn-cast-in-conditionals %t +// RUN: %check_clang_tidy --match-partial-fixes %s llvm-prefer-isa-or-dyn-cast-in-conditionals %t struct X; struct Y; diff --git 
a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned.cpp index 22af2ac6ae998..5dd3a9de7c910 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s llvm-prefer-register-over-unsigned %t +// RUN: %check_clang_tidy --match-partial-fixes %s llvm-prefer-register-over-unsigned %t namespace llvm { class Register { diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned2.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned2.cpp index 3b796f2e4754a..0bf44c8afc704 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned2.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned2.cpp @@ -14,12 +14,12 @@ using namespace llvm; void apply_1() { unsigned Reg = getReg(); // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'Reg' declared as 'unsigned int'; use 'Register' instead [llvm-prefer-register-over-unsigned] - // CHECK-FIXES: apply_1() + // CHECK-FIXES: void apply_1() { // CHECK-FIXES-NEXT: Register Reg = getReg(); } void done_1() { llvm::Register Reg = getReg(); - // CHECK-FIXES: done_1() + // CHECK-FIXES: void done_1() { // CHECK-FIXES-NEXT: llvm::Register Reg = getReg(); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned3.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned3.cpp index 7d61f6c78a21c..6feb326998ca8 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned3.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/prefer-register-over-unsigned3.cpp @@ -12,14 +12,14 @@ Register getReg(); void do_nothing_1() { unsigned Reg1 = getReg(); - // CHECK-FIXES: do_nothing_1() + // CHECK-FIXES: void do_nothing_1() { // CHECK-FIXES-NEXT: unsigned Reg1 = getReg(); } void do_nothing_2() { using namespace llvm; unsigned Reg2 = getReg(); - // CHECK-FIXES: do_nothing_2() + // CHECK-FIXES: void do_nothing_2() { // CHECK-FIXES-NEXT: using namespace llvm; // CHECK-FIXES-NEXT: unsigned Reg2 = getReg(); } @@ -27,7 +27,7 @@ void do_nothing_2() { namespace llvm { void do_nothing_3() { unsigned Reg3 = getReg(); - // CHECK-FIXES: do_nothing_3() + // CHECK-FIXES: void do_nothing_3() { // CHECK-FIXES-NEXT: unsigned Reg3 = getReg(); } } // end namespace llvm diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm/twine-local.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm/twine-local.cpp index 3dcf6abe0c22e..05c9971252e71 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm/twine-local.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvm/twine-local.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s llvm-twine-local %t +// RUN: %check_clang_tidy --match-partial-fixes %s llvm-twine-local %t namespace llvm { class Twine { diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp index 796dc3c579b4f..2ef47266b02b0 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-pointers.cpp @@ -1,4 +1,4 @@ -// RUN: 
%check_clang_tidy %s misc-const-correctness %t \ +// RUN: %check_clang_tidy --match-partial-fixes %s misc-const-correctness %t \ // RUN: -config='{CheckOptions: {\ // RUN: misc-const-correctness.AnalyzeValues: false,\ // RUN: misc-const-correctness.AnalyzeReferences: false,\ diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp index 8cbdffaa801a9..74be3dccc9daa 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-pointer-as-values.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s misc-const-correctness %t \ +// RUN: %check_clang_tidy --match-partial-fixes %s misc-const-correctness %t \ // RUN: -config='{CheckOptions: \ // RUN: {misc-const-correctness.AnalyzeValues: true,\ // RUN: misc-const-correctness.WarnPointersAsValues: true,\ diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp index 248374a71dd40..5a890f212a603 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp @@ -14,7 +14,7 @@ void type_dependent_variables() { int value_int = 42; // CHECK-MESSAGES:[[@LINE-1]]:3: warning: variable 'value_int' of type 'int' can be declared 'const' - // CHECK-FIXES: int const value_int + // CHECK-FIXES: int const value_int = 42; } void instantiate_template_cases() { type_dependent_variables(); diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values-before-cxx23.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values-before-cxx23.cpp index af626255d9455..89856974eafc4 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values-before-cxx23.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values-before-cxx23.cpp @@ -10,7 +10,7 @@ double &non_const_ref_return() { double p_local0 = 0.0; // CHECK-MESSAGES: [[@LINE-1]]:3: warning: variable 'p_local0' of type 'double' can be declared 'const' - // CHECK-FIXES: double const p_local0 + // CHECK-FIXES: double const p_local0 = 0.0; double np_local0 = 42.42; return np_local0; } diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp index a80e1e1af1870..17dcf12e2536c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s misc-const-correctness %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s misc-const-correctness %t -- \ // RUN: -config="{CheckOptions: {\ // RUN: misc-const-correctness.TransformValues: true, \ // RUN: misc-const-correctness.WarnPointersAsValues: false, \ diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/definitions-in-headers.hpp b/clang-tools-extra/test/clang-tidy/checkers/misc/definitions-in-headers.hpp index 9c91cb7033087..eabcd17817020 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/definitions-in-headers.hpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/definitions-in-headers.hpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s 
misc-definitions-in-headers %t -- --fix-notes +// RUN: %check_clang_tidy --match-partial-fixes %s misc-definitions-in-headers %t -- --fix-notes int f() { // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'f' defined in a header file; function definitions in header files can lead to ODR violations [misc-definitions-in-headers] diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp index 9b3dd070405b5..f1918e915dc1d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp @@ -1,6 +1,6 @@ // RUN: echo "static void staticFunctionHeader(int i) {;}" > %T/header.h // RUN: echo "static void staticFunctionHeader(int /*i*/) {;}" > %T/header-fixed.h -// RUN: %check_clang_tidy -std=c++11 %s misc-unused-parameters %t -- -header-filter='.*' -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy --match-partial-fixes -std=c++11 %s misc-unused-parameters %t -- -header-filter='.*' -- -fno-delayed-template-parsing // RUN: diff %T/header.h %T/header-fixed.h // FIXME: Make the test work in all language modes. diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp index 0d100ffa38b27..342c96a6b947f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++14-or-later %s modernize-avoid-bind %t +// RUN: %check_clang_tidy --match-partial-fixes -std=c++14-or-later %s modernize-avoid-bind %t namespace std { inline namespace impl { diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/concat-nested-namespaces.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/concat-nested-namespaces.cpp index 9d8f199f64087..a4f50dd0af6f2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/concat-nested-namespaces.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/concat-nested-namespaces.cpp @@ -1,9 +1,9 @@ // RUN: cp %S/Inputs/concat-nested-namespaces/modernize-concat-nested-namespaces.h %T/modernize-concat-nested-namespaces.h -// RUN: %check_clang_tidy -std=c++17 -check-suffix=NORMAL %s modernize-concat-nested-namespaces %t -- -header-filter=".*" -- -I %T +// RUN: %check_clang_tidy --match-partial-fixes -std=c++17 -check-suffix=NORMAL %s modernize-concat-nested-namespaces %t -- -header-filter=".*" -- -I %T // RUN: FileCheck -input-file=%T/modernize-concat-nested-namespaces.h %S/Inputs/concat-nested-namespaces/modernize-concat-nested-namespaces.h -check-prefix=CHECK-FIXES // Restore header file and re-run with c++20: // RUN: cp %S/Inputs/concat-nested-namespaces/modernize-concat-nested-namespaces.h %T/modernize-concat-nested-namespaces.h -// RUN: %check_clang_tidy -std=c++20 -check-suffixes=NORMAL,CPP20 %s modernize-concat-nested-namespaces %t -- -header-filter=".*" -- -I %T +// RUN: %check_clang_tidy --match-partial-fixes -std=c++20 -check-suffixes=NORMAL,CPP20 %s modernize-concat-nested-namespaces %t -- -header-filter=".*" -- -I %T // RUN: FileCheck -input-file=%T/modernize-concat-nested-namespaces.h %S/Inputs/concat-nested-namespaces/modernize-concat-nested-namespaces.h -check-prefix=CHECK-FIXES #include "modernize-concat-nested-namespaces.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp 
b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp index 8d1d7378e5cff..419e7f899066d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-loop-convert %t -- -- -I %S/Inputs/loop-convert +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-loop-convert %t -- -- -I %S/Inputs/loop-convert #include "structures.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-camelback.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-camelback.cpp index 71ea97e21f950..919f9a65a212d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-camelback.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-camelback.cpp @@ -13,14 +13,14 @@ void naming() { printf("%d\n", arr[i]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead [modernize-loop-convert] - // CHECK-FIXES: for (int i : arr) + // CHECK-FIXES: for (int i : arr) { // CHECK-FIXES-NEXT: printf("%d\n", i); for (int i = 0; i < n; ++i) { printf("%d\n", nums[i]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int num : nums) + // CHECK-FIXES: for (int num : nums) { // CHECK-FIXES-NEXT: printf("%d\n", num); int num = 0; @@ -28,6 +28,6 @@ void naming() { printf("%d\n", nums[i] + num); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int i : nums) + // CHECK-FIXES: for (int i : nums) { // CHECK-FIXES-NEXT: printf("%d\n", i + num); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-const.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-const.cpp index beec7fae961a9..6091f0a6552e3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-const.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-const.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-loop-convert %t +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-loop-convert %t struct Str { Str() = default; diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-extra.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-extra.cpp index d52d0492034df..1ac555f1da0e6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-extra.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-extra.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-loop-convert %t -- -- -I %S/Inputs/loop-convert +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-loop-convert %t -- -- -I %S/Inputs/loop-convert #include "structures.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-lowercase.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-lowercase.cpp index a8d6559368bae..32d0d3c2544a4 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-lowercase.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-lowercase.cpp @@ -14,21 +14,21 @@ void naming() { printf("%d\n", arr[i]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead [modernize-loop-convert] - // CHECK-FIXES: for (int i : arr) + // CHECK-FIXES: for (int i : arr) { // CHECK-FIXES-NEXT: printf("%d\n", 
i); for (int i = 0; i < n; ++i) { printf("%d\n", nums[i]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int num : nums) + // CHECK-FIXES: for (int num : nums) { // CHECK-FIXES-NEXT: printf("%d\n", num); for (int i = 0; i < n; ++i) { printf("%d\n", nums_[i]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int num : nums_) + // CHECK-FIXES: for (int num : nums_) { // CHECK-FIXES-NEXT: printf("%d\n", num); int num = 0; @@ -36,6 +36,6 @@ void naming() { printf("%d\n", nums[i] + num); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int i : nums) + // CHECK-FIXES: for (int i : nums) { // CHECK-FIXES-NEXT: printf("%d\n", i + num); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-rewritten-binop.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-rewritten-binop.cpp index 485fa661b525c..310e57e2652fe 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-rewritten-binop.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-rewritten-binop.cpp @@ -47,7 +47,7 @@ void rewritten() { (void)*It; } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int & It : Oeo) + // CHECK-FIXES: for (int & It : Oeo) { // CHECK-FIXES-NEXT: (void)It; HasSpaceshipMem Hsm; @@ -55,6 +55,6 @@ void rewritten() { (void)*It; } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int & It : Hsm) + // CHECK-FIXES: for (int & It : Hsm) { // CHECK-FIXES-NEXT: (void)It; } diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-uppercase.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-uppercase.cpp index cf5b25afb6d3f..cfe11f13d0076 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-uppercase.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-uppercase.cpp @@ -14,21 +14,21 @@ void naming() { printf("%d\n", ARR[I]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead [modernize-loop-convert] - // CHECK-FIXES: for (int I : ARR) + // CHECK-FIXES: for (int I : ARR) { // CHECK-FIXES-NEXT: printf("%d\n", I); for (int I = 0; I < N; ++I) { printf("%d\n", NUMS[I]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int NUM : NUMS) + // CHECK-FIXES: for (int NUM : NUMS) { // CHECK-FIXES-NEXT: printf("%d\n", NUM); for (int I = 0; I < N; ++I) { printf("%d\n", NUMS_[I]); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int NUM : NUMS_) + // CHECK-FIXES: for (int NUM : NUMS_) { // CHECK-FIXES-NEXT: printf("%d\n", NUM); int NUM = 0; @@ -36,6 +36,6 @@ void naming() { printf("%d\n", NUMS[I] + NUM); } // CHECK-MESSAGES: :[[@LINE-3]]:3: warning: use range-based for loop instead - // CHECK-FIXES: for (int I : NUMS) + // CHECK-FIXES: for (int I : NUMS) { // CHECK-FIXES-NEXT: printf("%d\n", I + NUM); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared.cpp index 2698036e4bd2e..e57f45c4127f9 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared.cpp @@ -1,4 +1,4 @@ -// RUN: 
%check_clang_tidy %s modernize-make-shared %t -- -- -I %S/Inputs/smart-ptr +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-make-shared %t -- -- -I %S/Inputs/smart-ptr #include "shared_ptr.h" // CHECK-FIXES: #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique.cpp index fe512a8f3bf32..e665ca0a15a68 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++14-or-later %s modernize-make-unique %t -- -- -I %S/Inputs/smart-ptr +// RUN: %check_clang_tidy --match-partial-fixes -std=c++14-or-later %s modernize-make-unique %t -- -- -I %S/Inputs/smart-ptr #include "unique_ptr.h" #include "initializer_list.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp index f4e2131671804..ae270dcccd76d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/min-max-use-initializer-list.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-min-max-use-initializer-list %t +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-min-max-use-initializer-list %t // CHECK-FIXES: #include namespace utils { diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/pass-by-value.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/pass-by-value.cpp index 2aacbdd1c7a6a..c0ebaebe4ccf6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/pass-by-value.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/pass-by-value.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-pass-by-value %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-pass-by-value %t -- -- -fno-delayed-template-parsing namespace { // POD types are trivially move constructible. 
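The check_clang_tidy.py hunk at the top of this patch is the change the rest of the series reacts to: unless a test opts in with the new --match-partial-fixes flag, CHECK-FIXES patterns are now matched against whole lines (FileCheck's --match-full-lines) instead of the previous substring match under --strict-whitespace. That is why every RUN line touched here either adds the flag or rewrites its CHECK-FIXES lines into complete statements (e.g. the standalone-empty, loop-convert, and prefer-register-over-unsigned tests), and in a few places it corrects expectations that partial matching had let slide (e.g. Str.find -> WStr.find in faster-string-find). A minimal sketch of the resulting FileCheck dispatch follows; it is an illustration only, not part of the patch, and the run_file_check helper and its example arguments are hypothetical:

# Sketch of the new check_fixes dispatch, assuming FileCheck is on PATH.
import subprocess
from typing import List

def run_file_check(temp_file: str, test_file: str,
                   prefixes: List[str], match_partial_fixes: bool) -> None:
    # temp_file holds the source after clang-tidy applied its fixes;
    # test_file holds the CHECK-FIXES expectations.
    args = [
        "FileCheck",
        "--input-file=" + temp_file,
        test_file,
        "--check-prefixes=" + ",".join(prefixes),
        # New default: whole-line matching. --match-partial-fixes keeps the
        # historical substring matching with strict whitespace.
        "--strict-whitespace" if match_partial_fixes else "--match-full-lines",
    ]
    subprocess.run(args, check=True)

# Example (hypothetical paths):
# run_file_check("out.cpp.tmp", "test.cpp", ["CHECK-FIXES"], False)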
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/redundant-void-arg.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/redundant-void-arg.cpp index 89bf7f04f5576..f43a910ba022c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/redundant-void-arg.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/redundant-void-arg.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-redundant-void-arg %t +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-redundant-void-arg %t #define NULL 0 diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp index 6b8debd0d0e82..dea0857405e9d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -I %S/Inputs/replace-auto-ptr +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-replace-auto-ptr %t -- -- -I %S/Inputs/replace-auto-ptr // CHECK-FIXES: #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp index 72241846384bc..eaec70814d4f1 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp @@ -37,7 +37,7 @@ namespace ext { bool NoTemplate = std::is_const::value; // CHECK-MESSAGES-CXX17: :[[@LINE-1]]:19: warning: use c++17 style variable templates -// CHECK-FIXES-CXX17: bool NoTemplate = std::is_const_v +// CHECK-FIXES-CXX17: bool NoTemplate = std::is_const_v; template constexpr bool InTemplate = std::is_const::value; diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast-remove-stars.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast-remove-stars.cpp index b22df9408fc84..5b620adb36ab2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast-remove-stars.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast-remove-stars.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-auto %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-auto %t -- \ // RUN: -config="{CheckOptions: {modernize-use-auto.RemoveStars: 'true' , modernize-use-auto.MinTypeNameLength: '0'}}" \ // RUN: -- -frtti diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast.cpp index 94359006ac2ff..3946b97abb256 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-cast.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-auto %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-auto %t -- \ // RUN: -config="{CheckOptions: {modernize-use-auto.MinTypeNameLength: '0'}}" \ // RUN: -- -I %S/Inputs/use-auto -frtti diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp index 8a3e0bab26c12..1fd4189fb327e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp @@ 
-1,6 +1,6 @@ -// RUN: %check_clang_tidy -check-suffix=REMOVE %s modernize-use-auto %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes -check-suffix=REMOVE %s modernize-use-auto %t -- \ // RUN: -config="{CheckOptions: {modernize-use-auto.RemoveStars: 'true', modernize-use-auto.MinTypeNameLength: '0'}}" -// RUN: %check_clang_tidy %s modernize-use-auto %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-auto %t -- \ // RUN: -config="{CheckOptions: {modernize-use-auto.RemoveStars: 'false', modernize-use-auto.MinTypeNameLength: '0'}}" void pointerToFunction() { diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-iterator.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-iterator.cpp index 2663495ccfecc..02fb64676c52d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-iterator.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-iterator.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11,c++14 %s modernize-use-auto %t -- -- -I %S/Inputs/use-auto +// RUN: %check_clang_tidy --match-partial-fixes -std=c++11,c++14 %s modernize-use-auto %t -- -- -I %S/Inputs/use-auto // FIXME: Fix the checker to work in C++17 mode. #include "containers.h" diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-min-type-name-length.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-min-type-name-length.cpp index d68810620674e..6cea26ee1a31c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-min-type-name-length.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-min-type-name-length.cpp @@ -1,7 +1,7 @@ -// RUN: %check_clang_tidy -check-suffix=0-0 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: false, modernize-use-auto.MinTypeNameLength: 0}}" -- -frtti -// RUN: %check_clang_tidy -check-suffix=0-8 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: false, modernize-use-auto.MinTypeNameLength: 8}}" -- -frtti -// RUN: %check_clang_tidy -check-suffix=1-0 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: true, modernize-use-auto.MinTypeNameLength: 0}}" -- -frtti -// RUN: %check_clang_tidy -check-suffix=1-8 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: true, modernize-use-auto.MinTypeNameLength: 8}}" -- -frtti +// RUN: %check_clang_tidy --match-partial-fixes -check-suffix=0-0 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: false, modernize-use-auto.MinTypeNameLength: 0}}" -- -frtti +// RUN: %check_clang_tidy --match-partial-fixes -check-suffix=0-8 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: false, modernize-use-auto.MinTypeNameLength: 8}}" -- -frtti +// RUN: %check_clang_tidy --match-partial-fixes -check-suffix=1-0 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: true, modernize-use-auto.MinTypeNameLength: 0}}" -- -frtti +// RUN: %check_clang_tidy --match-partial-fixes -check-suffix=1-8 %s modernize-use-auto %t -- -config="{CheckOptions: {modernize-use-auto.RemoveStars: true, modernize-use-auto.MinTypeNameLength: 8}}" -- -frtti template extern T foo(); template struct P { explicit P(T t) : t_(t) {} T t_;}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp 
b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp index 7f737148a7cd1..6520620486942 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-equals-default %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-equals-default %t -- \ // RUN: -config="{CheckOptions: {modernize-use-equals-default.IgnoreMacros: false}}" \ // RUN: -- -fno-delayed-template-parsing -fexceptions -Wno-error=return-type diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default.cpp index a1d6c25e6364a..209ca7d43664e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-equals-default %t -- -- -fno-delayed-template-parsing -fexceptions +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-equals-default %t -- -- -fno-delayed-template-parsing -fexceptions // Out of line definition. class OL { diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp index 5a53c55f7f12f..135ee274433a2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison-qt.cpp @@ -1,7 +1,8 @@ -// CHECK-FIXES: #include -// RUN: %check_clang_tidy -std=c++17 %s modernize-use-integer-sign-comparison %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes -std=c++17 %s modernize-use-integer-sign-comparison %t -- \ // RUN: -config="{CheckOptions: {modernize-use-integer-sign-comparison.EnableQtSupport: true}}" +// CHECK-FIXES: #include + // The code that triggers the check #define MAX_MACRO(a, b) (a < b) ? b : a diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp index 99f00444c2d3f..e0a84ef5aed26 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-integer-sign-comparison.cpp @@ -1,5 +1,6 @@ +// RUN: %check_clang_tidy --match-partial-fixes -std=c++20 %s modernize-use-integer-sign-comparison %t + // CHECK-FIXES: #include -// RUN: %check_clang_tidy -std=c++20 %s modernize-use-integer-sign-comparison %t // The code that triggers the check #define MAX_MACRO(a, b) (a < b) ? 
b : a diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp index 2c36349da896c..fe9d6de5f8f0f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-nullptr.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-nullptr %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-nullptr %t -- \ // RUN: -config="{CheckOptions: {modernize-use-nullptr.NullMacros: 'MY_NULL,NULL'}}" #define NULL 0 diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp index bad8b7a8d7f08..c5745e39a324d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s modernize-use-override,cppcoreguidelines-explicit-virtual-functions %t -- -- -fexceptions +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-override,cppcoreguidelines-explicit-virtual-functions %t -- -- -fexceptions #define ABSTRACT = 0 diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp index cd8463401e45a..88d04837c923e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-starts-ends-with.cpp @@ -61,7 +61,7 @@ void test(std::string s, std::string_view sv, sub_string ss, sub_sub_string sss, if (s.find("....") == 0) { /* do something */ } // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with - // CHECK-FIXES: if (s.starts_with("....")) + // CHECK-FIXES: if (s.starts_with("....")) { /* do something */ } 0 != s.find("a"); // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with @@ -85,7 +85,7 @@ void test(std::string s, std::string_view sv, sub_string ss, sub_sub_string sss, if (s.rfind("....", 0) == 0) { /* do something */ } // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with - // CHECK-FIXES: if (s.starts_with("....")) + // CHECK-FIXES: if (s.starts_with("....")) { /* do something */ } 0 != s.rfind("a", 0); // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with @@ -94,17 +94,17 @@ void test(std::string s, std::string_view sv, sub_string ss, sub_sub_string sss, #define FIND find s.FIND("a") == 0; // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with - // CHECK-FIXES: s.starts_with("a") + // CHECK-FIXES: s.starts_with("a"); #define PREFIX "a" s.find(PREFIX) == 0; // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with - // CHECK-FIXES: s.starts_with(PREFIX) + // CHECK-FIXES: s.starts_with(PREFIX); #define ZERO 0 s.find("a") == ZERO; // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with - // CHECK-FIXES: s.starts_with("a") + // CHECK-FIXES: s.starts_with("a"); sv.find("a") == 0; // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with @@ -120,7 +120,7 @@ void test(std::string s, std::string_view sv, sub_string ss, sub_sub_string sss, sss.find("a") == 0; // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with - // CHECK-FIXES: ss.starts_with("a"); + // CHECK-FIXES: sss.starts_with("a"); sl.find("a") == 0; // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: 
warning: use starts_with @@ -172,7 +172,7 @@ void test(std::string s, std::string_view sv, sub_string ss, sub_sub_string sss, 0 != s.compare(0, sv.length(), sv); // CHECK-MESSAGES: :[[@LINE-1]]:{{[0-9]+}}: warning: use starts_with - // CHECK-FIXES: s.starts_with(sv); + // CHECK-FIXES: !s.starts_with(sv); #define LENGTH(x) (x).length() s.compare(0, LENGTH(s), s) == 0; diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp index 71c8af190467c..0df44a9765d78 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format-fmt.cpp @@ -20,5 +20,5 @@ std::basic_string sprintf(const S& fmt, const T&... args); std::string fmt_sprintf_simple() { return fmt::sprintf("Hello %s %d", "world", 42); // CHECK-MESSAGES: [[@LINE-1]]:10: warning: use 'fmt::format' instead of 'sprintf' [modernize-use-std-format] - // CHECK-FIXES: fmt::format("Hello {} {}", "world", 42); + // CHECK-FIXES: return fmt::format("Hello {} {}", "world", 42); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp index 2af2e8949a814..1a241e3712210 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp @@ -1,10 +1,10 @@ -// RUN: %check_clang_tidy \ +// RUN: %check_clang_tidy --match-partial-fixes \ // RUN: -std=c++20 %s modernize-use-std-format %t -- \ // RUN: -config="{CheckOptions: {modernize-use-std-format.StrictMode: true}}" \ // RUN: -- -isystem %clang_tidy_headers \ // RUN: -DPRI_CMDLINE_MACRO="\"s\"" \ // RUN: -D__PRI_CMDLINE_MACRO="\"s\"" -// RUN: %check_clang_tidy \ +// RUN: %check_clang_tidy --match-partial-fixes \ // RUN: -std=c++20 %s modernize-use-std-format %t -- \ // RUN: -config="{CheckOptions: {modernize-use-std-format.StrictMode: false}}" \ // RUN: -- -isystem %clang_tidy_headers \ diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp index 5da995d9d6e83..9bf60e765312b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-print.cpp @@ -1,10 +1,10 @@ -// RUN: %check_clang_tidy -check-suffixes=,STRICT \ +// RUN: %check_clang_tidy --match-partial-fixes -check-suffixes=,STRICT \ // RUN: -std=c++23 %s modernize-use-std-print %t -- \ // RUN: -config="{CheckOptions: {modernize-use-std-print.StrictMode: true}}" \ // RUN: -- -isystem %clang_tidy_headers -fexceptions \ // RUN: -DPRI_CMDLINE_MACRO="\"s\"" \ // RUN: -D__PRI_CMDLINE_MACRO="\"s\"" -// RUN: %check_clang_tidy -check-suffixes=,NOTSTRICT \ +// RUN: %check_clang_tidy --match-partial-fixes -check-suffixes=,NOTSTRICT \ // RUN: -std=c++23 %s modernize-use-std-print %t -- \ // RUN: -config="{CheckOptions: {modernize-use-std-print.StrictMode: false}}" \ // RUN: -- -isystem %clang_tidy_headers -fexceptions \ diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp index 214a66f3dcc88..2dcac05f0b46c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp @@ -1,4 +1,4 @@ -// RUN: 
%check_clang_tidy %s modernize-use-using %t -- -- -fno-delayed-template-parsing -I %S/Inputs/use-using/ +// RUN: %check_clang_tidy --match-partial-fixes %s modernize-use-using %t -- -- -fno-delayed-template-parsing -I %S/Inputs/use-using/ typedef int Type; // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/faster-string-find.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/faster-string-find.cpp index b50d175cff3b7..e52441035e15f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/faster-string-find.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/faster-string-find.cpp @@ -92,11 +92,11 @@ void StringFind() { std::wstring WStr; WStr.find(L"n"); // CHECK-MESSAGES: [[@LINE-1]]:13: warning: 'find' called with a string literal - // CHECK-FIXES: Str.find(L'n'); + // CHECK-FIXES: WStr.find(L'n'); // Even with unicode that fits in one wide char. WStr.find(L"\x3A9"); // CHECK-MESSAGES: [[@LINE-1]]:13: warning: 'find' called with a string literal - // CHECK-FIXES: Str.find(L'\x3A9'); + // CHECK-FIXES: WStr.find(L'\x3A9'); // std::string_view and std::wstring_view should work. std::string_view StrView; diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp index f9d06898ca03d..00e135bd2c920 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/for-range-copy.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s performance-for-range-copy %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy --match-partial-fixes %s performance-for-range-copy %t -- -- -fno-delayed-template-parsing namespace std { diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/noexcept-move-constructor-fix.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/noexcept-move-constructor-fix.cpp index c826abfafbaf2..53d66ec134da3 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/noexcept-move-constructor-fix.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/noexcept-move-constructor-fix.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s performance-noexcept-move-constructor %t -- -- -fexceptions +// RUN: %check_clang_tidy --match-partial-fixes %s performance-noexcept-move-constructor %t -- -- -fexceptions struct C_1 { ~C_1() {} diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp index b5325776f54c6..c0f1fb9c0f6d2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-copy-initialization.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++17-or-later %s performance-unnecessary-copy-initialization %t +// RUN: %check_clang_tidy --match-partial-fixes -std=c++17-or-later %s performance-unnecessary-copy-initialization %t template struct Iterator { diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp index 6a87282489613..151b1cecf0f63 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-delayed.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s performance-unnecessary-value-param %t -- -- -fdelayed-template-parsing +// RUN: %check_clang_tidy --match-partial-fixes %s performance-unnecessary-value-param %t -- -- -fdelayed-template-parsing struct ExpensiveToCopyType { const ExpensiveToCopyType & constReference() const { diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp index 60ba7d01420b8..88c491ea1eabc 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s performance-unnecessary-value-param %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy --match-partial-fixes %s performance-unnecessary-value-param %t -- -- -fno-delayed-template-parsing // CHECK-FIXES: #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/braces-around-statements.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/braces-around-statements.cpp index ff8488d2c6de3..b74c4cbf55f19 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/braces-around-statements.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/braces-around-statements.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-braces-around-statements %t +// RUN: %check_clang_tidy --match-partial-fixes %s readability-braces-around-statements %t void do_something(const char *) {} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp index d913ab4dee9ba..43a7ddbb6552f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++14-or-later %s readability-const-return-type %t -- -- -Wno-error=return-type +// RUN: %check_clang_tidy --match-partial-fixes -std=c++14-or-later %s readability-const-return-type %t -- -- -Wno-error=return-type // p# = positive test // n# = negative test diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp index 46755270b48ea..2fd0b2224cb1c 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++14-or-later %s readability-container-size-empty %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes -std=c++14-or-later %s readability-container-size-empty %t -- \ // RUN: -config="{CheckOptions: {readability-container-size-empty.ExcludedComparisonTypes: '::std::array;::IgnoredDummyType'}}" \ // RUN: -- -fno-delayed-template-parsing -isystem %clang_tidy_headers #include diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c index 0b231d10adf8f..11ff7dd816a44 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.c @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-implicit-bool-conversion %t -- -- -std=c23 +// RUN: %check_clang_tidy --match-partial-fixes %s readability-implicit-bool-conversion %t -- -- -std=c23 // RUN: %check_clang_tidy -check-suffix=UPPER-CASE %s readability-implicit-bool-conversion %t -- \ // RUN: -config='{CheckOptions: { \ // RUN: readability-implicit-bool-conversion.UseUpperCaseLiteralSuffix: true \ diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp index 75f666e3e07e5..f3e8bf044b31b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-implicit-bool-conversion %t +// RUN: %check_clang_tidy --match-partial-fixes %s readability-implicit-bool-conversion %t // RUN: %check_clang_tidy -check-suffix=UPPER-CASE %s readability-implicit-bool-conversion %t -- \ // RUN: -config='{CheckOptions: { \ // RUN: readability-implicit-bool-conversion.UseUpperCaseLiteralSuffix: true \ diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp index 4face0bb3fe68..80d2bc304bb5b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/math-missing-parentheses.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-math-missing-parentheses %t +// RUN: %check_clang_tidy --match-partial-fixes %s readability-math-missing-parentheses %t #define MACRO_AND & #define MACRO_ADD + diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-members.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-members.cpp index e72091121e895..7f71b7ba6b9b2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-members.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr-members.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-simplify-boolean-expr %t +// RUN: %check_clang_tidy --match-partial-fixes %s readability-simplify-boolean-expr %t class A { public: diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp index bad1055a01904..ee5ff7b865d95 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/simplify-boolean-expr.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-simplify-boolean-expr %t +// RUN: %check_clang_tidy --match-partial-fixes %s readability-simplify-boolean-expr %t bool a1 = false; diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp index 58e88381e21ea..3215075ebcf01 100644 --- 
a/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uppercase-literal-suffix-integer-custom-list.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-uppercase-literal-suffix %t -- -config="{CheckOptions: {readability-uppercase-literal-suffix.NewSuffixes: 'L;uL'}}" -- -I %clang_tidy_headers +// RUN: %check_clang_tidy --match-partial-fixes %s readability-uppercase-literal-suffix %t -- -config="{CheckOptions: {readability-uppercase-literal-suffix.NewSuffixes: 'L;uL'}}" -- -I %clang_tidy_headers // RUN: grep -Ev "// *[A-Z-]+:" %s > %t.cpp // RUN: clang-tidy %t.cpp -checks='-*,readability-uppercase-literal-suffix' -fix -config="{CheckOptions: {readability-uppercase-literal-suffix.NewSuffixes: 'L;uL'}}" -- -I %clang_tidy_headers // RUN: clang-tidy %t.cpp -checks='-*,readability-uppercase-literal-suffix' -warnings-as-errors='-*,readability-uppercase-literal-suffix' -config="{CheckOptions: {readability-uppercase-literal-suffix.NewSuffixes: 'L;uL'}}" -- -I %clang_tidy_headers diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp index b9577d000ac0b..d40bb869817d5 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-conflicted-fixes-of-alias-checkers.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-pro-type-member-init,hicpp-member-init,modernize-use-emplace,hicpp-use-emplace %t -- \ +// RUN: %check_clang_tidy --match-partial-fixes %s cppcoreguidelines-pro-type-member-init,hicpp-member-init,modernize-use-emplace,hicpp-use-emplace %t -- \ //// RUN: -config='{CheckOptions: { \ //// RUN: cppcoreguidelines-pro-type-member-init.UseAssignment: true, \ //// RUN: }}' diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp index ff216298cfd60..94cfa1a70a481 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-pro-type-member-init,hicpp-member-init,modernize-use-emplace,hicpp-use-emplace %t +// RUN: %check_clang_tidy --match-partial-fixes %s cppcoreguidelines-pro-type-member-init,hicpp-member-init,modernize-use-emplace,hicpp-use-emplace %t namespace std { From d1a05721172272f7aab685b56d99e86814a15bff Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Mon, 7 Apr 2025 15:42:22 -0500 Subject: [PATCH 0898/1029] [AMDGPU] Add buffer.fat.ptr.load.lds intrinsic wrapping raw rsrc version (#133015) Add a buffer_fat_ptr_load_lds intrinsic, by analogy with global_load_lds, which enables using `ptr addrspace(7)` to set the rsrc and offset arguments to raw_ptr_buffer_load_lds. 
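For illustration, a minimal sketch (not part of the patch) of emitting the new
intrinsic from C++; the insertion point `IP` and the `FatPtr` (ptr
addrspace(7)) and `LDSPtr` (ptr addrspace(3)) values are assumed to exist in
the surrounding code. The lowering added below then splits the fat pointer
into its rsrc and offset halves and forwards them to raw_ptr_buffer_load_lds:

```cpp
// Sketch only: copy 4 bytes from a buffer fat pointer into LDS.
llvm::IRBuilder<> IRB(IP);
IRB.CreateIntrinsic(llvm::Intrinsic::amdgcn_buffer_fat_ptr_load_lds,
                    /*Types=*/{},
                    {FatPtr, LDSPtr,
                     /*Data byte size=*/IRB.getInt32(4),
                     /*imm offset=*/IRB.getInt32(0),
                     /*aux/cachepolicy=*/IRB.getInt32(0)});
```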
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 21 +++++++++++++++++++ .../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 21 +++++++++++++++++++ .../lower-buffer-fat-pointers-mem-transfer.ll | 18 ++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 217e43fcce4fd..ae2f6e62c0272 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1941,6 +1941,27 @@ def int_amdgcn_s_buffer_prefetch_data : DefaultAttrsIntrinsic < } // defset AMDGPUBufferIntrinsics +// A wrapper around raw_ptr_buffer_load_lds that takes the global offset +// from the addrspace(7) pointer argument. +def int_amdgcn_buffer_fat_ptr_load_lds : Intrinsic < + [], + [LLVMQualPointerType<7>, // buffer fat pointer (SGPRx4 + VGPR) + LLVMQualPointerType<3>, // LDS base offset + llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) + llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) + llvm_i32_ty], // auxiliary/cachepolicy(imm): + // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), + // bit 3 = swz, bit 4 = scc (gfx90a) + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx12+: bits [0-2] = th, bits [3-4] = scope, + // bit 6 = swz + // all: volatile op (bit 31, stripped at lowering) + [IntrWillReturn, IntrArgMemOnly, + ReadOnly>, NoCapture>, + WriteOnly>, NoCapture>, + ImmArg>, ImmArg>, + ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; + // Uses that do not set the done bit should set IntrWriteMem on the // call site. def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index e6250ddf2c26b..183c55729b0b1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2167,6 +2167,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::memset: case Intrinsic::memset_inline: case Intrinsic::experimental_memset_pattern: + case Intrinsic::amdgcn_buffer_fat_ptr_load_lds: return true; } } @@ -2255,6 +2256,26 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { SplitUsers.insert(&I); return {NewRsrc, Off}; } + case Intrinsic::amdgcn_buffer_fat_ptr_load_lds: { + Value *BufferPtr = I.getArgOperand(0); + assert(isSplitFatPtr(BufferPtr->getType()) && + "amdgcn.buffer.fat.pointer.load.lds must have a buffer fat pointer " + "as argument 0"); + IRB.SetInsertPoint(&I); + auto [Rsrc, Off] = getPtrParts(BufferPtr); + Value *LDSPtr = I.getArgOperand(1); + Value *LoadSize = I.getArgOperand(2); + Value *ImmOff = I.getArgOperand(3); + Value *Aux = I.getArgOperand(4); + Value *SOffset = IRB.getInt32(0); + Instruction *NewLoad = IRB.CreateIntrinsic( + Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {}, + {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); + copyMetadata(NewLoad, &I); + SplitUsers.insert(&I); + I.replaceAllUsesWith(NewLoad); + return {nullptr, nullptr}; + } } return {nullptr, nullptr}; } diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll index ee51b0b84554e..56d0cdd29ffb2 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll @@ -1724,3 +1724,21 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 
inreg %leng
   call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false)
   ret void
 }
+
+;;; Buffer load to LDS
+
+declare void @llvm.amdgcn.buffer.fat.ptr.load.lds(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg)
+
+define void @llvm_amdgcn_buffer_fat_ptr_load_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) {
+; CHECK-LABEL: define void @llvm_amdgcn_buffer_fat_ptr_load_lds(
+; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0
+; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1
+; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 [[Q]], i32 0, i32 16, i32 0)
+; CHECK-NEXT: ret void
+;
+  %q = getelementptr i8, ptr addrspace(7) %p, i32 %idx
+  call void @llvm.amdgcn.buffer.fat.ptr.load.lds(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0)
+  ret void
+}

From 3756ba3c87f78a2746f26434209e29ac288a5fc6 Mon Sep 17 00:00:00 2001
From: vporpo
Date: Mon, 7 Apr 2025 13:48:30 -0700
Subject: [PATCH 0899/1029] [SandboxIR] Implement ConstantDataSequential and
 subclasses (#133547)

This patch implements sandboxir::ConstantDataSequential mirroring LLVM IR.
---
 llvm/include/llvm/SandboxIR/Constant.h     | 122 +++++++++++++++++++
 llvm/include/llvm/SandboxIR/Context.h      |   1 +
 llvm/include/llvm/SandboxIR/Type.h         |   2 +
 llvm/include/llvm/SandboxIR/Values.def     |   2 +
 llvm/lib/SandboxIR/Context.cpp             |   8 ++
 llvm/unittests/SandboxIR/SandboxIRTest.cpp | 101 +++++++++++++++++
 6 files changed, 236 insertions(+)

diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h
index c4841e0b0dd66..ddd90291c509b 100644
--- a/llvm/include/llvm/SandboxIR/Constant.h
+++ b/llvm/include/llvm/SandboxIR/Constant.h
@@ -486,6 +486,128 @@ class ConstantAggregateZero final : public Constant {
 #endif
 };
 
+/// ConstantDataSequential - A vector or array constant whose element type is a
+/// simple 1/2/4/8-byte integer or half/bfloat/float/double, and whose elements
+/// are just simple data values (i.e. ConstantInt/ConstantFP). This Constant
+/// node has no operands because it stores all of the elements of the constant
+/// as densely packed data, instead of as Value*'s.
+///
+/// This is the common base class of ConstantDataArray and ConstantDataVector.
+class ConstantDataSequential : public Constant {
+protected:
+  ConstantDataSequential(ClassID ID, llvm::ConstantDataSequential *C,
+                         Context &Ctx)
+      : Constant(ID, C, Ctx) {}
+
+public:
+  /// Return true if a ConstantDataSequential can be formed with a vector or
+  /// array of the specified element type.
+  /// ConstantDataArray only works with normal float and int types that are
+  /// stored densely in memory, not with things like i42 or x86_f80.
+  static bool isElementTypeCompatible(Type *Ty) {
+    return llvm::ConstantDataSequential::isElementTypeCompatible(Ty->LLVMTy);
+  }
+  /// If this is a sequential container of integers (of any size), return the
+  /// specified element in the low bits of a uint64_t.
+  uint64_t getElementAsInteger(unsigned ElmIdx) const {
+    return cast<llvm::ConstantDataSequential>(Val)->getElementAsInteger(ElmIdx);
+  }
+  /// If this is a sequential container of integers (of any size), return the
+  /// specified element as an APInt.
+  APInt getElementAsAPInt(unsigned ElmIdx) const {
+    return cast<llvm::ConstantDataSequential>(Val)->getElementAsAPInt(ElmIdx);
+  }
+  /// If this is a sequential container of floating point type, return the
+  /// specified element as an APFloat.
+  APFloat getElementAsAPFloat(unsigned ElmIdx) const {
+    return cast<llvm::ConstantDataSequential>(Val)->getElementAsAPFloat(ElmIdx);
+  }
+  /// If this is a sequential container of floats, return the specified element
+  /// as a float.
+  float getElementAsFloat(unsigned ElmIdx) const {
+    return cast<llvm::ConstantDataSequential>(Val)->getElementAsFloat(ElmIdx);
+  }
+  /// If this is a sequential container of doubles, return the specified
+  /// element as a double.
+  double getElementAsDouble(unsigned ElmIdx) const {
+    return cast<llvm::ConstantDataSequential>(Val)->getElementAsDouble(ElmIdx);
+  }
+  /// Return a Constant for a specified index's element.
+  /// Note that this has to compute a new constant to return, so it isn't as
+  /// efficient as getElementAsInteger/Float/Double.
+  Constant *getElementAsConstant(unsigned ElmIdx) const {
+    return Ctx.getOrCreateConstant(
+        cast<llvm::ConstantDataSequential>(Val)->getElementAsConstant(ElmIdx));
+  }
+  /// Return the element type of the array/vector.
+  Type *getElementType() const {
+    return Ctx.getType(
+        cast<llvm::ConstantDataSequential>(Val)->getElementType());
+  }
+  /// Return the number of elements in the array or vector.
+  unsigned getNumElements() const {
+    return cast<llvm::ConstantDataSequential>(Val)->getNumElements();
+  }
+  /// Return the size (in bytes) of each element in the array/vector.
+  /// The size of the elements is known to be a multiple of one byte.
+  uint64_t getElementByteSize() const {
+    return cast<llvm::ConstantDataSequential>(Val)->getElementByteSize();
+  }
+  /// This method returns true if this is an array of \p CharSize integers.
+  bool isString(unsigned CharSize = 8) const {
+    return cast<llvm::ConstantDataSequential>(Val)->isString(CharSize);
+  }
+  /// This method returns true if the array "isString", ends with a null byte,
+  /// and does not contain any other null bytes.
+  bool isCString() const {
+    return cast<llvm::ConstantDataSequential>(Val)->isCString();
+  }
+  /// If this array is isString(), then this method returns the array as a
+  /// StringRef. Otherwise, it asserts out.
+  StringRef getAsString() const {
+    return cast<llvm::ConstantDataSequential>(Val)->getAsString();
+  }
+  /// If this array is isCString(), then this method returns the array (without
+  /// the trailing null byte) as a StringRef. Otherwise, it asserts out.
+  StringRef getAsCString() const {
+    return cast<llvm::ConstantDataSequential>(Val)->getAsCString();
+  }
+  /// Return the raw, underlying, bytes of this data. Note that this is an
+  /// extremely tricky thing to work with, as it exposes the host endianness of
+  /// the data elements.
+  StringRef getRawDataValues() const {
+    return cast<llvm::ConstantDataSequential>(Val)->getRawDataValues();
+  }
+
+  static bool classof(const Value *From) {
+    return From->getSubclassID() == ClassID::ConstantDataArray ||
+           From->getSubclassID() == ClassID::ConstantDataVector;
+  }
+};
+
+class ConstantDataArray final : public ConstantDataSequential {
+  ConstantDataArray(llvm::ConstantDataArray *C, Context &Ctx)
+      : ConstantDataSequential(ClassID::ConstantDataArray, C, Ctx) {}
+  friend class Context;
+
+public:
+  // TODO: Add missing functions.
+};
+
+/// A vector constant whose element type is a simple 1/2/4/8-byte integer or
+/// float/double, and whose elements are just simple data values
+/// (i.e. ConstantInt/ConstantFP). This Constant node has no operands because it
+/// stores all of the elements of the constant as densely packed data, instead
+/// of as Value*'s.
+class ConstantDataVector final : public ConstantDataSequential {
+  ConstantDataVector(llvm::ConstantDataVector *C, Context &Ctx)
+      : ConstantDataSequential(ClassID::ConstantDataVector, C, Ctx) {}
+  friend class Context;
+
+public:
+  // TODO: Add missing functions.
+};
+
 // TODO: Inherit from ConstantData.
 class ConstantPointerNull final : public Constant {
   ConstantPointerNull(llvm::ConstantPointerNull *C, Context &Ctx)
diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h
index 714d1ec78f452..e2b4d0f6baf18 100644
--- a/llvm/include/llvm/SandboxIR/Context.h
+++ b/llvm/include/llvm/SandboxIR/Context.h
@@ -130,6 +130,7 @@ class Context {
   }
   /// Get or create a sandboxir::Constant from an existing LLVM IR \p LLVMC.
   Constant *getOrCreateConstant(llvm::Constant *LLVMC);
+  friend class ConstantDataSequential; // For getOrCreateConstant().
   friend class Utils; // For getMemoryBase
 
   void runEraseInstrCallbacks(Instruction *I);
diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h
index ec32284dacd61..7d90168c06576 100644
--- a/llvm/include/llvm/SandboxIR/Type.h
+++ b/llvm/include/llvm/SandboxIR/Type.h
@@ -34,6 +34,7 @@ class StructType;
 class TargetExtType;
 class Module;
 class FPMathOperator;
+class ConstantDataSequential;
 #define DEF_INSTR(ID, OPCODE, CLASS) class CLASS;
 #define DEF_CONST(ID, CLASS) class CLASS;
 #include "llvm/SandboxIR/Values.def"
@@ -63,6 +64,7 @@ class Type {
   friend class TargetExtType;  // For LLVMTy.
   friend class Module;         // For LLVMTy.
  friend class FPMathOperator; // For LLVMTy.
+  friend class ConstantDataSequential; // For LLVMTy.
 
   // Friend all instruction classes because `create()` functions use LLVMTy.
 #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS;
diff --git a/llvm/include/llvm/SandboxIR/Values.def b/llvm/include/llvm/SandboxIR/Values.def
index f5ead54a08e10..a55abbd20f4c0 100644
--- a/llvm/include/llvm/SandboxIR/Values.def
+++ b/llvm/include/llvm/SandboxIR/Values.def
@@ -28,6 +28,8 @@ DEF_VALUE(Block, BasicBlock)
 DEF_CONST(Constant, Constant)
 DEF_CONST(ConstantInt, ConstantInt)
 DEF_CONST(ConstantFP, ConstantFP)
+DEF_CONST(ConstantDataArray, ConstantDataArray)
+DEF_CONST(ConstantDataVector, ConstantDataVector)
 DEF_CONST(ConstantArray, ConstantArray)
 DEF_CONST(ConstantStruct, ConstantStruct)
 DEF_CONST(ConstantVector, ConstantVector)
diff --git a/llvm/lib/SandboxIR/Context.cpp b/llvm/lib/SandboxIR/Context.cpp
index 21039ce7ed834..fe67f9ef73fb6 100644
--- a/llvm/lib/SandboxIR/Context.cpp
+++ b/llvm/lib/SandboxIR/Context.cpp
@@ -360,6 +360,14 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
     It->second = std::unique_ptr<ConstantVector>(
         new ConstantVector(cast<llvm::ConstantVector>(LLVMC), *this));
     break;
+  case llvm::Value::ConstantDataArrayVal:
+    It->second = std::unique_ptr<ConstantDataArray>(
+        new ConstantDataArray(cast<llvm::ConstantDataArray>(LLVMC), *this));
+    break;
+  case llvm::Value::ConstantDataVectorVal:
+    It->second = std::unique_ptr<ConstantDataVector>(
+        new ConstantDataVector(cast<llvm::ConstantDataVector>(LLVMC), *this));
+    break;
   case llvm::Value::FunctionVal:
     It->second = std::unique_ptr<Function>(
         new Function(cast<llvm::Function>(LLVMC), *this));
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index bac2e888019d4..690e9a521c168 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -607,6 +607,107 @@ define void @foo(ptr %ptr, {i32, i8} %v1, <2 x i8> %v2) {
   EXPECT_EQ(NewVectorCAZ->getElementCount(), ElementCount::getFixed(4));
 }
 
+// Tests ConstantDataSequential, ConstantDataArray and ConstantDataVector.
+TEST_F(SandboxIRTest, ConstantDataSequential) {
+  parseIR(C, R"IR(
+define void @foo() {
+  %array = extractvalue [2 x i8] [i8 0, i8 1], 0
+  %vector = extractelement <2 x i8> <i8 0, i8 1>, i32 0
+  %farray = extractvalue [2 x float] [float 0.0, float 1.0], 0
+  %fvector = extractelement <2 x double> <double 0.0, double 1.0>, i32 0
+  %string = extractvalue [6 x i8] [i8 72, i8 69, i8 76, i8 76, i8 79, i8 0], 0
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto &BB = *F.begin();
+  auto It = BB.begin();
+  auto *I0 = &*It++;
+  auto *I1 = &*It++;
+  auto *I2 = &*It++;
+  auto *I3 = &*It++;
+  auto *I4 = &*It++;
+  auto *Array = cast<sandboxir::ConstantDataSequential>(I0->getOperand(0));
+  EXPECT_TRUE(isa<sandboxir::ConstantDataArray>(Array));
+  auto *Vector = cast<sandboxir::ConstantDataSequential>(I1->getOperand(0));
+  EXPECT_TRUE(isa<sandboxir::ConstantDataVector>(Vector));
+  auto *FArray = cast<sandboxir::ConstantDataSequential>(I2->getOperand(0));
+  EXPECT_TRUE(isa<sandboxir::ConstantDataArray>(FArray));
+  auto *FVector = cast<sandboxir::ConstantDataSequential>(I3->getOperand(0));
+  EXPECT_TRUE(isa<sandboxir::ConstantDataVector>(FVector));
+  auto *String = cast<sandboxir::ConstantDataSequential>(I4->getOperand(0));
+  EXPECT_TRUE(isa<sandboxir::ConstantDataArray>(String));
+
+  auto *Zero8 = sandboxir::ConstantInt::get(sandboxir::Type::getInt8Ty(Ctx), 0);
+  auto *One8 = sandboxir::ConstantInt::get(sandboxir::Type::getInt8Ty(Ctx), 1);
+
+  // Check isElementTypeCompatible().
+  for (llvm::Type *LLVMTy :
+       {llvm::Type::getIntNTy(C, 42), llvm::Type::getInt8Ty(C)})
+    EXPECT_EQ(llvm::ConstantDataSequential::isElementTypeCompatible(LLVMTy),
+              sandboxir::ConstantDataSequential::isElementTypeCompatible(
+                  Ctx.getType(LLVMTy)));
+  // Check getElementAsInteger().
+  EXPECT_EQ(Array->getElementAsInteger(0), 0u);
+  EXPECT_EQ(Array->getElementAsInteger(1), 1u);
+  EXPECT_EQ(Vector->getElementAsInteger(0), 0u);
+  EXPECT_EQ(Vector->getElementAsInteger(1), 1u);
+  // Check getElementAsAPInt().
+  EXPECT_EQ(Array->getElementAsAPInt(0), 0u);
+  EXPECT_EQ(Array->getElementAsAPInt(1), 1u);
+  EXPECT_EQ(Vector->getElementAsAPInt(0), 0u);
+  EXPECT_EQ(Vector->getElementAsAPInt(1), 1u);
+  // Check getElementAsFloat().
+  EXPECT_EQ(FArray->getElementAsFloat(0), 0.0);
+  EXPECT_EQ(FArray->getElementAsFloat(1), 1.0);
+  // Check getElementAsDouble().
+  EXPECT_EQ(FVector->getElementAsDouble(0), 0.0);
+  EXPECT_EQ(FVector->getElementAsDouble(1), 1.0);
+  // Check getElementAsConstant().
+  EXPECT_EQ(Array->getElementAsConstant(0), Zero8);
+  EXPECT_EQ(Array->getElementAsConstant(1), One8);
+  EXPECT_EQ(Vector->getElementAsConstant(0), Zero8);
+  EXPECT_EQ(Vector->getElementAsConstant(1), One8);
+  // Check getElementType().
+  EXPECT_EQ(Array->getElementType(), sandboxir::Type::getInt8Ty(Ctx));
+  EXPECT_EQ(Vector->getElementType(), sandboxir::Type::getInt8Ty(Ctx));
+  EXPECT_EQ(FArray->getElementType(), sandboxir::Type::getFloatTy(Ctx));
+  EXPECT_EQ(FVector->getElementType(), sandboxir::Type::getDoubleTy(Ctx));
+  // Check getNumElements().
+  EXPECT_EQ(Array->getNumElements(), 2u);
+  EXPECT_EQ(Vector->getNumElements(), 2u);
+  EXPECT_EQ(FArray->getNumElements(), 2u);
+  EXPECT_EQ(FVector->getNumElements(), 2u);
+  // Check getElementByteSize().
+  EXPECT_EQ(Array->getElementByteSize(), 1u);
+  EXPECT_EQ(Vector->getElementByteSize(), 1u);
+  EXPECT_EQ(FArray->getElementByteSize(), 4u);
+  EXPECT_EQ(FVector->getElementByteSize(), 8u);
+  // Check isString().
+  EXPECT_EQ(Array->isString(), true);
+  EXPECT_EQ(Vector->isString(), false);
+  EXPECT_EQ(FArray->isString(), false);
+  EXPECT_EQ(FVector->isString(), false);
+  EXPECT_EQ(String->isString(), true);
+  // Check isCString().
+ EXPECT_EQ(Array->isCString(), false); + EXPECT_EQ(Vector->isCString(), false); + EXPECT_EQ(FArray->isCString(), false); + EXPECT_EQ(FVector->isCString(), false); + EXPECT_EQ(String->isCString(), true); + // Check getAsString(). + char Data[] = {'H', 'E', 'L', 'L', 'O', '\0'}; + StringRef HelloWithNull(Data, 6); + EXPECT_EQ(String->getAsString(), HelloWithNull); + // Check getAsCString(). + EXPECT_EQ(String->getAsCString(), "HELLO"); + // Check getRawDataValues(). + EXPECT_EQ(String->getRawDataValues(), HelloWithNull); +} + TEST_F(SandboxIRTest, ConstantPointerNull) { parseIR(C, R"IR( define ptr @foo() { From df28c81f5a2b61a3b5ad1e6274dd27697a9367ac Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Mon, 7 Apr 2025 13:48:01 -0700 Subject: [PATCH 0900/1029] [lldb][debugserver] Fix an off-by-one error in watchpoint identification (#134314) debugserver takes the address of a watchpoint exception and calculates which watchpoint was responsible for it. There was an off-by-one error in the range calculation which causes two watchpoints on consecutive ranges to not correctly identify hits to the second watchpoint. The result is that lldb wouldn't show the second watchpoint as ever being hit. Re-landing this test with a modification to only require two watchpoints in the test, instead of four. If four watchpoints can be set, it will test them. rdar://145107575 --- .../consecutive-watchpoints/Makefile | 3 + .../TestConsecutiveWatchpoints.py | 96 +++++++++++++++++++ .../watchpoint/consecutive-watchpoints/main.c | 22 +++++ .../debugserver/source/DNBBreakpoint.cpp | 2 +- 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile create mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py create mode 100644 lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile new file mode 100644 index 0000000000000..10495940055b6 --- /dev/null +++ b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py new file mode 100644 index 0000000000000..bb73acc8fc35f --- /dev/null +++ b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/TestConsecutiveWatchpoints.py @@ -0,0 +1,96 @@ +""" +Watch contiguous memory regions with separate watchpoints, check that lldb +correctly detect which watchpoint was hit for each one. +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class ConsecutiveWatchpointsTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + def continue_and_report_stop_reason(self, process, iter_str): + process.Continue() + self.assertIn( + process.GetState(), [lldb.eStateStopped, lldb.eStateExited], iter_str + ) + thread = process.GetSelectedThread() + return thread.GetStopReason() + + # debugserver only gained the ability to watch larger regions + # with this patch. 
+ def test_consecutive_watchpoints(self): + """Test watchpoint that covers a large region of memory.""" + self.build() + self.main_source_file = lldb.SBFileSpec("main.c") + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "break here", self.main_source_file + ) + + frame = thread.GetFrameAtIndex(0) + + field2_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field2") + .Watch(True, False, True) + ) + field3_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field3") + .Watch(True, False, True) + ) + field4_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field4") + .Watch(True, False, True) + ) + field5_wp = ( + frame.locals["var"][0] + .GetChildMemberWithName("field5") + .Watch(True, False, True) + ) + + # Require that the first two watchpoints + # are set -- hopefully every machine running + # the testsuite can support two watchpoints. + self.assertTrue(field2_wp.IsValid()) + self.assertTrue(field3_wp.IsValid()) + + reason = self.continue_and_report_stop_reason(process, "continue to field2 wp") + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field2_wp.GetID()) + + reason = self.continue_and_report_stop_reason(process, "continue to field3 wp") + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field3_wp.GetID()) + + # If we were able to set the second two watchpoints, + # check that they are hit. Some CI bots can only + # create two watchpoints. + if field4_wp.IsValid() and field5_wp.IsValid(): + reason = self.continue_and_report_stop_reason( + process, "continue to field4 wp" + ) + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field4_wp.GetID()) + + reason = self.continue_and_report_stop_reason( + process, "continue to field5 wp" + ) + self.assertEqual(reason, lldb.eStopReasonWatchpoint) + stop_reason_watchpoint_id = ( + process.GetSelectedThread().GetStopReasonDataAtIndex(0) + ) + self.assertEqual(stop_reason_watchpoint_id, field5_wp.GetID()) diff --git a/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c new file mode 100644 index 0000000000000..c0a3530be9f5e --- /dev/null +++ b/lldb/test/API/functionalities/watchpoint/consecutive-watchpoints/main.c @@ -0,0 +1,22 @@ +#include +struct fields { + uint32_t field1; + uint32_t field2; // offset +4 + uint16_t field3; // offset +8 + uint16_t field4; // offset +10 + uint16_t field5; // offset +12 + uint16_t field6; // offset +14 +}; + +int main() { + struct fields var = {0, 0, 0, 0, 0, 0}; + + var.field1 = 5; // break here + var.field2 = 6; + var.field3 = 7; + var.field4 = 8; + var.field5 = 9; + var.field6 = 10; + + return var.field1 + var.field2 + var.field3; +} diff --git a/lldb/tools/debugserver/source/DNBBreakpoint.cpp b/lldb/tools/debugserver/source/DNBBreakpoint.cpp index f63ecf24222bd..e41bf9b4fd905 100644 --- a/lldb/tools/debugserver/source/DNBBreakpoint.cpp +++ b/lldb/tools/debugserver/source/DNBBreakpoint.cpp @@ -98,7 +98,7 @@ DNBBreakpointList::FindNearestWatchpoint(nub_addr_t addr) const { if (pos.second.IsEnabled()) { nub_addr_t start_addr = 
pos.second.Address(); nub_addr_t end_addr = start_addr + pos.second.ByteSize(); - if (addr >= start_addr && addr <= end_addr) + if (addr >= start_addr && addr < end_addr) return &pos.second; } } From 2ac11daf92531c5de65dd645f4466c709f865a7e Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 7 Apr 2025 21:52:36 +0100 Subject: [PATCH 0901/1029] [SCEV] Improve code around constant TC (NFC) (#133261) --- llvm/lib/Analysis/ScalarEvolution.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 5f73644568cf6..c62ea1526981d 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -8253,14 +8253,14 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) { unsigned Multiple = getSmallConstantTripMultiple(L, ExitingBB); if (!Res) Res = Multiple; - Res = (unsigned)std::gcd(*Res, Multiple); + Res = std::gcd(*Res, Multiple); } return Res.value_or(1); } unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount) { - if (ExitCount == getCouldNotCompute()) + if (isa(ExitCount)) return 1; // Get the trip count @@ -8270,8 +8270,8 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L, // If a trip multiple is huge (>=2^32), the trip count is still divisible by // the greatest power of 2 divisor less than 2^32. return Multiple.getActiveBits() > 32 - ? 1U << std::min((unsigned)31, Multiple.countTrailingZeros()) - : (unsigned)Multiple.zextOrTrunc(32).getZExtValue(); + ? 1U << std::min(31U, Multiple.countTrailingZeros()) + : (unsigned)Multiple.getZExtValue(); } /// Returns the largest constant divisor of the trip count of this loop as a From 6a94bd136db9393b50bcd99f183e61c2e6e873a9 Mon Sep 17 00:00:00 2001 From: Pedro Lobo Date: Mon, 7 Apr 2025 21:56:02 +0100 Subject: [PATCH 0902/1029] [PPC] Change placeholder from `undef` to `poison` (#134552) Call `insertelement` on a `poison` value instead of `undef`. --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 2a5af3e50af26..3a4c2fcad8c83 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -126,7 +126,7 @@ PPCTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { IC.Builder.CreateBitCast(II.getArgOperand(0), Mask->getType()); Value *Op1 = IC.Builder.CreateBitCast(II.getArgOperand(1), Mask->getType()); - Value *Result = UndefValue::get(Op0->getType()); + Value *Result = PoisonValue::get(Op0->getType()); // Only extract each element once. Value *ExtractedElts[32]; From 3382aef944ef7a497248ef85df75ec04f6c21642 Mon Sep 17 00:00:00 2001 From: amansharma612 Date: Tue, 8 Apr 2025 02:29:41 +0530 Subject: [PATCH 0903/1029] [libc] Fixed typo in porting.rst (#134488) Co-authored-by: Aman Sharma <210100011@iitb.ac.in> --- libc/docs/porting.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/docs/porting.rst b/libc/docs/porting.rst index 5747a8b9b0c73..e890e439f619f 100644 --- a/libc/docs/porting.rst +++ b/libc/docs/porting.rst @@ -53,7 +53,7 @@ architectures. Since a large part of the libc makes use of syscalls (or an equivalent on non-Linux like platforms), it might be simpler and convenient to bring up the libc for one architecture at a time. 
In such cases, wherein the support surface of LLVM's libc differs for each target architecture, one will -have to add a subdirectory (within the config directory os the operating +have to add a subdirectory (within the config directory of the operating system) for each target architecture, and list the relevant config information separately in those subdirectories. For example, for Linux, the x86_64 and aarch64 configs are in separate directories, named From eeebdb9711a1c47b1639d1ebeac8b134f86c0634 Mon Sep 17 00:00:00 2001 From: Steven Hedges Date: Mon, 7 Apr 2025 17:05:15 -0400 Subject: [PATCH 0904/1029] [llvm] Fix Typo in Kaleidoscope BuildingAJIT Example (#134391) Changed "precedecnce" to precedence. --- llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp | 2 +- llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp | 2 +- llvm/examples/Kaleidoscope/BuildingAJIT/Chapter3/toy.cpp | 2 +- llvm/examples/Kaleidoscope/BuildingAJIT/Chapter4/toy.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp index 426886c72e54d..0d6d03989f928 100644 --- a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp +++ b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/toy.cpp @@ -636,7 +636,7 @@ static std::unique_ptr ParsePrototype() { // Read the precedence if present. if (CurTok == tok_number) { if (NumVal < 1 || NumVal > 100) - return LogErrorP("Invalid precedecnce: must be 1..100"); + return LogErrorP("Invalid precedence: must be 1..100"); BinaryPrecedence = (unsigned)NumVal; getNextToken(); } diff --git a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp index 426886c72e54d..0d6d03989f928 100644 --- a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp +++ b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/toy.cpp @@ -636,7 +636,7 @@ static std::unique_ptr ParsePrototype() { // Read the precedence if present. if (CurTok == tok_number) { if (NumVal < 1 || NumVal > 100) - return LogErrorP("Invalid precedecnce: must be 1..100"); + return LogErrorP("Invalid precedence: must be 1..100"); BinaryPrecedence = (unsigned)NumVal; getNextToken(); } diff --git a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter3/toy.cpp b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter3/toy.cpp index 426886c72e54d..0d6d03989f928 100644 --- a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter3/toy.cpp +++ b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter3/toy.cpp @@ -636,7 +636,7 @@ static std::unique_ptr ParsePrototype() { // Read the precedence if present. if (CurTok == tok_number) { if (NumVal < 1 || NumVal > 100) - return LogErrorP("Invalid precedecnce: must be 1..100"); + return LogErrorP("Invalid precedence: must be 1..100"); BinaryPrecedence = (unsigned)NumVal; getNextToken(); } diff --git a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter4/toy.cpp b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter4/toy.cpp index 1891635dbfd35..bbdb7907722fa 100644 --- a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter4/toy.cpp +++ b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter4/toy.cpp @@ -619,7 +619,7 @@ static std::unique_ptr ParsePrototype() { // Read the precedence if present. 
if (CurTok == tok_number) {
     if (NumVal < 1 || NumVal > 100)
-      return LogErrorP("Invalid precedecnce: must be 1..100");
+      return LogErrorP("Invalid precedence: must be 1..100");
     BinaryPrecedence = (unsigned)NumVal;
     getNextToken();
   }

From ad9f15ab5335ed3c3c40d3c148658b8f2c4223d6 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Mon, 7 Apr 2025 22:07:52 +0100
Subject: [PATCH 0905/1029] [VPlan] Introduce and use VPValue::replaceUsesOfWith
 (NFC).

Adds an API matching LLVM's IR Value, which simplifies some code a bit.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 7 ++-----
 llvm/lib/Transforms/Vectorize/VPlan.cpp         | 7 +++++++
 llvm/lib/Transforms/Vectorize/VPlanValue.h      | 3 +++
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5df1061691a67..807136f6e0e47 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9924,11 +9924,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
       VPValue *Cmp = Select->getOperand(0);
       // If the compare is checking the reduction PHI node, adjust it to check
       // the start value.
-      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
-        for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
-          if (CmpR->getOperand(I) == PhiR)
-            CmpR->setOperand(I, PhiR->getStartValue());
-      }
+      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
+        CmpR->replaceUsesOfWith(PhiR, PhiR->getStartValue());
       Builder.setInsertPoint(Select);
 
       // If the true value of the select is the reduction phi, the new value is
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4688eef194b32..9474e7a171dff 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1414,6 +1414,13 @@ void VPValue::replaceUsesWithIf(
   }
 }
 
+void VPUser::replaceUsesOfWith(VPValue *From, VPValue *To) {
+  for (unsigned Idx = 0; Idx != getNumOperands(); ++Idx) {
+    if (getOperand(Idx) == From)
+      setOperand(Idx, To);
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
   OS << Tracker.getOrCreateName(this);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 2b762d0533d19..d322fdfa727e4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -246,6 +246,9 @@ class VPUser {
     New->addUser(*this);
   }
 
+  /// Replaces all uses of \p From in the VPUser with \p To.
+  void replaceUsesOfWith(VPValue *From, VPValue *To);
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;

From 01bc672b8a41774feff445b07e749262597501e4 Mon Sep 17 00:00:00 2001
From: Sarah Spall
Date: Mon, 7 Apr 2025 15:25:47 -0700
Subject: [PATCH 0906/1029] [HLSL] Desugar ConstantArrayType when calculating
 cbuffer field layout (#134683)

When calculating the layout for a cbuffer field, if that field is a
ConstantArrayType, desugar it before casting it to a ConstantArrayType.
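For illustration, a short sketch of the failure mode and the fix, reusing the
variable names of the surrounding HLSLBufferLayoutBuilder code (the snippet is
illustrative, not additional patch content). When the array type is spelled
through a typedef, the field's QualType is a sugar node rather than a
ConstantArrayType node, so the plain `cast<>` asserts; desugaring through the
ASTContext first is safe:

```cpp
// Sketch: walk nested constant arrays through any type sugar.
QualType Ty = FieldTy;
while (Ty->isConstantArrayType()) { // checks the canonical (desugared) type
  // cast<ConstantArrayType>(Ty) would assert on a typedef-sugared type;
  // ask the ASTContext to desugar instead:
  auto *ArrayTy = CGM.getContext().getAsConstantArrayType(Ty);
  ArrayCount *= ArrayTy->getSExtSize();
  Ty = ArrayTy->getElementType();
}
```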
Closes #134668 --------- Co-authored-by: Eli Friedman --- clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp | 2 +- clang/test/CodeGenHLSL/cbuffer.hlsl | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp index b546b6dd574ff..1ed33894b15aa 100644 --- a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp +++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp @@ -195,7 +195,7 @@ bool HLSLBufferLayoutBuilder::layoutField(const FieldDecl *FD, // Unwrap array to find the element type and get combined array size. QualType Ty = FieldTy; while (Ty->isConstantArrayType()) { - const ConstantArrayType *ArrayTy = cast(Ty); + auto *ArrayTy = CGM.getContext().getAsConstantArrayType(Ty); ArrayCount *= ArrayTy->getSExtSize(); Ty = ArrayTy->getElementType(); } diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl b/clang/test/CodeGenHLSL/cbuffer.hlsl index db06cea808b62..0a0465cc44e91 100644 --- a/clang/test/CodeGenHLSL/cbuffer.hlsl +++ b/clang/test/CodeGenHLSL/cbuffer.hlsl @@ -99,6 +99,19 @@ cbuffer CBArrays : register(b2) { // CHECK: @c7 = external addrspace(2) global [2 x i64], align 8 // CHECK: @c8 = external addrspace(2) global [4 x i32], align 4 +typedef uint32_t4 uint32_t8[2]; +typedef uint4 T1; +typedef T1 T2[2]; // check a double typedef + +cbuffer CBTypedefArray { + uint32_t8 t1[2]; + T2 t2[2]; +} + +// CHECK: @CBTypedefArray.cb = global target("dx.CBuffer", target("dx.Layout", %__cblayout_CBTypedefArray, +// CHECK-SAME: 128, 0, 64)) +// CHECK: @t1 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16 +// CHECK: @t2 = external addrspace(2) global [2 x [2 x <4 x i32>]], align 16 struct Empty {}; struct A { @@ -278,7 +291,7 @@ void main() { // CHECK-NEXT: call void @_init_resource_CBScalars.cb() // CHECK-NEXT: call void @_init_resource_CBArrays.cb() -// CHECK: !hlsl.cbs = !{![[CBSCALARS:[0-9]+]], ![[CBVECTORS:[0-9]+]], ![[CBARRAYS:[0-9]+]], ![[CBSTRUCTS:[0-9]+]], ![[CBCLASSES:[0-9]+]], +// CHECK: !hlsl.cbs = !{![[CBSCALARS:[0-9]+]], ![[CBVECTORS:[0-9]+]], ![[CBARRAYS:[0-9]+]], ![[CBTYPEDEFARRAY:[0-9]+]], ![[CBSTRUCTS:[0-9]+]], ![[CBCLASSES:[0-9]+]], // CHECK-SAME: ![[CBMIX:[0-9]+]], ![[CB_A:[0-9]+]], ![[CB_B:[0-9]+]], ![[CB_C:[0-9]+]]} // CHECK: ![[CBSCALARS]] = !{ptr @CBScalars.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4, @@ -290,6 +303,8 @@ void main() { // CHECK: ![[CBARRAYS]] = !{ptr @CBArrays.cb, ptr addrspace(2) @c1, ptr addrspace(2) @c2, ptr addrspace(2) @c3, ptr addrspace(2) @c4, // CHECK-SAME: ptr addrspace(2) @c5, ptr addrspace(2) @c6, ptr addrspace(2) @c7, ptr addrspace(2) @c8} +// CHECK: ![[CBTYPEDEFARRAY]] = !{ptr @CBTypedefArray.cb, ptr addrspace(2) @t1, ptr addrspace(2) @t2} + // CHECK: ![[CBSTRUCTS]] = !{ptr @CBStructs.cb, ptr addrspace(2) @a, ptr addrspace(2) @b, ptr addrspace(2) @c, ptr addrspace(2) @array_of_A, // CHECK-SAME: ptr addrspace(2) @d, ptr addrspace(2) @e, ptr addrspace(2) @f} From 9ce45579781317aade2004e970b352c0e06af291 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 7 Apr 2025 15:26:38 -0700 Subject: [PATCH 0907/1029] [RISCV] Add coverage for reported miscompile in shuffle lowering Derived from the example in https://github.com/llvm/llvm-project/issues/134126 --- .../RISCV/rvv/fixed-vectors-shuffle-int.ll | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll index 
63fd1d1ed2d25..65f78dcfb4bce 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
@@ -1371,3 +1371,38 @@ define <8 x i64> @shuffle_v8i164_span_splat(<8 x i64> %a) nounwind {
   %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32>
   ret <8 x i64> %res
 }
+
+; FIXME: Doing this as a span splat requires rewriting the undef elements in
+; the mask not just using a prefix of the mask.
+define <8 x i64> @shuffle_v8i64_span_splat_neg(<8 x i64> %a) nounwind {
+; CHECK-LABEL: shuffle_v8i64_span_splat_neg:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v9, 1
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
+; CHECK-NEXT:    vmv.v.v v13, v12
+; CHECK-NEXT:    vmv.v.v v14, v12
+; CHECK-NEXT:    vmv.v.v v15, v12
+; CHECK-NEXT:    vmv4r.v v8, v12
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32>
+  ret <8 x i64> %res
+}
+
+; FIXME: A locally repeating shuffle needs to use a mask prefix
+define <8 x i32> @shuffle_v8i32_locally_repeating_neg(<8 x i32> %a) {
+; CHECK-LABEL: shuffle_v8i32_locally_repeating_neg:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI87_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI87_0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v11, v9, v12
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32>
+  ret <8 x i32> %res
+}

From 9b63a92ca723293dfe8570d1b2881ce949f1f6cc Mon Sep 17 00:00:00 2001
From: Andres Chavarria <84650073+chavandres@users.noreply.github.com>
Date: Mon, 7 Apr 2025 16:50:30 -0600
Subject: [PATCH 0908/1029] Implement areInlineCompatible for SystemZ using
 feature bitset (#132976)

## What?
Implement `areInlineCompatible` for the SystemZ target using FeatureBitset
comparison.

## Why?
The default implementation in `TargetTransformInfoImpl.h` makes a string
comparison and only inlines when the target-cpu and the target-features for
caller and callee are the same. We are missing out on optimizations when the
callee has a subset of the caller's features.

## How?
Get the FeatureBitset of the caller and callee and inline only when they are
equal; relaxing the check to also accept a callee whose features are a subset
of the caller's is left for the future (see the comment in the code). It's a
similar implementation to ARM, PowerPC...

## Testing?
Test cases check for when the callee is a subset of the caller, when it's not
a subset and when both are equal.
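For reference, a sketch of the intended future relaxation (hypothetical, not
what this patch commits; the implementation below requires exact equality for
now):

```cpp
// Hypothetical subset check, similar in spirit to other targets: every
// feature the callee relies on must also be enabled in the caller.
bool SystemZTTIImpl::areInlineCompatible(const Function *Caller,
                                         const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();
  return (CallerBits & CalleeBits) == CalleeBits;
}
```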
--- .../SystemZ/SystemZTargetTransformInfo.cpp | 14 +++++++ .../SystemZ/SystemZTargetTransformInfo.h | 4 ++ .../Inline/SystemZ/inline-target-attr.ll | 42 +++++++++++++++++++ .../Transforms/Inline/SystemZ/lit.local.cfg | 2 + 4 files changed, 62 insertions(+) create mode 100644 llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll create mode 100644 llvm/test/Transforms/Inline/SystemZ/lit.local.cfg diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 06a0a3a631654..e0b0099466c52 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -422,6 +422,20 @@ bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.ScaleCost, C2.SetupCost); } +bool SystemZTTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // Support only equal feature bitsets. Restriction should be relaxed in the + // future to allow inlining when callee's bits are subset of the caller's. + return CallerBits == CalleeBits; +} + unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const { bool Vector = (ClassID == 1); if (!Vector) diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 512fcc854d532..e64b1f1ccbd93 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -62,6 +62,10 @@ class SystemZTTIImpl : public BasicTTIImplBase { bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2); + + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; + /// @} /// \name Vector TTI Implementations diff --git a/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll b/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll new file mode 100644 index 0000000000000..b5c4f42655bb4 --- /dev/null +++ b/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -mtriple=s390x-linux-gnu -S -passes=inline | FileCheck %s +; RUN: opt < %s -mtriple=s390x-linux-gnu -S -passes='cgscc(inline)' | FileCheck %s +; Check that we only inline when we have equal target attributes. + +define i32 @foo() #0 { +entry: + %call = call i32 (...) @baz() + ret i32 %call +; CHECK-LABEL: foo +; CHECK: call i32 (...) @baz() +} + +declare i32 @baz(...) #0 + +define i32 @bar() #1 { +entry: + %call = call i32 @foo() + ret i32 %call +; CHECK-LABEL: bar +; CHECK: call i32 @foo() +} + +define i32 @qux() #0 { +entry: + %call = call i32 @foo() + ret i32 %call +; CHECK-LABEL: qux +; CHECK: call i32 (...) 
@baz() +} + +define i32 @quux() #2 { +entry: + %call = call i32 @bar() + ret i32 %call +; CHECK-LABEL: quux +; CHECK: call i32 @bar() +} + + +attributes #0 = { "target-cpu"="generic" "target-features"="+guarded-storage" } +attributes #1 = { "target-cpu"="generic" "target-features"="+guarded-storage,+enhanced-sort" } +attributes #2 = { "target-cpu"="generic" "target-features"="+concurrent-functions" } diff --git a/llvm/test/Transforms/Inline/SystemZ/lit.local.cfg b/llvm/test/Transforms/Inline/SystemZ/lit.local.cfg new file mode 100644 index 0000000000000..f9dd98a21cc3e --- /dev/null +++ b/llvm/test/Transforms/Inline/SystemZ/lit.local.cfg @@ -0,0 +1,2 @@ +if not "SystemZ" in config.root.targets: + config.unsupported = True From ddb32a67c2e76bc0f69470a8d02a220e8083fa68 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Mon, 7 Apr 2025 16:21:12 -0700 Subject: [PATCH 0909/1029] [NFC] Add a precommit test for aarch64 jump table partitioning (#125987) --- .../CodeGen/AArch64/jump-table-partition.ll | 245 ++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/jump-table-partition.ll diff --git a/llvm/test/CodeGen/AArch64/jump-table-partition.ll b/llvm/test/CodeGen/AArch64/jump-table-partition.ll new file mode 100644 index 0000000000000..122bbaef09185 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/jump-table-partition.ll @@ -0,0 +1,245 @@ +; The llc commands override two options +; - 'aarch64-enable-atomic-cfg-tidy' to false to turn off simplifycfg pass, +; which can simplify away switch instructions before isel lowers switch instructions. +; - 'aarch64-min-jump-table-entries' so 'switch' needs fewer cases to generate +; a jump table. + +; The static-data-splitter pass doesn't run. +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -function-sections=true \ +; RUN: -aarch64-enable-atomic-cfg-tidy=false -aarch64-min-jump-table-entries=2 \ +; RUN: -unique-section-names=true %s -o - 2>&1 | FileCheck %s --check-prefixes=DEFAULT + +; DEFAULT: .section .rodata.hot.foo,"a",@progbits +; DEFAULT: .LJTI0_0: +; DEFAULT: .LJTI0_1: +; DEFAULT: .LJTI0_2: +; DEFAULT: .LJTI0_3: +; DEFAULT: .section .rodata.func_without_profile,"a",@progbits +; DEFAULT: .LJTI1_0: +; DEFAULT: .section .rodata.bar_prefix.bar,"a",@progbits +; DEFAULT: .LJTI2_0 + +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=true \ +; RUN: -aarch64-enable-atomic-cfg-tidy=false -aarch64-min-jump-table-entries=2 \ +; RUN: -unique-section-names=false %s -o - 2>&1 | FileCheck %s --check-prefixes=NUM,JT + +; Section names will optionally have `.` if -function-sections is enabled. +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=true \ +; RUN: -aarch64-enable-atomic-cfg-tidy=false -aarch64-min-jump-table-entries=2 \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=FUNC,JT + +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -enable-split-machine-functions \ +; RUN: -partition-static-data-sections=true -function-sections=false \ +; RUN: -aarch64-enable-atomic-cfg-tidy=false -aarch64-min-jump-table-entries=2 \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=FUNCLESS,JT + +; A function's section prefix is used for all jump tables of this function. +; @foo is hot so its jump table data section has a hot prefix. 
+; NUM: .section .rodata.hot.,"a",@progbits,unique,2 +; FUNC: .section .rodata.hot.foo,"a",@progbits +; FUNCLESS: .section .rodata.hot.,"a",@progbits +; JT: .LJTI0_0: +; JT: .LJTI0_1: +; JT: .LJTI0_2: +; JT: .LJTI0_3: + +; func_without_profile doesn't have profiles, so its jumptable doesn't have +; hotness-based prefix. +; NUM: .section .rodata,"a",@progbits,unique,4 +; FUNC: .section .rodata.func_without_profile,"a",@progbits +; FUNCLESS: .section .rodata,"a",@progbits +; JT: .LJTI1_0: + +; @bar doesn't have profile information and it has a section prefix. +; Tests that its jump tables are placed in sections with function prefixes. +; NUM: .section .rodata.bar_prefix.,"a",@progbits,unique, +; FUNC: .section .rodata.bar_prefix.bar +; FUNCLESS: .section .rodata.bar_prefix.,"a" +; JT: .LJTI2_0 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +@str.9 = private constant [7 x i8] c".str.9\00" +@str.10 = private constant [8 x i8] c".str.10\00" +@str.11 = private constant [8 x i8] c".str.11\00" + +@case2 = private constant [7 x i8] c"case 2\00" +@case1 = private constant [7 x i8] c"case 1\00" +@default = private constant [8 x i8] c"default\00" +@jt3 = private constant [4 x i8] c"jt3\00" + +; jt0 and jt2 are hot. jt1 and jt3 are cold. +define i32 @foo(i32 %num) !prof !13 { +entry: + %mod3 = sdiv i32 %num, 3 + switch i32 %mod3, label %jt0.default [ + i32 1, label %jt0.bb1 + i32 2, label %jt0.bb2 + ], !prof !14 + +jt0.bb1: + call i32 @puts(ptr @case1) + br label %jt0.epilog + +jt0.bb2: + call i32 @puts(ptr @case2) + br label %jt0.epilog + +jt0.default: + call i32 @puts(ptr @default) + br label %jt0.epilog + +jt0.epilog: + %zero = icmp eq i32 %num, 0 + br i1 %zero, label %hot, label %cold, !prof !17 + +hot: + %c2 = call i32 @transform(i32 %num) + switch i32 %c2, label %jt2.default [ + i32 1, label %jt2.bb1 + i32 2, label %jt2.bb2 + ], !prof !14 + +jt2.bb1: + call i32 @puts(ptr @case1) + br label %jt1.epilog + +jt2.bb2: + call i32 @puts(ptr @case2) + br label %jt1.epilog + +jt2.default: + call i32 @puts(ptr @default) + br label %jt2.epilog + +jt2.epilog: + %c2cmp = icmp ne i32 %c2, 0 + br i1 %c2cmp, label %return, label %jt3.prologue, !prof !18 + +cold: + %c1 = call i32 @compute(i32 %num) + switch i32 %c1, label %jt1.default [ + i32 1, label %jt1.bb1 + i32 2, label %jt1.bb2 + ], !prof !14 + +jt1.bb1: + call i32 @puts(ptr @case1) + br label %jt1.epilog + +jt1.bb2: + call i32 @puts(ptr @case2) + br label %jt1.epilog + +jt1.default: + call i32 @puts(ptr @default) + br label %jt1.epilog + +jt1.epilog: + br label %return + +jt3.prologue: + %c3 = call i32 @cleanup(i32 %num) + switch i32 %c3, label %jt3.default [ + i32 1, label %jt3.bb1 + i32 2, label %jt3.bb2 + ], !prof !14 + +jt3.bb1: + call i32 @puts(ptr @case1) + br label %jt3.epilog + +jt3.bb2: + call i32 @puts(ptr @case2) + br label %jt3.epilog + +jt3.default: + call i32 @puts(ptr @default) + br label %jt3.epilog + +jt3.epilog: + call i32 @puts(ptr @jt3) + br label %return + +return: + ret i32 %mod3 +} + +define void @func_without_profile(i32 %num) { +entry: + switch i32 %num, label %sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb1 + ] + +sw.bb: + call i32 @puts(ptr @str.10) + br label %sw.epilog + +sw.bb1: + call i32 @puts(ptr @str.9) + br label %sw.epilog + +sw.default: + call i32 @puts(ptr @str.11) + br label %sw.epilog + +sw.epilog: + ret void +} + +define void @bar(i32 %num) !section_prefix !20 { +entry: + switch i32 %num, label 
%sw.default [ + i32 1, label %sw.bb + i32 2, label %sw.bb1 + ] + +sw.bb: + call i32 @puts(ptr @str.10) + br label %sw.epilog + +sw.bb1: + call i32 @puts(ptr @str.9) + br label %sw.epilog + +sw.default: + call i32 @puts(ptr @str.11) + br label %sw.epilog + +sw.epilog: + ret void +} + +declare i32 @puts(ptr) +declare i32 @printf(ptr, ...) +declare i32 @compute(i32) +declare i32 @transform(i32) +declare i32 @cleanup(i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 230002} +!4 = !{!"MaxCount", i64 100000} +!5 = !{!"MaxInternalCount", i64 50000} +!6 = !{!"MaxFunctionCount", i64 100000} +!7 = !{!"NumCounts", i64 14} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12} +!11 = !{i32 990000, i64 10000, i32 7} +!12 = !{i32 999999, i64 1, i32 9} +!13 = !{!"function_entry_count", i64 100000} +!14 = !{!"branch_weights", i32 60000, i32 20000, i32 20000} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 1, i32 0, i32 0, i32 0, i32 0, i32 0} +!17 = !{!"branch_weights", i32 99999, i32 1} +!18 = !{!"branch_weights", i32 99998, i32 1} +!19 = !{!"branch_weights", i32 97000, i32 1000, i32 1000, i32 1000} +!20 = !{!"function_section_prefix", !"bar_prefix"} From a38ad6e2a24b46eaea481c04de566dd2a45d667b Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Mon, 7 Apr 2025 16:27:08 -0700 Subject: [PATCH 0910/1029] [mlir][tosa] Check empty Concat input for inferReturnTypeComponents (#134728) Signed-off-by: Jerry Ge --- mlir/lib/Dialect/Tosa/IR/TosaOps.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index c8e9ad8bd3346..59946ca54b933 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -1066,6 +1066,10 @@ LogicalResult tosa::ConcatOp::inferReturnTypeComponents( hasRankedInput = true; } + + if (adaptor.getInput1().empty()) + return failure(); + Type inputType = llvm::cast(adaptor.getInput1().getType()[0]).getElementType(); if (!hasRankedInput) { From 65813e0e94c0403dad61e8365b39d76d7b3bfc14 Mon Sep 17 00:00:00 2001 From: Tom Yang Date: Mon, 7 Apr 2025 16:33:48 -0700 Subject: [PATCH 0911/1029] Control Darwin parallel image loading with target.parallel-module-load (#134437) A requested follow-up from https://github.com/llvm/llvm-project/pull/130912 by @JDevlieghere to control Darwin parallel image loading with the same `target.parallel-module-load` that controls the POSIX dyld parallel image loading. Darwin parallel image loading was introduced by https://github.com/llvm/llvm-project/pull/110646. This small change: * removes `plugin.dynamic-loader.darwin.experimental.enable-parallel-image-load` and associated code. * changes setting call site in `DynamicLoaderDarwin::PreloadModulesFromImageInfos` to use the new setting. Tested by running `ninja check-lldb` and loading some targets. 
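For anyone verifying locally, the behavior can be toggled through the target setting named above rather than the removed plugin setting. A usage sketch, assuming the setting from #130912:

```
(lldb) settings set target.parallel-module-load false
```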
Co-authored-by: Tom Yang --- .../DynamicLoader/MacOSX-DYLD/CMakeLists.txt | 13 ----- .../MacOSX-DYLD/DynamicLoaderDarwin.cpp | 15 +----- .../MacOSX-DYLD/DynamicLoaderDarwin.h | 2 - .../DynamicLoaderDarwinProperties.cpp | 53 ------------------- .../DynamicLoaderDarwinProperties.h | 34 ------------ .../DynamicLoaderDarwinProperties.td | 8 --- .../MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp | 8 +-- .../MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h | 2 - 8 files changed, 2 insertions(+), 133 deletions(-) delete mode 100644 lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp delete mode 100644 lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h delete mode 100644 lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt index 77a560541fcb1..7308374c8bfba 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt @@ -1,16 +1,7 @@ -lldb_tablegen(DynamicLoaderDarwinProperties.inc -gen-lldb-property-defs - SOURCE DynamicLoaderDarwinProperties.td - TARGET LLDBPluginDynamicLoaderDarwinPropertiesGen) - -lldb_tablegen(DynamicLoaderDarwinPropertiesEnum.inc -gen-lldb-property-enum-defs - SOURCE DynamicLoaderDarwinProperties.td - TARGET LLDBPluginDynamicLoaderDarwinPropertiesEnumGen) - add_lldb_library(lldbPluginDynamicLoaderMacOSXDYLD PLUGIN DynamicLoaderMacOSXDYLD.cpp DynamicLoaderMacOS.cpp DynamicLoaderDarwin.cpp - DynamicLoaderDarwinProperties.cpp LINK_LIBS lldbBreakpoint @@ -25,7 +16,3 @@ add_lldb_library(lldbPluginDynamicLoaderMacOSXDYLD PLUGIN Support TargetParser ) - -add_dependencies(lldbPluginDynamicLoaderMacOSXDYLD - LLDBPluginDynamicLoaderDarwinPropertiesGen - LLDBPluginDynamicLoaderDarwinPropertiesEnumGen) diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp index f9b49c50355d5..e25c4ff55e408 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp @@ -8,7 +8,6 @@ #include "DynamicLoaderDarwin.h" -#include "DynamicLoaderDarwinProperties.h" #include "lldb/Breakpoint/StoppointCallbackContext.h" #include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" @@ -79,17 +78,6 @@ void DynamicLoaderDarwin::DidLaunch() { SetNotificationBreakpoint(); } -void DynamicLoaderDarwin::CreateSettings(lldb_private::Debugger &debugger) { - if (!PluginManager::GetSettingForDynamicLoaderPlugin( - debugger, DynamicLoaderDarwinProperties::GetSettingName())) { - const bool is_global_setting = true; - PluginManager::CreateSettingForDynamicLoaderPlugin( - debugger, - DynamicLoaderDarwinProperties::GetGlobal().GetValueProperties(), - "Properties for the DynamicLoaderDarwin plug-in.", is_global_setting); - } -} - // Clear out the state of this class. 
void DynamicLoaderDarwin::Clear(bool clear_process) { std::lock_guard guard(m_mutex); @@ -670,8 +658,7 @@ DynamicLoaderDarwin::PreloadModulesFromImageInfos( image_info, FindTargetModuleForImageInfo(image_info, true, nullptr)); }; auto it = image_infos.begin(); - bool is_parallel_load = - DynamicLoaderDarwinProperties::GetGlobal().GetEnableParallelImageLoad(); + bool is_parallel_load = m_process->GetTarget().GetParallelModuleLoad(); if (is_parallel_load) { llvm::ThreadPoolTaskGroup taskGroup(Debugger::GetThreadPool()); for (size_t i = 0; i < size; ++i, ++it) { diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h index bc5464d76b950..37528b88b615e 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h @@ -58,8 +58,6 @@ class DynamicLoaderDarwin : public lldb_private::DynamicLoader { std::optional GetStartAddress() override; - static void CreateSettings(lldb_private::Debugger &debugger); - protected: void PrivateInitialize(lldb_private::Process *process); diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp deleted file mode 100644 index f4d8a071e6d5d..0000000000000 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp +++ /dev/null @@ -1,53 +0,0 @@ -//===-- DynamicLoaderDarwinProperties.cpp ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "DynamicLoaderDarwinProperties.h" - -using namespace lldb_private; - -#define LLDB_PROPERTIES_dynamicloaderdarwin_experimental -#include "DynamicLoaderDarwinProperties.inc" - -enum { -#define LLDB_PROPERTIES_dynamicloaderdarwin_experimental -#include "DynamicLoaderDarwinPropertiesEnum.inc" -}; - -llvm::StringRef DynamicLoaderDarwinProperties::GetSettingName() { - static constexpr llvm::StringLiteral g_setting_name("darwin"); - return g_setting_name; -} - -DynamicLoaderDarwinProperties::ExperimentalProperties::ExperimentalProperties() - : Properties(std::make_shared( - GetExperimentalSettingsName())) { - m_collection_sp->Initialize(g_dynamicloaderdarwin_experimental_properties); -} - -DynamicLoaderDarwinProperties::DynamicLoaderDarwinProperties() - : Properties(std::make_shared(GetSettingName())), - m_experimental_properties(std::make_unique()) { - m_collection_sp->AppendProperty( - Properties::GetExperimentalSettingsName(), - "Experimental settings - setting these won't produce errors if the " - "setting is not present.", - true, m_experimental_properties->GetValueProperties()); -} - -bool DynamicLoaderDarwinProperties::GetEnableParallelImageLoad() const { - return m_experimental_properties->GetPropertyAtIndexAs( - ePropertyEnableParallelImageLoad, - g_dynamicloaderdarwin_experimental_properties - [ePropertyEnableParallelImageLoad] - .default_uint_value != 0); -} - -DynamicLoaderDarwinProperties &DynamicLoaderDarwinProperties::GetGlobal() { - static DynamicLoaderDarwinProperties g_settings; - return g_settings; -} diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h 
b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h deleted file mode 100644 index 4c5e800c4f3e4..0000000000000 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h +++ /dev/null @@ -1,34 +0,0 @@ -//===-- DynamicLoaderDarwinProperties.h -------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H -#define LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H - -#include "lldb/Core/UserSettingsController.h" - -namespace lldb_private { - -class DynamicLoaderDarwinProperties : public Properties { -public: - class ExperimentalProperties : public Properties { - public: - ExperimentalProperties(); - }; - static llvm::StringRef GetSettingName(); - static DynamicLoaderDarwinProperties &GetGlobal(); - DynamicLoaderDarwinProperties(); - ~DynamicLoaderDarwinProperties() override = default; - bool GetEnableParallelImageLoad() const; - -private: - std::unique_ptr m_experimental_properties; -}; - -} // namespace lldb_private - -#endif // LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td deleted file mode 100644 index c54580ce34729..0000000000000 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td +++ /dev/null @@ -1,8 +0,0 @@ -include "../../../../include/lldb/Core/PropertiesBase.td" - -let Definition = "dynamicloaderdarwin_experimental" in { - def EnableParallelImageLoad: Property<"enable-parallel-image-load", "Boolean">, - Global, - DefaultTrue, - Desc<"Load images in parallel.">; -} diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp index b05ed1ce2c823..f839948660aa0 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp @@ -1149,8 +1149,7 @@ bool DynamicLoaderMacOSXDYLD::IsFullyInitialized() { void DynamicLoaderMacOSXDYLD::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), - GetPluginDescriptionStatic(), CreateInstance, - DebuggerInitialize); + GetPluginDescriptionStatic(), CreateInstance); DynamicLoaderMacOS::Initialize(); } @@ -1159,11 +1158,6 @@ void DynamicLoaderMacOSXDYLD::Terminate() { PluginManager::UnregisterPlugin(CreateInstance); } -void DynamicLoaderMacOSXDYLD::DebuggerInitialize( - lldb_private::Debugger &debugger) { - CreateSettings(debugger); -} - llvm::StringRef DynamicLoaderMacOSXDYLD::GetPluginDescriptionStatic() { return "Dynamic loader plug-in that watches for shared library loads/unloads " "in MacOSX user processes."; diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h index 924e2fc107743..ae7451722a8d7 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h @@ -50,8 
+50,6 @@ class DynamicLoaderMacOSXDYLD : public lldb_private::DynamicLoaderDarwin { static lldb_private::DynamicLoader * CreateInstance(lldb_private::Process *process, bool force); - static void DebuggerInitialize(lldb_private::Debugger &debugger); - /// Called after attaching a process. /// /// Allow DynamicLoader plug-ins to execute some code after From 0afa872a0db41cba313df473aa3fea52a35c8e70 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Tue, 8 Apr 2025 08:54:10 +0900 Subject: [PATCH 0912/1029] [DirectX] Scalarize the dx.saturate intrinsic (#134381) The DXIL Saturate op only takes scalars. Fixes #134378. --- .../DirectX/DirectXTargetTransformInfo.cpp | 1 + llvm/test/CodeGen/DirectX/saturate.ll | 36 ++++++++++++++++--- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 765e1977041b9..4cf3282b108f1 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -44,6 +44,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_frac: case Intrinsic::dx_rsqrt: + case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: case Intrinsic::dx_wave_readlane: case Intrinsic::dx_wave_reduce_max: diff --git a/llvm/test/CodeGen/DirectX/saturate.ll b/llvm/test/CodeGen/DirectX/saturate.ll index 0bb1e55421046..1e4a235448a48 100644 --- a/llvm/test/CodeGen/DirectX/saturate.ll +++ b/llvm/test/CodeGen/DirectX/saturate.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s ; Make sure the intrinsic dx.saturate is lowered to the appropriate DXIL op for half/float/double data types.
; CHECK-LABEL: test_saturate_half @@ -28,9 +28,35 @@ entry: ret double %hlsl.saturate } -; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} +; CHECK-LABEL: test_saturate_half4 +define noundef <4 x half> @test_saturate_half4(<4 x half> noundef %p0) { +entry: + ; CHECK: call half @dx.op.unary.f16(i32 7, half + ; CHECK: call half @dx.op.unary.f16(i32 7, half + ; CHECK: call half @dx.op.unary.f16(i32 7, half + ; CHECK: call half @dx.op.unary.f16(i32 7, half + %hlsl.saturate = call <4 x half> @llvm.dx.saturate.v4f16(<4 x half> %p0) + ret <4 x half> %hlsl.saturate +} + +; CHECK-LABEL: test_saturate_float3 +define noundef <3 x float> @test_saturate_float3(<3 x float> noundef %p0) { +entry: + ; CHECK: call float @dx.op.unary.f32(i32 7, float + ; CHECK: call float @dx.op.unary.f32(i32 7, float + ; CHECK: call float @dx.op.unary.f32(i32 7, float + %hlsl.saturate = call <3 x float> @llvm.dx.saturate.v3f32(<3 x float> %p0) + ret <3 x float> %hlsl.saturate +} -declare half @llvm.dx.saturate.f16(half) -declare float @llvm.dx.saturate.f32(float) -declare double @llvm.dx.saturate.f64(double) +; CHECK-LABEL: test_saturate_double2 +define noundef <2 x double> @test_saturate_double2(<2 x double> noundef %p0) { +entry: + ; CHECK: call double @dx.op.unary.f64(i32 7, double + ; CHECK: call double @dx.op.unary.f64(i32 7, double + %hlsl.saturate = call <2 x double> @llvm.dx.saturate.v4f64(<2 x double> %p0) + ret <2 x double> %hlsl.saturate +} + +; CHECK: attributes #[[#ATTR]] = {{{.*}} memory(none) {{.*}}} From 99d12ac172f34e4963cf3a36536719149d30d788 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Mon, 7 Apr 2025 17:03:25 -0700 Subject: [PATCH 0913/1029] [bazel] Fix build after 65813e0e94c0403dad61e8365b39d76d7b3bfc14 --- .../lldb/source/Plugins/BUILD.bazel | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index a4b51463a9241..b204d8fa04584 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -1186,25 +1186,12 @@ cc_library( ], ) -gentbl_cc_library( - name = "DynamicLoaderMacOSXDYLDProperties", - strip_include_prefix = "DynamicLoader/MacOSX-DYLD", - tbl_outs = { - "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.inc": ["-gen-lldb-property-defs"], - "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinPropertiesEnum.inc": ["-gen-lldb-property-enum-defs"], - }, - tblgen = "//lldb:lldb-tblgen", - td_file = "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td", - deps = ["//lldb:CoreTdFiles"], -) - cc_library( name = "PluginDynamicLoaderMacOSXDYLD", srcs = glob(["DynamicLoader/MacOSX-DYLD/*.cpp"]), hdrs = glob(["DynamicLoader/MacOSX-DYLD/*.h"]), includes = [".."], deps = [ - ":DynamicLoaderMacOSXDYLDProperties", ":PluginObjCRuntime", ":PluginTypeSystemClang", ":PluginTypeSystemClangHeaders", From d7354e337a4602da1e5913b3e6fceda2e8c5ecc0 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Tue, 8 Apr 2025 08:04:43 +0800 Subject: [PATCH 0914/1029] [SLP][REVEC] Fix ShuffleVector does not consider alternate instruction. 
(#134599) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 5 +- .../SLPVectorizer/revec-shufflevector.ll | 190 +++++++++++++++--- 2 files changed, 169 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e9ba944924837..e6559f26be8c2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17770,13 +17770,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *Src = vectorizeOperand(E, 0); SmallVector ThisMask(calculateShufflevectorMask(E->Scalars)); if (auto *SVSrc = dyn_cast(Src)) { - assert(isa(SVSrc->getOperand(1)) && - "Not supported shufflevector usage."); SmallVector NewMask(ThisMask.size()); transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) { return SVSrc->getShuffleMask()[Mask]; }); - V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask); + V = Builder.CreateShuffleVector(SVSrc->getOperand(0), + SVSrc->getOperand(1), NewMask); } else { V = Builder.CreateShuffleVector(Src, ThisMask); } diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll index f11a0a9c024a2..b85c78ec8d2d0 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll @@ -1,14 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer,instcombine -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer,instcombine -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s --check-prefix COMBINE define void @test1(ptr %in, ptr %out) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64> -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> +; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret void ; +; COMBINE-LABEL: @test1( +; COMBINE-NEXT: entry: +; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 +; COMBINE-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64> +; COMBINE-NEXT: store <8 x i64> [[TMP1]], ptr [[OUT:%.*]], align 8 +; COMBINE-NEXT: ret void +; entry: %0 = load <8 x i32>, ptr %in, align 1 %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> @@ -34,11 +48,24 @@ define void @test2(ptr %in, ptr %out) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64> +; CHECK-NEXT: [[OUT:%.*]] = getelementptr 
inbounds i64, ptr [[OUT1:%.*]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> -; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[OUT:%.*]], align 8 +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret void ; +; COMBINE-LABEL: @test2( +; COMBINE-NEXT: entry: +; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 +; COMBINE-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64> +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> +; COMBINE-NEXT: store <8 x i64> [[TMP2]], ptr [[OUT:%.*]], align 8 +; COMBINE-NEXT: ret void +; entry: %0 = load <8 x i32>, ptr %in, align 1 %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> @@ -63,10 +90,19 @@ entry: define void @test3(<16 x i32> %0, ptr %out) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i32> @llvm.vector.insert.v64i32.v16i32(<64 x i32> poison, <16 x i32> [[TMP0:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <64 x i32> [[TMP3]], <64 x i32> poison, <16 x i32> +; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; CHECK-NEXT: ret void ; +; COMBINE-LABEL: @test3( +; COMBINE-NEXT: entry: +; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <16 x i32> +; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4 +; COMBINE-NEXT: ret void +; entry: %1 = shufflevector <16 x i32> %0, <16 x i32> poison, <4 x i32> %2 = shufflevector <16 x i32> %0, <16 x i32> poison, <4 x i32> @@ -87,10 +123,21 @@ define void @test4(ptr %in, ptr %out) { ; CHECK-LABEL: @test4( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 ; CHECK-NEXT: ret void ; +; COMBINE-LABEL: @test4( +; COMBINE-NEXT: entry: +; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4 +; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: store <16 x 
i32> [[TMP1]], ptr [[OUT:%.*]], align 4 +; COMBINE-NEXT: ret void +; entry: %0 = load <8 x i32>, ptr %in, align 4 %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> @@ -109,9 +156,18 @@ entry: define void @test5(ptr %out) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: entry: -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> poison, <8 x i32> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v8i32(<16 x i32> [[TMP0]], <8 x i32> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 0 +; CHECK-NEXT: store <8 x i32> [[TMP2]], ptr [[TMP3]], align 4 ; CHECK-NEXT: ret void ; +; COMBINE-LABEL: @test5( +; COMBINE-NEXT: entry: +; COMBINE-NEXT: store <8 x i32> zeroinitializer, ptr [[OUT:%.*]], align 4 +; COMBINE-NEXT: ret void +; entry: %0 = shufflevector <8 x i32> zeroinitializer, <8 x i32> poison, <4 x i32> %1 = shufflevector <8 x i32> zeroinitializer, <8 x i32> poison, <4 x i32> @@ -125,33 +181,71 @@ entry: define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[IN0:%.*]], i64 32 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32 ; CHECK-NEXT: [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = uitofp <32 x i8> [[TMP1]] to <32 x float> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = zext <32 x i8> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <32 x i16> [[TMP10]] to <32 x float> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP14]], <16 x float> [[TMP15]], <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP14]], <4 x float> [[LOAD2]], i64 8) ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP16]], <16 x float> poison, <32 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fmul <32 x float> [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32 -; CHECK-NEXT: [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128 +; CHECK-NEXT: [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32 +; CHECK-NEXT: [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128 ; CHECK-NEXT: [[TMP17:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; CHECK-NEXT: store <32 x float> [[TMP4]], ptr [[IN2]], align 16 ; CHECK-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[LOAD2]], 
<4 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> [[TMP8]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> [[LOAD5]], i64 0) +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP19]], <32 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float> +; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> [[LOAD2]], i64 0) +; CHECK-NEXT: [[TMP21:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP17]], i64 0) +; CHECK-NEXT: [[TMP22:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP20]], <4 x float> [[TMP21]], i64 4) +; CHECK-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> [[TMP17]], i64 4) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP22]], <4 x float> [[TMP23]], i64 8) ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <16 x float> [[TMP12]], [[TMP6]] ; CHECK-NEXT: store <16 x float> [[TMP13]], ptr [[GEP11]], align 16 ; CHECK-NEXT: ret void ; +; COMBINE-LABEL: @test6( +; COMBINE-NEXT: entry: +; COMBINE-NEXT: [[GEP1:%.*]] = getelementptr inbounds nuw i8, ptr [[IN0:%.*]], i64 32 +; COMBINE-NEXT: [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16 +; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16 +; COMBINE-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1 +; COMBINE-NEXT: [[TMP2:%.*]] = uitofp <32 x i8> [[TMP1]] to <32 x float> +; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP4]], <16 x i32> +; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> +; COMBINE-NEXT: [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]] +; COMBINE-NEXT: [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32 +; COMBINE-NEXT: [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128 +; COMBINE-NEXT: [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16 +; COMBINE-NEXT: store <32 x float> [[TMP7]], ptr [[IN2]], align 16 +; COMBINE-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 +; COMBINE-NEXT: [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float> +; COMBINE-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> +; COMBINE-NEXT: 
[[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> +; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> +; COMBINE-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32> +; COMBINE-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP17:%.*]] = fmul <16 x float> [[TMP16]], [[TMP9]] +; COMBINE-NEXT: store <16 x float> [[TMP17]], ptr [[GEP11]], align 16 +; COMBINE-NEXT: ret void +; entry: %gep0 = getelementptr inbounds i8, ptr %in0, i64 16 %gep1 = getelementptr inbounds i8, ptr %in0, i64 32 @@ -236,3 +330,53 @@ entry: store <4 x float> %fmul11, ptr %gep14, align 16 ret void } + +define i32 @test7() { +; CHECK-LABEL: @test7( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> poison, <8 x float> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP0]], <8 x float> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = fsub <16 x float> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <32 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP2]], <16 x float> [[TMP3]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP7]], <4 x float> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12) +; CHECK-NEXT: [[TMP10:%.*]] = fadd <16 x float> [[TMP9]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub <16 x float> [[TMP9]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[TMP9]], [[TMP12]] +; CHECK-NEXT: store <16 x float> [[TMP13]], ptr null, align 16 +; CHECK-NEXT: ret i32 0 +; +; COMBINE-LABEL: @test7( +; COMBINE-NEXT: entry: +; COMBINE-NEXT: store <16 x float> poison, ptr null, align 16 +; COMBINE-NEXT: ret i32 0 +; +entry: + %0 = getelementptr i8, ptr null, i64 16 + %1 = getelementptr i8, ptr null, i64 32 + %2 = getelementptr i8, ptr null, i64 48 + %3 = fadd <8 x float> zeroinitializer, zeroinitializer + %4 = fsub <8 x float> zeroinitializer, zeroinitializer + %5 = shufflevector <8 x float> %3, <8 x float> zeroinitializer, <4 x i32> + %6 = shufflevector <8 x float> %3, <8 x float> zeroinitializer, <4 x i32> + %7 = shufflevector <8 x float> %4, <8 x float> zeroinitializer, <4 x i32> + %8 = shufflevector <8 x float> %4, <8 x float> zeroinitializer, <4 x i32> + %9 = fsub <4 x float> zeroinitializer, %5 + %10 = fsub <4 x float> zeroinitializer, %6 + %11 = fadd <4 x float> zeroinitializer, %7 + %12 = fadd <4 x float> zeroinitializer, %8 + %13 = fadd <4 x float> zeroinitializer, %9 + %14 = fadd <4 x 
float> zeroinitializer, %10 + %15 = fadd <4 x float> zeroinitializer, %11 + %16 = fadd <4 x float> zeroinitializer, %12 + store <4 x float> %13, ptr %2, align 16 + store <4 x float> %14, ptr %1, align 16 + store <4 x float> %15, ptr %0, align 16 + store <4 x float> %16, ptr null, align 16 + ret i32 0 +} From 1f359610b048ae31e5ea3a0416f55405ff398753 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 8 Apr 2025 07:23:00 +0700 Subject: [PATCH 0915/1029] ObjCARC: Drop pointer bitcast handling (#134274) There is more in the file to drop, but this looks like the easier part. --- .../Transforms/ObjCARC/ObjCARCContract.cpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 311d3b1cfc0a0..e11748b2c9dbb 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -660,7 +660,6 @@ bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) { }; Value *Arg = cast(Inst)->getArgOperand(0); - Value *OrigArg = Arg; // TODO: Change this to a do-while. for (;;) { @@ -687,24 +686,6 @@ bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) { break; } } - - // Replace bitcast users of Arg that are dominated by Inst. - SmallVector BitCastUsers; - - // Add all bitcast users of the function argument first. - for (User *U : OrigArg->users()) - if (auto *BC = dyn_cast(U)) - BitCastUsers.push_back(BC); - - // Replace the bitcasts with the call return. Iterate until list is empty. - while (!BitCastUsers.empty()) { - auto *BC = BitCastUsers.pop_back_val(); - for (User *U : BC->users()) - if (auto *B = dyn_cast(U)) - BitCastUsers.push_back(B); - - ReplaceArgUses(BC); - } } // If this function has no escaping allocas or suspicious vararg usage, From 735f5b1cb45c4245f1c068dda8271142457925ec Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Mon, 7 Apr 2025 21:25:37 -0300 Subject: [PATCH 0916/1029] [clang] fix DependentTemplateSpecializationType transform (#134748) This changes the transform for DTST so it rebuilds the node if any of the template arguments changed. This fixes a regression reported here: https://github.com/llvm/llvm-project/pull/133610#issuecomment-2784576267 which was introduced by https://github.com/llvm/llvm-project/pull/133610 There are no release notes since the regression was never released. 
--- clang/lib/Sema/TreeTransform.h | 18 ++++++++++++------ clang/test/SemaTemplate/dependent-names.cpp | 9 +++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 57fcc4b3b3682..1e126a8875331 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7765,17 +7765,23 @@ QualType TreeTransform::TransformDependentTemplateSpecializationType( NewTemplateArgs.setLAngleLoc(TL.getLAngleLoc()); NewTemplateArgs.setRAngleLoc(TL.getRAngleLoc()); - typedef TemplateArgumentLocContainerIterator< - DependentTemplateSpecializationTypeLoc> ArgIterator; - if (getDerived().TransformTemplateArguments(ArgIterator(TL, 0), - ArgIterator(TL, TL.getNumArgs()), - NewTemplateArgs)) + auto ArgsRange = llvm::make_range>({TL, 0}, {TL, TL.getNumArgs()}); + + if (getDerived().TransformTemplateArguments(ArgsRange.begin(), + ArgsRange.end(), NewTemplateArgs)) return QualType(); + bool TemplateArgumentsChanged = !llvm::equal( + ArgsRange, NewTemplateArgs.arguments(), + [](const TemplateArgumentLoc &A, const TemplateArgumentLoc &B) { + return A.getArgument().structurallyEquals(B.getArgument()); + }); const DependentTemplateStorage &DTN = T->getDependentTemplateName(); QualType Result = TL.getType(); - if (getDerived().AlwaysRebuild() || SS.getScopeRep() != DTN.getQualifier()) { + if (getDerived().AlwaysRebuild() || SS.getScopeRep() != DTN.getQualifier() || + TemplateArgumentsChanged) { TemplateName Name = getDerived().RebuildTemplateName( SS, TL.getTemplateKeywordLoc(), DTN.getName(), TL.getTemplateNameLoc(), /*ObjectType=*/QualType(), /*FirstQualifierInScope=*/nullptr, diff --git a/clang/test/SemaTemplate/dependent-names.cpp b/clang/test/SemaTemplate/dependent-names.cpp index 92620e862fe3a..538abde3eddd5 100644 --- a/clang/test/SemaTemplate/dependent-names.cpp +++ b/clang/test/SemaTemplate/dependent-names.cpp @@ -458,3 +458,12 @@ namespace PR37680 { }; int f(b ba) { return ba.add<0>(); } } + +namespace TransformDependentTemplates { + template struct Test1 { + template + using Arg = typename T::template Arg; + void f(Arg); + void f(Arg); + }; +} // namespace TransformDependentTemplates From 1356e202b2b2747220e12c2a8cbd28846bb61548 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 17:27:26 -0700 Subject: [PATCH 0917/1029] [NFC][LLVM][BPF] Cleanup pass initialization for BPF (#134414) - Remove calls to pass initialization from pass constructors and move them to target initialization. - https://github.com/llvm/llvm-project/issues/111767 --- llvm/lib/Target/BPF/BPFMIChecking.cpp | 4 +--- llvm/lib/Target/BPF/BPFMIPeephole.cpp | 8 ++------ llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp | 4 +--- llvm/lib/Target/BPF/BPFTargetMachine.cpp | 3 +++ 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp index 0633836df73b6..4f5dfafda9efe 100644 --- a/llvm/lib/Target/BPF/BPFMIChecking.cpp +++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp @@ -32,9 +32,7 @@ struct BPFMIPreEmitChecking : public MachineFunctionPass { MachineFunction *MF; const TargetRegisterInfo *TRI; - BPFMIPreEmitChecking() : MachineFunctionPass(ID) { - initializeBPFMIPreEmitCheckingPass(*PassRegistry::getPassRegistry()); - } + BPFMIPreEmitChecking() : MachineFunctionPass(ID) {} private: // Initialize class variables. 
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp index 106572cdeb840..4febf3042fdd9 100644 --- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp +++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp @@ -51,9 +51,7 @@ struct BPFMIPeephole : public MachineFunctionPass { MachineFunction *MF; MachineRegisterInfo *MRI; - BPFMIPeephole() : MachineFunctionPass(ID) { - initializeBPFMIPeepholePass(*PassRegistry::getPassRegistry()); - } + BPFMIPeephole() : MachineFunctionPass(ID) {} private: // Initialize class variables. @@ -311,9 +309,7 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass { const BPFInstrInfo *TII; bool SupportGotol; - BPFMIPreEmitPeephole() : MachineFunctionPass(ID) { - initializeBPFMIPreEmitPeepholePass(*PassRegistry::getPassRegistry()); - } + BPFMIPreEmitPeephole() : MachineFunctionPass(ID) {} private: // Initialize class variables. diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp index f6735adbde640..666a5b6abfa4a 100644 --- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp +++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp @@ -50,9 +50,7 @@ struct BPFMISimplifyPatchable : public MachineFunctionPass { const BPFInstrInfo *TII; MachineFunction *MF; - BPFMISimplifyPatchable() : MachineFunctionPass(ID) { - initializeBPFMISimplifyPatchablePass(*PassRegistry::getPassRegistry()); - } + BPFMISimplifyPatchable() : MachineFunctionPass(ID) {} private: std::set SkipInsts; diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index 3379af6fe8744..4c4e6e27b9a5e 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -47,7 +47,10 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() { initializeGlobalISel(PR); initializeBPFCheckAndAdjustIRPass(PR); initializeBPFMIPeepholePass(PR); + initializeBPFMIPreEmitPeepholePass(PR); initializeBPFDAGToDAGISelLegacyPass(PR); + initializeBPFMISimplifyPatchablePass(PR); + initializeBPFMIPreEmitCheckingPass(PR); } // DataLayout: little or big endian From a3754ade637f71d36495eb95ea2d8cab9885c9d9 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 17:27:50 -0700 Subject: [PATCH 0918/1029] [NFC][LLVM][AMDGPU] Cleanup pass initialization for AMDGPU (#134410) - Remove calls to pass initialization from pass constructors. 
- https://github.com/llvm/llvm-project/issues/111767 --- llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 5 +---- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp | 7 ++----- .../AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp | 5 +---- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 6 +----- .../lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 5 +---- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 6 ++---- llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp | 5 +---- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 6 +----- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 6 +----- llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 10 ++-------- llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 6 +----- llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 4 +--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 ++ .../Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 5 +---- llvm/lib/Target/AMDGPU/R600.h | 3 +++ llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp | 10 +--------- llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp | 10 +--------- 24 files changed, 30 insertions(+), 99 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index 5a6868f96d970..7bcc128cb114f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -38,9 +38,7 @@ ImmutablePass *llvm::createAMDGPUExternalAAWrapperPass() { return new AMDGPUExternalAAWrapper(); } -AMDGPUAAWrapperPass::AMDGPUAAWrapperPass() : ImmutablePass(ID) { - initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry()); -} +AMDGPUAAWrapperPass::AMDGPUAAWrapperPass() : ImmutablePass(ID) {} void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 0cee3c3cb5e92..87fa845f3cff7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -17,15 +17,12 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" +#include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/Attributor.h" #define DEBUG_TYPE "amdgpu-attributor" -namespace llvm { -void initializeCycleInfoWrapperPassPass(PassRegistry &); -} // namespace llvm - using namespace llvm; static cl::opt KernargPreloadCount( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 9c482aeb3ea5c..df92847c1ba71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -336,9 +336,7 @@ class AMDGPUCodeGenPrepareImpl class AMDGPUCodeGenPrepare : public FunctionPass { public: static char ID; - AMDGPUCodeGenPrepare() : FunctionPass(ID) { - initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry()); - } + AMDGPUCodeGenPrepare() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); 
AU.addRequired(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp index 8236ff609f851..2b64146a32675 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp @@ -139,10 +139,7 @@ void ExportClustering::apply(ScheduleDAGInstrs *DAG) { } // end namespace -namespace llvm { - -std::unique_ptr createAMDGPUExportClusteringDAGMutation() { +std::unique_ptr +llvm::createAMDGPUExportClusteringDAGMutation() { return std::make_unique(); } - -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp index a5660432603fd..f924335844da2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -34,10 +34,7 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass { static char ID; public: - AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) { - initializeAMDGPUGlobalISelDivergenceLoweringPass( - *PassRegistry::getPassRegistry()); - } + AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index bbd262748d680..7b4d00c8214cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2696,16 +2696,12 @@ bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { } // namespace -namespace llvm { - /// \p Phase specifes whether or not this is a reentry into the /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the /// same scheduling region (e.g. pre and post-RA scheduling / multiple /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. 
std::unique_ptr -createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { +llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { return std::make_unique(Phase); } - -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 183c55729b0b1..4448570b6b979 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2312,10 +2312,7 @@ class AMDGPULowerBufferFatPointers : public ModulePass { public: static char ID; - AMDGPULowerBufferFatPointers() : ModulePass(ID) { - initializeAMDGPULowerBufferFatPointersPass( - *PassRegistry::getPassRegistry()); - } + AMDGPULowerBufferFatPointers() : ModulePass(ID) {} bool run(Module &M, const TargetMachine &TM); bool runOnModule(Module &M) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 3c08d1edb4991..f9f2d43a5b041 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -1502,10 +1502,8 @@ class AMDGPULowerModuleLDSLegacy : public ModulePass { const AMDGPUTargetMachine *TM; static char ID; - AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM_ = nullptr) - : ModulePass(ID), TM(TM_) { - initializeAMDGPULowerModuleLDSLegacyPass(*PassRegistry::getPassRegistry()); - } + AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM = nullptr) + : ModulePass(ID), TM(TM) {} void getAnalysisUsage(AnalysisUsage &AU) const override { if (!TM) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp index 4d9f08b3af01d..eda479064d7b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -15,7 +15,7 @@ #include "AMDGPUMachineModuleInfo.h" #include "llvm/IR/Module.h" -namespace llvm { +using namespace llvm; AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) : MachineModuleInfoELF(MMI) { @@ -34,5 +34,3 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) SingleThreadOneAddressSpaceSSID = CTX.getOrInsertSyncScopeID("singlethread-one-as"); } - -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp index 937e9e812ec44..9b6bb56c85d24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp @@ -42,10 +42,7 @@ class AMDGPUMarkLastScratchLoadLegacy : public MachineFunctionPass { public: static char ID; - AMDGPUMarkLastScratchLoadLegacy() : MachineFunctionPass(ID) { - initializeAMDGPUMarkLastScratchLoadLegacyPass( - *PassRegistry::getPassRegistry()); - } + AMDGPUMarkLastScratchLoadLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp index b2a8143b82ab6..a52a6aef2bc39 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -473,8 +473,6 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { - 
initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -519,8 +517,6 @@ INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after legalization", false, false) -namespace llvm { -FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) { +FunctionPass *llvm::createAMDGPUPostLegalizeCombiner(bool IsOptNone) { return new AMDGPUPostLegalizerCombiner(IsOptNone); } -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp index 4ce3c0107d566..ca97591a87110 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -248,8 +248,6 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { - initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -296,8 +294,6 @@ INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs before legalization", false, false) -namespace llvm { -FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) { +FunctionPass *llvm::createAMDGPUPreLegalizeCombiner(bool IsOptNone) { return new AMDGPUPreLegalizerCombiner(IsOptNone); } -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 459f85ae6169a..9847fbf108b0c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -42,7 +42,7 @@ class AMDGPUPrintfRuntimeBinding final : public ModulePass { public: static char ID; - explicit AMDGPUPrintfRuntimeBinding(); + explicit AMDGPUPrintfRuntimeBinding() : ModulePass(ID) {} private: bool runOnModule(Module &M) override; @@ -76,15 +76,9 @@ INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding", char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID; -namespace llvm { -ModulePass *createAMDGPUPrintfRuntimeBinding() { +ModulePass *llvm::createAMDGPUPrintfRuntimeBinding() { return new AMDGPUPrintfRuntimeBinding(); } -} // namespace llvm - -AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding() : ModulePass(ID) { - initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry()); -} void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers( SmallVectorImpl &OpConvSpecifiers, StringRef Fmt, diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index b416d9756297c..8f9ad38d101a1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -427,8 +427,6 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { - initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry()); - if (!RuleConfig.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } @@ -473,8 +471,6 @@ INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after regbankselect", false, false) 
-namespace llvm { -FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) { +FunctionPass *llvm::createAMDGPURegBankCombiner(bool IsOptNone) { return new AMDGPURegBankCombiner(IsOptNone); } -} // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index d5a83903e2b13..ad6a0772fe8b6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -40,9 +40,7 @@ class AMDGPURegBankLegalize : public MachineFunctionPass { static char ID; public: - AMDGPURegBankLegalize() : MachineFunctionPass(ID) { - initializeAMDGPURegBankLegalizePass(*PassRegistry::getPassRegistry()); - } + AMDGPURegBankLegalize() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp index 8a0c9faa34631..fe73aac0763e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp @@ -35,9 +35,7 @@ class AMDGPURegBankSelect : public MachineFunctionPass { public: static char ID; - AMDGPURegBankSelect() : MachineFunctionPass(ID) { - initializeAMDGPURegBankSelectPass(*PassRegistry::getPassRegistry()); - } + AMDGPURegBankSelect() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp index f255bfc128d6b..3183f617a2628 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReserveWWMRegs.cpp @@ -32,9 +32,7 @@ class AMDGPUReserveWWMRegsLegacy : public MachineFunctionPass { public: static char ID; - AMDGPUReserveWWMRegsLegacy() : MachineFunctionPass(ID) { - initializeAMDGPUReserveWWMRegsLegacyPass(*PassRegistry::getPassRegistry()); - } + AMDGPUReserveWWMRegsLegacy() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp index 79e9312034da4..1c135f09080e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp @@ -72,9 +72,7 @@ namespace { class AMDGPURewriteUndefForPHILegacy : public FunctionPass { public: static char ID; - AMDGPURewriteUndefForPHILegacy() : FunctionPass(ID) { - initializeAMDGPURewriteUndefForPHILegacyPass(*PassRegistry::getPassRegistry()); - } + AMDGPURewriteUndefForPHILegacy() : FunctionPass(ID) {} bool runOnFunction(Function &F) override; StringRef getPassName() const override { return "AMDGPU Rewrite Undef for PHI"; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index cc0d374c99254..0c60ba4db29ae 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -1285,9 +1285,7 @@ class AMDGPUSwLowerLDSLegacy : public ModulePass { const AMDGPUTargetMachine *AMDGPUTM; static char ID; AMDGPUSwLowerLDSLegacy(const AMDGPUTargetMachine *TM) - : ModulePass(ID), AMDGPUTM(TM) { - initializeAMDGPUSwLowerLDSLegacyPass(*PassRegistry::getPassRegistry()); - } + : ModulePass(ID), AMDGPUTM(TM) {} bool runOnModule(Module &M) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 4b5c70f09155f..f9029d3e496f8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -489,6 +489,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeR600PacketizerPass(*PR); initializeR600ExpandSpecialInstrsPassPass(*PR); initializeR600VectorRegMergerPass(*PR); + initializeR600EmitClauseMarkersPass(*PR); + initializeR600MachineCFGStructurizerPass(*PR); initializeGlobalISel(*PR); initializeAMDGPUDAGToDAGISelLegacyPass(*PR); initializeGCNDPPCombineLegacyPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index d087fbc86545c..733c5d520fb23 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -74,10 +74,7 @@ class AMDGPUUnifyDivergentExitNodesImpl { class AMDGPUUnifyDivergentExitNodes : public FunctionPass { public: static char ID; - AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) { - initializeAMDGPUUnifyDivergentExitNodesPass( - *PassRegistry::getPassRegistry()); - } + AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnFunction(Function &F) override; }; diff --git a/llvm/lib/Target/AMDGPU/R600.h b/llvm/lib/Target/AMDGPU/R600.h index 6c40c2813e204..9236675ce082a 100644 --- a/llvm/lib/Target/AMDGPU/R600.h +++ b/llvm/lib/Target/AMDGPU/R600.h @@ -45,6 +45,9 @@ extern char &R600VectorRegMergerID; void initializeR600PacketizerPass(PassRegistry &); extern char &R600PacketizerID; +void initializeR600EmitClauseMarkersPass(PassRegistry &); +void initializeR600MachineCFGStructurizerPass(PassRegistry &); + } // End namespace llvm #endif diff --git a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp index 0fa8d4847931a..b88d655d4e613 100644 --- a/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ b/llvm/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -21,12 +21,6 @@ using namespace llvm; -namespace llvm { - - void initializeR600EmitClauseMarkersPass(PassRegistry&); - -} // end namespace llvm - namespace { class R600EmitClauseMarkers : public MachineFunctionPass { @@ -289,9 +283,7 @@ class R600EmitClauseMarkers : public MachineFunctionPass { public: static char ID; - R600EmitClauseMarkers() : MachineFunctionPass(ID) { - initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry()); - } + R600EmitClauseMarkers() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { const R600Subtarget &ST = MF.getSubtarget(); diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp index f8b40b0a1cdfc..b3dd68b6a1433 100644 --- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp @@ -41,12 +41,6 @@ STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); -namespace llvm { - -void initializeR600MachineCFGStructurizerPass(PassRegistry &); - -} // end namespace llvm - namespace { //===----------------------------------------------------------------------===// @@ -104,9 +98,7 @@ class R600MachineCFGStructurizer : public MachineFunctionPass { static char ID; - R600MachineCFGStructurizer() : 
MachineFunctionPass(ID) { - initializeR600MachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); - } + R600MachineCFGStructurizer() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "AMDGPU Control Flow Graph structurizer Pass"; From 707367621679742eae0e903470a5fe043424c698 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 17:28:26 -0700 Subject: [PATCH 0919/1029] [NFC][LLVM][DirectX] Cleanup pass initialization for DirectX (#134419) - Remove calls to pass initialization from pass constructors. - https://github.com/llvm/llvm-project/issues/111767 --- llvm/lib/Target/DirectX/DXContainerGlobals.cpp | 4 +--- llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp | 7 +------ llvm/lib/Target/DirectX/DirectXTargetMachine.cpp | 1 + 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 5279847419a81..27451074581ee 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -51,9 +51,7 @@ class DXContainerGlobals : public llvm::ModulePass { public: static char ID; // Pass identification, replacement for typeid - DXContainerGlobals() : ModulePass(ID) { - initializeDXContainerGlobalsPass(*PassRegistry::getPassRegistry()); - } + DXContainerGlobals() : ModulePass(ID) {} StringRef getPassName() const override { return "DXContainer Global Emitter"; diff --git a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp index cf3fb34bba437..88e23479cba2a 100644 --- a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp +++ b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp @@ -262,13 +262,8 @@ class DXILPrettyPrinterLegacy : public llvm::ModulePass { public: static char ID; - DXILPrettyPrinterLegacy() : ModulePass(ID), OS(dbgs()) { - initializeDXILPrettyPrinterLegacyPass(*PassRegistry::getPassRegistry()); - } - explicit DXILPrettyPrinterLegacy(raw_ostream &O) : ModulePass(ID), OS(O) { - initializeDXILPrettyPrinterLegacyPass(*PassRegistry::getPassRegistry()); - } + explicit DXILPrettyPrinterLegacy(raw_ostream &O) : ModulePass(ID), OS(O) {} StringRef getPassName() const override { return "DXIL Metadata Pretty Printer"; diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index ce408b4034f83..747e4b3eb9411 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -64,6 +64,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeDirectXTarget() { initializeShaderFlagsAnalysisWrapperPass(*PR); initializeRootSignatureAnalysisWrapperPass(*PR); initializeDXILFinalizeLinkageLegacyPass(*PR); + initializeDXILPrettyPrinterLegacyPass(*PR); } class DXILTargetObjectFile : public TargetLoweringObjectFile { From 87afe4de22f24c81d6eca8c2ecdec5e57d6a961a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 8 Apr 2025 07:33:07 +0700 Subject: [PATCH 0920/1029] OMPIRBuilder: Cleanup checking for GlobalValue (#134583) GlobalValue already subsumes GlobalObject and GlobalVariable, plus make use of the std::get<0> already copied to a variable above. 
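
As a quick illustration of why the single check suffices (a sketch for this note, not part of the patch): `GlobalVariable` derives from `GlobalObject`, which derives from `GlobalValue`, so one `isa` test subsumes all three of the removed cases.

```cpp
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

// Sketch only; the helper name is hypothetical. isa<GlobalValue> covers the
// isa<GlobalObject> and isa<GlobalVariable> checks removed by this patch,
// since both classes derive (directly or transitively) from GlobalValue.
static bool isGlobalInput(const llvm::Value *V) {
  return llvm::isa<llvm::GlobalValue>(V);
}
```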
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d59a144d3cf99..6e5f267043b48 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7092,9 +7092,7 @@ static Expected<Function *> createOutlinedFunction(
   // preceding mapped arguments that refer to the same global that may be
   // seperate segments. To prevent this, we defer global processing until all
   // other processing has been performed.
-  if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
-      llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
-      llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
+  if (isa<llvm::GlobalValue>(Input)) {
     DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
     continue;
   }

From 8286b804d519d045fa1ee3a1a2e7fae66e5e30ef Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 8 Apr 2025 07:36:55 +0700
Subject: [PATCH 0921/1029] OMPIRBuilder: Do not try to expand uses of
 ConstantData (#134584)

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 +++
 llvm/lib/IR/ReplaceConstant.cpp           | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6e5f267043b48..28662efc02882 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -7097,6 +7097,9 @@ static Expected<Function *> createOutlinedFunction(
       continue;
     }
 
+    if (isa<llvm::ConstantData>(Input))
+      continue;
+
     ReplaceValue(Input, InputCopy, Func);
   }
 
diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp
index a31cfe6cca3c1..962368f061851 100644
--- a/llvm/lib/IR/ReplaceConstant.cpp
+++ b/llvm/lib/IR/ReplaceConstant.cpp
@@ -56,6 +56,9 @@ bool convertUsersOfConstantsToInstructions(ArrayRef<Constant *> Consts,
   // Find all expandable direct users of Consts.
   SmallVector<Constant *> Stack;
   for (Constant *C : Consts) {
+    assert(!isa<ConstantData>(C) &&
+           "should not be expanding trivial constant users");
+
     if (IncludeSelf) {
       assert(isExpandableUser(C) && "One of the constants is not expandable");
       Stack.push_back(C);

From 76e219d9c3b46e0cc1813e1e8452da0f4f54ab92 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 8 Apr 2025 07:40:57 +0700
Subject: [PATCH 0922/1029] SeparateConstOffsetFromGEP: Add more tests with
 lower-gep (#134684)

I didn't see any failures while trying to break hasMoreThanOneUseInLoop
or other paths here.
--- .../AMDGPU/lower-gep.ll | 482 ++++++++++++++++++ 1 file changed, 482 insertions(+) create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll new file mode 100644 index 0000000000000..687e921640492 --- /dev/null +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -passes='separate-const-offset-from-gep' \ +; RUN: -reassociate-geps-verify-no-dead-code -S | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" + +%struct.Packed = type <{ [3 x i32], [8 x i64] }> ; <> means packed + +@packed_struct_array = addrspace(3) global [1024 x %struct.Packed] poison, align 1 + +; Verifies we can emit correct uglygep if the address is not natually +; aligned. This shoult not produce a no-op bitcast with opaque +; pointers. +define ptr addrspace(3) @packed_struct(i32 %i, i32 %j) { +; CHECK-LABEL: @packed_struct( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i64 0 to i32 +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDXPROM]], 77824 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(3) @packed_struct_array, i32 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[I:%.*]], 76 +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[UGLYGEP]], i32 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[J:%.*]], 3 +; CHECK-NEXT: [[UGLYGEP4:%.*]] = getelementptr i8, ptr addrspace(3) [[UGLYGEP3]], i32 [[TMP2]] +; CHECK-NEXT: [[UGLYGEP5:%.*]] = getelementptr i8, ptr addrspace(3) [[UGLYGEP4]], i32 112 +; CHECK-NEXT: ret ptr addrspace(3) [[UGLYGEP5]] +; +entry: + %add = add nsw i32 %j, 3 + %add1 = add nsw i32 %i, 1 + %arrayidx3 = getelementptr inbounds [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i64 0, i32 %add1, i32 1, i32 %add + ret ptr addrspace(3) %arrayidx3 +} + +%struct = type { i32, i32, i32 } + +define i32 @test1(ptr %ptr, i64 %idx) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[IDX:%.*]], 12 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 4 +; CHECK-NEXT: [[LV_1:%.*]] = load i32, ptr [[UGLYGEP1]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[LV_1]], 0 +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[IDX]], 12 +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP2]] +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[UGLYGEP2]], i64 8 +; CHECK-NEXT: [[LV_2:%.*]] = load i32, ptr [[UGLYGEP3]], align 4 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[LV_1]], [[LV_2]] +; CHECK-NEXT: ret i32 [[RES]] +; CHECK: else: +; CHECK-NEXT: ret i32 0 +; + %gep.1 = getelementptr %struct, ptr %ptr, i64 %idx, i32 1 + %lv.1 = load i32, ptr %gep.1 + %c = icmp slt i32 %lv.1, 0 + br i1 %c, label %then, label %else + +then: + %gep.2 = getelementptr %struct, ptr %ptr, i64 %idx, i32 2 + %lv.2 = load i32, ptr %gep.2 + %res = add i32 %lv.1, %lv.2 + ret i32 %res + +else: + ret i32 0 +} + +define i32 @test1_fatptr(ptr addrspace(7) %ptr, i64 %idx) 
{ +; CHECK-LABEL: @test1_fatptr( +; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i64 [[IDX:%.*]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[IDXPROM]], 12 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR:%.*]], i32 [[TMP1]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr addrspace(7) [[UGLYGEP]], i32 4 +; CHECK-NEXT: [[LV_1:%.*]] = load i32, ptr addrspace(7) [[UGLYGEP1]], align 4 +; CHECK-NEXT: [[C:%.*]] = icmp slt i32 [[LV_1]], 0 +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK: then: +; CHECK-NEXT: [[IDXPROM2:%.*]] = trunc i64 [[IDX]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[IDXPROM2]], 12 +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr addrspace(7) [[PTR]], i32 [[TMP2]] +; CHECK-NEXT: [[UGLYGEP4:%.*]] = getelementptr i8, ptr addrspace(7) [[UGLYGEP3]], i32 8 +; CHECK-NEXT: [[LV_2:%.*]] = load i32, ptr addrspace(7) [[UGLYGEP4]], align 4 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[LV_1]], [[LV_2]] +; CHECK-NEXT: ret i32 [[RES]] +; CHECK: else: +; CHECK-NEXT: ret i32 0 +; + %gep.1 = getelementptr %struct, ptr addrspace(7) %ptr, i64 %idx, i32 1 + %lv.1 = load i32, ptr addrspace(7) %gep.1 + %c = icmp slt i32 %lv.1, 0 + br i1 %c, label %then, label %else + +then: + %gep.2 = getelementptr %struct, ptr addrspace(7) %ptr, i64 %idx, i32 2 + %lv.2 = load i32, ptr addrspace(7) %gep.2 + %res = add i32 %lv.1, %lv.2 + ret i32 %res + +else: + ret i32 0 +} + + +; Test lowerToSingleIndexGEPs +define void @test_A_sub_B_add_ConstantInt(ptr %p) { +; CHECK-LABEL: @test_A_sub_B_add_ConstantInt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @foo() +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[TMP0]], 5 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[K:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[K]], 5 +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[MUL]], [[REM]] +; CHECK-NEXT: [[CMP26:%.*]] = icmp ult i32 [[SUB1]], 512 +; CHECK-NEXT: br i1 [[CMP26]], label [[COND_TRUE:%.*]], label [[COND_END]] +; CHECK: cond.true: +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[REM]] to i64 +; CHECK-NEXT: [[SUB22:%.*]] = sub i64 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[SUB22]], 2 +; CHECK-NEXT: [[UGLYGEP4:%.*]] = getelementptr i8, ptr [[UGLYGEP3:%.*]], i64 2044 +; CHECK-NEXT: [[UGLYGEP5:%.*]] = getelementptr i8, ptr [[UGLYGEP4]], i64 [[TMP3]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[UGLYGEP5]], align 4 +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 100 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %0 = tail call i32 @foo() + %rem = srem i32 %0, 5 + %add = add nsw i32 %rem , 511 + br label %for.body + +for.body: + %k = phi i32 [ 0, %entry ], [ %inc, %cond.end ] + %mul = mul nuw nsw i32 %k, 5 + %sub1 = sub nsw i32 %mul, %rem + %cmp26 = icmp ult i32 %sub1, 512 + br i1 %cmp26, label %cond.true, label %cond.end + +cond.true: + %sub2 = sub nsw i32 %add, %mul + %idxprom = sext i32 %sub2 to i64 + %arryidx = getelementptr inbounds float, ptr %p, i64 %idxprom + store float 1.0, ptr %arryidx, align 4 + br label %cond.end + +cond.end: + %inc = add nuw nsw i32 %k, 1 + %exitcond = icmp ne i32 %inc, 100 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} + +@extern_array = 
global [1024 x i32] poison, align 16 + +; Test lowerToSingleIndexGEPs with a global variable pointer +define void @test_A_sub_B_add_ConstantInt_gv_baseptr(ptr %p) { +; CHECK-LABEL: @test_A_sub_B_add_ConstantInt_gv_baseptr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @foo() +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[TMP0]], 5 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[K:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[K]], 5 +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[MUL]], [[REM]] +; CHECK-NEXT: [[CMP26:%.*]] = icmp ult i32 [[SUB1]], 512 +; CHECK-NEXT: br i1 [[CMP26]], label [[COND_TRUE:%.*]], label [[COND_END]] +; CHECK: cond.true: +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[REM]] to i64 +; CHECK-NEXT: [[SUB22:%.*]] = sub i64 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[SUB22]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr @extern_array, i64 2044 +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 [[TMP3]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[UGLYGEP3]], align 4 +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 100 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %0 = tail call i32 @foo() + %rem = srem i32 %0, 5 + %add = add nsw i32 %rem , 511 + br label %for.body + +for.body: + %k = phi i32 [ 0, %entry ], [ %inc, %cond.end ] + %mul = mul nuw nsw i32 %k, 5 + %sub1 = sub nsw i32 %mul, %rem + %cmp26 = icmp ult i32 %sub1, 512 + br i1 %cmp26, label %cond.true, label %cond.end + +cond.true: + %sub2 = sub nsw i32 %add, %mul + %idxprom = sext i32 %sub2 to i64 + %arryidx = getelementptr inbounds float, ptr @extern_array, i64 %idxprom + store float 1.0, ptr %arryidx, align 4 + br label %cond.end + +cond.end: + %inc = add nuw nsw i32 %k, 1 + %exitcond = icmp ne i32 %inc, 100 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} + +; Test lowerToSingleIndexGEPs with a constant data variable pointer +define void @test_A_sub_B_add_ConstantInt_null_basptr() { +; CHECK-LABEL: @test_A_sub_B_add_ConstantInt_null_basptr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @foo() +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[TMP0]], 5 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[K:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[COND_END:%.*]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[K]], 5 +; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[MUL]], [[REM]] +; CHECK-NEXT: [[CMP26:%.*]] = icmp ult i32 [[SUB1]], 512 +; CHECK-NEXT: br i1 [[CMP26]], label [[COND_TRUE:%.*]], label [[COND_END]] +; CHECK: cond.true: +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[REM]] to i64 +; CHECK-NEXT: [[SUB22:%.*]] = sub i64 [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[SUB22]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr null, i64 2044 +; CHECK-NEXT: [[UGLYGEP3:%.*]] = getelementptr i8, ptr [[UGLYGEP]], i64 [[TMP3]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[UGLYGEP3]], align 4 +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[K]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 100 
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %0 = tail call i32 @foo() + %rem = srem i32 %0, 5 + %add = add nsw i32 %rem , 511 + br label %for.body + +for.body: + %k = phi i32 [ 0, %entry ], [ %inc, %cond.end ] + %mul = mul nuw nsw i32 %k, 5 + %sub1 = sub nsw i32 %mul, %rem + %cmp26 = icmp ult i32 %sub1, 512 + br i1 %cmp26, label %cond.true, label %cond.end + +cond.true: + %sub2 = sub nsw i32 %add, %mul + %idxprom = sext i32 %sub2 to i64 + %arryidx = getelementptr inbounds float, ptr null, i64 %idxprom + store float 1.0, ptr %arryidx, align 4 + br label %cond.end + +cond.end: + %inc = add nuw nsw i32 %k, 1 + %exitcond = icmp ne i32 %inc, 100 + br i1 %exitcond, label %for.body, label %for.end + +for.end: + ret void +} + +declare i32 @foo() + +define amdgpu_kernel void @multi_use_in_loop(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) { +; CHECK-LABEL: @multi_use_in_loop( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = sext i32 [[ARG2:%.*]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG:%.*]], i64 [[TMP]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[BB6:%.*]], label [[BB8:%.*]] +; CHECK: bb6: +; CHECK-NEXT: br label [[BB11:%.*]] +; CHECK: bb7: +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb8: +; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP30:%.*]], [[BB7:%.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG1:%.*]], i64 [[TMP]] +; CHECK-NEXT: store i32 [[TMP9]], ptr addrspace(1) [[TMP10]], align 4 +; CHECK-NEXT: ret void +; CHECK: bb11: +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP30]], [[BB22:%.*]] ], [ 0, [[BB6]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[TMP25:%.*]], [[BB22]] ], [ 0, [[BB6]] ] +; CHECK-NEXT: [[TMP14:%.*]] = srem i32 [[TMP13]], [[ARG2]] +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(1) [[TMP16]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = icmp sgt i32 [[TMP17]], 100 +; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: br i1 [[TMP18]], label [[BB20:%.*]], label [[BB22]] +; CHECK: bb20: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG1]], i64 [[TMP19]] +; CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP21]], align 4 +; CHECK-NEXT: br label [[BB22]] +; CHECK: bb22: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) [[TMP23]], align 4 +; CHECK-NEXT: [[TMP25]] = add nuw nsw i32 [[TMP13]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(1) [[ARG1]], i64 [[TMP1]] +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr addrspace(1) [[UGLYGEP]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) [[UGLYGEP2]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP24]], [[TMP12]] +; CHECK-NEXT: [[TMP30]] = add i32 [[TMP29]], [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP25]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP31]], label [[BB7]], label [[BB11]] +; +bb: + %tmp = sext i32 %arg2 to i64 + 
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp + %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4 + %tmp5 = icmp sgt i32 %tmp4, 0 + br i1 %tmp5, label %bb6, label %bb8 + +bb6: ; preds = %bb + br label %bb11 + +bb7: ; preds = %bb22 + br label %bb8 + +bb8: ; preds = %bb7, %bb + %tmp9 = phi i32 [ 0, %bb ], [ %tmp30, %bb7 ] + %tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp + store i32 %tmp9, ptr addrspace(1) %tmp10, align 4 + ret void + +bb11: ; preds = %bb22, %bb6 + %tmp12 = phi i32 [ %tmp30, %bb22 ], [ 0, %bb6 ] + %tmp13 = phi i32 [ %tmp25, %bb22 ], [ 0, %bb6 ] + %tmp14 = srem i32 %tmp13, %arg2 + %tmp15 = sext i32 %tmp14 to i64 + %tmp16 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp15 + %tmp17 = load i32, ptr addrspace(1) %tmp16, align 4 + %tmp18 = icmp sgt i32 %tmp17, 100 + %tmp19 = sext i32 %tmp13 to i64 + br i1 %tmp18, label %bb20, label %bb22 + +bb20: ; preds = %bb11 + %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp19 + store i32 0, ptr addrspace(1) %tmp21, align 4 + br label %bb22 + +bb22: ; preds = %bb20, %bb11 + %tmp23 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp19 + %tmp24 = load i32, ptr addrspace(1) %tmp23, align 4 + %tmp25 = add nuw nsw i32 %tmp13, 1 + %tmp26 = sext i32 %tmp25 to i64 + %tmp27 = getelementptr inbounds i32, ptr addrspace(1) %arg1, i64 %tmp26 + %tmp28 = load i32, ptr addrspace(1) %tmp27, align 4 + %tmp29 = add i32 %tmp24, %tmp12 + %tmp30 = add i32 %tmp29, %tmp28 + %tmp31 = icmp eq i32 %tmp25, %tmp4 + br i1 %tmp31, label %bb7, label %bb11 +} + +@extern_array_1 = external addrspace(1) global [4096 x i32], align 16 + +@llvm.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @extern_array_1 to ptr) ] + +define void @use_in_other_func() { +; CHECK-LABEL: @use_in_other_func( +; CHECK-NEXT: store i32 0, ptr addrspace(1) @extern_array_1, align 4 +; CHECK-NEXT: store i32 0, ptr addrspace(1) @extern_array_1, align 4 +; CHECK-NEXT: ret void +; + store i32 0, ptr addrspace(1) @extern_array_1 + store i32 0, ptr addrspace(1) @extern_array_1 + ret void +} + +define amdgpu_kernel void @multi_use_in_loop_global_base_address(ptr addrspace(1) nocapture readonly %arg, i32 %arg2) { +; CHECK-LABEL: @multi_use_in_loop_global_base_address( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = sext i32 [[ARG2:%.*]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG:%.*]], i64 [[TMP]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[BB6:%.*]], label [[BB8:%.*]] +; CHECK: bb6: +; CHECK-NEXT: br label [[BB11:%.*]] +; CHECK: bb7: +; CHECK-NEXT: br label [[BB8]] +; CHECK: bb8: +; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP30:%.*]], [[BB7:%.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 [[TMP]] +; CHECK-NEXT: store i32 [[TMP9]], ptr addrspace(1) [[TMP10]], align 4 +; CHECK-NEXT: ret void +; CHECK: bb11: +; CHECK-NEXT: [[TMP12:%.*]] = phi i32 [ [[TMP30]], [[BB22:%.*]] ], [ 0, [[BB6]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi i32 [ [[TMP25:%.*]], [[BB22]] ], [ 0, [[BB6]] ] +; CHECK-NEXT: [[TMP14:%.*]] = srem i32 [[TMP13]], [[ARG2]] +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(1) [[TMP16]], align 4 
+; CHECK-NEXT: [[TMP18:%.*]] = icmp sgt i32 [[TMP17]], 100 +; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: br i1 [[TMP18]], label [[BB20:%.*]], label [[BB22]] +; CHECK: bb20: +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 [[TMP19]] +; CHECK-NEXT: store i32 0, ptr addrspace(1) [[TMP21]], align 4 +; CHECK-NEXT: br label [[BB22]] +; CHECK: bb22: +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(1) [[TMP23]], align 4 +; CHECK-NEXT: [[TMP25]] = add nuw nsw i32 [[TMP13]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(1) @extern_array_1, i64 [[TMP1]] +; CHECK-NEXT: [[UGLYGEP2:%.*]] = getelementptr i8, ptr addrspace(1) [[UGLYGEP]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr addrspace(1) [[UGLYGEP2]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP24]], [[TMP12]] +; CHECK-NEXT: [[TMP30]] = add i32 [[TMP29]], [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP25]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP31]], label [[BB7]], label [[BB11]] +; +bb: + %tmp = sext i32 %arg2 to i64 + %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp + %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4 + %tmp5 = icmp sgt i32 %tmp4, 0 + br i1 %tmp5, label %bb6, label %bb8 + +bb6: ; preds = %bb + br label %bb11 + +bb7: ; preds = %bb22 + br label %bb8 + +bb8: ; preds = %bb7, %bb + %tmp9 = phi i32 [ 0, %bb ], [ %tmp30, %bb7 ] + %tmp10 = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 %tmp + store i32 %tmp9, ptr addrspace(1) %tmp10, align 4 + ret void + +bb11: ; preds = %bb22, %bb6 + %tmp12 = phi i32 [ %tmp30, %bb22 ], [ 0, %bb6 ] + %tmp13 = phi i32 [ %tmp25, %bb22 ], [ 0, %bb6 ] + %tmp14 = srem i32 %tmp13, %arg2 + %tmp15 = sext i32 %tmp14 to i64 + %tmp16 = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 %tmp15 + %tmp17 = load i32, ptr addrspace(1) %tmp16, align 4 + %tmp18 = icmp sgt i32 %tmp17, 100 + %tmp19 = sext i32 %tmp13 to i64 + br i1 %tmp18, label %bb20, label %bb22 + +bb20: ; preds = %bb11 + %tmp21 = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 %tmp19 + store i32 0, ptr addrspace(1) %tmp21, align 4 + br label %bb22 + +bb22: ; preds = %bb20, %bb11 + %tmp23 = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 %tmp19 + %tmp24 = load i32, ptr addrspace(1) %tmp23, align 4 + %tmp25 = add nuw nsw i32 %tmp13, 1 + %tmp26 = sext i32 %tmp25 to i64 + %tmp27 = getelementptr inbounds i32, ptr addrspace(1) @extern_array_1, i64 %tmp26 + %tmp28 = load i32, ptr addrspace(1) %tmp27, align 4 + %tmp29 = add i32 %tmp24, %tmp12 + %tmp30 = add i32 %tmp29, %tmp28 + %tmp31 = icmp eq i32 %tmp25, %tmp4 + br i1 %tmp31, label %bb7, label %bb11 +} From 1a9928442b7dbcd596d6b577809c22330706ccb5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 8 Apr 2025 07:44:45 +0700 Subject: [PATCH 0923/1029] SeparateConstOffsetFromGEP: Avoid looking at constant uses (#134685) We could be more aggressive and inspect uses of global variables, if the use context instruction is in the same function. 
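
For context, here is a hedged sketch of the guarded walk; it mirrors hasMoreThanOneUseInLoop from the diff below, with a made-up standalone function name for this note.

```cpp
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Instruction.h"

// Sketch only: users() of a Constant span the whole module, so without the
// early bail-out the loop below could count instructions from unrelated
// functions against this loop.
static bool hasMoreThanOneUseInLoopSketch(llvm::Value *V, llvm::Loop *L) {
  if (llvm::isa<llvm::Constant>(V)) // the guard this patch adds
    return false;
  int UsesInLoop = 0;
  for (llvm::User *U : V->users())
    if (auto *I = llvm::dyn_cast<llvm::Instruction>(U))
      if (L->contains(I) && ++UsesInLoop > 1)
        return true;
  return false;
}
```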
---
 llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 5 +++++
 .../SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll        | 4 ++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index ab8e979e7b40a..e048015298461 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1353,6 +1353,11 @@ bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
 }
 
 bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+  // TODO: Could look at uses of globals, but we need to make sure we are
+  // looking at the correct function.
+  if (isa<Constant>(V))
+    return false;
+
   int UsesInLoop = 0;
   for (User *U : V->users()) {
     if (Instruction *User = dyn_cast<Instruction>(U))
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll
index 687e921640492..2305209dc0818 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/lower-gep.ll
@@ -425,8 +425,8 @@ define amdgpu_kernel void @multi_use_in_loop_global_base_address(ptr addrspace(1
 ; CHECK-NEXT:    [[TMP25]] = add nuw nsw i32 [[TMP13]], 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(1) @extern_array_1, i64 [[TMP1]]
-; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr addrspace(1) [[UGLYGEP]], i64 4
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(1) @extern_array_1, i64 4
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = getelementptr i8, ptr addrspace(1) [[UGLYGEP]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr addrspace(1) [[UGLYGEP2]], align 4
 ; CHECK-NEXT:    [[TMP29:%.*]] = add i32 [[TMP24]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP30]] = add i32 [[TMP29]], [[TMP28]]

From a168ddc470825091ad52da12042fb38491ed81d6 Mon Sep 17 00:00:00 2001
From: Bruno Cardoso Lopes
Date: Mon, 7 Apr 2025 17:53:18 -0700
Subject: [PATCH 0924/1029] [MLIR][LLVM] Block address support (#134335)

Add support for import and translate.

MLIR does not support using basic block references outside a function
(as LLVM does); this PR does not consider changes to MLIR in that
respect. It instead introduces two new ops: `llvm.blockaddress` and
`llvm.blocktag`. Here's an example:

```
llvm.func @ba() -> !llvm.ptr {
  %0 = llvm.blockaddress <function = @ba, tag = <id = 1>> : !llvm.ptr
  llvm.br ^bb1
^bb1:  // pred: ^bb0
  llvm.blocktag <id = 1>
  llvm.return %0 : !llvm.ptr
}
```

Value `%0` holds the address of the block tagged as `id = 1` in function
`@ba`. Block tags need to be unique within a function, and use of
`llvm.blockaddress` requires a matching tag in an `llvm.blocktag`
operation.
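
On the LLVM side, the new ops correspond to the standard `blockaddress` constant. A rough sketch of what translation ultimately materializes, assuming the function and its tagged block are already emitted (the helper name and lookup are illustrative only):

```cpp
#include "llvm/IR/Constants.h" // llvm::BlockAddress
#include "llvm/IR/Module.h"

// Sketch: for the llvm.blockaddress <function = @ba, tag = <id = 1>> example
// above, the exporter resolves the block carrying llvm.blocktag <id = 1> to
// its LLVM basic block and builds the constant from it.
llvm::Constant *emitBlockAddress(llvm::Module &M, llvm::BasicBlock *TaggedBB) {
  llvm::Function *F = M.getFunction("ba");
  return llvm::BlockAddress::get(F, TaggedBB);
}
```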
---
 .../mlir/Dialect/LLVMIR/LLVMAttrDefs.td       |  19 +++++
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td   |  78 +++++++++++++++++++
 .../mlir/Target/LLVMIR/ModuleTranslation.h    |  35 +++++++++
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp    |  75 ++++++++++++++++++
 .../Transforms/InlinerInterfaceImpl.cpp       |   6 +-
 .../LLVMIR/LLVMToLLVMIRTranslation.cpp        |  53 +++++++++++++
 mlir/lib/Target/LLVMIR/ModuleImport.cpp       |  27 ++++++-
 mlir/lib/Target/LLVMIR/ModuleTranslation.cpp  |  26 +++++++
 .../LLVMIR/blockaddress-canonicalize.mlir     |  48 ++++++++++++
 .../test/Dialect/LLVMIR/constant-folding.mlir |  15 ++++
 mlir/test/Dialect/LLVMIR/inlining.mlir        |  17 ++++
 mlir/test/Dialect/LLVMIR/invalid.mlir         |  22 ++++++
 mlir/test/Dialect/LLVMIR/roundtrip.mlir       |  21 +++++
 .../test/Target/LLVMIR/Import/blockaddress.ll |  32 ++++++++
 .../Target/LLVMIR/Import/import-failure.ll    |  36 +++------
 mlir/test/Target/LLVMIR/blockaddress.mlir     |  36 +++++++++
 16 files changed, 515 insertions(+), 31 deletions(-)
 create mode 100644 mlir/test/Dialect/LLVMIR/blockaddress-canonicalize.mlir
 create mode 100644 mlir/test/Target/LLVMIR/Import/blockaddress.ll
 create mode 100644 mlir/test/Target/LLVMIR/blockaddress.mlir

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index 549a37de2e412..690243525ede4 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -1224,6 +1224,25 @@ def LLVM_DSOLocalEquivalentAttr : LLVM_Attr<"DSOLocalEquivalent",
   let assemblyFormat = "$sym";
 }
 
+//===----------------------------------------------------------------------===//
+// BlockAddressAttr
+//===----------------------------------------------------------------------===//
+
+def LLVM_BlockTagAttr : LLVM_Attr<"BlockTag", "blocktag"> {
+  let parameters = (ins "uint32_t":$id);
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+/// Folded into from LLVM_BlockAddressAttr.
+def LLVM_BlockAddressAttr : LLVM_Attr<"BlockAddress", "blockaddress"> {
+  let description = [{
+    Describes a block address identified by a pair of `$function` and `$tag`.
+  }];
+  let parameters = (ins "FlatSymbolRefAttr":$function,
+                        "BlockTagAttr":$tag);
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
 //===----------------------------------------------------------------------===//
 // VecTypeHintAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index 423cf948b03e1..b107b64e55b46 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1625,6 +1625,84 @@ def LLVM_DSOLocalEquivalentOp : LLVM_Op<"dso_local_equivalent",
   let hasFolder = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// BlockAddressOp & BlockTagOp
+//===----------------------------------------------------------------------===//
+
+def LLVM_BlockAddressOp : LLVM_Op<"blockaddress",
+    [Pure, ConstantLike, DeclareOpInterfaceMethods<SymbolUserOpInterface>]> {
+  let arguments = (ins LLVM_BlockAddressAttr:$block_addr);
+  let results = (outs LLVM_AnyPointer:$res);
+
+  let summary = "Creates a LLVM blockaddress ptr";
+
+  let description = [{
+    Creates an SSA value containing a pointer to a basic block. The block
+    address information (function and block) is given by the `BlockAddressAttr`
+    attribute. This operation assumes an existing `llvm.blocktag` operation
+    identifying an existing MLIR block within a function.
+    Example:
+
+    ```mlir
+    llvm.mlir.global private @g() : !llvm.ptr {
+      %0 = llvm.blockaddress <function = @fn, tag = <id = 1>> : !llvm.ptr
+      llvm.return %0 : !llvm.ptr
+    }
+
+    llvm.func @fn() {
+      llvm.br ^bb1
+    ^bb1:  // pred: ^bb0
+      llvm.blocktag <id = 1>
+      llvm.return
+    }
+    ```
+  }];
+
+  let assemblyFormat = [{
+    $block_addr
+    attr-dict `:` qualified(type($res))
+  }];
+
+  let extraClassDeclaration = [{
+    /// Return the llvm.func operation that is referenced here.
+    LLVMFuncOp getFunction(SymbolTableCollection &symbolTable);
+
+    /// Search for the matching `llvm.blocktag` operation. This is performed
+    /// by walking the function in `block_addr`.
+    BlockTagOp getBlockTagOp();
+  }];
+
+  let hasVerifier = 1;
+  let hasFolder = 1;
+}
+
+def LLVM_BlockTagOp : LLVM_Op<"blocktag"> {
+  let description = [{
+    This operation uses a `tag` to uniquely identify an MLIR block in a
+    function. The same tag is used by `llvm.blockaddress` in order to compute
+    the target address.
+
+    A given function should have at most one `llvm.blocktag` operation with a
+    given `tag`. This operation cannot be used as a terminator but can be
+    placed everywhere else in a block.
+
+    Example:
+
+    ```mlir
+    llvm.func @f() -> !llvm.ptr {
+      %addr = llvm.blockaddress <function = @f, tag = <id = 1>> : !llvm.ptr
+      llvm.br ^bb1
+    ^bb1:
+      llvm.blocktag <id = 1>
+      llvm.return %addr : !llvm.ptr
+    }
+    ```
+  }];
+  let arguments = (ins LLVM_BlockTagAttr:$tag);
+  let assemblyFormat = [{ $tag attr-dict }];
+  // Covered as part of LLVMFuncOp verifier.
+  let hasVerifier = 0;
+}
+
 def LLVM_ComdatSelectorOp : LLVM_Op<"comdat_selector", [Symbol]> {
   let arguments = (ins
     SymbolNameAttr:$sym_name,
diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
index 99b1b65aeb6a5..30c190e50a4f7 100644
--- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
+++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h
@@ -136,6 +136,29 @@ class ModuleTranslation {
     return callMapping.lookup(op);
   }
 
+  /// Maps a blockaddress operation to its corresponding placeholder LLVM
+  /// value.
+  void mapUnresolvedBlockAddress(BlockAddressOp op, llvm::Value *cst) {
+    auto result = unresolvedBlockAddressMapping.try_emplace(op, cst);
+    (void)result;
+    assert(result.second &&
+           "attempting to map a blockaddress that is already mapped");
+  }
+
+  /// Maps a blockaddress operation to its corresponding placeholder LLVM
+  /// value.
+  void mapBlockTag(BlockAddressAttr attr, BlockTagOp blockTag) {
+    // Attempts to map already mapped block labels which is fine if the given
+    // labels are verified to be unique.
+    blockTagMapping[attr] = blockTag;
+  }
+
+  /// Finds an MLIR block that corresponds to the given MLIR call
+  /// operation.
+  BlockTagOp lookupBlockTag(BlockAddressAttr attr) const {
+    return blockTagMapping.lookup(attr);
+  }
+
   /// Removes the mapping for blocks contained in the region and values defined
   /// in these blocks.
   void forgetMapping(Region &region);
@@ -338,6 +361,8 @@ class ModuleTranslation {
   LogicalResult convertFunctions();
   LogicalResult convertComdats();
 
+  LogicalResult convertUnresolvedBlockAddress();
+
   /// Handle conversion for both globals and global aliases.
   ///
   /// - Create named global variables that correspond to llvm.mlir.global
@@ -433,6 +458,16 @@ class ModuleTranslation {
   /// This map is populated on module entry.
   DenseMap<ComdatSelectorOp, llvm::Comdat *> comdatMapping;
 
+  /// Mapping from llvm.blockaddress operations to their corresponding LLVM
+  /// constant placeholders. After all basic blocks are translated, this
+  /// mapping is used to replace the placeholders with the LLVM block addresses.
+  DenseMap<BlockAddressOp, llvm::Value *> unresolvedBlockAddressMapping;
+
+  /// Mapping from a BlockAddressAttr attribute to a matching BlockTagOp. This
+  /// is used to cache BlockTagOp locations instead of walking a LLVMFuncOp in
+  /// search for those.
+  DenseMap<BlockAddressAttr, BlockTagOp> blockTagMapping;
+
   /// Stack of user-specified state elements, useful when translating operations
   /// with regions.
   SmallVector<std::unique_ptr<StackFrame>> stack;
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 252bdd1425d5e..42a3839e8c638 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -2305,6 +2305,28 @@ static LogicalResult verifyComdat(Operation *op,
   return success();
 }
 
+static LogicalResult verifyBlockTags(LLVMFuncOp funcOp) {
+  llvm::DenseSet<BlockTagAttr> blockTags;
+  BlockTagOp badBlockTagOp;
+  if (funcOp
+          .walk([&](BlockTagOp blockTagOp) {
+            if (blockTags.contains(blockTagOp.getTag())) {
+              badBlockTagOp = blockTagOp;
+              return WalkResult::interrupt();
+            }
+            blockTags.insert(blockTagOp.getTag());
+            return WalkResult::advance();
+          })
+          .wasInterrupted()) {
+    badBlockTagOp.emitError()
+        << "duplicate block tag '" << badBlockTagOp.getTag().getId()
+        << "' in the same function: ";
+    return failure();
+  }
+
+  return success();
+}
+
 /// Parse common attributes that might show up in the same order in both
 /// GlobalOp and AliasOp.
 template <typename OpType>
@@ -3060,6 +3082,9 @@ LogicalResult LLVMFuncOp::verify() {
     return emitError(diagnosticMessage);
   }
 
+  if (failed(verifyBlockTags(*this)))
+    return failure();
+
   return success();
 }
 
@@ -3815,6 +3840,56 @@ void InlineAsmOp::getEffects(
   }
 }
 
+//===----------------------------------------------------------------------===//
+// BlockAddressOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+BlockAddressOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+  Operation *symbol = symbolTable.lookupSymbolIn(parentLLVMModule(*this),
+                                                 getBlockAddr().getFunction());
+  auto function = dyn_cast_or_null<LLVMFuncOp>(symbol);
+
+  if (!function)
+    return emitOpError("must reference a function defined by 'llvm.func'");
+
+  return success();
+}
+
+LLVMFuncOp BlockAddressOp::getFunction(SymbolTableCollection &symbolTable) {
+  return dyn_cast_or_null<LLVMFuncOp>(symbolTable.lookupSymbolIn(
+      parentLLVMModule(*this), getBlockAddr().getFunction()));
+}
+
+BlockTagOp BlockAddressOp::getBlockTagOp() {
+  auto funcOp = dyn_cast<LLVMFuncOp>(mlir::SymbolTable::lookupNearestSymbolFrom(
+      parentLLVMModule(*this), getBlockAddr().getFunction()));
+  if (!funcOp)
+    return nullptr;
+
+  BlockTagOp blockTagOp = nullptr;
+  funcOp.walk([&](LLVM::BlockTagOp labelOp) {
+    if (labelOp.getTag() == getBlockAddr().getTag()) {
+      blockTagOp = labelOp;
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+  return blockTagOp;
+}
+
+LogicalResult BlockAddressOp::verify() {
+  if (!getBlockTagOp())
+    return emitOpError(
+        "expects an existing block label target in the referenced function");
+
+  return success();
+}
+
+/// Fold a blockaddress operation to a dedicated blockaddress
+/// attribute.
+OpFoldResult BlockAddressOp::fold(FoldAdaptor) { return getBlockAddr(); } + //===----------------------------------------------------------------------===// // AssumeOp (intrinsic) //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp index 1edf7fd070b27..7f3afffc9645e 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp @@ -731,8 +731,10 @@ struct LLVMInlinerInterface : public DialectInlinerInterface { } bool isLegalToInline(Operation *op, Region *, bool, IRMapping &) const final { - // The inliner cannot handle variadic function arguments. - return !isa(op); + // The inliner cannot handle variadic function arguments and blocktag + // operations prevent inlining since they the blockaddress operations + // reference them via the callee symbol. + return !(isa(op) || isa(op)); } /// Handle the given inlined return by replacing it with a branch. This diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 10b68a333bcbd..738f036bb376a 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -555,6 +555,59 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } + // Emit blockaddress. We first need to find the LLVM block referenced by this + // operation and then create a LLVM block address for it. + if (auto blockAddressOp = dyn_cast(opInst)) { + // getBlockTagOp() walks a function to search for block labels. Check + // whether it's in cache first. + BlockAddressAttr blockAddressAttr = blockAddressOp.getBlockAddr(); + BlockTagOp blockTagOp = moduleTranslation.lookupBlockTag(blockAddressAttr); + if (!blockTagOp) { + blockTagOp = blockAddressOp.getBlockTagOp(); + moduleTranslation.mapBlockTag(blockAddressAttr, blockTagOp); + } + + llvm::Value *llvmValue = nullptr; + StringRef fnName = blockAddressAttr.getFunction().getValue(); + if (llvm::BasicBlock *llvmBlock = + moduleTranslation.lookupBlock(blockTagOp->getBlock())) { + llvm::Function *llvmFn = moduleTranslation.lookupFunction(fnName); + llvmValue = llvm::BlockAddress::get(llvmFn, llvmBlock); + } else { + // The matching LLVM block is not yet emitted, a placeholder is created + // in its place. When the LLVM block is emitted later in translation, + // the llvmValue is replaced with the actual llvm::BlockAddress. + // A GlobalVariable is chosen as placeholder because in general LLVM + // constants are uniqued and are not proper for RAUW, since that could + // harm unrelated uses of the constant. + llvmValue = new llvm::GlobalVariable( + *moduleTranslation.getLLVMModule(), + llvm::PointerType::getUnqual(moduleTranslation.getLLVMContext()), + /*isConstant=*/true, llvm::GlobalValue::LinkageTypes::ExternalLinkage, + /*Initializer=*/nullptr, + Twine("__mlir_block_address_") + .concat(Twine(fnName)) + .concat(Twine((uint64_t)blockAddressOp.getOperation()))); + moduleTranslation.mapUnresolvedBlockAddress(blockAddressOp, llvmValue); + } + + moduleTranslation.mapValue(blockAddressOp.getResult(), llvmValue); + return success(); + } + + // Emit block label. If this label is seen before BlockAddressOp is + // translated, go ahead and already map it. 
+ if (auto blockTagOp = dyn_cast(opInst)) { + auto funcOp = blockTagOp->getParentOfType(); + BlockAddressAttr blockAddressAttr = BlockAddressAttr::get( + &moduleTranslation.getContext(), + FlatSymbolRefAttr::get(&moduleTranslation.getContext(), + funcOp.getName()), + blockTagOp.getTag()); + moduleTranslation.mapBlockTag(blockAddressAttr, blockTagOp); + return success(); + } + return failure(); } diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 5f047a59a9828..2859abdb41772 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1381,9 +1381,18 @@ FailureOr ModuleImport::convertConstant(llvm::Constant *constant) { return builder.create(loc, targetExtType).getRes(); } + if (auto *blockAddr = dyn_cast(constant)) { + auto fnSym = + FlatSymbolRefAttr::get(context, blockAddr->getFunction()->getName()); + auto blockTag = + BlockTagAttr::get(context, blockAddr->getBasicBlock()->getNumber()); + return builder + .create(loc, convertType(blockAddr->getType()), + BlockAddressAttr::get(context, fnSym, blockTag)) + .getRes(); + } + StringRef error = ""; - if (isa(constant)) - error = " since blockaddress(...) is unsupported"; if (isa(constant)) error = " since ptrauth(...) is unsupported"; @@ -2448,8 +2457,13 @@ LogicalResult ModuleImport::processFunction(llvm::Function *func) { SmallVector reachableBasicBlocks; for (llvm::BasicBlock &basicBlock : *func) { // Skip unreachable blocks. - if (!reachable.contains(&basicBlock)) + if (!reachable.contains(&basicBlock)) { + if (basicBlock.hasAddressTaken()) + return emitError(funcOp.getLoc()) + << "unreachable block '" << basicBlock.getName() + << "' with address taken"; continue; + } Region &body = funcOp.getBody(); Block *block = builder.createBlock(&body, body.end()); mapBlock(&basicBlock, block); @@ -2606,6 +2620,13 @@ LogicalResult ModuleImport::processBasicBlock(llvm::BasicBlock *bb, } } } + + if (bb->hasAddressTaken()) { + OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(block); + builder.create(block->getParentOp()->getLoc(), + BlockTagAttr::get(context, bb->getNumber())); + } return success(); } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index d30cb8a7d7974..ee7dc3a5231f4 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1824,6 +1824,27 @@ LogicalResult ModuleTranslation::convertComdats() { return success(); } +LogicalResult ModuleTranslation::convertUnresolvedBlockAddress() { + for (auto &[blockAddressOp, llvmCst] : unresolvedBlockAddressMapping) { + BlockAddressAttr blockAddressAttr = blockAddressOp.getBlockAddr(); + BlockTagOp blockTagOp = lookupBlockTag(blockAddressAttr); + assert(blockTagOp && "expected all block tags to be already seen"); + + llvm::BasicBlock *llvmBlock = lookupBlock(blockTagOp->getBlock()); + assert(llvmBlock && "expected LLVM blocks to be already translated"); + + // Update mapping with new block address constant. 
+ auto *llvmBlockAddr = llvm::BlockAddress::get( + lookupFunction(blockAddressAttr.getFunction().getValue()), llvmBlock); + llvmCst->replaceAllUsesWith(llvmBlockAddr); + mapValue(blockAddressOp.getResult(), llvmBlockAddr); + assert(llvmCst->use_empty() && "expected all uses to be replaced"); + cast(llvmCst)->eraseFromParent(); + } + unresolvedBlockAddressMapping.clear(); + return success(); +} + void ModuleTranslation::setAccessGroupsMetadata(AccessGroupOpInterface op, llvm::Instruction *inst) { if (llvm::MDNode *node = loopAnnotationTranslation->getAccessGroups(op)) @@ -2236,6 +2257,11 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, if (failed(translator.convertFunctions())) return nullptr; + // Now that all MLIR blocks are resolved into LLVM ones, patch block address + // constants to point to the correct blocks. + if (failed(translator.convertUnresolvedBlockAddress())) + return nullptr; + // Once we've finished constructing elements in the module, we should convert // it to use the debug info format desired by LLVM. // See https://llvm.org/docs/RemoveDIsDebugInfo.html diff --git a/mlir/test/Dialect/LLVMIR/blockaddress-canonicalize.mlir b/mlir/test/Dialect/LLVMIR/blockaddress-canonicalize.mlir new file mode 100644 index 0000000000000..11dd6f0d97f78 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/blockaddress-canonicalize.mlir @@ -0,0 +1,48 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(llvm.func(canonicalize{region-simplify=aggressive}))' -split-input-file | FileCheck %s + +llvm.mlir.global private @x() {addr_space = 0 : i32, dso_local} : !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +// CHECK-LABEL: llvm.func @ba() +llvm.func @ba() -> !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.br ^bb1 +^bb1: + // CHECK: llvm.blocktag + llvm.blocktag + llvm.br ^bb2 +^bb2: + // CHECK: llvm.blocktag + llvm.blocktag + llvm.return %0 : !llvm.ptr +} + +// ----- + + +llvm.mlir.global private @g() {addr_space = 0 : i32, dso_local} : !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +llvm.mlir.global private @h() {addr_space = 0 : i32, dso_local} : !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +// CHECK-LABEL: llvm.func @fn +llvm.func @fn(%cond : i1, %arg0 : i32, %arg1 : i32) -> i32 { + llvm.cond_br %cond, ^bb1, ^bb2 +^bb1: + // CHECK: llvm.blocktag + // CHECK: llvm.return + llvm.blocktag + llvm.return %arg0 : i32 +^bb2: + // CHECK: llvm.blocktag + // CHECK: llvm.return + llvm.blocktag + llvm.return %arg1 : i32 +} diff --git a/mlir/test/Dialect/LLVMIR/constant-folding.mlir b/mlir/test/Dialect/LLVMIR/constant-folding.mlir index 99f657f0aefec..0616f19b8fddb 100644 --- a/mlir/test/Dialect/LLVMIR/constant-folding.mlir +++ b/mlir/test/Dialect/LLVMIR/constant-folding.mlir @@ -196,3 +196,18 @@ llvm.func @dso_local_equivalent_select(%arg: i1) -> !llvm.ptr { } llvm.func @yay() + +// ----- + +// CHECK-LABEL: llvm.func @blockaddress_select +llvm.func @blockaddress_select(%arg: i1) -> !llvm.ptr { + // CHECK-NEXT: %[[ADDR:.+]] = llvm.blockaddress > + %0 = llvm.blockaddress > : !llvm.ptr + %1 = llvm.blockaddress > : !llvm.ptr + %2 = arith.select %arg, %0, %1 : !llvm.ptr + // CHECK-NEXT: llvm.br ^bb1 + llvm.br ^bb1 +^bb1: + llvm.blocktag + llvm.return %1 : !llvm.ptr +} diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index 136d0f85d509a..551e0c97912d0 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ 
b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -692,3 +692,20 @@ llvm.func @caller(%x : i32) -> i32 { %z = llvm.call @unreachable_func(%x) : (i32) -> (i32) llvm.return %z : i32 } + +// ----- +// Check that @func is not inlined because of llvm.blocktag + +func.func @func(%arg0 : i32) -> i32 { + llvm.blocktag + llvm.return %arg0 : i32 +} + +// CHECK-LABEL: @llvm_ret +func.func @llvm_ret(%arg0 : i32) -> i32 { + // CHECK-NOT: llvm.blocktag + // CHECK: %[[R:.*]] = call + %res = call @func(%arg0) : (i32) -> (i32) + // CHECK: return %[[R]] + return %res : i32 +} diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index fb9631d99b91a..e70e3185af236 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -1780,3 +1780,25 @@ module { // expected-error@+1 {{failed to parse ModuleFlagAttr parameter 'value' which is to be a `uint32_t`}} llvm.module_flags [#llvm.mlir.module_flag] } + +// ----- + +llvm.func @t0() -> !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.blocktag + llvm.br ^bb1 +^bb1: + // expected-error@+1 {{duplicate block tag '1' in the same function}} + llvm.blocktag + llvm.return %0 : !llvm.ptr +} + +// ----- + +llvm.func @t1() -> !llvm.ptr { + // expected-error@+1 {{expects an existing block label target in the referenced function}} + %0 = llvm.blockaddress > : !llvm.ptr + llvm.br ^bb1 +^bb1: + llvm.return %0 : !llvm.ptr +} diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index d0aa65d14a176..88460fe374d87 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -1002,3 +1002,24 @@ llvm.func @intrinsic_call_arg_attrs_bundles(%arg0: i32) -> i32 { %0 = llvm.call_intrinsic "llvm.riscv.sha256sig0"(%arg0) ["adazdazd"()] {constant} : (i32 {llvm.signext}) -> (i32) llvm.return %0 : i32 } + +llvm.mlir.global private @blockaddr_global() {addr_space = 0 : i32, dso_local} : !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +// CHECK: llvm.mlir.global private @blockaddr_global() {{.*}} +// CHECK-NEXT: %{{.*}} = llvm.blockaddress > : !llvm.ptr +// CHECK-NEXT: llvm.return %{{.*}} : !llvm.ptr + +llvm.func @blockaddr_fn() { + llvm.br ^bb1 +^bb1: + llvm.blocktag + llvm.return +} + +// CHECK-LABEL: llvm.func @blockaddr_fn +// CHECK-NEXT: llvm.br ^bb1 +// CHECK-NEXT:^bb1: +// CHECK-NEXT: llvm.blocktag diff --git a/mlir/test/Target/LLVMIR/Import/blockaddress.ll b/mlir/test/Target/LLVMIR/Import/blockaddress.ll new file mode 100644 index 0000000000000..fb6ef1b6c3a2b --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/blockaddress.ll @@ -0,0 +1,32 @@ +; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s + +@g = private global ptr blockaddress(@fn, %bb1) +define void @fn() { + br label %bb1 +bb1: + ret void +} + +; CHECK: llvm.mlir.global private @g() +; CHECK: %[[ADDR:.*]] = llvm.blockaddress > : !llvm.ptr +; CHECK: llvm.return %[[ADDR]] : !llvm.ptr + +; CHECK: llvm.func @fn() { +; CHECK: llvm.br ^[[RET_BB:.*]] +; CHECK: ^[[RET_BB]]: +; CHECK: llvm.blocktag +; CHECK: llvm.return +; CHECK: } + +; // ----- + +; CHECK-LABEL: blockaddr0 +define ptr @blockaddr0() { + br label %bb1 + ; CHECK: %[[BLOCKADDR:.*]] = llvm.blockaddress > : !llvm.ptr +bb1: + ; CHECK: [[BLOCKADDR]]: + ; CHECK: llvm.blocktag + ; CHECK-NEXT: llvm.return %[[BLOCKADDR]] : !llvm.ptr + ret ptr blockaddress(@blockaddr0, %bb1) +} diff --git a/mlir/test/Target/LLVMIR/Import/import-failure.ll 
b/mlir/test/Target/LLVMIR/Import/import-failure.ll index d3ea3a510d7f8..4fbf187659a7b 100644 --- a/mlir/test/Target/LLVMIR/Import/import-failure.ll +++ b/mlir/test/Target/LLVMIR/Import/import-failure.ll @@ -12,32 +12,6 @@ bb2: ; // ----- -; CHECK: -; CHECK-SAME: unhandled constant: ptr blockaddress(@unhandled_constant, %bb1) since blockaddress(...) is unsupported -; CHECK: -; CHECK-SAME: error: unhandled instruction: ret ptr blockaddress(@unhandled_constant, %bb1) -define ptr @unhandled_constant() { - br label %bb1 -bb1: - ret ptr blockaddress(@unhandled_constant, %bb1) -} - -; // ----- - -; CHECK: -; CHECK-SAME: unhandled constant: ptr blockaddress(@unhandled_global, %bb1) since blockaddress(...) is unsupported -; CHECK: -; CHECK-SAME: error: unhandled global variable: @private = private global ptr blockaddress(@unhandled_global, %bb1) -@private = private global ptr blockaddress(@unhandled_global, %bb1) - -define void @unhandled_global() { - br label %bb1 -bb1: - ret void -} - -; // ----- - ; Check that debug intrinsics with an unsupported argument are dropped. declare void @llvm.dbg.value(metadata, metadata, metadata) @@ -376,3 +350,13 @@ bb2: declare i32 @g() declare i32 @__gxx_personality_v0(...) + +; // ----- + +@g = private global ptr blockaddress(@fn, %bb1) +define void @fn() { + ret void +; CHECK: unreachable block 'bb1' with address taken +bb1: + ret void +} diff --git a/mlir/test/Target/LLVMIR/blockaddress.mlir b/mlir/test/Target/LLVMIR/blockaddress.mlir new file mode 100644 index 0000000000000..fb3d853531122 --- /dev/null +++ b/mlir/test/Target/LLVMIR/blockaddress.mlir @@ -0,0 +1,36 @@ +// RUN: mlir-translate -mlir-to-llvmir %s -split-input-file | FileCheck %s + +llvm.mlir.global private @g() {addr_space = 0 : i32, dso_local} : !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +llvm.func @fn() { + llvm.br ^bb1 +^bb1: + llvm.blocktag + llvm.return +} + +// CHECK: @g = private global ptr blockaddress(@fn, %1) +// CHECK: define void @fn() { +// CHECK: br label %[[RET:.*]] +// CHECK: [[RET]]: +// CHECK: ret void +// CHECK: } + +// ----- + +llvm.func @blockaddr0() -> !llvm.ptr { + %0 = llvm.blockaddress > : !llvm.ptr + llvm.br ^bb1 +^bb1: + llvm.blocktag + llvm.return %0 : !llvm.ptr +} + +// CHECK: define ptr @blockaddr0() { +// CHECK: br label %[[RET:.*]] +// CHECK: [[RET]]: +// CHECK: ret ptr blockaddress(@blockaddr0, %1) +// CHECK: } From 80fde75dc6c50a7d32f6dbfda9a6f2c24890b5cc Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 18:08:55 -0700 Subject: [PATCH 0925/1029] [NFC][LLVM][SystemZ] Cleanup pass initialization for SystemZ (#134450) - Remove calls to pass initialization from pass constructors. 
- https://github.com/llvm/llvm-project/issues/111767 --- llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp | 5 +---- llvm/lib/Target/SystemZ/SystemZElimCompare.cpp | 4 +--- llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp | 4 +--- llvm/lib/Target/SystemZ/SystemZLongBranch.cpp | 4 +--- llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp | 4 +--- llvm/lib/Target/SystemZ/SystemZShortenInst.cpp | 4 +--- llvm/lib/Target/SystemZ/SystemZTDC.cpp | 4 +--- 7 files changed, 7 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp index a6cf0f57aaf06..8867b6ea10931 100644 --- a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp +++ b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp @@ -28,10 +28,7 @@ namespace { class SystemZCopyPhysRegs : public MachineFunctionPass { public: static char ID; - SystemZCopyPhysRegs() - : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) { - initializeSystemZCopyPhysRegsPass(*PassRegistry::getPassRegistry()); - } + SystemZCopyPhysRegs() : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; diff --git a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index 789365fb9e311..81f0014dd83f2 100644 --- a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -65,9 +65,7 @@ class SystemZElimCompare : public MachineFunctionPass { public: static char ID; - SystemZElimCompare() : MachineFunctionPass(ID) { - initializeSystemZElimComparePass(*PassRegistry::getPassRegistry()); - } + SystemZElimCompare() : MachineFunctionPass(ID) {} bool processBlock(MachineBasicBlock &MBB); bool runOnMachineFunction(MachineFunction &F) override; diff --git a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp index 06df93a60709c..a81cfeb633603 100644 --- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp @@ -28,9 +28,7 @@ namespace { class SystemZLDCleanup : public MachineFunctionPass { public: static char ID; - SystemZLDCleanup() : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) { - initializeSystemZLDCleanupPass(*PassRegistry::getPassRegistry()); - } + SystemZLDCleanup() : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp index f19b932f3c731..d50d7e419faeb 100644 --- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp +++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp @@ -135,9 +135,7 @@ class SystemZLongBranch : public MachineFunctionPass { public: static char ID; - SystemZLongBranch() : MachineFunctionPass(ID) { - initializeSystemZLongBranchPass(*PassRegistry::getPassRegistry()); - } + SystemZLongBranch() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &F) override; diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp index ffeba87795625..8d4cee655235f 100644 --- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp +++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp @@ -32,9 +32,7 @@ namespace { class SystemZPostRewrite : public MachineFunctionPass { public: static char ID; - SystemZPostRewrite() : MachineFunctionPass(ID) { - 
initializeSystemZPostRewritePass(*PassRegistry::getPassRegistry()); - } + SystemZPostRewrite() : MachineFunctionPass(ID) {} const SystemZInstrInfo *TII; diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp index c0adfdbf120bd..5d71c25348d95 100644 --- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -60,9 +60,7 @@ FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) { } SystemZShortenInst::SystemZShortenInst() - : MachineFunctionPass(ID), TII(nullptr) { - initializeSystemZShortenInstPass(*PassRegistry::getPassRegistry()); -} + : MachineFunctionPass(ID), TII(nullptr) {} // Tie operands if MI has become a two-address instruction. static void tieOpsIfNeeded(MachineInstr &MI) { diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp index d5baa325951c2..1e254d694e392 100644 --- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp @@ -64,9 +64,7 @@ namespace { class SystemZTDCPass : public FunctionPass { public: static char ID; - SystemZTDCPass() : FunctionPass(ID) { - initializeSystemZTDCPassPass(*PassRegistry::getPassRegistry()); - } + SystemZTDCPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override; From 20eb60d3f2d416976b1b31384e97db21ee0f28c9 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 18:09:46 -0700 Subject: [PATCH 0926/1029] [NFC][LLVM] Cleanup pass initialization for wasm/LoongArch (#134452) - Remove calls to pass initialization from pass constructors. - https://github.com/llvm/llvm-project/issues/111767 --- .../Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp | 4 +--- llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp | 8 ++------ llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 1 + llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h | 4 +--- 4 files changed, 5 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp index 79f37a0f548c6..27d20390eb6ae 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp @@ -33,9 +33,7 @@ class LoongArchExpandAtomicPseudo : public MachineFunctionPass { const LoongArchInstrInfo *TII; static char ID; - LoongArchExpandAtomicPseudo() : MachineFunctionPass(ID) { - initializeLoongArchExpandAtomicPseudoPass(*PassRegistry::getPassRegistry()); - } + LoongArchExpandAtomicPseudo() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index c2d73a260b1c1..9e9c65a041bf7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -41,9 +41,7 @@ class LoongArchPreRAExpandPseudo : public MachineFunctionPass { const LoongArchInstrInfo *TII; static char ID; - LoongArchPreRAExpandPseudo() : MachineFunctionPass(ID) { - initializeLoongArchPreRAExpandPseudoPass(*PassRegistry::getPassRegistry()); - } + LoongArchPreRAExpandPseudo() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -656,9 +654,7 @@ class LoongArchExpandPseudo : public MachineFunctionPass { const LoongArchInstrInfo *TII; static char ID; - LoongArchExpandPseudo() : 
MachineFunctionPass(ID) { - initializeLoongArchExpandPseudoPass(*PassRegistry::getPassRegistry()); - } + LoongArchExpandPseudo() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 692392dc2bae0..dc490e8185504 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -40,6 +40,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() { initializeLoongArchPreRAExpandPseudoPass(*PR); initializeLoongArchExpandPseudoPass(*PR); initializeLoongArchDAGToDAGISelLegacyPass(*PR); + initializeLoongArchExpandAtomicPseudoPass(*PR); } static cl::opt EnableLoongArchDeadRegisterElimination( diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h index 832ef1e49d784..7845cdfaebec7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h @@ -132,9 +132,7 @@ class WebAssemblyExceptionInfo final : public MachineFunctionPass { public: static char ID; - WebAssemblyExceptionInfo() : MachineFunctionPass(ID) { - initializeWebAssemblyExceptionInfoPass(*PassRegistry::getPassRegistry()); - } + WebAssemblyExceptionInfo() : MachineFunctionPass(ID) {} ~WebAssemblyExceptionInfo() override { releaseMemory(); } WebAssemblyExceptionInfo(const WebAssemblyExceptionInfo &) = delete; WebAssemblyExceptionInfo & From bb1f32ded0b7bec3b69c30b1c21eb1210372cd7f Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Mon, 7 Apr 2025 18:10:06 -0700 Subject: [PATCH 0927/1029] [NFC][LLVM] Change `initializePassOnce` to return void (#134500) - The return value of these functions (called using `llvm::call_once`) is never used, so make these functions return void. 
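To illustrate, for a hypothetical pass `Foo` registered via
INITIALIZE_PASS(Foo, "foo", "Foo Pass", false, false), the macros now
expand to roughly the following sketch (`Foo` and its arguments here are
placeholders, not part of this change):

  static void initializeFooPassOnce(PassRegistry &Registry) {
    PassInfo *PI = new PassInfo(
        "Foo Pass", "foo", &Foo::ID,
        PassInfo::NormalCtor_t(callDefaultCtor<Foo>), false, false);
    Registry.registerPass(*PI, true); // the old `return PI;` is gone
  }
  static llvm::once_flag InitializeFooPassFlag;
  void llvm::initializeFooPass(PassRegistry &Registry) {
    llvm::call_once(InitializeFooPassFlag, initializeFooPassOnce,
                    std::ref(Registry));
  }

`llvm::call_once` discards the callable's result, so the `void *` returned
by the old version of the `*PassOnce` function was never observed.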
--- llvm/include/llvm/PassSupport.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/include/llvm/PassSupport.h b/llvm/include/llvm/PassSupport.h index 57210b2488b53..2806d9b52b0b9 100644 --- a/llvm/include/llvm/PassSupport.h +++ b/llvm/include/llvm/PassSupport.h @@ -36,7 +36,7 @@ namespace llvm { class Pass; #define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis) \ - static void *initialize##passName##PassOnce(PassRegistry &Registry) { + static void initialize##passName##PassOnce(PassRegistry &Registry) { #define INITIALIZE_PASS_DEPENDENCY(depName) initialize##depName##Pass(Registry); @@ -45,7 +45,6 @@ class Pass; name, arg, &passName::ID, \ PassInfo::NormalCtor_t(callDefaultCtor), cfg, analysis); \ Registry.registerPass(*PI, true); \ - return PI; \ } \ static llvm::once_flag Initialize##passName##PassFlag; \ void llvm::initialize##passName##Pass(PassRegistry &Registry) { \ From 3f4e4e0a12c523f3d794f015ca7f40bcfc0ed8f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 7 Apr 2025 18:21:07 -0700 Subject: [PATCH 0928/1029] [flang][cuda] Add on_device interface to cudadevice (#134747) --- flang/module/cudadevice.f90 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 591e25e4108b2..9bd90bcfc30ec 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1609,4 +1609,9 @@ attributes(device) integer function ballot_sync(mask, pred) end subroutine end interface + interface + attributes(device,host) logical function on_device() bind(c) + end function + end interface + end module From 3a0c95fb506301fdd083b6e5dff0a0c88da4a665 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 8 Apr 2025 08:51:20 +0700 Subject: [PATCH 0929/1029] llvm-reduce: Fix introducing unreachable code in simplify conditionals (#133842) After replacing the branch condition, this was calling simplifyCFG to perform the cleanups of the branch. This is far too heavy of a hammer. We do not want all of the extra optimizations in simplifyCFG, and this could also leave behind dead code. Instead, minimally fold the terminator and try to delete the newly dead code. This is pretty much a direct copy of what bugpoint does. 
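In outline, the new cleanup does this (a sketch of the approach;
`RewrittenBlocks` is a placeholder for the blocks whose branch condition
was just replaced, and the actual helper added below is
`simpleSimplifyCFG`):

  for (BasicBlock *BB : RewrittenBlocks) {
    ConstantFoldTerminator(BB);    // e.g. `br i1 true, %a, %b` -> `br %a`
    MergeBlockIntoPredecessor(BB); // only light, local merging
  }
  // Then collect everything reachable from the entry block and erase the
  // remaining blocks, so no newly dead code is left behind.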
--- .../tools/llvm-reduce/reduce-conditionals.ll | 132 ++++++++++++++++-- .../deltas/ReduceUsingSimplifyCFG.cpp | 19 ++- llvm/tools/llvm-reduce/deltas/Utils.cpp | 39 ++++++ llvm/tools/llvm-reduce/deltas/Utils.h | 6 + 4 files changed, 182 insertions(+), 14 deletions(-) diff --git a/llvm/test/tools/llvm-reduce/reduce-conditionals.ll b/llvm/test/tools/llvm-reduce/reduce-conditionals.ll index a832673d7350b..e1a9f3ae13944 100644 --- a/llvm/test/tools/llvm-reduce/reduce-conditionals.ll +++ b/llvm/test/tools/llvm-reduce/reduce-conditionals.ll @@ -1,10 +1,12 @@ -; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=simplify-conditionals-true --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t -; RUN: FileCheck --check-prefixes=RESULT-TRUE %s < %t +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=simplify-conditionals-true --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS,CHECK --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck --check-prefixes=RESULT-TRUE,RESULT,CHECK %s < %t -; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=simplify-conditionals-false --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t -; RUN: FileCheck --check-prefixes=RESULT-FALSE %s < %t +; RUN: llvm-reduce --abort-on-invalid-reduction --delta-passes=simplify-conditionals-false --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS,CHECK --test-arg %s --test-arg --input-file %s -o %t +; RUN: FileCheck --check-prefixes=RESULT-FALSE,RESULT,CHECK %s < %t -; CHECK-INTERESTINGNESS-LABEL: @func( +; Make sure there is no unreachable code introduced by the reduction + +; CHECK-LABEL: @func_simplifies_true( ; CHECK-INTERESTINGNESS: store i32 1, ; RESULT-TRUE: bb0: @@ -13,15 +15,98 @@ ; RESULT-TRUE-NEXT: br label %bb2 ; RESULT-TRUE-NOT: bb1 +; RESULT-FALSE: bb0: +; RESULT-FALSE-NEXT: store i32 0, ptr null, align 4 +; RESULT-FALSE-NEXT: br i1 %cond0, label %bb1, label %bb2 + +; RESULT-FALSE: bb1: ; preds = %bb0 +; RESULT-FALSE-NEXT: store i32 1, ptr null, align 4 +; RESULT-FALSE-NEXT: br label %bb3 + +; RESULT-FALSE: bb2: ; preds = %bb0 +; RESULT-FALSE-NEXT: store i32 2, ptr null, align 4 +; RESULT-FALSE-NEXT: br label %bb3 + +; RESULT-FALSE: bb3: ; preds = %bb1, %bb2 +; RESULT-FALSE-NEXT: ret void +define void @func_simplifies_true(i1 %cond0, i1 %cond1) { +bb0: + store i32 0, ptr null + br i1 %cond0, label %bb1, label %bb2 + +bb1: + store i32 1, ptr null + br i1 %cond1, label %bb2, label %bb3 + +bb2: + store i32 2, ptr null + br label %bb3 + +bb3: + ret void +} + +; CHECK-LABEL: @func_simplifies_false( +; CHECK-INTERESTINGNESS: store i32 0, + +; RESULT-TRUE: bb0: +; RESULT-TRUE: store i32 0, ptr null, align 4 +; RESULT-TRUE-NEXT: store i32 1, ptr null, align 4 +; RESULT-TRUE-NEXT: br label %bb2 +; RESULT-TRUE-NOT: bb1 + ; RESULT-FALSE: bb0: ; RESULT-FALSE: store i32 0, ptr null, align 4 ; RESULT-FALSE-NEXT: br label %bb2 -; RESULT-FALSE: bb1: ; No predecessors! 
-; RESULT-FALSE-NEXT: store i32 1, ptr null, align 4 +; RESULT-FALSE: bb2: ; preds = %bb0 +; RESULT-FALSE-NEXT: store i32 2, ptr null, align 4 ; RESULT-FALSE-NEXT: br label %bb3 -define void @func(i1 %cond0, i1 %cond1) { + +; RESULT-FALSE: bb3: ; preds = %bb2 +; RESULT-FALSE-NEXT: ret void +define void @func_simplifies_false(i1 %cond0, i1 %cond1) { +bb0: + store i32 0, ptr null + br i1 %cond0, label %bb1, label %bb2 + +bb1: + store i32 1, ptr null + br i1 %cond1, label %bb2, label %bb3 + +bb2: + store i32 2, ptr null + br label %bb3 + +bb3: + ret void +} + +; Make sure we don't break the reduction in the other functions by +; having something interesting in unrelated unreachable code. + +; CHECK-LABEL: @func_simplifies_true_with_interesting_unreachable_code( +; CHECK-INTERESTINGNESS: store i32 0, +; CHECK-INTERESTINGNESS: store i32 %arg, + + +; RESULT: bb0: +; RESULT-NEXT: store i32 0 +; RESULT-NEXT: br i1 %cond0, label %bb1, label %bb2 + +; RESULT: bb1: +; RESULT-NEXT: store i32 1 +; RESULT-NEXT: br i1 %cond1, label %bb2, label %bb3 + +; RESULT: bb2: +; RESULT-NEXT: store i32 2 +; RESULT-NEXT: br label %bb3 + +; RESULT: dead_code: ; preds = %dead_code +; RESULT-NEXT: store i32 %arg, +; RESULT-NEXT: br label %dead_code +define void @func_simplifies_true_with_interesting_unreachable_code(i1 %cond0, i1 %cond1, i32 %arg) { bb0: store i32 0, ptr null br i1 %cond0, label %bb1, label %bb2 @@ -36,4 +121,35 @@ bb2: bb3: ret void + +dead_code: + store i32 %arg, ptr null + br label %dead_code +} + +@block_address_user = constant [1 x ptr] [ptr blockaddress(@will_be_unreachable_blockaddress_use, %will_be_unreachable)] + +; CHECK-LABEL: @will_be_unreachable_blockaddress_use( +; CHECK-INTERESTINGNESS: inttoptr + +; RESULT-FALSE: entry: +; RESULT-FALSE-NEXT: %i2p = inttoptr i64 %int to ptr +; RESULT-FALSE-NEXT: br label %exit + +; RESULT-FALSE: exit: ; preds = %entry +; RESULT-FALSE-NEXT: ret i1 false +define i1 @will_be_unreachable_blockaddress_use(i1 %cond, i64 %int) { +entry: + %i2p = inttoptr i64 %int to ptr + br i1 %cond, label %will_be_unreachable, label %exit + +will_be_unreachable: + %load = load ptr, ptr %i2p, align 8 + br label %for.body + +for.body: + br label %for.body + +exit: + ret i1 false } diff --git a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp index 3d6b35d1895e7..a982524af4cf6 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceUsingSimplifyCFG.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "ReduceUsingSimplifyCFG.h" +#include "Utils.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" @@ -35,13 +36,17 @@ void llvm::reduceUsingSimplifyCFGDeltaPass(Oracle &O, static void reduceConditionals(Oracle &O, ReducerWorkItem &WorkItem, bool Direction) { Module &M = WorkItem.getModule(); - SmallVector ToSimplify; LLVMContext &Ctx = M.getContext(); ConstantInt *ConstValToSet = Direction ? 
      ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);

-  for (auto &F : M) {
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+
+    SmallVector ToSimplify;
+
     for (auto &BB : F) {
       auto *BR = dyn_cast<BranchInst>(BB.getTerminator());
       if (!BR || !BR->isConditional() || BR->getCondition() == ConstValToSet ||
@@ -51,11 +56,13 @@ static void reduceConditionals(Oracle &O, ReducerWorkItem &WorkItem,
       BR->setCondition(ConstValToSet);
       ToSimplify.push_back(&BB);
     }
-  }

-  TargetTransformInfo TTI(M.getDataLayout());
-  for (auto *BB : ToSimplify)
-    simplifyCFG(BB, TTI);
+    if (!ToSimplify.empty()) {
+      // TODO: Should probably leave MergeBlockIntoPredecessor for a separate
+      // reduction.
+      simpleSimplifyCFG(F, ToSimplify);
+    }
+  }
 }

 void llvm::reduceConditionalsTrueDeltaPass(Oracle &O,
diff --git a/llvm/tools/llvm-reduce/deltas/Utils.cpp b/llvm/tools/llvm-reduce/deltas/Utils.cpp
index 92a44921a7cfb..a980a0f9fad2f 100644
--- a/llvm/tools/llvm-reduce/deltas/Utils.cpp
+++ b/llvm/tools/llvm-reduce/deltas/Utils.cpp
@@ -14,6 +14,8 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalIFunc.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"

 using namespace llvm;

@@ -47,3 +49,40 @@ bool llvm::hasAliasOrBlockAddressUse(Function &F) {
     return isa(U);
   });
 }
+
+void llvm::simpleSimplifyCFG(Function &F, ArrayRef<BasicBlock *> BBs,
+                             bool FoldBlockIntoPredecessor) {
+
+  for (BasicBlock *BB : BBs) {
+    ConstantFoldTerminator(BB);
+    if (FoldBlockIntoPredecessor)
+      MergeBlockIntoPredecessor(BB);
+  }
+
+  // Remove unreachable blocks.
+  //
+  // removeUnreachableBlocks can't be used here: it will turn various undefined
+  // behavior into unreachables, but llvm-reduce was the thing that generated
+  // the undefined behavior, and we don't want it to kill the entire program.
+  SmallPtrSet Visited(llvm::from_range,
+                      depth_first(&F.getEntryBlock()));
+
+  SmallVector Unreachable;
+  for (BasicBlock &BB : F) {
+    if (!Visited.count(&BB))
+      Unreachable.push_back(&BB);
+  }
+
+  // The dead BB's may be in a dead cycle or otherwise have references to each
+  // other. Because of this, we have to drop all references first, then delete
+  // them all at once.
+  for (BasicBlock *BB : Unreachable) {
+    for (BasicBlock *Successor : successors(&*BB))
+      if (Visited.count(Successor))
+        Successor->removePredecessor(&*BB);
+    BB->dropAllReferences();
+  }
+
+  for (BasicBlock *BB : Unreachable)
+    BB->eraseFromParent();
+}
diff --git a/llvm/tools/llvm-reduce/deltas/Utils.h b/llvm/tools/llvm-reduce/deltas/Utils.h
index 8cb4a3ebaf644..940030cca02f8 100644
--- a/llvm/tools/llvm-reduce/deltas/Utils.h
+++ b/llvm/tools/llvm-reduce/deltas/Utils.h
@@ -16,6 +16,7 @@
 #include "llvm/Support/CommandLine.h"

 namespace llvm {
+class BasicBlock;
 class Function;
 class Type;
 class Value;
@@ -26,6 +27,11 @@ Value *getDefaultValue(Type *T);
 bool hasAliasUse(Function &F);
 bool hasAliasOrBlockAddressUse(Function &F);

+// Constant fold terminators in \p BBs and minimally prune unreachable code
+// from the function.
+void simpleSimplifyCFG(Function &F, ArrayRef<BasicBlock *> BBs,
+                       bool FoldBlockIntoPredecessor = true);
+
 } // namespace llvm

 #endif

From d057811655d8de3900748bba03d0c7ebcb6fafe3 Mon Sep 17 00:00:00 2001
From: Matheus Izvekov
Date: Mon, 7 Apr 2025 23:19:32 -0300
Subject: [PATCH 0930/1029] [clang] fix diagnostic printing of expressions
 ignoring LangOpts (#134693)

Currently, when printing a template argument of expression type, the
expression is converted immediately into a string to be sent to the
diagnostic engine, using a fake LangOpts. This makes the expression
printing look incorrect for the current language, besides being
inefficient, as we don't actually need to print the expression if the
diagnostic would be ignored.

This also fixes a nastiness with the TemplateArgument constructor for
expressions being implicit: all users that just passed an expression to
a diagnostic were implicitly going through the template argument path.

The expressions are also being printed unquoted. This will be fixed in a
subsequent patch, as the test churn is much larger.
---
 .../clang-tidy/ClangTidyDiagnosticConsumer.cpp     |  3 +++
 clang/docs/ReleaseNotes.rst                        |  3 +++
 clang/include/clang/AST/Expr.h                     |  8 ++++++++
 clang/include/clang/AST/TemplateBase.h             |  2 +-
 clang/include/clang/Basic/Diagnostic.h             |  5 ++++-
 clang/lib/AST/ASTDiagnostic.cpp                    |  8 ++++++++
 clang/lib/AST/TemplateBase.cpp                     | 16 +++-------------
 clang/lib/Basic/Diagnostic.cpp                     |  1 +
 clang/lib/Sema/SemaTemplateDeduction.cpp           |  8 ++++----
 clang/lib/Sema/SemaTemplateVariadic.cpp            |  2 +-
 clang/lib/Sema/TreeTransform.h                     |  6 +++---
 clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl        | 10 +++++-----
 .../instantiate-expanded-type-constraint.cpp       |  4 ++--
 .../SemaTemplate/instantiate-requires-expr.cpp     |  4 ++--
 .../trailing-return-short-circuit.cpp              |  4 ++--
 .../cpp17_iterator_concepts.verify.cpp             |  2 +-
 16 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
index 71e852545203e..abd6d7b4cd60f 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
@@ -22,6 +22,7 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/ASTDiagnostic.h"
 #include "clang/AST/Attr.h"
+#include "clang/AST/Expr.h"
 #include "clang/Basic/CharInfo.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticOptions.h"
@@ -528,6 +529,8 @@ void ClangTidyDiagnosticConsumer::forwardDiagnostic(const Diagnostic &Info) {
     case clang::DiagnosticsEngine::ak_addrspace:
       Builder << static_cast<LangAS>(Info.getRawArg(Index));
       break;
+    case clang::DiagnosticsEngine::ak_expr:
+      Builder << reinterpret_cast<const Expr *>(Info.getRawArg(Index));
     }
   }
 }
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f8f4dfbafb4f8..e671183522565 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -280,6 +280,9 @@ Improvements to Clang's diagnostics
 - Clang now better preserves the sugared types of pointers to member.
 - Clang now better preserves the presence of the template keyword with dependent
   prefixes.
+- Clang now respects the current language mode when printing expressions in
+  diagnostics. This fixes `bool` being printed as `_Bool`, and HLSL types
+  being printed as their C++ equivalents.
 - When printing types for diagnostics, clang now doesn't suppress the scopes of
   template arguments contained within nested names.
- The ``-Wshift-bool`` warning has been added to warn about shifting a boolean. (#GH28334) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index dedbff5944af8..20f70863a05b3 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -7379,6 +7379,14 @@ class RecoveryExpr final : public Expr, friend class ASTStmtWriter; }; +/// Insertion operator for diagnostics. This allows sending +/// Expr into a diagnostic with <<. +inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &DB, + const Expr *E) { + DB.AddTaggedVal(reinterpret_cast(E), DiagnosticsEngine::ak_expr); + return DB; +} + } // end namespace clang #endif // LLVM_CLANG_AST_EXPR_H diff --git a/clang/include/clang/AST/TemplateBase.h b/clang/include/clang/AST/TemplateBase.h index a800a16fc3e7a..bea624eb04942 100644 --- a/clang/include/clang/AST/TemplateBase.h +++ b/clang/include/clang/AST/TemplateBase.h @@ -262,7 +262,7 @@ class TemplateArgument { /// This form of template argument only occurs in template argument /// lists used for dependent types and for expression; it will not /// occur in a non-dependent, canonical template argument list. - TemplateArgument(Expr *E, bool IsDefaulted = false) { + explicit TemplateArgument(Expr *E, bool IsDefaulted = false) { TypeOrValue.Kind = Expression; TypeOrValue.IsDefaulted = IsDefaulted; TypeOrValue.V = reinterpret_cast(E); diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index 848acce3c4f13..19524856a9bb3 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -284,7 +284,10 @@ class DiagnosticsEngine : public RefCountedBase { ak_qualtype_pair, /// Attr * - ak_attr + ak_attr, + + /// Expr * + ak_expr, }; /// Represents on argument value, which is a union discriminated diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index b4e7360e126fb..ccfef9c7ae361 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ b/clang/lib/AST/ASTDiagnostic.cpp @@ -508,6 +508,14 @@ void clang::FormatASTNodeDiagnosticArgument( NeedQuotes = false; break; } + case DiagnosticsEngine::ak_expr: { + const Expr *E = reinterpret_cast(Val); + assert(E && "Received null Expr!"); + E->printPretty(OS, /*Helper=*/nullptr, Context.getPrintingPolicy()); + // FIXME: Include quotes when printing expressions. + NeedQuotes = false; + break; + } } if (NeedQuotes) { diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 0be0a83b7010d..42da4e6ca9964 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -478,7 +478,7 @@ TemplateArgument TemplateArgument::getPackExpansionPattern() const { return getAsType()->castAs()->getPattern(); case Expression: - return cast(getAsExpr())->getPattern(); + return TemplateArgument(cast(getAsExpr())->getPattern()); case TemplateExpansion: return TemplateArgument(getAsTemplateOrTemplatePattern()); @@ -654,18 +654,8 @@ static const T &DiagTemplateArg(const T &DB, const TemplateArgument &Arg) { case TemplateArgument::TemplateExpansion: return DB << Arg.getAsTemplateOrTemplatePattern() << "..."; - case TemplateArgument::Expression: { - // This shouldn't actually ever happen, so it's okay that we're - // regurgitating an expression here. - // FIXME: We're guessing at LangOptions! 
- SmallString<32> Str; - llvm::raw_svector_ostream OS(Str); - LangOptions LangOpts; - LangOpts.CPlusPlus = true; - PrintingPolicy Policy(LangOpts); - Arg.getAsExpr()->printPretty(OS, nullptr, Policy); - return DB << OS.str(); - } + case TemplateArgument::Expression: + return DB << Arg.getAsExpr(); case TemplateArgument::Pack: { // FIXME: We're guessing at LangOptions! diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 9e2f134135647..4b4a85aaccf8b 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -1247,6 +1247,7 @@ FormatDiagnostic(const char *DiagStr, const char *DiagEnd, case DiagnosticsEngine::ak_nestednamespec: case DiagnosticsEngine::ak_declcontext: case DiagnosticsEngine::ak_attr: + case DiagnosticsEngine::ak_expr: getDiags()->ConvertArgToString(Kind, getRawArg(ArgNo), StringRef(Modifier, ModifierLen), StringRef(Argument, ArgumentLen), diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 200168087136b..6236bce743438 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -493,8 +493,8 @@ DeduceNullPtrTemplateArgument(Sema &S, TemplateParameterList *TemplateParams, : CK_NullToPointer) .get(); return DeduceNonTypeTemplateArgument( - S, TemplateParams, NTTP, DeducedTemplateArgument(Value), Value->getType(), - Info, PartialOrdering, Deduced, HasDeducedAnyParam); + S, TemplateParams, NTTP, TemplateArgument(Value), Value->getType(), Info, + PartialOrdering, Deduced, HasDeducedAnyParam); } /// Deduce the value of the given non-type template parameter @@ -508,8 +508,8 @@ DeduceNonTypeTemplateArgument(Sema &S, TemplateParameterList *TemplateParams, SmallVectorImpl &Deduced, bool *HasDeducedAnyParam) { return DeduceNonTypeTemplateArgument( - S, TemplateParams, NTTP, DeducedTemplateArgument(Value), Value->getType(), - Info, PartialOrdering, Deduced, HasDeducedAnyParam); + S, TemplateParams, NTTP, TemplateArgument(Value), Value->getType(), Info, + PartialOrdering, Deduced, HasDeducedAnyParam); } /// Deduce the value of the given non-type template parameter diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 3d4a245eb8bd5..3040a30454b0c 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -1286,7 +1286,7 @@ TemplateArgumentLoc Sema::getTemplateArgumentPackExpansionPattern( Expr *Pattern = Expansion->getPattern(); Ellipsis = Expansion->getEllipsisLoc(); NumExpansions = Expansion->getNumExpansions(); - return TemplateArgumentLoc(Pattern, Pattern); + return TemplateArgumentLoc(TemplateArgument(Pattern), Pattern); } case TemplateArgument::TemplateExpansion: diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 1e126a8875331..13762dc485c32 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -3981,7 +3981,7 @@ class TreeTransform { if (Result.isInvalid()) return TemplateArgumentLoc(); - return TemplateArgumentLoc(Result.get(), Result.get()); + return TemplateArgumentLoc(TemplateArgument(Result.get()), Result.get()); } case TemplateArgument::Template: @@ -16131,8 +16131,8 @@ TreeTransform::TransformSizeOfPackExpr(SizeOfPackExpr *E) { E->getPackLoc()); if (DRE.isInvalid()) return ExprError(); - ArgStorage = new (getSema().Context) - PackExpansionExpr(DRE.get(), E->getPackLoc(), std::nullopt); + ArgStorage = TemplateArgument(new (getSema().Context) PackExpansionExpr( + DRE.get(), 
E->getPackLoc(), std::nullopt)); } PackArgs = ArgStorage; } diff --git a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl index 941e0a975d5f4..34930d8963688 100644 --- a/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/RWBuffers.hlsl @@ -60,13 +60,13 @@ RWBuffer > r9; // arrays not allowed // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{because 'half[4]' does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(__fp16[4])' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(half[4])' evaluated to false}} RWBuffer r10; typedef vector int8; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{because 'vector' (vector of 8 'int' values) does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(int __attribute__((ext_vector_type(8))))' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector)' evaluated to false}} RWBuffer r11; typedef int MyInt; @@ -74,12 +74,12 @@ RWBuffer r12; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{because 'bool' does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(_Bool)' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(bool)' evaluated to false}} RWBuffer r13; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{because 'vector' (vector of 2 'bool' values) does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(_Bool __attribute__((ext_vector_type(2))))' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector)' evaluated to false}} RWBuffer> r14; enum numbers { one, two, three }; @@ -91,7 +91,7 @@ RWBuffer r15; // expected-error@+3 {{constraints not satisfied for class template 'RWBuffer'}} // expected-note@*:* {{because 'vector' (vector of 3 'double' values) does not satisfy '__is_typed_resource_element_compatible'}} -// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(double __attribute__((ext_vector_type(3))))' evaluated to false}} +// expected-note@*:* {{because '__builtin_hlsl_is_typed_resource_element_compatible(vector)' evaluated to false}} RWBuffer r16; diff --git a/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp b/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp index 73fef87b97822..3edf243982958 100644 --- a/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp +++ b/clang/test/SemaTemplate/instantiate-expanded-type-constraint.cpp @@ -8,7 +8,7 @@ constexpr bool is_same_v = true; template concept same_as = is_same_v; -// expected-note@-1{{because 'is_same_v' evaluated to false}} +// expected-note@-1{{because 'is_same_v' evaluated to false}} template concept either = (is_same_v || ...); @@ -16,7 +16,7 @@ concept either = (is_same_v || ...); template struct T { template... 
Us> - // expected-note@-1{{because 'same_as' evaluated to false}} + // expected-note@-1{{because 'same_as' evaluated to false}} static void foo(Us... u, int x) { }; // expected-note@-1{{candidate template ignored: deduced too few arguments}} // expected-note@-2{{candidate template ignored: constraints not satisfied}} diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp index ab5fac1f9e63e..47689b93db50f 100644 --- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp +++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp @@ -72,8 +72,8 @@ namespace type_requirement { template requires false_v; }> - // expected-note@-1 {{because 'false_v::template temp >; }>' evaluated to false}} - // expected-note@-2 {{because 'false_v::template temp >; }>' evaluated to false}} + // expected-note@-1 {{because 'false_v::template temp>; }>' evaluated to false}} + // expected-note@-2 {{because 'false_v::template temp>; }>' evaluated to false}} struct r2 {}; using r2i1 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template]}} diff --git a/clang/test/SemaTemplate/trailing-return-short-circuit.cpp b/clang/test/SemaTemplate/trailing-return-short-circuit.cpp index 0d1c9b52b0e85..4ef7888d23dc5 100644 --- a/clang/test/SemaTemplate/trailing-return-short-circuit.cpp +++ b/clang/test/SemaTemplate/trailing-return-short-circuit.cpp @@ -39,13 +39,13 @@ void usage() { Foo(true); // expected-error@-1{{no matching function for call to 'Foo'}} // expected-note@#FOO{{candidate template ignored: constraints not satisfied [with T = bool]}} - // expected-note@#FOO_REQ{{because 'sizeof(_Bool) > 2' (1 > 2) evaluated to false}} + // expected-note@#FOO_REQ{{because 'sizeof(bool) > 2' (1 > 2) evaluated to false}} // expected-note@#FOO_REQ{{because substituted constraint expression is ill-formed: type 'bool' cannot be used prior to '::' because it has no members}} TrailingReturn(true); // expected-error@-1{{no matching function for call to 'TrailingReturn'}} // expected-note@#TRAILING{{candidate template ignored: constraints not satisfied [with T = bool]}} - // expected-note@#TRAILING_REQ{{because 'sizeof(_Bool) > 2' (1 > 2) evaluated to false}} + // expected-note@#TRAILING_REQ{{because 'sizeof(bool) > 2' (1 > 2) evaluated to false}} // expected-note@#TRAILING_REQ_VAL{{because substituted constraint expression is ill-formed: type 'bool' cannot be used prior to '::' because it has no members}} // Fails the 1st check, fails 2nd because ::value is false. 
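As a usage sketch for the new `ak_expr` path (the diagnostic ID below is a
placeholder, not something this patch adds), an emission site can now write:

  S.Diag(E->getExprLoc(), diag::note_placeholder_id) << E;

The `Expr *` travels as a tagged pointer argument and is pretty-printed with
the real ASTContext printing policy only if the diagnostic is actually
emitted rather than ignored.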
diff --git a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp
index 99f67c8a795a9..c4462b26f5c92 100644
--- a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp
+++ b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp
@@ -173,7 +173,7 @@ void check_bidirectional_iterator_requirements() {
   _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(missing_postdecrement, ""); // expected-error {{static assertion failed}}
   // expected-note@*:* {{cannot decrement value of type 'missing_postdecrement'}}
   _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(not_returning_iter_reference, ""); // expected-error {{static assertion failed}}
-  // expected-note@*:* {{because type constraint 'same_as >' was not satisfied}}
+  // expected-note-re@*:* {{because type constraint 'same_as{{ ?}}>' was not satisfied}}
   // clang-format on
 }

From 499930e38a814d4fb8227d9ab48a946658650e12 Mon Sep 17 00:00:00 2001
From: Peter Collingbourne
Date: Mon, 7 Apr 2025 19:35:48 -0700
Subject: [PATCH 0931/1029] gn build: Spell arm64 correctly.

---
 llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni
index 9e8e52c977a17..b693b75e04582 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/sources.gni
@@ -11,7 +11,7 @@ if (current_cpu == "x86" || current_cpu == "x64") {
 builtins_defines = []

 # This is based on who sets HasFloat16 to true in clang/lib/Basic/Targets.
-if (current_cpu == "aarch64" || current_cpu == "arm" ||
+if (current_cpu == "arm" || current_cpu == "arm64" ||
     current_cpu == "hexagon" || current_cpu == "riscv" ||
     current_cpu == "riscv64" || current_cpu == "x64") {
   builtins_defines += [ "COMPILER_RT_HAS_FLOAT16" ]

From 5aae0ee660ffdae057d7c5c0e851b5620586d042 Mon Sep 17 00:00:00 2001
From: Congcong Cai
Date: Tue, 8 Apr 2025 10:47:39 +0800
Subject: [PATCH 0932/1029] [clang-tidy] give dummy path when creating
 ClangTidyContext (#134670)

#121323 changed the way the absolute path is computed. An empty file name
will cause the absolute path to ignore the current folder. This patch adds
a "dummy" file name to avoid this issue.

Fixed: #134502
---
 clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
index abd6d7b4cd60f..731141a545a48 100644
--- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
+++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp
@@ -167,8 +167,8 @@ ClangTidyContext::ClangTidyContext(
       AllowEnablingAnalyzerAlphaCheckers(AllowEnablingAnalyzerAlphaCheckers),
       EnableModuleHeadersParsing(EnableModuleHeadersParsing) {
   // Before the first translation unit we can get errors related to command-line
-  // parsing, use empty string for the file name in this case.
+  // parsing; use a dummy string for the file name in this case.
+ setCurrentFile("dummy"); } ClangTidyContext::~ClangTidyContext() = default; From 6a3e5f89bb89548fb39ca241c8ed7c2005a190f1 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Mon, 7 Apr 2025 19:52:03 -0700 Subject: [PATCH 0933/1029] [ctxprof] Only prune the profile in modules containing only context trees (#134340) We will subsequently treat the whole profile as "flat" in the frontend, (i.e flatten and combine with the flat profile section), so we can have a profile for ThinLTO for parts of the application that don't come under the contextual profile. After ThinLTO, we will treat the module(s) containing contextual trees differently: they'll have only the contextual profile pertinent to them. The rest of the modules (non-contextual) will proceed "as usual", off the flattened profile. This patch implements pruning of the contextual profile to enable the above. --- llvm/include/llvm/Analysis/CtxProfAnalysis.h | 7 ++ llvm/lib/Analysis/CtxProfAnalysis.cpp | 54 ++++++++++----- llvm/lib/Transforms/IPO/ElimAvailExtern.cpp | 2 +- llvm/test/Analysis/CtxProfAnalysis/load.ll | 4 ++ llvm/test/Analysis/CtxProfAnalysis/pruning.ll | 69 +++++++++++++++++++ .../transform-to-local.ll | 16 ++++- 6 files changed, 130 insertions(+), 22 deletions(-) create mode 100644 llvm/test/Analysis/CtxProfAnalysis/pruning.ll diff --git a/llvm/include/llvm/Analysis/CtxProfAnalysis.h b/llvm/include/llvm/Analysis/CtxProfAnalysis.h index ede8bd2fe5001..d3813fa0784be 100644 --- a/llvm/include/llvm/Analysis/CtxProfAnalysis.h +++ b/llvm/include/llvm/Analysis/CtxProfAnalysis.h @@ -33,6 +33,11 @@ class PGOContextualProfile { FunctionInfo(StringRef Name) : Name(Name) {} }; PGOCtxProfile Profiles; + + // True if this module is a post-thinlto module containing just functions + // participating in one or more contextual profiles. + bool IsInSpecializedModule = false; + // For the GUIDs in this module, associate metadata about each function which // we'll need when we maintain the profiles during IPO transformations. std::map FuncInfo; @@ -56,6 +61,8 @@ class PGOContextualProfile { const PGOCtxProfile &profiles() const { return Profiles; } + bool isInSpecializedModule() const { return IsInSpecializedModule; } + bool isFunctionKnown(const Function &F) const { return getDefinedFunctionGUID(F) != 0; } diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index e021e2a801006..3ae333b09d0ce 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/CtxProfAnalysis.h" +#include "llvm/ADT/APInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/IntrinsicInst.h" @@ -20,6 +21,7 @@ #include "llvm/ProfileData/PGOCtxProfReader.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #define DEBUG_TYPE "ctx_prof" @@ -95,26 +97,42 @@ PGOContextualProfile CtxProfAnalysis::run(Module &M, return {}; } - DenseSet ProfileRootsInModule; - for (const auto &F : M) - if (!F.isDeclaration()) - if (auto GUID = AssignGUIDPass::getGUID(F); - MaybeProfiles->Contexts.find(GUID) != MaybeProfiles->Contexts.end()) - ProfileRootsInModule.insert(GUID); - - // Trim first the roots that aren't in this module. 
- for (auto &[RootGuid, _] : - llvm::make_early_inc_range(MaybeProfiles->Contexts)) - if (!ProfileRootsInModule.contains(RootGuid)) - MaybeProfiles->Contexts.erase(RootGuid); - // If none of the roots are in the module, we have no profile (for this - // module) - if (MaybeProfiles->Contexts.empty()) - return {}; - - // OK, so we have a valid profile and it's applicable to roots in this module. + // FIXME: We should drive this from ThinLTO, but for the time being, use the + // module name as indicator. + // We want to *only* keep the contextual profiles in modules that capture + // context trees. That allows us to compute specific PSIs, for example. + auto DetermineRootsInModule = [&M]() -> const DenseSet { + DenseSet ProfileRootsInModule; + auto ModName = M.getName(); + auto Filename = sys::path::filename(ModName); + // Drop the file extension. + Filename = Filename.substr(0, Filename.find_last_of('.')); + // See if it parses + APInt Guid; + // getAsInteger returns true if there are more chars to read other than the + // integer. So the "false" test is what we want. + if (!Filename.getAsInteger(0, Guid)) + ProfileRootsInModule.insert(Guid.getZExtValue()); + return ProfileRootsInModule; + }; + const auto ProfileRootsInModule = DetermineRootsInModule(); PGOContextualProfile Result; + // the logic from here on allows for modules that contain - by design - more + // than one root. We currently don't support that, because the determination + // happens based on the module name matching the root guid, but the logic can + // avoid assuming that. + if (!ProfileRootsInModule.empty()) { + Result.IsInSpecializedModule = true; + // Trim first the roots that aren't in this module. + for (auto &[RootGuid, _] : + llvm::make_early_inc_range(MaybeProfiles->Contexts)) + if (!ProfileRootsInModule.contains(RootGuid)) + MaybeProfiles->Contexts.erase(RootGuid); + // we can also drop the flat profiles + MaybeProfiles->FlatProfiles.clear(); + } + for (const auto &F : M) { if (F.isDeclaration()) continue; diff --git a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp index de11f7f6b123d..718452fc02764 100644 --- a/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp +++ b/llvm/lib/Transforms/IPO/ElimAvailExtern.cpp @@ -134,7 +134,7 @@ EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &MAM) { // for this contextual information. Eliding it in favor of the original would // undo these optimizations. 
if (!eliminateAvailableExternally( - M, /*Convert=*/(CtxProf && !CtxProf->contexts().empty()))) + M, /*Convert=*/(CtxProf && CtxProf->isInSpecializedModule()))) return PreservedAnalyses::all(); return PreservedAnalyses::none(); } diff --git a/llvm/test/Analysis/CtxProfAnalysis/load.ll b/llvm/test/Analysis/CtxProfAnalysis/load.ll index 6091a99ed3680..bd21a4b710630 100644 --- a/llvm/test/Analysis/CtxProfAnalysis/load.ll +++ b/llvm/test/Analysis/CtxProfAnalysis/load.ll @@ -46,6 +46,9 @@ Function Info: Current Profile: Contexts: + - Guid: 12341 + TotalRootEntryCount: 90 + Counters: [ 9 ] - Guid: 11872291593386833696 TotalRootEntryCount: 4 Counters: [ 1 ] @@ -57,6 +60,7 @@ Contexts: Counters: [ 5 ] Flat Profile: +12341 : 9 728453322856651412 : 6 7 11872291593386833696 : 1 12074870348631550642 : 5 diff --git a/llvm/test/Analysis/CtxProfAnalysis/pruning.ll b/llvm/test/Analysis/CtxProfAnalysis/pruning.ll new file mode 100644 index 0000000000000..65fa328ef820b --- /dev/null +++ b/llvm/test/Analysis/CtxProfAnalysis/pruning.ll @@ -0,0 +1,69 @@ +; REQUIRES: x86_64-linux +; +; Check that we don't prune the contextual profile, unless the module name +; matches the guid of the root. +; +; RUN: rm -rf %t +; RUN: split-file %s %t +; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata +; +; RUN: cp %t/example.ll %t/1234.ll +; RUN: cp %t/example.ll %t/0x4d2.ll +; +; RUN: opt -passes='require,print' \ +; RUN: -use-ctx-profile=%t/profile.ctxprofdata \ +; RUN: -ctx-profile-printer-level=everything \ +; RUN: %t/example.ll -S 2>&1 | FileCheck %s + +; RUN: opt -passes='require,print' \ +; RUN: -use-ctx-profile=%t/profile.ctxprofdata \ +; RUN: -ctx-profile-printer-level=everything \ +; RUN: %t/not-matching.ll -S 2>&1 | FileCheck %s + +; RUN: opt -passes='require,print' \ +; RUN: -use-ctx-profile=%t/profile.ctxprofdata \ +; RUN: -ctx-profile-printer-level=everything \ +; RUN: %t/0x4d2.ll -S 2>&1 | FileCheck %s --check-prefix=PRUNED + +; CHECK: Contexts: +; CHECK: - Guid: 1234 +; CHECK: - Guid: 5678 +; CHECK: FlatProfiles: +; PRUNED-NOT: - Guid: 5678 +; PRUNED-NOT: FlatProfiles +; +; pick a large GUID that would be negative, if signed, to test a few ways the +; file name may be formatted. 
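+; (Note: 0x4d2 is 1234 in hex, so %t/1234.ll and %t/0x4d2.ll spell the same
+; root GUID in two different ways; only the module file name differs.)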
+;--- profile.yaml
+Contexts:
+  - Guid: 1234
+    TotalRootEntryCount: 24
+    Counters: [9]
+    Callsites: -
+      - Guid: 1000
+        Counters: [6, 7]
+
+  - Guid: 5678
+    TotalRootEntryCount: 24
+    Counters: [9]
+    Callsites: -
+      - Guid: 1000
+        Counters: [6, 7]
+FlatProfiles:
+  - Guid: 777
+    Counters: [2]
+;--- example.ll
+define void @an_entrypoint(i32 %a) !guid !0 {
+  ret void
+}
+
+attributes #0 = { noinline }
+!0 = !{ i64 1234 }
+
+;--- not-matching.ll
+define void @an_entrypoint(i32 %a) !guid !0 {
+  ret void
+}
+
+attributes #0 = { noinline }
+!0 = !{ i64 1000 }
diff --git a/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll b/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll
index b6465f44faa18..c24e4925fe78f 100644
--- a/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll
+++ b/llvm/test/Transforms/EliminateAvailableExternally/transform-to-local.ll
@@ -1,16 +1,26 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=elim-avail-extern -avail-extern-to-local -stats -S 2>&1 < %s | FileCheck %s
+; RUN: rm -rf %t
+; RUN: mkdir %t
+; RUN: cp %s %t/1234.ll
+;
+; default behavior
+; RUN: opt -passes=elim-avail-extern -stats -S 2>&1 %s | FileCheck %s --check-prefix=NOOP
+;
+; check the -avail-extern-to-local flag works as intended
+; RUN: opt -passes=elim-avail-extern -avail-extern-to-local -stats -S 2>&1 %t/1234.ll | FileCheck %s
+; RUN: opt -passes=elim-avail-extern -avail-extern-to-local -stats -S 2>&1 %s | FileCheck %s
 ;
 ; RUN: echo '{"Contexts": [{"Guid":1234, "TotalRootEntryCount": 5, "Counters": [1]}]}' | llvm-ctxprof-util fromYAML --input=- --output=%t_profile.ctxprofdata
 ;
 ; Because we pass a contextual profile with a root defined in this module, we expect the outcome to be the same as-if
 ; we passed -avail-extern-to-local, i.e. available_externally don't get elided and instead get converted to local linkage
-; RUN: opt -passes='assign-guid,require<ctx-prof-analysis>,elim-avail-extern' -use-ctx-profile=%t_profile.ctxprofdata -stats -S 2>&1 < %s | FileCheck %s
+;
+; RUN: opt -passes='assign-guid,require<ctx-prof-analysis>,elim-avail-extern' -use-ctx-profile=%t_profile.ctxprofdata -stats -S 2>&1 %t/1234.ll | FileCheck %s
 
 ; If the profile doesn't apply to this module, available_externally won't get converted to internal linkage, and will be
 ; removed instead.
 ; RUN: echo '{"Contexts": [{"Guid":5678, "TotalRootEntryCount": 3, "Counters": [1]}]}' | llvm-ctxprof-util fromYAML --input=- --output=%t_profile_bad.ctxprofdata
-; RUN: opt -passes='assign-guid,require<ctx-prof-analysis>,elim-avail-extern' -use-ctx-profile=%t_profile_bad.ctxprofdata -stats -S 2>&1 < %s | FileCheck %s --check-prefix=NOOP
+; RUN: opt -passes='assign-guid,require<ctx-prof-analysis>,elim-avail-extern' -use-ctx-profile=%t_profile_bad.ctxprofdata -stats -S %s 2>&1 | FileCheck %s --check-prefix=NOOP
 
 declare void @call_out(ptr %fct)

From f1bb2fe3562d2fe9a43c79ec8af908fdff23a33a Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Mon, 7 Apr 2025 19:55:00 -0700
Subject: [PATCH 0934/1029] [ctxprof] Use `isInSpecializedModule` as criteria
 for using contextual profile (#134468)

After #134340, the availability of contextual profile isn't in itself an
indication of compiling the module containing all the functions covered by
that profile.
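Concretely, the call sites updated below all switch their gating condition in
the same way. The following is an illustrative sketch only, not part of the
patch; `shouldUseCtxProf` is a hypothetical helper introduced here purely to
contrast the old and new criteria:

```cpp
#include "llvm/Analysis/CtxProfAnalysis.h"

using namespace llvm;

// Hypothetical helper, for illustration: how a pass decides whether to take
// the contextual-profile path.
static bool shouldUseCtxProf(const PGOContextualProfile &CtxProf) {
  // Old criterion: some contextual profile was loaded at all.
  //   return !CtxProf.contexts().empty();
  // New criterion: this module is a specialized (context-root-capturing)
  // module, per the module-name-as-GUID convention from #134340.
  return CtxProf.isInSpecializedModule();
}
```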
---
 llvm/lib/Transforms/IPO/ModuleInliner.cpp           | 6 +++---
 llvm/lib/Transforms/Utils/InlineFunction.cpp        | 2 +-
 llvm/test/Analysis/CtxProfAnalysis/handle-select.ll | 8 ++++----
 llvm/test/Analysis/CtxProfAnalysis/inline.ll        | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
index 480de5fe4b553..844e27590e501 100644
--- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp
+++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
@@ -171,8 +171,8 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M,
                  << setIsVerbose();
         });
       }
-    } else if (CtxProfPromoteAlwaysInline && !CtxProf.contexts().empty() &&
-               CB->isIndirectCall()) {
+    } else if (CtxProfPromoteAlwaysInline &&
+               CtxProf.isInSpecializedModule() && CB->isIndirectCall()) {
       CtxProfAnalysis::collectIndirectCallPromotionList(*CB, CtxProf,
                                                         ICPCandidates);
     }
@@ -260,7 +260,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M,
           // iteration because the next iteration may not happen and we may
           // miss inlining it.
           // FIXME: enable for ctxprof.
-          if (CtxProf.contexts().empty())
+          if (!CtxProf.isInSpecializedModule())
             if (tryPromoteCall(*ICB))
               NewCallee = ICB->getCalledFunction();
         }
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 131fbe654c11c..5beee1f681b81 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -2356,7 +2356,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
                                         AAResults *CalleeAAR,
                                         bool InsertLifetime,
                                         Function *ForwardVarArgsTo) {
-  if (CtxProf.contexts().empty())
+  if (!CtxProf.isInSpecializedModule())
     return InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime,
                           ForwardVarArgsTo);
 
diff --git a/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll b/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll
index dfbc5c9e60177..1880672580eb8 100644
--- a/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll
+++ b/llvm/test/Analysis/CtxProfAnalysis/handle-select.ll
@@ -6,9 +6,9 @@
 ; RUN: split-file %s %t
 ; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata
 ;
-; RUN: opt -passes=ctx-instr-gen %t/example.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=INSTR
-; RUN: opt -passes=ctx-instr-gen,module-inline %t/example.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=POST-INL
-; RUN: opt -passes=ctx-instr-gen,module-inline,ctx-prof-flatten %t/example.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=FLATTEN
+; RUN: opt -passes=ctx-instr-gen %t/1234.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=INSTR
+; RUN: opt -passes=ctx-instr-gen,module-inline %t/1234.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=POST-INL
+; RUN: opt -passes=ctx-instr-gen,module-inline,ctx-prof-flatten %t/1234.ll -use-ctx-profile=%t/profile.ctxprofdata -S -o - | FileCheck %s --check-prefix=FLATTEN
 
 ; INSTR-LABEL: yes:
 ; INSTR-NEXT: call void @llvm.instrprof.increment(ptr @foo, i64 [[#]], i32 2, i32 1)
@@ -45,7 +45,7 @@
 ; entry count of that BB is 4.
 ; ![[SELPROF]] = !{!"branch_weights", i32 3, i32 1}
 
-;--- example.ll
+;--- 1234.ll
 define i32 @foo(i32 %t) !guid !0 {
   %test = icmp slt i32 %t, 0
   br i1 %test, label %yes, label %no
diff --git a/llvm/test/Analysis/CtxProfAnalysis/inline.ll b/llvm/test/Analysis/CtxProfAnalysis/inline.ll
index 836ec8b2e8a37..a069acee1c943 100644
--- a/llvm/test/Analysis/CtxProfAnalysis/inline.ll
+++ b/llvm/test/Analysis/CtxProfAnalysis/inline.ll
@@ -3,7 +3,7 @@
 ; RUN: split-file %s %t
 ; RUN: llvm-ctxprof-util fromYAML --input=%t/profile.yaml --output=%t/profile.ctxprofdata
 
-; RUN: opt -passes='module-inline,print<ctx-prof-analysis>' -ctx-profile-printer-level=everything %t/module.ll -S \
+; RUN: opt -passes='module-inline,print<ctx-prof-analysis>' -ctx-profile-printer-level=everything %t/1000.ll -S \
 ; RUN:   -use-ctx-profile=%t/profile.ctxprofdata -ctx-profile-printer-level=yaml \
 ; RUN:   -o - 2> %t/profile-final.yaml | FileCheck %s
 ; RUN: diff %t/profile-final.yaml %t/expected.yaml
@@ -34,7 +34,7 @@
 
 ; Make sure the postlink thinlto pipeline is aware of ctxprof
 ; RUN: opt -passes='thinlto<O2>' -use-ctx-profile=%t/profile.ctxprofdata \
-; RUN:   %t/module.ll -S -o - | FileCheck %s --check-prefix=PIPELINE
+; RUN:   %t/1000.ll -S -o - | FileCheck %s --check-prefix=PIPELINE
 
 ; PIPELINE-LABEL: define i32 @entrypoint
 ; PIPELINE-SAME: !prof ![[ENTRYPOINT_COUNT:[0-9]+]]
@@ -50,7 +50,7 @@
 ; These are the weights of the un-inlined @a, where the counters were 8, 500 (8 for entry, 500 for loop)
 ; PIPELINE: ![[LOOP_BW_ORIG]] = !{!"branch_weights", i32 492, i32 8}
 
-;--- module.ll
+;--- 1000.ll
 define i32 @entrypoint(i32 %x) !guid !0 {
   call void @llvm.instrprof.increment(ptr @entrypoint, i64 0, i32 3, i32 0)
   %t = icmp eq i32 %x, 0

From ac42b083f104442e12393ff09929a1add881010d Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig
Date: Tue, 8 Apr 2025 05:02:57 +0200
Subject: [PATCH 0935/1029] [clang][modules] Guard against bad -fmodule-file
 mappings (#132059) (#133462)

Fix https://github.com/llvm/llvm-project/issues/132059.

Providing incorrect mappings via `-fmodule-file=<name>=<path>` can crash the
compiler when loading a module that imports an incorrectly mapped module.
The crash occurs during AST body deserialization, when the compiler attempts
to resolve remappings using the `ModuleFile` from the incorrectly mapped
module's BMI file. The cause is an invalid access into an incorrectly loaded
`ModuleFile`.

This commit fixes the issue by verifying the identity of the imported module.
---
 clang/lib/Serialization/ASTReader.cpp         | 12 +++++
 clang/test/Modules/fmodule-file-mismatch.cppm | 48 +++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 clang/test/Modules/fmodule-file-mismatch.cppm

diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index d8d77e7f55232..f88019b9a5345 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -3318,6 +3318,18 @@ ASTReader::ReadControlBlock(ModuleFile &F,
                                     Loaded, StoredSize, StoredModTime,
                                     StoredSignature, Capabilities);
 
+      // Check if the AST we just read from ImportedFile contains a different
+      // module than we expected (ImportedName). This can occur for C++20
+      // Modules when given a mismatch via -fmodule-file=<name>=<path>
+      if (IsImportingStdCXXModule) {
+        if (const auto *Imported =
+                getModuleManager().lookupByFileName(ImportedFile);
+            Imported != nullptr && Imported->ModuleName != ImportedName) {
+          Diag(diag::err_failed_to_find_module_file) << ImportedName;
+          Result = Missing;
+        }
+      }
+
       // If we diagnosed a problem, produce a backtrace.
       bool recompilingFinalized =
           Result == OutOfDate && (Capabilities & ARR_OutOfDate) &&
diff --git a/clang/test/Modules/fmodule-file-mismatch.cppm b/clang/test/Modules/fmodule-file-mismatch.cppm
new file mode 100644
index 0000000000000..351f3f4295bf0
--- /dev/null
+++ b/clang/test/Modules/fmodule-file-mismatch.cppm
@@ -0,0 +1,48 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: cd %t

+// Related to issue #132059

+// Precompile the module dependencies correctly
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface a.cppm -o a.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface b.cppm -o b.pcm \
+// RUN:   -fmodule-file=A=a.pcm

+// Verify that providing incorrect mappings via
+// `-fmodule-file=<name>=<path>` does not crash the compiler when loading
+// a module that imports the incorrectly mapped module.
+// RUN: not %clang_cc1 -std=c++20 main1.cpp -fmodule-file=A=b.pcm

+//--- a.cppm
+export module A;

+export int a() {
+  return 41;
+}

+//--- b.cppm
+export module B;
+import A;

+export int b() {
+  return a() + 1;
+}

+//--- main1.cpp
+import A;

+int main() {
+  return a();
+}

+// Test again for the case where the BMI is first loaded correctly
+// RUN: not %clang_cc1 -std=c++20 main2.cpp -fmodule-file=B=b.pcm \
+// RUN:   -fmodule-file=A=b.pcm

+//--- main2.cpp
+import B;

+int main() {
+  return b();
+}

From 49d6e39eba88fdf7975c6f4dcb38595538331e6a Mon Sep 17 00:00:00 2001
From: Shoreshen <372660931@qq.com>
Date: Tue, 8 Apr 2025 11:40:38 +0800
Subject: [PATCH 0936/1029] [AMDGPU] add tests for loop definition of
 bitconvert (#133052)

This PR adds test cases for all types of bit conversion; it prepares for PR:
https://github.com/llvm/llvm-project/pull/132899

All tests pass due to:

1. For DAG, the patterns do not separate SReg and VReg. One of the samples is:
```
define <2 x double> @v_bitcast_v4f32_to_v2f64(<4 x float> inreg %a, i32 %b) {
  %cmp = icmp eq i32 %b, 0
  br i1 %cmp, label %cmp.true, label %cmp.false

cmp.true:
  %a1 = fadd <4 x float> %a, splat (float 1.000000e+00)
  %a2 = bitcast <4 x float> %a1 to <2 x double>
  br label %end

cmp.false:
  %a3 = bitcast <4 x float> %a to <2 x double>
  br label %end

end:
  %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
  ret <2 x double> %phi
}
```
It is supposed to select from the scalar register patterns.
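(A selection trace like the one below is SelectionDAG instruction-selection
debug output; it can presumably be reproduced with an asserts-enabled build of
llc by passing `-debug-only=isel`.)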
But the VReg pattern is matched instead, as follows:
```
Debug log:

ISEL: Starting selection on root node: t3: v2f64 = bitcast t2
ISEL: Starting pattern match
  Initial Opcode index to 440336
  Skipped scope entry (due to false predicate) at index 440339, continuing at 440367
  Skipped scope entry (due to false predicate) at index 440368, continuing at 440396
  Skipped scope entry (due to false predicate) at index 440397, continuing at 440435
  Skipped scope entry (due to false predicate) at index 440436, continuing at 440467
  Skipped scope entry (due to false predicate) at index 440468, continuing at 440499
  Skipped scope entry (due to false predicate) at index 440500, continuing at 440552
  Skipped scope entry (due to false predicate) at index 440553, continuing at 440587
  Skipped scope entry (due to false predicate) at index 440588, continuing at 440622
  Skipped scope entry (due to false predicate) at index 440623, continuing at 440657
  Skipped scope entry (due to false predicate) at index 440658, continuing at 440692
  Skipped scope entry (due to false predicate) at index 440693, continuing at 440727
  Skipped scope entry (due to false predicate) at index 440728, continuing at 440769
  Skipped scope entry (due to false predicate) at index 440770, continuing at 440798
  Skipped scope entry (due to false predicate) at index 440799, continuing at 440836
  Skipped scope entry (due to false predicate) at index 440837, continuing at 440870
  TypeSwitch[v2f64] from 440873 to 440892

Patterns:

/*440892*/ OPC_CompleteMatch, 1, 0,
           // Src: (bitconvert:{ *:[v2f64] } VReg_128:{ *:[v4f32] }:$src0) - Complexity = 3
           // Dst: VReg_128:{ *:[v2f64] }:$src0
```

2. GlobalISel will use `Select_COPY` to select bitcast.
---
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll  | 108541 +++++++++++++++
 .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll   |  11387 ++
 .../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll   |    178 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll    |    556 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll   |   1068 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll   |    194 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll   |  18360 +++
 .../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll   |    209 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll   |  12523 ++
 .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll    |   4926 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll   |    228 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll   |   1338 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll   |    714 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll    |    775 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll   |  41662 ++++++
 .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll   |    295 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll   |    311 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll    |   7969 ++
 .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll   |    328 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll   |    344 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll   |    361 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll   |    377 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll   |    394 +
 .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll    |   6092 +
 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll    |  33893 -----
 .../test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll |     68 +
 26 files changed, 219198 insertions(+), 33893 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll
 create mode 100644
llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll create mode 100644 llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll new file mode 100644 index 0000000000000..51dc5ceb82b41 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -0,0 +1,108541 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i32_to_v32f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; 
GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: 
v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, 
%cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f32_to_v32i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; 
VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; 
GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i32_to_v16i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: 
v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i64_to_v32i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_addc_u32_e32 
v29, vcc, 0, v29, vcc +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; 
GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB3_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i64> %a, splat (i64 3)
+ %a2 = bitcast <16 x i64> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i64> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32i32_to_v16f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB4_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32i32_to_v16f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB4_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31
+; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30
+; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29
+; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
+; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27
+; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26
+; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25
+; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24
+; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23
+; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22
+; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21
+; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20
+; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19
+; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18
+; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17
+; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16
+; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB4_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32i32_to_v16f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB4_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v31, 3, v31
+; GFX9-NEXT: v_add_u32_e32 v30, 3, v30
+; GFX9-NEXT: v_add_u32_e32 v29, 3, v29
+; GFX9-NEXT: v_add_u32_e32 v28, 3, v28
+; GFX9-NEXT: v_add_u32_e32 v27, 3, v27
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
+; GFX9-NEXT: v_add_u32_e32 v25, 3, v25
+; GFX9-NEXT: v_add_u32_e32 v24, 3, v24
+; GFX9-NEXT: v_add_u32_e32 v23, 3, v23
+; GFX9-NEXT: v_add_u32_e32 v22, 3, v22
+; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
+; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
+; GFX9-NEXT: v_add_u32_e32 v19, 3, v19
+; GFX9-NEXT: v_add_u32_e32 v18, 3, v18
+; GFX9-NEXT: v_add_u32_e32 v17, 3, v17
+; GFX9-NEXT: v_add_u32_e32 v16, 3, v16
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: v_add_u32_e32 v14, 3, v14
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: .LBB4_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32i32_to_v16f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB4_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30
+; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB4_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <16 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <16 x double>
+ br label %end
+
+end:
+ %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x double> %phi
+}
+
+define <32 x i32> @bitcast_v16f64_to_v32i32(<16 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f64_to_v32i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB5_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: .LBB5_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16f64_to_v32i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB5_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: .LBB5_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16f64_to_v32i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB5_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB5_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16f64_to_v32i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB5_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB5_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <16 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <16 x double> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x double> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32i32_to_v128i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; kill: killed $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 +; GCN-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 
; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; GCN-NEXT: .LBB6_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, 
v59, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_alignbit_b32 v31, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GCN-NEXT: buffer_store_dword 
v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GCN-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; GCN-NEXT: .LBB6_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GCN-NEXT: v_or_b32_e32 v1, v1, v52 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_or_b32_e32 v2, v2, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v31, v50, v31 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v50, v51, v50 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v31 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v50 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; GCN-NEXT: v_or_b32_e32 v31, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; GCN-NEXT: v_or_b32_e32 v49, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; GCN-NEXT: v_or_b32_e32 v2, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 +; GCN-NEXT: v_or_b32_e32 v61, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 +; GCN-NEXT: v_or_b32_e32 v62, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 +; GCN-NEXT: v_or_b32_e32 v7, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 +; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 8, v47 +; GCN-NEXT: v_or_b32_e32 v8, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 +; GCN-NEXT: v_or_b32_e32 v9, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 +; GCN-NEXT: v_or_b32_e32 v10, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 +; GCN-NEXT: v_or_b32_e32 v11, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 +; GCN-NEXT: v_or_b32_e32 v12, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GCN-NEXT: v_or_b32_e32 v13, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; GCN-NEXT: v_or_b32_e32 v14, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; GCN-NEXT: v_or_b32_e32 v15, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 +; GCN-NEXT: v_or_b32_e32 v16, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v17, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 +; GCN-NEXT: v_or_b32_e32 v18, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v19, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 +; GCN-NEXT: v_or_b32_e32 v20, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v21, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GCN-NEXT: v_or_b32_e32 v22, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v23, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GCN-NEXT: v_or_b32_e32 v24, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v25, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; GCN-NEXT: v_or_b32_e32 v26, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v27, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 +; GCN-NEXT: v_or_b32_e32 v28, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GCN-NEXT: v_or_b32_e32 v29, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v32, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v30, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v33, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v34, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v35, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v36, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v37, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v38, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v39, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v48, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v50, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v51, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v52, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v53, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v54, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v55, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v40, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v41, v3, v1 +; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v42, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v43, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v44, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v45, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v46, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v47, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v56, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v57, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 
v58, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v59, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v60, v3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v4, v1, v29 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GCN-NEXT: v_or_b32_e32 v5, v1, v32 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v63, v2, v30 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 +; GCN-NEXT: v_or_b32_e32 v61, v3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 +; GCN-NEXT: v_or_b32_e32 v6, v6, v34 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v35 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v29, v29, v36 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v30, v30, v37 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v38 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v7, v39 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v8, v8, v48 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v9, v9, v50 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v51 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v11, v52 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v12, v12, v53 +; 
GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v13, v13, v54 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v14, v14, v55 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v15, v15, v40 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v16, v16, v41 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v17, v17, v42 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GCN-NEXT: v_or_b32_e32 v18, v18, v43 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v19, v19, v44 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GCN-NEXT: v_or_b32_e32 v20, v20, v45 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GCN-NEXT: v_or_b32_e32 v21, v21, v46 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GCN-NEXT: v_or_b32_e32 v22, v22, v47 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GCN-NEXT: v_or_b32_e32 v23, v23, v56 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; GCN-NEXT: v_or_b32_e32 v24, v24, v57 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GCN-NEXT: v_or_b32_e32 v25, v25, v58 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v26, v26, v59 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GCN-NEXT: v_or_b32_e32 v27, v27, v60 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen +; 
GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v128i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], 
s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 
+; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: 
$vgpr44 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; 
VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: .LBB6_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] 
+; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: .LBB6_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v57 +; VI-NEXT: v_or_b32_sdwa v2, v2, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v41 +; VI-NEXT: v_or_b32_sdwa v48, v53, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; 
VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58
+; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
+; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32i32_to_v128i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; kill: killed $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; kill: killed $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB6_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
+; GFX9-NEXT: .LBB6_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB6_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: v_add_u32_e32 v32, 3, v32
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: v_add_u32_e32 v31, 3, v31
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX9-NEXT: v_add_u32_e32 v30, 3, v30
+; GFX9-NEXT: v_add_u32_e32 v29, 3, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
+; GFX9-NEXT: v_add_u32_e32 v28, 3, v28
+; GFX9-NEXT: v_add_u32_e32 v27, 3, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
+; GFX9-NEXT: v_add_u32_e32 v25, 3, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
+; GFX9-NEXT: v_add_u32_e32 v24, 3, v24
+; GFX9-NEXT: v_add_u32_e32 v23, 3, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
+; GFX9-NEXT: v_add_u32_e32 v22, 3, v22
+; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
+; GFX9-NEXT: v_add_u32_e32 v19, 3, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; GFX9-NEXT: v_add_u32_e32 v18, 3, v18
+; GFX9-NEXT: v_add_u32_e32 v17, 3, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
+; GFX9-NEXT: v_add_u32_e32 v16, 3, v16
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
+; GFX9-NEXT: v_add_u32_e32 v14, 3, v14
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1
+; GFX9-NEXT: .LBB6_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v52
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v36
+; GFX9-NEXT: v_or_b32_sdwa v12, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v50
+; GFX9-NEXT: v_or_b32_sdwa v13, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v15, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v57
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v56
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v39
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v47
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v46
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v45
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v55
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v54
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX9-NEXT: v_or_b32_sdwa v9, v9, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v41
+; GFX9-NEXT: v_or_b32_sdwa v33, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword
v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v128i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr179 
+; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, 
v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-NEXT: .LBB6_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB6_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_nc_u32_e32 v32, 3, v32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v101, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-NEXT: .LBB6_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v63 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v65 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v74 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v61 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v39, v55, v39 +; 
GFX11-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-NEXT: v_or_b32_e32 v65, v67, v65 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v58 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v59 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v65 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v56 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v65 +; GFX11-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v47 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v46 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v45 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v64 +; GFX11-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v44 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v43 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v42 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v41 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v40 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v55 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_or_b32_e32 v54, v64, v65 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v182 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v181 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v55 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v65 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v178 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v167 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v166 +; GFX11-NEXT: v_or_b32_e32 v53, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v55 +; GFX11-NEXT: v_or_b32_e32 v52, v64, v52 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v65 +; GFX11-NEXT: v_or_b32_e32 v54, v66, 
v67 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v10, v53 +; GFX11-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-NEXT: v_or_b32_e32 v4, v12, v54 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v6, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v160 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v150 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v147 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v39, v49 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v145 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v144 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v132 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v130 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v119 +; GFX11-NEXT: 
v_and_b32_e32 v38, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v117 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v103 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v102 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v101 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v100 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v99 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v83 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: 
v_and_b32_e32 v25, 0xff, v82 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v81 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b16 v28, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v68 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v128i8_to_v32i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, 
s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:764 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:380 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v39 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v56 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v47 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v54 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v46 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v37 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v43 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GCN-NEXT: v_or_b32_e32 v8, v8, v45 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v9, 
v53 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v10, v42 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v11, v41 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v12, v40 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v13, v63 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v50 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_or_b32_e32 v18, v18, v51 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_or_b32_e32 v19, v19, v49 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_or_b32_e32 v20, v20, v60 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_or_b32_e32 v21, v21, v58 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v22, v22, v62 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_or_b32_e32 v23, v23, v59 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_or_b32_e32 v25, v25, v61 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_or_b32_e32 v26, v26, v34 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v27, v27, v52 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_or_b32_e32 v28, v28, v35 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GCN-NEXT: v_or_b32_e32 v29, v29, v33 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v30, v30, v44 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 +; GCN-NEXT: v_or_b32_e32 v31, v31, v36 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v50, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v51, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v33, v34, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v36, v35 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v37, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v37, v38, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v39, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v48, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v48, v55, v48 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v49, v54, v49 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v18, v18, v50 +; GCN-NEXT: v_or_b32_e32 v19, v19, v51 +; GCN-NEXT: v_or_b32_e32 v20, v20, v52 +; GCN-NEXT: v_or_b32_e32 v21, v21, v53 +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: v_or_b32_e32 v23, v23, v33 +; GCN-NEXT: v_or_b32_e32 v24, v24, v34 +; GCN-NEXT: v_or_b32_e32 v25, v25, v35 +; GCN-NEXT: v_or_b32_e32 v26, v26, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v37 +; GCN-NEXT: v_or_b32_e32 v28, v28, v38 +; GCN-NEXT: v_or_b32_e32 v29, v29, v39 +; GCN-NEXT: v_or_b32_e32 v30, v30, v48 +; GCN-NEXT: v_or_b32_e32 v31, v31, v49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; 
implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: 
killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; kill: killed $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: .LBB7_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v39, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v56, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v38, v2 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v47, v3 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v54, v4 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v46, v5 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload 
+; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v37, v6 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v43, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_or_b32_e32 v8, v45, v8 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v53, v9 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v42, v10 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v41, v11 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v40, v12 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v63, v13 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v50, v14 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v0, v15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v0, v16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v0, v17 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_or_b32_e32 v18, v51, v18 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_or_b32_e32 v19, v49, v19 +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_or_b32_e32 v20, v60, v20 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_or_b32_e32 v21, v58, v21 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v25, v62, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v29, v59, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v37, v32, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v50, v61, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v41, v34, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v45, v52, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v56, v35, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v58, v33, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v59, v44, v22 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v57, v36, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v60, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v61, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v62, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v63, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v49, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v51, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, 
v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v54, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v26, v24 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v28, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v30, v28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v31, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v32, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v33, v34, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v48, v35 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v48, v53, v48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_mov_b32_e32 v0, v55 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v55, v53 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v55, v40, v55 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v40, v42, v40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_or_b32_e32 v42, v43, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v43, v44, v43 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v44, v46, v44 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; GCN-NEXT: v_or_b32_e32 v46, v0, v46 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v47, v0, v47 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v60, v0 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v61, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v62, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v63, v3 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 +; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 +; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 +; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 +; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 +; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 +; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 
0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 +; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 +; GCN-NEXT: v_or_b32_e32 v4, v36, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v39, v6 +; GCN-NEXT: v_or_b32_e32 v7, v49, v7 +; GCN-NEXT: v_or_b32_e32 v8, v51, v8 +; GCN-NEXT: v_or_b32_e32 v9, v52, v9 +; GCN-NEXT: v_or_b32_e32 v10, v54, v10 +; GCN-NEXT: v_or_b32_e32 v11, v22, v11 +; GCN-NEXT: v_or_b32_e32 v12, v23, v12 +; GCN-NEXT: v_or_b32_e32 v13, v24, v13 +; GCN-NEXT: v_or_b32_e32 v14, v26, v14 +; GCN-NEXT: v_or_b32_e32 v15, v27, v15 +; GCN-NEXT: v_or_b32_e32 v16, v28, v16 +; GCN-NEXT: v_or_b32_e32 v17, v30, v17 +; GCN-NEXT: v_or_b32_e32 v18, v31, v18 +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: v_or_b32_e32 v20, v33, v20 +; GCN-NEXT: v_or_b32_e32 v21, v34, v21 +; GCN-NEXT: v_or_b32_e32 v22, v35, v25 +; GCN-NEXT: v_or_b32_e32 v23, v48, v29 +; GCN-NEXT: v_or_b32_e32 v24, v53, v37 +; GCN-NEXT: v_or_b32_e32 v25, v55, v50 +; GCN-NEXT: v_or_b32_e32 v26, v40, v41 +; GCN-NEXT: v_or_b32_e32 v27, v42, v45 +; GCN-NEXT: v_or_b32_e32 v28, v43, v56 +; GCN-NEXT: v_or_b32_e32 v29, v44, v58 +; GCN-NEXT: v_or_b32_e32 v30, v46, v59 +; GCN-NEXT: v_or_b32_e32 v31, v47, v57 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: 
v_add_i32_e32 v29, vcc, s6, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; GCN-NEXT: .LBB7_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, 
v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: 
buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload 
+; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload 
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: 
killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: 
killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB7_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB7_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v31, 0x300 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v9, 3, v61 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; 
VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v59 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 3, v62 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 +; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v58 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v60 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 +; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v46 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v45 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v44 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v54 +; VI-NEXT: v_add_u16_e32 v15, 3, v40 +; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: 
v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v18, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v18, 3, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v18, 0x300, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v19, 0x300, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v20, 0x300, v20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa 
v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v23, 3, v23 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v23, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v25, 3, v25 +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v25, 3, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v26, 3, v26 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v26, 3, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v27, 3, v27 +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v27, 3, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v28, 3, v28 +; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v28, 3, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v29, 3, v29 +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v29, 3, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v30, 3, v30 +; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v30, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v32 +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v32, 3, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v33, 3, v33 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: .LBB7_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; GFX9-NEXT: 
buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], 
s32 offset:360 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; 
implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte 
Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 
4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: .LBB7_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB7_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GFX9-NEXT: s_movk_i32 s6, 0x300
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: v_add_u16_e32 v1, 3, v1
+; GFX9-NEXT: s_waitcnt vmcnt(27)
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v2
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2
+; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_add_u16_e32 v4, 3, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(19)
+; GFX9-NEXT: v_add_u16_e32 v5, 3, v5
+; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: v_add_u16_e32 v6, 3, v6
+; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(12)
+; GFX9-NEXT: v_add_u16_e32 v7, 3, v7
+; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_add_u16_e32 v8, 3, v8
+; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v2
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2
+; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3
+; GFX9-NEXT: v_or_b32_e32 v3, v3, v4
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v4, 3, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4
+; GFX9-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v5, 3, v5
+; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5
+; GFX9-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v6, 3, v6
+; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6
+; GFX9-NEXT: v_or_b32_e32 v6, v6, v7
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v7, 3, v7
+; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
+; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX9-NEXT: v_add_u16_e32 v8, 3, v63
+; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8
+; GFX9-NEXT: v_or_b32_e32 v8, v8, v9
+; GFX9-NEXT: v_add_u16_e32 v9, 3, v59
+; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v10, 3, v62
+; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9
+; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX9-NEXT: v_add_u16_e32 v10, 3, v58
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v11, 3, v60
+; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10
+; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX9-NEXT: v_add_u16_e32 v11, 3, v57
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v12, 3, v56
+; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11
+; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v11, v11, v12
+; GFX9-NEXT: v_add_u16_e32 v12, 3, v46
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v13, 3, v47
+; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12
+; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX9-NEXT: v_add_u16_e32 v13, 3, v45
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v14, 3, v44
+; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13
+; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v13, v13, v14
+; GFX9-NEXT: v_add_u16_e32 v14, 3, v54
+; GFX9-NEXT: v_add_u16_e32 v15, 3, v40
+; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14
+; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v14, v14, v15
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v15, 3, v15
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v16, 3, v16
+; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v16, 3, v16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v17, 3, v17
+; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v16, v16, v17
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v17, 3, v17
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v18, 3, v18
+; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v17, v17, v18
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v18, 3, v18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v19, 3, v19
+; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v18, v18, v19
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v19, 3, v19
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v20, 3, v20
+; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v19, v19, v20
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v20, 3, v20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v21, 3, v21
+; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v21, 3, v21
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v22, 3, v22
+; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v21, v21, v22
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v22, 3, v22
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v23, 3, v23
+; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v23, 3, v23
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v24, 3, v24
+; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v23, v23, v24
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v24, 3, v24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v25, 3, v25
+; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v24, v24, v25
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v25, 3, v25
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v26, 3, v26
+; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v25, v25, v26
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v26, 3, v26
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v28, 3, v28
+; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v27, v27, v28
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v28, 3, v28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v29, 3, v29
+; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v29, 3, v29
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v30, 3, v30
+; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v29, v29, v30
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v30, 3, v30
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v31, 3, v31
+; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v30, v30, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v31, 3, v31
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v32, 3, v32
+; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX9-NEXT: .LBB7_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v128i8_to_v32i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:592
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:588
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:584
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:580
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:576
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:572
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:568
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:564
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:560
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:556
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:552
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:548
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:544
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:540
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:536
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:532
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:528
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:524
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:520
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:516
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:512
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:508
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:504
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:500
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:496
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:492
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:488
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:484
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:480
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:476
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:468
+; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:456
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:392
+; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:384
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:380
+; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:376
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:372
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:368
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:364
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:360
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:356
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:352
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:348
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:344
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:340
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:336
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:332
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:328
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:324
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:320
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:260
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:240
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:216
+; GFX11-NEXT: scratch_load_b32 v114, off, s32 offset:388
+; GFX11-NEXT: scratch_load_u16 v115, off, s32
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:152
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v138, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:132
+; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v41, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v42, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:20
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b16 v124, 8, v1
+; GFX11-NEXT: v_lshlrev_b16 v125, 8, v3
+; GFX11-NEXT: v_lshlrev_b16 v126, 8, v5
+; GFX11-NEXT: v_lshlrev_b16 v127, 8, v7
+; GFX11-NEXT: v_lshlrev_b16 v111, 8, v9
+; GFX11-NEXT: v_lshlrev_b16 v120, 8, v11
+; GFX11-NEXT: v_lshlrev_b16 v121, 8, v13
+; GFX11-NEXT: v_lshlrev_b16 v122, 8, v15
+; GFX11-NEXT: v_lshlrev_b16 v123, 8, v17
+; GFX11-NEXT: v_lshlrev_b16 v106, 8, v19
+; GFX11-NEXT: v_lshlrev_b16 v107, 8, v21
+; GFX11-NEXT: v_lshlrev_b16 v108, 8, v23
+; GFX11-NEXT: v_lshlrev_b16 v109, 8, v25
+; GFX11-NEXT: v_lshlrev_b16 v110, 8, v27
+; GFX11-NEXT: v_lshlrev_b16 v93, 8, v29
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b16 v94, 8, v115
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b16 v95, 8, v116
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b16 v104, 8, v117
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b16 v105, 8, v118
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b16 v79, 8, v119
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b16 v88, 8, v128
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b16 v89, 8, v129
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b16 v90, 8, v130
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b16 v91, 8, v131
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b16 v61, 8, v144
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b16 v62, 8, v145
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b16 v63, 8, v146
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b16 v72, 8, v147
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b16 v73, 8, v148
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b16 v45, 8, v162
+; GFX11-NEXT: s_waitcnt vmcnt(38)
+; GFX11-NEXT: v_lshlrev_b16 v46, 8, v163
+; GFX11-NEXT: s_waitcnt vmcnt(37)
+; GFX11-NEXT: v_lshlrev_b16 v47, 8, v164
+; GFX11-NEXT: s_waitcnt vmcnt(36)
+; GFX11-NEXT: v_lshlrev_b16 v56, 8, v165
+; GFX11-NEXT: s_waitcnt vmcnt(35)
+; GFX11-NEXT: v_lshlrev_b16 v57, 8, v166
+; GFX11-NEXT: s_waitcnt vmcnt(34)
+; GFX11-NEXT: v_lshlrev_b16 v179, 8, v179
+; GFX11-NEXT: s_waitcnt vmcnt(33)
+; GFX11-NEXT: v_lshlrev_b16 v180, 8, v180
+; GFX11-NEXT: s_waitcnt vmcnt(32)
+; GFX11-NEXT: v_lshlrev_b16 v181, 8, v181
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_lshlrev_b16 v182, 8, v182
+; GFX11-NEXT: s_waitcnt vmcnt(30)
+; GFX11-NEXT: v_lshlrev_b16 v183, 8, v183
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_lshlrev_b16 v162, 8, v136
+; GFX11-NEXT: s_waitcnt vmcnt(28)
+; GFX11-NEXT: v_lshlrev_b16 v163, 8, v137
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_lshlrev_b16 v164, 8, v138
+; GFX11-NEXT: v_lshlrev_b16 v165, 8, v103
+; GFX11-NEXT: v_lshlrev_b16 v166, 8, v102
+; GFX11-NEXT: v_lshlrev_b16 v144, 8, v101
+; GFX11-NEXT: v_lshlrev_b16 v145, 8, v100
+; GFX11-NEXT: v_lshlrev_b16 v146, 8, v99
+; GFX11-NEXT: v_lshlrev_b16 v147, 8, v31
+; GFX11-NEXT: v_lshlrev_b16 v148, 8, v30
+; GFX11-NEXT: v_lshlrev_b16 v119, 8, v28
+; GFX11-NEXT: v_lshlrev_b16 v128, 8, v26
+; GFX11-NEXT: v_lshlrev_b16 v129, 8, v24
+; GFX11-NEXT: v_lshlrev_b16 v130, 8, v22
+; GFX11-NEXT: v_lshlrev_b16 v131, 8, v20
+; GFX11-NEXT: v_lshlrev_b16 v114, 8, v18
+; GFX11-NEXT: v_lshlrev_b16 v115, 8, v16
+; GFX11-NEXT: v_lshlrev_b16 v116, 8, v14
+; GFX11-NEXT: v_lshlrev_b16 v117, 8, v12
+; GFX11-NEXT: v_lshlrev_b16 v118, 8, v10
+; GFX11-NEXT: v_lshlrev_b16 v99, 8, v8
+; GFX11-NEXT: v_lshlrev_b16 v100, 8, v6
+; GFX11-NEXT: v_lshlrev_b16 v101, 8, v4
+; GFX11-NEXT: v_lshlrev_b16 v102, 8, v2
+; GFX11-NEXT: v_lshlrev_b16 v103, 8, v0
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v53
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v124
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v125
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v126
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v127
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v39
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v37
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v36
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v34
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v111
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v121
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v120
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v122
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v123
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v107
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v108
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v109
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v110
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v106
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v12
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v33
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v92
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v77
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v76
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v75
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v74
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v60
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v59
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v93
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v94
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v95
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v104
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v105
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v79
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v88
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v89
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v90
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v91
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v58
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v44
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v43
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v42
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v41
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v40
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v178
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v177
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v176
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v167
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v61
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v62
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v63
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v72
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v73
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v45
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v46
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v47
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v56
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v57
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v13, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v14, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v15, v18, v19
+; GFX11-NEXT: v_or_b32_e32 v16, v20, v21
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v161
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v160
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v151
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v150
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v135
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v134
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v133
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v113
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v179
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v180
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v181
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v182
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v183
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v162
+; GFX11-NEXT: v_or_b32_e32 v23, v23, v163
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v164
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v165
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v166
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v18
+; GFX11-NEXT: v_or_b32_e32 v18, v19, v20
+; GFX11-NEXT: v_or_b32_e32 v19, v21, v22
+; GFX11-NEXT: v_or_b32_e32 v20, v23, v24
+; GFX11-NEXT: v_or_b32_e32 v21, v25, v26
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v112
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v98
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v97
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v96
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v87
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v86
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v84
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v83
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v82
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v144
+; GFX11-NEXT: v_or_b32_e32 v23, v23, v145
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v146
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v147
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v148
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v119
+; GFX11-NEXT: v_or_b32_e32 v28, v28, v128
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v129
+; GFX11-NEXT: v_or_b32_e32 v30, v30, v130
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v131
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v23, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v24, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v25, v28, v29
+; GFX11-NEXT: v_or_b32_e32 v26, v30, v31
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v80
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v70
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v68
+; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v67
+; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v66
+; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v65
+; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v64
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v114
+; GFX11-NEXT: v_or_b32_e32 v28, v28, v115
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v116
+; GFX11-NEXT: v_or_b32_e32 v30, v30, v117
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v118
+; GFX11-NEXT: v_or_b32_e32 v32, v32, v99
+; GFX11-NEXT: v_or_b32_e32 v33, v33, v100
+; GFX11-NEXT: v_or_b32_e32 v34, v34, v101
+; GFX11-NEXT: v_or_b32_e32 v35, v35, v102
+; GFX11-NEXT: v_or_b32_e32 v36, v36, v103
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v28
+; GFX11-NEXT: v_or_b32_e32 v28, v29, v30
+; GFX11-NEXT: v_or_b32_e32 v29, v31, v32
+; GFX11-NEXT: v_or_b32_e32 v30, v33, v34
+; GFX11-NEXT: v_or_b32_e32 v31, v35, v36
+; GFX11-NEXT: ; implicit-def: $vgpr55
+; GFX11-NEXT: ; implicit-def: $vgpr54
+; GFX11-NEXT: ; implicit-def: $vgpr53
+; GFX11-NEXT: ; implicit-def: $vgpr52
+; GFX11-NEXT: ; implicit-def: $vgpr51
+; GFX11-NEXT: ; implicit-def: $vgpr50
+; GFX11-NEXT: ; implicit-def: $vgpr49
+; GFX11-NEXT: ; implicit-def: $vgpr48
+; GFX11-NEXT: ; implicit-def: $vgpr39
+; GFX11-NEXT: ; implicit-def: $vgpr38
+; GFX11-NEXT: ; implicit-def: $vgpr37
+; GFX11-NEXT: ; implicit-def: $vgpr36
+; GFX11-NEXT: ; implicit-def: $vgpr35
+; GFX11-NEXT: ; implicit-def: $vgpr34
+; GFX11-NEXT: ; implicit-def: $vgpr33
+; GFX11-NEXT: ; implicit-def: $vgpr32
+; GFX11-NEXT: ; implicit-def: $vgpr92
+; GFX11-NEXT: ; implicit-def: $vgpr78
+; GFX11-NEXT: ; implicit-def: $vgpr77
+; GFX11-NEXT: ; implicit-def: $vgpr76
+; GFX11-NEXT: ; implicit-def: $vgpr75
+; GFX11-NEXT: ; implicit-def: $vgpr74
+; GFX11-NEXT: ; implicit-def: $vgpr60
+; GFX11-NEXT: ; implicit-def: $vgpr59
+; GFX11-NEXT: ; implicit-def: $vgpr58
+; GFX11-NEXT: ; implicit-def: $vgpr44
+; GFX11-NEXT: ; implicit-def: $vgpr43
+; GFX11-NEXT: ; implicit-def: $vgpr42
+; GFX11-NEXT: ; implicit-def: $vgpr41
+; GFX11-NEXT: ; implicit-def: $vgpr40
+; GFX11-NEXT: ; implicit-def: $vgpr178
+; GFX11-NEXT: ; implicit-def: $vgpr177
+; GFX11-NEXT: ; implicit-def: $vgpr176
+; GFX11-NEXT: ; implicit-def: $vgpr167
+; GFX11-NEXT: ; implicit-def: $vgpr161
+; GFX11-NEXT: ; implicit-def: $vgpr160
+; GFX11-NEXT: ; implicit-def: $vgpr151
+; GFX11-NEXT: ; implicit-def: $vgpr150
+; GFX11-NEXT: ; implicit-def: $vgpr149
+; GFX11-NEXT: ; implicit-def: $vgpr135
+; GFX11-NEXT: ; implicit-def: $vgpr134
+; GFX11-NEXT: ; implicit-def: $vgpr133
+; GFX11-NEXT: ; implicit-def: $vgpr132
+; GFX11-NEXT: ; implicit-def: $vgpr113
+; GFX11-NEXT: ; implicit-def: $vgpr112
+; GFX11-NEXT: ; implicit-def: $vgpr98
+; GFX11-NEXT: ; implicit-def: $vgpr97
+; GFX11-NEXT: ; implicit-def: $vgpr96
+; GFX11-NEXT: ; implicit-def: $vgpr87
+; GFX11-NEXT: ; implicit-def: $vgpr86
+; GFX11-NEXT: ; implicit-def: $vgpr85
+; GFX11-NEXT: ; implicit-def: $vgpr84
+; GFX11-NEXT: ; implicit-def: $vgpr83
+; GFX11-NEXT: ; implicit-def: $vgpr82
+; GFX11-NEXT: ; implicit-def: $vgpr81
+; GFX11-NEXT: ; implicit-def: $vgpr80
+; GFX11-NEXT: ; implicit-def: $vgpr71
+; GFX11-NEXT: ; implicit-def: $vgpr70
+; GFX11-NEXT: ; implicit-def: $vgpr69
+; GFX11-NEXT: ; implicit-def: $vgpr68
+; GFX11-NEXT: ; implicit-def: $vgpr67
+; GFX11-NEXT: ; implicit-def: $vgpr66
+; GFX11-NEXT: ; implicit-def: $vgpr65
+; GFX11-NEXT: ; implicit-def: $vgpr64
+; GFX11-NEXT: ; implicit-def: $vgpr124
+; GFX11-NEXT: ; implicit-def: $vgpr125
+; GFX11-NEXT: ; implicit-def: $vgpr126
+; GFX11-NEXT: ; implicit-def: $vgpr127
+; GFX11-NEXT: ; implicit-def: $vgpr111
+; GFX11-NEXT: ; implicit-def: $vgpr120
+; GFX11-NEXT: ; implicit-def: $vgpr121
+; GFX11-NEXT: ; implicit-def: $vgpr122
+; GFX11-NEXT: ; implicit-def: $vgpr123
+; GFX11-NEXT: ; implicit-def: $vgpr106
+; GFX11-NEXT: ; implicit-def: $vgpr107
+; GFX11-NEXT: ; implicit-def: $vgpr108
+; GFX11-NEXT: ; implicit-def: $vgpr109
+; GFX11-NEXT: ; implicit-def: $vgpr110
+; GFX11-NEXT: ; implicit-def: $vgpr93
+; GFX11-NEXT: ; implicit-def: $vgpr94
+; GFX11-NEXT: ; implicit-def: $vgpr95
+; GFX11-NEXT: ; implicit-def: $vgpr104
+; GFX11-NEXT: ; implicit-def: $vgpr105
+; GFX11-NEXT: ; implicit-def: $vgpr79
+; GFX11-NEXT: ; implicit-def: $vgpr88
+; GFX11-NEXT: ; implicit-def: $vgpr89
+; GFX11-NEXT: ; implicit-def: $vgpr90
+; GFX11-NEXT: ; implicit-def: $vgpr91
+; GFX11-NEXT: ; implicit-def: $vgpr61
+; GFX11-NEXT: ; implicit-def: $vgpr62
+; GFX11-NEXT: ; implicit-def: $vgpr63
+; GFX11-NEXT: ; implicit-def: $vgpr72
+; GFX11-NEXT: ; implicit-def: $vgpr73
+; GFX11-NEXT: ; implicit-def: $vgpr45
+; GFX11-NEXT: ; implicit-def: $vgpr46
+; GFX11-NEXT: ; implicit-def: $vgpr47
+; GFX11-NEXT: ; implicit-def: $vgpr56
+; GFX11-NEXT: ; implicit-def: $vgpr57
+; GFX11-NEXT: ; implicit-def: $vgpr179
+; GFX11-NEXT: ; implicit-def: $vgpr180
+; GFX11-NEXT: ; implicit-def: $vgpr181
+; GFX11-NEXT: ; implicit-def: $vgpr182
+; GFX11-NEXT: ; implicit-def: $vgpr183
+; GFX11-NEXT: ; implicit-def: $vgpr162
+; GFX11-NEXT: ; implicit-def: $vgpr163
+; GFX11-NEXT: ; implicit-def: $vgpr164
+; GFX11-NEXT: ; implicit-def: $vgpr165
+; GFX11-NEXT: ; implicit-def: $vgpr166
+; GFX11-NEXT: ; implicit-def: $vgpr144
+; GFX11-NEXT: ; implicit-def: $vgpr145
+; GFX11-NEXT: ; implicit-def: $vgpr146
+; GFX11-NEXT: ; implicit-def: $vgpr147
+; GFX11-NEXT: ; implicit-def: $vgpr148
+; GFX11-NEXT: ; implicit-def: $vgpr119
+; GFX11-NEXT: ; implicit-def: $vgpr128
+; GFX11-NEXT: ; implicit-def: $vgpr129
+; GFX11-NEXT: ; implicit-def: $vgpr130
+; GFX11-NEXT: ; implicit-def: $vgpr131
+; GFX11-NEXT: ; implicit-def: $vgpr114
+; GFX11-NEXT: ; implicit-def: $vgpr115
+; GFX11-NEXT: ; implicit-def: $vgpr116
+; GFX11-NEXT: ; implicit-def: $vgpr117
+; GFX11-NEXT: ; implicit-def: $vgpr118
+; GFX11-NEXT: ; implicit-def: $vgpr99
+; GFX11-NEXT: ; implicit-def: $vgpr100
+; GFX11-NEXT: ; implicit-def: $vgpr101
+; GFX11-NEXT: ; implicit-def: $vgpr102
+; GFX11-NEXT: ; implicit-def: $vgpr103
+; GFX11-NEXT: .LBB7_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB7_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u16 v0, v55, 3
+; GFX11-NEXT: v_add_nc_u16 v1, v54, 3
+; GFX11-NEXT: v_add_nc_u16 v2, v53, 3
+; GFX11-NEXT: v_add_nc_u16 v3, v52, 3
+; GFX11-NEXT: v_add_nc_u16 v4, v51, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u16 v5, v50, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v124, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v125, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v126, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v127, v3
+; GFX11-NEXT: v_add_nc_u16 v6, v49, 3
+; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v7, v48, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_add_nc_u16 v8, v37, 3
+; GFX11-NEXT: v_add_nc_u16 v9, v36, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u16 v10, v35, 3
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_add_nc_u16 v2, v39, 3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v11, v34, 3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v123, v2
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v111, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v120, v4
+; GFX11-NEXT: v_or_b32_e32 v5, v121, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v122, v6
+; GFX11-NEXT: v_or_b32_e32 v8, v107, v8
+; GFX11-NEXT: v_or_b32_e32 v9, v108, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v109, v10
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-NEXT: v_add_nc_u16 v2, v38, 3
+; GFX11-NEXT: v_or_b32_e32 v11, v110, v11
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4
+; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v106, v2
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v4, v7, v12
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_add_nc_u16 v7, v33, 3
+; GFX11-NEXT: v_add_nc_u16 v8, v32, 3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u16 v9, v92, 3
+; GFX11-NEXT: v_add_nc_u16 v10, v78, 3
+; GFX11-NEXT: v_add_nc_u16 v11, v77, 3
+; GFX11-NEXT: v_add_nc_u16 v12, v76, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v75, 3
+; GFX11-NEXT: v_add_nc_u16 v14, v74, 3
+; GFX11-NEXT: v_add_nc_u16 v15, v60, 3
+; GFX11-NEXT: v_add_nc_u16 v16, v59, 3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v93, v7
+; GFX11-NEXT: v_or_b32_e32 v8, v94, v8
+; GFX11-NEXT: v_or_b32_e32 v9, v95, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v104, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v105, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v79, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v88, v13
+; GFX11-NEXT: v_or_b32_e32 v14, v89, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v90, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v91, v16
+; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_add_nc_u16 v12, v58, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v44, 3
+; GFX11-NEXT: v_add_nc_u16 v14, v43, 3
+; GFX11-NEXT: v_add_nc_u16 v15, v42, 3
+; GFX11-NEXT: v_add_nc_u16 v16, v41, 3
+; GFX11-NEXT: v_add_nc_u16 v17, v40, 3
+; GFX11-NEXT: v_add_nc_u16 v18, v178, 3
+; GFX11-NEXT: v_add_nc_u16 v19, v177, 3
+; GFX11-NEXT: v_add_nc_u16 v20, v176, 3
+; GFX11-NEXT: v_add_nc_u16 v21, v167, 3
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v62, v13
+; GFX11-NEXT: v_or_b32_e32 v14, v63, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v72, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v73, v16
+; GFX11-NEXT: v_or_b32_e32 v17, v45, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v46, v18
+; GFX11-NEXT: v_or_b32_e32 v19, v47, v19
+; GFX11-NEXT: v_or_b32_e32 v20, v56, v20
+;
GFX11-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v133, 3 +; GFX11-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: 
v_add_nc_u16 v22, v112, 3 +; GFX11-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-NEXT: v_or_b32_e32 v26, v148, v26 +; GFX11-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-NEXT: v_add_nc_u16 v36, v64, 3 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-NEXT: v_or_b32_e32 v34, v101, v34 +; 
GFX11-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-NEXT: .LBB7_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:392 +; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:396 +; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:400 +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:404 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:408 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:412 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:416 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:420 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:424 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:428 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:432 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:436 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:440 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:444 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:448 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:452 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:456 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:460 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:464 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:468 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:472 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:476 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:480 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:484 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:488 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:492 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:496 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:500 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:504 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:508 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:512 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:516 +; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:520 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:524 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:528 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:532 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:536 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:540 +; GFX11-NEXT: scratch_load_b32 v60, off, 
s32 offset:544
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:548
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:552
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:556
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:560
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:564
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:568
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:572
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:576
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:580
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:584
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:588
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <128 x i8> %a, splat (i8 3)
+ %a2 = bitcast <128 x i8> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <128 x i8> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define <64 x bfloat> @bitcast_v32i32_to_v64bf16(<32 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32i32_to_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ;
implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: 
v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 
4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr62 +; 
GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: .LBB8_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v62 +; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_store_dword 
v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; GCN-NEXT: .LBB8_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_mul_f32_e32 
v12, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, 
s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: 
v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> 
%a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <32 x i32> @bitcast_v64bf16_to_v32i32(<64 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v64bf16_to_v32i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 
v46, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 +; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 +; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 +; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 +; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 +; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 +; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 +; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 +; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; 
implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; kill: killed $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: .LBB9_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN-NEXT: s_cbranch_execz .LBB9_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: buffer_load_dword 
v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: 
v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 +; GCN-NEXT: 
v_add_f32_e32 v42, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 +; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 +; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 +; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 +; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 +; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 +; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 +; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 +; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 +; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 +; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 +; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 +; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 +; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 +; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 +; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 +; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 +; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 +; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 +; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 +; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 +; GCN-NEXT: .LBB9_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 
16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 
0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 
v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: 
v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, 
v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; 
VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: 
v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: 
v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc 
+; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; 
GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; 
GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; 
GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 
v37, v33, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v36, 
0x400000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_bfe_u32 v38, v35, 
16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-NEXT: v_or_b32_e32 
v38, 0x400000, v24 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-NEXT: v_add_f32_e32 
v19, 0x40c00000, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 
x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i32_to_v64f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: 
$vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v63 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, 
v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v62 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v63 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v49 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v50 +; GCN-NEXT: v_mov_b32_e32 v50, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: 
$vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: .LBB10_2: ; %Flow +; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v31, v33 +; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v29, 
v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50 +; GCN-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 +; GCN-NEXT: .LBB10_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v59, v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v56, v2, v1 +; GCN-NEXT: v_add_i32_e32 v61, vcc, 8, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v57, v2, v1 +; GCN-NEXT: v_add_i32_e32 v60, vcc, 12, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v47, v2, v1 +; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v46 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v45 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v44 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v41 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v40 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v54 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v53 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v51 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_lshlrev_b32_e32 
v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v49 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v20, v21, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v48 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_or_b32_e32 v28, v29, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v34 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_or_b32_e32 v30, v31, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v36 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_or_b32_e32 v36, v37, v36 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_or_b32_e32 v38, v39, v38 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v48, v49, v48 +; 
GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v50, v51, v50 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_or_b32_e32 v52, v53, v52 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_or_b32_e32 v54, v55, v54 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_or_b32_e32 v40, v41, v40 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_or_b32_e32 v42, v43, v42 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v59, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v47, v46, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; GCN-NEXT: 
buffer_store_dword v38, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: 
v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v31, 3, v31 +; GFX9-NEXT: v_add_u32_e32 v30, 3, v30 +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i32_to_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31 +; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i32> %a, splat (i32 3) + %a2 = bitcast <32 x i32> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i32> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <32 x i32> @bitcast_v64f16_to_v32i32(<64 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v64f16_to_v32i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v49, off, 
s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 +; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 +; GCN-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; GCN-NEXT: v_or_b32_e32 v0, v62, v0 +; GCN-NEXT: v_or_b32_e32 v1, v60, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; GCN-NEXT: v_or_b32_e32 v2, v58, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; GCN-NEXT: v_or_b32_e32 v3, v56, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; GCN-NEXT: v_or_b32_e32 v4, v46, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: v_or_b32_e32 v19, v43, v19 +; GCN-NEXT: v_or_b32_e32 v20, v41, v20 +; GCN-NEXT: v_or_b32_e32 v21, v55, v21 +; GCN-NEXT: v_or_b32_e32 v22, v49, v22 +; GCN-NEXT: v_or_b32_e32 v23, v50, v23 +; GCN-NEXT: v_or_b32_e32 v24, v39, v24 +; GCN-NEXT: v_or_b32_e32 v25, v36, v25 +; GCN-NEXT: v_or_b32_e32 v26, v48, v26 +; GCN-NEXT: v_or_b32_e32 v27, v52, v27 +; GCN-NEXT: v_or_b32_e32 v28, v53, v28 +; GCN-NEXT: v_or_b32_e32 v29, v54, v29 +; GCN-NEXT: v_or_b32_e32 v30, v40, v30 +; GCN-NEXT: v_or_b32_e32 v31, v42, v31 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 
+; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: .LBB11_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 
4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_add_f32_e32 v13, 
0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 +; GCN-NEXT: v_mov_b32_e32 v39, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: 
v_lshlrev_b32_e32 v39, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_or_b32_e32 v19, v21, v20 +; GCN-NEXT: v_or_b32_e32 v20, v55, v39 +; GCN-NEXT: v_or_b32_e32 v21, v41, v48 +; GCN-NEXT: v_or_b32_e32 v22, v22, v49 +; GCN-NEXT: v_or_b32_e32 v23, v23, v50 +; GCN-NEXT: v_or_b32_e32 v24, v24, v51 +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: v_or_b32_e32 v26, v26, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v38 +; GCN-NEXT: v_or_b32_e32 v28, v28, v33 +; GCN-NEXT: v_or_b32_e32 v29, v29, v34 +; GCN-NEXT: v_or_b32_e32 v30, v30, v35 +; GCN-NEXT: v_or_b32_e32 v31, v31, v37 +; GCN-NEXT: .LBB11_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v32, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, 
v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, 
v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] 
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 
op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB11_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <64 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <64 x half> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <64 x half> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define <64 x i16> @bitcast_v32i32_to_v64i16(<32 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32i32_to_v64i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ;
kill: killed $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; kill: killed $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 +; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB12_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, 
v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 +; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB12_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 +; GCN-NEXT: v_or_b32_e32 v1, v1, v44 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v33 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; GCN-NEXT: v_or_b32_e32 v59, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; GCN-NEXT: v_or_b32_e32 v57, v1, v2 +; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; GCN-NEXT: v_or_b32_e32 v63, v1, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; GCN-NEXT: v_or_b32_e32 v46, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; GCN-NEXT: v_or_b32_e32 v61, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v6, 
0xffff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v55 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; GCN-NEXT: v_or_b32_e32 v14, v14, v33 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v15, v15, v50 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v47 +; GCN-NEXT: v_or_b32_e32 v16, v16, v52 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v17, v17, v48 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v45 +; GCN-NEXT: v_or_b32_e32 v18, v18, v55 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_or_b32_e32 v19, v19, v39 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v20, v20, v34 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 +; GCN-NEXT: v_or_b32_e32 v21, v21, v34 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 +; GCN-NEXT: v_or_b32_e32 v22, v22, v34 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GCN-NEXT: v_or_b32_e32 v23, v23, v34 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v40 +; GCN-NEXT: v_or_b32_e32 v24, v24, v34 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 +; GCN-NEXT: v_or_b32_e32 v25, v25, v34 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; GCN-NEXT: v_or_b32_e32 v26, v26, v34 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 +; GCN-NEXT: v_or_b32_e32 v27, v27, v34 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; GCN-NEXT: v_or_b32_e32 v28, v28, v34 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v29, v29, v34 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 +; GCN-NEXT: 
v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_or_b32_e32 v30, v30, v51 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x70, v0 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_or_b32_e32 v32, v32, v43 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_or_b32_e32 v31, v31, v49 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v29, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i32_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: .LBB12_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i32_to_v64i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: 
v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v31, 3, v31
+; GFX9-NEXT: v_add_u32_e32 v30, 3, v30
+; GFX9-NEXT: v_add_u32_e32 v29, 3, v29
+; GFX9-NEXT: v_add_u32_e32 v28, 3, v28
+; GFX9-NEXT: v_add_u32_e32 v27, 3, v27
+; GFX9-NEXT: v_add_u32_e32 v26, 3, v26
+; GFX9-NEXT: v_add_u32_e32 v25, 3, v25
+; GFX9-NEXT: v_add_u32_e32 v24, 3, v24
+; GFX9-NEXT: v_add_u32_e32 v23, 3, v23
+; GFX9-NEXT: v_add_u32_e32 v22, 3, v22
+; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
+; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
+; GFX9-NEXT: v_add_u32_e32 v19, 3, v19
+; GFX9-NEXT: v_add_u32_e32 v18, 3, v18
+; GFX9-NEXT: v_add_u32_e32 v17, 3, v17
+; GFX9-NEXT: v_add_u32_e32 v16, 3, v16
+; GFX9-NEXT: .LBB12_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32i32_to_v64i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB12_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v31, 3, v31
+; GFX11-NEXT: v_add_nc_u32_e32 v30, 3, v30
+; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29
+; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28
+; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27
+; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26
+; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25
+; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-NEXT: .LBB12_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <64 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <64 x i16>
+ br label %end
+
+end:
+ %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x i16> %phi
+}
+
+define <32 x i32> @bitcast_v64i16_to_v32i32(<64 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64i16_to_v32i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword
v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v37, v20 +; GCN-NEXT: v_mov_b32_e32 v38, v18 +; GCN-NEXT: v_mov_b32_e32 v39, v16 +; GCN-NEXT: v_mov_b32_e32 v48, v14 +; GCN-NEXT: v_mov_b32_e32 v49, v12 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v51, v8 +; GCN-NEXT: v_mov_b32_e32 v52, v6 +; GCN-NEXT: v_mov_b32_e32 v53, v4 +; GCN-NEXT: v_mov_b32_e32 v54, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v22, 
off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GCN-NEXT: v_or_b32_e32 v0, v0, v36 +; GCN-NEXT: v_or_b32_e32 v1, v1, v58 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; GCN-NEXT: v_or_b32_e32 v2, v2, v57 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; GCN-NEXT: v_or_b32_e32 v3, v3, v35 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; GCN-NEXT: v_or_b32_e32 v4, v4, v60 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; GCN-NEXT: buffer_load_dword 
v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v18, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v19, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, 
v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v21, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v23, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v26, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v28, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v29, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v30, v32 +; GCN-NEXT: v_or_b32_e32 v31, v31, v59 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; 
GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: .LBB13_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v36, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v58, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v57, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v35, v3 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v18, 
vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v4, v60, v4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v32, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v32, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v32, v15 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v32, v16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v32, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v32, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v32, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v32, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v32, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v32, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v32, v30 +; GCN-NEXT: v_or_b32_e32 v31, v59, v31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; 
GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; GCN-NEXT: .LBB13_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v32i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v33, 3 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_add_u16_e32 v32, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v32, v14 +; VI-NEXT: v_add_u16_e32 v32, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v32, v13 +; VI-NEXT: v_add_u16_e32 v32, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v32, v12 +; VI-NEXT: v_add_u16_e32 v32, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v32, v11 +; VI-NEXT: v_add_u16_e32 
v32, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v32, v10 +; VI-NEXT: v_add_u16_e32 v32, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v32, v9 +; VI-NEXT: v_add_u16_e32 v32, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v32, v8 +; VI-NEXT: v_add_u16_e32 v32, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v32, v7 +; VI-NEXT: v_add_u16_e32 v32, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v32, v6 +; VI-NEXT: v_add_u16_e32 v32, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v32, v5 +; VI-NEXT: v_add_u16_e32 v32, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v32, v4 +; VI-NEXT: v_add_u16_e32 v32, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v32, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_add_u16_e32 v32, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v32, v1 +; VI-NEXT: v_add_u16_e32 v32, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v32, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v31 +; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: v_add_u16_e32 v32, 3, v30 +; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: v_add_u16_e32 v32, 3, v29 +; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_add_u16_sdwa v24, v24, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa 
v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v32i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, 
v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i16_to_v32i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x i32> + br label %end + +end: + %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i32> %phi +} + +define <16 x i64> @bitcast_v32f32_to_v16i64(<32 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f32_to_v16i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: 
v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: 
v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <32 x float> @bitcast_v16i64_to_v32f32(<16 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i64_to_v32f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 
0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: 
.LBB15_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <16 x double> @bitcast_v32f32_to_v16f64(<32 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f32_to_v16f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, 
v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <32 x float> @bitcast_v16f64_to_v32f32(<16 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f64_to_v32f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f64_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: 
v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f32_to_v128i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; 
implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: 
killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 
v31, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) 
+; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GCN-NEXT: buffer_store_dword v31, 
off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; GCN-NEXT: .LBB18_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, 
v7 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_f32_e32 v58, 1.0, v58 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v59, 1.0, v59 +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v58, v59, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_alignbit_b32 v31, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 +; GCN-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v58 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v58 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; GCN-NEXT: .LBB18_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GCN-NEXT: v_or_b32_e32 v1, v1, v52 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_or_b32_e32 v2, v2, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v31, v50, v31 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v50, v51, v50 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v31 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v50 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 8, v49 +; GCN-NEXT: v_or_b32_e32 v31, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; GCN-NEXT: v_or_b32_e32 v49, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; GCN-NEXT: v_or_b32_e32 v2, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 +; GCN-NEXT: v_or_b32_e32 v61, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 +; GCN-NEXT: v_or_b32_e32 v62, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v57 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 +; GCN-NEXT: v_or_b32_e32 v7, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GCN-NEXT: v_or_b32_e32 v8, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 +; GCN-NEXT: v_or_b32_e32 v9, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 +; GCN-NEXT: v_or_b32_e32 v10, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 +; GCN-NEXT: v_or_b32_e32 v11, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 +; GCN-NEXT: v_or_b32_e32 v12, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GCN-NEXT: v_or_b32_e32 v13, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; GCN-NEXT: v_or_b32_e32 v14, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; GCN-NEXT: v_or_b32_e32 v15, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 +; GCN-NEXT: v_or_b32_e32 v16, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v17, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 +; GCN-NEXT: v_or_b32_e32 v18, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v19, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 +; GCN-NEXT: v_or_b32_e32 v20, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v21, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GCN-NEXT: v_or_b32_e32 v22, v1, v3 +; 
GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v23, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GCN-NEXT: v_or_b32_e32 v24, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v25, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; GCN-NEXT: v_or_b32_e32 v26, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v27, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 +; GCN-NEXT: v_or_b32_e32 v28, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v29, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v32, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v30, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v33, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v34, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v35, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v36, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v37, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v38, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v39, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v48, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v50, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v51, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v52, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v53, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v54, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v55, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v40, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v41, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v42, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v43, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v44, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v45, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v46, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 
v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v47, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v56, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v57, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v58, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v59, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v60, v3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v4, v1, v29 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GCN-NEXT: v_or_b32_e32 v5, v1, v32 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v63, v2, v30 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 +; GCN-NEXT: v_or_b32_e32 v61, v3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 +; GCN-NEXT: v_or_b32_e32 v6, v6, v34 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v35 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v29, v29, v36 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 
offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v30, v30, v37 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v38 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v7, v39 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v8, v8, v48 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v9, v9, v50 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v51 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v11, v52 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v12, v12, v53 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v13, v13, v54 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v14, v14, v55 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v15, v15, v40 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v16, v16, v41 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v17, v17, v42 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GCN-NEXT: v_or_b32_e32 v18, v18, v43 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v19, v19, v44 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GCN-NEXT: v_or_b32_e32 v20, v20, v45 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GCN-NEXT: v_or_b32_e32 v21, v21, v46 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GCN-NEXT: v_or_b32_e32 v22, v22, v47 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GCN-NEXT: v_or_b32_e32 v23, v23, v56 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; GCN-NEXT: v_or_b32_e32 v24, v24, v57 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GCN-NEXT: v_or_b32_e32 v25, v25, v58 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v26, v26, v59 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GCN-NEXT: v_or_b32_e32 v27, v27, v60 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword 
v5, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v128i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; 
implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 
4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded 
Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: .LBB18_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], 
s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 
16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: .LBB18_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v57 +; VI-NEXT: v_or_b32_sdwa v2, v2, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v41 +; VI-NEXT: v_or_b32_sdwa v48, v53, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: 
v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa 
v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa 
v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword 
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32f32_to_v128i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; kill: killed $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; kill: killed $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; kill: killed $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; kill: killed $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB18_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(31)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
+; GFX9-NEXT: .LBB18_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB18_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: s_waitcnt vmcnt(29)
+; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30
+; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
+; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
+; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26
+; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
+; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24
+; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
+; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22
+; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20
+; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18
+; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
+; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16
+; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
+; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
+; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
+; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1
+; GFX9-NEXT: .LBB18_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v52
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v36
+; GFX9-NEXT: v_or_b32_sdwa v12, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v50
+; GFX9-NEXT: v_or_b32_sdwa v13, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v15, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v57
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v56
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v39
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v47
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v46
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v45
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v55
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v54
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX9-NEXT: v_or_b32_sdwa v9, v9, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v41
+; GFX9-NEXT: v_or_b32_sdwa v33, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v128i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-NEXT: 
s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr179 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; 
implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v162, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-NEXT: .LBB18_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v20, 1.0, v20 :: v_dual_add_f32 v19, 1.0, v19 +; GFX11-NEXT: v_dual_add_f32 v18, 1.0, v18 :: v_dual_add_f32 v17, 1.0, v17 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v24, 1.0, v24 :: v_dual_add_f32 v31, 1.0, v31 +; GFX11-NEXT: v_dual_add_f32 v32, 1.0, v32 :: v_dual_add_f32 v23, 1.0, v23 +; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v15, 1.0, v15 +; GFX11-NEXT: v_dual_add_f32 v26, 1.0, v26 :: v_dual_add_f32 v29, 1.0, v29 +; GFX11-NEXT: v_dual_add_f32 v30, 1.0, v30 :: v_dual_add_f32 v25, 1.0, v25 +; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13 +; GFX11-NEXT: v_dual_add_f32 v28, 1.0, v28 :: v_dual_add_f32 v27, 1.0, v27 +; GFX11-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11 +; GFX11-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-NEXT: v_dual_add_f32 v22, 1.0, v22 :: v_dual_add_f32 v21, 1.0, v21 +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-NEXT: 
v_lshrrev_b64 v[64:65], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 
v57, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-NEXT: .LBB18_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v63 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v65 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v74 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v61 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-NEXT: v_or_b32_e32 v65, v67, v65 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v58 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v59 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v65 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v56 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v65 +; GFX11-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v47 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v46 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v45 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v64 +; GFX11-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v44 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v43 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v42 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v41 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v40 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v55 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_or_b32_e32 v54, v64, v65 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v182 +; GFX11-NEXT: 
v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v181 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v55 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v65 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v178 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v167 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v166 +; GFX11-NEXT: v_or_b32_e32 v53, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v55 +; GFX11-NEXT: v_or_b32_e32 v52, v64, v52 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v65 +; GFX11-NEXT: v_or_b32_e32 v54, v66, v67 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v10, v53 +; GFX11-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-NEXT: v_or_b32_e32 v4, v12, v54 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v6, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v160 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v150 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v147 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v39, v49 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 
+; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v145 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v144 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v132 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v130 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v119 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v117 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v103 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v102 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v101 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v100 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v99 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: 
v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v83 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v82 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v81 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b16 v28, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v68 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; 
GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v128i8_to_v32f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, 
off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded 
Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 
4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; 
GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB19_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NEXT: v_or_b32_e32 v0, v0, v39
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v1, v56
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v2
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: v_or_b32_e32 v2, v2, v38
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v47
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT: v_or_b32_e32 v4, v4, v54
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v5, v6, v5
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_or_b32_e32 v4, v4, v5
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT: v_or_b32_e32 v5, v5, v46
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v6, v7, v6
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_or_b32_e32 v5, v5, v6
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v37
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v7, v8, v7
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GCN-NEXT: v_or_b32_e32 v7, v7, v43
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v8, v9, v8
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_or_b32_e32 v7, v7, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48
+; GCN-NEXT: v_or_b32_e32 v8, v8, v45
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v9, v10, v9
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_or_b32_e32 v8, v8, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GCN-NEXT: v_or_b32_e32 v9, v9, v53
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v10, v11, v10
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_or_b32_e32 v9, v9, v10
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GCN-NEXT: v_or_b32_e32 v10, v10, v42
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v11, v12, v11
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_or_b32_e32 v10, v10, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: v_or_b32_e32 v11, v11, v41
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v12, v13, v12
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_or_b32_e32 v11, v11, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: v_or_b32_e32 v12, v12, v40
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v13, v14, v13
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_or_b32_e32 v12, v12, v13
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: v_or_b32_e32 v13, v13, v63
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v14, v15, v14
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_or_b32_e32 v13, v13, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GCN-NEXT: v_or_b32_e32 v14, v14, v50
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v16, v15
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v14, v14, v15
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v15, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v17, v16
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_or_b32_e32 v15, v15, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v16, v17
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v18, v17
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_or_b32_e32 v16, v16, v17
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v17, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v18, v19, v18
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_or_b32_e32 v17, v17, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT: v_or_b32_e32 v18, v18, v51
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GCN-NEXT: v_or_b32_e32 v19, v19, v49
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GCN-NEXT: v_or_b32_e32 v20, v20, v60
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GCN-NEXT: v_or_b32_e32 v21, v21, v58
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v22, v22, v62
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_or_b32_e32 v23, v23, v59
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GCN-NEXT: v_or_b32_e32 v24, v24, v32
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GCN-NEXT: v_or_b32_e32 v25, v25, v61
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GCN-NEXT: v_or_b32_e32 v26, v26, v34
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GCN-NEXT: v_or_b32_e32 v27, v27, v52
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GCN-NEXT: v_or_b32_e32 v28, v28, v35
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29
+; GCN-NEXT: v_or_b32_e32 v29, v29, v33
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GCN-NEXT: v_or_b32_e32 v30, v30, v44
+; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57
+; GCN-NEXT: v_or_b32_e32 v31, v31, v36
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v50, v33, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v51, v33, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v52, v33, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v53, v33, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v32, v33, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v33, v34, v33
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v34, v35, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v35, v36, v35
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v36, v37, v36
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v37, v38, v37
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v38, v39, v38
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v39, v48, v39
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GCN-NEXT: v_or_b32_e32 v48, v55, v48
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v49, v54, v49
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GCN-NEXT: v_or_b32_e32 v18, v18, v50
+; GCN-NEXT: v_or_b32_e32 v19, v19, v51
+; GCN-NEXT: v_or_b32_e32 v20, v20, v52
+; GCN-NEXT: v_or_b32_e32 v21, v21, v53
+; GCN-NEXT: v_or_b32_e32 v22, v22, v32
+; GCN-NEXT: v_or_b32_e32 v23, v23, v33
+; GCN-NEXT: v_or_b32_e32 v24, v24, v34
+; GCN-NEXT: v_or_b32_e32 v25, v25, v35
+; GCN-NEXT: v_or_b32_e32 v26, v26, v36
+; GCN-NEXT: v_or_b32_e32 v27, v27, v37
+; GCN-NEXT: v_or_b32_e32 v28, v28, v38
+; GCN-NEXT: v_or_b32_e32 v29, v29, v39
+; GCN-NEXT: v_or_b32_e32 v30, v30, v48
+; GCN-NEXT: v_or_b32_e32 v31, v31, v49
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; kill: killed $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; kill: killed $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: .LBB19_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB19_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NEXT: v_or_b32_e32 v0, v39, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v56, v1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: v_or_b32_e32 v2, v38, v2
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: v_or_b32_e32 v3, v47, v3
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT: v_or_b32_e32 v4, v54, v4
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT: v_or_b32_e32 v5, v46, v5
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: v_or_b32_e32 v6, v37, v6
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GCN-NEXT: v_or_b32_e32 v7, v43, v7
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GCN-NEXT: v_or_b32_e32 v8, v45, v8
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GCN-NEXT: v_or_b32_e32 v9, v53, v9
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GCN-NEXT: v_or_b32_e32 v10, v42, v10
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: v_or_b32_e32 v11, v41, v11
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: v_or_b32_e32 v12, v40, v12
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: v_or_b32_e32 v13, v63, v13
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GCN-NEXT: v_or_b32_e32 v14, v50, v14
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v0, v15
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v0, v16
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v0, v17
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT: v_or_b32_e32 v18, v51, v18
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GCN-NEXT: v_or_b32_e32 v19, v49, v19
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GCN-NEXT: v_or_b32_e32 v20, v60, v20
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GCN-NEXT: v_or_b32_e32 v21, v58, v21
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v25, v62, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v29, v59, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v37, v32, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v50, v61, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v41, v34, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v45, v52, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v56, v35, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v58, v33, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v59, v44, v22
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_or_b32_e32 v57, v36, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v60, v0, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v61, v0, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v62, v0, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v63, v23, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v36, v23, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v38, v23, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v39, v23, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v49, v0, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v51, v23, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v52, v23, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v54, v23, v22
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v22, v23, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v23, v24, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v24, v26, v24
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v26, v27, v26
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27
+; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v27, v28, v27
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v28, v30, v28
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30
+; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v30, v31, v30
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v31, v32, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v32, v33, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v33, v34, v33
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34
+; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v34, v35, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35
+; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v35, v48, v35
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48
+; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v48, v53, v48
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: v_mov_b32_e32 v0, v55
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v53, v55, v53
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55
+; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v55, v40, v55
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40
+; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v40, v42, v40
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42
+; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v42, v43, v42
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43
+; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v43, v44, v43
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44
+; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v44, v46, v44
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46
+; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46
+; GCN-NEXT: v_or_b32_e32 v46, v0, v46
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47
+; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v47, v0, v47
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v0, v60, v0
+; GCN-NEXT: s_movk_i32 s7, 0x300
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v61, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v2, v62, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_or_b32_e32 v3, v63, v3
+; GCN-NEXT: s_mov_b32 s6, 0x3000000
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13
+; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15
+; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17
+; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18
+; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19
+; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20
+; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21
+; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25
+; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29
+; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37
+; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50
+; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41
+; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45
+; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56
+; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58
+; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59
+; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37
+; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41
+; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45
+; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56
+; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58
+; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57
+; GCN-NEXT: v_or_b32_e32 v4, v36, v4
+; GCN-NEXT: v_or_b32_e32 v5, v38, v5
+; GCN-NEXT: v_or_b32_e32 v6, v39, v6
+; GCN-NEXT: v_or_b32_e32 v7, v49, v7
+; GCN-NEXT: v_or_b32_e32 v8, v51, v8
+; GCN-NEXT: v_or_b32_e32 v9, v52, v9
+; GCN-NEXT: v_or_b32_e32 v10, v54, v10
+; GCN-NEXT: v_or_b32_e32 v11, v22, v11
+; GCN-NEXT: v_or_b32_e32 v12, v23, v12
+; GCN-NEXT: v_or_b32_e32 v13, v24, v13
+; GCN-NEXT: v_or_b32_e32 v14, v26, v14
+; GCN-NEXT: v_or_b32_e32 v15, v27, v15
+; GCN-NEXT: v_or_b32_e32 v16, v28, v16
+; GCN-NEXT: v_or_b32_e32 v17, v30, v17
+; GCN-NEXT: v_or_b32_e32 v18, v31, v18
+; GCN-NEXT: v_or_b32_e32 v19, v32, v19
+; GCN-NEXT: v_or_b32_e32 v20, v33, v20
+; GCN-NEXT: v_or_b32_e32 v21, v34, v21
+; GCN-NEXT: v_or_b32_e32 v22, v35, v25
+; GCN-NEXT: v_or_b32_e32 v23, v48, v29
+; GCN-NEXT: v_or_b32_e32 v24, v53, v37
+; GCN-NEXT: v_or_b32_e32 v25, v55, v50
+; GCN-NEXT: v_or_b32_e32 v26, v40, v41
+; GCN-NEXT: v_or_b32_e32 v27, v42, v45
+; GCN-NEXT: v_or_b32_e32 v28, v43, v56
+; GCN-NEXT: v_or_b32_e32 v29, v44, v58
+; GCN-NEXT: v_or_b32_e32 v30, v46, v59
+; GCN-NEXT: v_or_b32_e32 v31, v47, v57
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13
+; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15
+; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17
+; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18
+; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19
+; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20
+; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21
+; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22
+; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23
+; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24
+; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25
+; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26
+; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27
+; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28
+; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31
+; GCN-NEXT: .LBB19_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v128i8_to_v32f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
+; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3
+; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5
+; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9
+; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13
+; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v15
+; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v17
+; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v19
+; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v23
+; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v27
+; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v29
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; VI-NEXT: s_waitcnt
vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) +; 
VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: 
$vgpr51 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt 
vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: 
killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: 
killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v31, 0x300 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v9, 3, v61 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v63 +; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v59 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 3, v62 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 +; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v58 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v60 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 +; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v46 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v45 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v44 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v54 +; VI-NEXT: v_add_u16_e32 v15, 3, v40 +; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v14, v15 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; 
VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v16, v16, v17 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v18, 3, v18 +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v18 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v18, 3, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v18, 0x300, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v19, 0x300, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword 
v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v20, 0x300, v20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v23, 3, v23 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v23, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v25, 3, v25 +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v25, 3, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v26, 3, v26 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v26, 3, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v27, 3, v27 +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v27, 3, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v28, 3, v28 +; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v28, 3, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v29, 3, v29 +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v29 +; VI-NEXT: buffer_load_dword v29, 
off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v29, 3, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v30, 3, v30 +; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v30, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v32 +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v32, 3, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v33, 3, v33 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; 
GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: 
s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 
8, v57 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 
offset:280 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa 
v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: 
$vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: 
killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; 
GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; 
GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: 
v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v20, v20, 
s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX9-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 
offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v28, v28, v29 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v29, 3, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v29, v29, v30 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v30, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v30, v30, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v32, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v31, v31, v32 +; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v128i8_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:592 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:588 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:584 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:580 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:576 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:572 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:568 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:564 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:560 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:556 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:496 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:452 
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:448 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:348 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-NEXT: scratch_load_u16 v100, 
off, s32 offset:240 +; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:88 +; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-NEXT: scratch_load_u16 v138, off, s32 offset:208 +; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v92, off, s32 
offset:4 +; GFX11-NEXT: v_lshlrev_b16 v124, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(54) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-NEXT: s_waitcnt vmcnt(53) +; GFX11-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-NEXT: s_waitcnt vmcnt(52) +; GFX11-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-NEXT: s_waitcnt vmcnt(51) +; GFX11-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-NEXT: s_waitcnt vmcnt(50) +; GFX11-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-NEXT: s_waitcnt vmcnt(49) +; GFX11-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-NEXT: s_waitcnt vmcnt(48) +; GFX11-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-NEXT: s_waitcnt vmcnt(47) +; GFX11-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-NEXT: s_waitcnt vmcnt(46) +; GFX11-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-NEXT: s_waitcnt vmcnt(45) +; GFX11-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-NEXT: s_waitcnt vmcnt(44) +; GFX11-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-NEXT: s_waitcnt vmcnt(43) +; GFX11-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-NEXT: s_waitcnt vmcnt(42) +; GFX11-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-NEXT: s_waitcnt vmcnt(41) +; GFX11-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-NEXT: s_waitcnt vmcnt(40) +; GFX11-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-NEXT: s_waitcnt vmcnt(39) +; GFX11-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-NEXT: s_waitcnt vmcnt(38) +; GFX11-NEXT: v_lshlrev_b16 v46, 8, v163 +; GFX11-NEXT: s_waitcnt vmcnt(37) +; GFX11-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-NEXT: s_waitcnt vmcnt(36) +; GFX11-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-NEXT: s_waitcnt vmcnt(35) +; GFX11-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-NEXT: s_waitcnt vmcnt(34) +; GFX11-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-NEXT: s_waitcnt vmcnt(32) +; GFX11-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: v_lshlrev_b16 v163, 8, v137 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-NEXT: v_lshlrev_b16 
v99, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v125 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v111 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v38 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v12 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v75 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v104 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-NEXT: v_or_b32_e32 v15, v15, 
v90 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v44 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v62 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v56 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v162 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v82 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v117 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 
+; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr92 +; GFX11-NEXT: ; implicit-def: $vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr77 +; GFX11-NEXT: ; implicit-def: $vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr124 +; GFX11-NEXT: ; implicit-def: $vgpr125 +; GFX11-NEXT: ; implicit-def: $vgpr126 +; GFX11-NEXT: ; implicit-def: $vgpr127 +; GFX11-NEXT: ; implicit-def: $vgpr111 +; GFX11-NEXT: ; implicit-def: $vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr121 +; GFX11-NEXT: ; implicit-def: $vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr108 +; GFX11-NEXT: ; implicit-def: $vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr110 +; GFX11-NEXT: ; implicit-def: $vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr94 +; GFX11-NEXT: ; implicit-def: $vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr104 +; GFX11-NEXT: ; implicit-def: $vgpr105 +; GFX11-NEXT: ; implicit-def: $vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr89 +; GFX11-NEXT: ; implicit-def: $vgpr90 +; GFX11-NEXT: ; implicit-def: $vgpr91 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; 
GFX11-NEXT: ; implicit-def: $vgpr56
+; GFX11-NEXT: ; implicit-def: $vgpr57
+; GFX11-NEXT: ; implicit-def: $vgpr179
+; GFX11-NEXT: ; implicit-def: $vgpr180
+; GFX11-NEXT: ; implicit-def: $vgpr181
+; GFX11-NEXT: ; implicit-def: $vgpr182
+; GFX11-NEXT: ; implicit-def: $vgpr183
+; GFX11-NEXT: ; implicit-def: $vgpr162
+; GFX11-NEXT: ; implicit-def: $vgpr163
+; GFX11-NEXT: ; implicit-def: $vgpr164
+; GFX11-NEXT: ; implicit-def: $vgpr165
+; GFX11-NEXT: ; implicit-def: $vgpr166
+; GFX11-NEXT: ; implicit-def: $vgpr144
+; GFX11-NEXT: ; implicit-def: $vgpr145
+; GFX11-NEXT: ; implicit-def: $vgpr146
+; GFX11-NEXT: ; implicit-def: $vgpr147
+; GFX11-NEXT: ; implicit-def: $vgpr148
+; GFX11-NEXT: ; implicit-def: $vgpr119
+; GFX11-NEXT: ; implicit-def: $vgpr128
+; GFX11-NEXT: ; implicit-def: $vgpr129
+; GFX11-NEXT: ; implicit-def: $vgpr130
+; GFX11-NEXT: ; implicit-def: $vgpr131
+; GFX11-NEXT: ; implicit-def: $vgpr114
+; GFX11-NEXT: ; implicit-def: $vgpr115
+; GFX11-NEXT: ; implicit-def: $vgpr116
+; GFX11-NEXT: ; implicit-def: $vgpr117
+; GFX11-NEXT: ; implicit-def: $vgpr118
+; GFX11-NEXT: ; implicit-def: $vgpr99
+; GFX11-NEXT: ; implicit-def: $vgpr100
+; GFX11-NEXT: ; implicit-def: $vgpr101
+; GFX11-NEXT: ; implicit-def: $vgpr102
+; GFX11-NEXT: ; implicit-def: $vgpr103
+; GFX11-NEXT: .LBB19_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB19_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u16 v0, v55, 3
+; GFX11-NEXT: v_add_nc_u16 v1, v54, 3
+; GFX11-NEXT: v_add_nc_u16 v2, v53, 3
+; GFX11-NEXT: v_add_nc_u16 v3, v52, 3
+; GFX11-NEXT: v_add_nc_u16 v4, v51, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u16 v5, v50, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v124, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v125, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v126, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v127, v3
+; GFX11-NEXT: v_add_nc_u16 v6, v49, 3
+; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v7, v48, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_add_nc_u16 v8, v37, 3
+; GFX11-NEXT: v_add_nc_u16 v9, v36, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u16 v10, v35, 3
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_add_nc_u16 v2, v39, 3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v11, v34, 3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v123, v2
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v111, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v120, v4
+; GFX11-NEXT: v_or_b32_e32 v5, v121, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v122, v6
+; GFX11-NEXT: v_or_b32_e32 v8, v107, v8
+; GFX11-NEXT: v_or_b32_e32 v9, v108, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v109, v10
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-NEXT: v_add_nc_u16 v2, v38, 3
+; GFX11-NEXT: v_or_b32_e32 v11, v110, v11
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4
+; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v106, v2
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v4, v7, v12
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_add_nc_u16 v7, v33, 3
+; GFX11-NEXT: v_add_nc_u16 v8, v32, 3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u16 v9, v92, 3
+; GFX11-NEXT: v_add_nc_u16 v10, v78, 3
+; GFX11-NEXT: v_add_nc_u16 v11, v77, 3
+; GFX11-NEXT: v_add_nc_u16 v12, v76, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v75, 3
+; GFX11-NEXT: v_add_nc_u16 v14, v74, 3
+; GFX11-NEXT: v_add_nc_u16 v15, v60, 3
+; GFX11-NEXT: v_add_nc_u16 v16, v59, 3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v93, v7
+; GFX11-NEXT: v_or_b32_e32 v8, v94, v8
+; GFX11-NEXT: v_or_b32_e32 v9, v95, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v104, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v105, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v79, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v88, v13
+; GFX11-NEXT: v_or_b32_e32 v14, v89, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v90, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v91, v16
+; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_add_nc_u16 v12, v58, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v44, 3
+; GFX11-NEXT: v_add_nc_u16 v14, v43, 3
+; GFX11-NEXT: v_add_nc_u16 v15, v42, 3
+; GFX11-NEXT: v_add_nc_u16 v16, v41, 3
+; GFX11-NEXT: v_add_nc_u16 v17, v40, 3
+; GFX11-NEXT: v_add_nc_u16 v18, v178, 3
+; GFX11-NEXT: v_add_nc_u16 v19, v177, 3
+; GFX11-NEXT: v_add_nc_u16 v20, v176, 3
+; GFX11-NEXT: v_add_nc_u16 v21, v167, 3
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v62, v13
+; GFX11-NEXT: v_or_b32_e32 v14, v63, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v72, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v73, v16
+; GFX11-NEXT: v_or_b32_e32 v17, v45, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v46, v18
+; GFX11-NEXT: v_or_b32_e32 v19, v47, v19
+; GFX11-NEXT: v_or_b32_e32 v20, v56, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v57, v21
+; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16
+; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19
+; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20
+; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v13, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v14, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v15, v18, v19
+; GFX11-NEXT: v_or_b32_e32 v16, v20, v21
+; GFX11-NEXT: v_add_nc_u16 v17, v161, 3
+; GFX11-NEXT: v_add_nc_u16 v18, v160, 3
+; GFX11-NEXT: v_add_nc_u16 v19, v151, 3
+; GFX11-NEXT: v_add_nc_u16 v20, v150, 3
+; GFX11-NEXT: v_add_nc_u16 v21, v149, 3
+; GFX11-NEXT: v_add_nc_u16 v22, v135, 3
+; GFX11-NEXT: v_add_nc_u16 v23, v134, 3
+; GFX11-NEXT: v_add_nc_u16 v24, v133, 3
+; GFX11-NEXT: v_add_nc_u16 v25, v132, 3
+; GFX11-NEXT: v_add_nc_u16 v26, v113, 3
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v180, v18
+; GFX11-NEXT: v_or_b32_e32 v19, v181, v19
+; GFX11-NEXT: v_or_b32_e32 v20, v182, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v183, v21
+; GFX11-NEXT: v_or_b32_e32 v22, v162, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v163, v23
+; GFX11-NEXT: v_or_b32_e32 v24, v164, v24
+; GFX11-NEXT: v_or_b32_e32 v25, v165, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v166, v26
+; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19
+; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20
+; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21
+; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24
+; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25
+; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v18
+; GFX11-NEXT: v_or_b32_e32 v18, v19, v20
+; GFX11-NEXT: v_or_b32_e32 v19, v21, v22
+; GFX11-NEXT: v_or_b32_e32 v20, v23, v24
+; GFX11-NEXT: v_or_b32_e32 v21, v25, v26
+; GFX11-NEXT: v_add_nc_u16 v22, v112, 3
+; GFX11-NEXT: v_add_nc_u16 v23, v98, 3
+; GFX11-NEXT: v_add_nc_u16 v24, v97, 3
+; GFX11-NEXT: v_add_nc_u16 v25, v96, 3
+; GFX11-NEXT: v_add_nc_u16 v26, v87, 3
+; GFX11-NEXT: v_add_nc_u16 v27, v86, 3
+; GFX11-NEXT: v_add_nc_u16 v28, v85, 3
+; GFX11-NEXT: v_add_nc_u16 v29, v84, 3
+; GFX11-NEXT: v_add_nc_u16 v30, v83, 3
+; GFX11-NEXT: v_add_nc_u16 v31, v82, 3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v145, v23
+; GFX11-NEXT: v_or_b32_e32 v24, v146, v24
+; GFX11-NEXT: v_or_b32_e32 v25, v147, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v148, v26
+; GFX11-NEXT: v_or_b32_e32 v27, v119, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v128, v28
+; GFX11-NEXT: v_or_b32_e32 v29, v129, v29
+; GFX11-NEXT: v_or_b32_e32 v30, v130, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v131, v31
+; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24
+; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25
+; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26
+; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29
+; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v23, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v24, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v25, v28, v29
+; GFX11-NEXT: v_or_b32_e32 v26, v30, v31
+; GFX11-NEXT: v_add_nc_u16 v27, v81, 3
+; GFX11-NEXT: v_add_nc_u16 v28, v80, 3
+; GFX11-NEXT: v_add_nc_u16 v29, v71, 3
+; GFX11-NEXT: v_add_nc_u16 v30, v70, 3
+; GFX11-NEXT: v_add_nc_u16 v31, v69, 3
+; GFX11-NEXT: v_add_nc_u16 v32, v68, 3
+; GFX11-NEXT: v_add_nc_u16 v33, v67, 3
+; GFX11-NEXT: v_add_nc_u16 v34, v66, 3
+; GFX11-NEXT: v_add_nc_u16 v35, v65, 3
+; GFX11-NEXT: v_add_nc_u16 v36, v64, 3
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GFX11-NEXT: v_or_b32_e32 v27, v114, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v115, v28
+; GFX11-NEXT: v_or_b32_e32 v29, v116, v29
+; GFX11-NEXT: v_or_b32_e32 v30, v117, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v118, v31
+; GFX11-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT: v_or_b32_e32 v33, v100, v33
+; GFX11-NEXT: v_or_b32_e32 v34, v101, v34
+; GFX11-NEXT: v_or_b32_e32 v35, v102, v35
+; GFX11-NEXT: v_or_b32_e32 v36, v103, v36
+; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29
+; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31
+; GFX11-NEXT: v_add_nc_u16 v32, 0x300, v32
+; GFX11-NEXT: v_add_nc_u16 v33, 0x300, v33
+; GFX11-NEXT: v_add_nc_u16 v34, 0x300, v34
+; GFX11-NEXT: v_add_nc_u16 v35, 0x300, v35
+; GFX11-NEXT: v_add_nc_u16 v36, 0x300, v36
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v28
+; GFX11-NEXT: v_or_b32_e32 v28, v29, v30
+; GFX11-NEXT: v_or_b32_e32 v29, v31, v32
+; GFX11-NEXT: v_or_b32_e32 v30, v33, v34
+; GFX11-NEXT: v_or_b32_e32 v31, v35, v36
+; GFX11-NEXT: .LBB19_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:444
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:476
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:480
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:484
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:488
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:492
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:496
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:500
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:504
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:508
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:512
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:516
+; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:520
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:524
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:528
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:532
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:536
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:540
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:544
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:548
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:552
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:556
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:560
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:564
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:568
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:572
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:576
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:580
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:584
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:588
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <128 x i8> %a, splat (i8 3)
+ %a2 = bitcast <128 x i8> %a1 to <32 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <128 x i8> %a to <32 x float>
+ br label %end
+
+end:
+ %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x float> %phi
+}
+
+define <64 x bfloat> @bitcast_v32f32_to_v64bf16(<32 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32f32_to_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB20_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7
+; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6
+; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5
+; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3
+; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2
+; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: .LBB20_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB20_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_add_f32_e32 v32, 1.0, v62
+; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7
+; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6
+; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5
+; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3
+; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2
+; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1
+; GCN-NEXT: .LBB20_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16
+; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38
+; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16
+; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48
+; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50
+; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16
+; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52
+; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32f32_to_v64bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB20_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f32_e32 v15, 1.0, v15
+; VI-NEXT: v_add_f32_e32 v14, 1.0, v14
+; VI-NEXT: v_add_f32_e32 v13, 1.0, v13
+; VI-NEXT: v_add_f32_e32 v12, 1.0, v12
+; VI-NEXT: v_add_f32_e32 v11, 1.0, v11
+; VI-NEXT: v_add_f32_e32 v10, 1.0, v10
+; VI-NEXT: v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT: v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT: v_add_f32_e32 v7, 1.0, v7
+; VI-NEXT: v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT: v_add_f32_e32 v5, 1.0, v5
+; VI-NEXT: v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v31, 1.0, v31
+; VI-NEXT: v_add_f32_e32 v30, 1.0, v30
+; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
+; VI-NEXT: v_add_f32_e32 v28, 1.0, v28
+; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
+; VI-NEXT: v_add_f32_e32 v26, 1.0, v26
+; VI-NEXT: v_add_f32_e32 v25, 1.0, v25
+; VI-NEXT: v_add_f32_e32 v24, 1.0, v24
+; VI-NEXT: v_add_f32_e32 v23, 1.0, v23
+; VI-NEXT: v_add_f32_e32 v22, 1.0, v22
+; VI-NEXT: v_add_f32_e32 v21, 1.0, v21
+; VI-NEXT: v_add_f32_e32 v20, 1.0, v20
+; VI-NEXT: v_add_f32_e32 v19, 1.0, v19
+; VI-NEXT: v_add_f32_e32 v18, 1.0, v18
+; VI-NEXT: v_add_f32_e32 v17, 1.0, v17
+; VI-NEXT: v_add_f32_e32 v16, 1.0, v16
+; VI-NEXT: .LBB20_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32f32_to_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB20_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31
+; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30
+; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
+; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
+; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
+; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26
+; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25
+; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24
+; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23
+; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22
+; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21
+; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20
+; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19
+; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18
+; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17
+; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16
+; GFX9-NEXT: .LBB20_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32f32_to_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30
+; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-NEXT: .LBB20_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <32 x float> %a1 to <64 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x float> %a to <64 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x bfloat> %phi
+}
+
+define <32 x float> @bitcast_v64bf16_to_v32f32(<64 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64bf16_to_v32f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0,
v52 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 +; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 
16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 +; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; GCN-NEXT: 
v_alignbit_b32 v22, v22, v48, 16 +; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 +; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 +; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 +; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 +; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 +; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: 
; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; kill: killed $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: .LBB21_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; 
GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 
0x40c00000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_and_b32_e32 v50, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 +; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 +; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 +; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 +; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 +; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 +; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 +; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 +; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 +; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 +; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 +; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 +; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 +; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 +; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 +; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 +; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 +; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 +; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 +; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 +; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 +; GCN-NEXT: .LBB21_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 
16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 
v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, 
v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; 
VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: 
v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, 
v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, 
v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, 
v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 
16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 
0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: 
s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v9, 
0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
+; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | 
instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, 
v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v36, v48, v18, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: 
v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f32_to_v64f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr45 +; 
GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB22_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v63 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GCN-NEXT: 
v_lshrrev_b32_e32 v53, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v62
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v63
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v33, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v36
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v49
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v50
+; GCN-NEXT: v_mov_b32_e32 v50, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v40
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v41
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v42
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v43
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v44
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v45
+; GCN-NEXT: v_cvt_f32_f16_e32 v40, v46
+; GCN-NEXT: v_cvt_f32_f16_e32 v41, v47
+; GCN-NEXT: v_cvt_f32_f16_e32 v42, v56
+; GCN-NEXT: v_cvt_f32_f16_e32 v43, v57
+; GCN-NEXT: v_cvt_f32_f16_e32 v44, v58
+; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59
+; GCN-NEXT: v_cvt_f32_f16_e32 v46, v60
+; GCN-NEXT: v_cvt_f32_f16_e32 v47, v61
+; GCN-NEXT: v_cvt_f32_f16_e32 v57, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: .LBB22_2: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v31, v33
+; GCN-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB22_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v31, 1.0, v63
+; GCN-NEXT: v_add_f32_e32 v32, 1.0, v62
+; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v63
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v58
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40
+; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41
+; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42
+; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43
+; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44
+; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45
+; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46
+; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47
+; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57
+; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2
+; GCN-NEXT: .LBB22_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v59, v2, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v56, v2, v1
+; GCN-NEXT: v_add_i32_e32 v61, vcc, 8, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v57, v2, v1
+; GCN-NEXT: v_add_i32_e32 v60, vcc, 12, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v47, v2, v1
+; GCN-NEXT: v_add_i32_e32 v58, vcc, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v46
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v45
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v44
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v43
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v6, v7, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v8, v9, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v41
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_or_b32_e32 v10, v11, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v40
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_or_b32_e32 v12, v13, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v54
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v14, v15, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v53
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v16, v17, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v51
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_or_b32_e32 v18, v19, v18
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v49
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v20, v21, v20
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v48
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_or_b32_e32 v22, v23, v22
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v39
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_or_b32_e32 v24, v25, v24
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v37
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_or_b32_e32 v26, v27, v26
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v35
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v52
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_or_b32_e32 v28, v29, v28
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v34
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_or_b32_e32 v30, v31, v30
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v36
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_or_b32_e32 v32, v33, v32
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GCN-NEXT: v_or_b32_e32 v34, v35, v34
+; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GCN-NEXT: v_or_b32_e32 v36, v37, v36
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GCN-NEXT: v_or_b32_e32 v38, v39, v38
+; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GCN-NEXT: v_or_b32_e32 v48, v49, v48
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GCN-NEXT: v_or_b32_e32 v50, v51, v50
+; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GCN-NEXT: v_or_b32_e32 v52, v53, v52
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_or_b32_e32 v54, v55, v54
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_or_b32_e32 v40, v41, v40
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_or_b32_e32 v42, v43, v42
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v59, v61, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v57, v58, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v47, v46, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32f32_to_v64f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB22_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f32_e32 v15, 1.0, v15
+; VI-NEXT: v_add_f32_e32 v14, 1.0, v14
+; VI-NEXT: v_add_f32_e32 v13, 1.0, v13
+; VI-NEXT: v_add_f32_e32 v12, 1.0, v12
+; VI-NEXT: v_add_f32_e32 v11, 1.0, v11
+; VI-NEXT: v_add_f32_e32 v10, 1.0, v10
+; VI-NEXT: v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT: v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT: v_add_f32_e32 v7, 1.0, v7
+; VI-NEXT: v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT: v_add_f32_e32 v5, 1.0, v5
+; VI-NEXT: v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f32_e32 v31, 1.0, v31
+; VI-NEXT: v_add_f32_e32 v30, 1.0, v30
+; VI-NEXT: v_add_f32_e32 v29, 1.0, v29
+; VI-NEXT: v_add_f32_e32 v28, 1.0, v28
+; VI-NEXT: v_add_f32_e32 v27, 1.0, v27
+; VI-NEXT: v_add_f32_e32 v26, 1.0, v26
+; VI-NEXT: v_add_f32_e32 v25, 1.0, v25
+; VI-NEXT: v_add_f32_e32 v24, 1.0, v24
+; VI-NEXT: v_add_f32_e32 v23, 1.0, v23
+; VI-NEXT: v_add_f32_e32 v22, 1.0, v22
+; VI-NEXT: v_add_f32_e32 v21, 1.0, v21
+; VI-NEXT: v_add_f32_e32 v20, 1.0, v20
+; VI-NEXT: v_add_f32_e32 v19, 1.0, v19
+; VI-NEXT: v_add_f32_e32 v18, 1.0, v18
+; VI-NEXT: v_add_f32_e32 v17, 1.0, v17
+; VI-NEXT: v_add_f32_e32 v16, 1.0, v16
+; VI-NEXT: .LBB22_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32f32_to_v64f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB22_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31
+; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30
+; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29
+; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28
+; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27
+; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26
+; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25
+; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24
+; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23
+; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22
+; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21
+; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20
+; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19
+; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18
+; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17
+; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16
+; GFX9-NEXT: .LBB22_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32f32_to_v64f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30
+; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28
+; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26
+; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24
+; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22
+; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20
+; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18
+; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16
+; GFX11-NEXT: .LBB22_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <32 x float> %a1 to <64 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x float> %a to <64 x half>
+ br label %end
+
+end:
+ %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x half> %phi
+}
+
+define <32 x float> @bitcast_v64f16_to_v32f32(<64 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64f16_to_v32f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44
+; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33
+; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB23_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61
+; GCN-NEXT: v_or_b32_e32 v0, v62, v0
+; GCN-NEXT: v_or_b32_e32 v1, v60, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59
+; GCN-NEXT: v_or_b32_e32 v2, v58, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57
+; GCN-NEXT: v_or_b32_e32 v3, v56, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47
+; GCN-NEXT: v_or_b32_e32 v4, v46, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v5, v6, v5
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v6, v7, v6
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v7, v8, v7
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v8, v9, v8
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v9, v10, v9
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v10, v11, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v11, v12, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v12, v13, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v13, v14, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v14, v15, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v16, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v17, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v18, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v18, v32, v18
+; GCN-NEXT: v_or_b32_e32 v19, v43, v19
+; GCN-NEXT: v_or_b32_e32 v20, v41, v20
+; GCN-NEXT: v_or_b32_e32 v21, v55, v21
+; GCN-NEXT: v_or_b32_e32 v22, v49, v22
+; GCN-NEXT: v_or_b32_e32 v23, v50, v23
+; GCN-NEXT: v_or_b32_e32 v24, v39, v24
+; GCN-NEXT: v_or_b32_e32 v25, v36, v25
+; GCN-NEXT: v_or_b32_e32 v26, v48, v26
+; GCN-NEXT: v_or_b32_e32 v27, v52, v27
+; GCN-NEXT: v_or_b32_e32 v28, v53, v28
+; GCN-NEXT: v_or_b32_e32 v29, v54, v29
+; GCN-NEXT: v_or_b32_e32 v30, v40, v30
+; GCN-NEXT: v_or_b32_e32 v31, v42, v31
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; kill: killed $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: .LBB23_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB23_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_or_b32_e32 v5, v6, v5
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v6, v7, v6
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_or_b32_e32 v7, v8, v7
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v8, v9, v8
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_or_b32_e32 v9, v10, v9
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_or_b32_e32 v10, v11, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_or_b32_e32 v11, v12, v11
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_or_b32_e32 v12, v13, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v14, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v14, v15, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_or_b32_e32 v15, v16, v15
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v16, v17, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_or_b32_e32 v17, v18, v17
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39
+; GCN-NEXT: v_mov_b32_e32 v39, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42
+; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
+; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
+; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
+; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31
+; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36
+; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38
+; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39
+; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48
+; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49
+; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33
+; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50
+; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34
+; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51
+; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35
+; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52
+; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48
+; GCN-NEXT:
v_cvt_f16_f32_e32 v27, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_or_b32_e32 v19, v21, v20 +; GCN-NEXT: v_or_b32_e32 v20, v55, v39 +; GCN-NEXT: v_or_b32_e32 v21, v41, v48 +; GCN-NEXT: v_or_b32_e32 v22, v22, v49 +; GCN-NEXT: v_or_b32_e32 v23, v23, v50 +; GCN-NEXT: v_or_b32_e32 v24, v24, v51 +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: v_or_b32_e32 v26, v26, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v38 +; GCN-NEXT: v_or_b32_e32 v28, v28, v33 +; GCN-NEXT: v_or_b32_e32 v29, v29, v34 +; GCN-NEXT: v_or_b32_e32 v30, v30, v35 +; GCN-NEXT: v_or_b32_e32 v31, v31, v37 +; GCN-NEXT: .LBB23_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; 
VI-NEXT: v_mov_b32_e32 v32, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, 
v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <64 x i16> @bitcast_v32f32_to_v64i16(<32 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f32_to_v64i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr48 +; 
GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; kill: killed $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; kill: killed $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 +; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB24_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v24, 1.0, 
v24 +; GCN-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; GCN-NEXT: v_alignbit_b32 v50, v16, v15, 16 +; GCN-NEXT: v_alignbit_b32 v52, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v55, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v41, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB24_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 +; GCN-NEXT: v_or_b32_e32 v1, v1, v44 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v33 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; GCN-NEXT: v_or_b32_e32 v59, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; GCN-NEXT: v_or_b32_e32 v57, v1, v2 +; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; GCN-NEXT: v_or_b32_e32 v63, v1, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; GCN-NEXT: v_or_b32_e32 v46, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; GCN-NEXT: v_or_b32_e32 v61, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v55 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; GCN-NEXT: v_or_b32_e32 v14, v14, v33 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v15, v15, v50 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v47 +; GCN-NEXT: v_or_b32_e32 v16, v16, v52 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v17, v17, v48 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v45 +; GCN-NEXT: v_or_b32_e32 v18, v18, v55 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_or_b32_e32 v19, v19, v39 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v20, v20, v34 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 +; GCN-NEXT: v_or_b32_e32 v21, v21, v34 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 +; GCN-NEXT: v_or_b32_e32 v22, v22, v34 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GCN-NEXT: v_or_b32_e32 v23, v23, v34 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v40 +; GCN-NEXT: v_or_b32_e32 v24, v24, v34 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 +; GCN-NEXT: v_or_b32_e32 v25, v25, v34 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v54 +; GCN-NEXT: v_or_b32_e32 v26, v26, v34 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 +; GCN-NEXT: v_or_b32_e32 v27, v27, v34 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; GCN-NEXT: v_or_b32_e32 v28, v28, 
v34 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v29, v29, v34 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_or_b32_e32 v30, v30, v51 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x70, v0 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_or_b32_e32 v32, v32, v43 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_or_b32_e32 v31, v31, v49 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v29, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f32_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; VI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; VI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; VI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; VI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; VI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; VI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; VI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; VI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; VI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; VI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; VI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f32_to_v64i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; 
GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v31, 1.0, v31 +; GFX9-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GFX9-NEXT: v_add_f32_e32 v29, 1.0, v29 +; GFX9-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GFX9-NEXT: v_add_f32_e32 v27, 1.0, v27 +; GFX9-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GFX9-NEXT: v_add_f32_e32 v25, 1.0, v25 +; GFX9-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GFX9-NEXT: v_add_f32_e32 v23, 1.0, v23 +; GFX9-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GFX9-NEXT: v_add_f32_e32 v21, 1.0, v21 +; GFX9-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f32_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v31, 1.0, v31 :: v_dual_add_f32 v30, 1.0, v30 +; GFX11-NEXT: v_dual_add_f32 v29, 1.0, v29 :: v_dual_add_f32 v28, 1.0, v28 +; GFX11-NEXT: v_dual_add_f32 v27, 1.0, v27 :: v_dual_add_f32 v26, 1.0, v26 +; GFX11-NEXT: v_dual_add_f32 v25, 1.0, v25 :: v_dual_add_f32 v24, 1.0, v24 +; GFX11-NEXT: v_dual_add_f32 v23, 1.0, v23 :: v_dual_add_f32 v22, 1.0, v22 +; GFX11-NEXT: v_dual_add_f32 v21, 1.0, v21 :: v_dual_add_f32 v20, 1.0, v20 +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <32 x float> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x float> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <32 x float> @bitcast_v64i16_to_v32f32(<64 x i16> 
%a, i32 %b) { +; GCN-LABEL: bitcast_v64i16_to_v32f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v37, v20 +; GCN-NEXT: v_mov_b32_e32 v38, v18 +; GCN-NEXT: v_mov_b32_e32 v39, v16 +; GCN-NEXT: v_mov_b32_e32 v48, v14 +; GCN-NEXT: v_mov_b32_e32 v49, v12 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v51, v8 +; GCN-NEXT: v_mov_b32_e32 v52, v6 +; GCN-NEXT: v_mov_b32_e32 v53, v4 +; GCN-NEXT: v_mov_b32_e32 v54, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:8 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GCN-NEXT: v_or_b32_e32 v0, v0, v36 +; GCN-NEXT: v_or_b32_e32 v1, v1, v58 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; GCN-NEXT: v_or_b32_e32 v2, v2, v57 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; GCN-NEXT: v_or_b32_e32 v3, v3, v35 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; GCN-NEXT: v_or_b32_e32 v4, v4, v60 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 
4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v18, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, 
v19, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v21, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v23, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v26, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v28, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v29, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v30, v32 +; GCN-NEXT: v_or_b32_e32 v31, v31, v59 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; 
implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: .LBB25_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v36, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v58, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v57, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v35, v3 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v4, v60, v4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v32, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v32, v13 
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v32, v15 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v32, v16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v32, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v32, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v32, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v32, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v32, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v32, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v32, v30 +; GCN-NEXT: v_or_b32_e32 v31, v59, v31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: 
v_add_i32_e32 v18, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; GCN-NEXT: .LBB25_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v32f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v33, 3 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_add_u16_e32 v32, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v32, v14 +; VI-NEXT: v_add_u16_e32 v32, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v32, v13 +; VI-NEXT: v_add_u16_e32 v32, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v32, v12 +; VI-NEXT: v_add_u16_e32 v32, 3, v11 +; 
VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v32, v11 +; VI-NEXT: v_add_u16_e32 v32, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v32, v10 +; VI-NEXT: v_add_u16_e32 v32, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v32, v9 +; VI-NEXT: v_add_u16_e32 v32, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v32, v8 +; VI-NEXT: v_add_u16_e32 v32, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v32, v7 +; VI-NEXT: v_add_u16_e32 v32, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v32, v6 +; VI-NEXT: v_add_u16_e32 v32, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v32, v5 +; VI-NEXT: v_add_u16_e32 v32, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v32, v4 +; VI-NEXT: v_add_u16_e32 v32, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v32, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_add_u16_e32 v32, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v32, v1 +; VI-NEXT: v_add_u16_e32 v32, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v32, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v31 +; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: v_add_u16_e32 v32, 3, v30 +; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: v_add_u16_e32 v32, 3, v29 +; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_add_u16_sdwa v24, v24, v33 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i16_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <32 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <32 x float> + br label %end + +end: + %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x float> %phi +} + +define <16 x double> 
@bitcast_v16i64_to_v16f64(<16 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i64_to_v16f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; GCN-NEXT: .LBB26_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 
3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <16 x i64> @bitcast_v16f64_to_v16i64(<16 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f64_to_v16i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB27_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GCN-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GCN-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GCN-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GCN-NEXT: .LBB27_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: 
v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f64_to_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i64_to_v128i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed 
$vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 
24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v57 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v57 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v57 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; 
GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:296 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; GCN-NEXT: .LBB28_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; GCN-NEXT: v_addc_u32_e32 v57, vcc, 0, v57, vcc +; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v57, v59, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 
24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v34, v18, v17, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v36, v14, v13, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v37, v12, v11, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v38, v10, v9, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v39, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v48, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v50, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v51, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v52, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v57 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v57 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v53, 8, v57 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v54, 8, v30 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v20 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v18 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v16 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v12 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v10 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v6 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v4 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; GCN-NEXT: .LBB28_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v52 +; GCN-NEXT: v_or_b32_e32 v1, v1, v52 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_or_b32_e32 v2, v2, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v31, v50, v31 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v63 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v50, v51, v50 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v31 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v50 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v49 +; GCN-NEXT: v_or_b32_e32 v31, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v61 +; GCN-NEXT: v_or_b32_e32 v49, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; GCN-NEXT: v_or_b32_e32 v2, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v60 +; GCN-NEXT: v_or_b32_e32 v61, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v39 +; GCN-NEXT: v_or_b32_e32 v62, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v58 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v38 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v56 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v37 +; GCN-NEXT: v_or_b32_e32 v7, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v47 +; GCN-NEXT: v_or_b32_e32 v8, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v36 +; GCN-NEXT: v_or_b32_e32 v9, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v46 +; GCN-NEXT: v_or_b32_e32 v10, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v35 +; GCN-NEXT: v_or_b32_e32 v11, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v45 +; GCN-NEXT: v_or_b32_e32 v12, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v34 +; GCN-NEXT: v_or_b32_e32 v13, v1, v3 
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v44 +; GCN-NEXT: v_or_b32_e32 v14, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v33 +; GCN-NEXT: v_or_b32_e32 v15, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v43 +; GCN-NEXT: v_or_b32_e32 v16, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v21 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v17, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v42 +; GCN-NEXT: v_or_b32_e32 v18, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v19, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v41 +; GCN-NEXT: v_or_b32_e32 v20, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v21, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v40 +; GCN-NEXT: v_or_b32_e32 v22, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v23, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v55 +; GCN-NEXT: v_or_b32_e32 v24, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v29 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v25, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; GCN-NEXT: v_or_b32_e32 v26, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v27, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v53 +; GCN-NEXT: v_or_b32_e32 v28, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v29, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v32, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; GCN-NEXT: v_or_b32_e32 v30, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v33, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v34, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v35, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v36, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v37, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v38, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v39, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v48, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; 
GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v50, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v51, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v52, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v53, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v54, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v55, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v40, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v41, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v42, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v43, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v44, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v45, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v46, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v47, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v56, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v57, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v58, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v59, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v60, v3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v4, v1, v29 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GCN-NEXT: v_or_b32_e32 v5, v1, v32 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v63, v2, v30 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 +; GCN-NEXT: v_or_b32_e32 v61, v3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 +; GCN-NEXT: v_or_b32_e32 v6, v6, v34 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v35 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v29, v29, v36 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v30, v30, v37 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v38 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v7, v39 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v8, v8, v48 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v9, v9, v50 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v51 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v11, v52 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v12, v12, v53 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v13, v13, v54 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v14, v14, v55 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v15, v15, v40 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v16, v16, v41 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v17, v17, v42 +; GCN-NEXT: v_add_i32_e32 v50, 
vcc, 0x50, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GCN-NEXT: v_or_b32_e32 v18, v18, v43 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v19, v19, v44 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GCN-NEXT: v_or_b32_e32 v20, v20, v45 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GCN-NEXT: v_or_b32_e32 v21, v21, v46 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GCN-NEXT: v_or_b32_e32 v22, v22, v47 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GCN-NEXT: v_or_b32_e32 v23, v23, v56 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; GCN-NEXT: v_or_b32_e32 v24, v24, v57 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GCN-NEXT: v_or_b32_e32 v25, v25, v58 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v26, v26, v59 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GCN-NEXT: v_or_b32_e32 v27, v27, v60 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v41, 
s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v128i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed 
$vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; kill: killed 
$vgpr39 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; 
implicit-def: $vgpr58 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[29:30] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[27:28] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[25:26] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[21:22] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[15:16] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[13:14] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[11:12] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10] +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v32 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_mov_b32_e32 v55, v39 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: 
v_lshrrev_b32_e32 v45, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: .LBB28_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; VI-NEXT: v_add_u32_e32 v31, vcc, 3, v31 +; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v32, vcc +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], 
s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 
offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; 
VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v27 +; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v10 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v34, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v1 +; VI-NEXT: .LBB28_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v48 +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v57 +; VI-NEXT: v_or_b32_sdwa v2, v2, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v41 +; VI-NEXT: v_or_b32_sdwa v48, v53, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 +; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa 
v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v128i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; 
kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; kill: killed $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; kill: killed $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: 
$vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 
4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] 
+; GFX9-NEXT: .LBB28_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB28_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 3, v3
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 3, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 3, v7
+; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 3, v9
+; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v10, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 3, v11
+; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v12, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, 3, v13
+; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v14, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, 3, v15
+; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v16, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 3, v17
+; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v18, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v19, vcc, 3, v19
+; GFX9-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v21, vcc, 3, v21
+; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v23, vcc, 3, v23
+; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v25, vcc, 3, v25
+; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v26, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v27, vcc, 3, v27
+; GFX9-NEXT: v_addc_co_u32_e32 v28, vcc, 0, v28, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v29, vcc, 3, v29
+; GFX9-NEXT: v_addc_co_u32_e32 v30, vcc, 0, v30, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: v_add_co_u32_e32 v31, vcc, 3, v31
+; GFX9-NEXT: v_addc_co_u32_e32 v32, vcc, 0, v32, vcc
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v10
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v1
+; GFX9-NEXT: .LBB28_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v52
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v36
+; GFX9-NEXT: v_or_b32_sdwa v12, v12, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v50
+; GFX9-NEXT: v_or_b32_sdwa v13, v13, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v14, v14, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v15, v15, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v57
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v56
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v39
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v47
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v46
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v45
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v55
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v54
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37
+; GFX9-NEXT: v_or_b32_sdwa v9, v9, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v17, v17, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v41
+; GFX9-NEXT: v_or_b32_sdwa v33, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16i64_to_v128i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x13
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:88
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:84
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:80
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:76
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:72
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:68
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:64
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:60
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:56
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:52
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:48
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:44
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:40
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:36
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:32
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:28
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:24
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:20
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:16
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:12
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: ; implicit-def: $vgpr75
+; GFX11-NEXT: ; implicit-def: $vgpr74
+; GFX11-NEXT: ; implicit-def: $vgpr66
+; GFX11-NEXT: ; implicit-def: $vgpr73
+; GFX11-NEXT: ; implicit-def: $vgpr72
+; GFX11-NEXT: ; implicit-def: $vgpr63
+; GFX11-NEXT: ; implicit-def: $vgpr62
+; GFX11-NEXT: ; implicit-def: $vgpr61
+; GFX11-NEXT: ; implicit-def: $vgpr65
+; GFX11-NEXT: ; implicit-def: $vgpr60
+; GFX11-NEXT: ; implicit-def: $vgpr59
+; GFX11-NEXT: ; implicit-def: $vgpr58
+; GFX11-NEXT: ; implicit-def: $vgpr57
+; GFX11-NEXT: ; implicit-def: $vgpr56
+; GFX11-NEXT: ; implicit-def: $vgpr64
+; GFX11-NEXT: ; implicit-def: $vgpr47
+; GFX11-NEXT: ; implicit-def: $vgpr46
+; GFX11-NEXT: ; implicit-def: $vgpr45
+; GFX11-NEXT: ; implicit-def: $vgpr44
+; GFX11-NEXT: ; implicit-def: $vgpr43
+; GFX11-NEXT: ; implicit-def: $vgpr54
+; GFX11-NEXT: ; implicit-def: $vgpr42
+; GFX11-NEXT: ; implicit-def: $vgpr41
+; GFX11-NEXT: ; implicit-def: $vgpr40
+; GFX11-NEXT: ; implicit-def: $vgpr183
+; GFX11-NEXT: ; implicit-def: $vgpr182
+; GFX11-NEXT: ; implicit-def: $vgpr53
+; GFX11-NEXT: ; implicit-def: $vgpr181
+; GFX11-NEXT: ; implicit-def: $vgpr180
+; GFX11-NEXT: ; implicit-def: $vgpr179
+; GFX11-NEXT: ; implicit-def: $vgpr178
+; GFX11-NEXT: ; implicit-def: $vgpr177
+; GFX11-NEXT: ; implicit-def: $vgpr52
+; GFX11-NEXT: ; implicit-def: $vgpr176
+; GFX11-NEXT: ; implicit-def: $vgpr167
+; GFX11-NEXT: ; implicit-def: $vgpr166
+; GFX11-NEXT: ; implicit-def: $vgpr165
+; GFX11-NEXT: ; implicit-def: $vgpr164
+; GFX11-NEXT: ; implicit-def: $vgpr51
+; GFX11-NEXT: ; implicit-def: $vgpr163
+; GFX11-NEXT: ; implicit-def: $vgpr162
+; GFX11-NEXT: ; implicit-def: $vgpr161
+; GFX11-NEXT: ; implicit-def: $vgpr160
+; GFX11-NEXT: ; implicit-def: $vgpr151
+; GFX11-NEXT: ; implicit-def: $vgpr50
+; GFX11-NEXT: ; implicit-def: $vgpr150
+; GFX11-NEXT: ; implicit-def: $vgpr149
+; GFX11-NEXT: ; implicit-def: $vgpr148
+; GFX11-NEXT: ; implicit-def: $vgpr147
+; GFX11-NEXT: ; implicit-def: $vgpr146
+; GFX11-NEXT: ; implicit-def: $vgpr49
+; GFX11-NEXT: ; implicit-def: $vgpr145
+; GFX11-NEXT: ; implicit-def: $vgpr144
+; GFX11-NEXT: ; implicit-def: $vgpr135
+; GFX11-NEXT: ; implicit-def: $vgpr134
+; GFX11-NEXT: ; implicit-def: $vgpr133
+; GFX11-NEXT: ; implicit-def: $vgpr48
+; GFX11-NEXT: ; implicit-def: $vgpr132
+; GFX11-NEXT: ; implicit-def: $vgpr131
+; GFX11-NEXT: ; implicit-def: $vgpr130
+; GFX11-NEXT: ; implicit-def: $vgpr129
+; GFX11-NEXT: ; implicit-def: $vgpr128
+; GFX11-NEXT: ; implicit-def: $vgpr38
+; GFX11-NEXT: ; implicit-def: $vgpr119
+; GFX11-NEXT: ; implicit-def: $vgpr118
+; GFX11-NEXT: ; implicit-def: $vgpr117
+; GFX11-NEXT: ; implicit-def: $vgpr116
+; GFX11-NEXT: ; implicit-def: $vgpr115
+; GFX11-NEXT: ; implicit-def: $vgpr37
+; GFX11-NEXT: ; implicit-def: $vgpr114
+; GFX11-NEXT: ; implicit-def: $vgpr113
+; GFX11-NEXT: ; implicit-def: $vgpr112
+; GFX11-NEXT: ; implicit-def: $vgpr103
+; GFX11-NEXT: ; implicit-def: $vgpr102
+; GFX11-NEXT: ; implicit-def: $vgpr36
+; GFX11-NEXT: ; implicit-def: $vgpr101
+; GFX11-NEXT: ; implicit-def: $vgpr100
+; GFX11-NEXT: ; implicit-def: $vgpr99
+; GFX11-NEXT: ; implicit-def: $vgpr98
+; GFX11-NEXT: ; implicit-def: $vgpr97
+; GFX11-NEXT: ; implicit-def: $vgpr35
+; GFX11-NEXT: ; implicit-def: $vgpr96
+; GFX11-NEXT: ; implicit-def: $vgpr87
+; GFX11-NEXT: ; implicit-def: $vgpr86
+; GFX11-NEXT: ; implicit-def: $vgpr85
+; GFX11-NEXT: ; implicit-def: $vgpr84
+; GFX11-NEXT: ; implicit-def: $vgpr83
+; GFX11-NEXT: ; implicit-def: $vgpr82
+; GFX11-NEXT: ; implicit-def: $vgpr81
+; GFX11-NEXT: ; implicit-def: $vgpr80
+; GFX11-NEXT: ; implicit-def: $vgpr71
+; GFX11-NEXT: ; implicit-def: $vgpr70
+; GFX11-NEXT: ; implicit-def: $vgpr69
+; GFX11-NEXT: ; implicit-def: $vgpr68
+; GFX11-NEXT: ; implicit-def: $vgpr34
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
+; GFX11-NEXT: ; implicit-def: $vgpr33
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB28_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-NEXT: .LBB28_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB28_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v3, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, v5, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v7, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, v9, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v11, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v13, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v15, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, v17, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v18, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v19, vcc_lo, v19, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v20, null, 0, v20, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, v21, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v25, vcc_lo, v25, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v27, vcc_lo, v27, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v29, vcc_lo, v29, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v30, null, 0, v30, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v31, vcc_lo, v31, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v32, null, 0, v32, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v23, vcc_lo, v23, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v24, vcc_lo
+; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18]
+; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14]
+; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12]
+; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6]
+; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10]
+; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4]
+; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8]
+; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2]
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1
+; GFX11-NEXT: .LBB28_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v75
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_lshlrev_b16 v39, 8, v66
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_lshlrev_b16 v66, 8, v63
+; GFX11-NEXT: v_lshlrev_b16 v65, 8, v65
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v55
+; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v74
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v61
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v39, v55, v39
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v73
+; GFX11-NEXT: v_or_b32_e32 v65, v67, v65
+; GFX11-NEXT: v_lshlrev_b16 v67, 8, v58
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v55
+; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v72
+; GFX11-NEXT: v_lshlrev_b16 v64, 8, v64
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v39
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v55, v55, v66
+; GFX11-NEXT: v_lshlrev_b16 v66, 8, v62
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v66
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v60
+; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v59
+; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v39
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v65
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v55
+; GFX11-NEXT: v_or_b32_e32 v55, v66, v67
+; GFX11-NEXT: v_lshlrev_b16 v65, 8, v57
+; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v56
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v39
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v65
+; GFX11-NEXT: v_or_b32_e32 v55, v66, v64
+; GFX11-NEXT: v_lshlrev_b16 v64, 8, v47
+; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v46
+; GFX11-NEXT: v_lshlrev_b16 v66, 8, v45
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v39
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v64
+; GFX11-NEXT: v_or_b32_e32 v55, v65, v66
+; GFX11-NEXT: v_lshlrev_b16 v64, 8, v44
+; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v43
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v39
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v64
+; GFX11-NEXT: v_or_b32_e32 v54, v65, v54
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v42
+; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v41
+; GFX11-NEXT: v_lshlrev_b16 v65, 8, v40
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v39
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v55
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_or_b32_e32 v54, v64, v65
+; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v182
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v183
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_lshlrev_b16 v65, 8, v181
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GFX11-NEXT: v_or_b32_e32 v53, v64, v53
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v55
v10, v65 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v178 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v167 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v166 +; GFX11-NEXT: v_or_b32_e32 v53, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v55 +; GFX11-NEXT: v_or_b32_e32 v52, v64, v52 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v65 +; GFX11-NEXT: v_or_b32_e32 v54, v66, v67 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v10, v53 +; GFX11-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-NEXT: v_or_b32_e32 v4, v12, v54 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v6, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v160 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v150 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v147 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v39, v49 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v145 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v144 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-NEXT: 
v_lshlrev_b16 v15, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v132 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v130 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v119 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v117 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v103 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v102 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v101 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v100 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v99 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; 
GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v83 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v82 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v81 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b16 v28, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v68 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-NEXT: 
scratch_load_b32 v41, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:88
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i64> %a, splat (i64 3)
+ %a2 = bitcast <16 x i64> %a1 to <128 x i8>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i64> %a to <128 x i8>
+ br label %end
+
+end:
+ %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <128 x i8> %phi
+}
+
+define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
+; GCN-LABEL: bitcast_v128i8_to_v16i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) 
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:280 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v39 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v56 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: 
v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v47 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v54 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v46 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v37 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v43 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GCN-NEXT: v_or_b32_e32 v8, v8, v45 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v9, v53 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v10, v42 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v11, v41 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v12, v40 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v13, v63 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: 
v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v50 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_or_b32_e32 v18, v18, v51 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_or_b32_e32 v19, v19, v49 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_or_b32_e32 v20, v20, v60 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_or_b32_e32 v21, v21, v58 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v22, v22, v62 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_or_b32_e32 v23, v23, v59 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_or_b32_e32 v25, v25, v61 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_or_b32_e32 v26, v26, v34 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v27, v27, v52 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_or_b32_e32 v28, v28, v35 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GCN-NEXT: v_or_b32_e32 v29, v29, v33 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v30, v30, v44 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 +; GCN-NEXT: v_or_b32_e32 v31, v31, v36 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v50, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v51, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v33, v34, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v36, v35 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v37, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v37, v38, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v39, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v48, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v48, v55, v48 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; 
GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v49, v54, v49 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v18, v18, v50 +; GCN-NEXT: v_or_b32_e32 v19, v19, v51 +; GCN-NEXT: v_or_b32_e32 v20, v20, v52 +; GCN-NEXT: v_or_b32_e32 v21, v21, v53 +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: v_or_b32_e32 v23, v23, v33 +; GCN-NEXT: v_or_b32_e32 v24, v24, v34 +; GCN-NEXT: v_or_b32_e32 v25, v25, v35 +; GCN-NEXT: v_or_b32_e32 v26, v26, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v37 +; GCN-NEXT: v_or_b32_e32 v28, v28, v38 +; GCN-NEXT: v_or_b32_e32 v29, v29, v39 +; GCN-NEXT: v_or_b32_e32 v30, v30, v48 +; GCN-NEXT: v_or_b32_e32 v31, v31, v49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; 
kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: 
killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; kill: killed $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: .LBB29_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v39, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v56, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v38, v2 +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v47, v3 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v54, v4 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v46, v5 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v37, v6 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v43, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_or_b32_e32 v8, v45, v8 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v53, v9 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v42, v10 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v41, v11 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v40, v12 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v63, v13 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v50, v14 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v0, v15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v0, v16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v0, v17 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_or_b32_e32 v18, v51, v18 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_or_b32_e32 v19, v49, v19 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_or_b32_e32 v20, v60, v20 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_or_b32_e32 v21, v58, v21 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v25, v62, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v29, v59, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v37, v32, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v50, v61, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v41, v34, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v45, v52, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v56, v35, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v58, v33, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v59, v44, v22 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 
v57, v36, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v60, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v61, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v62, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v63, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v49, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, 
off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v51, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v54, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v26, v24 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v28, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v30, v28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; 
GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v31, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v32, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v33, v34, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v48, v35 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v48, v53, v48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_mov_b32_e32 v0, v55 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v55, v53 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v55, v40, v55 +; GCN-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v40, v42, v40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v42, v43, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v43, v44, v43 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v44, v46, v44 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; GCN-NEXT: v_or_b32_e32 v46, v0, v46 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v47, v0, v47 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v60, v0 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v61, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v62, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v63, v3 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; GCN-NEXT: v_add_i32_e32 
v16, vcc, s7, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 +; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 +; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 +; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 +; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 +; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 +; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 +; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 +; GCN-NEXT: v_or_b32_e32 v4, v36, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v39, v6 +; GCN-NEXT: v_or_b32_e32 v7, v49, v7 +; GCN-NEXT: v_or_b32_e32 v8, v51, v8 +; GCN-NEXT: v_or_b32_e32 v9, v52, v9 +; GCN-NEXT: v_or_b32_e32 v10, v54, v10 +; GCN-NEXT: v_or_b32_e32 v11, v22, v11 +; GCN-NEXT: v_or_b32_e32 v12, v23, v12 +; GCN-NEXT: v_or_b32_e32 v13, v24, v13 +; GCN-NEXT: v_or_b32_e32 v14, v26, v14 +; GCN-NEXT: v_or_b32_e32 v15, v27, v15 +; GCN-NEXT: v_or_b32_e32 v16, v28, v16 +; GCN-NEXT: v_or_b32_e32 v17, v30, v17 +; GCN-NEXT: v_or_b32_e32 v18, v31, v18 +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: v_or_b32_e32 v20, v33, v20 +; GCN-NEXT: v_or_b32_e32 v21, v34, v21 +; GCN-NEXT: v_or_b32_e32 v22, v35, v25 +; GCN-NEXT: v_or_b32_e32 v23, v48, v29 +; GCN-NEXT: v_or_b32_e32 v24, v53, v37 +; GCN-NEXT: v_or_b32_e32 v25, v55, v50 +; GCN-NEXT: v_or_b32_e32 v26, v40, v41 +; GCN-NEXT: v_or_b32_e32 v27, v42, v45 +; GCN-NEXT: v_or_b32_e32 v28, v43, v56 +; GCN-NEXT: v_or_b32_e32 v29, v44, v58 +; GCN-NEXT: v_or_b32_e32 v30, v46, v59 +; GCN-NEXT: v_or_b32_e32 v31, v47, v57 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; 
GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; GCN-NEXT: .LBB29_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], 
s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; 
VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword 
v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt 
vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr49 +; 
VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: .LBB29_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB29_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v31, 0x300
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_add_u16_e32 v9, 3, v61
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v0, 0x300, v0
+; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v2, 0x300, v2
+; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_or_b32_e32 v1, v2, v3
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_add_u16_e32 v8, 3, v8
+; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v2, 0x300, v2
+; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v4
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v4, 0x300, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v5
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v5, 0x300, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v6
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v6, 0x300, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v7
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v8
+; VI-NEXT: v_add_u16_e32 v8, 3, v63
+; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v8, 0x300, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v9
+; VI-NEXT: v_add_u16_e32 v9, 3, v59
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v10, 3, v62
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v9, 0x300, v9
+; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v9, v10
+; VI-NEXT: v_add_u16_e32 v10, 3, v58
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v11, 3, v60
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v10, 0x300, v10
+; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v10, v10, v11
+; VI-NEXT: v_add_u16_e32 v11, 3, v57
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v12, 3, v56
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v11, 0x300, v11
+; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v11, v12
+; VI-NEXT: v_add_u16_e32 v12, 3, v46
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v13, 3, v47
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v12, 0x300, v12
+; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v12, v13
+; VI-NEXT: v_add_u16_e32 v13, 3, v45
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v14, 3, v44
+; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v13, 0x300, v13
+; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v13, v13, v14
+; VI-NEXT: v_add_u16_e32 v14, 3, v54
+; VI-NEXT: v_add_u16_e32 v15, 3, v40
+; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v14, 0x300, v14
+; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v14, v15
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v15, 3, v15
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v15, 0x300, v15
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v15, v15, v16
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v16, 0x300, v16
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v16, v16, v17
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v17
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v18, 3, v18
+; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v17, v17, v18
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v18, 3, v18
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v18, 0x300, v18
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v19, 3, v19
+; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v18, v18, v19
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v19, 3, v19
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v19, 0x300, v19
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v19, v19, v20
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v20, 0x300, v20
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v21, 3, v21
+; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v20, v20, v21
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v21, 3, v21
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v21, 0x300, v21
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v21, v21, v22
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v22, 0x300, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v23, 3, v23
+; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v22, v22, v23
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v23, 3, v23
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v23, 0x300, v23
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v24, 3, v24
+; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v23, v23, v24
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v24, 3, v24
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v24, 0x300, v24
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v25, 3, v25
+; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v24, v24, v25
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v25, 3, v25
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v25, 0x300, v25
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v26, 3, v26
+; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v25, v25, v26
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v26, 3, v26
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v26, 0x300, v26
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v27, 3, v27
+; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v26, v26, v27
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v27, 3, v27
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v27, 0x300, v27
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v28, 3, v28
+; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v27, v27, v28
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v28, 3, v28
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v28, 0x300, v28
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v29, 3, v29
+; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v28, v28, v29
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v29, 3, v29
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v29, 0x300, v29
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v30, 3, v30
+; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v29, v29, v30
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v30, 3, v30
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v30, 0x300, v30
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v32, 3, v32
+; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v30, v30, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v32, 3, v32
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v32, 0x300, v32
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v33, 3, v33
+; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v31, v32, v31
+; VI-NEXT: .LBB29_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v128i8_to_v16i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88
+; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96
+; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
+; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13
+; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v15
+; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v17
+; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v19
+; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v21
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v23
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29
+; GFX9-NEXT: s_waitcnt vmcnt(25)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
+; GFX9-NEXT: s_waitcnt vmcnt(24)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(23)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(21)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44
+; GFX9-NEXT:
buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v7, 
v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload 
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 
4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: 
killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: 
; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: .LBB29_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB29_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload 
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; 
GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v26, v26, v27 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v28, 3, v28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded 
+; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v29, 3, v29
+; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v29, 3, v29
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v30, 3, v30
+; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v29, v29, v30
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v30, 3, v30
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v31, 3, v31
+; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v30, v30, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v31, 3, v31
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v32, 3, v32
+; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX9-NEXT: .LBB29_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v128i8_to_v16i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:592
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:588
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:584
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:580
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:576
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:572
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:568
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:564
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:560
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:556
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:552
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:548
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:544
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:540
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:536
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:532
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:528
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:524
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:520
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:516
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:512
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:508
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:504
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:500
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:496
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:492
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:488
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:484
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:480
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:476
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:468
+; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:456
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:392
+; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28
+; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24
+; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20
+; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16
+; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8
+; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4
+; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:384
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:380
+; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:376
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:372
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:368
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:364
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:360
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:356
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:352
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:348
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:344
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:340
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:336
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:332
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:328
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:324
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:320
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:260
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:240
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:216
+; GFX11-NEXT: scratch_load_b32 v114, off, s32 offset:388
+; GFX11-NEXT: scratch_load_u16 v115, off, s32
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:152
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v138, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:132
+; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v41, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v42, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:20
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b16 v124, 8, v1
+; GFX11-NEXT: v_lshlrev_b16 v125, 8, v3
+; GFX11-NEXT: v_lshlrev_b16 v126, 8, v5
+; GFX11-NEXT: v_lshlrev_b16 v127, 8, v7
+; GFX11-NEXT: v_lshlrev_b16 v111, 8, v9
+; GFX11-NEXT: v_lshlrev_b16 v120, 8, v11
+; GFX11-NEXT: v_lshlrev_b16 v121, 8, v13
+; GFX11-NEXT: v_lshlrev_b16 v122, 8, v15
+; GFX11-NEXT: v_lshlrev_b16 v123, 8, v17
+; GFX11-NEXT: v_lshlrev_b16 v106, 8, v19
+; GFX11-NEXT: v_lshlrev_b16 v107, 8, v21
+; GFX11-NEXT: v_lshlrev_b16 v108, 8, v23
+; GFX11-NEXT: v_lshlrev_b16 v109, 8, v25
+; GFX11-NEXT: v_lshlrev_b16 v110, 8, v27
+; GFX11-NEXT: v_lshlrev_b16 v93, 8, v29
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b16 v94, 8, v115
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b16 v95, 8, v116
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b16 v104, 8, v117
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b16 v105, 8, v118
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b16 v79, 8, v119
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b16 v88, 8, v128
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b16 v89, 8, v129
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b16 v90, 8, v130
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b16 v91, 8, v131
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b16 v61, 8, v144
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b16 v62, 8, v145
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b16 v63, 8, v146
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b16 v72, 8, v147
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b16 v73, 8, v148
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b16 v45, 8, v162
+; GFX11-NEXT: s_waitcnt vmcnt(38)
+; GFX11-NEXT: v_lshlrev_b16 v46, 8, v163
+; GFX11-NEXT: s_waitcnt vmcnt(37)
+; GFX11-NEXT: v_lshlrev_b16 v47, 8, v164
+; GFX11-NEXT: s_waitcnt vmcnt(36)
+; GFX11-NEXT: v_lshlrev_b16 v56, 8, v165
+; GFX11-NEXT: s_waitcnt vmcnt(35)
+; GFX11-NEXT: v_lshlrev_b16 v57, 8, v166
+; GFX11-NEXT: s_waitcnt vmcnt(34)
+; GFX11-NEXT: v_lshlrev_b16 v179, 8, v179
+; GFX11-NEXT: s_waitcnt vmcnt(33)
+; GFX11-NEXT: v_lshlrev_b16 v180, 8, v180
+; GFX11-NEXT: s_waitcnt vmcnt(32)
+; GFX11-NEXT: v_lshlrev_b16 v181, 8, v181
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_lshlrev_b16 v182, 8, v182
+; GFX11-NEXT: s_waitcnt vmcnt(30)
+; GFX11-NEXT: v_lshlrev_b16 v183, 8, v183
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_lshlrev_b16 v162, 8, v136
+; GFX11-NEXT: s_waitcnt vmcnt(28)
+; GFX11-NEXT: v_lshlrev_b16 v163, 8, v137
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_lshlrev_b16 v164, 8, v138
+; GFX11-NEXT: v_lshlrev_b16 v165, 8, v103
+; GFX11-NEXT: v_lshlrev_b16 v166, 8, v102
+; GFX11-NEXT: v_lshlrev_b16 v144, 8, v101
+; GFX11-NEXT: v_lshlrev_b16 v145, 8, v100
+; GFX11-NEXT: v_lshlrev_b16 v146, 8, v99
+; GFX11-NEXT: v_lshlrev_b16 v147, 8, v31
+; GFX11-NEXT: v_lshlrev_b16 v148, 8, v30
+; GFX11-NEXT: v_lshlrev_b16 v119, 8, v28
+; GFX11-NEXT: v_lshlrev_b16 v128, 8, v26
+; GFX11-NEXT: v_lshlrev_b16 v129, 8, v24
+; GFX11-NEXT: v_lshlrev_b16 v130, 8, v22
+; GFX11-NEXT: v_lshlrev_b16 v131, 8, v20
+; GFX11-NEXT: v_lshlrev_b16 v114, 8, v18
+; GFX11-NEXT: v_lshlrev_b16 v115, 8, v16
+; GFX11-NEXT: v_lshlrev_b16 v116, 8, v14
+; GFX11-NEXT: v_lshlrev_b16 v117, 8, v12
+; GFX11-NEXT: v_lshlrev_b16 v118, 8, v10
+; GFX11-NEXT: v_lshlrev_b16 v99, 8, v8
+; GFX11-NEXT: v_lshlrev_b16 v100, 8, v6
+; GFX11-NEXT: v_lshlrev_b16 v101, 8, v4
+; GFX11-NEXT: v_lshlrev_b16 v102, 8, v2
+; GFX11-NEXT: v_lshlrev_b16 v103, 8, v0
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB29_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v53
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v52
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v51
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v124
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v125
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v126
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v127
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v39
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v37
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v36
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v49
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v48
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v34
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v111
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v121
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v120
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v122
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v123
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v107
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v38
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v108
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v109
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v110
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v106
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v4, v5
+; GFX11-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v12
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v33
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v92
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v77
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v76
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v75
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v74
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v60
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v59
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v93
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v94
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v95
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v104
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v105
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v79
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v88
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v89
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v90
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v91
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v58
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v44
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v43
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v42
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v41
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v40
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v178
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v177
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v176
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v167
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v61
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v62
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v63
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v72
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v73
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v45
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v46
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v47
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v56
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v57
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v13, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v14, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v15, v18, v19
+; GFX11-NEXT: v_or_b32_e32 v16, v20, v21
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v161
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v160
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v151
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v150
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v135
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v134
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v133
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v113
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v179
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v180
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v181
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v182
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v183
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v162
+; GFX11-NEXT: v_or_b32_e32 v23, v23, v163
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v164
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v165
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v166
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v18
+; GFX11-NEXT: v_or_b32_e32 v18, v19, v20
+; GFX11-NEXT: v_or_b32_e32 v19, v21, v22
+; GFX11-NEXT: v_or_b32_e32 v20, v23, v24
+; GFX11-NEXT: v_or_b32_e32 v21, v25, v26
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v112
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v98
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v97
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v96
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v87
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v86
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v84
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v83
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v82
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v144
+; GFX11-NEXT: v_or_b32_e32 v23, v23, v145
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v146
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v147
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v148
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v119
+; GFX11-NEXT: v_or_b32_e32 v28, v28, v128
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v129
+; GFX11-NEXT: v_or_b32_e32 v30, v30, v130
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v131
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v23, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v24, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v25, v28, v29
+; GFX11-NEXT: v_or_b32_e32 v26, v30, v31
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v80
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v70
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v68
+; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v67
+; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v66
+; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v65
+; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v64
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v114
+; GFX11-NEXT: v_or_b32_e32 v28, v28, v115
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v116
+; GFX11-NEXT: v_or_b32_e32 v30, v30, v117
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v118
+; GFX11-NEXT: v_or_b32_e32 v32, v32, v99
+; GFX11-NEXT: v_or_b32_e32 v33, v33, v100
+; GFX11-NEXT: v_or_b32_e32 v34, v34, v101
+; GFX11-NEXT: v_or_b32_e32 v35, v35, v102
+; GFX11-NEXT: v_or_b32_e32 v36, v36, v103
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v28
+; GFX11-NEXT: v_or_b32_e32 v28, v29, v30
+; GFX11-NEXT: v_or_b32_e32 v29, v31, v32
+; GFX11-NEXT: v_or_b32_e32 v30, v33, v34
+; GFX11-NEXT: v_or_b32_e32 v31, v35, v36
+; GFX11-NEXT: ; implicit-def: $vgpr55
+; GFX11-NEXT: ; implicit-def: $vgpr54
+; GFX11-NEXT: ; implicit-def: $vgpr53
+; GFX11-NEXT: ; implicit-def: $vgpr52
+; GFX11-NEXT: ; implicit-def: $vgpr51
+; GFX11-NEXT: ; implicit-def: $vgpr50
+; GFX11-NEXT: ; implicit-def: $vgpr49
+; GFX11-NEXT: ; implicit-def: $vgpr48
+; GFX11-NEXT: ; implicit-def: $vgpr39
+; GFX11-NEXT: ; implicit-def: $vgpr38
+; GFX11-NEXT: ; implicit-def: $vgpr37
+; GFX11-NEXT: ; implicit-def: $vgpr36
+; GFX11-NEXT: ; implicit-def: $vgpr35
+; GFX11-NEXT: ; implicit-def: $vgpr34
+; GFX11-NEXT: ; implicit-def: $vgpr33
+; GFX11-NEXT: ; implicit-def: $vgpr32
+; GFX11-NEXT: ; implicit-def: $vgpr92
+; GFX11-NEXT: ; implicit-def: $vgpr78
+; GFX11-NEXT: ; implicit-def: $vgpr77
+; GFX11-NEXT: ; implicit-def: $vgpr76
+; GFX11-NEXT: ; implicit-def: $vgpr75
+; GFX11-NEXT: ; implicit-def: $vgpr74
+; GFX11-NEXT: ; implicit-def: $vgpr60
+; GFX11-NEXT: ; implicit-def: $vgpr59
+; GFX11-NEXT: ; implicit-def: $vgpr58
+; GFX11-NEXT: ; implicit-def: $vgpr44
+; GFX11-NEXT: ; implicit-def: $vgpr43
+; GFX11-NEXT: ; implicit-def: $vgpr42
+; GFX11-NEXT: ; implicit-def: $vgpr41
+; GFX11-NEXT: ; implicit-def: $vgpr40
+; GFX11-NEXT: ; implicit-def: $vgpr178
+; GFX11-NEXT: ; implicit-def: $vgpr177
+; GFX11-NEXT: ; implicit-def: $vgpr176
+; GFX11-NEXT: ; implicit-def: $vgpr167
+; GFX11-NEXT: ; implicit-def: $vgpr161
+; GFX11-NEXT: ; implicit-def: $vgpr160
+; GFX11-NEXT: ; implicit-def: $vgpr151
+; GFX11-NEXT: ; implicit-def: $vgpr150
+; GFX11-NEXT: ; implicit-def: $vgpr149
+; GFX11-NEXT: ; implicit-def: $vgpr135
+; GFX11-NEXT: ; implicit-def: $vgpr134
+; GFX11-NEXT: ; implicit-def: $vgpr133
+; GFX11-NEXT: ; implicit-def: $vgpr132
+; GFX11-NEXT: ; implicit-def: $vgpr113
+; GFX11-NEXT: ; implicit-def: $vgpr112
+; GFX11-NEXT: ; implicit-def: $vgpr98
+; GFX11-NEXT: ; implicit-def: $vgpr97
+; GFX11-NEXT: ; implicit-def: $vgpr96
+; GFX11-NEXT: ; implicit-def: $vgpr87
+; GFX11-NEXT: ; implicit-def: $vgpr86
+; GFX11-NEXT: ; implicit-def: $vgpr85
+; GFX11-NEXT: ; implicit-def: $vgpr84
+; GFX11-NEXT: ; implicit-def: $vgpr83
+; GFX11-NEXT: ; implicit-def: $vgpr82
+; GFX11-NEXT: ; implicit-def: $vgpr81
+; GFX11-NEXT: ; implicit-def: $vgpr80
+; GFX11-NEXT: ; implicit-def: $vgpr71
+; GFX11-NEXT: ; implicit-def: $vgpr70
+; GFX11-NEXT: ; implicit-def: $vgpr69
+; GFX11-NEXT: ; implicit-def: $vgpr68
+; GFX11-NEXT: ; implicit-def: $vgpr67
+; GFX11-NEXT: ; implicit-def: $vgpr66
+; GFX11-NEXT: ; implicit-def: $vgpr65
+; GFX11-NEXT: ; implicit-def: $vgpr64
+; GFX11-NEXT: ; implicit-def: $vgpr124
+; GFX11-NEXT: ; implicit-def: $vgpr125
+; GFX11-NEXT: ; implicit-def: $vgpr126
+; GFX11-NEXT: ; implicit-def: $vgpr127
+; GFX11-NEXT: ; implicit-def: $vgpr111
+; GFX11-NEXT: ; implicit-def: $vgpr120
+; GFX11-NEXT: ; implicit-def: $vgpr121
+; GFX11-NEXT: ; implicit-def: $vgpr122
+; GFX11-NEXT: ; implicit-def: $vgpr123
+; GFX11-NEXT: ; implicit-def: $vgpr106
+; GFX11-NEXT: ; implicit-def: $vgpr107
+; GFX11-NEXT: ; implicit-def: $vgpr108
+; GFX11-NEXT: ; implicit-def: $vgpr109
+; GFX11-NEXT: ; implicit-def: $vgpr110
+; GFX11-NEXT: ; implicit-def: $vgpr93
+; GFX11-NEXT: ; implicit-def: $vgpr94
+; GFX11-NEXT: ; implicit-def: $vgpr95
+; GFX11-NEXT: ; implicit-def: $vgpr104
+; GFX11-NEXT: ; implicit-def: $vgpr105
+; GFX11-NEXT: ; implicit-def: $vgpr79
+; GFX11-NEXT: ; implicit-def: $vgpr88
+; GFX11-NEXT: ; implicit-def: $vgpr89
+; GFX11-NEXT: ; implicit-def: $vgpr90
+; GFX11-NEXT: ; implicit-def: $vgpr91
+; GFX11-NEXT: ; implicit-def: $vgpr61
+; GFX11-NEXT: ; implicit-def: $vgpr62
+; GFX11-NEXT: ; implicit-def: $vgpr63
+; GFX11-NEXT: ; implicit-def: $vgpr72
+; GFX11-NEXT: ; implicit-def: $vgpr73
+; GFX11-NEXT: ; implicit-def: $vgpr45
+; GFX11-NEXT: ; implicit-def: $vgpr46
+; GFX11-NEXT: ; implicit-def: $vgpr47
+; GFX11-NEXT: ; implicit-def: $vgpr56
+; GFX11-NEXT: ; implicit-def: $vgpr57
+; GFX11-NEXT: ; implicit-def: $vgpr179
+; GFX11-NEXT: ; implicit-def: $vgpr180
+; GFX11-NEXT: ; implicit-def: $vgpr181
+; GFX11-NEXT: ; implicit-def: $vgpr182
+; GFX11-NEXT: ; implicit-def: $vgpr183
+; GFX11-NEXT: ; implicit-def: $vgpr162
+; GFX11-NEXT: ; implicit-def: $vgpr163
+; GFX11-NEXT: ; implicit-def: $vgpr164
+; GFX11-NEXT: ; implicit-def: $vgpr165
+; GFX11-NEXT: ; implicit-def: $vgpr166
+; GFX11-NEXT: ; implicit-def: $vgpr144
+; GFX11-NEXT: ; implicit-def: $vgpr145
+; GFX11-NEXT: ; implicit-def: $vgpr146
+; GFX11-NEXT: ; implicit-def: $vgpr147
+; GFX11-NEXT: ; implicit-def: $vgpr148
+; GFX11-NEXT: ; implicit-def: $vgpr119
+; GFX11-NEXT: ; implicit-def: $vgpr128
+; GFX11-NEXT: ; implicit-def: $vgpr129
+; GFX11-NEXT: ; implicit-def: $vgpr130
+; GFX11-NEXT: ; implicit-def: $vgpr131
+; GFX11-NEXT: ; implicit-def: $vgpr114
+; GFX11-NEXT: ; implicit-def: $vgpr115
+; GFX11-NEXT: ; implicit-def: $vgpr116
+; GFX11-NEXT: ; implicit-def: $vgpr117
+; GFX11-NEXT: ; implicit-def: $vgpr118
+; GFX11-NEXT: ; implicit-def: $vgpr99
+; GFX11-NEXT: ; implicit-def: $vgpr100
+; GFX11-NEXT: ; implicit-def: $vgpr101
+; GFX11-NEXT: ; implicit-def: $vgpr102
+; GFX11-NEXT: ; implicit-def: $vgpr103
+; GFX11-NEXT: .LBB29_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB29_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u16 v0, v55, 3
+; GFX11-NEXT: v_add_nc_u16 v1, v54, 3
+; GFX11-NEXT: v_add_nc_u16 v2, v53, 3
+; GFX11-NEXT: v_add_nc_u16 v3, v52, 3
+; GFX11-NEXT: v_add_nc_u16 v4, v51, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u16 v5, v50, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v124, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v125, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v126, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v127, v3
+; GFX11-NEXT: v_add_nc_u16 v6, v49, 3
+; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v7, v48, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_add_nc_u16 v8, v37, 3
+; GFX11-NEXT: v_add_nc_u16 v9, v36, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u16 v10, v35, 3
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_add_nc_u16 v2, v39, 3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v11, v34, 3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v123, v2
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v111, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v120, v4
+; GFX11-NEXT: v_or_b32_e32 v5, v121, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v122, v6
+; GFX11-NEXT: v_or_b32_e32 v8, v107, v8
+; GFX11-NEXT: v_or_b32_e32 v9, v108, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v109, v10
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-NEXT: v_add_nc_u16 v2, v38, 3
+; GFX11-NEXT: v_or_b32_e32 v11, v110, v11
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4
+; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v106, v2
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v4, v7, v12
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_add_nc_u16 v7, v33, 3
+; GFX11-NEXT: v_add_nc_u16 v8, v32, 3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u16 v9, v92, 3
+; GFX11-NEXT: v_add_nc_u16 v10, v78, 3
+; GFX11-NEXT: v_add_nc_u16 v11, v77, 3
+; GFX11-NEXT: v_add_nc_u16 v12, v76, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v75, 3
+; GFX11-NEXT: v_add_nc_u16 v14, v74, 3
+; GFX11-NEXT: v_add_nc_u16 v15, v60, 3
+; GFX11-NEXT: v_add_nc_u16 v16, v59, 3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v93, v7
+; GFX11-NEXT: v_or_b32_e32 v8, v94, v8
+; GFX11-NEXT: v_or_b32_e32 v9, v95, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v104, v10
+; GFX11-NEXT: v_or_b32_e32 v11, v105, v11
+; GFX11-NEXT: v_or_b32_e32 v12, v79, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v88, v13
+; GFX11-NEXT: v_or_b32_e32 v14, v89, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v90, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v91, v16
+; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11
+; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
+; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
+; GFX11-NEXT: v_add_nc_u16 v12, v58, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v44, 3
+; GFX11-NEXT: v_add_nc_u16 v14, v43, 3
+; GFX11-NEXT: v_add_nc_u16 v15, v42, 3
+; GFX11-NEXT: v_add_nc_u16 v16, v41, 3
+; GFX11-NEXT: v_add_nc_u16 v17, v40, 3
+; GFX11-NEXT: v_add_nc_u16 v18, v178, 3
+; GFX11-NEXT: v_add_nc_u16 v19, v177, 3
+; GFX11-NEXT: v_add_nc_u16 v20, v176, 3
+; GFX11-NEXT: v_add_nc_u16 v21, v167, 3
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_or_b32_e32 v12, v61, v12
+; GFX11-NEXT: v_or_b32_e32 v13, v62, v13
+; GFX11-NEXT: v_or_b32_e32 v14, v63, v14
+; GFX11-NEXT: v_or_b32_e32 v15, v72, v15
+; GFX11-NEXT: v_or_b32_e32 v16, v73, v16
+; GFX11-NEXT: v_or_b32_e32 v17, v45, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v46, v18
+; GFX11-NEXT: v_or_b32_e32 v19, v47, v19
+; GFX11-NEXT: v_or_b32_e32 v20, v56, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v57, v21
+; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12
+; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13
+; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14
+; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15
+; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16
+; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19
+; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20
+; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v13, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v14, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v15, v18, v19
+; GFX11-NEXT: v_or_b32_e32 v16, v20, v21
+; GFX11-NEXT: v_add_nc_u16 v17, v161, 3
+; GFX11-NEXT: v_add_nc_u16 v18, v160, 3
+; GFX11-NEXT: v_add_nc_u16 v19, v151, 3
+; GFX11-NEXT: v_add_nc_u16 v20, v150, 3
+; GFX11-NEXT: v_add_nc_u16 v21, v149, 3
+; GFX11-NEXT: v_add_nc_u16 v22, v135, 3
+; GFX11-NEXT: v_add_nc_u16 v23, v134, 3
+; GFX11-NEXT: v_add_nc_u16 v24, v133, 3
+; GFX11-NEXT: v_add_nc_u16 v25, v132, 3
+; GFX11-NEXT: v_add_nc_u16 v26, v113, 3
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_or_b32_e32 v17, v179, v17
+; GFX11-NEXT: v_or_b32_e32 v18, v180, v18
+; GFX11-NEXT: v_or_b32_e32 v19, v181, v19
+; GFX11-NEXT: v_or_b32_e32 v20, v182, v20
+; GFX11-NEXT: v_or_b32_e32 v21, v183, v21
+; GFX11-NEXT: v_or_b32_e32 v22, v162, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v163, v23
+; GFX11-NEXT: v_or_b32_e32 v24, v164, v24
+; GFX11-NEXT: v_or_b32_e32 v25, v165, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v166, v26
+; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17
+; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18
+; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19
+; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20
+; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21
+; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24
+; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25
+; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v18
+; GFX11-NEXT: v_or_b32_e32 v18, v19, v20
+; GFX11-NEXT: v_or_b32_e32 v19, v21, v22
+; GFX11-NEXT: v_or_b32_e32 v20, v23, v24
+; GFX11-NEXT: v_or_b32_e32 v21, v25, v26
+; GFX11-NEXT: v_add_nc_u16 v22, v112, 3
+; GFX11-NEXT: v_add_nc_u16 v23, v98, 3
+; GFX11-NEXT: v_add_nc_u16 v24, v97, 3
+; GFX11-NEXT: v_add_nc_u16 v25, v96, 3
+; GFX11-NEXT: v_add_nc_u16 v26, v87, 3
+; GFX11-NEXT: v_add_nc_u16 v27, v86, 3
+; GFX11-NEXT: v_add_nc_u16 v28, v85, 3
+; GFX11-NEXT: v_add_nc_u16 v29, v84, 3
+; GFX11-NEXT: v_add_nc_u16 v30, v83, 3
+; GFX11-NEXT: v_add_nc_u16 v31, v82, 3
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_or_b32_e32 v22, v144, v22
+; GFX11-NEXT: v_or_b32_e32 v23, v145, v23
+; GFX11-NEXT: v_or_b32_e32 v24, v146, v24
+; GFX11-NEXT: v_or_b32_e32 v25, v147, v25
+; GFX11-NEXT: v_or_b32_e32 v26, v148, v26
+; GFX11-NEXT: v_or_b32_e32 v27, v119, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v128, v28
+; GFX11-NEXT: v_or_b32_e32 v29, v129, v29
+; GFX11-NEXT: v_or_b32_e32 v30, v130, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v131, v31
+; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22
+; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23
+; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24
+; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25
+; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26
+; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29
+; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v23, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v24, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v25, v28, v29
+; GFX11-NEXT: v_or_b32_e32 v26, v30, v31
+; GFX11-NEXT: v_add_nc_u16 v27, v81, 3
+; GFX11-NEXT: v_add_nc_u16 v28, v80, 3
+; GFX11-NEXT: v_add_nc_u16 v29, v71, 3
+; GFX11-NEXT: v_add_nc_u16 v30, v70, 3
+; GFX11-NEXT: v_add_nc_u16 v31, v69, 3
+; GFX11-NEXT: v_add_nc_u16 v32, v68, 3
+; GFX11-NEXT: v_add_nc_u16 v33, v67, 3
+; GFX11-NEXT: v_add_nc_u16 v34, v66, 3
+; GFX11-NEXT: v_add_nc_u16 v35, v65, 3
+; GFX11-NEXT: v_add_nc_u16 v36, v64, 3
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GFX11-NEXT: v_or_b32_e32 v27, v114, v27
+; GFX11-NEXT: v_or_b32_e32 v28, v115, v28
+; GFX11-NEXT: v_or_b32_e32 v29, v116, v29
+; GFX11-NEXT: v_or_b32_e32 v30, v117, v30
+; GFX11-NEXT: v_or_b32_e32 v31, v118, v31
+; GFX11-NEXT: v_or_b32_e32 v32, v99, v32
+; GFX11-NEXT: v_or_b32_e32 v33, v100, v33
+; GFX11-NEXT: v_or_b32_e32 v34, v101, v34
+; GFX11-NEXT: v_or_b32_e32 v35, v102, v35
+; GFX11-NEXT: v_or_b32_e32 v36, v103, v36
+; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27
+; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28
+; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29
+; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30
+; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31
+; GFX11-NEXT: v_add_nc_u16 v32, 0x300, v32
+; GFX11-NEXT: v_add_nc_u16 v33, 0x300, v33
+; GFX11-NEXT: v_add_nc_u16 v34, 0x300, v34
+; GFX11-NEXT: v_add_nc_u16 v35, 0x300, v35
+; GFX11-NEXT: v_add_nc_u16 v36, 0x300, v36
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v28
+; GFX11-NEXT: v_or_b32_e32 v28, v29, v30
+; GFX11-NEXT: v_or_b32_e32 v29, v31, v32
+; GFX11-NEXT: v_or_b32_e32 v30, v33, v34
+; GFX11-NEXT: v_or_b32_e32 v31, v35, v36
+; GFX11-NEXT: .LBB29_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:444
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:476
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:480
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:484
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:488
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:492
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:496
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:500
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:504
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:508
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:512
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:516
+; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:520
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:524
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:528
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:532
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:536
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:540
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:544
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:548
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:552
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:556
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:560
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:564
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:568
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:572
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:576
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:580
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:584
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:588
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <128 x i8> %a, splat (i8 3)
+ %a2 = bitcast <128 x i8> %a1 to <16 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <128 x i8> %a to <16 x i64>
+ br label %end
+
+end:
+ %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i64> %phi
+}
+
+define <64 x bfloat> @bitcast_v16i64_to_v64bf16(<16 x i64> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16i64_to_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; kill: killed $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB30_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v62
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v62
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: 
; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: .LBB30_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB30_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_addc_u32_e32 v32, vcc, 0, v62, vcc +; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GCN-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 
v18, 0xffff0000, v17 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v11 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v9 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v8 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v7 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v6 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v5 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v4 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v2 +; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v1 +; GCN-NEXT: .LBB30_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0 +; GCN-NEXT: v_mul_f32_e32 
v2, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 
4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 +; GCN-NEXT: 
v_add_i32_e32 v49, vcc, 0x64, v0 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v63 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v56, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v45, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 
offen +; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; 
VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB30_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB30_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB30_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: 
v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <16 x i64> @bitcast_v64bf16_to_v16i64(<64 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v64bf16_to_v16i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v33 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_mul_f32_e32 v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB31_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60 +; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 +; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GCN-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 +; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 +; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 +; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 +; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 +; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 +; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 +; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; 
implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; kill: killed $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: .LBB31_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB31_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v3, 
v4, v3, 16
+; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v47
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v46
+; GCN-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GCN-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT:    v_alignbit_b32 v4, v5, v4, 16
+; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v45
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GCN-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT:    v_alignbit_b32 v5, v6, v5, 16
+; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GCN-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT:    v_alignbit_b32 v6, v7, v6, 16
+; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GCN-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT:    v_alignbit_b32 v7, v8, v7, 16
+; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GCN-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT:    v_alignbit_b32 v8, v9, v8, 16
+; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GCN-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GCN-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT:    v_alignbit_b32 v9, v10, v9, 16
+; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GCN-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT:    v_alignbit_b32 v10, v11, v10, 16
+; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GCN-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT:    v_alignbit_b32 v11, v12, v11, 16
+; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GCN-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT:    v_alignbit_b32 v12, v13, v12, 16
+; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GCN-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT:    v_alignbit_b32 v13, v14, v13, 16
+; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GCN-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT:    v_alignbit_b32 v14, v15, v14, 16
+; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v34
+; GCN-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GCN-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GCN-NEXT:    v_alignbit_b32 v15, v16, v15, 16
+; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v33
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GCN-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT:    v_alignbit_b32 v16, v17, v16, 16
+; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GCN-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GCN-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GCN-NEXT:    v_alignbit_b32 v17, v18, v17, 16
+; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v42
+; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v44
+; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v48
+; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v38
+; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v36
+; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v50
+; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v35
+; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v51
+; GCN-NEXT:    v_and_b32_e32 v33, 0xffff0000, v37
+; GCN-NEXT:    v_and_b32_e32 v34, 0xffff0000, v53
+; GCN-NEXT:    v_and_b32_e32 v35, 0xffff0000, v39
+; GCN-NEXT:    v_and_b32_e32 v36, 0xffff0000, v55
+; GCN-NEXT:    v_and_b32_e32 v37, 0xffff0000, v49
+; GCN-NEXT:    v_and_b32_e32 v38, 0xffff0000, v40
+; GCN-NEXT:    v_and_b32_e32 v39, 0xffff0000, v52
+; GCN-NEXT:    v_and_b32_e32 v48, 0xffff0000, v63
+; GCN-NEXT:    v_and_b32_e32 v49, 0xffff0000, v54
+; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v50, 0xffff0000, v50
+; GCN-NEXT:    v_and_b32_e32 v51, 0xffff0000, v41
+; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v52, 0xffff0000, v52
+; GCN-NEXT:    v_and_b32_e32 v53, 0xffff0000, v43
+; GCN-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GCN-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GCN-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GCN-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GCN-NEXT:    v_add_f32_e32 v54, 0x40c00000, v22
+; GCN-NEXT:    v_add_f32_e32 v55, 0x40c00000, v23
+; GCN-NEXT:    v_add_f32_e32 v40, 0x40c00000, v24
+; GCN-NEXT:    v_add_f32_e32 v41, 0x40c00000, v25
+; GCN-NEXT:    v_add_f32_e32 v22, 0x40c00000, v26
+; GCN-NEXT:    v_add_f32_e32 v42, 0x40c00000, v27
+; GCN-NEXT:    v_add_f32_e32 v23, 0x40c00000, v28
+; GCN-NEXT:    v_add_f32_e32 v43, 0x40c00000, v29
+; GCN-NEXT:    v_add_f32_e32 v24, 0x40c00000, v30
+; GCN-NEXT:    v_add_f32_e32 v44, 0x40c00000, v31
+; GCN-NEXT:    v_add_f32_e32 v25, 0x40c00000, v32
+; GCN-NEXT:    v_add_f32_e32 v32, 0x40c00000, v33
+; GCN-NEXT:    v_add_f32_e32 v26, 0x40c00000, v34
+; GCN-NEXT:    v_add_f32_e32 v33, 0x40c00000, v35
+; GCN-NEXT:    v_add_f32_e32 v27, 0x40c00000, v36
+; GCN-NEXT:    v_add_f32_e32 v34, 0x40c00000, v37
+; GCN-NEXT:    v_add_f32_e32 v28, 0x40c00000, v38
+; GCN-NEXT:    v_add_f32_e32 v35, 0x40c00000, v39
+; GCN-NEXT:    v_add_f32_e32 v29, 0x40c00000, v48
+; GCN-NEXT:    v_add_f32_e32 v36, 0x40c00000, v49
+; GCN-NEXT:    v_add_f32_e32 v30, 0x40c00000, v50
+; GCN-NEXT:    v_add_f32_e32 v37, 0x40c00000, v51
+; GCN-NEXT:    v_add_f32_e32 v31, 0x40c00000, v52
+; GCN-NEXT:    v_add_f32_e32 v38, 0x40c00000, v53
+; GCN-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GCN-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GCN-NEXT:    v_lshrrev_b32_e32 v39, 16, v55
+; GCN-NEXT:    v_lshrrev_b32_e32 v48, 16, v41
+; GCN-NEXT:    v_lshrrev_b32_e32 v49, 16, v42
+; GCN-NEXT:    v_lshrrev_b32_e32 v50, 16, v43
+; GCN-NEXT:    v_lshrrev_b32_e32 v51, 16, v44
+; GCN-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
+; GCN-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GCN-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GCN-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GCN-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GCN-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GCN-NEXT:    v_alignbit_b32 v18, v19, v18, 16
+; GCN-NEXT:    v_alignbit_b32 v19, v21, v20, 16
+; GCN-NEXT:    v_alignbit_b32 v20, v39, v54, 16
+; GCN-NEXT:    v_alignbit_b32 v21, v48, v40, 16
+; GCN-NEXT:    v_alignbit_b32 v22, v49, v22, 16
+; GCN-NEXT:    v_alignbit_b32 v23, v50, v23, 16
+; GCN-NEXT:    v_alignbit_b32 v24, v51, v24, 16
+; GCN-NEXT:    v_alignbit_b32 v25, v32, v25, 16
+; GCN-NEXT:    v_alignbit_b32 v26, v33, v26, 16
+; GCN-NEXT:    v_alignbit_b32 v27, v34, v27, 16
+; GCN-NEXT:    v_alignbit_b32 v28, v35, v28, 16
+; GCN-NEXT:    v_alignbit_b32 v29, v36, v29, 16
+; GCN-NEXT:    v_alignbit_b32 v30, v37, v30, 16
+; GCN-NEXT:    v_alignbit_b32 v31, v38, v31, 16
+; GCN-NEXT:  .LBB31_4: ; %end
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v64bf16_to_v16i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v32
+; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT:    s_cbranch_execz .LBB31_2
+; VI-NEXT:  ; %bb.1: ; %cmp.true
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v15, 16, 1
+; VI-NEXT:    s_movk_i32 s6, 0x7fff
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v15
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v15
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT:    v_cndmask_b32_e32 v15, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; VI-NEXT:    v_alignbit_b32 v15, v15, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v14, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v14
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v14
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT:    v_cndmask_b32_e32 v14, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; VI-NEXT:    v_alignbit_b32 v14, v14, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v13, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v13
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v13
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT:    v_cndmask_b32_e32 v13, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; VI-NEXT:    v_alignbit_b32 v13, v13, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v12, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v12
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v12
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT:    v_cndmask_b32_e32 v12, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
+; VI-NEXT:    v_alignbit_b32 v12, v12, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v11, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v11
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v11
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT:    v_cndmask_b32_e32 v11, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; VI-NEXT:    v_alignbit_b32 v11, v11, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v10, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v10
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v10
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT:    v_cndmask_b32_e32 v10, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT:    v_alignbit_b32 v10, v10, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v9, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v9
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v9
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT:    v_cndmask_b32_e32 v9, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; VI-NEXT:    v_alignbit_b32 v9, v9, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v8, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v8
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v8
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT:    v_cndmask_b32_e32 v8, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT:    v_alignbit_b32 v8, v8, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v7
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v7, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v7
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v7
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT:    v_cndmask_b32_e32 v7, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT:    v_alignbit_b32 v7, v7, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v6
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v6, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v6
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v6
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT:    v_cndmask_b32_e32 v6, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT:    v_alignbit_b32 v6, v6, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v5
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v5, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v5
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v5
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT:    v_cndmask_b32_e32 v5, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT:    v_alignbit_b32 v5, v5, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v4, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v4
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v4
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT:    v_cndmask_b32_e32 v4, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT:    v_alignbit_b32 v4, v4, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v3
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v3, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v3
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v3
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT:    v_cndmask_b32_e32 v3, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT:    v_alignbit_b32 v3, v3, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v2
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v2, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v2
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v2
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT:    v_cndmask_b32_e32 v2, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT:    v_alignbit_b32 v2, v2, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v1
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v1, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v1
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT:    v_cndmask_b32_e32 v1, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_alignbit_b32 v1, v1, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v0, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v0
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v0
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT:    v_alignbit_b32 v0, v0, v32, 16
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v31
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v31
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT:    v_cndmask_b32_e32 v31, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v31, 16, v31
+; VI-NEXT:    v_alignbit_b32 v31, v31, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v30
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v30, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v30
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v30
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; VI-NEXT:    v_alignbit_b32 v30, v30, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v29
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v29
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v29
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
+; VI-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; VI-NEXT:    v_alignbit_b32 v29, v29, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v28
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v28, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v28
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v28
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
+; VI-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; VI-NEXT:    v_alignbit_b32 v28, v28, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v27
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v27
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
+; VI-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; VI-NEXT:    v_alignbit_b32 v27, v27, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v26
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v26
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v26
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; VI-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; VI-NEXT:    v_alignbit_b32 v26, v26, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v25
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v25
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v25
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; VI-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; VI-NEXT:    v_alignbit_b32 v25, v25, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v24
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v24, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v24
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v24
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; VI-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; VI-NEXT:    v_alignbit_b32 v24, v24, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v23
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v23
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v23
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; VI-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; VI-NEXT:    v_alignbit_b32 v23, v23, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v22, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v22
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v22
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; VI-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; VI-NEXT:    v_alignbit_b32 v22, v22, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v21
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v21, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v21
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v21
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; VI-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; VI-NEXT:    v_alignbit_b32 v21, v21, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v20
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v20
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v20
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; VI-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; VI-NEXT:    v_alignbit_b32 v20, v20, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v19
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v19, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v19
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v19
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; VI-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; VI-NEXT:    v_alignbit_b32 v19, v19, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v18
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v18, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v18
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v18
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; VI-NEXT:    v_alignbit_b32 v18, v18, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v17
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v17, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v17
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v17
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; VI-NEXT:    v_alignbit_b32 v17, v17, v32, 16
+; VI-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; VI-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; VI-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v32
+; VI-NEXT:    v_add_u32_e32 v33, vcc, s6, v33
+; VI-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; VI-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; VI-NEXT:    v_bfe_u32 v33, v16, 16, 1
+; VI-NEXT:    v_add_u32_e32 v33, vcc, v33, v16
+; VI-NEXT:    v_add_u32_e32 v33, vcc, 0x7fff, v33
+; VI-NEXT:    v_or_b32_e32 v34, 0x400000, v16
+; VI-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT:    v_cndmask_b32_e32 v16, v33, v34, vcc
+; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; VI-NEXT:    v_alignbit_b32 v16, v16, v32, 16
+; VI-NEXT:  .LBB31_2: ; %end
+; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64bf16_to_v16i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v32
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT:    s_cbranch_execz .LBB31_2
+; GFX9-NEXT:  ; %bb.1: ; %cmp.true
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v15, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v15, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v15
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT:    v_cndmask_b32_e32 v15, v33, v34, vcc
+; GFX9-NEXT:    s_mov_b32 s7, 0x7060302
+; GFX9-NEXT:    v_perm_b32 v15, v15, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v14
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v14, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v14, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v14
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v14, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v14, v14, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v13, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v13, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v13
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v13, v13, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v12, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v12, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v12
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v12, v12, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v11
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v11, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v11, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v11
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v11, v11, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v10
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v10, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v10, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v10
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v10, v10, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v9
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v9, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v9, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v9
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v9, v9, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v8
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v8, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v8, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v8
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v8, v8, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v7
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v7, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v7, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v7
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v7, v7, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v6
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v6, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v6, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v6
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v6, v6, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v5
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v5, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v5, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v5
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v5, v5, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v4
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v4, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v4, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v4
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v4, v4, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v3
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v3, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v3, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v3
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v3, v3, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v2
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v2, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v2, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v2
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v2, v2, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v1
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v1, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v1
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v1, v1, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v0
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v0, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v0, v32, s7
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v31
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v31, 0x40c00000, v31
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v31, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v31, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v31
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v31, v31
+; GFX9-NEXT:    v_cndmask_b32_e32 v31, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v31, v31, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v30
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v30, 0x40c00000, v30
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v30, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v30, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v30
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT:    v_cndmask_b32_e32 v30, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v30, v30, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v29
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v29, 0x40c00000, v29
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v29, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v29
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v29, v29
+; GFX9-NEXT:    v_cndmask_b32_e32 v29, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v29, v29, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v28
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v28, 0x40c00000, v28
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v28, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v28
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v28, v28
+; GFX9-NEXT:    v_cndmask_b32_e32 v28, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v28, v28, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v27
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v27, 0x40c00000, v27
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v27, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v27
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v27, v27
+; GFX9-NEXT:    v_cndmask_b32_e32 v27, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v27, v27, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v26
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v26, 0x40c00000, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v26, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v26
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v26, v26
+; GFX9-NEXT:    v_cndmask_b32_e32 v26, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v26, v26, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v25
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v25, 0x40c00000, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v25, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v25
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v25, v25
+; GFX9-NEXT:    v_cndmask_b32_e32 v25, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v25, v25, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v24
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v24, 0x40c00000, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v24, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v24
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v24, v24
+; GFX9-NEXT:    v_cndmask_b32_e32 v24, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v24, v24, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v23
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v23, 0x40c00000, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v23, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v23
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v23, v23
+; GFX9-NEXT:    v_cndmask_b32_e32 v23, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v23, v23, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v22
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v22, 0x40c00000, v22
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v22, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v22
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v22, v22
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v22, v22, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v21
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v21, 0x40c00000, v21
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v21, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v21
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v21, v21
+; GFX9-NEXT:    v_cndmask_b32_e32 v21, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v21, v21, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v20
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v20, 0x40c00000, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v20, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v20
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v20, v20
+; GFX9-NEXT:    v_cndmask_b32_e32 v20, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v20, v20, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v19
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v19, 0x40c00000, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v19, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v19
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v19, v19
+; GFX9-NEXT:    v_cndmask_b32_e32 v19, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v19, v19, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v18
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v18, 0x40c00000, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v18, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v18
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v18, v18
+; GFX9-NEXT:    v_cndmask_b32_e32 v18, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v18, v18, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v17
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v17, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v17
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT:    v_cndmask_b32_e32 v17, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v17, v17, v32, s7
+; GFX9-NEXT:    v_lshlrev_b32_e32 v32, 16, v16
+; GFX9-NEXT:    v_add_f32_e32 v32, 0x40c00000, v32
+; GFX9-NEXT:    v_bfe_u32 v33, v32, 16, 1
+; GFX9-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT:    v_add3_u32 v33, v33, v32, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v32
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v32, v32
+; GFX9-NEXT:    v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v32, v33, v34, vcc
+; GFX9-NEXT:    v_bfe_u32 v33, v16, 16, 1
+; GFX9-NEXT:    v_add3_u32 v33, v33, v16, s6
+; GFX9-NEXT:    v_or_b32_e32 v34, 0x400000, v16
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v16, v33, v34, vcc
+; GFX9-NEXT:    v_perm_b32 v16, v16, v32, s7
+; GFX9-NEXT:  .LBB31_2: ; %end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v64bf16_to_v16i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    s_mov_b32 s0, exec_lo
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB31_2
+; GFX11-NEXT:  ; %bb.1: ; %cmp.true
+; GFX11-NEXT:    v_lshlrev_b32_e32 v33, 16, v14
+; GFX11-NEXT:    v_lshlrev_b32_e32 v32, 16, v15
+; GFX11-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32
+; GFX11-NEXT:    v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v37, v33, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v34, v32, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v35, 0x400000, v32
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfe_u32 v39, v14, 16, 1
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v32, v32
+; GFX11-NEXT:    v_add3_u32 v37, v37, v33, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v34, v34, v32, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v34, v35, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v35, v39, v14, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT:    v_or_b32_e32 v34, 0x400000, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-NEXT:    v_bfe_u32 v36, v15, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v15
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v36, v36, v15, 0x7fff
+; GFX11-NEXT:    v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v33, v33
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_perm_b32 v15, v15, v32, 0x7060302
+; GFX11-NEXT:    v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v14
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT:    v_perm_b32 v14, v14, v33, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v33, v13, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT:    v_add3_u32 v33, v33, v13, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-NEXT:    v_cndmask_b32_e32 v13, v33, v37, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT:    v_perm_b32 v13, v13, v32, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v35, 0x40c00000, v37
+; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v32, v35, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-NEXT:    v_add3_u32 v32, v32, v35, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v11
+; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-NEXT:    v_bfe_u32 v34, v12, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v12
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v34, v34, v12, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v12, v34, v38, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v10
+; GFX11-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v12, v12, v33, 0x7060302
+; GFX11-NEXT:    v_bfe_u32 v33, v11, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT:    v_add3_u32 v33, v33, v11, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v35, v10, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v35, v35, v10, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v11, v33, v37, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v33, v38, v34, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v10
+; GFX11-NEXT:    v_perm_b32 v11, v11, v32, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT:    v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8
+; GFX11-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT:    v_perm_b32 v10, v10, v33, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v32, v34, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v34
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT:    v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-NEXT:    v_add3_u32 v32, v32, v34, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v38, v35, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v34, v8, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-NEXT:    v_or_b32_e32 v36, 0x400000, v35
+; GFX11-NEXT:    v_add3_u32 v34, v34, v8, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_bfe_u32 v33, v9, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v37, 0x400000, v9
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT:    v_add3_u32 v33, v33, v9, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v9, v33, v37, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v33, v38, v35, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT:    v_or_b32_e32 v38, 0x400000, v8
+; GFX11-NEXT:    v_perm_b32 v9, v9, v32, 0x7060302
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v33, v33, v36, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT:    v_cndmask_b32_e32 v8, v34, v38, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v34, 16, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v37, 16, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v8, v8, v33, 0x7060302
+; GFX11-NEXT:    v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37
+; GFX11-NEXT:    v_bfe_u32 v38, v34, 16, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) |
instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, 
v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v28, v28, v33, 
0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 
v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-NEXT: v_perm_b32 v20, v20, v32, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v37, v19, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v37, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_bfe_u32 v48, v18, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 
v36, v48, v18, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-NEXT: v_perm_b32 v19, v19, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GFX11-NEXT: v_add3_u32 v37, v38, v35, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-NEXT: v_bfe_u32 v39, v17, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_bfe_u32 v48, v36, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v16 +; GFX11-NEXT: v_perm_b32 v18, v18, v34, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo +; GFX11-NEXT: v_add3_u32 v38, v39, v17, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_bfe_u32 v37, v16, 16, 1 +; GFX11-NEXT: v_add3_u32 v48, v48, v36, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_add3_u32 v37, v37, v16, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v17, v17, v35, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v16, v16, v36, 0x7060302 +; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i64_to_v64f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, 
s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr31 +; 
GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB32_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v62 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v63 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v62 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v63 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 
offset:112 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v31 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v33 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v35 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v37 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v38 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v48 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v33, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: .LBB32_2: ; %Flow +; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB32_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v5 +; GCN-NEXT: v_addc_u32_e32 v32, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v7 +; GCN-NEXT: v_addc_u32_e32 v34, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v9 +; GCN-NEXT: v_addc_u32_e32 v36, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v11 +; GCN-NEXT: v_addc_u32_e32 v38, vcc, 0, v12, vcc +; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v13 +; GCN-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc +; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v15 +; GCN-NEXT: v_addc_u32_e32 v50, vcc, 0, v16, vcc +; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v17 +; GCN-NEXT: v_addc_u32_e32 v52, vcc, 0, v18, vcc +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 
+; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27
+; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29
+; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v63
+; GCN-NEXT: v_addc_u32_e32 v54, vcc, 0, v62, vcc
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v24
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54
+; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53
+; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v51
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v50
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v48
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v39
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v38
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v37
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v36
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v35
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v33
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v32
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v31
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v40, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v47
+; GCN-NEXT: v_cvt_f32_f16_e32 v33, v46
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v45
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v44
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v43
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v42
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v41
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v55
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v53, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v41, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v42, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v43, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v44, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v45, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v46, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v47, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v56, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v57, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v58, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v59, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v61, v2
+; GCN-NEXT: .LBB32_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v60
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v59
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v59, v2, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v58
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v58, v2, v1
+; GCN-NEXT: v_add_i32_e32 v62, vcc, 8, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v57
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v57, v2, v1
+; GCN-NEXT: v_add_i32_e32 v61, vcc, 12, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v56
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v56, v2, v1
+; GCN-NEXT: v_add_i32_e32 v60, vcc, 16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v47
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v47, vcc, 20, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v46
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v45
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v44
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v6, v7, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v43
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v8, v9, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v42
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_or_b32_e32 v10, v11, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v41
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_or_b32_e32 v12, v13, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v55
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v14, v15, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v54
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v16, v17, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v53
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_or_b32_e32 v18, v19, v18
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v52
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v20, v21, v20
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v51
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_or_b32_e32 v22, v23, v22
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v50
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_or_b32_e32 v24, v25, v24
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_or_b32_e32 v26, v27, v26
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v48
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_or_b32_e32 v28, v29, v28
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_or_b32_e32 v30, v34, v30
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x50, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v37
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GCN-NEXT: v_or_b32_e32 v36, v37, v36
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GCN-NEXT: v_or_b32_e32 v35, v38, v35
+; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x58, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GCN-NEXT: v_or_b32_e32 v33, v39, v33
+; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x5c, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_or_b32_e32 v32, v48, v32
+; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_or_b32_e32 v31, v49, v31
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GCN-NEXT: v_or_b32_e32 v50, v51, v50
+; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GCN-NEXT: v_or_b32_e32 v52, v53, v52
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_or_b32_e32 v54, v55, v54
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_or_b32_e32 v40, v41, v40
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_or_b32_e32 v42, v43, v42
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v59, v62, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v58, v61, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v57, v60, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v56, v47, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v36, v38, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v35, v39, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v33, v48, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v32, v49, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v31, v51, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16i64_to_v64f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB32_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc
+; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28
+; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc
+; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26
+; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc
+; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24
+; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc
+; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22
+; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc
+; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20
+; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18
+; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16
+; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc
+; VI-NEXT: .LBB32_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16i64_to_v64f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB32_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
+; GFX9-NEXT:
v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB32_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <16 x i64> @bitcast_v64f16_to_v16i64(<64 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v64f16_to_v16i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; 
GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB33_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; GCN-NEXT: v_or_b32_e32 v0, v62, v0 +; GCN-NEXT: v_or_b32_e32 v1, v60, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; GCN-NEXT: v_or_b32_e32 v2, v58, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; GCN-NEXT: v_or_b32_e32 v3, v56, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; GCN-NEXT: v_or_b32_e32 v4, v46, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; GCN-NEXT: 
v_lshlrev_b32_e32 v29, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: v_or_b32_e32 v19, v43, v19 +; GCN-NEXT: v_or_b32_e32 v20, v41, v20 +; GCN-NEXT: v_or_b32_e32 v21, v55, v21 +; GCN-NEXT: v_or_b32_e32 v22, v49, v22 +; GCN-NEXT: v_or_b32_e32 v23, v50, v23 +; GCN-NEXT: v_or_b32_e32 v24, v39, v24 +; GCN-NEXT: v_or_b32_e32 v25, v36, v25 +; GCN-NEXT: v_or_b32_e32 v26, v48, v26 +; GCN-NEXT: v_or_b32_e32 v27, v52, v27 +; GCN-NEXT: v_or_b32_e32 v28, v53, v28 +; GCN-NEXT: v_or_b32_e32 v29, v54, v29 +; GCN-NEXT: v_or_b32_e32 v30, v40, v30 +; GCN-NEXT: v_or_b32_e32 v31, v42, v31 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr49 +; 
GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: .LBB33_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB33_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; 
GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) 
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 +; GCN-NEXT: v_mov_b32_e32 v39, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v30, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_or_b32_e32 v19, v21, v20 +; GCN-NEXT: v_or_b32_e32 v20, v55, v39 +; GCN-NEXT: v_or_b32_e32 v21, v41, v48 +; GCN-NEXT: v_or_b32_e32 v22, v22, v49 +; GCN-NEXT: v_or_b32_e32 v23, v23, v50 +; GCN-NEXT: v_or_b32_e32 v24, v24, v51 +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: v_or_b32_e32 v26, v26, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v38 +; GCN-NEXT: v_or_b32_e32 v28, v28, v33 +; GCN-NEXT: v_or_b32_e32 v29, v29, v34 +; GCN-NEXT: v_or_b32_e32 v30, v30, v35 +; GCN-NEXT: v_or_b32_e32 v31, v31, v37 +; GCN-NEXT: .LBB33_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v32, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: 
v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: 
v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB33_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x i64> + br label %end + +end: + %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i64> %phi +} + +define <64 x i16> @bitcast_v16i64_to_v64i16(<16 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i64_to_v64i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: 
$vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; kill: killed $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; kill: killed $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; GCN-NEXT: v_alignbit_b32 v49, v16, v15, 16 +; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v54, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v40, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v45, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB34_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: 
v_addc_u32_e32 v26, vcc, 0, v26, vcc +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; GCN-NEXT: v_alignbit_b32 v33, v31, v32, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v35, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v36, v26, v25, 16 +; GCN-NEXT: v_alignbit_b32 v37, v24, v23, 16 +; GCN-NEXT: v_alignbit_b32 v38, v22, v21, 16 +; GCN-NEXT: v_alignbit_b32 v39, v20, v19, 16 +; GCN-NEXT: v_alignbit_b32 v48, v18, v17, 16 +; GCN-NEXT: v_alignbit_b32 v49, v16, v15, 16 +; GCN-NEXT: v_alignbit_b32 v51, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v54, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v40, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v43, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v45, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v57, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v59, v2, v1, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB34_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v59 +; GCN-NEXT: v_or_b32_e32 v1, v1, v44 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v33 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; GCN-NEXT: v_or_b32_e32 v59, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; GCN-NEXT: v_or_b32_e32 v57, v1, v2 +; GCN-NEXT: v_add_i32_e32 v44, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; GCN-NEXT: v_or_b32_e32 v63, v1, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; GCN-NEXT: v_or_b32_e32 v45, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v43 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; 
GCN-NEXT: v_or_b32_e32 v61, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v60 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v54 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v58 +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v51 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v56 +; GCN-NEXT: v_or_b32_e32 v14, v14, v33 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_or_b32_e32 v15, v15, v49 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v47 +; GCN-NEXT: v_or_b32_e32 v16, v16, v51 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v17, v17, v48 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v46 +; GCN-NEXT: v_or_b32_e32 v18, v18, v54 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_or_b32_e32 v19, v19, v39 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v20, v20, v34 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v38 +; GCN-NEXT: v_or_b32_e32 v21, v21, v34 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v42 +; GCN-NEXT: v_or_b32_e32 v22, v22, v34 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37 +; GCN-NEXT: v_or_b32_e32 v23, v23, v34 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v41 +; GCN-NEXT: v_or_b32_e32 v24, v24, v34 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v36 +; GCN-NEXT: v_or_b32_e32 v25, v25, v34 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v55 +; GCN-NEXT: v_or_b32_e32 v26, v26, v34 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v35 +; GCN-NEXT: v_or_b32_e32 v27, v27, v34 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; GCN-NEXT: v_or_b32_e32 v28, v28, v34 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v29, 
0xffff, v29 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v29, v29, v34 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_or_b32_e32 v30, v30, v52 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x70, v0 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_or_b32_e32 v32, v32, v43 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v31, v31, v50 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v59, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v45, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v29, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i64_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v30, vcc, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_addc_u32_e32 v31, vcc, 0, v31, vcc +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: .LBB34_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i64_to_v64i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: 
v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v30, vcc, 3, v30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v31, vcc, 0, v31, vcc +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: .LBB34_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i64_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v30, vcc_lo, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_co_ci_u32_e64 v31, null, 0, v31, vcc_lo +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: .LBB34_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i64> %a, splat (i64 3) + %a2 = bitcast <16 x i64> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i64> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <16 x i64> @bitcast_v64i16_to_v16i64(<64 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i16_to_v16i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 
v37, v20 +; GCN-NEXT: v_mov_b32_e32 v38, v18 +; GCN-NEXT: v_mov_b32_e32 v39, v16 +; GCN-NEXT: v_mov_b32_e32 v48, v14 +; GCN-NEXT: v_mov_b32_e32 v49, v12 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v51, v8 +; GCN-NEXT: v_mov_b32_e32 v52, v6 +; GCN-NEXT: v_mov_b32_e32 v53, v4 +; GCN-NEXT: v_mov_b32_e32 v54, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: 
v_lshlrev_b32_e32 v59, 16, v24 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GCN-NEXT: v_or_b32_e32 v0, v0, v36 +; GCN-NEXT: v_or_b32_e32 v1, v1, v58 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; GCN-NEXT: v_or_b32_e32 v2, v2, v57 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; GCN-NEXT: v_or_b32_e32 v3, v3, v35 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; GCN-NEXT: v_or_b32_e32 v4, v4, v60 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: buffer_load_dword 
v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v18, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v19, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v21, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v23, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v26, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v28, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v29, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v30, v32 +; GCN-NEXT: v_or_b32_e32 v31, v31, v59 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: 
$vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: .LBB35_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 
v0, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v36, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v58, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v57, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v35, v3 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 
v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v4, v60, v4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v32, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v32, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v32, v15 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v32, v16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v32, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v32, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v32, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v32, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v32, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v32, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v32, v30 +; GCN-NEXT: v_or_b32_e32 v31, v59, v31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; GCN-NEXT: .LBB35_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v16i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v33, 3 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_add_u16_e32 v32, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v32, v14 +; VI-NEXT: v_add_u16_e32 v32, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v32, v13 +; VI-NEXT: v_add_u16_e32 v32, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v32, v12 +; VI-NEXT: v_add_u16_e32 v32, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v32, v11 +; VI-NEXT: v_add_u16_e32 v32, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v32, v10 +; VI-NEXT: v_add_u16_e32 v32, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v32, v9 +; VI-NEXT: v_add_u16_e32 v32, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v32, v8 +; VI-NEXT: v_add_u16_e32 v32, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v32, v7 +; VI-NEXT: v_add_u16_e32 v32, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v32, v6 +; VI-NEXT: v_add_u16_e32 v32, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v32, v5 +; VI-NEXT: v_add_u16_e32 v32, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v32, v4 +; VI-NEXT: v_add_u16_e32 v32, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v32, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_add_u16_e32 v32, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v32, v1 +; VI-NEXT: v_add_u16_e32 v32, 
3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v32, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v31 +; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: v_add_u16_e32 v32, 3, v30 +; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: v_add_u16_e32 v32, 3, v29 +; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_add_u16_sdwa v24, v24, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v16i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; 
GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i16_to_v16i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 
3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: .LBB35_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <64 x i16> %a, splat (i16 3)
+  %a2 = bitcast <64 x i16> %a1 to <16 x i64>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <64 x i16> %a to <16 x i64>
+  br label %end
+
+end:
+  %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <16 x i64> %phi
+}
+
+define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f64_to_v128i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr33
+;
GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; 
GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded 
Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, 
v13, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v51, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v53, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v61, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v62, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v30 +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, 
v10 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; GCN-NEXT: .LBB36_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v33, 
off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v51, v8, v7, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v6, v5, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v52, v6, v5, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v4, v3, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v53, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v61, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v62, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v30 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte 
Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v40, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v41, 8, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v42, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v45, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v46, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v57, 8, v10 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded 
Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v2 +; GCN-NEXT: .LBB36_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v63 +; GCN-NEXT: v_or_b32_e32 v1, v1, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GCN-NEXT: v_or_b32_e32 v2, v2, v35 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v61 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_or_b32_e32 v35, v36, v35 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v33, v33, v34 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v35 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v33 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v54, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v63, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v62, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v61, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v27 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v29 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v30 +; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v53 +; GCN-NEXT: v_or_b32_e32 v48, v48, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v60 +; GCN-NEXT: v_or_b32_e32 v54, 
v54, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v52 +; GCN-NEXT: v_or_b32_e32 v60, v50, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v59 +; GCN-NEXT: v_or_b32_e32 v59, v49, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v51 +; GCN-NEXT: v_or_b32_e32 v32, v39, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v58 +; GCN-NEXT: v_or_b32_e32 v30, v38, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GCN-NEXT: v_or_b32_e32 v28, v37, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v57 +; GCN-NEXT: v_or_b32_e32 v26, v36, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GCN-NEXT: v_or_b32_e32 v24, v63, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v56 +; GCN-NEXT: v_or_b32_e32 v22, v62, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GCN-NEXT: v_or_b32_e32 v20, v61, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v47 +; GCN-NEXT: v_or_b32_e32 v16, v35, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GCN-NEXT: v_or_b32_e32 v15, v15, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v46 +; GCN-NEXT: v_or_b32_e32 v17, v34, v17 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GCN-NEXT: v_or_b32_e32 v19, v33, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v45 +; GCN-NEXT: v_or_b32_e32 v21, v18, v21 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GCN-NEXT: v_or_b32_e32 v23, v14, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v44 +; GCN-NEXT: v_or_b32_e32 v25, v13, v14 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GCN-NEXT: v_or_b32_e32 v27, v12, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v43 +; GCN-NEXT: v_or_b32_e32 v29, v11, v12 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GCN-NEXT: v_or_b32_e32 v31, v10, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v42 +; GCN-NEXT: v_or_b32_e32 v33, v9, v10 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GCN-NEXT: v_or_b32_e32 v35, v8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v41 +; GCN-NEXT: v_or_b32_e32 v52, v7, v8 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GCN-NEXT: v_or_b32_e32 v50, v6, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v40 +; GCN-NEXT: v_or_b32_e32 v49, v5, v6 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GCN-NEXT: v_or_b32_e32 v51, v4, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v55 +; GCN-NEXT: v_or_b32_e32 v53, v3, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v55, v2, v3 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v41, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v10, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v12, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v5, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v6, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v7, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v8, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v9, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v11, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v13, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v18, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v34, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v36, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v37, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v38, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v39, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v40, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v42, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v43, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v44, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v45, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v46, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v47, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v56, v4, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v57, v4, v3 +; GCN-NEXT: 
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v58, v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v61, v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v62, v4, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v63, v4, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v48
+; GCN-NEXT: v_or_b32_e32 v3, v3, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54
+; GCN-NEXT: v_or_b32_e32 v54, v1, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v60
+; GCN-NEXT: v_or_b32_e32 v4, v2, v10
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v59
+; GCN-NEXT: v_or_b32_e32 v59, v10, v12
+; GCN-NEXT: v_add_i32_e32 v48, vcc, 16, v0
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v32
+; GCN-NEXT: v_or_b32_e32 v5, v10, v5
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v60, vcc, 20, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v30
+; GCN-NEXT: v_or_b32_e32 v5, v5, v6
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v28
+; GCN-NEXT: v_or_b32_e32 v6, v6, v7
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v26
+; GCN-NEXT: v_or_b32_e32 v7, v7, v8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v24
+; GCN-NEXT: v_or_b32_e32 v7, v7, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v22
+; GCN-NEXT: v_or_b32_e32 v9, v9, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20
+; GCN-NEXT: v_or_b32_e32 v11, v11, v13
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GCN-NEXT: v_or_b32_e32 v13, v13, v18
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_or_b32_e32 v15, v15, v34
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_or_b32_e32 v17, v17, v36
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_or_b32_e32 v19, v19, v37
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GCN-NEXT: v_or_b32_e32 v21, v21, v38
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GCN-NEXT: v_or_b32_e32 v23, v23, v39
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_or_b32_e32 v25, v25, v40
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GCN-NEXT: v_or_b32_e32 v27, v27, v42
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_or_b32_e32 v29, v29, v43
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GCN-NEXT: v_or_b32_e32 v31, v31, v44
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GCN-NEXT: v_or_b32_e32 v33, v33, v45
+; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35
+; GCN-NEXT: v_or_b32_e32 v35, v35, v46
+; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v52
+; GCN-NEXT: v_or_b32_e32 v37, v37, v47
+; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x60, v0
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v50
+; GCN-NEXT: v_or_b32_e32 v39, v39, v56
+; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49
+; GCN-NEXT: v_or_b32_e32 v49, v49, v57
+; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51
+; GCN-NEXT: v_or_b32_e32 v51, v51, v58
+; GCN-NEXT: v_add_i32_e32 v45, vcc, 0x6c, v0
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53
+; GCN-NEXT: v_or_b32_e32 v53, v53, v61
+; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v55
+; GCN-NEXT: v_or_b32_e32 v55, v55, v62
+; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41
+; GCN-NEXT: v_or_b32_e32 v41, v41, v63
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v54, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v59, v60, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v22, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v21, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v25, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v29, v34, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v31, v36, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v33, v38, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v35, v44, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v37, v50, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v39, v52, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v49, v45, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v51, v40, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v53, v42, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v55, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16f64_to_v128i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; kill: killed $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB36_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[31:32]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[23:24]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[21:22]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[17:18]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[7:8]
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28
+; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32
+; VI-NEXT: .LBB36_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB36_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0
+; VI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0
+; VI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0
+; VI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0
+; VI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0
+; VI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0
+; VI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0
+; VI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
+; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
+; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
+; VI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; VI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
+; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
+; VI-NEXT: v_lshrrev_b64 v[39:40], 24, v[5:6]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b32_e32 v56, 24, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v17
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v57, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v49, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v60, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v61, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v52, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v1
+; VI-NEXT: .LBB36_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v54
+; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v55
+; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v41
+; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v54, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46
+; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v53
+; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v52
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38
+; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v51
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59
+; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v42
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16f64_to_v128i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte
Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; 
GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; kill: killed $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; kill: killed $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; kill: killed $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; kill: killed $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; kill: killed $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; kill: killed $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(32) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(33) +; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte 
Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[7:8] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: .LBB36_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; GFX9-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX9-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX9-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX9-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX9-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX9-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX9-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; GFX9-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20] +; GFX9-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18] +; GFX9-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded 
Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v20 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v19 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 
v33, 8, v18 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v17 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v1 +; GFX9-NEXT: .LBB36_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 +; 
GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v53 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v46 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v34, v54, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 
8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa 
v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f64_to_v128i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr179 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; 
implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v71, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-NEXT: v_lshrrev_b64 
v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-NEXT: .LBB36_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; GFX11-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GFX11-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GFX11-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[19:20] +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 
24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v1 +; GFX11-NEXT: .LBB36_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v75 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v63 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v65 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v74 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v61 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-NEXT: v_or_b32_e32 v65, v67, v65 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v58 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v66 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v62 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v66 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v60 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v59 +; GFX11-NEXT: 
v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v65 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-NEXT: v_or_b32_e32 v55, v66, v67 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v56 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v65 +; GFX11-NEXT: v_or_b32_e32 v55, v66, v64 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v47 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v46 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v45 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v64 +; GFX11-NEXT: v_or_b32_e32 v55, v65, v66 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v44 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v43 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-NEXT: v_or_b32_e32 v54, v65, v54 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v42 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v41 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v40 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v55 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_or_b32_e32 v54, v64, v65 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v182 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v181 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v55 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v65 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v53, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v178 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v167 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v166 +; GFX11-NEXT: v_or_b32_e32 v53, v53, v54 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v55 +; GFX11-NEXT: v_or_b32_e32 v52, v64, v52 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v65 +; GFX11-NEXT: v_or_b32_e32 v54, v66, v67 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v10, v53 +; GFX11-NEXT: v_or_b32_e32 v3, v11, v52 +; GFX11-NEXT: v_or_b32_e32 v4, v12, v54 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v6, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v164 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-NEXT: 
v_lshlrev_b16 v10, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v160 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v150 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v147 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v39, v49 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v145 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v144 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v132 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v130 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v119 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v117 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, 
v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v116 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v103 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v102 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v101 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v100 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v99 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v83 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v82 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v81 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b16 v28, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v68 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: 
v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v128i8_to_v16f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:388 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 +; 
GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v21 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v25 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 +; GCN-NEXT: buffer_store_dword 
v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v7 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded 
Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v58, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:228 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v61, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:340 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:372 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v3 +; 
GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB37_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v39 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v56 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v47 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v54 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) 
+; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v46 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v37 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v43 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v48 +; GCN-NEXT: v_or_b32_e32 v8, v8, v45 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v9, v53 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v10, v42 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 
v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v11, v41 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v12, v40 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v13, v63 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v50 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: 
v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_or_b32_e32 v18, v18, v51 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_or_b32_e32 v19, v19, v49 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_or_b32_e32 v20, v20, v60 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_or_b32_e32 v21, v21, v58 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v22, v22, v62 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_or_b32_e32 v23, v23, v59 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_or_b32_e32 v25, v25, v61 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_or_b32_e32 v26, v26, v34 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v27, v27, v52 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_or_b32_e32 v28, v28, v35 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GCN-NEXT: v_or_b32_e32 v29, v29, v33 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v30, v30, v44 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v57 +; GCN-NEXT: v_or_b32_e32 v31, v31, v36 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v50, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v51, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v33, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v33, v34, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: buffer_load_dword v36, off, 
s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v36, v35 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v37, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v37, v38, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v39, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v48, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v48, v55, v48 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v49, v54, v49 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v18, v18, v50 +; GCN-NEXT: v_or_b32_e32 v19, v19, v51 +; GCN-NEXT: v_or_b32_e32 v20, v20, v52 +; GCN-NEXT: v_or_b32_e32 v21, v21, v53 +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: v_or_b32_e32 v23, v23, v33 +; GCN-NEXT: v_or_b32_e32 v24, v24, v34 +; GCN-NEXT: v_or_b32_e32 v25, v25, v35 +; GCN-NEXT: v_or_b32_e32 v26, v26, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v37 +; GCN-NEXT: v_or_b32_e32 v28, v28, v38 +; GCN-NEXT: v_or_b32_e32 v29, v29, v39 +; GCN-NEXT: v_or_b32_e32 v30, v30, v48 +; GCN-NEXT: v_or_b32_e32 v31, v31, v49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: 
; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: 
killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; 
implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; kill: killed $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: .LBB37_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB37_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v39, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v56, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v38, v2 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v47, v3 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v54, v4 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v46, v5 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v37, v6 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v43, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_or_b32_e32 v8, v45, v8 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_or_b32_e32 v9, v53, v9 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload 
+; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v42, v10 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_or_b32_e32 v11, v41, v11 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v40, v12 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v63, v13 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v50, v14 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v0, v15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v0, v16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v0, v17 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_or_b32_e32 v18, v51, v18 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_or_b32_e32 v19, v49, v19 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_or_b32_e32 v20, v60, v20 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_or_b32_e32 v21, v58, v21 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v25, v62, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v29, v59, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v37, v32, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v50, v61, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v41, v34, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v45, v52, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v56, v35, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v58, v33, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v59, v44, v22 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v57 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v57, v36, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v60, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v61, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v62, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 
v63, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v49, v0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v51, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v54, v23, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: 
buffer_load_dword v24, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v26, v24 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v28, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v30, v28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v31, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v32, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v33, v34, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_add_i32_e32 v34, vcc, 3, v34 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v48, v35 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48 +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v48, v53, v48 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_mov_b32_e32 v0, v55 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v55, v53 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v55, v40, v55 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40 +; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v40, v42, v40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v42, v43, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v43, v44, v43 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_or_b32_e32 v44, v46, v44 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; GCN-NEXT: v_or_b32_e32 v46, v0, v46 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v47, v0, v47 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v60, v0 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v61, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v62, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v63, v3 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v21 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; GCN-NEXT: v_add_i32_e32 v37, vcc, s7, v37 +; GCN-NEXT: v_add_i32_e32 v50, vcc, s7, v50 +; GCN-NEXT: v_add_i32_e32 v41, vcc, s7, v41 +; GCN-NEXT: v_add_i32_e32 v45, vcc, s7, v45 +; GCN-NEXT: v_add_i32_e32 v56, vcc, s7, v56 +; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v58 +; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v59 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 0x300, v57 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 
v37, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v56, 0xffff, v56 +; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57 +; GCN-NEXT: v_or_b32_e32 v4, v36, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v39, v6 +; GCN-NEXT: v_or_b32_e32 v7, v49, v7 +; GCN-NEXT: v_or_b32_e32 v8, v51, v8 +; GCN-NEXT: v_or_b32_e32 v9, v52, v9 +; GCN-NEXT: v_or_b32_e32 v10, v54, v10 +; GCN-NEXT: v_or_b32_e32 v11, v22, v11 +; GCN-NEXT: v_or_b32_e32 v12, v23, v12 +; GCN-NEXT: v_or_b32_e32 v13, v24, v13 +; GCN-NEXT: v_or_b32_e32 v14, v26, v14 +; GCN-NEXT: v_or_b32_e32 v15, v27, v15 +; GCN-NEXT: v_or_b32_e32 v16, v28, v16 +; GCN-NEXT: v_or_b32_e32 v17, v30, v17 +; GCN-NEXT: v_or_b32_e32 v18, v31, v18 +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: v_or_b32_e32 v20, v33, v20 +; GCN-NEXT: v_or_b32_e32 v21, v34, v21 +; GCN-NEXT: v_or_b32_e32 v22, v35, v25 +; GCN-NEXT: v_or_b32_e32 v23, v48, v29 +; GCN-NEXT: v_or_b32_e32 v24, v53, v37 +; GCN-NEXT: v_or_b32_e32 v25, v55, v50 +; GCN-NEXT: v_or_b32_e32 v26, v40, v41 +; GCN-NEXT: v_or_b32_e32 v27, v42, v45 +; GCN-NEXT: v_or_b32_e32 v28, v43, v56 +; GCN-NEXT: v_or_b32_e32 v29, v44, v58 +; GCN-NEXT: v_or_b32_e32 v30, v46, v59 +; GCN-NEXT: v_or_b32_e32 v31, v47, v57 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x3000000, v31 +; GCN-NEXT: .LBB37_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: 
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
+; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3
+; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5
+; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v7
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v9
+; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v13
+; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v15
+; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v17
+; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v19
+; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v23
+; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v27
+; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v29
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v26
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v54
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v40
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v47
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368
+; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB37_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(10)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ;
implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; 
implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: .LBB37_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB37_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 
4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v31, 0x300
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_add_u16_e32 v9, 3, v61
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v0, 0x300, v0
+; VI-NEXT: v_add_u16_sdwa v1, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v2, 0x300, v2
+; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_or_b32_e32 v1, v2, v3
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v4, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v5, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v6, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v7, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_add_u16_e32 v8, 3, v8
+; VI-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v8, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v9, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v2, 0x300, v2
+; VI-NEXT: v_add_u16_sdwa v3, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v4
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v4, 0x300, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v5
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v5, 0x300, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v6
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v6, 0x300, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v7
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v8
+; VI-NEXT: v_add_u16_e32 v8, 3, v63
+; VI-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v8, 0x300, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v9
+; VI-NEXT: v_add_u16_e32 v9, 3, v59
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v10, 3, v62
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v9, 0x300, v9
+; VI-NEXT: v_add_u16_sdwa v10, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v9, v10
+; VI-NEXT: v_add_u16_e32 v10, 3, v58
+; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v11, 3, v60
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v10, 0x300, v10
+; VI-NEXT: v_add_u16_sdwa v11, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v10, v10, v11
+; VI-NEXT: v_add_u16_e32 v11, 3, v57
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v12, 3, v56
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v11, 0x300, v11
+; VI-NEXT: v_add_u16_sdwa v12, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v11, v12
+; VI-NEXT: v_add_u16_e32 v12, 3, v46
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v13, 3, v47
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v12, 0x300, v12
+; VI-NEXT: v_add_u16_sdwa v13, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v12, v13
+; VI-NEXT: v_add_u16_e32 v13, 3, v45
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v14, 3, v44
+; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v13, 0x300, v13
+; VI-NEXT: v_add_u16_sdwa v14, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v13, v13, v14
+; VI-NEXT: v_add_u16_e32 v14, 3, v54
+; VI-NEXT: v_add_u16_e32 v15, 3, v40
+; VI-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v14, 0x300, v14
+; VI-NEXT: v_add_u16_sdwa v15, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v14, v15
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v15, 3, v15
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v15, 0x300, v15
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v16, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v15, v15, v16
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v16, 0x300, v16
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v16, v16, v17
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v17
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v18, 3, v18
+; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v18, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v17, v17, v18
+; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v18, 3, v18
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
Reload +; VI-NEXT: v_add_u16_e32 v18, 0x300, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v18, v19 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v19, 0x300, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v19, v20 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v20, 0x300, v20 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v21, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v20, v21 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v21, 0x300, v21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v22, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v21, v22 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v23, 3, v23 +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v22, v23 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v23, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v24, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v23, v24 +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v25, 3, v25 +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v24, v25 +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v25, 3, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v26, 3, v26 +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v25, v26 +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v26, 3, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v27, 3, v27 +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v26, v27 +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v27, 3, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v28, 3, v28 +; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v28, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v27, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v28, 3, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v29, 3, v29 +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v29, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v28, v29 +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v29, 3, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v30, 3, v30 +; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v29, v30 +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v30, 3, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v32, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v32 +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v32, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v32, 3, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v32, v33, v32 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v32, 0x300, v32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v33, 3, v33 +; VI-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v31, v33, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: .LBB37_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:184 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v19 +; 
GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 
offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; 
GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB37_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v40, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v63, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: 
$vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v54, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) 
+; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword 
v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; 
GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: 
$vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: .LBB37_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB37_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GFX9-NEXT: s_movk_i32 s6, 0x300
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(33)
+; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(30)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_waitcnt vmcnt(28)
+; GFX9-NEXT: v_add_u16_e32 v1, 3, v1
+; GFX9-NEXT: s_waitcnt vmcnt(27)
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v2
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2
+; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v50, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v48, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v38, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v36, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v34, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v49, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v39, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: 
v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v37, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v35, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v8, v33, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v54 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v14, v32, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v17, v17, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v18, v18, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v17, v17, v18 
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v19, v19, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v18, v18, v19 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v20, v20, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v19, v19, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v21, v21, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v22, v22, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v21, v21, v22 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, 
s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v23, v23, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v24, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v23 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v24, v24, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v23, v23, v24 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v25, v25, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v24, v24, v25 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v26, v26, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: 
s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v26
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v27, v27, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v26, v26, v27
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v27, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v28, 3, v28
+; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v28, v28, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v27, v27, v28
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v28, 3, v28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v28, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v28
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v29, 3, v29
+; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v29, v29, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v28, v28, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v29, 3, v29
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v29, v30, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v29
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v30, 3, v30
+; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v30, v30, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v29, v29, v30
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v30, 3, v30
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v30, v31, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v30
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v31, 3, v31
+; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v31, v31, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v30, v30, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v31, 3, v31
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v32, 3, v32
+; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_sdwa v32, v32, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX9-NEXT: .LBB37_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v128i8_to_v16f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:592
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:588
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:584
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:580
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:576
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:572
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:568
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:564
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:560
+; GFX11-NEXT:
scratch_store_b32 off, v57, s32 offset:556 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:552 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:548 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:544 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:540 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:536 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:532 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:528 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:524 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:520 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:516 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:512 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:508 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:504 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:500 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:496 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:492 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:488 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:484 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:480 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:476 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:472 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:468 +; GFX11-NEXT: s_clause 0x12 +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:464 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:460 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:456 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:452 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:448 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:444 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:440 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:436 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:432 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:428 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:424 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:420 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:416 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:412 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:408 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:404 +; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:400 +; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:396 +; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:392 +; GFX11-NEXT: v_dual_mov_b32 v32, v30 :: v_dual_mov_b32 v33, v28 +; GFX11-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v24 +; GFX11-NEXT: v_dual_mov_b32 v36, v22 :: v_dual_mov_b32 v37, v20 +; GFX11-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v39, v16 +; GFX11-NEXT: v_dual_mov_b32 v48, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v8 +; GFX11-NEXT: v_dual_mov_b32 v52, v6 :: v_dual_mov_b32 v53, v4 +; GFX11-NEXT: v_dual_mov_b32 v54, v2 :: v_dual_mov_b32 v55, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:380 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:372 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:364 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:356 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-NEXT: 
scratch_load_u16 v68, off, s32 offset:348 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:340 +; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:332 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:324 +; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:316 +; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:308 +; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:300 +; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:292 +; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:284 +; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:276 +; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:268 +; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:260 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:252 +; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:248 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:244 +; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:240 +; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:236 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:232 +; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:228 +; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:224 +; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:220 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-NEXT: scratch_load_u16 v115, off, s32 +; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:88 +; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:136 +; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:144 +; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:152 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:160 +; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:168 +; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:176 +; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:184 +; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:192 +; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:200 +; GFX11-NEXT: 
scratch_load_u16 v138, off, s32 offset:208 +; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:212 +; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:204 +; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:196 +; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:188 +; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:180 +; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:172 +; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:164 +; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:156 +; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:148 +; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:140 +; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v41, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v42, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v124, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v125, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v126, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v127, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v111, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v120, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v121, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v122, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v123, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v106, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v107, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v108, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v109, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v110, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v93, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(54) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 +; GFX11-NEXT: s_waitcnt vmcnt(53) +; GFX11-NEXT: v_lshlrev_b16 v94, 8, v115 +; GFX11-NEXT: s_waitcnt vmcnt(52) +; GFX11-NEXT: v_lshlrev_b16 v95, 8, v116 +; GFX11-NEXT: s_waitcnt vmcnt(51) +; GFX11-NEXT: v_lshlrev_b16 v104, 8, v117 +; GFX11-NEXT: s_waitcnt vmcnt(50) +; GFX11-NEXT: v_lshlrev_b16 v105, 8, v118 +; GFX11-NEXT: s_waitcnt vmcnt(49) +; GFX11-NEXT: v_lshlrev_b16 v79, 8, v119 +; GFX11-NEXT: s_waitcnt vmcnt(48) +; GFX11-NEXT: v_lshlrev_b16 v88, 8, v128 +; GFX11-NEXT: s_waitcnt vmcnt(47) +; GFX11-NEXT: v_lshlrev_b16 v89, 8, v129 +; GFX11-NEXT: s_waitcnt vmcnt(46) +; GFX11-NEXT: v_lshlrev_b16 v90, 8, v130 +; GFX11-NEXT: s_waitcnt vmcnt(45) +; GFX11-NEXT: v_lshlrev_b16 v91, 8, v131 +; GFX11-NEXT: s_waitcnt vmcnt(44) +; GFX11-NEXT: v_lshlrev_b16 v61, 8, v144 +; GFX11-NEXT: s_waitcnt vmcnt(43) +; GFX11-NEXT: v_lshlrev_b16 v62, 8, v145 +; GFX11-NEXT: s_waitcnt vmcnt(42) +; GFX11-NEXT: v_lshlrev_b16 v63, 8, v146 +; GFX11-NEXT: s_waitcnt vmcnt(41) +; GFX11-NEXT: v_lshlrev_b16 v72, 8, v147 +; GFX11-NEXT: s_waitcnt vmcnt(40) +; GFX11-NEXT: v_lshlrev_b16 v73, 8, v148 +; GFX11-NEXT: s_waitcnt vmcnt(39) +; GFX11-NEXT: v_lshlrev_b16 v45, 8, v162 +; GFX11-NEXT: s_waitcnt vmcnt(38) +; GFX11-NEXT: v_lshlrev_b16 v46, 8, v163 +; 
GFX11-NEXT: s_waitcnt vmcnt(37) +; GFX11-NEXT: v_lshlrev_b16 v47, 8, v164 +; GFX11-NEXT: s_waitcnt vmcnt(36) +; GFX11-NEXT: v_lshlrev_b16 v56, 8, v165 +; GFX11-NEXT: s_waitcnt vmcnt(35) +; GFX11-NEXT: v_lshlrev_b16 v57, 8, v166 +; GFX11-NEXT: s_waitcnt vmcnt(34) +; GFX11-NEXT: v_lshlrev_b16 v179, 8, v179 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v180, 8, v180 +; GFX11-NEXT: s_waitcnt vmcnt(32) +; GFX11-NEXT: v_lshlrev_b16 v181, 8, v181 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v182, 8, v182 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_lshlrev_b16 v183, 8, v183 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v162, 8, v136 +; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: v_lshlrev_b16 v163, 8, v137 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v164, 8, v138 +; GFX11-NEXT: v_lshlrev_b16 v165, 8, v103 +; GFX11-NEXT: v_lshlrev_b16 v166, 8, v102 +; GFX11-NEXT: v_lshlrev_b16 v144, 8, v101 +; GFX11-NEXT: v_lshlrev_b16 v145, 8, v100 +; GFX11-NEXT: v_lshlrev_b16 v146, 8, v99 +; GFX11-NEXT: v_lshlrev_b16 v147, 8, v31 +; GFX11-NEXT: v_lshlrev_b16 v148, 8, v30 +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v28 +; GFX11-NEXT: v_lshlrev_b16 v128, 8, v26 +; GFX11-NEXT: v_lshlrev_b16 v129, 8, v24 +; GFX11-NEXT: v_lshlrev_b16 v130, 8, v22 +; GFX11-NEXT: v_lshlrev_b16 v131, 8, v20 +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v18 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v16 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v14 +; GFX11-NEXT: v_lshlrev_b16 v117, 8, v12 +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v10 +; GFX11-NEXT: v_lshlrev_b16 v99, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v6 +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v4 +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v2 +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v0 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v51 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v124 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v125 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v126 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v127 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v39 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v36 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v34 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v111 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v121 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v120 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v122 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v123 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v107 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v38 +; 
GFX11-NEXT: v_or_b32_e32 v9, v9, v108 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v109 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v110 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v106 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v6, v12 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v92 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v77 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v76 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v75 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v74 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v59 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v93 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v94 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v95 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v104 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v105 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v79 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v88 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v89 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v90 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v91 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v58 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v44 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v43 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v41 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v40 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v176 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v167 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v61 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v62 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v63 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v72 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v73 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v45 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v46 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v47 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v56 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v57 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; 
GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v161 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v160 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v151 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v150 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v135 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v134 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v133 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v132 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v113 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v179 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v180 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v181 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v182 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v183 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v162 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v163 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v164 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v165 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v166 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v112 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v98 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v97 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v96 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v87 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v85 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v84 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v83 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v82 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v144 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v145 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v146 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v147 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v148 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v119 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v128 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v129 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v130 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v131 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 
v30, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v68 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v67 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v66 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v65 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v64 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v114 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v115 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v116 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v117 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v118 +; GFX11-NEXT: v_or_b32_e32 v32, v32, v99 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v100 +; GFX11-NEXT: v_or_b32_e32 v34, v34, v101 +; GFX11-NEXT: v_or_b32_e32 v35, v35, v102 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v103 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v28, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v29, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v30, v33, v34 +; GFX11-NEXT: v_or_b32_e32 v31, v35, v36 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr92 +; GFX11-NEXT: ; implicit-def: $vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr77 +; GFX11-NEXT: ; implicit-def: $vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; 
implicit-def: $vgpr80
+; GFX11-NEXT: ; implicit-def: $vgpr71
+; GFX11-NEXT: ; implicit-def: $vgpr70
+; GFX11-NEXT: ; implicit-def: $vgpr69
+; GFX11-NEXT: ; implicit-def: $vgpr68
+; GFX11-NEXT: ; implicit-def: $vgpr67
+; GFX11-NEXT: ; implicit-def: $vgpr66
+; GFX11-NEXT: ; implicit-def: $vgpr65
+; GFX11-NEXT: ; implicit-def: $vgpr64
+; GFX11-NEXT: ; implicit-def: $vgpr124
+; GFX11-NEXT: ; implicit-def: $vgpr125
+; GFX11-NEXT: ; implicit-def: $vgpr126
+; GFX11-NEXT: ; implicit-def: $vgpr127
+; GFX11-NEXT: ; implicit-def: $vgpr111
+; GFX11-NEXT: ; implicit-def: $vgpr120
+; GFX11-NEXT: ; implicit-def: $vgpr121
+; GFX11-NEXT: ; implicit-def: $vgpr122
+; GFX11-NEXT: ; implicit-def: $vgpr123
+; GFX11-NEXT: ; implicit-def: $vgpr106
+; GFX11-NEXT: ; implicit-def: $vgpr107
+; GFX11-NEXT: ; implicit-def: $vgpr108
+; GFX11-NEXT: ; implicit-def: $vgpr109
+; GFX11-NEXT: ; implicit-def: $vgpr110
+; GFX11-NEXT: ; implicit-def: $vgpr93
+; GFX11-NEXT: ; implicit-def: $vgpr94
+; GFX11-NEXT: ; implicit-def: $vgpr95
+; GFX11-NEXT: ; implicit-def: $vgpr104
+; GFX11-NEXT: ; implicit-def: $vgpr105
+; GFX11-NEXT: ; implicit-def: $vgpr79
+; GFX11-NEXT: ; implicit-def: $vgpr88
+; GFX11-NEXT: ; implicit-def: $vgpr89
+; GFX11-NEXT: ; implicit-def: $vgpr90
+; GFX11-NEXT: ; implicit-def: $vgpr91
+; GFX11-NEXT: ; implicit-def: $vgpr61
+; GFX11-NEXT: ; implicit-def: $vgpr62
+; GFX11-NEXT: ; implicit-def: $vgpr63
+; GFX11-NEXT: ; implicit-def: $vgpr72
+; GFX11-NEXT: ; implicit-def: $vgpr73
+; GFX11-NEXT: ; implicit-def: $vgpr45
+; GFX11-NEXT: ; implicit-def: $vgpr46
+; GFX11-NEXT: ; implicit-def: $vgpr47
+; GFX11-NEXT: ; implicit-def: $vgpr56
+; GFX11-NEXT: ; implicit-def: $vgpr57
+; GFX11-NEXT: ; implicit-def: $vgpr179
+; GFX11-NEXT: ; implicit-def: $vgpr180
+; GFX11-NEXT: ; implicit-def: $vgpr181
+; GFX11-NEXT: ; implicit-def: $vgpr182
+; GFX11-NEXT: ; implicit-def: $vgpr183
+; GFX11-NEXT: ; implicit-def: $vgpr162
+; GFX11-NEXT: ; implicit-def: $vgpr163
+; GFX11-NEXT: ; implicit-def: $vgpr164
+; GFX11-NEXT: ; implicit-def: $vgpr165
+; GFX11-NEXT: ; implicit-def: $vgpr166
+; GFX11-NEXT: ; implicit-def: $vgpr144
+; GFX11-NEXT: ; implicit-def: $vgpr145
+; GFX11-NEXT: ; implicit-def: $vgpr146
+; GFX11-NEXT: ; implicit-def: $vgpr147
+; GFX11-NEXT: ; implicit-def: $vgpr148
+; GFX11-NEXT: ; implicit-def: $vgpr119
+; GFX11-NEXT: ; implicit-def: $vgpr128
+; GFX11-NEXT: ; implicit-def: $vgpr129
+; GFX11-NEXT: ; implicit-def: $vgpr130
+; GFX11-NEXT: ; implicit-def: $vgpr131
+; GFX11-NEXT: ; implicit-def: $vgpr114
+; GFX11-NEXT: ; implicit-def: $vgpr115
+; GFX11-NEXT: ; implicit-def: $vgpr116
+; GFX11-NEXT: ; implicit-def: $vgpr117
+; GFX11-NEXT: ; implicit-def: $vgpr118
+; GFX11-NEXT: ; implicit-def: $vgpr99
+; GFX11-NEXT: ; implicit-def: $vgpr100
+; GFX11-NEXT: ; implicit-def: $vgpr101
+; GFX11-NEXT: ; implicit-def: $vgpr102
+; GFX11-NEXT: ; implicit-def: $vgpr103
+; GFX11-NEXT: .LBB37_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB37_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u16 v0, v55, 3
+; GFX11-NEXT: v_add_nc_u16 v1, v54, 3
+; GFX11-NEXT: v_add_nc_u16 v2, v53, 3
+; GFX11-NEXT: v_add_nc_u16 v3, v52, 3
+; GFX11-NEXT: v_add_nc_u16 v4, v51, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_add_nc_u16 v5, v50, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v124, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v125, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v126, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v127, v3
+; GFX11-NEXT: v_add_nc_u16 v6, v49, 3
+; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v7, v48, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_add_nc_u16 v8, v37, 3
+; GFX11-NEXT: v_add_nc_u16 v9, v36, 3
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_add_nc_u16 v10, v35, 3
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: v_add_nc_u16 v2, v39, 3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v11, v34, 3
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v123, v2
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v111, v3
+; GFX11-NEXT: v_or_b32_e32 v4, v120, v4
+; GFX11-NEXT: v_or_b32_e32 v5, v121, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v6, v122, v6
+; GFX11-NEXT: v_or_b32_e32 v8, v107, v8
+; GFX11-NEXT: v_or_b32_e32 v9, v108, v9
+; GFX11-NEXT: v_or_b32_e32 v10, v109, v10
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v2
+; GFX11-NEXT: v_add_nc_u16 v2, v38, 3
+; GFX11-NEXT: v_or_b32_e32 v11, v110, v11
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4
+; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v106, v2
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v3, v5, v6
+; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
+; GFX11-NEXT: v_or_b32_e32 v4, v7, v12
+; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
+; GFX11-NEXT: v_add_nc_u16 v7, v33, 3
+; GFX11-NEXT: v_add_nc_u16 v8, v32, 3
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_nc_u16 v9, v92, 3
+; GFX11-NEXT: v_add_nc_u16 v10, v78, 3
+; GFX11-NEXT: v_add_nc_u16 v11, v77, 3
+; GFX11-NEXT: v_add_nc_u16 v12, v76, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v75, 3
+; GFX11-NEXT: v_add_nc_u16 v14, v74, 3
+; GFX11-NEXT: v_add_nc_u16 v15, v60, 3
+; GFX11-NEXT: v_add_nc_u16 v16, v59, 3
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+;
GFX11-NEXT: v_or_b32_e32 v7, v93, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v94, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v95, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v104, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v105, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v79, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v88, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v89, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v90, v15 +; GFX11-NEXT: v_or_b32_e32 v16, v91, v16 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 +; GFX11-NEXT: v_add_nc_u16 v12, v58, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v44, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v43, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v42, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v41, 3 +; GFX11-NEXT: v_add_nc_u16 v17, v40, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v178, 3 +; GFX11-NEXT: v_add_nc_u16 v19, v177, 3 +; GFX11-NEXT: v_add_nc_u16 v20, v176, 3 +; GFX11-NEXT: v_add_nc_u16 v21, v167, 3 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_or_b32_e32 v12, v61, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v62, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v63, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v72, v15 +; GFX11-NEXT: v_or_b32_e32 v16, v73, v16 +; GFX11-NEXT: v_or_b32_e32 v17, v45, v17 +; GFX11-NEXT: v_or_b32_e32 v18, v46, v18 +; GFX11-NEXT: v_or_b32_e32 v19, v47, v19 +; GFX11-NEXT: v_or_b32_e32 v20, v56, v20 +; GFX11-NEXT: v_or_b32_e32 v21, v57, v21 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v14, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v16, v20, v21 +; GFX11-NEXT: v_add_nc_u16 v17, v161, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v160, 3 +; GFX11-NEXT: v_add_nc_u16 v19, v151, 3 +; GFX11-NEXT: v_add_nc_u16 v20, v150, 3 +; GFX11-NEXT: v_add_nc_u16 v21, v149, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v135, 3 +; GFX11-NEXT: v_add_nc_u16 v23, v134, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v133, 3 +; GFX11-NEXT: v_add_nc_u16 v25, v132, 3 +; GFX11-NEXT: v_add_nc_u16 v26, v113, 3 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-NEXT: v_or_b32_e32 v17, v179, v17 +; GFX11-NEXT: v_or_b32_e32 v18, v180, v18 +; GFX11-NEXT: v_or_b32_e32 v19, v181, v19 +; GFX11-NEXT: v_or_b32_e32 v20, v182, v20 +; GFX11-NEXT: v_or_b32_e32 v21, v183, v21 +; GFX11-NEXT: v_or_b32_e32 v22, v162, v22 +; GFX11-NEXT: v_or_b32_e32 v23, v163, v23 +; GFX11-NEXT: v_or_b32_e32 v24, v164, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v165, v25 +; GFX11-NEXT: v_or_b32_e32 v26, v166, v26 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_add_nc_u16 v22, v112, 3 +; GFX11-NEXT: v_add_nc_u16 v23, v98, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v97, 3 +; GFX11-NEXT: v_add_nc_u16 v25, v96, 3 +; GFX11-NEXT: v_add_nc_u16 v26, v87, 3 +; GFX11-NEXT: v_add_nc_u16 v27, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v28, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v29, v84, 3 +; GFX11-NEXT: v_add_nc_u16 v30, v83, 3 +; GFX11-NEXT: v_add_nc_u16 v31, v82, 3 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 
0xff, v31 +; GFX11-NEXT: v_or_b32_e32 v22, v144, v22 +; GFX11-NEXT: v_or_b32_e32 v23, v145, v23 +; GFX11-NEXT: v_or_b32_e32 v24, v146, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v147, v25 +; GFX11-NEXT: v_or_b32_e32 v26, v148, v26 +; GFX11-NEXT: v_or_b32_e32 v27, v119, v27 +; GFX11-NEXT: v_or_b32_e32 v28, v128, v28 +; GFX11-NEXT: v_or_b32_e32 v29, v129, v29 +; GFX11-NEXT: v_or_b32_e32 v30, v130, v30 +; GFX11-NEXT: v_or_b32_e32 v31, v131, v31 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v24 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v25 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v26 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v23, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v25, v28, v29 +; GFX11-NEXT: v_or_b32_e32 v26, v30, v31 +; GFX11-NEXT: v_add_nc_u16 v27, v81, 3 +; GFX11-NEXT: v_add_nc_u16 v28, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v29, v71, 3 +; GFX11-NEXT: v_add_nc_u16 v30, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v31, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v32, v68, 3 +; GFX11-NEXT: v_add_nc_u16 v33, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v34, v66, 3 +; GFX11-NEXT: v_add_nc_u16 v35, v65, 3 +; GFX11-NEXT: v_add_nc_u16 v36, v64, 3 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_or_b32_e32 v27, v114, v27 +; GFX11-NEXT: v_or_b32_e32 v28, v115, v28 +; GFX11-NEXT: v_or_b32_e32 v29, v116, v29 +; GFX11-NEXT: v_or_b32_e32 v30, v117, v30 +; GFX11-NEXT: v_or_b32_e32 v31, v118, v31 +; GFX11-NEXT: v_or_b32_e32 v32, v99, v32 +; GFX11-NEXT: v_or_b32_e32 v33, v100, v33 +; GFX11-NEXT: v_or_b32_e32 v34, v101, v34 +; GFX11-NEXT: v_or_b32_e32 v35, v102, v35 +; GFX11-NEXT: v_or_b32_e32 v36, v103, v36 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v27 +; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v28 +; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v29 +; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v30 +; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v31 +; GFX11-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-NEXT: v_add_nc_u16 v34, 0x300, v34 +; GFX11-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-NEXT: v_add_nc_u16 v36, 0x300, v36 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-NEXT: 
v_and_b32_e32 v33, 0xffff, v33
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GFX11-NEXT: v_and_b32_e32 v35, 0xffff, v35
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v28
+; GFX11-NEXT: v_or_b32_e32 v28, v29, v30
+; GFX11-NEXT: v_or_b32_e32 v29, v31, v32
+; GFX11-NEXT: v_or_b32_e32 v30, v33, v34
+; GFX11-NEXT: v_or_b32_e32 v31, v35, v36
+; GFX11-NEXT: .LBB37_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:444
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:476
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:480
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:484
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:488
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:492
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:496
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:500
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:504
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:508
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:512
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:516
+; GFX11-NEXT: s_clause 0x12
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:520
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:524
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:528
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:532
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:536
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:540
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:544
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:548
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:552
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:556
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:560
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:564
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:568
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:572
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:576
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:580
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:584
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:588
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:592
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <128 x i8> %a, splat (i8 3)
+ %a2 = bitcast <128 x i8> %a1 to <16 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <128 x i8> %a to <16 x double>
+ br label %end
+
+end:
+ %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x double> %phi
+}
+
+define <64 x bfloat> @bitcast_v16f64_to_v64bf16(<16 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f64_to_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; kill: killed $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; kill: killed $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; kill: killed $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; kill: killed $vgpr35
+;
GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: buffer_store_dword v17, off, 
s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v11 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v7 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: .LBB38_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; 
GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v15 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v14 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v13 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v11 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v8 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v7 +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v5 +; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v3 +; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v1 +; GCN-NEXT: .LBB38_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v63 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v61 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v60, v1, v2, 16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v58, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v57 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v56, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 12, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v46 +; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v47, vcc, 16, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v45 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: v_add_i32_e32 v44, vcc, 20, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v17, 16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v18, v18, v19, 16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v20, v20, v21, 16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_alignbit_b32 v24, v24, v25, 16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_alignbit_b32 v26, v26, v27, 16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_alignbit_b32 v28, v28, v29, 16 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_alignbit_b32 v30, v30, v31, 16 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_alignbit_b32 v32, v32, v33, 16 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_alignbit_b32 v34, v34, v35, 16 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_alignbit_b32 v36, v36, v37, 16 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_alignbit_b32 v38, v38, v39, 16 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: 
buffer_load_dword v49, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_alignbit_b32 v48, v48, v49, 16 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_alignbit_b32 v50, v50, v51, 16 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_alignbit_b32 v52, v52, v53, 16 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_alignbit_b32 v54, v54, v55, 16 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_alignbit_b32 v40, v40, v41, 16 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_alignbit_b32 v42, v42, v43, 16 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v60, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v47, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v44, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword 
v24, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16f64_to_v64bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB38_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; VI-NEXT: .LBB38_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16f64_to_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB38_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX9-NEXT: .LBB38_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16f64_to_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB38_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0
+; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0
+; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0
+; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0
+; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0
+; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0
+; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0
+; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0
+; GFX11-NEXT: .LBB38_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <16 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <16 x double> %a1 to <64 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x double> %a to <64 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x bfloat> %phi
+}
+
+define <16 x double> @bitcast_v64bf16_to_v16f64(<64 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64bf16_to_v16f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v11
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v10
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v13
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v15
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v17
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v16
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v19
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v18
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v21
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v20
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v23
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v25
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v24
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v27
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v26
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v29
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v30
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116
+; GCN-NEXT:
v_mul_f32_e32 v34, 1.0, v33 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v42 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v40 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v51 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v53 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v63 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v43 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v41 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v62 +; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 16, v60 +; GCN-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v61, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; GCN-NEXT: v_alignbit_b32 v2, v2, v59, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v56 +; GCN-NEXT: v_alignbit_b32 v3, v3, v57, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v46 +; GCN-NEXT: v_alignbit_b32 v4, v4, v47, 16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v45, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v6, v6, v7, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v8, v8, v9, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v10, v10, v11, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v11, v11, v12, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v12, v12, v13, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v13, v13, v14, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v14, v14, v15, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v34 
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v33, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v17, v18, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v43 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v19, v19, v42, 16 +; GCN-NEXT: v_alignbit_b32 v20, v20, v44, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; GCN-NEXT: v_alignbit_b32 v22, v22, v48, 16 +; GCN-NEXT: v_alignbit_b32 v23, v23, v38, 16 +; GCN-NEXT: v_alignbit_b32 v24, v24, v50, 16 +; GCN-NEXT: v_alignbit_b32 v25, v25, v51, 16 +; GCN-NEXT: v_alignbit_b32 v26, v26, v53, 16 +; GCN-NEXT: v_alignbit_b32 v27, v27, v55, 16 +; GCN-NEXT: v_alignbit_b32 v28, v28, v40, 16 +; GCN-NEXT: v_alignbit_b32 v29, v29, v63, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr33 +; 
GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; kill: killed $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: .LBB39_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, 
v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v58 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v57 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v56 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v3, v4, v3, 16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v47 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v46 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 
0xffff0000, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_alignbit_b32 v10, v11, v10, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v11, v12, v11, 16 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_alignbit_b32 v12, v13, v12, 16 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v15, v16, v15, 16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v33 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v44 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v63 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v54 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v41 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v43 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v30 +; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v31 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v32 +; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v33 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v34 +; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v36 +; 
GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v38 +; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v39 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v48 +; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v49 +; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v50 +; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 +; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v52 +; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_alignbit_b32 v18, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v19, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v20, v39, v54, 16 +; GCN-NEXT: v_alignbit_b32 v21, v48, v40, 16 +; GCN-NEXT: v_alignbit_b32 v22, v49, v22, 16 +; GCN-NEXT: v_alignbit_b32 v23, v50, v23, 16 +; GCN-NEXT: v_alignbit_b32 v24, v51, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v32, v25, 16 +; GCN-NEXT: v_alignbit_b32 v26, v33, v26, 16 +; GCN-NEXT: v_alignbit_b32 v27, v34, v27, 16 +; GCN-NEXT: v_alignbit_b32 v28, v35, v28, 16 +; GCN-NEXT: v_alignbit_b32 v29, v36, v29, 16 +; GCN-NEXT: v_alignbit_b32 v30, v37, v30, 16 +; GCN-NEXT: v_alignbit_b32 v31, v38, v31, 16 +; GCN-NEXT: .LBB39_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v15 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v14 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v13 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v12 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, 
vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v11 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v10 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v9 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v8 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v7 +; VI-NEXT: v_add_u32_e32 v33, vcc, 
s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v6 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v5 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v4 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v3 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 
s6, v33 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v2 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v1 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v0 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v32, 16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v31 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v30 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: 
v_or_b32_e32 v34, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v29 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v28 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v27 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v26 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, 
vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v25 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v24 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v23 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v22 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v21 
+; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v20 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v19 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v18 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v17 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v32, 16 +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: 
v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB39_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v33, v34, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX9-NEXT: v_add_f32_e32 
v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: 
v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, 
v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v32, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v31, v31, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v30, v30, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, 
v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v29, v29, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v28, v28, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v27, v27, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v26, v26, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v25, v25, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v24, v24, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, 
v23 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v23, v23, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v22, v22, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v21, v21, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v20, v20, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v19, v19, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, 
v18 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v18, v18, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v17, v17, v32, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_perm_b32 v16, v16, v32, s7 +; GFX9-NEXT: .LBB39_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v32, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v39, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-NEXT: v_add3_u32 v34, v34, v32, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v39, v14, 0x7fff +; 
GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_bfe_u32 v36, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v36, v36, v15, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v15, v36, v38 :: v_dual_lshlrev_b32 v36, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v15, v15, v32, 0x7060302 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v37, v34 :: v_dual_add_f32 v34, 0x40c00000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v14, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_cndmask_b32 v32, v32, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add3_u32 v33, v33, v13, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v33, v37, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v13, v13, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v34, v34, v12, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v12, v12, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v11, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v33, v33, v11, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_bfe_u32 v35, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v35, v35, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v10 +; GFX11-NEXT: v_perm_b32 v11, v11, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v35, v38 :: v_dual_lshlrev_b32 v37, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v10, v10, v33, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v33, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v33, v33, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v8 +; GFX11-NEXT: v_perm_b32 v9, v9, v32, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v8, v8, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_bfe_u32 v35, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v33, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v33, v33, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v6, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GFX11-NEXT: v_perm_b32 v7, v7, v32, 0x7060302 +; GFX11-NEXT: v_perm_b32 v6, v6, v33, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_bfe_u32 v34, v4, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v32, v32, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v34, v34, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v33, v37, vcc_lo +; GFX11-NEXT: 
v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v3 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v4 +; GFX11-NEXT: v_perm_b32 v5, v5, v32, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v37 :: v_dual_cndmask_b32 v4, v34, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v4, v4, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v35, v2, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v2, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_bfe_u32 v33, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v2, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_perm_b32 v3, v3, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_perm_b32 v2, v2, v33, 0x7060302 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v34, v0, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_bfe_u32 v33, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 
vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v33, v33, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v0 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v30 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_perm_b32 v1, v1, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v0, v0, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v31, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX11-NEXT: v_add3_u32 v33, v33, v31, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_bfe_u32 v35, v30, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v29 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v30, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v30, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v28 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_perm_b32 v31, v31, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v30, v30, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_add3_u32 v33, v33, v29, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v29 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v34, v28, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v27 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v28 +; 
GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v34, v38, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_perm_b32 v29, v29, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_perm_b32 v28, v28, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-NEXT: v_add3_u32 v33, v33, v27, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_bfe_u32 v35, v26, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v26 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_dual_cndmask_b32 v26, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_perm_b32 v27, v27, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v32, v34, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v26, v26, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v34, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add3_u32 v33, v33, v25, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v35, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v37 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_perm_b32 v25, v25, v32, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_bfe_u32 v32, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v34, v38, vcc_lo +; GFX11-NEXT: 
v_lshlrev_b32_e32 v34, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_add3_u32 v32, v32, v35, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_perm_b32 v24, v24, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v33, v23, 0x7fff +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v33, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v38, v34, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v22 +; GFX11-NEXT: v_dual_add_f32 v34, 0x40c00000, v37 :: v_dual_cndmask_b32 v33, v33, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-NEXT: v_perm_b32 v23, v23, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v36, v34, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v22, v35, v38 :: v_dual_lshlrev_b32 v35, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-NEXT: v_perm_b32 v22, v22, v33, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v35 +; GFX11-NEXT: v_add3_u32 v35, v36, v34, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v32, v32, v21, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_bfe_u32 v38, v33, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v32, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v32, v38, v33, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v36 :: v_dual_lshlrev_b32 v36, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v21, v21, v34, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v35, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v36 +; GFX11-NEXT: v_add3_u32 v35, v37, v33, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-NEXT: v_bfe_u32 v37, v19, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 
0x40c00000, v18
+; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34
+; GFX11-NEXT: v_perm_b32 v20, v20, v32, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v33, v35, v36, vcc_lo
+; GFX11-NEXT: v_add3_u32 v35, v37, v19, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v19
+; GFX11-NEXT: v_add3_u32 v37, v38, v34, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v17
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_bfe_u32 v48, v18, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GFX11-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v38
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT: v_add3_u32 v36, v48, v18, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX11-NEXT: v_perm_b32 v19, v19, v33, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v38, v35, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v34, v37, v39, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v16
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v36, v37, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v39
+; GFX11-NEXT: v_add3_u32 v37, v38, v35, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v35
+; GFX11-NEXT: v_bfe_u32 v39, v17, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35
+; GFX11-NEXT: v_bfe_u32 v48, v36, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v36
+; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v16
+; GFX11-NEXT: v_perm_b32 v18, v18, v34, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v35, v37, v38, vcc_lo
+; GFX11-NEXT: v_add3_u32 v38, v39, v17, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v17
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_bfe_u32 v37, v16, 16, 1
+; GFX11-NEXT: v_add3_u32 v48, v48, v36, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v38, v39, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
+; GFX11-NEXT: v_add3_u32 v37, v37, v16, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v17, v17, v35, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v36, v48, v49, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v37, v50, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v16, v16, v36, 0x7060302
+; GFX11-NEXT: .LBB39_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0)
+  %a2 = bitcast <64 x bfloat> %a1 to <16 x double>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <64 x bfloat> %a to <16 x double>
+  br label %end
+
+end:
+  %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <16 x double> %phi
+}
+
+define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f64_to_v64f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42,
off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; 
GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v13 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v12 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v11 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v9 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v8 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v7 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 +; GCN-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v49 +; GCN-NEXT: v_mov_b32_e32 v49, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v59, v62 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v63 +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v62, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v63, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: .LBB40_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v7 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v11 +; GCN-NEXT: 
v_lshrrev_b32_e32 v45, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v24 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v25 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v26 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v27 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_store_dword 
v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v3 +; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v59, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v62, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v63, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 +; GCN-NEXT: .LBB40_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v34 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v2, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v63, v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v62 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v46, v2, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v61 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v61, v3, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v60 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v60, v4, v3 +; GCN-NEXT: v_add_i32_e32 v62, vcc, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v59 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v59, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v58 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v58, v4, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v57 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v56 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v47 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v45 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 
v12, v44 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v42 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v41 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v40 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v53 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v20, v21, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v52 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_or_b32_e32 v22, v23, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v51 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v24, v25, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v50 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v48 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_or_b32_e32 v28, v29, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v38 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_or_b32_e32 v30, v31, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v37 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_or_b32_e32 v32, v33, v32 +; GCN-NEXT: 
v_add_i32_e32 v33, vcc, 0x54, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v35 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_or_b32_e32 v34, v35, v34 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_or_b32_e32 v36, v37, v36 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v39 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_or_b32_e32 v38, v39, v38 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v49 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v48, v49, v48 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_or_b32_e32 v50, v51, v50 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_or_b32_e32 v52, v53, v52 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_or_b32_e32 v54, v55, v54 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_or_b32_e32 v40, v41, v40 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_or_b32_e32 v42, v43, v42 +; GCN-NEXT: 
v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v63, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v60, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v64f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f64_to_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 
v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x half> + br label %end + +end: + %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x half> %phi +} + +define <16 x double> @bitcast_v64f16_to_v16f64(<64 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v64f16_to_v16f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; 
GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v10 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v13 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v15 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v14 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill 
+; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v49 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v44 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v41 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v52 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v48 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v37 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v35 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v1 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB41_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v61 +; GCN-NEXT: v_or_b32_e32 v0, v62, v0 +; GCN-NEXT: v_or_b32_e32 v1, v60, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; GCN-NEXT: v_or_b32_e32 v2, v58, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; GCN-NEXT: v_or_b32_e32 v3, v56, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v47 +; GCN-NEXT: v_or_b32_e32 v4, v46, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v51 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v37 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: v_or_b32_e32 v19, v43, v19 +; GCN-NEXT: v_or_b32_e32 v20, v41, v20 +; GCN-NEXT: v_or_b32_e32 v21, v55, v21 +; GCN-NEXT: v_or_b32_e32 
v22, v49, v22 +; GCN-NEXT: v_or_b32_e32 v23, v50, v23 +; GCN-NEXT: v_or_b32_e32 v24, v39, v24 +; GCN-NEXT: v_or_b32_e32 v25, v36, v25 +; GCN-NEXT: v_or_b32_e32 v26, v48, v26 +; GCN-NEXT: v_or_b32_e32 v27, v52, v27 +; GCN-NEXT: v_or_b32_e32 v28, v53, v28 +; GCN-NEXT: v_or_b32_e32 v29, v54, v29 +; GCN-NEXT: v_or_b32_e32 v30, v40, v30 +; GCN-NEXT: v_or_b32_e32 v31, v42, v31 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed 
$vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: .LBB41_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB41_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v56 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v46 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v45 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v5, v6, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v6, v7, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 
v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v11, v12, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v12, v13, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v13, 
v14, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v51 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v17, v18, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v41 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v55 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v50 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v39 +; GCN-NEXT: v_mov_b32_e32 v39, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v48 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v42 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v44 +; GCN-NEXT: 
v_lshlrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_or_b32_e32 v18, v19, v18 +; GCN-NEXT: v_or_b32_e32 v19, v21, v20 +; GCN-NEXT: v_or_b32_e32 v20, v55, v39 +; GCN-NEXT: v_or_b32_e32 v21, v41, v48 +; GCN-NEXT: v_or_b32_e32 v22, v22, v49 +; GCN-NEXT: v_or_b32_e32 v23, v23, v50 +; GCN-NEXT: v_or_b32_e32 v24, v24, v51 +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: v_or_b32_e32 v26, v26, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v38 +; GCN-NEXT: v_or_b32_e32 v28, v28, v33 +; GCN-NEXT: v_or_b32_e32 v29, v29, v34 +; GCN-NEXT: v_or_b32_e32 v30, v30, v35 +; GCN-NEXT: v_or_b32_e32 v31, v31, v37 +; GCN-NEXT: .LBB41_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v32, 0x200 +; VI-NEXT: v_add_f16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_f16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_f16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_f16_sdwa v33, v12, v32 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_f16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_f16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_f16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_f16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_f16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_f16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_f16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_f16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_f16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_f16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_f16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_f16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_f16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_f16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_f16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_f16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_f16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; VI-NEXT: 
v_add_f16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_f16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_f16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_f16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_f16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_f16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_f16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_f16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v33 +; VI-NEXT: v_add_f16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB41_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x i16> @bitcast_v16f64_to_v64i16(<16 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f64_to_v64i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: 
$vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB42_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; 
GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB42_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB42_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GCN-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; GCN-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; GCN-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; GCN-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; GCN-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; GCN-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; GCN-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; GCN-NEXT: v_alignbit_b32 v33, v32, v31, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v30, v29, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v28, v27, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v26, v25, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v24, v23, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v22, v21, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v20, v19, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; GCN-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v14, v13, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v12, v11, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v10, v9, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v47, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v45, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v63, v2, v1, 16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GCN-NEXT: .LBB42_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v63 +; GCN-NEXT: v_or_b32_e32 v1, v1, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v33 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v1, 
0xffff, v7 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v30 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_and_b32_e32 v52, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v63, 0xffff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v45 +; GCN-NEXT: v_or_b32_e32 v46, v44, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v62 +; GCN-NEXT: v_or_b32_e32 v62, v42, v6 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 8, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v47 +; GCN-NEXT: v_or_b32_e32 v44, v41, v6 +; GCN-NEXT: v_add_i32_e32 v56, vcc, 12, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v61 +; GCN-NEXT: v_or_b32_e32 v45, v4, v6 +; GCN-NEXT: v_add_i32_e32 v47, vcc, 16, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v4 +; GCN-NEXT: v_add_i32_e32 v61, vcc, 20, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v59 +; GCN-NEXT: v_or_b32_e32 v5, v5, v8 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v7, v7, v10 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v58 +; GCN-NEXT: v_or_b32_e32 v9, v9, v12 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v11, v11, v14 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v43 +; GCN-NEXT: v_or_b32_e32 v13, v13, v16 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v15, v15, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v17, v40, v17 +; 
GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_or_b32_e32 v19, v55, v19 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v21, v53, v21 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_or_b32_e32 v23, v51, v23 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_or_b32_e32 v25, v36, v25 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_or_b32_e32 v27, v49, v27 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_or_b32_e32 v29, v38, v29 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v31, v37, v31 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x54, v0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_or_b32_e32 v33, v33, v36 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x58, v0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_or_b32_e32 v36, v48, v36 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_or_b32_e32 v39, v39, v49 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_or_b32_e32 v35, v35, v51 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x64, v0 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_or_b32_e32 v50, v50, v53 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_or_b32_e32 v34, v34, v55 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x6c, v0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_or_b32_e32 v54, v54, v40 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v41, 
off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; GCN-NEXT: v_or_b32_e32 v52, v52, v41 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; GCN-NEXT: v_or_b32_e32 v41, v63, v41 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v46, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v62, v56, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v47, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v45, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v29, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v35, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f64_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; VI-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; VI-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; VI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; VI-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; VI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: .LBB42_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f64_to_v64i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX9-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX9-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX9-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX9-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX9-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: .LBB42_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f64_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true 
+; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_f64 v[30:31], v[30:31], 1.0 +; GFX11-NEXT: v_add_f64 v[28:29], v[28:29], 1.0 +; GFX11-NEXT: v_add_f64 v[26:27], v[26:27], 1.0 +; GFX11-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 +; GFX11-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GFX11-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: .LBB42_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <16 x double> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x double> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <16 x double> @bitcast_v64i16_to_v16f64(<64 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i16_to_v16f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v37, v20 +; GCN-NEXT: v_mov_b32_e32 v38, v18 +; GCN-NEXT: v_mov_b32_e32 v39, v16 +; GCN-NEXT: v_mov_b32_e32 v48, v14 +; GCN-NEXT: v_mov_b32_e32 v49, v12 +; GCN-NEXT: v_mov_b32_e32 v50, 
v10 +; GCN-NEXT: v_mov_b32_e32 v51, v8 +; GCN-NEXT: v_mov_b32_e32 v52, v6 +; GCN-NEXT: v_mov_b32_e32 v53, v4 +; GCN-NEXT: v_mov_b32_e32 v54, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(12) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v24 +; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB43_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v55 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; GCN-NEXT: v_or_b32_e32 v0, v0, v36 +; GCN-NEXT: v_or_b32_e32 v1, v1, v58 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v53 +; GCN-NEXT: v_or_b32_e32 v2, v2, v57 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; GCN-NEXT: v_or_b32_e32 v3, v3, v35 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v51 +; GCN-NEXT: v_or_b32_e32 v4, v4, v60 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v5, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v49 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v48 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v39 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v38 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v43 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v56 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v46 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v32 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v42 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v41 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v40 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v63 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v62 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v47 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v44 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v18, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v19, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v21, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v22, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v23, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v24, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v26, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v28, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v29, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v30, v32 +; GCN-NEXT: v_or_b32_e32 v31, v31, v59 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; 
implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: .LBB43_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB43_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 
+; GCN-NEXT: v_or_b32_e32 v0, v36, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v58, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v57, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v52 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v35, v3 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v50 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v43 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v45 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v42 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v61 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, 
v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v4, v60, v4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v32, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v32, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v32, v15 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v32, v16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v32, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v32, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v32, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v32, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v32, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v32, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v32, v30 +; GCN-NEXT: v_or_b32_e32 v31, v59, v31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v31 +; GCN-NEXT: .LBB43_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(11) +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v16f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v33, 3 +; VI-NEXT: v_add_u16_e32 v32, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v32, v15 +; VI-NEXT: v_add_u16_e32 v32, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v32, v14 +; VI-NEXT: v_add_u16_e32 v32, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v32, v13 +; VI-NEXT: v_add_u16_e32 v32, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v32, v12 +; VI-NEXT: v_add_u16_e32 v32, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v32, v11 +; VI-NEXT: v_add_u16_e32 v32, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v32, v10 +; VI-NEXT: v_add_u16_e32 v32, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v32, v9 +; VI-NEXT: v_add_u16_e32 v32, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v32, v8 +; VI-NEXT: v_add_u16_e32 v32, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v32, v7 +; VI-NEXT: v_add_u16_e32 v32, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v32, v6 +; VI-NEXT: v_add_u16_e32 v32, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v32, v5 +; VI-NEXT: v_add_u16_e32 v32, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v32, v4 +; VI-NEXT: v_add_u16_e32 v32, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v32, v3 +; VI-NEXT: v_add_u16_e32 v32, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v32, v2 +; VI-NEXT: v_add_u16_e32 v32, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v32, v1 +; VI-NEXT: v_add_u16_e32 v32, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v33 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v32, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v32, 3, v31 +; VI-NEXT: v_add_u16_sdwa v31, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v32, v31 +; VI-NEXT: v_add_u16_e32 v32, 3, v30 +; VI-NEXT: v_add_u16_sdwa v30, v30, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v32, v30 +; VI-NEXT: v_add_u16_e32 v32, 3, v29 +; VI-NEXT: v_add_u16_sdwa v29, v29, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v32, v29 +; VI-NEXT: v_add_u16_e32 v32, 3, v28 +; VI-NEXT: v_add_u16_sdwa v28, v28, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v32, v28 +; VI-NEXT: v_add_u16_e32 v32, 3, v27 +; VI-NEXT: v_add_u16_sdwa v27, v27, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v32, v27 +; VI-NEXT: v_add_u16_e32 v32, 3, v26 +; VI-NEXT: v_add_u16_sdwa v26, v26, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v32, v26 +; VI-NEXT: v_add_u16_e32 v32, 3, v25 +; VI-NEXT: v_add_u16_sdwa v25, v25, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v32, v25 +; VI-NEXT: v_add_u16_e32 v32, 3, v24 +; VI-NEXT: v_add_u16_sdwa v24, v24, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v32, v24 +; VI-NEXT: v_add_u16_e32 v32, 3, v23 +; VI-NEXT: v_add_u16_sdwa v23, v23, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v32, v23 +; VI-NEXT: v_add_u16_e32 v32, 3, v22 +; VI-NEXT: v_add_u16_sdwa v22, v22, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v32, v22 +; VI-NEXT: v_add_u16_e32 v32, 3, v21 +; VI-NEXT: v_add_u16_sdwa v21, v21, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v32, v21 +; VI-NEXT: v_add_u16_e32 v32, 3, v20 +; VI-NEXT: v_add_u16_sdwa v20, v20, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v32, v20 +; VI-NEXT: v_add_u16_e32 v32, 3, v19 +; VI-NEXT: v_add_u16_sdwa v19, v19, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v32, v19 +; VI-NEXT: v_add_u16_e32 v32, 3, v18 +; VI-NEXT: v_add_u16_sdwa v18, v18, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v32, v18 +; VI-NEXT: v_add_u16_e32 v32, 3, v17 +; VI-NEXT: v_add_u16_sdwa v17, v17, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v32, v17 +; VI-NEXT: v_add_u16_sdwa v32, v16, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i16_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB43_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, 
v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB43_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <16 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <16 x double> + br label %end + +end: + %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x double> %phi +} + +define <64 x bfloat> @bitcast_v128i8_to_v64bf16(<128 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v128i8_to_v64bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:28 +; GCN-NEXT: 
buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v12 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v16 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v20 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v24 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v28 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v11 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v9 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v5 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword 
v4, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:176 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:280 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded 
Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:312 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:332 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:344 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:376 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:372 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:364 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:360 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v1, 24, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB44_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 
4-byte Folded Reload
+; [several hundred auto-generated GCN check lines whose newlines were lost in
+;  extraction: the rest of this block reloads spilled bytes with
+;  buffer_load_dword (s32 offsets 460-960) interleaved with s_waitcnt vmcnt(0),
+;  masks each byte with v_and_b32_e32 0xff, shifts with v_lshlrev_b32_e32 16 or
+;  24, repacks pairs with v_or_b32_e32, spills the packed words back with
+;  buffer_store_dword (offsets 900-972), and ends in a long run of
+;  "; implicit-def: $vgpr*" / "; kill: killed $vgpr*" bookkeeping lines]
+; GCN-NEXT: .LBB44_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB44_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v29
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v23, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; [several hundred auto-generated GCN check lines: the %cmp.true path reloads
+;  the same spilled bytes, adds 3 to each byte (v_add_i32_e32, then s7 = 0x300
+;  and s6 = 0x3000000 for the packed halves), masks with 0xff / 0xffff, and
+;  repacks the results with v_lshlrev_b32_e32 / v_or_b32_e32]
GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v6 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v12 +; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v52 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v51 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v5 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v48 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v39 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v50 +; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v35 +; GCN-NEXT: .LBB44_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v63 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_alignbit_b32 v11, v11, v13, 16 +; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v11, off, 
s[0:3], s32 offset:904 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v63, v5, v11, 16 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v56, v5, v11, 16 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v62, v5, v11, 16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 12, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v61 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v61, v5, v11, 16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 16, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 20, v0 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_alignbit_b32 v60, v13, v16, 16 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_alignbit_b32 v16, v16, v19, 16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v0 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_alignbit_b32 v59, v22, v23, 16 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 32, v0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_alignbit_b32 v23, v23, v27, 16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v58 +; GCN-NEXT: 
v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_alignbit_b32 v58, v29, v35, 16 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 40, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_alignbit_b32 v3, v7, v3, 16 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v57 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_alignbit_b32 v57, v32, v35, 16 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 48, v0 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_alignbit_b32 v33, v34, v33, 16 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 52, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v35, 16 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 56, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 60, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v52, 16 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 64, v0 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v9, v9, v10, 16 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_alignbit_b32 v18, v18, v54, 16 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x48, v0 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_alignbit_b32 v17, v36, v17, 16 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x4c, v0 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_alignbit_b32 v26, v26, v42, 16 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x50, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x54, v0 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_alignbit_b32 v37, v37, v43, 16 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x58, v0 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v12, v14, v12, 16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x5c, v0 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v20, v20, v41, 16 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x60, v0 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_alignbit_b32 v24, v28, v24, 16 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x64, v0 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_alignbit_b32 v30, v30, v40, 16 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x68, v0 
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_alignbit_b32 v38, v39, v38, 16 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x6c, v0 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_alignbit_b32 v48, v48, v55, 16 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x74, v0 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_alignbit_b32 v51, v51, v53, 16 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v63, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v62, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v60, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v37, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v38, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v48, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v49, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, 
off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: 
buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:768 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; 
VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:836 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v63, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: s_waitcnt vmcnt(0) +; 
VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, 
s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; 
kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; 
VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: .LBB44_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v18, 0x300 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v39, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v32, v18 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v36 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_u16_e32 v8, 3, v61 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 3, v62 +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 +; VI-NEXT: v_or_b32_e32 v8, v9, v8 +; VI-NEXT: v_add_u16_e32 v9, 3, v63 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 3, v59 +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: v_add_u16_e32 v10, 3, v60 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 3, v57 +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 +; VI-NEXT: 
v_or_b32_e32 v10, v11, v10 +; VI-NEXT: v_add_u16_e32 v11, 3, v58 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 3, v56 +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_or_b32_e32 v11, v12, v11 +; VI-NEXT: v_add_u16_e32 v12, 3, v47 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 3, v46 +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v12, v13, v12 +; VI-NEXT: v_add_u16_e32 v13, 3, v45 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 3, v44 +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v13, v14, v13 +; VI-NEXT: v_add_u16_e32 v14, 3, v43 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v15, 3, v42 +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_or_b32_e32 v14, v15, v14 +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v21, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v17, v17, v21 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v19, 0x300, v20 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v16, v19, v16 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: v_or_b32_sdwa v30, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v30, 0x300, v30 +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v31, v50, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v31, 0x300, v31 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v29, v40 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v23, 0x300, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: 
v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v30, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v22, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v39, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v21, v39, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v48, 3, v48 +; VI-NEXT: v_or_b32_sdwa v48, v49, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v49, 3, v49 +; VI-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v20, v49, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v50, 3, v50 +; VI-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v39, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v39, 0x300, v39 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v51, 3, v51 +; VI-NEXT: v_or_b32_sdwa v51, v52, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v52, 3, v52 +; VI-NEXT: v_or_b32_sdwa v52, v53, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v53, 3, v53 +; VI-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v54, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v19, v51, v18 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v18, v53, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v39, v18 +; VI-NEXT: v_add_u16_e32 v39, 0x300, v52 +; VI-NEXT: v_or_b32_e32 v19, v39, v19 +; VI-NEXT: v_add_u16_e32 v39, 0x300, v50 +; VI-NEXT: v_or_b32_e32 v20, v39, v20 +; VI-NEXT: v_add_u16_e32 v39, 0x300, v48 +; VI-NEXT: v_or_b32_e32 v21, v39, v21 +; VI-NEXT: v_or_b32_e32 v31, v31, v54 +; VI-NEXT: .LBB44_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v128i8_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v51, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25 +; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], 
s32 offset:348 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB44_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v17, v19, v18, s6
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v30,
off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; 
implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: 
$vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: .LBB44_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB44_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(24)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3
+; GFX9-NEXT: v_add_u16_e32 v24, 3, v24
+; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v1, 3, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1
+; GFX9-NEXT: v_perm_b32 v1, v1, v4, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v25, 3, v25
+; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v4, 3, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4
+; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v26, 3, v26
+; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v5, 3, v5
+; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5
+; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v25
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v4, 3, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v22, 3, v22
+; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v22
+; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v36
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_add_u16_e32 v5, 3, v5
+; GFX9-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v6, 3, v6
+; GFX9-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v23, 3, v23
+; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v21, 3, v21
+; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21
+; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23
+; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_u16_e32 v6, 3, v6
+; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v7, 3, v7
+; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
+; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_u16_e32 v7, 3, v7
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
+; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v39
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v8, 3, v8
+; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8
+; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
+; GFX9-NEXT: v_add_u16_e32 v8, 3, v63
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v9, 3, v61
+; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8
+; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9
+; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6
+; GFX9-NEXT: v_add_u16_e32 v9, 3, v59
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v10, 3, v62
+; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9
+; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10
+; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6
+; GFX9-NEXT: v_add_u16_e32 v10, 3, v58
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v11, 3, v60
+; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10
+; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11
+; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6
+; GFX9-NEXT: v_add_u16_e32 v11, 3, v57
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v12, 3, v56
+; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11
+; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12
+; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6
+; GFX9-NEXT: v_add_u16_e32 v12, 3, v46
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v13, 3, v47
+; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12
+; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13
+; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6
+; GFX9-NEXT: v_add_u16_e32 v13, 3, v45
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v14, 3, v44
+; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13
+; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14
+; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6
+; GFX9-NEXT: v_add_u16_e32 v14, 3, v42
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v15, 3, v43
+; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14
+; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15
+; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v15, 3, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v16, 3, v16
+; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16
+; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v16, 3, v16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v17, 3, v17
+; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v18, 3, v18
+; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v19, 3, v19
+; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16
+; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18
+; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v20, 3, v20
+; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20
+; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v18, 3, v18
+; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v50
+; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v18
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v19, 3, v19
+; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v19
+; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v54, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40
+; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42
+; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v44, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v45, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v45
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v46, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v24
+; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v26
+; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v37
+; GFX9-NEXT: v_add_u16_e32 v37, 0x300, v38
+; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v48
+; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v49
+; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v52
+; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v54
+; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v44
+; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v46
+; GFX9-NEXT: v_perm_b32 v18, v53, v18, s6
+; GFX9-NEXT: v_perm_b32 v19, v52, v19, s6
+; GFX9-NEXT: v_perm_b32 v22, v49, v22, s6
+; GFX9-NEXT: v_perm_b32 v23, v48, v23, s6
+; GFX9-NEXT: v_perm_b32 v24, v39, v24, s6
+; GFX9-NEXT: v_perm_b32 v25, v38, v25, s6
+; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6
+; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6
+; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6
+; GFX9-NEXT: .LBB44_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v128i8_to_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:580
+; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:576
+; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:572
+; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:568
+; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:564
+; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:560
+; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:556
+; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:552
+; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:548
+; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:544
+; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:540
+; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:536
+; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:532
+; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:528
+; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:524
+; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:520
+; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:516
+; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:512
+; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:508
+; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:504
+; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:500
+; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:496
+; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:492
+; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:488
+; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:484
+; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:480
+; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:476
+; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:472
+; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:468
+; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:464
+; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:460
+; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:456
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:452
+; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:448
+; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:444
+; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:440
+; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:436
+; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:432
+; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:428
+; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:424
+; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:420
+; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:416
+; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:412
+; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:408
+; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:404
+; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:400
+; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:396
+; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:392
+; GFX11-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24
+; GFX11-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26
+; GFX11-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20
+; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16
+; GFX11-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12
+; GFX11-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4
+; GFX11-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6
+; GFX11-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:384
+; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:380
+; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:376
+; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:372
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:368
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:364
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:360
+; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:356
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:352
+; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:348
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:344
+; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:340
+; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:336
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:332
+; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:328
+; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:324
+; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:320
+; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:316
+; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:312
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:308
+; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:304
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:300
+; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:296
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:292
+; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:288
+; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:284
+; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:280
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:276
+; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:272
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:268
+; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:264
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:260
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:256
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:252
+; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:248
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:244
+; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:240
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:236
+; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:232
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:228
+; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:224
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:220
+; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:216
+; GFX11-NEXT: scratch_load_b32 v150, off, s32 offset:388
+; GFX11-NEXT: scratch_load_u16 v182, off, s32
+; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v61, off, s32 offset:88
+; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:136
+; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:144
+; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:152
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:160
+; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:168
+; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:176
+; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:184
+; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:192
+; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:200
+; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:208
+; GFX11-NEXT: scratch_load_u16 v42, off, s32 offset:212
+; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:204
+; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:196
+; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:188
+; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:180
+; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:172
+; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:164
+; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:156
+; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:148
+; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:140
+; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:132
+; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:20
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b16 v70, 8, v1
+; GFX11-NEXT: v_lshlrev_b16 v71, 8, v3
+; GFX11-NEXT: v_lshlrev_b16 v84, 8, v5
+; GFX11-NEXT: v_lshlrev_b16 v82, 8, v7
+; GFX11-NEXT: v_lshlrev_b16 v115, 8, v9
+; GFX11-NEXT: v_lshlrev_b16 v66, 8, v11
+; GFX11-NEXT: v_lshlrev_b16 v128, 8, v13
+; GFX11-NEXT: v_lshlrev_b16 v113, 8, v15
+; GFX11-NEXT: v_lshlrev_b16 v132, 8, v17
+; GFX11-NEXT: v_lshlrev_b16 v100, 8, v19
+; GFX11-NEXT: v_lshlrev_b16 v161, 8, v21
+; GFX11-NEXT: v_lshlrev_b16 v160, 8, v23
+; GFX11-NEXT: v_lshlrev_b16 v176, 8, v25
+; GFX11-NEXT: v_lshlrev_b16 v167, 8, v27
+; GFX11-NEXT: v_lshlrev_b16 v181, 8, v29
+; GFX11-NEXT: s_waitcnt vmcnt(62)
+; GFX11-NEXT: v_lshlrev_b16 v127, 8, v0
+; GFX11-NEXT: v_lshlrev_b16 v126, 8, v2
+; GFX11-NEXT: v_lshlrev_b16 v124, 8, v4
+; GFX11-NEXT: v_lshlrev_b16 v125, 8, v6
+; GFX11-NEXT: v_lshlrev_b16 v120, 8, v8
+; GFX11-NEXT: v_lshlrev_b16 v123, 8, v10
+; GFX11-NEXT: v_lshlrev_b16 v121, 8, v12
+; GFX11-NEXT: v_lshlrev_b16 v122, 8, v14
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: v_lshlrev_b16 v106, 8, v16
+; GFX11-NEXT: v_lshlrev_b16 v111, 8, v18
+; GFX11-NEXT: v_lshlrev_b16 v109, 8, v20
+; GFX11-NEXT: v_lshlrev_b16 v110, 8, v22
+; GFX11-NEXT: v_lshlrev_b16 v107, 8, v24
+; GFX11-NEXT: v_lshlrev_b16 v108, 8, v26
+; GFX11-NEXT: s_waitcnt vmcnt(61)
+; GFX11-NEXT: v_lshlrev_b16 v88, 8, v88
+; GFX11-NEXT: s_waitcnt vmcnt(59)
+; GFX11-NEXT: v_lshlrev_b16 v93, 8, v93
+; GFX11-NEXT: s_waitcnt vmcnt(57)
+; GFX11-NEXT: v_lshlrev_b16 v91, 8, v91
+; GFX11-NEXT: s_waitcnt vmcnt(55)
+; GFX11-NEXT: v_lshlrev_b16 v92, 8, v92
+; GFX11-NEXT: s_waitcnt vmcnt(54)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150
+; GFX11-NEXT: s_waitcnt vmcnt(53)
+; GFX11-NEXT: v_lshlrev_b16 v150, 8, v182
+; GFX11-NEXT: s_waitcnt vmcnt(52)
+; GFX11-NEXT: v_lshlrev_b16 v41, 8, v40
+; GFX11-NEXT: s_waitcnt vmcnt(51)
+; GFX11-NEXT: v_lshlrev_b16 v40, 8, v43
+; GFX11-NEXT: s_waitcnt vmcnt(50)
+; GFX11-NEXT: v_lshlrev_b16 v43, 8, v44
+; GFX11-NEXT: s_waitcnt vmcnt(49)
+; GFX11-NEXT: v_lshlrev_b16 v182, 8, v45
+; GFX11-NEXT: s_waitcnt vmcnt(48)
+; GFX11-NEXT: v_lshlrev_b16 v46, 8, v46
+; GFX11-NEXT: s_waitcnt vmcnt(47)
+; GFX11-NEXT: v_lshlrev_b16 v45, 8, v47
+; GFX11-NEXT: s_waitcnt vmcnt(46)
+; GFX11-NEXT: v_lshlrev_b16 v57, 8, v56
+; GFX11-NEXT: s_waitcnt vmcnt(45)
+; GFX11-NEXT: v_lshlrev_b16 v56, 8, v58
+; GFX11-NEXT: s_waitcnt vmcnt(44)
+; GFX11-NEXT: v_lshlrev_b16 v58, 8, v59
+; GFX11-NEXT: s_waitcnt vmcnt(43)
+; GFX11-NEXT: v_lshlrev_b16 v44, 8, v60
+; GFX11-NEXT: s_waitcnt vmcnt(42)
+; GFX11-NEXT: v_lshlrev_b16 v60, 8, v61
+; GFX11-NEXT: s_waitcnt vmcnt(41)
+; GFX11-NEXT: v_lshlrev_b16 v59, 8, v62
+; GFX11-NEXT: s_waitcnt vmcnt(40)
+; GFX11-NEXT: v_lshlrev_b16 v62, 8, v63
+; GFX11-NEXT: s_waitcnt vmcnt(39)
+; GFX11-NEXT: v_lshlrev_b16 v47, 8, v72
+; GFX11-NEXT: s_waitcnt vmcnt(38)
+; GFX11-NEXT: v_lshlrev_b16 v72, 8, v73
+; GFX11-NEXT: s_waitcnt vmcnt(37)
+; GFX11-NEXT: v_lshlrev_b16 v63, 8, v74
+; GFX11-NEXT: s_waitcnt vmcnt(36)
+; GFX11-NEXT: v_lshlrev_b16 v74, 8, v75
+; GFX11-NEXT: s_waitcnt vmcnt(35)
+; GFX11-NEXT: v_lshlrev_b16 v73, 8, v76
+; GFX11-NEXT: s_waitcnt vmcnt(34)
+; GFX11-NEXT: v_lshlrev_b16 v75, 8, v77
+; GFX11-NEXT: s_waitcnt vmcnt(33)
+; GFX11-NEXT: v_lshlrev_b16 v61, 8, v78
+; GFX11-NEXT: s_waitcnt vmcnt(32)
+; GFX11-NEXT: v_lshlrev_b16 v78, 8, v79
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_lshlrev_b16 v77, 8, v89
+; GFX11-NEXT: s_waitcnt vmcnt(30)
+; GFX11-NEXT: v_lshlrev_b16 v79, 8, v90
+; GFX11-NEXT: s_waitcnt vmcnt(29)
+; GFX11-NEXT: v_lshlrev_b16 v76, 8, v95
+; GFX11-NEXT: s_waitcnt vmcnt(28)
+; GFX11-NEXT: v_lshlrev_b16 v90, 8, v104
+; GFX11-NEXT: s_waitcnt vmcnt(27)
+; GFX11-NEXT: v_lshlrev_b16 v89, 8, v105
+; GFX11-NEXT: v_lshlrev_b16 v104, 8, v94
+; GFX11-NEXT: v_lshlrev_b16 v95, 8, v31
+; GFX11-NEXT: v_lshlrev_b16 v105, 8, v30
+; GFX11-NEXT: v_lshlrev_b16 v94, 8, v28
+; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB44_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v39
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v33
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v48
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v70
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v71
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v84
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v82
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v51
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v34
+; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v49
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v53
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v50
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v54
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v52
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v115
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v66
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v128
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v113
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v132
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v100
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v161
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v160
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v176
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v167
+; GFX11-NEXT: v_perm_b32 v2, v2, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x5040100
+; GFX11-NEXT: v_perm_b32 v4, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v5, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v11, v10, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v55
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v37
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v102
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v87
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v114
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v96
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v133
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v117
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v135
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v130
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v181
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v150
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v41
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v40
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v43
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v182
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v46
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v45
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v57
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v56
+; GFX11-NEXT: v_perm_b32 v7, v8, v7, 0x5040100
+; GFX11-NEXT: v_perm_b32 v8, v10, v9, 0x5040100
+; GFX11-NEXT: v_perm_b32 v9, v12, v11, 0x5040100
+; GFX11-NEXT: v_perm_b32 v10, v14, v13, 0x5040100
+; GFX11-NEXT: v_perm_b32 v11, v16, v15, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v147
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v119
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v149
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v144
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v162
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v146
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v178
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v164
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v151
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v148
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v58
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v44
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v60
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v59
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v62
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v47
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v72
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v63
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v74
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v73
+; GFX11-NEXT: v_perm_b32 v12, v13, v12, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v15, v14, 0x5040100
+; GFX11-NEXT: v_perm_b32 v14, v17, v16, 0x5040100
+; GFX11-NEXT: v_perm_b32 v15, v19, v18, 0x5040100
+; GFX11-NEXT: v_perm_b32 v16, v21, v20, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v166
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v177
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v163
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v179
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v165
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v183
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v180
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v42
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v65
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v75
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v61
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v78
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v77
+; GFX11-NEXT: v_or_b32_e32 v21, v21, v79
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v76
+; GFX11-NEXT: v_or_b32_e32 v23, v23, v90
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v89
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v92
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v91
+; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100
+; GFX11-NEXT: v_perm_b32 v18, v20, v19, 0x5040100
+; GFX11-NEXT: v_perm_b32 v19, v22, v21, 0x5040100
+; GFX11-NEXT: v_perm_b32 v20, v24, v23, 0x5040100
+; GFX11-NEXT: v_perm_b32 v21, v26, v25, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v64
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v80
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v68
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v67
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v97
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v83
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v101
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v86
+; GFX11-NEXT: v_or_b32_e32 v22, v22, v93
+; GFX11-NEXT: v_or_b32_e32 v23, v23, v88
+; GFX11-NEXT: v_or_b32_e32 v24, v24, v104
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v95
+; GFX11-NEXT: v_or_b32_e32 v26, v26, v105
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v94
+; GFX11-NEXT: v_or_b32_e32 v28, v28, v108
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v107
+; GFX11-NEXT: v_or_b32_e32 v30, v30, v110
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v109
+; GFX11-NEXT: v_perm_b32 v22, v23, v22, 0x5040100
+; GFX11-NEXT: v_perm_b32 v23, v25, v24, 0x5040100
+; GFX11-NEXT: v_perm_b32 v24, v27, v26, 0x5040100
+; GFX11-NEXT: v_perm_b32 v25, v29, v28, 0x5040100
+; GFX11-NEXT: v_perm_b32 v26, v31, v30, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v103
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v112
+; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v99
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v129
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v98
+; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v131
+; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v116
+; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v134
+; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v118
+; GFX11-NEXT: v_or_b32_e32 v27, v27, v111
+; GFX11-NEXT: v_or_b32_e32 v28, v28, v106
+; GFX11-NEXT: v_or_b32_e32 v29, v29, v122
+; GFX11-NEXT: v_or_b32_e32 v30, v30, v121
+; GFX11-NEXT: v_or_b32_e32 v31, v31, v123
+; GFX11-NEXT: v_or_b32_e32 v32, v32, v120
+; GFX11-NEXT: v_or_b32_e32 v33, v33, v125
+; GFX11-NEXT: v_or_b32_e32 v34, v34, v124
+; GFX11-NEXT: v_or_b32_e32 v35, v35, v126
+; GFX11-NEXT: v_or_b32_e32 v36, v36, v127
+; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100
+; GFX11-NEXT: v_perm_b32 v28, v30, v29, 0x5040100
+; GFX11-NEXT: v_perm_b32 v29, v32, v31, 0x5040100
+; GFX11-NEXT: v_perm_b32 v30, v34, v33, 0x5040100
+; GFX11-NEXT: v_perm_b32 v31, v36, v35, 0x5040100
+; GFX11-NEXT: ; implicit-def: $vgpr38
+; GFX11-NEXT: ; implicit-def: $vgpr35
+; GFX11-NEXT: ; implicit-def: $vgpr39
+; GFX11-NEXT: ; implicit-def: $vgpr33
+; GFX11-NEXT: ; implicit-def: $vgpr48
+; GFX11-NEXT: ; implicit-def: $vgpr32
+; GFX11-NEXT: ; implicit-def: $vgpr49
+; GFX11-NEXT: ; implicit-def: $vgpr36
+; GFX11-NEXT: ; implicit-def: $vgpr51
+; GFX11-NEXT: ; implicit-def: $vgpr34
+; GFX11-NEXT: ; implicit-def: $vgpr53
+; GFX11-NEXT: ; implicit-def: $vgpr50
+; GFX11-NEXT: ; implicit-def: $vgpr54
+; GFX11-NEXT: ; implicit-def: $vgpr52
+; GFX11-NEXT: ; implicit-def: $vgpr55
+; GFX11-NEXT: ; implicit-def: $vgpr37
+; GFX11-NEXT: ; implicit-def: $vgpr102
+; GFX11-NEXT: ; implicit-def: $vgpr87
+; GFX11-NEXT: ; implicit-def: $vgpr114
+; GFX11-NEXT: ; implicit-def: $vgpr96
+; GFX11-NEXT: ; implicit-def: $vgpr133
+; GFX11-NEXT: ; implicit-def: $vgpr117
+; GFX11-NEXT: ; implicit-def: $vgpr135
+; GFX11-NEXT: ; implicit-def: $vgpr130
+; GFX11-NEXT: ; implicit-def: $vgpr147
+; GFX11-NEXT: ; implicit-def: $vgpr119
+; GFX11-NEXT: ; implicit-def: $vgpr149
+; GFX11-NEXT: ; implicit-def: $vgpr144
+; GFX11-NEXT: ; implicit-def: $vgpr162
+; GFX11-NEXT: ; implicit-def: $vgpr146
+; GFX11-NEXT: ; implicit-def: $vgpr178
+; GFX11-NEXT: ; implicit-def: $vgpr164
+; GFX11-NEXT: ; implicit-def: $vgpr151
+; GFX11-NEXT: ; implicit-def: $vgpr148
+; GFX11-NEXT: ; implicit-def: $vgpr166
+; GFX11-NEXT: ; implicit-def: $vgpr145
+; GFX11-NEXT: ; implicit-def: $vgpr177
+; GFX11-NEXT: ; implicit-def: $vgpr163
+; GFX11-NEXT: ; implicit-def: $vgpr179
+; GFX11-NEXT: ; implicit-def: $vgpr165
+; GFX11-NEXT: ; implicit-def: $vgpr183
+; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr77 +; GFX11-NEXT: ; implicit-def: $vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr90 +; GFX11-NEXT: ; implicit-def: $vgpr89 +; GFX11-NEXT: ; implicit-def: $vgpr92 +; GFX11-NEXT: ; implicit-def: $vgpr91 +; GFX11-NEXT: ; implicit-def: $vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr104 +; GFX11-NEXT: ; implicit-def: $vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr105 +; GFX11-NEXT: ; implicit-def: $vgpr94 +; GFX11-NEXT: ; implicit-def: $vgpr108 +; GFX11-NEXT: ; implicit-def: $vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr110 +; GFX11-NEXT: ; implicit-def: $vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr111 +; GFX11-NEXT: ; implicit-def: $vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr121 +; GFX11-NEXT: ; implicit-def: $vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr125 +; GFX11-NEXT: ; implicit-def: $vgpr124 +; GFX11-NEXT: ; implicit-def: $vgpr126 +; GFX11-NEXT: ; implicit-def: $vgpr127 +; GFX11-NEXT: .LBB44_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB44_4 +; 
GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v134, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v131, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_add_nc_u16 v35, v35, 3 +; GFX11-NEXT: v_or_b32_e32 v0, v126, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v127, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v125, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v124, v3 +; GFX11-NEXT: v_add_nc_u16 v33, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v98, 3 +; GFX11-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v120, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v122, v1 +; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v121, v3 +; GFX11-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v111, v4 +; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v81, 3 +; GFX11-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-NEXT: v_and_b32_e32 
v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; 
GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v50, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v32, v32, 3 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-NEXT: v_or_b32_e32 
v33, v82, v33 +; GFX11-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; GFX11-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-NEXT: .LBB44_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:448 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:464 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:484 
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:568 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v64bf16_to_v128i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:96 +; 
GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v41 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v55 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v58 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v59 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v56 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_mul_f32_e32 v1, 1.0, v57 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v46 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v41 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v53 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v48 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v39 +; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v38 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v36 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v33 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v42 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:128 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v45 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v12 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: 
$vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; 
GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB45_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v28, v2, v3, 16 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v24, v2, v3, 16 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v25, v2, v3, 16 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v26, v2, v3, 16 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v30, v2, v3, 16 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v35 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v40, v2, v3, 16 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v39 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v42, v2, v3, 16 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v48 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v44, v2, v3, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v60 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v46, v2, v3, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v56, v2, v3, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v58, v2, v3, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; GCN-NEXT: v_alignbit_b32 v61, v2, v18, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v63 +; GCN-NEXT: v_alignbit_b32 v15, v2, v19, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; GCN-NEXT: v_alignbit_b32 v14, v2, v17, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; GCN-NEXT: v_alignbit_b32 v13, v2, v27, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; GCN-NEXT: v_alignbit_b32 v59, v2, v33, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v35 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v39 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v48 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v60 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v20 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v22 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v34 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v43 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v47 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 24, v62 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v16 +; GCN-NEXT: buffer_store_dword v11, 
off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v23
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v51, v52, v16, 16
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v35, v12, v16, 16
+; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v36, v36, v12, 16
+; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v37, v37, v12, 16
+; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v38, v38, v12, 16
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v39, v6, v12, 16
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v48, v3, v12, 16
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 16
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v50, v5, v3, 16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v52
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v52, v8, v3, 16
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v53, v10, v3, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v54, v2, v3, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v55, v7, v2, 16
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v41, v9, v2, 16
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v45, v11, v2, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v57, v1, v2, 16
+; GCN-NEXT: v_alignbit_b32 v4, v51, v28, 24
+; GCN-NEXT: v_alignbit_b32 v10, v51, v28, 16
+; GCN-NEXT: v_alignbit_b32 v3, v51, v28, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v8, v35, v24, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v12, v36, v25, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v36
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v38
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v48
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v49
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v50
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v52
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v55
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; kill: killed $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; kill: killed $vgpr2
+; GCN-NEXT: .LBB45_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB45_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v29
+; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_alignbit_b32 v59, v13, v12, 16
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v27
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v31
+; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v21
+; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_alignbit_b32 v14, v15, v14, 16
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v63
+; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_alignbit_b32 v15, v17, v15, 16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v62
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v18
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v47
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v43
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v12
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v20
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v42, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v60
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v44, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v46, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v56, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v58, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v62, 0xffff0000, v7
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v7
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v4
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_f32_e32 v60, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v5
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v10
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v6
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v17
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v18
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v19
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v21
+; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v16
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v23
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v24
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v25
+; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v26
+; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v27
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v28
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v29
+; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30
+; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v31
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v33
+; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v35
+; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v36
+; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
+; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v37
+; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v38
+; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v39
+; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v48
+; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v49
+; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v50
+; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v51
+; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v22
+; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v52
+; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v53
+; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v54
+; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v55
+; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v40
+; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v41
+; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v42
+; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v43
+; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v44
+; GCN-NEXT: v_add_f32_e32 v52, 0x40c00000, v45
+; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v46
+; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v47
+; GCN-NEXT: v_add_f32_e32 v51, 0x40c00000, v56
+; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v57
+; GCN-NEXT: v_add_f32_e32 v47, 0x40c00000, v58
+; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v11
+; GCN-NEXT: v_add_f32_e32 v53, 0x40c00000, v9
+; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v61
+; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v62
+; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v63
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v61, v1, v6, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12
+; GCN-NEXT: v_alignbit_b32 v58, v1, v2, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; GCN-NEXT: v_alignbit_b32 v56, v1, v3, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GCN-NEXT: v_alignbit_b32 v46, v1, v4, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31
+; GCN-NEXT: v_alignbit_b32 v44, v1, v5, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GCN-NEXT: v_alignbit_b32 v42, v1, v33, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28
+; GCN-NEXT: v_alignbit_b32 v40, v1, v8, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v26
+; GCN-NEXT: v_alignbit_b32 v30, v1, v30, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v50
+; GCN-NEXT: v_alignbit_b32 v26, v1, v25, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v52
+; GCN-NEXT: v_alignbit_b32 v25, v1, v41, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v54
+; GCN-NEXT: v_alignbit_b32 v24, v1, v51, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v55
+; GCN-NEXT: v_alignbit_b32 v28, v1, v53, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v49
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v48
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v39
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v38
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v37
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v36
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v35
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v34
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v32
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v27
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v21
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v18
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 24, v10
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 24, v20
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v60
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 24, v21
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v57, v20, v21, 16
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v45, v18, v20, 16
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v41, v10, v18, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v55, v1, v10, 16
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v54, v62, v1, 16
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v53, v12, v1, 16
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v52, v11, v1, 16
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_alignbit_b32 v50, v4, v1, 16
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v49, v3, v17, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v48, v2, v19, 16
+; GCN-NEXT: v_alignbit_b32 v39, v6, v16, 16
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v38, v29, v22, 16
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v37, v8, v23, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v36, v7, v43, 16
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v35, v5, v47, 16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v31
+; GCN-NEXT: v_alignbit_b32 v51, v5, v9, 16
+; GCN-NEXT: v_alignbit_b32 v4, v51, v28, 24
+; GCN-NEXT: v_alignbit_b32 v10, v51, v28, 16
+; GCN-NEXT: v_alignbit_b32 v3, v51, v28, 8
+; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v35, v24, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v8, v35, v24, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v36, v25, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v12, v36, v25, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v37, v26, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v38, v30, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v39, v40, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v48, v42, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v49, v44, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v50, v46, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v52, v56, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v53, v58, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v54, v61, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v55, v15, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v41, v14, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v45, v13, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v57, v59, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v36
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v38
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v48
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v49
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v50
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v52
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v55
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB45_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v3
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v51
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v2, v2, v4
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v24
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v8
+; GCN-NEXT: v_or_b32_e32 v29, v1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2
+; GCN-NEXT: v_or_b32_e32 v31, v1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v12
+; GCN-NEXT: v_or_b32_e32 v2, v1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v36
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v62, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v38
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v7, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v39
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v8, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v9, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v10, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v11, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v49
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v16, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v46
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v17, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v50
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v18, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v19, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v20, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v21, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v22, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v23, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v54
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v24, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v15, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v25, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v14, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v26, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v13
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v13, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v27, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v12, v1, v3
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; GCN-NEXT: v_or_b32_e32 v28, v1, v3
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v30, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v33, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v32, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v34, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v35, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v36, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v37, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v38, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v39, v3, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v48, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v49, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v50, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v51, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v52, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v53, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v54, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v55, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v40, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v41, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v42, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v43, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v44, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v45, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v46, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v47, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v56, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v57, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v58, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v59, v3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v60, v3, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v29
+; GCN-NEXT: v_or_b32_e32 v4, v1, v30
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31
+; GCN-NEXT: v_or_b32_e32 v5, v1, v33
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v63, v2, v32
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v62
+; GCN-NEXT: v_or_b32_e32 v61, v3, v34
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v35
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v36
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_or_b32_e32 v29, v29, v37
+; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GCN-NEXT: v_or_b32_e32 v30, v30, v38
+; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_or_b32_e32 v7, v7, v39
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GCN-NEXT: v_or_b32_e32 v7, v7, v48
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9
+; GCN-NEXT: v_or_b32_e32 v8, v8, v49
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10
+; GCN-NEXT: v_or_b32_e32 v9, v9, v50
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11
+; GCN-NEXT: v_or_b32_e32 v10, v10, v51
+; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v16
+; GCN-NEXT: v_or_b32_e32 v11, v11, v52
+; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v17
+; GCN-NEXT: v_or_b32_e32 v16, v16, v53
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v18
+; GCN-NEXT: v_or_b32_e32 v17, v17, v54
+; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v19
+; GCN-NEXT: v_or_b32_e32 v18, v18, v55
+; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v20
+; GCN-NEXT: v_or_b32_e32 v19, v19, v40
+; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v21
+; GCN-NEXT: v_or_b32_e32 v20, v20, v41
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v22
+; GCN-NEXT: v_or_b32_e32 v21, v21, v42
+; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v23
+; GCN-NEXT: v_or_b32_e32 v22, v22, v43
+; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x54, v0
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24
+; GCN-NEXT: v_or_b32_e32 v23, v23, v44
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_or_b32_e32 v15, v15, v45
+; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_or_b32_e32 v25, v25, v46
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v14, v14, v47
+; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: v_or_b32_e32 v26, v26, v56
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_or_b32_e32 v13, v13, v57
+; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GCN-NEXT: v_or_b32_e32 v27, v27, v58
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_or_b32_e32 v12, v12, v59
+; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_or_b32_e32 v28, v28, v60
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT:
buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v128i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v35, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; 
VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; kill: killed $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v29 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v28 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v28 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v28 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v27 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v27 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v26 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v26 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v26 +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v25 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v25 +; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v24 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v23 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v22 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v21 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v20 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v51, 16, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v19 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 24, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v18 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v17 +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[15:16] +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v1 +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v17 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[13:14] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[11:12] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[9:10] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[7:8] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[5:6] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[3:4] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[1:2] +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[31:32] +; 
VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12 +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v45, v46 +; VI-NEXT: v_lshrrev_b64 v[46:47], 24, v[29:30] +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v31 +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v46, v63 +; VI-NEXT: v_mov_b32_e32 v63, v50 +; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[27:28] +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v8 +; VI-NEXT: v_mov_b32_e32 v51, v57 +; VI-NEXT: v_mov_b32_e32 v50, v56 +; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26] +; VI-NEXT: v_mov_b32_e32 v57, v43 +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[23:24] +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[21:22] +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[19:20] +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[17:18] +; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31 +; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; VI-NEXT: v_mov_b32_e32 v47, v34 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: .LBB45_2: ; %Flow +; VI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: s_xor_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB45_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x7fff, v34 +; VI-NEXT: 
v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v18, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v18 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v18, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_cndmask_b32_e32 v17, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v17, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v20 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v20 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v20, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v19 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v19 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v19, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v22 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v22 +; VI-NEXT: v_add_u32_e32 v34, vcc, 
s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v22, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v21 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v21 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v21, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v24 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v24, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v23 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v23 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v23, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v26 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v26 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v26, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v25 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v25, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v28 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v28 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v28, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v27 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v27, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v30 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v30 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v30, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v29 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 
v34, vcc, v34, v29 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v29, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v32 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; VI-NEXT: v_alignbit_b32 v32, v32, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v31 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v31 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v31, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v2 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v1 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 
v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v4 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v3 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v6 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v5 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v8 +; VI-NEXT: 
v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v7 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v10 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v9 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v12 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v12 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v34, v35, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v33, 16 +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 
v34, vcc, v34, v33
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
+; VI-NEXT: v_bfe_u32 v34, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v11
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v34, v35, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; VI-NEXT: v_alignbit_b32 v11, v11, v33, 16
+; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
+; VI-NEXT: v_bfe_u32 v34, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v14
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v34, v35, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; VI-NEXT: v_alignbit_b32 v14, v14, v33, 16
+; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v13
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
+; VI-NEXT: v_bfe_u32 v34, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v13
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v34, v35, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_alignbit_b32 v13, v13, v33, 16
+; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v16
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
+; VI-NEXT: v_bfe_u32 v34, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v16
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_cndmask_b32_e32 v16, v34, v35, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_alignbit_b32 v16, v16, v33, 16
+; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33
+; VI-NEXT: v_bfe_u32 v34, v33, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc
+; VI-NEXT: v_bfe_u32 v34, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v15
+; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34
+; VI-NEXT: v_or_b32_e32 v35, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v34, v35, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_alignbit_b32 v15, v15, v33, 16
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[7:8]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[5:6]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[3:4]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[1:2]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[17:18]
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v28
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v28
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v28
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v27
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v26
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v26
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v26
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v25
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v24
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v23
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v22
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v5
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v21
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v4
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v3
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v19
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v18
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v2
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v18
+; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v1
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v17
+; VI-NEXT: v_lshrrev_b32_e32 v45, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v50, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v6
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v1
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v32
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v32
+; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v32
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v31
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v31
+; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v30
+; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v30
+; VI-NEXT: v_lshrrev_b32_e32 v53, 8, v30
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v29
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v17
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: .LBB45_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v44
+; VI-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v43
+; VI-NEXT: v_or_b32_sdwa v2, v2, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v43
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v43, v44, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v60
+; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41
+; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46
+; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v56
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v53
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v63
+; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v48
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64bf16_to_v128i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: v_mov_b32_e32 v46, v15
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: v_mov_b32_e32 v47, v16
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; kill: killed $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; kill: killed $vgpr59
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(34)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: ; kill: killed $vgpr15
+; GFX9-NEXT: ; implicit-def: $vgpr15
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB45_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(36)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v47
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(38)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v47
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v47
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v46
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v46
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v6
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[46:47]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18]
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v24
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v17
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB45_2: ; %Flow
+; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v58, v57
+; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB45_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v18
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v33, v16, v33, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v34, v15, v33, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v17
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v33, v15, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v20
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v20
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v19
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: v_perm_b32 v17, v15, v20, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v22
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v21
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v21
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v61, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: v_perm_b32 v17, v15, v61, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v24
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v59, v15, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v23
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: v_perm_b32 v58, v15, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v26
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v26
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: v_perm_b32 v63, v15, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v25
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v25
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: v_perm_b32 v62, v15, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v28
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v28
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v60, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v27
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v27
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc
+; GFX9-NEXT: v_perm_b32 v56, v15, v26, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v30
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v16, v17, vcc
+; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v29
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000,
v15 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; GFX9-NEXT: v_perm_b32 v33, v15, v25, s7 +; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: s_waitcnt vmcnt(51) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; GFX9-NEXT: v_perm_b32 v35, v15, v24, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v16, v17, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v15, v16, vcc +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v15, vcc +; GFX9-NEXT: v_perm_b32 v37, v1, v23, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v2, v15, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v4, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_perm_b32 v48, v1, v20, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_perm_b32 v50, v1, v17, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc +; GFX9-NEXT: v_perm_b32 v52, v1, v4, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v15, vcc +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc +; GFX9-NEXT: v_perm_b32 v39, v1, v3, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v12, v1, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GFX9-NEXT: v_add3_u32 v12, v12, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc +; GFX9-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v15, v15, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc +; GFX9-NEXT: v_perm_b32 v54, v11, v2, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_add3_u32 v16, v16, 
v15, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v41, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc +; GFX9-NEXT: v_perm_b32 v41, v13, v1, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v16, v44, vcc +; GFX9-NEXT: v_perm_b32 v16, v44, v13, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v32 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v30 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v28 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX9-NEXT: v_perm_b32 v36, v32, v29, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v25 +; GFX9-NEXT: v_perm_b32 v38, v22, v31, s7 +; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; GFX9-NEXT: v_perm_b32 v49, v19, v21, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v46 
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v45, v43, vcc +; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v45, v46, vcc +; GFX9-NEXT: v_perm_b32 v15, v15, v43, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v43 +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[15:16] +; GFX9-NEXT: v_perm_b32 v51, v6, v18, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 +; GFX9-NEXT: v_perm_b32 v57, v28, v60, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v39 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, 
v17 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[41:42] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[54:55] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[39:40] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[52:53] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[50:51] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[37:38] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[56:57] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[62:63] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[60:61] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v42 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v42 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v41 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v41 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v55 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v40 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v39 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v53 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v38 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v36, 8, v36 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v57 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v57 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v56 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v50 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v59 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60 +; GFX9-NEXT: v_mov_b32_e32 v33, v60 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, 
v40 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v33 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v62 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v62 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v61 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v60 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v60 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v59 +; GFX9-NEXT: .LBB45_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v60 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v47, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v51 +; GFX9-NEXT: v_or_b32_sdwa v38, v38, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 
4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v128i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x15 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:96 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:92 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 
offset:40 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:12 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: ; implicit-def: $vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr179 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr36 
+; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 16, 
v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v76, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v17 +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20] +; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18] +; GFX11-NEXT: .LBB45_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB45_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX11-NEXT: v_add_f32_e32 v35, 0x40c00000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v17, v33, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add3_u32 v17, v17, v33, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v17, v17, v36 :: v_dual_and_b32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v37, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_add3_u32 v37, v37, v18, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v77, v37, v39 :: v_dual_add_f32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 
v37, 16, v20 +; GFX11-NEXT: v_perm_b32 v69, v77, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v38, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v34 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 24, v69 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v38, v38, v34, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v69 +; GFX11-NEXT: v_dual_cndmask_b32 v33, v38, v18 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-NEXT: v_bfe_u32 v48, v35, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v36, v48, v35, 0x7fff +; GFX11-NEXT: v_bfe_u32 v35, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v34, v36, v39, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v19 +; GFX11-NEXT: v_add3_u32 v35, v35, v18, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_perm_b32 v68, v34, v33, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v34, v20, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v35, v37, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v38, 0x40c00000, v19 :: v_dual_lshlrev_b32 v37, 16, v22 +; GFX11-NEXT: v_add3_u32 v34, v34, v20, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-NEXT: v_bfe_u32 v35, v38, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v68 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v68 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v34, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v39, v36, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_add3_u32 v35, v35, v38, 0x7fff +; GFX11-NEXT: v_perm_b32 v65, v19, v18, 0x7060302 +; GFX11-NEXT: v_add3_u32 v20, v39, v36, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 24, v65 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v20, v34, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v34 +; GFX11-NEXT: v_bfe_u32 v36, v20, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v35, v35, v39, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-NEXT: v_add3_u32 v36, v36, v20, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_perm_b32 v64, v35, v34, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 8, v65 +; GFX11-NEXT: v_bfe_u32 v48, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v36, v38, vcc_lo +; GFX11-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_add_f32 v39, 0x40c00000, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 
v47, 16, v64 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v64 +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[64:65] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v22, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v22 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v24 +; GFX11-NEXT: v_bfe_u32 v36, v39, 16, 1 +; GFX11-NEXT: v_add3_u32 v35, v35, v22, 0x7fff +; GFX11-NEXT: v_add3_u32 v22, v48, v37, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v39 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_add3_u32 v36, v36, v39, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v21, v35, v21, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v71, v21, v20, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v21 +; GFX11-NEXT: v_dual_cndmask_b32 v35, v22, v35 :: v_dual_add_f32 v22, 0x40c00000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v35 +; GFX11-NEXT: v_bfe_u32 v37, v22, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v48, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v22 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX11-NEXT: v_add3_u32 v37, v37, v22, 0x7fff +; GFX11-NEXT: v_perm_b32 v70, v36, v35, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v49, v38, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v22, v37, v39 :: v_dual_lshlrev_b32 v39, 16, v26 +; GFX11-NEXT: v_bfe_u32 v36, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v24 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_bfe_u32 v37, v48, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_add3_u32 v36, v36, v24, 0x7fff +; GFX11-NEXT: v_add3_u32 v24, v49, v38, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v48 +; GFX11-NEXT: v_add3_u32 v37, v37, v48, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_cndmask_b32 v23, v36, v23 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 16, v70 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v36, v24, v36, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v39 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_perm_b32 v81, v23, v22, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v38, v24, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v37, v37, v49, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v24 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_add3_u32 v38, v38, v24, 0x7fff +; GFX11-NEXT: v_perm_b32 v80, v37, v36, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v37, v26, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v49, 0x40c00000, v25 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v26 +; GFX11-NEXT: v_bfe_u32 v50, v39, 16, 1 +; GFX11-NEXT: 
v_cndmask_b32_e32 v24, v38, v48, vcc_lo +; GFX11-NEXT: v_add3_u32 v37, v37, v26, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_bfe_u32 v38, v49, 16, 1 +; GFX11-NEXT: v_add3_u32 v26, v50, v39, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v28 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v49 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v37, v25, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v39 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-NEXT: v_add3_u32 v38, v38, v49, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 16, v80 +; GFX11-NEXT: v_dual_cndmask_b32 v37, v26, v37 :: v_dual_add_f32 v26, 0x40c00000, v48 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_bfe_u32 v39, v26, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v38, v38, v50, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v26 +; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_add3_u32 v39, v39, v26, 0x7fff +; GFX11-NEXT: v_perm_b32 v82, v38, v37, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v38, v28, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v50, 0x40c00000, v27 +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v28 +; GFX11-NEXT: v_bfe_u32 v51, v48, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v39, v49, vcc_lo +; GFX11-NEXT: v_add3_u32 v38, v38, v28, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_bfe_u32 v39, v50, 16, 1 +; GFX11-NEXT: v_add3_u32 v28, v51, v48, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v30 +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v50 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v38, v27, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v48 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-NEXT: v_add3_u32 v39, v39, v50, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_perm_b32 v83, v25, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v38, v28, v38, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v49 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v49, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v48, v28, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v51, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v28 +; GFX11-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_add3_u32 v48, v48, v28, 0x7fff +; GFX11-NEXT: v_perm_b32 v84, v39, v38, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v39, v30, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v51, 0x40c00000, v29 +; GFX11-NEXT: v_or_b32_e32 v29, 0x400000, v30 +; GFX11-NEXT: v_bfe_u32 v52, v49, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v28, v48, v50, vcc_lo +; GFX11-NEXT: v_add3_u32 v39, v39, v30, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_bfe_u32 v48, v51, 16, 1 +; GFX11-NEXT: v_add3_u32 v30, v52, v49, 0x7fff +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v32 +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v51 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v39, v29, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-NEXT: v_add3_u32 v48, v48, v51, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 24, v83 +; GFX11-NEXT: v_dual_cndmask_b32 v39, v30, v39 :: v_dual_add_f32 v30, 0x40c00000, v50 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-NEXT: v_bfe_u32 v49, v30, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v48, v48, v52, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v30 +; GFX11-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_add3_u32 v49, v49, v30, 0x7fff +; GFX11-NEXT: v_perm_b32 v86, v48, v39, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v48, v32, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v52, 0x40c00000, v31 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v32 +; GFX11-NEXT: v_bfe_u32 v53, v50, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v30, v49, v51, vcc_lo +; GFX11-NEXT: v_add3_u32 v48, v48, v32, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_bfe_u32 v49, v52, 16, 1 +; GFX11-NEXT: v_add3_u32 v32, v53, v50, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v52 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v48, v31, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-NEXT: v_add3_u32 v49, v49, v52, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_perm_b32 v85, v27, v26, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v48, v32, v48, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v32, 0x40c00000, v51 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v51, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v50, v32, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v49, v49, v53, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v32 +; GFX11-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_add3_u32 v50, v50, v32, 0x7fff +; GFX11-NEXT: v_perm_b32 v96, v49, v48, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v49, v2, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v53, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v55, v51, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v50, v52, vcc_lo +; GFX11-NEXT: v_add3_u32 v49, v49, v2, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v32, v53, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v51 +; GFX11-NEXT: v_perm_b32 v87, v29, v28, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v49, v54, vcc_lo +; GFX11-NEXT: v_add3_u32 v49, v55, v51, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-NEXT: v_add3_u32 v52, v32, v53, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v49, v50, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v51, v49, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v50, v52, v55, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_or_b32_e32 v53, 
0x400000, v49 +; GFX11-NEXT: v_add3_u32 v51, v51, v49, 0x7fff +; GFX11-NEXT: v_perm_b32 v98, v50, v32, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v50, v4, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v4 +; GFX11-NEXT: v_add3_u32 v50, v50, v4, 0x7fff +; GFX11-NEXT: v_bfe_u32 v55, v52, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v49, v51, v53, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v51, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 16, v98 +; GFX11-NEXT: v_lshrrev_b32_e32 v76, 8, v98 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v50, v54, vcc_lo +; GFX11-NEXT: v_add3_u32 v50, v55, v52, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: v_add3_u32 v51, v51, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_add_f32_e32 v52, 0x40c00000, v54 +; GFX11-NEXT: v_cndmask_b32_e32 v50, v50, v53, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v53, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v51, v55, vcc_lo +; GFX11-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_perm_b32 v100, v3, v50, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v3, v6, 16, 1 +; GFX11-NEXT: v_add3_u32 v51, v51, v52, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v66, v53, 16, 1 +; GFX11-NEXT: v_bfe_u32 v52, v5, 16, 1 +; GFX11-NEXT: v_add3_u32 v3, v3, v6, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v51, v51, v54, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v6, v66, v53, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v53 +; GFX11-NEXT: v_add3_u32 v52, v52, v5, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v97, v31, v30, 0x7060302 +; GFX11-NEXT: v_perm_b32 v103, v3, v51, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v53, v6, v54 :: v_dual_add_f32 v6, 0x40c00000, v55 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v52, v66, vcc_lo +; GFX11-NEXT: v_bfe_u32 v52, v6, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_perm_b32 v102, v5, v53, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add3_u32 v52, v52, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v8 +; GFX11-NEXT: v_bfe_u32 v67, v54, 16, 1 +; GFX11-NEXT: v_perm_b32 v99, v2, v1, 0x7060302 +; GFX11-NEXT: v_add3_u32 v5, v5, v8, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v6, v52, v55, vcc_lo +; GFX11-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_bfe_u32 v52, v7, 16, 1 +; GFX11-NEXT: v_add3_u32 v8, v67, v54, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v54 +; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v66, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-NEXT: v_add3_u32 v52, v52, v7, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v54, v8, v55, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v8, 0x40c00000, v66 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_lshlrev_b32 v55, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v52, v67, vcc_lo +; GFX11-NEXT: v_bfe_u32 v52, v8, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_perm_b32 v182, v7, v54, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-NEXT: v_add3_u32 v52, v52, v8, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v112, v55, 16, 1 +; GFX11-NEXT: v_perm_b32 v183, v5, v6, 0x7060302 +; GFX11-NEXT: v_add3_u32 v7, v7, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v8, v52, v66, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v52, v9, 16, 1 +; GFX11-NEXT: v_add3_u32 v10, v112, v55, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v55 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v67, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-NEXT: v_add3_u32 v52, v52, v9, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_perm_b32 v101, v4, v49, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 16, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v55, v10, v66 :: v_dual_add_f32 v10, 0x40c00000, v67 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v50 +; GFX11-NEXT: v_bfe_u32 v66, v10, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v9, v52, v112 :: v_dual_lshlrev_b32 v52, 16, v11 +; GFX11-NEXT: v_bfe_u32 v67, v12, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v176, v9, v55, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v9, 0x40c00000, v52 +; GFX11-NEXT: v_add3_u32 v52, v66, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v10 +; GFX11-NEXT: v_add3_u32 v67, v67, v12, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v12 +; GFX11-NEXT: v_bfe_u32 v113, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v11, 0x40c00000, v11 :: v_dual_cndmask_b32 v10, v52, v66 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v9 +; GFX11-NEXT: v_add3_u32 v52, v113, v9, 0x7fff +; GFX11-NEXT: v_perm_b32 v177, v7, v8, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v67, v112 :: v_dual_lshlrev_b32 v67, 16, v14 +; GFX11-NEXT: v_bfe_u32 v112, v11, 
16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v26 +; GFX11-NEXT: v_dual_cndmask_b32 v9, v52, v66 :: v_dual_add_f32 v52, 0x40c00000, v67 +; GFX11-NEXT: v_add3_u32 v66, v112, v11, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v112, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_bfe_u32 v113, v52, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v24 +; GFX11-NEXT: v_dual_cndmask_b32 v11, v66, v67 :: v_dual_add_f32 v66, 0x40c00000, v112 +; GFX11-NEXT: v_add3_u32 v67, v113, v52, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v52 +; GFX11-NEXT: v_bfe_u32 v113, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: v_bfe_u32 v114, v66, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v115, 0x400000, v66 +; GFX11-NEXT: v_perm_b32 v162, v11, v9, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v52, v67, v112, vcc_lo +; GFX11-NEXT: v_add3_u32 v67, v113, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v14 +; GFX11-NEXT: v_add3_u32 v113, v114, v66, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v114, 16, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v116, v13, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-NEXT: v_dual_cndmask_b32 v14, v67, v112 :: v_dual_add_f32 v67, 0x40c00000, v114 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-NEXT: v_add3_u32 v112, v116, v13, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v149, v14, v52, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v114, v67, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v66, v113, v115, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v113, 0x400000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v115, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_perm_b32 v163, v12, v10, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v17 +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_dual_cndmask_b32 v13, v112, v113 :: v_dual_add_f32 v112, 0x40c00000, v115 +; GFX11-NEXT: v_add3_u32 v113, v114, v67, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v67 +; GFX11-NEXT: v_bfe_u32 v115, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-NEXT: v_bfe_u32 v116, v112, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v117, 0x400000, v112 +; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v15 +; GFX11-NEXT: v_perm_b32 v148, v13, v66, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v67, v113, v114, vcc_lo +; GFX11-NEXT: v_add3_u32 v114, v115, v16, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v115, 0x400000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_bfe_u32 v113, v15, 16, 1 +; GFX11-NEXT: v_add3_u32 v116, v116, v112, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v33 +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[96:97] +; GFX11-NEXT: v_cndmask_b32_e32 v16, v114, v115, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v112, v112 +; GFX11-NEXT: v_add3_u32 v113, v113, v15, 0x7fff +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[86:87] +; 
GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[84:85] +; GFX11-NEXT: v_perm_b32 v135, v16, v67, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v112, v116, v117, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v52 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v53 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v113, v118, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-NEXT: v_perm_b32 v134, v15, v112, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v112 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v48 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v51 +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[134:135] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[148:149] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[162:163] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[176:177] +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v37 +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[182:183] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[82:83] +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v67 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v66 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v38 +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[102:103] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[98:99] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[80:81] +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v39 +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[100:101] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[70:71] +; GFX11-NEXT: v_lshrrev_b64 v[67:68], 24, v[68:69] +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 24, v135 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 8, v135 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v134 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v134 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 24, v149 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v149 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v148 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 8, v148 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 24, v163 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 8, v163 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 16, v162 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 8, v162 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 24, v177 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v177 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 16, v176 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 8, v176 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 24, v183 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 8, v183 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v182 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v182 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 24, v103 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 8, v103 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 16, v102 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v102 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 24, v101 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 8, v101 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 16, v100 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 8, v100 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 24, v99 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 8, v99 +; GFX11-NEXT: v_lshrrev_b32_e32 
v98, 24, v97 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 8, v97 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v96 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 8, v96 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 24, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v87 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v86 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v86 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 24, v85 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v84 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v84 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v83 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 16, v82 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v82 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 24, v81 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v81 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v80 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 24, v71 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v71 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v70 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 16, v77 +; GFX11-NEXT: .LBB45_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v76 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v63 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v75 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v66, 0xff, v60 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v73 +; GFX11-NEXT: v_or_b32_e32 v54, v66, v54 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v56 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v166 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v65 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v62 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v65 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v58 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v160 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-NEXT: v_or_b32_e32 v54, v65, v66 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v44 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-NEXT: v_or_b32_e32 v53, v65, v53 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v42 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v147 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v41 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-NEXT: v_or_b32_e32 v53, v55, v65 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v181 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; 
GFX11-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v180 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v132 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v179 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v167 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v177 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v119 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v164 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v162 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v161 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v149 +; GFX11-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-NEXT: v_or_b32_e32 v50, v54, v50 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-NEXT: v_or_b32_e32 v52, v65, v66 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v6, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v145 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v144 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v116 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v135 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v134 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v131 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v130 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v114 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v74 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v72 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v69 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v59 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v46 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v146 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v43 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v40 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v71 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v182 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v178 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v81 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v128 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v176 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v83 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v117 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v151 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v84 +; GFX11-NEXT: 
v_and_b32_e32 v36, 0xff, v103 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v115 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v102 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v87 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v100 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b16 v28, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v99 +; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v97 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v112 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v98 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-NEXT: s_clause 0x15 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:20 +; GFX11-NEXT: 
scratch_load_b32 v74, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:96 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v128i8_to_v64f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v50, v27 +; GCN-NEXT: v_mov_b32_e32 v49, v25 +; GCN-NEXT: v_mov_b32_e32 v39, v21 +; GCN-NEXT: v_mov_b32_e32 v48, v3 +; GCN-NEXT: v_mov_b32_e32 v37, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v12 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte 
Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v16 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v20 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:392 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v21 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v8 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v10 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:240 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:884 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:272 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v4 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:336 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v57, 8, v4 +; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:356 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:388 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:384 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v2 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:152 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:184 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:216 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:248 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:280 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:312 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:344 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:376 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:368 +; GCN-NEXT: s_waitcnt vmcnt(13) +; GCN-NEXT: v_lshlrev_b32_e32 v63, 8, v3 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 
+; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB46_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_mov_b32_e32 v26, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v37 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v48 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 
4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v11 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v15 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GCN-NEXT: v_or_b32_e32 v0, v0, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v19 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v39 +; GCN-NEXT: v_or_b32_e32 v25, v0, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v23 +; GCN-NEXT: v_or_b32_e32 v23, v0, v24 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v49 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v50 +; GCN-NEXT: v_or_b32_e32 v33, v0, v28 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v29 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v12 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v14 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v2, v4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v37, v2, v4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v2, 
v4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v2, v4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v48, v2, v4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v49, v6, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v50, v6, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v51 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v6, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v7, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v8, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v10, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v12, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v30 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v12, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v7 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v12, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v7, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v5 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v53, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v18, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v5 +; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v40, v40, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v22, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v26 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v5, v13 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v42, v42, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v24, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v15 
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v26, v15 +; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v43, v43, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v15, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v28, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v17, v27 +; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v46, v46, v27 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v51, v51, v52 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v52, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v52, v52, v54 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v54, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v54, v54, v55 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v56 +; GCN-NEXT: v_or_b32_e32 v55, v55, v41 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v41, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v41, v41, v44 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v44, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v44, v44, v45 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v45, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v45, v45, v47 +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v58 +; GCN-NEXT: v_or_b32_e32 v47, v47, v57 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v56, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v56, v56, v62 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v57, 0xff, v27 +; GCN-NEXT: v_or_b32_e32 v31, v57, v31 +; GCN-NEXT: v_and_b32_e32 v57, 0xff, v60 +; GCN-NEXT: v_or_b32_e32 v57, v57, v63 +; GCN-NEXT: 
v_and_b32_e32 v58, 0xff, v61 +; GCN-NEXT: v_or_b32_e32 v1, v58, v1 +; GCN-NEXT: v_and_b32_e32 v58, 0xff, v59 +; GCN-NEXT: v_or_b32_e32 v3, v58, v3 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v25 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v23 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v32 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v33 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v34 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v36 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded 
Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v37 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v38 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v48 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v49 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v50 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v14 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v53 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v18 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v40 +; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v22 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v42 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v13 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v43 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v15 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v46 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v3 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; 
GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; 
kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: .LBB46_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB46_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v59 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v6, v3, v2 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v61 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v10, v1, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v60 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v16, v63, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v20, v31, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v1, v62, v2 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v58 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v57, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v1, v47, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v1, v45, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v1, v44, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v56 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v41, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v1, v55, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v1, v54, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v1, v52, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v46 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v43 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v42 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v3, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v0, v24 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v40 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v0, v26 +; GCN-NEXT: v_mov_b32_e32 v0, v37 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v4, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v53 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v33 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v8 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v1, v34 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v35 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v36 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v21, v39 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v51 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v38 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, v48 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 
v2, v2, v48 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v25, v49 +; GCN-NEXT: v_mov_b32_e32 v27, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v49 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v50 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v50 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v51 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v51 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v52, 0xff, v52 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v52 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v53 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v54 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v55 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v40, vcc, 
3, v14 +; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v2, v40 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v12 +; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v2, v41 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v29 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v42 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v42 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v27 +; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v4, v43 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v25 +; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v44 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v23 +; GCN-NEXT: v_and_b32_e32 v45, 0xff, v45 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v3, v45 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v21 +; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46 +; GCN-NEXT: v_or_b32_e32 v46, v22, v46 +; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v19 +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v47, v2, v47 +; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v17 +; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56 +; GCN-NEXT: v_or_b32_e32 v56, v18, v56 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v15 +; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v57, v2, v57 +; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v13 +; GCN-NEXT: v_and_b32_e32 v58, 0xff, v58 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v58, v2, v58 +; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v11 +; GCN-NEXT: v_and_b32_e32 v59, 0xff, v59 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v59, v2, v59 +; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v9 +; GCN-NEXT: v_and_b32_e32 v60, 0xff, v60 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v60, v2, v60 +; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v7 +; GCN-NEXT: v_and_b32_e32 v61, 0xff, v61 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v61, v2, v61 +; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v5 +; GCN-NEXT: v_and_b32_e32 v62, 0xff, v62 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v62, v2, v62 +; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v63, 0xff, v63 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v63, v1, v63 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v45, v0, v3 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v44, vcc, 0x300, v6 +; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v20 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v40, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v0 +; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v24 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v26 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v8 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v12 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v46, vcc, s6, v46 +; GCN-NEXT: v_add_i32_e32 v47, vcc, s6, v47 +; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v56 +; GCN-NEXT: v_add_i32_e32 v57, vcc, s6, v57 +; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v58 +; GCN-NEXT: v_add_i32_e32 v59, vcc, s6, v59 +; GCN-NEXT: v_add_i32_e32 v60, vcc, s6, v60 +; GCN-NEXT: v_add_i32_e32 v61, vcc, s6, v61 +; GCN-NEXT: v_add_i32_e32 v62, vcc, s6, v62 +; GCN-NEXT: v_add_i32_e32 v63, vcc, s6, v63 +; GCN-NEXT: v_add_i32_e32 v45, vcc, s6, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v63 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v62 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v61 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v60 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v59 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v58 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v57 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v56 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v47 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v46 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v14 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v16 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v22 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v13 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v17 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v19 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v21 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v23 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v24 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill 
+; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v25 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v27 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v28 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v29 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v30 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v31 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v35 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v36 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v37 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v38 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v39 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v44 +; GCN-NEXT: .LBB46_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
buffer_store_dword v0, v50, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v50 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v45, v1, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v44, v1, v0 +; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v47, v1, v0 +; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v46, v1, v0 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v48, vcc, 28, v50 +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v14, vcc, 36, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v18, vcc, 40, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v22, vcc, 44, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v63, v1, v0 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 48, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v60, v1, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 52, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v30, v1, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 56, v50 +; 
GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v61, v1, v0 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v7, v1, v0 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 64, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v62, v1, v0 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x44, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v5, v1, v0 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x48, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v52, v12, v0 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x4c, v50 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v54, v12, v0 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x50, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v32 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v1, v11, v0 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x54, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v20 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v11, v11, v0 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x58, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v26 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v15, v12, v0 
+; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x5c, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v19, v6, v0 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x60, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v9, v6, v0 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x64, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v13, v2, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x68, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v17, v2, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x6c, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v21, v2, v0 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x70, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v25, v2, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x74, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v29, v2, v0 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x78, v50 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v50 +; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v48, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v14, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v60, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v39, s[0:3], 0 offen +; GCN-NEXT: 
buffer_store_dword v7, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v62, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v64f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill 
+; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v4, 
off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, 
off, s[0:3], s32 offset:292 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 
offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v63, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: 
s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 
; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; 
VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v31, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed 
$vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; kill: killed $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: .LBB46_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v18, 0x300 +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v29, 0x300, v29 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v39, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: 
buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v5, v4 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v6, v5 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v28, 0x300, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v27, 0x300, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, 
off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v26, 0x300, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_or_b32_e32 v6, v7, v6 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v25, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v35 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v24, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v36 +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_e32 
v7, v8, v7
+; VI-NEXT: v_add_u16_e32 v8, 3, v61
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v9, 3, v62
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v9, 0x300, v9
+; VI-NEXT: v_or_b32_e32 v8, v9, v8
+; VI-NEXT: v_add_u16_e32 v9, 3, v63
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v10, 3, v59
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v10, 0x300, v10
+; VI-NEXT: v_or_b32_e32 v9, v10, v9
+; VI-NEXT: v_add_u16_e32 v10, 3, v60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v11, 3, v57
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v11, 0x300, v11
+; VI-NEXT: v_or_b32_e32 v10, v11, v10
+; VI-NEXT: v_add_u16_e32 v11, 3, v58
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v12, 3, v56
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v12, 0x300, v12
+; VI-NEXT: v_or_b32_e32 v11, v12, v11
+; VI-NEXT: v_add_u16_e32 v12, 3, v47
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v13, 3, v46
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v13, 0x300, v13
+; VI-NEXT: v_or_b32_e32 v12, v13, v12
+; VI-NEXT: v_add_u16_e32 v13, 3, v45
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v14, 3, v44
+; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v14, 0x300, v14
+; VI-NEXT: v_or_b32_e32 v13, v14, v13
+; VI-NEXT: v_add_u16_e32 v14, 3, v43
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v15, 3, v42
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v15, 0x300, v15
+; VI-NEXT: v_or_b32_e32 v14, v15, v14
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v15, 3, v15
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v16, 0x300, v16
+; VI-NEXT: v_or_b32_e32 v15, v16, v15
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v17
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v19, 3, v19
+; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v21, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v17, v17, v21
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v19, 0x300, v20
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_e32 v16, v19, v16
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v21, 3, v21
+; VI-NEXT: v_or_b32_sdwa v30, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v30, 0x300, v30
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_sdwa v31, v50, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v31, 0x300, v31
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v21, 3, v21
+; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v29, v29, v40
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v23, 0x300, v23
+; VI-NEXT: v_or_b32_e32 v23, v23, v37
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v30, v30, v55
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v22, 0x300, v22
+; VI-NEXT: v_or_b32_e32 v22, v22, v38
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v39, 3, v39
+; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v21, v39, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v48, 3, v48
+; VI-NEXT: v_or_b32_sdwa v48, v49, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v49, 3, v49
+; VI-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v20, v49, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v50, 3, v50
+; VI-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v19, 3, v19
+; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v39, 3, v39
+; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v39
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v51, 3, v51
+; VI-NEXT: v_or_b32_sdwa v51, v52, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v52, 3, v52
+; VI-NEXT: v_or_b32_sdwa v52, v53, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v53, 3, v53
+; VI-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v54, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v19, v51, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v18, v53, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v18, v39, v18
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v52
+; VI-NEXT: v_or_b32_e32 v19, v39, v19
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v50
+; VI-NEXT: v_or_b32_e32 v20, v39, v20
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v48
+; VI-NEXT: v_or_b32_e32 v21, v39, v21
+; VI-NEXT: v_or_b32_e32 v31, v31, v54
+; VI-NEXT: .LBB46_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v128i8_to_v64f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88
+; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96
+; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128
+; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
+; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
+; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
+; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v13
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23
+; GFX9-NEXT: s_waitcnt vmcnt(27)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28
+; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25
+; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v52
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v53
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v2
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB46_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v17, v19, v18, s6
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; kill: killed $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; kill: killed $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: .LBB46_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB46_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(24)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3
+; GFX9-NEXT: v_add_u16_e32 v24, 3, v24
+; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v2
+; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v1, 3, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1
+; GFX9-NEXT: v_perm_b32 v1, v1, v4, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v25, 3, v25
+; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v3
+; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v4, 3, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4
+; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v26, 3, v26
+; GFX9-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v5, 3, v5
+; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5
+; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v25
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v4, 3, v4
+; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v22, 3, v22
+; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v22
+; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v36
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_add_u16_e32 v5, 3, v5
+; GFX9-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v6, 3, v6
+; GFX9-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v23, 3, v23
+; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v21, 3, v21
+; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21
+; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23
+; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_u16_e32 v6, 3, v6
+; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v7, 3, v7
+; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
+; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_add_u16_e32 v7, 3, v7
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7
+; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v39
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v8, 3, v8
+; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8
+; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
+; GFX9-NEXT: v_add_u16_e32 v8, 3, v63
+; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v27, 3, v27
+; GFX9-NEXT: 
s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 +; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 +; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 +; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v50 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v19 +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 +; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 +; GFX9-NEXT: s_waitcnt 
vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 +; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v44, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v45, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v45 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v46, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v24 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v26 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v37 +; GFX9-NEXT: v_add_u16_e32 v37, 0x300, v38 +; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v48 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v49 +; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v52 +; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v54 +; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v44 +; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v46 +; GFX9-NEXT: v_perm_b32 v18, v53, v18, s6 +; GFX9-NEXT: v_perm_b32 v19, v52, v19, s6 +; GFX9-NEXT: v_perm_b32 v22, v49, v22, s6 +; GFX9-NEXT: v_perm_b32 v23, v48, v23, s6 +; GFX9-NEXT: v_perm_b32 v24, v39, v24, s6 +; GFX9-NEXT: v_perm_b32 v25, v38, v25, s6 +; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 +; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 +; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 +; GFX9-NEXT: .LBB46_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v128i8_to_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:580 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:576 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:572 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:568 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:564 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:560 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:556 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:552 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:548 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:544 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:540 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:536 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:532 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:528 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:524 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:520 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:516 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:512 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:508 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:504 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:500 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:496 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:492 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:488 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:484 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:480 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:476 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:472 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:468 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:464 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:460 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:456 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:452 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:448 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:444 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:440 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:436 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:432 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:428 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:424 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:420 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:416 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:412 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:408 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 
offset:404 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:400 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:396 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:392 +; GFX11-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 +; GFX11-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 +; GFX11-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 +; GFX11-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 +; GFX11-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-NEXT: scratch_load_u16 v12, off, s32 offset:336 +; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:220 +; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v150, off, s32 offset:388 +; GFX11-NEXT: scratch_load_u16 v182, off, s32 +; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:32 +; 
GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v61, off, s32 offset:88 +; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:136 +; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:144 +; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:160 +; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:168 +; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:176 +; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:184 +; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:192 +; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:200 +; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-NEXT: scratch_load_u16 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:204 +; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:196 +; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:188 +; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:180 +; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:172 +; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:164 +; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:148 +; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:140 +; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v70, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v82, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v128, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v132, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v161, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v160, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v176, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v167, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(62) +; GFX11-NEXT: v_lshlrev_b16 v127, 8, v0 +; 
GFX11-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-NEXT: s_waitcnt vmcnt(61) +; GFX11-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-NEXT: s_waitcnt vmcnt(59) +; GFX11-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-NEXT: s_waitcnt vmcnt(57) +; GFX11-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-NEXT: s_waitcnt vmcnt(55) +; GFX11-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-NEXT: s_waitcnt vmcnt(54) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-NEXT: s_waitcnt vmcnt(53) +; GFX11-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-NEXT: s_waitcnt vmcnt(52) +; GFX11-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-NEXT: s_waitcnt vmcnt(51) +; GFX11-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-NEXT: s_waitcnt vmcnt(50) +; GFX11-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-NEXT: s_waitcnt vmcnt(49) +; GFX11-NEXT: v_lshlrev_b16 v182, 8, v45 +; GFX11-NEXT: s_waitcnt vmcnt(48) +; GFX11-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-NEXT: s_waitcnt vmcnt(47) +; GFX11-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-NEXT: s_waitcnt vmcnt(46) +; GFX11-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-NEXT: s_waitcnt vmcnt(45) +; GFX11-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-NEXT: s_waitcnt vmcnt(44) +; GFX11-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-NEXT: s_waitcnt vmcnt(43) +; GFX11-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-NEXT: s_waitcnt vmcnt(42) +; GFX11-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-NEXT: s_waitcnt vmcnt(41) +; GFX11-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-NEXT: s_waitcnt vmcnt(40) +; GFX11-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-NEXT: s_waitcnt vmcnt(39) +; GFX11-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-NEXT: s_waitcnt vmcnt(38) +; GFX11-NEXT: v_lshlrev_b16 v72, 8, v73 +; GFX11-NEXT: s_waitcnt vmcnt(37) +; GFX11-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-NEXT: s_waitcnt vmcnt(36) +; GFX11-NEXT: v_lshlrev_b16 v74, 8, v75 +; GFX11-NEXT: s_waitcnt vmcnt(35) +; GFX11-NEXT: v_lshlrev_b16 v73, 8, v76 +; GFX11-NEXT: s_waitcnt vmcnt(34) +; GFX11-NEXT: v_lshlrev_b16 v75, 8, v77 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-NEXT: s_waitcnt vmcnt(32) +; GFX11-NEXT: v_lshlrev_b16 v78, 8, v79 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_lshlrev_b16 v79, 8, v90 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: v_lshlrev_b16 v90, 8, v104 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-NEXT: v_lshlrev_b16 v104, 8, v94 +; GFX11-NEXT: v_lshlrev_b16 v95, 8, v31 +; GFX11-NEXT: v_lshlrev_b16 v105, 8, v30 +; GFX11-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, 
exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v51 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v34 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v66 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v128 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v113 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v132 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v100 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v161 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v160 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v5, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v102 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v114 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v96 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v135 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v150 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v41 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v40 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v43 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v182 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v46 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v45 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v57 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v147 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v144 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v162 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v146 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v164 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v151 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v148 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v58 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v44 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v60 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v59 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v62 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v47 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v72 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v63 +; GFX11-NEXT: 
v_or_b32_e32 v20, v20, v74 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v166 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v145 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v179 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v165 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v183 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v180 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v42 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v65 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v75 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v78 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v77 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v79 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v90 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v89 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v92 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v91 +; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 +; GFX11-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 +; GFX11-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 +; GFX11-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v68 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v85 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v67 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v97 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v101 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v93 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v88 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v104 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v95 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v105 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v94 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v108 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v107 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v110 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v109 +; GFX11-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 +; GFX11-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 +; GFX11-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 +; GFX11-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 +; GFX11-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v112 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v99 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v129 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v131 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v111 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v106 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v122 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v121 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v123 +; GFX11-NEXT: v_or_b32_e32 v32, v32, v120 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v125 +; GFX11-NEXT: v_or_b32_e32 v34, v34, v124 +; GFX11-NEXT: v_or_b32_e32 v35, v35, v126 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v127 +; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 +; 
GFX11-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 +; GFX11-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 +; GFX11-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr179 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; 
GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr77 +; GFX11-NEXT: ; implicit-def: $vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr90 +; GFX11-NEXT: ; implicit-def: $vgpr89 +; GFX11-NEXT: ; implicit-def: $vgpr92 +; GFX11-NEXT: ; implicit-def: $vgpr91 +; GFX11-NEXT: ; implicit-def: $vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr104 +; GFX11-NEXT: ; implicit-def: $vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr105 +; GFX11-NEXT: ; implicit-def: $vgpr94 +; GFX11-NEXT: ; implicit-def: $vgpr108 +; GFX11-NEXT: ; implicit-def: $vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr110 +; GFX11-NEXT: ; implicit-def: $vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr111 +; GFX11-NEXT: ; implicit-def: $vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr121 +; GFX11-NEXT: ; implicit-def: $vgpr123 +; GFX11-NEXT: ; implicit-def: $vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr125 +; GFX11-NEXT: ; implicit-def: $vgpr124 +; GFX11-NEXT: ; implicit-def: $vgpr126 +; GFX11-NEXT: ; implicit-def: $vgpr127 +; GFX11-NEXT: .LBB46_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB46_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v134, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v131, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_add_nc_u16 v35, v35, 3 +; GFX11-NEXT: v_or_b32_e32 v0, v126, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v127, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v125, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v124, v3 +; GFX11-NEXT: v_add_nc_u16 v33, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v98, 3 +; GFX11-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v120, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v122, v1 +; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v121, v3 +; GFX11-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v111, v4 +; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v81, 3 +; GFX11-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: 
v_add_nc_u16 v3, v97, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-NEXT: v_add_nc_u16 v64, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_add_nc_u16 
v3, v178, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v53, 3 +; 
GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u16 v4, v50, 3
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v150, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v176, v0
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_or_b32_e32 v1, v167, v1
+; GFX11-NEXT: v_add_nc_u16 v50, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v161, v3
+; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v160, v4
+; GFX11-NEXT: v_add_nc_u16 v52, 0x300, v1
+; GFX11-NEXT: v_add_nc_u16 v1, v51, 3
+; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v2, v49, 3
+; GFX11-NEXT: v_add_nc_u16 v49, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v0, v48, 3
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_add_nc_u16 v3, v34, 3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_add_nc_u16 v4, v36, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v132, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_or_b32_e32 v2, v128, v2
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_or_b32_e32 v0, v115, v0
+; GFX11-NEXT: v_add_nc_u16 v34, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v100, v3
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v113, v4
+; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v0, v39, 3
+; GFX11-NEXT: v_add_nc_u16 v36, v38, 3
+; GFX11-NEXT: v_add_nc_u16 v32, v32, 3
+; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35
+; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GFX11-NEXT: v_or_b32_e32 v35, v71, v35
+; GFX11-NEXT: v_or_b32_e32 v33, v82, v33
+; GFX11-NEXT: v_or_b32_e32 v0, v84, v0
+; GFX11-NEXT: v_or_b32_e32 v36, v70, v36
+; GFX11-NEXT: v_or_b32_e32 v32, v66, v32
+; GFX11-NEXT: v_add_nc_u16 v35, 0x300, v35
+; GFX11-NEXT: v_add_nc_u16 v33, 0x300, v33
+; GFX11-NEXT: v_add_nc_u16 v38, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v36
+; GFX11-NEXT: v_add_nc_u16 v32, 0x300, v32
+; GFX11-NEXT: v_add_nc_u16 v36, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v39, 0x300, v1
+; GFX11-NEXT: v_perm_b32 v1, v33, v38, 0x5040100
+; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100
+; GFX11-NEXT: v_perm_b32 v2, v32, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100
+; GFX11-NEXT: v_perm_b32 v4, v39, v34, 0x5040100
+; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v52, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v7, v50, v7, 0x5040100
+; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100
+; GFX11-NEXT: v_perm_b32 v10, v96, v10, 0x5040100
+; GFX11-NEXT: v_perm_b32 v11, v129, v11, 0x5040100
+; GFX11-NEXT: v_perm_b32 v12, v117, v12, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v119, v13, 0x5040100
+; GFX11-NEXT: v_perm_b32 v14, v118, v14, 0x5040100
+; GFX11-NEXT: v_perm_b32 v15, v112, v15, 0x5040100
+; GFX11-NEXT: v_perm_b32 v16, v103, v16, 0x5040100
+; GFX11-NEXT: v_perm_b32 v17, v101, v17, 0x5040100
+; GFX11-NEXT: v_perm_b32 v18, v97, v18, 0x5040100
+; GFX11-NEXT: v_perm_b32 v19, v85, v19, 0x5040100
+; GFX11-NEXT: v_perm_b32 v20, v80, v20, 0x5040100
+; GFX11-NEXT: v_perm_b32 v21, v69, v21, 0x5040100
+; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100
+; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100
+; GFX11-NEXT: v_perm_b32 v24, v68, v24, 0x5040100
+; GFX11-NEXT: v_perm_b32 v25, v67, v25, 0x5040100
+; GFX11-NEXT: v_perm_b32 v26, v86, v26, 0x5040100
+; GFX11-NEXT: v_perm_b32 v27, v83, v27, 0x5040100
+; GFX11-NEXT: v_perm_b32 v28, v81, v28, 0x5040100
+; GFX11-NEXT: v_perm_b32 v29, v99, v29, 0x5040100
+; GFX11-NEXT: v_perm_b32 v30, v98, v30, 0x5040100
+; GFX11-NEXT: v_perm_b32 v31, v116, v31, 0x5040100
+; GFX11-NEXT: .LBB46_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:392
+; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:396
+; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:400
+; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:404
+; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:408
+; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:412
+; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:416
+; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:420
+; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:424
+; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:428
+; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:432
+; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:436
+; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:440
+; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:444
+; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:448
+; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:452
+; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:456
+; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:460
+; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:464
+; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:468
+; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:472
+; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:476
+; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:480
+; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:484
+; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:488
+; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:492
+; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:496
+; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:500
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:504
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:508
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:512
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:516
+; GFX11-NEXT: s_clause 0xf
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:520
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:524
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:528
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:532
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:536
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:540
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:544
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:548
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:552
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:556
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:560
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:564
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:568
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:572
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:576
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:580
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <128 x i8> %a, splat (i8 3)
+ %a2 = bitcast <128 x i8> %a1 to <64 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <128 x i8> %a to <64 x half>
+ br label %end
+
+end:
+ %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x half> %phi
+}
+
+define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64f16_to_v128i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:124
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v63, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v12
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v11
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v16
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v15
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v20
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v21
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v24
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v23
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v25
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v28
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v27
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v29
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v44
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v37
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v36
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v62
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v53
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v60
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v61
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v58
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v59
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v56
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v57
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v43
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v42
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v41
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v40
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v55
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v54
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v52
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v51
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v50
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v38
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v35
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v8
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v45
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v46
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v47
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v14
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v16
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; kill: killed $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; kill: killed $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB47_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v31
+; GCN-NEXT: v_or_b32_e32 v61, v32, v14
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v35
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v62, v15, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v55, v5, v6
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v59
+; GCN-NEXT: v_or_b32_e32 v43, v7, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v63
+; GCN-NEXT: v_or_b32_e32 v41, v9, v5
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v23
+; GCN-NEXT: v_or_b32_e32 v47, v10, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v40, v2, v1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32
+; GCN-NEXT: v_or_b32_e32 v58, v11, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_or_b32_e32 v54, v4, v1
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GCN-NEXT: v_or_b32_e32 v60, v12, v1
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v24
+; GCN-NEXT: v_or_b32_e32 v44, v13, v2
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v51, v5, v1
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v34
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v45, v6, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v36
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v25, v7, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v37
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v46, v9, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v26, v10, v1
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v48
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v56, v11, v5
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v49
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v27, v12, v3
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v50
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v57, v13, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v52
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v28, v28, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v42, v33, v7
+; GCN-NEXT: v_bfe_u32 v7, v35, 8, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v35, v7, v4
+; GCN-NEXT: v_bfe_u32 v4, v59, 8, 8
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v59, v4, v9
+; GCN-NEXT: v_bfe_u32 v4, v23, 8, 8
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v23, v4, v1
+; GCN-NEXT: v_bfe_u32 v1, v32, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v7, v1, v10
+; GCN-NEXT: v_bfe_u32 v1, v31, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v9, v1, v5
+; GCN-NEXT: v_bfe_u32 v1, v22, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v4, v1, v11
+; GCN-NEXT: v_bfe_u32 v1, v16, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v5, v1, v3
+; GCN-NEXT: v_bfe_u32 v1, v15, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v15, v8, v12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v1, v21, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v3, v1, v6
+; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v16, v38, v13
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v1, v18, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v29, v2
+; GCN-NEXT: v_bfe_u32 v2, v14, 8, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v2, v24, 8, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v2, v30, 8, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v2, v19, 8, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v62, v61, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v43, v55, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v43, v55, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v33, v43, v55, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v47, v41, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v58, v40, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v60, v54, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v51, v44, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v25, v45, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v26, v46, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v27, v56, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v28, v57, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v35, v42, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v23, v59, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v2, v9, v7, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v2, v5, v4, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v3, v15, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v2, v1, v16, 8
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v62
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v43
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v47
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v58
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v60
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v51
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v25
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v26
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v27
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v28
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v35
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v23
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v9
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v5
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v53, v3
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 8, v3
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v14, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v1, v17, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; kill: killed $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; kill: killed $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: .LBB47_2: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB47_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v38
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v14, v15, v14
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v29
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v14
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v14, v15, v16
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v50
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_or_b32_e32 v8, v8, v15
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v19
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v8
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GCN-NEXT: v_or_b32_e32 v8, v8, v17
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v49
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v8, v17, v8
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v16
+; GCN-NEXT: v_or_b32_e32 v8, v8, v19
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v48
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v8, v19, v8
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v18
+; GCN-NEXT: v_or_b32_e32 v8, v8, v21
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v39
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v35, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v37, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v38, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v39, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v48, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v49, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v50, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v51, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v52, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v54, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v55, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v41, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v42, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v43, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v44, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v45, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v46, v63
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v47, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v56, v14
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v14
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
+; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35
+; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36
+; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37
+; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38
+; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39
+; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34
+; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48
+; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49
+; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50
+; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51
+; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52
+; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54
+; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v24
+; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55
+; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40
+; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41
+; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v22
+; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v13
+; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42
+; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44
+; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v12
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45
+; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v11
+; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46
+; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v9
+; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47
+; GCN-NEXT: v_add_f32_e32 v63, 0x38000000, v10
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v6
+; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v5
+; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v7
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v31
+; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v32
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v17
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v36
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v37
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v38
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v39
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v34
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v48
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v49
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v50
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v51
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v52
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v54
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v57
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v55
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v40
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v41
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v58
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v59
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v43
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v58, v44
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v60
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v45
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v61
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v46
+; GCN-NEXT: v_cvt_f16_f32_e32 v61, v62
+; GCN-NEXT: v_cvt_f16_f32_e32 v60, v47
+; GCN-NEXT: v_cvt_f16_f32_e32 v47, v63
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v53
+; GCN-NEXT: v_mov_b32_e32 v53, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v56
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v33
+; GCN-NEXT: v_mov_b32_e32 v33, v16
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT: v_cvt_f16_f32_e32 v62, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v17
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GCN-NEXT: v_or_b32_e32 v59, v23, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v8
+; GCN-NEXT: v_or_b32_e32 v23, v19, v5
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GCN-NEXT: v_or_b32_e32 v42, v9, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v38
+; GCN-NEXT: v_or_b32_e32 v35, v35, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39
+; GCN-NEXT: v_or_b32_e32 v57, v28, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v51
+; GCN-NEXT: v_or_b32_e32 v28, v27, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v36
+; GCN-NEXT: v_or_b32_e32 v56, v13, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v29
+; GCN-NEXT: v_or_b32_e32 v27, v22, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v30
+; GCN-NEXT: v_or_b32_e32 v46, v24, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v32
+; GCN-NEXT: v_or_b32_e32 v26, v26, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v60
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_or_b32_e32 v45, v34, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v37
+; GCN-NEXT: v_or_b32_e32 v25, v25, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v62
+; GCN-NEXT: v_or_b32_e32 v44, v12, v14
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: v_bfe_u32 v5, v62, 8, 8
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v51, v40, v38
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v5, v18, 8, 8
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v54, v54, v13
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v5, v60, 8, 8
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_or_b32_e32 v60, v52, v29
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v1, v1, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v40, v50, v22
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v58, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_or_b32_e32 v58, v43, v30
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v41, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_or_b32_e32 v41, v61, v24
+; GCN-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v55, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v47, v47, v32
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v49, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v55, v4, v31
+; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v48, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v43, v3, v36
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v11, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v61, v2, v34
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v21, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v62, v15, v37
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v53, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_bfe_u32 v1, v33, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
+; GCN-NEXT: v_bfe_u32 v1, v19, 8, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v62, v61, 8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v43, v55, 24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v1, v43, v55, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v33, v43, v55, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v41, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v58, v40, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v60, v54, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v44, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v25, v45, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v26, v46, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v27, v56, 8 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v28, v57, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v35, v42, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v23, v59, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 8 +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v3, 8 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v3, v15, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v16, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v62 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v58 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v60 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v26 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v35 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v5 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v53, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v14, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v17, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: .LBB47_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61 +; GCN-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v62 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v2, v4 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; GCN-NEXT: v_or_b32_e32 v29, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v43 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v30, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v2, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v61, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v62, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v54 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; 
GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v7, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v8, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v9, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v25 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v10, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v11, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v12, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v13, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v27 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v22, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v24, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v28 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v25, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v26, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v27, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v28, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v23 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v23, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_mov_b32_e32 v5, v19 +; GCN-NEXT: v_or_b32_e32 v19, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v21, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_mov_b32_e32 v4, v17 +; GCN-NEXT: v_or_b32_e32 v17, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v20, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v15 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v15, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v18, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v16 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v16, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v31, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v33, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v32, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v34, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v35, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v36, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v37, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v38, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v39, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v48, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v49, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v50, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v51, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v52, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v53, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v54, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v55, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v40, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v41, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v42, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:412 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v43, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v44, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v45, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v46, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v47, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v56, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v57, v3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v58, v3, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v59, v3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v60, v3, v1 +; GCN-NEXT: 
v_and_b32_e32 v1, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v4, v1, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v5, v1, v33 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v63, v2, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v61 +; GCN-NEXT: v_or_b32_e32 v61, v3, v34 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v62 +; GCN-NEXT: v_or_b32_e32 v6, v6, v35 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v6, v36 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v29, v37 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v30, v38 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v7, v39 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v7, v48 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v8, v8, v49 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v9, v9, v50 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v51 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v11, v52 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v12, v12, v53 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v22 +; GCN-NEXT: v_or_b32_e32 v13, v13, v54 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; GCN-NEXT: v_or_b32_e32 v22, v22, v55 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; GCN-NEXT: v_or_b32_e32 v24, v24, v40 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GCN-NEXT: v_or_b32_e32 v25, v25, v41 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v26, v26, v42 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x50, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GCN-NEXT: v_or_b32_e32 v27, v27, v43 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v23, 
0xffff, v23 +; GCN-NEXT: v_or_b32_e32 v23, v23, v44 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_or_b32_e32 v19, v19, v45 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_or_b32_e32 v21, v21, v46 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v17, v17, v47 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v20, v20, v56 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v15, v15, v57 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v18, v18, v58 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v14, v59 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v16, v16, v60 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v19, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, 
v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v128i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte 
Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; 
VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v61 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v60 +; VI-NEXT: ; kill: killed $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v3 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v2 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v2 +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill 
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v61
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v61
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v60
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v30
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v30
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v28
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v27
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v26
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v26
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v24
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v24
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v23
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v22
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v21
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v20
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v20
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v19
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v18
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v17
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[7:8]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v31, v33
+; VI-NEXT: v_mov_b32_e32 v33, v43
+; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[5:6]
+; VI-NEXT: v_mov_b32_e32 v43, v33
+; VI-NEXT: v_mov_b32_e32 v33, v46
+; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4]
+; VI-NEXT: v_mov_b32_e32 v46, v33
+; VI-NEXT: v_mov_b32_e32 v33, v53
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2]
+; VI-NEXT: v_mov_b32_e32 v53, v33
+; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[60:61]
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[29:30]
+; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v33, v36
+; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26]
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; VI-NEXT: v_mov_b32_e32 v36, v33
+; VI-NEXT: v_mov_b32_e32 v33, v41
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22]
+; VI-NEXT: v_mov_b32_e32 v34, v51
+; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18]
+; VI-NEXT: v_mov_b32_e32 v41, v33
+; VI-NEXT: v_mov_b32_e32 v33, v31
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[19:20]
+; VI-NEXT: v_mov_b32_e32 v51, v34
+; VI-NEXT: .LBB47_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB47_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v63, 0x200
+; VI-NEXT: v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v18, 0x200, v18
+; VI-NEXT: v_or_b32_e32 v32, v18, v31
+; VI-NEXT: v_add_f16_sdwa v31, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v17, 0x200, v17
+; VI-NEXT: v_or_b32_e32 v31, v17, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v20, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v20, 0x200, v20
+; VI-NEXT: v_or_b32_e32 v32, v20, v31
+; VI-NEXT: v_add_f16_sdwa v31, v19, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v19
+; VI-NEXT: v_or_b32_e32 v31, v19, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v34, v22, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v34
+; VI-NEXT: v_add_f16_e32 v22, 0x200, v22
+; VI-NEXT: v_or_b32_e32 v32, v22, v31
+; VI-NEXT: v_add_f16_sdwa v31, v21, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v21, 0x200, v21
+; VI-NEXT: v_or_b32_e32 v31, v21, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v24, 0x200, v24
+; VI-NEXT: v_or_b32_e32 v32, v24, v31
+; VI-NEXT: v_add_f16_sdwa v31, v23, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v23, 0x200, v23
+; VI-NEXT: v_or_b32_e32 v31, v23, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_sdwa v31, v26, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v26, 0x200, v26
+; VI-NEXT: v_or_b32_e32 v36, v26, v31
+; VI-NEXT: v_add_f16_sdwa v31, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v25, 0x200, v25
+; VI-NEXT: v_or_b32_e32 v35, v25, v31
+; VI-NEXT: v_add_f16_sdwa v31, v28, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v28, 0x200, v28
+; VI-NEXT: v_or_b32_e32 v38, v28, v31
+; VI-NEXT: v_add_f16_sdwa v31, v27, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v27, 0x200, v27
+; VI-NEXT: v_or_b32_e32 v37, v27, v31
+; VI-NEXT: v_add_f16_sdwa v31, v30, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v30, 0x200, v30
+; VI-NEXT: v_add_f16_sdwa v32, v29, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v29, 0x200, v29
+; VI-NEXT: v_or_b32_e32 v49, v30, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_or_b32_e32 v48, v29, v31
+; VI-NEXT: v_add_f16_sdwa v31, v61, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_e32 v61, 0x200, v61
+; VI-NEXT: v_add_f16_sdwa v32, v60, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v60, 0x200, v60
+; VI-NEXT: v_or_b32_e32 v51, v61, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_or_b32_e32 v50, v60, v31
+; VI-NEXT: v_add_f16_sdwa v31, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT: v_add_f16_sdwa v32, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_or_b32_e32 v53, v2, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_or_b32_e32 v52, v1, v31
+; VI-NEXT: v_add_f16_sdwa v31, v4, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
+; VI-NEXT: v_add_f16_sdwa v32, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_or_b32_e32 v46, v4, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_or_b32_e32 v45, v3, v31
+; VI-NEXT: v_add_f16_sdwa v31, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
+; VI-NEXT: v_add_f16_sdwa v32, v5, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
+; VI-NEXT: v_or_b32_e32 v43, v6, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_add_f16_sdwa v44, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v42, v5, v31
+; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
+; VI-NEXT: v_add_f16_sdwa v32, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v44
+; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT: v_or_b32_e32 v41, v8, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_or_b32_e32 v40, v7, v31
+; VI-NEXT: v_add_f16_sdwa v31, v10, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
+; VI-NEXT: v_add_f16_sdwa v32, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT: v_or_b32_e32 v55, v10, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; VI-NEXT: v_add_f16_sdwa v39, v12, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v54, v9, v31
+; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
+; VI-NEXT: v_add_f16_sdwa v33, v11, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v39
+; VI-NEXT: v_add_f16_sdwa v47, v14, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v32, v12, v31
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v33
+; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
+; VI-NEXT: v_add_f16_sdwa v33, v13, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v47
+; VI-NEXT: v_or_b32_e32 v57, v14, v56
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v33
+; VI-NEXT: v_add_f16_sdwa v33, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v16, 0x200, v16
+; VI-NEXT: v_add_f16_sdwa v63, v15, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v33
+; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
+; VI-NEXT: v_or_b32_e32 v59, v16, v58
+; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v63
+; VI-NEXT: v_or_b32_e32 v58, v15, v58
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v59
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v58
+; VI-NEXT: v_lshrrev_b64 v[58:59], 24, v[58:59]
+; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v56, v13, v56
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v57
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v56
+; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[56:57]
+; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v31, v11, v31
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v32
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v31
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[31:32]
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v55
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v54
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[54:55]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v41
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v40
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[40:41]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v43
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v42
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v46
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v45
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v53
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v52
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51]
+; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[42:43]
+; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[45:46]
+; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53]
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[48:49]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v38
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v37
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[37:38]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v36
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v35
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36]
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v54, v39
+; VI-NEXT: v_mov_b32_e32 v37, v44
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_mov_b32_e32 v56, v58
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[48:49]
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v48, v33
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[50:51]
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51]
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v51
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v50
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v33, 8, 8
+; VI-NEXT: v_mov_b32_e32 v33, v47
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v33, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v39, 8, 8
+; VI-NEXT: v_mov_b32_e32 v39, v63
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51]
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_bfe_u32 v32, v63, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v44, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_bfe_u32 v32, v47, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v44, v32
+; VI-NEXT: v_bfe_u32 v32, v32, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v55, v32
+; VI-NEXT: v_bfe_u32 v32, v32, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v36, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_bfe_u32 v32, v32, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v58, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v57, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v59, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v34, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v53, 8, 8
+; VI-NEXT: v_mov_b32_e32 v58, v57
+; VI-NEXT: v_mov_b32_e32 v57, v59
+; VI-NEXT: v_mov_b32_e32 v59, v34
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v32, v41, 8, 8
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: .LBB47_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v52
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v34, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50
+; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31
+; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64f16_to_v128i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB47_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(45)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop
0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: .LBB47_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB47_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], 
s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, 
s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; 
GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 +; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 
v[41:42], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 +; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 +; GFX9-NEXT: .LBB47_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v128i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:16 +; 
GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr179 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: 
$vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-NEXT: .LBB47_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB47_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b64 
v[48:49], 24, v[15:16] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] +; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v25 +; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v22 +; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v17 +; GFX11-NEXT: .LBB47_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v74 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v60 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v72 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v47 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v39, v55, v39 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v63 +; GFX11-NEXT: v_or_b32_e32 v54, v67, v54 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v42 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v62 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v39 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v55, v55, v64 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v57 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v45 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v43 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v51 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-NEXT: v_or_b32_e32 v54, v64, v67 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v183 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v181 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v55 +; GFX11-NEXT: v_or_b32_e32 v53, v64, v53 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v179 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v177 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v167 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v39 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v54 +; GFX11-NEXT: v_or_b32_e32 v53, v55, v64 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v165 +; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v164 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v54 +; GFX11-NEXT: v_or_b32_e32 v52, v55, v52 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v161 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v151 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v150 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v52 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v53 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_or_b32_e32 v52, v54, v55 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v145 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v147 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v144 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: v_or_b32_e32 v51, v54, v51 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v55 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v39 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v52 +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v133 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v131 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v129 +; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v119 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v117 +; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v116 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v115 +; GFX11-NEXT: v_or_b32_e32 v51, v51, v52 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v53 +; GFX11-NEXT: v_or_b32_e32 v50, v54, v50 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v55 +; GFX11-NEXT: v_or_b32_e32 v52, v64, v67 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: v_or_b32_e32 v1, v9, v39 +; GFX11-NEXT: v_or_b32_e32 v2, v10, v51 +; GFX11-NEXT: v_or_b32_e32 v3, v11, v50 +; GFX11-NEXT: v_or_b32_e32 v4, v12, v52 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v6, 8, v102 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v100 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v10, 8, v98 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v86 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v82 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v70 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v68 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v75 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v73 +; 
GFX11-NEXT: v_lshlrev_b16 v48, 8, v66 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v39, v48 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v61 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v59 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v58 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v56 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v46 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v65 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v44 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v41 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v40 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v182 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v180 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v178 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v176 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v166 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v38, v39 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v163 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v162 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v160 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v148 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v146 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v135 +; GFX11-NEXT: 
v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v134 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v132 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v130 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v36, v35 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v114 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v113 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v112 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v103 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v101 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v99 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v87 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v85 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b16 v28, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v81 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v69 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v26, v27, v28 +; GFX11-NEXT: v_or_b32_e32 v27, v29, v30 +; GFX11-NEXT: v_or_b32_e32 v28, v31, v32 +; GFX11-NEXT: v_or_b32_e32 v29, v33, v34 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 +; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off 
offset:96 +; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v128i8_to_v64i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 
offset:988 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:992 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v49, v7 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v55, v1 +; GCN-NEXT: v_mov_b32_e32 v60, v0 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v14 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 24, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v22 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte 
Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 24, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v63, 24, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v26 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:392 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v21 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v33 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v25 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v19 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v13 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v7 +; GCN-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v8 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:188 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:184 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:344 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; 
GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:376 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:368 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded 
Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:328 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:352 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:364 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:360 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:384 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v4 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: 
killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; kill: killed $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; kill: killed $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; kill: killed $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; kill: killed $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; kill: killed $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; kill: killed $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; kill: killed $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: 
s_cbranch_execz .LBB48_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v4, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v5, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v6, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v7, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v14, v1 +; GCN-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v32, v1, v24 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v28, v1, v28 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 +; GCN-NEXT: v_or_b32_e32 v18, v18, v30 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v55 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v19, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v1 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v17 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v21, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v23, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v24, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v25, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v25, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v25, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 
4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v25, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v25, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v25, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v25, v45
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v13, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v13, v51
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v44
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v12, v38
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v26
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v12, v39
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_or_b32_e32 v56, v48, v12
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v50, v12
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_or_b32_e32 v2, v36, v13
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v53, v13
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_or_b32_e32 v3, v40, v22
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v47, v22
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v1, v26
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_or_b32_e32 v1, v63, v26
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v33, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v62, v1, v33
+; GCN-NEXT: v_and_b32_e32 v33, 0xff, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v33
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v38, 0xff, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v1, v38
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v38, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v38
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v48, 0xff, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v61, v1, v48
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v48, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v48
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v59, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v53
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v63, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v53
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v44, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v58, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v52, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v46, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v50, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v45, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v48, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v43, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v41, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v42, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v39, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v47, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v53, v1, v53
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v54, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v54, v1, v54
+; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v27
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v29
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v4
+; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v7
+; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v31
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v14
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v32
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v19
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v23
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v24
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v30
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v1
+; GCN-NEXT: v_mov_b32_e32 v1, v56
+; GCN-NEXT: v_or_b32_e32 v8, v49, v1
+; GCN-NEXT: v_or_b32_e32 v31, v51, v2
+; GCN-NEXT: v_or_b32_e32 v56, v57, v3
+; GCN-NEXT: v_or_b32_e32 v4, v55, v0
+; GCN-NEXT: v_or_b32_e32 v5, v40, v62
+; GCN-NEXT: v_or_b32_e32 v6, v37, v17
+; GCN-NEXT: v_or_b32_e32 v7, v25, v61
+; GCN-NEXT: v_or_b32_e32 v37, v34, v59
+; GCN-NEXT: v_or_b32_e32 v25, v9, v63
+; GCN-NEXT: v_or_b32_e32 v38, v10, v44
+; GCN-NEXT: v_or_b32_e32 v51, v11, v52
+; GCN-NEXT: v_or_b32_e32 v55, v12, v50
+; GCN-NEXT: v_or_b32_e32 v49, v13, v48
+; GCN-NEXT: v_or_b32_e32 v40, v14, v41
+; GCN-NEXT: v_or_b32_e32 v11, v15, v39
+; GCN-NEXT: v_or_b32_e32 v57, v16, v53
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v36, v36, v12
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v18, v14
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v10, v19, v13
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v20, v34
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v18, v21, v20
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v19, v22, v21
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v9, v23, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v24, v23
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v33, v33, v24
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v26, v58
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v27, v46
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v28, v45
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v29, v43
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v30, v42
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v35, v47
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v32, v54
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GCN-NEXT: v_mov_b32_e32 v1, v8
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v2
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v0
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v62
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v61
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v59
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v63
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v44
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v52
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v48
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v41
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v39
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v61, v1, v12, 16
+; GCN-NEXT: v_mov_b32_e32 v12, v33
+; GCN-NEXT: v_alignbit_b32 v33, v31, v14, 16
+; GCN-NEXT: v_mov_b32_e32 v27, v56
+; GCN-NEXT: v_alignbit_b32 v59, v56, v13, 16
+; GCN-NEXT: v_mov_b32_e32 v13, v19
+; GCN-NEXT: v_alignbit_b32 v29, v4, v34, 16
+; GCN-NEXT: v_alignbit_b32 v0, v5, v20, 16
+; GCN-NEXT: v_alignbit_b32 v14, v6, v21, 16
+; GCN-NEXT: v_mov_b32_e32 v21, v18
+; GCN-NEXT: v_mov_b32_e32 v18, v10
+; GCN-NEXT: v_mov_b32_e32 v56, v16
+; GCN-NEXT: v_mov_b32_e32 v16, v36
+; GCN-NEXT: v_mov_b32_e32 v10, v38
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v7, v22, 16
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v62, v37
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v37, v23, 16
+; GCN-NEXT: v_mov_b32_e32 v23, v9
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v37, v25
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v10, v58, 16
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v34, v51
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v51, v46, 16
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v58, v55
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v55, v45, 16
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v52, v49
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v49, v43, 16
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v46, v40
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v15, v40, v42, 16
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v41, v11
+; GCN-NEXT: v_alignbit_b32 v11, v11, v47, 16
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v3, v57, v54, 16
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v53
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; kill: killed $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; kill: killed $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; kill: killed $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; kill: killed $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; kill: killed $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; kill: killed $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; kill: killed $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; kill: killed $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: .LBB48_2: ; %Flow
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v25, v27
+; GCN-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB48_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v26
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v39, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v30, v2
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v44
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v38, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT: v_or_b32_e32 v1, v20, v4
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT: v_or_b32_e32 v1, v51, v5
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: v_or_b32_e32 v1, v28, v6
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GCN-NEXT: v_or_b32_e32 v6, v22, v7
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v7, v1, v8
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GCN-NEXT: v_or_b32_e32 v8, v45, v9
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GCN-NEXT: v_or_b32_e32 v9, v24, v10
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v10, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v12
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v12, v1, v13
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v13, v1, v14
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v14, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v18, v1, v18
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v19, v1, v19
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v20, v1, v20
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v21, v1, v21
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v22, v1, v22
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v26, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v30, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_mov_b32_e32 v2, v35
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v35, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v51, v1, v23
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v17
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_mov_b32_e32 v4, v49
+; GCN-NEXT: v_mov_b32_e32 v43, v42
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v42, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_mov_b32_e32 v44, v40
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v46, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v57, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v38, v1, v23
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v55
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v59, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v62, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v61, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v5, v1, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v1
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v1, v23
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v24, v23
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v24, v23
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v24, v23
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v24, v23
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_mov_b32_e32 v52, v36
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v39, v24, v23
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_mov_b32_e32 v45, v53
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v53, v24, v23
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v55, v24, v23
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v23, v24, v23
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v24, v25, v24
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v25, v27, v25
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27
+; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v27, v28, v27
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v28, v29, v28
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29
+; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v29, v31, v29
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v31, v32, v31
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v17
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v32, v33, v32
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v54
+; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v33, v34, v33
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v17
+; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v34, v36, v34
+; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v11
+; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v36, v37, v36
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v37, 0xff, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v37, v2, v37
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v49, 0xff, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v49, v2, v49
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v54, 0xff, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_or_b32_e32 v54, v63, v54
+; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v43
+; GCN-NEXT: v_and_b32_e32 v40, 0xff, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v40, v2, v40
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
+; GCN-NEXT: v_or_b32_e32 v41, v47, v41
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:988 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43
+; GCN-NEXT: v_or_b32_e32 v43, v44, v43
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:992 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v44, 0xff, v44
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44
+; GCN-NEXT: v_or_b32_e32 v44, v45, v44
+; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v15
+; GCN-NEXT: v_and_b32_e32 v45, 0xff, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45
+; GCN-NEXT: v_or_b32_e32 v45, v52, v45
+; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v3
+; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47
+; GCN-NEXT: v_or_b32_e32 v47, v50, v47
+; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v4
+; GCN-NEXT: v_and_b32_e32 v56, 0xff, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56
+; GCN-NEXT: v_or_b32_e32 v56, v48, v56
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v61, v61, v2
+; GCN-NEXT: s_movk_i32 s7, 0x300
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v2, v0, v2
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_or_b32_e32 v3, v5, v3
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_or_b32_e32 v4, v1, v4
+; GCN-NEXT: s_mov_b32 s6, 0x3000000
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13
+; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15
+; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v18
+; GCN-NEXT: v_add_i32_e32 v18, vcc, s7, v19
+; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v20
+; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v21
+; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v22
+; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v26
+; GCN-NEXT: v_add_i32_e32 v26, vcc, s7, v30
+; GCN-NEXT: v_add_i32_e32 v30, vcc, s7, v35
+; GCN-NEXT: v_add_i32_e32 v35, vcc, s7, v51
+; GCN-NEXT: v_add_i32_e32 v51, vcc, s7, v42
+; GCN-NEXT: v_add_i32_e32 v42, vcc, s7, v46
+; GCN-NEXT: v_add_i32_e32 v46, vcc, s7, v57
+; GCN-NEXT: v_add_i32_e32 v57, vcc, s7, v38
+; GCN-NEXT: v_add_i32_e32 v58, vcc, s7, v59
+; GCN-NEXT: v_add_i32_e32 v59, vcc, s7, v62
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v35
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51
+; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42
+; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v57
+; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58
+; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v59
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v38, v1
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v5, v38, v5
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v6, v38, v6
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v7, v38, v7
+; GCN-NEXT: v_or_b32_e32 v8, v39, v8
+; GCN-NEXT: v_or_b32_e32 v9, v53, v9
+; GCN-NEXT: v_or_b32_e32 v10, v55, v10
+; GCN-NEXT: v_or_b32_e32 v11, v23, v11
+; GCN-NEXT: v_or_b32_e32 v12, v24, v12
+; GCN-NEXT: v_or_b32_e32 v13, v25, v13
+; GCN-NEXT: v_or_b32_e32 v14, v27, v14
+; GCN-NEXT: v_or_b32_e32 v15, v28, v15
+; GCN-NEXT: v_or_b32_e32 v16, v29, v16
+; GCN-NEXT: v_or_b32_e32 v17, v31, v17
+; GCN-NEXT: v_or_b32_e32 v18, v32, v18
+; GCN-NEXT: v_or_b32_e32 v19, v33, v19
+; GCN-NEXT: v_or_b32_e32 v20, v34, v20
+; GCN-NEXT: v_or_b32_e32 v21, v36, v21
+; GCN-NEXT: v_or_b32_e32 v22, v37, v22
+; GCN-NEXT: v_or_b32_e32 v24, v49, v26
+; GCN-NEXT: v_or_b32_e32 v25, v54, v30
+; GCN-NEXT: v_or_b32_e32 v26, v40, v35
+; GCN-NEXT: v_or_b32_e32 v28, v41, v51
+; GCN-NEXT: v_or_b32_e32 v30, v43, v42
+; GCN-NEXT: v_or_b32_e32 v33, v44, v46
+; GCN-NEXT: v_or_b32_e32 v34, v45, v57
+; GCN-NEXT: v_or_b32_e32 v38, v47, v58
+; GCN-NEXT: v_or_b32_e32 v39, v56, v59
+; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v61
+; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v1
+; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v5
+; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v6
+; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v7
+; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v8
+; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v9
+; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11
+; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v12
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17
+; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v18
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v19
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v20
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v21
+; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v22
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v24
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v25
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v26
+; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v28
+; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v30
+; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v33
+; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v34
+; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v38
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v39
+; GCN-NEXT: v_alignbit_b32 v61, v1, v16, 16
+; GCN-NEXT: v_alignbit_b32 v33, v31, v56, 16
+; GCN-NEXT: v_alignbit_b32 v59, v25, v18, 16
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:984 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v29, v4, v8, 16
+; GCN-NEXT: v_alignbit_b32 v0, v5, v21, 16
+; GCN-NEXT: v_alignbit_b32 v2, v6, v13, 16
+; GCN-NEXT: v_alignbit_b32 v19, v7, v23, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v17, v14, 16
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v14, v2
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v19, v15, v12, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v10, v53, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v11, v52, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v48, v51, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v36, v50, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v35, v49, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v3, v32, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_alignbit_b32 v19, v27, v37, 16
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v1
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v31
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v4
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v5
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v6
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v7
+; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v62, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v37, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v10
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v34, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v58, v48
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v52, v36
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v46, v35
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v41, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:996 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB48_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v61
+; GCN-NEXT: v_or_b32_e32 v8, v8, v19
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GCN-NEXT: v_or_b32_e32 v1, v1, v19
+; GCN-NEXT: buffer_store_dword v8, v60, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 4, v60
+; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v33
+; GCN-NEXT: v_or_b32_e32 v56, v1, v8
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v44, v1, v2
+; GCN-NEXT: v_add_i32_e32 v45, vcc, 8, v60
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59
+; GCN-NEXT: v_or_b32_e32 v63, v1, v2
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 12, v60
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GCN-NEXT: v_or_b32_e32 v59, v1, v2
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 16, v60
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:984 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29
+; GCN-NEXT: v_or_b32_e32 v1, v1, v2
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 20, v60
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v60
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GCN-NEXT: v_or_b32_e32 v2, v2, v3
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 28,
v60 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v60 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; GCN-NEXT: v_or_b32_e32 v61, v2, v3 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 36, v60 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v47, v2, v3 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v60 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v23 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v23, v3, v5 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 48, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v15, v3, v5 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v62 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v57, v3, v5 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v21, v3, v5 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v37 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v3, v5 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v25, v3, v5 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v10 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v27, v3, v5 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v30, v3, v5 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x4c, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v32, v3, v5 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x50, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v34, v3, v5 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x54, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v29, v3, v5 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v37, v3, v5 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v5, v3, v5 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v48, v3, v7 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v46 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v50, v3, v7 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v55, v3, v7 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v60 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v41 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v54, v3, v7 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x70, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:876 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v41, v3, v7 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x74, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:996 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v52, v3, v7 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x78, v60 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v60 +; GCN-NEXT: buffer_store_dword v56, v45, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v63, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v59, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v47, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v29, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v37, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v41, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v128i8_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 
; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v23 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v6 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshlrev_b16_e32 v0, 8, v14 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v26 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v27 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v53 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v42 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v43 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264 +; VI-NEXT: buffer_load_ushort v2, off, 
s[0:3], s32 offset:272 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; 
VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368 +; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 +; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: 
s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v2, v2, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v4, v4, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v5, v5, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa 
v5, v5, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v10, v63, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v12, v58, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v14, v45, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v8, v62, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v10, v57, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v11, v56, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v13, v44, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v18, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v17, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v26, 
v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v31, v31, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; kill: killed $vgpr37 
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; kill: killed $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v30, v30, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v30, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v31, v31, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; kill: killed $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: .LBB48_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB48_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v18, 0x300
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_add_u16_sdwa v4, v0, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_or_b32_sdwa v29, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v0, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v29, 0x300, v29
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v2, 0x300, v3
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v2, v2, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v3
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v1, 0x300, v1
+; VI-NEXT: v_or_b32_e32 v1, v1, v4
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v3, v49, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v3, v3, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v39, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v4, 0x300, v4
+; VI-NEXT: v_or_b32_e32 v3, v4, v3
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v6, 0x300, v6
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v5, 0x300, v5
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v4, v4, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v5, v4
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v5, v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v6, v5
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v32, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v6, v6, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v32, v32, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v28, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v28, 0x300, v28
+; VI-NEXT: v_or_b32_e32 v28, v28, v32
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v33, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v33, v33, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v27, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v27, 0x300, v27
+; VI-NEXT: v_or_b32_e32 v27, v27, v33
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v34, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v34, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v26, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v26, 0x300, v26
+; VI-NEXT: v_or_b32_e32 v26, v26, v34
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v35, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v35, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v25, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
+; VI-NEXT: v_or_b32_e32 v6, v7, v6
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v25, 0x300, v25
+; VI-NEXT: v_or_b32_e32 v25, v25, v35
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v36, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v36, v36, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_or_b32_sdwa v24, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v7, v7, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v24, 0x300, v24
+; VI-NEXT: v_or_b32_e32 v24, v24, v36
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v8, 3, v8
+; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v8, 0x300, v8
+; VI-NEXT: v_or_b32_e32 v7, v8, v7
+; VI-NEXT: v_add_u16_e32 v8, 3, v61
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v9, 3, v62
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v8, v8, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v9, 0x300, v9
+; VI-NEXT: v_or_b32_e32 v8, v9, v8
+; VI-NEXT: v_add_u16_e32 v9, 3, v63
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v10, 3, v59
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v9, v9, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v10, 0x300, v10
+; VI-NEXT: v_or_b32_e32 v9, v10, v9
+; VI-NEXT: v_add_u16_e32 v10, 3, v60
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v11, 3, v57
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v10, v10, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v11, 0x300, v11
+; VI-NEXT: v_or_b32_e32 v10, v11, v10
+; VI-NEXT: v_add_u16_e32 v11, 3, v58
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v12, 3, v56
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v11, v11, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v12, 0x300, v12
+; VI-NEXT: v_or_b32_e32 v11, v12, v11
+; VI-NEXT: v_add_u16_e32 v12, 3, v47
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v13, 3, v46
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v12, v12, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v13, 0x300, v13
+; VI-NEXT: v_or_b32_e32 v12, v13, v12
+; VI-NEXT: v_add_u16_e32 v13, 3, v45
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v14, 3, v44
+; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v13, v13, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v14, 0x300, v14
+; VI-NEXT: v_or_b32_e32 v13, v14, v13
+; VI-NEXT: v_add_u16_e32 v14, 3, v43
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v15, 3, v42
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v14, v14, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v15, 0x300, v15
+; VI-NEXT: v_or_b32_e32 v14, v15, v14
+; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v15, 3, v15
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v15, v15, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v16, 0x300, v16
+; VI-NEXT: v_or_b32_e32 v15, v16, v15
+; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: v_or_b32_sdwa v17, v19, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v17
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v19, 3, v19
+; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_sdwa v20, v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v21, v16, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v17, v17, v21
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v16, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v19, 0x300, v20
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_e32 v16, v19, v16
+; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v21, 3, v21
+; VI-NEXT: v_or_b32_sdwa v30, v38, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v30, 0x300, v30
+; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_sdwa v31, v50, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v31, 0x300, v31
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v21, 3, v21
+; VI-NEXT: v_or_b32_sdwa v21, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v37, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v40, v21, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v37, v37, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v29, v29, v40
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v23, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v23, 0x300, v23
+; VI-NEXT: v_or_b32_e32 v23, v23, v37
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_or_b32_sdwa v38, v38, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_sdwa v20, v48, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v55, v20, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v38, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v30, v30, v55
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_or_b32_sdwa v22, v39, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v22, 0x300, v22
+; VI-NEXT: v_or_b32_e32 v22, v22, v38
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v39, 3, v39
+; VI-NEXT: v_or_b32_sdwa v39, v48, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v21, v39, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v48, 3, v48
+; VI-NEXT: v_or_b32_sdwa v48, v49, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v49, 3, v49
+; VI-NEXT: v_or_b32_sdwa v49, v50, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v20, v49, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v50, 3, v50
+; VI-NEXT: v_or_b32_sdwa v50, v51, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v19, 3, v19
+; VI-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v39, 3, v39
+; VI-NEXT: v_or_b32_sdwa v39, v49, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v39
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v51, 3, v51
+; VI-NEXT: v_or_b32_sdwa v51, v52, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v52, 3, v52
+; VI-NEXT: v_or_b32_sdwa v52, v53, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v53, 3, v53
+; VI-NEXT: v_or_b32_sdwa v53, v54, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v54, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v19, v51, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v18, v53, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v18, v39, v18
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v52
+; VI-NEXT: v_or_b32_e32 v19, v39, v19
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v50
+; VI-NEXT: v_or_b32_e32 v20, v39, v20
+; VI-NEXT: v_add_u16_e32 v39, 0x300, v48
+; VI-NEXT: v_or_b32_e32 v21, v39, v21
+; VI-NEXT: v_or_b32_e32 v31, v31, v54
+; VI-NEXT: .LBB48_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v128i8_to_v64i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88
+; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:96
+; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:104
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:128
+; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:136
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:144
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:152
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:160
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:168
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:176
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:184
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124
+; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v27
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7
+; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v9
+; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v13
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v15
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v21
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v23
+; GFX9-NEXT: s_waitcnt vmcnt(27)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v6
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v10
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v14
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v18
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v26
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(26)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28
+; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:132
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:140
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v31
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v25
+; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v37
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v39
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v52
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v53
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v42
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v43
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v44
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:200
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:208
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:216
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:188
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:196
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:204
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:212
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:224
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:232
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:240
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:248
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:220
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:228
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:236
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:244
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:264
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:272
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:280
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:252
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:260
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:268
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:276
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:296
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:304
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:284
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:292
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:300
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:308
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:328
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:336
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:344
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:316
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:324
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v2
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v3
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:332
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:340
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:360
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:368
+; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:376
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:348
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v2
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:356
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:364
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:372
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384
+; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:380
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB48_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_or_b32_sdwa v10, v62, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v11, v60, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_or_b32_sdwa v12, v56, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v13, v47, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v14, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v15, v43, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v8, v63, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v9, v61, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6
+; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v9, v59, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6
+; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v10, v58, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6
+; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v11, v57, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6
+; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v46, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v45, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v17, v17, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v16, v17, v16, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v17, v19, v18, s6 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v18, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v18, v19, v18, s6 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v19, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v19, v20, v19, s6 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v20, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v20, v21, v20, s6 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v21, v21, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v21, v22, v21, s6 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v22, v22, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v22, v23, v22, s6 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v23, v23, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v23, v24, v23, s6 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v24, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v24, v25, v24, s6 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v25, v25, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v25, v26, v25, s6 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v26, v26, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v26, v27, v26, s6 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v27, v27, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v27, v28, v27, s6 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v28, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v28, v29, v28, s6 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v29, v29, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v32, v32, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v29, v30, v29, s6 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; 
GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; kill: killed $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v30, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v30, v31, v30, s6 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v31, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; 
implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; kill: killed $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:808 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v24, 3, v24 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_or_b32_sdwa v24, v25, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_perm_b32 v1, v1, v4, s6 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v25, 3, v25 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v48, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v38, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v26, 3, v26 +; GFX9-NEXT: 
v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_e32 v35, 0x300, v25 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v36, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v22, v23, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v22 +; GFX9-NEXT: v_add_u16_e32 v36, 0x300, v36 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v33, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v23, 3, v23 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v37, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 3, v21 +; GFX9-NEXT: v_or_b32_sdwa v21, v39, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v33, 0x300, v21 +; GFX9-NEXT: v_add_u16_e32 v34, 0x300, v23 +; GFX9-NEXT: v_perm_b32 v29, v34, v29, s6 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v32, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; 
GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v38, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v39, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v7, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v39 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: v_or_b32_sdwa v48, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v63 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v61 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v59 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v58 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; 
GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v57 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v15, 3, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v16 +; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_add_u16_e32 v17, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v16 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v18 +; GFX9-NEXT: v_perm_b32 v17, v17, v20, s6 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v19 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v16, v18, v16, s6 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v20, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v20, v49, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v20 +; GFX9-NEXT: v_perm_b32 v30, v33, v30, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v52, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v39, 0x300, v50 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v51 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v52, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v19, v53, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v32, 0x300, v19 +; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v53, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v53 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v54, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v55 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v40, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v50, 0x300, v40 +; GFX9-NEXT: v_perm_b32 v21, v50, v21, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v41, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v41 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v42, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v51, 0x300, v42 +; GFX9-NEXT: v_perm_b32 v20, v51, v20, s6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v43, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v44, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v45, v28, v27 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v45 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v27, 3, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v46, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v24 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v26 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v37 +; GFX9-NEXT: v_add_u16_e32 v37, 0x300, v38 +; GFX9-NEXT: v_add_u16_e32 v38, 0x300, v48 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v49 +; GFX9-NEXT: v_add_u16_e32 v48, 0x300, v52 +; GFX9-NEXT: v_add_u16_e32 v49, 0x300, v54 +; GFX9-NEXT: v_add_u16_e32 v52, 0x300, v44 +; GFX9-NEXT: v_add_u16_e32 v53, 0x300, v46 +; GFX9-NEXT: v_perm_b32 v18, v53, v18, s6 +; GFX9-NEXT: v_perm_b32 v19, v52, v19, s6 +; GFX9-NEXT: v_perm_b32 v22, v49, v22, s6 +; GFX9-NEXT: v_perm_b32 v23, v48, v23, s6 +; GFX9-NEXT: v_perm_b32 v24, v39, v24, s6 +; GFX9-NEXT: v_perm_b32 v25, v38, v25, s6 +; GFX9-NEXT: v_perm_b32 v26, v37, v26, s6 +; GFX9-NEXT: v_perm_b32 v27, v36, v27, s6 +; GFX9-NEXT: v_perm_b32 v28, v35, v28, s6 +; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v128i8_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:580 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:576 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:572 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:568 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:564 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:560 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:556 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:552 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:548 +; GFX11-NEXT: 
scratch_store_b32 off, v57, s32 offset:544 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:540 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:536 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:532 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:528 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:524 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:520 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:516 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:512 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:508 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:504 +; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:500 +; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:496 +; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:492 +; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:488 +; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:484 +; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:480 +; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:476 +; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:472 +; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:468 +; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:464 +; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:460 +; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:456 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:452 +; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:448 +; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:444 +; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:440 +; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:436 +; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:432 +; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:428 +; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:424 +; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:420 +; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:416 +; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:412 +; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:408 +; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:404 +; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:400 +; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:396 +; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:392 +; GFX11-NEXT: v_dual_mov_b32 v37, v30 :: v_dual_mov_b32 v54, v24 +; GFX11-NEXT: v_dual_mov_b32 v55, v28 :: v_dual_mov_b32 v52, v26 +; GFX11-NEXT: v_dual_mov_b32 v50, v22 :: v_dual_mov_b32 v53, v20 +; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v51, v16 +; GFX11-NEXT: v_dual_mov_b32 v36, v14 :: v_dual_mov_b32 v49, v12 +; GFX11-NEXT: v_dual_mov_b32 v32, v10 :: v_dual_mov_b32 v39, v4 +; GFX11-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v33, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:384 +; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:380 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:376 +; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:372 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:368 +; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:364 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:360 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:356 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:352 +; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:348 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:344 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:340 +; GFX11-NEXT: 
scratch_load_u16 v12, off, s32 offset:336 +; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:332 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 offset:328 +; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:324 +; GFX11-NEXT: scratch_load_u16 v16, off, s32 offset:320 +; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:316 +; GFX11-NEXT: scratch_load_u16 v18, off, s32 offset:312 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:308 +; GFX11-NEXT: scratch_load_u16 v20, off, s32 offset:304 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:300 +; GFX11-NEXT: scratch_load_u16 v22, off, s32 offset:296 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:292 +; GFX11-NEXT: scratch_load_u16 v24, off, s32 offset:288 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:284 +; GFX11-NEXT: scratch_load_u16 v26, off, s32 offset:280 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:276 +; GFX11-NEXT: scratch_load_u16 v28, off, s32 offset:272 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:268 +; GFX11-NEXT: scratch_load_u16 v30, off, s32 offset:264 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:260 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v31, off, s32 offset:256 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:252 +; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:248 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:244 +; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:240 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:236 +; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:232 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:228 +; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:224 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:220 +; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:216 +; GFX11-NEXT: scratch_load_b32 v150, off, s32 offset:388 +; GFX11-NEXT: scratch_load_u16 v182, off, s32 +; GFX11-NEXT: scratch_load_u16 v40, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v43, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v44, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v45, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v46, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v47, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v56, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v58, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v59, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v60, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v61, off, s32 offset:88 +; GFX11-NEXT: scratch_load_u16 v62, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:136 +; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:144 +; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:152 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:160 +; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:168 +; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:176 +; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:184 +; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:192 +; GFX11-NEXT: scratch_load_u16 v104, off, s32 offset:200 +; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:208 +; GFX11-NEXT: scratch_load_u16 v42, off, s32 offset:212 +; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:204 +; GFX11-NEXT: scratch_load_u16 v183, off, s32 
offset:196 +; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:188 +; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:180 +; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:172 +; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:164 +; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:156 +; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:148 +; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:140 +; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v70, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v82, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v128, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v132, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v161, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v160, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v176, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v167, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v181, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(62) +; GFX11-NEXT: v_lshlrev_b16 v127, 8, v0 +; GFX11-NEXT: v_lshlrev_b16 v126, 8, v2 +; GFX11-NEXT: v_lshlrev_b16 v124, 8, v4 +; GFX11-NEXT: v_lshlrev_b16 v125, 8, v6 +; GFX11-NEXT: v_lshlrev_b16 v120, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v123, 8, v10 +; GFX11-NEXT: v_lshlrev_b16 v121, 8, v12 +; GFX11-NEXT: v_lshlrev_b16 v122, 8, v14 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: v_lshlrev_b16 v106, 8, v16 +; GFX11-NEXT: v_lshlrev_b16 v111, 8, v18 +; GFX11-NEXT: v_lshlrev_b16 v109, 8, v20 +; GFX11-NEXT: v_lshlrev_b16 v110, 8, v22 +; GFX11-NEXT: v_lshlrev_b16 v107, 8, v24 +; GFX11-NEXT: v_lshlrev_b16 v108, 8, v26 +; GFX11-NEXT: s_waitcnt vmcnt(61) +; GFX11-NEXT: v_lshlrev_b16 v88, 8, v88 +; GFX11-NEXT: s_waitcnt vmcnt(59) +; GFX11-NEXT: v_lshlrev_b16 v93, 8, v93 +; GFX11-NEXT: s_waitcnt vmcnt(57) +; GFX11-NEXT: v_lshlrev_b16 v91, 8, v91 +; GFX11-NEXT: s_waitcnt vmcnt(55) +; GFX11-NEXT: v_lshlrev_b16 v92, 8, v92 +; GFX11-NEXT: s_waitcnt vmcnt(54) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v150 +; GFX11-NEXT: s_waitcnt vmcnt(53) +; GFX11-NEXT: v_lshlrev_b16 v150, 8, v182 +; GFX11-NEXT: s_waitcnt vmcnt(52) +; GFX11-NEXT: v_lshlrev_b16 v41, 8, v40 +; GFX11-NEXT: s_waitcnt vmcnt(51) +; GFX11-NEXT: v_lshlrev_b16 v40, 8, v43 +; GFX11-NEXT: s_waitcnt vmcnt(50) +; GFX11-NEXT: v_lshlrev_b16 v43, 8, v44 +; GFX11-NEXT: s_waitcnt vmcnt(49) +; GFX11-NEXT: v_lshlrev_b16 v182, 8, 
v45 +; GFX11-NEXT: s_waitcnt vmcnt(48) +; GFX11-NEXT: v_lshlrev_b16 v46, 8, v46 +; GFX11-NEXT: s_waitcnt vmcnt(47) +; GFX11-NEXT: v_lshlrev_b16 v45, 8, v47 +; GFX11-NEXT: s_waitcnt vmcnt(46) +; GFX11-NEXT: v_lshlrev_b16 v57, 8, v56 +; GFX11-NEXT: s_waitcnt vmcnt(45) +; GFX11-NEXT: v_lshlrev_b16 v56, 8, v58 +; GFX11-NEXT: s_waitcnt vmcnt(44) +; GFX11-NEXT: v_lshlrev_b16 v58, 8, v59 +; GFX11-NEXT: s_waitcnt vmcnt(43) +; GFX11-NEXT: v_lshlrev_b16 v44, 8, v60 +; GFX11-NEXT: s_waitcnt vmcnt(42) +; GFX11-NEXT: v_lshlrev_b16 v60, 8, v61 +; GFX11-NEXT: s_waitcnt vmcnt(41) +; GFX11-NEXT: v_lshlrev_b16 v59, 8, v62 +; GFX11-NEXT: s_waitcnt vmcnt(40) +; GFX11-NEXT: v_lshlrev_b16 v62, 8, v63 +; GFX11-NEXT: s_waitcnt vmcnt(39) +; GFX11-NEXT: v_lshlrev_b16 v47, 8, v72 +; GFX11-NEXT: s_waitcnt vmcnt(38) +; GFX11-NEXT: v_lshlrev_b16 v72, 8, v73 +; GFX11-NEXT: s_waitcnt vmcnt(37) +; GFX11-NEXT: v_lshlrev_b16 v63, 8, v74 +; GFX11-NEXT: s_waitcnt vmcnt(36) +; GFX11-NEXT: v_lshlrev_b16 v74, 8, v75 +; GFX11-NEXT: s_waitcnt vmcnt(35) +; GFX11-NEXT: v_lshlrev_b16 v73, 8, v76 +; GFX11-NEXT: s_waitcnt vmcnt(34) +; GFX11-NEXT: v_lshlrev_b16 v75, 8, v77 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v61, 8, v78 +; GFX11-NEXT: s_waitcnt vmcnt(32) +; GFX11-NEXT: v_lshlrev_b16 v78, 8, v79 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v77, 8, v89 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_lshlrev_b16 v79, 8, v90 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v76, 8, v95 +; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: v_lshlrev_b16 v90, 8, v104 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v89, 8, v105 +; GFX11-NEXT: v_lshlrev_b16 v104, 8, v94 +; GFX11-NEXT: v_lshlrev_b16 v95, 8, v31 +; GFX11-NEXT: v_lshlrev_b16 v105, 8, v30 +; GFX11-NEXT: v_lshlrev_b16 v94, 8, v28 +; GFX11-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v48 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v70 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v71 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v84 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v82 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v51 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v34 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v52 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v66 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v128 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v113 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v132 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v100 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v161 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v160 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v176 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v167 +; GFX11-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v5, 
v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v11, v10, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v37 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v102 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v87 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v114 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v96 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v133 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v117 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v135 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v130 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v181 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v150 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v41 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v40 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v43 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v182 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v46 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v45 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v57 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v56 +; GFX11-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v10, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v12, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v14, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v16, v15, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v147 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v119 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v149 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v144 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v162 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v146 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v178 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v164 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v151 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v148 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v58 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v44 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v60 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v59 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v62 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v47 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v72 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v63 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v74 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v73 +; GFX11-NEXT: v_perm_b32 v12, v13, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v15, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v17, v16, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v19, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v16, v21, v20, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v166 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v145 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v177 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v163 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v179 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v165 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v183 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v180 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v42 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v65 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v75 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v61 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v78 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v77 +; GFX11-NEXT: v_or_b32_e32 v21, v21, v79 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v76 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v90 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v89 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v92 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v91 +; GFX11-NEXT: v_perm_b32 v17, v18, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v18, v20, v19, 0x5040100 +; GFX11-NEXT: v_perm_b32 v19, v22, v21, 0x5040100 +; GFX11-NEXT: v_perm_b32 v20, v24, v23, 0x5040100 +; GFX11-NEXT: v_perm_b32 v21, v26, v25, 0x5040100 +; 
GFX11-NEXT: v_and_b32_e32 v22, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v68 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v85 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v67 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v97 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v83 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v101 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v22, v22, v93 +; GFX11-NEXT: v_or_b32_e32 v23, v23, v88 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v104 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v95 +; GFX11-NEXT: v_or_b32_e32 v26, v26, v105 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v94 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v108 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v107 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v110 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v109 +; GFX11-NEXT: v_perm_b32 v22, v23, v22, 0x5040100 +; GFX11-NEXT: v_perm_b32 v23, v25, v24, 0x5040100 +; GFX11-NEXT: v_perm_b32 v24, v27, v26, 0x5040100 +; GFX11-NEXT: v_perm_b32 v25, v29, v28, 0x5040100 +; GFX11-NEXT: v_perm_b32 v26, v31, v30, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v103 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v112 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v99 +; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v129 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v98 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v131 +; GFX11-NEXT: v_and_b32_e32 v34, 0xff, v116 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v134 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v118 +; GFX11-NEXT: v_or_b32_e32 v27, v27, v111 +; GFX11-NEXT: v_or_b32_e32 v28, v28, v106 +; GFX11-NEXT: v_or_b32_e32 v29, v29, v122 +; GFX11-NEXT: v_or_b32_e32 v30, v30, v121 +; GFX11-NEXT: v_or_b32_e32 v31, v31, v123 +; GFX11-NEXT: v_or_b32_e32 v32, v32, v120 +; GFX11-NEXT: v_or_b32_e32 v33, v33, v125 +; GFX11-NEXT: v_or_b32_e32 v34, v34, v124 +; GFX11-NEXT: v_or_b32_e32 v35, v35, v126 +; GFX11-NEXT: v_or_b32_e32 v36, v36, v127 +; GFX11-NEXT: v_perm_b32 v27, v28, v27, 0x5040100 +; GFX11-NEXT: v_perm_b32 v28, v30, v29, 0x5040100 +; GFX11-NEXT: v_perm_b32 v29, v32, v31, 0x5040100 +; GFX11-NEXT: v_perm_b32 v30, v34, v33, 0x5040100 +; GFX11-NEXT: v_perm_b32 v31, v36, v35, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr151 
+; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr179 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr78 +; GFX11-NEXT: ; implicit-def: $vgpr77 +; GFX11-NEXT: ; implicit-def: $vgpr79 +; GFX11-NEXT: ; implicit-def: $vgpr76 +; GFX11-NEXT: ; implicit-def: $vgpr90 +; GFX11-NEXT: ; implicit-def: $vgpr89 +; GFX11-NEXT: ; implicit-def: $vgpr92 +; GFX11-NEXT: ; implicit-def: $vgpr91 +; GFX11-NEXT: ; implicit-def: $vgpr93 +; GFX11-NEXT: ; implicit-def: $vgpr88 +; GFX11-NEXT: ; implicit-def: $vgpr104 +; GFX11-NEXT: ; implicit-def: $vgpr95 +; GFX11-NEXT: ; implicit-def: $vgpr105 +; GFX11-NEXT: ; implicit-def: $vgpr94 +; GFX11-NEXT: ; implicit-def: $vgpr108 +; GFX11-NEXT: ; implicit-def: $vgpr107 +; GFX11-NEXT: ; implicit-def: $vgpr110 +; GFX11-NEXT: ; implicit-def: $vgpr109 +; GFX11-NEXT: ; implicit-def: $vgpr111 +; GFX11-NEXT: ; implicit-def: $vgpr106 +; GFX11-NEXT: ; implicit-def: $vgpr122 +; GFX11-NEXT: ; implicit-def: $vgpr121 +; GFX11-NEXT: ; implicit-def: $vgpr123 +; 
GFX11-NEXT: ; implicit-def: $vgpr120 +; GFX11-NEXT: ; implicit-def: $vgpr125 +; GFX11-NEXT: ; implicit-def: $vgpr124 +; GFX11-NEXT: ; implicit-def: $vgpr126 +; GFX11-NEXT: ; implicit-def: $vgpr127 +; GFX11-NEXT: .LBB48_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v134, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v118, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v131, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v116, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v129, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_add_nc_u16 v35, v35, 3 +; GFX11-NEXT: v_or_b32_e32 v0, v126, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v127, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v125, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v124, v3 +; GFX11-NEXT: v_add_nc_u16 v33, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v31, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v98, 3 +; GFX11-NEXT: v_add_nc_u16 v116, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v30, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v98, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v1, v112, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_add_nc_u16 v3, v99, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v103, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v123, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v120, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v122, v1 +; GFX11-NEXT: v_add_nc_u16 v29, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v121, v3 +; GFX11-NEXT: v_add_nc_u16 v99, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v111, v4 +; GFX11-NEXT: v_add_nc_u16 v28, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v81, 3 +; GFX11-NEXT: v_add_nc_u16 v81, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v101, 3 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v97, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v83, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v106, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v110, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v109, v0 +; GFX11-NEXT: v_add_nc_u16 v83, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v108, v3 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v107, v4 +; GFX11-NEXT: v_add_nc_u16 v86, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v67, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v80, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v68, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v69, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v105, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v94, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v104, v2 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v95, v3 +; GFX11-NEXT: v_add_nc_u16 v68, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v93, v4 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v64, 3 +; GFX11-NEXT: v_add_nc_u16 
v64, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: v_add_nc_u16 v0, v42, 3 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v65, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: v_add_nc_u16 v3, v183, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v180, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v88, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v92, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v91, v1 +; GFX11-NEXT: v_add_nc_u16 v65, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v90, v3 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v89, v4 +; GFX11-NEXT: v_add_nc_u16 v69, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_add_nc_u16 v1, v179, 3 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v165, 3 +; GFX11-NEXT: v_add_nc_u16 v80, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_add_nc_u16 v0, v177, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v163, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_add_nc_u16 v4, v166, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v79, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v76, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v78, v0 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v77, v3 +; GFX11-NEXT: v_add_nc_u16 v85, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v75, v4 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v145, 3 +; GFX11-NEXT: v_add_nc_u16 v97, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_add_nc_u16 v1, v151, 3 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v148, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_add_nc_u16 v3, v178, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v164, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v61, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v74, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v73, v2 +; GFX11-NEXT: v_add_nc_u16 v101, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v72, v3 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v63, v4 +; GFX11-NEXT: v_add_nc_u16 v103, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_add_nc_u16 v2, v162, 3 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v146, 3 +; GFX11-NEXT: v_add_nc_u16 v112, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_add_nc_u16 v1, v149, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v144, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_add_nc_u16 v4, v147, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v62, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v47, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v60, v1 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v59, v3 +; GFX11-NEXT: v_add_nc_u16 v118, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v58, v4 +; GFX11-NEXT: 
v_add_nc_u16 v13, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v119, 3 +; GFX11-NEXT: v_add_nc_u16 v119, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_nc_u16 v2, v135, 3 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v130, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add_nc_u16 v3, v133, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v117, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v44, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v57, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v56, v0 +; GFX11-NEXT: v_add_nc_u16 v117, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v46, v3 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v45, v4 +; GFX11-NEXT: v_add_nc_u16 v129, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_nc_u16 v0, v114, 3 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v96, 3 +; GFX11-NEXT: v_add_nc_u16 v96, 0x300, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v2, v102, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v55, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v43, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v182, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v41, v2 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v40, v3 +; GFX11-NEXT: v_add_nc_u16 v55, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v181, v4 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v37, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v54, 3 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v52, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v53, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v50, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v150, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v176, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v167, v1 +; GFX11-NEXT: v_add_nc_u16 v50, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v161, v3 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v160, v4 +; GFX11-NEXT: v_add_nc_u16 v52, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v51, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v49, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v48, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v36, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v132, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v128, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v115, v0 +; GFX11-NEXT: v_add_nc_u16 v34, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v100, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v113, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v39, 3 +; GFX11-NEXT: v_add_nc_u16 v36, v38, 3 +; GFX11-NEXT: 
v_add_nc_u16 v32, v32, 3 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GFX11-NEXT: v_or_b32_e32 v35, v71, v35 +; GFX11-NEXT: v_or_b32_e32 v33, v82, v33 +; GFX11-NEXT: v_or_b32_e32 v0, v84, v0 +; GFX11-NEXT: v_or_b32_e32 v36, v70, v36 +; GFX11-NEXT: v_or_b32_e32 v32, v66, v32 +; GFX11-NEXT: v_add_nc_u16 v35, 0x300, v35 +; GFX11-NEXT: v_add_nc_u16 v33, 0x300, v33 +; GFX11-NEXT: v_add_nc_u16 v38, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v36 +; GFX11-NEXT: v_add_nc_u16 v32, 0x300, v32 +; GFX11-NEXT: v_add_nc_u16 v36, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v39, 0x300, v1 +; GFX11-NEXT: v_perm_b32 v1, v33, v38, 0x5040100 +; GFX11-NEXT: v_perm_b32 v0, v35, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v32, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v36, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v39, v34, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v52, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v50, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v37, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v55, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v96, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v129, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v117, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v119, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v118, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v112, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v16, v103, v16, 0x5040100 +; GFX11-NEXT: v_perm_b32 v17, v101, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v18, v97, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v19, v85, v19, 0x5040100 +; GFX11-NEXT: v_perm_b32 v20, v80, v20, 0x5040100 +; GFX11-NEXT: v_perm_b32 v21, v69, v21, 0x5040100 +; GFX11-NEXT: v_perm_b32 v22, v65, v22, 0x5040100 +; GFX11-NEXT: v_perm_b32 v23, v64, v23, 0x5040100 +; GFX11-NEXT: v_perm_b32 v24, v68, v24, 0x5040100 +; GFX11-NEXT: v_perm_b32 v25, v67, v25, 0x5040100 +; GFX11-NEXT: v_perm_b32 v26, v86, v26, 0x5040100 +; GFX11-NEXT: v_perm_b32 v27, v83, v27, 0x5040100 +; GFX11-NEXT: v_perm_b32 v28, v81, v28, 0x5040100 +; GFX11-NEXT: v_perm_b32 v29, v99, v29, 0x5040100 +; GFX11-NEXT: v_perm_b32 v30, v98, v30, 0x5040100 +; GFX11-NEXT: v_perm_b32 v31, v116, v31, 0x5040100 +; GFX11-NEXT: .LBB48_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:392 +; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:396 +; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:400 +; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:404 +; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:408 +; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:412 +; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:416 +; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:420 +; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:424 +; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:428 +; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:432 +; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:436 +; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:440 +; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:444 +; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:448 +; GFX11-NEXT: scratch_load_b32 v104, off, s32 offset:452 +; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:456 +; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:460 +; GFX11-NEXT: 
scratch_load_b32 v93, off, s32 offset:464 +; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:468 +; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:472 +; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:476 +; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:480 +; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:484 +; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:488 +; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:492 +; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:496 +; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:500 +; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:504 +; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:508 +; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:512 +; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:516 +; GFX11-NEXT: s_clause 0xf +; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:520 +; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:524 +; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:528 +; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:532 +; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:536 +; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:540 +; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:544 +; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:548 +; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:552 +; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:556 +; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:560 +; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:564 +; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:568 +; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:572 +; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:576 +; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:580 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <128 x i8> %a, splat (i8 3) + %a2 = bitcast <128 x i8> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <128 x i8> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i16_to_v128i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill 
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v37 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; 
GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; 
implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; kill: killed $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; kill: killed $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; kill: killed $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; kill: killed $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; kill: killed $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; kill: killed $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; kill: killed $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; kill: killed $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; kill: killed $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; kill: killed $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v40, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_or_b32_e32 v54, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v50, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GCN-NEXT: v_or_b32_e32 v51, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GCN-NEXT: v_or_b32_e32 v53, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v55, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; GCN-NEXT: v_or_b32_e32 v41, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v42, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; GCN-NEXT: v_or_b32_e32 v43, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v44, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GCN-NEXT: v_or_b32_e32 v45, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v46, v1, 
v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; GCN-NEXT: v_or_b32_e32 v47, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v56, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GCN-NEXT: v_or_b32_e32 v57, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v58, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_or_b32_e32 v59, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v60, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_or_b32_e32 v61, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v23 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v62, v1, v6 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; GCN-NEXT: v_or_b32_e32 v63, v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; GCN-NEXT: v_or_b32_e32 v6, v5, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v10, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; GCN-NEXT: v_or_b32_e32 v14, v13, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v15, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: v_or_b32_e32 v22, v17, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v26, v19, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v30, v21, v27 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v32, v23, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v33, v33, v25 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v4, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v8, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v12, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v16, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v24, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v28, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v31, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_bfe_u32 v1, v7, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v9, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v49, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v48, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v39, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v11, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v38, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v1, v37, 8, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v5, v54, v40, 24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v40, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v40, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt 
expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 
; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v59 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v61 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v63 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; kill: killed $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: 
killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; kill: killed $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: .LBB49_2: ; %Flow +; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_4 
+; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v29, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v4, v25, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v6, v34, v6 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v27, v7 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v36 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v3 +; 
GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v33, v8 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v33, v9 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v33, v10 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v33, v11 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v33, v12 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v33, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v14, v16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v14, v17 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v14, v15 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:744 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v14, v18 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v14, v5 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v14, v13 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v14, v19 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v14, v20 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v14, v21 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v14, v22 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v14, v23 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v14, v24 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v14, v25 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v37, v14, v26 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v14, v27 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v14, v28 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v14, v29 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v14, v30 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v14, v31 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v39, v14, v32 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v14, v1 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v14, v3 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x30000, v2 +; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v34 +; GCN-NEXT: v_add_i32_e32 v62, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v63, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v60, vcc, s6, v15 +; 
GCN-NEXT: v_add_i32_e32 v61, vcc, s6, v35 +; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v59, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v57, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v46, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v47, vcc, s6, v36 +; GCN-NEXT: v_add_i32_e32 v44, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v45, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v37 +; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v29 +; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v38 +; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v31 +; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v39 +; GCN-NEXT: v_add_i32_e32 v40, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v3 +; GCN-NEXT: v_alignbit_b32 v5, v54, v40, 24 +; GCN-NEXT: v_alignbit_b32 v9, v54, v40, 16 +; GCN-NEXT: v_alignbit_b32 v4, v54, v40, 8 +; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v8, v51, v50, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v11, v53, v52, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v12, v41, v55, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v20, v43, v42, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v24, v45, v44, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v28, v47, v46, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_alignbit_b32 v1, v57, v56, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded 
Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v51 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v53 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v53 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v41 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v41 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v43 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v43 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v43 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v45 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v45 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v47 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v47 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v57 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v57 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v59 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v59 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v59 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v61 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v61 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v63 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v63 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v33 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GCN-NEXT: .LBB49_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v54 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v1, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v3, v5 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v50 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; GCN-NEXT: v_or_b32_e32 v29, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v31 +; GCN-NEXT: v_or_b32_e32 v31, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v11 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v20 +; GCN-NEXT: v_or_b32_e32 v7, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v43 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v8, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v24 +; GCN-NEXT: v_or_b32_e32 v9, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v11, v1, v3 +; 
GCN-NEXT: v_and_b32_e32 v1, 0xff, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v28 +; GCN-NEXT: v_or_b32_e32 v12, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v47 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v13, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v15, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v16, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v17, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v19, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v60 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v20, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v21, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v62 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v23, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v63 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v24, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v10, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v14, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 
4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v18, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v22, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v25, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v26, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v27, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v33 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: v_or_b32_e32 v28, v1, v2 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v30, v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v33, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v32, v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v34, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v35, v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v39 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v36, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v37, v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v38, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v39, v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v49 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v48, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v49, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v50, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v51, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v52, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v53, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v54, v2, v1 
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v55, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v40, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v41, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v42, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v43, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v44, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v45, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v46, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GCN-NEXT: v_or_b32_e32 v47, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v56, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v57, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v58, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v59, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v60, v2, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v29
+; GCN-NEXT: v_or_b32_e32 v4, v1, v30
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31
+; GCN-NEXT: v_or_b32_e32 v5, v1, v33
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v63, v2, v32
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_or_b32_e32 v61, v3, v34
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v35
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v6, v6, v36
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_or_b32_e32 v7, v7, v37
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8
+; GCN-NEXT: v_or_b32_e32 v7, v7, v38
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v9
+; GCN-NEXT: v_or_b32_e32 v8, v7, v39
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11
+; GCN-NEXT: v_or_b32_e32 v9, v7, v48
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v12
+; GCN-NEXT: v_or_b32_e32 v11, v7, v49
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v13
+; GCN-NEXT: v_or_b32_e32 v12, v7, v50
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v15
+; GCN-NEXT: v_or_b32_e32 v13, v7, v51
+; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v16
+; GCN-NEXT: v_or_b32_e32 v15, v7, v52
+; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v17
+; GCN-NEXT: v_or_b32_e32 v16, v7, v53
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v19
+; GCN-NEXT: v_or_b32_e32 v17, v7, v54
+; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v20
+; GCN-NEXT: v_or_b32_e32 v19, v7, v55
+; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v21
+; GCN-NEXT: v_or_b32_e32 v20, v7, v40
+; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v23
+; GCN-NEXT: v_or_b32_e32 v21, v7, v41
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v24
+; GCN-NEXT: v_or_b32_e32 v23, v7, v42
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x50, v0
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_or_b32_e32 v7, v7, v43
+; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51
+; GCN-NEXT: v_or_b32_e32 v44, v51, v44
+; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x58, v0
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_or_b32_e32 v10, v10, v45
+; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v14, v14, v46
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT: v_or_b32_e32 v18, v18, v47
+; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_or_b32_e32 v22, v22, v56
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_or_b32_e32 v25, v25, v57
+; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: v_or_b32_e32 v26, v26, v58
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GCN-NEXT: v_or_b32_e32 v27, v27, v59
+; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_or_b32_e32 v28, v28, v60
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v34, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v36, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v37, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v38, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v39, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v19, v48, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v49, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v23, v50, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v7, v51, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v44, v52, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v53, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v54, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v25, v41, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v42, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v27, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v64i16_to_v128i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23
+; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; kill: killed $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; kill: killed $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; kill: killed $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; kill: killed $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB49_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v31, v7
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v9, v10
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v9, v11
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v9, v12
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v9, v13
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v9, v14
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v9, v16
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v9, v8
+; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v7, v5
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v7, v6
+; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v5, v3
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, v1
+; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v36
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v37
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v3, v2
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v5, v4
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v29
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v30
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v27
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v28
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v25
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v26
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v23
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v24
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v21
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v22
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v19
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v20
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v17
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v18
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
+; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v1, v46
+; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
+; VI-NEXT: v_mov_b32_e32 v32, v15
+; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26
+; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24
+; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22
+; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19
+; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17
+; VI-NEXT: v_mov_b32_e32 v46, v1
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: ; implicit-def: $vgpr5
+; VI-NEXT: ; implicit-def: $vgpr7
+; VI-NEXT: ; implicit-def: $vgpr9
+; VI-NEXT: ; implicit-def: $vgpr11
+; VI-NEXT: ; implicit-def: $vgpr13
+; VI-NEXT: ; implicit-def: $vgpr15
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr23
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: ; implicit-def: $vgpr27
+; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: .LBB49_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB49_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v31, 3
+; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v32, 3, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v18, v32, v18
+; VI-NEXT: v_add_u16_e32 v32, 3, v17
+; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v17, v32, v17
+; VI-NEXT: v_add_u16_e32 v32, 3, v20
+; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v20, v32, v20
+; VI-NEXT: v_add_u16_e32 v32, 3, v19
+; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v19, v32, v19
+; VI-NEXT: v_add_u16_e32 v32, 3, v22
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48
+; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v22, v32, v22
+; VI-NEXT: v_add_u16_e32 v32, 3, v21
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53
+; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v21, v32, v21
+; VI-NEXT: v_add_u16_e32 v32, 3, v24
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v24, v32, v24
+; VI-NEXT: v_add_u16_e32 v32, 3, v23
+; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v23, v32, v23
+; VI-NEXT: v_add_u16_e32 v32, 3, v26
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v26, v32, v26
+; VI-NEXT: v_add_u16_e32 v32, 3, v25
+; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v25, v32, v25
+; VI-NEXT: v_add_u16_e32 v32, 3, v28
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v28, v32, v28
+; VI-NEXT: v_add_u16_e32 v32, 3, v27
+; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v27, v32, v27
+; VI-NEXT: v_add_u16_e32 v33, 3, v30
+; VI-NEXT: v_add_u16_e32 v34, 3, v29
+; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35
+; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v30, v33, v29
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32
+; VI-NEXT: v_add_u16_e32 v33, 3, v37
+; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v29, v34, v29
+; VI-NEXT: v_add_u16_e32 v34, 3, v36
+; VI-NEXT: v_or_b32_e32 v37, v33, v32
+; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50
+; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v36, v34, v32
+; VI-NEXT: v_add_u16_e32 v33, 3, v2
+; VI-NEXT: v_add_u16_e32 v34, 3, v1
+; VI-NEXT: v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57
+; VI-NEXT: v_or_b32_e32 v2, v33, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32
+; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v1, v34, v1
+; VI-NEXT: v_add_u16_e32 v33, 3, v4
+; VI-NEXT: v_add_u16_e32 v34, 3, v3
+; VI-NEXT: v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v56
+; VI-NEXT: v_or_b32_e32 v4, v33, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32
+; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v3, v34, v3
+; VI-NEXT: v_add_u16_e32 v33, 3, v6
+; VI-NEXT: v_add_u16_e32 v34, 3, v5
+; VI-NEXT: v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47
+; VI-NEXT: v_or_b32_e32 v6, v33, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32
+; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v5, v34, v5
+; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v38, 3, v8
+; VI-NEXT: v_add_u16_e32 v33, 3, v7
+; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34
+; VI-NEXT: v_or_b32_e32 v8, v38, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
+; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v7, v33, v7
+; VI-NEXT: v_add_u16_e32 v33, 3, v10
+; VI-NEXT: v_add_u16_e32 v38, 3, v9
+; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59
+; VI-NEXT: v_or_b32_e32 v10, v33, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32
+; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v9, v38, v9
+; VI-NEXT: v_add_u16_e32 v33, 3, v12
+; VI-NEXT: v_add_u16_e32 v38, 3, v11
+; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63
+; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v12, v33, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
+; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v11, v38, v11
+; VI-NEXT: v_add_u16_e32 v38, 3, v14
+; VI-NEXT: v_add_u16_e32 v49, 3, v13
+; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33
+; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v38, v13
+; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32
+; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: v_add_u16_e32 v32, 3, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v16, v16, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v15, v32, v15
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_or_b32_e32 v13, v49, v13
+; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13
+; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11
+; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9
+; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10]
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
+; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
+; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
+; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v60, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v33, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v63, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v59, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v34, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v47, 8, 8
+; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v56, 8, 8
+; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v57, 8, 8
+; VI-NEXT: v_mov_b32_e32 v46, v35
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v52, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v46, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v39, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: v_mov_b32_e32 v49, v53
+; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v52, v51
+; VI-NEXT: v_bfe_u32 v31, v51, 8, 8
+; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
+; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
+; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17
+; VI-NEXT: v_bfe_u32 v35, v58, 8, 8
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v39, v61, 8, 8
+; VI-NEXT: v_bfe_u32 v58, v48, 8, 8
+; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_bfe_u32 v61, v53, 8, 8
+; VI-NEXT: .LBB49_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31
+; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ;
4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; VI-NEXT: buffer_store_dword v1, v2, 
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64i16_to_v128i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; kill: killed $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; kill: killed $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB49_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(45)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: s_waitcnt vmcnt(46)
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
+; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32]
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28]
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26]
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24]
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22]
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2]
+; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
+; GFX9-NEXT: .LBB49_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB49_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16]
+; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14]
+; GFX9-NEXT: s_waitcnt vmcnt(19)
+; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30]
+; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28]
+; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26]
+; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24]
+; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
+; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
+; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
+; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
+; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
+; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
+; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
+; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
+; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2]
+; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18]
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX9-NEXT: .LBB49_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34
+; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v37
+; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49
+; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42
+; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45
+; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33
+; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8,
v62 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, 
v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, 
v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i16_to_v128i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x13 +; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:88 +; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:84 +; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:80 +; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:76 +; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:72 +; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:68 +; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:64 +; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:60 +; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:56 +; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:52 +; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:48 +; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:12 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: ; implicit-def: $vgpr74 +; GFX11-NEXT: ; implicit-def: $vgpr72 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr63 +; GFX11-NEXT: ; implicit-def: $vgpr62 +; GFX11-NEXT: ; implicit-def: $vgpr60 +; GFX11-NEXT: ; implicit-def: $vgpr57 +; GFX11-NEXT: ; implicit-def: $vgpr47 +; GFX11-NEXT: ; 
implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr45 +; GFX11-NEXT: ; implicit-def: $vgpr43 +; GFX11-NEXT: ; implicit-def: $vgpr42 +; GFX11-NEXT: ; implicit-def: $vgpr183 +; GFX11-NEXT: ; implicit-def: $vgpr181 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr179 +; GFX11-NEXT: ; implicit-def: $vgpr177 +; GFX11-NEXT: ; implicit-def: $vgpr167 +; GFX11-NEXT: ; implicit-def: $vgpr165 +; GFX11-NEXT: ; implicit-def: $vgpr164 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr161 +; GFX11-NEXT: ; implicit-def: $vgpr151 +; GFX11-NEXT: ; implicit-def: $vgpr150 +; GFX11-NEXT: ; implicit-def: $vgpr147 +; GFX11-NEXT: ; implicit-def: $vgpr145 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr144 +; GFX11-NEXT: ; implicit-def: $vgpr133 +; GFX11-NEXT: ; implicit-def: $vgpr131 +; GFX11-NEXT: ; implicit-def: $vgpr129 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr75 +; GFX11-NEXT: ; implicit-def: $vgpr73 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr61 +; GFX11-NEXT: ; implicit-def: $vgpr59 +; GFX11-NEXT: ; implicit-def: $vgpr58 +; GFX11-NEXT: ; implicit-def: $vgpr56 +; GFX11-NEXT: ; implicit-def: $vgpr46 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr44 +; GFX11-NEXT: ; implicit-def: $vgpr41 +; GFX11-NEXT: ; implicit-def: $vgpr40 +; GFX11-NEXT: ; implicit-def: $vgpr182 +; GFX11-NEXT: ; implicit-def: $vgpr180 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr178 +; GFX11-NEXT: ; implicit-def: $vgpr176 +; GFX11-NEXT: ; implicit-def: $vgpr166 +; GFX11-NEXT: ; implicit-def: $vgpr163 +; GFX11-NEXT: ; implicit-def: $vgpr162 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr160 +; GFX11-NEXT: ; implicit-def: $vgpr149 +; GFX11-NEXT: ; implicit-def: $vgpr148 +; GFX11-NEXT: ; implicit-def: $vgpr146 +; GFX11-NEXT: ; implicit-def: $vgpr135 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr134 +; GFX11-NEXT: ; implicit-def: $vgpr132 +; GFX11-NEXT: ; implicit-def: $vgpr130 +; GFX11-NEXT: ; implicit-def: $vgpr128 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: 
s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB49_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] +; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] +; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v131, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v167, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v42, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 24, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v26 +; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v26 
+; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v17
+; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4]
+; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18]
+; GFX11-NEXT: .LBB49_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB49_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16]
+; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14]
+; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
+; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12]
+; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
+; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10]
+; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
+; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8]
+; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2]
+; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
+; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6]
+; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24]
+; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20]
+; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4]
+; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22]
+; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18]
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v131, 24, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v167, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v42, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 24, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v31
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v30
+; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v29
+; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v28
+; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v26
+; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v25
+; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v24
+; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v23
+; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v22
+; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v20
+; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v19
+; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v18
+; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v17
+; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v17
+; GFX11-NEXT: .LBB49_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v74
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_lshlrev_b16 v39, 8, v64
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_lshlrev_b16 v64, 8, v60
+; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v55
+; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v72
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v47
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v39, v55, v39
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v63
+; GFX11-NEXT: v_or_b32_e32 v54, v67, v54
+; GFX11-NEXT: v_lshlrev_b16 v67, 8, v42
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v55
+; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v62
+; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v39
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_or_b32_e32 v55, v55, v64
+; GFX11-NEXT: v_lshlrev_b16 v64, 8, v57
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v64
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v45
+; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v43
+; GFX11-NEXT: v_lshlrev_b16 v51, 8, v51
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v39
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v55
+; GFX11-NEXT: v_or_b32_e32 v54, v64, v67
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v183
+; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v181
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v39
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v55
+; GFX11-NEXT: v_or_b32_e32 v53, v64, v53
+; GFX11-NEXT: v_lshlrev_b16 v54, 8, v179
+; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v177
+; GFX11-NEXT: v_lshlrev_b16 v64, 8, v167
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v39
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v54
+; GFX11-NEXT: v_or_b32_e32 v53, v55, v64
+; GFX11-NEXT: v_lshlrev_b16 v54, 8, v165
+; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v164
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v39
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v54
+; GFX11-NEXT: v_or_b32_e32 v52, v55, v52
+; GFX11-NEXT: v_lshlrev_b16 v53, 8, v161
+; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v151
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v150
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v39
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v52
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v53
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_or_b32_e32 v52, v54, v55
+; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v145
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_lshlrev_b16 v53, 8, v147
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v144
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-NEXT: v_or_b32_e32 v51, v54, v51
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v53
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v55
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v39
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v52
+; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v51
+; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v133
+; GFX11-NEXT: v_lshlrev_b16 v52, 8, v131
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_lshlrev_b16 v53, 8, v129
+; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v119
+; GFX11-NEXT: v_lshlrev_b16 v50, 8, v50
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v117
+; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v116
+; GFX11-NEXT: v_lshlrev_b16 v67, 8, v115
+; GFX11-NEXT: v_or_b32_e32 v51, v51, v52
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v53
+; GFX11-NEXT: v_or_b32_e32 v50, v54, v50
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v55
+; GFX11-NEXT: v_or_b32_e32 v52, v64, v67
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v51
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: v_or_b32_e32 v1, v9, v39
+; GFX11-NEXT: v_or_b32_e32 v2, v10, v51
+; GFX11-NEXT: v_or_b32_e32 v3, v11, v50
+; GFX11-NEXT: v_or_b32_e32 v4, v12, v52
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13
+; GFX11-NEXT: v_lshlrev_b16 v6, 8, v102
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v100
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v49
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14
+; GFX11-NEXT: v_lshlrev_b16 v10, 8, v98
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v97
+; GFX11-NEXT: v_lshlrev_b16 v12, 8, v96
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15
+; GFX11-NEXT: v_lshlrev_b16 v14, 8, v86
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v7, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v8, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v14
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v82
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v48
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16
+; GFX11-NEXT: v_lshlrev_b16 v13, 8, v80
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v70
+; GFX11-NEXT: v_lshlrev_b16 v15, 8, v68
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17
+; GFX11-NEXT: v_lshlrev_b16 v17, 8, v75
+; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v73
+; GFX11-NEXT: v_lshlrev_b16 v48, 8, v66
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: v_or_b32_e32 v11, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v12, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v13, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v14, v39, v48
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v8
+; GFX11-NEXT: v_or_b32_e32 v7, v9, v10
+; GFX11-NEXT: v_or_b32_e32 v8, v11, v12
+; GFX11-NEXT: v_or_b32_e32 v9, v13, v14
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18
+; GFX11-NEXT: v_lshlrev_b16 v11, 8, v61
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v59
+; GFX11-NEXT: v_lshlrev_b16 v13, 8, v58
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19
+; GFX11-NEXT: v_lshlrev_b16 v15, 8, v56
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v46
+; GFX11-NEXT: v_lshlrev_b16 v17, 8, v65
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20
+; GFX11-NEXT: v_lshlrev_b16 v19, 8, v44
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: v_or_b32_e32 v11, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v12, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v13, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v14, v18, v19
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v41
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v40
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21
+; GFX11-NEXT: v_lshlrev_b16 v18, 8, v182
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v180
+; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22
+; GFX11-NEXT: v_lshlrev_b16 v22, 8, v178
+; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v176
+; GFX11-NEXT: v_lshlrev_b16 v39, 8, v166
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT: v_or_b32_e32 v16, v17, v18
+; GFX11-NEXT: v_or_b32_e32 v17, v19, v20
+; GFX11-NEXT: v_or_b32_e32 v18, v21, v22
+; GFX11-NEXT: v_or_b32_e32 v19, v38, v39
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v11
+; GFX11-NEXT: v_or_b32_e32 v11, v12, v13
+; GFX11-NEXT: v_or_b32_e32 v12, v14, v15
+; GFX11-NEXT: v_or_b32_e32 v13, v16, v17
+; GFX11-NEXT: v_or_b32_e32 v14, v18, v19
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23
+; GFX11-NEXT: v_lshlrev_b16 v16, 8, v163
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v162
+; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24
+; GFX11-NEXT: v_lshlrev_b16 v20, 8, v160
+; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149
+; GFX11-NEXT: v_lshlrev_b16 v22, 8, v148
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25
+; GFX11-NEXT: v_lshlrev_b16 v24, 8, v146
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT: v_or_b32_e32 v16, v17, v18
+; GFX11-NEXT: v_or_b32_e32 v17, v19, v20
+; GFX11-NEXT: v_or_b32_e32 v18, v21, v22
+; GFX11-NEXT: v_or_b32_e32 v19, v23, v24
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v135
+; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26
+; GFX11-NEXT: v_lshlrev_b16 v23, 8, v134
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v132
+; GFX11-NEXT: v_lshlrev_b16 v25, 8, v130
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27
+; GFX11-NEXT: v_lshlrev_b16 v27, 8, v128
+; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v118
+; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX11-NEXT: v_or_b32_e32 v21, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v22, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v23, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v24, v36, v35
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v16
+; GFX11-NEXT: v_or_b32_e32 v16, v17, v18
+; GFX11-NEXT: v_or_b32_e32 v17, v19, v20
+; GFX11-NEXT: v_or_b32_e32 v18, v21, v22
+; GFX11-NEXT: v_or_b32_e32 v19, v23, v24
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28
+; GFX11-NEXT: v_lshlrev_b16 v21, 8, v114
+; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v113
+; GFX11-NEXT: v_lshlrev_b16 v23, 8, v112
+; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29
+; GFX11-NEXT: v_lshlrev_b16 v25, 8, v103
+; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v101
+; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34
+; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30
+; GFX11-NEXT: v_lshlrev_b16 v29, 8, v99
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX11-NEXT: v_or_b32_e32 v21, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v22, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v23, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v24, v28, v29
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v87
+; GFX11-NEXT: v_lshlrev_b16 v26, 8, v85
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31
+; GFX11-NEXT: v_lshlrev_b16 v28, 8, v84
+; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v83
+; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33
+; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32
+; GFX11-NEXT: v_lshlrev_b16 v32, 8, v81
+; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v71
+; GFX11-NEXT: v_lshlrev_b16 v34, 8, v69
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v26
+; GFX11-NEXT: v_or_b32_e32 v26, v27, v28
+; GFX11-NEXT: v_or_b32_e32 v27, v29, v30
+; GFX11-NEXT: v_or_b32_e32 v28, v31, v32
+; GFX11-NEXT: v_or_b32_e32 v29, v33, v34
+; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX11-NEXT: v_or_b32_e32 v21, v22, v23
+; GFX11-NEXT: v_or_b32_e32 v22, v24, v25
+; GFX11-NEXT: v_or_b32_e32 v23, v26, v27
+; GFX11-NEXT: v_or_b32_e32 v24, v28, v29
+; GFX11-NEXT: s_clause 0x5
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
+; GFX11-NEXT: s_clause 0x13
+; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:12
+; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:16
+; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:20
+; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:24
+; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:28
+; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:32
+; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:36
+; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:40
+; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:44
+; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:48
+; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:52
+; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:56
+; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:60
+; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64
+; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:68
+; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:72
+; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76
+; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:80
+; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84
+; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:88
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <64 x i16> %a, splat (i16 3)
+  %a2 = bitcast <64 x i16> %a1 to <128 x i8>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <64 x i16> %a to <128 x i8>
+  br label %end
+
+end:
+  %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <128 x i8> %phi
+}
+
+define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64bf16_to_v64f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v5
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v6
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v9
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v10
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v13
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v14
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v17
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v18
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v21
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v22
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v25
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v27
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v28
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v29
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v30
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v36
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v40
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v42
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v44
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v46
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v47
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v56
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v57
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v58
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v61
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v62
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v1
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; kill: killed $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB50_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GCN-NEXT: v_mov_b32_e32 v5, v4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32
offset:416 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
buffer_load_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v6 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v9 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v36 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v37 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v38 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v39 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v49 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v50 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v51 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v53 +; GCN-NEXT: buffer_store_dword v3, 
off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v54 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v55 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v40 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v41 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v43 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v44 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v46 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v47 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v56 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v57 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v58 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v59 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v63 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; 
GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: .LBB50_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB50_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v63 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v62 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v61 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v60 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v59 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v58 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v57 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v56 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, 
v47 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v46 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v45 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v44 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v43 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v42 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v54 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v42, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v44, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v45, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v46, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v47, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v56, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v57, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v58, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v60, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v61, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v62, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v63, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 +; GCN-NEXT: 
v_add_f32_e32 v14, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v20 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30 +; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v31 +; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v36 +; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v35 +; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v34 +; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v48 +; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v49 +; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v50 +; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v51 +; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v52 +; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v53 +; GCN-NEXT: v_add_f32_e32 v51, 0x40c00000, v54 +; GCN-NEXT: v_add_f32_e32 v52, 0x40c00000, v55 +; GCN-NEXT: v_add_f32_e32 v53, 0x40c00000, v40 +; GCN-NEXT: v_add_f32_e32 v54, 0x40c00000, v41 +; GCN-NEXT: v_add_f32_e32 v55, 0x40c00000, v42 +; GCN-NEXT: v_add_f32_e32 v40, 0x40c00000, v43 +; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v44 +; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v45 +; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v46 +; GCN-NEXT: v_add_f32_e32 v44, 0x40c00000, v47 +; GCN-NEXT: v_add_f32_e32 v45, 0x40c00000, v56 +; GCN-NEXT: v_add_f32_e32 v46, 0x40c00000, v57 +; GCN-NEXT: v_add_f32_e32 v47, 0x40c00000, v58 +; GCN-NEXT: v_add_f32_e32 v56, 0x40c00000, v59 +; GCN-NEXT: v_add_f32_e32 v57, 0x40c00000, v60 +; GCN-NEXT: v_add_f32_e32 v58, 0x40c00000, v61 +; GCN-NEXT: v_add_f32_e32 v59, 0x40c00000, v62 +; GCN-NEXT: v_add_f32_e32 v60, 0x40c00000, v63 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v63, 0x40c00000, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v57 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 +; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v31 +; GCN-NEXT: v_mov_b32_e32 v31, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v29 +; GCN-NEXT: v_mov_b32_e32 v29, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v28, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v27, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v26, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v25, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v24, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v23, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded 
Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v21, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v20, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v19, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v15, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v14, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v13, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v11, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v7, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v6, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GCN-NEXT: .LBB50_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v45, v2, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v44, v2, v1 +; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v47, v2, v1 +; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v46, v2, v1 +; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v6, v6, v2 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v8, v8, v2 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 32, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v10, v10, v2 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v12, v12, v2 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v25 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v14, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v14, v14, v2 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v16, v16, v2 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 48, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v18, v18, v2 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v20, v20, v2 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v2, v22, v2 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_or_b32_e32 v25, v26, v25 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_or_b32_e32 v27, v28, v27 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_or_b32_e32 v29, v30, v29 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v31, v32, v31 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 
16, v33 +; GCN-NEXT: v_or_b32_e32 v33, v34, v33 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_or_b32_e32 v35, v36, v35 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_or_b32_e32 v37, v38, v37 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_or_b32_e32 v39, v48, v39 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_or_b32_e32 v49, v50, v49 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_or_b32_e32 v51, v52, v51 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_or_b32_e32 v53, v54, v53 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_or_b32_e32 v55, v40, v55 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 +; 
GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; GCN-NEXT: v_or_b32_e32 v41, v42, v41 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v43, v3 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v35, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v37, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v49, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v51, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v53, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte 
Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v64f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v17 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: 
v_cndmask_b32_e32 v32, v34, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, s6, v35 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_bfe_u32 v35, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v18 +; VI-NEXT: v_add_u32_e32 v35, vcc, s6, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, s6, v36 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 +; VI-NEXT: v_add_u32_e32 v36, vcc, s6, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 +; VI-NEXT: v_add_u32_e32 v37, vcc, s6, v37 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_add_u32_e32 v37, vcc, s6, v37 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 +; VI-NEXT: v_add_u32_e32 v38, vcc, s6, v38 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_add_u32_e32 v38, vcc, s6, v38 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, s6, v39 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_add_u32_e32 v39, vcc, s6, v39 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: 
v_cndmask_b32_e32 v22, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 +; VI-NEXT: v_add_u32_e32 v48, vcc, s6, v48 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: v_add_u32_e32 v48, vcc, s6, v48 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 +; VI-NEXT: v_add_u32_e32 v49, vcc, s6, v49 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_add_u32_e32 v49, vcc, s6, v49 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 +; VI-NEXT: v_add_u32_e32 v50, vcc, s6, v50 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: v_add_u32_e32 v50, vcc, s6, v50 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, s6, v51 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_add_u32_e32 v51, vcc, s6, v51 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, s6, v52 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_add_u32_e32 v52, vcc, s6, v52 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: 
v_cndmask_b32_e32 v27, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 +; VI-NEXT: v_add_u32_e32 v53, vcc, s6, v53 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_add_u32_e32 v53, vcc, s6, v53 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_u32_e32 v54, vcc, s6, v54 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_add_u32_e32 v54, vcc, s6, v54 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, s6, v55 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_add_u32_e32 v55, vcc, s6, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 +; VI-NEXT: v_add_u32_e32 v40, vcc, s6, v40 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_add_u32_e32 v40, vcc, s6, v40 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; VI-NEXT: v_bfe_u32 v41, v40, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v40 +; VI-NEXT: v_add_u32_e32 v41, vcc, s6, v41 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; VI-NEXT: v_bfe_u32 v41, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v0 +; VI-NEXT: v_add_u32_e32 v41, vcc, s6, v41 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; 
VI-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; VI-NEXT: v_bfe_u32 v42, v41, 16, 1 +; VI-NEXT: v_add_u32_e32 v42, vcc, v42, v41 +; VI-NEXT: v_add_u32_e32 v42, vcc, s6, v42 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; VI-NEXT: v_bfe_u32 v42, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v42, vcc, v42, v1 +; VI-NEXT: v_add_u32_e32 v42, vcc, s6, v42 +; VI-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; VI-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; VI-NEXT: v_bfe_u32 v43, v42, 16, 1 +; VI-NEXT: v_add_u32_e32 v43, vcc, v43, v42 +; VI-NEXT: v_add_u32_e32 v43, vcc, s6, v43 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; VI-NEXT: v_bfe_u32 v43, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v43, vcc, v43, v2 +; VI-NEXT: v_add_u32_e32 v43, vcc, s6, v43 +; VI-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; VI-NEXT: v_bfe_u32 v44, v43, 16, 1 +; VI-NEXT: v_add_u32_e32 v44, vcc, v44, v43 +; VI-NEXT: v_add_u32_e32 v44, vcc, s6, v44 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; VI-NEXT: v_bfe_u32 v44, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v44, vcc, v44, v3 +; VI-NEXT: v_add_u32_e32 v44, vcc, s6, v44 +; VI-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; VI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; VI-NEXT: v_bfe_u32 v45, v44, 16, 1 +; VI-NEXT: v_add_u32_e32 v45, vcc, v45, v44 +; VI-NEXT: v_add_u32_e32 v45, vcc, s6, v45 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; VI-NEXT: v_bfe_u32 v45, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v45, vcc, v45, v4 +; VI-NEXT: v_add_u32_e32 v45, vcc, s6, v45 +; VI-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; VI-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; VI-NEXT: v_bfe_u32 v46, v45, 16, 1 +; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v45 +; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; VI-NEXT: v_bfe_u32 v46, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v5 +; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46 +; VI-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; VI-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; VI-NEXT: v_bfe_u32 v47, v46, 16, 1 +; VI-NEXT: v_add_u32_e32 v47, vcc, v47, v46 +; VI-NEXT: v_add_u32_e32 v47, vcc, s6, v47 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc +; VI-NEXT: v_bfe_u32 v47, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v47, vcc, v47, v6 +; VI-NEXT: v_add_u32_e32 v47, vcc, s6, v47 +; VI-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; VI-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; VI-NEXT: v_bfe_u32 v56, v47, 16, 1 +; VI-NEXT: v_add_u32_e32 v56, vcc, v56, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, s6, v56 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; VI-NEXT: v_bfe_u32 v56, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v56, vcc, v56, v7 +; VI-NEXT: v_add_u32_e32 v56, vcc, s6, v56 +; VI-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v8 +; VI-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; VI-NEXT: v_bfe_u32 v57, v56, 16, 1 +; VI-NEXT: v_add_u32_e32 v57, vcc, v57, v56 +; VI-NEXT: v_add_u32_e32 v57, vcc, s6, v57 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; VI-NEXT: v_bfe_u32 v57, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v57, vcc, v57, v8 +; VI-NEXT: v_add_u32_e32 v57, vcc, s6, v57 +; VI-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; VI-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; VI-NEXT: v_bfe_u32 v58, v57, 16, 1 +; VI-NEXT: v_add_u32_e32 v58, vcc, v58, v57 +; VI-NEXT: v_add_u32_e32 v58, vcc, s6, v58 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; VI-NEXT: v_bfe_u32 v58, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v58, vcc, v58, v9 +; VI-NEXT: v_add_u32_e32 v58, vcc, s6, v58 +; VI-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v10 +; VI-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; VI-NEXT: v_bfe_u32 v59, v58, 16, 1 +; VI-NEXT: v_add_u32_e32 v59, vcc, v59, v58 +; VI-NEXT: v_add_u32_e32 v59, vcc, s6, v59 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc +; VI-NEXT: v_bfe_u32 v59, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v59, vcc, v59, v10 +; VI-NEXT: v_add_u32_e32 v59, vcc, s6, v59 +; VI-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 
v10, v59, v60, vcc +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; VI-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; VI-NEXT: v_bfe_u32 v60, v59, 16, 1 +; VI-NEXT: v_add_u32_e32 v60, vcc, v60, v59 +; VI-NEXT: v_add_u32_e32 v60, vcc, s6, v60 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; VI-NEXT: v_bfe_u32 v60, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v60, vcc, v60, v11 +; VI-NEXT: v_add_u32_e32 v60, vcc, s6, v60 +; VI-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; VI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; VI-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; VI-NEXT: v_bfe_u32 v61, v60, 16, 1 +; VI-NEXT: v_add_u32_e32 v61, vcc, v61, v60 +; VI-NEXT: v_add_u32_e32 v61, vcc, s6, v61 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc +; VI-NEXT: v_bfe_u32 v61, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v61, vcc, v61, v12 +; VI-NEXT: v_add_u32_e32 v61, vcc, s6, v61 +; VI-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; VI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; VI-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; VI-NEXT: v_bfe_u32 v62, v61, 16, 1 +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v61 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; VI-NEXT: v_bfe_u32 v62, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v13 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; VI-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; VI-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; VI-NEXT: v_bfe_u32 v63, v62, 16, 1 +; VI-NEXT: v_add_u32_e32 v63, vcc, v63, v62 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v63, vcc, s6, v63 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; VI-NEXT: v_bfe_u32 v62, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v1, v63, v0, vcc +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v14 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; VI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; VI-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; VI-NEXT: v_bfe_u32 v63, v62, 16, 1 +; VI-NEXT: v_add_u32_e32 v63, vcc, v63, v62 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v63, vcc, s6, v63 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; VI-NEXT: v_bfe_u32 v62, v15, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v15 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc 
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v0, v61, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v0, v60, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v0, v59, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v0, v58, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v0, v57, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v0, v56, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v0, v47, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v0, v46, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v0, v45, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v0, v44, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v0, v43, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v0, v42, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v16, v55, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v16, v54, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v16, v53, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v16, v52, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v16, v51, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v16, v50, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v16, v49, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v16, v48, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v16, v39, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v16, v38, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v16, v37, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v16, v36, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v16, v35, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v16, v34, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; VI-NEXT: v_alignbit_b32 v1, v0, v41, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v16, v33, 16 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_alignbit_b32 v0, v0, v40, 16 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v64f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v34, v34, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v35, v35, v34, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v35, v35, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v36, v36, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v37, v37, v36, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v37, v37, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v38, v38, v37, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v38, v38, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v39, v39, v38, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v39, v39, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v48, v48, v39, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v48, v48, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v49, v49, v48, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v49, v49, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v50, v50, v49, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v50, v50, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v51, v51, v50, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v51, v51, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v52, v52, v51, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v52, v52, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, 
v52, v53, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v53, v53, v52, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v53, v53, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v54, v54, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v54, v54, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v55, v55, v54, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v55, v55, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v40, v40, v55, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v40, v40, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v41, v41, v40, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v42, v42, v41, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; 
GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v42, v42, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v43, v43, v42, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v43, v43, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v44, v44, v43, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; GFX9-NEXT: v_bfe_u32 v44, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v44, v44, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v45, v45, v44, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v45, v45, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v46, v46, v45, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v46, v46, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v47, v47, v46, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc +; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v47, v47, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; 
GFX9-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v56, v56, v47, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v56, v56, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v57, v57, v56, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v57, v57, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v58, v58, v57, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v58, v58, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v59, v59, v58, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc +; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v59, v59, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v60, v60, v59, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v60, v60, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v61, v61, v60, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: 
v_cndmask_b32_e32 v60, v61, v62, vcc +; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v61, v61, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v62, v62, v61, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v62, v62, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v15, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v0, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v1, s6 +; GFX9-NEXT: v_perm_b32 v1, v16, v41, s6 +; GFX9-NEXT: v_perm_b32 v0, v17, v40, s6 +; GFX9-NEXT: v_perm_b32 v17, v32, v33, s6 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v13, v13, v61, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v60, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v59, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v58, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v57, s6 +; GFX9-NEXT: v_perm_b32 v8, v8, v56, s6 +; GFX9-NEXT: v_perm_b32 v7, v7, v47, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v46, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v45, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v44, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v43, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v42, s6 +; GFX9-NEXT: v_perm_b32 v31, v31, v55, s6 +; GFX9-NEXT: v_perm_b32 v30, v30, v54, s6 +; GFX9-NEXT: v_perm_b32 v29, v29, v53, s6 +; GFX9-NEXT: v_perm_b32 v28, v28, v52, s6 +; GFX9-NEXT: v_perm_b32 v27, v27, v51, s6 +; GFX9-NEXT: v_perm_b32 v26, v26, v50, s6 +; GFX9-NEXT: v_perm_b32 v25, v25, v49, s6 +; GFX9-NEXT: v_perm_b32 v24, v24, v48, s6 +; GFX9-NEXT: v_perm_b32 v23, v23, v39, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v38, s6 +; GFX9-NEXT: v_perm_b32 
v21, v21, v37, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v36, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v35, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 +; GFX9-NEXT: .LBB50_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v64f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-NEXT: v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v10 +; GFX11-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v87, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 +; GFX11-NEXT: v_bfe_u32 v16, v32, 16, 
1 +; GFX11-NEXT: v_add_f32_e32 v87, 0x40c00000, v87 +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v16, v16, v32, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 +; GFX11-NEXT: v_bfe_u32 v17, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v11 +; GFX11-NEXT: v_add3_u32 v17, v17, v34, 0x7fff +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v17, v39, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add3_u32 v33, v34, v36, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v37, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v18 +; GFX11-NEXT: v_add3_u32 v34, v37, v35, 0x7fff +; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_perm_b32 v17, v33, v17, 0x7060302 +; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v34, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v34, v37, v38, 0x7fff +; GFX11-NEXT: v_bfe_u32 v36, v39, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v19 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v36, v39, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v39 +; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v20 +; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v38, v37, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v37 +; GFX11-NEXT: v_bfe_u32 v38, v48, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v48 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v36, v38, v48, 0x7fff +; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-NEXT: v_dual_add_f32 v49, 0x40c00000, v20 :: v_dual_add_f32 v48, 0x40c00000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v36, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v36, v38, v39, 0x7fff +; GFX11-NEXT: 
v_or_b32_e32 v37, 0x400000, v39 +; GFX11-NEXT: v_bfe_u32 v38, v49, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-NEXT: v_bfe_u32 v39, v48, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 +; GFX11-NEXT: v_perm_b32 v19, v35, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v37, v38, v49, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v49 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-NEXT: v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo +; GFX11-NEXT: v_add3_u32 v37, v39, v48, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v48 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-NEXT: v_bfe_u32 v48, v49, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v51, 0x40c00000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v39, v50, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-NEXT: v_perm_b32 v21, v37, v21, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v38, v39, v50, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v50 +; GFX11-NEXT: v_add_f32_e32 v50, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v38, v39, vcc_lo +; GFX11-NEXT: v_add3_u32 v38, v48, v49, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-NEXT: v_bfe_u32 v48, v51, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; GFX11-NEXT: v_bfe_u32 v49, v50, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo +; GFX11-NEXT: v_add3_u32 v39, v48, v51, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v51 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-NEXT: v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v39, v48, vcc_lo +; GFX11-NEXT: v_add3_u32 v39, v49, v50, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-NEXT: v_bfe_u32 v50, v51, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v53, 0x40c00000, v24 +; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo +; GFX11-NEXT: v_bfe_u32 v49, v52, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v23, v39, v23, 0x7060302 +; GFX11-NEXT: v_add3_u32 v48, v49, v52, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v52 +; GFX11-NEXT: v_add_f32_e32 v52, 0x40c00000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v24, v48, v49, vcc_lo +; GFX11-NEXT: v_add3_u32 v48, v50, v51, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v51 +; GFX11-NEXT: v_bfe_u32 v50, v53, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v26 +; GFX11-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo +; GFX11-NEXT: v_add3_u32 v49, v50, v53, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v53 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-NEXT: v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v49, v50, vcc_lo +; GFX11-NEXT: v_add3_u32 v49, v51, v52, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v52 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: v_bfe_u32 v52, v53, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 +; GFX11-NEXT: v_perm_b32 v24, v48, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo +; GFX11-NEXT: v_bfe_u32 v51, v54, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v25, v49, v25, 0x7060302 +; GFX11-NEXT: v_add3_u32 v50, v51, v54, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v54 +; GFX11-NEXT: v_add_f32_e32 v54, 0x40c00000, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo +; GFX11-NEXT: v_add3_u32 v50, v52, v53, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v53 +; GFX11-NEXT: v_bfe_u32 v52, v55, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-NEXT: v_bfe_u32 v53, v54, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo +; GFX11-NEXT: v_add3_u32 v51, v52, v55, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v55 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-NEXT: v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v51, v52, vcc_lo +; GFX11-NEXT: v_add3_u32 v51, v53, v54, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v54 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-NEXT: v_bfe_u32 v54, v55, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v65, 0x40c00000, v28 +; GFX11-NEXT: v_perm_b32 v26, v50, v26, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-NEXT: v_bfe_u32 v53, v64, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v27, v51, v27, 0x7060302 +; GFX11-NEXT: v_add3_u32 v52, v53, v64, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v64 +; GFX11-NEXT: v_add_f32_e32 v64, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v28, v52, v53, vcc_lo +; GFX11-NEXT: v_add3_u32 v52, v54, v55, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v55 +; GFX11-NEXT: v_bfe_u32 v54, v65, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; GFX11-NEXT: v_bfe_u32 v55, v64, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo +; GFX11-NEXT: v_add3_u32 v53, v54, v65, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v65 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-NEXT: v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v53, v54, vcc_lo +; GFX11-NEXT: v_add3_u32 v53, v55, v64, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v54, 
0x400000, v64 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v67, 0x40c00000, v30 +; GFX11-NEXT: v_perm_b32 v28, v52, v28, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo +; GFX11-NEXT: v_bfe_u32 v55, v66, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v29, v53, v29, 0x7060302 +; GFX11-NEXT: v_add3_u32 v54, v55, v66, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v66 +; GFX11-NEXT: v_add_f32_e32 v66, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v30, v54, v55, vcc_lo +; GFX11-NEXT: v_add3_u32 v54, v64, v65, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v65 +; GFX11-NEXT: v_bfe_u32 v64, v67, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX11-NEXT: v_bfe_u32 v65, v66, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX11-NEXT: v_add3_u32 v55, v64, v67, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v67 +; GFX11-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1 +; GFX11-NEXT: v_perm_b32 v30, v54, v30, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v67, 0x40c00000, v67 +; GFX11-NEXT: v_add3_u32 v55, v65, v66, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v66 +; GFX11-NEXT: v_bfe_u32 v65, v68, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-NEXT: v_bfe_u32 v66, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo +; GFX11-NEXT: v_add3_u32 v64, v65, v68, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v65, 0x400000, v68 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-NEXT: v_bfe_u32 v68, v67, 16, 1 +; GFX11-NEXT: v_perm_b32 v31, v55, v31, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo +; GFX11-NEXT: v_add3_u32 v65, v66, v0, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_add3_u32 v65, v68, v67, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v67 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-NEXT: v_perm_b32 v0, v0, v64, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v68, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v66, v68, v1, 0x7fff +; GFX11-NEXT: v_bfe_u32 v68, v69, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v66, v67, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v66, v68, v69, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v69 +; GFX11-NEXT: v_bfe_u32 v68, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX11-NEXT: v_perm_b32 v1, 
v1, v65, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v67, v68, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_perm_b32 v2, v2, v66, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v69, v70, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v70 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-NEXT: v_add3_u32 v67, v69, v70, 0x7fff +; GFX11-NEXT: v_bfe_u32 v69, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo +; GFX11-NEXT: v_add3_u32 v68, v69, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v3, v3, v67, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v70, v71, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v71 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v68, v70, v71, 0x7fff +; GFX11-NEXT: v_bfe_u32 v70, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v69, v70, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_perm_b32 v4, v4, v68, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v71, v80, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v80 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-NEXT: v_add3_u32 v69, v71, v80, 0x7fff +; GFX11-NEXT: v_bfe_u32 v71, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-NEXT: v_add3_u32 v70, v71, v5, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_add_f32 v81, 0x40c00000, v81 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v5, v5, v69, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v80, v81, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v71, 
0x400000, v81 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v70, v80, v81, 0x7fff +; GFX11-NEXT: v_bfe_u32 v80, v6, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v71, v80, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_perm_b32 v6, v6, v70, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v81, v82, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v82 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-NEXT: v_add3_u32 v71, v81, v82, 0x7fff +; GFX11-NEXT: v_bfe_u32 v81, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo +; GFX11-NEXT: v_add3_u32 v80, v81, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v7, v7, v71, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v83 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v80, v82, v83, 0x7fff +; GFX11-NEXT: v_bfe_u32 v82, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v81, v82, v8, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_perm_b32 v8, v8, v80, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v83, v84, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v84 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-NEXT: v_add3_u32 v81, v83, v84, 0x7fff +; GFX11-NEXT: v_bfe_u32 v83, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo +; GFX11-NEXT: v_add3_u32 v82, v83, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 
0x40c00000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v9, v9, v81, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v84, v85, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v85 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v82, v84, v85, 0x7fff +; GFX11-NEXT: v_bfe_u32 v84, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v83, v84, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_perm_b32 v10, v10, v82, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v86 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v11 +; GFX11-NEXT: v_add3_u32 v83, v85, v86, 0x7fff +; GFX11-NEXT: v_bfe_u32 v85, v11, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13 +; GFX11-NEXT: v_bfe_u32 v86, v87, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_or_b32_e32 v97, 0x400000, v87 +; GFX11-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 +; GFX11-NEXT: v_add3_u32 v86, v86, v87, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_bfe_u32 v98, v12, 16, 1 +; GFX11-NEXT: v_bfe_u32 v99, v84, 16, 1 +; GFX11-NEXT: v_add3_u32 v85, v85, v11, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v86, v86, v97, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v97, 0x400000, v12 +; GFX11-NEXT: v_add3_u32 v87, v98, v12, 0x7fff +; GFX11-NEXT: v_add3_u32 v98, v99, v84, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v99, 16, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v84 +; GFX11-NEXT: v_bfe_u32 v101, v13, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v97, v101, v13, 0x7fff +; GFX11-NEXT: v_perm_b32 v12, v12, v86, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v99, v87, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v84, v98, v100, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v98, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v101, 0x400000, v87 +; GFX11-NEXT: v_bfe_u32 v102, v14, 16, 1 +; GFX11-NEXT: v_add3_u32 v99, v99, v87, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101 +; GFX11-NEXT: v_add3_u32 v101, v102, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: 
v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_bfe_u32 v103, v98, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v98
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15
+; GFX11-NEXT: v_add3_u32 v103, v103, v98, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v14, v14, v87, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v99, v15, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v113, 0x400000, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v98, v103, v112, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v99, v99, v15, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v99, v113, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v15, v15, v98, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v97, v100, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_perm_b32 v13, v13, v84, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v11, v11, v83, 0x7060302
+; GFX11-NEXT: .LBB50_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0)
+ %a2 = bitcast <64 x bfloat> %a1 to <64 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <64 x bfloat> %a to <64 x half>
+ br label %end
+
+end:
+ %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x half> %phi
+}
+
+define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64f16_to_v64bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT:
buffer_load_dword v31, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v61, off, 
s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; 
GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB51_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte 
Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 
; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v56 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v60 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v62 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 
+; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: .LBB51_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB51_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f32_f16_e32 v63, v63 +; GCN-NEXT: v_cvt_f32_f16_e32 v62, v62 +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, 
v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v63 +; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 +; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; GCN-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; GCN-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; GCN-NEXT: v_add_f32_e32 v33, 0x38000000, v33 
+; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v41, v41
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43
+; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44
+; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45
+; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46
+; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47
+; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56
+; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57
+; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58
+; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59
+; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60
+; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61
+; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v63, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v19
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v10
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v12
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v14
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v15
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v16
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v18
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v23
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v35
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v36
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v37
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v38
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v50
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v52
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v53
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v55
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v40
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v42
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v43
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v44
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v45
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v46
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v47
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v56
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v57
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v58
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v59
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v60
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v62
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB51_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v45, v1, v2, 16
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v44, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v47, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v46, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v2, v1, v2, 16
+; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v3, v1, v3, 16
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 24, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v5, v1, v5, 16
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v7, v1, v7, 16
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v9, v1, v9, 16
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 36, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v11, v1, v11, 16
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v13, v1, v13, 16
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 44, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v15, v1, v15, 16
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v17, v1, v17, 16
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v19, v1, v19, 16
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v21, v1, v21, 16
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v23, v1, v23, 16
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 64, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v25, v1, v25, 16
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x44, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v27, v1, v27, 16
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x48, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v29, v1, v29, 16
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v31, v1, v31, 16
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x50, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v33, v1, v33, 16
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 0x54, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v35, v1, v35, 16
+; GCN-NEXT: v_add_i32_e32 v36, vcc, 0x58, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v37, v1, v37, 16
+; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x5c, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v39, v1, v39, 16
+; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v49, v1, v49, 16
+; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v51, v1, v51, 16
+; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x68, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v53, v1, v53, 16
+; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x6c, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v55, v1, v55, 16
+; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x70, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v41, v1, v41, 16
+; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v19, v22, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v25, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v31, v34, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v33, v36, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v35, v38, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v37, v48, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v49, v52, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v51, v54, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v53, v40, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v55, v42, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v64f16_to_v64bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB51_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v32, 0x200
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v15
+; VI-NEXT: v_add_f16_sdwa v15, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v15, v33, v15
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v14
+; VI-NEXT: v_add_f16_sdwa v14, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v33, v14
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v13
+; VI-NEXT: v_add_f16_sdwa v13, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v13, v33, v13
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v12
+; VI-NEXT: v_add_f16_sdwa v12, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v33, v12
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v11
+; VI-NEXT: v_add_f16_sdwa v11, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v33, v11
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v10
+; VI-NEXT: v_add_f16_sdwa v10, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v10, v33, v10
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v9
+; VI-NEXT: v_add_f16_sdwa v9, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v33, v9
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v8
+; VI-NEXT: v_add_f16_sdwa v8, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v8, v33, v8
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v7
+; VI-NEXT: v_add_f16_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v7, v33, v7
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v6
+; VI-NEXT: v_add_f16_sdwa v6, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v6, v33, v6
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v5
+; VI-NEXT: v_add_f16_sdwa v5, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v33, v5
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v4
+; VI-NEXT: v_add_f16_sdwa v4, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v33, v4
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v3
+; VI-NEXT: v_add_f16_sdwa v3, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v33, v3
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v2
+; VI-NEXT: v_add_f16_sdwa v2, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v33, v2
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v33, v1
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v0
+; VI-NEXT: v_add_f16_sdwa v0, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v33, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v31
+; VI-NEXT: v_add_f16_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v31, v33, v31
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v30
+; VI-NEXT: v_add_f16_sdwa v30, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v30, v33, v30
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v29
+; VI-NEXT: v_add_f16_sdwa v29, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v29, v33, v29
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v28
+; VI-NEXT: v_add_f16_sdwa v28, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v28, v33, v28
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v27
+; VI-NEXT: v_add_f16_sdwa v27, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v27, v33, v27
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v26
+; VI-NEXT: v_add_f16_sdwa v26, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v26, v33, v26
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v25
+; VI-NEXT: v_add_f16_sdwa v25, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v25, v33, v25
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v24
+; VI-NEXT: v_add_f16_sdwa v24, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v24, v33, v24
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v23
+; VI-NEXT: v_add_f16_sdwa v23, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v23, v33, v23
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v22
+; VI-NEXT: v_add_f16_sdwa v22, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v22, v33, v22
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v21
+; VI-NEXT: v_add_f16_sdwa v21, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v21, v33, v21
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v20
+; VI-NEXT: v_add_f16_sdwa v20, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v20, v33, v20
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v19
+; VI-NEXT: v_add_f16_sdwa v19, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v19, v33, v19
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v18
+; VI-NEXT: v_add_f16_sdwa v18, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v18, v33, v18
+; VI-NEXT: v_add_f16_e32 v33, 0x200, v17
+; VI-NEXT: v_add_f16_sdwa v17, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v16, 0x200, v16
+; VI-NEXT: v_or_b32_e32 v17, v33, v17
+; VI-NEXT: v_or_b32_e32 v16, v16, v32
+; VI-NEXT: .LBB51_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64f16_to_v64bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB51_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB51_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v64f16_to_v64bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB51_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB51_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = fadd <64 x half> %a, splat (half 0xH0200)
+  %a2 = bitcast <64 x half> %a1 to <64 x bfloat>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <64 x half> %a to <64 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <64 x bfloat> %phi
+}
+
+define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64bf16_to_v64i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:104
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44
+; GCN-NEXT: s_waitcnt expcnt(6)
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40
+; GCN-NEXT: s_waitcnt expcnt(5)
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36
+; GCN-NEXT: s_waitcnt expcnt(4)
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v4
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v6
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v7
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v10
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v11
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v12
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v15
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v18
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v19
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v20
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v22
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v23
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v25
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v27
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v28
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(14)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v40
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:16
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v55
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v38
+; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v59
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v58
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v57
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v56
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v46
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v45
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v44
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v43
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v42
+; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v54
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v37
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v53
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v52
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v50
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v49
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v48
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v39
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:132
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v26
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v11
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v27
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; kill: killed $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; kill: killed $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB52_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v36
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v35
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v26
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v34
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v33
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v26
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v32
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v31
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v26
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v6
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v63
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v61
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v41
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v25
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v38
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v20
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v19
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v37
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v57
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v43
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v15
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v59
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v60
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v7
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v12
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v11
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; kill: killed $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: .LBB52_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB52_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v35
+; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v35
+; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v34
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v33
+; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v33
+; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v32
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31
+; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v27
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v31
+; GCN-NEXT: v_alignbit_b32 v26, v27, v26, 16
+; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v9
+; GCN-NEXT: v_alignbit_b32 v10, v26, v10, 16
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6
+; GCN-NEXT: v_alignbit_b32 v8, v10, v8, 16
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GCN-NEXT: v_alignbit_b32 v4, v8, v4, 16
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GCN-NEXT: v_alignbit_b32 v2, v4, v2, 16
+; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v63
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v61
+; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GCN-NEXT: v_alignbit_b32 v4, v4, v8, 16
+; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v41
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v25
+; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v4
+; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GCN-NEXT: v_alignbit_b32 v8, v8, v10, 16
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v38
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v24
+; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v8
+; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v8
+; GCN-NEXT: v_alignbit_b32 v10, v10, v24, 16
+; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v10
+; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v10
+; GCN-NEXT: v_alignbit_b32 v22, v22, v23, 16
+; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20
+; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v37
+; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v19
+; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v19
+; GCN-NEXT: v_alignbit_b32 v21, v21, v22, 16
+; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v15
+; GCN-NEXT: v_alignbit_b32 v18, v21, v18, 16
+; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v13
+; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GCN-NEXT: v_alignbit_b32 v16, v17, v16, 16
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+;
GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v59 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v60 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v57 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v43 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v37 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v54 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v40 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v41, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v42, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v43, 0x40c00000, v30 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v32 +; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v34 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v36 +; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v37 +; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v38 +; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v39 +; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v48 +; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v49 +; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v50 +; GCN-NEXT: v_add_f32_e32 v56, 0x40c00000, v51 +; GCN-NEXT: v_add_f32_e32 v48, 0x40c00000, v52 +; GCN-NEXT: v_add_f32_e32 v57, 0x40c00000, v53 +; GCN-NEXT: v_add_f32_e32 v49, 0x40c00000, v54 +; GCN-NEXT: v_add_f32_e32 v59, 0x40c00000, v55 +; GCN-NEXT: v_add_f32_e32 v50, 0x40c00000, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v27 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v28 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v29 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v30 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v37 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v39 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v48 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v31 +; GCN-NEXT: 
v_lshrrev_b32_e32 v48, 16, v49 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v50 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v35 +; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v39, v51, v12, 16 +; GCN-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v49, v53, v7, 16 +; GCN-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v50, v54, v16, 16 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_alignbit_b32 v51, v21, v18, 16 +; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_alignbit_b32 v53, v23, v22, 16 +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_alignbit_b32 v54, v25, v24, 16 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v55, v26, v41, 16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mov_b32_e32 v26, v40 +; GCN-NEXT: v_alignbit_b32 v40, v26, v42, 16 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v42, v44, v43, 16 +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_alignbit_b32 v44, v45, v32, 16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v45, v27, v34, 16 +; GCN-NEXT: v_alignbit_b32 v46, v28, v36, 16 +; GCN-NEXT: v_alignbit_b32 v47, v29, v38, 16 +; GCN-NEXT: v_alignbit_b32 v56, v30, v56, 16 +; GCN-NEXT: v_alignbit_b32 v58, v48, v57, 16 +; GCN-NEXT: v_alignbit_b32 v62, v52, v59, 16 +; GCN-NEXT: v_alignbit_b32 v7, v62, v20, 16 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v58, v19, 16 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v56, v17, 16 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v7, v47, v9, 16 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v6, v46, v6, 16 +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v3, v45, v3, 16 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v1, v44, v1, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v42, v2, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v40, v4, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v55, v8, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v10, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:392 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v53, v15, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v14, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v50, v13, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v49, v5, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v39, v11, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GCN-NEXT: .LBB52_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v59, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; GCN-NEXT: v_or_b32_e32 v57, v1, v2 +; GCN-NEXT: v_add_i32_e32 v63, vcc, 8, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v58, v1, v2 +; GCN-NEXT: v_add_i32_e32 v62, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; GCN-NEXT: v_or_b32_e32 v56, v1, v2 +; GCN-NEXT: v_add_i32_e32 v61, vcc, 16, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_add_i32_e32 v60, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:360 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; GCN-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; GCN-NEXT: v_or_b32_e32 v10, v10, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v44 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v42 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v20, v20, v21 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v26 +; GCN-NEXT: v_or_b32_e32 v22, v22, v23 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_or_b32_e32 v24, v24, v25 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v55 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; 
GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_or_b32_e32 v28, v28, v29 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v54 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GCN-NEXT: v_or_b32_e32 v30, v30, v31 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_or_b32_e32 v32, v32, v33 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v53 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_or_b32_e32 v34, v34, v35 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GCN-NEXT: v_or_b32_e32 v36, v36, v37 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v38, 0xffff, v51 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_or_b32_e32 v38, v38, v48 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x60, v0 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_or_b32_e32 v51, v51, v52 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_or_b32_e32 v50, v50, v53 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x68, v0 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_or_b32_e32 v54, v54, v55 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v49, 0xffff, v49 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_or_b32_e32 v49, v49, v40 +; GCN-NEXT: v_add_i32_e32 v40, 
vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v41, 0xffff, v41 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_or_b32_e32 v41, v41, v42 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_or_b32_e32 v39, v39, v43 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v59, v63, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v62, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v61, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v60, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v38, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v51, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v54, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v49, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64bf16_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; VI-NEXT: v_bfe_u32 v33, v32, 16, 1 +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v32 +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x7fff, v33 +; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; VI-NEXT: v_bfe_u32 v33, v16, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v33, vcc, v33, v16 +; VI-NEXT: v_add_u32_e32 v33, vcc, s6, v33 +; VI-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; VI-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; VI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; VI-NEXT: v_bfe_u32 v34, v33, 16, 1 +; VI-NEXT: v_add_u32_e32 v34, vcc, v34, v33 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; VI-NEXT: v_bfe_u32 v34, v17, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v34, vcc, v34, v17 +; VI-NEXT: v_add_u32_e32 v34, vcc, s6, v34 +; VI-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; VI-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; VI-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; VI-NEXT: v_bfe_u32 v35, v34, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v34 +; VI-NEXT: v_add_u32_e32 v35, vcc, s6, v35 +; VI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; VI-NEXT: v_bfe_u32 v35, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v35, vcc, v35, v18 +; VI-NEXT: v_add_u32_e32 v35, vcc, s6, v35 +; VI-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc +; VI-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; VI-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; VI-NEXT: v_bfe_u32 v36, v35, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v35 +; VI-NEXT: v_add_u32_e32 v36, vcc, s6, v36 +; VI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; VI-NEXT: v_bfe_u32 v36, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v36, vcc, v36, v19 +; VI-NEXT: v_add_u32_e32 v36, vcc, s6, v36 +; VI-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; VI-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; VI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; VI-NEXT: v_bfe_u32 v37, v36, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v36 +; VI-NEXT: v_add_u32_e32 v37, vcc, s6, v37 +; VI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; VI-NEXT: v_bfe_u32 v37, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v37, vcc, v37, v20 +; VI-NEXT: v_add_u32_e32 v37, vcc, s6, v37 +; VI-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; VI-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; VI-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; VI-NEXT: v_bfe_u32 v38, v37, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v37 +; VI-NEXT: v_add_u32_e32 v38, vcc, s6, v38 +; VI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; VI-NEXT: v_bfe_u32 v38, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v38, vcc, v38, v21 +; VI-NEXT: v_add_u32_e32 v38, vcc, s6, v38 +; VI-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; VI-NEXT: v_lshlrev_b32_e32 v38, 16, v22 +; VI-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; VI-NEXT: v_bfe_u32 v39, v38, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v38 +; VI-NEXT: v_add_u32_e32 v39, vcc, s6, v39 +; VI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: 
v_cndmask_b32_e32 v38, v39, v48, vcc +; VI-NEXT: v_bfe_u32 v39, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v39, vcc, v39, v22 +; VI-NEXT: v_add_u32_e32 v39, vcc, s6, v39 +; VI-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; VI-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; VI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; VI-NEXT: v_bfe_u32 v48, v39, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v39 +; VI-NEXT: v_add_u32_e32 v48, vcc, s6, v48 +; VI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; VI-NEXT: v_bfe_u32 v48, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v48, vcc, v48, v23 +; VI-NEXT: v_add_u32_e32 v48, vcc, s6, v48 +; VI-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; VI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; VI-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; VI-NEXT: v_bfe_u32 v49, v48, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v48 +; VI-NEXT: v_add_u32_e32 v49, vcc, s6, v49 +; VI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; VI-NEXT: v_bfe_u32 v49, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v49, vcc, v49, v24 +; VI-NEXT: v_add_u32_e32 v49, vcc, s6, v49 +; VI-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; VI-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; VI-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; VI-NEXT: v_bfe_u32 v50, v49, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v49 +; VI-NEXT: v_add_u32_e32 v50, vcc, s6, v50 +; VI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; VI-NEXT: v_bfe_u32 v50, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v50, vcc, v50, v25 +; VI-NEXT: v_add_u32_e32 v50, vcc, s6, v50 +; VI-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; VI-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; VI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; VI-NEXT: v_bfe_u32 v51, v50, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v50 +; VI-NEXT: v_add_u32_e32 v51, vcc, s6, v51 +; VI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; VI-NEXT: v_bfe_u32 v51, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v51, vcc, v51, v26 +; VI-NEXT: v_add_u32_e32 v51, vcc, s6, v51 +; VI-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; VI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; VI-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; VI-NEXT: v_bfe_u32 v52, v51, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v51 +; VI-NEXT: v_add_u32_e32 v52, vcc, s6, v52 +; VI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: 
v_cndmask_b32_e32 v51, v52, v53, vcc +; VI-NEXT: v_bfe_u32 v52, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v52, vcc, v52, v27 +; VI-NEXT: v_add_u32_e32 v52, vcc, s6, v52 +; VI-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; VI-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; VI-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; VI-NEXT: v_bfe_u32 v53, v52, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v52 +; VI-NEXT: v_add_u32_e32 v53, vcc, s6, v53 +; VI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; VI-NEXT: v_bfe_u32 v53, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v53, vcc, v53, v28 +; VI-NEXT: v_add_u32_e32 v53, vcc, s6, v53 +; VI-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; VI-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; VI-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; VI-NEXT: v_bfe_u32 v54, v53, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v53 +; VI-NEXT: v_add_u32_e32 v54, vcc, s6, v54 +; VI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; VI-NEXT: v_bfe_u32 v54, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v54, vcc, v54, v29 +; VI-NEXT: v_add_u32_e32 v54, vcc, s6, v54 +; VI-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; VI-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; VI-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; VI-NEXT: v_bfe_u32 v55, v54, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v54 +; VI-NEXT: v_add_u32_e32 v55, vcc, s6, v55 +; VI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; VI-NEXT: v_bfe_u32 v55, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v55, vcc, v55, v30 +; VI-NEXT: v_add_u32_e32 v55, vcc, s6, v55 +; VI-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; VI-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; VI-NEXT: v_bfe_u32 v40, v55, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v55 +; VI-NEXT: v_add_u32_e32 v40, vcc, s6, v40 +; VI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; VI-NEXT: v_bfe_u32 v40, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v40, vcc, v40, v31 +; VI-NEXT: v_add_u32_e32 v40, vcc, s6, v40 +; VI-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; VI-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; VI-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; VI-NEXT: v_bfe_u32 v41, v40, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v40 +; VI-NEXT: v_add_u32_e32 v41, vcc, s6, v41 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 
+; VI-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; VI-NEXT: v_bfe_u32 v41, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v41, vcc, v41, v0 +; VI-NEXT: v_add_u32_e32 v41, vcc, s6, v41 +; VI-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc +; VI-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; VI-NEXT: v_add_f32_e32 v41, 0x40c00000, v41 +; VI-NEXT: v_bfe_u32 v42, v41, 16, 1 +; VI-NEXT: v_add_u32_e32 v42, vcc, v42, v41 +; VI-NEXT: v_add_u32_e32 v42, vcc, s6, v42 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; VI-NEXT: v_bfe_u32 v42, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v42, vcc, v42, v1 +; VI-NEXT: v_add_u32_e32 v42, vcc, s6, v42 +; VI-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; VI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; VI-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; VI-NEXT: v_bfe_u32 v43, v42, 16, 1 +; VI-NEXT: v_add_u32_e32 v43, vcc, v43, v42 +; VI-NEXT: v_add_u32_e32 v43, vcc, s6, v43 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; VI-NEXT: v_bfe_u32 v43, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v43, vcc, v43, v2 +; VI-NEXT: v_add_u32_e32 v43, vcc, s6, v43 +; VI-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc +; VI-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; VI-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; VI-NEXT: v_bfe_u32 v44, v43, 16, 1 +; VI-NEXT: v_add_u32_e32 v44, vcc, v44, v43 +; VI-NEXT: v_add_u32_e32 v44, vcc, s6, v44 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; VI-NEXT: v_bfe_u32 v44, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v44, vcc, v44, v3 +; VI-NEXT: v_add_u32_e32 v44, vcc, s6, v44 +; VI-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; VI-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; VI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; VI-NEXT: v_bfe_u32 v45, v44, 16, 1 +; VI-NEXT: v_add_u32_e32 v45, vcc, v45, v44 +; VI-NEXT: v_add_u32_e32 v45, vcc, s6, v45 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; VI-NEXT: v_bfe_u32 v45, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v45, vcc, v45, v4 +; VI-NEXT: v_add_u32_e32 v45, vcc, s6, v45 +; VI-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; VI-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; VI-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; VI-NEXT: v_bfe_u32 v46, v45, 16, 1 +; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v45 +; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; VI-NEXT: v_add_f32_e32 
v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; VI-NEXT: v_bfe_u32 v46, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v46, vcc, v46, v5 +; VI-NEXT: v_add_u32_e32 v46, vcc, s6, v46 +; VI-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; VI-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; VI-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; VI-NEXT: v_bfe_u32 v47, v46, 16, 1 +; VI-NEXT: v_add_u32_e32 v47, vcc, v47, v46 +; VI-NEXT: v_add_u32_e32 v47, vcc, s6, v47 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v46, v47, v56, vcc +; VI-NEXT: v_bfe_u32 v47, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v47, vcc, v47, v6 +; VI-NEXT: v_add_u32_e32 v47, vcc, s6, v47 +; VI-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; VI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; VI-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; VI-NEXT: v_bfe_u32 v56, v47, 16, 1 +; VI-NEXT: v_add_u32_e32 v56, vcc, v56, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, s6, v56 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; VI-NEXT: v_bfe_u32 v56, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v56, vcc, v56, v7 +; VI-NEXT: v_add_u32_e32 v56, vcc, s6, v56 +; VI-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v8 +; VI-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; VI-NEXT: v_bfe_u32 v57, v56, 16, 1 +; VI-NEXT: v_add_u32_e32 v57, vcc, v57, v56 +; VI-NEXT: v_add_u32_e32 v57, vcc, s6, v57 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; VI-NEXT: v_bfe_u32 v57, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v57, vcc, v57, v8 +; VI-NEXT: v_add_u32_e32 v57, vcc, s6, v57 +; VI-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; VI-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; VI-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; VI-NEXT: v_bfe_u32 v58, v57, 16, 1 +; VI-NEXT: v_add_u32_e32 v58, vcc, v58, v57 +; VI-NEXT: v_add_u32_e32 v58, vcc, s6, v58 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; VI-NEXT: v_bfe_u32 v58, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v58, vcc, v58, v9 +; VI-NEXT: v_add_u32_e32 v58, vcc, s6, v58 +; VI-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v10 +; VI-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; VI-NEXT: v_bfe_u32 v59, v58, 16, 1 +; VI-NEXT: v_add_u32_e32 v59, vcc, v59, v58 +; VI-NEXT: v_add_u32_e32 v59, vcc, s6, v59 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v58, v59, v60, 
vcc +; VI-NEXT: v_bfe_u32 v59, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v59, vcc, v59, v10 +; VI-NEXT: v_add_u32_e32 v59, vcc, s6, v59 +; VI-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc +; VI-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; VI-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; VI-NEXT: v_bfe_u32 v60, v59, 16, 1 +; VI-NEXT: v_add_u32_e32 v60, vcc, v60, v59 +; VI-NEXT: v_add_u32_e32 v60, vcc, s6, v60 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; VI-NEXT: v_bfe_u32 v60, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v60, vcc, v60, v11 +; VI-NEXT: v_add_u32_e32 v60, vcc, s6, v60 +; VI-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; VI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; VI-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; VI-NEXT: v_bfe_u32 v61, v60, 16, 1 +; VI-NEXT: v_add_u32_e32 v61, vcc, v61, v60 +; VI-NEXT: v_add_u32_e32 v61, vcc, s6, v61 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc +; VI-NEXT: v_bfe_u32 v61, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v61, vcc, v61, v12 +; VI-NEXT: v_add_u32_e32 v61, vcc, s6, v61 +; VI-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; VI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; VI-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; VI-NEXT: v_bfe_u32 v62, v61, 16, 1 +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v61 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; VI-NEXT: v_bfe_u32 v62, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v13 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; VI-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; VI-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; VI-NEXT: v_bfe_u32 v63, v62, 16, 1 +; VI-NEXT: v_add_u32_e32 v63, vcc, v63, v62 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_add_u32_e32 v63, vcc, s6, v63 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; VI-NEXT: v_bfe_u32 v62, v14, 16, 1 +; VI-NEXT: v_cndmask_b32_e32 v1, v63, v0, vcc +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v14 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; VI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; VI-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; VI-NEXT: v_bfe_u32 v63, v62, 16, 1 +; VI-NEXT: v_add_u32_e32 v63, vcc, v63, v62 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_add_u32_e32 v63, vcc, s6, v63 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; VI-NEXT: v_bfe_u32 v62, v15, 16, 1 +; VI-NEXT: 
v_cndmask_b32_e32 v0, v63, v0, vcc +; VI-NEXT: v_add_u32_e32 v62, vcc, v62, v15 +; VI-NEXT: v_add_u32_e32 v62, vcc, s6, v62 +; VI-NEXT: v_or_b32_e32 v63, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v0, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v0, v1, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v0, v61, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v0, v60, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v0, v59, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v0, v58, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v0, v57, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v0, v56, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v0, v47, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v0, v46, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v0, v45, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v0, v44, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v0, v43, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v0, v42, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 +; VI-NEXT: v_alignbit_b32 v31, v16, v55, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; VI-NEXT: v_alignbit_b32 v30, v16, v54, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; VI-NEXT: v_alignbit_b32 v29, v16, v53, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; VI-NEXT: v_alignbit_b32 v28, v16, v52, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; VI-NEXT: v_alignbit_b32 v27, v16, v51, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; VI-NEXT: v_alignbit_b32 v26, v16, v50, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; VI-NEXT: v_alignbit_b32 v25, v16, v49, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; VI-NEXT: v_alignbit_b32 v24, v16, v48, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; VI-NEXT: v_alignbit_b32 v23, v16, v39, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; VI-NEXT: v_alignbit_b32 v22, v16, v38, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; VI-NEXT: v_alignbit_b32 v21, v16, v37, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; VI-NEXT: v_alignbit_b32 v20, v16, v36, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; VI-NEXT: v_alignbit_b32 v19, v16, v35, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; VI-NEXT: v_alignbit_b32 v18, v16, v34, 16 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; VI-NEXT: v_alignbit_b32 v1, v0, v41, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_alignbit_b32 v17, v16, v33, 16 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: v_alignbit_b32 v0, v0, v40, 16 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 +; VI-NEXT: .LBB52_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64bf16_to_v64i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v33, v33, v32, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v32, 
v33, v34, vcc +; GFX9-NEXT: v_bfe_u32 v33, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v33, v33, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 +; GFX9-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_add3_u32 v34, v34, v33, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v33 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v33, v34, v35, vcc +; GFX9-NEXT: v_bfe_u32 v34, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v34, v34, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v35, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v32, v34, v35, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX9-NEXT: v_bfe_u32 v35, v34, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_add3_u32 v35, v35, v34, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v34 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v34, v34 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v34, v35, v36, vcc +; GFX9-NEXT: v_bfe_u32 v35, v18, 16, 1 +; GFX9-NEXT: v_add3_u32 v35, v35, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v36, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v35, v36, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX9-NEXT: v_bfe_u32 v36, v35, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX9-NEXT: v_add3_u32 v36, v36, v35, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v35 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v35, v35 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v35, v36, v37, vcc +; GFX9-NEXT: v_bfe_u32 v36, v19, 16, 1 +; GFX9-NEXT: v_add3_u32 v36, v36, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v36, v37, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX9-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX9-NEXT: v_add3_u32 v37, v37, v36, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v36, v36 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v36, v37, v38, vcc +; GFX9-NEXT: v_bfe_u32 v37, v20, 16, 1 +; GFX9-NEXT: v_add3_u32 v37, v37, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v38, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v37, v38, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v37, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX9-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX9-NEXT: v_add3_u32 v38, v38, v37, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v37, v37 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v37, v38, v39, vcc +; GFX9-NEXT: v_bfe_u32 v38, v21, 16, 1 +; GFX9-NEXT: v_add3_u32 v38, v38, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v38, v39, vcc +; GFX9-NEXT: 
v_lshlrev_b32_e32 v38, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v38, 0x40c00000, v38 +; GFX9-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX9-NEXT: v_add3_u32 v39, v39, v38, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v38 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v38, v38 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v38, v39, v48, vcc +; GFX9-NEXT: v_bfe_u32 v39, v22, 16, 1 +; GFX9-NEXT: v_add3_u32 v39, v39, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v48, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v39, v48, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; GFX9-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; GFX9-NEXT: v_bfe_u32 v48, v39, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX9-NEXT: v_add3_u32 v48, v48, v39, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v39 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v39, v39 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v39, v48, v49, vcc +; GFX9-NEXT: v_bfe_u32 v48, v23, 16, 1 +; GFX9-NEXT: v_add3_u32 v48, v48, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v49, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v48, 0x40c00000, v48 +; GFX9-NEXT: v_bfe_u32 v49, v48, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX9-NEXT: v_add3_u32 v49, v49, v48, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v48 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v48, v48 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v48, v49, v50, vcc +; GFX9-NEXT: v_bfe_u32 v49, v24, 16, 1 +; GFX9-NEXT: v_add3_u32 v49, v49, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v50, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v49, v50, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v49, 16, v25 +; GFX9-NEXT: v_add_f32_e32 v49, 0x40c00000, v49 +; GFX9-NEXT: v_bfe_u32 v50, v49, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX9-NEXT: v_add3_u32 v50, v50, v49, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v49 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v49, v49 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v49, v50, v51, vcc +; GFX9-NEXT: v_bfe_u32 v50, v25, 16, 1 +; GFX9-NEXT: v_add3_u32 v50, v50, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v51, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v50, v51, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v50, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 +; GFX9-NEXT: v_bfe_u32 v51, v50, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX9-NEXT: v_add3_u32 v51, v51, v50, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v50 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v50, v50 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v50, v51, v52, vcc +; GFX9-NEXT: v_bfe_u32 v51, v26, 16, 1 +; GFX9-NEXT: v_add3_u32 v51, v51, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v51, v52, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v51, 0x40c00000, v51 +; GFX9-NEXT: v_bfe_u32 v52, v51, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX9-NEXT: v_add3_u32 v52, v52, v51, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v51 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v51, v51 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; 
GFX9-NEXT: v_cndmask_b32_e32 v51, v52, v53, vcc +; GFX9-NEXT: v_bfe_u32 v52, v27, 16, 1 +; GFX9-NEXT: v_add3_u32 v52, v52, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v53, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v52, v53, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v52, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v52, 0x40c00000, v52 +; GFX9-NEXT: v_bfe_u32 v53, v52, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX9-NEXT: v_add3_u32 v53, v53, v52, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v52 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v52, v52 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v52, v53, v54, vcc +; GFX9-NEXT: v_bfe_u32 v53, v28, 16, 1 +; GFX9-NEXT: v_add3_u32 v53, v53, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v54, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v53, 0x40c00000, v53 +; GFX9-NEXT: v_bfe_u32 v54, v53, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX9-NEXT: v_add3_u32 v54, v54, v53, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v53 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v53, v53 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v53, v54, v55, vcc +; GFX9-NEXT: v_bfe_u32 v54, v29, 16, 1 +; GFX9-NEXT: v_add3_u32 v54, v54, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v55, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v54, v55, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v54, 0x40c00000, v54 +; GFX9-NEXT: v_bfe_u32 v55, v54, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX9-NEXT: v_add3_u32 v55, v55, v54, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v54 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v54, v54 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v54, v55, v40, vcc +; GFX9-NEXT: v_bfe_u32 v55, v30, 16, 1 +; GFX9-NEXT: v_add3_u32 v55, v55, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v40, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v55, v40, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v55, 0x40c00000, v55 +; GFX9-NEXT: v_bfe_u32 v40, v55, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX9-NEXT: v_add3_u32 v40, v40, v55, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v55 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v55, v55 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v55, v40, v41, vcc +; GFX9-NEXT: v_bfe_u32 v40, v31, 16, 1 +; GFX9-NEXT: v_add3_u32 v40, v40, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v40, v41, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v40, 0x40c00000, v40 +; GFX9-NEXT: v_bfe_u32 v41, v40, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v41, v41, v40, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v40 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v40, v40 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v40, v41, v42, vcc +; GFX9-NEXT: v_bfe_u32 v41, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v41, v41, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v42, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v41, v42, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 16, v1 +; GFX9-NEXT: 
v_add_f32_e32 v41, 0x40c00000, v41 +; GFX9-NEXT: v_bfe_u32 v42, v41, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v42, v42, v41, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v41 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v41, v41 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v41, v42, v43, vcc +; GFX9-NEXT: v_bfe_u32 v42, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v42, v42, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v42, 0x40c00000, v42 +; GFX9-NEXT: v_bfe_u32 v43, v42, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v43, v43, v42, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v42 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v42, v42 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v42, v43, v44, vcc +; GFX9-NEXT: v_bfe_u32 v43, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v43, v43, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v43, v44, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v43, 0x40c00000, v43 +; GFX9-NEXT: v_bfe_u32 v44, v43, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v44, v44, v43, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v43 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v43, v43 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v44, v45, vcc +; GFX9-NEXT: v_bfe_u32 v44, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v44, v44, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v44, v45, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; GFX9-NEXT: v_bfe_u32 v45, v44, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v45, v45, v44, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v44 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v44, v44 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v45, v46, vcc +; GFX9-NEXT: v_bfe_u32 v45, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v45, v45, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v45, v46, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v45, 0x40c00000, v45 +; GFX9-NEXT: v_bfe_u32 v46, v45, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v46, v46, v45, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v45 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v45, v45 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v45, v46, v47, vcc +; GFX9-NEXT: v_bfe_u32 v46, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v46, v46, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v47, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v46, v47, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v46, 0x40c00000, v46 +; GFX9-NEXT: v_bfe_u32 v47, v46, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v47, v47, v46, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v46 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v46, v46 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: 
v_cndmask_b32_e32 v46, v47, v56, vcc +; GFX9-NEXT: v_bfe_u32 v47, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v47, v47, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v56, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v47, v56, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v47, 0x40c00000, v47 +; GFX9-NEXT: v_bfe_u32 v56, v47, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v56, v56, v47, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v47 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v47, v47 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v56, v57, vcc +; GFX9-NEXT: v_bfe_u32 v56, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v56, v56, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v57, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v56, v57, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v56, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v56, 0x40c00000, v56 +; GFX9-NEXT: v_bfe_u32 v57, v56, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v57, v57, v56, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v56 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v56, v56 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v57, v58, vcc +; GFX9-NEXT: v_bfe_u32 v57, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v57, v57, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v58, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v57, v58, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v57, 0x40c00000, v57 +; GFX9-NEXT: v_bfe_u32 v58, v57, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v58, v58, v57, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v57 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v57, v57 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v58, v59, vcc +; GFX9-NEXT: v_bfe_u32 v58, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v58, v58, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v59, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v58, v59, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v58, 0x40c00000, v58 +; GFX9-NEXT: v_bfe_u32 v59, v58, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v59, v59, v58, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v58 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v58, v58 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v58, v59, v60, vcc +; GFX9-NEXT: v_bfe_u32 v59, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v59, v59, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v60, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v59, v60, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v59, 0x40c00000, v59 +; GFX9-NEXT: v_bfe_u32 v60, v59, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v60, v60, v59, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v59 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v59, v59 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v59, v60, v61, vcc +; GFX9-NEXT: v_bfe_u32 v60, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v60, v60, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v61, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v60, v61, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v60, 0x40c00000, v60 +; GFX9-NEXT: v_bfe_u32 v61, v60, 16, 
1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v61, v61, v60, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v60 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v60, v60 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v60, v61, v62, vcc +; GFX9-NEXT: v_bfe_u32 v61, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v61, v61, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v62, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v61, v62, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v61, 0x40c00000, v61 +; GFX9-NEXT: v_bfe_u32 v62, v61, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v62, v62, v61, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v61 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v61, v61 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v61, v62, v63, vcc +; GFX9-NEXT: v_bfe_u32 v62, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v62, v62, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v14, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v62, v63, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v62, 0x40c00000, v62 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_bfe_u32 v63, v62, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_add3_u32 v63, v63, v62, s6 +; GFX9-NEXT: v_or_b32_e32 v0, 0x400000, v62 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v62, v62 +; GFX9-NEXT: v_bfe_u32 v62, v15, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v63, v0, vcc +; GFX9-NEXT: v_add3_u32 v62, v62, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v63, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v62, v63, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v0, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v1, s6 +; GFX9-NEXT: v_perm_b32 v1, v16, v41, s6 +; GFX9-NEXT: v_perm_b32 v0, v17, v40, s6 +; GFX9-NEXT: v_perm_b32 v17, v32, v33, s6 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v13, v13, v61, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v60, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v59, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v58, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v57, s6 +; GFX9-NEXT: v_perm_b32 v8, v8, v56, s6 +; GFX9-NEXT: v_perm_b32 v7, v7, v47, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v46, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v45, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v44, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v43, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v42, s6 +; GFX9-NEXT: v_perm_b32 v31, v31, v55, s6 +; GFX9-NEXT: v_perm_b32 v30, v30, v54, s6 +; GFX9-NEXT: v_perm_b32 v29, v29, v53, s6 +; GFX9-NEXT: v_perm_b32 v28, v28, v52, s6 +; GFX9-NEXT: v_perm_b32 v27, v27, 
v51, s6 +; GFX9-NEXT: v_perm_b32 v26, v26, v50, s6 +; GFX9-NEXT: v_perm_b32 v25, v25, v49, s6 +; GFX9-NEXT: v_perm_b32 v24, v24, v48, s6 +; GFX9-NEXT: v_perm_b32 v23, v23, v39, s6 +; GFX9-NEXT: v_perm_b32 v22, v22, v38, s6 +; GFX9-NEXT: v_perm_b32 v21, v21, v37, s6 +; GFX9-NEXT: v_perm_b32 v20, v20, v36, s6 +; GFX9-NEXT: v_perm_b32 v19, v19, v35, s6 +; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 +; GFX9-NEXT: .LBB52_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64bf16_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_add_f32 v32, 0x40c00000, v32 +; GFX11-NEXT: v_dual_add_f32 v69, 0x40c00000, v69 :: v_dual_lshlrev_b32 v70, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v37, v33, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v32 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v10 +; GFX11-NEXT: v_add3_u32 v37, v37, v33, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 
v87, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v16 +; GFX11-NEXT: v_bfe_u32 v16, v32, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v87, 0x40c00000, v87 +; GFX11-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v34 +; GFX11-NEXT: v_add3_u32 v16, v16, v32, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v9 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v38, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v36, 0x40c00000, v17 +; GFX11-NEXT: v_bfe_u32 v17, v34, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-NEXT: v_add_f32_e32 v38, 0x40c00000, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v11 +; GFX11-NEXT: v_add3_u32 v17, v17, v34, 0x7fff +; GFX11-NEXT: v_bfe_u32 v34, v36, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v32, v17, v39, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v33 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-NEXT: v_add3_u32 v33, v34, v36, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v37, v17, vcc_lo +; GFX11-NEXT: v_bfe_u32 v37, v35, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v35 +; GFX11-NEXT: v_cndmask_b32_e32 v33, v33, v34, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v18 +; GFX11-NEXT: v_add3_u32 v34, v37, v35, 0x7fff +; GFX11-NEXT: v_bfe_u32 v37, v38, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11-NEXT: v_perm_b32 v17, v33, v17, 0x7060302 +; GFX11-NEXT: v_perm_b32 v16, v32, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v34, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v34, v37, v38, 0x7fff +; GFX11-NEXT: v_bfe_u32 v36, v39, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v37, 0x40c00000, v19 +; GFX11-NEXT: v_or_b32_e32 v35, 0x400000, v38 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11-NEXT: v_bfe_u32 v38, v37, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v36, v39, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v39 +; GFX11-NEXT: v_add_f32_e32 v48, 0x40c00000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-NEXT: v_add_f32_e32 v39, 0x40c00000, v20 +; GFX11-NEXT: v_perm_b32 v18, v34, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v35, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v35, v38, v37, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v37 +; GFX11-NEXT: v_bfe_u32 v38, v48, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v48 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v35, v35, v36, vcc_lo +; GFX11-NEXT: v_add3_u32 v36, v38, v48, 0x7fff +; GFX11-NEXT: v_bfe_u32 v38, v39, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-NEXT: v_dual_add_f32 v49, 0x40c00000, v20 :: 
v_dual_add_f32 v48, 0x40c00000, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v36, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v36, v38, v39, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v39 +; GFX11-NEXT: v_bfe_u32 v38, v49, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11-NEXT: v_bfe_u32 v39, v48, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v50, 0x40c00000, v21 +; GFX11-NEXT: v_perm_b32 v19, v35, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v36, v36, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v37, v38, v49, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v49 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-NEXT: v_dual_add_f32 v49, 0x40c00000, v22 :: v_dual_lshlrev_b32 v22, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11-NEXT: v_perm_b32 v20, v36, v20, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v37, v38, vcc_lo +; GFX11-NEXT: v_add3_u32 v37, v39, v48, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v48 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11-NEXT: v_bfe_u32 v48, v49, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v51, 0x40c00000, v22 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v37, v37, v38, vcc_lo +; GFX11-NEXT: v_bfe_u32 v39, v50, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-NEXT: v_perm_b32 v21, v37, v21, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v38, v39, v50, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v50 +; GFX11-NEXT: v_add_f32_e32 v50, 0x40c00000, v23 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v38, v39, vcc_lo +; GFX11-NEXT: v_add3_u32 v38, v48, v49, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v39, 0x400000, v49 +; GFX11-NEXT: v_bfe_u32 v48, v51, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; GFX11-NEXT: v_bfe_u32 v49, v50, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11-NEXT: v_cndmask_b32_e32 v38, v38, v39, vcc_lo +; GFX11-NEXT: v_add3_u32 v39, v48, v51, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v51 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-NEXT: v_dual_add_f32 v52, 0x40c00000, v23 :: v_dual_add_f32 v51, 0x40c00000, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v39, v48, vcc_lo +; GFX11-NEXT: v_add3_u32 v39, v49, v50, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v50 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11-NEXT: v_bfe_u32 v50, v51, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v53, 0x40c00000, v24 +; GFX11-NEXT: v_perm_b32 v22, v38, v22, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v39, v39, v48, vcc_lo +; GFX11-NEXT: v_bfe_u32 v49, v52, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v23, v39, v23, 0x7060302 +; GFX11-NEXT: v_add3_u32 v48, v49, v52, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v52 +; GFX11-NEXT: v_add_f32_e32 v52, 0x40c00000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v24, v48, v49, vcc_lo +; GFX11-NEXT: v_add3_u32 v48, v50, v51, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v51 +; GFX11-NEXT: v_bfe_u32 v50, v53, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 
16, v26 +; GFX11-NEXT: v_bfe_u32 v51, v52, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX11-NEXT: v_cndmask_b32_e32 v48, v48, v49, vcc_lo +; GFX11-NEXT: v_add3_u32 v49, v50, v53, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v53 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-NEXT: v_dual_add_f32 v54, 0x40c00000, v25 :: v_dual_add_f32 v53, 0x40c00000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v27 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v49, v50, vcc_lo +; GFX11-NEXT: v_add3_u32 v49, v51, v52, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v52 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11-NEXT: v_bfe_u32 v52, v53, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v55, 0x40c00000, v26 +; GFX11-NEXT: v_perm_b32 v24, v48, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v49, v49, v50, vcc_lo +; GFX11-NEXT: v_bfe_u32 v51, v54, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v25, v49, v25, 0x7060302 +; GFX11-NEXT: v_add3_u32 v50, v51, v54, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v54 +; GFX11-NEXT: v_add_f32_e32 v54, 0x40c00000, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v26, v50, v51, vcc_lo +; GFX11-NEXT: v_add3_u32 v50, v52, v53, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v51, 0x400000, v53 +; GFX11-NEXT: v_bfe_u32 v52, v55, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX11-NEXT: v_bfe_u32 v53, v54, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v50, v50, v51, vcc_lo +; GFX11-NEXT: v_add3_u32 v51, v52, v55, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v55 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-NEXT: v_dual_add_f32 v64, 0x40c00000, v27 :: v_dual_add_f32 v55, 0x40c00000, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v27, v51, v52, vcc_lo +; GFX11-NEXT: v_add3_u32 v51, v53, v54, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v54 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11-NEXT: v_bfe_u32 v54, v55, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v65, 0x40c00000, v28 +; GFX11-NEXT: v_perm_b32 v26, v50, v26, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v51, v51, v52, vcc_lo +; GFX11-NEXT: v_bfe_u32 v53, v64, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v27, v51, v27, 0x7060302 +; GFX11-NEXT: v_add3_u32 v52, v53, v64, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v64 +; GFX11-NEXT: v_add_f32_e32 v64, 0x40c00000, v29 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v28, v52, v53, vcc_lo +; GFX11-NEXT: v_add3_u32 v52, v54, v55, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v53, 0x400000, v55 +; GFX11-NEXT: v_bfe_u32 v54, v65, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v30 +; GFX11-NEXT: v_bfe_u32 v55, v64, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX11-NEXT: v_cndmask_b32_e32 v52, v52, v53, vcc_lo +; GFX11-NEXT: v_add3_u32 v53, v54, v65, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v65 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-NEXT: v_dual_add_f32 v66, 0x40c00000, v29 :: v_dual_add_f32 v65, 0x40c00000, v30 +; GFX11-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11-NEXT: v_cndmask_b32_e32 v29, v53, v54, vcc_lo +; GFX11-NEXT: v_add3_u32 v53, v55, v64, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v64 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11-NEXT: v_bfe_u32 v64, v65, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v67, 0x40c00000, v30 +; GFX11-NEXT: v_perm_b32 v28, v52, v28, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v53, v53, v54, vcc_lo +; GFX11-NEXT: v_bfe_u32 v55, v66, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v29, v53, v29, 0x7060302 +; GFX11-NEXT: v_add3_u32 v54, v55, v66, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v66 +; GFX11-NEXT: v_add_f32_e32 v66, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v30, v54, v55, vcc_lo +; GFX11-NEXT: v_add3_u32 v54, v64, v65, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v55, 0x400000, v65 +; GFX11-NEXT: v_bfe_u32 v64, v67, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v65, v65 +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX11-NEXT: v_bfe_u32 v65, v66, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v54, v54, v55, vcc_lo +; GFX11-NEXT: v_add3_u32 v55, v64, v67, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v67 +; GFX11-NEXT: v_add_f32_e32 v68, 0x40c00000, v31 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v67, 16, v1 +; GFX11-NEXT: v_perm_b32 v30, v54, v30, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v55, v64, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v67, 0x40c00000, v67 +; GFX11-NEXT: v_add3_u32 v55, v65, v66, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v66 +; GFX11-NEXT: v_bfe_u32 v65, v68, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v66, v66 +; GFX11-NEXT: v_bfe_u32 v66, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v55, v55, v64, vcc_lo +; GFX11-NEXT: v_add3_u32 v64, v65, v68, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v65, 0x400000, v68 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v68, v68 +; GFX11-NEXT: v_bfe_u32 v68, v67, 16, 1 +; GFX11-NEXT: v_perm_b32 v31, v55, v31, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v64, v64, v65, vcc_lo +; GFX11-NEXT: v_add3_u32 v65, v66, v0, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v65, v66 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_add3_u32 v65, v68, v67, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v67 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v67, v67 +; GFX11-NEXT: v_perm_b32 v0, v0, v64, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v68, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v65, v65, v66, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v66, v68, v1, 0x7fff +; GFX11-NEXT: v_bfe_u32 v68, v69, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v66, v67, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v66, v68, v69, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v67, 0x400000, v69 +; GFX11-NEXT: v_bfe_u32 v68, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v69, v69 +; GFX11-NEXT: v_perm_b32 v1, v1, v65, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v66, v66, v67, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v67, v68, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v67, v68 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v70, 0x40c00000, v70 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_perm_b32 v2, v2, v66, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v69, v70, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v70 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v70, v70 +; GFX11-NEXT: v_add3_u32 v67, v69, v70, 0x7fff +; GFX11-NEXT: v_bfe_u32 v69, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo +; GFX11-NEXT: v_add3_u32 v68, v69, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v68, v69 :: v_dual_and_b32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v71, 0x40c00000, v71 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v3, v3, v67, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v70, v71, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v69, 0x400000, v71 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v71, v71 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v68, v70, v71, 0x7fff +; GFX11-NEXT: v_bfe_u32 v70, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v68, v68, v69, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v69, v70, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v69, v70 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v80, 0x40c00000, v80 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_perm_b32 v4, v4, v68, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v71, v80, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v80 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v80, v80 +; GFX11-NEXT: v_add3_u32 v69, v71, v80, 0x7fff +; GFX11-NEXT: v_bfe_u32 v71, v5, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v69, v69, v70, vcc_lo +; GFX11-NEXT: v_add3_u32 v70, v71, v5, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v70, v71 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_add_f32 v81, 0x40c00000, 
v81 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v5, v5, v69, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v80, v81, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v71, 0x400000, v81 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v70, v80, v81, 0x7fff +; GFX11-NEXT: v_bfe_u32 v80, v6, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v70, v70, v71, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v71, v80, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v71, v80 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v82, 0x40c00000, v82 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_perm_b32 v6, v6, v70, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v81, v82, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v82 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v82, v82 +; GFX11-NEXT: v_add3_u32 v71, v81, v82, 0x7fff +; GFX11-NEXT: v_bfe_u32 v81, v7, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v71, v71, v80, vcc_lo +; GFX11-NEXT: v_add3_u32 v80, v81, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v7, v80, v81 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_dual_add_f32 v83, 0x40c00000, v83 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v7, v7, v71, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v82, v83, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v81, 0x400000, v83 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v83, v83 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v80, v82, v83, 0x7fff +; GFX11-NEXT: v_bfe_u32 v82, v8, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v80, v80, v81, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v81, v82, v8, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_dual_cndmask_b32 v8, v81, v82 :: v_dual_and_b32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v84, 0x40c00000, v84 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_perm_b32 v8, v8, v80, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v83, v84, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v84 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-NEXT: v_add3_u32 v81, v83, v84, 0x7fff +; GFX11-NEXT: v_bfe_u32 v83, v9, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v81, v81, v82, vcc_lo +; GFX11-NEXT: v_add3_u32 v82, v83, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v9, v82, v83 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_add_f32 v85, 0x40c00000, v85 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v9, v9, v81, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v84, v85, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v83, 0x400000, v85 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v85, v85 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v82, v84, v85, 0x7fff +; GFX11-NEXT: v_bfe_u32 v84, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v82, v82, v83, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v83, v84, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v83, v84 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v86, 0x40c00000, v86 :: v_dual_add_f32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_perm_b32 v10, v10, v82, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v85, v86, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v86 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v86, v86 +; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v11 +; GFX11-NEXT: v_add3_u32 v83, v85, v86, 0x7fff +; GFX11-NEXT: v_bfe_u32 v85, v11, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v83, v83, v84 :: v_dual_lshlrev_b32 v84, 16, v13 +; GFX11-NEXT: v_bfe_u32 v86, v87, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_or_b32_e32 v97, 0x400000, v87 +; GFX11-NEXT: v_add_f32_e32 v84, 0x40c00000, v84 +; GFX11-NEXT: v_add3_u32 v86, v86, v87, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_bfe_u32 v98, v12, 16, 1 +; GFX11-NEXT: v_bfe_u32 v99, v84, 16, 1 +; GFX11-NEXT: v_add3_u32 v85, v85, v11, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v86, v86, v97, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v97, 0x400000, v12 +; GFX11-NEXT: v_add3_u32 v87, v98, v12, 0x7fff +; GFX11-NEXT: v_add3_u32 v98, v99, v84, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v99, 16, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v84 +; GFX11-NEXT: v_bfe_u32 v101, v13, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v87, v97 :: v_dual_add_f32 v87, 0x40c00000, v99 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v84, v84 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v97, v101, v13, 0x7fff +; GFX11-NEXT: v_perm_b32 v12, v12, v86, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v99, v87, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v84, v98, v100, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v98, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v101, 0x400000, v87 +; GFX11-NEXT: v_bfe_u32 v102, v14, 16, 1 +; GFX11-NEXT: v_add3_u32 v99, v99, v87, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v87, v87 +; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; 
GFX11-NEXT: v_dual_add_f32 v98, 0x40c00000, v98 :: v_dual_cndmask_b32 v87, v99, v101 +; GFX11-NEXT: v_add3_u32 v101, v102, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_bfe_u32 v103, v98, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v98 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v14, v101, v102 :: v_dual_add_f32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_add3_u32 v103, v103, v98, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v98, v98 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v14, v14, v87, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v99, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v113, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v98, v103, v112, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v99, v99, v15, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v15, v99, v113, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v15, v15, v98, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v97, v100, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_perm_b32 v13, v13, v84, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 +; GFX11-NEXT: .LBB52_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <64 x bfloat> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x bfloat> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i16_to_v64bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], 
s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v26 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v38 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v55 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v54 +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v35 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v18 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v16 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: 
$vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; kill: killed $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v44 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; GCN-NEXT: s_waitcnt vmcnt(12) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: .LBB53_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v55, v10 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v14, v54, v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v12, v53, v12 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v33 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v16, v52, v16 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v31 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v42 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v43 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v44 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v18, v51, v18 +; GCN-NEXT: v_or_b32_e32 v20, v50, v20 +; GCN-NEXT: v_or_b32_e32 v8, v49, v8 +; GCN-NEXT: v_or_b32_e32 v22, v48, v22 +; GCN-NEXT: v_or_b32_e32 v24, v39, v24 +; GCN-NEXT: v_or_b32_e32 v26, v38, v26 +; GCN-NEXT: v_or_b32_e32 v28, v30, v28 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v30, v31 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v31, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v32, v2 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v32, v33 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v33, v29 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v33, v27 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v33, v25 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v33, v23 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v33, v21 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v33, v19 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v33, v17 +; GCN-NEXT: buffer_load_dword v33, off, 
s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v33, v15 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v33, v13 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v33, v11 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v33, v9 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v33, v7 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v33, v5 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v33, v3 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v33, v1 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v30 +; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v31 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v32 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v29 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v7 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v19 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v21 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v23 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v27 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v6 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v31 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v28 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v34 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 +; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 +; GCN-NEXT: .LBB53_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v44, v2, v3, 16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v15, v2, v3, 16 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 12, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v11, v2, v3, 16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: v_add_i32_e32 v18, vcc, 20, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v3, v2, v3, 16 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v7, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 28, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v45, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v46, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 36, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v47, v2, v4, 
16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 40, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v56, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 44, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v57 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v57, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 48, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v58, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v34, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v33, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 60, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v61 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v32, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 64, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v31, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x44, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v63 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v41, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x48, v0 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x4c, v0 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v30 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_alignbit_b32 v4, v4, v6, 16 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x50, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v38 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:316 ; 4-byte 
Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 0x54, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v39 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_alignbit_b32 v8, v8, v10, 16 +; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x58, v0 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v48 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_alignbit_b32 v10, v10, v12, 16 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x5c, v0 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v49 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v12, v12, v14, 16 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x60, v0 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v50 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_alignbit_b32 v14, v14, v50, 16 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x64, v0 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_alignbit_b32 v16, v51, v16, 16 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v52 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_alignbit_b32 v22, v52, v22, 16 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x6c, v0 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_alignbit_b32 v24, v53, v24, 16 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x70, v0 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v54 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_alignbit_b32 v26, v54, v26, 16 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x74, v0 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v55 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_alignbit_b32 v28, v55, v28, 16 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v18, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v45, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v46, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v47, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v56, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v29, s[0:3], 0 offen +; GCN-NEXT: 
buffer_store_dword v58, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v34, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v32, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v41, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v39, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i16_to_v64bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v32, 3 +; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v33 +; VI-NEXT: v_add_u16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v33 +; VI-NEXT: v_add_u16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v13 +; 
VI-NEXT: v_or_b32_e32 v13, v13, v33 +; VI-NEXT: v_add_u16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v12, 3, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v33 +; VI-NEXT: v_add_u16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v11, 3, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v33 +; VI-NEXT: v_add_u16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v10, 3, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v33 +; VI-NEXT: v_add_u16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v9, 3, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v33 +; VI-NEXT: v_add_u16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v33 +; VI-NEXT: v_add_u16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v33 +; VI-NEXT: v_add_u16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v33 +; VI-NEXT: v_add_u16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v33 +; VI-NEXT: v_add_u16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v33 +; VI-NEXT: v_add_u16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v33 +; VI-NEXT: v_add_u16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v33 +; VI-NEXT: v_add_u16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v33 +; VI-NEXT: v_add_u16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v31, 3, v31 +; VI-NEXT: v_or_b32_e32 v31, v31, v33 +; VI-NEXT: v_add_u16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v30, 3, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v33 +; VI-NEXT: v_add_u16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v29, 3, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v33 +; VI-NEXT: v_add_u16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v28, 3, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v33 +; VI-NEXT: v_add_u16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v27, 3, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v33 +; VI-NEXT: v_add_u16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v26, 3, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v33 +; 
VI-NEXT: v_add_u16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v25, 3, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v33 +; VI-NEXT: v_add_u16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v24, 3, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v33 +; VI-NEXT: v_add_u16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v23, 3, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v33 +; VI-NEXT: v_add_u16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v22, 3, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v33 +; VI-NEXT: v_add_u16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v21, 3, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v33 +; VI-NEXT: v_add_u16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v20, 3, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v33 +; VI-NEXT: v_add_u16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v19, 3, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v33 +; VI-NEXT: v_add_u16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v18, 3, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v33 +; VI-NEXT: v_add_u16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: v_add_u16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v16 +; VI-NEXT: v_or_b32_e32 v17, v17, v33 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i16_to_v64bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v29, v29, 
3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i16_to_v64bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> 
%a1 to <64 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <64 x bfloat> + br label %end + +end: + %phi = phi <64 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x bfloat> %phi +} + +define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v64f16_to_v64i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v5 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v12 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v19 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v23 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v26 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v28 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v29 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v30 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v41 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v53 +; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v15 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v44 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v42 +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v54 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v52 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v51 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], 
s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v40 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v55 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v29 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:132 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v30 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v29 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v42, v56 +; GCN-NEXT: v_mov_b32_e32 v49, v57 +; GCN-NEXT: v_mov_b32_e32 v54, v58 +; GCN-NEXT: v_mov_b32_e32 v51, v62 +; GCN-NEXT: v_mov_b32_e32 v48, v4 +; GCN-NEXT: v_mov_b32_e32 v36, v5 +; GCN-NEXT: v_mov_b32_e32 v46, v6 +; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB54_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; GCN-NEXT: v_or_b32_e32 v9, v9, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; GCN-NEXT: v_or_b32_e32 v11, v11, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v15 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v17 +; GCN-NEXT: v_or_b32_e32 v16, v16, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v19 +; GCN-NEXT: 
v_or_b32_e32 v18, v18, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v21 +; GCN-NEXT: v_or_b32_e32 v20, v20, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v23 +; GCN-NEXT: v_or_b32_e32 v22, v22, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v25 +; GCN-NEXT: v_or_b32_e32 v24, v24, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v27 +; GCN-NEXT: v_or_b32_e32 v26, v26, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; GCN-NEXT: v_or_b32_e32 v3, v3, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; GCN-NEXT: v_or_b32_e32 v4, v5, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v10 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v8 +; GCN-NEXT: v_or_b32_e32 v5, v7, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v31 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v10, v46 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GCN-NEXT: v_or_b32_e32 v6, v10, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v32 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v33 +; GCN-NEXT: v_or_b32_e32 v32, v29, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GCN-NEXT: v_or_b32_e32 v34, v29, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v42, v42 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v44, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v46, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v56, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v58, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v61, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v63, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v57, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v62, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v45, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v60, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v41, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v59, v59 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v47, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v43, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v40, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; GCN-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; GCN-NEXT: v_add_f32_e32 v48, 0x38000000, v48 +; GCN-NEXT: v_add_f32_e32 v49, 0x38000000, v49 +; GCN-NEXT: v_add_f32_e32 v51, 0x38000000, v51 +; GCN-NEXT: v_add_f32_e32 v52, 0x38000000, v52 +; GCN-NEXT: v_add_f32_e32 v54, 0x38000000, v54 +; GCN-NEXT: v_add_f32_e32 v42, 0x38000000, v42 +; GCN-NEXT: v_add_f32_e32 v44, 0x38000000, v44 +; GCN-NEXT: v_add_f32_e32 v46, 0x38000000, v46 +; GCN-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; GCN-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; GCN-NEXT: v_add_f32_e32 v61, 0x38000000, v61 +; GCN-NEXT: v_add_f32_e32 v63, 0x38000000, v63 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v57, 0x38000000, v57 +; GCN-NEXT: v_add_f32_e32 v62, 0x38000000, v62 +; GCN-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; GCN-NEXT: v_add_f32_e32 v60, 0x38000000, v60 +; GCN-NEXT: v_add_f32_e32 v41, 0x38000000, v41 +; GCN-NEXT: v_add_f32_e32 v59, 0x38000000, v59 +; GCN-NEXT: v_add_f32_e32 v53, 0x38000000, v53 +; GCN-NEXT: v_add_f32_e32 v47, 0x38000000, v47 +; GCN-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; GCN-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; GCN-NEXT: v_add_f32_e32 v39, 0x38000000, v39 +; GCN-NEXT: v_add_f32_e32 v40, 0x38000000, v40 +; GCN-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; GCN-NEXT: v_add_f32_e32 v55, 0x38000000, v55 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v44 +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v56 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v58 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61 +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v57 +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v45 +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41 +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v59 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v61 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: 
v_or_b32_e32 v30, v30, v29 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v61, v38, v36 +; GCN-NEXT: v_or_b32_e32 v49, v49, v48 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v52, v51 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v42, v42, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v46, v44 +; GCN-NEXT: v_mov_b32_e32 v46, v6 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v58, v56 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v63, v2 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v57, v28 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v45, v45, v62 +; GCN-NEXT: v_or_b32_e32 v41, v41, v60 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v53, v59 +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v50, v50, v47 +; GCN-NEXT: v_or_b32_e32 v39, v39, v43 +; GCN-NEXT: v_or_b32_e32 v37, v37, v40 +; GCN-NEXT: v_or_b32_e32 v1, v1, v55 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: v_alignbit_b32 v63, v34, v29, 16 +; GCN-NEXT: v_alignbit_b32 v36, v32, v36, 16 +; GCN-NEXT: v_alignbit_b32 v48, v46, v48, 16 +; GCN-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NEXT: v_alignbit_b32 v51, v5, v51, 16 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_alignbit_b32 v54, v4, v54, 16 +; GCN-NEXT: v_alignbit_b32 v29, v3, v44, 16 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; GCN-NEXT: v_alignbit_b32 v29, v1, v56, 16 +; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v2, v26, v2, 16 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v28, v24, v28, 16 +; GCN-NEXT: v_alignbit_b32 v53, v22, v62, 16 +; GCN-NEXT: v_alignbit_b32 v60, v20, v60, 16 +; GCN-NEXT: v_alignbit_b32 v59, v18, v59, 16 +; GCN-NEXT: v_alignbit_b32 v47, v16, v47, 16 +; GCN-NEXT: v_alignbit_b32 v43, v14, v43, 16 +; GCN-NEXT: v_alignbit_b32 v40, v11, v40, 16 +; GCN-NEXT: v_alignbit_b32 v55, v9, v55, 16 +; GCN-NEXT: .LBB54_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v30, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v35 +; GCN-NEXT: v_or_b32_e32 v2, v2, v29 +; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v36 +; GCN-NEXT: v_or_b32_e32 v56, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GCN-NEXT: v_or_b32_e32 v44, v1, v2 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; GCN-NEXT: v_or_b32_e32 v61, v1, v2 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GCN-NEXT: v_or_b32_e32 v58, v1, v2 +; GCN-NEXT: v_add_i32_e32 v63, vcc, 16, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; GCN-NEXT: v_or_b32_e32 v57, v1, v2 +; GCN-NEXT: v_add_i32_e32 v46, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GCN-NEXT: v_or_b32_e32 v10, v1, v2 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; GCN-NEXT: v_or_b32_e32 v31, v1, v2 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v7, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 32, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v33, v1, v2 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 36, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v5, v1, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 40, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v35, v1, v2 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v1, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_or_b32_e32 v1, v1, v30 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 52, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_or_b32_e32 v28, v30, v28 +; GCN-NEXT: v_add_i32_e32 v48, vcc, 60, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_or_b32_e32 v24, v24, v25 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v53 +; GCN-NEXT: v_or_b32_e32 v49, v30, v49 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x44, v0 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_or_b32_e32 v22, v22, v23 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x48, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v60 +; GCN-NEXT: v_or_b32_e32 v45, v30, v54 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x4c, v0 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v20, v20, v21 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x50, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v59 +; GCN-NEXT: v_or_b32_e32 v53, v30, v53 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x54, v0 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_or_b32_e32 v18, v18, v19 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x58, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v47 +; GCN-NEXT: v_or_b32_e32 v50, v30, v50 +; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x5c, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v43 +; GCN-NEXT: v_or_b32_e32 v39, v30, v39 +; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x64, v0 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x68, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v40 +; GCN-NEXT: v_or_b32_e32 v30, v30, v37 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v11, v11, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x70, v0 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v37, 0xffff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_or_b32_e32 v37, v37, v55 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x74, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v9, v9, v12 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; GCN-NEXT: buffer_store_dword v56, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v44, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v61, v63, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v58, v46, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v57, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v32, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v33, v4, 
s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v36, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v35, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v26, v48, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v28, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v49, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v45, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v53, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v42, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v50, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v43, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v37, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64f16_to_v64i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v32, 0x200 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v33, v15 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v33, v14 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v33, v13 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v33, v12 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v33, v11 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v33, v10 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v33, v9 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v33, v8 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v33, v7 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v33, v6 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v33, v5 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v33, v4 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v33, v3 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v33, v2 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v33, v1 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v33, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_e32 v33, 0x200, v31 +; VI-NEXT: v_add_f16_sdwa v31, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v31, v33, v31 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v30 +; VI-NEXT: v_add_f16_sdwa v30, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v33, v30 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v29 +; VI-NEXT: v_add_f16_sdwa v29, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v33, v29 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v28 +; VI-NEXT: v_add_f16_sdwa v28, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v28, v33, v28 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v27 +; VI-NEXT: v_add_f16_sdwa v27, v27, v32 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v27, v33, v27 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v26 +; VI-NEXT: v_add_f16_sdwa v26, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v26, v33, v26 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v25 +; VI-NEXT: v_add_f16_sdwa v25, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v25, v33, v25 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v24 +; VI-NEXT: v_add_f16_sdwa v24, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v24, v33, v24 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v23 +; VI-NEXT: v_add_f16_sdwa v23, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v23, v33, v23 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v22 +; VI-NEXT: v_add_f16_sdwa v22, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v22, v33, v22 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v21 +; VI-NEXT: v_add_f16_sdwa v21, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v21, v33, v21 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v20 +; VI-NEXT: v_add_f16_sdwa v20, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v20, v33, v20 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v19 +; VI-NEXT: v_add_f16_sdwa v19, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v19, v33, v19 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v18 +; VI-NEXT: v_add_f16_sdwa v18, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v18, v33, v18 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v17 +; VI-NEXT: v_add_f16_sdwa v17, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v17, v33, v17 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: .LBB54_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64f16_to_v64i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_add_f16 v31, v31, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v30, v30, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v29, v29, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v28, v28, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v27, v27, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v26, v26, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v25, v25, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v24, v24, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v23, v23, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v22, v22, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v21, v21, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v20, v20, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB54_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64f16_to_v64i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] +; 
GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB54_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <64 x half> %a, splat (half 0xH0200) + %a2 = bitcast <64 x half> %a1 to <64 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x half> %a to <64 x i16> + br label %end + +end: + %phi = phi <64 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i16> %phi +} + +define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i16_to_v64f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword 
v38, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: 
$vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 +; 
GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; kill: killed $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: .LBB55_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB55_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v46
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: v_add_i32_e32 v45, vcc, 3, v45
+; GCN-NEXT: v_add_i32_e32 v44, vcc, 3, v44
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 3, v43
+; GCN-NEXT: v_add_i32_e32 v42, vcc, 3, v42
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 3, v41
+; GCN-NEXT: v_add_i32_e32 v40, vcc, 3, v40
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 3, v55
+; GCN-NEXT: v_add_i32_e32 v54, vcc, 3, v54
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 3, v53
+; GCN-NEXT: v_add_i32_e32 v52, vcc, 3, v52
+; GCN-NEXT: v_add_i32_e32 v51, vcc, 3, v51
+; GCN-NEXT: v_add_i32_e32 v50, vcc, 3, v50
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 3, v49
+; GCN-NEXT: v_add_i32_e32 v48, vcc, 3, v48
+; GCN-NEXT: v_add_i32_e32 v39, vcc, 3, v39
+; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v38
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v37
+; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v36
+; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v35
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v34
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v33
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32
+; GCN-NEXT: v_add_i32_e32 v46, vcc, 3, v63
+; GCN-NEXT: s_waitcnt vmcnt(8)
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31
+; GCN-NEXT: s_waitcnt vmcnt(7)
+; GCN-NEXT: v_add_i32_e32 v61, vcc, 3, v61
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_add_i32_e32 v60, vcc, 3, v60
+; GCN-NEXT: s_waitcnt vmcnt(5)
+; GCN-NEXT: v_add_i32_e32 v59, vcc, 3, v59
+; GCN-NEXT: s_waitcnt vmcnt(4)
+; GCN-NEXT: v_add_i32_e32 v58, vcc, 3, v58
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_add_i32_e32 v57, vcc, 3, v57
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_add_i32_e32 v56, vcc, 3, v56
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_add_i32_e32 v47, vcc, 3, v47
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v62, vcc, 3, v62
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v63, vcc, 3, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v63
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v11
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v13
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v15
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v18
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v21
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v22
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v23
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v24
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v27
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v28
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v47
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v56
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v57
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v58
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v59
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v60
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v61
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v31
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v34
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v36
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v37
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v38
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v39
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v48
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v50
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v51
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v52
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v53
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v54
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v55
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v40
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v41
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v42
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v43
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v45
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB55_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v45, v2, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v44, v2, v1
+; GCN-NEXT: v_add_i32_e32 v59, vcc, 8, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v47, v2, v1
+; GCN-NEXT: v_add_i32_e32 v58, vcc, 12, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v46, v2, v1
+; GCN-NEXT: v_add_i32_e32 v57, vcc, 16, v0
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: v_add_i32_e32 v56, vcc, 20, v0
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 24, v0
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_or_b32_e32 v4, v5, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_or_b32_e32 v6, v7, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 32, v0
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v8, v9, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_or_b32_e32 v10, v11, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 40, v0
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_or_b32_e32 v12, v13, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 44, v0
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v14, v15, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 48, v0
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v16, v17, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 52, v0
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_or_b32_e32 v18, v19, v18
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0
+; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_or_b32_e32 v20, v21, v20
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 60, v0
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_or_b32_e32 v22, v23, v22
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 64, v0
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_or_b32_e32 v24, v25, v24
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x44, v0
+; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_or_b32_e32 v26, v27, v26
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x48, v0
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_or_b32_e32 v28, v29, v28
+; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x4c, v0
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_or_b32_e32 v30, v31, v30
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x50, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_or_b32_e32 v32, v33, v32
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 0x54, v0
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v34
+; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v34
+; GCN-NEXT: v_or_b32_e32 v34, v35, v34
+; GCN-NEXT: v_add_i32_e32 v35, vcc, 0x58, v0
+; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36
+; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GCN-NEXT: v_or_b32_e32 v36, v37, v36
+; GCN-NEXT: v_add_i32_e32 v37, vcc, 0x5c, v0
+; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GCN-NEXT: v_or_b32_e32 v38, v39, v38
+; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x60, v0
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v48
+; GCN-NEXT: v_or_b32_e32 v48, v49, v48
+; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x64, v0
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GCN-NEXT: v_or_b32_e32 v50, v51, v50
+; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x68, v0
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v52
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v53, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GCN-NEXT: v_or_b32_e32 v52, v53, v52
+; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x6c, v0
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v54
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v55, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54
+; GCN-NEXT: v_or_b32_e32 v54, v55, v54
+; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x70, v0
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v40, v40
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v41, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v40
+; GCN-NEXT: v_or_b32_e32 v40, v41, v40
+; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x74, v0
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v42, v42
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v43, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_or_b32_e32 v42, v43, v42
+; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
+; GCN-NEXT: buffer_store_dword v45, v59, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v44, v58, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v47, v57, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v46, v56, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v8, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v23, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v31, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v32, v35, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v34, v37, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v36, v39, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v38, v49, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v48, v51, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v50, v53, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v52, v55, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v54, v41, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v40, v43, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v42, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v64i16_to_v64f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB55_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v32, 3
+; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v15, 3, v15
+; VI-NEXT: v_or_b32_e32 v15, v15, v33
+; VI-NEXT: v_add_u16_sdwa v33, v14, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v14, 3, v14
+; VI-NEXT: v_or_b32_e32 v14, v14, v33
+; VI-NEXT: v_add_u16_sdwa v33, v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v13, 3, v13
+; VI-NEXT: v_or_b32_e32 v13, v13, v33
+; VI-NEXT: v_add_u16_sdwa v33, v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v12, 3, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v33
+; VI-NEXT: v_add_u16_sdwa v33, v11, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v11, 3, v11
+; VI-NEXT: v_or_b32_e32 v11, v11, v33
+; VI-NEXT: v_add_u16_sdwa v33, v10, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v10, 3, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v33
+; VI-NEXT: v_add_u16_sdwa v33, v9, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v9, 3, v9
+; VI-NEXT: v_or_b32_e32 v9, v9, v33
+; VI-NEXT: v_add_u16_sdwa v33, v8, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v8, 3, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v33
+; VI-NEXT: v_add_u16_sdwa v33, v7, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v33
+; VI-NEXT: v_add_u16_sdwa v33, v6, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v33
+; VI-NEXT: v_add_u16_sdwa v33, v5, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v33
+; VI-NEXT: v_add_u16_sdwa v33, v4, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v33
+; VI-NEXT: v_add_u16_sdwa v33, v3, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v33
+; VI-NEXT: v_add_u16_sdwa v33, v2, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: v_or_b32_e32 v2, v2, v33
+; VI-NEXT: v_add_u16_sdwa v33, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_or_b32_e32 v1, v1, v33
+; VI-NEXT: v_add_u16_sdwa v33, v0, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v33
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_sdwa v33, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v31, 3, v31
+; VI-NEXT: v_or_b32_e32 v31, v31, v33
+; VI-NEXT: v_add_u16_sdwa v33, v30, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v30, 3, v30
+; VI-NEXT: v_or_b32_e32 v30, v30, v33
+; VI-NEXT: v_add_u16_sdwa v33, v29, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v29, 3, v29
+; VI-NEXT: v_or_b32_e32 v29, v29, v33
+; VI-NEXT: v_add_u16_sdwa v33, v28, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v28, 3, v28
+; VI-NEXT: v_or_b32_e32 v28, v28, v33
+; VI-NEXT: v_add_u16_sdwa v33, v27, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v27, 3, v27
+; VI-NEXT: v_or_b32_e32 v27, v27, v33
+; VI-NEXT: v_add_u16_sdwa v33, v26, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v26, 3, v26
+; VI-NEXT: v_or_b32_e32 v26, v26, v33
+; VI-NEXT: v_add_u16_sdwa v33, v25, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v25, 3, v25
+; VI-NEXT: v_or_b32_e32 v25, v25, v33
+; VI-NEXT: v_add_u16_sdwa v33, v24, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v24, 3, v24
+; VI-NEXT: v_or_b32_e32 v24, v24, v33
+; VI-NEXT: v_add_u16_sdwa v33, v23, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v23, 3, v23
+; VI-NEXT: v_or_b32_e32 v23, v23, v33
+; VI-NEXT: v_add_u16_sdwa v33, v22, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v22, 3, v22
+; VI-NEXT: v_or_b32_e32 v22, v22, v33
+; VI-NEXT: v_add_u16_sdwa v33, v21, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v21, 3, v21
+; VI-NEXT: v_or_b32_e32 v21, v21, v33
+; VI-NEXT: v_add_u16_sdwa v33, v20, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v20, 3, v20
+; VI-NEXT: v_or_b32_e32 v20, v20, v33
+; VI-NEXT: v_add_u16_sdwa v33, v19, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v19, 3, v19
+; VI-NEXT: v_or_b32_e32 v19, v19, v33
+; VI-NEXT: v_add_u16_sdwa v33, v18, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v18, 3, v18
+; VI-NEXT: v_or_b32_e32 v18, v18, v33
+; VI-NEXT: v_add_u16_sdwa v33, v17, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: v_add_u16_sdwa v32, v16, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v16, 3, v16
+; VI-NEXT: v_or_b32_e32 v17, v17, v33
+; VI-NEXT: v_or_b32_e32 v16, v16, v32
+; VI-NEXT: .LBB55_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64i16_to_v64f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB55_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB55_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v64i16_to_v64f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v32
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB55_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: .LBB55_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <64 x i16> %a, splat (i16 3)
+ %a2 = bitcast <64 x i16> %a1 to <64 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <64 x i16> %a to <64 x half>
+ br label %end
+
+end:
+ %phi = phi <64 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x half> %phi
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
new file mode 100644
index 0000000000000..03c6a36ac9861
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -0,0 +1,11387 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+
+define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4i32_to_v4f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB0_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB0_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4i32_to_v4f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4i32_to_v4f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4i32_to_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <4 x i32> %a, splat (i32 3)
+ %a2 = bitcast <4 x i32> %a1 to <4 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x i32> %a to <4 x float>
+ br label %end
+
+end:
+ %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x float> %phi
+}
+
+define <4 x i32> @bitcast_v4f32_to_v4i32(<4 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4f32_to_v4i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: .LBB1_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4f32_to_v4i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4f32_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4f32_to_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <4 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <4 x float> %a1 to <4 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x float> %a to <4 x i32>
+ br label %end
+
+end:
+ %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x i32> %phi
+}
+
+define <2 x i64> @bitcast_v4i32_to_v2i64(<4 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4i32_to_v2i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB2_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB2_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4i32_to_v2i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4i32_to_v2i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4i32_to_v2i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <4 x i32> %a, splat (i32 3)
+ %a2 = bitcast <4 x i32> %a1 to <2 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x i32> %a to <2 x i64>
+ br label %end
+
+end:
+ %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <2 x i64> %phi
+}
+
+define <4 x i32> @bitcast_v2i64_to_v4i32(<2 x i64> %a, i32 %b) {
+; GCN-LABEL: bitcast_v2i64_to_v4i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB3_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: .LBB3_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v2i64_to_v4i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v2i64_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v2i64_to_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <2 x i64> %a, splat (i64 3)
+ %a2 = bitcast <2 x i64> %a1 to <4 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <2 x i64> %a to <4 x i32>
+ br label %end
+
+end:
+ %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x i32> %phi
+}
+
+define <2 x double> @bitcast_v4i32_to_v2f64(<4 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4i32_to_v2f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB4_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4i32_to_v2f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4i32_to_v2f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4i32_to_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <4 x i32> %a, splat (i32 3)
+ %a2 = bitcast <4 x i32> %a1 to <2 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x i32> %a to <2 x double>
+ br label %end
+
+end:
+ %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <2 x double> %phi
+}
+
+define <4 x i32> @bitcast_v2f64_to_v4i32(<2 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v2f64_to_v4i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB5_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: .LBB5_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v2f64_to_v4i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB5_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: .LBB5_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v2f64_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB5_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB5_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v2f64_to_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB5_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB5_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <2 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <2 x double> %a1 to <4 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <2 x double> %a to <4 x i32>
+ br label %end
+
+end:
+ %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x i32> %phi
+}
+
+define <8 x i16> @bitcast_v4i32_to_v8i16(<4 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4i32_to_v8i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v6, v3
+; GCN-NEXT: v_mov_b32_e32 v8, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB6_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB6_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB6_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB6_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v4, v8
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4i32_to_v8i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4i32_to_v8i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4i32_to_v8i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <4 x i32> %a, splat (i32 3)
+ %a2 = bitcast <4 x i32> %a1 to <8 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x i32> %a to <8 x i16>
+ br label %end
+
+end:
+ %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i16> %phi
+}
+
+define <4 x i32> @bitcast_v8i16_to_v4i32(<8 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8i16_to_v4i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v10, v2
+; GCN-NEXT: v_mov_b32_e32 v9, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB7_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB7_4
+; GCN-NEXT: .LBB7_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB7_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6
+; GCN-NEXT: v_or_b32_e32 v0, v0, v8
+; GCN-NEXT: v_or_b32_e32 v1, v1, v11
+; GCN-NEXT: v_or_b32_e32 v2, v2, v5
+; GCN-NEXT: v_or_b32_e32 v3, v3, v7
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB7_2
+; GCN-NEXT: .LBB7_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_or_b32_e32 v0, v8, v0
+; GCN-NEXT: v_or_b32_e32 v1, v11, v1
+; GCN-NEXT: v_or_b32_e32 v2, v5, v2
+; GCN-NEXT: v_or_b32_e32 v3, v7, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8i16_to_v4i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB7_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v5, 3
+; VI-NEXT: v_add_u16_e32 v4, 3, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v4, v3
+; VI-NEXT: v_add_u16_e32 v4, 3, v2
+; VI-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v4, v2
+; VI-NEXT: v_add_u16_e32 v4, 3, v1
+; VI-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v4, v1
+; VI-NEXT: v_add_u16_e32 v4, 3, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v4, v0
+; VI-NEXT: .LBB7_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8i16_to_v4i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8i16_to_v4i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <8 x i16> %a, splat (i16 3)
+ %a2 = bitcast <8 x i16> %a1 to <4 x i32>
+ br label %end
+
+cmp.false:
+ 
%a3 = bitcast <8 x i16> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + +define <8 x half> @bitcast_v4i32_to_v8f16(<4 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i32_to_v8f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v11, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB8_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB8_4 +; GCN-NEXT: .LBB8_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB8_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: .LBB8_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i32_to_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i32_to_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; 
GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i32_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + +define <4 x i32> @bitcast_v8f16_to_v4i32(<8 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f16_to_v4i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB9_4 +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB9_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v10, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: .LBB9_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: 
v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v6 +; GCN-NEXT: v_or_b32_e32 v3, v4, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f16_to_v4i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 0x200 +; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_add_f16_sdwa v5, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> 
%a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + +define <8 x bfloat> @bitcast_v4i32_to_v8bf16(<4 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i32_to_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v11, v3 +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB10_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: .LBB10_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v11 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i32_to_v8bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i32_to_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i32_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + +define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v8bf16_to_v4i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_4 +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB11_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: .LBB11_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 
0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8bf16_to_v4i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; 
VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v2, v4, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: bitcast_v8bf16_to_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX11-NEXT: v_add3_u32 v7, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10 +; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v8, v9, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-NEXT: v_add3_u32 v9, v10, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_perm_b32 v1, v6, v1, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + +define <16 x i8> @bitcast_v4i32_to_v16i8(<4 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i32_to_v16i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v16, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GCN-NEXT: .LBB12_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GCN-NEXT: .LBB12_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i32_to_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v19, v1 +; VI-NEXT: v_mov_b32_e32 v18, v0 +; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i32_to_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, 
v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i32_to_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB12_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB12_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i32> %a, splat (i32 3) + %a2 = bitcast <4 x i32> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i32> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i8_to_v4i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_4 +; GCN-NEXT: .LBB13_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB13_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v0, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v8, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr9 +; 
GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_2 +; GCN-NEXT: .LBB13_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v19, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v21, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v11, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i8_to_v4i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_e32 v1, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v2, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v12 +; VI-NEXT: v_add_u16_e32 v6, 3, v14 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i8_to_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i8_to_v4i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v15 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v8, v11 +; GFX11-NEXT: v_or_b32_e32 v7, v10, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, 
v5 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: .LBB13_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v17, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v20, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v13, v10 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <4 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <4 x i32> + br label %end + +end: + %phi = phi <4 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i32> %phi +} + +define <2 x i64> @bitcast_v4f32_to_v2i64(<4 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f32_to_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; 
GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + +define <4 x float> @bitcast_v2i64_to_v4f32(<2 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i64_to_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i64_to_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i64_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i64_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + +define <2 x double> @bitcast_v4f32_to_v2f64(<4 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f32_to_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: 
v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + +define <4 x float> @bitcast_v2f64_to_v4f32(<2 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f64_to_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, 
s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + +define <8 x i16> @bitcast_v4f32_to_v8i16(<4 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f32_to_v8i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v6, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB18_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB18_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, v8 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v8i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define <4 x float> @bitcast_v8i16_to_v4f32(<8 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i16_to_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_4 +; GCN-NEXT: .LBB19_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB19_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN-NEXT: v_or_b32_e32 v1, v1, v11 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: v_or_b32_e32 v3, v3, v7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB19_2 +; GCN-NEXT: .LBB19_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_or_b32_e32 v1, v11, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i16_to_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v5, 3 +; VI-NEXT: v_add_u16_e32 v4, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_u16_e32 v4, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_add_u16_e32 v4, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_add_u16_e32 v4, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i16_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + +define <8 x half> @bitcast_v4f32_to_v8f16(<4 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f32_to_v8f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v11, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_4 +; GCN-NEXT: .LBB20_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB20_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_2 +; GCN-NEXT: .LBB20_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + +define <4 x float> @bitcast_v8f16_to_v4f32(<8 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f16_to_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_4 +; GCN-NEXT: .LBB21_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB21_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v10, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_2 +; GCN-NEXT: .LBB21_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v6 +; GCN-NEXT: v_or_b32_e32 v3, v4, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f16_to_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 0x200 +; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_add_f16_sdwa v5, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, 
v2, v5 +; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + +define <8 x bfloat> @bitcast_v4f32_to_v8bf16(<4 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f32_to_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v11, v3 +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_4 +; GCN-NEXT: .LBB22_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB22_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; 
GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB22_2 +; GCN-NEXT: .LBB22_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v11 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v8bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + +define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v8bf16_to_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 +; GCN-NEXT: 
v_mul_f32_e32 v8, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_4 +; GCN-NEXT: .LBB23_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB23_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB23_2 +; GCN-NEXT: .LBB23_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8bf16_to_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 
16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 
0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v2, v4, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 +; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, 
v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX11-NEXT: v_add3_u32 v7, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10 +; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v8, v9, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-NEXT: v_add3_u32 v9, v10, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_perm_b32 v1, v6, v1, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 +; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + +define <16 x i8> @bitcast_v4f32_to_v16i8(<4 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f32_to_v16i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v16, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; 
implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GCN-NEXT: .LBB24_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GCN-NEXT: .LBB24_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f32_to_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v19, v1 +; VI-NEXT: v_mov_b32_e32 v18, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; VI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; VI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; VI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: 
v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f32_to_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v19, 1.0, v19 +; GFX9-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GFX9-NEXT: v_add_f32_e32 v17, 1.0, v17 +; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f32_to_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; 
implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB24_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v19, 1.0, v19 :: v_dual_add_f32 v16, 1.0, v16 +; GFX11-NEXT: v_dual_add_f32 v17, 1.0, v17 :: v_dual_add_f32 v18, 1.0, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB24_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <4 x float> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x float> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i8_to_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] 
+; GCN-NEXT: s_cbranch_execnz .LBB25_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB25_4 +; GCN-NEXT: .LBB25_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB25_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v0, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v8, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_2 +; GCN-NEXT: .LBB25_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v19, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v21, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v11, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i8_to_v4f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_e32 v1, 3, v18 +; VI-NEXT: 
v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v2, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v12 +; VI-NEXT: v_add_u16_e32 v6, 3, v14 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i8_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i8_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: 
v_dual_mov_b32 v17, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v15 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v8, v11 +; GFX11-NEXT: v_or_b32_e32 v7, v10, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v17, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: 
v_or_b32_e32 v1, v19, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v20, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v13, v10 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <4 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <4 x float> + br label %end + +end: + %phi = phi <4 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x float> %phi +} + +define <2 x double> @bitcast_v2i64_to_v2f64(<2 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i64_to_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: .LBB26_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i64_to_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i64_to_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v2i64_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + +define <2 x i64> @bitcast_v2f64_to_v2i64(<2 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f64_to_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB27_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: .LBB27_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd 
<2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + +define <8 x i16> @bitcast_v2i64_to_v8i16(<2 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i64_to_v8i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v6, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB28_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_alignbit_b32 v5, v6, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB28_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, v8 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i64_to_v8i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i64_to_v8i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i64_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; 
GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define <2 x i64> @bitcast_v8i16_to_v2i64(<8 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i16_to_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_4 +; GCN-NEXT: .LBB29_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB29_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN-NEXT: v_or_b32_e32 v1, v1, v11 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: v_or_b32_e32 v3, v3, v7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_2 +; GCN-NEXT: .LBB29_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_or_b32_e32 v1, v11, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i16_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v5, 3 +; VI-NEXT: v_add_u16_e32 v4, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_u16_e32 v4, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_add_u16_e32 v4, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_add_u16_e32 v4, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i16_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + +define <8 x half> @bitcast_v2i64_to_v8f16(<2 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i64_to_v8f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v3 +; GCN-NEXT: v_mov_b32_e32 v9, v2 +; GCN-NEXT: v_mov_b32_e32 v11, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB30_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB30_4 +; GCN-NEXT: .LBB30_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB30_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 
v2, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v8 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB30_2 +; GCN-NEXT: .LBB30_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i64_to_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i64_to_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i64_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} 
+ +define <2 x i64> @bitcast_v8f16_to_v2i64(<8 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f16_to_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB31_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB31_4 +; GCN-NEXT: .LBB31_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB31_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v10, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB31_2 +; GCN-NEXT: .LBB31_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v6 +; GCN-NEXT: v_or_b32_e32 v3, v4, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f16_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 0x200 +; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_add_f16_sdwa v5, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + +define <8 x bfloat> @bitcast_v2i64_to_v8bf16(<2 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i64_to_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v11, v3 +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_4 +; GCN-NEXT: .LBB32_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB32_3: ; %cmp.false +; GCN-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB32_2 +; GCN-NEXT: .LBB32_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i64_to_v8bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i64_to_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i64_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, 
%cmp.false ] + ret <8 x bfloat> %phi +} + +define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v8bf16_to_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB33_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB33_4 +; GCN-NEXT: .LBB33_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB33_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB33_2 +; GCN-NEXT: .LBB33_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8bf16_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: 
v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 
0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v2, v4, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 +; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, 
v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX11-NEXT: v_add3_u32 v7, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10 +; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v8, v9, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-NEXT: v_add3_u32 v9, v10, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_perm_b32 v1, v6, v1, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 +; GFX11-NEXT: .LBB33_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + +define <16 x i8> @bitcast_v2i64_to_v16i8(<2 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i64_to_v16i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v16, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GCN-NEXT: .LBB34_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v16, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v16, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v16, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GCN-NEXT: .LBB34_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, v16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i64_to_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v19, v1 +; VI-NEXT: v_mov_b32_e32 v18, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; 
VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i64_to_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB34_4: ; 
%end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i64_to_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB34_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB34_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i64> %a, splat (i64 3) + %a2 = bitcast <2 x i64> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i64> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i8_to_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_4 +; GCN-NEXT: .LBB35_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB35_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v0, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v8, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_2 +; GCN-NEXT: .LBB35_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v19, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GCN-NEXT: v_or_b32_e32 v2, v20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v21, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v11, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i8_to_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: 
$vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_e32 v1, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v2, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v12 +; VI-NEXT: v_add_u16_e32 v6, 3, v14 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i8_to_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v7, 
v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i8_to_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v15 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v8, v11 +; GFX11-NEXT: v_or_b32_e32 v7, v10, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: .LBB35_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v17, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v18, 3 +; GFX11-NEXT: v_add_nc_u16 
v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v20, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v13, v10 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <2 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <2 x i64> + br label %end + +end: + %phi = phi <2 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i64> %phi +} + +define <8 x i16> @bitcast_v2f64_to_v8i16(<2 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f64_to_v8i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v11, v1 +; GCN-NEXT: v_mov_b32_e32 v10, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: .LBB36_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: .LBB36_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v10 +; GCN-NEXT: 
v_mov_b32_e32 v2, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v8 +; GCN-NEXT: v_mov_b32_e32 v6, v9 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define <2 x double> @bitcast_v8i16_to_v2f64(<8 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i16_to_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_4 +; GCN-NEXT: .LBB37_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB37_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN-NEXT: v_or_b32_e32 v1, v1, v11 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: v_or_b32_e32 v3, v3, v7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; 
implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB37_2 +; GCN-NEXT: .LBB37_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_or_b32_e32 v1, v11, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x30000, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i16_to_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v5, 3 +; VI-NEXT: v_add_u16_e32 v4, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_add_u16_e32 v4, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_add_u16_e32 v4, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_add_u16_e32 v4, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i16_to_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + +define <8 x half> @bitcast_v2f64_to_v8f16(<2 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f64_to_v8f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: .LBB38_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: .LBB38_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v9 +; GCN-NEXT: v_mov_b32_e32 v1, v11 +; GCN-NEXT: v_mov_b32_e32 v2, v8 +; GCN-NEXT: v_mov_b32_e32 v3, v10 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: 
v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + +define <2 x double> @bitcast_v8f16_to_v2f64(<8 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f16_to_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_4 +; GCN-NEXT: .LBB39_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB39_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v10, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_2 +; GCN-NEXT: .LBB39_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v6 +; GCN-NEXT: v_or_b32_e32 v3, v4, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f16_to_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 0x200 +; VI-NEXT: v_add_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 +; VI-NEXT: v_add_f16_sdwa v5, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v5 +; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <2 x double> + br label %end + +end: + %phi = 
phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + +define <8 x bfloat> @bitcast_v2f64_to_v8bf16(<2 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f64_to_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: .LBB40_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; GCN-NEXT: .LBB40_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v11 +; GCN-NEXT: v_mov_b32_e32 v1, v10 +; GCN-NEXT: v_mov_b32_e32 v2, v9 +; GCN-NEXT: v_mov_b32_e32 v3, v8 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v8bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: 
v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + +define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v8bf16_to_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_4 +; GCN-NEXT: .LBB41_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB41_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v0, v11, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB41_2 +; GCN-NEXT: .LBB41_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8bf16_to_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v3, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v2, v4, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s7 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s7 +; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX11-NEXT: v_add3_u32 v7, v13, v2, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v8 :: v_dual_add_f32 v7, 0x40c00000, v10 +; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v8, v9, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v6 +; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v9, vcc_lo +; GFX11-NEXT: v_add3_u32 v9, v10, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v9, v10 +; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_perm_b32 v1, v6, v1, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v7, v0, 0x7060302 +; GFX11-NEXT: .LBB41_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + 
br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + +define <16 x i8> @bitcast_v2f64_to_v16i8(<2 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f64_to_v16i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, v3 +; GCN-NEXT: v_mov_b32_e32 v16, v2 +; GCN-NEXT: v_mov_b32_e32 v19, v1 +; GCN-NEXT: v_mov_b32_e32 v18, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB42_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v17, v16, 24 +; GCN-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; GCN-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; GCN-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GCN-NEXT: .LBB42_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB42_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GCN-NEXT: v_alignbit_b32 v11, v17, v16, 24 +; GCN-NEXT: v_alignbit_b32 v10, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v9, v17, v16, 8 +; GCN-NEXT: v_alignbit_b32 v3, v19, v18, 24 +; GCN-NEXT: v_alignbit_b32 v2, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v1, v19, v18, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GCN-NEXT: .LBB42_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v18 +; GCN-NEXT: v_mov_b32_e32 v4, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: v_mov_b32_e32 v12, v17 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f64_to_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v19, v1 +; VI-NEXT: v_mov_b32_e32 v18, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; 
implicit-def: $vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; VI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f64_to_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX9-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f64_to_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB42_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GFX11-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB42_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <2 x double> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x double> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ 
%a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i8_to_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v15 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB43_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB43_4 +; GCN-NEXT: .LBB43_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB43_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v0, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v8, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB43_2 +; GCN-NEXT: .LBB43_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 
v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v19, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v20, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v21, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v8, v11, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v7, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v8 +; GCN-NEXT: v_or_b32_e32 v7, v9, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i8_to_v2f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_e32 v1, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v2, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v12 +; VI-NEXT: v_add_u16_e32 v6, 3, v14 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i8_to_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, 
s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i8_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v15 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v8, v11 +; GFX11-NEXT: v_or_b32_e32 v7, v10, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; 
GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB43_2 +; GFX11-NEXT: .LBB43_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v17, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v20, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v13, v10 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <2 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <2 x double> + br label %end + +end: + %phi = phi <2 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x double> %phi +} + +define <8 x half> @bitcast_v8i16_to_v8f16(<8 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i16_to_v8f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, v7 +; GCN-NEXT: v_mov_b32_e32 v9, v6 +; GCN-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NEXT: v_mov_b32_e32 v11, v4 +; GCN-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v13, v2 +; GCN-NEXT: v_mov_b32_e32 v14, v1 +; GCN-NEXT: v_mov_b32_e32 v15, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB44_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB44_4 +; GCN-NEXT: .LBB44_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 
s[30:31] +; GCN-NEXT: .LBB44_3: ; %cmp.false +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB44_2 +; GCN-NEXT: .LBB44_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i16_to_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 3 +; VI-NEXT: v_add_u16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v6, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v7, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v4, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i16_to_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: 
s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + +define <8 x i16> @bitcast_v8f16_to_v8i16(<8 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f16_to_v8i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB45_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v8 +; GCN-NEXT: v_or_b32_e32 v2, v2, v9 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: .LBB45_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f16_to_v8i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v5, 0x200 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: v_or_b32_e32 v2, v7, v2 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v8i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define <8 x bfloat> @bitcast_v8i16_to_v8bf16(<8 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i16_to_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v11, v6 +; GCN-NEXT: v_mov_b32_e32 v12, v4 +; GCN-NEXT: v_mov_b32_e32 v9, v2 +; GCN-NEXT: v_mov_b32_e32 v10, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB46_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB46_4 +; GCN-NEXT: .LBB46_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB46_3: ; 
%cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB46_2 +; GCN-NEXT: .LBB46_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v7, v0 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i16_to_v8bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 3 +; VI-NEXT: v_add_u16_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v6, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v7, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v4, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_or_b32_e32 v2, v2, v7 +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i16_to_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + +define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v8bf16_to_v8i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB47_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB47_4 +; GCN-NEXT: .LBB47_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB47_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB47_2 +; GCN-NEXT: .LBB47_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v4, 
0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_alignbit_b32 v0, v9, v0, 16 +; GCN-NEXT: v_alignbit_b32 v4, v10, v2, 16 +; GCN-NEXT: v_alignbit_b32 v6, v7, v8, 16 +; GCN-NEXT: v_alignbit_b32 v2, v3, v5, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v11, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8bf16_to_v8i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v6 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, s6, v7 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_bfe_u32 v7, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, s6, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v8 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: 
v_bfe_u32 v8, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v7, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v8i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v6, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add3_u32 v8, v8, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; GFX9-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v8, v8, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; 
GFX9-NEXT: v_perm_b32 v3, v7, v3, s6 +; GFX9-NEXT: v_perm_b32 v2, v6, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v5, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s6 +; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v13, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_add3_u32 v7, v13, v1, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v8 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v8, v11, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v12, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX11-NEXT: v_bfe_u32 v14, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v12, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v13, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v13, v14, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_perm_b32 v3, v7, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 +; GFX11-NEXT: .LBB47_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define <16 x i8> @bitcast_v8i16_to_v16i8(<8 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i16_to_v16i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v20, v7 +; GCN-NEXT: v_mov_b32_e32 v16, v6 +; GCN-NEXT: v_mov_b32_e32 v17, v4 +; GCN-NEXT: v_mov_b32_e32 v21, v3 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v19, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v20 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB48_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB48_4 +; GCN-NEXT: .LBB48_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB48_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; GCN-NEXT: v_bfe_u32 v7, v21, 8, 8 +; GCN-NEXT: v_or_b32_e32 v0, v0, v24 +; GCN-NEXT: v_or_b32_e32 v4, v1, v25 +; GCN-NEXT: v_or_b32_e32 v8, v2, v22 +; GCN-NEXT: v_or_b32_e32 v12, v3, v23 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_bfe_u32 v15, v20, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; 
implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB48_2 +; GCN-NEXT: .LBB48_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v18 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v22, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_or_b32_e32 v2, v24, v2 +; GCN-NEXT: v_or_b32_e32 v3, v25, v3 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v3 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i16_to_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v19, v3 +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_mov_b32_e32 v16, v0 +; VI-NEXT: v_mov_b32_e32 v17, v1 +; VI-NEXT: v_mov_b32_e32 v8, v18 +; VI-NEXT: v_mov_b32_e32 v21, v19 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v17, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_add_u16_e32 v21, 3, v19 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_add_u16_e32 v8, 3, v18 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_or_b32_e32 v1, v17, v1 +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: v_or_b32_e32 v19, v21, v4 +; VI-NEXT: v_or_b32_e32 v18, v8, v3 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[18:19] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v0 +; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v16 +; VI-NEXT: v_mov_b32_e32 v1, v20 +; VI-NEXT: v_mov_b32_e32 v4, v17 +; VI-NEXT: v_mov_b32_e32 v12, v21 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i16_to_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB48_4: ; 
%end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i16_to_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB48_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB48_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i16> %a, splat (i16 3) + %a2 = bitcast <8 x i16> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i16> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i8_to_v8i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v9 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v8 +; GCN-NEXT: v_or_b32_e32 v1, v1, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v4, v4, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v22 +; GCN-NEXT: v_or_b32_e32 v7, v7, v23 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v3, v16, v3 +; GCN-NEXT: v_or_b32_e32 v2, v17, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v8, v18, v5 +; GCN-NEXT: v_or_b32_e32 v5, v19, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v11, v1, v3 +; GCN-NEXT: v_or_b32_e32 v15, v4, v8 +; GCN-NEXT: v_or_b32_e32 v9, v0, v2 +; GCN-NEXT: v_or_b32_e32 v13, v6, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v1, v11, v2, 16 +; GCN-NEXT: v_alignbit_b32 v5, v15, v5, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: .LBB49_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: 
v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v5, v21, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v0, v22, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v4, v20, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; GCN-NEXT: v_or_b32_e32 v3, v19, v3 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_or_b32_e32 v7, v18, v7 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_or_b32_e32 v2, v17, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v6, v16, v6 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_or_b32_e32 v3, v7, v5 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: v_or_b32_e32 v2, v6, v4 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v2 +; GCN-NEXT: v_alignbit_b32 v1, v11, v9, 16 +; GCN-NEXT: v_alignbit_b32 v5, v15, v13, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-NEXT: .LBB49_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i8_to_v8i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_e32 v1, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v2, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v12 +; VI-NEXT: v_add_u16_e32 v6, 3, v14 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i8_to_v8i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i8_to_v8i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v15 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v8, v11 +; GFX11-NEXT: v_or_b32_e32 v7, v10, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: 
$vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB49_2 +; GFX11-NEXT: .LBB49_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v17, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v20, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v13, v10 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x i16> + br label %end + +end: + %phi = phi <8 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i16> %phi +} + +define <8 x bfloat> @bitcast_v8f16_to_v8bf16(<8 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f16_to_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 
s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB50_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB50_4 +; GCN-NEXT: .LBB50_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB50_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB50_2 +; GCN-NEXT: .LBB50_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v11 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f16_to_v8bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v5, 0x200 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: v_or_b32_e32 v2, v7, v2 +; VI-NEXT: v_or_b32_e32 v1, v6, v1 
+; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} + +define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v8bf16_to_v8f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB51_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB51_4 +; GCN-NEXT: .LBB51_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB51_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB51_2 +; GCN-NEXT: .LBB51_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8bf16_to_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v0 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v6 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: 
v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_bfe_u32 v7, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v6 +; VI-NEXT: v_add_u32_e32 v7, vcc, s6, v7 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; VI-NEXT: v_bfe_u32 v7, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v7, vcc, v7, v2 +; VI-NEXT: v_add_u32_e32 v7, vcc, s6, v7 +; VI-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_bfe_u32 v8, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v7 +; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v8 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; VI-NEXT: v_bfe_u32 v8, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8 +; VI-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v7, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v6, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v5, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v6, v1, s6 
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v7, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add3_u32 v8, v8, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc +; GFX9-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v8, v8, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v3, v7, v3, s6 +; GFX9-NEXT: v_perm_b32 v2, v6, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v5, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s6 +; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v13, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: 
v_add3_u32 v7, v13, v1, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_bfe_u32 v11, v6, 16, 1 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v8 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v8, v11, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v12, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX11-NEXT: v_bfe_u32 v14, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v12, v2, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v13, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v13, v14, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3 +; GFX11-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_perm_b32 v3, v7, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v15, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x7060302 +; GFX11-NEXT: .LBB51_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + +define <16 x i8> @bitcast_v8f16_to_v16i8(<8 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f16_to_v16i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v6 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v9 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: 
s_cbranch_execnz .LBB52_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB52_4 +; GCN-NEXT: .LBB52_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB52_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: v_or_b32_e32 v0, v17, v0 +; GCN-NEXT: v_or_b32_e32 v4, v16, v1 +; GCN-NEXT: v_or_b32_e32 v8, v19, v2 +; GCN-NEXT: v_or_b32_e32 v12, v18, v3 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB52_2 +; GCN-NEXT: .LBB52_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: v_or_b32_e32 v8, v1, v0 +; GCN-NEXT: v_or_b32_e32 v12, v2, v9 +; GCN-NEXT: v_or_b32_e32 v0, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v5, v10 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f16_to_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v19, v1 +; VI-NEXT: v_mov_b32_e32 v18, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_sdwa v6, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 +; VI-NEXT: v_add_f16_sdwa v2, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v14, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v10, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v19, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 +; VI-NEXT: v_or_b32_e32 v0, v18, v0 +; VI-NEXT: v_or_b32_e32 v8, v17, v4 +; VI-NEXT: v_or_b32_e32 v7, v16, v3 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f16_to_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 
8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v19, v19, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f16_to_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB52_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB52_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x half> %a, splat (half 0xH0200) + %a2 = bitcast <8 x half> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x half> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i8_to_v8f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 8, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v0, v16 +; GCN-NEXT: v_or_b32_e32 v1, v1, v18 +; GCN-NEXT: v_or_b32_e32 v2, v2, v19 +; GCN-NEXT: v_or_b32_e32 v3, v3, v20 +; GCN-NEXT: v_or_b32_e32 v4, v4, v21 +; GCN-NEXT: v_or_b32_e32 v5, v5, v22 +; GCN-NEXT: v_or_b32_e32 v6, v6, v23 +; GCN-NEXT: v_or_b32_e32 v7, v7, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; 
implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: .LBB53_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v0, v15, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_or_b32_e32 v3, v22, v3 +; GCN-NEXT: v_or_b32_e32 v5, v21, v5 +; GCN-NEXT: v_or_b32_e32 v6, v20, v6 +; GCN-NEXT: v_or_b32_e32 v4, v19, v4 +; GCN-NEXT: v_or_b32_e32 v2, v18, v2 +; GCN-NEXT: v_or_b32_e32 v7, v16, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x300, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v8 +; GCN-NEXT: .LBB53_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v9 +; GCN-NEXT: v_mov_b32_e32 v6, v13 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i8_to_v8f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_e32 v1, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v2, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v12 +; VI-NEXT: v_add_u16_e32 v6, 3, v14 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_or_b32_e32 v3, v4, v3 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i8_to_v8f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i8_to_v8f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v15 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v8, v11 +; GFX11-NEXT: v_or_b32_e32 v7, v10, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: .LBB53_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v17, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v20, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v13, v10 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x half> + br label %end + +end: + %phi = phi <8 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x half> %phi +} + +define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v8bf16_to_v16i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v6 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB54_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB54_4 +; GCN-NEXT: .LBB54_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB54_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v20 +; GCN-NEXT: v_alignbit_b32 v0, v0, v18, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v16, 16 +; GCN-NEXT: v_alignbit_b32 v8, v1, v21, 16 +; GCN-NEXT: v_alignbit_b32 v12, v14, v19, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB54_2 +; GCN-NEXT: .LBB54_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; GCN-NEXT: v_alignbit_b32 v8, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 
v12, v14, v2, 16 +; GCN-NEXT: v_alignbit_b32 v0, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8bf16_to_v16i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v17, v3 +; VI-NEXT: v_mov_b32_e32 v16, v2 +; VI-NEXT: v_mov_b32_e32 v19, v1 +; VI-NEXT: v_mov_b32_e32 v18, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v19 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v19, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 
v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v18, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v17 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v17, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v18 +; VI-NEXT: v_mov_b32_e32 v4, v19 +; VI-NEXT: v_mov_b32_e32 v8, v16 +; VI-NEXT: v_mov_b32_e32 v12, v17 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8bf16_to_v16i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v17, v3 +; GFX9-NEXT: v_mov_b32_e32 v16, v2 +; GFX9-NEXT: v_mov_b32_e32 v19, v1 +; GFX9-NEXT: v_mov_b32_e32 v18, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v17 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v7, v8, vcc +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v10, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v3, s7 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s7 +; GFX9-NEXT: v_perm_b32 v8, v5, v9, s7 +; GFX9-NEXT: v_perm_b32 v7, v6, v10, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, 
v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v18 +; GFX9-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-NEXT: v_mov_b32_e32 v12, v17 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8bf16_to_v16i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v17, v3 :: v_dual_mov_b32 v16, v2 +; GFX11-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v18, v0 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[16:17] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[18:19] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v18 +; GFX11-NEXT: .LBB54_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v7, v8 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-NEXT: 
v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v1, v11, v3, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v9, v9, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v4, vcc_lo +; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v1, 16, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v3 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v1 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v8 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v9, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v0, v1, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v10, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v0, v0, v1, 0x7fff +; GFX11-NEXT: v_perm_b32 v8, v6, v4, 0x7060302 +; GFX11-NEXT: v_add3_u32 v10, v10, v7, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v11, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_perm_b32 v1, v5, v12, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v0, v13, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_perm_b32 v7, v7, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: .LBB54_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v18 +; GFX11-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-NEXT: v_mov_b32_e32 v8, v16 +; GFX11-NEXT: v_mov_b32_e32 v12, v17 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x bfloat> %a, splat 
(bfloat 0xR40C0) + %a2 = bitcast <8 x bfloat> %a1 to <16 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x bfloat> %a to <16 x i8> + br label %end + +end: + %phi = phi <16 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i8> %phi +} + +define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i8_to_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v15 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v3, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v11, v1, v0 +; GCN-NEXT: v_or_b32_e32 v15, v18, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v19, v4 +; GCN-NEXT: v_or_b32_e32 v13, v6, v5 +; GCN-NEXT: v_or_b32_e32 v5, v20, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GCN-NEXT: v_or_b32_e32 v7, v21, v9 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: .LBB55_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v12 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v14 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; 
GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v23, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GCN-NEXT: v_or_b32_e32 v4, v22, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 +; GCN-NEXT: v_or_b32_e32 v3, v21, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v7 +; GCN-NEXT: v_or_b32_e32 v7, v20, v8 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v6, v19, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_or_b32_e32 v1, v18, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v5 +; GCN-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GCN-NEXT: .LBB55_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v11 +; GCN-NEXT: v_mov_b32_e32 v1, v15 +; GCN-NEXT: v_mov_b32_e32 v2, v17 +; GCN-NEXT: v_mov_b32_e32 v4, v13 +; GCN-NEXT: v_mov_b32_e32 v6, v16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i8_to_v8bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v18, v2 +; VI-NEXT: v_mov_b32_e32 v17, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB55_2 +; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v17 +; VI-NEXT: v_add_u16_e32 v1, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v2, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v12 +; VI-NEXT: v_add_u16_e32 v6, 3, v14 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i8_to_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v18, v2 +; GFX9-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v15 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB55_2 +; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v17 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i8_to_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v17, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v16 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v15 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v20 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v9 +; GFX11-NEXT: v_or_b32_e32 v6, v8, v11 +; GFX11-NEXT: v_or_b32_e32 v7, v10, v13 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; 
implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB55_2 +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v17, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v19, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v20, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v11, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v13, v10 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i8> %a, splat (i8 3) + %a2 = bitcast <16 x i8> %a1 to <8 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i8> %a to <8 x bfloat> + br label %end + +end: + %phi = phi <8 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll new file mode 100644 index 0000000000000..e7262375fbeb0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | 
FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v5i32_to_v5f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i32_to_v5f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i32_to_v5f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5i32_to_v5f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i32> %a, splat (i32 3) + %a2 = bitcast <5 x i32> %a1 to <5 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x i32> %a to <5 x float> + br label %end + +end: + %phi = phi <5 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x float> %phi +} + +define <5 x i32> @bitcast_v5f32_to_v5i32(<5 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v5f32_to_v5i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f32_to_v5i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f32_to_v5i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f32_to_v5i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v5 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <5 x float> %a1 to <5 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x float> %a to <5 x i32> + br label %end + +end: + %phi = phi <5 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i32> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll new file mode 100644 index 0000000000000..1185a12a474ea --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -0,0 +1,556 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define half @bitcast_i16_to_f16(i16 %a, i32 %b) { +; GCN-LABEL: bitcast_i16_to_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v2, 
0xffff, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB0_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB0_4 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB0_3: ; %cmp.false +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: .LBB0_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i16_to_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i16_to_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i16_to_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v0, 3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i16 %a, 3 + %a2 = bitcast i16 %a1 to half + br label %end + +cmp.false: + %a3 = bitcast i16 %a to half + br label %end + +end: + %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret half %phi +} + +define i16 @bitcast_f16_to_i16(half %a, i32 %b) { +; GCN-LABEL: bitcast_f16_to_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB1_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB1_4 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB1_3: ; %cmp.false +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: .LBB1_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: s_or_b64 exec, exec, 
s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f16_to_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f16_to_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f16_e32 v0, 0x200, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f16_to_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f16_e32 v0, 0x200, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd half %a, 0xH0200 + %a2 = bitcast half %a1 to i16 + br label %end + +cmp.false: + %a3 = bitcast half %a to i16 + br label %end + +end: + %phi = phi i16 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i16 %phi +} + +define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) { +; GCN-LABEL: bitcast_i16_to_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i16_to_bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i16_to_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i16_to_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v0, 3 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i16 %a, 3 + %a2 = bitcast i16 %a1 to bfloat + br label %end + +cmp.false: + %a3 = bitcast i16 %a to bfloat + br label %end + +end: + %phi = phi bfloat [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret bfloat %phi +} + +define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { +; GCN-LABEL: bitcast_bf16_to_i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_4 +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB3_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: .LBB3_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_bf16_to_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_bf16_to_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_bf16_to_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 
s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd bfloat %a, 0xR40C0 + %a2 = bitcast bfloat %a1 to i16 + br label %end + +cmp.false: + %a3 = bitcast bfloat %a to i16 + br label %end + +end: + %phi = phi i16 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i16 %phi +} + +define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) { +; GCN-LABEL: bitcast_f16_to_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB4_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB4_4 +; GCN-NEXT: .LBB4_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB4_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: .LBB4_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f16_to_bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f16_to_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f16_e32 v0, 0x200, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f16_to_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; 
GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f16_e32 v0, 0x200, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd half %a, 0xH0200 + %a2 = bitcast half %a1 to bfloat + br label %end + +cmp.false: + %a3 = bitcast half %a to bfloat + br label %end + +end: + %phi = phi bfloat [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret bfloat %phi +} + +define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { +; GCN-LABEL: bitcast_bf16_to_f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB5_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB5_4 +; GCN-NEXT: .LBB5_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB5_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: .LBB5_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_bf16_to_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_bf16_to_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_bf16_to_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd bfloat %a, 0xR40C0 + %a2 = bitcast bfloat %a1 to half + br label %end + +cmp.false: + %a3 = bitcast bfloat %a to half + br label %end + +end: + %phi = phi half [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret half %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll new file mode 100644 index 0000000000000..61f9232ea50a1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -0,0 +1,1068 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i32_to_v6f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i32_to_v6f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i32_to_v6f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i32_to_v6f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + +define <6 x i32> @bitcast_v6f32_to_v6i32(<6 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f32_to_v6i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v6i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v6i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: bitcast_v6f32_to_v6i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + +define <3 x i64> @bitcast_v6i32_to_v3i64(<6 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i32_to_v3i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i32_to_v3i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i32_to_v3i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i32_to_v3i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: 
v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + +define <6 x i32> @bitcast_v3i64_to_v6i32(<3 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i64_to_v6i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i64_to_v6i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i64_to_v6i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i64_to_v6i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 
v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + +define <3 x double> @bitcast_v6i32_to_v3f64(<6 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i32_to_v3f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB4_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i32_to_v3f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i32_to_v3f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i32_to_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label 
%cmp.false + +cmp.true: + %a1 = add <6 x i32> %a, splat (i32 3) + %a2 = bitcast <6 x i32> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x i32> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + +define <6 x i32> @bitcast_v3f64_to_v6i32(<3 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f64_to_v6i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB5_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v6i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v6i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v6i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <6 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <6 x i32> + br label %end + +end: + %phi = phi <6 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i32> %phi +} + +define <3 x i64> @bitcast_v6f32_to_v3i64(<6 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f32_to_v3i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: 
s_cbranch_execz .LBB6_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB6_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v3i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v3i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v3i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} + +define <6 x float> @bitcast_v3i64_to_v6f32(<3 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i64_to_v6f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB7_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i64_to_v6f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i64_to_v6f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i64_to_v6f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + +define <3 x double> @bitcast_v6f32_to_v3f64(<6 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f32_to_v3f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB8_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f32_to_v3f64: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f32_to_v3f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f32_to_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <6 x float> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x float> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + +define <6 x float> @bitcast_v3f64_to_v6f32(<3 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f64_to_v6f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v6f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, 
s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v6f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v6f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <6 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <6 x float> + br label %end + +end: + %phi = phi <6 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x float> %phi +} + +define <3 x double> @bitcast_v3i64_to_v3f64(<3 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i64_to_v3f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i64_to_v3f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i64_to_v3f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; 
GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i64_to_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i64> %a, splat (i64 3) + %a2 = bitcast <3 x i64> %a1 to <3 x double> + br label %end + +cmp.false: + %a3 = bitcast <3 x i64> %a to <3 x double> + br label %end + +end: + %phi = phi <3 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x double> %phi +} + +define <3 x i64> @bitcast_v3f64_to_v3i64(<3 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f64_to_v3i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f64_to_v3i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f64_to_v3i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f64_to_v3i64: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v6 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <3 x double> %a1 to <3 x i64> + br label %end + +cmp.false: + %a3 = bitcast <3 x double> %a to <3 x i64> + br label %end + +end: + %phi = phi <3 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i64> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll new file mode 100644 index 0000000000000..952be022750a6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v7i32_to_v7f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7i32_to_v7f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7i32_to_v7f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; 
GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7i32_to_v7f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i32> %a, splat (i32 3) + %a2 = bitcast <7 x i32> %a1 to <7 x float> + br label %end + +cmp.false: + %a3 = bitcast <7 x i32> %a to <7 x float> + br label %end + +end: + %phi = phi <7 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x float> %phi +} + +define <7 x i32> @bitcast_v7f32_to_v7i32(<7 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v7f32_to_v7i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f32_to_v7i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f32_to_v7i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 
1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f32_to_v7i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v7 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <7 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <7 x float> %a1 to <7 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x float> %a to <7 x i32> + br label %end + +end: + %phi = phi <7 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i32> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll new file mode 100644 index 0000000000000..6e7b5dd33ea0b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -0,0 +1,18360 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i32_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: bitcast_v8i32_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i32_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <8 x i32> @bitcast_v8f32_to_v8i32(<8 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f32_to_v8i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v8i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v8i32: 
+; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <4 x i64> @bitcast_v8i32_to_v4i64(<8 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i32_to_v4i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i32_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i32_to_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <8 x i32> @bitcast_v4i64_to_v8i32(<4 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i64_to_v8i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v8i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
bitcast_v4i64_to_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i64_to_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <4 x double> @bitcast_v8i32_to_v4f64(<8 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i32_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB4_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: 
v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i32_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i32_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <8 x i32> @bitcast_v4f64_to_v8i32(<4 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f64_to_v8i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB5_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v8i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB5_2: ; %end +; 
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4f64_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB5_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB5_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4f64_to_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB5_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB5_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <4 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <4 x double> %a1 to <8 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x double> %a to <8 x i32>
+ br label %end
+
+end:
+ %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i32> %phi
+}
+
+define <16 x i16> @bitcast_v8i32_to_v16i16(<8 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8i32_to_v16i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v14, v7
+; GCN-NEXT: v_mov_b32_e32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v10, v5
+; GCN-NEXT: v_mov_b32_e32 v16, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v3
+; GCN-NEXT: v_mov_b32_e32 v4, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB6_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB6_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB6_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB6_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v8, v16
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8i32_to_v16i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8i32_to_v16i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8i32_to_v16i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB6_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB6_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <8 x i32> %a, splat (i32 3)
+ %a2 = bitcast <8 x i32> %a1 to <16 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <8 x i32> %a to <16 x i16>
+ br label %end
+
+end:
+ %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i16> %phi
+}
+
+define <8 x i32> @bitcast_v16i16_to_v8i32(<16 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16i16_to_v8i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v20, v6
+; GCN-NEXT: v_mov_b32_e32 v19, v4
+; GCN-NEXT: v_mov_b32_e32 v18, v2
+; GCN-NEXT: v_mov_b32_e32 v17, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB7_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB7_4
+; GCN-NEXT: .LBB7_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB7_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18
+; GCN-NEXT: v_or_b32_e32 v0, v0, v22
+; GCN-NEXT: v_or_b32_e32 v1, v1, v23
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v2, v2, v16
+; GCN-NEXT: v_or_b32_e32 v3, v3, v21
+; GCN-NEXT: v_or_b32_e32 v4, v4, v9
+; GCN-NEXT: v_or_b32_e32 v5, v5, v11
+; GCN-NEXT: v_or_b32_e32 v6, v6, v13
+; GCN-NEXT: v_or_b32_e32 v7, v7, v15
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB7_2
+; GCN-NEXT: .LBB7_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17
+; GCN-NEXT: s_mov_b32 s6, 0x30000
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_or_b32_e32 v0, v22, v0
+; GCN-NEXT: v_or_b32_e32 v1, v23, v1
+; GCN-NEXT: v_or_b32_e32 v2, v16, v2
+; GCN-NEXT: v_or_b32_e32 v3, v21, v3
+; GCN-NEXT: v_or_b32_e32 v4, v9, v4
+; GCN-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-NEXT: v_or_b32_e32 v6, v13, v6
+; GCN-NEXT: v_or_b32_e32 v7, v15, v7
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16i16_to_v8i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB7_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v9, 3
+; VI-NEXT: v_add_u16_e32 v8, 3, v7
+; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v7, v8, v7
+; VI-NEXT: v_add_u16_e32 v8, 3, v6
+; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v6, v8, v6
+; VI-NEXT: v_add_u16_e32 v8, 3, v5
+; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v8, v5
+; VI-NEXT: v_add_u16_e32 v8, 3, v4
+; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v8, v4
+; VI-NEXT: v_add_u16_e32 v8, 3, v3
+; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v3, v8, v3
+; VI-NEXT: v_add_u16_e32 v8, 3, v2
+; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v8, v2
+; VI-NEXT: v_add_u16_e32 v8, 3, v1
+; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v8, v1
+; VI-NEXT: v_add_u16_e32 v8, 3, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v8, v0
+; VI-NEXT: .LBB7_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16i16_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16i16_to_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB7_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: .LBB7_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i16> %a, splat (i16 3)
+ %a2 = bitcast <16 x i16> %a1 to <8 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i16> %a to <8 x i32>
+ br label %end
+
+end:
+ %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i32> %phi
+}
+
+define <16 x half> @bitcast_v8i32_to_v16f16(<8 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8i32_to_v16f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v17, v7
+; GCN-NEXT: v_mov_b32_e32 v18, v6
+; GCN-NEXT: v_mov_b32_e32 v19, v5
+; GCN-NEXT: v_mov_b32_e32 v20, v4
+; GCN-NEXT: v_mov_b32_e32 v21, v3
+; GCN-NEXT: v_mov_b32_e32 v22, v2
+; GCN-NEXT: v_mov_b32_e32 v23, v1
+; GCN-NEXT: v_mov_b32_e32 v16, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB8_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB8_4
+; GCN-NEXT: .LBB8_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB8_3: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB8_2
+; GCN-NEXT: .LBB8_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v23
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8i32_to_v16f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8i32_to_v16f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8i32_to_v16f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB8_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB8_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <8 x i32> %a, splat (i32 3)
+ %a2 = bitcast <8 x i32> %a1 to <16 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <8 x i32> %a to <16 x half>
+ br label %end
+
+end:
+ %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x half> %phi
+}
+
+define <8 x i32> @bitcast_v16f16_to_v8i32(<16 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f16_to_v8i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB9_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB9_4
+; GCN-NEXT: .LBB9_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB9_3: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24
+; GCN-NEXT: v_or_b32_e32 v0, v25, v0
+; GCN-NEXT: v_or_b32_e32 v1, v23, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GCN-NEXT: v_or_b32_e32 v2, v19, v2
+; GCN-NEXT: v_or_b32_e32 v3, v17, v3
+; GCN-NEXT: v_or_b32_e32 v4, v16, v4
+; GCN-NEXT: v_or_b32_e32 v5, v11, v5
+; GCN-NEXT: v_or_b32_e32 v6, v9, v6
+; GCN-NEXT: v_or_b32_e32 v7, v8, v7
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB9_2
+; GCN-NEXT: .LBB9_4: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_or_b32_e32 v2, v3, v2
+; GCN-NEXT: v_or_b32_e32 v3, v5, v4
+; GCN-NEXT: v_or_b32_e32 v4, v7, v6
+; GCN-NEXT: v_or_b32_e32 v5, v11, v12
+; GCN-NEXT: v_or_b32_e32 v6, v9, v13
+; GCN-NEXT: v_or_b32_e32 v7, v8, v10
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16f16_to_v8i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB9_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v8, 0x200
+; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v9
+; VI-NEXT: v_add_f16_sdwa v9, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v9
+; VI-NEXT: v_add_f16_sdwa v9, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v9
+; VI-NEXT: v_add_f16_sdwa v9, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v9
+; VI-NEXT: v_add_f16_sdwa v9, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v9
+; VI-NEXT: v_add_f16_sdwa v9, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT: v_or_b32_e32 v2, v2, v9
+; VI-NEXT: v_add_f16_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v9
+; VI-NEXT: v_or_b32_e32 v0, v0, v8
+; VI-NEXT: .LBB9_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16f16_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16f16_to_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB9_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB9_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <16 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <16 x half> %a1 to <8 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x half> %a to <8 x i32>
+ br label %end
+
+end:
+ %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i32> %phi
+}
+
+define <16 x bfloat> @bitcast_v8i32_to_v16bf16(<8 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8i32_to_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v23, v7
+; GCN-NEXT: v_mov_b32_e32 v22, v6
+; GCN-NEXT: v_mov_b32_e32 v21, v5
+; GCN-NEXT: v_mov_b32_e32 v20, v4
+; GCN-NEXT: v_mov_b32_e32 v19, v3
+; GCN-NEXT: v_mov_b32_e32 v18, v2
+; GCN-NEXT: v_mov_b32_e32 v17, v1
+; GCN-NEXT: v_mov_b32_e32 v16, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB10_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB10_4
+; GCN-NEXT: .LBB10_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB10_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB10_2
+; GCN-NEXT: .LBB10_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v17
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v21
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8i32_to_v16bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8i32_to_v16bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8i32_to_v16bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB10_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB10_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <8 x i32> %a, splat (i32 3)
+ %a2 = bitcast <8 x i32> %a1 to <16 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <8 x i32> %a to <16 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x bfloat> %phi
+}
+
+define <8 x i32> @bitcast_v16bf16_to_v8i32(<16 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16bf16_to_v8i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB11_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB11_4
+; GCN-NEXT: .LBB11_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB11_3: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16
+; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9
+; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16
+; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16
+; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16
+; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16
+; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB11_2
+; GCN-NEXT: .LBB11_4: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23
+; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16
+; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16bf16_to_v8i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB11_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v7, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v6, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v4
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v2
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_bfe_u32 v9, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8
+; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; VI-NEXT: v_bfe_u32 v9, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0
+; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16
+; VI-NEXT: .LBB11_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16bf16_to_v8i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB11_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v7, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
+; GFX9-NEXT: v_perm_b32 v7, v7, v8, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v6, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX9-NEXT: v_perm_b32 v6, v6, v8, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v5, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX9-NEXT: v_perm_b32 v5, v5, v8, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v4, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
+; GFX9-NEXT: v_perm_b32 v4, v4, v8, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v3, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX9-NEXT: v_perm_b32 v3, v3, v8, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
+; GFX9-NEXT: v_perm_b32 v2, v2, v8, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
+; GFX9-NEXT: v_perm_b32 v1, v1, v8, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
+; GFX9-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v9, v9, v0, s6
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7
+; GFX9-NEXT: .LBB11_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16bf16_to_v8i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB11_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1
+; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfe_u32 v15, v6, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_add3_u32 v13, v13, v9, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo
+; GFX11-NEXT: v_add3_u32 v11, v15, v6, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v7
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v12, v12, v7, 0x7fff
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v7, v7, v8, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v6
+; GFX11-NEXT: v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v6, v6, v9, 0x7060302
+; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4
+; GFX11-NEXT: v_add3_u32 v11, v12, v10, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v13, vcc_lo
+; GFX11-NEXT: v_add3_u32 v8, v14, v9, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v12
+; GFX11-NEXT: v_add3_u32 v11, v13, v9, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9
+; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo
+; GFX11-NEXT: v_add3_u32 v11, v13, v3, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_add3_u32 v13, v14, v10, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-NEXT: v_perm_b32 v4, v4, v8, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v16, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v14, v11, 16, 1 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v12, v16, v2, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 +; GFX11-NEXT: v_add3_u32 v13, v14, v11, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_bfe_u32 v16, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc_lo +; GFX11-NEXT: v_add3_u32 v14, v15, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add3_u32 v16, v16, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_perm_b32 v1, v1, v11, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v13, v13, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <32 x i8> @bitcast_v8i32_to_v32i8(<8 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i32_to_v32i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v28, v7 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v20, v5 +; GCN-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; 
implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: .LBB12_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: .LBB12_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v8, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i32_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v35, vcc, 3, v35 +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v33, vcc, 3, v33 +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 
+; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i32_to_v32i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v35, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v34, 3, v34 +; GFX9-NEXT: v_add_u32_e32 v33, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v32, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] 
+; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v38 +; GFX9-NEXT: v_mov_b32_e32 v6, v37 +; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i32_to_v32i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2 +; GFX11-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB12_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v39, 3, v39 +; GFX11-NEXT: v_add_nc_u32_e32 v37, 3, v37 +; GFX11-NEXT: v_add_nc_u32_e32 v35, 3, v35 +; GFX11-NEXT: v_add_nc_u32_e32 v33, 3, v33 +; GFX11-NEXT: v_add_nc_u32_e32 v32, 3, v32 +; GFX11-NEXT: v_add_nc_u32_e32 v34, 3, v34 +; GFX11-NEXT: v_add_nc_u32_e32 v36, 3, v36 +; GFX11-NEXT: v_add_nc_u32_e32 v38, 3, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB12_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i32> %a, splat (i32 3) + %a2 = bitcast <8 x i32> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i32> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i8_to_v8i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; 
GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_4 +; GCN-NEXT: .LBB13_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB13_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v0, v0, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v12, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v20, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v15, v14 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; 
implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_2 +; GCN-NEXT: .LBB13_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v0, v37, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v38, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v39, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v48, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v49, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v21, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v23, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v25, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, 
v12 +; GCN-NEXT: v_or_b32_e32 v11, v15, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v8i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v48 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v7, 0x300 +; VI-NEXT: v_add_u16_e32 v2, 3, v33 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa 
v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v8, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v8, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v28 +; VI-NEXT: v_add_u16_e32 v10, 3, v30 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i8_to_v8i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v2 
+; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i8_to_v8i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v38, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v2 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v49 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v50 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v38 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v36 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v37 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v12, v16, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; 
implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: .LBB13_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v51, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v50, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v48, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v38, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v6, v18, 3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_add_nc_u16 v3, v12, 3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_add_nc_u16 v4, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v30, 3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v37, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v19, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v14 +; GFX11-NEXT: v_or_b32_e32 v12, v17, v16 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; 
GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <8 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <8 x i32> + br label %end + +end: + %phi = phi <8 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i32> %phi +} + +define <4 x i64> @bitcast_v8f32_to_v4i64(<8 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f32_to_v4i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <8 x float> @bitcast_v4i64_to_v8f32(<4 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i64_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i64_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v4i64_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <4 x double> @bitcast_v8f32_to_v4f64(<8 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f32_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, 
v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <8 x float> @bitcast_v4f64_to_v8f32(<4 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f64_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, 
v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <16 x i16> @bitcast_v8f32_to_v16i16(<8 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f32_to_v16i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v14, v7 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v3 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB18_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB18_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; 
VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <8 x float> @bitcast_v16i16_to_v8f32(<16 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i16_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v20, v6 +; GCN-NEXT: v_mov_b32_e32 v19, v4 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_4 +; GCN-NEXT: .LBB19_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB19_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v0, v0, v22 +; GCN-NEXT: v_or_b32_e32 v1, v1, v23 +; GCN-NEXT: v_and_b32_e32 
v2, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v2, v2, v16 +; GCN-NEXT: v_or_b32_e32 v3, v3, v21 +; GCN-NEXT: v_or_b32_e32 v4, v4, v9 +; GCN-NEXT: v_or_b32_e32 v5, v5, v11 +; GCN-NEXT: v_or_b32_e32 v6, v6, v13 +; GCN-NEXT: v_or_b32_e32 v7, v7, v15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB19_2 +; GCN-NEXT: .LBB19_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v0, v22, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_or_b32_e32 v2, v16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v21, v3 +; GCN-NEXT: v_or_b32_e32 v4, v9, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_or_b32_e32 v6, v13, v6 +; GCN-NEXT: v_or_b32_e32 v7, v15, v7 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_u16_e32 v8, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_add_u16_e32 v8, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v8, v5 +; 
VI-NEXT: v_add_u16_e32 v8, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v8, v4 +; VI-NEXT: v_add_u16_e32 v8, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: v_add_u16_e32 v8, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_add_u16_e32 v8, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_add_u16_e32 v8, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v8, v0 +; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i16_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <16 x half> @bitcast_v8f32_to_v16f16(<8 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f32_to_v16f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, v7 +; GCN-NEXT: v_mov_b32_e32 v18, v6 +; GCN-NEXT: v_mov_b32_e32 v19, v5 +; GCN-NEXT: v_mov_b32_e32 v20, v4 +; GCN-NEXT: v_mov_b32_e32 v21, v3 +; GCN-NEXT: v_mov_b32_e32 
v22, v2 +; GCN-NEXT: v_mov_b32_e32 v23, v1 +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_4 +; GCN-NEXT: .LBB20_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB20_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_2 +; GCN-NEXT: .LBB20_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v23 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v19 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <8 x float> @bitcast_v16f16_to_v8f32(<16 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f16_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v18, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_4 +; GCN-NEXT: .LBB21_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB21_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: v_or_b32_e32 v0, v25, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GCN-NEXT: v_or_b32_e32 v2, v19, v2 +; GCN-NEXT: v_or_b32_e32 v3, v17, v3 +; GCN-NEXT: v_or_b32_e32 v4, v16, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_or_b32_e32 v6, v9, v6 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_2 +; GCN-NEXT: .LBB21_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: 
v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v11, v12 +; GCN-NEXT: v_or_b32_e32 v6, v9, v13 +; GCN-NEXT: v_or_b32_e32 v7, v8, v10 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v8, 0x200 +; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v9 +; VI-NEXT: v_add_f16_sdwa v9, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v9 +; VI-NEXT: v_add_f16_sdwa v9, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v9 +; VI-NEXT: v_add_f16_sdwa v9, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v9 +; VI-NEXT: v_add_f16_sdwa v9, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v9 +; VI-NEXT: v_add_f16_sdwa v9, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_add_f16_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <16 x bfloat> @bitcast_v8f32_to_v16bf16(<8 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f32_to_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v23, v7 +; GCN-NEXT: v_mov_b32_e32 v22, v6 +; GCN-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NEXT: v_mov_b32_e32 v20, v4 +; GCN-NEXT: v_mov_b32_e32 v19, v3 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v1 +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_4 +; GCN-NEXT: .LBB22_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB22_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GCN-NEXT: v_and_b32_e32 v11, 
0xffff0000, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB22_2 +; GCN-NEXT: .LBB22_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v17 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v19 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v16bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v8f32_to_v16bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + +define <8 x float> @bitcast_v16bf16_to_v8f32(<16 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v16bf16_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_4 +; GCN-NEXT: .LBB23_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB23_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; 
implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB23_2 +; GCN-NEXT: .LBB23_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 
v10, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, 
v8 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v2 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; 
GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 +; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v15, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add3_u32 v13, v13, v9, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v15, v6, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v12, v12, v7, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v7, v7, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v9, 0x7060302 +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_add3_u32 v11, v12, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v13, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v14, v9, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v11, v13, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v13, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v13, v14, v10, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_perm_b32 v4, v4, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v16, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v14, v11, 16, 1 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v12, v16, v2, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 +; GFX11-NEXT: v_add3_u32 v13, v14, v11, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_bfe_u32 v16, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc_lo +; GFX11-NEXT: v_add3_u32 v14, v15, 
v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add3_u32 v16, v16, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_perm_b32 v1, v1, v11, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v13, v13, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 +; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <8 x float> + br label %end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <32 x i8> @bitcast_v8f32_to_v32i8(<8 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f32_to_v32i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v28, v7 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v20, v5 +; GCN-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 
24, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: .LBB24_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: .LBB24_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v8, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f32_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: 
v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; VI-NEXT: v_add_f32_e32 v34, 1.0, v34 +; VI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; VI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f32_to_v32i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; 
GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_add_f32_e32 v35, 1.0, v35 +; GFX9-NEXT: v_add_f32_e32 v34, 1.0, v34 +; GFX9-NEXT: v_add_f32_e32 v33, 1.0, v33 +; GFX9-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 +; 
GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v38 +; GFX9-NEXT: v_mov_b32_e32 v6, v37 +; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f32_to_v32i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2 +; GFX11-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB24_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v39, 1.0, v39 :: v_dual_add_f32 v32, 1.0, v32 +; GFX11-NEXT: v_dual_add_f32 v37, 1.0, v37 :: v_dual_add_f32 v34, 1.0, v34 +; GFX11-NEXT: v_dual_add_f32 v35, 1.0, v35 :: v_dual_add_f32 v36, 1.0, v36 +; GFX11-NEXT: v_dual_add_f32 v33, 1.0, v33 :: v_dual_add_f32 v38, 1.0, v38 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, 
v[34:35] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB24_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <8 x float> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x float> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i8_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB25_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB25_4 +; GCN-NEXT: .LBB25_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB25_3: ; %cmp.false 
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v0, v0, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v12, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v20, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v15, v14 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_2 +; GCN-NEXT: 
.LBB25_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v0, v37, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v38, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v39, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v48, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v49, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v21, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v23, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v25, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 +; GCN-NEXT: v_or_b32_e32 v11, v15, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: 
v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v8f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB25_4 +; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB25_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: .LBB25_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v7, 0x300 +; VI-NEXT: v_add_u16_e32 v2, 3, v33 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, 
v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v8, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v8, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v28 +; VI-NEXT: v_add_u16_e32 v10, 3, v30 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i8_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB25_4 +; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB25_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: 
; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: .LBB25_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i8_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v38, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v2 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v49 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v50 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v38 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 
0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v36 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v37 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v12, v16, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, 
v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v51, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v50, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v48, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v38, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v6, v18, 3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_add_nc_u16 v3, v12, 3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_add_nc_u16 v4, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v30, 3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v37, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v19, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v14 +; GFX11-NEXT: v_or_b32_e32 v12, v17, v16 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <8 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <8 x float> + br label 
%end + +end: + %phi = phi <8 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x float> %phi +} + +define <4 x double> @bitcast_v4i64_to_v4f64(<4 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i64_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: .LBB26_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i64_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i64_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <4 x i64> @bitcast_v4f64_to_v4i64(<4 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f64_to_v4i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB27_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: .LBB27_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false 
] + ret <4 x i64> %phi +} + +define <16 x i16> @bitcast_v4i64_to_v16i16(<4 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i64_to_v16i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v14, v7 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v3 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB28_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v16, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB28_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v8, v16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i64_to_v16i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; 
GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i64_to_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <4 x i64> @bitcast_v16i16_to_v4i64(<16 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i16_to_v4i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v20, v6 +; GCN-NEXT: v_mov_b32_e32 v19, v4 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_4 +; GCN-NEXT: .LBB29_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB29_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v0, v0, v22 +; GCN-NEXT: v_or_b32_e32 v1, v1, v23 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v2, v2, v16 +; GCN-NEXT: v_or_b32_e32 v3, v3, v21 +; GCN-NEXT: v_or_b32_e32 v4, v4, v9 +; GCN-NEXT: v_or_b32_e32 
v5, v5, v11 +; GCN-NEXT: v_or_b32_e32 v6, v6, v13 +; GCN-NEXT: v_or_b32_e32 v7, v7, v15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_2 +; GCN-NEXT: .LBB29_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v0, v22, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_or_b32_e32 v2, v16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v21, v3 +; GCN-NEXT: v_or_b32_e32 v4, v9, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_or_b32_e32 v6, v13, v6 +; GCN-NEXT: v_or_b32_e32 v7, v15, v7 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_u16_e32 v8, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_add_u16_e32 v8, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v8, v5 +; VI-NEXT: v_add_u16_e32 v8, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v8, v4 +; VI-NEXT: v_add_u16_e32 v8, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: 
v_add_u16_e32 v8, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_add_u16_e32 v8, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_add_u16_e32 v8, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v8, v0 +; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i16_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <16 x half> @bitcast_v4i64_to_v16f16(<4 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i64_to_v16f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v18, v7 +; GCN-NEXT: v_mov_b32_e32 v17, v6 +; GCN-NEXT: v_mov_b32_e32 v20, v5 +; GCN-NEXT: v_mov_b32_e32 v19, v4 +; GCN-NEXT: v_mov_b32_e32 v22, v3 +; GCN-NEXT: v_mov_b32_e32 v21, v2 +; GCN-NEXT: v_mov_b32_e32 v23, v1 +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; 
GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB30_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB30_4 +; GCN-NEXT: .LBB30_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB30_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB30_2 +; GCN-NEXT: .LBB30_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v23, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v21 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v22, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v19 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v20, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v17 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v18, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v16f16: +; 
VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i64_to_v16f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i64_to_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <4 x i64> @bitcast_v16f16_to_v4i64(<16 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f16_to_v4i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v17, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB31_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB31_4 +; GCN-NEXT: .LBB31_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB31_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: v_or_b32_e32 v0, v25, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GCN-NEXT: v_or_b32_e32 v2, v19, v2 +; GCN-NEXT: v_or_b32_e32 v3, v17, v3 +; GCN-NEXT: v_or_b32_e32 v4, v16, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_or_b32_e32 v6, v9, v6 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB31_2 +; GCN-NEXT: .LBB31_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 
0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v11, v12 +; GCN-NEXT: v_or_b32_e32 v6, v9, v13 +; GCN-NEXT: v_or_b32_e32 v7, v8, v10 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v8, 0x200 +; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v9 +; VI-NEXT: v_add_f16_sdwa v9, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v9 +; VI-NEXT: v_add_f16_sdwa v9, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v9 +; VI-NEXT: v_add_f16_sdwa v9, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v9 +; VI-NEXT: v_add_f16_sdwa v9, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v9 +; VI-NEXT: v_add_f16_sdwa v9, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_add_f16_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 
s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <16 x bfloat> @bitcast_v4i64_to_v16bf16(<4 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i64_to_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v23, v7 +; GCN-NEXT: v_mov_b32_e32 v22, v6 +; GCN-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NEXT: v_mov_b32_e32 v20, v4 +; GCN-NEXT: v_mov_b32_e32 v19, v3 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v1 +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_4 +; GCN-NEXT: .LBB32_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB32_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GCN-NEXT: 
v_and_b32_e32 v13, 0xffff0000, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB32_2 +; GCN-NEXT: .LBB32_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v21, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v23, vcc +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v16bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i64_to_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i64_to_v16bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + +define <4 x i64> @bitcast_v16bf16_to_v4i64(<16 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v16bf16_to_v4i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB33_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB33_4 +; GCN-NEXT: .LBB33_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB33_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 +; GCN-NEXT: 
v_alignbit_b32 v3, v3, v18, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB33_2 +; GCN-NEXT: .LBB33_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, 
vcc, v9, v3 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v2 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v7, s6 +; GFX9-NEXT: 
v_or_b32_e32 v10, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; 
GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 +; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v15, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add3_u32 v13, v13, v9, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v15, v6, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v12, v12, v7, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v7, v7, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 
0x400000, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v9, 0x7060302 +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_add3_u32 v11, v12, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v13, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v14, v9, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v11, v13, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v13, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v13, v14, v10, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_perm_b32 v4, v4, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v16, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v14, v11, 16, 1 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v12, v16, v2, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v2 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 +; GFX11-NEXT: v_add3_u32 v13, v14, v11, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_bfe_u32 v16, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc_lo +; GFX11-NEXT: v_add3_u32 v14, v15, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add3_u32 v16, v16, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_perm_b32 v1, v1, v11, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v13, v13, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 +; GFX11-NEXT: .LBB33_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <32 x i8> @bitcast_v4i64_to_v32i8(<4 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i64_to_v32i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v28, v7 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v20, v5 +; GCN-NEXT: v_mov_b32_e32 v16, v4 +; GCN-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_2 +; GCN-NEXT: 
; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: .LBB34_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v20, vcc, 0, v20, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v32, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v32, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v32, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: .LBB34_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v8, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i64_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; 
VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v34, vcc, 3, v34 +; VI-NEXT: v_addc_u32_e32 v35, vcc, 0, v35, vcc +; VI-NEXT: v_add_u32_e32 v32, vcc, 3, v32 +; VI-NEXT: v_addc_u32_e32 v33, vcc, 0, v33, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: bitcast_v4i64_to_v32i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v34, vcc, 3, v34 +; GFX9-NEXT: v_addc_co_u32_e32 v35, vcc, 0, v35, vcc +; GFX9-NEXT: v_add_co_u32_e32 v32, vcc, 3, v32 +; GFX9-NEXT: v_addc_co_u32_e32 v33, vcc, 0, v33, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 
v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v38 +; GFX9-NEXT: v_mov_b32_e32 v6, v37 +; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i64_to_v32i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2 +; GFX11-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: 
v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB34_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v36, vcc_lo, v36, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v37, null, 0, v37, vcc_lo +; GFX11-NEXT: v_add_co_u32 v34, vcc_lo, v34, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v35, null, 0, v35, vcc_lo +; GFX11-NEXT: v_add_co_u32 v32, vcc_lo, v32, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v33, vcc_lo +; GFX11-NEXT: v_add_co_u32 v38, vcc_lo, v38, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v39, null, 0, v39, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB34_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i64> %a, splat (i64 3) + %a2 = bitcast <4 x i64> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i64> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i8_to_v4i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; 
GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_4 +; GCN-NEXT: .LBB35_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB35_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v0, v0, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v12, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v20, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v15, v14 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; 
implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_2 +; GCN-NEXT: .LBB35_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v0, v37, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v38, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v39, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v48, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v49, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v21, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v23, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v25, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 +; GCN-NEXT: v_or_b32_e32 v11, 
v15, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v4i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v7, 0x300 +; VI-NEXT: v_add_u16_e32 v2, 3, v33 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v8, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v8, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v28 +; VI-NEXT: v_add_u16_e32 v10, 3, v30 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i8_to_v4i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: 
v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v37, 
v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i8_to_v4i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v38, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v2 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v49 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v50 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v38 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v36 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v37 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v12, v16, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr38 
+; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: .LBB35_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v51, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v50, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v48, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v38, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v6, v18, 3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_add_nc_u16 v3, v12, 3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_add_nc_u16 v4, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v30, 3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v37, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v19, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v14 +; GFX11-NEXT: v_or_b32_e32 v12, v17, v16 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 
0x300, v12 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <4 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <4 x i64> + br label %end + +end: + %phi = phi <4 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i64> %phi +} + +define <16 x i16> @bitcast_v4f64_to_v16i16(<4 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f64_to_v16i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, v7 +; GCN-NEXT: v_mov_b32_e32 v16, v6 +; GCN-NEXT: v_mov_b32_e32 v19, v5 +; GCN-NEXT: v_mov_b32_e32 v18, v4 +; GCN-NEXT: v_mov_b32_e32 v21, v3 +; GCN-NEXT: v_mov_b32_e32 v20, v2 +; GCN-NEXT: v_mov_b32_e32 v23, v1 +; GCN-NEXT: v_mov_b32_e32 v22, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; GCN-NEXT: .LBB36_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[22:23], v[22:23], 1.0 +; GCN-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 +; GCN-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; GCN-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; GCN-NEXT: v_alignbit_b32 v13, v17, v16, 16 +; GCN-NEXT: v_alignbit_b32 v9, v19, v18, 16 +; GCN-NEXT: v_alignbit_b32 v5, v21, v20, 16 +; GCN-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; GCN-NEXT: .LBB36_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v22 +; GCN-NEXT: v_mov_b32_e32 v2, v23 +; GCN-NEXT: v_mov_b32_e32 v4, v20 +; GCN-NEXT: v_mov_b32_e32 v6, v21 +; GCN-NEXT: v_mov_b32_e32 v8, v18 +; GCN-NEXT: v_mov_b32_e32 v10, v19 +; GCN-NEXT: v_mov_b32_e32 v12, v16 +; GCN-NEXT: v_mov_b32_e32 v14, v17 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <4 x double> @bitcast_v16i16_to_v4f64(<16 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i16_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v20, v6 +; GCN-NEXT: v_mov_b32_e32 v19, v4 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_4 +; GCN-NEXT: .LBB37_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB37_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; 
GCN-NEXT: v_or_b32_e32 v0, v0, v22 +; GCN-NEXT: v_or_b32_e32 v1, v1, v23 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v2, v2, v16 +; GCN-NEXT: v_or_b32_e32 v3, v3, v21 +; GCN-NEXT: v_or_b32_e32 v4, v4, v9 +; GCN-NEXT: v_or_b32_e32 v5, v5, v11 +; GCN-NEXT: v_or_b32_e32 v6, v6, v13 +; GCN-NEXT: v_or_b32_e32 v7, v7, v15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB37_2 +; GCN-NEXT: .LBB37_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v17 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v0, v22, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_or_b32_e32 v2, v16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v21, v3 +; GCN-NEXT: v_or_b32_e32 v4, v9, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_or_b32_e32 v6, v13, v6 +; GCN-NEXT: v_or_b32_e32 v7, v15, v7 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_e32 v8, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: v_add_u16_e32 v8, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_add_u16_e32 v8, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v8, v5 +; VI-NEXT: v_add_u16_e32 v8, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v8, v4 +; VI-NEXT: v_add_u16_e32 v8, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: v_add_u16_e32 v8, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_add_u16_e32 v8, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_add_u16_e32 v8, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v8, v0 +; VI-NEXT: .LBB37_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i16_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB37_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <16 x half> @bitcast_v4f64_to_v16f16(<4 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f64_to_v16f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: 
$vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: .LBB38_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: .LBB38_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v19 +; GCN-NEXT: v_mov_b32_e32 v1, v23 +; GCN-NEXT: v_mov_b32_e32 v2, v16 +; GCN-NEXT: v_mov_b32_e32 v3, v22 +; GCN-NEXT: v_mov_b32_e32 v4, v17 +; GCN-NEXT: v_mov_b32_e32 v5, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v18 +; GCN-NEXT: v_mov_b32_e32 v7, v20 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <4 x double> @bitcast_v16f16_to_v4f64(<16 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f16_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v14 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_4 +; GCN-NEXT: .LBB39_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: 
s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB39_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: v_or_b32_e32 v0, v25, v0 +; GCN-NEXT: v_or_b32_e32 v1, v23, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GCN-NEXT: v_or_b32_e32 v2, v19, v2 +; GCN-NEXT: v_or_b32_e32 v3, v17, v3 +; GCN-NEXT: v_or_b32_e32 v4, v16, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_or_b32_e32 v6, v9, v6 +; GCN-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_2 +; GCN-NEXT: .LBB39_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v23 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 
v8, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v11, v12 +; GCN-NEXT: v_or_b32_e32 v6, v9, v13 +; GCN-NEXT: v_or_b32_e32 v7, v8, v10 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v8, 0x200 +; VI-NEXT: v_add_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v9 +; VI-NEXT: v_add_f16_sdwa v9, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v9 +; VI-NEXT: v_add_f16_sdwa v9, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v9 +; VI-NEXT: v_add_f16_sdwa v9, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v9 +; VI-NEXT: v_add_f16_sdwa v9, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v9 +; VI-NEXT: v_add_f16_sdwa v9, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v9 +; VI-NEXT: v_add_f16_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB39_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <16 x bfloat> @bitcast_v4f64_to_v16bf16(<4 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f64_to_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: .LBB40_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; GCN-NEXT: 
v_lshlrev_b32_e32 v12, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v2 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v0 +; GCN-NEXT: .LBB40_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v23 +; GCN-NEXT: v_mov_b32_e32 v1, v22 +; GCN-NEXT: v_mov_b32_e32 v2, v21 +; GCN-NEXT: v_mov_b32_e32 v3, v20 +; GCN-NEXT: v_mov_b32_e32 v4, v19 +; GCN-NEXT: v_mov_b32_e32 v5, v18 +; GCN-NEXT: v_mov_b32_e32 v6, v17 +; GCN-NEXT: v_mov_b32_e32 v7, v16 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v16bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v16bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x double> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + +define <4 x double> @bitcast_v16bf16_to_v4f64(<16 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v16bf16_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v16 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v14 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_4 +; GCN-NEXT: .LBB41_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB41_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_alignbit_b32 v0, v0, v26, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v24, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GCN-NEXT: v_alignbit_b32 v2, v2, v20, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v18, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB41_2 +; GCN-NEXT: .LBB41_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v23 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: 
v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v6, v13, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v9, v8, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v7, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v7 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v6 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v8, 
v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v4 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v3 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v2 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v1 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v8, 16 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; 
VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v7, v7, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v4, 16, 1 +; GFX9-NEXT: 
v_add3_u32 v9, v9, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v8, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s7 +; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v15, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_add3_u32 v13, v13, v9, 0x7fff +; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v15, v6, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v12, v12, v7, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v7, v12, v14 :: v_dual_lshlrev_b32 v12, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v7, v7, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v6 +; GFX11-NEXT: v_dual_cndmask_b32 v9, v13, v10 :: v_dual_add_f32 v10, 0x40c00000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5 +; GFX11-NEXT: v_bfe_u32 v12, v10, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v11, v14 :: v_dual_lshlrev_b32 v11, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v9, 0x7060302 +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v11 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_add3_u32 v11, v12, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v14, v9, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v13, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v14, v9, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v12 :: v_dual_lshlrev_b32 v12, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_perm_b32 v5, v5, v10, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v11, 
vcc_lo +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v11, v13, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v9 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v12, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v13, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add3_u32 v13, v14, v10, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v3, v11, v12 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_perm_b32 v4, v4, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-NEXT: v_bfe_u32 v16, v2, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v14, v11, 16, 1 +; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v12, v16, v2, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_lshlrev_b32 v15, 16, v0 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v12, v13, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 +; GFX11-NEXT: v_add3_u32 v13, v14, v11, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11 +; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_bfe_u32 v16, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v13, v14, vcc_lo +; GFX11-NEXT: v_add3_u32 v14, v15, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add3_u32 v16, v16, v12, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v14, v15 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_perm_b32 v1, v1, v11, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v13, v13, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v13, v18, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v12, 0x7060302 +; GFX11-NEXT: .LBB41_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to 
<4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <32 x i8> @bitcast_v4f64_to_v32i8(<4 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f64_to_v32i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v35, v5 +; GCN-NEXT: v_mov_b32_e32 v34, v4 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB42_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v27, v7, v6, 24 +; GCN-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; GCN-NEXT: v_alignbit_b32 v19, v35, v34, 24 +; GCN-NEXT: v_alignbit_b32 v18, v35, v34, 16 +; GCN-NEXT: v_alignbit_b32 v17, v35, v34, 8 +; GCN-NEXT: v_alignbit_b32 v11, v3, v2, 24 +; GCN-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v9, v3, v2, 8 +; GCN-NEXT: v_alignbit_b32 v38, v1, v0, 24 +; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v33, v1, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GCN-NEXT: .LBB42_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB42_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_alignbit_b32 v27, v7, v6, 24 +; GCN-NEXT: v_alignbit_b32 v26, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v25, v7, v6, 8 +; GCN-NEXT: v_alignbit_b32 v19, v35, v34, 24 +; GCN-NEXT: v_alignbit_b32 v18, v35, v34, 16 +; GCN-NEXT: v_alignbit_b32 v17, v35, v34, 8 +; GCN-NEXT: v_alignbit_b32 v11, v3, v2, 24 +; GCN-NEXT: v_alignbit_b32 v10, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v9, v3, v2, 8 +; GCN-NEXT: v_alignbit_b32 v38, v1, v0, 24 +; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v33, v1, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GCN-NEXT: 
v_lshrrev_b32_e32 v22, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GCN-NEXT: .LBB42_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v12, v3 +; GCN-NEXT: v_mov_b32_e32 v16, v34 +; GCN-NEXT: v_mov_b32_e32 v20, v35 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v28, v7 +; GCN-NEXT: v_mov_b32_e32 v1, v33 +; GCN-NEXT: v_mov_b32_e32 v2, v32 +; GCN-NEXT: v_mov_b32_e32 v3, v38 +; GCN-NEXT: v_mov_b32_e32 v6, v37 +; GCN-NEXT: v_mov_b32_e32 v7, v36 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f64_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] 
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB42_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f64_to_v32i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v38 +; GFX9-NEXT: v_mov_b32_e32 v6, v37 +; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f64_to_v32i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2 +; GFX11-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: 
$vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB42_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 +; GFX11-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; GFX11-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; GFX11-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB42_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <4 x 
double> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x double> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i8_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB43_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB43_4 +; GCN-NEXT: .LBB43_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB43_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v0, v0, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v12, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v20, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; 
GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v11, v15, v14 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB43_2 +; GCN-NEXT: .LBB43_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v0, v37, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v38, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v39, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
+; GCN-NEXT: v_or_b32_e32 v6, v48, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v49, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v21, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v16, v23, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v25, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v22 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v35, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v36, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v13, v10 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v12 +; GCN-NEXT: v_or_b32_e32 v11, v15, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v16 +; GCN-NEXT: v_or_b32_e32 v13, v17, v18 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x300, v20 +; GCN-NEXT: v_or_b32_e32 v15, v19, v21 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v4f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: 
; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v7, 0x300 +; VI-NEXT: v_add_u16_e32 v2, 3, v33 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v8 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v8, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v8, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v28 +; VI-NEXT: v_add_u16_e32 v10, 3, v30 +; VI-NEXT: v_or_b32_sdwa v8, v11, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v7, v9, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i8_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: .LBB43_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v37, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v36, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v19, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v17, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v15, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v11, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i8_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v1 +; GFX11-NEXT: 
v_lshlrev_b16 v49, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v38, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v2 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v51 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v49 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v50 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v48 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v38 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v35 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v36 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v37 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v11, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v12, v16, v17 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 
+; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB43_2 +; GFX11-NEXT: .LBB43_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v51, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v50, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v48, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v38, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v6, v18, 3 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_add_nc_u16 v3, v12, 3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_add_nc_u16 v4, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v30, 3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; 
GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v35, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v36, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v37, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v19, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX11-NEXT: v_or_b32_e32 v10, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v11, v15, v14 +; GFX11-NEXT: v_or_b32_e32 v12, v17, v16 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <4 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <4 x double> + br label %end + +end: + %phi = phi <4 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x double> %phi +} + +define <16 x half> @bitcast_v16i16_to_v16f16(<16 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i16_to_v16f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v31, v15 +; GCN-NEXT: v_mov_b32_e32 v30, v14 +; GCN-NEXT: v_mov_b32_e32 v29, v13 +; GCN-NEXT: v_mov_b32_e32 v28, v12 +; GCN-NEXT: v_mov_b32_e32 v27, v11 +; GCN-NEXT: v_mov_b32_e32 v26, v10 +; GCN-NEXT: v_mov_b32_e32 v25, v9 +; GCN-NEXT: v_mov_b32_e32 v24, v8 +; GCN-NEXT: v_mov_b32_e32 v23, v7 +; GCN-NEXT: v_mov_b32_e32 v22, v6 +; GCN-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NEXT: v_mov_b32_e32 v20, v4 +; GCN-NEXT: v_mov_b32_e32 v19, v3 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB44_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: 
s_cbranch_execnz .LBB44_4 +; GCN-NEXT: .LBB44_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB44_3: ; %cmp.false +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB44_2 +; GCN-NEXT: .LBB44_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v31 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v29 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v8, 3 +; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v11, v2, v8 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v12, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v15, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v8, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_or_b32_e32 v6, v6, v15 +; VI-NEXT: v_or_b32_e32 v5, v5, v14 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_or_b32_e32 v3, v3, v12 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v0, v0, v9 +; VI-NEXT: .LBB44_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i16_to_v16f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB44_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB44_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <16 x i16> @bitcast_v16f16_to_v16i16(<16 x half> %a, i32 %b) { +; GCN-LABEL: 
bitcast_v16f16_to_v16i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB45_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_or_b32_e32 v8, v8, v9 +; GCN-NEXT: v_or_b32_e32 v12, v12, v13 +; GCN-NEXT: v_or_b32_e32 v14, v14, v16 +; GCN-NEXT: v_or_b32_e32 v10, v10, v17 +; GCN-NEXT: 
v_or_b32_e32 v6, v6, v18 +; GCN-NEXT: v_or_b32_e32 v2, v2, v19 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; GCN-NEXT: .LBB45_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB45_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 0x200 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_or_b32_e32 v6, v15, v6 +; VI-NEXT: v_or_b32_e32 v5, v14, v5 +; VI-NEXT: v_or_b32_e32 v4, v13, v4 +; VI-NEXT: v_or_b32_e32 v3, v12, v3 +; VI-NEXT: v_or_b32_e32 v2, v11, v2 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_or_b32_e32 v0, v8, v0 +; VI-NEXT: .LBB45_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v16i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; 
GFX11-NEXT: s_cbranch_execz .LBB45_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB45_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <16 x bfloat> @bitcast_v16i16_to_v16bf16(<16 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i16_to_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v23, v14 +; GCN-NEXT: v_mov_b32_e32 v22, v12 +; GCN-NEXT: v_mov_b32_e32 v21, v10 +; GCN-NEXT: v_mov_b32_e32 v20, v8 +; GCN-NEXT: v_mov_b32_e32 v19, v6 +; GCN-NEXT: v_mov_b32_e32 v18, v4 +; GCN-NEXT: v_mov_b32_e32 v17, v2 +; GCN-NEXT: v_mov_b32_e32 v24, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB46_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB46_4 +; GCN-NEXT: .LBB46_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB46_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB46_2 +; GCN-NEXT: .LBB46_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v23 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 
v8, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v0, v15, v0 +; GCN-NEXT: v_or_b32_e32 v2, v13, v2 +; GCN-NEXT: v_or_b32_e32 v4, v11, v4 +; GCN-NEXT: v_or_b32_e32 v6, v9, v6 +; GCN-NEXT: v_or_b32_e32 v7, v7, v8 +; GCN-NEXT: v_or_b32_e32 v5, v5, v10 +; GCN-NEXT: v_or_b32_e32 v3, v3, v12 +; GCN-NEXT: v_or_b32_e32 v1, v1, v14 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v16bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB46_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v8, 3 +; VI-NEXT: v_add_u16_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v11, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v12, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v15, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v8, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_or_b32_e32 v6, v6, v15 +; VI-NEXT: 
v_or_b32_e32 v5, v5, v14 +; VI-NEXT: v_or_b32_e32 v4, v4, v13 +; VI-NEXT: v_or_b32_e32 v3, v3, v12 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_or_b32_e32 v1, v1, v10 +; VI-NEXT: v_or_b32_e32 v0, v0, v9 +; VI-NEXT: .LBB46_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i16_to_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v16bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB46_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB46_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i16> %a, splat (i16 3) + %a2 = bitcast <16 x i16> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i16> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + +define <16 x i16> @bitcast_v16bf16_to_v16i16(<16 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v16bf16_to_v16i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: 
$vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB47_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB47_4 +; GCN-NEXT: .LBB47_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB47_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB47_2 +; GCN-NEXT: .LBB47_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v18, 
0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v8 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_alignbit_b32 v0, v14, v0, 16 +; GCN-NEXT: v_alignbit_b32 v4, v19, v2, 16 +; GCN-NEXT: v_alignbit_b32 v8, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v12, v21, v5, 16 +; GCN-NEXT: v_alignbit_b32 v14, v15, v17, 16 +; GCN-NEXT: v_alignbit_b32 v10, v11, v9, 16 +; GCN-NEXT: v_alignbit_b32 v6, v7, v18, 16 +; GCN-NEXT: v_alignbit_b32 v2, v3, v13, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v24, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v23, 16 +; GCN-NEXT: v_alignbit_b32 v13, v14, v22, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, s6, v10 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_bfe_u32 v10, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v1 +; VI-NEXT: v_add_u32_e32 v10, vcc, s6, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; VI-NEXT: v_bfe_u32 v11, v2, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v11, vcc, v11, v2 +; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_bfe_u32 v12, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v3 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v12, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_bfe_u32 v13, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v4 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v13, v14, vcc +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, s6, v14 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 +; VI-NEXT: v_add_u32_e32 v14, vcc, s6, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_bfe_u32 v15, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v6 +; VI-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, s6, v16 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_bfe_u32 v16, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v7 +; VI-NEXT: v_add_u32_e32 v16, vcc, 
0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v15, 16 +; VI-NEXT: v_alignbit_b32 v6, v6, v14, 16 +; VI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v12, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v11, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v16i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v11, v11, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v11, v11, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_bfe_u32 v12, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v12, v12, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; 
GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; GFX9-NEXT: v_bfe_u32 v12, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v12, v12, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v13, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v13, v13, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v13, v13, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v14, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_bfe_u32 v14, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v14, v14, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; GFX9-NEXT: v_bfe_u32 v14, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v14, v14, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v7, v7, v15, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v14, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v13, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v12, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v11, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v10, s6 +; GFX9-NEXT: v_perm_b32 v1, v1, v9, s6 +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s6 +; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v13, v13, v9, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v11, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v14, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v12, v12, v0, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v15 :: v_dual_lshlrev_b32 v15, 16, v4 +; GFX11-NEXT: v_bfe_u32 v12, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v13, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v12, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX11-NEXT: v_perm_b32 v1, v1, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add3_u32 v11, v13, v10, 0x7fff +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v13, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v13, v14, 16, 1 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v13, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v12 :: v_dual_lshlrev_b32 v12, 16, v5 +; GFX11-NEXT: v_add3_u32 v14, v14, v15, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_add3_u32 v15, v18, v4, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v13, v13, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v19, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX11-NEXT: v_add3_u32 v18, v19, v12, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v6 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v15, v17 :: v_dual_add_f32 v15, 0x40c00000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_add3_u32 v17, v21, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v18, v20, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff +; GFX11-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v21, v22, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add3_u32 v23, v23, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v18 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v21, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_add3_u32 v19, v19, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v23, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v7, v7, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_perm_b32 
v4, v4, v14, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v12, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v11, 0x7060302 +; GFX11-NEXT: .LBB47_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <16 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <16 x i16> + br label %end + +end: + %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i16> %phi +} + +define <32 x i8> @bitcast_v16i16_to_v32i8(<16 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i16_to_v32i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v48, v15 +; GCN-NEXT: v_mov_b32_e32 v32, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v49, v11 +; GCN-NEXT: v_mov_b32_e32 v33, v10 +; GCN-NEXT: v_mov_b32_e32 v36, v8 +; GCN-NEXT: v_mov_b32_e32 v50, v7 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v38, v4 +; GCN-NEXT: v_mov_b32_e32 v51, v3 +; GCN-NEXT: v_mov_b32_e32 v35, v2 +; GCN-NEXT: v_mov_b32_e32 v39, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v50 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v9 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v49 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v48 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB48_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; 
GCN-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v32 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v51 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v50 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v49 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v48 +; GCN-NEXT: v_bfe_u32 v7, v51, 8, 8 +; GCN-NEXT: v_bfe_u32 v15, v50, 8, 8 +; GCN-NEXT: v_bfe_u32 v23, v49, 8, 8 +; GCN-NEXT: v_or_b32_e32 v0, v0, v52 +; GCN-NEXT: v_or_b32_e32 v4, v1, v53 +; GCN-NEXT: v_or_b32_e32 v8, v2, v54 +; GCN-NEXT: v_or_b32_e32 v12, v3, v55 +; GCN-NEXT: v_or_b32_e32 v16, v5, v40 +; GCN-NEXT: v_or_b32_e32 v20, v9, v41 +; GCN-NEXT: v_or_b32_e32 v24, v10, v42 +; GCN-NEXT: v_or_b32_e32 v28, v11, v43 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_bfe_u32 v31, v48, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: .LBB48_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB48_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v35 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v0, v42, v0 +; GCN-NEXT: v_or_b32_e32 v1, v43, v1 +; GCN-NEXT: v_or_b32_e32 v2, v40, v2 +; GCN-NEXT: v_or_b32_e32 v3, v41, v3 +; GCN-NEXT: v_or_b32_e32 v4, v54, v4 +; GCN-NEXT: v_or_b32_e32 v5, v55, v5 +; GCN-NEXT: v_or_b32_e32 v6, v52, v6 +; GCN-NEXT: v_or_b32_e32 v7, v53, v7 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v2 +; GCN-NEXT: 
v_add_i32_e32 v20, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v7 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: .LBB48_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; VI-NEXT: v_lshrrev_b32_e32 v13, 
8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_mov_b32_e32 v50, v0 +; VI-NEXT: v_mov_b32_e32 v48, v1 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v16, v4 +; VI-NEXT: v_mov_b32_e32 v49, v5 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v51, v7 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: .LBB48_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 3 +; VI-NEXT: v_add_u16_sdwa v36, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v32, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v14, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v22, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v18, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v30, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v26, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v48, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; VI-NEXT: v_add_u16_e32 v50, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_add_u16_e32 v35, 3, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; VI-NEXT: v_add_u16_e32 v8, 3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; VI-NEXT: v_add_u16_e32 v49, 3, v5 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 +; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; VI-NEXT: v_add_u16_e32 v51, 3, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v30 +; VI-NEXT: v_add_u16_e32 v24, 3, v6 +; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; VI-NEXT: v_or_b32_e32 v1, v48, v1 +; VI-NEXT: v_or_b32_e32 v0, v50, v0 +; VI-NEXT: v_or_b32_e32 v3, v35, v3 +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_or_b32_e32 v5, v49, v5 +; VI-NEXT: v_or_b32_e32 v4, v16, v4 +; VI-NEXT: v_or_b32_e32 v7, v51, v7 +; VI-NEXT: v_or_b32_e32 v6, v24, v6 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[4:5] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v39, v36, 8, 8 +; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v50 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v2, 
v32 +; VI-NEXT: v_mov_b32_e32 v3, v33 +; VI-NEXT: v_mov_b32_e32 v4, v48 +; VI-NEXT: v_mov_b32_e32 v5, v37 +; VI-NEXT: v_mov_b32_e32 v6, v36 +; VI-NEXT: v_mov_b32_e32 v7, v39 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v20, v49 +; VI-NEXT: v_mov_b32_e32 v28, v51 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i16_to_v32i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v35, v35, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v34, v34, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; 
GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v38 +; GFX9-NEXT: v_mov_b32_e32 v6, v37 +; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i16_to_v32i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2 +; GFX11-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-NEXT: .LBB48_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB48_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v39, v39, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v37, v37, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v35, v35, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v33, v33, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v34, v34, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v36, v36, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v38, v38, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33
+; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33]
+; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35]
+; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37]
+; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39]
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32
+; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35
+; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34
+; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34
+; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37
+; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37
+; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37
+; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36
+; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38
+; GFX11-NEXT: .LBB48_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v38
+; GFX11-NEXT: v_mov_b32_e32 v4, v39
+; GFX11-NEXT: v_mov_b32_e32 v8, v36
+; GFX11-NEXT: v_mov_b32_e32 v12, v37
+; GFX11-NEXT: v_mov_b32_e32 v16, v34
+; GFX11-NEXT: v_mov_b32_e32 v20, v35
+; GFX11-NEXT: v_mov_b32_e32 v24, v32
+; GFX11-NEXT: v_mov_b32_e32 v28, v33
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i16> %a, splat (i16 3)
+ %a2 = bitcast <16 x i16> %a1 to <32 x i8>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i16> %a to <32 x i8>
+ br label %end
+
+end:
+ %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i8> %phi
+}
+
+define <16 x i16> @bitcast_v32i8_to_v16i16(<32 x i8> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32i8_to_v16i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v21
+;
GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v25 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GCN-NEXT: v_or_b32_e32 v1, v1, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v4, v4, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v7, v7, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v11, v11, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v0, v0, v51 +; GCN-NEXT: v_or_b32_e32 v8, v8, v52 +; GCN-NEXT: v_or_b32_e32 v14, v14, v54 +; GCN-NEXT: v_or_b32_e32 v15, v15, v55 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v3, v32, v3 +; GCN-NEXT: v_or_b32_e32 v2, v33, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v34, v5 +; GCN-NEXT: v_or_b32_e32 v6, v35, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v9, v36, v9 +; GCN-NEXT: v_or_b32_e32 v10, v37, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v12, v39, v12 +; GCN-NEXT: v_or_b32_e32 v13, v38, v13 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v19, v1, v3 +; GCN-NEXT: v_or_b32_e32 v23, v4, v5 +; GCN-NEXT: v_or_b32_e32 v27, v7, v9 +; GCN-NEXT: v_or_b32_e32 v31, v11, v12 +; GCN-NEXT: v_or_b32_e32 v17, v0, v2 +; GCN-NEXT: v_or_b32_e32 v21, v8, v6 +; GCN-NEXT: v_or_b32_e32 v25, v14, v10 +; GCN-NEXT: v_or_b32_e32 v29, v15, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 
v7, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GCN-NEXT: v_alignbit_b32 v1, v19, v2, 16 +; GCN-NEXT: v_alignbit_b32 v5, v23, v6, 16 +; GCN-NEXT: v_alignbit_b32 v9, v27, v10, 16 +; GCN-NEXT: v_alignbit_b32 v13, v31, v13, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v12 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: .LBB49_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v5, v53, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v9, v54, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v13, v50, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v8, v52, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_or_b32_e32 v12, v49, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v0, v51, v0 +; 
GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; GCN-NEXT: v_or_b32_e32 v3, v38, v3 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_or_b32_e32 v7, v39, v7 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_or_b32_e32 v11, v37, v11 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_or_b32_e32 v15, v36, v15 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v10, v35, v10 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_or_b32_e32 v14, v34, v14 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_or_b32_e32 v2, v33, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_or_b32_e32 v3, v7, v5 +; GCN-NEXT: v_or_b32_e32 v5, v11, v9 +; GCN-NEXT: v_or_b32_e32 v7, v15, v13 +; GCN-NEXT: v_or_b32_e32 v8, v10, v8 +; GCN-NEXT: v_or_b32_e32 v9, v14, v12 +; GCN-NEXT: v_or_b32_e32 v0, v2, v0 +; GCN-NEXT: v_or_b32_e32 v2, v6, v4 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v31, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s7, v8 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v9 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v2 +; GCN-NEXT: v_alignbit_b32 v1, v19, v17, 16 +; GCN-NEXT: v_alignbit_b32 v5, v23, v21, 16 +; GCN-NEXT: v_alignbit_b32 v9, v27, v25, 16 +; GCN-NEXT: v_alignbit_b32 v13, v31, v29, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GCN-NEXT: .LBB49_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v17 +; GCN-NEXT: v_mov_b32_e32 v2, v19 +; GCN-NEXT: v_mov_b32_e32 v4, v21 +; GCN-NEXT: v_mov_b32_e32 v6, v23 +; GCN-NEXT: v_mov_b32_e32 v8, v25 +; GCN-NEXT: v_mov_b32_e32 v10, v27 +; GCN-NEXT: v_mov_b32_e32 v12, v29 +; GCN-NEXT: v_mov_b32_e32 v14, v31 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v16i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 
+; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v1, 0x300 +; VI-NEXT: v_add_u16_sdwa v7, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_or_b32_sdwa v28, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v24 +; VI-NEXT: v_or_b32_sdwa v24, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v34 +; VI-NEXT: v_or_b32_sdwa v10, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v12 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v31 +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v16 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v4, v8, v4 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v20 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v5, v8, v5 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v28 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i8_to_v16i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v31, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; 
GFX9-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, 
v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v10, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v9, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v8, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v11, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v15, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v23, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i8_to_v16i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v38, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: 
v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v2 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v35 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v38 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v48 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v49 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v23 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v6, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v10, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v12, v11, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: 
$vgpr23
+; GFX11-NEXT: ; implicit-def: $vgpr25
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB49_2
+; GFX11-NEXT: .LBB49_4: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u16 v0, v28, 3
+; GFX11-NEXT: v_add_nc_u16 v1, v30, 3
+; GFX11-NEXT: v_add_nc_u16 v2, v24, 3
+; GFX11-NEXT: v_add_nc_u16 v3, v26, 3
+; GFX11-NEXT: v_add_nc_u16 v4, v20, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_add_nc_u16 v5, v22, 3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_or_b32_e32 v0, v23, v0
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_or_b32_e32 v1, v25, v1
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_or_b32_e32 v2, v21, v2
+; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v19, v3
+; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v49, v4
+; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v2
+; GFX11-NEXT: v_or_b32_e32 v2, v17, v5
+; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v0, v16, 3
+; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v1
+; GFX11-NEXT: v_add_nc_u16 v1, v12, 3
+; GFX11-NEXT: v_add_nc_u16 v3, v18, 3
+; GFX11-NEXT: v_add_nc_u16 v4, v14, 3
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v2
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_or_b32_e32 v0, v15, v0
+; GFX11-NEXT: v_add_nc_u16 v2, v8, 3
+; GFX11-NEXT: v_or_b32_e32 v1, v39, v1
+; GFX11-NEXT: v_add_nc_u16 v14, v31, 3
+; GFX11-NEXT: v_add_nc_u16 v10, v10, 3
+; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v0
+; GFX11-NEXT: v_or_b32_e32 v0, v13, v3
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v11, v4
+; GFX11-NEXT: v_add_nc_u16 v4, v32, 3
+; GFX11-NEXT: v_add_nc_u16 v11, v33, 3
+; GFX11-NEXT: v_add_nc_u16 v13, v34, 3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_or_b32_e32 v2, v48, v2
+; GFX11-NEXT: v_or_b32_e32 v4, v38, v4
+; GFX11-NEXT: v_or_b32_e32 v11, v37, v11
+; GFX11-NEXT: v_or_b32_e32 v13, v35, v13
+; GFX11-NEXT: v_or_b32_e32 v14, v36, v14
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v10
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4
+; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v11
+; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v13
+; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v14
+; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9
+; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v1
+; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0
+; GFX11-NEXT: v_perm_b32 v0, v11, v10, 0x5040100
+; GFX11-NEXT: v_perm_b32 v1, v13, v4, 0x5040100
+; GFX11-NEXT: v_perm_b32 v2, v9, v2, 0x5040100
+; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100
+; GFX11-NEXT: v_perm_b32 v4, v15, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v5, v12, v5, 0x5040100
+; GFX11-NEXT: v_perm_b32 v6, v17, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v7, v19, v7, 0x5040100
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i8> %a, splat (i8 3)
+ %a2 = bitcast <32 x i8> %a1 to <16 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i8> %a to <16 x i16>
+ br label %end
+
+end:
+ %phi = phi <16 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i16> %phi
+}
+
+define <16 x bfloat> @bitcast_v16f16_to_v16bf16(<16 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f16_to_v16bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v15
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB50_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB50_4
+; GCN-NEXT: .LBB50_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB50_3: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v31
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB50_2
+; GCN-NEXT: .LBB50_4: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v30
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26
+;
GCN-NEXT: v_cvt_f32_f16_e32 v6, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v23 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v16bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v9, 0x200 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v4 
+; VI-NEXT: v_add_f16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_or_b32_e32 v6, v15, v6 +; VI-NEXT: v_or_b32_e32 v5, v14, v5 +; VI-NEXT: v_or_b32_e32 v4, v13, v4 +; VI-NEXT: v_or_b32_e32 v3, v12, v3 +; VI-NEXT: v_or_b32_e32 v2, v11, v2 +; VI-NEXT: v_or_b32_e32 v1, v10, v1 +; VI-NEXT: v_or_b32_e32 v0, v8, v0 +; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v16bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB50_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} + +define <16 x half> @bitcast_v16bf16_to_v16f16(<16 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v16bf16_to_v16f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v1 +; GCN-NEXT: 
v_mul_f32_e32 v18, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB51_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB51_4 +; GCN-NEXT: .LBB51_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB51_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB51_2 +; GCN-NEXT: .LBB51_4: ; %cmp.true +; 
GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v16 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_bfe_u32 v9, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v8 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; VI-NEXT: v_bfe_u32 v9, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v9, vcc, v9, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, s6, v9 +; VI-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_bfe_u32 v10, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v9 +; VI-NEXT: v_add_u32_e32 v10, vcc, s6, v10 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; VI-NEXT: v_bfe_u32 v10, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v1 +; VI-NEXT: v_add_u32_e32 v10, vcc, s6, v10 +; VI-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_bfe_u32 v11, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v10 +; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; VI-NEXT: v_bfe_u32 v11, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v11, vcc, v11, v2 +; VI-NEXT: v_add_u32_e32 v11, vcc, s6, v11 +; VI-NEXT: v_or_b32_e32 v12, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_bfe_u32 v12, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v11 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; VI-NEXT: v_bfe_u32 v12, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v3 +; VI-NEXT: v_add_u32_e32 v12, vcc, s6, v12 +; VI-NEXT: v_or_b32_e32 v13, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v12, v13, vcc +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_bfe_u32 v13, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; VI-NEXT: v_bfe_u32 v13, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v13, vcc, v13, v4 +; VI-NEXT: v_add_u32_e32 v13, vcc, s6, v13 +; VI-NEXT: v_or_b32_e32 v14, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v13, v14, vcc +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_bfe_u32 v14, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v13 +; VI-NEXT: v_add_u32_e32 v14, vcc, s6, v14 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: 
v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; VI-NEXT: v_bfe_u32 v14, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v14, vcc, v14, v5 +; VI-NEXT: v_add_u32_e32 v14, vcc, s6, v14 +; VI-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc +; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_bfe_u32 v15, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v14 +; VI-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; VI-NEXT: v_bfe_u32 v15, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v15, vcc, v15, v6 +; VI-NEXT: v_add_u32_e32 v15, vcc, s6, v15 +; VI-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc +; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_bfe_u32 v16, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; VI-NEXT: v_add_u32_e32 v16, vcc, s6, v16 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; VI-NEXT: v_bfe_u32 v16, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v16, vcc, v16, v7 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x7fff, v16 +; VI-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v15, 16 +; VI-NEXT: v_alignbit_b32 v6, v6, v14, 16 +; VI-NEXT: v_alignbit_b32 v5, v5, v13, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v12, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v11, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v10, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v9, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v16f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v9, v9, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX9-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc +; GFX9-NEXT: 
v_lshlrev_b32_e32 v9, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v11, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_bfe_u32 v11, v10, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v11, v11, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc +; GFX9-NEXT: v_bfe_u32 v11, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v11, v11, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_bfe_u32 v12, v11, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v12, v12, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc +; GFX9-NEXT: v_bfe_u32 v12, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v12, v12, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v13, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v13, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v13, v13, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX9-NEXT: v_bfe_u32 v13, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v13, v13, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v14, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_bfe_u32 v14, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v14, v14, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v14, v15, vcc +; GFX9-NEXT: v_bfe_u32 v14, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v14, v14, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, v15, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX9-NEXT: 
v_bfe_u32 v15, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; GFX9-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v16, v16, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v7, v7, v15, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v14, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v13, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v12, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v11, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v10, s6 +; GFX9-NEXT: v_perm_b32 v1, v1, v9, s6 +; GFX9-NEXT: v_perm_b32 v0, v0, v8, s6 +; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v8 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v9, 0x40c00000, v9 :: v_dual_lshlrev_b32 v8, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v13, v9, 16, 1 +; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v13, v13, v9, 0x7fff +; GFX11-NEXT: v_add3_u32 v11, v11, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v8, v11, v14 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v14, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v12, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v12, v12, v0, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v15 :: v_dual_lshlrev_b32 v15, 16, v4 +; GFX11-NEXT: v_bfe_u32 v12, v1, 16, 1 +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v0, v8, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v13, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v12, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v11, v12 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX11-NEXT: v_perm_b32 v1, v1, v9, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v13, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add3_u32 v11, v13, v10, 0x7fff +; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v12, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v13, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v13, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo +; GFX11-NEXT: v_add3_u32 v11, v13, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v14, v15, 16, 1 +; GFX11-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v11, v11, v12 :: v_dual_lshlrev_b32 v12, 16, v5 +; GFX11-NEXT: v_add3_u32 v14, v14, v15, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_add3_u32 v15, v18, v4, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v13, v13, v3, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v17, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v19, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX11-NEXT: v_add3_u32 v18, v19, v12, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v6 +; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v15, v17 :: v_dual_add_f32 v15, 0x40c00000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_add3_u32 v17, v21, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v18, v20, vcc_lo +; GFX11-NEXT: 
v_lshlrev_b32_e32 v18, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15 +; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff +; GFX11-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v19, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v21, v22, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-NEXT: v_add3_u32 v23, v23, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v18 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v21, v22, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_add3_u32 v19, v19, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v15, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v23, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v19, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v7, v7, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x7060302 +; GFX11-NEXT: v_perm_b32 v5, v5, v12, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v13, v16, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v3, v3, v11, 0x7060302 +; GFX11-NEXT: .LBB51_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <32 x i8> @bitcast_v16f16_to_v32i8(<16 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f16_to_v32i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v17, v14 +; GCN-NEXT: v_mov_b32_e32 v18, v6 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v17 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; 
GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB52_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB52_4 +; GCN-NEXT: .LBB52_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB52_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 +; GCN-NEXT: v_bfe_u32 v23, v22, 8, 8 +; GCN-NEXT: v_or_b32_e32 v0, v33, v0 +; GCN-NEXT: v_or_b32_e32 v4, v32, v1 +; GCN-NEXT: v_or_b32_e32 v8, v35, v2 +; GCN-NEXT: v_or_b32_e32 v12, v34, v3 +; GCN-NEXT: v_or_b32_e32 v16, v38, v5 +; GCN-NEXT: v_or_b32_e32 v20, v36, v9 +; GCN-NEXT: v_or_b32_e32 v24, v49, v10 +; GCN-NEXT: v_or_b32_e32 v28, v48, v11 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_bfe_u32 v31, v30, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB52_2 +; GCN-NEXT: .LBB52_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v39 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v10, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v32 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v6 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: v_bfe_u32 v15, v14, 8, 8 +; GCN-NEXT: v_bfe_u32 v23, v22, 8, 8 +; GCN-NEXT: v_or_b32_e32 v24, v1, v0 +; GCN-NEXT: v_or_b32_e32 v28, v2, v12 +; GCN-NEXT: v_or_b32_e32 v16, v4, v3 +; GCN-NEXT: v_or_b32_e32 v20, v5, v17 +; GCN-NEXT: v_or_b32_e32 v8, v8, v18 +; GCN-NEXT: v_or_b32_e32 v12, v9, v19 +; GCN-NEXT: v_or_b32_e32 v0, v11, v10 +; GCN-NEXT: v_or_b32_e32 v4, v13, v21 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_bfe_u32 v31, v30, 8, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v35, v5 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_mov_b32_e32 v33, v3 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB52_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v5, 0x200 +; VI-NEXT: v_add_f16_sdwa v14, v33, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v36, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; VI-NEXT: v_add_f16_e32 v33, 0x200, v33 +; VI-NEXT: v_add_f16_sdwa v10, v32, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_or_b32_e32 v12, v33, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_add_f16_sdwa v22, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v1, v2 +; VI-NEXT: v_add_f16_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v32, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; VI-NEXT: v_add_f16_e32 v35, 0x200, v35 +; VI-NEXT: v_add_f16_sdwa v18, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v30, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v26, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v9, v35, v8 +; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v18 +; VI-NEXT: v_add_f16_e32 v34, 0x200, v34 +; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; VI-NEXT: v_add_f16_e32 v6, 
0x200, v6 +; VI-NEXT: v_or_b32_e32 v3, v0, v3 +; VI-NEXT: v_or_b32_e32 v8, v34, v8 +; VI-NEXT: v_or_b32_e32 v16, v7, v13 +; VI-NEXT: v_or_b32_e32 v15, v6, v5 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8 +; VI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; VI-NEXT: v_bfe_u32 v23, v22, 8, 8 +; VI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; VI-NEXT: v_bfe_u32 v37, v36, 8, 8 +; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v32 +; VI-NEXT: v_mov_b32_e32 v12, v33 +; VI-NEXT: v_mov_b32_e32 v16, v34 +; VI-NEXT: v_mov_b32_e32 v20, v35 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v36 +; VI-NEXT: v_mov_b32_e32 v7, v37 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f16_to_v32i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 
v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: .LBB52_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v35, v35, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v34, v34, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v33, v33, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v32, v32, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v38 +; GFX9-NEXT: v_mov_b32_e32 v6, v37 +; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f16_to_v32i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2 +; GFX11-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; 
GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB52_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v39, 0x200, v39 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v37, 0x200, v37 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v35, 0x200, v35 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v33, 0x200, v33 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v34, 0x200, v34 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v36, 0x200, v36 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v38, 0x200, v38 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB52_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-NEXT: v_mov_b32_e32 
v24, v32 +; GFX11-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x half> %a, splat (half 0xH0200) + %a2 = bitcast <16 x half> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x half> %a to <32 x i8> + br label %end + +end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <16 x half> @bitcast_v32i8_to_v16f16(<32 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i8_to_v16f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 +; GCN-NEXT: v_or_b32_e32 v0, v0, v32 +; GCN-NEXT: v_or_b32_e32 v1, v1, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v34 +; GCN-NEXT: v_or_b32_e32 v3, v3, v35 +; GCN-NEXT: v_or_b32_e32 v4, v4, v36 +; GCN-NEXT: v_or_b32_e32 v5, v5, v37 +; GCN-NEXT: v_or_b32_e32 v6, v6, v38 +; GCN-NEXT: v_or_b32_e32 v7, v7, v39 +; GCN-NEXT: v_or_b32_e32 v8, v8, v48 +; GCN-NEXT: v_or_b32_e32 v9, v9, v49 +; GCN-NEXT: v_or_b32_e32 v10, v10, v50 +; GCN-NEXT: v_or_b32_e32 v11, v11, v51 +; GCN-NEXT: v_or_b32_e32 v12, v12, v52 +; GCN-NEXT: v_or_b32_e32 v13, v13, v53 +; GCN-NEXT: v_or_b32_e32 v14, v14, v54 +; GCN-NEXT: v_or_b32_e32 v15, v15, v55 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: .LBB53_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_or_b32_e32 v3, v54, v3 +; GCN-NEXT: v_or_b32_e32 v5, v53, v5 +; GCN-NEXT: v_or_b32_e32 v7, v52, v7 +; GCN-NEXT: v_or_b32_e32 v9, v51, v9 +; GCN-NEXT: v_or_b32_e32 v11, v50, v11 +; 
GCN-NEXT: v_or_b32_e32 v13, v49, v13 +; GCN-NEXT: v_or_b32_e32 v15, v48, v15 +; GCN-NEXT: v_or_b32_e32 v14, v39, v14 +; GCN-NEXT: v_or_b32_e32 v12, v38, v12 +; GCN-NEXT: v_or_b32_e32 v10, v37, v10 +; GCN-NEXT: v_or_b32_e32 v8, v36, v8 +; GCN-NEXT: v_or_b32_e32 v6, v35, v6 +; GCN-NEXT: v_or_b32_e32 v4, v34, v4 +; GCN-NEXT: v_or_b32_e32 v2, v33, v2 +; GCN-NEXT: v_or_b32_e32 v0, v32, v0 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x300, v1 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v16 +; GCN-NEXT: .LBB53_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v31 +; GCN-NEXT: v_mov_b32_e32 v2, v23 +; GCN-NEXT: v_mov_b32_e32 v4, v19 +; GCN-NEXT: v_mov_b32_e32 v6, v27 +; GCN-NEXT: v_mov_b32_e32 v8, v17 +; GCN-NEXT: v_mov_b32_e32 v10, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v29 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v16f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; 
VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; 
implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v1, 0x300 +; VI-NEXT: v_add_u16_sdwa v7, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_or_b32_sdwa v28, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v24 +; VI-NEXT: v_or_b32_sdwa v24, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v34 +; VI-NEXT: v_or_b32_sdwa v10, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v12 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, 
v31 +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v16 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v4, v8, v4 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v20 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v5, v8, v5 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v28 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i8_to_v16f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v31, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v15 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: 
v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v10, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v9, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v8, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v11, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v15, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v23, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i8_to_v16f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v38, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v2 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; 
GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v35 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v38 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v48 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v49 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v23 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v6, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v10, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v12, v11, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: .LBB53_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v30, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v20, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 
0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v5, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v19, v3 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v4 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v17, v5 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-NEXT: v_add_nc_u16 v2, v8, 3 +; GFX11-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-NEXT: v_add_nc_u16 v14, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v13, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v11, v4 +; GFX11-NEXT: v_add_nc_u16 v4, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v11, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v34, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v2, v48, v2 +; GFX11-NEXT: v_or_b32_e32 v4, v38, v4 +; GFX11-NEXT: v_or_b32_e32 v11, v37, v11 +; GFX11-NEXT: v_or_b32_e32 v13, v35, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v36, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_perm_b32 v0, v11, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v13, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v9, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v15, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v12, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v17, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x half> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x half> + br label %end + +end: + %phi = phi <16 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x half> %phi +} + +define <32 x i8> @bitcast_v16bf16_to_v32i8(<16 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v16bf16_to_v32i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v0 +; 
GCN-NEXT: v_mul_f32_e32 v34, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v14 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB54_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB54_4 +; GCN-NEXT: .LBB54_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB54_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v53 +; GCN-NEXT: v_alignbit_b32 v0, v0, v33, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v32, 16 +; GCN-NEXT: v_alignbit_b32 v8, v1, v36, 16 +; GCN-NEXT: v_alignbit_b32 v12, v14, v35, 16 +; GCN-NEXT: v_alignbit_b32 v16, v2, v48, 16 +; GCN-NEXT: v_alignbit_b32 v20, v22, v39, 16 +; GCN-NEXT: v_alignbit_b32 v24, v3, v52, 16 +; GCN-NEXT: v_alignbit_b32 v28, v30, v51, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: 
v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB54_2 +; GCN-NEXT: .LBB54_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v48 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v54 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v34 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v16 +; GCN-NEXT: v_alignbit_b32 v24, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v28, v30, v2, 16 +; GCN-NEXT: v_alignbit_b32 v16, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v20, v22, v5, 16 +; GCN-NEXT: v_alignbit_b32 v8, v17, v8, 16 +; GCN-NEXT: v_alignbit_b32 v12, v14, v9, 16 +; GCN-NEXT: v_alignbit_b32 v0, v18, v11, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v13, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, v12, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, v12, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, v12, v8, 8 +; GCN-NEXT: v_alignbit_b32 v19, v20, v16, 24 +; GCN-NEXT: v_alignbit_b32 v18, v20, 
v16, 16 +; GCN-NEXT: v_alignbit_b32 v17, v20, v16, 8 +; GCN-NEXT: v_alignbit_b32 v27, v28, v24, 24 +; GCN-NEXT: v_alignbit_b32 v26, v28, v24, 16 +; GCN-NEXT: v_alignbit_b32 v25, v28, v24, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 8, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16bf16_to_v32i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v5 +; VI-NEXT: v_mov_b32_e32 v32, v4 +; VI-NEXT: v_mov_b32_e32 v35, v3 +; VI-NEXT: v_mov_b32_e32 v34, v2 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: .LBB54_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: 
v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v35, v3, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v34 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v34, v3, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v33, v3, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, 
v4, vcc +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v32, v3, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v6, v3, v2, 16 +; VI-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; VI-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v1 +; VI-NEXT: v_mov_b32_e32 v8, v34 +; VI-NEXT: v_mov_b32_e32 v12, v35 +; VI-NEXT: v_mov_b32_e32 v16, v32 +; VI-NEXT: v_mov_b32_e32 v20, v33 +; VI-NEXT: v_mov_b32_e32 v24, v6 +; VI-NEXT: v_mov_b32_e32 v28, v7 +; VI-NEXT: v_mov_b32_e32 v1, v38 +; VI-NEXT: v_mov_b32_e32 v6, v37 +; VI-NEXT: v_mov_b32_e32 v7, v36 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16bf16_to_v32i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_mov_b32_e32 v33, v5 +; GFX9-NEXT: v_mov_b32_e32 v32, v4 +; GFX9-NEXT: v_mov_b32_e32 v35, v3 +; GFX9-NEXT: v_mov_b32_e32 v34, v2 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v33 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v32 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v0 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[32:33] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[34:35] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: .LBB54_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v5, 
vcc +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX9-NEXT: v_perm_b32 v3, v0, v5, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v35 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v35 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v34 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v10, v11, vcc +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v11, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v11, vcc +; GFX9-NEXT: v_perm_b32 v11, v9, v13, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v33 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v33 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v10, v14, vcc +; GFX9-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v10, v10, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v10, v15, vcc +; GFX9-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v32 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v15, v16, vcc +; GFX9-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v15, v15, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v16, v19, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v16, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; 
GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_bfe_u32 v15, v6, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc +; GFX9-NEXT: v_add3_u32 v15, v15, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v16, vcc +; GFX9-NEXT: v_perm_b32 v4, v8, v2, s7 +; GFX9-NEXT: v_perm_b32 v12, v1, v0, s7 +; GFX9-NEXT: v_perm_b32 v10, v17, v14, s7 +; GFX9-NEXT: v_perm_b32 v9, v9, v18, s7 +; GFX9-NEXT: v_perm_b32 v16, v7, v19, s7 +; GFX9-NEXT: v_perm_b32 v15, v6, v20, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 8, v15 +; GFX9-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 16, v8 +; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v34 +; GFX9-NEXT: v_mov_b32_e32 v12, v35 +; GFX9-NEXT: v_mov_b32_e32 v16, v32 +; GFX9-NEXT: v_mov_b32_e32 v20, v33 +; GFX9-NEXT: v_mov_b32_e32 v24, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v38 +; GFX9-NEXT: v_mov_b32_e32 v6, v37 +; GFX9-NEXT: v_mov_b32_e32 v7, v36 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16bf16_to_v32i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v7 :: v_dual_mov_b32 v32, v6 +; GFX11-NEXT: v_dual_mov_b32 v35, v5 :: v_dual_mov_b32 v34, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v3 :: v_dual_mov_b32 v36, v2 +; GFX11-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v38, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: 
$vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v33 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v32 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v35 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v34 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[32:33] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[34:35] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[38:39] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; GFX11-NEXT: .LBB54_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v34 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add3_u32 v1, v9, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc_lo +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v0, 0x400000, v2 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v37 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v0, vcc_lo +; GFX11-NEXT: v_dual_add_f32 v7, 0x40c00000, v8 :: v_dual_add_f32 v6, 0x40c00000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v9, vcc_lo +; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff +; GFX11-NEXT: v_bfe_u32 v12, v3, 16, 1 +; GFX11-NEXT: v_perm_b32 v1, v5, v4, 0x7060302 +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v9, v11, vcc_lo +; GFX11-NEXT: v_add3_u32 v9, v12, v3, 0x7fff +; GFX11-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v35 +; GFX11-NEXT: v_perm_b32 v8, v11, v6, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v9, 0x40c00000, v12 +; GFX11-NEXT: v_add3_u32 v12, v13, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v35 +; GFX11-NEXT: v_bfe_u32 v15, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v12, v13 :: v_dual_add_f32 v12, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_add3_u32 v13, v15, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX11-NEXT: v_perm_b32 v7, v7, v3, 0x7060302 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX11-NEXT: v_dual_cndmask_b32 v13, v13, v14 :: v_dual_add_f32 v14, 0x40c00000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v16, v17, v12, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GFX11-NEXT: v_bfe_u32 v15, v10, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_bfe_u32 v19, v14, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v9, v15, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v10 +; GFX11-NEXT: v_add3_u32 v10, v19, v14, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v20, v9, v15 :: v_dual_add_f32 v9, 0x40c00000, v17 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v33 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v14 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v16, 
v18, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v16, v9, 16, 1 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; GFX11-NEXT: v_dual_cndmask_b32 v14, v10, v15 :: v_dual_add_f32 v15, 0x40c00000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v18 +; GFX11-NEXT: v_add3_u32 v16, v16, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1 +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v17 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc_lo +; GFX11-NEXT: v_add3_u32 v16, v19, v15, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v19, v16, v19, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v32 +; GFX11-NEXT: v_perm_b32 v16, v19, v18, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v21, v22, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v17 +; GFX11-NEXT: v_bfe_u32 v9, v10, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v9, v9, v10, 0x7fff +; GFX11-NEXT: v_perm_b32 v10, v20, v13, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v9, v23, vcc_lo +; GFX11-NEXT: v_perm_b32 v9, v14, v12, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 24, v10 +; GFX11-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 8, v9 +; GFX11-NEXT: v_lshrrev_b64 v[27:28], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: .LBB54_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v38 +; GFX11-NEXT: v_mov_b32_e32 v4, v39 +; GFX11-NEXT: v_mov_b32_e32 v8, v36 +; GFX11-NEXT: v_mov_b32_e32 v12, v37 +; GFX11-NEXT: v_mov_b32_e32 v16, v34 +; GFX11-NEXT: v_mov_b32_e32 v20, v35 +; GFX11-NEXT: v_mov_b32_e32 v24, v32 +; GFX11-NEXT: v_mov_b32_e32 v28, v33 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <16 x bfloat> %a1 to <32 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x bfloat> %a to <32 x i8> + br label %end + 
+end: + %phi = phi <32 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i8> %phi +} + +define <16 x bfloat> @bitcast_v32i8_to_v16bf16(<32 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i8_to_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v17 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v25 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v3, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v13, v13, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v18, v18, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GCN-NEXT: v_or_b32_e32 v31, v1, v0 +; GCN-NEXT: v_or_b32_e32 v35, v36, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v37, v4 +; GCN-NEXT: v_or_b32_e32 v33, v6, v5 +; GCN-NEXT: v_or_b32_e32 v5, v38, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v8 +; GCN-NEXT: v_or_b32_e32 v7, v39, v9 
+; GCN-NEXT: v_or_b32_e32 v23, v11, v10 +; GCN-NEXT: v_or_b32_e32 v27, v48, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; GCN-NEXT: v_or_b32_e32 v11, v49, v14 +; GCN-NEXT: v_or_b32_e32 v32, v16, v15 +; GCN-NEXT: v_or_b32_e32 v13, v50, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GCN-NEXT: v_or_b32_e32 v15, v51, v20 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: .LBB55_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v28 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v30 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v25 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v55, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v7, v11, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v54, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v19 +; GCN-NEXT: v_or_b32_e32 v16, v17, v16 +; GCN-NEXT: 
v_lshlrev_b32_e32 v17, 16, v18 +; GCN-NEXT: v_or_b32_e32 v12, v53, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v10 +; GCN-NEXT: v_or_b32_e32 v4, v52, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 +; GCN-NEXT: v_or_b32_e32 v3, v51, v5 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v7 +; GCN-NEXT: v_or_b32_e32 v7, v50, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v13 +; GCN-NEXT: v_or_b32_e32 v11, v49, v15 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v16 +; GCN-NEXT: v_or_b32_e32 v15, v48, v17 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_or_b32_e32 v14, v39, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v38, v9 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v6, v37, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_or_b32_e32 v1, v36, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v5 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v15, v13 +; GCN-NEXT: v_or_b32_e32 v10, v14, v12 +; GCN-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v8 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v6 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v2 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; GCN-NEXT: .LBB55_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v31 +; GCN-NEXT: v_mov_b32_e32 v1, v35 +; GCN-NEXT: v_mov_b32_e32 v2, v21 +; GCN-NEXT: v_mov_b32_e32 v4, v33 +; GCN-NEXT: v_mov_b32_e32 v6, v19 +; GCN-NEXT: v_mov_b32_e32 v8, v23 +; GCN-NEXT: v_mov_b32_e32 v9, v27 +; GCN-NEXT: v_mov_b32_e32 v10, v29 +; GCN-NEXT: v_mov_b32_e32 v12, v32 +; GCN-NEXT: v_mov_b32_e32 v14, v34 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i8_to_v16bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v33, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v32, v6 +; VI-NEXT: v_mov_b32_e32 v34, v4 +; 
VI-NEXT: v_lshlrev_b16_e32 v35, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v2 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v32, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v8, v30, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB55_2 +; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v1, 0x300 +; VI-NEXT: v_add_u16_sdwa v7, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_or_b32_sdwa v28, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v24 +; VI-NEXT: v_or_b32_sdwa v24, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v20, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v16 +; VI-NEXT: v_or_b32_sdwa v16, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v12 +; VI-NEXT: v_or_b32_sdwa v12, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v10 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v34 +; VI-NEXT: v_or_b32_sdwa v10, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v12 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v31 +; VI-NEXT: v_or_b32_e32 v3, v8, v3 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v16 +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v4, v8, v4 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v20 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v5, v8, v5 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v24 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v6, v8, v6 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v28 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_or_b32_e32 v7, v8, v7 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i8_to_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v33, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v31, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v2 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v1, v32, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB55_2 +; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 +; GFX9-NEXT: 
v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v35, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v10, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v9, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v8, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v11, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v15, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v23, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i8_to_v16bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v34, v2 :: v_dual_mov_b32 v33, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: 
v_dual_mov_b32 v31, v6 :: v_dual_mov_b32 v32, v4 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v35, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v38, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v9, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v2 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v35 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v38 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v48 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v18 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v24 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v30 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v49 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v21 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v23 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v25 +; GFX11-NEXT: v_perm_b32 v3, v4, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v6, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v10, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v12, v11, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; 
implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB55_2 +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v30, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v20, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v5, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v25, v1 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v19, v3 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v4 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v17, v5 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v14, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v15, v0 +; GFX11-NEXT: v_add_nc_u16 v2, v8, 3 +; GFX11-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-NEXT: v_add_nc_u16 v14, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v13, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v11, v4 +; GFX11-NEXT: v_add_nc_u16 v4, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v11, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v34, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v2, v48, v2 +; GFX11-NEXT: v_or_b32_e32 v4, v38, v4 +; GFX11-NEXT: v_or_b32_e32 v11, v37, v11 +; GFX11-NEXT: v_or_b32_e32 v13, v35, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v36, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_perm_b32 v0, 
v11, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v13, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v9, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v15, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v12, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v17, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i8> %a, splat (i8 3) + %a2 = bitcast <32 x i8> %a1 to <16 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <32 x i8> %a to <16 x bfloat> + br label %end + +end: + %phi = phi <16 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll new file mode 100644 index 0000000000000..0adf547e19362 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v9i32_to_v9f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9i32_to_v9f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9i32_to_v9f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; 
GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9i32_to_v9f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v9 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i32> %a, splat (i32 3) + %a2 = bitcast <9 x i32> %a1 to <9 x float> + br label %end + +cmp.false: + %a3 = bitcast <9 x i32> %a to <9 x float> + br label %end + +end: + %phi = phi <9 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x float> %phi +} + +define <9 x i32> @bitcast_v9f32_to_v9i32(<9 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v9f32_to_v9i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9f32_to_v9i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9f32_to_v9i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9f32_to_v9i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v9 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <9 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <9 x float> %a1 to <9 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x float> %a to <9 x i32> + br label %end + +end: + %phi = phi <9 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i32> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll new file mode 100644 index 0000000000000..f1e0c19f7fca3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -0,0 +1,12523 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v10i32_to_v10f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <10 x i32> @bitcast_v10f32_to_v10i32(<10 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v10f32_to_v10i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; 
GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x half> @bitcast_v10i32_to_v20f16(<10 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v10i32_to_v20f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v29, v9 +; GCN-NEXT: v_mov_b32_e32 v28, v8 +; GCN-NEXT: v_mov_b32_e32 v27, v7 +; GCN-NEXT: v_mov_b32_e32 v26, v6 +; GCN-NEXT: v_mov_b32_e32 v25, v5 +; GCN-NEXT: v_mov_b32_e32 v24, v4 +; 
GCN-NEXT: v_mov_b32_e32 v23, v3 +; GCN-NEXT: v_mov_b32_e32 v22, v2 +; GCN-NEXT: v_mov_b32_e32 v21, v1 +; GCN-NEXT: v_mov_b32_e32 v20, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB2_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB2_4 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB2_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: .LBB2_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v20f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; 
GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <10 x i32> @bitcast_v20f16_to_v10i32(<20 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v20f16_to_v10i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v18 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_4 +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB3_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; GCN-NEXT: v_or_b32_e32 v0, v29, v0 +; GCN-NEXT: v_or_b32_e32 v1, v27, v1 +; GCN-NEXT: v_or_b32_e32 v2, v25, v2 +; GCN-NEXT: v_or_b32_e32 v3, v23, v3 +; GCN-NEXT: v_or_b32_e32 v4, v22, v4 +; GCN-NEXT: v_or_b32_e32 v5, v21, v5 +; GCN-NEXT: v_or_b32_e32 v6, v20, v6 +; GCN-NEXT: v_or_b32_e32 v7, v12, v7 +; GCN-NEXT: v_or_b32_e32 v8, v11, v8 +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; 
implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: .LBB3_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v17, v16 +; GCN-NEXT: v_or_b32_e32 v6, v19, v18 +; GCN-NEXT: v_or_b32_e32 v7, v12, v15 +; GCN-NEXT: v_or_b32_e32 v8, v11, v14 +; GCN-NEXT: 
v_or_b32_e32 v9, v10, v13 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10i32: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v10i32_to_v40i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, 
v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB4_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB4_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: 
v_lshlrev_b32_e32 v20, 24, v20 +; GCN-NEXT: v_or_b32_e32 v35, v49, v35 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; GCN-NEXT: v_or_b32_e32 v48, v50, v48 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GCN-NEXT: v_or_b32_e32 v3, v3, v29 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; GCN-NEXT: v_or_b32_e32 v4, v4, v37 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_or_b32_e32 v5, v5, v23 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; GCN-NEXT: v_or_b32_e32 v6, v6, v31 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_or_b32_e32 v7, v7, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v8, v8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; GCN-NEXT: v_or_b32_e32 v9, v9, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v10, v10, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GCN-NEXT: v_or_b32_e32 v19, v32, v33 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 +; GCN-NEXT: v_or_b32_e32 v25, v38, v39 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v27, v34, v36 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_or_b32_e32 v20, v20, v21 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v21, v28, v30 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v15, v22, v17 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v12, v16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v18, v19 +; GCN-NEXT: v_or_b32_e32 v16, v24, v25 +; GCN-NEXT: v_or_b32_e32 v3, v3, v26 +; GCN-NEXT: v_or_b32_e32 v4, v4, v27 +; GCN-NEXT: v_or_b32_e32 v5, v5, v20 +; GCN-NEXT: v_or_b32_e32 v6, v6, v21 +; GCN-NEXT: v_or_b32_e32 v7, v7, v14 +; GCN-NEXT: v_or_b32_e32 v8, v8, v15 +; GCN-NEXT: v_or_b32_e32 v9, v9, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v12 +; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword 
v7, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB4_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB4_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, 
v[5:6] +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB4_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, 
v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; 
implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB4_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB4_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, 
v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB4_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v40i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 
v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB4_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB4_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB4_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v16 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v31, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 
0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v39 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-NEXT: v_or_b32_e32 v15, v48, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_or_b32_e32 v16, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v29 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v27 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v32 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label 
%cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v40i8_to_v10i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v41 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v40 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v55 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v54 +; GCN-NEXT: v_or_b32_e32 
v0, v0, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v10, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v14, v14, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v18, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v22, v22, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v26, v26, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v36, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v37, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v9, v15, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v11, v17, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v13, v19, v16 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v15, v21, v20 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NEXT: v_or_b32_e32 v17, v23, v24 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v26 +; GCN-NEXT: v_or_b32_e32 v19, v25, v27 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: v_or_b32_e32 v8, v16, v17 +; GCN-NEXT: v_or_b32_e32 v9, v18, v19 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: .LBB5_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
+; GCN-NEXT: s_cbranch_execz .LBB5_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v55 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v54 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_or_b32_e32 v0, v38, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v49, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v50, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v51, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v14, v52, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v27, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v22, v29, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v26, v53, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v36, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v37, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v15, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_or_b32_e32 v11, v17, v12 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 +; GCN-NEXT: v_or_b32_e32 v13, v19, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v18 +; GCN-NEXT: v_or_b32_e32 v15, v21, v20 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v22 +; GCN-NEXT: v_or_b32_e32 v17, v23, v24 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x300, v26 +; GCN-NEXT: v_or_b32_e32 v19, v25, v27 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_or_b32_e32 v8, v17, v16 +; GCN-NEXT: v_or_b32_e32 v9, v19, v18 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; GCN-NEXT: .LBB5_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; 
VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: .LBB5_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB5_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v9, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 
v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v7, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v28 +; VI-NEXT: v_add_u16_e32 v8, 3, v30 +; VI-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v8, 3, v39 +; VI-NEXT: v_add_u16_e32 v10, 3, v38 +; VI-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v37 +; VI-NEXT: v_add_u16_e32 v12, 3, v36 +; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: .LBB5_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: bitcast_v40i8_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; 
implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: .LBB5_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB5_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; 
GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: .LBB5_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v40i8_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 +; GFX11-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 +; GFX11-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v37, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u16 v38, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v11 +; 
GFX11-NEXT: v_lshlrev_b16 v49, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v66 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB5_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB5_4 +; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB5_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v53 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v65 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v48 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v49 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v50 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v51 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v39 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v36 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v21 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v23 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v25 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-NEXT: v_or_b32_e32 v10, 
v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: .LBB5_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v53, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v55, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v65, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v48, 
v5 +; GFX11-NEXT: v_or_b32_e32 v6, v49, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v50, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v51, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v52, v9 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-NEXT: v_add_nc_u16 v5, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v12, v39, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v36, 3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v23, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v25, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v29, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v15, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v17, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v19, v18 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 
v9, v13, v14 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <5 x double> @bitcast_v10i32_to_v5f64(<10 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v10i32_to_v5f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB6_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: 
v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <10 x i32> @bitcast_v5f64_to_v10i32(<5 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v5f64_to_v10i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB7_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: 
v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <5 x i64> @bitcast_v10i32_to_v5i64(<10 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v10i32_to_v5i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB8_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i32_to_v5i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i32_to_v5i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10i32_to_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i32> %a, splat (i32 3) + %a2 = bitcast <10 x i32> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x i32> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <10 x i32> @bitcast_v5i64_to_v10i32(<5 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v5i64_to_v10i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v10i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i64_to_v10i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5i64_to_v10i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x i32> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x i32> + br label %end + +end: + %phi = phi <10 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i32> %phi +} + +define <20 x half> @bitcast_v10f32_to_v20f16(<10 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v10f32_to_v20f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v29, v9 +; GCN-NEXT: v_mov_b32_e32 v28, v8 +; GCN-NEXT: v_mov_b32_e32 v27, v7 +; GCN-NEXT: v_mov_b32_e32 v26, v6 +; GCN-NEXT: v_mov_b32_e32 v25, v5 +; GCN-NEXT: v_mov_b32_e32 v24, v4 +; GCN-NEXT: v_mov_b32_e32 v23, v3 +; GCN-NEXT: v_mov_b32_e32 v22, v2 +; GCN-NEXT: v_mov_b32_e32 v21, v1 +; GCN-NEXT: v_mov_b32_e32 v20, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: 
s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB10_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: .LBB10_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v21 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v23 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v24 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v25 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v26 +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v27 +; GCN-NEXT: v_add_f32_e32 v17, 1.0, v28 +; GCN-NEXT: v_add_f32_e32 v19, 1.0, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v20f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <10 x float> @bitcast_v20f16_to_v10f32(<20 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v20f16_to_v10f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v10 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v18 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_4 +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB11_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; GCN-NEXT: v_or_b32_e32 v0, v29, v0 +; GCN-NEXT: v_or_b32_e32 v1, v27, v1 +; GCN-NEXT: v_or_b32_e32 v2, v25, v2 +; GCN-NEXT: v_or_b32_e32 v3, v23, v3 +; GCN-NEXT: v_or_b32_e32 v4, v22, v4 +; GCN-NEXT: v_or_b32_e32 v5, v21, v5 +; GCN-NEXT: v_or_b32_e32 v6, v20, v6 +; GCN-NEXT: v_or_b32_e32 v7, v12, v7 +; GCN-NEXT: v_or_b32_e32 v8, v11, v8 +; GCN-NEXT: v_or_b32_e32 v9, v10, v9 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: .LBB11_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: 
v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v17, v16 +; GCN-NEXT: v_or_b32_e32 v6, v19, v18 +; GCN-NEXT: v_or_b32_e32 v7, v12, v15 +; GCN-NEXT: v_or_b32_e32 v8, v11, v14 +; GCN-NEXT: v_or_b32_e32 v9, v10, v13 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <10 x float> + br label %end 
+ +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v10f32_to_v40i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB12_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: 
v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v17, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB12_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; GCN-NEXT: v_or_b32_e32 v35, v49, v35 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; GCN-NEXT: v_or_b32_e32 v48, v50, v48 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GCN-NEXT: v_or_b32_e32 v3, v3, v29 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; GCN-NEXT: v_or_b32_e32 v4, v4, v37 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_or_b32_e32 v5, v5, v23 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; GCN-NEXT: v_or_b32_e32 v6, v6, v31 +; 
GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_or_b32_e32 v7, v7, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v8, v8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; GCN-NEXT: v_or_b32_e32 v9, v9, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v10, v10, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GCN-NEXT: v_or_b32_e32 v19, v32, v33 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 +; GCN-NEXT: v_or_b32_e32 v25, v38, v39 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v27, v34, v36 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_or_b32_e32 v20, v20, v21 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v21, v28, v30 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v15, v22, v17 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v12, v16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v18, v19 +; GCN-NEXT: v_or_b32_e32 v16, v24, v25 +; GCN-NEXT: v_or_b32_e32 v3, v3, v26 +; GCN-NEXT: v_or_b32_e32 v4, v4, v27 +; GCN-NEXT: v_or_b32_e32 v5, v5, v20 +; GCN-NEXT: v_or_b32_e32 v6, v6, v21 +; GCN-NEXT: v_or_b32_e32 v7, v7, v14 +; GCN-NEXT: v_or_b32_e32 v8, v8, v15 +; GCN-NEXT: v_or_b32_e32 v9, v9, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v12 +; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v49, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: 
; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 
v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; 
GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v40i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB12_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB12_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v7, 1.0, v7 +; 
GFX11-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB12_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v16 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v31, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v39 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-NEXT: v_or_b32_e32 v15, v48, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_or_b32_e32 v16, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v29 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v27 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: 
v_lshlrev_b16 v34, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v32 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v40i8_to_v10f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 +; GCN-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v41 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v40 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v55 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v54 +; GCN-NEXT: v_or_b32_e32 v0, v0, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v10, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v14, v14, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v18, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v22, v22, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v26, v26, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v36, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v37, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v9, v15, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: 
v_or_b32_e32 v11, v17, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v13, v19, v16 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v15, v21, v20 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GCN-NEXT: v_or_b32_e32 v17, v23, v24 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v26 +; GCN-NEXT: v_or_b32_e32 v19, v25, v27 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: v_or_b32_e32 v8, v16, v17 +; GCN-NEXT: v_or_b32_e32 v9, v18, v19 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: .LBB13_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v55 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v54 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, 
v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_or_b32_e32 v0, v38, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v49, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v50, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v51, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v14, v52, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v27, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v22, v29, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v26, v53, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v36, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v37, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v13, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v15, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_or_b32_e32 v11, v17, v12 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v14 +; GCN-NEXT: v_or_b32_e32 v13, v19, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v18 +; GCN-NEXT: v_or_b32_e32 v15, v21, v20 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v22 +; GCN-NEXT: v_or_b32_e32 v17, v23, v24 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x300, v26 +; GCN-NEXT: v_or_b32_e32 v19, v25, v27 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_or_b32_e32 v8, v17, v16 +; GCN-NEXT: v_or_b32_e32 v9, v19, v18 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; GCN-NEXT: .LBB13_4: ; %end +; GCN-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v55 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; 
implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: .LBB13_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB13_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v9, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v3, 3, v10 +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v12 +; VI-NEXT: v_add_u16_e32 v4, 3, v14 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v7, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v28 +; VI-NEXT: v_add_u16_e32 v8, 3, v30 +; VI-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v8, 3, v39 +; VI-NEXT: v_add_u16_e32 v10, 3, v38 +; VI-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v15, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v37 +; VI-NEXT: v_add_u16_e32 v12, 3, v36 +; VI-NEXT: v_or_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: .LBB13_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40i8_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:20 +; GFX9-NEXT: 
buffer_load_ushort v38, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v8 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v44 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v12, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v14, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v7, v26, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v39, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v38, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v37, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v36, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: .LBB13_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v42, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v12 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v52, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v51, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v50, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v49, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v48, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v23, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v21, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v19, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v39 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v8, v17, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v15, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 
v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v9, v13, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: .LBB13_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v40i8_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v35, v8 :: v_dual_mov_b32 v34, v6 +; GFX11-NEXT: v_dual_mov_b32 v33, v4 :: v_dual_mov_b32 v32, v2 +; GFX11-NEXT: v_mov_b32_e32 v31, v0 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v37, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u16 v38, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v66 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %end 
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v53 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v54 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v64 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v65 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v48 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v49 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v50 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v51 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v52 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v39 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v36 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v21 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v23 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v25 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; 
implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: .LBB13_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v10, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v12, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v53, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v55, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v65, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v48, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v49, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v50, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v51, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v52, v9 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; 
GFX11-NEXT: v_add_nc_u16 v5, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v12, v39, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v36, 3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v23, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v25, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v29, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v11, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v13, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v15, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v17, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v19, v18 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <5 x double> @bitcast_v10f32_to_v5f64(<10 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v10f32_to_v5f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: 
v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <10 x float> @bitcast_v5f64_to_v10f32(<5 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v5f64_to_v10f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; 
GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <5 x i64> @bitcast_v10f32_to_v5i64(<10 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v10f32_to_v5i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 
1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10f32_to_v5i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10f32_to_v5i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v10f32_to_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <10 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <10 x float> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <10 x float> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <10 x float> @bitcast_v5i64_to_v10f32(<5 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v5i64_to_v10f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true 
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v10f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i64_to_v10f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5i64_to_v10f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 
v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <10 x float> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <10 x float> + br label %end + +end: + %phi = phi <10 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x float> %phi +} + +define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v20f16_to_v40i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v3 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v7 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v11 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v15 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v19 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; 
implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GCN-NEXT: v_bfe_u32 v14, v21, 8, 8 +; GCN-NEXT: v_bfe_u32 v11, v4, 8, 8 +; GCN-NEXT: v_bfe_u32 v8, v3, 8, 8 +; GCN-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GCN-NEXT: v_or_b32_e32 v29, v50, v6 +; GCN-NEXT: v_or_b32_e32 v27, v49, v7 +; GCN-NEXT: v_or_b32_e32 v20, v52, v9 +; GCN-NEXT: v_or_b32_e32 v17, v51, v10 +; GCN-NEXT: v_or_b32_e32 v13, v54, v12 +; GCN-NEXT: v_or_b32_e32 v12, v53, v15 +; GCN-NEXT: v_or_b32_e32 v10, v42, v16 +; GCN-NEXT: v_or_b32_e32 v9, v40, v18 +; GCN-NEXT: v_or_b32_e32 v7, v45, v19 +; GCN-NEXT: v_or_b32_e32 v6, v44, v22 +; GCN-NEXT: v_alignbit_b32 v34, v27, v29, 24 +; GCN-NEXT: v_alignbit_b32 v36, v27, v29, 16 +; GCN-NEXT: v_alignbit_b32 v38, v27, v29, 8 +; GCN-NEXT: v_alignbit_b32 v32, v17, v20, 24 +; GCN-NEXT: v_alignbit_b32 v33, v17, v20, 16 +; GCN-NEXT: v_alignbit_b32 v35, v17, v20, 8 +; GCN-NEXT: v_alignbit_b32 v26, v12, v13, 24 +; GCN-NEXT: v_alignbit_b32 v28, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v31, v12, v13, 8 +; GCN-NEXT: v_alignbit_b32 v19, v9, v10, 24 +; GCN-NEXT: v_alignbit_b32 v24, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v25, v9, v10, 8 +; GCN-NEXT: v_alignbit_b32 v15, v6, v7, 24 +; GCN-NEXT: v_alignbit_b32 v16, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v18, v6, v7, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 8, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; GCN-NEXT: v_bfe_u32 v22, v1, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: .LBB18_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v50 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v19, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v49 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v21 +; GCN-NEXT: v_bfe_u32 v14, v21, 8, 8 +; GCN-NEXT: v_bfe_u32 v11, v4, 8, 8 +; GCN-NEXT: v_bfe_u32 v8, v3, 8, 8 +; GCN-NEXT: v_bfe_u32 v5, v2, 8, 8 +; GCN-NEXT: v_or_b32_e32 v7, v6, v13 +; GCN-NEXT: v_or_b32_e32 v6, v22, v17 +; GCN-NEXT: v_or_b32_e32 v10, v9, v20 +; GCN-NEXT: v_or_b32_e32 v9, v23, v25 +; GCN-NEXT: v_or_b32_e32 v13, v12, v26 +; GCN-NEXT: v_or_b32_e32 v12, v24, v27 +; GCN-NEXT: v_or_b32_e32 v20, v15, v28 +; GCN-NEXT: v_or_b32_e32 v17, v16, v29 +; GCN-NEXT: v_or_b32_e32 v29, v18, v30 +; GCN-NEXT: v_or_b32_e32 v27, v19, v31 +; GCN-NEXT: v_alignbit_b32 v34, v27, v29, 24 +; GCN-NEXT: v_alignbit_b32 v36, v27, v29, 16 +; GCN-NEXT: v_alignbit_b32 v38, v27, v29, 8 +; GCN-NEXT: v_alignbit_b32 v32, v17, v20, 24 +; GCN-NEXT: v_alignbit_b32 v33, v17, v20, 16 +; GCN-NEXT: v_alignbit_b32 v35, v17, v20, 8 +; GCN-NEXT: v_alignbit_b32 v26, v12, v13, 24 +; GCN-NEXT: v_alignbit_b32 v28, v12, v13, 16 +; GCN-NEXT: v_alignbit_b32 v31, v12, v13, 8 +; GCN-NEXT: v_alignbit_b32 v19, v9, v10, 24 +; GCN-NEXT: v_alignbit_b32 v24, v9, v10, 16 +; GCN-NEXT: v_alignbit_b32 v25, v9, v10, 8 +; GCN-NEXT: v_alignbit_b32 v15, v6, v7, 24 +; GCN-NEXT: v_alignbit_b32 v16, v6, v7, 16 +; GCN-NEXT: v_alignbit_b32 v18, v6, v7, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GCN-NEXT: 
v_lshrrev_b32_e32 v37, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 8, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; GCN-NEXT: v_bfe_u32 v22, v1, 8, 8 +; GCN-NEXT: .LBB18_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v14 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v0 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v39 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; GCN-NEXT: v_add_i32_e32 v52, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GCN-NEXT: v_add_i32_e32 v53, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v19 +; GCN-NEXT: v_add_i32_e32 v54, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v30 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GCN-NEXT: v_add_i32_e32 v55, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; GCN-NEXT: v_add_i32_e32 v41, vcc, 36, v0 +; GCN-NEXT: v_or_b32_e32 v29, v29, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_or_b32_e32 v27, v27, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v35, v50, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_or_b32_e32 v17, v17, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v13, v13, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_or_b32_e32 v12, v12, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v10, v10, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_or_b32_e32 v9, v9, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v7, v7, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v6, v6, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v23, v34, v36 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v21, v49, v21 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v35 +; GCN-NEXT: v_or_b32_e32 v29, v32, v33 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v4, v11, v4 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v13, v26, v28 +; 
GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v3, v8, v3 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v19, v24 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v15, v16 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v1, v22, v1 +; GCN-NEXT: v_or_b32_e32 v15, v18, v23 +; GCN-NEXT: v_or_b32_e32 v16, v25, v21 +; GCN-NEXT: v_or_b32_e32 v18, v27, v29 +; GCN-NEXT: v_or_b32_e32 v4, v17, v4 +; GCN-NEXT: v_or_b32_e32 v11, v11, v13 +; GCN-NEXT: v_or_b32_e32 v3, v12, v3 +; GCN-NEXT: v_or_b32_e32 v8, v8, v10 +; GCN-NEXT: v_or_b32_e32 v2, v9, v2 +; GCN-NEXT: v_or_b32_e32 v5, v5, v7 +; GCN-NEXT: v_or_b32_e32 v1, v6, v1 +; GCN-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v52, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v53, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v54, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v55, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v40, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v41, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v22, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; 
%cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v38, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v48, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB18_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v11, 0x200 +; VI-NEXT: v_add_f16_sdwa v23, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v23 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v26, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v39, v2, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v26 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v21, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v38, v1, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v21 +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v25, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v36, v4, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_add_f16_sdwa v19, v6, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v35, v3, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v24, v5, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v33, v6, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v18, v8, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v32, v5, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v18 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v22, v7, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v30, v8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v17, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v20, v9, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v29, v7, v12 +; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v17 +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v49, v10, v12 +; VI-NEXT: v_or_b32_e32 v48, v9, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[48:49] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[29:30] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[32:33] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[35:36] +; VI-NEXT: 
v_lshrrev_b64 v[15:16], 24, v[38:39] +; VI-NEXT: v_lshrrev_b32_e32 v27, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v48 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v30 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 +; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v33 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v37, 8, v35 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v39 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v38 +; VI-NEXT: v_bfe_u32 v29, v17, 8, 8 +; VI-NEXT: v_bfe_u32 v32, v18, 8, 8 +; VI-NEXT: v_bfe_u32 v35, v19, 8, 8 +; VI-NEXT: v_bfe_u32 v38, v21, 8, 8 +; VI-NEXT: v_bfe_u32 v48, v23, 8, 8 +; VI-NEXT: .LBB18_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v26, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v48 +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v38 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v21, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: 
v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v22, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB18_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB18_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 
8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v40i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB18_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB18_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v16 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v31, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v39 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-NEXT: v_or_b32_e32 v15, v48, v15 +; 
GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_or_b32_e32 v16, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v29 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v27 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v32 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <40 x i8> + br label 
%end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <20 x half> @bitcast_v40i8_to_v20f16(<40 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v40i8_to_v20f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 8, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v31 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr17 
+; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB19_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v56 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v57 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v58 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v59 +; GCN-NEXT: v_or_b32_e32 v0, v0, v36 +; GCN-NEXT: v_or_b32_e32 v1, v1, v37 +; GCN-NEXT: v_or_b32_e32 v2, v2, v38 +; GCN-NEXT: v_or_b32_e32 v3, v3, v39 +; GCN-NEXT: v_or_b32_e32 v4, v4, v48 +; GCN-NEXT: v_or_b32_e32 v5, v5, v49 +; GCN-NEXT: v_or_b32_e32 v6, v6, v50 +; GCN-NEXT: v_or_b32_e32 v7, v7, v51 +; GCN-NEXT: v_or_b32_e32 v8, v8, v52 +; GCN-NEXT: v_or_b32_e32 v9, v9, v53 +; GCN-NEXT: v_or_b32_e32 v10, v10, v54 +; GCN-NEXT: v_or_b32_e32 v11, v11, v55 +; GCN-NEXT: v_or_b32_e32 v12, v12, v40 +; GCN-NEXT: v_or_b32_e32 v13, v13, v41 +; GCN-NEXT: v_or_b32_e32 v14, v14, v42 +; GCN-NEXT: v_or_b32_e32 v15, v15, v43 +; GCN-NEXT: v_or_b32_e32 v16, v16, v44 +; GCN-NEXT: v_or_b32_e32 v17, v17, v45 +; GCN-NEXT: v_or_b32_e32 v18, v18, v46 +; GCN-NEXT: v_or_b32_e32 v19, v19, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; 
GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: .LBB19_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB19_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v59 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v58 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v57 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v47, v1 +; GCN-NEXT: v_or_b32_e32 v3, v46, v3 +; GCN-NEXT: v_or_b32_e32 v5, v45, v5 +; GCN-NEXT: v_or_b32_e32 v7, v44, v7 +; GCN-NEXT: v_or_b32_e32 v9, v43, v9 +; GCN-NEXT: v_or_b32_e32 v11, v42, v11 +; GCN-NEXT: v_or_b32_e32 v13, v41, v13 +; GCN-NEXT: v_or_b32_e32 v15, v40, v15 +; GCN-NEXT: v_or_b32_e32 v17, v55, v17 +; GCN-NEXT: v_or_b32_e32 v19, v54, v19 +; GCN-NEXT: v_or_b32_e32 v18, v53, v18 +; GCN-NEXT: v_or_b32_e32 v16, v52, v16 +; GCN-NEXT: v_or_b32_e32 v14, v51, v14 +; GCN-NEXT: v_or_b32_e32 v12, v50, v12 +; GCN-NEXT: v_or_b32_e32 v10, v49, v10 +; GCN-NEXT: v_or_b32_e32 v8, v48, v8 +; GCN-NEXT: v_or_b32_e32 v6, v39, v6 +; GCN-NEXT: v_or_b32_e32 v4, v38, v4 +; GCN-NEXT: v_or_b32_e32 v2, v37, v2 +; GCN-NEXT: v_or_b32_e32 v0, v36, v0 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x300, v1 +; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 
s6, v15 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v20 +; GCN-NEXT: .LBB19_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v31 +; GCN-NEXT: v_mov_b32_e32 v2, v23 +; GCN-NEXT: v_mov_b32_e32 v4, v35 +; GCN-NEXT: v_mov_b32_e32 v6, v27 +; GCN-NEXT: v_mov_b32_e32 v8, v33 +; GCN-NEXT: v_mov_b32_e32 v10, v21 +; GCN-NEXT: v_mov_b32_e32 v12, v25 +; GCN-NEXT: v_mov_b32_e32 v14, v29 +; GCN-NEXT: v_mov_b32_e32 v16, v32 +; GCN-NEXT: v_mov_b32_e32 v18, v34 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v34, v10 +; VI-NEXT: v_mov_b32_e32 v33, v8 +; VI-NEXT: v_mov_b32_e32 v35, v6 +; VI-NEXT: v_mov_b32_e32 v38, v4 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v36, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v31, v14 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_lshlrev_b16_e32 v39, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v49, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v10 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v36, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v34, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v37, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v31, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v51, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v53, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v55, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: ; %bb.3: ; %cmp.true 
+; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v1, 0x300 +; VI-NEXT: v_add_u16_sdwa v9, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v0, 3, v54 +; VI-NEXT: v_or_b32_sdwa v10, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v0, 3, v53 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v51 +; VI-NEXT: v_or_b32_sdwa v11, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_or_b32_sdwa v12, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v26 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v24 +; VI-NEXT: v_or_b32_sdwa v13, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v14, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v16 +; VI-NEXT: v_or_b32_sdwa v15, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_or_b32_sdwa v16, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v34 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v33 +; VI-NEXT: v_or_b32_sdwa v17, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v38 +; VI-NEXT: 
v_or_b32_sdwa v19, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v36 +; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v19 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v1, v1, v18 +; VI-NEXT: v_or_b32_e32 v2, v17, v2 +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_or_b32_e32 v4, v15, v4 +; VI-NEXT: v_or_b32_e32 v5, v14, v5 +; VI-NEXT: v_or_b32_e32 v6, v13, v6 +; VI-NEXT: v_or_b32_e32 v7, v12, v7 +; VI-NEXT: v_or_b32_e32 v8, v11, v8 +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40i8_to_v20f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v31, v10 +; GFX9-NEXT: v_mov_b32_e32 v32, v8 +; GFX9-NEXT: v_mov_b32_e32 v38, v6 +; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v2 +; GFX9-NEXT: v_mov_b32_e32 v36, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; 
GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v37, v14 +; GFX9-NEXT: v_mov_b32_e32 v34, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v10 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: v_or_b32_sdwa v2, v32, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: v_or_b32_sdwa v3, v34, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v37, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v29 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v55, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v53, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 +; GFX9-NEXT: v_or_b32_sdwa v9, v42, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v54, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v55 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v53 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v36 +; GFX9-NEXT: v_add_u16_e32 v19, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v19, v39, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v19 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v19, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v18, v1, s6 +; GFX9-NEXT: v_perm_b32 v2, v17, v2, s6 +; GFX9-NEXT: v_perm_b32 v3, v16, v3, s6 +; GFX9-NEXT: v_perm_b32 v4, v15, v4, s6 +; GFX9-NEXT: v_perm_b32 v5, v14, v5, s6 +; GFX9-NEXT: v_perm_b32 v6, v13, v6, s6 +; GFX9-NEXT: v_perm_b32 v7, v12, v7, s6 +; GFX9-NEXT: v_perm_b32 v8, v11, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v40i8_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v33, v10 :: v_dual_mov_b32 v34, v8 +; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v35, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:4 +; GFX11-NEXT: v_dual_mov_b32 v31, v14 :: v_dual_mov_b32 v32, v12 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_lshlrev_b16 v70, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v10 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB19_4 +; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB19_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 
v0, v0, v48 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v49 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v51 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v50 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v55 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v53 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v52 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v54 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v67 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v65 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v68 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v66 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v21 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v69 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v70 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v71 +; GFX11-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v10, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v12, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v14, v13, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: .LBB19_4: ; %cmp.true +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_nc_u16 v0, v68, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v66, 3 +; GFX11-NEXT: 
s_waitcnt vmcnt(1) +; GFX11-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v28, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v2, v67, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v70, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v71, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v29, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v27, v2 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v69, v4 +; GFX11-NEXT: v_add_nc_u16 v1, v30, 3 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v3, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v26, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v2, v24, 3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_add_nc_u16 v4, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_perm_b32 v8, v11, v8, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v25, v0 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v21, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v19, v4 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v31, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v54, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v53, v1 +; GFX11-NEXT: v_add_nc_u16 v2, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v17, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v52, v4 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_add_nc_u16 v17, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v19, v33, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GFX11-NEXT: v_or_b32_e32 v2, v55, v2 +; GFX11-NEXT: v_or_b32_e32 v4, v51, v4 +; GFX11-NEXT: v_or_b32_e32 v16, v48, v16 +; GFX11-NEXT: v_or_b32_e32 v17, v49, v17 +; GFX11-NEXT: v_or_b32_e32 v18, v50, v18 +; GFX11-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v0 +; GFX11-NEXT: v_perm_b32 v0, v17, v16, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v18, v4, 0x5040100 +; 
GFX11-NEXT: v_perm_b32 v2, v19, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v20, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v21, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v14, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v13, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v12, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v10, v9, 0x5040100 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <5 x double> @bitcast_v20f16_to_v5f64(<20 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v20f16_to_v5f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v18 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_4 +; GCN-NEXT: .LBB20_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB20_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; GCN-NEXT: v_or_b32_e32 v0, v35, v0 +; GCN-NEXT: v_or_b32_e32 v1, v33, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 +; GCN-NEXT: v_or_b32_e32 v2, v29, v2 +; GCN-NEXT: v_or_b32_e32 v3, v27, v3 +; GCN-NEXT: v_or_b32_e32 v4, v25, v4 +; GCN-NEXT: v_or_b32_e32 v5, v23, v5 +; GCN-NEXT: v_or_b32_e32 v6, v21, v6 +; GCN-NEXT: v_or_b32_e32 v7, v20, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v16, v9 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; 
implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_2 +; GCN-NEXT: .LBB20_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 
+; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v16, v17 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 
op_sel_hi:[1,0] +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} + +define <20 x half> @bitcast_v5f64_to_v20f16(<5 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v5f64_to_v20f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: .LBB21_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: .LBB21_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v24 +; GCN-NEXT: v_mov_b32_e32 v1, v29 +; GCN-NEXT: v_mov_b32_e32 v2, v20 +; GCN-NEXT: v_mov_b32_e32 v3, v28 +; GCN-NEXT: v_mov_b32_e32 v4, v21 +; GCN-NEXT: v_mov_b32_e32 v5, v27 +; GCN-NEXT: v_mov_b32_e32 v6, v22 +; GCN-NEXT: v_mov_b32_e32 v7, v26 +; GCN-NEXT: v_mov_b32_e32 v8, v23 +; GCN-NEXT: v_mov_b32_e32 v9, v25 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v20f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <5 x i64> @bitcast_v20f16_to_v5i64(<20 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v20f16_to_v5i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v18 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_4 +; GCN-NEXT: .LBB22_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB22_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; GCN-NEXT: v_or_b32_e32 v0, v35, v0 +; GCN-NEXT: v_or_b32_e32 v1, v33, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 
16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v19 +; GCN-NEXT: v_or_b32_e32 v2, v29, v2 +; GCN-NEXT: v_or_b32_e32 v3, v27, v3 +; GCN-NEXT: v_or_b32_e32 v4, v25, v4 +; GCN-NEXT: v_or_b32_e32 v5, v23, v5 +; GCN-NEXT: v_or_b32_e32 v6, v21, v6 +; GCN-NEXT: v_or_b32_e32 v7, v20, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v16, v9 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB22_2 +; GCN-NEXT: .LBB22_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v16, v17 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20f16_to_v5i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v10, 0x200 +; VI-NEXT: v_add_f16_sdwa v11, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v11 +; VI-NEXT: v_add_f16_sdwa v11, v8, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v11 +; VI-NEXT: v_add_f16_sdwa v11, v7, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v11 +; VI-NEXT: v_add_f16_sdwa v11, v6, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_add_f16_sdwa v11, v5, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_add_f16_sdwa v11, v4, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v11 +; VI-NEXT: v_add_f16_sdwa v11, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_add_f16_sdwa v11, v2, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v11 +; VI-NEXT: v_add_f16_sdwa v11, v1, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v10, v0, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v0, v0, v10 +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20f16_to_v5i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; 
GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20f16_to_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <20 x half> %a, splat (half 0xH0200) + %a2 = bitcast <20 x half> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x half> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <20 x half> @bitcast_v5i64_to_v20f16(<5 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v5i64_to_v20f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, v9 +; GCN-NEXT: v_mov_b32_e32 v21, v8 +; GCN-NEXT: v_mov_b32_e32 v24, v7 +; GCN-NEXT: v_mov_b32_e32 v23, v6 +; GCN-NEXT: v_mov_b32_e32 v26, v5 +; GCN-NEXT: v_mov_b32_e32 v25, v4 +; GCN-NEXT: v_mov_b32_e32 v28, v3 +; GCN-NEXT: v_mov_b32_e32 v27, v2 +; GCN-NEXT: v_mov_b32_e32 v29, v1 +; GCN-NEXT: v_mov_b32_e32 v20, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 
+; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_4 +; GCN-NEXT: .LBB23_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB23_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v20 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB23_2 +; GCN-NEXT: .LBB23_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v29, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v27 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v28, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v25 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v26, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v23 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v24, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v21 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v22, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 
v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v20 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v20f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i64_to_v20f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5i64_to_v20f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB23_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 
%b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <20 x half> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <20 x half> + br label %end + +end: + %phi = phi <20 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x half> %phi +} + +define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v40i8_to_v5f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; 
GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v46 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v45 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v44 +; GCN-NEXT: v_or_b32_e32 v0, v0, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v10, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v12, v12, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v16, v16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v20, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v39, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v49, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v50, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v9, v17, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v11, v19, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v13, v21, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v15, v23, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v17, v25, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v19, v27, v22 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: v_or_b32_e32 v8, v16, v17 +; GCN-NEXT: v_or_b32_e32 v9, v18, v19 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: 
; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: .LBB24_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v0, v51, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v53, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v54, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v55, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v40, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v12, v41, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v16, v42, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v43, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v39, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; 
GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v49, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v50, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v17, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_or_b32_e32 v11, v19, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_or_b32_e32 v13, v21, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_or_b32_e32 v15, v23, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_or_b32_e32 v17, v25, v18 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v20 +; GCN-NEXT: v_or_b32_e32 v19, v27, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_or_b32_e32 v8, v17, v16 +; GCN-NEXT: v_or_b32_e32 v9, v19, v18 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; GCN-NEXT: .LBB24_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v38, v14 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v36, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v37, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v39, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: .LBB24_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_4 +; VI-NEXT: 
; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v9, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v3, 3, v36 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v37 +; VI-NEXT: v_add_u16_e32 v4, 3, v38 +; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v40, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v55, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v54, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v53, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v7, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v52, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v51, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v28 +; VI-NEXT: v_add_u16_e32 v8, 3, v30 +; VI-NEXT: v_or_b32_sdwa v7, v27, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v8, 3, v50 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v21, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v48 +; VI-NEXT: v_add_u16_e32 v11, 3, v39 +; VI-NEXT: v_or_b32_sdwa v10, v19, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: .LBB24_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40i8_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v36, v10 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: 
buffer_load_ushort v39, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v38, v14 +; GFX9-NEXT: v_mov_b32_e32 v37, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, 
v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v39, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: .LBB24_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; 
GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v40, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v55, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v54, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v53, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v52, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v51, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v27, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_add_u16_e32 v8, 3, v50 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v21, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v48 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v9, v19, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: .LBB24_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v40i8_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:4 +; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v68, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v70, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; 
GFX11-NEXT: v_lshlrev_b16 v17, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v10 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB24_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB24_4 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB24_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v67 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v68 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v70 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v71 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v54 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v55 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v65 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v66 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v51 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v52 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v53 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v17 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v23 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, 
v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: .LBB24_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v67, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v68, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v69, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v70, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v71, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v54, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v55, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v64, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v65, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v66, v9 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: 
v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-NEXT: v_add_nc_u16 v5, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v11, v50, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v48, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v39, 3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v51, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v52, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v53, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v29, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v17, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v19, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v21, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v23, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v25, v14 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false 
] + ret <5 x double> %phi +} + +define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v5f64_to_v40i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v17, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v18, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v19, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v30, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB25_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v17, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v18, v6, v5, 16 +; 
GCN-NEXT: v_alignbit_b32 v19, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v30, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v31, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB25_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v35 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v38 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 12, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; GCN-NEXT: v_or_b32_e32 v49, v49, v50 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; GCN-NEXT: v_or_b32_e32 v48, v51, v48 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 20, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GCN-NEXT: v_or_b32_e32 v3, v3, v31 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 24, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; GCN-NEXT: v_or_b32_e32 v4, v4, v37 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_or_b32_e32 v5, v5, v19 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 32, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 8, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; GCN-NEXT: v_or_b32_e32 v6, v6, v28 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 36, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_or_b32_e32 v7, v7, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v8, v8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GCN-NEXT: v_or_b32_e32 v9, v9, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v10, v10, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v49 +; GCN-NEXT: v_or_b32_e32 v22, v32, v35 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 +; GCN-NEXT: v_or_b32_e32 v25, v33, v39 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v29, v29, v30 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v30, v34, v36 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_or_b32_e32 v17, v17, v18 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v18, v26, v27 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v15, v23, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v12, v20, v13 +; GCN-NEXT: v_or_b32_e32 v13, v21, v22 +; GCN-NEXT: v_or_b32_e32 v16, v24, v25 +; GCN-NEXT: v_or_b32_e32 v3, v3, v29 +; GCN-NEXT: v_or_b32_e32 v4, v4, v30 +; GCN-NEXT: v_or_b32_e32 v5, v5, v17 +; GCN-NEXT: v_or_b32_e32 v6, v6, v18 +; GCN-NEXT: v_or_b32_e32 v7, v7, v14 +; GCN-NEXT: v_or_b32_e32 v8, v8, v15 +; GCN-NEXT: v_or_b32_e32 v9, v9, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v12 +; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v38, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; 
VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; VI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v40i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: 
$vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB25_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, 
v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB25_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v16 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v31, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v39 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-NEXT: v_or_b32_e32 v15, v48, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_or_b32_e32 v16, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v29 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v27 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v32 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: 
v_or_b32_e32 v4, v4, v34 +; GFX11-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v40i8_to_v5i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v47 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v46 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v45 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v44 +; GCN-NEXT: v_or_b32_e32 v0, v0, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v4, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v6, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v8, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v10, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v12, v12, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v16, v16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v20, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v39, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v5, v49, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v7, v50, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v9, v17, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v11, v19, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_or_b32_e32 v13, v21, v13 +; 
GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v15, v23, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v17, v25, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GCN-NEXT: v_or_b32_e32 v19, v27, v22 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: v_or_b32_e32 v3, v6, v7 +; GCN-NEXT: v_or_b32_e32 v4, v8, v9 +; GCN-NEXT: v_or_b32_e32 v5, v10, v11 +; GCN-NEXT: v_or_b32_e32 v6, v12, v13 +; GCN-NEXT: v_or_b32_e32 v7, v14, v15 +; GCN-NEXT: v_or_b32_e32 v8, v16, v17 +; GCN-NEXT: v_or_b32_e32 v9, v18, v19 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: .LBB26_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v45 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 
0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_or_b32_e32 v0, v51, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v53, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v6, v54, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_or_b32_e32 v8, v55, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_or_b32_e32 v10, v40, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_or_b32_e32 v12, v41, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v16, v42, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: v_or_b32_e32 v20, v43, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v39, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v49, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_or_b32_e32 v7, v50, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_or_b32_e32 v9, v17, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_or_b32_e32 v11, v19, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_or_b32_e32 v13, v21, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v14 +; GCN-NEXT: v_or_b32_e32 v15, v23, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_or_b32_e32 v17, v25, v18 +; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v20 +; GCN-NEXT: v_or_b32_e32 v19, v27, v22 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_or_b32_e32 v3, v7, v6 +; GCN-NEXT: v_or_b32_e32 v4, v9, v8 +; GCN-NEXT: v_or_b32_e32 v5, v11, v10 +; GCN-NEXT: v_or_b32_e32 v6, v13, v12 +; GCN-NEXT: v_or_b32_e32 v7, v15, v14 +; GCN-NEXT: v_or_b32_e32 v8, v17, v16 +; GCN-NEXT: v_or_b32_e32 v9, v19, v18 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 +; GCN-NEXT: .LBB26_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v40i8_to_v5i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v36, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v33, v4 +; VI-NEXT: v_mov_b32_e32 v32, v2 +; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; VI-NEXT: v_mov_b32_e32 v38, v14 +; VI-NEXT: v_mov_b32_e32 v37, v12 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v44, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v33, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v36, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v37, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v16, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v18, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v20, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v22, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v24, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v26, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v39, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; 
implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: .LBB26_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v31 +; VI-NEXT: v_add_u16_e32 v1, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v9, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v33 +; VI-NEXT: v_add_u16_e32 v2, 3, v34 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v2, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_add_u16_e32 v3, 3, v36 +; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v37 +; VI-NEXT: v_add_u16_e32 v4, 3, v38 +; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v4, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v16 +; VI-NEXT: v_add_u16_e32 v5, 3, v18 +; VI-NEXT: v_or_b32_sdwa v4, v40, v4 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v5, v55, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: v_add_u16_e32 v5, 3, v20 +; VI-NEXT: v_add_u16_e32 v6, 3, v22 +; VI-NEXT: v_or_b32_sdwa v5, v54, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v6, v53, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_add_u16_sdwa v6, v6, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_add_u16_e32 v6, 3, v24 +; VI-NEXT: v_add_u16_e32 v7, 3, v26 +; VI-NEXT: v_or_b32_sdwa v6, v52, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v7, v51, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_add_u16_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: v_add_u16_e32 v7, 3, v28 +; VI-NEXT: v_add_u16_e32 v8, 3, v30 +; VI-NEXT: v_or_b32_sdwa v7, v27, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_add_u16_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v8, 3, v50 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v10, v21, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_add_u16_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v8, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v48 +; VI-NEXT: v_add_u16_e32 v11, 3, v39 +; VI-NEXT: v_or_b32_sdwa v10, v19, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v11, v17, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_add_u16_sdwa v9, v11, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v10, v9 +; VI-NEXT: .LBB26_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) 
+; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v40i8_to_v5i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v36, v10 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v34, v6 +; GFX9-NEXT: v_mov_b32_e32 v33, v4 +; GFX9-NEXT: v_mov_b32_e32 v32, v2 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_mov_b32_e32 v38, v14 +; GFX9-NEXT: v_mov_b32_e32 v37, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v42, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v52, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v10 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v38, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v18, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v22, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v24, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v26, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v28, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v8, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v48, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v39, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; 
GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: .LBB26_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v31 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v33 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v35 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v43, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v37 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v4, v41, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v16 +; GFX9-NEXT: v_add_u16_e32 v5, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v4, v40, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v5, v55, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: 
v_add_u16_e32 v5, 3, v20 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v5, v54, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v6, v53, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: v_add_u16_e32 v6, 3, v24 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v6, v52, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v7, v51, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_add_u16_e32 v7, 3, v28 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v7, v27, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v8, v25, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v50 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v9, v21, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v48 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v9, v19, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v10, v17, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: .LBB26_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v40i8_to_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-NEXT: 
v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x9 +; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:20 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:4 +; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v68, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v70, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v64, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v10 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB26_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB26_4 +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB26_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v18 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v67 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v68 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v69 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v70 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v71 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v54 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v55 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v64 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v65 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v66 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 
0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v51 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v52 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v53 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v29 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v17 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v23 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: .LBB26_4: ; %cmp.true +; 
GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v18, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v67, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v68, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v69, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v70, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v71, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v54, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v55, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v64, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v65, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v66, v9 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v4, v8, v9 +; GFX11-NEXT: v_add_nc_u16 v5, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v22, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v11, v50, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v48, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v39, 3 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v51, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v52, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v53, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v29, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v17, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v19, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v21, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v23, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v25, v14 +; GFX11-NEXT: 
v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 +; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <40 x i8> %a, splat (i8 3) + %a2 = bitcast <40 x i8> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <40 x i8> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v5i64_to_v40i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB27_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, 
v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB27_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB27_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_alignbit_b32 v11, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v12, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v14, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v15, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v16, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v20, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v21, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v23, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v26, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v27, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v29, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v32, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v35, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; GCN-NEXT: .LBB27_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 8, v35 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 8, v48 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v38 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 8, v37 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v34 +; GCN-NEXT: v_add_i32_e32 v51, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v5, 
0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v20 +; GCN-NEXT: v_or_b32_e32 v35, v49, v35 +; GCN-NEXT: v_add_i32_e32 v49, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; GCN-NEXT: v_or_b32_e32 v48, v50, v48 +; GCN-NEXT: v_add_i32_e32 v50, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; GCN-NEXT: v_or_b32_e32 v3, v3, v29 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v22 +; GCN-NEXT: v_or_b32_e32 v4, v4, v37 +; GCN-NEXT: v_add_i32_e32 v37, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GCN-NEXT: v_or_b32_e32 v5, v5, v23 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; GCN-NEXT: v_or_b32_e32 v6, v6, v31 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: v_or_b32_e32 v7, v7, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v8, v8, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GCN-NEXT: v_or_b32_e32 v9, v9, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_or_b32_e32 v10, v10, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v35 +; GCN-NEXT: v_or_b32_e32 v19, v32, v33 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v48 +; GCN-NEXT: v_or_b32_e32 v25, v38, v39 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v27, v34, v36 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_or_b32_e32 v20, v20, v21 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v21, v28, v30 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v14, v14, v15 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_or_b32_e32 v15, v22, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v12, v17, v13 +; GCN-NEXT: v_or_b32_e32 v13, v18, v19 +; GCN-NEXT: v_or_b32_e32 v16, v24, v25 +; GCN-NEXT: v_or_b32_e32 v3, v3, v26 +; GCN-NEXT: v_or_b32_e32 v4, v4, v27 +; GCN-NEXT: v_or_b32_e32 v5, v5, v20 +; GCN-NEXT: v_or_b32_e32 v6, v6, v21 +; GCN-NEXT: v_or_b32_e32 v7, v7, v14 +; GCN-NEXT: v_or_b32_e32 v8, v8, v15 +; GCN-NEXT: v_or_b32_e32 v9, v9, v11 +; GCN-NEXT: v_or_b32_e32 v10, v10, v12 +; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v51, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v49, 
s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v50, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v29, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v37, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v40i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB27_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB27_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; VI-NEXT: 
v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; VI-NEXT: .LBB27_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; VI-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i64_to_v40i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: 
$vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB27_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB27_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 3, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 3, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 3, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v10, vcc +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX9-NEXT: .LBB27_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_or_b32_sdwa v15, v48, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v14 +; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v27 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v26 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v12 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v25, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v22 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v23, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5i64_to_v40i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: 
v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB27_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB27_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v3, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, v5, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v7, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo +; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, v9, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo +; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v17, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v19, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v21, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v22, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v24, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 8, v1 +; GFX11-NEXT: .LBB27_4: ; %end +; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v16 +; GFX11-NEXT: v_and_b32_e32 v48, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v36, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v35, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b16 v14, 8, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v31, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v39 +; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b16 v37, 8, v37 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX11-NEXT: v_or_b32_e32 v15, v48, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v36 +; GFX11-NEXT: v_or_b32_e32 v14, v35, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v31 +; GFX11-NEXT: v_or_b32_e32 v13, v30, v13 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 +; GFX11-NEXT: v_or_b32_e32 v16, v38, v37 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v13 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v13, 8, v29 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v28 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v27 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_lshlrev_b16 v34, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b16 v32, 8, v32 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v13, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v23 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_lshlrev_b16 v11, 8, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v34 +; GFX11-NEXT: v_or_b32_e32 v32, v33, v32 +; GFX11-NEXT: v_or_b32_e32 v12, v25, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v16 +; GFX11-NEXT: v_or_b32_e32 v11, v20, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v19 +; GFX11-NEXT: v_or_b32_e32 v15, v18, v17 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 
16, v11 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v30 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v13 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v12 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v14 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v15 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <40 x i8> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <40 x i8> + br label %end + +end: + %phi = phi <40 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <40 x i8> %phi +} + +define <5 x i64> @bitcast_v5f64_to_v5i64(<5 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v5f64_to_v5i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: .LBB28_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5f64_to_v5i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5f64_to_v5i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5f64_to_v5i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: 
v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <5 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <5 x double> %a1 to <5 x i64> + br label %end + +cmp.false: + %a3 = bitcast <5 x double> %a to <5 x i64> + br label %end + +end: + %phi = phi <5 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x i64> %phi +} + +define <5 x double> @bitcast_v5i64_to_v5f64(<5 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v5i64_to_v5f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: .LBB29_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v5i64_to_v5f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v5i64_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v5i64_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v10 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <5 x i64> %a, splat (i64 3) + %a2 = bitcast <5 x i64> %a1 to <5 x double> + br label %end + +cmp.false: + %a3 = bitcast <5 x i64> %a to <5 x double> + br label %end + +end: + %phi = phi <5 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <5 x double> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll new file mode 100644 index 0000000000000..1c51395128917 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -0,0 +1,4926 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define float @bitcast_i32_to_f32(i32 %a, i32 %b) { +; GCN-LABEL: bitcast_i32_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; 
GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i32_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to float + br label %end + +cmp.false: + %a3 = bitcast i32 %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + +define i32 @bitcast_f32_to_i32(float %a, i32 %b) { +; GCN-LABEL: bitcast_f32_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast float %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <2 x i16> @bitcast_i32_to_v2i16(i32 %a, i32 %b) { +; GCN-LABEL: bitcast_i32_to_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB2_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB2_4 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB2_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: .LBB2_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i32_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + +define i32 @bitcast_v2i16_to_i32(<2 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i16_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_4 +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB3_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: .LBB3_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 
v0, vcc, 0x30000, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i16_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_add_u16_e32 v1, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i16_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <2 x half> @bitcast_i32_to_v2f16(i32 %a, i32 %b) { +; GCN-LABEL: bitcast_i32_to_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB4_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB4_4 +; GCN-NEXT: .LBB4_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB4_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: .LBB4_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
+; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i32_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + +define i32 @bitcast_v2f16_to_i32(<2 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f16_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB5_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB5_4 +; GCN-NEXT: .LBB5_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB5_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: .LBB5_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f16_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_i32: +; GFX9: ; %bb.0: 
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <2 x bfloat> @bitcast_i32_to_v2bf16(i32 %a, i32 %b) { +; GCN-LABEL: bitcast_i32_to_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB6_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB6_4 +; GCN-NEXT: .LBB6_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB6_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: .LBB6_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i32_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v2bf16_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_4 +; GCN-NEXT: .LBB7_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB7_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: .LBB7_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2bf16_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 +; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <1 x i32> @bitcast_i32_to_v1i32(i32 %a, i32 %b) { +; GCN-LABEL: bitcast_i32_to_v1i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB8_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], 
s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i32_to_v1i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + +define i32 @bitcast_v1i32_to_i32(<1 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v1i32_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v1i32_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v1i32_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v1i32_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label 
%cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <4 x i8> @bitcast_i32_to_v4i8(i32 %a, i32 %b) { +; GCN-LABEL: bitcast_i32_to_v4i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB10_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: .LBB10_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i32_to_v4i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB10_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB10_4 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB10_3: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: .LBB10_4: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i32_to_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB10_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB10_4 +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB10_3: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: .LBB10_4: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i32_to_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB10_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB10_4 +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB10_3: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: .LBB10_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i32 %a, 3 + %a2 = bitcast i32 %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast i32 %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + +define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i8_to_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_4 +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB11_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: .LBB11_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i8_to_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: .LBB11_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v5 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i8_to_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: .LBB11_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, 
v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i8_to_i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v5, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to i32 + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to i32 + br label %end + +end: + %phi = phi i32 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i32 %phi +} + +define <2 x i16> @bitcast_f32_to_v2i16(float %a, i32 %b) { +; GCN-LABEL: bitcast_f32_to_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB12_4 +; GCN-NEXT: .LBB12_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB12_3: ; %cmp.false +; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: .LBB12_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + +define float @bitcast_v2i16_to_f32(<2 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i16_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_4 +; GCN-NEXT: .LBB13_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB13_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_2 +; GCN-NEXT: .LBB13_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i16_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_add_u16_e32 v1, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i16_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + +define <2 x half> @bitcast_f32_to_v2f16(float %a, i32 %b) { +; GCN-LABEL: bitcast_f32_to_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB14_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB14_4 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB14_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: .LBB14_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + +define float @bitcast_v2f16_to_f32(<2 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f16_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB15_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB15_4 +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB15_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: .LBB15_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f16_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; 
GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + +define <2 x bfloat> @bitcast_f32_to_v2bf16(float %a, i32 %b) { +; GCN-LABEL: bitcast_f32_to_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB16_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB16_4 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB16_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: .LBB16_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f32_e32 
v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast float %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v2bf16_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB17_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB17_4 +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB17_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: .LBB17_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2bf16_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 
s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 +; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + +define <1 x i32> @bitcast_f32_to_v1i32(float %a, i32 %b) { +; GCN-LABEL: bitcast_f32_to_v1i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB18_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v1i32: 
+; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f32_to_v1i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast float %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + +define float @bitcast_v1i32_to_f32(<1 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v1i32_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB19_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB19_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v1i32_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v1i32_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v1i32_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to float + br label 
%end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + +define <4 x i8> @bitcast_f32_to_v4i8(float %a, i32 %b) { +; GCN-LABEL: bitcast_f32_to_v4i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB20_4 +; GCN-NEXT: .LBB20_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB20_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_2 +; GCN-NEXT: .LBB20_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f32_to_v4i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB20_4 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB20_3: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: .LBB20_4: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f32_to_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB20_4 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB20_3: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: .LBB20_4: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_f32_to_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB20_3: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: .LBB20_4: ; %cmp.true +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd float %a, 1.000000e+00 + %a2 = bitcast float %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast float %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + +define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i8_to_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_4 +; GCN-NEXT: .LBB21_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB21_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_2 +; GCN-NEXT: .LBB21_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i8_to_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB21_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB21_4 +; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB21_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: .LBB21_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v5 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i8_to_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB21_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB21_4 +; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB21_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: .LBB21_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX11-LABEL: bitcast_v4i8_to_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB21_4 +; GFX11-NEXT: .LBB21_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB21_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: .LBB21_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v5, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to float + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to float + br label %end + +end: + %phi = phi float [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret float %phi +} + +define <2 x half> @bitcast_v2i16_to_v2f16(<2 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i16_to_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_4 +; GCN-NEXT: .LBB22_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB22_3: ; %cmp.false +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; 
GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB22_2 +; GCN-NEXT: .LBB22_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i16_to_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 3 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i16_to_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + +define <2 x i16> @bitcast_v2f16_to_v2i16(<2 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f16_to_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB23_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: .LBB23_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f16_to_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc 
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 0x200 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + +define <2 x bfloat> @bitcast_v2i16_to_v2bf16(<2 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i16_to_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: .LBB24_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i16_to_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 3 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i16_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; 
GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define <2 x i16> @bitcast_v2bf16_to_v2i16(<2 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v2bf16_to_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB25_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB25_4 +; GCN-NEXT: .LBB25_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB25_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_2 +; GCN-NEXT: .LBB25_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2bf16_to_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; 
VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 +; GFX9-NEXT: .LBB25_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + +define <1 x i32> @bitcast_v2i16_to_v1i32(<2 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i16_to_v1i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB26_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB26_4 +; GCN-NEXT: .LBB26_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB26_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_2 +; GCN-NEXT: .LBB26_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i16_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_add_u16_e32 v1, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i16_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v1i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + +define <2 x i16> @bitcast_v1i32_to_v2i16(<1 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v1i32_to_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB27_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB27_4 +; GCN-NEXT: .LBB27_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB27_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] 
+; GCN-NEXT: s_cbranch_execz .LBB27_2 +; GCN-NEXT: .LBB27_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v1i32_to_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v1i32_to_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v1i32_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + +define <4 x i8> @bitcast_v2i16_to_v4i8(<2 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i16_to_v4i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB28_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB28_4 +; GCN-NEXT: .LBB28_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB28_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: v_bfe_u32 v3, v4, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: .LBB28_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: 
v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: v_bfe_u32 v3, v3, 8, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i16_to_v4i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB28_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB28_4 +; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB28_3: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: .LBB28_4: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 3 +; VI-NEXT: v_add_u16_sdwa v2, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v4 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i16_to_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB28_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB28_4 +; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB28_3: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: .LBB28_4: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i16_to_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB28_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB28_4 +; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB28_3: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: .LBB28_4: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i16> %a, splat (i16 3) + %a2 = bitcast <2 x i16> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i16> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + +define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i8_to_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_4 +; GCN-NEXT: .LBB29_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB29_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_2 +; GCN-NEXT: .LBB29_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i8_to_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB29_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB29_4 +; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB29_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: .LBB29_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v5 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i8_to_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB29_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB29_4 +; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB29_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: .LBB29_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i8_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB29_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB29_4 +; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB29_3: 
; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: .LBB29_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v5, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x i16> + br label %end + +end: + %phi = phi <2 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i16> %phi +} + +define <2 x bfloat> @bitcast_v2f16_to_v2bf16(<2 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f16_to_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB30_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB30_4 +; GCN-NEXT: .LBB30_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB30_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB30_2 +; GCN-NEXT: .LBB30_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f16_to_v2bf16: +; VI: ; 
%bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 0x200 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v2bf16_to_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB31_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB31_4 +; GCN-NEXT: .LBB31_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB31_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB31_2 +; GCN-NEXT: .LBB31_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 
s[30:31] +; +; VI-LABEL: bitcast_v2bf16_to_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 +; GFX9-NEXT: .LBB31_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: 
v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + +define <1 x i32> @bitcast_v2f16_to_v1i32(<2 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f16_to_v1i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_4 +; GCN-NEXT: .LBB32_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB32_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB32_2 +; GCN-NEXT: .LBB32_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f16_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v1i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + +define <2 x half> @bitcast_v1i32_to_v2f16(<1 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v1i32_to_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB33_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB33_4 +; GCN-NEXT: .LBB33_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB33_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB33_2 +; GCN-NEXT: .LBB33_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v1i32_to_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v1i32_to_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v1i32_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x half> + br label %end + 
+end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + +define <4 x i8> @bitcast_v2f16_to_v4i8(<2 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f16_to_v4i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB34_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB34_4 +; GCN-NEXT: .LBB34_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB34_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_2 +; GCN-NEXT: .LBB34_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: v_bfe_u32 v3, v2, 8, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f16_to_v4i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB34_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB34_4 +; VI-NEXT: .LBB34_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB34_3: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: .LBB34_4: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v1, 0x200 +; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1 +; VI-NEXT: v_bfe_u32 v3, v2, 8, 8 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f16_to_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB34_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB34_4 +; GFX9-NEXT: .LBB34_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: 
.LBB34_3: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: .LBB34_4: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f16_to_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB34_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB34_4 +; GFX11-NEXT: .LBB34_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB34_3: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: .LBB34_4: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x half> %a, splat (half 0xH0200) + %a2 = bitcast <2 x half> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x half> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + +define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i8_to_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_4 +; GCN-NEXT: .LBB35_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB35_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_2 +; GCN-NEXT: .LBB35_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; 
GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN-NEXT: v_or_b32_e32 v1, v4, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i8_to_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v5 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i8_to_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: .LBB35_4: ; %cmp.true +; 
GFX9-NEXT: v_add_u16_e32 v0, 3, v5 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i8_to_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: .LBB35_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v5, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x half> + br label %end + +end: + %phi = phi <2 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x half> %phi +} + +define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v2bf16_to_v1i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB36_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB36_4 +; GCN-NEXT: .LBB36_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB36_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_2 +; GCN-NEXT: .LBB36_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2bf16_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s6 +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: bitcast_v2bf16_to_v1i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} + +define <2 x bfloat> @bitcast_v1i32_to_v2bf16(<1 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v1i32_to_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_4 +; GCN-NEXT: .LBB37_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB37_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB37_2 +; GCN-NEXT: .LBB37_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v1i32_to_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v1i32_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v1i32_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v1 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v2bf16_to_v4i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB38_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB38_4 +; GCN-NEXT: .LBB38_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB38_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_alignbit_b32 v0, v2, v5, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_2 +; GCN-NEXT: .LBB38_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2bf16_to_v4i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB38_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB38_4 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB38_3: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: .LBB38_4: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_bfe_u32 v2, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2bf16_to_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB38_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB38_4 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB38_3: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: .LBB38_4: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v4, v0, v1, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2bf16_to_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB38_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB38_4 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB38_3: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: .LBB38_4: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v4, v0, v1, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <2 x bfloat> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x bfloat> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + +define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i8_to_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_4 +; GCN-NEXT: .LBB39_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB39_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_2 +; GCN-NEXT: .LBB39_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v3, v0 +; GCN-NEXT: v_or_b32_e32 v0, v4, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i8_to_v2bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB39_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB39_4 +; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB39_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: .LBB39_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v5 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i8_to_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB39_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB39_4 +; GFX9-NEXT: .LBB39_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB39_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB39_2 +; GFX9-NEXT: .LBB39_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, 
v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i8_to_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB39_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB39_4 +; GFX11-NEXT: .LBB39_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB39_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: .LBB39_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v5, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <2 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <2 x bfloat> + br label %end + +end: + %phi = phi <2 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x bfloat> %phi +} + +define <4 x i8> @bitcast_v1i32_to_v4i8(<1 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v1i32_to_v4i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; 
implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB40_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB40_4 +; GCN-NEXT: .LBB40_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB40_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_2 +; GCN-NEXT: .LBB40_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v1i32_to_v4i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB40_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB40_4 +; VI-NEXT: .LBB40_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB40_3: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB40_2 +; VI-NEXT: .LBB40_4: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v1i32_to_v4i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB40_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB40_4 +; GFX9-NEXT: .LBB40_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB40_3: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB40_2 +; GFX9-NEXT: .LBB40_4: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v1i32_to_v4i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 
s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB40_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB40_4 +; GFX11-NEXT: .LBB40_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB40_3: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB40_2 +; GFX11-NEXT: .LBB40_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <1 x i32> %a, splat (i32 3) + %a2 = bitcast <1 x i32> %a1 to <4 x i8> + br label %end + +cmp.false: + %a3 = bitcast <1 x i32> %a to <4 x i8> + br label %end + +end: + %phi = phi <4 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i8> %phi +} + +define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i8_to_v1i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_4 +; GCN-NEXT: .LBB41_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB41_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB41_2 +; GCN-NEXT: .LBB41_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i8_to_v1i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB41_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
VI-NEXT: s_cbranch_execnz .LBB41_4 +; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB41_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: .LBB41_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v5 +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i8_to_v1i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB41_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB41_4 +; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB41_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: .LBB41_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v5 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i8_to_v1i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v4 +; 
GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB41_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB41_4 +; GFX11-NEXT: .LBB41_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB41_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v5 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-NEXT: .LBB41_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v5, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i8> %a, splat (i8 3) + %a2 = bitcast <4 x i8> %a1 to <1 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x i8> %a to <1 x i32> + br label %end + +end: + %phi = phi <1 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <1 x i32> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll new file mode 100644 index 0000000000000..922a47ea77fcd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v11i32_to_v11f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11i32_to_v11f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11i32_to_v11f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11i32_to_v11f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v11 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <11 x i32> %a, splat (i32 3) + %a2 = bitcast <11 x i32> %a1 to <11 x float> + br label %end + +cmp.false: + %a3 = bitcast <11 x i32> %a to 
<11 x float> + br label %end + +end: + %phi = phi <11 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x float> %phi +} + +define <11 x i32> @bitcast_v11f32_to_v11i32(<11 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v11f32_to_v11i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v11f32_to_v11i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v11f32_to_v11i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v11f32_to_v11i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v11 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-NEXT: v_dual_add_f32 v8, 1.0, v8 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3 
+; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <11 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <11 x float> %a1 to <11 x i32> + br label %end + +cmp.false: + %a3 = bitcast <11 x float> %a to <11 x i32> + br label %end + +end: + %phi = phi <11 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <11 x i32> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll new file mode 100644 index 0000000000000..f67af98fba0fa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -0,0 +1,1338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i32_to_v12f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i32_to_v12f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i32_to_v12f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i32_to_v12f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + +define <12 x i32> @bitcast_v12f32_to_v12i32(<12 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v12f32_to_v12i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v12i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 
+; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v12i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v12i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + +define <6 x double> @bitcast_v12i32_to_v6f64(<12 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i32_to_v6f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 
3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i32_to_v6f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i32_to_v6f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i32_to_v6f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + +define <12 x i32> @bitcast_v6f64_to_v12i32(<6 x double> %a, i32 %b) { +; 
GCN-LABEL: bitcast_v6f64_to_v12i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v12i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v12i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v12i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + +define <6 x i64> @bitcast_v12i32_to_v6i64(<12 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i32_to_v6i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB4_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i32_to_v6i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB4_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB4_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i32_to_v6i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB4_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB4_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i32_to_v6i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB4_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: 
v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB4_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i32> %a, splat (i32 3) + %a2 = bitcast <12 x i32> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x i32> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + +define <12 x i32> @bitcast_v6i64_to_v12i32(<6 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i64_to_v12i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB5_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i64_to_v12i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i64_to_v12i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i64_to_v12i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <12 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <12 x i32> + br label %end + +end: + %phi = phi <12 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i32> %phi +} + +define <6 x double> @bitcast_v12f32_to_v6f64(<12 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v12f32_to_v6f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB6_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v6f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB6_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 
v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB6_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v6f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v6f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} + +define <12 x float> @bitcast_v6f64_to_v12f32(<6 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f64_to_v12f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB7_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v12f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v12f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB7_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v12f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + +define <6 x i64> @bitcast_v12f32_to_v6i64(<12 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v12f32_to_v6i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB8_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12f32_to_v6i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12f32_to_v6i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12f32_to_v6i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <12 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <12 x float> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <12 x float> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + +define <12 x float> @bitcast_v6i64_to_v12f32(<6 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i64_to_v12f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 
+; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i64_to_v12f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i64_to_v12f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i64_to_v12f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, 
vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <12 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <12 x float> + br label %end + +end: + %phi = phi <12 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x float> %phi +} + +define <6 x i64> @bitcast_v6f64_to_v6i64(<6 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f64_to_v6i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f64_to_v6i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f64_to_v6i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f64_to_v6i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; 
GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <6 x double> %a1 to <6 x i64> + br label %end + +cmp.false: + %a3 = bitcast <6 x double> %a to <6 x i64> + br label %end + +end: + %phi = phi <6 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i64> %phi +} + +define <6 x double> @bitcast_v6i64_to_v6f64(<6 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i64_to_v6f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i64_to_v6f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i64_to_v6f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: 
v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i64_to_v6f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v12 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i64> %a, splat (i64 3) + %a2 = bitcast <6 x i64> %a1 to <6 x double> + br label %end + +cmp.false: + %a3 = bitcast <6 x i64> %a to <6 x double> + br label %end + +end: + %phi = phi <6 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x double> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll new file mode 100644 index 0000000000000..a5764f9da3194 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -0,0 +1,714 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v7i64_to_v14i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; 
GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7i64_to_v14i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7i64_to_v14i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7i64_to_v14i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, 
null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <14 x i32> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <14 x i32> + br label %end + +end: + %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i32> %phi +} + +define <7 x i64> @bitcast_v14i32_to_v7i64(<14 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v14i32_to_v7i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i32_to_v7i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14i32_to_v7i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; 
GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i32_to_v7i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i32> %a, splat (i32 3) + %a2 = bitcast <14 x i32> %a1 to <7 x i64> + br label %end + +cmp.false: + %a3 = bitcast <14 x i32> %a to <7 x i64> + br label %end + +end: + %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x i64> %phi +} + +define <7 x double> @bitcast_v7i64_to_v7f64(<7 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v7i64_to_v7f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7i64_to_v7f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: .LBB2_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7i64_to_v7f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: .LBB2_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7i64_to_v7f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: .LBB2_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 
+; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <7 x i64> %a, splat (i64 3) + %a2 = bitcast <7 x i64> %a1 to <7 x double> + br label %end + +cmp.false: + %a3 = bitcast <7 x i64> %a to <7 x double> + br label %end + +end: + %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <7 x double> %phi +} + +define <7 x i64> @bitcast_v7f64_to_v7i64(<7 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v7f64_to_v7i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v7f64_to_v7i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v7f64_to_v7i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v7f64_to_v7i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: .LBB3_2: ; %end +; 
GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <7 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <7 x double> %a1 to <7 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <7 x double> %a to <7 x i64>
+ br label %end
+
+end:
+ %phi = phi <7 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <7 x i64> %phi
+}
+
+define <7 x double> @bitcast_v14i32_to_v7f64(<14 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v14i32_to_v7f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB4_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v14i32_to_v7f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB4_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB4_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v14i32_to_v7f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB4_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: .LBB4_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v14i32_to_v7f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB4_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB4_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <14 x i32> %a, splat (i32 3)
+ %a2 = bitcast <14 x i32> %a1 to <7 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <14 x i32> %a to <7 x double>
+ br label %end
+
+end:
+ %phi = phi <7 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <7 x double> %phi
+}
+
+define <14 x i32> @bitcast_v7f64_to_v14i32(<7 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v7f64_to_v14i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB5_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: .LBB5_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v7f64_to_v14i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB5_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: .LBB5_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v7f64_to_v14i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB5_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB5_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v7f64_to_v14i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v14
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB5_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB5_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <7 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <7 x double> %a1 to <14 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <7 x double> %a to <14 x i32>
+ br label %end
+
+end:
+ %phi = phi <14 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <14 x i32> %phi
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
new file mode 100644
index 0000000000000..d09aaf12161a2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll
@@ -0,0 +1,775 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+
+define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v3bf16_to_v3f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v2
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB0_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB0_4
+; GCN-NEXT: .LBB0_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB0_3: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB0_2
+; GCN-NEXT: .LBB0_4: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v3bf16_to_v3f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB0_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: .LBB0_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v3bf16_to_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB0_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: s_mov_b32 s6, 0x7060302
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s6
+; GFX9-NEXT: s_movk_i32 s6, 0x7fc0
+; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 16
+; GFX9-NEXT: .LBB0_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v3bf16_to_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB0_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_alignbit_b32 v1, 0x7fc0, v1, 16
+; GFX11-NEXT: .LBB0_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <3 x bfloat> %a, splat (bfloat 0xR40C0)
+ %a2 = bitcast <3 x bfloat> %a1 to <3 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <3 x bfloat> %a to <3 x half>
+ br label %end
+
+end:
+ %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <3 x half> %phi
+}
+
+define <3 x bfloat> @bitcast_v3f16_to_v3bf16(<3 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v3f16_to_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB1_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB1_4
+; GCN-NEXT: .LBB1_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB1_3: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: .LBB1_4: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v3f16_to_v3bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v3, 0x200
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v0
+; VI-NEXT: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_or_b32_e32 v1, 0x7e000000, v1
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v3f16_to_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v3f16_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <3 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <3 x half> %a1 to <3 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <3 x half> %a to <3 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <3 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <3 x bfloat> %phi
+}
+
+define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v3bf16_to_v3i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v2
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB2_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB2_4
+; GCN-NEXT: .LBB2_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB2_3: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB2_2
+; GCN-NEXT: .LBB2_4: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
+; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v3bf16_to_v3i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB2_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: .LBB2_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v3bf16_to_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB2_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: s_mov_b32 s6, 0x7060302
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s6
+; GFX9-NEXT: s_movk_i32 s6, 0x7fc0
+; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 16
+; GFX9-NEXT: .LBB2_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v3bf16_to_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB2_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_alignbit_b32 v1, 0x7fc0, v1, 16
+; GFX11-NEXT: .LBB2_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <3 x bfloat> %a, splat (bfloat 0xR40C0)
+ %a2 = bitcast <3 x bfloat> %a1 to <3 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <3 x bfloat> %a to <3 x i16>
+ br label %end
+
+end:
+ %phi = phi <3 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <3 x i16> %phi
+}
+
+define <3 x bfloat> @bitcast_v3i16_to_v3bf16(<3 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v3i16_to_v3bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB3_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB3_4
+; GCN-NEXT: .LBB3_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB3_3: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB3_2
+; GCN-NEXT: .LBB3_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v3i16_to_v3bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v2, 3
+; VI-NEXT: v_add_u16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v3i16_to_v3bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v3i16_to_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <3 x i16> %a, splat (i16 3)
+ %a2 = bitcast <3 x i16> %a1 to <3 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <3 x i16> %a to <3 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <3 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <3 x bfloat> %phi
+}
+
+define <3 x i16> @bitcast_v3f16_to_v3i16(<3 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v3f16_to_v3i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT: .LBB4_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v3f16_to_v3i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v3, 0x200
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v0
+; VI-NEXT: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: v_or_b32_e32 v1, 0x7e000000, v1
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v3f16_to_v3i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v3f16_to_v3i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <3 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <3 x half> %a1 to <3 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <3 x half> %a to <3 x i16>
+ br label %end
+
+end:
+ %phi = phi <3 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <3 x i16> %phi
+}
+
+define <3 x half> @bitcast_v3i16_to_v3f16(<3 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v3i16_to_v3f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v6, v2
+; GCN-NEXT: v_mov_b32_e32 v4, v1
+; GCN-NEXT: v_mov_b32_e32 v5, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB5_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB5_4
+; GCN-NEXT: .LBB5_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB5_3: ; %cmp.false
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB5_2
+; GCN-NEXT: .LBB5_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v3i16_to_v3f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v2, 3
+; VI-NEXT: v_add_u16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v3i16_to_v3f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v3i16_to_v3f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <3 x i16> %a, splat (i16 3)
+ %a2 = bitcast <3 x i16> %a1 to <3 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <3 x i16> %a to <3 x half>
+ br label %end
+
+end:
+ %phi = phi <3 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <3 x half> %phi
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
new file mode 100644
index 0000000000000..f0ce1784eb107
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -0,0 +1,41662 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+
+define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16i32_to_v16f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB0_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB0_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16i32_to_v16f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB0_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB0_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16i32_to_v16f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB0_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: v_add_u32_e32 v14, 3, v14
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: .LBB0_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16i32_to_v16f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB0_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB0_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i32> %a, splat (i32 3)
+ %a2 = bitcast <16 x i32> %a1 to <16 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i32> %a to <16 x float>
+ br label %end
+
+end:
+ %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x float> %phi
+}
+
+define <16 x i32> @bitcast_v16f32_to_v16i32(<16 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f32_to_v16i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: .LBB1_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16f32_to_v16i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB1_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f32_e32 v15, 1.0, v15
+; VI-NEXT: v_add_f32_e32 v14, 1.0, v14
+; VI-NEXT: v_add_f32_e32 v13, 1.0, v13
+; VI-NEXT: v_add_f32_e32 v12, 1.0, v12
+; VI-NEXT: v_add_f32_e32 v11, 1.0, v11
+; VI-NEXT: v_add_f32_e32 v10, 1.0, v10
+; VI-NEXT: v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT: v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT: v_add_f32_e32 v7, 1.0, v7
+; VI-NEXT: v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT: v_add_f32_e32 v5, 1.0, v5
+; VI-NEXT: v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT: .LBB1_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16f32_to_v16i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB1_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: .LBB1_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16f32_to_v16i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB1_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: .LBB1_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <16 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <16 x float> %a1 to <16 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x float> %a to <16 x i32>
+ br label %end
+
+end:
+ %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i32> %phi
+}
+
+define <8 x i64> @bitcast_v16i32_to_v8i64(<16 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16i32_to_v8i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB2_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB2_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16i32_to_v8i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB2_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB2_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16i32_to_v8i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB2_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: v_add_u32_e32 v14, 3, v14
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: .LBB2_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16i32_to_v8i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB2_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB2_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i32> %a, splat (i32 3)
+ %a2 = bitcast <16 x i32> %a1 to <8 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i32> %a to <8 x i64>
+ br label %end
+
+end:
+ %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i64> %phi
+}
+
+define <16 x i32> @bitcast_v8i64_to_v16i32(<8 x i64> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8i64_to_v16i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB3_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: .LBB3_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8i64_to_v16i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB3_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: .LBB3_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8i64_to_v16i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB3_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: .LBB3_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8i64_to_v16i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB3_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB3_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <8 x i64> %a, splat (i64 3)
+ %a2 = bitcast <8 x i64> %a1 to <16 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <8 x i64> %a to <16 x i32>
+ br label %end
+
+end:
+ %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i32> %phi
+}
+
+define <8 x double> @bitcast_v16i32_to_v8f64(<16 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16i32_to_v8f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB4_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB4_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16i32_to_v8f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB4_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB4_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16i32_to_v8f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB4_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: v_add_u32_e32 v14, 3, v14
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: .LBB4_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16i32_to_v8f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB4_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB4_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i32> %a, splat (i32 3)
+ %a2 = bitcast <16 x i32> %a1 to <8 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i32> %a to <8 x double>
+ br label %end
+
+end:
+ %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x double> %phi
+}
+
+define <16 x i32> @bitcast_v8f64_to_v16i32(<8 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8f64_to_v16i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB5_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: .LBB5_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8f64_to_v16i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB5_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: .LBB5_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8f64_to_v16i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB5_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB5_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8f64_to_v16i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB5_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB5_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <8 x double> %a1 to <16 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <8 x double> %a to <16 x i32>
+ br label %end
+
+end:
+ %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i32> %phi
+}
+
+define <32 x i16> @bitcast_v16i32_to_v32i16(<16 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16i32_to_v32i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v30, v15
+; GCN-NEXT: v_mov_b32_e32 v28, v14
+; GCN-NEXT: v_mov_b32_e32 v26, v13
+; GCN-NEXT: v_mov_b32_e32 v24, v12
+; GCN-NEXT: v_mov_b32_e32 v22, v11
+; GCN-NEXT: v_mov_b32_e32 v20, v10
+; GCN-NEXT: v_mov_b32_e32 v18, v9
+; GCN-NEXT: v_mov_b32_e32 v32, v8
+; GCN-NEXT: v_mov_b32_e32 v14, v7
+; GCN-NEXT: v_mov_b32_e32 v12, v6
+; GCN-NEXT: v_mov_b32_e32 v10, v5
+; GCN-NEXT: v_mov_b32_e32 v8, v4
+; GCN-NEXT: v_mov_b32_e32 v6, v3
+; GCN-NEXT: v_mov_b32_e32 v4, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB6_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
+; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB6_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB6_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
+; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
+; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
+; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
+; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
+; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GCN-NEXT: .LBB6_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v16, v32
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16i32_to_v32i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB6_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB6_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16i32_to_v32i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB6_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: v_add_u32_e32 v14, 3,
v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB6_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i32_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB6_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB6_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + +define <16 x i32> @bitcast_v32i16_to_v16i32(<32 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i16_to_v16i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 
v53, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_4 +; GCN-NEXT: .LBB7_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB7_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; GCN-NEXT: v_or_b32_e32 v0, v0, v54 +; GCN-NEXT: v_or_b32_e32 v1, v1, v55 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v2, v2, v39 +; GCN-NEXT: v_or_b32_e32 v3, v3, v48 +; GCN-NEXT: v_or_b32_e32 v4, v4, v49 +; GCN-NEXT: v_or_b32_e32 v5, v5, v50 +; GCN-NEXT: v_or_b32_e32 v6, v6, v51 +; GCN-NEXT: v_or_b32_e32 v7, v7, v52 +; GCN-NEXT: v_or_b32_e32 v8, v8, v17 +; GCN-NEXT: v_or_b32_e32 v9, v9, v19 +; GCN-NEXT: v_or_b32_e32 v10, v10, v21 +; GCN-NEXT: v_or_b32_e32 v11, v11, v23 +; GCN-NEXT: v_or_b32_e32 v12, v12, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v27 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_or_b32_e32 v15, v15, v53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: .LBB7_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, 
vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v0, v54, v0 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_or_b32_e32 v4, v49, v4 +; GCN-NEXT: v_or_b32_e32 v5, v50, v5 +; GCN-NEXT: v_or_b32_e32 v6, v51, v6 +; GCN-NEXT: v_or_b32_e32 v7, v52, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v19, v9 +; GCN-NEXT: v_or_b32_e32 v10, v21, v10 +; GCN-NEXT: v_or_b32_e32 v11, v23, v11 +; GCN-NEXT: v_or_b32_e32 v12, v25, v12 +; GCN-NEXT: v_or_b32_e32 v13, v27, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_or_b32_e32 v15, v53, v15 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i16_to_v16i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_add_u16_e32 v16, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_u16_e32 v16, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v16, v14 +; VI-NEXT: v_add_u16_e32 v16, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_add_u16_e32 v16, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; 
VI-NEXT: v_add_u16_e32 v16, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v16, v11 +; VI-NEXT: v_add_u16_e32 v16, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v16, v10 +; VI-NEXT: v_add_u16_e32 v16, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v16, v9 +; VI-NEXT: v_add_u16_e32 v16, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v16, v8 +; VI-NEXT: v_add_u16_e32 v16, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_add_u16_e32 v16, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v16, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_add_u16_e32 v16, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v16, v2 +; VI-NEXT: v_add_u16_e32 v16, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v16, v1 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i16_to_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB7_2: ; %end +; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i16_to_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB7_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB7_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + +define <32 x half> @bitcast_v16i32_to_v32f16(<16 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i32_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v33, v15 +; GCN-NEXT: v_mov_b32_e32 v34, v14 +; GCN-NEXT: v_mov_b32_e32 v35, v13 +; GCN-NEXT: v_mov_b32_e32 v36, v12 +; GCN-NEXT: v_mov_b32_e32 v37, v11 +; GCN-NEXT: v_mov_b32_e32 v38, v10 +; GCN-NEXT: v_mov_b32_e32 v39, v9 +; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: v_mov_b32_e32 v49, v7 +; GCN-NEXT: v_mov_b32_e32 v50, v6 +; GCN-NEXT: v_mov_b32_e32 v51, v5 +; GCN-NEXT: v_mov_b32_e32 v52, v4 +; GCN-NEXT: v_mov_b32_e32 v53, v3 +; GCN-NEXT: v_mov_b32_e32 v54, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; 
implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v50 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v51 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v52 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v53 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v54 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: 
$vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: .LBB8_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v55 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v54 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v53 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v52 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v51 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v50 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 +; GCN-NEXT: .LBB8_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i32_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB8_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB8_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i32_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB8_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB8_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i32_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB8_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; 
GFX11-NEXT: .LBB8_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + +define <16 x i32> @bitcast_v32f16_to_v16i32(<32 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f16_to_v16i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GCN-NEXT: v_or_b32_e32 v0, v44, v0 +; GCN-NEXT: v_or_b32_e32 v1, v42, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; GCN-NEXT: 
v_lshlrev_b32_e32 v7, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_or_b32_e32 v3, v50, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v36, v6 +; GCN-NEXT: v_or_b32_e32 v7, v34, v7 +; GCN-NEXT: v_or_b32_e32 v8, v33, v8 +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: v_or_b32_e32 v10, v31, v10 +; GCN-NEXT: v_or_b32_e32 v11, v21, v11 +; GCN-NEXT: v_or_b32_e32 v12, v19, v12 +; GCN-NEXT: v_or_b32_e32 v13, v18, v13 +; GCN-NEXT: v_or_b32_e32 v14, v17, v14 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB9_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v27, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; 
GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v26, v24 +; GCN-NEXT: v_or_b32_e32 v10, v28, v27 +; GCN-NEXT: v_or_b32_e32 v11, v21, v29 +; GCN-NEXT: v_or_b32_e32 v12, v19, v25 +; GCN-NEXT: v_or_b32_e32 v13, v18, v23 +; GCN-NEXT: v_or_b32_e32 v14, v17, v22 +; GCN-NEXT: v_or_b32_e32 v15, v16, v20 +; GCN-NEXT: .LBB9_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f16_to_v16i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 
0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB9_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB9_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f16_to_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB9_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: 
v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB9_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + +define <32 x bfloat> @bitcast_v16i32_to_v32bf16(<16 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i32_to_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v55, v15 +; GCN-NEXT: v_mov_b32_e32 v54, v14 +; GCN-NEXT: v_mov_b32_e32 v53, v13 +; GCN-NEXT: v_mov_b32_e32 v52, v12 +; GCN-NEXT: v_mov_b32_e32 v51, v11 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v49, v9 +; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: v_mov_b32_e32 v39, v7 +; GCN-NEXT: v_mov_b32_e32 v38, v6 +; GCN-NEXT: v_mov_b32_e32 v37, v5 +; GCN-NEXT: v_mov_b32_e32 v36, v4 +; GCN-NEXT: v_mov_b32_e32 v35, v3 +; GCN-NEXT: v_mov_b32_e32 v34, v2 +; GCN-NEXT: v_mov_b32_e32 v33, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB10_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; GCN-NEXT: v_lshlrev_b32_e32 
v26, 16, v53 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: .LBB10_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v51 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v55 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, 
0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i32_to_v32bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB10_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i32_to_v32bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB10_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i32_to_v32bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, 
v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB10_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <16 x i32> %a, splat (i32 3) + %a2 = bitcast <16 x i32> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <16 x i32> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + +define <16 x i32> @bitcast_v32bf16_to_v16i32(<32 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v32bf16_to_v16i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; 
GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 +; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 +; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 +; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 +; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 +; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB11_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v7, 
0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 +; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 +; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 +; GCN-NEXT: 
v_alignbit_b32 v14, v22, v17, 16 +; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 +; GCN-NEXT: .LBB11_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32bf16_to_v16i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; 
VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v10 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v9 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v8, 
0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v8 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v7 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v6 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v5 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v4 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: 
v_cndmask_b32_e32 v4, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v2 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v1 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v16i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc 
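+; Note on the bf16 conversions in the cmp.true blocks above and below: each
+; bfloat lane is widened to f32 (v_lshlrev_b32 16 for the low half, v_and
+; 0xffff0000 for the high half), 0x40c00000 (6.0, matching the IR's splat of
+; bfloat 0xR40C0) is added, and the sum is narrowed back to bf16 with
+; round-to-nearest-even: v_bfe_u32 extracts bit 16 (the LSB that survives
+; truncation), adding it plus 0x7fff rounds to nearest even, v_or with
+; 0x400000 produces a quieted NaN, and v_cmp_u_f32/v_cndmask selects the
+; quiet NaN when the input is unordered. VI repacks the two halves with
+; v_alignbit_b32; GFX9/GFX11 use v_perm_b32 with selector 0x7060302.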
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, 
v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v18, v18, v16, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc_lo +; GFX11-NEXT: v_add3_u32 v19, v23, v14, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_bfe_u32 v20, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v20, v20, v15, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX11-NEXT: v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_perm_b32 v14, v14, v17, 0x7060302 +; 
GFX11-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add3_u32 v17, v17, v13, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v21, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_perm_b32 v13, v13, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v16, v16, v19, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v18, v18, v12, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v18, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add3_u32 v17, v17, v11, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v19, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v19, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v11, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v10 +; GFX11-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v10, v10, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v18, v18, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v17, v17, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v8 +; GFX11-NEXT: v_perm_b32 v9, v9, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v18, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v8, v8, v17, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21 +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add3_u32 v16, v16, v19, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add3_u32 v19, v19, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v17, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v17, v17, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff +; 
GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v6 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v19 +; GFX11-NEXT: v_add3_u32 v19, v20, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v22, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v16, v16, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v16, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v16, v22, v17, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v18, v18, v4, 0x7fff +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v20 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v4, v4, v16, 0x7060302 +; GFX11-NEXT: v_add3_u32 v19, v21, v17, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v3, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v19, v21, v3, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v21, v22, v18, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX11-NEXT: v_bfe_u32 v24, v2, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v22 +; GFX11-NEXT: v_add3_u32 v20, v24, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v21, v22, v19, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v23 +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v24, v20, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v24, v24, v20, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v23, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v22, v23, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v24, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + +define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v16i32_to_v64i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 +; GCN-NEXT: v_alignbit_b32 v24, 
v12, v11, 24 +; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 +; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GCN-NEXT: .LBB12_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 +; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24 +; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 +; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 +; GCN-NEXT: 
v_alignbit_b32 v36, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GCN-NEXT: .LBB12_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_or_b32_e32 v60, v1, v18 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GCN-NEXT: v_or_b32_e32 v17, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 +; GCN-NEXT: v_or_b32_e32 v58, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v63, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; GCN-NEXT: v_or_b32_e32 v46, v1, v2 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 +; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 +; GCN-NEXT: v_or_b32_e32 v44, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; GCN-NEXT: v_or_b32_e32 v7, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 +; GCN-NEXT: v_or_b32_e32 v54, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 +; GCN-NEXT: v_or_b32_e32 v9, v4, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v42 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 
v6, 8, v55 +; GCN-NEXT: v_or_b32_e32 v48, v5, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 +; GCN-NEXT: v_or_b32_e32 v34, v6, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v50 +; GCN-NEXT: v_or_b32_e32 v11, v8, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 +; GCN-NEXT: v_or_b32_e32 v13, v10, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v38 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 +; GCN-NEXT: v_or_b32_e32 v28, v12, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GCN-NEXT: v_or_b32_e32 v21, v14, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v31 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GCN-NEXT: v_or_b32_e32 v29, v15, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GCN-NEXT: v_or_b32_e32 v31, v19, v16 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45 +; GCN-NEXT: v_or_b32_e32 v27, v61, v18 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; GCN-NEXT: v_or_b32_e32 v40, v40, v19 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; GCN-NEXT: v_or_b32_e32 v56, v56, v22 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 +; GCN-NEXT: v_and_b32_e32 
v46, 0xffff, v46 +; GCN-NEXT: v_or_b32_e32 v49, v49, v47 +; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 +; GCN-NEXT: v_or_b32_e32 v51, v51, v43 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v36, v36, v55 +; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GCN-NEXT: v_or_b32_e32 v37, v37, v53 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v30, v30, v50 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GCN-NEXT: v_or_b32_e32 v32, v32, v39 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GCN-NEXT: v_or_b32_e32 v24, v24, v35 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v25, v25, v38 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v20, v20, v52 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_or_b32_e32 v26, v26, v42 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_or_b32_e32 v33, v33, v45 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v23, v41, v23 +; GCN-NEXT: v_or_b32_e32 v31, v57, v31 +; GCN-NEXT: v_or_b32_e32 v17, v17, v27 +; GCN-NEXT: v_or_b32_e32 v27, v58, v40 +; GCN-NEXT: v_or_b32_e32 v35, v59, v56 +; GCN-NEXT: v_or_b32_e32 v38, v46, v49 +; GCN-NEXT: v_or_b32_e32 v39, v44, v51 +; GCN-NEXT: v_or_b32_e32 v7, v7, v36 +; GCN-NEXT: v_or_b32_e32 v36, v54, v37 +; GCN-NEXT: v_or_b32_e32 v9, v9, v30 +; GCN-NEXT: v_or_b32_e32 v30, v48, v32 +; GCN-NEXT: v_or_b32_e32 v24, v34, v24 +; GCN-NEXT: v_or_b32_e32 v11, v11, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v20 +; GCN-NEXT: v_or_b32_e32 v20, v28, v26 +; GCN-NEXT: v_or_b32_e32 v21, v21, v33 +; GCN-NEXT: v_or_b32_e32 v23, v29, v23 +; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; 
GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16i32_to_v64i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; 
implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: .LBB12_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB12_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_lshrrev_b64 
v[21:22], 24, v[3:4] +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: .LBB12_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; 
VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16i32_to_v64i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; kill: killed $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; 
implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v26, v23 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: .LBB12_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB12_4 
+; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB12_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: buffer_store_dword 
v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16i32_to_v64i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr29 
+; GFX11-NEXT: ; implicit-def: $vgpr17
+; GFX11-NEXT: ; implicit-def: $vgpr28
+; GFX11-NEXT: ; implicit-def: $vgpr27
+; GFX11-NEXT: ; implicit-def: $vgpr26
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB12_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-NEXT: .LBB12_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB12_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-NEXT: .LBB12_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25
+; GFX11-NEXT: v_and_b32_e32 v96, 0xff, v96
+; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v86
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v25
+; GFX11-NEXT: v_lshlrev_b16 v25, 8, v87
+; GFX11-NEXT: v_or_b32_e32 v24, v96, v24
+; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v25
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GFX11-NEXT: v_or_b32_e32 v25, v86, v85
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84
+; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v83
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v24
+; GFX11-NEXT: v_lshlrev_b16 v24, 8, v82
+; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23
+; GFX11-NEXT: v_lshlrev_b16 v80, 8, v80
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT: v_lshlrev_b16 v71, 8, v71
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v81
+; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v70
+; GFX11-NEXT: v_lshlrev_b16 v22, 8, v22
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v84
+; GFX11-NEXT: v_or_b32_e32 v23, v83, v23
+; GFX11-NEXT: v_or_b32_e32 v25, v25, v80
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v71
+; GFX11-NEXT: v_or_b32_e32 v22, v70, v22
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v24
+; GFX11-NEXT: v_or_b32_e32 v3, v3, v23
+; GFX11-NEXT: v_or_b32_e32 v4, v4, v25
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GFX11-NEXT: v_or_b32_e32 v5, v5, v22
+; GFX11-NEXT: v_lshlrev_b16 v22, 8, v66
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v65
+; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GFX11-NEXT: v_lshlrev_b16 v24, 8, v64
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v55
+; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v22
+; GFX11-NEXT: v_or_b32_e32 v21, v23, v21
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v24
+; GFX11-NEXT: v_or_b32_e32 v22, v25, v54
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v52
+; GFX11-NEXT: v_lshlrev_b16 v20, 8, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GFX11-NEXT: v_lshlrev_b16 v24, 8, v51
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v50
+; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GFX11-NEXT: v_lshlrev_b16 v48, 8, v48
+; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39
+; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v53
+; GFX11-NEXT: v_or_b32_e32 v20, v23, v20
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v24
+; GFX11-NEXT: v_or_b32_e32 v23, v25, v49
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v48
+; GFX11-NEXT: v_or_b32_e32 v19, v39, v19
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v21
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v22
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v20
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v23
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v19
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GFX11-NEXT: v_lshlrev_b16 v19, 8, v38
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v37
+; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GFX11-NEXT: v_lshlrev_b16 v22, 8, v35
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v34
+; GFX11-NEXT: v_lshlrev_b16 v18, 8, v18
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GFX11-NEXT: v_lshlrev_b16 v24, 8, v33
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GFX11-NEXT: v_lshlrev_b16 v69, 8, v69
+; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v68
+; GFX11-NEXT: v_lshlrev_b16 v67, 8, v67
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT: v_or_b32_e32 v19, v20, v21
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v22
+; GFX11-NEXT: v_or_b32_e32 v18, v23, v18
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v24
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v32
+; GFX11-NEXT: v_lshlrev_b16 v21, 8, v31
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GFX11-NEXT: v_lshlrev_b16 v22, 8, v30
+; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v29
+; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GFX11-NEXT: v_lshlrev_b16 v24, 8, v28
+; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27
+; GFX11-NEXT: v_lshlrev_b16 v26, 8, v26
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v69
+; GFX11-NEXT: v_or_b32_e32 v67, v68, v67
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v21
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v22
+; GFX11-NEXT: v_or_b32_e32 v17, v23, v17
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v24
+; GFX11-NEXT: v_or_b32_e32 v21, v25, v26
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v67
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v18
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v20
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v17
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v21
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i32> %a, splat (i32 3)
+ %a2 = bitcast <16 x i32> %a1 to <64 x i8>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i32> %a to <64 x i8>
+ br label %end
+
+end:
+ %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <64 x i8> %phi
+}
+
+define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64i8_to_v16i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v38, v14
+; GCN-NEXT: v_mov_b32_e32 v34, v12
+; GCN-NEXT: v_mov_b32_e32 v37, v10
+; GCN-NEXT: v_mov_b32_e32 v33, v8
+; GCN-NEXT: v_mov_b32_e32 v36, v6
+; GCN-NEXT: v_mov_b32_e32 v32, v4
+; GCN-NEXT: v_mov_b32_e32 v35, v2
+; GCN-NEXT: v_mov_b32_e32 v31, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96
+; GCN-NEXT: s_waitcnt expcnt(2)
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68
+; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56
+; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40
+; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44
+; GCN-NEXT: s_waitcnt vmcnt(10)
+; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116
+; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB13_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31
+; GCN-NEXT: v_or_b32_e32 v0, v0, v42
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32
+; GCN-NEXT: v_or_b32_e32 v1, v1, v41
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33
+; GCN-NEXT: v_or_b32_e32 v2, v2, v40
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34
+; GCN-NEXT: v_or_b32_e32 v3, v3, v55
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36
+; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38
+; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16
+; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54
+; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53
+; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61
+; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19
+; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32
+; GCN-NEXT: v_or_b32_e32 v17, v33, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v20, v20, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v23, v24, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26
+; GCN-NEXT: v_or_b32_e32 v26, v28, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30
+; GCN-NEXT: v_or_b32_e32 v27, v35, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_or_b32_e32 v29, v36, v44
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37
+; GCN-NEXT: v_or_b32_e32 v31, v38, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_or_b32_e32 v9, v9, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v11, v11, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v13, v13, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_or_b32_e32 v15, v15, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_or_b32_e32 v18, v18, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v7, v32, v7
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v6, v32, v6
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v5, v32, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v4, v32, v4
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v21, v32, v21
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v22, v32, v22
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v24, v32, v24
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v28, v32, v28
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GCN-NEXT: v_or_b32_e32 v25, v59, v25
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_or_b32_e32 v30, v57, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GCN-NEXT: v_or_b32_e32 v32, v58, v8
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v34, v8, v10
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v12, v8, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v14, v8, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v8, v16
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT: v_or_b32_e32 v19, v46, v19
+; GCN-NEXT: v_or_b32_e32 v0, v0, v7
+; GCN-NEXT: v_or_b32_e32 v1, v1, v6
+; GCN-NEXT: v_or_b32_e32 v2, v2, v5
+; GCN-NEXT: v_or_b32_e32 v3, v3, v4
+; GCN-NEXT: v_or_b32_e32 v4, v17, v21
+; GCN-NEXT: v_or_b32_e32 v5, v20, v22
+; GCN-NEXT: v_or_b32_e32 v6, v23, v24
+; GCN-NEXT: v_or_b32_e32 v7, v26, v28
+; GCN-NEXT: v_or_b32_e32 v8, v27, v25
+; GCN-NEXT: v_or_b32_e32 v9, v29, v30
+; GCN-NEXT: v_or_b32_e32 v10, v31, v32
+; GCN-NEXT: v_or_b32_e32 v11, v33, v34
+; GCN-NEXT: v_or_b32_e32 v12, v35, v12
+; GCN-NEXT: v_or_b32_e32 v13, v13, v14
+; GCN-NEXT: v_or_b32_e32 v14, v15, v16
+; GCN-NEXT: v_or_b32_e32 v15, v18, v19
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; kill: killed $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; kill: killed $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; kill: killed $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; kill: killed $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; kill: killed $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; kill: killed $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; kill: killed $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; kill: killed $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; kill: killed $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; kill: killed $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; kill: killed $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; kill: killed $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; kill: killed $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; kill: killed $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; kill: killed $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; kill: killed $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: .LBB13_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB13_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NEXT: v_or_b32_e32 v0, v42, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_or_b32_e32 v1, v41, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: v_or_b32_e32 v2, v40, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: v_or_b32_e32 v3, v55, v3
+; GCN-NEXT: s_movk_i32 s7, 0x300
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35
+; GCN-NEXT: s_mov_b32 s6, 0x3000000
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63
+; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61
+; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
+; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22
+; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25
+; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26
+; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28
+; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30
+; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
+; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33
+; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21
+; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19
+; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GCN-NEXT: v_or_b32_e32 v6, v17, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9
+; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v9, v9, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v11, v11, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; GCN-NEXT: v_or_b32_e32 v13, v29, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15
+; GCN-NEXT: v_or_b32_e32 v15, v27, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GCN-NEXT: v_or_b32_e32 v17, v44, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22
+; GCN-NEXT: v_or_b32_e32 v20, v45, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23
+; GCN-NEXT: v_or_b32_e32 v23, v56, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v25, v25, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v27, v27, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32
+; GCN-NEXT: v_or_b32_e32 v29, v43, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_or_b32_e32 v19, v47, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v31, v31, v36
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v8, v32, v8
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v5, v32, v5
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v4, v32, v4
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v7, v32, v7
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v10, v32, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v12, v32, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v14, v32, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15
+; GCN-NEXT: v_or_b32_e32 v16, v59, v16
+; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17
+; GCN-NEXT: v_or_b32_e32 v18, v57, v18
+; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20
+; GCN-NEXT: v_or_b32_e32 v22, v58, v22
+; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v24, v32, v24
+; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v26, v32, v26
+; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v28, v32, v28
+; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v21, v32, v21
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19
+; GCN-NEXT: v_or_b32_e32 v30, v46, v30
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_or_b32_e32 v0, v31, v0
+; GCN-NEXT: v_or_b32_e32 v1, v8, v1
+; GCN-NEXT: v_or_b32_e32 v2, v5, v2
+; GCN-NEXT: v_or_b32_e32 v3, v4, v3
+; GCN-NEXT: v_or_b32_e32 v4, v7, v6
+; GCN-NEXT: v_or_b32_e32 v5, v10, v9
+; GCN-NEXT: v_or_b32_e32 v6, v12, v11
+; GCN-NEXT: v_or_b32_e32 v7, v14, v13
+; GCN-NEXT: v_or_b32_e32 v8, v16, v15
+; GCN-NEXT: v_or_b32_e32 v9, v18, v17
+; GCN-NEXT: v_or_b32_e32 v10, v22, v20
+; GCN-NEXT: v_or_b32_e32 v11, v24, v23
+; GCN-NEXT: v_or_b32_e32 v12, v26, v25
+; GCN-NEXT: v_or_b32_e32 v13, v28, v27
+; GCN-NEXT: v_or_b32_e32 v14, v21, v29
+; GCN-NEXT: v_or_b32_e32 v15, v30, v19
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15
+; GCN-NEXT: .LBB13_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v64i8_to_v16i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104
+; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128
+; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23
+; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27
+; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29
+; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17
+; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19
+; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
+; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
+; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3
+; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v5
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v7
+; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v9
+; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v11
+; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13
+; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54
+; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4
+; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6
+; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8
+; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10
+; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12
+; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14
+; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24
+; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30
+; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47
+; VI-NEXT: s_waitcnt vmcnt(10)
+; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63
+; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB13_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr30
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr27
+; VI-NEXT: ; implicit-def: $vgpr23
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr20
+; VI-NEXT: ; implicit-def: $vgpr26
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr22
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr28
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: .LBB13_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB13_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: v_mov_b32_e32 v15, 0x300
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_add_u16_e32 v9, 3, v51
+; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_add_u16_e32 v10, 3, v24
+; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v11, 3, v30
+; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v12, 3, v60
+; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v13, 3, v47
+; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v14, 3, v42
+; VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v17, 3, v17
+; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v0, 0x300, v0
+; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v2, 0x300, v2
+; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_or_b32_e32 v1, v2, v3
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v16, 3, v53
+; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v8, 3, v8
+; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v2, 0x300, v2
+; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v2, v2, v3
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v4
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v4, 0x300, v4
+; VI-NEXT: v_or_b32_e32 v4, v4, v5
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v5, 0x300, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v6
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v6, 0x300, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v7
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v7, 0x300, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v8
+; VI-NEXT: v_add_u16_e32 v8, 3, v52
+; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v8, 0x300, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v9
+; VI-NEXT: v_add_u16_e32 v9, 3, v50
+; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v9, 0x300, v9
+; VI-NEXT: v_or_b32_e32 v9, v9, v10
+; VI-NEXT: v_add_u16_e32 v10, 3, v49
+; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v10, 0x300, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v11
+; VI-NEXT: v_add_u16_e32 v11, 3, v63
+; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v11, 0x300, v11
+; VI-NEXT: v_or_b32_e32 v11, v11, v12
+; VI-NEXT: v_add_u16_e32 v12, 3, v48
+; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v12, 0x300, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v13
+; VI-NEXT: v_add_u16_e32 v13, 3, v44
+; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v13, 0x300, v13
+; VI-NEXT: v_or_b32_e32 v13, v13, v14
+; VI-NEXT: v_add_u16_e32 v14, 3, v39
+; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v14, 0x300, v14
+; VI-NEXT: v_or_b32_e32 v14, v14, v16
+; VI-NEXT: v_add_u16_e32 v16, 3, v19
+; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v16, 0x300, v16
+; VI-NEXT: v_or_b32_e32 v15, v16, v15
+; VI-NEXT: .LBB13_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64i8_to_v16i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104
+; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128
+; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4
+; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21
+; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23
+; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27
+; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29
+; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17
+; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19
+; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116
+; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124
+; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v7
+; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v9
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13
+; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15
+; GFX9-NEXT: s_waitcnt vmcnt(25)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54
+; GFX9-NEXT: s_waitcnt vmcnt(24)
+; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(23)
+; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4
+; GFX9-NEXT: s_waitcnt vmcnt(21)
+; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6
+; GFX9-NEXT: s_waitcnt
vmcnt(20) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: buffer_load_dword 
v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; 
implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: .LBB13_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 +; GFX9-NEXT: 
v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX9-NEXT: .LBB13_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i8_to_v16i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 
offset:40 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v54, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v55, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v117, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v112, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v87, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v96, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v97, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v0 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(25) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: s_waitcnt vmcnt(21) +; GFX11-NEXT: v_lshlrev_b16 v98, 8, v14 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-NEXT: s_waitcnt vmcnt(19) +; GFX11-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-NEXT: s_waitcnt vmcnt(17) +; GFX11-NEXT: v_lshlrev_b16 v83, 8, v83 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v129 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v118 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v117 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v113 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v116 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v101 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v102 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v103 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v112 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v11 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v68 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v87 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v96 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v97 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v98 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v99 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v81 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v82 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v85 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v51 +; GFX11-NEXT: 
v_and_b32_e32 v18, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v65 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v66 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v67 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v17, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v18, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v19, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v20, v24, v25 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; 
implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: .LBB13_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v5, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v37, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v118, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v119, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v2, v117, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v114, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v4, v115, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v116, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v113, v6 +; GFX11-NEXT: v_add_nc_u16 v7, v16, 3 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v6, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v101, v7 +; GFX11-NEXT: v_or_b32_e32 v6, v100, v6 +; GFX11-NEXT: v_or_b32_e32 v8, v102, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v103, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v112, v10 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_add_nc_u16 v6, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v10, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v11, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v71, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v68, 3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; 
GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v87, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v96, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v97, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v98, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v99, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v81, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v82, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v83, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v84, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v85, v15 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_add_nc_u16 v11, v64, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v55, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v54, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v53, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v52, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v51, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v50, 3 +; GFX11-NEXT: v_add_nc_u16 v20, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v48, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v39, 3 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_or_b32_e32 v11, v27, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v29, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v65, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v66, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v67, v15 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v25, v24 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; 
GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <16 x i32> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <16 x i32> + br label %end + +end: + %phi = phi <16 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x i32> %phi +} + +define <8 x i64> @bitcast_v16f32_to_v8i64(<16 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f32_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f32_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB14_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB14_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f32_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, 
v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB14_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB14_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f32_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + +define <16 x float> @bitcast_v8i64_to_v16f32(<8 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i64_to_v16f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: 
v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i64_to_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i64_to_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB15_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i64_to_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB15_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: 
v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB15_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + +define <8 x double> @bitcast_v16f32_to_v8f64(<16 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f32_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f32_to_v8f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB16_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB16_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f32_to_v8f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB16_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f32_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + +define <16 x float> @bitcast_v8f64_to_v16f32(<8 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f64_to_v16f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f64_to_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; 
VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f64_to_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB17_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB17_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f64_to_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB17_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB17_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + +define <32 x i16> @bitcast_v16f32_to_v32i16(<16 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f32_to_v32i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v30, v15 +; GCN-NEXT: v_mov_b32_e32 v28, v14 +; GCN-NEXT: v_mov_b32_e32 v26, v13 +; GCN-NEXT: v_mov_b32_e32 v24, v12 +; GCN-NEXT: v_mov_b32_e32 v22, v11 +; GCN-NEXT: v_mov_b32_e32 v20, v10 +; GCN-NEXT: v_mov_b32_e32 v18, v9 +; GCN-NEXT: v_mov_b32_e32 v32, v8 +; GCN-NEXT: v_mov_b32_e32 v14, v7 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NEXT: v_mov_b32_e32 v8, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v3 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; 
implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB18_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_add_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_add_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_add_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_add_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_add_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_add_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_add_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_add_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB18_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v16, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f32_to_v32i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 
1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f32_to_v32i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f32_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + +define <16 x float> @bitcast_v32i16_to_v16f32(<32 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i16_to_v16f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 
v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_4 +; GCN-NEXT: .LBB19_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB19_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; GCN-NEXT: v_or_b32_e32 v0, v0, v54 +; GCN-NEXT: v_or_b32_e32 v1, v1, v55 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v2, v2, v39 +; GCN-NEXT: v_or_b32_e32 v3, v3, v48 +; GCN-NEXT: v_or_b32_e32 v4, v4, v49 +; GCN-NEXT: v_or_b32_e32 v5, v5, v50 +; GCN-NEXT: v_or_b32_e32 v6, v6, v51 +; GCN-NEXT: v_or_b32_e32 v7, v7, v52 +; GCN-NEXT: v_or_b32_e32 v8, v8, v17 +; GCN-NEXT: v_or_b32_e32 v9, v9, v19 +; GCN-NEXT: v_or_b32_e32 v10, v10, v21 +; GCN-NEXT: v_or_b32_e32 v11, v11, v23 +; GCN-NEXT: v_or_b32_e32 v12, v12, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v27 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_or_b32_e32 v15, v15, v53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; 
implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB19_2 +; GCN-NEXT: .LBB19_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v0, v54, v0 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_or_b32_e32 v4, v49, v4 +; GCN-NEXT: v_or_b32_e32 v5, v50, v5 +; GCN-NEXT: v_or_b32_e32 v6, v51, v6 +; GCN-NEXT: v_or_b32_e32 v7, v52, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v19, v9 +; GCN-NEXT: v_or_b32_e32 v10, v21, v10 +; GCN-NEXT: v_or_b32_e32 v11, v23, v11 +; GCN-NEXT: v_or_b32_e32 v12, v25, v12 +; GCN-NEXT: v_or_b32_e32 v13, v27, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_or_b32_e32 v15, v53, v15 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i16_to_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_add_u16_e32 v16, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_u16_e32 v16, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v16, v14 +; VI-NEXT: v_add_u16_e32 v16, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_add_u16_e32 v16, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; VI-NEXT: v_add_u16_e32 v16, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v16, v11 +; VI-NEXT: v_add_u16_e32 v16, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v16, v10 +; VI-NEXT: v_add_u16_e32 v16, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v16, v9 +; VI-NEXT: v_add_u16_e32 v16, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v16, v8 +; VI-NEXT: v_add_u16_e32 v16, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_add_u16_e32 v16, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v16, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_add_u16_e32 v16, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v16, v2 +; VI-NEXT: v_add_u16_e32 v16, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v16, v1 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB19_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i16_to_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB19_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i16_to_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB19_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + +define <32 x half> @bitcast_v16f32_to_v32f16(<16 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v16f32_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 
4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v33, v15 +; GCN-NEXT: v_mov_b32_e32 v34, v14 +; GCN-NEXT: v_mov_b32_e32 v35, v13 +; GCN-NEXT: v_mov_b32_e32 v36, v12 +; GCN-NEXT: v_mov_b32_e32 v37, v11 +; GCN-NEXT: v_mov_b32_e32 v38, v10 +; GCN-NEXT: v_mov_b32_e32 v39, v9 +; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: v_mov_b32_e32 v49, v7 +; GCN-NEXT: v_mov_b32_e32 v50, v6 +; GCN-NEXT: v_mov_b32_e32 v51, v5 +; GCN-NEXT: v_mov_b32_e32 v52, v4 +; GCN-NEXT: v_mov_b32_e32 v53, v3 +; GCN-NEXT: v_mov_b32_e32 v54, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v50 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v51 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v52 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v53 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v54 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v54 +; GCN-NEXT: 
v_cvt_f32_f16_e32 v2, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: .LBB20_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v32 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v55 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v54 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v53 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v52 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v51 +; GCN-NEXT: v_add_f32_e32 v6, 1.0, v50 +; GCN-NEXT: v_add_f32_e32 v7, 1.0, v49 +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v48 +; GCN-NEXT: v_add_f32_e32 v9, 1.0, v39 +; GCN-NEXT: v_add_f32_e32 v10, 1.0, v38 +; GCN-NEXT: v_add_f32_e32 v11, 1.0, v37 +; GCN-NEXT: v_add_f32_e32 v12, 1.0, v36 +; GCN-NEXT: v_add_f32_e32 v13, 1.0, v35 +; GCN-NEXT: v_add_f32_e32 v14, 1.0, v34 +; GCN-NEXT: v_add_f32_e32 v15, 1.0, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 
v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 +; GCN-NEXT: .LBB20_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v16f32_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; VI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; VI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; VI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; VI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; VI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; VI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v16f32_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13 +; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12 +; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11 +; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10 +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7 +; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v16f32_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14 +; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12 +; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10 +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6 +; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4 +; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2 +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + +define <16 x float> @bitcast_v32f16_to_v16f32(<32 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f16_to_v16f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, 
v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GCN-NEXT: v_or_b32_e32 v0, v44, v0 +; GCN-NEXT: v_or_b32_e32 v1, v42, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_or_b32_e32 v3, v50, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v36, v6 +; GCN-NEXT: v_or_b32_e32 v7, v34, v7 +; GCN-NEXT: v_or_b32_e32 v8, v33, v8 +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: v_or_b32_e32 v10, v31, v10 +; GCN-NEXT: v_or_b32_e32 v11, v21, v11 +; GCN-NEXT: v_or_b32_e32 v12, v19, v12 +; GCN-NEXT: v_or_b32_e32 v13, v18, v13 +; GCN-NEXT: v_or_b32_e32 v14, v17, v14 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB21_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 
v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; 
GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v26, v24 +; GCN-NEXT: v_or_b32_e32 v10, v28, v27 +; GCN-NEXT: v_or_b32_e32 v11, v21, v29 +; GCN-NEXT: v_or_b32_e32 v12, v19, v25 +; GCN-NEXT: v_or_b32_e32 v13, v18, v23 +; GCN-NEXT: v_or_b32_e32 v14, v17, v22 +; GCN-NEXT: v_or_b32_e32 v15, v16, v20 +; GCN-NEXT: .LBB21_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f16_to_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB21_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, 
v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB21_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB21_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; 
GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32f16_to_v16f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB21_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB21_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <32 x half> %a1 to <16 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x half> %a to <16 x float>
+ br label %end
+
+end:
+ %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x float> %phi
+}
+
+define <32 x bfloat> @bitcast_v16f32_to_v32bf16(<16 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f32_to_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v55, v15
+; GCN-NEXT: v_mov_b32_e32 v54, v14
+; GCN-NEXT: v_mov_b32_e32 v53, v13
+; GCN-NEXT: v_mov_b32_e32 v52, v12
+; GCN-NEXT: v_mov_b32_e32 v51, v11
+; GCN-NEXT: v_mov_b32_e32 v50, v10
+; GCN-NEXT: v_mov_b32_e32 v49, v9
+; GCN-NEXT: v_mov_b32_e32 v48, v8
+; GCN-NEXT: v_mov_b32_e32 v39, v7
+; GCN-NEXT: v_mov_b32_e32 v38, v6
+; GCN-NEXT: v_mov_b32_e32 v37, v5
+; GCN-NEXT: v_mov_b32_e32 v36, v4
+; GCN-NEXT: v_mov_b32_e32 v35, v3
+; GCN-NEXT: v_mov_b32_e32 v34, v2
+; GCN-NEXT: v_mov_b32_e32 v33, v1
+; GCN-NEXT: v_mov_b32_e32 v32, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB22_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB22_4
+; GCN-NEXT: .LBB22_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB22_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB22_2
+; GCN-NEXT: .LBB22_4: ; %cmp.true
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v32
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v33
+; GCN-NEXT: v_add_f32_e32 v2, 1.0, v34
+; GCN-NEXT: v_add_f32_e32 v3, 1.0, v35
+; GCN-NEXT: v_add_f32_e32 v4, 1.0, v36
+; GCN-NEXT: v_add_f32_e32 v5, 1.0, v37
+; GCN-NEXT: v_add_f32_e32 v6, 1.0, v38
+; GCN-NEXT: v_add_f32_e32 v7, 1.0, v39
+; GCN-NEXT: v_add_f32_e32 v8, 1.0, v48
+; GCN-NEXT: v_add_f32_e32 v9, 1.0, v49
+; GCN-NEXT: v_add_f32_e32 v10, 1.0, v50
+; GCN-NEXT: v_add_f32_e32 v11, 1.0, v51
+; GCN-NEXT: v_add_f32_e32 v12, 1.0, v52
+; GCN-NEXT: v_add_f32_e32 v13, 1.0, v53
+; GCN-NEXT: v_add_f32_e32 v14, 1.0, v54
+; GCN-NEXT: v_add_f32_e32 v15, 1.0, v55
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16f32_to_v32bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB22_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f32_e32 v15, 1.0, v15
+; VI-NEXT: v_add_f32_e32 v14, 1.0, v14
+; VI-NEXT: v_add_f32_e32 v13, 1.0, v13
+; VI-NEXT: v_add_f32_e32 v12, 1.0, v12
+; VI-NEXT: v_add_f32_e32 v11, 1.0, v11
+; VI-NEXT: v_add_f32_e32 v10, 1.0, v10
+; VI-NEXT: v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT: v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT: v_add_f32_e32 v7, 1.0, v7
+; VI-NEXT: v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT: v_add_f32_e32 v5, 1.0, v5
+; VI-NEXT: v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT: .LBB22_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16f32_to_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB22_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: .LBB22_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16f32_to_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_dual_add_f32 v15, 1.0, v15 :: v_dual_add_f32 v14, 1.0, v14
+; GFX11-NEXT: v_dual_add_f32 v13, 1.0, v13 :: v_dual_add_f32 v12, 1.0, v12
+; GFX11-NEXT: v_dual_add_f32 v11, 1.0, v11 :: v_dual_add_f32 v10, 1.0, v10
+; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8
+; GFX11-NEXT: v_dual_add_f32 v7, 1.0, v7 :: v_dual_add_f32 v6, 1.0, v6
+; GFX11-NEXT: v_dual_add_f32 v5, 1.0, v5 :: v_dual_add_f32 v4, 1.0, v4
+; GFX11-NEXT: v_dual_add_f32 v3, 1.0, v3 :: v_dual_add_f32 v2, 1.0, v2
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: .LBB22_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <16 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <16 x float> %a1 to <32 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x float> %a to <32 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x bfloat> %phi
+}
+
+define <16 x float> @bitcast_v32bf16_to_v16f32(<32 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32bf16_to_v16f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21
+;
GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB23_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 +; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 +; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 +; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 +; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 +; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB23_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB23_4 +; GCN-NEXT: ; %bb.3: ; 
%cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 
16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 +; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 +; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 +; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 +; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 +; GCN-NEXT: .LBB23_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32bf16_to_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v10 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, 
vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v9 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v8 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v7 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v6 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v5 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; VI-NEXT: 
v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v4 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v2 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v1 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; 
VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: .LBB23_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: 
v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: 
v_cndmask_b32_e32 v7, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v3, v3, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 
16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 +; GFX9-NEXT: .LBB23_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v18, v18, v16, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc_lo +; GFX11-NEXT: v_add3_u32 v19, v23, v14, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_bfe_u32 v20, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v20, v20, v15, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX11-NEXT: v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_perm_b32 v14, v14, v17, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add3_u32 v17, v17, v13, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v21, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_perm_b32 v13, v13, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v16, v16, v19, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v18, v18, v12, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v18, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add3_u32 v17, v17, v11, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v19, v10, 16, 1 +; GFX11-NEXT: 
v_cndmask_b32_e32 v16, v16, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v19, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v11, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v10 +; GFX11-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v10, v10, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v18, v18, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v17, v17, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v8 +; GFX11-NEXT: v_perm_b32 v9, v9, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v18, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v8, v8, v17, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21 +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add3_u32 v16, v16, v19, 
0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add3_u32 v19, v19, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v17, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v17, v17, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v6 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v19 +; GFX11-NEXT: v_add3_u32 v19, v20, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v22, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v16, v16, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v16, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v16, v22, v17, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v18, v18, v4, 0x7fff +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v20 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: 
+; GFX11-NEXT: v_perm_b32 v4, v4, v16, 0x7060302
+; GFX11-NEXT: v_add3_u32 v19, v21, v17, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v3, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-NEXT: v_add3_u32 v19, v21, v3, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v21, v22, v18, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-NEXT: v_bfe_u32 v24, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-NEXT: v_add3_u32 v20, v24, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v21, v22, v19, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v23
+; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-NEXT: v_bfe_u32 v24, v20, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v20
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v24, v24, v20, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v23, v1, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_add3_u32 v22, v23, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v24, v25, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-NEXT: .LBB23_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0)
+ %a2 = bitcast <32 x bfloat> %a1 to <16 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x bfloat> %a to <16 x float>
+ br label %end
+
+end:
+ %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x float> %phi
+}
+
+define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v16f32_to_v64i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB24_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16
+; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8
+; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24
+; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16
+; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8
+; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24
+; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16
+; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8
+; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24
+; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16
+; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8
+; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24
+; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16
+; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8
+; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24
+; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16
+; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8
+; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24
+; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16
+; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GCN-NEXT: .LBB24_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB24_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_add_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16
+; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8
+; GCN-NEXT: v_alignbit_b32 v24, v12, v11, 24
+; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16
+; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8
+; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24
+; GCN-NEXT: v_alignbit_b32 v32, v10, v9, 16
+; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8
+; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24
+; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16
+; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8
+; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24
+; GCN-NEXT: v_alignbit_b32 v51, v6, v5, 16
+; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8
+; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24
+; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16
+; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8
+; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24
+; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16
+; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 24, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GCN-NEXT: .LBB24_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60
+; GCN-NEXT: s_waitcnt expcnt(3)
+; GCN-NEXT: v_or_b32_e32 v60, v1, v18
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17
+; GCN-NEXT: v_or_b32_e32 v17, v1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58
+; GCN-NEXT: v_or_b32_e32 v58, v1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v63, v1, v2
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46
+; GCN-NEXT: v_or_b32_e32 v46, v1, v2
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44
+; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62
+; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59
+; GCN-NEXT: v_or_b32_e32 v44, v1, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54
+; GCN-NEXT: v_or_b32_e32 v7, v2, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47
+; GCN-NEXT: v_or_b32_e32 v54, v3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0
+; GCN-NEXT: v_and_b32_e32 v47, 0xff, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48
+; GCN-NEXT: v_or_b32_e32 v9, v4, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 24, v42
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55
+; GCN-NEXT: v_or_b32_e32 v48, v5, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0
+; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34
+; GCN-NEXT: v_or_b32_e32 v34, v6, v8
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v50
+; GCN-NEXT: v_or_b32_e32 v11, v8, v10
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0
+; GCN-NEXT: v_and_b32_e32 v50, 0xff, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28
+; GCN-NEXT: v_or_b32_e32 v13, v10, v12
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0
+; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v38
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35
+; GCN-NEXT: v_or_b32_e32 v28, v12, v14
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0
+; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v24
+; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21
+; GCN-NEXT: v_or_b32_e32 v21, v14, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0
+; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v31
+; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29
+; GCN-NEXT: v_or_b32_e32 v29, v15, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0
+; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18
+; GCN-NEXT: v_or_b32_e32 v31, v19, v16
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0
+; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45
+; GCN-NEXT: v_or_b32_e32 v27, v61, v18
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41
+; GCN-NEXT: v_or_b32_e32 v40, v40, v19
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57
+; GCN-NEXT: v_or_b32_e32 v56, v56, v22
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58
+; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63
+; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46
+; GCN-NEXT: v_or_b32_e32 v49, v49, v47
+; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44
+; GCN-NEXT: v_or_b32_e32 v51, v51, v43
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_or_b32_e32 v36, v36, v55
+; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54
+; GCN-NEXT: v_or_b32_e32 v37, v37, v53
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
+; GCN-NEXT: v_or_b32_e32 v30, v30, v50
+; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48
+; GCN-NEXT: v_or_b32_e32 v32, v32, v39
+; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34
+; GCN-NEXT: v_or_b32_e32 v24, v24, v35
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_or_b32_e32 v25, v25, v38
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_or_b32_e32 v20, v20, v52
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_or_b32_e32 v26, v26, v42
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GCN-NEXT: v_or_b32_e32 v33, v33, v45
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_or_b32_e32 v23, v41, v23
+; GCN-NEXT: v_or_b32_e32 v31, v57, v31
+; GCN-NEXT: v_or_b32_e32 v17, v17, v27
+; GCN-NEXT: v_or_b32_e32 v27, v58, v40
+; GCN-NEXT: v_or_b32_e32 v35, v59, v56
+; GCN-NEXT: v_or_b32_e32 v38, v46, v49
+; GCN-NEXT: v_or_b32_e32 v39, v44, v51
+; GCN-NEXT: v_or_b32_e32 v7, v7, v36
+; GCN-NEXT: v_or_b32_e32 v36, v54, v37
+; GCN-NEXT: v_or_b32_e32 v9, v9, v30
+; GCN-NEXT: v_or_b32_e32 v30, v48, v32
+; GCN-NEXT: v_or_b32_e32 v24, v34, v24
+; GCN-NEXT: v_or_b32_e32 v11, v11, v25
+; GCN-NEXT: v_or_b32_e32 v13, v13, v20
+; GCN-NEXT: v_or_b32_e32 v20, v28, v26
+; GCN-NEXT: v_or_b32_e32 v21, v21, v33
+; GCN-NEXT: v_or_b32_e32 v23, v29, v23
+; GCN-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v35, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v16f32_to_v64i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; implicit-def: $vgpr27
+; VI-NEXT: ; implicit-def: $vgpr22
+; VI-NEXT: ; implicit-def: $vgpr28
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr30
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr26
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr20
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB24_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; VI-NEXT: v_mov_b32_e32 v26, v22
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2]
+; VI-NEXT: .LBB24_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB24_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_add_f32_e32 v16, 1.0, v16
+; VI-NEXT: v_add_f32_e32 v15, 1.0, v15
+; VI-NEXT: v_add_f32_e32 v14, 1.0, v14
+; VI-NEXT: v_add_f32_e32 v13, 1.0, v13
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; VI-NEXT: v_add_f32_e32 v12, 1.0, v12
+; VI-NEXT: v_add_f32_e32 v11, 1.0, v11
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT: v_add_f32_e32 v10, 1.0, v10
+; VI-NEXT: v_add_f32_e32 v9, 1.0, v9
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_add_f32_e32 v4, 1.0, v4
+; VI-NEXT: v_add_f32_e32 v3, 1.0, v3
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; VI-NEXT: v_add_f32_e32 v2, 1.0, v2
+; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_add_f32_e32 v8, 1.0, v8
+; VI-NEXT: v_add_f32_e32 v7, 1.0, v7
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
+; VI-NEXT: v_add_f32_e32 v6, 1.0, v6
+; VI-NEXT: v_add_f32_e32 v5, 1.0, v5
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: .LBB24_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22
+; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24
+; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19
+; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17
+; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v16f32_to_v64i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr29
+; GFX9-NEXT: ; implicit-def: $vgpr27
+; GFX9-NEXT: ; implicit-def: $vgpr23
+; GFX9-NEXT: ; implicit-def: $vgpr28
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr22
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr31
+; GFX9-NEXT: ; implicit-def: $vgpr30
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr26
+; GFX9-NEXT: ; implicit-def: $vgpr25
+; GFX9-NEXT: ; implicit-def: $vgpr21
+; GFX9-NEXT: ; implicit-def: $vgpr20
+; GFX9-NEXT: ; implicit-def: $vgpr19
+; GFX9-NEXT: ; implicit-def: $vgpr18
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB24_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; GFX9-NEXT: v_mov_b32_e32 v26, v23
+; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2]
+; GFX9-NEXT: .LBB24_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB24_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: v_add_f32_e32 v16, 1.0, v16
+; GFX9-NEXT: v_add_f32_e32 v15, 1.0, v15
+; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14
+; GFX9-NEXT: v_add_f32_e32 v13, 1.0, v13
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX9-NEXT: v_add_f32_e32 v12, 1.0, v12
+; GFX9-NEXT: v_add_f32_e32 v11, 1.0, v11
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX9-NEXT: v_add_f32_e32 v10, 1.0, v10
+; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9
+; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8
+; GFX9-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX9-NEXT: v_add_f32_e32 v4, 1.0, v4
+; GFX9-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4]
+; GFX9-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-NEXT: v_add_f32_e32 v5, 1.0, v5
+; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2]
+; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: .LBB24_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29
+; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60
+; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44
+; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21
+; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55
+; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20
+; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50
+; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19
+; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
+; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32
+; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v16f32_to_v64i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-NEXT: ; implicit-def: $vgpr25
+; GFX11-NEXT: ; implicit-def: $vgpr96
+; GFX11-NEXT: ; implicit-def: $vgpr24
+; GFX11-NEXT: ; implicit-def: $vgpr87
+; GFX11-NEXT: ; implicit-def: $vgpr86
+; GFX11-NEXT: ; implicit-def: $vgpr85
+; GFX11-NEXT: ; implicit-def: $vgpr84
+; GFX11-NEXT: ; implicit-def: $vgpr83
+; GFX11-NEXT: ; implicit-def: $vgpr23
+; GFX11-NEXT: ; implicit-def: $vgpr82
+; GFX11-NEXT: ; implicit-def: $vgpr81
+; GFX11-NEXT: ; implicit-def: $vgpr80
+; GFX11-NEXT: ; implicit-def: $vgpr71
+; GFX11-NEXT: ; implicit-def: $vgpr70
+; GFX11-NEXT: ; implicit-def: $vgpr22
+; GFX11-NEXT: ; implicit-def: $vgpr69
+; GFX11-NEXT: ; implicit-def: $vgpr68
+; GFX11-NEXT: ; implicit-def: $vgpr67
+; GFX11-NEXT: ; implicit-def: $vgpr66
+; GFX11-NEXT: ; implicit-def: $vgpr65
+; GFX11-NEXT: ; implicit-def: $vgpr21
+; GFX11-NEXT: ; implicit-def: $vgpr64
+; GFX11-NEXT: ; implicit-def: $vgpr55
+; GFX11-NEXT: ; implicit-def: $vgpr54
+; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB24_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v8, 1.0, v8 
:: v_dual_add_f32 v15, 1.0, v15 +; GFX11-NEXT: v_dual_add_f32 v16, 1.0, v16 :: v_dual_add_f32 v7, 1.0, v7 +; GFX11-NEXT: v_dual_add_f32 v10, 1.0, v10 :: v_dual_add_f32 v13, 1.0, v13 +; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v9, 1.0, v9 +; GFX11-NEXT: v_dual_add_f32 v12, 1.0, v12 :: v_dual_add_f32 v11, 1.0, v11 +; GFX11-NEXT: v_dual_add_f32 v6, 1.0, v6 :: v_dual_add_f32 v5, 1.0, v5 +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_dual_add_f32 v4, 1.0, v4 :: v_dual_add_f32 v3, 1.0, v3 +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB24_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_and_b32_e32 v96, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v25 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v87 +; GFX11-NEXT: v_or_b32_e32 v24, v96, v24 +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v86, v85 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 
+; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v83 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v24 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v80, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v71 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v70 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v84 +; GFX11-NEXT: v_or_b32_e32 v23, v83, v23 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v80 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v71 +; GFX11-NEXT: v_or_b32_e32 v22, v70, v22 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v23 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v25 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-NEXT: v_or_b32_e32 v21, v23, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v22, v25, v54 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v52 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v50 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v24 +; GFX11-NEXT: v_or_b32_e32 v23, v25, v49 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v48 +; GFX11-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v22 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v35 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 
v24, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v69 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v68 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v19, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v22 +; GFX11-NEXT: v_or_b32_e32 v18, v23, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v30 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v26 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v69 +; GFX11-NEXT: v_or_b32_e32 v67, v68, v67 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v22 +; GFX11-NEXT: v_or_b32_e32 v17, v23, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v21 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <16 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <16 x float> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <16 x float> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + +define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i8_to_v16f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; 
GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v34, v12 +; GCN-NEXT: v_mov_b32_e32 v37, v10 +; GCN-NEXT: v_mov_b32_e32 v33, v8 +; GCN-NEXT: v_mov_b32_e32 v36, v6 +; GCN-NEXT: v_mov_b32_e32 v32, v4 +; GCN-NEXT: v_mov_b32_e32 v35, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; 
GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_or_b32_e32 v0, v0, v42 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_or_b32_e32 v1, v1, v41 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v40 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_or_b32_e32 v3, v3, v55 
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; GCN-NEXT: v_or_b32_e32 v17, v33, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GCN-NEXT: v_or_b32_e32 v26, v28, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_or_b32_e32 v27, v35, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_or_b32_e32 v29, v36, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 +; GCN-NEXT: v_or_b32_e32 v31, v38, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v9, v9, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v15, v15, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v18, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v25, v59, v25 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v30, v57, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v32, v58, v8 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v8, v10 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v8, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v8, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v8, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v19, v46, v19 +; GCN-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN-NEXT: v_or_b32_e32 v1, v1, v6 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v4, v17, v21 +; GCN-NEXT: v_or_b32_e32 v5, v20, v22 +; GCN-NEXT: v_or_b32_e32 v6, v23, v24 +; GCN-NEXT: v_or_b32_e32 v7, v26, v28 +; GCN-NEXT: v_or_b32_e32 v8, v27, v25 +; GCN-NEXT: v_or_b32_e32 v9, v29, v30 +; GCN-NEXT: v_or_b32_e32 v10, v31, v32 +; GCN-NEXT: v_or_b32_e32 v11, v33, v34 +; GCN-NEXT: v_or_b32_e32 v12, v35, v12 +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v16 +; GCN-NEXT: v_or_b32_e32 v15, v18, v19 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; 
implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; kill: killed $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: .LBB25_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v42, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v41, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v40, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v55, v3 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 +; GCN-NEXT: 
v_add_i32_e32 v22, vcc, 3, v53 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GCN-NEXT: v_or_b32_e32 v6, v17, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v29, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_or_b32_e32 v15, v27, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GCN-NEXT: v_or_b32_e32 v17, v44, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; GCN-NEXT: v_or_b32_e32 v20, v45, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GCN-NEXT: v_or_b32_e32 v23, v56, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; GCN-NEXT: v_or_b32_e32 v29, v43, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v19, v47, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v31, v36 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; GCN-NEXT: v_or_b32_e32 v16, v59, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; GCN-NEXT: v_or_b32_e32 v18, v57, v18 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; GCN-NEXT: v_or_b32_e32 v22, v58, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 +; GCN-NEXT: v_or_b32_e32 v30, v46, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_or_b32_e32 v0, v31, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v10, v9 +; 
GCN-NEXT: v_or_b32_e32 v6, v12, v11 +; GCN-NEXT: v_or_b32_e32 v7, v14, v13 +; GCN-NEXT: v_or_b32_e32 v8, v16, v15 +; GCN-NEXT: v_or_b32_e32 v9, v18, v17 +; GCN-NEXT: v_or_b32_e32 v10, v22, v20 +; GCN-NEXT: v_or_b32_e32 v11, v24, v23 +; GCN-NEXT: v_or_b32_e32 v12, v26, v25 +; GCN-NEXT: v_or_b32_e32 v13, v28, v27 +; GCN-NEXT: v_or_b32_e32 v14, v21, v29 +; GCN-NEXT: v_or_b32_e32 v15, v30, v19 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; GCN-NEXT: .LBB25_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i8_to_v16f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; 
VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: 
$vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; 
kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: .LBB25_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v15, 0x300 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v9, 3, v51 +; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v10, 3, v24 +; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v11, 3, v30 +; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v12, 3, v60 +; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v42 +; 
VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v52 +; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v50 +; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 +; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v63 +; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v48 +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v44 +; 
VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v19 +; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: .LBB25_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i8_to_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 
offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v31, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: .LBB25_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB25_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: 
v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 +; 
GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX9-NEXT: .LBB25_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i8_to_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; 
GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v54, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v55, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v117, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v112, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v87, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v96, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v97, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v0 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(25) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: s_waitcnt vmcnt(21) +; GFX11-NEXT: v_lshlrev_b16 v98, 8, v14 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-NEXT: s_waitcnt vmcnt(19) +; GFX11-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-NEXT: s_waitcnt vmcnt(17) +; GFX11-NEXT: v_lshlrev_b16 v83, 8, v83 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v129 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v130 +; 
GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB25_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB25_4 +; GFX11-NEXT: .LBB25_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB25_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v118 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v117 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v113 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v116 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v101 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v102 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v103 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v112 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v11 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v68 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v87 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v96 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v97 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v98 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v99 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v81 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v82 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v85 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 
v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v51 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v65 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v66 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v67 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v17, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v18, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v19, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v20, v24, v25 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; 
GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB25_2 +; GFX11-NEXT: .LBB25_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v5, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v37, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v118, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v119, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v2, v117, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v114, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v4, v115, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v116, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v113, v6 +; GFX11-NEXT: v_add_nc_u16 v7, v16, 3 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v6, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v101, v7 +; GFX11-NEXT: v_or_b32_e32 v6, v100, v6 +; GFX11-NEXT: v_or_b32_e32 v8, v102, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v103, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v112, v10 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_add_nc_u16 v6, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v10, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v11, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v71, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v68, 3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v87, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v96, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v97, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v98, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v99, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v81, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v82, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v83, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v84, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v85, v15 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_add_nc_u16 v11, v64, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v55, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v54, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v53, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v52, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v51, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v50, 3 +; GFX11-NEXT: v_add_nc_u16 v20, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v48, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v39, 3 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_or_b32_e32 v11, v27, v11 +; GFX11-NEXT: 
v_or_b32_e32 v12, v29, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v65, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v66, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v67, v15 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v25, v24 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <16 x float> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <16 x float> + br label %end + +end: + %phi = phi <16 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <16 x float> %phi +} + +define <8 x double> @bitcast_v8i64_to_v8f64(<8 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i64_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: .LBB26_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i64_to_v8f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i64_to_v8f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i64_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + +define <8 x i64> @bitcast_v8f64_to_v8i64(<8 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f64_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB27_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GCN-NEXT: .LBB27_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f64_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f64_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB27_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: .LBB27_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f64_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB27_2 +; 
GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: .LBB27_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + +define <32 x i16> @bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i64_to_v32i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v30, v15 +; GCN-NEXT: v_mov_b32_e32 v28, v14 +; GCN-NEXT: v_mov_b32_e32 v26, v13 +; GCN-NEXT: v_mov_b32_e32 v24, v12 +; GCN-NEXT: v_mov_b32_e32 v22, v11 +; GCN-NEXT: v_mov_b32_e32 v20, v10 +; GCN-NEXT: v_mov_b32_e32 v18, v9 +; GCN-NEXT: v_mov_b32_e32 v32, v8 +; GCN-NEXT: v_mov_b32_e32 v14, v7 +; GCN-NEXT: v_mov_b32_e32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v10, v5 +; GCN-NEXT: v_mov_b32_e32 v8, v4 +; GCN-NEXT: v_mov_b32_e32 v6, v3 +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB28_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 
v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32 +; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16 +; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16 +; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16 +; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16 +; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16 +; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: .LBB28_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v16, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i64_to_v32i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i64_to_v32i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB28_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB28_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i64_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB28_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB28_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + +define <8 x i64> @bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i16_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_4 +; GCN-NEXT: .LBB29_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB29_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; GCN-NEXT: v_or_b32_e32 v0, v0, v54 +; GCN-NEXT: v_or_b32_e32 v1, v1, v55 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v2, v2, v39 +; GCN-NEXT: v_or_b32_e32 v3, v3, v48 +; GCN-NEXT: v_or_b32_e32 v4, v4, v49 +; GCN-NEXT: v_or_b32_e32 v5, v5, v50 +; GCN-NEXT: v_or_b32_e32 v6, v6, v51 +; GCN-NEXT: v_or_b32_e32 v7, v7, v52 +; GCN-NEXT: v_or_b32_e32 v8, v8, v17 +; GCN-NEXT: v_or_b32_e32 v9, v9, v19 +; GCN-NEXT: v_or_b32_e32 v10, v10, v21 +; GCN-NEXT: v_or_b32_e32 v11, v11, v23 +; GCN-NEXT: v_or_b32_e32 v12, v12, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v27 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_or_b32_e32 v15, v15, v53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_2 +; GCN-NEXT: .LBB29_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; 
GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v0, v54, v0 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_or_b32_e32 v4, v49, v4 +; GCN-NEXT: v_or_b32_e32 v5, v50, v5 +; GCN-NEXT: v_or_b32_e32 v6, v51, v6 +; GCN-NEXT: v_or_b32_e32 v7, v52, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v19, v9 +; GCN-NEXT: v_or_b32_e32 v10, v21, v10 +; GCN-NEXT: v_or_b32_e32 v11, v23, v11 +; GCN-NEXT: v_or_b32_e32 v12, v25, v12 +; GCN-NEXT: v_or_b32_e32 v13, v27, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_or_b32_e32 v15, v53, v15 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i16_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_add_u16_e32 v16, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_u16_e32 v16, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v16, v14 +; VI-NEXT: v_add_u16_e32 v16, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_add_u16_e32 v16, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; VI-NEXT: v_add_u16_e32 v16, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v16, v11 +; VI-NEXT: v_add_u16_e32 v16, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v16, v10 +; VI-NEXT: v_add_u16_e32 v16, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v16, v9 +; VI-NEXT: v_add_u16_e32 v16, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v16, v8 +; VI-NEXT: v_add_u16_e32 v16, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_add_u16_e32 v16, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v16, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_add_u16_e32 v16, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v16, v2 +; VI-NEXT: v_add_u16_e32 v16, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v16, v1 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i16_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB29_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: 
v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB29_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i16_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB29_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB29_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + +define <32 x half> @bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i64_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v34, v15 +; GCN-NEXT: v_mov_b32_e32 v33, v14 +; GCN-NEXT: v_mov_b32_e32 v36, v13 +; GCN-NEXT: v_mov_b32_e32 v35, v12 +; GCN-NEXT: v_mov_b32_e32 v38, v11 +; GCN-NEXT: v_mov_b32_e32 v37, v10 +; GCN-NEXT: v_mov_b32_e32 v48, v9 +; GCN-NEXT: v_mov_b32_e32 v39, v8 +; GCN-NEXT: v_mov_b32_e32 v50, v7 +; GCN-NEXT: v_mov_b32_e32 v49, v6 +; GCN-NEXT: v_mov_b32_e32 v52, v5 +; GCN-NEXT: v_mov_b32_e32 v51, v4 +; GCN-NEXT: v_mov_b32_e32 v54, v3 +; GCN-NEXT: v_mov_b32_e32 v53, v2 +; GCN-NEXT: v_mov_b32_e32 v55, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; 
implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB30_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v49 +; GCN-NEXT: s_waitcnt expcnt(5) +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v52 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v51 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v54 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v53 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: 
$vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: .LBB30_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB30_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v54, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v52, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v48, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v38, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v36, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v34, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32 +; GCN-NEXT: .LBB30_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 
offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i64_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB30_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB30_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i64_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB30_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB30_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i64_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB30_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, 
vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB30_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + +define <8 x i64> @bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f16_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v17, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB31_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GCN-NEXT: v_or_b32_e32 v0, v44, v0 +; GCN-NEXT: v_or_b32_e32 v1, v42, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_or_b32_e32 v3, v50, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v36, v6 +; GCN-NEXT: v_or_b32_e32 v7, v34, v7 +; GCN-NEXT: v_or_b32_e32 v8, v33, v8 +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: v_or_b32_e32 v10, v31, v10 +; GCN-NEXT: v_or_b32_e32 v11, v21, v11 +; GCN-NEXT: v_or_b32_e32 v12, v19, v12 +; GCN-NEXT: v_or_b32_e32 v13, v18, v13 +; GCN-NEXT: v_or_b32_e32 v14, v17, v14 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB31_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB31_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v26, v24 +; GCN-NEXT: v_or_b32_e32 v10, v28, v27 +; GCN-NEXT: v_or_b32_e32 v11, v21, v29 +; GCN-NEXT: v_or_b32_e32 v12, v19, v25 +; GCN-NEXT: v_or_b32_e32 v13, v18, v23 +; GCN-NEXT: v_or_b32_e32 v14, v17, v22 +; GCN-NEXT: v_or_b32_e32 v15, v16, v20 +; GCN-NEXT: .LBB31_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f16_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB31_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; 
VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB31_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB31_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB31_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v32f16_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB31_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB31_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + +define <32 x bfloat> @bitcast_v8i64_to_v32bf16(<8 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i64_to_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v55, v15 +; GCN-NEXT: v_mov_b32_e32 v54, v14 +; GCN-NEXT: v_mov_b32_e32 v53, v13 +; GCN-NEXT: v_mov_b32_e32 v52, v12 +; GCN-NEXT: v_mov_b32_e32 v51, v11 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v49, v9 +; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: v_mov_b32_e32 v39, v7 +; GCN-NEXT: v_mov_b32_e32 v38, v6 +; GCN-NEXT: v_mov_b32_e32 v37, v5 +; GCN-NEXT: v_mov_b32_e32 v36, v4 +; GCN-NEXT: v_mov_b32_e32 v35, v3 +; GCN-NEXT: v_mov_b32_e32 v34, v2 +; GCN-NEXT: v_mov_b32_e32 v33, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 
+; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB32_4 +; GCN-NEXT: .LBB32_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB32_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB32_2 +; GCN-NEXT: .LBB32_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v35, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v37, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v39, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v49, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v51, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v53, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v55, vcc +; GCN-NEXT: v_and_b32_e32 
v31, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i64_to_v32bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB32_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB32_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i64_to_v32bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB32_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: 
v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB32_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i64_to_v32bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB32_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB32_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} + +define <8 x i64> @bitcast_v32bf16_to_v8i64(<32 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v32bf16_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: 
v_mul_f32_e32 v44, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB33_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42 +; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16 +; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16 +; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16 +; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16 +; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16 +; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16 +; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16 +; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16 +; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16 +; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16 +; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16 +; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16 +; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr53 +; 
GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB33_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB33_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GCN-NEXT: 
v_add_f32_e32 v28, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16 +; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16 +; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16 +; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16 +; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16 +; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16 +; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16 +; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16 +; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16 +; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16 +; GCN-NEXT: .LBB33_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32bf16_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB33_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v15, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: 
v_or_b32_e32 v18, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, 
vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v10 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v9 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v8 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v7 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v6, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v6 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; 
VI-NEXT: v_or_b32_e32 v18, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v5 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v4 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v3 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v2 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; 
VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v1 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16 +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: .LBB33_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB33_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v14, v14, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v13, v13, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v12, v12, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v11, v11, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v10, v10, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v9, v9, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 
v17, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v8, v8, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v6, v6, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v5, v5, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v4, v4, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc +; 
GFX9-NEXT: v_perm_b32 v3, v3, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v2, v2, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v1, v1, v16, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7 +; GFX9-NEXT: .LBB33_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB33_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v23, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v18, v18, v16, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc_lo +; GFX11-NEXT: v_add3_u32 v19, v23, v14, 0x7fff +; GFX11-NEXT: v_and_b32_e32 
v15, 0xffff0000, v15 +; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_bfe_u32 v20, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v15 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v20, v20, v15, 0x7fff +; GFX11-NEXT: v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302 +; GFX11-NEXT: v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_perm_b32 v14, v14, v17, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_add3_u32 v17, v17, v13, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v21, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_perm_b32 v13, v13, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v21 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v16, v16, v19, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: 
v_add3_u32 v18, v18, v12, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v12, v18, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v12, v12, v17, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add3_u32 v17, v17, v11, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: v_bfe_u32 v19, v10, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v19, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v11, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v10 +; GFX11-NEXT: v_perm_b32 v11, v11, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_perm_b32 v10, v10, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_bfe_u32 v18, v8, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_add3_u32 v18, v18, v8, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v17, v17, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v8 +; GFX11-NEXT: v_perm_b32 v9, v9, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: 
v_cndmask_b32_e32 v8, v18, v22, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v8, v8, v17, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21 +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_add3_u32 v16, v16, v19, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add3_u32 v19, v19, v6, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v17, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: v_add3_u32 v17, v17, v7, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v6 +; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1 +; GFX11-NEXT: v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v19 +; GFX11-NEXT: v_add3_u32 v19, v20, v18, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_bfe_u32 v22, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v16, v16, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v16, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v16, v22, v17, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; 
GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_add3_u32 v18, v18, v4, 0x7fff +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc_lo +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v20 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_perm_b32 v4, v4, v16, 0x7060302 +; GFX11-NEXT: v_add3_u32 v19, v21, v17, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v3, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v19, v21, v3, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3 +; GFX11-NEXT: v_add3_u32 v21, v22, v18, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1 +; GFX11-NEXT: v_bfe_u32 v24, v2, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v22 +; GFX11-NEXT: v_add3_u32 v20, v24, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2 +; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v21, v22, v19, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v23 +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v24, v20, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v24, v24, v20, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v23, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v22, v23, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v1 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v24, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302 +; GFX11-NEXT: .LBB33_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <8 x i64> + br label %end + +end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + +define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i64_to_v64i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr30 +; 
GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 +; GCN-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 +; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GCN-NEXT: .LBB34_2: ; %Flow +; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v21, v16, v15, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v20, v14, v13, 16 +; GCN-NEXT: v_alignbit_b32 v28, v14, v13, 8 +; GCN-NEXT: v_alignbit_b32 v23, v12, v11, 24 +; GCN-NEXT: v_alignbit_b32 v25, v12, v11, 16 +; GCN-NEXT: v_alignbit_b32 v34, v12, v11, 8 +; GCN-NEXT: v_alignbit_b32 v30, v10, v9, 24 +; GCN-NEXT: v_alignbit_b32 v31, v10, v9, 16 +; GCN-NEXT: v_alignbit_b32 v48, v10, v9, 8 +; GCN-NEXT: v_alignbit_b32 v36, v8, v7, 24 +; GCN-NEXT: v_alignbit_b32 v37, v8, v7, 16 +; GCN-NEXT: v_alignbit_b32 v54, v8, v7, 8 +; GCN-NEXT: v_alignbit_b32 v49, v6, v5, 24 +; GCN-NEXT: v_alignbit_b32 v50, v6, v5, 16 +; GCN-NEXT: v_alignbit_b32 v46, v6, v5, 8 +; GCN-NEXT: v_alignbit_b32 v40, v4, v3, 24 +; GCN-NEXT: v_alignbit_b32 v41, v4, v3, 16 +; GCN-NEXT: v_alignbit_b32 v58, v4, v3, 8 +; GCN-NEXT: v_alignbit_b32 v44, v2, v1, 24 +; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 16 +; GCN-NEXT: v_alignbit_b32 v60, v2, v1, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 8, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 8, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 24, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 8, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v47, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v42, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v56, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v57, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v61, 24, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GCN-NEXT: .LBB34_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v60 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_or_b32_e32 v60, v1, v18 +; GCN-NEXT: 
v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; GCN-NEXT: v_or_b32_e32 v17, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v58 +; GCN-NEXT: v_or_b32_e32 v58, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v63 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_or_b32_e32 v63, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v46 +; GCN-NEXT: v_or_b32_e32 v46, v1, v2 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v44 +; GCN-NEXT: v_and_b32_e32 v45, 0xff, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v61, 24, v61 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v59 +; GCN-NEXT: v_or_b32_e32 v44, v1, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: v_and_b32_e32 v41, 0xff, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v54 +; GCN-NEXT: v_or_b32_e32 v7, v2, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v57, 0xff, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 24, v56 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v47 +; GCN-NEXT: v_or_b32_e32 v54, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v49 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v48 +; GCN-NEXT: v_or_b32_e32 v9, v4, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v42 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v55 +; GCN-NEXT: v_or_b32_e32 v48, v5, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v34 +; GCN-NEXT: v_or_b32_e32 v34, v6, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v52 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v51 +; GCN-NEXT: v_or_b32_e32 v11, v8, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 28, v0 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v28 +; GCN-NEXT: v_or_b32_e32 v13, v10, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v38 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v35 +; GCN-NEXT: v_or_b32_e32 v28, v12, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 36, v0 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GCN-NEXT: v_or_b32_e32 v21, v14, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v32 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v29 +; GCN-NEXT: v_or_b32_e32 v29, v15, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 44, v0 +; GCN-NEXT: v_and_b32_e32 v52, 0xff, v20 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v16 
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GCN-NEXT: v_or_b32_e32 v32, v19, v16 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v45 +; GCN-NEXT: v_or_b32_e32 v27, v61, v18 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v45, 0xff, v19 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v41 +; GCN-NEXT: v_or_b32_e32 v40, v40, v19 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; GCN-NEXT: v_or_b32_e32 v56, v56, v22 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 60, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_and_b32_e32 v57, 0xffff, v60 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v58 +; GCN-NEXT: v_and_b32_e32 v59, 0xffff, v63 +; GCN-NEXT: v_and_b32_e32 v46, 0xffff, v46 +; GCN-NEXT: v_or_b32_e32 v49, v49, v47 +; GCN-NEXT: v_and_b32_e32 v44, 0xffff, v44 +; GCN-NEXT: v_or_b32_e32 v50, v50, v43 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_or_b32_e32 v36, v36, v55 +; GCN-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GCN-NEXT: v_or_b32_e32 v37, v37, v53 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_or_b32_e32 v30, v30, v51 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff, v48 +; GCN-NEXT: v_or_b32_e32 v31, v31, v39 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GCN-NEXT: v_or_b32_e32 v23, v23, v35 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v25, v25, v38 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v20, v20, v52 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GCN-NEXT: v_or_b32_e32 v26, v26, v42 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_or_b32_e32 v33, v33, v45 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v24, v41, v24 +; GCN-NEXT: v_or_b32_e32 v32, v57, v32 +; GCN-NEXT: v_or_b32_e32 v17, v17, v27 +; GCN-NEXT: v_or_b32_e32 v27, v58, v40 +; GCN-NEXT: v_or_b32_e32 v35, v59, v56 +; GCN-NEXT: v_or_b32_e32 v38, v46, v49 +; GCN-NEXT: v_or_b32_e32 v39, v44, v50 +; GCN-NEXT: v_or_b32_e32 v7, v7, v36 +; GCN-NEXT: v_or_b32_e32 v36, v54, v37 +; GCN-NEXT: v_or_b32_e32 v9, v9, v30 +; GCN-NEXT: v_or_b32_e32 v30, v48, v31 +; GCN-NEXT: v_or_b32_e32 v23, v34, v23 +; GCN-NEXT: v_or_b32_e32 v11, v11, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v20 +; GCN-NEXT: v_or_b32_e32 v20, v28, v26 +; GCN-NEXT: v_or_b32_e32 v21, v21, v33 +; GCN-NEXT: v_or_b32_e32 v24, v29, v24 +; GCN-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword 
v35, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v38, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v39, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v36, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i64_to_v64i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, 
off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; VI-NEXT: 
v_lshrrev_b32_e32 v41, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: .LBB34_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB34_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v16, vcc +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; 
VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: .LBB34_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, 
v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i64_to_v64i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; kill: killed $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], 
s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v26, v23 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: 
v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: .LBB34_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB34_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 3, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 3, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 3, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 3, v9 +; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v10, vcc +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 3, v11 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v12, vcc +; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, 3, v13 +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v14, vcc +; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, 3, v15 +; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v16, vcc +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB34_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte 
Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i64_to_v64i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, 
v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB34_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, 0, v2, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v3, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v4, vcc_lo +; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, v5, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, v9, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, vcc_lo +; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v11, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, vcc_lo +; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v13, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v14, vcc_lo +; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v15, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v16, vcc_lo +; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v7, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB34_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_and_b32_e32 v96, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v25 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v87 +; GFX11-NEXT: v_or_b32_e32 v24, v96, v24 +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v86, v85 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v83 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v24 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v80, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v71 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v70 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v84 +; GFX11-NEXT: v_or_b32_e32 v23, v83, v23 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v80 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v71 +; GFX11-NEXT: v_or_b32_e32 v22, v70, v22 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v23 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v25 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-NEXT: 
v_or_b32_e32 v21, v23, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v22, v25, v54 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v52 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v50 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v24 +; GFX11-NEXT: v_or_b32_e32 v23, v25, v49 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v48 +; GFX11-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v22 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v35 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v69 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v68 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v19, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v22 +; GFX11-NEXT: v_or_b32_e32 v18, v23, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v30 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v26 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v69 +; GFX11-NEXT: v_or_b32_e32 v67, v68, v67 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v22 +; GFX11-NEXT: v_or_b32_e32 v17, v23, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v67 +; 
GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v21 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i64> %a, splat (i64 3) + %a2 = bitcast <8 x i64> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x i64> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + +define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i8_to_v8i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v34, v12 +; GCN-NEXT: v_mov_b32_e32 v37, v10 +; GCN-NEXT: v_mov_b32_e32 v33, v8 +; GCN-NEXT: v_mov_b32_e32 v36, v6 +; GCN-NEXT: v_mov_b32_e32 v32, v4 +; GCN-NEXT: v_mov_b32_e32 v35, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GCN-NEXT: 
buffer_load_dword v12, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_or_b32_e32 v0, v0, v42 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_or_b32_e32 v1, v1, v41 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v40 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_or_b32_e32 v3, v3, v55 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; GCN-NEXT: v_or_b32_e32 v17, v33, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword 
v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GCN-NEXT: v_or_b32_e32 v26, v28, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_or_b32_e32 v27, v35, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_or_b32_e32 v29, v36, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 +; GCN-NEXT: v_or_b32_e32 v31, v38, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v9, v9, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v15, v15, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v18, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v25, v59, v25 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v30, v57, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v32, v58, v8 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v8, v10 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v8, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v8, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v8, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v19, v46, v19 +; GCN-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN-NEXT: v_or_b32_e32 v1, v1, v6 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v4, v17, v21 +; GCN-NEXT: v_or_b32_e32 v5, v20, v22 +; GCN-NEXT: v_or_b32_e32 v6, v23, v24 +; GCN-NEXT: v_or_b32_e32 v7, v26, v28 +; GCN-NEXT: v_or_b32_e32 v8, v27, v25 +; GCN-NEXT: v_or_b32_e32 v9, v29, v30 +; GCN-NEXT: v_or_b32_e32 v10, v31, v32 +; GCN-NEXT: v_or_b32_e32 v11, v33, v34 +; GCN-NEXT: v_or_b32_e32 v12, v35, v12 +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v16 +; GCN-NEXT: v_or_b32_e32 v15, v18, v19 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 
+; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; kill: killed $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: .LBB35_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v42, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v41, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v40, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v55, v3 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v34, 
0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GCN-NEXT: v_or_b32_e32 v6, v17, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v29, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_or_b32_e32 v15, v27, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GCN-NEXT: v_or_b32_e32 v17, v44, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; GCN-NEXT: v_or_b32_e32 v20, v45, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GCN-NEXT: v_or_b32_e32 v23, v56, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; GCN-NEXT: v_or_b32_e32 v29, v43, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 v19, v47, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v31, v36 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; GCN-NEXT: v_or_b32_e32 v16, v59, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; GCN-NEXT: v_or_b32_e32 v18, v57, v18 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; GCN-NEXT: v_or_b32_e32 v22, v58, v22 +; GCN-NEXT: v_add_i32_e32 
v23, vcc, s7, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 +; GCN-NEXT: v_or_b32_e32 v30, v46, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_or_b32_e32 v0, v31, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v10, v9 +; GCN-NEXT: v_or_b32_e32 v6, v12, v11 +; GCN-NEXT: v_or_b32_e32 v7, v14, v13 +; GCN-NEXT: v_or_b32_e32 v8, v16, v15 +; GCN-NEXT: v_or_b32_e32 v9, v18, v17 +; GCN-NEXT: v_or_b32_e32 v10, v22, v20 +; GCN-NEXT: v_or_b32_e32 v11, v24, v23 +; GCN-NEXT: v_or_b32_e32 v12, v26, v25 +; GCN-NEXT: v_or_b32_e32 v13, v28, v27 +; GCN-NEXT: v_or_b32_e32 v14, v21, v29 +; GCN-NEXT: v_or_b32_e32 v15, v30, v19 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; GCN-NEXT: .LBB35_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 
offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i8_to_v8i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], 
s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: .LBB35_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB35_4 +; 
VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v15, 0x300 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v9, 3, v51 +; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v10, 3, v24 +; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v11, 3, v30 +; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v12, 3, v60 +; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v42 +; VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], 
s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v52 +; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v50 +; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 +; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v63 +; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v48 +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v44 +; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v19 +; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: .LBB35_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i8_to_v8i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 
offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v19 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 
offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; 
GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: .LBB35_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB35_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX9-NEXT: .LBB35_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i8_to_v8i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v54, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v55, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 
v117, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v112, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v87, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v96, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v97, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v0 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(25) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: s_waitcnt vmcnt(21) +; GFX11-NEXT: v_lshlrev_b16 v98, 8, v14 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-NEXT: s_waitcnt vmcnt(19) +; GFX11-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-NEXT: s_waitcnt vmcnt(17) +; GFX11-NEXT: v_lshlrev_b16 v83, 8, v83 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v128 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v129 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v118 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v117 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v113 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v116 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v101 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v102 +; GFX11-NEXT: v_or_b32_e32 v9, 
v9, v103 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v112 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v11 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v68 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v87 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v96 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v97 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v98 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v99 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v81 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v82 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v85 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v51 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v65 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v66 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v67 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v17, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v18, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v19, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v20, v24, v25 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; 
GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: .LBB35_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v5, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v37, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v118, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v119, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: 
v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v2, v117, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v114, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v4, v115, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v116, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v113, v6 +; GFX11-NEXT: v_add_nc_u16 v7, v16, 3 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v6, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v101, v7 +; GFX11-NEXT: v_or_b32_e32 v6, v100, v6 +; GFX11-NEXT: v_or_b32_e32 v8, v102, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v103, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v112, v10 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_add_nc_u16 v6, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v10, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v11, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v71, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v68, 3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v87, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v96, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v97, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v98, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v99, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v81, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v82, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v83, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v84, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v85, v15 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 
0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_add_nc_u16 v11, v64, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v55, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v54, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v53, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v52, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v51, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v50, 3 +; GFX11-NEXT: v_add_nc_u16 v20, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v48, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v39, 3 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_or_b32_e32 v11, v27, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v29, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v65, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v66, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v67, v15 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v25, v24 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <8 x i64> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <8 x i64> + br label %end + 
+end: + %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i64> %phi +} + +define <32 x i16> @bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f64_to_v32i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v55, v15 +; GCN-NEXT: v_mov_b32_e32 v54, v14 +; GCN-NEXT: v_mov_b32_e32 v53, v13 +; GCN-NEXT: v_mov_b32_e32 v52, v12 +; GCN-NEXT: v_mov_b32_e32 v51, v11 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v49, v9 +; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: v_mov_b32_e32 v38, v7 +; GCN-NEXT: v_mov_b32_e32 v37, v6 +; GCN-NEXT: v_mov_b32_e32 v36, v5 +; GCN-NEXT: v_mov_b32_e32 v35, v4 +; GCN-NEXT: v_mov_b32_e32 v34, v3 +; GCN-NEXT: v_mov_b32_e32 v33, v2 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 +; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 +; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 +; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 +; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16 +; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16 +; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16 +; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: .LBB36_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0 +; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0 +; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0 +; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; GCN-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 +; GCN-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 +; GCN-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 +; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16 +; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16 +; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16 +; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16 +; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16 +; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16 +; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16 +; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: .LBB36_4: ; %end +; GCN-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v33 +; GCN-NEXT: v_mov_b32_e32 v6, v34 +; GCN-NEXT: v_mov_b32_e32 v8, v35 +; GCN-NEXT: v_mov_b32_e32 v10, v36 +; GCN-NEXT: v_mov_b32_e32 v12, v37 +; GCN-NEXT: v_mov_b32_e32 v14, v38 +; GCN-NEXT: v_mov_b32_e32 v16, v48 +; GCN-NEXT: v_mov_b32_e32 v18, v49 +; GCN-NEXT: v_mov_b32_e32 v20, v50 +; GCN-NEXT: v_mov_b32_e32 v22, v51 +; GCN-NEXT: v_mov_b32_e32 v24, v52 +; GCN-NEXT: v_mov_b32_e32 v26, v53 +; GCN-NEXT: v_mov_b32_e32 v28, v54 +; GCN-NEXT: v_mov_b32_e32 v30, v55 +; GCN-NEXT: v_mov_b32_e32 v1, v32 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f64_to_v32i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB36_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB36_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f64_to_v32i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB36_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB36_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f64_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB36_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB36_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} 
+ +define <8 x double> @bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i16_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v37, v12 +; GCN-NEXT: v_mov_b32_e32 v36, v10 +; GCN-NEXT: v_mov_b32_e32 v35, v8 +; GCN-NEXT: v_mov_b32_e32 v34, v6 +; GCN-NEXT: v_mov_b32_e32 v33, v4 +; GCN-NEXT: v_mov_b32_e32 v32, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_4 +; GCN-NEXT: .LBB37_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB37_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; GCN-NEXT: v_or_b32_e32 v0, v0, v54 +; GCN-NEXT: v_or_b32_e32 v1, v1, v55 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v2, v2, v39 +; GCN-NEXT: v_or_b32_e32 v3, v3, v48 +; GCN-NEXT: v_or_b32_e32 v4, v4, v49 +; GCN-NEXT: v_or_b32_e32 v5, v5, v50 +; GCN-NEXT: v_or_b32_e32 v6, v6, v51 +; GCN-NEXT: v_or_b32_e32 v7, v7, v52 +; GCN-NEXT: v_or_b32_e32 v8, v8, v17 +; GCN-NEXT: v_or_b32_e32 v9, v9, v19 +; GCN-NEXT: v_or_b32_e32 v10, v10, v21 +; GCN-NEXT: v_or_b32_e32 v11, v11, v23 +; GCN-NEXT: v_or_b32_e32 v12, v12, v25 +; GCN-NEXT: v_or_b32_e32 v13, v13, v27 +; GCN-NEXT: v_or_b32_e32 v14, v14, v29 +; GCN-NEXT: v_or_b32_e32 v15, v15, v53 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; 
GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB37_2 +; GCN-NEXT: .LBB37_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v0, v54, v0 +; GCN-NEXT: v_or_b32_e32 v1, v55, v1 +; GCN-NEXT: v_or_b32_e32 v2, v39, v2 +; GCN-NEXT: v_or_b32_e32 v3, v48, v3 +; GCN-NEXT: v_or_b32_e32 v4, v49, v4 +; GCN-NEXT: v_or_b32_e32 v5, v50, v5 +; GCN-NEXT: v_or_b32_e32 v6, v51, v6 +; GCN-NEXT: v_or_b32_e32 v7, v52, v7 +; GCN-NEXT: v_or_b32_e32 v8, v17, v8 +; GCN-NEXT: v_or_b32_e32 v9, v19, v9 +; GCN-NEXT: v_or_b32_e32 v10, v21, v10 +; GCN-NEXT: v_or_b32_e32 v11, v23, v11 +; GCN-NEXT: v_or_b32_e32 v12, v25, v12 +; GCN-NEXT: v_or_b32_e32 v13, v27, v13 +; GCN-NEXT: v_or_b32_e32 v14, v29, v14 +; GCN-NEXT: v_or_b32_e32 v15, v53, v15 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: 
v_add_i32_e32 v13, vcc, 0x30000, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i16_to_v8f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB37_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 3 +; VI-NEXT: v_add_u16_e32 v16, 3, v15 +; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: v_add_u16_e32 v16, 3, v14 +; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v16, v14 +; VI-NEXT: v_add_u16_e32 v16, 3, v13 +; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_add_u16_e32 v16, 3, v12 +; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; VI-NEXT: v_add_u16_e32 v16, 3, v11 +; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v16, v11 +; VI-NEXT: v_add_u16_e32 v16, 3, v10 +; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v16, v10 +; VI-NEXT: v_add_u16_e32 v16, 3, v9 +; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v16, v9 +; VI-NEXT: v_add_u16_e32 v16, 3, v8 +; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v16, v8 +; VI-NEXT: v_add_u16_e32 v16, 3, v7 +; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v16, v7 +; VI-NEXT: v_add_u16_e32 v16, 3, v6 +; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v5 +; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v16, v5 +; VI-NEXT: v_add_u16_e32 v16, 3, v4 +; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v16, v4 +; VI-NEXT: v_add_u16_e32 v16, 3, v3 +; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v16, v3 +; VI-NEXT: v_add_u16_e32 v16, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v16, v2 +; VI-NEXT: v_add_u16_e32 v16, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v16, v1 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB37_2: ; %end 
+; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32i16_to_v8f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB37_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB37_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i16_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB37_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: .LBB37_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <32 x i16> %a, splat (i16 3) + %a2 = bitcast <32 x i16> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x i16> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + +define <32 x half> @bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f64_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; 
GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: .LBB38_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55 +; GCN-NEXT: .LBB38_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v39 +; GCN-NEXT: v_mov_b32_e32 v1, v55 +; GCN-NEXT: v_mov_b32_e32 v2, v32 +; GCN-NEXT: v_mov_b32_e32 v3, v54 +; GCN-NEXT: v_mov_b32_e32 v4, v33 +; GCN-NEXT: v_mov_b32_e32 v5, v53 +; GCN-NEXT: v_mov_b32_e32 v6, v34 +; GCN-NEXT: v_mov_b32_e32 v7, v52 +; GCN-NEXT: v_mov_b32_e32 v8, v35 +; GCN-NEXT: v_mov_b32_e32 v9, v51 +; GCN-NEXT: v_mov_b32_e32 v10, v36 +; GCN-NEXT: v_mov_b32_e32 v11, v50 +; GCN-NEXT: v_mov_b32_e32 v12, v37 +; GCN-NEXT: v_mov_b32_e32 v13, v49 +; GCN-NEXT: v_mov_b32_e32 v14, v38 +; GCN-NEXT: v_mov_b32_e32 v15, v48 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload 
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f64_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB38_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: .LBB38_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f64_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB38_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: .LBB38_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f64_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB38_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB38_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + +define <8 x double> @bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f16_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, 
off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GCN-NEXT: v_or_b32_e32 v0, v44, v0 +; GCN-NEXT: v_or_b32_e32 v1, v42, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v52, v2 +; GCN-NEXT: v_or_b32_e32 v3, v50, v3 +; GCN-NEXT: v_or_b32_e32 v4, v48, v4 +; GCN-NEXT: v_or_b32_e32 v5, v38, v5 +; GCN-NEXT: v_or_b32_e32 v6, v36, v6 +; GCN-NEXT: v_or_b32_e32 v7, v34, v7 +; GCN-NEXT: v_or_b32_e32 v8, v33, v8 +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: v_or_b32_e32 v10, v31, v10 +; GCN-NEXT: v_or_b32_e32 v11, v21, v11 +; GCN-NEXT: v_or_b32_e32 v12, v19, v12 +; GCN-NEXT: v_or_b32_e32 v13, v18, v13 +; GCN-NEXT: v_or_b32_e32 v14, v17, v14 +; GCN-NEXT: v_or_b32_e32 v15, v16, v15 +; GCN-NEXT: ; implicit-def: 
$vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB39_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: 
v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v9, v8 +; GCN-NEXT: v_or_b32_e32 v6, v11, v10 +; GCN-NEXT: v_or_b32_e32 v7, v13, v12 +; GCN-NEXT: v_or_b32_e32 v8, v15, v14 +; GCN-NEXT: v_or_b32_e32 v9, v26, v24 +; GCN-NEXT: v_or_b32_e32 v10, v28, v27 +; GCN-NEXT: v_or_b32_e32 v11, v21, v29 +; GCN-NEXT: v_or_b32_e32 v12, v19, v25 +; GCN-NEXT: v_or_b32_e32 v13, v18, v23 +; GCN-NEXT: v_or_b32_e32 v14, v17, v22 +; GCN-NEXT: v_or_b32_e32 v15, v16, v20 +; GCN-NEXT: .LBB39_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f16_to_v8f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB39_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 0x200 +; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 +; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v17 +; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v17 +; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v17 +; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v17 +; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v17 +; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v17 +; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v17 +; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v17 +; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v17 +; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v17 +; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v17 +; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v17 +; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: 
v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v17 +; VI-NEXT: v_or_b32_e32 v0, v0, v16 +; VI-NEXT: .LBB39_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v8f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB39_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB39_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f16_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB39_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: .LBB39_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + +define 
<32 x bfloat> @bitcast_v8f64_to_v32bf16(<8 x double> %a, i32 %b) { +; GCN-LABEL: bitcast_v8f64_to_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v5 +; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2 +; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v1 +; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: .LBB40_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0 +; 
GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v15
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v13
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v12
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v11
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v10
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v9
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GCN-NEXT: v_and_b32_e32 v34, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v6
+; GCN-NEXT: v_and_b32_e32 v36, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v5
+; GCN-NEXT: v_and_b32_e32 v38, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v4
+; GCN-NEXT: v_and_b32_e32 v48, 0xffff0000, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3
+; GCN-NEXT: v_and_b32_e32 v50, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2
+; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v1
+; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v0
+; GCN-NEXT: .LBB40_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v55
+; GCN-NEXT: v_mov_b32_e32 v1, v54
+; GCN-NEXT: v_mov_b32_e32 v2, v53
+; GCN-NEXT: v_mov_b32_e32 v3, v52
+; GCN-NEXT: v_mov_b32_e32 v4, v51
+; GCN-NEXT: v_mov_b32_e32 v5, v50
+; GCN-NEXT: v_mov_b32_e32 v6, v49
+; GCN-NEXT: v_mov_b32_e32 v7, v48
+; GCN-NEXT: v_mov_b32_e32 v8, v39
+; GCN-NEXT: v_mov_b32_e32 v9, v38
+; GCN-NEXT: v_mov_b32_e32 v10, v37
+; GCN-NEXT: v_mov_b32_e32 v11, v36
+; GCN-NEXT: v_mov_b32_e32 v12, v35
+; GCN-NEXT: v_mov_b32_e32 v13, v34
+; GCN-NEXT: v_mov_b32_e32 v14, v33
+; GCN-NEXT: v_mov_b32_e32 v15, v32
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8f64_to_v32bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB40_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: .LBB40_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8f64_to_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB40_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: .LBB40_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8f64_to_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB40_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
+; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
+; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
+; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
+; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB40_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
+ %a2 = bitcast <8 x double> %a1 to <32 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <8 x double> %a to <32 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x bfloat> %phi
+}
+
+define <8 x double> @bitcast_v32bf16_to_v8f64(<32 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32bf16_to_v8f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v28
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v46
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v30
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB41_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v42
+; GCN-NEXT: v_alignbit_b32 v0, v0, v45, 16
+; GCN-NEXT: v_alignbit_b32 v1, v1, v43, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v55
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v20
+; GCN-NEXT: v_alignbit_b32 v2, v2, v51, 16
+; GCN-NEXT: v_alignbit_b32 v3, v3, v49, 16
+; GCN-NEXT: v_alignbit_b32 v4, v4, v39, 16
+; GCN-NEXT: v_alignbit_b32 v5, v5, v37, 16
+; GCN-NEXT: v_alignbit_b32 v6, v6, v36, 16
+; GCN-NEXT: v_alignbit_b32 v7, v7, v34, 16
+; GCN-NEXT: v_alignbit_b32 v8, v8, v33, 16
+; GCN-NEXT: v_alignbit_b32 v9, v9, v32, 16
+; GCN-NEXT: v_alignbit_b32 v10, v10, v31, 16
+; GCN-NEXT: v_alignbit_b32 v11, v11, v21, 16
+; GCN-NEXT: v_alignbit_b32 v12, v12, v19, 16
+; GCN-NEXT: v_alignbit_b32 v13, v13, v18, 16
+; GCN-NEXT: v_alignbit_b32 v14, v14, v17, 16
+; GCN-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: .LBB41_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB41_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v45
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v44
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v43
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v42
+; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v51
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v41
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v49
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v40
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v39
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v55
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v37
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v54
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v36
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v53
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v34
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v52
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v50
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v32
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v48
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v31
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v38
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v35
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GCN-NEXT: v_alignbit_b32 v3, v5, v4, 16
+; GCN-NEXT: v_alignbit_b32 v4, v7, v6, 16
+; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v6, v11, v10, 16
+; GCN-NEXT: v_alignbit_b32 v7, v13, v12, 16
+; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16
+; GCN-NEXT: v_alignbit_b32 v9, v26, v24, 16
+; GCN-NEXT: v_alignbit_b32 v10, v28, v27, 16
+; GCN-NEXT: v_alignbit_b32 v11, v29, v21, 16
+; GCN-NEXT: v_alignbit_b32 v12, v25, v19, 16
+; GCN-NEXT: v_alignbit_b32 v13, v23, v18, 16
+; GCN-NEXT: v_alignbit_b32 v14, v22, v17, 16
+; GCN-NEXT: v_alignbit_b32 v15, v20, v16, 16
+; GCN-NEXT: .LBB41_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32bf16_to_v8f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB41_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v15, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v15
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_alignbit_b32 v15, v15, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v14
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v14
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; VI-NEXT: v_alignbit_b32 v14, v14, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v13
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v13
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_alignbit_b32 v13, v13, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v12
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v12, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v12
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; VI-NEXT: v_alignbit_b32 v12, v12, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v11
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v11
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; VI-NEXT: v_alignbit_b32 v11, v11, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v10
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v10, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v10
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_alignbit_b32 v10, v10, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v9
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v9
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; VI-NEXT: v_alignbit_b32 v9, v9, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v8
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_alignbit_b32 v8, v8, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v7
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_alignbit_b32 v7, v7, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v6, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v6
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_alignbit_b32 v6, v6, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v5
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT: v_alignbit_b32 v5, v5, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v4
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_alignbit_b32 v4, v4, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v3
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_alignbit_b32 v3, v3, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v2
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_alignbit_b32 v2, v2, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v1
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v1
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_alignbit_b32 v1, v1, v16, 16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: .LBB41_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32bf16_to_v8f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB41_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
+; GFX9-NEXT: v_perm_b32 v15, v15, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v14
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v14, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v14, v14, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v13, v13, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v12
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v12, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v12, v12, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v11
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v11, v11, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v10
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v10, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v10, v10, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v9
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v9, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v9, v9, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v8
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v8, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v8, v8, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v7, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v7, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v7, v7, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v6, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v6, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v6, v6, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v5
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v5, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v5, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v5, v5, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v4
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v4, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v4, v4, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v3, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v3, v3, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v2, v2, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v1, v1, v16, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
+; GFX9-NEXT: v_perm_b32 v0, v0, v16, s7
+; GFX9-NEXT: .LBB41_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32bf16_to_v8f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB41_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16
+; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-NEXT: v_bfe_u32 v18, v16, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfe_u32 v23, v14, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
+; GFX11-NEXT: v_add3_u32 v21, v21, v17, 0x7fff
+; GFX11-NEXT: v_add3_u32 v18, v18, v16, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc_lo
+; GFX11-NEXT: v_add3_u32 v19, v23, v14, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-NEXT: v_bfe_u32 v20, v15, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v15
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v20, v20, v15, 0x7fff
+; GFX11-NEXT: v_dual_cndmask_b32 v15, v20, v22 :: v_dual_lshlrev_b32 v20, 16, v13
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x7060302
+; GFX11-NEXT: v_dual_cndmask_b32 v17, v21, v18 :: v_dual_add_f32 v18, 0x40c00000, v20
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v14
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v14, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_perm_b32 v14, v14, v17, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_cndmask_b32 v16, v16, v20
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: v_add3_u32 v17, v17, v13, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v21, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_perm_b32 v13, v13, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v21
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-NEXT: v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v11
+; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX11-NEXT: v_bfe_u32 v18, v12, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v18, v18, v12, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v18, v22, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
+; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v12, v12, v17, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_add3_u32 v17, v17, v11, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-NEXT: v_bfe_u32 v19, v10, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v19, v19, v10, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v17, v21, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v10
+; GFX11-NEXT: v_perm_b32 v11, v11, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: v_dual_cndmask_b32 v10, v19, v22 :: v_dual_lshlrev_b32 v21, 16, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v19, 16, v8
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_perm_b32 v10, v10, v17, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v16, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_add_f32 v8, 0x40c00000, v8
+; GFX11-NEXT: v_add3_u32 v16, v16, v18, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-NEXT: v_bfe_u32 v18, v8, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v9, 0x40c00000, v9
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-NEXT: v_add3_u32 v18, v18, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v9
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: v_add3_u32 v17, v17, v9, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v17, v21, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v22, v19, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v8
+; GFX11-NEXT: v_perm_b32 v9, v9, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v17, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v18, v22, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v8, v8, v17, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v19, 0x40c00000, v21
+; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v16, v19, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v19
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_add3_u32 v16, v16, v19, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_bfe_u32 v19, v6, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v16, v16, v20 :: v_dual_add_f32 v7, 0x40c00000, v7
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX11-NEXT: v_add3_u32 v19, v19, v6, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v17, v7, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v7
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v17, v17, v7, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v17, v21, vcc_lo
+; GFX11-NEXT: v_add3_u32 v17, v22, v18, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v6
+; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_cndmask_b32 v17, v17, v20
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-NEXT: v_dual_cndmask_b32 v6, v19, v22 :: v_dual_lshlrev_b32 v19, 16, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v17, 0x40c00000, v19
+; GFX11-NEXT: v_add3_u32 v19, v20, v18, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-NEXT: v_bfe_u32 v22, v17, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v16, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v16, v16, v5, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v16, v21, vcc_lo
+; GFX11-NEXT: v_add3_u32 v16, v22, v17, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v18, v4, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v19, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v18, v18, v4, 0x7fff
+; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v20 :: v_dual_lshlrev_b32 v20, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v18, 0x40c00000, v20
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_perm_b32 v4, v4, v16, 0x7060302
+; GFX11-NEXT: v_add3_u32 v19, v21, v17, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v3, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc_lo
+; GFX11-NEXT: v_add3_u32 v19, v21, v3, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v21, v22, v18, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v1
+; GFX11-NEXT: v_bfe_u32 v24, v2, 16, 1
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v22
+; GFX11-NEXT: v_add3_u32 v20, v24, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v3, v3, v17, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v23, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v2
+; GFX11-NEXT: v_bfe_u32 v22, v19, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v20, v21 :: v_dual_lshlrev_b32 v23, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v21, v22, v19, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_add_f32_e32 v20, 0x40c00000, v23
+; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v19, v21, v22 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-NEXT: v_bfe_u32 v24, v20, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v20
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v21, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v24, v24, v20, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v21, v21, v0, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v23, v1, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_add3_u32 v22, v23, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v22, v23, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-NEXT: v_perm_b32 v1, v1, v19, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v24, v25, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v21, v26, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v0, v20, 0x7060302
+; GFX11-NEXT: .LBB41_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0)
+ %a2 = bitcast <32 x bfloat> %a1 to <8 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x bfloat> %a to <8 x double>
+ br label %end
+
+end:
+ %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x double> %phi
+}
+
+define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8f64_to_v64i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; kill: killed $vgpr18
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB42_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 8
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 8
+; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 24
+; GCN-NEXT: v_alignbit_b32 v21, v12, v11, 16
+; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 8
+; GCN-NEXT: v_alignbit_b32 v27, v10, v9, 24
+; GCN-NEXT: v_alignbit_b32 v28, v10, v9, 16
+; GCN-NEXT: v_alignbit_b32 v34, v10, v9, 8
+; GCN-NEXT: v_alignbit_b32 v32, v8, v7, 24
+; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16
+; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 8
+; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 24
+; GCN-NEXT: v_alignbit_b32 v39, v6, v5, 16
+; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 8
+; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 24
+; GCN-NEXT: v_alignbit_b32 v52, v4, v3, 16
+; GCN-NEXT: v_alignbit_b32 v42, v4, v3, 8
+; GCN-NEXT: v_alignbit_b32 v54, v2, v1, 24
+; GCN-NEXT: v_alignbit_b32 v55, v2, v1, 16
+; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 24, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 24, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 24, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v60, 24, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GCN-NEXT: .LBB42_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB42_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[1:2], v[1:2], 1.0
+; GCN-NEXT: v_add_f64 v[3:4], v[3:4], 1.0
+; GCN-NEXT: v_add_f64 v[5:6], v[5:6], 1.0
+; GCN-NEXT: v_add_f64 v[7:8], v[7:8], 1.0
+; GCN-NEXT: v_add_f64 v[9:10], v[9:10], 1.0
+; GCN-NEXT: v_add_f64 v[11:12], v[11:12], 1.0
+; GCN-NEXT: v_add_f64 v[13:14], v[13:14], 1.0
+; GCN-NEXT: v_add_f64 v[15:16], v[15:16], 1.0
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v16, v15, 8
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 24
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v17, v14, v13, 16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v24, v14, v13, 8
+; GCN-NEXT: v_alignbit_b32 v22, v12, v11, 24
+; GCN-NEXT: v_alignbit_b32 v21, v12, v11, 16
+; GCN-NEXT: v_alignbit_b32 v30, v12, v11, 8
+; GCN-NEXT: v_alignbit_b32 v27, v10, v9, 24
+; GCN-NEXT: v_alignbit_b32 v28, v10, v9, 16
+; GCN-NEXT: v_alignbit_b32 v34, v10, v9, 8
+; GCN-NEXT: v_alignbit_b32 v32, v8, v7, 24
+; GCN-NEXT: v_alignbit_b32 v33, v8, v7, 16
+; GCN-NEXT: v_alignbit_b32 v48, v8, v7, 8
+; GCN-NEXT: v_alignbit_b32 v37, v6, v5, 24
+; GCN-NEXT: v_alignbit_b32 v39, v6, v5, 16
+; GCN-NEXT: v_alignbit_b32 v40, v6, v5, 8
+; GCN-NEXT: v_alignbit_b32 v49, v4, v3, 24
+; GCN-NEXT: v_alignbit_b32 v52, v4, v3, 16
+; GCN-NEXT: v_alignbit_b32 v42, v4, v3, 8
+; GCN-NEXT: v_alignbit_b32 v54, v2, v1, 24
+; GCN-NEXT: v_alignbit_b32 v55, v2, v1, 16
+; GCN-NEXT: v_alignbit_b32 v45, v2, v1, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 24, v16
+; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v35, 8, v16
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 8, v14
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 24, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v51, 24, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v56, 8, v10
+; GCN-NEXT: v_lshrrev_b32_e32 v41, 24, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v59, 8, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v46, 24, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v61, 8, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v60, 24, v2
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v2
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 8, v2
+; GCN-NEXT: .LBB42_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9
+; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
+; GCN-NEXT: v_and_b32_e32 v19, 0xff, v14
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v15
+; GCN-NEXT: v_and_b32_e32 v23, 0xff, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v45
+; GCN-NEXT: v_or_b32_e32 v45, v1, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17
+; GCN-NEXT: v_or_b32_e32 v18, v2, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v42
+; GCN-NEXT: v_or_b32_e32 v42, v3, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v63
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v63, v4, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v40
+; GCN-NEXT: v_or_b32_e32 v40, v5, v1
+; GCN-NEXT: v_and_b32_e32 v55, 0xff, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 24, v54
+; GCN-NEXT: v_and_b32_e32 v62, 0xff, v62
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v60
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v61
+; GCN-NEXT: v_or_b32_e32 v14, v6, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0
+; GCN-NEXT: v_and_b32_e32 v61, 0xff, v52
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v48
+; GCN-NEXT: v_or_b32_e32 v15, v7, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0
+; GCN-NEXT: v_and_b32_e32 v58, 0xff, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v59
+; GCN-NEXT: v_or_b32_e32 v16, v8, v3
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 12, v0
+; GCN-NEXT: v_and_b32_e32 v52, 0xff, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v34
+; GCN-NEXT: v_or_b32_e32 v17, v9, v4
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; GCN-NEXT: v_and_b32_e32 v47, 0xff, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v56
+; GCN-NEXT: v_or_b32_e32 v34, v10, v5
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 20, v0
+; GCN-NEXT: v_and_b32_e32 v46, 0xff, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v30
+; GCN-NEXT: v_or_b32_e32 v30, v11, v6
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0
+; GCN-NEXT: v_and_b32_e32 v43, 0xff, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v44
+; GCN-NEXT: v_or_b32_e32 v11, v12, v7
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 28, v0
+; GCN-NEXT: v_and_b32_e32 v41, 0xff, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v24
+; GCN-NEXT: v_or_b32_e32 v13, v13, v8
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GCN-NEXT: v_and_b32_e32 v53, 0xff, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v50
+; GCN-NEXT: v_or_b32_e32 v24, v19, v9
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 36, v0
+; GCN-NEXT: v_and_b32_e32 v50, 0xff, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v22
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; GCN-NEXT: v_or_b32_e32 v21, v20, v10
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 40, v0
+; GCN-NEXT: v_and_b32_e32 v51, 0xff, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 24, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v35
+; GCN-NEXT: v_or_b32_e32 v23, v23, v12
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 44, v0
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v44, 0xff, v19
+; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v55
+; GCN-NEXT: v_or_b32_e32 v36, v54, v19
+; 
GCN-NEXT: v_add_i32_e32 v19, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v54, 0xff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v62 +; GCN-NEXT: v_or_b32_e32 v31, v60, v20 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v28 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v61 +; GCN-NEXT: v_or_b32_e32 v56, v25, v28 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v58 +; GCN-NEXT: v_or_b32_e32 v57, v57, v25 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 60, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_and_b32_e32 v45, 0xffff, v45 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v42, 0xffff, v42 +; GCN-NEXT: v_and_b32_e32 v58, 0xffff, v63 +; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v40 +; GCN-NEXT: v_or_b32_e32 v37, v37, v52 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_or_b32_e32 v39, v39, v47 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_or_b32_e32 v32, v32, v46 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v33, v33, v43 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_or_b32_e32 v27, v27, v41 +; GCN-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GCN-NEXT: v_or_b32_e32 v48, v48, v53 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GCN-NEXT: v_or_b32_e32 v49, v49, v50 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_or_b32_e32 v22, v22, v51 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_or_b32_e32 v35, v35, v44 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GCN-NEXT: v_or_b32_e32 v29, v29, v54 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GCN-NEXT: v_or_b32_e32 v38, v38, v55 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_or_b32_e32 v26, v59, v26 +; GCN-NEXT: v_or_b32_e32 v36, v45, v36 +; GCN-NEXT: v_or_b32_e32 v18, v18, v31 +; GCN-NEXT: v_or_b32_e32 v31, v42, v56 +; GCN-NEXT: v_or_b32_e32 v50, v58, v57 +; GCN-NEXT: v_or_b32_e32 v37, v40, v37 +; GCN-NEXT: v_or_b32_e32 v14, v14, v39 +; GCN-NEXT: v_or_b32_e32 v15, v15, v32 +; GCN-NEXT: v_or_b32_e32 v16, v16, v33 +; GCN-NEXT: v_or_b32_e32 v17, v17, v27 +; GCN-NEXT: v_or_b32_e32 v27, v34, v48 +; GCN-NEXT: v_or_b32_e32 v30, v30, v49 +; GCN-NEXT: v_or_b32_e32 v11, v11, v22 +; GCN-NEXT: v_or_b32_e32 v13, v13, v35 +; GCN-NEXT: v_or_b32_e32 v22, v24, v29 +; GCN-NEXT: v_or_b32_e32 v21, v21, v38 +; GCN-NEXT: v_or_b32_e32 v23, v23, v26 +; GCN-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v18, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v31, v2, s[0:3], 0 
offen +; GCN-NEXT: buffer_store_dword v50, v3, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v37, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v27, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v30, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v22, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v21, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8f64_to_v64i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; kill: killed $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] +; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16 +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 
v40, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: .LBB42_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB42_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; VI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; VI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; VI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; VI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; VI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; VI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; VI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4] +; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2] +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8] +; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3 +; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: 
.LBB42_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22 +; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28 +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24 +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55 
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8f64_to_v64i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; kill: killed $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, 
s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; kill: killed $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16 +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v26, v23 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, 
v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: .LBB42_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB42_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX9-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX9-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX9-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX9-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX9-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX9-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GFX9-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB42_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; 
GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8f64_to_v64i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; 
GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 
8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB42_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; GFX11-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; GFX11-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; GFX11-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; GFX11-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; GFX11-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; GFX11-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; GFX11-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB42_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_and_b32_e32 v96, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v25 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v87 +; GFX11-NEXT: v_or_b32_e32 v24, v96, v24 +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; 
GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v86, v85 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v83 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v24 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v80, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v71 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v70 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v84 +; GFX11-NEXT: v_or_b32_e32 v23, v83, v23 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v80 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v71 +; GFX11-NEXT: v_or_b32_e32 v22, v70, v22 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v23 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v25 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-NEXT: v_or_b32_e32 v21, v23, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v22, v25, v54 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v52 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v50 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v24 +; GFX11-NEXT: v_or_b32_e32 v23, v25, v49 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v48 +; GFX11-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v22 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 
v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v35 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v69 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v68 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v19, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v22 +; GFX11-NEXT: v_or_b32_e32 v18, v23, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v30 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v26 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v69 +; GFX11-NEXT: v_or_b32_e32 v67, v68, v67 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v22 +; GFX11-NEXT: v_or_b32_e32 v17, v23, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v21 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <8 x double> %a, splat (double 1.000000e+00) + %a2 = bitcast <8 x double> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <8 x double> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + +define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i8_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 
offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: v_mov_b32_e32 v38, v14 +; GCN-NEXT: v_mov_b32_e32 v34, v12 +; GCN-NEXT: v_mov_b32_e32 v37, v10 +; GCN-NEXT: v_mov_b32_e32 v33, v8 +; GCN-NEXT: v_mov_b32_e32 v36, v6 +; GCN-NEXT: v_mov_b32_e32 v32, v4 +; GCN-NEXT: v_mov_b32_e32 v35, v2 +; GCN-NEXT: v_mov_b32_e32 v31, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v52 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v41, 8, v5 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded 
Spill +; GCN-NEXT: v_lshlrev_b32_e32 v40, 8, v9 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v45 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v27, 8, v44 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_lshlrev_b32_e32 v59, 24, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v58, 24, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v56, 8, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v12 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v4 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v2 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 24, v46 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB43_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GCN-NEXT: v_or_b32_e32 v0, v0, v42 +; GCN-NEXT: 
v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_or_b32_e32 v1, v1, v41 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_or_b32_e32 v2, v2, v40 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GCN-NEXT: v_or_b32_e32 v3, v3, v55 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v37 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v38 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v50 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v54 +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v53 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v52 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v49 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v48 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v39 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v63 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v62 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v61 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v60 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32 +; GCN-NEXT: v_or_b32_e32 v17, v33, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v24, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GCN-NEXT: v_or_b32_e32 v26, v28, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_or_b32_e32 v27, v35, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_or_b32_e32 v29, v36, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v37 +; GCN-NEXT: v_or_b32_e32 v31, v38, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_or_b32_e32 v9, v9, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_or_b32_e32 v15, v15, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GCN-NEXT: v_or_b32_e32 v18, v18, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; 
GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_or_b32_e32 v25, v59, v25 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v30, v57, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GCN-NEXT: v_or_b32_e32 v32, v58, v8 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v9 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v8, v10 +; GCN-NEXT: v_and_b32_e32 v35, 0xffff, v11 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v8, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v8, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v8, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_or_b32_e32 v19, v46, v19 +; GCN-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN-NEXT: v_or_b32_e32 v1, v1, v6 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_or_b32_e32 v4, v17, v21 +; GCN-NEXT: v_or_b32_e32 v5, v20, v22 +; GCN-NEXT: v_or_b32_e32 v6, v23, v24 +; GCN-NEXT: v_or_b32_e32 v7, v26, v28 +; GCN-NEXT: v_or_b32_e32 v8, v27, v25 +; GCN-NEXT: v_or_b32_e32 v9, v29, v30 +; GCN-NEXT: v_or_b32_e32 v10, v31, v32 +; GCN-NEXT: v_or_b32_e32 v11, v33, v34 +; GCN-NEXT: v_or_b32_e32 v12, v35, v12 +; GCN-NEXT: v_or_b32_e32 v13, v13, v14 +; GCN-NEXT: v_or_b32_e32 v14, v15, v16 +; GCN-NEXT: v_or_b32_e32 v15, v18, v19 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr50 
+; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; kill: killed $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: .LBB43_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB43_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_or_b32_e32 v0, v42, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v41, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v2, v40, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v55, v3 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 +; GCN-NEXT: 
v_add_i32_e32 v15, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v50 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v54 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v53 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v52 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v61 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v60 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v51 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GCN-NEXT: v_or_b32_e32 v6, v17, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v11, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GCN-NEXT: v_or_b32_e32 v13, v29, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_or_b32_e32 v15, v27, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GCN-NEXT: v_or_b32_e32 v17, v44, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; GCN-NEXT: v_or_b32_e32 v20, v45, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GCN-NEXT: v_or_b32_e32 v23, v56, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; GCN-NEXT: v_or_b32_e32 v29, v43, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GCN-NEXT: v_or_b32_e32 
v19, v47, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v31, v36 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v32, v5 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s7, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v32, v7 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s7, v15 +; GCN-NEXT: v_or_b32_e32 v16, v59, v16 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s7, v17 +; GCN-NEXT: v_or_b32_e32 v18, v57, v18 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; GCN-NEXT: v_or_b32_e32 v22, v58, v22 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x300, v19 +; GCN-NEXT: v_or_b32_e32 v30, v46, v30 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_or_b32_e32 v0, 
v31, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_or_b32_e32 v4, v7, v6 +; GCN-NEXT: v_or_b32_e32 v5, v10, v9 +; GCN-NEXT: v_or_b32_e32 v6, v12, v11 +; GCN-NEXT: v_or_b32_e32 v7, v14, v13 +; GCN-NEXT: v_or_b32_e32 v8, v16, v15 +; GCN-NEXT: v_or_b32_e32 v9, v18, v17 +; GCN-NEXT: v_or_b32_e32 v10, v22, v20 +; GCN-NEXT: v_or_b32_e32 v11, v24, v23 +; GCN-NEXT: v_or_b32_e32 v12, v26, v25 +; GCN-NEXT: v_or_b32_e32 v13, v28, v27 +; GCN-NEXT: v_or_b32_e32 v14, v21, v29 +; GCN-NEXT: v_or_b32_e32 v15, v30, v19 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v14 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v15 +; GCN-NEXT: .LBB43_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i8_to_v8f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort 
v49, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v38, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v20, 8, v19 +; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v31, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v35, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v36, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v15 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; VI-NEXT: v_lshlrev_b16_e32 v61, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; VI-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 
+; VI-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v2, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v39, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: .LBB43_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB43_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v15, 0x300 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u16_e32 v9, 3, v51 +; VI-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v9, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v10, 3, v24 +; VI-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v10, v10, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v11, 3, v30 +; VI-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v11, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v12, 3, v60 +; VI-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v12, v12, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v47 +; VI-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v13, v13, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v42 +; VI-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v14, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v17, 3, v17 +; VI-NEXT: v_or_b32_sdwa v17, v57, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v3 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v4, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v5, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v6, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 3, v53 +; VI-NEXT: v_or_b32_sdwa v16, v23, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v16, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; 
VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v8, 3, v8 +; VI-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v8, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v2, 0x300, v2 +; VI-NEXT: v_add_u16_sdwa v3, v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v15, v17, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v4, 3, v4 +; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v5 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v5, 3, v5 +; VI-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 0x300, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v6, 3, v6 +; VI-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v6, 0x300, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v7 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v7, 3, v7 +; VI-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v7, 0x300, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_add_u16_e32 v8, 3, v52 +; VI-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v8, 0x300, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v9 +; VI-NEXT: v_add_u16_e32 v9, 3, v50 +; VI-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v9, 0x300, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v10 +; VI-NEXT: v_add_u16_e32 v10, 3, v49 +; VI-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v10, 0x300, v10 +; VI-NEXT: v_or_b32_e32 v10, v10, v11 +; VI-NEXT: v_add_u16_e32 v11, 3, v63 +; VI-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v11, 0x300, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v12 +; VI-NEXT: v_add_u16_e32 v12, 3, v48 +; VI-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v12, 0x300, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v13 +; VI-NEXT: v_add_u16_e32 v13, 3, v44 +; VI-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v13, 0x300, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v14 +; VI-NEXT: v_add_u16_e32 v14, 3, v39 +; VI-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v16 +; VI-NEXT: v_add_u16_e32 v16, 3, v19 +; VI-NEXT: v_or_b32_sdwa v16, v21, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v15, v16, v15 +; VI-NEXT: .LBB43_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i8_to_v8f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 
offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v62, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v17 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v20, 8, v19 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:124 +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v15 +; GFX9-NEXT: s_waitcnt vmcnt(25) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v24 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v30 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v53 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v42 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v44 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v47 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v57 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v60 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v63 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_or_b32_sdwa v9, v51, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v10, v24, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v30, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v60, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v47, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v42, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v53, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v35 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v52, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v50, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v63, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v48, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v44, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v39, v25 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v19, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: .LBB43_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB43_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_add_u16_e32 v9, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v9, v58, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v9, v9, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v10, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v10, v46, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v10, v10, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v11, 3, v30 +; GFX9-NEXT: v_or_b32_sdwa v11, v43, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
GFX9-NEXT: v_add_u16_sdwa v11, v11, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v12, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v12, v40, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v12, v12, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v13, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v13, v54, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v13, v13, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v14, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v14, v14, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v15, 3, v53 +; GFX9-NEXT: v_or_b32_sdwa v15, v23, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v15, v15, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_add_u16_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v16, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v4, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v20, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v5, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v26, v6 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v6, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v38, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v7, v7, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v16, 3, v17 +; GFX9-NEXT: v_or_b32_sdwa v16, v57, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v16, v16, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v8, 3, v8 +; GFX9-NEXT: v_or_b32_sdwa v8, v61, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_sdwa v8, v8, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v2, 3, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, 3, v3 +; GFX9-NEXT: v_or_b32_sdwa v3, v37, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v4, 3, v4 +; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v4 +; GFX9-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v5, 3, v5 +; GFX9-NEXT: v_or_b32_sdwa v5, v22, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v5 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v6, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v6, v28, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v6 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v7, 3, v7 +; GFX9-NEXT: v_or_b32_sdwa v7, v62, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v7, v8 +; GFX9-NEXT: v_add_u16_e32 v8, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v8, v59, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v8 +; GFX9-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX9-NEXT: v_add_u16_e32 v9, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v9, v56, v9 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v9 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX9-NEXT: v_add_u16_e32 v10, 3, v49 +; GFX9-NEXT: v_or_b32_sdwa v10, v45, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v10 +; GFX9-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX9-NEXT: v_add_u16_e32 v11, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v11, v41, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v11 +; GFX9-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX9-NEXT: v_add_u16_e32 v12, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v12, v55, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v12 +; GFX9-NEXT: v_or_b32_e32 v12, v12, v13 +; GFX9-NEXT: v_add_u16_e32 v13, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v13 +; GFX9-NEXT: v_or_b32_e32 v13, v13, v14 +; GFX9-NEXT: v_add_u16_e32 v14, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v14, v25, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v14 +; GFX9-NEXT: v_or_b32_e32 v14, v14, v15 +; GFX9-NEXT: v_add_u16_e32 v15, 3, v19 +; GFX9-NEXT: v_or_b32_sdwa v15, v21, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v15 +; GFX9-NEXT: v_or_b32_e32 v15, v15, v16 +; GFX9-NEXT: .LBB43_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i8_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v37, v12 +; GFX11-NEXT: v_dual_mov_b32 v36, v10 :: v_dual_mov_b32 v35, v8 +; GFX11-NEXT: v_dual_mov_b32 v34, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v32, v2 :: v_dual_mov_b32 v31, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; 
GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v50, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v51, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v52, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v53, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v54, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v55, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v117, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v112, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v87, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v96, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v97, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v0 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v4 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v6 +; GFX11-NEXT: s_waitcnt vmcnt(25) +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v8 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: s_waitcnt vmcnt(21) +; GFX11-NEXT: v_lshlrev_b16 v98, 8, v14 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_lshlrev_b16 v99, 8, v65 +; GFX11-NEXT: s_waitcnt vmcnt(19) +; GFX11-NEXT: v_lshlrev_b16 v81, 8, v66 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_lshlrev_b16 v82, 8, v67 +; GFX11-NEXT: s_waitcnt vmcnt(17) +; GFX11-NEXT: v_lshlrev_b16 v83, 8, v83 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_lshlrev_b16 v27, 8, 
v128 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v129 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_lshlrev_b16 v65, 8, v130 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: v_lshlrev_b16 v66, 8, v131 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v10 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v34 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v118 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v119 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v35 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v117 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v6, v113 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v114 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v115 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v116 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v101 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v38 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v102 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v103 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v112 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v100 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v11 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v68 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v87 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v96 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v97 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v98 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v99 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v81 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v82 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v83 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v84 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v85 +; 
GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v55 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v54 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v53 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v52 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v51 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v50 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v49 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v48 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v39 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v65 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v66 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v67 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v17 +; GFX11-NEXT: v_or_b32_e32 v17, v18, v19 +; GFX11-NEXT: v_or_b32_e32 v18, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v19, v22, v23 +; GFX11-NEXT: v_or_b32_e32 v20, v24, v25 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; 
GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB43_2 +; GFX11-NEXT: .LBB43_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v31, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v32, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v35, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v5, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v37, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v118, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v119, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v2, v117, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v114, v3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v4, v115, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v116, v5 +; GFX11-NEXT: v_or_b32_e32 v6, v113, v6 +; GFX11-NEXT: v_add_nc_u16 v7, v16, 3 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v5 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v6, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v18, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v20, 3 +; GFX11-NEXT: v_add_nc_u16 v10, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v7, v101, v7 +; GFX11-NEXT: v_or_b32_e32 v6, v100, v6 +; GFX11-NEXT: v_or_b32_e32 v8, v102, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v103, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v112, v10 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v6 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v8 +; GFX11-NEXT: v_or_b32_e32 v5, v9, v10 +; GFX11-NEXT: v_add_nc_u16 v6, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v7, v26, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v28, 3 +; GFX11-NEXT: v_add_nc_u16 v9, v30, 3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v10, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v11, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v71, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v68, 3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v87, v6 +; GFX11-NEXT: v_or_b32_e32 v7, v96, v7 +; GFX11-NEXT: v_or_b32_e32 v8, v97, v8 +; GFX11-NEXT: v_or_b32_e32 v9, v98, v9 +; GFX11-NEXT: v_or_b32_e32 v10, v99, v10 +; GFX11-NEXT: v_or_b32_e32 v11, v81, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v82, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v83, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v84, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v85, v15 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v6 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v7 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v8 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v9 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v10 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX11-NEXT: v_or_b32_e32 v7, v8, v9 +; GFX11-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-NEXT: v_or_b32_e32 v9, v12, v13 +; GFX11-NEXT: v_or_b32_e32 v10, v14, v15 +; GFX11-NEXT: v_add_nc_u16 v11, v64, 3 +; GFX11-NEXT: v_add_nc_u16 v12, v55, 3 +; GFX11-NEXT: v_add_nc_u16 v13, v54, 3 +; GFX11-NEXT: v_add_nc_u16 v14, v53, 3 +; GFX11-NEXT: v_add_nc_u16 v15, v52, 3 +; GFX11-NEXT: v_add_nc_u16 v16, v51, 3 +; GFX11-NEXT: v_add_nc_u16 v18, v50, 3 +; GFX11-NEXT: v_add_nc_u16 v20, v49, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v48, 3 +; GFX11-NEXT: v_add_nc_u16 v24, v39, 3 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, 
v20 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GFX11-NEXT: v_or_b32_e32 v11, v27, v11 +; GFX11-NEXT: v_or_b32_e32 v12, v29, v12 +; GFX11-NEXT: v_or_b32_e32 v13, v65, v13 +; GFX11-NEXT: v_or_b32_e32 v14, v66, v14 +; GFX11-NEXT: v_or_b32_e32 v15, v67, v15 +; GFX11-NEXT: v_or_b32_e32 v16, v17, v16 +; GFX11-NEXT: v_or_b32_e32 v17, v19, v18 +; GFX11-NEXT: v_or_b32_e32 v18, v21, v20 +; GFX11-NEXT: v_or_b32_e32 v19, v23, v22 +; GFX11-NEXT: v_or_b32_e32 v20, v25, v24 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v11 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v12 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v13 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v14 +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v15 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v16 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v18 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v19 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v20 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v12 +; GFX11-NEXT: v_or_b32_e32 v12, v13, v14 +; GFX11-NEXT: v_or_b32_e32 v13, v15, v16 +; GFX11-NEXT: v_or_b32_e32 v14, v17, v18 +; GFX11-NEXT: v_or_b32_e32 v15, v19, v20 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <8 x double> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <8 x double> + br label %end + +end: + %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x double> %phi +} + +define <32 x half> @bitcast_v32i16_to_v32f16(<32 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i16_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_mov_b32_e32 v62, v30 +; GCN-NEXT: v_mov_b32_e32 v61, v29 +; GCN-NEXT: v_mov_b32_e32 v60, v28 +; GCN-NEXT: v_mov_b32_e32 v59, v27 +; GCN-NEXT: v_mov_b32_e32 v58, v26 +; GCN-NEXT: v_mov_b32_e32 v57, v25 +; GCN-NEXT: v_mov_b32_e32 v56, v24 +; GCN-NEXT: v_mov_b32_e32 v47, v23 +; GCN-NEXT: v_mov_b32_e32 v46, v22 +; GCN-NEXT: v_mov_b32_e32 v45, v21 +; GCN-NEXT: v_mov_b32_e32 v44, v20 +; GCN-NEXT: v_mov_b32_e32 v43, v19 +; GCN-NEXT: v_mov_b32_e32 v42, v18 +; GCN-NEXT: v_mov_b32_e32 v41, v17 +; GCN-NEXT: v_mov_b32_e32 v40, v16 +; GCN-NEXT: v_mov_b32_e32 v55, v15 +; GCN-NEXT: v_mov_b32_e32 v54, v14 +; GCN-NEXT: v_mov_b32_e32 v53, v13 +; GCN-NEXT: v_mov_b32_e32 v52, v12 +; GCN-NEXT: v_mov_b32_e32 v51, v11 +; GCN-NEXT: v_mov_b32_e32 v50, v10 +; GCN-NEXT: v_mov_b32_e32 v49, v9 +; GCN-NEXT: v_mov_b32_e32 v48, v8 +; GCN-NEXT: v_mov_b32_e32 v39, v7 +; GCN-NEXT: v_mov_b32_e32 v38, v6 +; GCN-NEXT: v_mov_b32_e32 v37, v5 +; GCN-NEXT: v_mov_b32_e32 v36, v4 +; GCN-NEXT: v_mov_b32_e32 v35, v3 +; GCN-NEXT: v_mov_b32_e32 v34, v2 +; GCN-NEXT: v_mov_b32_e32 v33, v1 +; GCN-NEXT: v_mov_b32_e32 v32, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB44_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, 
v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v62 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v63 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: .LBB44_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB44_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v61 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v60 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v59 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v58 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v57 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v45 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v44 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v43 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v42 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v41 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v55 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v54 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v53 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v52 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v51 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v49 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v48 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v39 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v37 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v36 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v35 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v34 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v33 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: .LBB44_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32i16_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB44_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v16, 3 +; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v15, 3, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v19 +; VI-NEXT: v_add_u16_sdwa v19, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v19 +; VI-NEXT: v_add_u16_sdwa v19, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v13, 3, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v19 +; VI-NEXT: v_add_u16_sdwa v19, v12, v16 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v12, 3, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v19
+; VI-NEXT: v_add_u16_sdwa v19, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v11, 3, v11
+; VI-NEXT: v_or_b32_e32 v11, v11, v19
+; VI-NEXT: v_add_u16_sdwa v19, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v10, 3, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v19
+; VI-NEXT: v_add_u16_sdwa v19, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v9, 3, v9
+; VI-NEXT: v_or_b32_e32 v9, v9, v19
+; VI-NEXT: v_add_u16_sdwa v19, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v8, 3, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v19
+; VI-NEXT: v_add_u16_sdwa v19, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v19
+; VI-NEXT: v_add_u16_sdwa v19, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v19
+; VI-NEXT: v_add_u16_sdwa v19, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v19
+; VI-NEXT: v_add_u16_sdwa v19, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_add_u16_sdwa v17, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v18, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v4, v19
+; VI-NEXT: v_add_u16_sdwa v19, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v16, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_e32 v3, v3, v16
+; VI-NEXT: v_or_b32_e32 v2, v2, v19
+; VI-NEXT: v_or_b32_e32 v1, v1, v18
+; VI-NEXT: v_or_b32_e32 v0, v0, v17
+; VI-NEXT: .LBB44_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32i16_to_v32f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB44_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB44_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32i16_to_v32f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB44_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: .LBB44_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i16> %a, splat (i16 3)
+  %a2 = bitcast <32 x i16> %a1 to <32 x half>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i16> %a to <32 x half>
+  br label %end
+
+end:
+  %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x half> %phi
+}
+
+define <32 x i16> @bitcast_v32f16_to_v32i16(<32 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32f16_to_v32i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v32
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB45_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30
+; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31
+; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v31
+; GCN-NEXT: v_or_b32_e32 v30, v30, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26
+; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v27
+; GCN-NEXT: v_or_b32_e32 v26, v26, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
+; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v23
+; GCN-NEXT: v_or_b32_e32 v22, v22, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v19
+; GCN-NEXT: v_or_b32_e32 v18, v18, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GCN-NEXT: v_or_b32_e32 v14, v14, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10
+; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11
+; GCN-NEXT: v_or_b32_e32 v10, v10, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v7
+; GCN-NEXT: v_or_b32_e32 v6, v6, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3
+; GCN-NEXT: v_or_b32_e32 v2, v2, v32
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
+; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
+; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_or_b32_e32 v4, v4, v5
+; GCN-NEXT: v_or_b32_e32 v8, v8, v9
+; GCN-NEXT: v_or_b32_e32 v12, v12, v13
+; GCN-NEXT: v_or_b32_e32 v16, v16, v17
+; GCN-NEXT: v_or_b32_e32 v20, v20, v21
+; GCN-NEXT: v_or_b32_e32 v24, v24, v25
+; GCN-NEXT: v_or_b32_e32 v28, v28, v29
+; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v9, 16
+; GCN-NEXT: v_alignbit_b32 v13, v14, v13, 16
+; GCN-NEXT: v_alignbit_b32 v17, v18, v17, 16
+; GCN-NEXT: v_alignbit_b32 v21, v22, v21, 16
+; GCN-NEXT: v_alignbit_b32 v25, v26, v25, 16
+; GCN-NEXT: v_alignbit_b32 v29, v30, v29, 16
+; GCN-NEXT: .LBB45_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32f16_to_v32i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB45_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v17, 0x200
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v15
+; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v15, v19, v15
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v14
+; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v14, v19, v14
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v13
+; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v13, v19, v13
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v12
+; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v12, v19, v12
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v11
+; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v11, v19, v11
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v10
+; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v10, v19, v10
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v9
+; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v9, v19, v9
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v8
+; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v8, v19, v8
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v7
+; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v7, v19, v7
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v6
+; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v6, v19, v6
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v5
+; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v5, v19, v5
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v4
+; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v16, 0x200, v0
+; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v18, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v19, v4
+; VI-NEXT: v_add_f16_e32 v19, 0x200, v2
+; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_or_b32_e32 v3, v3, v17
+; VI-NEXT: v_or_b32_e32 v2, v19, v2
+; VI-NEXT: v_or_b32_e32 v1, v18, v1
+; VI-NEXT: v_or_b32_e32 v0, v16, v0
+; VI-NEXT: .LBB45_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32f16_to_v32i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB45_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB45_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32f16_to_v32i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB45_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB45_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = fadd <32 x half> %a, splat (half 0xH0200)
+  %a2 = bitcast <32 x half> %a1 to <32 x i16>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x half> %a to <32 x i16>
+  br label %end
+
+end:
+  %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x i16> %phi
+}
+
+define <32 x bfloat> @bitcast_v32i16_to_v32bf16(<32 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32i16_to_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v55, v30
+; GCN-NEXT: v_mov_b32_e32 v54, v28
+; GCN-NEXT: v_mov_b32_e32 v53, v26
+; GCN-NEXT: v_mov_b32_e32 v52, v24
+; GCN-NEXT: v_mov_b32_e32 v51, v22
+; GCN-NEXT: v_mov_b32_e32 v50, v20
+; GCN-NEXT: v_mov_b32_e32 v49, v18
+; GCN-NEXT: v_mov_b32_e32 v48, v16
+; GCN-NEXT: v_mov_b32_e32 v39, v14
+; GCN-NEXT: v_mov_b32_e32 v38, v12
+; GCN-NEXT: v_mov_b32_e32 v37, v10
+; GCN-NEXT: v_mov_b32_e32 v36, v8
+; GCN-NEXT: v_mov_b32_e32 v35, v6
+; GCN-NEXT: v_mov_b32_e32 v34, v4
+; GCN-NEXT: v_mov_b32_e32 v33, v2
+; GCN-NEXT: v_mov_b32_e32 v32, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v0
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB46_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB46_4
+; GCN-NEXT: .LBB46_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB46_3: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v35
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v36
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v52
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v55
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB46_2
+; GCN-NEXT: .LBB46_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v55
+; GCN-NEXT: s_mov_b32 s6, 0x30000
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v54
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v53
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v52
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v51
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v50
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v49
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v48
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v39
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v38
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v37
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v36
+; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v35
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v34
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v33
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v32
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30
+; GCN-NEXT: v_or_b32_e32 v0, v31, v0
+; GCN-NEXT: v_or_b32_e32 v2, v29, v2
+; GCN-NEXT: v_or_b32_e32 v4, v27, v4
+; GCN-NEXT: v_or_b32_e32 v6, v25, v6
+; GCN-NEXT: v_or_b32_e32 v8, v23, v8
+; GCN-NEXT: v_or_b32_e32 v10, v21, v10
+; GCN-NEXT: v_or_b32_e32 v12, v19, v12
+; GCN-NEXT: v_or_b32_e32 v14, v17, v14
+; GCN-NEXT: v_or_b32_e32 v15, v15, v16
+; GCN-NEXT: v_or_b32_e32 v13, v13, v18
+; GCN-NEXT: v_or_b32_e32 v11, v11, v20
+; GCN-NEXT: v_or_b32_e32 v9, v9, v22
+; GCN-NEXT: v_or_b32_e32 v7, v7, v24
+; GCN-NEXT: v_or_b32_e32 v5, v5, v26
+; GCN-NEXT: v_or_b32_e32 v3, v3, v28
+; GCN-NEXT: v_or_b32_e32 v1, v1, v30
+; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0
+; GCN-NEXT: v_add_i32_e32 v28, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v6
+; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v8
+; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v10
+; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v12
+; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v14
+; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v15
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v13
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v11
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v9
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v7
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v5
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32i16_to_v32bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB46_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v16, 3
+; VI-NEXT: v_add_u16_sdwa v19, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v15, 3, v15
+; VI-NEXT: v_or_b32_e32 v15, v15, v19
+; VI-NEXT: v_add_u16_sdwa v19, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v14, 3, v14
+; VI-NEXT: v_or_b32_e32 v14, v14, v19
+; VI-NEXT: v_add_u16_sdwa v19, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v13, 3, v13
+; VI-NEXT: v_or_b32_e32 v13, v13, v19
+; VI-NEXT: v_add_u16_sdwa v19, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v12, 3, v12
+; VI-NEXT: v_or_b32_e32 v12, v12, v19
+; VI-NEXT: v_add_u16_sdwa v19, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v11, 3, v11
+; VI-NEXT: v_or_b32_e32 v11, v11, v19
+; VI-NEXT: v_add_u16_sdwa v19, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v10, 3, v10
+; VI-NEXT: v_or_b32_e32 v10, v10, v19
+; VI-NEXT: v_add_u16_sdwa v19, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v9, 3, v9
+; VI-NEXT: v_or_b32_e32 v9, v9, v19
+; VI-NEXT: v_add_u16_sdwa v19, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v8, 3, v8
+; VI-NEXT: v_or_b32_e32 v8, v8, v19
+; VI-NEXT: v_add_u16_sdwa v19, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v7, 3, v7
+; VI-NEXT: v_or_b32_e32 v7, v7, v19
+; VI-NEXT: v_add_u16_sdwa v19, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v6, 3, v6
+; VI-NEXT: v_or_b32_e32 v6, v6, v19
+; VI-NEXT: v_add_u16_sdwa v19, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v5, 3, v5
+; VI-NEXT: v_or_b32_e32 v5, v5, v19
+; VI-NEXT: v_add_u16_sdwa v19, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v4, 3, v4
+; VI-NEXT: v_add_u16_sdwa v17, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v18, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v4, v4, v19
+; VI-NEXT: v_add_u16_sdwa v19, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v16, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_add_u16_e32 v2, 3, v2
+; VI-NEXT: v_add_u16_e32 v1, 3, v1
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_e32 v3, v3, v16
+; VI-NEXT: v_or_b32_e32 v2, v2, v19
+; VI-NEXT: v_or_b32_e32 v1, v1, v18
+; VI-NEXT: v_or_b32_e32 v0, v0, v17
+; VI-NEXT: .LBB46_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32i16_to_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB46_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: .LBB46_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32i16_to_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB46_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: .LBB46_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i16> %a, splat (i16 3)
+  %a2 = bitcast <32 x i16> %a1 to <32 x bfloat>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i16> %a to <32 x bfloat>
+  br label %end
+
+end:
+  %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x bfloat> %phi
+}
+
+define <32 x i16> @bitcast_v32bf16_to_v32i16(<32 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32bf16_to_v32i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v29
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v30
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v55
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v31
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr31
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB47_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v63
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v62
+; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v61
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v60
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34
+; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v59
+; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v58
+; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
+; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v57
+; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38
+; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v47
+; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v46
+; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v48
+; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v44
+; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v50
+; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v43
+; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v42
+; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v52
+; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v41
+; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v40
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v54
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: .LBB47_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB47_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v59
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v58
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v57
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v56
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v47
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v46
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v45
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v44
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v43
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v42
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v41
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v40
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v54
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v55
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v53
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v52
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v51
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v49
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v48
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v39
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v38
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v37
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v36
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v35
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34
+; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v33
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32
+; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: v_add_f32_e32 v32, 0x40c00000, v4
+; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5
+; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v6
+; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v7
+; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v8
+; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v9
+; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v10
+; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v11
+; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v12
+; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v13
+; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v14
+; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v15
+; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v16
+; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v17
+; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v18
+; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v19
+; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v20
+; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v21
+; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v22
+; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v23
+; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v24
+; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v25
+; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v26
+; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v27
+; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v28
+; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v29
+; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v30
+; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v31
+; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15
+; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v12
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v16
+; GCN-NEXT: v_and_b32_e32 v52, 0xffff0000, v11
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v19
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v8
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v20
+; GCN-NEXT: v_and_b32_e32 v54, 0xffff0000, v7
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v22
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v6
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v24
+; GCN-NEXT: v_and_b32_e32 v40, 0xffff0000, v4
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v26
+; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_alignbit_b32 v0, v30, v0, 16
+; GCN-NEXT: v_alignbit_b32 v4, v36, v2, 16
+; GCN-NEXT: v_alignbit_b32 v8, v37, v32, 16
+; GCN-NEXT: v_alignbit_b32 v12, v38, v5, 16
+; GCN-NEXT: v_alignbit_b32 v16, v39, v33, 16
+; GCN-NEXT: v_alignbit_b32 v20, v48, v9, 16
+; GCN-NEXT: v_alignbit_b32 v24, v49, v10, 16
+; GCN-NEXT: v_alignbit_b32 v28, v50, v13, 16
+; GCN-NEXT: v_alignbit_b32 v30, v31, v14, 16
+; GCN-NEXT: v_alignbit_b32 v26, v27, v17, 16
+; GCN-NEXT: v_alignbit_b32 v22, v23, v18, 16
+; GCN-NEXT: v_alignbit_b32 v18, v19, v21, 16
+; GCN-NEXT: v_alignbit_b32 v14, v15, v34, 16
+; GCN-NEXT: v_alignbit_b32 v10, v11, v25, 16
+; GCN-NEXT: v_alignbit_b32 v6, v7, v35, 16
+; GCN-NEXT: v_alignbit_b32 v2, v3, v29, 16
+; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16
+; GCN-NEXT: v_alignbit_b32 v5, v6, v41, 16
+; GCN-NEXT: v_alignbit_b32 v9, v10, v40, 16
+; GCN-NEXT: v_alignbit_b32 v13, v14, v55, 16
+; GCN-NEXT: v_alignbit_b32 v17, v18, v54, 16
+; GCN-NEXT: v_alignbit_b32 v21, v22, v53, 16
+; GCN-NEXT: v_alignbit_b32 v25, v26, v52, 16
+; GCN-NEXT: v_alignbit_b32 v29, v30, v51, 16
+; GCN-NEXT: .LBB47_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32bf16_to_v32i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB47_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_bfe_u32 v17, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16
+; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; VI-NEXT: v_bfe_u32 v17, v0, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0
+; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17
+; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; VI-NEXT: v_bfe_u32 v19, v18, 16, 1
+; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18
+; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; VI-NEXT: v_bfe_u32 v19, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v2
+; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19
+; VI-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; VI-NEXT: v_bfe_u32 v20, v19, 16, 1
+; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19
+; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; VI-NEXT: v_bfe_u32 v20, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v3
+; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20
+; VI-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; VI-NEXT: v_bfe_u32 v21, v20, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20
+; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; VI-NEXT: v_bfe_u32 v21, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v4
+; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21
+; VI-NEXT: v_or_b32_e32 v22, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v5
+; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; VI-NEXT: v_bfe_u32 v22, v21, 16, 1
+; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21
+; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; VI-NEXT: v_bfe_u32 v22, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v5
+; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22
+; VI-NEXT: v_or_b32_e32 v23, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v6
+; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; VI-NEXT: v_bfe_u32 v23, v22, 16, 1
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22
+; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; VI-NEXT: v_bfe_u32 v23, v6, 16, 1
+; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v6
+; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23
+; VI-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v7
+; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; VI-NEXT: v_bfe_u32 v24, v23, 16, 1
+; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23
+; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; VI-NEXT: v_bfe_u32 v24, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v7
+; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24
+; VI-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v8
+; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; VI-NEXT: v_bfe_u32 v25, v24, 16, 1
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24
+; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; VI-NEXT: v_bfe_u32 v25, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v8
+; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25
+; VI-NEXT: v_or_b32_e32 v26, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; VI-NEXT: v_bfe_u32 v26, v25, 16, 1
+; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25
+; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; VI-NEXT: v_bfe_u32 v26, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v9
+; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26
+; VI-NEXT: v_or_b32_e32 v27, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v10
+; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; VI-NEXT: v_bfe_u32 v27, v26, 16, 1
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26
+; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; VI-NEXT: v_bfe_u32 v27, v10, 16, 1
+; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10
+; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27
+; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v11
+; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; VI-NEXT: v_bfe_u32 v28, v27, 16, 1
+; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27
+; VI-NEXT: v_add_u32_e32 v28, vcc, s6, v28
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; VI-NEXT: v_bfe_u32 v28, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v11
+; VI-NEXT: v_add_u32_e32 v28, vcc, s6, v28
+; VI-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v12
+; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; VI-NEXT: v_bfe_u32 v29, v28, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28
+; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; VI-NEXT: v_bfe_u32 v29, v12, 16, 1
+; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v12
+; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29
+; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v13
+; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; VI-NEXT: v_bfe_u32 v30, v29, 16, 1
+; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29
+; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc
+; VI-NEXT: v_bfe_u32 v30, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v13
+; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30
+; VI-NEXT: v_or_b32_e32 v31, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14
+; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30
+; VI-NEXT: v_bfe_u32 v31, v30, 16, 1
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc
+; VI-NEXT: v_bfe_u32 v31, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14
+; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31
+; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc
+; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31
+; VI-NEXT: v_bfe_u32 v32, v31, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
+; VI-NEXT: v_bfe_u32 v32, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v15
+; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32
+; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_alignbit_b32 v15, v15, v31, 16
+; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16
+; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16
+; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16
+; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16
+; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16
+; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16
+; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16
+; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16
+; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16
+; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16
+; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16
+; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16
+; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16
+; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
+; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16
+; VI-NEXT: .LBB47_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32bf16_to_v32i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB47_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
+; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_add3_u32 v19, v19, v18, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v19, v19, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_add3_u32 v20, v20, v19, s6
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v20, v20, v3, s6
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v4
+; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_add3_u32 v21, v21, v20, s6
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v21, v21, v4, s6
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v5
+; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_add3_u32 v22, v22, v21, s6
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1
+; GFX9-NEXT: v_add3_u32 v22, v22, v5, s6
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1
+; GFX9-NEXT: v_add3_u32 v23, v23, v6, s6
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1
+; GFX9-NEXT: v_add3_u32 v24, v24, v7, s6
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8
+; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v9
+; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc
+; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v9, s6
+; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v10
+; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26
+; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_add3_u32 v27, v27, v26, s6
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc
+; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1
+; GFX9-NEXT: v_add3_u32 v27, v27, v10, s6
+; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v11
+; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27
+; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_add3_u32 v28, v28, v27, s6
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc
+; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v12
+; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT: v_add3_u32 v29, v29, v28, s6
+; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc
+; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v29, v29, v12, s6
+; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29
+; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT: v_add3_u32 v30, v30, v29, s6
+; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc
+; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v30, v30, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v14
+; GFX9-NEXT:
v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v31, v31, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v31, v31, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v32, v32, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v31, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v30, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v29, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v28, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v27, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v26, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v25, s6 +; GFX9-NEXT: v_perm_b32 v8, v8, v24, s6 +; GFX9-NEXT: v_perm_b32 v7, v7, v23, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v22, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v21, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v20, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v19, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 +; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 +; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v11 +; GFX11-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v6 +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v19, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v19, v19, v16, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: 
v_dual_lshlrev_b32 v27, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v19, v22, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v22, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v31, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v20, v23 :: v_dual_lshlrev_b32 v23, 16, v4 +; GFX11-NEXT: v_bfe_u32 v20, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v21, v19, vcc_lo +; GFX11-NEXT: v_add3_u32 v19, v20, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v19, v20 :: v_dual_add_f32 v18, 0x40c00000, v18 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_bfe_u32 v21, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_add3_u32 v19, v21, v18, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-NEXT: v_perm_b32 v1, v1, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v18, v19, v20 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_add3_u32 v19, v21, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v21, v22, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v20 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v21, v22, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v22 +; GFX11-NEXT: v_bfe_u32 v21, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, 
v22 +; GFX11-NEXT: v_bfe_u32 v22, v23, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v20, v21, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v20, v22, v23, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v23 +; GFX11-NEXT: v_bfe_u32 v22, v4, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_bfe_u32 v23, v24, 16, 1 +; GFX11-NEXT: v_perm_b32 v3, v3, v19, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo +; GFX11-NEXT: v_add3_u32 v21, v22, v4, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc_lo +; GFX11-NEXT: v_add3_u32 v21, v23, v24, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v24 +; GFX11-NEXT: v_bfe_u32 v23, v5, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: v_bfe_u32 v24, v25, 16, 1 +; GFX11-NEXT: v_perm_b32 v4, v4, v20, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc_lo +; GFX11-NEXT: v_add3_u32 v22, v23, v5, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc_lo +; GFX11-NEXT: v_add3_u32 v22, v24, v25, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v25 +; GFX11-NEXT: v_bfe_u32 v24, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: v_bfe_u32 v25, v26, 16, 1 +; GFX11-NEXT: v_perm_b32 v5, v5, v21, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc_lo +; GFX11-NEXT: v_add3_u32 v23, v24, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc_lo +; GFX11-NEXT: v_add3_u32 v23, v25, v26, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v26 +; GFX11-NEXT: v_bfe_u32 v25, v7, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-NEXT: v_bfe_u32 v26, v27, 16, 1 +; GFX11-NEXT: v_perm_b32 v6, v6, v22, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo +; GFX11-NEXT: v_add3_u32 v24, v25, v7, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc_lo +; GFX11-NEXT: v_add3_u32 v24, v26, v27, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v27 +; GFX11-NEXT: v_bfe_u32 v26, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-NEXT: v_bfe_u32 v27, v28, 16, 1 +; GFX11-NEXT: v_perm_b32 v7, v7, v23, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo +; GFX11-NEXT: v_add3_u32 v25, v26, v8, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v8 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc_lo +; GFX11-NEXT: v_add3_u32 v25, v27, v28, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v28 +; GFX11-NEXT: v_bfe_u32 v27, v9, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, 
v28 +; GFX11-NEXT: v_bfe_u32 v28, v29, 16, 1 +; GFX11-NEXT: v_perm_b32 v8, v8, v24, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo +; GFX11-NEXT: v_add3_u32 v26, v27, v9, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc_lo +; GFX11-NEXT: v_add3_u32 v26, v28, v29, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v29 +; GFX11-NEXT: v_bfe_u32 v28, v10, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 +; GFX11-NEXT: v_bfe_u32 v29, v30, 16, 1 +; GFX11-NEXT: v_perm_b32 v9, v9, v25, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v27, vcc_lo +; GFX11-NEXT: v_add3_u32 v27, v28, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc_lo +; GFX11-NEXT: v_add3_u32 v27, v29, v30, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v30 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_bfe_u32 v30, v31, 16, 1 +; GFX11-NEXT: v_bfe_u32 v29, v11, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v28 :: v_dual_lshlrev_b32 v28, 16, v13 +; GFX11-NEXT: v_add3_u32 v30, v30, v31, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: v_add3_u32 v31, v34, v12, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-NEXT: v_add3_u32 v29, v29, v11, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v33, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v28 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v31, v33, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v34, v35, v28, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_bfe_u32 v37, v13, 16, 1 +; GFX11-NEXT: v_perm_b32 v10, v10, v26, 0x7060302 +; GFX11-NEXT: v_dual_add_f32 v31, 0x40c00000, v35 :: v_dual_cndmask_b32 v28, v34, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v15 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: v_add3_u32 v33, v37, v13, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-NEXT: v_bfe_u32 v38, v14, 16, 1 +; GFX11-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v34 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo +; GFX11-NEXT: v_add3_u32 v37, v38, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_bfe_u32 v35, v15, 16, 1 +; GFX11-NEXT: v_add3_u32 v39, v39, v34, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v37, v38, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; 
GFX11-NEXT: v_add3_u32 v35, v35, v15, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v14, v14, v31, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v34, v39, v48, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v35, v49, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v15, v15, v34, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v33, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_perm_b32 v12, v12, v30, 0x7060302 +; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v29, v32, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v11, v11, v27, 0x7060302 +; GFX11-NEXT: .LBB47_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + +define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v32i16_to_v64i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v4 +; GCN-NEXT: s_waitcnt expcnt(6) +; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v6 +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v8 +; GCN-NEXT: s_waitcnt expcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v10 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v12 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: 
v_lshlrev_b32_e32 v62, 16, v14 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v49 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr31 +; 
GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB48_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v21 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v38 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v4 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v8 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v12 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v16 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v40, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v49 +; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_bfe_u32 v4, v4, 8, 8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v4, v8, 8, 8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: v_bfe_u32 v44, v12, 8, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v4, v16, 8, 8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v4, v20, 8, 8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v4, v24, 8, 8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v4, v28, 8, 8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 
offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_bfe_u32 v4, v49, 8, 8 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: v_or_b32_e32 v52, v1, v46 +; GCN-NEXT: v_or_b32_e32 v51, v2, v47 +; GCN-NEXT: v_or_b32_e32 v48, v3, v57 +; GCN-NEXT: v_or_b32_e32 v37, v5, v59 +; GCN-NEXT: v_or_b32_e32 v32, v6, v60 +; GCN-NEXT: v_or_b32_e32 v6, v7, v61 +; GCN-NEXT: v_or_b32_e32 v58, v9, v62 +; GCN-NEXT: v_or_b32_e32 v56, v11, v63 +; GCN-NEXT: v_or_b32_e32 v42, v13, v10 +; GCN-NEXT: v_or_b32_e32 v54, v15, v14 +; GCN-NEXT: v_or_b32_e32 v50, v17, v22 +; GCN-NEXT: v_or_b32_e32 v39, v18, v33 +; GCN-NEXT: v_or_b32_e32 v36, v19, v26 +; GCN-NEXT: v_or_b32_e32 v31, v21, v34 +; GCN-NEXT: v_or_b32_e32 v18, v23, v30 +; GCN-NEXT: v_or_b32_e32 v2, v25, v35 +; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v55, v6, v32, 24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v32, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v6, v32, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v41, v56, v58, 24 +; GCN-NEXT: v_alignbit_b32 v43, v56, v58, 16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v56, v58, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v45, v39, v50, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v56 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: .LBB48_2: ; %Flow +; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v49, v40 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword 
v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB48_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v29 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v38 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v2, v30, v2 +; GCN-NEXT: v_or_b32_e32 v4, v35, v4 +; GCN-NEXT: v_or_b32_e32 v6, v26, v6 +; GCN-NEXT: v_or_b32_e32 v8, v34, v8 +; GCN-NEXT: v_or_b32_e32 v12, v22, v12 +; GCN-NEXT: v_or_b32_e32 v16, v33, v16 +; GCN-NEXT: v_or_b32_e32 v10, v10, v17 +; GCN-NEXT: v_or_b32_e32 v14, v14, v18 +; GCN-NEXT: v_or_b32_e32 v13, v62, v13 +; GCN-NEXT: v_or_b32_e32 v15, v63, v15 +; GCN-NEXT: v_or_b32_e32 v9, v60, v9 +; GCN-NEXT: v_or_b32_e32 v11, v61, v11 +; GCN-NEXT: v_or_b32_e32 v5, v57, v5 +; GCN-NEXT: v_or_b32_e32 v7, v59, v7 +; GCN-NEXT: v_or_b32_e32 v1, v46, v1 +; GCN-NEXT: v_or_b32_e32 v3, v47, v3 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v31, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v3 +; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 
v1, v51, v52, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v51, v52, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v4, v37, v48, 24 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v37, v48, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v8, v37, v48, 8 +; GCN-NEXT: v_alignbit_b32 v55, v6, v32, 24 +; GCN-NEXT: v_alignbit_b32 v20, v6, v32, 16 +; GCN-NEXT: v_alignbit_b32 v12, v6, v32, 8 +; GCN-NEXT: v_alignbit_b32 v41, v56, v58, 24 +; GCN-NEXT: v_alignbit_b32 v43, v56, v58, 16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v56, v58, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v54, v42, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v39, v50, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v45, v39, v50, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v31, v36, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 24 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v1, v2, v18, 8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v51 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v24, 24, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v37 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; GCN-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v56 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v56 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v56 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v54 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v54 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v39 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: .LBB48_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v5, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v7, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8 +; GCN-NEXT: v_or_b32_e32 v8, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v10, v1, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; GCN-NEXT: v_or_b32_e32 v12, v1, v3 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v1 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v28 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GCN-NEXT: v_or_b32_e32 v13, v1, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v58 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GCN-NEXT: v_or_b32_e32 v14, v3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v24 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v56 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GCN-NEXT: v_or_b32_e32 v16, v4, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 12, v0 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 24, v55 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v42 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GCN-NEXT: v_or_b32_e32 v17, v6, v9 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v44 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v54 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GCN-NEXT: v_or_b32_e32 v21, v9, v11 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v41 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; GCN-NEXT: v_or_b32_e32 v24, v11, v15 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v55, 0xff, v15 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v39 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; GCN-NEXT: v_or_b32_e32 v22, v15, v19 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 28, v0 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v19 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v36 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; 
GCN-NEXT: v_or_b32_e32 v36, v19, v25
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 32, v0
+; GCN-NEXT: v_and_b32_e32 v45, 0xff, v49
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v25
+; GCN-NEXT: v_and_b32_e32 v25, 0xff, v31
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 8, v31
+; GCN-NEXT: v_or_b32_e32 v31, v25, v31
+; GCN-NEXT: v_add_i32_e32 v25, vcc, 36, v0
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v46, 0xff, v50
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v50
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 8, v50
+; GCN-NEXT: v_or_b32_e32 v50, v18, v50
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 40, v0
+; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 24, v53
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 8, v54
+; GCN-NEXT: v_or_b32_e32 v54, v2, v54
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 44, v0
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v56, 0xff, v40
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_or_b32_e32 v33, v33, v32
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 48, v0
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v57, 0xff, v41
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 24, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_or_b32_e32 v43, v27, v26
+; GCN-NEXT: v_add_i32_e32 v26, vcc, 52, v0
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v58, 0xff, v27
+; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 24, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28
+; GCN-NEXT: v_or_b32_e32 v29, v29, v27
+; GCN-NEXT: v_add_i32_e32 v27, vcc, 56, v0
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v59, 0xff, v28
+; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v34
+; GCN-NEXT: v_or_b32_e32 v34, v35, v28
+; GCN-NEXT: v_add_i32_e32 v28, vcc, 60, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v55
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_or_b32_e32 v20, v20, v35
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_or_b32_e32 v23, v23, v38
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v30, v30, v48
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_or_b32_e32 v35, v37, v51
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_or_b32_e32 v37, v39, v55
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GCN-NEXT: v_or_b32_e32 v38, v49, v42
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GCN-NEXT: v_or_b32_e32 v39, v52, v45
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_or_b32_e32 v48, v53, v46
+; GCN-NEXT: v_and_b32_e32 v36, 0xffff, v36
+; GCN-NEXT: v_or_b32_e32 v49, v40, v47
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31
+; GCN-NEXT: v_or_b32_e32 v51, v41, v56
+; GCN-NEXT: v_and_b32_e32 v50, 0xffff, v50
+; GCN-NEXT: v_or_b32_e32 v52, v44, v57
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v54
+; GCN-NEXT: v_or_b32_e32 v54, v60, v58
+; GCN-NEXT: v_or_b32_e32 v5, v5, v33
+; GCN-NEXT: v_or_b32_e32 v7, v7, v43
+; GCN-NEXT: v_or_b32_e32 v8, v8, v29
+; GCN-NEXT: v_or_b32_e32 v10, v10, v34
+; GCN-NEXT: v_or_b32_e32 v12, v12, v20
+; GCN-NEXT: v_or_b32_e32 v13, v13, v23
+; GCN-NEXT: v_or_b32_e32 v14, v14, v30
+; GCN-NEXT: v_or_b32_e32 v16, v16, v35
+; GCN-NEXT: v_or_b32_e32 v17, v17, v37
+; GCN-NEXT: v_or_b32_e32 v20, v21, v38
+; GCN-NEXT: v_or_b32_e32 v21, v24, v39
+; GCN-NEXT: v_or_b32_e32 v22, v22, v48
+; GCN-NEXT: v_or_b32_e32 v23, v36, v49
+; GCN-NEXT: v_or_b32_e32 v24, v31, v51
+; GCN-NEXT: v_or_b32_e32 v29, v50, v52
+; GCN-NEXT: v_or_b32_e32 v30, v53, v54
+; GCN-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v10, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v12, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v25, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v29, v27, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v28, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32i16_to_v64i8:
+; VI: ; %bb.0:
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v29, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v28, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v1
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; kill: killed $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr23
+; VI-NEXT: ; implicit-def: $vgpr22
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr20
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB48_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v14
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v13
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v12
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v12
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v11
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v10
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v10
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 24, v8
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v8
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v7
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v6
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[15:16]
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v5
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[13:14]
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v43, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1
+; VI-NEXT: v_mov_b32_e32 v25, v50
+; VI-NEXT: v_mov_b32_e32 v41, v1
+; VI-NEXT: v_mov_b32_e32 v54, v2
+; VI-NEXT: v_mov_b32_e32 v57, v3
+; VI-NEXT: v_mov_b32_e32 v47, v4
+; VI-NEXT: v_mov_b32_e32 v61, v5
+; VI-NEXT: v_mov_b32_e32 v60, v6
+; VI-NEXT: v_mov_b32_e32 v52, v7
+; VI-NEXT: v_mov_b32_e32 v63, v8
+; VI-NEXT: v_mov_b32_e32 v40, v9
+; VI-NEXT: v_mov_b32_e32 v53, v10
+; VI-NEXT: v_mov_b32_e32 v17, v11
+; VI-NEXT: v_mov_b32_e32 v44, v12
+; VI-NEXT: v_mov_b32_e32 v58, v13
+; VI-NEXT: v_mov_b32_e32 v56, v14
+; VI-NEXT: v_mov_b32_e32 v50, v15
+; VI-NEXT: v_mov_b32_e32 v62, v16
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: ; implicit-def: $vgpr5
+; VI-NEXT: ; implicit-def: $vgpr7
+; VI-NEXT: ; implicit-def: $vgpr9
+; VI-NEXT: ; implicit-def: $vgpr11
+; VI-NEXT: ; implicit-def: $vgpr13
+; VI-NEXT: ; implicit-def: $vgpr15
+; VI-NEXT: .LBB48_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB48_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v18, 3
+; VI-NEXT: v_add_u16_sdwa v26, v16, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v29, v15, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v62, 3, v16
+; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v26
+; VI-NEXT: v_add_u16_e32 v50, 3, v15
+; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v29
+; VI-NEXT: v_or_b32_e32 v16, v62, v16
+; VI-NEXT: v_or_b32_e32 v15, v50, v15
+; VI-NEXT: v_add_u16_sdwa v38, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v49, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v36, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v48, v3, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v34, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v39, v5, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v32, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v37, v7, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v30, v10, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v35, v9, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v28, v12, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v33, v11, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v27, v14, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_u16_sdwa v31, v13, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[15:16]
+; VI-NEXT: v_add_u16_e32 v56, 3, v14
+; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v27
+; VI-NEXT: v_add_u16_e32 v58, 3, v13
+; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v31
+; VI-NEXT: v_or_b32_e32 v14, v56, v14
+; VI-NEXT: v_or_b32_e32 v13, v58, v13
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT: v_add_u16_e32 v44, 3, v12
+; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v28
+; VI-NEXT: v_add_u16_e32 v17, 3, v11
+; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v33
+; VI-NEXT: v_add_u16_e32 v53, 3, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v30
+; VI-NEXT: v_add_u16_e32 v40, 3, v9
+; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v35
+; VI-NEXT: v_or_b32_e32 v12, v44, v12
+; VI-NEXT: v_or_b32_e32 v11, v17, v11
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; VI-NEXT: v_add_u16_e32 v63, 3, v8
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v32
+; VI-NEXT: v_add_u16_e32 v52, 3, v7
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v37
+; VI-NEXT: v_or_b32_e32 v10, v53, v10
+; VI-NEXT: v_or_b32_e32 v9, v40, v9
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_add_u16_e32 v60, 3, v6
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v34
+; VI-NEXT: v_add_u16_e32 v61, 3, v5
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v39
+; VI-NEXT: v_or_b32_e32 v8, v63, v8
+; VI-NEXT: v_or_b32_e32 v7, v52, v7
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; VI-NEXT: v_add_u16_e32 v47, 3, v4
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v36
+; VI-NEXT: v_add_u16_e32 v57, 3, v3
+; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v48
+; VI-NEXT: v_or_b32_e32 v6, v60, v6
+; VI-NEXT: v_or_b32_e32 v5, v61, v5
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; VI-NEXT: v_add_u16_e32 v54, 3, v2
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v38
+; VI-NEXT: v_add_u16_e32 v41, 3, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49
+; VI-NEXT: v_or_b32_e32 v4, v47, v4
+; VI-NEXT: v_or_b32_e32 v3, v57, v3
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
+; VI-NEXT: v_or_b32_e32 v2, v54, v2
+; VI-NEXT: v_or_b32_e32 v1, v41, v1
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v18, 8, v1
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
+; VI-NEXT: v_bfe_u32 v1, v27, 8, 8
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v28, 8, 8
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v30, 8, 8
+; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v16
+; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v14
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v51, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v2
+; VI-NEXT: v_bfe_u32 v25, v26, 8, 8
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; VI-NEXT: v_bfe_u32 v1, v32, 8, 8
+; VI-NEXT: v_bfe_u32 v43, v34, 8, 8
+; VI-NEXT: v_bfe_u32 v46, v36, 8, 8
+; VI-NEXT: v_bfe_u32 v59, v38, 8, 8
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; VI-NEXT: .LBB48_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v18
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24
+; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59
+; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23
+; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46
+; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v22
+; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43
+; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v21
+; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20
+; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v28, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v27, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
+; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; VI-NEXT: v_or_b32_sdwa v2, v26, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32i16_to_v64i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr29
+; GFX9-NEXT: ; implicit-def: $vgpr27
+; GFX9-NEXT: ; implicit-def: $vgpr23
+; GFX9-NEXT: ; implicit-def: $vgpr28
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr22
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr31
+; GFX9-NEXT: ; implicit-def: $vgpr30
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr26
+; GFX9-NEXT: ; implicit-def: $vgpr25
+; GFX9-NEXT: ; implicit-def: $vgpr21
+; GFX9-NEXT: ; implicit-def: $vgpr20
+; GFX9-NEXT: ; implicit-def: $vgpr19
+; GFX9-NEXT: ; implicit-def: $vgpr18
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB48_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16 +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v26, v23 +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: .LBB48_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB48_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: 
v_lshrrev_b32_e32 v24, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB48_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32i16_to_v64i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB48_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB48_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_and_b32_e32 v96, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v25 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v87 +; GFX11-NEXT: v_or_b32_e32 v24, v96, v24 +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v86, v85 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v83 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v24 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v80, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v71 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v70 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v84 +; GFX11-NEXT: v_or_b32_e32 v23, v83, v23 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v80 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v71 +; GFX11-NEXT: v_or_b32_e32 v22, v70, v22 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v23 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v25 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b16 
v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-NEXT: v_or_b32_e32 v21, v23, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v22, v25, v54 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v52 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v50 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v24 +; GFX11-NEXT: v_or_b32_e32 v23, v25, v49 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v48 +; GFX11-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v22 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v35 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v69 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v68 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v19, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v22 +; GFX11-NEXT: v_or_b32_e32 v18, v23, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v30 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v26 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v69 +; GFX11-NEXT: v_or_b32_e32 v67, v68, v67 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v22 +; GFX11-NEXT: v_or_b32_e32 v17, v23, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v17, 16, v17
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v67
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v19
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v18
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v20
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v17
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v21
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
+; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i16> %a, splat (i16 3)
+  %a2 = bitcast <32 x i16> %a1 to <64 x i8>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i16> %a to <64 x i8>
+  br label %end
+
+end:
+  %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <64 x i8> %phi
+}
+
+define <32 x i16> @bitcast_v64i8_to_v32i16(<64 x i8> %a, i32 %b) {
+; GCN-LABEL: bitcast_v64i8_to_v32i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(1)
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:96
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:92
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:84
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68
+;
GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:104 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v13 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v21 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v29 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v27 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v38 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v52 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v35 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v43, 8, v33 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v16 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v34 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v57, 8, v48 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v8 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v49 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v53 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v50 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; 
implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v12 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v20 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v5, v7 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v56 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v46 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v47 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v62 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v61 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v41 +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v63 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v40 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v58 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v59 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v60 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v21, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, 
v23 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v25, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; GCN-NEXT: v_or_b32_e32 v23, v28, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v30 +; GCN-NEXT: v_or_b32_e32 v24, v24, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v32 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v33, v29 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v34, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v35, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v12, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v14, v32 +; GCN-NEXT: v_or_b32_e32 v15, v15, v44 +; GCN-NEXT: v_or_b32_e32 v16, v16, v45 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v40, v32, v11 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v11, v8 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v11, v9 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v11, v6 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v41, v11, v7 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v7, v0 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v7, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v17 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v11, v18 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v11, v19 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v19, v21 +; GCN-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v19, v22 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v23 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v22, v25 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v23, v26 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v24, v27 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v26, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v29 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v30 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v31 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_or_b32_e32 v33, v1, v10 +; GCN-NEXT: v_or_b32_e32 v35, v3, v8 +; GCN-NEXT: v_or_b32_e32 v37, v4, v6 +; GCN-NEXT: v_or_b32_e32 v39, v5, v0 +; GCN-NEXT: v_or_b32_e32 v49, v7, v17 +; GCN-NEXT: v_or_b32_e32 v51, v11, v20 +; GCN-NEXT: v_or_b32_e32 v53, v19, v22 +; GCN-NEXT: v_or_b32_e32 v55, v23, v24 +; GCN-NEXT: v_or_b32_e32 v32, v27, v40 +; GCN-NEXT: v_or_b32_e32 v34, v28, v9 +; GCN-NEXT: v_or_b32_e32 v36, v29, v41 +; GCN-NEXT: v_or_b32_e32 v38, v12, v2 +; GCN-NEXT: v_or_b32_e32 v48, v13, v18 +; GCN-NEXT: v_or_b32_e32 v50, v14, v21 +; GCN-NEXT: v_or_b32_e32 v52, v15, v25 +; GCN-NEXT: v_or_b32_e32 v54, v16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; GCN-NEXT: v_alignbit_b32 v1, v33, v40, 16 +; GCN-NEXT: v_alignbit_b32 v5, v35, v9, 16 +; GCN-NEXT: v_alignbit_b32 v9, v37, v41, 16 +; GCN-NEXT: v_alignbit_b32 v13, v39, v2, 16 +; GCN-NEXT: v_alignbit_b32 v17, v49, v18, 16 +; GCN-NEXT: v_alignbit_b32 v21, v51, v21, 16 +; GCN-NEXT: v_alignbit_b32 v25, v53, v25, 16 +; GCN-NEXT: v_alignbit_b32 v29, v55, v26, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v24 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; 
GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: .LBB49_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v45, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v24 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v57, v3 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v42 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v44, v5 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v41 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v43, v7 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v58 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v60 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 
3, v61 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v59 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v47 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v46 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v30 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v4, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v9, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v13, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v17 +; GCN-NEXT: 
v_lshlrev_b32_e32 v17, 16, v19 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v19, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v23, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v25, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v26, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v26, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v27, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v27, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v29, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v31, v33 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s7, v3 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v32, v2 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v32, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s7, v4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s7, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v32, v11 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v32, v15 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s7, v16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v32, v17 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s7, v23 +; GCN-NEXT: buffer_load_dword v32, 
off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s7, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s7, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s7, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v32, v30 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GCN-NEXT: v_or_b32_e32 v1, v31, v1 +; GCN-NEXT: v_or_b32_e32 v3, v8, v3 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v11, v9 +; GCN-NEXT: v_or_b32_e32 v6, v15, v13 +; GCN-NEXT: v_or_b32_e32 v7, v17, v16 +; GCN-NEXT: v_or_b32_e32 v8, v21, v19 +; GCN-NEXT: v_or_b32_e32 v9, v24, v23 +; GCN-NEXT: v_or_b32_e32 v11, v18, v25 +; GCN-NEXT: v_or_b32_e32 v13, v22, v20 +; GCN-NEXT: v_or_b32_e32 v10, v10, v26 +; GCN-NEXT: v_or_b32_e32 v12, v14, v12 +; GCN-NEXT: v_or_b32_e32 v14, v28, v27 +; GCN-NEXT: v_or_b32_e32 v15, v30, v29 +; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v0 +; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v49, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v39, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v36, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v37, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v34, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v35, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v32, vcc, s6, v14 +; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v15 +; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 16 +; GCN-NEXT: v_alignbit_b32 v5, v35, v34, 16 +; GCN-NEXT: v_alignbit_b32 v9, v37, v36, 16 +; GCN-NEXT: v_alignbit_b32 v13, v39, v38, 16 +; 
GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16
+; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16
+; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16
+; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v35
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v37
+; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
+; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
+; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
+; GCN-NEXT: .LBB49_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v32
+; GCN-NEXT: v_mov_b32_e32 v2, v33
+; GCN-NEXT: v_mov_b32_e32 v4, v34
+; GCN-NEXT: v_mov_b32_e32 v6, v35
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mov_b32_e32 v8, v36
+; GCN-NEXT: v_mov_b32_e32 v10, v37
+; GCN-NEXT: v_mov_b32_e32 v12, v38
+; GCN-NEXT: v_mov_b32_e32 v14, v39
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mov_b32_e32 v16, v48
+; GCN-NEXT: v_mov_b32_e32 v18, v49
+; GCN-NEXT: v_mov_b32_e32 v20, v50
+; GCN-NEXT: v_mov_b32_e32 v22, v51
+; GCN-NEXT: v_mov_b32_e32 v24, v52
+; GCN-NEXT: v_mov_b32_e32 v26, v53
+; GCN-NEXT: v_mov_b32_e32 v28, v54
+; GCN-NEXT: v_mov_b32_e32 v30, v55
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v64i8_to_v32i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32
offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], 
s32 offset:116 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v16 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v18 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v22 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v31 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v39, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v10, v20, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v28, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v22, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v30, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v31, v33 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v38, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v1, v1, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v7, v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v8, v8, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: 
$vgpr40 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v49, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v55, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v36, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v44, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v61, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; 
kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: .LBB49_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB49_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v0, 3, v38 +; VI-NEXT: v_add_u16_e32 v2, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v0, 3, v37 +; VI-NEXT: v_or_b32_sdwa v24, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v12, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v43 +; VI-NEXT: v_or_b32_sdwa v16, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v0, 3, v36 +; VI-NEXT: v_or_b32_sdwa v22, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v0, 3, 
v28 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v10, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v28, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v48 +; VI-NEXT: v_or_b32_sdwa v20, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v49 +; VI-NEXT: v_or_b32_sdwa v30, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v61 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v31 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v24 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_or_b32_e32 v14, v14, v26 +; VI-NEXT: v_or_b32_e32 v15, v15, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v29, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v25, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v3 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v21, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v17, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v4, v17, v4 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v21 +; VI-NEXT: v_or_b32_e32 v5, v17, v5 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v6, v17, v6 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v29 +; VI-NEXT: v_or_b32_e32 v7, v17, v7 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v30 +; VI-NEXT: v_or_b32_e32 v8, v17, v8 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 +; VI-NEXT: v_or_b32_e32 v9, v17, v9 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v28 +; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v11, v17, v11 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v23, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v27, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v31, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v31 +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v27 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 +; VI-NEXT: v_or_b32_e32 v3, v3, v19 +; VI-NEXT: .LBB49_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i8_to_v32i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13 
+; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v20 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v24 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_or_b32_sdwa v9, 
v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v63, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v35, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v16, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v38, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 +; GFX9-NEXT: v_or_b32_sdwa v8, v51, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 +; GFX9-NEXT: v_or_b32_sdwa v9, v52, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: v_or_b32_sdwa v10, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 +; GFX9-NEXT: v_or_b32_sdwa v11, v37, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 +; GFX9-NEXT: v_or_b32_sdwa v12, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 +; GFX9-NEXT: v_or_b32_sdwa v13, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 +; GFX9-NEXT: v_or_b32_sdwa v15, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: 
$vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: .LBB49_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB49_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v1 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v24, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v16, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v18, v15, s6 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v31, v48, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v6, v23, v6, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 +; GFX9-NEXT: .LBB49_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, 
s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i8_to_v32i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-NEXT: v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10 +; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:40 +; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v5 +; 
GFX11-NEXT: v_lshlrev_b16 v39, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v83, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v81, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v0 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: s_waitcnt vmcnt(21) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v14 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_lshlrev_b16 v98, 8, v96 +; GFX11-NEXT: s_waitcnt vmcnt(19) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v100 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_lshlrev_b16 v96, 8, v101 +; GFX11-NEXT: s_waitcnt vmcnt(17) +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v102 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b16 v112, 8, v103 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v113 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v114 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v115 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v116 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v117 +; GFX11-NEXT: v_lshlrev_b16 v117, 8, v10 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v6 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v31 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v35 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v50 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v51 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v39 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v54 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v48 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v55 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v49 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v53 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v52 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v83 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v81 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v6, v5, 0x5040100 +; 
GFX11-NEXT: v_perm_b32 v4, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v10, v9, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v65 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v68 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v87 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v84 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v21 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v19 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v23 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v98 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v25 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v96 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v29 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v112 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v103 +; GFX11-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v11, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v13, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v15, v14, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v82 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v97 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v99 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v66 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v101 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v100 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v114 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v102 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v117 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v113 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v116 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v115 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v118 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v119 +; GFX11-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v14, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v16, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v18, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v20, v19, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: 
$vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB49_2 +; GFX11-NEXT: .LBB49_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v66, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v64, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_add_nc_u16 v6, v97, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v118, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v119, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v99, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v64, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v116, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v115, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v113, v1 +; GFX11-NEXT: v_add_nc_u16 v66, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v114, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v117, v4 +; GFX11-NEXT: v_add_nc_u16 v2, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v82, 3 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v84, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v102, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v101, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v100, v1 +; GFX11-NEXT: v_add_nc_u16 v69, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v112, v3 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v103, v4 +; GFX11-NEXT: v_add_nc_u16 v70, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; 
GFX11-NEXT: v_add_nc_u16 v1, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v68, 3 +; GFX11-NEXT: v_add_nc_u16 v68, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v0, v71, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v28, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v96, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v29, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v25, v3 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v27, v4 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v30, 3 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v26, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v20, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v83, v3 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v81, v4 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v31, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v32, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v53, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v55, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v52, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v17, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v21, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v23, v35, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v1 +; GFX11-NEXT: v_or_b32_e32 v17, v50, v17 +; GFX11-NEXT: v_or_b32_e32 v21, v51, v21 +; GFX11-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-NEXT: v_or_b32_e32 v23, v48, v23 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-NEXT: v_perm_b32 v0, v21, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v23, v4, 
0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v26, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v16, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v20, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v25, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v68, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v70, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v69, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v66, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x i16> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x i16> + br label %end + +end: + %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x i16> %phi +} + +define <32 x bfloat> @bitcast_v32f16_to_v32bf16(<32 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v32f16_to_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v40, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v41, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v42, 
v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v43, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v44, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v45, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v46, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v47, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v56, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v57, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v58, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v59, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v60, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v61, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v62, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v63 +; GCN-NEXT: v_cvt_f16_f32_e32 v63, v31 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB50_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v55 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v42 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v43 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v44 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v61 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v63 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 
+; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: .LBB50_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB50_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v63 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v62 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v61 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v59 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v58 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v57 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v56 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v46 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v45 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v53 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; GCN-NEXT: v_add_f32_e32 v24, 
0x38000000, v24 +; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31 +; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30 +; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 +; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 +; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 +; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 +; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 +; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 +; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 +; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 +; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 +; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 +; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 +; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 +; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 +; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 +; GCN-NEXT: v_cvt_f16_f32_e32 v32, v15 +; GCN-NEXT: v_cvt_f16_f32_e32 v33, v14 +; GCN-NEXT: v_cvt_f16_f32_e32 v34, v13 +; GCN-NEXT: v_cvt_f16_f32_e32 v35, v12 +; GCN-NEXT: v_cvt_f16_f32_e32 v36, v11 +; GCN-NEXT: v_cvt_f16_f32_e32 v37, v10 +; GCN-NEXT: v_cvt_f16_f32_e32 v38, v9 +; GCN-NEXT: v_cvt_f16_f32_e32 v39, v8 +; GCN-NEXT: v_cvt_f16_f32_e32 v48, v7 +; GCN-NEXT: v_cvt_f16_f32_e32 v49, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v50, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v51, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v52, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v53, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v54, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v55, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v51 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v55 +; GCN-NEXT: .LBB50_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword 
v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32f16_to_v32bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v17, 0x200 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v15 +; VI-NEXT: v_add_f16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v15, v19, v15 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v14, v19, v14 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v13 +; VI-NEXT: v_add_f16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v13, v19, v13 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v12 +; VI-NEXT: v_add_f16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v12, v19, v12 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v11 +; VI-NEXT: v_add_f16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v11, v19, v11 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v10 +; VI-NEXT: v_add_f16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v10, v19, v10 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v9 +; VI-NEXT: v_add_f16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v9, v19, v9 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v8 +; VI-NEXT: v_add_f16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v8, v19, v8 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v7 +; VI-NEXT: v_add_f16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v7, v19, v7 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v6 +; VI-NEXT: v_add_f16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v6, v19, v6 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v5 +; VI-NEXT: v_add_f16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v5, v19, v5 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v4 +; VI-NEXT: v_add_f16_sdwa v4, v4, v17 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v16, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v18, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v19, v4 +; VI-NEXT: v_add_f16_e32 v19, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v17, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v17 +; VI-NEXT: v_or_b32_e32 v2, v19, v2 +; VI-NEXT: v_or_b32_e32 v1, v18, v1 +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32f16_to_v32bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: .LBB50_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f16_to_v32bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 
op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: .LBB50_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <32 x half> %a1 to <32 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x half> %a to <32 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x bfloat> %phi
+}
+
+define <32 x half> @bitcast_v32bf16_to_v32f16(<32 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32bf16_to_v32f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v17
+; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v24
+; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v25
+; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v26
+; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v27
+; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v28
+; GCN-NEXT: v_mul_f32_e32 v61,
1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v63 +; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v31 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB51_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v34 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v36 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v38 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v48 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v49 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v41 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v47 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v57 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v58 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v59 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v60 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v61 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v62 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, 
v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: .LBB51_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB51_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v63 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v62 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v60 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v59 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v58 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v56 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v47 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v46 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v45 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v43 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v42 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v41 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v54 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v53 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v52 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v48 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v39 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, 
v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GCN-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v19 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v22 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v25 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v26 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v10, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v14, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v16, v55 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v18, v53 
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v20, v51 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v22, v49 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v26, v37 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v30, v33 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v32 +; GCN-NEXT: .LBB51_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v32bf16_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; VI-NEXT: v_bfe_u32 v17, v16, 16, 1 +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v16 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x7fff, v17 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; VI-NEXT: v_bfe_u32 v17, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v17, vcc, v17, v0 +; VI-NEXT: v_add_u32_e32 v17, vcc, s6, v17 +; VI-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; VI-NEXT: v_bfe_u32 v18, v17, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17 +; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v17, v18, 
v19, vcc +; VI-NEXT: v_bfe_u32 v18, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1 +; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18 +; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; VI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; VI-NEXT: v_bfe_u32 v19, v18, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v18 +; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; VI-NEXT: v_bfe_u32 v19, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v19, vcc, v19, v2 +; VI-NEXT: v_add_u32_e32 v19, vcc, s6, v19 +; VI-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; VI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; VI-NEXT: v_bfe_u32 v20, v19, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v19 +; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20 +; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; VI-NEXT: v_bfe_u32 v20, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v20, vcc, v20, v3 +; VI-NEXT: v_add_u32_e32 v20, vcc, s6, v20 +; VI-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; VI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; VI-NEXT: v_bfe_u32 v21, v20, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v20 +; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21 +; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; VI-NEXT: v_bfe_u32 v21, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v21, vcc, v21, v4 +; VI-NEXT: v_add_u32_e32 v21, vcc, s6, v21 +; VI-NEXT: v_or_b32_e32 v22, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc +; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; VI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; VI-NEXT: v_bfe_u32 v22, v21, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v21 +; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22 +; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; VI-NEXT: v_bfe_u32 v22, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v22, vcc, v22, v5 +; VI-NEXT: v_add_u32_e32 v22, vcc, s6, v22 +; VI-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc +; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; VI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; VI-NEXT: v_bfe_u32 v23, v22, 16, 1 +; VI-NEXT: v_add_u32_e32 v23, vcc, v23, v22 +; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23 +; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; VI-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; VI-NEXT: v_bfe_u32 v23, v6, 16, 1 +; VI-NEXT: 
v_add_u32_e32 v23, vcc, v23, v6 +; VI-NEXT: v_add_u32_e32 v23, vcc, s6, v23 +; VI-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; VI-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; VI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; VI-NEXT: v_bfe_u32 v24, v23, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v23 +; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24 +; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; VI-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; VI-NEXT: v_bfe_u32 v24, v7, 16, 1 +; VI-NEXT: v_add_u32_e32 v24, vcc, v24, v7 +; VI-NEXT: v_add_u32_e32 v24, vcc, s6, v24 +; VI-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; VI-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc +; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; VI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; VI-NEXT: v_bfe_u32 v25, v24, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v24 +; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25 +; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; VI-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; VI-NEXT: v_bfe_u32 v25, v8, 16, 1 +; VI-NEXT: v_add_u32_e32 v25, vcc, v25, v8 +; VI-NEXT: v_add_u32_e32 v25, vcc, s6, v25 +; VI-NEXT: v_or_b32_e32 v26, 0x400000, v8 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; VI-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc +; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; VI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; VI-NEXT: v_bfe_u32 v26, v25, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v25 +; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26 +; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; VI-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; VI-NEXT: v_bfe_u32 v26, v9, 16, 1 +; VI-NEXT: v_add_u32_e32 v26, vcc, v26, v9 +; VI-NEXT: v_add_u32_e32 v26, vcc, s6, v26 +; VI-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; VI-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc +; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; VI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; VI-NEXT: v_bfe_u32 v27, v26, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v26 +; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27 +; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; VI-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; VI-NEXT: v_bfe_u32 v27, v10, 16, 1 +; VI-NEXT: v_add_u32_e32 v27, vcc, v27, v10 +; VI-NEXT: v_add_u32_e32 v27, vcc, s6, v27 +; VI-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; VI-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v11 +; VI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; VI-NEXT: v_bfe_u32 v28, v27, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v27 +; VI-NEXT: v_add_u32_e32 v28, vcc, s6, v28 +; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; VI-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; VI-NEXT: v_bfe_u32 v28, v11, 16, 1 +; VI-NEXT: v_add_u32_e32 v28, vcc, v28, v11 +; VI-NEXT: 
v_add_u32_e32 v28, vcc, s6, v28 +; VI-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; VI-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc +; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v12 +; VI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; VI-NEXT: v_bfe_u32 v29, v28, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v28 +; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29 +; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; VI-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; VI-NEXT: v_bfe_u32 v29, v12, 16, 1 +; VI-NEXT: v_add_u32_e32 v29, vcc, v29, v12 +; VI-NEXT: v_add_u32_e32 v29, vcc, s6, v29 +; VI-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; VI-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc +; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; VI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; VI-NEXT: v_bfe_u32 v30, v29, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v29 +; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30 +; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; VI-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc +; VI-NEXT: v_bfe_u32 v30, v13, 16, 1 +; VI-NEXT: v_add_u32_e32 v30, vcc, v30, v13 +; VI-NEXT: v_add_u32_e32 v30, vcc, s6, v30 +; VI-NEXT: v_or_b32_e32 v31, 0x400000, v13 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; VI-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc +; VI-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; VI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; VI-NEXT: v_bfe_u32 v31, v30, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; VI-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; VI-NEXT: v_bfe_u32 v31, v14, 16, 1 +; VI-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; VI-NEXT: v_add_u32_e32 v31, vcc, s6, v31 +; VI-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; VI-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; VI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; VI-NEXT: v_bfe_u32 v32, v31, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v31 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; VI-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; VI-NEXT: v_bfe_u32 v32, v15, 16, 1 +; VI-NEXT: v_add_u32_e32 v32, vcc, v32, v15 +; VI-NEXT: v_add_u32_e32 v32, vcc, s6, v32 +; VI-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; VI-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_alignbit_b32 v15, v15, v31, 16 +; VI-NEXT: v_alignbit_b32 v14, v14, v30, 16 +; VI-NEXT: v_alignbit_b32 v13, v13, v29, 16 +; VI-NEXT: v_alignbit_b32 v12, v12, v28, 16 +; VI-NEXT: v_alignbit_b32 v11, v11, v27, 16 +; VI-NEXT: v_alignbit_b32 v10, v10, v26, 16 +; VI-NEXT: v_alignbit_b32 v9, v9, v25, 16 +; VI-NEXT: v_alignbit_b32 v8, v8, v24, 16 +; VI-NEXT: v_alignbit_b32 v7, v7, v23, 16 +; VI-NEXT: v_alignbit_b32 v6, v6, v22, 16 +; VI-NEXT: v_alignbit_b32 v5, v5, v21, 16 +; VI-NEXT: v_alignbit_b32 v4, v4, v20, 16 +; VI-NEXT: v_alignbit_b32 v3, v3, v19, 16 +; VI-NEXT: v_alignbit_b32 v2, v2, v18, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v16, 16 +; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v32bf16_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v17, v17, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v17, v17, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v18, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_add3_u32 v19, v19, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc +; GFX9-NEXT: v_bfe_u32 v19, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v19, v19, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v20, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_add3_u32 v20, v20, v19, s6 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX9-NEXT: 
v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc +; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1 +; GFX9-NEXT: v_add3_u32 v20, v20, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v4 +; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_add3_u32 v21, v21, v20, s6 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc +; GFX9-NEXT: v_bfe_u32 v21, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v21, v21, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_add3_u32 v22, v22, v21, s6 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc +; GFX9-NEXT: v_bfe_u32 v22, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v22, v22, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v6 +; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc +; GFX9-NEXT: v_bfe_u32 v23, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v23, v23, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v7 +; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc +; GFX9-NEXT: v_bfe_u32 v24, v7, 16, 1 +; GFX9-NEXT: v_add3_u32 v24, v24, v7, s6 +; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc +; GFX9-NEXT: v_bfe_u32 v25, v8, 16, 1 +; GFX9-NEXT: v_add3_u32 v25, v25, v8, s6 +; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v8 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX9-NEXT: 
v_bfe_u32 v26, v25, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v25 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v27, vcc +; GFX9-NEXT: v_bfe_u32 v26, v9, 16, 1 +; GFX9-NEXT: v_add3_u32 v26, v26, v9, s6 +; GFX9-NEXT: v_or_b32_e32 v27, 0x400000, v9 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v10 +; GFX9-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX9-NEXT: v_bfe_u32 v27, v26, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX9-NEXT: v_add3_u32 v27, v27, v26, s6 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v26 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v27, v28, vcc +; GFX9-NEXT: v_bfe_u32 v27, v10, 16, 1 +; GFX9-NEXT: v_add3_u32 v27, v27, v10, s6 +; GFX9-NEXT: v_or_b32_e32 v28, 0x400000, v10 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v11 +; GFX9-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX9-NEXT: v_bfe_u32 v28, v27, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX9-NEXT: v_add3_u32 v28, v28, v27, s6 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v27 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v28, v29, vcc +; GFX9-NEXT: v_bfe_u32 v28, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v28, v28, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v29, 0x400000, v11 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v29, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v12 +; GFX9-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX9-NEXT: v_bfe_u32 v29, v28, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX9-NEXT: v_add3_u32 v29, v29, v28, s6 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v28 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v29, v30, vcc +; GFX9-NEXT: v_bfe_u32 v29, v12, 16, 1 +; GFX9-NEXT: v_add3_u32 v29, v29, v12, s6 +; GFX9-NEXT: v_or_b32_e32 v30, 0x400000, v12 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v30, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; GFX9-NEXT: v_bfe_u32 v30, v29, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX9-NEXT: v_add3_u32 v30, v30, v29, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v29 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v30, v31, vcc +; GFX9-NEXT: v_bfe_u32 v30, v13, 16, 1 +; GFX9-NEXT: v_add3_u32 v30, v30, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v13 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v31, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; GFX9-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX9-NEXT: v_add3_u32 v31, v31, v30, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v30 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: 
v_add3_u32 v31, v31, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX9-NEXT: v_bfe_u32 v32, v15, 16, 1 +; GFX9-NEXT: v_add3_u32 v32, v32, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v32, v33, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v15, v15, v31, s6 +; GFX9-NEXT: v_perm_b32 v14, v14, v30, s6 +; GFX9-NEXT: v_perm_b32 v13, v13, v29, s6 +; GFX9-NEXT: v_perm_b32 v12, v12, v28, s6 +; GFX9-NEXT: v_perm_b32 v11, v11, v27, s6 +; GFX9-NEXT: v_perm_b32 v10, v10, v26, s6 +; GFX9-NEXT: v_perm_b32 v9, v9, v25, s6 +; GFX9-NEXT: v_perm_b32 v8, v8, v24, s6 +; GFX9-NEXT: v_perm_b32 v7, v7, v23, s6 +; GFX9-NEXT: v_perm_b32 v6, v6, v22, s6 +; GFX9-NEXT: v_perm_b32 v5, v5, v21, s6 +; GFX9-NEXT: v_perm_b32 v4, v4, v20, s6 +; GFX9-NEXT: v_perm_b32 v3, v3, v19, s6 +; GFX9-NEXT: v_perm_b32 v2, v2, v18, s6 +; GFX9-NEXT: v_perm_b32 v1, v1, v17, s6 +; GFX9-NEXT: v_perm_b32 v0, v0, v16, s6 +; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32bf16_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v11 +; GFX11-NEXT: v_dual_add_f32 v24, 0x40c00000, v24 :: v_dual_lshlrev_b32 v25, 16, v6 +; GFX11-NEXT: v_bfe_u32 v21, v17, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v19, v16, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v16 +; GFX11-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_add3_u32 v21, v21, v17, 0x7fff +; GFX11-NEXT: v_add3_u32 v19, v19, v16, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11-NEXT: v_dual_add_f32 v26, 0x40c00000, v26 :: v_dual_lshlrev_b32 v27, 16, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v16, v19, v22, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v22, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_dual_add_f32 v28, 0x40c00000, v28 :: v_dual_add_f32 v29, 0x40c00000, v29 +; GFX11-NEXT: v_add3_u32 v20, v20, v0, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v31, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v20, v23 :: v_dual_lshlrev_b32 v23, 16, v4 +; GFX11-NEXT: v_bfe_u32 v20, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: v_dual_add_f32 v10, 0x40c00000, v10 :: v_dual_add_f32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_perm_b32 v0, v0, v16, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v17, v21, v19, vcc_lo +; GFX11-NEXT: v_add3_u32 v19, v20, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v31, 0x40c00000, v31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v1, v19, v20 :: v_dual_add_f32 v18, 0x40c00000, v18 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-NEXT: v_bfe_u32 v21, v18, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v18 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX11-NEXT: v_add3_u32 v19, v21, v18, 0x7fff +; GFX11-NEXT: v_bfe_u32 v21, v2, 16, 1 +; GFX11-NEXT: v_bfe_u32 v34, v12, 16, 1 +; GFX11-NEXT: v_perm_b32 v1, v1, v17, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v18, v19, v20 :: v_dual_and_b32 v7, 0xffff0000, v7 +; GFX11-NEXT: v_add3_u32 v19, v21, v2, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v21, v22, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v19, v20 :: v_dual_and_b32 v11, 0xffff0000, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v19, v21, v22, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v22 +; GFX11-NEXT: v_bfe_u32 v21, v3, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-NEXT: v_bfe_u32 v22, v23, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-NEXT: v_perm_b32 v2, v2, v18, 0x7060302 +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v19, v19, v20, vcc_lo +; GFX11-NEXT: v_add3_u32 v20, v21, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v32, 
0x400000, v11
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc_lo
+; GFX11-NEXT: v_add3_u32 v20, v22, v23, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v23
+; GFX11-NEXT: v_bfe_u32 v22, v4, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
+; GFX11-NEXT: v_bfe_u32 v23, v24, 16, 1
+; GFX11-NEXT: v_perm_b32 v3, v3, v19, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v20, v20, v21, vcc_lo
+; GFX11-NEXT: v_add3_u32 v21, v22, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v21, v22, vcc_lo
+; GFX11-NEXT: v_add3_u32 v21, v23, v24, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v24
+; GFX11-NEXT: v_bfe_u32 v23, v5, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
+; GFX11-NEXT: v_bfe_u32 v24, v25, 16, 1
+; GFX11-NEXT: v_perm_b32 v4, v4, v20, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v21, v21, v22, vcc_lo
+; GFX11-NEXT: v_add3_u32 v22, v23, v5, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v22, v23, vcc_lo
+; GFX11-NEXT: v_add3_u32 v22, v24, v25, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v25
+; GFX11-NEXT: v_bfe_u32 v24, v6, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
+; GFX11-NEXT: v_bfe_u32 v25, v26, 16, 1
+; GFX11-NEXT: v_perm_b32 v5, v5, v21, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v23, vcc_lo
+; GFX11-NEXT: v_add3_u32 v23, v24, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v23, v24, vcc_lo
+; GFX11-NEXT: v_add3_u32 v23, v25, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v26
+; GFX11-NEXT: v_bfe_u32 v25, v7, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
+; GFX11-NEXT: v_bfe_u32 v26, v27, 16, 1
+; GFX11-NEXT: v_perm_b32 v6, v6, v22, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v23, v23, v24, vcc_lo
+; GFX11-NEXT: v_add3_u32 v24, v25, v7, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v7
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc_lo
+; GFX11-NEXT: v_add3_u32 v24, v26, v27, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v27
+; GFX11-NEXT: v_bfe_u32 v26, v8, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
+; GFX11-NEXT: v_bfe_u32 v27, v28, 16, 1
+; GFX11-NEXT: v_perm_b32 v7, v7, v23, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v24, v24, v25, vcc_lo
+; GFX11-NEXT: v_add3_u32 v25, v26, v8, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v8
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v25, v26, vcc_lo
+; GFX11-NEXT: v_add3_u32 v25, v27, v28, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v26, 0x400000, v28
+; GFX11-NEXT: v_bfe_u32 v27, v9, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT: v_bfe_u32 v28, v29, 16, 1
+; GFX11-NEXT: v_perm_b32 v8, v8, v24, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v25, v25, v26, vcc_lo
+; GFX11-NEXT: v_add3_u32 v26, v27, v9, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v9
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v9, v26, v27, vcc_lo
+; GFX11-NEXT: v_add3_u32 v26, v28, v29, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v27, 0x400000, v29
+; GFX11-NEXT: v_bfe_u32 v28, v10, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_bfe_u32 v29, v30, 16, 1
+; GFX11-NEXT: v_perm_b32 v9, v9, v25, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v26, v26, v27, vcc_lo
+; GFX11-NEXT: v_add3_u32 v27, v28, v10, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v10
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v10, v27, v28, vcc_lo
+; GFX11-NEXT: v_add3_u32 v27, v29, v30, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v28, 0x400000, v30
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_bfe_u32 v30, v31, 16, 1
+; GFX11-NEXT: v_bfe_u32 v29, v11, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v27, v27, v28 :: v_dual_lshlrev_b32 v28, 16, v13
+; GFX11-NEXT: v_add3_u32 v30, v30, v31, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_add3_u32 v31, v34, v12, 0x7fff
+; GFX11-NEXT: v_add_f32_e32 v28, 0x40c00000, v28
+; GFX11-NEXT: v_add3_u32 v29, v29, v11, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v30, v30, v33, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v12
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_bfe_u32 v35, v28, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v28
+; GFX11-NEXT: v_cndmask_b32_e32 v12, v31, v33, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v34, v35, v28, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v14
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_bfe_u32 v37, v13, 16, 1
+; GFX11-NEXT: v_perm_b32 v10, v10, v26, 0x7060302
+; GFX11-NEXT: v_dual_add_f32 v31, 0x40c00000, v35 :: v_dual_cndmask_b32 v28, v34, v36
+; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v15
+; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX11-NEXT: v_add3_u32 v33, v37, v13, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_add_f32_e32 v34, 0x40c00000, v34
+; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v31
+; GFX11-NEXT: v_bfe_u32 v38, v14, 16, 1
+; GFX11-NEXT: v_add3_u32 v35, v35, v31, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31
+; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX11-NEXT: v_bfe_u32 v39, v34, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v34
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v13
+; GFX11-NEXT: v_cndmask_b32_e32 v31, v35, v37, vcc_lo
+; GFX11-NEXT: v_add3_u32 v37, v38, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v14
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_bfe_u32 v35, v15, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v34, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v49, 0x400000, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v14, v37, v38, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
+; GFX11-NEXT: v_add3_u32 v35, v35, v15, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v14, v14, v31, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v34, v39, v48, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15
+; GFX11-NEXT: v_cndmask_b32_e32 v15, v35, v49, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_perm_b32 v15, v15, v34, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v13, v33, v36, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
+; GFX11-NEXT: v_perm_b32 v12, v12, v30, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v28, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v11, v29, v32, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v11, v11, v27, 0x7060302
+; GFX11-NEXT: .LBB51_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x bfloat> %a, splat (bfloat 0xR40C0)
+ %a2 = bitcast <32 x bfloat> %a1 to <32 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x bfloat> %a to <32 x half>
+ br label %end
+
+end:
+ %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x half> %phi
+}
+
+define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v32f16_to_v64i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32
+; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_cvt_f16_f32_e32 v38, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v34, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v49, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v35, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v50, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v36, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v51, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v37, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v52, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v54, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v29
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v33
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v39
+; GCN-NEXT: ; implicit-def: $vgpr57
+; GCN-NEXT: ; implicit-def: $vgpr61
+; GCN-NEXT: ; implicit-def: $vgpr43
+; GCN-NEXT: ; implicit-def: $vgpr42
+; GCN-NEXT: ; implicit-def: $vgpr56
+; GCN-NEXT: ; implicit-def: $vgpr11
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr47
+; GCN-NEXT: ; implicit-def: $vgpr60
+; GCN-NEXT: ; implicit-def: $vgpr40
+; GCN-NEXT: ; implicit-def: $vgpr55
+; GCN-NEXT: ; implicit-def: $vgpr45
+; GCN-NEXT: ; implicit-def: $vgpr63
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr41
+; GCN-NEXT: ; implicit-def: $vgpr59
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr53
+; GCN-NEXT: ; implicit-def: $vgpr62
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr48
+; GCN-NEXT: ; implicit-def: $vgpr46
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr39
+; GCN-NEXT: ; implicit-def: $vgpr58
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr33
+; GCN-NEXT: ; implicit-def: $vgpr26
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr30
+; GCN-NEXT: ; implicit-def: $vgpr44
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr25
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr23
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr20
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr14
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr12
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; kill: killed $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; kill: killed $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: ; kill: killed $vgpr29
+; GCN-NEXT: ; implicit-def: $vgpr29
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB52_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v38
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v31
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v50
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v52
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v1
+; GCN-NEXT: v_bfe_u32 v33, v31, 8, 8
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v33, v8, 8, 8
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v33, v6, 8, 8
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v33, v5, 8, 8
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v33, v4, 8, 8
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v33, v3, 8, 8
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v33, v2, 8, 8
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: v_or_b32_e32 v57, v34, v10
+; GCN-NEXT: v_or_b32_e32 v56, v32, v11
+; GCN-NEXT: v_or_b32_e32 v47, v35, v12
+; GCN-NEXT: v_or_b32_e32 v45, v7, v14
+; GCN-NEXT: v_or_b32_e32 v41, v36, v20
+; GCN-NEXT: v_or_b32_e32 v53, v9, v23
+; GCN-NEXT: v_or_b32_e32 v48, v37, v25
+; GCN-NEXT: v_or_b32_e32 v39, v13, v26
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v33, v16, v29
+; GCN-NEXT: v_or_b32_e32 v30, v18, v30
+; GCN-NEXT: v_or_b32_e32 v25, v15, v38
+; GCN-NEXT: v_or_b32_e32 v23, v22, v49
+; GCN-NEXT: v_or_b32_e32 v20, v17, v24
+; GCN-NEXT: v_or_b32_e32 v14, v27, v50
+; GCN-NEXT: v_or_b32_e32 v12, v21, v28
+; GCN-NEXT: v_or_b32_e32 v10, v19, v51
+; GCN-NEXT: v_alignbit_b32 v42, v56, v57, 24
+; GCN-NEXT: v_alignbit_b32 v43, v56, v57, 16
+; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8
+; GCN-NEXT: v_alignbit_b32 v55, v45, v47, 24
+; GCN-NEXT: v_alignbit_b32 v40, v45, v47, 16
+; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8
+; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v46, v39, v48, 8
+; GCN-NEXT: v_alignbit_b32 v29, v30, v33, 24
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v30, v33, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v26, v30, v33, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 8, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v30
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v23
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v14
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v10
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v7, v1, 8, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: ; implicit-def: $vgpr38
+; GCN-NEXT: ; implicit-def: $vgpr34
+; GCN-NEXT: ; implicit-def: $vgpr32
+; GCN-NEXT: ; implicit-def: $vgpr49
+; GCN-NEXT: ; implicit-def: $vgpr35
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: ; implicit-def: $vgpr50
+; GCN-NEXT: ; implicit-def: $vgpr36
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr51
+; GCN-NEXT: ; implicit-def: $vgpr37
+; GCN-NEXT: ; implicit-def: $vgpr13
+; GCN-NEXT: ; implicit-def: $vgpr52
+; GCN-NEXT: ; implicit-def: $vgpr16
+; GCN-NEXT: ; implicit-def: $vgpr18
+; GCN-NEXT: ; implicit-def: $vgpr54
+; GCN-NEXT: ; implicit-def: $vgpr15
+; GCN-NEXT: ; implicit-def: $vgpr22
+; GCN-NEXT: ; implicit-def: $vgpr24
+; GCN-NEXT: ; implicit-def: $vgpr17
+; GCN-NEXT: ; implicit-def: $vgpr27
+; GCN-NEXT: ; implicit-def: $vgpr28
+; GCN-NEXT: ; implicit-def: $vgpr21
+; GCN-NEXT: ; implicit-def: $vgpr19
+; GCN-NEXT: .LBB52_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB52_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v10, v28
+; GCN-NEXT: v_cvt_f32_f16_e32 v11, v21
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v12, v19
+; GCN-NEXT: v_cvt_f32_f16_e32 v14, v24
+; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_cvt_f32_f16_e32 v19, v27
+; GCN-NEXT: v_cvt_f32_f16_e32 v20, v54
+; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v21, v22
+; GCN-NEXT: v_cvt_f32_f16_e32 v22, v52
+; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
+; GCN-NEXT: v_cvt_f32_f16_e32 v23, v51
+; GCN-NEXT: v_cvt_f32_f16_e32 v24, v37
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13
+; GCN-NEXT: v_cvt_f32_f16_e32 v25, v50
+; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36
+; GCN-NEXT: v_cvt_f32_f16_e32 v6, v6
+; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9
+; GCN-NEXT: v_cvt_f32_f16_e32 v27, v49
+; GCN-NEXT: v_cvt_f32_f16_e32 v28, v35
+; GCN-NEXT: v_cvt_f32_f16_e32 v8, v8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7
+; GCN-NEXT: v_cvt_f32_f16_e32 v29, v38
+; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34
+; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
+; GCN-NEXT: v_cvt_f32_f16_e32 v32, v32
+; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
+; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
+; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
+; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
+; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
+; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
+; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
+; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
+; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
+; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
+; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
+; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
+; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
+; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
+; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
+; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
+; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
+; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
+; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
+; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
+; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
+; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
+; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
+; GCN-NEXT: v_add_f32_e32 v30, 0x38000000, v30
+; GCN-NEXT: v_add_f32_e32 v31, 0x38000000, v31
+; GCN-NEXT: v_add_f32_e32 v32, 0x38000000, v32
+; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
+; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v33, v12
+; GCN-NEXT: v_cvt_f16_f32_e32 v12, v14
+; GCN-NEXT: v_cvt_f16_f32_e32 v14, v17
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v17, v19
+; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20
+; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
+; GCN-NEXT: v_cvt_f16_f32_e32 v20, v22
+; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
+; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23
+; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
+; GCN-NEXT: v_cvt_f16_f32_e32 v23, v25
+; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
+; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
+; GCN-NEXT: v_cvt_f16_f32_e32 v25, v27
+; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28
+; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
+; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_f16_f32_e32 v28, v29
+; GCN-NEXT: v_cvt_f16_f32_e32 v29, v30
+; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
+; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v31
+; GCN-NEXT: v_bfe_u32 v12, v31, 8, 8
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v12, v8, 8, 8
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v12, v6, 8, 8
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v12, v5, 8, 8
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v12, v4, 8, 8
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v12, v3, 8, 8
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v12, v2, 8, 8
+; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_or_b32_e32 v12, v11, v10
+; GCN-NEXT: v_or_b32_e32 v10, v33, v30
+; GCN-NEXT: v_or_b32_e32 v20, v14, v34
+; GCN-NEXT: v_or_b32_e32 v14, v17, v35
+; GCN-NEXT: v_or_b32_e32 v25, v15, v19
+; GCN-NEXT: v_or_b32_e32 v23, v21, v36
+; GCN-NEXT: v_or_b32_e32 v33, v16, v37
+; GCN-NEXT: v_or_b32_e32 v30, v18, v38
+; GCN-NEXT: v_or_b32_e32 v48, v24, v22
+; GCN-NEXT: v_or_b32_e32 v39, v13, v39
+; GCN-NEXT: v_or_b32_e32 v41, v26, v49
+; GCN-NEXT: v_or_b32_e32 v53, v9, v50
+; GCN-NEXT: v_or_b32_e32 v47, v27, v51
+; GCN-NEXT: v_or_b32_e32 v45, v7, v52
+; GCN-NEXT: v_or_b32_e32 v57, v29, v28
+; GCN-NEXT: v_or_b32_e32 v56, v32, v54
+; GCN-NEXT: v_alignbit_b32 v42, v56, v57, 24
+; GCN-NEXT: v_alignbit_b32 v43, v56, v57, 16
+; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8
+; GCN-NEXT: v_alignbit_b32 v55, v45, v47, 24
+; GCN-NEXT: v_alignbit_b32 v40, v45, v47, 16
+; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8
+; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v53, v41, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v39, v48, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v46, v39, v48, 8
+; GCN-NEXT: v_alignbit_b32 v29, v30, v33, 24
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v30, v33, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: v_alignbit_b32 v26, v30, v33, 8
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v23, v25, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v14, v20, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 24
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 16
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_alignbit_b32 v7, v10, v12, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GCN-NEXT: v_lshrrev_b32_e32 v11, 8, v56
+; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45
+; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53
+; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39
+; GCN-NEXT: v_lshrrev_b32_e32 v44, 8, v30
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v23
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v14
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v10
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_bfe_u32 v7, v1, 8, 8
+; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GCN-NEXT: .LBB52_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v61
+; GCN-NEXT: v_or_b32_e32 v13, v7, v9
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v11
+; GCN-NEXT: v_or_b32_e32 v15, v7, v9
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v60
+; GCN-NEXT: v_or_b32_e32 v16, v7, v9
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v45
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v63
+; GCN-NEXT: v_or_b32_e32 v17, v7, v9
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v41
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v59
+; GCN-NEXT: v_or_b32_e32 v19, v7, v9
+; GCN-NEXT: v_and_b32_e32 v37, 0xff, v43
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v42
+; GCN-NEXT: v_and_b32_e32 v32, 0xff, v31
+; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 24, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xff, v53
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v62
+; GCN-NEXT: v_or_b32_e32 v21, v7, v9
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v0
+; GCN-NEXT: v_and_b32_e32 v35, 0xff, v40
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 24, v55
+; GCN-NEXT: v_and_b32_e32 v9, 0xff, v48
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v46
+; GCN-NEXT: v_or_b32_e32 v22, v9, v11
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 8, v0
+; GCN-NEXT: v_and_b32_e32 v48, 0xff, v8
+; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 24, v8
+; GCN-NEXT: v_and_b32_e32 v8, 0xff, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v58
+; GCN-NEXT: v_or_b32_e32 v24, v8, v11
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 12, v0
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v39, 0xff, v11
+; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v11
+; GCN-NEXT: v_and_b32_e32 v11, 0xff, v33
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v26
+; GCN-NEXT: v_or_b32_e32 v26, v11, v18
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 16, v0
+; GCN-NEXT: v_and_b32_e32 v51, 0xff, v6
+; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v6
+; GCN-NEXT: v_and_b32_e32 v6, 0xff, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v44
+; GCN-NEXT: v_or_b32_e32 v28, v6, v18
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 20, v0
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v54, 0xff, v18
+; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 24, v18
+; GCN-NEXT: v_and_b32_e32 v18, 0xff, v25
+; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 8, v25
+; GCN-NEXT: v_or_b32_e32 v25, v18, v25
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 24, v0
+; GCN-NEXT: v_and_b32_e32 v42, 0xff, v5
+; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 24, v5
+; GCN-NEXT: v_and_b32_e32 v5, 0xff, v23
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v23
+; GCN-NEXT: v_or_b32_e32 v23, v5, v23
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0
+; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v44, 0xff, v50
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v29
+; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20
+; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 8, v29
+; GCN-NEXT: v_or_b32_e32 v29, v20, v29
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 32, v0
+; GCN-NEXT: v_and_b32_e32 v46, 0xff, v4
+; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v14
+; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14
+; GCN-NEXT: v_or_b32_e32 v14, v4, v14
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v53
+; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12
+; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v53
+; GCN-NEXT: v_or_b32_e32 v53, v12, v53
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 40, v0
+; GCN-NEXT: v_and_b32_e32 v56, 0xff, v3
+; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v3
+; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v10
+; GCN-NEXT: v_or_b32_e32 v41, v3, v10
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 44, v0
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v57, 0xff, v10
+; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
+; GCN-NEXT: v_or_b32_e32 v37, v38, v10
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 48, v0
+; GCN-NEXT: v_and_b32_e32 v58, 0xff, v2
+; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32
+; GCN-NEXT: v_or_b32_e32 v34, v34, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v59, 0xff, v32
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v45, 24, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v35
+; GCN-NEXT: v_or_b32_e32 v35, v36, v32
+; GCN-NEXT: v_add_i32_e32 v32, vcc, 56, v0
+; GCN-NEXT: v_and_b32_e32 v36, 0xff, v1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48
+; GCN-NEXT: v_or_b32_e32 v48, v49, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 60, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v39
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v44
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v36
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_or_b32_e32 v27, v27, v39
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
+; GCN-NEXT: v_or_b32_e32 v31, v31, v49
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_or_b32_e32 v30, v30, v51
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GCN-NEXT: v_or_b32_e32 v33, v33, v54
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: v_or_b32_e32 v39, v50, v42
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_or_b32_e32 v49, v52, v44
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_or_b32_e32 v50, v55, v46
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GCN-NEXT: v_or_b32_e32 v51, v40, v47
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_or_b32_e32 v52, v43, v56
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_or_b32_e32 v38, v38, v57
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53
+; GCN-NEXT: v_or_b32_e32 v54, v45, v58
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v41
+; GCN-NEXT: v_or_b32_e32 v36, v60, v36
+; GCN-NEXT: v_or_b32_e32 v13, v13, v37
+; GCN-NEXT: v_or_b32_e32 v15, v15, v34
+; GCN-NEXT: v_or_b32_e32 v16, v16, v35
+; GCN-NEXT: v_or_b32_e32 v17, v17, v48
+; GCN-NEXT: v_or_b32_e32 v19, v19, v27
+; GCN-NEXT: v_or_b32_e32 v21, v21, v31
+; GCN-NEXT: v_or_b32_e32 v22, v22, v30
+; GCN-NEXT: v_or_b32_e32 v24, v24, v33
+; GCN-NEXT: v_or_b32_e32 v26, v26, v39
+; GCN-NEXT: v_or_b32_e32 v27, v28, v49
+; GCN-NEXT: v_or_b32_e32 v25, v25, v50
+; GCN-NEXT: v_or_b32_e32 v23, v23, v51
+; GCN-NEXT: v_or_b32_e32 v28, v29, v52
+; GCN-NEXT: v_or_b32_e32 v14, v14, v38
+; GCN-NEXT: v_or_b32_e32 v29, v53, v54
+; GCN-NEXT: v_or_b32_e32 v30, v55, v36
+; GCN-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v19, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v21, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v18, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v20, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v27, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v25, v12, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v23, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v29, v32, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v30, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32f16_to_v64i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v1
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr22
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr28
+; VI-NEXT: ; implicit-def: $vgpr26
+; VI-NEXT: ; implicit-def: $vgpr23
+; VI-NEXT: ; implicit-def: $vgpr20
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB52_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v15
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v14
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v28, 24, v16
+; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v46, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v59, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v54, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v40, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v2
+; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v1
+; VI-NEXT: .LBB52_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB52_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v17, 0x200
+; VI-NEXT: v_add_f16_sdwa v51, v2, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51
+; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
+; VI-NEXT: v_add_f16_sdwa v53, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v29, v2, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v53
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v49, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v28, v1, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v49
+; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
+; VI-NEXT: v_add_f16_sdwa v52, v3, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v55, v4, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v52
+; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
+; VI-NEXT: v_add_f16_sdwa v38, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v54, v3, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v38
+; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
+; VI-NEXT: v_add_f16_sdwa v50, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v26, v6, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v50
+; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
+; VI-NEXT: v_add_f16_sdwa v36, v8, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v25, v5, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v36
+; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
+; VI-NEXT: v_add_f16_sdwa v48, v7, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v23, v8, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v48
+; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
+; VI-NEXT: v_add_f16_sdwa v34, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v22, v7, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v34
+; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
+; VI-NEXT: v_add_f16_sdwa v39, v9, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v47, v10, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v39
+; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
+; VI-NEXT: v_add_f16_sdwa v32, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v46, v9, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v32
+; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
+; VI-NEXT: v_add_f16_sdwa v37, v11, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v21, v12, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v37
+; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
+; VI-NEXT: v_add_f16_sdwa v31, v14, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v20, v11, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v31
+; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
+; VI-NEXT: v_add_f16_sdwa v35, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v41, v14, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v35
+; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
+; VI-NEXT: v_add_f16_sdwa v30, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_sdwa v33, v15, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v40, v13, v18
+; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v30
+; VI-NEXT: v_add_f16_e32 v16, 0x200, v16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v33
+; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
+; VI-NEXT: v_or_b32_e32 v43, v16, v18
+; VI-NEXT: v_or_b32_e32 v42, v15, v17
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[42:43]
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[40:41]
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v43
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v42
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v41
+; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v20
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21]
+; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v21
+; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v47
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[46:47]
+; VI-NEXT: v_lshrrev_b32_e32 v47, 8, v23
+; VI-NEXT: v_lshrrev_b64 v[23:24], 24, v[22:23]
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v22
+; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v26
+; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v25
+; VI-NEXT: v_lshrrev_b64 v[26:27], 24, v[25:26]
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[54:55]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[28:29]
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v40
+; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v46
+; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v55
+; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v54
+; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v29
+; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v28
+; VI-NEXT: v_bfe_u32 v28, v30, 8, 8
+; VI-NEXT: v_bfe_u32 v29, v31, 8, 8
+; VI-NEXT: v_bfe_u32 v46, v32, 8, 8
+; VI-NEXT: v_bfe_u32 v57, v34, 8, 8
+; VI-NEXT: v_bfe_u32 v59, v36, 8, 8
+; VI-NEXT: v_bfe_u32 v61, v38, 8, 8
+; VI-NEXT: v_bfe_u32 v54, v49, 8, 8
+; VI-NEXT: v_bfe_u32 v40, v51, 8, 8
+; VI-NEXT: .LBB52_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22
+; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v55
+; VI-NEXT: v_or_b32_sdwa v2, v2, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v24
+; VI-NEXT: v_or_b32_sdwa v22, v53, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v40
+; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v63
+; VI-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v60
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v23
+; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v59
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v44
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19
+; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v46
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v29
+; VI-NEXT: v_or_b32_sdwa v2, v31, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v28
+; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32f16_to_v64i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr29
+; GFX9-NEXT: ; implicit-def: $vgpr27
+; GFX9-NEXT: ; implicit-def: $vgpr23
+; GFX9-NEXT: ; implicit-def: $vgpr28
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr22
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr31
+; GFX9-NEXT: ; implicit-def: $vgpr30
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr26
+; GFX9-NEXT: ; implicit-def: $vgpr25
+; GFX9-NEXT: ; implicit-def: $vgpr21
+; GFX9-NEXT: ; implicit-def: $vgpr20
+; GFX9-NEXT: ; implicit-def: $vgpr19
+; GFX9-NEXT: ; implicit-def: $vgpr18
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB52_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b32_e32 v23, 24, v16
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; GFX9-NEXT: v_mov_b32_e32 v26, v23
+; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2]
+; GFX9-NEXT: .LBB52_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB52_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX9-NEXT: v_pk_add_f16 v2, v2, s6
op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4] +; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2] +; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v12 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v44, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v46, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v62, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v63, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v1 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v29 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v23, v27, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v59 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v56 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v46 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 +; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21 +; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20 +; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v50 +; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, 
v49 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19 +; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v39 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18 +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v32 +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31 +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v26 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v32f16_to_v64i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; 
GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB52_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] +; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] +; GFX11-NEXT: v_lshrrev_b32_e32 v26, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 16, v15 +; GFX11-NEXT: 
v_lshrrev_b32_e32 v30, 8, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 24, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 8, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 8, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 24, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 24, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 8, v7 +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 24, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 8, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 8, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 24, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 8, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 16, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v87, 8, v2 +; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1 +; GFX11-NEXT: .LBB52_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_and_b32_e32 v96, 0xff, v96 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v86, 0xff, v86 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v25 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v87 +; GFX11-NEXT: v_or_b32_e32 v24, v96, v24 +; GFX11-NEXT: v_lshlrev_b16 v85, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_or_b32_e32 v25, v86, v85 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_lshlrev_b16 v84, 8, v84 +; GFX11-NEXT: v_and_b32_e32 v83, 0xff, v83 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v24 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v82 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v80, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_lshlrev_b16 v71, 8, v71 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v81 +; GFX11-NEXT: v_and_b32_e32 v70, 0xff, v70 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v84 +; GFX11-NEXT: v_or_b32_e32 v23, v83, v23 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v80 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v71 +; GFX11-NEXT: v_or_b32_e32 v22, v70, v22 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v23 +; 
GFX11-NEXT: v_or_b32_e32 v4, v4, v25 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v66 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v55 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-NEXT: v_or_b32_e32 v21, v23, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v22, v25, v54 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v52 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v50 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v48 +; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v39 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v24 +; GFX11-NEXT: v_or_b32_e32 v23, v25, v49 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v48 +; GFX11-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v22 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v38 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v37 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v35 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v34 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v33 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v69, 8, v69 +; GFX11-NEXT: v_and_b32_e32 v68, 0xff, v68 +; GFX11-NEXT: v_lshlrev_b16 v67, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v19, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v22 +; GFX11-NEXT: v_or_b32_e32 v18, v23, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v32 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v31 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v30 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v29 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v28 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v27 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v26 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v69 +; GFX11-NEXT: v_or_b32_e32 v67, v68, v67 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v22 +; GFX11-NEXT: v_or_b32_e32 v17, v23, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, 
v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v67 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v67 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v21 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x half> %a, splat (half 0xH0200) + %a2 = bitcast <32 x half> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x half> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + +define <32 x half> @bitcast_v64i8_to_v32f16(<64 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i8_to_v32f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], 
s32 offset:100 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v9 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v11 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v15 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v17 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; 
GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v23 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v25 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v27 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v29 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v38 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v37 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v52 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v51 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v50 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v36 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v35 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v34 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v33 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v32 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v44, 8, v26 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 8, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 8, v48 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; 
implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v3, v4 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v22 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v43 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v60 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v58 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v57 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v56 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v42 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v40 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v63 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v62 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v61 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v41 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v59 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 
v25, v25, v26 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v27, v26 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v29, v27 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v31, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v32, v29 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v33, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v34, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v4, v32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v5, v32 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v6, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v34, v7, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v8, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v36, v9, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v10, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v38, v11, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v12, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v48, v13, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v14, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v50, v15, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v16, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v52, v17, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v18, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v54, v19, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded 
Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v20, v5 +; GCN-NEXT: v_or_b32_e32 v40, v21, v44 +; GCN-NEXT: v_or_b32_e32 v22, v22, v45 +; GCN-NEXT: v_or_b32_e32 v41, v23, v46 +; GCN-NEXT: v_or_b32_e32 v24, v24, v47 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v28 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v29 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v30 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v31 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v32 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v34 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v36 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v18 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v40 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v22 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v24 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; kill: killed $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; kill: killed $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; kill: killed $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; kill: killed $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed 
$vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: .LBB53_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v59 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_or_b32_e32 v1, v47, v1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v26 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v46, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v28 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v45, v5 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v41 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v44, v7 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v61 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v62 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v40 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v42 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v30 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v57 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v58 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v60 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v43 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v31 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; 
GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v9, v32, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v32, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v32, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v32, v15 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v17, v32, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v32, v19 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v32, v21 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v32, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v24, v32, v24 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v32, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v26, v32, v26 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v32, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v32, v28 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v32, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v30, v32, v30 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v32, v31 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v22, v32, v22 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v20, v32, v20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v18, v32, v18 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v16, v32, v16 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v14, v32, v14 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v32, v12 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v10, v32, v10 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v8, v32, v8 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v6, v32, v6 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v4, v32, v4 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v32, v2 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v0, v32, v0 +; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x300, v1 +; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v5 +; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v7 +; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v9 +; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v11 +; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v13 +; GCN-NEXT: v_add_i32_e32 v48, vcc, s6, v15 +; GCN-NEXT: v_add_i32_e32 v44, vcc, s6, v17 +; GCN-NEXT: v_add_i32_e32 v38, vcc, s6, v19 +; GCN-NEXT: v_add_i32_e32 v21, vcc, s6, v21 +; GCN-NEXT: v_add_i32_e32 v23, vcc, s6, v23 +; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v24 +; GCN-NEXT: v_add_i32_e32 v24, vcc, s6, v25 +; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v26 +; GCN-NEXT: v_add_i32_e32 v25, vcc, s6, v27 +; GCN-NEXT: v_add_i32_e32 v15, vcc, s6, v28 +; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v29 +; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v30 +; GCN-NEXT: v_add_i32_e32 v27, vcc, s6, v31 +; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v22 +; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v20 +; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v18 +; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v16 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v14 
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 +; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v10 +; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v55, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v39, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v35, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v51, v12 +; GCN-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v33, v16 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v37, v20 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v49, v27 +; GCN-NEXT: v_cvt_f32_f16_e32 v13, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v53, v26 +; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v32, v25 +; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v34, v24 +; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 +; GCN-NEXT: v_cvt_f32_f16_e32 v36, v23 +; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 +; GCN-NEXT: v_cvt_f32_f16_e32 v38, v38 +; GCN-NEXT: v_cvt_f32_f16_e32 v23, v44 +; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48 +; GCN-NEXT: v_cvt_f32_f16_e32 v25, v43 +; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50 +; GCN-NEXT: v_cvt_f32_f16_e32 v27, v42 +; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52 +; GCN-NEXT: v_cvt_f32_f16_e32 v29, v41 +; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54 +; GCN-NEXT: v_cvt_f32_f16_e32 v31, v40 +; GCN-NEXT: .LBB53_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v55 +; GCN-NEXT: v_mov_b32_e32 v2, v39 +; GCN-NEXT: v_mov_b32_e32 v4, v35 +; GCN-NEXT: v_mov_b32_e32 v6, v51 +; GCN-NEXT: v_mov_b32_e32 v8, v33 +; GCN-NEXT: v_mov_b32_e32 v10, v37 +; GCN-NEXT: v_mov_b32_e32 v12, v49 +; GCN-NEXT: v_mov_b32_e32 v14, v53 +; GCN-NEXT: v_mov_b32_e32 v16, v32 +; GCN-NEXT: v_mov_b32_e32 v18, v34 +; GCN-NEXT: v_mov_b32_e32 v20, v36 +; GCN-NEXT: v_mov_b32_e32 v22, v38 +; GCN-NEXT: v_mov_b32_e32 v24, v48 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mov_b32_e32 v26, v50 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v28, v52 +; GCN-NEXT: v_mov_b32_e32 v30, v54 +; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 
offset:192 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v64i8_to_v32f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; VI-NEXT: 
buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 +; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11 +; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0 +; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v2 +; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6 +; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v8 +; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v10 +; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v12 +; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v14 +; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v16 +; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v18 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v20 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v22 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v30 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v31 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v9, v39, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v10, v20, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v28, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v22, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v30, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v16, v38, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: ; implicit-def: $vgpr22 +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: ; implicit-def: $vgpr38 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr59 +; VI-NEXT: ; implicit-def: $vgpr62 +; VI-NEXT: ; implicit-def: $vgpr18 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v1, v1, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr50 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: ; implicit-def: $vgpr53 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: 
v_or_b32_sdwa v7, v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr19 +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v8, v8, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v4, v4, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr54 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v4, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v49, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v55, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v36, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v11, v11, v12 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v44, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v61, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; kill: killed $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr49 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr36 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr46 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr60 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: ; implicit-def: $vgpr26 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: .LBB53_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB53_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: s_waitcnt vmcnt(8) +; VI-NEXT: v_add_u16_e32 v0, 3, v38 +; VI-NEXT: v_add_u16_e32 v2, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v14, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_add_u16_e32 v2, 3, v35 +; VI-NEXT: v_mov_b32_e32 v3, 0x300 +; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v18, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_u16_e32 v0, 3, v37 
+; VI-NEXT: v_or_b32_sdwa v24, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u16_e32 v0, 3, v30 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v12, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v43 +; VI-NEXT: v_or_b32_sdwa v16, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u16_e32 v0, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u16_e32 v0, 3, v36 +; VI-NEXT: v_or_b32_sdwa v22, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u16_e32 v0, 3, v28 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v10, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v55 +; VI-NEXT: v_or_b32_sdwa v28, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v20 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v48 +; VI-NEXT: v_or_b32_sdwa v20, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v0, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v8, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v0, 3, v49 +; VI-NEXT: v_or_b32_sdwa v30, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v61 +; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 3, v31 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v26, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v16, 0x300, v16 +; VI-NEXT: v_or_b32_e32 v12, v16, v12 +; VI-NEXT: v_add_u16_e32 v16, 0x300, v24 +; VI-NEXT: v_add_u16_e32 v14, 0x300, v14 +; VI-NEXT: v_add_u16_e32 v15, 0x300, v15 +; VI-NEXT: v_or_b32_e32 v13, v16, v13 +; VI-NEXT: v_or_b32_e32 v14, v14, v26 +; VI-NEXT: v_or_b32_e32 v15, v15, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v7, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; 
VI-NEXT: v_or_b32_sdwa v29, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v25, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v21, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v17, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: v_add_u16_e32 v17, 0x300, v17 +; VI-NEXT: v_or_b32_e32 v4, v17, v4 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v21 +; VI-NEXT: v_or_b32_e32 v5, v17, v5 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v25 +; VI-NEXT: v_or_b32_e32 v6, v17, v6 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v29 +; VI-NEXT: v_or_b32_e32 v7, v17, v7 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v30 +; VI-NEXT: v_or_b32_e32 v8, v17, v8 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v20 +; VI-NEXT: v_or_b32_e32 v9, v17, v9 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v28 +; VI-NEXT: v_or_b32_e32 v10, v17, v10 +; VI-NEXT: v_add_u16_e32 v17, 0x300, v22 +; VI-NEXT: v_or_b32_e32 v11, v17, v11 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v23, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v27, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v31, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u16_e32 v3, 3, v3 +; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v31 +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v27 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v23 +; VI-NEXT: v_or_b32_e32 v3, v3, v19 +; VI-NEXT: .LBB53_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v64i8_to_v32f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, 
s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4 +; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13 +; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15 +; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19 +; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 +; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25 +; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27 +; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(19) +; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v16 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v18 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v20 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v24 +; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v31 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; 
GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_or_b32_sdwa v9, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v10, v62, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v63, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v35, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v13, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v16, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v38, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr39 +; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr35 +; GFX9-NEXT: ; implicit-def: $vgpr34 +; GFX9-NEXT: ; implicit-def: $vgpr32 +; GFX9-NEXT: ; implicit-def: $vgpr38 +; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; implicit-def: $vgpr56 +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; implicit-def: $vgpr60 +; GFX9-NEXT: ; implicit-def: $vgpr36 +; GFX9-NEXT: ; implicit-def: $vgpr26 +; GFX9-NEXT: ; implicit-def: $vgpr20 +; GFX9-NEXT: ; implicit-def: $vgpr33 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6 +; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr49 +; GFX9-NEXT: ; implicit-def: $vgpr48 +; GFX9-NEXT: ; implicit-def: $vgpr53 +; GFX9-NEXT: ; implicit-def: $vgpr50 +; GFX9-NEXT: ; implicit-def: $vgpr40 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr19 +; GFX9-NEXT: ; implicit-def: $vgpr23 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr27 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr55 +; GFX9-NEXT: ; implicit-def: $vgpr54 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr41 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6 +; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr21 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6 +; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: ; implicit-def: $vgpr25 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6 +; GFX9-NEXT: v_or_b32_sdwa v8, v51, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6 +; GFX9-NEXT: v_or_b32_sdwa v9, v52, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6 +; GFX9-NEXT: v_or_b32_sdwa v10, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6 +; GFX9-NEXT: v_or_b32_sdwa v11, v37, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6 +; GFX9-NEXT: v_or_b32_sdwa v12, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6 +; GFX9-NEXT: v_or_b32_sdwa v13, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_perm_b32 v13, v14, v13, s6 +; GFX9-NEXT: v_or_b32_sdwa v14, v47, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6 +; GFX9-NEXT: v_or_b32_sdwa v15, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; kill: killed $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr51 +; GFX9-NEXT: ; implicit-def: $vgpr52 +; GFX9-NEXT: ; implicit-def: $vgpr42 +; GFX9-NEXT: ; implicit-def: $vgpr37 +; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr28 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr47 +; GFX9-NEXT: ; implicit-def: $vgpr18 +; GFX9-NEXT: ; implicit-def: $vgpr29 +; GFX9-NEXT: ; implicit-def: $vgpr46 +; GFX9-NEXT: ; implicit-def: $vgpr57 +; GFX9-NEXT: ; implicit-def: $vgpr59 +; GFX9-NEXT: ; implicit-def: $vgpr61 +; GFX9-NEXT: ; implicit-def: $vgpr22 +; GFX9-NEXT: ; implicit-def: $vgpr30 +; GFX9-NEXT: ; implicit-def: $vgpr24 +; GFX9-NEXT: ; implicit-def: $vgpr31 +; GFX9-NEXT: .LBB53_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB53_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v28 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v44 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v47 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v62 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v51 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0 +; GFX9-NEXT: v_add_u16_e32 v0, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u16_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v3 +; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v38 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v1 +; GFX9-NEXT: s_mov_b32 s6, 0x5040100 +; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6 +; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6 +; GFX9-NEXT: v_perm_b32 v10, v26, v10, s6 +; GFX9-NEXT: v_perm_b32 v11, v24, v11, s6 +; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6 +; GFX9-NEXT: v_perm_b32 v13, v16, v13, s6 +; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6 +; GFX9-NEXT: v_perm_b32 v15, v18, v15, s6 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u16_e32 v31, 3, v31 +; GFX9-NEXT: v_or_b32_sdwa v31, v48, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, 
v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v6, v23, v6, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6 +; GFX9-NEXT: .LBB53_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v64i8_to_v32f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8 +; GFX11-NEXT: v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10 +; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4 +; GFX11-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0 +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:128 +; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:124 +; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:120 +; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:116 +; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:112 +; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:108 +; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:104 +; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:100 +; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:96 +; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:92 +; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132 +; GFX11-NEXT: scratch_load_u16 v14, off, s32 +; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:8 +; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:16 +; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:24 +; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:32 +; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:40 +; GFX11-NEXT: 
scratch_load_u16 v113, off, s32 offset:48 +; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:56 +; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:64 +; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:72 +; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:80 +; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:84 +; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:76 +; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:68 +; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:60 +; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:52 +; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:44 +; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:36 +; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:28 +; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:20 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:12 +; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:4 +; GFX11-NEXT: v_lshlrev_b16 v50, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v51, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v49, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v39, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v54, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v48, 8, v11 +; GFX11-NEXT: v_lshlrev_b16 v55, 8, v13 +; GFX11-NEXT: v_lshlrev_b16 v52, 8, v15 +; GFX11-NEXT: v_lshlrev_b16 v53, 8, v17 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v19 +; GFX11-NEXT: v_lshlrev_b16 v83, 8, v21 +; GFX11-NEXT: v_lshlrev_b16 v81, 8, v23 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v25 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v27 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v29 +; GFX11-NEXT: s_waitcnt vmcnt(33) +; GFX11-NEXT: v_lshlrev_b16 v119, 8, v0 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b16 v118, 8, v2 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: s_waitcnt vmcnt(21) +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v14 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_lshlrev_b16 v98, 8, v96 +; GFX11-NEXT: s_waitcnt vmcnt(19) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v100 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_lshlrev_b16 v96, 8, v101 +; GFX11-NEXT: s_waitcnt vmcnt(17) +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v102 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b16 v112, 8, v103 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: v_lshlrev_b16 v103, 8, v113 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_lshlrev_b16 v101, 8, v114 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: v_lshlrev_b16 v100, 8, v115 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_lshlrev_b16 v114, 8, v116 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: v_lshlrev_b16 v102, 8, v117 +; GFX11-NEXT: v_lshlrev_b16 v117, 8, v10 +; GFX11-NEXT: v_lshlrev_b16 v113, 8, v8 +; GFX11-NEXT: v_lshlrev_b16 v116, 8, v6 +; GFX11-NEXT: v_lshlrev_b16 v115, 8, v4 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v38 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, 
v31 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v35 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v50 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v51 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v34 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v16 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v39 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v54 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v48 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v55 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v32 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v49 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v53 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v52 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v17 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v83 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v81 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v6, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v8, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v10, v9, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v24 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v26 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v28 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v30 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v71 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v65 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v80 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v68 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v87 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v84 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v21 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v19 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v27 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v23 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v98 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v25 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v96 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v29 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v112 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v103 +; GFX11-NEXT: v_perm_b32 v6, v7, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v9, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v11, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v13, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v15, v14, 0x5040100 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v86 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v82 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v97 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v85 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v99 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v64 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v69 +; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v66 +; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v70 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v67 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v101 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v100 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v114 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v102 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v117 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v113 +; GFX11-NEXT: v_or_b32_e32 v17, v17, v116 +; GFX11-NEXT: v_or_b32_e32 v18, v18, v115 +; GFX11-NEXT: v_or_b32_e32 v19, v19, v118 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v119 +; GFX11-NEXT: v_perm_b32 v11, v12, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v14, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v16, v15, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v18, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v20, v19, 0x5040100 +; GFX11-NEXT: ; implicit-def: $vgpr36 +; GFX11-NEXT: ; implicit-def: $vgpr37 +; GFX11-NEXT: ; implicit-def: $vgpr33 +; GFX11-NEXT: ; implicit-def: $vgpr38 +; GFX11-NEXT: ; 
implicit-def: $vgpr31 +; GFX11-NEXT: ; implicit-def: $vgpr35 +; GFX11-NEXT: ; implicit-def: $vgpr34 +; GFX11-NEXT: ; implicit-def: $vgpr32 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr18 +; GFX11-NEXT: ; implicit-def: $vgpr20 +; GFX11-NEXT: ; implicit-def: $vgpr22 +; GFX11-NEXT: ; implicit-def: $vgpr24 +; GFX11-NEXT: ; implicit-def: $vgpr26 +; GFX11-NEXT: ; implicit-def: $vgpr28 +; GFX11-NEXT: ; implicit-def: $vgpr30 +; GFX11-NEXT: ; implicit-def: $vgpr71 +; GFX11-NEXT: ; implicit-def: $vgpr65 +; GFX11-NEXT: ; implicit-def: $vgpr80 +; GFX11-NEXT: ; implicit-def: $vgpr68 +; GFX11-NEXT: ; implicit-def: $vgpr87 +; GFX11-NEXT: ; implicit-def: $vgpr84 +; GFX11-NEXT: ; implicit-def: $vgpr86 +; GFX11-NEXT: ; implicit-def: $vgpr82 +; GFX11-NEXT: ; implicit-def: $vgpr97 +; GFX11-NEXT: ; implicit-def: $vgpr85 +; GFX11-NEXT: ; implicit-def: $vgpr99 +; GFX11-NEXT: ; implicit-def: $vgpr64 +; GFX11-NEXT: ; implicit-def: $vgpr69 +; GFX11-NEXT: ; implicit-def: $vgpr66 +; GFX11-NEXT: ; implicit-def: $vgpr70 +; GFX11-NEXT: ; implicit-def: $vgpr67 +; GFX11-NEXT: ; implicit-def: $vgpr50 +; GFX11-NEXT: ; implicit-def: $vgpr51 +; GFX11-NEXT: ; implicit-def: $vgpr49 +; GFX11-NEXT: ; implicit-def: $vgpr39 +; GFX11-NEXT: ; implicit-def: $vgpr54 +; GFX11-NEXT: ; implicit-def: $vgpr48 +; GFX11-NEXT: ; implicit-def: $vgpr55 +; GFX11-NEXT: ; implicit-def: $vgpr52 +; GFX11-NEXT: ; implicit-def: $vgpr53 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr83 +; GFX11-NEXT: ; implicit-def: $vgpr81 +; GFX11-NEXT: ; implicit-def: $vgpr21 +; GFX11-NEXT: ; implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: .LBB53_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v66, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v64, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_add_nc_u16 v6, v97, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v118, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v119, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v99, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v64, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v116, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v115, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v113, v1 +; GFX11-NEXT: v_add_nc_u16 v66, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v1, 
v114, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v117, v4 +; GFX11-NEXT: v_add_nc_u16 v2, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v82, 3 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v84, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v102, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v101, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v100, v1 +; GFX11-NEXT: v_add_nc_u16 v69, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v112, v3 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v103, v4 +; GFX11-NEXT: v_add_nc_u16 v70, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_nc_u16 v1, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v68, 3 +; GFX11-NEXT: v_add_nc_u16 v68, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v0, v71, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v28, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v96, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v29, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v25, v3 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v27, v4 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v30, 3 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v26, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v20, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v83, v3 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v81, v4 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v31, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v32, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v53, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v55, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v52, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v33, 3 +; 
GFX11-NEXT: v_add_nc_u16 v17, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v21, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v23, v35, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v1 +; GFX11-NEXT: v_or_b32_e32 v17, v50, v17 +; GFX11-NEXT: v_or_b32_e32 v21, v51, v21 +; GFX11-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-NEXT: v_or_b32_e32 v23, v48, v23 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-NEXT: v_perm_b32 v0, v21, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v23, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v26, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v16, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v20, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v25, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v68, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v70, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v69, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 +; GFX11-NEXT: v_perm_b32 v14, v66, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x half> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x half> + br label %end + +end: + %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x half> %phi +} + +define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v32bf16_to_v64i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, 
off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v39 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed 
$vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; kill: killed $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB54_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v51 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v52 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v54 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v50 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v42 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v18 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v43 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v55 +; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v44 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v46 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v35 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v26, 24, v37 +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v50 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v18 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v55 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v28 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill 
+; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v29 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v57, v5, v34, 16 +; GCN-NEXT: v_alignbit_b32 v56, v14, v32, 16 +; GCN-NEXT: v_alignbit_b32 v47, v11, v36, 16 +; GCN-NEXT: v_alignbit_b32 v45, v10, v7, 16 +; GCN-NEXT: v_alignbit_b32 v41, v13, v38, 16 +; GCN-NEXT: v_alignbit_b32 v53, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v48, v16, v49, 16 +; GCN-NEXT: v_alignbit_b32 v39, v6, v15, 16 +; GCN-NEXT: v_alignbit_b32 v33, v21, v17, 16 +; GCN-NEXT: v_alignbit_b32 v31, v4, v19, 16 +; GCN-NEXT: v_alignbit_b32 v26, v24, v20, 16 +; GCN-NEXT: v_alignbit_b32 v24, v3, v22, 16 +; GCN-NEXT: v_alignbit_b32 v21, v51, v23, 16 +; GCN-NEXT: v_alignbit_b32 v16, v2, v25, 16 +; GCN-NEXT: v_alignbit_b32 v13, v52, v27, 16 +; GCN-NEXT: v_alignbit_b32 v11, v1, v30, 16 +; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 24 +; GCN-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v31 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v24 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v16 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v11 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: .LBB54_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB54_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v46 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v44 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v43 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, 
v22 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v55 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v42 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v49 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v40 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v50 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v38 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v54 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v52 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v51 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v35 +; GCN-NEXT: v_add_f32_e32 v33, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v34, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v35, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v6 +; GCN-NEXT: v_add_f32_e32 v36, 0x40c00000, v8 +; GCN-NEXT: v_add_f32_e32 v37, 0x40c00000, v10 +; GCN-NEXT: v_add_f32_e32 v38, 0x40c00000, v11 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v13 +; GCN-NEXT: v_add_f32_e32 v39, 0x40c00000, v14 +; GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v16 +; GCN-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v20 +; GCN-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GCN-NEXT: v_add_f32_e32 v13, 0x40c00000, v18 +; GCN-NEXT: v_add_f32_e32 v18, 0x40c00000, v21 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v22 +; GCN-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GCN-NEXT: v_add_f32_e32 v16, 0x40c00000, v23 +; GCN-NEXT: v_add_f32_e32 v20, 0x40c00000, v24 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v25 +; GCN-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GCN-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GCN-NEXT: v_add_f32_e32 v22, 0x40c00000, v26 +; GCN-NEXT: v_add_f32_e32 v10, 0x40c00000, v27 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GCN-NEXT: v_add_f32_e32 v21, 0x40c00000, v28 +; GCN-NEXT: v_add_f32_e32 v23, 0x40c00000, v29 +; GCN-NEXT: v_add_f32_e32 v14, 0x40c00000, v30 +; GCN-NEXT: v_add_f32_e32 v25, 0x40c00000, v31 +; GCN-NEXT: v_add_f32_e32 v24, 0x40c00000, v32 +; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v35 +; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v37 +; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v21 +; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v24 +; GCN-NEXT: v_lshrrev_b32_e32 v24, 24, v24 +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v21, 24, v21 +; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, 
v12 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v16 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 24, v13 +; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v37 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v35 +; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v13, v26, v33, 16 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v11, v1, v34, 16 +; GCN-NEXT: v_alignbit_b32 v21, v27, v5, 16 +; GCN-NEXT: v_alignbit_b32 v16, v2, v36, 16 +; GCN-NEXT: v_alignbit_b32 v26, v28, v38, 16 +; GCN-NEXT: v_alignbit_b32 v24, v3, v39, 16 +; GCN-NEXT: v_alignbit_b32 v33, v29, v17, 16 +; GCN-NEXT: v_alignbit_b32 v31, v4, v19, 16 +; GCN-NEXT: v_alignbit_b32 v48, v30, v18, 16 +; GCN-NEXT: v_alignbit_b32 v39, v6, v15, 16 +; GCN-NEXT: v_alignbit_b32 v41, v32, v20, 16 +; GCN-NEXT: v_alignbit_b32 v53, v8, v9, 16 +; GCN-NEXT: v_alignbit_b32 v47, v49, v22, 16 +; GCN-NEXT: v_alignbit_b32 v45, v10, v7, 16 +; GCN-NEXT: v_alignbit_b32 v57, v50, v23, 16 +; GCN-NEXT: v_alignbit_b32 v56, v14, v25, 16 +; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v56, v57, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v61, v56, v57, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v45, v47, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v60, v45, v47, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v53, v41, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: v_alignbit_b32 v59, v53, v41, 8 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v39, v48, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: 
s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v31, v33, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v24, v26, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v16, v21, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 16 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_alignbit_b32 v5, v11, v13, 8 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v56 +; GCN-NEXT: v_lshrrev_b32_e32 v63, 8, v45 +; GCN-NEXT: v_lshrrev_b32_e32 v62, 8, v53 +; GCN-NEXT: v_lshrrev_b32_e32 v58, 8, v39 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v31 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v24 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v16 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v7, 8, v11 +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GCN-NEXT: .LBB54_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v57 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v61 +; GCN-NEXT: v_or_b32_e32 v12, v7, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v56 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GCN-NEXT: v_or_b32_e32 v15, v7, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v60 +; GCN-NEXT: v_or_b32_e32 v17, v5, v7 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v45 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v63 +; GCN-NEXT: v_or_b32_e32 v18, v5, v7 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v41 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v59 +; GCN-NEXT: v_or_b32_e32 v19, v5, v7 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v5 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v5 +; GCN-NEXT: v_and_b32_e32 v30, 0xff, v14 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], 
s32 offset:144 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v32, 24, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v53 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v62 +; GCN-NEXT: v_or_b32_e32 v20, v5, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v7 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v35, 24, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v48 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GCN-NEXT: v_or_b32_e32 v22, v7, v9 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v10 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xff, v39 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v58 +; GCN-NEXT: v_or_b32_e32 v23, v9, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v10 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v33 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GCN-NEXT: v_or_b32_e32 v25, v10, v14 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 16, v0 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v8 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v31 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GCN-NEXT: v_or_b32_e32 v28, v8, v14 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 20, v0 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v54, 0xff, v14 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xff, v26 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; GCN-NEXT: v_or_b32_e32 v26, v14, v26 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; GCN-NEXT: v_and_b32_e32 v42, 0xff, v6 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v24 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 8, v24 +; GCN-NEXT: v_or_b32_e32 v24, v6, v24 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt 
vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v44, 0xff, v33 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v33 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v33 +; GCN-NEXT: v_or_b32_e32 v33, v21, v33 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; GCN-NEXT: v_and_b32_e32 v46, 0xff, v4 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GCN-NEXT: v_or_b32_e32 v16, v4, v16 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v47, 0xff, v53 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v55, 24, v53 +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GCN-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v53 +; GCN-NEXT: v_or_b32_e32 v53, v13, v53 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 40, v0 +; GCN-NEXT: v_and_b32_e32 v56, 0xff, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v40, 24, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GCN-NEXT: v_or_b32_e32 v41, v3, v11 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v57, 0xff, v11 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; GCN-NEXT: v_or_b32_e32 v36, v37, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; GCN-NEXT: v_and_b32_e32 v58, 0xff, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v37, 24, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; GCN-NEXT: v_or_b32_e32 v32, v32, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v59, 0xff, v30 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v45, 24, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v34 +; GCN-NEXT: v_or_b32_e32 v34, v35, v30 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 56, v0 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v60, 24, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, 
v38
+; GCN-NEXT: v_or_b32_e32 v38, v48, v1
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 60, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v49
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v51
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v54
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v44
+; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v46
+; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47
+; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56
+; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v57
+; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v58
+; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v59
+; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v35
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_or_b32_e32 v27, v27, v48
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
+; GCN-NEXT: v_or_b32_e32 v29, v29, v49
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_or_b32_e32 v31, v31, v51
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23
+; GCN-NEXT: v_or_b32_e32 v39, v39, v54
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_or_b32_e32 v48, v50, v42
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_or_b32_e32 v49, v52, v44
+; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
+; GCN-NEXT: v_or_b32_e32 v50, v55, v46
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24
+; GCN-NEXT: v_or_b32_e32 v51, v40, v47
+; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v33
+; GCN-NEXT: v_or_b32_e32 v52, v43, v56
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
+; GCN-NEXT: v_or_b32_e32 v37, v37, v57
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff, v53
+; GCN-NEXT: v_or_b32_e32 v54, v45, v58
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff, v41
+; GCN-NEXT: v_or_b32_e32 v35, v60, v35
+; GCN-NEXT: v_or_b32_e32 v12, v12, v36
+; GCN-NEXT: v_or_b32_e32 v15, v15, v32
+; GCN-NEXT: v_or_b32_e32 v17, v17, v34
+; GCN-NEXT: v_or_b32_e32 v18, v18, v38
+; GCN-NEXT: v_or_b32_e32 v19, v19, v27
+; GCN-NEXT: v_or_b32_e32 v20, v20, v29
+; GCN-NEXT: v_or_b32_e32 v22, v22, v31
+; GCN-NEXT: v_or_b32_e32 v23, v23, v39
+; GCN-NEXT: v_or_b32_e32 v25, v25, v48
+; GCN-NEXT: v_or_b32_e32 v27, v28, v49
+; GCN-NEXT: v_or_b32_e32 v26, v26, v50
+; GCN-NEXT: v_or_b32_e32 v24, v24, v51
+; GCN-NEXT: v_or_b32_e32 v28, v33, v52
+; GCN-NEXT: v_or_b32_e32 v16, v16, v37
+; GCN-NEXT: v_or_b32_e32 v29, v53, v54
+; GCN-NEXT: v_or_b32_e32 v31, v55, v35
+; GCN-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v17, v7, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v19, v10, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v20, v8, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v22, v14, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v23, v6, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v25, v21, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v27, v4, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v26, v13, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v24, v3, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen
+; GCN-NEXT: buffer_store_dword v31, v1, s[0:3], 0 offen
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v32bf16_to_v64i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; implicit-def: $vgpr27
+; VI-NEXT: ; implicit-def: $vgpr22
+; VI-NEXT: ; implicit-def: $vgpr28
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr30
+; VI-NEXT: ; kill: killed $vgpr17
+; VI-NEXT: ; implicit-def: $vgpr26
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr20
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB54_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v17, 8, v16
+; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b32_e32 v22, 24, v16
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; VI-NEXT: v_mov_b32_e32 v26, v22
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2]
+; VI-NEXT: .LBB54_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB54_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, 0x7fff, v18
+; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v2, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v2
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: v_alignbit_b32 v2, v2, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v1, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v1
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_alignbit_b32 v1, v1, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v4
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v4, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v4
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v4
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; VI-NEXT: v_cndmask_b32_e32 v4, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_alignbit_b32 v4, v4, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v3
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v3, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v3
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v3
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; VI-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; VI-NEXT: v_alignbit_b32 v3, v3, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v6
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v6, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v6
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v6
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; VI-NEXT: v_cndmask_b32_e32 v6, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_alignbit_b32 v6, v6, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v5
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v5, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v5
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v5
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; VI-NEXT: v_cndmask_b32_e32 v5, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; VI-NEXT: v_alignbit_b32 v5, v5, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v8
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v8, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v8
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v8
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; VI-NEXT: v_cndmask_b32_e32 v8, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_alignbit_b32 v8, v8, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v7
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v7, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v7
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v7
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; VI-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_alignbit_b32 v7, v7, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v10
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v10, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v10
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v10
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; VI-NEXT: v_cndmask_b32_e32 v10, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
+; VI-NEXT: v_alignbit_b32 v10, v10, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v9
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v9, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v9
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v9
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; VI-NEXT: v_cndmask_b32_e32 v9, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
+; VI-NEXT: v_alignbit_b32 v9, v9, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v12
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v12, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v12
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v12
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; VI-NEXT: v_cndmask_b32_e32 v12, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
+; VI-NEXT: v_alignbit_b32 v12, v12, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v11
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v11, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v11
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v11
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; VI-NEXT: v_cndmask_b32_e32 v11, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
+; VI-NEXT: v_alignbit_b32 v11, v11, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v14
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v14, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v14
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v14
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; VI-NEXT: v_cndmask_b32_e32 v14, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
+; VI-NEXT: v_alignbit_b32 v14, v14, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v13
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v13, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v13
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v13
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; VI-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
+; VI-NEXT: v_alignbit_b32 v13, v13, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v16
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v16, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v16
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v16
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; VI-NEXT: v_cndmask_b32_e32 v16, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
+; VI-NEXT: v_alignbit_b32 v16, v16, v17, 16
+; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v15
+; VI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; VI-NEXT: v_bfe_u32 v18, v17, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v17
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; VI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; VI-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; VI-NEXT: v_bfe_u32 v18, v15, 16, 1
+; VI-NEXT: v_add_u32_e32 v18, vcc, v18, v15
+; VI-NEXT: v_add_u32_e32 v18, vcc, s6, v18
+; VI-NEXT: v_or_b32_e32 v19, 0x400000, v15
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; VI-NEXT: v_cndmask_b32_e32 v15, v18, v19, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
+; VI-NEXT: v_alignbit_b32 v15, v15, v17, 16
+; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[3:4]
+; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[7:8]
+; VI-NEXT: v_lshrrev_b64 v[22:23], 24, v[1:2]
+; VI-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; VI-NEXT: v_lshrrev_b32_e32 v23, 16, v16
+; VI-NEXT: v_lshrrev_b32_e32 v26, 24, v16
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v16
+; VI-NEXT: v_lshrrev_b32_e32 v30, 16, v15
+; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
+; VI-NEXT: v_lshrrev_b32_e32 v32, 24, v14
+; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14
+; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v14
+; VI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
+; VI-NEXT: v_lshrrev_b32_e32 v36, 8, v13
+; VI-NEXT: v_lshrrev_b32_e32 v37, 24, v12
+; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v12
+; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v12
+; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v11
+; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v11
+; VI-NEXT: v_lshrrev_b32_e32 v50, 24, v10
+; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v52, 8, v10
+; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v55, 24, v8
+; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v8
+; VI-NEXT: v_lshrrev_b32_e32 v42, 16, v7
+; VI-NEXT: v_lshrrev_b32_e32 v43, 8, v7
+; VI-NEXT: v_lshrrev_b32_e32 v44, 24, v6
+; VI-NEXT: v_lshrrev_b32_e32 v45, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v6
+; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v5
+; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v5
+; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v4
+; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v59, 8, v4
+; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v3
+; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v3
+; VI-NEXT: v_lshrrev_b32_e32 v62, 24, v2
+; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v2
+; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v2
+; VI-NEXT: v_lshrrev_b32_e32 v27, 16, v1
+; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v1
+; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; VI-NEXT: .LBB54_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v29
+; VI-NEXT: v_lshlrev_b16_e32 v22, 8, v22
+; VI-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v22, v27, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v28
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62
+; VI-NEXT: v_or_b32_sdwa v2, v2, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v59
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v57
+; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v56
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v46
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44
+; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v43
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v24
+; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v55
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v54
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v20
+; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v52
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v50
+; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v19
+; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v39
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v37
+; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v18
+; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v35, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v32
+; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v17
+; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v30, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
+; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v26
+; VI-NEXT: v_add_u32_e32 v0, vcc, 60, v0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
+; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v32bf16_to_v64i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-NEXT: ; implicit-def: $vgpr27
+; GFX9-NEXT: ; implicit-def: $vgpr28
+; GFX9-NEXT: ; implicit-def: $vgpr23
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr29
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr22
+; GFX9-NEXT: ; implicit-def: $vgpr30
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr31
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; kill: killed $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: ; implicit-def: $vgpr25
+; GFX9-NEXT: ; implicit-def: $vgpr21
+; GFX9-NEXT: ; implicit-def: $vgpr20
+; GFX9-NEXT: ; implicit-def: $vgpr19
+; GFX9-NEXT: ; implicit-def: $vgpr18
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB54_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v16
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v14
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v12
+; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[3:4]
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v15
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v1
+; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[5:6]
+; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[1:2]
+; GFX9-NEXT: .LBB54_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB54_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17
+; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc
+; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v18, v18, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v19, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX9-NEXT: v_bfe_u32 v19, v18, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_add3_u32 v19, v19, v18, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v18
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc
+; GFX9-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v19, v19, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v19, v20, vcc
+; GFX9-NEXT: v_perm_b32 v27, v1, v18, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_bfe_u32 v19, v1, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_add3_u32 v19, v19, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v19, v20, vcc
+; GFX9-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v19, v19, v4, s6
+; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v19, v20, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v3
+; GFX9-NEXT: v_add_f32_e32 v19, 0x40c00000, v19
+; GFX9-NEXT: v_bfe_u32 v20, v19, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_add3_u32 v20, v20, v19, s6
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v19
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v20, v21, vcc
+; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX9-NEXT: v_add3_u32 v20, v20, v3, s6
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc
+; GFX9-NEXT: v_perm_b32 v29, v3, v19, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX9-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_add3_u32 v20, v20, v3, s6
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v3
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v21, vcc
+; GFX9-NEXT: v_bfe_u32 v20, v6, 16, 1
+; GFX9-NEXT: v_add3_u32 v20, v20, v6, s6
+; GFX9-NEXT: v_or_b32_e32 v21, 0x400000, v6
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v20, v21, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v5
+; GFX9-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
+; GFX9-NEXT: v_bfe_u32 v21, v20, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_add3_u32 v21, v21, v20, s6
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v20
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v21, v22, vcc
+; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX9-NEXT: v_add3_u32 v21, v21, v5, s6
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v22, vcc
+; GFX9-NEXT: v_perm_b32 v31, v5, v20, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_add3_u32 v21, v21, v5, s6
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v5
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX9-NEXT: v_add_f32_e32 v8, 0x40c00000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v22, vcc
+; GFX9-NEXT: v_bfe_u32 v21, v8, 16, 1
+; GFX9-NEXT: v_add3_u32 v21, v21, v8, s6
+; GFX9-NEXT: v_or_b32_e32 v22, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v21, v22, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v7
+; GFX9-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX9-NEXT: v_bfe_u32 v22, v21, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_add3_u32 v22, v22, v21, s6
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v22, v23, vcc
+; GFX9-NEXT: v_bfe_u32 v22, v7, 16, 1
+; GFX9-NEXT: v_add3_u32 v22, v22, v7, s6
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v23, vcc
+; GFX9-NEXT: v_perm_b32 v33, v7, v21, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX9-NEXT: v_add_f32_e32 v7, 0x40c00000, v7
+; GFX9-NEXT: v_bfe_u32 v22, v7, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_add3_u32 v22, v22, v7, s6
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v7
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_add_f32_e32 v10, 0x40c00000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v22, v23, vcc
+; GFX9-NEXT: v_bfe_u32 v22, v10, 16, 1
+; GFX9-NEXT: v_add3_u32 v22, v22, v10, s6
+; GFX9-NEXT: v_or_b32_e32 v23, 0x400000, v10
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v22, v23, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v9
+; GFX9-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX9-NEXT: v_bfe_u32 v23, v22, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_add3_u32 v23, v23, v22, s6
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v22
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v23, v24, vcc
+; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1
+; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc
+; GFX9-NEXT: v_perm_b32 v35, v9, v22, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v12
+; GFX9-NEXT: v_add_f32_e32 v9, 0x40c00000, v9
+; GFX9-NEXT: v_bfe_u32 v23, v9, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT: v_add3_u32 v23, v23, v9, s6
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
+; GFX9-NEXT: v_add_f32_e32 v12, 0x40c00000, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v23, v24, vcc
+; GFX9-NEXT: v_bfe_u32 v23, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v23, v23, v12, s6
+; GFX9-NEXT: v_or_b32_e32 v24, 0x400000, v12
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v23, v24, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v11
+; GFX9-NEXT: v_add_f32_e32 v23, 0x40c00000, v23
+; GFX9-NEXT: v_bfe_u32 v24, v23, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_add3_u32 v24, v24, v23, s6
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v23
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc
+; GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc
+; GFX9-NEXT: v_perm_b32 v37, v11, v23, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14
+; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11
+; GFX9-NEXT: v_bfe_u32 v24, v11, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: v_add3_u32 v24, v24, v11, s6
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v11
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
+; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v24, v25, vcc
+; GFX9-NEXT: v_bfe_u32 v24, v14, 16, 1
+; GFX9-NEXT: v_add3_u32 v24, v24, v14, s6
+; GFX9-NEXT: v_or_b32_e32 v25, 0x400000, v14
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v24, v25, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v24, 0x40c00000, v24
+; GFX9-NEXT: v_bfe_u32 v25, v24, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX9-NEXT: v_add3_u32 v25, v25, v24, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v24
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v25, v26, vcc
+; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc
+; GFX9-NEXT: v_perm_b32 v48, v13, v24, s7
+; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16
+; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13
+; GFX9-NEXT: v_bfe_u32 v25, v13, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX9-NEXT: v_add3_u32 v25, v25, v13, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v25, v26, vcc
+; GFX9-NEXT: v_bfe_u32 v25, v16, 16, 1
+; GFX9-NEXT: v_add3_u32 v25, v25, v16, s6
+; GFX9-NEXT: v_or_b32_e32 v26, 0x400000, v16
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v25, v26, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v15
+; GFX9-NEXT: v_add_f32_e32 v25, 0x40c00000, v25
+; GFX9-NEXT: v_bfe_u32 v26, v25, 16, 1
+; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_add3_u32 v26, v26, v25, s6
+; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v25
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
+; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v26, v39, vcc
+; GFX9-NEXT: v_bfe_u32 v26, v15, 16, 1
+; GFX9-NEXT: v_add3_u32 v26, v26, v15, s6
+; GFX9-NEXT: v_or_b32_e32 v39, 0x400000, v15
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v26, v39, vcc
+; GFX9-NEXT: v_perm_b32 v51, v16, v13, s7
+; GFX9-NEXT: v_perm_b32 v50, v15, v25, s7
+; GFX9-NEXT: v_perm_b32 v28, v2, v17, s7
+; GFX9-NEXT: v_perm_b32 v30, v4, v1, s7
+; GFX9-NEXT: v_perm_b32 v49, v14, v11, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v45, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v17
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v18
+; GFX9-NEXT: v_lshrrev_b64 v[17:18], 24, v[50:51]
+; GFX9-NEXT: v_perm_b32 v32, v6, v3, s7
+; GFX9-NEXT: v_perm_b32 v38, v12, v9, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v42, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19
+; GFX9-NEXT: v_lshrrev_b64 v[18:19], 24, v[48:49]
+; GFX9-NEXT: v_perm_b32 v34, v8, v5, s7
+; GFX9-NEXT: v_perm_b32 v36, v10, v7, s7
+; GFX9-NEXT: v_lshrrev_b32_e32 v53, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; GFX9-NEXT: v_lshrrev_b64 v[19:20], 24, v[37:38]
+; GFX9-NEXT: v_lshrrev_b32_e32 v52, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v21
+; GFX9-NEXT: v_lshrrev_b64 v[20:21], 24, v[35:36]
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16
+; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14
+; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v12
+; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v22
+; GFX9-NEXT: v_lshrrev_b64 v[21:22], 24, v[33:34]
+; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v23
+; GFX9-NEXT: v_lshrrev_b64 v[22:23], 24, v[29:30]
+; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25
+; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v24
+; GFX9-NEXT: v_lshrrev_b64 v[25:26], 24, v[31:32]
+; GFX9-NEXT: v_lshrrev_b64 v[23:24], 24, v[27:28]
+; GFX9-NEXT: v_lshrrev_b32_e32 v54, 24, v51
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v51
+; GFX9-NEXT: v_lshrrev_b32_e32 v40, 16, v50
+; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v50
+; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v49
+; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v49
+; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v48
+; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
+; GFX9-NEXT: v_lshrrev_b32_e32 v46, 24, v38
+; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v38
+; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v37
+; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v37
+; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v36
+; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36
+; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v35
+; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35
+; GFX9-NEXT: v_lshrrev_b32_e32 v59, 24, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34
+; GFX9-NEXT: v_lshrrev_b32_e32 v60, 16, v33
+; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33
+; GFX9-NEXT: v_lshrrev_b32_e32 v61, 24, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v32
+; GFX9-NEXT: v_lshrrev_b32_e32 v62, 16, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v31
+; GFX9-NEXT: v_lshrrev_b32_e32 v63, 24, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v30, 8, v30
+; GFX9-NEXT: v_lshrrev_b32_e32 v39, 16, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v29, 8, v29
+; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v28
+; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v27
+; GFX9-NEXT: v_lshrrev_b32_e32 v27, 8, v27
+; GFX9-NEXT: .LBB54_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v27
+; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v23
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v23, v28, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v51
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v22
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v29
+; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v30
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v63
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v45, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v31
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v25
+; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v32
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v61
+; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v42, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v21
+; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v34
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v59
+; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v20
+; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v36
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56
+; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v19
+; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v38
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v46
+; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v48
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v18
+; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v49
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v43
+; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v17
+; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55
+; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v54
+; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v32bf16_to_v64i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v17
+; GFX11-NEXT: ; implicit-def: $vgpr25
+; GFX11-NEXT: ; implicit-def: $vgpr97
+; GFX11-NEXT: ; implicit-def: $vgpr24
+; GFX11-NEXT: ; implicit-def: $vgpr27
+; GFX11-NEXT: ; implicit-def: $vgpr81
+; GFX11-NEXT: ; implicit-def: $vgpr96
+; GFX11-NEXT: ; implicit-def: $vgpr28
+; GFX11-NEXT: ; implicit-def: $vgpr87
+; GFX11-NEXT: ; implicit-def: $vgpr23
+; GFX11-NEXT: ; implicit-def: $vgpr29
+; GFX11-NEXT: ; implicit-def: $vgpr68
+; GFX11-NEXT: ; implicit-def: $vgpr86
+; GFX11-NEXT: ; implicit-def: $vgpr85
+; GFX11-NEXT: ; implicit-def: $vgpr84
+; GFX11-NEXT: ; implicit-def: $vgpr22
+; GFX11-NEXT: ; implicit-def: $vgpr83
+; GFX11-NEXT: ; implicit-def: $vgpr65
+; GFX11-NEXT: ; implicit-def: $vgpr82
+; GFX11-NEXT: ; implicit-def: $vgpr80
+; GFX11-NEXT: ; implicit-def: $vgpr71
+; GFX11-NEXT: ; implicit-def: $vgpr21
+; GFX11-NEXT: ; implicit-def: $vgpr70
+; GFX11-NEXT: ; implicit-def: $vgpr52
+; GFX11-NEXT: ; implicit-def: $vgpr69
+; GFX11-NEXT: ; implicit-def: $vgpr67
+; GFX11-NEXT: ; implicit-def: $vgpr66
+; GFX11-NEXT: ; implicit-def: $vgpr20
+; GFX11-NEXT: ; implicit-def: $vgpr64
+; GFX11-NEXT: ; implicit-def: $vgpr38
+; GFX11-NEXT: ; implicit-def: $vgpr55
+; GFX11-NEXT: ; implicit-def: $vgpr54
+; GFX11-NEXT: ; implicit-def: $vgpr53
+; GFX11-NEXT: ; implicit-def: $vgpr19
+; GFX11-NEXT: ; implicit-def: $vgpr51
+; GFX11-NEXT: ; implicit-def: $vgpr33
+; GFX11-NEXT: ; implicit-def: $vgpr50
+; GFX11-NEXT: ; implicit-def: $vgpr49
+; GFX11-NEXT: ; implicit-def: $vgpr48
+; GFX11-NEXT: ; implicit-def: $vgpr18
+; GFX11-NEXT: ; implicit-def: $vgpr39
+; GFX11-NEXT: ; implicit-def: $vgpr31
+; GFX11-NEXT: ; implicit-def: $vgpr37
+; GFX11-NEXT: ; implicit-def: $vgpr36
+; GFX11-NEXT: ; implicit-def: $vgpr35
+; GFX11-NEXT: ; implicit-def: $vgpr17
+; GFX11-NEXT: ; implicit-def: $vgpr34
+; GFX11-NEXT: ; implicit-def: $vgpr30
+; GFX11-NEXT: ; implicit-def: $vgpr32
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB54_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16]
+; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14]
+; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12]
+; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[9:10]
+; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8]
+; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6]
+; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4]
+; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2]
+; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v16
+; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v15
+; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v14
+; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v49, 8, v13
+; GFX11-NEXT: v_lshrrev_b32_e32 v50, 24, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v12
+; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v54, 8, v11
+; GFX11-NEXT: v_lshrrev_b32_e32 v55, 24, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v10
+; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v67, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v69, 24, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v82, 24, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v6
+; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v1
+; GFX11-NEXT: .LBB54_2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB54_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v18, 0x40c00000, v18
+; GFX11-NEXT: v_add_f32_e32 v19, 0x40c00000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v1, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
+; GFX11-NEXT: v_bfe_u32 v22, v18, 16, 1
+; GFX11-NEXT: v_bfe_u32 v24, v19, 16, 1
+; GFX11-NEXT: v_add3_u32 v1, v1, v17, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
+; GFX11-NEXT: v_add3_u32 v22, v22, v18, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v20 :: v_dual_add_f32 v2, 0x40c00000, v2
+; GFX11-NEXT: v_add3_u32 v20, v24, v19, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v21, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v21, v23 :: v_dual_lshlrev_b32 v21, 16, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v19
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_perm_b32 v27, v2, v1, 0x7060302
+; GFX11-NEXT: v_dual_add_f32 v18, 0x40c00000, v21 :: v_dual_lshlrev_b32 v21, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v17, v22, v17, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
+; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v18
+; GFX11-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v23, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v20, v18, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v24, v21, 16, 1
+; GFX11-NEXT: v_perm_b32 v26, v19, v17, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v19, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v20, v20, v18, 0x7fff
+; GFX11-NEXT: v_lshrrev_b32_e32 v81, 16, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v17
+; GFX11-NEXT: v_add3_u32 v19, v19, v4, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v18, v20, v22, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v27
+; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v26
+; GFX11-NEXT: v_dual_cndmask_b32 v4, v19, v23 :: v_dual_lshlrev_b32 v23, 16, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GFX11-NEXT: v_add3_u32 v19, v24, v21, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT: v_perm_b32 v29, v4, v18, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
+; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v19, v19, v22 :: v_dual_lshlrev_b32 v22, 16, v5
+; GFX11-NEXT: v_add_f32_e32 v21, 0x40c00000, v23
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v20, v20, v3, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v68, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v21
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v18
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v20, v24, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v20, v21, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v6
+; GFX11-NEXT: v_bfe_u32 v25, v22, 16, 1
+; GFX11-NEXT: v_perm_b32 v28, v3, v19, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v3, v6, 16, 1
+; GFX11-NEXT: v_add3_u32 v20, v20, v21, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v28
+; GFX11-NEXT: v_add3_u32 v3, v3, v6, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v20, v20, v23 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v6, v25, v22, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v23, 0x400000, v22
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v24, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v8
+; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v21, v21, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v22, v6, v23, vcc_lo
+; GFX11-NEXT: v_add_f32_e32 v6, 0x40c00000, v24
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v7
+; GFX11-NEXT: v_perm_b32 v86, v3, v20, 0x7060302 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v21, v25 :: v_dual_add_f32 v8, 0x40c00000, v8 +; GFX11-NEXT: v_bfe_u32 v21, v6, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v65, 16, v3 +; GFX11-NEXT: v_perm_b32 v85, v5, v22, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v5, v8, 16, 1 +; GFX11-NEXT: v_add3_u32 v21, v21, v6, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GFX11-NEXT: v_lshrrev_b32_e32 v84, 16, v85 +; GFX11-NEXT: v_add3_u32 v5, v5, v8, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v6, v21, v24, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v25, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; GFX11-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-NEXT: v_perm_b32 v83, v5, v6, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v30, v23, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v23 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v22 +; GFX11-NEXT: v_add3_u32 v8, v30, v23, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v23, v8, v24 :: v_dual_lshlrev_b32 v24, 16, v9 +; GFX11-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-NEXT: v_add_f32_e32 v8, 0x40c00000, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfe_u32 v31, v24, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v8 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-NEXT: v_bfe_u32 v21, v7, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v30, 0x400000, v7 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v21, v21, v7, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v7, v21, v30, vcc_lo +; GFX11-NEXT: v_bfe_u32 v21, v8, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-NEXT: v_or_b32_e32 v30, 0x400000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v82, v7, v23, 0x7060302 +; GFX11-NEXT: v_bfe_u32 v7, v10, 16, 1 +; GFX11-NEXT: v_add3_u32 v21, v21, v8, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v82 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v7, v7, v10, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v8, v21, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_add3_u32 v10, v31, v24, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v82 +; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v30 :: v_dual_lshlrev_b32 v30, 16, v12 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v70, v7, v8, 
0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v7 +; GFX11-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-NEXT: v_cndmask_b32_e32 v24, v10, v25, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v10, 0x40c00000, v30 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GFX11-NEXT: v_lshrrev_b32_e32 v64, 8, v70 +; GFX11-NEXT: v_bfe_u32 v30, v12, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v30, v30, v12, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v21, v9, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v9 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_add3_u32 v21, v21, v9, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v31, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v11 +; GFX11-NEXT: v_bfe_u32 v25, v10, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-NEXT: v_perm_b32 v69, v9, v24, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v9, 0x40c00000, v21 +; GFX11-NEXT: v_add3_u32 v21, v25, v10, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v10 +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v12 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v66, 16, v69 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v21, v25, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v9 +; GFX11-NEXT: v_add3_u32 v21, v32, v9, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v67, 8, v69 +; GFX11-NEXT: v_cndmask_b32_e32 v12, v30, v31, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; GFX11-NEXT: v_perm_b32 v55, v12, v10, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v25, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v21, 0x40c00000, v30 +; GFX11-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-NEXT: v_bfe_u32 v31, v11, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v30, 0x400000, v11 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX11-NEXT: v_lshrrev_b32_e32 v51, 8, v55 +; GFX11-NEXT: v_add3_u32 v25, v31, v11, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v13 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v11, v25, v30, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v25, 0x40c00000, v31 +; GFX11-NEXT: v_add3_u32 v30, v32, v21, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v21 +; GFX11-NEXT: v_bfe_u32 v32, v14, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v21, v30, v31, vcc_lo +; GFX11-NEXT: v_add3_u32 v30, v32, v14, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v31, 0x400000, v14 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-NEXT: v_add3_u32 v32, v33, v25, 0x7fff +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v31, vcc_lo +; 
GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v30, 0x40c00000, v33 +; GFX11-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-NEXT: v_perm_b32 v54, v11, v9, 0x7060302 +; GFX11-NEXT: v_perm_b32 v50, v14, v21, 0x7060302 +; GFX11-NEXT: v_dual_cndmask_b32 v25, v32, v34 :: v_dual_lshlrev_b32 v34, 16, v15 +; GFX11-NEXT: v_bfe_u32 v35, v13, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX11-NEXT: v_bfe_u32 v33, v30, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-NEXT: v_add3_u32 v31, v35, v13, 0x7fff +; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v24 +; GFX11-NEXT: v_lshrrev_b32_e32 v39, 8, v50 +; GFX11-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v31, 0x40c00000, v34 +; GFX11-NEXT: v_add3_u32 v32, v33, v30, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX11-NEXT: v_bfe_u32 v34, v16, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 +; GFX11-NEXT: v_bfe_u32 v35, v31, 16, 1 +; GFX11-NEXT: v_bfe_u32 v30, v15, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v31 +; GFX11-NEXT: v_or_b32_e32 v37, 0x400000, v15 +; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v33, vcc_lo +; GFX11-NEXT: v_add3_u32 v33, v34, v16, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v16 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-NEXT: v_add3_u32 v35, v35, v31, 0x7fff +; GFX11-NEXT: v_add3_u32 v30, v30, v15, 0x7fff +; GFX11-NEXT: v_perm_b32 v49, v13, v25, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v16, v33, v34, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-NEXT: v_cndmask_b32_e32 v31, v35, v36, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v20 +; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v54 +; GFX11-NEXT: v_cndmask_b32_e32 v15, v30, v37, vcc_lo +; GFX11-NEXT: v_perm_b32 v37, v16, v32, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v30, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v36, v15, v31, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v31 +; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; GFX11-NEXT: v_lshrrev_b32_e32 v32, 24, v37 +; GFX11-NEXT: v_lshrrev_b64 v[17:18], 24, v[36:37] +; GFX11-NEXT: v_lshrrev_b64 v[18:19], 24, v[49:50] +; GFX11-NEXT: v_lshrrev_b64 v[19:20], 24, v[54:55] +; GFX11-NEXT: v_lshrrev_b64 v[20:21], 24, v[69:70] +; GFX11-NEXT: v_lshrrev_b64 v[21:22], 24, v[82:83] +; GFX11-NEXT: v_lshrrev_b64 v[22:23], 24, v[85:86] +; GFX11-NEXT: v_lshrrev_b64 v[23:24], 24, v[28:29] +; GFX11-NEXT: v_lshrrev_b64 v[24:25], 24, v[26:27] +; GFX11-NEXT: v_lshrrev_b32_e32 v34, 8, v37 +; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v36, 8, v36 +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 24, v50 +; GFX11-NEXT: v_lshrrev_b32_e32 v49, 8, v49 +; GFX11-NEXT: v_lshrrev_b32_e32 v50, 24, v55 +; GFX11-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX11-NEXT: v_lshrrev_b32_e32 v55, 24, v70 +; GFX11-NEXT: v_lshrrev_b32_e32 v69, 
24, v83 +; GFX11-NEXT: v_lshrrev_b32_e32 v70, 8, v83 +; GFX11-NEXT: v_lshrrev_b32_e32 v82, 24, v86 +; GFX11-NEXT: v_lshrrev_b32_e32 v83, 8, v86 +; GFX11-NEXT: v_lshrrev_b32_e32 v85, 8, v85 +; GFX11-NEXT: v_lshrrev_b32_e32 v86, 24, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v29, 8, v29 +; GFX11-NEXT: v_lshrrev_b32_e32 v28, 8, v28 +; GFX11-NEXT: v_lshrrev_b32_e32 v27, 8, v27 +; GFX11-NEXT: v_lshrrev_b32_e32 v25, 8, v26 +; GFX11-NEXT: .LBB54_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v25 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v97 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v24 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v25 +; GFX11-NEXT: v_lshlrev_b16 v25, 8, v27 +; GFX11-NEXT: v_or_b32_e32 v24, v26, v24 +; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v81 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v96 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_lshlrev_b16 v28, 8, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v25 +; GFX11-NEXT: v_or_b32_e32 v25, v26, v27 +; GFX11-NEXT: v_and_b32_e32 v81, 0xff, v87 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v24 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v29 +; GFX11-NEXT: v_lshlrev_b16 v23, 8, v23 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v28 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v86 +; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v68 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v85 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v84 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v22 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_lshlrev_b16 v29, 8, v83 +; GFX11-NEXT: v_and_b32_e32 v65, 0xff, v65 +; GFX11-NEXT: v_lshlrev_b16 v68, 8, v82 +; GFX11-NEXT: v_or_b32_e32 v23, v81, v23 +; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v27 +; GFX11-NEXT: v_or_b32_e32 v22, v28, v22 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v29 +; GFX11-NEXT: v_or_b32_e32 v26, v65, v68 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v24 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v23 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v25 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX11-NEXT: v_or_b32_e32 v6, v6, v26 +; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v80 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v71 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v70 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v52 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v69 +; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v67 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v22 +; GFX11-NEXT: v_or_b32_e32 v21, v23, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v24 +; GFX11-NEXT: v_or_b32_e32 v22, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v27 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v66 +; GFX11-NEXT: v_lshlrev_b16 v20, 8, v20 +; GFX11-NEXT: 
v_and_b32_e32 v10, 0xff, v10 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v64 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v38 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v55 +; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 +; GFX11-NEXT: v_lshlrev_b16 v27, 8, v54 +; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v53 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v19 +; GFX11-NEXT: v_or_b32_e32 v20, v23, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v24 +; GFX11-NEXT: v_or_b32_e32 v23, v25, v26 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v27 +; GFX11-NEXT: v_or_b32_e32 v19, v28, v19 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_or_b32_e32 v7, v7, v21 +; GFX11-NEXT: v_or_b32_e32 v8, v8, v22 +; GFX11-NEXT: v_or_b32_e32 v9, v9, v20 +; GFX11-NEXT: v_or_b32_e32 v10, v10, v23 +; GFX11-NEXT: v_or_b32_e32 v11, v11, v19 +; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GFX11-NEXT: v_lshlrev_b16 v19, 8, v51 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v33 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v50 +; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v49 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v48 +; GFX11-NEXT: v_lshlrev_b16 v18, 8, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v14 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v39 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v19, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v22 +; GFX11-NEXT: v_or_b32_e32 v18, v23, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v24 +; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v31 +; GFX11-NEXT: v_lshlrev_b16 v21, 8, v37 +; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v15 +; GFX11-NEXT: v_lshlrev_b16 v22, 8, v36 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v35 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GFX11-NEXT: v_lshlrev_b16 v24, 8, v34 +; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v30 +; GFX11-NEXT: v_lshlrev_b16 v26, 8, v32 +; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v22 +; GFX11-NEXT: v_or_b32_e32 v17, v23, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v24 +; GFX11-NEXT: v_or_b32_e32 v21, v25, v26 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_or_b32_e32 v12, v12, v19 +; GFX11-NEXT: v_or_b32_e32 v13, v13, v18 +; GFX11-NEXT: v_or_b32_e32 v14, v14, v20 +; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: v_or_b32_e32 v16, v16, v21 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off +; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 +; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <32 x bfloat> %a, splat 
(bfloat 0xR40C0) + %a2 = bitcast <32 x bfloat> %a1 to <64 x i8> + br label %end + +cmp.false: + %a3 = bitcast <32 x bfloat> %a to <64 x i8> + br label %end + +end: + %phi = phi <64 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <64 x i8> %phi +} + +define <32 x bfloat> @bitcast_v64i8_to_v32bf16(<64 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v64i8_to_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(4) +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 +; GCN-NEXT: s_waitcnt expcnt(2) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v63, off, 
s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v5 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v11 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v15 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v21 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v23 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v27 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v32 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v31 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v30 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v26 
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(1) +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v24 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v17 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v13 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GCN-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v7 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GCN-NEXT: ; implicit-def: $vgpr43 +; GCN-NEXT: ; implicit-def: $vgpr35 +; GCN-NEXT: ; implicit-def: $vgpr49 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr33 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr39 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr51 +; GCN-NEXT: ; implicit-def: $vgpr55 +; GCN-NEXT: ; implicit-def: $vgpr41 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr32 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr34 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr36 +; GCN-NEXT: ; implicit-def: $vgpr37 +; GCN-NEXT: ; implicit-def: $vgpr38 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: ; implicit-def: $vgpr48 +; GCN-NEXT: ; implicit-def: $vgpr21 +; GCN-NEXT: ; implicit-def: $vgpr50 +; GCN-NEXT: ; implicit-def: $vgpr23 +; GCN-NEXT: ; implicit-def: $vgpr52 +; GCN-NEXT: ; implicit-def: $vgpr53 +; GCN-NEXT: ; implicit-def: $vgpr54 +; GCN-NEXT: ; implicit-def: $vgpr27 +; GCN-NEXT: ; implicit-def: $vgpr40 +; GCN-NEXT: ; implicit-def: $vgpr29 +; GCN-NEXT: ; implicit-def: $vgpr42 +; GCN-NEXT: ; implicit-def: $vgpr31 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v3, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v12 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v20 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v5, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v28 +; 
GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v5, v2 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v13, 0xff, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v29, 0xff, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 24, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v31, 0xff, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v33, 0xff, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v61 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v60 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v56 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v44 +; GCN-NEXT: v_and_b32_e32 v37, 0xff, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v45 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v63 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v46 +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v58 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v57 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 24, v17 +; GCN-NEXT: v_and_b32_e32 v52, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v54, 0xff, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v23 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v32 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v33 +; GCN-NEXT: 
v_lshlrev_b32_e32 v17, 16, v34 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v35, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v39, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v49 +; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v50 +; GCN-NEXT: v_or_b32_e32 v26, v51, v26 +; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v52 +; GCN-NEXT: v_or_b32_e32 v30, v53, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v54 +; GCN-NEXT: v_or_b32_e32 v43, v5, v55 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v35, v5, v40 +; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v3, v3, v41 +; GCN-NEXT: v_or_b32_e32 v33, v0, v21 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v5, v0, v22 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v7, v0, v7 +; GCN-NEXT: v_or_b32_e32 v51, v1, v19 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v55, v0, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v11 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v11, v0, v13 +; GCN-NEXT: v_or_b32_e32 v32, v4, v15 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v13, v0, v18 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v15, v0, v14 +; GCN-NEXT: v_or_b32_e32 v36, v8, v16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v37, v0, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v23 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v19, v0, v27 +; GCN-NEXT: v_or_b32_e32 v48, v9, v28 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v21, v0, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v31 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v23, v0, v42 +; GCN-NEXT: v_or_b32_e32 v52, v2, v44 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v53, v0, v45 
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v26 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v27, v0, v25 +; GCN-NEXT: v_or_b32_e32 v40, v10, v24 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v0, v46 +; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v30 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v31, v0, v47 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr20 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr28 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr62 +; GCN-NEXT: ; implicit-def: $vgpr61 +; GCN-NEXT: ; implicit-def: $vgpr60 +; GCN-NEXT: ; implicit-def: $vgpr56 +; GCN-NEXT: ; implicit-def: $vgpr44 +; GCN-NEXT: ; implicit-def: $vgpr47 +; GCN-NEXT: ; implicit-def: $vgpr45 +; GCN-NEXT: ; implicit-def: $vgpr63 +; GCN-NEXT: ; implicit-def: $vgpr46 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr59 +; GCN-NEXT: ; implicit-def: $vgpr58 +; GCN-NEXT: ; implicit-def: $vgpr57 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr25 +; GCN-NEXT: ; implicit-def: $vgpr24 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr22 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; 
implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; kill: killed $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr26 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; kill: killed $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; kill: killed $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr30 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; kill: killed $vgpr2 +; GCN-NEXT: .LBB55_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v14 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v3, v30, v3 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v24 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v5, v7, v5 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v10 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_or_b32_e32 v7, v26, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v59 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v58 +; GCN-NEXT: v_and_b32_e32 v10, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v10, v11, v10 +; GCN-NEXT: s_movk_i32 s7, 0x300 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 3, v18 +; GCN-NEXT: s_mov_b32 s6, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v36, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v38, vcc, 3, v57 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v46 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v47 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v45 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v63 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v56 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v44 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v62 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 8, v61 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v60 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v28 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v2 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v8 +; 
GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v9 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_add_i32_e32 v37, vcc, 3, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v34 +; GCN-NEXT: v_and_b32_e32 v34, 0xff, v35 +; GCN-NEXT: v_and_b32_e32 v35, 0xff, v36 +; GCN-NEXT: v_and_b32_e32 v36, 0xff, v38 +; GCN-NEXT: v_and_b32_e32 v38, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v39, 0xff, v17 +; GCN-NEXT: v_and_b32_e32 v48, 0xff, v18 +; GCN-NEXT: v_and_b32_e32 v49, 0xff, v19 +; GCN-NEXT: v_and_b32_e32 v50, 0xff, v21 +; GCN-NEXT: v_and_b32_e32 v51, 0xff, v22 +; GCN-NEXT: v_and_b32_e32 v52, 0xff, v23 +; GCN-NEXT: v_and_b32_e32 v53, 0xff, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xff, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xff, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xff, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xff, v28 +; GCN-NEXT: v_and_b32_e32 v54, 0xff, v20 +; GCN-NEXT: v_and_b32_e32 v15, 0xff, v29 +; GCN-NEXT: v_and_b32_e32 v16, 0xff, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xff, v30 +; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 +; GCN-NEXT: v_and_b32_e32 v18, 0xff, v31 +; GCN-NEXT: v_and_b32_e32 v19, 0xff, v32 +; GCN-NEXT: v_and_b32_e32 v20, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v21, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v22, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v23, 0xff, v33 +; GCN-NEXT: v_and_b32_e32 v24, 0xff, v37 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v34 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v29, v29, v38 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; GCN-NEXT: v_or_b32_e32 v14, v14, v48 +; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v49 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v32, v32, v50 +; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v51 +; GCN-NEXT: v_or_b32_e32 v13, v13, v52 +; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v53 +; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v25, v35, v25 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GCN-NEXT: v_or_b32_e32 v11, v11, v27 +; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v28, v28, v54 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_or_b32_e32 v2, v2, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v17 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_or_b32_e32 v12, v17, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v18 +; GCN-NEXT: v_or_b32_e32 v8, v8, v19 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v20 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_or_b32_e32 v19, v19, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; GCN-NEXT: v_or_b32_e32 v0, v0, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v24
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v9, v22, v9
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s7, v5
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v6, v22, v6
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s7, v7
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v4, v22, v4
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s7, v10
+; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, v22, v1
+; GCN-NEXT: v_add_i32_e32 v22, vcc, s7, v29
+; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v23, v23, v30
+; GCN-NEXT: v_add_i32_e32 v14, vcc, s7, v14
+; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v24, v24, v31
+; GCN-NEXT: v_add_i32_e32 v29, vcc, s7, v32
+; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v30, v30, v33
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s7, v13
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v31, v31, v34
+; GCN-NEXT: v_add_i32_e32 v25, vcc, s7, v25
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v26, v32, v26
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s7, v11
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v27, v32, v27
+; GCN-NEXT: v_add_i32_e32 v28, vcc, s7, v28
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v15, v32, v15
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v2
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v16, v32, v16
+; GCN-NEXT: v_add_i32_e32 v12, vcc, s7, v12
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v17, v32, v17
+; GCN-NEXT: v_add_i32_e32 v8, vcc, s7, v8
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v18, v32, v18
+; GCN-NEXT: v_add_i32_e32 v19, vcc, s7, v19
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v20, v32, v20
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v21, v32, v21
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
+; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
+; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v3, v9, v3
+; GCN-NEXT: v_or_b32_e32 v5, v6, v5
+; GCN-NEXT: v_or_b32_e32 v4, v4, v7
+; GCN-NEXT: v_or_b32_e32 v1, v1, v10
+; GCN-NEXT: v_or_b32_e32 v6, v23, v22
+; GCN-NEXT: v_or_b32_e32 v7, v24, v14
+; GCN-NEXT: v_or_b32_e32 v9, v30, v29
+; GCN-NEXT: v_or_b32_e32 v10, v31, v13
+; GCN-NEXT: v_or_b32_e32 v13, v26, v25
+; GCN-NEXT: v_or_b32_e32 v11, v27, v11
+; GCN-NEXT: v_or_b32_e32 v14, v15, v28
+; GCN-NEXT: v_or_b32_e32 v2, v16, v2
+; GCN-NEXT: v_or_b32_e32 v12, v17, v12
+; GCN-NEXT: v_or_b32_e32 v8, v18, v8
+; GCN-NEXT: v_or_b32_e32 v15, v20, v19
+; GCN-NEXT: v_or_b32_e32 v0, v21, v0
+; GCN-NEXT: v_add_i32_e32 v16, vcc, s6, v3
+; GCN-NEXT: v_add_i32_e32 v17, vcc, s6, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4
+; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1
+; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6
+; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v7
+; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9
+; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
+; GCN-NEXT: v_add_i32_e32 v19, vcc, s6, v13
+; GCN-NEXT: v_add_i32_e32 v20, vcc, s6, v11
+; GCN-NEXT: v_add_i32_e32 v13, vcc, s6, v14
+; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2
+; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v12
+; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v8
+; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v15
+; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v0
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v5
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v7
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v11
+; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v2
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v13
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v20
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v19
+; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v10
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v9
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v18
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
+; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v1
+; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v17
+; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v16
+; GCN-NEXT: .LBB55_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v43
+; GCN-NEXT: v_mov_b32_e32 v1, v35
+; GCN-NEXT: v_mov_b32_e32 v2, v49
+; GCN-NEXT: v_mov_b32_e32 v4, v33
+; GCN-NEXT: v_mov_b32_e32 v6, v39
+; GCN-NEXT: v_mov_b32_e32 v8, v51
+; GCN-NEXT: v_mov_b32_e32 v9, v55
+; GCN-NEXT: s_waitcnt vmcnt(6)
+; GCN-NEXT: v_mov_b32_e32 v10, v41
+; GCN-NEXT: v_mov_b32_e32 v12, v32
+; GCN-NEXT: v_mov_b32_e32 v14, v34
+; GCN-NEXT: v_mov_b32_e32 v16, v36
+; GCN-NEXT: s_waitcnt vmcnt(3)
+; GCN-NEXT: v_mov_b32_e32 v17, v37
+; GCN-NEXT: v_mov_b32_e32 v18, v38
+; GCN-NEXT: v_mov_b32_e32 v20, v48
+; GCN-NEXT: v_mov_b32_e32 v22, v50
+; GCN-NEXT: s_waitcnt vmcnt(2)
+; GCN-NEXT: v_mov_b32_e32 v24, v52
+; GCN-NEXT: v_mov_b32_e32 v25, v53
+; GCN-NEXT: v_mov_b32_e32 v26, v54
+; GCN-NEXT: v_mov_b32_e32 v28, v40
+; GCN-NEXT: v_mov_b32_e32 v30, v42
+; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v64i8_to_v32bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64
+; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80
+; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:88
+; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120
+; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128
+; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:20
+; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12
+; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:100
+; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68
+; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:36
+; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:4
+; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:116
+; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124
+; VI-NEXT: v_lshlrev_b16_e32 v50, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v51, 8, v3
+; VI-NEXT: v_lshlrev_b16_e32 v52, 8, v5
+; VI-NEXT: v_lshlrev_b16_e32 v53, 8, v7
+; VI-NEXT: v_lshlrev_b16_e32 v54, 8, v9
+; VI-NEXT: v_lshlrev_b16_e32 v40, 8, v11
+; VI-NEXT: v_lshlrev_b16_e32 v41, 8, v13
+; VI-NEXT: v_lshlrev_b16_e32 v42, 8, v15
+; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v17
+; VI-NEXT: v_lshlrev_b16_e32 v19, 8, v19
+; VI-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; VI-NEXT: v_lshlrev_b16_e32 v23, 8, v23
+; VI-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; VI-NEXT: v_lshlrev_b16_e32 v27, 8, v27
+; VI-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; VI-NEXT: v_lshlrev_b16_e32 v45, 8, v0
+; VI-NEXT: v_lshlrev_b16_e32 v46, 8, v2
+; VI-NEXT: v_lshlrev_b16_e32 v47, 8, v4
+; VI-NEXT: v_lshlrev_b16_e32 v56, 8, v6
+; VI-NEXT: v_lshlrev_b16_e32 v57, 8, v8
+; VI-NEXT: v_lshlrev_b16_e32 v58, 8, v10
+; VI-NEXT: v_lshlrev_b16_e32 v59, 8, v12
+; VI-NEXT: v_lshlrev_b16_e32 v60, 8, v14
+; VI-NEXT: v_lshlrev_b16_e32 v62, 8, v16
+; VI-NEXT: v_lshlrev_b16_e32 v63, 8, v18
+; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v20
+; VI-NEXT: s_waitcnt vmcnt(13)
+; VI-NEXT: v_lshlrev_b16_e32 v24, 8, v22
+; VI-NEXT: s_waitcnt vmcnt(12)
+; VI-NEXT: v_lshlrev_b16_e32 v18, 8, v26
+; VI-NEXT: s_waitcnt vmcnt(11)
+; VI-NEXT: v_lshlrev_b16_e32 v26, 8, v28
+; VI-NEXT: s_waitcnt vmcnt(10)
+; VI-NEXT: v_lshlrev_b16_e32 v33, 8, v30
+; VI-NEXT: s_waitcnt vmcnt(9)
+; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v31
+; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:108
+; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:92
+; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84
+; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:76
+; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:60
+; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:52
+; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:44
+; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28
+; VI-NEXT: s_waitcnt vmcnt(14)
+; VI-NEXT: v_lshlrev_b16_e32 v32, 8, v32
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB55_2
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: v_or_b32_sdwa v9, v39, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_or_b32_sdwa v10, v20, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v28, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v22, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v30, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v35, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v31, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v16, v38, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr39
+; VI-NEXT: ; implicit-def: $vgpr20
+; VI-NEXT: ; implicit-def: $vgpr28
+; VI-NEXT: ; implicit-def: $vgpr22
+; VI-NEXT: ; implicit-def: $vgpr30
+; VI-NEXT: ; implicit-def: $vgpr35
+; VI-NEXT: ; implicit-def: $vgpr31
+; VI-NEXT: ; implicit-def: $vgpr38
+; VI-NEXT: ; implicit-def: $vgpr47
+; VI-NEXT: ; implicit-def: $vgpr57
+; VI-NEXT: ; implicit-def: $vgpr59
+; VI-NEXT: ; implicit-def: $vgpr62
+; VI-NEXT: ; implicit-def: $vgpr18
+; VI-NEXT: ; implicit-def: $vgpr33
+; VI-NEXT: ; implicit-def: $vgpr32
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(7)
+; VI-NEXT: v_or_b32_sdwa v0, v0, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_or_b32_sdwa v1, v1, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr50
+; VI-NEXT: ; implicit-def: $vgpr51
+; VI-NEXT: ; implicit-def: $vgpr52
+; VI-NEXT: ; implicit-def: $vgpr53
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_or_b32_sdwa v5, v5, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_or_b32_sdwa v7, v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr19
+; VI-NEXT: ; implicit-def: $vgpr23
+; VI-NEXT: ; implicit-def: $vgpr27
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_or_b32_sdwa v8, v8, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr45
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_or_b32_sdwa v2, v2, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr54
+; VI-NEXT: ; implicit-def: $vgpr40
+; VI-NEXT: ; implicit-def: $vgpr42
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr41
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v4, v4, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr17
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr21
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v6, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: ; implicit-def: $vgpr25
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v49, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v48, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v55, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v10, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v36, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v11, v11, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v37, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v13, v13, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v44, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v61, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v15, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; kill: killed $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr49
+; VI-NEXT: ; implicit-def: $vgpr48
+; VI-NEXT: ; implicit-def: $vgpr55
+; VI-NEXT: ; implicit-def: $vgpr36
+; VI-NEXT: ; implicit-def: $vgpr43
+; VI-NEXT: ; implicit-def: $vgpr37
+; VI-NEXT: ; implicit-def: $vgpr44
+; VI-NEXT: ; implicit-def: $vgpr61
+; VI-NEXT: ; implicit-def: $vgpr29
+; VI-NEXT: ; implicit-def: $vgpr46
+; VI-NEXT: ; implicit-def: $vgpr56
+; VI-NEXT: ; implicit-def: $vgpr58
+; VI-NEXT: ; implicit-def: $vgpr60
+; VI-NEXT: ; implicit-def: $vgpr63
+; VI-NEXT: ; implicit-def: $vgpr16
+; VI-NEXT: ; implicit-def: $vgpr24
+; VI-NEXT: ; implicit-def: $vgpr26
+; VI-NEXT: ; implicit-def: $vgpr34
+; VI-NEXT: .LBB55_2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB55_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: s_waitcnt vmcnt(8)
+; VI-NEXT: v_add_u16_e32 v0, 3, v38
+; VI-NEXT: v_add_u16_e32 v2, 3, v44
+; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v14, v26, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(6)
+; VI-NEXT: v_add_u16_e32 v2, 3, v35
+; VI-NEXT: v_mov_b32_e32 v3, 0x300
+; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v18, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(5)
+; VI-NEXT: v_add_u16_e32 v0, 3, v37
+; VI-NEXT: v_or_b32_sdwa v24, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(4)
+; VI-NEXT: v_add_u16_e32 v0, 3, v30
+; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v12, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v0, 3, v43
+; VI-NEXT: v_or_b32_sdwa v16, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(3)
+; VI-NEXT: v_add_u16_e32 v0, 3, v22
+; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v11, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: s_waitcnt vmcnt(2)
+; VI-NEXT: v_add_u16_e32 v0, 3, v36
+; VI-NEXT: v_or_b32_sdwa v22, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(1)
+; VI-NEXT: v_add_u16_e32 v0, 3, v28
+; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v10, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v0, 3, v55
+; VI-NEXT: v_or_b32_sdwa v28, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v20
+; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v9, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v0, 3, v48
+; VI-NEXT: v_or_b32_sdwa v20, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v0, 3, v39
+; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v8, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v0, 3, v49
+; VI-NEXT: v_or_b32_sdwa v30, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_sdwa v13, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v1, 3, v61
+; VI-NEXT: v_or_b32_sdwa v15, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v1, 3, v31
+; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v26, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_u16_e32 v16, 0x300, v16
+; VI-NEXT: v_or_b32_e32 v12, v16, v12
+; VI-NEXT: v_add_u16_e32 v16, 0x300, v24
+; VI-NEXT: v_add_u16_e32 v14, 0x300, v14
+; VI-NEXT: v_add_u16_e32 v15, 0x300, v15
+; VI-NEXT: v_or_b32_e32 v13, v16, v13
+; VI-NEXT: v_or_b32_e32 v14, v14, v26
+; VI-NEXT: v_or_b32_e32 v15, v15, v18
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v7, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v29, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v6, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v25, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v21, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v17, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v17
+; VI-NEXT: v_or_b32_e32 v4, v17, v4
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v21
+; VI-NEXT: v_or_b32_e32 v5, v17, v5
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v25
+; VI-NEXT: v_or_b32_e32 v6, v17, v6
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v29
+; VI-NEXT: v_or_b32_e32 v7, v17, v7
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v30
+; VI-NEXT: v_or_b32_e32 v8, v17, v8
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v20
+; VI-NEXT: v_or_b32_e32 v9, v17, v9
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v28
+; VI-NEXT: v_or_b32_e32 v10, v17, v10
+; VI-NEXT: v_add_u16_e32 v17, 0x300, v22
+; VI-NEXT: v_or_b32_e32 v11, v17, v11
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v19, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v23, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v27, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v31, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v0, 3, v0
+; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: v_add_u16_e32 v3, 3, v3
+; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v3
+; VI-NEXT: v_or_b32_e32 v0, v3, v0
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v31
+; VI-NEXT: v_or_b32_e32 v1, v3, v1
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v27
+; VI-NEXT: v_or_b32_e32 v2, v3, v2
+; VI-NEXT: v_add_u16_e32 v3, 0x300, v23
+; VI-NEXT: v_or_b32_e32 v3, v3, v19
+; VI-NEXT: .LBB55_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v64i8_to_v32bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32
+; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:8
+; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:16
+; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:24
+; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:32
+; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:40
+; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:48
+; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:56
+; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:64
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:72
+; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:80
+; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88
+; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:96
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104
+; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:112
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:120
+; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128
+; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:20
+; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:12
+; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:100
+; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:68
+; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:36
+; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:4
+; GFX9-NEXT: v_lshlrev_b16_e32 v49, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v53, 8, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v50, 8, v7
+; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v9
+; GFX9-NEXT: v_lshlrev_b16_e32 v54, 8, v11
+; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v13
+; GFX9-NEXT: v_lshlrev_b16_e32 v40, 8, v15
+; GFX9-NEXT: v_lshlrev_b16_e32 v43, 8, v17
+; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v19
+; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21
+; GFX9-NEXT: v_lshlrev_b16_e32 v19, 8, v23
+; GFX9-NEXT: v_lshlrev_b16_e32 v25, 8, v25
+; GFX9-NEXT: v_lshlrev_b16_e32 v23, 8, v27
+; GFX9-NEXT: v_lshlrev_b16_e32 v29, 8, v29
+; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:124
+; GFX9-NEXT: s_waitcnt vmcnt(24)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GFX9-NEXT: s_waitcnt vmcnt(23)
+; GFX9-NEXT: v_lshlrev_b16_e32 v27, 8, v0
+; GFX9-NEXT: s_waitcnt vmcnt(22)
+; GFX9-NEXT: v_lshlrev_b16_e32 v46, 8, v2
+; GFX9-NEXT: s_waitcnt vmcnt(21)
+; GFX9-NEXT: v_lshlrev_b16_e32 v45, 8, v4
+; GFX9-NEXT: s_waitcnt vmcnt(20)
+; GFX9-NEXT: v_lshlrev_b16_e32 v57, 8, v6
+; GFX9-NEXT: s_waitcnt vmcnt(19)
+; GFX9-NEXT: v_lshlrev_b16_e32 v56, 8, v8
+; GFX9-NEXT: s_waitcnt vmcnt(18)
+; GFX9-NEXT: v_lshlrev_b16_e32 v59, 8, v10
+; GFX9-NEXT: s_waitcnt vmcnt(17)
+; GFX9-NEXT: v_lshlrev_b16_e32 v58, 8, v12
+; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: v_lshlrev_b16_e32 v61, 8, v14
+; GFX9-NEXT: s_waitcnt vmcnt(15)
+; GFX9-NEXT: v_lshlrev_b16_e32 v60, 8, v16
+; GFX9-NEXT: s_waitcnt vmcnt(14)
+; GFX9-NEXT: v_lshlrev_b16_e32 v22, 8, v18
+; GFX9-NEXT: s_waitcnt vmcnt(13)
+; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v20
+; GFX9-NEXT: s_waitcnt vmcnt(12)
+; GFX9-NEXT: v_lshlrev_b16_e32 v30, 8, v24
+; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:116
+; GFX9-NEXT: s_waitcnt vmcnt(11)
+; GFX9-NEXT: v_lshlrev_b16_e32 v24, 8, v28
+; GFX9-NEXT: s_waitcnt vmcnt(10)
+; GFX9-NEXT: v_lshlrev_b16_e32 v20, 8, v31
+; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v32
+; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:108
+; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:92
+; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:84
+; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:76
+; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:60
+; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52
+; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44
+; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:28
+; GFX9-NEXT: v_lshlrev_b16_e32 v26, 8, v26
+; GFX9-NEXT: s_waitcnt vmcnt(16)
+; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB55_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(23)
+; GFX9-NEXT: v_or_b32_sdwa v9, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(9)
+; GFX9-NEXT: v_or_b32_sdwa v10, v62, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v11, v63, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v12, v35, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v13, v34, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v14, v16, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v15, v32, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v16, v38, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr39
+; GFX9-NEXT: ; implicit-def: $vgpr62
+; GFX9-NEXT: ; implicit-def: $vgpr63
+; GFX9-NEXT: ; implicit-def: $vgpr35
+; GFX9-NEXT: ; implicit-def: $vgpr34
+; GFX9-NEXT: ; implicit-def: $vgpr32
+; GFX9-NEXT: ; implicit-def: $vgpr38
+; GFX9-NEXT: ; implicit-def: $vgpr45
+; GFX9-NEXT: ; implicit-def: $vgpr56
+; GFX9-NEXT: ; implicit-def: $vgpr58
+; GFX9-NEXT: ; implicit-def: $vgpr60
+; GFX9-NEXT: ; implicit-def: $vgpr36
+; GFX9-NEXT: ; implicit-def: $vgpr26
+; GFX9-NEXT: ; implicit-def: $vgpr20
+; GFX9-NEXT: ; implicit-def: $vgpr33
+; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(7)
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s6
+; GFX9-NEXT: v_perm_b32 v1, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(6)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(3)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr49
+; GFX9-NEXT: ; implicit-def: $vgpr48
+; GFX9-NEXT: ; implicit-def: $vgpr53
+; GFX9-NEXT: ; implicit-def: $vgpr50
+; GFX9-NEXT: ; implicit-def: $vgpr40
+; GFX9-NEXT: ; implicit-def: $vgpr17
+; GFX9-NEXT: ; implicit-def: $vgpr19
+; GFX9-NEXT: ; implicit-def: $vgpr23
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_or_b32_sdwa v8, v8, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr27
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_or_b32_sdwa v2, v2, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v2, v3, v2, s6
+; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr55
+; GFX9-NEXT: ; implicit-def: $vgpr54
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v3, v3, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v3, v4, v3, s6
+; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr41
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v4, v4, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v4, v5, v4, s6
+; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr43
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v5, v5, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v5, v6, v5, s6
+; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr21
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v6, v6, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v6, v7, v6, s6
+; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: ; implicit-def: $vgpr25
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_or_b32_sdwa v7, v7, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v7, v8, v7, s6
+; GFX9-NEXT: v_or_b32_sdwa v8, v51, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v8, v9, v8, s6
+; GFX9-NEXT: v_or_b32_sdwa v9, v52, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v9, v10, v9, s6
+; GFX9-NEXT: v_or_b32_sdwa v10, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v10, v11, v10, s6
+; GFX9-NEXT: v_or_b32_sdwa v11, v37, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v11, v12, v11, s6
+; GFX9-NEXT: v_or_b32_sdwa v12, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v12, v13, v12, s6
+; GFX9-NEXT: v_or_b32_sdwa v13, v28, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v13, v14, v13, s6
+; GFX9-NEXT: v_or_b32_sdwa v14, v47, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v14, v15, v14, s6
+; GFX9-NEXT: v_or_b32_sdwa v15, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_perm_b32 v15, v16, v15, s6
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; kill: killed $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr51
+; GFX9-NEXT: ; implicit-def: $vgpr52
+; GFX9-NEXT: ; implicit-def: $vgpr42
+; GFX9-NEXT: ; implicit-def: $vgpr37
+; GFX9-NEXT: ; implicit-def: $vgpr44
+; GFX9-NEXT: ; implicit-def: $vgpr28
+; GFX9-NEXT: ; implicit-def: $vgpr16
+; GFX9-NEXT: ; implicit-def: $vgpr47
+; GFX9-NEXT: ; implicit-def: $vgpr18
+; GFX9-NEXT: ; implicit-def: $vgpr29
+; GFX9-NEXT: ; implicit-def: $vgpr46
+; GFX9-NEXT: ; implicit-def: $vgpr57
+; GFX9-NEXT: ; implicit-def: $vgpr59
+; GFX9-NEXT: ; implicit-def: $vgpr61
+; GFX9-NEXT: ; implicit-def: $vgpr22
+; GFX9-NEXT: ; implicit-def: $vgpr30
+; GFX9-NEXT: ; implicit-def: $vgpr24
+; GFX9-NEXT: ; implicit-def: $vgpr31
+; GFX9-NEXT: .LBB55_2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB55_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: s_waitcnt vmcnt(8)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v18
+; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v15, 0x300, v0
+; GFX9-NEXT: s_waitcnt vmcnt(5)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v28
+; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v13, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v16
+; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v16, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v44
+; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v12, 0x300, v0
+; GFX9-NEXT: s_waitcnt vmcnt(4)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v34
+; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v22, 0x300, v0
+; GFX9-NEXT: s_waitcnt vmcnt(2)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v37
+; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v11, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v35
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v47
+; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v2, v24, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v24, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v42
+; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v10, 0x300, v0
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v63
+; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v26, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v52
+; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v9, 0x300, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v62
+; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v28, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v51
+; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v8, 0x300, v0
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v39
+; GFX9-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v30, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-NEXT: v_add_u16_e32 v3, 3, v32
+; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v20, 0x300, v3
+; GFX9-NEXT: v_add_u16_e32 v14, 0x300, v2
+; GFX9-NEXT: v_add_u16_e32 v1, 3, v38
+; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v18, 0x300, v1
+; GFX9-NEXT: s_mov_b32 s6, 0x5040100
+; GFX9-NEXT: v_perm_b32 v8, v30, v8, s6
+; GFX9-NEXT: v_perm_b32 v9, v28, v9, s6
+; GFX9-NEXT: v_perm_b32 v10, v26, v10, s6
+; GFX9-NEXT: v_perm_b32 v11, v24, v11, s6
+; GFX9-NEXT: v_perm_b32 v12, v22, v12, s6
+; GFX9-NEXT: v_perm_b32 v13, v16, v13, s6
+; GFX9-NEXT: v_perm_b32 v14, v20, v14, s6
+; GFX9-NEXT: v_perm_b32 v15, v18, v15, s6
+; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v7, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_add_u16_e32 v31, 3, v31
+; GFX9-NEXT: v_or_b32_sdwa v31, v48, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v31, 0x300, v31
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v27, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v7, v27, v7, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v6, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v23, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v6, v23, v6, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v5, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v19, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v5, v19, v5, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v4, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v17, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v4, v17, v4, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v3, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v21, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v3, v21, v3, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v25, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v2, v25, v2, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v29, 0x300, v0
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-NEXT: v_perm_b32 v1, v29, v1, s6
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v0
+; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0
+; GFX9-NEXT: v_perm_b32 v0, v31, v0, s6
+; GFX9-NEXT: .LBB55_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v64i8_to_v32bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v32, v14 :: v_dual_mov_b32 v31, v8
+; GFX11-NEXT: v_dual_mov_b32 v34, v12 :: v_dual_mov_b32 v35, v10
+; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v33, v4
+; GFX11-NEXT: v_dual_mov_b32 v37, v2 :: v_dual_mov_b32 v36, v0
+; GFX11-NEXT: s_clause 0x1f
+; GFX11-NEXT: scratch_load_u16 v0, off, s32 offset:128
+; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:124
+; GFX11-NEXT: scratch_load_u16 v2, off, s32 offset:120
+; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:116
+; GFX11-NEXT: scratch_load_u16 v4, off, s32 offset:112
+; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:108
+; GFX11-NEXT: scratch_load_u16 v6, off, s32 offset:104
+; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:100
+; GFX11-NEXT: scratch_load_u16 v8, off, s32 offset:96
+; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:92
+; GFX11-NEXT: scratch_load_u16 v10, off, s32 offset:88
+; GFX11-NEXT: scratch_load_b32 v12, off, s32 offset:132
+; GFX11-NEXT: scratch_load_u16 v14, off, s32
+; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:8
+; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:16
+; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:24
+; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:32
+; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:40
+; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:48
+; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:56
+; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:64
+; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:72
+; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:80
+; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:84
+; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:76
+; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:68
+; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:60
+; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:52
+; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:44
+; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:36
+; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:28
+; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:20
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:12
+; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:4
+; GFX11-NEXT: v_lshlrev_b16 v50, 8, v1
+; GFX11-NEXT: v_lshlrev_b16 v51, 8, v3
+; GFX11-NEXT: v_lshlrev_b16 v49, 8, v5
+; GFX11-NEXT: v_lshlrev_b16 v39, 8, v7
+; GFX11-NEXT: v_lshlrev_b16 v54, 8, v9
+; GFX11-NEXT: v_lshlrev_b16 v48, 8, v11
+; GFX11-NEXT: v_lshlrev_b16 v55, 8, v13
+; GFX11-NEXT: v_lshlrev_b16 v52, 8, v15
+; GFX11-NEXT: v_lshlrev_b16 v53, 8, v17
+; GFX11-NEXT: v_lshlrev_b16 v17, 8, v19
+; GFX11-NEXT: v_lshlrev_b16 v83, 8, v21
+; GFX11-NEXT: v_lshlrev_b16 v81, 8, v23
+; GFX11-NEXT: v_lshlrev_b16 v21, 8, v25
+; GFX11-NEXT: v_lshlrev_b16 v19, 8, v27
+; GFX11-NEXT: v_lshlrev_b16 v27, 8, v29
+; GFX11-NEXT: s_waitcnt vmcnt(33)
+; GFX11-NEXT: v_lshlrev_b16 v119, 8, v0
+; GFX11-NEXT: s_waitcnt vmcnt(31)
+; GFX11-NEXT: v_lshlrev_b16 v118, 8, v2
+; GFX11-NEXT: s_waitcnt vmcnt(22)
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX11-NEXT: s_waitcnt vmcnt(21)
+; GFX11-NEXT: v_lshlrev_b16 v23, 8, v14
+; GFX11-NEXT: s_waitcnt vmcnt(20)
+; GFX11-NEXT: v_lshlrev_b16 v98, 8, v96
+; GFX11-NEXT: s_waitcnt vmcnt(19)
+; GFX11-NEXT: v_lshlrev_b16 v25, 8, v100
+; GFX11-NEXT: s_waitcnt vmcnt(18)
+; GFX11-NEXT: v_lshlrev_b16 v96, 8, v101
+; GFX11-NEXT: s_waitcnt vmcnt(17)
+; GFX11-NEXT: v_lshlrev_b16 v29, 8, v102
+; GFX11-NEXT: s_waitcnt vmcnt(16)
+; GFX11-NEXT: v_lshlrev_b16 v112, 8, v103
+; GFX11-NEXT: s_waitcnt vmcnt(15)
+; GFX11-NEXT: v_lshlrev_b16 v103, 8, v113
+; GFX11-NEXT: s_waitcnt vmcnt(14)
+; GFX11-NEXT: v_lshlrev_b16 v101, 8, v114
+; GFX11-NEXT: s_waitcnt vmcnt(13)
+; GFX11-NEXT: v_lshlrev_b16 v100, 8, v115
+; GFX11-NEXT: s_waitcnt vmcnt(12)
+; GFX11-NEXT: v_lshlrev_b16 v114, 8, v116
+; GFX11-NEXT: s_waitcnt vmcnt(11)
+; GFX11-NEXT: v_lshlrev_b16 v102, 8, v117
+; GFX11-NEXT: v_lshlrev_b16 v117, 8, v10
+; GFX11-NEXT: v_lshlrev_b16 v113, 8, v8
+; GFX11-NEXT: v_lshlrev_b16 v116, 8, v6
+; GFX11-NEXT: v_lshlrev_b16 v115, 8, v4
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB55_3
+; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB55_4
+; GFX11-NEXT: .LBB55_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB55_3: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v36
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v37
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v38
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v31
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v35
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v50
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v51
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v34
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v33
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v16
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v18
+; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v39
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v54
+; GFX11-NEXT: v_or_b32_e32 v4, v5, v48
+; GFX11-NEXT: v_or_b32_e32 v5, v6, v55
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v32
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v20
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v22
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v49
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v53
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v52
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v17
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v83
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v81
+; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11-NEXT: v_perm_b32 v2, v4, v3, 0x5040100
+; GFX11-NEXT: v_perm_b32 v3, v6, v5, 0x5040100
+; GFX11-NEXT: v_perm_b32 v4, v8, v7, 0x5040100
+; GFX11-NEXT: v_perm_b32 v5, v10, v9, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v24
+; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v26
+; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v28
+; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v30
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v71
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v65
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v80
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v68
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v87
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v84
+; GFX11-NEXT: v_or_b32_e32 v6, v6, v21
+; GFX11-NEXT: v_or_b32_e32 v7, v7, v19
+; GFX11-NEXT: v_or_b32_e32 v8, v8, v27
+; GFX11-NEXT: v_or_b32_e32 v9, v9, v23
+; GFX11-NEXT: v_or_b32_e32 v10, v10, v98
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v25
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v96
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v29
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v112
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v103
+; GFX11-NEXT: v_perm_b32 v6, v7, v6, 0x5040100
+; GFX11-NEXT: v_perm_b32 v7, v9, v8, 0x5040100
+; GFX11-NEXT: v_perm_b32 v8, v11, v10, 0x5040100
+; GFX11-NEXT: v_perm_b32 v9, v13, v12, 0x5040100
+; GFX11-NEXT: v_perm_b32 v10, v15, v14, 0x5040100
+; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v86
+; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v82
+; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v97
+; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v85
+; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v99
+; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v64
+; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v69
+; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v66
+; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v70
+; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v67
+; GFX11-NEXT: v_or_b32_e32 v11, v11, v101
+; GFX11-NEXT: v_or_b32_e32 v12, v12, v100
+; GFX11-NEXT: v_or_b32_e32 v13, v13, v114
+; GFX11-NEXT: v_or_b32_e32 v14, v14, v102
+; GFX11-NEXT: v_or_b32_e32 v15, v15, v117
+; GFX11-NEXT: v_or_b32_e32 v16, v16, v113
+; GFX11-NEXT: v_or_b32_e32 v17, v17, v116
+; GFX11-NEXT: v_or_b32_e32 v18, v18, v115
+; GFX11-NEXT: v_or_b32_e32 v19, v19, v118
+; GFX11-NEXT: v_or_b32_e32 v20, v20, v119
+; GFX11-NEXT: v_perm_b32 v11, v12, v11, 0x5040100
+; GFX11-NEXT: v_perm_b32 v12, v14, v13, 0x5040100
+; GFX11-NEXT: v_perm_b32 v13, v16, v15, 0x5040100
+; GFX11-NEXT: v_perm_b32 v14, v18, v17, 0x5040100
+; GFX11-NEXT: v_perm_b32 v15, v20, v19, 0x5040100
+; GFX11-NEXT: ; implicit-def: $vgpr36
+; GFX11-NEXT: ; implicit-def: $vgpr37
+; GFX11-NEXT: ; implicit-def: $vgpr33
+; GFX11-NEXT: ; implicit-def: $vgpr38
+; GFX11-NEXT: ; implicit-def: $vgpr31
+; GFX11-NEXT: ; implicit-def: $vgpr35
+; GFX11-NEXT: ; implicit-def: $vgpr34
+; GFX11-NEXT: ; implicit-def: $vgpr32
+; GFX11-NEXT: ; implicit-def: $vgpr16
+; GFX11-NEXT: ; implicit-def: $vgpr18
+; GFX11-NEXT: ; implicit-def: $vgpr20
+; GFX11-NEXT: ; implicit-def: $vgpr22
+; GFX11-NEXT: ; implicit-def: $vgpr24
+; GFX11-NEXT: ; implicit-def: $vgpr26
+; GFX11-NEXT: ; implicit-def: $vgpr28
+; GFX11-NEXT: ; implicit-def: $vgpr30
+; GFX11-NEXT: ; implicit-def: $vgpr71
+; GFX11-NEXT: ; implicit-def: $vgpr65
+; GFX11-NEXT: ; implicit-def: $vgpr80
+; GFX11-NEXT: ; implicit-def: $vgpr68
+; GFX11-NEXT: ; implicit-def: $vgpr87
+; GFX11-NEXT: ; implicit-def: $vgpr84
+; GFX11-NEXT: ; implicit-def: $vgpr86
+; GFX11-NEXT: ; implicit-def: $vgpr82
+; GFX11-NEXT: ; implicit-def: $vgpr97
+; GFX11-NEXT: ; implicit-def: $vgpr85
+; GFX11-NEXT: ; implicit-def: $vgpr99
+; GFX11-NEXT: ; implicit-def: $vgpr64
+; GFX11-NEXT: ; implicit-def: $vgpr69
+; GFX11-NEXT: ; implicit-def: $vgpr66
+; GFX11-NEXT: ; implicit-def: $vgpr70
+; GFX11-NEXT: ; implicit-def: $vgpr67
+; GFX11-NEXT: ; implicit-def: $vgpr50
+; GFX11-NEXT: ; implicit-def: $vgpr51
+; GFX11-NEXT: ; implicit-def: $vgpr49
+; GFX11-NEXT: ; implicit-def: $vgpr39
+; GFX11-NEXT: ; implicit-def: $vgpr54
+; GFX11-NEXT: ; implicit-def: $vgpr48
+; GFX11-NEXT: ; implicit-def: $vgpr55
+; GFX11-NEXT: ; implicit-def: $vgpr52
+; GFX11-NEXT: ; implicit-def: $vgpr53
+; GFX11-NEXT: ; implicit-def: $vgpr17
+; GFX11-NEXT: ; implicit-def: $vgpr83
+; GFX11-NEXT: ; implicit-def: $vgpr81
+; GFX11-NEXT: ; implicit-def: $vgpr21
+; GFX11-NEXT: ;
implicit-def: $vgpr19 +; GFX11-NEXT: ; implicit-def: $vgpr27 +; GFX11-NEXT: ; implicit-def: $vgpr23 +; GFX11-NEXT: ; implicit-def: $vgpr98 +; GFX11-NEXT: ; implicit-def: $vgpr25 +; GFX11-NEXT: ; implicit-def: $vgpr96 +; GFX11-NEXT: ; implicit-def: $vgpr29 +; GFX11-NEXT: ; implicit-def: $vgpr112 +; GFX11-NEXT: ; implicit-def: $vgpr103 +; GFX11-NEXT: ; implicit-def: $vgpr101 +; GFX11-NEXT: ; implicit-def: $vgpr100 +; GFX11-NEXT: ; implicit-def: $vgpr114 +; GFX11-NEXT: ; implicit-def: $vgpr102 +; GFX11-NEXT: ; implicit-def: $vgpr117 +; GFX11-NEXT: ; implicit-def: $vgpr113 +; GFX11-NEXT: ; implicit-def: $vgpr116 +; GFX11-NEXT: ; implicit-def: $vgpr115 +; GFX11-NEXT: ; implicit-def: $vgpr118 +; GFX11-NEXT: ; implicit-def: $vgpr119 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB55_2 +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v70, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v67, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v69, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v66, 3 +; GFX11-NEXT: v_add_nc_u16 v5, v64, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_add_nc_u16 v6, v97, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v118, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v119, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v99, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_nc_u16 v15, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v64, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v116, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v5 +; GFX11-NEXT: v_or_b32_e32 v2, v115, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_add_nc_u16 v14, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v113, v1 +; GFX11-NEXT: v_add_nc_u16 v66, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v1, v114, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v117, v4 +; GFX11-NEXT: v_add_nc_u16 v2, v85, 3 +; GFX11-NEXT: v_add_nc_u16 v67, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_nc_u16 v0, v86, 3 +; GFX11-NEXT: v_add_nc_u16 v12, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v82, 3 +; GFX11-NEXT: v_add_nc_u16 v13, 0x300, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_add_nc_u16 v3, v87, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v84, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v102, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v101, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v100, v1 +; GFX11-NEXT: v_add_nc_u16 v69, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v112, v3 +; GFX11-NEXT: v_add_nc_u16 v11, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v103, v4 +; GFX11-NEXT: v_add_nc_u16 v70, 0x300, v1 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_add_nc_u16 v1, v80, 3 +; GFX11-NEXT: v_add_nc_u16 v10, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v68, 3 +; GFX11-NEXT: v_add_nc_u16 v68, 0x300, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u16 v0, v71, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v3, v65, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v4, v28, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v96, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v29, v2 +; GFX11-NEXT: 
v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v0, v98, v0 +; GFX11-NEXT: v_add_nc_u16 v9, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v25, v3 +; GFX11-NEXT: v_add_nc_u16 v25, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v27, v4 +; GFX11-NEXT: v_add_nc_u16 v8, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v30, 3 +; GFX11-NEXT: v_add_nc_u16 v27, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v24, 3 +; GFX11-NEXT: v_add_nc_u16 v7, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v26, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v3, v20, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_add_nc_u16 v4, v22, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v1, v21, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX11-NEXT: v_add_nc_u16 v19, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v83, v3 +; GFX11-NEXT: v_add_nc_u16 v6, 0x300, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v81, v4 +; GFX11-NEXT: v_add_nc_u16 v20, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v16, 3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v0, v34, 3 +; GFX11-NEXT: v_add_nc_u16 v16, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v31, 3 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_add_nc_u16 v3, v18, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_add_nc_u16 v4, v32, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v53, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_or_b32_e32 v0, v55, v0 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_or_b32_e32 v1, v54, v1 +; GFX11-NEXT: v_add_nc_u16 v18, 0x300, v2 +; GFX11-NEXT: v_or_b32_e32 v2, v17, v3 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v52, v4 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v1, v33, 3 +; GFX11-NEXT: v_add_nc_u16 v17, v36, 3 +; GFX11-NEXT: v_add_nc_u16 v21, v37, 3 +; GFX11-NEXT: v_add_nc_u16 v22, v38, 3 +; GFX11-NEXT: v_add_nc_u16 v23, v35, 3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v17 +; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v21 +; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v22 +; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v23 +; GFX11-NEXT: v_or_b32_e32 v1, v49, v1 +; GFX11-NEXT: v_or_b32_e32 v17, v50, v17 +; GFX11-NEXT: v_or_b32_e32 v21, v51, v21 +; GFX11-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-NEXT: v_or_b32_e32 v23, v48, v23 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v17, 0x300, v17 +; GFX11-NEXT: v_add_nc_u16 v21, 0x300, v21 +; GFX11-NEXT: v_add_nc_u16 v22, 0x300, v22 +; GFX11-NEXT: v_add_nc_u16 v23, 0x300, v23 +; GFX11-NEXT: v_add_nc_u16 v24, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v26, 0x300, v2 +; GFX11-NEXT: v_perm_b32 v0, v21, v17, 0x5040100 +; GFX11-NEXT: v_perm_b32 v1, v22, v1, 0x5040100 +; GFX11-NEXT: v_perm_b32 v2, v23, v4, 0x5040100 +; GFX11-NEXT: v_perm_b32 v3, v24, v3, 0x5040100 +; GFX11-NEXT: v_perm_b32 v4, v26, v18, 0x5040100 +; GFX11-NEXT: v_perm_b32 v5, v16, v5, 0x5040100 +; GFX11-NEXT: v_perm_b32 v6, v20, v6, 0x5040100 +; GFX11-NEXT: v_perm_b32 v7, v19, v7, 0x5040100 +; GFX11-NEXT: v_perm_b32 v8, v27, v8, 0x5040100 +; GFX11-NEXT: v_perm_b32 v9, v25, v9, 0x5040100 +; GFX11-NEXT: v_perm_b32 v10, v68, v10, 0x5040100 +; GFX11-NEXT: v_perm_b32 v11, v70, v11, 0x5040100 +; GFX11-NEXT: v_perm_b32 v12, v69, v12, 0x5040100 +; GFX11-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 +; GFX11-NEXT: 
v_perm_b32 v14, v66, v14, 0x5040100 +; GFX11-NEXT: v_perm_b32 v15, v64, v15, 0x5040100 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i8> %a, splat (i8 3) + %a2 = bitcast <64 x i8> %a1 to <32 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <64 x i8> %a to <32 x bfloat> + br label %end + +end: + %phi = phi <32 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <32 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll new file mode 100644 index 0000000000000..78f611d83b532 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v9i64_to_v18i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v9i64_to_v18i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v9i64_to_v18i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v9i64_to_v18i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 
%b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <9 x i64> %a, splat (i64 3) + %a2 = bitcast <9 x i64> %a1 to <18 x i32> + br label %end + +cmp.false: + %a3 = bitcast <9 x i64> %a to <18 x i32> + br label %end + +end: + %phi = phi <18 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <18 x i32> %phi +} + +define <9 x i64> @bitcast_v18i32_to_v9i64(<18 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v18i32_to_v9i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v18i32_to_v9i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v18i32_to_v9i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: 
v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v18i32_to_v9i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v18 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <18 x i32> %a, splat (i32 3) + %a2 = bitcast <18 x i32> %a1 to <9 x i64> + br label %end + +cmp.false: + %a3 = bitcast <18 x i32> %a to <9 x i64> + br label %end + +end: + %phi = phi <9 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <9 x i64> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll new file mode 100644 index 0000000000000..8d2501e42f2d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -0,0 +1,311 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v10i64_to_v20i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; 
GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v10i64_to_v20i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v10i64_to_v20i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: 
bitcast_v10i64_to_v20i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <10 x i64> %a, splat (i64 3) + %a2 = bitcast <10 x i64> %a1 to <20 x i32> + br label %end + +cmp.false: + %a3 = bitcast <10 x i64> %a to <20 x i32> + br label %end + +end: + %phi = phi <20 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <20 x i32> %phi +} + +define <10 x i64> @bitcast_v20i32_to_v10i64(<20 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v20i32_to_v10i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v20i32_to_v10i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v20i32_to_v10i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v20i32_to_v10i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v20 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: 
v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <20 x i32> %a, splat (i32 3) + %a2 = bitcast <20 x i32> %a1 to <10 x i64> + br label %end + +cmp.false: + %a3 = bitcast <20 x i32> %a to <10 x i64> + br label %end + +end: + %phi = phi <10 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <10 x i64> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll new file mode 100644 index 0000000000000..540888ab607b0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -0,0 +1,7969 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define double @bitcast_i64_to_f64(i64 %a, i32 %b) { +; GCN-LABEL: bitcast_i64_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to double + br label %end + +cmp.false: + %a3 = bitcast i64 %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + +define i64 @bitcast_f64_to_i64(double %a, i32 %b) { +; GCN-LABEL: bitcast_f64_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast double %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <2 x i32> @bitcast_i64_to_v2i32(i64 %a, i32 %b) { +; GCN-LABEL: bitcast_i64_to_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB2_2: ; %end 
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + +define i64 @bitcast_v2i32_to_i64(<2 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i32_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i32_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i32_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: 
v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i32_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <2 x float> @bitcast_i64_to_v2f32(i64 %a, i32 %b) { +; GCN-LABEL: bitcast_i64_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB4_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <2 x float> + br label %end + +end: + %phi = phi <2 x 
float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + +define i64 @bitcast_v2f32_to_i64(<2 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f32_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB5_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <4 x i16> @bitcast_i64_to_v4i16(i64 %a, i32 %b) { +; GCN-LABEL: bitcast_i64_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: .LBB6_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB6_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: .LBB6_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 
v2, v4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + +define i64 @bitcast_v4i16_to_i64(<4 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i16_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_4 +; GCN-NEXT: .LBB7_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB7_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: .LBB7_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: 
s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i16_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_e32 v2, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: v_add_u16_e32 v2, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <4 x half> @bitcast_i64_to_v4f16(i64 %a, i32 %b) { +; GCN-LABEL: bitcast_i64_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB8_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB8_4 +; GCN-NEXT: .LBB8_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB8_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: .LBB8_4: ; %cmp.true +; 
GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define i64 @bitcast_v4f16_to_i64(<4 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f16_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB9_4 +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB9_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz 
.LBB9_2 +; GCN-NEXT: .LBB9_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f16_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 0x200 +; VI-NEXT: v_add_f16_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <4 x bfloat> @bitcast_i64_to_v4bf16(i64 %a, i32 %b) { +; GCN-LABEL: bitcast_i64_to_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB10_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: .LBB10_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v4bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + +define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v4bf16_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_4 +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB11_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: .LBB11_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4bf16_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], 
vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4bf16_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <8 x i8> @bitcast_i64_to_v8i8(i64 %a, i32 %b) { +; GCN-LABEL: bitcast_i64_to_v8i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB12_4 +; GCN-NEXT: .LBB12_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB12_3: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: .LBB12_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_i64_to_v8i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_i64_to_v8i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_i64_to_v8i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB12_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: .LBB12_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, 
label %cmp.false + +cmp.true: + %a1 = add i64 %a, 3 + %a2 = bitcast i64 %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast i64 %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i8_to_i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_4 +; GCN-NEXT: .LBB13_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB13_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_2 +; GCN-NEXT: .LBB13_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB13_3 +; 
VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB13_4 +; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB13_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: .LBB13_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v9 +; VI-NEXT: v_add_u16_e32 v1, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i8_to_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB13_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB13_4 +; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB13_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: 
v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: .LBB13_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i8_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB13_4 +; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB13_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: .LBB13_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v9, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v2, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to i64 + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to i64 + br label %end + +end: + %phi = phi i64 [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret i64 %phi +} + +define <2 x i32> @bitcast_f64_to_v2i32(double %a, i32 %b) { +; GCN-LABEL: bitcast_f64_to_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v2i32: +; 
GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB14_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB14_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast double %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + +define double @bitcast_v2i32_to_f64(<2 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i32_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i32_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i32_to_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i32_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 
%cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + +define <2 x float> @bitcast_f64_to_v2f32(double %a, i32 %b) { +; GCN-LABEL: bitcast_f64_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_f64_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB16_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: .LBB16_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd double %a, 1.000000e+00 + %a2 = bitcast double %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast double %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + +define double @bitcast_v2f32_to_f64(<2 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f32_to_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_f64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to double + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to double + br label %end + +end: + %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret double %phi +} + +define <4 x i16> @bitcast_f64_to_v4i16(double %a, i32 %b) { +; GCN-LABEL: bitcast_f64_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NEXT: .LBB18_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 +; GCN-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; GCN-NEXT: .LBB18_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v4 +; GCN-NEXT: v_mov_b32_e32 v2, v5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_f64_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_f64_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f64 
v[0:1], v[0:1], 1.0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_f64_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB18_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB18_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd double %a, 1.000000e+00
+ %a2 = bitcast double %a1 to <4 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast double %a to <4 x i16>
+ br label %end
+
+end:
+ %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x i16> %phi
+}
+
+define double @bitcast_v4i16_to_f64(<4 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4i16_to_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB19_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB19_4
+; GCN-NEXT: .LBB19_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB19_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v4
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB19_2
+; GCN-NEXT: .LBB19_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_or_b32_e32 v0, v4, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4i16_to_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v3, 3
+; VI-NEXT: v_add_u16_e32 v2, 3, v1
+; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
+; VI-NEXT: v_add_u16_e32 v2, 3, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4i16_to_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4i16_to_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <4 x i16> %a, splat (i16 3)
+ %a2 = bitcast <4 x i16> %a1 to double
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x i16> %a to double
+ br label %end
+
+end:
+ %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret double %phi
+}
+
+define <4 x half> @bitcast_f64_to_v4f16(double %a, i32 %b) {
+; GCN-LABEL: bitcast_f64_to_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB20_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: .LBB20_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB20_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GCN-NEXT: .LBB20_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_f64_to_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_f64_to_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_f64_to_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB20_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB20_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd double %a, 1.000000e+00
+ %a2 = bitcast double %a1 to <4 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast double %a to <4 x half>
+ br label %end
+
+end:
+ %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x half> %phi
+}
+
+define double @bitcast_v4f16_to_f64(<4 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4f16_to_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB21_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB21_4
+; GCN-NEXT: .LBB21_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB21_3: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_or_b32_e32 v0, v4, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB21_2
+; GCN-NEXT: .LBB21_4: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4f16_to_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v2, 0x200
+; VI-NEXT: v_add_f16_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4f16_to_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4f16_to_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <4 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <4 x half> %a1 to double
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x half> %a to double
+ br label %end
+
+end:
+ %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret double %phi
+}
+
+define <4 x bfloat> @bitcast_f64_to_v4bf16(double %a, i32 %b) {
+; GCN-LABEL: bitcast_f64_to_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB22_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: .LBB22_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB22_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GCN-NEXT: .LBB22_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v5
+; GCN-NEXT: v_mov_b32_e32 v1, v4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_f64_to_v4bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_f64_to_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_f64_to_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB22_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
+; GFX11-NEXT: .LBB22_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd double %a, 1.000000e+00
+ %a2 = bitcast double %a1 to <4 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast double %a to <4 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x bfloat> %phi
+}
+
+define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4bf16_to_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB23_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB23_4
+; GCN-NEXT: .LBB23_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB23_3: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB23_2
+; GCN-NEXT: .LBB23_4: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4bf16_to_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB23_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: .LBB23_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4bf16_to_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB23_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s7
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7
+; GFX9-NEXT: .LBB23_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4bf16_to_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB23_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-NEXT: .LBB23_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0)
+ %a2 = bitcast <4 x bfloat> %a1 to double
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x bfloat> %a to double
+ br label %end
+
+end:
+ %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret double %phi
+}
+
+define <8 x i8> @bitcast_f64_to_v8i8(double %a, i32 %b) {
+; GCN-LABEL: bitcast_f64_to_v8i8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v9, v1
+; GCN-NEXT: v_mov_b32_e32 v8, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr7
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB24_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v3, v9, v8, 24
+; GCN-NEXT: v_alignbit_b32 v2, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v1, v9, v8, 8
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; GCN-NEXT: .LBB24_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB24_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GCN-NEXT: v_alignbit_b32 v3, v9, v8, 24
+; GCN-NEXT: v_alignbit_b32 v2, v9, v8, 16
+; GCN-NEXT: v_alignbit_b32 v1, v9, v8, 8
+; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; GCN-NEXT: .LBB24_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v4, v9
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_f64_to_v8i8:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v1
+; VI-NEXT: v_mov_b32_e32 v8, v0
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: ; implicit-def: $vgpr1
+; VI-NEXT: ; implicit-def: $vgpr2
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: ; implicit-def: $vgpr5
+; VI-NEXT: ; implicit-def: $vgpr6
+; VI-NEXT: ; implicit-def: $vgpr7
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.false
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; VI-NEXT: ; %bb.2: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB24_4
+; VI-NEXT: ; %bb.3: ; %cmp.true
+; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9]
+; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; VI-NEXT: .LBB24_4: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, v8
+; VI-NEXT: v_mov_b32_e32 v4, v9
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_f64_to_v8i8:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: ; implicit-def: $vgpr1
+; GFX9-NEXT: ; implicit-def: $vgpr2
+; GFX9-NEXT: ; implicit-def: $vgpr3
+; GFX9-NEXT: ; implicit-def: $vgpr5
+; GFX9-NEXT: ; implicit-def: $vgpr6
+; GFX9-NEXT: ; implicit-def: $vgpr7
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.false
+; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; GFX9-NEXT: ; %bb.2: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB24_4
+; GFX9-NEXT: ; %bb.3: ; %cmp.true
+; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; GFX9-NEXT: .LBB24_4: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v0, v8
+; GFX9-NEXT: v_mov_b32_e32 v4, v9
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_f64_to_v8i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NEXT: ; implicit-def: $vgpr1
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; implicit-def: $vgpr3
+; GFX11-NEXT: ; implicit-def: $vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr6
+; GFX11-NEXT: ; implicit-def: $vgpr7
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB24_4
+; GFX11-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9]
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8
+; GFX11-NEXT: .LBB24_4: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v8
+; GFX11-NEXT: v_mov_b32_e32 v4, v9
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd double %a, 1.000000e+00
+ %a2 = bitcast double %a1 to <8 x i8>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast double %a to <8 x i8>
+ br label %end
+
+end:
+ %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <8 x i8> %phi
+}
+
+define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
+; GCN-LABEL: bitcast_v8i8_to_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v9, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB25_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB25_4
+; GCN-NEXT: .LBB25_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB25_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6
+; GCN-NEXT: v_or_b32_e32 v0, v0, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v2, v2, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v3, v5, v4
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: ; implicit-def: $vgpr9
+; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr10
+; GCN-NEXT: ; implicit-def: $vgpr6
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB25_2
+; GCN-NEXT: .LBB25_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6
+; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GCN-NEXT: v_or_b32_e32 v0, v8, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_or_b32_e32 v2, v10, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2
+; GCN-NEXT: v_or_b32_e32 v3, v5, v4
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v2
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v8i8_to_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v9, v0
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1
+; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3
+; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7
+; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB25_3
+; VI-NEXT: ; %bb.1: ; %Flow
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execnz .LBB25_4
+; VI-NEXT: .LBB25_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+; VI-NEXT: .LBB25_3: ; %cmp.false
+; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT: ; implicit-def: $vgpr9
+; VI-NEXT: ; implicit-def: $vgpr10
+; VI-NEXT: ; implicit-def: $vgpr2
+; VI-NEXT: ; implicit-def: $vgpr8
+; VI-NEXT: ; implicit-def: $vgpr4
+; VI-NEXT: ; implicit-def: $vgpr5
+; VI-NEXT: ; implicit-def: $vgpr6
+; VI-NEXT: ; implicit-def: $vgpr3
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB25_2
+; VI-NEXT: .LBB25_4: ; %cmp.true
+; VI-NEXT: v_add_u16_e32 v0, 3, v9
+; VI-NEXT: v_add_u16_e32 v1, 3, v2
+; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_mov_b32_e32 v2, 0x300
+; VI-NEXT: v_add_u16_e32 v0, 0x300, v0
+; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_add_u16_e32 v1, 3, v4
+; VI-NEXT: v_add_u16_e32 v4, 3, v6
+; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_add_u16_e32 v1, 0x300, v1
+; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v8i8_to_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v9, v0
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v1
+; GFX9-NEXT: v_lshlrev_b16_e32 v8, 8, v3
+; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5
+; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7
+; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB25_3
+; GFX9-NEXT: ; %bb.1: ; %Flow
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execnz .LBB25_4
+; GFX9-NEXT: .LBB25_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX9-NEXT: .LBB25_3: ; %cmp.false
+; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT: ; implicit-def: $vgpr9
+; GFX9-NEXT: ; implicit-def: $vgpr10
+; GFX9-NEXT: ; implicit-def: $vgpr2
+; GFX9-NEXT: ; implicit-def: $vgpr8
+; GFX9-NEXT: ; implicit-def: $vgpr4
+; GFX9-NEXT: ; implicit-def: $vgpr5
+; GFX9-NEXT: ; implicit-def: $vgpr6
+; GFX9-NEXT: ; implicit-def: $vgpr3
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB25_2
+; GFX9-NEXT: .LBB25_4: ; %cmp.true
+; GFX9-NEXT: v_add_u16_e32 v0, 3, v9
+; GFX9-NEXT: v_add_u16_e32 v1, 3, v2
+; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: s_movk_i32 s6, 0x300
+; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0
+; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: v_add_u16_e32 v1, 3, v4
+; GFX9-NEXT: v_add_u16_e32 v2, 3, v6
+; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1
+; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v8i8_to_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v9, v0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX11-NEXT: v_lshlrev_b16 v8, 8, v1
+; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
+; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5
+; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB25_3
+; GFX11-NEXT: ; %bb.1: ; %Flow
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB25_4
+; GFX11-NEXT: .LBB25_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-NEXT: .LBB25_3: ; %cmp.false
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v9
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6
+; GFX11-NEXT: ; implicit-def: $vgpr9
+; GFX11-NEXT: ; implicit-def: $vgpr6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v8
+; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v2, v2, v5
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v7
+; GFX11-NEXT: ; implicit-def: $vgpr8
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: ; implicit-def: $vgpr5
+; GFX11-NEXT: ; implicit-def: $vgpr7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: ; implicit-def: $vgpr2
+; GFX11-NEXT: ; implicit-def: $vgpr3
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB25_2
+; GFX11-NEXT: .LBB25_4: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u16 v0, v9, 3
+; GFX11-NEXT: v_add_nc_u16 v1, v2, 3
+; GFX11-NEXT: v_add_nc_u16 v2, v4, 3
+; GFX11-NEXT: v_add_nc_u16 v4, v6, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v0, v8, v0
+; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT: v_or_b32_e32 v3, v7, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0
+; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2
+; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v3
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <8 x i8> %a, splat (i8 3)
+ %a2 = bitcast <8 x i8> %a1 to double
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <8 x i8> %a to double
+ br label %end
+
+end:
+ %phi = phi double [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret double %phi
+}
+
+define <2 x float> @bitcast_v2i32_to_v2f32(<2 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v2i32_to_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB26_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB26_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v2i32_to_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v2i32_to_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v2i32_to_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <2 x i32> %a, splat (i32 3)
+ %a2 = bitcast <2 x i32> %a1 to <2 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <2 x i32> %a to <2 x float>
+ br label %end
+
+end:
+ %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <2 x float> %phi
+}
+
+define <2 x i32> @bitcast_v2f32_to_v2i32(<2 x float> %a, i32 %b) {
+; GCN-LABEL: bitcast_v2f32_to_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB27_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GCN-NEXT: .LBB27_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v2f32_to_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_f32_e32 v1, 1.0, v1
+; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v2f32_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v2f32_to_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <2 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <2 x float> %a1 to <2 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <2 x float> %a to <2 x i32>
+ br label %end
+
+end:
+ %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <2 x i32> %phi
+}
+
+define <4 x i16> @bitcast_v2i32_to_v4i16(<2 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v2i32_to_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v4, v1
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB28_2
+; GCN-NEXT: ; %bb.1: ; %cmp.false
+; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GCN-NEXT: .LBB28_2: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB28_4
+; GCN-NEXT: ; %bb.3: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4
+; GCN-NEXT: .LBB28_4: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: v_mov_b32_e32 v2, v4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v2i32_to_v4i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v2i32_to_v4i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v2i32_to_v4i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <2 x i32> %a, splat (i32 3)
+ %a2 = bitcast <2 x i32> %a1 to <4 x i16>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <2 x i32> %a to <4 x i16>
+ br label %end
+
+end:
+ %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x i16> %phi
+}
+
+define <2 x i32> @bitcast_v4i16_to_v2i32(<4 x i16> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4i16_to_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB29_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB29_4
+; GCN-NEXT: .LBB29_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB29_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v4
+; GCN-NEXT: v_or_b32_e32 v1, v1, v3
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB29_2
+; GCN-NEXT: .LBB29_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GCN-NEXT: v_or_b32_e32 v0, v4, v0
+; GCN-NEXT: v_or_b32_e32 v1, v3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4i16_to_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v3, 3
+; VI-NEXT: v_add_u16_e32 v2, 3, v1
+; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v1, v2, v1
+; VI-NEXT: v_add_u16_e32 v2, 3, v0
+; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_or_b32_e32 v0, v2, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4i16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4i16_to_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <4 x i16> %a, splat (i16 3)
+ %a2 = bitcast <4 x i16> %a1 to <2 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x i16> %a to <2 x i32>
+ br label %end
+
+end:
+ %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <2 x i32> %phi
+}
+
+define <4 x half> @bitcast_v2i32_to_v4f16(<2 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v2i32_to_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB30_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB30_4
+; GCN-NEXT: .LBB30_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB30_3: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB30_2
+; GCN-NEXT: .LBB30_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v2i32_to_v4f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v2i32_to_v4f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v2i32_to_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <2 x i32> %a, splat (i32 3)
+ %a2 = bitcast <2 x i32> %a1 to <4 x half>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <2 x i32> %a to <4 x half>
+ br label %end
+
+end:
+ %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x half> %phi
+}
+
+define <2 x i32> @bitcast_v4f16_to_v2i32(<4 x half> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4f16_to_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB31_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB31_4
+; GCN-NEXT: .LBB31_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB31_3: ; %cmp.false
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_or_b32_e32 v0, v4, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v1
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB31_2
+; GCN-NEXT: .LBB31_4: ; %cmp.true
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4
+; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
+; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
+; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_or_b32_e32 v0, v1, v0
+; GCN-NEXT: v_or_b32_e32 v1, v2, v3
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4f16_to_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_mov_b32_e32 v2, 0x200
+; VI-NEXT: v_add_f16_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
+; VI-NEXT: v_add_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
+; VI-NEXT: v_or_b32_e32 v1, v1, v3
+; VI-NEXT: v_or_b32_e32 v0, v0, v2
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4f16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: s_movk_i32 s6, 0x200
+; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4f16_to_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
+; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <4 x half> %a, splat (half 0xH0200)
+ %a2 = bitcast <4 x half> %a1 to <2 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <4 x half> %a to <2 x i32>
+ br label %end
+
+end:
+ %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <2 x i32> %phi
+}
+
+define <4 x bfloat> @bitcast_v2i32_to_v4bf16(<2 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v2i32_to_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v5, v1
+; GCN-NEXT: v_mov_b32_e32 v4, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT: ; implicit-def: $vgpr0
+; GCN-NEXT: ; implicit-def: $vgpr1
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB32_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB32_4
+; GCN-NEXT: .LBB32_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB32_3: ; %cmp.false
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB32_2
+; GCN-NEXT: .LBB32_4: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v5
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v2i32_to_v4bf16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: ; %bb.2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v2i32_to_v4bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: ; %bb.2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v2i32_to_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: ; %bb.2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <2 x i32> %a, splat (i32 3)
+ %a2 = bitcast <2 x i32> %a1 to <4 x bfloat>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <2 x i32> %a to <4 x bfloat>
+ br label %end
+
+end:
+ %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <4 x bfloat> %phi
+}
+
+define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) {
+; GCN-LABEL: bitcast_v4bf16_to_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB33_3
+; GCN-NEXT: ; %bb.1: ; %Flow
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execnz .LBB33_4
+; GCN-NEXT: .LBB33_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN-NEXT: .LBB33_3: ; %cmp.false
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16
+; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; GCN-NEXT: ; implicit-def: $vgpr4
+; GCN-NEXT: ; implicit-def: $vgpr5
+; GCN-NEXT: ; implicit-def: $vgpr2
+; GCN-NEXT: ; implicit-def: $vgpr3
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB33_2
+; GCN-NEXT: .LBB33_4: ; %cmp.true
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v4bf16_to_v2i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB33_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
+; VI-NEXT: s_movk_i32 s6, 0x7fff
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16
+; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
+; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
+; VI-NEXT: .LBB33_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v4bf16_to_v2i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB33_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s6, 0x7fff
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX9-NEXT: s_mov_b32 s7, 0x7060302
+; GFX9-NEXT: v_perm_b32 v0, v0, v2, s7
+; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7
+; GFX9-NEXT: .LBB33_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v4bf16_to_v2i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB33_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0
+; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo
+; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302
+; GFX11-NEXT: .LBB33_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + +define <8 x i8> @bitcast_v2i32_to_v8i8(<2 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v2i32_to_v8i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB34_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB34_4 +; GCN-NEXT: .LBB34_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB34_3: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB34_2 +; GCN-NEXT: .LBB34_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2i32_to_v8i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2i32_to_v8i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2i32_to_v8i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB34_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: .LBB34_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <2 x i32> %a, splat (i32 3) + %a2 = bitcast <2 x i32> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x i32> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i8_to_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; 
GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB35_4 +; GCN-NEXT: .LBB35_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB35_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB35_2 +; GCN-NEXT: .LBB35_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB35_4 +; VI-NEXT: .LBB35_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB35_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB35_2 +; VI-NEXT: .LBB35_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v9 +; VI-NEXT: v_add_u16_e32 v1, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i8_to_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB35_4 +; GFX9-NEXT: .LBB35_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB35_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; 
GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB35_2 +; GFX9-NEXT: .LBB35_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i8_to_v2i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB35_4 +; GFX11-NEXT: .LBB35_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB35_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB35_2 +; GFX11-NEXT: .LBB35_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v9, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v2, 3 +; GFX11-NEXT: 
v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <2 x i32> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <2 x i32> + br label %end + +end: + %phi = phi <2 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x i32> %phi +} + +define <4 x i16> @bitcast_v2f32_to_v4i16(<2 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f32_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: .LBB36_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB36_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: .LBB36_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
bitcast_v2f32_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + +define <2 x float> @bitcast_v4i16_to_v2f32(<4 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i16_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB37_4 +; GCN-NEXT: .LBB37_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB37_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB37_2 +; GCN-NEXT: .LBB37_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i16_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_e32 v2, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
VI-NEXT: v_add_u16_e32 v2, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i16_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + +define <4 x half> @bitcast_v2f32_to_v4f16(<2 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f32_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB38_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB38_4 +; GCN-NEXT: .LBB38_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB38_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB38_2 +; GCN-NEXT: .LBB38_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; 
VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define <2 x float> @bitcast_v4f16_to_v2f32(<4 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f16_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB39_4 +; GCN-NEXT: .LBB39_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB39_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v4, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB39_2 +; GCN-NEXT: .LBB39_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f16_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 0x200 +; VI-NEXT: v_add_f16_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + +define <4 x bfloat> @bitcast_v2f32_to_v4bf16(<2 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f32_to_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, v1 +; GCN-NEXT: v_mov_b32_e32 v4, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB40_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB40_4 +; GCN-NEXT: .LBB40_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB40_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: ; 
implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB40_2 +; GCN-NEXT: .LBB40_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v4bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + +define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v4bf16_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB41_4 +; GCN-NEXT: .LBB41_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB41_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v0, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: 
; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB41_2 +; GCN-NEXT: .LBB41_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4bf16_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB41_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v1, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: .LBB41_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB41_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 
v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s7 +; GFX9-NEXT: .LBB41_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4bf16_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB41_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc_lo +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x7060302 +; GFX11-NEXT: .LBB41_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <2 
x float> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + +define <8 x i8> @bitcast_v2f32_to_v8i8(<2 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v2f32_to_v8i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB42_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB42_4 +; GCN-NEXT: .LBB42_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB42_3: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB42_2 +; GCN-NEXT: .LBB42_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v2f32_to_v8i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v2f32_to_v8i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: 
$vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v9, 1.0, v9 +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v2f32_to_v8i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB42_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v9, 1.0, v9 :: v_dual_add_f32 v8, 1.0, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: .LBB42_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <2 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <2 x float> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <2 x float> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i8_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: 
v_lshlrev_b32_e32 v10, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB43_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB43_4 +; GCN-NEXT: .LBB43_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB43_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB43_2 +; GCN-NEXT: .LBB43_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v10, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_4 +; VI-NEXT: .LBB43_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB43_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, 
v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB43_2 +; VI-NEXT: .LBB43_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v9 +; VI-NEXT: v_add_u16_e32 v1, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i8_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_4 +; GFX9-NEXT: .LBB43_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB43_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB43_2 +; GFX9-NEXT: 
.LBB43_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i8_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB43_4 +; GFX11-NEXT: .LBB43_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB43_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB43_2 +; GFX11-NEXT: .LBB43_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v9, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v2, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <2 x float> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <2 x float> + br label %end + +end: + %phi = phi <2 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <2 x float> %phi +} + +define <4 x half> @bitcast_v4i16_to_v4f16(<4 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i16_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v3 +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mov_b32_e32 v7, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB44_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB44_4 +; GCN-NEXT: .LBB44_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB44_3: ; %cmp.false +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB44_2 +; GCN-NEXT: .LBB44_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i16_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_add_u16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i16_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define <4 x i16> @bitcast_v4f16_to_v4i16(<4 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f16_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB45_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 
+; GCN-NEXT: v_or_b32_e32 v2, v2, v4 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: .LBB45_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f16_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + +define <4 x bfloat> @bitcast_v4i16_to_v4bf16(<4 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i16_to_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v6, v2 +; GCN-NEXT: v_mov_b32_e32 v5, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB46_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB46_4 +; GCN-NEXT: .LBB46_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB46_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB46_2 +; GCN-NEXT: .LBB46_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v5 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v3, v0 +; GCN-NEXT: v_or_b32_e32 v1, v1, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i16_to_v4bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_add_u16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i16_to_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + +define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v4bf16_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 
s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB47_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB47_4 +; GCN-NEXT: .LBB47_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB47_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB47_2 +; GCN-NEXT: .LBB47_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4bf16_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB47_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: .LBB47_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v4i16: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB47_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s6 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: .LBB47_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4bf16_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB47_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, 
vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-NEXT: .LBB47_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + +define <8 x i8> @bitcast_v4i16_to_v8i8(<4 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v4i16_to_v8i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v3 +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v10 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB48_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB48_4 +; GCN-NEXT: .LBB48_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB48_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v11 +; GCN-NEXT: v_or_b32_e32 v4, v1, v12 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_bfe_u32 v7, v10, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB48_2 +; GCN-NEXT: .LBB48_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_or_b32_e32 v0, v11, v0 +; GCN-NEXT: v_or_b32_e32 v1, v12, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v1 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4i16_to_v8i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_mov_b32_e32 v8, v1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: ; %bb.2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB48_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v2, 3 +; VI-NEXT: v_add_u16_sdwa v6, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; VI-NEXT: v_add_u16_e32 v9, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v8, v1 +; VI-NEXT: v_or_b32_e32 v0, v9, v0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: .LBB48_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v9 +; VI-NEXT: v_mov_b32_e32 v1, v4 +; VI-NEXT: v_mov_b32_e32 v4, v8 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4i16_to_v8i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4i16_to_v8i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; 
implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB48_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: .LBB48_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <4 x i16> %a, splat (i16 3) + %a2 = bitcast <4 x i16> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x i16> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i8_to_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v2 +; GCN-NEXT: v_mov_b32_e32 v10, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v1 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB49_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB49_4 +; GCN-NEXT: .LBB49_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB49_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v3, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-NEXT: v_or_b32_e32 v1, v5, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v2, v0, v4 +; GCN-NEXT: v_or_b32_e32 v0, v3, v1 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; 
implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB49_2 +; GCN-NEXT: .LBB49_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v8, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v11, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v5, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_4 +; VI-NEXT: .LBB49_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB49_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB49_2 +; VI-NEXT: .LBB49_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v9 +; VI-NEXT: v_add_u16_e32 v1, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: 
v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i8_to_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_4 +; GFX9-NEXT: .LBB49_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB49_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB49_2 +; GFX9-NEXT: .LBB49_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i8_to_v4i16: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB49_4 +; GFX11-NEXT: .LBB49_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB49_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB49_2 +; GFX11-NEXT: .LBB49_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v9, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v2, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x i16> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x i16> + br label %end + +end: + %phi = phi <4 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x i16> %phi +} + +define <4 x bfloat> @bitcast_v4f16_to_v4bf16(<4 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f16_to_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB50_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB50_4 +; GCN-NEXT: .LBB50_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB50_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB50_2 +; GCN-NEXT: .LBB50_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f16_to_v4bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v4, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v4, v1 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; 
VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} + +define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v4bf16_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB51_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB51_4 +; GCN-NEXT: .LBB51_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB51_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB51_2 +; GCN-NEXT: .LBB51_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, 
v0 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4bf16_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_bfe_u32 v3, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 
+; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s6 +; GFX9-NEXT: v_perm_b32 v1, v2, v1, s6 +; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4bf16_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v2 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x7060302 +; GFX11-NEXT: .LBB51_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define <8 x i8> @bitcast_v4f16_to_v8i8(<4 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v4f16_to_v8i8: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB52_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB52_4 +; GCN-NEXT: .LBB52_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB52_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: v_or_b32_e32 v0, v9, v0 +; GCN-NEXT: v_or_b32_e32 v4, v8, v1 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB52_2 +; GCN-NEXT: .LBB52_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v4, v2, v3 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4f16_to_v8i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB52_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v0, 0x200 +; VI-NEXT: v_add_f16_sdwa v6, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v2, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v6 +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v1, v9, v1 +; VI-NEXT: v_or_b32_e32 v0, v8, v0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: .LBB52_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4f16_to_v8i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB52_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: .LBB52_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4f16_to_v8i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB52_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; 
GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: .LBB52_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x half> %a, splat (half 0xH0200) + %a2 = bitcast <4 x half> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x half> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i8_to_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v2 +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB53_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB53_4 +; GCN-NEXT: .LBB53_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB53_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_or_b32_e32 v0, v0, v12 +; GCN-NEXT: v_or_b32_e32 v1, v1, v11 +; GCN-NEXT: v_or_b32_e32 v2, v2, v8 +; GCN-NEXT: v_or_b32_e32 v3, v3, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB53_2 +; GCN-NEXT: .LBB53_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v5, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v1 +; GCN-NEXT: v_or_b32_e32 v2, v11, v2 +; GCN-NEXT: v_or_b32_e32 v3, v12, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v0 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 0x300, v1 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_v4f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 
v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB53_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB53_4 +; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB53_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: .LBB53_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v9 +; VI-NEXT: v_add_u16_e32 v1, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i8_to_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_4 +; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB53_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: .LBB53_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i8_to_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB53_4 +; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB53_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: 
$vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: .LBB53_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v9, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v2, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x half> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x half> + br label %end + +end: + %phi = phi <4 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x half> %phi +} + +define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v4bf16_to_v8i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; 
implicit-def: $vgpr7 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB54_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB54_4 +; GCN-NEXT: .LBB54_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB54_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GCN-NEXT: v_alignbit_b32 v0, v0, v10, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v8, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB54_2 +; GCN-NEXT: .LBB54_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v7, 0x40c00000, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v2, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v4bf16_to_v8i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; VI-NEXT: ; %bb.2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB54_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v9 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; VI-NEXT: 
v_lshrrev_b32_e32 v6, 16, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v3, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v3 +; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v1, v6, v2, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: .LBB54_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v8 +; VI-NEXT: v_mov_b32_e32 v4, v9 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v4bf16_to_v8i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX9-NEXT: ; %bb.2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB54_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v10, v1, v0, s7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_perm_b32 v11, v1, v2, s7 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; GFX9-NEXT: .LBB54_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v8 +; GFX9-NEXT: v_mov_b32_e32 v4, v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v4bf16_to_v8i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v8 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB54_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v8 +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v3, 16, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GFX11-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v6, v7 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v8, v8, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_perm_b32 v10, v1, v0, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc_lo +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX11-NEXT: v_perm_b32 v11, v2, v3, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[10:11] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; GFX11-NEXT: .LBB54_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v8 +; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <4 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <4 x bfloat> %a1 to <8 x i8> + br label %end + +cmp.false: + %a3 = bitcast <4 x bfloat> %a to <8 x i8> + br label %end + +end: + %phi = phi <8 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <8 x i8> %phi +} + +define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v8i8_to_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v10, v1 +; GCN-NEXT: v_mov_b32_e32 v9, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v3, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v8, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v7, v4 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: .LBB55_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB55_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v10 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v11, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v7, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; GCN-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_add_i32_e32 
v2, vcc, 0x3000000, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GCN-NEXT: .LBB55_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, v5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v8i8_to_v4bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; VI-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB55_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB55_4 +; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB55_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB55_2 +; VI-NEXT: .LBB55_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v9 +; VI-NEXT: v_add_u16_e32 v1, 3, v2 +; VI-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v8i8_to_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v8, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB55_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB55_4 +; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB55_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB55_2 +; GFX9-NEXT: .LBB55_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v9 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v8i8_to_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v9, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_lshlrev_b16 v8, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB55_4 +; GFX11-NEXT: .LBB55_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB55_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v9 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 
0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX11-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB55_2 +; GFX11-NEXT: .LBB55_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v9, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v2, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <8 x i8> %a, splat (i8 3) + %a2 = bitcast <8 x i8> %a1 to <4 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <8 x i8> %a to <4 x bfloat> + br label %end + +end: + %phi = phi <4 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <4 x bfloat> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll new 
file mode 100644
index 0000000000000..702c1d05a0e3e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll
@@ -0,0 +1,328 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+
+define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) {
+; GCN-LABEL: bitcast_v11i64_to_v22i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB0_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: .LBB0_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v11i64_to_v22i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB0_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20
+; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18
+; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16
+; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: .LBB0_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v11i64_to_v22i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB0_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20
+; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18
+; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16
+; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: .LBB0_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v11i64_to_v22i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB0_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB0_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <11 x i64> %a, splat (i64 3)
+ %a2 = bitcast <11 x i64> %a1 to <22 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <11 x i64> %a to <22 x i32>
+ br label %end
+
+end:
+ %phi = phi <22 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <22 x i32> %phi
+}
+
+define <11 x i64> @bitcast_v22i32_to_v11i64(<22 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v22i32_to_v11i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB1_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v22i32_to_v11i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB1_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21
+; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20
+; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19
+; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18
+; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17
+; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16
+; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB1_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v22i32_to_v11i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB1_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
+; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
+; GFX9-NEXT: v_add_u32_e32 v19, 3, v19
+; GFX9-NEXT: v_add_u32_e32 v18, 3, v18
+; GFX9-NEXT: v_add_u32_e32 v17, 3, v17
+; GFX9-NEXT: v_add_u32_e32 v16, 3, v16
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: v_add_u32_e32 v14, 3, v14
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: .LBB1_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v22i32_to_v11i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v22
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB1_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB1_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <22 x i32> %a, splat (i32 3)
+ %a2 = bitcast <22 x i32> %a1 to <11 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <22 x i32> %a to <11 x i64>
+ br label %end
+
+end:
+ %phi = phi <11 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <11 x i64> %phi
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
new file mode 100644
index 0000000000000..182c63502d77b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -0,0 +1,344 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+
+define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) {
+; GCN-LABEL: bitcast_v12i64_to_v24i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB0_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT: .LBB0_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v12i64_to_v24i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB0_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22
+; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc
+; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20
+; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc
+; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18
+; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16
+; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: .LBB0_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v12i64_to_v24i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB0_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22
+; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20
+; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18
+; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16
+; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
+; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
+; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
+; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
+; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
+; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: .LBB0_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v12i64_to_v24i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB0_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: .LBB0_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <12 x i64> %a, splat (i64 3)
+ %a2 = bitcast <12 x i64> %a1 to <24 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <12 x i64> %a to <24 x i32>
+ br label %end
+
+end:
+ %phi = phi <24 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <24 x i32> %phi
+}
+
+define <12 x i64> @bitcast_v24i32_to_v12i64(<24 x i32> %a, i32 %b) {
+; GCN-LABEL: bitcast_v24i32_to_v12i64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: ; %bb.1: ; %cmp.true
+; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23
+; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22
+; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21
+; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
+; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19
+; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18
+; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17
+; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16
+; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
+; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14
+; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
+; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
+; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11
+; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10
+; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9
+; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
+; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7
+; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6
+; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5
+; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
+; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
+; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
+; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
+; GCN-NEXT: .LBB1_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: bitcast_v24i32_to_v12i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; VI-NEXT: s_cbranch_execz .LBB1_2
+; VI-NEXT: ; %bb.1: ; %cmp.true
+; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23
+; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22
+; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21
+; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20
+; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19
+; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18
+; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17
+; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16
+; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15
+; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
+; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13
+; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
+; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11
+; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
+; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9
+; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
+; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7
+; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
+; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5
+; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1
+; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
+; VI-NEXT: .LBB1_2: ; %end
+; VI-NEXT: s_or_b64 exec, exec, s[4:5]
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: bitcast_v24i32_to_v12i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX9-NEXT: s_cbranch_execz .LBB1_2
+; GFX9-NEXT: ; %bb.1: ; %cmp.true
+; GFX9-NEXT: v_add_u32_e32 v23, 3, v23
+; GFX9-NEXT: v_add_u32_e32 v22, 3, v22
+; GFX9-NEXT: v_add_u32_e32 v21, 3, v21
+; GFX9-NEXT: v_add_u32_e32 v20, 3, v20
+; GFX9-NEXT: v_add_u32_e32 v19, 3, v19
+; GFX9-NEXT: v_add_u32_e32 v18, 3, v18
+; GFX9-NEXT: v_add_u32_e32 v17, 3, v17
+; GFX9-NEXT: v_add_u32_e32 v16, 3, v16
+; GFX9-NEXT: v_add_u32_e32 v15, 3, v15
+; GFX9-NEXT: v_add_u32_e32 v14, 3, v14
+; GFX9-NEXT: v_add_u32_e32 v13, 3, v13
+; GFX9-NEXT: v_add_u32_e32 v12, 3, v12
+; GFX9-NEXT: v_add_u32_e32 v11, 3, v11
+; GFX9-NEXT: v_add_u32_e32 v10, 3, v10
+; GFX9-NEXT: v_add_u32_e32 v9, 3, v9
+; GFX9-NEXT: v_add_u32_e32 v8, 3, v8
+; GFX9-NEXT: v_add_u32_e32 v7, 3, v7
+; GFX9-NEXT: v_add_u32_e32 v6, 3, v6
+; GFX9-NEXT: v_add_u32_e32 v5, 3, v5
+; GFX9-NEXT: v_add_u32_e32 v4, 3, v4
+; GFX9-NEXT: v_add_u32_e32 v3, 3, v3
+; GFX9-NEXT: v_add_u32_e32 v2, 3, v2
+; GFX9-NEXT: v_add_u32_e32 v1, 3, v1
+; GFX9-NEXT: v_add_u32_e32 v0, 3, v0
+; GFX9-NEXT: .LBB1_2: ; %end
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: bitcast_v24i32_to_v12i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v24
+; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB1_2
+; GFX11-NEXT: ; %bb.1: ; %cmp.true
+; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23
+; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22
+; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21
+; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20
+; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19
+; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18
+; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17
+; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15
+; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14
+; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13
+; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11
+; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9
+; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0
+; GFX11-NEXT: .LBB1_2: ; %end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <24 x i32> %a, splat (i32 3)
+ %a2 = bitcast <24 x i32> %a1 to <12 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <24 x i32> %a to <12 x i64>
+ br label %end
+
+end:
+ %phi = phi <12 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <12 x i64> %phi
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll new file mode 100644 index 0000000000000..9869dca91b4d4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -0,0 +1,361 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v13i64_to_v26i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v13i64_to_v26i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 
v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v13i64_to_v26i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v13i64_to_v26i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: 
v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <13 x i64> %a, splat (i64 3) + %a2 = bitcast <13 x i64> %a1 to <26 x i32> + br label %end + +cmp.false: + %a3 = bitcast <13 x i64> %a to <26 x i32> + br label %end + +end: + %phi = phi <26 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <26 x i32> %phi +} + +define <13 x i64> @bitcast_v26i32_to_v13i64(<26 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v26i32_to_v13i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v26i32_to_v13i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 
v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v26i32_to_v13i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v26i32_to_v13i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v26 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 
v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <26 x i32> %a, splat (i32 3) + %a2 = bitcast <26 x i32> %a1 to <13 x i64> + br label %end + +cmp.false: + %a3 = bitcast <26 x i32> %a to <13 x i64> + br label %end + +end: + %phi = phi <13 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <13 x i64> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll new file mode 100644 index 0000000000000..9f1a9c8dc89c3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -0,0 +1,377 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v14i64_to_v28i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB0_2: ; %end 
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v14i64_to_v28i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v14i64_to_v28i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v14i64_to_v28i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <14 x i64> %a, splat (i64 3) + %a2 = bitcast <14 x i64> %a1 to <28 x i32> + br label %end + +cmp.false: + %a3 = bitcast <14 x i64> %a to <28 x i32> + br label %end + +end: + %phi = phi <28 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <28 x i32> %phi +} + +define <14 x i64> @bitcast_v28i32_to_v14i64(<28 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v28i32_to_v14i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; 
GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v28i32_to_v14i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v28i32_to_v14i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v28 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, 
v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v28i32_to_v14i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v28 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <28 x i32> %a, splat (i32 3) + %a2 = bitcast <28 x i32> %a1 to <14 x i64> + br label %end + +cmp.false: + %a3 = bitcast <28 x i32> %a to <14 x i64> + br label %end + +end: + %phi = phi <14 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <14 x i64> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll new file mode 100644 index 0000000000000..80e5a18631189 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -0,0 +1,394 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc 
-mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { +; GCN-LABEL: bitcast_v15i64_to_v30i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v15i64_to_v30i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB0_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_addc_u32_e32 v27, vcc, 0, v27, vcc +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_addc_u32_e32 v25, vcc, 0, v25, vcc +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_addc_u32_e32 v21, vcc, 0, v21, vcc +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_addc_u32_e32 v17, vcc, 0, v17, vcc +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; VI-NEXT: v_add_u32_e32 v6, 
vcc, 3, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: .LBB0_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v15i64_to_v30i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB0_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_co_u32_e32 v28, vcc, 3, v28 +; GFX9-NEXT: v_addc_co_u32_e32 v29, vcc, 0, v29, vcc +; GFX9-NEXT: v_add_co_u32_e32 v26, vcc, 3, v26 +; GFX9-NEXT: v_addc_co_u32_e32 v27, vcc, 0, v27, vcc +; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 3, v24 +; GFX9-NEXT: v_addc_co_u32_e32 v25, vcc, 0, v25, vcc +; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 3, v22 +; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v23, vcc +; GFX9-NEXT: v_add_co_u32_e32 v20, vcc, 3, v20 +; GFX9-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v21, vcc +; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 3, v18 +; GFX9-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc +; GFX9-NEXT: v_add_co_u32_e32 v16, vcc, 3, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: .LBB0_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v15i64_to_v30i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB0_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, v28, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v29, vcc_lo +; GFX11-NEXT: v_add_co_u32 v26, vcc_lo, v26, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, v24, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v25, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v22, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v23, vcc_lo +; GFX11-NEXT: v_add_co_u32 v20, vcc_lo, v20, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v21, vcc_lo +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, v18, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, 0, v19, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v16, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, 0, v17, vcc_lo +; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX11-NEXT: .LBB0_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <15 x i64> %a, splat (i64 3) + %a2 = bitcast <15 x i64> %a1 to <30 x i32> + br label %end + +cmp.false: + %a3 = bitcast <15 x i64> %a to <30 x i32> + br label %end + +end: + %phi = phi <30 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <30 x i32> %phi +} + +define <15 x i64> @bitcast_v30i32_to_v15i64(<30 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v30i32_to_v15i64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v23 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v19 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v11 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; 
GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v30i32_to_v15i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB1_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v29, vcc, 3, v29 +; VI-NEXT: v_add_u32_e32 v28, vcc, 3, v28 +; VI-NEXT: v_add_u32_e32 v27, vcc, 3, v27 +; VI-NEXT: v_add_u32_e32 v26, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v25, vcc, 3, v25 +; VI-NEXT: v_add_u32_e32 v24, vcc, 3, v24 +; VI-NEXT: v_add_u32_e32 v23, vcc, 3, v23 +; VI-NEXT: v_add_u32_e32 v22, vcc, 3, v22 +; VI-NEXT: v_add_u32_e32 v21, vcc, 3, v21 +; VI-NEXT: v_add_u32_e32 v20, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v19, vcc, 3, v19 +; VI-NEXT: v_add_u32_e32 v18, vcc, 3, v18 +; VI-NEXT: v_add_u32_e32 v17, vcc, 3, v17 +; VI-NEXT: v_add_u32_e32 v16, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v15, vcc, 3, v15 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12 +; VI-NEXT: v_add_u32_e32 v11, vcc, 3, v11 +; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10 +; VI-NEXT: v_add_u32_e32 v9, vcc, 3, v9 +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v7 +; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6 +; VI-NEXT: v_add_u32_e32 v5, vcc, 3, v5 +; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: .LBB1_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v30i32_to_v15i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v30 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB1_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v29, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v28, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v27, 3, v27 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 +; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v17, 3, v17 +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: v_add_u32_e32 v15, 3, v15 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_add_u32_e32 v12, 3, v12 +; GFX9-NEXT: v_add_u32_e32 v11, 3, v11 +; GFX9-NEXT: v_add_u32_e32 v10, 3, v10 +; GFX9-NEXT: v_add_u32_e32 v9, 3, v9 +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v7, 3, v7 +; GFX9-NEXT: v_add_u32_e32 v6, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v5, 3, v5 +; GFX9-NEXT: v_add_u32_e32 v4, 3, v4 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; 
GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: .LBB1_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v30i32_to_v15i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v30 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB1_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v29, 3, v29 +; GFX11-NEXT: v_add_nc_u32_e32 v28, 3, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v27, 3, v27 +; GFX11-NEXT: v_add_nc_u32_e32 v26, 3, v26 +; GFX11-NEXT: v_add_nc_u32_e32 v25, 3, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v24, 3, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v23, 3, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v22, 3, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 3, v21 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 3, v20 +; GFX11-NEXT: v_add_nc_u32_e32 v19, 3, v19 +; GFX11-NEXT: v_add_nc_u32_e32 v18, 3, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v17, 3, v17 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 3, v16 +; GFX11-NEXT: v_add_nc_u32_e32 v15, 3, v15 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 3, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 3, v11 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 3, v10 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 3, v9 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 3, v7 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 3, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 3, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 3, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 3, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: .LBB1_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <30 x i32> %a, splat (i32 3) + %a2 = bitcast <30 x i32> %a1 to <15 x i64> + br label %end + +cmp.false: + %a3 = bitcast <30 x i32> %a to <15 x i64> + br label %end + +end: + %phi = phi <15 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <15 x i64> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll new file mode 100644 index 0000000000000..7a3609e29a0c5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -0,0 +1,6092 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i32_to_v3f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; 
GCN-NEXT: .LBB0_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i32_to_v3f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i32_to_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i32_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + +define <3 x i32> @bitcast_v3f32_to_v3i32(<3 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f32_to_v3i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: .LBB1_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + +define <12 x i8> @bitcast_v3i32_to_v12i8(<3 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i32_to_v12i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB2_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB2_4 +; GCN-NEXT: .LBB2_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB2_3: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB2_2 +; GCN-NEXT: .LBB2_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i32_to_v12i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 
v14, v1 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB2_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: .LBB2_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB2_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8 +; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14 +; VI-NEXT: v_add_u32_e32 v13, vcc, 3, v13 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: .LBB2_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v13 +; VI-NEXT: v_mov_b32_e32 v4, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i32_to_v12i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: .LBB2_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v8, 3, v8 +; GFX9-NEXT: v_add_u32_e32 v14, 3, v14 +; GFX9-NEXT: v_add_u32_e32 v13, 3, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 
16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: .LBB2_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i32_to_v12i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_mov_b32_e32 v14, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-NEXT: .LBB2_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v8, 3, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 3, v14 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 3, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-NEXT: .LBB2_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + +define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i8_to_v3i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v14, v2 +; GCN-NEXT: v_mov_b32_e32 v13, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: 
s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB3_4 +; GCN-NEXT: .LBB3_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB3_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v6, v6, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v5, v7, v8 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB3_2 +; GCN-NEXT: .LBB3_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_or_b32_e32 v0, v12, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v15, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v6, v9, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v6 +; GCN-NEXT: v_or_b32_e32 v5, v7, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i8_to_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, v2 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v9 +; VI-NEXT: 
v_lshlrev_b16_e32 v3, 8, v11 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB3_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB3_4 +; VI-NEXT: .LBB3_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB3_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB3_2 +; VI-NEXT: .LBB3_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v13 +; VI-NEXT: v_add_u16_e32 v1, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v8 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 3, v10 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i8_to_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX9-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v11 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB3_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB3_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB3_2 +; GFX9-NEXT: .LBB3_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, 
v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i8_to_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v11 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB3_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB3_4 +; GFX11-NEXT: .LBB3_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB3_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v4 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v5 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB3_2 +; GFX11-NEXT: .LBB3_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v13, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v8 
+; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + +define <6 x bfloat> @bitcast_v3i32_to_v6bf16(<3 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i32_to_v6bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NEXT: v_mov_b32_e32 v6, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB4_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB4_4 +; GCN-NEXT: .LBB4_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB4_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB4_2 +; GCN-NEXT: .LBB4_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i32_to_v6bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i32_to_v6bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i32_to_v6bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + +define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v6bf16_to_v3i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB5_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB5_4 +; GCN-NEXT: .LBB5_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB5_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB5_2 +; GCN-NEXT: .LBB5_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 
v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v3, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6bf16_to_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB5_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: .LBB5_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB5_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 
v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s7 +; GFX9-NEXT: .LBB5_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 
v10, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v6, v10, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v9, v9, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v5, v9, v10 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v11, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: .LBB5_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + +define <6 x half> @bitcast_v3i32_to_v6f16(<3 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i32_to_v6f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NEXT: v_mov_b32_e32 v6, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB6_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB6_4 +; GCN-NEXT: .LBB6_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB6_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GCN-NEXT: s_cbranch_execz .LBB6_2 +; GCN-NEXT: .LBB6_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i32_to_v6f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i32_to_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i32_to_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + +define <3 x i32> @bitcast_v6f16_to_v3i32(<6 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f16_to_v3i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB7_4 +; GCN-NEXT: .LBB7_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB7_3: ; %cmp.false +; 
GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_or_b32_e32 v0, v7, v0 +; GCN-NEXT: v_or_b32_e32 v1, v6, v1 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB7_2 +; GCN-NEXT: .LBB7_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v4, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v5 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f16_to_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB7_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: .LBB7_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + +define <6 x i16> @bitcast_v3i32_to_v6i16(<3 x i32> %a, i32 %b) { +; GCN-LABEL: bitcast_v3i32_to_v6i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB8_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB8_4 +; GCN-NEXT: .LBB8_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB8_3: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB8_2 +; GCN-NEXT: .LBB8_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3i32_to_v6i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3i32_to_v6i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3i32_to_v6i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; 
%cmp.true +; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 3, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <3 x i32> %a, splat (i32 3) + %a2 = bitcast <3 x i32> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x i32> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + +define <3 x i32> @bitcast_v6i16_to_v3i32(<6 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i16_to_v3i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB9_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB9_4 +; GCN-NEXT: .LBB9_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB9_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v0, v6 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: .LBB9_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v6, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i16_to_v3i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB9_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 3 +; VI-NEXT: v_add_u16_e32 v3, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_u16_e32 v3, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_add_u16_e32 v3, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: .LBB9_2: ; %end +; VI-NEXT: 
s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i16_to_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v3i32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <3 x i32> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <3 x i32> + br label %end + +end: + %phi = phi <3 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x i32> %phi +} + +define <12 x i8> @bitcast_v3f32_to_v12i8(<3 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f32_to_v12i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v4, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB10_4 +; GCN-NEXT: .LBB10_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB10_3: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB10_2 +; GCN-NEXT: .LBB10_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_alignbit_b32 v11, s4, v8, 24 +; GCN-NEXT: v_alignbit_b32 v10, s4, v8, 16 +; GCN-NEXT: v_alignbit_b32 v9, s4, v8, 8 +; GCN-NEXT: 
v_lshrrev_b32_e32 v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v12i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v14, v1 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: .LBB10_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB10_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; VI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; VI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: .LBB10_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v13 +; VI-NEXT: v_mov_b32_e32 v4, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v12i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: .LBB10_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB10_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX9-NEXT: v_add_f32_e32 v14, 1.0, v14 +; GFX9-NEXT: v_add_f32_e32 v13, 1.0, 
v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: .LBB10_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v12i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_mov_b32_e32 v14, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-NEXT: .LBB10_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB10_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_add_f32_e32 v8, 1.0, v8 +; GFX11-NEXT: v_dual_add_f32 v14, 1.0, v14 :: v_dual_add_f32 v13, 1.0, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-NEXT: .LBB10_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + +define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i8_to_v3f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v14, v2 +; GCN-NEXT: v_mov_b32_e32 v13, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 
v15, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v11 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB11_4 +; GCN-NEXT: .LBB11_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB11_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v2, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v6, v6, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GCN-NEXT: v_or_b32_e32 v5, v7, v8 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v1, v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, v4, v5 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB11_2 +; GCN-NEXT: .LBB11_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GCN-NEXT: v_or_b32_e32 v0, v12, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v15, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_or_b32_e32 v6, v9, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v6 +; GCN-NEXT: v_or_b32_e32 v5, v7, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v2, v5, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i8_to_v3f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, v2 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v11 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB11_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB11_4 +; VI-NEXT: .LBB11_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB11_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB11_2 +; VI-NEXT: .LBB11_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v13 +; VI-NEXT: v_add_u16_e32 v1, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v4, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_add_u16_e32 v4, 3, v8 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v5, 3, v10 +; VI-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
v_add_u16_e32 v4, 0x300, v4 +; VI-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i8_to_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v11 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB11_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB11_4 +; GFX9-NEXT: .LBB11_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB11_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB11_2 +; GFX9-NEXT: .LBB11_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 
+; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v4, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i8_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v3, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v11 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB11_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB11_4 +; GFX11-NEXT: .LBB11_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB11_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v16 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-NEXT: v_or_b32_e32 v5, v8, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v4 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v5 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB11_2 +; GFX11-NEXT: .LBB11_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v13, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v8, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 
0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX11-NEXT: v_or_b32_e32 v0, v16, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX11-NEXT: v_or_b32_e32 v4, v7, v4 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX11-NEXT: v_or_b32_e32 v5, v5, v8 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v3, v5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + +define <6 x bfloat> @bitcast_v3f32_to_v6bf16(<3 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f32_to_v6bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NEXT: v_mov_b32_e32 v6, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB12_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB12_4 +; GCN-NEXT: .LBB12_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB12_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB12_2 +; GCN-NEXT: .LBB12_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 
0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + +define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v6bf16_to_v3f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB13_4 +; GCN-NEXT: .LBB13_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB13_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v0, v8, 16 +; GCN-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GCN-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB13_2 +; GCN-NEXT: .LBB13_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; 
GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v5, v3, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6bf16_to_v3f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB13_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v2, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v3, 16 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v1, v1, v3, 16 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: .LBB13_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v3f32: +; GFX9: 
; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB13_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_perm_b32 v2, v3, v2, s7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s7 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s7 +; GFX9-NEXT: .LBB13_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB13_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1 +; GFX11-NEXT: v_add3_u32 v6, v10, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v7 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v9, v9, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v5, v9, v10 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v11, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302 +; GFX11-NEXT: .LBB13_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + +define <6 x half> @bitcast_v3f32_to_v6f16(<3 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f32_to_v6f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v1 +; GCN-NEXT: v_mov_b32_e32 v6, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB14_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB14_4 +; GCN-NEXT: .LBB14_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB14_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NEXT: 
v_lshrrev_b32_e32 v9, 16, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v6 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB14_2 +; GCN-NEXT: .LBB14_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v1, 1.0, v6 +; GCN-NEXT: v_add_f32_e32 v3, 1.0, v7 +; GCN-NEXT: v_add_f32_e32 v5, 1.0, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + +define <3 x float> @bitcast_v6f16_to_v3f32(<6 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f16_to_v3f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 +; GCN-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB15_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB15_4 +; GCN-NEXT: .LBB15_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB15_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_or_b32_e32 v0, v7, v0 +; GCN-NEXT: v_or_b32_e32 v1, v6, v1 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB15_2 +; GCN-NEXT: .LBB15_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v4, v2 +; GCN-NEXT: v_or_b32_e32 v2, v3, v5 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f16_to_v3f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB15_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v4 +; VI-NEXT: v_add_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 +; VI-NEXT: v_or_b32_e32 v1, v1, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: .LBB15_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; 
GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + +define <6 x i16> @bitcast_v3f32_to_v6i16(<3 x float> %a, i32 %b) { +; GCN-LABEL: bitcast_v3f32_to_v6i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v4, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB16_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB16_4 +; GCN-NEXT: .LBB16_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB16_3: ; %cmp.false +; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB16_2 +; GCN-NEXT: .LBB16_4: ; %cmp.true +; GCN-NEXT: v_add_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_alignbit_b32 v5, s4, v4, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v3f32_to_v6i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_add_f32_e32 v0, 1.0, v0 +; VI-NEXT: ; %bb.2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v3f32_to_v6i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_add_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: ; %bb.2: ; %end +; 
GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v3f32_to_v6i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v2 :: v_dual_add_f32 v1, 1.0, v1 +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <3 x float> %a, splat (float 1.000000e+00) + %a2 = bitcast <3 x float> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <3 x float> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + +define <3 x float> @bitcast_v6i16_to_v3f32(<6 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i16_to_v3f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, v2 +; GCN-NEXT: v_mov_b32_e32 v7, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB17_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB17_4 +; GCN-NEXT: .LBB17_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB17_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v0, v6 +; GCN-NEXT: v_or_b32_e32 v1, v1, v3 +; GCN-NEXT: v_or_b32_e32 v2, v2, v5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: .LBB17_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v6, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_or_b32_e32 v2, v5, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i16_to_v3f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB17_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 3 +; VI-NEXT: v_add_u16_e32 v3, 3, v2 +; VI-NEXT: v_add_u16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: v_add_u16_e32 v3, 3, v1 +; VI-NEXT: v_add_u16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v3, v1 +; VI-NEXT: v_add_u16_e32 v3, 3, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: .LBB17_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i16_to_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <3 x float> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <3 x float> + br label %end + +end: + %phi = phi <3 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <3 x float> %phi +} + +define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i8_to_v6bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v11 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v9 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v3, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GCN-NEXT: v_or_b32_e32 v11, v1, v0 +; GCN-NEXT: v_or_b32_e32 v13, v14, v2 +; GCN-NEXT: 
v_lshlrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_or_b32_e32 v3, v15, v4 +; GCN-NEXT: v_or_b32_e32 v12, v6, v5 +; GCN-NEXT: v_or_b32_e32 v5, v16, v8 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: .LBB18_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB18_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v10 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GCN-NEXT: v_or_b32_e32 v4, v17, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x300, v3 +; GCN-NEXT: v_or_b32_e32 v3, v16, v5 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_or_b32_e32 v5, v15, v6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GCN-NEXT: v_or_b32_e32 v1, v14, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s7, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x3000000, v0 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; GCN-NEXT: .LBB18_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, v11 +; GCN-NEXT: v_mov_b32_e32 v1, v13 +; GCN-NEXT: v_mov_b32_e32 v2, v7 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i8_to_v6bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, v2 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v11 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB18_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB18_4 +; VI-NEXT: .LBB18_2: ; %end +; VI-NEXT: s_or_b64 
exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB18_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB18_2 +; VI-NEXT: .LBB18_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v13 +; VI-NEXT: v_add_u16_e32 v1, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v3, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i8_to_v6bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 
v5, 8, v11 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB18_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB18_4 +; GFX9-NEXT: .LBB18_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB18_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB18_2 +; GFX9-NEXT: .LBB18_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX11-LABEL: bitcast_v12i8_to_v6bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v11 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB18_4 +; GFX11-NEXT: .LBB18_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB18_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v17 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v12 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB18_2 +; GFX11-NEXT: .LBB18_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v13, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v0, v17, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + +define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v6bf16_to_v12i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB19_4 +; GCN-NEXT: .LBB19_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB19_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GCN-NEXT: v_alignbit_b32 v0, v0, v15, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v14, 16 +; GCN-NEXT: v_alignbit_b32 v8, v10, v12, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB19_2 +; GCN-NEXT: .LBB19_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v13 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v8, 0x40c00000, v4 +; 
GCN-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v4, v6, v2, 16 +; GCN-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6bf16_to_v12i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v13, v2 +; VI-NEXT: v_mov_b32_e32 v16, v1 +; VI-NEXT: v_mov_b32_e32 v15, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; VI-NEXT: .LBB19_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB19_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v16, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v15, v1, v0, 16 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: 
v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_bfe_u32 v1, v0, 16, 1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_bfe_u32 v2, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_alignbit_b32 v13, v1, v0, 16 +; VI-NEXT: v_mov_b32_e32 v14, 0x7fc07fc0 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; VI-NEXT: .LBB19_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v15 +; VI-NEXT: v_mov_b32_e32 v4, v16 +; VI-NEXT: v_mov_b32_e32 v8, v13 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v12i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB19_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: .LBB19_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB19_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX9-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX9-NEXT: v_add3_u32 v7, v7, v6, s6 +; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX9-NEXT: s_mov_b32 s7, 0x7060302 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_perm_b32 v1, v2, v3, s7 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s7 +; GFX9-NEXT: v_perm_b32 v11, v5, v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v12, 0x7fc07fc0 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v4 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: .LBB19_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v12i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_mov_b32_e32 v14, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-NEXT: .LBB19_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB19_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 +; GFX11-NEXT: v_dual_mov_b32 v12, 0x7fc07fc0 :: v_dual_lshlrev_b32 v3, 16, v13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 +; GFX11-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v14 +; GFX11-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v0, 0x40c00000, v4 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_add3_u32 v1, v10, v2, 0x7fff +; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v7 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v4, vcc_lo +; GFX11-NEXT: v_add3_u32 v4, v8, v3, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v1, v5, 16, 1 +; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_add3_u32 v1, v1, v5, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v2, v3, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v5, v1, v11, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v6, v7, 0x7060302 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v11, v4, v5, 0x7060302 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX11-NEXT: .LBB19_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false 
] + ret <12 x i8> %phi +} + +define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i8_to_v6f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v13, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v11 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_2 +; GCN-NEXT: ; %bb.1: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v12 +; GCN-NEXT: v_or_b32_e32 v1, v1, v14 +; GCN-NEXT: v_or_b32_e32 v2, v2, v15 +; GCN-NEXT: v_or_b32_e32 v3, v3, v7 +; GCN-NEXT: v_or_b32_e32 v4, v4, v16 +; GCN-NEXT: v_or_b32_e32 v5, v5, v17 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: .LBB20_2: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB20_4 +; GCN-NEXT: ; %bb.3: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v13 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v0, v17, v0 +; GCN-NEXT: v_or_b32_e32 v1, v16, v1 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-NEXT: v_or_b32_e32 v4, v15, v4 +; GCN-NEXT: v_or_b32_e32 v2, v14, v2 +; GCN-NEXT: v_or_b32_e32 v5, v12, v5 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 0x300, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x300, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v9, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GCN-NEXT: .LBB20_4: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, 
v11 +; GCN-NEXT: v_mov_b32_e32 v4, v9 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i8_to_v6f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, v2 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v11 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB20_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB20_4 +; VI-NEXT: .LBB20_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB20_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB20_2 +; VI-NEXT: .LBB20_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v13 +; VI-NEXT: v_add_u16_e32 v1, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v3, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i8_to_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v11 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB20_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB20_4 +; GFX9-NEXT: .LBB20_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB20_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB20_2 +; GFX9-NEXT: .LBB20_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i8_to_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v1 +; GFX11-NEXT: v_lshlrev_b16 v15, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v11 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB20_4 +; GFX11-NEXT: .LBB20_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB20_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v17 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v12 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB20_2 +; GFX11-NEXT: .LBB20_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v13, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; 
GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v0, v17, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + +define <12 x i8> @bitcast_v6f16_to_v12i8(<6 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f16_to_v12i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v15, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v13, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v12, v4 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB21_4 +; GCN-NEXT: .LBB21_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB21_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: v_or_b32_e32 v0, v14, v0 +; GCN-NEXT: v_or_b32_e32 v4, v13, v1 +; GCN-NEXT: v_or_b32_e32 v8, v12, v2 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GCN-NEXT: v_bfe_u32 v11, v10, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: 
$vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB21_2 +; GCN-NEXT: .LBB21_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v15 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v14 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v6 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v13 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v10 +; GCN-NEXT: v_bfe_u32 v7, v6, 8, 8 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v4, v2, v4 +; GCN-NEXT: v_or_b32_e32 v8, v3, v5 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GCN-NEXT: v_bfe_u32 v11, v10, 8, 8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f16_to_v12i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: v_mov_b32_e32 v14, v1 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; VI-NEXT: ; %bb.2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB21_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 0x200 +; VI-NEXT: v_add_f16_sdwa v6, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 +; VI-NEXT: v_add_f16_sdwa v2, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_sdwa v10, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v14, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: v_or_b32_e32 v0, v13, v0 +; VI-NEXT: v_or_b32_e32 v11, v8, v3 +; VI-NEXT: v_mov_b32_e32 v12, 0x7e007e00 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v11 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: 
v_lshrrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: .LBB21_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v13 +; VI-NEXT: v_mov_b32_e32 v4, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v12i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-NEXT: v_mov_b32_e32 v15, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB21_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: .LBB21_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB21_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v14, 0x7e007e00 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; GFX9-NEXT: .LBB21_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v15 +; GFX9-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v12i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v13, v2 :: v_dual_mov_b32 v16, v1 +; GFX11-NEXT: v_mov_b32_e32 v15, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GFX11-NEXT: v_lshrrev_b64 
v[11:12], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; GFX11-NEXT: .LBB21_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB21_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] +; GFX11-NEXT: v_mov_b32_e32 v14, 0x7e007e00 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v16 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[15:16] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v16 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v15 +; GFX11-NEXT: .LBB21_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v15 +; GFX11-NEXT: v_mov_b32_e32 v4, v16 +; GFX11-NEXT: v_mov_b32_e32 v8, v13 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + +define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { +; GCN-LABEL: bitcast_v12i8_to_v6i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v14, v4 +; GCN-NEXT: v_mov_b32_e32 v15, v2 +; GCN-NEXT: v_mov_b32_e32 v13, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 24, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 8, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 8, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB22_4 +; GCN-NEXT: .LBB22_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB22_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v14 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v15 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v13 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_or_b32_e32 v3, v3, v18 +; GCN-NEXT: v_or_b32_e32 v4, v4, v17 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_or_b32_e32 v1, v7, v1 +; GCN-NEXT: v_or_b32_e32 v6, v12, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: 
v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_or_b32_e32 v2, v0, v1 +; GCN-NEXT: v_or_b32_e32 v0, v3, v6 +; GCN-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GCN-NEXT: v_alignbit_b32 v1, v2, v6, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr15 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr16 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB22_2 +; GCN-NEXT: .LBB22_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v13 +; GCN-NEXT: s_movk_i32 s6, 0x300 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v15 +; GCN-NEXT: s_mov_b32 s7, 0x3000000 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v14 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v10 +; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GCN-NEXT: v_or_b32_e32 v0, v18, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_or_b32_e32 v2, v16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_or_b32_e32 v4, v17, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; GCN-NEXT: v_or_b32_e32 v1, v12, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_or_b32_e32 v3, v7, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x300, v4 +; GCN-NEXT: v_or_b32_e32 v5, v9, v5 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_or_b32_e32 v1, v3, v2 +; GCN-NEXT: v_or_b32_e32 v3, v5, v4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s7, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v3 +; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v12i8_to_v6i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v14, v2 +; VI-NEXT: v_mov_b32_e32 v13, v0 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; VI-NEXT: v_lshlrev_b16_e32 v17, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v16, 8, v3 +; VI-NEXT: v_lshlrev_b16_e32 v15, 8, v5 +; VI-NEXT: v_lshlrev_b16_e32 v12, 8, v7 +; VI-NEXT: v_lshlrev_b16_e32 v7, 8, v9 +; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v11 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB22_3 +; VI-NEXT: ; %bb.1: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB22_4 +; VI-NEXT: .LBB22_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; VI-NEXT: .LBB22_3: ; %cmp.false +; VI-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: ; implicit-def: $vgpr6 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr17 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr12 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB22_2 +; VI-NEXT: .LBB22_4: ; %cmp.true +; VI-NEXT: v_add_u16_e32 v0, 3, v13 +; VI-NEXT: v_add_u16_e32 v1, 3, v14 +; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v2, 0x300 +; VI-NEXT: v_add_u16_e32 v0, 0x300, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_add_u16_e32 v1, 3, v4 +; VI-NEXT: v_add_u16_e32 v3, 3, v6 +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v1, 0x300, v1 +; VI-NEXT: v_add_u16_sdwa v3, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: v_add_u16_e32 v3, 3, v8 +; VI-NEXT: v_add_u16_e32 v4, 3, v10 +; VI-NEXT: v_or_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u16_e32 v3, 0x300, v3 +; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v3, v2 +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v12i8_to_v6i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v7 +; GFX9-NEXT: v_lshlrev_b16_e32 v7, 8, v9 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v11 +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB22_3 +; GFX9-NEXT: ; %bb.1: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; 
GFX9-NEXT: s_cbranch_execnz .LBB22_4 +; GFX9-NEXT: .LBB22_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: .LBB22_3: ; %cmp.false +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v10, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: ; implicit-def: $vgpr13 +; GFX9-NEXT: ; implicit-def: $vgpr14 +; GFX9-NEXT: ; implicit-def: $vgpr4 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr8 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr17 +; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: ; implicit-def: $vgpr12 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB22_2 +; GFX9-NEXT: .LBB22_4: ; %cmp.true +; GFX9-NEXT: v_add_u16_e32 v0, 3, v13 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v14 +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_movk_i32 s6, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_u16_e32 v1, 3, v4 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v6 +; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_add_u16_e32 v2, 3, v8 +; GFX9-NEXT: v_add_u16_e32 v3, 3, v10 +; GFX9-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u16_e32 v2, 0x300, v2 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v12i8_to_v6i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v14, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12 +; GFX11-NEXT: v_lshlrev_b16 v17, 8, v1 +; GFX11-NEXT: 
v_lshlrev_b16 v15, 8, v3 +; GFX11-NEXT: v_lshlrev_b16 v16, 8, v5 +; GFX11-NEXT: v_lshlrev_b16 v12, 8, v7 +; GFX11-NEXT: v_lshlrev_b16 v5, 8, v9 +; GFX11-NEXT: v_lshlrev_b16 v7, 8, v11 +; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_3 +; GFX11-NEXT: ; %bb.1: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB22_4 +; GFX11-NEXT: .LBB22_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: .LBB22_3: ; %cmp.false +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v13 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v14 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v6 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v17 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX11-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v12 +; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: ; implicit-def: $vgpr13 +; GFX11-NEXT: ; implicit-def: $vgpr14 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr8 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr17 +; GFX11-NEXT: ; implicit-def: $vgpr15 +; GFX11-NEXT: ; implicit-def: $vgpr16 +; GFX11-NEXT: ; implicit-def: $vgpr12 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: ; implicit-def: $vgpr4 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB22_2 +; GFX11-NEXT: .LBB22_4: ; %cmp.true +; GFX11-NEXT: v_add_nc_u16 v0, v13, 3 +; GFX11-NEXT: v_add_nc_u16 v1, v14, 3 +; GFX11-NEXT: v_add_nc_u16 v2, v4, 3 +; GFX11-NEXT: v_add_nc_u16 v3, v6, 3 +; GFX11-NEXT: v_add_nc_u16 v4, v8, 3 +; GFX11-NEXT: v_add_nc_u16 v6, v10, 3 +; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX11-NEXT: v_or_b32_e32 v0, v17, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-NEXT: v_or_b32_e32 v2, v16, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX11-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX11-NEXT: v_or_b32_e32 v5, v7, v6 +; GFX11-NEXT: v_add_nc_u16 v0, 0x300, v0 +; GFX11-NEXT: v_add_nc_u16 v1, 0x300, v1 +; GFX11-NEXT: v_add_nc_u16 v2, 0x300, v2 +; GFX11-NEXT: v_add_nc_u16 v3, 0x300, v3 +; GFX11-NEXT: v_add_nc_u16 v4, 0x300, v4 +; GFX11-NEXT: v_add_nc_u16 v5, 0x300, v5 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 
| instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v5 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <12 x i8> %a, splat (i8 3) + %a2 = bitcast <12 x i8> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <12 x i8> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + +define <12 x i8> @bitcast_v6i16_to_v12i8(<6 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i16_to_v12i8: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v15, v5 +; GCN-NEXT: v_mov_b32_e32 v12, v4 +; GCN-NEXT: v_mov_b32_e32 v16, v3 +; GCN-NEXT: v_mov_b32_e32 v13, v2 +; GCN-NEXT: v_mov_b32_e32 v14, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v16 +; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v15 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB23_4 +; GCN-NEXT: .LBB23_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB23_3: ; %cmp.false +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v14 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v13 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v12 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v16 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v15 +; GCN-NEXT: v_bfe_u32 v7, v16, 8, 8 +; GCN-NEXT: v_or_b32_e32 v0, v0, v17 +; GCN-NEXT: v_or_b32_e32 v4, v1, v18 +; GCN-NEXT: v_or_b32_e32 v8, v2, v19 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GCN-NEXT: v_bfe_u32 v11, v15, 8, 8 +; GCN-NEXT: ; implicit-def: $vgpr14 +; GCN-NEXT: ; implicit-def: $vgpr17 +; GCN-NEXT: ; implicit-def: $vgpr13 +; GCN-NEXT: ; implicit-def: $vgpr18 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr19 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB23_2 +; GCN-NEXT: .LBB23_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; GCN-NEXT: s_mov_b32 s6, 0x30000 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v13 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v12 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_or_b32_e32 v0, v17, v0 +; GCN-NEXT: v_or_b32_e32 v1, v18, v1 +; GCN-NEXT: v_or_b32_e32 v2, v19, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v2 +; GCN-NEXT: v_alignbit_b32 v3, v4, v0, 24 +; GCN-NEXT: v_alignbit_b32 v2, v4, v0, 16 +; GCN-NEXT: v_alignbit_b32 v1, v4, v0, 8 +; GCN-NEXT: v_lshrrev_b32_e32 
v7, 24, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 24, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i16_to_v12i8: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; VI-NEXT: ; implicit-def: $vgpr16 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: ; implicit-def: $vgpr3 +; VI-NEXT: ; implicit-def: $vgpr14 +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr7 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: ; implicit-def: $vgpr11 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_cbranch_execz .LBB23_2 +; VI-NEXT: ; %bb.1: ; %cmp.false +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_mov_b32_e32 v16, v0 +; VI-NEXT: v_mov_b32_e32 v14, v1 +; VI-NEXT: v_mov_b32_e32 v8, v2 +; VI-NEXT: ; implicit-def: $vgpr1 +; VI-NEXT: ; implicit-def: $vgpr2 +; VI-NEXT: .LBB23_2: ; %Flow +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB23_4 +; VI-NEXT: ; %bb.3: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_sdwa v6, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v13, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v14, 3, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; VI-NEXT: v_add_u16_e32 v16, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: v_or_b32_e32 v1, v14, v1 +; VI-NEXT: v_or_b32_e32 v0, v16, v0 +; VI-NEXT: v_add_u16_sdwa v10, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v8, 3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[0:1] +; VI-NEXT: v_or_b32_e32 v2, v8, v2 +; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[2:3] +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v0 +; VI-NEXT: v_bfe_u32 v7, v6, 8, 8 +; VI-NEXT: .LBB23_4: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v16 +; VI-NEXT: v_mov_b32_e32 v1, v15 +; VI-NEXT: v_mov_b32_e32 v2, v13 +; VI-NEXT: v_mov_b32_e32 v4, v14 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i16_to_v12i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_mov_b32_e32 v14, v1 +; GFX9-NEXT: v_mov_b32_e32 v13, v0 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: ; implicit-def: $vgpr2 +; GFX9-NEXT: ; implicit-def: $vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr5 +; GFX9-NEXT: ; implicit-def: $vgpr6 +; GFX9-NEXT: ; implicit-def: $vgpr7 +; GFX9-NEXT: ; implicit-def: $vgpr9 +; GFX9-NEXT: ; implicit-def: $vgpr10 +; GFX9-NEXT: ; implicit-def: $vgpr11 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB23_2 +; GFX9-NEXT: ; %bb.1: ; 
%cmp.false +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: .LBB23_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB23_4 +; GFX9-NEXT: ; %bb.3: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: .LBB23_4: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v13 +; GFX9-NEXT: v_mov_b32_e32 v4, v14 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v12i8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_mov_b32_e32 v14, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: ; implicit-def: $vgpr1 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: ; implicit-def: $vgpr3 +; GFX11-NEXT: ; implicit-def: $vgpr5 +; GFX11-NEXT: ; implicit-def: $vgpr6 +; GFX11-NEXT: ; implicit-def: $vgpr7 +; GFX11-NEXT: ; implicit-def: $vgpr9 +; GFX11-NEXT: ; implicit-def: $vgpr10 +; GFX11-NEXT: ; implicit-def: $vgpr11 +; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.false +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-NEXT: .LBB23_2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB23_4 +; GFX11-NEXT: ; %bb.3: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshrrev_b32_e32 v9, 8, v8 +; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[13:14] +; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v14 +; GFX11-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v14 +; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v13 +; GFX11-NEXT: .LBB23_4: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: v_mov_b32_e32 v0, v13 +; GFX11-NEXT: v_mov_b32_e32 v4, v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] + 
%cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <12 x i8> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <12 x i8> + br label %end + +end: + %phi = phi <12 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <12 x i8> %phi +} + +define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v6bf16_to_v6f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v5 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB24_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB24_4 +; GCN-NEXT: .LBB24_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB24_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB24_2 +; GCN-NEXT: .LBB24_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6bf16_to_v6f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: 
s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB24_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v6 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: .LBB24_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB24_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v2, v5, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 +; GFX9-NEXT: .LBB24_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB24_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v8, v8, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v9, 
vcc_lo +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v9, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 +; GFX11-NEXT: .LBB24_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} + +define <6 x bfloat> @bitcast_v6f16_to_v6bf16(<6 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f16_to_v6bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v9, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v10, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v11, v5 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB25_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB25_4 +; GCN-NEXT: .LBB25_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB25_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v11 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB25_2 +; GCN-NEXT: .LBB25_4: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; 
GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v6 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v6, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v7, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v8, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f16_to_v6bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB25_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 0x200 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v6, v2 +; VI-NEXT: v_or_b32_e32 v1, v5, v1 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: .LBB25_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v6bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v6bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, 
%cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + +define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { +; GCN-LABEL: bitcast_v6bf16_to_v6i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v5 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB26_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB26_4 +; GCN-NEXT: .LBB26_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB26_3: ; %cmp.false +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr6 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB26_2 +; GCN-NEXT: .LBB26_4: ; %cmp.true +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GCN-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GCN-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GCN-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GCN-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 +; GCN-NEXT: v_add_f32_e32 v4, 0x40c00000, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GCN-NEXT: v_alignbit_b32 v0, v7, v0, 16 +; GCN-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v3, v6, 16 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6bf16_to_v6i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB26_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; VI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; VI-NEXT: v_bfe_u32 v4, v3, 16, 1 +; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3 +; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; VI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; VI-NEXT: v_bfe_u32 v4, v0, 16, 1 +; VI-NEXT: s_movk_i32 s6, 0x7fff +; VI-NEXT: 
v_add_u32_e32 v4, vcc, v4, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v4 +; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; VI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; VI-NEXT: v_bfe_u32 v5, v4, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; VI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; VI-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; VI-NEXT: v_bfe_u32 v5, v1, 16, 1 +; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; VI-NEXT: v_add_u32_e32 v5, vcc, s6, v5 +; VI-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; VI-NEXT: v_bfe_u32 v6, v5, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, s6, v6 +; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; VI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; VI-NEXT: v_bfe_u32 v6, v2, 16, 1 +; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6 +; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_alignbit_b32 v2, v2, v5, 16 +; VI-NEXT: v_alignbit_b32 v1, v1, v4, 16 +; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; VI-NEXT: .LBB26_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6bf16_to_v6i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: s_cbranch_execz .LBB26_2 +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX9-NEXT: s_movk_i32 s6, 0x7fff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add3_u32 v4, v4, v3, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX9-NEXT: v_add3_u32 v4, v4, v0, s6 +; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX9-NEXT: v_add3_u32 v5, v5, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_and_b32_e32 v5, 
0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX9-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add3_u32 v6, v6, v5, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc +; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX9-NEXT: v_add3_u32 v6, v6, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc +; GFX9-NEXT: s_mov_b32 s6, 0x7060302 +; GFX9-NEXT: v_perm_b32 v2, v5, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s6 +; GFX9-NEXT: v_perm_b32 v0, v3, v0, s6 +; GFX9-NEXT: .LBB26_2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6bf16_to_v6i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: s_cbranch_execz .LBB26_2 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1 +; GFX11-NEXT: v_add3_u32 v8, v8, v4, 0x7fff +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc_lo +; GFX11-NEXT: v_add3_u32 v8, v9, v1, 0x7fff +; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v5, v5 +; GFX11-NEXT: v_add3_u32 v6, v6, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v13, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_perm_b32 v2, v5, v2, 0x7060302 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo +; GFX11-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v3, v0, 0x7060302 +; GFX11-NEXT: .LBB26_2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x bfloat> %a, splat (bfloat 0xR40C0) + %a2 = bitcast <6 x bfloat> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x bfloat> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + +define <6 x bfloat> @bitcast_v6i16_to_v6bf16(<6 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i16_to_v6bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v9, v4 +; GCN-NEXT: v_mov_b32_e32 v7, v2 +; GCN-NEXT: v_mov_b32_e32 v8, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB27_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB27_4 +; GCN-NEXT: .LBB27_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB27_3: ; %cmp.false +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB27_2 +; GCN-NEXT: .LBB27_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GCN-NEXT: v_or_b32_e32 v0, v5, v0 +; GCN-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-NEXT: v_or_b32_e32 v1, v1, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i16_to_v6bf16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB27_2 +; VI-NEXT: ; 
%bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v5, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: .LBB27_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i16_to_v6bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v6bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <6 x bfloat> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <6 x bfloat> + br label %end + +end: + %phi = phi <6 x bfloat> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x bfloat> %phi +} + +define <6 x i16> @bitcast_v6f16_to_v6i16(<6 x half> %a, i32 %b) { +; GCN-LABEL: bitcast_v6f16_to_v6i16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB28_2 +; GCN-NEXT: ; %bb.1: ; %cmp.true +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GCN-NEXT: 
v_cvt_f16_f32_e32 v5, v5 +; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_or_b32_e32 v4, v4, v6 +; GCN-NEXT: v_or_b32_e32 v2, v2, v7 +; GCN-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; GCN-NEXT: .LBB28_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6f16_to_v6i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB28_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v4, 0x200 +; VI-NEXT: v_add_f16_e32 v3, 0x200, v0 +; VI-NEXT: v_add_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v5, 0x200, v1 +; VI-NEXT: v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v6, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v6, v2 +; VI-NEXT: v_or_b32_e32 v1, v5, v1 +; VI-NEXT: v_or_b32_e32 v0, v3, v0 +; VI-NEXT: .LBB28_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6f16_to_v6i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: s_movk_i32 s6, 0x200 +; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6f16_to_v6i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = fadd <6 x half> %a, splat (half 0xH0200) + %a2 = bitcast <6 x half> %a1 to <6 x i16> + br label %end + +cmp.false: + %a3 = bitcast <6 x half> %a to <6 x i16> + br label %end + +end: + %phi = phi <6 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x i16> %phi +} + +define <6 x half> @bitcast_v6i16_to_v6f16(<6 x i16> %a, i32 %b) { +; GCN-LABEL: bitcast_v6i16_to_v6f16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v12, v5 +; GCN-NEXT: v_mov_b32_e32 v7, v4 +; GCN-NEXT: v_mov_b32_e32 v8, v3 +; GCN-NEXT: v_mov_b32_e32 v9, v2 +; GCN-NEXT: v_mov_b32_e32 
v10, v1 +; GCN-NEXT: v_mov_b32_e32 v11, v0 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: ; implicit-def: $vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4 +; GCN-NEXT: ; implicit-def: $vgpr5 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_3 +; GCN-NEXT: ; %bb.1: ; %Flow +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execnz .LBB29_4 +; GCN-NEXT: .LBB29_2: ; %end +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .LBB29_3: ; %cmp.false +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v10 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v9 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v8 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v7 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v12 +; GCN-NEXT: ; implicit-def: $vgpr12 +; GCN-NEXT: ; implicit-def: $vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8 +; GCN-NEXT: ; implicit-def: $vgpr9 +; GCN-NEXT: ; implicit-def: $vgpr10 +; GCN-NEXT: ; implicit-def: $vgpr11 +; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB29_2 +; GCN-NEXT: .LBB29_4: ; %cmp.true +; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v12 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v7 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v8 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v9 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v10 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v11 +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GCN-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GCN-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bitcast_v6i16_to_v6f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; VI-NEXT: s_cbranch_execz .LBB29_2 +; VI-NEXT: ; %bb.1: ; %cmp.true +; VI-NEXT: v_mov_b32_e32 v3, 3 +; VI-NEXT: v_add_u16_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v5, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_e32 v2, 3, v2 +; VI-NEXT: v_add_u16_e32 v1, 3, v1 +; VI-NEXT: v_add_u16_e32 v0, 3, v0 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: .LBB29_2: ; %end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: bitcast_v6i16_to_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; GFX9-NEXT: ; %bb.1: ; %cmp.true +; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX9-NEXT: ; %bb.2: ; %end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: bitcast_v6i16_to_v6f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: s_mov_b32 s0, exec_lo +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX11-NEXT: ; %bb.1: ; %cmp.true +; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] +; GFX11-NEXT: ; %bb.2: ; %end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <6 x i16> %a, splat (i16 3) + %a2 = bitcast <6 x i16> %a1 to <6 x half> + br label %end + +cmp.false: + %a3 = bitcast <6 x i16> %a to <6 x half> + br label %end + +end: + %phi = phi <6 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <6 x half> %phi +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll deleted file mode 100644 index c7c9e90e19677..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ /dev/null @@ -1,33893 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-codegenprepare-break-large-phis-threshold=4096 < %s | FileCheck -check-prefixes=GFX11 %s - -; This test just checks that the compiler doesn't crash.
- - -define amdgpu_ps float @v32i8_to_v8i32(ptr addrspace(4) inreg) #0 { -; GCN-LABEL: v32i8_to_v8i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] -; GCN-NEXT: ; return to shader part epilog -; -; VI-LABEL: v32i8_to_v8i32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[0:1], 0x4 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] -; VI-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: v32i8_to_v8i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[0:1], 0x4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[0:1] -; GFX9-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: v32i8_to_v8i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s0 -; GFX11-NEXT: ; return to shader part epilog -entry: - %1 = load <32 x i8>, ptr addrspace(4) %0 - %2 = bitcast <32 x i8> %1 to <8 x i32> - %3 = extractelement <8 x i32> %2, i32 1 - %4 = icmp ne i32 %3, 0 - %5 = select i1 %4, float 0.0, float 1.0 - ret float %5 -} - -define amdgpu_kernel void @i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: i8ptr_v16i8ptr: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: i8ptr_v16i8ptr: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: i8ptr_v16i8ptr: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: i8ptr_v16i8ptr: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; 
GFX11-NEXT: s_endpgm -entry: - %0 = load <16 x i8>, ptr addrspace(1) %in - store <16 x i8> %0, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @f32_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: f32_to_v2i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: f32_to_v2i16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v2, s2, 1.0 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: f32_to_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0 -; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: f32_to_v2i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_endpgm - %load = load float, ptr addrspace(1) %in, align 4 - %fadd32 = fadd float %load, 1.0 - %bc = bitcast float %fadd32 to <2 x i16> - %add.bitcast = add <2 x i16> %bc, <i16 2, i16 2> - store <2 x i16> %add.bitcast, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v2i16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v2i16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s5, s4, 0xffff0000 -; GCN-NEXT: s_add_i32 s4, s4, 2 -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_or_b32 s4, s5, s4 -; GCN-NEXT: s_add_i32 s4, s4, 0x20000 -; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v2i16_to_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s2, 0xffff0000 -; VI-NEXT: s_add_i32 s2, s2, 2 -; VI-NEXT: 
s_and_b32 s1, s2, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, 0x20000 -; VI-NEXT: v_add_f32_e64 v2, s0, 1.0 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v2i16_to_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v1, s2, 2 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v2i16_to_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, s2, 2 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <2 x i16>, ptr addrspace(1) %in, align 4 - %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2> - %bc = bitcast <2 x i16> %add.v2i16 to float - %fadd.bitcast = fadd float %bc, 1.0 - store float %fadd.bitcast, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: f32_to_v2f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: f32_to_v2f16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v3, s2, 1.0 -; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 2.0, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: f32_to_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e64 v1, s2, 1.0 -; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: f32_to_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v0, s2, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] -; GFX11-NEXT: global_store_b32 v1, 
v0, s[0:1] -; GFX11-NEXT: s_endpgm - %load = load float, ptr addrspace(1) %in, align 4 - %fadd32 = fadd float %load, 1.0 - %bc = bitcast float %fadd32 to <2 x half> - %add.bitcast = fadd <2 x half> %bc, <half 2.0, half 2.0> - store <2 x half> %add.bitcast, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v2f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v2f16_to_f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GCN-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v2f16_to_f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_add_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e64 v3, s2, 2.0 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v2f16_to_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v2f16_to_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 1.0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <2 x half>, ptr addrspace(1) %in, align 4 - %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0> - %bc = bitcast <2 x half> %add.v2f16 to float - %fadd.bitcast = fadd float %bc, 1.0 - store float %fadd.bitcast, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v4i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4i8_to_i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4i8_to_i32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 
v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4i8_to_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4i8_to_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x i8>, ptr addrspace(1) %in, align 4 - %bc = bitcast <4 x i8> %load to i32 - store i32 %bc, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @i32_to_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: i32_to_v4i8: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: i32_to_v4i8: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: flat_store_dword v[0:1], v2 -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: i32_to_v4i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: i32_to_v4i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm - %load = load i32, ptr addrspace(1) %in, align 4 - %bc = bitcast i32 %load to <4 x i8> - store <4 x i8> %bc, ptr addrspace(1) %out, align 4 - ret void -} - - -define amdgpu_kernel void @bitcast_v2i32_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: bitcast_v2i32_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s5, s5, 9 -; GCN-NEXT: s_add_i32 s4, s4, 4 -; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v2i32_to_f64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s3, s3, 9 -; 
VI-NEXT: s_add_i32 s2, s2, 4 -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v2i32_to_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s3, s3, 9 -; GFX9-NEXT: s_add_i32 s2, s2, 4 -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v2i32_to_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s3, 9 -; GFX11-NEXT: s_add_i32 s2, s2, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %val = load <2 x i32>, ptr addrspace(1) %in, align 8 - %add = add <2 x i32> %val, <i32 4, i32 9> - %bc = bitcast <2 x i32> %add to double - %fadd.bc = fadd double %bc, 1.0 - store double %fadd.bc, ptr addrspace(1) %out, align 8 - ret void -} - - -define amdgpu_kernel void @bitcast_f64_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: bitcast_f64_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 4.0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_f64_to_v2i32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_f64_to_v2i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_f64_to_v2i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 4.0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %val = load double, ptr addrspace(1) %in, align 8 - %add = fadd double %val, 4.0 - %bc = bitcast double %add to <2 x i32> - store <2 x i32> %bc, ptr addrspace(1) %out, align 8 - ret void -} - - -define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) { -; GCN-LABEL: bitcast_v2i64_to_v2f64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s9, s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf -; GCN-NEXT: s_mov_b32 s8, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
s_cmp_lg_u32 s9, 0 -; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: s_mov_b32 s10, s8 -; GCN-NEXT: s_mov_b32 s11, s8 -; GCN-NEXT: s_cbranch_scc1 .LBB10_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: s_mov_b32 s4, s2 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: .LBB10_2: ; %end -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v2i64_to_v2f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s11, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_mov_b32 s8, 0 -; VI-NEXT: s_mov_b32 s9, s8 -; VI-NEXT: s_mov_b32 s10, s8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s11, 0 -; VI-NEXT: s_mov_b32 s11, s8 -; VI-NEXT: s_cbranch_scc1 .LBB10_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: s_mov_b64 s[10:11], s[6:7] -; VI-NEXT: s_mov_b64 s[8:9], s[4:5] -; VI-NEXT: .LBB10_2: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v2i64_to_v2f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_cbranch_scc1 .LBB10_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_mov_b32 s4, s2 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX9-NEXT: .LBB10_2: ; %end -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v2i64_to_v2f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-NEXT: s_mov_b32 s8, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s9, s8 -; GFX11-NEXT: s_mov_b32 s10, s8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_mov_b32 s11, s8 -; GFX11-NEXT: s_cbranch_scc1 .LBB10_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: s_mov_b32 s4, s2 -; GFX11-NEXT: s_mov_b32 s5, s3 -; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX11-NEXT: .LBB10_2: ; %end -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_mov_b32_e32 v2, s10 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <2 x i64> %value to <2 x 
double> - br label %end - -end: - %phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if] - store <2 x double> %phi, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) { -; GCN-LABEL: bitcast_v2f64_to_v2i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s9, s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xf -; GCN-NEXT: s_mov_b32 s8, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s9, 0 -; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: s_mov_b32 s10, s8 -; GCN-NEXT: s_mov_b32 s11, s8 -; GCN-NEXT: s_cbranch_scc1 .LBB11_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: s_mov_b32 s4, s2 -; GCN-NEXT: s_mov_b32 s5, s3 -; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: .LBB11_2: ; %end -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v2f64_to_v2i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s11, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; VI-NEXT: s_mov_b32 s8, 0 -; VI-NEXT: s_mov_b32 s9, s8 -; VI-NEXT: s_mov_b32 s10, s8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s11, 0 -; VI-NEXT: s_mov_b32 s11, s8 -; VI-NEXT: s_cbranch_scc1 .LBB11_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: s_mov_b64 s[10:11], s[6:7] -; VI-NEXT: s_mov_b64 s[8:9], s[4:5] -; VI-NEXT: .LBB11_2: ; %end -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v2f64_to_v2i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s11, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_cbranch_scc1 .LBB11_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_mov_b32 s4, s2 -; GFX9-NEXT: s_mov_b32 s5, s3 -; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX9-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX9-NEXT: .LBB11_2: ; %end -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v2f64_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s11, s[4:5], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c -; GFX11-NEXT: s_mov_b32 s8, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s9, s8 -; GFX11-NEXT: s_mov_b32 s10, s8 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s11, 0 -; GFX11-NEXT: s_mov_b32 s11, s8 -; GFX11-NEXT: s_cbranch_scc1 .LBB11_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: s_mov_b32 s4, s2 -; 
GFX11-NEXT: s_mov_b32 s5, s3 -; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-NEXT: s_mov_b64 s[8:9], s[4:5] -; GFX11-NEXT: .LBB11_2: ; %end -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11 -; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: v_mov_b32_e32 v2, s10 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <2 x double> %value to <2 x i64> - br label %end - -end: - %phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if] - store <2 x i64> %phi, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v4i16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4i16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000 -; GCN-NEXT: s_add_i32 s5, s5, 4 -; GCN-NEXT: s_and_b32 s7, s4, 0xffff0000 -; GCN-NEXT: s_add_i32 s4, s4, 4 -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_or_b32 s5, s6, s5 -; GCN-NEXT: s_or_b32 s4, s7, s4 -; GCN-NEXT: s_add_i32 s5, s5, 0x40000 -; GCN-NEXT: s_add_i32 s4, s4, 0x40000 -; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4i16_to_f64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s2, 0xffff0000 -; VI-NEXT: s_add_i32 s2, s2, 4 -; VI-NEXT: s_and_b32 s5, s3, 0xffff0000 -; VI-NEXT: s_add_i32 s3, s3, 4 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s3, s5, s3 -; VI-NEXT: s_or_b32 s2, s4, s2 -; VI-NEXT: s_add_i32 s3, s3, 0x40000 -; VI-NEXT: s_add_i32 s2, s2, 0x40000 -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4i16_to_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4i16_to_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x i16>, ptr addrspace(1) %in, align 4 - %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4> - %bc = bitcast <4 x i16> %add.v4i16 to double - %fadd.bitcast = fadd double %bc, 1.0 - store double %fadd.bitcast, ptr addrspace(1) 
%out - ret void -} - - -define amdgpu_kernel void @v4f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4f16_to_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GCN-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GCN-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5 -; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4f16_to_f64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x4400 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s3, 16 -; VI-NEXT: v_add_f16_e64 v1, s3, 4.0 -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_add_f16_e64 v2, s2, 4.0 -; VI-NEXT: v_add_f16_sdwa v3, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v1, v3 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 -; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4f16_to_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4f16_to_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x half>, ptr addrspace(1) %in, align 4 - %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0> - %bc = bitcast <4 x half> %add.v4half to double - %fadd.bitcast = fadd double %bc, 1.0 - store double %fadd.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @f64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: f64_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0 -; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: v_or_b32_e32 v0, v0, v3 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: f64_to_v4f16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v4, 0x4000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; VI-NEXT: v_add_f16_sdwa v5, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 2.0, v1 -; VI-NEXT: v_add_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 2.0, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v5 -; VI-NEXT: v_or_b32_e32 v0, v0, v4 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: f64_to_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 -; GFX9-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: f64_to_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v1, v1, 2.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load double, ptr addrspace(1) %in, align 4 - %fadd32 = fadd double %load, 1.0 - %bc = bitcast double %fadd32 to <4 x half> - %add.bitcast = fadd <4 x half> %bc, <half 2.0, half 2.0, half 2.0, half 2.0> - store <4 x half> %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @f64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: f64_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 -; GCN-NEXT: v_and_b32_e32 v2, 
0xffff0000, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 2, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 2, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_or_b32_e32 v0, v3, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x20000, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: f64_to_v4i16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 -; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1 -; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x20000, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x20000, v0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: f64_to_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f64 v[0:1], s[4:5], 1.0 -; GFX9-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: f64_to_v4i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f64 v[0:1], s[2:3], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 2 op_sel_hi:[1,0] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load double, ptr addrspace(1) %in, align 4 - %fadd32 = fadd double %load, 1.0 - %bc = bitcast double %fadd32 to <4 x i16> - %add.bitcast = add <4 x i16> %bc, <i16 2, i16 2, i16 2, i16 2> - store <4 x i16> %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v4i16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4i16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s2, s5, 0xffff0000 -; GCN-NEXT: s_add_i32 s5, s5, 4 -; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GCN-NEXT: s_add_i32 s4, s4, 4 -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_or_b32 s2, s2, s5 -; GCN-NEXT: s_or_b32 s4, s6, s4 -; GCN-NEXT: s_add_i32 s2, s2, 0x40000 -; GCN-NEXT: s_add_i32 s4, s4, 0x40000 -; GCN-NEXT: s_add_u32 s4, s4, 1 -; GCN-NEXT: s_addc_u32 s5, s2, 0 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 
-; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4i16_to_i64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s2, 0xffff0000 -; VI-NEXT: s_add_i32 s1, s2, 4 -; VI-NEXT: s_and_b32 s2, s3, 0xffff0000 -; VI-NEXT: s_add_i32 s3, s3, 4 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: s_add_i32 s2, s2, 0x40000 -; VI-NEXT: s_add_i32 s0, s0, 0x40000 -; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s2, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4i16_to_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4i16_to_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, s2, 4 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, s3, 4 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x i16>, ptr addrspace(1) %in, align 4 - %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4> - %bc = bitcast <4 x i16> %add.v4i16 to i64 - %add.bitcast = add i64 %bc, 1 - store i64 %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v4f16_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4f16_to_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GCN-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GCN-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, s4 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, s5 -; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4f16_to_i64: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4400 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_add_f16_e64 v4, s2, 4.0 -; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e64 v3, s3, 4.0 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: v_or_b32_e32 v3, v3, v5 -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4f16_to_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4f16_to_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_f16 v1, s3, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x half>, ptr addrspace(1) %in, align 4 - %add.v4half = fadd <4 x half> %load, - %bc = bitcast <4 x half> %add.v4half to i64 - %add.bitcast = add i64 %bc, 1 - store i64 %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @bitcast_i64_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: bitcast_i64_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s2, s4, 4 -; GCN-NEXT: s_addc_u32 s4, s5, 0 -; GCN-NEXT: s_and_b32 s5, s2, 0xffff0000 -; GCN-NEXT: s_add_i32 s2, s2, 1 -; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GCN-NEXT: s_add_i32 s4, s4, 3 -; GCN-NEXT: s_and_b32 s2, s2, 0xffff -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_or_b32 s2, s5, s2 -; GCN-NEXT: s_or_b32 s4, s6, s4 -; GCN-NEXT: s_add_i32 s5, s2, 0x20000 -; GCN-NEXT: s_add_i32 s4, s4, 0x40000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_i64_to_v4i16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: 
s_add_u32 s0, s2, 4 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_and_b32 s2, s0, 0xffff0000 -; VI-NEXT: s_add_i32 s0, s0, 1 -; VI-NEXT: s_and_b32 s3, s1, 0xffff0000 -; VI-NEXT: s_add_i32 s1, s1, 3 -; VI-NEXT: s_and_b32 s0, s0, 0xffff -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s2, s0 -; VI-NEXT: s_or_b32 s1, s3, s1 -; VI-NEXT: s_add_i32 s0, s0, 0x20000 -; VI-NEXT: s_add_i32 s1, s1, 0x40000 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_i64_to_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s2, 4 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_pk_add_u16 v1, s3, v0 -; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_i64_to_v4i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s2, s2, 4 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %val = load i64, ptr addrspace(1) %in, align 8 - %add = add i64 %val, 4 - %bc = bitcast i64 %add to <4 x i16> - %add.v4i16 = add <4 x i16> %bc, - store <4 x i16> %add.v4i16, ptr addrspace(1) %out, align 8 - ret void -} - - -define amdgpu_kernel void @bitcast_i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: bitcast_i64_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s4, 4 -; GCN-NEXT: s_addc_u32 s5, s5, 0 -; GCN-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GCN-NEXT: s_lshr_b32 s4, s5, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s6 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4 -; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_or_b32_e32 v0, v0, v4 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_i64_to_v4f16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4800 -; VI-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 4 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: s_lshr_b32 s3, s1, 16 -; VI-NEXT: s_lshr_b32 s2, s0, 16 -; 
VI-NEXT: v_mov_b32_e32 v6, s3 -; VI-NEXT: v_add_f16_e64 v4, s1, 4.0 -; VI-NEXT: v_mov_b32_e32 v5, s2 -; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v5, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v4, v2 -; VI-NEXT: v_add_f16_e64 v2, s0, 1.0 -; VI-NEXT: v_or_b32_e32 v2, v2, v5 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_i64_to_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s2, s2, 4 -; GFX9-NEXT: s_addc_u32 s3, s3, 0 -; GFX9-NEXT: v_pk_add_f16 v1, s3, v0 -; GFX9-NEXT: v_pk_add_f16 v0, s2, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_i64_to_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_u32 s2, s2, 4 -; GFX11-NEXT: s_addc_u32 s3, s3, 0 -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 -; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %val = load i64, ptr addrspace(1) %in, align 8 - %add = add i64 %val, 4 - %bc = bitcast i64 %add to <4 x half> - %add.v4i16 = fadd <4 x half> %bc, - store <4 x half> %add.v4i16, ptr addrspace(1) %out, align 8 - ret void -} - - -define amdgpu_kernel void @v4i16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4i16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s4, 0xffff0000 -; GCN-NEXT: s_add_i32 s4, s4, 4 -; GCN-NEXT: s_and_b32 s7, s5, 0xffff0000 -; GCN-NEXT: s_add_i32 s5, s5, 4 -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: s_or_b32 s4, s6, s4 -; GCN-NEXT: s_or_b32 s5, s7, s5 -; GCN-NEXT: s_add_i32 s4, s4, 0x40000 -; GCN-NEXT: s_add_i32 s5, s5, 0x40000 -; GCN-NEXT: v_add_f32_e64 v1, s5, 1.0 -; GCN-NEXT: v_add_f32_e64 v0, s4, 1.0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4i16_to_v2f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s3, 0xffff0000 -; VI-NEXT: s_add_i32 s1, s3, 4 -; VI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; VI-NEXT: s_add_i32 s2, s2, 4 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s2, s3, s2 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: s_add_i32 s2, s2, 0x40000 -; VI-NEXT: s_add_i32 s0, s0, 0x40000 -; VI-NEXT: v_add_f32_e64 v3, s0, 1.0 -; VI-NEXT: v_add_f32_e64 v2, s2, 1.0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4i16_to_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4i16_to_v2f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x i16>, ptr addrspace(1) %in, align 4 - %add.v4i16 = add <4 x i16> %load, - %bc = bitcast <4 x i16> %add.v4i16 to <2 x float> - %fadd.bitcast = fadd <2 x float> %bc, - store <2 x float> %fadd.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v4f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4f16_to_v2f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5 -; GCN-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GCN-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4 -; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v3 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4f16_to_v2f32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4400 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_add_f16_e64 v3, s2, 4.0 -; VI-NEXT: v_add_f16_e64 v4, s3, 4.0 -; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v3, v5 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: v_add_f32_e32 v3, 1.0, v2 -; VI-NEXT: v_add_f32_e32 v2, 1.0, v5 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4f16_to_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4f16_to_v2f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v0 :: v_dual_add_f32 v0, 1.0, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x half>, ptr addrspace(1) %in, align 4 - %add.v4half = fadd <4 x half> %load, - %bc = bitcast <4 x half> %add.v4half to <2 x float> - %fadd.bitcast = fadd <2 x float> %bc, - store <2 x float> %fadd.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v2f32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v2f32_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_f32_e64 v0, s4, 2.0 -; GCN-NEXT: v_add_f32_e64 v1, s5, 4.0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_or_b32_e32 v1, v2, v1 -; GCN-NEXT: v_or_b32_e32 v0, v3, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0x40000, v1 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x20000, v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v2f32_to_v4i16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v2, s3, 4.0 -; VI-NEXT: v_add_f32_e64 v3, s2, 2.0 -; VI-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x40000, v2 -; VI-NEXT: v_or_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x20000, v2 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v2f32_to_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s2, 0x40003 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0 -; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0 -; GFX9-NEXT: v_pk_add_u16 v1, v1, s2 -; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: 
global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v2f32_to_v4i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0 -; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, v0 -; GFX11-NEXT: v_pk_sub_u16 v0, v2, -2 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <2 x float>, ptr addrspace(1) %in, align 4 - %add.v2f32 = fadd <2 x float> %load, - %bc = bitcast <2 x float> %add.v2f32 to <4 x i16> - %add.bitcast = add <4 x i16> %bc, - store <4 x i16> %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v2f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v2f32_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_f32_e64 v0, s5, 4.0 -; GCN-NEXT: v_add_f32_e64 v1, s4, 2.0 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GCN-NEXT: v_add_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x41000000, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_add_f32_e32 v3, 2.0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v0, v1 -; GCN-NEXT: v_or_b32_e32 v0, v4, v2 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v2f32_to_v4f16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4800 -; VI-NEXT: v_mov_b32_e32 v3, 0x4000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_f32_e64 v4, s2, 2.0 -; VI-NEXT: v_add_f32_e64 v5, s3, 4.0 -; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 4.0, v5 -; VI-NEXT: v_add_f16_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 1.0, v4 -; VI-NEXT: v_or_b32_e32 v3, v5, v2 -; VI-NEXT: v_or_b32_e32 v2, v4, v6 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v2f32_to_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s2, 0x48004400 -; GFX9-NEXT: s_mov_b32 s3, 0x40003c00 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_f32_e64 v0, s4, 2.0 -; GFX9-NEXT: v_add_f32_e64 v1, s5, 4.0 -; GFX9-NEXT: v_pk_add_f16 v1, v1, s2 -; GFX9-NEXT: v_pk_add_f16 v0, v0, s3 -; GFX9-NEXT: global_store_dwordx2 v2, 
v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v2f32_to_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v0, s3, 4.0 -; GFX11-NEXT: v_add_f32_e64 v2, s2, 2.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, v0 -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <2 x float>, ptr addrspace(1) %in, align 4 - %add.v2f32 = fadd <2 x float> %load, - %bc = bitcast <2 x float> %add.v2f32 to <4 x half> - %add.bitcast = fadd <4 x half> %bc, - store <4 x half> %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v4i16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4i16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s2, s4, 0xffff0000 -; GCN-NEXT: s_add_i32 s4, s4, 4 -; GCN-NEXT: s_and_b32 s6, s5, 0xffff0000 -; GCN-NEXT: s_add_i32 s5, s5, 4 -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: s_or_b32 s4, s6, s5 -; GCN-NEXT: s_add_i32 s4, s4, 0x40001 -; GCN-NEXT: s_add_i32 s5, s2, 0x40001 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4i16_to_v2i32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s3, 0xffff0000 -; VI-NEXT: s_add_i32 s1, s3, 4 -; VI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; VI-NEXT: s_add_i32 s2, s2, 4 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s2, s3, s2 -; VI-NEXT: s_or_b32 s0, s0, s1 -; VI-NEXT: s_add_i32 s0, s0, 0x40001 -; VI-NEXT: s_add_i32 s2, s2, 0x40001 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4i16_to_v2i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_u16 v0, s4, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, s5, 4 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4i16_to_v2i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v0, s3, 4 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, s2, 4 op_sel_hi:[1,0] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v0 
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x i16>, ptr addrspace(1) %in, align 4 - %add.v4i16 = add <4 x i16> %load, - %bc = bitcast <4 x i16> %add.v4i16 to <2 x i32> - %add.bitcast = add <2 x i32> %bc, - store <2 x i32> %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v4f16_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v4f16_to_v2i32: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s5 -; GCN-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GCN-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, s5 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, s4 -; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v0 -; GCN-NEXT: v_add_f32_e32 v3, 4.0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 4.0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v3, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v3 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v4f16_to_v2i32: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4400 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: v_mov_b32_e32 v5, s0 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_add_f16_e64 v3, s2, 4.0 -; VI-NEXT: v_add_f16_e64 v4, s3, 4.0 -; VI-NEXT: v_add_f16_sdwa v5, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v3, v5 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2 -; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v5 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v4f16_to_v2i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_pk_add_f16 v0, s4, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, s5, 4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX9-NEXT: v_add_u32_e32 v0, 1, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v4f16_to_v2i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, s3, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_f16 v2, s2, 4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, 
v0 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v2 -; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <4 x half>, ptr addrspace(1) %in, align 4 - %add.v4half = fadd <4 x half> %load, - %bc = bitcast <4 x half> %add.v4half to <2 x i32> - %add.bitcast = add <2 x i32> %bc, - store <2 x i32> %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v2i32_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GCN-LABEL: v2i32_to_v4i16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s2, s4, 2 -; GCN-NEXT: s_add_i32 s6, s5, 4 -; GCN-NEXT: s_add_i32 s5, s5, 7 -; GCN-NEXT: s_add_i32 s4, s4, 3 -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: s_and_b32 s6, s6, 0xffff0000 -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_and_b32 s2, s2, 0xffff0000 -; GCN-NEXT: s_or_b32 s5, s6, s5 -; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: s_add_i32 s5, s5, 0x40000 -; GCN-NEXT: s_add_i32 s4, s2, 0x20000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v2i32_to_v4i16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s0, s3, 4 -; VI-NEXT: s_add_i32 s1, s2, 2 -; VI-NEXT: s_add_i32 s2, s2, 3 -; VI-NEXT: s_add_i32 s3, s3, 7 -; VI-NEXT: s_and_b32 s1, s1, 0xffff0000 -; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 -; VI-NEXT: s_and_b32 s3, s3, 0xffff -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_or_b32 s0, s0, s3 -; VI-NEXT: s_or_b32 s1, s1, s2 -; VI-NEXT: s_add_i32 s0, s0, 0x40000 -; VI-NEXT: s_add_i32 s1, s1, 0x20000 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v2i32_to_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s4, 2 -; GFX9-NEXT: s_add_i32 s3, s5, 4 -; GFX9-NEXT: v_pk_add_u16 v1, s3, v0 -; GFX9-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v2i32_to_v4i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s3, 4 -; GFX11-NEXT: s_add_i32 s2, s2, 2 -; GFX11-NEXT: v_pk_add_u16 v1, 0x40003, s3 -; GFX11-NEXT: v_pk_sub_u16 v0, s2, -2 op_sel:[0,1] op_sel_hi:[1,0] -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <2 x i32>, ptr addrspace(1) %in, align 4 - %add.v2i32 = add <2 x i32> %load, - %bc = bitcast <2 x i32> %add.v2i32 to <4 x i16> - %add.bitcast = add <4 x i16> %bc, - store <4 x i16> %add.bitcast, ptr addrspace(1) %out - ret void -} - - -define amdgpu_kernel void @v2i32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) 
%in) nounwind { -; GCN-LABEL: v2i32_to_v4f16: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s5, s5, 4 -; GCN-NEXT: s_add_i32 s4, s4, 2 -; GCN-NEXT: s_lshr_b32 s6, s5, 16 -; GCN-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, s7 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, s6 -; GCN-NEXT: v_add_f32_e32 v1, 4.0, v1 -; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_add_f32_e32 v3, 0x41000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_or_b32_e32 v0, v0, v2 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: v2i32_to_v4f16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v2, 0x4800 -; VI-NEXT: v_mov_b32_e32 v4, 0x4000 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_i32 s1, s3, 4 -; VI-NEXT: s_add_i32 s0, s2, 2 -; VI-NEXT: s_lshr_b32 s2, s1, 16 -; VI-NEXT: v_add_f16_e64 v3, s1, 4.0 -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: v_mov_b32_e32 v5, s2 -; VI-NEXT: v_mov_b32_e32 v6, s1 -; VI-NEXT: v_add_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v3, v2 -; VI-NEXT: v_add_f16_sdwa v2, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e64 v4, s0, 1.0 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: v2i32_to_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v0, 0x48004400 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x40003c00 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s2, s4, 2 -; GFX9-NEXT: s_add_i32 s3, s5, 4 -; GFX9-NEXT: v_pk_add_f16 v1, s3, v0 -; GFX9-NEXT: v_pk_add_f16 v0, s2, v3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: v2i32_to_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s3, s3, 4 -; GFX11-NEXT: s_add_i32 s2, s2, 2 -; GFX11-NEXT: v_pk_add_f16 v1, 0x48004400, s3 -; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: s_endpgm - %load = load <2 x i32>, ptr addrspace(1) %in, align 4 - %add.v2i32 = add <2 x i32> %load, - %bc = bitcast <2 x i32> %add.v2i32 to <4 x half> - %add.bitcast = fadd <4 x half> %bc, - store <4 x half> %add.bitcast, ptr addrspace(1) %out - ret void -} - -declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32 immarg) - - - -define <2 x i64> @bitcast_v4f32_to_v2i64(<2 x i64> 
%arg) { -; GCN-LABEL: bitcast_v4f32_to_v2i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_or_b32_e32 v1, s9, v5 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_2 -; GCN-NEXT: ; %bb.1: -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v5 -; GCN-NEXT: s_mov_b32 s4, 0x4f800000 -; GCN-NEXT: s_mov_b32 s5, 0xcf800000 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 -; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v8, s9 -; GCN-NEXT: v_fma_f32 v0, v1, s4, v0 -; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: v_fma_f32 v0, v1, s5, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v9, v6, v1 -; GCN-NEXT: v_mul_lo_u32 v10, v7, v0 -; GCN-NEXT: v_mul_hi_u32 v11, v6, v0 -; GCN-NEXT: v_mul_lo_u32 v12, v6, v0 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v1, v12 -; GCN-NEXT: v_mul_lo_u32 v12, v1, v12 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v9 -; GCN-NEXT: v_mul_lo_u32 v14, v0, v9 -; GCN-NEXT: v_mul_hi_u32 v15, v1, v9 -; GCN-NEXT: v_mul_lo_u32 v9, v1, v9 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v10, vcc -; GCN-NEXT: v_mul_hi_u32 v9, v6, v0 -; GCN-NEXT: v_mul_lo_u32 v7, v7, v0 -; GCN-NEXT: v_mul_lo_u32 v10, v6, v0 -; GCN-NEXT: v_mul_lo_u32 v6, v6, v1 -; GCN-NEXT: v_mul_hi_u32 v11, v1, v10 -; GCN-NEXT: v_mul_lo_u32 v12, v1, v10 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 -; GCN-NEXT: v_mul_lo_u32 v13, v0, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GCN-NEXT: v_mul_hi_u32 v6, s8, v0 -; GCN-NEXT: v_mul_hi_u32 v7, s9, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v9, s8, v1 -; GCN-NEXT: v_mul_lo_u32 v10, s8, v1 -; GCN-NEXT: v_mul_hi_u32 v11, s9, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v0, 
vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc -; GCN-NEXT: v_mul_hi_u32 v6, v4, v0 -; GCN-NEXT: v_mul_lo_u32 v7, v5, v0 -; GCN-NEXT: v_mul_lo_u32 v9, v4, v0 -; GCN-NEXT: v_mul_lo_u32 v10, v4, v1 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v0 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v0 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, s9, v6 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, s8, v9 -; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v5, vcc -; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v4 -; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v5 -; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5 -; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v4, v14, v12, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v4, v13, v11, vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GCN-NEXT: .LBB28_2: ; %Flow1 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB28_4 -; GCN-NEXT: ; %bb.3: -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, v1, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: .LBB28_4: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_or_b32_e32 v5, s11, v3 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB28_6 -; GCN-NEXT: ; %bb.5: -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GCN-NEXT: s_mov_b32 s4, 0x4f800000 -; GCN-NEXT: s_mov_b32 s5, 0xcf800000 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v8, s11 -; GCN-NEXT: v_fma_f32 v4, v5, s4, v4 -; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_fma_f32 v4, v5, s5, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; 
GCN-NEXT: v_mul_lo_u32 v9, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v7, v4 -; GCN-NEXT: v_mul_hi_u32 v11, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v12, v6, v4 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GCN-NEXT: v_mul_hi_u32 v11, v4, v12 -; GCN-NEXT: v_mul_hi_u32 v13, v5, v12 -; GCN-NEXT: v_mul_lo_u32 v12, v5, v12 -; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GCN-NEXT: v_mul_hi_u32 v10, v4, v9 -; GCN-NEXT: v_mul_lo_u32 v14, v4, v9 -; GCN-NEXT: v_mul_hi_u32 v15, v5, v9 -; GCN-NEXT: v_mul_lo_u32 v9, v5, v9 -; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v14 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v13, vcc -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v10, vcc -; GCN-NEXT: v_mul_hi_u32 v9, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v7, v7, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v6, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v6, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v5, v10 -; GCN-NEXT: v_mul_lo_u32 v12, v5, v10 -; GCN-NEXT: v_mul_hi_u32 v10, v4, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_mul_hi_u32 v7, v5, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v4, v6 -; GCN-NEXT: v_mul_lo_u32 v13, v4, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v5, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc -; GCN-NEXT: v_mul_hi_u32 v6, s10, v4 -; GCN-NEXT: v_mul_hi_u32 v7, s11, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s11, v4 -; GCN-NEXT: v_mul_hi_u32 v9, s10, v5 -; GCN-NEXT: v_mul_lo_u32 v10, s10, v5 -; GCN-NEXT: v_mul_hi_u32 v11, s11, v5 -; GCN-NEXT: v_mul_lo_u32 v5, s11, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v7, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v9, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 2, v4 -; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v4 -; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GCN-NEXT: v_sub_i32_e32 v7, vcc, s11, v6 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, s10, v9 -; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v7, v3, vcc -; GCN-NEXT: v_subb_u32_e32 v6, vcc, v8, v6, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2 -; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GCN-NEXT: v_sub_i32_e32 v9, vcc, v9, v2 -; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; 
GCN-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 -; GCN-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v2, v13, v11, vcc -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5] -; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN-NEXT: .LBB28_6: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GCN-NEXT: s_cbranch_execz .LBB28_8 -; GCN-NEXT: ; %bb.7: -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_mul_lo_u32 v3, v3, v4 -; GCN-NEXT: v_mul_hi_u32 v3, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v3 -; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v4, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GCN-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: .LBB28_8: -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: v_mov_b32_e32 v3, v5 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v4f32_to_v2i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0 -; VI-NEXT: v_mov_b32_e32 v5, v1 -; VI-NEXT: v_mov_b32_e32 v4, v0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v1, s9, v5 -; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_2 -; VI-NEXT: ; %bb.1: -; VI-NEXT: v_cvt_f32_u32_e32 v0, v4 -; VI-NEXT: v_cvt_f32_u32_e32 v1, v5 -; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 -; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc -; VI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 -; VI-NEXT: v_rcp_f32_e32 v0, v0 -; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; VI-NEXT: v_trunc_f32_e32 v1, v1 -; VI-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; VI-NEXT: v_cvt_u32_f32_e32 v8, v1 -; VI-NEXT: v_cvt_u32_f32_e32 v9, v0 -; VI-NEXT: v_mul_lo_u32 v6, v10, v8 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v9, 0 -; VI-NEXT: v_mul_lo_u32 v7, v11, v9 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v6 -; VI-NEXT: v_add_u32_e32 v13, vcc, v1, v7 -; VI-NEXT: v_mul_hi_u32 v12, v9, v0 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v13, 0 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v6 -; VI-NEXT: v_addc_u32_e32 v14, vcc, 0, v7, vcc -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v13, 0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v12, v0 -; VI-NEXT: v_addc_u32_e32 v0, vcc, v14, v1, vcc -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v0 -; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0 -; VI-NEXT: v_mul_lo_u32 v8, v10, v13 -; 
VI-NEXT: v_mul_lo_u32 v9, v11, v12 -; VI-NEXT: v_mul_hi_u32 v10, v12, v0 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v0, 0 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v1, 0 -; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0 -; VI-NEXT: v_mul_hi_u32 v8, s8, v6 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v0 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v8, v0 -; VI-NEXT: v_addc_u32_e32 v0, vcc, v9, v1, vcc -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v0, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_mul_lo_u32 v8, v4, v7 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 -; VI-NEXT: v_mul_lo_u32 v9, v5, v6 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v9 -; VI-NEXT: v_sub_u32_e32 v8, vcc, s9, v1 -; VI-NEXT: v_sub_u32_e32 v0, vcc, s8, v0 -; VI-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v5, vcc -; VI-NEXT: v_sub_u32_e64 v9, s[4:5], v0, v4 -; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] -; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 -; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] -; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6 -; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5] -; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6 -; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5] -; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] -; VI-NEXT: v_mov_b32_e32 v10, s9 -; VI-NEXT: v_subb_u32_e32 v1, vcc, v10, v1, vcc -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; VI-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5] -; VI-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; VI-NEXT: .LBB28_2: ; %Flow1 -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; VI-NEXT: s_cbranch_execz .LBB28_4 -; VI-NEXT: ; %bb.3: -; VI-NEXT: v_cvt_f32_u32_e32 v0, v4 -; VI-NEXT: v_sub_u32_e32 v1, vcc, 0, v4 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, v0 -; VI-NEXT: v_mul_hi_u32 v1, v0, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: v_mul_hi_u32 v0, s8, v0 -; VI-NEXT: v_mul_lo_u32 v1, v0, v4 -; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0 -; VI-NEXT: v_sub_u32_e32 v1, vcc, s8, v1 -; VI-NEXT: v_sub_u32_e32 v6, vcc, v1, v4 -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; 
VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v0 -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: .LBB28_4: -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_or_b32_e32 v5, s11, v3 -; VI-NEXT: v_mov_b32_e32 v4, 0 -; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; VI-NEXT: ; implicit-def: $vgpr4_vgpr5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB28_6 -; VI-NEXT: ; %bb.5: -; VI-NEXT: v_cvt_f32_u32_e32 v4, v2 -; VI-NEXT: v_cvt_f32_u32_e32 v5, v3 -; VI-NEXT: v_sub_u32_e32 v10, vcc, 0, v2 -; VI-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc -; VI-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 -; VI-NEXT: v_rcp_f32_e32 v4, v4 -; VI-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; VI-NEXT: v_trunc_f32_e32 v5, v5 -; VI-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 -; VI-NEXT: v_cvt_u32_f32_e32 v8, v5 -; VI-NEXT: v_cvt_u32_f32_e32 v9, v4 -; VI-NEXT: v_mul_lo_u32 v6, v10, v8 -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; VI-NEXT: v_mul_lo_u32 v7, v11, v9 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6 -; VI-NEXT: v_add_u32_e32 v7, vcc, v5, v7 -; VI-NEXT: v_mul_hi_u32 v12, v9, v4 -; VI-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 -; VI-NEXT: v_add_u32_e32 v12, vcc, v12, v5 -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 -; VI-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; VI-NEXT: v_addc_u32_e32 v4, vcc, v13, v5, vcc -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v12, vcc, v9, v4 -; VI-NEXT: v_addc_u32_e32 v13, vcc, v8, v5, vcc -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 -; VI-NEXT: v_mul_lo_u32 v8, v10, v13 -; VI-NEXT: v_mul_lo_u32 v9, v11, v12 -; VI-NEXT: v_mul_hi_u32 v10, v12, v4 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v4, 0 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9 -; VI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, 0 -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v5, 0 -; VI-NEXT: v_add_u32_e32 v8, vcc, v10, v8 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; VI-NEXT: v_addc_u32_e32 v6, vcc, v9, v7, vcc -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v12, v4 -; VI-NEXT: v_addc_u32_e32 v7, vcc, v13, v5, vcc -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0 -; VI-NEXT: v_mul_hi_u32 v8, s10, v6 -; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v4 -; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0 -; VI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0 -; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4 -; VI-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc -; VI-NEXT: v_mul_lo_u32 v8, v2, v7 -; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 -; VI-NEXT: v_mul_lo_u32 v9, v3, v6 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8 -; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v9 -; VI-NEXT: v_sub_u32_e32 v8, vcc, s11, v5 -; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4 -; VI-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v3, vcc -; 
VI-NEXT: v_sub_u32_e64 v9, s[4:5], v4, v2 -; VI-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] -; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; VI-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 -; VI-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 -; VI-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] -; VI-NEXT: v_add_u32_e64 v9, s[4:5], 2, v6 -; VI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, v7, s[4:5] -; VI-NEXT: v_add_u32_e64 v11, s[4:5], 1, v6 -; VI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, v7, s[4:5] -; VI-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; VI-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] -; VI-NEXT: v_mov_b32_e32 v10, s11 -; VI-NEXT: v_subb_u32_e32 v5, vcc, v10, v5, vcc -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; VI-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; VI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; VI-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; VI-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5] -; VI-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; VI-NEXT: ; implicit-def: $vgpr2_vgpr3 -; VI-NEXT: .LBB28_6: ; %Flow -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; VI-NEXT: s_cbranch_execz .LBB28_8 -; VI-NEXT: ; %bb.7: -; VI-NEXT: v_cvt_f32_u32_e32 v3, v2 -; VI-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 -; VI-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; VI-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; VI-NEXT: v_cvt_u32_f32_e32 v3, v3 -; VI-NEXT: v_mul_lo_u32 v4, v4, v3 -; VI-NEXT: v_mul_hi_u32 v4, v3, v4 -; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; VI-NEXT: v_mul_hi_u32 v3, s10, v3 -; VI-NEXT: v_mul_lo_u32 v4, v3, v2 -; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; VI-NEXT: v_sub_u32_e32 v4, vcc, s10, v4 -; VI-NEXT: v_sub_u32_e32 v6, vcc, v4, v2 -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; VI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; VI-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; VI-NEXT: v_add_u32_e32 v5, vcc, 1, v3 -; VI-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; VI-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: .LBB28_8: -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v2, v4 -; VI-NEXT: v_mov_b32_e32 v3, v5 -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: bitcast_v4f32_to_v2i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_buffer_load_dwordx4 s[8:11], s[4:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, s9, v5 -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v5, vcc -; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 -; GFX9-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 -; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 -; GFX9-NEXT: v_mad_u64_u32 
v[0:1], s[4:5], v10, v9, 0 -; GFX9-NEXT: v_add3_u32 v12, v1, v6, v7 -; GFX9-NEXT: v_mul_hi_u32 v1, v9, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v12, 0 -; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v1, v6 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v7, vcc -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v12, 0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v13, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v14, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 -; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v12, 0 -; GFX9-NEXT: v_add3_u32 v1, v1, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v1, 0 -; GFX9-NEXT: v_mul_hi_u32 v10, v12, v0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v1, 0 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s8, v7, 0 -; GFX9-NEXT: v_mul_hi_u32 v8, s8, v6 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s9, v6, 0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s9, v7, 0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v9, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v5, v6 -; GFX9-NEXT: v_mul_lo_u32 v9, v4, v7 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v6, 0 -; GFX9-NEXT: v_add3_u32 v1, v1, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, s9, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v5, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v0, v4 -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s9 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v10, v1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; 
GFX9-NEXT: v_cndmask_b32_e64 v0, v11, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: .LBB28_2: ; %Flow1 -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB28_4 -; GFX9-NEXT: ; %bb.3: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 0, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, v1, v4 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: .LBB28_4: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v5, s11, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB28_6 -; GFX9-NEXT: ; %bb.5: -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v10, vcc, 0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX9-NEXT: v_madmk_f32 v4, v5, 0x4f800000, v4 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_madmk_f32 v4, v5, 0xcf800000, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v8 -; GFX9-NEXT: v_mul_lo_u32 v7, v11, v9 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mul_hi_u32 v12, v9, v4 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 -; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 -; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 -; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 
v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0 -; GFX9-NEXT: v_mul_hi_u32 v8, s10, v6 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s11, v6, 0 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s11, v7, 0 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 -; GFX9-NEXT: v_mul_lo_u32 v9, v2, v7 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, 0 -; GFX9-NEXT: v_add3_u32 v5, v5, v9, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, s11, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s10, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[4:5], v8, v3, vcc -; GFX9-NEXT: v_sub_co_u32_e64 v9, s[4:5], v4, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[4:5], 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v9, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v9, s[4:5], 2, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_add_co_u32_e64 v11, s[4:5], 1, v6 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, v7, s[4:5] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v10, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v10, v5, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v11, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX9-NEXT: .LBB28_6: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] -; GFX9-NEXT: s_cbranch_execz .LBB28_8 -; GFX9-NEXT: ; %bb.7: -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v2 -; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, v3 -; GFX9-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v4 -; GFX9-NEXT: v_mul_hi_u32 v3, s10, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, v2 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, s10, v4 -; GFX9-NEXT: v_sub_u32_e32 v6, v4, v2 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: .LBB28_8: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, v5 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v4f32_to_v2i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_buffer_load_b128 s[4:7], s[0:3], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, s5, v5 -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_2 -; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v5 -; GFX11-NEXT: v_sub_co_u32 v10, vcc_lo, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_co_ci_u32_e64 v11, null, 0, v5, vcc_lo -; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GFX11-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v1, v1 -; GFX11-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 -; GFX11-NEXT: v_cvt_u32_f32_e32 v12, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v0 -; GFX11-NEXT: v_mul_lo_u32 v6, v10, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0 -; GFX11-NEXT: v_add3_u32 v14, v1, v6, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_hi_u32 v15, v13, v0 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v14, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v15, v6 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8 -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v13, vcc_lo, v13, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, v12, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v10, v13, 0 -; GFX11-NEXT: v_mul_lo_u32 v7, v10, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_hi_u32 v11, v13, v0 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v12, v0, 0 -; GFX11-NEXT: v_add3_u32 v10, v1, v7, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v13, 
v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v12, v10, 0 -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v11, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v9, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v13, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, v12, v1, vcc_lo -; GFX11-NEXT: v_mul_hi_u32 v11, s4, v8 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s5, v8, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, s4, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s5, v10, 0 -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v11, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v1, v7, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo -; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v0, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo -; GFX11-NEXT: v_mul_lo_u32 v8, v5, v6 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_lo_u32 v9, v4, v7 -; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8 -; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v8, s5, v1 -; GFX11-NEXT: v_sub_co_u32 v11, s0, v0, v4 -; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, v1, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, null, v8, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v4 -; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, null, 0, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1 -; GFX11-NEXT: 
v_add_co_ci_u32_e64 v12, null, 0, v7, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v1 :: v_dual_cndmask_b32 v0, v6, v4 -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: .LBB28_2: ; %Flow1 -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1 -; GFX11-NEXT: s_cbranch_execz .LBB28_4 -; GFX11-NEXT: ; %bb.3: -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v4 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v1, v1, v0 -; GFX11-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_lo_u32 v1, v0, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v6, v1, v4 -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v6 :: v_dual_cndmask_b32 v0, v0, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo -; GFX11-NEXT: .LBB28_4: -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_or_b32_e32 v5, s7, v3 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_xor_b32 s1, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB28_6 -; GFX11-NEXT: ; %bb.5: -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX11-NEXT: v_sub_co_u32 v11, vcc_lo, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_co_ci_u32_e64 v12, null, 0, v3, vcc_lo -; GFX11-NEXT: v_fmamk_f32 v4, v5, 0x4f800000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GFX11-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fmamk_f32 v4, v5, 0xcf800000, v4 -; GFX11-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u32_f32_e32 v14, v4 -; GFX11-NEXT: v_mul_lo_u32 v6, v11, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v7, v12, v14 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0 -; GFX11-NEXT: v_add3_u32 v15, v5, v6, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_hi_u32 v16, v14, v4 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v15, 0 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v15, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v16, v5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 -; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, v13, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_lo_u32 v6, v12, v14 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v11, v14, 0 -; GFX11-NEXT: v_mul_lo_u32 v7, v11, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_hi_u32 v12, v14, v4 -; GFX11-NEXT: v_add3_u32 v11, v5, v7, v6 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v13, v4, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v14, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v13, v11, 0 -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v12, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v6, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v8, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v10, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, v13, v5, vcc_lo -; GFX11-NEXT: v_mul_hi_u32 v11, s6, v8 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, s7, v8, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, s6, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, s7, v10, 0 -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v11, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v7, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v9, vcc_lo -; GFX11-NEXT: 
v_add_co_u32 v6, vcc_lo, v4, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v5, vcc_lo -; GFX11-NEXT: v_mul_lo_u32 v8, v3, v6 -; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v2, v6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_lo_u32 v9, v2, v7 -; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, s6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add3_u32 v5, v5, v9, v8 -; GFX11-NEXT: v_add_co_u32 v9, s0, v6, 2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v7, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v8, s7, v5 -; GFX11-NEXT: v_sub_co_u32 v11, s0, v4, v2 -; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, s7, v5, vcc_lo -; GFX11-NEXT: v_sub_co_ci_u32_e64 v8, null, v8, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v11, v2 -; GFX11-NEXT: v_subrev_co_ci_u32_e64 v8, null, 0, v8, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v8, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v5, v3 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v12, v11, vcc_lo -; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, v6, 1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v7, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_dual_cndmask_b32 v3, v12, v10 :: v_dual_cndmask_b32 v4, v11, v9 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v5, v7, v3 :: v_dual_cndmask_b32 v4, v6, v4 -; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX11-NEXT: .LBB28_6: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s1 -; GFX11-NEXT: s_cbranch_execz .LBB28_8 -; GFX11-NEXT: ; %bb.7: -; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v2 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v4, v4, v3 -; GFX11-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX11-NEXT: v_mul_hi_u32 v3, s6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_lo_u32 v4, v3, v2 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, s6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_nc_u32_e32 v6, v4, v2 -; GFX11-NEXT: v_cmp_ge_u32_e32 
vcc_lo, v4, v2
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_cndmask_b32 v3, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v5, 1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v4, v3, v5 :: v_dual_mov_b32 v5, 0
-; GFX11-NEXT: .LBB28_8:
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %val = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> poison, i32 0, i32 0)
- %cast = bitcast <4 x float> %val to <2 x i64>
- %div = udiv <2 x i64> %cast, %arg
- ret <2 x i64> %div
-}
-
-declare half @llvm.canonicalize.f16(half)
-
-
-define amdgpu_kernel void @bitcast_f32_to_v1i32(ptr addrspace(1) %out) {
-; GCN-LABEL: bitcast_f32_to_v1i32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, 0x387c0000
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_f32_to_v1i32:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; VI-NEXT: v_mov_b32_e32 v2, 0x387c0000
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_f32_to_v1i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x387c0000
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_f32_to_v1i32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x387c0000
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
- %f16 = call arcp afn half @llvm.canonicalize.f16(half 0xH03F0)
- %f32 = fpext half %f16 to float
- %v = bitcast float %f32 to <1 x i32>
- %v1 = extractelement <1 x i32> %v, i32 0
- store i32 %v1, ptr addrspace(1) %out
- ret void
-}
-
-
-define amdgpu_kernel void @bitcast_v4i64_to_v16i16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) {
-; GCN-LABEL: bitcast_v4i64_to_v16i16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s1, 0
-; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s15, 0xf000
-; GCN-NEXT: s_mov_b32 s14, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v4i64_to_v16i16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s9, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s9, 0
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v4i64_to_v16i16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v4i64_to_v16i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_mov_b32_e32 v6, s0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <4 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <4 x i64> %phi_value to <16 x i16>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <16 x i16> [zeroinitializer, %entry], [%cast, %if]
- store <16 x i16> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-define amdgpu_kernel void @bitcast_v4f64_to_v16f16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) {
-; GCN-LABEL: bitcast_v4f64_to_v16f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s11, 0
-; GCN-NEXT: s_mov_b32 s18, 0
-; GCN-NEXT: s_mov_b32 s15, 0
-; GCN-NEXT: s_mov_b32 s19, 0
-; GCN-NEXT: s_mov_b32 s16, 0
-; GCN-NEXT: s_mov_b32 s20, 0
-; GCN-NEXT: s_mov_b32 s17, 0
-; GCN-NEXT: s_mov_b32 s21, 0
-; GCN-NEXT: s_mov_b32 s8, 0
-; GCN-NEXT: s_mov_b32 s12, 0
-; GCN-NEXT: s_mov_b32 s9, 0
-; GCN-NEXT: s_mov_b32 s13, 0
-; GCN-NEXT: s_mov_b32 s10, 0
-; GCN-NEXT: s_mov_b32 s14, 0
-; GCN-NEXT: s_mov_b32 s7, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, s18
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, s11
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, s19
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, s15
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, s20
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, s16
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, s21
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, s17
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, s12
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, s8
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, s13
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, s9
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, s14
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, s10
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, s6
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, s7
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_or_b32_e32 v2, v5, v4
-; GCN-NEXT: v_or_b32_e32 v3, v7, v6
-; GCN-NEXT: v_or_b32_e32 v4, v9, v8
-; GCN-NEXT: v_or_b32_e32 v5, v11, v10
-; GCN-NEXT: v_or_b32_e32 v6, v13, v12
-; GCN-NEXT: v_or_b32_e32 v7, v15, v14
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v4f64_to_v16f16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s9, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s9, 0
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v4f64_to_v16f16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v4f64_to_v16f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_mov_b32_e32 v6, s0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <4 x double> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <4 x double> %phi_value to <16 x half>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <16 x half> [zeroinitializer, %entry], [%cast, %if]
- store <16 x half> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-define amdgpu_kernel void @bitcast_v16i16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x i16> %value) {
-; GCN-LABEL: bitcast_v16i16_to_v4i64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s1, 0
-; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s15, 0xf000
-; GCN-NEXT: s_mov_b32 s14, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v16i16_to_v4i64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s9, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s9, 0
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v16i16_to_v4i64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v16i16_to_v4i64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_mov_b32_e32 v6, s0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <16 x i16> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <16 x i16> %phi_value to <4 x i64>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <4 x i64> [zeroinitializer, %entry], [%cast, %if]
- store <4 x i64> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-define amdgpu_kernel void @bitcast_v16f16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x half> %value) {
-; GCN-LABEL: bitcast_v16f16_to_v4f64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s1, s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s1, 0
-; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s15, 0xf000
-; GCN-NEXT: s_mov_b32 s14, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v16f16_to_v4f64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s9, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s9, 0
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v16f16_to_v4f64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x2c
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11] offset:16
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[10:11]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v16f16_to_v4f64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_mov_b32_e32 v6, s0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5]
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <16 x half> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <16 x half> %phi_value to <4 x double>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <4 x double> [zeroinitializer, %entry], [%cast, %if]
- store <4 x double> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v20f16_to_v5f64(i32 %cond, ptr addrspace(1) %out, <20 x half> %value) {
-; GCN-LABEL: bitcast_v20f16_to_v5f64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s15, 0xf000
-; GCN-NEXT: s_mov_b32 s14, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
-; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v20f16_to_v5f64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s8, s4, 16
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s9, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s9
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_add_u32 s0, s4, 32
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v20f16_to_v5f64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v20f16_to_v5f64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_mov_b32_e32 v8, s0
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
-; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <20 x half> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <20 x half> %phi_value to <5 x double>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
- store <5 x double> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v10f32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) {
-; GCN-LABEL: bitcast_v10f32_to_v5f64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s15, 0xf000
-; GCN-NEXT: s_mov_b32 s14, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
-; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v10f32_to_v5f64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s8, s4, 16
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s9, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s9
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_add_u32 s0, s4, 32
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v10f32_to_v5f64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v10f32_to_v5f64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_mov_b32_e32 v8, s0
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5]
-; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <10 x float> %phi_value to <5 x double>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if]
- store <5 x double> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v10i32_to_v5f64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) {
-; GCN-LABEL: bitcast_v10i32_to_v5f64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s15, 0xf000
-; GCN-NEXT: s_mov_b32 s14, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0
-; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v10i32_to_v5f64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s8, s4, 16
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s9, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v10i32_to_v5f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v10i32_to_v5f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <10 x i32> %phi_value to <5 x double> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if] - store <5 x double> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v10f32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x float> %value) { -; GCN-LABEL: bitcast_v10f32_to_v5i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 -; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; 
GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v10f32_to_v5i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v10f32_to_v5i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v10f32_to_v5i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <10 x float> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <10 x float> %phi_value to <5 x i64> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if] - store <5 x i64> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v10i32_to_v5i64(i32 %cond, ptr addrspace(1) %out, <10 x i32> %value) { -; GCN-LABEL: bitcast_v10i32_to_v5i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: 
s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 -; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v10i32_to_v5i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v10i32_to_v5i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v10i32_to_v5i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], 
s[4:5] offset:32 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <10 x i32> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <10 x i32> %phi_value to <5 x i64> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if] - store <5 x i64> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v40i8_to_v5f64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) { -; GCN-LABEL: bitcast_v40i8_to_v5f64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 -; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v40i8_to_v5f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v40i8_to_v5f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v40i8_to_v5f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: 
s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <40 x i8> %phi_value to <5 x double> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <5 x double> [zeroinitializer, %entry], [%cast, %if] - store <5 x double> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v40i8_to_v5i64(i32 %cond, ptr addrspace(1) %out, <40 x i8> %value) { -; GCN-LABEL: bitcast_v40i8_to_v5i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 -; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v40i8_to_v5i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v40i8_to_v5i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v40i8_to_v5i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <40 x i8> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <40 x i8> %phi_value to <5 x i64> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <5 x i64> [zeroinitializer, %entry], [%cast, %if] - store <5 x i64> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v5f64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) { -; GCN-LABEL: bitcast_v5f64_to_v10f32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 -; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v5f64_to_v10f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s7, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s7, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; 
VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v5f64_to_v10f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v5f64_to_v10f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 -; GFX11-NEXT: s_cmp_lg_u32 s7, 0 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <5 x double> %phi_value to <10 x float> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if] - store <10 x float> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v5f64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x double> %value) { -; GCN-LABEL: bitcast_v5f64_to_v10i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 
-; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v5f64_to_v10i32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s7, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s7, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v5f64_to_v10i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v5f64_to_v10i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 -; GFX11-NEXT: s_cmp_lg_u32 s7, 0 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <5 x double> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <5 x double> %phi_value to <10 x i32> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if] - store <10 x i32> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v5i64_to_v10f32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) { -; GCN-LABEL: bitcast_v5i64_to_v10f32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 -; 
GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 -; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v5i64_to_v10f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s7, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s7, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v5i64_to_v10f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v5i64_to_v10f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] 
offset:32 -; GFX11-NEXT: s_cmp_lg_u32 s7, 0 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <5 x i64> %phi_value to <10 x float> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <10 x float> [zeroinitializer, %entry], [%cast, %if] - store <10 x float> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v5i64_to_v10i32(i32 %cond, ptr addrspace(1) %out, <5 x i64> %value) { -; GCN-LABEL: bitcast_v5i64_to_v10i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s15, 0xf000 -; GCN-NEXT: s_mov_b32 s14, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 -; GCN-NEXT: buffer_store_dwordx2 v[8:9], off, s[12:15], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v5i64_to_v10i32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s7, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s7, 0 -; VI-NEXT: s_add_u32 s8, s4, 16 -; VI-NEXT: s_addc_u32 s9, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s9 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v5i64_to_v10i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s7, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] offset:16 -; GFX9-NEXT: s_cmp_lg_u32 s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[12:13] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v5i64_to_v10i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s7, s[4:5], 
0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_mov_b32_e32 v8, s0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b64 v10, v[8:9], s[4:5] offset:32 -; GFX11-NEXT: s_cmp_lg_u32 s7, 0 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <5 x i64> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <5 x i64> %phi_value to <10 x i32> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <10 x i32> [zeroinitializer, %entry], [%cast, %if] - store <10 x i32> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v6f64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) { -; GCN-LABEL: bitcast_v6f64_to_v12i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NEXT: v_mov_b32_e32 v11, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v6f64_to_v12i32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s9, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_mov_b32 s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s9, 0 -; VI-NEXT: s_add_u32 s10, s4, 16 -; VI-NEXT: s_addc_u32 s11, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v6f64_to_v12i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[14:15], 
s[4:5], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v6f64_to_v12i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 -; GFX11-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <6 x double> %phi_value to <12 x i32> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if] - store <12 x i32> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v6f64_to_v12f32(i32 %cond, ptr addrspace(1) %out, <6 x double> %value) { -; GCN-LABEL: bitcast_v6f64_to_v12f32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NEXT: v_mov_b32_e32 v11, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v6f64_to_v12f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s9, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_mov_b32 s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; 
VI-NEXT: s_cmp_lg_u32 s9, 0 -; VI-NEXT: s_add_u32 s10, s4, 16 -; VI-NEXT: s_addc_u32 s11, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v6f64_to_v12f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 -; GFX9-NEXT: s_cmp_lg_u32 s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v6f64_to_v12f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 -; GFX11-NEXT: s_cmp_lg_u32 s9, 0 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <6 x double> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <6 x double> %phi_value to <12 x float> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <12 x float> [zeroinitializer, %entry], [%cast, %if] - store <12 x float> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v12i32_to_v6i64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) { -; GCN-LABEL: bitcast_v12i32_to_v6i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s19, 0xf000 -; 
GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NEXT: v_mov_b32_e32 v11, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v12i32_to_v6i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_add_u32 s10, s4, 16 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_addc_u32 s11, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v12i32_to_v6i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v12i32_to_v6i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: 
global_store_b128 v12, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <12 x i32> %phi_value to <6 x i64> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <6 x i64> [zeroinitializer, %entry], [%cast, %if] - store <6 x i64> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v12i32_to_v6f64(i32 %cond, ptr addrspace(1) %out, <12 x i32> %value) { -; GCN-LABEL: bitcast_v12i32_to_v6f64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NEXT: v_mov_b32_e32 v11, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v12i32_to_v6f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s0, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s0, 0 -; VI-NEXT: s_add_u32 s10, s4, 16 -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_addc_u32 s11, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: s_endpgm -; -; GFX9-LABEL: bitcast_v12i32_to_v6f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_lg_u32 s0, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32 -; GFX9-NEXT: s_endpgm -; -; GFX11-LABEL: bitcast_v12i32_to_v6f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_mov_b32_e32 v10, s0 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5] -; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <12 x i32> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <12 x i32> %phi_value to <6 x double> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <6 x double> [zeroinitializer, %entry], [%cast, %if] - store <6 x double> %phi_cast, ptr addrspace(1) %out - ret void -} - - - -define amdgpu_kernel void @bitcast_v6i64_to_v12i32(i32 %cond, ptr addrspace(1) %out, <6 x i64> %value) { -; GCN-LABEL: bitcast_v6i64_to_v12i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s1, s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s1, 0 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb -; GCN-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NEXT: s_mov_b32 s18, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s0 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_mov_b32_e32 v7, s0 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s0 -; GCN-NEXT: v_mov_b32_e32 v10, s0 -; GCN-NEXT: v_mov_b32_e32 v11, s0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 -; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32 -; GCN-NEXT: s_endpgm -; -; VI-LABEL: bitcast_v6i64_to_v12i32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s9, s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c -; VI-NEXT: s_mov_b32 s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s9, 0 -; VI-NEXT: s_add_u32 s10, s4, 16 -; VI-NEXT: s_addc_u32 s11, s5, 0 -; VI-NEXT: v_mov_b32_e32 v4, s10 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s11 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: s_add_u32 s0, s4, 32 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_addc_u32 s1, s5, 0 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v6i64_to_v12i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s9, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[14:15], s[4:5], 0x2c
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:16
-; GFX9-NEXT: s_cmp_lg_u32 s9, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[14:15] offset:32
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v6i64_to_v12i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s9, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_mov_b32_e32 v10, s0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v12, v[4:7], s[4:5]
-; GFX11-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:32
-; GFX11-NEXT: s_cmp_lg_u32 s9, 0
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <6 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <6 x i64> %phi_value to <12 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <12 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <12 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v7i64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x i64> %value) {
-; GCN-LABEL: bitcast_v7i64_to_v14i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s19, 0xf000
-; GCN-NEXT: s_mov_b32 s18, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: v_mov_b32_e32 v12, s0
-; GCN-NEXT: v_mov_b32_e32 v13, s0
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s0
-; GCN-NEXT: v_mov_b32_e32 v10, s0
-; GCN-NEXT: v_mov_b32_e32 v11, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
-; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v7i64_to_v14i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s12, s4, 16
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s13, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s12
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s13
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s8, s4, 48
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s9, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: s_add_u32 s0, s4, 32
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v7i64_to_v14i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v7i64_to_v14i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_mov_b32_e32 v10, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5]
-; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <7 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <7 x i64> %phi_value to <14 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <14 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v7f64_to_v14i32(i32 %cond, ptr addrspace(1) %out, <7 x double> %value) {
-; GCN-LABEL: bitcast_v7f64_to_v14i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s0, 0
-; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s19, 0xf000
-; GCN-NEXT: s_mov_b32 s18, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: v_mov_b32_e32 v5, s0
-; GCN-NEXT: v_mov_b32_e32 v6, s0
-; GCN-NEXT: v_mov_b32_e32 v7, s0
-; GCN-NEXT: v_mov_b32_e32 v12, s0
-; GCN-NEXT: v_mov_b32_e32 v13, s0
-; GCN-NEXT: v_mov_b32_e32 v8, s0
-; GCN-NEXT: v_mov_b32_e32 v9, s0
-; GCN-NEXT: v_mov_b32_e32 v10, s0
-; GCN-NEXT: v_mov_b32_e32 v11, s0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0
-; GCN-NEXT: buffer_store_dwordx2 v[12:13], off, s[16:19], 0 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[16:19], 0 offset:32
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v7f64_to_v14i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s12, s4, 16
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s13, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s12
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s13
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s8, s4, 48
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s9, s5, 0
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: s_add_u32 s0, s4, 32
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v7f64_to_v14i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] offset:48
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v7f64_to_v14i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v14, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_mov_b32_e32 v10, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v14, v[0:3], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v14, v[4:7], s[4:5]
-; GFX11-NEXT: global_store_b64 v14, v[12:13], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v14, v[8:11], s[4:5] offset:32
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <7 x double> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <7 x double> %phi_value to <14 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <14 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <14 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v9i64_to_v18i32(i32 %cond, ptr addrspace(1) %out, <9 x i64> %value) {
-; GCN-LABEL: bitcast_v9i64_to_v18i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: v_mov_b32_e32 v9, s6
-; GCN-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NEXT: v_mov_b32_e32 v11, s6
-; GCN-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s6
-; GCN-NEXT: v_mov_b32_e32 v15, s6
-; GCN-NEXT: v_mov_b32_e32 v16, s6
-; GCN-NEXT: v_mov_b32_e32 v17, s6
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
-; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:64
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v9i64_to_v18i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s16, s4, 48
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s17, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s17
-; VI-NEXT: s_add_u32 s12, s4, 32
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s13, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s12
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s13
-; VI-NEXT: s_add_u32 s10, s4, 16
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s11, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s10
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s11
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_add_u32 s0, s4, 64
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v9i64_to_v18i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[20:21], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[20:21]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[20:21] offset:64
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v9i64_to_v18i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v18, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
-; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
-; GFX11-NEXT: v_mov_b32_e32 v16, s0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: global_store_b128 v18, v[0:3], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v18, v[4:7], s[4:5] offset:32
-; GFX11-NEXT: global_store_b128 v18, v[8:11], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v18, v[12:15], s[4:5]
-; GFX11-NEXT: global_store_b64 v18, v[16:17], s[4:5] offset:64
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <9 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <9 x i64> %phi_value to <18 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <18 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <18 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v10i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <10 x i64> %value) {
-; GCN-LABEL: bitcast_v10i64_to_v20i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: v_mov_b32_e32 v9, s6
-; GCN-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NEXT: v_mov_b32_e32 v11, s6
-; GCN-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s6
-; GCN-NEXT: v_mov_b32_e32 v15, s6
-; GCN-NEXT: v_mov_b32_e32 v16, s6
-; GCN-NEXT: v_mov_b32_e32 v17, s6
-; GCN-NEXT: v_mov_b32_e32 v18, s6
-; GCN-NEXT: v_mov_b32_e32 v19, s6
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
-; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:64
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v10i64_to_v20i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s18, s4, 48
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s19, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s18
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s19
-; VI-NEXT: s_add_u32 s14, s4, 32
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s15, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s14
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s15
-; VI-NEXT: s_add_u32 s14, s4, 16
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s15, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s14
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s15
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: s_add_u32 s0, s4, 64
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v10i64_to_v20i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[22:23], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[22:23] offset:64
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v10i64_to_v20i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
-; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
-; GFX11-NEXT: v_mov_b32_e32 v18, s0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:32
-; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5]
-; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:64
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <10 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <10 x i64> %phi_value to <20 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <20 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <20 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v11i64_to_v20i32(i32 %cond, ptr addrspace(1) %out, <11 x i64> %value) {
-; GCN-LABEL: bitcast_v11i64_to_v20i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: v_mov_b32_e32 v9, s6
-; GCN-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NEXT: v_mov_b32_e32 v11, s6
-; GCN-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s6
-; GCN-NEXT: v_mov_b32_e32 v15, s6
-; GCN-NEXT: v_mov_b32_e32 v16, s6
-; GCN-NEXT: v_mov_b32_e32 v17, s6
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
-; GCN-NEXT: buffer_store_dwordx2 v[16:17], off, s[0:3], 0 offset:80
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v11i64_to_v20i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s20, s4, 48
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s21, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: s_add_u32 s16, s4, 32
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s17, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s17
-; VI-NEXT: s_add_u32 s10, s4, 16
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s11, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s10
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s11
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_mov_b32 s6, s0
-; VI-NEXT: s_mov_b32 s7, s0
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_add_u32 s0, s4, 0x50
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s4, 64
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v11i64_to_v20i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[24:25] offset:80
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[24:25] offset:64
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v11i64_to_v20i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
-; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v21, s0
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v17, s0
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
-; GFX11-NEXT: v_mov_b32_e32 v18, s0
-; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:32
-; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5]
-; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:80
-; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:64
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <11 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <11 x i64> %phi_value to <22 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <22 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <22 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v12i64_to_v22i32(i32 %cond, ptr addrspace(1) %out, <12 x i64> %value) {
-; GCN-LABEL: bitcast_v12i64_to_v22i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: v_mov_b32_e32 v9, s6
-; GCN-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NEXT: v_mov_b32_e32 v11, s6
-; GCN-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s6
-; GCN-NEXT: v_mov_b32_e32 v15, s6
-; GCN-NEXT: v_mov_b32_e32 v16, s6
-; GCN-NEXT: v_mov_b32_e32 v17, s6
-; GCN-NEXT: v_mov_b32_e32 v18, s6
-; GCN-NEXT: v_mov_b32_e32 v19, s6
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v12i64_to_v22i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s22, s4, 0x50
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s23, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s22
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s23
-; VI-NEXT: s_add_u32 s18, s4, 64
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s19, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s18
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s19
-; VI-NEXT: s_add_u32 s14, s4, 48
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s15, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s14
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s15
-; VI-NEXT: s_add_u32 s10, s4, 32
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s11, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s10
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s11
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v12i64_to_v22i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[26:27], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:80
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:64
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[26:27]
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v12i64_to_v22i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
-; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
-; GFX11-NEXT: v_dual_mov_b32 v18, s0 :: v_dual_mov_b32 v21, s0
-; GFX11-NEXT: v_dual_mov_b32 v20, s0 :: v_dual_mov_b32 v23, s0
-; GFX11-NEXT: v_mov_b32_e32 v22, s0
-; GFX11-NEXT: s_clause 0x5
-; GFX11-NEXT: global_store_b128 v24, v[0:3], s[4:5] offset:80
-; GFX11-NEXT: global_store_b128 v24, v[4:7], s[4:5] offset:64
-; GFX11-NEXT: global_store_b128 v24, v[8:11], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v24, v[12:15], s[4:5] offset:32
-; GFX11-NEXT: global_store_b128 v24, v[16:19], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v24, v[20:23], s[4:5]
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <12 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <12 x i64> %phi_value to <24 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <24 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <24 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v13i64_to_v24i32(i32 %cond, ptr addrspace(1) %out, <13 x i64> %value) {
-; GCN-LABEL: bitcast_v13i64_to_v24i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: v_mov_b32_e32 v9, s6
-; GCN-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NEXT: v_mov_b32_e32 v11, s6
-; GCN-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s6
-; GCN-NEXT: v_mov_b32_e32 v15, s6
-; GCN-NEXT: v_mov_b32_e32 v16, s6
-; GCN-NEXT: v_mov_b32_e32 v17, s6
-; GCN-NEXT: v_mov_b32_e32 v18, s6
-; GCN-NEXT: v_mov_b32_e32 v19, s6
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:96
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v13i64_to_v24i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2c
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_add_u32 s24, s4, 0x50
-; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_addc_u32 s25, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s24
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s25
-; VI-NEXT: s_add_u32 s20, s4, 64
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s21, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s20
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s21
-; VI-NEXT: s_add_u32 s16, s4, 48
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s17, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s17
-; VI-NEXT: s_add_u32 s12, s4, 32
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s13, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s12
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s13
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: s_add_u32 s0, s4, 0x60
-; VI-NEXT: s_addc_u32 s1, s5, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v13i64_to_v24i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[28:29], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:80
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:64
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[28:29]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[28:29] offset:96
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v13i64_to_v24i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
-; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
-; GFX11-NEXT: v_mov_b32_e32 v18, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80
-; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64
-; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0
-; GFX11-NEXT: v_mov_b32_e32 v5, s0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32
-; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5]
-; GFX11-NEXT: global_store_b64 v20, v[4:5], s[4:5] offset:96
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <13 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <13 x i64> %phi_value to <26 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <26 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <26 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v14i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <14 x i64> %value) {
-; GCN-LABEL: bitcast_v14i64_to_v26i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: v_mov_b32_e32 v9, s6
-; GCN-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NEXT: v_mov_b32_e32 v11, s6
-; GCN-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s6
-; GCN-NEXT: v_mov_b32_e32 v15, s6
-; GCN-NEXT: v_mov_b32_e32 v16, s6
-; GCN-NEXT: v_mov_b32_e32 v17, s6
-; GCN-NEXT: v_mov_b32_e32 v18, s6
-; GCN-NEXT: v_mov_b32_e32 v19, s6
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v14i64_to_v26i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s2, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s0, 0x50
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 64
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 48
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 32
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 16
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_add_u32 s0, s0, 0x60
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v14i64_to_v26i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[30:31], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:80
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:64
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[30:31] offset:96
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v14i64_to_v26i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v20, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s0
-; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0
-; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0
-; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0
-; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0
-; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0
-; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0
-; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0
-; GFX11-NEXT: v_mov_b32_e32 v18, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5] offset:80
-; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:64
-; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s0
-; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0
-; GFX11-NEXT: v_mov_b32_e32 v7, s0
-; GFX11-NEXT: s_clause 0x4
-; GFX11-NEXT: global_store_b128 v20, v[8:11], s[4:5] offset:48
-; GFX11-NEXT: global_store_b128 v20, v[12:15], s[4:5] offset:32
-; GFX11-NEXT: global_store_b128 v20, v[16:19], s[4:5] offset:16
-; GFX11-NEXT: global_store_b128 v20, v[0:3], s[4:5]
-; GFX11-NEXT: global_store_b128 v20, v[4:7], s[4:5] offset:96
-; GFX11-NEXT: s_endpgm
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %phi_value = phi <14 x i64> [zeroinitializer, %entry], [%value, %if]
- %cast = bitcast <14 x i64> %phi_value to <28 x i32>
- %cmp1 = icmp eq i32 %cond, 1
- br i1 %cmp1, label %if, label %end
-
-end:
- %phi_cast = phi <28 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <28 x i32> %phi_cast, ptr addrspace(1) %out
- ret void
-}
-
-
-
-define amdgpu_kernel void @bitcast_v15i64_to_v26i32(i32 %cond, ptr addrspace(1) %out, <15 x i64> %value) {
-; GCN-LABEL: bitcast_v15i64_to_v26i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: v_mov_b32_e32 v8, s6
-; GCN-NEXT: v_mov_b32_e32 v9, s6
-; GCN-NEXT: v_mov_b32_e32 v10, s6
-; GCN-NEXT: v_mov_b32_e32 v11, s6
-; GCN-NEXT: v_mov_b32_e32 v12, s6
-; GCN-NEXT: v_mov_b32_e32 v13, s6
-; GCN-NEXT: v_mov_b32_e32 v14, s6
-; GCN-NEXT: v_mov_b32_e32 v15, s6
-; GCN-NEXT: v_mov_b32_e32 v16, s6
-; GCN-NEXT: v_mov_b32_e32 v17, s6
-; GCN-NEXT: v_mov_b32_e32 v18, s6
-; GCN-NEXT: v_mov_b32_e32 v19, s6
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s6
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s6
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NEXT: v_mov_b32_e32 v20, s6
-; GCN-NEXT: v_mov_b32_e32 v21, s6
-; GCN-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_mov_b32_e32 v4, s6
-; GCN-NEXT: v_mov_b32_e32 v5, s6
-; GCN-NEXT: v_mov_b32_e32 v6, s6
-; GCN-NEXT: v_mov_b32_e32 v7, s6
-; GCN-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GCN-NEXT: buffer_store_dwordx2 v[20:21], off, s[0:3], 0 offset:112
-; GCN-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
-; GCN-NEXT: s_endpgm
-;
-; VI-LABEL: bitcast_v15i64_to_v26i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s0, s[4:5], 0x24
-; VI-NEXT: s_mov_b32 s2, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s0, 0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s2
-; VI-NEXT: s_mov_b32 s14, s2
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s0, 0x50
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 64
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 48
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 32
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_add_u32 s4, s0, 16
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_addc_u32 s5, s1, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_mov_b32 s15, s2
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_add_u32 s2, s0, 0x70
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: s_add_u32 s0, s0, 0x60
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s12
-; VI-NEXT: v_mov_b32_e32 v1, s13
-; VI-NEXT: v_mov_b32_e32 v2, s14
-; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: bitcast_v15i64_to_v26i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x2c
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_mov_b32 s0, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[34:35] offset:112
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
-; GFX9-NEXT: s_endpgm
-;
-; GFX11-LABEL: bitcast_v15i64_to_v26i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x2c
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s0, 0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v22, 0 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT: v_dual_mov_b32 v2, s0 ::
v_dual_mov_b32 v5, s0 -; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s0 -; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v9, s0 -; GFX11-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v11, s0 -; GFX11-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v13, s0 -; GFX11-NEXT: v_dual_mov_b32 v12, s0 :: v_dual_mov_b32 v15, s0 -; GFX11-NEXT: v_dual_mov_b32 v14, s0 :: v_dual_mov_b32 v17, s0 -; GFX11-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v19, s0 -; GFX11-NEXT: v_mov_b32_e32 v18, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] offset:80 -; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:64 -; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v20, s0 -; GFX11-NEXT: v_dual_mov_b32 v21, s0 :: v_dual_mov_b32 v4, s0 -; GFX11-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s0 -; GFX11-NEXT: v_mov_b32_e32 v7, s0 -; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_store_b128 v22, v[8:11], s[4:5] offset:48 -; GFX11-NEXT: global_store_b128 v22, v[12:15], s[4:5] offset:32 -; GFX11-NEXT: global_store_b128 v22, v[16:19], s[4:5] offset:16 -; GFX11-NEXT: global_store_b128 v22, v[0:3], s[4:5] -; GFX11-NEXT: global_store_b64 v22, v[20:21], s[4:5] offset:112 -; GFX11-NEXT: global_store_b128 v22, v[4:7], s[4:5] offset:96 -; GFX11-NEXT: s_endpgm -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %phi_value = phi <15 x i64> [zeroinitializer, %entry], [%value, %if] - %cast = bitcast <15 x i64> %phi_value to <30 x i32> - %cmp1 = icmp eq i32 %cond, 1 - br i1 %cmp1, label %if, label %end - -end: - %phi_cast = phi <30 x i32> [zeroinitializer, %entry], [%cast, %if] - store <30 x i32> %phi_cast, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v2bf16_to_i32(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v2bf16_to_i32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB59_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16 -; GCN-NEXT: .LBB59_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v2bf16_to_i32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dword v[1:2], v4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v2bf16_to_i32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: 
global_store_dword v[1:2], v4, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v2bf16_to_i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b32 v[1:2], v4, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <2 x bfloat> %value to i32 - br label %end - -end: - %phi = phi i32 [0, %entry], [%cast, %if] - store i32 %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v2bf16_to_v2i16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v2bf16_to_v2i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB60_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16 -; GCN-NEXT: .LBB60_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v2bf16_to_v2i16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dword v[1:2], v4 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v2bf16_to_v2i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v2bf16_to_v2i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b32 v[1:2], v0, off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <2 x bfloat> %value to <2 x i16> - br label %end - -end: - %phi = phi <2 x i16> [zeroinitializer, %entry], [%cast, %if] - store <2 x i16> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v2bf16_to_v2f16(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) { -; 
-; GCN-LABEL: v_bitcast_v2bf16_to_v2f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB61_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GCN-NEXT: .LBB61_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_or_b32_e32 v0, v0, v3
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v2bf16_to_v2f16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dword v[1:2], v4
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v2bf16_to_v2f16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v2bf16_to_v2f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b32 v[1:2], v0, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <2 x bfloat> %value to <2 x half>
- br label %end
-
-end:
- %phi = phi <2 x half> [zeroinitializer, %entry], [%cast, %if]
- store <2 x half> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v2bf16_to_v4i8(i32 %cond, ptr addrspace(1) %out, <2 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v2bf16_to_v4i8:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v5, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB62_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_alignbit_b32 v5, v0, v3, 16
-; GCN-NEXT: .LBB62_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dword v5, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v2bf16_to_v4i8:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dword v[1:2], v4
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v2bf16_to_v4i8:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dword v[1:2], v4, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v2bf16_to_v4i8:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b32 v[1:2], v4, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <2 x bfloat> %value to <4 x i8>
- br label %end
-
-end:
- %phi = phi <4 x i8> [zeroinitializer, %entry], [%cast, %if]
- store <4 x i8> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v3bf16_to_v3i16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v3bf16_to_v3i16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, v6
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB63_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_alignbit_b32 v6, v4, v3, 16
-; GCN-NEXT: .LBB63_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 offset:4
-; GCN-NEXT: buffer_store_dword v6, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v3bf16_to_v3i16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_short v[3:4], v6
-; VI-NEXT: flat_store_dword v[1:2], v5
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v3bf16_to_v3i16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
-; GFX9-NEXT: global_store_dword v[1:2], v5, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v3bf16_to_v3i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
-; GFX11-NEXT: global_store_b32 v[1:2], v5, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <3 x bfloat> %value to <3 x i16>
- br label %end
-
-end:
- %phi = phi <3 x i16> [zeroinitializer, %entry], [%cast, %if]
- store <3 x i16> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v3bf16_to_v3f16(i32 %cond, ptr addrspace(1) %out, <3 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v3bf16_to_v3f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB64_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GCN-NEXT: .LBB64_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v6
-; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_or_b32_e32 v0, v0, v3
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v3bf16_to_v3f16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_short v[3:4], v6
-; VI-NEXT: flat_store_dword v[1:2], v5
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v3bf16_to_v3f16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
-; GFX9-NEXT: global_store_dword v[1:2], v5, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v3bf16_to_v3f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
-; GFX11-NEXT: global_store_b32 v[1:2], v5, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <3 x bfloat> %value to <3 x half>
- br label %end
-
-end:
- %phi = phi <3 x half> [zeroinitializer, %entry], [%cast, %if]
- store <3 x half> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_i32_to_v2bf16(i32 %cond, ptr addrspace(1) %out, i32 %value) {
-; GCN-LABEL: v_bitcast_i32_to_v2bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB65_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT: .LBB65_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_i32_to_v2bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dword v[1:2], v4
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_i32_to_v2bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_i32_to_v2bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b32 v[1:2], v0, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast i32 %value to <2 x bfloat>
- br label %end
-
-end:
- %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <2 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v2i16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x i16> %value) {
-; GCN-LABEL: v_bitcast_v2i16_to_v2bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB66_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GCN-NEXT: .LBB66_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v2i16_to_v2bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dword v[1:2], v4
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v2i16_to_v2bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v2i16_to_v2bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b32 v[1:2], v0, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <2 x i16> %value to <2 x bfloat>
- br label %end
-
-end:
- %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <2 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v2f16_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <2 x half> %value) {
-; GCN-LABEL: v_bitcast_v2f16_to_v2bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB67_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; GCN-NEXT: .LBB67_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v2f16_to_v2bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dword v[1:2], v4
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v2f16_to_v2bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v0, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v2f16_to_v2bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b32 v[1:2], v0, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <2 x half> %value to <2 x bfloat>
- br label %end
-
-end:
- %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <2 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4i8_to_v2bf16(i32 %cond, ptr addrspace(1) %out, <4 x i8> %value) {
-; GCN-LABEL: v_bitcast_v4i8_to_v2bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB68_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_or_b32_e32 v0, v3, v0
-; GCN-NEXT: v_or_b32_e32 v7, v5, v4
-; GCN-NEXT: .LBB68_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_alignbit_b32 v0, v3, v0, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4i8_to_v2bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v7, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dword v[1:2], v7
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4i8_to_v2bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dword v[1:2], v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4i8_to_v2bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_cbranch_execz .LBB68_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: .LBB68_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b32 v[1:2], v0, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x i8> %value to <2 x bfloat>
- br label %end
-
-end:
- %phi = phi <2 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <2 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v3i16_to_v3bf16(i32 %cond, ptr addrspace(1) %out, <3 x i16> %value) {
-; GCN-LABEL: v_bitcast_v3i16_to_v3bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB69_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5
-; GCN-NEXT: .LBB69_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 offset:4
-; GCN-NEXT: v_alignbit_b32 v0, v4, v0, 16
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v3i16_to_v3bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_short v[3:4], v6
-; VI-NEXT: flat_store_dword v[1:2], v5
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v3i16_to_v3bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_short v[1:2], v6, off offset:4
-; GFX9-NEXT: global_store_dword v[1:2], v5, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v3i16_to_v3bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v[1:2], v6, off offset:4
-; GFX11-NEXT: global_store_b32 v[1:2], v5, off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <3 x i16> %value to <3 x bfloat>
- br label %end
-
-end:
- %phi = phi <3 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <3 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4bf16_to_v4f16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v4bf16_to_v4f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v9, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB70_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GCN-NEXT: .LBB70_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_or_b32_e32 v3, v0, v3
-; GCN-NEXT: v_or_b32_e32 v4, v5, v4
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4bf16_to_v4f16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4bf16_to_v4f16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4bf16_to_v4f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x bfloat> %value to <4 x half>
- br label %end
-
-end:
- %phi = phi <4 x half> [zeroinitializer, %entry], [%cast, %if]
- store <4 x half> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4bf16_to_v4i16(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v4bf16_to_v4i16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v8, v7
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB71_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
-; GCN-NEXT: .LBB71_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4bf16_to_v4i16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4bf16_to_v4i16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4bf16_to_v4i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x bfloat> %value to <4 x i16>
- br label %end
-
-end:
- %phi = phi <4 x i16> [zeroinitializer, %entry], [%cast, %if]
- store <4 x i16> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4bf16_to_v2i32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v4bf16_to_v2i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v8, v7
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB72_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
-; GCN-NEXT: .LBB72_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4bf16_to_v2i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4bf16_to_v2i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4bf16_to_v2i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x bfloat> %value to <2 x i32>
- br label %end
-
-end:
- %phi = phi <2 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <2 x i32> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4bf16_to_v2f32(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v4bf16_to_v2f32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v8, v7
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB73_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
-; GCN-NEXT: .LBB73_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4bf16_to_v2f32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4bf16_to_v2f32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4bf16_to_v2f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x bfloat> %value to <2 x float>
- br label %end
-
-end:
- %phi = phi <2 x float> [zeroinitializer, %entry], [%cast, %if]
- store <2 x float> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4bf16_to_f64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v4bf16_to_f64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB74_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
-; GCN-NEXT: .LBB74_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4bf16_to_f64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_mov_b32_e32 v6, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4bf16_to_f64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4bf16_to_f64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x bfloat> %value to double
- br label %end
-
-end:
- %phi = phi double [0.0, %entry], [%cast, %if]
- store double %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4bf16_to_i64(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v4bf16_to_i64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB75_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
-; GCN-NEXT: .LBB75_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4bf16_to_i64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_mov_b32_e32 v6, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4bf16_to_i64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4bf16_to_i64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x bfloat> %value to i64
- br label %end
-
-end:
- %phi = phi i64 [0, %entry], [%cast, %if]
- store i64 %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4bf16_to_v8i8(i32 %cond, ptr addrspace(1) %out, <4 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v4bf16_to_v8i8:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v8, v7
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB76_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_alignbit_b32 v7, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v8, v4, v5, 16
-; GCN-NEXT: .LBB76_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[7:8], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4bf16_to_v8i8:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4bf16_to_v8i8:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4bf16_to_v8i8:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x bfloat> %value to <8 x i8>
- br label %end
-
-end:
- %phi = phi <8 x i8> [zeroinitializer, %entry], [%cast, %if]
- store <8 x i8> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_i64_to_v4bf16(i32 %cond, ptr addrspace(1) %out, i64 %value) {
-; GCN-LABEL: v_bitcast_i64_to_v4bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB77_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GCN-NEXT: .LBB77_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_i64_to_v4bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_i64_to_v4bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_i64_to_v4bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast i64 %value to <4 x bfloat>
- br label %end
-
-end:
- %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <4 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v2f32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x float> %value) {
-; GCN-LABEL: v_bitcast_v2f32_to_v4bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v6, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB78_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3
-; GCN-NEXT: .LBB78_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v2f32_to_v4bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v2f32_to_v4bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v5, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v6, v5
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v6, v4
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v2f32_to_v4bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <2 x float> %value to <4 x bfloat>
-
br label %end - -end: - %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <4 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v2i32_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <2 x i32> %value) { -; GCN-LABEL: v_bitcast_v2i32_to_v4bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB79_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GCN-NEXT: .LBB79_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v0, v5, 16 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v2i32_to_v4bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v2i32_to_v4bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v2i32_to_v4bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <2 x i32> %value to <4 x bfloat> - br label %end - -end: - %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <4 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v4i16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x 
i16> %value) { -; GCN-LABEL: v_bitcast_v4i16_to_v4bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB80_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GCN-NEXT: .LBB80_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v4i16_to_v4bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v4i16_to_v4bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v4i16_to_v4bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <4 x i16> %value to <4 x bfloat> - br label %end - -end: - %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <4 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v4f16_to_v4bf16(i32 %cond, ptr addrspace(1) %out, <4 x half> %value) { -; GCN-LABEL: v_bitcast_v4f16_to_v4bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v7, 0 -; 
GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB81_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; GCN-NEXT: .LBB81_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v3, v3, v0, 16 -; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v4f16_to_v4bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v6, v5 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v6, v4 -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx2 v[1:2], v[5:6] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v4f16_to_v4bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx2 v[1:2], v[5:6], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v4f16_to_v4bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b64 v[1:2], v[5:6], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <4 x half> %value to <4 x bfloat> - br label %end - -end: - %phi = phi <4 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <4 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v6bf16_to_v6i16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v6bf16_to_v6i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v10, v9 -; GCN-NEXT: v_mov_b32_e32 v0, v9 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB82_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v0, v6, v7, 16 -; GCN-NEXT: .LBB82_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v6bf16_to_v6i16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v6, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v9, v6 -; VI-NEXT: v_mov_b32_e32 v8, v5 -; VI-NEXT: v_mov_b32_e32 v7, v4 -; VI-NEXT: v_mov_b32_e32 v6, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v6bf16_to_v6i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v6bf16_to_v6i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <6 x bfloat> %value to <6 x i16> - br label %end - -end: - %phi = phi <6 x i16> [zeroinitializer, %entry], [%cast, %if] - store <6 x i16> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v6bf16_to_v6f16(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v6bf16_to_v6f16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB83_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v7 -; GCN-NEXT: .LBB83_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v0, v7, v0 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v6bf16_to_v6f16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v6, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v9, v6 -; VI-NEXT: v_mov_b32_e32 v8, v5 -; VI-NEXT: v_mov_b32_e32 v7, v4 -; VI-NEXT: v_mov_b32_e32 v6, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v6bf16_to_v6f16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v6bf16_to_v6f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; 
GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <6 x bfloat> %value to <6 x half> - br label %end - -end: - %phi = phi <6 x half> [zeroinitializer, %entry], [%cast, %if] - store <6 x half> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v6bf16_to_v12i8(i32 %cond, ptr addrspace(1) %out, <6 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v6bf16_to_v12i8: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v10, v9 -; GCN-NEXT: v_mov_b32_e32 v0, v9 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB84_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_alignbit_b32 v9, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v10, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v0, v6, v7, 16 -; GCN-NEXT: .LBB84_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: buffer_store_dwordx2 v[9:10], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v6bf16_to_v12i8: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v6, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v8, v5 -; VI-NEXT: v_mov_b32_e32 v7, v4 -; VI-NEXT: v_mov_b32_e32 v6, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v6bf16_to_v12i8: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v6bf16_to_v12i8: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; 
GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <6 x bfloat> %value to <12 x i8> - br label %end - -end: - %phi = phi <12 x i8> [zeroinitializer, %entry], [%cast, %if] - store <12 x i8> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v6f16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x half> %value) { -; GCN-LABEL: v_bitcast_v6f16_to_v6bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB85_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GCN-NEXT: .LBB85_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v6f16_to_v6bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v6, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v9, v6 -; VI-NEXT: v_mov_b32_e32 v8, v5 -; VI-NEXT: v_mov_b32_e32 v7, v4 -; VI-NEXT: v_mov_b32_e32 v6, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v6f16_to_v6bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; 
GFX9-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v6f16_to_v6bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <6 x half> %value to <6 x bfloat> - br label %end - -end: - %phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <6 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v6i16_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <6 x i16> %value) { -; GCN-LABEL: v_bitcast_v6i16_to_v6bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB86_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GCN-NEXT: .LBB86_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v6i16_to_v6bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v6, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_mov_b32_e32 v8, v6 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v9, v6 -; VI-NEXT: v_mov_b32_e32 v8, v5 -; VI-NEXT: v_mov_b32_e32 v7, v4 -; VI-NEXT: v_mov_b32_e32 v6, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, 
s[4:5] -; VI-NEXT: flat_store_dwordx3 v[1:2], v[6:8] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v6i16_to_v6bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v8, v5 -; GFX9-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx3 v[1:2], v[6:8], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v6i16_to_v6bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_mov_b32_e32 v9, v6 -; GFX11-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v7, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b96 v[1:2], v[6:8], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <6 x i16> %value to <6 x bfloat> - br label %end - -end: - %phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <6 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v12i8_to_v6bf16(i32 %cond, ptr addrspace(1) %out, <12 x i8> %value) { -; GCN-LABEL: v_bitcast_v12i8_to_v6bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB87_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8 -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v6, v6, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GCN-NEXT: v_or_b32_e32 v16, v3, v0 -; GCN-NEXT: v_or_b32_e32 v18, v5, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GCN-NEXT: v_or_b32_e32 v19, v9, v7 -; GCN-NEXT: v_or_b32_e32 v15, v11, v8 -; GCN-NEXT: v_or_b32_e32 v0, v13, v10 -; GCN-NEXT: .LBB87_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v19 -; 
GCN-NEXT: v_mul_f32_e32 v6, 1.0, v17 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v0, v0, v7, 16 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v12i8_to_v6bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v15, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v16, v15 -; VI-NEXT: v_mov_b32_e32 v17, v15 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB87_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10 -; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v16, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14 -; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: .LBB87_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx3 v[1:2], v[15:17] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v12i8_to_v6bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v15, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v16, v15 -; GFX9-NEXT: v_mov_b32_e32 v17, v15 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB87_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4 -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10 -; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14 -; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v17, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: .LBB87_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx3 v[1:2], v[15:17], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v12i8_to_v6bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v15, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v16, v15 -; GFX11-NEXT: v_mov_b32_e32 v17, v15 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB87_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v3, v6, v3 -; GFX11-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v6, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_or_b32_e32 v15, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v16, v3, v5 -; GFX11-NEXT: v_or_b32_e32 v17, v6, v7 -; GFX11-NEXT: .LBB87_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b96 v[1:2], v[15:17], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <12 x i8> %value to <6 x bfloat> - br label %end - -end: - %phi = phi <6 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <6 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v8bf16_to_v2f64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v8bf16_to_v2f64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v12, v11 -; GCN-NEXT: v_mov_b32_e32 v13, v11 -; GCN-NEXT: v_mov_b32_e32 v14, v11 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB88_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: 
v_mul_f32_e32 v8, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16 -; GCN-NEXT: .LBB88_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v8bf16_to_v2f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v8, v7 -; VI-NEXT: v_mov_b32_e32 v9, v7 -; VI-NEXT: v_mov_b32_e32 v10, v7 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v10, v6 -; VI-NEXT: v_mov_b32_e32 v9, v5 -; VI-NEXT: v_mov_b32_e32 v8, v4 -; VI-NEXT: v_mov_b32_e32 v7, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v8bf16_to_v2f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v8bf16_to_v2f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <8 x bfloat> %value to <2 x double> - br label %end - -end: - %phi = phi <2 x double> [zeroinitializer, %entry], [%cast, %if] - store <2 x double> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v8bf16_to_v2i64(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v8bf16_to_v2i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v12, v11 -; GCN-NEXT: v_mov_b32_e32 v13, v11 -; GCN-NEXT: v_mov_b32_e32 v14, v11 
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB89_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16 -; GCN-NEXT: .LBB89_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v8bf16_to_v2i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v8, v7 -; VI-NEXT: v_mov_b32_e32 v9, v7 -; VI-NEXT: v_mov_b32_e32 v10, v7 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v10, v6 -; VI-NEXT: v_mov_b32_e32 v9, v5 -; VI-NEXT: v_mov_b32_e32 v8, v4 -; VI-NEXT: v_mov_b32_e32 v7, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v8bf16_to_v2i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v8bf16_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v8, v7 -; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5 -; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <8 x bfloat> %value to <2 x i64> - br label %end - -end: - %phi = phi <2 x i64> [zeroinitializer, %entry], [%cast, %if] - store <2 x i64> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v8bf16_to_v4f32(i32 %cond, ptr addrspace(1) %out, <8 x 
bfloat> %value) {
-; GCN-LABEL: v_bitcast_v8bf16_to_v4f32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v12, v11
-; GCN-NEXT: v_mov_b32_e32 v13, v11
-; GCN-NEXT: v_mov_b32_e32 v14, v11
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB90_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
-; GCN-NEXT: .LBB90_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8bf16_to_v4f32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8bf16_to_v4f32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8bf16_to_v4f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <8 x bfloat> %value to <4 x float>
- br label %end
-
-end:
- %phi = phi <4 x float> [zeroinitializer, %entry], [%cast, %if]
- store <4 x float> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v8bf16_to_v4i32(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v8bf16_to_v4i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v12, v11
-; GCN-NEXT: v_mov_b32_e32 v13, v11
-; GCN-NEXT: v_mov_b32_e32 v14, v11
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB91_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
-; GCN-NEXT: .LBB91_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8bf16_to_v4i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8bf16_to_v4i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8bf16_to_v4i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <8 x bfloat> %value to <4 x i32>
- br label %end
-
-end:
- %phi = phi <4 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <4 x i32> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v8bf16_to_v8f16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v8bf16_to_v8f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v16, 0
-; GCN-NEXT: v_mov_b32_e32 v13, 0
-; GCN-NEXT: v_mov_b32_e32 v17, 0
-; GCN-NEXT: v_mov_b32_e32 v14, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB92_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v9
-; GCN-NEXT: .LBB92_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_or_b32_e32 v3, v4, v3
-; GCN-NEXT: v_or_b32_e32 v4, v6, v5
-; GCN-NEXT: v_or_b32_e32 v5, v8, v7
-; GCN-NEXT: v_or_b32_e32 v6, v9, v0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8bf16_to_v8f16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8bf16_to_v8f16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8bf16_to_v8f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <8 x bfloat> %value to <8 x half>
- br label %end
-
-end:
- %phi = phi <8 x half> [zeroinitializer, %entry], [%cast, %if]
- store <8 x half> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v8bf16_to_v8i16(i32 %cond, ptr addrspace(1) %out, <8 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v8bf16_to_v8i16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v12, v11
-; GCN-NEXT: v_mov_b32_e32 v13, v11
-; GCN-NEXT: v_mov_b32_e32 v14, v11
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB93_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_alignbit_b32 v11, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v12, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v13, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v14, v8, v9, 16
-; GCN-NEXT: .LBB93_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8bf16_to_v8i16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8bf16_to_v8i16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8bf16_to_v8i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <8 x bfloat> %value to <8 x i16>
- br label %end
-
-end:
- %phi = phi <8 x i16> [zeroinitializer, %entry], [%cast, %if]
- store <8 x i16> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v8f16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x half> %value) {
-; GCN-LABEL: v_bitcast_v8f16_to_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v16, 0
-; GCN-NEXT: v_mov_b32_e32 v13, 0
-; GCN-NEXT: v_mov_b32_e32 v17, 0
-; GCN-NEXT: v_mov_b32_e32 v14, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB94_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v9
-; GCN-NEXT: .LBB94_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8f16_to_v8bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8f16_to_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8f16_to_v8bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <8 x half> %value to <8 x bfloat>
- br label %end
-
-end:
- %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <8 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v8i16_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <8 x i16> %value) {
-; GCN-LABEL: v_bitcast_v8i16_to_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_mov_b32_e32 v15, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v16, 0
-; GCN-NEXT: v_mov_b32_e32 v13, 0
-; GCN-NEXT: v_mov_b32_e32 v17, 0
-; GCN-NEXT: v_mov_b32_e32 v14, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB95_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v10
-; GCN-NEXT: .LBB95_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8i16_to_v8bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8i16_to_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8i16_to_v8bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <8 x i16> %value to <8 x bfloat>
- br label %end
-
-end:
- %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <8 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v16i8_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <16 x i8> %value) {
-; GCN-LABEL: v_bitcast_v16i8_to_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NEXT: v_mov_b32_e32 v23, 0
-; GCN-NEXT: v_mov_b32_e32 v20, 0
-; GCN-NEXT: v_mov_b32_e32 v24, 0
-; GCN-NEXT: v_mov_b32_e32 v21, 0
-; GCN-NEXT: v_mov_b32_e32 v25, 0
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB96_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
-; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
-; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
-; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
-; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16
-; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_or_b32_e32 v6, v6, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GCN-NEXT: v_or_b32_e32 v12, v14, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GCN-NEXT: v_or_b32_e32 v19, v3, v0
-; GCN-NEXT: v_or_b32_e32 v23, v5, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v6
-; GCN-NEXT: v_or_b32_e32 v24, v9, v7
-; GCN-NEXT: v_or_b32_e32 v21, v11, v8
-; GCN-NEXT: v_or_b32_e32 v25, v13, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v12
-; GCN-NEXT: v_or_b32_e32 v0, v17, v14
-; GCN-NEXT: .LBB96_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v0, v9, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v16i8_to_v8bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v20, v19
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: v_mov_b32_e32 v22, v19
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB96_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
-; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
-; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18
-; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: .LBB96_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v16i8_to_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v19
-; GFX9-NEXT: v_mov_b32_e32 v21, v19
-; GFX9-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB96_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v19, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
-; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v20, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
-; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v21, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18
-; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v22, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: .LBB96_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v16i8_to_v8bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v19, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v20, v19
-; GFX11-NEXT: v_mov_b32_e32 v21, v19
-; GFX11-NEXT: v_mov_b32_e32 v22, v19
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB96_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_lshlrev_b16 v8, 8, v12
-; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
-; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v9
-; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-NEXT: v_lshlrev_b16 v6, 8, v10
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v11
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v13
-; GFX11-NEXT: v_lshlrev_b16 v10, 8, v14
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v15
-; GFX11-NEXT: v_lshlrev_b16 v12, 8, v16
-; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v17
-; GFX11-NEXT: v_lshlrev_b16 v14, 8, v18
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT: v_or_b32_e32 v6, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v7, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v8, v11, v12
-; GFX11-NEXT: v_or_b32_e32 v9, v13, v14
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GFX11-NEXT: v_or_b32_e32 v19, v0, v3
-; GFX11-NEXT: v_or_b32_e32 v20, v4, v5
-; GFX11-NEXT: v_or_b32_e32 v21, v6, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT: v_or_b32_e32 v22, v8, v9
-; GFX11-NEXT: .LBB96_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <16 x i8> %value to <8 x bfloat>
- br label %end
-
-end:
- %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <8 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v2i64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x i64> %value) {
-; GCN-LABEL: v_bitcast_v2i64_to_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v13, 0
-; GCN-NEXT: v_mov_b32_e32 v10, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v9, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB97_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GCN-NEXT: .LBB97_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v2i64_to_v8bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v2i64_to_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v2i64_to_v8bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <2 x i64> %value to <8 x bfloat>
- br label %end
-
-end:
- %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <8 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v2f64_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <2 x double> %value) {
-; GCN-LABEL: v_bitcast_v2f64_to_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v13, 0
-; GCN-NEXT: v_mov_b32_e32 v10, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v9, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB98_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GCN-NEXT: .LBB98_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v2f64_to_v8bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v2f64_to_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v2f64_to_v8bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <2 x double> %value to <8 x bfloat>
- br label %end
-
-end:
- %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <8 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4i32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x i32> %value) {
-; GCN-LABEL: v_bitcast_v4i32_to_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v13, 0
-; GCN-NEXT: v_mov_b32_e32 v10, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v9, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB99_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GCN-NEXT: .LBB99_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4i32_to_v8bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4i32_to_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4i32_to_v8bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x i32> %value to <8 x bfloat>
- br label %end
-
-end:
- %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <8 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v4f32_to_v8bf16(i32 %cond, ptr addrspace(1) %out, <4 x float> %value) {
-; GCN-LABEL: v_bitcast_v4f32_to_v8bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v12, 0
-; GCN-NEXT: v_mov_b32_e32 v13, 0
-; GCN-NEXT: v_mov_b32_e32 v10, 0
-; GCN-NEXT: v_mov_b32_e32 v11, 0
-; GCN-NEXT: v_mov_b32_e32 v8, 0
-; GCN-NEXT: v_mov_b32_e32 v9, 0
-; GCN-NEXT: v_mov_b32_e32 v7, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB100_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v6
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v4
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GCN-NEXT: .LBB100_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v9, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v0, v7, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v4f32_to_v8bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: v_mov_b32_e32 v9, v7
-; VI-NEXT: v_mov_b32_e32 v10, v7
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v10, v6
-; VI-NEXT: v_mov_b32_e32 v9, v5
-; VI-NEXT: v_mov_b32_e32 v8, v4
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[7:10]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v4f32_to_v8bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v7, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v8, v7
-; GFX9-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-NEXT: v_mov_b32_e32 v10, v7
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v10, v6
-; GFX9-NEXT: v_mov_b32_e32 v9, v5
-; GFX9-NEXT: v_mov_b32_e32 v8, v4
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v4f32_to_v8bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v9, v7
-; GFX11-NEXT: v_mov_b32_e32 v10, v7
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v9, v5
-; GFX11-NEXT: v_dual_mov_b32 v8, v4 :: v_dual_mov_b32 v7, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b128 v[1:2], v[7:10], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x float> %value to <8 x bfloat>
- br label %end
-
-end:
- %phi = phi <8 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <8 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v16bf16_to_v16i16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v16bf16_to_v16i16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v20, v19
-; GCN-NEXT: v_mov_b32_e32 v21, v19
-; GCN-NEXT: v_mov_b32_e32 v22, v19
-; GCN-NEXT: v_mov_b32_e32 v23, v19
-; GCN-NEXT: v_mov_b32_e32 v24, v19
-; GCN-NEXT: v_mov_b32_e32 v25, v19
-; GCN-NEXT: v_mov_b32_e32 v26, v19
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB101_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
-; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
-; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
-; GCN-NEXT: .LBB101_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v16bf16_to_v16i16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v11, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v12, v11
-; VI-NEXT: v_mov_b32_e32 v13, v11
-; VI-NEXT: v_mov_b32_e32 v14, v11
-; VI-NEXT: v_mov_b32_e32 v15, v11
-; VI-NEXT: v_mov_b32_e32 v16, v11
-; VI-NEXT: v_mov_b32_e32 v17, v11
-; VI-NEXT: v_mov_b32_e32 v18, v11
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v18, v10
-; VI-NEXT: v_mov_b32_e32 v17, v9
-; VI-NEXT: v_mov_b32_e32 v16, v8
-; VI-NEXT: v_mov_b32_e32 v15, v7
-; VI-NEXT: v_mov_b32_e32 v14, v6
-; VI-NEXT: v_mov_b32_e32 v13, v5
-; VI-NEXT: v_mov_b32_e32 v12, v4
-; VI-NEXT: v_mov_b32_e32 v11, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v16bf16_to_v16i16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v11
-; GFX9-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-NEXT: v_mov_b32_e32 v14, v11
-; GFX9-NEXT: v_mov_b32_e32 v15, v11
-; GFX9-NEXT: v_mov_b32_e32 v16, v11
-; GFX9-NEXT: v_mov_b32_e32 v17, v11
-; GFX9-NEXT: v_mov_b32_e32 v18, v11
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v18, v10
-; GFX9-NEXT: v_mov_b32_e32 v17, v9
-; GFX9-NEXT: v_mov_b32_e32 v16, v8
-; GFX9-NEXT: v_mov_b32_e32 v15, v7
-; GFX9-NEXT: v_mov_b32_e32 v14, v6
-; GFX9-NEXT: v_mov_b32_e32 v13, v5
-; GFX9-NEXT: v_mov_b32_e32 v12, v4
-; GFX9-NEXT: v_mov_b32_e32 v11, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v16bf16_to_v16i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v11, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v12, v11
-; GFX11-NEXT: v_mov_b32_e32 v13, v11
-; GFX11-NEXT: v_mov_b32_e32 v14, v11
-; GFX11-NEXT: v_mov_b32_e32 v15, v11
-; GFX11-NEXT: v_mov_b32_e32 v16, v11
-; GFX11-NEXT: v_mov_b32_e32 v17, v11
-; GFX11-NEXT: v_mov_b32_e32 v18, v11
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
-; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
-; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
-; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <16 x bfloat> %value to <16 x i16>
- br label %end
-
-end:
- %phi = phi <16 x i16> [zeroinitializer, %entry], [%cast, %if]
- store <16 x i16> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v16bf16_to_v16f16(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v16bf16_to_v16f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v26, 0
-; GCN-NEXT: v_mov_b32_e32 v30, 0
-; GCN-NEXT: v_mov_b32_e32 v27, 0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: v_mov_b32_e32 v28, 0
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: v_mov_b32_e32 v29, 0
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NEXT: v_mov_b32_e32 v23, 0
-; GCN-NEXT: v_mov_b32_e32 v20, 0
-; GCN-NEXT: v_mov_b32_e32 v24, 0
-; GCN-NEXT: v_mov_b32_e32 v21, 0
-; GCN-NEXT: v_mov_b32_e32 v25, 0
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB102_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v33, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v17
-; GCN-NEXT: .LBB102_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v31
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v32
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v28
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v33
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v29
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_or_b32_e32 v3, v4, v3
-; GCN-NEXT: v_or_b32_e32 v4, v6, v5
-; GCN-NEXT: v_or_b32_e32 v5, v8, v7
-; GCN-NEXT: v_or_b32_e32 v6, v10, v9
-; GCN-NEXT: v_or_b32_e32 v7, v12, v11
-; GCN-NEXT: v_or_b32_e32 v8, v14, v13
-; GCN-NEXT: v_or_b32_e32 v9, v16, v15
-; GCN-NEXT: v_or_b32_e32 v10, v17, v0
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v16bf16_to_v16f16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v11, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v12, v11
-; VI-NEXT: v_mov_b32_e32 v13, v11
-; VI-NEXT: v_mov_b32_e32 v14, v11
-; VI-NEXT: v_mov_b32_e32 v15, v11
-; VI-NEXT: v_mov_b32_e32 v16, v11
-; VI-NEXT: v_mov_b32_e32 v17, v11
-; VI-NEXT: v_mov_b32_e32 v18, v11
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v18, v10
-; VI-NEXT: v_mov_b32_e32 v17, v9
-; VI-NEXT: v_mov_b32_e32 v16, v8
-; VI-NEXT: v_mov_b32_e32 v15, v7
-; VI-NEXT: v_mov_b32_e32 v14, v6
-; VI-NEXT: v_mov_b32_e32 v13, v5
-; VI-NEXT: v_mov_b32_e32 v12, v4
-; VI-NEXT: v_mov_b32_e32 v11, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v16bf16_to_v16f16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v11
-; GFX9-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-NEXT: v_mov_b32_e32 v14, v11
-; GFX9-NEXT: v_mov_b32_e32 v15, v11
-; GFX9-NEXT: v_mov_b32_e32 v16, v11
-; GFX9-NEXT: v_mov_b32_e32 v17, v11
-; GFX9-NEXT: v_mov_b32_e32 v18, v11
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v18, v10
-; GFX9-NEXT: v_mov_b32_e32 v17, v9
-; GFX9-NEXT: v_mov_b32_e32 v16, v8
-; GFX9-NEXT: v_mov_b32_e32 v15, v7
-; GFX9-NEXT: v_mov_b32_e32 v14, v6
-; GFX9-NEXT: v_mov_b32_e32 v13, v5
-; GFX9-NEXT: v_mov_b32_e32 v12, v4
-; GFX9-NEXT: v_mov_b32_e32 v11, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v16bf16_to_v16f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v11, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v12, v11
-; GFX11-NEXT: v_mov_b32_e32 v13, v11
-; GFX11-NEXT: v_mov_b32_e32 v14, v11
-; GFX11-NEXT: v_mov_b32_e32 v15, v11
-; GFX11-NEXT: v_mov_b32_e32 v16, v11
-; GFX11-NEXT: v_mov_b32_e32 v17, v11
-; GFX11-NEXT: v_mov_b32_e32 v18, v11
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
-; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
-; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
-; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <16 x bfloat> %value to <16 x half>
- br label %end
-
-end:
- %phi = phi <16 x half> [zeroinitializer, %entry], [%cast, %if]
- store <16 x half> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v16bf16_to_v8i32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v16bf16_to_v8i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v20, v19
-; GCN-NEXT: v_mov_b32_e32 v21, v19
-; GCN-NEXT: v_mov_b32_e32 v22, v19
-; GCN-NEXT: v_mov_b32_e32 v23, v19
-; GCN-NEXT: v_mov_b32_e32 v24, v19
-; GCN-NEXT: v_mov_b32_e32 v25, v19
-; GCN-NEXT: v_mov_b32_e32 v26, v19
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB103_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16
-; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16
-; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16
-; GCN-NEXT: .LBB103_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v16bf16_to_v8i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v11, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v12, v11
-; VI-NEXT: v_mov_b32_e32 v13, v11
-; VI-NEXT: v_mov_b32_e32 v14, v11
-; VI-NEXT: v_mov_b32_e32 v15, v11
-; VI-NEXT: v_mov_b32_e32 v16, v11
-; VI-NEXT: v_mov_b32_e32 v17, v11
-; VI-NEXT: v_mov_b32_e32 v18, v11
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v18, v10
-; VI-NEXT: v_mov_b32_e32 v17, v9
-; VI-NEXT: v_mov_b32_e32 v16, v8
-; VI-NEXT: v_mov_b32_e32 v15, v7
-; VI-NEXT: v_mov_b32_e32 v14, v6
-; VI-NEXT: v_mov_b32_e32 v13, v5
-; VI-NEXT: v_mov_b32_e32 v12, v4
-; VI-NEXT: v_mov_b32_e32 v11, v3
-; VI-NEXT: ; %bb.2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v16bf16_to_v8i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v11, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v12, v11
-; GFX9-NEXT: v_mov_b32_e32 v13, v11
-; GFX9-NEXT: v_mov_b32_e32 v14, v11
-; GFX9-NEXT: v_mov_b32_e32 v15, v11
-; GFX9-NEXT: v_mov_b32_e32 v16, v11
-; GFX9-NEXT: v_mov_b32_e32 v17, v11
-; GFX9-NEXT: v_mov_b32_e32 v18, v11
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v18, v10
-; GFX9-NEXT: v_mov_b32_e32 v17, v9
-; GFX9-NEXT: v_mov_b32_e32 v16, v8
-; GFX9-NEXT: v_mov_b32_e32 v15, v7
-; GFX9-NEXT: v_mov_b32_e32 v14, v6
-; GFX9-NEXT: v_mov_b32_e32 v13, v5
-; GFX9-NEXT: v_mov_b32_e32 v12, v4
-; GFX9-NEXT: v_mov_b32_e32 v11, v3
-; GFX9-NEXT: ; %bb.2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v16bf16_to_v8i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v11, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v12, v11
-; GFX11-NEXT: v_mov_b32_e32 v13, v11
-; GFX11-NEXT: v_mov_b32_e32 v14, v11
-; GFX11-NEXT: v_mov_b32_e32 v15, v11
-; GFX11-NEXT: v_mov_b32_e32 v16, v11
-; GFX11-NEXT: v_mov_b32_e32 v17, v11
-; GFX11-NEXT: v_mov_b32_e32 v18, v11
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
-; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
-; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
-; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <16 x bfloat> %value to <8 x i32>
- br label %end
-
-end:
- %phi = phi <8 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <8 x i32> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v16bf16_to_v8f32(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v16bf16_to_v8f32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v19, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v20, v19
-; GCN-NEXT: v_mov_b32_e32 v21, v19
-; GCN-NEXT: v_mov_b32_e32 v22, v19
-; GCN-NEXT: v_mov_b32_e32 v23, v19
-; GCN-NEXT: v_mov_b32_e32 v24, v19
-; GCN-NEXT: v_mov_b32_e32 v25, v19
-; GCN-NEXT: v_mov_b32_e32 v26, v19
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB104_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16,
v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 -; GCN-NEXT: .LBB104_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v16bf16_to_v8f32: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v16bf16_to_v8f32: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v16bf16_to_v8f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: 
v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 -; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 -; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 -; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <16 x bfloat> %value to <8 x float> - br label %end - -end: - %phi = phi <8 x float> [zeroinitializer, %entry], [%cast, %if] - store <8 x float> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v16bf16_to_v4f64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v16bf16_to_v4f64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v20, v19 -; GCN-NEXT: v_mov_b32_e32 v21, v19 -; GCN-NEXT: v_mov_b32_e32 v22, v19 -; GCN-NEXT: v_mov_b32_e32 v23, v19 -; GCN-NEXT: v_mov_b32_e32 v24, v19 -; GCN-NEXT: v_mov_b32_e32 v25, v19 -; GCN-NEXT: v_mov_b32_e32 v26, v19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB105_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 -; GCN-NEXT: .LBB105_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: 
v_bitcast_v16bf16_to_v4f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v16bf16_to_v4f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v16bf16_to_v4f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 -; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 -; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 -; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <16 x bfloat> %value to <4 x double> - br label %end - -end: - %phi = phi <4 x double> [zeroinitializer, %entry], [%cast, %if] - store <4 x double> %phi, ptr addrspace(1) %out - 
ret void -} - - -define void @v_bitcast_v16bf16_to_v4i64(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v16bf16_to_v4i64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v20, v19 -; GCN-NEXT: v_mov_b32_e32 v21, v19 -; GCN-NEXT: v_mov_b32_e32 v22, v19 -; GCN-NEXT: v_mov_b32_e32 v23, v19 -; GCN-NEXT: v_mov_b32_e32 v24, v19 -; GCN-NEXT: v_mov_b32_e32 v25, v19 -; GCN-NEXT: v_mov_b32_e32 v26, v19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB106_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 -; GCN-NEXT: .LBB106_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v16bf16_to_v4i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: 
v_bitcast_v16bf16_to_v4i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v16bf16_to_v4i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 -; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 -; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 -; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <16 x bfloat> %value to <4 x i64> - br label %end - -end: - %phi = phi <4 x i64> [zeroinitializer, %entry], [%cast, %if] - store <4 x i64> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v16bf16_to_v32i8(i32 %cond, ptr addrspace(1) %out, <16 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v16bf16_to_v32i8: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v20, v19 -; GCN-NEXT: v_mov_b32_e32 v21, v19 -; GCN-NEXT: v_mov_b32_e32 v22, v19 -; GCN-NEXT: v_mov_b32_e32 v23, v19 -; GCN-NEXT: v_mov_b32_e32 v24, v19 -; GCN-NEXT: v_mov_b32_e32 v25, v19 -; GCN-NEXT: v_mov_b32_e32 v26, v19 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB107_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: 
v_mul_f32_e32 v12, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_alignbit_b32 v19, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v20, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v21, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v22, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v23, v10, v11, 16 -; GCN-NEXT: v_alignbit_b32 v24, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v25, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v26, v16, v17, 16 -; GCN-NEXT: .LBB107_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[23:26], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v16bf16_to_v32i8: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v16bf16_to_v32i8: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: 
v_bitcast_v16bf16_to_v32i8: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 -; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 -; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 -; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <16 x bfloat> %value to <32 x i8> - br label %end - -end: - %phi = phi <32 x i8> [zeroinitializer, %entry], [%cast, %if] - store <32 x i8> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v8f32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x float> %value) { -; GCN-LABEL: v_bitcast_v8f32_to_v16bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v14, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB108_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GCN-NEXT: .LBB108_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v17, 
1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v8f32_to_v16bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v8f32_to_v16bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v8f32_to_v16bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: 
v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 -; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 -; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 -; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <8 x float> %value to <16 x bfloat> - br label %end - -end: - %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <16 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v8i32_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <8 x i32> %value) { -; GCN-LABEL: v_bitcast_v8i32_to_v16bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v14, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB109_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GCN-NEXT: .LBB109_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 
v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v8i32_to_v16bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v8i32_to_v16bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v8i32_to_v16bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: 
v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 -; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 -; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 -; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <8 x i32> %value to <16 x bfloat> - br label %end - -end: - %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <16 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v4i64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x i64> %value) { -; GCN-LABEL: v_bitcast_v4i64_to_v16bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v14, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB110_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GCN-NEXT: .LBB110_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 
v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v4i64_to_v16bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v4i64_to_v16bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v4i64_to_v16bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 
v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9 -; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7 -; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5 -; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3 -; GFX11-NEXT: ; %bb.2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <4 x i64> %value to <16 x bfloat> - br label %end - -end: - %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <16 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v4f64_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <4 x double> %value) { -; GCN-LABEL: v_bitcast_v4f64_to_v16bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v16, 0 -; GCN-NEXT: v_mov_b32_e32 v17, 0 -; GCN-NEXT: v_mov_b32_e32 v14, 0 -; GCN-NEXT: v_mov_b32_e32 v15, 0 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB111_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v6 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v5 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v4 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GCN-NEXT: .LBB111_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v18 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 
v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v17, v16, 16 -; GCN-NEXT: v_alignbit_b32 v8, v15, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; GCN-NEXT: v_alignbit_b32 v10, v0, v11, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v4f64_to_v16bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v11, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v12, v11 -; VI-NEXT: v_mov_b32_e32 v13, v11 -; VI-NEXT: v_mov_b32_e32 v14, v11 -; VI-NEXT: v_mov_b32_e32 v15, v11 -; VI-NEXT: v_mov_b32_e32 v16, v11 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v18, v11 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v18, v10 -; VI-NEXT: v_mov_b32_e32 v17, v9 -; VI-NEXT: v_mov_b32_e32 v16, v8 -; VI-NEXT: v_mov_b32_e32 v15, v7 -; VI-NEXT: v_mov_b32_e32 v14, v6 -; VI-NEXT: v_mov_b32_e32 v13, v5 -; VI-NEXT: v_mov_b32_e32 v12, v4 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: ; %bb.2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[15:18] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[11:14] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v4f64_to_v16bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v11, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-NEXT: v_mov_b32_e32 v14, v11 -; GFX9-NEXT: v_mov_b32_e32 v15, v11 -; GFX9-NEXT: v_mov_b32_e32 v16, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v11 -; GFX9-NEXT: v_mov_b32_e32 v18, v11 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v18, v10 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v16, v8 -; GFX9-NEXT: v_mov_b32_e32 v15, v7 -; GFX9-NEXT: v_mov_b32_e32 v14, v6 -; GFX9-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: ; %bb.2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v4f64_to_v16bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v11, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v13, v11 -; GFX11-NEXT: v_mov_b32_e32 v14, v11 -; GFX11-NEXT: v_mov_b32_e32 v15, v11 -; GFX11-NEXT: v_mov_b32_e32 v16, v11 -; GFX11-NEXT: v_mov_b32_e32 v17, v11 -; GFX11-NEXT: v_mov_b32_e32 v18, v11 -; 
GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v18, v10 :: v_dual_mov_b32 v17, v9
-; GFX11-NEXT: v_dual_mov_b32 v16, v8 :: v_dual_mov_b32 v15, v7
-; GFX11-NEXT: v_dual_mov_b32 v14, v6 :: v_dual_mov_b32 v13, v5
-; GFX11-NEXT: v_dual_mov_b32 v12, v4 :: v_dual_mov_b32 v11, v3
-; GFX11-NEXT: ; %bb.2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v[1:2], v[15:18], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[11:14], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <4 x double> %value to <16 x bfloat>
- br label %end
-
-end:
- %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <16 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v32i8_to_v16bf16(i32 %cond, ptr addrspace(1) %out, <32 x i8> %value) {
-; GCN-LABEL: v_bitcast_v32i8_to_v16bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v50, 0
-; GCN-NEXT: v_mov_b32_e32 v54, 0
-; GCN-NEXT: v_mov_b32_e32 v51, 0
-; GCN-NEXT: v_mov_b32_e32 v55, 0
-; GCN-NEXT: v_mov_b32_e32 v52, 0
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: v_mov_b32_e32 v53, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v41, 0
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: v_mov_b32_e32 v39, 0
-; GCN-NEXT: v_mov_b32_e32 v34, 0
-; GCN-NEXT: v_mov_b32_e32 v48, 0
-; GCN-NEXT: v_mov_b32_e32 v35, 0
-; GCN-NEXT: v_mov_b32_e32 v49, 0
-; GCN-NEXT: v_mov_b32_e32 v36, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB112_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_and_b32_e32 v0, 0xff, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 24, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v6
-; GCN-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8
-; GCN-NEXT: v_and_b32_e32 v8, 0xff, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v10
-; GCN-NEXT: v_and_b32_e32 v10, 0xff, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 24, v12
-; GCN-NEXT: v_and_b32_e32 v12, 0xff, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 24, v14
-; GCN-NEXT: v_and_b32_e32 v14, 0xff, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 8, v16
-; GCN-NEXT: v_and_b32_e32 v16, 0xff, v17
-; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v18
-; GCN-NEXT: v_and_b32_e32 v18, 0xff, v19
-; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v20
-; GCN-NEXT: v_and_b32_e32 v20, 0xff, v21
-; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v22
-; GCN-NEXT: v_and_b32_e32 v22, 0xff, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v23, 8, v24
-; GCN-NEXT: v_and_b32_e32 v24, 0xff, v25
-; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v26
-; GCN-NEXT: v_and_b32_e32 v26, 0xff, v27
-; GCN-NEXT: v_lshlrev_b32_e32 v27, 24, v28
-; GCN-NEXT: v_and_b32_e32 v28, 0xff, v29
-; GCN-NEXT: v_lshlrev_b32_e32 v29, 24, v30
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v30, 0xff, v38
-; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v37
-; GCN-NEXT: v_and_b32_e32 v32, 0xff, v32
-; GCN-NEXT: v_lshlrev_b32_e32 v31, 24, v31
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_or_b32_e32 v6, v6, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v12
-; GCN-NEXT: v_or_b32_e32 v12, v14, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v18
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20
-; GCN-NEXT: v_or_b32_e32 v18, v22, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v24
-; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v26
-; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v28
-; GCN-NEXT: v_or_b32_e32 v24, v30, v33
-; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v32
-; GCN-NEXT: v_or_b32_e32 v50, v3, v0
-; GCN-NEXT: v_or_b32_e32 v54, v5, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v6
-; GCN-NEXT: v_or_b32_e32 v55, v9, v7
-; GCN-NEXT: v_or_b32_e32 v52, v11, v8
-; GCN-NEXT: v_or_b32_e32 v40, v13, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v12
-; GCN-NEXT: v_or_b32_e32 v41, v17, v14
-; GCN-NEXT: v_or_b32_e32 v33, v19, v15
-; GCN-NEXT: v_or_b32_e32 v39, v21, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v18
-; GCN-NEXT: v_or_b32_e32 v48, v25, v20
-; GCN-NEXT: v_or_b32_e32 v35, v27, v22
-; GCN-NEXT: v_or_b32_e32 v49, v29, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v24
-; GCN-NEXT: v_or_b32_e32 v0, v31, v26
-; GCN-NEXT: .LBB112_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v54
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v51
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v53
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v33
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v48
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v34
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v35
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v36
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
-; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
-; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
-; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
-; GCN-NEXT: v_alignbit_b32 v10, v0, v17, 16
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32i8_to_v16bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32
-; VI-NEXT: v_mov_b32_e32 v31, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v32, v31
-; VI-NEXT: v_mov_b32_e32 v33, v31
-; VI-NEXT: v_mov_b32_e32 v34, v31
-; VI-NEXT: v_mov_b32_e32 v35, v31
-; VI-NEXT: v_mov_b32_e32 v36, v31
-; VI-NEXT: v_mov_b32_e32 v37, v31
-; VI-NEXT: v_mov_b32_e32 v38, v31
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB112_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v31, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v8
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v10
-; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v32, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v12
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v14
-; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v33, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v16
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v18
-; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v34, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v22
-; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v35, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v24
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v26
-; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v36, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v28
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v30
-; VI-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v37, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v48
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v38, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: .LBB112_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32i8_to_v16bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32
-; GFX9-NEXT: v_mov_b32_e32 v31, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v32, v31
-; GFX9-NEXT: v_mov_b32_e32 v33, v31
-; GFX9-NEXT: v_mov_b32_e32 v34, v31
-; GFX9-NEXT: v_mov_b32_e32 v35, v31
-; GFX9-NEXT: v_mov_b32_e32 v36, v31
-; GFX9-NEXT: v_mov_b32_e32 v37, v31
-; GFX9-NEXT: v_mov_b32_e32 v38, v31
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB112_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v6
-; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
-; GFX9-NEXT: v_perm_b32 v31, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v8
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v10
-; GFX9-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v32, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v12
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v14
-; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v33, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v16
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v18
-; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v34, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v20
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v22
-; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v35, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v24
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v26
-; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v36, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v28
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v30
-; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v37, v3, v0, s6
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v48
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v38, v3, v0, s6
-; GFX9-NEXT: .LBB112_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32i8_to_v16bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:12
-; GFX11-NEXT: scratch_load_u16 v48, off, s32 offset:8
-; GFX11-NEXT: scratch_load_u16 v49, off, s32 offset:4
-; GFX11-NEXT: scratch_load_u16 v50, off, s32
-; GFX11-NEXT: v_mov_b32_e32 v31, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v32, v31
-; GFX11-NEXT: v_mov_b32_e32 v33, v31
-; GFX11-NEXT: v_mov_b32_e32 v34, v31
-; GFX11-NEXT: v_mov_b32_e32 v35, v31
-; GFX11-NEXT: v_mov_b32_e32 v36, v31
-; GFX11-NEXT: v_mov_b32_e32 v37, v31
-; GFX11-NEXT: v_mov_b32_e32 v38, v31
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB112_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v9
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v8
-; GFX11-NEXT: v_lshlrev_b16 v8, 8, v10
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v11
-; GFX11-NEXT: v_lshlrev_b16 v10, 8, v12
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v13
-; GFX11-NEXT: v_lshlrev_b16 v12, 8, v14
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT: v_or_b32_e32 v3, v6, v3
-; GFX11-NEXT: v_or_b32_e32 v5, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v6, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v7, v11, v12
-; GFX11-NEXT: v_perm_b32 v31, v4, v0, 0x5040100
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v15
-; GFX11-NEXT: v_perm_b32 v32, v5, v3, 0x5040100
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v16
-; GFX11-NEXT: v_perm_b32 v33, v7, v6, 0x5040100
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v17
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v18
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v19
-; GFX11-NEXT: v_lshlrev_b16 v7, 8, v20
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v21
-; GFX11-NEXT: v_lshlrev_b16 v9, 8, v22
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v23
-; GFX11-NEXT: v_lshlrev_b16 v11, 8, v24
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
-; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v25
-; GFX11-NEXT: v_lshlrev_b16 v8, 8, v26
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v27
-; GFX11-NEXT: v_lshlrev_b16 v10, 8, v28
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v29
-; GFX11-NEXT: v_lshlrev_b16 v12, 8, v30
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v50
-; GFX11-NEXT: v_lshlrev_b16 v14, 8, v49
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v48
-; GFX11-NEXT: v_lshlrev_b16 v16, 8, v39
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT: v_perm_b32 v34, v3, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v35, v5, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v36, v7, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v37, v9, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v38, v11, v10, 0x5040100
-; GFX11-NEXT: .LBB112_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <32 x i8> %value to <16 x bfloat>
- br label %end
-
-end:
- %phi = phi <16 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <16 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v32bf16_to_v8i64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v32bf16_to_v8i64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v32, v31
-; GCN-NEXT: v_mov_b32_e32 v33, v31
-; GCN-NEXT: v_mov_b32_e32 v34, v31
-; GCN-NEXT: v_mov_b32_e32 v35, v31
-; GCN-NEXT: v_mov_b32_e32 v36, v31
-; GCN-NEXT: v_mov_b32_e32 v37, v31
-; GCN-NEXT: v_mov_b32_e32 v38, v31
-; GCN-NEXT: v_mov_b32_e32 v48, v31
-; GCN-NEXT: v_mov_b32_e32 v49, v31
-; GCN-NEXT: v_mov_b32_e32 v50, v31
-; GCN-NEXT: v_mov_b32_e32 v51, v31
-; GCN-NEXT: v_mov_b32_e32 v52, v31
-; GCN-NEXT: v_mov_b32_e32 v53, v31
-; GCN-NEXT: v_mov_b32_e32 v54, v31
-; GCN-NEXT: v_mov_b32_e32 v55, v31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB113_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
-; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
-; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
-; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
-; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
-; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
-; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
-; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
-; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
-; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
-; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
-; GCN-NEXT: .LBB113_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32bf16_to_v8i64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v20, v19
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: v_mov_b32_e32 v22, v19
-; VI-NEXT: v_mov_b32_e32 v23, v19
-; VI-NEXT: v_mov_b32_e32 v24, v19
-; VI-NEXT: v_mov_b32_e32 v25, v19
-; VI-NEXT: v_mov_b32_e32 v26, v19
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v28, v19
-; VI-NEXT: v_mov_b32_e32 v29, v19
-; VI-NEXT: v_mov_b32_e32 v30, v19
-; VI-NEXT: v_mov_b32_e32 v31, v19
-; VI-NEXT: v_mov_b32_e32 v32, v19
-; VI-NEXT: v_mov_b32_e32 v33, v19
-; VI-NEXT: v_mov_b32_e32 v34, v19
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB113_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v34, v18
-; VI-NEXT: v_mov_b32_e32 v33, v17
-; VI-NEXT: v_mov_b32_e32 v32, v16
-; VI-NEXT: v_mov_b32_e32 v31, v15
-; VI-NEXT: v_mov_b32_e32 v30, v14
-; VI-NEXT: v_mov_b32_e32 v29, v13
-; VI-NEXT: v_mov_b32_e32 v28, v12
-; VI-NEXT: v_mov_b32_e32 v27, v11
-; VI-NEXT: v_mov_b32_e32 v26, v10
-; VI-NEXT: v_mov_b32_e32 v25, v9
-; VI-NEXT: v_mov_b32_e32 v24, v8
-; VI-NEXT: v_mov_b32_e32 v23, v7
-; VI-NEXT: v_mov_b32_e32 v22, v6
-; VI-NEXT: v_mov_b32_e32 v21, v5
-; VI-NEXT: v_mov_b32_e32 v20, v4
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: .LBB113_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32bf16_to_v8i64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v19
-; GFX9-NEXT: v_mov_b32_e32 v21, v19
-; GFX9-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-NEXT: v_mov_b32_e32 v23, v19
-; GFX9-NEXT: v_mov_b32_e32 v24, v19
-; GFX9-NEXT: v_mov_b32_e32 v25, v19
-; GFX9-NEXT: v_mov_b32_e32 v26, v19
-; GFX9-NEXT: v_mov_b32_e32 v27, v19
-; GFX9-NEXT: v_mov_b32_e32 v28, v19
-; GFX9-NEXT: v_mov_b32_e32 v29, v19
-; GFX9-NEXT: v_mov_b32_e32 v30, v19
-; GFX9-NEXT: v_mov_b32_e32 v31, v19
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
-; GFX9-NEXT: v_mov_b32_e32 v33, v19
-; GFX9-NEXT: v_mov_b32_e32 v34, v19
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB113_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v34, v18
-; GFX9-NEXT: v_mov_b32_e32 v33, v17
-; GFX9-NEXT: v_mov_b32_e32 v32, v16
-; GFX9-NEXT: v_mov_b32_e32 v31, v15
-; GFX9-NEXT: v_mov_b32_e32 v30, v14
-; GFX9-NEXT: v_mov_b32_e32 v29, v13
-; GFX9-NEXT: v_mov_b32_e32 v28, v12
-; GFX9-NEXT: v_mov_b32_e32 v27, v11
-; GFX9-NEXT: v_mov_b32_e32 v26, v10
-; GFX9-NEXT: v_mov_b32_e32 v25, v9
-; GFX9-NEXT: v_mov_b32_e32 v24, v8
-; GFX9-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-NEXT: v_mov_b32_e32 v22, v6
-; GFX9-NEXT: v_mov_b32_e32 v21, v5
-; GFX9-NEXT: v_mov_b32_e32 v20, v4
-; GFX9-NEXT: v_mov_b32_e32 v19, v3
-; GFX9-NEXT: .LBB113_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32bf16_to_v8i64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v19, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v20, v19
-; GFX11-NEXT: v_mov_b32_e32 v21, v19
-; GFX11-NEXT: v_mov_b32_e32 v22, v19
-; GFX11-NEXT: v_mov_b32_e32 v23, v19
-; GFX11-NEXT: v_mov_b32_e32 v24, v19
-; GFX11-NEXT: v_mov_b32_e32 v25, v19
-; GFX11-NEXT: v_mov_b32_e32 v26, v19
-; GFX11-NEXT: v_mov_b32_e32 v27, v19
-; GFX11-NEXT: v_mov_b32_e32 v28, v19
-; GFX11-NEXT: v_mov_b32_e32 v29, v19
-; GFX11-NEXT: v_mov_b32_e32 v30, v19
-; GFX11-NEXT: v_mov_b32_e32 v31, v19
-; GFX11-NEXT: v_mov_b32_e32 v32, v19
-; GFX11-NEXT: v_mov_b32_e32 v33, v19
-; GFX11-NEXT: v_mov_b32_e32 v34, v19
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB113_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
-; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
-; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
-; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
-; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
-; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
-; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
-; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
-; GFX11-NEXT: .LBB113_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <32 x bfloat> %value to <8 x i64>
- br label %end
-
-end:
- %phi = phi <8 x i64> [zeroinitializer, %entry], [%cast, %if]
- store <8 x i64> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v32bf16_to_v8f64(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v32bf16_to_v8f64:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v32, v31
-; GCN-NEXT: v_mov_b32_e32 v33, v31
-; GCN-NEXT: v_mov_b32_e32 v34, v31
-; GCN-NEXT: v_mov_b32_e32 v35, v31
-; GCN-NEXT: v_mov_b32_e32 v36, v31
-; GCN-NEXT: v_mov_b32_e32 v37, v31
-; GCN-NEXT: v_mov_b32_e32 v38, v31
-; GCN-NEXT: v_mov_b32_e32 v48, v31
-; GCN-NEXT: v_mov_b32_e32 v49, v31
-; GCN-NEXT: v_mov_b32_e32 v50, v31
-; GCN-NEXT: v_mov_b32_e32 v51, v31
-; GCN-NEXT: v_mov_b32_e32 v52, v31
-; GCN-NEXT: v_mov_b32_e32 v53, v31
-; GCN-NEXT: v_mov_b32_e32 v54, v31
-; GCN-NEXT: v_mov_b32_e32 v55, v31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB114_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
-; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
-; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
-; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
-; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
-; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
-; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
-; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
-; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
-; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
-; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
-; GCN-NEXT: .LBB114_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32bf16_to_v8f64:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v20, v19
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: v_mov_b32_e32 v22, v19
-; VI-NEXT: v_mov_b32_e32 v23, v19
-; VI-NEXT: v_mov_b32_e32 v24, v19
-; VI-NEXT: v_mov_b32_e32 v25, v19
-; VI-NEXT: v_mov_b32_e32 v26, v19
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v28, v19
-; VI-NEXT: v_mov_b32_e32 v29, v19
-; VI-NEXT: v_mov_b32_e32 v30, v19
-; VI-NEXT: v_mov_b32_e32 v31, v19
-; VI-NEXT: v_mov_b32_e32 v32, v19
-; VI-NEXT: v_mov_b32_e32 v33, v19
-; VI-NEXT: v_mov_b32_e32 v34, v19
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB114_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v34, v18
-; VI-NEXT: v_mov_b32_e32 v33, v17
-; VI-NEXT: v_mov_b32_e32 v32, v16
-; VI-NEXT: v_mov_b32_e32 v31, v15
-; VI-NEXT: v_mov_b32_e32 v30, v14
-; VI-NEXT: v_mov_b32_e32 v29, v13
-; VI-NEXT: v_mov_b32_e32 v28, v12
-; VI-NEXT: v_mov_b32_e32 v27, v11
-; VI-NEXT: v_mov_b32_e32 v26, v10
-; VI-NEXT: v_mov_b32_e32 v25, v9
-; VI-NEXT: v_mov_b32_e32 v24, v8
-; VI-NEXT: v_mov_b32_e32 v23, v7
-; VI-NEXT: v_mov_b32_e32 v22, v6
-; VI-NEXT: v_mov_b32_e32 v21, v5
-; VI-NEXT: v_mov_b32_e32 v20, v4
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: .LBB114_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32bf16_to_v8f64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v19
-; GFX9-NEXT: v_mov_b32_e32 v21, v19
-; GFX9-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-NEXT: v_mov_b32_e32 v23, v19
-; GFX9-NEXT: v_mov_b32_e32 v24, v19
-; GFX9-NEXT: v_mov_b32_e32 v25, v19
-; GFX9-NEXT: v_mov_b32_e32 v26, v19
-; GFX9-NEXT: v_mov_b32_e32 v27, v19
-; GFX9-NEXT: v_mov_b32_e32 v28, v19
-; GFX9-NEXT: v_mov_b32_e32 v29, v19
-; GFX9-NEXT: v_mov_b32_e32 v30, v19
-; GFX9-NEXT: v_mov_b32_e32 v31, v19
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
-; GFX9-NEXT: v_mov_b32_e32 v33, v19
-; GFX9-NEXT: v_mov_b32_e32 v34, v19
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB114_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v34, v18
-; GFX9-NEXT: v_mov_b32_e32 v33, v17
-; GFX9-NEXT: v_mov_b32_e32 v32, v16
-; GFX9-NEXT: v_mov_b32_e32 v31, v15
-; GFX9-NEXT: v_mov_b32_e32 v30, v14
-; GFX9-NEXT: v_mov_b32_e32 v29, v13
-; GFX9-NEXT: v_mov_b32_e32 v28, v12
-; GFX9-NEXT: v_mov_b32_e32 v27, v11
-; GFX9-NEXT: v_mov_b32_e32 v26, v10
-; GFX9-NEXT: v_mov_b32_e32 v25, v9
-; GFX9-NEXT: v_mov_b32_e32 v24, v8
-; GFX9-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-NEXT: v_mov_b32_e32 v22, v6
-; GFX9-NEXT: v_mov_b32_e32 v21, v5
-; GFX9-NEXT: v_mov_b32_e32 v20, v4
-; GFX9-NEXT: v_mov_b32_e32 v19, v3
-; GFX9-NEXT: .LBB114_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32bf16_to_v8f64:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v19, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v20, v19
-; GFX11-NEXT: v_mov_b32_e32 v21, v19
-; GFX11-NEXT: v_mov_b32_e32 v22, v19
-; GFX11-NEXT: v_mov_b32_e32 v23, v19
-; GFX11-NEXT: v_mov_b32_e32 v24, v19
-; GFX11-NEXT: v_mov_b32_e32 v25, v19
-; GFX11-NEXT: v_mov_b32_e32 v26, v19
-; GFX11-NEXT: v_mov_b32_e32 v27, v19
-; GFX11-NEXT: v_mov_b32_e32 v28, v19
-; GFX11-NEXT: v_mov_b32_e32 v29, v19
-; GFX11-NEXT: v_mov_b32_e32 v30, v19
-; GFX11-NEXT: v_mov_b32_e32 v31, v19
-; GFX11-NEXT: v_mov_b32_e32 v32, v19
-; GFX11-NEXT: v_mov_b32_e32 v33, v19
-; GFX11-NEXT: v_mov_b32_e32 v34, v19
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB114_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
-; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
-; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
-; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
-; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
-; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
-; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
-; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
-; GFX11-NEXT: .LBB114_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <32 x bfloat> %value to <8 x double>
- br label %end
-
-end:
- %phi = phi <8 x double> [zeroinitializer, %entry], [%cast, %if]
- store <8 x double> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v32bf16_to_v16i32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v32bf16_to_v16i32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v32, v31
-; GCN-NEXT: v_mov_b32_e32 v33, v31
-; GCN-NEXT: v_mov_b32_e32 v34, v31
-; GCN-NEXT: v_mov_b32_e32 v35, v31
-; GCN-NEXT: v_mov_b32_e32 v36, v31
-; GCN-NEXT: v_mov_b32_e32 v37, v31
-; GCN-NEXT: v_mov_b32_e32 v38, v31
-; GCN-NEXT: v_mov_b32_e32 v48, v31
-; GCN-NEXT: v_mov_b32_e32 v49, v31
-; GCN-NEXT: v_mov_b32_e32 v50, v31
-; GCN-NEXT: v_mov_b32_e32 v51, v31
-; GCN-NEXT: v_mov_b32_e32 v52, v31
-; GCN-NEXT: v_mov_b32_e32 v53, v31
-; GCN-NEXT: v_mov_b32_e32 v54, v31
-; GCN-NEXT: v_mov_b32_e32 v55, v31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB115_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
-; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
-; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
-; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
-; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
-; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
-; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
-; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
-; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
-; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
-; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
-; GCN-NEXT: .LBB115_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32bf16_to_v16i32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v20, v19
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: v_mov_b32_e32 v22, v19
-; VI-NEXT: v_mov_b32_e32 v23, v19
-; VI-NEXT: v_mov_b32_e32 v24, v19
-; VI-NEXT: v_mov_b32_e32 v25, v19
-; VI-NEXT: v_mov_b32_e32 v26, v19
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v28, v19
-; VI-NEXT: v_mov_b32_e32 v29, v19
-; VI-NEXT: v_mov_b32_e32 v30, v19
-; VI-NEXT: v_mov_b32_e32 v31, v19
-; VI-NEXT: v_mov_b32_e32 v32, v19
-; VI-NEXT: v_mov_b32_e32 v33, v19
-; VI-NEXT: v_mov_b32_e32 v34, v19
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB115_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v34, v18
-; VI-NEXT: v_mov_b32_e32 v33, v17
-; VI-NEXT: v_mov_b32_e32 v32, v16
-; VI-NEXT: v_mov_b32_e32 v31, v15
-; VI-NEXT: v_mov_b32_e32 v30, v14
-; VI-NEXT: v_mov_b32_e32 v29, v13
-; VI-NEXT: v_mov_b32_e32 v28, v12
-; VI-NEXT: v_mov_b32_e32 v27, v11
-; VI-NEXT: v_mov_b32_e32 v26, v10
-; VI-NEXT: v_mov_b32_e32 v25, v9
-; VI-NEXT: v_mov_b32_e32 v24, v8
-; VI-NEXT: v_mov_b32_e32 v23, v7
-; VI-NEXT: v_mov_b32_e32 v22, v6
-; VI-NEXT: v_mov_b32_e32 v21, v5
-; VI-NEXT: v_mov_b32_e32 v20, v4
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: .LBB115_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32bf16_to_v16i32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v19
-; GFX9-NEXT: v_mov_b32_e32 v21, v19
-; GFX9-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-NEXT: v_mov_b32_e32 v23, v19
-; GFX9-NEXT: v_mov_b32_e32 v24, v19
-; GFX9-NEXT: v_mov_b32_e32 v25, v19
-; GFX9-NEXT: v_mov_b32_e32 v26, v19
-; GFX9-NEXT: v_mov_b32_e32 v27, v19
-; GFX9-NEXT: v_mov_b32_e32 v28, v19
-; GFX9-NEXT: v_mov_b32_e32 v29, v19
-; GFX9-NEXT: v_mov_b32_e32 v30, v19
-; GFX9-NEXT: v_mov_b32_e32 v31, v19
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
-; GFX9-NEXT: v_mov_b32_e32 v33, v19
-; GFX9-NEXT: v_mov_b32_e32 v34, v19
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB115_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v34, v18
-; GFX9-NEXT: v_mov_b32_e32 v33, v17
-; GFX9-NEXT: v_mov_b32_e32 v32, v16
-; GFX9-NEXT: v_mov_b32_e32 v31, v15
-; GFX9-NEXT: v_mov_b32_e32 v30, v14
-; GFX9-NEXT: v_mov_b32_e32 v29, v13
-; GFX9-NEXT: v_mov_b32_e32 v28, v12
-; GFX9-NEXT: v_mov_b32_e32 v27, v11
-; GFX9-NEXT: v_mov_b32_e32 v26, v10
-; GFX9-NEXT: v_mov_b32_e32 v25, v9
-; GFX9-NEXT: v_mov_b32_e32 v24, v8
-; GFX9-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-NEXT: v_mov_b32_e32 v22, v6
-; GFX9-NEXT: v_mov_b32_e32 v21, v5
-; GFX9-NEXT: v_mov_b32_e32 v20, v4
-; GFX9-NEXT: v_mov_b32_e32 v19, v3
-; GFX9-NEXT: .LBB115_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32bf16_to_v16i32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v19, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v20, v19
-; GFX11-NEXT: v_mov_b32_e32 v21, v19
-; GFX11-NEXT: v_mov_b32_e32 v22, v19
-; GFX11-NEXT: v_mov_b32_e32 v23, v19
-; GFX11-NEXT: v_mov_b32_e32 v24, v19
-; GFX11-NEXT: v_mov_b32_e32 v25, v19
-; GFX11-NEXT: v_mov_b32_e32 v26, v19
-; GFX11-NEXT: v_mov_b32_e32 v27, v19
-; GFX11-NEXT: v_mov_b32_e32 v28, v19
-; GFX11-NEXT: v_mov_b32_e32 v29, v19
-; GFX11-NEXT: v_mov_b32_e32 v30, v19
-; GFX11-NEXT: v_mov_b32_e32 v31, v19
-; GFX11-NEXT: v_mov_b32_e32 v32, v19
-; GFX11-NEXT: v_mov_b32_e32 v33, v19
-; GFX11-NEXT: v_mov_b32_e32 v34, v19
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB115_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
-; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
-; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
-; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
-; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
-; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
-; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
-; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
-; GFX11-NEXT: .LBB115_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <32 x bfloat> %value to <16 x i32>
- br label %end
-
-end:
- %phi = phi <16 x i32> [zeroinitializer, %entry], [%cast, %if]
- store <16 x i32> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v32bf16_to_v16f32(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v32bf16_to_v16f32:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mov_b32_e32 v32, v31
-; GCN-NEXT: v_mov_b32_e32 v33, v31
-; GCN-NEXT: v_mov_b32_e32 v34, v31
-; GCN-NEXT: v_mov_b32_e32 v35, v31
-; GCN-NEXT: v_mov_b32_e32 v36, v31
-; GCN-NEXT: v_mov_b32_e32 v37, v31
-; GCN-NEXT: v_mov_b32_e32 v38, v31
-; GCN-NEXT: v_mov_b32_e32 v48, v31
-; GCN-NEXT: v_mov_b32_e32 v49, v31
-; GCN-NEXT: v_mov_b32_e32 v50, v31
-; GCN-NEXT: v_mov_b32_e32 v51, v31
-; GCN-NEXT: v_mov_b32_e32 v52, v31
-; GCN-NEXT: v_mov_b32_e32 v53, v31
-; GCN-NEXT: v_mov_b32_e32 v54, v31
-; GCN-NEXT: v_mov_b32_e32 v55, v31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB116_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31
-; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16
-; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16
-; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16
-; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16
-; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16
-; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16
-; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16
-; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16
-; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16
-; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16
-; GCN-NEXT: .LBB116_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(5)
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32bf16_to_v16f32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v20, v19
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: v_mov_b32_e32 v22, v19
-; VI-NEXT: v_mov_b32_e32 v23, v19
-; VI-NEXT: v_mov_b32_e32 v24, v19
-; VI-NEXT: v_mov_b32_e32 v25, v19
-; VI-NEXT: v_mov_b32_e32 v26, v19
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v28, v19
-; VI-NEXT: v_mov_b32_e32 v29, v19
-; VI-NEXT: v_mov_b32_e32 v30, v19
-; VI-NEXT: v_mov_b32_e32 v31, v19
-; VI-NEXT: v_mov_b32_e32 v32, v19
-; VI-NEXT: v_mov_b32_e32 v33, v19
-; VI-NEXT: v_mov_b32_e32 v34, v19
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB116_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v34, v18
-; VI-NEXT: v_mov_b32_e32 v33, v17
-; VI-NEXT: v_mov_b32_e32 v32, v16
-; VI-NEXT: v_mov_b32_e32 v31, v15
-; VI-NEXT: v_mov_b32_e32 v30, v14
-; VI-NEXT: v_mov_b32_e32 v29, v13
-; VI-NEXT: v_mov_b32_e32 v28, v12
-; VI-NEXT: v_mov_b32_e32 v27, v11
-; VI-NEXT: v_mov_b32_e32 v26, v10
-; VI-NEXT: v_mov_b32_e32 v25, v9
-; VI-NEXT: v_mov_b32_e32 v24, v8
-; VI-NEXT: v_mov_b32_e32 v23, v7
-; VI-NEXT: v_mov_b32_e32 v22, v6
-; VI-NEXT: v_mov_b32_e32 v21, v5
-; VI-NEXT: v_mov_b32_e32 v20, v4
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: .LBB116_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32bf16_to_v16f32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v19
-; GFX9-NEXT: v_mov_b32_e32 v21, v19
-; GFX9-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-NEXT: v_mov_b32_e32 v23, v19
-; GFX9-NEXT: v_mov_b32_e32 v24, v19
-; GFX9-NEXT: v_mov_b32_e32 v25, v19
-; GFX9-NEXT: v_mov_b32_e32 v26, v19
-; GFX9-NEXT: v_mov_b32_e32 v27, v19
-; GFX9-NEXT: v_mov_b32_e32 v28, v19
-; GFX9-NEXT: v_mov_b32_e32 v29, v19
-; GFX9-NEXT: v_mov_b32_e32 v30, v19
-; GFX9-NEXT: v_mov_b32_e32 v31, v19
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
-; GFX9-NEXT: v_mov_b32_e32 v33, v19
-; GFX9-NEXT: v_mov_b32_e32 v34, v19
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB116_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v34, v18
-; GFX9-NEXT: v_mov_b32_e32 v33, v17
-; GFX9-NEXT: v_mov_b32_e32 v32, v16
-; GFX9-NEXT: v_mov_b32_e32 v31, v15
-; GFX9-NEXT: v_mov_b32_e32 v30, v14
-; GFX9-NEXT: v_mov_b32_e32 v29, v13
-; GFX9-NEXT: v_mov_b32_e32 v28, v12
-; GFX9-NEXT: v_mov_b32_e32 v27, v11
-; GFX9-NEXT: v_mov_b32_e32 v26, v10
-; GFX9-NEXT: v_mov_b32_e32 v25, v9
-; GFX9-NEXT: v_mov_b32_e32 v24, v8
-; GFX9-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-NEXT: v_mov_b32_e32 v22, v6
-; GFX9-NEXT: v_mov_b32_e32 v21, v5
-; GFX9-NEXT: v_mov_b32_e32 v20, v4
-; GFX9-NEXT: v_mov_b32_e32 v19, v3
-; GFX9-NEXT: .LBB116_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32bf16_to_v16f32:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v19, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v20, v19
-; GFX11-NEXT: v_mov_b32_e32 v21, v19
-; GFX11-NEXT: v_mov_b32_e32 v22, v19
-; GFX11-NEXT: v_mov_b32_e32 v23, v19
-; GFX11-NEXT: v_mov_b32_e32 v24, v19
-; GFX11-NEXT: v_mov_b32_e32 v25, v19
-; GFX11-NEXT: v_mov_b32_e32 v26, v19
-; GFX11-NEXT: v_mov_b32_e32 v27, v19
-; GFX11-NEXT: v_mov_b32_e32 v28, v19
-; GFX11-NEXT: v_mov_b32_e32 v29, v19
-; GFX11-NEXT: v_mov_b32_e32 v30, v19
-; GFX11-NEXT: v_mov_b32_e32 v31, v19
-; GFX11-NEXT: v_mov_b32_e32 v32, v19
-; GFX11-NEXT: v_mov_b32_e32 v33, v19
-; GFX11-NEXT: v_mov_b32_e32 v34, v19
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB116_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
-; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
-; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
-; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
-; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
-; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
-; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
-; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
-; GFX11-NEXT: .LBB116_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <32 x bfloat> %value to <16 x float>
- br label %end
-
-end:
- %phi = phi <16 x float> [zeroinitializer, %entry], [%cast, %if]
- store <16 x float> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v32bf16_to_v32f16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v32bf16_to_v32f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v46, 0
-; GCN-NEXT: v_mov_b32_e32 v60, 0
-; GCN-NEXT: v_mov_b32_e32 v56, 0
-; GCN-NEXT: v_mov_b32_e32 v61, 0
-; GCN-NEXT: v_mov_b32_e32 v57, 0
-; GCN-NEXT: v_mov_b32_e32 v62, 0
-; GCN-NEXT: v_mov_b32_e32 v58, 0
-; GCN-NEXT: v_mov_b32_e32 v63, 0
-; GCN-NEXT: v_mov_b32_e32 v54, 0
-; GCN-NEXT: v_mov_b32_e32 v42, 0
-; GCN-NEXT: v_mov_b32_e32 v55, 0
-; GCN-NEXT: v_mov_b32_e32 v43, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: v_mov_b32_e32 v44, 0
-; GCN-NEXT: v_mov_b32_e32 v41, 0
-; GCN-NEXT: v_mov_b32_e32 v45, 0
-; GCN-NEXT: v_mov_b32_e32 v38, 0
-; GCN-NEXT: v_mov_b32_e32 v50, 0
-; GCN-NEXT: v_mov_b32_e32 v39, 0
-; GCN-NEXT: v_mov_b32_e32 v51, 0
-; GCN-NEXT: v_mov_b32_e32 v48, 0
-; GCN-NEXT: v_mov_b32_e32 v52, 0
-; GCN-NEXT: v_mov_b32_e32 v49, 0
-; GCN-NEXT: v_mov_b32_e32 v53, 0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: v_mov_b32_e32 v35, 0
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: v_mov_b32_e32 v36, 0
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: v_mov_b32_e32 v37, 0
-; GCN-NEXT: v_mov_b32_e32 v34, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB117_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v13,
1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v28 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v59 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v32 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_cvt_f32_f16_e32 v46, v0 -; GCN-NEXT: v_cvt_f32_f16_e32 v60, v3 -; GCN-NEXT: v_cvt_f32_f16_e32 v56, v4 -; GCN-NEXT: v_cvt_f32_f16_e32 v61, v5 -; GCN-NEXT: v_cvt_f32_f16_e32 v57, v6 -; GCN-NEXT: v_cvt_f32_f16_e32 v62, v7 -; GCN-NEXT: v_cvt_f32_f16_e32 v58, v8 -; GCN-NEXT: v_cvt_f32_f16_e32 v63, v9 -; GCN-NEXT: v_cvt_f32_f16_e32 v54, v10 -; GCN-NEXT: v_cvt_f32_f16_e32 v42, v11 -; GCN-NEXT: v_cvt_f32_f16_e32 v55, v12 -; GCN-NEXT: v_cvt_f32_f16_e32 v43, v13 -; GCN-NEXT: v_cvt_f32_f16_e32 v40, v14 -; GCN-NEXT: v_cvt_f32_f16_e32 v44, v15 -; GCN-NEXT: v_cvt_f32_f16_e32 v41, v16 -; GCN-NEXT: v_cvt_f32_f16_e32 v45, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v38, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v50, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v39, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v51, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v48, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v52, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v49, v24 -; GCN-NEXT: v_cvt_f32_f16_e32 v53, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v31, v26 -; GCN-NEXT: v_cvt_f32_f16_e32 v35, v27 -; GCN-NEXT: v_cvt_f32_f16_e32 v32, v28 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v36, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v33, v30 -; GCN-NEXT: v_cvt_f32_f16_e32 v37, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v34, v47 -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v29 -; GCN-NEXT: .LBB117_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v60 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v46 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v61 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v56 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v62 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v57 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v63 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v58 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v42 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v54 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v43 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v55 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v44 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v40 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v45 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v41 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v50 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v38 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v51 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v39 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v52 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v48 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v53 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v49 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v35 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v31 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v30, v32 -; GCN-NEXT: v_cvt_f16_f32_e32 v31, v37 -; GCN-NEXT: v_cvt_f16_f32_e32 v32, v33 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v33, v34 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_or_b32_e32 v3, v4, v3 -; GCN-NEXT: v_or_b32_e32 v4, v6, v5 -; GCN-NEXT: v_or_b32_e32 v5, v8, v7 -; GCN-NEXT: v_or_b32_e32 v6, v10, v9 -; GCN-NEXT: v_or_b32_e32 v7, v12, v11 -; GCN-NEXT: v_or_b32_e32 v8, v14, v13 -; GCN-NEXT: v_or_b32_e32 v9, v16, v15 -; GCN-NEXT: v_or_b32_e32 v10, v18, v17 -; GCN-NEXT: v_or_b32_e32 v11, v20, v19 -; GCN-NEXT: v_or_b32_e32 v12, v22, v21 -; GCN-NEXT: v_or_b32_e32 v13, v24, v23 -; GCN-NEXT: v_or_b32_e32 v14, v26, v25 -; GCN-NEXT: v_or_b32_e32 v15, v28, v27 -; GCN-NEXT: v_or_b32_e32 v16, v30, v29 -; GCN-NEXT: v_or_b32_e32 v17, v32, v31 -; GCN-NEXT: v_or_b32_e32 v18, v33, v0 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte 
Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v32bf16_to_v32f16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v20, v19 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v22, v19 -; VI-NEXT: v_mov_b32_e32 v23, v19 -; VI-NEXT: v_mov_b32_e32 v24, v19 -; VI-NEXT: v_mov_b32_e32 v25, v19 -; VI-NEXT: v_mov_b32_e32 v26, v19 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v28, v19 -; VI-NEXT: v_mov_b32_e32 v29, v19 -; VI-NEXT: v_mov_b32_e32 v30, v19 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v32, v19 -; VI-NEXT: v_mov_b32_e32 v33, v19 -; VI-NEXT: v_mov_b32_e32 v34, v19 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB117_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v34, v18 -; VI-NEXT: v_mov_b32_e32 v33, v17 -; VI-NEXT: v_mov_b32_e32 v32, v16 -; VI-NEXT: v_mov_b32_e32 v31, v15 -; VI-NEXT: v_mov_b32_e32 v30, v14 -; VI-NEXT: v_mov_b32_e32 v29, v13 -; VI-NEXT: v_mov_b32_e32 v28, v12 -; VI-NEXT: v_mov_b32_e32 v27, v11 -; VI-NEXT: v_mov_b32_e32 v26, v10 -; VI-NEXT: v_mov_b32_e32 v25, v9 -; VI-NEXT: v_mov_b32_e32 v24, v8 -; VI-NEXT: v_mov_b32_e32 v23, v7 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v21, v5 -; VI-NEXT: v_mov_b32_e32 v20, v4 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: .LBB117_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32bf16_to_v32f16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v20, v19 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-NEXT: v_mov_b32_e32 v23, v19 -; GFX9-NEXT: 
v_mov_b32_e32 v24, v19 -; GFX9-NEXT: v_mov_b32_e32 v25, v19 -; GFX9-NEXT: v_mov_b32_e32 v26, v19 -; GFX9-NEXT: v_mov_b32_e32 v27, v19 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v29, v19 -; GFX9-NEXT: v_mov_b32_e32 v30, v19 -; GFX9-NEXT: v_mov_b32_e32 v31, v19 -; GFX9-NEXT: v_mov_b32_e32 v32, v19 -; GFX9-NEXT: v_mov_b32_e32 v33, v19 -; GFX9-NEXT: v_mov_b32_e32 v34, v19 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB117_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v34, v18 -; GFX9-NEXT: v_mov_b32_e32 v33, v17 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-NEXT: v_mov_b32_e32 v30, v14 -; GFX9-NEXT: v_mov_b32_e32 v29, v13 -; GFX9-NEXT: v_mov_b32_e32 v28, v12 -; GFX9-NEXT: v_mov_b32_e32 v27, v11 -; GFX9-NEXT: v_mov_b32_e32 v26, v10 -; GFX9-NEXT: v_mov_b32_e32 v25, v9 -; GFX9-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-NEXT: v_mov_b32_e32 v22, v6 -; GFX9-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: .LBB117_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32bf16_to_v32f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v19, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v20, v19 -; GFX11-NEXT: v_mov_b32_e32 v21, v19 -; GFX11-NEXT: v_mov_b32_e32 v22, v19 -; GFX11-NEXT: v_mov_b32_e32 v23, v19 -; GFX11-NEXT: v_mov_b32_e32 v24, v19 -; GFX11-NEXT: v_mov_b32_e32 v25, v19 -; GFX11-NEXT: v_mov_b32_e32 v26, v19 -; GFX11-NEXT: v_mov_b32_e32 v27, v19 -; GFX11-NEXT: v_mov_b32_e32 v28, v19 -; GFX11-NEXT: v_mov_b32_e32 v29, v19 -; GFX11-NEXT: v_mov_b32_e32 v30, v19 -; GFX11-NEXT: v_mov_b32_e32 v31, v19 -; GFX11-NEXT: v_mov_b32_e32 v32, v19 -; GFX11-NEXT: v_mov_b32_e32 v33, v19 -; GFX11-NEXT: v_mov_b32_e32 v34, v19 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB117_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 -; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 -; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 -; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 -; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 -; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 -; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 -; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: .LBB117_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <32 x bfloat> %value to <32 x half> - br label %end - -end: - %phi = phi <32 x half> [zeroinitializer, %entry], 
[%cast, %if] - store <32 x half> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v32bf16_to_v32i16(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v32bf16_to_v32i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v33, v31 -; GCN-NEXT: v_mov_b32_e32 v34, v31 -; GCN-NEXT: v_mov_b32_e32 v35, v31 -; GCN-NEXT: v_mov_b32_e32 v36, v31 -; GCN-NEXT: v_mov_b32_e32 v37, v31 -; GCN-NEXT: v_mov_b32_e32 v38, v31 -; GCN-NEXT: v_mov_b32_e32 v48, v31 -; GCN-NEXT: v_mov_b32_e32 v49, v31 -; GCN-NEXT: v_mov_b32_e32 v50, v31 -; GCN-NEXT: v_mov_b32_e32 v51, v31 -; GCN-NEXT: v_mov_b32_e32 v52, v31 -; GCN-NEXT: v_mov_b32_e32 v53, v31 -; GCN-NEXT: v_mov_b32_e32 v54, v31 -; GCN-NEXT: v_mov_b32_e32 v55, v31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB118_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; 
GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31 -; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16 -; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16 -; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16 -; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16 -; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16 -; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16 -; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16 -; GCN-NEXT: .LBB118_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v32bf16_to_v32i16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v20, v19 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v22, v19 -; VI-NEXT: v_mov_b32_e32 v23, v19 -; VI-NEXT: v_mov_b32_e32 v24, v19 -; VI-NEXT: v_mov_b32_e32 v25, v19 -; VI-NEXT: v_mov_b32_e32 v26, v19 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v28, v19 -; VI-NEXT: v_mov_b32_e32 v29, v19 -; VI-NEXT: v_mov_b32_e32 v30, v19 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v32, v19 -; VI-NEXT: v_mov_b32_e32 v33, v19 -; VI-NEXT: v_mov_b32_e32 v34, v19 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB118_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v34, v18 -; VI-NEXT: v_mov_b32_e32 v33, v17 -; VI-NEXT: v_mov_b32_e32 v32, v16 -; VI-NEXT: v_mov_b32_e32 v31, v15 -; VI-NEXT: v_mov_b32_e32 v30, v14 -; VI-NEXT: v_mov_b32_e32 v29, v13 -; VI-NEXT: v_mov_b32_e32 v28, v12 -; VI-NEXT: v_mov_b32_e32 v27, v11 -; VI-NEXT: v_mov_b32_e32 v26, v10 -; VI-NEXT: v_mov_b32_e32 v25, v9 -; VI-NEXT: v_mov_b32_e32 v24, v8 -; VI-NEXT: v_mov_b32_e32 v23, v7 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v21, v5 -; VI-NEXT: v_mov_b32_e32 v20, v4 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: .LBB118_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: 
flat_store_dwordx4 v[3:4], v[23:26] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32bf16_to_v32i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v20, v19 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-NEXT: v_mov_b32_e32 v23, v19 -; GFX9-NEXT: v_mov_b32_e32 v24, v19 -; GFX9-NEXT: v_mov_b32_e32 v25, v19 -; GFX9-NEXT: v_mov_b32_e32 v26, v19 -; GFX9-NEXT: v_mov_b32_e32 v27, v19 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v29, v19 -; GFX9-NEXT: v_mov_b32_e32 v30, v19 -; GFX9-NEXT: v_mov_b32_e32 v31, v19 -; GFX9-NEXT: v_mov_b32_e32 v32, v19 -; GFX9-NEXT: v_mov_b32_e32 v33, v19 -; GFX9-NEXT: v_mov_b32_e32 v34, v19 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB118_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v34, v18 -; GFX9-NEXT: v_mov_b32_e32 v33, v17 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-NEXT: v_mov_b32_e32 v30, v14 -; GFX9-NEXT: v_mov_b32_e32 v29, v13 -; GFX9-NEXT: v_mov_b32_e32 v28, v12 -; GFX9-NEXT: v_mov_b32_e32 v27, v11 -; GFX9-NEXT: v_mov_b32_e32 v26, v10 -; GFX9-NEXT: v_mov_b32_e32 v25, v9 -; GFX9-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-NEXT: v_mov_b32_e32 v22, v6 -; GFX9-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: .LBB118_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32bf16_to_v32i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v19, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v20, v19 -; GFX11-NEXT: v_mov_b32_e32 v21, v19 -; GFX11-NEXT: v_mov_b32_e32 v22, v19 -; GFX11-NEXT: v_mov_b32_e32 v23, v19 -; GFX11-NEXT: v_mov_b32_e32 v24, v19 -; GFX11-NEXT: v_mov_b32_e32 v25, v19 -; GFX11-NEXT: v_mov_b32_e32 v26, v19 -; GFX11-NEXT: v_mov_b32_e32 v27, v19 -; GFX11-NEXT: v_mov_b32_e32 v28, v19 -; GFX11-NEXT: v_mov_b32_e32 v29, v19 -; GFX11-NEXT: v_mov_b32_e32 v30, v19 -; GFX11-NEXT: v_mov_b32_e32 v31, v19 -; GFX11-NEXT: v_mov_b32_e32 v32, v19 -; GFX11-NEXT: v_mov_b32_e32 v33, v19 -; GFX11-NEXT: v_mov_b32_e32 v34, v19 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB118_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 -; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 -; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 -; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 -; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 -; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 -; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 -; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: .LBB118_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: 
s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <32 x bfloat> %value to <32 x i16> - br label %end - -end: - %phi = phi <32 x i16> [zeroinitializer, %entry], [%cast, %if] - store <32 x i16> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v32bf16_to_v64i8(i32 %cond, ptr addrspace(1) %out, <32 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v32bf16_to_v64i8: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(2) -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v33, v31 -; GCN-NEXT: v_mov_b32_e32 v34, v31 -; GCN-NEXT: v_mov_b32_e32 v35, v31 -; GCN-NEXT: v_mov_b32_e32 v36, v31 -; GCN-NEXT: v_mov_b32_e32 v37, v31 -; GCN-NEXT: v_mov_b32_e32 v38, v31 -; GCN-NEXT: v_mov_b32_e32 v48, v31 -; GCN-NEXT: v_mov_b32_e32 v49, v31 -; GCN-NEXT: v_mov_b32_e32 v50, v31 -; GCN-NEXT: v_mov_b32_e32 v51, v31 -; GCN-NEXT: v_mov_b32_e32 v52, v31 -; GCN-NEXT: v_mov_b32_e32 v53, v31 -; GCN-NEXT: v_mov_b32_e32 v54, v31 -; GCN-NEXT: v_mov_b32_e32 v55, v31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB119_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v42 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v40 -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: 
v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v31 -; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 -; GCN-NEXT: v_alignbit_b32 v33, v6, v7, 16 -; GCN-NEXT: v_alignbit_b32 v34, v8, v9, 16 -; GCN-NEXT: v_alignbit_b32 v35, v10, v11, 16 -; GCN-NEXT: v_alignbit_b32 v36, v12, v13, 16 -; GCN-NEXT: v_alignbit_b32 v37, v14, v15, 16 -; GCN-NEXT: v_alignbit_b32 v38, v16, v17, 16 -; GCN-NEXT: v_alignbit_b32 v48, v18, v19, 16 -; GCN-NEXT: v_alignbit_b32 v49, v20, v21, 16 -; GCN-NEXT: v_alignbit_b32 v50, v22, v23, 16 -; GCN-NEXT: v_alignbit_b32 v51, v24, v25, 16 -; GCN-NEXT: v_alignbit_b32 v52, v26, v27, 16 -; GCN-NEXT: v_alignbit_b32 v53, v28, v29, 16 -; GCN-NEXT: v_alignbit_b32 v54, v30, v54, 16 -; GCN-NEXT: v_alignbit_b32 v55, v55, v39, 16 -; GCN-NEXT: .LBB119_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(5) -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v32bf16_to_v64i8: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v20, v19 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v22, v19 -; VI-NEXT: v_mov_b32_e32 v23, v19 -; VI-NEXT: v_mov_b32_e32 v24, v19 -; VI-NEXT: v_mov_b32_e32 v25, v19 -; VI-NEXT: v_mov_b32_e32 v26, v19 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v28, v19 -; VI-NEXT: v_mov_b32_e32 v29, v19 -; VI-NEXT: v_mov_b32_e32 v30, v19 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v32, v19 -; VI-NEXT: v_mov_b32_e32 v33, v19 -; VI-NEXT: v_mov_b32_e32 v34, v19 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB119_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v34, v18 -; VI-NEXT: v_mov_b32_e32 v33, v17 -; VI-NEXT: v_mov_b32_e32 v32, v16 -; VI-NEXT: v_mov_b32_e32 v31, v15 -; VI-NEXT: v_mov_b32_e32 v30, v14 -; VI-NEXT: v_mov_b32_e32 v29, v13 -; VI-NEXT: v_mov_b32_e32 v28, v12 -; VI-NEXT: v_mov_b32_e32 v27, v11 -; VI-NEXT: v_mov_b32_e32 v26, v10 -; VI-NEXT: v_mov_b32_e32 v25, v9 -; VI-NEXT: v_mov_b32_e32 v24, v8 -; VI-NEXT: v_mov_b32_e32 v23, v7 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v21, v5 -; 
VI-NEXT: v_mov_b32_e32 v20, v4 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: .LBB119_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32bf16_to_v64i8: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v20, v19 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-NEXT: v_mov_b32_e32 v23, v19 -; GFX9-NEXT: v_mov_b32_e32 v24, v19 -; GFX9-NEXT: v_mov_b32_e32 v25, v19 -; GFX9-NEXT: v_mov_b32_e32 v26, v19 -; GFX9-NEXT: v_mov_b32_e32 v27, v19 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v29, v19 -; GFX9-NEXT: v_mov_b32_e32 v30, v19 -; GFX9-NEXT: v_mov_b32_e32 v31, v19 -; GFX9-NEXT: v_mov_b32_e32 v32, v19 -; GFX9-NEXT: v_mov_b32_e32 v33, v19 -; GFX9-NEXT: v_mov_b32_e32 v34, v19 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB119_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v34, v18 -; GFX9-NEXT: v_mov_b32_e32 v33, v17 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-NEXT: v_mov_b32_e32 v30, v14 -; GFX9-NEXT: v_mov_b32_e32 v29, v13 -; GFX9-NEXT: v_mov_b32_e32 v28, v12 -; GFX9-NEXT: v_mov_b32_e32 v27, v11 -; GFX9-NEXT: v_mov_b32_e32 v26, v10 -; GFX9-NEXT: v_mov_b32_e32 v25, v9 -; GFX9-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-NEXT: v_mov_b32_e32 v22, v6 -; GFX9-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: .LBB119_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32bf16_to_v64i8: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v19, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v20, v19 -; GFX11-NEXT: v_mov_b32_e32 v21, v19 -; GFX11-NEXT: v_mov_b32_e32 v22, v19 -; GFX11-NEXT: v_mov_b32_e32 v23, v19 -; GFX11-NEXT: v_mov_b32_e32 v24, v19 -; GFX11-NEXT: v_mov_b32_e32 v25, v19 -; GFX11-NEXT: v_mov_b32_e32 v26, v19 -; GFX11-NEXT: v_mov_b32_e32 v27, v19 -; GFX11-NEXT: v_mov_b32_e32 v28, v19 -; GFX11-NEXT: v_mov_b32_e32 v29, v19 -; GFX11-NEXT: v_mov_b32_e32 v30, v19 -; GFX11-NEXT: v_mov_b32_e32 v31, v19 -; GFX11-NEXT: v_mov_b32_e32 v32, v19 -; GFX11-NEXT: v_mov_b32_e32 v33, v19 -; GFX11-NEXT: v_mov_b32_e32 v34, v19 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB119_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 -; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: 
v_dual_mov_b32 v31, v15 -; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 -; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 -; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 -; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 -; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 -; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: .LBB119_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <32 x bfloat> %value to <64 x i8> - br label %end - -end: - %phi = phi <64 x i8> [zeroinitializer, %entry], [%cast, %if] - store <64 x i8> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v64i8_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <64 x i8> %value) { -; GCN-LABEL: v_bitcast_v64i8_to_v32bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 
4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 
4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v50, 0 -; GCN-NEXT: v_mov_b32_e32 v52, 0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: v_mov_b32_e32 v18, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: v_mov_b32_e32 v55, 0 -; GCN-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v36, 0 -; GCN-NEXT: v_mov_b32_e32 v38, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NEXT: v_mov_b32_e32 v63, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-NEXT: v_mov_b32_e32 v34, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB120_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v8 -; GCN-NEXT: v_or_b32_e32 v31, v0, v7 -; GCN-NEXT: v_and_b32_e32 v0, 0xff, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 8, v16 -; GCN-NEXT: v_or_b32_e32 v0, v0, v7 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; 
GCN-NEXT: v_and_b32_e32 v7, 0xff, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 8, v24 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v7, v8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_and_b32_e32 v8, 0xff, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 8, v44 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, v8, v11 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GCN-NEXT: v_and_b32_e32 v11, 0xff, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 24, v4 -; GCN-NEXT: v_and_b32_e32 v18, 0xff, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 24, v6 -; GCN-NEXT: v_and_b32_e32 v20, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 24, v10 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v22, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v26, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v27, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 24, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v28, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v6, 24, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v29, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 24, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v30, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v10, 24, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v32, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v21, 24, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v49, 0xff, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v19, 24, v0 -; GCN-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 24, v43 
-; GCN-NEXT: v_and_b32_e32 v13, 0xff, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v25, 24, v42
-; GCN-NEXT: v_and_b32_e32 v14, 0xff, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 24, v17
-; GCN-NEXT: v_and_b32_e32 v51, 0xff, v62
-; GCN-NEXT: v_lshlrev_b32_e32 v53, 8, v61
-; GCN-NEXT: v_and_b32_e32 v55, 0xff, v60
-; GCN-NEXT: v_lshlrev_b32_e32 v16, 24, v59
-; GCN-NEXT: v_and_b32_e32 v40, 0xff, v58
-; GCN-NEXT: v_lshlrev_b32_e32 v57, 24, v57
-; GCN-NEXT: v_and_b32_e32 v41, 0xff, v56
-; GCN-NEXT: v_lshlrev_b32_e32 v38, 24, v47
-; GCN-NEXT: v_and_b32_e32 v46, 0xff, v46
-; GCN-NEXT: v_lshlrev_b32_e32 v45, 8, v45
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v47, 0xff, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v39, 24, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v56, 0xff, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v44, 24, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v58, 0xff, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v43, 24, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v59, 0xff, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v60, 8, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v61, 0xff, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v42, 24, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v62, 0xff, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v23, 24, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v63, 0xff, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v24, 24, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v33, 8, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v34, 0xff, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v17, 24, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v20
-; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v26
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v27
-; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v28
-; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v30
-; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v32
-; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v49
-; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v14
-; GCN-NEXT: v_or_b32_e32 v12, v51, v53
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v55
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v40
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v41
-; GCN-NEXT: v_or_b32_e32 v45, v46, v45
-; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v47
-; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v56
-; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v58
-; GCN-NEXT: v_or_b32_e32 v58, v59, v60
-; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v61
-; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v62
-; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v63
-; GCN-NEXT: v_or_b32_e32 v0, v0, v33
-; GCN-NEXT: v_lshlrev_b32_e32 v62, 16, v34
-; GCN-NEXT: v_or_b32_e32 v50, v50, v3
-; GCN-NEXT: v_or_b32_e32 v52, v52, v18
-; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
-; GCN-NEXT: v_or_b32_e32 v48, v48, v4
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v18, v3, v36
-; GCN-NEXT: v_or_b32_e32 v40, v7, v37
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3
-; GCN-NEXT: v_or_b32_e32 v41, v8, v11
-; GCN-NEXT: v_or_b32_e32 v22, v6, v20
-; GCN-NEXT: v_or_b32_e32 v20, v9, v35
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v3
-; GCN-NEXT: v_or_b32_e32 v53, v10, v29
-; GCN-NEXT: v_or_b32_e32 v21, v21, v30
-; GCN-NEXT: v_or_b32_e32 v19, v19, v32
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v3
-; GCN-NEXT: v_or_b32_e32 v54, v54, v26
-; GCN-NEXT: v_or_b32_e32 v35, v25, v27
-; GCN-NEXT: v_or_b32_e32 v37, v15, v28
-; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v12
-; GCN-NEXT: v_or_b32_e32 v25, v16, v13
-; GCN-NEXT: v_or_b32_e32 v36, v57, v14
-; GCN-NEXT: v_or_b32_e32 v38, v38, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v45
-; GCN-NEXT: v_or_b32_e32 v39, v39, v46
-; GCN-NEXT: v_or_b32_e32 v63, v44, v47
-; GCN-NEXT: v_or_b32_e32 v29, v43, v56
-; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v58
-; GCN-NEXT: v_or_b32_e32 v34, v42, v59
-; GCN-NEXT: v_or_b32_e32 v30, v23, v60
-; GCN-NEXT: v_or_b32_e32 v28, v24, v61
-; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v0
-; GCN-NEXT: v_or_b32_e32 v0, v17, v62
-; GCN-NEXT: .LBB120_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v52
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v50
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v48
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v31
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v55
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v53
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v54
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v51
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v37
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v35
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v38
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v36
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v63
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v34
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
-; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
-; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
-; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
-; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
-; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16
-; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16
-; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16
-; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16
-; GCN-NEXT: v_alignbit_b32 v15, v27, v29, 16
-; GCN-NEXT: v_alignbit_b32 v16, v31, v32, 16
-; GCN-NEXT: v_alignbit_b32 v17, v28, v30, 16
-; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v64i8_to_v32bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:136
-; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:128
-; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
-; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:120
-; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
-; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:112
-; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108
-; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:104
-; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100
-; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96
-; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92
-; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88
-; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84
-; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80
-; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76
-; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72
-; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64
-; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60
-; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56
-; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:52
-; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48
-; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:40
-; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36
-; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32
-; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28
-; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24
-; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20
-; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16
-; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32
-; VI-NEXT: v_mov_b32_e32 v31, 0
-; VI-NEXT: v_mov_b32_e32 v32, v31
-; VI-NEXT: v_mov_b32_e32 v33, v31
-; VI-NEXT: v_mov_b32_e32 v34, v31
-; VI-NEXT: v_mov_b32_e32 v35, v31
-; VI-NEXT: v_mov_b32_e32 v36, v31
-; VI-NEXT: v_mov_b32_e32 v37, v31
-; VI-NEXT: v_mov_b32_e32 v38, v31
-; VI-NEXT: v_mov_b32_e32 v48, v31
-; VI-NEXT: v_mov_b32_e32 v49, v31
-; VI-NEXT: v_mov_b32_e32 v50, v31
-; VI-NEXT: v_mov_b32_e32 v51, v31
-; VI-NEXT: v_mov_b32_e32 v52, v31
-; VI-NEXT: v_mov_b32_e32 v53, v31
-; VI-NEXT: v_mov_b32_e32 v54, v31
-; VI-NEXT: v_mov_b32_e32 v55, v31
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB120_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v6
-; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v8
-; VI-NEXT: v_lshlrev_b16_e32 v6, 8, v10
-; VI-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v31, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: v_or_b32_sdwa v32, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v12
-; VI-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v33, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v34, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v35, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v36, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v37, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v13
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v15
-; VI-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v38, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v17
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19
-; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v48, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v21
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v23
-; VI-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v4, v26, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v49, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v25
-; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v27
-; VI-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v50, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v29
-; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63
-; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v51, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v61
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v59
-; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v52, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v57
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v47
-; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v53, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v43
-; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v54, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41
-; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v39
-; VI-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v55, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: .LBB120_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[52:55]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[48:51]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34]
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v64i8_to_v32bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140
-; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:136
-; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132
-; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:128
-; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124
-; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:120
-; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116
-; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:112
-; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:108
-; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:104
-; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:100
-; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:96
-; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:92
-; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:88
-; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:84
-; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:80
-; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:76
-; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:72
-; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:68
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:64
-; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:60
-; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:56
-; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:52
-; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:48
-; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44
-; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:40
-; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:36
-; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:32
-; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:24
-; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:16
-; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32
-; GFX9-NEXT: v_mov_b32_e32 v31, 0
-; GFX9-NEXT: v_mov_b32_e32 v32, v31
-; GFX9-NEXT: v_mov_b32_e32 v33, v31
-; GFX9-NEXT: v_mov_b32_e32 v34, v31
-; GFX9-NEXT: v_mov_b32_e32 v35, v31
-; GFX9-NEXT: v_mov_b32_e32 v36, v31
-; GFX9-NEXT: v_mov_b32_e32 v37, v31
-; GFX9-NEXT: v_mov_b32_e32 v38, v31
-; GFX9-NEXT: v_mov_b32_e32 v48, v31
-; GFX9-NEXT: v_mov_b32_e32 v49, v31
-; GFX9-NEXT: v_mov_b32_e32 v50, v31
-; GFX9-NEXT: v_mov_b32_e32 v51, v31
-; GFX9-NEXT: v_mov_b32_e32 v52, v31
-; GFX9-NEXT: v_mov_b32_e32 v53, v31
-; GFX9-NEXT: v_mov_b32_e32 v54, v31
-; GFX9-NEXT: v_mov_b32_e32 v55, v31
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB120_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v6
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v8
-; GFX9-NEXT: v_lshlrev_b16_e32 v6, 8, v10
-; GFX9-NEXT: s_mov_b32 s6, 0x5040100
-; GFX9-NEXT: v_or_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v31, v4, v3, s6
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GFX9-NEXT: v_perm_b32 v32, v6, v5, s6
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v12
-; GFX9-NEXT: v_or_b32_sdwa v3, v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v33, v4, v3, s6
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v34, v4, v3, s6
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v35, v4, v3, s6
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v36, v4, v3, s6
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4
-; GFX9-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v37, v4, v3, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v13
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v15
-; GFX9-NEXT: v_or_b32_sdwa v3, v16, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v18, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v38, v4, v3, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v17
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19
-; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v22, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v48, v4, v3, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v21
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v23
-; GFX9-NEXT: v_or_b32_sdwa v3, v24, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v26, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v49, v4, v3, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v25
-; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v27
-; GFX9-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v4, v30, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v50, v4, v3, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v29
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63
-; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v51, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v61
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v59
-; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v52, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v47
-; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v53, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v45
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v43
-; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v54, v3, v0, s6
-; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v41
-; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v39
-; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_or_b32_sdwa v3, v14, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT: v_perm_b32 v55, v3, v0, s6
-; GFX9-NEXT: .LBB120_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[52:55], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[48:51], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off
-; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v64i8_to_v32bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: scratch_load_u16 v39, off, s32 offset:140
-; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:136
-; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:132
-; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:128
-; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:124
-; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:120
-; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:116
-; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:112
-; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:108
-; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:104
-; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:100
-; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:96
-; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:92
-; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:88
-; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:84
-; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:80
-; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:76
-; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:72
-; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:68
-; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:64
-; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:60
-; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:56
-; GFX11-NEXT: scratch_load_u16 v101, off, s32 offset:52
-; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:48
-; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:44
-; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:40
-; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:36
-; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:32
-; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:28
-; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:24
-; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:20
-; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:16
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:12
-; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:8
-; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:4
-; GFX11-NEXT: scratch_load_u16 v130, off, s32
-; GFX11-NEXT: v_mov_b32_e32 v31, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v32, v31
-; GFX11-NEXT: v_mov_b32_e32 v33, v31
-; GFX11-NEXT: v_mov_b32_e32 v34, v31
-; GFX11-NEXT: v_mov_b32_e32 v35, v31
-; GFX11-NEXT: v_mov_b32_e32 v36, v31
-; GFX11-NEXT: v_mov_b32_e32 v37, v31
-; GFX11-NEXT: v_mov_b32_e32 v38, v31
-; GFX11-NEXT: v_mov_b32_e32 v48, v31
-; GFX11-NEXT: v_mov_b32_e32 v49, v31
-; GFX11-NEXT: v_mov_b32_e32 v50, v31
-; GFX11-NEXT: v_mov_b32_e32 v51, v31
-; GFX11-NEXT: v_mov_b32_e32 v52, v31
-; GFX11-NEXT: v_mov_b32_e32 v53, v31
-; GFX11-NEXT: v_mov_b32_e32 v54, v31
-; GFX11-NEXT: v_mov_b32_e32 v55, v31
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB120_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v9
-; GFX11-NEXT: v_lshlrev_b16 v9, 8, v14
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_lshlrev_b16 v14, 8, v24
-; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v7
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v8
-; GFX11-NEXT: v_lshlrev_b16 v7, 8, v10
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v13
-; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v12
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v15
-; GFX11-NEXT: v_lshlrev_b16 v11, 8, v16
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT: v_or_b32_e32 v5, v6, v7
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_or_b32_e32 v3, v8, v9
-; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v17
-; GFX11-NEXT: v_lshlrev_b16 v8, 8, v18
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v19
-; GFX11-NEXT: v_lshlrev_b16 v10, 8, v20
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v21
-; GFX11-NEXT: v_lshlrev_b16 v12, 8, v22
-; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v23
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v25
-; GFX11-NEXT: v_lshlrev_b16 v16, 8, v26
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v34, v7, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v35, v9, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v36, v11, v10, 0x5040100
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v27
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v28
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v29
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v30
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v130
-; GFX11-NEXT: v_lshlrev_b16 v7, 8, v129
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v128
-; GFX11-NEXT: v_lshlrev_b16 v9, 8, v119
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v118
-; GFX11-NEXT: v_lshlrev_b16 v11, 8, v117
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
-; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v116
-; GFX11-NEXT: v_lshlrev_b16 v8, 8, v115
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v114
-; GFX11-NEXT: v_lshlrev_b16 v10, 8, v113
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v112
-; GFX11-NEXT: v_lshlrev_b16 v12, 8, v103
-; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v102
-; GFX11-NEXT: v_lshlrev_b16 v14, 8, v101
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v100
-; GFX11-NEXT: v_lshlrev_b16 v16, 8, v99
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT: v_perm_b32 v37, v3, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v38, v5, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v48, v7, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v49, v9, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v50, v11, v10, 0x5040100
-; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v98
-; GFX11-NEXT: v_lshlrev_b16 v3, 8, v97
-; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v96
-; GFX11-NEXT: v_lshlrev_b16 v5, 8, v87
-; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v86
-; GFX11-NEXT: v_lshlrev_b16 v7, 8, v85
-; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v84
-; GFX11-NEXT: v_lshlrev_b16 v9, 8, v83
-; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v82
-; GFX11-NEXT: v_lshlrev_b16 v11, 8, v81
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX11-NEXT: v_or_b32_e32 v3, v4, v5
-; GFX11-NEXT: v_or_b32_e32 v4, v6, v7
-; GFX11-NEXT: v_or_b32_e32 v5, v8, v9
-; GFX11-NEXT: v_or_b32_e32 v6, v10, v11
-; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v80
-; GFX11-NEXT: v_lshlrev_b16 v8, 8, v71
-; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v70
-; GFX11-NEXT: v_lshlrev_b16 v10, 8, v69
-; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v68
-; GFX11-NEXT: v_lshlrev_b16 v12, 8, v67
-; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v66
-; GFX11-NEXT: v_lshlrev_b16 v14, 8, v65
-; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v64
-; GFX11-NEXT: v_lshlrev_b16 v16, 8, v39
-; GFX11-NEXT: v_or_b32_e32 v7, v7, v8
-; GFX11-NEXT: v_or_b32_e32 v8, v9, v10
-; GFX11-NEXT: v_or_b32_e32 v9, v11, v12
-; GFX11-NEXT: v_or_b32_e32 v10, v13, v14
-; GFX11-NEXT: v_or_b32_e32 v11, v15, v16
-; GFX11-NEXT: v_perm_b32 v51, v3, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v52, v5, v4, 0x5040100
-; GFX11-NEXT: v_perm_b32 v53, v7, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v54, v9, v8, 0x5040100
-; GFX11-NEXT: v_perm_b32 v55, v11, v10, 0x5040100
-; GFX11-NEXT: .LBB120_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v[1:2], v[52:55], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[48:51], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
-  %cmp0 = icmp eq i32 %cond, 0
-  br i1 %cmp0, label %if, label %end
-
-if:
-  %cast = bitcast <64 x i8> %value to <32 x bfloat>
-  br label %end
-
-end:
-  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
-  store <32 x bfloat> %phi, ptr addrspace(1) %out
-  ret void
-}
-
-
-define void @v_bitcast_v32i16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x i16> %value) {
-; GCN-LABEL: v_bitcast_v32i16_to_v32bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_waitcnt expcnt(3)
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v29, 0
-; GCN-NEXT: v_mov_b32_e32 v46, 0
-; GCN-NEXT: v_mov_b32_e32 v58, 0
-; GCN-NEXT: v_mov_b32_e32 v47, 0
-; GCN-NEXT: v_mov_b32_e32 v59, 0
-; GCN-NEXT: v_mov_b32_e32 v56, 0
-; GCN-NEXT: v_mov_b32_e32 v60, 0
-; GCN-NEXT: v_mov_b32_e32 v57, 0
-; GCN-NEXT: v_mov_b32_e32 v61, 0
-; GCN-NEXT: v_mov_b32_e32 v54, 0
-; GCN-NEXT: v_mov_b32_e32 v42, 0
-; GCN-NEXT: v_mov_b32_e32 v55, 0
-; GCN-NEXT: v_mov_b32_e32 v43, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: v_mov_b32_e32 v44, 0
-; GCN-NEXT: v_mov_b32_e32 v41, 0
-; GCN-NEXT: v_mov_b32_e32 v45, 0
-; GCN-NEXT: v_mov_b32_e32 v38, 0
-; GCN-NEXT: v_mov_b32_e32 v50, 0
-; GCN-NEXT: v_mov_b32_e32 v39, 0
-; GCN-NEXT: v_mov_b32_e32 v51, 0
-; GCN-NEXT: v_mov_b32_e32 v48, 0
-; GCN-NEXT: v_mov_b32_e32 v52, 0
-; GCN-NEXT: v_mov_b32_e32 v49, 0
-; GCN-NEXT: v_mov_b32_e32 v53, 0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: v_mov_b32_e32 v35, 0
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: v_mov_b32_e32 v36, 0
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: v_mov_b32_e32 v37, 0
-; GCN-NEXT: v_mov_b32_e32 v34, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB121_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v17
-; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v18
-; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v19
-; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v20
-; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v21
-; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v24
-; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v25
-; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v26
-; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v27
-; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v28
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v63
-; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v62
-; GCN-NEXT: .LBB121_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(1)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v46
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v59
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v47
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v60
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v56
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v61
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v57
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v42
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v54
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v43
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v55
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v44
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v45
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v50
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v38
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v51
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v52
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v48
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v53
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v35
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v31
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v36
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; GCN-NEXT: v_alignbit_b32 v4, v4, v5, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v7, 16
-; GCN-NEXT: v_alignbit_b32 v6, v8, v9, 16
-; GCN-NEXT: v_alignbit_b32 v7, v10, v11, 16
-; GCN-NEXT: v_alignbit_b32 v8, v12, v13, 16
-; GCN-NEXT: v_alignbit_b32 v9, v14, v15, 16
-; GCN-NEXT: v_alignbit_b32 v10, v16, v17, 16
-; GCN-NEXT: v_alignbit_b32 v11, v18, v19, 16
-; GCN-NEXT: v_alignbit_b32 v12, v20, v21, 16
-; GCN-NEXT: v_alignbit_b32 v13, v22, v23, 16
-; GCN-NEXT: v_alignbit_b32 v14, v24, v25, 16
-; GCN-NEXT: v_alignbit_b32 v15, v26, v27, 16
-; GCN-NEXT: v_alignbit_b32 v16, v28, v30, 16
-; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16
-; GCN-NEXT: v_alignbit_b32 v18, v29, v33, 16
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32i16_to_v32bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v20, v19
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: v_mov_b32_e32 v22, v19
-; VI-NEXT: v_mov_b32_e32 v23, v19
-; VI-NEXT: v_mov_b32_e32 v24, v19
-; VI-NEXT: v_mov_b32_e32 v25, v19
-; VI-NEXT: v_mov_b32_e32 v26, v19
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v28, v19
-; VI-NEXT: v_mov_b32_e32 v29, v19
-; VI-NEXT: v_mov_b32_e32 v30, v19
-; VI-NEXT: v_mov_b32_e32 v31, v19
-; VI-NEXT: v_mov_b32_e32 v32, v19
-; VI-NEXT: v_mov_b32_e32 v33, v19
-; VI-NEXT: v_mov_b32_e32 v34, v19
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB121_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v34, v18
-; VI-NEXT: v_mov_b32_e32 v33, v17
-; VI-NEXT: v_mov_b32_e32 v32, v16
-; VI-NEXT: v_mov_b32_e32 v31, v15
-; VI-NEXT: v_mov_b32_e32 v30, v14
-; VI-NEXT: v_mov_b32_e32 v29, v13
-; VI-NEXT: v_mov_b32_e32 v28, v12
-; VI-NEXT: v_mov_b32_e32 v27, v11
-; VI-NEXT: v_mov_b32_e32 v26, v10
-; VI-NEXT: v_mov_b32_e32 v25, v9
-; VI-NEXT: v_mov_b32_e32 v24, v8
-; VI-NEXT: v_mov_b32_e32 v23, v7
-; VI-NEXT: v_mov_b32_e32 v22, v6
-; VI-NEXT: v_mov_b32_e32 v21, v5
-; VI-NEXT: v_mov_b32_e32 v20, v4
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: .LBB121_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32i16_to_v32bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v19
-; GFX9-NEXT: v_mov_b32_e32 v21, v19
-; GFX9-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-NEXT: v_mov_b32_e32 v23, v19
-; GFX9-NEXT: v_mov_b32_e32 v24, v19
-; GFX9-NEXT: v_mov_b32_e32 v25, v19
-; GFX9-NEXT: v_mov_b32_e32 v26, v19
-; GFX9-NEXT: v_mov_b32_e32 v27, v19
-; GFX9-NEXT: v_mov_b32_e32 v28, v19
-; GFX9-NEXT: v_mov_b32_e32 v29, v19
-; GFX9-NEXT: v_mov_b32_e32 v30, v19
-; GFX9-NEXT: v_mov_b32_e32 v31, v19
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
-; GFX9-NEXT: v_mov_b32_e32 v33, v19
-; GFX9-NEXT: v_mov_b32_e32 v34, v19
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB121_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v34, v18
-; GFX9-NEXT: v_mov_b32_e32 v33, v17
-; GFX9-NEXT: v_mov_b32_e32 v32, v16
-; GFX9-NEXT: v_mov_b32_e32 v31, v15
-; GFX9-NEXT: v_mov_b32_e32 v30, v14
-; GFX9-NEXT: v_mov_b32_e32 v29, v13
-; GFX9-NEXT: v_mov_b32_e32 v28, v12
-; GFX9-NEXT: v_mov_b32_e32 v27, v11
-; GFX9-NEXT: v_mov_b32_e32 v26, v10
-; GFX9-NEXT: v_mov_b32_e32 v25, v9
-; GFX9-NEXT: v_mov_b32_e32 v24, v8
-; GFX9-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-NEXT: v_mov_b32_e32 v22, v6
-; GFX9-NEXT: v_mov_b32_e32 v21, v5
-; GFX9-NEXT: v_mov_b32_e32 v20, v4
-; GFX9-NEXT: v_mov_b32_e32 v19, v3
-; GFX9-NEXT: .LBB121_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32i16_to_v32bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v19, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v20, v19
-; GFX11-NEXT: v_mov_b32_e32 v21, v19
-; GFX11-NEXT: v_mov_b32_e32 v22, v19
-; GFX11-NEXT: v_mov_b32_e32 v23, v19
-; GFX11-NEXT: v_mov_b32_e32 v24, v19
-; GFX11-NEXT: v_mov_b32_e32 v25, v19
-; GFX11-NEXT: v_mov_b32_e32 v26, v19
-; GFX11-NEXT: v_mov_b32_e32 v27, v19
-; GFX11-NEXT: v_mov_b32_e32 v28, v19
-; GFX11-NEXT: v_mov_b32_e32 v29, v19
-; GFX11-NEXT: v_mov_b32_e32 v30, v19
-; GFX11-NEXT: v_mov_b32_e32 v31, v19
-; GFX11-NEXT: v_mov_b32_e32 v32, v19
-; GFX11-NEXT: v_mov_b32_e32 v33, v19
-; GFX11-NEXT: v_mov_b32_e32 v34, v19
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB121_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
-; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
-; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
-; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
-; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
-; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
-; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
-; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
-; GFX11-NEXT: .LBB121_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
-  %cmp0 = icmp eq i32 %cond, 0
-  br i1 %cmp0, label %if, label %end
-
-if:
-  %cast = bitcast <32 x i16> %value to <32 x bfloat>
-  br label %end
-
-end:
-  %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
-  store <32 x bfloat> %phi, ptr addrspace(1) %out
-  ret void
-}
-
-
-define void @v_bitcast_v32f16_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <32 x half> %value) {
-; GCN-LABEL: v_bitcast_v32f16_to_v32bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v46, 0
-; GCN-NEXT: v_mov_b32_e32 v58, 0
-; GCN-NEXT: v_mov_b32_e32 v47, 0
-; GCN-NEXT: v_mov_b32_e32 v59, 0
-; GCN-NEXT: v_mov_b32_e32 v56, 0
-; GCN-NEXT: v_mov_b32_e32 v60, 0
-; GCN-NEXT: v_mov_b32_e32 v57, 0
-; GCN-NEXT: v_mov_b32_e32 v61, 0
-; GCN-NEXT: v_mov_b32_e32 v54, 0
-; GCN-NEXT: v_mov_b32_e32 v42, 0
-; GCN-NEXT: v_mov_b32_e32 v55, 0
-; GCN-NEXT: v_mov_b32_e32 v43, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: v_mov_b32_e32 v44, 0
-; GCN-NEXT: v_mov_b32_e32 v41, 0
-; GCN-NEXT: v_mov_b32_e32 v45, 0
-; GCN-NEXT: v_mov_b32_e32 v38, 0
-; GCN-NEXT: v_mov_b32_e32 v50, 0
-; GCN-NEXT: v_mov_b32_e32 v39, 0
-; GCN-NEXT: v_mov_b32_e32 v51, 0
-; GCN-NEXT: v_mov_b32_e32 v48, 0
-; GCN-NEXT: v_mov_b32_e32 v52, 0
-; GCN-NEXT: v_mov_b32_e32 v49, 0
-; GCN-NEXT: v_mov_b32_e32 v53, 0
-; GCN-NEXT: v_mov_b32_e32 v31, 0
-; GCN-NEXT: v_mov_b32_e32 v35, 0
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-NEXT: v_mov_b32_e32 v36, 0
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: v_mov_b32_e32 v37, 0
-; GCN-NEXT: v_mov_b32_e32 v34, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB122_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v24, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v26, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v28
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v33, v31
-; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT: v_cvt_f16_f32_e32 v34, v62
-; GCN-NEXT: v_cvt_f16_f32_e32 v62, v63
-; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v60, 16, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v61, 16, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v42, 16, v11
-; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v17
-; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v18
-; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v19
-; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v20
-; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v21
-; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v24
-; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v25
-; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v26
-; GCN-NEXT: v_lshlrev_b32_e32 v35, 16, v27
-; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v28
-; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v33
-; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v30
-; GCN-NEXT: v_lshlrev_b32_e32 v37, 16, v34
-; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v62
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v29
-; GCN-NEXT: .LBB122_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v58
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v46
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v59
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v47
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v60
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v56
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v61
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v57
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v42
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v54
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v43
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v55
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v44
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v45
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v50
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v38
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v51
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v52
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v48
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v53
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32
v27, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v31 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v36 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v34 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 -; GCN-NEXT: v_alignbit_b32 v11, v19, v20, 16 -; GCN-NEXT: v_alignbit_b32 v12, v21, v22, 16 -; GCN-NEXT: v_alignbit_b32 v13, v23, v24, 16 -; GCN-NEXT: v_alignbit_b32 v14, v25, v26, 16 -; GCN-NEXT: v_alignbit_b32 v15, v27, v28, 16 -; GCN-NEXT: v_alignbit_b32 v16, v29, v30, 16 -; GCN-NEXT: v_alignbit_b32 v17, v31, v32, 16 -; GCN-NEXT: v_alignbit_b32 v18, v0, v33, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: 
v_bitcast_v32f16_to_v32bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v20, v19 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v22, v19 -; VI-NEXT: v_mov_b32_e32 v23, v19 -; VI-NEXT: v_mov_b32_e32 v24, v19 -; VI-NEXT: v_mov_b32_e32 v25, v19 -; VI-NEXT: v_mov_b32_e32 v26, v19 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v28, v19 -; VI-NEXT: v_mov_b32_e32 v29, v19 -; VI-NEXT: v_mov_b32_e32 v30, v19 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v32, v19 -; VI-NEXT: v_mov_b32_e32 v33, v19 -; VI-NEXT: v_mov_b32_e32 v34, v19 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB122_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v34, v18 -; VI-NEXT: v_mov_b32_e32 v33, v17 -; VI-NEXT: v_mov_b32_e32 v32, v16 -; VI-NEXT: v_mov_b32_e32 v31, v15 -; VI-NEXT: v_mov_b32_e32 v30, v14 -; VI-NEXT: v_mov_b32_e32 v29, v13 -; VI-NEXT: v_mov_b32_e32 v28, v12 -; VI-NEXT: v_mov_b32_e32 v27, v11 -; VI-NEXT: v_mov_b32_e32 v26, v10 -; VI-NEXT: v_mov_b32_e32 v25, v9 -; VI-NEXT: v_mov_b32_e32 v24, v8 -; VI-NEXT: v_mov_b32_e32 v23, v7 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v21, v5 -; VI-NEXT: v_mov_b32_e32 v20, v4 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: .LBB122_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32f16_to_v32bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v20, v19 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-NEXT: v_mov_b32_e32 v23, v19 -; GFX9-NEXT: v_mov_b32_e32 v24, v19 -; GFX9-NEXT: v_mov_b32_e32 v25, v19 -; GFX9-NEXT: v_mov_b32_e32 v26, v19 -; GFX9-NEXT: v_mov_b32_e32 v27, v19 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v29, v19 -; GFX9-NEXT: v_mov_b32_e32 v30, v19 -; GFX9-NEXT: v_mov_b32_e32 v31, v19 -; GFX9-NEXT: v_mov_b32_e32 v32, v19 -; GFX9-NEXT: v_mov_b32_e32 v33, v19 -; GFX9-NEXT: v_mov_b32_e32 v34, v19 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB122_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v34, v18 -; GFX9-NEXT: v_mov_b32_e32 v33, v17 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-NEXT: v_mov_b32_e32 v30, v14 -; GFX9-NEXT: v_mov_b32_e32 v29, v13 -; GFX9-NEXT: v_mov_b32_e32 v28, v12 -; GFX9-NEXT: v_mov_b32_e32 v27, v11 -; GFX9-NEXT: v_mov_b32_e32 v26, v10 -; GFX9-NEXT: v_mov_b32_e32 v25, v9 -; GFX9-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-NEXT: v_mov_b32_e32 v22, v6 -; GFX9-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: .LBB122_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off 
offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32f16_to_v32bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v19, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v20, v19 -; GFX11-NEXT: v_mov_b32_e32 v21, v19 -; GFX11-NEXT: v_mov_b32_e32 v22, v19 -; GFX11-NEXT: v_mov_b32_e32 v23, v19 -; GFX11-NEXT: v_mov_b32_e32 v24, v19 -; GFX11-NEXT: v_mov_b32_e32 v25, v19 -; GFX11-NEXT: v_mov_b32_e32 v26, v19 -; GFX11-NEXT: v_mov_b32_e32 v27, v19 -; GFX11-NEXT: v_mov_b32_e32 v28, v19 -; GFX11-NEXT: v_mov_b32_e32 v29, v19 -; GFX11-NEXT: v_mov_b32_e32 v30, v19 -; GFX11-NEXT: v_mov_b32_e32 v31, v19 -; GFX11-NEXT: v_mov_b32_e32 v32, v19 -; GFX11-NEXT: v_mov_b32_e32 v33, v19 -; GFX11-NEXT: v_mov_b32_e32 v34, v19 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB122_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 -; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 -; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 -; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 -; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 -; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 -; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 -; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: .LBB122_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <32 x half> %value to <32 x bfloat> - br label %end - -end: - %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <32 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v16i32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x i32> %value) { -; GCN-LABEL: v_bitcast_v16i32_to_v32bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NEXT: v_mov_b32_e32 v55, 0 -; GCN-NEXT: v_mov_b32_e32 v52, 0 -; GCN-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NEXT: v_mov_b32_e32 v50, 0 -; GCN-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NEXT: v_mov_b32_e32 v38, 0 -; GCN-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NEXT: v_mov_b32_e32 v36, 0 -; GCN-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NEXT: v_mov_b32_e32 v34, 0 -; GCN-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-NEXT: 
v_mov_b32_e32 v33, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB123_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3 -; GCN-NEXT: .LBB123_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: 
v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 -; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16 -; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16 -; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16 -; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16 -; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 -; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v16i32_to_v32bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v20, v19 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v22, v19 -; VI-NEXT: v_mov_b32_e32 v23, v19 -; VI-NEXT: v_mov_b32_e32 v24, v19 -; VI-NEXT: v_mov_b32_e32 v25, v19 -; VI-NEXT: v_mov_b32_e32 v26, v19 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v28, v19 -; VI-NEXT: v_mov_b32_e32 v29, v19 -; VI-NEXT: v_mov_b32_e32 v30, v19 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v32, v19 -; VI-NEXT: v_mov_b32_e32 v33, v19 -; VI-NEXT: v_mov_b32_e32 v34, v19 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB123_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v34, v18 -; VI-NEXT: v_mov_b32_e32 v33, v17 -; VI-NEXT: v_mov_b32_e32 v32, v16 -; VI-NEXT: v_mov_b32_e32 v31, v15 -; VI-NEXT: v_mov_b32_e32 v30, v14 -; VI-NEXT: v_mov_b32_e32 v29, v13 -; VI-NEXT: v_mov_b32_e32 v28, v12 -; VI-NEXT: v_mov_b32_e32 v27, v11 -; VI-NEXT: v_mov_b32_e32 v26, v10 -; VI-NEXT: v_mov_b32_e32 v25, v9 -; VI-NEXT: v_mov_b32_e32 v24, v8 -; VI-NEXT: v_mov_b32_e32 v23, v7 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v21, v5 -; VI-NEXT: v_mov_b32_e32 v20, v4 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: .LBB123_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v16i32_to_v32bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v20, v19 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-NEXT: v_mov_b32_e32 v23, v19 -; GFX9-NEXT: v_mov_b32_e32 v24, v19 -; GFX9-NEXT: v_mov_b32_e32 v25, v19 -; GFX9-NEXT: v_mov_b32_e32 v26, v19 -; GFX9-NEXT: v_mov_b32_e32 v27, v19 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v29, v19 -; GFX9-NEXT: v_mov_b32_e32 v30, v19 -; GFX9-NEXT: v_mov_b32_e32 v31, v19 -; GFX9-NEXT: v_mov_b32_e32 v32, v19 -; GFX9-NEXT: v_mov_b32_e32 v33, v19 -; GFX9-NEXT: v_mov_b32_e32 v34, v19 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB123_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v34, v18 -; GFX9-NEXT: v_mov_b32_e32 v33, v17 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-NEXT: v_mov_b32_e32 v30, v14 -; GFX9-NEXT: v_mov_b32_e32 v29, v13 -; GFX9-NEXT: v_mov_b32_e32 v28, v12 -; GFX9-NEXT: v_mov_b32_e32 v27, v11 -; GFX9-NEXT: v_mov_b32_e32 v26, v10 -; GFX9-NEXT: v_mov_b32_e32 v25, v9 -; GFX9-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-NEXT: v_mov_b32_e32 v22, v6 -; GFX9-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: .LBB123_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v16i32_to_v32bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v19, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v20, v19 -; GFX11-NEXT: v_mov_b32_e32 v21, v19 -; GFX11-NEXT: v_mov_b32_e32 v22, v19 -; GFX11-NEXT: v_mov_b32_e32 v23, v19 -; GFX11-NEXT: v_mov_b32_e32 v24, v19 -; GFX11-NEXT: v_mov_b32_e32 v25, v19 -; GFX11-NEXT: v_mov_b32_e32 v26, v19 -; GFX11-NEXT: v_mov_b32_e32 v27, v19 -; GFX11-NEXT: v_mov_b32_e32 v28, v19 -; GFX11-NEXT: v_mov_b32_e32 v29, v19 -; GFX11-NEXT: v_mov_b32_e32 v30, v19 -; GFX11-NEXT: v_mov_b32_e32 v31, v19 -; GFX11-NEXT: v_mov_b32_e32 v32, v19 -; GFX11-NEXT: v_mov_b32_e32 v33, v19 -; GFX11-NEXT: v_mov_b32_e32 v34, v19 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB123_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 -; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 -; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 -; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 -; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 -; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 -; GFX11-NEXT: v_dual_mov_b32 v22, v6 
:: v_dual_mov_b32 v21, v5 -; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: .LBB123_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <16 x i32> %value to <32 x bfloat> - br label %end - -end: - %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <32 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v16f32_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <16 x float> %value) { -; GCN-LABEL: v_bitcast_v16f32_to_v32bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NEXT: v_mov_b32_e32 v55, 0 -; GCN-NEXT: v_mov_b32_e32 v52, 0 -; GCN-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NEXT: v_mov_b32_e32 v50, 0 -; GCN-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NEXT: v_mov_b32_e32 v38, 0 -; GCN-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NEXT: v_mov_b32_e32 v36, 0 -; GCN-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NEXT: v_mov_b32_e32 v34, 0 -; GCN-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB124_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 -; 
GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3 -; GCN-NEXT: .LBB124_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 -; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16 -; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16 -; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16 -; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16 -; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 -; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[15:18], 
v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v16f32_to_v32bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v20, v19 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v22, v19 -; VI-NEXT: v_mov_b32_e32 v23, v19 -; VI-NEXT: v_mov_b32_e32 v24, v19 -; VI-NEXT: v_mov_b32_e32 v25, v19 -; VI-NEXT: v_mov_b32_e32 v26, v19 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v28, v19 -; VI-NEXT: v_mov_b32_e32 v29, v19 -; VI-NEXT: v_mov_b32_e32 v30, v19 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v32, v19 -; VI-NEXT: v_mov_b32_e32 v33, v19 -; VI-NEXT: v_mov_b32_e32 v34, v19 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB124_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v34, v18 -; VI-NEXT: v_mov_b32_e32 v33, v17 -; VI-NEXT: v_mov_b32_e32 v32, v16 -; VI-NEXT: v_mov_b32_e32 v31, v15 -; VI-NEXT: v_mov_b32_e32 v30, v14 -; VI-NEXT: v_mov_b32_e32 v29, v13 -; VI-NEXT: v_mov_b32_e32 v28, v12 -; VI-NEXT: v_mov_b32_e32 v27, v11 -; VI-NEXT: v_mov_b32_e32 v26, v10 -; VI-NEXT: v_mov_b32_e32 v25, v9 -; VI-NEXT: v_mov_b32_e32 v24, v8 -; VI-NEXT: v_mov_b32_e32 v23, v7 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v21, v5 -; VI-NEXT: v_mov_b32_e32 v20, v4 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: .LBB124_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v16f32_to_v32bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v20, v19 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-NEXT: v_mov_b32_e32 v23, v19 -; GFX9-NEXT: v_mov_b32_e32 v24, v19 -; GFX9-NEXT: v_mov_b32_e32 v25, v19 -; GFX9-NEXT: v_mov_b32_e32 v26, v19 -; GFX9-NEXT: v_mov_b32_e32 v27, v19 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v29, v19 -; GFX9-NEXT: v_mov_b32_e32 v30, v19 -; GFX9-NEXT: v_mov_b32_e32 v31, v19 -; GFX9-NEXT: v_mov_b32_e32 v32, v19 -; GFX9-NEXT: v_mov_b32_e32 v33, v19 -; GFX9-NEXT: v_mov_b32_e32 v34, v19 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB124_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v34, v18 -; GFX9-NEXT: v_mov_b32_e32 v33, v17 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-NEXT: v_mov_b32_e32 v30, v14 -; GFX9-NEXT: v_mov_b32_e32 v29, v13 -; GFX9-NEXT: v_mov_b32_e32 v28, v12 -; GFX9-NEXT: v_mov_b32_e32 v27, v11 -; GFX9-NEXT: v_mov_b32_e32 v26, v10 -; GFX9-NEXT: v_mov_b32_e32 v25, v9 -; GFX9-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-NEXT: v_mov_b32_e32 
v23, v7 -; GFX9-NEXT: v_mov_b32_e32 v22, v6 -; GFX9-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: .LBB124_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v16f32_to_v32bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v19, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v20, v19 -; GFX11-NEXT: v_mov_b32_e32 v21, v19 -; GFX11-NEXT: v_mov_b32_e32 v22, v19 -; GFX11-NEXT: v_mov_b32_e32 v23, v19 -; GFX11-NEXT: v_mov_b32_e32 v24, v19 -; GFX11-NEXT: v_mov_b32_e32 v25, v19 -; GFX11-NEXT: v_mov_b32_e32 v26, v19 -; GFX11-NEXT: v_mov_b32_e32 v27, v19 -; GFX11-NEXT: v_mov_b32_e32 v28, v19 -; GFX11-NEXT: v_mov_b32_e32 v29, v19 -; GFX11-NEXT: v_mov_b32_e32 v30, v19 -; GFX11-NEXT: v_mov_b32_e32 v31, v19 -; GFX11-NEXT: v_mov_b32_e32 v32, v19 -; GFX11-NEXT: v_mov_b32_e32 v33, v19 -; GFX11-NEXT: v_mov_b32_e32 v34, v19 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB124_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 -; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 -; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 -; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 -; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 -; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 -; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 -; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: .LBB124_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <16 x float> %value to <32 x bfloat> - br label %end - -end: - %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <32 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v8f64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x double> %value) { -; GCN-LABEL: v_bitcast_v8f64_to_v32bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NEXT: v_mov_b32_e32 v55, 0 -; GCN-NEXT: v_mov_b32_e32 v52, 0 -; GCN-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NEXT: v_mov_b32_e32 v50, 0 -; GCN-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NEXT: v_mov_b32_e32 v48, 0 -; 
GCN-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NEXT: v_mov_b32_e32 v38, 0 -; GCN-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NEXT: v_mov_b32_e32 v36, 0 -; GCN-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NEXT: v_mov_b32_e32 v34, 0 -; GCN-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB125_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10 -; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9 -; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8 -; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6 -; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5 -; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4 -; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3 -; GCN-NEXT: .LBB125_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34 -; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: 
v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16 -; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16 -; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16 -; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16 -; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16 -; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16 -; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16 -; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16 -; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16 -; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16 -; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16 -; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16 -; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16 -; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16 -; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16 -; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v8f64_to_v32bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v19, 0 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: v_mov_b32_e32 v20, v19 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v22, v19 -; VI-NEXT: v_mov_b32_e32 v23, v19 -; VI-NEXT: v_mov_b32_e32 v24, v19 -; VI-NEXT: v_mov_b32_e32 v25, v19 -; VI-NEXT: v_mov_b32_e32 v26, v19 -; VI-NEXT: v_mov_b32_e32 v27, v19 -; VI-NEXT: v_mov_b32_e32 v28, v19 -; VI-NEXT: v_mov_b32_e32 v29, v19 -; VI-NEXT: v_mov_b32_e32 v30, v19 -; VI-NEXT: v_mov_b32_e32 v31, v19 -; VI-NEXT: v_mov_b32_e32 v32, v19 -; VI-NEXT: v_mov_b32_e32 v33, v19 -; VI-NEXT: v_mov_b32_e32 v34, v19 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB125_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v34, v18 -; VI-NEXT: v_mov_b32_e32 v33, v17 -; VI-NEXT: v_mov_b32_e32 v32, v16 -; VI-NEXT: v_mov_b32_e32 v31, v15 -; VI-NEXT: v_mov_b32_e32 v30, v14 -; VI-NEXT: v_mov_b32_e32 v29, v13 -; VI-NEXT: v_mov_b32_e32 v28, v12 -; VI-NEXT: v_mov_b32_e32 v27, v11 -; VI-NEXT: v_mov_b32_e32 v26, v10 -; VI-NEXT: v_mov_b32_e32 v25, v9 -; VI-NEXT: v_mov_b32_e32 v24, v8 -; VI-NEXT: v_mov_b32_e32 v23, v7 -; VI-NEXT: v_mov_b32_e32 v22, v6 -; VI-NEXT: v_mov_b32_e32 v21, v5 -; VI-NEXT: v_mov_b32_e32 v20, v4 -; VI-NEXT: 
v_mov_b32_e32 v19, v3 -; VI-NEXT: .LBB125_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v8f64_to_v32bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v19, 0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v20, v19 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v22, v19 -; GFX9-NEXT: v_mov_b32_e32 v23, v19 -; GFX9-NEXT: v_mov_b32_e32 v24, v19 -; GFX9-NEXT: v_mov_b32_e32 v25, v19 -; GFX9-NEXT: v_mov_b32_e32 v26, v19 -; GFX9-NEXT: v_mov_b32_e32 v27, v19 -; GFX9-NEXT: v_mov_b32_e32 v28, v19 -; GFX9-NEXT: v_mov_b32_e32 v29, v19 -; GFX9-NEXT: v_mov_b32_e32 v30, v19 -; GFX9-NEXT: v_mov_b32_e32 v31, v19 -; GFX9-NEXT: v_mov_b32_e32 v32, v19 -; GFX9-NEXT: v_mov_b32_e32 v33, v19 -; GFX9-NEXT: v_mov_b32_e32 v34, v19 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB125_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v34, v18 -; GFX9-NEXT: v_mov_b32_e32 v33, v17 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v31, v15 -; GFX9-NEXT: v_mov_b32_e32 v30, v14 -; GFX9-NEXT: v_mov_b32_e32 v29, v13 -; GFX9-NEXT: v_mov_b32_e32 v28, v12 -; GFX9-NEXT: v_mov_b32_e32 v27, v11 -; GFX9-NEXT: v_mov_b32_e32 v26, v10 -; GFX9-NEXT: v_mov_b32_e32 v25, v9 -; GFX9-NEXT: v_mov_b32_e32 v24, v8 -; GFX9-NEXT: v_mov_b32_e32 v23, v7 -; GFX9-NEXT: v_mov_b32_e32 v22, v6 -; GFX9-NEXT: v_mov_b32_e32 v21, v5 -; GFX9-NEXT: v_mov_b32_e32 v20, v4 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: .LBB125_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v8f64_to_v32bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v19, 0 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v20, v19 -; GFX11-NEXT: v_mov_b32_e32 v21, v19 -; GFX11-NEXT: v_mov_b32_e32 v22, v19 -; GFX11-NEXT: v_mov_b32_e32 v23, v19 -; GFX11-NEXT: v_mov_b32_e32 v24, v19 -; GFX11-NEXT: v_mov_b32_e32 v25, v19 -; GFX11-NEXT: v_mov_b32_e32 v26, v19 -; GFX11-NEXT: v_mov_b32_e32 v27, v19 -; GFX11-NEXT: v_mov_b32_e32 v28, v19 -; GFX11-NEXT: v_mov_b32_e32 v29, v19 -; GFX11-NEXT: v_mov_b32_e32 v30, v19 -; GFX11-NEXT: v_mov_b32_e32 v31, v19 -; GFX11-NEXT: v_mov_b32_e32 v32, v19 -; GFX11-NEXT: v_mov_b32_e32 v33, v19 -; GFX11-NEXT: v_mov_b32_e32 v34, v19 -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB125_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17 -; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15 -; GFX11-NEXT: 
v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13 -; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11 -; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9 -; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7 -; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5 -; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3 -; GFX11-NEXT: .LBB125_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <8 x double> %value to <32 x bfloat> - br label %end - -end: - %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <32 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v8i64_to_v32bf16(i32 %cond, ptr addrspace(1) %out, <8 x i64> %value) { -; GCN-LABEL: v_bitcast_v8i64_to_v32bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_waitcnt expcnt(1) -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NEXT: v_mov_b32_e32 v55, 0 -; GCN-NEXT: v_mov_b32_e32 v52, 0 -; GCN-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NEXT: v_mov_b32_e32 v50, 0 -; GCN-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NEXT: v_mov_b32_e32 v38, 0 -; GCN-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NEXT: v_mov_b32_e32 v36, 0 -; GCN-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NEXT: v_mov_b32_e32 v34, 0 -; GCN-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: v_mov_b32_e32 v28, 0 -; GCN-NEXT: v_mov_b32_e32 v29, 0 -; GCN-NEXT: v_mov_b32_e32 v26, 0 -; GCN-NEXT: v_mov_b32_e32 v27, 0 -; GCN-NEXT: v_mov_b32_e32 v24, 0 -; GCN-NEXT: v_mov_b32_e32 v25, 0 -; GCN-NEXT: v_mov_b32_e32 v22, 0 -; GCN-NEXT: v_mov_b32_e32 v23, 0 -; GCN-NEXT: v_mov_b32_e32 v20, 0 -; GCN-NEXT: v_mov_b32_e32 v21, 0 -; GCN-NEXT: v_mov_b32_e32 v19, 0 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB126_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v18 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v17 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v13 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v12 -; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v32, 16, v11 -; GCN-NEXT: v_and_b32_e32 v35, 0xffff0000, 
v10
-; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v10
-; GCN-NEXT: v_and_b32_e32 v37, 0xffff0000, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v9
-; GCN-NEXT: v_and_b32_e32 v39, 0xffff0000, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v38, 16, v8
-; GCN-NEXT: v_and_b32_e32 v49, 0xffff0000, v7
-; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7
-; GCN-NEXT: v_and_b32_e32 v51, 0xffff0000, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v6
-; GCN-NEXT: v_and_b32_e32 v53, 0xffff0000, v5
-; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v5
-; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v4
-; GCN-NEXT: v_and_b32_e32 v41, 0xffff0000, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v40, 16, v3
-; GCN-NEXT: .LBB126_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v41
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v40
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v55
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v54
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v53
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v52
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v51
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v50
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v49
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v48
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v39
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v38
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v37
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v36
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v35
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v34
-; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v33
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_alignbit_b32 v3, v3, v4, 16
-; GCN-NEXT: v_alignbit_b32 v4, v5, v6, 16
-; GCN-NEXT: v_alignbit_b32 v5, v7, v8, 16
-; GCN-NEXT: v_alignbit_b32 v6, v9, v10, 16
-; GCN-NEXT: v_alignbit_b32 v7, v11, v12, 16
-; GCN-NEXT: v_alignbit_b32 v8, v13, v14, 16
-; GCN-NEXT: v_alignbit_b32 v9, v15, v16, 16
-; GCN-NEXT: v_alignbit_b32 v10, v17, v18, 16
-; GCN-NEXT: v_alignbit_b32 v11, v33, v32, 16
-; GCN-NEXT: v_alignbit_b32 v12, v31, v30, 16
-; GCN-NEXT: v_alignbit_b32 v13, v29, v28, 16
-; GCN-NEXT: v_alignbit_b32 v14, v27, v26, 16
-; GCN-NEXT: v_alignbit_b32 v15, v25, v24, 16
-; GCN-NEXT: v_alignbit_b32 v16, v23, v22, 16
-; GCN-NEXT: v_alignbit_b32 v17, v21, v20, 16
-; GCN-NEXT: v_alignbit_b32 v18, v0, v19, 16
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: buffer_store_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8i64_to_v32bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v19, 0
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: v_mov_b32_e32 v20, v19
-; VI-NEXT: v_mov_b32_e32 v21, v19
-; VI-NEXT: v_mov_b32_e32 v22, v19
-; VI-NEXT: v_mov_b32_e32 v23, v19
-; VI-NEXT: v_mov_b32_e32 v24, v19
-; VI-NEXT: v_mov_b32_e32 v25, v19
-; VI-NEXT: v_mov_b32_e32 v26, v19
-; VI-NEXT: v_mov_b32_e32 v27, v19
-; VI-NEXT: v_mov_b32_e32 v28, v19
-; VI-NEXT: v_mov_b32_e32 v29, v19
-; VI-NEXT: v_mov_b32_e32 v30, v19
-; VI-NEXT: v_mov_b32_e32 v31, v19
-; VI-NEXT: v_mov_b32_e32 v32, v19
-; VI-NEXT: v_mov_b32_e32 v33, v19
-; VI-NEXT: v_mov_b32_e32 v34, v19
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB126_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v34, v18
-; VI-NEXT: v_mov_b32_e32 v33, v17
-; VI-NEXT: v_mov_b32_e32 v32, v16
-; VI-NEXT: v_mov_b32_e32 v31, v15
-; VI-NEXT: v_mov_b32_e32 v30, v14
-; VI-NEXT: v_mov_b32_e32 v29, v13
-; VI-NEXT: v_mov_b32_e32 v28, v12
-; VI-NEXT: v_mov_b32_e32 v27, v11
-; VI-NEXT: v_mov_b32_e32 v26, v10
-; VI-NEXT: v_mov_b32_e32 v25, v9
-; VI-NEXT: v_mov_b32_e32 v24, v8
-; VI-NEXT: v_mov_b32_e32 v23, v7
-; VI-NEXT: v_mov_b32_e32 v22, v6
-; VI-NEXT: v_mov_b32_e32 v21, v5
-; VI-NEXT: v_mov_b32_e32 v20, v4
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: .LBB126_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[31:34]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[27:30]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[23:26]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[19:22]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8i64_to_v32bf16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v19, 0
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v20, v19
-; GFX9-NEXT: v_mov_b32_e32 v21, v19
-; GFX9-NEXT: v_mov_b32_e32 v22, v19
-; GFX9-NEXT: v_mov_b32_e32 v23, v19
-; GFX9-NEXT: v_mov_b32_e32 v24, v19
-; GFX9-NEXT: v_mov_b32_e32 v25, v19
-; GFX9-NEXT: v_mov_b32_e32 v26, v19
-; GFX9-NEXT: v_mov_b32_e32 v27, v19
-; GFX9-NEXT: v_mov_b32_e32 v28, v19
-; GFX9-NEXT: v_mov_b32_e32 v29, v19
-; GFX9-NEXT: v_mov_b32_e32 v30, v19
-; GFX9-NEXT: v_mov_b32_e32 v31, v19
-; GFX9-NEXT: v_mov_b32_e32 v32, v19
-; GFX9-NEXT: v_mov_b32_e32 v33, v19
-; GFX9-NEXT: v_mov_b32_e32 v34, v19
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB126_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v34, v18
-; GFX9-NEXT: v_mov_b32_e32 v33, v17
-; GFX9-NEXT: v_mov_b32_e32 v32, v16
-; GFX9-NEXT: v_mov_b32_e32 v31, v15
-; GFX9-NEXT: v_mov_b32_e32 v30, v14
-; GFX9-NEXT: v_mov_b32_e32 v29, v13
-; GFX9-NEXT: v_mov_b32_e32 v28, v12
-; GFX9-NEXT: v_mov_b32_e32 v27, v11
-; GFX9-NEXT: v_mov_b32_e32 v26, v10
-; GFX9-NEXT: v_mov_b32_e32 v25, v9
-; GFX9-NEXT: v_mov_b32_e32 v24, v8
-; GFX9-NEXT: v_mov_b32_e32 v23, v7
-; GFX9-NEXT: v_mov_b32_e32 v22, v6
-; GFX9-NEXT: v_mov_b32_e32 v21, v5
-; GFX9-NEXT: v_mov_b32_e32 v20, v4
-; GFX9-NEXT: v_mov_b32_e32 v19, v3
-; GFX9-NEXT: .LBB126_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8i64_to_v32bf16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v19, 0
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v20, v19
-; GFX11-NEXT: v_mov_b32_e32 v21, v19
-; GFX11-NEXT: v_mov_b32_e32 v22, v19
-; GFX11-NEXT: v_mov_b32_e32 v23, v19
-; GFX11-NEXT: v_mov_b32_e32 v24, v19
-; GFX11-NEXT: v_mov_b32_e32 v25, v19
-; GFX11-NEXT: v_mov_b32_e32 v26, v19
-; GFX11-NEXT: v_mov_b32_e32 v27, v19
-; GFX11-NEXT: v_mov_b32_e32 v28, v19
-; GFX11-NEXT: v_mov_b32_e32 v29, v19
-; GFX11-NEXT: v_mov_b32_e32 v30, v19
-; GFX11-NEXT: v_mov_b32_e32 v31, v19
-; GFX11-NEXT: v_mov_b32_e32 v32, v19
-; GFX11-NEXT: v_mov_b32_e32 v33, v19
-; GFX11-NEXT: v_mov_b32_e32 v34, v19
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB126_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v34, v18 :: v_dual_mov_b32 v33, v17
-; GFX11-NEXT: v_dual_mov_b32 v32, v16 :: v_dual_mov_b32 v31, v15
-; GFX11-NEXT: v_dual_mov_b32 v30, v14 :: v_dual_mov_b32 v29, v13
-; GFX11-NEXT: v_dual_mov_b32 v28, v12 :: v_dual_mov_b32 v27, v11
-; GFX11-NEXT: v_dual_mov_b32 v26, v10 :: v_dual_mov_b32 v25, v9
-; GFX11-NEXT: v_dual_mov_b32 v24, v8 :: v_dual_mov_b32 v23, v7
-; GFX11-NEXT: v_dual_mov_b32 v22, v6 :: v_dual_mov_b32 v21, v5
-; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v19, v3
-; GFX11-NEXT: .LBB126_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[27:30], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[23:26], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[19:22], off
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <8 x i64> %value to <32 x bfloat>
- br label %end
-
-end:
- %phi = phi <32 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <32 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-define <32 x half> @v_bitcast_v8i64_to_v32f16(<8 x i64> %a, i32 %b) {
-; GCN-LABEL: v_bitcast_v8i64_to_v32f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v34, v15
-; GCN-NEXT: v_mov_b32_e32 v33, v14
-; GCN-NEXT: v_mov_b32_e32 v36, v13
-; GCN-NEXT: v_mov_b32_e32 v35, v12
-; GCN-NEXT: v_mov_b32_e32 v38, v11
-; GCN-NEXT: v_mov_b32_e32 v37, v10
-; GCN-NEXT: v_mov_b32_e32 v48, v9
-; GCN-NEXT: v_mov_b32_e32 v39, v8
-; GCN-NEXT: v_mov_b32_e32 v50, v7
-; GCN-NEXT: v_mov_b32_e32 v49, v6
-; GCN-NEXT: v_mov_b32_e32 v52, v5
-; GCN-NEXT: v_mov_b32_e32 v51, v4
-; GCN-NEXT: v_mov_b32_e32 v54, v3
-; GCN-NEXT: v_mov_b32_e32 v53, v2
-; GCN-NEXT: v_mov_b32_e32 v55, v1
-; GCN-NEXT: v_mov_b32_e32 v32, v0
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: ; implicit-def: $vgpr1
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr6
-; GCN-NEXT: ; implicit-def: $vgpr7
-; GCN-NEXT: ; implicit-def: $vgpr8
-; GCN-NEXT: ; implicit-def: $vgpr9
-; GCN-NEXT: ; implicit-def: $vgpr10
-; GCN-NEXT: ; implicit-def: $vgpr11
-; GCN-NEXT: ; implicit-def: $vgpr12
-; GCN-NEXT: ; implicit-def: $vgpr13
-; GCN-NEXT: ; implicit-def: $vgpr14
-; GCN-NEXT: ; implicit-def: $vgpr15
-; GCN-NEXT: ; implicit-def: $vgpr16
-; GCN-NEXT: ; implicit-def: $vgpr17
-; GCN-NEXT: ; implicit-def: $vgpr18
-; GCN-NEXT: ; implicit-def: $vgpr19
-; GCN-NEXT: ; implicit-def: $vgpr20
-; GCN-NEXT: ; implicit-def: $vgpr21
-; GCN-NEXT: ; implicit-def: $vgpr22
-; GCN-NEXT: ; implicit-def: $vgpr23
-; GCN-NEXT: ; implicit-def: $vgpr24
-; GCN-NEXT: ; implicit-def: $vgpr25
-; GCN-NEXT: ; implicit-def: $vgpr26
-; GCN-NEXT: ; implicit-def: $vgpr27
-; GCN-NEXT: ; implicit-def: $vgpr28
-; GCN-NEXT: ; implicit-def: $vgpr29
-; GCN-NEXT: ; implicit-def: $vgpr30
-; GCN-NEXT: ; implicit-def: $vgpr31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB127_2
-; GCN-NEXT: ; %bb.1: ; %cmp.false
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v34
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v36
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v35
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v38
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v37
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v48
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v39
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v50
-; GCN-NEXT: s_waitcnt expcnt(6)
-; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v49
-; GCN-NEXT: s_waitcnt expcnt(5)
-; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v52
-; GCN-NEXT: s_waitcnt expcnt(4)
-; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v51
-; GCN-NEXT: s_waitcnt expcnt(3)
-; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v54
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v53
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v55
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v34
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v33
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v36
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v35
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v38
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v37
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v48
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v39
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v50
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v49
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v52
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v51
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v54
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v53
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v55
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v40
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v41
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v42
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v43
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v44
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v45
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v46
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v32
-; GCN-NEXT: ; implicit-def: $vgpr32
-; GCN-NEXT: ; implicit-def: $vgpr55
-; GCN-NEXT: ; implicit-def: $vgpr53
-; GCN-NEXT: ; implicit-def: $vgpr54
-; GCN-NEXT: ; implicit-def: $vgpr51
-; GCN-NEXT: ; implicit-def: $vgpr52
-; GCN-NEXT: ; implicit-def: $vgpr49
-; GCN-NEXT: ; implicit-def: $vgpr50
-; GCN-NEXT: ; implicit-def: $vgpr39
-; GCN-NEXT: ; implicit-def: $vgpr48
-; GCN-NEXT: ; implicit-def: $vgpr37
-; GCN-NEXT: ; implicit-def: $vgpr38
-; GCN-NEXT: ; implicit-def: $vgpr35
-; GCN-NEXT: ; implicit-def: $vgpr36
-; GCN-NEXT: ; implicit-def: $vgpr33
-; GCN-NEXT: ; implicit-def: $vgpr34
-; GCN-NEXT: .LBB127_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB127_4
-; GCN-NEXT: ; %bb.3: ; %cmp.true
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v32
-; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v55, vcc
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v53
-; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v54, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v51
-; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v52, vcc
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v49
-; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v50, vcc
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v39
-; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v48, vcc
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v37
-; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v38, vcc
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v35
-; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v36, vcc
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v33
-; GCN-NEXT: v_addc_u32_e32 v15, vcc, 0, v34, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v39
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v38
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v37
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v36
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v35
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v34
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v32
-; GCN-NEXT: .LBB127_4: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8i64_to_v32f16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: s_cbranch_execz .LBB127_2
-; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
-; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
-; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
-; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
-; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
-; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
-; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: .LBB127_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8i64_to_v32f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB127_2
-; GFX9-NEXT: ; %bb.1: ; %cmp.true
-; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
-; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: .LBB127_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8i64_to_v32f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB127_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: .LBB127_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = add <8 x i64> %a, splat (i64 3)
- %a2 = bitcast <8 x i64> %a1 to <32 x half>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x i64> %a to <32 x half>
- br label %end
-end:
- %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x half> %phi
-}
-
-
-define <32 x i16> @v_bitcast_v8i64_to_v32i16(<8 x i64> %a, i32 %b) {
-; GCN-LABEL: v_bitcast_v8i64_to_v32i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v30, v15
-; GCN-NEXT: v_mov_b32_e32 v28, v14
-; GCN-NEXT: v_mov_b32_e32 v26, v13
-; GCN-NEXT: v_mov_b32_e32 v24, v12
-; GCN-NEXT: v_mov_b32_e32 v22, v11
-; GCN-NEXT: v_mov_b32_e32 v20, v10
-; GCN-NEXT: v_mov_b32_e32 v18, v9
-; GCN-NEXT: v_mov_b32_e32 v32, v8
-; GCN-NEXT: v_mov_b32_e32 v14, v7
-; GCN-NEXT: v_mov_b32_e32 v12, v6
-; GCN-NEXT: v_mov_b32_e32 v10, v5
-; GCN-NEXT: v_mov_b32_e32 v8, v4
-; GCN-NEXT: v_mov_b32_e32 v6, v3
-; GCN-NEXT: v_mov_b32_e32 v4, v2
-; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GCN-NEXT: ; implicit-def: $vgpr1
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr7
-; GCN-NEXT: ; implicit-def: $vgpr9
-; GCN-NEXT: ; implicit-def: $vgpr11
-; GCN-NEXT: ; implicit-def: $vgpr13
-; GCN-NEXT: ; implicit-def: $vgpr15
-; GCN-NEXT: ; implicit-def: $vgpr17
-; GCN-NEXT: ; implicit-def: $vgpr19
-; GCN-NEXT: ; implicit-def: $vgpr21
-; GCN-NEXT: ; implicit-def: $vgpr23
-; GCN-NEXT: ; implicit-def: $vgpr25
-; GCN-NEXT: ; implicit-def: $vgpr27
-; GCN-NEXT: ; implicit-def: $vgpr29
-; GCN-NEXT: ; implicit-def: $vgpr31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB128_2
-; GCN-NEXT: ; %bb.1: ; %cmp.false
-; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
-; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT: .LBB128_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB128_4
-; GCN-NEXT: ; %bb.3: ; %cmp.true
-; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
-; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
-; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v8
-; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v12
-; GCN-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v32
-; GCN-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v20
-; GCN-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v24
-; GCN-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v28
-; GCN-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc
-; GCN-NEXT: v_alignbit_b32 v29, v30, v28, 16
-; GCN-NEXT: v_alignbit_b32 v25, v26, v24, 16
-; GCN-NEXT: v_alignbit_b32 v21, v22, v20, 16
-; GCN-NEXT: v_alignbit_b32 v17, v18, v32, 16
-; GCN-NEXT: v_alignbit_b32 v13, v14, v12, 16
-; GCN-NEXT: v_alignbit_b32 v9, v10, v8, 16
-; GCN-NEXT: v_alignbit_b32 v5, v6, v4, 16
-; GCN-NEXT: v_alignbit_b32 v1, v2, v0, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v30
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v26
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GCN-NEXT: .LBB128_4: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v16, v32
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8i64_to_v32i16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: s_cbranch_execz .LBB128_2
-; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_add_u32_e32 v14, vcc, 3, v14
-; VI-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc
-; VI-NEXT: v_add_u32_e32 v12, vcc, 3, v12
-; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc
-; VI-NEXT: v_add_u32_e32 v10, vcc, 3, v10
-; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc
-; VI-NEXT: v_add_u32_e32 v8, vcc, 3, v8
-; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; VI-NEXT: v_add_u32_e32 v6, vcc, 3, v6
-; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: .LBB128_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8i64_to_v32i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB128_2
-; GFX9-NEXT: ; %bb.1: ; %cmp.true
-; GFX9-NEXT: v_add_co_u32_e32 v14, vcc, 3, v14
-; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 3, v12
-; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 3, v10
-; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 3, v8
-; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 3, v6
-; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 3, v4
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 3, v2
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 3, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: .LBB128_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8i64_to_v32i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB128_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_add_co_u32 v14, vcc_lo, v14, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v15, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, v12, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v13, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v10, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v11, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v8, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v6, vcc_lo, v6, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v4, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: .LBB128_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = add <8 x i64> %a, splat (i64 3)
- %a2 = bitcast <8 x i64> %a1 to <32 x i16>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x i64> %a to <32 x i16>
- br label %end
-end:
- %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x i16> %phi
-}
-
-
-define <32 x i16> @v_bitcast_v8f64_to_v32i16(<8 x double> %a, i32 %b) {
-; GCN-LABEL: v_bitcast_v8f64_to_v32i16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v55, v15
-; GCN-NEXT: v_mov_b32_e32 v54, v14
-; GCN-NEXT: v_mov_b32_e32 v53, v13
-; GCN-NEXT: v_mov_b32_e32 v52, v12
-; GCN-NEXT: v_mov_b32_e32 v51, v11
-; GCN-NEXT: v_mov_b32_e32 v50, v10
-; GCN-NEXT: v_mov_b32_e32 v49, v9
-; GCN-NEXT: v_mov_b32_e32 v48, v8
-; GCN-NEXT: v_mov_b32_e32 v38, v7
-; GCN-NEXT: v_mov_b32_e32 v37, v6
-; GCN-NEXT: v_mov_b32_e32 v36, v5
-; GCN-NEXT: v_mov_b32_e32 v35, v4
-; GCN-NEXT: v_mov_b32_e32 v34, v3
-; GCN-NEXT: v_mov_b32_e32 v33, v2
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GCN-NEXT: ; implicit-def: $vgpr32
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr7
-; GCN-NEXT: ; implicit-def: $vgpr9
-; GCN-NEXT: ; implicit-def: $vgpr11
-; GCN-NEXT: ; implicit-def: $vgpr13
-; GCN-NEXT: ; implicit-def: $vgpr15
-; GCN-NEXT: ; implicit-def: $vgpr17
-; GCN-NEXT: ; implicit-def: $vgpr19
-; GCN-NEXT: ; implicit-def: $vgpr21
-; GCN-NEXT: ; implicit-def: $vgpr23
-; GCN-NEXT: ; implicit-def: $vgpr25
-; GCN-NEXT: ; implicit-def: $vgpr27
-; GCN-NEXT: ; implicit-def: $vgpr29
-; GCN-NEXT: ; implicit-def: $vgpr31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB129_2
-; GCN-NEXT: ; %bb.1: ; %cmp.false
-; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16
-; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16
-; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16
-; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16
-; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16
-; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16
-; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16
-; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NEXT: .LBB129_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB129_4
-; GCN-NEXT: ; %bb.3: ; %cmp.true
-; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GCN-NEXT: v_add_f64 v[33:34], v[33:34], 1.0
-; GCN-NEXT: v_add_f64 v[35:36], v[35:36], 1.0
-; GCN-NEXT: v_add_f64 v[37:38], v[37:38], 1.0
-; GCN-NEXT: v_add_f64 v[48:49], v[48:49], 1.0
-; GCN-NEXT: v_add_f64 v[50:51], v[50:51], 1.0
-; GCN-NEXT: v_add_f64 v[52:53], v[52:53], 1.0
-; GCN-NEXT: v_add_f64 v[54:55], v[54:55], 1.0
-; GCN-NEXT: v_alignbit_b32 v29, v55, v54, 16
-; GCN-NEXT: v_alignbit_b32 v25, v53, v52, 16
-; GCN-NEXT: v_alignbit_b32 v21, v51, v50, 16
-; GCN-NEXT: v_alignbit_b32 v17, v49, v48, 16
-; GCN-NEXT: v_alignbit_b32 v13, v38, v37, 16
-; GCN-NEXT: v_alignbit_b32 v9, v36, v35, 16
-; GCN-NEXT: v_alignbit_b32 v5, v34, v33, 16
-; GCN-NEXT: v_alignbit_b32 v32, v1, v0, 16
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v55
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v53
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v51
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v49
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v38
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v36
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v34
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NEXT: .LBB129_4: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_mov_b32_e32 v4, v33
-; GCN-NEXT: v_mov_b32_e32 v6, v34
-; GCN-NEXT: v_mov_b32_e32 v8, v35
-; GCN-NEXT: v_mov_b32_e32 v10, v36
-; GCN-NEXT: v_mov_b32_e32 v12, v37
-; GCN-NEXT: v_mov_b32_e32 v14, v38
-; GCN-NEXT: v_mov_b32_e32 v16, v48
-; GCN-NEXT: v_mov_b32_e32 v18, v49
-; GCN-NEXT: v_mov_b32_e32 v20, v50
-; GCN-NEXT: v_mov_b32_e32 v22, v51
-; GCN-NEXT: v_mov_b32_e32 v24, v52
-; GCN-NEXT: v_mov_b32_e32 v26, v53
-; GCN-NEXT: v_mov_b32_e32 v28, v54
-; GCN-NEXT: v_mov_b32_e32 v30, v55
-; GCN-NEXT: v_mov_b32_e32 v1, v32
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8f64_to_v32i16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: s_cbranch_execz .LBB129_2
-; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT: .LBB129_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8f64_to_v32i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB129_2
-; GFX9-NEXT: ; %bb.1: ; %cmp.true
-; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT: .LBB129_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8f64_to_v32i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB129_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: .LBB129_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
- %a2 = bitcast <8 x double> %a1 to <32 x i16>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x double> %a to <32 x i16>
- br label %end
-end:
- %phi = phi <32 x i16> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x i16> %phi
-}
-
-
-define <32 x half> @v_bitcast_v8f64_to_v32f16(<8 x double> %a, i32 %b) {
-; GCN-LABEL: v_bitcast_v8f64_to_v32f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GCN-NEXT: ; implicit-def: $vgpr39
-; GCN-NEXT: ; implicit-def: $vgpr55
-; GCN-NEXT: ; implicit-def: $vgpr32
-; GCN-NEXT: ; implicit-def: $vgpr54
-; GCN-NEXT: ; implicit-def: $vgpr33
-; GCN-NEXT: ; implicit-def: $vgpr53
-; GCN-NEXT: ; implicit-def: $vgpr34
-; GCN-NEXT: ; implicit-def: $vgpr52
-; GCN-NEXT: ; implicit-def: $vgpr35
-; GCN-NEXT: ; implicit-def: $vgpr51
-; GCN-NEXT: ; implicit-def: $vgpr36
-; GCN-NEXT: ; implicit-def: $vgpr50
-; GCN-NEXT: ; implicit-def: $vgpr37
-; GCN-NEXT: ; implicit-def: $vgpr49
-; GCN-NEXT: ; implicit-def: $vgpr38
-; GCN-NEXT: ; implicit-def: $vgpr48
-; GCN-NEXT: ; implicit-def: $vgpr16
-; GCN-NEXT: ; implicit-def: $vgpr17
-; GCN-NEXT: ; implicit-def: $vgpr18
-; GCN-NEXT: ; implicit-def: $vgpr19
-; GCN-NEXT: ; implicit-def: $vgpr20
-; GCN-NEXT: ; implicit-def: $vgpr21
-; GCN-NEXT: ; implicit-def: $vgpr22
-; GCN-NEXT: ; implicit-def: $vgpr23
-; GCN-NEXT: ; implicit-def: $vgpr24
-; GCN-NEXT: ; implicit-def: $vgpr25
-; GCN-NEXT: ; implicit-def: $vgpr26
-; GCN-NEXT: ; implicit-def: $vgpr27
-; GCN-NEXT: ; implicit-def: $vgpr28
-; GCN-NEXT: ; implicit-def: $vgpr29
-; GCN-NEXT: ; implicit-def: $vgpr30
-; GCN-NEXT: ; implicit-def: $vgpr31
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB130_2
-; GCN-NEXT: ; %bb.1: ; %cmp.false
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v3
-; GCN-NEXT: s_waitcnt expcnt(2)
-; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v2
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v1
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v17
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v19
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v21
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v23
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v39
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v48
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v49
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v50
-; GCN-NEXT: v_cvt_f32_f16_e32 v48, v51
-; GCN-NEXT: v_cvt_f32_f16_e32 v49, v52
-; GCN-NEXT: v_cvt_f32_f16_e32 v50, v53
-; GCN-NEXT: v_cvt_f32_f16_e32 v51, v54
-; GCN-NEXT: v_cvt_f32_f16_e32 v52, v55
-; GCN-NEXT: v_cvt_f32_f16_e32 v53, v40
-; GCN-NEXT: v_cvt_f32_f16_e32 v54, v41
-; GCN-NEXT: v_cvt_f32_f16_e32 v55, v42
-; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0
-; GCN-NEXT: ; implicit-def: $vgpr0
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr6
-; GCN-NEXT: ; implicit-def: $vgpr8
-; GCN-NEXT: ; implicit-def: $vgpr10
-; GCN-NEXT: ; implicit-def: $vgpr12
-; GCN-NEXT: ; implicit-def: $vgpr14
-; GCN-NEXT: .LBB130_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB130_4
-; GCN-NEXT: ; %bb.3: ; %cmp.true
-; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GCN-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GCN-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GCN-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GCN-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GCN-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GCN-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GCN-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v2
-; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v3
-; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v38, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v37, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v36, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v35, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v34, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v33, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v32, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v39, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v29
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v27
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT: v_cvt_f32_f16_e32 v48, v48
-; GCN-NEXT: v_cvt_f32_f16_e32 v49, v49
-; GCN-NEXT: v_cvt_f32_f16_e32 v50, v50
-; GCN-NEXT: v_cvt_f32_f16_e32 v51, v51
-; GCN-NEXT: v_cvt_f32_f16_e32 v52, v52
-; GCN-NEXT: v_cvt_f32_f16_e32 v53, v53
-; GCN-NEXT: v_cvt_f32_f16_e32 v54, v54
-; GCN-NEXT: v_cvt_f32_f16_e32 v55, v55
-; GCN-NEXT: .LBB130_4: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, v39
-; GCN-NEXT: v_mov_b32_e32 v1, v55
-; GCN-NEXT: v_mov_b32_e32 v2, v32
-; GCN-NEXT: v_mov_b32_e32 v3, v54
-; GCN-NEXT: v_mov_b32_e32 v4, v33
-; GCN-NEXT: v_mov_b32_e32 v5, v53
-; GCN-NEXT: v_mov_b32_e32 v6, v34
-; GCN-NEXT: v_mov_b32_e32 v7, v52
-; GCN-NEXT: v_mov_b32_e32 v8, v35
-; GCN-NEXT: v_mov_b32_e32 v9, v51
-; GCN-NEXT: v_mov_b32_e32 v10, v36
-; GCN-NEXT: v_mov_b32_e32 v11, v50
-; GCN-NEXT: v_mov_b32_e32 v12, v37
-; GCN-NEXT: v_mov_b32_e32 v13, v49
-; GCN-NEXT: v_mov_b32_e32 v14, v38
-; GCN-NEXT: v_mov_b32_e32 v15, v48
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v8f64_to_v32f16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: s_cbranch_execz .LBB130_2
-; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; VI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; VI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; VI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; VI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; VI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; VI-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; VI-NEXT: .LBB130_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v8f64_to_v32f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB130_2
-; GFX9-NEXT: ; %bb.1: ; %cmp.true
-; GFX9-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX9-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX9-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX9-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX9-NEXT: .LBB130_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v8f64_to_v32f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB130_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_add_f64 v[14:15], v[14:15], 1.0
-; GFX11-NEXT: v_add_f64 v[12:13], v[12:13], 1.0
-; GFX11-NEXT: v_add_f64 v[10:11], v[10:11], 1.0
-; GFX11-NEXT: v_add_f64 v[8:9], v[8:9], 1.0
-; GFX11-NEXT: v_add_f64 v[6:7], v[6:7], 1.0
-; GFX11-NEXT: v_add_f64 v[4:5], v[4:5], 1.0
-; GFX11-NEXT: v_add_f64 v[2:3], v[2:3], 1.0
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
-; GFX11-NEXT: .LBB130_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = fadd <8 x double> %a, splat (double 1.000000e+00)
- %a2 = bitcast <8 x double> %a1 to <32 x half>
- br label %end
-cmp.false:
- %a3 = bitcast <8 x double> %a to <32 x half>
- br label %end
-end:
- %phi = phi <32 x half> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <32 x half> %phi
-}
-
-
-define <8 x i64> @v_bitcast_v32f16_to_v8i64(<32 x half> %a, i32 %b) {
-; GCN-LABEL: v_bitcast_v32f16_to_v8i64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30
-; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB131_2
-; GCN-NEXT: ; %bb.1: ; %cmp.false
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43
-; GCN-NEXT: v_or_b32_e32 v0, v44, v0
-; GCN-NEXT: v_or_b32_e32 v1, v42, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v55
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53
-; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49
-; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20
-; GCN-NEXT: v_or_b32_e32 v2, v52, v2
-; GCN-NEXT: v_or_b32_e32 v3, v50, v3
-; GCN-NEXT: v_or_b32_e32 v4, v48, v4
-; GCN-NEXT: v_or_b32_e32 v5, v38, v5
-; GCN-NEXT: v_or_b32_e32 v6, v36, v6
-; GCN-NEXT: v_or_b32_e32 v7, v34, v7
-; GCN-NEXT: v_or_b32_e32 v8, v33, v8
-; GCN-NEXT: v_or_b32_e32 v9, v32, v9
-; GCN-NEXT: v_or_b32_e32 v10, v31, v10
-; GCN-NEXT: v_or_b32_e32 v11, v21, v11
-; GCN-NEXT: v_or_b32_e32 v12, v19, v12
-; GCN-NEXT: v_or_b32_e32 v13, v18, v13
-; GCN-NEXT: v_or_b32_e32 v14, v17, v14
-; GCN-NEXT: v_or_b32_e32 v15, v16, v15
-; GCN-NEXT: ; implicit-def: $vgpr45
-; GCN-NEXT: ; implicit-def: $vgpr44
-; GCN-NEXT: ; implicit-def: $vgpr43
-; GCN-NEXT: ; implicit-def: $vgpr42
-; GCN-NEXT: ; implicit-def: $vgpr41
-; GCN-NEXT: ; implicit-def: $vgpr52
-; GCN-NEXT: ; implicit-def: $vgpr40
-; GCN-NEXT: ; implicit-def: $vgpr50
-; GCN-NEXT: ; implicit-def: $vgpr55
-; GCN-NEXT: ; implicit-def: $vgpr48
-; GCN-NEXT: ; implicit-def: $vgpr54
-; GCN-NEXT: ; implicit-def: $vgpr38
-; GCN-NEXT: ; implicit-def: $vgpr53
-; GCN-NEXT: ; implicit-def: $vgpr36
-; GCN-NEXT: ; implicit-def: $vgpr51
-; GCN-NEXT: ; implicit-def: $vgpr34
-; GCN-NEXT: ; implicit-def: $vgpr49
-; GCN-NEXT: ; implicit-def: $vgpr33
-; GCN-NEXT: ; implicit-def: $vgpr39
-; GCN-NEXT: ; implicit-def: $vgpr32
-; GCN-NEXT: ; implicit-def: $vgpr37
-; GCN-NEXT: ; implicit-def: $vgpr31
-; GCN-NEXT: ; implicit-def: $vgpr35
-; GCN-NEXT: ; implicit-def: $vgpr21
-; GCN-NEXT: ; implicit-def: $vgpr25
-; GCN-NEXT: ; implicit-def: $vgpr19
-; GCN-NEXT: ; implicit-def: $vgpr23
-; GCN-NEXT: ; implicit-def: $vgpr18
-; GCN-NEXT: ; implicit-def: $vgpr22
-; GCN-NEXT: ; implicit-def: $vgpr17
-; GCN-NEXT: ; implicit-def: $vgpr20
-; GCN-NEXT: ; implicit-def: $vgpr16
-; GCN-NEXT: .LBB131_2: ; %Flow
-; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB131_4
-; GCN-NEXT: ; %bb.3: ; %cmp.true
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42
-; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0
-; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1
-; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_or_b32_e32 v1, v3, v2
-; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52
-; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40
-; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50
-; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55
-; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48
-; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54
-; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38
-; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53
-; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36
-; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51
-; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34
-; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49
-; GCN-NEXT: v_cvt_f32_f16_e32 v15, v33
-; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39
-; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32
-; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37
-; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31
-; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35
-; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21
-; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25
-; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19
-; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23
-; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22
-; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17
-; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20
-; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16
-; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2
-; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3
-; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4
-; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5
-; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6
-; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7
-; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8
-; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9
-; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10
-; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11
-; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12
-; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13
-; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14
-; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15
-; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24
-; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26
-; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27
-; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28
-; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29
-; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21
-; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25
-; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19
-; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23
-; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18
-; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22
-; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17
-; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20
-; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
-; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_or_b32_e32 v2, v3, v2
-; GCN-NEXT: v_or_b32_e32 v3, v5, v4
-; GCN-NEXT: v_or_b32_e32 v4, v7, v6
-; GCN-NEXT: v_or_b32_e32 v5, v9, v8
-; GCN-NEXT: v_or_b32_e32 v6, v11, v10
-; GCN-NEXT: v_or_b32_e32 v7, v13, v12
-; GCN-NEXT: v_or_b32_e32 v8, v15, v14
-; GCN-NEXT: v_or_b32_e32 v9, v26, v24
-; GCN-NEXT: v_or_b32_e32 v10, v28, v27
-; GCN-NEXT: v_or_b32_e32 v11, v21, v29
-; GCN-NEXT: v_or_b32_e32 v12, v19, v25
-; GCN-NEXT: v_or_b32_e32 v13, v18, v23
-; GCN-NEXT: v_or_b32_e32 v14, v17, v22
-; GCN-NEXT: v_or_b32_e32 v15, v16, v20
-; GCN-NEXT: .LBB131_4: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32f16_to_v8i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: s_cbranch_execz .LBB131_2
-; VI-NEXT: ; %bb.1: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v16, 0x200
-; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v15, 0x200, v15
-; VI-NEXT: v_or_b32_e32 v15, v15, v17
-; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v14, 0x200, v14
-; VI-NEXT: v_or_b32_e32 v14, v14, v17
-; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v13, 0x200, v13
-; VI-NEXT: v_or_b32_e32 v13, v13, v17
-; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v12, 0x200, v12
-; VI-NEXT: v_or_b32_e32 v12, v12, v17
-; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v11, 0x200, v11
-; VI-NEXT: v_or_b32_e32 v11, v11, v17
-; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v10, 0x200, v10
-; VI-NEXT: v_or_b32_e32 v10, v10, v17
-; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v9, 0x200, v9
-; VI-NEXT: v_or_b32_e32 v9, v9, v17
-; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v8, 0x200, v8
-; VI-NEXT: v_or_b32_e32 v8, v8, v17
-; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v7, 0x200, v7
-; VI-NEXT: v_or_b32_e32 v7, v7, v17
-; VI-NEXT: v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v6, 0x200, v6
-; VI-NEXT: v_or_b32_e32 v6, v6, v17
-; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v5, 0x200, v5
-; VI-NEXT: v_or_b32_e32 v5, v5, v17
-; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v4, 0x200, v4
-; VI-NEXT: v_or_b32_e32 v4, v4, v17
-; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v3, 0x200, v3
-; VI-NEXT: v_or_b32_e32 v3, v3, v17
-; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v2, 0x200, v2
-; VI-NEXT: v_or_b32_e32 v2, v2, v17
-; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v1, 0x200, v1
-; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v0, 0x200, v0
-; VI-NEXT: v_or_b32_e32 v1, v1, v17
-; VI-NEXT: v_or_b32_e32 v0, v0, v16
-; VI-NEXT: .LBB131_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32f16_to_v8i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB131_2
-; GFX9-NEXT: ; %bb.1: ; %cmp.true
-; GFX9-NEXT: s_movk_i32 s6, 0x200
-; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0]
-; GFX9-NEXT: .LBB131_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32f16_to_v8i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16
-; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
-; GFX11-NEXT: s_cbranch_execz .LBB131_2
-; GFX11-NEXT: ; %bb.1: ; %cmp.true
-; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: .LBB131_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %cmp = icmp eq i32 %b, 0
- br i1 %cmp, label %cmp.true, label %cmp.false
-cmp.true:
- %a1 = fadd <32 x half> %a, splat (half 0xH0200)
- %a2 = bitcast <32 x half> %a1 to <8 x i64>
- br label %end
-cmp.false:
- %a3 = bitcast <32 x half> %a to <8 x i64>
- br label %end
-end:
- %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
- ret <8 x i64> %phi
-}
-
-
-define <8 x double> @v_bitcast_v32f16_to_v8f64(<32 x half> %a, i32 %b) {
-; GCN-LABEL: v_bitcast_v32f16_to_v8f64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(1)
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4
-; GCN-NEXT: v_cvt_f16_f32_e32 v45, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v44, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v43, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v42, v2
-; GCN-NEXT: v_cvt_f16_f32_e32 v41, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v52, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v40, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v50, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v55, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v48, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v54, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v38, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v53, v13
-; GCN-NEXT: v_cvt_f16_f32_e32 v36, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v51, v15
-; GCN-NEXT: v_cvt_f16_f32_e32 v34, v14
-; GCN-NEXT: v_cvt_f16_f32_e32 v49, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v33, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v39, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v32, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v37, v21
-; GCN-NEXT: v_cvt_f16_f32_e32 v31, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v35, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v28
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v46
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v30
-; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB132_2
-; GCN-NEXT: ; %bb.1: ; %cmp.false
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v45
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v43
-; GCN-NEXT: v_or_b32_e32 v0, v44, v0
-; GCN-NEXT: v_or_b32_e32 v1, v42, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v41
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v40
-; GCN-NEXT: v_lshlrev_b32_e32 v4,
16, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v53 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v39 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v35 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v52, v2 -; GCN-NEXT: v_or_b32_e32 v3, v50, v3 -; GCN-NEXT: v_or_b32_e32 v4, v48, v4 -; GCN-NEXT: v_or_b32_e32 v5, v38, v5 -; GCN-NEXT: v_or_b32_e32 v6, v36, v6 -; GCN-NEXT: v_or_b32_e32 v7, v34, v7 -; GCN-NEXT: v_or_b32_e32 v8, v33, v8 -; GCN-NEXT: v_or_b32_e32 v9, v32, v9 -; GCN-NEXT: v_or_b32_e32 v10, v31, v10 -; GCN-NEXT: v_or_b32_e32 v11, v21, v11 -; GCN-NEXT: v_or_b32_e32 v12, v19, v12 -; GCN-NEXT: v_or_b32_e32 v13, v18, v13 -; GCN-NEXT: v_or_b32_e32 v14, v17, v14 -; GCN-NEXT: v_or_b32_e32 v15, v16, v15 -; GCN-NEXT: ; implicit-def: $vgpr45 -; GCN-NEXT: ; implicit-def: $vgpr44 -; GCN-NEXT: ; implicit-def: $vgpr43 -; GCN-NEXT: ; implicit-def: $vgpr42 -; GCN-NEXT: ; implicit-def: $vgpr41 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr40 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: .LBB132_2: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB132_4 -; GCN-NEXT: ; %bb.3: ; %cmp.true -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v45 -; GCN-NEXT: v_cvt_f32_f16_e32 v1, v44 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v43 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v42 -; GCN-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; GCN-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_or_b32_e32 v0, v1, v0 -; GCN-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-NEXT: v_cvt_f32_f16_e32 v2, v41 -; GCN-NEXT: v_cvt_f32_f16_e32 v3, v52 -; GCN-NEXT: v_cvt_f32_f16_e32 v4, v40 -; GCN-NEXT: v_cvt_f32_f16_e32 v5, v50 -; GCN-NEXT: v_cvt_f32_f16_e32 v6, v55 -; GCN-NEXT: v_cvt_f32_f16_e32 v7, v48 -; GCN-NEXT: v_cvt_f32_f16_e32 v8, v54 -; GCN-NEXT: v_cvt_f32_f16_e32 v9, v38 -; GCN-NEXT: v_cvt_f32_f16_e32 v10, v53 -; GCN-NEXT: v_cvt_f32_f16_e32 v11, v36 -; GCN-NEXT: v_cvt_f32_f16_e32 v12, v51 -; GCN-NEXT: v_cvt_f32_f16_e32 v13, v34 -; GCN-NEXT: v_cvt_f32_f16_e32 v14, v49 -; GCN-NEXT: 
v_cvt_f32_f16_e32 v15, v33 -; GCN-NEXT: v_cvt_f32_f16_e32 v24, v39 -; GCN-NEXT: v_cvt_f32_f16_e32 v26, v32 -; GCN-NEXT: v_cvt_f32_f16_e32 v27, v37 -; GCN-NEXT: v_cvt_f32_f16_e32 v28, v31 -; GCN-NEXT: v_cvt_f32_f16_e32 v29, v35 -; GCN-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GCN-NEXT: v_cvt_f32_f16_e32 v25, v25 -; GCN-NEXT: v_cvt_f32_f16_e32 v19, v19 -; GCN-NEXT: v_cvt_f32_f16_e32 v23, v23 -; GCN-NEXT: v_cvt_f32_f16_e32 v18, v18 -; GCN-NEXT: v_cvt_f32_f16_e32 v22, v22 -; GCN-NEXT: v_cvt_f32_f16_e32 v17, v17 -; GCN-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GCN-NEXT: v_cvt_f32_f16_e32 v16, v16 -; GCN-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; GCN-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; GCN-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; GCN-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; GCN-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; GCN-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; GCN-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; GCN-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; GCN-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; GCN-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; GCN-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; GCN-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; GCN-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; GCN-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; GCN-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; GCN-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; GCN-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; GCN-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; GCN-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; GCN-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; GCN-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; GCN-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; GCN-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; GCN-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; GCN-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; GCN-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; GCN-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; GCN-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; GCN-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GCN-NEXT: v_cvt_f16_f32_e32 v8, v8 -; GCN-NEXT: v_cvt_f16_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_f16_f32_e32 v10, v10 -; GCN-NEXT: v_cvt_f16_f32_e32 v11, v11 -; GCN-NEXT: v_cvt_f16_f32_e32 v12, v12 -; GCN-NEXT: v_cvt_f16_f32_e32 v13, v13 -; GCN-NEXT: v_cvt_f16_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_f16_f32_e32 v15, v15 -; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24 -; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26 -; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27 -; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28 -; GCN-NEXT: v_cvt_f16_f32_e32 v29, v29 -; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21 -; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25 -; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19 -; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23 -; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18 -; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22 -; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17 -; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20 -; GCN-NEXT: v_cvt_f16_f32_e32 v16, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: 
v_lshlrev_b32_e32 v22, 16, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_or_b32_e32 v2, v3, v2 -; GCN-NEXT: v_or_b32_e32 v3, v5, v4 -; GCN-NEXT: v_or_b32_e32 v4, v7, v6 -; GCN-NEXT: v_or_b32_e32 v5, v9, v8 -; GCN-NEXT: v_or_b32_e32 v6, v11, v10 -; GCN-NEXT: v_or_b32_e32 v7, v13, v12 -; GCN-NEXT: v_or_b32_e32 v8, v15, v14 -; GCN-NEXT: v_or_b32_e32 v9, v26, v24 -; GCN-NEXT: v_or_b32_e32 v10, v28, v27 -; GCN-NEXT: v_or_b32_e32 v11, v21, v29 -; GCN-NEXT: v_or_b32_e32 v12, v19, v25 -; GCN-NEXT: v_or_b32_e32 v13, v18, v23 -; GCN-NEXT: v_or_b32_e32 v14, v17, v22 -; GCN-NEXT: v_or_b32_e32 v15, v16, v20 -; GCN-NEXT: .LBB132_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v32f16_to_v8f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB132_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v16, 0x200 -; VI-NEXT: v_add_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 -; VI-NEXT: v_or_b32_e32 v15, v15, v17 -; VI-NEXT: v_add_f16_sdwa v17, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 -; VI-NEXT: v_or_b32_e32 v14, v14, v17 -; VI-NEXT: v_add_f16_sdwa v17, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v17 -; VI-NEXT: v_add_f16_sdwa v17, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v17 -; VI-NEXT: v_add_f16_sdwa v17, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v17 -; VI-NEXT: v_add_f16_sdwa v17, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v17 -; VI-NEXT: v_add_f16_sdwa v17, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v17 -; VI-NEXT: v_add_f16_sdwa v17, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v17 -; VI-NEXT: v_add_f16_sdwa v17, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v17 -; VI-NEXT: 
v_add_f16_sdwa v17, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v17 -; VI-NEXT: v_add_f16_sdwa v17, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v17 -; VI-NEXT: v_add_f16_sdwa v17, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v17 -; VI-NEXT: v_add_f16_sdwa v17, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v17 -; VI-NEXT: v_add_f16_sdwa v17, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v17 -; VI-NEXT: v_add_f16_sdwa v17, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_add_f16_sdwa v16, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, 0x200, v0 -; VI-NEXT: v_or_b32_e32 v1, v1, v17 -; VI-NEXT: v_or_b32_e32 v0, v0, v16 -; VI-NEXT: .LBB132_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32f16_to_v8f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB132_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: s_movk_i32 s6, 0x200 -; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v14, v14, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v13, v13, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v12, v12, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v11, v11, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v10, v10, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v9, v9, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v8, v8, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v7, v7, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v6, v6, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v5, v5, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v4, v4, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v3, v3, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v2, v2, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v1, v1, s6 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_f16 v0, v0, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB132_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32f16_to_v8f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB132_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; 
GFX11-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] -; GFX11-NEXT: v_pk_add_f16 v0, 0x200, v0 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB132_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false -cmp.true: - %a1 = fadd <32 x half> %a, splat (half 0xH0200) - %a2 = bitcast <32 x half> %a1 to <8 x double> - br label %end -cmp.false: - %a3 = bitcast <32 x half> %a to <8 x double> - br label %end -end: - %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <8 x double> %phi -} - - -define <8 x i64> @v_bitcast_v32i16_to_v8i64(<32 x i16> %a, i32 %b) { -; GCN-LABEL: v_bitcast_v32i16_to_v8i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB133_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB133_4 -; GCN-NEXT: .LBB133_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB133_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: 
v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB133_2 -; GCN-NEXT: .LBB133_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: 
v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v32i16_to_v8i64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB133_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v17, 3 -; VI-NEXT: v_add_u16_e32 v16, 3, v15 -; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_u16_e32 v16, 3, v14 -; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v16, v14 -; VI-NEXT: v_add_u16_e32 v16, 3, v13 -; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v16, v13 -; VI-NEXT: v_add_u16_e32 v16, 3, v12 -; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v16, v12 -; VI-NEXT: v_add_u16_e32 v16, 3, v11 -; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v16, v11 -; VI-NEXT: v_add_u16_e32 v16, 3, v10 -; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v16, v10 -; VI-NEXT: v_add_u16_e32 v16, 3, v9 -; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v16, v9 -; VI-NEXT: v_add_u16_e32 v16, 3, v8 -; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v16, v8 -; VI-NEXT: v_add_u16_e32 v16, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v16, v7 -; VI-NEXT: v_add_u16_e32 v16, 3, v6 -; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v16, v6 -; VI-NEXT: v_add_u16_e32 v16, 3, v5 
-; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v16, v5 -; VI-NEXT: v_add_u16_e32 v16, 3, v4 -; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v16, v4 -; VI-NEXT: v_add_u16_e32 v16, 3, v3 -; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v16, v3 -; VI-NEXT: v_add_u16_e32 v16, 3, v2 -; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v16, v2 -; VI-NEXT: v_add_u16_e32 v16, 3, v1 -; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v16, v1 -; VI-NEXT: v_add_u16_e32 v16, 3, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB133_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32i16_to_v8i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB133_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB133_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32i16_to_v8i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB133_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 
op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB133_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false -cmp.true: - %a1 = add <32 x i16> %a, splat (i16 3) - %a2 = bitcast <32 x i16> %a1 to <8 x i64> - br label %end -cmp.false: - %a3 = bitcast <32 x i16> %a to <8 x i64> - br label %end -end: - %phi = phi <8 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <8 x i64> %phi -} - - -define <8 x double> @v_bitcast_v32i16_to_v8f64(<32 x i16> %a, i32 %b) { -; GCN-LABEL: v_bitcast_v32i16_to_v8f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v38, v14 -; GCN-NEXT: v_mov_b32_e32 v37, v12 -; GCN-NEXT: v_mov_b32_e32 v36, v10 -; GCN-NEXT: v_mov_b32_e32 v35, v8 -; GCN-NEXT: v_mov_b32_e32 v34, v6 -; GCN-NEXT: v_mov_b32_e32 v33, v4 -; GCN-NEXT: v_mov_b32_e32 v32, v2 -; GCN-NEXT: v_mov_b32_e32 v31, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v39, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v48, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v49, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15 -; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v0 -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB134_3 -; GCN-NEXT: ; %bb.1: ; %Flow -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execnz .LBB134_4 -; GCN-NEXT: .LBB134_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; GCN-NEXT: .LBB134_3: ; %cmp.false -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v31 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; GCN-NEXT: v_or_b32_e32 v0, v0, v54 -; GCN-NEXT: v_or_b32_e32 v1, v1, v55 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v33 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v36 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v37 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v38 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v16 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v18 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v20 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v22 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v24 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v28 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v30 -; GCN-NEXT: v_or_b32_e32 v2, v2, v39 -; GCN-NEXT: v_or_b32_e32 v3, v3, v48 -; GCN-NEXT: v_or_b32_e32 v4, v4, v49 -; GCN-NEXT: v_or_b32_e32 v5, v5, v50 -; GCN-NEXT: v_or_b32_e32 v6, v6, v51 -; GCN-NEXT: 
v_or_b32_e32 v7, v7, v52 -; GCN-NEXT: v_or_b32_e32 v8, v8, v17 -; GCN-NEXT: v_or_b32_e32 v9, v9, v19 -; GCN-NEXT: v_or_b32_e32 v10, v10, v21 -; GCN-NEXT: v_or_b32_e32 v11, v11, v23 -; GCN-NEXT: v_or_b32_e32 v12, v12, v25 -; GCN-NEXT: v_or_b32_e32 v13, v13, v27 -; GCN-NEXT: v_or_b32_e32 v14, v14, v29 -; GCN-NEXT: v_or_b32_e32 v15, v15, v53 -; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: ; implicit-def: $vgpr32 -; GCN-NEXT: ; implicit-def: $vgpr33 -; GCN-NEXT: ; implicit-def: $vgpr34 -; GCN-NEXT: ; implicit-def: $vgpr35 -; GCN-NEXT: ; implicit-def: $vgpr36 -; GCN-NEXT: ; implicit-def: $vgpr37 -; GCN-NEXT: ; implicit-def: $vgpr38 -; GCN-NEXT: ; implicit-def: $vgpr16 -; GCN-NEXT: ; implicit-def: $vgpr18 -; GCN-NEXT: ; implicit-def: $vgpr20 -; GCN-NEXT: ; implicit-def: $vgpr22 -; GCN-NEXT: ; implicit-def: $vgpr24 -; GCN-NEXT: ; implicit-def: $vgpr26 -; GCN-NEXT: ; implicit-def: $vgpr28 -; GCN-NEXT: ; implicit-def: $vgpr30 -; GCN-NEXT: ; implicit-def: $vgpr54 -; GCN-NEXT: ; implicit-def: $vgpr55 -; GCN-NEXT: ; implicit-def: $vgpr39 -; GCN-NEXT: ; implicit-def: $vgpr48 -; GCN-NEXT: ; implicit-def: $vgpr49 -; GCN-NEXT: ; implicit-def: $vgpr50 -; GCN-NEXT: ; implicit-def: $vgpr51 -; GCN-NEXT: ; implicit-def: $vgpr52 -; GCN-NEXT: ; implicit-def: $vgpr17 -; GCN-NEXT: ; implicit-def: $vgpr19 -; GCN-NEXT: ; implicit-def: $vgpr21 -; GCN-NEXT: ; implicit-def: $vgpr23 -; GCN-NEXT: ; implicit-def: $vgpr25 -; GCN-NEXT: ; implicit-def: $vgpr27 -; GCN-NEXT: ; implicit-def: $vgpr29 -; GCN-NEXT: ; implicit-def: $vgpr53 -; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB134_2 -; GCN-NEXT: .LBB134_4: ; %cmp.true -; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; GCN-NEXT: s_mov_b32 s6, 0x30000 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v34 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v35 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v36 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v37 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v38 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v16 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v18 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v20 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v22 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v24 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v26 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v28 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v30 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v0, v54, v0 -; GCN-NEXT: v_or_b32_e32 v1, v55, v1 -; GCN-NEXT: v_or_b32_e32 v2, v39, v2 -; GCN-NEXT: v_or_b32_e32 v3, v48, v3 -; GCN-NEXT: v_or_b32_e32 v4, v49, v4 -; GCN-NEXT: v_or_b32_e32 v5, v50, v5 -; GCN-NEXT: v_or_b32_e32 v6, v51, v6 -; GCN-NEXT: v_or_b32_e32 v7, v52, v7 -; GCN-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-NEXT: v_or_b32_e32 v9, v19, v9 -; GCN-NEXT: v_or_b32_e32 v10, v21, v10 -; GCN-NEXT: v_or_b32_e32 v11, v23, v11 -; GCN-NEXT: v_or_b32_e32 v12, 
v25, v12 -; GCN-NEXT: v_or_b32_e32 v13, v27, v13 -; GCN-NEXT: v_or_b32_e32 v14, v29, v14 -; GCN-NEXT: v_or_b32_e32 v15, v53, v15 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, s6, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s6, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, s6, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, s6, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, s6, v7 -; GCN-NEXT: v_add_i32_e32 v8, vcc, s6, v8 -; GCN-NEXT: v_add_i32_e32 v9, vcc, s6, v9 -; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, s6, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, s6, v12 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v13 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v14 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v15 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v32i16_to_v8f64: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB134_2 -; VI-NEXT: ; %bb.1: ; %cmp.true -; VI-NEXT: v_mov_b32_e32 v17, 3 -; VI-NEXT: v_add_u16_e32 v16, 3, v15 -; VI-NEXT: v_add_u16_sdwa v15, v15, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v15, v16, v15 -; VI-NEXT: v_add_u16_e32 v16, 3, v14 -; VI-NEXT: v_add_u16_sdwa v14, v14, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v16, v14 -; VI-NEXT: v_add_u16_e32 v16, 3, v13 -; VI-NEXT: v_add_u16_sdwa v13, v13, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v13, v16, v13 -; VI-NEXT: v_add_u16_e32 v16, 3, v12 -; VI-NEXT: v_add_u16_sdwa v12, v12, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v12, v16, v12 -; VI-NEXT: v_add_u16_e32 v16, 3, v11 -; VI-NEXT: v_add_u16_sdwa v11, v11, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v11, v16, v11 -; VI-NEXT: v_add_u16_e32 v16, 3, v10 -; VI-NEXT: v_add_u16_sdwa v10, v10, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v10, v16, v10 -; VI-NEXT: v_add_u16_e32 v16, 3, v9 -; VI-NEXT: v_add_u16_sdwa v9, v9, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v9, v16, v9 -; VI-NEXT: v_add_u16_e32 v16, 3, v8 -; VI-NEXT: v_add_u16_sdwa v8, v8, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v8, v16, v8 -; VI-NEXT: v_add_u16_e32 v16, 3, v7 -; VI-NEXT: v_add_u16_sdwa v7, v7, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v7, v16, v7 -; VI-NEXT: v_add_u16_e32 v16, 3, v6 -; VI-NEXT: v_add_u16_sdwa v6, v6, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v6, v16, v6 -; VI-NEXT: v_add_u16_e32 v16, 3, v5 -; VI-NEXT: v_add_u16_sdwa v5, v5, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v5, v16, v5 -; VI-NEXT: v_add_u16_e32 v16, 3, v4 -; VI-NEXT: v_add_u16_sdwa v4, v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v4, v16, v4 -; VI-NEXT: v_add_u16_e32 v16, 3, v3 
-; VI-NEXT: v_add_u16_sdwa v3, v3, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v3, v16, v3 -; VI-NEXT: v_add_u16_e32 v16, 3, v2 -; VI-NEXT: v_add_u16_sdwa v2, v2, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v16, v2 -; VI-NEXT: v_add_u16_e32 v16, 3, v1 -; VI-NEXT: v_add_u16_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v16, v1 -; VI-NEXT: v_add_u16_e32 v16, 3, v0 -; VI-NEXT: v_add_u16_sdwa v0, v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v16, v0 -; VI-NEXT: .LBB134_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32i16_to_v8f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB134_2 -; GFX9-NEXT: ; %bb.1: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB134_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32i16_to_v8f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v16 -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB134_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v0, v0, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB134_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 
%b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false -cmp.true: - %a1 = add <32 x i16> %a, splat (i16 3) - %a2 = bitcast <32 x i16> %a1 to <8 x double> - br label %end -cmp.false: - %a3 = bitcast <32 x i16> %a to <8 x double> - br label %end -end: - %phi = phi <8 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <8 x double> %phi -} - - - - -define void @v_bitcast_v32f32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x float> %value) { -; GCN-LABEL: v_bitcast_v32f32_to_v64bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v58, 0 -; GCN-NEXT: v_mov_b32_e32 v59, 0 -; GCN-NEXT: v_mov_b32_e32 v56, 0 -; GCN-NEXT: v_mov_b32_e32 v57, 0 -; GCN-NEXT: v_mov_b32_e32 v46, 0 -; GCN-NEXT: v_mov_b32_e32 v47, 0 -; GCN-NEXT: v_mov_b32_e32 v44, 0 -; GCN-NEXT: v_mov_b32_e32 v45, 0 -; GCN-NEXT: v_mov_b32_e32 v42, 0 -; GCN-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NEXT: v_mov_b32_e32 v40, 0 -; GCN-NEXT: v_mov_b32_e32 v41, 0 -; GCN-NEXT: v_mov_b32_e32 v54, 0 -; GCN-NEXT: v_mov_b32_e32 v55, 0 -; GCN-NEXT: v_mov_b32_e32 v52, 0 -; GCN-NEXT: v_mov_b32_e32 v53, 0 -; GCN-NEXT: v_mov_b32_e32 v50, 0 -; GCN-NEXT: v_mov_b32_e32 v51, 0 -; GCN-NEXT: v_mov_b32_e32 v48, 0 -; GCN-NEXT: v_mov_b32_e32 v49, 0 -; GCN-NEXT: v_mov_b32_e32 v38, 0 -; GCN-NEXT: v_mov_b32_e32 v39, 0 -; GCN-NEXT: v_mov_b32_e32 v36, 0 -; GCN-NEXT: v_mov_b32_e32 v37, 0 -; GCN-NEXT: v_mov_b32_e32 v34, 0 -; GCN-NEXT: v_mov_b32_e32 v35, 0 -; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-NEXT: v_mov_b32_e32 v33, 0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; 
GCN-NEXT:    s_waitcnt expcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
[autogenerated GCN-NEXT check lines elided: zero-initialization spills to s32 offsets 200..84]
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB135_2
-; GCN-NEXT:  ; %bb.1: ; %if
-; GCN-NEXT:    s_waitcnt vmcnt(14)
[autogenerated GCN-NEXT check lines elided: bf16 packing of v63..v3 via v_and_b32_e32 0xffff0000 / v_lshlrev_b32_e32 16 with folded spills]
-; GCN-NEXT:  .LBB135_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
[autogenerated GCN-NEXT check lines elided: repacking via v_mul_f32_e32 / v_lshrrev_b32_e32 / v_alignbit_b32, buffer_store_dwordx4 stores at addr64 offsets 0..112, folded reloads, and callee-save restores of v63..v40]
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32f32_to_v64bf16:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
[autogenerated VI-NEXT check lines elided: callee-save spills, incoming-argument loads, zero splat via s_mov_b32 / v_mov_b32_e32, v_cmp_eq_u32_e32 vcc, 0, v0, and folded spills]
-; VI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT:    s_cbranch_execz .LBB135_2
-; VI-NEXT:  ; %bb.1: ; %if
[autogenerated VI-NEXT check lines elided: v_mov_b32_e32 copies of the live inputs and folded spills]
-; VI-NEXT:  .LBB135_2: ; %end
-; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
[autogenerated VI-NEXT check lines elided: flat_store_dwordx4 stores with v_add_u32_e32 / v_addc_u32_e32 addressing, folded reloads, and callee-save restores]
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v32f32_to_v64bf16:
-; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
[autogenerated GFX9-NEXT check lines elided: callee-save spills, incoming-argument loads, zero splat, v_cmp_eq_u32_e32 vcc, 0, v0, and folded spills]
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB135_2
-; GFX9-NEXT:  ; %bb.1: ; %if
[autogenerated GFX9-NEXT check lines elided: v_mov_b32_e32 copies of the live inputs and folded spills]
-; GFX9-NEXT:  .LBB135_2: ; %end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
[autogenerated GFX9-NEXT check lines elided: global_store_dwordx4 stores at offsets 0..112, folded reloads, and callee-save restores]
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v32f32_to_v64bf16:
-; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
[autogenerated GFX11-NEXT check lines elided: s_clause'd scratch_store_b32 callee-save spills, incoming-argument scratch_load_b32 loads, zero splat via v_dual_mov_b32]
-; GFX11-NEXT:    s_mov_b32 s0, exec_lo
-; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT:    s_cbranch_execz .LBB135_2
-; GFX11-NEXT:  ; %bb.1: ; %if
[autogenerated GFX11-NEXT check lines elided: v_dual_mov_b32 copies of the live inputs]
-; GFX11-NEXT:  .LBB135_2: ; %end
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
[autogenerated GFX11-NEXT check lines elided: s_clause'd global_store_b128 stores at offsets 0..112 and scratch_load_b32 callee-save restores of v63..v40]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-entry:
-  %cmp0 = icmp eq i32 %cond, 0
-  br i1 %cmp0, label %if, label %end
-
-if:
-  %cast = bitcast <32 x float> %value to <64 x bfloat>
-  br label %end
-
-end:
-  %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
-  store <64 x bfloat> %phi, ptr addrspace(1) %out
-  ret void
-}
-
-
-define void @v_bitcast_v32i32_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <32 x i32> %value) {
-; GCN-LABEL: v_bitcast_v32i32_to_v64bf16:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
[autogenerated GCN-NEXT check lines elided: callee-save spills of v40..v63 to s32 offsets 76..16, v_cmp_eq_u32_e32 vcc, 0, v0, s_mov_b32 s6, 0, incoming-argument loads, and the start of zero initialization]
[autogenerated GCN-NEXT check lines elided: zero splat of v31..v59 and zero-initialization spills to s32 offsets 208..84]
-; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB136_2
-; GCN-NEXT:  ; %bb.1: ; %if
-; GCN-NEXT:    s_waitcnt vmcnt(14)
[autogenerated GCN-NEXT check lines elided: bf16 packing of v63..v3 via v_and_b32_e32 0xffff0000 / v_lshlrev_b32_e32 16 with folded spills]
-; GCN-NEXT:  .LBB136_2: ; %end
-; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
[autogenerated GCN-NEXT check lines elided: repacking via v_mul_f32_e32 / v_lshrrev_b32_e32 / v_alignbit_b32, buffer_store_dwordx4 stores at addr64 offsets 0..112, folded reloads, and callee-save restores of v63..v40]
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v32i32_to_v64bf16:
-; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded
Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s19, s4 -; VI-NEXT: s_mov_b32 s5, s4 -; VI-NEXT: s_mov_b32 s6, s4 -; VI-NEXT: s_mov_b32 s7, s4 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s4 -; VI-NEXT: s_mov_b32 s10, s4 -; VI-NEXT: s_mov_b32 s11, s4 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s4 -; VI-NEXT: s_mov_b32 s14, s4 -; VI-NEXT: s_mov_b32 s15, s4 -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s4 -; VI-NEXT: s_mov_b32 s18, s4 -; VI-NEXT: v_mov_b32_e32 v50, s19 -; VI-NEXT: v_mov_b32_e32 v49, s18 -; VI-NEXT: v_mov_b32_e32 v48, s17 -; VI-NEXT: v_mov_b32_e32 v47, s16 -; VI-NEXT: v_mov_b32_e32 v46, s15 -; VI-NEXT: v_mov_b32_e32 v45, s14 -; VI-NEXT: v_mov_b32_e32 v44, s13 -; VI-NEXT: v_mov_b32_e32 v43, s12 -; VI-NEXT: v_mov_b32_e32 v42, s11 -; VI-NEXT: v_mov_b32_e32 v41, s10 -; VI-NEXT: v_mov_b32_e32 v40, s9 -; VI-NEXT: v_mov_b32_e32 v39, s8 -; VI-NEXT: v_mov_b32_e32 v38, s7 -; VI-NEXT: v_mov_b32_e32 v37, s6 -; VI-NEXT: v_mov_b32_e32 v36, s5 -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB136_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v50, v18 -; VI-NEXT: v_mov_b32_e32 v49, v17 -; VI-NEXT: v_mov_b32_e32 v48, v16 -; VI-NEXT: v_mov_b32_e32 v47, v15 -; VI-NEXT: v_mov_b32_e32 v46, v14 -; VI-NEXT: v_mov_b32_e32 v45, v13 -; VI-NEXT: v_mov_b32_e32 v44, v12 -; VI-NEXT: v_mov_b32_e32 v43, v11 -; VI-NEXT: v_mov_b32_e32 v42, v10 -; VI-NEXT: v_mov_b32_e32 v41, v9 
-; VI-NEXT: v_mov_b32_e32 v40, v8 -; VI-NEXT: v_mov_b32_e32 v39, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 -; VI-NEXT: v_mov_b32_e32 v37, v5 -; VI-NEXT: v_mov_b32_e32 v36, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: .LBB136_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: s_movk_i32 s4, 0x70 -; VI-NEXT: v_add_u32_e32 v3, vcc, 
s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x60 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x50 -; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] -; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v32i32_to_v64bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s19, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_mov_b32 s6, s4 -; GFX9-NEXT: s_mov_b32 s7, s4 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s4 -; GFX9-NEXT: s_mov_b32 s10, s4 -; GFX9-NEXT: s_mov_b32 s11, s4 -; GFX9-NEXT: s_mov_b32 s12, s4 -; GFX9-NEXT: s_mov_b32 s13, s4 -; GFX9-NEXT: s_mov_b32 s14, s4 -; GFX9-NEXT: s_mov_b32 s15, s4 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s4 -; GFX9-NEXT: s_mov_b32 s18, s4 -; GFX9-NEXT: v_mov_b32_e32 v50, s19 -; GFX9-NEXT: v_mov_b32_e32 v49, s18 -; GFX9-NEXT: v_mov_b32_e32 v48, s17 -; GFX9-NEXT: v_mov_b32_e32 v47, s16 -; GFX9-NEXT: v_mov_b32_e32 v46, s15 -; GFX9-NEXT: v_mov_b32_e32 v45, s14 -; GFX9-NEXT: v_mov_b32_e32 v44, s13 -; GFX9-NEXT: v_mov_b32_e32 v43, s12 -; GFX9-NEXT: v_mov_b32_e32 v42, s11 -; GFX9-NEXT: v_mov_b32_e32 v41, s10 -; GFX9-NEXT: v_mov_b32_e32 v40, s9 -; GFX9-NEXT: v_mov_b32_e32 v39, s8 -; GFX9-NEXT: v_mov_b32_e32 v38, s7 -; GFX9-NEXT: v_mov_b32_e32 v37, s6 -; GFX9-NEXT: v_mov_b32_e32 v36, s5 -; GFX9-NEXT: v_mov_b32_e32 v35, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB136_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v50, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v17 -; GFX9-NEXT: v_mov_b32_e32 v48, v16 -; GFX9-NEXT: v_mov_b32_e32 v47, v15 -; GFX9-NEXT: v_mov_b32_e32 v46, v14 -; GFX9-NEXT: v_mov_b32_e32 v45, v13 -; GFX9-NEXT: v_mov_b32_e32 v44, v12 -; GFX9-NEXT: v_mov_b32_e32 v43, v11 -; GFX9-NEXT: v_mov_b32_e32 v42, v10 -; GFX9-NEXT: v_mov_b32_e32 v41, v9 -; GFX9-NEXT: v_mov_b32_e32 v40, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v7 -; GFX9-NEXT: v_mov_b32_e32 v38, v6 -; GFX9-NEXT: v_mov_b32_e32 v37, v5 -; GFX9-NEXT: v_mov_b32_e32 v36, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte 
Folded Spill -; GFX9-NEXT: .LBB136_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v32i32_to_v64bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 -; GFX11-NEXT: 
scratch_store_b32 off, v58, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s15, s0 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s0 -; GFX11-NEXT: s_mov_b32 s10, s0 -; GFX11-NEXT: s_mov_b32 s11, s0 -; GFX11-NEXT: s_mov_b32 s12, s0 -; GFX11-NEXT: s_mov_b32 s13, s0 -; GFX11-NEXT: s_mov_b32 s14, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 -; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 -; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 -; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 -; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 -; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 -; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 -; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 -; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 -; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 -; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 -; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 -; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 -; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 -; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB136_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 -; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 -; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 -; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 -; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 -; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 -; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 -; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 -; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 -; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 -; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 -; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 -; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 -; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 -; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 -; 
-; GFX11-NEXT: .LBB136_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
[... auto-generated GFX11-NEXT check lines (global_store_b128 sequence, scratch_load_b32 reloads) omitted ...]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
-  %cmp0 = icmp eq i32 %cond, 0
-  br i1 %cmp0, label %if, label %end
-
-if:
-  %cast = bitcast <32 x i32> %value to <64 x bfloat>
-  br label %end
-
-end:
-  %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
-  store <64 x bfloat> %phi, ptr addrspace(1) %out
-  ret void
-}
-
-
-define void @v_bitcast_v64i16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x i16> %value) {
-; GCN-LABEL: v_bitcast_v64i16_to_v64bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
[... auto-generated GCN-NEXT check lines (CSR spills, stack-argument loads, zero-initialization spills for the %end phi) omitted ...]
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB137_2
-; GCN-NEXT: ; %bb.1: ; %if
[... auto-generated GCN-NEXT check lines (v_lshlrev_b32_e32 widening of the i16 halves plus spill/reload traffic) omitted ...]
-; GCN-NEXT: .LBB137_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
[... auto-generated GCN-NEXT check lines (bf16 pack via v_mul_f32_e32/v_alignbit_b32 and buffer_store_dwordx4 stores) omitted ...]
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112
-; GCN-NEXT:
buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v64i16_to_v64bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s19, s4 -; VI-NEXT: s_mov_b32 s5, s4 -; VI-NEXT: s_mov_b32 s6, s4 -; VI-NEXT: s_mov_b32 s7, s4 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s4 -; VI-NEXT: s_mov_b32 s10, s4 -; VI-NEXT: s_mov_b32 s11, s4 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s4 -; VI-NEXT: s_mov_b32 s14, s4 -; VI-NEXT: s_mov_b32 s15, s4 -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s4 -; VI-NEXT: s_mov_b32 s18, s4 -; VI-NEXT: v_mov_b32_e32 v50, s19 -; VI-NEXT: v_mov_b32_e32 v49, s18 -; VI-NEXT: v_mov_b32_e32 v48, s17 -; VI-NEXT: v_mov_b32_e32 v47, s16 -; VI-NEXT: v_mov_b32_e32 v46, s15 -; VI-NEXT: v_mov_b32_e32 v45, s14 -; VI-NEXT: v_mov_b32_e32 v44, s13 -; VI-NEXT: v_mov_b32_e32 v43, s12 -; VI-NEXT: v_mov_b32_e32 v42, s11 -; VI-NEXT: v_mov_b32_e32 v41, s10 -; VI-NEXT: v_mov_b32_e32 v40, s9 -; VI-NEXT: v_mov_b32_e32 v39, s8 -; VI-NEXT: v_mov_b32_e32 v38, s7 -; VI-NEXT: v_mov_b32_e32 v37, s6 -; VI-NEXT: v_mov_b32_e32 v36, s5 -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 
0, v0 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB137_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v50, v18 -; VI-NEXT: v_mov_b32_e32 v49, v17 -; VI-NEXT: v_mov_b32_e32 v48, v16 -; VI-NEXT: v_mov_b32_e32 v47, v15 -; VI-NEXT: v_mov_b32_e32 v46, v14 -; VI-NEXT: v_mov_b32_e32 v45, v13 -; VI-NEXT: v_mov_b32_e32 v44, v12 -; VI-NEXT: v_mov_b32_e32 v43, v11 -; VI-NEXT: v_mov_b32_e32 v42, v10 -; VI-NEXT: v_mov_b32_e32 v41, v9 -; VI-NEXT: v_mov_b32_e32 v40, v8 -; VI-NEXT: v_mov_b32_e32 v39, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 -; VI-NEXT: v_mov_b32_e32 v37, v5 -; VI-NEXT: v_mov_b32_e32 v36, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: .LBB137_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: s_movk_i32 s4, 0x70 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x60 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x50 -; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] -; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v64i16_to_v64bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte 
Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s19, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_mov_b32 s6, s4 -; GFX9-NEXT: s_mov_b32 s7, s4 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s4 -; GFX9-NEXT: s_mov_b32 s10, s4 -; GFX9-NEXT: s_mov_b32 s11, s4 -; GFX9-NEXT: s_mov_b32 s12, s4 -; GFX9-NEXT: s_mov_b32 s13, s4 -; GFX9-NEXT: s_mov_b32 s14, s4 -; GFX9-NEXT: s_mov_b32 s15, s4 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s4 -; GFX9-NEXT: s_mov_b32 s18, s4 -; GFX9-NEXT: v_mov_b32_e32 v50, s19 -; GFX9-NEXT: v_mov_b32_e32 v49, s18 -; GFX9-NEXT: v_mov_b32_e32 v48, s17 -; GFX9-NEXT: v_mov_b32_e32 v47, s16 -; GFX9-NEXT: v_mov_b32_e32 v46, s15 -; GFX9-NEXT: v_mov_b32_e32 v45, s14 -; GFX9-NEXT: v_mov_b32_e32 v44, s13 -; GFX9-NEXT: v_mov_b32_e32 v43, s12 -; GFX9-NEXT: v_mov_b32_e32 v42, s11 -; GFX9-NEXT: v_mov_b32_e32 v41, s10 -; GFX9-NEXT: v_mov_b32_e32 v40, s9 -; GFX9-NEXT: v_mov_b32_e32 v39, s8 -; GFX9-NEXT: v_mov_b32_e32 v38, s7 -; GFX9-NEXT: v_mov_b32_e32 v37, s6 -; GFX9-NEXT: v_mov_b32_e32 v36, s5 -; GFX9-NEXT: v_mov_b32_e32 v35, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB137_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v50, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v17 -; GFX9-NEXT: v_mov_b32_e32 v48, v16 -; GFX9-NEXT: v_mov_b32_e32 v47, v15 -; GFX9-NEXT: v_mov_b32_e32 v46, v14 -; GFX9-NEXT: 
v_mov_b32_e32 v45, v13 -; GFX9-NEXT: v_mov_b32_e32 v44, v12 -; GFX9-NEXT: v_mov_b32_e32 v43, v11 -; GFX9-NEXT: v_mov_b32_e32 v42, v10 -; GFX9-NEXT: v_mov_b32_e32 v41, v9 -; GFX9-NEXT: v_mov_b32_e32 v40, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v7 -; GFX9-NEXT: v_mov_b32_e32 v38, v6 -; GFX9-NEXT: v_mov_b32_e32 v37, v5 -; GFX9-NEXT: v_mov_b32_e32 v36, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB137_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded 
Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v64i16_to_v64bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s15, s0 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s0 -; GFX11-NEXT: s_mov_b32 s10, s0 -; GFX11-NEXT: s_mov_b32 s11, s0 -; GFX11-NEXT: s_mov_b32 s12, s0 -; GFX11-NEXT: s_mov_b32 s13, s0 -; GFX11-NEXT: s_mov_b32 s14, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14 -; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 -; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 -; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 -; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 -; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 -; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 -; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 -; GFX11-NEXT: v_dual_mov_b32 v35, v51 
:: v_dual_mov_b32 v36, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 -; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 -; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 -; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 -; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 -; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 -; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB137_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 -; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 -; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 -; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 -; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 -; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 -; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 -; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 -; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 -; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 -; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 -; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 -; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 -; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 -; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 -; GFX11-NEXT: .LBB137_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off -; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 -; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 -; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 -; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <64 x i16> %value to <64 x bfloat> - 
br label %end - -end: - %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <64 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v64f16_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <64 x half> %value) { -; GCN-LABEL: v_bitcast_v64f16_to_v64bf16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: v_mov_b32_e32 v29, v16 -; GCN-NEXT: v_mov_b32_e32 v16, v15 -; GCN-NEXT: v_mov_b32_e32 v15, v14 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill 
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:76
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:44
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:16
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32
-; GCN-NEXT: v_mov_b32_e32 v14, 0
-; GCN-NEXT: v_mov_b32_e32 v57, 0
-; GCN-NEXT: v_mov_b32_e32 v59, 0
-; GCN-NEXT: v_mov_b32_e32 v56, 0
-; GCN-NEXT: v_mov_b32_e32 v58, 0
-; GCN-NEXT: v_mov_b32_e32 v45, 0
-; GCN-NEXT: v_mov_b32_e32 v47, 0
-; GCN-NEXT: v_mov_b32_e32 v44, 0
-; GCN-NEXT: v_mov_b32_e32 v46, 0
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v43, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v41, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v42, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v53, 0
-; GCN-NEXT: v_mov_b32_e32 v55, 0
-; GCN-NEXT: v_mov_b32_e32 v51, 0
-; GCN-NEXT: v_mov_b32_e32 v54, 0
-; GCN-NEXT: v_mov_b32_e32 v34, 0
-; GCN-NEXT: v_mov_b32_e32 v52, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v33, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB138_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_cvt_f16_f32_e32 v57, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v59, v4
-; GCN-NEXT: v_cvt_f16_f32_e32 v56, v5
-; GCN-NEXT: v_cvt_f16_f32_e32 v58, v6
-; GCN-NEXT: v_cvt_f16_f32_e32 v45, v7
-; GCN-NEXT: v_cvt_f16_f32_e32 v47, v8
-; GCN-NEXT: v_cvt_f16_f32_e32 v44, v9
-; GCN-NEXT: v_cvt_f16_f32_e32 v46, v10
-; GCN-NEXT: v_cvt_f16_f32_e32 v41, v11
-; GCN-NEXT: v_cvt_f16_f32_e32 v43, v12
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v13
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v15
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: v_cvt_f16_f32_e32 v13, v16
-; GCN-NEXT: v_cvt_f16_f32_e32 v12, v29
-; GCN-NEXT: v_cvt_f16_f32_e32 v17, v17
-; GCN-NEXT: v_cvt_f16_f32_e32 v18, v18
-; GCN-NEXT: v_cvt_f16_f32_e32 v19, v19
-; GCN-NEXT: v_cvt_f16_f32_e32 v20, v20
-; GCN-NEXT: v_cvt_f16_f32_e32 v21, v21
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v51, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v33, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v11, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v52, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v34, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v29, v3
-; GCN-NEXT: v_cvt_f16_f32_e32 v22, v22
-; GCN-NEXT: v_cvt_f16_f32_e32 v30, v30
-; GCN-NEXT: v_cvt_f16_f32_e32 v50, v50
-; GCN-NEXT: v_cvt_f16_f32_e32 v49, v49
-; GCN-NEXT: v_cvt_f16_f32_e32 v23, v23
-; GCN-NEXT: v_cvt_f16_f32_e32 v61, v61
-; GCN-NEXT: v_cvt_f16_f32_e32 v32, v32
-; GCN-NEXT: v_cvt_f16_f32_e32 v48, v48
-; GCN-NEXT: v_cvt_f16_f32_e32 v24, v24
-; GCN-NEXT: v_cvt_f16_f32_e32 v39, v39
-; GCN-NEXT: v_cvt_f16_f32_e32 v60, v60
-; GCN-NEXT: v_cvt_f16_f32_e32 v25, v25
-; GCN-NEXT: v_cvt_f16_f32_e32 v38, v38
-; GCN-NEXT: v_cvt_f16_f32_e32 v37, v37
-; GCN-NEXT: v_cvt_f16_f32_e32 v26, v26
-; GCN-NEXT: v_cvt_f16_f32_e32 v31, v31
-; GCN-NEXT: v_cvt_f16_f32_e32 v36, v36
-; GCN-NEXT: v_cvt_f16_f32_e32 v27, v27
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v63, v63
-; GCN-NEXT: v_cvt_f16_f32_e32 v28, v28
-; GCN-NEXT: v_cvt_f16_f32_e32 v35, v35
-; GCN-NEXT: v_cvt_f16_f32_e32 v62, v62
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v7, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v53, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v8, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v9, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v54, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v10, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v55, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v40, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v42, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v16, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v15, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v14, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v57, 16, v57
-; GCN-NEXT: v_lshlrev_b32_e32 v59, 16, v59
-; GCN-NEXT: v_lshlrev_b32_e32 v56, 16, v56
-; GCN-NEXT: v_lshlrev_b32_e32 v58, 16, v58
-; GCN-NEXT: v_lshlrev_b32_e32 v45, 16, v45
-; GCN-NEXT: v_lshlrev_b32_e32 v47, 16, v47
-; GCN-NEXT: v_lshlrev_b32_e32 v44, 16, v44
-; GCN-NEXT: v_lshlrev_b32_e32 v46, 16, v46
-; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v43, 16, v43
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v41, 16, v41
-; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v17
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v18
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v19
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v20
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v21
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v51
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v33
-; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v52
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v34
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v29
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v22
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v30
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v50
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v49
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v23
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v61
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v32
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v48
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v24
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v39
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v60
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v25
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v38
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v37
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v26
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v31
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v36
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v27
-; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v63
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v28
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v35
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v62
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v53
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v8
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v9
-; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v54
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v53, 16, v10
-; GCN-NEXT: v_lshlrev_b32_e32 v55, 16, v55
-; GCN-NEXT: v_lshlrev_b32_e32 v51, 16, v40
-; GCN-NEXT: v_mov_b32_e32 v40, v3
-; GCN-NEXT: v_lshlrev_b32_e32 v54, 16, v42
-; GCN-NEXT: v_mov_b32_e32 v42, v4
-; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v16
-; GCN-NEXT: v_lshlrev_b32_e32 v52, 16, v15
-; GCN-NEXT: v_lshlrev_b32_e32 v33, 16, v14
-; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v0
-; GCN-NEXT: .LBB138_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v59
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v57
-; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v58
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v56
-; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v47
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v45
-; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v46
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v44
-; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v43
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v41
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_alignbit_b32 v6, v0,
v6, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v42 -; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v40 -; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v53 -; GCN-NEXT: v_alignbit_b32 v3, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: 
v_mul_f32_e32 v4, 1.0, v51 -; GCN-NEXT: v_alignbit_b32 v4, v0, v4, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v34 -; GCN-NEXT: v_alignbit_b32 v5, v0, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v14 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v33 -; GCN-NEXT: v_alignbit_b32 v6, v0, v6, 16 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v64f16_to_v64bf16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s19, s4 -; VI-NEXT: s_mov_b32 s5, s4 -; VI-NEXT: s_mov_b32 s6, s4 -; VI-NEXT: s_mov_b32 s7, s4 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s4 -; VI-NEXT: s_mov_b32 s10, s4 -; VI-NEXT: s_mov_b32 s11, s4 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s4 -; VI-NEXT: s_mov_b32 s14, s4 -; VI-NEXT: s_mov_b32 s15, s4 -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s4 -; VI-NEXT: s_mov_b32 s18, s4 -; VI-NEXT: v_mov_b32_e32 v50, s19 -; VI-NEXT: v_mov_b32_e32 v49, s18 -; VI-NEXT: v_mov_b32_e32 
v48, s17 -; VI-NEXT: v_mov_b32_e32 v47, s16 -; VI-NEXT: v_mov_b32_e32 v46, s15 -; VI-NEXT: v_mov_b32_e32 v45, s14 -; VI-NEXT: v_mov_b32_e32 v44, s13 -; VI-NEXT: v_mov_b32_e32 v43, s12 -; VI-NEXT: v_mov_b32_e32 v42, s11 -; VI-NEXT: v_mov_b32_e32 v41, s10 -; VI-NEXT: v_mov_b32_e32 v40, s9 -; VI-NEXT: v_mov_b32_e32 v39, s8 -; VI-NEXT: v_mov_b32_e32 v38, s7 -; VI-NEXT: v_mov_b32_e32 v37, s6 -; VI-NEXT: v_mov_b32_e32 v36, s5 -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB138_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v50, v18 -; VI-NEXT: v_mov_b32_e32 v49, v17 -; VI-NEXT: v_mov_b32_e32 v48, v16 -; VI-NEXT: v_mov_b32_e32 v47, v15 -; VI-NEXT: v_mov_b32_e32 v46, v14 -; VI-NEXT: v_mov_b32_e32 v45, v13 -; VI-NEXT: v_mov_b32_e32 v44, v12 -; VI-NEXT: v_mov_b32_e32 v43, v11 -; VI-NEXT: v_mov_b32_e32 v42, v10 -; VI-NEXT: v_mov_b32_e32 v41, v9 -; VI-NEXT: v_mov_b32_e32 v40, v8 -; VI-NEXT: v_mov_b32_e32 v39, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 -; VI-NEXT: v_mov_b32_e32 v37, v5 -; VI-NEXT: v_mov_b32_e32 v36, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: 
s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: .LBB138_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: s_movk_i32 s4, 0x70 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x60 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x50 -; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] -; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, 
off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v64f16_to_v64bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s19, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_mov_b32 s6, s4 -; GFX9-NEXT: s_mov_b32 s7, s4 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s4 -; GFX9-NEXT: s_mov_b32 s10, s4 -; GFX9-NEXT: s_mov_b32 s11, s4 -; GFX9-NEXT: s_mov_b32 s12, s4 -; GFX9-NEXT: s_mov_b32 s13, s4 -; GFX9-NEXT: s_mov_b32 s14, s4 -; GFX9-NEXT: s_mov_b32 s15, s4 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s4 -; GFX9-NEXT: s_mov_b32 s18, s4 -; GFX9-NEXT: v_mov_b32_e32 v50, s19 -; GFX9-NEXT: v_mov_b32_e32 v49, s18 -; GFX9-NEXT: v_mov_b32_e32 v48, s17 -; GFX9-NEXT: v_mov_b32_e32 v47, s16 -; GFX9-NEXT: v_mov_b32_e32 v46, s15 -; GFX9-NEXT: v_mov_b32_e32 v45, s14 -; GFX9-NEXT: v_mov_b32_e32 v44, s13 -; GFX9-NEXT: v_mov_b32_e32 v43, s12 -; GFX9-NEXT: v_mov_b32_e32 v42, s11 -; GFX9-NEXT: v_mov_b32_e32 v41, s10 -; GFX9-NEXT: v_mov_b32_e32 v40, s9 -; GFX9-NEXT: v_mov_b32_e32 v39, s8 -; GFX9-NEXT: v_mov_b32_e32 v38, s7 -; GFX9-NEXT: v_mov_b32_e32 v37, s6 -; GFX9-NEXT: v_mov_b32_e32 v36, s5 -; GFX9-NEXT: v_mov_b32_e32 v35, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded 
Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB138_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v50, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v17 -; GFX9-NEXT: v_mov_b32_e32 v48, v16 -; GFX9-NEXT: v_mov_b32_e32 v47, v15 -; GFX9-NEXT: v_mov_b32_e32 v46, v14 -; GFX9-NEXT: v_mov_b32_e32 v45, v13 -; GFX9-NEXT: v_mov_b32_e32 v44, v12 -; GFX9-NEXT: v_mov_b32_e32 v43, v11 -; GFX9-NEXT: v_mov_b32_e32 v42, v10 -; GFX9-NEXT: v_mov_b32_e32 v41, v9 -; GFX9-NEXT: v_mov_b32_e32 v40, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v7 -; GFX9-NEXT: v_mov_b32_e32 v38, v6 -; GFX9-NEXT: v_mov_b32_e32 v37, v5 -; GFX9-NEXT: v_mov_b32_e32 v36, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB138_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v64f16_to_v64bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s15, s0 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s0 -; GFX11-NEXT: s_mov_b32 s10, s0 -; GFX11-NEXT: s_mov_b32 s11, s0 -; GFX11-NEXT: s_mov_b32 s12, s0 -; GFX11-NEXT: s_mov_b32 s13, s0 -; GFX11-NEXT: s_mov_b32 s14, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: 
v_dual_mov_b32 v65, s14 -; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0 -; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12 -; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10 -; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8 -; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6 -; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4 -; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2 -; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56 -; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54 -; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58 -; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60 -; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62 -; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64 -; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB138_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 -; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15 -; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13 -; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11 -; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9 -; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7 -; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5 -; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3 -; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 -; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29 -; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27 -; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25 -; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23 -; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21 -; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19 -; GFX11-NEXT: .LBB138_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off -; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112 -; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96 -; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80 -; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64 -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 -; GFX11-NEXT: 
scratch_load_b32 v45, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <64 x half> %value to <64 x bfloat>
- br label %end
-
-end:
- %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if]
- store <64 x bfloat> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v128i8_to_v64bf16(i32 %cond, ptr addrspace(1) %out, <128 x i8> %value) {
-; GCN-LABEL: v_bitcast_v128i8_to_v64bf16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v16, 
off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:284 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:224 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:212 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:192 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill
[collapsed run of removed auto-generated GCN check lines: buffer_load_dword reloads of the stacked i8 arguments (s32 offsets 132 down to 0) interleaved with buffer_store_dword 4-byte Folded Spills, followed by v_mov_b32_e32 zero-initialization of the result VGPRs]
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB139_2
-; GCN-NEXT: ; %bb.1: ; %if
[collapsed run of removed GCN check lines: v_and_b32 0xff, v_lshlrev_b32 8/16/24, and v_or_b32 byte packing of the reloaded bytes into dwords, with further 4-byte Folded Reload/Spill traffic]
-; GCN-NEXT: .LBB139_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
[collapsed run of removed GCN check lines: v_mul_f32 1.0 canonicalization with v_lshrrev_b32 16 / v_alignbit_b32 bf16 repacking, buffer_store_dwordx4 stores at addr64 offsets 0 through 112, and callee-save VGPR reloads of v63 through v40]
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v128i8_to_v64bf16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
[collapsed run of removed VI check lines: callee-save VGPR spills, buffer_load_ushort reloads of the byte arguments (s32 offsets 396 down to 0) with matching 4-byte Folded Spills, s_mov_b32/v_mov_b32 zero fan-out into v31 through v62, s_and_saveexec_b64 s[4:5], vcc / s_cbranch_execz .LBB139_2, and the start of the "; %bb.1: ; %if" v_or_b32_sdwa byte-packing sequence, ending with:]
-; VI-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v42, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v20 -; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v19 -; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v43, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v63 -; VI-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v44, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v45, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v46, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v47, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; VI-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v48, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v49, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v50, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v51, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; 
VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v52, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v53, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v54, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v55, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v56, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v57, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v58, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v59, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v60, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) 
-; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v61, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v62, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: .LBB139_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x70 -; VI-NEXT: flat_store_dwordx4 v[3:4], v[35:38] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[31:34] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x60 -; VI-NEXT: flat_store_dwordx4 v[3:4], v[59:62] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x50 -; VI-NEXT: flat_store_dwordx4 v[3:4], v[55:58] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[51:54] -; VI-NEXT: flat_store_dwordx4 v[0:1], v[47:50] -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, 
off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v128i8_to_v64bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:396 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s19, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_mov_b32 s6, s4 -; GFX9-NEXT: s_mov_b32 s7, s4 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s4 -; GFX9-NEXT: s_mov_b32 s10, s4 -; GFX9-NEXT: s_mov_b32 s11, s4 -; GFX9-NEXT: s_mov_b32 s12, s4 -; GFX9-NEXT: s_mov_b32 s13, s4 -; GFX9-NEXT: s_mov_b32 s14, s4 -; GFX9-NEXT: s_mov_b32 s15, s4 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s4 -; GFX9-NEXT: s_mov_b32 s18, s4 -; GFX9-NEXT: v_mov_b32_e32 v46, s19 -; GFX9-NEXT: v_mov_b32_e32 v45, s18 -; GFX9-NEXT: v_mov_b32_e32 v44, s17 -; GFX9-NEXT: v_mov_b32_e32 v43, s16 -; GFX9-NEXT: v_mov_b32_e32 v42, s15 -; GFX9-NEXT: v_mov_b32_e32 v41, s14 -; GFX9-NEXT: v_mov_b32_e32 v40, s13 -; GFX9-NEXT: v_mov_b32_e32 v39, s12 -; GFX9-NEXT: v_mov_b32_e32 v38, s11 -; GFX9-NEXT: v_mov_b32_e32 v37, s10 -; GFX9-NEXT: v_mov_b32_e32 v36, s9 -; GFX9-NEXT: v_mov_b32_e32 v35, s8 -; GFX9-NEXT: v_mov_b32_e32 v34, s7 -; GFX9-NEXT: v_mov_b32_e32 v33, s6 -; GFX9-NEXT: v_mov_b32_e32 v32, s5 -; GFX9-NEXT: v_mov_b32_e32 v31, s4 -; GFX9-NEXT: v_mov_b32_e32 v62, v46 -; GFX9-NEXT: v_mov_b32_e32 v61, v45 -; GFX9-NEXT: v_mov_b32_e32 v60, v44 -; GFX9-NEXT: v_mov_b32_e32 v59, v43 -; GFX9-NEXT: v_mov_b32_e32 v58, v42 -; GFX9-NEXT: v_mov_b32_e32 v57, v41 -; GFX9-NEXT: v_mov_b32_e32 v56, v40 -; GFX9-NEXT: v_mov_b32_e32 v55, v39 -; GFX9-NEXT: v_mov_b32_e32 v54, v38 -; GFX9-NEXT: v_mov_b32_e32 v53, v37 -; GFX9-NEXT: v_mov_b32_e32 v52, v36 -; GFX9-NEXT: v_mov_b32_e32 v51, v35 -; GFX9-NEXT: v_mov_b32_e32 v50, v34 -; GFX9-NEXT: v_mov_b32_e32 v49, v33 -; GFX9-NEXT: v_mov_b32_e32 v48, v32 -; GFX9-NEXT: v_mov_b32_e32 v47, v31 -; GFX9-NEXT: s_waitcnt vmcnt(44) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:392 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:388 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:384 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:380 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:376 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:372 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:368 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:364 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:360 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:356 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:352 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:348 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:344 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:340 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:336 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:328 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:320 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:304 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:296 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:288 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:280 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:272 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:264 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:256 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:252 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:240 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:232 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, 
s[0:3], s32 offset:224 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:216 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:212 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:208 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:204 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:200 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:196 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:192 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:188 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:180 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:176 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:172 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:168 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:160 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:156 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:152 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:148 -; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:140 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:136 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:132 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:124 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB139_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b32 s6, 0x5040100 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v28, 8, v28 -; GFX9-NEXT: v_lshlrev_b16_e32 v21, 8, v21 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_or_b32_sdwa v31, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 -; GFX9-NEXT: v_or_b32_sdwa v32, v33, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v31, v32, v31, s6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v32, v34, v33, s6 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v33, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v33, v34, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v33, v34, v33, s6 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v34, v35, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v34, v35, v34, s6 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v35, v36, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v35, v36, v35, s6 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v36, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v36, v37, v36, s6 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v37, 8, v37 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v37, v38, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v38, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v38, v39, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v37, v38, v37, s6 -; GFX9-NEXT: v_perm_b32 v38, v11, v12, s6 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v30 -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v24 -; GFX9-NEXT: v_or_b32_sdwa v11, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v39, v12, v11, s6 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v27 -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v23 -; GFX9-NEXT: v_or_b32_sdwa v11, v16, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v15, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v40, v12, v11, s6 -; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v29 -; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v26 -; GFX9-NEXT: v_or_b32_sdwa v11, v18, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v41, v4, v11, s6 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v17 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v25 -; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v42, v4, v3, s6 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v20 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v19 -; GFX9-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v43, v4, v3, s6 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v63 -; GFX9-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v44, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v45, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v46, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v47, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX9-NEXT: 
v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v5 -; GFX9-NEXT: v_or_b32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v48, v5, v4, s6 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v49, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v50, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v51, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v52, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v53, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v54, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v55, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v56, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v57, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v58, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v59, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v60, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v61, v3, v0, s6 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_perm_b32 v62, v3, v0, s6 -; GFX9-NEXT: .LBB139_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[59:62], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[55:58], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[51:54], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:64 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v128i8_to_v64bf16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:600 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:596 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:592 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:588 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:584 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:580 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:576 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:572 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:568 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:564 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:560 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:556 -; 
GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:552 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:548 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:544 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:540 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:536 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:532 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:528 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:524 -; GFX11-NEXT: scratch_store_b32 off, v76, s32 offset:520 -; GFX11-NEXT: scratch_store_b32 off, v77, s32 offset:516 -; GFX11-NEXT: scratch_store_b32 off, v78, s32 offset:512 -; GFX11-NEXT: scratch_store_b32 off, v79, s32 offset:508 -; GFX11-NEXT: scratch_store_b32 off, v88, s32 offset:504 -; GFX11-NEXT: scratch_store_b32 off, v89, s32 offset:500 -; GFX11-NEXT: scratch_store_b32 off, v90, s32 offset:496 -; GFX11-NEXT: scratch_store_b32 off, v91, s32 offset:492 -; GFX11-NEXT: scratch_store_b32 off, v92, s32 offset:488 -; GFX11-NEXT: scratch_store_b32 off, v93, s32 offset:484 -; GFX11-NEXT: scratch_store_b32 off, v94, s32 offset:480 -; GFX11-NEXT: scratch_store_b32 off, v95, s32 offset:476 -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_store_b32 off, v104, s32 offset:472 -; GFX11-NEXT: scratch_store_b32 off, v105, s32 offset:468 -; GFX11-NEXT: scratch_store_b32 off, v106, s32 offset:464 -; GFX11-NEXT: scratch_store_b32 off, v107, s32 offset:460 -; GFX11-NEXT: scratch_store_b32 off, v108, s32 offset:456 -; GFX11-NEXT: scratch_store_b32 off, v109, s32 offset:452 -; GFX11-NEXT: scratch_store_b32 off, v110, s32 offset:448 -; GFX11-NEXT: scratch_store_b32 off, v111, s32 offset:444 -; GFX11-NEXT: scratch_store_b32 off, v120, s32 offset:440 -; GFX11-NEXT: scratch_store_b32 off, v121, s32 offset:436 -; GFX11-NEXT: scratch_store_b32 off, v122, s32 offset:432 -; GFX11-NEXT: scratch_store_b32 off, v123, s32 offset:428 -; GFX11-NEXT: scratch_store_b32 off, v124, s32 offset:424 -; GFX11-NEXT: scratch_store_b32 off, v125, s32 offset:420 -; GFX11-NEXT: scratch_store_b32 off, v126, s32 offset:416 -; GFX11-NEXT: scratch_store_b32 off, v127, s32 offset:412 -; GFX11-NEXT: scratch_store_b32 off, v136, s32 offset:408 -; GFX11-NEXT: scratch_store_b32 off, v137, s32 offset:404 -; GFX11-NEXT: scratch_store_b32 off, v138, s32 offset:400 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_u16 v64, off, s32 offset:396 -; GFX11-NEXT: scratch_load_u16 v65, off, s32 offset:392 -; GFX11-NEXT: scratch_load_u16 v66, off, s32 offset:388 -; GFX11-NEXT: scratch_load_u16 v67, off, s32 offset:384 -; GFX11-NEXT: scratch_load_u16 v68, off, s32 offset:380 -; GFX11-NEXT: scratch_load_u16 v69, off, s32 offset:376 -; GFX11-NEXT: scratch_load_u16 v70, off, s32 offset:372 -; GFX11-NEXT: scratch_load_u16 v71, off, s32 offset:368 -; GFX11-NEXT: scratch_load_u16 v80, off, s32 offset:364 -; GFX11-NEXT: scratch_load_u16 v81, off, s32 offset:360 -; GFX11-NEXT: scratch_load_u16 v82, off, s32 offset:356 -; GFX11-NEXT: scratch_load_u16 v83, off, s32 offset:352 -; GFX11-NEXT: scratch_load_u16 v84, off, s32 offset:348 -; GFX11-NEXT: scratch_load_u16 v85, off, s32 offset:344 -; GFX11-NEXT: scratch_load_u16 v86, off, s32 offset:340 -; GFX11-NEXT: scratch_load_u16 v87, off, s32 offset:336 -; GFX11-NEXT: scratch_load_u16 v96, off, s32 offset:332 -; GFX11-NEXT: scratch_load_u16 v97, off, s32 offset:328 -; GFX11-NEXT: scratch_load_u16 v98, off, s32 offset:324 -; GFX11-NEXT: scratch_load_u16 v99, off, s32 offset:320 -; GFX11-NEXT: scratch_load_u16 v100, off, s32 offset:316 -; GFX11-NEXT: 
scratch_load_u16 v101, off, s32 offset:312 -; GFX11-NEXT: scratch_load_u16 v102, off, s32 offset:308 -; GFX11-NEXT: scratch_load_u16 v103, off, s32 offset:304 -; GFX11-NEXT: scratch_load_u16 v112, off, s32 offset:300 -; GFX11-NEXT: scratch_load_u16 v113, off, s32 offset:296 -; GFX11-NEXT: scratch_load_u16 v114, off, s32 offset:292 -; GFX11-NEXT: scratch_load_u16 v115, off, s32 offset:288 -; GFX11-NEXT: scratch_load_u16 v116, off, s32 offset:284 -; GFX11-NEXT: scratch_load_u16 v117, off, s32 offset:280 -; GFX11-NEXT: scratch_load_u16 v118, off, s32 offset:276 -; GFX11-NEXT: scratch_load_u16 v119, off, s32 offset:272 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_u16 v128, off, s32 offset:268 -; GFX11-NEXT: scratch_load_u16 v129, off, s32 offset:264 -; GFX11-NEXT: scratch_load_u16 v130, off, s32 offset:260 -; GFX11-NEXT: scratch_load_u16 v131, off, s32 offset:256 -; GFX11-NEXT: scratch_load_u16 v132, off, s32 offset:252 -; GFX11-NEXT: scratch_load_u16 v133, off, s32 offset:248 -; GFX11-NEXT: scratch_load_u16 v134, off, s32 offset:244 -; GFX11-NEXT: scratch_load_u16 v135, off, s32 offset:240 -; GFX11-NEXT: scratch_load_u16 v144, off, s32 offset:236 -; GFX11-NEXT: scratch_load_u16 v145, off, s32 offset:232 -; GFX11-NEXT: scratch_load_u16 v146, off, s32 offset:228 -; GFX11-NEXT: scratch_load_u16 v147, off, s32 offset:224 -; GFX11-NEXT: scratch_load_u16 v148, off, s32 offset:220 -; GFX11-NEXT: scratch_load_u16 v149, off, s32 offset:216 -; GFX11-NEXT: scratch_load_u16 v150, off, s32 offset:212 -; GFX11-NEXT: scratch_load_u16 v151, off, s32 offset:208 -; GFX11-NEXT: scratch_load_u16 v160, off, s32 offset:204 -; GFX11-NEXT: scratch_load_u16 v161, off, s32 offset:200 -; GFX11-NEXT: scratch_load_u16 v162, off, s32 offset:196 -; GFX11-NEXT: scratch_load_u16 v163, off, s32 offset:192 -; GFX11-NEXT: scratch_load_u16 v164, off, s32 offset:188 -; GFX11-NEXT: scratch_load_u16 v165, off, s32 offset:184 -; GFX11-NEXT: scratch_load_u16 v166, off, s32 offset:180 -; GFX11-NEXT: scratch_load_u16 v167, off, s32 offset:176 -; GFX11-NEXT: scratch_load_u16 v176, off, s32 offset:172 -; GFX11-NEXT: scratch_load_u16 v177, off, s32 offset:168 -; GFX11-NEXT: scratch_load_u16 v178, off, s32 offset:164 -; GFX11-NEXT: scratch_load_u16 v179, off, s32 offset:160 -; GFX11-NEXT: scratch_load_u16 v180, off, s32 offset:156 -; GFX11-NEXT: scratch_load_u16 v181, off, s32 offset:152 -; GFX11-NEXT: scratch_load_u16 v182, off, s32 offset:148 -; GFX11-NEXT: scratch_load_u16 v183, off, s32 offset:144 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_u16 v63, off, s32 offset:140 -; GFX11-NEXT: scratch_load_u16 v72, off, s32 offset:136 -; GFX11-NEXT: scratch_load_u16 v73, off, s32 offset:132 -; GFX11-NEXT: scratch_load_u16 v74, off, s32 offset:128 -; GFX11-NEXT: scratch_load_u16 v75, off, s32 offset:124 -; GFX11-NEXT: scratch_load_u16 v76, off, s32 offset:120 -; GFX11-NEXT: scratch_load_u16 v77, off, s32 offset:116 -; GFX11-NEXT: scratch_load_u16 v78, off, s32 offset:112 -; GFX11-NEXT: scratch_load_u16 v79, off, s32 offset:108 -; GFX11-NEXT: scratch_load_u16 v88, off, s32 offset:104 -; GFX11-NEXT: scratch_load_u16 v89, off, s32 offset:100 -; GFX11-NEXT: scratch_load_u16 v90, off, s32 offset:96 -; GFX11-NEXT: scratch_load_u16 v91, off, s32 offset:92 -; GFX11-NEXT: scratch_load_u16 v92, off, s32 offset:88 -; GFX11-NEXT: scratch_load_u16 v93, off, s32 offset:84 -; GFX11-NEXT: scratch_load_u16 v94, off, s32 offset:80 -; GFX11-NEXT: scratch_load_u16 v95, off, s32 offset:76 -; GFX11-NEXT: scratch_load_u16 v104, off, s32 
offset:72 -; GFX11-NEXT: scratch_load_u16 v105, off, s32 offset:68 -; GFX11-NEXT: scratch_load_u16 v106, off, s32 offset:64 -; GFX11-NEXT: scratch_load_u16 v107, off, s32 offset:60 -; GFX11-NEXT: scratch_load_u16 v108, off, s32 offset:56 -; GFX11-NEXT: scratch_load_u16 v109, off, s32 offset:52 -; GFX11-NEXT: scratch_load_u16 v110, off, s32 offset:48 -; GFX11-NEXT: scratch_load_u16 v111, off, s32 offset:44 -; GFX11-NEXT: scratch_load_u16 v120, off, s32 offset:40 -; GFX11-NEXT: scratch_load_u16 v121, off, s32 offset:36 -; GFX11-NEXT: scratch_load_u16 v122, off, s32 offset:32 -; GFX11-NEXT: scratch_load_u16 v123, off, s32 offset:28 -; GFX11-NEXT: scratch_load_u16 v124, off, s32 offset:24 -; GFX11-NEXT: scratch_load_u16 v125, off, s32 offset:20 -; GFX11-NEXT: scratch_load_u16 v126, off, s32 offset:16 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_u16 v127, off, s32 offset:12 -; GFX11-NEXT: scratch_load_u16 v136, off, s32 offset:8 -; GFX11-NEXT: scratch_load_u16 v137, off, s32 offset:4 -; GFX11-NEXT: scratch_load_u16 v138, off, s32 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_mov_b32 s15, s0 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s0 -; GFX11-NEXT: s_mov_b32 s10, s0 -; GFX11-NEXT: s_mov_b32 s11, s0 -; GFX11-NEXT: s_mov_b32 s12, s0 -; GFX11-NEXT: s_mov_b32 s13, s0 -; GFX11-NEXT: s_mov_b32 s14, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v46, s15 :: v_dual_mov_b32 v45, s14 -; GFX11-NEXT: v_dual_mov_b32 v44, s13 :: v_dual_mov_b32 v43, s12 -; GFX11-NEXT: v_dual_mov_b32 v42, s11 :: v_dual_mov_b32 v41, s10 -; GFX11-NEXT: v_dual_mov_b32 v40, s9 :: v_dual_mov_b32 v39, s8 -; GFX11-NEXT: v_dual_mov_b32 v38, s7 :: v_dual_mov_b32 v37, s6 -; GFX11-NEXT: v_dual_mov_b32 v36, s5 :: v_dual_mov_b32 v35, s4 -; GFX11-NEXT: v_dual_mov_b32 v34, s3 :: v_dual_mov_b32 v33, s2 -; GFX11-NEXT: v_dual_mov_b32 v32, s1 :: v_dual_mov_b32 v31, s0 -; GFX11-NEXT: v_dual_mov_b32 v62, v46 :: v_dual_mov_b32 v61, v45 -; GFX11-NEXT: v_dual_mov_b32 v60, v44 :: v_dual_mov_b32 v59, v43 -; GFX11-NEXT: v_dual_mov_b32 v58, v42 :: v_dual_mov_b32 v57, v41 -; GFX11-NEXT: v_dual_mov_b32 v56, v40 :: v_dual_mov_b32 v55, v39 -; GFX11-NEXT: v_dual_mov_b32 v54, v38 :: v_dual_mov_b32 v53, v37 -; GFX11-NEXT: v_dual_mov_b32 v52, v36 :: v_dual_mov_b32 v51, v35 -; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33 -; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31 -; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-NEXT: s_cbranch_execz .LBB139_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v3 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v6 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v8 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v10 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v15 -; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v16 -; GFX11-NEXT: v_perm_b32 v31, v3, v0, 0x5040100 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, 
v12 -; GFX11-NEXT: v_perm_b32 v32, v5, v4, 0x5040100 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v14 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v17 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v18 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v19 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v20 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v22 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v23 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v24 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v25 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v26 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v27 -; GFX11-NEXT: v_lshlrev_b16 v14, 8, v28 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v29 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v30 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-NEXT: v_perm_b32 v33, v3, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v34, v5, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v35, v7, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v36, v9, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v37, v11, v10, 0x5040100 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v138 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v137 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v136 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v127 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v126 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v125 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v124 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v123 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v122 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v121 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v120 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v111 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v110 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v109 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v108 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v107 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v106 -; GFX11-NEXT: v_lshlrev_b16 v14, 8, v105 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v104 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v95 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-NEXT: v_perm_b32 v38, v3, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v39, v5, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v40, v7, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v41, v9, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v42, v11, v10, 0x5040100 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v94 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v93 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v92 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v91 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v90 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v89 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v88 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v79 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v78 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v77 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-NEXT: 
v_and_b32_e32 v7, 0xff, v76 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v75 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v74 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v73 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v72 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v63 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v183 -; GFX11-NEXT: v_lshlrev_b16 v14, 8, v182 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v181 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v180 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-NEXT: v_perm_b32 v43, v3, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v44, v5, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v45, v7, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v46, v9, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v47, v11, v10, 0x5040100 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v179 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v178 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v177 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v176 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v167 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v166 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v165 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v164 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v163 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v162 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v161 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v160 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v151 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v150 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v149 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v148 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v147 -; GFX11-NEXT: v_lshlrev_b16 v14, 8, v146 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v145 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v144 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-NEXT: v_perm_b32 v48, v3, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v49, v5, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v50, v7, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v51, v9, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v52, v11, v10, 0x5040100 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v135 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v134 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v133 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v132 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v131 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v130 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v129 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v119 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v118 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v117 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v116 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v115 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v114 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v113 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v112 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v103 -; GFX11-NEXT: v_lshlrev_b16 v14, 8, v102 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v101 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v100 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 -; 
GFX11-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-NEXT: v_perm_b32 v53, v3, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v54, v5, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v55, v7, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v56, v9, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v57, v11, v10, 0x5040100 -; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v99 -; GFX11-NEXT: v_lshlrev_b16 v3, 8, v98 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v97 -; GFX11-NEXT: v_lshlrev_b16 v5, 8, v96 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v87 -; GFX11-NEXT: v_lshlrev_b16 v7, 8, v86 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v85 -; GFX11-NEXT: v_lshlrev_b16 v9, 8, v84 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v83 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v82 -; GFX11-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_or_b32_e32 v3, v4, v5 -; GFX11-NEXT: v_or_b32_e32 v4, v6, v7 -; GFX11-NEXT: v_or_b32_e32 v5, v8, v9 -; GFX11-NEXT: v_or_b32_e32 v6, v10, v11 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v81 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v80 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v71 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v70 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v69 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v68 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v67 -; GFX11-NEXT: v_lshlrev_b16 v14, 8, v66 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v65 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v64 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v9, v11, v12 -; GFX11-NEXT: v_or_b32_e32 v10, v13, v14 -; GFX11-NEXT: v_or_b32_e32 v11, v15, v16 -; GFX11-NEXT: v_perm_b32 v58, v3, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v59, v5, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v60, v7, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v61, v9, v8, 0x5040100 -; GFX11-NEXT: v_perm_b32 v62, v11, v10, 0x5040100 -; GFX11-NEXT: .LBB139_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[31:34], off -; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:112 -; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:96 -; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:80 -; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:64 -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v138, off, s32 offset:400 -; GFX11-NEXT: scratch_load_b32 v137, off, s32 offset:404 -; GFX11-NEXT: scratch_load_b32 v136, off, s32 offset:408 -; GFX11-NEXT: scratch_load_b32 v127, off, s32 offset:412 -; GFX11-NEXT: scratch_load_b32 v126, off, s32 offset:416 -; GFX11-NEXT: scratch_load_b32 v125, off, s32 offset:420 -; GFX11-NEXT: scratch_load_b32 v124, off, s32 offset:424 -; GFX11-NEXT: scratch_load_b32 v123, off, s32 offset:428 -; GFX11-NEXT: scratch_load_b32 v122, off, s32 offset:432 -; GFX11-NEXT: scratch_load_b32 v121, off, s32 offset:436 -; GFX11-NEXT: scratch_load_b32 v120, off, s32 offset:440 -; GFX11-NEXT: scratch_load_b32 v111, off, s32 offset:444 -; GFX11-NEXT: scratch_load_b32 v110, off, s32 offset:448 -; GFX11-NEXT: scratch_load_b32 v109, off, s32 offset:452 -; GFX11-NEXT: scratch_load_b32 v108, off, s32 offset:456 -; GFX11-NEXT: scratch_load_b32 v107, off, s32 offset:460 -; GFX11-NEXT: scratch_load_b32 v106, off, s32 offset:464 -; GFX11-NEXT: scratch_load_b32 v105, off, s32 offset:468 -; GFX11-NEXT: scratch_load_b32 v104, off, s32 
offset:472 -; GFX11-NEXT: scratch_load_b32 v95, off, s32 offset:476 -; GFX11-NEXT: scratch_load_b32 v94, off, s32 offset:480 -; GFX11-NEXT: scratch_load_b32 v93, off, s32 offset:484 -; GFX11-NEXT: scratch_load_b32 v92, off, s32 offset:488 -; GFX11-NEXT: scratch_load_b32 v91, off, s32 offset:492 -; GFX11-NEXT: scratch_load_b32 v90, off, s32 offset:496 -; GFX11-NEXT: scratch_load_b32 v89, off, s32 offset:500 -; GFX11-NEXT: scratch_load_b32 v88, off, s32 offset:504 -; GFX11-NEXT: scratch_load_b32 v79, off, s32 offset:508 -; GFX11-NEXT: scratch_load_b32 v78, off, s32 offset:512 -; GFX11-NEXT: scratch_load_b32 v77, off, s32 offset:516 -; GFX11-NEXT: scratch_load_b32 v76, off, s32 offset:520 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:524 -; GFX11-NEXT: s_clause 0x12 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:528 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:532 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:536 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:540 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:544 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:548 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:552 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:556 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:560 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:564 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:568 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:572 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:576 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:580 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:584 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:588 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:592 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:596 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:600 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <128 x i8> %value to <64 x bfloat> - br label %end - -end: - %phi = phi <64 x bfloat> [zeroinitializer, %entry], [%cast, %if] - store <64 x bfloat> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v64bf16_to_v64i16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v64bf16_to_v64i16: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 
offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v33, v31 -; GCN-NEXT: v_mov_b32_e32 v34, v31 -; GCN-NEXT: 
v_mov_b32_e32 v35, v31 -; GCN-NEXT: v_mov_b32_e32 v36, v31 -; GCN-NEXT: v_mov_b32_e32 v37, v31 -; GCN-NEXT: v_mov_b32_e32 v38, v31 -; GCN-NEXT: v_mov_b32_e32 v48, v31 -; GCN-NEXT: v_mov_b32_e32 v49, v31 -; GCN-NEXT: v_mov_b32_e32 v50, v31 -; GCN-NEXT: v_mov_b32_e32 v51, v31 -; GCN-NEXT: v_mov_b32_e32 v52, v31 -; GCN-NEXT: v_mov_b32_e32 v53, v31 -; GCN-NEXT: v_mov_b32_e32 v54, v31 -; GCN-NEXT: v_mov_b32_e32 v55, v31 -; GCN-NEXT: v_mov_b32_e32 v39, v31 -; GCN-NEXT: v_mov_b32_e32 v40, v31 -; GCN-NEXT: v_mov_b32_e32 v41, v31 -; GCN-NEXT: v_mov_b32_e32 v42, v31 -; GCN-NEXT: v_mov_b32_e32 v43, v31 -; GCN-NEXT: v_mov_b32_e32 v44, v31 -; GCN-NEXT: v_mov_b32_e32 v45, v31 -; GCN-NEXT: v_mov_b32_e32 v46, v31 -; GCN-NEXT: v_mov_b32_e32 v56, v31 -; GCN-NEXT: v_mov_b32_e32 v57, v31 -; GCN-NEXT: v_mov_b32_e32 v58, v31 -; GCN-NEXT: v_mov_b32_e32 v59, v31 -; GCN-NEXT: v_mov_b32_e32 v60, v31 -; GCN-NEXT: v_mov_b32_e32 v61, v31 -; GCN-NEXT: v_mov_b32_e32 v62, v31 -; GCN-NEXT: v_mov_b32_e32 v63, v31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB140_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: buffer_load_dword 
v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16 -; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16 -; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16 -; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16 -; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16 -; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16 -; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16 -; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16 -; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16 -; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16 -; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16 -; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16 -; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16 -; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16 -; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16 -; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16 -; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16 -; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16 -; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16 -; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16 -; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16 -; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16 -; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16 -; GCN-NEXT: 
v_alignbit_b32 v63, v21, v19, 16
-; GCN-NEXT: .LBB140_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112
-; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96
-; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt expcnt(6)
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(14)
-; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt expcnt(5)
-; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt expcnt(4)
-; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_bitcast_v64bf16_to_v64i16:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; VI-NEXT: s_mov_b32 s4, 0
-; VI-NEXT: s_mov_b32 s19, s4
-; VI-NEXT: s_mov_b32 s5, s4
-; VI-NEXT: s_mov_b32 s6, s4
-; VI-NEXT: s_mov_b32 s7, s4
-; VI-NEXT: s_mov_b32 s8, s4
-; VI-NEXT: s_mov_b32 s9, s4
-; VI-NEXT: s_mov_b32 s10, s4
-; VI-NEXT: s_mov_b32 s11, s4
-; VI-NEXT: s_mov_b32 s12, s4
-; VI-NEXT: s_mov_b32 s13, s4
-; VI-NEXT: s_mov_b32 s14, s4
-; VI-NEXT: s_mov_b32 s15, s4
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s4
-; VI-NEXT: s_mov_b32 s18, s4
-; VI-NEXT: v_mov_b32_e32 v50, s19
-; VI-NEXT: v_mov_b32_e32 v49, s18
-; VI-NEXT: v_mov_b32_e32 v48, s17
-; VI-NEXT: v_mov_b32_e32 v47, s16
-; VI-NEXT: v_mov_b32_e32 v46, s15
-; VI-NEXT: v_mov_b32_e32 v45, s14
-; VI-NEXT: v_mov_b32_e32 v44, s13
-; VI-NEXT: v_mov_b32_e32 v43, s12
-; VI-NEXT: v_mov_b32_e32 v42, s11
-; VI-NEXT: v_mov_b32_e32 v41, s10
-; VI-NEXT: v_mov_b32_e32 v40, s9
-; VI-NEXT: v_mov_b32_e32 v39, s8
-; VI-NEXT: v_mov_b32_e32 v38, s7
-; VI-NEXT: v_mov_b32_e32 v37, s6
-; VI-NEXT: v_mov_b32_e32 v36, s5
-; VI-NEXT: v_mov_b32_e32 v35, s4
-; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB140_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: v_mov_b32_e32 v50, v18
-; VI-NEXT: v_mov_b32_e32 v49, v17
-; VI-NEXT: v_mov_b32_e32 v48, v16
-; VI-NEXT: v_mov_b32_e32 v47, v15
-; VI-NEXT: v_mov_b32_e32 v46, v14
-; VI-NEXT: v_mov_b32_e32 v45, v13
-; VI-NEXT: v_mov_b32_e32 v44, v12
-; VI-NEXT: v_mov_b32_e32 v43, v11
-; VI-NEXT: v_mov_b32_e32 v42, v10
-; VI-NEXT: v_mov_b32_e32 v41, v9
-; VI-NEXT: v_mov_b32_e32 v40, v8
-; VI-NEXT: v_mov_b32_e32 v39, v7
-; VI-NEXT: v_mov_b32_e32 v38, v6
-; VI-NEXT: v_mov_b32_e32 v37, v5
-; VI-NEXT: v_mov_b32_e32 v36, v4
-; VI-NEXT: v_mov_b32_e32 v35, v3
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: .LBB140_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46]
-; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38]
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: s_movk_i32 s4, 0x70
-; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: s_movk_i32 s4, 0x60
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20]
-; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: s_movk_i32 s4, 0x50
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16]
-; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1
-; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12]
-; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v64bf16_to_v64i16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_mov_b32 s19, s4
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_mov_b32 s6, s4
-; GFX9-NEXT: s_mov_b32 s7, s4
-; GFX9-NEXT: s_mov_b32 s8, s4
-; GFX9-NEXT: s_mov_b32 s9, s4
-; GFX9-NEXT: s_mov_b32 s10, s4
-; GFX9-NEXT: s_mov_b32 s11, s4
-; GFX9-NEXT: s_mov_b32 s12, s4
-; GFX9-NEXT: s_mov_b32 s13, s4
-; GFX9-NEXT: s_mov_b32 s14, s4
-; GFX9-NEXT: s_mov_b32 s15, s4
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s4
-; GFX9-NEXT: s_mov_b32 s18, s4
-; GFX9-NEXT: v_mov_b32_e32 v50, s19
-; GFX9-NEXT: v_mov_b32_e32 v49, s18
-; GFX9-NEXT: v_mov_b32_e32 v48, s17
-; GFX9-NEXT: v_mov_b32_e32 v47, s16
-; GFX9-NEXT: v_mov_b32_e32 v46, s15
-; GFX9-NEXT: v_mov_b32_e32 v45, s14
-; GFX9-NEXT: v_mov_b32_e32 v44, s13
-; GFX9-NEXT: v_mov_b32_e32 v43, s12
-; GFX9-NEXT: v_mov_b32_e32 v42, s11
-; GFX9-NEXT: v_mov_b32_e32 v41, s10
-; GFX9-NEXT: v_mov_b32_e32 v40, s9
-; GFX9-NEXT: v_mov_b32_e32 v39, s8
-; GFX9-NEXT: v_mov_b32_e32 v38, s7
-; GFX9-NEXT: v_mov_b32_e32 v37, s6
-; GFX9-NEXT: v_mov_b32_e32 v36, s5
-; GFX9-NEXT: v_mov_b32_e32 v35, s4
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB140_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: v_mov_b32_e32 v50, v18
-; GFX9-NEXT: v_mov_b32_e32 v49, v17
-; GFX9-NEXT: v_mov_b32_e32 v48, v16
-; GFX9-NEXT: v_mov_b32_e32 v47, v15
-; GFX9-NEXT: v_mov_b32_e32 v46, v14
-; GFX9-NEXT: v_mov_b32_e32 v45, v13
-; GFX9-NEXT: v_mov_b32_e32 v44, v12
-; GFX9-NEXT: v_mov_b32_e32 v43, v11
-; GFX9-NEXT: v_mov_b32_e32 v42, v10
-; GFX9-NEXT: v_mov_b32_e32 v41, v9
-; GFX9-NEXT: v_mov_b32_e32 v40, v8
-; GFX9-NEXT: v_mov_b32_e32 v39, v7
-; GFX9-NEXT: v_mov_b32_e32 v38, v6
-; GFX9-NEXT: v_mov_b32_e32 v37, v5
-; GFX9-NEXT: v_mov_b32_e32 v36, v4
-; GFX9-NEXT: v_mov_b32_e32 v35, v3
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: .LBB140_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64
-; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v64bf16_to_v64i16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0xf
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s15, s0
-; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s0
-; GFX11-NEXT: s_mov_b32 s6, s0
-; GFX11-NEXT: s_mov_b32 s7, s0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s0
-; GFX11-NEXT: s_mov_b32 s10, s0
-; GFX11-NEXT: s_mov_b32 s11, s0
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_mov_b32 s13, s0
-; GFX11-NEXT: s_mov_b32 s14, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
-; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
-; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
-; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
-; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
-; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
-; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
-; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
-; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
-; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
-; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
-; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
-; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
-; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
-; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB140_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
-; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
-; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
-; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
-; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
-; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
-; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
-; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
-; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
-; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
-; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
-; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
-; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
-; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
-; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB140_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x7
-; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
-; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
-; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
-; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
-; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
-; GFX11-NEXT: s_clause 0xf
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <64 x bfloat> %value to <64 x i16>
- br label %end
-
-end:
- %phi = phi <64 x i16> [zeroinitializer, %entry], [%cast, %if]
- store <64 x i16> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v64bf16_to_v64f16(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v64bf16_to_v64f16:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:136
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:48
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v30, 0
-; GCN-NEXT: v_mov_b32_e32 v58, 0
-; GCN-NEXT: v_mov_b32_e32 v56, 0
-; GCN-NEXT: v_mov_b32_e32 v57, 0
-; GCN-NEXT: v_mov_b32_e32 v45, 0
-; GCN-NEXT: v_mov_b32_e32 v47, 0
-; GCN-NEXT: v_mov_b32_e32 v44, 0
-; GCN-NEXT: v_mov_b32_e32 v46, 0
-; GCN-NEXT: v_mov_b32_e32 v41, 0
-; GCN-NEXT: v_mov_b32_e32 v43, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: v_mov_b32_e32 v42, 0
-; GCN-NEXT: v_mov_b32_e32 v54, 0
-; GCN-NEXT: v_mov_b32_e32 v55, 0
-; GCN-NEXT: v_mov_b32_e32 v52, 0
-; GCN-NEXT: v_mov_b32_e32 v53, 0
-; GCN-NEXT: v_mov_b32_e32 v49, 0
-; GCN-NEXT: v_mov_b32_e32 v51, 0
-; GCN-NEXT: v_mov_b32_e32 v48, 0
-; GCN-NEXT: v_mov_b32_e32 v50, 0
-; GCN-NEXT: v_mov_b32_e32 v37, 0
-; GCN-NEXT: v_mov_b32_e32 v39, 0
-; GCN-NEXT: v_mov_b32_e32 v36, 0
-; GCN-NEXT: v_mov_b32_e32 v38, 0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v22, 0
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_cbranch_execz .LBB141_2
-; GCN-NEXT: ; %bb.1: ; %if
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: v_mul_f32_e32 v35, 1.0, v35
-; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v61
-; GCN-NEXT: v_mul_f32_e32 v34, 1.0, v34
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v23
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v33
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v28
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v32
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v24
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v31
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v29
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v25
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v63
-; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v60
-; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v62
-; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v59
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v47, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v36, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v22
-; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v35
-; GCN-NEXT: v_lshrrev_b32_e32 v33, 16, v61
-; GCN-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v35, 16, v35
-; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v59, 16, v59
-; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v60, 16, v60
-; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v61, 16, v61
-; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v62, 16, v62
-; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v63, 16, v63
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v51
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v52
-; GCN-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v50
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v53
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v54
-; GCN-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v49
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v55
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v40
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v41
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v42
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v43
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v44
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v45
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v46
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v47
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v56
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v57
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v58
-; GCN-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GCN-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v39
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GCN-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v37
-; GCN-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GCN-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v30, v3
-; GCN-NEXT: v_cvt_f32_f16_e32 v58, v4
-; GCN-NEXT: v_cvt_f32_f16_e32 v56, v5
-; GCN-NEXT: v_cvt_f32_f16_e32 v57, v6
-; GCN-NEXT: v_cvt_f32_f16_e32 v45, v7
-; GCN-NEXT: v_cvt_f32_f16_e32 v47, v8
-; GCN-NEXT: v_cvt_f32_f16_e32 v44, v9
-; GCN-NEXT: v_cvt_f32_f16_e32 v46, v10
-; GCN-NEXT: v_cvt_f32_f16_e32 v41, v11
-; GCN-NEXT: v_cvt_f32_f16_e32 v43, v12
-; GCN-NEXT: v_cvt_f32_f16_e32 v40, v13
-; GCN-NEXT: v_cvt_f32_f16_e32 v42, v14
-; GCN-NEXT: v_cvt_f32_f16_e32 v54, v15
-; GCN-NEXT: v_cvt_f32_f16_e32 v55, v16
-; GCN-NEXT: v_cvt_f32_f16_e32 v52, v17
-; GCN-NEXT: v_cvt_f32_f16_e32 v53, v18
-; GCN-NEXT: v_cvt_f32_f16_e32 v49, v19
-; GCN-NEXT: v_cvt_f32_f16_e32 v51, v20
-; GCN-NEXT: v_cvt_f32_f16_e32 v48, v21
-; GCN-NEXT: v_cvt_f32_f16_e32 v50, v22
-; GCN-NEXT: v_cvt_f32_f16_e32 v37, v23
-; GCN-NEXT: v_cvt_f32_f16_e32 v39, v24
-; GCN-NEXT: v_cvt_f32_f16_e32 v36, v25
-; GCN-NEXT: v_cvt_f32_f16_e32 v38, v26
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v27
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v28
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v29
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v31
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v32
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v33
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v34
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v35
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v59
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v60
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v61
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v62
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v63
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v22, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: .LBB141_2: ; %end
-; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v58
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
-; GCN-NEXT: v_or_b32_e32 v3, v3, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v57
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v56
-; GCN-NEXT: v_or_b32_e32 v4, v4, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v47
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v45
-; GCN-NEXT: v_or_b32_e32 v5, v5, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v46
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v44
-; GCN-NEXT: v_or_b32_e32 v6, v6, v0
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s4, s6
-; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v43
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v41
-; GCN-NEXT: v_or_b32_e32 v3, v3, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v42
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v40
-; GCN-NEXT: v_or_b32_e32 v4, v4, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v55
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v54
-; GCN-NEXT: v_or_b32_e32 v5, v5, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v53
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v52
-; GCN-NEXT: v_or_b32_e32 v6, v6, v0
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:16
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v51
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v49
-; GCN-NEXT: v_or_b32_e32 v3, v3, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v50
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v48
-; GCN-NEXT: v_or_b32_e32 v4, v4, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v39
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v37
-; GCN-NEXT: v_or_b32_e32 v5, v5, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v38
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v36
-; GCN-NEXT: v_or_b32_e32 v6, v6, v0
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:32
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_or_b32_e32 v3, v3, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_or_b32_e32 v4, v4, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_or_b32_e32 v5, v5, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_or_b32_e32 v6, v6, v0
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_or_b32_e32 v3, v3, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_or_b32_e32 v4, v4, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_or_b32_e32 v5, v5, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_or_b32_e32 v6, v6, v0
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_or_b32_e32 v3, v3, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_or_b32_e32 v4, v4, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GCN-NEXT: v_or_b32_e32 v5, v5, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GCN-NEXT: v_or_b32_e32 v6, v6, v0
-; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GCN-NEXT: v_or_b32_e32 v3, v3, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GCN-NEXT: v_or_b32_e32 v4, v4, v0
-; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:
v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GCN-NEXT: v_or_b32_e32 v5, v5, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v0 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v4, v4 -; GCN-NEXT: v_or_b32_e32 v4, v4, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f16_f32_e32 v5, v22 -; GCN-NEXT: v_or_b32_e32 v5, v5, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v0 -; GCN-NEXT: buffer_store_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v64bf16_to_v64f16: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: s_mov_b32 s4, 0 -; VI-NEXT: s_mov_b32 s19, s4 -; VI-NEXT: s_mov_b32 s5, s4 -; VI-NEXT: s_mov_b32 s6, s4 -; VI-NEXT: s_mov_b32 s7, s4 -; VI-NEXT: s_mov_b32 s8, s4 -; VI-NEXT: s_mov_b32 s9, s4 -; VI-NEXT: s_mov_b32 s10, s4 -; VI-NEXT: s_mov_b32 s11, s4 -; VI-NEXT: s_mov_b32 s12, s4 -; VI-NEXT: s_mov_b32 s13, s4 -; VI-NEXT: s_mov_b32 s14, s4 -; VI-NEXT: s_mov_b32 s15, s4 -; VI-NEXT: s_mov_b32 s16, s4 -; VI-NEXT: s_mov_b32 s17, s4 -; VI-NEXT: s_mov_b32 s18, s4 -; VI-NEXT: v_mov_b32_e32 v50, s19 -; VI-NEXT: v_mov_b32_e32 v49, s18 -; VI-NEXT: v_mov_b32_e32 v48, s17 -; VI-NEXT: v_mov_b32_e32 v47, s16 -; VI-NEXT: v_mov_b32_e32 v46, s15 -; VI-NEXT: v_mov_b32_e32 v45, s14 -; VI-NEXT: v_mov_b32_e32 v44, s13 -; VI-NEXT: v_mov_b32_e32 v43, s12 -; VI-NEXT: v_mov_b32_e32 v42, s11 -; VI-NEXT: v_mov_b32_e32 v41, s10 -; VI-NEXT: v_mov_b32_e32 v40, s9 -; VI-NEXT: v_mov_b32_e32 v39, s8 -; VI-NEXT: v_mov_b32_e32 v38, s7 -; VI-NEXT: v_mov_b32_e32 v37, s6 -; VI-NEXT: v_mov_b32_e32 v36, s5 -; VI-NEXT: v_mov_b32_e32 v35, s4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB141_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: v_mov_b32_e32 v50, v18 -; VI-NEXT: v_mov_b32_e32 v49, v17 -; VI-NEXT: v_mov_b32_e32 v48, v16 -; VI-NEXT: v_mov_b32_e32 v47, v15 -; VI-NEXT: v_mov_b32_e32 v46, v14 -; VI-NEXT: v_mov_b32_e32 v45, v13 -; VI-NEXT: v_mov_b32_e32 v44, v12 -; VI-NEXT: v_mov_b32_e32 v43, v11 -; VI-NEXT: v_mov_b32_e32 v42, v10 -; VI-NEXT: v_mov_b32_e32 v41, v9 -; VI-NEXT: v_mov_b32_e32 v40, v8 -; VI-NEXT: v_mov_b32_e32 v39, v7 -; VI-NEXT: v_mov_b32_e32 v38, v6 -; VI-NEXT: v_mov_b32_e32 v37, v5 -; VI-NEXT: v_mov_b32_e32 v36, v4 -; VI-NEXT: v_mov_b32_e32 v35, v3 -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: .LBB141_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v3, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[47:50] -; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[43:46] -; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[39:42] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[35:38] -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, 
s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: s_movk_i32 s4, 0x70 -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x60 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx4 v[3:4], v[17:20] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: s_movk_i32 s4, 0x50 -; VI-NEXT: flat_store_dwordx4 v[3:4], v[13:16] -; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v1 -; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 64, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[3:4], v[9:12] -; VI-NEXT: flat_store_dwordx4 v[0:1], v[5:8] -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v64bf16_to_v64f16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: s_mov_b32 s19, s4 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_mov_b32 s6, s4 -; GFX9-NEXT: s_mov_b32 s7, s4 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s4 -; GFX9-NEXT: s_mov_b32 s10, s4 -; GFX9-NEXT: s_mov_b32 s11, s4 -; GFX9-NEXT: s_mov_b32 s12, s4 -; GFX9-NEXT: s_mov_b32 s13, s4 -; GFX9-NEXT: s_mov_b32 s14, s4 -; GFX9-NEXT: s_mov_b32 s15, s4 -; GFX9-NEXT: s_mov_b32 s16, s4 -; GFX9-NEXT: s_mov_b32 s17, s4 -; GFX9-NEXT: s_mov_b32 s18, s4 -; GFX9-NEXT: v_mov_b32_e32 v50, s19 -; GFX9-NEXT: v_mov_b32_e32 v49, s18 -; GFX9-NEXT: 
v_mov_b32_e32 v48, s17 -; GFX9-NEXT: v_mov_b32_e32 v47, s16 -; GFX9-NEXT: v_mov_b32_e32 v46, s15 -; GFX9-NEXT: v_mov_b32_e32 v45, s14 -; GFX9-NEXT: v_mov_b32_e32 v44, s13 -; GFX9-NEXT: v_mov_b32_e32 v43, s12 -; GFX9-NEXT: v_mov_b32_e32 v42, s11 -; GFX9-NEXT: v_mov_b32_e32 v41, s10 -; GFX9-NEXT: v_mov_b32_e32 v40, s9 -; GFX9-NEXT: v_mov_b32_e32 v39, s8 -; GFX9-NEXT: v_mov_b32_e32 v38, s7 -; GFX9-NEXT: v_mov_b32_e32 v37, s6 -; GFX9-NEXT: v_mov_b32_e32 v36, s5 -; GFX9-NEXT: v_mov_b32_e32 v35, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB141_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: v_mov_b32_e32 v50, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v17 -; GFX9-NEXT: v_mov_b32_e32 v48, v16 -; GFX9-NEXT: v_mov_b32_e32 v47, v15 -; GFX9-NEXT: v_mov_b32_e32 v46, v14 -; GFX9-NEXT: v_mov_b32_e32 v45, v13 -; GFX9-NEXT: v_mov_b32_e32 v44, v12 -; GFX9-NEXT: v_mov_b32_e32 v43, v11 -; GFX9-NEXT: v_mov_b32_e32 v42, v10 -; GFX9-NEXT: v_mov_b32_e32 v41, v9 -; GFX9-NEXT: v_mov_b32_e32 v40, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v7 -; GFX9-NEXT: v_mov_b32_e32 v38, v6 -; GFX9-NEXT: v_mov_b32_e32 v37, v5 -; GFX9-NEXT: v_mov_b32_e32 v36, v4 -; GFX9-NEXT: v_mov_b32_e32 v35, v3 -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: .LBB141_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[47:50], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[43:46], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[39:42], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[35:38], off -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off offset:64 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v64bf16_to_v64f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s15, s0
-; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s0
-; GFX11-NEXT: s_mov_b32 s6, s0
-; GFX11-NEXT: s_mov_b32 s7, s0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s0
-; GFX11-NEXT: s_mov_b32 s10, s0
-; GFX11-NEXT: s_mov_b32 s11, s0
-; GFX11-NEXT: s_mov_b32 s12, s0
-; GFX11-NEXT: s_mov_b32 s13, s0
-; GFX11-NEXT: s_mov_b32 s14, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v66, s15 :: v_dual_mov_b32 v65, s14
-; GFX11-NEXT: v_dual_mov_b32 v52, s1 :: v_dual_mov_b32 v51, s0
-; GFX11-NEXT: v_dual_mov_b32 v64, s13 :: v_dual_mov_b32 v63, s12
-; GFX11-NEXT: v_dual_mov_b32 v62, s11 :: v_dual_mov_b32 v61, s10
-; GFX11-NEXT: v_dual_mov_b32 v60, s9 :: v_dual_mov_b32 v59, s8
-; GFX11-NEXT: v_dual_mov_b32 v58, s7 :: v_dual_mov_b32 v57, s6
-; GFX11-NEXT: v_dual_mov_b32 v56, s5 :: v_dual_mov_b32 v55, s4
-; GFX11-NEXT: v_dual_mov_b32 v54, s3 :: v_dual_mov_b32 v53, s2
-; GFX11-NEXT: v_dual_mov_b32 v35, v51 :: v_dual_mov_b32 v36, v52
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v39, v55 :: v_dual_mov_b32 v40, v56
-; GFX11-NEXT: v_dual_mov_b32 v37, v53 :: v_dual_mov_b32 v38, v54
-; GFX11-NEXT: v_dual_mov_b32 v41, v57 :: v_dual_mov_b32 v42, v58
-; GFX11-NEXT: v_dual_mov_b32 v43, v59 :: v_dual_mov_b32 v44, v60
-; GFX11-NEXT: v_dual_mov_b32 v45, v61 :: v_dual_mov_b32 v46, v62
-; GFX11-NEXT: v_dual_mov_b32 v47, v63 :: v_dual_mov_b32 v48, v64
-; GFX11-NEXT: v_dual_mov_b32 v49, v65 :: v_dual_mov_b32 v50, v66
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB141_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_dual_mov_b32 v66, v18 :: v_dual_mov_b32 v65, v17
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v50, v34 :: v_dual_mov_b32 v49, v33
-; GFX11-NEXT: v_dual_mov_b32 v64, v16 :: v_dual_mov_b32 v63, v15
-; GFX11-NEXT: v_dual_mov_b32 v62, v14 :: v_dual_mov_b32 v61, v13
-; GFX11-NEXT: v_dual_mov_b32 v60, v12 :: v_dual_mov_b32 v59, v11
-; GFX11-NEXT: v_dual_mov_b32 v58, v10 :: v_dual_mov_b32 v57, v9
-; GFX11-NEXT: v_dual_mov_b32 v56, v8 :: v_dual_mov_b32 v55, v7
-; GFX11-NEXT: v_dual_mov_b32 v54, v6 :: v_dual_mov_b32 v53, v5
-; GFX11-NEXT: v_dual_mov_b32 v52, v4 :: v_dual_mov_b32 v51, v3
-; GFX11-NEXT: v_dual_mov_b32 v48, v32 :: v_dual_mov_b32 v47, v31
-; GFX11-NEXT: v_dual_mov_b32 v46, v30 :: v_dual_mov_b32 v45, v29
-; GFX11-NEXT: v_dual_mov_b32 v44, v28 :: v_dual_mov_b32 v43, v27
-; GFX11-NEXT: v_dual_mov_b32 v42, v26 :: v_dual_mov_b32 v41, v25
-; GFX11-NEXT: v_dual_mov_b32 v40, v24 :: v_dual_mov_b32 v39, v23
-; GFX11-NEXT: v_dual_mov_b32 v38, v22 :: v_dual_mov_b32 v37, v21
-; GFX11-NEXT: v_dual_mov_b32 v36, v20 :: v_dual_mov_b32 v35, v19
-; GFX11-NEXT: .LBB141_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x7
-; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off
-; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:112
-; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:96
-; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:80
-; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off offset:64
-; GFX11-NEXT: s_clause 0xf
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <64 x bfloat> %value to <64 x half>
- br label %end
-
-end:
- %phi = phi <64 x half> [zeroinitializer, %entry], [%cast, %if]
- store <64 x half> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v64bf16_to_v128i8(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v64bf16_to_v128i8:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58,
off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, 
off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; 
GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v33, v31 -; GCN-NEXT: v_mov_b32_e32 v34, v31 -; GCN-NEXT: v_mov_b32_e32 v35, v31 -; GCN-NEXT: v_mov_b32_e32 v36, v31 -; GCN-NEXT: v_mov_b32_e32 v37, v31 -; GCN-NEXT: v_mov_b32_e32 v38, v31 -; GCN-NEXT: v_mov_b32_e32 v48, v31 -; GCN-NEXT: v_mov_b32_e32 v49, v31 -; GCN-NEXT: v_mov_b32_e32 v50, v31 -; GCN-NEXT: v_mov_b32_e32 v51, v31 -; GCN-NEXT: v_mov_b32_e32 v52, v31 -; GCN-NEXT: v_mov_b32_e32 v53, v31 -; GCN-NEXT: v_mov_b32_e32 v54, v31 -; GCN-NEXT: v_mov_b32_e32 v55, v31 -; GCN-NEXT: v_mov_b32_e32 v39, v31 -; GCN-NEXT: v_mov_b32_e32 v40, v31 -; GCN-NEXT: v_mov_b32_e32 v41, v31 -; GCN-NEXT: v_mov_b32_e32 v42, v31 -; GCN-NEXT: v_mov_b32_e32 v43, v31 -; GCN-NEXT: v_mov_b32_e32 v44, v31 -; GCN-NEXT: v_mov_b32_e32 v45, v31 -; GCN-NEXT: v_mov_b32_e32 v46, v31 -; GCN-NEXT: v_mov_b32_e32 v56, v31 -; GCN-NEXT: v_mov_b32_e32 v57, v31 -; GCN-NEXT: v_mov_b32_e32 v58, v31 -; GCN-NEXT: v_mov_b32_e32 v59, v31 -; GCN-NEXT: v_mov_b32_e32 v60, v31 -; GCN-NEXT: v_mov_b32_e32 v61, v31 -; GCN-NEXT: v_mov_b32_e32 v62, v31 -; GCN-NEXT: v_mov_b32_e32 v63, v31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB142_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; GCN-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15 -; GCN-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16 -; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16 -; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16 -; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16 -; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16 -; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16 -; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16 -; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16 -; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16 -; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16 -; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16 -; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16 -; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16 -; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16 -; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16 -; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16 -; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16 -; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16 -; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16 -; GCN-NEXT: 
v_alignbit_b32 v59, v28, v15, 16 -; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16 -; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16 -; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16 -; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16 -; GCN-NEXT: .LBB142_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v64bf16_to_v128i8: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, 
s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(14)
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v3, 0
-; VI-NEXT: v_mov_b32_e32 v4, v3
-; VI-NEXT: v_mov_b32_e32 v5, v3
-; VI-NEXT: v_mov_b32_e32 v6, v3
-; VI-NEXT: v_mov_b32_e32 v7, v3
-; VI-NEXT: v_mov_b32_e32 v8, v3
-; VI-NEXT: v_mov_b32_e32 v9, v3
-; VI-NEXT: v_mov_b32_e32 v10, v3
-; VI-NEXT: v_mov_b32_e32 v11, v3
-; VI-NEXT: v_mov_b32_e32 v12, v3
-; VI-NEXT: v_mov_b32_e32 v13, v3
-; VI-NEXT: v_mov_b32_e32 v14, v3
-; VI-NEXT: v_mov_b32_e32 v15, v3
-; VI-NEXT: v_mov_b32_e32 v16, v3
-; VI-NEXT: v_mov_b32_e32 v17, v3
-; VI-NEXT: v_mov_b32_e32 v18, v3
-; VI-NEXT: v_mov_b32_e32 v19, v3
-; VI-NEXT: v_mov_b32_e32 v20, v3
-; VI-NEXT: v_mov_b32_e32 v21, v3
-; VI-NEXT: v_mov_b32_e32 v22, v3
-; VI-NEXT: v_mov_b32_e32 v23, v3
-; VI-NEXT: v_mov_b32_e32 v24, v3
-; VI-NEXT: v_mov_b32_e32 v25, v3
-; VI-NEXT: v_mov_b32_e32 v26, v3
-; VI-NEXT: v_mov_b32_e32 v27, v3
-; VI-NEXT: v_mov_b32_e32 v28, v3
-; VI-NEXT: v_mov_b32_e32 v29, v3
-; VI-NEXT: v_mov_b32_e32 v30, v3
-; VI-NEXT: v_mov_b32_e32 v31, v3
-; VI-NEXT: v_mov_b32_e32 v32, v3
-; VI-NEXT: v_mov_b32_e32 v33, v3
-; VI-NEXT: v_mov_b32_e32 v34, v3
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; VI-NEXT: s_cbranch_execz .LBB142_2
-; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: .LBB142_2: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1
-; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1
-; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1
-; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1
-; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1
-; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1
-; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
-; VI-NEXT: s_nop 0
-; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1
-; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc
-; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
-; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6]
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_bitcast_v64bf16_to_v128i8:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(28)
-; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v4, v3
-; GFX9-NEXT: v_mov_b32_e32 v5, v3
-; GFX9-NEXT: v_mov_b32_e32 v6, v3
-; GFX9-NEXT: v_mov_b32_e32 v7, v3
-; GFX9-NEXT: v_mov_b32_e32 v8, v3
-; GFX9-NEXT: v_mov_b32_e32 v9, v3
-; GFX9-NEXT: v_mov_b32_e32 v10, v3
-; GFX9-NEXT: v_mov_b32_e32 v11, v3
-; GFX9-NEXT: v_mov_b32_e32 v12, v3
-; GFX9-NEXT: v_mov_b32_e32 v13, v3
-; GFX9-NEXT: v_mov_b32_e32 v14, v3
-; GFX9-NEXT: v_mov_b32_e32 v15, v3
-; GFX9-NEXT: v_mov_b32_e32 v16, v3
-; GFX9-NEXT: v_mov_b32_e32 v17, v3
-; GFX9-NEXT: v_mov_b32_e32 v18, v3
-; GFX9-NEXT: v_mov_b32_e32 v19, v3
-; GFX9-NEXT: v_mov_b32_e32 v20, v3
-; GFX9-NEXT: v_mov_b32_e32 v21, v3
-; GFX9-NEXT: v_mov_b32_e32 v22, v3
-; GFX9-NEXT: v_mov_b32_e32 v23, v3
-; GFX9-NEXT: v_mov_b32_e32 v24, v3
-; GFX9-NEXT: v_mov_b32_e32 v25, v3
-; GFX9-NEXT: v_mov_b32_e32 v26, v3
-; GFX9-NEXT: v_mov_b32_e32 v27, v3
-; GFX9-NEXT: v_mov_b32_e32 v28, v3
-; GFX9-NEXT: v_mov_b32_e32 v29, v3
-; GFX9-NEXT: v_mov_b32_e32 v30, v3
-; GFX9-NEXT: v_mov_b32_e32 v31, v3
-; GFX9-NEXT: v_mov_b32_e32 v32, v3
-; GFX9-NEXT: v_mov_b32_e32 v33, v3
-; GFX9-NEXT: v_mov_b32_e32 v34, v3
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_cbranch_execz .LBB142_2
-; GFX9-NEXT: ; %bb.1: ; %if
-; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; GFX9-NEXT: .LBB142_2: ; %end
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16
-; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_bitcast_v64bf16_to_v128i8:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_clause 0xf
-; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76
-; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72
-; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68
-; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64
-; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60
-; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56
-; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52
-; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48
-; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44
-; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40
-; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36
-; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32
-; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28
-; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24
-; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20
-; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16
-; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
-; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
-; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
-; GFX11-NEXT: v_mov_b32_e32 v35, 0
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v36, v35
-; GFX11-NEXT: v_mov_b32_e32 v37, v35
-; GFX11-NEXT: v_mov_b32_e32 v38, v35
-; GFX11-NEXT: v_mov_b32_e32 v39, v35
-; GFX11-NEXT: v_mov_b32_e32 v40, v35
-; GFX11-NEXT: v_mov_b32_e32 v41, v35
-; GFX11-NEXT: v_mov_b32_e32 v42, v35
-; GFX11-NEXT: v_mov_b32_e32 v43, v35
-; GFX11-NEXT: v_mov_b32_e32 v44, v35
-; GFX11-NEXT: v_mov_b32_e32 v45, v35
-; GFX11-NEXT: v_mov_b32_e32 v46, v35
-; GFX11-NEXT: v_mov_b32_e32 v47, v35
-; GFX11-NEXT: v_mov_b32_e32 v48, v35
-; GFX11-NEXT: v_mov_b32_e32 v49, v35
-; GFX11-NEXT: v_mov_b32_e32 v50, v35
-; GFX11-NEXT: v_mov_b32_e32 v51, v35
-; GFX11-NEXT: v_mov_b32_e32 v52, v35
-; GFX11-NEXT: v_mov_b32_e32 v53, v35
-; GFX11-NEXT: v_mov_b32_e32 v54, v35
-; GFX11-NEXT: v_mov_b32_e32 v55, v35
-; GFX11-NEXT: v_mov_b32_e32 v56, v35
-; GFX11-NEXT: v_mov_b32_e32 v57, v35
-; GFX11-NEXT: v_mov_b32_e32 v58, v35
-; GFX11-NEXT: v_mov_b32_e32 v59, v35
-; GFX11-NEXT: v_mov_b32_e32 v60, v35
-; GFX11-NEXT: v_mov_b32_e32 v61, v35
-; GFX11-NEXT: v_mov_b32_e32 v62, v35
-; GFX11-NEXT: v_mov_b32_e32 v63, v35
-; GFX11-NEXT: v_mov_b32_e32 v64, v35
-; GFX11-NEXT: v_mov_b32_e32 v65, v35
-; GFX11-NEXT: v_mov_b32_e32 v66, v35
-; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT: s_cbranch_execz .LBB142_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33
-; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31
-; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29
-; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27
-; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25
-; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23
-; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21
-; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19
-; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17
-; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15
-; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13
-; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11
-; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9
-; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7
-; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5
-; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3
-; GFX11-NEXT: .LBB142_2: ; %end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_clause 0x7
-; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112
-; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96
-; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80
-; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64
-; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48
-; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32
-; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16
-; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off
-; GFX11-NEXT: s_clause 0xf
-; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16
-; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20
-; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24
-; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28
-; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32
-; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36
-; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40
-; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44
-; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48
-; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52
-; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56
-; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60
-; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64
-; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68
-; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72
-; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-entry:
- %cmp0 = icmp eq i32 %cond, 0
- br i1 %cmp0, label %if, label %end
-
-if:
- %cast = bitcast <64 x bfloat> %value to <128 x i8>
- br label %end
-
-end:
- %phi = phi <128 x i8> [zeroinitializer, %entry], [%cast, %if]
- store <128 x i8> %phi, ptr addrspace(1) %out
- ret void
-}
-
-
-define void @v_bitcast_v64bf16_to_v16i64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) {
-; GCN-LABEL: v_bitcast_v64bf16_to_v16i64:
-; GCN:
; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; GCN-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v33, v31 -; GCN-NEXT: v_mov_b32_e32 v34, v31 -; GCN-NEXT: v_mov_b32_e32 v35, v31 -; GCN-NEXT: v_mov_b32_e32 v36, v31 -; GCN-NEXT: v_mov_b32_e32 v37, v31 -; GCN-NEXT: v_mov_b32_e32 v38, v31 -; GCN-NEXT: v_mov_b32_e32 v48, v31 -; GCN-NEXT: v_mov_b32_e32 v49, v31 -; GCN-NEXT: v_mov_b32_e32 v50, v31 -; GCN-NEXT: v_mov_b32_e32 v51, v31 -; GCN-NEXT: v_mov_b32_e32 v52, v31 -; GCN-NEXT: v_mov_b32_e32 v53, v31 -; GCN-NEXT: v_mov_b32_e32 v54, v31 -; GCN-NEXT: v_mov_b32_e32 v55, v31 -; GCN-NEXT: v_mov_b32_e32 v39, v31 -; GCN-NEXT: v_mov_b32_e32 v40, v31 -; GCN-NEXT: v_mov_b32_e32 v41, v31 -; GCN-NEXT: v_mov_b32_e32 v42, v31 -; GCN-NEXT: v_mov_b32_e32 v43, v31 -; GCN-NEXT: v_mov_b32_e32 v44, v31 -; GCN-NEXT: v_mov_b32_e32 v45, v31 -; GCN-NEXT: v_mov_b32_e32 v46, v31 -; GCN-NEXT: v_mov_b32_e32 v56, v31 -; GCN-NEXT: v_mov_b32_e32 v57, v31 -; GCN-NEXT: v_mov_b32_e32 v58, v31 -; GCN-NEXT: v_mov_b32_e32 v59, v31 -; GCN-NEXT: v_mov_b32_e32 v60, v31 -; GCN-NEXT: v_mov_b32_e32 v61, v31 -; GCN-NEXT: v_mov_b32_e32 v62, v31 -; GCN-NEXT: v_mov_b32_e32 v63, v31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB143_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_mul_f32_e32 v27, 1.0, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; GCN-NEXT: v_lshrrev_b32_e32 v50, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: 
v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16 -; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16 -; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16 -; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16 -; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16 -; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16 -; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16 -; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16 -; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16 -; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16 -; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16 -; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16 -; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16 -; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16 -; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16 -; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16 -; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16 -; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16 -; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16 -; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16 -; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16 -; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16 -; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16 -; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16 -; GCN-NEXT: .LBB143_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v64bf16_to_v16i64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: v_mov_b32_e32 v6, v3 -; VI-NEXT: v_mov_b32_e32 v7, v3 -; VI-NEXT: v_mov_b32_e32 v8, v3 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v10, v3 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: v_mov_b32_e32 v12, v3 -; VI-NEXT: 
v_mov_b32_e32 v13, v3 -; VI-NEXT: v_mov_b32_e32 v14, v3 -; VI-NEXT: v_mov_b32_e32 v15, v3 -; VI-NEXT: v_mov_b32_e32 v16, v3 -; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v18, v3 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: v_mov_b32_e32 v20, v3 -; VI-NEXT: v_mov_b32_e32 v21, v3 -; VI-NEXT: v_mov_b32_e32 v22, v3 -; VI-NEXT: v_mov_b32_e32 v23, v3 -; VI-NEXT: v_mov_b32_e32 v24, v3 -; VI-NEXT: v_mov_b32_e32 v25, v3 -; VI-NEXT: v_mov_b32_e32 v26, v3 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_mov_b32_e32 v28, v3 -; VI-NEXT: v_mov_b32_e32 v29, v3 -; VI-NEXT: v_mov_b32_e32 v30, v3 -; VI-NEXT: v_mov_b32_e32 v31, v3 -; VI-NEXT: v_mov_b32_e32 v32, v3 -; VI-NEXT: v_mov_b32_e32 v33, v3 -; VI-NEXT: v_mov_b32_e32 v34, v3 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB143_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte 
Folded Reload -; VI-NEXT: .LBB143_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1 -; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1 -; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1 -; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1 -; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v64bf16_to_v16i64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-NEXT: v_mov_b32_e32 v13, v3 -; GFX9-NEXT: v_mov_b32_e32 v14, v3 -; GFX9-NEXT: v_mov_b32_e32 v15, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: v_mov_b32_e32 v20, v3 -; GFX9-NEXT: v_mov_b32_e32 v21, v3 -; GFX9-NEXT: v_mov_b32_e32 v22, v3 -; GFX9-NEXT: v_mov_b32_e32 v23, v3 -; GFX9-NEXT: v_mov_b32_e32 v24, v3 -; GFX9-NEXT: v_mov_b32_e32 v25, v3 -; GFX9-NEXT: v_mov_b32_e32 v26, v3 -; GFX9-NEXT: v_mov_b32_e32 v27, v3 -; GFX9-NEXT: v_mov_b32_e32 v28, v3 -; GFX9-NEXT: v_mov_b32_e32 v29, v3 -; GFX9-NEXT: v_mov_b32_e32 v30, v3 -; GFX9-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-NEXT: v_mov_b32_e32 v32, v3 -; GFX9-NEXT: v_mov_b32_e32 v33, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB143_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, 
off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: .LBB143_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v64bf16_to_v16i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; 
GFX11-NEXT: v_mov_b32_e32 v35, 0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v36, v35 -; GFX11-NEXT: v_mov_b32_e32 v37, v35 -; GFX11-NEXT: v_mov_b32_e32 v38, v35 -; GFX11-NEXT: v_mov_b32_e32 v39, v35 -; GFX11-NEXT: v_mov_b32_e32 v40, v35 -; GFX11-NEXT: v_mov_b32_e32 v41, v35 -; GFX11-NEXT: v_mov_b32_e32 v42, v35 -; GFX11-NEXT: v_mov_b32_e32 v43, v35 -; GFX11-NEXT: v_mov_b32_e32 v44, v35 -; GFX11-NEXT: v_mov_b32_e32 v45, v35 -; GFX11-NEXT: v_mov_b32_e32 v46, v35 -; GFX11-NEXT: v_mov_b32_e32 v47, v35 -; GFX11-NEXT: v_mov_b32_e32 v48, v35 -; GFX11-NEXT: v_mov_b32_e32 v49, v35 -; GFX11-NEXT: v_mov_b32_e32 v50, v35 -; GFX11-NEXT: v_mov_b32_e32 v51, v35 -; GFX11-NEXT: v_mov_b32_e32 v52, v35 -; GFX11-NEXT: v_mov_b32_e32 v53, v35 -; GFX11-NEXT: v_mov_b32_e32 v54, v35 -; GFX11-NEXT: v_mov_b32_e32 v55, v35 -; GFX11-NEXT: v_mov_b32_e32 v56, v35 -; GFX11-NEXT: v_mov_b32_e32 v57, v35 -; GFX11-NEXT: v_mov_b32_e32 v58, v35 -; GFX11-NEXT: v_mov_b32_e32 v59, v35 -; GFX11-NEXT: v_mov_b32_e32 v60, v35 -; GFX11-NEXT: v_mov_b32_e32 v61, v35 -; GFX11-NEXT: v_mov_b32_e32 v62, v35 -; GFX11-NEXT: v_mov_b32_e32 v63, v35 -; GFX11-NEXT: v_mov_b32_e32 v64, v35 -; GFX11-NEXT: v_mov_b32_e32 v65, v35 -; GFX11-NEXT: v_mov_b32_e32 v66, v35 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB143_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33 -; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31 -; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29 -; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27 -; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25 -; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23 -; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21 -; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19 -; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17 -; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15 -; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13 -; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11 -; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9 -; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7 -; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5 -; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3 -; GFX11-NEXT: .LBB143_2: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112 -; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96 -; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80 -; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64 -; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 -; GFX11-NEXT: 
scratch_load_b32 v56, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <64 x bfloat> %value to <16 x i64> - br label %end - -end: - %phi = phi <16 x i64> [zeroinitializer, %entry], [%cast, %if] - store <16 x i64> %phi, ptr addrspace(1) %out - ret void -} - - -define void @v_bitcast_v64bf16_to_v16f64(i32 %cond, ptr addrspace(1) %out, <64 x bfloat> %value) { -; GCN-LABEL: v_bitcast_v64bf16_to_v16f64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v18, off, 
s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:140 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:132 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:128 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:124 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:116 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:108 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:104 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:100 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:96 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:68 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:60 -; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:56 -; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:44 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 -; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:32 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:28 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:24 -; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:12 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v31, 0 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_mov_b32_e32 v32, v31 -; GCN-NEXT: v_mov_b32_e32 v33, v31 -; GCN-NEXT: v_mov_b32_e32 v34, v31 -; GCN-NEXT: v_mov_b32_e32 v35, v31 -; GCN-NEXT: v_mov_b32_e32 v36, v31 -; GCN-NEXT: v_mov_b32_e32 v37, v31 -; GCN-NEXT: v_mov_b32_e32 v38, v31 -; GCN-NEXT: v_mov_b32_e32 v48, v31 -; GCN-NEXT: v_mov_b32_e32 v49, v31 -; GCN-NEXT: v_mov_b32_e32 v50, v31 -; GCN-NEXT: v_mov_b32_e32 v51, v31 -; GCN-NEXT: v_mov_b32_e32 v52, v31 -; GCN-NEXT: v_mov_b32_e32 v53, v31 -; GCN-NEXT: v_mov_b32_e32 v54, v31 -; GCN-NEXT: v_mov_b32_e32 v55, v31 -; GCN-NEXT: v_mov_b32_e32 v39, v31 -; GCN-NEXT: v_mov_b32_e32 v40, v31 -; GCN-NEXT: v_mov_b32_e32 v41, v31 -; GCN-NEXT: v_mov_b32_e32 v42, v31 -; GCN-NEXT: v_mov_b32_e32 v43, v31 -; GCN-NEXT: v_mov_b32_e32 v44, v31 -; GCN-NEXT: v_mov_b32_e32 v45, v31 -; GCN-NEXT: v_mov_b32_e32 v46, v31 -; GCN-NEXT: v_mov_b32_e32 v56, v31 -; GCN-NEXT: v_mov_b32_e32 v57, v31 -; GCN-NEXT: v_mov_b32_e32 v58, v31 -; GCN-NEXT: v_mov_b32_e32 v59, v31 -; GCN-NEXT: v_mov_b32_e32 v60, v31 -; GCN-NEXT: v_mov_b32_e32 v61, v31 -; GCN-NEXT: v_mov_b32_e32 v62, v31 -; GCN-NEXT: v_mov_b32_e32 v63, v31 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz .LBB144_2 -; GCN-NEXT: ; %bb.1: ; %if -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v31, v0, v3, 16 
-; GCN-NEXT: v_alignbit_b32 v32, v4, v5, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v33, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_alignbit_b32 v34, v0, v3, 16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_alignbit_b32 v35, v0, v3, 16 -; GCN-NEXT: v_alignbit_b32 v36, v4, v5, 16 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v48, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v37, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v50, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v38, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v51, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v39, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v52, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v49, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v60, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v59, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v45, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v62, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v40, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v63, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v53, 1.0, v0 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v61, 1.0, v0 -; GCN-NEXT: v_mul_f32_e32 v54, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v55, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v41, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v47 -; GCN-NEXT: v_mul_f32_e32 v42, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, 
v17 -; GCN-NEXT: v_mul_f32_e32 v43, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v44, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v46, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v56, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v57, 1.0, v29 -; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v58, 1.0, v11 -; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v12 -; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v13 -; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v14 -; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v15 -; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v16 -; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v17 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v18 -; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v19 -; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_lshrrev_b32_e32 v22, 16, v48 -; GCN-NEXT: v_lshrrev_b32_e32 v29, 16, v50 -; GCN-NEXT: v_lshrrev_b32_e32 v30, 16, v51 -; GCN-NEXT: 
v_lshrrev_b32_e32 v50, 16, v52 -; GCN-NEXT: v_lshrrev_b32_e32 v51, 16, v60 -; GCN-NEXT: v_lshrrev_b32_e32 v52, 16, v45 -; GCN-NEXT: v_lshrrev_b32_e32 v40, 16, v40 -; GCN-NEXT: v_lshrrev_b32_e32 v53, 16, v53 -; GCN-NEXT: v_lshrrev_b32_e32 v54, 16, v54 -; GCN-NEXT: v_lshrrev_b32_e32 v55, 16, v55 -; GCN-NEXT: v_lshrrev_b32_e32 v41, 16, v41 -; GCN-NEXT: v_lshrrev_b32_e32 v42, 16, v42 -; GCN-NEXT: v_lshrrev_b32_e32 v43, 16, v43 -; GCN-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GCN-NEXT: v_lshrrev_b32_e32 v45, 16, v46 -; GCN-NEXT: v_lshrrev_b32_e32 v46, 16, v56 -; GCN-NEXT: v_lshrrev_b32_e32 v47, 16, v57 -; GCN-NEXT: v_lshrrev_b32_e32 v56, 16, v58 -; GCN-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GCN-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GCN-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GCN-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GCN-NEXT: v_alignbit_b32 v37, v22, v37, 16 -; GCN-NEXT: v_alignbit_b32 v38, v29, v38, 16 -; GCN-NEXT: v_alignbit_b32 v48, v30, v39, 16 -; GCN-NEXT: v_alignbit_b32 v49, v50, v49, 16 -; GCN-NEXT: v_alignbit_b32 v50, v51, v59, 16 -; GCN-NEXT: v_alignbit_b32 v51, v52, v62, 16 -; GCN-NEXT: v_alignbit_b32 v52, v40, v63, 16 -; GCN-NEXT: v_alignbit_b32 v53, v53, v61, 16 -; GCN-NEXT: v_alignbit_b32 v54, v54, v0, 16 -; GCN-NEXT: v_alignbit_b32 v55, v55, v3, 16 -; GCN-NEXT: v_alignbit_b32 v39, v41, v4, 16 -; GCN-NEXT: v_alignbit_b32 v40, v42, v5, 16 -; GCN-NEXT: v_alignbit_b32 v41, v43, v6, 16 -; GCN-NEXT: v_alignbit_b32 v42, v44, v7, 16 -; GCN-NEXT: v_alignbit_b32 v43, v45, v8, 16 -; GCN-NEXT: v_alignbit_b32 v44, v46, v9, 16 -; GCN-NEXT: v_alignbit_b32 v45, v47, v10, 16 -; GCN-NEXT: v_alignbit_b32 v46, v56, v11, 16 -; GCN-NEXT: v_alignbit_b32 v56, v27, v12, 16 -; GCN-NEXT: v_alignbit_b32 v57, v26, v13, 16 -; GCN-NEXT: v_alignbit_b32 v58, v25, v14, 16 -; GCN-NEXT: v_alignbit_b32 v59, v28, v15, 16 -; GCN-NEXT: v_alignbit_b32 v60, v23, v16, 16 -; GCN-NEXT: v_alignbit_b32 v61, v24, v17, 16 -; GCN-NEXT: v_alignbit_b32 v62, v20, v18, 16 -; GCN-NEXT: v_alignbit_b32 v63, v21, v19, 16 -; GCN-NEXT: .LBB144_2: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dwordx4 v[60:63], v[1:2], s[4:7], 0 addr64 offset:112 -; GCN-NEXT: buffer_store_dwordx4 v[56:59], v[1:2], s[4:7], 0 addr64 offset:96 -; GCN-NEXT: buffer_store_dwordx4 v[43:46], v[1:2], s[4:7], 0 addr64 offset:80 -; GCN-NEXT: buffer_store_dwordx4 v[39:42], v[1:2], s[4:7], 0 addr64 offset:64 -; GCN-NEXT: buffer_store_dwordx4 v[52:55], v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_store_dwordx4 v[48:51], v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_store_dwordx4 v[35:38], v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_store_dwordx4 v[31:34], v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(6) -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(14) -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(5) -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt expcnt(4) -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: v_bitcast_v64bf16_to_v16f64: -; VI: ; %bb.0: ; %entry -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, 0 -; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: v_mov_b32_e32 v6, v3 -; VI-NEXT: v_mov_b32_e32 v7, v3 -; VI-NEXT: v_mov_b32_e32 v8, v3 -; VI-NEXT: v_mov_b32_e32 v9, v3 -; VI-NEXT: v_mov_b32_e32 v10, v3 -; VI-NEXT: v_mov_b32_e32 v11, v3 -; VI-NEXT: v_mov_b32_e32 v12, v3 -; VI-NEXT: v_mov_b32_e32 v13, v3 -; VI-NEXT: v_mov_b32_e32 v14, v3 -; VI-NEXT: v_mov_b32_e32 v15, v3 -; VI-NEXT: v_mov_b32_e32 v16, v3 -; VI-NEXT: v_mov_b32_e32 v17, v3 -; VI-NEXT: v_mov_b32_e32 v18, v3 -; VI-NEXT: v_mov_b32_e32 v19, v3 -; VI-NEXT: v_mov_b32_e32 v20, v3 -; VI-NEXT: v_mov_b32_e32 v21, v3 -; VI-NEXT: v_mov_b32_e32 v22, v3 -; VI-NEXT: v_mov_b32_e32 v23, v3 -; VI-NEXT: v_mov_b32_e32 v24, v3 -; VI-NEXT: v_mov_b32_e32 v25, v3 -; VI-NEXT: v_mov_b32_e32 v26, v3 -; VI-NEXT: v_mov_b32_e32 v27, v3 -; VI-NEXT: v_mov_b32_e32 v28, v3 -; VI-NEXT: v_mov_b32_e32 v29, v3 -; VI-NEXT: v_mov_b32_e32 v30, v3 -; VI-NEXT: v_mov_b32_e32 v31, v3 -; VI-NEXT: v_mov_b32_e32 v32, v3 -; VI-NEXT: v_mov_b32_e32 v33, v3 -; VI-NEXT: v_mov_b32_e32 v34, v3 -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_cbranch_execz .LBB144_2 -; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v23, off, 
s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: .LBB144_2: ; %end -; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x70, v1 -; VI-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: flat_store_dwordx4 v[35:36], v[31:34] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x60, v1 -; VI-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[31:32], v[27:30] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v27, vcc, 0x50, v1 -; VI-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[27:28], v[23:26] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 64, v1 -; VI-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[23:24], v[19:22] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 48, v1 -; VI-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[19:20], v[15:18] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 32, v1 -; VI-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[15:16], v[11:14] -; VI-NEXT: s_nop 0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 16, v1 -; VI-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc -; VI-NEXT: flat_store_dwordx4 v[11:12], v[7:10] -; VI-NEXT: flat_store_dwordx4 v[1:2], v[3:6] -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: v_bitcast_v64bf16_to_v16f64: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: 
buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, v3 -; GFX9-NEXT: v_mov_b32_e32 v6, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, v3 -; GFX9-NEXT: v_mov_b32_e32 v8, v3 -; GFX9-NEXT: v_mov_b32_e32 v9, v3 -; GFX9-NEXT: v_mov_b32_e32 v10, v3 -; GFX9-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-NEXT: v_mov_b32_e32 v12, v3 -; GFX9-NEXT: v_mov_b32_e32 v13, v3 -; GFX9-NEXT: v_mov_b32_e32 v14, v3 -; GFX9-NEXT: v_mov_b32_e32 v15, v3 -; GFX9-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-NEXT: v_mov_b32_e32 v17, v3 -; GFX9-NEXT: v_mov_b32_e32 v18, v3 -; GFX9-NEXT: v_mov_b32_e32 v19, v3 -; GFX9-NEXT: v_mov_b32_e32 v20, v3 -; GFX9-NEXT: v_mov_b32_e32 v21, v3 -; GFX9-NEXT: v_mov_b32_e32 v22, v3 -; GFX9-NEXT: v_mov_b32_e32 v23, v3 -; GFX9-NEXT: v_mov_b32_e32 v24, v3 -; GFX9-NEXT: v_mov_b32_e32 v25, v3 -; GFX9-NEXT: v_mov_b32_e32 v26, v3 -; GFX9-NEXT: v_mov_b32_e32 v27, v3 -; GFX9-NEXT: v_mov_b32_e32 v28, v3 -; GFX9-NEXT: v_mov_b32_e32 v29, v3 -; GFX9-NEXT: v_mov_b32_e32 v30, v3 -; GFX9-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-NEXT: v_mov_b32_e32 v32, v3 -; GFX9-NEXT: v_mov_b32_e32 v33, v3 -; GFX9-NEXT: v_mov_b32_e32 v34, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB144_2 -; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: .LBB144_2: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[31:34], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[27:30], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[23:26], off offset:80 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[19:22], off offset:64 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[15:18], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[11:14], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[7:10], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[1:2], v[3:6], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: v_bitcast_v64bf16_to_v16f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 
offset:68 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:16 -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_mov_b32_e32 v35, 0 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v36, v35 -; GFX11-NEXT: v_mov_b32_e32 v37, v35 -; GFX11-NEXT: v_mov_b32_e32 v38, v35 -; GFX11-NEXT: v_mov_b32_e32 v39, v35 -; GFX11-NEXT: v_mov_b32_e32 v40, v35 -; GFX11-NEXT: v_mov_b32_e32 v41, v35 -; GFX11-NEXT: v_mov_b32_e32 v42, v35 -; GFX11-NEXT: v_mov_b32_e32 v43, v35 -; GFX11-NEXT: v_mov_b32_e32 v44, v35 -; GFX11-NEXT: v_mov_b32_e32 v45, v35 -; GFX11-NEXT: v_mov_b32_e32 v46, v35 -; GFX11-NEXT: v_mov_b32_e32 v47, v35 -; GFX11-NEXT: v_mov_b32_e32 v48, v35 -; GFX11-NEXT: v_mov_b32_e32 v49, v35 -; GFX11-NEXT: v_mov_b32_e32 v50, v35 -; GFX11-NEXT: v_mov_b32_e32 v51, v35 -; GFX11-NEXT: v_mov_b32_e32 v52, v35 -; GFX11-NEXT: v_mov_b32_e32 v53, v35 -; GFX11-NEXT: v_mov_b32_e32 v54, v35 -; GFX11-NEXT: v_mov_b32_e32 v55, v35 -; GFX11-NEXT: v_mov_b32_e32 v56, v35 -; GFX11-NEXT: v_mov_b32_e32 v57, v35 -; GFX11-NEXT: v_mov_b32_e32 v58, v35 -; GFX11-NEXT: v_mov_b32_e32 v59, v35 -; GFX11-NEXT: v_mov_b32_e32 v60, v35 -; GFX11-NEXT: v_mov_b32_e32 v61, v35 -; GFX11-NEXT: v_mov_b32_e32 v62, v35 -; GFX11-NEXT: v_mov_b32_e32 v63, v35 -; GFX11-NEXT: v_mov_b32_e32 v64, v35 -; GFX11-NEXT: v_mov_b32_e32 v65, v35 -; GFX11-NEXT: v_mov_b32_e32 v66, v35 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_cbranch_execz .LBB144_2 -; GFX11-NEXT: ; %bb.1: ; %if -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v66, v34 :: v_dual_mov_b32 v65, v33 -; GFX11-NEXT: v_dual_mov_b32 v64, v32 :: v_dual_mov_b32 v63, v31 -; GFX11-NEXT: v_dual_mov_b32 v62, v30 :: v_dual_mov_b32 v61, v29 -; GFX11-NEXT: v_dual_mov_b32 v60, v28 :: v_dual_mov_b32 v59, v27 -; GFX11-NEXT: v_dual_mov_b32 v58, v26 :: v_dual_mov_b32 v57, v25 -; GFX11-NEXT: v_dual_mov_b32 v56, v24 :: v_dual_mov_b32 v55, v23 -; GFX11-NEXT: v_dual_mov_b32 v54, v22 :: v_dual_mov_b32 v53, v21 -; GFX11-NEXT: v_dual_mov_b32 v52, v20 :: v_dual_mov_b32 v51, v19 -; GFX11-NEXT: v_dual_mov_b32 v50, v18 :: v_dual_mov_b32 v49, v17 -; GFX11-NEXT: v_dual_mov_b32 v48, v16 :: v_dual_mov_b32 v47, v15 -; GFX11-NEXT: v_dual_mov_b32 v46, v14 :: v_dual_mov_b32 v45, v13 -; GFX11-NEXT: v_dual_mov_b32 v44, v12 :: v_dual_mov_b32 v43, v11 -; GFX11-NEXT: v_dual_mov_b32 v42, v10 :: v_dual_mov_b32 v41, v9 -; GFX11-NEXT: v_dual_mov_b32 v40, v8 :: v_dual_mov_b32 v39, v7 -; GFX11-NEXT: v_dual_mov_b32 v38, v6 :: v_dual_mov_b32 v37, v5 -; GFX11-NEXT: v_dual_mov_b32 v36, v4 :: v_dual_mov_b32 v35, v3 -; GFX11-NEXT: .LBB144_2: ; %end -; GFX11-NEXT: s_or_b32 
exec_lo, exec_lo, s0 -; GFX11-NEXT: s_clause 0x7 -; GFX11-NEXT: global_store_b128 v[1:2], v[63:66], off offset:112 -; GFX11-NEXT: global_store_b128 v[1:2], v[59:62], off offset:96 -; GFX11-NEXT: global_store_b128 v[1:2], v[55:58], off offset:80 -; GFX11-NEXT: global_store_b128 v[1:2], v[51:54], off offset:64 -; GFX11-NEXT: global_store_b128 v[1:2], v[47:50], off offset:48 -; GFX11-NEXT: global_store_b128 v[1:2], v[43:46], off offset:32 -; GFX11-NEXT: global_store_b128 v[1:2], v[39:42], off offset:16 -; GFX11-NEXT: global_store_b128 v[1:2], v[35:38], off -; GFX11-NEXT: s_clause 0xf -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:76 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %cmp0 = icmp eq i32 %cond, 0 - br i1 %cmp0, label %if, label %end - -if: - %cast = bitcast <64 x bfloat> %value to <16 x double> - br label %end - -end: - %phi = phi <16 x double> [zeroinitializer, %entry], [%cast, %if] - store <16 x double> %phi, ptr addrspace(1) %out - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll new file mode 100644 index 0000000000000..eb4429958dfa5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s + +define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: bitcast_i8ptr_v16i8ptr: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; VI-LABEL: bitcast_i8ptr_v16i8ptr: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: bitcast_i8ptr_v16i8ptr: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX11-LABEL: bitcast_i8ptr_v16i8ptr: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +entry: + %0 = load <16 x i8>, ptr addrspace(1) %in + store <16 x i8> %0, ptr addrspace(1) %out + ret void +} From bdd087023f02fb377302595bf7f61e9cae8adb71 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 8 Apr 2025 05:48:55 +0200 Subject: [PATCH 0937/1029] [clang][bytecode] Fix various issues with multidimensional arrays (#134628) The underlying issue is convoluted, but in essence the new rules are: For a Pointer P that points to the root of a multidimensional, primitive array: `P.narrow()` does nothing. `P.atIndex(0)` points to `P[0]`. `P.atIndex(0).atIndex(0)` is the same as `P.atIndex(0)` (as before). `P.atIndex(0).narrow().atIndex(0)` points to `P[0][0]`. `P.atIndex(0).narrow().narrow()` is the same as `P.atIndex(0).narrow()`. (A short constexpr illustration of these rules follows the Pointer.h hunk below.) --- clang/lib/AST/ByteCode/Compiler.cpp | 3 +- clang/lib/AST/ByteCode/Interp.h | 31 ++++++++--- clang/lib/AST/ByteCode/Pointer.h | 35 +++++------- clang/test/AST/ByteCode/arrays.cpp | 84 ++++++++++++++++++++++++++++- 4 files changed, 123 insertions(+), 30 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 021acbd798646..dd246f7ef74fc 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6148,7 +6148,8 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) { if (!this->visit(SubExpr)) return false; - if (classifyPrim(SubExpr) == PT_Ptr && !E->getType()->isArrayType()) + + if (classifyPrim(SubExpr) == PT_Ptr) return this->emitNarrowPtr(E); return true; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 6fe1d4b1f95ae..ee69cea039990 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -2059,8 +2059,11 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset, // useful thing we can do. Any other index has been diagnosed before and // we don't get here.
if (Result == 0 && Ptr.isOnePastEnd()) { - S.Stk.push<Pointer>(Ptr.asBlockPointer().Pointee, - Ptr.asBlockPointer().Base); + if (Ptr.getFieldDesc()->isArray()) + S.Stk.push<Pointer>(Ptr.atIndex(0)); + else + S.Stk.push<Pointer>(Ptr.asBlockPointer().Pointee, + Ptr.asBlockPointer().Base); return true; } @@ -2677,8 +2680,16 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { return false; } - if (!OffsetHelper(S, OpPC, Offset, Ptr)) - return false; + if (Offset.isZero()) { + if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) { + S.Stk.push<Pointer>(Ptr.atIndex(0)); + } else { + S.Stk.push<Pointer>(Ptr); + } + } else { + if (!OffsetHelper(S, OpPC, Offset, Ptr)) + return false; + } return NarrowPtr(S, OpPC); } @@ -2693,8 +2704,16 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { return false; } - if (!OffsetHelper(S, OpPC, Offset, Ptr)) - return false; + if (Offset.isZero()) { + if (Ptr.getFieldDesc()->isArray() && Ptr.getIndex() == 0) { + S.Stk.push<Pointer>(Ptr.atIndex(0)); + } else { + S.Stk.push<Pointer>(Ptr); + } + } else { + if (!OffsetHelper(S, OpPC, Offset, Ptr)) + return false; + } return NarrowPtr(S, OpPC); } diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index 988237d39fff4..64af5ed9b0a5d 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -200,37 +200,28 @@ class Pointer { if (isZero() || isUnknownSizeArray()) return *this; + unsigned Base = asBlockPointer().Base; // Pointer to an array of base types - enter block. - if (asBlockPointer().Base == RootPtrMark) + if (Base == RootPtrMark) return Pointer(asBlockPointer().Pointee, sizeof(InlineDescriptor), Offset == 0 ? Offset : PastEndMark); // Pointer is one past end - magic offset marks that. if (isOnePastEnd()) - return Pointer(asBlockPointer().Pointee, asBlockPointer().Base, - PastEndMark); - - // Primitive arrays are a bit special since they do not have inline - // descriptors. If Offset != Base, then the pointer already points to - // an element and there is nothing to do. Otherwise, the pointer is - // adjusted to the first element of the array. - if (inPrimitiveArray()) { - if (Offset != asBlockPointer().Base) + return Pointer(asBlockPointer().Pointee, Base, PastEndMark); + + if (Offset != Base) { + // If we're pointing to a primitive array element, there's nothing to do. + if (inPrimitiveArray()) return *this; - return Pointer(asBlockPointer().Pointee, asBlockPointer().Base, - Offset + sizeof(InitMapPtr)); + // Pointer is to a composite array element - enter it. + if (Offset != Base) + return Pointer(asBlockPointer().Pointee, Offset, Offset); } - // Pointer is to a field or array element - enter it. - if (Offset != asBlockPointer().Base) - return Pointer(asBlockPointer().Pointee, Offset, Offset); - - // Enter the first element of an array. - if (!getFieldDesc()->isArray()) - return *this; - - const unsigned NewBase = asBlockPointer().Base + sizeof(InlineDescriptor); - return Pointer(asBlockPointer().Pointee, NewBase, NewBase); + // Otherwise, we're pointing to a non-array element or + // are already narrowed to a composite array element. Nothing to do. + return *this; } /// Expands a pointer to the containing array, undoing narrowing.
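As a constexpr illustration of the rules above (a minimal sketch; it mirrors the zs and ZeroIndex coverage in the arrays.cpp hunk that follows, and adds no semantics beyond it):

  constexpr int zs[2][2][2][2] = { 1,  2,  3,  4,  5,  6,  7,  8,
                                   9, 10, 11, 12, 13, 14, 15, 16};
  // *(zs + 1) narrows to zs[1]; a net-zero offset such as "+ 2 - 2" must then
  // address element 0 of the narrowed array instead of re-entering the root:
  static_assert(*(*(*(*(zs + 1) + 1) + 2 - 2) + 2 - 2) == 13, ""); // zs[1][1][0][0]

  constexpr char foo(const char *a) { return a[0]; }
  constexpr const char *f = "abc";
  static_assert(foo(f + 1) == 'b', ""); // index 0 of an already-offset pointer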
diff --git a/clang/test/AST/ByteCode/arrays.cpp b/clang/test/AST/ByteCode/arrays.cpp index 2ef0cf886b2dc..8af82163fd815 100644 --- a/clang/test/AST/ByteCode/arrays.cpp +++ b/clang/test/AST/ByteCode/arrays.cpp @@ -637,11 +637,93 @@ static_assert(get2() == same_entity_2, "failed to find previous decl"); constexpr int zs[2][2][2][2] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; constexpr int fail(const int &p) { - return (&p)[64]; // both-note {{cannot refer to element 64 of array of 2 elements}} + return (&p)[64]; // both-note 2{{cannot refer to element 64 of array of 2 elements}} \ + // both-note {{cannot refer to element 65 of array of 2 elements}} \ + // both-note {{cannot refer to element 66 of array of 2 elements}} } static_assert(fail(*(&(&(*(*&(&zs[2] - 1)[0] + 2 - 2))[2])[-1][2] - 2)) == 11, ""); // both-error {{not an integral constant expression}} \ // both-note {{in call to}} + +static_assert(fail( // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'fail(zs[1][1][0][0])'}} + *(*(*((* + (zs + 1)) /// int[2][2][2] + + 1) /// int[2][2] + + 2 - 2) /// int[2] + + 2 - 2) /// int + )); + +static_assert(fail( // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'fail(zs[1][0][0][1])'}} + *(*(*((* + (zs + 1)) /// int[2][2][2] + + 0) /// int[2][2] + + 2 - 2) /// int[2] + + 1) /// int + )); + +static_assert(fail( // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'fail(zs[1][0][0][2])'}} + *(*(*((* + (zs + 1)) /// int[2][2][2] + + 0) /// int[2][2] + + 2 - 2) /// int[2] + + 2) /// int + )); + +namespace ZeroIndex { + constexpr char foo(const char *a) { + return a[0]; + } + constexpr const char *f = "abc"; + static_assert(foo(f + 1) == 'b', ""); +} + +namespace MultiDimArrayOffset { +#define assert(x) (x ? void(0) : __builtin_abort()) + struct R { + int a; + }; + + template <typename T> + class view { + public: + T* V; + T* current; + + constexpr view(T*V) : V(V), current(V) {} + + constexpr void operator+=(unsigned N) { + current += N; + } + + constexpr auto operator*() { + return *current; + } + + }; + + constexpr int foo() { + R buffer[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}}; + + auto A = buffer; + A += 1; + assert((**A).a == 5); + assert(buffer == buffer + 1 - 1); + + assert(--A+0 == buffer+0); + + view V(buffer); + assert(*V == &buffer[0][0]); + V += 1; + assert(*V == &buffer[1][0]); + assert(*(V.current) == &buffer[1][0]); + return 1; + } + static_assert(foo() == 1, ""); +} + namespace ZeroSizeTypes { constexpr int (*p1)[0] = 0, (*p2)[0] = 0; constexpr int k = p2 - p1; // both-error {{constexpr variable 'k' must be initialized by a constant expression}} \ From fb9915a3918e3a9659a7f2825ee35bada3a2baf1 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 8 Apr 2025 06:00:35 +0200 Subject: [PATCH 0938/1029] [clang][bytecode] Fix emitDestruction() for dummy descriptors (#134665) This can happen if the referenced declaration is invalid and thus gets a dummy descriptor; we previously ran into an assertion later on.
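A reduced sketch of the failure mode (it mirrors the cxx17.cpp test added below; the exact spelling of the reduction is illustrative):

  template <int> struct i;   // declared, never defined
  template <int N> constexpr auto c() {
    i<N> g;                  // invalid: undefined template, so 'g' gets a dummy descriptor
    return 0;
  }
  auto y = c<1>();           // destroying 'g' previously hit the assertion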
--- clang/lib/AST/ByteCode/Compiler.cpp | 4 ++++ clang/test/AST/ByteCode/cxx17.cpp | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index dd246f7ef74fc..db87ea1b6016f 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6801,6 +6801,10 @@ bool Compiler<Emitter>::emitDestruction(const Descriptor *Desc, assert(!Desc->isPrimitive()); assert(!Desc->isPrimitiveArray()); + // Can happen if the decl is invalid. + if (Desc->isDummy()) + return true; + // Arrays. if (Desc->isArray()) { const Descriptor *ElemDesc = Desc->ElemDesc; diff --git a/clang/test/AST/ByteCode/cxx17.cpp b/clang/test/AST/ByteCode/cxx17.cpp index ecb8a395520a0..9453906579f04 100644 --- a/clang/test/AST/ByteCode/cxx17.cpp +++ b/clang/test/AST/ByteCode/cxx17.cpp @@ -125,3 +125,14 @@ namespace constant { } static_assert(f()); } + + +template <int> struct i; // both-note {{template is declared here}} +template <> struct i<0> {}; + +template <int N> constexpr auto c() { + i<N> g; // both-error {{implicit instantiation of undefined template 'i<1>'}} + return 0; +} + +auto y = c<1>(); // both-note {{in instantiation of function template specialization 'c<1>' requested here}} From 65cede26a6b06ba02c08284fada06c46c0289704 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 8 Apr 2025 06:09:21 +0200 Subject: [PATCH 0939/1029] [clang][bytecode] Fix emitting dtors of zero-sized arrays (#134672) Desc->getNumElems() returning 0 made us underflow here. --- clang/lib/AST/ByteCode/Compiler.cpp | 20 +++++++++++--------- clang/test/AST/ByteCode/cxx23.cpp | 12 ++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index db87ea1b6016f..e4f87d8b2af04 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6823,15 +6823,17 @@ bool Compiler<Emitter>::emitDestruction(const Descriptor *Desc, return true; } - for (ssize_t I = Desc->getNumElems() - 1; I >= 0; --I) { - if (!this->emitConstUint64(I, Loc)) - return false; - if (!this->emitArrayElemPtrUint64(Loc)) - return false; - if (!this->emitDestruction(ElemDesc, Loc)) - return false; - if (!this->emitPopPtr(Loc)) - return false; + if (size_t N = Desc->getNumElems()) { + for (ssize_t I = N - 1; I >= 0; --I) { + if (!this->emitConstUint64(I, Loc)) + return false; + if (!this->emitArrayElemPtrUint64(Loc)) + return false; + if (!this->emitDestruction(ElemDesc, Loc)) + return false; + if (!this->emitPopPtr(Loc)) + return false; + } } return true; } diff --git a/clang/test/AST/ByteCode/cxx23.cpp b/clang/test/AST/ByteCode/cxx23.cpp index 6a62ac11cde79..d0ade4f5278b1 100644 --- a/clang/test/AST/ByteCode/cxx23.cpp +++ b/clang/test/AST/ByteCode/cxx23.cpp @@ -304,3 +304,15 @@ namespace NonLiteralDtorInParam { // expected23-note {{non-constexpr function '~NonLiteral' cannot be used in a constant expression}} } } + +namespace ZeroSizedArray { + struct S { + constexpr ~S() { + } + }; + constexpr int foo() { + S s[0]; + return 1; + } + static_assert(foo() == 1); +} From 642481a4286c9006958274531ee173b347866c50 Mon Sep 17 00:00:00 2001 From: Aniket Lal Date: Tue, 8 Apr 2025 10:29:30 +0530 Subject: [PATCH 0940/1029] [Clang][OpenCL][AMDGPU] Allow a kernel to call another kernel (#115821) Calling one kernel from another is currently not supported in the compiler.
To facilitate this, we emit a stub version of each kernel function body
under a different name-mangling scheme and replace the respective kernel
call-sites appropriately.

Fixes https://github.com/llvm/llvm-project/issues/60313

D120566 was an earlier attempt made to upstream a solution for this
issue.

---------

Co-authored-by: anikelal
---
 clang/include/clang/AST/Decl.h | 2 +
 clang/include/clang/AST/GlobalDecl.h | 18 +-
 clang/lib/AST/Decl.cpp | 4 +
 clang/lib/AST/Expr.cpp | 6 +-
 clang/lib/AST/ItaniumMangle.cpp | 15 +
 clang/lib/AST/Mangle.cpp | 6 +-
 clang/lib/AST/MicrosoftMangle.cpp | 6 +
 clang/lib/CodeGen/CGCall.cpp | 21 +-
 clang/lib/CodeGen/CGExpr.cpp | 12 +-
 clang/lib/CodeGen/CodeGenFunction.cpp | 20 +
 clang/lib/CodeGen/CodeGenModule.cpp | 11 +
 clang/lib/CodeGen/CodeGenTypes.h | 2 +-
 clang/lib/CodeGen/TargetInfo.cpp | 6 +
 clang/lib/CodeGen/TargetInfo.h | 2 +-
 clang/lib/CodeGen/Targets/SPIR.cpp | 8 +
 .../CodeGenOpenCL/addr-space-struct-arg.cl | 2501 ++++++++++-------
 .../amdgpu-abi-struct-arg-byref.cl | 289 +-
 .../CodeGenOpenCL/amdgpu-abi-struct-coerce.cl | 2 +-
 .../CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 447 +--
 clang/test/CodeGenOpenCL/amdgpu-printf.cl | 91 +-
 .../test/CodeGenOpenCL/cl-uniform-wg-size.cl | 14 +-
 .../cl20-device-side-enqueue-attributes.cl | 84 +-
 .../CodeGenOpenCL/cl20-device-side-enqueue.cl | 26 +-
 .../enqueue-kernel-non-entry-block.cl | 13 +-
 ...plicit-addrspacecast-function-parameter.cl | 71 +-
 clang/test/CodeGenOpenCL/kernel-arg-info.cl | 10 +
 .../test/CodeGenOpenCL/opencl-kernel-call.cl | 959 +++++++
 clang/test/CodeGenOpenCL/reflect.cl | 29 +-
 clang/test/CodeGenOpenCL/sampler.cl | 4 +
 clang/test/CodeGenOpenCL/spir-calling-conv.cl | 9 +-
 clang/test/CodeGenOpenCL/visibility.cl | 53 +-
 .../CodeGenOpenCLCXX/addrspace-of-this.clcpp | 7 +-
 .../amdgcn-machine-analysis-remarks.cl | 2 +-
 33 files changed, 3375 insertions(+), 1375 deletions(-)
 create mode 100644 clang/test/CodeGenOpenCL/opencl-kernel-call.cl

diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index ff1d3497b77c3..798f112ce7200 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -3048,6 +3048,8 @@ class FunctionDecl : public DeclaratorDecl,
   static FunctionDecl *castFromDeclContext(const DeclContext *DC) {
     return static_cast<FunctionDecl *>(const_cast<DeclContext *>(DC));
   }
+
+  bool isReferenceableKernel() const;
 };
 
 /// Represents a member of a struct/union/class.
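isReferenceableKernel() is the predicate the rest of this patch pivots on.
Its definition appears in the Decl.cpp hunk further down, though this
rendering has dropped the template arguments of hasAttr(); a minimal
reconstruction, assuming the upstream attribute names CUDAGlobalAttr (CUDA
__global__) and OpenCLKernelAttr (OpenCL kernel):

  bool FunctionDecl::isReferenceableKernel() const {
    // A "referenceable" kernel can be referred to either as the device
    // entry point or through its callable stub, so a GlobalDecl for it
    // carries a KernelReferenceKind (Kernel vs. Stub).
    return hasAttr<CUDAGlobalAttr>() || hasAttr<OpenCLKernelAttr>();
  }

This extends the kernel/stub distinction that GlobalDecl already tracked for
CUDA to OpenCL kernels, as the GlobalDecl.h hunk that follows shows.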
diff --git a/clang/include/clang/AST/GlobalDecl.h b/clang/include/clang/AST/GlobalDecl.h index 386693cabb1fb..df11a79a56b3b 100644 --- a/clang/include/clang/AST/GlobalDecl.h +++ b/clang/include/clang/AST/GlobalDecl.h @@ -70,15 +70,15 @@ class GlobalDecl { GlobalDecl(const VarDecl *D) { Init(D);} GlobalDecl(const FunctionDecl *D, unsigned MVIndex = 0) : MultiVersionIndex(MVIndex) { - if (!D->hasAttr()) { - Init(D); + if (D->isReferenceableKernel()) { + Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D))); return; } - Value.setPointerAndInt(D, unsigned(getDefaultKernelReference(D))); + Init(D); } GlobalDecl(const FunctionDecl *D, KernelReferenceKind Kind) : Value(D, unsigned(Kind)) { - assert(D->hasAttr() && "Decl is not a GPU kernel!"); + assert(D->isReferenceableKernel() && "Decl is not a GPU kernel!"); } GlobalDecl(const NamedDecl *D) { Init(D); } GlobalDecl(const BlockDecl *D) { Init(D); } @@ -131,12 +131,13 @@ class GlobalDecl { KernelReferenceKind getKernelReferenceKind() const { assert(((isa(getDecl()) && - cast(getDecl())->hasAttr()) || + cast(getDecl())->isReferenceableKernel()) || (isa(getDecl()) && cast(getDecl()) ->getTemplatedDecl() ->hasAttr())) && "Decl is not a GPU kernel!"); + return static_cast(Value.getInt()); } @@ -160,8 +161,9 @@ class GlobalDecl { } static KernelReferenceKind getDefaultKernelReference(const FunctionDecl *D) { - return D->getLangOpts().CUDAIsDevice ? KernelReferenceKind::Kernel - : KernelReferenceKind::Stub; + return (D->hasAttr() || D->getLangOpts().CUDAIsDevice) + ? KernelReferenceKind::Kernel + : KernelReferenceKind::Stub; } GlobalDecl getWithDecl(const Decl *D) { @@ -197,7 +199,7 @@ class GlobalDecl { GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind) { assert(isa(getDecl()) && - cast(getDecl())->hasAttr() && + cast(getDecl())->isReferenceableKernel() && "Decl is not a GPU kernel!"); GlobalDecl Result(*this); Result.Value.setInt(unsigned(Kind)); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 0e4d69392e8c7..83116ecc0f47b 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -5468,6 +5468,10 @@ FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID) { /*TrailingRequiresClause=*/{}); } +bool FunctionDecl::isReferenceableKernel() const { + return hasAttr() || hasAttr(); +} + BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) { return new (C, DC) BlockDecl(DC, L); } diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index e8e5f2fa0cc12..4deed08d693ac 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -695,9 +695,9 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, GD = GlobalDecl(CD, Ctor_Base); else if (const CXXDestructorDecl *DD = dyn_cast(ND)) GD = GlobalDecl(DD, Dtor_Base); - else if (ND->hasAttr()) - GD = GlobalDecl(cast(ND)); - else + else if (auto FD = dyn_cast(ND)) { + GD = FD->isReferenceableKernel() ? 
GlobalDecl(FD) : GlobalDecl(ND); + } else GD = GlobalDecl(ND); MC->mangleName(GD, Out); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index fdd84d0bf7c5c..eb25b19bbdc74 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -526,6 +526,7 @@ class CXXNameMangler { void mangleSourceName(const IdentifierInfo *II); void mangleRegCallName(const IdentifierInfo *II); void mangleDeviceStubName(const IdentifierInfo *II); + void mangleOCLDeviceStubName(const IdentifierInfo *II); void mangleSourceNameWithAbiTags( const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr); void mangleLocalName(GlobalDecl GD, @@ -1561,8 +1562,13 @@ void CXXNameMangler::mangleUnqualifiedName( bool IsDeviceStub = FD && FD->hasAttr() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; + bool IsOCLDeviceStub = + FD && FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub; if (IsDeviceStub) mangleDeviceStubName(II); + else if (IsOCLDeviceStub) + mangleOCLDeviceStubName(II); else if (IsRegCall) mangleRegCallName(II); else @@ -1780,6 +1786,15 @@ void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) { << II->getName(); } +void CXXNameMangler::mangleOCLDeviceStubName(const IdentifierInfo *II) { + // ::= __clang_ocl_kern_imp_ + // ::= [n] + // ::= + StringRef OCLDeviceStubNamePrefix = "__clang_ocl_kern_imp_"; + Out << II->getLength() + OCLDeviceStubNamePrefix.size() + << OCLDeviceStubNamePrefix << II->getName(); +} + void CXXNameMangler::mangleSourceName(const IdentifierInfo *II) { // ::= // ::= [n] diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index b44ab23f1d0e1..141957c1cdce0 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -540,9 +540,9 @@ class ASTNameGenerator::Implementation { GD = GlobalDecl(CtorD, Ctor_Complete); else if (const auto *DtorD = dyn_cast(D)) GD = GlobalDecl(DtorD, Dtor_Complete); - else if (D->hasAttr()) - GD = GlobalDecl(cast(D)); - else + else if (const FunctionDecl *FD = dyn_cast(D)) { + GD = FD->isReferenceableKernel() ? GlobalDecl(FD) : GlobalDecl(D); + } else GD = GlobalDecl(D); MC->mangleName(GD, OS); return false; diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 7e964124a9fec..4d14614fc1ec7 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -1162,9 +1162,15 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD, ->getTemplatedDecl() ->hasAttr())) && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; + bool IsOCLDeviceStub = + ND && isa(ND) && ND->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub; if (IsDeviceStub) mangleSourceName( (llvm::Twine("__device_stub__") + II->getName()).str()); + else if (IsOCLDeviceStub) + mangleSourceName( + (llvm::Twine("__clang_ocl_kern_imp_") + II->getName()).str()); else mangleSourceName(II->getName()); break; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index b202255c3a15b..b25cdf9523ae1 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -499,7 +499,8 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, /// Arrange the argument and result information for the declaration or /// definition of the given function. 
const CGFunctionInfo & -CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { +CodeGenTypes::arrangeFunctionDeclaration(const GlobalDecl GD) { + const FunctionDecl *FD = cast(GD.getDecl()); if (const CXXMethodDecl *MD = dyn_cast(FD)) if (MD->isImplicitObjectMemberFunction()) return arrangeCXXMethodDeclaration(MD); @@ -509,6 +510,13 @@ CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { assert(isa(FTy)); setCUDAKernelCallingConvention(FTy, CGM, FD); + if (FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + const FunctionType *FT = FTy->getAs(); + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT); + FTy = FT->getCanonicalTypeUnqualified(); + } + // When declaring a function without a prototype, always use a // non-variadic type. if (CanQual noProto = FTy.getAs()) { @@ -581,13 +589,11 @@ CodeGenTypes::arrangeUnprototypedObjCMessageSend(QualType returnType, const CGFunctionInfo & CodeGenTypes::arrangeGlobalDeclaration(GlobalDecl GD) { // FIXME: Do we need to handle ObjCMethodDecl? - const FunctionDecl *FD = cast(GD.getDecl()); - if (isa(GD.getDecl()) || isa(GD.getDecl())) return arrangeCXXStructorDeclaration(GD); - return arrangeFunctionDeclaration(FD); + return arrangeFunctionDeclaration(GD); } /// Arrange a thunk that takes 'this' as the first parameter followed by @@ -2391,7 +2397,6 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, // Collect function IR attributes from the callee prototype if we have one. AddAttributesFromFunctionProtoType(getContext(), FuncAttrs, CalleeInfo.getCalleeFunctionProtoType()); - const Decl *TargetDecl = CalleeInfo.getCalleeDecl().getDecl(); // Attach assumption attributes to the declaration. If this is a call @@ -2498,7 +2503,11 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, NumElemsParam); } - if (TargetDecl->hasAttr()) { + if (TargetDecl->hasAttr() && + CallingConv != CallingConv::CC_C && + CallingConv != CallingConv::CC_SpirFunction) { + // Check CallingConv to avoid adding uniform-work-group-size attribute to + // OpenCL Kernel Stub if (getLangOpts().OpenCLVersion <= 120) { // OpenCL v1.2 Work groups are always uniform FuncAttrs.addAttribute("uniform-work-group-size", "true"); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 7fe2d9582178e..de7c577a23493 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5752,6 +5752,12 @@ static CGCallee EmitDirectCallee(CodeGenFunction &CGF, GlobalDecl GD) { return CGCallee::forDirect(CalleePtr, GD); } +static GlobalDecl getGlobalDeclForDirectCall(const FunctionDecl *FD) { + if (FD->hasAttr()) + return GlobalDecl(FD, KernelReferenceKind::Stub); + return GlobalDecl(FD); +} + CGCallee CodeGenFunction::EmitCallee(const Expr *E) { E = E->IgnoreParens(); @@ -5765,7 +5771,7 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) { // Resolve direct calls. 
} else if (auto DRE = dyn_cast(E)) { if (auto FD = dyn_cast(DRE->getDecl())) { - return EmitDirectCallee(*this, FD); + return EmitDirectCallee(*this, getGlobalDeclForDirectCall(FD)); } } else if (auto ME = dyn_cast(E)) { if (auto FD = dyn_cast(ME->getMemberDecl())) { @@ -6134,6 +6140,10 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType, const auto *FnType = cast(PointeeType); + if (const auto *FD = dyn_cast_or_null(TargetDecl); + FD && FD->hasAttr()) + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FnType); + // If we are checking indirect calls and this call is indirect, check that the // function pointer is a member of the bit set for the function type. if (SanOpts.has(SanitizerKind::CFIICall) && diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index b55003b9b0bbb..232d48141a0c0 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -1595,6 +1595,26 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn, // Implicit copy-assignment gets the same special treatment as implicit // copy-constructors. emitImplicitAssignmentOperatorBody(Args); + } else if (FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Kernel) { + CallArgList CallArgs; + for (unsigned i = 0; i < Args.size(); ++i) { + Address ArgAddr = GetAddrOfLocalVar(Args[i]); + QualType ArgQualType = Args[i]->getType(); + RValue ArgRValue = convertTempToRValue(ArgAddr, ArgQualType, Loc); + CallArgs.add(ArgRValue, ArgQualType); + } + GlobalDecl GDStub = GlobalDecl(FD, KernelReferenceKind::Stub); + const FunctionType *FT = cast(FD->getType()); + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT); + const CGFunctionInfo &FnInfo = CGM.getTypes().arrangeFreeFunctionCall( + CallArgs, FT, /*ChainCall=*/false); + llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FnInfo); + llvm::Constant *GDStubFunctionPointer = + CGM.getRawFunctionPointer(GDStub, FTy); + CGCallee GDStubCallee = CGCallee::forDirect(GDStubFunctionPointer, GDStub); + EmitCall(FnInfo, GDStubCallee, ReturnValueSlot(), CallArgs, nullptr, false, + Loc); } else if (Body) { EmitFunctionBody(Body); } else diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 8f9cf965af2b9..0154799498f5e 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1903,6 +1903,9 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD, } else if (FD && FD->hasAttr() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { Out << "__device_stub__" << II->getName(); + } else if (FD && FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + Out << "__clang_ocl_kern_imp_" << II->getName(); } else { Out << II->getName(); } @@ -3890,6 +3893,9 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { // Ignore declarations, they will be emitted on their first use. if (const auto *FD = dyn_cast(Global)) { + if (FD->hasAttr() && FD->doesThisDeclarationHaveABody()) + addDeferredDeclToEmit(GlobalDecl(FD, KernelReferenceKind::Stub)); + // Update deferred annotations with the latest declaration if the function // function was already used or defined. 
if (FD->hasAttr()) { @@ -4857,6 +4863,11 @@ CodeGenModule::GetAddrOfFunction(GlobalDecl GD, llvm::Type *Ty, bool ForVTable, if (!Ty) { const auto *FD = cast(GD.getDecl()); Ty = getTypes().ConvertType(FD->getType()); + if (FD->hasAttr() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD); + Ty = getTypes().GetFunctionType(FI); + } } // Devirtualized destructor calls may come through here instead of via diff --git a/clang/lib/CodeGen/CodeGenTypes.h b/clang/lib/CodeGen/CodeGenTypes.h index 5aebf9a212237..307048bcc510d 100644 --- a/clang/lib/CodeGen/CodeGenTypes.h +++ b/clang/lib/CodeGen/CodeGenTypes.h @@ -207,7 +207,7 @@ class CodeGenTypes { /// Free functions are functions that are compatible with an ordinary /// C function pointer type. - const CGFunctionInfo &arrangeFunctionDeclaration(const FunctionDecl *FD); + const CGFunctionInfo &arrangeFunctionDeclaration(const GlobalDecl GD); const CGFunctionInfo &arrangeFreeFunctionCall(const CallArgList &Args, const FunctionType *Ty, bool ChainCall); diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 64a9a5554caf7..981488eb4dc37 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -117,6 +117,12 @@ unsigned TargetCodeGenInfo::getOpenCLKernelCallingConv() const { return llvm::CallingConv::SPIR_KERNEL; } +void TargetCodeGenInfo::setOCLKernelStubCallingConvention( + const FunctionType *&FT) const { + FT = getABIInfo().getContext().adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_C)); +} + llvm::Constant *TargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const { return llvm::ConstantPointerNull::get(T); diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index 5df19fbef1e5b..ef4e500f7a38c 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -400,7 +400,7 @@ class TargetCodeGenInfo { virtual bool shouldEmitDWARFBitFieldSeparators() const { return false; } virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const {} - + virtual void setOCLKernelStubCallingConvention(const FunctionType *&FT) const; /// Return the device-side type for the CUDA device builtin surface type. virtual llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const { // By default, no change from the original one. 
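The default setOCLKernelStubCallingConvention() above retargets the stub's
function type to the plain C convention; the SPIR/SPIR-V override in the
next hunk uses CC_SpirFunction instead, since functions carrying a kernel
calling convention such as spir_kernel cannot be called directly from other
device code. End to end, the mechanism looks like this on a hypothetical
kernel pair (names invented for illustration; the generated symbols follow
the mangling added earlier in this patch):

  // caller.cl (hypothetical)
  kernel void callee(global int *p) { p[0] = 1; }
  kernel void caller(global int *p) { callee(p); }

Each kernel keeps its entry point (@callee, @caller) with the kernel calling
convention, and its body merely forwards to a callable stub
(@__clang_ocl_kern_imp_callee, @__clang_ocl_kern_imp_caller); the
kernel-to-kernel call inside caller is emitted as a direct call to
@__clang_ocl_kern_imp_callee, so no kernel entry point ever calls another
kernel entry point directly. Where Itanium C++ mangling applies (e.g. C++
for OpenCL), the stub's <source-name> gets a length prefix covering prefix
plus identifier, e.g. strlen("__clang_ocl_kern_imp_") + strlen("callee") =
21 + 6 = 27; plain OpenCL C names just gain the prefix, as in
@__clang_ocl_kern_imp_ker in the tests below.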
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 225d9dfbd980b..f35c124f50aa0 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -58,6 +58,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo { llvm::Type *getSPIRVImageTypeFromHLSLResource( const HLSLAttributedResourceType::Attributes &attributes, llvm::Type *ElementType, llvm::LLVMContext &Ctx) const; + void + setOCLKernelStubCallingConvention(const FunctionType *&FT) const override; }; class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { public: @@ -230,6 +232,12 @@ void SPIRVTargetCodeGenInfo::setCUDAKernelCallingConvention( } } +void CommonSPIRTargetCodeGenInfo::setOCLKernelStubCallingConvention( + const FunctionType *&FT) const { + FT = getABIInfo().getContext().adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_SpirFunction)); +} + LangAS SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const { diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index effdeb9546800..789aae7a5c34c 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 // RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -check-prefixes=X86 %s // RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN %s // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN20 %s @@ -46,7 +46,77 @@ struct LargeStructTwoMember { struct LargeStructOneMember g_s; #endif -// +Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + Mat4X4 out; + return out; +} + +kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { + out[0] = foo(in[1]); +} + +Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + Mat64X64 out; + return out; +} + +kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { + out[0] = foo_large(in[1]); +} + +void FuncOneMember(struct StructOneMember u) { + u.x = (int2)(0, 0); +} + +void FuncOneLargeMember(struct LargeStructOneMember u) { + u.x[0] = (int2)(0, 0); +} + +#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) + +void test_indirect_arg_globl(void) { + FuncOneLargeMember(g_s); +} +#endif + +kernel void test_indirect_arg_local(void) { + local struct LargeStructOneMember l_s; + FuncOneLargeMember(l_s); +} + +void test_indirect_arg_private(void) { + struct LargeStructOneMember p_s; + FuncOneLargeMember(p_s); +} + +kernel void KernelOneMember(struct StructOneMember u) { + FuncOneMember(u); +} + +kernel void KernelOneMemberSpir(global struct StructOneMember* u) { + FuncOneMember(*u); +} + +kernel void KernelLargeOneMember(struct LargeStructOneMember u) { + FuncOneLargeMember(u); +} + +void FuncTwoMember(struct StructTwoMember u) { + u.y = (int2)(0, 0); +} + +void FuncLargeTwoMember(struct LargeStructTwoMember u) { + u.y[0] = (int2)(0, 0); +} + +kernel void KernelTwoMember(struct StructTwoMember u) { + FuncTwoMember(u); +} + +kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { + FuncLargeTwoMember(u); +} + // X86-LABEL: define 
void @foo( // X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { // X86-NEXT: [[ENTRY:.*:]] @@ -54,64 +124,25 @@ struct LargeStructOneMember g_s; // X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 -// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 -// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 -// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] -// -// AMDGCN20-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN20-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// AMDGCN20-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0 -// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4 -// AMDGCN20-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] -// -// SPIR-LABEL: define dso_local spir_func void @foo( -// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN30-GVAR-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 -// AMDGCN30-GVAR-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] -// -// AMDGCN30-LABEL: define dso_local %struct.Mat4X4 @foo( -// AMDGCN30-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 -// AMDGCN30-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) 
[[RETVAL]], align 4 -// AMDGCN30-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] -// -Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { - Mat4X4 out; - return out; -} - // // X86-LABEL: define spir_kernel void @ker( // X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // X86-NEXT: [[ENTRY:.*:]] // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_ker( +// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 // X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 // X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 @@ -121,118 +152,10 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 // X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 // X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) -// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]] +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] // X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) 
[[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr -// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void 
@ker( -// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 -// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 -// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0 -// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 -// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) -// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]] -// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN30-GVAR-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// 
AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) -// AMDGCN30-GVAR-NEXT: ret void -// -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker( -// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) -// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN30-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] -// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 -// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 -// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) -// AMDGCN30-NEXT: ret void -// -kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { - out[0] = foo(in[1]); -} - // // X86-LABEL: define void @foo_large( // X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { @@ -241,48 +164,22 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { // X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 // X86-NEXT: ret void // -// AMDGCN-LABEL: define dso_local void @foo_large( -// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN-NEXT: ret void // -// AMDGCN20-LABEL: define dso_local void @foo_large( -// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 
[[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN20-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN20-NEXT: ret void +// X86-LABEL: define spir_kernel void @ker_large( +// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]] +// X86-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_func void @foo_large( -// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @foo_large( -// AMDGCN30-GVAR-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN30-GVAR-NEXT: ret void -// -// AMDGCN30-LABEL: define dso_local void @foo_large( -// AMDGCN30-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) -// AMDGCN30-NEXT: ret void -// -Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { - Mat64X64 out; - return out; -} - // -// X86-LABEL: define spir_kernel void @ker_large( -// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// X86-LABEL: define void @__clang_ocl_kern_imp_ker_large( +// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual 
[[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // X86-NEXT: [[ENTRY:.*:]] // X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 @@ -299,11 +196,243 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { // X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) // X86-NEXT: ret void // +// +// X86-LABEL: define void @FuncOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8 +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @FuncOneLargeMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @test_indirect_arg_local( +// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_test_indirect_arg_local( +// X86-SAME: ) #[[ATTR0]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @test_indirect_arg_private( +// X86-SAME: ) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 
[[P_S]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @KernelOneMemberSpir( +// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4 +// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @KernelLargeOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 
[[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @FuncTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[Y]], align 8 +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @FuncLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define spir_kernel void @KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 
[[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] +// X86-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: 
store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]]
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @foo_large(
+// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN-NEXT: ret void
+//
+//
 // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large(
 // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
 // AMDGCN-NEXT: [[ENTRY:.*:]]
 // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
 // AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
 // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
@@ -313,71 +442,1150 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) {
 // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
 // AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
 // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
-// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
 // AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
 // AMDGCN-NEXT: ret void
 //
-// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large(
-// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
-// AMDGCN20-NEXT: [[ENTRY:.*:]]
-// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
-// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
-// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
-// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
-// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
-// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
-// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
-// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
-// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
-// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
-// AMDGCN20-NEXT: ret void
 //
-// SPIR-LABEL: define dso_local spir_kernel void @ker_large(
-// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] {
-// SPIR-NEXT: [[ENTRY:.*:]]
-// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
-// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
-// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
-// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
-// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
-// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
-// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
-// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0
-// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
-// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
-// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
-// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
-// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
-// SPIR-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local(
+// AMDGCN-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// AMDGCN-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local %struct.Mat4X4 @foo(
+// AMDGCN20-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN20-NEXT: [[IN1:%.*]] = addrspacecast ptr addrspace(5) [[IN]] to ptr
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr [[IN1]], i32 0, i32 0
+// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr [[COERCE_DIVE]], align 4
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4
+// AMDGCN20-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]]
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
+// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_ker(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
+// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4
+// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]]
+// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0
+// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4
+// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @foo_large(
+// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
+// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5)
+// AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr
+// AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false)
+// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
+// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false)
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @FuncOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
+// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i64 0, i64 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local(
+// AMDGCN20-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
+// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr
+// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
+// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr [[Y]], align 8
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr
+// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8
+// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i64 0, i64 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8
+// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
+// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[TMP2]], align 8
+// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8
+// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8
+// AMDGCN20-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5)
+// AMDGCN20-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// AMDGCN20-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr
+// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 480, i1 false)
+// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// AMDGCN20-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @foo(
+// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @ker(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3:[0-9]+]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_ker(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false)
+// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4:[0-9]+]]
+// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false)
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @foo_large(
+// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @ker_large(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_ker_large(
+// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0
+// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4
+// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false)
+// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]]
+// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false)
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local(
+// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_test_indirect_arg_local(
+// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private(
+// SPIR-SAME: ) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir(
+// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelOneMemberSpir(
+// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelLargeOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[Y]], align 8
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+//
+// SPIR-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_KernelLargeTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]]
+// SPIR-NEXT: ret void
+//
+//
+// AMDGCN30-GVAR-LABEL: define dso_local %struct.Mat4X4 @foo(
+// AMDGCN30-GVAR-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4
+// AMDGCN30-GVAR-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]]
+//
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_ker(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space
[[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN30-GVAR-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @foo_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: call void 
@__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneLargeMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: 
[[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl( +// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( +// AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN30-GVAR-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private( +// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca 
[[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// 
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, 
align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load <2 x 
i32>, ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-GVAR-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-GVAR-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-GVAR-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-GVAR-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN30-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN30-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN30-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual 
[[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN30-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] +// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @foo_large( +// AMDGCN30-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define 
dso_local amdgpu_kernel void @ker_large( -// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] // AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: 
[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) // AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 @@ -387,77 +1595,10 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { // AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 // AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 // AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) // AMDGCN30-NEXT: ret void // -kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { - out[0] = foo_large(in[1]); -} - -// -// X86-LABEL: define void @FuncOneMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 -// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) -// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8 -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local void @FuncOneMember( -// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @FuncOneMember( -// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// 
AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_func void @FuncOneMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneMember( -// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @FuncOneMember( // AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { @@ -472,266 +1613,44 @@ kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { // AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 // AMDGCN30-NEXT: ret void // -void FuncOneMember(struct StructOneMember u) { - u.x = (int2)(0, 0); -} - -// -// X86-LABEL: define void @FuncOneLargeMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 -// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) -// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// 
X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0 -// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember( -// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember( -// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i64 0, i64 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 -// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8 -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneLargeMember( -// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 
8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @FuncOneLargeMember( // AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) -// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 -// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN30-NEXT: ret void -// -void FuncOneLargeMember(struct LargeStructOneMember u) { - u.x[0] = (int2)(0, 0); -} - -#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) -// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl( -// AMDGCN20-SAME: ) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl( -// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void -// -void test_indirect_arg_globl(void) { - FuncOneLargeMember(g_s); -} -#endif - -// -// X86-LABEL: define spir_kernel void @test_indirect_arg_local( -// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual 
[[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( -// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( -// AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local( -// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// SPIR-NEXT: ret void +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 +// 
AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( -// AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN30-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] // AMDGCN30-NEXT: ret void // -kernel void test_indirect_arg_local(void) { - local struct LargeStructOneMember l_s; - FuncOneLargeMember(l_s); -} - -// -// X86-LABEL: define void @test_indirect_arg_private( -// X86-SAME: ) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private( -// AMDGCN-SAME: ) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private( -// AMDGCN20-SAME: ) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private( -// SPIR-SAME: ) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[P_S:%.*]] = alloca 
[[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] -// SPIR-NEXT: ret void +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN30-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private( -// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @test_indirect_arg_private( // AMDGCN30-SAME: ) #[[ATTR0]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] // AMDGCN30-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // -void test_indirect_arg_private(void) { - struct LargeStructOneMember p_s; - FuncOneLargeMember(p_s); -} - -// -// X86-LABEL: define spir_kernel void @KernelOneMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( -// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] -// AMDGCN-NEXT: 
ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( -// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 -// AMDGCN20-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 -// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( -// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { @@ -741,69 +1660,21 @@ void test_indirect_arg_private(void) { // AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 // AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 // AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 -// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void 
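// A note on what these checks are asserting (sketch, not part of the test):
// a struct small enough for registers (StructOneMember, one int2) is coerced
// to a plain <2 x i32> value, while a large struct (LargeStructOneMember,
// 100 x int2, 800 bytes) is passed indirectly through a private-memory
// (addrspace 5) byref pointer, with a temporary materialized by alloca plus
// llvm.memcpy. A conceptual C view, with hypothetical *_lowered names:
//
//   typedef int int2 __attribute__((ext_vector_type(2)));
//   typedef struct { int2 x; } SmallS;        /* 8 bytes                  */
//   typedef struct { int2 x[100]; } LargeS;   /* 800 bytes                */
//
//   void small_lowered(int2 u_coerce);        /* value travels in VGPRs   */
//   void large_lowered(const LargeS *u);      /* address passed byref     */
//
//   void call_sites(SmallS s, LargeS l) {
//     small_lowered(s.x);        /* the single member is the whole struct */
//     LargeS tmp = l;            /* copy into a private temporary ...     */
//     large_lowered(&tmp);       /* ... and pass its address              */
//   }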
// -kernel void KernelOneMember(struct StructOneMember u) { - FuncOneMember(u); -} - -// -// X86-LABEL: define spir_kernel void @KernelOneMemberSpir( -// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4 -// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4 -// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 -// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false) -// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( -// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( -// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN20-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr -// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 -// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir( -// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4 -// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8 -// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 
4 -// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4 -// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false) -// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// SPIR-NEXT: ret void +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( -// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { @@ -811,144 +1682,41 @@ kernel void KernelOneMember(struct StructOneMember u) { // AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 // AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 -// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // -kernel void KernelOneMemberSpir(global struct 
StructOneMember* u) { - FuncOneMember(*u); -} - -// -// X86-LABEL: define spir_kernel void @KernelLargeOneMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 800, i1 false) -// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] 
!kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void // -// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( -// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN30-NEXT: [[ENTRY:.*:]] -// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// AMDGCN30-NEXT: ret void -// -kernel void KernelLargeOneMember(struct LargeStructOneMember u) { - FuncOneLargeMember(u); -} - -// -// X86-LABEL: define void @FuncTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) -// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[Y]], align 8 -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local void @FuncTwoMember( -// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: 
store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 -// AMDGCN-NEXT: ret void +// AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8 +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 +// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void // -// AMDGCN20-LABEL: define dso_local void @FuncTwoMember( -// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8 -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr [[Y]], align 8 -// AMDGCN20-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_func void @FuncTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[Y]], align 8 -// SPIR-NEXT: ret void +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw 
[[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncTwoMember( -// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @FuncTwoMember( // AMDGCN30-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { @@ -965,75 +1733,6 @@ kernel void KernelLargeOneMember(struct LargeStructOneMember u) { // AMDGCN30-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 // AMDGCN30-NEXT: ret void // -void FuncTwoMember(struct StructTwoMember u) { - u.y = (int2)(0, 0); -} - -// -// X86-LABEL: define void @FuncLargeTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 -// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) -// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 
-// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 -// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( -// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember( -// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN20-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr -// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCOMPOUNDLITERAL]] to ptr -// AMDGCN20-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL_ASCAST]], align 8 -// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i64 0, i64 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 -// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 -// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 -// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 -// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8 -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local void @FuncLargeTwoMember( -// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { -// 
AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) -// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 -// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local void @FuncLargeTwoMember( // AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { @@ -1048,74 +1747,6 @@ void FuncTwoMember(struct StructTwoMember u) { // AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 // AMDGCN30-NEXT: ret void // -void FuncLargeTwoMember(struct LargeStructTwoMember u) { - u.y[0] = (int2)(0, 0); -} - -// -// X86-LABEL: define spir_kernel void @KernelTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN20-SAME: 
[[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 -// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void -// -// SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// SPIR-NEXT: ret void -// -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-GVAR-NEXT: call void 
@FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { @@ -1131,67 +1762,25 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 // AMDGCN30-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 -// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // -kernel void KernelTwoMember(struct StructTwoMember u) { - FuncTwoMember(u); -} - -// -// X86-LABEL: define spir_kernel void @KernelLargeTwoMember( -// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// X86-NEXT: [[ENTRY:.*:]] -// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]] -// X86-NEXT: ret void -// -// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// AMDGCN-NEXT: [[ENTRY:.*:]] -// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// AMDGCN-NEXT: ret void -// -// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// AMDGCN20-NEXT: [[ENTRY:.*:]] -// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN20-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr -// AMDGCN20-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 -// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 -// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 480, i1 false) -// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] -// AMDGCN20-NEXT: ret void // -// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember( -// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] { -// SPIR-NEXT: [[ENTRY:.*:]] -// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// SPIR-NEXT: ret void +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN30-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void // -// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( -// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { -// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] -// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 -// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 -// AMDGCN30-GVAR-NEXT: 
store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 -// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 -// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 -// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] -// AMDGCN30-GVAR-NEXT: ret void // // AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { @@ -1203,12 +1792,18 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 // AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN30-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 -// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] +// AMDGCN30-NEXT: ret void +// +// +// AMDGCN30-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR4]] // AMDGCN30-NEXT: ret void // -kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { - FuncLargeTwoMember(u); -} //. // X86: [[META4]] = !{i32 1, i32 1} // X86: [[META5]] = !{!"none", !"none"} diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl index 2f8ba99a3e416..6dc488c40da7f 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 --include-generated-funcs // RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefix=AMDGCN %s typedef int int2 __attribute__((ext_vector_type(2))); @@ -42,6 +42,78 @@ struct LargeStructOneMember g_s; #endif +Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + Mat4X4 out; + return out; +} + +// Expect two mem copies: one for the argument "in", and one for +// the return value. 
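+// On targets that pass and return these aggregates indirectly, the two
+// copies come from materializing a temporary for the by-value argument and
+// from writing the sret temporary back to its destination. A rough plain-C
+// sketch of out[0] = foo_large(in[1]) (hypothetical *_lowered names,
+// simplified element types; sizes match the 4096/16384-byte memcpys in the
+// checks below):
+//
+//   typedef struct { int v[32 * 32]; } Mat32X32_;
+//   typedef struct { int v[64 * 64]; } Mat64X64_;
+//
+//   /* callee gets the result slot (sret) and the argument by reference */
+//   void foo_large_lowered(Mat64X64_ *sret, const Mat32X32_ *arg);
+//
+//   void ker_large_lowered(const Mat32X32_ *in, Mat64X64_ *out) {
+//     Mat32X32_ byval_temp;                                 /* private temp */
+//     __builtin_memcpy(&byval_temp, &in[1], sizeof byval_temp); /* copy 1   */
+//     Mat64X64_ sret_temp;
+//     foo_large_lowered(&sret_temp, &byval_temp);
+//     __builtin_memcpy(&out[0], &sret_temp, sizeof sret_temp);  /* copy 2   */
+//   }
+//
+// For the smaller Mat3X3 case on amdgcn the argument is instead coerced to
+// [9 x i32] in registers, so only the return-value copy remains (see the
+// @ker checks below).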
+ +kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { + out[0] = foo(in[1]); +} + +Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + Mat64X64 out; + return out; +} + +kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { + out[0] = foo_large(in[1]); +} + +void FuncOneMember(struct StructOneMember u) { + u.x = (int2)(0, 0); +} + +void FuncOneLargeMember(struct LargeStructOneMember u) { + u.x[0] = (int2)(0, 0); +} + +#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) +void test_indirect_arg_globl(void) { + FuncOneLargeMember(g_s); +} +#endif + +kernel void test_indirect_arg_local(void) { + local struct LargeStructOneMember l_s; + FuncOneLargeMember(l_s); +} + +void test_indirect_arg_private(void) { + struct LargeStructOneMember p_s; + FuncOneLargeMember(p_s); +} + +kernel void KernelOneMember(struct StructOneMember u) { + FuncOneMember(u); +} + +kernel void KernelOneMemberSpir(global struct StructOneMember* u) { + FuncOneMember(*u); +} + +kernel void KernelLargeOneMember(struct LargeStructOneMember u) { + FuncOneLargeMember(u); +} + +void FuncTwoMember(struct StructTwoMember u) { + u.y = (int2)(0, 0); +} + +void FuncLargeTwoMember(struct LargeStructTwoMember u) { + u.y[0] = (int2)(0, 0); +} + +kernel void KernelTwoMember(struct StructTwoMember u) { + FuncTwoMember(u); +} + +kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { + FuncLargeTwoMember(u); +} // AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( // AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -54,19 +126,27 @@ struct LargeStructOneMember g_s; // AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr [[RETVAL_ASCAST]], align 4 // AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] // -Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { - Mat4X4 out; - return out; -} - -// Expect two mem copies: one for the argument "in", and one for -// the return value. 
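The regenerated checks that follow also cover the compiler-generated kernel
stubs: each OpenCL kernel entry point is now a thin wrapper that reloads its
arguments and forwards them to a __clang_ocl_kern_imp_* function holding the
original body. In source terms the split is roughly as below (a sketch only;
the real wrappers are emitted by clang, not written by hand):

  /* generated implementation function: carries the original kernel body */
  void __clang_ocl_kern_imp_ker(global Mat3X3 *in, global Mat4X4 *out) {
    out[0] = foo(in[1]);
  }

  /* kernel entry point: a stub that just forwards the arguments */
  kernel void ker(global Mat3X3 *in, global Mat4X4 *out) {
    __clang_ocl_kern_imp_ker(in, out);
  }

This matches the @ker / @__clang_ocl_kern_imp_ker pair asserted below, and
explains why the RUN lines were regenerated with --include-generated-funcs:
without it, update_cc_test_checks.py would not emit checks for functions
that have no counterpart in the test source.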
- +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4:[0-9]+]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr // AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr @@ -78,17 +158,14 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { // AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 -// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] +// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR4]] // AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 // AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 // AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) // AMDGCN-NEXT: ret void // -kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { - out[0] = foo(in[1]); -} - +// // AMDGCN-LABEL: define dso_local void @foo_large( // AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef 
byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -97,16 +174,27 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { // AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) // AMDGCN-NEXT: ret void // -Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { - Mat64X64 out; - return out; -} - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker_large( // AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr +// AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ker_large(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_ker_large( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr @@ -118,14 +206,11 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) -// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) // AMDGCN-NEXT: ret void // -kernel void ker_large(global 
Mat32X32 *in, global Mat64X64 *out) { - out[0] = foo_large(in[1]); -} - +// // AMDGCN-LABEL: define dso_local void @FuncOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -141,10 +226,7 @@ kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { // AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 // AMDGCN-NEXT: ret void // -void FuncOneMember(struct StructOneMember u) { - u.x = (int2)(0, 0); -} - +// // AMDGCN-LABEL: define dso_local void @FuncOneLargeMember( // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -160,37 +242,32 @@ void FuncOneMember(struct StructOneMember u) { // AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 // AMDGCN-NEXT: ret void // -void FuncOneLargeMember(struct LargeStructOneMember u) { - u.x[0] = (int2)(0, 0); -} - -#if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables)) +// // AMDGCN-LABEL: define dso_local void @test_indirect_arg_globl( // AMDGCN-SAME: ) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -void test_indirect_arg_globl(void) { - FuncOneLargeMember(g_s); -} -#endif - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local( // AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { // AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_test_indirect_arg_local() #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_test_indirect_arg_local( +// AMDGCN-SAME: ) #[[ATTR2]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5) // AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void test_indirect_arg_local(void) { - local struct LargeStructOneMember l_s; - FuncOneLargeMember(l_s); -} - +// // AMDGCN-LABEL: define dso_local void @test_indirect_arg_private( // AMDGCN-SAME: ) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -198,14 +275,10 @@ kernel void test_indirect_arg_local(void) { // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 
8, addrspace(5) // AMDGCN-NEXT: [[P_S_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_S]] to ptr // AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[P_S_ASCAST]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -void test_indirect_arg_private(void) { - struct LargeStructOneMember p_s; - FuncOneLargeMember(p_s); -} - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -215,13 +288,23 @@ void test_indirect_arg_private(void) { // AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelOneMember(struct StructOneMember u) { - FuncOneMember(u); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE_DIVE2]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir( // AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -229,32 +312,48 @@ kernel void KernelOneMember(struct StructOneMember u) { // AMDGCN-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr // AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMemberSpir(ptr addrspace(1) noundef align 8 [[TMP0]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void 
@__clang_ocl_kern_imp_KernelOneMemberSpir( +// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META14]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[U_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[U_ADDR]] to ptr +// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR_ASCAST]], align 8 // AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8 -// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelOneMemberSpir(global struct StructOneMember* u) { - FuncOneMember(*u); -} - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 // AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr [[TMP0]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 800, i1 false) -// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelLargeOneMember(struct LargeStructOneMember u) { - FuncOneLargeMember(u); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr 
addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define dso_local void @FuncTwoMember( // AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -272,10 +371,7 @@ kernel void KernelLargeOneMember(struct LargeStructOneMember u) { // AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr [[Y]], align 8 // AMDGCN-NEXT: ret void // -void FuncTwoMember(struct StructTwoMember u) { - u.y = (int2)(0, 0); -} - +// // AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( // AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -291,10 +387,7 @@ void FuncTwoMember(struct StructTwoMember u) { // AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 // AMDGCN-NEXT: ret void // -void FuncLargeTwoMember(struct LargeStructTwoMember u) { - u.y[0] = (int2)(0, 0); -} - +// // AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( // AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] @@ -310,18 +403,31 @@ void FuncLargeTwoMember(struct LargeStructTwoMember u) { // AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 // AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelTwoMember(struct StructTwoMember u) { - FuncTwoMember(u); -} - +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[TMP4]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// // AMDGCN-LABEL: define 
dso_local amdgpu_kernel void @KernelLargeTwoMember( // AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { // AMDGCN-NEXT: [[ENTRY:.*:]] // AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) -// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) // AMDGCN-NEXT: [[U1:%.*]] = addrspacecast ptr addrspace(5) [[U]] to ptr // AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 0 // AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 @@ -329,13 +435,22 @@ kernel void KernelTwoMember(struct StructTwoMember u) { // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U1]], i32 0, i32 1 // AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 // AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr [[TMP2]], align 8 -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U1]], i64 480, i1 false) -// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: [[U1_ASCAST:%.*]] = addrspacecast ptr [[U1]] to ptr addrspace(5) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U1_ASCAST]]) #[[ATTR4]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[U:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +// AMDGCN-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr align 8 [[U]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // AMDGCN-NEXT: ret void // -kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { - FuncLargeTwoMember(u); -} //. 
// AMDGCN: [[META4]] = !{i32 1, i32 1} // AMDGCN: [[META5]] = !{!"none", !"none"} diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl index 6776e2227847e..4d09fc3ffb70b 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -423,7 +423,7 @@ struct_char_arr32 func_ret_struct_char_arr32() return s; } -// CHECK: define{{.*}} i32 @func_transparent_union_ret() local_unnamed_addr #1 { +// CHECK: define{{.*}} i32 @func_transparent_union_ret() local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK: ret i32 0 transparent_u func_transparent_union_ret() { diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index caae5666de29e..773daf53b2746 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs --prefix-filecheck-ir-name VAR +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --prefix-filecheck-ir-name VAR --version 5 // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -disable-llvm-passes -fno-ident -emit-llvm -o - -triple amdgcn-amd-amdhsa %s -fdenormal-fp-math-f32=preserve-sign | FileCheck %s --check-prefixes=CHECK,NOCPU // // Check no-optnone and target-cpu behavior @@ -70,9 +70,9 @@ kernel void test_target_features_kernel(global int *i) { // CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata" //. 
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@callee -// NOCPU-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define dso_local void @callee( +// NOCPU-SAME: i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // NOCPU-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr @@ -88,9 +88,33 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@test -// NOCPU-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define dso_local amdgpu_kernel void @test( +// NOCPU-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] +// NOCPU-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) +// NOCPU-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// NOCPU-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// NOCPU-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// NOCPU-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// NOCPU-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// NOCPU-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1 +// NOCPU-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1 +// NOCPU-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR10:[0-9]+]] +// NOCPU-NEXT: ret void +// +// +// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone +// NOCPU-LABEL: define dso_local void @__clang_ocl_kern_imp_test( +// NOCPU-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] 
!kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // NOCPU-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -213,10 +237,46 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: ret void // // +// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone +// NOCPU-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel( +// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] +// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR10]] +// NOCPU-NEXT: ret void +// +// +// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone +// NOCPU-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel( +// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] +// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) +// NOCPU-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) +// NOCPU-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) +// NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// NOCPU-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr +// NOCPU-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr +// NOCPU-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr +// NOCPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr +// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// NOCPU-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4 +// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() +// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 +// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 +// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) +// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// NOCPU-NEXT: ret void +// +// // NOCPU: Function 
Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4:[0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7:[0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -233,9 +293,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !associated [[META7:![0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR8:[0-9]+]] !associated [[META11:![0-9]+]] !kernel_arg_addr_space [[META12:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -244,9 +304,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke_2( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -269,9 +329,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META12:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META14:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: 
[[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -280,9 +340,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR4]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke_3( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) @@ -311,9 +371,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META13:![0-9]+]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META15:![0-9]+]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META17:![0-9]+]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_3_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR8]] !associated [[META15:![0-9]+]] !kernel_arg_addr_space [[META16:![0-9]+]] !kernel_arg_access_qual [[META17:![0-9]+]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META19:![0-9]+]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 // NOCPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr @@ -322,9 +382,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4 -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_block_invoke_4( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -335,14 +395,14 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8 // NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // NOCPU-NEXT: [[TMP1:%.*]] = 
load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8 -// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]] +// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR10]] // NOCPU-NEXT: ret void // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META18:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_block_invoke_4_kernel( +// NOCPU-SAME: <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR8]] !associated [[META20:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -350,34 +410,10 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: ret void // // -// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel -// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space [[META19:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META11]] { -// NOCPU-NEXT: entry: -// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) -// NOCPU-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) -// NOCPU-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) -// NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr -// NOCPU-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr -// NOCPU-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr -// NOCPU-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr -// NOCPU-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr -// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8 -// NOCPU-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4 -// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() -// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8 -// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4 -// NOCPU-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false) -// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) 
@__block_literal_global to ptr)) -// NOCPU-NEXT: ret void -// -// // NOCPU: Function Attrs: convergent noinline nounwind optnone -// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke -// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal void @__test_target_features_kernel_block_invoke( +// NOCPU-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR7]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr @@ -389,9 +425,9 @@ kernel void test_target_features_kernel(global int *i) { // // // NOCPU: Function Attrs: convergent nounwind -// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { -// NOCPU-NEXT: entry: +// NOCPU-LABEL: define internal amdgpu_kernel void @__test_target_features_kernel_block_invoke_kernel( +// NOCPU-SAME: { i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR8]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space [[META12]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META13]] !kernel_arg_base_type [[META13]] !kernel_arg_type_qual [[META10]] { +// NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -412,10 +448,12 @@ kernel void test_target_features_kernel(global int *i) { // // // +// +// // GFX900: Function Attrs: convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@callee -// GFX900-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define dso_local void @callee( +// GFX900-SAME: i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5) // GFX900-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[ID_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ID_ADDR]] to ptr @@ -431,9 +469,33 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@test -// GFX900-SAME: (ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define dso_local amdgpu_kernel void @test( +// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] 
!kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] +// GFX900-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) +// GFX900-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[D_ADDR:%.*]] = alloca i64, align 8, addrspace(5) +// GFX900-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// GFX900-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// GFX900-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// GFX900-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14:![0-9]+]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: [[TMP1:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] +// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] +// GFX900-NEXT: [[TMP3:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] +// GFX900-NEXT: call void @__clang_ocl_kern_imp_test(ptr addrspace(1) noundef align 1 [[TMP0]], i8 noundef signext [[TMP1]], ptr addrspace(1) noundef align 8 [[TMP2]], i64 noundef [[TMP3]]) #[[ATTR8:[0-9]+]] +// GFX900-NEXT: ret void +// +// +// GFX900: Function Attrs: convergent norecurse nounwind +// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test( +// GFX900-SAME: ptr addrspace(1) noundef align 1 [[A:%.*]], i8 noundef signext [[B:%.*]], ptr addrspace(1) noundef align 8 [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[B_ADDR:%.*]] = alloca i8, align 1, addrspace(5) // GFX900-NEXT: [[C_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -468,14 +530,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK20_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK20]] to ptr // GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr // GFX900-NEXT: [[TMP27_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[VARTMP27]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14:![0-9]+]] -// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR_ASCAST]], align 8, !tbaa [[TBAA14]] +// GFX900-NEXT: store i8 [[B]], ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: store ptr addrspace(1) [[C]], ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7:[0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 
4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] // GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17:![0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19:![0-9]+]] // GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]] @@ -535,12 +597,12 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 5 // GFX900-NEXT: [[TMP17:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP17]], ptr [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] // GFX900-NEXT: [[TMP18:%.*]] = getelementptr [1 x i64], ptr [[BLOCK_SIZES_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i64 100, ptr [[TMP18]], align 8 // GFX900-NEXT: [[TMP19:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP12]], i32 [[TMP13]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[BLOCK12_ASCAST]], i32 1, ptr [[TMP18]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] // GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 32, ptr [[BLOCK_SIZE22]], align 8 // GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[BLOCK21_ASCAST]], i32 0, i32 1 @@ -559,17 +621,59 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP27_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] // GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr [[BLOCK20_ASCAST]], align 8, !tbaa [[TBAA16]] // GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] -// GFX900-NEXT: call void 
@llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] +// GFX900-NEXT: ret void +// +// +// GFX900: Function Attrs: convergent norecurse nounwind +// GFX900-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel( +// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] +// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26:![0-9]+]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] +// GFX900-NEXT: call void @__clang_ocl_kern_imp_test_target_features_kernel(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR8]] +// GFX900-NEXT: ret void +// +// +// GFX900: Function Attrs: convergent norecurse nounwind +// GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel( +// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] +// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) +// GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) +// GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) +// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// GFX900-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr +// GFX900-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr +// GFX900-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr +// GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr +// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] +// GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] +// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] +// GFX900-NEXT: [[TMP2:%.*]] = load 
i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] +// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] +// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5:[0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6:[0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -583,9 +687,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META22:![0-9]+]] !kernel_arg_addr_space [[META23:![0-9]+]] !kernel_arg_access_qual [[META24:![0-9]+]] !kernel_arg_type [[META25:![0-9]+]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26:![0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel( +// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META28:![0-9]+]] !kernel_arg_addr_space [[META29:![0-9]+]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30:![0-9]+]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -594,9 +698,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke_2( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -616,9 +720,9 @@ kernel void 
test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META27:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel( +// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META31:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -627,15 +731,15 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR5]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke_3( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]], ptr addrspace(3) noundef [[LP:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[LP_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: [[LP_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LP_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 -// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA28:![0-9]+]] +// GFX900-NEXT: store ptr addrspace(3) [[LP]], ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32:![0-9]+]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 // GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA16]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 @@ -648,16 +752,16 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0 // GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]] -// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA28]] +// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32]] // GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr 
inbounds i32, ptr addrspace(3) [[TMP4]], i64 0 // GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel -// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META29:![0-9]+]] !kernel_arg_addr_space [[META30:![0-9]+]] !kernel_arg_access_qual [[META31:![0-9]+]] !kernel_arg_type [[META32:![0-9]+]] !kernel_arg_base_type [[META32]] !kernel_arg_type_qual [[META33:![0-9]+]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_3_kernel( +// GFX900-SAME: <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !associated [[META33:![0-9]+]] !kernel_arg_addr_space [[META34:![0-9]+]] !kernel_arg_access_qual [[META35:![0-9]+]] !kernel_arg_type [[META36:![0-9]+]] !kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META37:![0-9]+]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8 // GFX900-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr @@ -666,9 +770,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4 -// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_block_invoke_4( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -676,14 +780,14 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 // GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]] -// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]] +// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8]] // GFX900-NEXT: ret void // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel -// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META34:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_block_invoke_4_kernel( +// GFX900-SAME: <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !associated [[META38:![0-9]+]] 
!kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -691,40 +795,10 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: ret void // // -// GFX900: Function Attrs: convergent norecurse nounwind -// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel -// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META35:![0-9]+]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META36:![0-9]+]] !kernel_arg_base_type [[META36]] !kernel_arg_type_qual [[META26]] { -// GFX900-NEXT: entry: -// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) -// GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5) -// GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5) -// GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) -// GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr -// GFX900-NEXT: [[DEFAULT_QUEUE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DEFAULT_QUEUE]] to ptr -// GFX900-NEXT: [[FLAGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[FLAGS]] to ptr -// GFX900-NEXT: [[NDRANGE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NDRANGE]] to ptr -// GFX900-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr -// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA37:![0-9]+]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] -// GFX900-NEXT: store i32 0, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] -// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[DEFAULT_QUEUE_ASCAST]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ASCAST]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP_ASCAST]], ptr align 4 [[NDRANGE_ASCAST]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] -// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]] -// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]] -// GFX900-NEXT: ret void -// -// // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke -// GFX900-SAME: 
(ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal void @__test_target_features_kernel_block_invoke( +// GFX900-SAME: ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR6]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) // GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]] to ptr // GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR_ASCAST]], align 8 @@ -733,9 +807,9 @@ kernel void test_target_features_kernel(global int *i) { // // // GFX900: Function Attrs: convergent nounwind -// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel -// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META39:![0-9]+]] !kernel_arg_addr_space [[META23]] !kernel_arg_access_qual [[META24]] !kernel_arg_type [[META25]] !kernel_arg_base_type [[META25]] !kernel_arg_type_qual [[META26]] { -// GFX900-NEXT: entry: +// GFX900-LABEL: define internal amdgpu_kernel void @__test_target_features_kernel_block_invoke_kernel( +// GFX900-SAME: { i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !associated [[META39:![0-9]+]] !kernel_arg_addr_space [[META29]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META30]] !kernel_arg_base_type [[META30]] !kernel_arg_type_qual [[META25]] { +// GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5) // GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8 // GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr @@ -746,22 +820,25 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } // NOCPU: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // NOCPU: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } -// NOCPU: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// NOCPU: attributes #[[ATTR4]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// NOCPU: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// NOCPU: attributes #[[ATTR6]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } -// NOCPU: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// NOCPU: attributes #[[ATTR8]] = { convergent nounwind } +// NOCPU: attributes #[[ATTR3]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOCPU: attributes #[[ATTR4]] = { convergent noinline norecurse nounwind 
optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" } +// NOCPU: attributes #[[ATTR5]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" } +// NOCPU: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// NOCPU: attributes #[[ATTR7]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOCPU: attributes #[[ATTR8]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOCPU: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } +// NOCPU: attributes #[[ATTR10]] = { convergent nounwind } //. // GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" } // GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } // GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" } -// GFX900: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// GFX900: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } -// GFX900: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } -// GFX900: attributes #[[ATTR7]] = { nounwind } +// GFX900: attributes #[[ATTR3]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +// GFX900: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// GFX900: attributes #[[ATTR6]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" 
"target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" } +// GFX900: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn } // GFX900: attributes #[[ATTR8]] = { convergent nounwind } +// GFX900: attributes #[[ATTR9]] = { nounwind } //. // NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} // NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} @@ -770,20 +847,20 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: [[META4]] = !{!"none", !"none", !"none", !"none"} // NOCPU: [[META5]] = !{!"char*", !"char", !"long*", !"long"} // NOCPU: [[META6]] = !{!"", !"", !"", !""} -// NOCPU: [[META7]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} -// NOCPU: [[META8]] = !{i32 0} -// NOCPU: [[META9]] = !{!"none"} -// NOCPU: [[META10]] = !{!"__block_literal"} -// NOCPU: [[META11]] = !{!""} -// NOCPU: [[META12]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} -// NOCPU: [[META13]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} -// NOCPU: [[META14]] = !{i32 0, i32 3} -// NOCPU: [[META15]] = !{!"none", !"none"} -// NOCPU: [[META16]] = !{!"__block_literal", !"void*"} -// NOCPU: [[META17]] = !{!"", !""} -// NOCPU: [[META18]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} -// NOCPU: [[META19]] = !{i32 1} -// NOCPU: [[META20]] = !{!"int*"} +// NOCPU: [[META7]] = !{i32 1} +// NOCPU: [[META8]] = !{!"none"} +// NOCPU: [[META9]] = !{!"int*"} +// NOCPU: [[META10]] = !{!""} +// NOCPU: [[META11]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} +// NOCPU: [[META12]] = !{i32 0} +// NOCPU: [[META13]] = !{!"__block_literal"} +// NOCPU: [[META14]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} +// NOCPU: [[META15]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} +// NOCPU: [[META16]] = !{i32 0, i32 3} +// NOCPU: [[META17]] = !{!"none", !"none"} +// NOCPU: [[META18]] = !{!"__block_literal", !"void*"} +// NOCPU: [[META19]] = !{!"", !""} +// NOCPU: [[META20]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} // NOCPU: [[META21]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} //. 
// GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} @@ -808,23 +885,23 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} // GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0} // GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[TBAA17]]} -// GFX900: [[META22]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} -// GFX900: [[META23]] = !{i32 0} -// GFX900: [[META24]] = !{!"none"} -// GFX900: [[META25]] = !{!"__block_literal"} -// GFX900: [[META26]] = !{!""} -// GFX900: [[META27]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} -// GFX900: [[TBAA28]] = !{[[META9]], [[META9]], i64 0} -// GFX900: [[META29]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} -// GFX900: [[META30]] = !{i32 0, i32 3} -// GFX900: [[META31]] = !{!"none", !"none"} -// GFX900: [[META32]] = !{!"__block_literal", !"void*"} -// GFX900: [[META33]] = !{!"", !""} -// GFX900: [[META34]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} -// GFX900: [[META35]] = !{i32 1} -// GFX900: [[META36]] = !{!"int*"} -// GFX900: [[TBAA37]] = !{[[META38:![0-9]+]], [[META38]], i64 0} -// GFX900: [[META38]] = !{!"p1 int", [[META9]], i64 0} +// GFX900: [[META22]] = !{i32 1} +// GFX900: [[META23]] = !{!"none"} +// GFX900: [[META24]] = !{!"int*"} +// GFX900: [[META25]] = !{!""} +// GFX900: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} +// GFX900: [[META27]] = !{!"p1 int", [[META9]], i64 0} +// GFX900: [[META28]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle} +// GFX900: [[META29]] = !{i32 0} +// GFX900: [[META30]] = !{!"__block_literal"} +// GFX900: [[META31]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle} +// GFX900: [[TBAA32]] = !{[[META9]], [[META9]], i64 0} +// GFX900: [[META33]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle} +// GFX900: [[META34]] = !{i32 0, i32 3} +// GFX900: [[META35]] = !{!"none", !"none"} +// GFX900: [[META36]] = !{!"__block_literal", !"void*"} +// GFX900: [[META37]] = !{!"", !""} +// GFX900: [[META38]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle} // GFX900: [[META39]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle} //. //// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl index edf6dbf8657cb..33fee66d6e9fc 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl @@ -1,43 +1,86 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 4 // RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); -// CHECK-LABEL: @test_printf_noargs( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) 
@printf(ptr addrspace(4) noundef @.str) #[[ATTR4:[0-9]+]] -// CHECK-NEXT: ret void -// __kernel void test_printf_noargs() { printf(""); } -// CHECK-LABEL: @test_printf_int( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) -// CHECK-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR4]] -// CHECK-NEXT: ret void -// __kernel void test_printf_int(int i) { printf("%d", i); } -// CHECK-LABEL: @test_printf_str_int( +__kernel void test_printf_str_int(int i) { + char s[] = "foo"; + printf("%s:%d", s, i); +} +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_noargs( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_noargs() #[[ATTR5:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_noargs( +// CHECK-SAME: ) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META4]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str) #[[ATTR6:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5:![0-9]+]] !kernel_arg_access_qual [[META6:![0-9]+]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8:![0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_int(i32 noundef [[TMP0]]) #[[ATTR5]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) 
@printf(ptr addrspace(4) noundef @.str.1, i32 noundef [[TMP0]]) #[[ATTR6]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local amdgpu_kernel void @test_printf_str_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_test_printf_str_int(i32 noundef [[TMP0]]) #[[ATTR5]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_test_printf_str_int( +// CHECK-SAME: i32 noundef [[I:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META5]] !kernel_arg_access_qual [[META6]] !kernel_arg_type [[META7]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META8]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5) -// CHECK-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: store i32 [[I]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]] -// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA9]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP0]]) #[[ATTR6]] +// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR7]] // CHECK-NEXT: ret void // -__kernel void test_printf_str_int(int i) { - char s[] = "foo"; - printf("%s:%d", s, i); -} +//. +// CHECK: [[META4]] = !{} +// CHECK: [[META5]] = !{i32 0} +// CHECK: [[META6]] = !{!"none"} +// CHECK: [[META7]] = !{!"int"} +// CHECK: [[META8]] = !{!""} +// CHECK: [[TBAA9]] = !{[[META10:![0-9]+]], [[META10]], i64 0} +// CHECK: [[META10]] = !{!"int", [[META11:![0-9]+]], i64 0} +// CHECK: [[META11]] = !{!"omnipotent char", [[META12:![0-9]+]], i64 0} +// CHECK: [[META12]] = !{!"Simple C/C++ TBAA"} +//. 
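The regenerated amdgpu-printf.cl checks above all follow the same two-function shape: a thin amdgpu_kernel entry that spills its arguments, reloads them, and forwards to a __clang_ocl_kern_imp_* implementation function holding the original body. A minimal sketch of that pattern in source terms (the kernel add_one is illustrative, not part of this patch):

// Source:
__kernel void add_one(__global int *p) { *p += 1; }

// Expected IR shape after this change (names abbreviated, spills elided):
//   define dso_local amdgpu_kernel void @add_one(ptr addrspace(1) noundef %p) {
//     call void @__clang_ocl_kern_imp_add_one(ptr addrspace(1) noundef %p)
//     ret void
//   }
//   define dso_local void @__clang_ocl_kern_imp_add_one(ptr addrspace(1) noundef %p) {
//     ; original kernel body
//   }

This is why every test in this series gains a second set of CHECK lines per kernel: one group for the wrapper, one for the implementation function, each with its own attribute set and kernel-argument metadata.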
diff --git a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl index d139621ede4e7..5f32231b18c3d 100644 --- a/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl +++ b/clang/test/CodeGenOpenCL/cl-uniform-wg-size.cl @@ -4,14 +4,20 @@ // RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -foffload-uniform-block -o - %s 2>&1 | FileCheck %s -check-prefixes CHECK,CHECK-UNIFORM kernel void ker() {}; -// CHECK: define{{.*}}@ker() #0 +// CHECK: define{{.*}}@ker() #[[ATTR0:[0-9]+]] +// CHECK: call void @__clang_ocl_kern_imp_ker() #[[ATTR2:[0-9]+]] + +// CHECK: define{{.*}}@__clang_ocl_kern_imp_ker() #[[ATTR1:[0-9]+]] void foo() {}; -// CHECK: define{{.*}}@foo() #1 +// CHECK: define{{.*}}@foo() #[[ATTR1:[0-9]+]] -// CHECK-LABEL: attributes #0 +// CHECK: attributes #[[ATTR0]] // CHECK-UNIFORM: "uniform-work-group-size"="true" // CHECK-NONUNIFORM: "uniform-work-group-size"="false" -// CHECK-LABEL: attributes #1 +// CHECK: attributes #[[ATTR1]] +// CHECK-NOT: uniform-work-group-size + +// CHECK: attributes #[[ATTR2]] // CHECK-NOT: uniform-work-group-size diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl index 451d30b4d86f0..ea1f734391614 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --check-globals all --include-generated-funcs --version 5 // RUN: %clang_cc1 -fno-ident -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" -fdenormal-fp-math-f32=preserve-sign -cl-uniform-work-group-size | FileCheck --check-prefix=SPIR32 %s // RUN: %clang_cc1 -fno-ident -ffp-exception-behavior=strict -fexperimental-strict-floating-point -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck --check-prefix=STRICTFP %s @@ -21,9 +21,26 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { }); } // SPIR32: Function Attrs: convergent noinline norecurse nounwind optnone -// SPIR32-LABEL: define {{[^@]+}}@device_side_enqueue -// SPIR32-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { -// SPIR32-NEXT: entry: +// SPIR32-LABEL: define dso_local spir_kernel void @device_side_enqueue( +// SPIR32-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { +// SPIR32-NEXT: [[ENTRY:.*:]] +// SPIR32-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR32-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 +// SPIR32-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 4 +// SPIR32-NEXT: store 
ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 4 +// SPIR32-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 +// SPIR32-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4 +// SPIR32-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 +// SPIR32-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ADDR]], align 4 +// SPIR32-NEXT: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 4 [[TMP1]], i32 [[TMP2]]) #[[ATTR5:[0-9]+]] +// SPIR32-NEXT: ret void +// +// +// SPIR32: Function Attrs: convergent noinline norecurse nounwind optnone +// SPIR32-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_device_side_enqueue( +// SPIR32-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { +// SPIR32-NEXT: [[ENTRY:.*:]] // SPIR32-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // SPIR32-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -60,9 +77,9 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // // SPIR32: Function Attrs: convergent noinline nounwind optnone -// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke -// SPIR32-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { -// SPIR32-NEXT: entry: +// SPIR32-LABEL: define internal spir_func void @__device_side_enqueue_block_invoke( +// SPIR32-SAME: ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// SPIR32-NEXT: [[ENTRY:.*:]] // SPIR32-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // SPIR32-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // SPIR32-NEXT: store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4 @@ -84,17 +101,34 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // // SPIR32: Function Attrs: convergent nounwind -// SPIR32-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel -// SPIR32-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { -// SPIR32-NEXT: entry: +// SPIR32-LABEL: define spir_kernel void @__device_side_enqueue_block_invoke_kernel( +// SPIR32-SAME: ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { +// SPIR32-NEXT: [[ENTRY:.*:]] // SPIR32-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) // SPIR32-NEXT: ret void // // // STRICTFP: Function Attrs: convergent noinline norecurse nounwind optnone strictfp -// STRICTFP-LABEL: define {{[^@]+}}@device_side_enqueue -// STRICTFP-SAME: (ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { -// STRICTFP-NEXT: entry: +// STRICTFP-LABEL: define dso_local spir_kernel void @device_side_enqueue( +// STRICTFP-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META2:![0-9]+]] !kernel_arg_access_qual [[META3:![0-9]+]] !kernel_arg_type [[META4:![0-9]+]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5:![0-9]+]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] +// 
STRICTFP-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// STRICTFP-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// STRICTFP-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[A]], ptr [[A_ADDR]], align 4 +// STRICTFP-NEXT: store ptr addrspace(1) [[B]], ptr [[B_ADDR]], align 4 +// STRICTFP-NEXT: store i32 [[I]], ptr [[I_ADDR]], align 4 +// STRICTFP-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR]], align 4 +// STRICTFP-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 +// STRICTFP-NEXT: [[TMP2:%.*]] = load i32, ptr [[I_ADDR]], align 4 +// STRICTFP-NEXT: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 [[TMP0]], ptr addrspace(1) align 4 [[TMP1]], i32 [[TMP2]]) #[[ATTR5:[0-9]+]] +// STRICTFP-NEXT: ret void +// +// +// STRICTFP: Function Attrs: convergent noinline norecurse nounwind optnone strictfp +// STRICTFP-LABEL: define dso_local spir_func void @__clang_ocl_kern_imp_device_side_enqueue( +// STRICTFP-SAME: ptr addrspace(1) align 4 [[A:%.*]], ptr addrspace(1) align 4 [[B:%.*]], i32 [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META2]] !kernel_arg_access_qual [[META3]] !kernel_arg_type [[META4]] !kernel_arg_base_type [[META4]] !kernel_arg_type_qual [[META5]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] // STRICTFP-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[B_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // STRICTFP-NEXT: [[I_ADDR:%.*]] = alloca i32, align 4 @@ -109,7 +143,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: store i32 0, ptr [[FLAGS]], align 4 // STRICTFP-NEXT: [[TMP0:%.*]] = load target("spirv.Queue"), ptr [[DEFAULT_QUEUE]], align 4 // STRICTFP-NEXT: [[TMP1:%.*]] = load i32, ptr [[FLAGS]], align 4 -// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) #[[ATTR5:[0-9]+]] +// STRICTFP-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP]], ptr align 4 [[NDRANGE]], i32 4, i1 false) #[[ATTR6:[0-9]+]] // STRICTFP-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 0 // STRICTFP-NEXT: store i32 24, ptr [[BLOCK_SIZE]], align 4 // STRICTFP-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr [[BLOCK]], i32 0, i32 1 @@ -126,14 +160,14 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[B_ADDR]], align 4 // STRICTFP-NEXT: store ptr addrspace(1) [[TMP4]], ptr [[BLOCK_CAPTURED2]], align 4 // STRICTFP-NEXT: [[TMP5:%.*]] = addrspacecast ptr [[BLOCK]] to ptr addrspace(4) -// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]]) #[[ATTR5]] +// STRICTFP-NEXT: [[TMP6:%.*]] = call spir_func i32 @__enqueue_kernel_basic(target("spirv.Queue") [[TMP0]], i32 [[TMP1]], ptr [[TMP]], ptr addrspace(4) addrspacecast (ptr @__device_side_enqueue_block_invoke_kernel to ptr addrspace(4)), ptr addrspace(4) [[TMP5]]) #[[ATTR6]] // STRICTFP-NEXT: ret void // // // STRICTFP: Function Attrs: convergent noinline nounwind optnone strictfp -// STRICTFP-LABEL: define 
{{[^@]+}}@__device_side_enqueue_block_invoke -// STRICTFP-SAME: (ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { -// STRICTFP-NEXT: entry: +// STRICTFP-LABEL: define internal spir_func void @__device_side_enqueue_block_invoke( +// STRICTFP-SAME: ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR2:[0-9]+]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] // STRICTFP-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // STRICTFP-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr addrspace(4), align 4 // STRICTFP-NEXT: store ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], ptr [[DOTBLOCK_DESCRIPTOR_ADDR]], align 4 @@ -144,7 +178,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // STRICTFP-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[BLOCK_CAPTURE_ADDR1]], align 4 // STRICTFP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[TMP0]], i32 [[TMP1]] // STRICTFP-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 -// STRICTFP-NEXT: [[TMP3:%.*]] = call float @llvm.experimental.constrained.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00, metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR5]] +// STRICTFP-NEXT: [[TMP3:%.*]] = call float @llvm.experimental.constrained.fmuladd.f32(float 4.000000e+00, float [[TMP2]], float 1.000000e+00, metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR6]] // STRICTFP-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3 // STRICTFP-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[BLOCK_CAPTURE_ADDR2]], align 4 // STRICTFP-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr addrspace(4), ptr addrspace(1), i32, ptr addrspace(1) }>, ptr addrspace(4) [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4 @@ -155,10 +189,10 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // // // STRICTFP: Function Attrs: convergent nounwind -// STRICTFP-LABEL: define {{[^@]+}}@__device_side_enqueue_block_invoke_kernel -// STRICTFP-SAME: (ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { -// STRICTFP-NEXT: entry: -// STRICTFP-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) #[[ATTR5]] +// STRICTFP-LABEL: define spir_kernel void @__device_side_enqueue_block_invoke_kernel( +// STRICTFP-SAME: ptr addrspace(4) [[TMP0:%.*]]) #[[ATTR4:[0-9]+]] { +// STRICTFP-NEXT: [[ENTRY:.*:]] +// STRICTFP-NEXT: call spir_func void @__device_side_enqueue_block_invoke(ptr addrspace(4) [[TMP0]]) #[[ATTR6]] // STRICTFP-NEXT: ret void // //. @@ -167,13 +201,15 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } // SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" } //. 
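The new #[[ATTR5]] check above matches the call-site attributes on the wrapper's call into __clang_ocl_kern_imp_device_side_enqueue; its uniform-work-group-size value tracks the compile mode ("true" under -cl-uniform-work-group-size in the SPIR32 run, "false" in the STRICTFP run below, which also adds strictfp). An illustrative IR fragment of what these lines match (assumed shape, not copied from the generated output):

//   define dso_local spir_kernel void @device_side_enqueue(...) {
//     call spir_func void @__clang_ocl_kern_imp_device_side_enqueue(...) #5
//     ret void
//   }
//   attributes #5 = { convergent nounwind "uniform-work-group-size"="true" }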
// STRICTFP: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone strictfp "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" } // STRICTFP: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // STRICTFP: attributes #[[ATTR2]] = { convergent noinline nounwind optnone strictfp "stack-protector-buffer-size"="8" } // STRICTFP: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind strictfp willreturn memory(inaccessiblemem: readwrite) } // STRICTFP: attributes #[[ATTR4]] = { convergent nounwind "stack-protector-buffer-size"="8" } -// STRICTFP: attributes #[[ATTR5]] = { strictfp } +// STRICTFP: attributes #[[ATTR5]] = { convergent nounwind strictfp "uniform-work-group-size"="false" } +// STRICTFP: attributes #[[ATTR6]] = { strictfp } //. // SPIR32: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // SPIR32: [[META1:![0-9]+]] = !{i32 2, i32 0} diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl index a1408d38a44c9..3355fe1c25819 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86 -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86 -// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefix=CHECK-LIFETIMES +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s 
--check-prefixes=CHECK-LIFETIMES,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B32,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=COMMON,B64,SPIR,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLESPIR +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=COMMON,B64,X86,TRIPLEX86 +// RUN: %clang_cc1 -no-enable-noundef-analysis %s -cl-std=CL3.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "x86_64-unknown-linux-gnu" | FileCheck %s --check-prefixes=CHECK-LIFETIMES,TRIPLEX86 #pragma OPENCL EXTENSION cl_khr_subgroups : enable @@ -39,7 +39,13 @@ void callee(int id, __global int *out) { out[id] = id; } -// COMMON-LABEL: define{{.*}} spir_kernel void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) +// TRIPLESPIR: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) +// TRIPLESPIR: call spir_func void @__clang_ocl_kern_imp_device_side_enqueue({{.*}}) + +// TRIPLEX86: define{{.*}} void @device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) +// TRIPLEX86: call void @__clang_ocl_kern_imp_device_side_enqueue({{.*}}) + +// COMMON-LABEL: define{{.*}} void @__clang_ocl_kern_imp_device_side_enqueue(ptr addrspace(1) align 4 %{{.*}}, ptr addrspace(1) align 4 %b, i32 %i) kernel void device_side_enqueue(global int *a, global int *b, int i) { // SPIR: %default_queue = alloca target("spirv.Queue") // X86: %default_queue = alloca ptr diff --git a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl index b1e45e6d6e6dc..e741cf63f30b5 100644 --- a/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl +++ b/clang/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -9,8 +9,15 @@ typedef struct {int a;} ndrange_t; kernel void test(int i) { + // AMDGPU-LABEL: define {{.*}} amdgpu_kernel void @test +// AMDGPU-LABEL: call void @__clang_ocl_kern_imp_test(i32 noundef %0) + // SPIR-LABEL: define {{.*}} spir_kernel void @test +// SPIR-LABEL: call spir_func void @__clang_ocl_kern_imp_test(i32 noundef %0) + +// AMDGPU-LABEL: define {{.*}} void @__clang_ocl_kern_imp_test +// SPIR-LABEL: define {{.*}} spir_func void @__clang_ocl_kern_imp_test // COMMON-LABEL: entry: // AMDGPU: %block_sizes = alloca [1 x i64] @@ -36,6 +43,6 @@ kernel void test(int i) { // Check that the temporary is scoped to the `if` // CHECK-DEBUG: ![[TESTFILE:[0-9]+]] = !DIFile(filename: "" -// CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", {{.*}} file: ![[TESTFILE]] -// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 26) -// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 27, scope: 
![[IFSCOPE]]) +// CHECK-DEBUG: ![[TESTSCOPE:[0-9]+]] = distinct !DISubprogram(name: "test", linkageName: "__clang_ocl_kern_imp_test", {{.*}} file: ![[TESTFILE]] +// CHECK-DEBUG: ![[IFSCOPE:[0-9]+]] = distinct !DILexicalBlock(scope: ![[TESTSCOPE]], file: ![[TESTFILE]], line: 33) +// CHECK-DEBUG: ![[TEMPLOCATION]] = !DILocation(line: 34, scope: ![[IFSCOPE]]) diff --git a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl index 4a7bb8227c339..86ca9ae509073 100644 --- a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl +++ b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5 // RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s // Check there's no assertion when passing a pointer to an address space @@ -8,57 +8,66 @@ extern void private_ptr(__private int *); extern void local_ptr(__local int *); extern void generic_ptr(__generic int *); +void use_of_private_var() +{ + int x = 0 ; + private_ptr(&x); + generic_ptr(&x); +} + +void addr_of_arg(int x) +{ + private_ptr(&x); + generic_ptr(&x); +} + +__kernel void use_of_local_var() +{ + __local int x; + local_ptr(&x); + generic_ptr(&x); +} + // CHECK-LABEL: define dso_local void @use_of_private_var( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr -// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR4:[0-9]+]] +// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR5:[0-9]+]] // CHECK-NEXT: store i32 0, ptr [[X_ASCAST]], align 4, !tbaa [[TBAA4:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[X_ASCAST]] to ptr addrspace(5) -// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[TMP0]]) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ASCAST]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR4]] +// CHECK-NEXT: [[X_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[X_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[X_ASCAST_ASCAST]]) #[[ATTR6:[0-9]+]] +// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ASCAST]]) #[[ATTR6]] +// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR5]] // CHECK-NEXT: ret void // -void use_of_private_var() -{ - int x = 0 ; - private_ptr(&x); - generic_ptr(&x); -} - +// // CHECK-LABEL: define dso_local void @addr_of_arg( // CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr // CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4, !tbaa [[TBAA4]] -// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[X_ADDR_ASCAST]] to ptr addrspace(5) -// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[TMP0]]) #[[ATTR5]] -// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ADDR_ASCAST]]) #[[ATTR5]] +// CHECK-NEXT: 
[[X_ADDR_ASCAST_ASCAST:%.*]] = addrspacecast ptr [[X_ADDR_ASCAST]] to ptr addrspace(5) +// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[X_ADDR_ASCAST_ASCAST]]) #[[ATTR6]] +// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ADDR_ASCAST]]) #[[ATTR6]] // CHECK-NEXT: ret void // -void addr_of_arg(int x) -{ - private_ptr(&x); - generic_ptr(&x); -} - +// // CHECK-LABEL: define dso_local amdgpu_kernel void @use_of_local_var( // CHECK-SAME: ) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: call void @local_ptr(ptr addrspace(3) noundef @use_of_local_var.x) #[[ATTR5]] -// CHECK-NEXT: call void @generic_ptr(ptr noundef addrspacecast (ptr addrspace(3) @use_of_local_var.x to ptr)) #[[ATTR5]] +// CHECK-NEXT: call void @__clang_ocl_kern_imp_use_of_local_var() #[[ATTR6]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local void @__clang_ocl_kern_imp_use_of_local_var( +// CHECK-SAME: ) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: call void @local_ptr(ptr addrspace(3) noundef @use_of_local_var.x) #[[ATTR6]] +// CHECK-NEXT: call void @generic_ptr(ptr noundef addrspacecast (ptr addrspace(3) @use_of_local_var.x to ptr)) #[[ATTR6]] // CHECK-NEXT: ret void // -__kernel void use_of_local_var() -{ - __local int x; - local_ptr(&x); - generic_ptr(&x); -} - //. // CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} // CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} diff --git a/clang/test/CodeGenOpenCL/kernel-arg-info.cl b/clang/test/CodeGenOpenCL/kernel-arg-info.cl index dbb59af9470ce..3a2284223a105 100644 --- a/clang/test/CodeGenOpenCL/kernel-arg-info.cl +++ b/clang/test/CodeGenOpenCL/kernel-arg-info.cl @@ -21,6 +21,7 @@ kernel void foo(global int * globalintp, global int * restrict globalintrestrict *globalintrestrictp = constint + volatileint; } // CHECK: define{{.*}} spir_kernel void @foo{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD11:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD12:[0-9]+]] // CHECK: !kernel_arg_type ![[MD13:[0-9]+]] @@ -32,6 +33,7 @@ kernel void foo(global int * globalintp, global int * restrict globalintrestrict kernel void foo2(read_only image1d_t img1, image2d_t img2, write_only image2d_array_t img3, read_write image1d_t img4) { } // CHECK: define{{.*}} spir_kernel void @foo2{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo2{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD21:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD22:[0-9]+]] // CHECK: !kernel_arg_type ![[MD23:[0-9]+]] @@ -43,6 +45,7 @@ kernel void foo2(read_only image1d_t img1, image2d_t img2, write_only image2d_ar kernel void foo3(__global half * X) { } // CHECK: define{{.*}} spir_kernel void @foo3{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo3{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD31:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD32:[0-9]+]] // CHECK: !kernel_arg_type ![[MD33:[0-9]+]] @@ -55,6 +58,7 @@ typedef unsigned int myunsignedint; kernel void foo4(__global unsigned int * X, __global myunsignedint * Y) { } // CHECK: define{{.*}} spir_kernel void @foo4{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo4{{[^!]+}} // CHECK: 
!kernel_arg_addr_space ![[MD41:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD42:[0-9]+]] // CHECK: !kernel_arg_type ![[MD43:[0-9]+]] @@ -67,6 +71,7 @@ typedef image1d_t myImage; kernel void foo5(myImage img1, write_only image1d_t img2) { } // CHECK: define{{.*}} spir_kernel void @foo5{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo5{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD41:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD51:[0-9]+]] // CHECK: !kernel_arg_type ![[MD52:[0-9]+]] @@ -77,6 +82,8 @@ kernel void foo5(myImage img1, write_only image1d_t img2) { typedef char char16 __attribute__((ext_vector_type(16))); __kernel void foo6(__global char16 arg[]) {} +// CHECK: define{{.*}} spir_kernel void @foo6{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo6{{[^!]+}} // CHECK: !kernel_arg_type ![[MD61:[0-9]+]] // ARGINFO: !kernel_arg_name ![[MD62:[0-9]+]] @@ -87,6 +94,7 @@ kernel void foo7(ROImage ro, WOImage wo, RWImage rw) { } // CHECK: define{{.*}} spir_kernel void @foo7{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo7{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[MD71:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD72:[0-9]+]] // CHECK: !kernel_arg_type ![[MD73:[0-9]+]] @@ -99,6 +107,7 @@ typedef unsigned char uchar; typedef uchar uchar2 __attribute__((ext_vector_type(2))); kernel void foo8(pipe int p1, pipe uchar p2, pipe uchar2 p3, const pipe uchar p4, write_only pipe uchar p5) {} // CHECK: define{{.*}} spir_kernel void @foo8{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo8{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[PIPE_AS_QUAL:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[PIPE_ACCESS_QUAL:[0-9]+]] // CHECK: !kernel_arg_type ![[PIPE_TY:[0-9]+]] @@ -109,6 +118,7 @@ kernel void foo8(pipe int p1, pipe uchar p2, pipe uchar2 p3, const pipe uchar p4 kernel void foo9(signed char sc1, global const signed char* sc2) {} // CHECK: define{{.*}} spir_kernel void @foo9{{[^!]+}} +// ARGINFO: define{{.*}} spir_kernel void @foo9{{[^!]+}} // CHECK: !kernel_arg_addr_space ![[SCHAR_AS_QUAL:[0-9]+]] // CHECK: !kernel_arg_access_qual ![[MD42]] // CHECK: !kernel_arg_type ![[SCHAR_TY:[0-9]+]] diff --git a/clang/test/CodeGenOpenCL/opencl-kernel-call.cl b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl new file mode 100644 index 0000000000000..cdbe510b723b9 --- /dev/null +++ b/clang/test/CodeGenOpenCL/opencl-kernel-call.cl @@ -0,0 +1,959 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-attributes --include-generated-funcs --version 4 +// RUN: %clang_cc1 -O0 -triple i686-pc-darwin -emit-llvm -o - %s | FileCheck -check-prefix=X86 %s +// RUN: %clang_cc1 -O0 -triple amdgcn -emit-llvm -o - %s | FileCheck -check-prefix=AMDGCN %s + +#pragma OPENCL EXTENSION __cl_clang_function_pointers : enable + +typedef int int2 __attribute__((ext_vector_type(2))); + +typedef struct { + int cells[9]; +} Mat3X3; + +typedef struct { + int cells[16]; +} Mat4X4; + +typedef struct { + int cells[1024]; +} Mat32X32; + +typedef struct { + int cells[4096]; +} Mat64X64; + +struct StructOneMember { + int2 x; +}; + +struct StructTwoMember { + int2 x; + int2 y; +}; + +struct LargeStructOneMember { + int2 x[100]; +}; + +struct LargeStructTwoMember { + int2 x[40]; + int2 y[20]; +}; + +Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + Mat4X4 out; + return out; +} + +Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + Mat64X64 out; + return out; +} + +void FuncOneMember(struct StructOneMember u) { + u.x = (int2)(0, 0); +} + +void 
FuncOneLargeMember(struct LargeStructOneMember u) { + u.x[0] = (int2)(0, 0); +} + +void FuncTwoMember(struct StructTwoMember u) { + u.y = (int2)(0, 0); +} + +void FuncLargeTwoMember(struct LargeStructTwoMember u) { + u.y[0] = (int2)(0, 0); +} + +__attribute__((noinline)) kernel void callee_kern(global int *A){ + *A = 1; +} + +kernel void callee_kern_Mat3X3(global Mat3X3 *in, global Mat4X4 *out) { + out[0] = foo(in[1]); +} + +kernel void callee_kern_Mat32X32(global Mat32X32 *in, global Mat64X64 *out) { + out[0] = foo_large(in[1]); +} + +kernel void KernelOneMember(struct StructOneMember u) { + FuncOneMember(u); +} + +kernel void KernelLargeOneMember(struct LargeStructOneMember u) { + FuncOneLargeMember(u); +} + +kernel void KernelTwoMember(struct StructTwoMember u) { + FuncTwoMember(u); +} + +kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) { + FuncLargeTwoMember(u); +} + +__attribute__((noinline)) kernel void ext_callee_kern(global int *A); + +kernel void ext_callee_kern_Mat3X3(global Mat3X3 *in, global Mat4X4 *out); + +kernel void ext_callee_kern_Mat32X32(global Mat32X32 *in, global Mat64X64 *out); + +kernel void ext_KernelOneMember(struct StructOneMember u); + +kernel void ext_KernelLargeOneMember(struct LargeStructOneMember u); + +kernel void ext_KernelTwoMember(struct StructTwoMember u); + +kernel void ext_KernelLargeTwoMember(struct LargeStructTwoMember u); + +kernel void caller_kern(global int* A, global Mat3X3 *mat3X3, global Mat4X4 *mat4X4, global Mat32X32 *mat32X32, global Mat64X64 *mat64X64){ + callee_kern(A); + ext_callee_kern(A); + + callee_kern_Mat3X3(mat3X3, mat4X4); + callee_kern_Mat32X32(mat32X32, mat64X64); + ext_callee_kern_Mat3X3(mat3X3, mat4X4); + ext_callee_kern_Mat32X32(mat32X32, mat64X64); +} + +kernel void caller_kern2(struct StructOneMember structOneMem, global struct StructOneMember* global_structOneMem, struct StructTwoMember structTwoMem){ + KernelOneMember(structOneMem); + ext_KernelOneMember(structOneMem); + KernelTwoMember(structTwoMem); + ext_KernelTwoMember(structTwoMem); +} + +kernel void caller_kern3( struct LargeStructOneMember largeStructOneMem, struct LargeStructTwoMember largeStructTwoMem){ + KernelLargeOneMember(largeStructOneMem); + KernelLargeTwoMember(largeStructTwoMem); + ext_KernelLargeOneMember(largeStructOneMem); + ext_KernelLargeTwoMember(largeStructTwoMem); +} +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @foo( +// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @foo_large( +// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: 
[[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncOneLargeMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[Y]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @FuncLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @callee_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR2:[0-9]+]] 
!kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR4:[0-9]+]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: store i32 1, ptr [[TMP0]], align 4 +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @callee_kern_Mat3X3( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_Mat3X3( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr [[TMP1]], i32 1 +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[ARRAYIDX1]]) #[[ATTR4]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void 
@callee_kern_Mat32X32( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_callee_kern_Mat32X32( +// X86-SAME: ptr noundef align 4 [[IN:%.*]], ptr noundef align 4 [[OUT:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: store ptr [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32:%.*]], ptr [[TMP1]], i32 1 +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[ARRAYIDX1]]) #[[ATTR4]] +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline 
norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelLargeOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false) +// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] 
!kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: entry: +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false) +// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @caller_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META18:![0-9]+]] !kernel_arg_access_qual [[META19:![0-9]+]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT3X3]], ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT4X4]], ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT32X32]], ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT64X64]], ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_caller_kern(ptr noundef align 4 [[TMP0]], ptr noundef align 4 [[TMP1]], ptr noundef align 4 [[TMP2]], ptr noundef align 4 [[TMP3]], ptr noundef align 4 [[TMP4]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern( +// X86-SAME: ptr noundef align 4 [[A:%.*]], ptr noundef align 4 [[MAT3X3:%.*]], ptr noundef align 4 [[MAT4X4:%.*]], ptr noundef align 4 [[MAT32X32:%.*]], ptr noundef align 4 [[MAT64X64:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] { +// X86-NEXT: entry: +// X86-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT3X3]], ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT4X4]], ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT32X32]], ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: store ptr [[MAT64X64]], ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr noundef align 4 [[TMP0]]) #[[ATTR4]] +// X86-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 4 +// X86-NEXT: 
call void @__clang_ocl_kern_imp_ext_callee_kern(ptr noundef align 4 [[TMP1]]) #[[ATTR4]] +// X86-NEXT: [[TMP2:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP3:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr noundef align 4 [[TMP2]], ptr noundef align 4 [[TMP3]]) #[[ATTR4]] +// X86-NEXT: [[TMP4:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP5:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr noundef align 4 [[TMP4]], ptr noundef align 4 [[TMP5]]) #[[ATTR4]] +// X86-NEXT: [[TMP6:%.*]] = load ptr, ptr [[MAT3X3_ADDR]], align 4 +// X86-NEXT: [[TMP7:%.*]] = load ptr, ptr [[MAT4X4_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr noundef align 4 [[TMP6]], ptr noundef align 4 [[TMP7]]) #[[ATTR4]] +// X86-NEXT: [[TMP8:%.*]] = load ptr, ptr [[MAT32X32_ADDR]], align 4 +// X86-NEXT: [[TMP9:%.*]] = load ptr, ptr [[MAT64X64_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr noundef align 4 [[TMP8]], ptr noundef align 4 [[TMP9]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @caller_kern2( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[STRUCTONEMEM:%.*]], ptr noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[STRUCTTWOMEM:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// X86-NEXT: entry: +// X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_caller_kern2(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]], ptr noundef align 8 [[TMP0]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern2( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// X86-NEXT: entry: +// X86-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8 +// X86-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr, align 4 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTONEMEM]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[STRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 16, i1 false) +// X86-NEXT: store ptr [[GLOBAL_STRUCTONEMEM]], ptr [[GLOBAL_STRUCTONEMEM_ADDR]], align 4 +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) 
#[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[STRUCTONEMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[STRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define spir_kernel void @caller_kern3( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[LARGESTRUCTONEMEM:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[LARGESTRUCTTWOMEM:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: call void @__clang_ocl_kern_imp_caller_kern3(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// X86: Function Attrs: convergent noinline norecurse nounwind optnone +// X86-LABEL: define void @__clang_ocl_kern_imp_caller_kern3( +// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]], ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP1:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META26]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// X86-NEXT: entry: +// X86-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8 +// X86-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTONEMEM]], ptr align 4 [[TMP0]], i32 800, i1 false) +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[LARGESTRUCTTWOMEM]], ptr align 4 [[TMP1]], i32 480, i1 false) +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[LARGESTRUCTONEMEM]]) #[[ATTR4]] +// X86-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[LARGESTRUCTTWOMEM]]) #[[ATTR4]] +// X86-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr 
addrspace(5) [[RETVAL]], align 4 +// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @foo_large( +// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = 
getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR5:[0-9]+]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]]) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META6]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store i32 1, ptr addrspace(1) [[TMP0]], align 4 +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_Mat3X3( +// AMDGCN-SAME: ptr addrspace(1) 
noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META9:![0-9]+]] !kernel_arg_type [[META10:![0-9]+]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_Mat3X3( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META10]] !kernel_arg_base_type [[META10]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @callee_kern_Mat32X32( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] 
!kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_callee_kern_Mat32X32( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META8]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META12]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], 
i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP0]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META14]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeOneMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META15]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember( +// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// 
AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelTwoMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META16]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP3]], <2 x i32> [[TMP5]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0 +// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// 
AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1 +// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_KernelLargeTwoMember( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META13]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META17]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false) +// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]], ptr addrspace(1) noundef align 4 [[MAT3X3:%.*]], ptr addrspace(1) noundef align 4 [[MAT4X4:%.*]], ptr addrspace(1) noundef align 4 [[MAT32X32:%.*]], ptr addrspace(1) noundef align 4 [[MAT64X64:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META18:![0-9]+]] !kernel_arg_access_qual [[META19:![0-9]+]] !kernel_arg_type [[META20:![0-9]+]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT3X3]], ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT4X4]], ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT32X32]], ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT64X64]], ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern(ptr addrspace(1) noundef align 4 [[TMP0]], ptr addrspace(1) noundef align 4 [[TMP1]], ptr addrspace(1) noundef align 
4 [[TMP2]], ptr addrspace(1) noundef align 4 [[TMP3]], ptr addrspace(1) noundef align 4 [[TMP4]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[A:%.*]], ptr addrspace(1) noundef align 4 [[MAT3X3:%.*]], ptr addrspace(1) noundef align 4 [[MAT4X4:%.*]], ptr addrspace(1) noundef align 4 [[MAT32X32:%.*]], ptr addrspace(1) noundef align 4 [[MAT64X64:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META18]] !kernel_arg_access_qual [[META19]] !kernel_arg_type [[META20]] !kernel_arg_base_type [[META20]] !kernel_arg_type_qual [[META21]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT3X3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT4X4_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT32X32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[MAT64X64_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT3X3]], ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT4X4]], ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT32X32]], ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[MAT64X64]], ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern(ptr addrspace(1) noundef align 4 [[TMP1]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP2]], ptr addrspace(1) noundef align 4 [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP4]], ptr addrspace(1) noundef align 4 [[TMP5]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT3X3_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT4X4_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat3X3(ptr addrspace(1) noundef align 4 [[TMP6]], ptr addrspace(1) noundef align 4 [[TMP7]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT32X32_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[MAT64X64_ADDR]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_callee_kern_Mat32X32(ptr addrspace(1) noundef align 4 [[TMP8]], ptr addrspace(1) noundef align 4 [[TMP9]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// 
+// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern2( +// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], [[STRUCT_STRUCTTWOMEMBER:%.*]] [[STRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[STRUCTTWOMEM_COERCE]], 1 +// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern2(<2 x i32> [[TMP5]], ptr addrspace(1) noundef align 8 [[TMP4]], <2 x i32> [[TMP7]], <2 x i32> [[TMP9]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern2( +// AMDGCN-SAME: <2 x i32> [[STRUCTONEMEM_COERCE:%.*]], ptr addrspace(1) noundef align 8 [[GLOBAL_STRUCTONEMEM:%.*]], <2 x i32> [[STRUCTTWOMEM_COERCE0:%.*]], <2 x i32> [[STRUCTTWOMEM_COERCE1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[STRUCTONEMEM:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[STRUCTTWOMEM:%.*]] = alloca 
[[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[GLOBAL_STRUCTONEMEM_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTONEMEM_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTTWOMEM_COERCE0]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: store <2 x i32> [[STRUCTTWOMEM_COERCE1]], ptr addrspace(5) [[TMP1]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[GLOBAL_STRUCTONEMEM]], ptr addrspace(5) [[GLOBAL_STRUCTONEMEM_ADDR]], align 8 +// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelOneMember(<2 x i32> [[TMP2]]) #[[ATTR5]] +// AMDGCN-NEXT: [[COERCE_DIVE2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[STRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE2]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelOneMember(<2 x i32> [[TMP3]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR5]] +// AMDGCN-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP8]], align 8 +// AMDGCN-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[STRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP11:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP10]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelTwoMember(<2 x i32> [[TMP9]], <2 x i32> [[TMP11]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @caller_kern3( +// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[LARGESTRUCTONEMEM_COERCE:%.*]], [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[LARGESTRUCTTWOMEM_COERCE:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META26:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27:![0-9]+]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[LARGESTRUCTONEMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[LARGESTRUCTONEMEM_COERCE]], 0 +// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 0 +// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8 +// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[LARGESTRUCTTWOMEM]], i32 0, i32 1 +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[LARGESTRUCTTWOMEM_COERCE]], 1 +// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP5]], ptr addrspace(5) [[TMP4]], align 8 +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_caller_kern3(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +// +// AMDGCN: Function Attrs: convergent noinline norecurse nounwind optnone +// AMDGCN-LABEL: define dso_local void @__clang_ocl_kern_imp_caller_kern3( +// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]], ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP1:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META26]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META27]] !kernel_arg_base_type [[META27]] !kernel_arg_type_qual [[META11]] { +// AMDGCN-NEXT: entry: +// AMDGCN-NEXT: [[LARGESTRUCTONEMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: [[LARGESTRUCTTWOMEM:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTONEMEM]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[LARGESTRUCTTWOMEM]], ptr addrspace(5) align 8 [[TMP1]], i64 480, i1 false) +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeOneMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[LARGESTRUCTONEMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: call void @__clang_ocl_kern_imp_ext_KernelLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[LARGESTRUCTTWOMEM]]) #[[ATTR5]] +// AMDGCN-NEXT: ret void +// +//. 
+// X86: [[META4]] = !{i32 1} +// X86: [[META5]] = !{!"none"} +// X86: [[META6]] = !{!"int*"} +// X86: [[META7]] = !{!""} +// X86: [[META8]] = !{i32 1, i32 1} +// X86: [[META9]] = !{!"none", !"none"} +// X86: [[META10]] = !{!"Mat3X3*", !"Mat4X4*"} +// X86: [[META11]] = !{!"", !""} +// X86: [[META12]] = !{!"Mat32X32*", !"Mat64X64*"} +// X86: [[META13]] = !{i32 0} +// X86: [[META14]] = !{!"struct StructOneMember"} +// X86: [[META15]] = !{!"struct LargeStructOneMember"} +// X86: [[META16]] = !{!"struct StructTwoMember"} +// X86: [[META17]] = !{!"struct LargeStructTwoMember"} +// X86: [[META18]] = !{i32 1, i32 1, i32 1, i32 1, i32 1} +// X86: [[META19]] = !{!"none", !"none", !"none", !"none", !"none"} +// X86: [[META20]] = !{!"int*", !"Mat3X3*", !"Mat4X4*", !"Mat32X32*", !"Mat64X64*"} +// X86: [[META21]] = !{!"", !"", !"", !"", !""} +// X86: [[META22]] = !{i32 0, i32 1, i32 0} +// X86: [[META23]] = !{!"none", !"none", !"none"} +// X86: [[META24]] = !{!"struct StructOneMember", !"struct StructOneMember*", !"struct StructTwoMember"} +// X86: [[META25]] = !{!"", !"", !""} +// X86: [[META26]] = !{i32 0, i32 0} +// X86: [[META27]] = !{!"struct LargeStructOneMember", !"struct LargeStructTwoMember"} +//. +// AMDGCN: [[META4]] = !{i32 1} +// AMDGCN: [[META5]] = !{!"none"} +// AMDGCN: [[META6]] = !{!"int*"} +// AMDGCN: [[META7]] = !{!""} +// AMDGCN: [[META8]] = !{i32 1, i32 1} +// AMDGCN: [[META9]] = !{!"none", !"none"} +// AMDGCN: [[META10]] = !{!"Mat3X3*", !"Mat4X4*"} +// AMDGCN: [[META11]] = !{!"", !""} +// AMDGCN: [[META12]] = !{!"Mat32X32*", !"Mat64X64*"} +// AMDGCN: [[META13]] = !{i32 0} +// AMDGCN: [[META14]] = !{!"struct StructOneMember"} +// AMDGCN: [[META15]] = !{!"struct LargeStructOneMember"} +// AMDGCN: [[META16]] = !{!"struct StructTwoMember"} +// AMDGCN: [[META17]] = !{!"struct LargeStructTwoMember"} +// AMDGCN: [[META18]] = !{i32 1, i32 1, i32 1, i32 1, i32 1} +// AMDGCN: [[META19]] = !{!"none", !"none", !"none", !"none", !"none"} +// AMDGCN: [[META20]] = !{!"int*", !"Mat3X3*", !"Mat4X4*", !"Mat32X32*", !"Mat64X64*"} +// AMDGCN: [[META21]] = !{!"", !"", !"", !"", !""} +// AMDGCN: [[META22]] = !{i32 0, i32 1, i32 0} +// AMDGCN: [[META23]] = !{!"none", !"none", !"none"} +// AMDGCN: [[META24]] = !{!"struct StructOneMember", !"struct StructOneMember*", !"struct StructTwoMember"} +// AMDGCN: [[META25]] = !{!"", !"", !""} +// AMDGCN: [[META26]] = !{i32 0, i32 0} +// AMDGCN: [[META27]] = !{!"struct LargeStructOneMember", !"struct LargeStructTwoMember"} +//. 
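
The AMDGCN checks above all encode one shape: each OpenCL kernel keeps its
amdgpu_kernel entry point, that entry point merely spills and reloads its
arguments and forwards them to a `__clang_ocl_kern_imp_*` stub emitted as a
plain function, and kernel-to-kernel calls target the callee's stub rather
than the kernel itself. A minimal hand-written LLVM IR sketch of that shape
(the kernel names and the single pointer argument are illustrative, not taken
from the test):

  ; Sketch only; the real output also carries the attributes and
  ; !kernel_arg_* metadata checked above.
  define amdgpu_kernel void @caller(ptr addrspace(1) %a) {
  entry:
    ; The kernel entry point just forwards to its own stub.
    call void @__clang_ocl_kern_imp_caller(ptr addrspace(1) %a)
    ret void
  }

  define void @__clang_ocl_kern_imp_caller(ptr addrspace(1) %a) {
  entry:
    ; The kernel body lives in the stub; a call to another kernel lowers
    ; to a call of that kernel's stub, never of its amdgpu_kernel entry.
    call void @__clang_ocl_kern_imp_callee(ptr addrspace(1) %a)
    ret void
  }

  declare void @__clang_ocl_kern_imp_callee(ptr addrspace(1))
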
diff --git a/clang/test/CodeGenOpenCL/reflect.cl b/clang/test/CodeGenOpenCL/reflect.cl index f5b618f6a35d3..4abb40aa3ed50 100644 --- a/clang/test/CodeGenOpenCL/reflect.cl +++ b/clang/test/CodeGenOpenCL/reflect.cl @@ -1,6 +1,13 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 4 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -emit-llvm -O0 -o - | FileCheck %s +bool device_function() { + return __nvvm_reflect("__CUDA_ARCH") >= 700; +} + +__kernel void kernel_function(__global int *i) { + *i = device_function(); +} // CHECK-LABEL: define dso_local zeroext i1 @device_function( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: @@ -8,24 +15,28 @@ // CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[TMP0]], 700 // CHECK-NEXT: ret i1 [[CMP]] // -bool device_function() { - return __nvvm_reflect("__CUDA_ARCH") >= 700; -} - +// // CHECK-LABEL: define dso_local ptx_kernel void @kernel_function( // CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 // CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4 -// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @device_function() #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4 +// CHECK-NEXT: call void @__clang_ocl_kern_imp_kernel_function(ptr addrspace(1) noundef align 4 [[TMP0]]) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local ptx_kernel void @__clang_ocl_kern_imp_kernel_function( +// CHECK-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META5]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// CHECK-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR]], align 4 +// CHECK-NEXT: [[CALL:%.*]] = call zeroext i1 @device_function() #[[ATTR3]] // CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CALL]] to i32 // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[I_ADDR]], align 4 // CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP0]], align 4 // CHECK-NEXT: ret void // -__kernel void kernel_function(__global int *i) { - *i = device_function(); -} //. 
// CHECK: [[META3]] = !{i32 1} // CHECK: [[META4]] = !{!"none"} diff --git a/clang/test/CodeGenOpenCL/sampler.cl b/clang/test/CodeGenOpenCL/sampler.cl index 3649afba50fe2..227242be73c5e 100644 --- a/clang/test/CodeGenOpenCL/sampler.cl +++ b/clang/test/CodeGenOpenCL/sampler.cl @@ -36,8 +36,12 @@ void fnc4smp(sampler_t s) {} kernel void foo(sampler_t smp_par) { // CHECK-SPIR-LABEL: define{{.*}} spir_kernel void @foo(target("spirv.Sampler") %smp_par) + // CHECK-SPIR: call spir_func void @__clang_ocl_kern_imp_foo(target("spirv.Sampler") %0) + // CHECK-SPIR-LABEL: define{{.*}} spir_func void @__clang_ocl_kern_imp_foo(target("spirv.Sampler") %smp_par) // CHECK-SPIR: [[smp_par_ptr:%[A-Za-z0-9_\.]+]] = alloca target("spirv.Sampler") // CHECK-X86-LABEL: define{{.*}} spir_kernel void @foo(ptr %smp_par) + // CHECK-X86: call void @__clang_ocl_kern_imp_foo(ptr %0) + // CHECK-X86-LABEL: define{{.*}} void @__clang_ocl_kern_imp_foo(ptr %smp_par) // CHECK-X86: [[smp_par_ptr:%[A-Za-z0-9_\.]+]] = alloca ptr // Case 2b diff --git a/clang/test/CodeGenOpenCL/spir-calling-conv.cl b/clang/test/CodeGenOpenCL/spir-calling-conv.cl index 569ea0cbe1af6..56e1b662a7b4e 100644 --- a/clang/test/CodeGenOpenCL/spir-calling-conv.cl +++ b/clang/test/CodeGenOpenCL/spir-calling-conv.cl @@ -4,15 +4,18 @@ int get_dummy_id(int D); kernel void bar(global int *A); +//CHECK: define{{.*}} spir_kernel void @foo(ptr addrspace(1) noundef align 4 %A) +//CHECK: tail call spir_func void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4 %A) + kernel void foo(global int *A) -// CHECK: define{{.*}} spir_kernel void @foo(ptr addrspace(1) noundef align 4 %A) +// CHECK: define{{.*}} spir_func void @__clang_ocl_kern_imp_foo(ptr addrspace(1) noundef align 4 %A) { int id = get_dummy_id(0); // CHECK: %{{[a-z0-9_]+}} = tail call spir_func i32 @get_dummy_id(i32 noundef 0) A[id] = id; bar(A); - // CHECK: tail call spir_kernel void @bar(ptr addrspace(1) noundef align 4 %A) + // CHECK: tail call spir_func void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4 %A) } // CHECK: declare spir_func i32 @get_dummy_id(i32 noundef) -// CHECK: declare spir_kernel void @bar(ptr addrspace(1) noundef align 4) +// CHECK: declare spir_func void @__clang_ocl_kern_imp_bar(ptr addrspace(1) noundef align 4) diff --git a/clang/test/CodeGenOpenCL/visibility.cl b/clang/test/CodeGenOpenCL/visibility.cl index addfe33377f93..31807aacf8c2a 100644 --- a/clang/test/CodeGenOpenCL/visibility.cl +++ b/clang/test/CodeGenOpenCL/visibility.cl @@ -37,22 +37,33 @@ __attribute__((visibility("protected"))) extern int ext_protected; // FVIS-PROTECTED: @ext_default = external local_unnamed_addr // FVIS-HIDDEN: @ext_default = external local_unnamed_addr __attribute__((visibility("default"))) extern int ext_default; - // FVIS-DEFAULT: define{{.*}} amdgpu_kernel void @kern() // FVIS-PROTECTED: define protected amdgpu_kernel void @kern() // FVIS-HIDDEN: define protected amdgpu_kernel void @kern() +// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern() kernel void kern() {} // FVIS-DEFAULT: define protected amdgpu_kernel void @kern_hidden() // FVIS-PROTECTED: define protected amdgpu_kernel void @kern_hidden() // FVIS-HIDDEN: define protected amdgpu_kernel void @kern_hidden() +// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_hidden() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_hidden() +// 
FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_hidden() __attribute__((visibility("hidden"))) kernel void kern_hidden() {} // FVIS-DEFAULT: define protected amdgpu_kernel void @kern_protected() // FVIS-PROTECTED: define protected amdgpu_kernel void @kern_protected() // FVIS-HIDDEN: define protected amdgpu_kernel void @kern_protected() +// FVIS-DEFAULT: define protected void @__clang_ocl_kern_imp_kern_protected() +// FVIS-PROTECTED: define protected void @__clang_ocl_kern_imp_kern_protected() +// FVIS-HIDDEN: define protected void @__clang_ocl_kern_imp_kern_protected() __attribute__((visibility("protected"))) kernel void kern_protected() {} // FVIS-DEFAULT: define{{.*}} amdgpu_kernel void @kern_default() // FVIS-PROTECTED: define{{.*}} amdgpu_kernel void @kern_default() // FVIS-HIDDEN: define{{.*}} amdgpu_kernel void @kern_default() +// FVIS-DEFAULT: define{{.*}} void @__clang_ocl_kern_imp_kern_default() +// FVIS-PROTECTED: define{{.*}} void @__clang_ocl_kern_imp_kern_default() +// FVIS-HIDDEN: define{{.*}} void @__clang_ocl_kern_imp_kern_default() __attribute__((visibility("default"))) kernel void kern_default() {} // FVIS-DEFAULT: define{{.*}} void @func() @@ -85,31 +96,42 @@ __attribute__((visibility("default"))) extern void ext_func_default(); void use() { glob = ext + ext_hidden + ext_protected + ext_default; ext_kern(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern() ext_kern_hidden(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern_hidden() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern_hidden() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern_hidden() ext_kern_protected(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern_protected() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern_protected() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern_protected() ext_kern_default(); + // FVIS-DEFAULT: tail call void @__clang_ocl_kern_imp_ext_kern_default() + // FVIS-PROTECTED: tail call void @__clang_ocl_kern_imp_ext_kern_default() + // FVIS-HIDDEN: tail call void @__clang_ocl_kern_imp_ext_kern_default() ext_func(); ext_func_hidden(); ext_func_protected(); ext_func_default(); } -// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern() - -// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_hidden() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_hidden() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_hidden() +// FVIS-DEFAULT: declare void @__clang_ocl_kern_imp_ext_kern() +// FVIS-PROTECTED: declare protected void @__clang_ocl_kern_imp_ext_kern() +// FVIS-HIDDEN: declare protected void @__clang_ocl_kern_imp_ext_kern() -// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_protected() -// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_protected() -// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_protected() +// FVIS-DEFAULT: declare protected void @__clang_ocl_kern_imp_ext_kern_hidden() +// FVIS-PROTECTED: declare protected void @__clang_ocl_kern_imp_ext_kern_hidden() +// FVIS-HIDDEN: declare protected void @__clang_ocl_kern_imp_ext_kern_hidden() -// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern_default() 
-// FVIS-PROTECTED: declare amdgpu_kernel void @ext_kern_default() -// FVIS-HIDDEN: declare amdgpu_kernel void @ext_kern_default() +// FVIS-DEFAULT: declare protected void @__clang_ocl_kern_imp_ext_kern_protected() +// FVIS-PROTECTED: declare protected void @__clang_ocl_kern_imp_ext_kern_protected() +// FVIS-HIDDEN: declare protected void @__clang_ocl_kern_imp_ext_kern_protected() +// FVIS-DEFAULT: declare void @__clang_ocl_kern_imp_ext_kern_default() +// FVIS-PROTECTED: declare void @__clang_ocl_kern_imp_ext_kern_default() +// FVIS-HIDDEN: declare void @__clang_ocl_kern_imp_ext_kern_default() // FVIS-DEFAULT: declare void @ext_func() // FVIS-PROTECTED: declare protected void @ext_func() @@ -126,3 +148,6 @@ void use() { // FVIS-DEFAULT: declare void @ext_func_default() // FVIS-PROTECTED: declare void @ext_func_default() // FVIS-HIDDEN: declare void @ext_func_default() + + + diff --git a/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp b/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp index 2f1b6c196fd51..8395a40095b94 100644 --- a/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp +++ b/clang/test/CodeGenOpenCLCXX/addrspace-of-this.clcpp @@ -125,9 +125,6 @@ __kernel void test__global() { // EXPL: call spir_func void @_ZNU3AS41CC1EOU3AS4S_(ptr addrspace(4) {{[^,]*}} [[C5GEN]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[CALL]]) // IMPL: call void @llvm.memcpy.p0.p4.i32(ptr {{.*}}%c5, ptr addrspace(4) {{.*}}[[CALL]] -// Tests address space of inline members -//COMMON: @_ZNU3AS41C3getEv(ptr addrspace(4) {{[^,]*}} %this) -//COMMON: @_ZNU3AS41CplERU3AS4KS_(ptr dead_on_unwind noalias writable sret(%class.C) align 4 %agg.result, ptr addrspace(4) {{[^,]*}} %this #define TEST(AS) \ __kernel void test##AS() { \ AS C c; \ @@ -190,6 +187,10 @@ TEST(__private) // EXPL: call spir_func noundef align 4 dereferenceable(4) ptr addrspace(4) @_ZNU3AS41CaSERU3AS4KS_(ptr addrspace(4) {{[^,]*}} [[C2GEN]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[C1GEN]]) // IMPL: call void @llvm.memcpy.p4.p4.i32(ptr addrspace(4) {{.*}}[[C2GEN]], ptr addrspace(4) {{.*}}[[C1GEN]] +// Tests address space of inline members +//COMMON: @_ZNU3AS41C3getEv(ptr addrspace(4) {{[^,]*}} %this) +//COMMON: @_ZNU3AS41CplERU3AS4KS_(ptr dead_on_unwind noalias writable sret(%class.C) align 4 %agg.result, ptr addrspace(4) {{[^,]*}} %this + // Test that calling a const method from a non-const method does not crash Clang. 
 class ConstAndNonConstMethod {
 public:

diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index f15130d5f8b61..9331afc305c25 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -1,5 +1,5 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O1 -verify %s -o /dev/null

 // expected-remark@+10 {{Function Name: foo}}
 // expected-remark@+9 {{ TotalSGPRs: 13}}

From 0d19efa9d5d872962a0a83c865b4636d58f46519 Mon Sep 17 00:00:00 2001
From: Mingming Liu
Date: Mon, 7 Apr 2025 22:22:08 -0700
Subject: [PATCH 0941/1029] [NFC] In codegen pipeline, turn static-data-splitter
 pass on/off with its own option (#134752)

Per discussion in
https://github.com/llvm/llvm-project/pull/129781#discussion_r2017489088, we'd
like to decouple the static-data-splitter passes from the machine function
splitter (MFS), so that enabling them no longer requires MFS.

---
 llvm/lib/CodeGen/TargetPassConfig.cpp | 31 +++++++++-------
 .../AArch64/constant-pool-partition.ll | 10 +++---
 .../CodeGen/X86/constant-pool-partition.ll | 14 ++++----
 .../CodeGen/X86/global-variable-partition.ll | 36 +++++++++----------
 llvm/test/CodeGen/X86/jump-table-partition.ll | 18 +++++-----
 5 files changed, 55 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index fa1bb84ec5319..1bf6621cdf791 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1235,13 +1235,9 @@ void TargetPassConfig::addMachinePasses() {
     addPass(createMIRAddFSDiscriminatorsPass(
         sampleprof::FSDiscriminatorPass::PassLast));

-  // Machine function splitter uses the basic block sections feature.
-  // When used along with `-basic-block-sections=`, the basic-block-sections
-  // feature takes precedence. This means functions eligible for
-  // basic-block-sections optimizations (`=all`, or `=list=` with function
-  // included in the list profile) will get that optimization instead.
   if (TM->Options.EnableMachineFunctionSplitter ||
-      EnableMachineFunctionSplitter) {
+      EnableMachineFunctionSplitter || SplitStaticData ||
+      TM->Options.EnableStaticDataPartitioning) {
     const std::string ProfileFile = getFSProfileFile(TM);
     if (!ProfileFile.empty()) {
       if (EnableFSDiscriminator) {
@@ -1256,14 +1252,23 @@ void TargetPassConfig::addMachinePasses() {
           "performance.\n";
       }
     }
+  }
+
+  // Machine function splitter uses the basic block sections feature.
+  // When used along with `-basic-block-sections=`, the basic-block-sections
+  // feature takes precedence. This means functions eligible for
+  // basic-block-sections optimizations (`=all`, or `=list=` with function
+  // included in the list profile) will get that optimization instead.
+  if (TM->Options.EnableMachineFunctionSplitter ||
+      EnableMachineFunctionSplitter)
     addPass(createMachineFunctionSplitterPass());
-    if (SplitStaticData || TM->Options.EnableStaticDataPartitioning) {
-      // The static data splitter pass is a machine function pass. and
-      // static data annotator pass is a module-wide pass. See the file comment
-      // in StaticDataAnnotator.cpp for the motivation.
- addPass(createStaticDataSplitterPass()); - addPass(createStaticDataAnnotatorPass()); - } + + if (SplitStaticData || TM->Options.EnableStaticDataPartitioning) { + // The static data splitter pass is a machine function pass. and + // static data annotator pass is a module-wide pass. See the file comment + // in StaticDataAnnotator.cpp for the motivation. + addPass(createStaticDataSplitterPass()); + addPass(createStaticDataAnnotatorPass()); } // We run the BasicBlockSections pass if either we need BB sections or BB // address map (or both). diff --git a/llvm/test/CodeGen/AArch64/constant-pool-partition.ll b/llvm/test/CodeGen/AArch64/constant-pool-partition.ll index ab627b02a1bc7..d4447131e9de1 100644 --- a/llvm/test/CodeGen/AArch64/constant-pool-partition.ll +++ b/llvm/test/CodeGen/AArch64/constant-pool-partition.ll @@ -1,12 +1,10 @@ -; RUN: llc -mtriple=aarch64 -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=true \ -; RUN: -unique-section-names=false \ +; RUN: llc -mtriple=aarch64 -partition-static-data-sections \ +; RUN: -function-sections -unique-section-names=false \ ; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always ; Repeat the RUN command above for big-endian systems. -; RUN: llc -mtriple=aarch64_be -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=true \ -; RUN: -unique-section-names=false \ +; RUN: llc -mtriple=aarch64_be -partition-static-data-sections \ +; RUN: -function-sections -unique-section-names=false \ ; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always ; Tests that constant pool hotness is aggregated across the module. The diff --git a/llvm/test/CodeGen/X86/constant-pool-partition.ll b/llvm/test/CodeGen/X86/constant-pool-partition.ll index d2c87b7b3fc14..515284fb2cf1a 100644 --- a/llvm/test/CodeGen/X86/constant-pool-partition.ll +++ b/llvm/test/CodeGen/X86/constant-pool-partition.ll @@ -10,18 +10,16 @@ target triple = "x86_64-grtev4-linux-gnu" ; 2. Similarly if a constant is accessed by both cold function and un-profiled ; function, constant pools for this constant should not have .unlikely suffix. 
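
The comment above states that a constant shared by a cold function and an
un-profiled function must not land in a `.unlikely`-suffixed constant-pool
section. A minimal hand-written sketch of that case (the value and function
names are hypothetical, not from this test):

  ; Sketch: float 0.1 is used by both a cold and an un-profiled function,
  ; so its constant-pool entry should stay in the default .rodata.cst*
  ; section rather than a .unlikely-suffixed one.
  define float @cold_fn(float %x) !prof !0 {
    %r = fadd float %x, 0x3FB99999A0000000
    ret float %r
  }

  define float @unprofiled_fn(float %x) {
    %r = fadd float %x, 0x3FB99999A0000000
    ret float %r
  }

  !0 = !{!"function_entry_count", i64 0}
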
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=true -data-sections=true \ -; RUN: -unique-section-names=false \ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -partition-static-data-sections \ +; RUN: -function-sections -data-sections -unique-section-names=false \ ; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=true -data-sections=true \ -; RUN: -unique-section-names=true \ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -partition-static-data-sections \ +; RUN: -function-sections -data-sections -unique-section-names \ ; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=false -data-sections=false \ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -partition-static-data-sections \ +; RUN: -function-sections=false -data-sections=false \ ; RUN: -unique-section-names=false \ ; RUN: %s -o - 2>&1 | FileCheck %s --dump-input=always diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll index 91084d038cfe0..ce06d1712f840 100644 --- a/llvm/test/CodeGen/X86/global-variable-partition.ll +++ b/llvm/test/CodeGen/X86/global-variable-partition.ll @@ -11,22 +11,22 @@ target triple = "x86_64-unknown-linux-gnu" ; This RUN command sets `-data-sections=true -unique-section-names=true` so data ; sections are uniqufied by numbers. -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -data-sections=true \ -; RUN: -unique-section-names=true -relocation-model=pic \ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +; RUN: -partition-static-data-sections=true \ +; RUN: -data-sections=true -unique-section-names=true \ ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=SYM,COMMON --dump-input=always ; This RUN command sets `-data-sections=true -unique-section-names=false` so ; data sections are uniqufied by variable names. -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -data-sections=true \ -; RUN: -unique-section-names=false -relocation-model=pic \ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +; RUN: -partition-static-data-sections=true \ +; RUN: -data-sections=true -unique-section-names=false \ ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=UNIQ,COMMON --dump-input=always ; This RUN command sets `-data-sections=false -unique-section-names=false`. 
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -data-sections=false \ -; RUN: -unique-section-names=false -relocation-model=pic \ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \ +; RUN: -partition-static-data-sections=true \ +; RUN: -data-sections=false -unique-section-names=false \ ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=AGG,COMMON --dump-input=always ; For @.str and @.str.1 @@ -42,19 +42,19 @@ target triple = "x86_64-unknown-linux-gnu" ; For @hot_relro_array ; COMMON: .type hot_relro_array,@object ; SYM-NEXT: .section .data.rel.ro.hot.hot_relro_array -; UNIQ-NEXT: .section .data.rel.ro.hot.,"aw",@progbits,unique,3 +; UNIQ-NEXT: .section .data.rel.ro.hot.,"aw",@progbits,unique,1 ; AGG-NEXT: .section .data.rel.ro.hot.,"aw",@progbits ; For @hot_data, which is accessed by {cold_func, unprofiled_func, hot_func}. ; COMMON: .type hot_data,@object ; SYM-NEXT: .section .data.hot.hot_data,"aw",@progbits -; UNIQ-NEXT: .section .data.hot.,"aw",@progbits,unique,4 +; UNIQ-NEXT: .section .data.hot.,"aw",@progbits,unique,2 ; AGG-NEXT: .section .data.hot.,"aw",@progbits ; For @hot_bss, which is accessed by {unprofiled_func, hot_func}. ; COMMON: .type hot_bss,@object ; SYM-NEXT: .section .bss.hot.hot_bss,"aw",@nobits -; UNIQ-NEXT: .section .bss.hot.,"aw",@nobits,unique,5 +; UNIQ-NEXT: .section .bss.hot.,"aw",@nobits,unique,3 ; AGG-NEXT: .section .bss.hot.,"aw",@nobits ; For @.str.2 @@ -68,13 +68,13 @@ target triple = "x86_64-unknown-linux-gnu" ; For @cold_bss ; COMMON: .type cold_bss,@object ; SYM-NEXT: .section .bss.unlikely.cold_bss,"aw",@nobits -; UNIQ-NEXT: .section .bss.unlikely.,"aw",@nobits,unique,6 +; UNIQ-NEXT: .section .bss.unlikely.,"aw",@nobits,unique,4 ; AGG-NEXT: .section .bss.unlikely.,"aw",@nobits ; For @cold_data ; COMMON: .type cold_data,@object ; SYM-NEXT: .section .data.unlikely.cold_data,"aw",@progbits -; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,7 +; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,5 ; AGG-NEXT: .section .data.unlikely.,"aw",@progbits ; For @cold_data_custom_foo_section @@ -87,7 +87,7 @@ target triple = "x86_64-unknown-linux-gnu" ; For @cold_relro_array ; COMMON: .type cold_relro_array,@object ; SYM-NEXT: .section .data.rel.ro.unlikely.cold_relro_array,"aw",@progbits -; UNIQ-NEXT: .section .data.rel.ro.unlikely.,"aw",@progbits,unique,8 +; UNIQ-NEXT: .section .data.rel.ro.unlikely.,"aw",@progbits,unique,6 ; AGG-NEXT: .section .data.rel.ro.unlikely.,"aw",@progbits ; Currently static-data-splitter only analyzes access from code. 
@@ -97,19 +97,19 @@ target triple = "x86_64-unknown-linux-gnu" ; For @bss2 ; COMMON: .type bss2,@object ; SYM-NEXT: .section .bss.unlikely.bss2,"aw",@nobits -; UNIQ-NEXT: .section .bss.unlikely.,"aw",@nobits,unique,9 +; UNIQ-NEXT: .section .bss.unlikely.,"aw",@nobits,unique,7 ; AGG-NEXT: .section .bss.unlikely.,"aw",@nobits ; For @data3 ; COMMON: .type data3,@object ; SYM-NEXT: .section .data.unlikely.data3,"aw",@progbits -; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,10 +; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,8 ; AGG-NEXT: .section .data.unlikely.,"aw",@progbits ; For @data_with_unknown_hotness ; SYM: .type .Ldata_with_unknown_hotness,@object # @data_with_unknown_hotness ; SYM: .section .data..Ldata_with_unknown_hotness,"aw",@progbits -; UNIQ: .section .data,"aw",@progbits,unique,11 +; UNIQ: .section .data,"aw",@progbits,unique,9 ; The `.section` directive is omitted for .data with -unique-section-names=false. ; See MCSectionELF::shouldOmitSectionDirective for the implementation details. ; AGG: .data diff --git a/llvm/test/CodeGen/X86/jump-table-partition.ll b/llvm/test/CodeGen/X86/jump-table-partition.ll index 0d76f8a5a91ed..40dbc8131e22b 100644 --- a/llvm/test/CodeGen/X86/jump-table-partition.ll +++ b/llvm/test/CodeGen/X86/jump-table-partition.ll @@ -13,19 +13,19 @@ ; STAT: 2 static-data-splitter - Number of hot jump tables seen ; STAT: 2 static-data-splitter - Number of jump tables with unknown hotness -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=true \ -; RUN: -min-jump-table-entries=2 -unique-section-names=false \ +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -partition-static-data-sections \ +; RUN: -function-sections -unique-section-names=false \ +; RUN: -min-jump-table-entries=2 \ ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=NUM,JT ; Section names will optionally have `.` if -function-sections is enabled. -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=true \ -; RUN: -min-jump-table-entries=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=FUNC,JT +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -partition-static-data-sections \ +; RUN: -function-sections -min-jump-table-entries=2 \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=FUNC,JT -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -enable-split-machine-functions \ -; RUN: -partition-static-data-sections=true -function-sections=false \ -; RUN: -min-jump-table-entries=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=FUNCLESS,JT +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -partition-static-data-sections \ +; RUN: -function-sections=false -min-jump-table-entries=2 \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefixes=FUNCLESS,JT ; In function @foo, the 2 switch instructions to jt0.* and jt1.* are placed in ; hot-prefixed sections, and the 2 switch instructions to jt2.* and jt3.* are From 2c1bdd4a0811af89eb9631935fbd90f13a04eacb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Csan=C3=A1d=20Hajd=C3=BA?= Date: Tue, 8 Apr 2025 08:47:51 +0200 Subject: [PATCH 0942/1029] [LLD][ELF] Allow merging XO and RX sections, and add `--[no-]xosegment` flag (#132412) Following from the discussion in #132224, this seems like the best approach to deal with a mix of XO and RX output sections in the same binary. This change will also simplify the implementation of the PURECODE section flag for AArch64. 
To control this behaviour, the `--[no-]xosegment` flag is added to LLD (similarly to `--[no-]rosegment`), which determines whether to allow merging XO and RX sections in the same segment. The default value is `--no-xosegment`, which is a breaking change compared to the previous behaviour. Release notes are also added, since this will be a breaking change. --- lld/ELF/Config.h | 1 + lld/ELF/Driver.cpp | 1 + lld/ELF/Options.td | 4 ++ lld/ELF/Writer.cpp | 14 ++++-- lld/docs/ReleaseNotes.rst | 8 ++++ lld/test/ELF/aarch64-execute-only-mixed.s | 55 +++++++++++++++++++++++ lld/test/ELF/aarch64-execute-only.s | 4 +- lld/test/ELF/arm-execute-only-mixed.s | 55 +++++++++++++++++++++++ lld/test/ELF/arm-execute-only.s | 4 +- 9 files changed, 138 insertions(+), 8 deletions(-) create mode 100644 lld/test/ELF/aarch64-execute-only-mixed.s create mode 100644 lld/test/ELF/arm-execute-only-mixed.s diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 03b3cd4771f49..f0e9592d85dd6 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -341,6 +341,7 @@ struct Config { llvm::DenseSet saveTempsArgs; llvm::SmallVector, 0> shuffleSections; bool singleRoRx; + bool singleXoRx; bool shared; bool symbolic; bool isStatic = false; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 0a220432333cc..b3c5518b42877 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1485,6 +1485,7 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) { ctx.arg.randomizeSectionPadding = args::getInteger(args, OPT_randomize_section_padding, 0); ctx.arg.singleRoRx = !args.hasFlag(OPT_rosegment, OPT_no_rosegment, true); + ctx.arg.singleXoRx = !args.hasFlag(OPT_xosegment, OPT_no_xosegment, false); ctx.arg.soName = args.getLastArgValue(OPT_soname); ctx.arg.sortSection = getSortSection(ctx, args); ctx.arg.splitStackAdjustSize = diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index 62d8f49acde39..76d28096f82c8 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -432,6 +432,10 @@ defm rosegment: BB<"rosegment", "Put read-only non-executable sections in their own segment (default)", "Do not put read-only non-executable sections in their own segment">; +defm xosegment: BB<"xosegment", + "Put execute-only sections in their own segment", + "Do not put execute-only sections in their own segment (default)">; + defm rpath: Eq<"rpath", "Add a DT_RUNPATH to the output">; def relocatable: F<"relocatable">, HelpText<"Create relocatable object file">; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 2cea6a44b391a..28b24f90716b8 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2379,10 +2379,16 @@ Writer::createPhdrs(Partition &part) { // so when hasSectionsCommand, since we cannot introduce the extra alignment // needed to create a new LOAD) uint64_t newFlags = computeFlags(ctx, sec->getPhdrFlags()); - // When --no-rosegment is specified, RO and RX sections are compatible. - uint32_t incompatible = flags ^ newFlags; - if (ctx.arg.singleRoRx && !(newFlags & PF_W)) - incompatible &= ~PF_X; + uint64_t incompatible = flags ^ newFlags; + if (!(newFlags & PF_W)) { + // When --no-rosegment is specified, RO and RX sections are compatible. + if (ctx.arg.singleRoRx) + incompatible &= ~PF_X; + // When --no-xosegment is specified (the default), XO and RX sections are + // compatible. 
+ if (ctx.arg.singleXoRx) + incompatible &= ~PF_R; + } if (incompatible) load = nullptr; diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 2b7b7fe52ea12..36028514cba27 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -42,8 +42,16 @@ ELF Improvements * Linker script ``OVERLAY`` descriptions now support virtual memory regions (e.g. ``>region``) and ``NOCROSSREFS``. +* Added ``--xosegment`` and ``--no-xosegment`` flags to control whether to place + executable-only and readable-executable sections in the same segment. The + default value is ``--no-xosegment``. + (`#132412 `_) + Breaking changes ---------------- +* Executable-only and readable-executable sections are now allowed to be placed + in the same segment by default. Pass ``--xosegment`` to lld in order to get + the old behavior back. COFF Improvements ----------------- diff --git a/lld/test/ELF/aarch64-execute-only-mixed.s b/lld/test/ELF/aarch64-execute-only-mixed.s new file mode 100644 index 0000000000000..f95a1547bfba2 --- /dev/null +++ b/lld/test/ELF/aarch64-execute-only-mixed.s @@ -0,0 +1,55 @@ +// REQUIRES: aarch64 +// RUN: rm -rf %t && split-file %s %t && cd %t + +// RUN: llvm-mc -filetype=obj -triple=aarch64 start.s -o start.o +// RUN: llvm-mc -filetype=obj -triple=aarch64 xo.s -o xo.o +// RUN: llvm-mc -filetype=obj -triple=aarch64 rx.s -o rx.o +// RUN: ld.lld start.o xo.o -o xo +// RUN: ld.lld start.o rx.o -o rx-default +// RUN: ld.lld --xosegment start.o rx.o -o rx-xosegment +// RUN: ld.lld --no-xosegment start.o rx.o -o rx-no-xosegment +// RUN: llvm-readelf -l xo | FileCheck --check-prefix=CHECK-XO %s +// RUN: llvm-readelf -l rx-default | FileCheck --check-prefix=CHECK-MERGED %s +// RUN: llvm-readelf -l rx-xosegment | FileCheck --check-prefix=CHECK-SEPARATE %s +// RUN: llvm-readelf -l rx-no-xosegment | FileCheck --check-prefix=CHECK-MERGED %s + +// CHECK-XO: PHDR +// CHECK-XO-NEXT: LOAD +// CHECK-XO-NEXT: LOAD 0x000120 0x0000000000210120 0x0000000000210120 0x00000c 0x00000c E 0x10000 +/// Index should match the index of the LOAD segment above. +// CHECK-XO: 02 .text .foo + +// CHECK-MERGED: PHDR +// CHECK-MERGED-NEXT: LOAD +// CHECK-MERGED-NEXT: LOAD 0x000120 0x0000000000210120 0x0000000000210120 0x00000c 0x00000c R E 0x10000 +/// Index should match the index of the LOAD segment above. +// CHECK-MERGED: 02 .text .foo + +// CHECK-SEPARATE: PHDR +// CHECK-SEPARATE-NEXT: LOAD +// CHECK-SEPARATE-NEXT: LOAD 0x000158 0x0000000000210158 0x0000000000210158 0x000008 0x000008 E 0x10000 +// CHECK-SEPARATE-NEXT: LOAD 0x000160 0x0000000000220160 0x0000000000220160 0x000004 0x000004 R E 0x10000 +/// Index should match the index of the LOAD segment above. +// CHECK-SEPARATE: 02 .text +// CHECK-SEPARATE: 03 .foo + +//--- start.s +.section .text,"axy",@progbits,unique,0 +.global _start +_start: + bl foo + ret + +//--- xo.s +.section .foo,"axy",@progbits,unique,0 +.global foo +foo: + ret + +//--- rx.s +/// Ensure that the implicitly-created .text section has the SHF_AARCH64_PURECODE flag. 
+.section .text,"axy",@progbits,unique,0 +.section .foo,"ax",@progbits,unique,0 +.global foo +foo: + ret diff --git a/lld/test/ELF/aarch64-execute-only.s b/lld/test/ELF/aarch64-execute-only.s index 20908ba9f754f..d4ee783e2c578 100644 --- a/lld/test/ELF/aarch64-execute-only.s +++ b/lld/test/ELF/aarch64-execute-only.s @@ -1,12 +1,12 @@ // REQUIRES: aarch64 // RUN: llvm-mc -filetype=obj -triple=aarch64 %s -o %t.o -// RUN: ld.lld %t.o -o %t.so -shared +// RUN: ld.lld --xosegment %t.o -o %t.so -shared // RUN: llvm-readelf -l %t.so | FileCheck --implicit-check-not=LOAD %s // RUN: echo ".section .foo,\"ax\"; ret" > %t.s // RUN: llvm-mc -filetype=obj -triple=aarch64 %t.s -o %t2.o -// RUN: ld.lld %t.o %t2.o -o %t.so -shared +// RUN: ld.lld --xosegment %t.o %t2.o -o %t.so -shared // RUN: llvm-readelf -l %t.so | FileCheck --check-prefix=DIFF --implicit-check-not=LOAD %s // CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x000245 0x000245 R 0x10000 diff --git a/lld/test/ELF/arm-execute-only-mixed.s b/lld/test/ELF/arm-execute-only-mixed.s new file mode 100644 index 0000000000000..17c227cf69983 --- /dev/null +++ b/lld/test/ELF/arm-execute-only-mixed.s @@ -0,0 +1,55 @@ +// REQUIRES: arm +// RUN: rm -rf %t && split-file %s %t && cd %t + +// RUN: llvm-mc -filetype=obj -triple=armv7 start.s -o start.o +// RUN: llvm-mc -filetype=obj -triple=armv7 xo.s -o xo.o +// RUN: llvm-mc -filetype=obj -triple=armv7 rx.s -o rx.o +// RUN: ld.lld start.o xo.o -o xo +// RUN: ld.lld start.o rx.o -o rx-default +// RUN: ld.lld --xosegment start.o rx.o -o rx-xosegment +// RUN: ld.lld --no-xosegment start.o rx.o -o rx-no-xosegment +// RUN: llvm-readelf -l xo | FileCheck --check-prefix=CHECK-XO %s +// RUN: llvm-readelf -l rx-default | FileCheck --check-prefix=CHECK-MERGED %s +// RUN: llvm-readelf -l rx-xosegment | FileCheck --check-prefix=CHECK-SEPARATE %s +// RUN: llvm-readelf -l rx-no-xosegment | FileCheck --check-prefix=CHECK-MERGED %s + +// CHECK-XO: PHDR +// CHECK-XO-NEXT: LOAD +// CHECK-XO-NEXT: LOAD 0x0000b4 0x000200b4 0x000200b4 0x0000c 0x0000c E 0x10000 +/// Index should match the index of the LOAD segment above. +// CHECK-XO: 02 .text .foo + +// CHECK-MERGED: PHDR +// CHECK-MERGED-NEXT: LOAD +// CHECK-MERGED-NEXT: LOAD 0x0000b4 0x000200b4 0x000200b4 0x0000c 0x0000c R E 0x10000 +/// Index should match the index of the LOAD segment above. +// CHECK-MERGED: 02 .text .foo + +// CHECK-SEPARATE: PHDR +// CHECK-SEPARATE-NEXT: LOAD +// CHECK-SEPARATE-NEXT: LOAD 0x0000d4 0x000200d4 0x000200d4 0x00008 0x00008 E 0x10000 +// CHECK-SEPARATE-NEXT: LOAD 0x0000dc 0x000300dc 0x000300dc 0x00004 0x00004 R E 0x10000 +/// Index should match the index of the LOAD segment above. +// CHECK-SEPARATE: 02 .text +// CHECK-SEPARATE: 03 .foo + +//--- start.s +.section .text,"axy",%progbits,unique,0 +.global _start +_start: + bl foo + bx lr + +//--- xo.s +.section .foo,"axy",%progbits,unique,0 +.global foo +foo: + bx lr + +//--- rx.s +/// Ensure that the implicitly-created .text section has the SHF_ARM_PURECODE flag. 
+.section .text,"axy",%progbits,unique,0 +.section .foo,"ax",%progbits,unique,0 +.global foo +foo: + bx lr diff --git a/lld/test/ELF/arm-execute-only.s b/lld/test/ELF/arm-execute-only.s index e938be5e64a4b..3bb89ad0620f8 100644 --- a/lld/test/ELF/arm-execute-only.s +++ b/lld/test/ELF/arm-execute-only.s @@ -1,13 +1,13 @@ // REQUIRES: arm // RUN: llvm-mc -filetype=obj -triple=armv7-pc-linux %s -o %t.o -// RUN: ld.lld %t.o -o %t.so -shared +// RUN: ld.lld --xosegment %t.o -o %t.so -shared // RUN: llvm-readelf -l %t.so | FileCheck --implicit-check-not=LOAD %s // RUN: echo ".section .foo,\"ax\"; \ // RUN: bx lr" > %t.s // RUN: llvm-mc -filetype=obj -triple=armv7-pc-linux %t.s -o %t2.o -// RUN: ld.lld %t.o %t2.o -o %t.so -shared +// RUN: ld.lld --xosegment %t.o %t2.o -o %t.so -shared // RUN: llvm-readelf -l %t.so | FileCheck --check-prefix=DIFF --implicit-check-not=LOAD %s // CHECK: LOAD 0x000000 0x00000000 0x00000000 0x0016d 0x0016d R 0x10000 From 03f21e2ba3be6cec4cfe2441142cca16a6d1d528 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 8 Apr 2025 15:30:44 +0800 Subject: [PATCH 0943/1029] [LangRef] Update initializes definition (#134370) Specify the initializes attribute in terms of an "initialized" shadow state, such that: * Loads prior to initialization return poison. * Bytes that are not explicitly initialized are written with undef on function return. This is intended to preserve the core semantics of the attribute, but adjusts the wording in a way that is compatible with existing optimizations, such as insertion of spurious loads and removal of uninitialized writes. Fixes https://github.com/llvm/llvm-project/issues/133038. Fixes https://github.com/llvm/llvm-project/issues/133059. --- llvm/docs/LangRef.rst | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index d462609fa0c52..769003a90f959 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1690,10 +1690,24 @@ Currently, only the following parameter attributes are defined: ``initializes((Lo1, Hi1), ...)`` This attribute indicates that the function initializes the ranges of the - pointer parameter's memory, ``[%p+LoN, %p+HiN)``. Initialization of memory - means the first memory access is a non-volatile, non-atomic write. The - write must happen before the function returns. If the function unwinds, - the write may not happen. + pointer parameter's memory ``[%p+LoN, %p+HiN)``. Colloquially, this means + that all bytes in the specified range are written before the function + returns, and not read prior to the initializing write. If the function + unwinds, the write may not happen. + + Formally, this is specified in terms of an "initialized" shadow state for + all bytes in the range, which is set to "not initialized" at function entry. + If a memory access is performed through a pointer based on the argument, + and an accessed byte has not been marked as "initialized" yet, then: + + * If the byte is stored with a non-volatile, non-atomic write, mark it as + "initialized". + * If the byte is stored with a volatile or atomic write, the behavior is + undefined. + * If the byte is loaded, return a poison value. + + Additionally, if the function returns normally, write an undef value to all + bytes that are part of the range and have not been marked as "initialized". This attribute only holds for the memory accessed via this pointer parameter. 
 Other arbitrary accesses to the same memory via other pointers

From 8a53cc8ce5e9ac602d4726c0a24e53499d3522fb Mon Sep 17 00:00:00 2001
From: Shoreshen <372660931@qq.com>
Date: Tue, 8 Apr 2025 15:39:12 +0800
Subject: [PATCH 0944/1029] Remove failed test (#134793)

The function `bitcast_v64i16_to_v128i8` in the newly added test file
`llvm-project/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll` from PR
https://github.com/llvm/llvm-project/pull/133052 failed under expensive
checks; it passes the normal lit run. Remove it for now.

---
 .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 5072 +----------------
 .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll | 8 +-
 .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 8 +-
 .../test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll | 8 +-
 25 files changed, 158 insertions(+), 5106 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 51dc5ceb82b41..56edb29281944 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s

 define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) {
 ; GCN-LABEL: bitcast_v32i32_to_v32f32:
@@ -92753,4954 +92753,6 @@ end:
   ret <64 x i16> %phi
 }
4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:132
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:116
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:84
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill
-; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:136
-; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68
-; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36
-; GCN-NEXT: s_waitcnt vmcnt(3)
-; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60
-; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:52
-; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:48
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
-; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:40
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
-; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:24
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v10
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v14
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v18
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v22
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v26
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v30
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v29
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(10) expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v33
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v32
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v27
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v25
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v34, 16, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v7
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20
-; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:16
-; GCN-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
-; GCN-NEXT: buffer_load_dword v36, off, s[0:3], s32
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt vmcnt(8) expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v49
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v48
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v39
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v11
-; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v38
-; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v37
-; GCN-NEXT: ; implicit-def: $vgpr40
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr54
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr50
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr51
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr52
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr53
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr55
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr41
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr42
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr43
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr44
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr45
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr46
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr47
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr56
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr57
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr58
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr59
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr60
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr61
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr62
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr63
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; kill: killed $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr2
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr6
-; GCN-NEXT: ; kill: killed $vgpr6
-; GCN-NEXT: ; implicit-def: $vgpr6
-; GCN-NEXT: ; kill: killed $vgpr6
-; GCN-NEXT: ; implicit-def: $vgpr6
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr10
-; GCN-NEXT: ; kill: killed $vgpr10
-; GCN-NEXT: ; implicit-def: $vgpr10
-; GCN-NEXT: ; kill: killed $vgpr10
-; GCN-NEXT: ; implicit-def: $vgpr10
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr14
-; GCN-NEXT: ; kill: killed $vgpr14
-; GCN-NEXT: ; implicit-def: $vgpr14
-; GCN-NEXT: ; kill: killed $vgpr14
-; GCN-NEXT: ; implicit-def: $vgpr14
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr18
-; GCN-NEXT: ; kill: killed $vgpr18
-; GCN-NEXT: ; implicit-def: $vgpr18
-; GCN-NEXT: ; kill: killed $vgpr18
-; GCN-NEXT: ; implicit-def: $vgpr18
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr22
-; GCN-NEXT: ; kill: killed $vgpr22
-; GCN-NEXT: ; implicit-def: $vgpr22
-; GCN-NEXT: ; kill: killed $vgpr22
-; GCN-NEXT: ; implicit-def: $vgpr22
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr26
-; GCN-NEXT: ; kill: killed $vgpr26
-; GCN-NEXT: ; implicit-def: $vgpr26
-; GCN-NEXT: ; kill: killed $vgpr26
-; GCN-NEXT: ; implicit-def: $vgpr26
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr30
-; GCN-NEXT: ; kill: killed $vgpr30
-; GCN-NEXT: ; implicit-def: $vgpr30
-; GCN-NEXT: ; kill: killed $vgpr30
-; GCN-NEXT: ; implicit-def: $vgpr30
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr32
-; GCN-NEXT: ; kill: killed $vgpr32
-; GCN-NEXT: ; implicit-def: $vgpr32
-; GCN-NEXT: ; kill: killed $vgpr32
-; GCN-NEXT: ; implicit-def: $vgpr32
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr33
-; GCN-NEXT: ; kill: killed $vgpr33
-; GCN-NEXT: ; implicit-def: $vgpr33
-; GCN-NEXT: ; kill: killed $vgpr33
-; GCN-NEXT: ; implicit-def: $vgpr33
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: ; kill: killed $vgpr5
-; GCN-NEXT: ; implicit-def: $vgpr5
-; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB49_2
-; GCN-NEXT: ; %bb.1: ; %cmp.false
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v40, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GCN-NEXT: v_or_b32_e32 v54, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v50, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v8
-; GCN-NEXT: v_or_b32_e32 v51, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v52, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v12
-; GCN-NEXT: v_or_b32_e32 v53, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v55, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v16
-; GCN-NEXT: v_or_b32_e32 v41, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v42, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v20
-; GCN-NEXT: v_or_b32_e32 v43, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v44, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v24
-; GCN-NEXT: v_or_b32_e32 v45, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v46, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v28
-; GCN-NEXT: v_or_b32_e32 v47, v1, v2
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v56, v1, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v36
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v31
-; GCN-NEXT: v_or_b32_e32 v57, v1, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v3
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v58, v1, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v13
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; GCN-NEXT: v_or_b32_e32 v59, v1, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v15
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v60, v1, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v35
-; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v9
-; GCN-NEXT: v_or_b32_e32 v61, v1, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v17
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v21
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v19
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v23
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v33, 0xffff, v6
-; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v62, v1, v6
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v49
-; GCN-NEXT: v_or_b32_e32 v63, v2, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v2, v3, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v12
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v48
-; GCN-NEXT: v_or_b32_e32 v6, v5, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v10, v10, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v20
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v39
-; GCN-NEXT: v_or_b32_e32 v14, v13, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v18, v15, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v28
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v11
-; GCN-NEXT: v_or_b32_e32 v22, v17, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: v_or_b32_e32 v26, v19, v34
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v7
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: v_or_b32_e32 v30, v21, v27
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v9
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: v_or_b32_e32 v32, v23, v29
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v49
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: v_or_b32_e32 v33, v33, v25
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v48
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v4, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v8, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v12, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v16, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v20, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v24, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v28, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v31, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v7, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v9, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v49, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v48, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v39
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v39, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v11
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v11, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v38
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v38, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v37
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_bfe_u32 v1, v37, 8, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; GCN-NEXT: v_alignbit_b32 v5, v54, v40, 24
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v54, v40, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v54, v40, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v51
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v43
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v47
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v59
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v61
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v63
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v14
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v22
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v30
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v33
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill
-; GCN-NEXT: ; implicit-def: $vgpr1
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; kill: killed $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr36
-; GCN-NEXT: ; implicit-def: $vgpr3
-; GCN-NEXT: ; implicit-def: $vgpr13
-; GCN-NEXT: ; implicit-def: $vgpr15
-; GCN-NEXT: ; implicit-def: $vgpr35
-; GCN-NEXT: ; implicit-def: $vgpr17
-; GCN-NEXT: ; implicit-def: $vgpr21
-; GCN-NEXT: ; implicit-def: $vgpr19
-; GCN-NEXT: ; implicit-def: $vgpr23
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr4
-; GCN-NEXT: ; kill: killed $vgpr4
-; GCN-NEXT: ; implicit-def: $vgpr34
-; GCN-NEXT: ; implicit-def: $vgpr27
-; GCN-NEXT: ; implicit-def: $vgpr29
-; GCN-NEXT: ; implicit-def: $vgpr25
-; GCN-NEXT: .LBB49_2: ; %Flow
-; GCN-NEXT: s_or_saveexec_b64 s[4:5], s[4:5]
-; GCN-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload
-; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload
-; GCN-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GCN-NEXT: s_cbranch_execz .LBB49_4
-; GCN-NEXT: ; %bb.3: ; %cmp.true
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v2, vcc, 3, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT: v_or_b32_e32 v2, v29, v2
-; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v4, vcc, 3, v4
-; GCN-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GCN-NEXT: v_or_b32_e32 v4, v25, v4
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v6, vcc, 3, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GCN-NEXT: v_or_b32_e32 v6, v34, v6
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v7, vcc, 3, v5
-; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7
-; GCN-NEXT: v_or_b32_e32 v7, v27, v7
-; GCN-NEXT: s_mov_b32 s6, 0x30000
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v8, vcc, 3, v5
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v9, vcc, 3, v5
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v5
-; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v11, vcc, 3, v5
-; GCN-NEXT: v_add_i32_e32 v12, vcc, 3, v19
-; GCN-NEXT: v_add_i32_e32 v14, vcc, 3, v23
-; GCN-NEXT: v_add_i32_e32 v16, vcc, 3, v17
-; GCN-NEXT: v_add_i32_e32 v17, vcc, 3, v21
-; GCN-NEXT: v_add_i32_e32 v15, vcc, 3, v15
-; GCN-NEXT: v_add_i32_e32 v18, vcc, 3, v35
-; GCN-NEXT: v_add_i32_e32 v5, vcc, 3, v3
-; GCN-NEXT: v_add_i32_e32 v13, vcc, 3, v13
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v19, vcc, 3, v3
-; GCN-NEXT: v_add_i32_e32 v20, vcc, 3, v36
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v21, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v22, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v23, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v24, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v25, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v26, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v27, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v28, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v29, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v30, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v31, vcc, 3, v3
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 3, v3
-; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v1
-; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v3, vcc, 3, v3
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v11
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff, v12
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GCN-NEXT: v_and_b32_e32 v16, 0xffff, v16
-; GCN-NEXT: v_and_b32_e32 v17, 0xffff, v17
-; GCN-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GCN-NEXT: v_and_b32_e32 v13, 0xffff, v13
-; GCN-NEXT: v_and_b32_e32 v19, 0xffff, v19
-; GCN-NEXT: v_and_b32_e32 v20, 0xffff, v20
-; GCN-NEXT: v_and_b32_e32 v21, 0xffff, v21
-; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22
-; GCN-NEXT: v_and_b32_e32 v23, 0xffff, v23
-; GCN-NEXT: v_and_b32_e32 v24, 0xffff, v24
-; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25
-; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26
-; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27
-; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28
-; GCN-NEXT: v_and_b32_e32 v29, 0xffff, v29
-; GCN-NEXT: v_and_b32_e32 v30, 0xffff, v30
-; GCN-NEXT: v_and_b32_e32 v31, 0xffff, v31
-; GCN-NEXT: v_and_b32_e32 v32, 0xffff, v32
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v8, v33, v8
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v9, v33, v9
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v10, v33, v10
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v11, v33, v11
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v12, v33, v12
-; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v34, v33, v14
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v16, v14, v16
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v17, v14, v17
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v15, v14, v15
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v35, v14, v18
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v5, v14, v5
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v13, v14, v13
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v19, v14, v19
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v20, v14, v20
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v21, v14, v21
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v36, v14, v22
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v23, v14, v23
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v24, v14, v24
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v25, v14, v25
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v37, v14, v26
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v27, v14, v27
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v28, v14, v28
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v29, v14, v29
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v38, v14, v30
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v31, v14, v31
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v39, v14, v32
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v1, v14, v1
-; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_or_b32_e32 v3, v14, v3
-; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x30000, v2
-; GCN-NEXT: v_add_i32_e32 v33, vcc, s6, v4
-; GCN-NEXT: v_add_i32_e32 v26, vcc, s6, v6
-; GCN-NEXT: v_add_i32_e32 v30, vcc, s6, v7
-; GCN-NEXT: v_add_i32_e32 v18, vcc, s6, v8
-; GCN-NEXT: v_add_i32_e32 v22, vcc, s6, v9
-; GCN-NEXT: v_add_i32_e32 v10, vcc, s6, v10
-; GCN-NEXT: v_add_i32_e32 v14, vcc, s6, v11
-; GCN-NEXT: v_add_i32_e32 v2, vcc, s6, v12
-; GCN-NEXT: v_add_i32_e32 v6, vcc, s6, v34
-; GCN-NEXT: v_add_i32_e32 v62, vcc, s6, v16
-; GCN-NEXT: v_add_i32_e32 v63, vcc, s6, v17
-; GCN-NEXT: v_add_i32_e32 v60, vcc, s6, v15
-; GCN-NEXT: v_add_i32_e32 v61, vcc, s6, v35
-; GCN-NEXT: v_add_i32_e32 v58, vcc, s6, v5
-; GCN-NEXT: v_add_i32_e32 v59, vcc, s6, v13
-; GCN-NEXT: v_add_i32_e32 v56, vcc, s6, v19
-; GCN-NEXT: v_add_i32_e32 v57, vcc, s6, v20
-; GCN-NEXT: v_add_i32_e32 v46, vcc, s6, v21
-; GCN-NEXT: v_add_i32_e32 v47, vcc, s6, v36
-; GCN-NEXT: v_add_i32_e32 v44, vcc, s6, v23
-; GCN-NEXT: v_add_i32_e32 v45, vcc, s6, v24
-; GCN-NEXT: v_add_i32_e32 v42, vcc, s6, v25
-; GCN-NEXT: v_add_i32_e32 v43, vcc, s6, v37
-; GCN-NEXT: v_add_i32_e32 v55, vcc, s6, v27
-; GCN-NEXT: v_add_i32_e32 v41, vcc, s6, v28
-; GCN-NEXT: v_add_i32_e32 v52, vcc, s6, v29
-; GCN-NEXT: v_add_i32_e32 v53, vcc, s6, v38
-; GCN-NEXT: v_add_i32_e32 v50, vcc, s6, v31
-; GCN-NEXT: v_add_i32_e32 v51, vcc, s6, v39
-; GCN-NEXT: v_add_i32_e32 v40, vcc, s6, v1
-; GCN-NEXT: v_add_i32_e32 v54, vcc, s6, v3
-; GCN-NEXT: v_alignbit_b32 v5, v54, v40, 24
-; GCN-NEXT: v_alignbit_b32 v9, v54, v40, 16
-; GCN-NEXT: v_alignbit_b32 v4, v54, v40, 8
-; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v51, v50, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GCN-NEXT: v_alignbit_b32 v8, v51, v50, 8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v53, v52, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GCN-NEXT: v_alignbit_b32 v11, v53, v52, 8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v41, v55, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GCN-NEXT: v_alignbit_b32 v12, v41, v55, 8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v43, v42, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GCN-NEXT: v_alignbit_b32 v20, v43, v42, 8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v45, v44, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GCN-NEXT: v_alignbit_b32 v24, v45, v44, 8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v47, v46, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GCN-NEXT: v_alignbit_b32 v28, v47, v46, 8
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v57, v56, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v59, v58, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v61, v60, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v63, v62, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v6, v2, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v14, v10, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v22, v18, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v30, v26, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 24
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 16
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_alignbit_b32 v1, v33, v32, 8
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v16, 24, v54
-; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v54
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v54
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v51
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v37, 16, v51
-; GCN-NEXT: v_lshrrev_b32_e32 v31, 8, v51
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v53
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v38, 16, v53
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v53
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v41
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v39, 16, v41
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v41
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v43
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v48, 16, v43
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v43
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v45
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GCN-NEXT: v_lshrrev_b32_e32 v49, 16, v45
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v45
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v47
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v47
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v47
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v57
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v57
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v57
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v59
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v59
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v59
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v61
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v61
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v61
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v63
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v63
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v63
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v6
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v6
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v14
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v14
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v14
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v22
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; GCN-NEXT: s_waitcnt expcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v22
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; GCN-NEXT:
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 24, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v33 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GCN-NEXT: .LBB49_4: ; %end -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v40 -; GCN-NEXT: s_waitcnt vmcnt(6) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v4 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xff, v54 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GCN-NEXT: v_or_b32_e32 v3, v3, v4 -; GCN-NEXT: v_and_b32_e32 v4, 0xff, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_or_b32_e32 v4, v5, v4 -; GCN-NEXT: v_and_b32_e32 v5, 0xff, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 24, v16 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_or_b32_e32 v5, v7, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GCN-NEXT: v_or_b32_e32 v1, v1, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: v_or_b32_e32 v3, v3, v5 -; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v50 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v8 -; GCN-NEXT: v_or_b32_e32 v29, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v51 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v31 -; GCN-NEXT: v_or_b32_e32 v31, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v52 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v53 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v55 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v12 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v41 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v1, v1, v3 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v42 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v20 -; GCN-NEXT: v_or_b32_e32 v7, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v43 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v8, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v44 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v24 -; GCN-NEXT: v_or_b32_e32 v9, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v45 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v11, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v46 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v28 -; GCN-NEXT: v_or_b32_e32 v12, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v47 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v13, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v56 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v15, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v57 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v16, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v58 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v17, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v59 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v19, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v60 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v20, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v61 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v21, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v62 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v23, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v63 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GCN-NEXT: v_or_b32_e32 v24, v1, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v2 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 
v1, 0xff, v6 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v1, v1, v2 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v10 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v10, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v14 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v14, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v18 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v18, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v22 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v22, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v26 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v25, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v30 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v26, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v32 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v27, v1, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v33 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GCN-NEXT: v_or_b32_e32 v28, v1, v2 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v30, v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v37 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v33, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v32, v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v38 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; 
GCN-NEXT: v_or_b32_e32 v34, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v35, v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v39 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v36, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v37, v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v48 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v38, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v39, v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v49 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v48, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v49, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v50, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v51, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v52, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v53, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v54, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v55, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v40, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v41, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v42, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v43, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v44, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GCN-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v45, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v46, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v47, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v56, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v57, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v58, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v59, v2, v1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v60, v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v29 -; GCN-NEXT: v_or_b32_e32 v4, v1, v30 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; GCN-NEXT: v_or_b32_e32 v5, v1, v33 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: v_or_b32_e32 v63, v2, v32 -; GCN-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GCN-NEXT: 
v_or_b32_e32 v61, v3, v34 -; GCN-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v35 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v62, vcc, 20, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GCN-NEXT: v_or_b32_e32 v6, v6, v36 -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v37 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GCN-NEXT: v_or_b32_e32 v7, v7, v38 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GCN-NEXT: v_add_i32_e32 v30, vcc, 32, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; GCN-NEXT: v_or_b32_e32 v8, v7, v39 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 36, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; GCN-NEXT: v_or_b32_e32 v9, v7, v48 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 40, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v12 -; GCN-NEXT: v_or_b32_e32 v11, v7, v49 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 44, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v13 -; GCN-NEXT: v_or_b32_e32 v12, v7, v50 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 48, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v15 -; GCN-NEXT: v_or_b32_e32 v13, v7, v51 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 52, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v16 -; GCN-NEXT: v_or_b32_e32 v15, v7, v52 -; GCN-NEXT: v_add_i32_e32 v36, vcc, 56, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v17 -; GCN-NEXT: v_or_b32_e32 v16, v7, v53 -; GCN-NEXT: v_add_i32_e32 v37, vcc, 60, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v19 -; GCN-NEXT: v_or_b32_e32 v17, v7, v54 -; GCN-NEXT: v_add_i32_e32 v38, vcc, 64, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v20 -; GCN-NEXT: v_or_b32_e32 v19, v7, v55 -; GCN-NEXT: v_add_i32_e32 v39, vcc, 0x44, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v21 -; GCN-NEXT: v_or_b32_e32 v20, v7, v40 -; GCN-NEXT: v_add_i32_e32 v48, vcc, 0x48, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v23 -; GCN-NEXT: v_or_b32_e32 v21, v7, v41 -; GCN-NEXT: v_add_i32_e32 v49, vcc, 0x4c, v0 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v24 -; GCN-NEXT: v_or_b32_e32 v23, v7, v42 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x50, v0 -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GCN-NEXT: v_or_b32_e32 v7, v7, v43 -; GCN-NEXT: v_add_i32_e32 v50, vcc, 0x54, v0 -; GCN-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_and_b32_e32 v51, 0xffff, v51 -; GCN-NEXT: v_or_b32_e32 v44, v51, v44 -; GCN-NEXT: v_add_i32_e32 v51, vcc, 0x58, v0 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GCN-NEXT: v_or_b32_e32 v10, v10, v45 -; GCN-NEXT: v_add_i32_e32 v52, vcc, 0x5c, v0 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GCN-NEXT: v_or_b32_e32 v14, v14, v46 -; GCN-NEXT: v_add_i32_e32 v53, vcc, 0x60, v0 -; 
GCN-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GCN-NEXT: v_or_b32_e32 v18, v18, v47 -; GCN-NEXT: v_add_i32_e32 v54, vcc, 0x64, v0 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GCN-NEXT: v_or_b32_e32 v22, v22, v56 -; GCN-NEXT: v_add_i32_e32 v55, vcc, 0x68, v0 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GCN-NEXT: v_or_b32_e32 v25, v25, v57 -; GCN-NEXT: v_add_i32_e32 v40, vcc, 0x6c, v0 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GCN-NEXT: v_or_b32_e32 v26, v26, v58 -; GCN-NEXT: v_add_i32_e32 v41, vcc, 0x70, v0 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GCN-NEXT: v_or_b32_e32 v27, v27, v59 -; GCN-NEXT: v_add_i32_e32 v42, vcc, 0x74, v0 -; GCN-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GCN-NEXT: v_or_b32_e32 v28, v28, v60 -; GCN-NEXT: v_add_i32_e32 v43, vcc, 0x78, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v63, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v61, v62, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v36, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v37, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v38, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v17, v39, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v19, v48, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v20, v49, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v23, v50, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v51, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v44, v52, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v53, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v54, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v18, v55, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v22, v40, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v25, v41, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v26, v42, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v27, v43, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload 
-; GCN-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; VI-LABEL: bitcast_v64i16_to_v128i8: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 
; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; 
implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr55 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr62 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; 
implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; VI-NEXT: s_cbranch_execz .LBB49_2 -; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded 
Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v31, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v11 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v12 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v13 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v14 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v9, v8 -; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 
offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v7, v6 -; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v3 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, v1 -; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill 
-; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v22
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v19
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v20
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v17
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v18
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v1, v46
-; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
-; VI-NEXT: v_mov_b32_e32 v32, v15
-; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26
-; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24
-; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22
-; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20
-; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18
-; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17
-; VI-NEXT: v_mov_b32_e32 v46, v1
-; VI-NEXT: ; implicit-def: $vgpr1
-; VI-NEXT: ; implicit-def: $vgpr3
-; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: ; implicit-def: $vgpr13
-; VI-NEXT: ; implicit-def: $vgpr15
-; VI-NEXT: ; implicit-def: $vgpr17
-; VI-NEXT: ; implicit-def: $vgpr19
-; VI-NEXT: ; implicit-def: $vgpr21
-; VI-NEXT: ; implicit-def: $vgpr23
-; VI-NEXT: ; implicit-def: $vgpr25
-; VI-NEXT: ; implicit-def: $vgpr27
-; VI-NEXT: ; implicit-def: $vgpr29
-; VI-NEXT: ; implicit-def: $vgpr37
-; VI-NEXT: .LBB49_2: ; %Flow
-; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; VI-NEXT: s_cbranch_execz .LBB49_4
-; VI-NEXT: ; %bb.3: ; %cmp.true
-; VI-NEXT: v_mov_b32_e32 v31, 3
-; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v32, 3, v18
-; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v18, v32, v18
-; VI-NEXT: v_add_u16_e32 v32, 3, v17
-; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v17, v32, v17
-; VI-NEXT: v_add_u16_e32 v32, 3, v20
-; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v20, v32, v20
-; VI-NEXT: v_add_u16_e32 v32, 3, v19
-; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
-; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v19, v32, v19
-; VI-NEXT: v_add_u16_e32 v32, 3, v22
-; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48
-; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v22, v32, v22
-; VI-NEXT: v_add_u16_e32 v32, 3, v21
-; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53
-; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v21, v32, v21
-; VI-NEXT: v_add_u16_e32 v32, 3, v24
-; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v24, v32, v24
-; VI-NEXT: v_add_u16_e32 v32, 3, v23
-; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v23, v32, v23
-; VI-NEXT: v_add_u16_e32 v32, 3, v26
-; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v26, v32, v26
-; VI-NEXT: v_add_u16_e32 v32, 3, v25
-; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v25, v32, v25
-; VI-NEXT: v_add_u16_e32 v32, 3, v28
-; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v28, v32, v28
-; VI-NEXT: v_add_u16_e32 v32, 3, v27
-; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v27, v32, v27
-; VI-NEXT: v_add_u16_e32 v33, 3, v30
-; VI-NEXT: v_add_u16_e32 v34, 3, v29
-; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35
-; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v30, v33, v29
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32
-; VI-NEXT: v_add_u16_e32 v33, 3, v37
-; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v29, v34, v29
-; VI-NEXT: v_add_u16_e32 v34, 3, v36
-; VI-NEXT: v_or_b32_e32 v37, v33, v32
-; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50
-; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v36, v34, v32
-; VI-NEXT: v_add_u16_e32 v33, 3, v2
-; VI-NEXT: v_add_u16_e32 v34, 3, v1
-; VI-NEXT: v_add_u16_sdwa v32, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v57
-; VI-NEXT: v_or_b32_e32 v2, v33, v1
-; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32
-; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v1, v34, v1
-; VI-NEXT: v_add_u16_e32 v33, 3, v4
-; VI-NEXT: v_add_u16_e32 v34, 3, v3
-; VI-NEXT: v_add_u16_sdwa v32, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v56
-; VI-NEXT: v_or_b32_e32 v4, v33, v3
-; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32
-; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v3, v34, v3
-; VI-NEXT: v_add_u16_e32 v33, 3, v6
-; VI-NEXT: v_add_u16_e32 v34, 3, v5
-; VI-NEXT: v_add_u16_sdwa v32, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47
-; VI-NEXT: v_or_b32_e32 v6, v33, v5
-; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32
-; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v5, v34, v5
-; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; VI-NEXT: v_add_u16_e32 v38, 3, v8
-; VI-NEXT: v_add_u16_e32 v33, 3, v7
-; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34
-; VI-NEXT: v_or_b32_e32 v8, v38, v7
-; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32
-; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v7, v33, v7
-; VI-NEXT: v_add_u16_e32 v33, 3, v10
-; VI-NEXT: v_add_u16_e32 v38, 3, v9
-; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59
-; VI-NEXT: v_or_b32_e32 v10, v33, v9
-; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32
-; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v9, v38, v9
-; VI-NEXT: v_add_u16_e32 v33, 3, v12
-; VI-NEXT: v_add_u16_e32 v38, 3, v11
-; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63
-; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v12, v33, v11
-; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32
-; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v11, v38, v11
-; VI-NEXT: v_add_u16_e32 v38, 3, v14
-; VI-NEXT: v_add_u16_e32 v49, 3, v13
-; VI-NEXT: v_add_u16_sdwa v32, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33
-; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_or_b32_e32 v14, v38, v13
-; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32
-; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_u16_e32 v16, 3, v16
-; VI-NEXT: v_add_u16_e32 v32, 3, v15
-; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v16, v16, v15
-; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v15, v32, v15
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15
-; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16]
-; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; VI-NEXT: v_or_b32_e32 v13, v49, v13
-; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13
-; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14]
-; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11
-; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12]
-; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9
-; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10]
-; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7
-; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8]
-; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5
-; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6]
-; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
-; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4]
-; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2]
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37]
-; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill
-; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v60, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v33, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v63, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v59, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v34, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v47, 8, 8
-; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v56, 8, 8
-; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18]
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v57, 8, 8
-; VI-NEXT: v_mov_b32_e32 v46, v35
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v52, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v46, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v1, v39, 8, 8
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill
-; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; VI-NEXT: v_mov_b32_e32 v49, v53
-; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload
-; VI-NEXT: v_mov_b32_e32 v52, v51
-; VI-NEXT: v_bfe_u32 v31, v51, 8, 8
-; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload
-; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24]
-; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22]
-; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20
-; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19
-; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18
-; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17
-; VI-NEXT: v_bfe_u32 v35, v58, 8, 8
-; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; VI-NEXT: v_bfe_u32 v39, v61, 8, 8
-; VI-NEXT: v_bfe_u32 v58, v48, 8, 8
-; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_bfe_u32 v61, v53, 8, 8
-; VI-NEXT: .LBB49_4: ; %end
-; VI-NEXT: s_or_b64 exec, exec, s[4:5]
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v43
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31
-; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44
-; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61
-; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v41
-; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v58
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v39
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0
-; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0
-; VI-NEXT: s_waitcnt vmcnt(2)
-; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
-; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
-; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: bitcast_v64i16_to_v128i8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr44
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr58
-; GFX9-NEXT: ; implicit-def: $vgpr36
-; GFX9-NEXT: ; implicit-def: $vgpr57
-; GFX9-NEXT: ; implicit-def: $vgpr35
-; GFX9-NEXT: ; implicit-def: $vgpr34
-; GFX9-NEXT: ; implicit-def: $vgpr63
-; GFX9-NEXT: ; implicit-def: $vgpr61
-; GFX9-NEXT: ; implicit-def: $vgpr39
-; GFX9-NEXT: ; implicit-def: $vgpr38
-; GFX9-NEXT: ; implicit-def: $vgpr60
-; GFX9-NEXT: ; implicit-def: $vgpr59
-; GFX9-NEXT: ; implicit-def: $vgpr37
-; GFX9-NEXT: ; implicit-def: $vgpr47
-; GFX9-NEXT: ; implicit-def: $vgpr49
-; GFX9-NEXT: ; implicit-def: $vgpr46
-; GFX9-NEXT: ; implicit-def: $vgpr45
-; GFX9-NEXT: ; implicit-def: $vgpr48
-; GFX9-NEXT: ; implicit-def: $vgpr62
-; GFX9-NEXT: ; kill: killed $vgpr50
-; GFX9-NEXT: ; implicit-def: $vgpr56
-; GFX9-NEXT: ; implicit-def: $vgpr42
-; GFX9-NEXT: ; implicit-def: $vgpr41
-; GFX9-NEXT: ; implicit-def: $vgpr40
-; GFX9-NEXT: ; implicit-def: $vgpr52
-; GFX9-NEXT: ; implicit-def: $vgpr51
-; GFX9-NEXT: ; implicit-def: $vgpr50
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr54
-; GFX9-NEXT: ; implicit-def: $vgpr53
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-NEXT: ; implicit-def: $vgpr43
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(18)
-; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: ; kill: killed $vgpr33
-; GFX9-NEXT: ; implicit-def: $vgpr33
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execz .LBB49_2
-; GFX9-NEXT: ; %bb.1: ; %cmp.false
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(45)
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
-; GFX9-NEXT: s_waitcnt vmcnt(46)
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[15:16]
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:132 ; 4-byte
Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[27:28] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[25:26] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[23:24] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[21:22] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: .LBB49_2: ; %Flow -; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB49_4 -; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[15:16] -; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill 
-; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[13:14] -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX9-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[29:30] -; GFX9-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[27:28] -; GFX9-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[25:26] -; GFX9-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[23:24] -; GFX9-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[33:34], 24, v[21:22] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v15 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: 
v_lshrrev_b32_e32 v33, 16, v13 -; GFX9-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; GFX9-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; GFX9-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; GFX9-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; GFX9-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v24 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v22 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v22 -; GFX9-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v21 -; GFX9-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[52:53], 24, 
v[7:8] -; GFX9-NEXT: v_lshrrev_b64 v[40:41], 24, v[5:6] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v21 -; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX9-NEXT: v_lshrrev_b64 v[41:42], 24, v[3:4] -; GFX9-NEXT: v_lshrrev_b64 v[53:54], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b64 v[42:43], 24, v[1:2] -; GFX9-NEXT: v_lshrrev_b64 v[54:55], 24, v[17:18] -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v59, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 8, v17 -; GFX9-NEXT: .LBB49_4: ; %end -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v63 -; GFX9-NEXT: v_or_b32_sdwa v5, v5, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v61 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v38 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v60 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v59 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v55, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v36, 8, v36 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v35, 8, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v15, v15, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v34 -; GFX9-NEXT: v_or_b32_sdwa v16, v16, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v49 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v34, 8, v42 -; GFX9-NEXT: v_or_b32_sdwa v34, v58, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v41 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v40 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v52 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v51 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, 
s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; 
GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; 
GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 
8, v1 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v56 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: bitcast_v64i16_to_v128i8: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x13 -; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:88 -; GFX11-NEXT: scratch_store_b32 off, v41, s32 offset:84 -; GFX11-NEXT: scratch_store_b32 off, v42, s32 offset:80 -; GFX11-NEXT: scratch_store_b32 off, v43, s32 offset:76 -; GFX11-NEXT: scratch_store_b32 off, v44, s32 offset:72 -; GFX11-NEXT: scratch_store_b32 off, v45, s32 offset:68 -; GFX11-NEXT: scratch_store_b32 off, v46, s32 offset:64 -; GFX11-NEXT: scratch_store_b32 off, v47, s32 offset:60 -; GFX11-NEXT: scratch_store_b32 off, v56, s32 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v57, s32 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v58, s32 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v59, s32 offset:44 -; GFX11-NEXT: scratch_store_b32 off, v60, s32 
offset:40 -; GFX11-NEXT: scratch_store_b32 off, v61, s32 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v62, s32 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v63, s32 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v72, s32 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v73, s32 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v74, s32 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v75, s32 offset:12 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: ; implicit-def: $vgpr74 -; GFX11-NEXT: ; implicit-def: $vgpr72 -; GFX11-NEXT: ; implicit-def: $vgpr64 -; GFX11-NEXT: ; implicit-def: $vgpr63 -; GFX11-NEXT: ; implicit-def: $vgpr62 -; GFX11-NEXT: ; implicit-def: $vgpr60 -; GFX11-NEXT: ; implicit-def: $vgpr57 -; GFX11-NEXT: ; implicit-def: $vgpr47 -; GFX11-NEXT: ; implicit-def: $vgpr54 -; GFX11-NEXT: ; implicit-def: $vgpr45 -; GFX11-NEXT: ; implicit-def: $vgpr43 -; GFX11-NEXT: ; implicit-def: $vgpr42 -; GFX11-NEXT: ; implicit-def: $vgpr183 -; GFX11-NEXT: ; implicit-def: $vgpr181 -; GFX11-NEXT: ; implicit-def: $vgpr53 -; GFX11-NEXT: ; implicit-def: $vgpr179 -; GFX11-NEXT: ; implicit-def: $vgpr177 -; GFX11-NEXT: ; implicit-def: $vgpr167 -; GFX11-NEXT: ; implicit-def: $vgpr165 -; GFX11-NEXT: ; implicit-def: $vgpr164 -; GFX11-NEXT: ; implicit-def: $vgpr52 -; GFX11-NEXT: ; implicit-def: $vgpr161 -; GFX11-NEXT: ; implicit-def: $vgpr151 -; GFX11-NEXT: ; implicit-def: $vgpr150 -; GFX11-NEXT: ; implicit-def: $vgpr147 -; GFX11-NEXT: ; implicit-def: $vgpr145 -; GFX11-NEXT: ; implicit-def: $vgpr51 -; GFX11-NEXT: ; implicit-def: $vgpr144 -; GFX11-NEXT: ; implicit-def: $vgpr133 -; GFX11-NEXT: ; implicit-def: $vgpr131 -; GFX11-NEXT: ; implicit-def: $vgpr129 -; GFX11-NEXT: ; implicit-def: $vgpr119 -; GFX11-NEXT: ; implicit-def: $vgpr50 -; GFX11-NEXT: ; implicit-def: $vgpr117 -; GFX11-NEXT: ; implicit-def: $vgpr116 -; GFX11-NEXT: ; implicit-def: $vgpr115 -; GFX11-NEXT: ; implicit-def: $vgpr102 -; GFX11-NEXT: ; implicit-def: $vgpr100 -; GFX11-NEXT: ; implicit-def: $vgpr49 -; GFX11-NEXT: ; implicit-def: $vgpr98 -; GFX11-NEXT: ; implicit-def: $vgpr97 -; GFX11-NEXT: ; implicit-def: $vgpr96 -; GFX11-NEXT: ; implicit-def: $vgpr86 -; GFX11-NEXT: ; implicit-def: $vgpr82 -; GFX11-NEXT: ; implicit-def: $vgpr48 -; GFX11-NEXT: ; implicit-def: $vgpr80 -; GFX11-NEXT: ; implicit-def: $vgpr70 -; GFX11-NEXT: ; implicit-def: $vgpr68 -; GFX11-NEXT: ; implicit-def: $vgpr75 -; GFX11-NEXT: ; implicit-def: $vgpr73 -; GFX11-NEXT: ; implicit-def: $vgpr66 -; GFX11-NEXT: ; implicit-def: $vgpr61 -; GFX11-NEXT: ; implicit-def: $vgpr59 -; GFX11-NEXT: ; implicit-def: $vgpr58 -; GFX11-NEXT: ; implicit-def: $vgpr56 -; GFX11-NEXT: ; implicit-def: $vgpr46 -; GFX11-NEXT: ; implicit-def: $vgpr65 -; GFX11-NEXT: ; implicit-def: $vgpr44 -; GFX11-NEXT: ; implicit-def: $vgpr41 -; GFX11-NEXT: ; implicit-def: $vgpr40 -; GFX11-NEXT: ; implicit-def: $vgpr182 -; GFX11-NEXT: ; implicit-def: $vgpr180 -; GFX11-NEXT: ; implicit-def: $vgpr38 -; GFX11-NEXT: ; implicit-def: $vgpr178 -; GFX11-NEXT: ; implicit-def: $vgpr176 -; GFX11-NEXT: ; implicit-def: $vgpr166 -; GFX11-NEXT: ; implicit-def: $vgpr163 -; GFX11-NEXT: ; implicit-def: $vgpr162 -; GFX11-NEXT: ; implicit-def: $vgpr37 -; GFX11-NEXT: ; implicit-def: $vgpr160 -; GFX11-NEXT: ; implicit-def: $vgpr149 -; GFX11-NEXT: ; implicit-def: $vgpr148 -; GFX11-NEXT: ; implicit-def: $vgpr146 -; GFX11-NEXT: ; implicit-def: $vgpr135 -; GFX11-NEXT: ; implicit-def: $vgpr36 -; 
GFX11-NEXT: ; implicit-def: $vgpr134 -; GFX11-NEXT: ; implicit-def: $vgpr132 -; GFX11-NEXT: ; implicit-def: $vgpr130 -; GFX11-NEXT: ; implicit-def: $vgpr128 -; GFX11-NEXT: ; implicit-def: $vgpr118 -; GFX11-NEXT: ; implicit-def: $vgpr35 -; GFX11-NEXT: ; implicit-def: $vgpr114 -; GFX11-NEXT: ; implicit-def: $vgpr113 -; GFX11-NEXT: ; implicit-def: $vgpr112 -; GFX11-NEXT: ; implicit-def: $vgpr103 -; GFX11-NEXT: ; implicit-def: $vgpr101 -; GFX11-NEXT: ; implicit-def: $vgpr99 -; GFX11-NEXT: ; implicit-def: $vgpr87 -; GFX11-NEXT: ; implicit-def: $vgpr85 -; GFX11-NEXT: ; implicit-def: $vgpr84 -; GFX11-NEXT: ; implicit-def: $vgpr83 -; GFX11-NEXT: ; implicit-def: $vgpr81 -; GFX11-NEXT: ; implicit-def: $vgpr71 -; GFX11-NEXT: ; implicit-def: $vgpr69 -; GFX11-NEXT: ; implicit-def: $vgpr34 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 -; GFX11-NEXT: ; implicit-def: $vgpr33 -; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB49_2 -; GFX11-NEXT: ; %bb.1: ; %cmp.false -; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] -; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, 
v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] -; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-NEXT: .LBB49_2: ; %Flow -; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB49_4 -; GFX11-NEXT: ; %bb.3: ; %cmp.true -; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] -; GFX11-NEXT: 
v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] -; GFX11-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-NEXT: v_lshrrev_b64 v[50:51], 24, v[11:12] -; GFX11-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-NEXT: v_lshrrev_b64 v[51:52], 24, v[9:10] -; GFX11-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-NEXT: v_lshrrev_b64 v[52:53], 24, v[7:8] -; GFX11-NEXT: v_lshrrev_b64 v[64:65], 24, v[1:2] -; GFX11-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-NEXT: v_lshrrev_b64 v[53:54], 24, v[5:6] -; GFX11-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-NEXT: v_lshrrev_b64 v[65:66], 24, v[19:20] -; GFX11-NEXT: v_lshrrev_b64 v[54:55], 24, v[3:4] -; GFX11-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-NEXT: v_lshrrev_b64 v[66:67], 24, v[17:18] -; GFX11-NEXT: v_lshrrev_b32_e32 v68, 24, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v70, 16, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v80, 8, v16 -; GFX11-NEXT: v_lshrrev_b32_e32 v82, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v86, 8, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 24, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 8, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 8, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 24, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 8, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 8, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 24, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 8, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 8, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v150, 24, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v151, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v161, 8, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v164, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v165, 8, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v167, 24, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v177, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v179, 8, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v181, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v183, 8, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v42, 24, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v43, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v45, 8, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v47, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v57, 8, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v60, 24, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v62, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v63, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v72, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v74, 8, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v69, 24, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v71, 16, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v81, 8, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v83, 16, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v84, 8, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v85, 24, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v87, 16, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 8, v30 -; 
GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 8, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 24, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 8, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 8, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 24, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 8, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 8, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v148, 24, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v149, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v160, 8, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v162, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v163, 8, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v166, 24, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v176, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v178, 8, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v180, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v182, 8, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v40, 24, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v41, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v44, 8, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v46, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v56, 8, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v58, 24, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v61, 8, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v73, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v75, 8, v17 -; GFX11-NEXT: .LBB49_4: ; %end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-NEXT: v_lshlrev_b16 v55, 8, v74 -; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_lshlrev_b16 v39, 8, v64 -; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_lshlrev_b16 v64, 8, v60 -; GFX11-NEXT: v_lshlrev_b16 v54, 8, v54 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v55 -; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v72 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-NEXT: v_and_b32_e32 v67, 0xff, v47 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_or_b32_e32 v39, v55, v39 -; GFX11-NEXT: v_lshlrev_b16 v55, 8, v63 -; GFX11-NEXT: v_or_b32_e32 v54, v67, v54 -; GFX11-NEXT: v_lshlrev_b16 v67, 8, v42 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v62 -; GFX11-NEXT: v_lshlrev_b16 v53, 8, v53 -; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_or_b32_e32 v1, v1, v39 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-NEXT: v_or_b32_e32 v55, v55, v64 -; GFX11-NEXT: v_lshlrev_b16 v64, 8, v57 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 -; GFX11-NEXT: v_lshlrev_b16 v52, 8, v52 -; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v55 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v64 -; GFX11-NEXT: v_lshlrev_b16 v55, 8, v45 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v43 -; GFX11-NEXT: v_lshlrev_b16 v51, 8, v51 -; GFX11-NEXT: v_or_b32_e32 v2, v2, v39 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v55 -; GFX11-NEXT: v_or_b32_e32 v54, v64, v67 -; GFX11-NEXT: v_lshlrev_b16 v55, 8, v183 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v181 -; GFX11-NEXT: v_or_b32_e32 v3, v3, v39 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v54 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v55 -; GFX11-NEXT: v_or_b32_e32 v53, v64, v53 -; GFX11-NEXT: v_lshlrev_b16 v54, 8, v179 -; GFX11-NEXT: 
v_and_b32_e32 v55, 0xff, v177 -; GFX11-NEXT: v_lshlrev_b16 v64, 8, v167 -; GFX11-NEXT: v_or_b32_e32 v4, v4, v39 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-NEXT: v_or_b32_e32 v6, v6, v54 -; GFX11-NEXT: v_or_b32_e32 v53, v55, v64 -; GFX11-NEXT: v_lshlrev_b16 v54, 8, v165 -; GFX11-NEXT: v_and_b32_e32 v55, 0xff, v164 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v39 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v53 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v54 -; GFX11-NEXT: v_or_b32_e32 v52, v55, v52 -; GFX11-NEXT: v_lshlrev_b16 v53, 8, v161 -; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v151 -; GFX11-NEXT: v_lshlrev_b16 v55, 8, v150 -; GFX11-NEXT: v_or_b32_e32 v6, v6, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v52 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v53 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_or_b32_e32 v52, v54, v55 -; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v145 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX11-NEXT: v_lshlrev_b16 v53, 8, v147 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10 -; GFX11-NEXT: v_lshlrev_b16 v55, 8, v144 -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-NEXT: v_or_b32_e32 v51, v54, v51 -; GFX11-NEXT: v_or_b32_e32 v9, v9, v53 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v55 -; GFX11-NEXT: v_or_b32_e32 v7, v7, v39 -; GFX11-NEXT: v_or_b32_e32 v8, v8, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v51 -; GFX11-NEXT: v_and_b32_e32 v51, 0xff, v133 -; GFX11-NEXT: v_lshlrev_b16 v52, 8, v131 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11 -; GFX11-NEXT: v_lshlrev_b16 v53, 8, v129 -; GFX11-NEXT: v_and_b32_e32 v54, 0xff, v119 -; GFX11-NEXT: v_lshlrev_b16 v50, 8, v50 -; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12 -; GFX11-NEXT: v_lshlrev_b16 v55, 8, v117 -; GFX11-NEXT: v_and_b32_e32 v64, 0xff, v116 -; GFX11-NEXT: v_lshlrev_b16 v67, 8, v115 -; GFX11-NEXT: v_or_b32_e32 v51, v51, v52 -; GFX11-NEXT: v_or_b32_e32 v11, v11, v53 -; GFX11-NEXT: v_or_b32_e32 v50, v54, v50 -; GFX11-NEXT: v_or_b32_e32 v12, v12, v55 -; GFX11-NEXT: v_or_b32_e32 v52, v64, v67 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-NEXT: v_or_b32_e32 v1, v9, v39 -; GFX11-NEXT: v_or_b32_e32 v2, v10, v51 -; GFX11-NEXT: v_or_b32_e32 v3, v11, v50 -; GFX11-NEXT: v_or_b32_e32 v4, v12, v52 -; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v13 -; GFX11-NEXT: v_lshlrev_b16 v6, 8, v102 -; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v100 -; GFX11-NEXT: v_lshlrev_b16 v8, 8, v49 -; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v14 -; GFX11-NEXT: v_lshlrev_b16 v10, 8, v98 -; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v97 -; GFX11-NEXT: v_lshlrev_b16 v12, 8, v96 -; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v15 -; GFX11-NEXT: v_lshlrev_b16 v14, 8, v86 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v82 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v48 -; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v16 -; GFX11-NEXT: v_lshlrev_b16 v13, 8, v80 -; GFX11-NEXT: 
v_and_b32_e32 v14, 0xff, v70 -; GFX11-NEXT: v_lshlrev_b16 v15, 8, v68 -; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v17 -; GFX11-NEXT: v_lshlrev_b16 v17, 8, v75 -; GFX11-NEXT: v_and_b32_e32 v39, 0xff, v73 -; GFX11-NEXT: v_lshlrev_b16 v48, 8, v66 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-NEXT: v_or_b32_e32 v14, v39, v48 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX11-NEXT: v_or_b32_e32 v7, v9, v10 -; GFX11-NEXT: v_or_b32_e32 v8, v11, v12 -; GFX11-NEXT: v_or_b32_e32 v9, v13, v14 -; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v18 -; GFX11-NEXT: v_lshlrev_b16 v11, 8, v61 -; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v59 -; GFX11-NEXT: v_lshlrev_b16 v13, 8, v58 -; GFX11-NEXT: v_and_b32_e32 v14, 0xff, v19 -; GFX11-NEXT: v_lshlrev_b16 v15, 8, v56 -; GFX11-NEXT: v_and_b32_e32 v16, 0xff, v46 -; GFX11-NEXT: v_lshlrev_b16 v17, 8, v65 -; GFX11-NEXT: v_and_b32_e32 v18, 0xff, v20 -; GFX11-NEXT: v_lshlrev_b16 v19, 8, v44 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v41 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v40 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v21 -; GFX11-NEXT: v_lshlrev_b16 v18, 8, v182 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v180 -; GFX11-NEXT: v_lshlrev_b16 v20, 8, v38 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v22 -; GFX11-NEXT: v_lshlrev_b16 v22, 8, v178 -; GFX11-NEXT: v_and_b32_e32 v38, 0xff, v176 -; GFX11-NEXT: v_lshlrev_b16 v39, 8, v166 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-NEXT: v_or_b32_e32 v19, v38, v39 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: v_or_b32_e32 v10, v10, v11 -; GFX11-NEXT: v_or_b32_e32 v11, v12, v13 -; GFX11-NEXT: v_or_b32_e32 v12, v14, v15 -; GFX11-NEXT: v_or_b32_e32 v13, v16, v17 -; GFX11-NEXT: v_or_b32_e32 v14, v18, v19 -; GFX11-NEXT: v_and_b32_e32 v15, 0xff, v23 -; GFX11-NEXT: v_lshlrev_b16 v16, 8, v163 -; GFX11-NEXT: v_and_b32_e32 v17, 0xff, v162 -; GFX11-NEXT: v_lshlrev_b16 v18, 8, v37 -; GFX11-NEXT: v_and_b32_e32 v19, 0xff, v24 -; GFX11-NEXT: v_lshlrev_b16 v20, 8, v160 -; GFX11-NEXT: v_and_b32_e32 v21, 0xff, v149 -; GFX11-NEXT: v_lshlrev_b16 v22, 8, v148 -; GFX11-NEXT: v_and_b32_e32 v23, 0xff, v25 -; GFX11-NEXT: v_lshlrev_b16 v24, 8, v146 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v16, 
v17, v18 -; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v135 -; GFX11-NEXT: v_lshlrev_b16 v21, 8, v36 -; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v26 -; GFX11-NEXT: v_lshlrev_b16 v23, 8, v134 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v132 -; GFX11-NEXT: v_lshlrev_b16 v25, 8, v130 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v27 -; GFX11-NEXT: v_lshlrev_b16 v27, 8, v128 -; GFX11-NEXT: v_and_b32_e32 v36, 0xff, v118 -; GFX11-NEXT: v_lshlrev_b16 v35, 8, v35 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-NEXT: v_or_b32_e32 v24, v36, v35 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-NEXT: v_or_b32_e32 v15, v15, v16 -; GFX11-NEXT: v_or_b32_e32 v16, v17, v18 -; GFX11-NEXT: v_or_b32_e32 v17, v19, v20 -; GFX11-NEXT: v_or_b32_e32 v18, v21, v22 -; GFX11-NEXT: v_or_b32_e32 v19, v23, v24 -; GFX11-NEXT: v_and_b32_e32 v20, 0xff, v28 -; GFX11-NEXT: v_lshlrev_b16 v21, 8, v114 -; GFX11-NEXT: v_and_b32_e32 v22, 0xff, v113 -; GFX11-NEXT: v_lshlrev_b16 v23, 8, v112 -; GFX11-NEXT: v_and_b32_e32 v24, 0xff, v29 -; GFX11-NEXT: v_lshlrev_b16 v25, 8, v103 -; GFX11-NEXT: v_and_b32_e32 v26, 0xff, v101 -; GFX11-NEXT: v_lshlrev_b16 v27, 8, v34 -; GFX11-NEXT: v_and_b32_e32 v28, 0xff, v30 -; GFX11-NEXT: v_lshlrev_b16 v29, 8, v99 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-NEXT: v_and_b32_e32 v25, 0xff, v87 -; GFX11-NEXT: v_lshlrev_b16 v26, 8, v85 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v27, 0xff, v31 -; GFX11-NEXT: v_lshlrev_b16 v28, 8, v84 -; GFX11-NEXT: v_and_b32_e32 v29, 0xff, v83 -; GFX11-NEXT: v_lshlrev_b16 v30, 8, v33 -; GFX11-NEXT: v_and_b32_e32 v31, 0xff, v32 -; GFX11-NEXT: v_lshlrev_b16 v32, 8, v81 -; GFX11-NEXT: v_and_b32_e32 v33, 0xff, v71 -; GFX11-NEXT: v_lshlrev_b16 v34, 8, v69 -; GFX11-NEXT: v_or_b32_e32 v25, v25, v26 -; GFX11-NEXT: v_or_b32_e32 v26, v27, v28 -; GFX11-NEXT: v_or_b32_e32 v27, v29, v30 -; GFX11-NEXT: v_or_b32_e32 v28, v31, v32 -; GFX11-NEXT: v_or_b32_e32 v29, v33, v34 -; GFX11-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: v_or_b32_e32 v20, v20, v21 -; GFX11-NEXT: v_or_b32_e32 v21, v22, v23 -; GFX11-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-NEXT: scratch_store_b128 v0, 
v[5:8], off offset:48 -; GFX11-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-NEXT: s_clause 0x13 -; GFX11-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:88 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <64 x i16> %a, splat (i16 3) - %a2 = bitcast <64 x i16> %a1 to <128 x i8> - br label %end - -cmp.false: - %a3 = bitcast <64 x i16> %a to <128 x i8> - br label %end - -end: - %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <128 x i8> %phi -} - define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GCN-LABEL: bitcast_v64bf16_to_v64f16: ; GCN: ; %bb.0: @@ -97989,7 +93041,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr3 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_2 +; GCN-NEXT: s_cbranch_execz .LBB49_2 ; GCN-NEXT: ; %bb.1: ; %cmp.false ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -98344,9 +93396,9 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr63 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr2 -; GCN-NEXT: .LBB50_2: ; %Flow +; GCN-NEXT: .LBB49_2: ; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB50_4 +; GCN-NEXT: s_cbranch_execz .LBB49_4 ; GCN-NEXT: ; %bb.3: ; %cmp.true ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 @@ -98790,7 +93842,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GCN-NEXT: .LBB50_4: ; %end +; GCN-NEXT: .LBB49_4: ; %end ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload @@ -99104,7 +94156,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; 
VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB50_2 +; VI-NEXT: s_cbranch_execz .LBB49_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -99690,7 +94742,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB50_2: ; %end +; VI-NEXT: .LBB49_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -99737,7 +94789,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB50_2 +; GFX9-NEXT: s_cbranch_execz .LBB49_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -100227,7 +95279,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 -; GFX9-NEXT: .LBB50_2: ; %end +; GFX9-NEXT: .LBB49_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -100260,7 +95312,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB50_2 +; GFX11-NEXT: s_cbranch_execz .LBB49_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v16 @@ -100770,7 +95822,7 @@ define <64 x half> @bitcast_v64bf16_to_v64f16(<64 x bfloat> %a, i32 %b) { ; GFX11-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 -; GFX11-NEXT: .LBB50_2: ; %end +; GFX11-NEXT: .LBB49_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -101055,7 +96107,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GCN-NEXT: ; kill: killed $vgpr2 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_2 +; GCN-NEXT: s_cbranch_execz .LBB50_2 ; GCN-NEXT: ; %bb.1: ; %cmp.false ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -101319,9 +96371,9 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr63 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: ; implicit-def: $vgpr31 -; GCN-NEXT: .LBB51_2: ; %Flow +; GCN-NEXT: .LBB50_2: ; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB51_4 +; GCN-NEXT: s_cbranch_execz .LBB50_4 ; GCN-NEXT: ; %bb.3: ; %cmp.true ; GCN-NEXT: v_cvt_f32_f16_e32 v31, v31 ; GCN-NEXT: s_waitcnt expcnt(0) @@ -101727,7 +96779,7 @@ define <64 x bfloat> 
@bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GCN-NEXT: .LBB51_4: ; %end +; GCN-NEXT: .LBB50_4: ; %end ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload @@ -102077,7 +97129,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB51_2 +; VI-NEXT: s_cbranch_execz .LBB50_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 @@ -102177,7 +97229,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v33, v17 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB51_2: ; %end +; VI-NEXT: .LBB50_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -102192,7 +97244,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB51_2 +; GFX9-NEXT: s_cbranch_execz .LBB50_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -102228,7 +97280,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB51_2: ; %end +; GFX9-NEXT: .LBB50_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -102245,7 +97297,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB51_2 +; GFX11-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -102280,7 +97332,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB51_2: ; %end +; GFX11-NEXT: .LBB50_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -102583,7 +97635,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr26 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_2 +; GCN-NEXT: s_cbranch_execz .LBB51_2 ; GCN-NEXT: ; %bb.1: ; %cmp.false ; GCN-NEXT: v_lshrrev_b32_e32 v26, 16, v36 ; GCN-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill @@ -102862,9 +97914,9 @@ define <64 x 
i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr14 ; GCN-NEXT: ; implicit-def: $vgpr12 ; GCN-NEXT: ; implicit-def: $vgpr11 -; GCN-NEXT: .LBB52_2: ; %Flow +; GCN-NEXT: .LBB51_2: ; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB52_4 +; GCN-NEXT: s_cbranch_execz .LBB51_4 ; GCN-NEXT: ; %bb.3: ; %cmp.true ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v35 @@ -103214,7 +98266,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_alignbit_b32 v1, v39, v11, 16 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GCN-NEXT: .LBB52_4: ; %end +; GCN-NEXT: .LBB51_4: ; %end ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload @@ -103506,7 +98558,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB52_2 +; VI-NEXT: s_cbranch_execz .LBB51_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; VI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -104092,7 +99144,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_alignbit_b32 v16, v16, v32, 16 -; VI-NEXT: .LBB52_2: ; %end +; VI-NEXT: .LBB51_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -104139,7 +99191,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB52_2 +; GFX9-NEXT: s_cbranch_execz .LBB51_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 @@ -104629,7 +99681,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_perm_b32 v18, v18, v34, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_perm_b32 v16, v32, v16, s6 -; GFX9-NEXT: .LBB52_2: ; %end +; GFX9-NEXT: .LBB51_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -104662,7 +99714,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB52_2 +; GFX11-NEXT: s_cbranch_execz .LBB51_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v16 @@ -105172,7 +100224,7 @@ define <64 x i16> @bitcast_v64bf16_to_v64i16(<64 x bfloat> %a, i32 %b) { ; GFX11-NEXT: v_cndmask_b32_e32 v11, v85, v96, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v11, v11, v83, 0x7060302 -; GFX11-NEXT: .LBB52_2: ; %end +; GFX11-NEXT: .LBB51_2: ; %end ; GFX11-NEXT: 
s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -105370,7 +100422,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr28 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_2 +; GCN-NEXT: s_cbranch_execz .LBB52_2 ; GCN-NEXT: ; %bb.1: ; %cmp.false ; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v1 ; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v3 @@ -105460,9 +100512,9 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr12 ; GCN-NEXT: ; implicit-def: $vgpr14 ; GCN-NEXT: ; implicit-def: $vgpr10 -; GCN-NEXT: .LBB53_2: ; %Flow +; GCN-NEXT: .LBB52_2: ; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB53_4 +; GCN-NEXT: s_cbranch_execz .LBB52_4 ; GCN-NEXT: ; %bb.3: ; %cmp.true ; GCN-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v10 @@ -105767,7 +100819,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v14 ; GCN-NEXT: v_and_b32_e32 v55, 0xffff0000, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v10 -; GCN-NEXT: .LBB53_4: ; %end +; GCN-NEXT: .LBB52_4: ; %end ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload @@ -106063,7 +101115,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB53_2 +; VI-NEXT: s_cbranch_execz .LBB52_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 3 ; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -106163,7 +101215,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB53_2: ; %end +; VI-NEXT: .LBB52_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -106178,7 +101230,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB53_2 +; GFX9-NEXT: s_cbranch_execz .LBB52_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -106213,7 +101265,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB53_2: ; %end +; GFX9-NEXT: .LBB52_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -106230,7 +101282,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB53_2 +; GFX11-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true 
; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -106265,7 +101317,7 @@ define <64 x bfloat> @bitcast_v64i16_to_v64bf16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB53_2: ; %end +; GFX11-NEXT: .LBB52_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -106453,7 +101505,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GCN-NEXT: v_mov_b32_e32 v36, v5 ; GCN-NEXT: v_mov_b32_e32 v46, v6 ; GCN-NEXT: s_xor_b64 exec, exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB54_2 +; GCN-NEXT: s_cbranch_execz .LBB53_2 ; GCN-NEXT: ; %bb.1: ; %cmp.true ; GCN-NEXT: v_cvt_f32_f16_e32 v12, v12 ; GCN-NEXT: v_cvt_f32_f16_e32 v9, v9 @@ -106786,7 +101838,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GCN-NEXT: v_alignbit_b32 v43, v14, v43, 16 ; GCN-NEXT: v_alignbit_b32 v40, v11, v40, 16 ; GCN-NEXT: v_alignbit_b32 v55, v9, v55, 16 -; GCN-NEXT: .LBB54_2: ; %end +; GCN-NEXT: .LBB53_2: ; %end ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v30, v1 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -107008,7 +102060,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB54_2 +; VI-NEXT: s_cbranch_execz .LBB53_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 0x200 ; VI-NEXT: v_add_f16_e32 v33, 0x200, v15 @@ -107108,7 +102160,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 ; VI-NEXT: v_or_b32_e32 v17, v33, v17 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB54_2: ; %end +; VI-NEXT: .LBB53_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -107123,7 +102175,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB54_2 +; GFX9-NEXT: s_cbranch_execz .LBB53_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: s_movk_i32 s6, 0x200 ; GFX9-NEXT: v_pk_add_f16 v15, v15, s6 op_sel_hi:[1,0] @@ -107159,7 +102211,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_f16 v18, v18, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v17, v17, s6 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_f16 v16, v16, s6 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB54_2: ; %end +; GFX9-NEXT: .LBB53_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -107176,7 +102228,7 @@ define <64 x i16> @bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB54_2 +; GFX11-NEXT: s_cbranch_execz .LBB53_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] @@ -107211,7 +102263,7 @@ define <64 x i16> 
@bitcast_v64f16_to_v64i16(<64 x half> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-NEXT: .LBB54_2: ; %end +; GFX11-NEXT: .LBB53_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -107424,7 +102476,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GCN-NEXT: ; kill: killed $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_2 +; GCN-NEXT: s_cbranch_execz .LBB54_2 ; GCN-NEXT: ; %bb.1: ; %cmp.false ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -107687,9 +102739,9 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GCN-NEXT: ; implicit-def: $vgpr44 ; GCN-NEXT: ; implicit-def: $vgpr45 ; GCN-NEXT: ; implicit-def: $vgpr46 -; GCN-NEXT: .LBB55_2: ; %Flow +; GCN-NEXT: .LBB54_2: ; %Flow ; GCN-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN-NEXT: s_cbranch_execz .LBB55_4 +; GCN-NEXT: s_cbranch_execz .LBB54_4 ; GCN-NEXT: ; %bb.3: ; %cmp.true ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_add_i32_e32 v1, vcc, 3, v46 @@ -107963,7 +103015,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: .LBB55_4: ; %end +; GCN-NEXT: .LBB54_4: ; %end ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload @@ -108317,7 +103369,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; VI-NEXT: s_cbranch_execz .LBB55_2 +; VI-NEXT: s_cbranch_execz .LBB54_2 ; VI-NEXT: ; %bb.1: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v32, 3 ; VI-NEXT: v_add_u16_sdwa v33, v15, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -108417,7 +103469,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_or_b32_e32 v17, v17, v33 ; VI-NEXT: v_or_b32_e32 v16, v16, v32 -; VI-NEXT: .LBB55_2: ; %end +; VI-NEXT: .LBB54_2: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] @@ -108432,7 +103484,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: s_cbranch_execz .LBB55_2 +; GFX9-NEXT: s_cbranch_execz .LBB54_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.true ; GFX9-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -108467,7 +103519,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX9-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX9-NEXT: .LBB55_2: ; %end +; GFX9-NEXT: .LBB54_2: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -108484,7 +103536,7 
@@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX11-NEXT: s_cbranch_execz .LBB55_2 +; GFX11-NEXT: s_cbranch_execz .LBB54_2 ; GFX11-NEXT: ; %bb.1: ; %cmp.true ; GFX11-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] @@ -108519,7 +103571,7 @@ define <64 x half> @bitcast_v64i16_to_v64f16(<64 x i16> %a, i32 %b) { ; GFX11-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-NEXT: .LBB55_2: ; %end +; GFX11-NEXT: .LBB54_2: ; %end ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 03c6a36ac9861..c2cac55e13b09 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <4 x float> @bitcast_v4i32_to_v4f32(<4 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v4i32_to_v4f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index e7262375fbeb0..ee6f2708990bb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <5 x float> @bitcast_v5i32_to_v5f32(<5 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v5i32_to_v5f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll index 1185a12a474ea..b3d9e61b65b6f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc 
-mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define half @bitcast_i16_to_f16(i16 %a, i32 %b) { ; GCN-LABEL: bitcast_i16_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index 61f9232ea50a1..ecaf64567500f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <6 x float> @bitcast_v6i32_to_v6f32(<6 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v6i32_to_v6f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index 952be022750a6..d2e46475487c2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <7 x float> @bitcast_v7i32_to_v7f32(<7 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v7i32_to_v7f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 6e7b5dd33ea0b..b52d8a89035bc 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: 
llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <8 x float> @bitcast_v8i32_to_v8f32(<8 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v8i32_to_v8f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 0adf547e19362..76f16189b7f97 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <9 x float> @bitcast_v9i32_to_v9f32(<9 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v9i32_to_v9f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index f1e0c19f7fca3..05412efc100f6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <10 x float> @bitcast_v10i32_to_v10f32(<10 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v10i32_to_v10f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 1c51395128917..332c971e5709f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s 
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define float @bitcast_i32_to_f32(i32 %a, i32 %b) { ; GCN-LABEL: bitcast_i32_to_f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index 922a47ea77fcd..fa1f3bd96ad2f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <11 x float> @bitcast_v11i32_to_v11f32(<11 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v11i32_to_v11f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index f67af98fba0fa..c7af60f324892 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <12 x float> @bitcast_v12i32_to_v12f32(<12 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v12i32_to_v12f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index a5764f9da3194..b0fa6a21cd5f1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <14 x i32> @bitcast_v7i64_to_v14i32(<7 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v7i64_to_v14i32: diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index d09aaf12161a2..9bb360f2e3b09 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; GCN-LABEL: bitcast_v3bf16_to_v3f16: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index f0ce1784eb107..0d1008082f586 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <16 x float> @bitcast_v16i32_to_v16f32(<16 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v16i32_to_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index 78f611d83b532..110c6109b1556 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <18 x i32> @bitcast_v9i64_to_v18i32(<9 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v9i64_to_v18i32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index 8d2501e42f2d1..bea2243e8087c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <20 x i32> @bitcast_v10i64_to_v20i32(<10 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v10i64_to_v20i32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 540888ab607b0..26ce1771e220d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define double @bitcast_i64_to_f64(i64 %a, i32 %b) { ; GCN-LABEL: bitcast_i64_to_f64: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 702c1d05a0e3e..05fb285362f09 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <22 x i32> @bitcast_v11i64_to_v22i32(<11 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v11i64_to_v22i32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 182c63502d77b..45e9c321d4aac 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; 
RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <24 x i32> @bitcast_v12i64_to_v24i32(<12 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v12i64_to_v24i32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 9869dca91b4d4..4dcfaee680984 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <26 x i32> @bitcast_v13i64_to_v26i32(<13 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v13i64_to_v26i32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index 9f1a9c8dc89c3..37cf5b81b81e3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <28 x i32> @bitcast_v14i64_to_v28i32(<14 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v14i64_to_v28i32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 80e5a18631189..ca0c0bb0d4ca2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | 
FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <30 x i32> @bitcast_v15i64_to_v30i32(<15 x i64> %a, i32 %b) { ; GCN-LABEL: bitcast_v15i64_to_v30i32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 7a3609e29a0c5..b87e7b0916032 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define <3 x float> @bitcast_v3i32_to_v3f32(<3 x i32> %a, i32 %b) { ; GCN-LABEL: bitcast_v3i32_to_v3f32: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll index eb4429958dfa5..01a1e6b73ac6a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ptr.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_kernel void @bitcast_i8ptr_v16i8ptr(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: bitcast_i8ptr_v16i8ptr: From 308654608cb8bc5bbd5d4b3779cb7d92920dd6b7 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Tue, 1 Apr 2025 15:20:22 +0100 Subject: [PATCH 0945/1029] [Clang][NFC] Move some static functions into CodeGenFunction (#134634) Patches in the Key Instructions (KeyInstr) stack need to access CGF in these functions. 2 CGF fields are passed to these functions already; at this point it felt natural to promote them to CGF methods. 
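For illustration, this is the shape of the refactor (a sketch only, with the parameter lists taken from the hunks below): the two CodeGenFunction fields that used to be threaded through explicitly, CGM and Builder, are now reached through `this`.

    // Before: file-local helper, CGF state passed in explicitly.
    static void emitStoresForZeroInit(CodeGenModule &CGM, const VarDecl &D,
                                      Address Loc, bool isVolatile,
                                      CGBuilderTy &Builder);

    // After: CodeGenFunction method; CGM and Builder are member state.
    void CodeGenFunction::emitStoresForZeroInit(const VarDecl &D, Address Loc,
                                                bool isVolatile);

Callers inside CodeGenFunction drop the CGM/Builder arguments accordingly.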
--- clang/lib/CodeGen/CGDecl.cpp | 52 +++++++++++++---------------- clang/lib/CodeGen/CodeGenFunction.h | 11 ++++++ 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index eab1ebfb2369b..0af170a36f372 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -928,10 +928,9 @@ static bool canEmitInitWithFewStoresAfterBZero(llvm::Constant *Init, /// For inits that canEmitInitWithFewStoresAfterBZero returned true for, emit /// the scalar stores that would be required. -static void emitStoresForInitAfterBZero(CodeGenModule &CGM, - llvm::Constant *Init, Address Loc, - bool isVolatile, CGBuilderTy &Builder, - bool IsAutoInit) { +void CodeGenFunction::emitStoresForInitAfterBZero(llvm::Constant *Init, + Address Loc, bool isVolatile, + bool IsAutoInit) { assert(!Init->isNullValue() && !isa(Init) && "called emitStoresForInitAfterBZero for zero or undef value."); @@ -952,8 +951,8 @@ static void emitStoresForInitAfterBZero(CodeGenModule &CGM, // If necessary, get a pointer to the element and emit it. if (!Elt->isNullValue() && !isa(Elt)) emitStoresForInitAfterBZero( - CGM, Elt, Builder.CreateConstInBoundsGEP2_32(Loc, 0, i), isVolatile, - Builder, IsAutoInit); + Elt, Builder.CreateConstInBoundsGEP2_32(Loc, 0, i), isVolatile, + IsAutoInit); } return; } @@ -966,9 +965,9 @@ static void emitStoresForInitAfterBZero(CodeGenModule &CGM, // If necessary, get a pointer to the element and emit it. if (!Elt->isNullValue() && !isa(Elt)) - emitStoresForInitAfterBZero(CGM, Elt, + emitStoresForInitAfterBZero(Elt, Builder.CreateConstInBoundsGEP2_32(Loc, 0, i), - isVolatile, Builder, IsAutoInit); + isVolatile, IsAutoInit); } } @@ -1169,10 +1168,10 @@ static Address createUnnamedGlobalForMemcpyFrom(CodeGenModule &CGM, return SrcPtr.withElementType(CGM.Int8Ty); } -static void emitStoresForConstant(CodeGenModule &CGM, const VarDecl &D, - Address Loc, bool isVolatile, - CGBuilderTy &Builder, - llvm::Constant *constant, bool IsAutoInit) { +void CodeGenFunction::emitStoresForConstant(const VarDecl &D, Address Loc, + bool isVolatile, + llvm::Constant *constant, + bool IsAutoInit) { auto *Ty = constant->getType(); uint64_t ConstantSize = CGM.getDataLayout().getTypeAllocSize(Ty); if (!ConstantSize) @@ -1201,8 +1200,7 @@ static void emitStoresForConstant(CodeGenModule &CGM, const VarDecl &D, constant->isNullValue() || isa(constant); if (!valueAlreadyCorrect) { Loc = Loc.withElementType(Ty); - emitStoresForInitAfterBZero(CGM, constant, Loc, isVolatile, Builder, - IsAutoInit); + emitStoresForInitAfterBZero(constant, Loc, isVolatile, IsAutoInit); } return; } @@ -1240,7 +1238,7 @@ static void emitStoresForConstant(CodeGenModule &CGM, const VarDecl &D, CharUnits::fromQuantity(Layout->getElementOffset(i)); Address EltPtr = Builder.CreateConstInBoundsByteGEP( Loc.withElementType(CGM.Int8Ty), CurOff); - emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, + emitStoresForConstant(D, EltPtr, isVolatile, constant->getAggregateElement(i), IsAutoInit); } return; @@ -1251,7 +1249,7 @@ static void emitStoresForConstant(CodeGenModule &CGM, const VarDecl &D, for (unsigned i = 0; i != ATy->getNumElements(); i++) { Address EltPtr = Builder.CreateConstGEP( Loc.withElementType(ATy->getElementType()), i); - emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder, + emitStoresForConstant(D, EltPtr, isVolatile, constant->getAggregateElement(i), IsAutoInit); } return; @@ -1269,24 +1267,22 @@ static void emitStoresForConstant(CodeGenModule &CGM, const 
VarDecl &D, I->addAnnotationMetadata("auto-init"); } -static void emitStoresForZeroInit(CodeGenModule &CGM, const VarDecl &D, - Address Loc, bool isVolatile, - CGBuilderTy &Builder) { +void CodeGenFunction::emitStoresForZeroInit(const VarDecl &D, Address Loc, + bool isVolatile) { llvm::Type *ElTy = Loc.getElementType(); llvm::Constant *constant = constWithPadding(CGM, IsPattern::No, llvm::Constant::getNullValue(ElTy)); - emitStoresForConstant(CGM, D, Loc, isVolatile, Builder, constant, + emitStoresForConstant(D, Loc, isVolatile, constant, /*IsAutoInit=*/true); } -static void emitStoresForPatternInit(CodeGenModule &CGM, const VarDecl &D, - Address Loc, bool isVolatile, - CGBuilderTy &Builder) { +void CodeGenFunction::emitStoresForPatternInit(const VarDecl &D, Address Loc, + bool isVolatile) { llvm::Type *ElTy = Loc.getElementType(); llvm::Constant *constant = constWithPadding( CGM, IsPattern::Yes, initializationPatternFor(CGM, ElTy)); assert(!isa(constant)); - emitStoresForConstant(CGM, D, Loc, isVolatile, Builder, constant, + emitStoresForConstant(D, Loc, isVolatile, constant, /*IsAutoInit=*/true); } @@ -1829,7 +1825,7 @@ void CodeGenFunction::emitZeroOrPatternForAutoVarInit(QualType type, if (trivialAutoVarInitMaxSize > 0 && allocSize > trivialAutoVarInitMaxSize) return; - emitStoresForZeroInit(CGM, D, Loc, isVolatile, Builder); + emitStoresForZeroInit(D, Loc, isVolatile); break; case LangOptions::TrivialAutoVarInitKind::Pattern: if (CGM.stopAutoInit()) @@ -1837,7 +1833,7 @@ void CodeGenFunction::emitZeroOrPatternForAutoVarInit(QualType type, if (trivialAutoVarInitMaxSize > 0 && allocSize > trivialAutoVarInitMaxSize) return; - emitStoresForPatternInit(CGM, D, Loc, isVolatile, Builder); + emitStoresForPatternInit(D, Loc, isVolatile); break; } return; @@ -2052,8 +2048,8 @@ void CodeGenFunction::EmitAutoVarInit(const AutoVarEmission &emission) { return EmitStoreThroughLValue(RValue::get(constant), lv, true); } - emitStoresForConstant(CGM, D, Loc.withElementType(CGM.Int8Ty), - type.isVolatileQualified(), Builder, constant, + emitStoresForConstant(D, Loc.withElementType(CGM.Int8Ty), + type.isVolatileQualified(), constant, /*IsAutoInit=*/false); } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 34dee6df9dcfc..2b1062d6d307c 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2800,6 +2800,17 @@ class CodeGenFunction : public CodeGenTypeCache { }; AllocaTracker *Allocas = nullptr; + /// CGDecl helper. + void emitStoresForConstant(const VarDecl &D, Address Loc, bool isVolatile, + llvm::Constant *constant, bool IsAutoInit); + /// CGDecl helper. + void emitStoresForZeroInit(const VarDecl &D, Address Loc, bool isVolatile); + /// CGDecl helper. + void emitStoresForPatternInit(const VarDecl &D, Address Loc, bool isVolatile); + /// CGDecl helper. + void emitStoresForInitAfterBZero(llvm::Constant *Init, Address Loc, + bool isVolatile, bool IsAutoInit); + public: // Captures all the allocas created during the scope of its RAII object. struct AllocaTrackerRAII { From 87a187cedf09be0d7d1fa6eae1a8b644ee15db12 Mon Sep 17 00:00:00 2001 From: lorenzo chelini Date: Tue, 8 Apr 2025 10:30:47 +0200 Subject: [PATCH 0946/1029] [MLIR][NFC] Retire let constructor for Tosa (#134784) `let constructor` is legacy (do not use in tree!) since the tableGen backend emits most of the glue logic to build a pass. 
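The resulting pattern, as a rough sketch mirroring the TosaInferShapes changes below (the base class and the create function are generated into Passes.h.inc by the tablegen backend):

    // Implementation file, after #define GEN_PASS_DEF_TOSAINFERSHAPESPASS:
    // derive from the generated base instead of pairing the pass with a
    // hand-written createTosaInferShapesPass() definition.
    struct TosaInferShapes
        : public tosa::impl::TosaInferShapesPassBase<TosaInferShapes> {
      void runOnOperation() override;
    };

    // Client code keeps calling the (now generated) factory function:
    pm.addNestedPass<func::FuncOp>(tosa::createTosaInferShapesPass());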
--- .../mlir/Dialect/Tosa/Transforms/Passes.h | 6 ------ .../mlir/Dialect/Tosa/Transforms/Passes.td | 16 +++++----------- .../Conversion/TosaToLinalg/TosaToLinalgPass.cpp | 3 ++- .../Dialect/Tosa/Transforms/TosaInferShapes.cpp | 10 +++------- .../Transforms/TosaLayerwiseConstantFoldPass.cpp | 14 +------------- .../Tosa/Transforms/TosaMakeBroadcastable.cpp | 8 ++------ .../Transforms/TosaOptionalDecompositions.cpp | 8 ++------ 7 files changed, 15 insertions(+), 50 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h index 33bbc069c521d..306e4b1f218e7 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h @@ -41,13 +41,7 @@ void populateTosaConstantReduction(MLIRContext *ctx, void populateTosaTypeConversion(TypeConverter &converter); -std::unique_ptr createTosaLayerwiseConstantFoldPass(); -std::unique_ptr createTosaLayerwiseConstantFoldPass( - const TosaLayerwiseConstantFoldPassOptions &options); -std::unique_ptr createTosaInferShapesPass(); -std::unique_ptr createTosaMakeBroadcastablePass(); std::unique_ptr createTosaTestQuantUtilAPIPass(); -std::unique_ptr createTosaOptionalDecompositions(); #define GEN_PASS_REGISTRATION #include "mlir/Dialect/Tosa/Transforms/Passes.h.inc" diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td index 2d5b0b39df078..d005a4cc6859c 100644 --- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td @@ -22,8 +22,6 @@ def TosaLayerwiseConstantFoldPass : Pass<"tosa-layerwise-constant-fold", "func:: Pass that enables folding of full-layer operations on constant tensors. }]; - let constructor = "createTosaLayerwiseConstantFoldPass()"; - let options = [ Option<"aggressiveReduceConstant", "aggressive-reduce-constant", "bool", /*default=*/"false", @@ -32,14 +30,13 @@ def TosaLayerwiseConstantFoldPass : Pass<"tosa-layerwise-constant-fold", "func:: ]; } -def TosaInferShapes : Pass<"tosa-infer-shapes", "func::FuncOp"> { +def TosaInferShapesPass : Pass<"tosa-infer-shapes", "func::FuncOp"> { let summary = "Propagate shapes across TOSA operations"; let description = [{ Pass that uses operand types and propagates shapes to TOSA operations. This includes legalizing rankless and dynamic shapes towards static. 
}]; - let constructor = "createTosaInferShapesPass()"; let dependentDialects = [ "func::FuncDialect", "tensor::TensorDialect", @@ -47,7 +44,8 @@ def TosaInferShapes : Pass<"tosa-infer-shapes", "func::FuncOp"> { ]; } -def TosaMakeBroadcastable : Pass<"tosa-make-broadcastable", "func::FuncOp"> { +def TosaMakeBroadcastablePass + : Pass<"tosa-make-broadcastable", "func::FuncOp"> { let summary = "TOSA rank Reshape to enable Broadcasting"; let description = [{ Pass that enables broadcast by making all input arrays have the same @@ -56,19 +54,15 @@ def TosaMakeBroadcastable : Pass<"tosa-make-broadcastable", "func::FuncOp"> { approach similar to step 1 of Numpy 4-step broadcasting: https://numpy.org/doc/stable/reference/ufuncs.html#broadcasting }]; - - let constructor = "createTosaMakeBroadcastablePass()"; } -def TosaOptionalDecompositions - : Pass<"tosa-optional-decompositions", "func::FuncOp"> { +def TosaOptionalDecompositionsPass + : Pass<"tosa-optional-decompositions", "func::FuncOp"> { let summary = "Applies Tosa operations optional decompositions"; let description = [{ Pass to apply the Tosa operations decompositions exposed as populate functions in include/mlir/Dialect/Tosa/Transforms/Passes.h }]; - - let constructor = "tosa::createTosaOptionalDecompositions()"; } def TosaLevelType : I32EnumAttr<"TosaLevelEnum", "Tosa level", diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp index 4cf232a7bc767..01a7cd7ac94db 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp @@ -85,7 +85,8 @@ void mlir::tosa::addTosaToLinalgPasses( std::optional validationOptions) { // Optional decompositions are designed to benefit linalg. if (!options.disableTosaDecompositions) - pm.addNestedPass(tosa::createTosaOptionalDecompositions()); + pm.addNestedPass( + tosa::createTosaOptionalDecompositionsPass()); pm.addNestedPass(createCanonicalizerPass()); pm.addNestedPass(tosa::createTosaInferShapesPass()); diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp index 0d4ea9710d723..9aa0051070bd6 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaInferShapes.cpp @@ -25,7 +25,7 @@ namespace mlir { namespace tosa { -#define GEN_PASS_DEF_TOSAINFERSHAPES +#define GEN_PASS_DEF_TOSAINFERSHAPESPASS #include "mlir/Dialect/Tosa/Transforms/Passes.h.inc" } // namespace tosa } // namespace mlir @@ -333,7 +333,7 @@ void validateSameOperandsAndResultRankTrait(Region ®ion) { /// Pass that performs shape propagation across TOSA operations. This includes /// migrating to within the regions of if/while operations. 
struct TosaInferShapes - : public tosa::impl::TosaInferShapesBase { + : public tosa::impl::TosaInferShapesPassBase { public: void runOnOperation() override { func::FuncOp func = getOperation(); @@ -344,8 +344,4 @@ struct TosaInferShapes validateSameOperandsAndResultRankTrait(func.getBody()); } }; -} // namespace - -std::unique_ptr mlir::tosa::createTosaInferShapesPass() { - return std::make_unique(); -} +} // namespace \ No newline at end of file diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp index 9299db7e51a01..f4ce950828646 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaLayerwiseConstantFoldPass.cpp @@ -45,9 +45,7 @@ void populateTosaOpsCanonicalizationPatterns(MLIRContext *ctx, struct TosaLayerwiseConstantFoldPass : public tosa::impl::TosaLayerwiseConstantFoldPassBase< TosaLayerwiseConstantFoldPass> { - TosaLayerwiseConstantFoldPass( - const TosaLayerwiseConstantFoldPassOptions &options) - : TosaLayerwiseConstantFoldPassBase(options) {} + using Base::Base; void runOnOperation() override { auto *ctx = &getContext(); @@ -66,13 +64,3 @@ struct TosaLayerwiseConstantFoldPass }; } // namespace - -std::unique_ptr mlir::tosa::createTosaLayerwiseConstantFoldPass() { - return std::make_unique( - TosaLayerwiseConstantFoldPassOptions{false}); -} - -std::unique_ptr mlir::tosa::createTosaLayerwiseConstantFoldPass( - const TosaLayerwiseConstantFoldPassOptions &options) { - return std::make_unique(options); -} diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp index 87b2a2695351b..02a3ad83bdefa 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaMakeBroadcastable.cpp @@ -21,7 +21,7 @@ namespace mlir { namespace tosa { -#define GEN_PASS_DEF_TOSAMAKEBROADCASTABLE +#define GEN_PASS_DEF_TOSAMAKEBROADCASTABLEPASS #include "mlir/Dialect/Tosa/Transforms/Passes.h.inc" } // namespace tosa } // namespace mlir @@ -219,7 +219,7 @@ namespace { /// Pass that enables broadcast by making all input arrays have the same /// number of dimensions. 
Insert RESHAPE operations to lower rank operand struct TosaMakeBroadcastable - : public tosa::impl::TosaMakeBroadcastableBase { + : public tosa::impl::TosaMakeBroadcastablePassBase { public: void runOnOperation() override { auto func = getOperation(); @@ -250,7 +250,3 @@ struct TosaMakeBroadcastable } }; } // namespace - -std::unique_ptr mlir::tosa::createTosaMakeBroadcastablePass() { - return std::make_unique(); -} diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp index ffa2ea3d0629f..2092379e65368 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaOptionalDecompositions.cpp @@ -21,7 +21,7 @@ namespace mlir { namespace tosa { -#define GEN_PASS_DEF_TOSAOPTIONALDECOMPOSITIONS +#define GEN_PASS_DEF_TOSAOPTIONALDECOMPOSITIONSPASS #include "mlir/Dialect/Tosa/Transforms/Passes.h.inc" } // namespace tosa } // namespace mlir @@ -31,7 +31,7 @@ using namespace mlir; namespace { struct TosaOptionalDecompositions - : public tosa::impl::TosaOptionalDecompositionsBase< + : public tosa::impl::TosaOptionalDecompositionsPassBase< TosaOptionalDecompositions> { void runOnOperation() override { auto *ctx = &getContext(); @@ -47,7 +47,3 @@ struct TosaOptionalDecompositions }; } // namespace - -std::unique_ptr mlir::tosa::createTosaOptionalDecompositions() { - return std::make_unique(); -} From e7365d3143d762c290c426b172f588756b6f1ef8 Mon Sep 17 00:00:00 2001 From: lorenzo chelini Date: Tue, 8 Apr 2025 10:31:15 +0200 Subject: [PATCH 0947/1029] [MLIR][NFC] Retire let constructor for Reducer (#134786) let constructor is legacy (do not use in tree!) since the tableGen backend emits most of the glue logic to build a pass. --- mlir/include/mlir/Reducer/Passes.h | 4 ---- mlir/include/mlir/Reducer/Passes.td | 8 ++------ mlir/lib/Reducer/OptReductionPass.cpp | 13 ++++++------- mlir/lib/Reducer/ReductionTreePass.cpp | 12 ++++-------- 4 files changed, 12 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/Reducer/Passes.h b/mlir/include/mlir/Reducer/Passes.h index 474d87a37fc3c..d961737aaffae 100644 --- a/mlir/include/mlir/Reducer/Passes.h +++ b/mlir/include/mlir/Reducer/Passes.h @@ -15,10 +15,6 @@ namespace mlir { #define GEN_PASS_DECL #include "mlir/Reducer/Passes.h.inc" -std::unique_ptr createReductionTreePass(); - -std::unique_ptr createOptReductionPass(); - /// Generate the code for registering reducer passes. 
#define GEN_PASS_REGISTRATION #include "mlir/Reducer/Passes.h.inc" diff --git a/mlir/include/mlir/Reducer/Passes.td b/mlir/include/mlir/Reducer/Passes.td index cf89176106050..624e2e1edc329 100644 --- a/mlir/include/mlir/Reducer/Passes.td +++ b/mlir/include/mlir/Reducer/Passes.td @@ -24,11 +24,9 @@ def CommonReductionPassOptions { ]; } -def ReductionTree : Pass<"reduction-tree"> { +def ReductionTreePass : Pass<"reduction-tree"> { let summary = "Reduce the input with reduction-tree algorithm"; - let constructor = "mlir::createReductionTreePass()"; - let options = [ Option<"traversalModeId", "traversal-mode", "unsigned", /* default */"0", @@ -36,11 +34,9 @@ def ReductionTree : Pass<"reduction-tree"> { ] # CommonReductionPassOptions.options; } -def OptReduction : Pass<"opt-reduction-pass", "ModuleOp"> { +def OptReductionPass : Pass<"opt-reduction-pass", "ModuleOp"> { let summary = "A wrapper pass that reduces the file with optimization passes"; - let constructor = "mlir::createOptReductionPass()"; - let options = [ Option<"optPass", "opt-pass", "std::string", /* default */"", "The optimization passes used for reduction, e.g., symbol-dce">, diff --git a/mlir/lib/Reducer/OptReductionPass.cpp b/mlir/lib/Reducer/OptReductionPass.cpp index 8618de5eeee7b..fbbe75d303883 100644 --- a/mlir/lib/Reducer/OptReductionPass.cpp +++ b/mlir/lib/Reducer/OptReductionPass.cpp @@ -16,10 +16,11 @@ #include "mlir/Pass/PassRegistry.h" #include "mlir/Reducer/Passes.h" #include "mlir/Reducer/Tester.h" + #include "llvm/Support/Debug.h" namespace mlir { -#define GEN_PASS_DEF_OPTREDUCTION +#define GEN_PASS_DEF_OPTREDUCTIONPASS #include "mlir/Reducer/Passes.h.inc" } // namespace mlir @@ -29,8 +30,10 @@ using namespace mlir; namespace { -class OptReductionPass : public impl::OptReductionBase { +class OptReductionPass : public impl::OptReductionPassBase { public: + using Base::Base; + /// Runs the pass instance in the pass pipeline. void runOnOperation() override; }; @@ -85,8 +88,4 @@ void OptReductionPass::runOnOperation() { moduleVariant->destroy(); LLVM_DEBUG(llvm::dbgs() << "Pass Complete\n\n"); -} - -std::unique_ptr mlir::createOptReductionPass() { - return std::make_unique(); -} +} \ No newline at end of file diff --git a/mlir/lib/Reducer/ReductionTreePass.cpp b/mlir/lib/Reducer/ReductionTreePass.cpp index ef32adbab5577..7292752c712ae 100644 --- a/mlir/lib/Reducer/ReductionTreePass.cpp +++ b/mlir/lib/Reducer/ReductionTreePass.cpp @@ -29,7 +29,7 @@ #include "llvm/Support/ManagedStatic.h" namespace mlir { -#define GEN_PASS_DEF_REDUCTIONTREE +#define GEN_PASS_DEF_REDUCTIONTREEPASS #include "mlir/Reducer/Passes.h.inc" } // namespace mlir @@ -191,10 +191,10 @@ class ReductionPatternInterfaceCollection /// This class defines the Reduction Tree Pass. It provides a framework to /// to implement a reduction pass using a tree structure to keep track of the /// generated reduced variants. 
-class ReductionTreePass : public impl::ReductionTreeBase { +class ReductionTreePass + : public impl::ReductionTreePassBase { public: - ReductionTreePass() = default; - ReductionTreePass(const ReductionTreePass &pass) = default; + using Base::Base; LogicalResult initialize(MLIRContext *context) override; @@ -256,7 +256,3 @@ LogicalResult ReductionTreePass::reduceOp(ModuleOp module, Region ®ion) { return module.emitError() << "unsupported traversal mode detected"; } } - -std::unique_ptr mlir::createReductionTreePass() { - return std::make_unique(); -} From 0df0906af67fe81c0cc3ed6230518f4314884389 Mon Sep 17 00:00:00 2001 From: tianleliu Date: Tue, 8 Apr 2025 16:32:34 +0800 Subject: [PATCH 0948/1029] [JumpThreading] Use [BB->SuccIndx] to get probability when updating BB info. (#134585) In case the same src BB targets to the same dest BB in different conditions/edges, such as switch-cases, we should use prob[SrcBB->SuccIndx] instead of prob[SrcBB->DstBB] to get probability. --- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 9 ++-- .../Transforms/JumpThreading/thread-prob-8.ll | 42 +++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/JumpThreading/thread-prob-8.ll diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 3548412001ac6..a518e02d762f6 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -2546,17 +2546,16 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB, // frequency of BB. auto BBOrigFreq = BFI->getBlockFreq(BB); auto NewBBFreq = BFI->getBlockFreq(NewBB); - auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB); auto BBNewFreq = BBOrigFreq - NewBBFreq; BFI->setBlockFreq(BB, BBNewFreq); // Collect updated outgoing edges' frequencies from BB and use them to update // edge probabilities. SmallVector BBSuccFreq; - for (BasicBlock *Succ : successors(BB)) { - auto SuccFreq = (Succ == SuccBB) - ? BB2SuccBBFreq - NewBBFreq - : BBOrigFreq * BPI->getEdgeProbability(BB, Succ); + for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) { + auto BB2SuccBBFreq = + BBOrigFreq * BPI->getEdgeProbability(BB, I.getSuccessorIndex()); + auto SuccFreq = (*I == SuccBB) ? BB2SuccBBFreq - NewBBFreq : BB2SuccBBFreq; BBSuccFreq.push_back(SuccFreq.getFrequency()); } diff --git a/llvm/test/Transforms/JumpThreading/thread-prob-8.ll b/llvm/test/Transforms/JumpThreading/thread-prob-8.ll new file mode 100644 index 0000000000000..b63c789515966 --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/thread-prob-8.ll @@ -0,0 +1,42 @@ +; RUN: opt -debug-only=branch-prob -passes=jump-threading -S %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +; Make sure that edges' probabilities would not accumulate if they are +; the same target BB. 
+; Edge L0 -> 2 and L0 -> 3 's targets are both L2, but their respective +; probability should not be L0 -> L2, because prob[L0->L2] equls to +; prob[L0->2] + prob[L0->3] + +; CHECK: Computing probabilities for entry +; CHECK: eraseBlock L0 +; CHECK-NOT: set edge L0 -> 0 successor probability to 0x12492492 / 0x80000000 = 14.29% +; CHECK-NOT: set edge L0 -> 1 successor probability to 0x24924925 / 0x80000000 = 28.57% +; CHECK-NOT: set edge L0 -> 2 successor probability to 0x24924925 / 0x80000000 = 28.57% +; CHECK-NOT: set edge L0 -> 3 successor probability to 0x24924925 / 0x80000000 = 28.57% +; CHECK: set edge L0 -> 0 successor probability to 0x1999999a / 0x80000000 = 20.00% +; CHECK: set edge L0 -> 1 successor probability to 0x33333333 / 0x80000000 = 40.00% +; CHECK: set edge L0 -> 2 successor probability to 0x1999999a / 0x80000000 = 20.00% +; CHECK: set edge L0 -> 3 successor probability to 0x1999999a / 0x80000000 = 20.00% +; CHECK-NOT: !0 = !{!"branch_weights", i32 306783378, i32 613566757, i32 613566757, i32 613566757} +; CHECK: !0 = !{!"branch_weights", i32 429496730, i32 858993459, i32 429496730, i32 429496730} +define void @test_switch(i1 %cond, i8 %value) nounwind { +entry: + br i1 %cond, label %L0, label %L4 +L0: + %expr = select i1 %cond, i8 1, i8 %value + switch i8 %expr, label %L3 [ + i8 1, label %L1 + i8 2, label %L2 + i8 3, label %L2 + ], !prof !0 + +L1: + ret void +L2: + ret void +L3: + ret void +L4: + br label %L0 +} +!0 = !{!"branch_weights", i32 1, i32 7, i32 1, i32 1} From ccdc44f643e5158391e618593950927dd1a02e5b Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Tue, 8 Apr 2025 01:32:45 -0700 Subject: [PATCH 0949/1029] [mlir][tosa] Remove perms input for tosa.transpose tests (#134740) Perms is now an attribute, not input. Signed-off-by: Jerry Ge --- mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir | 2 -- 1 file changed, 2 deletions(-) diff --git a/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir b/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir index b94c20835a957..b3f4260ede2f5 100644 --- a/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir +++ b/mlir/test/Dialect/Tosa/tosa-reduce-transposes.mlir @@ -136,7 +136,6 @@ func.func @test_mulop_conversion(%arg0: tensor<1x2x3x4xi32>, %arg1: tensor<1x2x3 // CHECK: tosa.transpose %[[RESHAPED]] {perms = array} : (tensor<1x3x2xi32>) -> tensor<1x2x3xi32> func.func @test_basic_non_broadcasting_reshape(%arg0: tensor<2x3xi32>) -> tensor<1x2x3xi32> { %shape = tosa.const_shape {values = dense<[1, 3, 2]> : tensor<3xindex>} : () -> !tosa.shape<3> - %perms = "tosa.const"() {values = dense<[0, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %1 = tosa.reshape %arg0, %shape : (tensor<2x3xi32>, !tosa.shape<3>) -> tensor<1x3x2xi32> %2 = tosa.transpose %1 {perms = array}: (tensor<1x3x2xi32>) -> tensor<1x2x3xi32> return %2 : tensor<1x2x3xi32> @@ -150,7 +149,6 @@ func.func @test_basic_non_broadcasting_reshape(%arg0: tensor<2x3xi32>) -> tensor // CHECK: return %[[RES]] func.func @test_dynamic_broadcasting_reshape(%arg0: tensor) -> tensor<1x1x?xi32> { %shape = tosa.const_shape {values = dense<[1, -1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> - %perms = "tosa.const"() {values = dense<[0, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %1 = tosa.reshape %arg0, %shape : (tensor, !tosa.shape<3>) -> tensor<1x?x1xi32> %2 = tosa.transpose %1 {perms = array}: (tensor<1x?x1xi32>) -> tensor<1x1x?xi32> return %2 : tensor<1x1x?xi32> From f4328d0d3aee682e77ce070e1835158193dd8dcd Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Tue, 8 Apr 2025 01:33:35 -0700 
Subject: [PATCH 0950/1029] [mlir][tosa] Remove out_shape attribute from transpose_2d attributes (#134743) out_shape is no longer an attribute Signed-off-by: Jerry Ge --- mlir/test/Dialect/Tosa/level_check.mlir | 16 ++++++++-------- mlir/test/Dialect/Tosa/ops.mlir | 4 ++-- .../Tosa/tosa-decompose-transpose-conv.mlir | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/mlir/test/Dialect/Tosa/level_check.mlir b/mlir/test/Dialect/Tosa/level_check.mlir index 12addcd315449..8c3b2e526e444 100644 --- a/mlir/test/Dialect/Tosa/level_check.mlir +++ b/mlir/test/Dialect/Tosa/level_check.mlir @@ -889,7 +889,7 @@ func.func @test_rfft2d_input_w(%arg0: tensor<13x8x16384xf32>) -> (tensor<13x8x81 func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x8193x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x8224x32x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KH <= MAX_KERNEL}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x8193x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x8224x32x16xf32> return %0 : tensor<1x8224x32x16xf32> } @@ -898,7 +898,7 @@ func.func @test_transpose_conv2d_weight_h(%arg0: tensor<1x32x32x8xf32>, %arg1: t func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x8193x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x8224x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: KW <= MAX_KERNEL}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x8193x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x8224x16xf32> return %0 : tensor<1x32x8224x16xf32> } @@ -907,7 +907,7 @@ func.func @test_transpose_conv2d_weight_w(%arg0: tensor<1x32x32x8xf32>, %arg1: t func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x8225x32x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x8225x32x16xf32> return %0 : tensor<1x8225x32x16xf32> } @@ -916,7 +916,7 @@ func.func @test_transpose_conv2d_pad_top(%arg0: tensor<1x32x32x8xf32>, %arg1: te func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x8225x32x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = 
tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x8225x32x16xf32> return %0 : tensor<1x8225x32x16xf32> } @@ -925,7 +925,7 @@ func.func @test_transpose_conv2d_pad_bottom(%arg0: tensor<1x32x32x8xf32>, %arg1: func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x8225x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x8225x16xf32> return %0 : tensor<1x32x8225x16xf32> } @@ -934,7 +934,7 @@ func.func @test_transpose_conv2d_pad_left(%arg0: tensor<1x32x32x8xf32>, %arg1: t func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x8225x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: pad <= MAX_KERNEL}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x8225x16xf32> return %0 : tensor<1x32x8225x16xf32> } @@ -943,7 +943,7 @@ func.func @test_transpose_conv2d_pad_right(%arg0: tensor<1x32x32x8xf32>, %arg1: func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x253984x32x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x253984x32x16xf32> return %0 : tensor<1x253984x32x16xf32> } @@ -952,7 +952,7 @@ func.func @test_transpose_conv2d_stride_y(%arg0: tensor<1x32x32x8xf32>, %arg1: t func.func @test_transpose_conv2d_stride_x(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x253984x16xf32> { // expected-error@+1 {{'tosa.transpose_conv2d' op failed level check: stride <= MAX_STRIDE}} - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x253984x16xf32> return %0 : tensor<1x32x253984x16xf32> } diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir 
index 017fc9615345d..cc8c298b50ad6 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -198,14 +198,14 @@ func.func @test_rfft2d_with_local_bound(%arg0: tensor<13x8x16xf32>) -> (tensor<1 // ----- // CHECK-LABEL: transpose_conv2d func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> return %0 : tensor<1x32x32x16xf32> } // ----- // CHECK-LABEL: transpose_conv2d_with_local_bound func.func @test_transpose_conv2d_with_local_bound(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x32x32x16xf32> { - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, out_shape = array, stride = array, local_bound = false} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, out_pad = array, stride = array, local_bound = false} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x32x16xf32> return %0 : tensor<1x32x32x16xf32> } diff --git a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir index 0f56693edd160..810135f6f531b 100644 --- a/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir +++ b/mlir/test/Dialect/Tosa/tosa-decompose-transpose-conv.mlir @@ -8,7 +8,7 @@ func.func @transpose_conv2d(%arg0: tensor<2x16x14x3xf32>, %arg1: tensor<5x3x6x3x // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2, %[[ZP]], %[[ZP]] // CHECK-SAME: dilation = array, pad = array, stride = array %zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %zp, %zp {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x18x19x5xf32> + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %zp, %zp {acc_type = f32, out_pad = array, stride = array} : (tensor<2x16x14x3xf32>, tensor<5x3x6x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x18x19x5xf32> return %0 : tensor<2x18x19x5xf32> } @@ -24,7 +24,7 @@ func.func @transpose_conv2d_quantized(%arg0: tensor<2x16x14x3xi8>, %arg1: tensor // CHECK: tosa.conv2d %arg0, %[[REV2]], %arg2, %[[INPUT_ZP]], %[[WEIGHT_ZP]] {acc_type = i32, dilation = array, pad = array, stride = array} %input_zp = "tosa.const"() {values = dense<-6> : tensor<1xi8>} : () -> tensor<1xi8> %weight_zp = "tosa.const"() {values = dense<11> : tensor<1xi8>} : () -> tensor<1xi8> - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = i32, out_pad = array, out_shape = array, stride = array} : (tensor<2x16x14x3xi8>, tensor<5x3x6x3xi8>, tensor<5xi32>, tensor<1xi8>, tensor<1xi8>) -> 
tensor<2x18x19x5xi32> + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %input_zp, %weight_zp {acc_type = i32, out_pad = array, stride = array} : (tensor<2x16x14x3xi8>, tensor<5x3x6x3xi8>, tensor<5xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<2x18x19x5xi32> return %0 : tensor<2x18x19x5xi32> } @@ -82,7 +82,7 @@ func.func @transpose_conv2d_strided(%arg0: tensor<2x17x15x3xf32>, %arg1: tensor< // CHECK-DAG: %[[RESHAPE_ARG2:.+]] = tosa.reshape %arg2, %[[CONST9]] // CHECK: %[[ADD:.+]] = tosa.add %[[SLICE]], %[[RESHAPE_ARG2]] %zp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> - %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %zp, %zp {acc_type = f32, out_pad = array, out_shape = array, stride = array} : (tensor<2x17x15x3xf32>, tensor<5x3x5x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x35x47x5xf32> + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %zp, %zp {acc_type = f32, out_pad = array, stride = array} : (tensor<2x17x15x3xf32>, tensor<5x3x5x3xf32>, tensor<5xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x35x47x5xf32> %1 = tensor.cast %0 : tensor<2x35x47x5xf32> to tensor<2x?x?x5xf32> return %1 : tensor<2x?x?x5xf32> } From f0bdeb4b6aa1ee0653aed8323a5a7485fb930a05 Mon Sep 17 00:00:00 2001 From: Jerry-Ge Date: Tue, 8 Apr 2025 01:34:40 -0700 Subject: [PATCH 0951/1029] [mlir][tosa] Cleanup ops.mlir (#134751) * add missing CHECK-LABEL * removed whitespace for consistency Signed-off-by: Jerry Ge --- mlir/test/Dialect/Tosa/ops.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index cc8c298b50ad6..248d84da6b8b9 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -852,7 +852,6 @@ func.func @test_max_pool2d_f8E5M2(%arg0: tensor<1x32x32x8xf8E5M2>) -> tensor<1x3 } // ----- - // CHECK-LABEL: transpose_conv2d_f8E5M2 func.func @test_transpose_conv2d_f8E5M2(%arg0: tensor<1x32x32x8xf8E5M2>, %arg1: tensor<16x1x1x8xf8E5M2>, %arg2: tensor<16xf16>, %arg3: tensor<1xf8E5M2>, %arg4: tensor<1xf8E5M2>) -> tensor<1x32x32x16xf16> { %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f16, out_pad = array, stride = array} : (tensor<1x32x32x8xf8E5M2>, tensor<16x1x1x8xf8E5M2>, tensor<16xf16>, tensor<1xf8E5M2>, tensor<1xf8E5M2>) -> tensor<1x32x32x16xf16> @@ -922,6 +921,7 @@ func.func @test_tile_f8E5M2(%arg0: tensor<13x21x3xf8E5M2>) -> tensor<39x21x6xf8E } // ----- +// CHECK-LABEL: transpose_f8E5M2 func.func @test_transpose_f8E5M2(%arg0: tensor<13x21x3xf8E5M2>) -> tensor<3x13x21xf8E5M2> { %1 = tosa.transpose %arg0 {perms = array} : (tensor<13x21x3xf8E5M2>) -> tensor<3x13x21xf8E5M2> return %1 : tensor<3x13x21xf8E5M2> From bb5006169f9f72a87b4358356976e0fa33353728 Mon Sep 17 00:00:00 2001 From: Pedro Lobo Date: Tue, 8 Apr 2025 09:50:48 +0100 Subject: [PATCH 0952/1029] [CodeGen] Change placeholder from `undef` to `poison` (#134731) Fill default values of a map with `poison` instead of `undef`. There should be no functional difference as the default values are overridden later. 
--- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index ba5f74f153d59..7bef436302526 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -370,7 +370,7 @@ class MicrosoftCXXABI : public CGCXXABI { MicrosoftVTableContext &VTContext = CGM.getMicrosoftVTableContext(); unsigned NumEntries = 1 + SrcRD->getNumVBases(); SmallVector Map(NumEntries, - llvm::UndefValue::get(CGM.IntTy)); + llvm::PoisonValue::get(CGM.IntTy)); Map[0] = llvm::ConstantInt::get(CGM.IntTy, 0); bool AnyDifferent = false; for (const auto &I : SrcRD->vbases()) { From e1fc118f3a2f3fb91a3045ce37a5259430594afc Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 8 Apr 2025 09:53:33 +0100 Subject: [PATCH 0953/1029] [CI] Reduce false positives in undef checker (#134687) Only check for diffs containing "undef" in .ll files; this prevents comments like `// We should not have undef values...` from triggering the undef checker bot. --- llvm/utils/git/code-format-helper.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index cb1e56859d083..da1b4cdad6978 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -376,14 +376,25 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str sys.stdout.write(proc.stderr) stdout = proc.stdout + if not stdout: + return None + files = [] + # Split the diff so we have one array entry per file. # Each file is prefixed like: # diff --git a/file b/file for file in re.split("^diff --git ", stdout, 0, re.MULTILINE): + filename = re.match("a/([^ ]+)", file.splitlines()[0])[1] + if filename.endswith(".ll"): + undef_regex = r"\bundef\b" + else: + undef_regex = r"UndefValue::get" # search for additions of undef - if re.search(r"^[+](?!\s*#\s*).*(\bundef\b|UndefValue::get)", file, re.MULTILINE): - files.append(re.match("a/([^ ]+)", file.splitlines()[0])[1]) + if re.search( + r"^[+](?!\s*#\s*).*(" + undef_regex + r")", file, re.MULTILINE + ): + files.append(filename) if not files: return None From 93505f8e0e702f470fdc37ea9a9ca5f04d011c19 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 8 Apr 2025 09:56:26 +0100 Subject: [PATCH 0954/1029] [DebugInfo][InstCombine] Propagate DILocation when noop-ing invoke (#134678) In InstCombine we may decide that an alloc is removable; if the alloc fn is called by an InvokeInst, we replace that InvokeInst with an invoke of a noop intrinsic; this patch has us also copy the original invoke's DILocation to the new noop invoke. Found using https://github.com/llvm/llvm-project/pull/107279.
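In code, the fix keeps the replacement invoke and gives it the old location (a sketch of the hunk below):

    // Replace the invoke with a NOP intrinsic to maintain the original CFG.
    auto *NewII = InvokeInst::Create(F, II->getNormalDest(),
                                     II->getUnwindDest(), {}, "",
                                     II->getParent());
    // New: carry the original invoke's DILocation over to the replacement.
    NewII->setDebugLoc(II->getDebugLoc());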
--- .../InstCombine/InstructionCombining.cpp | 5 +- .../InstCombine/debuginfo-invoke.ll | 51 +++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/debuginfo-invoke.ll diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 3a2fa154b0fdd..856e02c9f1ddb 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3406,8 +3406,9 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { // Replace invoke with a NOP intrinsic to maintain the original CFG Module *M = II->getModule(); Function *F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::donothing); - InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), {}, "", - II->getParent()); + auto *NewII = InvokeInst::Create( + F, II->getNormalDest(), II->getUnwindDest(), {}, "", II->getParent()); + NewII->setDebugLoc(II->getDebugLoc()); } // Remove debug intrinsics which describe the value contained within the diff --git a/llvm/test/Transforms/InstCombine/debuginfo-invoke.ll b/llvm/test/Transforms/InstCombine/debuginfo-invoke.ll new file mode 100644 index 0000000000000..287582b9baf3b --- /dev/null +++ b/llvm/test/Transforms/InstCombine/debuginfo-invoke.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=instcombine -S %s -o - | FileCheck %s + +define void @foo() personality ptr null !dbg !4 { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ) personality ptr null !dbg [[DBG3:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: invoke void @llvm.donothing() +; CHECK-NEXT: to label %[[COMMON_RET:.*]] unwind label %[[LPAD159:.*]], !dbg [[DBG7:![0-9]+]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[LPAD159]]: +; CHECK-NEXT: [[TMP0:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: br label %[[COMMON_RET]] +; +entry: + %call.i.i895904 = invoke ptr @_Znam(i64 0) + to label %common.ret unwind label %lpad159, !dbg !9 + +common.ret: ; preds = %lpad159, %entry + ret void + +lpad159: ; preds = %entry + %0 = landingpad { ptr, i32 } + cleanup + br label %common.ret +} + +declare ptr @_Znam(i64) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git") +!1 = !DIFile(filename: "ArchiveCommandLine.cpp", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 13, type: !6, scopeLine: 13, unit: !0, retainedNodes: !2) +!6 = distinct !DISubroutineType(types: !7) +!7 = !{null} +!9 = !DILocation(line: 14, column: 20, scope: !4) +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CHECK: [[META1]] = !DIFile(filename: "ArchiveCommandLine.cpp", directory: {{.*}}) +; CHECK: [[DBG3]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 13, type: [[META4:![0-9]+]], scopeLine: 13, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]]) +; CHECK: [[META4]] = distinct !DISubroutineType(types: [[META5:![0-9]+]]) +; CHECK: [[META5]] = !{null} +; CHECK: [[META6]] = !{} +; CHECK: [[DBG7]] = !DILocation(line: 14, column: 20, scope: [[DBG3]]) +;. From 9d82ab8a8222f22048074488d5036ae5228088c5 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 8 Apr 2025 10:02:24 +0100 Subject: [PATCH 0955/1029] [BasicAA] Add some test cases for coerced function args --- .../test/Analysis/BasicAA/noalias-inttoptr.ll | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll b/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll index 24bbcc55b3202..72f9adc5a74bf 100644 --- a/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll +++ b/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll @@ -58,3 +58,51 @@ define void @test5(i64 %Q_as_int) { store i8 1, ptr %Q ret void } + +; Verify that extractvalue of a coerced ptr argument array are NoAlias a function local object +define void @test_extractvalue([2 x ptr] %Q.coerce) { + ; CHECK-LABEL: Function: test_extractvalue: + ; CHECK: NoAlias: i8* %P, i8* %Q + %P = alloca i8 + %Q = extractvalue [2 x ptr] %Q.coerce, 1 + store i8 0, ptr %P + store i8 1, ptr %Q + ret void +} + +; Same as test_extractvalue with an escape of %P +define void @test_extractvalue_escape([2 x ptr] %Q.coerce) { + ; CHECK-LABEL: Function: test_extractvalue_escape: + ; CHECK: MayAlias: i8* %P, i8* %Q + %P = alloca i8 + call void @escape(ptr %P) + %Q = extractvalue [2 x ptr] %Q.coerce, 1 + store i8 0, ptr %P + store i8 1, ptr %Q + ret void +} + +; Verify that extractvalue of a coerced ptr argument array are NoAlias a function local object +define void @test_extractvalue_int([2 x i64] %Q.coerce) { + ; CHECK-LABEL: Function: test_extractvalue_int: + ; CHECK: NoAlias: i8* %P, i8* %Q + %P = alloca i8 + %Q_as_int = extractvalue [2 x i64] %Q.coerce, 1 + %Q = inttoptr i64 %Q_as_int to ptr + store i8 0, ptr %P + store i8 1, ptr %Q + ret void +} + +; Same as test_extractvalue_int with an escape of %P +define void @test_extractvalue_int_escape([2 x i64] %Q.coerce) { + ; CHECK-LABEL: Function: test_extractvalue_int_escape: + ; CHECK: MayAlias: i8* %P, i8* %Q + %P = alloca i8 + call void @escape(ptr %P) + %Q_as_int = extractvalue [2 x i64] %Q.coerce, 1 + %Q = inttoptr i64 %Q_as_int to ptr + store i8 0, ptr %P + store i8 1, ptr %Q + ret void +} \ No newline at end of file From c23e1cb9362067d3565a78590db8c5e68b74cf17 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 8 Apr 2025 10:05:58 +0100 Subject: [PATCH 0956/1029] [BasicAA] Treat ExtractValue(Argument) similar to Argument in relation to function-local objects. (#134716) This is a much smaller, technically orthogonal patch similar to #134505. It states that a extractvalue(Argument) can be treated like an Argument for alias analysis, where the extractelement acts like a phi / copy. No inttoptr here. 
--- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 14 ++++++++++++-- llvm/test/Analysis/BasicAA/noalias-inttoptr.ll | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index 4d1a95a0c4b43..12d9c8706a8e1 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1533,6 +1533,16 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize, return Alias; } +// Return true for an Argument or extractvalue(Argument). These are all known +// to not alias with FunctionLocal objects and can come up from coerced function +// arguments. +static bool isArgumentOrArgumentLike(const Value *V) { + if (isa(V)) + return true; + auto *E = dyn_cast(V); + return E && isa(E->getOperand(0)); +} + /// Provides a bunch of ad-hoc rules to disambiguate in common cases, such as /// array references. AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, @@ -1585,8 +1595,8 @@ AliasResult BasicAAResult::aliasCheck(const Value *V1, LocationSize V1Size, // Function arguments can't alias with things that are known to be // unambigously identified at the function level. - if ((isa(O1) && isIdentifiedFunctionLocal(O2)) || - (isa(O2) && isIdentifiedFunctionLocal(O1))) + if ((isArgumentOrArgumentLike(O1) && isIdentifiedFunctionLocal(O2)) || + (isArgumentOrArgumentLike(O2) && isIdentifiedFunctionLocal(O1))) return AliasResult::NoAlias; // If one pointer is the result of a call/invoke or load and the other is a diff --git a/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll b/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll index 72f9adc5a74bf..cdfdb091f668f 100644 --- a/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll +++ b/llvm/test/Analysis/BasicAA/noalias-inttoptr.ll @@ -73,7 +73,7 @@ define void @test_extractvalue([2 x ptr] %Q.coerce) { ; Same as test_extractvalue with an escape of %P define void @test_extractvalue_escape([2 x ptr] %Q.coerce) { ; CHECK-LABEL: Function: test_extractvalue_escape: - ; CHECK: MayAlias: i8* %P, i8* %Q + ; CHECK: NoAlias: i8* %P, i8* %Q %P = alloca i8 call void @escape(ptr %P) %Q = extractvalue [2 x ptr] %Q.coerce, 1 From fe4f66636300e908e29f6caa69e0bda6d0d6422c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Tue, 8 Apr 2025 11:16:24 +0200 Subject: [PATCH 0957/1029] [CI] Always upload queue/running count (#134814) Before this commit, we only pushed a queue/running count when the value was not zero. This makes building Grafana alerting a bit harder. Changing this to always upload a value for watched workflows. --- .ci/metrics/metrics.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py index 8069e262a68ec..a5ee893650d63 100644 --- a/.ci/metrics/metrics.py +++ b/.ci/metrics/metrics.py @@ -282,6 +282,13 @@ def github_get_metrics( queued_count = collections.Counter() running_count = collections.Counter() + # Initialize all the counters to 0 so we report 0 when no job is queued + # or running. + for wf_name, wf_metric_name in GITHUB_WORKFLOW_TO_TRACK.items(): + for job_name, job_metric_name in GITHUB_JOB_TO_TRACK[wf_metric_name].items(): + queued_count[wf_metric_name + "_" + job_metric_name] = 0 + running_count[wf_metric_name + "_" + job_metric_name] = 0 + # The list of workflows this iteration will process. 
# MaxSize = GITHUB_WORKFLOWS_MAX_PROCESS_COUNT workflow_seen_as_completed = set() From 446d4f51eb1a172776e69ffb51b5972a0225c0a1 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 8 Apr 2025 10:27:27 +0100 Subject: [PATCH 0958/1029] [flang][OpenMP][Lower] fix statement context cleanup insertion point (#133891) The statement context is used for lowering clauses for openmp operations using generalised helpers from flang lowering. The statement context stores closures which generate code for cleaning up temporary values generated by the lowering helper. These closures are run when the statement construct is destroyed. Keeping the statement context local to the clause or operation being lowered without any special handling was not correct because any cleanup code would be generated at the insertion point when that statement context went out of scope (which would in general be inside of the newly created container operation). It would be better to generate the cleanup code after the newly created operation (clause processing is synchronous even for deferred tasks). Currently supported clauses are mostly populated with simple scalar values that require no cleanup. Even the simple array sections added by #132994 needed no cleanup because indexing the right values of the array did not create any temporaries. Supporting array sections with vector indexing will generate hlfir.destroy operations for cleanup. This patch fixes where those will be created. Those hlfir.destroy operations don't generate any FIR (or LLVM) code, but the issue still exists theoretically. I wasn't able to find any clauses which have any cleanup to use to test this PR. It is probably NFC for the current lowering. This will be tested in [the PR adding vector subscripting of array sections](https://github.com/llvm/llvm-project/pull/133892). --- flang/lib/Lower/OpenMP/OpenMP.cpp | 323 +++++++++++---------- flang/test/Lower/OpenMP/clause-cleanup.f90 | 17 ++ 2 files changed, 181 insertions(+), 159 deletions(-) create mode 100644 flang/test/Lower/OpenMP/clause-cleanup.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index b04d57ec30e4f..384d528ca2e63 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1770,7 +1770,6 @@ static void genTaskClauses(lower::AbstractConverter &converter, cp.processPriority(stmtCtx, clauseOps); cp.processUntied(clauseOps); cp.processDetach(clauseOps); - // TODO Support delayed privatization. 
cp.processTODO( loc, llvm::omp::Directive::OMPD_task); @@ -1921,12 +1920,11 @@ static mlir::omp::LoopNestOp genLoopNestOp( queue, item, clauseOps); } -static void genLoopOp(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { +static mlir::omp::LoopOp +genLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, + mlir::Location loc, const ConstructQueue &queue, + ConstructQueue::const_iterator item) { mlir::omp::LoopOperands loopClauseOps; llvm::SmallVector loopReductionSyms; genLoopClauses(converter, semaCtx, item->clauses, loc, loopClauseOps, @@ -1953,14 +1951,15 @@ static void genLoopOp(lower::AbstractConverter &converter, genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, loopNestClauseOps, iv, {{loopOp, loopArgs}}, llvm::omp::Directive::OMPD_loop, dsp); + return loopOp; } static mlir::omp::MaskedOp genMaskedOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; mlir::omp::MaskedOperands clauseOps; genMaskedClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); @@ -2164,13 +2163,13 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return sectionsOp; } -static void genScopeOp(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { +static mlir::Operation * +genScopeOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, + mlir::Location loc, const ConstructQueue &queue, + ConstructQueue::const_iterator item) { TODO(loc, "Scope construct"); + return nullptr; } static mlir::omp::SingleOp @@ -2190,11 +2189,11 @@ genSingleOp(lower::AbstractConverter &converter, lower::SymMap &symTable, static mlir::omp::TargetOp genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - lower::StatementContext stmtCtx; bool isTargetDevice = llvm::cast(*converter.getModuleOp()) .getIsTargetDevice(); @@ -2373,13 +2372,11 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return targetOp; } -static mlir::omp::TargetDataOp -genTargetDataOp(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; +static mlir::omp::TargetDataOp genTargetDataOp( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::TargetDataOperands 
clauseOps; llvm::SmallVector useDeviceAddrSyms, useDevicePtrSyms; @@ -2409,10 +2406,10 @@ genTargetDataOp(lower::AbstractConverter &converter, lower::SymMap &symTable, template static OpTy genTargetEnterExitUpdateDataOp( lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, mlir::Location loc, - const ConstructQueue &queue, ConstructQueue::const_iterator item) { + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + mlir::Location loc, const ConstructQueue &queue, + ConstructQueue::const_iterator item) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - lower::StatementContext stmtCtx; // GCC 9.3.0 emits a (probably) bogus warning about an unused variable. [[maybe_unused]] llvm::omp::Directive directive; @@ -2435,10 +2432,10 @@ static OpTy genTargetEnterExitUpdateDataOp( static mlir::omp::TaskOp genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; mlir::omp::TaskOperands clauseOps; genTaskClauses(converter, semaCtx, symTable, stmtCtx, item->clauses, loc, clauseOps); @@ -2505,13 +2502,11 @@ genTaskyieldOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return converter.getFirOpBuilder().create(loc); } -static mlir::omp::WorkshareOp -genWorkshareOp(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; +static mlir::omp::WorkshareOp genWorkshareOp( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::WorkshareOperands clauseOps; genWorkshareClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); @@ -2525,11 +2520,10 @@ genWorkshareOp(lower::AbstractConverter &converter, lower::SymMap &symTable, static mlir::omp::TeamsOp genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, mlir::Location loc, const ConstructQueue &queue, ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - mlir::omp::TeamsOperands clauseOps; llvm::SmallVector reductionSyms; genTeamsClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps, @@ -2553,15 +2547,11 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, // also be a leaf of a composite construct //===----------------------------------------------------------------------===// -static void genStandaloneDistribute(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - +static mlir::omp::DistributeOp genStandaloneDistribute( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const 
ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::DistributeOperands distributeClauseOps; genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc, distributeClauseOps); @@ -2585,16 +2575,14 @@ static void genStandaloneDistribute(lower::AbstractConverter &converter, genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, loopNestClauseOps, iv, {{distributeOp, distributeArgs}}, llvm::omp::Directive::OMPD_distribute, dsp); + return distributeOp; } -static void genStandaloneDo(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - +static mlir::omp::WsloopOp genStandaloneDo( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::WsloopOperands wsloopClauseOps; llvm::SmallVector wsloopReductionSyms; genWsloopClauses(converter, semaCtx, stmtCtx, item->clauses, loc, @@ -2621,17 +2609,14 @@ static void genStandaloneDo(lower::AbstractConverter &converter, genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, loopNestClauseOps, iv, {{wsloopOp, wsloopArgs}}, llvm::omp::Directive::OMPD_do, dsp); + return wsloopOp; } -static void genStandaloneParallel(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - +static mlir::omp::ParallelOp genStandaloneParallel( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { mlir::omp::ParallelOperands parallelClauseOps; llvm::SmallVector parallelReductionSyms; genParallelClauses(converter, semaCtx, stmtCtx, item->clauses, loc, @@ -2651,17 +2636,17 @@ static void genStandaloneParallel(lower::AbstractConverter &converter, parallelArgs.priv.vars = parallelClauseOps.privateVars; parallelArgs.reduction.syms = parallelReductionSyms; parallelArgs.reduction.vars = parallelClauseOps.reductionVars; - genParallelOp(converter, symTable, semaCtx, eval, loc, queue, item, - parallelClauseOps, parallelArgs, - enableDelayedPrivatization ? &dsp.value() : nullptr); + return genParallelOp(converter, symTable, semaCtx, eval, loc, queue, item, + parallelClauseOps, parallelArgs, + enableDelayedPrivatization ? 
&dsp.value() : nullptr); } -static void genStandaloneSimd(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { +static mlir::omp::SimdOp +genStandaloneSimd(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, + ConstructQueue::const_iterator item) { mlir::omp::SimdOperands simdClauseOps; llvm::SmallVector simdReductionSyms; genSimdClauses(converter, semaCtx, item->clauses, loc, simdClauseOps, @@ -2688,29 +2673,27 @@ static void genStandaloneSimd(lower::AbstractConverter &converter, genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, loopNestClauseOps, iv, {{simdOp, simdArgs}}, llvm::omp::Directive::OMPD_simd, dsp); + return simdOp; } -static void genStandaloneTaskloop(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { +static mlir::omp::TaskloopOp genStandaloneTaskloop( + lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, + mlir::Location loc, const ConstructQueue &queue, + ConstructQueue::const_iterator item) { TODO(loc, "Taskloop construct"); + return nullptr; } //===----------------------------------------------------------------------===// // Code generation functions for composite constructs //===----------------------------------------------------------------------===// -static void genCompositeDistributeParallelDo( +static mlir::omp::DistributeOp genCompositeDistributeParallelDo( lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::Location loc, const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { assert(std::distance(item, queue.end()) == 3 && "Invalid leaf constructs"); ConstructQueue::const_iterator distributeItem = item; ConstructQueue::const_iterator parallelItem = std::next(distributeItem); @@ -2769,15 +2752,14 @@ static void genCompositeDistributeParallelDo( loopNestClauseOps, iv, {{distributeOp, distributeArgs}, {wsloopOp, wsloopArgs}}, llvm::omp::Directive::OMPD_distribute_parallel_do, dsp); + return distributeOp; } -static void genCompositeDistributeParallelDoSimd( +static mlir::omp::DistributeOp genCompositeDistributeParallelDoSimd( lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - mlir::Location loc, const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { assert(std::distance(item, queue.end()) == 4 && "Invalid leaf constructs"); ConstructQueue::const_iterator distributeItem = item; ConstructQueue::const_iterator parallelItem = 
std::next(distributeItem); @@ -2861,17 +2843,14 @@ static void genCompositeDistributeParallelDoSimd( {simdOp, simdArgs}}, llvm::omp::Directive::OMPD_distribute_parallel_do_simd, simdItemDSP); + return distributeOp; } -static void genCompositeDistributeSimd(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - +static mlir::omp::DistributeOp genCompositeDistributeSimd( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs"); ConstructQueue::const_iterator distributeItem = item; ConstructQueue::const_iterator simdItem = std::next(distributeItem); @@ -2918,16 +2897,14 @@ static void genCompositeDistributeSimd(lower::AbstractConverter &converter, loopNestClauseOps, iv, {{distributeOp, distributeArgs}, {simdOp, simdArgs}}, llvm::omp::Directive::OMPD_distribute_simd, dsp); + return distributeOp; } -static void genCompositeDoSimd(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { - lower::StatementContext stmtCtx; - +static mlir::omp::WsloopOp genCompositeDoSimd( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs"); ConstructQueue::const_iterator doItem = item; ConstructQueue::const_iterator simdItem = std::next(doItem); @@ -2977,30 +2954,29 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter, loopNestClauseOps, iv, {{wsloopOp, wsloopArgs}, {simdOp, simdArgs}}, llvm::omp::Directive::OMPD_do_simd, dsp); + return wsloopOp; } -static void genCompositeTaskloopSimd(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { +static mlir::omp::TaskloopOp genCompositeTaskloopSimd( + lower::AbstractConverter &converter, lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { assert(std::distance(item, queue.end()) == 2 && "Invalid leaf constructs"); TODO(loc, "Composite TASKLOOP SIMD"); + return nullptr; } //===----------------------------------------------------------------------===// // Dispatch //===----------------------------------------------------------------------===// -static bool genOMPCompositeDispatch(lower::AbstractConverter &converter, - lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - mlir::Location loc, - const ConstructQueue &queue, - ConstructQueue::const_iterator item) { +static bool genOMPCompositeDispatch( + lower::AbstractConverter &converter, 
lower::SymMap &symTable, + lower::StatementContext &stmtCtx, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item, + mlir::Operation *&newOp) { using llvm::omp::Directive; using lower::omp::matchLeafSequence; @@ -3009,20 +2985,21 @@ static bool genOMPCompositeDispatch(lower::AbstractConverter &converter, // correct. Consider per-leaf privatization of composite constructs once // delayed privatization is supported by all participating ops. if (matchLeafSequence(item, queue, Directive::OMPD_distribute_parallel_do)) - genCompositeDistributeParallelDo(converter, symTable, semaCtx, eval, loc, - queue, item); + newOp = genCompositeDistributeParallelDo(converter, symTable, stmtCtx, + semaCtx, eval, loc, queue, item); else if (matchLeafSequence(item, queue, Directive::OMPD_distribute_parallel_do_simd)) - genCompositeDistributeParallelDoSimd(converter, symTable, semaCtx, eval, - loc, queue, item); + newOp = genCompositeDistributeParallelDoSimd( + converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item); else if (matchLeafSequence(item, queue, Directive::OMPD_distribute_simd)) - genCompositeDistributeSimd(converter, symTable, semaCtx, eval, loc, queue, - item); + newOp = genCompositeDistributeSimd(converter, symTable, stmtCtx, semaCtx, + eval, loc, queue, item); else if (matchLeafSequence(item, queue, Directive::OMPD_do_simd)) - genCompositeDoSimd(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genCompositeDoSimd(converter, symTable, stmtCtx, semaCtx, eval, loc, + queue, item); else if (matchLeafSequence(item, queue, Directive::OMPD_taskloop_simd)) - genCompositeTaskloopSimd(converter, symTable, semaCtx, eval, loc, queue, - item); + newOp = genCompositeTaskloopSimd(converter, symTable, stmtCtx, semaCtx, + eval, loc, queue, item); else return false; @@ -3037,46 +3014,64 @@ static void genOMPDispatch(lower::AbstractConverter &converter, ConstructQueue::const_iterator item) { assert(item != queue.end()); + lower::StatementContext stmtCtx; + mlir::Operation *newOp = nullptr; + + // Generate cleanup code for the stmtCtx after newOp + auto finalizeStmtCtx = [&]() { + if (newOp) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + fir::FirOpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointAfter(newOp); + stmtCtx.finalizeAndPop(); + } + }; + bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) == llvm::omp::Association::Loop; if (loopLeaf) { symTable.pushScope(); - if (genOMPCompositeDispatch(converter, symTable, semaCtx, eval, loc, queue, - item)) { + if (genOMPCompositeDispatch(converter, symTable, stmtCtx, semaCtx, eval, + loc, queue, item, newOp)) { symTable.popScope(); + finalizeStmtCtx(); return; } } switch (llvm::omp::Directive dir = item->id) { case llvm::omp::Directive::OMPD_barrier: - genBarrierOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genBarrierOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_distribute: - genStandaloneDistribute(converter, symTable, semaCtx, eval, loc, queue, - item); + newOp = genStandaloneDistribute(converter, symTable, stmtCtx, semaCtx, eval, + loc, queue, item); break; case llvm::omp::Directive::OMPD_do: - genStandaloneDo(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genStandaloneDo(converter, symTable, stmtCtx, semaCtx, eval, loc, + queue, item); break; case llvm::omp::Directive::OMPD_loop: - genLoopOp(converter, 
symTable, semaCtx, eval, loc, queue, item); + newOp = genLoopOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_masked: - genMaskedOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genMaskedOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, + item); break; case llvm::omp::Directive::OMPD_master: - genMasterOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genMasterOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_ordered: // Block-associated "ordered" construct. - genOrderedRegionOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genOrderedRegionOp(converter, symTable, semaCtx, eval, loc, queue, + item); break; case llvm::omp::Directive::OMPD_parallel: - genStandaloneParallel(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genStandaloneParallel(converter, symTable, stmtCtx, semaCtx, eval, + loc, queue, item); break; case llvm::omp::Directive::OMPD_scan: - genScanOp(converter, symTable, semaCtx, loc, queue, item); + newOp = genScanOp(converter, symTable, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_section: llvm_unreachable("genOMPDispatch: OMPD_section"); @@ -3089,49 +3084,57 @@ static void genOMPDispatch(lower::AbstractConverter &converter, // in genBodyOfOp break; case llvm::omp::Directive::OMPD_simd: - genStandaloneSimd(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = + genStandaloneSimd(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_scope: - genScopeOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genScopeOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_single: - genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case llvm::omp::Directive::OMPD_target: - genTargetOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genTargetOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, + item); break; case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genTargetDataOp(converter, symTable, stmtCtx, semaCtx, eval, loc, + queue, item); break; case llvm::omp::Directive::OMPD_target_enter_data: - genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, queue, item); + newOp = genTargetEnterExitUpdateDataOp( + converter, symTable, stmtCtx, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_exit_data: - genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, queue, item); + newOp = genTargetEnterExitUpdateDataOp( + converter, symTable, stmtCtx, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_target_update: - genTargetEnterExitUpdateDataOp( - converter, symTable, semaCtx, loc, queue, item); + newOp = genTargetEnterExitUpdateDataOp( + converter, symTable, stmtCtx, semaCtx, loc, queue, item); break; case llvm::omp::Directive::OMPD_task: - genTaskOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = genTaskOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, + item); break; case llvm::omp::Directive::OMPD_taskgroup: - genTaskgroupOp(converter, symTable, semaCtx, eval, loc, queue, item); + newOp = + genTaskgroupOp(converter, symTable, semaCtx, eval, loc, queue, item); break; case 
llvm::omp::Directive::OMPD_taskloop:
-    genStandaloneTaskloop(converter, symTable, semaCtx, eval, loc, queue, item);
+    newOp = genStandaloneTaskloop(converter, symTable, semaCtx, eval, loc,
+                                  queue, item);
     break;
   case llvm::omp::Directive::OMPD_taskwait:
-    genTaskwaitOp(converter, symTable, semaCtx, eval, loc, queue, item);
+    newOp = genTaskwaitOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_taskyield:
-    genTaskyieldOp(converter, symTable, semaCtx, eval, loc, queue, item);
+    newOp =
+        genTaskyieldOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
   case llvm::omp::Directive::OMPD_teams:
-    genTeamsOp(converter, symTable, semaCtx, eval, loc, queue, item);
+    newOp = genTeamsOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue,
+                       item);
     break;
   case llvm::omp::Directive::OMPD_tile:
   case llvm::omp::Directive::OMPD_unroll:
@@ -3139,7 +3142,8 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
                  llvm::omp::getOpenMPDirectiveName(dir) + ")");
   // case llvm::omp::Directive::OMPD_workdistribute:
   case llvm::omp::Directive::OMPD_workshare:
-    genWorkshareOp(converter, symTable, semaCtx, eval, loc, queue, item);
+    newOp = genWorkshareOp(converter, symTable, stmtCtx, semaCtx, eval, loc,
+                           queue, item);
     break;
   default:
     // Combined and composite constructs should have been split into a sequence
@@ -3149,6 +3153,7 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
     break;
   }

+  finalizeStmtCtx();
   if (loopLeaf)
     symTable.popScope();
 }
diff --git a/flang/test/Lower/OpenMP/clause-cleanup.f90 b/flang/test/Lower/OpenMP/clause-cleanup.f90
new file mode 100644
index 0000000000000..79de44cf42c72
--- /dev/null
+++ b/flang/test/Lower/OpenMP/clause-cleanup.f90
@@ -0,0 +1,17 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+subroutine test1(a)
+integer :: a(:)
+
+!$omp parallel num_threads(count(a .eq. 1))
+print *, "don't optimize me"
+!$omp end parallel
+end subroutine
+
+! CHECK: %[[EXPR:.*]] = hlfir.elemental {{.*}} -> !hlfir.expr<?x!fir.logical<4>>
+! CHECK: %[[COUNT:.*]] = hlfir.count %[[EXPR]]
+! CHECK: omp.parallel num_threads(%[[COUNT]] : i32) {
+! CHECK-NOT: hlfir.destroy %[[EXPR]]
+! CHECK: omp.terminator
+! CHECK: }
+!
CHECK: hlfir.destroy %[[EXPR]] From 4c09ae0b2ed6a99e4e69ec9e0507c26cdcc301a9 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Tue, 8 Apr 2025 10:29:18 +0100 Subject: [PATCH 0959/1029] [flang][OpenMP] Lowering for CANCEL and CANCELLATIONPOINT (#134248) These will still hit TODOs in OpenMPToLLVMIRConversion.cpp --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 34 ++++ flang/lib/Lower/OpenMP/ClauseProcessor.h | 2 + flang/lib/Lower/OpenMP/Clauses.cpp | 19 +- flang/lib/Lower/OpenMP/Clauses.h | 4 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 64 ++++++- flang/test/Lower/OpenMP/cancel.f90 | 176 ++++++++++++++++++ flang/test/Lower/OpenMP/cancellationpoint.f90 | 77 ++++++++ 7 files changed, 371 insertions(+), 5 deletions(-) create mode 100644 flang/test/Lower/OpenMP/cancel.f90 create mode 100644 flang/test/Lower/OpenMP/cancellationpoint.f90 diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 46febd33f0ce8..44796994b244c 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -18,6 +18,7 @@ #include "flang/Lower/PFTBuilder.h" #include "flang/Parser/tools.h" #include "flang/Semantics/tools.h" +#include "llvm/Frontend/OpenMP/OMP.h.inc" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" namespace Fortran { @@ -220,6 +221,39 @@ bool ClauseProcessor::processBind(mlir::omp::BindClauseOps &result) const { return false; } +bool ClauseProcessor::processCancelDirectiveName( + mlir::omp::CancelDirectiveNameClauseOps &result) const { + using ConstructType = mlir::omp::ClauseCancellationConstructType; + mlir::MLIRContext *context = &converter.getMLIRContext(); + + ConstructType directive; + if (auto *clause = findUniqueClause()) { + switch (clause->v) { + case llvm::omp::OMP_CANCELLATION_CONSTRUCT_Parallel: + directive = mlir::omp::ClauseCancellationConstructType::Parallel; + break; + case llvm::omp::OMP_CANCELLATION_CONSTRUCT_Loop: + directive = mlir::omp::ClauseCancellationConstructType::Loop; + break; + case llvm::omp::OMP_CANCELLATION_CONSTRUCT_Sections: + directive = mlir::omp::ClauseCancellationConstructType::Sections; + break; + case llvm::omp::OMP_CANCELLATION_CONSTRUCT_Taskgroup: + directive = mlir::omp::ClauseCancellationConstructType::Taskgroup; + break; + case llvm::omp::OMP_CANCELLATION_CONSTRUCT_None: + llvm_unreachable("OMP_CANCELLATION_CONSTRUCT_None"); + break; + } + } else { + llvm_unreachable("cancel construct missing cancellation construct type"); + } + + result.cancelDirective = + mlir::omp::ClauseCancellationConstructTypeAttr::get(context, directive); + return true; +} + bool ClauseProcessor::processCollapse( mlir::Location currentLocation, lower::pft::Evaluation &eval, mlir::omp::LoopRelatedClauseOps &result, diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 6b1f7a31c7aac..bdddeb145b496 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -56,6 +56,8 @@ class ClauseProcessor { // 'Unique' clauses: They can appear at most once in the clause list. 
bool processBare(mlir::omp::BareClauseOps &result) const; bool processBind(mlir::omp::BindClauseOps &result) const; + bool processCancelDirectiveName( + mlir::omp::CancelDirectiveNameClauseOps &result) const; bool processCollapse(mlir::Location currentLocation, lower::pft::Evaluation &eval, mlir::omp::LoopRelatedClauseOps &result, diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 5b6e099967df5..3ffdf7138b035 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -249,7 +249,6 @@ MAKE_EMPTY_CLASS(Weak, Weak); MAKE_EMPTY_CLASS(Write, Write); // Artificial clauses -MAKE_EMPTY_CLASS(CancellationConstructType, CancellationConstructType); MAKE_EMPTY_CLASS(Depobj, Depobj); MAKE_EMPTY_CLASS(Flush, Flush); MAKE_EMPTY_CLASS(MemoryOrder, MemoryOrder); @@ -524,7 +523,23 @@ Bind make(const parser::OmpClause::Bind &inp, return Bind{/*Binding=*/convert(inp.v.v)}; } -// CancellationConstructType: empty +CancellationConstructType +make(const parser::OmpClause::CancellationConstructType &inp, + semantics::SemanticsContext &semaCtx) { + auto name = std::get(inp.v.t); + CLAUSET_ENUM_CONVERT( + convert, llvm::omp::Directive, llvm::omp::CancellationConstructType, + // clang-format off + MS(OMPD_parallel, OMP_CANCELLATION_CONSTRUCT_Parallel) + MS(OMPD_do, OMP_CANCELLATION_CONSTRUCT_Loop) + MS(OMPD_sections, OMP_CANCELLATION_CONSTRUCT_Sections) + MS(OMPD_taskgroup, OMP_CANCELLATION_CONSTRUCT_Taskgroup) + // clang-format on + ); + + return CancellationConstructType{convert(name.v)}; +} + // Capture: empty Collapse make(const parser::OmpClause::Collapse &inp, diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h index e0a642036a58f..d7ab21d428e32 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/lib/Lower/OpenMP/Clauses.h @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Frontend/OpenMP/ClauseT.h" +#include "llvm/Frontend/OpenMP/OMP.h.inc" #include #include @@ -306,7 +307,8 @@ using Write = tomp::clause::WriteT; using tomp::type::operator==; struct CancellationConstructType { - using EmptyTrait = std::true_type; + using WrapperTrait = std::true_type; + llvm::omp::CancellationConstructType v; }; struct Depobj { using EmptyTrait = std::true_type; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 384d528ca2e63..312557d5da07e 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1533,6 +1533,24 @@ static OpTy genWrapperOp(lower::AbstractConverter &converter, // Code generation functions for clauses //===----------------------------------------------------------------------===// +static void genCancelClauses(lower::AbstractConverter &converter, + semantics::SemanticsContext &semaCtx, + const List &clauses, mlir::Location loc, + mlir::omp::CancelOperands &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processCancelDirectiveName(clauseOps); + cp.processIf(llvm::omp::Directive::OMPD_cancel, clauseOps); +} + +static void +genCancellationPointClauses(lower::AbstractConverter &converter, + semantics::SemanticsContext &semaCtx, + const List &clauses, mlir::Location loc, + mlir::omp::CancellationPointOperands &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processCancelDirectiveName(clauseOps); +} + static void genCriticalDeclareClauses( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, const List &clauses, mlir::Location loc, @@ -1849,6 +1867,31 @@ 
genBarrierOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return converter.getFirOpBuilder().create(loc); } +static mlir::omp::CancelOp genCancelOp(lower::AbstractConverter &converter, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + mlir::Location loc, + const ConstructQueue &queue, + ConstructQueue::const_iterator item) { + mlir::omp::CancelOperands clauseOps; + genCancelClauses(converter, semaCtx, item->clauses, loc, clauseOps); + + return converter.getFirOpBuilder().create(loc, + clauseOps); +} + +static mlir::omp::CancellationPointOp genCancellationPointOp( + lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, mlir::Location loc, + const ConstructQueue &queue, ConstructQueue::const_iterator item) { + mlir::omp::CancellationPointOperands clauseOps; + genCancellationPointClauses(converter, semaCtx, item->clauses, loc, + clauseOps); + + return converter.getFirOpBuilder().create( + loc, clauseOps); +} + static mlir::omp::CriticalOp genCriticalOp(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, @@ -3354,7 +3397,15 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPCancelConstruct &cancelConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); + List clauses = makeList(cancelConstruct.v.Clauses().v, [&](auto &&s) { + return makeClause(s, semaCtx); + }); + mlir::Location loc = converter.genLocation(cancelConstruct.source); + + ConstructQueue queue{buildConstructQueue( + converter.getFirOpBuilder().getModule(), semaCtx, eval, + cancelConstruct.source, llvm::omp::Directive::OMPD_cancel, clauses)}; + genCancelOp(converter, semaCtx, eval, loc, queue, queue.begin()); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3362,7 +3413,16 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPCancellationPointConstruct &cancellationPointConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); + List clauses = + makeList(cancellationPointConstruct.v.Clauses().v, + [&](auto &&s) { return makeClause(s, semaCtx); }); + mlir::Location loc = converter.genLocation(cancellationPointConstruct.source); + + ConstructQueue queue{ + buildConstructQueue(converter.getFirOpBuilder().getModule(), semaCtx, + eval, cancellationPointConstruct.source, + llvm::omp::Directive::OMPD_cancel, clauses)}; + genCancellationPointOp(converter, semaCtx, eval, loc, queue, queue.begin()); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/test/Lower/OpenMP/cancel.f90 b/flang/test/Lower/OpenMP/cancel.f90 new file mode 100644 index 0000000000000..fd1f110e5804c --- /dev/null +++ b/flang/test/Lower/OpenMP/cancel.f90 @@ -0,0 +1,176 @@ +! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s +! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s + +subroutine cancel_parallel() + !$omp parallel + !$omp cancel parallel + !$omp end parallel +end subroutine +! CHECK-LABEL: func.func @_QPcancel_parallel() { +! CHECK: omp.parallel { +! CHECK: omp.cancel cancellation_construct_type(parallel) +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! 
CHECK: } + +subroutine cancel_do() + !$omp parallel do + do i = 1,100 + !$omp cancel do + enddo + !$omp end parallel do +end subroutine +! CHECK-LABEL: func.func @_QPcancel_do() { +! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcancel_doEi"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFcancel_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: omp.parallel { +! CHECK: %[[VAL_2:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_3:.*]] = arith.constant 100 : i32 +! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32 +! CHECK: omp.wsloop private(@_QFcancel_doEi_private_i32 %[[VAL_1]]#0 -> %[[VAL_5:.*]] : !fir.ref) { +! CHECK: omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_4]]) { +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFcancel_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_7]]#0 : i32, !fir.ref +! CHECK: omp.cancel cancellation_construct_type(loop) +! CHECK: omp.yield +! CHECK: } +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancel_sections() + !$omp sections + !$omp section + !$omp cancel sections + !$omp end sections +end subroutine +! CHECK-LABEL: func.func @_QPcancel_sections() { +! CHECK: omp.sections { +! CHECK: omp.section { +! CHECK: omp.cancel cancellation_construct_type(sections) +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancel_taskgroup() + !$omp taskgroup + !$omp task + !$omp cancel taskgroup + !$omp end task + !$omp end taskgroup +end subroutine +! CHECK-LABEL: func.func @_QPcancel_taskgroup() { +! CHECK: omp.taskgroup { +! CHECK: omp.task { +! CHECK: omp.cancel cancellation_construct_type(taskgroup) +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancel_parallel_if(cond) + logical :: cond + !$omp parallel + !$omp cancel parallel if(cond) + !$omp end parallel +end subroutine +! CHECK-LABEL: func.func @_QPcancel_parallel_if( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref> {fir.bindc_name = "cond"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_parallel_ifEcond"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +! CHECK: omp.parallel { +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref> +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.logical<4>) -> i1 +! CHECK: omp.cancel cancellation_construct_type(parallel) if(%[[VAL_4]]) +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancel_do_if(cond) + logical :: cond + !$omp parallel do + do i = 1,100 + !$omp cancel do if (cond) + enddo + !$omp end parallel do +end subroutine +! CHECK-LABEL: func.func @_QPcancel_do_if( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref> {fir.bindc_name = "cond"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_do_ifEcond"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +! CHECK: %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcancel_do_ifEi"} +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFcancel_do_ifEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: omp.parallel { +! 
CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_6:.*]] = arith.constant 100 : i32 +! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32 +! CHECK: omp.wsloop private(@_QFcancel_do_ifEi_private_i32 %[[VAL_4]]#0 -> %[[VAL_8:.*]] : !fir.ref) { +! CHECK: omp.loop_nest (%[[VAL_9:.*]]) : i32 = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_7]]) { +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFcancel_do_ifEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: hlfir.assign %[[VAL_9]] to %[[VAL_10]]#0 : i32, !fir.ref +! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref> +! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1 +! CHECK: omp.cancel cancellation_construct_type(loop) if(%[[VAL_12]]) +! CHECK: omp.yield +! CHECK: } +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancel_sections_if(cond) + logical :: cond + !$omp sections + !$omp section + !$omp cancel sections if(cond) + !$omp end sections +end subroutine +! CHECK-LABEL: func.func @_QPcancel_sections_if( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref> {fir.bindc_name = "cond"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_sections_ifEcond"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +! CHECK: omp.sections { +! CHECK: omp.section { +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref> +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.logical<4>) -> i1 +! CHECK: omp.cancel cancellation_construct_type(sections) if(%[[VAL_4]]) +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancel_taskgroup_if(cond) + logical :: cond + !$omp taskgroup + !$omp task + !$omp cancel taskgroup if(cond) + !$omp end task + !$omp end taskgroup +end subroutine +! CHECK-LABEL: func.func @_QPcancel_taskgroup_if( +! CHECK-SAME: %[[VAL_0:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: !fir.ref> {fir.bindc_name = "cond"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFcancel_taskgroup_ifEcond"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +! CHECK: omp.taskgroup { +! CHECK: omp.task { +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#0 : !fir.ref> +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.logical<4>) -> i1 +! CHECK: omp.cancel cancellation_construct_type(taskgroup) if(%[[VAL_4]]) +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } diff --git a/flang/test/Lower/OpenMP/cancellationpoint.f90 b/flang/test/Lower/OpenMP/cancellationpoint.f90 new file mode 100644 index 0000000000000..a00f8cddf3824 --- /dev/null +++ b/flang/test/Lower/OpenMP/cancellationpoint.f90 @@ -0,0 +1,77 @@ +! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s +! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s + +subroutine cancellationpoint_parallel() + !$omp parallel + !$omp cancellationpoint parallel + !$omp end parallel +end subroutine +! CHECK-LABEL: func.func @_QPcancellationpoint_parallel() { +! CHECK: omp.parallel { +! CHECK: omp.cancellation_point cancellation_construct_type(parallel) +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! 
CHECK: } + +subroutine cancellationpoint_do() + !$omp parallel do + do i = 1,100 + !$omp cancellationpoint do + enddo + !$omp end parallel do +end subroutine +! CHECK-LABEL: func.func @_QPcancellationpoint_do() { +! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFcancellationpoint_doEi"} +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFcancellationpoint_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: omp.parallel { +! CHECK: %[[VAL_2:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_3:.*]] = arith.constant 100 : i32 +! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32 +! CHECK: omp.wsloop private(@_QFcancellationpoint_doEi_private_i32 %[[VAL_1]]#0 -> %[[VAL_5:.*]] : !fir.ref) { +! CHECK: omp.loop_nest (%[[VAL_6:.*]]) : i32 = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_4]]) { +! CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFcancellationpoint_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: hlfir.assign %[[VAL_6]] to %[[VAL_7]]#0 : i32, !fir.ref +! CHECK: omp.cancellation_point cancellation_construct_type(loop) +! CHECK: omp.yield +! CHECK: } +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancellationpoint_sections() + !$omp sections + !$omp section + !$omp cancellationpoint sections + !$omp end sections +end subroutine +! CHECK-LABEL: func.func @_QPcancellationpoint_sections() { +! CHECK: omp.sections { +! CHECK: omp.section { +! CHECK: omp.cancellation_point cancellation_construct_type(sections) +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! CHECK: } + +subroutine cancellationpoint_taskgroup() + !$omp taskgroup + !$omp task + !$omp cancellationpoint taskgroup + !$omp end task + !$omp end taskgroup +end subroutine +! CHECK-LABEL: func.func @_QPcancellationpoint_taskgroup() { +! CHECK: omp.taskgroup { +! CHECK: omp.task { +! CHECK: omp.cancellation_point cancellation_construct_type(taskgroup) +! CHECK: omp.terminator +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: return +! 
CHECK: } From a5509d62a71995ad3346dac4ef226b5a670d995f Mon Sep 17 00:00:00 2001 From: Jakub Ficek Date: Tue, 8 Apr 2025 11:36:48 +0200 Subject: [PATCH 0960/1029] [clang] fp options fix for __builtin_convertvector (#134102) Add missing CGFPOptionsRAII for fptoi and itofp cases --- clang/lib/CodeGen/CGExprScalar.cpp | 12 ++++++--- clang/test/CodeGen/pragma-fenv_access.c | 36 +++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 140a12d384502..28ae56058a7b4 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1969,12 +1969,16 @@ Value *ScalarExprEmitter::VisitConvertVectorExpr(ConvertVectorExpr *E) { bool InputSigned = SrcEltType->isSignedIntegerOrEnumerationType(); if (isa(DstEltTy)) Res = Builder.CreateIntCast(Src, DstTy, InputSigned, "conv"); - else if (InputSigned) - Res = Builder.CreateSIToFP(Src, DstTy, "conv"); - else - Res = Builder.CreateUIToFP(Src, DstTy, "conv"); + else { + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); + if (InputSigned) + Res = Builder.CreateSIToFP(Src, DstTy, "conv"); + else + Res = Builder.CreateUIToFP(Src, DstTy, "conv"); + } } else if (isa(DstEltTy)) { assert(SrcEltTy->isFloatingPointTy() && "Unknown real conversion"); + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); if (DstEltType->isSignedIntegerOrEnumerationType()) Res = Builder.CreateFPToSI(Src, DstTy, "conv"); else diff --git a/clang/test/CodeGen/pragma-fenv_access.c b/clang/test/CodeGen/pragma-fenv_access.c index 347e9670c4742..76c38f957d632 100644 --- a/clang/test/CodeGen/pragma-fenv_access.c +++ b/clang/test/CodeGen/pragma-fenv_access.c @@ -251,3 +251,39 @@ vector4float func_21(vector4double x) { } // CHECK-LABEL: @func_21 // STRICT: call <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64(<4 x double> {{.*}}, metadata !"round.upward", metadata !"fpexcept.strict") + +typedef short vector8short __attribute__((__vector_size__(16))); +typedef double vector8double __attribute__((__vector_size__(64))); +vector8double func_24(vector8short x) { + #pragma STDC FENV_ROUND FE_TOWARDZERO + return __builtin_convertvector(x, vector8double); +} +// CHECK-LABEL: @func_24 +// STRICT: call <8 x double> @llvm.experimental.constrained.sitofp.v8f64.v8i16(<8 x i16> {{.*}}, metadata !"round.towardzero", metadata !"fpexcept.strict") + +typedef unsigned int vector16uint __attribute__((__vector_size__(64))); +typedef double vector16double __attribute__((__vector_size__(128))); +vector16double func_25(vector16uint x) { + #pragma STDC FENV_ROUND FE_DOWNWARD + return __builtin_convertvector(x, vector16double); +} +// CHECK-LABEL: @func_25 +// STRICT: call <16 x double> @llvm.experimental.constrained.uitofp.v16f64.v16i32(<16 x i32> {{.*}}, metadata !"round.downward", metadata !"fpexcept.strict") + +typedef float vector2float __attribute__((__vector_size__(8))); +typedef char vector2char __attribute__((__vector_size__(2))); +vector2char func_22(vector2float x) { + #pragma float_control(except, off) + return __builtin_convertvector(x, vector2char); +} +// CHECK-LABEL: @func_22 +// STRICT: call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f32(<2 x float> {{.*}}, metadata !"fpexcept.ignore") + +typedef float vector3float __attribute__((__vector_size__(12))); +typedef unsigned long long vector3ulong __attribute__((__vector_size__(24))); +vector3ulong func_23(vector3float x) { + #pragma float_control(except, off) + return __builtin_convertvector(x, 
vector3ulong); +} +// CHECK-LABEL: @func_23 +// STRICT: call <3 x i64> @llvm.experimental.constrained.fptoui.v3i64.v3f32(<3 x float> {{.*}}, metadata !"fpexcept.ignore") From 6f93c0676f80919c5f96e8c25dad95c159a0b336 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 8 Apr 2025 10:44:02 +0100 Subject: [PATCH 0961/1029] [AMDGPU] Make a few WaitcntBrackets methods const. NFC. (#134824) --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 15965f2bac8aa..49943265500b1 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -387,8 +387,8 @@ class WaitcntBrackets { return LDSDMAStores; } - void print(raw_ostream &); - void dump() { print(dbgs()); } + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } private: struct MergeInfo { @@ -645,9 +645,9 @@ class SIInsertWaitcnts { (void)ForceVMCounter; } - bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets); + bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, - WaitcntBrackets &ScoreBrackets); + const WaitcntBrackets &ScoreBrackets); bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; bool run(MachineFunction &MF); @@ -990,7 +990,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } } -void WaitcntBrackets::print(raw_ostream &OS) { +void WaitcntBrackets::print(raw_ostream &OS) const { OS << '\n'; for (auto T : inst_counter_types(MaxCounter)) { unsigned SR = getScoreRange(T); @@ -2390,8 +2390,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, // Return true if the given machine basic block is a preheader of a loop in // which we want to flush the vmcnt counter, and false otherwise. -bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB, - WaitcntBrackets &ScoreBrackets) { +bool SIInsertWaitcnts::isPreheaderToFlush( + MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) { auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false); if (!IsInserted) return Iterator->second; @@ -2427,7 +2427,7 @@ bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const { // loop, and at least one use of a vgpr containing a value that is loaded // outside of the loop. bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, - WaitcntBrackets &Brackets) { + const WaitcntBrackets &Brackets) { bool HasVMemLoad = false; bool HasVMemStore = false; bool UsesVgprLoadedOutside = false; From 6a42fb8fbfc075f28fa7d1d70bb51aef7f18e2f9 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 8 Apr 2025 10:46:17 +0100 Subject: [PATCH 0962/1029] [LV] Clarify code in isPredicatedInst (NFC) (#134251) --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 807136f6e0e47..72dbef1cffc5f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3236,11 +3236,9 @@ bool LoopVectorizationCostModel::isScalarWithPredication( // TODO: Fold into LoopVectorizationLegality::isMaskRequired. bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { - // If predication is not needed, avoid it. 
// TODO: We can use the loop-preheader as context point here and get // context sensitive reasoning for isSafeToSpeculativelyExecute. - if (!blockNeedsPredicationForAnyReason(I->getParent()) || - isSafeToSpeculativelyExecute(I) || + if (isSafeToSpeculativelyExecute(I) || (isa(I) && !Legal->isMaskRequired(I)) || isa(I)) return false; @@ -3250,6 +3248,10 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { if (Legal->blockNeedsPredication(I->getParent())) return true; + // If we're not folding the tail by masking, predication is unnecessary. + if (!foldTailByMasking()) + return false; + // All that remain are instructions with side-effects originally executed in // the loop unconditionally, but now execute under a tail-fold mask (only) // having at least one active lane (the first). If the side-effects of the From ff5b649a843a0c1be9dc59612411a1693c95d651 Mon Sep 17 00:00:00 2001 From: MisakaVan <102937903+MisakaVan@users.noreply.github.com> Date: Tue, 8 Apr 2025 18:17:43 +0800 Subject: [PATCH 0963/1029] [libc++] Fix a comment typo in __tree (#134831) "Returns true **is** __root is a proper red black tree" -> "Returns true **if** __root is a proper red black tree" --- libcxx/include/__tree | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/include/__tree b/libcxx/include/__tree index 9d28381c8c2ce..cdf8d04121d71 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -135,7 +135,7 @@ unsigned __tree_sub_invariant(_NodePtr __x) { } // Determines if the red black tree rooted at __root is a proper red black tree. -// __root == nullptr is a proper tree. Returns true is __root is a proper +// __root == nullptr is a proper tree. Returns true if __root is a proper // red black tree, else returns false. template _LIBCPP_HIDE_FROM_ABI bool __tree_invariant(_NodePtr __root) { From 8521bd2424bf144ce1d176a3c93d414c4c138104 Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 8 Apr 2025 13:23:10 +0300 Subject: [PATCH 0964/1029] [BOLT][AArch64] Handle PAuth call instructions in isIndirectCall (#133227) Handle `BLRA*` opcodes in AArch64MCPlusBuilder::isIndirectCall, update getRegUsedAsCallDest accordingly. --- bolt/include/bolt/Core/MCPlusBuilder.h | 10 +++++----- bolt/lib/Passes/PAuthGadgetScanner.cpp | 8 +++++--- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 16 +++++++--------- llvm/lib/Target/AArch64/AArch64InstrInfo.h | 13 +++++++++++++ 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index b5ad219cfc796..cf37a984da93f 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -577,12 +577,12 @@ class MCPlusBuilder { return getNoRegister(); } - /// Returns the register used as call destination, or no-register, if not - /// an indirect call. Sets IsAuthenticatedInternally if the instruction - /// accepts a signed pointer as its operand and authenticates it internally. + /// Returns the register used as the destination of an indirect branch or call + /// instruction. Sets IsAuthenticatedInternally if the instruction accepts + /// a signed pointer as its operand and authenticates it internally. 
   virtual MCPhysReg
-  getRegUsedAsCallDest(const MCInst &Inst,
-                       bool &IsAuthenticatedInternally) const {
+  getRegUsedAsIndirectBranchDest(const MCInst &Inst,
+                                 bool &IsAuthenticatedInternally) const {
     llvm_unreachable("not implemented");
     return getNoRegister();
   }
diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index df9e87bd4e999..a7c22b23e4364 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -498,14 +498,16 @@ static std::shared_ptr<Report>
 shouldReportCallGadget(const BinaryContext &BC, const MCInstReference &Inst,
                        const State &S) {
   static const GadgetKind CallKind("non-protected call found");
-  if (!BC.MIB->isCall(Inst) && !BC.MIB->isBranch(Inst))
+  if (!BC.MIB->isIndirectCall(Inst) && !BC.MIB->isIndirectBranch(Inst))
     return nullptr;
 
   bool IsAuthenticated = false;
-  MCPhysReg DestReg = BC.MIB->getRegUsedAsCallDest(Inst, IsAuthenticated);
-  if (IsAuthenticated || DestReg == BC.MIB->getNoRegister())
+  MCPhysReg DestReg =
+      BC.MIB->getRegUsedAsIndirectBranchDest(Inst, IsAuthenticated);
+  if (IsAuthenticated)
     return nullptr;
 
+  assert(DestReg != BC.MIB->getNoRegister());
   LLVM_DEBUG({
     traceInst(BC, "Found call inst", Inst);
     traceReg(BC, "Call destination reg", DestReg);
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 0d1908f91e514..106f0a880d780 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AArch64InstrInfo.h"
 #include "AArch64MCSymbolizer.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
@@ -277,15 +278,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
     }
   }
 
-  MCPhysReg
-  getRegUsedAsCallDest(const MCInst &Inst,
-                       bool &IsAuthenticatedInternally) const override {
-    assert(isCall(Inst) || isBranch(Inst));
-    IsAuthenticatedInternally = false;
-
+  MCPhysReg getRegUsedAsIndirectBranchDest(
+      const MCInst &Inst, bool &IsAuthenticatedInternally) const override {
+    assert(isIndirectCall(Inst) || isIndirectBranch(Inst));
     switch (Inst.getOpcode()) {
     case AArch64::BR:
     case AArch64::BLR:
+      IsAuthenticatedInternally = false;
       return Inst.getOperand(0).getReg();
     case AArch64::BRAA:
     case AArch64::BRAB:
@@ -298,9 +298,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
       IsAuthenticatedInternally = true;
       return Inst.getOperand(0).getReg();
     default:
-      if (isIndirectCall(Inst) || isIndirectBranch(Inst))
-        llvm_unreachable("Unhandled indirect branch");
-      return getNoRegister();
+      llvm_unreachable("Unhandled indirect branch or call");
     }
   }
 
@@ -699,7 +697,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   }
 
   bool isIndirectCall(const MCInst &Inst) const override {
-    return Inst.getOpcode() == AArch64::BLR;
+    return isIndirectCallOpcode(Inst.getOpcode());
   }
 
   MCPhysReg getSpRegister(int Size) const {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index b3d3ec1455c8b..0ffaca9af4006 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -726,6 +726,19 @@ static inline bool isIndirectBranchOpcode(int Opc) {
   return false;
 }
 
+static inline bool isIndirectCallOpcode(unsigned Opc) {
+  switch (Opc) {
+  case AArch64::BLR:
+  case AArch64::BLRAA:
+  case AArch64::BLRAB:
+  case AArch64::BLRAAZ:
+  case AArch64::BLRABZ:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static inline bool isPTrueOpcode(unsigned Opc) {
   switch (Opc) {
   case AArch64::PTRUE_B:

From 83fbe6798605c7b1ebec1287f90a41949cb1d235 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 8 Apr 2025 11:28:40 +0100
Subject: [PATCH 0965/1029] [X86] combineX86ShufflesRecursively - iteratively
 peek through bitcasts to free subvector widening/narrowing sources. (#134701)

Generalizes the existing code to repeatedly peek through mixed
bitcast/insert_subvector/extract_subvector chains to find the source of the
shuffle operand.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   51 +-
 .../any_extend_vector_inreg_of_broadcast.ll   |   13 +-
 .../vector-interleaved-load-i16-stride-3.ll   |   16 +-
 .../vector-interleaved-store-i16-stride-6.ll  |  380 +-
 .../vector-interleaved-store-i16-stride-7.ll  | 5578 +++++++++--------
 .../vector-interleaved-store-i8-stride-4.ll   |   88 +-
 .../vector-interleaved-store-i8-stride-5.ll   | 1194 ++--
 .../vector-interleaved-store-i8-stride-6.ll   |  916 ++-
 .../vector-interleaved-store-i8-stride-7.ll   | 1940 +++---
 .../CodeGen/X86/vector-shuffle-256-v32.ll     |  474 +-
 .../CodeGen/X86/x86-interleaved-access.ll     |   22 +-
 11 files changed, 5212 insertions(+), 5460 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bac5684733e60..d86eec1584274 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41115,30 +41115,37 @@ static SDValue combineX86ShufflesRecursively(
     }
   }
 
+  // Peek through any free bitcasts to insert_subvector vector widenings or
+  // extract_subvector nodes back to root size.
+  // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
   for (auto [I, Op] : enumerate(Ops)) {
-    // Peek through vector widenings + set out of bounds mask indices to undef.
-    // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
-    if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
-        isNullConstant(Op.getOperand(2))) {
-      Op = Op.getOperand(1);
-      unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
-      int Lo = I * Mask.size();
-      int Hi = (I + 1) * Mask.size();
-      int NewHi = Lo + (Mask.size() / Scale);
-      for (int &M : Mask) {
-        if (Lo <= M && NewHi <= M && M < Hi)
-          M = SM_SentinelUndef;
-      }
-    }
-
-    // Peek through any free bitcasts/extract_subvector nodes back to root size.
     SDValue BC = Op;
-    if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse())
-      BC = peekThroughOneUseBitcasts(BC);
-    while (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-           (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 &&
-           isNullConstant(BC.getOperand(1))) {
-      Op = BC = BC.getOperand(0);
+    while (1) {
+      if (BC.getOpcode() == ISD::BITCAST && BC.hasOneUse()) {
+        BC = BC.getOperand(0);
+        continue;
+      }
+      if (BC.getOpcode() == ISD::INSERT_SUBVECTOR &&
+          BC.getOperand(0).isUndef() && isNullConstant(BC.getOperand(2))) {
+        // Set out of bounds mask indices to undef.
+ Op = BC = BC.getOperand(1); + unsigned Scale = RootSizeInBits / Op.getValueSizeInBits(); + int Lo = I * Mask.size(); + int Hi = (I + 1) * Mask.size(); + int NewHi = Lo + (Mask.size() / Scale); + for (int &M : Mask) { + if (Lo <= M && NewHi <= M && M < Hi) + M = SM_SentinelUndef; + } + continue; + } + if (BC.getOpcode() == ISD::EXTRACT_SUBVECTOR && + (RootSizeInBits % BC.getOperand(0).getValueSizeInBits()) == 0 && + isNullConstant(BC.getOperand(1))) { + Op = BC = BC.getOperand(0); + continue; + } + break; } } diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index b075d48627b18..1fada58f05ba9 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -4708,18 +4708,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index a39bc6b668669..da902b3aed5ab 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -1836,7 +1836,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1858,7 +1858,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) @@ -1914,7 +1914,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -1936,7 +1936,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) @@ -1992,7 +1992,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2014,7 +2014,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) @@ -2070,7 +2070,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = 
xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] @@ -2092,7 +2092,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 25bad7578c111..ca8fcf2ee0f2c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -4230,91 +4230,91 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm4 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [8,8,0,9,0,1,0,1] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 ; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7] -; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm19, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm17 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm19, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm17 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = 
xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm4 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k2} ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm19 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,10,9] -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm2 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm19, %zmm19 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm19 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,0,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm19, %zmm19 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 +; 
AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm7 +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11] -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm8 -; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm8 +; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm14 +; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm14 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10] -; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[8],ymm5[8],ymm14[9],ymm5[9],ymm14[10],ymm5[10],ymm14[11],ymm5[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; 
AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm26, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm13 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm14 ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm13 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] -; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm13 +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm13 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm13, %ymm3 @@ -4322,7 +4322,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm0 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm1 @@ -4330,13 +4330,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm7)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm19)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm17)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm17)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm0 & (zmm11 ^ zmm21)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) @@ -4666,91 +4666,91 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm4 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [8,8,0,9,0,1,0,1] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm19, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm19, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm19 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = 
[0,0,0,0,8,8,10,9] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm2 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm19, %zmm19 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm2, %ymm19 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[0,0,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm19, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm9, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm7 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8 -; 
AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm14 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm14 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm14, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[8],ymm5[8],ymm14[9],ymm5[9],ymm14[10],ymm5[10],ymm14[11],ymm5[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm26, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm13 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm13[4],xmm5[5],xmm13[5],xmm5[6],xmm13[6],xmm5[7],xmm13[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm14 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm13 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm13 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm13, %ymm3 @@ -4758,7 +4758,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm3 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm1 @@ -4766,13 +4766,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm7)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm19)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm17)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm17)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm0 & (zmm11 ^ zmm21)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax) @@ -8848,24 +8848,24 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,2,1,8,9,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,8,0,9,0,1,0,1] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 ; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm4 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,1,8,3,4,9,6,7] -; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm29, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,10,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,0,9] ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] @@ 
-8890,16 +8890,16 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k2} +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm4 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm29, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8984,16 +8984,16 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm18 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm2 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm29, %ymm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 +; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm22 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -9007,19 +9007,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm7 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm7 -; AVX512-FCP-NEXT: vmovdqa32 
%zmm2, %zmm7 {%k2} +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm3 +; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm7, %ymm29 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512-FCP-NEXT: vpermi2d %ymm7, %ymm3, %ymm29 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm3 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm29, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm29, %zmm23 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] @@ -9757,24 +9757,24 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,2,1,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,8,0,9,0,1,0,1] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm29, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,10,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,0,9] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] @@ -9799,16 +9799,16 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm29, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9893,16 +9893,16 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm29, %ymm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero @@ -9916,19 +9916,19 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm7 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm7 {%k2} +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm27, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm7, %ymm29 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm3, %ymm29 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm29, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm29, %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 5aa7c055d408e..7b619344e83f6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -2864,8 +2864,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,0,0,17,17,0,0,0,0,0,1,0,0,2,0] -; AVX512-NEXT: vpermi2d %zmm12, %zmm11, %zmm18 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,1,1,0,0,0,0,16,17,0,0,18,0] +; AVX512-NEXT: vpermi2d %zmm11, %zmm12, %zmm18 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] @@ -3126,8 +3126,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,0,0,17,17,0,0,0,0,0,1,0,0,2,0] -; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm11, %zmm18 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,1,1,0,0,0,0,16,17,0,0,18,0] +; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm12, %zmm18 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; 
AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] @@ -5859,32 +5859,34 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX512-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512-NEXT: subq $664, %rsp # imm = 0x298 +; AVX512-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512-NEXT: vpshufb %ymm13, %ymm9, %ymm0 +; AVX512-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm14, %ymm8, %ymm1 +; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512-NEXT: vmovdqa (%rsi), %ymm11 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm1 +; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm26 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%r9), %ymm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512-NEXT: vmovdqa (%r8), %ymm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm20 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 32(%r9), %xmm2 @@ -5898,8 +5900,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm3 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm3 ; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm4 ; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5923,73 +5925,74 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm26 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] +; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa (%r9), %xmm4 ; AVX512-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm25 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] +; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm24 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa (%rax), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm6 -; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512-NEXT: vmovdqa (%rax), %ymm6 +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-NEXT: vpshufb %ymm3, %ymm6, %ymm6 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm28 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX512-NEXT: vmovdqa64 %ymm8, %ymm29 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: 
vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm30 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm31 ; AVX512-NEXT: vprold $16, %ymm13, %ymm4 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm9 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vmovdqa64 
%ymm4, %ymm18 +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX512-NEXT: vprold $16, %xmm5, %xmm6 @@ -6005,12 +6008,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6024,489 +6026,492 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] -; AVX512-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm27 -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm14 -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512-NEXT: vpshuflw 
{{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512-NEXT: vprold $16, %ymm16, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15] -; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] +; AVX512-NEXT: vmovdqa %ymm10, %ymm14 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8,9,10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,0,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1] ; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512-NEXT: # ymm6 = mem[2,1,3,2] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512-NEXT: vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] -; AVX512-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = mem[2,1,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3] -; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512-NEXT: # xmm15 = mem[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: 
vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28)) -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512-NEXT: # ymm8 = mem[2,1,3,2] +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512-NEXT: # ymm6 = mem[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm25[0,2,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm18[2,1,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,2,2,3] +; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] +; AVX512-NEXT: vprold $16, %ymm14, %ymm5 +; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] +; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm15 +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm28 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm29 & (zmm28 ^ zmm15)) +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm29 & (zmm0 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512-NEXT: 
vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] -; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-NEXT: # ymm12 = mem[0,0,1,1] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm3 +; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512-NEXT: # ymm3 = mem[2,3,3,3,6,7,7,7] +; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512-NEXT: # ymm10 = mem[0,0,2,1] +; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX512-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] +; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512-NEXT: # ymm17 = mem[0,0,1,1] ; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX512-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512-NEXT: # ymm17 = mem[2,2,2,3] -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512-NEXT: # ymm18 = mem[2,1,3,2] -; AVX512-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512-NEXT: # ymm19 = mem[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3] +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512-NEXT: # ymm18 = mem[2,2,2,3] +; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512-NEXT: # ymm20 = mem[2,1,3,2] +; AVX512-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512-NEXT: # ymm26 = mem[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm23[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,0,2,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9)) -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3)) -; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512-NEXT: 
vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm2 +; AVX512-NEXT: vmovdqa64 (%rax), %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm28)) +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,0,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm9 & (zmm2 ^ zmm0)) +; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4)) -; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm2, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm26[0,1,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm4)) +; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) ; AVX512-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem 
& (zmm4 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-NEXT: vpermd (%rax), %zmm6, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm24)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-NEXT: vpermd %zmm6, %zmm5, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512-NEXT: addq $664, %rsp # imm = 0x298 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512-FCP-NEXT: subq $296, %rsp # imm = 0x128 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 
(%rdi), %ymm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm8 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9 -; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm5 +; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm15 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm7 +; AVX512-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20 +; AVX512-FCP-NEXT: 
vpshufb %ymm5, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm24 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm23 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm21 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,0,8,8,9] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm15 -; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512-FCP-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm8 -; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm11 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm5 +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm23 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpshufd 
{{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm4 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm1, %zmm26 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,1,0,8,8,9,9] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm26 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [8,9,9,0,0,0,1,1] ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9] -; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm27 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm6 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,0,9,0,0,0,1,1] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm9 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm29 -; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm29 +; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = 
[18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8,9,10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; 
AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm20, %zmm9 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm4 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm0 -; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm25[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15] -; AVX512-FCP-NEXT: vprold $16, %ymm1, %ymm12 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512-FCP-NEXT: 
vmovdqa 32(%rax), %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7,8,9],ymm7[10],ymm0[11,12],ymm7[13],ymm0[14,15] +; AVX512-FCP-NEXT: vprold $16, %ymm13, %ymm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm12 -; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2],xmm13[3,4],xmm1[5],xmm13[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2,3],xmm7[4],xmm15[5,6],xmm7[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,0,9] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 +; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm15 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm11 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,2,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm6 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7,8,9],ymm7[10],ymm10[11,12],ymm7[13],ymm10[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm1 & (zmm0 ^ zmm24)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0)) -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm15)) -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm26 ^ (zmm6 & (zmm12 ^ zmm26)) -; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm8 -; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm28)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm12)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm17[0,0,1,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7)) -; AVX512-FCP-NEXT: vpermd (%rax), %zmm19, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm31)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm6 & (zmm10 ^ zmm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm23 ^ (zmm1 & (zmm21 ^ zmm23)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm30 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6)) -; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21)) -; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5 -; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm27)) +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) +; AVX512-FCP-NEXT: 
vmovdqa64 (%rax), %zmm5 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15] +; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm26 ^ (zmm12 & (zmm7 ^ zmm26)) +; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm13 +; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm28)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm11)) +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm18[2,2,2,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512-FCP-NEXT: vpermd %zmm5, %zmm20, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm12 & (zmm3 ^ zmm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm17 & (zmm21 ^ zmm25)) +; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} 
zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) +; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm23)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm21)) +; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm27)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512-FCP-NEXT: addq $296, %rsp # imm = 0x128 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-NEXT: subq $664, %rsp # imm = 0x298 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm27 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm9, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm8, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm11 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm26 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm20 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2 @@ -6520,8 +6525,8 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm4 ; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6545,73 +6550,74 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm26 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] +; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm25 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm24 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm0 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm3 -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm6 -; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm6 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = 
[12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm6 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm28 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm28 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm29 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm29 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm30 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm30 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm31 ; AVX512DQ-NEXT: vprold $16, %ymm13, %ymm4 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = 
ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm9 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 +; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm10 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm6 @@ -6627,12 +6633,11 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6646,458 +6651,459 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm14 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; 
AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-NEXT: vprold $16, %ymm16, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15] -; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm14 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = 
ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8,9,10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm2 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,0,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] -; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = mem[2,1,3,2] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[0,2,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1] -; AVX512DQ-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = mem[2,1,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3] -; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm15 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm8 = mem[2,1,3,2] +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm6 = mem[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm25[0,2,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm18[2,1,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = 
ymm5[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,2,2,3] +; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] +; AVX512DQ-NEXT: vprold $16, %ymm14, %ymm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm15 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm29 & (zmm28 ^ zmm15)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm29 & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3] -; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm12 = mem[0,0,1,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm3 +; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm3 = mem[2,3,3,3,6,7,7,7] +; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm10 = mem[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] +; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm17 = mem[0,0,1,1] ; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX512DQ-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-NEXT: vpermq $234, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm17 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm18 = mem[2,1,3,2] -; AVX512DQ-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm19 = mem[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3] +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm18 = mem[2,2,2,3] +; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm20 = mem[2,1,3,2] +; AVX512DQ-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm26 = mem[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm23[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,0,2,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3)) -; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm28)) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,0,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] +; AVX512DQ-NEXT: vpbroadcastd 
32(%rax), %ymm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm9 & (zmm2 ^ zmm0)) +; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm2, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm26[0,1,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm4)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-NEXT: vpermd (%rax), %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm7 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm24)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-NEXT: vpermd %zmm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm27, 384(%rax) -; AVX512DQ-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rax) +; AVX512DQ-NEXT: addq $664, %rsp # imm = 0x298 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm12 +; AVX512DQ-FCP-NEXT: subq $296, %rsp # imm = 0x128 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9 -; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 +; 
AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm7 +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm25 -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm24 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm23 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, 
%ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm21 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,0,8,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm15 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm5 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm23 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm4 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm1, %zmm26 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,1,0,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm26 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [8,9,9,0,0,0,1,1] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,0,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm29 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm29 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, 
%ymm15, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8,9,10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm20, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm4 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm25[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm1, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: 
vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7,8,9],ymm7[10],ymm0[11,12],ymm7[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm13, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm12 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2],xmm13[3,4],xmm1[5],xmm13[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2,3],xmm7[4],xmm15[5,6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = 
[2,1,3,2,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm11 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7,8,9],ymm7[10],ymm10[11,12],ymm7[13],ymm10[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm1 & (zmm0 ^ zmm24)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm15)) -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm26 ^ (zmm6 & (zmm12 ^ zmm26)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm8 -; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm28)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm17[0,0,1,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] 
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpermd (%rax), %zmm19, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm31)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm6 & (zmm10 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm23 ^ (zmm1 & (zmm21 ^ zmm23)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm30 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm27)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15] +; AVX512DQ-FCP-NEXT: # 
zmm6 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm26 ^ (zmm12 & (zmm7 ^ zmm26)) +; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm13 +; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm28)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm18[2,2,2,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm12 & (zmm3 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm17 & (zmm21 ^ zmm25)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm23)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm21)) +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm27)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $296, %rsp # imm = 0x128 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -12522,70 +12528,70 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2840, %rsp # imm = 0xB18 +; AVX512-NEXT: subq $2648, %rsp # imm = 0xA58 ; AVX512-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX512-NEXT: vmovdqa 96(%rdx), %ymm13 +; AVX512-NEXT: vmovdqa 96(%rdx), %ymm15 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm8 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm6, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm3 -; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm17 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm3 +; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm3 ; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm18 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm3, %ymm8, %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-NEXT: vpshufb %ymm12, %ymm7, %ymm4 +; AVX512-NEXT: vporq %ymm2, %ymm4, 
%ymm19 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] ; AVX512-NEXT: vmovdqa 64(%r9), %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 64(%rcx), %ymm3 -; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26 -; AVX512-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm27 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm29 +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm5 +; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm4 +; AVX512-NEXT: vmovdqa64 %ymm5, %ymm28 +; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 64(%rsi), %ymm3 -; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm22 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm5 +; AVX512-NEXT: vpshufb %ymm12, %ymm5, %ymm4 +; AVX512-NEXT: vmovdqa64 %ymm5, %ymm23 +; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%r9), %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa (%r8), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa (%r8), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%rcx), %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa (%rsi), 
%ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4 ; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm0 @@ -12594,28 +12600,28 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm0 +; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm0 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm9 ; AVX512-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512-NEXT: vpshufb %ymm12, %ymm0, %ymm10 +; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm10 ; AVX512-NEXT: vpor %ymm10, %ymm9, %ymm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] ; AVX512-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm9 -; AVX512-NEXT: vmovdqa64 %ymm10, %ymm31 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vmovdqa64 %ymm10, %ymm27 +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12633,12 +12639,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 96(%r9), %ymm9 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm10[3,3,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2] -; AVX512-NEXT: 
vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,2,2,2] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15] @@ -12649,18 +12654,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX512-NEXT: vprold $16, %ymm9, %ymm8 -; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 +; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6)) ; AVX512-NEXT: vmovdqa 96(%r8), %ymm6 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11) -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm12) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11) ; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm6 @@ -12673,33 +12677,30 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] ; AVX512-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512-NEXT: vpermd %zmm6, %zmm18, %zmm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX512-NEXT: vpermq {{.*#+}} 
ymm7 = ymm7[0,2,2,3] ; AVX512-NEXT: vpandn %ymm7, %ymm12, %ymm7 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX512-NEXT: vpbroadcastd 72(%rax), %ymm6 -; AVX512-NEXT: vpandnq %ymm6, %ymm28, %ymm6 +; AVX512-NEXT: vpandnq %ymm6, %ymm30, %ymm6 ; AVX512-NEXT: vmovdqa 64(%rax), %ymm7 ; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm7 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 64(%r9), %xmm7 ; AVX512-NEXT: vmovdqa 64(%r8), %xmm8 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm17 -; AVX512-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm18 +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm6 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 64(%rcx), %xmm9 @@ -12708,75 +12709,75 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm10 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm8 -; AVX512-NEXT: vpandnq %ymm8, %ymm28, %ymm8 -; AVX512-NEXT: vmovdqa (%rax), %ymm12 -; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm11, %ymm12, %ymm13 -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm12 +; AVX512-NEXT: 
vpandnq %ymm12, %ymm30, %ymm12 +; AVX512-NEXT: vmovdqa (%rax), %ymm13 +; AVX512-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm10, %ymm13, %ymm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa (%r9), %xmm6 -; AVX512-NEXT: vmovdqa (%r8), %xmm12 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512-NEXT: vmovdqa64 %xmm12, %xmm29 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512-NEXT: vmovdqa64 %xmm15, %xmm25 -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512-NEXT: vmovdqa (%r8), %xmm13 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512-NEXT: vmovdqa64 %xmm13, %xmm25 +; AVX512-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512-NEXT: vmovdqa64 %xmm14, %xmm31 +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rcx), %xmm15 ; AVX512-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512-NEXT: vmovdqa64 %xmm14, %xmm19 ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] ; AVX512-NEXT: vpermq {{.*#+}} 
ymm12 = ymm12[2,1,3,2] ; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%rax), %ymm15 -; AVX512-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX512-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512-NEXT: vpandnq %ymm12, %ymm19, %ymm12 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpandnq %ymm12, %ymm17, %ymm12 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm31, %ymm6 -; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm11 +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8,9,10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm27, %ymm14 +; AVX512-NEXT: vpshufb %ymm14, %ymm4, %ymm10 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] @@ -12806,49 +12807,47 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpermd %zmm15, %zmm18, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,1,1,1,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm29, %ymm2 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm31, %ymm11 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[2,2,3,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vprold $16, %ymm5, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] @@ -12860,8 +12859,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] @@ -12869,11 +12868,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] @@ -12884,8 +12883,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ 
-12896,379 +12895,384 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm0 -; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm1 -; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0)) -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] -; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX512-NEXT: vprold $16, %xmm10, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1)) -; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] -; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 -; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm3 -; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1)) -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpermt2d 
%zmm2, %zmm17, %zmm1 +; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512-NEXT: vmovdqa %xmm5, %xmm12 +; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX512-NEXT: vprold $16, %xmm21, %xmm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3)) -; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 -; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0)) +; AVX512-NEXT: vprold $16, %xmm8, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm14 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm2 ^ (zmm1 & (zmm14 ^ zmm2)) +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] +; AVX512-NEXT: vpermt2d %zmm4, %zmm5, %zmm2 +; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm23 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = 
[65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm18 & (zmm23 ^ zmm2)) +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512-NEXT: vpshufb %xmm12, %xmm15, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3],xmm4[4],xmm6[5,6],xmm4[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm4 +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX512-NEXT: vprold $16, %xmm19, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm1 & (zmm7 ^ zmm4)) +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm1 +; AVX512-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm18 & (zmm19 ^ zmm1)) ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vmovdqa %ymm3, %ymm11 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,1,3,2] +; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm1 +; AVX512-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512-NEXT: vprold $16, %ymm7, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw 
{{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX512-NEXT: vprold $16, %ymm8, %ymm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] ; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2] -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512-NEXT: vprold $16, %xmm10, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3] -; AVX512-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm3 -; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm1 -; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1)) -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm2[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm4[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm5[2,1,3,2] +; AVX512-NEXT: 
vmovdqa 32(%rdi), %xmm10 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512-NEXT: vprold $16, %xmm11, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm3[0,2,2,3] +; AVX512-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm4 +; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512-NEXT: vpermt2d %zmm3, %zmm17, %zmm2 +; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm2)) +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm1[2,1,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm4[0,0,1,1] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3)) -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[2,1,3,2] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = (zmm3 & zmm28) | mem ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 
32-byte Folded Reload -; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = (zmm0 & zmm28) | mem +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm1)) +; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = mem[2,1,3,2] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm31 & (ymm1 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm30 & (ymm20 ^ ymm1)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = (zmm1 & zmm30) | mem ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = (zmm4 & zmm30) | mem ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm30 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm2 ^ (zmm22 & (zmm30 ^ zmm2)) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm1 ^ (zmm2 & (zmm30 ^ zmm1)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ 
(zmm28 & (zmm23 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm22 & (zmm5 ^ zmm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm2 & (zmm5 ^ zmm4)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0)) -; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1)) +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm18 & (ymm0 ^ mem)) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload -; AVX512-NEXT: # zmm19 = zmm19 | (zmm1 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 # 64-byte Folded Reload +; AVX512-NEXT: # zmm18 = zmm18 | (zmm1 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] -; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = (zmm0 & zmm1) | mem -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm2[0,1,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1)) +; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = zmm0[0,1,2,3],mem[0,1,2,3] +; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1, %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = (zmm3 & zmm1) | mem +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm20[0,1,2,3],zmm21[0,1,2,3] +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512-NEXT: vpermd %zmm2, %zmm21, %zmm20 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm4 & (zmm20 ^ zmm1)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512-NEXT: vpermd %zmm0, %zmm21, %zmm21 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm4 & (zmm21 ^ zmm1)) ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm1 & (zmm23 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm7)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm1 ^ (zmm14 & (zmm7 ^ zmm1)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-NEXT: vpermd 64(%rax), %zmm14, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1 -; AVX512-NEXT: vpermd (%rax), 
%zmm14, %zmm14 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22)) +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm28, %zmm25 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm1 ^ (zmm14 & (zmm25 ^ zmm1)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 32-byte Folded Reload +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-NEXT: vpermd %zmm2, %zmm28, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm14)) +; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm29, %zmm2 +; AVX512-NEXT: vpermd %zmm0, %zmm28, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm2)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm25)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm1)) -; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[0,1,1,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1)) -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1)) -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = mem[2,2,2,3] -; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; 
AVX512-NEXT: # xmm9 = mem[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] -; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512-NEXT: # ymm10 = mem[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512-NEXT: # ymm21 = mem[0,0,1,1] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm0)) +; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (zmm9 & (zmm7 ^ zmm0)) +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm9 & (zmm12 ^ zmm0)) +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX512-NEXT: # ymm8 = mem[2,2,2,3] +; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX512-NEXT: # xmm10 = mem[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512-NEXT: # ymm11 = mem[2,3,3,3,6,7,7,7] +; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512-NEXT: # ymm25 = mem[0,0,2,1] +; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512-NEXT: # xmm4 = mem[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm4[0,0,1,3] +; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512-NEXT: # ymm17 = mem[0,0,1,1] ; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX512-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512-NEXT: # ymm26 = mem[2,3,3,3,6,7,7,7] -; 
AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512-NEXT: # ymm27 = mem[0,0,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512-NEXT: # ymm26 = mem[2,2,2,3] +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512-NEXT: # ymm28 = mem[2,3,3,3,6,7,7,7] +; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512-NEXT: # ymm29 = mem[0,0,2,1] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2] -; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7)) -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2] -; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8)) -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2)) +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm11[2,1,3,2] +; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm11 +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm8)) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm26, %zmm4 +; AVX512-NEXT: vpermq 
{{.*#+}} ymm8 = ymm28[2,1,3,2] +; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm11 +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm31 & (zmm8 ^ zmm4)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm4 & (zmm10 ^ zmm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm12)) +; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm22 & (zmm7 ^ zmm4)) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm22 & (zmm4 ^ zmm0)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm16)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm2)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 832(%rax) -; AVX512-NEXT: addq $2840, %rsp # imm = 0xB18 +; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512-NEXT: addq $2648, %rsp # imm = 0xA58 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1576, %rsp # imm = 0x628 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm16 +; AVX512-FCP-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm4 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm13 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm9 ; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 ; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 @@ -13276,10 +13280,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 ; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0 @@ -13291,631 +13295,637 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm9 -; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm11 -; AVX512-FCP-NEXT: vporq %ymm11, %ymm10, %ymm20 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm11 +; AVX512-FCP-NEXT: vporq %ymm11, %ymm6, %ymm21 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] ; 
AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (mem & (zmm12 ^ zmm10)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11)) ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm14) -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm6 -; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm14) +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & ymm16) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm10)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512-FCP-NEXT: vprold $16, %ymm11, %ymm10 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512-FCP-NEXT: vprold $16, %ymm6, %ymm11 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm16 & (ymm10 ^ ymm12)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm16 & (ymm11 ^ ymm12)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm11[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7] -; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpandn %ymm10, %ymm14, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm12 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm24 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,2,3,10,9,11,11] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm22 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] +; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm4 +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpandn %ymm11, %ymm14, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm30, 
%zmm4
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm19 & (zmm4 ^ zmm2))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,0,0,0,6,0,0,6]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15]
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [218894094,0,488382238,488382238]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm16
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm24
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm11 & (zmm1 ^ zmm2))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vpbroadcastd 72(%rax), %ymm1
+; AVX512-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm2
 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vpbroadcastd 72(%rax), %ymm4
-; AVX512-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm5
-; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm4
-; AVX512-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm5
-; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm20, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1
+; AVX512-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2
 ; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm1
 ; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm19 & (zmm0 ^ zmm5))
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm3
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm11 & (zmm0 ^ zmm2))
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vprold $16, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vprold $16, %ymm10, %ymm2
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm16
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15]
+; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm4))
+; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm2))
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm19
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm18
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (mem & (zmm6 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12
-; AVX512-FCP-NEXT: vprold $16, %ymm25, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm21
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vprold $16, %ymm25, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm25
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm28
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermd 64(%rax), %zmm20, %zmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm3))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm0
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm3
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm23, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm3))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1
 ; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,9,9,0,0,0,1,1]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm1
 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm1 & (zmm3 ^ zmm0))
-; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
 ; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,8,9,9,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
+; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm3
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm3 & (zmm5 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm1
+; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,0]
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm1
 ; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6
 ; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm29
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm21 & (zmm29 ^ zmm0))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm5))
 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8
 ; AVX512-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10
-; AVX512-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm5
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10
-; AVX512-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm1 & (zmm8 ^ zmm6))
-; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm27
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm3
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4
+; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm6
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm3 & (zmm8 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm24
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm3
+; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5
 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm26
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm1))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ zmm8))
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm11
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm30
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm11 & (zmm30 ^ zmm3))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm1 & (zmm30 ^ zmm8))
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm1
 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
-; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm3
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm1
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1
+; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm11
 ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,8,9,9,11]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,0]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7,8,9,10],ymm5[11],ymm8[12,13],ymm5[14],ymm8[15]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm3 ^ (mem & (zmm1 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm2
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm8
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm30 & (zmm8 ^ zmm5))
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vpermd (%rax), %zmm20, %zmm20
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm3))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm0))
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm17 & (zmm8 ^ zmm12))
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15]
+; AVX512-FCP-NEXT: vpermd %zmm16, %zmm23, %zmm16
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm3))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1))
 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm24
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm15
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm21
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm14
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm30 & (zmm15 ^ zmm0))
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [6,7,3,3,7,7,6,7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm23
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm30 & (zmm23 ^ zmm7))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm31 & (zmm23 ^ zmm8))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm7
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm10
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm25
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm14
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm5
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm18
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm15
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm17 & (zmm14 ^ zmm0))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [6,7,3,3,7,7,6,7]
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm24 & (zmm19 ^ zmm12))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm8))
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512-FCP-NEXT: vprold $16, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm8
 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm16
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm18 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm31
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,1,8,8,9,0]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm0
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm22 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm13
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm21
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm0 ^ (zmm26 & (zmm11 ^ zmm0))
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm30 & (ymm14 ^ ymm0))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm24 & (ymm20 ^ ymm0))
 ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm6
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm6
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm10
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,0,1,1]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vmovdqa %xmm8, %xmm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm10[0,0,1,1]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm17
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm30 & (zmm10 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [8,9,9,0,0,0,1,1]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm3
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,0,1,1]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm27
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,0,1,1]
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ zmm1))
 ; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm17
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm19
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm17 & (zmm19 ^ zmm6))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm15))
+; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm24 & (zmm15 ^ zmm3))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm14))
 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm13))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512-FCP-NEXT: vprold $16, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm6
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm11 ^ (zmm28 & (zmm6 ^ zmm11))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm5
-; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4
-; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm5))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm6))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm11))
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3],xmm3[4],xmm11[5,6],xmm3[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm9 ^ (zmm26 & (zmm3 ^ zmm9))
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm9
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4
+; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm2
+; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm24 & (zmm2 ^ zmm4))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm3))
 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm5 & (ymm9 ^ ymm14))
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm0 = (zmm0 & zmm5) | mem
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm2 = (zmm2 & zmm5) | mem
-; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm1 & (ymm5 ^ ymm0))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm4 & (ymm6 ^ ymm20))
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm0 = (zmm0 & zmm4) | mem
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = (zmm4 & zmm5) | mem
 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm6, %xmm6 # 16-byte Folded Reload
 ; AVX512-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm9 = mem[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermd %ymm11, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpandn %ymm9, %ymm13, %ymm9
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9
-; AVX512-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm12
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpandn %ymm13, %ymm14, %ymm13
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm28 & (zmm3 ^ zmm11))
 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm26 & (zmm6 ^ zmm5))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm26 & (zmm8 ^ zmm5))
 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm5 & (zmm3 ^ zmm0))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm5 & (zmm6 ^ zmm2))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm5 & (zmm6 ^ zmm0))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm5 & (zmm8 ^ zmm4))
 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (mem & (zmm4 ^ zmm0))
 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
 ; AVX512-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem)
-; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm9 = zmm9 | (zmm1 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm2))
+; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm12 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm12 = zmm12 | (zmm1 & mem)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm4))
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, (%rax)
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 704(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 640(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax)
 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512-FCP-NEXT: vmovaps %zmm1, 576(%rax)
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 832(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 832(%rax)
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax)
-; AVX512-FCP-NEXT: addq $1576, %rsp # imm = 0x628
+; AVX512-FCP-NEXT: addq $1544, %rsp # imm = 0x608
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: store_i16_stride7_vf64:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: subq $2840, %rsp # imm = 0xB18
+; AVX512DQ-NEXT: subq $2648, %rsp # imm = 0xA58
 ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm6
-; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm13
+; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm15
 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7
 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm8
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm6, %ymm2
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm3
-; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm17
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm3
 ; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm18
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm8, %ymm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm7, %ymm4
+; AVX512DQ-NEXT: vporq %ymm2, %ymm4, %ymm19
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
 ; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm4
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm27
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm29
+; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm5
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm28
+; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm5
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm23
+; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa (%r8), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2
 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2
 ;
AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 @@ -13924,28 +13934,28 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm9 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm0, %ymm10 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm10 ; AVX512DQ-NEXT: vpor %ymm10, %ymm9, %ymm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] ; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm9 -; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm31 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm27 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13963,12 +13973,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm9 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm10[3,3,3,3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,2,2,2] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15] @@ 
-13979,18 +13988,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX512DQ-NEXT: vprold $16, %ymm9, %ymm8 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm9 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6)) ; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm6 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm12) ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11) ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm6 @@ -14003,33 +14011,30 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7] ; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] ; AVX512DQ-NEXT: vpandn %ymm7, %ymm12, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm19 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = 
[65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] ; AVX512DQ-NEXT: vpbroadcastd 72(%rax), %ymm6 -; AVX512DQ-NEXT: vpandnq %ymm6, %ymm28, %ymm6 +; AVX512DQ-NEXT: vpandnq %ymm6, %ymm30, %ymm6 ; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm7 ; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm7 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm7 ; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm8 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm17 -; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm20 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm6, %xmm6 +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm18 +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm22 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm9 @@ -14038,75 +14043,75 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm14 -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm10 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm8 -; AVX512DQ-NEXT: vpandnq %ymm8, %ymm28, %ymm8 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm12 -; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm12, %ymm13 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm12 +; AVX512DQ-NEXT: vpandnq %ymm12, %ymm30, %ymm12 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13 +; AVX512DQ-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm13, %ymm13 +; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm29 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512DQ-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm25 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1] -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm13 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm25 +; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 +; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm31 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm15 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm19 ; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] ; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm15 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm15, %ymm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-NEXT: vpandnq %ymm12, %ymm19, %ymm12 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpandnq %ymm12, %ymm17, %ymm12 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm6 -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm11 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8,9,10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm14 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm4, %ymm10 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] @@ -14136,49 +14141,47 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpermd %zmm15, %zmm18, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm12 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm6 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,1,1,1,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm2 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vprold $16, %ymm5, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] @@ -14190,8 +14193,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] @@ -14199,11 +14202,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm12 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] @@ -14214,8 +14217,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm2 ; 
AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14226,379 +14229,384 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm0 -; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm1 -; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0)) -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm9, %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 -; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm3 -; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1)) -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm8, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm1 +; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm12 +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm9, %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5 -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX512DQ-NEXT: vprold $16, %xmm21, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3)) -; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 -; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0)) +; AVX512DQ-NEXT: vprold $16, %xmm8, %xmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm2 ^ (zmm1 & (zmm14 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpshuflw 
{{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm5, %zmm2 +; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm18 & (zmm23 ^ zmm2)) +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm15, %xmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3],xmm4[4],xmm6[5,6],xmm4[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX512DQ-NEXT: vprold $16, %xmm19, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm1 & (zmm7 ^ zmm4)) +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm2 +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm18 & (zmm19 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = 
ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,1,3,2] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm3, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] -; AVX512DQ-NEXT: vprold $16, %ymm7, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = 
ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] +; AVX512DQ-NEXT: vprold $16, %ymm8, %ymm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] ; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3] -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm3 -; AVX512DQ-NEXT: 
vpshufb %xmm12, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm27 -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm1 -; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2 -; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm2[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm4[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm5[2,1,3,2] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-NEXT: vprold $16, %xmm11, %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm3[0,2,2,3] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm1[2,1,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm4[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3)) -; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[2,1,3,2] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = 
[65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = (zmm3 & zmm28) | mem ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = (zmm0 & zmm28) | mem +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = mem[2,1,3,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm31 & (ymm1 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm30 & (ymm20 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = (zmm1 & zmm30) | mem ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = (zmm4 & zmm30) | mem ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte 
Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm28 & (zmm23 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0)) -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm19 = zmm19 | (zmm1 & mem) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] -; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = (zmm0 & zmm1) | mem -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm2[0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm30 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm2 ^ (zmm22 & (zmm30 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm1 ^ (zmm2 & (zmm30 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5)) +; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm22 & (zmm5 ^ zmm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm2 & (zmm5 ^ zmm4)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1)) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm18 & (ymm0 ^ mem)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm18 = zmm18 | (zmm1 & mem) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = zmm0[0,1,2,3],mem[0,1,2,3] +; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = (zmm3 & zmm1) | mem +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm20[0,1,2,3],zmm21[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm21, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm4 & (zmm20 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm21, %zmm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm4 & (zmm21 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm1 & (zmm23 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm7)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm1 ^ (zmm14 & (zmm7 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-NEXT: vpermd 64(%rax), %zmm14, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1 -; AVX512DQ-NEXT: vpermd (%rax), %zmm14, %zmm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm28, %zmm25 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm1 ^ (zmm14 & (zmm25 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm28, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm14)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm29, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm0, %zmm28, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm25)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ 
zmm1)) -; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1)) -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm7 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm9 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3] -; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm10 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm16 = mem[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm21 = mem[0,0,1,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm0)) +; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (zmm9 & (zmm7 ^ zmm0)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm9 & (zmm12 ^ zmm0)) +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 
32-byte Folded Reload +; AVX512DQ-NEXT: # ymm8 = mem[2,2,2,3] +; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm10 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm11 = mem[2,3,3,3,6,7,7,7] +; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm25 = mem[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm4 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm4[0,0,1,3] +; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm17 = mem[0,0,1,1] ; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX512DQ-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm24 = mem[2,2,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] -; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm26 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm27 = mem[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm26 = mem[2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] +; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm28 = mem[2,3,3,3,6,7,7,7] +; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm29 = mem[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2] -; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2] -; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm11[2,1,3,2] +; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm11 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm8)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm26, %zmm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm28[2,1,3,2] +; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm11 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm31 & (zmm8 ^ zmm4)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm4 & (zmm10 ^ zmm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm12)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm22 & (zmm7 ^ zmm4)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm22 & (zmm4 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm16)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm2)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rax) -; 
AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 832(%rax) -; AVX512DQ-NEXT: addq $2840, %rsp # imm = 0xB18 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 832(%rax) +; AVX512DQ-NEXT: addq $2648, %rsp # imm = 0xA58 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1576, %rsp # imm = 0x628 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm16 +; AVX512DQ-FCP-NEXT: subq $1544, %rsp # imm = 0x608 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm4 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 ; AVX512DQ-FCP-NEXT: 
vmovdqa 64(%r8), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 @@ -14606,10 +14614,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0 @@ -14621,562 +14629,568 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm9 -; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8 +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm9 -; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm11 -; AVX512DQ-FCP-NEXT: vporq %ymm11, %ymm10, %ymm20 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm11 +; AVX512DQ-FCP-NEXT: vporq %ymm11, %ymm6, %ymm21 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] ; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (mem & (zmm12 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm14) -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm14) +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & ymm16) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm21 -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm30 +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm10)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm11, %ymm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm6, %ymm11 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm16 & (ymm10 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm16 & (ymm11 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm11[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpandn %ymm10, %ymm14, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[3,3,3,3,7,7,7,7] 
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm24 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,2,3,10,9,11,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm22 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm4 +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm14, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm30, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm19 & (zmm4 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,0,0,0,6,0,0,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = 
[218894094,0,488382238,488382238] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm11 & (zmm1 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpbroadcastd 72(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7] -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpbroadcastd 72(%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[0,0,2,1,4,4,6,5] -; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,0,3,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm20, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm19 & (zmm0 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm11 & (zmm0 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm16 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,13,8,8,15,14,8,15,14,13,8,8,15,14,8,15] +; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm19 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm18 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (mem & (zmm6 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm21 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (mem & (zmm5 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vprold $16, %ymm25, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm25 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm28 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd 64(%rax), %zmm20, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] +; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq 
{{.*#+}} zmm16 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,10,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm1 & (zmm3 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,8,9,9,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm3 & (zmm5 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm18, %zmm1 ; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6 ; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm29 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm21 & (zmm29 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm1 & (zmm8 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm3 & (zmm8 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm24 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm26 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm30 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm11 & (zmm30 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm1 & (zmm30 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6] -; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm9[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7,8,9],ymm1[10],ymm6[11,12],ymm1[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,8,9,9,11] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] -; AVX512DQ-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7,8,9,10],ymm5[11],ymm8[12,13],ymm5[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm3 ^ (mem & (zmm1 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm30 & (zmm8 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpermd (%rax), %zmm20, %zmm20 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm0)) 
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm19, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 ^ (zmm17 & (zmm8 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7,8,9],ymm5[10],ymm0[11,12],ymm5[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm23, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm24 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm15 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm14 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm30 & (zmm15 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm23 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = 
[65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm30 & (zmm23 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm31 & (zmm23 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm17 & (zmm14 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm24 & (zmm19 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = 
[0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm2, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm16 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm18 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm12 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,0,0,1,8,8,9,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm22 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm26, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm0 ^ (zmm26 & (zmm11 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm30 & (ymm14 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm24 & (ymm20 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,0,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, %xmm5 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm10[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm17 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm30 & (zmm10 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[0,0,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm27 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm17 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm17 & (zmm19 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm15)) +; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm24 & (zmm15 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm14)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm11 ^ (zmm28 & (zmm6 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7 -; 
AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm1 & (zmm15 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3],xmm3[4],xmm11[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm9 ^ (zmm26 & (zmm3 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm24 & (zmm2 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm5 & (ymm9 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & zmm5) | 
mem -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 = (zmm2 & zmm5) | mem -; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm1 & (ymm5 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm4 & (ymm6 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm5[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & zmm4) | mem +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = (zmm4 & zmm5) | mem ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm9 = mem[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm11 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpandn %ymm9, %ymm13, %ymm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm12 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandn %ymm13, %ymm14, %ymm13 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} 
ymm8 = ymm8[2,2,2,3] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm28 & (zmm3 ^ zmm11)) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm26 & (zmm6 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm26 & (zmm8 ^ zmm5)) ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm5 & (zmm3 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm5 & (zmm6 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm5 & (zmm6 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm5 & (zmm8 ^ zmm4)) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (mem & (zmm4 ^ zmm0)) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm9 = zmm9 | (zmm1 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm12 = zmm12 | (zmm1 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm4)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, (%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 576(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 832(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax) -; AVX512DQ-FCP-NEXT: addq $1576, %rsp # imm = 0x628 +; AVX512DQ-FCP-NEXT: addq $1544, %rsp # imm = 0x608 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll index e74521d5463a4..21b98dbb3843e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -1696,20 +1696,20 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm6 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm8 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[2,3,6,7],zmm5[2,3,6,7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[4,5,6,7],zmm0[4,5,6,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[4,5,6,7],zmm1[4,5,6,7] ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%r8) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1733,20 +1733,20 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm1, %xmm6 ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm7 ; 
AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm8 -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm5 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm8 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[2,3,6,7],zmm5[2,3,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[4,5,6,7],zmm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[4,5,6,7],zmm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%r8) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -1770,20 +1770,20 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, %xmm6 ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm3, %ymm7 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, %xmm8 -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm5 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm8 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[2,3,6,7],zmm5[2,3,6,7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[4,5,6,7],zmm0[4,5,6,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[4,5,6,7],zmm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%r8) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -1807,20 +1807,20 @@ define 
void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm1, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[2,3,6,7],zmm5[2,3,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[4,5,6,7],zmm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[4,5,6,7],zmm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%r8) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%r8) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r8) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index d25f8cf6b0bca..302da6ef63796 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -4664,176 +4664,176 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride5_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm1, %ymm10, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm20 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] ; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm1 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm19 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm17 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] -; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm0 +; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm28 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512-NEXT: vmovdqa 
32(%rsi), %xmm10 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] -; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm20 +; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm19 ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512-NEXT: vpshufb %ymm7, %ymm15, %ymm0 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512-NEXT: vpshufb %ymm8, %ymm15, %ymm0 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm13 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] -; AVX512-NEXT: vpshufb %ymm3, %ymm12, %ymm1 +; AVX512-NEXT: vpshufb %ymm3, %ymm13, %ymm1 ; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm22 ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm0 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm7 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] -; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm1 +; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31 ; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm23 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm0 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] +; AVX512-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm14, %ymm5, %ymm0 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] ; AVX512-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] ; AVX512-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm5, %ymm10, %ymm1 +; AVX512-NEXT: vpshufb %ymm5, %ymm11, %ymm1 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] ; AVX512-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm26 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm10, %ymm12, %ymm1 +; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm11 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] +; 
AVX512-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm11, %ymm13, %ymm1
 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
 ; AVX512-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm11
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm25
+; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm9
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm25
 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
 ; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm11
+; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm9
 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
 ; AVX512-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm15, %ymm12, %ymm12
-; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm27
-; AVX512-NEXT: vmovdqa (%rcx), %ymm12
-; AVX512-NEXT: vpshufb %ymm7, %ymm12, %ymm7
-; AVX512-NEXT: vmovdqa (%rdx), %ymm11
-; AVX512-NEXT: vpshufb %ymm3, %ymm11, %ymm3
-; AVX512-NEXT: vporq %ymm7, %ymm3, %ymm16
-; AVX512-NEXT: vpshufb %ymm0, %ymm12, %ymm0
-; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm3
-; AVX512-NEXT: vporq %ymm0, %ymm3, %ymm17
+; AVX512-NEXT: vpshufb %ymm15, %ymm13, %ymm13
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm27
+; AVX512-NEXT: vmovdqa (%rcx), %ymm13
+; AVX512-NEXT: vpshufb %ymm8, %ymm13, %ymm8
+; AVX512-NEXT: vmovdqa (%rdx), %ymm9
+; AVX512-NEXT: vpshufb %ymm3, %ymm9, %ymm3
+; AVX512-NEXT: vporq %ymm8, %ymm3, %ymm16
+; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0
+; AVX512-NEXT: vpshufb %ymm15, %ymm9, %ymm3
+; AVX512-NEXT: vporq %ymm0, %ymm3, %ymm18
 ; AVX512-NEXT: vmovdqa (%rsi), %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm0
 ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512-NEXT: vmovdqa (%rdi), %ymm7
+; AVX512-NEXT: vmovdqa (%rdi), %ymm8
 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm15
-; AVX512-NEXT: vpshufb %ymm15, %ymm7, %ymm15
-; AVX512-NEXT: vporq %ymm0, %ymm15, %ymm18
-; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm0
+; AVX512-NEXT: vpshufb %ymm15, %ymm8, %ymm15
+; AVX512-NEXT: vpor %ymm0, %ymm15, %ymm15
+; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm0
 ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm4
-; AVX512-NEXT: vpshufb %ymm10, %ymm11, %ymm0
-; AVX512-NEXT: vpshufb %ymm1, %ymm12, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm1
-; AVX512-NEXT: vpshufb %ymm9, %ymm7, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm2
+; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-NEXT: vporq %ymm2, %ymm1, %ymm20
+; AVX512-NEXT: vpshufb %ymm14, %ymm8, %ymm1
 ; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm2
+; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm21
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-NEXT: vporq %ymm0, %ymm2, %ymm21
-; AVX512-NEXT: vmovdqa (%rsi), %xmm9
-; AVX512-NEXT: vmovdqa (%rcx), %xmm7
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm0
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vmovdqa (%rsi), %xmm8
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm2
-; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm2
-; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm10
-; AVX512-NEXT: vmovdqa (%rdx), %xmm0
-; AVX512-NEXT: vmovdqa 32(%r8), %ymm3
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm2
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm11
-; AVX512-NEXT: vpor %xmm2, %xmm11, %xmm11
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = mem[1,1,2,2]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512-NEXT: vpandnq %ymm15, %ymm28, %ymm15
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2
-; AVX512-NEXT: vmovdqa (%r8), %ymm15
-; AVX512-NEXT: vpshufb %ymm12, %ymm15, %ymm12
-; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,3,2]
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm15
-; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm6
+; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm3
+; AVX512-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4
+; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm9
+; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm9
+; AVX512-NEXT: vpor %xmm4, %xmm9, %xmm9
+; AVX512-NEXT: vmovdqa 32(%r8), %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
+; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = mem[1,1,2,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,1]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512-NEXT: vpandn %ymm13, %ymm14, %ymm13
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4
+; AVX512-NEXT: vmovdqa (%r8), %ymm13
+; AVX512-NEXT: vpshufb %ymm11, %ymm13, %ymm11
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,1,1,4,6,5,5]
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,3,2]
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
+; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6
 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm13
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,1]
-; AVX512-NEXT: vmovdqa64 (%r8), %zmm15
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512-NEXT: vpermd %zmm3, %zmm29, %zmm29
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17]
-; AVX512-NEXT: vpermi2d %zmm3, %zmm15, %zmm30
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; AVX512-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX512-NEXT: vinserti32x4 $2, %xmm10, %zmm3, %zmm3
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm20[0,0,1,1]
-; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1]
-; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm7
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm28)
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm17[2,2,3,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm16, %zmm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm18, %zmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm5 ^ (zmm8 & (zmm4 ^ zmm5))
-; AVX512-NEXT: vporq %zmm24, %zmm26, %zmm5
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm5
+; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm1, %zmm1
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,0,1,1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & zmm14)
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm18[2,2,3,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm7 & (zmm0 ^ zmm2))
+; AVX512-NEXT: vporq %zmm24, %zmm26, %zmm2
+; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,3,3,6,6,7,7]
+; AVX512-NEXT: vporq %zmm25, %zmm27, %zmm5
 ; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
-; AVX512-NEXT: vporq %zmm25, %zmm27, %zmm7
-; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm21[2,2,3,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm8 & (zmm5 ^ zmm1))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm7))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm4 & mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm5))
-; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5]
-; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
-; AVX512-NEXT: vpermd %zmm15, %zmm1, %zmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
-; AVX512-NEXT: vmovdqa64 %zmm12, 64(%r9)
-; AVX512-NEXT: vmovdqa64 %zmm1, (%r9)
-; AVX512-NEXT: vmovdqa64 %zmm30, 128(%r9)
-; AVX512-NEXT: vmovdqa64 %zmm29, 256(%r9)
-; AVX512-NEXT: vmovdqa64 %zmm2, 192(%r9)
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm20[2,2,3,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm21[2,2,3,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm7 & (zmm6 ^ zmm2))
+; AVX512-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15]
+; AVX512-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm0 & mem)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512-NEXT: vpermd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
+; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
+; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,1,4,4,5,5]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3))
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
+; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1))
+; AVX512-NEXT: vmovdqa64 %zmm11, 64(%r9)
+; AVX512-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r9)
+; AVX512-NEXT: vmovdqa64 %zmm7, 256(%r9)
+; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r9)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 ;
@@ -4915,53 +4915,52 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1
 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm2
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero
-; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9,u,11,u],zero,xmm3[10],zero,xmm3[12,u],zero
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm6
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm4
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm4[6],zero,xmm4[8,u],zero,xmm4[7],zero,xmm4[9],zero,xmm4[11,u],zero,xmm4[10],zero,xmm4[12]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[6],zero,xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[11],zero,xmm5[u,10],zero,xmm5[12],zero
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm8
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm10
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512-FCP-NEXT: vpandn %ymm11, %ymm12, %ymm11
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm9
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11
-; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512-FCP-NEXT: vpermd %zmm7, %zmm13, %zmm7
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm13
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,3,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm15
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512-FCP-NEXT: vpandn %ymm10, %ymm11, %ymm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm9
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,0,5,5,5,5,0,6]
+; AVX512-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10
+; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm10
+; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm10
+; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm14
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm1
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9]
 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm3
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
@@ -4973,7 +4972,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm5 & zmm12)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & zmm11)
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm27[2,2,3,3]
@@ -4986,190 +4985,192 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm3 & (zmm18 ^ zmm17))
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0))
 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm18))
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15]
 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1))
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9)
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9)
 ; AVX512-FCP-NEXT: vzeroupper
 ; AVX512-FCP-NEXT: retq
 ;
 ; AVX512DQ-LABEL: store_i8_stride5_vf64:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm11
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm20
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm1
 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
-; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm19
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm17
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm12, %xmm0
 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm28
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm13
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm13, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm1
 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29
-; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm20
+; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm19
 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm15
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm15, %ymm0
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm15, %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm13
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm12, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm13, %ymm1
 ; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm22
 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm6
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm0
 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm30
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm8
+; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm7
 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm1
 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31
 ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm23
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm0
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm5, %ymm0
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
 ; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm1
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
 ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm10, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm11, %ymm1
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm10, %ymm10
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm10, %zmm26
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm12, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm11
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm26
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm13, %ymm1
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
 ; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm11
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm11, %zmm25
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm9
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm25
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
 ; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm11
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm9
 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
 ; AVX512DQ-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm12, %ymm12
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm27
-; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm12
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm12, %ymm7
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm11
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm11, %ymm3
-; AVX512DQ-NEXT: vporq %ymm7, %ymm3, %ymm16
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm12, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm3
-; AVX512DQ-NEXT: vporq %ymm0, %ymm3, %ymm17
+; AVX512DQ-NEXT: vpshufb %ymm15, %ymm13, %ymm13
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm27
+; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm13
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm13, %ymm8
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm9
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm9, %ymm3
+; AVX512DQ-NEXT: vporq %ymm8, %ymm3, %ymm16
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm15, %ymm9, %ymm3
+; AVX512DQ-NEXT: vporq %ymm0, %ymm3, %ymm18
 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm7
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8
 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm15
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm7, %ymm15
-; AVX512DQ-NEXT: vporq %ymm0, %ymm15, %ymm18
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm7, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm15, %ymm8, %ymm15
+; AVX512DQ-NEXT: vpor %ymm0, %ymm15, %ymm15
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm0
 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm4
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm11, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm12, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm7, %ymm0
+; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-NEXT: vporq %ymm2, %ymm1, %ymm20
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm8, %ymm1
 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm2
+; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm21
 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-NEXT: vporq %ymm0, %ymm2, %ymm21
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm2
-; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm10
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0
-; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm11
-; AVX512DQ-NEXT: vpor %xmm2, %xmm11, %xmm11
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = mem[1,1,2,2]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512DQ-NEXT: vpandnq %ymm15, %ymm28, %ymm15
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2
-; AVX512DQ-NEXT: vmovdqa (%r8), %ymm15
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm15, %ymm12
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,3,2]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm15
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm3
+; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm9
+; AVX512DQ-NEXT: vpor %xmm4, %xmm9, %xmm9
+; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm4
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = mem[1,1,2,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,1,1]
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512DQ-NEXT: vpandn %ymm13, %ymm14, %ymm13
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4
+; AVX512DQ-NEXT: vmovdqa (%r8), %ymm13
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm13, %ymm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,2,1,1,4,6,5,5]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,3,2]
+; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm11
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm13
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,1]
-; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm15
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512DQ-NEXT: vpermd %zmm3, %zmm29, %zmm29
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [6,6,6,0,7,7,7,7,0,16,16,16,16,0,17,17]
-; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm15, %zmm30
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm10, %zmm3, %zmm3
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm20[0,0,1,1]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,0,1,1]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm7, %zmm7
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm28)
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm17[2,2,3,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm16, %zmm5
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm18, %zmm4
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm5 ^ (zmm8 & (zmm4 ^ zmm5))
-; AVX512DQ-NEXT: vporq %zmm24, %zmm26, %zmm5
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm5
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm5, %zmm3
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm19[0,0,1,1]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & zmm14)
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm18[2,2,3,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm16, %zmm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm7 & (zmm0 ^ zmm2))
+; AVX512DQ-NEXT: vporq %zmm24, %zmm26, %zmm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT: vporq %zmm25, %zmm27, %zmm5
 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,3,3,6,6,7,7]
-; AVX512DQ-NEXT: vporq %zmm25, %zmm27, %zmm7
-; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm21[2,2,3,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm8 & (zmm5 ^ zmm1))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm7))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm4 & mem)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm5))
-; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5]
-; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,1,1,4,4,5,5]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
-; AVX512DQ-NEXT: vpermd %zmm15, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
-; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%r9)
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9)
-; AVX512DQ-NEXT: vmovdqa64 %zmm30, 128(%r9)
-; AVX512DQ-NEXT: vmovdqa64 %zmm29, 256(%r9)
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%r9)
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm20[2,2,3,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm21[2,2,3,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm6, %zmm6
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm7 & (zmm6 ^ zmm2))
+; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm2
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm0 & mem)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
+; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,1,1,4,4,5,5]
+; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,1,4,4,5,5]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3))
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1))
+; AVX512DQ-NEXT: vmovdqa64 %zmm11, 64(%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r9)
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
@@ -5251,53 +5252,52 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1
 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm2
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero
-; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9,u,11,u],zero,xmm3[10],zero,xmm3[12,u],zero
+; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm4[6],zero,xmm4[8,u],zero,xmm4[7],zero,xmm4[9],zero,xmm4[11,u],zero,xmm4[10],zero,xmm4[12]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[6],zero,xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[11],zero,xmm5[u,10],zero,xmm5[12],zero
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm8
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm10
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm12, %ymm11
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm9
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11
-; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512DQ-FCP-NEXT: vpermd %zmm7, %zmm13, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm13
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,1,2,2,2,2,2,2]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX512DQ-FCP-NEXT: vpandn %ymm10, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [4,0,5,5,5,5,0,6]
+; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10
+; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm14
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm1
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9]
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm3
 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
@@ -5309,7 +5309,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm5 & zmm12)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & zmm11)
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm27[2,2,3,3]
@@ -5322,15 +5322,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm3 & (zmm18 ^ zmm17))
 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0))
 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm18))
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15]
 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1))
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9)
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%r9)
 ; AVX512DQ-FCP-NEXT: vzeroupper
 ; AVX512DQ-FCP-NEXT: retq
 ;
@@ -5503,149 +5505,151 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ;
 ; AVX512BW-FCP-LABEL: store_i8_stride5_vf64:
 ; AVX512BW-FCP: # %bb.0:
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm9
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm21
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
 ; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm11
+; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm10
 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm4
 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm4, %xmm5
-; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm13
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm6, %xmm8
+; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10
-; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512BW-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm22
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm4, %xmm15
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm20
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm8
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm8, %xmm16
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm15[0,0,1,1]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [3,3,3,0,4,4,4,4]
+; AVX512BW-FCP-NEXT: vpermd 32(%rdi), %ymm15, %ymm23
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm24
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
 ; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm22 {%k1}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm23 {%k1}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm22
 ; AVX512BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
-; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22
 ; AVX512BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k3}
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
-; AVX512BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm18, %zmm23, %zmm23
-; AVX512BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm23, %zmm8, %zmm8
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21
-; AVX512BW-FCP-NEXT: vporq %zmm8, %zmm21, %zmm8
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
-; AVX512BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
-; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k3}
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm21
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7]
-; AVX512BW-FCP-NEXT: vpermd %zmm21, %zmm22, %zmm21
-; AVX512BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
-; AVX512BW-FCP-NEXT: kmovq %rax, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4}
-; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm12, %xmm16
-; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm1 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm12, %xmm17
+; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm17, %xmm11
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm17, %xmm16
-; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm16, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm13, %xmm13
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
+; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm17 = zmm10[0,0,1,1,4,4,5,5]
+; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm20, %xmm11
+; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm18[0],xmm20[0],xmm18[1],xmm20[1],xmm18[2],xmm20[2],xmm18[3],xmm20[3],xmm18[4],xmm20[4],xmm18[5],xmm20[5],xmm18[6],xmm20[6],xmm18[7],xmm20[7]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm18, %zmm10
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5]
 ; AVX512BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
-; AVX512BW-FCP-NEXT: kmovq %rax, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4}
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
-; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT: kmovq %rax, %k3
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm10 {%k3}
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm17
 ; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
-; AVX512BW-FCP-NEXT: kmovq %rax, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm11 {%k4}
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm13
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm6
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm14
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7
-; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm13, %ymm7
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm17
-; AVX512BW-FCP-NEXT: vporq %ymm7, %ymm17, %ymm7
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm7
-; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18
-; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm18, %ymm15
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
-; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2}
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
-; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: kmovq %rax, %k3
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm10 {%k3}
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm17
+; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm17, %ymm13
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm18
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm18, %ymm14
+; AVX512BW-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm17, %ymm19
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm18, %ymm21
+; AVX512BW-FCP-NEXT: vporq %ymm19, %ymm21, %ymm19
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,2,3,3]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm21, %ymm22
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm24, %ymm25
+; AVX512BW-FCP-NEXT: vporq %ymm22, %ymm25, %ymm22
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3]
+; AVX512BW-FCP-NEXT: vpermd %ymm24, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm21, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm15 {%k2}
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
 ;
AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm15 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %zmm19, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %zmm23, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vporq %zmm9, %zmm7, %zmm7 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm3[4,5,6,7],zmm5[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %zmm14, %zmm9, %zmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[4,5,6,7],zmm3[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %zmm20, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vporq %zmm9, %zmm3, %zmm3 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7] +; AVX512BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512BW-FCP-NEXT: kmovq %rax, %k2 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,ymm13[26],zero,ymm13[28],zero,ymm13[30],zero,zero,ymm13[29],zero,ymm13[31],zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm17[27],zero,zero,ymm17[26],zero,ymm17[28],zero,ymm17[30],zero,zero,ymm17[29],zero,ymm17[31],zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm21[26],zero,ymm21[28],zero,zero,zero,zero,ymm21[29],zero,ymm21[31],zero,zero,ymm21[30] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm24[26],zero,ymm24[28],zero,zero,ymm24[27],zero,ymm24[29],zero,ymm24[31],zero,zero,ymm24[30],zero +; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 -; 
AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm2, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero -; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -5819,149 +5823,151 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf64: ; AVX512DQ-BW-FCP: # %bb.0: +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} 
xmm11 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm6, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm4, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm8, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm15[0,0,1,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [3,3,3,0,4,4,4,4] +; AVX512DQ-BW-FCP-NEXT: vpermd 32(%rdi), %ymm15, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm23 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm22 ; AVX512DQ-BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm15, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = 
[9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22 ; AVX512DQ-BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k3} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm18, %zmm23, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm23, %zmm8, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vporq %zmm8, %zmm21, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm21, %zmm22, %zmm21 -; AVX512DQ-BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm12, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm1 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm12, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm17, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] 
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm17, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm16, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm17 = zmm10[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm20, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm18[0],xmm20[0],xmm18[1],xmm20[1],xmm18[2],xmm20[2],xmm18[3],xmm20[3],xmm18[4],xmm20[4],xmm18[5],xmm20[5],xmm18[6],xmm20[6],xmm18[7],xmm20[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm18, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,1,1,4,4,5,5] ; AVX512DQ-BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm11 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm13, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm7, %ymm17, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm18, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} 
zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm17, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm18, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm17, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm18, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm19, %ymm21, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm19[2,2,3,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm21, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm24, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm22, %ymm25, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,3,3] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm24, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm21, %ymm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm15 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13 ; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm15 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm9[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm19, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm23, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vporq %zmm9, %zmm7, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm3[4,5,6,7],zmm5[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm14, %zmm9, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[4,5,6,7],zmm3[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb 
%zmm20, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vporq %zmm9, %zmm3, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[27],zero,zero,ymm14[26],zero,ymm14[28],zero,ymm14[30],zero,zero,ymm14[29],zero,ymm14[31],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[27],zero,zero,ymm13[26],zero,ymm13[28],zero,ymm13[30],zero,zero,ymm13[29],zero,ymm13[31],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm18[27],zero,zero,ymm18[26],zero,ymm18[28],zero,ymm18[30],zero,zero,ymm18[29],zero,ymm18[31],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm17[27],zero,zero,ymm17[26],zero,ymm17[28],zero,ymm17[30],zero,zero,ymm17[29],zero,ymm17[31],zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm21[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm21[26],zero,ymm21[28],zero,zero,zero,zero,ymm21[29],zero,ymm21[31],zero,zero,ymm21[30] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm24[26],zero,ymm24[28],zero,zero,ymm24[27],zero,ymm24[29],zero,ymm24[31],zero,zero,ymm24[30],zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm2, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] ; 
AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r9) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 6205be83f5123..5c30c4cdf94d3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -6359,210 +6359,200 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512BW-LABEL: store_i8_stride6_vf64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vpshufb %ymm6, %ymm3, %ymm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512BW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = 
[5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vpshufb %ymm1, %ymm12, %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512BW-NEXT: vpshufb %ymm1, %ymm13, %ymm8 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[4],ymm0[4],ymm8[5],ymm0[5],ymm8[6],ymm0[6],ymm8[7],ymm0[7],ymm8[16],ymm0[16],ymm8[17],ymm0[17],ymm8[18],ymm0[18],ymm8[19],ymm0[19],ymm8[20],ymm0[20],ymm8[21],ymm0[21],ymm8[22],ymm0[22],ymm8[23],ymm0[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-NEXT: vpshufb %ymm11, %ymm3, %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512BW-NEXT: vpshufb %ymm11, %ymm4, %ymm10 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm9, %ymm1, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 -; AVX512BW-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm10 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-NEXT: vpermw %ymm10, %ymm12, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512BW-NEXT: kmovd %r10d, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm7[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512BW-NEXT: kmovd %r10d, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm9, 
%zmm0 {%k2} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm6[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208 +; AVX512BW-NEXT: kmovq %r10, %k3 +; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512BW-NEXT: vpshufb %ymm5, %ymm9, %ymm13 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512BW-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] +; AVX512BW-NEXT: vpermw %ymm13, %ymm8, %ymm8 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm8 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512BW-NEXT: vpshufb %ymm11, %ymm16, %ymm5 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %xmm20 -; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm18 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %xmm14, %xmm18, %xmm5 +; AVX512BW-NEXT: vpshufb %ymm11, %ymm17, %ymm13 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[16],ymm5[16],ymm13[17],ymm5[17],ymm13[18],ymm5[18],ymm13[19],ymm5[19],ymm13[20],ymm5[20],ymm13[21],ymm5[21],ymm13[22],ymm5[22],ymm13[23],ymm5[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512BW-NEXT: vpermw %ymm13, %ymm12, %ymm12 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm5 {%k1} +; AVX512BW-NEXT: vmovdqa (%r8), %ymm13 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa (%r9), %ymm12 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] +; 
AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} +; AVX512BW-NEXT: vmovdqa64 (%rsi), %xmm21 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm22 -; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512BW-NEXT: vpshufb %xmm14, %xmm21, %xmm9 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm21[0],xmm18[0],xmm21[1],xmm18[1],xmm21[2],xmm18[2],xmm21[3],xmm18[3],xmm21[4],xmm18[4],xmm21[5],xmm18[5],xmm21[6],xmm18[6],xmm21[7],xmm18[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm26, %zmm5 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512BW-NEXT: vmovdqa64 (%r8), %xmm25 -; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm23 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm15 = xmm23[0],zero,xmm23[1],zero,xmm23[2],zero,xmm23[3],zero,xmm23[4],zero,xmm23[5],zero,xmm23[6],zero,xmm23[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm23[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm19 = xmm19[0],zero,xmm19[1],zero,xmm19[2],zero,xmm19[3],zero,xmm19[4],zero,xmm19[5],zero,xmm19[6],zero,xmm19[7],zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm29, %zmm15 -; AVX512BW-NEXT: vmovdqa64 (%r9), %xmm27 -; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm24 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm24[2,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm29, %zmm19 -; AVX512BW-NEXT: vpshufb %xmm14, %xmm20, %xmm28 -; AVX512BW-NEXT: vpshufb %xmm14, %xmm22, %xmm14 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm14[8],xmm28[8],xmm14[9],xmm28[9],xmm14[10],xmm28[10],xmm14[11],xmm28[11],xmm14[12],xmm28[12],xmm14[13],xmm28[13],xmm14[14],xmm28[14],xmm14[15],xmm28[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm22[0],xmm20[0],xmm22[1],xmm20[1],xmm22[2],xmm20[2],xmm22[3],xmm20[3],xmm22[4],xmm20[4],xmm22[5],xmm20[5],xmm22[6],xmm20[6],xmm22[7],xmm20[7] -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm26, %zmm14 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm25[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm29, %zmm26 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm27[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm30 = xmm27[2,1,2,3] +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512BW-NEXT: vpshufb %xmm20, %xmm8, %xmm14 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] +; AVX512BW-NEXT: 
vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm6 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %xmm25 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512BW-NEXT: vpshufb %xmm11, %xmm14, %xmm15 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm26 +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm18 +; AVX512BW-NEXT: vpshufb %xmm11, %xmm18, %xmm19 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm14[0],xmm18[1],xmm14[1],xmm18[2],xmm14[2],xmm18[3],xmm14[3],xmm18[4],xmm14[4],xmm18[5],xmm14[5],xmm18[6],xmm14[6],xmm18[7],xmm14[7] +; AVX512BW-NEXT: vprold $16, %xmm19, %xmm19 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[0,0,0,1,4,4,4,5] +; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm6 {%k2} +; AVX512BW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm19 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero,xmm15[4],zero,xmm15[5],zero,xmm15[6],zero,xmm15[7],zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm24 = xmm24[0],zero,xmm24[1],zero,xmm24[2],zero,xmm24[3],zero,xmm24[4],zero,xmm24[5],zero,xmm24[6],zero,xmm24[7],zero +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm28, %zmm19 +; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm6 {%k1} +; AVX512BW-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm30 = xmm19[2,1,2,3] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm30[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm29, %zmm28 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm29 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm30 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm30[8],xmm11[8],xmm30[9],xmm11[9],xmm30[10],xmm11[10],xmm30[11],xmm11[11],xmm30[12],xmm11[12],xmm30[13],xmm11[13],xmm30[14],xmm11[14],xmm30[15],xmm11[15] 
-; AVX512BW-NEXT: vpermt2w %zmm13, %zmm31, %zmm12 -; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm13[0,1,2,3],zmm4[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm31 = zmm31[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm31 = zmm31[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: movl $-1840700270, %eax # imm = 0x92492492 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm31 -; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm7[0,1,2,3],zmm31[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm10 {%k3} -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512BW-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[16],ymm1[16],ymm6[17],ymm1[17],ymm6[18],ymm1[18],ymm6[19],ymm1[19],ymm6[20],ymm1[20],ymm6[21],ymm1[21],ymm6[22],ymm1[22],ymm6[23],ymm1[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm8[8],ymm0[9],ymm8[9],ymm0[10],ymm8[10],ymm0[11],ymm8[11],ymm0[12],ymm8[12],ymm0[13],ymm8[13],ymm0[14],ymm8[14],ymm0[15],ymm8[15],ymm0[24],ymm8[24],ymm0[25],ymm8[25],ymm0[26],ymm8[26],ymm0[27],ymm8[27],ymm0[28],ymm8[28],ymm0[29],ymm8[29],ymm0[30],ymm8[30],ymm0[31],ymm8[31] -; AVX512BW-NEXT: vpermw %ymm6, %ymm2, %ymm6 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vpshufb %ymm1, %ymm16, %ymm6 -; AVX512BW-NEXT: vpshufb %ymm1, %ymm17, %ymm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm6, %ymm1, %ymm6 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm6 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = 
zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r9), %ymm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm31, %zmm4 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm6 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[4],ymm8[4],ymm0[5],ymm8[5],ymm0[6],ymm8[6],ymm0[7],ymm8[7],ymm0[16],ymm8[16],ymm0[17],ymm8[17],ymm0[18],ymm8[18],ymm0[19],ymm8[19],ymm0[20],ymm8[20],ymm0[21],ymm8[21],ymm0[22],ymm8[22],ymm0[23],ymm8[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm22[8],xmm20[8],xmm22[9],xmm20[9],xmm22[10],xmm20[10],xmm22[11],xmm20[11],xmm22[12],xmm20[12],xmm22[13],xmm20[13],xmm22[14],xmm20[14],xmm22[15],xmm20[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm4 -; AVX512BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512BW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512BW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm21[8],xmm18[8],xmm21[9],xmm18[9],xmm21[10],xmm18[10],xmm21[11],xmm18[11],xmm21[12],xmm18[12],xmm21[13],xmm18[13],xmm21[14],xmm18[14],xmm21[15],xmm18[15] -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm17 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm8 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm1, %ymm16, %ymm16 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm27[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm16, %zmm18, %zmm20 -; AVX512BW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm18, %zmm13 -; AVX512BW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm18, %zmm1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm30, %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm30[0],xmm11[0],xmm30[1],xmm11[1],xmm30[2],xmm11[2],xmm30[3],xmm11[3],xmm30[4],xmm11[4],xmm30[5],xmm11[5],xmm30[6],xmm11[6],xmm30[7],xmm11[7] -; AVX512BW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm5 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm5 {%k1} -; AVX512BW-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm0 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512BW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm14 {%k2} -; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqu8 %zmm28, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqu16 %zmm29, %zmm4 {%k1} -; AVX512BW-NEXT: movl $1227133513, %eax # imm = 0x49249249 -; AVX512BW-NEXT: kmovd %eax, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm4 {%k2} -; AVX512BW-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm20, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm17 {%k3} -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-NEXT: vpermt2w %zmm30, %zmm28, %zmm29 +; AVX512BW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 +; AVX512BW-NEXT: kmovq %rcx, %k3 +; AVX512BW-NEXT: vmovdqu8 %zmm29, %zmm6 {%k3} +; AVX512BW-NEXT: vpshufb %xmm20, %xmm21, %xmm29 +; AVX512BW-NEXT: vpshufb %xmm20, %xmm22, %xmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm20[8],xmm29[8],xmm20[9],xmm29[9],xmm20[10],xmm29[10],xmm20[11],xmm29[11],xmm20[12],xmm29[12],xmm20[13],xmm29[13],xmm20[14],xmm29[14],xmm20[15],xmm29[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-NEXT: vpermt2w %zmm29, %zmm27, %zmm20 +; AVX512BW-NEXT: vpshufb %xmm11, %xmm25, %xmm27 +; AVX512BW-NEXT: vpshufb %xmm11, %xmm26, %xmm11 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm27[0],xmm11[1],xmm27[1],xmm11[2],xmm27[2],xmm11[3],xmm27[3],xmm11[4],xmm27[4],xmm11[5],xmm27[5],xmm11[6],xmm27[6],xmm11[7],xmm27[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = 
xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] +; AVX512BW-NEXT: vprold $16, %xmm27, %xmm27 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,0,1,4,4,4,5] +; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm20 {%k2} +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm23[0],zero,xmm23[1],zero,xmm23[2],zero,xmm23[3],zero,xmm23[4],zero,xmm23[5],zero,xmm23[6],zero,xmm23[7],zero +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm27 = xmm23[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm27 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero,xmm27[4],zero,xmm27[5],zero,xmm27[6],zero,xmm27[7],zero +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm20 {%k1} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm27 = xmm24[2,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm27[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm11 +; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm20 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm16 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm10 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm10 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512BW-NEXT: vpshufb %ymm9, %ymm13, %ymm13 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 +; AVX512BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 +; AVX512BW-NEXT: kmovd %ecx, %k2 +; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm10 {%k2} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512BW-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = 
xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm21, %zmm16 +; AVX512BW-NEXT: movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082 +; AVX512BW-NEXT: kmovq %rcx, %k3 +; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm10 {%k3} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm14[8],xmm18[9],xmm14[9],xmm18[10],xmm14[10],xmm18[11],xmm14[11],xmm18[12],xmm14[12],xmm18[13],xmm14[13],xmm18[14],xmm14[14],xmm18[15],xmm14[15] +; AVX512BW-NEXT: vpermt2w %zmm3, %zmm17, %zmm4 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512BW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k2} +; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512BW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm19[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 @@ -6573,25 +6563,25 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm24 ; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm24, %xmm11 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm13, %zmm11 +; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm13, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm19, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm19, %xmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm22, %xmm12 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm10 +; 
AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm17 = [8,9,0,0,0,5,6,7] @@ -6641,57 +6631,57 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm31, %ymm29 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] ; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm0, %zmm25 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm26, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm26, %zmm9 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512BW-FCP-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm19 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm0, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm28 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm28, %ymm1 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm0, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm19 +; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm19, %ymm1 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm1, %ymm26 +; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm1, %ymm1 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm0, %zmm20 +; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm23 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm23[0],ymm0[0],ymm23[1],ymm0[1],ymm23[2],ymm0[2],ymm23[3],ymm0[3],ymm23[4],ymm0[4],ymm23[5],ymm0[5],ymm23[6],ymm0[6],ymm23[7],ymm0[7],ymm23[16],ymm0[16],ymm23[17],ymm0[17],ymm23[18],ymm0[18],ymm23[19],ymm0[19],ymm23[20],ymm0[20],ymm23[21],ymm0[21],ymm23[22],ymm0[22],ymm23[23],ymm0[23] +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15],ymm5[24],ymm4[24],ymm5[25],ymm4[25],ymm5[26],ymm4[26],ymm5[27],ymm4[27],ymm5[28],ymm4[28],ymm5[29],ymm4[29],ymm5[30],ymm4[30],ymm5[31],ymm4[31] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm4 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm0[4,5,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[4,5,6,7,4,5,6,7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] ; AVX512BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] ; AVX512BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm4 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k3} ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm6, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm6, %ymm3 ; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm23 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm23[0],ymm1[0],ymm23[1],ymm1[1],ymm23[2],ymm1[2],ymm23[3],ymm1[3],ymm23[4],ymm1[4],ymm23[5],ymm1[5],ymm23[6],ymm1[6],ymm23[7],ymm1[7],ymm23[16],ymm1[16],ymm23[17],ymm1[17],ymm23[18],ymm1[18],ymm23[19],ymm1[19],ymm23[20],ymm1[20],ymm23[21],ymm1[21],ymm23[22],ymm1[22],ymm23[23],ymm1[23] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm23[0],ymm3[0],ymm23[1],ymm3[1],ymm23[2],ymm3[2],ymm23[3],ymm3[3],ymm23[4],ymm3[4],ymm23[5],ymm3[5],ymm23[6],ymm3[6],ymm23[7],ymm3[7],ymm23[16],ymm3[16],ymm23[17],ymm3[17],ymm23[18],ymm3[18],ymm23[19],ymm3[19],ymm23[20],ymm3[20],ymm23[21],ymm3[21],ymm23[22],ymm3[22],ymm23[23],ymm3[23] ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm30[8],ymm6[8],ymm30[9],ymm6[9],ymm30[10],ymm6[10],ymm30[11],ymm6[11],ymm30[12],ymm6[12],ymm30[13],ymm6[13],ymm30[14],ymm6[14],ymm30[15],ymm6[15],ymm30[24],ymm6[24],ymm30[25],ymm6[25],ymm30[26],ymm6[26],ymm30[27],ymm6[27],ymm30[28],ymm6[28],ymm30[29],ymm6[29],ymm30[30],ymm6[30],ymm30[31],ymm6[31] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm23, %ymm6 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 ; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6 ; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm27, %ymm2 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[16],ymm6[16],ymm2[17],ymm6[17],ymm2[18],ymm6[18],ymm2[19],ymm6[19],ymm2[20],ymm6[20],ymm2[21],ymm6[21],ymm2[22],ymm6[22],ymm2[23],ymm6[23] @@ -6699,20 +6689,20 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm8 {%k3} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k2} ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm13 
{%k3} @@ -6722,15 +6712,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm24 {%k2} ; AVX512BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm24 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm8 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm24 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm9 {%k3} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512BW-FCP-NEXT: vzeroupper @@ -6738,210 +6728,200 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512DQ-BW-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW: # %bb.0: -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm2, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm3, %ymm1 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm2, %ymm3 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15],ymm3[24],ymm2[24],ymm3[25],ymm2[25],ymm3[26],ymm2[26],ymm3[27],ymm2[27],ymm3[28],ymm2[28],ymm3[29],ymm2[29],ymm3[30],ymm2[30],ymm3[31],ymm2[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa 
32(%rcx), %ymm12 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm12, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm13 -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm13, %ymm8 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[1],ymm0[1],ymm8[2],ymm0[2],ymm8[3],ymm0[3],ymm8[4],ymm0[4],ymm8[5],ymm0[5],ymm8[6],ymm0[6],ymm8[7],ymm0[7],ymm8[16],ymm0[16],ymm8[17],ymm0[17],ymm8[18],ymm0[18],ymm8[19],ymm0[19],ymm8[20],ymm0[20],ymm8[21],ymm0[21],ymm8[22],ymm0[22],ymm8[23],ymm0[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm11 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm3, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm4 +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm4, %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15],ymm13[24],ymm12[24],ymm13[25],ymm12[25],ymm13[26],ymm12[26],ymm13[27],ymm12[27],ymm13[28],ymm12[28],ymm13[29],ymm12[29],ymm13[30],ymm12[30],ymm13[31],ymm12[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm9, %ymm1, %ymm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 -; AVX512DQ-BW-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm12, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm7[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] +; AVX512DQ-BW-NEXT: vpermq 
{{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm6[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm9 = zmm9[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm9[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208 +; AVX512DQ-BW-NEXT: kmovq %r10, %k3 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm0 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm9 +; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm9, %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm10, %ymm5 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm13[0],ymm5[1],ymm13[1],ymm5[2],ymm13[2],ymm5[3],ymm13[3],ymm5[4],ymm13[4],ymm5[5],ymm13[5],ymm5[6],ymm13[6],ymm5[7],ymm13[7],ymm5[16],ymm13[16],ymm5[17],ymm13[17],ymm5[18],ymm13[18],ymm5[19],ymm13[19],ymm5[20],ymm13[20],ymm5[21],ymm13[21],ymm5[22],ymm13[22],ymm5[23],ymm13[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] +; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm8, %ymm8 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm16, %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %xmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm18 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm14 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm18, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm17, %ymm13 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[16],ymm5[16],ymm13[17],ymm5[17],ymm13[18],ymm5[18],ymm13[19],ymm5[19],ymm13[20],ymm5[20],ymm13[21],ymm5[21],ymm13[22],ymm5[22],ymm13[23],ymm5[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm13 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512DQ-BW-NEXT: vpermw %ymm13, %ymm12, %ymm12 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm13 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm12 +; AVX512DQ-BW-NEXT: 
vinserti64x4 $1, %ymm12, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %xmm21 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm21, %xmm9 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm21[0],xmm18[0],xmm21[1],xmm18[1],xmm21[2],xmm18[2],xmm21[3],xmm18[3],xmm21[4],xmm18[4],xmm21[5],xmm18[5],xmm21[6],xmm18[6],xmm21[7],xmm18[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm26 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm26, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm23 -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm15 = xmm23[0],zero,xmm23[1],zero,xmm23[2],zero,xmm23[3],zero,xmm23[4],zero,xmm23[5],zero,xmm23[6],zero,xmm23[7],zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm23[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm19 = xmm19[0],zero,xmm19[1],zero,xmm19[2],zero,xmm19[3],zero,xmm19[4],zero,xmm19[5],zero,xmm19[6],zero,xmm19[7],zero -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm29, %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %xmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm24 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm24[2,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm28, %zmm29, %zmm19 -; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm20, %xmm28 -; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm22, %xmm14 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm14[8],xmm28[8],xmm14[9],xmm28[9],xmm14[10],xmm28[10],xmm14[11],xmm28[11],xmm14[12],xmm28[12],xmm14[13],xmm28[13],xmm14[14],xmm28[14],xmm14[15],xmm28[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm22[0],xmm20[0],xmm22[1],xmm20[1],xmm22[2],xmm20[2],xmm22[3],xmm20[3],xmm22[4],xmm20[4],xmm22[5],xmm20[5],xmm22[6],xmm20[6],xmm22[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm28, %zmm26, %zmm14 -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm25[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero -; AVX512DQ-BW-NEXT: vpermt2w %zmm28, %zmm29, %zmm26 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} 
xmm28 = xmm27[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm30 = xmm27[2,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm8, %xmm14 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %xmm25 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm14, %xmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm26 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm18 +; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm18, %xmm19 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm18[0],xmm14[0],xmm18[1],xmm14[1],xmm18[2],xmm14[2],xmm18[3],xmm14[3],xmm18[4],xmm14[4],xmm18[5],xmm14[5],xmm18[6],xmm14[6],xmm18[7],xmm14[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm19, %xmm19 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm15 = zmm15[0,0,0,1,4,4,4,5] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm6 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm15 +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm19 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero,xmm15[4],zero,xmm15[5],zero,xmm15[6],zero,xmm15[7],zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm24 = xmm15[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm24 = xmm24[0],zero,xmm24[1],zero,xmm24[2],zero,xmm24[3],zero,xmm24[4],zero,xmm24[5],zero,xmm24[6],zero,xmm24[7],zero +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm28, %zmm19 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm6 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %xmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm30 = xmm19[2,1,2,3] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm30 = xmm30[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm29, %zmm28 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm30 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm29 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm30 -; AVX512DQ-BW-NEXT: vpunpcklbw 
{{.*#+}} ymm13 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[16],ymm12[16],ymm13[17],ymm12[17],ymm13[18],ymm12[18],ymm13[19],ymm12[19],ymm13[20],ymm12[20],ymm13[21],ymm12[21],ymm13[22],ymm12[22],ymm13[23],ymm12[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm30[8],xmm11[8],xmm30[9],xmm11[9],xmm30[10],xmm11[10],xmm30[11],xmm11[11],xmm30[12],xmm11[12],xmm30[13],xmm11[13],xmm30[14],xmm11[14],xmm30[15],xmm11[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm31, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm13[0,1,2,3],zmm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm31 = zmm31[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm31 = zmm31[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: movl $-1840700270, %eax # imm = 0x92492492 -; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm31, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm7[0,1,2,3],zmm31[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm8 = zmm8[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm8, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[16],ymm1[16],ymm6[17],ymm1[17],ymm6[18],ymm1[18],ymm6[19],ymm1[19],ymm6[20],ymm1[20],ymm6[21],ymm1[21],ymm6[22],ymm1[22],ymm6[23],ymm1[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm8[8],ymm0[9],ymm8[9],ymm0[10],ymm8[10],ymm0[11],ymm8[11],ymm0[12],ymm8[12],ymm0[13],ymm8[13],ymm0[14],ymm8[14],ymm0[15],ymm8[15],ymm0[24],ymm8[24],ymm0[25],ymm8[25],ymm0[26],ymm8[26],ymm0[27],ymm8[27],ymm0[28],ymm8[28],ymm0[29],ymm8[29],ymm0[30],ymm8[30],ymm0[31],ymm8[31] -; AVX512DQ-BW-NEXT: vpermw %ymm6, %ymm2, %ymm6 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm2 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm16, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm17, %ymm4 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm6, %ymm1, %ymm6 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %ymm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm31, %zmm4 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm6 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[4],ymm8[4],ymm0[5],ymm8[5],ymm0[6],ymm8[6],ymm0[7],ymm8[7],ymm0[16],ymm8[16],ymm0[17],ymm8[17],ymm0[18],ymm8[18],ymm0[19],ymm8[19],ymm0[20],ymm8[20],ymm0[21],ymm8[21],ymm0[22],ymm8[22],ymm0[23],ymm8[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm22[8],xmm20[8],xmm22[9],xmm20[9],xmm22[10],xmm20[10],xmm22[11],xmm20[11],xmm22[12],xmm20[12],xmm22[13],xmm20[13],xmm22[14],xmm20[14],xmm22[15],xmm20[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-BW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-BW-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm21[8],xmm18[8],xmm21[9],xmm18[9],xmm21[10],xmm18[10],xmm21[11],xmm18[11],xmm21[12],xmm18[12],xmm21[13],xmm18[13],xmm21[14],xmm18[14],xmm21[15],xmm18[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm17 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm25[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm8 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm1, 
%ymm16, %ymm16 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm27[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm18, %zmm20 -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm18, %zmm13 -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm18, %zmm1 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm11, %xmm0 -; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm30, %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm30[0],xmm11[0],xmm30[1],xmm11[1],xmm30[2],xmm11[2],xmm30[3],xmm11[3],xmm30[4],xmm11[4],xmm30[5],xmm11[5],xmm30[6],xmm11[6],xmm30[7],xmm11[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm19, %zmm5 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm0 -; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm9, %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm26, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm28, %zmm14 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm29, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: movl $1227133513, %eax # imm = 0x49249249 -; AVX512DQ-BW-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm20, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm13, %zmm17 {%k2} -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm17 {%k3} -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm28, %zmm29 +; AVX512DQ-BW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k3 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm29, %zmm6 {%k3} +; AVX512DQ-BW-NEXT: 
vpshufb %xmm20, %xmm21, %xmm29 +; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm22, %xmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm29 = xmm20[8],xmm29[8],xmm20[9],xmm29[9],xmm20[10],xmm29[10],xmm20[11],xmm29[11],xmm20[12],xmm29[12],xmm20[13],xmm29[13],xmm20[14],xmm29[14],xmm20[15],xmm29[15] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm29, %zmm27, %zmm20 +; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm25, %xmm27 +; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm26, %xmm11 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm27[0],xmm11[1],xmm27[1],xmm11[2],xmm27[2],xmm11[3],xmm27[3],xmm11[4],xmm27[4],xmm11[5],xmm27[5],xmm11[6],xmm27[6],xmm11[7],xmm27[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm27, %xmm27 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,0,1,4,4,4,5] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm20 {%k2} +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm23[0],zero,xmm23[1],zero,xmm23[2],zero,xmm23[3],zero,xmm23[4],zero,xmm23[5],zero,xmm23[6],zero,xmm23[7],zero +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm27 = xmm23[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm27 = xmm27[0],zero,xmm27[1],zero,xmm27[2],zero,xmm27[3],zero,xmm27[4],zero,xmm27[5],zero,xmm27[6],zero,xmm27[7],zero +; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm24[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm27 = xmm24[2,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm27[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm20 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm17, %zmm16 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm10 +; 
AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm13, %ymm13 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16 +; AVX512DQ-BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQ-BW-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm12, %ymm12 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm24[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm21, %zmm16 +; AVX512DQ-BW-NEXT: movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k3 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm10 {%k3} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm14[8],xmm18[9],xmm14[9],xmm18[10],xmm14[10],xmm18[11],xmm14[11],xmm18[12],xmm14[12],xmm18[13],xmm14[13],xmm18[14],xmm14[14],xmm18[15],xmm14[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm17, %zmm4 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm19[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 320(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: 
vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15],ymm9[24],ymm8[24],ymm9[25],ymm8[25],ymm9[26],ymm8[26],ymm9[27],ymm8[27],ymm9[28],ymm8[28],ymm9[29],ymm8[29],ymm9[30],ymm8[30],ymm9[31],ymm8[31] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 @@ -6952,25 +6932,25 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm24 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm24, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm13, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm13, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm19, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm19, %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm22, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm16, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm17 = [8,9,0,0,0,5,6,7] @@ -7020,57 +7000,57 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm31, %ymm29 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm0, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm26, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm26, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512DQ-BW-FCP-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm28 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm28, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm0, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm19, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} 
xmm19 = xmm20[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm1, %ymm26 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm1, %ymm1 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm23[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm0, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm23[0],ymm0[0],ymm23[1],ymm0[1],ymm23[2],ymm0[2],ymm23[3],ymm0[3],ymm23[4],ymm0[4],ymm23[5],ymm0[5],ymm23[6],ymm0[6],ymm23[7],ymm0[7],ymm23[16],ymm0[16],ymm23[17],ymm0[17],ymm23[18],ymm0[18],ymm23[19],ymm0[19],ymm23[20],ymm0[20],ymm23[21],ymm0[21],ymm23[22],ymm0[22],ymm23[23],ymm0[23] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15],ymm5[24],ymm4[24],ymm5[25],ymm4[25],ymm5[26],ymm4[26],ymm5[27],ymm4[27],ymm5[28],ymm4[28],ymm5[29],ymm4[29],ymm5[30],ymm4[30],ymm5[31],ymm4[31] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm4, %ymm5, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512DQ-BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm0[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[4,5,6,7,4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] ; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] ; 
AVX512DQ-BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm4 {%k3} ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm6, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm6, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm23[0],ymm1[0],ymm23[1],ymm1[1],ymm23[2],ymm1[2],ymm23[3],ymm1[3],ymm23[4],ymm1[4],ymm23[5],ymm1[5],ymm23[6],ymm1[6],ymm23[7],ymm1[7],ymm23[16],ymm1[16],ymm23[17],ymm1[17],ymm23[18],ymm1[18],ymm23[19],ymm1[19],ymm23[20],ymm1[20],ymm23[21],ymm1[21],ymm23[22],ymm1[22],ymm23[23],ymm1[23] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm23[0],ymm3[0],ymm23[1],ymm3[1],ymm23[2],ymm3[2],ymm23[3],ymm3[3],ymm23[4],ymm3[4],ymm23[5],ymm3[5],ymm23[6],ymm3[6],ymm23[7],ymm3[7],ymm23[16],ymm3[16],ymm23[17],ymm3[17],ymm23[18],ymm3[18],ymm23[19],ymm3[19],ymm23[20],ymm3[20],ymm23[21],ymm3[21],ymm23[22],ymm3[22],ymm23[23],ymm3[23] ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm30[8],ymm6[8],ymm30[9],ymm6[9],ymm30[10],ymm6[10],ymm30[11],ymm6[11],ymm30[12],ymm6[12],ymm30[13],ymm6[13],ymm30[14],ymm6[14],ymm30[15],ymm6[15],ymm30[24],ymm6[24],ymm30[25],ymm6[25],ymm30[26],ymm6[26],ymm30[27],ymm6[27],ymm30[28],ymm6[28],ymm30[29],ymm6[29],ymm30[30],ymm6[30],ymm30[31],ymm6[31] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm23, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm27, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[16],ymm6[16],ymm2[17],ymm6[17],ymm2[18],ymm6[18],ymm2[19],ymm6[19],ymm2[20],ymm6[20],ymm2[21],ymm6[21],ymm2[22],ymm6[22],ymm2[23],ymm6[23] @@ -7078,20 +7058,20 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm5, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63,u] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm8 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm13 {%k3} @@ -7101,15 +7081,15 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm24 {%k2} ; AVX512DQ-BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm24 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm8 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm24 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm9 {%k3} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 02ec9fc66feab..aadb8b7635636 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -9934,297 +9934,297 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-LABEL: store_i8_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] -; AVX512BW-NEXT: movl $338170920, %r10d # imm = 0x14281428 -; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: 
vpshufb %ymm4, %ymm2, %ymm1 {%k2} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512BW-NEXT: vpshufb %ymm10, %ymm14, %ymm5 -; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512BW-NEXT: vpshufb %ymm5, %ymm15, %ymm3 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm17 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm11, %ymm17, %ymm6 -; AVX512BW-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512BW-NEXT: vpshufb %ymm6, %ymm17, %ymm7 -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX512BW-NEXT: movl $676341840, %r10d # imm = 0x28502850 -; AVX512BW-NEXT: kmovd %r10d, %k3 -; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k3} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm7[2,3,2,3] -; AVX512BW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 -; AVX512BW-NEXT: kmovq %r10, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512BW-NEXT: movl $338170920, %r11d # imm = 0x14281428 +; AVX512BW-NEXT: kmovd %r11d, %k2 +; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 {%k2} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512BW-NEXT: vpshufb %ymm8, %ymm2, %ymm4 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512BW-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512BW-NEXT: vpshufb %ymm4, %ymm14, %ymm5 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512BW-NEXT: vpshufb %ymm10, %ymm15, %ymm6 +; AVX512BW-NEXT: vpor %ymm5, %ymm6, %ymm6 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512BW-NEXT: 
vpshufb %ymm5, %ymm15, %ymm7 +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,2,3,3,4,6,7,7] +; AVX512BW-NEXT: movl $676341840, %r11d # imm = 0x28502850 +; AVX512BW-NEXT: kmovd %r11d, %k3 +; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm7 {%k3} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[2,3,2,3],zmm7[2,3,2,3] +; AVX512BW-NEXT: movabsq $1742999440035548184, %r11 # imm = 0x183060C183060C18 +; AVX512BW-NEXT: kmovq %r11, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 32(%r9), %ymm16 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm16, %ymm3 -; AVX512BW-NEXT: vmovdqa64 32(%r8), %ymm18 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512BW-NEXT: vpshufb %ymm8, %ymm18, %ymm12 -; AVX512BW-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512BW-NEXT: vpshufb %ymm12, %ymm18, %ymm19 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm13, %ymm16, %ymm20 -; AVX512BW-NEXT: vporq %ymm19, %ymm20, %ymm19 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm19[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 -; AVX512BW-NEXT: kmovq %r10, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512BW-NEXT: vpermw %zmm3, %zmm19, %zmm19 -; AVX512BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512BW-NEXT: kmovq %rax, %k4 -; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm1 {%k4} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-NEXT: vpshufb %ymm20, %ymm17, %ymm22 -; AVX512BW-NEXT: vporq %ymm21, %ymm22, %ymm21 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128] +; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm16, %ymm11 +; AVX512BW-NEXT: vmovdqa64 32(%r8), %ymm19 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm12 +; AVX512BW-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512BW-NEXT: vpshufb %ymm17, %ymm19, %ymm12 +; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512BW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %ymm18, %ymm16, %ymm20 +; AVX512BW-NEXT: vporq %ymm12, %ymm20, %ymm12 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[2,3,2,3],zmm11[2,3,2,3] +; AVX512BW-NEXT: movabsq $6971997760142192736, %r11 # imm = 0x60C183060C183060 +; AVX512BW-NEXT: kmovq %r11, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm1 {%k1} +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512BW-NEXT: vpermw %zmm0, %zmm11, %zmm11 +; AVX512BW-NEXT: movabsq $-9150747060186627967, %r11 # imm = 0x8102040810204081 +; AVX512BW-NEXT: kmovq %r11, %k4 +; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm1 {%k4} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-NEXT: vpshufb %ymm11, %ymm14, %ymm20 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %ymm12, %ymm15, %ymm21 +; AVX512BW-NEXT: vporq %ymm20, %ymm21, %ymm20 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512BW-NEXT: vpshufb %ymm22, %ymm15, %ymm15 -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512BW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %ymm25, %ymm17, %ymm17 -; AVX512BW-NEXT: vporq %ymm15, %ymm17, %ymm15 +; AVX512BW-NEXT: vpshufb %ymm22, %ymm14, %ymm14 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512BW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %ymm23, %ymm15, %ymm15 +; AVX512BW-NEXT: vpor %ymm14, %ymm15, %ymm14 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm14 +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm25 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX512BW-NEXT: vpshufb %ymm25, %ymm2, %ymm15 {%k3} ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15 -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm17 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,0,1,1,4,4,5,5] -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm21 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] -; AVX512BW-NEXT: vpshufb %ymm21, %ymm2, %ymm17 {%k3} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-NEXT: vpshufb %ymm23, %ymm14, %ymm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; 
AVX512BW-NEXT: vpshufb %ymm24, %ymm2, %ymm2 -; AVX512BW-NEXT: vpor %ymm2, %ymm14, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 -; AVX512BW-NEXT: movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830 -; AVX512BW-NEXT: kmovq %rax, %k4 -; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm2 {%k4} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-NEXT: vpshufb %ymm14, %ymm16, %ymm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-NEXT: vpshufb %ymm15, %ymm18, %ymm26 -; AVX512BW-NEXT: vporq %ymm17, %ymm26, %ymm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-NEXT: vpshufb %ymm20, %ymm13, %ymm13 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-NEXT: vpshufb %ymm21, %ymm2, %ymm2 +; AVX512BW-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 +; AVX512BW-NEXT: movabsq $3485998880071096368, %r11 # imm = 0x3060C183060C1830 +; AVX512BW-NEXT: kmovq %r11, %k4 +; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm2 {%k4} +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-NEXT: vpshufb %ymm13, %ymm16, %ymm15 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-NEXT: vpshufb %ymm14, %ymm19, %ymm24 +; AVX512BW-NEXT: vporq %ymm15, %ymm24, %ymm15 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512BW-NEXT: vpshufb %ymm26, %ymm18, %ymm18 +; AVX512BW-NEXT: vpshufb %ymm26, %ymm19, %ymm19 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX512BW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpshufb %ymm28, %ymm16, %ymm16 -; AVX512BW-NEXT: vporq %ymm18, %ymm16, %ymm16 +; AVX512BW-NEXT: vporq %ymm19, %ymm16, %ymm16 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512BW-NEXT: vmovdqa64 32(%r10), %ymm16 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512BW-NEXT: vpermw %ymm3, %ymm29, %ymm17 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm18 = ymm3[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512BW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm17, %zmm16 {%k5} -; AVX512BW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm2 {%k5} -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512BW-NEXT: vpshufb %ymm5, 
%ymm16, %ymm5 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-NEXT: vpshufb %ymm11, %ymm17, %ymm11 -; AVX512BW-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX512BW-NEXT: vpshufb %ymm22, %ymm16, %ymm11 -; AVX512BW-NEXT: vpshufb %ymm25, %ymm17, %ymm18 -; AVX512BW-NEXT: vporq %ymm11, %ymm18, %ymm11 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm11[2,3,2,3],zmm5[2,3,2,3] -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm25 +; AVX512BW-NEXT: vpermw %ymm16, %ymm29, %ymm19 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm16, %zmm16 +; AVX512BW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 +; AVX512BW-NEXT: kmovq %r10, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm15 {%k5} +; AVX512BW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 +; AVX512BW-NEXT: kmovq %r10, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm2 {%k5} +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512BW-NEXT: vpshufb %ymm4, %ymm15, %ymm4 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512BW-NEXT: vpshufb %ymm10, %ymm16, %ymm10 +; AVX512BW-NEXT: vpor %ymm4, %ymm10, %ymm4 +; AVX512BW-NEXT: vpshufb %ymm22, %ymm15, %ymm10 +; AVX512BW-NEXT: vpshufb %ymm23, %ymm16, %ymm19 +; AVX512BW-NEXT: vporq %ymm10, %ymm19, %ymm10 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm10[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm24 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,1,1,4,4,5,5] -; AVX512BW-NEXT: vpshufb %ymm21, %ymm25, %ymm11 {%k3} -; AVX512BW-NEXT: vpshufb %ymm9, %ymm25, %ymm9 -; AVX512BW-NEXT: vpshufb %ymm10, %ymm27, %ymm10 -; AVX512BW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm9[2,3,2,3] -; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 (%r8), %ymm18 -; AVX512BW-NEXT: vpshufb %ymm12, %ymm18, %ymm9 -; AVX512BW-NEXT: vmovdqa64 (%r9), %ymm21 -; AVX512BW-NEXT: vpshufb %ymm13, %ymm21, %ymm10 +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vpshufb %ymm25, %ymm24, %ymm10 {%k3} +; AVX512BW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 +; AVX512BW-NEXT: vpshufb %ymm9, %ymm27, %ymm9 +; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[2,3,2,3],zmm8[2,3,2,3] +; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512BW-NEXT: vpshufb %ymm17, %ymm19, %ymm8 +; AVX512BW-NEXT: vmovdqa64 (%r9), %ymm22 +; AVX512BW-NEXT: vpshufb %ymm18, %ymm22, %ymm9 +; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vpshufb %ymm26, %ymm19, %ymm9 +; AVX512BW-NEXT: vpshufb %ymm28, %ymm22, %ymm10 ; AVX512BW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512BW-NEXT: vpshufb %ymm26, %ymm18, %ymm10 -; AVX512BW-NEXT: vpshufb %ymm28, %ymm21, %ymm11 -; AVX512BW-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] -; AVX512BW-NEXT: vpermw %zmm0, %zmm29, %zmm10 -; AVX512BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k5} -; AVX512BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: 
vmovdqu8 %zmm9, %zmm5 {%k5} -; AVX512BW-NEXT: vpshufb %ymm19, %ymm16, %ymm9 -; AVX512BW-NEXT: vpshufb %ymm20, %ymm17, %ymm10 -; AVX512BW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm19 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm20 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-NEXT: vpshufb %xmm22, %xmm12, %xmm12 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm26 -; AVX512BW-NEXT: vpshufb %ymm23, %ymm27, %ymm9 -; AVX512BW-NEXT: vpshufb %ymm24, %ymm25, %ymm12 -; AVX512BW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-NEXT: vpshufb %xmm23, %xmm24, %xmm24 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm24, %zmm9 -; AVX512BW-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm26, %zmm9 {%k5} -; AVX512BW-NEXT: vpshufb %ymm14, %ymm21, %ymm14 -; AVX512BW-NEXT: vpshufb %ymm15, %ymm18, %ymm15 -; AVX512BW-NEXT: vporq %ymm14, %ymm15, %ymm24 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm14 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-NEXT: vpshufb %xmm26, %xmm28, %xmm28 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] +; AVX512BW-NEXT: vpermw %zmm0, %zmm29, %zmm9 +; AVX512BW-NEXT: movabsq $1161999626690365456, %r10 # imm = 0x1020408102040810 +; AVX512BW-NEXT: kmovq %r10, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm8 {%k5} +; AVX512BW-NEXT: movabsq $2033499346708139548, %r10 # imm = 0x1C3870E1C3870E1C +; AVX512BW-NEXT: kmovq %r10, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm4 {%k5} +; AVX512BW-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX512BW-NEXT: vpshufb %ymm12, %ymm16, %ymm9 +; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm17 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm18 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-NEXT: vpshufb %xmm23, %xmm11, %xmm11 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm26 +; AVX512BW-NEXT: vpshufb %ymm20, %ymm27, %ymm8 +; AVX512BW-NEXT: vpshufb %ymm21, %ymm24, %ymm11 +; AVX512BW-NEXT: vpor %ymm8, %ymm11, %ymm8 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm11 +; 
AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm20 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm21 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-NEXT: vpshufb %xmm25, %xmm28, %xmm28 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm28, %zmm24 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512BW-NEXT: vpermw %zmm0, %zmm28, %zmm28 -; AVX512BW-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm28, %zmm24 {%k5} -; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512BW-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 -; AVX512BW-NEXT: kmovq %rax, %k5 -; AVX512BW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k5} -; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm29 -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm24 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm24[2,2,3,3,6,6,7,7] -; AVX512BW-NEXT: vpshufb %ymm4, %ymm25, %ymm27 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm28, %zmm8 +; AVX512BW-NEXT: movabsq $435749860008887046, %rcx # imm = 0x60C183060C18306 +; AVX512BW-NEXT: kmovq %rcx, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm26, %zmm8 {%k5} +; AVX512BW-NEXT: vpshufb %ymm13, %ymm22, %ymm13 +; AVX512BW-NEXT: vpshufb %ymm14, %ymm19, %ymm14 +; AVX512BW-NEXT: vporq %ymm13, %ymm14, %ymm30 +; AVX512BW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm26 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm28 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm31 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-NEXT: vpshufb %xmm29, %xmm31, %xmm31 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm30 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512BW-NEXT: vpermw %zmm0, %zmm31, %zmm31 +; AVX512BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 +; AVX512BW-NEXT: kmovq %rcx, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm31, %zmm30 {%k5} +; AVX512BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 +; AVX512BW-NEXT: kmovq %rcx, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm30, %zmm8 {%k5} +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm27 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[2,2,3,3,6,6,7,7] +; AVX512BW-NEXT: vpshufb %ymm3, %ymm24, %ymm27 {%k2} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-NEXT: vpshufb %xmm24, %xmm4, %xmm4 -; AVX512BW-NEXT: 
vshufi64x2 {{.*#+}} zmm27 = zmm27[2,3,2,3],zmm4[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm25 -; AVX512BW-NEXT: vpshufb %ymm6, %ymm17, %ymm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm20[0],xmm19[1],xmm20[1],xmm19[2],xmm20[2],xmm19[3],xmm20[3],xmm19[4],xmm20[4],xmm19[5],xmm20[5],xmm19[6],xmm20[6],xmm19[7],xmm20[7] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7] -; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-NEXT: vpshufb %xmm16, %xmm6, %xmm6 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[2,3,2,3],zmm6[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512BW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k4} -; AVX512BW-NEXT: vpshufb %ymm7, %ymm21, %ymm6 -; AVX512BW-NEXT: vpshufb %ymm8, %ymm18, %ymm7 -; AVX512BW-NEXT: vpor %ymm6, %ymm7, %ymm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm17[0],xmm25[0],xmm17[1],xmm25[1],xmm17[2],xmm25[2],xmm17[3],xmm25[3],xmm17[4],xmm25[4],xmm17[5],xmm25[5],xmm17[6],xmm25[6],xmm17[7],xmm25[7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm6, %xmm8, %xmm8 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[2,3,2,3],zmm8[0,1,0,1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-NEXT: vpermw %zmm0, %zmm8, %zmm8 -; AVX512BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512BW-NEXT: kmovq %rax, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k2} -; AVX512BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E -; AVX512BW-NEXT: kmovq %rax, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k2} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512BW-NEXT: vpshufb %xmm7, %xmm20, %xmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512BW-NEXT: vpshufb %xmm18, %xmm19, %xmm21 -; AVX512BW-NEXT: vporq %xmm8, %xmm21, %xmm8 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] -; AVX512BW-NEXT: vpshufb %xmm22, %xmm19, %xmm19 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512BW-NEXT: vpshufb %xmm19, %xmm29, %xmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512BW-NEXT: vpshufb %xmm21, %xmm28, %xmm22 -; AVX512BW-NEXT: vporq %xmm20, %xmm22, %xmm20 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-NEXT: vpshufb %xmm23, %xmm22, %xmm22 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,0,1],zmm22[0,1,0,1] -; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512BW-NEXT: vpshufb %xmm8, %xmm25, %xmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512BW-NEXT: vpshufb %xmm23, %xmm17, %xmm27 -; AVX512BW-NEXT: vporq %xmm22, %xmm27, %xmm22 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm17[8],xmm25[8],xmm17[9],xmm25[9],xmm17[10],xmm25[10],xmm17[11],xmm25[11],xmm17[12],xmm25[12],xmm17[13],xmm25[13],xmm17[14],xmm25[14],xmm17[15],xmm25[15] -; AVX512BW-NEXT: vpshufb %xmm26, %xmm17, %xmm17 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm22[0,1,0,1],zmm17[0,1,0,1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-NEXT: vpermw %zmm3, %zmm22, %zmm3 -; AVX512BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1} -; AVX512BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm17, %zmm20 {%k1} -; AVX512BW-NEXT: vpshufb %xmm7, %xmm11, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm18, %xmm10, %xmm7 -; AVX512BW-NEXT: vpor %xmm3, %xmm7, %xmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512BW-NEXT: vpshufb %xmm16, %xmm7, %xmm7 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,0,1],zmm3[0,1,0,1] -; AVX512BW-NEXT: vpshufb %xmm19, %xmm13, %xmm7 -; AVX512BW-NEXT: vpshufb %xmm21, %xmm12, %xmm10 -; AVX512BW-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512BW-NEXT: vpshufb %xmm24, %xmm3, %xmm3 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm27[2,3,2,3],zmm3[0,1,0,1] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3],xmm17[4],xmm18[4],xmm17[5],xmm18[5],xmm17[6],xmm18[6],xmm17[7],xmm18[7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm27 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-NEXT: vpshufb %xmm27, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufb %ymm5, %ymm16, %ymm5 +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,3,3,4,6,7,7] +; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm5 {%k3} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[2,3,2,3],zmm3[0,1,0,1] +; AVX512BW-NEXT: vmovdqu8 %zmm30, %zmm3 {%k4} +; AVX512BW-NEXT: vpshufb %ymm6, %ymm22, %ymm5 +; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm6 +; AVX512BW-NEXT: vpor %ymm5, %ymm6, %ymm6 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm28[0],xmm26[0],xmm28[1],xmm26[1],xmm28[2],xmm26[2],xmm28[3],xmm26[3],xmm28[4],xmm26[4],xmm28[5],xmm26[5],xmm28[6],xmm26[6],xmm28[7],xmm26[7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[2,3,2,3],zmm7[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-NEXT: vpermw %zmm0, %zmm7, %zmm7 +; AVX512BW-NEXT: movabsq $580999813345182728, %rcx # imm = 0x810204081020408 +; AVX512BW-NEXT: kmovq %rcx, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm6 {%k2} +; AVX512BW-NEXT: movabsq $1016749673354069774, %rcx # imm = 0xE1C3870E1C3870E +; AVX512BW-NEXT: kmovq %rcx, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k2} +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512BW-NEXT: vpshufb %xmm6, %xmm18, %xmm7 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = 
[u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 +; AVX512BW-NEXT: vporq %xmm7, %xmm16, %xmm7 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-NEXT: vpshufb %xmm23, %xmm16, %xmm16 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,0,1],zmm16[0,1,0,1] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512BW-NEXT: vpshufb %xmm16, %xmm21, %xmm17 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512BW-NEXT: vpshufb %xmm18, %xmm20, %xmm19 +; AVX512BW-NEXT: vporq %xmm17, %xmm19, %xmm17 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; AVX512BW-NEXT: vpshufb %xmm25, %xmm19, %xmm19 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,0,1],zmm19[0,1,0,1] +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm26, %xmm19 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512BW-NEXT: vpshufb %xmm20, %xmm28, %xmm21 +; AVX512BW-NEXT: vporq %xmm19, %xmm21, %xmm19 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm28[8],xmm26[8],xmm28[9],xmm26[9],xmm28[10],xmm26[10],xmm28[11],xmm26[11],xmm28[12],xmm26[12],xmm28[13],xmm26[13],xmm28[14],xmm26[14],xmm28[15],xmm26[15] +; AVX512BW-NEXT: vpshufb %xmm29, %xmm21, %xmm21 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,0,1],zmm21[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512BW-NEXT: vpermw %zmm0, %zmm21, %zmm21 +; AVX512BW-NEXT: movabsq $290499906672591364, %rcx # imm = 0x408102040810204 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm21, %zmm19 {%k1} +; AVX512BW-NEXT: movabsq $-8714997200177740921, %rcx # imm = 0x870E1C3870E1C387 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm17 {%k1} +; AVX512BW-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX512BW-NEXT: vpshufb %xmm15, %xmm9, %xmm15 +; AVX512BW-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512BW-NEXT: vpshufb %xmm27, %xmm9, %xmm9 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,0,1],zmm6[0,1,0,1] +; AVX512BW-NEXT: vpshufb %xmm16, %xmm12, %xmm9 +; AVX512BW-NEXT: vpshufb %xmm18, %xmm11, %xmm10 +; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX512BW-NEXT: vpshufb %xmm24, %xmm10, %xmm10 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1] -; AVX512BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1} -; AVX512BW-NEXT: vpshufb %xmm8, %xmm14, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm23, %xmm15, %xmm8 -; AVX512BW-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512BW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm6[0,1,0,1],zmm3[0,1,0,1] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,0,1],zmm9[0,1,0,1] +; AVX512BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1} +; AVX512BW-NEXT: vpshufb %xmm7, %xmm13, %xmm6 +; AVX512BW-NEXT: vpshufb %xmm20, %xmm14, %xmm7 +; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm6[0,1,0,1] ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] ; AVX512BW-NEXT: vpermw %zmm0, %zmm6, %zmm0 -; AVX512BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1} -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm5 {%k1} +; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512BW-NEXT: kmovq %rcx, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm9 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -10232,222 +10232,222 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-LABEL: store_i8_stride7_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128] ; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm2 -; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm5 +; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm5 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] ; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm5 -; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm17 +; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm6 +; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm16 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm17, %ymm5 -; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm16, %ymm6 -; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[2,3,2,3],zmm0[2,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm6 +; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-FCP-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm16, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm11 +; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[2,3,2,3],zmm1[2,3,2,3] ; AVX512BW-FCP-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 ; AVX512BW-FCP-NEXT: kmovq %r10, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 
32(%r9), %ymm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm17 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128] ; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm18, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm17, %ymm11 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm22 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm12 -; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm22, %ymm14 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; AVX512BW-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm18, %ymm15 -; AVX512BW-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm14[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm22, %ymm12 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512BW-FCP-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm14 +; AVX512BW-FCP-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[2,3,2,3],zmm11[2,3,2,3] ; AVX512BW-FCP-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 ; AVX512BW-FCP-NEXT: kmovq %r10, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm11, %zmm11 ; AVX512BW-FCP-NEXT: movabsq $-9150747060186627967, %r10 # imm = 0x8102040810204081 ; AVX512BW-FCP-NEXT: kmovq %r10, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm1, %ymm23 -; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm23, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm1 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] 
+; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm14 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm23 +; AVX512BW-FCP-NEXT: vporq %ymm14, %ymm23, %ymm14 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm11 +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm13, %ymm13 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] ; AVX512BW-FCP-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm11, %ymm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm16, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm17, %ymm26 -; AVX512BW-FCP-NEXT: vporq %ymm1, %ymm26, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm16, %ymm26 +; AVX512BW-FCP-NEXT: vporq %ymm2, %ymm26, %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm15, %ymm15 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] ; AVX512BW-FCP-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm16 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm15 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 ; AVX512BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 ; AVX512BW-FCP-NEXT: kmovq %r10, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm18, %ymm23 -; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm22, %ymm28 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm17, %ymm23 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm22, %ymm28 ; AVX512BW-FCP-NEXT: vporq %ymm23, %ymm28, %ymm23 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] ; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm22, %ymm22 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX512BW-FCP-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm18, %ymm18 -; AVX512BW-FCP-NEXT: vporq %ymm22, %ymm18, %ymm18 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm18 +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm17, %ymm17 +; AVX512BW-FCP-NEXT: vporq %ymm22, %ymm17, %ymm17 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm23, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm22 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm30, %ymm22 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm2[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm22 -; AVX512BW-FCP-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 -; AVX512BW-FCP-NEXT: kmovq %r10, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm18 {%k3} -; AVX512BW-FCP-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 -; AVX512BW-FCP-NEXT: kmovq %r10, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm1 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm19 +; AVX512BW-FCP-NEXT: vpermw %ymm22, %ymm30, %ymm23 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm22 = ymm22[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm22 +; AVX512BW-FCP-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 +; AVX512BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm17 {%k3} +; AVX512BW-FCP-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 +; AVX512BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm2 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm17 +; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm22, %ymm20 -; AVX512BW-FCP-NEXT: vporq %ymm19, %ymm20, %ymm19 -; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm20 -; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm18, %ymm23 -; AVX512BW-FCP-NEXT: vporq 
%ymm20, %ymm23, %ymm20 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[2,3,2,3],zmm19[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm19, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm22, %ymm19 +; AVX512BW-FCP-NEXT: vporq %ymm18, %ymm19, %ymm18 +; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm19 +; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm23 +; AVX512BW-FCP-NEXT: vporq %ymm19, %ymm23, %ymm19 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm18[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm18, %ymm7 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm23 ; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm10 -; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm19, %ymm24 -; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm23, %ymm25 -; AVX512BW-FCP-NEXT: vporq %ymm24, %ymm25, %ymm24 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[2,3,2,3],zmm8[2,3,2,3] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %ymm20 -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm20, %ymm12 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm24 -; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm24, %ymm13 -; AVX512BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm20, %ymm13 -; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm24, %ymm25 -; AVX512BW-FCP-NEXT: vporq %ymm13, %ymm25, %ymm13 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[2,3,2,3],zmm12[2,3,2,3] -; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm30, %zmm13 +; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm18, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm23, %ymm24 +; AVX512BW-FCP-NEXT: vporq %ymm10, %ymm24, %ymm10 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[2,3,2,3],zmm7[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm19, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm20 +; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm20, %ymm21 +; AVX512BW-FCP-NEXT: vporq %ymm10, %ymm21, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm19, %ymm21 +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm24 +; AVX512BW-FCP-NEXT: vporq %ymm21, %ymm24, %ymm21 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm21[2,3,2,3],zmm10[2,3,2,3] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm30, %zmm21 ; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm12 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm10 {%k3} ; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k3} -; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm19, %ymm12 -; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm23, %ymm13 -; AVX512BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm14 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm7 {%k3} +; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm18, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm23, %ymm11 +; AVX512BW-FCP-NEXT: 
vpor %ymm10, %ymm11, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm25, %xmm25 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm26 -; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm22, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm18, %ymm14 -; AVX512BW-FCP-NEXT: vpor %ymm11, %ymm14, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm27 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm24, %xmm24 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm24, %zmm25 +; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm22, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm17, %ymm13 +; AVX512BW-FCP-NEXT: vpor %ymm10, %ymm13, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm26, %xmm26 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm26, %zmm10 ; AVX512BW-FCP-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm26, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm20, %ymm17 -; AVX512BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm27 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %xmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm17 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm28, %xmm28 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm27 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm28, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm25, %zmm10 {%k3} +; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm20, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm19, %ymm16 +; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm26 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm16 +; 
AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm27 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm26 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm27, %zmm27 ; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm28, %zmm27 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm28 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm26 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm27 ; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 ; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm11 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm27 -; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm26, %zmm10 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm26 +; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm22, %ymm8 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm22 -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm18, %ymm9 -; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm22[0],xmm27[1],xmm22[1],xmm27[2],xmm22[2],xmm27[3],xmm22[3],xmm27[4],xmm22[4],xmm27[5],xmm22[5],xmm27[6],xmm22[6],xmm27[7],xmm22[7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm9[2,3,2,3],zmm18[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm29 +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm17, %ymm9 +; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm9[2,3,2,3],zmm17[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm28 ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm23, %ymm3 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm23 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm18, %ymm4 ; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm27[0],xmm28[0],xmm27[1],xmm28[1],xmm27[2],xmm28[2],xmm27[3],xmm28[3],xmm27[4],xmm28[4],xmm27[5],xmm28[5],xmm27[6],xmm28[6],xmm27[7],xmm28[7] ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm4[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm19 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm24, %ymm4 -; AVX512BW-FCP-NEXT: vpshufb %ymm6, 
%ymm20, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm20, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm5 ; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm23[0],xmm19[1],xmm23[1],xmm19[2],xmm23[2],xmm19[3],xmm23[3],xmm19[4],xmm23[4],xmm19[5],xmm23[5],xmm19[6],xmm23[6],xmm19[7],xmm23[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7] ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm6[0,1,0,1] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm6, %zmm6 ; AVX512BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k2} @@ -10455,372 +10455,372 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm29, %xmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm28, %xmm20 -; AVX512BW-FCP-NEXT: vporq %xmm5, %xmm20, %xmm5 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm20, %xmm20 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm5[0,1,0,1],zmm20[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm22, %xmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm27, %xmm28 -; AVX512BW-FCP-NEXT: vporq %xmm5, %xmm28, %xmm5 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm27[8],xmm22[9],xmm27[9],xmm22[10],xmm27[10],xmm22[11],xmm27[11],xmm22[12],xmm27[12],xmm22[13],xmm27[13],xmm22[14],xmm27[14],xmm22[15],xmm27[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm22, %xmm22 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm28, %xmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm27, %xmm19 +; AVX512BW-FCP-NEXT: vporq %xmm5, %xmm19, %xmm5 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm19, %xmm19 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm5[0,1,0,1],zmm19[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; 
AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm26, %xmm27 +; AVX512BW-FCP-NEXT: vporq %xmm5, %xmm27, %xmm5 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm26[8],xmm22[9],xmm26[9],xmm22[10],xmm26[10],xmm22[11],xmm26[11],xmm22[12],xmm26[12],xmm22[13],xmm26[13],xmm22[14],xmm26[14],xmm22[15],xmm26[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm22, %xmm22 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm22[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm19, %xmm27 -; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm27, %xmm22 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm23[8],xmm19[9],xmm23[9],xmm19[10],xmm23[10],xmm19[11],xmm23[11],xmm19[12],xmm23[12],xmm19[13],xmm23[13],xmm19[14],xmm23[14],xmm19[15],xmm23[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm19, %xmm19 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm22[0,1,0,1],zmm19[0,1,0,1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm22, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm23, %xmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm26 +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm26, %xmm22 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm23[8],xmm18[9],xmm23[9],xmm18[10],xmm23[10],xmm18[11],xmm23[11],xmm18[12],xmm23[12],xmm18[13],xmm23[13],xmm18[14],xmm23[14],xmm18[15],xmm23[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm22[0,1,0,1],zmm18[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm22, %zmm22 ; AVX512BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm12, %xmm6 -; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,0,1],zmm2[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm6 -; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm14, %xmm9 -; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512BW-FCP-NEXT: vshufi64x2 
{{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm11, %xmm17 +; AVX512BW-FCP-NEXT: vporq %xmm6, %xmm17, %xmm6 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,0,1],zmm6[0,1,0,1] +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm14, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm11 +; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm9[0,1,0,1] ; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm16, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm17, %xmm7 -; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm15, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm16, %xmm9 +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,0,1],zmm6[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm6, %zmm0 ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%rax) +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i8_stride7_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] -; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] -; AVX512DQ-BW-NEXT: movl $338170920, %r10d # imm = 0x14281428 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm2, %ymm1 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm2, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm14, %ymm5 -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm15, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm17 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm17, %ymm6 -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm17, %ymm7 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX512DQ-BW-NEXT: movl $676341840, %r10d # imm = 0x28502850 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k3} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-BW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 -; AVX512DQ-BW-NEXT: kmovq %r10, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512DQ-BW-NEXT: movl $338170920, %r11d # imm = 0x14281428 +; AVX512DQ-BW-NEXT: kmovd %r11d, %k2 +; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 {%k2} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1] +; 
AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm2, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm13, %ymm5 +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm14, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm15, %ymm6 +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm6, %ymm6 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm15, %ymm7 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,2,3,3,4,6,7,7] +; AVX512DQ-BW-NEXT: movl $676341840, %r11d # imm = 0x28502850 +; AVX512DQ-BW-NEXT: kmovd %r11d, %k3 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm7 {%k3} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-BW-NEXT: movabsq $1742999440035548184, %r11 # imm = 0x183060C183060C18 +; AVX512DQ-BW-NEXT: kmovq %r11, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %ymm16 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128] -; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm16, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %ymm18 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm18, %ymm12 -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm12, %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm18, %ymm19 -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; AVX512DQ-BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm16, %ymm20 -; AVX512DQ-BW-NEXT: vporq %ymm19, %ymm20, %ymm19 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm19[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-BW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 -; AVX512DQ-BW-NEXT: kmovq %r10, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512DQ-BW-NEXT: vpermw %zmm3, %zmm19, %zmm19 -; AVX512DQ-BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512DQ-BW-NEXT: kmovq %rax, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 
%zmm19, %zmm1 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm15, %ymm21 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm17, %ymm22 -; AVX512DQ-BW-NEXT: vporq %ymm21, %ymm22, %ymm21 +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128] +; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm16, %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] +; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm19, %ymm12 +; AVX512DQ-BW-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm19, %ymm12 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512DQ-BW-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm16, %ymm20 +; AVX512DQ-BW-NEXT: vporq %ymm12, %ymm20, %ymm12 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[2,3,2,3],zmm11[2,3,2,3] +; AVX512DQ-BW-NEXT: movabsq $6971997760142192736, %r11 # imm = 0x60C183060C183060 +; AVX512DQ-BW-NEXT: kmovq %r11, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm11, %zmm11 +; AVX512DQ-BW-NEXT: movabsq $-9150747060186627967, %r11 # imm = 0x8102040810204081 +; AVX512DQ-BW-NEXT: kmovq %r11, %k4 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm1 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm14, %ymm20 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm15, %ymm21 +; AVX512DQ-BW-NEXT: vporq %ymm20, %ymm21, %ymm20 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm22, %ymm15, %ymm15 -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] -; AVX512DQ-BW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm17, %ymm17 -; AVX512DQ-BW-NEXT: vporq %ymm15, %ymm17, %ymm15 +; AVX512DQ-BW-NEXT: vpshufb %ymm22, %ymm14, %ymm14 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512DQ-BW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: 
vpshufb %ymm23, %ymm15, %ymm15 +; AVX512DQ-BW-NEXT: vpor %ymm14, %ymm15, %ymm14 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm25 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] +; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm2, %ymm15 {%k3} ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15 -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm17 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,0,1,1,4,4,5,5] -; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm21 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6] -; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm2, %ymm17 {%k3} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm14, %ymm14 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm2, %ymm2 -; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm14, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830 -; AVX512DQ-BW-NEXT: kmovq %rax, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm2 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm16, %ymm17 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm18, %ymm26 -; AVX512DQ-BW-NEXT: vporq %ymm17, %ymm26, %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm13, %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: movabsq $3485998880071096368, %r11 # imm = 0x3060C183060C1830 +; AVX512DQ-BW-NEXT: kmovq %r11, %k4 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm2 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm16, %ymm15 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm19, %ymm24 +; AVX512DQ-BW-NEXT: vporq %ymm15, %ymm24, %ymm15 ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm18, %ymm18 +; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm19, %ymm19 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX512DQ-BW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpshufb %ymm28, %ymm16, %ymm16 -; AVX512DQ-BW-NEXT: vporq %ymm18, %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vporq %ymm19, %ymm16, %ymm16 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %ymm16 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm29, %ymm17 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm18 = ymm3[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512DQ-BW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm17, %zmm16 {%k5} -; AVX512DQ-BW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm2 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm16 -; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm16, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm17, %ymm11 -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm11, %ymm5 -; AVX512DQ-BW-NEXT: vpshufb %ymm22, %ymm16, %ymm11 -; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm17, %ymm18 -; AVX512DQ-BW-NEXT: vporq %ymm11, %ymm18, %ymm11 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm11[2,3,2,3],zmm5[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm25 +; AVX512DQ-BW-NEXT: vpermw %ymm16, %ymm29, %ymm19 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm16, %zmm16 +; AVX512DQ-BW-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 +; AVX512DQ-BW-NEXT: kmovq %r10, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm15 {%k5} +; AVX512DQ-BW-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 +; AVX512DQ-BW-NEXT: kmovq %r10, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm2 {%k5} +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm15, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm16, %ymm10 +; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm10, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb %ymm22, %ymm15, %ymm10 +; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm16, %ymm19 +; AVX512DQ-BW-NEXT: vporq %ymm10, %ymm19, %ymm10 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm10[2,3,2,3],zmm4[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm27 -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,1,1,4,4,5,5] -; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm25, %ymm11 {%k3} -; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm25, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm27, %ymm10 -; 
AVX512DQ-BW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %ymm18 -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm18, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %ymm21 -; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm21, %ymm10 +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm24, %ymm10 {%k3} +; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm24, %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm27, %ymm9 +; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512DQ-BW-NEXT: vpshufb %ymm17, %ymm19, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %ymm22 +; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm22, %ymm9 +; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm19, %ymm9 +; AVX512DQ-BW-NEXT: vpshufb %ymm28, %ymm22, %ymm10 ; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm18, %ymm10 -; AVX512DQ-BW-NEXT: vpshufb %ymm28, %ymm21, %ymm11 -; AVX512DQ-BW-NEXT: vpor %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm29, %zmm10 -; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k5} -; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k5} -; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm16, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm17, %ymm10 -; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm19 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm20 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm12, %xmm12 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm26 -; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm27, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm25, %ymm12 -; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm13 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm24, %xmm24 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm24, %zmm9 -; AVX512DQ-BW-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm26, %zmm9 {%k5} -; AVX512DQ-BW-NEXT: vpshufb 
%ymm14, %ymm21, %ymm14 -; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm18, %ymm15 -; AVX512DQ-BW-NEXT: vporq %ymm14, %ymm15, %ymm24 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm14 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm28, %xmm28 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] +; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm29, %zmm9 +; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %r10 # imm = 0x1020408102040810 +; AVX512DQ-BW-NEXT: kmovq %r10, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm8 {%k5} +; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %r10 # imm = 0x1C3870E1C3870E1C +; AVX512DQ-BW-NEXT: kmovq %r10, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm4 {%k5} +; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm15, %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm16, %ymm9 +; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm18 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm11, %xmm11 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm26 +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm27, %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm24, %ymm11 +; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm11, %ymm8 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm20 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm21 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm28, %xmm28 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm28, %zmm24 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm28, %zmm28 -; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm28, %zmm24 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm28 -; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 -; AVX512DQ-BW-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k5} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm29 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm24 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm24[2,2,3,3,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm25, %ymm27 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm28, %zmm8 +; AVX512DQ-BW-NEXT: movabsq $435749860008887046, %rcx # imm = 0x60C183060C18306 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm26, %zmm8 {%k5} +; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm22, %ymm13 +; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm19, %ymm14 +; AVX512DQ-BW-NEXT: vporq %ymm13, %ymm14, %ymm30 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm26 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm28 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm31 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-NEXT: vpshufb %xmm29, %xmm31, %xmm31 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm31 = ymm31[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm30, %zmm31, %zmm30 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm31 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm31, %zmm31 +; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm31, %zmm30 {%k5} +; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm30, %zmm8 {%k5} +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm27 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm27[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm24, %ymm27 {%k2} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-NEXT: vpshufb %xmm24, %xmm4, %xmm4 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[2,3,2,3],zmm4[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm25 -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm17, %ymm4 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm20[0],xmm19[1],xmm20[1],xmm19[2],xmm20[2],xmm19[3],xmm20[3],xmm19[4],xmm20[4],xmm19[5],xmm20[5],xmm19[6],xmm20[6],xmm19[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm6, %xmm6 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[2,3,2,3],zmm6[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm17 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k4} -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm21, %ymm6 -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm18, %ymm7 -; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm7, %ymm7 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm17[0],xmm25[0],xmm17[1],xmm25[1],xmm17[2],xmm25[2],xmm17[3],xmm25[3],xmm17[4],xmm25[4],xmm17[5],xmm25[5],xmm17[6],xmm25[6],xmm17[7],xmm25[7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm6, 
%xmm8, %xmm8 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[2,3,2,3],zmm8[0,1,0,1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm8, %zmm8 -; AVX512DQ-BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512DQ-BW-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E -; AVX512DQ-BW-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm20, %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm19, %xmm21 -; AVX512DQ-BW-NEXT: vporq %xmm8, %xmm21, %xmm8 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] -; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm19, %xmm19 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm29, %xmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm28, %xmm22 -; AVX512DQ-BW-NEXT: vporq %xmm20, %xmm22, %xmm20 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm22, %xmm22 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,0,1],zmm22[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm25, %xmm22 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm17, %xmm27 -; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm27, %xmm22 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm25[8],xmm17[9],xmm25[9],xmm17[10],xmm25[10],xmm17[11],xmm25[11],xmm17[12],xmm25[12],xmm17[13],xmm25[13],xmm17[14],xmm25[14],xmm17[15],xmm25[15] -; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm17, %xmm17 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm22[0,1,0,1],zmm17[0,1,0,1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-NEXT: vpermw %zmm3, %zmm22, %zmm3 -; AVX512DQ-BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm17, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm11, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm10, %xmm7 -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm7, %xmm3 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] -; AVX512DQ-BW-NEXT: vpshufb %xmm16, 
%xmm7, %xmm7 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,0,1],zmm3[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm13, %xmm7 -; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm12, %xmm10 -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] +; AVX512DQ-BW-NEXT: vpshufb %xmm24, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm27[2,3,2,3],zmm3[0,1,0,1] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3],xmm17[4],xmm18[4],xmm17[5],xmm18[5],xmm17[6],xmm18[6],xmm17[7],xmm18[7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm27 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm16, %ymm5 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,3,3,4,6,7,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm5 {%k3} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[2,3,2,3],zmm3[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm30, %zmm3 {%k4} +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm22, %ymm5 +; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm19, %ymm6 +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm6, %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm28[0],xmm26[0],xmm28[1],xmm26[1],xmm28[2],xmm26[2],xmm28[3],xmm26[3],xmm28[4],xmm26[4],xmm28[5],xmm26[5],xmm28[6],xmm26[6],xmm28[7],xmm26[7] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[2,3,2,3],zmm7[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: movabsq $580999813345182728, %rcx # imm = 0x810204081020408 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm6 {%k2} +; AVX512DQ-BW-NEXT: movabsq $1016749673354069774, %rcx # imm = 0xE1C3870E1C3870E +; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm18, %xmm7 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 +; AVX512DQ-BW-NEXT: vporq %xmm7, %xmm16, %xmm7 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm16, %xmm16 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[0,1,0,1],zmm16[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm21, %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm20, %xmm19 +; AVX512DQ-BW-NEXT: vporq %xmm17, %xmm19, %xmm17 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] +; 
AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm19, %xmm19 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm17[0,1,0,1],zmm19[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm26, %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm28, %xmm21 +; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm21, %xmm19 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm28[8],xmm26[8],xmm28[9],xmm26[9],xmm28[10],xmm26[10],xmm28[11],xmm26[11],xmm28[12],xmm26[12],xmm28[13],xmm26[13],xmm28[14],xmm26[14],xmm28[15],xmm26[15] +; AVX512DQ-BW-NEXT: vpshufb %xmm29, %xmm21, %xmm21 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[0,1,0,1],zmm21[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm21, %zmm21 +; AVX512DQ-BW-NEXT: movabsq $290499906672591364, %rcx # imm = 0x408102040810204 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm21, %zmm19 {%k1} +; AVX512DQ-BW-NEXT: movabsq $-8714997200177740921, %rcx # imm = 0x870E1C3870E1C387 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm19, %zmm17 {%k1} +; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm9, %xmm15 +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm9, %xmm9 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,0,1],zmm6[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm12, %xmm9 +; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm11, %xmm10 +; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] ; AVX512DQ-BW-NEXT: vpshufb %xmm24, %xmm10, %xmm10 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1] -; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm14, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm15, %xmm8 -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm8, %xmm3 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm6[0,1,0,1],zmm3[0,1,0,1] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,0,1],zmm9[0,1,0,1] +; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm13, %xmm6 +; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm14, %xmm7 +; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-BW-NEXT: 
vpshufb %xmm5, %xmm7, %xmm5 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm6[0,1,0,1] ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm6, %zmm0 -; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 -; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 -; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1} -; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm9 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 384(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -10828,222 +10828,222 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128] ; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] ; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm5 -; AVX512DQ-BW-FCP-NEXT: 
vpor %ymm2, %ymm5, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm17, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512DQ-BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm16, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512DQ-BW-FCP-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm16, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm15, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm11, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[2,3,2,3],zmm1[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18 ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128] ; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm18, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm17, %ymm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm12, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm22, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm18, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm14[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm22, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512DQ-BW-FCP-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm17, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[2,3,2,3],zmm11[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060 ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm11, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movabsq $-9150747060186627967, %r10 # imm = 0x8102040810204081 ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm1, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm23, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm1 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm14, %ymm23, %ymm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm13, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] ; AVX512DQ-BW-FCP-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm11, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm16, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm17, %ymm26 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm1, %ymm26, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm16, %ymm26 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm2, %ymm26, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm15, %ymm15 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] ; AVX512DQ-BW-FCP-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2 ; AVX512DQ-BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm18, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm22, %ymm28 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqu8 %zmm23, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm17, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm22, %ymm28 ; AVX512DQ-BW-FCP-NEXT: vporq %ymm23, %ymm28, %ymm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm22, %ymm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] ; AVX512DQ-BW-FCP-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm18, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm22, %ymm18, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm17, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm22, %ymm17, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm23, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm22 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm30, %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm2[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm22 -; AVX512DQ-BW-FCP-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102 -; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm18 {%k3} -; AVX512DQ-BW-FCP-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3 -; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm1 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm22, %ymm30, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm22 = ymm22[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm22, %zmm22 +; AVX512DQ-BW-FCP-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm17 {%k3} +; AVX512DQ-BW-FCP-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm2 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm22, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm19, %ymm20, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm18, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm20, %ymm23, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[2,3,2,3],zmm19[2,3,2,3] -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm19, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm22, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm18, %ymm19, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm19, %ymm23, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm18[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm18, %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm23 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm19, %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm23, %ymm25 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm24, %ymm25, %ymm24 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm20, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm24, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm20, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm24, %ymm25 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm13, %ymm25, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[2,3,2,3],zmm12[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm30, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm18, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm23, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm10, %ymm24, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm19, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm20, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm10, %ymm21, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm19, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm21, %ymm24, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm21[2,3,2,3],zmm10[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm30, %zmm21 ; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm12 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm10 {%k3} ; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm19, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm23, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm7 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm18, %ymm10 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm23, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm25, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm22, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm18, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm11, %ymm14, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm24, %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm24, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm22, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm17, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm10, %ymm13, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm26, %xmm26 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm26, %zmm10 ; AVX512DQ-BW-FCP-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm26, %zmm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm20, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm28, %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm28, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm25, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, 
%ymm20, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm19, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm16, %ymm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm27 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm27, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm27 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm27, %zmm27 ; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm28, %zmm27 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm26 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm27 ; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm11 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm26, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm26 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm22, %ymm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm18, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm22[0],xmm27[1],xmm22[1],xmm27[2],xmm22[2],xmm27[3],xmm22[3],xmm27[4],xmm22[4],xmm27[5],xmm22[5],xmm27[6],xmm22[6],xmm27[7],xmm22[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm9[2,3,2,3],zmm18[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm29 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm17, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm26[0],xmm22[0],xmm26[1],xmm22[1],xmm26[2],xmm22[2],xmm26[3],xmm22[3],xmm26[4],xmm22[4],xmm26[5],xmm22[5],xmm26[6],xmm22[6],xmm26[7],xmm22[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm9[2,3,2,3],zmm17[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm28 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm23, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm18, %ymm4 ; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm27[0],xmm28[0],xmm27[1],xmm28[1],xmm27[2],xmm28[2],xmm27[3],xmm28[3],xmm27[4],xmm28[4],xmm27[5],xmm28[5],xmm27[6],xmm28[6],xmm27[7],xmm28[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 
{{.*#+}} xmm9 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm4[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm24, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm20, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm20, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm19, %ymm5 ; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm23[0],xmm19[1],xmm23[1],xmm19[2],xmm23[2],xmm19[3],xmm23[3],xmm19[4],xmm23[4],xmm19[5],xmm23[5],xmm19[6],xmm23[6],xmm19[7],xmm23[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm6[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm6, %zmm6 ; AVX512DQ-BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k2} @@ -11051,75 +11051,75 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm29, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm28, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm5, %xmm20, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm20, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm5[0,1,0,1],zmm20[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm22, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm27, %xmm28 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm5, %xmm28, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm27[8],xmm22[9],xmm27[9],xmm22[10],xmm27[10],xmm22[11],xmm27[11],xmm22[12],xmm27[12],xmm22[13],xmm27[13],xmm22[14],xmm27[14],xmm22[15],xmm27[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm22, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm28, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm27, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm5, %xmm19, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = 
xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm19, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm5[0,1,0,1],zmm19[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm22, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm26, %xmm27 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm5, %xmm27, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm26[8],xmm22[9],xmm26[9],xmm22[10],xmm26[10],xmm22[11],xmm26[11],xmm22[12],xmm26[12],xmm22[13],xmm26[13],xmm22[14],xmm26[14],xmm22[15],xmm26[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm22, %xmm22 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm22[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm19, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm27, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm23[8],xmm19[9],xmm23[9],xmm19[10],xmm23[10],xmm19[11],xmm23[11],xmm19[12],xmm23[12],xmm19[13],xmm23[13],xmm19[14],xmm23[14],xmm19[15],xmm23[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm19, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm22[0,1,0,1],zmm19[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm22, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm23, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm26 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm26, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm23[8],xmm18[9],xmm23[9],xmm18[10],xmm23[10],xmm18[11],xmm23[11],xmm18[12],xmm23[12],xmm18[13],xmm23[13],xmm18[14],xmm23[14],xmm18[15],xmm23[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm22[0,1,0,1],zmm18[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm22, %zmm22 ; AVX512DQ-BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm12, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = 
xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,0,1],zmm2[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm14, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm11, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm6, %xmm17, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,0,1],zmm6[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm14, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm9[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm16, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm17, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm15, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm16, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,0,1],zmm6[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1} +; 
AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index d8ee8103cee50..dbbd6b19b2829 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -61,23 +61,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -117,23 +105,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; 
AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,512] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,512] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -173,23 +149,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,3] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,0,0,3] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; XOPAVX1: # %bb.0: @@ -229,23 +193,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; 
AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,0,67108864] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,0,67108864] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -285,23 +237,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -341,23 +281,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; 
AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,0,1536] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,0,1536] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -397,23 +325,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,7] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,0,7] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -453,23 +369,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -509,23 +413,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -565,23 +457,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; 
AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,2560,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxwd {{.*#+}} ymm1 = [0,0,0,0,0,2560,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -621,23 +501,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,11,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,11,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -677,23 +545,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; 
AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,201326592,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,0,201326592,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -733,23 +589,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -789,23 +633,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,3584,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxwq {{.*#+}} ymm1 = [0,0,3584,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -845,23 +677,11 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,15,0] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,15,0] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -1909,24 +1729,12 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] +; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: @@ -1962,24 +1770,12 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0] +; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: @@ -2015,24 +1811,12 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -2068,24 +1852,12 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -2121,24 +1893,12 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: @@ -2174,24 +1934,12 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VLBW-NEXT: retq ; -; AVX512VLVBMI-SLOW-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-SLOW: # %bb.0: -; AVX512VLVBMI-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-SLOW-NEXT: retq -; -; AVX512VLVBMI-FAST-ALL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-ALL: # %bb.0: -; AVX512VLVBMI-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512VLVBMI-FAST-ALL-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-FAST-ALL-NEXT: retq -; -; AVX512VLVBMI-FAST-PERLANE-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VLVBMI-FAST-PERLANE: # %bb.0: -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVBMI-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLVBMI-FAST-PERLANE-NEXT: retq +; AVX512VLVBMI-LABEL: 
shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; XOPAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index a01e6ca4b175d..d2c64a462a3e7 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1605,20 +1605,20 @@ define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x
 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm6
 ; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm7
 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm8
-; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm9
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm7
+; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm8
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,6,7],zmm4[2,3,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm4
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm5
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm8
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[2,3,6,7],zmm5[2,3,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[4,5,6,7],zmm0[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[4,5,6,7],zmm1[4,5,6,7]
 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,6,7],zmm0[2,3,6,7]
-; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rdi)
 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rdi)
-; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rdi)
+; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rdi)
+; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rdi)
 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq

From 1997073a547528f94b11a111a2eae3676a216127 Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Tue, 8 Apr 2025 11:38:27 +0100
Subject: [PATCH 0966/1029] [LLVM][InstCombine][SVE] Refactor sve.mul/fmul combines. (#134116)

After https://github.com/llvm/llvm-project/issues/126928 it's now possible
to rewrite the existing combines, which mostly only handle cases where an
operand is an identity value, to use existing simplify code to unlock
general constant folding.
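For readers skimming the diff below, the functional core of the rewrite is
small. A condensed paraphrase of the new flow (names follow the patch;
this is a sketch, not the code itself, and the constant-canonicalisation
step is omitted for brevity):

    // Treat sve.[f]mul(pg, a, b) as the matching IR binop on active
    // lanes: strip dup-style wrappers, let the generic simplifier fold,
    // then reinstate inactive lanes with a select where they are defined.
    Value *Pg = II.getOperand(0);
    Value *Op1 = stripInactiveLanes(II.getOperand(1), Pg);
    Value *Op2 = stripInactiveLanes(II.getOperand(2), Pg);
    if (Value *Simple = simplifyBinOp(Opc, Op1, Op2, II.getDataLayout())) {
      if (IInfo.inactiveLanesAreNotDefined())   // _u forms: done.
        return IC.replaceInstUsesWith(II, Simple);
      Value *Inactive =
          II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
      if (Simple != Inactive)                   // Preserve inactive lanes.
        Simple = IC.Builder.CreateSelect(Pg, Simple, Inactive);
      return IC.replaceInstUsesWith(II, Simple);
    }

The floating-point path additionally threads the call's fast-math flags
into simplifyBinOp, as the full implementation in the diff shows.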
--- .../AArch64/AArch64TargetTransformInfo.cpp | 123 ++++++++++----- .../AArch64/sve-intrinsic-fmul-idempotency.ll | 18 +-- .../sve-intrinsic-fmul_u-idempotency.ll | 5 +- .../AArch64/sve-intrinsic-mul-idempotency.ll | 18 +-- .../sve-intrinsic-mul_u-idempotency.ll | 5 +- .../AArch64/sve-intrinsic-simplify-binop.ll | 146 ++++++++++++++++++ 6 files changed, 250 insertions(+), 65 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 417af74f712e7..234dfa07bdc1a 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1111,6 +1111,19 @@ struct SVEIntrinsicInfo { return *this; } + bool hasMatchingIROpode() const { return IROpcode != 0; } + + unsigned getMatchingIROpode() const { + assert(hasMatchingIROpode() && "Propery not set!"); + return IROpcode; + } + + SVEIntrinsicInfo &setMatchingIROpcode(unsigned Opcode) { + assert(!hasMatchingIROpode() && "Cannot set property twice!"); + IROpcode = Opcode; + return *this; + } + // // Properties relating to the result of inactive lanes. // @@ -1186,6 +1199,7 @@ struct SVEIntrinsicInfo { unsigned GoverningPredicateIdx = std::numeric_limits::max(); Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic; + unsigned IROpcode = 0; enum PredicationStyle { Uninitialized, @@ -1269,7 +1283,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { case Intrinsic::aarch64_sve_fmls: return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u); case Intrinsic::aarch64_sve_fmul: - return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u); + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u) + .setMatchingIROpcode(Instruction::FMul); case Intrinsic::aarch64_sve_fmulx: return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u); case Intrinsic::aarch64_sve_fnmla: @@ -1285,7 +1300,8 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { case Intrinsic::aarch64_sve_mls: return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u); case Intrinsic::aarch64_sve_mul: - return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u); + return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u) + .setMatchingIROpcode(Instruction::Mul); case Intrinsic::aarch64_sve_sabd: return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u); case Intrinsic::aarch64_sve_smax: @@ -1323,6 +1339,13 @@ static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II) { case Intrinsic::aarch64_sve_uqsub: return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u); + case Intrinsic::aarch64_sve_fmul_u: + return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( + Instruction::FMul); + case Intrinsic::aarch64_sve_mul_u: + return SVEIntrinsicInfo::defaultUndefOp().setMatchingIROpcode( + Instruction::Mul); + case Intrinsic::aarch64_sve_addqv: case Intrinsic::aarch64_sve_and_z: case Intrinsic::aarch64_sve_bic_z: @@ -2205,45 +2228,63 @@ static std::optional instCombineSVEVectorSub(InstCombiner &IC, return std::nullopt; } -static std::optional instCombineSVEVectorMul(InstCombiner &IC, - IntrinsicInst &II) { - auto *OpPredicate = II.getOperand(0); - auto *OpMultiplicand = II.getOperand(1); - auto *OpMultiplier = II.getOperand(2); +// Simplify `V` by only considering the operations that 
affect active lanes. +// This function should only return existing Values or newly created Constants. +static Value *stripInactiveLanes(Value *V, const Value *Pg) { + auto *Dup = dyn_cast(V); + if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup && + Dup->getOperand(1) == Pg && isa(Dup->getOperand(2))) + return ConstantVector::getSplat( + cast(V->getType())->getElementCount(), + cast(Dup->getOperand(2))); + + return V; +} - // Return true if a given instruction is a unit splat value, false otherwise. - auto IsUnitSplat = [](auto *I) { - auto *SplatValue = getSplatValue(I); - if (!SplatValue) - return false; - return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); - }; +static std::optional +instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, + const SVEIntrinsicInfo &IInfo) { + const unsigned Opc = IInfo.getMatchingIROpode(); + if (!Instruction::isBinaryOp(Opc)) + return std::nullopt; - // Return true if a given instruction is an aarch64_sve_dup intrinsic call - // with a unit splat value, false otherwise. - auto IsUnitDup = [](auto *I) { - auto *IntrI = dyn_cast(I); - if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) - return false; + Value *Pg = II.getOperand(0); + Value *Op1 = II.getOperand(1); + Value *Op2 = II.getOperand(2); + const DataLayout &DL = II.getDataLayout(); - auto *SplatValue = IntrI->getOperand(2); - return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); - }; + // Canonicalise constants to the RHS. + if (Instruction::isCommutative(Opc) && IInfo.inactiveLanesAreNotDefined() && + isa(Op1) && !isa(Op2)) { + IC.replaceOperand(II, 1, Op2); + IC.replaceOperand(II, 2, Op1); + return &II; + } - if (IsUnitSplat(OpMultiplier)) { - // [f]mul pg %n, (dupx 1) => %n - OpMultiplicand->takeName(&II); - return IC.replaceInstUsesWith(II, OpMultiplicand); - } else if (IsUnitDup(OpMultiplier)) { - // [f]mul pg %n, (dup pg 1) => %n - auto *DupInst = cast(OpMultiplier); - auto *DupPg = DupInst->getOperand(1); - // TODO: this is naive. The optimization is still valid if DupPg - // 'encompasses' OpPredicate, not only if they're the same predicate. - if (OpPredicate == DupPg) { - OpMultiplicand->takeName(&II); - return IC.replaceInstUsesWith(II, OpMultiplicand); - } + // Only active lanes matter when simplifying the operation. + Op1 = stripInactiveLanes(Op1, Pg); + Op2 = stripInactiveLanes(Op2, Pg); + + Value *SimpleII; + if (auto FII = dyn_cast(&II)) + SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL); + else + SimpleII = simplifyBinOp(Opc, Op1, Op2, DL); + + if (SimpleII) { + if (IInfo.inactiveLanesAreNotDefined()) + return IC.replaceInstUsesWith(II, SimpleII); + + Value *Inactive = + II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()); + + // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)). + if (SimpleII == Inactive) + return IC.replaceInstUsesWith(II, SimpleII); + + // Inactive lanes must be preserved. 
+ SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive); + return IC.replaceInstUsesWith(II, SimpleII); } return instCombineSVEVectorBinOp(IC, II); @@ -2650,9 +2691,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, case Intrinsic::aarch64_sve_fadd_u: return instCombineSVEVectorFAddU(IC, II); case Intrinsic::aarch64_sve_fmul: - return instCombineSVEVectorMul(IC, II); + return instCombineSVEVectorMul(IC, II, IInfo); case Intrinsic::aarch64_sve_fmul_u: - return instCombineSVEVectorMul(IC, II); + return instCombineSVEVectorMul(IC, II, IInfo); case Intrinsic::aarch64_sve_fsub: return instCombineSVEVectorFSub(IC, II); case Intrinsic::aarch64_sve_fsub_u: @@ -2664,9 +2705,9 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, Intrinsic::aarch64_sve_mla_u>( IC, II, true); case Intrinsic::aarch64_sve_mul: - return instCombineSVEVectorMul(IC, II); + return instCombineSVEVectorMul(IC, II, IInfo); case Intrinsic::aarch64_sve_mul_u: - return instCombineSVEVectorMul(IC, II); + return instCombineSVEVectorMul(IC, II, IInfo); case Intrinsic::aarch64_sve_sub: return instCombineSVEVectorSub(IC, II); case Intrinsic::aarch64_sve_sub_u: diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll index f612e5a44ebba..3b37e2c1fddef 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul-idempotency.ll @@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu" ; Idempotent fmuls -- should compile to just a ret. define @idempotent_fmul_f16( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_fmul_f16( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: ret [[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret [[A]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv8f16(half 1.0) %2 = call @llvm.aarch64.sve.fmul.nxv8f16( %pg, %a, %1) @@ -16,8 +16,8 @@ define @idempotent_fmul_f16( %pg, @idempotent_fmul_f32( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_fmul_f32( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv4f32(float 1.0) %2 = call @llvm.aarch64.sve.fmul.nxv4f32( %pg, %a, %1) @@ -26,8 +26,8 @@ define @idempotent_fmul_f32( %pg, @idempotent_fmul_f64( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_fmul_f64( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0) %2 = call @llvm.aarch64.sve.fmul.nxv2f64( %pg, %a, %1) @@ -37,7 +37,7 @@ define @idempotent_fmul_f64( %pg, @idempotent_fmul_different_argument_order( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_fmul_different_argument_order( ; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.nxv2f64( [[PG]], splat (double 1.000000e+00), [[A]]) +; CHECK-NEXT: [[TMP1:%.*]] = select [[PG]], [[A]], splat (double 1.000000e+00) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0) @@ -48,8 +48,8 @@ define @idempotent_fmul_different_argument_order( @idempotent_fmul_with_predicated_dup( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_fmul_with_predicated_dup( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret 
[[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] ; %1 = call @llvm.aarch64.sve.dup.nxv8f16( poison, %pg, half 1.0) %2 = call @llvm.aarch64.sve.fmul.nxv8f16( %pg, %a, %1) diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll index bd3d7be0a1b80..38ed4272d826c 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-fmul_u-idempotency.ll @@ -36,9 +36,8 @@ define @idempotent_fmul_u_f64( %pg, @idempotent_fmul_u_different_argument_order( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_fmul_u_different_argument_order( -; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.fmul.u.nxv2f64( [[PG]], splat (double 1.000000e+00), [[A]]) -; CHECK-NEXT: ret [[TMP1]] +; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[TMP0]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv2f64(double 1.0) ; Different argument order to the above tests. diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll index cbdcfc6b110b3..602db4eb1d429 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul-idempotency.ll @@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu" ; Idempotent muls -- should compile to just a ret. define @idempotent_mul_i16( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_mul_i16( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: ret [[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: ret [[A]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv8i16(i16 1) %2 = call @llvm.aarch64.sve.mul.nxv8i16( %pg, %a, %1) @@ -16,8 +16,8 @@ define @idempotent_mul_i16( %pg, @idempotent_mul_i32( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_mul_i32( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv4i32(i32 1) %2 = call @llvm.aarch64.sve.mul.nxv4i32( %pg, %a, %1) @@ -26,8 +26,8 @@ define @idempotent_mul_i32( %pg, @idempotent_mul_i64( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_mul_i64( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 1) %2 = call @llvm.aarch64.sve.mul.nxv2i64( %pg, %a, %1) @@ -37,7 +37,7 @@ define @idempotent_mul_i64( %pg, @idempotent_mul_different_argument_order( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_mul_different_argument_order( ; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.mul.nxv2i64( [[PG]], splat (i64 1), [[A]]) +; CHECK-NEXT: [[TMP1:%.*]] = select [[PG]], [[A]], splat (i64 1) ; CHECK-NEXT: ret [[TMP1]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 1) @@ -48,8 +48,8 @@ define @idempotent_mul_different_argument_order( @idempotent_mul_with_predicated_dup( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_mul_with_predicated_dup( -; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: ret [[TMP0]] +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret 
[[A]] ; %1 = call @llvm.aarch64.sve.dup.nxv8i16( poison, %pg, i16 1) %2 = call @llvm.aarch64.sve.mul.nxv8i16( %pg, %a, %1) diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll index 8144e56b979f0..e899c787aa555 100644 --- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-mul_u-idempotency.ll @@ -36,9 +36,8 @@ define @idempotent_mul_u_i64( %pg, @idempotent_mul_u_different_argument_order( %pg, %a) #0 { ; CHECK-LABEL: define @idempotent_mul_u_different_argument_order( -; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.mul.u.nxv2i64( [[PG]], splat (i64 1), [[A]]) -; CHECK-NEXT: ret [[TMP1]] +; CHECK-SAME: [[PG:%.*]], [[TMP0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[TMP0]] ; %1 = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 1) ; Different argument order to the above tests. diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll new file mode 100644 index 0000000000000..7da55a199df28 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-simplify-binop.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; The follow tests verify the mechanics of simplification. The operation is not +; important beyond being commutative with a known identity value. + +define @commute_constant_to_rhs( %pg, %a) #0 { +; CHECK-LABEL: define @commute_constant_to_rhs( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[R:%.*]] = call @llvm.aarch64.sve.mul.u.nxv4i32( [[PG]], [[A]], splat (i32 303)) +; CHECK-NEXT: ret [[R]] +; + %r = call @llvm.aarch64.sve.mul.u.nxv4i32( %pg, splat (i32 303), %a) + ret %r +} + +; Inactive lanes are important, which make the operation non-commutative. 
+define @cannot_commute_constant_to_rhs( %pg, %a) #0 { +; CHECK-LABEL: define @cannot_commute_constant_to_rhs( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[R:%.*]] = call @llvm.aarch64.sve.mul.nxv4i32( [[PG]], splat (i32 303), [[A]]) +; CHECK-NEXT: ret [[R]] +; + %r = call @llvm.aarch64.sve.mul.nxv4i32( %pg, splat (i32 303), %a) + ret %r +} + +define @idempotent_mul( %pg, %a) #0 { +; CHECK-LABEL: define @idempotent_mul( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] +; + %r = call @llvm.aarch64.sve.mul.nxv4i32( %pg, %a, splat (i32 1)) + ret %r +} + +define @idempotent_mul_ops_reverse( %pg, %a) #0 { +; CHECK-LABEL: define @idempotent_mul_ops_reverse( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[R:%.*]] = select [[PG]], [[A]], splat (i32 1) +; CHECK-NEXT: ret [[R]] +; + %r = call @llvm.aarch64.sve.mul.nxv4i32( %pg, splat (i32 1), %a) + ret %r +} + +define @idempotent_mul_u( %pg, %a) #0 { +; CHECK-LABEL: define @idempotent_mul_u( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] +; + %r = call @llvm.aarch64.sve.mul.u.nxv4i32( %pg, %a, splat (i32 1)) + ret %r +} + +define @idempotent_mul_u_ops_reverse( %pg, %a) #0 { +; CHECK-LABEL: define @idempotent_mul_u_ops_reverse( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret [[A]] +; + %r = call @llvm.aarch64.sve.mul.u.nxv4i32( %pg, splat (i32 1), %a) + ret %r +} + +; Show that we only need to know the active lanes are constant. +; TODO: We can do better here because we can use %a directly as part of the +; select because we know only its inactive lanes will be used. +define @constant_mul_after_striping_inactive_lanes( %pg, %a, %b) #0 { +; CHECK-LABEL: define @constant_mul_after_striping_inactive_lanes( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[A_DUP:%.*]] = call @llvm.aarch64.sve.dup.nxv4i32( [[A]], [[PG]], i32 3) +; CHECK-NEXT: [[R:%.*]] = select [[PG]], splat (i32 6), [[A_DUP]] +; CHECK-NEXT: ret [[R]] +; + %a.dup = call @llvm.aarch64.sve.dup.nxv4i32( %a, %pg, i32 3) + %b.dup = call @llvm.aarch64.sve.dup.nxv4i32( %b, %pg, i32 2) + %r = call @llvm.aarch64.sve.mul.nxv4i32( %pg, %a.dup, %b.dup) + ret %r +} + +; Show that we only need to know the active lanes are constant. +define @constant_mul_u_after_striping_inactive_lanes( %pg, %a, %b) #0 { +; CHECK-LABEL: define @constant_mul_u_after_striping_inactive_lanes( +; CHECK-SAME: [[PG:%.*]], [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret splat (i32 6) +; + %a.dup = call @llvm.aarch64.sve.dup.nxv4i32( %a, %pg, i32 3) + %b.dup = call @llvm.aarch64.sve.dup.nxv4i32( %b, %pg, i32 2) + %3 = call @llvm.aarch64.sve.mul.u.nxv4i32( %pg, %a.dup, %b.dup) + ret %3 +} + +; The follow tests demonstrate the operations for which hooks are in place to +; enable simplification. Given the simplications themselves are common code, it +; is assumed they are already well tested elsewhere. 
+ +define @constant_fmul( %pg) #0 { +; CHECK-LABEL: define @constant_fmul( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[R:%.*]] = select [[PG]], splat (float 4.200000e+01), splat (float 7.000000e+00) +; CHECK-NEXT: ret [[R]] +; + %r = call @llvm.aarch64.sve.fmul.nxv4f32( %pg, splat (float 7.0), splat (float 6.0)) + ret %r +} + +define @constant_fmul_u( %pg) #0 { +; CHECK-LABEL: define @constant_fmul_u( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret splat (float 4.200000e+01) +; + %r = call @llvm.aarch64.sve.fmul.u.nxv4f32( %pg, splat (float 7.0), splat (float 6.0)) + ret %r +} + +define @constant_mul( %pg) #0 { +; CHECK-LABEL: define @constant_mul( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[R:%.*]] = select [[PG]], splat (i32 21), splat (i32 7) +; CHECK-NEXT: ret [[R]] +; + %r = call @llvm.aarch64.sve.mul.nxv4i32( %pg, splat (i32 7), splat (i32 3)) + ret %r +} + +define @constant_mul_u( %pg) #0 { +; CHECK-LABEL: define @constant_mul_u( +; CHECK-SAME: [[PG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: ret splat (i32 21) +; + %r = call @llvm.aarch64.sve.mul.u.nxv4i32( %pg, splat (i32 7), splat (i32 3)) + ret %r +} + + +; repeat only the constant fold tests for fmul(.u) + +declare @llvm.aarch64.sve.dup.nxv4i32(, , i32) + +declare @llvm.aarch64.sve.mul.nxv4i32(, , ) + +declare @llvm.aarch64.sve.mul.u.nxv4i32(, , ) + +attributes #0 = { "target-features"="+sve" } From 7af2b51e761f49974a64c3009882239cea618f2a Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 8 Apr 2025 11:44:12 +0100 Subject: [PATCH 0967/1029] [AArch64][v8.5A] Omit BTI for non-addr-taken static fns on Linux (#134669) This is a conditional revert of cca40aa8d8aa732, which made LLVM's branch-target-enforcement mode generate BTI at the start of _every_ function, even in the case where the function has internal linkage and its address is never taken for use in an indirect call. The rationale was that it might turn out at link time that a direct call to the function spanned a larger distance than the range of a BL instruction (say, if the translation unit generated multiple code sections and the linker put them a very long way apart). Then the linker might insert a long-branch thunk using an indirect call instruction. SYSVABI64 has now clarified that in this situation the static linker may not assume that the target function is safe to call directly. If it needs to use this strategy, it's responsible for also generating a 'landing pad' near the target function, with a BTI followed by a direct branch, and using that as the target of the long-distance indirect call. https://github.com/ARM-software/abi-aa/commit/606ce44fe4d3419c15cd9ed598f18fb5d520fcfc LLD complies with this spec as of commit 098b0d18add97de. So if we're compiling in a mode that respects SYSVABI64, such as targeting Linux, it's safe to leave out the BTI at the start of a function with internal linkage if we can prove that its address isn't either used in an indirect call in _this_ translation unit or passed out of the object. Therefore, this patch goes back to the behavior before cca40aa8d8aa732, leaving out BTIs in functions that can't be called indirectly, but only if the target triple is Linux. (I wasn't able to find a more precise query for "is this a SYSVABI64-compliant platform?", but Linux certainly is, and this check at least fails in the safe direction - if in doubt, we put in all the BTIs that might be necessary.) 
--- .../Target/AArch64/AArch64BranchTargets.cpp | 20 +++++++++++++------ .../AArch64/patchable-function-entry-bti.ll | 16 ++++++++++----- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp index b9feb83339d8d..c60fbb63c73ab 100644 --- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp +++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp @@ -65,6 +65,7 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG( dbgs() << "********** AArch64 Branch Targets **********\n" << "********** Function: " << MF.getName() << '\n'); + const Function &F = MF.getFunction(); // LLVM does not consider basic blocks which are the targets of jump tables // to be address-taken (the address can't escape anywhere else), but they are @@ -78,16 +79,23 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) { bool HasWinCFI = MF.hasWinCFI(); for (MachineBasicBlock &MBB : MF) { bool CouldCall = false, CouldJump = false; - // Even in cases where a function has internal linkage and is only called - // directly in its translation unit, it can still be called indirectly if - // the linker decides to add a thunk to it for whatever reason (say, for - // example, if it is finally placed far from its call site and a BL is not - // long-range enough). PLT entries and tail-calls use BR, but when they are + // If the function is address-taken or externally-visible, it could be + // indirectly called. PLT entries and tail-calls use BR, but when they are // are in guarded pages should all use x16 or x17 to hold the called // address, so we don't need to set CouldJump here. BR instructions in // non-guarded pages (which might be non-BTI-aware code) are allowed to // branch to a "BTI c" using any register. - if (&MBB == &*MF.begin()) + // + // For SysV targets, this is enough, because SYSVABI64 says that if the + // static linker later wants to use an indirect branch instruction in a + // long-branch thunk, it's also responsible for adding a 'landing pad' with + // a BTI, and pointing the indirect branch at that. However, at present + // this guarantee only holds for targets complying with SYSVABI64, so for + // other targets we must assume that `CouldCall` is _always_ true due to + // the risk of long-branch thunks at link time. 
+ if (&MBB == &*MF.begin() && + (!MF.getSubtarget().isTargetLinux() || + (F.hasAddressTaken() || !F.hasLocalLinkage()))) CouldCall = true; // If the block itself is address-taken, it could be indirectly branched diff --git a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll index 85f5f6fa4674a..6d5dfc9d8fae4 100644 --- a/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll +++ b/llvm/test/CodeGen/AArch64/patchable-function-entry-bti.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=aarch64 -aarch64-min-jump-table-entries=4 %s -o - | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-min-jump-table-entries=4 %s -o - | FileCheck %s --check-prefixes=CHECK,SYSV +; RUN: llc -mtriple=aarch64-none-elf -aarch64-min-jump-table-entries=4 %s -o - | FileCheck %s --check-prefixes=CHECK,NONSYSV define void @f0() "patchable-function-entry"="0" "branch-target-enforcement" { ; CHECK-LABEL: f0: @@ -48,20 +49,25 @@ define void @f2_1() "patchable-function-entry"="1" "patchable-function-prefix"=" } ;; -fpatchable-function-entry=1 -mbranch-protection=bti -;; We add BTI c even when the function has internal linkage +;; For SysV compliant targets, we don't add BTI (or create the .Lpatch0 symbol) +;; because the function has internal linkage and isn't address-taken. For +;; non-SysV targets, we do add the BTI, because outside SYSVABI64 there's no +;; spec preventing the static linker from using an indirect call instruction in +;; a long-branch thunk inserted at link time. define internal void @f1i(i64 %v) "patchable-function-entry"="1" "branch-target-enforcement" { ; CHECK-LABEL: f1i: ; CHECK-NEXT: .Lfunc_begin3: ; CHECK: // %bb.0: -; CHECK-NEXT: hint #34 -; CHECK-NEXT: .Lpatch1: +; NONSYSV-NEXT: hint #34 +; NONSYSV-NEXT: .Lpatch1: ; CHECK-NEXT: nop ;; Other basic blocks have BTI, but they don't affect our decision to not create .Lpatch0 ; CHECK: .LBB{{.+}} // %sw.bb1 ; CHECK-NEXT: hint #36 ; CHECK: .section __patchable_function_entries,"awo",@progbits,f1i{{$}} ; CHECK-NEXT: .p2align 3 -; CHECK-NEXT: .xword .Lpatch1 +; NONSYSV-NEXT: .xword .Lpatch1 +; SYSV-NEXT: .xword .Lfunc_begin3 entry: switch i64 %v, label %sw.bb0 [ i64 1, label %sw.bb1 From 204d8c0d588b647e63d925705a7fe12c47242fc9 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Tue, 8 Apr 2025 11:45:47 +0100 Subject: [PATCH 0968/1029] [clang][llvm] Fix AArch64 MOP4{A/S} intrinsic tests (NFC) (#134746) Fix some of the recently-added tests (PRs #127797, #128854, #129226 and #129230) which were incorrectly defined. 
--- .../sme2-intrinsics/acle_sme2_mop4_1x1.c | 4 +- .../sme2-intrinsics/acle_sme2_mop4_1x2.c | 68 +++++++++---------- .../sme2-intrinsics/acle_sme2_mop4_2x1.c | 2 +- .../sme2-intrinsics/acle_sme2_mop4_2x2.c | 4 +- 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c index 94a839d053479..3f2a79dc5c7c3 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x1.c @@ -3,8 +3,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c index 3c8bd372aa547..36e0b754b1d35 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_1x2.c @@ -3,8 +3,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 
-target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s @@ -27,7 +27,7 @@ // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_s8_s8)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_s8_s8)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_s8_s8( @@ -41,7 +41,7 @@ void test_svmop4a_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __a // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_s8_s8)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_s8_s8)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za32_u8_u8( @@ -55,7 +55,7 @@ void test_svmop4s_1x2_za32_s8_s8(svint8_t zn, svint8x2_t zm) __arm_streaming __a // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_u8_u8)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_u8_u8)(1, zn, zm); } // CHECK-LABEL: 
@test_svmop4s_1x2_za32_u8_u8( @@ -69,7 +69,7 @@ void test_svmop4a_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming _ // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_u8_u8)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_u8_u8)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za32_s8_u8( @@ -83,7 +83,7 @@ void test_svmop4s_1x2_za32_u8_u8(svuint8_t zn, svuint8x2_t zm) __arm_streaming _ // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_s8_u8)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_s8_u8)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_s8_u8( @@ -97,7 +97,7 @@ void test_svmop4a_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __ // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_s8_u8)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_s8_u8)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za32_u8_s8( @@ -111,7 +111,7 @@ void test_svmop4s_1x2_za32_s8_u8(svint8_t zn, svuint8x2_t zm) __arm_streaming __ // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_u8_s8)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_u8_s8)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_u8_s8( @@ -125,7 +125,7 @@ void test_svmop4a_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __ // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_u8_s8)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_u8_s8)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za32_s16_s16( @@ -139,7 +139,7 @@ void test_svmop4s_1x2_za32_u8_s8(svuint8_t zn, svint8x2_t zm) __arm_streaming __ // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_s16_s16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_s16_s16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_s16_s16( @@ -153,7 +153,7 @@ void test_svmop4a_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_s16_s16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_s16_s16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za32_u16_u16( @@ -167,7 +167,7 @@ void test_svmop4s_1x2_za32_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_u16_u16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_u16_u16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_u16_u16( @@ -181,7 +181,7 @@ void test_svmop4a_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streami // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_u16_u16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_u16_u16)(1, zn, zm); } // CHECK-LABEL: 
@test_svmop4a_1x2_za32_f16_f16( @@ -195,7 +195,7 @@ void test_svmop4s_1x2_za32_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streami // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_f16_f16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_f16_f16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_f16_f16( @@ -209,7 +209,7 @@ void test_svmop4a_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_f16_f16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_f16_f16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za32_bf16_bf16( @@ -223,7 +223,7 @@ void test_svmop4s_1x2_za32_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_bf16_bf16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_bf16_bf16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_bf16_bf16( @@ -237,7 +237,7 @@ void test_svmop4a_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_s // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_bf16_bf16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_bf16_bf16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za64_s16_s16( @@ -251,7 +251,7 @@ void test_svmop4s_1x2_za32_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_s // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za64,_s16_s16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za64,_s16_s16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za64_s16_s16( @@ -265,7 +265,7 @@ void test_svmop4a_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za64,_s16_s16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za64,_s16_s16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za64_u16_u16( @@ -279,7 +279,7 @@ void test_svmop4s_1x2_za64_s16_s16(svint16_t zn, svint16x2_t zm) __arm_streaming // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za64,_u16_u16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za64,_u16_u16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za64_u16_u16( @@ -293,7 +293,7 @@ void test_svmop4a_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streami // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za64,_u16_u16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za64,_u16_u16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za64_s16_u16( @@ -307,7 +307,7 @@ void test_svmop4s_1x2_za64_u16_u16(svuint16_t zn, svuint16x2_t zm) __arm_streami // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za64,_s16_u16)(1, zn, zm); + 
SME_ACLE_FUNC(svmop4a,_1x2,_za64,_s16_u16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za64_s16_u16( @@ -321,7 +321,7 @@ void test_svmop4a_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streamin // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za64,_s16_u16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za64,_s16_u16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za64_u16_s16( @@ -335,7 +335,7 @@ void test_svmop4s_1x2_za64_s16_u16(svint16_t zn, svuint16x2_t zm) __arm_streamin // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za64,_u16_s16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za64,_u16_s16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za64_u16_s16( @@ -349,7 +349,7 @@ void test_svmop4a_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streamin // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za64,_u16_s16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za64,_u16_s16)(1, zn, zm); } @@ -364,7 +364,7 @@ void test_svmop4s_1x2_za64_u16_s16(svuint16_t zn, svint16x2_t zm) __arm_streamin // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za16,_f16_f16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za16,_f16_f16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za16_f16_f16( @@ -378,7 +378,7 @@ void test_svmop4a_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za16,_f16_f16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za16,_f16_f16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za32_f32_f32( @@ -392,7 +392,7 @@ void test_svmop4s_1x2_za16_f16_f16(svfloat16_t zn, svfloat16x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za32,_f32_f32)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za32,_f32_f32)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za32_f32_f32( @@ -406,7 +406,7 @@ void test_svmop4a_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za32,_f32_f32)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za32,_f32_f32)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za64_f64_f64( @@ -420,7 +420,7 @@ void test_svmop4s_1x2_za32_f32_f32(svfloat32_t zn, svfloat32x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za64,_f64_f64)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za64,_f64_f64)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za64_f64_f64( @@ -434,7 +434,7 @@ void test_svmop4a_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za64,_f64_f64)(1, zn, zm); + 
SME_ACLE_FUNC(svmop4s,_1x2,_za64,_f64_f64)(1, zn, zm); } // CHECK-LABEL: @test_svmop4a_1x2_za16_bf16_bf16( @@ -448,7 +448,7 @@ void test_svmop4s_1x2_za64_f64_f64(svfloat64_t zn, svfloat64x2_t zm) __arm_strea // CPP-CHECK-NEXT: ret void // void test_svmop4a_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4a,_1x2_,za16,_bf16_bf16)(1, zn, zm); + SME_ACLE_FUNC(svmop4a,_1x2,_za16,_bf16_bf16)(1, zn, zm); } // CHECK-LABEL: @test_svmop4s_1x2_za16_bf16_bf16( @@ -462,5 +462,5 @@ void test_svmop4a_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_s // CPP-CHECK-NEXT: ret void // void test_svmop4s_1x2_za16_bf16_bf16(svbfloat16_t zn, svbfloat16x2_t zm) __arm_streaming __arm_inout("za") { - SME_ACLE_FUNC(svmop4s,_1x2_,za16,_bf16_bf16)(1, zn, zm); + SME_ACLE_FUNC(svmop4s,_1x2,_za16,_bf16_bf16)(1, zn, zm); } diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c index e42ed95b9b52c..cd12aa523f7cb 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x1.c @@ -2,7 +2,7 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c index 3e9612e3cc582..f5b6c566c2602 100644 --- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c +++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_2x2.c @@ -3,8 +3,8 @@ // REQUIRES: aarch64-registered-target // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 
-target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme-mop4 -target-feature +sme-f16f16 -target-feature +sme-i16i64 -target-feature +sme-b16b16 -target-feature +sme-f64f64 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s From 739062d2c3b6c21264c415575da47bf59f1dafe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Tue, 8 Apr 2025 12:55:37 +0200 Subject: [PATCH 0969/1029] [SPIR-V] Add spv.gep support for ptrcast legal (#134388) Adds support for the spv.gep intrinsic in the spv ptrcast legalization step. These intrinsics are generated by the backend and are thus not directly visible in the tests. This is a prerequisite for implementing addrspacecast legalization for logical SPIR-V. --- .../Target/SPIRV/SPIRVLegalizePointerCast.cpp | 6 +++ .../pointers/getelementptr-downcast-struct.ll | 47 +++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/pointers/getelementptr-downcast-struct.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp index 560869f9fe62a..5ba4fbb02560d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp @@ -170,6 +170,12 @@ class SPIRVLegalizePointerCast : public FunctionPass { DeadInstructions.push_back(Intrin); continue; } + + if (Intrin->getIntrinsicID() == Intrinsic::spv_gep) { + GR->replaceAllUsesWith(CastedOperand, OriginalOperand, + /* DeleteOld= */ false); + continue; + } } llvm_unreachable("Unsupported ptrcast user.
Please fix."); diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-downcast-struct.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-downcast-struct.ll new file mode 100644 index 0000000000000..b0a68a30e29be --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-downcast-struct.ll @@ -0,0 +1,47 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s --match-full-lines +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#uint64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#uint_pp:]] = OpTypePointer Private %[[#uint]] +; CHECK-DAG: %[[#uint_0:]] = OpConstant %[[#uint]] 0 +; CHECK-DAG: %[[#uint_1:]] = OpConstant %[[#uint]] 1 +; CHECK-DAG: %[[#uint_10:]] = OpConstant %[[#uint]] 10 +; CHECK-DAG: %[[#t_array:]] = OpTypeArray %[[#uint]] %[[#uint_10]] +; CHECK-DAG: %[[#t_s1:]] = OpTypeStruct %[[#t_array]] +; CHECK-DAG: %[[#t_s2_s_a_s:]] = OpTypeStruct %[[#uint]] %[[#uint]] +; CHECK-DAG: %[[#t_s2_s_a:]] = OpTypeArray %[[#t_s2_s_a_s]] %[[#uint_10]] +; CHECK-DAG: %[[#t_s2_s:]] = OpTypeStruct %[[#t_s2_s_a]] +; CHECK-DAG: %[[#t_s2:]] = OpTypeStruct %[[#t_s2_s]] %[[#uint]] +; CHECK-DAG: %[[#null_s1:]] = OpConstantNull %[[#t_s1]] +; CHECK-DAG: %[[#null_s2:]] = OpConstantNull %[[#t_s2]] +; CHECK-DAG: %[[#ptr_s1:]] = OpTypePointer Private %[[#t_s1]] +; CHECK-DAG: %[[#ptr_s2:]] = OpTypePointer Private %[[#t_s2]] + +%S1 = type { [10 x i32] } +%S2 = type { { [10 x { i32, i32 } ] }, i32 } + +; CHECK-DAG: %[[#global1:]] = OpVariable %[[#ptr_s1]] Private %[[#null_s1]] +@global1 = internal addrspace(10) global %S1 zeroinitializer +; CHECK-DAG: %[[#global2:]] = OpVariable %[[#ptr_s2]] Private %[[#null_s2]] +@global2 = internal addrspace(10) global %S2 zeroinitializer + +define spir_func noundef i32 @foo(i64 noundef %index) local_unnamed_addr { +; CHECK: %[[#index:]] = OpFunctionParameter %[[#uint64]] +entry: +; CHECK: %[[#ptr:]] = OpInBoundsAccessChain %[[#uint_pp]] %[[#global1]] %[[#uint_0]] %[[#index]] + %ptr = getelementptr inbounds %S1, ptr addrspace(10) @global1, i64 0, i32 0, i64 %index +; CHECK: %[[#val:]] = OpLoad %[[#uint]] %[[#ptr]] Aligned 4 + %val = load i32, ptr addrspace(10) %ptr + ret i32 %val +} + +define spir_func noundef i32 @bar(i64 noundef %index) local_unnamed_addr { +; CHECK: %[[#index:]] = OpFunctionParameter %[[#uint64]] +entry: +; CHECK: %[[#ptr:]] = OpInBoundsAccessChain %[[#uint_pp]] %[[#global2]] %[[#uint_0]] %[[#uint_0]] %[[#index]] %[[#uint_1]] + %ptr = getelementptr inbounds %S2, ptr addrspace(10) @global2, i64 0, i32 0, i32 0, i64 %index, i32 1 +; CHECK: %[[#val:]] = OpLoad %[[#uint]] %[[#ptr]] Aligned 4 + %val = load i32, ptr addrspace(10) %ptr + ret i32 %val +} From 483edfeeb55420d509671406b889eadcce24114c Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Tue, 8 Apr 2025 13:05:24 +0200 Subject: [PATCH 0970/1029] [libc++] Use __add_pointer and __remove_pointer builtins when they are fixed (#134147) --- libcxx/include/__config | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index ce8bc38acfe3e..d1cbc278862b0 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1073,9 +1073,8 @@ typedef __char32_t char32_t; # define _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(_ClassName) static_assert(true, "") # endif -// TODO(varconst): currently, there are bugs in Clang's intrinsics when handling Objective-C++ `id`, so don't use -// compiler 
intrinsics in the Objective-C++ mode. -# ifdef __OBJC__ +// TODO(LLVM 22): Remove the workaround +# if defined(__OBJC__) && (!defined(_LIBCPP_CLANG_VER) || _LIBCPP_CLANG_VER < 2001) # define _LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS # endif From e06a9ca2cb596c19c23c32a248e1b3b69d7e4c4f Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 8 Apr 2025 12:09:10 +0100 Subject: [PATCH 0971/1029] [LLVM][CodeGen][SVE] Improve lowering of fixed length masked mem ops. (#134402) Converting fixed length masks, as used by MLOAD, to scalable vectors is done by comparing the mask to zero. When the mask is the result of a compare, we can instead promote the operands and regenerate the original compare. At worst this reduces the dependency chain and in most cases removes the need for multiple compares. --- .../Target/AArch64/AArch64ISelLowering.cpp | 35 ++++++++++++++++--- .../AArch64/sve-fixed-length-masked-gather.ll | 5 ++- .../AArch64/sve-fixed-length-masked-loads.ll | 30 +++++++--------- .../sve-fixed-length-masked-scatter.ll | 5 ++- 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0534d2d546325..3f42501828400 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -20190,6 +20190,12 @@ performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, EVT VecVT = Vec.getValueType(); EVT SubVT = SubVec.getValueType(); + // Promote fixed length vector zeros. + if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() && + Vec.isUndef() && isZerosVector(SubVec.getNode())) + return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT) + : DAG.getConstantFP(0, DL, VecVT); + // Only do this for legal fixed vector types. if (!VecVT.isFixedLengthVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) || @@ -28697,17 +28703,36 @@ static SDValue convertFixedMaskToScalableVector(SDValue Mask, SDLoc DL(Mask); EVT InVT = Mask.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); - - auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); + SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); if (ISD::isBuildVectorAllOnes(Mask.getNode())) return Pg; - auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask); - auto Op2 = DAG.getConstant(0, DL, ContainerVT); + bool InvertCond = false; + if (isBitwiseNot(Mask)) { + InvertCond = true; + Mask = Mask.getOperand(0); + } + + SDValue Op1, Op2; + ISD::CondCode CC; + + // When Mask is the result of a SETCC, it's better to regenerate the compare. + if (Mask.getOpcode() == ISD::SETCC) { + Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0)); + Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1)); + CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get(); + } else { + Op1 = convertToScalableVector(DAG, ContainerVT, Mask); + Op2 = DAG.getConstant(0, DL, ContainerVT); + CC = ISD::SETNE; + } + + if (InvertCond) + CC = getSetCCInverse(CC, Op1.getValueType()); return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(), - {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)}); + {Pg, Op1, Op2, DAG.getCondCode(CC)}); } // Convert all fixed length vector loads larger than NEON to masked_loads.
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index a50d0dc37eaf6..093e6cd9328c8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -460,10 +460,9 @@ define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d] ; CHECK-NEXT: str q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 6513b01d00922..34dc0bb5ef2d2 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -401,11 +401,10 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 -; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 @@ -436,11 +435,10 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 +; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 -; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: sshll v0.8h, v0.8b, #0 @@ -504,11 +502,10 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 -; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 @@ -603,11 +600,10 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 +; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 -; 
VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 @@ -638,11 +634,10 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.b, vl8 +; VBITS_GE_256-NEXT: ldr d0, [x1] ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 -; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ushll v0.8h, v0.8b, #0 @@ -706,11 +701,10 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: ldr q0, [x1] ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 -; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index a42fce70f4f15..ed03f9b322432 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -433,11 +433,10 @@ define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: cmeq v1.2d, v0.2d, #0 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <2 x i64>, ptr %a From e8dc8add3c04517e673d9dff342a60001c85dc1a Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Tue, 8 Apr 2025 12:09:52 +0100 Subject: [PATCH 0972/1029] [CMake] Fix using precompiled headers with ccache (#131397) Using precompiled headers with ccache requires special accommodations. Add the required ccache options, clang and gcc compiler flags to CMake. Refactor ccache configuration to pass options directly on the command line for versions of ccache that support it. 
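For illustration, a configuration that exercises the new ccache+PCH handling might look like the following sketch (the generator, cache directory, and size are placeholder values, not mandated by this patch):

  cmake -G Ninja -DLLVM_CCACHE_BUILD=ON \
        -DLLVM_CCACHE_DIR=/path/to/ccache-dir \
        -DLLVM_CCACHE_MAXSIZE=20G \
        ../llvm

With ccache 4.8.0 or newer, the sloppiness and PCH options are passed directly on the launcher command line; older versions fall back to the ad-hoc CCACHE_* environment variables.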
--------- Signed-off-by: Kajetan Puchalski --- flang/CMakeLists.txt | 9 +++++++++ llvm/CMakeLists.txt | 45 ++++++++++++++++++++++++++++---------------- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index 76eb13295eb07..a2f59214aaf8d 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -452,6 +452,10 @@ if (LLVM_COMPILER_IS_GCC_COMPATIBLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-semantic-interposition") endif() + # GCC requires this flag in order for precompiled headers to work with ccache + if (CMAKE_CXX_COMPILER_ID MATCHES GCC AND NOT CMAKE_DISABLE_PRECOMPILE_HEADERS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpch-preprocess") + endif() endif() # Clang on Darwin enables non-POSIX extensions by default, which allows the @@ -462,6 +466,11 @@ if (APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_POSIX_C_SOURCE=200809") endif() +# Clang requires this flag in order for precompiled headers to work with ccache +if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND NOT CMAKE_DISABLE_PRECOMPILE_HEADERS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fno-pch-timestamp") +endif() + list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS) # Determine HOST_LINK_VERSION on Darwin. diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index cfd1a086c0fc2..2efb96bcd4470 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -271,30 +271,43 @@ set(LLVM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build") if(LLVM_CCACHE_BUILD) find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) + # ccache --version example output: "ccache version 4.9.1\n(..)" + execute_process(COMMAND ${CCACHE_PROGRAM} --version OUTPUT_VARIABLE CCACHE_VERSION_STR) + string(REGEX MATCH "[0-9]+\.[0-9]+\.?[0-9]*" CCACHE_VERSION "${CCACHE_VERSION_STR}") + set(LLVM_CCACHE_MAXSIZE "" CACHE STRING "Size of ccache") set(LLVM_CCACHE_DIR "" CACHE STRING "Directory to keep ccached data") - set(LLVM_CCACHE_PARAMS "CCACHE_CPP2=yes CCACHE_HASHDIR=yes" - CACHE STRING "Parameters to pass through to ccache") - if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows") - set(CCACHE_PROGRAM "${LLVM_CCACHE_PARAMS} ${CCACHE_PROGRAM}") - if (LLVM_CCACHE_MAXSIZE) - set(CCACHE_PROGRAM "CCACHE_MAXSIZE=${LLVM_CCACHE_MAXSIZE} ${CCACHE_PROGRAM}") + # ccache only supports passing options on the command line from version 4.8.0 + # use a workaround with ad-hoc environment variables for older versions + if (CCACHE_VERSION VERSION_LESS "4.8.0") + set(LLVM_CCACHE_PARAMS "CCACHE_CPP2=yes;CCACHE_HASHDIR=yes;CCACHE_SLOPPINESS=pch_defines,time_macros" + CACHE STRING "Parameters to pass through to ccache") + + set(launcher_params ${LLVM_CCACHE_PARAMS}) + if (CCACHE_MAXSIZE) + set(launcher_params "CCACHE_MAXSIZE=${CCACHE_MAXSIZE};${launcher_params}") endif() - if (LLVM_CCACHE_DIR) - set(CCACHE_PROGRAM "CCACHE_DIR=${LLVM_CCACHE_DIR} ${CCACHE_PROGRAM}") + if (CCACHE_DIR) + set(launcher_params "CCACHE_DIR=${CCACHE_DIR};${launcher_params}") endif() - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM}) + set(launcher "${launcher_params};${CCACHE_PROGRAM}") else() - if(LLVM_CCACHE_MAXSIZE OR LLVM_CCACHE_DIR OR - NOT LLVM_CCACHE_PARAMS MATCHES "CCACHE_CPP2=yes CCACHE_HASHDIR=yes") - message(FATAL_ERROR "Ccache configuration through CMake is not supported on Windows. 
Please use environment variables.") + set(LLVM_CCACHE_PARAMS "run_second_cpp=true;hash_dir=true;sloppiness=pch_defines,time_macros" + CACHE STRING "Parameters to pass through to ccache") + + set(launcher_params ${LLVM_CCACHE_PARAMS}) + if (CCACHE_MAXSIZE) + set(launcher_params "max_size=${CCACHE_MAXSIZE};${launcher_params}") endif() - # RULE_LAUNCH_COMPILE should work with Ninja but currently has issues - # with cmd.exe and some MSVC tools other than cl.exe - set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) - set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + if (CCACHE_DIR) + set(launcher_params "cache_dir=${CCACHE_DIR};${launcher_params}") + endif() + set(launcher "${CCACHE_PROGRAM};${launcher_params}") endif() + + set(CMAKE_C_COMPILER_LAUNCHER ${launcher}) + set(CMAKE_CXX_COMPILER_LAUNCHER ${launcher}) else() message(FATAL_ERROR "Unable to find the program ccache. Set LLVM_CCACHE_BUILD to OFF") endif() From c2c1031e90adfd8f64c5cd1b5e3b8af663715caf Mon Sep 17 00:00:00 2001 From: Omair Javaid Date: Tue, 8 Apr 2025 16:16:26 +0500 Subject: [PATCH 0973/1029] [Flang][Windows] Fix test_errors.py by enforcing UTF-8 encoding (#134625) This patch fixes a UnicodeDecodeError on Windows in test_errors.py. This issue was observed on the flang-arm64-windows-msvc buildbot. Semantics/OpenMP/interop-construct.f90 was crashing due to Python defaulting to the cp1252 codec on Windows. I have fixed this by explicitly setting encoding="utf-8" when reading source files and invoking subprocess.run() in test_errors.py. flang-arm64-windows-msvc was running on the staging master, which resulted in this issue not being fixed earlier. https://lab.llvm.org/staging/#/builders/206 --- flang/test/Semantics/test_errors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/flang/test/Semantics/test_errors.py b/flang/test/Semantics/test_errors.py index 63ff3367edefd..45684764a00e4 100755 --- a/flang/test/Semantics/test_errors.py +++ b/flang/test/Semantics/test_errors.py @@ -17,7 +17,7 @@ cm.check_args(sys.argv) srcdir = cm.set_source(sys.argv[1]) -with open(srcdir, "r") as f: +with open(srcdir, "r", encoding="utf-8") as f: src = f.readlines() actual = "" expect = "" @@ -39,6 +39,7 @@ check=True, universal_newlines=True, cwd=tmpdir, + encoding="utf-8", ) except subprocess.CalledProcessError as e: log = e.stderr From 69c4e172d9838254f4e8ce4c78de9ecfbbabcae2 Mon Sep 17 00:00:00 2001 From: Michael Klemm Date: Tue, 8 Apr 2025 13:22:05 +0200 Subject: [PATCH 0974/1029] [Flang][OpenMP] Add semantic tests for threadprivate variables with host assoc (#134680) --- flang/test/Semantics/OpenMP/default-none.f90 | 15 +++++++++ .../test/Semantics/OpenMP/threadprivate09.f90 | 33 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 flang/test/Semantics/OpenMP/threadprivate09.f90 diff --git a/flang/test/Semantics/OpenMP/default-none.f90 b/flang/test/Semantics/OpenMP/default-none.f90 index 761c2385466a0..a56657ccee467 100644 --- a/flang/test/Semantics/OpenMP/default-none.f90 +++ b/flang/test/Semantics/OpenMP/default-none.f90 @@ -58,3 +58,18 @@ subroutine sub( aaa) ccc= aaa(ip) end do end subroutine sub + +! Test that threadprivate variables with host association +!
have a predetermined DSA +subroutine host_assoc() + integer, save :: i + !$omp threadprivate(i) + real, save :: r + !$omp threadprivate(r) +contains + subroutine internal() +!$omp parallel default(none) + print *, i, r +!$omp end parallel + end subroutine internal +end subroutine host_assoc diff --git a/flang/test/Semantics/OpenMP/threadprivate09.f90 b/flang/test/Semantics/OpenMP/threadprivate09.f90 new file mode 100644 index 0000000000000..64fe048fa3e60 --- /dev/null +++ b/flang/test/Semantics/OpenMP/threadprivate09.f90 @@ -0,0 +1,33 @@ +! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp +! OpenMP Version 5.1 +! Check OpenMP construct validity for the following directives: +! 2.21.2 Threadprivate Directive + +subroutine host_assoc_fail() + integer :: i + ! ERROR: A variable that appears in a THREADPRIVATE directive must be declared in the scope of a module or have the SAVE attribute, either explicitly or implicitly + !$omp threadprivate(i) + real :: r + ! ERROR: A variable that appears in a THREADPRIVATE directive must be declared in the scope of a module or have the SAVE attribute, either explicitly or implicitly + !$omp threadprivate(r) +contains + subroutine internal() +!$omp parallel + print *, i, r +!$omp end parallel + end subroutine internal +end subroutine host_assoc_fail + +! This sub-test is not supposed to emit a compiler error. +subroutine host_assoc() + integer, save :: i + !$omp threadprivate(i) + real, save :: r + !$omp threadprivate(r) +contains + subroutine internal() +!$omp parallel + print *, i, r +!$omp end parallel + end subroutine internal +end subroutine host_assoc From a51e2827845fa3dfc1ef34f325792b35227311b4 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 8 Apr 2025 12:52:38 +0100 Subject: [PATCH 0975/1029] [LV] Check if plan has an early exit via plan's exit blocks. (NFC) (#134720) Add a dedicated function to check if a plan is for a loop with an early exit. This can easily be determined by checking the exit blocks. This allows removing a use of Legal->hasUncountableEarlyExit() from InnerLoopVectorizer. PR: https://github.com/llvm/llvm-project/pull/134720 --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++--------- llvm/lib/Transforms/Vectorize/VPlan.h | 8 ++++++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 72dbef1cffc5f..824e51e6d6884 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7566,14 +7566,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, CM.CostKind); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); - // Set PlanForEarlyExitLoop to true if the BestPlan has been built from a - // loop with an uncountable early exit. The legacy cost model doesn't - // properly model costs for such loops. - bool PlanForEarlyExitLoop = - BestPlan.getVectorLoopRegion() && - BestPlan.getVectorLoopRegion()->getSingleSuccessor() != - BestPlan.getMiddleBlock(); - assert((BestFactor.Width == LegacyVF.Width || PlanForEarlyExitLoop || + // Verify that the VPlan-based and legacy cost models agree, except for VPlans + // with early exits and plans with additional VPlan simplifications. The + // legacy cost model doesn't properly model costs for such loops. 
+ assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), CostCtx, OrigLoop) || planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), @@ -7784,7 +7780,7 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.5 When vectorizing the epilogue, fix reduction resume values from the // additional bypass block. if (VectorizingEpilogue) { - assert(!ILV.Legal->hasUncountableEarlyExit() && + assert(!BestVPlan.hasEarlyExit() && "Epilogue vectorisation not yet supported with early exits"); BasicBlock *PH = OrigLoop->getLoopPreheader(); BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index a98d0ecb9a33b..da7aef73f9df3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3768,6 +3768,14 @@ class VPlan { /// successors of the block in VPlan. The returned block is owned by the VPlan /// and deleted once the VPlan is destroyed. VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB); + + /// Returns true if the VPlan is based on a loop with an early exit. That is + /// the case if the VPlan has either more than one exit block or a single exit + /// block with multiple predecessors (one for the exit via the latch and one + /// via the other early exit). + bool hasEarlyExit() const { + return ExitBlocks.size() > 1 || ExitBlocks[0]->getNumPredecessors() > 1; + } }; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) From 25e08c0b9cafaab09af35ce6a03317ffd503df08 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Tue, 8 Apr 2025 13:13:49 +0100 Subject: [PATCH 0976/1029] Revert "[CMake] Fix using precompiled headers with ccache" (#134848) Reverts llvm/llvm-project#131397 Reverting for now on account of build bot failures on certain platforms. --- flang/CMakeLists.txt | 9 --------- llvm/CMakeLists.txt | 45 ++++++++++++++++---------------------------- 2 files changed, 16 insertions(+), 38 deletions(-) diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt index a2f59214aaf8d..76eb13295eb07 100644 --- a/flang/CMakeLists.txt +++ b/flang/CMakeLists.txt @@ -452,10 +452,6 @@ if (LLVM_COMPILER_IS_GCC_COMPATIBLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fno-semantic-interposition") endif() - # GCC requires this flag in order for precompiled headers to work with ccache - if (CMAKE_CXX_COMPILER_ID MATCHES GCC AND NOT CMAKE_DISABLE_PRECOMPILE_HEADERS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fpch-preprocess") - endif() endif() # Clang on Darwin enables non-POSIX extensions by default, which allows the @@ -466,11 +462,6 @@ if (APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_POSIX_C_SOURCE=200809") endif() -# Clang requires this flag in order for precompiled headers to work with ccache -if (CMAKE_CXX_COMPILER_ID MATCHES Clang AND NOT CMAKE_DISABLE_PRECOMPILE_HEADERS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fno-pch-timestamp") -endif() - list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS) # Determine HOST_LINK_VERSION on Darwin. 
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 2efb96bcd4470..cfd1a086c0fc2 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -271,43 +271,30 @@ set(LLVM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build") if(LLVM_CCACHE_BUILD) find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) - # ccache --version example output: "ccache version 4.9.1\n(..)" - execute_process(COMMAND ${CCACHE_PROGRAM} --version OUTPUT_VARIABLE CCACHE_VERSION_STR) - string(REGEX MATCH "[0-9]+\.[0-9]+\.?[0-9]*" CCACHE_VERSION "${CCACHE_VERSION_STR}") - set(LLVM_CCACHE_MAXSIZE "" CACHE STRING "Size of ccache") set(LLVM_CCACHE_DIR "" CACHE STRING "Directory to keep ccached data") + set(LLVM_CCACHE_PARAMS "CCACHE_CPP2=yes CCACHE_HASHDIR=yes" + CACHE STRING "Parameters to pass through to ccache") - # ccache only supports passing options on the command line from version 4.8.0 - # use a workaround with ad-hoc environment variables for older versions - if (CCACHE_VERSION VERSION_LESS "4.8.0") - set(LLVM_CCACHE_PARAMS "CCACHE_CPP2=yes;CCACHE_HASHDIR=yes;CCACHE_SLOPPINESS=pch_defines,time_macros" - CACHE STRING "Parameters to pass through to ccache") - - set(launcher_params ${LLVM_CCACHE_PARAMS}) - if (CCACHE_MAXSIZE) - set(launcher_params "CCACHE_MAXSIZE=${CCACHE_MAXSIZE};${launcher_params}") + if(NOT CMAKE_SYSTEM_NAME MATCHES "Windows") + set(CCACHE_PROGRAM "${LLVM_CCACHE_PARAMS} ${CCACHE_PROGRAM}") + if (LLVM_CCACHE_MAXSIZE) + set(CCACHE_PROGRAM "CCACHE_MAXSIZE=${LLVM_CCACHE_MAXSIZE} ${CCACHE_PROGRAM}") endif() - if (CCACHE_DIR) - set(launcher_params "CCACHE_DIR=${CCACHE_DIR};${launcher_params}") + if (LLVM_CCACHE_DIR) + set(CCACHE_PROGRAM "CCACHE_DIR=${LLVM_CCACHE_DIR} ${CCACHE_PROGRAM}") endif() - set(launcher "${launcher_params};${CCACHE_PROGRAM}") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PROGRAM}) else() - set(LLVM_CCACHE_PARAMS "run_second_cpp=true;hash_dir=true;sloppiness=pch_defines,time_macros" - CACHE STRING "Parameters to pass through to ccache") - - set(launcher_params ${LLVM_CCACHE_PARAMS}) - if (CCACHE_MAXSIZE) - set(launcher_params "max_size=${CCACHE_MAXSIZE};${launcher_params}") + if(LLVM_CCACHE_MAXSIZE OR LLVM_CCACHE_DIR OR + NOT LLVM_CCACHE_PARAMS MATCHES "CCACHE_CPP2=yes CCACHE_HASHDIR=yes") + message(FATAL_ERROR "Ccache configuration through CMake is not supported on Windows. Please use environment variables.") endif() - if (CCACHE_DIR) - set(launcher_params "cache_dir=${CCACHE_DIR};${launcher_params}") - endif() - set(launcher "${CCACHE_PROGRAM};${launcher_params}") + # RULE_LAUNCH_COMPILE should work with Ninja but currently has issues + # with cmd.exe and some MSVC tools other than cl.exe + set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) + set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PROGRAM}) endif() - - set(CMAKE_C_COMPILER_LAUNCHER ${launcher}) - set(CMAKE_CXX_COMPILER_LAUNCHER ${launcher}) else() message(FATAL_ERROR "Unable to find the program ccache. Set LLVM_CCACHE_BUILD to OFF") endif() From db7fb704f6d5337399172331b4ee46e846171061 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 8 Apr 2025 12:14:54 +0000 Subject: [PATCH 0977/1029] [lldb][test] Explain why TestExprFromNonZeroFrame is disabled on Windows It's not scientific, but I think the PDB we produce on the Windows on Arm bot simply doesn't have the information needed. It could also be that clang is producing some DWARF but link.exe is dropping it from the final executable; the effect is the same.
--- .../expr-from-non-zero-frame/TestExprFromNonZeroFrame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py index bc3f0459bd649..4607308ef0911 100644 --- a/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py +++ b/lldb/test/API/commands/expression/expr-from-non-zero-frame/TestExprFromNonZeroFrame.py @@ -7,7 +7,7 @@ class ExprFromNonZeroFrame(TestBase): NO_DEBUG_INFO_TESTCASE = True - # Expression fails to evaluate on Windows, for unknown reasons. + # Requires DWARF debug information. @skipIfWindows def test(self): """ From 79cb6f05da37520949c006e26c5cef1826090d9d Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 8 Apr 2025 07:27:12 -0500 Subject: [PATCH 0978/1029] [Clang] Unify 'nvptx-arch' and 'amdgpu-arch' into 'offload-arch' (#134713) Summary: These two tools do the same thing, so we should unify them into a single tool. We create symlinks for backward compatibility and provide a way to get the old vendor-specific behavior with `--amdgpu-only` and `--nvptx-only`. --- clang/tools/CMakeLists.txt | 3 +- clang/tools/amdgpu-arch/AMDGPUArch.cpp | 56 ------------ clang/tools/amdgpu-arch/CMakeLists.txt | 13 --- clang/tools/nvptx-arch/CMakeLists.txt | 12 --- .../AMDGPUArchByHIP.cpp | 0 .../AMDGPUArchByKFD.cpp | 0 clang/tools/offload-arch/CMakeLists.txt | 8 ++ .../NVPTXArch.cpp | 26 +----- clang/tools/offload-arch/OffloadArch.cpp | 87 +++++++++++++++++++ 9 files changed, 97 insertions(+), 108 deletions(-) delete mode 100644 clang/tools/amdgpu-arch/AMDGPUArch.cpp delete mode 100644 clang/tools/amdgpu-arch/CMakeLists.txt delete mode 100644 clang/tools/nvptx-arch/CMakeLists.txt rename clang/tools/{amdgpu-arch => offload-arch}/AMDGPUArchByHIP.cpp (100%) rename clang/tools/{amdgpu-arch => offload-arch}/AMDGPUArchByKFD.cpp (100%) create mode 100644 clang/tools/offload-arch/CMakeLists.txt rename clang/tools/{nvptx-arch => offload-arch}/NVPTXArch.cpp (80%) create mode 100644 clang/tools/offload-arch/OffloadArch.cpp diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt index e3557c1328d53..9634eb12080c8 100644 --- a/clang/tools/CMakeLists.txt +++ b/clang/tools/CMakeLists.txt @@ -50,5 +50,4 @@ add_llvm_external_project(clang-tools-extra extra) # libclang may require clang-tidy in clang-tools-extra. add_clang_subdirectory(libclang) -add_clang_subdirectory(amdgpu-arch) -add_clang_subdirectory(nvptx-arch) +add_clang_subdirectory(offload-arch) diff --git a/clang/tools/amdgpu-arch/AMDGPUArch.cpp b/clang/tools/amdgpu-arch/AMDGPUArch.cpp deleted file mode 100644 index 86f3e31f47bbc..0000000000000 --- a/clang/tools/amdgpu-arch/AMDGPUArch.cpp +++ /dev/null @@ -1,56 +0,0 @@ -//===- AMDGPUArch.cpp - list AMDGPU installed ----------*- C++ -*---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a tool for detecting name of AMDGPU installed in system. -// This tool is used by AMDGPU OpenMP and HIP driver.
-// -//===----------------------------------------------------------------------===// - -#include "clang/Basic/Version.h" -#include "llvm/Support/CommandLine.h" - -using namespace llvm; - -static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden); - -// Mark all our options with this category. -static cl::OptionCategory AMDGPUArchCategory("amdgpu-arch options"); - -cl::opt Verbose("verbose", cl::desc("Enable verbose output"), - cl::init(false), cl::cat(AMDGPUArchCategory)); - -static void PrintVersion(raw_ostream &OS) { - OS << clang::getClangToolFullVersion("amdgpu-arch") << '\n'; -} - -int printGPUsByKFD(); -int printGPUsByHIP(); - -int main(int argc, char *argv[]) { - cl::HideUnrelatedOptions(AMDGPUArchCategory); - - cl::SetVersionPrinter(PrintVersion); - cl::ParseCommandLineOptions( - argc, argv, - "A tool to detect the presence of AMDGPU devices on the system. \n\n" - "The tool will output each detected GPU architecture separated by a\n" - "newline character. If multiple GPUs of the same architecture are found\n" - "a string will be printed for each\n"); - - if (Help) { - cl::PrintHelpMessage(); - return 0; - } - -#ifndef _WIN32 - if (!printGPUsByKFD()) - return 0; -#endif - - return printGPUsByHIP(); -} diff --git a/clang/tools/amdgpu-arch/CMakeLists.txt b/clang/tools/amdgpu-arch/CMakeLists.txt deleted file mode 100644 index c4c8de614565a..0000000000000 --- a/clang/tools/amdgpu-arch/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# //===----------------------------------------------------------------------===// -# // -# // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# // See https://llvm.org/LICENSE.txt for details. -# // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# // -# //===----------------------------------------------------------------------===// - -set(LLVM_LINK_COMPONENTS Support) - -add_clang_tool(amdgpu-arch AMDGPUArch.cpp AMDGPUArchByKFD.cpp AMDGPUArchByHIP.cpp) - -target_link_libraries(amdgpu-arch PRIVATE clangBasic) diff --git a/clang/tools/nvptx-arch/CMakeLists.txt b/clang/tools/nvptx-arch/CMakeLists.txt deleted file mode 100644 index 8f756be2c86d0..0000000000000 --- a/clang/tools/nvptx-arch/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -# //===--------------------------------------------------------------------===// -# // -# // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# // See https://llvm.org/LICENSE.txt for details. 
-# // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# // -# //===--------------------------------------------------------------------===// - -set(LLVM_LINK_COMPONENTS Support) -add_clang_tool(nvptx-arch NVPTXArch.cpp) - -target_link_libraries(nvptx-arch PRIVATE clangBasic) diff --git a/clang/tools/amdgpu-arch/AMDGPUArchByHIP.cpp b/clang/tools/offload-arch/AMDGPUArchByHIP.cpp similarity index 100% rename from clang/tools/amdgpu-arch/AMDGPUArchByHIP.cpp rename to clang/tools/offload-arch/AMDGPUArchByHIP.cpp diff --git a/clang/tools/amdgpu-arch/AMDGPUArchByKFD.cpp b/clang/tools/offload-arch/AMDGPUArchByKFD.cpp similarity index 100% rename from clang/tools/amdgpu-arch/AMDGPUArchByKFD.cpp rename to clang/tools/offload-arch/AMDGPUArchByKFD.cpp diff --git a/clang/tools/offload-arch/CMakeLists.txt b/clang/tools/offload-arch/CMakeLists.txt new file mode 100644 index 0000000000000..cb50b9c1d6dde --- /dev/null +++ b/clang/tools/offload-arch/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_LINK_COMPONENTS Support) + +add_clang_tool(offload-arch OffloadArch.cpp NVPTXArch.cpp AMDGPUArchByKFD.cpp AMDGPUArchByHIP.cpp) + +add_clang_symlink(amdgpu-arch offload-arch) +add_clang_symlink(nvptx-arch offload-arch) + +target_link_libraries(offload-arch PRIVATE clangBasic) diff --git a/clang/tools/nvptx-arch/NVPTXArch.cpp b/clang/tools/offload-arch/NVPTXArch.cpp similarity index 80% rename from clang/tools/nvptx-arch/NVPTXArch.cpp rename to clang/tools/offload-arch/NVPTXArch.cpp index 71a48657576e4..c7b7fcdf80500 100644 --- a/clang/tools/nvptx-arch/NVPTXArch.cpp +++ b/clang/tools/offload-arch/NVPTXArch.cpp @@ -21,15 +21,6 @@ using namespace llvm; -static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden); - -static void PrintVersion(raw_ostream &OS) { - OS << clang::getClangToolFullVersion("nvptx-arch") << '\n'; -} -// Mark all our options with this category, everything else (except for -version -// and -help) will be hidden. -static cl::OptionCategory NVPTXArchCategory("nvptx-arch options"); - typedef enum cudaError_enum { CUDA_SUCCESS = 0, CUDA_ERROR_NO_DEVICE = 100, @@ -84,22 +75,7 @@ static int handleError(CUresult Err) { return 1; } -int main(int argc, char *argv[]) { - cl::HideUnrelatedOptions(NVPTXArchCategory); - - cl::SetVersionPrinter(PrintVersion); - cl::ParseCommandLineOptions( - argc, argv, - "A tool to detect the presence of NVIDIA devices on the system. \n\n" - "The tool will output each detected GPU architecture separated by a\n" - "newline character. If multiple GPUs of the same architecture are found\n" - "a string will be printed for each\n"); - - if (Help) { - cl::PrintHelpMessage(); - return 0; - } - +int printGPUsByCUDA() { // Attempt to load the NVPTX driver runtime. if (llvm::Error Err = loadCUDA()) { logAllUnhandledErrors(std::move(Err), llvm::errs()); diff --git a/clang/tools/offload-arch/OffloadArch.cpp b/clang/tools/offload-arch/OffloadArch.cpp new file mode 100644 index 0000000000000..74be40214a0ec --- /dev/null +++ b/clang/tools/offload-arch/OffloadArch.cpp @@ -0,0 +1,87 @@ +//===- OffloadArch.cpp - list available GPUs ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/Version.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Path.h" + +using namespace llvm; + +static cl::opt Help("h", cl::desc("Alias for -help"), cl::Hidden); + +// Mark all our options with this category. +static cl::OptionCategory OffloadArchCategory("offload-arch options"); + +enum VendorName { + all, + amdgpu, + nvptx, +}; + +static cl::opt + Only("only", cl::desc("Restrict to vendor:"), cl::cat(OffloadArchCategory), + cl::init(all), + cl::values(clEnumVal(all, "Print all GPUs (default)"), + clEnumVal(amdgpu, "Only print AMD GPUs"), + clEnumVal(nvptx, "Only print NVIDIA GPUs"))); + +cl::opt Verbose("verbose", cl::desc("Enable verbose output"), + cl::init(false), cl::cat(OffloadArchCategory)); + +static void PrintVersion(raw_ostream &OS) { + OS << clang::getClangToolFullVersion("offload-arch") << '\n'; +} + +int printGPUsByKFD(); +int printGPUsByHIP(); +int printGPUsByCUDA(); + +static int printAMD() { +#ifndef _WIN32 + if (!printGPUsByKFD()) + return 0; +#endif + + return printGPUsByHIP(); +} + +static int printNVIDIA() { return printGPUsByCUDA(); } + +int main(int argc, char *argv[]) { + cl::HideUnrelatedOptions(OffloadArchCategory); + + cl::SetVersionPrinter(PrintVersion); + cl::ParseCommandLineOptions( + argc, argv, + "A tool to detect the presence of offloading devices on the system. \n\n" + "The tool will output each detected GPU architecture separated by a\n" + "newline character. If multiple GPUs of the same architecture are found\n" + "a string will be printed for each\n"); + + if (Help) { + cl::PrintHelpMessage(); + return 0; + } + + // If this was invoked from the legacy symlinks provide the same behavior. + bool AMDGPUOnly = Only == VendorName::amdgpu || + sys::path::stem(argv[0]).starts_with("amdgpu-arch"); + bool NVIDIAOnly = Only == VendorName::nvptx || + sys::path::stem(argv[0]).starts_with("nvptx-arch"); + + int NVIDIAResult = 0; + if (!AMDGPUOnly) + NVIDIAResult = printNVIDIA(); + + int AMDResult = 0; + if (!NVIDIAOnly) + AMDResult = printAMD(); + + // We only failed if all cases returned an error. 
+  return AMDResult && NVIDIAResult;
+}
From fcaefc2c19ebe037df7115f02abf23f94c07e8cc Mon Sep 17 00:00:00 2001
From: Akshat Oke
Date: Tue, 8 Apr 2025 17:58:48 +0530
Subject: [PATCH 0979/1029] [AMDGPU][NPM] Port SIPreEmitPeephole to NPM (#130065)

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  9 +++++-
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  2 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  7 ++--
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  | 32 ++++++++++++++-----
 .../AMDGPU/insert-handle-flat-vmem-ds.mir     |  1 +
 ...ort-exec-branches-special-instructions.mir |  1 +
 .../CodeGen/AMDGPU/set-gpr-idx-peephole.mir   |  1 +
 7 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 278f10a670070..03cd45d7de6f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -210,7 +210,7 @@ extern char &SIWholeQuadModeID;
 void initializeSILowerControlFlowLegacyPass(PassRegistry &);
 extern char &SILowerControlFlowLegacyID;
 
-void initializeSIPreEmitPeepholePass(PassRegistry &);
+void initializeSIPreEmitPeepholeLegacyPass(PassRegistry &);
 extern char &SIPreEmitPeepholeID;
 
 void initializeSILateBranchLoweringLegacyPass(PassRegistry &);
 extern char &SILateBranchLoweringLegacyID;
@@ -399,6 +399,13 @@ class SILateBranchLoweringPass
   static bool isRequired() { return true; }
 };
 
+class SIPreEmitPeepholePass : public PassInfoMixin<SIPreEmitPeepholePass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+  static bool isRequired() { return true; }
+};
+
 class AMDGPUSetWavePriorityPass
     : public PassInfoMixin<AMDGPUSetWavePriorityPass> {
 public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index bebb69d765654..538b1b181f643 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -127,6 +127,7 @@ MACHINE_FUNCTION_PASS("si-optimize-exec-masking-pre-ra", SIOptimizeExecMaskingPr
 MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass())
 MACHINE_FUNCTION_PASS("si-post-ra-bundler", SIPostRABundlerPass())
 MACHINE_FUNCTION_PASS("si-pre-allocate-wwm-regs", SIPreAllocateWWMRegsPass())
+MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
 MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass())
 MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass())
 #undef MACHINE_FUNCTION_PASS
@@ -135,7 +136,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass())
 
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
 DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass())
-DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass())
 
 // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it
 // already exists.
DUMMY_MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f9029d3e496f8..c2bcd53644371 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -542,7 +542,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); - initializeSIPreEmitPeepholePass(*PR); + initializeSIPreEmitPeepholeLegacyPass(*PR); initializeSILateBranchLoweringLegacyPass(*PR); initializeSIMemoryLegalizerLegacyPass(*PR); initializeSIOptimizeExecMaskingLegacyPass(*PR); @@ -2173,9 +2173,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) addPass(AMDGPUSetWavePriorityPass()); - if (TM.getOptLevel() > CodeGenOptLevel::None) { - // TODO: addPass(SIPreEmitPeepholePass()); - } + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIPreEmitPeepholePass()); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 2bb70c138a50c..2c2ceedf8a2f6 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -24,7 +24,7 @@ using namespace llvm; namespace { -class SIPreEmitPeephole : public MachineFunctionPass { +class SIPreEmitPeephole { private: const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; @@ -40,24 +40,31 @@ class SIPreEmitPeephole : public MachineFunctionPass { const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); +public: + bool run(MachineFunction &MF); +}; + +class SIPreEmitPeepholeLegacy : public MachineFunctionPass { public: static char ID; - SIPreEmitPeephole() : MachineFunctionPass(ID) { - initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry()); + SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) { + initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry()); } - bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override { + return SIPreEmitPeephole().run(MF); + } }; } // End anonymous namespace. 
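+// INITIALIZE_PASS below registers only the legacy pass manager wrapper; the
+// new pass manager pass is registered through AMDGPUPassRegistry.def instead.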
-INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
+INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                 "SI peephole optimizations", false, false)
 
-char SIPreEmitPeephole::ID = 0;
+char SIPreEmitPeepholeLegacy::ID = 0;
 
-char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
+char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;
 
 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
   // Match:
@@ -410,7 +417,16 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
   return true;
 }
 
-bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
+                                 MachineFunctionAnalysisManager &MFAM) {
+  if (!SIPreEmitPeephole().run(MF))
+    return PreservedAnalyses::all();
+
+  return getMachineFunctionPassPreservedAnalyses();
+}
+
+bool SIPreEmitPeephole::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
diff --git a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir
index d89f306c96a36..785f5bed97904 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -passes si-pre-emit-peephole %s -o - | FileCheck %s
 
 ---
 
diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
index 20de119471ba3..2c8739a87626e 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=si-pre-emit-peephole %s -o - | FileCheck %s
 # Make sure mandatory skips are not removed around mode defs.
 ---
diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
index 796a70cfe8a39..002d43f937837 100644
--- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
+++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
 
 ---
 name: simple
From 728320f9468ae8f6318f8f36ce675705d5805149 Mon Sep 17 00:00:00 2001
From: TatWai Chong
Date: Tue, 8 Apr 2025 05:33:16 -0700
Subject: [PATCH 0980/1029] [mlir][tosa] Increase test coverage for profile-based validation (#134754)

Add more tests to increase coverage of the profile-based validation pass.
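
Each new case sits in its own "// -----" split and pairs an op with a
type that is only legal when the corresponding profile or extension is
enabled, so -verify-diagnostics can match the error emitted by the
validation pass against the expected-error annotation. As a sketch of
how one of these files is exercised (flags taken from its own RUN line;
the path is assumed relative to mlir/test/Dialect/Tosa):

  mlir-opt invalid_extension.mlir -split-input-file -verify-diagnostics \
    -tosa-validate="profile=pro_int,pro_fp strict-op-spec-alignment"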
--- mlir/test/Dialect/Tosa/invalid_extension.mlir | 300 +++++++++++++++++- .../Dialect/Tosa/profile_all_unsupported.mlir | 117 +++++++ .../Tosa/profile_pro_fp_unsupported.mlir | 279 +++++++++++++++- .../Tosa/profile_pro_int_unsupported.mlir | 275 +++++++++++++++- 4 files changed, 956 insertions(+), 15 deletions(-) diff --git a/mlir/test/Dialect/Tosa/invalid_extension.mlir b/mlir/test/Dialect/Tosa/invalid_extension.mlir index dd3d114218309..241e603e91c61 100644 --- a/mlir/test/Dialect/Tosa/invalid_extension.mlir +++ b/mlir/test/Dialect/Tosa/invalid_extension.mlir @@ -4,6 +4,282 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_int,pro_fp strict-op-spec-alignment" +// ----- +func.func @test_argmax(%arg0: tensor<14x19xbf16>) -> tensor<14xi32> { + // expected-error@+1 {{'tosa.argmax' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.argmax %arg0 {axis = 1 : i32} : (tensor<14x19xbf16>) -> tensor<14xi32> + return %0 : tensor<14xi32> +} + +// ----- +func.func @test_avg_pool2d(%arg0: tensor<1x7x7x9xbf16>, %arg1: tensor<1xbf16>, %arg2: tensor<1xbf16>) -> tensor<1x7x7x9xbf16> { + // expected-error@+1 {{'tosa.avg_pool2d' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.avg_pool2d %arg0, %arg1, %arg2 {acc_type = f32, kernel = array, pad = array, stride = array} : (tensor<1x7x7x9xbf16>, tensor<1xbf16>, tensor<1xbf16>) -> tensor<1x7x7x9xbf16> + return %0 : tensor<1x7x7x9xbf16> +} + +// ----- +func.func @test_conv2d(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<8x1x1x4xi4>, %arg2: tensor<8xi32>, %arg3: tensor<1xi8>, %arg4: tensor<1xi4>) -> tensor<1x4x4x8xi32> { + // expected-error@+1 {{'tosa.conv2d' op illegal: requires [int4] but not enabled in target}} + %0 = tosa.conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i32, dilation = array, pad = array, stride = array, local_bound = true} : (tensor<1x4x4x4xi8>, tensor<8x1x1x4xi4>, tensor<8xi32>, tensor<1xi8>, tensor<1xi4>) -> tensor<1x4x4x8xi32> + return %0 : tensor<1x4x4x8xi32> +} + +// ----- +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xi16>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<34xi48>, %arg3: tensor<1xi16>, %arg4: tensor<1xi8>) -> tensor<1x4x8x21x34xi48> { + // expected-error@+1 {{'tosa.conv3d' op illegal: requires [int16] but not enabled in target}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i48, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi16>, tensor<34x1x1x1x17xi8>, tensor<34xi48>, tensor<1xi16>, tensor<1xi8>) -> tensor<1x4x8x21x34xi48> + return %0 : tensor<1x4x8x21x34xi48> +} + +// ----- +func.func @test_depthwise_conv2d(%arg0: tensor<1x4x4x4xbf16>, %arg1: tensor<1x1x4x2xbf16>, %arg2: tensor<8xbf16>, %arg3: tensor<1xbf16>, %arg4: tensor<1xbf16>) -> tensor<1x4x4x8xbf16> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x4x4xbf16>, tensor<1x1x4x2xbf16>, tensor<8xbf16>, tensor<1xbf16>, tensor<1xbf16>) -> tensor<1x4x4x8xbf16> + return %0 : tensor<1x4x4x8xbf16> +} + +// ----- +func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xi16>) -> tensor<1x32x32x8xi16> { + // expected-error@+1 {{'tosa.max_pool2d' op illegal: requires [int16] but not enabled in target}} + %0 = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = array} : (tensor<1x32x32x8xi16>) -> tensor<1x32x32x8xi16> + return %0 : tensor<1x32x32x8xi16> +} + +// 
----- +func.func @test_clamp(%arg0: tensor<13x21x3xi16>) -> tensor<13x21x3xi16> { + // expected-error@+1 {{'tosa.clamp' op illegal: requires [int16] but not enabled in target}} + %0 = tosa.clamp %arg0 {min_val = 0 : i16, max_val = 1 : i16} : (tensor<13x21x3xi16>) -> tensor<13x21x3xi16> + return %0 : tensor<13x21x3xi16> +} + +// ----- +func.func @test_sigmoid(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.sigmoid' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.sigmoid %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_tanh(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.tanh' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.tanh %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_add(%arg0: tensor<13x21x1xbf16>, %arg1: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.add' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.add %arg0, %arg1 : (tensor<13x21x1xbf16>, tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_max(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<13x21x1xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.maximum' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.maximum %arg0, %arg1 : (tensor<13x21x3xbf16>, tensor<13x21x1xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_mul(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<13x1x3xbf16>, %shift: tensor<1xi8>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.mul' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xbf16>, tensor<13x1x3xbf16>, tensor<1xi8>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_pow(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<13x21x1xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.pow' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.pow %arg0, %arg1 : (tensor<13x21x3xbf16>, tensor<13x21x1xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_sub(%arg0: tensor<1x21x3xbf16>, %arg1: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.sub' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.sub %arg0, %arg1 : (tensor<1x21x3xbf16>, tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_table(%arg0 : tensor<4x5xi16>, %arg1 : tensor<513xi16>) -> () { + // expected-error@+1 {{'tosa.table' op illegal: requires [int16] but not enabled in target}} + %0 = tosa.table %arg0, %arg1 : (tensor<4x5xi16>, tensor<513xi16>) -> tensor + return +} + +// ----- +func.func @test_abs(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.abs' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.abs %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_cos(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.cos' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.cos %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> 
+} + +// ----- +func.func @test_exp(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.exp' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.exp %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_floor(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.floor' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.floor %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_log(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.log' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.log %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_negate(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<1xbf16>, %arg2: tensor<1xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.negate' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.negate %arg0, %arg1, %arg2 : (tensor<13x21x3xbf16>, tensor<1xbf16>, tensor<1xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_reciprocal(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.reciprocal' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.reciprocal %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_rsqrt(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.rsqrt' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.rsqrt %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_equal(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<13x1x3xbf16>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.equal' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.equal %arg0, %arg1 : (tensor<13x21x3xbf16>, tensor<13x1x3xbf16>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_reduce_max(%arg0: tensor<13x21x3xbf16>) -> tensor<1x21x3xbf16> { + // expected-error@+1 {{'tosa.reduce_max' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<13x21x3xbf16>) -> tensor<1x21x3xbf16> + return %0 : tensor<1x21x3xbf16> +} + +// ----- +func.func @test_concat(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<13x21x3xbf16>) -> tensor<26x21x3xbf16> { + // expected-error@+1 {{'tosa.concat' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.concat %arg0, %arg1 {axis = 0 : i32} : (tensor<13x21x3xbf16>, tensor<13x21x3xbf16>) -> tensor<26x21x3xbf16> + return %0 : tensor<26x21x3xbf16> +} + +// ----- +func.func @test_pad(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + %padding = tosa.const_shape {values = dense<0> : tensor<6xindex>} : () -> !tosa.shape<6> + // expected-error@+1 {{'tosa.const' op illegal: requires [bf16] but not enabled in target}} + %pad_const = "tosa.const"() {values = dense<3.14> : tensor<1xbf16>} : () -> tensor<1xbf16> + // expected-error@+1 {{'tosa.pad' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.pad %arg0, %padding, %pad_const : (tensor<13x21x3xbf16>, !tosa.shape<6>, tensor<1xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- 
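+// NOTE: illustrative sketch, not part of the original change; it assumes
+// tosa.sin reports the same extension diagnostic as the surrounding cases.
+func.func @test_sin(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> {
+  // expected-error@+1 {{'tosa.sin' op illegal: requires [bf16] but not enabled in target}}
+  %0 = tosa.sin %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16>
+  return %0 : tensor<13x21x3xbf16>
+}
+
+// -----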
+func.func @test_reshape(%arg0: tensor<13x21x3xbf16>) -> tensor<1x819xbf16> { + %1 = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} : () -> !tosa.shape<2> + // expected-error@+1 {{'tosa.reshape' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.reshape %arg0, %1 : (tensor<13x21x3xbf16>, !tosa.shape<2>) -> tensor<1x819xbf16> + return %0 : tensor<1x819xbf16> +} + +// ----- +func.func @test_reverse(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.reverse' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.reverse %arg0 {axis = 0 : i32} : (tensor<13x21x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_slice(%arg0: tensor<13x21x3xbf16>) -> tensor<4x11x1xbf16> { + %0 = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> + %1 = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.slice' op illegal: requires [bf16] but not enabled in target}} + %2 = tosa.slice %arg0, %0, %1 : (tensor<13x21x3xbf16>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xbf16> + return %2 : tensor<4x11x1xbf16> +} + +// ----- +func.func @test_tile(%arg0: tensor<13x21x3xbf16>) -> tensor<39x21x6xbf16> { + %cst = tosa.const_shape { values = dense<[3, 1, 2]> : tensor<3xindex> } : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.tile' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.tile %arg0, %cst: (tensor<13x21x3xbf16>, !tosa.shape<3>) -> tensor<39x21x6xbf16> + return %0 : tensor<39x21x6xbf16> +} + +// ----- +func.func @test_transpose(%arg0: tensor<13x21x3xbf16>) -> tensor<3x13x21xbf16> { + // expected-error@+1 {{'tosa.transpose' op illegal: requires [bf16] but not enabled in target}} + %1 = tosa.transpose %arg0 {perms = array} : (tensor<13x21x3xbf16>) -> tensor<3x13x21xbf16> + return %1 : tensor<3x13x21xbf16> +} + +// ----- +func.func @test_gather(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xbf16> { + // expected-error@+1 {{'tosa.gather' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xbf16>, tensor<13x26xi32>) -> tensor<13x26x3xbf16> + return %0 : tensor<13x26x3xbf16> +} + +// ----- +func.func @test_scatter(%arg0: tensor<13x21x3xbf16>, %arg1: tensor<13x26xi32>, %arg2: tensor<13x26x3xbf16>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.scatter' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x21x3xbf16>, tensor<13x26xi32>, tensor<13x26x3xbf16>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_resize(%arg0: tensor<1x32x32x8xbf16>) -> tensor<1x64x64x8xbf16> { + %scale = tosa.const_shape { values = dense<[4, 2, 4, 2]> : tensor<4xindex> } : () -> !tosa.shape<4> + %offset = tosa.const_shape { values = dense<[-1, -1]> : tensor<2xindex> } : () -> !tosa.shape<2> + %border = tosa.const_shape { values = dense<[1, 1]> : tensor<2xindex> } : () -> !tosa.shape<2> + // expected-error@+1 {{'tosa.resize' op illegal: requires [bf16] but not enabled in target}} + %1 = tosa.resize %arg0, %scale, %offset, %border { mode = "BILINEAR" } : (tensor<1x32x32x8xbf16>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<1x64x64x8xbf16> + return %1 : tensor<1x64x64x8xbf16> +} + +// ----- +func.func @test_cast_i8_bf16(%arg0: tensor<13x21x3xi8>) -> tensor<13x21x3xbf16> { + // expected-error@+1 
{{'tosa.cast' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.cast %arg0 : (tensor<13x21x3xi8>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + +// ----- +func.func @test_cast_bf16_i8(%arg0: tensor<13x21x3xbf16>) -> tensor<13x21x3xi8> { + // expected-error@+1 {{'tosa.cast' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.cast %arg0 : (tensor<13x21x3xbf16>) -> tensor<13x21x3xi8> + return %0 : tensor<13x21x3xi8> +} + +// ----- +func.func @test_cast_f32_bf16(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xbf16> { + // expected-error@+1 {{'tosa.cast' op illegal: requires [bf16] but not enabled in target}} + %0 = tosa.cast %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xbf16> + return %0 : tensor<13x21x3xbf16> +} + // ----- func.func @test_fft2d(%arg0: tensor<1x4x8xf32>, %arg1: tensor<1x4x8xf32>) -> (tensor<1x4x8xf32>, tensor<1x4x8xf32>) { // expected-error@+1 {{'tosa.fft2d' op illegal: requires [fft] but not enabled in target}} @@ -11,6 +287,28 @@ func.func @test_fft2d(%arg0: tensor<1x4x8xf32>, %arg1: tensor<1x4x8xf32>) -> (te return %0, %1 : tensor<1x4x8xf32>, tensor<1x4x8xf32> } +// ----- +func.func @test_const_i4() -> tensor<3x11x11x3xi4> { + // expected-error@+1 {{'tosa.const' op illegal: requires [int4] but not enabled in target}} + %0 = "tosa.const"() {values = dense<0> : tensor<3x11x11x3xi4>} : () -> tensor<3x11x11x3xi4> + return %0 : tensor<3x11x11x3xi4> +} + +// ----- +func.func @test_const_i48() -> tensor<3x11x11x3xi48> { + // expected-error@+1 {{'tosa.const' op illegal: requires [int16] but not enabled in target}} + %0 = "tosa.const"() {values = dense<0> : tensor<3x11x11x3xi48>} : () -> tensor<3x11x11x3xi48> + return %0 : tensor<3x11x11x3xi48> +} + +// ----- +// CHECK-LABEL: identity +func.func @test_identity(%arg0: tensor<13x21x3xi4>) -> tensor<13x21x3xi4> { + // expected-error@+1 {{'tosa.identity' op illegal: requires [int4] but not enabled in target}} + %0 = tosa.identity %arg0 : (tensor<13x21x3xi4>) -> tensor<13x21x3xi4> + return %0 : tensor<13x21x3xi4> +} + // ----- func.func @test_variable_read_type(%arg0: tensor<2x4x8xi8>) -> () { // expected-error@+1 {{'tosa.variable' op illegal: requires [variable] but not enabled in target}} @@ -79,7 +377,7 @@ func.func @test_single_round_rescale(%arg0: tensor<13x21x3xi8>) -> tensor<13x21x %input_zp = "tosa.const"() {values = dense<0> : tensor<1xi8>} : () -> tensor<1xi8> %output_zp = "tosa.const"() {values = dense<0> : tensor<1xi8>} : () -> tensor<1xi8> // CHECK tosa.rescale - %0 = tosa.rescale %arg0, %multiplier, %shift, %input_zp, %output_zp {rounding_mode = "SINGLE_ROUND", input_zp = 127 : i32, output_zp = -1 : i32, per_channel = false, scale32 = true, input_unsigned = false, output_unsigned = false} : (tensor<13x21x3xi8>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<13x21x3xi8> + %0 = tosa.rescale %arg0, %multiplier, %shift, %input_zp, %output_zp {rounding_mode = "SINGLE_ROUND", per_channel = false, scale32 = true, input_unsigned = false, output_unsigned = false} : (tensor<13x21x3xi8>, tensor<1xi32>, tensor<1xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<13x21x3xi8> return %0 : tensor<13x21x3xi8> } diff --git a/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir index e9cff00cbde37..99d86e7455264 100644 --- a/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir +++ b/mlir/test/Dialect/Tosa/profile_all_unsupported.mlir @@ -4,6 +4,62 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics 
-tosa-validate="extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround strict-op-spec-alignment" +// ----- +func.func @test_add_i32(%arg0: tensor<13x21x1xi32>, %arg1: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.add' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.add %arg0, %arg1 : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_int_div(%arg0: tensor<13x21x1xi32>, %arg1: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.int_div' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.int_div %arg0, %arg1 : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_logical_and(%arg0: tensor<13x21x3xi1>, %arg1: tensor<13x21x1xi1>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.logical_and' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.logical_and %arg0, %arg1 : (tensor<13x21x3xi1>, tensor<13x21x1xi1>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_logical_left_shift(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x21x1xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.logical_left_shift' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.logical_left_shift %arg0, %arg1 : (tensor<13x21x3xi32>, tensor<13x21x1xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_mul(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>, %shift: tensor<1xi8>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.mul' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xi32>, tensor<13x1x3xi32>, tensor<1xi8>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_sub(%arg0: tensor<1x21x3xi32>, %arg1: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.sub' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.sub %arg0, %arg1 : (tensor<1x21x3xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_logical_not(%arg0: tensor<1x21x3xi1>) -> tensor<1x21x3xi1> { + // expected-error@+1 {{'tosa.logical_not' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.logical_not %arg0 : (tensor<1x21x3xi1>) -> tensor<1x21x3xi1> + return %0 : tensor<1x21x3xi1> +} + +// ----- +func.func @test_select(%arg0: tensor<1x1x1xi1>, %arg1: tensor<13x21x3xi1>, %arg2: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.select' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.select %arg0, %arg1, %arg2 : (tensor<1x1x1xi1>, tensor<13x21x3xi1>, tensor<13x21x3xi1>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + // ----- func.func @test_table(%arg0 : tensor<4x5xi8>, %arg1 : tensor<513xi8>) -> () { // expected-error@+1 {{'tosa.table' op illegal: requires [pro_int] but not enabled in target}} @@ -122,6 +178,67 @@ func.func @test_reduce_any(%arg0: tensor<13x21x3xi1>) -> tensor<1x21x3xi1> { return %0 : tensor<1x21x3xi1> } +// ----- +func.func @test_concat(%arg0: tensor<13x21x3xi1>, %arg1: tensor<13x21x3xi1>) -> 
tensor<26x21x3xi1> { + // expected-error@+1 {{'tosa.concat' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.concat %arg0, %arg1 {axis = 0 : i32} : (tensor<13x21x3xi1>, tensor<13x21x3xi1>) -> tensor<26x21x3xi1> + return %0 : tensor<26x21x3xi1> +} + +// ----- +func.func @test_pad(%arg0: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.const_shape' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %padding = tosa.const_shape {values = dense<0> : tensor<6xindex>} : () -> !tosa.shape<6> + // expected-error@+1 {{'tosa.const' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %pad_const = "tosa.const"() {values = dense<1> : tensor<1xi1>} : () -> tensor<1xi1> + // expected-error@+1 {{'tosa.pad' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.pad %arg0, %padding, %pad_const : (tensor<13x21x3xi1>, !tosa.shape<6>, tensor<1xi1>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_reshape(%arg0: tensor<13x21x3xi1>) -> tensor<1x819xi1> { + // expected-error@+1 {{'tosa.const_shape' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %1 = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} : () -> !tosa.shape<2> + // expected-error@+1 {{'tosa.reshape' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.reshape %arg0, %1 : (tensor<13x21x3xi1>, !tosa.shape<2>) -> tensor<1x819xi1> + return %0 : tensor<1x819xi1> +} + +// ----- +func.func @test_reverse(%arg0: tensor<13x21x3xi1>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.reverse' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.reverse %arg0 {axis = 0 : i32} : (tensor<13x21x3xi1>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_slice(%arg0: tensor<13x21x3xi1>) -> tensor<4x11x1xi1> { + // expected-error@+1 {{'tosa.const_shape' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.const_shape' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %1 = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.slice' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %2 = tosa.slice %arg0, %0, %1 : (tensor<13x21x3xi1>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xi1> + return %2 : tensor<4x11x1xi1> +} + +// ----- +func.func @test_tile(%arg0: tensor<13x21x3xi1>) -> tensor<39x21x6xi1> { + // expected-error@+1 {{'tosa.const_shape' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %cst = tosa.const_shape { values = dense<[3, 1, 2]> : tensor<3xindex> } : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.tile' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %0 = tosa.tile %arg0, %cst: (tensor<13x21x3xi1>, !tosa.shape<3>) -> tensor<39x21x6xi1> + return %0 : tensor<39x21x6xi1> +} + +// ----- +func.func @test_transpose(%arg0: tensor<13x21x3xi1>) -> tensor<3x13x21xi1> { + // expected-error@+1 {{'tosa.transpose' op illegal: requires any of [pro_int, pro_fp] but not enabled in target}} + %1 = tosa.transpose %arg0 {perms = array} : (tensor<13x21x3xi1>) -> tensor<3x13x21xi1> + return %1 : tensor<3x13x21xi1> +} + // ----- func.func 
@test_concat(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<26x21x3xf32> { // expected-error@+1 {{'tosa.concat' op illegal: requires [pro_fp] but not enabled in target}} diff --git a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir index 54ddef67b0baa..72669c62c95ca 100644 --- a/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir +++ b/mlir/test/Dialect/Tosa/profile_pro_fp_unsupported.mlir @@ -2,13 +2,20 @@ // Enable all supported extensions to focus the verification of expected profile requirement errors. //-------------------------------------------------------------------------------------------------- -// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_int extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround strict-op-spec-alignment" +// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_int extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround strict-op-spec-alignment" // ----- -func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { - // expected-error@+1 {{'tosa.conv2d' op illegal: requires [pro_fp] but not enabled in target}} - %0 = tosa.conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<8x1x1x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> - return %0 : tensor<1x4x4x8xf32> +func.func @test_const_f16() -> tensor<3x11x11x3xf16> { + // expected-error@+1 {{'tosa.const' op illegal: requires [pro_fp] but not enabled in target}} + %0 = "tosa.const"() {values = dense<2.0> : tensor<3x11x11x3xf16>} : () -> tensor<3x11x11x3xf16> + return %0 : tensor<3x11x11x3xf16> +} + +// ----- +func.func @test_const_f32() -> tensor<3x11x11x3xf32> { + // expected-error@+1 {{'tosa.const' op illegal: requires [pro_fp] but not enabled in target}} + %0 = "tosa.const"() {values = dense<3.0> : tensor<3x11x11x3xf32>} : () -> tensor<3x11x11x3xf32> + return %0 : tensor<3x11x11x3xf32> } // ----- @@ -19,9 +26,34 @@ func.func @test_avg_pool2d(%arg0: tensor<1x7x7x9xf32>, %arg1: tensor<1xf32>, %ar } // ----- -func.func @test_matmul(%arg0: tensor<1x14x19xf32>, %arg1: tensor<1x19x28xf32>, %arg2: tensor<1xf32>) -> tensor<1x14x28xf32> { +func.func @test_conv2d(%arg0: tensor<1x4x4x4xf32>, %arg1: tensor<8x1x1x4xf32>, %arg2: tensor<8xf32>, %arg3: tensor<1xf32>, %arg4: tensor<1xf32>) -> tensor<1x4x4x8xf32> { + // expected-error@+1 {{'tosa.conv2d' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array, local_bound = true} : (tensor<1x4x4x4xf32>, tensor<8x1x1x4xf32>, tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x4x4x8xf32> + return %0 : tensor<1x4x4x8xf32> +} + +// ----- +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xf16>, %arg1: tensor<34x1x1x1x17xf16>, %arg2: tensor<34xf16>, %arg3: tensor<1xf16>, %arg4: tensor<1xf16>) -> tensor<1x4x8x21x34xf16> { + // expected-error@+1 {{'tosa.conv3d' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xf16>, tensor<34x1x1x1x17xf16>, tensor<34xf16>, 
tensor<1xf16>, tensor<1xf16>) -> tensor<1x4x8x21x34xf16> + return %0 : tensor<1x4x8x21x34xf16> +} + +// ----- +func.func @test_depthwise_conv2d(%arg0: tensor<1x4x4x4xf16>, %arg1: tensor<1x1x4x2xf16>, %arg2: tensor<8xf16>, %arg3: tensor<1xf16>, %arg4: tensor<1xf16>) -> tensor<1x4x4x8xf16> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f16, dilation = array, pad = array, stride = array} : (tensor<1x4x4x4xf16>, tensor<1x1x4x2xf16>, tensor<8xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x4x4x8xf16> + return %0 : tensor<1x4x4x8xf16> +} + +// ----- +func.func @test_matmul(%arg0: tensor<1x14x19xf32>, %arg1: tensor<1x19x28xf32>) -> tensor<1x14x28xf32> { + // expected-error@+1 {{'tosa.const' op illegal: requires [pro_fp] but not enabled in target}} + %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> + // expected-error@+1 {{'tosa.const' op illegal: requires [pro_fp] but not enabled in target}} + %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32> // expected-error@+1 {{'tosa.matmul' op illegal: requires [pro_fp] but not enabled in target}} - %0 = tosa.matmul %arg0, %arg1, %arg2, %arg2 : (tensor<1x14x19xf32>, tensor<1x19x28xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x14x28xf32> + %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf32>, tensor<1x19x28xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x14x28xf32> return %0 : tensor<1x14x28xf32> } @@ -39,6 +71,27 @@ func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x return %0 : tensor<1x32x32x16xf32> } +// ----- +func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> { + // expected-error@+1 {{'tosa.max_pool2d' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = array} : (tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> + return %0 : tensor<1x32x32x8xf32> +} + +// ----- +func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf16>, %arg1: tensor<16x1x1x8xf16>, %arg2: tensor<16xf16>, %arg3: tensor<1xf16>, %arg4: tensor<1xf16>) -> tensor<1x32x32x16xf16> { + // expected-error@+1 {{'tosa.transpose_conv2d' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = f16, out_pad = array, stride = array} : (tensor<1x32x32x8xf16>, tensor<16x1x1x8xf16>, tensor<16xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<1x32x32x16xf16> + return %0 : tensor<1x32x32x16xf16> +} + +// ----- +func.func @test_clamp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.clamp' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.clamp %arg0 {min_val = 0.0 : f32, max_val = 1.0 : f32} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + // ----- func.func @test_add(%arg0: tensor<13x21x1xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { // expected-error@+1 {{'tosa.add' op illegal: requires [pro_fp] but not enabled in target}} @@ -60,3 +113,215 @@ func.func @test_cast_i32_f32(%arg0: tensor<13x21x3xi32>) -> tensor<13x21x3xf32> return %0 : tensor<13x21x3xf32> } +// ----- +func.func @test_max(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x1xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.maximum' op illegal: requires [pro_fp] but not enabled in target}} + %0 = 
tosa.maximum %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x1xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_mul(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>, %shift: tensor<1xi8>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.mul' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xf32>, tensor<13x1x3xf32>, tensor<1xi8>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_pow(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x1xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.pow' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.pow %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x21x1xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_sub(%arg0: tensor<1x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.sub' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.sub %arg0, %arg1 : (tensor<1x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_abs(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.abs' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.abs %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_ceil(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.ceil' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.ceil %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_cos(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.cos' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.cos %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_exp(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.exp' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.exp %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_floor(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.floor' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.floor %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_log(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.log' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.log %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_negate(%arg0: tensor<13x21x3xf32>, %arg1: tensor<1xf32>, %arg2: tensor<1xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.negate' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.negate %arg0, %arg1, %arg2 : (tensor<13x21x3xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_reciprocal(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.reciprocal' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.reciprocal %arg0 : (tensor<13x21x3xf32>) -> 
tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_rsqrt(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.rsqrt' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.rsqrt %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_select(%arg0: tensor<1x1x1xi1>, %arg1: tensor<13x21x3xf32>, %arg2: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.select' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.select %arg0, %arg1, %arg2 : (tensor<1x1x1xi1>, tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_sin(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.sin' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.sin %arg0 : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_equal(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.equal' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.equal %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_greater(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x1x3xf32>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.greater' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.greater %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x1x3xf32>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_reduce_max(%arg0: tensor<13x21x3xf16>) -> tensor<1x21x3xf16> { + // expected-error@+1 {{'tosa.reduce_max' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<13x21x3xf16>) -> tensor<1x21x3xf16> + return %0 : tensor<1x21x3xf16> +} + +// ----- +func.func @test_reduce_sum(%arg0: tensor<13x21x3xf32>) -> tensor<1x21x3xf32> { + // expected-error@+1 {{'tosa.reduce_sum' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.reduce_sum %arg0 {axis = 0 : i32} : (tensor<13x21x3xf32>) -> tensor<1x21x3xf32> + return %0 : tensor<1x21x3xf32> +} + +// ----- +func.func @test_concat(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x21x3xf32>) -> tensor<26x21x3xf32> { + // expected-error@+1 {{'tosa.concat' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.concat %arg0, %arg1 {axis = 0 : i32} : (tensor<13x21x3xf32>, tensor<13x21x3xf32>) -> tensor<26x21x3xf32> + return %0 : tensor<26x21x3xf32> +} + +// ----- +func.func @test_pad(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + %padding = tosa.const_shape {values = dense<0> : tensor<6xindex>} : () -> !tosa.shape<6> + // expected-error@+1 {{'tosa.const' op illegal: requires [pro_fp] but not enabled in target}} + %pad_const = "tosa.const"() {values = dense<3.14> : tensor<1xf32>} : () -> tensor<1xf32> + // expected-error@+1 {{'tosa.pad' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.pad %arg0, %padding, %pad_const : (tensor<13x21x3xf32>, !tosa.shape<6>, tensor<1xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_reshape(%arg0: tensor<13x21x3xf32>) -> tensor<1x819xf32> { + %1 = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} : () -> !tosa.shape<2> + // 
expected-error@+1 {{'tosa.reshape' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.reshape %arg0, %1 : (tensor<13x21x3xf32>, !tosa.shape<2>) -> tensor<1x819xf32> + return %0 : tensor<1x819xf32> +} + +// ----- +func.func @test_reverse(%arg0: tensor<13x21x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.reverse' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.reverse %arg0 {axis = 0 : i32} : (tensor<13x21x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<4x11x1xf32> { + %0 = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> + %1 = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.slice' op illegal: requires [pro_fp] but not enabled in target}} + %2 = tosa.slice %arg0, %0, %1 : (tensor<13x21x3xf32>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xf32> + return %2 : tensor<4x11x1xf32> +} + +// ----- +func.func @test_tile(%arg0: tensor<13x21x3xf32>) -> tensor<39x21x6xf32> { + %cst = tosa.const_shape { values = dense<[3, 1, 2]> : tensor<3xindex> } : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.tile' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.tile %arg0, %cst: (tensor<13x21x3xf32>, !tosa.shape<3>) -> tensor<39x21x6xf32> + return %0 : tensor<39x21x6xf32> +} + +// ----- +func.func @test_transpose(%arg0: tensor<13x21x3xf32>, %arg1: tensor<3xi32>) -> tensor<3x13x21xf32> { + // expected-error@+1 {{'tosa.transpose' op illegal: requires [pro_fp] but not enabled in target}} + %1 = tosa.transpose %arg0 {perms = array}: (tensor<13x21x3xf32>) -> tensor<3x13x21xf32> + return %1 : tensor<3x13x21xf32> +} + +// ----- +func.func @test_gather(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xf32> { + // expected-error@+1 {{'tosa.gather' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xf32>, tensor<13x26xi32>) -> tensor<13x26x3xf32> + return %0 : tensor<13x26x3xf32> +} + +// ----- +func.func @test_scatter(%arg0: tensor<13x21x3xf32>, %arg1: tensor<13x26xi32>, %arg2: tensor<13x26x3xf32>) -> tensor<13x21x3xf32> { + // expected-error@+1 {{'tosa.scatter' op illegal: requires [pro_fp] but not enabled in target}} + %0 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x21x3xf32>, tensor<13x26xi32>, tensor<13x26x3xf32>) -> tensor<13x21x3xf32> + return %0 : tensor<13x21x3xf32> +} + +// ----- +func.func @test_resize(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x64x64x8xf32> { + %scale = tosa.const_shape { values = dense<[4, 2, 4, 2]> : tensor<4xindex> } : () -> !tosa.shape<4> + %offset = tosa.const_shape { values = dense<[-1, -1]> : tensor<2xindex> } : () -> !tosa.shape<2> + %border = tosa.const_shape { values = dense<[1, 1]> : tensor<2xindex> } : () -> !tosa.shape<2> + // expected-error@+1 {{'tosa.resize' op illegal: requires [pro_fp] but not enabled in target}} + %1 = tosa.resize %arg0, %scale, %offset, %border { mode = "BILINEAR" } : (tensor<1x32x32x8xf32>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<1x64x64x8xf32> + return %1 : tensor<1x64x64x8xf32> +} diff --git a/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir b/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir index c69f78fcb9d1a..e98b906377b22 100644 --- a/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir +++ b/mlir/test/Dialect/Tosa/profile_pro_int_unsupported.mlir @@ -2,7 
+2,112 @@ // Enable all supported extensions to focus the verification of expected profile requirement errors. //-------------------------------------------------------------------------------------------------- -// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround strict-op-spec-alignment" +// RUN: mlir-opt %s -split-input-file -verify-diagnostics -tosa-validate="profile=pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,dynamic,doubleround,inexactround strict-op-spec-alignment" + +// ----- +func.func @test_const_i1() -> tensor<3x11x11x3xi1> { + %0 = "tosa.const"() {values = dense<0> : tensor<3x11x11x3xi1>} : () -> tensor<3x11x11x3xi1> + return %0 : tensor<3x11x11x3xi1> +} + +// ----- +func.func @test_const_i32() -> tensor<3x11x11x3xi32> { + %0 = "tosa.const"() {values = dense<0> : tensor<3x11x11x3xi32>} : () -> tensor<3x11x11x3xi32> + return %0 : tensor<3x11x11x3xi32> +} + +// ----- +func.func @test_argmax(%arg0: tensor<14x19xi8>) -> tensor<14xi32> { + // expected-error@+1 {{'tosa.argmax' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.argmax %arg0 {axis = 1 : i32} : (tensor<14x19xi8>) -> tensor<14xi32> + return %0 : tensor<14xi32> +} + +// ----- +func.func @test_avg_pool2d(%arg0: tensor<1x7x7x9xi8>, %arg1: tensor<1xi8>, %arg2: tensor<1xi8>) -> tensor<1x7x7x9xi8> { + // expected-error@+1 {{'tosa.avg_pool2d' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.avg_pool2d %arg0, %arg1, %arg2 {acc_type = i32, kernel = array, pad = array, stride = array} : (tensor<1x7x7x9xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x7x7x9xi8> + return %0 : tensor<1x7x7x9xi8> +} + +// ----- +func.func @test_conv2d(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<8x1x1x4xi8>, %arg2: tensor<8xi32>, %arg3: tensor<1xi8>) -> tensor<1x4x4x8xi32> { + // expected-error@+1 {{'tosa.conv2d' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.conv2d %arg0, %arg1, %arg2, %arg3, %arg3 {acc_type = i32, dilation = array, pad = array, stride = array, local_bound = true} : (tensor<1x4x4x4xi8>, tensor<8x1x1x4xi8>, tensor<8xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x4x8xi32> + return %0 : tensor<1x4x4x8xi32> +} + +// ----- +func.func @test_conv3d(%arg0: tensor<1x4x8x21x17xi8>, %arg1: tensor<34x1x1x1x17xi8>, %arg2: tensor<34xi32>, %arg3: tensor<1xi8>, %arg4: tensor<1xi8>) -> tensor<1x4x8x21x34xi32> { + // expected-error@+1 {{'tosa.conv3d' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.conv3d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<1x4x8x21x17xi8>, tensor<34x1x1x1x17xi8>, tensor<34xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x8x21x34xi32> + return %0 : tensor<1x4x8x21x34xi32> +} + +// ----- +func.func @test_depthwise_conv2d(%arg0: tensor<1x4x4x4xi8>, %arg1: tensor<1x1x4x2xi8>, %arg2: tensor<8xi32>, %arg3: tensor<1xi8>, %arg4: tensor<1xi8>) -> tensor<1x4x4x8xi32> { + // expected-error@+1 {{'tosa.depthwise_conv2d' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.depthwise_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i32, dilation = array, pad = array, stride = array} : (tensor<1x4x4x4xi8>, tensor<1x1x4x2xi8>, tensor<8xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x4x8xi32> + return %0 : tensor<1x4x4x8xi32> +} + +// ----- +func.func @test_matmul(%arg0: tensor<1x14x19xi8>, %arg1: tensor<1x19x28xi8>) -> 
tensor<1x14x28xi32> { + %azp0 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + %bzp0 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8> + // expected-error@+1 {{'tosa.matmul' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xi8>, tensor<1x19x28xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x14x28xi32> + return %0 : tensor<1x14x28xi32> +} + +// ----- +func.func @test_max_pool2d(%arg0: tensor<1x32x32x8xi8>) -> tensor<1x32x32x8xi8> { + // expected-error@+1 {{'tosa.max_pool2d' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = array} : (tensor<1x32x32x8xi8>) -> tensor<1x32x32x8xi8> + return %0 : tensor<1x32x32x8xi8> +} + +// ----- +func.func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xi8>, %arg1: tensor<16x1x1x8xi8>, %arg2: tensor<16xi32>, %arg3: tensor<1xi8>, %arg4: tensor<1xi8>) -> tensor<1x32x32x16xi32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.transpose_conv2d %arg0, %arg1, %arg2, %arg3, %arg4 {acc_type = i32, out_pad = array, stride = array} : (tensor<1x32x32x8xi8>, tensor<16x1x1x8xi8>, tensor<16xi32>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x32x32x16xi32> + return %0 : tensor<1x32x32x16xi32> +} + +// ----- +func.func @test_clamp(%arg0: tensor<13x21x3xi8>) -> tensor<13x21x3xi8> { + // expected-error@+1 {{'tosa.clamp' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.clamp %arg0 {min_val = 0 : i8, max_val = 1: i8} : (tensor<13x21x3xi8>) -> tensor<13x21x3xi8> + return %0 : tensor<13x21x3xi8> +} + +// ----- +func.func @test_arithmetic_right_shift(%arg0: tensor<13x21x1xi32>, %arg1: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.arithmetic_right_shift' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.arithmetic_right_shift %arg0, %arg1 {round = false} : (tensor<13x21x1xi32>, tensor<13x21x3xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_bitwise_and(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x21x1xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.bitwise_and' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.bitwise_and %arg0, %arg1 : (tensor<13x21x3xi32>, tensor<13x21x1xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_max(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x21x1xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.maximum' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.maximum %arg0, %arg1 : (tensor<13x21x3xi32>, tensor<13x21x1xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_mul(%arg0: tensor<13x21x3xi16>, %arg1: tensor<13x1x3xi16>, %shift: tensor<1xi8>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.mul' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.mul %arg0, %arg1, %shift : (tensor<13x21x3xi16>, tensor<13x1x3xi16>, tensor<1xi8>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} // ----- func.func @test_table(%arg0 : tensor<4x5xi8>, %arg1 : tensor<513xi8>) -> () { @@ -12,19 +117,175 @@ func.func @test_table(%arg0 : tensor<4x5xi8>, %arg1 : tensor<513xi8>) -> () { } // ----- -func.func @test_reduce_max(%arg0: tensor<13x21x3xi16>) -> tensor<1x21x3xi16> { +func.func @test_abs(%arg0: 
tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.abs' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.abs %arg0 : (tensor<13x21x3xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_bitwise_not(%arg0: tensor<13x21x1xi32>) -> tensor<13x21x1xi32> { + // expected-error@+1 {{'tosa.bitwise_not' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.bitwise_not %arg0 : (tensor<13x21x1xi32>) -> tensor<13x21x1xi32> + return %0 : tensor<13x21x1xi32> +} + +// ----- +func.func @test_clz(%arg0: tensor<13x21x3xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.clz' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.clz %arg0 : (tensor<13x21x3xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_negate(%arg0: tensor<13x21x3xi16>, %arg1: tensor<1xi16>, %arg2: tensor<1xi16>) -> tensor<13x21x3xi16> { + // expected-error@+1 {{'tosa.negate' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.negate %arg0, %arg1, %arg2 : (tensor<13x21x3xi16>, tensor<1xi16>, tensor<1xi16>) -> tensor<13x21x3xi16> + return %0 : tensor<13x21x3xi16> +} + +// ----- +func.func @test_select(%arg0: tensor<1x1x1xi1>, %arg1: tensor<13x21x3xi8>, %arg2: tensor<13x21x3xi8>) -> tensor<13x21x3xi8> { + // expected-error@+1 {{'tosa.select' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.select %arg0, %arg1, %arg2 : (tensor<1x1x1xi1>, tensor<13x21x3xi8>, tensor<13x21x3xi8>) -> tensor<13x21x3xi8> + return %0 : tensor<13x21x3xi8> +} + +// ----- +func.func @test_equal(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.equal' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.equal %arg0, %arg1 : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_greater(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x1x3xi32>) -> tensor<13x21x3xi1> { + // expected-error@+1 {{'tosa.greater' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.greater %arg0, %arg1 : (tensor<13x21x3xi32>, tensor<13x1x3xi32>) -> tensor<13x21x3xi1> + return %0 : tensor<13x21x3xi1> +} + +// ----- +func.func @test_reduce_max(%arg0: tensor<13x21x3xi8>) -> tensor<1x21x3xi8> { // expected-error@+1 {{'tosa.reduce_max' op illegal: requires [pro_int] but not enabled in target}} - %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<13x21x3xi16>) -> tensor<1x21x3xi16> - return %0 : tensor<1x21x3xi16> + %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<13x21x3xi8>) -> tensor<1x21x3xi8> + return %0 : tensor<1x21x3xi8> +} + +// ----- +func.func @test_reduce_sum(%arg0: tensor<13x21x3xi32>) -> tensor<1x21x3xi32> { + // expected-error@+1 {{'tosa.reduce_sum' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.reduce_sum %arg0 {axis = 0 : i32} : (tensor<13x21x3xi32>) -> tensor<1x21x3xi32> + return %0 : tensor<1x21x3xi32> +} + +// ----- +func.func @test_concat(%arg0: tensor<13x21x3xi16>, %arg1: tensor<13x21x3xi16>) -> tensor<26x21x3xi16> { + // expected-error@+1 {{'tosa.concat' op illegal: requires [pro_int] to work with but not enabled in target}} + %0 = tosa.concat %arg0, %arg1 {axis = 0 : i32} : (tensor<13x21x3xi16>, tensor<13x21x3xi16>) -> tensor<26x21x3xi16> + return %0 : tensor<26x21x3xi16> +} + +// ----- +func.func @test_pad(%arg0: tensor<13x21x3xi8>) -> tensor<13x21x3xi8> { + 
%padding = tosa.const_shape {values = dense<0> : tensor<6xindex>} : () -> !tosa.shape<6> + %pad_const = "tosa.const"() {values = dense<1> : tensor<1xi8>} : () -> tensor<1xi8> + // expected-error@+1 {{'tosa.pad' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.pad %arg0, %padding, %pad_const : (tensor<13x21x3xi8>, !tosa.shape<6>, tensor<1xi8>) -> tensor<13x21x3xi8> + return %0 : tensor<13x21x3xi8> +} + +// ----- +func.func @test_reshape(%arg0: tensor<13x21x3xi8>) -> tensor<1x819xi8> { + %1 = tosa.const_shape {values = dense<[1, 819]> : tensor<2xindex>} : () -> !tosa.shape<2> + // expected-error@+1 {{'tosa.reshape' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.reshape %arg0, %1 : (tensor<13x21x3xi8>, !tosa.shape<2>) -> tensor<1x819xi8> + return %0 : tensor<1x819xi8> +} + +// ----- +func.func @test_reverse(%arg0: tensor<13x21x3xi8>) -> tensor<13x21x3xi8> { + // expected-error@+1 {{'tosa.reverse' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.reverse %arg0 {axis = 0 : i32} : (tensor<13x21x3xi8>) -> tensor<13x21x3xi8> + return %0 : tensor<13x21x3xi8> } // ----- -func.func @test_cast_i8_i32(%arg0: tensor<13x21x3xi32>) -> tensor<13x21x3xi8> { - // expected-error@+1 {{'tosa.cast' op illegal: requires [pro_int] but not enabled in target}} - %0 = tosa.cast %arg0 : (tensor<13x21x3xi32>) -> tensor<13x21x3xi8> +func.func @test_slice(%arg0: tensor<13x21x3xi8>) -> tensor<4x11x1xi8> { + %0 = tosa.const_shape {values = dense<[4, 11, 1]> : tensor<3xindex>} : () -> !tosa.shape<3> + %1 = tosa.const_shape {values = dense<[6, 8, 0]> : tensor<3xindex>} : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.slice' op illegal: requires [pro_int] but not enabled in target}} + %2 = tosa.slice %arg0, %0, %1 : (tensor<13x21x3xi8>, !tosa.shape<3>, !tosa.shape<3>) -> tensor<4x11x1xi8> + return %2 : tensor<4x11x1xi8> +} + +// ----- +func.func @test_tile(%arg0: tensor<13x21x3xi8>) -> tensor<39x21x6xi8> { + %cst = tosa.const_shape { values = dense<[3, 1, 2]> : tensor<3xindex> } : () -> !tosa.shape<3> + // expected-error@+1 {{'tosa.tile' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.tile %arg0, %cst: (tensor<13x21x3xi8>, !tosa.shape<3>) -> tensor<39x21x6xi8> + return %0 : tensor<39x21x6xi8> +} + +// ----- +func.func @test_transpose(%arg0: tensor<13x21x3xi8>, %arg1: tensor<3xi32>) -> tensor<3x13x21xi8> { + // expected-error@+1 {{'tosa.transpose' op illegal: requires [pro_int] but not enabled in target}} + %1 = tosa.transpose %arg0 {perms = array}: (tensor<13x21x3xi8>) -> tensor<3x13x21xi8> + return %1 : tensor<3x13x21xi8> +} + +// ----- +func.func @test_gather(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x26xi32>) -> tensor<13x26x3xi32> { + // expected-error@+1 {{'tosa.gather' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.gather %arg0, %arg1 : (tensor<13x21x3xi32>, tensor<13x26xi32>) -> tensor<13x26x3xi32> + return %0 : tensor<13x26x3xi32> +} + +// ----- +func.func @test_scatter(%arg0: tensor<13x21x3xi32>, %arg1: tensor<13x26xi32>, %arg2: tensor<13x26x3xi32>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.scatter' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.scatter %arg0, %arg1, %arg2 : (tensor<13x21x3xi32>, tensor<13x26xi32>, tensor<13x26x3xi32>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_resize(%arg0: tensor<1x32x32x8xi8>) -> tensor<1x64x64x8xi32> { + %scale = tosa.const_shape { values = dense<[4, 2, 4, 2]> : tensor<4xindex> 
} : () -> !tosa.shape<4> + %offset = tosa.const_shape { values = dense<[-1, -1]> : tensor<2xindex> } : () -> !tosa.shape<2> + %border = tosa.const_shape { values = dense<[1, 1]> : tensor<2xindex> } : () -> !tosa.shape<2> + // expected-error@+1 {{'tosa.resize' op illegal: requires [pro_int] but not enabled in target}} + %1 = tosa.resize %arg0, %scale, %offset, %border { mode = "BILINEAR" } : (tensor<1x32x32x8xi8>, !tosa.shape<4>, !tosa.shape<2>, !tosa.shape<2>) -> tensor<1x64x64x8xi32> + return %1 : tensor<1x64x64x8xi32> +} + +// ----- +func.func @test_cast_i1_i8(%arg0: tensor<13x21x3xi1>) -> tensor<13x21x3xi8> { + // expected-error@+1 {{'tosa.cast' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.cast %arg0 : (tensor<13x21x3xi1>) -> tensor<13x21x3xi8> return %0 : tensor<13x21x3xi8> } +// ----- +func.func @test_cast_i8_i32(%arg0: tensor<13x21x3xi8>) -> tensor<13x21x3xi32> { + // expected-error@+1 {{'tosa.cast' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.cast %arg0 : (tensor<13x21x3xi8>) -> tensor<13x21x3xi32> + return %0 : tensor<13x21x3xi32> +} + +// ----- +func.func @test_cast_i16_i8(%arg0: tensor<13x21x3xi16>) -> tensor<13x21x3xi8> { + // expected-error@+1 {{'tosa.cast' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.cast %arg0 : (tensor<13x21x3xi16>) -> tensor<13x21x3xi8> + return %0 : tensor<13x21x3xi8> +} + +// ----- +func.func @test_cast_i32_i16(%arg0: tensor<13x21x3xi32>) -> tensor<13x21x3xi16> { + // expected-error@+1 {{'tosa.cast' op illegal: requires [pro_int] but not enabled in target}} + %0 = tosa.cast %arg0 : (tensor<13x21x3xi32>) -> tensor<13x21x3xi16> + return %0 : tensor<13x21x3xi16> +} + // ----- func.func @test_rescale(%arg0: tensor<13x21x3xi8>) -> tensor<13x21x3xi32> { %multiplier = "tosa.const"() {values = dense<1073741824> : tensor<1xi32>} : () -> tensor<1xi32> From 7e1b76c2d7e272541ed40df6ce7a15e5cb51d407 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Tue, 8 Apr 2025 13:47:25 +0100 Subject: [PATCH 0981/1029] Revert "[flang] Use precompiled parsing headers" (#134851) Reverts llvm/llvm-project#130600 Reverting on account of Windows issues with ccache, will bring it back along with #131137 once those are resolved. --- flang/lib/Frontend/CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt index e8a098613e26f..c80373799b015 100644 --- a/flang/lib/Frontend/CMakeLists.txt +++ b/flang/lib/Frontend/CMakeLists.txt @@ -72,11 +72,3 @@ add_flang_library(flangFrontend clangBasic clangDriver ) - -target_precompile_headers(flangFrontend PRIVATE - [["flang/Parser/parsing.h"]] - [["flang/Parser/parse-tree.h"]] - [["flang/Parser/dump-parse-tree.h"]] - [["flang/Lower/PFTBuilder.h"]] - [["flang/Lower/Bridge.h"]] -) From 008c875be85732f72c4df4671167f5be79f449eb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 8 Apr 2025 14:08:42 +0100 Subject: [PATCH 0982/1029] [AMDGPU] Fix excessive stack usage in SIInsertWaitcnts::run (#134835) Noticed on Windows when running LLVM as part of a graphics driver, with total stack usage limited to about 128 KB. In some cases this function would overflow the stack. On Linux this reduces stack usage in this function from about 32 KB to about 0.5 KB. 
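The shape of the fix below is the classic destroy-then-placement-new reinitialization. As a standalone sketch of the pattern (illustrative only, not LLVM code: `BigState`, `reset`, and the 32 KB payload are hypothetical stand-ins for `WaitcntBrackets` and its reinitialization site):

#include <memory>
#include <new>

struct BigState {
  explicit BigState(int seed) : tag(seed) {}
  int tag;
  char payload[32 * 1024]; // big enough that a stack temporary would matter
};

void reset(std::unique_ptr<BigState> &state, int seed) {
  if (!state) {
    state = std::make_unique<BigState>(seed); // first use: heap-allocate
    return;
  }
  // *state = BigState(seed); // would materialize a ~32 KB temporary on the stack
  state->~BigState();               // end the old object's lifetime...
  new (state.get()) BigState(seed); // ...and construct anew in the same storage
}

int main() {
  std::unique_ptr<BigState> s;
  reset(s, 1); // allocates
  reset(s, 2); // reinitializes in place; no large stack temporary
  return s->tag == 2 ? 0 : 1;
}

The usual caveat applies: ending the object's lifetime by hand is only safe when nothing else holds a pointer or reference into the old object across the reinitialization.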
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 49943265500b1..ccbafa6a1f887 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2623,12 +2623,17 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
       else
         *Brackets = *BI.Incoming;
     } else {
-      if (!Brackets)
+      if (!Brackets) {
         Brackets = std::make_unique<WaitcntBrackets>(
             ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
-      else
-        *Brackets = WaitcntBrackets(ST, MaxCounter, Limits,
-                                    WaitEventMaskForInst, SmemAccessCounter);
+      } else {
+        // Reinitialize in-place. N.B. do not do this by assigning from a
+        // temporary because the WaitcntBrackets class is large and it could
+        // cause this function to use an unreasonable amount of stack space.
+        Brackets->~WaitcntBrackets();
+        new (Brackets.get()) WaitcntBrackets(
+            ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+      }
     }

     Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);

From 94b9d75c6d77438008dcfc32812afaaff6e4ec3b Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Tue, 8 Apr 2025 09:16:37 -0400
Subject: [PATCH 0983/1029] [gn] port 65813e0e94c04

---
 .../Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
index f9249c208d99b..5883d981ab073 100644
--- a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn
@@ -1,14 +1,5 @@
 import("//lldb/utils/TableGen/lldb_tablegen.gni")

-lldb_tablegen("DynamicLoaderDarwinProperties") {
-  args = [ "-gen-lldb-property-defs" ]
-}
-
-lldb_tablegen("DynamicLoaderDarwinPropertiesEnum") {
-  args = [ "-gen-lldb-property-enum-defs" ]
-  td_file = "DynamicLoaderDarwinProperties.td"
-}
-
 static_library("MacOSX-DYLD") {
   output_name = "lldbPluginDynamicLoaderMacOSXDYLD"
   configs += [
@@ -16,8 +7,6 @@ static_library("MacOSX-DYLD") {
     "//llvm/utils/gn/build:lldb_code",
   ]
   deps = [
-    ":DynamicLoaderDarwinProperties",
-    ":DynamicLoaderDarwinPropertiesEnum",
    "//lldb/source/Breakpoint",
    "//lldb/source/Core",
    "//lldb/source/Expression",
@@ -34,7 +23,6 @@ static_library("MacOSX-DYLD") {
   include_dirs = [ "//lldb/source" ]
   sources = [
     "DynamicLoaderDarwin.cpp",
-    "DynamicLoaderDarwinProperties.cpp",
     "DynamicLoaderMacOS.cpp",
     "DynamicLoaderMacOSXDYLD.cpp",
   ]

From dae0ef53a0b99c6c2b74143baee5896e8bc5c8e7 Mon Sep 17 00:00:00 2001
From: Alan Li
Date: Tue, 8 Apr 2025 06:18:30 -0700
Subject: [PATCH 0984/1029] [MLIR][AMDGPU] Add a wrapper for global LDS load
 intrinsics in AMDGPU (#133498)

Defining a new `amdgpu.gather_to_lds` op, which is a thin wrapper around the
ROCDL `global_load_lds` intrinsic, along with its lowering logic to
`rocdl.global.load.lds`.
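Distilled from the tests added below, a minimal use of the op and what it lowers to; `%i`, `%j`, `%k`, and `%c0` are placeholder index values:

// Each lane contributes one f32; the subgroup's elements are written
// contiguously into LDS starting at %lds[%k, %c0].
amdgpu.gather_to_lds %global[%i, %j], %lds[%k, %c0]
    : f32, memref<128x72xf32, 1>, memref<64x64xf32, 3>
// With -convert-amdgpu-to-rocdl=chipset=gfx942 this becomes, in essence:
//   rocdl.global.load.lds %srcPtr, %ldsPtr, %c4, %c0_i32, %c0_i32
// where %c4 is the transfer width in bytes (4 for f32) and the trailing
// zero constants are the intrinsic's remaining immediate arguments.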
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  36 +++++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           |  53 ++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  64 ++++++--
 .../Conversion/AMDGPUToROCDL/load_lds.mlir    | 143 ++++++++++++++++++
 4 files changed, 285 insertions(+), 11 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 9cdd961d96ff5..108d7237ff703 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -767,4 +767,40 @@ def AMDGPU_WMMAOp :
   let hasVerifier = 1;
 }

+def AMDGPU_GatherToLDSOp :
+    AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>,
+    Arguments<(ins
+                   Arg:$src,
+                   Variadic<Index>:$srcIndices,
+                   Arg:$dst,
+                   Variadic<Index>:$dstIndices,
+                   TypeAttr:$transferType
+    )>,
+    Results<(outs)> {
+  let summary = "MLIR wrapper for CDNA global_load_lds instructions";
+  let description = [{
+    The `amdgpu.gather_to_lds` op is a wrapper around the `global_load_lds` instructions.
+
+    Operands:
+    * `$src`: global memory memref to read from.
+    * `$srcIndices`: indices into `$src` to read from for this thread.
+    * `$dst`: LDS memory memref to write to.
+    * `$dstIndices`: base indices into `$dst` to write to for the subgroup of this thread.
+      The elements gathered by the subgroup will be written contiguously in order of lane ID,
+      starting at `$dst[$dstIndices]`.
+    * `$transferType`: type of the data to be transferred by each thread. This is used to determine
+      the size of the data to be transferred and the number of threads in the subgroup.
+      The transfer type must be a scalar type or a vector of a scalar type.
+
+    The `$dst`, along with its indices, points to the memory location the subgroup of this thread
+    will write to.
+
+    Note: only enabled for gfx942 and later.
+  }];
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` attr-dict `:` $transferType `,` type($src) `,` type($dst)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // AMDGPU
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 56d40d6d123bf..5f697bdeef566 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -1010,6 +1010,55 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
   }
 };

+struct GatherToLDSOpLowering : public ConvertOpToLLVMPattern<GatherToLDSOp> {
+  GatherToLDSOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<GatherToLDSOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(GatherToLDSOp op, GatherToLDSOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset < kGfx942)
+      return op.emitOpError("chipset not supported");
+
+    Location loc = op.getLoc();
+
+    auto srcMemRefType = cast<MemRefType>(op.getSrc().getType());
+    auto dstMemRefType = cast<MemRefType>(op.getDst().getType());
+
+    // TODO: instead of only transferring one element per thread, we could
+    // augment it to transfer multiple elements per thread by issuing multiple
+    // `global_load_lds` instructions.
+    Type transferType = op.getTransferType();
+    size_t loadWidth = [&]() -> size_t {
+      if (auto transferVectorType = dyn_cast<VectorType>(transferType)) {
+        return transferVectorType.getNumElements() *
+               (transferVectorType.getElementTypeBitWidth() / 8);
+      } else {
+        return transferType.getIntOrFloatBitWidth() / 8;
+      }
+    }();
+
+    // Currently only 1, 2, and 4 byte loads are supported.
+    if (loadWidth != 1 && loadWidth != 2 && loadWidth != 4)
+      return op.emitOpError("unsupported element size for this chipset");
+
+    Value srcPtr = getStridedElementPtr(loc, srcMemRefType, adaptor.getSrc(),
+                                        (adaptor.getSrcIndices()), rewriter);
+    Value dstPtr = getStridedElementPtr(loc, dstMemRefType, adaptor.getDst(),
+                                        (adaptor.getDstIndices()), rewriter);
+
+    rewriter.replaceOpWithNewOp<ROCDL::GlobalLoadLDSOp>(
+        op, srcPtr, dstPtr, createI32Constant(rewriter, loc, loadWidth),
+        createI32Constant(rewriter, loc, 0),
+        createI32Constant(rewriter, loc, 0), ArrayAttr{}, ArrayAttr{},
+        ArrayAttr{});
+
+    return success();
+  }
+};
+
 namespace {
 struct ExtPackedFp8OpLowering final
     : public ConvertOpToLLVMPattern<ExtPackedFp8Op> {
@@ -1393,6 +1442,6 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
            ROCDL::RawPtrBufferAtomicCmpSwap>,
       AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
       MFMAOpLowering, WMMAOpLowering, ExtPackedFp8OpLowering,
-      PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering>(converter,
-                                                                 chipset);
+      PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
+      GatherToLDSOpLowering>(converter, chipset);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 1e482515a4ee0..7f286f938ee60 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Diagnostics.h"
@@ -24,6 +25,7 @@
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/IR/DerivedTypes.h"
 #include <limits>
 #include <optional>
@@ -112,21 +114,31 @@ LogicalResult FatRawBufferCastOp::verify() {
   return success();
 }

+static bool hasGlobalMemorySpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return true;
+  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+    return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+    return gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
+  return false;
+}
+
+static bool hasWorkgroupMemorySpace(Attribute memorySpace) {
+  if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace))
+    return intMemorySpace.getInt() == 3;
+  if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+    return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup;
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // RawBuffer*Op
 //===----------------------------------------------------------------------===//
 template <typename T>
 static LogicalResult verifyRawBufferOp(T &op) {
   MemRefType bufferType = llvm::cast<MemRefType>(op.getMemref().getType());
-  Attribute memorySpace = bufferType.getMemorySpace();
-  bool isGlobal = false;
-  if (!memorySpace)
-    isGlobal = true;
-  else if (auto intMemorySpace = llvm::dyn_cast<IntegerAttr>(memorySpace))
-    isGlobal = intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1;
-  else if (auto gpuMemorySpace =
-               llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
-    isGlobal = gpuMemorySpace.getValue() == gpu::AddressSpace::Global;
+  bool isGlobal = hasGlobalMemorySpace(bufferType.getMemorySpace());

   if (!isGlobal)
     return op.emitOpError(
@@ -461,6 +473,40 @@ LogicalResult DPPOp::verify() {
   return success();
 }

+LogicalResult GatherToLDSOp::verify() {
+  MemRefType srcType = cast<MemRefType>(getSrc().getType());
+  MemRefType dstType = cast<MemRefType>(getDst().getType());
+
+  if (!memref::isStaticShapeAndContiguousRowMajor(dstType))
+    return emitOpError(
+        "destination type must have a static shape and be contiguous");
+
+  auto elemType = srcType.getElementType();
+  // Check $src and $dst element types are the same.
+  if (elemType != dstType.getElementType())
+    return emitOpError("source and destination element types must match");
+
+  // The transfer type must be 8, 16, or 32 bits (1, 2, or 4 bytes).
+  auto transferType = getTransferType();
+  size_t transferSize;
+  if (auto vectorTransfer = dyn_cast<VectorType>(transferType)) {
+    transferSize = vectorTransfer.getNumElements() *
+                   vectorTransfer.getElementTypeBitWidth();
+  } else {
+    transferSize = transferType.getIntOrFloatBitWidth();
+  }
+  if (transferSize != 8 && transferSize != 16 && transferSize != 32)
+    return emitOpError("transferring type size must be 8, 16, or 32 bits");
+
+  if (!hasGlobalMemorySpace(srcType.getMemorySpace()))
+    return emitOpError("source memory address space must be Global");
+
+  if (!hasWorkgroupMemorySpace(dstType.getMemorySpace()))
+    return emitOpError("destination memory address space must be Workgroup");
+
+  return success();
+}
+
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc"

 #define GET_ATTRDEF_CLASSES
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
new file mode 100644
index 0000000000000..b1c16bd5db079
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir
@@ -0,0 +1,143 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s
+
+#gpu_global_addrspace = 1
+#gpu_lds_addrspace = 3
+
+// CHECK-LABEL: func @global_load_to_rocdl_f32
+// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xf32, 1>)
+func.func @global_load_to_rocdl_f32(%global : memref<128x72xf32, #gpu_global_addrspace>) {
+  %c0 = arith.constant 0 : index
+  %c12 = arith.constant 12 : index
+  %c32 = arith.constant 32 : index
+  %alloc = memref.alloc() : memref<64x64xf32, #gpu_lds_addrspace>
+  // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]]
+
+  // CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64
+  // CHECK: %[[C12:.*]] = arith.constant 12 : index
+  // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]]
+  // CHECK: %[[C32:.*]] = arith.constant 32 : index
+  // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]]
+
+  // CHECK: %[[ALLOC:.*]] = memref.alloc()
+  // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast
+  // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1]
+
+  // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64
+  // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64
+  // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64
+
+  // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]]
+  // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1]
+
+  // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64
+  // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64
+  // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]],
%[[IC0]] : i64 + + // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] + // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]] + amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] + : f32, memref<128x72xf32, #gpu_global_addrspace>, memref<64x64xf32, #gpu_lds_addrspace> + func.return +} + +// CHECK-LABEL: func @global_load_to_rocdl_i8 +// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi8, 1>) +func.func @global_load_to_rocdl_i8(%global : memref<128x72xi8, #gpu_global_addrspace>) { + // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] + + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64 + // CHECK: %[[C12:.*]] = arith.constant 12 : index + // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]] + // CHECK: %[[C32:.*]] = arith.constant 32 : index + // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]] + + // CHECK: %[[ALLOC:.*]] = memref.alloc() + // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] + // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] + + // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64 + // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64 + // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64 + + // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]] + // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] + + // CHECK: %[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64 + // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64 + // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64 + + // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] + // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C1]] + %c0 = arith.constant 0 : index + %c12 = arith.constant 12 : index + %c32 = arith.constant 32 : index + %alloc = memref.alloc() : memref<64x64xi8, #gpu_lds_addrspace> + amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] + : i8, memref<128x72xi8, #gpu_global_addrspace>, memref<64x64xi8, #gpu_lds_addrspace> + func.return +} + +// CHECK-LABEL: func @global_load_to_rocdl_vec +// CHECK-SAME: (%[[ARG0:.*]]: memref<128x72xi16, 1>) +func.func @global_load_to_rocdl_vec(%global : memref<128x72xi16, #gpu_global_addrspace>) { + // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] + + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64 + // CHECK: %[[C12:.*]] = arith.constant 12 : index + // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]] + // CHECK: %[[C32:.*]] = arith.constant 32 : index + // CHECK: %[[IC32:.*]] = builtin.unrealized_conversion_cast %[[C32]] + + // CHECK: %[[ALLOC:.*]] = memref.alloc() + // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] + // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] + + // CHECK: %[[C72:.*]] = llvm.mlir.constant(72 : index) : i64 + // CHECK: %[[MUL:.*]] = llvm.mul %[[IC12]], %[[C72]] : i64 + // CHECK: %[[SRC_OFFSET:.*]] = llvm.add %[[MUL]], %[[IC0]] : i64 + + // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRC_OFFSET]]] + // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] + + // CHECK: 
%[[C72_1:.*]] = llvm.mlir.constant(72 : index) : i64 + // CHECK: %[[MUL_2:.*]] = llvm.mul %[[IC32]], %[[C72_1]] : i64 + // CHECK: %[[DST_OFFSET:.*]] = llvm.add %[[MUL_2]], %[[IC0]] : i64 + + // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DST_OFFSET]]] + // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]] + %c0 = arith.constant 0 : index + %c12 = arith.constant 12 : index + %c32 = arith.constant 32 : index + %alloc = memref.alloc() : memref<64x128xi16, #gpu_lds_addrspace> + amdgpu.gather_to_lds %global[%c12, %c0], %alloc[%c32, %c0] + : vector<2 x i16>, memref<128x72xi16, #gpu_global_addrspace>, memref<64x128xi16, #gpu_lds_addrspace> + func.return +} + + +// CHECK-LABEL: func @global_load_to_rocdl_dynamic_indices +// CHECK-SAME: (%[[ARG0:.*]]: memref<512xi32, 1>, %[[SRC_IDX:.*]]: index, %[[DST_IDX:.*]]: index) +func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_global_addrspace>, %src_idx : index, %dst_idx : index) { + // CHECK: %[[DSTIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[DST_IDX]] + // CHECK: %[[SRCIDX_CAST:.*]] = builtin.unrealized_conversion_cast %[[SRC_IDX]] + // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] + // CHECK: %[[ALLOC:.*]] = memref.alloc() + // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] + // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] + // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]] + // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] + // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX_CAST]]] + // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: rocdl.global.load.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], %[[C4]] + %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace> + %c0 = arith.constant 0 : index + amdgpu.gather_to_lds %global[%src_idx], %alloc[%dst_idx, %c0] + : i32, memref<512xi32, #gpu_global_addrspace>, memref<4x64xi32, #gpu_lds_addrspace> + func.return +} From fdf20941a8f95b26578fbeb579019f74efe45545 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Tue, 8 Apr 2025 14:23:38 +0100 Subject: [PATCH 0985/1029] [libc][math] Fix signaling NaN handling for math functions. (#133347) Add tests for signaling NaNs, and fix function behavior for handling signaling NaN input. 
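Each affected entry point now applies the same shape of fix before its other special cases: if the input is a signaling NaN, raise FE_INVALID and return a quiet NaN, leaving errno untouched. A standalone host-side illustration of that contract (not libc internals; assumes an Annex F-conforming toolchain and libm, so results can vary by platform):

#include <cfenv>
#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  volatile float x = std::numeric_limits<float>::signaling_NaN();
  volatile float r = std::cos(x); // expect: FE_INVALID raised, quiet NaN back
  std::printf("isnan(result)=%d FE_INVALID=%d\n", std::isnan(r) ? 1 : 0,
              std::fetestexcept(FE_INVALID) ? 1 : 0);
  return 0;
}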
Fixes https://github.com/llvm/llvm-project/issues/124812 --- libc/src/math/generic/CMakeLists.txt | 3 ++ libc/src/math/generic/acosf.cpp | 7 ++++ libc/src/math/generic/asinf.cpp | 6 +++ libc/src/math/generic/asinhf.cpp | 8 +++- libc/src/math/generic/atan2.cpp | 6 ++- libc/src/math/generic/atan2f.cpp | 7 +++- libc/src/math/generic/atanhf.cpp | 4 ++ libc/src/math/generic/cos.cpp | 6 ++- libc/src/math/generic/cosf.cpp | 5 +++ libc/src/math/generic/cosf16.cpp | 5 +++ libc/src/math/generic/cospif.cpp | 5 +++ libc/src/math/generic/cospif16.cpp | 4 ++ libc/src/math/generic/erff.cpp | 4 ++ libc/src/math/generic/log1p.cpp | 7 +++- libc/src/math/generic/logf.cpp | 5 +++ libc/src/math/generic/pow.cpp | 5 +++ libc/src/math/generic/powf.cpp | 14 ++++++- libc/src/math/generic/sin.cpp | 5 +++ libc/src/math/generic/sincos.cpp | 6 +++ libc/src/math/generic/sincosf.cpp | 6 +++ libc/src/math/generic/sinf.cpp | 5 +++ libc/src/math/generic/sinf16.cpp | 5 +++ libc/src/math/generic/sinpif.cpp | 5 +++ libc/src/math/generic/sinpif16.cpp | 4 ++ libc/src/math/generic/tan.cpp | 4 ++ libc/src/math/generic/tanf.cpp | 5 +++ libc/src/math/generic/tanf16.cpp | 4 ++ libc/src/math/generic/tanpif16.cpp | 5 +++ libc/test/src/math/smoke/CMakeLists.txt | 1 + libc/test/src/math/smoke/acosf_test.cpp | 3 ++ libc/test/src/math/smoke/acoshf_test.cpp | 3 ++ libc/test/src/math/smoke/asinf_test.cpp | 3 ++ libc/test/src/math/smoke/asinhf_test.cpp | 3 ++ libc/test/src/math/smoke/atan2_test.cpp | 12 ++++++ libc/test/src/math/smoke/atan2f_test.cpp | 12 ++++++ libc/test/src/math/smoke/atan_test.cpp | 6 +-- libc/test/src/math/smoke/atanf_test.cpp | 2 + libc/test/src/math/smoke/atanhf_test.cpp | 3 +- libc/test/src/math/smoke/cbrt_test.cpp | 3 ++ libc/test/src/math/smoke/cbrtf_test.cpp | 3 ++ libc/test/src/math/smoke/cos_test.cpp | 3 ++ libc/test/src/math/smoke/cosf16_test.cpp | 3 ++ libc/test/src/math/smoke/cosf_test.cpp | 3 ++ libc/test/src/math/smoke/coshf_test.cpp | 3 ++ libc/test/src/math/smoke/cospif16_test.cpp | 3 ++ libc/test/src/math/smoke/cospif_test.cpp | 3 ++ libc/test/src/math/smoke/erff_test.cpp | 3 ++ libc/test/src/math/smoke/exp10_test.cpp | 3 ++ libc/test/src/math/smoke/exp10f_test.cpp | 3 ++ libc/test/src/math/smoke/exp10m1f_test.cpp | 3 ++ libc/test/src/math/smoke/exp2_test.cpp | 3 ++ libc/test/src/math/smoke/exp2f_test.cpp | 3 ++ libc/test/src/math/smoke/exp2m1f_test.cpp | 3 ++ libc/test/src/math/smoke/exp_test.cpp | 3 ++ libc/test/src/math/smoke/expf_test.cpp | 3 ++ libc/test/src/math/smoke/expm1_test.cpp | 3 ++ libc/test/src/math/smoke/expm1f_test.cpp | 3 ++ libc/test/src/math/smoke/log10_test.cpp | 3 ++ libc/test/src/math/smoke/log10f_test.cpp | 3 ++ libc/test/src/math/smoke/log1p_test.cpp | 3 ++ libc/test/src/math/smoke/log1pf_test.cpp | 3 ++ libc/test/src/math/smoke/log2_test.cpp | 3 ++ libc/test/src/math/smoke/log2f_test.cpp | 3 ++ libc/test/src/math/smoke/log_test.cpp | 3 ++ libc/test/src/math/smoke/logf_test.cpp | 3 ++ libc/test/src/math/smoke/pow_test.cpp | 40 +++++++++++++++++++- libc/test/src/math/smoke/powf_test.cpp | 44 +++++++++++++++++++++- libc/test/src/math/smoke/sin_test.cpp | 3 ++ libc/test/src/math/smoke/sincos_test.cpp | 5 +++ libc/test/src/math/smoke/sincosf_test.cpp | 5 +++ libc/test/src/math/smoke/sinf16_test.cpp | 3 ++ libc/test/src/math/smoke/sinf_test.cpp | 3 ++ libc/test/src/math/smoke/sinhf_test.cpp | 3 ++ libc/test/src/math/smoke/sinpif16_test.cpp | 3 ++ libc/test/src/math/smoke/sinpif_test.cpp | 3 ++ libc/test/src/math/smoke/tan_test.cpp | 3 ++ libc/test/src/math/smoke/tanf16_test.cpp | 3 ++ 
libc/test/src/math/smoke/tanf_test.cpp | 3 ++ libc/test/src/math/smoke/tanhf_test.cpp | 3 ++ libc/test/src/math/smoke/tanpif16_test.cpp | 3 ++ 80 files changed, 399 insertions(+), 12 deletions(-) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index f7c36aab77b7d..adbed5b2de48c 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4169,7 +4169,9 @@ add_entrypoint_object( atan2f_float.h DEPENDS .inv_trigf_utils + libc.hdr.fenv_macros libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer @@ -4187,6 +4189,7 @@ add_entrypoint_object( DEPENDS .atan_utils libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer diff --git a/libc/src/math/generic/acosf.cpp b/libc/src/math/generic/acosf.cpp index 509a5ebc4973e..8dd6de2ce7474 100644 --- a/libc/src/math/generic/acosf.cpp +++ b/libc/src/math/generic/acosf.cpp @@ -84,10 +84,17 @@ LLVM_LIBC_FUNCTION(float, acosf, (float x)) { 0x1.921fb6p+1f) : /* x == 1.0f */ 0.0f; + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + // |x| <= +/-inf if (x_abs <= 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } + return x + FPBits::quiet_nan().get_val(); } diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp index da854417e85fe..12383bf6dacae 100644 --- a/libc/src/math/generic/asinf.cpp +++ b/libc/src/math/generic/asinf.cpp @@ -108,10 +108,16 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) { // |x| > 1, return NaNs. if (LIBC_UNLIKELY(x_abs > 0x3f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (x_abs <= 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } + return FPBits::quiet_nan().get_val(); } diff --git a/libc/src/math/generic/asinhf.cpp b/libc/src/math/generic/asinhf.cpp index 37b87a821222a..0bb7065eb1cfe 100644 --- a/libc/src/math/generic/asinhf.cpp +++ b/libc/src/math/generic/asinhf.cpp @@ -61,8 +61,14 @@ LLVM_LIBC_FUNCTION(float, asinhf, (float x)) { }; if (LIBC_UNLIKELY(x_abs >= 0x4bdd'65a5U)) { - if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) + if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits_t::quiet_nan().get_val(); + } + return x; + } // Exceptional cases when x > 2^24. switch (x_abs) { diff --git a/libc/src/math/generic/atan2.cpp b/libc/src/math/generic/atan2.cpp index 8adfe3321a9ee..aa770de33fb1f 100644 --- a/libc/src/math/generic/atan2.cpp +++ b/libc/src/math/generic/atan2.cpp @@ -8,6 +8,7 @@ #include "src/math/atan2.h" #include "atan_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/double_double.h" #include "src/__support/FPUtil/multiply_add.h" @@ -111,8 +112,11 @@ LLVM_LIBC_FUNCTION(double, atan2, (double y, double x)) { // Check for exceptional cases, whether inputs are 0, inf, nan, or close to // overflow, or close to underflow. 
if (LIBC_UNLIKELY(max_exp > 0x7ffU - 128U || min_exp < 128U)) { - if (x_bits.is_nan() || y_bits.is_nan()) + if (x_bits.is_nan() || y_bits.is_nan()) { + if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan()) + fputil::raise_except_if_required(FE_INVALID); return FPBits::quiet_nan().get_val(); + } unsigned x_except = x == 0.0 ? 0 : (FPBits(x_abs).is_inf() ? 2 : 1); unsigned y_except = y == 0.0 ? 0 : (FPBits(y_abs).is_inf() ? 2 : 1); diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp index 726cae9c8462b..c04b0eb1cc589 100644 --- a/libc/src/math/generic/atan2f.cpp +++ b/libc/src/math/generic/atan2f.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "src/math/atan2f.h" +#include "hdr/fenv_macros.h" #include "inv_trigf_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/double_double.h" @@ -264,8 +266,11 @@ LLVM_LIBC_FUNCTION(float, atan2f, (float y, float x)) { double den_d = static_cast(den_f); if (LIBC_UNLIKELY(max_abs >= 0x7f80'0000U || num_d == 0.0)) { - if (x_bits.is_nan() || y_bits.is_nan()) + if (x_bits.is_nan() || y_bits.is_nan()) { + if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan()) + fputil::raise_except_if_required(FE_INVALID); return FPBits::quiet_nan().get_val(); + } double x_d = static_cast(x); double y_d = static_cast(y); size_t x_except = (x_d == 0.0) ? 0 : (x_abs == 0x7f80'0000 ? 2 : 1); diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index a2051bd3e3e67..2149314d2f676 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -24,6 +24,10 @@ LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { // |x| >= 1.0 if (LIBC_UNLIKELY(x_abs >= 0x3F80'0000U)) { if (xbits.is_nan()) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } return x; } // |x| == 1.0 diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp index b60082bf9c308..5da0f86812a89 100644 --- a/libc/src/math/generic/cos.cpp +++ b/libc/src/math/generic/cos.cpp @@ -65,7 +65,11 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { - // sin(+-Inf) = NaN + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + // cos(+-Inf) = NaN if (xbits.get_mantissa() == 0) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp index 6ea24f9ccd3fa..7cdae09869588 100644 --- a/libc/src/math/generic/cosf.cpp +++ b/libc/src/math/generic/cosf.cpp @@ -117,6 +117,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) { // x is inf or nan. 
if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (x_abs == 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/cosf16.cpp b/libc/src/math/generic/cosf16.cpp index 4d42db981ce71..99bb03eb71426 100644 --- a/libc/src/math/generic/cosf16.cpp +++ b/libc/src/math/generic/cosf16.cpp @@ -67,6 +67,11 @@ LLVM_LIBC_FUNCTION(float16, cosf16, (float16 x)) { // cos(+/-inf) = NaN, and cos(NaN) = NaN if (xbits.is_inf_or_nan()) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (xbits.is_inf()) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/cospif.cpp b/libc/src/math/generic/cospif.cpp index 29566f4fceacf..5b6880f853b26 100644 --- a/libc/src/math/generic/cospif.cpp +++ b/libc/src/math/generic/cospif.cpp @@ -66,6 +66,11 @@ LLVM_LIBC_FUNCTION(float, cospif, (float x)) { // x is inf or nan. if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (x_abs == 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/cospif16.cpp b/libc/src/math/generic/cospif16.cpp index ee74bdb4a3693..9dc25920d5cfe 100644 --- a/libc/src/math/generic/cospif16.cpp +++ b/libc/src/math/generic/cospif16.cpp @@ -54,6 +54,10 @@ LLVM_LIBC_FUNCTION(float16, cospif16, (float16 x)) { // Check for NaN or infintiy values if (LIBC_UNLIKELY(x_abs >= 0x7c00)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } // If value is equal to infinity if (x_abs == 0x7c00) { fputil::set_errno_if_required(EDOM); diff --git a/libc/src/math/generic/erff.cpp b/libc/src/math/generic/erff.cpp index 016afe4a68140..44607a52a2e57 100644 --- a/libc/src/math/generic/erff.cpp +++ b/libc/src/math/generic/erff.cpp @@ -135,6 +135,10 @@ LLVM_LIBC_FUNCTION(float, erff, (float x)) { int sign = xbits.is_neg() ? 1 : 0; if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } return (x_abs > 0x7f80'0000) ? 
x : ONE[sign]; } diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp index 058409fed081d..09f465a6ba774 100644 --- a/libc/src/math/generic/log1p.cpp +++ b/libc/src/math/generic/log1p.cpp @@ -910,7 +910,12 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { return FPBits_t::quiet_nan().get_val(); } // x is +Inf or NaN - return x; + if (xbits.is_inf() && xbits.is_pos()) + return x; + + if (xbits.is_signaling_nan()) + fputil::raise_except_if_required(FE_INVALID); + return FPBits_t::quiet_nan().get_val(); } x_dd.hi = x; } else { diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index 032d658a941be..e8d2ba2cfe175 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -132,6 +132,11 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { return FPBits::quiet_nan().get_val(); } // x is +inf or nan + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + return x; } } diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp index 8a12934f6c4ba..43e99a7acf690 100644 --- a/libc/src/math/generic/pow.cpp +++ b/libc/src/math/generic/pow.cpp @@ -217,6 +217,11 @@ LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) { uint64_t sign = 0; ///////// BEGIN - Check exceptional cases //////////////////////////////////// + // If x or y is signaling NaN + if (x_abs.is_signaling_nan() || y_abs.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } // The double precision number that is closest to 1 is (1 - 2^-53), which has // log2(1 - 2^-53) ~ -1.715...p-53. diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index 2d7deca3c77bb..dfdfd5d6d5760 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -664,6 +664,12 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // |y * log2(x)| = 0 or > 151. // Hence x^y will either overflow or underflow if x is not zero. if (LIBC_UNLIKELY((y_abs & 0x0007'ffff) == 0) || (y_abs > 0x4f170000)) { + // y is signaling NaN + if (xbits.is_signaling_nan() || ybits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FloatBits::quiet_nan().get_val(); + } + // Exceptional exponents. if (y == 0.0f) return 1.0f; @@ -736,8 +742,8 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { } } if (y_abs > 0x4f17'0000) { + // if y is NaN if (y_abs > 0x7f80'0000) { - // y is NaN if (x_u == 0x3f80'0000) { // x = 1.0f // pow(1, NaN) = 1 return 1.0f; @@ -759,6 +765,12 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // y is finite and non-zero. 
if (LIBC_UNLIKELY(((x_u & 0x801f'ffffU) == 0) || x_u >= 0x7f80'0000U || x_u < 0x0080'0000U)) { + // if x is signaling NaN + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FloatBits::quiet_nan().get_val(); + } + switch (x_u) { case 0x3f80'0000: // x = 1.0f return 1.0f; diff --git a/libc/src/math/generic/sin.cpp b/libc/src/math/generic/sin.cpp index ba370d4bea4da..a614427bd7ee3 100644 --- a/libc/src/math/generic/sin.cpp +++ b/libc/src/math/generic/sin.cpp @@ -77,6 +77,11 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { // sin(+-Inf) = NaN + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (xbits.get_mantissa() == 0) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/sincos.cpp b/libc/src/math/generic/sincos.cpp index 0ac2f7f997527..08c8a8298f029 100644 --- a/libc/src/math/generic/sincos.cpp +++ b/libc/src/math/generic/sincos.cpp @@ -85,6 +85,12 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) { } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + *sin_x = *cos_x = FPBits::quiet_nan().get_val(); + return; + } + // sin(+-Inf) = NaN if (xbits.get_mantissa() == 0) { fputil::set_errno_if_required(EDOM); diff --git a/libc/src/math/generic/sincosf.cpp b/libc/src/math/generic/sincosf.cpp index 623ef636afb1e..9c7bf181e485e 100644 --- a/libc/src/math/generic/sincosf.cpp +++ b/libc/src/math/generic/sincosf.cpp @@ -145,6 +145,12 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) { // x is inf or nan. 
if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + *sinp = *cosp = FPBits::quiet_nan().get_val(); + return; + } + if (x_abs == 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp index d27ce843a2c92..38ea56f5f28c6 100644 --- a/libc/src/math/generic/sinf.cpp +++ b/libc/src/math/generic/sinf.cpp @@ -136,6 +136,11 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) { #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (x_abs == 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/sinf16.cpp b/libc/src/math/generic/sinf16.cpp index 85e55a614588a..28debbd52a9a5 100644 --- a/libc/src/math/generic/sinf16.cpp +++ b/libc/src/math/generic/sinf16.cpp @@ -87,6 +87,11 @@ LLVM_LIBC_FUNCTION(float16, sinf16, (float16 x)) { } if (xbits.is_inf_or_nan()) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (xbits.is_inf()) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/sinpif.cpp b/libc/src/math/generic/sinpif.cpp index f572ded06b25a..492689d594d90 100644 --- a/libc/src/math/generic/sinpif.cpp +++ b/libc/src/math/generic/sinpif.cpp @@ -83,6 +83,11 @@ LLVM_LIBC_FUNCTION(float, sinpif, (float x)) { // check for NaN values if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (x_abs == 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/sinpif16.cpp b/libc/src/math/generic/sinpif16.cpp index 51ea595653b4d..68af484a6c5d3 100644 --- a/libc/src/math/generic/sinpif16.cpp +++ b/libc/src/math/generic/sinpif16.cpp @@ -50,6 +50,10 @@ LLVM_LIBC_FUNCTION(float16, sinpif16, (float16 x)) { if (LIBC_UNLIKELY(x_abs >= 0x6400)) { // Check for NaN or infinity values if (LIBC_UNLIKELY(x_abs >= 0x7c00)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } // If value is equal to infinity if (x_abs == 0x7c00) { fputil::set_errno_if_required(EDOM); diff --git a/libc/src/math/generic/tan.cpp b/libc/src/math/generic/tan.cpp index a899a2128d384..89b812cfc23a0 100644 --- a/libc/src/math/generic/tan.cpp +++ b/libc/src/math/generic/tan.cpp @@ -163,6 +163,10 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) { } else { // Inf or NaN if (LIBC_UNLIKELY(x_e > 2 * FPBits::EXP_BIAS)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } // tan(+-Inf) = NaN if (xbits.get_mantissa() == 0) { fputil::set_errno_if_required(EDOM); diff --git a/libc/src/math/generic/tanf.cpp b/libc/src/math/generic/tanf.cpp index a15aa9796cbd8..ca5e35dca4c91 100644 --- a/libc/src/math/generic/tanf.cpp +++ b/libc/src/math/generic/tanf.cpp @@ -113,6 +113,11 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) { if (LIBC_UNLIKELY(x_abs > 0x4d56'd354U)) { // Inf or NaN if (LIBC_UNLIKELY(x_abs >= 0x7f80'0000U)) { + if (xbits.is_signaling_nan()) { + 
fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + if (x_abs == 0x7f80'0000U) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/src/math/generic/tanf16.cpp b/libc/src/math/generic/tanf16.cpp index 97d201b65bbe6..229f4a363670b 100644 --- a/libc/src/math/generic/tanf16.cpp +++ b/libc/src/math/generic/tanf16.cpp @@ -84,6 +84,10 @@ LLVM_LIBC_FUNCTION(float16, tanf16, (float16 x)) { // tan(+/-inf) = NaN, and tan(NaN) = NaN if (LIBC_UNLIKELY(x_abs >= 0x7c00)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } // x = +/-inf if (x_abs == 0x7c00) { fputil::set_errno_if_required(EDOM); diff --git a/libc/src/math/generic/tanpif16.cpp b/libc/src/math/generic/tanpif16.cpp index 71cf25c9741a1..792d405b1bb9e 100644 --- a/libc/src/math/generic/tanpif16.cpp +++ b/libc/src/math/generic/tanpif16.cpp @@ -63,6 +63,11 @@ LLVM_LIBC_FUNCTION(float16, tanpif16, (float16 x)) { if (LIBC_UNLIKELY(x_abs >= 0x6400)) { // Check for NaN or infinity values if (LIBC_UNLIKELY(x_abs >= 0x7c00)) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + // is inf if (x_abs == 0x7c00) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index bf6999d5d5649..223d1933bca38 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -4262,6 +4262,7 @@ add_fp_unittest( SRCS pow_test.cpp DEPENDS + libc.src.errno.errno libc.hdr.fenv_macros libc.src.math.pow ) diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp index e5d56c70f2722..74f68e00011aa 100644 --- a/libc/test/src/math/smoke/acosf_test.cpp +++ b/libc/test/src/math/smoke/acosf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcosfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acosf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acosf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp index c4e88259919c3..c5ba88055ac57 100644 --- a/libc/test/src/math/smoke/acoshf_test.cpp +++ b/libc/test/src/math/smoke/acoshf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acoshf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acoshf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp index ce1576e2b57df..d817d2b366192 100644 --- a/libc/test/src/math/smoke/asinf_test.cpp +++ b/libc/test/src/math/smoke/asinf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinhf_test.cpp 
b/libc/test/src/math/smoke/asinhf_test.cpp index 5b83ce6466113..4a8743c50075f 100644 --- a/libc/test/src/math/smoke/asinhf_test.cpp +++ b/libc/test/src/math/smoke/asinhf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinhf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinhf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atan2_test.cpp b/libc/test/src/math/smoke/atan2_test.cpp index 1606c3f378cb8..a79845fa0303d 100644 --- a/libc/test/src/math/smoke/atan2_test.cpp +++ b/libc/test/src/math/smoke/atan2_test.cpp @@ -13,6 +13,18 @@ using LlvmLibcAtan2Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtan2Test, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2(sNaN, sNaN), + FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2(sNaN, 1.0), + FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2(1.0, sNaN), + FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(aNaN, zero)); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan2(1.0, aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(0.0, LIBC_NAMESPACE::atan2(zero, zero)); diff --git a/libc/test/src/math/smoke/atan2f_test.cpp b/libc/test/src/math/smoke/atan2f_test.cpp index 94ec18d8f6b14..1fbcfbe96b2d7 100644 --- a/libc/test/src/math/smoke/atan2f_test.cpp +++ b/libc/test/src/math/smoke/atan2f_test.cpp @@ -18,6 +18,18 @@ using LlvmLibcAtan2fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtan2fTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2f(sNaN, sNaN), + FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2f(sNaN, 1.0f), + FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2f(1.0f, sNaN), + FE_INVALID); + EXPECT_MATH_ERRNO(0); + // TODO: Strengthen errno,exception checks and remove these assert macros // after new matchers/test fixtures are added see: // https://github.com/llvm/llvm-project/issues/90653. diff --git a/libc/test/src/math/smoke/atan_test.cpp b/libc/test/src/math/smoke/atan_test.cpp index b83f315ec78fa..6576db9401c60 100644 --- a/libc/test/src/math/smoke/atan_test.cpp +++ b/libc/test/src/math/smoke/atan_test.cpp @@ -13,10 +13,10 @@ using LlvmLibcAtanTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atan(aNaN)); - // atan(sNaN) = aNaN. - EXPECT_EQ(FPBits(aNaN).uintval(), - FPBits(LIBC_NAMESPACE::atan(sNaN)).uintval()); EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::atan(zero)); EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::atan(neg_zero)); // atan(+-Inf) = +- pi/2. 
diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp index 346b8e8abd199..7d09a28beaa38 100644 --- a/libc/test/src/math/smoke/atanf_test.cpp +++ b/libc/test/src/math/smoke/atanf_test.cpp @@ -19,6 +19,8 @@ using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); // TODO: Strengthen errno,exception checks and remove these assert macros // after new matchers/test fixtures are added diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp index 8300b47ea9a31..73a5b81b0240b 100644 --- a/libc/test/src/math/smoke/atanhf_test.cpp +++ b/libc/test/src/math/smoke/atanhf_test.cpp @@ -21,7 +21,8 @@ using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; - + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanhf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); // TODO: Strengthen errno,exception checks and remove these assert macros // after new matchers/test fixtures are added, see: // https://github.com/llvm/llvm-project/issues/90653 diff --git a/libc/test/src/math/smoke/cbrt_test.cpp b/libc/test/src/math/smoke/cbrt_test.cpp index 092e6dd1aeed3..9218f0f4092a7 100644 --- a/libc/test/src/math/smoke/cbrt_test.cpp +++ b/libc/test/src/math/smoke/cbrt_test.cpp @@ -15,6 +15,9 @@ using LlvmLibcCbrtTest = LIBC_NAMESPACE::testing::FPTest; using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcCbrtTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cbrt(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cbrt(aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::cbrt(inf)); EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, LIBC_NAMESPACE::cbrt(neg_inf)); diff --git a/libc/test/src/math/smoke/cbrtf_test.cpp b/libc/test/src/math/smoke/cbrtf_test.cpp index 202a5ce073358..5dcdf61dd9bff 100644 --- a/libc/test/src/math/smoke/cbrtf_test.cpp +++ b/libc/test/src/math/smoke/cbrtf_test.cpp @@ -15,6 +15,9 @@ using LlvmLibcCbrtfTest = LIBC_NAMESPACE::testing::FPTest; using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcCbrtfTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cbrtf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cbrtf(aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::cbrtf(inf)); EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, LIBC_NAMESPACE::cbrtf(neg_inf)); diff --git a/libc/test/src/math/smoke/cos_test.cpp b/libc/test/src/math/smoke/cos_test.cpp index 88d8ead1af992..427d2c484302f 100644 --- a/libc/test/src/math/smoke/cos_test.cpp +++ b/libc/test/src/math/smoke/cos_test.cpp @@ -15,6 +15,9 @@ using LlvmLibcCosTest = LIBC_NAMESPACE::testing::FPTest; using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcCosTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cos(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cos(aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cos(inf)); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cos(neg_inf)); diff --git a/libc/test/src/math/smoke/cosf16_test.cpp b/libc/test/src/math/smoke/cosf16_test.cpp index 9a51d1015da34..2638551fb1d1b 100644 --- a/libc/test/src/math/smoke/cosf16_test.cpp +++ b/libc/test/src/math/smoke/cosf16_test.cpp @@ -16,6 +16,9 @@ using 
LlvmLibcCosf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCosf16Test, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cosf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp index 2e261f9fac3c0..99773583dcb10 100644 --- a/libc/test/src/math/smoke/cosf_test.cpp +++ b/libc/test/src/math/smoke/cosf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCosfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cosf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp index fd1556b10116d..1611ea1b92926 100644 --- a/libc/test/src/math/smoke/coshf_test.cpp +++ b/libc/test/src/math/smoke/coshf_test.cpp @@ -21,6 +21,9 @@ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::coshf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::coshf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/cospif16_test.cpp b/libc/test/src/math/smoke/cospif16_test.cpp index 135267ab2ae6f..edd8ed97b30f6 100644 --- a/libc/test/src/math/smoke/cospif16_test.cpp +++ b/libc/test/src/math/smoke/cospif16_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcCospif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCospif16Test, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/cospif_test.cpp b/libc/test/src/math/smoke/cospif_test.cpp index bf6d86bcfe623..20153897dc459 100644 --- a/libc/test/src/math/smoke/cospif_test.cpp +++ b/libc/test/src/math/smoke/cospif_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCospifTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/erff_test.cpp b/libc/test/src/math/smoke/erff_test.cpp index 7d2c1013752c7..a9f4994d77bb4 100644 --- a/libc/test/src/math/smoke/erff_test.cpp +++ b/libc/test/src/math/smoke/erff_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcErffTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcErffTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::erff(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::erff(aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(1.0f, LIBC_NAMESPACE::erff(inf)); EXPECT_FP_EQ_ALL_ROUNDING(-1.0f, LIBC_NAMESPACE::erff(neg_inf)); diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp index ca9fc359edeb5..baf8a76810970 100644 --- a/libc/test/src/math/smoke/exp10_test.cpp +++ b/libc/test/src/math/smoke/exp10_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcExp10Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10Test, 
SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp10(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::exp10(inf)); EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp10(neg_inf)); diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp index bcbfc96efd726..bf39e2cc12d0c 100644 --- a/libc/test/src/math/smoke/exp10f_test.cpp +++ b/libc/test/src/math/smoke/exp10f_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10f(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10f(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/exp10m1f_test.cpp b/libc/test/src/math/smoke/exp10m1f_test.cpp index 9c65a38425d77..2c2cfdbb08a3f 100644 --- a/libc/test/src/math/smoke/exp10m1f_test.cpp +++ b/libc/test/src/math/smoke/exp10m1f_test.cpp @@ -16,6 +16,9 @@ using LlvmLibcExp10m1fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_EQ(FPBits(aNaN).uintval(), FPBits(LIBC_NAMESPACE::exp10m1f(aNaN)).uintval()); EXPECT_EQ(FPBits(neg_aNaN).uintval(), diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp index d97a384367a09..9ab9129416dad 100644 --- a/libc/test/src/math/smoke/exp2_test.cpp +++ b/libc/test/src/math/smoke/exp2_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcExp2Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2Test, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp2(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::exp2(inf)); EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp2(neg_inf)); diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp index d9cdecbf0fe9b..a928389cc41b4 100644 --- a/libc/test/src/math/smoke/exp2f_test.cpp +++ b/libc/test/src/math/smoke/exp2f_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2f(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2f(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/exp2m1f_test.cpp b/libc/test/src/math/smoke/exp2m1f_test.cpp index 4657d088f07a8..99bdf0035df0c 100644 --- a/libc/test/src/math/smoke/exp2m1f_test.cpp +++ b/libc/test/src/math/smoke/exp2m1f_test.cpp @@ -18,6 +18,9 @@ using LIBC_NAMESPACE::fputil::testing::RoundingMode; TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2m1f(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2m1f(aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp2m1f(inf)); EXPECT_FP_EQ_ALL_ROUNDING(-1.0f, LIBC_NAMESPACE::exp2m1f(neg_inf)); diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp index d2467ff883896..f86243092f1fb 100644 --- a/libc/test/src/math/smoke/exp_test.cpp +++ 
b/libc/test/src/math/smoke/exp_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcExpTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::exp(inf)); EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp(neg_inf)); diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp index 11181ed1402c9..eee8304999275 100644 --- a/libc/test/src/math/smoke/expf_test.cpp +++ b/libc/test/src/math/smoke/expf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp index cebd2d757606b..bc71c53abc7ac 100644 --- a/libc/test/src/math/smoke/expm1_test.cpp +++ b/libc/test/src/math/smoke/expm1_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcExpm1Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpm1Test, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expm1(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expm1(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::expm1(inf)); EXPECT_FP_EQ_ALL_ROUNDING(-1.0, LIBC_NAMESPACE::expm1(neg_inf)); diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp index f4138aa05ba7e..dfb474d70fb6a 100644 --- a/libc/test/src/math/smoke/expm1f_test.cpp +++ b/libc/test/src/math/smoke/expm1f_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expm1f(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expm1f(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp index 9f159f282aad8..ff73850c52101 100644 --- a/libc/test/src/math/smoke/log10_test.cpp +++ b/libc/test/src/math/smoke/log10_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcLog10Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog10Test, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log10(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::log10(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::log10(inf)); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::log10(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/log10f_test.cpp b/libc/test/src/math/smoke/log10f_test.cpp index 4e3bf654ca918..a63822140e9b9 100644 --- a/libc/test/src/math/smoke/log10f_test.cpp +++ b/libc/test/src/math/smoke/log10f_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcLog10fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog10fTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log10f(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::log10f(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::log10f(inf)); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::log10f(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp index 
b98c0f26a8bca..631c24b8abcf9 100644 --- a/libc/test/src/math/smoke/log1p_test.cpp +++ b/libc/test/src/math/smoke/log1p_test.cpp @@ -16,6 +16,9 @@ using LlvmLibcLog1pTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog1pTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log1p(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::log1p(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::log1p(inf)); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::log1p(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp index 1b0a1d589e684..bd828ad58c4c9 100644 --- a/libc/test/src/math/smoke/log1pf_test.cpp +++ b/libc/test/src/math/smoke/log1pf_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcLog1pfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog1pfTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log1pf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::log1pf(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::log1pf(inf)); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::log1pf(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp index 1570d60556df2..9993d442967cb 100644 --- a/libc/test/src/math/smoke/log2_test.cpp +++ b/libc/test/src/math/smoke/log2_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcLog2Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog2Test, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log2(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::log2(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::log2(inf)); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::log2(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp index 67b2c5b2db13d..8648b75b88b83 100644 --- a/libc/test/src/math/smoke/log2f_test.cpp +++ b/libc/test/src/math/smoke/log2f_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcLog2fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog2fTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log2f(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::log2f(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::log2f(inf)); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::log2f(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp index 20b974d7e167d..d31eb0c1db734 100644 --- a/libc/test/src/math/smoke/log_test.cpp +++ b/libc/test/src/math/smoke/log_test.cpp @@ -18,6 +18,9 @@ using LlvmLibcLogTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLogTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::log(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::log(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::log(inf)); EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::log(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/logf_test.cpp b/libc/test/src/math/smoke/logf_test.cpp index 1a3102ae2b141..faba50e9b240c 100644 --- a/libc/test/src/math/smoke/logf_test.cpp +++ b/libc/test/src/math/smoke/logf_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcLogfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLogfTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::logf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::logf(aNaN)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::logf(inf)); 
EXPECT_FP_IS_NAN_WITH_EXCEPTION(LIBC_NAMESPACE::logf(neg_inf), FE_INVALID); diff --git a/libc/test/src/math/smoke/pow_test.cpp b/libc/test/src/math/smoke/pow_test.cpp index f9db7f102962b..b27134aca45d8 100644 --- a/libc/test/src/math/smoke/pow_test.cpp +++ b/libc/test/src/math/smoke/pow_test.cpp @@ -29,7 +29,33 @@ TEST_F(LlvmLibcPowTest, SpecialNumbers) { if (!__r.success) continue; + // pow( sNaN, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(sNaN, sNaN), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::pow(sNaN, NEG_ODD_INTEGER), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::pow(sNaN, NEG_EVEN_INTEGER), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::pow(sNaN, POS_ODD_INTEGER), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::pow(sNaN, POS_EVEN_INTEGER), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(sNaN, ONE_HALF), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(sNaN, zero), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(sNaN, neg_zero), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(sNaN, inf), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(sNaN, neg_inf), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(sNaN, aNaN), + FE_INVALID); + // pow( 0.0, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(zero, sNaN), + FE_INVALID); EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::pow(zero, NEG_ODD_INTEGER), FE_DIVBYZERO); EXPECT_FP_EQ_WITH_EXCEPTION( @@ -48,6 +74,8 @@ TEST_F(LlvmLibcPowTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::pow(zero, aNaN)); // pow( -0.0, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(neg_zero, sNaN), + FE_INVALID); EXPECT_FP_EQ_WITH_EXCEPTION( neg_inf, LIBC_NAMESPACE::pow(neg_zero, NEG_ODD_INTEGER), FE_DIVBYZERO); EXPECT_FP_EQ_WITH_EXCEPTION( @@ -66,6 +94,8 @@ TEST_F(LlvmLibcPowTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::pow(neg_zero, aNaN)); // pow( 1.0, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(1.0, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(1.0, zero)); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(1.0, neg_zero)); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(1.0, 1.0)); @@ -80,7 +110,9 @@ TEST_F(LlvmLibcPowTest, SpecialNumbers) { EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(1.0, neg_inf)); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(1.0, aNaN)); - // pow( 1.0, exponent ) + // pow( -1.0, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(-1.0, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(-1.0, zero)); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(-1.0, neg_zero)); EXPECT_FP_EQ(-1.0, LIBC_NAMESPACE::pow(-1.0, 1.0)); @@ -98,6 +130,8 @@ TEST_F(LlvmLibcPowTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::pow(-1.0, aNaN)); // pow( inf, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(inf, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(inf, zero)); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(inf, neg_zero)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::pow(inf, 1.0)); @@ -114,6 +148,8 @@ TEST_F(LlvmLibcPowTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::pow(inf, aNaN)); // pow( -inf, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(neg_inf, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(neg_inf, zero)); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(neg_inf, neg_zero)); 
EXPECT_FP_EQ(neg_inf, LIBC_NAMESPACE::pow(neg_inf, 1.0)); @@ -130,6 +166,8 @@ TEST_F(LlvmLibcPowTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::pow(neg_inf, aNaN)); // pow ( aNaN, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::pow(aNaN, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(aNaN, zero)); EXPECT_FP_EQ(1.0, LIBC_NAMESPACE::pow(aNaN, neg_zero)); EXPECT_FP_IS_NAN(LIBC_NAMESPACE::pow(aNaN, 1.0)); diff --git a/libc/test/src/math/smoke/powf_test.cpp b/libc/test/src/math/smoke/powf_test.cpp index 9cc95ce0baef9..0d1a650385fbd 100644 --- a/libc/test/src/math/smoke/powf_test.cpp +++ b/libc/test/src/math/smoke/powf_test.cpp @@ -32,7 +32,33 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { if (!__r.success) continue; + // pow( sNaN, exponent) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(sNaN, sNaN), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::powf(sNaN, neg_odd_integer), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::powf(sNaN, neg_even_integer), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::powf(sNaN, pos_odd_integer), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION( + aNaN, LIBC_NAMESPACE::powf(sNaN, pos_even_integer), FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(sNaN, one_half), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(sNaN, zero), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(sNaN, neg_zero), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(sNaN, inf), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(sNaN, neg_inf), + FE_INVALID); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(sNaN, aNaN), + FE_INVALID); + // pow( 0.0f, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(zero, sNaN), + FE_INVALID); EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::powf(zero, neg_odd_integer), FE_DIVBYZERO); EXPECT_FP_EQ_WITH_EXCEPTION( @@ -51,6 +77,8 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::powf(zero, aNaN)); // pow( -0.0f, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(neg_zero, sNaN), + FE_INVALID); EXPECT_FP_EQ_WITH_EXCEPTION( neg_inf, LIBC_NAMESPACE::powf(neg_zero, neg_odd_integer), FE_DIVBYZERO); EXPECT_FP_EQ_WITH_EXCEPTION( @@ -69,6 +97,8 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::powf(neg_zero, aNaN)); // pow( 1.0f, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(1.0f, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(1.0f, zero)); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(1.0f, neg_zero)); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(1.0f, 1.0f)); @@ -83,7 +113,9 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(1.0f, neg_inf)); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(1.0f, aNaN)); - // pow( 1.0f, exponent ) + // pow( -1.0f, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(-1.0f, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(-1.0f, zero)); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(-1.0f, neg_zero)); EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::powf(-1.0f, 1.0f)); @@ -101,6 +133,8 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::powf(-1.0f, aNaN)); // pow( inf, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(inf, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(inf, zero)); EXPECT_FP_EQ(1.0f, 
LIBC_NAMESPACE::powf(inf, neg_zero)); EXPECT_FP_EQ(inf, LIBC_NAMESPACE::powf(inf, 1.0f)); @@ -117,6 +151,8 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::powf(inf, aNaN)); // pow( -inf, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(neg_inf, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(neg_inf, zero)); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(neg_inf, neg_zero)); EXPECT_FP_EQ(neg_inf, LIBC_NAMESPACE::powf(neg_inf, 1.0f)); @@ -133,6 +169,8 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_IS_NAN(LIBC_NAMESPACE::powf(neg_inf, aNaN)); // pow ( aNaN, exponent ) + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(aNaN, sNaN), + FE_INVALID); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(aNaN, zero)); EXPECT_FP_EQ(1.0f, LIBC_NAMESPACE::powf(aNaN, neg_zero)); EXPECT_FP_IS_NAN(LIBC_NAMESPACE::powf(aNaN, 1.0f)); @@ -160,6 +198,8 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_EQ(zero, LIBC_NAMESPACE::powf(-1.1f, neg_inf)); // Exact powers of 2: + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(2.0f, sNaN), + FE_INVALID); EXPECT_FP_EQ(0x1.0p15f, LIBC_NAMESPACE::powf(2.0f, 15.0f)); EXPECT_FP_EQ(0x1.0p126f, LIBC_NAMESPACE::powf(2.0f, 126.0f)); EXPECT_FP_EQ(0x1.0p-45f, LIBC_NAMESPACE::powf(2.0f, -45.0f)); @@ -178,6 +218,8 @@ TEST_F(LlvmLibcPowfTest, SpecialNumbers) { EXPECT_FP_EQ(100000000.0f, LIBC_NAMESPACE::powf(10.0f, 8.0f)); EXPECT_FP_EQ(1000000000.0f, LIBC_NAMESPACE::powf(10.0f, 9.0f)); EXPECT_FP_EQ(10000000000.0f, LIBC_NAMESPACE::powf(10.0f, 10.0f)); + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::powf(10.0f, sNaN), + FE_INVALID); // Overflow / Underflow: if (ROUNDING_MODES[i] != RoundingMode::Downward && diff --git a/libc/test/src/math/smoke/sin_test.cpp b/libc/test/src/math/smoke/sin_test.cpp index 7dd1b7fda625b..da6d71bfcbe4c 100644 --- a/libc/test/src/math/smoke/sin_test.cpp +++ b/libc/test/src/math/smoke/sin_test.cpp @@ -15,6 +15,9 @@ using LlvmLibcSinTest = LIBC_NAMESPACE::testing::FPTest; using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcSinTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sin(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sin(aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sin(inf)); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sin(neg_inf)); diff --git a/libc/test/src/math/smoke/sincos_test.cpp b/libc/test/src/math/smoke/sincos_test.cpp index 371c0ad63cbf5..8bc584de4e8cd 100644 --- a/libc/test/src/math/smoke/sincos_test.cpp +++ b/libc/test/src/math/smoke/sincos_test.cpp @@ -15,6 +15,11 @@ using LlvmLibcSincosTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSincosTest, SpecialNumbers) { double sin_x, cos_x; + LIBC_NAMESPACE::sincos(sNaN, &sin_x, &cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, cos_x); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, sin_x); + EXPECT_MATH_ERRNO(0); + LIBC_NAMESPACE::sincos(aNaN, &sin_x, &cos_x); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, cos_x); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, sin_x); diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp index e6896ca3dc21a..5f66868f12a1c 100644 --- a/libc/test/src/math/smoke/sincosf_test.cpp +++ b/libc/test/src/math/smoke/sincosf_test.cpp @@ -21,6 +21,11 @@ TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; float sin, cos; + LIBC_NAMESPACE::sincosf(sNaN, &sin, &cos); + EXPECT_FP_EQ(aNaN, cos); + EXPECT_FP_EQ(aNaN, sin); + EXPECT_MATH_ERRNO(0); + LIBC_NAMESPACE::sincosf(aNaN, &sin, 
&cos); EXPECT_FP_EQ(aNaN, cos); EXPECT_FP_EQ(aNaN, sin); diff --git a/libc/test/src/math/smoke/sinf16_test.cpp b/libc/test/src/math/smoke/sinf16_test.cpp index 2966c3c952fd2..a0e7a7ba321fd 100644 --- a/libc/test/src/math/smoke/sinf16_test.cpp +++ b/libc/test/src/math/smoke/sinf16_test.cpp @@ -16,6 +16,9 @@ using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinf16Test, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp index 776c66dcb37bd..de504b4f5335c 100644 --- a/libc/test/src/math/smoke/sinf_test.cpp +++ b/libc/test/src/math/smoke/sinf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp index 3cc0656967581..e22cfc7ea14d8 100644 --- a/libc/test/src/math/smoke/sinhf_test.cpp +++ b/libc/test/src/math/smoke/sinhf_test.cpp @@ -21,6 +21,9 @@ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinhfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinhf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinhf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinpif16_test.cpp b/libc/test/src/math/smoke/sinpif16_test.cpp index a79fd5281ee68..b2db6fb9f8626 100644 --- a/libc/test/src/math/smoke/sinpif16_test.cpp +++ b/libc/test/src/math/smoke/sinpif16_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinpif16Test, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinpif_test.cpp b/libc/test/src/math/smoke/sinpif_test.cpp index 11bda0b6b28cc..1ba5c1d2b720a 100644 --- a/libc/test/src/math/smoke/sinpif_test.cpp +++ b/libc/test/src/math/smoke/sinpif_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcSinpifTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinpifTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tan_test.cpp b/libc/test/src/math/smoke/tan_test.cpp index aa5c23d65886d..6538990526753 100644 --- a/libc/test/src/math/smoke/tan_test.cpp +++ b/libc/test/src/math/smoke/tan_test.cpp @@ -15,6 +15,9 @@ using LlvmLibcTanTest = LIBC_NAMESPACE::testing::FPTest; using LIBC_NAMESPACE::testing::tlog; TEST_F(LlvmLibcTanTest, SpecialNumbers) { + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tan(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tan(aNaN)); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tan(inf)); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tan(neg_inf)); diff --git 
a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp index 39d1182ba891e..f65b9fced72c4 100644 --- a/libc/test/src/math/smoke/tanf16_test.cpp +++ b/libc/test/src/math/smoke/tanf16_test.cpp @@ -17,6 +17,9 @@ using LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanf16Test, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp index 93fbfded3f66a..178e9065f430f 100644 --- a/libc/test/src/math/smoke/tanf_test.cpp +++ b/libc/test/src/math/smoke/tanf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp index 3b7faa81dac2e..c09761ef531f2 100644 --- a/libc/test/src/math/smoke/tanhf_test.cpp +++ b/libc/test/src/math/smoke/tanhf_test.cpp @@ -20,6 +20,9 @@ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanhfTest, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanhf(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanhf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanpif16_test.cpp b/libc/test/src/math/smoke/tanpif16_test.cpp index a378cfb0a62e1..74797d1649b1a 100644 --- a/libc/test/src/math/smoke/tanpif16_test.cpp +++ b/libc/test/src/math/smoke/tanpif16_test.cpp @@ -16,6 +16,9 @@ using LlvmLibcTanpif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanpif16Test, SpecialNumbers) { LIBC_NAMESPACE::libc_errno = 0; + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanpif16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanpif16(aNaN)); EXPECT_MATH_ERRNO(0); From 35b3886382f0172aa76bfbfc8e731e46c9c11cc3 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Tue, 8 Apr 2025 15:25:28 +0200 Subject: [PATCH 0986/1029] [win/arm64] Enable tail call with inreg arguments when possible (#134671) Tail calls were disabled from callers with inreg parameters in 5dc8aeb with a fixme to check if the callee also takes an inreg parameter. The issue is that inreg parameters (which are passed in x0 or x1 for free and member functions respectively) are supposed to be returned (in x0) at the end of the function. In case of a tail call, that means the callee needs to return the same value as the caller would. 
We can check for that case, and it's not as niche as it sounds, as that's how Clang will lower one function with an sret return value calling another, such as: ``` struct T { int x; }; struct S { T foo(); T bar(); }; T S::foo() { return bar(); } // foo's sret argument will get passed directly to bar ``` Fixes #133098 --- .../Target/AArch64/AArch64ISelLowering.cpp | 23 ++++++++++++------ .../CodeGen/AArch64/arm64-windows-tailcall.ll | 24 +++++++++++++++++++ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3f42501828400..e366d7cb54490 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8703,13 +8703,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; // On Windows, "inreg" attributes signify non-aggregate indirect returns. - // In this case, it is necessary to save/restore X0 in the callee. Tail - // call opt interferes with this. So we disable tail call opt when the - // caller has an argument with "inreg" attribute. - - // FIXME: Check whether the callee also has an "inreg" argument. - if (i->hasInRegAttr()) - return false; + // In this case, it is necessary to save X0/X1 in the callee and return it + // in X0. Tail call opt may interfere with this, so we disable tail call + // opt when the caller has an "inreg" attribute -- except if the callee + // also has that attribute on the same argument, and the same value is + // passed. + if (i->hasInRegAttr()) { + unsigned ArgIdx = i - CallerF.arg_begin(); + if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx) + return false; + AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx); + if (!Attrs.hasAttribute(Attribute::InReg) || + !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() || + CLI.CB->getArgOperand(ArgIdx) != i) { + return false; + } + } } if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-tailcall.ll b/llvm/test/CodeGen/AArch64/arm64-windows-tailcall.ll index 55799d0dcb2d2..cd0a77a280aec 100644 --- a/llvm/test/CodeGen/AArch64/arm64-windows-tailcall.ll +++ b/llvm/test/CodeGen/AArch64/arm64-windows-tailcall.ll @@ -16,3 +16,27 @@ entry: } declare dso_local void @"?foo"(ptr dereferenceable(4)) + + +declare void @inreg_callee(ptr, ptr inreg sret(%class.C)) + +define void @inreg_caller_1(ptr %a, ptr inreg sret(%class.C) %b) { +; A different value is passed to the inreg parameter, so tail call is not possible. +; CHECK-LABEL: inreg_caller_1 +; CHECK: mov x19, x1 +; CHECK: bl inreg_callee +; CHECK: mov x0, x19 + + tail call void @inreg_callee(ptr %b, ptr inreg sret(%class.C) %a) + ret void +} + +define void @inreg_caller_2(ptr %a, ptr inreg sret(%class.C) %b) { +; The inreg attribute and value line up between caller and callee, so it can +; be tail called. +; CHECK-LABEL: inreg_caller_2 +; CHECK: b inreg_callee + + tail call void @inreg_callee(ptr %a, ptr inreg sret(%class.C) %b) + ret void +} From 3a6b9b3a87387289cb913d40b09f77c842bfd694 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 8 Apr 2025 15:46:01 +0200 Subject: [PATCH 0987/1029] [mlir][bazel] Fix after dae0ef53a0b99c6c2b74143baee5896e8bc5c8e7 Remove unnecessary include. 
--- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 1 - utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 7f286f938ee60..549a4376a4a04 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -25,7 +25,6 @@ #include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" #include "llvm/ADT/TypeSwitch.h" -#include "llvm/IR/DerivedTypes.h" #include #include diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 141986392917e..64b5e84547486 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -1530,6 +1530,7 @@ cc_library( ":GPUDialect", ":IR", ":InferTypeOpInterface", + ":MemRefUtils", ":ROCDLDialect", ":SideEffectInterfaces", ":ViewLikeInterface", From 0e9881745834863a6f5a3a05588886bb3eb75cdf Mon Sep 17 00:00:00 2001 From: Romaric Jodin Date: Tue, 8 Apr 2025 15:50:26 +0200 Subject: [PATCH 0988/1029] libclc: frexp: fix implementation regarding denormals (#134823) On devices that do not support denormals, a denormal input can compare equal to zero. This leads to results that match neither CTS expectation, whether denormals are supported or not. For example, for 0x1.008p-140 we get {0x1.008p-140, 0}, while the CTS expects {0x1.008p-1, -139} when denormals are supported, or {0, 0} when they are not (flushed to zero). Ref #129871 --- libclc/clc/lib/generic/math/clc_frexp.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclc/clc/lib/generic/math/clc_frexp.inc b/libclc/clc/lib/generic/math/clc_frexp.inc index 640d02cb3209d..d212b6a1b3376 100644 --- a/libclc/clc/lib/generic/math/clc_frexp.inc +++ b/libclc/clc/lib/generic/math/clc_frexp.inc @@ -26,7 +26,7 @@ __clc_frexp(__CLC_GENTYPE x, __CLC_ADDRESS_SPACE __CLC_INTN *ep) { (ai & (__CLC_INTN)MANTBITS_SP32); __CLC_INTN is_inf_nan_or_zero = - x == __CLC_FP_LIT(0.0) || __clc_isinf(x) || __clc_isnan(x); + ai == (__CLC_INTN)0 || __clc_isinf(x) || __clc_isnan(x); *ep = __clc_select(e, (__CLC_INTN)0, is_inf_nan_or_zero); return __clc_select(__CLC_AS_GENTYPE(i), x, is_inf_nan_or_zero); } From f19c6f23abefac56fde9f2b393c9ffa7595c86c6 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Tue, 8 Apr 2025 09:57:43 -0400 Subject: [PATCH 0989/1029] [Clang][AMDGPU] Improve error message when device libraries for COV6 are missing (#134745) #130963 switches the default to COV6, which requires ROCm 6.3. Currently, if the device libraries for COV6 are not found, the error message is not very helpful. This PR provides a more informative error message in such cases.
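As an editor's illustration (not part of the patch), here is a minimal standalone C++ sketch of the version check that picks between the two diagnostic forms. `DeviceLibABIVersion` here is a simplified stand-in for the class in clang/lib/Driver/ToolChains/ROCm.h shown in the diff below, and the printed strings abbreviate the actual diagnostic text:

```
#include <cassert>
#include <cstdio>

// Simplified stand-in for clang's DeviceLibABIVersion.
struct DeviceLibABIVersion {
  unsigned ABIVersion = 0;
  unsigned getAsCodeObjectVersion() const {
    assert(ABIVersion % 100 == 0 && "Not supported");
    return ABIVersion / 100;
  }
};

int main() {
  DeviceLibABIVersion ABIVer{600};
  // Starting from COV6, the diagnostic also reports the minimum ROCm version.
  if (ABIVer.getAsCodeObjectVersion() < 6)
    std::printf("cannot find ROCm device library for ABI version %u; ...\n",
                ABIVer.getAsCodeObjectVersion());
  else
    std::printf("cannot find ROCm device library for ABI version %u, which "
                "requires ROCm 6.3 or higher; ...\n",
                ABIVer.getAsCodeObjectVersion());
  return 0;
}
```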
--- clang/include/clang/Basic/DiagnosticDriverKinds.td | 3 ++- clang/lib/Driver/ToolChains/AMDGPU.cpp | 8 +++++++- clang/lib/Driver/ToolChains/ROCm.h | 6 ++++-- clang/test/Driver/hip-device-libs.hip | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index df24cca49aaae..a96b4fb33390c 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -67,7 +67,8 @@ def err_drv_no_cuda_libdevice : Error< "libdevice">; def err_drv_no_rocm_device_lib : Error< - "cannot find ROCm device library%select{| for %1| for ABI version %1}0; provide its path via " + "cannot find ROCm device library%select{| for %1| for ABI version %1" + "%select{|, which requires ROCm %3 or higher}2}0; provide its path via " "'--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build " "without ROCm device library">; def err_drv_no_hip_runtime : Error< diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index dffc70d5e5b69..29c84037b523a 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -935,7 +935,13 @@ bool RocmInstallationDetector::checkCommonBitcodeLibs( return false; } if (ABIVer.requiresLibrary() && getABIVersionPath(ABIVer).empty()) { - D.Diag(diag::err_drv_no_rocm_device_lib) << 2 << ABIVer.toString(); + // Starting from COV6, we will report minimum ROCm version requirement in + // the error message. + if (ABIVer.getAsCodeObjectVersion() < 6) + D.Diag(diag::err_drv_no_rocm_device_lib) << 2 << ABIVer.toString() << 0; + else + D.Diag(diag::err_drv_no_rocm_device_lib) + << 2 << ABIVer.toString() << 1 << "6.3"; return false; } return true; diff --git a/clang/lib/Driver/ToolChains/ROCm.h b/clang/lib/Driver/ToolChains/ROCm.h index a6cc41db383b6..1ba0f1b9f30d6 100644 --- a/clang/lib/Driver/ToolChains/ROCm.h +++ b/clang/lib/Driver/ToolChains/ROCm.h @@ -37,9 +37,11 @@ struct DeviceLibABIVersion { /// and below works with ROCm 5.0 and below which does not have /// abi_version_*.bc. Code object v5 requires abi_version_500.bc. 
bool requiresLibrary() { return ABIVersion >= 500; } - std::string toString() { + std::string toString() { return Twine(getAsCodeObjectVersion()).str(); } + + unsigned getAsCodeObjectVersion() const { assert(ABIVersion % 100 == 0 && "Not supported"); - return Twine(ABIVersion / 100).str(); + return ABIVersion / 100; } }; diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip index c7cafd0027bc5..b123f741bdee5 100644 --- a/clang/test/Driver/hip-device-libs.hip +++ b/clang/test/Driver/hip-device-libs.hip @@ -254,4 +254,4 @@ // NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_400.bc" // NOABI4-NOT: "-mlink-builtin-bitcode" "{{.*}}oclc_abi_version_500.bc" // NOABI5: error: cannot find ROCm device library for ABI version 5; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library -// NOABI6: error: cannot find ROCm device library for ABI version 6; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library +// NOABI6: error: cannot find ROCm device library for ABI version 6, which requires ROCm 6.3 or higher; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library From b2dea4fd22b79fa27ef1ebd737401616095a7de6 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Tue, 8 Apr 2025 06:59:38 -0700 Subject: [PATCH 0990/1029] [ctxprof] root autodetection mechanism (#133147) This is an optional mechanism that automatically detects roots. It's a best-effort mechanism, and its main goal is to *avoid* pointing at the message pump function as a root. This is the function that polls message queue(s) in an infinite loop, and is thus a bad root (it never exits). At a high level, when collection is requested - which should happen when a server has already been set up and is handling requests - we spend a bit of time sampling all the server's threads. Each sample is a stack which we insert in a `PerThreadCallsiteTrie`. After a while, we run the root detection logic for each `PerThreadCallsiteTrie`. We then traverse all the `FunctionData`, find the ones matching the detected roots, and allocate a `ContextRoot` for them. From here, we special-case `FunctionData` objects, in `__llvm_ctx_profile_get_context`, that have a `CtxRoot`, and route them to `__llvm_ctx_profile_start_context`. For this to work, on the llvm side, we need to have all functions call `__llvm_ctx_profile_release_context` because they _might_ be roots. This comes at a slight penalty (a few percent) during collection - which we can afford since the overall technique is ~5x faster than normal instrumentation. We can later explore conditionally enabling autoroot detection and avoiding this penalty, if desired. Note that functions that `musttail call` can't have their return instrumented this way, and a subsequent patch will harden the mechanism against this case. The mechanism could be used in combination with explicit root specification, too.
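As a usage sketch (an editor's addition, not part of the patch; `collectWithAutodetectedRoots` and the 5-second duration are illustrative), the only new API surface is the `AutodetectDuration` parameter, and the declarations below mirror those in the autodetect-roots.cpp test added further down:

```
#include "CtxInstrContextNode.h"

using namespace llvm::ctx_profile;

// Runtime entry points, declared as in the autodetect-roots.cpp test below.
extern "C" void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration);
extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);

// Writer is any concrete ProfileWriter implementation (e.g. the test's
// TestProfileWriter).
void collectWithAutodetectedRoots(ProfileWriter &Writer) {
  // Non-zero duration: sample all threads for ~5 seconds to detect roots;
  // collection then proceeds with those roots, with no explicit list needed.
  __llvm_ctx_profile_start_collection(/*AutodetectDuration=*/5);
  // ... let the server handle traffic for a while ...
  // Fetching joins the detector thread if it is somehow still running.
  __llvm_ctx_profile_fetch(Writer);
}
```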
--- compiler-rt/lib/ctx_profile/CMakeLists.txt | 2 +- .../lib/ctx_profile/CtxInstrContextNode.h | 1 + .../lib/ctx_profile/CtxInstrProfiling.cpp | 79 ++++++-- .../lib/ctx_profile/CtxInstrProfiling.h | 2 +- .../lib/ctx_profile/RootAutoDetector.cpp | 94 +++++++++ .../lib/ctx_profile/RootAutoDetector.h | 43 ++++ .../TestCases/autodetect-roots.cpp | 188 ++++++++++++++++++ .../TestCases/generate-context.cpp | 5 +- .../llvm/ProfileData/CtxInstrContextNode.h | 1 + .../Instrumentation/PGOCtxProfLowering.cpp | 26 ++- .../PGOProfile/ctx-instrumentation.ll | 50 ++++- 11 files changed, 449 insertions(+), 42 deletions(-) create mode 100644 compiler-rt/test/ctx_profile/TestCases/autodetect-roots.cpp diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt index bb606449c61b1..446ebc96408dd 100644 --- a/compiler-rt/lib/ctx_profile/CMakeLists.txt +++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt @@ -27,7 +27,7 @@ endif() add_compiler_rt_runtime(clang_rt.ctx_profile STATIC ARCHS ${CTX_PROFILE_SUPPORTED_ARCH} - OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc + OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc RTSanitizerCommonSymbolizer CFLAGS ${EXTRA_FLAGS} SOURCES ${CTX_PROFILE_SOURCES} ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS} diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h index a42bf9ebb01ea..55423d95b3088 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h +++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h @@ -127,6 +127,7 @@ class ContextNode final { /// MUTEXDECL takes one parameter, the name of a field that is a mutex. #define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \ PTRDECL(FunctionData, Next) \ + VOLATILE_PTRDECL(void, EntryAddress) \ VOLATILE_PTRDECL(ContextRoot, CtxRoot) \ VOLATILE_PTRDECL(ContextNode, FlatCtx) \ MUTEXDECL(Mutex) diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp index e08d555c61ff7..4cf852fe3f667 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "CtxInstrProfiling.h" +#include "RootAutoDetector.h" #include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_atomic_clang.h" @@ -43,6 +44,12 @@ Arena *FlatCtxArena = nullptr; __thread bool IsUnderContext = false; __sanitizer::atomic_uint8_t ProfilingStarted = {}; +__sanitizer::atomic_uintptr_t RootDetector = {}; +RootAutoDetector *getRootDetector() { + return reinterpret_cast( + __sanitizer::atomic_load_relaxed(&RootDetector)); +} + // utility to taint a pointer by setting the LSB. 
There is an assumption // throughout that the addresses of contexts are even (really, they should be // align(8), but "even"-ness is the minimum assumption) @@ -201,7 +208,7 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint, return Ret; } -ContextNode *getFlatProfile(FunctionData &Data, GUID Guid, +ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid, uint32_t NumCounters) { if (ContextNode *Existing = Data.FlatCtx) return Existing; @@ -232,6 +239,7 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid, auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0); Data.FlatCtx = Ret; + Data.EntryAddress = Callee; Data.Next = reinterpret_cast( __sanitizer::atomic_load_relaxed(&AllFunctionsData)); while (!__sanitizer::atomic_compare_exchange_strong( @@ -296,8 +304,9 @@ ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid, return TheScratchContext; } -ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid, - uint32_t NumCounters) { +ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid, + uint32_t NumCounters, uint32_t NumCallsites, + ContextRoot *CtxRoot) { // 1) if we are currently collecting a contextual profile, fetch a ContextNode // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted` @@ -316,27 +325,32 @@ ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid, // entered once and never exit. They should be assumed to be entered before // profiling starts - because profiling should start after the server is up // and running (which is equivalent to "message pumps are set up"). - ContextRoot *R = __llvm_ctx_profile_current_context_root; - if (!R) { + if (!CtxRoot) { + if (auto *RAD = getRootDetector()) + RAD->sample(); + else if (auto *CR = Data.CtxRoot) + return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites); if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted)) return TheScratchContext; else return markAsScratch( - onContextEnter(*getFlatProfile(Data, Guid, NumCounters))); + onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters))); } - auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr}); + auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr}); if (Ins) - Iter->second = - getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0); + Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode, + NumCounters, 0); return markAsScratch(onContextEnter(*Iter->second)); } ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee, GUID Guid, uint32_t NumCounters, uint32_t NumCallsites) { + auto *CtxRoot = __llvm_ctx_profile_current_context_root; // fast "out" if we're not even doing contextual collection. - if (!__llvm_ctx_profile_current_context_root) - return getUnhandledContext(*Data, Guid, NumCounters); + if (!CtxRoot) + return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites, + nullptr); // also fast "out" if the caller is scratch. 
We can see if it's scratch by // looking at the interior pointer into the subcontexts vector that the caller @@ -345,7 +359,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee, // precisely, aligned - 8 values) auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]); if (!CallsiteContext || isScratch(CallsiteContext)) - return getUnhandledContext(*Data, Guid, NumCounters); + return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites, + CtxRoot); // if the callee isn't the expected one, return scratch. // Signal handler(s) could have been invoked at any point in the execution. @@ -363,7 +378,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee, // for that case. auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]); if (ExpectedCallee != Callee) - return getUnhandledContext(*Data, Guid, NumCounters); + return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites, + CtxRoot); auto *Callsite = *CallsiteContext; // in the case of indirect calls, we will have all seen targets forming a @@ -388,21 +404,23 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee, ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid, uint32_t Counters, uint32_t Callsites) { + return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid, Counters, Callsites); } void __llvm_ctx_profile_release_context(FunctionData *FData) SANITIZER_NO_THREAD_SAFETY_ANALYSIS { + const auto *CurrentRoot = __llvm_ctx_profile_current_context_root; + if (!CurrentRoot || FData->CtxRoot != CurrentRoot) + return; IsUnderContext = false; - if (__llvm_ctx_profile_current_context_root) { - __llvm_ctx_profile_current_context_root = nullptr; - assert(FData->CtxRoot); - FData->CtxRoot->Taken.Unlock(); - } + assert(FData->CtxRoot); + __llvm_ctx_profile_current_context_root = nullptr; + FData->CtxRoot->Taken.Unlock(); } -void __llvm_ctx_profile_start_collection() { +void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) { size_t NumMemUnits = 0; __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock( &AllContextsMutex); @@ -418,12 +436,28 @@ void __llvm_ctx_profile_start_collection() { resetContextNode(*Root->FirstUnhandledCalleeNode); __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0); } + if (AutodetectDuration) { + // We leak RD intentionally. Knowing when to free it is tricky; there's a + // race condition with functions observing the `RootDetector` as non-null. + // This can be addressed but the alternatives have some added complexity and + // it's not (yet) worth it.
+ auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector))) + RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration); + RD->start(); + } else { + __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits); + } __sanitizer::atomic_store_relaxed(&ProfilingStarted, true); - __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits); } bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) { __sanitizer::atomic_store_relaxed(&ProfilingStarted, false); + if (auto *RD = getRootDetector()) { + __sanitizer::Printf("[ctxprof] Expected the root autodetector to have " "finished well before attempting to fetch a context"); + RD->join(); + } __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock( &AllContextsMutex); @@ -448,8 +482,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) { const auto *Pos = reinterpret_cast<const FunctionData *>( __sanitizer::atomic_load_relaxed(&AllFunctionsData)); for (; Pos; Pos = Pos->Next) - Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(), - Pos->FlatCtx->counters_size()); + if (!Pos->CtxRoot) + Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(), + Pos->FlatCtx->counters_size()); Writer.endFlatSection(); return true; } diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h index 6326beaa53085..4983f086d230d 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h +++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h @@ -207,7 +207,7 @@ ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData, /// Prepares for collection. Currently this resets counter values but preserves /// internal context tree structure. -void __llvm_ctx_profile_start_collection(); +void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0); /// Completely free allocated memory. void __llvm_ctx_profile_free(); diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp index 483c55c25eefe..4aa169e202ea3 100644 --- a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp +++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp @@ -8,6 +8,7 @@ #include "RootAutoDetector.h" +#include "CtxInstrProfiling.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap) #include <dlfcn.h> @@ -17,6 +18,99 @@ using namespace __ctx_profile; template <typename T> using Set = DenseMap<T, bool>; +namespace __sanitizer { +void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context, + bool request_fast, u32 max_depth) { + // We can't implement the fast variant. The fast variant ends up invoking an + // external allocator, because of pthread_attr_getstack. If this happens + // during an allocation of the program being instrumented, a non-reentrant + // lock may be taken (this was observed). The allocator called by + // pthread_attr_getstack will also try to take that lock.
+ UnwindSlow(pc, max_depth); +} +} // namespace __sanitizer + +RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) { + GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex); + Parent.AllSamples.PushBack(this); +} + +void RootAutoDetector::start() { + atomic_store_relaxed(&Self, reinterpret_cast<uptr>(this)); + pthread_create( + &WorkerThread, nullptr, + +[](void *Ctx) -> void * { + RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx); + SleepForSeconds(RAD->WaitSeconds); + // To avoid holding the AllSamplesMutex, make a snapshot of all the + // thread samples collected so far + Vector<PerThreadSamples *> SamplesSnapshot; + { + GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex); + SamplesSnapshot.Resize(RAD->AllSamples.Size()); + for (uptr I = 0; I < RAD->AllSamples.Size(); ++I) + SamplesSnapshot[I] = RAD->AllSamples[I]; + } + DenseMap<uptr, uint64_t> AllRoots; + for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) { + GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M); + SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) { + auto [FAddr, Count] = KVP; + AllRoots[FAddr] += Count; + return true; + }); + } + // FIXME: as a next step, establish a minimum relative nr of samples + // per root that would qualify it as a root. + for (auto *FD = reinterpret_cast<FunctionData *>( + atomic_load_relaxed(&RAD->FunctionDataListHead)); + FD; FD = FD->Next) { + if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) { + FD->getOrAllocateContextRoot(); + } + } + atomic_store_relaxed(&RAD->Self, 0); + return nullptr; + }, + this); +} + +void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); } + +void RootAutoDetector::sample() { + // tracking reentry in case we want to re-explore fast stack unwind - which + // does potentially re-enter the runtime because it calls the instrumented + // allocator because of pthread_attr_getstack. See the notes also on + // UnwindImpl above. + static thread_local bool Entered = false; + static thread_local uint64_t Entries = 0; + if (Entered || (++Entries % SampleRate)) + return; + Entered = true; + collectStack(); + Entered = false; +} + +void RootAutoDetector::collectStack() { + GET_CALLER_PC_BP; + BufferedStackTrace CurrentStack; + CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false); + // 2 stack frames would be very unlikely to mean anything, since at least the + // compiler-rt frame - which can't be inlined - should be observable, which + // counts as 1; we can be even more aggressive with this number. + if (CurrentStack.size <= 2) + return; + static thread_local PerThreadSamples *ThisThreadSamples = + new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples))) + PerThreadSamples(*this); + + if (!ThisThreadSamples->M.TryLock()) + return; + + ThisThreadSamples->TrieRoot.insertStack(CurrentStack); + ThisThreadSamples->M.Unlock(); +} + uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const { // this requires --linkopt=-Wl,--export-dynamic Dl_info Info; diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.h b/compiler-rt/lib/ctx_profile/RootAutoDetector.h index 85dd5ef1c32d9..2e0307ca03c7f 100644 --- a/compiler-rt/lib/ctx_profile/RootAutoDetector.h +++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.h @@ -12,6 +12,7 @@ #include "sanitizer_common/sanitizer_dense_map.h" #include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_stacktrace.h" +#include "sanitizer_common/sanitizer_vector.h" #include #include @@ -53,5 +54,47 @@ class PerThreadCallsiteTrie { /// thread, together with the number of samples that included them.
  DenseMap<uptr, uint64_t> determineRoots() const;
 };
+
+class RootAutoDetector final {
+  // A prime number. We may want to make this configurable at collection start.
+  static const uint64_t SampleRate = 6113;
+  const unsigned WaitSeconds;
+  pthread_t WorkerThread;
+
+  struct PerThreadSamples {
+    PerThreadSamples(RootAutoDetector &Parent);
+
+    PerThreadCallsiteTrie TrieRoot;
+    SpinMutex M;
+  };
+  SpinMutex AllSamplesMutex;
+  SANITIZER_GUARDED_BY(AllSamplesMutex)
+  Vector<PerThreadSamples *> AllSamples;
+  atomic_uintptr_t &FunctionDataListHead;
+  atomic_uintptr_t &Self;
+  void collectStack();
+
+public:
+  RootAutoDetector(atomic_uintptr_t &FunctionDataListHead,
+                   atomic_uintptr_t &Self, unsigned WaitSeconds)
+      : WaitSeconds(WaitSeconds), FunctionDataListHead(FunctionDataListHead),
+        Self(Self) {}
+
+  // Samples the stack at `SampleRate` (rate observed independently on each
+  // thread) in thread local `PerThreadCallsiteTrie`s.
+  void sample();
+
+  // Start a thread that waits `WaitSeconds`, after which it uses the
+  // `PerThreadCallsiteTrie` data observed so far over all threads to determine
+  // roots. It marks those roots by traversing the linked list of FunctionData
+  // that starts at `FunctionDataListHead` and assigning their `CtxRoot`.
+  // Finally, it resets the `Self` atomic, so that other threads don't continue
+  // calling `sample`.
+  void start();
+
+  // Join the waiting thread.
+  void join();
+};
+
 } // namespace __ctx_profile
 #endif
diff --git a/compiler-rt/test/ctx_profile/TestCases/autodetect-roots.cpp b/compiler-rt/test/ctx_profile/TestCases/autodetect-roots.cpp
new file mode 100644
index 0000000000000..a6e0d920cbc82
--- /dev/null
+++ b/compiler-rt/test/ctx_profile/TestCases/autodetect-roots.cpp
@@ -0,0 +1,188 @@
+// Root autodetection test for contextual profiling
+//
+// Copy the header defining ContextNode.
+// RUN: mkdir -p %t_include
+// RUN: cp %llvm_src/include/llvm/ProfileData/CtxInstrContextNode.h %t_include/
+//
+// Compile with ctx instrumentation "on". We use -profile-context-root as a
+// signal that we want contextual profiling, but we can specify anything there
+// that won't be matched with any function, which results in the behavior we
+// are aiming for here.
+//
+// RUN: %clangxx %s %ctxprofilelib -I%t_include -O2 -o %t.bin \
+// RUN:   -mllvm -profile-context-root="" -g -Wl,-export-dynamic
+//
+// Run the binary, and observe the profile fetch handler's output.
+// RUN: %t.bin | FileCheck %s
+
+#include "CtxInstrContextNode.h"
+#include <atomic>
+#include <cstdio>
+#include <iostream>
+#include <thread>
+
+using namespace llvm::ctx_profile;
+extern "C" void __llvm_ctx_profile_start_collection(unsigned);
+extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
+
+// avoid name mangling
+extern "C" {
+__attribute__((noinline)) void anotherFunction() {}
+__attribute__((noinline)) void mock1() {}
+__attribute__((noinline)) void mock2() {}
+__attribute__((noinline)) void someFunction(int I) {
+  if (I % 2)
+    mock1();
+  else
+    mock2();
+  anotherFunction();
+}
+
+// block inlining because the pre-inliner otherwise will inline this - it's
+// too small.
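+// Keeping theRoot out of line also keeps its frame visible to the stack
+// samples the autodetector collects.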
+__attribute__((noinline)) void theRoot() { + someFunction(1); +#pragma nounroll + for (auto I = 0; I < 2; ++I) { + someFunction(I); + } + anotherFunction(); +} +} + +class TestProfileWriter : public ProfileWriter { + void printProfile(const ContextNode &Node, const std::string &Indent, + const std::string &Increment) { + std::cout << Indent << "Guid: " << Node.guid() << std::endl; + std::cout << Indent << "Entries: " << Node.entrycount() << std::endl; + std::cout << Indent << Node.counters_size() << " counters and " + << Node.callsites_size() << " callsites" << std::endl; + std::cout << Indent << "Counter values: "; + for (uint32_t I = 0U; I < Node.counters_size(); ++I) + std::cout << Node.counters()[I] << " "; + std::cout << std::endl; + for (uint32_t I = 0U; I < Node.callsites_size(); ++I) + for (const auto *N = Node.subContexts()[I]; N; N = N->next()) { + std::cout << Indent << "At Index " << I << ":" << std::endl; + printProfile(*N, Indent + Increment, Increment); + } + } + + void startContextSection() override { + std::cout << "Entered Context Section" << std::endl; + } + + void endContextSection() override { + std::cout << "Exited Context Section" << std::endl; + } + + void writeContextual(const ContextNode &RootNode, + const ContextNode *Unhandled, + uint64_t EntryCount) override { + std::cout << "Entering Root " << RootNode.guid() + << " with total entry count " << EntryCount << std::endl; + for (const auto *P = Unhandled; P; P = P->next()) + std::cout << "Unhandled GUID: " << P->guid() << " entered " + << P->entrycount() << " times" << std::endl; + printProfile(RootNode, " ", " "); + } + + void startFlatSection() override { + std::cout << "Entered Flat Section" << std::endl; + } + + void writeFlat(GUID Guid, const uint64_t *Buffer, + size_t BufferSize) override { + std::cout << "Flat: " << Guid << " " << Buffer[0]; + for (size_t I = 1U; I < BufferSize; ++I) + std::cout << "," << Buffer[I]; + std::cout << std::endl; + }; + + void endFlatSection() override { + std::cout << "Exited Flat Section" << std::endl; + } +}; + +// Guid:3950394326069683896 is anotherFunction +// Guid:6759619411192316602 is someFunction +// These are expected to be the auto-detected roots. This is because we cannot +// discern (with the current autodetection mechanism) if theRoot +// (Guid:8657661246551306189) is ever re-entered. 
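+// theRoot (Guid:8657661246551306189) is therefore expected to appear only in
+// the flat section below.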
+//
+// CHECK: Entered Context Section
+// CHECK-NEXT: Entering Root 6759619411192316602 with total entry count 12463157
+// CHECK-NEXT: Guid: 6759619411192316602
+// CHECK-NEXT: Entries: 5391142
+// CHECK-NEXT: 2 counters and 3 callsites
+// CHECK-NEXT: Counter values: 5391142 1832357
+// CHECK-NEXT: At Index 0:
+// CHECK-NEXT: Guid: 434762725428799310
+// CHECK-NEXT: Entries: 3558785
+// CHECK-NEXT: 1 counters and 0 callsites
+// CHECK-NEXT: Counter values: 3558785
+// CHECK-NEXT: At Index 1:
+// CHECK-NEXT: Guid: 5578595117440393467
+// CHECK-NEXT: Entries: 1832357
+// CHECK-NEXT: 1 counters and 0 callsites
+// CHECK-NEXT: Counter values: 1832357
+// CHECK-NEXT: At Index 2:
+// CHECK-NEXT: Guid: 3950394326069683896
+// CHECK-NEXT: Entries: 5391142
+// CHECK-NEXT: 1 counters and 0 callsites
+// CHECK-NEXT: Counter values: 5391142
+// CHECK-NEXT: Entering Root 3950394326069683896 with total entry count 11226401
+// CHECK-NEXT: Guid: 3950394326069683896
+// CHECK-NEXT: Entries: 10767423
+// CHECK-NEXT: 1 counters and 0 callsites
+// CHECK-NEXT: Counter values: 10767423
+// CHECK-NEXT: Exited Context Section
+// CHECK-NEXT: Entered Flat Section
+// CHECK-NEXT: Flat: 2597020043743142491 1
+// CHECK-NEXT: Flat: 4321328481998485159 1
+// CHECK-NEXT: Flat: 8657661246551306189 9114175,18099613
+// CHECK-NEXT: Flat: 434762725428799310 10574815
+// CHECK-NEXT: Flat: 5578595117440393467 5265754
+// CHECK-NEXT: Flat: 12566320182004153844 1
+// CHECK-NEXT: Exited Flat Section
+
+bool profileWriter() {
+  TestProfileWriter W;
+  return !__llvm_ctx_profile_fetch(W) ? false : true;
+}
+
+int main(int argc, char **argv) {
+  std::atomic<bool> Stop = false;
+  std::atomic<int> Started = 0;
+  std::thread T1([&]() {
+    ++Started;
+    while (!Stop) {
+      theRoot();
+    }
+  });
+
+  std::thread T2([&]() {
+    ++Started;
+    while (!Stop) {
+      theRoot();
+    }
+  });
+
+  std::thread T3([&]() {
+    while (Started < 2) {
+    }
+    __llvm_ctx_profile_start_collection(5);
+  });
+
+  T3.join();
+  using namespace std::chrono_literals;
+
+  std::this_thread::sleep_for(10s);
+  Stop = true;
+  T1.join();
+  T2.join();
+
+  // This would be implemented in a specific RPC handler, but here we just call
+  // it directly.
+  return !profileWriter();
+}
diff --git a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
index 3dc53637a35d8..66403f9058eeb 100644
--- a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
+++ b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
@@ -16,7 +16,8 @@
 #include <iostream>
 
 using namespace llvm::ctx_profile;
-extern "C" void __llvm_ctx_profile_start_collection();
+extern "C" void
+__llvm_ctx_profile_start_collection(unsigned AutoDetectDuration = 0);
 extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
 
 // avoid name mangling
@@ -97,7 +98,7 @@ class TestProfileWriter : public ProfileWriter {
     for (const auto *P = Unhandled; P; P = P->next())
       std::cout << "Unhandled GUID: " << P->guid() << " entered "
                 << P->entrycount() << " times" << std::endl;
-    printProfile(RootNode, "", "");
+    printProfile(RootNode, " ", " ");
   }
 
   void startFlatSection() override {
diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
index a42bf9ebb01ea..55423d95b3088 100644
--- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
+++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
@@ -127,6 +127,7 @@ class ContextNode final {
 /// MUTEXDECL takes one parameter, the name of a field that is a mutex.
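+/// EntryAddress records the function's start address, which lets the root
+/// autodetector map sampled stack frames back to their FunctionData.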
 #define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL)           \
   PTRDECL(FunctionData, Next)                                                 \
+  VOLATILE_PTRDECL(void, EntryAddress)                                        \
   VOLATILE_PTRDECL(ContextRoot, CtxRoot)                                      \
   VOLATILE_PTRDECL(ContextNode, FlatCtx)                                      \
   MUTEXDECL(Mutex)
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index 58748a19db972..2f8d7766bb588 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -219,6 +219,14 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
   Value *TheRootFuctionData = nullptr;
   Value *ExpectedCalleeTLSAddr = nullptr;
   Value *CallsiteInfoTLSAddr = nullptr;
+  const bool HasMusttail = [&F]() {
+    for (auto &BB : F)
+      for (auto &I : BB)
+        if (auto *CB = dyn_cast<CallBase>(&I))
+          if (CB->isMustTailCall())
+            return true;
+    return false;
+  }();
 
   auto &Head = F.getEntryBlock();
   for (auto &I : Head) {
@@ -243,19 +251,18 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
       //  regular function)
       // Don't set a name, they end up taking a lot of space and we don't need
       // them.
-      auto *FData = new GlobalVariable(M, FunctionDataTy, false,
-                                       GlobalVariable::InternalLinkage,
-                                       Constant::getNullValue(FunctionDataTy));
+      TheRootFuctionData = new GlobalVariable(
+          M, FunctionDataTy, false, GlobalVariable::InternalLinkage,
+          Constant::getNullValue(FunctionDataTy));
 
       if (ContextRootSet.contains(&F)) {
         Context = Builder.CreateCall(
-            StartCtx, {FData, Guid, Builder.getInt32(NumCounters),
+            StartCtx, {TheRootFuctionData, Guid, Builder.getInt32(NumCounters),
                        Builder.getInt32(NumCallsites)});
-        TheRootFuctionData = FData;
         ORE.emit(
             [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
       } else {
-        Context = Builder.CreateCall(GetCtx, {FData, &F, Guid,
+        Context = Builder.CreateCall(GetCtx, {TheRootFuctionData, &F, Guid,
                                               Builder.getInt32(NumCounters),
                                               Builder.getInt32(NumCallsites)});
         ORE.emit([&] {
@@ -339,7 +346,7 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
           break;
         }
         I.eraseFromParent();
-      } else if (TheRootFuctionData && isa<ReturnInst>(I)) {
+      } else if (!HasMusttail && isa<ReturnInst>(I)) {
         // Remember to release the context if we are an entrypoint.
         IRBuilder<> Builder(&I);
         Builder.CreateCall(ReleaseCtx, {TheRootFuctionData});
@@ -351,9 +358,10 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
   // to disallow this, (so this then stays as an error), another is to detect
   // that and then do a wrapper or disallow the tail call. This only affects
   // instrumentation, when we want to detect the call graph.
-  if (TheRootFuctionData && !ContextWasReleased)
+  if (!HasMusttail && !ContextWasReleased)
     F.getContext().emitError(
-        "[ctx_prof] An entrypoint was instrumented but it has no `ret` "
+        "[ctx_prof] A function that doesn't have musttail calls was "
+        "instrumented but it has no `ret` "
        "instructions above which to release the context: " +
         F.getName());
   return true;
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
index ed3cb0824c504..75f292deb71c2 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
@@ -11,13 +11,14 @@ declare void @bar()
 
 ;.
; LOWERING: @__llvm_ctx_profile_callsite = external hidden thread_local global ptr ; LOWERING: @__llvm_ctx_profile_expected_callee = external hidden thread_local global ptr -; LOWERING: @[[GLOB0:[0-9]+]] = internal global { ptr, ptr, ptr, i8 } zeroinitializer -; LOWERING: @[[GLOB1:[0-9]+]] = internal global { ptr, ptr, ptr, i8 } zeroinitializer -; LOWERING: @[[GLOB2:[0-9]+]] = internal global { ptr, ptr, ptr, i8 } zeroinitializer -; LOWERING: @[[GLOB3:[0-9]+]] = internal global { ptr, ptr, ptr, i8 } zeroinitializer -; LOWERING: @[[GLOB4:[0-9]+]] = internal global { ptr, ptr, ptr, i8 } zeroinitializer -; LOWERING: @[[GLOB5:[0-9]+]] = internal global { ptr, ptr, ptr, i8 } zeroinitializer -; LOWERING: @[[GLOB6:[0-9]+]] = internal global { ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB0:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB1:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB2:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB3:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB4:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB5:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB6:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer +; LOWERING: @[[GLOB7:[0-9]+]] = internal global { ptr, ptr, ptr, ptr, i8 } zeroinitializer ;. define void @foo(i32 %a, ptr %fct) { ; INSTRUMENT-LABEL: define void @foo( @@ -67,6 +68,7 @@ define void @foo(i32 %a, ptr %fct) { ; LOWERING-NEXT: call void @bar() ; LOWERING-NEXT: br label [[EXIT]] ; LOWERING: exit: +; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @[[GLOB0]]) ; LOWERING-NEXT: ret void ; %t = icmp eq i32 %a, 0 @@ -185,6 +187,7 @@ define void @simple(i32 %a) { ; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2 ; LOWERING-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @[[GLOB3]]) ; LOWERING-NEXT: ret void ; ret void @@ -216,8 +219,10 @@ define i32 @no_callsites(i32 %a) { ; LOWERING-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4 ; LOWERING-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 1 ; LOWERING-NEXT: store i64 [[TMP7]], ptr [[TMP5]], align 4 +; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @[[GLOB4]]) ; LOWERING-NEXT: ret i32 1 ; LOWERING: no: +; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @[[GLOB4]]) ; LOWERING-NEXT: ret i32 0 ; %c = icmp eq i32 %a, 0 @@ -250,6 +255,7 @@ define void @no_counters() { ; LOWERING-NEXT: [[TMP10:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [1 x i64], [1 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 0 ; LOWERING-NEXT: store volatile ptr [[TMP10]], ptr [[TMP7]], align 8 ; LOWERING-NEXT: call void @bar() +; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @[[GLOB5]]) ; LOWERING-NEXT: ret void ; call void @bar() @@ -270,11 +276,40 @@ define void @inlineasm() { ; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], -2 ; LOWERING-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr ; LOWERING-NEXT: call void asm "nop", ""() +; LOWERING-NEXT: call void @__llvm_ctx_profile_release_context(ptr @[[GLOB6]]) ; LOWERING-NEXT: ret void ; call void asm "nop", ""() ret void } + +define void @has_musttail_calls() { +; INSTRUMENT-LABEL: define void @has_musttail_calls() { +; 
INSTRUMENT-NEXT: call void @llvm.instrprof.increment(ptr @has_musttail_calls, i64 742261418966908927, i32 1, i32 0) +; INSTRUMENT-NEXT: call void @llvm.instrprof.callsite(ptr @has_musttail_calls, i64 742261418966908927, i32 1, i32 0, ptr @bar) +; INSTRUMENT-NEXT: musttail call void @bar() +; INSTRUMENT-NEXT: ret void +; +; LOWERING-LABEL: define void @has_musttail_calls( +; LOWERING-SAME: ) !guid [[META7:![0-9]+]] { +; LOWERING-NEXT: [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB7]], ptr @has_musttail_calls, i64 -4680624981836544329, i32 1, i32 1) +; LOWERING-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; LOWERING-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 1 +; LOWERING-NEXT: [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee) +; LOWERING-NEXT: [[TMP5:%.*]] = getelementptr ptr, ptr [[TMP4]], i64 [[TMP3]] +; LOWERING-NEXT: [[TMP6:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_callsite) +; LOWERING-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP3]] +; LOWERING-NEXT: [[TMP8:%.*]] = and i64 [[TMP2]], -2 +; LOWERING-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; LOWERING-NEXT: store volatile ptr @bar, ptr [[TMP5]], align 8 +; LOWERING-NEXT: [[TMP10:%.*]] = getelementptr { { i64, ptr, i32, i32 }, [1 x i64], [1 x ptr] }, ptr [[TMP1]], i32 0, i32 2, i32 0 +; LOWERING-NEXT: store volatile ptr [[TMP10]], ptr [[TMP7]], align 8 +; LOWERING-NEXT: musttail call void @bar() +; LOWERING-NEXT: ret void +; + musttail call void @bar() + ret void +} ;. ; LOWERING: attributes #[[ATTR0:[0-9]+]] = { nounwind } ; LOWERING: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } @@ -288,4 +323,5 @@ define void @inlineasm() { ; LOWERING: [[META4]] = !{i64 5679753335911435902} ; LOWERING: [[META5]] = !{i64 5458232184388660970} ; LOWERING: [[META6]] = !{i64 -3771893999295659109} +; LOWERING: [[META7]] = !{i64 -4680624981836544329} ;. From 8b11c39a0fad7f6ec48c323a0deeb3c103005200 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Tue, 8 Apr 2025 16:16:53 +0200 Subject: [PATCH 0991/1029] [llvm-mt] Do not build llvm-mt if not functional (#134631) llvm-mt requires libxml2 to work, so do not even build it without libxml2. CMake 3.31 and later prefer llvm-mt.exe over Microsoft's mt.exe if available and using clang-cl.exe as CMAKE_CXX_COMPILER. When CMake picks up llvm-mt.exe without libxml2, any build will fail with the message ``` llvm-mt: error: no libxml2 ``` Any test except `--help` already uses `REQUIRES: libxml2`. There is no point in having a non-functional executable. Not building llvm-mt.exe will force CMake to use Microsoft's `mt.exe` instead. Fixes: #134237 --- llvm/docs/CMake.rst | 16 +++------------- llvm/test/CMakeLists.txt | 5 ++++- llvm/test/tools/llvm-mt/help.test | 1 + llvm/tools/llvm-mt/CMakeLists.txt | 5 +++++ 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 91e34781ef307..8d01b4d8b3dc7 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -1191,16 +1191,6 @@ Windows Studio 2010 CMake generator. 0 means use all processors. Default is 0. **CMAKE_MT**:STRING - When compiling with clang-cl, recent CMake versions will default to selecting - `llvm-mt` as the Manifest Tool instead of Microsoft's `mt.exe`. This will - often cause errors like: - - .. code-block:: console - - -- Check for working C compiler: [...]clang-cl.exe - broken - [...] - MT: command [...] 
failed (exit code 0x1) with the following output:
-     llvm-mt: error: no libxml2
-     ninja: build stopped: subcommand failed.
-
-  To work around this error, set `CMAKE_MT=mt`.
+  When compiling with clang-cl, CMake may use `llvm-mt` as the Manifest Tool
+  when available. `llvm-mt` is only present when libxml2 is found at
+  build-time. To ensure Microsoft's Manifest Tool is used, set `CMAKE_MT=mt`.
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index a67e2b85d9b53..66849002eb470 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -111,7 +111,6 @@ set(LLVM_TEST_DEPENDS
          llvm-ml
          llvm-ml64
          llvm-modextract
-         llvm-mt
          llvm-nm
          llvm-objcopy
          llvm-objdump
@@ -173,6 +172,10 @@ if(TARGET LTO)
   set(LLVM_TEST_DEPENDS ${LLVM_TEST_DEPENDS} LTO)
 endif()
 
+if (TARGET llvm-mt)
+  list(APPEND LLVM_TEST_DEPENDS llvm-mt)
+endif ()
+
 if(LLVM_BUILD_EXAMPLES)
   list(APPEND LLVM_TEST_DEPENDS
     Kaleidoscope-Ch3
diff --git a/llvm/test/tools/llvm-mt/help.test b/llvm/test/tools/llvm-mt/help.test
index 291e13cd4a263..2331faa26c440 100644
--- a/llvm/test/tools/llvm-mt/help.test
+++ b/llvm/test/tools/llvm-mt/help.test
@@ -1,3 +1,4 @@
+REQUIRES: libxml2
 RUN: llvm-mt /h | FileCheck %s -check-prefix=HELP
 
 HELP: OVERVIEW: Manifest Tool
diff --git a/llvm/tools/llvm-mt/CMakeLists.txt b/llvm/tools/llvm-mt/CMakeLists.txt
index dd427a2640cd6..baa9f986a85b3 100644
--- a/llvm/tools/llvm-mt/CMakeLists.txt
+++ b/llvm/tools/llvm-mt/CMakeLists.txt
@@ -1,3 +1,8 @@
+if (NOT LLVM_ENABLE_LIBXML2)
+  message(STATUS "Not building llvm-mt${CMAKE_EXECUTABLE_SUFFIX} because libxml2 is not available")
+  return()
+endif ()
+
 set(LLVM_LINK_COMPONENTS
   Option
   Support

From c1e95b2e5e61616eca20f6bc1bbd71470180e349 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Tue, 8 Apr 2025 07:20:25 -0700
Subject: [PATCH 0992/1029] [RISCV] Fix matching bug in VLA shuffle lowering
 (#134750)

Fix https://github.com/llvm/llvm-project/issues/134126. The matching code
was previously written as if we were mutating the indices to replace undef
elements with preferred values, but the actual lowering code just took a
prefix of the index vector. This resulted in us using undef indices for
lanes which should have been defined, resulting in incorrect codegen.

Longer term, we probably should rewrite the mask, but this seemed like an
easier tactical fix.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 14 ++++-----
 .../RISCV/rvv/fixed-vectors-shuffle-int.ll    | 31 ++++++++++++-------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2a1dd2b2def17..e5f37fc218ed2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5399,7 +5399,8 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
 /// Is this mask local (i.e. elements only move within their local span), and
 /// repeating (that is, the same rearrangement is being done within each span)?
 static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
-  SmallVector<int> LowSpan(Span, -1);
+  // Require a prefix from the original mask until the consumer code
+  // is adjusted to rewrite the mask instead of just taking a prefix.
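+  // For example, with Span = 4 the mask <0,2,-1,3, 0,2,1,3> must be rejected:
+  // lane 6 needs index 1, but the prefix entry Mask[2] is undef, so lowering
+  // just the prefix would leave lane 6 undefined.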
 for (auto [I, M] : enumerate(Mask)) {
     if (M == -1)
       continue;
@@ -5407,9 +5408,7 @@ static bool isLocalRepeatingShuffle(ArrayRef<int> Mask, int Span) {
       return false;
     int SpanIdx = I % Span;
     int Expected = M % Span;
-    if (LowSpan[SpanIdx] == -1)
-      LowSpan[SpanIdx] = Expected;
-    if (LowSpan[SpanIdx] != Expected)
+    if (Mask[SpanIdx] != Expected)
       return false;
   }
   return true;
@@ -5424,14 +5423,13 @@ static bool isLowSourceShuffle(ArrayRef<int> Mask, int Span) {
 /// span, and then repeats that same result across all remaining spans.  Note
 /// that this doesn't check if all the inputs come from a single span!
 static bool isSpanSplatShuffle(ArrayRef<int> Mask, int Span) {
-  SmallVector<int> LowSpan(Span, -1);
+  // Require a prefix from the original mask until the consumer code
+  // is adjusted to rewrite the mask instead of just taking a prefix.
   for (auto [I, M] : enumerate(Mask)) {
     if (M == -1)
       continue;
     int SpanIdx = I % Span;
-    if (LowSpan[SpanIdx] == -1)
-      LowSpan[SpanIdx] = M;
-    if (LowSpan[SpanIdx] != M)
+    if (Mask[SpanIdx] != M)
       return false;
   }
   return true;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
index 65f78dcfb4bce..2020f3dc6bad2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll
@@ -1372,36 +1372,45 @@ define <8 x i64> @shuffle_v8i164_span_splat(<8 x i64> %a) nounwind {
   ret <8 x i64> %res
 }
 
-; FIXME: Doing this as a span spat requires rewriting the undef elements in
-; the mask not just using a prefix of the mask.
+; Doing this as a span splat requires rewriting the undef elements in the mask
+; not just using a prefix of the mask.
 define <8 x i64> @shuffle_v8i64_span_splat_neg(<8 x i64> %a) nounwind {
 ; CHECK-LABEL: shuffle_v8i64_span_splat_neg:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.i v9, 1
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v9, a0
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v13, v8, v10
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v10, v10, a0
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
 ; CHECK-NEXT:    vrgatherei16.vv v12, v8, v9
-; CHECK-NEXT:    vmv.v.v v13, v12
-; CHECK-NEXT:    vmv.v.v v14, v12
-; CHECK-NEXT:    vmv.v.v v15, v12
+; CHECK-NEXT:    vrgatherei16.vv v14, v8, v10
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v9, v10, a0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v15, v8, v9
 ; CHECK-NEXT:    vmv4r.v v8, v12
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i64> %a, <8 x i64> poison, <8 x i32> 
   ret <8 x i64> %res
 }
 
-; FIXME: A locally repeating shuffle needs to use a mask prefix
+; Doing this as a locally repeating shuffle requires rewriting the undef
+; elements in the mask not just using a prefix of the mask.
 define <8 x i32> @shuffle_v8i32_locally_repeating_neg(<8 x i32> %a) {
 ; CHECK-LABEL: shuffle_v8i32_locally_repeating_neg:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI87_0)
 ; CHECK-NEXT:    addi a0, a0, %lo(.LCPI87_0)
-; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v12, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vrgatherei16.vv v11, v9, v12
 ; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
-; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> 
   ret <8 x i32> %res
 }

From 97c4cb4d13b2049cdfe884f4816aeaf6770d0c32 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen
Date: Tue, 8 Apr 2025 22:29:03 +0800
Subject: [PATCH 0993/1029] [SLP][REVEC] getNumElements should not be used as
 VF when REVEC is enabled. (#134763)

---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp |  8 ++--
 llvm/test/Transforms/SLPVectorizer/revec.ll     | 41 +++++++++++++++++++
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e6559f26be8c2..dbc4c895109e0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16080,11 +16080,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
     unsigned VF = std::max(CommonMask.size(), Mask.size());
     for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
       if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
-        CommonMask[Idx] =
-            V->getType() != V1->getType()
-                ? Idx + VF
-                : Mask[Idx] + cast<FixedVectorType>(V1->getType())
-                                  ->getNumElements();
+        CommonMask[Idx] = V->getType() != V1->getType()
+                              ?
Idx + VF + : Mask[Idx] + getVF(V1); if (V->getType() != V1->getType()) V1 = createShuffle(V1, nullptr, Mask); InVectors.front() = V; diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index ce13f478d3811..10f52c7c341cb 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -481,3 +481,44 @@ for.end.loopexit: %or0 = or <4 x i16> %phi1, zeroinitializer ret void } + +define i32 @test15() { +; CHECK-LABEL: @test15( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr null, i64 480 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr null, i64 160 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[TMP1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[TMP1]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr null, align 16 +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP4]], <4 x float> zeroinitializer, i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP5]], <4 x float> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP6]], <4 x float> zeroinitializer, i64 12) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> poison, <4 x float> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v4f32(<16 x float> [[TMP8]], <4 x float> zeroinitializer, i64 12) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fadd <16 x float> [[TMP7]], [[TMP11]] +; CHECK-NEXT: store <16 x float> [[TMP12]], ptr [[TMP0]], align 16 +; CHECK-NEXT: ret i32 0 +; +entry: + %0 = getelementptr i8, ptr null, i64 512 + %1 = getelementptr i8, ptr null, i64 528 + %2 = getelementptr i8, ptr null, i64 480 + %3 = getelementptr i8, ptr null, i64 496 + %4 = getelementptr i8, ptr null, i64 160 + %5 = load <4 x float>, ptr %4, align 16 + %6 = getelementptr i8, ptr null, i64 176 + %7 = load <4 x float>, ptr %6, align 16 + store <4 x float> %5, ptr null, align 16 + %8 = fadd <4 x float> zeroinitializer, %5 + %9 = fadd <4 x float> zeroinitializer, %7 + store <4 x float> %8, ptr %2, align 16 + store <4 x float> %9, ptr %3, align 16 + %10 = fadd <4 x float> zeroinitializer, zeroinitializer + %11 = fadd <4 x float> zeroinitializer, zeroinitializer + store <4 x float> %10, ptr %0, align 16 + store <4 x float> %11, ptr %1, align 16 + ret i32 0 +} From 2347aa1fccdb1347450fc2a5af310f320f30c734 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Tue, 8 Apr 2025 22:29:11 +0800 Subject: [PATCH 0994/1029] [SLP][REVEC] Fix the mismatch between the result of getAltInstrMask and the VecTy argument of TargetTransformInfo::isLegalAltInstr. (#134795) We cannot determine ScalarTy from VL because some ScalarTy is determined from VL[0]->getType(), while others are determined from getValueType(VL[0]). Fix "Mask and VecTy are incompatible". 
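For illustration, a minimal sketch of the distinction (the helper below is a
hypothetical stand-in for the in-tree getValueType helper, shown only to
illustrate the point, not the actual implementation):

```cpp
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical sketch: the element type being vectorized is not always
// VL[0]->getType(). For a bundle of stores, it is the stored value's type,
// which is what getValueType(VL[0]) captures.
static Type *scalarTypeOf(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  return V->getType();
}
```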
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 24 +++++-----
 .../X86/revec-getAltInstrMask.ll              | 47 +++++++++++++++++++
 2 files changed, 60 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/revec-getAltInstrMask.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index dbc4c895109e0..0e6f7e8435e3a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1264,9 +1264,8 @@ static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
 
 /// \returns a bitset for selecting opcodes. false for Opcode0 and true for
 /// Opcode1.
-static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
-                                      unsigned Opcode1) {
-  Type *ScalarTy = VL[0]->getType();
+static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
+                                      unsigned Opcode0, unsigned Opcode1) {
   unsigned ScalarTyNumElements = getNumElements(ScalarTy);
   SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
   for (unsigned Lane : seq<unsigned>(VL.size())) {
@@ -6667,11 +6666,12 @@ void BoUpSLP::reorderTopToBottom() {
     // to take into account their order when looking for the most used order.
     if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
-      VectorType *VecTy =
-          getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
+      Type *ScalarTy = TE->Scalars[0]->getType();
+      VectorType *VecTy = getWidenedType(ScalarTy, TE->Scalars.size());
       unsigned Opcode0 = TE->getOpcode();
       unsigned Opcode1 = TE->getAltOpcode();
-      SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
+      SmallBitVector OpcodeMask(
+          getAltInstrMask(TE->Scalars, ScalarTy, Opcode0, Opcode1));
       // If this pattern is supported by the target then we consider the order.
       if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
         VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
@@ -8352,12 +8352,13 @@ static bool isAlternateInstruction(const Instruction *I,
 
 bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
                                        ArrayRef<Value *> VL) const {
+  Type *ScalarTy = S.getMainOp()->getType();
   unsigned Opcode0 = S.getOpcode();
   unsigned Opcode1 = S.getAltOpcode();
-  SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
+  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
   // If this pattern is supported by the target then consider it profitable.
-  if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
-                           Opcode0, Opcode1, OpcodeMask))
+  if (TTI->isLegalAltInstr(getWidenedType(ScalarTy, VL.size()), Opcode0,
+                           Opcode1, OpcodeMask))
     return true;
   SmallVector<ValueList> Operands;
   for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
@@ -9270,7 +9271,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
     VectorType *VecTy = getWidenedType(ScalarTy, VL.size());
     unsigned Opcode0 = LocalState.getOpcode();
     unsigned Opcode1 = LocalState.getAltOpcode();
-    SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
+    SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
     // Enable split node, only if all nodes do not form legal alternate
     // instruction (like X86 addsub).
     SmallPtrSet<Value *, 8> UOp1(llvm::from_range, Op1);
@@ -13200,7 +13201,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       // order.
unsigned Opcode0 = E->getOpcode(); unsigned Opcode1 = E->getAltOpcode(); - SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1)); + SmallBitVector OpcodeMask( + getAltInstrMask(E->Scalars, ScalarTy, Opcode0, Opcode1)); // If this pattern is supported by the target then we consider the // order. if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-getAltInstrMask.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-getAltInstrMask.ll new file mode 100644 index 0000000000000..8380b1cb5f850 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-getAltInstrMask.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mattr=+avx -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s + +define i32 @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr null, align 1 +; CHECK-NEXT: [[WIDE_LOAD136:%.*]] = load <16 x i8>, ptr null, align 1 +; CHECK-NEXT: [[WIDE_LOAD137:%.*]] = load <16 x i8>, ptr null, align 1 +; CHECK-NEXT: [[WIDE_LOAD138:%.*]] = load <16 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i8> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i8> [[WIDE_LOAD136]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i8> [[WIDE_LOAD137]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i8> [[WIDE_LOAD138]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <16 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp ult <16 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <16 x i8> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <16 x i8> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i8> [[TMP11]], zeroinitializer +; CHECK-NEXT: ret i32 0 +; +entry: + %wide.load = load <16 x i8>, ptr null, align 1 + %wide.load136 = load <16 x i8>, ptr null, align 1 + %wide.load137 = load <16 x i8>, ptr null, align 1 + %wide.load138 = load <16 x i8>, ptr null, align 1 + %0 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 + %1 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0 + %2 = or <16 x i8> %wide.load, zeroinitializer + %3 = or <16 x i8> %wide.load136, zeroinitializer + %4 = or <16 x i8> %wide.load137, zeroinitializer + %5 = or <16 x i8> %wide.load138, zeroinitializer + %6 = icmp ult <16 x i8> %2, zeroinitializer + %7 = icmp ult <16 x i8> %3, zeroinitializer + %8 = icmp ult <16 x i8> %4, zeroinitializer + %9 = icmp ult <16 x i8> %5, zeroinitializer + %10 = or <16 x i8> %0, zeroinitializer + %11 = or <16 x i8> %1, zeroinitializer + %12 = icmp ult <16 x i8> %10, zeroinitializer + %13 = icmp ult <16 x i8> %11, zeroinitializer + ret i32 0 +} From 4f77e50042f8299b25442a263875cfb4564a506d Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Tue, 8 Apr 2025 10:44:34 -0400 Subject: [PATCH 0995/1029] [MLIR][AMDGPU] Fix shared build. 
NFC --- mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt index 78d78cf48a747..2a019954c8356 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt @@ -16,4 +16,5 @@ add_mlir_dialect_library(MLIRAMDGPUDialect MLIRGPUDialect MLIRIR MLIRSideEffectInterfaces + MLIRMemRefUtils ) From b5045ae9bc84508473e1dac6db8ca5976ea4e069 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 8 Apr 2025 07:46:55 -0700 Subject: [PATCH 0996/1029] [MLIR][Fix] Fix missing dep in AMDGPUDialect. (#134862) Issue introduced in https://github.com/llvm/llvm-project/pull/133498 From ae3faea1f28f840bddd819d1c45e7f7d3e75703c Mon Sep 17 00:00:00 2001 From: Christopher McGirr <7071833+chrsmcgrr@users.noreply.github.com> Date: Tue, 8 Apr 2025 16:54:11 +0200 Subject: [PATCH 0997/1029] [MLIR][mlir-opt] move action debugger hook flag (#134842) Currently if a developer uses the flag `--mlir-enable-debugger-hook` the debugger hook is not actually enabled. It seems the DebugConfig and the MainMLIROptConfig are not connected. To fix this we can move the `enableDebuggerHook` CL Option to the DebugConfigCLOptions struct so that it can get registered and enabled along with the other debugger flags. AFAICS there are no other uses of the flag so this should be safe. This also adds a small LIT test to check that the hook is enabled by checking the std::cerr output for the log message. --- mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h | 3 --- mlir/lib/Debug/CLOptionsSetup.cpp | 5 +++++ mlir/lib/Tools/mlir-opt/MlirOptMain.cpp | 5 ----- mlir/test/mlir-opt/debuggerhook.mlir | 9 +++++++++ 4 files changed, 14 insertions(+), 8 deletions(-) create mode 100644 mlir/test/mlir-opt/debuggerhook.mlir diff --git a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h index 09bd86b9581df..af379797fe865 100644 --- a/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h +++ b/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h @@ -238,9 +238,6 @@ class MlirOptMainConfig { /// Elide resources when generating bytecode. bool elideResourceDataFromBytecodeFlag = false; - /// Enable the Debugger action hook: Debugger can intercept MLIR Actions. - bool enableDebuggerActionHookFlag = false; - /// IRDL file to register before processing the input. 
  std::string irdlFileFlag = "";
 
diff --git a/mlir/lib/Debug/CLOptionsSetup.cpp b/mlir/lib/Debug/CLOptionsSetup.cpp
index 340055adf5aab..cb0b0e5c375e0 100644
--- a/mlir/lib/Debug/CLOptionsSetup.cpp
+++ b/mlir/lib/Debug/CLOptionsSetup.cpp
@@ -64,6 +64,11 @@ struct DebugConfigCLOptions : public DebugConfig {
           auto [file, line, col] = *locBreakpoint;
           locBreakpointManager.addBreakpoint(file, line, col);
         }));
+
+    static cl::opt<bool> enableDebuggerHook(
+        "mlir-enable-debugger-hook",
+        cl::desc("Enable Debugger hook for debugging MLIR Actions"),
+        cl::location(enableDebuggerActionHookFlag), cl::init(false));
   }
   tracing::FileLineColLocBreakpointManager locBreakpointManager;
 };
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index 9bbf91de18305..2924a1205f574 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -126,11 +126,6 @@ struct MlirOptMainConfigCLOptions : public MlirOptMainConfig {
         "mlir-disable-diagnostic-notes", cl::desc("Disable diagnostic notes."),
         cl::location(disableDiagnosticNotesFlag), cl::init(false));
 
-    static cl::opt<bool> enableDebuggerHook(
-        "mlir-enable-debugger-hook",
-        cl::desc("Enable Debugger hook for debugging MLIR Actions"),
-        cl::location(enableDebuggerActionHookFlag), cl::init(false));
-
     static cl::opt<bool> explicitModule(
         "no-implicit-module",
         cl::desc("Disable implicit addition of a top-level module op during "
diff --git a/mlir/test/mlir-opt/debuggerhook.mlir b/mlir/test/mlir-opt/debuggerhook.mlir
new file mode 100644
index 0000000000000..54f1bf98d66df
--- /dev/null
+++ b/mlir/test/mlir-opt/debuggerhook.mlir
@@ -0,0 +1,9 @@
+// Checks that the debugger hook is enabled when called with the CLI option.
+// RUN: mlir-opt %s --mlir-enable-debugger-hook --pass-pipeline="builtin.module(func.func(canonicalize))" --mlir-disable-threading 2>&1 | FileCheck %s
+
+func.func @foo() {
+  return
+}
+
+// CHECK: ExecutionContext registered on the context
+// CHECK-SAME: (with Debugger hook)

From dfe4d9187c60c9888de3410ed3c2a98136a9cd8f Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 8 Apr 2025 21:57:29 +0700
Subject: [PATCH 0998/1029] GCStrategy: Use Twine properly for error message
 (#132760)

---
 llvm/lib/IR/GCStrategy.cpp                                    |  9 ++++-----
 .../RewriteStatepointsForGC/unsupported-gc-error.ll           | 12 ++++++++++++
 2 files changed, 16 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/RewriteStatepointsForGC/unsupported-gc-error.ll

diff --git a/llvm/lib/IR/GCStrategy.cpp b/llvm/lib/IR/GCStrategy.cpp
index c3e35bd58d13e..67f363d26b25f 100644
--- a/llvm/lib/IR/GCStrategy.cpp
+++ b/llvm/lib/IR/GCStrategy.cpp
@@ -41,10 +41,9 @@ std::unique_ptr<GCStrategy> llvm::getGCStrategy(const StringRef Name) {
     // be the builtin GCs if nothing else. The most likely scenario here is
     // that we got here without running the initializers used by the Registry
     // itself and its registration mechanism.
- const std::string error = - std::string("unsupported GC: ") + Name.str() + - " (did you remember to link and initialize the library?)"; - report_fatal_error(Twine(error)); + report_fatal_error( + "unsupported GC: " + Name + + " (did you remember to link and initialize the library?)"); } else - report_fatal_error(Twine(std::string("unsupported GC: ") + Name.str())); + report_fatal_error(Twine("unsupported GC: ") + Name); } diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/unsupported-gc-error.ll b/llvm/test/Transforms/RewriteStatepointsForGC/unsupported-gc-error.ll new file mode 100644 index 0000000000000..6a3c89098d2b8 --- /dev/null +++ b/llvm/test/Transforms/RewriteStatepointsForGC/unsupported-gc-error.ll @@ -0,0 +1,12 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: not --crash opt -disable-output -passes=rewrite-statepoints-for-gc %s 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: unsupported GC: unsupported-gc + +declare void @g() +declare i32 @h() + +define ptr addrspace(1) @f0(ptr addrspace(1) %arg) gc "unsupported-gc" { + call void @g() [ "deopt"(i32 100) ] + ret ptr addrspace(1) %arg +} From 561506144531cf0a760bb437fd74c683931c60ae Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 8 Apr 2025 08:03:32 -0700 Subject: [PATCH 0999/1029] [dsymutil] Avoid copying binary swiftmodules built from textual (#134719) .swiftinterface files into the dSYM bundle. These typically come only from the SDK (since textual interfaces require library evolution) and thus are a waste of space to copy into the bundle. The information about this is being parsed out of the control block, which means duplicating 5 constants from the Swift frontend. If a file cannot be parsed, dsymutil errs on the side of copying the file anyway. 
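A minimal sketch of the resulting policy, assuming the new SwiftModule.{h,cpp}
expose a parser of roughly this shape (the name and exact signature below are
illustrative, not confirmed):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

// Assumed helper from the new SwiftModule.h; illustrative only.
llvm::Expected<bool> IsBuiltFromSwiftInterface(llvm::StringRef Data);

// Copy the module unless it can positively be identified as having been
// built from a textual .swiftinterface file.
static bool shouldCopySwiftModule(llvm::StringRef Buffer) {
  llvm::Expected<bool> FromInterface = IsBuiltFromSwiftInterface(Buffer);
  if (!FromInterface) {
    // Unparseable control block: err on the side of copying.
    llvm::consumeError(FromInterface.takeError());
    return true;
  }
  return !*FromInterface;
}
```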
rdar://138186524 --- lldb/source/Core/Statusline.cpp | 15 +- .../tools/dsymutil/Inputs/Binary.swiftmodule | Bin 0 -> 17192 bytes .../dsymutil/Inputs/FromInterface.swiftmodule | Bin 0 -> 17028 bytes llvm/test/tools/dsymutil/swiftmodule.test | 29 +++ .../dsymutil/yaml-object-address-rewrite.test | 3 + llvm/tools/dsymutil/CMakeLists.txt | 1 + llvm/tools/dsymutil/DebugMap.cpp | 8 +- llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 16 ++ llvm/tools/dsymutil/RelocationMap.h | 1 + llvm/tools/dsymutil/SwiftModule.cpp | 192 ++++++++++++++++++ llvm/tools/dsymutil/SwiftModule.h | 15 ++ 11 files changed, 272 insertions(+), 8 deletions(-) create mode 100644 llvm/test/tools/dsymutil/Inputs/Binary.swiftmodule create mode 100644 llvm/test/tools/dsymutil/Inputs/FromInterface.swiftmodule create mode 100644 llvm/test/tools/dsymutil/swiftmodule.test create mode 100644 llvm/tools/dsymutil/SwiftModule.cpp create mode 100644 llvm/tools/dsymutil/SwiftModule.h diff --git a/lldb/source/Core/Statusline.cpp b/lldb/source/Core/Statusline.cpp index b7650503e16bc..a2ecebbefbfb1 100644 --- a/lldb/source/Core/Statusline.cpp +++ b/lldb/source/Core/Statusline.cpp @@ -12,6 +12,7 @@ #include "lldb/Host/StreamFile.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/Process.h" #include "lldb/Target/StackFrame.h" #include "lldb/Utility/AnsiTerminal.h" #include "lldb/Utility/StreamString.h" @@ -126,9 +127,7 @@ void Statusline::Redraw(bool update) { return; } - StreamString stream; - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); + ExecutionContext exe_ctx = m_debugger.GetSelectedExecutionContext(); // For colors and progress events, the format entity needs access to the // debugger, which requires a target in the execution context. 
@@ -136,9 +135,15 @@ void Statusline::Redraw(bool update) { exe_ctx.SetTargetPtr(&m_debugger.GetSelectedOrDummyTarget()); SymbolContext symbol_ctx; - if (auto frame_sp = exe_ctx.GetFrameSP()) - symbol_ctx = frame_sp->GetSymbolContext(eSymbolContextEverything); + if (ProcessSP process_sp = exe_ctx.GetProcessSP()) { + Process::StopLocker stop_locker; + if (stop_locker.TryLock(&process_sp->GetRunLock())) { + if (auto frame_sp = exe_ctx.GetFrameSP()) + symbol_ctx = frame_sp->GetSymbolContext(eSymbolContextEverything); + } + } + StreamString stream; if (auto *format = m_debugger.GetStatuslineFormat()) FormatEntity::Format(*format, stream, &symbol_ctx, &exe_ctx, nullptr, nullptr, false, false); diff --git a/llvm/test/tools/dsymutil/Inputs/Binary.swiftmodule b/llvm/test/tools/dsymutil/Inputs/Binary.swiftmodule new file mode 100644 index 0000000000000000000000000000000000000000..7ba817b22b707a23387b728923ae764699578561 GIT binary patch literal 17192 zcmbt*4Rl-8mF~5q_~+iBd*dYFKqirm2P9i|WRnCtmXoV12MmN(A?Bwu%v1>u4Q&At zLJO~LDoY9o#>l{q22pcHZ6JZ!Re2xSw3YaYY4#v7vais?SPw+z);dqRH(ZUa;_=8biydlZ=$wIHp_o7Y3ijSuFh({Rk@GnmA zy?B@MEgX{gK_B%JOX`G3H2X$fGU(x-SNRhvO)Q4@@+VTlaffh1;>EWX@dGjbxKHTC zN|?;S52$<}mQM-CB>t#}7mMrpeu+Oe!S}^@aZ^2;*emo|1Z}#af}!My!JbsAMgrAi z@l|0%^+?=05ORj%wLZ30EDjW~CMB?_DI7V7AxHG;g zil+g4C}5w82XE_9tmxRI*hZD=E<;TuZbjEF1zK7aGSti%sw07FEOIx^q1c83)-Hoh zRctCIkJm&3wq$%&R=G~c^|WeIv5gsOLeR8gb9%z|uDCsdc@*1l+}ee4hMG|l#kOCm zb^30wj>0Mi`>0a&HhVgwR1F#IBLRDotvIY$u~RJS^xa64N0jOqCW~7mB%N(4P#sd} zD}b;4&hLswu3jwtlP zN;sI}k8;A17!M=7=-~%}6j&oAz-*coBPnjp>@bNxpk}e)Ydqo*U}XT|phv(^3_7UN zlYS53tWV~}@qgxz%lt{qpAw=HJ?#aovVgdBeD-ypUx=poL0NcFBB9fexzz+<3Wrtx zSV}nIfl$IhKmTHikIKT46dp>2rBxv^zIDP-o83Q91TDZmWEDfrRD4wzw8+e*#uIDz z7^>kSzl*OLj#m%ISHV1h5io;J&}Ar2${UN@ojb;9nur23SAD>2h<5{ubw57m8bf}Q ztquShYy`ddssXZu&AI7P`#7yin9O2mh8eUg?k>CY1nl4?2RQ;af%?6CuY=Sx$1J%X z(lHnXT4PELQ1cRR&47c>wvAuTVNVhse%PM4K?rx`0LZN3(SQxTnPT!6LoYLthVoxE%a1Q;r1(Jmd~T3~+FqBMgjM zKd~Fw1$kM7BM$yWS$KgHG*}OU-ZGPkZ^?WgUSNI?e++0&@_kAE0F0aMamgfqJjui4 zm_IX3bjgrMh%!G(FZNTHaFPS~vt0zR=1^*4`5}*R%s~>u@iN0xmN`#3aIgR>7r*u! 
zjq1~~rlsUU$ZIs0zsPG} zBQ^xxKvnPn&9f14G_9s3{THJ!n8Irw9x;&d3&b0+7jyr#1;yp2^o}tB*{4S!!MQnu z(^GD4WWYEK+reZi*3B&O)~+1md#T^SJRue>B?9dw3KS6f z4$hj6yETIET6azB>3#aXG3enBVI{1`JRfp_fi;p$3;gD-!Z8bf99GAc=0V_u7ly!2 z0EZkv`L;h#Dp$4sTIlAd|rTEhB^^KRxh*YPi4Lf}*z9pevWRdM<4Iu*i< zGE$1U*1P>$Hh(7ZfDo`GoCVs=;LPy|Vm~a481p=yrubtq{)Oy9uVp4U?iVs6xQ-4A zh@h4b$eqRZq*4oxaeB4_kmSVz#S#e`oePqEWie1eyb9bMMWAPlHiPH@I*JGD3o!!m zDYMLk`Vz%-A|1*>5c1g?_U$khi;|g+$!-^mx^ZIB%yjPW73LDjA(bEY5a9sj*#GPX z)#ci}Ghix(1u_t29IRzrTcpPz0Im7;i{KcfS1HdME-M1n2yz9MS%A>f^JS*lQUpW| zdIW$kGe~;pC1P}zq||7%i!_6+>fTvIs7VPLPm00{Xbgd4i1B!*q1tJEL_CWZTkV_m4zW_Y=Y8HFdp3Cm=8ME#GgnN*hk}txky(Lku&4&_z~VC z6e1^u)IM@{IO7Q<8-N(gEk$)Dg8mrBL1|jpORsJGkD@62KS`Ma)ZB~IG=;2$Qe9$% zG(bHal#bcDm8yyOs&ulz;!j2q!KFNs5vYzVhYXh^R>2(b&5ZE!;|NayM7i3TI3hB# zODi@qn%#x{AS{IFL1Js$?V*sT)XXZgwGVFZ0rrX7vrL%Q{ZEmU)P&=9#IA?|sbWSu zPT*pgu@^jtg#c}Ku9u8xPX_Gc@wt-RaRfx@AwHsz#AxO)AdGmEwc<~Jy>!-tD4fwk zVvvHwLG;abG}{>>5W2N=oU+Z~cyKYSoDN@bqalcA5ssxSuNZ2^4R$!27Q2(sQUF=1 zEo!K7Hk@GeqVaSVCgMV*)z!!fuuM`5;_=`X&>ID6I8e^)v9|;03t=fO(57}aOl4d| z3k0BH7*YI2V>e8r*dlQf93lgCuoO@?9Ka5$lL{Rd04nlCR6}1N z;-QpWc#ds<9wGo8sOcz7F&o~+&J*Yfh>P1(W+3*EL=Uu& z=a4bN+K7NGk%3nNJ+SskJWR-{{zVp2S3O2Q4D3zJF>FzIahF9#MT+kiDsrU5Hpf@n(qgOSeDFe}u6 zJei0^p~Ek>12j4BuoO^Q_&jg{J5t&p!1a%4nJS|`@zz#QCUlDRApcCj#>is!Wb3@M z!UcS@6iniw!CzrAjinqyua|O!7g9Y>wje*p>5Ps5h*+8YoCprnYv3by^i^&qF!i!j z8or<9A}AfHGD3Y?=?+1>^Cq0-MiuOZLUwLYVNOnhVjXrocb5&H0*k@BG7vWqKYitx z#22No56-nv5VS)?1XhGn0M}TWAr7VZLBt(84fEonO>jrbHR2R&8T;S>ig;yn+Ux3A zT7(^_#KJ?Vp4UuYz5FW8HJz5#Z_u6(Ko0DA7fIRcE zOyu*x_7NWsFM+QxUX6a#g7`o`i~w0t#kxQ4KE0Sa0*hJ~V2XYQ5Dp-dBtuY(5Gelj z*U8F2wE;ixR0EyxcrAK@$KV+nziYNop{FJex9~%OCCQxUMP;(oHxciN;}fL)h5Vl`oL- z(#wxn0ABNF*o(t_K{CG^)9cjnIA)4hj~knz(4=mTHJ2mXKPs8AuityjQ~7eFo-VA8 z8f+v=uwgvWxZmA48nlfEn>vlnkwnw5ZoY3_4p#2v#8k|D)?uF1c9&#I&!1Z^JiJ*r z=MX2o<}(veBQvcmO!}B4obU-5N<3o`GkG8Z9>2+oi@qHRG)19RJwiN3>I7J3a>9H8 zR&wZu{?G3u1I|#eY0QY$*%fTkX(YI8;#9B@XAgfN}!86R5#l>TX`k_yjiBmqcGmLgN>m?gAQ88)hiFmJPi0z zzVf1PTwgP$F;vh79~lCu2%hs0*6k}JDuggY#3>Ep(1Sz#JZ8R-Kdb+pmu>W8aKcH5 zH#W~75Zb=j%YV#?9|7(jA`2bddnRA54hPpEWdmm=-RlURN(0(_o9=h*9y2tDh}9ZM z7x>^t2wL}nyHP{ukkXiRH>mCgwE2U>9lJ-B#xY1~YzB6;0iE%t;b0?z$&kBYcpi>ye}xYzfvn1J+^&2XON_Z_AHI3Z>P!S1P2__* zsjP=K^9Q&6_XENi$^2oAKRba?rVj#AN0kQf{K-F#GYcSY19p7jz{oKXo}AU+*BQ@?1nMib58@iDgz zCP@I zuK=Tc?lIFtCc_2&R(|$3!*Bz)Ee2|JH$ebIuKYc=r#m$t>OvNgK>nah0le4jh)fA5(%Sm6SxIo3v8g*@4H-ec&7iV;@IJgF2T^2iOv!Ks zM1Cpj^wRe8E~SBzXg4^mSr=s!k6tl=aY$XETkx?Lv75zvJrdvgISloclNpyqZ_voe zItHw|n{bGj3f4q48w8Q$B4RiP^l9uJ_r6P>@l_^y=2@&BQJP|i^sutA8PP80UZ1a> z*84FNjz|vkhx#2{wjXnZ3{BvqP6joAHh+Tl2f%l*RSJkTe{KcI-w1hYiM^Q#h>TdsJ}a)qQ^PJ14dgH(2!`FnkQH! zc3pim#nQ8Rd3=Ze>fV_73&}j~5c?3DWbq*O;WfiwK9*=7XK;*^5oze0G?~9A>C6r6 zGz!z{#2>o)1V?B8b1}Ux{GXQCkhhK#-6G{7AD&0CZM$KauUbCUc_Y#~eJ&nssRA%A za%`2Kd4f&@QKexj(S##szByYC-@?Nyz`yzvZ|@Tn=DXrffFU;ji8oOexv>Tu5(+kU z1$DiF=!A3Js6Wyf+b=t$Qk2l;u8zq)vu8csu_MuS*>O(P7#yK4pX0=Pu+IBo?40t9q~ha4mEX2j0XgtN!J zE|$Nw#Q2IqkNJWhTyOqDPeY4;2#cufw2yN}K4LdD?c+a7A<(fDBVQ%lbgOmDy&m~( z{v~3G-0Oe86KKNU=DRzbU!nRkHJ#yBV1+()j#oJiQe>D2%iT zukz{qqN^X_x)a$)(2fK|PhPiIoscQ3p7bFRV+U!#1(B0wO1cvqced3EyC8|pm*e4I z)(aP47ZstL|I$H+htIUb^1N((cdQDWe*x=aEf%7oUizWPAs#{on=~K2=#M7yN#ds2 zBV@iL?Z-f-kbzl@PI2r!quYx5zc5WgJC9oFB*PLwy(^ZTy3ranHsMs>tu(@4#{u(% zecb4b(Ff<)yoA1MkVQ%)Ipl!&alpqz63TQPW}PJRYp=q8U}BtH=F9L}hHlCHGMA4I z$D4e01g=L1WW8}+F?-t){4UH&L~sX8H&ZfQ>h#Xm3ULeO@8OeH3pY{u7p|AG#hA|3 z1?kK2kpONE={I#0Q(cKG58RaFYOgzh>t2)^+MS!+Rd|YGE0iAM4iFc6IU|fMdG}h{ z1Y_~wE%?PAzgTN`Ib-p}wz>C;c4z3Ny3BnvuBtMZ2>4lEyVIU6_K5MvQ~WP+otu)% z=UjYa7~Ns=5!^l8e4anh!{Zlh{D6*YL|iwVNI435s)2|EY9Iw#lxTD-HMn!h{rMd? 
zJp1XYD*bRQZahI3uvGfB3Po(Vj>47P=_p<8)35j$1*L1yP}em_Sv!7x(W|gI{d~!s ze$Z}U&Mn-WuI=wo%^s>F@ZkjHm4*4 zCco?@+;HH+4L_-q;qz~Y=8969EXyGzqorMT#$npnD&(dusS?y~edx~_|p&0}PC zSkCVBzKudedPKE|*rx#(^tk@SJqlerrY8@*bQ|650eRWWG~60SaT$Ui-EooClf4s4 z-;kW}Ni9N|P9g&e0IX33#hTe_NcZA2WKbfcRENod?8{BBEA?|qk;I!(8-$COIi*ir z3GQxj!G!A!d;qpHRe?R1XfJTp)Rz))rG=7Ie9*q8U4+WUl2UHJbdi-uyY0g@7Ot95 zxQc5rT<4-T_L&?DSOixT;x$Z7D%DaAp`3=JB99kR@)~U(+&JQ<8D-;hW=lmIwAasL zHmzRg4Q(E;FoY7~7-^BHl}dCbWUcRE(2j+S5A_bFJR7qM;OdWx3RoeB#>eO+?pw_S zo3}XV9sRp3bBE6uQ0EQG#>F+iVzll`VuCfAa90DKbvr>~v}M zeDWWup3>}VcI7OIxAGs%Ts%`>&G7j+9(Sz@7ZoL=G1GSp^(1(R`bYo?|P zB?i#3f^q|jIGGzzcUf4g?IZTmDyymxe~GieVWvR4+4RhuBnj(v4Z~YxGQCMW=)2B zc9ijD_&jCOUT9WG|F0Kt;9hMhEhvB=VL1oN27zNK9Rae8HalpsM&miumNQOWs;#7z zw}RXURElW84$&IwHQ-`c_*rd^B!AqCz`|-^v0vyN+%!jyOf;CwmclWKw0a`tO-F;g z*Es_D|6CI&m-VuooSCL49VX`OlrtGu_)mWlGtFiW;Q}oX{85TZ@rS%9DdUHrom;gr ztT>9oClwLp8b>aCJuL=O=58w0V(ByfTMZb2s1+5OIeeK-W4d3;2$S!la29ADh1|ZJ z;?o?FiVE(r?}8CgAC)da0wboVgzHd>6#=CyTC#{!O|TF29FeFMpG62kxtxOnq2`(N z`wEBkC^?Aw>HpXurS1dn@cI~B#X+OBh726=zXFw3TG!_`(bJW(?S?*$w>aKM-=-D;3*VqnYzI(Ucb^n78 z*R82t&2c;My?*t^`qgzDcXt8DZQXhA+SN5%X`PyEKL^LXqREk+`c-__difsO_T78# zs;=y@5(ebasaeRn;ybLaN^ z?s|m!eRIpMo%h`LEzi#T@7n(0gZM=K?DU+plhb+2j(hHX5HInk5%UD_Z+wX{)GC#` z0}B^bw1-toP&Jjb^-Fs#u7Ihew6axQIaM_6+D-pFwKB9hHdQpaT&h&?pk-84Zm_h5 zXDo_CFqQ132Z8DG)}P?PZpmYGRZJBvIv*^acF8W&#K0q# z*4@(9^Fd!lacTL6uw&ZQu-Np9SW($6m#ejbO+C^r3(_;Od4D6hcCvZdx7QcA&Ie~K z8%D+R!D#)cRL(xlSlo|d>bEVeS;3UQx3q?62wU0kk622zk#T8NOJyqr zu{KQ=DW#Rghl|UVP-XXwrRz%e&;ClbtB?s@Zep^SdROm-f|QV{bmMLsxy8g!yRcqQ zRUo5;-BNNXdq%5H6?L?03VWxbWj7N${sH8X#-Qww>8X{b^DQK5@zo(|QE4S@&vRq( zUQ+C6$@EK8MO!0mk^#*;Wi@1^vQ@0Dn<^?-y34y(KuVbGHur9+!WCM`j99#=e%jUg zO*Ua#ZN;w8-jy8J49voHx%!Z_u*w~NMa1Wtc8;5ZZx~K<7%e&emgL|TlVl7OlIQhN zk)$(x?eb|?kGhhS5EH1wVOIuuzn(Ev7X-m)q%>o@I|Lky>0ah-H}%(_ z56ZM^DfWHUhT?_=Ev@R+4qDLv08`Dt8tOr~3l=Ye&6CONdLS2bJmxG=+rpC4e$wh= z*J^!DfCGdFh|1lM!l)SRhFk$y%&;wdC2evKJ!Z};RcshlSGM`h51ROrQY=sUW5PZh^WvU-Xtr57^%D?1iAljIiZJboV2egBifQt&W2Ho zatccS&R$ZMR05%@pwHo6_T5xc(LM@g02|m$eLKR+2B~h^<^La*Oc|!>4yN3$R>K}s z355Ip4OnXfh^^hzTTYOI5gq%UnXyMuXDqRnr=W1KaB8@`aU&}Q1b(7enicWsp+65NX;H<9;4=OsCkl_pHj2e*iu~fl(Dj( zd!tbfaGz4Mc^#SusM(9ge-aI$-lD(^FkASof>KlXx719c2~8CkuKSDwHThFc*&kkE zarwK^$o(rU@|~h;@wSVq-0J@T&thMq0k%&UwMv#((Kuc&!j>z3QN; ziDX$pFDuZKgUFDNmLp{Ym6w#Xq2mpjpTyU;-8SEFmI zZ9!$)I8~OT^t=^ZLD0PJrUIy_`)3A%_Dh(#j;3#h@H~BTFIgDK!2uG_D2V ze>9Yux<*Kzl6q74f8$#?fu{460noh1PMou{m{&?o-7g#9bL9sNE$z_2DQa5&l|%D8 zG^eO(d6$|q^p%>H_vsTgE$8SHH7%deCu&+!Xp~R6g2{*d{wR>~HX)-Jw)79%(0o8m zSrtIhiUz2=m7sSUnm4FfbO)L`YTD44%G-#>{q{>OW$+Zsx5<{Z0-Abi)>5;Mn)PU; z>x5#dkMwkynj>f$ez^cnWO=C)VVSF2sKx41V;7>c_Gb;^H@&QnbkE3!9!@O`7hK>1 z+;)G3T-sBT@;R;;6DgQ$K_O?5jSjBPQMYwXWBoOC?bobcU0?55ySnk3ZTP;m Vu73Nrt=F_KF#l=&FeUr*{{U!EIS>E< literal 0 HcmV?d00001 diff --git a/llvm/test/tools/dsymutil/Inputs/FromInterface.swiftmodule b/llvm/test/tools/dsymutil/Inputs/FromInterface.swiftmodule new file mode 100644 index 0000000000000000000000000000000000000000..2873ee93e137a7651f01638440c22ee3c3b4d558 GIT binary patch literal 17028 zcma)j4R{;XmG)Rt{4*0Y6DI)&vWaZGAlb4ln~>VEOhzLE4un=o%um1N+bZBt(iRXQ zQ25%WvZR1uO*;u@Yf8FnCvxJnP9hwKHM{=42#F1bHB@vBh28ExHaL_l5JU(I-pAej z-g8HiWsO6dC(n&EbAQe~=RNN^SNG)IeFNr#VvhUCMI2XDFcce(SF1`*RI$wltkHON zSG;Pa=@sQF{@5fxY!Qyd_>&fXFvTB;^5TXh-!BV>(CIn3@Hm4_S?~dCem`AaV#I4;JXQ&w?QEdB^TBmP= zbqrQ9*vFKrx7pKKrE1t<9}U=(Y{e19ik)Inr|$-uJfc*`Fj?FhA?a+>f$ES#Um@IF zSMQC~HLTQNT?p6{u#YO$n0Ulc(;cw)DAq1WHeB!&*{3f?mjAnyFlT6=a99=wQo?~0 ze}oeb$9Ndwq=z2@Qecgg0JCXUjHI}$=Y~oAel?2)U(;cS04oCs2Rs6XV$cDVo(y;h 
zXZ3Cf?w8+e*#uICI8>-SPV(AWJSE(iaslw;9;54poI0~{RZFax93PwWA9 zL0%T&u!BD-3omej2J0cvTV^uxEt&7f3(W7~j{?m}zCX$DhjFt#E}G(xC3%<}^Jk`s zE*tg;QRXM<#eV7%PH+H!wu=DP98OKHJm?XQI!Hn|US@d8GVdt|4i-S=;@5wpQGHt0 z^wd8y2tz7=MB<0wu4tLZYVR@=dTWlcO~8pS)T16f`Xu25BwTn7`D)GOFYwyegbhJA zP!&8t^K3*MO{-~1|HUW_rSO`EM+|2C0`Ugy#oRw_L2;!iy<<#3_URQ!aBkk<^pu<0 z`4I##^uxMpt_>U;_NVw03>b%CJD5zxx|t>3+L>c~FZDZ+C&ZHFM4){{fdWF`fjQH0 zH%IVY>#k`%y-(jahdlg2tc3NL=R+jK{kS)lTap8d|bl_N+p%4Jq~s zijSHuiSwy%`wt+mEDS?qlazjf@!$qWebBKc{$#4aJ{CvJMY@WJoEdl9kMJI$5IHHN z_K~y08IL2`0K`~sDXOax^v5v{O4Gt#dTrx>7DeIzNy-$U=02pRDP$#->JlTQ0qW_b zbj;SHR87X~(#ZmgKN&>?m-9$QpgOV~GF*;W1#`eRGs4S{BRmBVAqo-) z&^O!BY-fl-=+@G4$~H&h!KJWrI()&6h9RCsIGVD&VyKxg*x_ti>`p>U0c5GRsG-K$ zc%0FT#?x7th>MX{S0gLHGD$6n$AeoyZxpEEKsmF=-VUHIgr&4Vo7&kpopBK@5P*hZ zMDd$VJur=8i^NHAhz!)hQb65E06VBoDs<#pA)2$G z|9C$odLbD0DR)Y28wq@M&k z%%CBzY)-EOSSwY~2+?bxmXTWi{(ftQ9}=+u9dJ>iBK|ne0N5zN3AC|L4Sj)#hf{Ll zS+@NJhyZk;rlT;$YuG;ZGm}0$P9^X>MM&bCdxR zPRQKW`+0)JqmSxKRYrZ{&26Ae=oITg{@H+yk;UA})_Hq{3;1R! zn8ZVae~-yDmU0MvUdj<(NcBG1iu@d>GdcnwVrBAkA~;O1fsfqLSGkG6)W=e3_{9aWKUXA@0y=m=~99gga8M5vN$o*artt#4DTAUSH4BBJ4mV z79LLZzGnLBr%N4Yc&n&V{2b)9g=18KkOKFT=Xo8JDc!Qr(&1Yp{h7)zmJ3hVEHfeN@ z2AxTFlbT2;4u}>AsAKWqZ!d|c;#tZ3v1)!V$)8S|-}RW!#rRKB<_|q$DqkjfW_R^C zKx}kQyVs2*bbHzO<30R-iGNQP&!o&}^GEwSud51!bQ8`PwOBACb)1*Y7>%>3lg-PZw544K@-b z*f^1B+UIT>3)&`v&0WTpNTPW}H{Z7}1uOS)Vk%}n<1kNYyGt^q=g%z{9@-?Fb%;}5 z^XW;bk(pK&rhH5iPWXfjC7!m3nLLmHkKbg)CEtkznxoLF9wDB?^#Uw2HEBK%D>-yS z|N6VhfHM?q9yg+Ob_bhv8VRl#ebnCVc22n0jk%k~6ZT}H86Q-meJs%&!}g6W)9$Jf z_qt@lnUBx5p(*~8N%Jv_`9B=lWIZGGTi@$Z)=kn()gkvaqwW^n(#C5qgEc?Gwih;~ z=gm`gNYLe87cw?Sg4gH}5!`l}w)PQ^`D2S2s{1To>68C<$~x-CM%DoiW7}mOE~QiFW_+UbJtq|&V9iP=%W4X68^kJI4z4G zSj10!dJ}mHKdBeKP|fel;!%lk{GXmQq=q~qJmL(P(?V9!8G|eQ^nPK=BA)dKXC?7- ziLms(YCbKS`yA%?Jm$~z%K!Nf#|`U76D{y4`~~lg1)D;NMjf<_YgQkSc^L4ceC0*o zxV~mwW2m4FJ~9kY5j+4-@_fEZB9S*KT$_CC#y4Mjrl}5DrHr?miHEw7L5vw(lF7UyP z5VY&sonLY?eonbO8*46R7dqpU0!yd!#b!x)i9du^2Mj;P6=^>KA zFg>7xS5yp0=Fe3AtV$S%e?AYqdi0?7J{hha4K|MkS}^?>*3s`^XQjf=Oq!8WAm+f? 
zq0T8sMx*)2ay%WjLSk&_cMDO?T@_J_c*aXqaatvCf%tUjO#PzG8cno-$H(0^m?TM< z2V0QiFBDYVk5w#ar~mU<)?OKuP+GCYnP=9Rj@eii{sx_|H}IM>55ZdU8i5%-qfX(^c+Kx)Pt1wWEFAr!g^mER-Vve@ z5m5a-@sW7ON8$23mp|*L&7A>RI4DAZT!BEi9{BO_q#4h#K6vxL zAda3cpYGCps2f>C0{Me31@KjK&fc=M_!M-l82O<+kVB zgd>u}{Gomam+ZqFAwx4bsf$4kpv|A4<9_fRY?T6H&7WIDVhD$MCf{hO*EZDPlmx_P z&yGDv;IP5Ca~g-0e2)rVdu6|0d=K;m0`U?V=%)0&`M4E{(4f)Q4KyU0rxyqoiJey- zNwM^7K_0)`e`R0H{DowmaftnhO|p0Z`|z6KFCR;^kJC6t%7`>{PMXT!lXQ9mJB`A0 zI`N0EJkHVC|7=XJ3;(AjHsq}nM7KzJ$cGnDZ2K-)=Bt)Zb=`opPM?bhTdM$!3mjYJ zXP%(bKvZd*PBi1lnQzY4Lz{Vc1^8Eg;_Z8a!hCn!2{6PKKJg~XA~)24Lqfr(?x3zW z5S?&N81+XwW5*>2Rf-b2+|@C)d+w~KJ9Z?xFFD4E8iOO$<#U|49sB%X0nq=_HJ6R# z4@%P66;wZ&Z&UY6mm-j!_L|T5b*!=ZyRkTqtSt+T3!Wm%_`PI)Pi62zIt{2HI_Bfa z@%eT1D*&aq$P@FI51zj4D8QNIKPN!4@R~n?_YXKa2AdHOaQX$dy7X+gaq?3h;pvu4dCcei;Ck~HdKy~#Ls&#*r+u6=@)5hSc`yH23W1KL82KvUX44yyOE#K9}*s3ePs`tKr8A-%a!sC{-HxEB!%%-3zKiNNhy z_y=6_W$Jinh(8M*Oqz#N=KUV?V2Z_={7uQ7uacd|-7Uy$k;dmwbOi<^^^~Z7&}MyJA(~{PS2BYq1aw_0kVT4)Gu|*rfT$1%EV=PY^fF9U=22 z={O29g$&GMbc$o=8QoUY|BGo7+IiGUCmEIi>Rqwyr5mhKV>3?WJxUY&bpkL?*e8t6 z7=3V#FG%P+hghUUl0y!N9|wFqB%w^#Vb)0^zxFEp2PVeJWuXkOb$D~;m$`g&INs!| zBXB)BAnT3mirL!_<9A_JB7!?$x|x#cQm1dOR)|})a1WofM!1p6zi_>jEyi@NDM(+A zj|OmaNWZD0nCfy|dEllTS9?7HT=$~X(C*ymuEJ9kTcPw2cYwIq%Nb#8&AZpqCK!(g zH{%z3{9>)$<&4J@cg(+6v^zsD)@SacaaEPMM8MDT+MV`Xu}6$Qmg0Yf>)e!7KJVfi z!{`o^kKpd%rgQxMULL<-;|FwHBjUQ@c*;@8Qw>BUPy;E@qC}%dsllB~?$7VI;n_!5 zRq2Q0apQ5ifThx}Iux*q`6^n-Q- z^KRkhb?tbEYW7eafgk@T(6U=u+#TUIF~<;z$dd-T(Ak&070I<>>qXg{ws|EHF!?1f z;)VkkZum)^44;2HG+&g`bn%Jq4r`t$YhJ^p9Nl%Inywlm&zqxd8C5#q-ibiPs#Ett z0Hr1dsI@(fG8%gl@3Ol9YU9v5-d&u&D#b+~)x^@Jj8=DpcbBB!(RE#%Y#t-C!*X_~ z_iYp+(j%%x#6AtUpvUzm?osI4F+F+c#arlR56H`2rs38wipvoE=#Go5-t3)F`iA7V zPihs)bP^d*0AP(ODAvqYL%NryA%hYjr8-OwWM64|U1^wCiX`5I+8|uK%qxB3N^p0J z3npA=-~+IosS519NPB^+rv8+GD=n0y;)C`z?IKh*mXvb)rHiaQ+HD`Mv2fLd!c|<0 z;W`(!vCrhb$W%`0=9K?TKWDbA;CO1Xe1X>}_JyP<*HXL;AJO8fU|4sd5>Dr^Y z73IR;YS;HfEhs5KxAQ4zk`Zmae?~x&{X=u)AOMA8bw#Q14iKsE>}?) 
z0*$7xy8cxwJT-x&02+d- za4@GHSKKbo?JgJxakE7wA-Y1^NV)1EEgO-I^(AKJxwnU?11-s&K?Y zRS)7Vmuf0!nFp0!gyK{w38n}0k}6TEot7*rvD!)x36U1dN7;n{|^Fzn69aj1hIntp13PzcV{~A7MEM$_9aBDIEc_j5arDsYc^D)Rr?&U9PR9mA8W2 z2UUt_zz)$G>NVhESom3OjwFA~i@?HaV6k859o#fej!ZO|&z8bbiL`n= z!Y36Gf`wT{~QdPi+Tz2#2!sQzmX2R*y0_A1*U%y-Y$JMVko;rgp< z*Kpij1su0^$31J;)NG}>YqIYg9QTU0c6Pc~@m=fXd+?6$ZNIbHQp^?NS%taHQc)2e zsJz{=r0#qn$&07j9~*SgfRCFC$5e~!+ICO*qRKXpt*BC3YzjWn)~j6FruObwMc+&% zq4rkO@_N^{+tt1wV&XdAj_nU^-MOu9=RLl<+PV$3>F>2{x7+XCxoyW?TkqV)nRt$4 zI>9=oiuC7i++qOLD^WC>&Cp|CYxNo`dyZ6oqckI}9@12iOzi(~Xxnuji-}dad z@6K%xJb+Ks&rZ)tn>n4g+`avt2k;VqnlMiQ|HhXYLv2#IJFs|3MMqe*1XWW>`+&5^ z;tH5bN-Nvc)zd{Yu3hxcQ>#OpV$(%aE2T;W4_e1WWrL+HJZn)Lf~jN=JqXN{xBUbU zc1a$it75un$+=+hj7xTz9^Xr^@x=9{RJlIPMxP75A_g9@wC$3%o(uXaic8DahaEGn z#-*lL#EQxuxm>LcZ0wabFG|nE=KYQ2+QH^!-(FwjIv1R^tREB41)~jPQaSrHYjHn{ zso%D=Wd&1yX=w}55Vo@6AFz~a$(7@u%3T3%BIEL^*2*>rVr`x-Qc5d}4;7ayp~{|F zOZVmMpM8~VS0NL++{k1x^{v?h1t}p@>9xCP=~^(UDVm3 zDeRq!)?G~O_y>?j8i%q&rl(e$&b5-L#aD);C8d?LJW3r>n;Tfyyt@c2ce(}%sJzFbk+iqEe}DsR&mA^ z`;wqx_|5=UEH2-Sr7f;UO%3e>v?XYQW`ashUCZgKJWd}k23!l?+l{xy5|mo-F6Q(u zCPNn-0O}#%^&59d@?MfAc6WtqQRvwM!%L<3(zC+bVH#*S7nEt$QtbPR^~H^gTHDks z9kihTex{nitEmU!E?B$-Hcuw6>xEp*@tCtfZHr4v2S}@reNF3Y0vsSbKveF26h_5h zH{=SyVum}ym(wP9(_`kmQpNf)b#=Sn{D6rsDaG=nZw3U008>>1{!OdX{PY5SAbNQN z)+61vUk?kX$)KZ6@}DhUR$3YE-D=qFfBO1el9j}Qh2YEF8~ASdZ-{bg6}HtSljz>X zdsmzIf`|&u;Z3p^-WLy-I5%LzSfk%4$&_K5>14|7ZZqsQl|Z=fZ@^mXL2MnKzH)*T zjOf_=%&f%{Ds^1QE=zq;`t5hXF8w2zppo}D6NzjRt`QhD~F1glo-DLYp#NnU1E53?TpK?1kgTn z#rmYD;Js&Mm)l=_rKi~XYnm`%DyX>2KWmA#J_Ut?g^SDnnV!c6u`6)GfjIG6GPZ=u*xHp>Q0QV_1o7SPZpPD^r z{3p;5>MaV)0JDYPDkwFDe@o32n$UEC;kwT_P?JC9l>OmV7MH&VjXbc*BHu2m7H@~B z%5DA+@GSNv8escWQJZ9W6^-NdB5b+hM}7c=n*G%L9!;mRtM?8}DXo|NJ*#Q9)t26A z%tjt7eh;0gr2v7D+rp` z-B_5930(0&m!*VFV()HGAGo*EZ5*HUv6HQ%6S6E$0?d5-LID}DMVHGe_P|EA_! z)NG~ZPBdlTUIeQ>w_Xl$!?(x=7Ik|o%2LZ7HZxf*V=8Ya8u!~Twv@qBEZ-qp)(U7Es98(RI%=*#BV8vHOZ}v$L)08b)A*}J za3af#l?cmR{bDUvmm0efowYw#BYqpQ)kO3ts1&rU>hR7s0}ZsIT_$K^g>;zBM!?+c~HNL9Ip|0ON{ LI.swift +# echo ''>B.swift +# echo 'import I'>main.swift +# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift +# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options +# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options +# output is "B.swiftmodule" and "cache/I*.swiftmodule" +# +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule +# CHECK: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule + +# +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '/Inputs/Binary.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '/Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] +... 
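+#
+# Both debug map entries above use type 50 (MachO::N_AST, 0x32), the entry
+# kind for Swift modules; only the module compiled from a textual
+# .swiftinterface is expected to be skipped.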
diff --git a/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test b/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test index dfa0f285c5ce5..74e8c1e7ae777 100644 --- a/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test +++ b/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test @@ -12,10 +12,12 @@ # CHECK-NEXT: objects: # CHECK-NEXT: filename:{{.*}}/Inputs/basic1.macho.x86_64.o # CHECK-NEXT: timestamp: 0 +# CHECK-NEXT: type: 102 # CHECK-NEXT: symbols: # CHECK-NEXT: sym: _main, objAddr: 0x0, binAddr: 0x100000EA0, size: 0x24 # CHECK-NEXT: filename:{{.*}}/Inputs/./libbasic.a(basic2.macho.x86_64.o)' # CHECK-NEXT: timestamp: 0 +# CHECK-NEXT: type: 102 # CHECK-NEXT: symbols: # CHECK-DAG: sym: _foo, objAddr: 0x20, binAddr: 0x100000ED0, size: 0x50 # CHECK-DAG: sym: _private_int, objAddr: 0x560, binAddr: 0x100001004, size: 0x0 @@ -24,6 +26,7 @@ # CHECK-NOT: { sym: # CHECK-NEXT: filename:{{.*}}/Inputs/./libbasic.a(basic3.macho.x86_64.o)' # CHECK-NEXT: timestamp: 0 +# CHECK-NEXT: type: 102 # CHECK-NEXT: symbols: # CHECK-DAG: sym: _val, binAddr: 0x100001008, size: 0x0 # CHECK-DAG: sym: _bar, objAddr: 0x20, binAddr: 0x100000F40, size: 0x50 diff --git a/llvm/tools/dsymutil/CMakeLists.txt b/llvm/tools/dsymutil/CMakeLists.txt index efe28bda68ebf..f88b1d0b20cef 100644 --- a/llvm/tools/dsymutil/CMakeLists.txt +++ b/llvm/tools/dsymutil/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_tool(dsymutil MachOUtils.cpp Reproducer.cpp RelocationMap.cpp + SwiftModule.cpp DEPENDS intrinsics_gen diff --git a/llvm/tools/dsymutil/DebugMap.cpp b/llvm/tools/dsymutil/DebugMap.cpp index b38d502dda7c9..f1cd7e402f28d 100644 --- a/llvm/tools/dsymutil/DebugMap.cpp +++ b/llvm/tools/dsymutil/DebugMap.cpp @@ -161,12 +161,13 @@ namespace yaml { // Normalize/Denormalize between YAML and a DebugMapObject. 
 struct MappingTraits<dsymutil::DebugMapObject>::YamlDMO {
-  YamlDMO(IO &io) { Timestamp = 0; }
+  YamlDMO(IO &io) {}
   YamlDMO(IO &io, dsymutil::DebugMapObject &Obj);
   dsymutil::DebugMapObject denormalize(IO &IO);
 
   std::string Filename;
-  int64_t Timestamp;
+  int64_t Timestamp = 0;
+  uint8_t Type = MachO::N_OSO;
   std::vector<dsymutil::DebugMapObject::YAMLSymbolMapping> Entries;
 };
 
@@ -183,6 +184,7 @@ void MappingTraits<dsymutil::DebugMapObject>::mapping(
   MappingNormalization<YamlDMO, dsymutil::DebugMapObject> Norm(io, DMO);
   io.mapRequired("filename", Norm->Filename);
   io.mapOptional("timestamp", Norm->Timestamp);
+  io.mapOptional("type", Norm->Type);
   io.mapRequired("symbols", Norm->Entries);
 }
 
@@ -236,6 +238,7 @@ MappingTraits<dsymutil::DebugMapObject>::YamlDMO::YamlDMO(
     IO &io, dsymutil::DebugMapObject &Obj) {
   Filename = Obj.Filename;
   Timestamp = sys::toTimeT(Obj.getTimestamp());
+  Type = Obj.getType();
   Entries.reserve(Obj.Symbols.size());
   for (auto &Entry : Obj.Symbols)
     Entries.push_back(
@@ -286,7 +289,6 @@ MappingTraits<dsymutil::DebugMapObject>::YamlDMO::denormalize(IO &IO) {
     }
   }
 
-  uint8_t Type = MachO::N_OSO;
   if (Path.ends_with(".dylib")) {
     // FIXME: find a more resilient way
     Type = MachO::N_LIB;
diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
index f6a35708dc076..c8fa4dbeffb9e 100644
--- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
+++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
@@ -10,6 +10,7 @@
 #include "BinaryHolder.h"
 #include "DebugMap.h"
 #include "MachOUtils.h"
+#include "SwiftModule.h"
 #include "dsymutil.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -783,6 +784,21 @@ bool DwarfLinkerForBinary::linkImpl(
         reportWarning("Could not open '" + File + "'");
         continue;
       }
+      auto FromInterfaceOrErr =
+          IsBuiltFromSwiftInterface((*ErrorOrMem)->getBuffer());
+      if (!FromInterfaceOrErr) {
+        reportWarning("Could not parse binary Swift module: " +
+                          toString(FromInterfaceOrErr.takeError()),
+                      Obj->getObjectFilename());
+        // Only skip swiftmodules that could be parsed and are
+        // positively identified as textual.
+      } else if (*FromInterfaceOrErr) {
+        if (Options.Verbose)
+          outs() << "Skipping compiled textual Swift interface: "
+                 << Obj->getObjectFilename() << "\n";
+        continue;
+      }
+
       sys::fs::file_status Stat;
       if (auto Err = sys::fs::status(File, Stat)) {
         reportWarning(Err.message());
diff --git a/llvm/tools/dsymutil/RelocationMap.h b/llvm/tools/dsymutil/RelocationMap.h
index 3d851acf2b892..5a804cd141c38 100644
--- a/llvm/tools/dsymutil/RelocationMap.h
+++ b/llvm/tools/dsymutil/RelocationMap.h
@@ -37,6 +37,7 @@ struct SymbolMapping {
   std::optional<yaml::Hex64> ObjectAddress;
   yaml::Hex64 BinaryAddress;
   yaml::Hex32 Size;
+  yaml::Hex8 Type;
 
   SymbolMapping(std::optional<uint64_t> ObjectAddr, uint64_t BinaryAddress,
                 uint32_t Size)
diff --git a/llvm/tools/dsymutil/SwiftModule.cpp b/llvm/tools/dsymutil/SwiftModule.cpp
new file mode 100644
index 0000000000000..7b21f30237e4e
--- /dev/null
+++ b/llvm/tools/dsymutil/SwiftModule.cpp
@@ -0,0 +1,192 @@
+//===- tools/dsymutil/SwiftModule.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Bitstream/BitCodes.h"
+#include "llvm/Bitstream/BitstreamReader.h"
+
+namespace {
+// Copied from swift/lib/Serialization/ModuleFormat.h
+constexpr unsigned char SWIFTMODULE_SIGNATURE[] = {0xE2, 0x9C, 0xA8, 0x0E};
+constexpr uint16_t expectedMajorVersion = 0;
+constexpr unsigned MODULE_BLOCK_ID = llvm::bitc::FIRST_APPLICATION_BLOCKID;
+constexpr unsigned CONTROL_BLOCK_ID = llvm::bitc::FIRST_APPLICATION_BLOCKID + 1;
+constexpr unsigned METADATA = 1;
+constexpr unsigned OPTIONS_BLOCK_ID = llvm::bitc::FIRST_APPLICATION_BLOCKID + 8;
+constexpr unsigned IS_BUILT_FROM_INTERFACE = 11;
+
+llvm::Error checkModuleSignature(llvm::BitstreamCursor &cursor,
+                                 llvm::ArrayRef<unsigned char> signature) {
+  for (unsigned char byte : signature) {
+    if (cursor.AtEndOfStream())
+      return llvm::createStringError("malformed bitstream");
+    llvm::Expected<llvm::SimpleBitstreamCursor::word_t> maybeRead =
+        cursor.Read(8);
+    if (!maybeRead)
+      return maybeRead.takeError();
+    if (maybeRead.get() != byte)
+      return llvm::createStringError("malformed bitstream");
+  }
+  return llvm::Error::success();
+}
+
+llvm::Error enterTopLevelModuleBlock(llvm::BitstreamCursor &cursor,
+                                     unsigned ID) {
+  llvm::Expected<llvm::BitstreamEntry> maybeNext = cursor.advance();
+  if (!maybeNext)
+    return maybeNext.takeError();
+  llvm::BitstreamEntry next = maybeNext.get();
+
+  if (next.Kind != llvm::BitstreamEntry::SubBlock)
+    return llvm::createStringError("malformed bitstream");
+
+  if (next.ID == llvm::bitc::BLOCKINFO_BLOCK_ID) {
+    if (cursor.SkipBlock())
+      return llvm::createStringError("malformed bitstream");
+    return enterTopLevelModuleBlock(cursor, ID);
+  }
+
+  if (next.ID != ID)
+    return llvm::createStringError("malformed bitstream");
+
+  if (llvm::Error Err = cursor.EnterSubBlock(ID))
+    return Err;
+
+  return llvm::Error::success();
+}
+
+llvm::Expected<bool>
+readOptionsBlock(llvm::BitstreamCursor &cursor,
+                 llvm::SmallVectorImpl<uint64_t> &scratch) {
+  bool is_built_from_interface = false;
+  while (!cursor.AtEndOfStream()) {
+    llvm::Expected<llvm::BitstreamEntry> maybeEntry = cursor.advance();
+    if (!maybeEntry)
+      return maybeEntry.takeError();
+
+    llvm::BitstreamEntry entry = maybeEntry.get();
+    if (entry.Kind == llvm::BitstreamEntry::EndBlock)
+      break;
+
+    if (entry.Kind == llvm::BitstreamEntry::Error)
+      return llvm::createStringError("malformed bitstream");
+
+    if (entry.Kind == llvm::BitstreamEntry::SubBlock) {
+      if (cursor.SkipBlock())
+        return llvm::createStringError("malformed bitstream");
+      continue;
+    }
+
+    scratch.clear();
+    llvm::StringRef blobData;
+    llvm::Expected<unsigned> maybeKind =
+        cursor.readRecord(entry.ID, scratch, &blobData);
+    if (!maybeKind)
+      return maybeKind.takeError();
+    unsigned kind = maybeKind.get();
+    switch (kind) {
+    case IS_BUILT_FROM_INTERFACE:
+      is_built_from_interface = true;
+      continue;
+    default:
+      continue;
+    }
+  }
+  return is_built_from_interface;
+}
+
+llvm::Expected<bool>
+parseControlBlock(llvm::BitstreamCursor &cursor,
+                  llvm::SmallVectorImpl<uint64_t> &scratch) {
+  // The control block is malformed until we've at least read a major version
+  // number.
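+  // Only the METADATA record is interpreted below; its first value holds the
+  // major version. Records of any other kind are skipped.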
+  bool versionSeen = false;
+
+  while (!cursor.AtEndOfStream()) {
+    llvm::Expected<llvm::BitstreamEntry> maybeEntry = cursor.advance();
+    if (!maybeEntry)
+      return maybeEntry.takeError();
+
+    llvm::BitstreamEntry entry = maybeEntry.get();
+    if (entry.Kind == llvm::BitstreamEntry::EndBlock)
+      break;
+
+    if (entry.Kind == llvm::BitstreamEntry::Error)
+      return llvm::createStringError("malformed bitstream");
+
+    if (entry.Kind == llvm::BitstreamEntry::SubBlock) {
+      if (entry.ID == OPTIONS_BLOCK_ID) {
+        if (llvm::Error Err = cursor.EnterSubBlock(OPTIONS_BLOCK_ID))
+          return Err;
+
+        return readOptionsBlock(cursor, scratch);
+      } else {
+        // Unknown metadata sub-block, possibly for use by a future version of
+        // the module format.
+        if (cursor.SkipBlock())
+          return llvm::createStringError("malformed bitstream");
+      }
+      continue;
+    }
+
+    scratch.clear();
+    llvm::StringRef blobData;
+    llvm::Expected<unsigned> maybeKind =
+        cursor.readRecord(entry.ID, scratch, &blobData);
+    if (!maybeKind)
+      return maybeKind.takeError();
+
+    unsigned kind = maybeKind.get();
+    if (kind == METADATA) {
+      if (versionSeen)
+        return llvm::createStringError("multiple metadata blocks");
+
+      uint16_t versionMajor = scratch[0];
+      if (versionMajor != expectedMajorVersion)
+        return llvm::createStringError("unsupported module version");
+
+      versionSeen = true;
+    }
+  }
+  return llvm::createStringError("could not find control block");
+}
+
+} // namespace
+
+llvm::Expected<bool> IsBuiltFromSwiftInterface(llvm::StringRef data) {
+  llvm::BitstreamCursor cursor(data);
+  if (llvm::Error Err = checkModuleSignature(cursor, SWIFTMODULE_SIGNATURE))
+    return llvm::joinErrors(
+        llvm::createStringError("could not check signature"), std::move(Err));
+  if (llvm::Error Err = enterTopLevelModuleBlock(cursor, MODULE_BLOCK_ID))
+    return llvm::joinErrors(
+        llvm::createStringError("could not enter top level block"),
+        std::move(Err));
+
+  llvm::BitstreamEntry topLevelEntry;
+  llvm::SmallVector<uint64_t> scratch;
+
+  while (!cursor.AtEndOfStream()) {
+    llvm::Expected<llvm::BitstreamEntry> maybeEntry =
+        cursor.advance(llvm::BitstreamCursor::AF_DontPopBlockAtEnd);
+    if (!maybeEntry)
+      return maybeEntry.takeError();
+
+    topLevelEntry = maybeEntry.get();
+    if (topLevelEntry.Kind != llvm::BitstreamEntry::SubBlock)
+      break;
+
+    if (topLevelEntry.ID == CONTROL_BLOCK_ID) {
+      if (llvm::Error Err = cursor.EnterSubBlock(CONTROL_BLOCK_ID))
+        return Err;
+      return parseControlBlock(cursor, scratch);
+    }
+  }
+  return llvm::createStringError("no control block found");
+}
diff --git a/llvm/tools/dsymutil/SwiftModule.h b/llvm/tools/dsymutil/SwiftModule.h
new file mode 100644
index 0000000000000..9a272fd6fed36
--- /dev/null
+++ b/llvm/tools/dsymutil/SwiftModule.h
@@ -0,0 +1,15 @@
+//===- tools/dsymutil/SwiftModule.h ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TOOLS_DSYMUTIL_SWIFTMODULE_H
+#define LLVM_TOOLS_DSYMUTIL_SWIFTMODULE_H
+
+#include "llvm/Support/Error.h"
+
+llvm::Expected<bool> IsBuiltFromSwiftInterface(llvm::StringRef data);
+
+#endif

From 2cf4254466252f3c8685b03b3de34b06563051e8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 8 Apr 2025 22:08:11 +0700
Subject: [PATCH 1000/1029] Attributor: Add baseline tests for propagating
 align to atomics (#134836)

---
 .../Transforms/Attributor/align-atomic.ll     | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 llvm/test/Transforms/Attributor/align-atomic.ll

diff --git a/llvm/test/Transforms/Attributor/align-atomic.ll b/llvm/test/Transforms/Attributor/align-atomic.ll
new file mode 100644
index 0000000000000..764ed7419a079
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/align-atomic.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC
+
+define i32 @atomicrmw_add_no_op(ptr align 4 %ptr, i32 %val) {
+; CHECK-LABEL: define i32 @atomicrmw_add_no_op(
+; CHECK-SAME: ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[RESULT:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] seq_cst, align 4
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+  %result = atomicrmw add ptr %ptr, i32 %val seq_cst, align 4
+  ret i32 %result
+}
+
+; Check that the alignment increases to 8
+define i32 @atomicrmw_add_propagate(ptr align 8 %ptr, i32 %val) {
+; CHECK-LABEL: define i32 @atomicrmw_add_propagate(
+; CHECK-SAME: ptr nofree noundef nonnull align 8 captures(none) dereferenceable(4) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[RESULT:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] seq_cst, align 2
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+  %result = atomicrmw add ptr %ptr, i32 %val seq_cst, align 2
+  ret i32 %result
+}
+
+; Should increase alignment to 8, not 16.
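+; The align 16 is on the exchanged pointer *value* operand, not on the memory
+; %ptr points at, so it must not be propagated to the atomicrmw.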
+define ptr @atomicrmw_non_ptr_op_no_propagate(ptr %ptr, ptr align 16 %val) { +; CHECK-LABEL: define ptr @atomicrmw_non_ptr_op_no_propagate( +; CHECK-SAME: ptr nofree noundef nonnull align 2 captures(none) dereferenceable(8) [[PTR:%.*]], ptr nofree align 16 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RESULT:%.*]] = atomicrmw xchg ptr [[PTR]], ptr [[VAL]] seq_cst, align 2 +; CHECK-NEXT: ret ptr [[RESULT]] +; + %result = atomicrmw xchg ptr %ptr, ptr %val seq_cst, align 2 + ret ptr %result +} + +define i32 @cmpxchg_propagate(ptr align 8 %ptr, i32 %cmp, i32 %val) { +; CHECK-LABEL: define i32 @cmpxchg_propagate( +; CHECK-SAME: ptr nofree noundef nonnull align 8 captures(none) dereferenceable(4) [[PTR:%.*]], i32 [[CMP:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PAIR:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[VAL]] seq_cst monotonic, align 2 +; CHECK-NEXT: [[RESULT:%.*]] = extractvalue { i32, i1 } [[PAIR]], 0 +; CHECK-NEXT: ret i32 [[RESULT]] +; + %pair = cmpxchg ptr %ptr, i32 %cmp, i32 %val seq_cst monotonic, align 2 + %result = extractvalue { i32, i1 } %pair, 0 + ret i32 %result +} + +; Should not increase alignment +define ptr @cmpxchg_no_propagate(ptr %ptr, ptr align 16 %cmp, ptr align 32 %val) { +; CHECK-LABEL: define ptr @cmpxchg_no_propagate( +; CHECK-SAME: ptr nofree noundef nonnull align 2 captures(none) dereferenceable(8) [[PTR:%.*]], ptr nofree align 16 [[CMP:%.*]], ptr nofree align 32 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[PAIR:%.*]] = cmpxchg ptr [[PTR]], ptr [[CMP]], ptr [[VAL]] seq_cst monotonic, align 2 +; CHECK-NEXT: [[RESULT:%.*]] = extractvalue { ptr, i1 } [[PAIR]], 0 +; CHECK-NEXT: ret ptr [[RESULT]] +; + %pair = cmpxchg ptr %ptr, ptr %cmp, ptr %val seq_cst monotonic, align 2 + %result = extractvalue { ptr, i1 } %pair, 0 + ret ptr %result +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CGSCC: {{.*}} +; TUNIT: {{.*}} From 66f03436092df4e87434269eedf9f7c9e6fdbcba Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 8 Apr 2025 22:12:20 +0700 Subject: [PATCH 1001/1029] Attributor: Propagate align to atomicrmw instructions (#134837) Partially fixes #134480 --- .../Transforms/IPO/AttributorAttributes.cpp | 20 ++++++++++++++----- .../Transforms/Attributor/align-atomic.ll | 2 +- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index a477c90bb4f45..717ba7f688548 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -5283,7 +5283,7 @@ struct AAAlignImpl : AAAlign { /// See AbstractAttribute::manifest(...). ChangeStatus manifest(Attributor &A) override { - ChangeStatus LoadStoreChanged = ChangeStatus::UNCHANGED; + ChangeStatus InstrChanged = ChangeStatus::UNCHANGED; // Check for users that allow alignment annotations. 
     Value &AssociatedValue = getAssociatedValue();
@@ -5297,7 +5297,7 @@ struct AAAlignImpl : AAAlign {
           STATS_DECLTRACK(AAAlign, Store,
                           "Number of times alignment added to a store");
           SI->setAlignment(getAssumedAlign());
-          LoadStoreChanged = ChangeStatus::CHANGED;
+          InstrChanged = ChangeStatus::CHANGED;
         }
       } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
         if (LI->getPointerOperand() == &AssociatedValue)
@@ -5305,8 +5305,18 @@ struct AAAlignImpl : AAAlign {
           LI->setAlignment(getAssumedAlign());
           STATS_DECLTRACK(AAAlign, Load,
                           "Number of times alignment added to a load");
-          LoadStoreChanged = ChangeStatus::CHANGED;
+          InstrChanged = ChangeStatus::CHANGED;
         }
+      } else if (auto *RMW = dyn_cast<AtomicRMWInst>(U.getUser())) {
+        if (RMW->getPointerOperand() == &AssociatedValue) {
+          if (RMW->getAlign() < getAssumedAlign()) {
+            STATS_DECLTRACK(AAAlign, AtomicRMW,
+                            "Number of times alignment added to atomicrmw");
+
+            RMW->setAlignment(getAssumedAlign());
+            InstrChanged = ChangeStatus::CHANGED;
+          }
+        }
       }
     }
 
@@ -5315,8 +5325,8 @@ struct AAAlignImpl : AAAlign {
     Align InheritAlign =
         getAssociatedValue().getPointerAlignment(A.getDataLayout());
     if (InheritAlign >= getAssumedAlign())
-      return LoadStoreChanged;
-    return Changed | LoadStoreChanged;
+      return InstrChanged;
+    return Changed | InstrChanged;
   }
 
   // TODO: Provide a helper to determine the implied ABI alignment and check in
diff --git a/llvm/test/Transforms/Attributor/align-atomic.ll b/llvm/test/Transforms/Attributor/align-atomic.ll
index 764ed7419a079..0931c14685a87 100644
--- a/llvm/test/Transforms/Attributor/align-atomic.ll
+++ b/llvm/test/Transforms/Attributor/align-atomic.ll
@@ -16,7 +16,7 @@ define i32 @atomicrmw_add_no_op(ptr align 4 %ptr, i32 %val) {
 define i32 @atomicrmw_add_propagate(ptr align 8 %ptr, i32 %val) {
 ; CHECK-LABEL: define i32 @atomicrmw_add_propagate(
 ; CHECK-SAME: ptr nofree noundef nonnull align 8 captures(none) dereferenceable(4) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[RESULT:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] seq_cst, align 2
+; CHECK-NEXT:    [[RESULT:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] seq_cst, align 8
 ; CHECK-NEXT:    ret i32 [[RESULT]]
 ;
   %result = atomicrmw add ptr %ptr, i32 %val seq_cst, align 2

From 34e8f00066ee82cba63808e381eaaae89c1b1c79 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 8 Apr 2025 22:15:50 +0700
Subject: [PATCH 1002/1029] Attributor: Propagate align to cmpxchg
 instructions (#134838)

Fixes #134480
---
 llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 9 +++++++++
 llvm/test/Transforms/Attributor/align-atomic.ll  | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 717ba7f688548..cc6e846f4f211 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -5317,6 +5317,15 @@ struct AAAlignImpl : AAAlign {
             InstrChanged = ChangeStatus::CHANGED;
           }
         }
+      } else if (auto *CAS = dyn_cast<AtomicCmpXchgInst>(U.getUser())) {
+        if (CAS->getPointerOperand() == &AssociatedValue) {
+          if (CAS->getAlign() < getAssumedAlign()) {
+            STATS_DECLTRACK(AAAlign, AtomicCmpXchg,
+                            "Number of times alignment added to cmpxchg");
+            CAS->setAlignment(getAssumedAlign());
+            InstrChanged = ChangeStatus::CHANGED;
+          }
+        }
       }
     }
 
diff --git a/llvm/test/Transforms/Attributor/align-atomic.ll b/llvm/test/Transforms/Attributor/align-atomic.ll
index 0931c14685a87..0b363741cc168 100644
--- a/llvm/test/Transforms/Attributor/align-atomic.ll
+++ 
b/llvm/test/Transforms/Attributor/align-atomic.ll @@ -37,7 +37,7 @@ define ptr @atomicrmw_non_ptr_op_no_propagate(ptr %ptr, ptr align 16 %val) { define i32 @cmpxchg_propagate(ptr align 8 %ptr, i32 %cmp, i32 %val) { ; CHECK-LABEL: define i32 @cmpxchg_propagate( ; CHECK-SAME: ptr nofree noundef nonnull align 8 captures(none) dereferenceable(4) [[PTR:%.*]], i32 [[CMP:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[PAIR:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[VAL]] seq_cst monotonic, align 2 +; CHECK-NEXT: [[PAIR:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[VAL]] seq_cst monotonic, align 8 ; CHECK-NEXT: [[RESULT:%.*]] = extractvalue { i32, i1 } [[PAIR]], 0 ; CHECK-NEXT: ret i32 [[RESULT]] ; From 6c74fe9087fd85059158719ad1ab67e0f5098300 Mon Sep 17 00:00:00 2001 From: Sirraide Date: Tue, 8 Apr 2025 17:21:45 +0200 Subject: [PATCH 1003/1029] [Clang] [NFC] Tablegen component diags headers (#134777) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The component diagnostic headers (i.e. `DiagnosticAST.h` and friends) all follow the same format, and there’s enough of them (and in them) to where updating all of them has become rather tedious (at least it was for me while working on #132348), so this patch instead generates all of them (or rather their contents) via Tablegen. Also, it seems that `%enum_select` currently wouldn’t work in `DiagnosticCommonKinds.td` because the infrastructure for that was missing from `DiagnosticIDs.h`; this patch should fix that as well. --- clang/include/clang/Basic/CMakeLists.txt | 5 ++ clang/include/clang/Basic/DiagnosticAST.h | 40 +------------- .../include/clang/Basic/DiagnosticAnalysis.h | 39 +------------ clang/include/clang/Basic/DiagnosticComment.h | 40 +------------- clang/include/clang/Basic/DiagnosticCrossTU.h | 40 +------------- clang/include/clang/Basic/DiagnosticDriver.h | 40 +------------- .../include/clang/Basic/DiagnosticFrontend.h | 40 +------------- clang/include/clang/Basic/DiagnosticIDs.h | 28 ++-------- .../clang/Basic/DiagnosticInstallAPI.h | 38 +------------ clang/include/clang/Basic/DiagnosticLex.h | 39 +------------ clang/include/clang/Basic/DiagnosticParse.h | 40 +------------- .../clang/Basic/DiagnosticRefactoring.h | 40 +------------- clang/include/clang/Basic/DiagnosticSema.h | 41 +------------- .../clang/Basic/DiagnosticSerialization.h | 40 +------------- .../TableGen/ClangDiagnosticsEmitter.cpp | 55 +++++++++++++++++++ clang/utils/TableGen/TableGen.cpp | 6 ++ clang/utils/TableGen/TableGenBackends.h | 2 + 17 files changed, 85 insertions(+), 488 deletions(-) diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index 4d5e1eaa3facb..265ea1fc06494 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -13,6 +13,11 @@ macro(clang_diag_gen component) -gen-clang-diags-compat-ids -clang-component=${component} SOURCE Diagnostic.td TARGET ClangDiagnostic${component}CompatIDs) + + clang_tablegen(Diagnostic${component}Interface.inc + -gen-clang-diags-iface -clang-component=${component} + SOURCE Diagnostic.td + TARGET ClangDiagnostic${component}Interface) endmacro(clang_diag_gen) clang_diag_gen(Analysis) diff --git a/clang/include/clang/Basic/DiagnosticAST.h b/clang/include/clang/Basic/DiagnosticAST.h index 41e2598f7cc3b..be9e303d92629 100644 --- a/clang/include/clang/Basic/DiagnosticAST.h +++ b/clang/include/clang/Basic/DiagnosticAST.h @@ -10,44 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICAST_H #include 
"clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define ASTSTART -#include "clang/Basic/DiagnosticASTKinds.inc" -#undef DIAG - NUM_BUILTIN_AST_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticASTEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticASTCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticASTInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICAST_H diff --git a/clang/include/clang/Basic/DiagnosticAnalysis.h b/clang/include/clang/Basic/DiagnosticAnalysis.h index 5ead092b946c5..8e2635ffbd78d 100644 --- a/clang/include/clang/Basic/DiagnosticAnalysis.h +++ b/clang/include/clang/Basic/DiagnosticAnalysis.h @@ -10,43 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICANALYSIS_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define ANALYSISSTART -#include "clang/Basic/DiagnosticAnalysisKinds.inc" -#undef DIAG - NUM_BUILTIN_ANALYSIS_DIAGNOSTICS -}; -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticAnalysisEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) 
NAME = IDX, -#include "clang/Basic/DiagnosticAnalysisCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticAnalysisInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICANALYSIS_H diff --git a/clang/include/clang/Basic/DiagnosticComment.h b/clang/include/clang/Basic/DiagnosticComment.h index 08e66e8051834..f2a325c4e167a 100644 --- a/clang/include/clang/Basic/DiagnosticComment.h +++ b/clang/include/clang/Basic/DiagnosticComment.h @@ -10,44 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICCOMMENT_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define COMMENTSTART -#include "clang/Basic/DiagnosticCommentKinds.inc" -#undef DIAG - NUM_BUILTIN_COMMENT_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticCommentEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticCommentCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticCommentInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICCOMMENT_H diff --git a/clang/include/clang/Basic/DiagnosticCrossTU.h b/clang/include/clang/Basic/DiagnosticCrossTU.h index 761716d781446..4c0aa8cd26698 100644 --- a/clang/include/clang/Basic/DiagnosticCrossTU.h +++ b/clang/include/clang/Basic/DiagnosticCrossTU.h @@ -10,44 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICCROSSTU_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define CROSSTUSTART -#include "clang/Basic/DiagnosticCrossTUKinds.inc" -#undef DIAG - NUM_BUILTIN_CROSSTU_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticCrossTUEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) 
NAME = IDX, -#include "clang/Basic/DiagnosticCrossTUCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticCrossTUInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICCROSSTU_H diff --git a/clang/include/clang/Basic/DiagnosticDriver.h b/clang/include/clang/Basic/DiagnosticDriver.h index 864a23a49e4cd..e00dcedd68ab1 100644 --- a/clang/include/clang/Basic/DiagnosticDriver.h +++ b/clang/include/clang/Basic/DiagnosticDriver.h @@ -10,44 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICDRIVER_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define DRIVERSTART -#include "clang/Basic/DiagnosticDriverKinds.inc" -#undef DIAG - NUM_BUILTIN_DRIVER_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticDriverEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticDriverCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticDriverInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICDRIVER_H diff --git a/clang/include/clang/Basic/DiagnosticFrontend.h b/clang/include/clang/Basic/DiagnosticFrontend.h index 3506f05daae54..923ddd3b45ba0 100644 --- a/clang/include/clang/Basic/DiagnosticFrontend.h +++ b/clang/include/clang/Basic/DiagnosticFrontend.h @@ -10,44 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICFRONTEND_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define FRONTENDSTART -#include "clang/Basic/DiagnosticFrontendKinds.inc" -#undef DIAG - NUM_BUILTIN_FRONTEND_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticFrontendEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticFrontendCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticFrontendInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICFRONTEND_H diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index f936d4fb7a403..80d52a0d01112 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -71,17 +71,6 @@ namespace clang { /// All of the diagnostics that can be emitted by the frontend. 
typedef unsigned kind; - // Get typedefs for common diagnostics. - enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, CATEGORY, \ - NOWERROR, SHOWINSYSHEADER, SHOWINSYSMACRO, DEFFERABLE) \ - ENUM, -#define COMMONSTART -#include "clang/Basic/DiagnosticCommonKinds.inc" - NUM_BUILTIN_COMMON_DIAGNOSTICS -#undef DIAG - }; - /// Enum values that allow the client to map NOTEs, WARNINGs, and EXTENSIONs /// to either Ignore (nothing), Remark (emit a remark), Warning /// (emit a warning) or Error (emit as an error). It allows clients to @@ -103,20 +92,13 @@ namespace clang { Remark ///< A diagnostic that indicates normal progress through ///< compilation. }; - } + } // end namespace diag +} // end namespace clang - namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticCommonCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END - } // end namespace diag_compat +// This has to be included *after* the DIAG_START_ enums above are defined. +#include "clang/Basic/DiagnosticCommonInterface.inc" +namespace clang { class DiagnosticMapping { LLVM_PREFERRED_TYPE(diag::Severity) unsigned Severity : 3; diff --git a/clang/include/clang/Basic/DiagnosticInstallAPI.h b/clang/include/clang/Basic/DiagnosticInstallAPI.h index 4619bfeea05a2..9d814522270e5 100644 --- a/clang/include/clang/Basic/DiagnosticInstallAPI.h +++ b/clang/include/clang/Basic/DiagnosticInstallAPI.h @@ -10,42 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H #include "clang/Basic/Diagnostic.h" -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define INSTALLAPISTART -#include "clang/Basic/DiagnosticInstallAPIKinds.inc" -#undef DIAG - NUM_BUILTIN_INSTALLAPI_DIAGNOSTICS -}; +#include "clang/Basic/DiagnosticInstallAPIInterface.inc" -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticInstallAPIEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) 
NAME = IDX, -#include "clang/Basic/DiagnosticInstallAPICompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // namespace clang #endif // LLVM_CLANG_BASIC_DIAGNOSTICINSTALLAPI_H diff --git a/clang/include/clang/Basic/DiagnosticLex.h b/clang/include/clang/Basic/DiagnosticLex.h index 6fa90f785bbf8..f20f4fcd2d3d7 100644 --- a/clang/include/clang/Basic/DiagnosticLex.h +++ b/clang/include/clang/Basic/DiagnosticLex.h @@ -10,43 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICLEX_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define LEXSTART -#include "clang/Basic/DiagnosticLexKinds.inc" -#undef DIAG - NUM_BUILTIN_LEX_DIAGNOSTICS -}; -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticLexEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticLexCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticLexInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICLEX_H diff --git a/clang/include/clang/Basic/DiagnosticParse.h b/clang/include/clang/Basic/DiagnosticParse.h index e2a4368a59c4b..1e025bdf02415 100644 --- a/clang/include/clang/Basic/DiagnosticParse.h +++ b/clang/include/clang/Basic/DiagnosticParse.h @@ -10,44 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICPARSE_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define PARSESTART -#include "clang/Basic/DiagnosticParseKinds.inc" -#undef DIAG - NUM_BUILTIN_PARSE_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticParseEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) 
NAME = IDX, -#include "clang/Basic/DiagnosticParseCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticParseInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICPARSE_H diff --git a/clang/include/clang/Basic/DiagnosticRefactoring.h b/clang/include/clang/Basic/DiagnosticRefactoring.h index b3f3a10925f09..380060e51d0f6 100644 --- a/clang/include/clang/Basic/DiagnosticRefactoring.h +++ b/clang/include/clang/Basic/DiagnosticRefactoring.h @@ -10,44 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICREFACTORING_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define REFACTORINGSTART -#include "clang/Basic/DiagnosticRefactoringKinds.inc" -#undef DIAG - NUM_BUILTIN_REFACTORING_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticRefactoringEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX, -#include "clang/Basic/DiagnosticRefactoringCompatIDs.inc" -#undef DIAG_COMPAT_ID -#undef DIAG_COMPAT_IDS_BEGIN -#undef DIAG_COMPAT_IDS_END -} // end namespace diag_compat -} // end namespace clang +#include "clang/Basic/DiagnosticRefactoringInterface.inc" #endif // LLVM_CLANG_BASIC_DIAGNOSTICREFACTORING_H diff --git a/clang/include/clang/Basic/DiagnosticSema.h b/clang/include/clang/Basic/DiagnosticSema.h index 943b2f64f427e..26a5f719a299a 100644 --- a/clang/include/clang/Basic/DiagnosticSema.h +++ b/clang/include/clang/Basic/DiagnosticSema.h @@ -10,45 +10,6 @@ #define LLVM_CLANG_BASIC_DIAGNOSTICSEMA_H #include "clang/Basic/Diagnostic.h" - -namespace clang { -namespace diag { -enum { -#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR, \ - SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY) \ - ENUM, -#define SEMASTART -#include "clang/Basic/DiagnosticSemaKinds.inc" -#undef DIAG - NUM_BUILTIN_SEMA_DIAGNOSTICS -}; - -#define DIAG_ENUM(ENUM_NAME) \ - namespace ENUM_NAME { \ - enum { -#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX, -#define DIAG_ENUM_END() \ - } \ - ; \ - } -#include "clang/Basic/DiagnosticSemaEnums.inc" -#undef DIAG_ENUM_END -#undef DIAG_ENUM_ITEM -#undef DIAG_ENUM - -} // end namespace diag - -namespace diag_compat { -#define DIAG_COMPAT_IDS_BEGIN() enum { -#define DIAG_COMPAT_IDS_END() \ - } \ - ; -#define DIAG_COMPAT_ID(IDX, NAME, ...) 
NAME = IDX,
-#include "clang/Basic/DiagnosticSemaCompatIDs.inc"
-#undef DIAG_COMPAT_ID
-#undef DIAG_COMPAT_IDS_BEGIN
-#undef DIAG_COMPAT_IDS_END
-} // end namespace diag_compat
-} // end namespace clang
+#include "clang/Basic/DiagnosticSemaInterface.inc"
 
 #endif // LLVM_CLANG_BASIC_DIAGNOSTICSEMA_H
diff --git a/clang/include/clang/Basic/DiagnosticSerialization.h b/clang/include/clang/Basic/DiagnosticSerialization.h
index c8fb034e9bd4a..27df12e5f098b 100644
--- a/clang/include/clang/Basic/DiagnosticSerialization.h
+++ b/clang/include/clang/Basic/DiagnosticSerialization.h
@@ -10,44 +10,6 @@
 #define LLVM_CLANG_BASIC_DIAGNOSTICSERIALIZATION_H
 
 #include "clang/Basic/Diagnostic.h"
-
-namespace clang {
-namespace diag {
-enum {
-#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR,      \
-             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY)            \
-  ENUM,
-#define SERIALIZATIONSTART
-#include "clang/Basic/DiagnosticSerializationKinds.inc"
-#undef DIAG
-  NUM_BUILTIN_SERIALIZATION_DIAGNOSTICS
-};
-
-#define DIAG_ENUM(ENUM_NAME)                                                   \
-  namespace ENUM_NAME {                                                        \
-  enum {
-#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX,
-#define DIAG_ENUM_END()                                                        \
-  }                                                                            \
-  ;                                                                            \
-  }
-#include "clang/Basic/DiagnosticSerializationEnums.inc"
-#undef DIAG_ENUM_END
-#undef DIAG_ENUM_ITEM
-#undef DIAG_ENUM
-} // end namespace diag
-
-namespace diag_compat {
-#define DIAG_COMPAT_IDS_BEGIN() enum {
-#define DIAG_COMPAT_IDS_END()                                                  \
-  }                                                                            \
-  ;
-#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX,
-#include "clang/Basic/DiagnosticSerializationCompatIDs.inc"
-#undef DIAG_COMPAT_ID
-#undef DIAG_COMPAT_IDS_BEGIN
-#undef DIAG_COMPAT_IDS_END
-} // end namespace diag_compat
-} // end namespace clang
+#include "clang/Basic/DiagnosticSerializationInterface.inc"
 
 #endif // LLVM_CLANG_BASIC_DIAGNOSTICSERIALIZATION_H
diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
index 73facbc916714..cb309fb2e19e9 100644
--- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
+++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Format.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/StringToOffsetTable.h"
@@ -1562,6 +1563,60 @@ void clang::EmitClangDiagsCompatIDs(const llvm::RecordKeeper &Records,
   OS << "DIAG_COMPAT_IDS_END()\n";
 }
 
+/// ClangDiagsInterfaceEmitter - Emit the diagnostics interface header for
+/// a Clang component.
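+/// The generated interface is what the per-component headers (e.g.
+/// DiagnosticAST.h) now include in place of the hand-written enum
+/// definitions.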
+void clang::EmitClangDiagsInterface(llvm::raw_ostream &OS,
+                                    const std::string &Component) {
+  if (Component.empty())
+    PrintFatalError("'-gen-clang-diags-iface' requires a component name");
+
+  std::string ComponentUpper = StringRef(Component).upper();
+  const char *Comp = Component.c_str();
+  const char *Upper = ComponentUpper.c_str();
+
+  OS << llvm::format(R"c++(
+namespace clang {
+namespace diag {
+enum {
+#define DIAG(ENUM, FLAGS, DEFAULT_MAPPING, DESC, GROUP, SFINAE, NOWERROR,      \
+             SHOWINSYSHEADER, SHOWINSYSMACRO, DEFERRABLE, CATEGORY)            \
+  ENUM,
+#define %sSTART
+#include "clang/Basic/Diagnostic%sKinds.inc"
+#undef DIAG
+  NUM_BUILTIN_%s_DIAGNOSTICS
+};
+
+#define DIAG_ENUM(ENUM_NAME)                                                   \
+  namespace ENUM_NAME {                                                        \
+  enum {
+#define DIAG_ENUM_ITEM(IDX, NAME) NAME = IDX,
+#define DIAG_ENUM_END()                                                        \
+  }                                                                            \
+  ;                                                                            \
+  }
+#include "clang/Basic/Diagnostic%sEnums.inc"
+#undef DIAG_ENUM_END
+#undef DIAG_ENUM_ITEM
+#undef DIAG_ENUM
+} // end namespace diag
+
+namespace diag_compat {
+#define DIAG_COMPAT_IDS_BEGIN() enum {
+#define DIAG_COMPAT_IDS_END()                                                  \
+  }                                                                            \
+  ;
+#define DIAG_COMPAT_ID(IDX, NAME, ...) NAME = IDX,
+#include "clang/Basic/Diagnostic%sCompatIDs.inc"
+#undef DIAG_COMPAT_ID
+#undef DIAG_COMPAT_IDS_BEGIN
+#undef DIAG_COMPAT_IDS_END
+} // end namespace diag_compat
+} // end namespace clang
+)c++",
+                     Upper, Comp, Upper, Comp, Comp);
+}
+
 /// ClangDiagsEnumsEmitter - The top-level class emits .def files containing
 /// declarations of Clang diagnostic enums for selects.
 void clang::EmitClangDiagsEnums(const RecordKeeper &Records, raw_ostream &OS,
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
index 4d3d56ed4b9d7..a2c6f002f7359 100644
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -53,6 +53,7 @@ enum ActionType {
   GenClangDiagsEnums,
   GenClangDiagGroups,
   GenClangDiagsIndexName,
+  GenClangDiagsInterface,
   GenClangCommentNodes,
   GenClangDeclNodes,
   GenClangStmtNodes,
@@ -187,6 +188,8 @@ cl::opt<ActionType> Action(
                    "Generate Clang diagnostic groups"),
         clEnumValN(GenClangDiagsIndexName, "gen-clang-diags-index-name",
                    "Generate Clang diagnostic name index"),
+        clEnumValN(GenClangDiagsInterface, "gen-clang-diags-iface",
+                   "Generate Clang diagnostic interface headers"),
         clEnumValN(GenClangBasicReader, "gen-clang-basic-reader",
                    "Generate Clang BasicReader classes"),
         clEnumValN(GenClangBasicWriter, "gen-clang-basic-writer",
@@ -417,6 +420,9 @@ bool ClangTableGenMain(raw_ostream &OS, const RecordKeeper &Records) {
   case GenClangDiagsIndexName:
     EmitClangDiagsIndexName(Records, OS);
     break;
+  case GenClangDiagsInterface:
+    EmitClangDiagsInterface(OS, ClangComponent);
+    break;
   case GenClangCommentNodes:
     EmitClangASTNodes(Records, OS, CommentNodeClassName, "");
     break;
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index c26ce2825ea99..54031147d38e1 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -102,6 +102,8 @@ void EmitClangDiagGroups(const llvm::RecordKeeper &Records,
                          llvm::raw_ostream &OS);
 void EmitClangDiagsIndexName(const llvm::RecordKeeper &Records,
                              llvm::raw_ostream &OS);
+void EmitClangDiagsInterface(llvm::raw_ostream &OS,
+                             const std::string &Component);
 void EmitClangSACheckers(const llvm::RecordKeeper &Records,
                          llvm::raw_ostream &OS);
 
From 4e9cfcf6afa340b3ffae5996f6a511951f528ce1 Mon Sep 17 00:00:00 2001
From: Christian Sigg
Date: Tue, 8 Apr 2025 17:28:20 +0200
Subject: [PATCH 1004/1029] [llvm][bazel] Fix BUILD after
 561506144531cf0a760bb437fd74c683931c60ae.
---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 7431ff306b4d7..b77ddf634eec6 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -4120,6 +4120,8 @@ cc_library(
     deps = [
         ":AllTargetsCodeGens",
        ":BinaryFormat",
+        ":BitReader",
+        ":BitstreamReader",
        ":CodeGen",
        ":CodeGenTypes",
        ":DWARFLinker",

From 76d2e0881e19359e262043a149474049f94ea348 Mon Sep 17 00:00:00 2001
From: tdanyluk
Date: Tue, 8 Apr 2025 17:34:20 +0200
Subject: [PATCH 1005/1029] [mlir] fix references of attributes which are not defined earlier (#134364)

If an attribute is not defined earlier in the same file, but only
referenced directly from its dialect, the emitted check is currently
wrong.

What it would emit for #toy.shape<[1, 2, 3]>:

Earlier:
// CHECK: #[['?']]<[1, 2, 3]>

Now:
// CHECK: #toy.shape<[1, 2, 3]>
---
 mlir/utils/generate-test-checks.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py
index 749bfa13fe734..07440990a58d7 100755
--- a/mlir/utils/generate-test-checks.py
+++ b/mlir/utils/generate-test-checks.py
@@ -145,10 +145,9 @@ def generate_name(self, source_attribute_name):
         return attribute_name
 
     # Get the saved substitution name for the given attribute name. If no name
-    # has been generated for the given attribute yet, the source attribute name
-    # itself is returned.
+    # has been generated for the given attribute yet, None is returned.
@@ -227,9 +226,9 @@ def process_attribute_references(line, attribute_namer): components = ATTR_RE.split(line) for component in components: m = ATTR_RE.match(component) - if m: - output_line += '#[[' + attribute_namer.get_name(m.group(1)) + ']]' - output_line += component[len(m.group()):] + attribute_name = attribute_namer.get_name(m.group(1)) if m else None + if attribute_name: + output_line += f"#[[{attribute_name}]]{component[len(m.group()):]}" else: output_line += component return output_line From bd49d278c6aa9ac5cc8e5917003d4f710887548d Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Tue, 8 Apr 2025 23:46:13 +0800 Subject: [PATCH 1006/1029] [clang-tidy][NFC] update test name and config for bugprone-unintended-char-ostream-output (#134868) --- .../bugprone/unintended-char-ostream-output-cast-type.cpp | 6 +++--- .../checkers/bugprone/unintended-char-ostream-output.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output-cast-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output-cast-type.cpp index faea4127ac44a..72020d90e0369 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output-cast-type.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output-cast-type.cpp @@ -1,6 +1,6 @@ // RUN: %check_clang_tidy %s bugprone-unintended-char-ostream-output %t -- \ // RUN: -config="{CheckOptions: \ -// RUN: {bugprone-unintended-char-ostream-output.CastTypeName: "uint8_t"}}" +// RUN: {bugprone-unintended-char-ostream-output.CastTypeName: \"unsigned char\"}}" namespace std { @@ -33,12 +33,12 @@ void origin_ostream(std::ostream &os) { unsigned char unsigned_value = 9; os << unsigned_value; // CHECK-MESSAGES: [[@LINE-1]]:6: warning: 'unsigned char' passed to 'operator<<' outputs as character instead of integer - // CHECK-FIXES: os << static_cast(unsigned_value); + // CHECK-FIXES: os << static_cast(unsigned_value); signed char signed_value = 9; os << signed_value; // CHECK-MESSAGES: [[@LINE-1]]:6: warning: 'signed char' passed to 'operator<<' outputs as character instead of integer - // CHECK-FIXES: os << static_cast(signed_value); + // CHECK-FIXES: os << static_cast(signed_value); char char_value = 9; os << char_value; diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output.cpp index 0a5cdeb21c01e..573c429bf049f 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unintended-char-ostream-output.cpp @@ -57,7 +57,7 @@ void based_on_ostream(A &os) { os << char_value; } -void based_on_ostream(std::basic_ostream &os) { +void other_ostream_template_parameters(std::basic_ostream &os) { unsigned char unsigned_value = 9; os << unsigned_value; From b0cb672b9968eeee6eb022e98476957dbdf8e6e2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 8 Apr 2025 22:53:17 +0700 Subject: [PATCH 1007/1029] Inline: Propagate callsite nofpclass attribute (#134800) Fixes #134070 --- clang/test/CodeGenHLSL/builtins/distance.hlsl | 32 ++-- clang/test/CodeGenHLSL/builtins/length.hlsl | 32 ++-- clang/test/CodeGenHLSL/builtins/reflect.hlsl | 24 +-- .../test/CodeGenHLSL/builtins/smoothstep.hlsl | 16 +- clang/test/Headers/__clang_hip_cmath.hip | 52 +++++- 
clang/test/Headers/__clang_hip_math.hip | 172 +++++++++++------- llvm/lib/Transforms/Utils/InlineFunction.cpp | 10 +- .../Inline/access-attributes-prop.ll | 44 +++++ 8 files changed, 251 insertions(+), 131 deletions(-) diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl index e830903261c8c..ac38cf1853799 100644 --- a/clang/test/CodeGenHLSL/builtins/distance.hlsl +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl @@ -10,14 +10,14 @@ // CHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[SUB_I]]) +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: ret half [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[SUB_I]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret half [[ELT_ABS_I]] // half test_distance_half(half X, half Y) { return distance(X, Y); } @@ -26,7 +26,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); } // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[SUB_I]], <2 x half> [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[SUB_I]], <2 x half> nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // @@ -34,7 +34,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); } // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } @@ -43,7 +43,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], 
<3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[SUB_I]], <3 x half> [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[SUB_I]], <3 x half> nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // @@ -51,7 +51,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } @@ -60,7 +60,7 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[SUB_I]], <4 x half> [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[SUB_I]], <4 x half> nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // @@ -68,7 +68,7 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } @@ -77,14 +77,14 @@ half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } // CHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[SUB_I]]) 
+// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: ret float [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[SUB_I]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret float [[ELT_ABS_I]] // float test_distance_float(float X, float Y) { return distance(X, Y); } @@ -93,7 +93,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); } // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[SUB_I]], <2 x float> [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[SUB_I]], <2 x float> nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // @@ -101,7 +101,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); } // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } @@ -110,7 +110,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[SUB_I]], <3 x float> [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[SUB_I]], <3 x float> nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // @@ -118,7 +118,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } // SPVCHECK-SAME: <3 x 
float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } @@ -127,7 +127,7 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[SUB_I]], <4 x float> [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[SUB_I]], <4 x float> nofpclass(nan inf) [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // @@ -135,7 +135,7 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> nofpclass(nan inf) [[SUB_I]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float4(float4 X, float4 Y) { return distance(X, Y); } diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl index 2d4bbd995298f..0b17d03d7097d 100644 --- a/clang/test/CodeGenHLSL/builtins/length.hlsl +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl @@ -14,13 +14,13 @@ // CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // CHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[P0]]) +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]]) // CHECK-NEXT: ret half [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[P0]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half 
@llvm.fabs.f16(half nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret half [[ELT_ABS_I]] // half test_length_half(half p0) @@ -35,14 +35,14 @@ half test_length_half(half p0) // CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[P0]], <2 x half> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[P0]], <2 x half> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_length_half2(half2 p0) @@ -54,14 +54,14 @@ half test_length_half2(half2 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[P0]], <3 x half> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[P0]], <3 x half> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_length_half3(half3 p0) @@ -73,14 +73,14 @@ half test_length_half3(half3 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[P0]], <4 x half> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[P0]], <4 x half> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // // 
SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_length_half4(half4 p0) @@ -92,13 +92,13 @@ half test_length_half4(half4 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf( // CHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[P0]]) +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]]) // CHECK-NEXT: ret float [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[P0]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret float [[ELT_ABS_I]] // float test_length_float(float p0) @@ -110,14 +110,14 @@ float test_length_float(float p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[P0]], <2 x float> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[P0]], <2 x float> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_length_float2(float2 p0) @@ -129,14 +129,14 @@ float test_length_float2(float2 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[P0]], <3 x float> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call 
reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[P0]], <3 x float> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_length_float3(float3 p0) @@ -148,14 +148,14 @@ float test_length_float3(float3 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[P0]], <4 x float> [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[P0]], <4 x float> nofpclass(nan inf) [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> nofpclass(nan inf) [[P0]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_length_float4(float4 p0) diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl index 35ee059697c4b..c082e63ac1da6 100644 --- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl +++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl @@ -31,7 +31,7 @@ half test_reflect_half(half I, half N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[I]], <2 x half> [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <2 x i32> zeroinitializer @@ -42,7 +42,7 @@ half test_reflect_half(half I, half N) { // SPVCHECK-LABEL: 
define spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> [[I]], <2 x half> [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]]) // SPVCHECK-NEXT: ret <2 x half> [[SPV_REFLECT_I]] // half2 test_reflect_half2(half2 I, half2 N) { @@ -52,7 +52,7 @@ half2 test_reflect_half2(half2 I, half2 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[I]], <3 x half> [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x half> poison, half [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <3 x i32> zeroinitializer @@ -63,7 +63,7 @@ half2 test_reflect_half2(half2 I, half2 N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> [[I]], <3 x half> [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]]) // SPVCHECK-NEXT: ret <3 x half> [[SPV_REFLECT_I]] // half3 test_reflect_half3(half3 I, half3 N) { @@ -73,7 +73,7 @@ half3 test_reflect_half3(half3 I, half3 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[I]], <4 x half> [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> zeroinitializer @@ -84,7 +84,7 @@ half3 test_reflect_half3(half3 I, half3 N) { // SPVCHECK-LABEL: define spir_func noundef 
nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> [[I]], <4 x half> [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]]) // SPVCHECK-NEXT: ret <4 x half> [[SPV_REFLECT_I]] // half4 test_reflect_half4(half4 I, half4 N) { @@ -116,7 +116,7 @@ float test_reflect_float(float I, float N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[I]], <2 x float> [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer @@ -127,7 +127,7 @@ float test_reflect_float(float I, float N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> [[I]], <2 x float> [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]]) // SPVCHECK-NEXT: ret <2 x float> [[SPV_REFLECT_I]] // float2 test_reflect_float2(float2 I, float2 N) { @@ -137,7 +137,7 @@ float2 test_reflect_float2(float2 I, float2 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[I]], <3 x float> [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x float> poison, float [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <3 x i32> zeroinitializer @@ -148,7 +148,7 @@ float2 test_reflect_float2(float2 I, float2 N) { // 
SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> [[I]], <3 x float> [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]]) // SPVCHECK-NEXT: ret <3 x float> [[SPV_REFLECT_I]] // float3 test_reflect_float3(float3 I, float3 N) { @@ -158,7 +158,7 @@ float3 test_reflect_float3(float3 I, float3 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[I]], <4 x float> [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> zeroinitializer @@ -169,7 +169,7 @@ float3 test_reflect_float3(float3 I, float3 N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> [[I]], <4 x float> [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]]) // SPVCHECK-NEXT: ret <4 x float> [[SPV_REFLECT_I]] // float4 test_reflect_float4(float4 I, float4 N) { diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl index f2328c7330e6c..d3e5c1059029c 100644 --- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl +++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl @@ -22,7 +22,7 @@ // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z20test_smoothstep_halfDhDhDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.smoothstep.f16(half [[MIN]], half [[MAX]], half [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.smoothstep.f16(half nofpclass(nan inf) [[MIN]], half nofpclass(nan inf) [[MAX]], half nofpclass(nan inf) [[X]]) 
// SPVCHECK-NEXT: ret half [[SPV_SMOOTHSTEP_I]] // half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, Max, X); } @@ -43,7 +43,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.smoothstep.v2f16(<2 x half> [[MIN]], <2 x half> [[MAX]], <2 x half> [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.smoothstep.v2f16(<2 x half> nofpclass(nan inf) [[MIN]], <2 x half> nofpclass(nan inf) [[MAX]], <2 x half> nofpclass(nan inf) [[X]]) // SPVCHECK-NEXT: ret <2 x half> [[SPV_SMOOTHSTEP_I]] // half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(Min, Max, X); } @@ -64,7 +64,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.smoothstep.v3f16(<3 x half> [[MIN]], <3 x half> [[MAX]], <3 x half> [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.smoothstep.v3f16(<3 x half> nofpclass(nan inf) [[MIN]], <3 x half> nofpclass(nan inf) [[MAX]], <3 x half> nofpclass(nan inf) [[X]]) // SPVCHECK-NEXT: ret <3 x half> [[SPV_SMOOTHSTEP_I]] // half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(Min, Max, X); } @@ -85,7 +85,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.smoothstep.v4f16(<4 x half> [[MIN]], <4 x half> [[MAX]], <4 x half> [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.smoothstep.v4f16(<4 x half> nofpclass(nan inf) [[MIN]], <4 x half> nofpclass(nan inf) [[MAX]], <4 x half> nofpclass(nan inf) [[X]]) // SPVCHECK-NEXT: ret <4 x half> [[SPV_SMOOTHSTEP_I]] // half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(Min, Max, X); } @@ -106,7 +106,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) 
[[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.smoothstep.f32(float [[MIN]], float [[MAX]], float [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.smoothstep.f32(float nofpclass(nan inf) [[MIN]], float nofpclass(nan inf) [[MAX]], float nofpclass(nan inf) [[X]]) // SPVCHECK-NEXT: ret float [[SPV_SMOOTHSTEP_I]] // float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(Min, Max, X); } @@ -127,7 +127,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.smoothstep.v2f32(<2 x float> [[MIN]], <2 x float> [[MAX]], <2 x float> [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.smoothstep.v2f32(<2 x float> nofpclass(nan inf) [[MIN]], <2 x float> nofpclass(nan inf) [[MAX]], <2 x float> nofpclass(nan inf) [[X]]) // SPVCHECK-NEXT: ret <2 x float> [[SPV_SMOOTHSTEP_I]] // float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smoothstep(Min, Max, X); } @@ -148,7 +148,7 @@ float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smooths // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.smoothstep.v3f32(<3 x float> [[MIN]], <3 x float> [[MAX]], <3 x float> [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.smoothstep.v3f32(<3 x float> nofpclass(nan inf) [[MIN]], <3 x float> nofpclass(nan inf) [[MAX]], <3 x float> nofpclass(nan inf) [[X]]) // SPVCHECK-NEXT: ret <3 x float> [[SPV_SMOOTHSTEP_I]] // float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smoothstep(Min, Max, X); } @@ -169,7 +169,7 @@ float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smooths // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.smoothstep.v4f32(<4 x float> [[MIN]], <4 x float> [[MAX]], <4 x float> [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.smoothstep.v4f32(<4 
x float> nofpclass(nan inf) [[MIN]], <4 x float> nofpclass(nan inf) [[MAX]], <4 x float> nofpclass(nan inf) [[X]]) // SPVCHECK-NEXT: ret <4 x float> [[SPV_SMOOTHSTEP_I]] // float4 test_smoothstep_float4(float4 Min, float4 Max, float4 X) { return smoothstep(Min, Max, X); } diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip index 0c9ff4cdd7808..7d812fd0265a6 100644 --- a/clang/test/Headers/__clang_hip_cmath.hip +++ b/clang/test/Headers/__clang_hip_cmath.hip @@ -24,7 +24,7 @@ // // FINITEONLY-LABEL: @test_fma_f16( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef half @llvm.fma.f16(half [[X:%.*]], half [[Y:%.*]], half [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef half @llvm.fma.f16(half nofpclass(nan inf) [[X:%.*]], half nofpclass(nan inf) [[Y:%.*]], half nofpclass(nan inf) [[Z:%.*]]) // FINITEONLY-NEXT: ret half [[TMP0]] // extern "C" __device__ _Float16 test_fma_f16(_Float16 x, _Float16 y, @@ -34,12 +34,12 @@ extern "C" __device__ _Float16 test_fma_f16(_Float16 x, _Float16 y, // DEFAULT-LABEL: @test_pow_f16( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef half @__ocml_pown_f16(half noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef half @__ocml_pown_f16(half noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR9:[0-9]+]] // DEFAULT-NEXT: ret half [[CALL_I]] // // FINITEONLY-LABEL: @test_pow_f16( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) half @__ocml_pown_f16(half noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) half @__ocml_pown_f16(half noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR9:[0-9]+]] // FINITEONLY-NEXT: ret half [[CALL_I]] // extern "C" __device__ _Float16 test_pow_f16(_Float16 x, int y) { @@ -53,7 +53,7 @@ extern "C" __device__ _Float16 test_pow_f16(_Float16 x, int y) { // // FINITEONLY-LABEL: @test_fabs_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_fabs_f32(float x) { @@ -62,12 +62,12 @@ extern "C" __device__ float test_fabs_f32(float x) { // DEFAULT-LABEL: @test_sin_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR10:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_sin_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR10:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_sin_f32(float x) { @@ -76,12 +76,12 @@ extern "C" 
__device__ float test_sin_f32(float x) { // DEFAULT-LABEL: @test_cos_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR10]] // DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_cos_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR10]] // FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_cos_f32(float x) { @@ -97,10 +97,46 @@ struct user_bfloat16 { }; namespace user_namespace { +// DEFAULT-LABEL: @_ZN14user_namespace3fmaE13user_bfloat16S0_S0_( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: ret void +// +// FINITEONLY-LABEL: @_ZN14user_namespace3fmaE13user_bfloat16S0_S0_( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: ret void +// __device__ user_bfloat16 fma(const user_bfloat16 a, const user_bfloat16 b, const user_bfloat16 c) { return a; } +// DEFAULT-LABEL: @_ZN14user_namespace8test_fmaEv( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: [[A:%.*]] = alloca [[STRUCT_USER_BFLOAT16:%.*]], align 1, addrspace(5) +// DEFAULT-NEXT: [[B:%.*]] = alloca [[STRUCT_USER_BFLOAT16]], align 1, addrspace(5) +// DEFAULT-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr +// DEFAULT-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11:[0-9]+]] +// DEFAULT-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[A_ASCAST]], float noundef 1.000000e+00) #[[ATTR10]] +// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] +// DEFAULT-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[B_ASCAST]], float noundef 2.000000e+00) #[[ATTR10]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] +// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11]] +// DEFAULT-NEXT: ret void +// +// FINITEONLY-LABEL: @_ZN14user_namespace8test_fmaEv( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: [[A:%.*]] = alloca [[STRUCT_USER_BFLOAT16:%.*]], align 1, addrspace(5) +// FINITEONLY-NEXT: [[B:%.*]] = alloca [[STRUCT_USER_BFLOAT16]], align 1, addrspace(5) +// FINITEONLY-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr +// FINITEONLY-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11:[0-9]+]] +// FINITEONLY-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[A_ASCAST]], float noundef nofpclass(nan inf) 1.000000e+00) #[[ATTR10]] +// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] +// FINITEONLY-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[B_ASCAST]], float noundef nofpclass(nan inf) 2.000000e+00) #[[ATTR10]] +// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] +// 
FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11]] +// FINITEONLY-NEXT: ret void +// __global__ void test_fma() { user_bfloat16 a = 1.0f, b = 2.0f; fma(a, b, b); diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index e879fec0ebe5a..df1cd716342a5 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -842,7 +842,7 @@ extern "C" __device__ double test_cbrt(double x) { // // FINITEONLY-LABEL: @test_ceilf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ceil.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ceil.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_ceilf( @@ -866,7 +866,7 @@ extern "C" __device__ float test_ceilf(float x) { // // FINITEONLY-LABEL: @test_ceil( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ceil.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ceil.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_ceil( @@ -890,7 +890,7 @@ extern "C" __device__ double test_ceil(double x) { // // FINITEONLY-LABEL: @test_copysignf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.copysign.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_copysignf( @@ -914,7 +914,7 @@ extern "C" __device__ float test_copysignf(float x, float y) { // // FINITEONLY-LABEL: @test_copysign( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.copysign.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_copysign( @@ -1274,7 +1274,7 @@ extern "C" __device__ double test_erfinv(double x) { // // FINITEONLY-LABEL: @test_exp10f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp10.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp10.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_exp10f( @@ -1322,7 +1322,7 @@ extern "C" __device__ double test_exp10(double x) { // // FINITEONLY-LABEL: @test_exp2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp2.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp2.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_exp2f( @@ -1370,7 +1370,7 @@ extern "C" __device__ double test_exp2(double x) { // // FINITEONLY-LABEL: @test_expf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: 
[[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_expf( @@ -1466,7 +1466,7 @@ extern "C" __device__ double test_expm1(double x) { // // FINITEONLY-LABEL: @test_fabsf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fabsf( @@ -1490,7 +1490,7 @@ extern "C" __device__ float test_fabsf(float x) { // // FINITEONLY-LABEL: @test_fabs( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fabs.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fabs.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fabs( @@ -1586,7 +1586,7 @@ extern "C" __device__ float test_fdividef(float x, float y) { // // FINITEONLY-LABEL: @test_floorf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.floor.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.floor.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_floorf( @@ -1610,7 +1610,7 @@ extern "C" __device__ float test_floorf(float x) { // // FINITEONLY-LABEL: @test_floor( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.floor.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.floor.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_floor( @@ -1634,7 +1634,7 @@ extern "C" __device__ double test_floor(double x) { // // FINITEONLY-LABEL: @test_fmaf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]], float nofpclass(nan inf) [[Z:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fmaf( @@ -1658,7 +1658,7 @@ extern "C" __device__ float test_fmaf(float x, float y, float z) { // // FINITEONLY-LABEL: @test_fma( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fma( @@ -1682,7 +1682,7 @@ extern "C" __device__ double test_fma(double x, double y, double z) { // // FINITEONLY-LABEL: @test_fma_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan 
inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fma_rn( @@ -1706,7 +1706,7 @@ extern "C" __device__ double test_fma_rn(double x, double y, double z) { // // FINITEONLY-LABEL: @test_fmaxf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fmaxf( @@ -1730,7 +1730,7 @@ extern "C" __device__ float test_fmaxf(float x, float y) { // // FINITEONLY-LABEL: @test_fmax( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fmax( @@ -1754,7 +1754,7 @@ extern "C" __device__ double test_fmax(double x, double y) { // // FINITEONLY-LABEL: @test_fminf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fminf( @@ -1778,7 +1778,7 @@ extern "C" __device__ float test_fminf(float x, float y) { // // FINITEONLY-LABEL: @test_fmin( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fmin( @@ -1843,13 +1843,29 @@ extern "C" __device__ double test_fmod(double x, double y) { return fmod(x, y); } -// CHECK-LABEL: @test_frexpf( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// CHECK-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 -// CHECK-NEXT: ret float [[TMP2]] +// DEFAULT-LABEL: @test_frexpf( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] +// DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 +// DEFAULT-NEXT: ret float [[TMP2]] +// +// FINITEONLY-LABEL: @test_frexpf( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] +// 
FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 +// FINITEONLY-NEXT: ret float [[TMP2]] +// +// APPROX-LABEL: @test_frexpf( +// APPROX-NEXT: entry: +// APPROX-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// APPROX-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] +// APPROX-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 +// APPROX-NEXT: ret float [[TMP2]] // // AMDGCNSPIRV-LABEL: @test_frexpf( // AMDGCNSPIRV-NEXT: entry: @@ -1863,13 +1879,29 @@ extern "C" __device__ float test_frexpf(float x, int* y) { return frexpf(x, y); } -// CHECK-LABEL: @test_frexp( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) -// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// CHECK-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] -// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 -// CHECK-NEXT: ret double [[TMP2]] +// DEFAULT-LABEL: @test_frexp( +// DEFAULT-NEXT: entry: +// DEFAULT-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] +// DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 +// DEFAULT-NEXT: ret double [[TMP2]] +// +// FINITEONLY-LABEL: @test_frexp( +// FINITEONLY-NEXT: entry: +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] +// FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 +// FINITEONLY-NEXT: ret double [[TMP2]] +// +// APPROX-LABEL: @test_frexp( +// APPROX-NEXT: entry: +// APPROX-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// APPROX-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] +// APPROX-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 +// APPROX-NEXT: ret double [[TMP2]] // // AMDGCNSPIRV-LABEL: @test_frexp( // AMDGCNSPIRV-NEXT: entry: @@ -2522,7 +2554,7 @@ extern "C" __device__ double test_jn(int x, double y) { // // FINITEONLY-LABEL: @test_ldexpf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_ldexpf( @@ -2546,7 +2578,7 @@ extern "C" __device__ float test_ldexpf(float x, int y) { // // FINITEONLY-LABEL: @test_ldexp( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_ldexp( @@ -2619,7 +2651,7 @@ extern "C" __device__ double test_lgamma(double x) { // // 
FINITEONLY-LABEL: @test_llrintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2647,7 +2679,7 @@ extern "C" __device__ long long int test_llrintf(float x) { // // FINITEONLY-LABEL: @test_llrint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2675,7 +2707,7 @@ extern "C" __device__ long long int test_llrint(double x) { // // FINITEONLY-LABEL: @test_llroundf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2703,7 +2735,7 @@ extern "C" __device__ long long int test_llroundf(float x) { // // FINITEONLY-LABEL: @test_llround( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2730,7 +2762,7 @@ extern "C" __device__ long long int test_llround(double x) { // // FINITEONLY-LABEL: @test_log10f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_log10f( @@ -2826,7 +2858,7 @@ extern "C" __device__ double test_log1p(double x) { // // FINITEONLY-LABEL: @test_log2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log2.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log2.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_log2f( @@ -2922,7 +2954,7 @@ extern "C" __device__ double test_logb(double x) { // // FINITEONLY-LABEL: @test_logf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_logf( @@ -2947,7 +2979,7 @@ extern "C" __device__ float test_logf(float x) { // // FINITEONLY-LABEL: @test_lrintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float 
nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2975,7 +3007,7 @@ extern "C" __device__ long int test_lrintf(float x) { // // FINITEONLY-LABEL: @test_lrint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -3003,7 +3035,7 @@ extern "C" __device__ long int test_lrint(double x) { // // FINITEONLY-LABEL: @test_lroundf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -3031,7 +3063,7 @@ extern "C" __device__ long int test_lroundf(float x) { // // FINITEONLY-LABEL: @test_lround( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -3795,7 +3827,7 @@ extern "C" __device__ double test_nan_fill() { // // FINITEONLY-LABEL: @test_nearbyintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.nearbyint.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.nearbyint.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_nearbyintf( @@ -3819,7 +3851,7 @@ extern "C" __device__ float test_nearbyintf(float x) { // // FINITEONLY-LABEL: @test_nearbyint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.nearbyint.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.nearbyint.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_nearbyint( @@ -4581,7 +4613,7 @@ extern "C" __device__ double test_rhypot(double x, double y) { // // FINITEONLY-LABEL: @test_rintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.rint.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_rintf( @@ -4605,7 +4637,7 @@ extern "C" __device__ float test_rintf(float x) { // // FINITEONLY-LABEL: @test_rint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.rint.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_rint( @@ -4893,7 +4925,7 @@ extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w // // 
FINITEONLY-LABEL: @test_roundf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.round.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_roundf( @@ -4917,7 +4949,7 @@ extern "C" __device__ float test_roundf(float x) { // // FINITEONLY-LABEL: @test_round( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.round.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_round( @@ -4993,7 +5025,7 @@ extern "C" __device__ double test_rsqrt(double x) { // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) // FINITEONLY-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_scalblnf( @@ -5025,7 +5057,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) // FINITEONLY-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_scalbln( @@ -5053,7 +5085,7 @@ extern "C" __device__ double test_scalbln(double x, long int y) { // // FINITEONLY-LABEL: @test_scalbnf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_scalbnf( @@ -5077,7 +5109,7 @@ extern "C" __device__ float test_scalbnf(float x, int y) { // // FINITEONLY-LABEL: @test_scalbn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_scalbn( @@ -5427,7 +5459,7 @@ extern "C" __device__ double test_sinpi(double x) { // // FINITEONLY-LABEL: @test_sqrtf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float nofpclass(nan inf) [[X:%.*]]) // 
FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_sqrtf( @@ -5451,7 +5483,7 @@ extern "C" __device__ float test_sqrtf(float x) { // // FINITEONLY-LABEL: @test_sqrt( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_sqrt( @@ -5619,7 +5651,7 @@ extern "C" __device__ double test_tgamma(double x) { // // FINITEONLY-LABEL: @test_truncf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.trunc.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.trunc.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_truncf( @@ -5643,7 +5675,7 @@ extern "C" __device__ float test_truncf(float x) { // // FINITEONLY-LABEL: @test_trunc( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.trunc.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.trunc.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_trunc( @@ -6163,7 +6195,7 @@ extern "C" __device__ float test___fdividef(float x, float y) { // // FINITEONLY-LABEL: @test__fmaf_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]], float nofpclass(nan inf) [[Z:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test__fmaf_rn( @@ -6235,7 +6267,7 @@ extern "C" __device__ float test___frcp_rn(float x) { // // FINITEONLY-LABEL: @test___frsqrt_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.rsq.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test___frsqrt_rn( @@ -6307,7 +6339,7 @@ extern "C" __device__ float test___fsub_rn(float x, float y) { // // FINITEONLY-LABEL: @test___log10f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test___log10f( @@ -6331,7 +6363,7 @@ extern "C" __device__ float test___log10f(float x) { // // FINITEONLY-LABEL: @test___log2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.log.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test___log2f( @@ -6355,7 +6387,7 @@ extern "C" __device__ float test___log2f(float x) { // // FINITEONLY-LABEL: @test___logf( // FINITEONLY-NEXT: 
entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test___logf( @@ -6631,7 +6663,7 @@ extern "C" __device__ double test___drcp_rn(double x) { // // FINITEONLY-LABEL: @test___dsqrt_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test___dsqrt_rn( @@ -6655,7 +6687,7 @@ extern "C" __device__ double test___dsqrt_rn(double x) { // // FINITEONLY-LABEL: @test__fma_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test__fma_rn( @@ -6679,7 +6711,7 @@ extern "C" __device__ double test__fma_rn(double x, double y, double z) { // // FINITEONLY-LABEL: @test_float_min( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_float_min( @@ -6703,7 +6735,7 @@ extern "C" __device__ float test_float_min(float x, float y) { // // FINITEONLY-LABEL: @test_float_max( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_float_max( @@ -6727,7 +6759,7 @@ extern "C" __device__ float test_float_max(float x, float y) { // // FINITEONLY-LABEL: @test_double_min( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_double_min( @@ -6751,7 +6783,7 @@ extern "C" __device__ double test_double_min(double x, double y) { // // FINITEONLY-LABEL: @test_double_max( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_double_max( diff --git 
a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 5beee1f681b81..c65bf16b6a937 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1381,7 +1381,8 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, // behavior was just using a poison value. static const Attribute::AttrKind ExactAttrsToPropagate[] = { Attribute::Dereferenceable, Attribute::DereferenceableOrNull, - Attribute::NonNull, Attribute::Alignment, Attribute::Range}; + Attribute::NonNull, Attribute::NoFPClass, + Attribute::Alignment, Attribute::Range}; for (unsigned I = 0, E = CB.arg_size(); I < E; ++I) { ValidObjParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); @@ -1463,6 +1464,13 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, NewAB.addRangeAttr(CombinedRange); } } + + if (FPClassTest ExistingNoFP = AL.getParamNoFPClass(I)) { + FPClassTest NewNoFP = + NewAB.getAttribute(Attribute::NoFPClass).getNoFPClass(); + NewAB.addNoFPClassAttr(ExistingNoFP | NewNoFP); + } + AL = AL.addParamAttributes(Context, I, NewAB); } else if (NewInnerCB->getArgOperand(I)->getType()->isPointerTy()) { // Check if the underlying value for the parameter is an argument. diff --git a/llvm/test/Transforms/Inline/access-attributes-prop.ll b/llvm/test/Transforms/Inline/access-attributes-prop.ll index 5bf845d5ba94b..5a102d14b5c90 100644 --- a/llvm/test/Transforms/Inline/access-attributes-prop.ll +++ b/llvm/test/Transforms/Inline/access-attributes-prop.ll @@ -750,3 +750,47 @@ define void @prop_range_direct(i32 %v) { call void @foo4(i32 range(i32 1, 11) %v) ret void } + +declare void @bar_fp(float %x) + +define void @foo_fp(float %x) { +; CHECK-LABEL: define {{[^@]+}}@foo_fp +; CHECK-SAME: (float [[X:%.*]]) { +; CHECK-NEXT: call void @bar_fp(float [[X]]) +; CHECK-NEXT: ret void +; + call void @bar_fp(float %x) + ret void +} + +define void @prop_param_nofpclass(float %x) { +; CHECK-LABEL: define {{[^@]+}}@prop_param_nofpclass +; CHECK-SAME: (float [[X:%.*]]) { +; CHECK-NEXT: call void @bar_fp(float nofpclass(nan inf) [[X]]) +; CHECK-NEXT: ret void +; + call void @foo_fp(float nofpclass(nan inf) %x) + ret void +} + +declare void @func_fp(float) + +define void @union_nofpclass(float %v) { +; CHECK-LABEL: define {{[^@]+}}@union_nofpclass +; CHECK-SAME: (float [[V:%.*]]) { +; CHECK-NEXT: call void @func_fp(float nofpclass(inf) [[V]]) +; CHECK-NEXT: ret void +; + call void @func_fp(float nofpclass(inf) %v) + ret void +} + +define void @prop_nofpclass_union(float %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_nofpclass_union +; CHECK-SAME: (float [[V:%.*]]) { +; CHECK-NEXT: call void @func_fp(float nofpclass(nan inf) [[V]]) +; CHECK-NEXT: ret void +; + call void @union_nofpclass(float nofpclass(nan) %v) + ret void +} From b416e7f5920ff2f80f60e52eeb363269d130adba Mon Sep 17 00:00:00 2001 From: Nuno Lopes Date: Tue, 8 Apr 2025 16:56:02 +0100 Subject: [PATCH 1008/1029] [CI] adjust the undef warning regex so it doesn't catch %undef in .ll files --- llvm/utils/git/code-format-helper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index da1b4cdad6978..69d654b87e856 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -387,13 +387,11 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str for file in re.split("^diff --git ", stdout, 0, re.MULTILINE): filename = 
re.match("a/([^ ]+)", file.splitlines()[0])[1] if filename.endswith(".ll"): - undef_regex = r"\bundef\b" + undef_regex = r"(? Date: Tue, 8 Apr 2025 11:00:41 -0500 Subject: [PATCH 1009/1029] Revert "[AMDGPU] Add buffer.fat.ptr.load.lds intrinsic wrapping raw rsrc version (#133015)" (#134871) This reverts commit d1a05721172272f7aab685b56d99e86814a15bff. There was further discussion on the PR about whether the intinsics should exist in this form. --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 21 ------------------- .../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 21 ------------------- .../lower-buffer-fat-pointers-mem-transfer.ll | 18 ---------------- 3 files changed, 60 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index ae2f6e62c0272..217e43fcce4fd 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1941,27 +1941,6 @@ def int_amdgcn_s_buffer_prefetch_data : DefaultAttrsIntrinsic < } // defset AMDGPUBufferIntrinsics -// A wrapper around raw_ptr_buffer_load_lds that takes the global offset -// from the addrspace(7) pointer argument. -def int_amdgcn_buffer_fat_ptr_load_lds : Intrinsic < - [], - [LLVMQualPointerType<7>, // buffer fat pointer (SGPRx4 + VGPR) - LLVMQualPointerType<3>, // LDS base offset - llvm_i32_ty, // Data byte size: 1/2/4 (/12/16 for gfx950) - llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) - llvm_i32_ty], // auxiliary/cachepolicy(imm): - // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), - // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 - // gfx12+: bits [0-2] = th, bits [3-4] = scope, - // bit 6 = swz - // all: volatile op (bit 31, stripped at lowering) - [IntrWillReturn, IntrArgMemOnly, - ReadOnly>, NoCapture>, - WriteOnly>, NoCapture>, - ImmArg>, ImmArg>, - ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; - // Uses that do not set the done bit should set IntrWriteMem on the // call site. 
def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 4448570b6b979..766a4ea250942 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2167,7 +2167,6 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { case Intrinsic::memset: case Intrinsic::memset_inline: case Intrinsic::experimental_memset_pattern: - case Intrinsic::amdgcn_buffer_fat_ptr_load_lds: return true; } } @@ -2256,26 +2255,6 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { SplitUsers.insert(&I); return {NewRsrc, Off}; } - case Intrinsic::amdgcn_buffer_fat_ptr_load_lds: { - Value *BufferPtr = I.getArgOperand(0); - assert(isSplitFatPtr(BufferPtr->getType()) && - "amdgcn.buffer.fat.pointer.load.lds must have a buffer fat pointer " - "as argument 0"); - IRB.SetInsertPoint(&I); - auto [Rsrc, Off] = getPtrParts(BufferPtr); - Value *LDSPtr = I.getArgOperand(1); - Value *LoadSize = I.getArgOperand(2); - Value *ImmOff = I.getArgOperand(3); - Value *Aux = I.getArgOperand(4); - Value *SOffset = IRB.getInt32(0); - Instruction *NewLoad = IRB.CreateIntrinsic( - Intrinsic::amdgcn_raw_ptr_buffer_load_lds, {}, - {Rsrc, LDSPtr, LoadSize, Off, SOffset, ImmOff, Aux}); - copyMetadata(NewLoad, &I); - SplitUsers.insert(&I); - I.replaceAllUsesWith(NewLoad); - return {nullptr, nullptr}; - } } return {nullptr, nullptr}; } diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll index 56d0cdd29ffb2..ee51b0b84554e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll @@ -1724,21 +1724,3 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng call void @llvm.experimental.memset.pattern.p7.i32.i32(ptr addrspace(7) %ptr, i32 1, i32 %length, i1 false) ret void } - -;;; Buffer load to LDS - -declare void @llvm.amdgcn.buffer.fat.ptr.load.lds(ptr addrspace(7), ptr addrspace(3), i32 immarg, i32 immarg, i32 immarg) - -define void @llvm_amdgcn_buffer_fat_ptr_load_lds(ptr addrspace(7) inreg %p, ptr addrspace(3) inreg %l, i32 %idx) { -; CHECK-LABEL: define void @llvm_amdgcn_buffer_fat_ptr_load_lds( -; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[P:%.*]], ptr addrspace(3) inreg [[L:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[P_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 0 -; CHECK-NEXT: [[P_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[P]], 1 -; CHECK-NEXT: [[Q:%.*]] = add i32 [[P_OFF]], [[IDX]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[P_RSRC]], ptr addrspace(3) [[L]], i32 4, i32 [[Q]], i32 0, i32 16, i32 0) -; CHECK-NEXT: ret void -; - %q = getelementptr i8, ptr addrspace(7) %p, i32 %idx - call void @llvm.amdgcn.buffer.fat.ptr.load.lds(ptr addrspace(7) %q, ptr addrspace(3) %l, i32 4, i32 16, i32 0) - ret void -} From 84fde791a1f285dec7ef0ec4803c5174f182df57 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 8 Apr 2025 17:02:16 +0100 Subject: [PATCH 1010/1029] [Reassociate] Apply Debugloc to instrs produced when optimizing add (#134676) Currently in Reassociate we may create a set of new instructions when optimizing an `add`, but we do not set DebugLocs on the new instructions; this patch propagates the add's DebugLoc to the new 
instructions. Found using #107279. --- llvm/lib/Transforms/Scalar/Reassociate.cpp | 11 ++-- .../Reassociate/debugloc-reass-add.ll | 56 +++++++++++++++++++ 2 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/Reassociate/debugloc-reass-add.ll diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index f9aef064641d8..0bfce13b07f1c 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -1086,13 +1086,15 @@ static unsigned FindInOperandList(const SmallVectorImpl &Ops, /// Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. -static Value *EmitAddTreeOfValues(BasicBlock::iterator It, +static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl &Ops) { if (Ops.size() == 1) return Ops.back(); Value *V1 = Ops.pop_back_val(); - Value *V2 = EmitAddTreeOfValues(It, Ops); - return CreateAdd(V2, V1, "reass.add", It, &*It); + Value *V2 = EmitAddTreeOfValues(I, Ops); + auto *NewAdd = CreateAdd(V2, V1, "reass.add", I->getIterator(), I); + NewAdd->setDebugLoc(I->getDebugLoc()); + return NewAdd; } /// If V is an expression tree that is a multiplication sequence, @@ -1682,7 +1684,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, DummyInst->deleteValue(); unsigned NumAddedValues = NewMulOps.size(); - Value *V = EmitAddTreeOfValues(I->getIterator(), NewMulOps); + Value *V = EmitAddTreeOfValues(I, NewMulOps); // Now that we have inserted the add tree, optimize it. This allows us to // handle cases that require multiple factoring steps, such as this: @@ -1694,6 +1696,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, // Create the multiply. Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I->getIterator(), I); + V2->setDebugLoc(I->getDebugLoc()); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. diff --git a/llvm/test/Transforms/Reassociate/debugloc-reass-add.ll b/llvm/test/Transforms/Reassociate/debugloc-reass-add.ll new file mode 100644 index 0000000000000..aa04837431cd2 --- /dev/null +++ b/llvm/test/Transforms/Reassociate/debugloc-reass-add.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p=reassociate -S < %s | FileCheck %s + +;; Tests that when we reassociate %add93, we apply its debug location to the new +;; instructions. 
+ +define void @foo(i32 %0) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i32 [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_COND23:.*]] +; CHECK: [[FOR_COND23]]: +; CHECK-NEXT: [[SUB59:%.*]] = sub i32 0, 0 +; CHECK-NEXT: [[MUL68:%.*]] = mul i32 0, [[TMP0]] +; CHECK-NEXT: [[REASS_ADD:%.*]] = add i32 [[MUL68]], [[TMP0]], !dbg [[DBG3:![0-9]+]] +; CHECK-NEXT: [[REASS_MUL1:%.*]] = mul i32 [[REASS_ADD]], [[SUB59]], !dbg [[DBG3]] +; CHECK-NEXT: [[REASS_MUL:%.*]] = add i32 [[REASS_MUL1]], 1, !dbg [[DBG3]] +; CHECK-NEXT: [[CONV95:%.*]] = trunc i32 [[REASS_MUL]] to i16 +; CHECK-NEXT: store i16 [[CONV95]], ptr null, align 2 +; CHECK-NEXT: br label %[[FOR_COND23]] +; +entry: + br label %for.cond23 + +for.cond23: ; preds = %for.cond23, %entry + %sub59 = sub i32 0, 0 + %mul62 = mul i32 %sub59, %0 + %mul68 = mul i32 %mul62, 0 + %mul77 = mul i32 %sub59, %0 + %add84 = or i32 %mul68, %mul77 + %add93 = add i32 %add84, 1, !dbg !4 + %conv95 = trunc i32 %add93 to i16 + store i16 %conv95, ptr null, align 2 + br label %for.cond23 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git") +!1 = !DIFile(filename: "test.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 15, column: 50, scope: !5) +!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 14, type: !6, scopeLine: 14, unit: !0, retainedNodes: !2) +!6 = distinct !DISubroutineType(types: !7) +!7 = !{null} +;. +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CHECK: [[META1]] = !DIFile(filename: "test.c", directory: {{.*}}) +; CHECK: [[DBG3]] = !DILocation(line: 15, column: 50, scope: [[META4:![0-9]+]]) +; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 14, type: [[META5:![0-9]+]], scopeLine: 14, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META7:![0-9]+]]) +; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6:![0-9]+]]) +; CHECK: [[META6]] = !{null} +; CHECK: [[META7]] = !{} +;. 
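The DebugLoc handling in the patch above is a general pattern for any transform that replaces one instruction with several synthesized ones: stamp each new instruction with the original instruction's debug location, or the source-line info silently disappears from the output. A minimal self-contained sketch of that pattern follows; the helper name expandAddChain and its shape are illustrative only (not upstream API), while the setDebugLoc step mirrors what the patch does in EmitAddTreeOfValues and OptimizeAdd:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Sum Ops into a chain of adds inserted before I, stamping every newly
// created instruction with I's DebugLoc so line info survives the rewrite.
// Assumes Ops is non-empty. Sketch only, not the exact upstream code.
static Value *expandAddChain(Instruction *I, ArrayRef<Value *> Ops) {
  Value *Acc = Ops.front();
  for (Value *Op : Ops.drop_front()) {
    auto *NewAdd =
        BinaryOperator::CreateAdd(Acc, Op, "reass.add", I->getIterator());
    NewAdd->setDebugLoc(I->getDebugLoc()); // the step this patch adds
    Acc = NewAdd;
  }
  return Acc;
}

Omitting the setDebugLoc call is exactly the kind of silent regression the new debugloc-reass-add.ll test pins down with its !dbg checks.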
From bb7ff134dc9eb72707400f00fb874a6a9e47597d Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 8 Apr 2025 12:03:38 -0400 Subject: [PATCH 1011/1029] [gn] port 6c74fe9087 --- .../gn/secondary/clang/include/clang/Basic/BUILD.gn | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index dd3b18a0918af..ce34ffc865109 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -57,6 +57,14 @@ foreach(diag_group, diag_groups) { ] td_file = "Diagnostic.td" } + + clang_tablegen("Diagnostic${diag_group}Interface") { + args = [ + "-gen-clang-diags-iface", + "-clang-component=${diag_group}", + ] + td_file = "Diagnostic.td" + } } group("diags_tablegen") { # DiagnosticGroups, DiagnosticIndexName, DiagnosticAllCompatIDs are # intentionally not part of this list. # They are used by TableGen files themselves. @@ -70,6 +78,7 @@ group("diags_tablegen") { ":Diagnostic${diag_group}Kinds", ":Diagnostic${diag_group}Enums", ":Diagnostic${diag_group}CompatIDs", + ":Diagnostic${diag_group}Interface", ] } } From 26475f5bdd2f4a042b63fdf1ee62d6634872dee5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 8 Apr 2025 09:09:19 -0700 Subject: [PATCH 1012/1029] [AArch64] Refactor @plt, @gotpcrel, and @AUTH to use parseDataExpr Following PR #132569 (RISC-V), which added `parseDataExpr` for parsing expressions in data directives (e.g., `.word`), this PR migrates AArch64 `@plt`, `@gotpcrel`, and `@AUTH` from the `parsePrimaryExpr` workaround to `parseDataExpr`. The goal is to align with the GNU assembler model, where relocation specifiers apply to the entire operand rather than individual terms, reducing complexity, especially evident in `@AUTH` parsing. Note: AArch64 ELF lacks an official syntax for data directives (#132570). A prefix notation might be a preferable future direction. I recommend `%specifier(expr)`. AsmParser's `@specifier` parsing is suboptimal, necessitating lexer workarounds. `@` might appear multiple times in an operand. We should not use `@` beyond the existing AArch64 Mach-O instruction operands. In the test elf-reloc-ptrauth.s, many errors are now reported at parse time. Pull Request: https://github.com/llvm/llvm-project/pull/134202 --- llvm/include/llvm/MC/MCParser/MCAsmParser.h | 3 + llvm/lib/MC/MCParser/AsmParser.cpp | 26 +++- .../AArch64/AsmParser/AArch64AsmParser.cpp | 127 ++++++++++-------- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 2 + .../MC/AArch64/data-directive-specifier.s | 16 ++- llvm/test/MC/AArch64/elf-reloc-ptrauth.s | 86 ++++-------- .../AArch64/label-arithmetic-diags-darwin.s | 14 +- 7 files changed, 144 insertions(+), 130 deletions(-) diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index 7cdd99a207468..952cfcf77e4fc 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -332,6 +332,9 @@ class MCAsmParser { /// Parse a .gnu_attribute.
bool parseGNUAttribute(SMLoc L, int64_t &Tag, int64_t &IntegerValue); + + bool parseAtSpecifier(const MCExpr *&Res, SMLoc &EndLoc); + const MCExpr *applySpecifier(const MCExpr *E, uint32_t Variant); }; /// Create an MCAsmParser instance for parsing assembly similar to gas syntax diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index d743c73ffcf10..060d2b6452a04 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -670,8 +670,6 @@ class AsmParser : public MCAsmParser { bool parseEscapedString(std::string &Data) override; bool parseAngleBracketString(std::string &Data) override; - const MCExpr *applySpecifier(const MCExpr *E, uint32_t Variant); - // Macro-like directives MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc); void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc, @@ -1193,7 +1191,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc, Split = std::make_pair(Identifier, VName); } - } else { + } else if (Lexer.getAllowAtInIdentifier()) { Split = Identifier.split('@'); } } else if (MAI.useParensForSpecifier() && @@ -1341,7 +1339,7 @@ bool AsmParser::parseExpression(const MCExpr *&Res) { return parseExpression(Res, EndLoc); } -const MCExpr *AsmParser::applySpecifier(const MCExpr *E, uint32_t Spec) { +const MCExpr *MCAsmParser::applySpecifier(const MCExpr *E, uint32_t Spec) { // Ask the target implementation about this expression first. const MCExpr *NewE = getTargetParser().applySpecifier(E, Spec, Ctx); if (NewE) @@ -1432,6 +1430,23 @@ static std::string angleBracketString(StringRef AltMacroStr) { return Res; } +bool MCAsmParser::parseAtSpecifier(const MCExpr *&Res, SMLoc &EndLoc) { + if (parseOptionalToken(AsmToken::At)) { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected specifier following '@'"); + + auto Spec = MAI.getSpecifierForName(getTok().getIdentifier()); + if (!Spec) + return TokError("invalid specifier '@" + getTok().getIdentifier() + "'"); + + const MCExpr *ModifiedRes = applySpecifier(Res, *Spec); + if (ModifiedRes) + Res = ModifiedRes; + Lex(); + } + return false; +} + /// Parse an expression and return it. /// /// expr ::= expr &&,|| expr -> lowest. @@ -1452,8 +1467,7 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) { // As a special case, we support 'a op b @ modifier' by rewriting the // expression to include the modifier. This is inefficient, but in general we // expect users to use 'a@modifier op b'. 
- if (Ctx.getAsmInfo()->useAtForSpecifier() && - parseOptionalToken(AsmToken::At)) { + if (Lexer.getAllowAtInIdentifier() && parseOptionalToken(AsmToken::At)) { if (Lexer.isNot(AsmToken::Identifier)) return TokError("unexpected symbol modifier following '@'"); diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index dd67a312cc2cd..6e2f5fdebcf88 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -180,6 +181,7 @@ class AArch64AsmParser : public MCTargetAsmParser { bool showMatchError(SMLoc Loc, unsigned ErrCode, uint64_t ErrorInfo, OperandVector &Operands); + bool parseDataExpr(const MCExpr *&Res) override; bool parseAuthExpr(const MCExpr *&Res, SMLoc &EndLoc); bool parseDirectiveArch(SMLoc L); @@ -335,8 +337,6 @@ class AArch64AsmParser : public MCTargetAsmParser { unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; - bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override; - static bool classifySymbolRef(const MCExpr *Expr, AArch64MCExpr::Specifier &ELFSpec, AArch64MCExpr::Specifier &DarwinSpec, @@ -4478,6 +4478,19 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { if (HasELFModifier) ImmVal = AArch64MCExpr::create(ImmVal, RefKind, getContext()); + SMLoc EndLoc; + if (getContext().getAsmInfo()->hasSubsectionsViaSymbols()) { + if (getParser().parseAtSpecifier(ImmVal, EndLoc)) + return true; + const MCExpr *Term; + if (parseOptionalToken(AsmToken::Plus)) { + if (getParser().parseExpression(Term, EndLoc)) + return true; + ImmVal = + MCBinaryExpr::create(MCBinaryExpr::Add, ImmVal, Term, getContext()); + } + } + return false; } @@ -5007,11 +5020,18 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, // This was not a register so parse other operands that start with an // identifier (like labels) as expressions and create them as immediates. - const MCExpr *IdVal; + const MCExpr *IdVal, *Term; S = getLoc(); if (getParser().parseExpression(IdVal)) return true; - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + if (getParser().parseAtSpecifier(IdVal, E)) + return true; + if (parseOptionalToken(AsmToken::Plus)) { + if (getParser().parseExpression(Term, E)) + return true; + IdVal = + MCBinaryExpr::create(MCBinaryExpr::Add, IdVal, Term, getContext()); + } Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext())); // Parse an optional shift/extend modifier. @@ -8086,11 +8106,56 @@ bool AArch64AsmParser::parseDirectiveAeabiAArch64Attr(SMLoc L) { return false; } -bool AArch64AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { - // Try @AUTH expressions: they're more complex than the usual symbol variants. 
- if (!parseAuthExpr(Res, EndLoc)) +bool AArch64AsmParser::parseDataExpr(const MCExpr *&Res) { + SMLoc EndLoc; + + if (getParser().parseExpression(Res)) + return true; + MCAsmParser &Parser = getParser(); + if (!parseOptionalToken(AsmToken::At)) return false; - return getParser().parsePrimaryExpr(Res, EndLoc, nullptr); + if (getLexer().getKind() != AsmToken::Identifier) + return Error(getLoc(), "expected relocation specifier"); + + std::string Identifier = Parser.getTok().getIdentifier().lower(); + SMLoc Loc = getLoc(); + Lex(); + if (Identifier == "auth") + return parseAuthExpr(Res, EndLoc); + + auto Spec = AArch64MCExpr::None; + if (STI->getTargetTriple().isOSBinFormatMachO()) { + if (Identifier == "got") + Spec = AArch64MCExpr::M_GOT; + } else { + // Unofficial, experimental syntax that will be changed. + if (Identifier == "gotpcrel") + Spec = AArch64MCExpr::VK_GOTPCREL; + else if (Identifier == "plt") + Spec = AArch64MCExpr::VK_PLT; + } + if (Spec == AArch64MCExpr::None) + return Error(Loc, "invalid relocation specifier"); + if (auto *SRE = dyn_cast(Res)) + Res = MCSymbolRefExpr::create(&SRE->getSymbol(), Spec, getContext(), + SRE->getLoc()); + else + return Error(Loc, "@ specifier only allowed after a symbol"); + + for (;;) { + std::optional Opcode; + if (parseOptionalToken(AsmToken::Plus)) + Opcode = MCBinaryExpr::Add; + else if (parseOptionalToken(AsmToken::Minus)) + Opcode = MCBinaryExpr::Sub; + else + break; + const MCExpr *Term; + if (getParser().parsePrimaryExpr(Term, EndLoc, nullptr)) + return true; + Res = MCBinaryExpr::create(*Opcode, Res, Term, getContext()); + } + return false; } /// parseAuthExpr @@ -8100,54 +8165,8 @@ bool AArch64AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) { bool AArch64AsmParser::parseAuthExpr(const MCExpr *&Res, SMLoc &EndLoc) { MCAsmParser &Parser = getParser(); MCContext &Ctx = getContext(); - AsmToken Tok = Parser.getTok(); - // Look for '_sym@AUTH' ... - if (Tok.is(AsmToken::Identifier) && Tok.getIdentifier().ends_with("@AUTH")) { - StringRef SymName = Tok.getIdentifier().drop_back(strlen("@AUTH")); - if (SymName.contains('@')) - return TokError( - "combination of @AUTH with other modifiers not supported"); - Res = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(SymName), Ctx); - - Parser.Lex(); // Eat the identifier. - } else { - // ... or look for a more complex symbol reference, such as ... - SmallVector Tokens; - - // ... '"_long sym"@AUTH' ... - if (Tok.is(AsmToken::String)) - Tokens.resize(2); - // ... or '(_sym + 5)@AUTH'. - else if (Tok.is(AsmToken::LParen)) - Tokens.resize(6); - else - return true; - - if (Parser.getLexer().peekTokens(Tokens) != Tokens.size()) - return true; - - // In either case, the expression ends with '@' 'AUTH'. - if (Tokens[Tokens.size() - 2].isNot(AsmToken::At) || - Tokens[Tokens.size() - 1].isNot(AsmToken::Identifier) || - Tokens[Tokens.size() - 1].getIdentifier() != "AUTH") - return true; - - if (Tok.is(AsmToken::String)) { - StringRef SymName; - if (Parser.parseIdentifier(SymName)) - return true; - Res = MCSymbolRefExpr::create(Ctx.getOrCreateSymbol(SymName), Ctx); - } else { - if (Parser.parsePrimaryExpr(Res, EndLoc, nullptr)) - return true; - } - - Parser.Lex(); // '@' - Parser.Lex(); // 'AUTH' - } - // At this point, we encountered "@AUTH". There is no fallback anymore. 
if (parseToken(AsmToken::LParen, "expected '('")) return true; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 8cffd9ce557db..83daa836e650d 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -70,6 +70,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) { UsesELFSectionDirectiveForBSS = true; SupportsDebugInformation = true; UseDataRegionDirectives = true; + UseAtForSpecifier = false; ExceptionsType = ExceptionHandling::DwarfCFI; @@ -114,6 +115,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { Data64bitsDirective = "\t.xword\t"; UseDataRegionDirectives = false; + UseAtForSpecifier = false; WeakRefDirective = "\t.weak\t"; diff --git a/llvm/test/MC/AArch64/data-directive-specifier.s b/llvm/test/MC/AArch64/data-directive-specifier.s index c4ca5d760b41c..2cb7eb3a3ca81 100644 --- a/llvm/test/MC/AArch64/data-directive-specifier.s +++ b/llvm/test/MC/AArch64/data-directive-specifier.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=aarch64 -filetype=obj %s | llvm-readobj -r - | FileCheck %s -# RUN: not llvm-mc -triple=aarch64 -filetype=obj %s --defsym ERR=1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: +# RUN: not llvm-mc -triple=aarch64 %s --defsym ERR=1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR --implicit-check-not=error: # RUN: not llvm-mc -triple=aarch64 -filetype=obj %s --defsym OBJERR=1 -o /dev/null 2>&1 | FileCheck %s --check-prefix=OBJERR --implicit-check-not=error: .globl g @@ -34,19 +34,21 @@ data1: ## Test parse-time errors .ifdef ERR -# ERR: [[#@LINE+1]]:14: error: invalid variant 'pageoff' -.word extern@pageoff +# ERR: [[#@LINE+1]]:9: error: @ specifier only allowed after a symbol +.quad 3@plt - . + +# ERR: [[#@LINE+1]]:9: error: expected ')' +.quad (l@plt - .) .endif -## Test assemble-time errors .ifdef OBJERR -# OBJERR: [[#@LINE+1]]:7: error: symbol 'und' can not be undefined in a subtraction expression -.word extern@plt - und - .quad g@plt - . .word extern@gotpcrel - . 
+# OBJERR: [[#@LINE+1]]:7: error: symbol 'und' can not be undefined in a subtraction expression +.word extern@plt - und + # OBJERR: [[#@LINE+1]]:7: error: symbol 'und' can not be undefined in a subtraction expression .word extern@gotpcrel - und .endif diff --git a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s index 0b66811458da5..53e0107c5cca1 100644 --- a/llvm/test/MC/AArch64/elf-reloc-ptrauth.s +++ b/llvm/test/MC/AArch64/elf-reloc-ptrauth.s @@ -1,9 +1,9 @@ -// RUN: llvm-mc -triple=aarch64 %s --defsym=ASMONLY=1 | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -triple=aarch64 %s | FileCheck %s --check-prefix=ASM // RUN: llvm-mc -triple=aarch64 -filetype=obj %s | \ // RUN: llvm-readelf -S -r -x .test - | FileCheck %s --check-prefix=RELOC -// RELOC: Relocation section '.rela.test' at offset {{.*}} contains 8 entries: +// RELOC: Relocation section '.rela.test' at offset {{.*}} contains 9 entries: // RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend // RELOC-NEXT: 0000000000000000 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 .helper + 0 // RELOC-NEXT: 0000000000000010 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g1 + 0 @@ -13,6 +13,7 @@ // RELOC-NEXT: 0000000000000050 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g5 - 3 // RELOC-NEXT: 0000000000000060 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g 6 + 0 // RELOC-NEXT: 0000000000000070 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g 7 + 7 +// RELOC-NEXT: 0000000000000080 {{.*}} R_AARCH64_AUTH_ABS64 0000000000000000 _g4 + 7 // RELOC: Hex dump of section '.test': // VVVVVVVV addend, not needed for rela @@ -41,6 +42,9 @@ // RELOC-NEXT: 70 00000000 10000000 // ^^^^ discriminator // ^^ 0 no addr diversity 0 reserved 00 ia key 0000 reserved +// RELOC-NEXT: 80 00000000 00000000 +// ^^^^ discriminator +// ^^ 0 no addr diversity 0 reserved 00 ia key 0000 reserved .section .helper .local "_g 6" @@ -61,12 +65,12 @@ _g9: .quad _g0@AUTH(ia,42) .quad 0 -// ASM: .xword _g1@AUTH(ib,0) -.quad _g1@AUTH(ib,0) +// ASM: .xword (+_g1)@AUTH(ib,0) +.quad +_g1@AUTH(ib,0) .quad 0 // ASM: .xword _g2@AUTH(da,5,addr) -.quad _g2@AUTH(da,5,addr) +.quad _g2 @ AUTH(da,5,addr) .quad 0 // ASM: .xword _g3@AUTH(db,65535,addr) @@ -89,29 +93,13 @@ _g9: .quad ("_g 7" + 7)@AUTH(ia,16) .quad 0 -.ifdef ASMONLY - -// ASM: .xword _g10@AUTH(ia,42)+1 -.quad _g10@AUTH(ia,42) + 1 - -// ASM: .xword 1+_g11@AUTH(ia,42) -.quad 1 + _g11@AUTH(ia,42) - -// ASM: .xword 1+_g12@AUTH(ia,42)+1 -.quad 1 + _g12@AUTH(ia,42) + 1 - -// ASM: .xword _g13@AUTH(ia,42)+_g14@AUTH(ia,42) -.quad _g13@AUTH(ia,42) + _g14@AUTH(ia,42) - -// ASM: .xword _g9@AUTH(ia,42)-_g8 -.quad _g9@AUTH(ia,42) - _g8 +.quad 7 + _g4@AUTH(ia,0) .quad 0 -.endif // ASMONLY +// RUN: not llvm-mc -triple=aarch64 --defsym=ERR=1 %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=ERR --implicit-check-not=error: .ifdef ERR -// RUN: not llvm-mc -triple=aarch64 --defsym=ERR=1 %s 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERR // ERR: :[[#@LINE+1]]:15: error: expected '(' .quad sym@AUTH)ia,42) @@ -137,52 +125,34 @@ _g9: // ERR: :[[#@LINE+1]]:21: error: expected ')' .quad sym@AUTH(ia,42( -// ERR: :[[#@LINE+1]]:7: error: combination of @AUTH with other modifiers not supported +// ERR: :[[#@LINE+1]]:14: error: unexpected token .quad sym@PLT@AUTH(ia,42) -// ERR: :[[#@LINE+1]]:11: error: invalid variant 'AUTH@GOT' +// ERR: :[[#@LINE+1]]:15: error: expected '(' .quad sym@AUTH@GOT(ia,42) -// ERR: :[[#@LINE+1]]:18: error: invalid variant 'TLSDESC@AUTH' -.quad "long sym"@TLSDESC@AUTH(ia,42) - -// ERR: 
:[[#@LINE+1]]:18: error: invalid variant 'AUTH@PLT' +// ERR: :[[#@LINE+1]]:22: error: expected '(' .quad "long sym"@AUTH@PLT(ia,42) -// ERR: :[[#@LINE+1]]:17: error: invalid variant 'GOT@AUTH' +// ERR: :[[#@LINE+1]]:17: error: invalid relocation specifier .quad (sym - 5)@GOT@AUTH(ia,42) -// ERR: :[[#@LINE+1]]:17: error: invalid variant 'AUTH@TLSDESC' -.quad (sym + 5)@AUTH@TLSDESC(ia,42) - -// ERR: :[[#@LINE+1]]:12: error: invalid variant 'AUTH' -.quad +sym@AUTH(ia,42) - -.endif // ERR - -.ifdef ERROBJ -// RUN: not llvm-mc -triple=aarch64 -filetype=obj --defsym=ERROBJ=1 %s -o /dev/null 2>&1 | \ -// RUN: FileCheck %s --check-prefix=ERROBJ +// ERR: :[[#@LINE+1]]:23: error: unexpected token +.quad sym@AUTH(ia,42) + 1 -// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression -.quad sym@AUTH(ia,42) + sym@AUTH(ia,42) +// ERR: :[[#@LINE+1]]:27: error: unexpected token +.quad 1 + sym@AUTH(ia,42) + 1 -// TODO: do we really want to emit an error here? It might not be important -// whether a symbol has an AUTH modifier or not since the compile-time computed -// distance remains the same. Leave it in such state as for now since it -// makes code simpler: subtraction of a non-AUTH symbol and of a constant -// are handled identically. -// ERROBJ: :[[#@LINE+1]]:7: error: Cannot represent a difference across sections +/// @AUTH applies to the whole operand instead of an individual term. +/// Trailing expression parts are not allowed even if the logical subtraction +/// result might make sense. +// ERR: :[[#@LINE+1]]:23: error: unexpected token .quad _g9@AUTH(ia,42) - _g8 -// ERROBJ: :[[#@LINE+1]]:7: error: expected relocatable expression +// ERR: :[[#@LINE+1]]:23: error: unexpected token .quad _g9@AUTH(ia,42) - _g8@AUTH(ia,42) -.quad 0 - -// ERROBJ: :[[#@LINE+1]]:23: error: expected relocatable expression -.quad sym@AUTH(ia,42) + 1 -// ERROBJ: :[[#@LINE+1]]:9: error: expected relocatable expression -.quad 1 + sym@AUTH(ia,42) +// ERR: :[[#@LINE+1]]:24: error: unexpected token +.quad _g13@AUTH(ia,42) + _g14@AUTH(ia,42) -.endif // ERROBJ +.endif // ERR diff --git a/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s b/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s index 3e51e487e1288..357e04a828f8e 100644 --- a/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s +++ b/llvm/test/MC/AArch64/label-arithmetic-diags-darwin.s @@ -1,9 +1,17 @@ +// RUN: not llvm-mc -triple aarch64-darwin -filetype=obj --defsym PARSE=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR // RUN: not llvm-mc -triple aarch64-darwin -filetype=obj %s -o /dev/null 2>&1 | FileCheck %s // RUN: not llvm-mc -triple aarch64-ios -filetype=obj %s -o /dev/null 2>&1 | FileCheck %s Lstart: .space 8 Lend: +.ifdef PARSE + add w0, w1, #(Lend - var@TLVPPAGEOFF) + // ERR: [[#@LINE-1]]:27: error: expected ')' + cmp w0, #(Lend - var@TLVPPAGEOFF) + // ERR: [[#@LINE-1]]:23: error: expected ')' + +.else add w0, w1, #(Lend - external) cmp w0, #(Lend - external) // CHECK: error: unknown AArch64 fixup kind! @@ -13,11 +21,6 @@ Lend: // CHECK-NEXT: cmp w0, #(Lend - external) // CHECK-NEXT: ^ - add w0, w1, #(Lend - var@TLVPPAGEOFF) - // CHECK: [[#@LINE-1]]:3: error: expected relocatable expression - cmp w0, #(Lend - var@TLVPPAGEOFF) - // CHECK: [[#@LINE-1]]:3: error: expected relocatable expression - add w0, w1, #(Lstart - Lend) cmp w0, #(Lstart - Lend) // CHECK: error: fixup value out of range @@ -62,3 +65,4 @@ Lend_across_sec: // CHECK: error: unknown AArch64 fixup kind! 
// CHECK-NEXT: cmp w0, #(Lend_across_sec - Lprivate2) // CHECK-NEXT: ^ +.endif From 3f38cd07d820248fd2043efb1341fabaac2d84a6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 8 Apr 2025 23:15:00 +0700 Subject: [PATCH 1013/1029] Revert "Inline: Propagate callsite nofpclass attribute" This reverts commit b0cb672b9968eeee6eb022e98476957dbdf8e6e2. Breaks bot --- clang/test/CodeGenHLSL/builtins/distance.hlsl | 32 ++-- clang/test/CodeGenHLSL/builtins/length.hlsl | 32 ++-- clang/test/CodeGenHLSL/builtins/reflect.hlsl | 24 +-- .../test/CodeGenHLSL/builtins/smoothstep.hlsl | 16 +- clang/test/Headers/__clang_hip_cmath.hip | 52 +----- clang/test/Headers/__clang_hip_math.hip | 172 +++++++----------- llvm/lib/Transforms/Utils/InlineFunction.cpp | 10 +- .../Inline/access-attributes-prop.ll | 44 ----- 8 files changed, 131 insertions(+), 251 deletions(-) diff --git a/clang/test/CodeGenHLSL/builtins/distance.hlsl b/clang/test/CodeGenHLSL/builtins/distance.hlsl index ac38cf1853799..e830903261c8c 100644 --- a/clang/test/CodeGenHLSL/builtins/distance.hlsl +++ b/clang/test/CodeGenHLSL/builtins/distance.hlsl @@ -10,14 +10,14 @@ // CHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[SUB_I]]) // CHECK-NEXT: ret half [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z18test_distance_halfDhDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[X:%.*]], half noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half [[X]], [[Y]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[SUB_I]]) // SPVCHECK-NEXT: ret half [[ELT_ABS_I]] // half test_distance_half(half X, half Y) { return distance(X, Y); } @@ -26,7 +26,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); } // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[SUB_I]], <2 x half> nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[SUB_I]], <2 x half> [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // @@ -34,7 +34,7 @@ half test_distance_half(half X, half Y) { return distance(X, Y); } // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[X:%.*]], <2 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: 
[[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> [[SUB_I]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } @@ -43,7 +43,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[SUB_I]], <3 x half> nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[SUB_I]], <3 x half> [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // @@ -51,7 +51,7 @@ half test_distance_half2(half2 X, half2 Y) { return distance(X, Y); } // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[X:%.*]], <3 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> [[SUB_I]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } @@ -60,7 +60,7 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[SUB_I]], <4 x half> nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[SUB_I]], <4 x half> [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // @@ -68,7 +68,7 @@ half test_distance_half3(half3 X, half3 Y) { return distance(X, Y); } // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[X:%.*]], <4 x half> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz 
arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> [[SUB_I]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } @@ -77,14 +77,14 @@ half test_distance_half4(half4 X, half4 Y) { return distance(X, Y); } // CHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[SUB_I]]) // CHECK-NEXT: ret float [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z19test_distance_floatff( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[X:%.*]], float noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float [[X]], [[Y]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[SUB_I]]) // SPVCHECK-NEXT: ret float [[ELT_ABS_I]] // float test_distance_float(float X, float Y) { return distance(X, Y); } @@ -93,7 +93,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); } // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[SUB_I]], <2 x float> nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[SUB_I]], <2 x float> [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // @@ -101,7 +101,7 @@ float test_distance_float(float X, float Y) { return distance(X, Y); } // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[X:%.*]], <2 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> [[SUB_I]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } @@ -110,7 +110,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz 
arcp afn <3 x float> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[SUB_I]], <3 x float> nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[SUB_I]], <3 x float> [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // @@ -118,7 +118,7 @@ float test_distance_float2(float2 X, float2 Y) { return distance(X, Y); } // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[X:%.*]], <3 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> [[SUB_I]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } @@ -127,7 +127,7 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[SUB_I]], <4 x float> nofpclass(nan inf) [[SUB_I]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[SUB_I]], <4 x float> [[SUB_I]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // @@ -135,7 +135,7 @@ float test_distance_float3(float3 X, float3 Y) { return distance(X, Y); } // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[X:%.*]], <4 x float> noundef nofpclass(nan inf) [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] // SPVCHECK-NEXT: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[X]], [[Y]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> nofpclass(nan inf) [[SUB_I]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> [[SUB_I]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_distance_float4(float4 X, float4 Y) { return distance(X, Y); } diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl index 0b17d03d7097d..2d4bbd995298f 100644 --- a/clang/test/CodeGenHLSL/builtins/length.hlsl +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl @@ -14,13 +14,13 @@ // CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // CHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half 
@llvm.fabs.f16(half nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[P0]]) // CHECK-NEXT: ret half [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z16test_length_halfDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.fabs.f16(half [[P0]]) // SPVCHECK-NEXT: ret half [[ELT_ABS_I]] // half test_length_half(half p0) @@ -35,14 +35,14 @@ half test_length_half(half p0) // CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[P0]], <2 x half> nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[P0]], <2 x half> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half2Dv2_Dh( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v2f16(<2 x half> [[P0]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_length_half2(half2 p0) @@ -54,14 +54,14 @@ half test_length_half2(half2 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[P0]], <3 x half> nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[P0]], <3 x half> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half3Dv3_Dh( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v3f16(<3 x half> [[P0]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_length_half3(half3 p0) @@ -73,14 +73,14 @@ half test_length_half3(half3 p0) // CHECK-LABEL: define noundef nofpclass(nan 
inf) half @_Z17test_length_half4Dv4_Dh( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[P0]], <4 x half> nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[P0]], <4 x half> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.sqrt.f16(half [[HLSL_DOT_I]]) // CHECK-NEXT: ret half [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half @_Z17test_length_half4Dv4_Dh( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.length.v4f16(<4 x half> [[P0]]) // SPVCHECK-NEXT: ret half [[SPV_LENGTH_I]] // half test_length_half4(half4 p0) @@ -92,13 +92,13 @@ half test_length_half4(half4 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z17test_length_floatf( // CHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[P0]]) // CHECK-NEXT: ret float [[ELT_ABS_I]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z17test_length_floatf( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[ELT_ABS_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.fabs.f32(float [[P0]]) // SPVCHECK-NEXT: ret float [[ELT_ABS_I]] // float test_length_float(float p0) @@ -110,14 +110,14 @@ float test_length_float(float p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[P0]], <2 x float> nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[P0]], <2 x float> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float2Dv2_f( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = 
tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v2f32(<2 x float> [[P0]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_length_float2(float2 p0) @@ -129,14 +129,14 @@ float test_length_float2(float2 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[P0]], <3 x float> nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[P0]], <3 x float> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float3Dv3_f( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v3f32(<3 x float> [[P0]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_length_float3(float3 p0) @@ -148,14 +148,14 @@ float test_length_float3(float3 p0) // CHECK-LABEL: define noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[P0]], <4 x float> nofpclass(nan inf) [[P0]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[P0]], <4 x float> [[P0]]) // CHECK-NEXT: [[TMP0:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.sqrt.f32(float [[HLSL_DOT_I]]) // CHECK-NEXT: ret float [[TMP0]] // // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z18test_length_float4Dv4_f( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[P0:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> nofpclass(nan inf) [[P0]]) +// SPVCHECK-NEXT: [[SPV_LENGTH_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.length.v4f32(<4 x float> [[P0]]) // SPVCHECK-NEXT: ret float [[SPV_LENGTH_I]] // float test_length_float4(float4 p0) diff --git a/clang/test/CodeGenHLSL/builtins/reflect.hlsl b/clang/test/CodeGenHLSL/builtins/reflect.hlsl index c082e63ac1da6..35ee059697c4b 100644 --- a/clang/test/CodeGenHLSL/builtins/reflect.hlsl +++ b/clang/test/CodeGenHLSL/builtins/reflect.hlsl @@ -31,7 +31,7 @@ half test_reflect_half(half I, half N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( // CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc 
nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> [[I]], <2 x half> [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x half> poison, half [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP0]], <2 x half> poison, <2 x i32> zeroinitializer @@ -42,7 +42,7 @@ half test_reflect_half(half I, half N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_reflect_half2Dv2_DhS_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> nofpclass(nan inf) [[I]], <2 x half> nofpclass(nan inf) [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.reflect.v2f16(<2 x half> [[I]], <2 x half> [[N]]) // SPVCHECK-NEXT: ret <2 x half> [[SPV_REFLECT_I]] // half2 test_reflect_half2(half2 I, half2 N) { @@ -52,7 +52,7 @@ half2 test_reflect_half2(half2 I, half2 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( // CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> [[I]], <3 x half> [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x half> poison, half [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <3 x i32> zeroinitializer @@ -63,7 +63,7 @@ half2 test_reflect_half2(half2 I, half2 N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_reflect_half3Dv3_DhS_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> nofpclass(nan inf) [[I]], <3 x half> nofpclass(nan inf) [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.reflect.v3f16(<3 x half> [[I]], <3 x half> [[N]]) // SPVCHECK-NEXT: ret <3 x half> [[SPV_REFLECT_I]] // half3 test_reflect_half3(half3 I, half3 N) { @@ -73,7 +73,7 @@ half3 test_reflect_half3(half3 I, half3 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( // CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half 
@llvm.dx.fdot.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> [[I]], <4 x half> [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[HLSL_DOT_I]], 0xH4000 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> zeroinitializer @@ -84,7 +84,7 @@ half3 test_reflect_half3(half3 I, half3 N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_reflect_half4Dv4_DhS_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> nofpclass(nan inf) [[I]], <4 x half> nofpclass(nan inf) [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.reflect.v4f16(<4 x half> [[I]], <4 x half> [[N]]) // SPVCHECK-NEXT: ret <4 x half> [[SPV_REFLECT_I]] // half4 test_reflect_half4(half4 I, half4 N) { @@ -116,7 +116,7 @@ float test_reflect_float(float I, float N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( // CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> [[I]], <2 x float> [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer @@ -127,7 +127,7 @@ float test_reflect_float(float I, float N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_reflect_float2Dv2_fS_( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> nofpclass(nan inf) [[I]], <2 x float> nofpclass(nan inf) [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.reflect.v2f32(<2 x float> [[I]], <2 x float> [[N]]) // SPVCHECK-NEXT: ret <2 x float> [[SPV_REFLECT_I]] // float2 test_reflect_float2(float2 I, float2 N) { @@ -137,7 +137,7 @@ float2 test_reflect_float2(float2 I, float2 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( // CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp 
afn float @llvm.dx.fdot.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> [[I]], <3 x float> [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x float> poison, float [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <3 x i32> zeroinitializer @@ -148,7 +148,7 @@ float2 test_reflect_float2(float2 I, float2 N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_reflect_float3Dv3_fS_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> nofpclass(nan inf) [[I]], <3 x float> nofpclass(nan inf) [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.reflect.v3f32(<3 x float> [[I]], <3 x float> [[N]]) // SPVCHECK-NEXT: ret <3 x float> [[SPV_REFLECT_I]] // float3 test_reflect_float3(float3 I, float3 N) { @@ -158,7 +158,7 @@ float3 test_reflect_float3(float3 I, float3 N) { // CHECK-LABEL: define noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( // CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]]) +// CHECK-NEXT: [[HLSL_DOT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> [[I]], <4 x float> [[N]]) // CHECK-NEXT: [[DOTSCALAR:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[HLSL_DOT_I]], 2.000000e+00 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float [[DOTSCALAR]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> zeroinitializer @@ -169,7 +169,7 @@ float3 test_reflect_float3(float3 I, float3 N) { // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_reflect_float4Dv4_fS_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> nofpclass(nan inf) [[I]], <4 x float> nofpclass(nan inf) [[N]]) +// SPVCHECK-NEXT: [[SPV_REFLECT_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.reflect.v4f32(<4 x float> [[I]], <4 x float> [[N]]) // SPVCHECK-NEXT: ret <4 x float> [[SPV_REFLECT_I]] // float4 test_reflect_float4(float4 I, float4 N) { diff --git a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl index d3e5c1059029c..f2328c7330e6c 100644 --- a/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl +++ b/clang/test/CodeGenHLSL/builtins/smoothstep.hlsl @@ -22,7 +22,7 @@ // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) half 
@_Z20test_smoothstep_halfDhDhDh( // SPVCHECK-SAME: half noundef nofpclass(nan inf) [[MIN:%.*]], half noundef nofpclass(nan inf) [[MAX:%.*]], half noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.smoothstep.f16(half nofpclass(nan inf) [[MIN]], half nofpclass(nan inf) [[MAX]], half nofpclass(nan inf) [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.smoothstep.f16(half [[MIN]], half [[MAX]], half [[X]]) // SPVCHECK-NEXT: ret half [[SPV_SMOOTHSTEP_I]] // half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, Max, X); } @@ -43,7 +43,7 @@ half test_smoothstep_half(half Min, half Max, half X) { return smoothstep(Min, M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x half> @_Z21test_smoothstep_half2Dv2_DhS_S_( // SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.smoothstep.v2f16(<2 x half> nofpclass(nan inf) [[MIN]], <2 x half> nofpclass(nan inf) [[MAX]], <2 x half> nofpclass(nan inf) [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.smoothstep.v2f16(<2 x half> [[MIN]], <2 x half> [[MAX]], <2 x half> [[X]]) // SPVCHECK-NEXT: ret <2 x half> [[SPV_SMOOTHSTEP_I]] // half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(Min, Max, X); } @@ -64,7 +64,7 @@ half2 test_smoothstep_half2(half2 Min, half2 Max, half2 X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x half> @_Z21test_smoothstep_half3Dv3_DhS_S_( // SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.smoothstep.v3f16(<3 x half> nofpclass(nan inf) [[MIN]], <3 x half> nofpclass(nan inf) [[MAX]], <3 x half> nofpclass(nan inf) [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.smoothstep.v3f16(<3 x half> [[MIN]], <3 x half> [[MAX]], <3 x half> [[X]]) // SPVCHECK-NEXT: ret <3 x half> [[SPV_SMOOTHSTEP_I]] // half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(Min, Max, X); } @@ -85,7 +85,7 @@ half3 test_smoothstep_half3(half3 Min, half3 Max, half3 X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <4 x half> @_Z21test_smoothstep_half4Dv4_DhS_S_( // SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x half> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x half> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.smoothstep.v4f16(<4 x half> nofpclass(nan inf) [[MIN]], <4 x half> nofpclass(nan inf) [[MAX]], <4 x half> nofpclass(nan inf) [[X]]) +// 
SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.smoothstep.v4f16(<4 x half> [[MIN]], <4 x half> [[MAX]], <4 x half> [[X]]) // SPVCHECK-NEXT: ret <4 x half> [[SPV_SMOOTHSTEP_I]] // half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(Min, Max, X); } @@ -106,7 +106,7 @@ half4 test_smoothstep_half4(half4 Min, half4 Max, half4 X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) float @_Z21test_smoothstep_floatfff( // SPVCHECK-SAME: float noundef nofpclass(nan inf) [[MIN:%.*]], float noundef nofpclass(nan inf) [[MAX:%.*]], float noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.smoothstep.f32(float nofpclass(nan inf) [[MIN]], float nofpclass(nan inf) [[MAX]], float nofpclass(nan inf) [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.smoothstep.f32(float [[MIN]], float [[MAX]], float [[X]]) // SPVCHECK-NEXT: ret float [[SPV_SMOOTHSTEP_I]] // float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(Min, Max, X); } @@ -127,7 +127,7 @@ float test_smoothstep_float(float Min, float Max, float X) { return smoothstep(M // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <2 x float> @_Z22test_smoothstep_float2Dv2_fS_S_( // SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <2 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <2 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.smoothstep.v2f32(<2 x float> nofpclass(nan inf) [[MIN]], <2 x float> nofpclass(nan inf) [[MAX]], <2 x float> nofpclass(nan inf) [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.smoothstep.v2f32(<2 x float> [[MIN]], <2 x float> [[MAX]], <2 x float> [[X]]) // SPVCHECK-NEXT: ret <2 x float> [[SPV_SMOOTHSTEP_I]] // float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smoothstep(Min, Max, X); } @@ -148,7 +148,7 @@ float2 test_smoothstep_float2(float2 Min, float2 Max, float2 X) { return smooths // SPVCHECK-LABEL: define spir_func noundef nofpclass(nan inf) <3 x float> @_Z22test_smoothstep_float3Dv3_fS_S_( // SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <3 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <3 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.smoothstep.v3f32(<3 x float> nofpclass(nan inf) [[MIN]], <3 x float> nofpclass(nan inf) [[MAX]], <3 x float> nofpclass(nan inf) [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.smoothstep.v3f32(<3 x float> [[MIN]], <3 x float> [[MAX]], <3 x float> [[X]]) // SPVCHECK-NEXT: ret <3 x float> [[SPV_SMOOTHSTEP_I]] // float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smoothstep(Min, Max, X); } @@ -169,7 +169,7 @@ float3 test_smoothstep_float3(float3 Min, float3 Max, float3 X) { return smooths // SPVCHECK-LABEL: define spir_func noundef 
nofpclass(nan inf) <4 x float> @_Z22test_smoothstep_float4Dv4_fS_S_( // SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[MIN:%.*]], <4 x float> noundef nofpclass(nan inf) [[MAX:%.*]], <4 x float> noundef nofpclass(nan inf) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // SPVCHECK-NEXT: [[ENTRY:.*:]] -// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.smoothstep.v4f32(<4 x float> nofpclass(nan inf) [[MIN]], <4 x float> nofpclass(nan inf) [[MAX]], <4 x float> nofpclass(nan inf) [[X]]) +// SPVCHECK-NEXT: [[SPV_SMOOTHSTEP_I:%.*]] = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.smoothstep.v4f32(<4 x float> [[MIN]], <4 x float> [[MAX]], <4 x float> [[X]]) // SPVCHECK-NEXT: ret <4 x float> [[SPV_SMOOTHSTEP_I]] // float4 test_smoothstep_float4(float4 Min, float4 Max, float4 X) { return smoothstep(Min, Max, X); } diff --git a/clang/test/Headers/__clang_hip_cmath.hip b/clang/test/Headers/__clang_hip_cmath.hip index 7d812fd0265a6..0c9ff4cdd7808 100644 --- a/clang/test/Headers/__clang_hip_cmath.hip +++ b/clang/test/Headers/__clang_hip_cmath.hip @@ -24,7 +24,7 @@ // // FINITEONLY-LABEL: @test_fma_f16( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef half @llvm.fma.f16(half nofpclass(nan inf) [[X:%.*]], half nofpclass(nan inf) [[Y:%.*]], half nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef half @llvm.fma.f16(half [[X:%.*]], half [[Y:%.*]], half [[Z:%.*]]) // FINITEONLY-NEXT: ret half [[TMP0]] // extern "C" __device__ _Float16 test_fma_f16(_Float16 x, _Float16 y, @@ -34,12 +34,12 @@ extern "C" __device__ _Float16 test_fma_f16(_Float16 x, _Float16 y, // DEFAULT-LABEL: @test_pow_f16( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef half @__ocml_pown_f16(half noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR9:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I:%.*]] = tail call contract noundef half @__ocml_pown_f16(half noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]] // DEFAULT-NEXT: ret half [[CALL_I]] // // FINITEONLY-LABEL: @test_pow_f16( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) half @__ocml_pown_f16(half noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR9:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) half @__ocml_pown_f16(half noundef nofpclass(nan inf) [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR7:[0-9]+]] // FINITEONLY-NEXT: ret half [[CALL_I]] // extern "C" __device__ _Float16 test_pow_f16(_Float16 x, int y) { @@ -53,7 +53,7 @@ extern "C" __device__ _Float16 test_pow_f16(_Float16 x, int y) { // // FINITEONLY-LABEL: @test_fabs_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // extern "C" __device__ float test_fabs_f32(float x) { @@ -62,12 +62,12 @@ extern "C" __device__ float test_fabs_f32(float x) { // DEFAULT-LABEL: @test_sin_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR10:[0-9]+]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float 
@__ocml_sin_f32(float noundef [[X:%.*]]) #[[ATTR8:[0-9]+]] // DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_sin_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR10:[0-9]+]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_sin_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8:[0-9]+]] // FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_sin_f32(float x) { @@ -76,12 +76,12 @@ extern "C" __device__ float test_sin_f32(float x) { // DEFAULT-LABEL: @test_cos_f32( // DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR10]] +// DEFAULT-NEXT: [[CALL_I1:%.*]] = tail call contract noundef float @__ocml_cos_f32(float noundef [[X:%.*]]) #[[ATTR8]] // DEFAULT-NEXT: ret float [[CALL_I1]] // // FINITEONLY-LABEL: @test_cos_f32( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR10]] +// FINITEONLY-NEXT: [[CALL_I1:%.*]] = tail call nnan ninf contract noundef nofpclass(nan inf) float @__ocml_cos_f32(float noundef nofpclass(nan inf) [[X:%.*]]) #[[ATTR8]] // FINITEONLY-NEXT: ret float [[CALL_I1]] // extern "C" __device__ float test_cos_f32(float x) { @@ -97,46 +97,10 @@ struct user_bfloat16 { }; namespace user_namespace { -// DEFAULT-LABEL: @_ZN14user_namespace3fmaE13user_bfloat16S0_S0_( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: ret void -// -// FINITEONLY-LABEL: @_ZN14user_namespace3fmaE13user_bfloat16S0_S0_( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: ret void -// __device__ user_bfloat16 fma(const user_bfloat16 a, const user_bfloat16 b, const user_bfloat16 c) { return a; } -// DEFAULT-LABEL: @_ZN14user_namespace8test_fmaEv( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[A:%.*]] = alloca [[STRUCT_USER_BFLOAT16:%.*]], align 1, addrspace(5) -// DEFAULT-NEXT: [[B:%.*]] = alloca [[STRUCT_USER_BFLOAT16]], align 1, addrspace(5) -// DEFAULT-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr -// DEFAULT-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11:[0-9]+]] -// DEFAULT-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[A_ASCAST]], float noundef 1.000000e+00) #[[ATTR10]] -// DEFAULT-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] -// DEFAULT-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[B_ASCAST]], float noundef 2.000000e+00) #[[ATTR10]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] -// DEFAULT-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11]] -// DEFAULT-NEXT: ret void -// -// FINITEONLY-LABEL: @_ZN14user_namespace8test_fmaEv( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[A:%.*]] = alloca [[STRUCT_USER_BFLOAT16:%.*]], align 1, addrspace(5) -// FINITEONLY-NEXT: [[B:%.*]] = alloca [[STRUCT_USER_BFLOAT16]], align 1, addrspace(5) -// FINITEONLY-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr -// FINITEONLY-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B]] to ptr -// 
FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11:[0-9]+]] -// FINITEONLY-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[A_ASCAST]], float noundef nofpclass(nan inf) 1.000000e+00) #[[ATTR10]] -// FINITEONLY-NEXT: call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] -// FINITEONLY-NEXT: call void @_ZN13user_bfloat16C1Ef(ptr noundef nonnull align 1 dereferenceable(1) [[B_ASCAST]], float noundef nofpclass(nan inf) 2.000000e+00) #[[ATTR10]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[B]]) #[[ATTR11]] -// FINITEONLY-NEXT: call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) [[A]]) #[[ATTR11]] -// FINITEONLY-NEXT: ret void -// __global__ void test_fma() { user_bfloat16 a = 1.0f, b = 2.0f; fma(a, b, b); diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index df1cd716342a5..e879fec0ebe5a 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -842,7 +842,7 @@ extern "C" __device__ double test_cbrt(double x) { // // FINITEONLY-LABEL: @test_ceilf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ceil.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ceil.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_ceilf( @@ -866,7 +866,7 @@ extern "C" __device__ float test_ceilf(float x) { // // FINITEONLY-LABEL: @test_ceil( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ceil.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ceil.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_ceil( @@ -890,7 +890,7 @@ extern "C" __device__ double test_ceil(double x) { // // FINITEONLY-LABEL: @test_copysignf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.copysign.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.copysign.f32(float [[X:%.*]], float [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_copysignf( @@ -914,7 +914,7 @@ extern "C" __device__ float test_copysignf(float x, float y) { // // FINITEONLY-LABEL: @test_copysign( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.copysign.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.copysign.f64(double [[X:%.*]], double [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_copysign( @@ -1274,7 +1274,7 @@ extern "C" __device__ double test_erfinv(double x) { // // FINITEONLY-LABEL: @test_exp10f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp10.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp10.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_exp10f( @@ -1322,7 +1322,7 @@ extern "C" __device__ double test_exp10(double 
x) { // // FINITEONLY-LABEL: @test_exp2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp2.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp2.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_exp2f( @@ -1370,7 +1370,7 @@ extern "C" __device__ double test_exp2(double x) { // // FINITEONLY-LABEL: @test_expf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.exp.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_expf( @@ -1466,7 +1466,7 @@ extern "C" __device__ double test_expm1(double x) { // // FINITEONLY-LABEL: @test_fabsf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fabs.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fabsf( @@ -1490,7 +1490,7 @@ extern "C" __device__ float test_fabsf(float x) { // // FINITEONLY-LABEL: @test_fabs( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fabs.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fabs.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fabs( @@ -1586,7 +1586,7 @@ extern "C" __device__ float test_fdividef(float x, float y) { // // FINITEONLY-LABEL: @test_floorf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.floor.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.floor.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_floorf( @@ -1610,7 +1610,7 @@ extern "C" __device__ float test_floorf(float x) { // // FINITEONLY-LABEL: @test_floor( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.floor.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.floor.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_floor( @@ -1634,7 +1634,7 @@ extern "C" __device__ double test_floor(double x) { // // FINITEONLY-LABEL: @test_fmaf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]], float nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fmaf( @@ -1658,7 +1658,7 @@ extern "C" __device__ float test_fmaf(float x, float y, float z) { // // FINITEONLY-LABEL: @test_fma( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) 
[[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fma( @@ -1682,7 +1682,7 @@ extern "C" __device__ double test_fma(double x, double y, double z) { // // FINITEONLY-LABEL: @test_fma_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fma_rn( @@ -1706,7 +1706,7 @@ extern "C" __device__ double test_fma_rn(double x, double y, double z) { // // FINITEONLY-LABEL: @test_fmaxf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fmaxf( @@ -1730,7 +1730,7 @@ extern "C" __device__ float test_fmaxf(float x, float y) { // // FINITEONLY-LABEL: @test_fmax( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fmax( @@ -1754,7 +1754,7 @@ extern "C" __device__ double test_fmax(double x, double y) { // // FINITEONLY-LABEL: @test_fminf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_fminf( @@ -1778,7 +1778,7 @@ extern "C" __device__ float test_fminf(float x, float y) { // // FINITEONLY-LABEL: @test_fmin( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_fmin( @@ -1843,29 +1843,13 @@ extern "C" __device__ double test_fmod(double x, double y) { return fmod(x, y); } -// DEFAULT-LABEL: @test_frexpf( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) -// DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] -// DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 -// DEFAULT-NEXT: ret float [[TMP2]] -// -// FINITEONLY-LABEL: @test_frexpf( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { float, 
i32 } @llvm.frexp.f32.i32(float nofpclass(nan inf) [[X:%.*]]) -// FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] -// FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 -// FINITEONLY-NEXT: ret float [[TMP2]] -// -// APPROX-LABEL: @test_frexpf( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) -// APPROX-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 -// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] -// APPROX-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 -// APPROX-NEXT: ret float [[TMP2]] +// CHECK-LABEL: @test_frexpf( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { float, i32 } @llvm.frexp.f32.i32(float [[X:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { float, i32 } [[TMP0]], 1 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12:![0-9]+]] +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { float, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret float [[TMP2]] // // AMDGCNSPIRV-LABEL: @test_frexpf( // AMDGCNSPIRV-NEXT: entry: @@ -1879,29 +1863,13 @@ extern "C" __device__ float test_frexpf(float x, int* y) { return frexpf(x, y); } -// DEFAULT-LABEL: @test_frexp( -// DEFAULT-NEXT: entry: -// DEFAULT-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) -// DEFAULT-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// DEFAULT-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] -// DEFAULT-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 -// DEFAULT-NEXT: ret double [[TMP2]] -// -// FINITEONLY-LABEL: @test_frexp( -// FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double nofpclass(nan inf) [[X:%.*]]) -// FINITEONLY-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// FINITEONLY-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] -// FINITEONLY-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 -// FINITEONLY-NEXT: ret double [[TMP2]] -// -// APPROX-LABEL: @test_frexp( -// APPROX-NEXT: entry: -// APPROX-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) -// APPROX-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 -// APPROX-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] -// APPROX-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 -// APPROX-NEXT: ret double [[TMP2]] +// CHECK-LABEL: @test_frexp( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call { double, i32 } @llvm.frexp.f64.i32(double [[X:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { double, i32 } [[TMP0]], 1 +// CHECK-NEXT: store i32 [[TMP1]], ptr [[Y:%.*]], align 4, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { double, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret double [[TMP2]] // // AMDGCNSPIRV-LABEL: @test_frexp( // AMDGCNSPIRV-NEXT: entry: @@ -2554,7 +2522,7 @@ extern "C" __device__ double test_jn(int x, double y) { // // FINITEONLY-LABEL: @test_ldexpf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) // 
FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_ldexpf( @@ -2578,7 +2546,7 @@ extern "C" __device__ float test_ldexpf(float x, int y) { // // FINITEONLY-LABEL: @test_ldexp( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_ldexp( @@ -2651,7 +2619,7 @@ extern "C" __device__ double test_lgamma(double x) { // // FINITEONLY-LABEL: @test_llrintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2679,7 +2647,7 @@ extern "C" __device__ long long int test_llrintf(float x) { // // FINITEONLY-LABEL: @test_llrint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2707,7 +2675,7 @@ extern "C" __device__ long long int test_llrint(double x) { // // FINITEONLY-LABEL: @test_llroundf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2735,7 +2703,7 @@ extern "C" __device__ long long int test_llroundf(float x) { // // FINITEONLY-LABEL: @test_llround( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -2762,7 +2730,7 @@ extern "C" __device__ long long int test_llround(double x) { // // FINITEONLY-LABEL: @test_log10f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_log10f( @@ -2858,7 +2826,7 @@ extern "C" __device__ double test_log1p(double x) { // // FINITEONLY-LABEL: @test_log2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log2.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log2.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_log2f( @@ -2954,7 +2922,7 @@ extern "C" __device__ double test_logb(double x) { // // FINITEONLY-LABEL: @test_logf( // FINITEONLY-NEXT: entry: 
-// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_logf( @@ -2979,7 +2947,7 @@ extern "C" __device__ float test_logf(float x) { // // FINITEONLY-LABEL: @test_lrintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.rint.f32(float [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -3007,7 +2975,7 @@ extern "C" __device__ long int test_lrintf(float x) { // // FINITEONLY-LABEL: @test_lrint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.rint.f64(double [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -3035,7 +3003,7 @@ extern "C" __device__ long int test_lrint(double x) { // // FINITEONLY-LABEL: @test_lroundf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract float @llvm.round.f32(float [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi float [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -3063,7 +3031,7 @@ extern "C" __device__ long int test_lroundf(float x) { // // FINITEONLY-LABEL: @test_lround( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract double @llvm.round.f64(double [[X:%.*]]) // FINITEONLY-NEXT: [[CONV_I:%.*]] = fptosi double [[TMP0]] to i64 // FINITEONLY-NEXT: ret i64 [[CONV_I]] // @@ -3827,7 +3795,7 @@ extern "C" __device__ double test_nan_fill() { // // FINITEONLY-LABEL: @test_nearbyintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.nearbyint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.nearbyint.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_nearbyintf( @@ -3851,7 +3819,7 @@ extern "C" __device__ float test_nearbyintf(float x) { // // FINITEONLY-LABEL: @test_nearbyint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.nearbyint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.nearbyint.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_nearbyint( @@ -4613,7 +4581,7 @@ extern "C" __device__ double test_rhypot(double x, double y) { // // FINITEONLY-LABEL: @test_rintf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.rint.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.rint.f32(float [[X:%.*]]) // 
FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_rintf( @@ -4637,7 +4605,7 @@ extern "C" __device__ float test_rintf(float x) { // // FINITEONLY-LABEL: @test_rint( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.rint.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.rint.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_rint( @@ -4925,7 +4893,7 @@ extern "C" __device__ double test_rnorm4d(double x, double y, double z, double w // // FINITEONLY-LABEL: @test_roundf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.round.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.round.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_roundf( @@ -4949,7 +4917,7 @@ extern "C" __device__ float test_roundf(float x) { // // FINITEONLY-LABEL: @test_round( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.round.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.round.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_round( @@ -5025,7 +4993,7 @@ extern "C" __device__ double test_rsqrt(double x) { // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) // FINITEONLY-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[CONV_I]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_scalblnf( @@ -5057,7 +5025,7 @@ extern "C" __device__ float test_scalblnf(float x, long int y) { // FINITEONLY-NEXT: entry: // FINITEONLY-NEXT: [[SPEC_STORE_SELECT_I:%.*]] = tail call i64 @llvm.smax.i64(i64 [[Y:%.*]], i64 -2147483648) // FINITEONLY-NEXT: [[CONV_I:%.*]] = trunc i64 [[SPEC_STORE_SELECT_I]] to i32 -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan inf) [[X:%.*]], i32 [[CONV_I]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[CONV_I]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_scalbln( @@ -5085,7 +5053,7 @@ extern "C" __device__ double test_scalbln(double x, long int y) { // // FINITEONLY-LABEL: @test_scalbnf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float nofpclass(nan inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_scalbnf( @@ -5109,7 +5077,7 @@ extern "C" __device__ float test_scalbnf(float x, int y) { // // FINITEONLY-LABEL: @test_scalbn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double nofpclass(nan 
inf) [[X:%.*]], i32 [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.ldexp.f64.i32(double [[X:%.*]], i32 [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_scalbn( @@ -5459,7 +5427,7 @@ extern "C" __device__ double test_sinpi(double x) { // // FINITEONLY-LABEL: @test_sqrtf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.sqrt.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_sqrtf( @@ -5483,7 +5451,7 @@ extern "C" __device__ float test_sqrtf(float x) { // // FINITEONLY-LABEL: @test_sqrt( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_sqrt( @@ -5651,7 +5619,7 @@ extern "C" __device__ double test_tgamma(double x) { // // FINITEONLY-LABEL: @test_truncf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.trunc.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.trunc.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_truncf( @@ -5675,7 +5643,7 @@ extern "C" __device__ float test_truncf(float x) { // // FINITEONLY-LABEL: @test_trunc( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.trunc.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.trunc.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_trunc( @@ -6195,7 +6163,7 @@ extern "C" __device__ float test___fdividef(float x, float y) { // // FINITEONLY-LABEL: @test__fmaf_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]], float nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.fma.f32(float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test__fmaf_rn( @@ -6267,7 +6235,7 @@ extern "C" __device__ float test___frcp_rn(float x) { // // FINITEONLY-LABEL: @test___frsqrt_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.rsq.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.rsq.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test___frsqrt_rn( @@ -6339,7 +6307,7 @@ extern "C" __device__ float test___fsub_rn(float x, float y) { // // FINITEONLY-LABEL: @test___log10f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log10.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // 
APPROX-LABEL: @test___log10f( @@ -6363,7 +6331,7 @@ extern "C" __device__ float test___log10f(float x) { // // FINITEONLY-LABEL: @test___log2f( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.log.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.amdgcn.log.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test___log2f( @@ -6387,7 +6355,7 @@ extern "C" __device__ float test___log2f(float x) { // // FINITEONLY-LABEL: @test___logf( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.log.f32(float [[X:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test___logf( @@ -6663,7 +6631,7 @@ extern "C" __device__ double test___drcp_rn(double x) { // // FINITEONLY-LABEL: @test___dsqrt_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double nofpclass(nan inf) [[X:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.sqrt.f64(double [[X:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test___dsqrt_rn( @@ -6687,7 +6655,7 @@ extern "C" __device__ double test___dsqrt_rn(double x) { // // FINITEONLY-LABEL: @test__fma_rn( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]], double nofpclass(nan inf) [[Z:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.fma.f64(double [[X:%.*]], double [[Y:%.*]], double [[Z:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test__fma_rn( @@ -6711,7 +6679,7 @@ extern "C" __device__ double test__fma_rn(double x, double y, double z) { // // FINITEONLY-LABEL: @test_float_min( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_float_min( @@ -6735,7 +6703,7 @@ extern "C" __device__ float test_float_min(float x, float y) { // // FINITEONLY-LABEL: @test_float_max( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float nofpclass(nan inf) [[X:%.*]], float nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) // FINITEONLY-NEXT: ret float [[TMP0]] // // APPROX-LABEL: @test_float_max( @@ -6759,7 +6727,7 @@ extern "C" __device__ float test_float_max(float x, float y) { // // FINITEONLY-LABEL: @test_double_min( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] 
// // APPROX-LABEL: @test_double_min( @@ -6783,7 +6751,7 @@ extern "C" __device__ double test_double_min(double x, double y) { // // FINITEONLY-LABEL: @test_double_max( // FINITEONLY-NEXT: entry: -// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double nofpclass(nan inf) [[X:%.*]], double nofpclass(nan inf) [[Y:%.*]]) +// FINITEONLY-NEXT: [[TMP0:%.*]] = tail call nnan ninf contract noundef double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) // FINITEONLY-NEXT: ret double [[TMP0]] // // APPROX-LABEL: @test_double_max( diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index c65bf16b6a937..5beee1f681b81 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1381,8 +1381,7 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, // behavior was just using a poison value. static const Attribute::AttrKind ExactAttrsToPropagate[] = { Attribute::Dereferenceable, Attribute::DereferenceableOrNull, - Attribute::NonNull, Attribute::NoFPClass, - Attribute::Alignment, Attribute::Range}; + Attribute::NonNull, Attribute::Alignment, Attribute::Range}; for (unsigned I = 0, E = CB.arg_size(); I < E; ++I) { ValidObjParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); @@ -1464,13 +1463,6 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, NewAB.addRangeAttr(CombinedRange); } } - - if (FPClassTest ExistingNoFP = AL.getParamNoFPClass(I)) { - FPClassTest NewNoFP = - NewAB.getAttribute(Attribute::NoFPClass).getNoFPClass(); - NewAB.addNoFPClassAttr(ExistingNoFP | NewNoFP); - } - AL = AL.addParamAttributes(Context, I, NewAB); } else if (NewInnerCB->getArgOperand(I)->getType()->isPointerTy()) { // Check if the underlying value for the parameter is an argument. 
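The two hunks above revert the inliner's `nofpclass` handling: the attribute is dropped from `ExactAttrsToPropagate`, and the deleted block no longer unions a propagated mask into a call site's existing one. As a reading aid, here is a minimal sketch of the union semantics the deleted code implemented, using an illustrative bitmask in place of `llvm::FPClassTest` (the enumerator values here are assumptions, not LLVM's):

  #include <cstdint>

  // Illustrative stand-in for llvm::FPClassTest: one bit per FP class.
  // The real enum defines many more classes; two suffice for the sketch.
  enum FPClassTest : uint32_t {
    fcNone = 0,
    fcNan = 1u << 0, // any NaN
    fcInf = 1u << 1, // any infinity
  };

  // The deleted block computed `ExistingNoFP | NewNoFP`: excluding the
  // union of both class sets is the stronger guarantee, so a call site
  // already known to be nofpclass(inf) that receives a propagated
  // nofpclass(nan) becomes nofpclass(nan inf), as the removed
  // @prop_nofpclass_union test below expected.
  inline FPClassTest unionNoFPClass(FPClassTest ExistingNoFP, FPClassTest NewNoFP) {
    return static_cast<FPClassTest>(ExistingNoFP | NewNoFP);
  }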
diff --git a/llvm/test/Transforms/Inline/access-attributes-prop.ll b/llvm/test/Transforms/Inline/access-attributes-prop.ll
index 5a102d14b5c90..5bf845d5ba94b 100644
--- a/llvm/test/Transforms/Inline/access-attributes-prop.ll
+++ b/llvm/test/Transforms/Inline/access-attributes-prop.ll
@@ -750,47 +750,3 @@ define void @prop_range_direct(i32 %v) {
   call void @foo4(i32 range(i32 1, 11) %v)
   ret void
 }
-
-declare void @bar_fp(float %x)
-
-define void @foo_fp(float %x) {
-; CHECK-LABEL: define {{[^@]+}}@foo_fp
-; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT: call void @bar_fp(float [[X]])
-; CHECK-NEXT: ret void
-;
-  call void @bar_fp(float %x)
-  ret void
-}
-
-define void @prop_param_nofpclass(float %x) {
-; CHECK-LABEL: define {{[^@]+}}@prop_param_nofpclass
-; CHECK-SAME: (float [[X:%.*]]) {
-; CHECK-NEXT: call void @bar_fp(float nofpclass(nan inf) [[X]])
-; CHECK-NEXT: ret void
-;
-  call void @foo_fp(float nofpclass(nan inf) %x)
-  ret void
-}
-
-declare void @func_fp(float)
-
-define void @union_nofpclass(float %v) {
-; CHECK-LABEL: define {{[^@]+}}@union_nofpclass
-; CHECK-SAME: (float [[V:%.*]]) {
-; CHECK-NEXT: call void @func_fp(float nofpclass(inf) [[V]])
-; CHECK-NEXT: ret void
-;
-  call void @func_fp(float nofpclass(inf) %v)
-  ret void
-}
-
-define void @prop_nofpclass_union(float %v) {
-; CHECK-LABEL: define {{[^@]+}}@prop_nofpclass_union
-; CHECK-SAME: (float [[V:%.*]]) {
-; CHECK-NEXT: call void @func_fp(float nofpclass(nan inf) [[V]])
-; CHECK-NEXT: ret void
-;
-  call void @union_nofpclass(float nofpclass(nan) %v)
-  ret void
-}

From 16d10546d29355f796cbdb307f0f89d6679d14a0 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Tue, 8 Apr 2025 18:16:18 +0200
Subject: [PATCH 1014/1029] [libc++] Remove _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS (#111964)

This macro isn't required if we define all the functions inline. In fact,
quite a few of the marked functions have already been inlined. This patch
basically only moves code around and adds `_LIBCPP_HIDE_FROM_ABI` to the
places where it's been missing so far.

This also removes inlining hints, since it drops `inline` in some places,
but that shouldn't make much of a difference. The functions tend to be
either really small, so they should be inlined anyway, or big enough that
they shouldn't be inlined even with an inlinehint.
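The pattern applied throughout the headers below is mechanical and looks roughly like this (an illustrative sketch, not libc++ code: `Widget` is made up, and `_LIBCPP_HIDE_FROM_ABI` is reduced here to a plain visibility attribute):

  // Before: declared in the class behind the macro, defined out of line.
  //
  //   class Widget {
  //   public:
  //     template <class _Tp>
  //     _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS void set(_Tp __v);
  //   };
  //
  //   template <class _Tp>
  //   void Widget::set(_Tp __v) { /* ... */ }

  #ifndef _LIBCPP_HIDE_FROM_ABI
  #  define _LIBCPP_HIDE_FROM_ABI __attribute__((__visibility__("hidden")))
  #endif

  // After: the definition moves into the class body and carries
  // _LIBCPP_HIDE_FROM_ABI directly, so implicit instantiations stay out
  // of the exported ABI without a dedicated macro.
  class Widget {
  public:
    template <class _Tp>
    _LIBCPP_HIDE_FROM_ABI void set(_Tp __v) { __v_ = static_cast<int>(__v); }

  private:
    int __v_ = 0;
  };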
--- libcxx/.clang-format | 1 - libcxx/docs/DesignDocs/VisibilityMacros.rst | 29 - .../__condition_variable/condition_variable.h | 182 +++--- libcxx/include/__config | 8 - libcxx/include/__locale | 32 +- libcxx/include/__thread/thread.h | 127 ++-- libcxx/include/condition_variable | 66 +-- libcxx/include/future | 48 +- libcxx/include/locale | 547 +++++++++--------- libcxx/include/mutex | 76 ++- libcxx/include/shared_mutex | 100 ++-- libcxx/include/string | 305 ++++------ 12 files changed, 672 insertions(+), 849 deletions(-) diff --git a/libcxx/.clang-format b/libcxx/.clang-format index a6154c7c4a2bc..e25196a05c92c 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -37,7 +37,6 @@ AttributeMacros: [ '_LIBCPP_HIDDEN', '_LIBCPP_HIDE_FROM_ABI_AFTER_V1', '_LIBCPP_HIDE_FROM_ABI', - '_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS', '_LIBCPP_NO_SANITIZE', '_LIBCPP_NO_UNIQUE_ADDRESS', '_LIBCPP_NOALIAS', diff --git a/libcxx/docs/DesignDocs/VisibilityMacros.rst b/libcxx/docs/DesignDocs/VisibilityMacros.rst index 83a9a62942bc9..e37e712014c08 100644 --- a/libcxx/docs/DesignDocs/VisibilityMacros.rst +++ b/libcxx/docs/DesignDocs/VisibilityMacros.rst @@ -105,35 +105,6 @@ Visibility Macros the extern template declaration) as exported on Windows, as discussed above. On all other platforms, this macro has an empty definition. -**_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS** - Mark a symbol as hidden so it will not be exported from shared libraries. This - is intended specifically for method templates of either classes marked with - `_LIBCPP_TYPE_VIS` or classes with an extern template instantiation - declaration marked with `_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS`. - - When building libc++ with hidden visibility, we want explicit template - instantiations to export members, which is consistent with existing Windows - behavior. We also want classes annotated with `_LIBCPP_TYPE_VIS` to export - their members, which is again consistent with existing Windows behavior. - Both these changes are necessary for clients to be able to link against a - libc++ DSO built with hidden visibility without encountering missing symbols. - - An unfortunate side effect, however, is that method templates of classes - either marked `_LIBCPP_TYPE_VIS` or with extern template instantiation - declarations marked with `_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS` also get default - visibility when instantiated. These methods are often implicitly instantiated - inside other libraries which use the libc++ headers, and will therefore end up - being exported from those libraries, since those implicit instantiations will - receive default visibility. This is not acceptable for libraries that wish to - control their visibility, and led to PR30642. - - Consequently, all such problematic method templates are explicitly marked - either hidden (via this macro) or inline, so that they don't leak into client - libraries. The problematic methods were found by running - `bad-visibility-finder `_ - against the libc++ headers after making `_LIBCPP_TYPE_VIS` and - `_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS` expand to default visibility. 
- Links ===== diff --git a/libcxx/include/__condition_variable/condition_variable.h b/libcxx/include/__condition_variable/condition_variable.h index 82ecb804669e6..1e8edd5dcb009 100644 --- a/libcxx/include/__condition_variable/condition_variable.h +++ b/libcxx/include/__condition_variable/condition_variable.h @@ -39,60 +39,6 @@ _LIBCPP_BEGIN_NAMESPACE_STD _LIBCPP_DECLARE_STRONG_ENUM(cv_status){no_timeout, timeout}; _LIBCPP_DECLARE_STRONG_ENUM_EPILOG(cv_status) -class _LIBCPP_EXPORTED_FROM_ABI condition_variable { - __libcpp_condvar_t __cv_ = _LIBCPP_CONDVAR_INITIALIZER; - -public: - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR condition_variable() _NOEXCEPT = default; - -# if _LIBCPP_HAS_TRIVIAL_CONDVAR_DESTRUCTION - ~condition_variable() = default; -# else - ~condition_variable(); -# endif - - condition_variable(const condition_variable&) = delete; - condition_variable& operator=(const condition_variable&) = delete; - - void notify_one() _NOEXCEPT; - void notify_all() _NOEXCEPT; - - void wait(unique_lock& __lk) _NOEXCEPT; - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS void wait(unique_lock& __lk, _Predicate __pred); - - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS cv_status - wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t); - - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool - wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t, _Predicate __pred); - - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS cv_status - wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d); - - template - bool _LIBCPP_HIDE_FROM_ABI - wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d, _Predicate __pred); - - typedef __libcpp_condvar_t* native_handle_type; - _LIBCPP_HIDE_FROM_ABI native_handle_type native_handle() { return &__cv_; } - -private: - void - __do_timed_wait(unique_lock& __lk, chrono::time_point) _NOEXCEPT; -# if _LIBCPP_HAS_COND_CLOCKWAIT - _LIBCPP_HIDE_FROM_ABI void - __do_timed_wait(unique_lock& __lk, chrono::time_point) _NOEXCEPT; -# endif - template - _LIBCPP_HIDE_FROM_ABI void - __do_timed_wait(unique_lock& __lk, chrono::time_point<_Clock, chrono::nanoseconds>) _NOEXCEPT; -}; -#endif // _LIBCPP_HAS_THREADS - template ::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI chrono::nanoseconds __safe_nanosecond_cast(chrono::duration<_Rep, _Period> __d) { using namespace chrono; @@ -140,64 +86,106 @@ inline _LIBCPP_HIDE_FROM_ABI chrono::nanoseconds __safe_nanosecond_cast(chrono:: return nanoseconds(__result); } -#if _LIBCPP_HAS_THREADS -template -void condition_variable::wait(unique_lock& __lk, _Predicate __pred) { - while (!__pred()) - wait(__lk); -} +class _LIBCPP_EXPORTED_FROM_ABI condition_variable { + __libcpp_condvar_t __cv_ = _LIBCPP_CONDVAR_INITIALIZER; -template -cv_status condition_variable::wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t) { - using namespace chrono; - using __clock_tp_ns = time_point<_Clock, nanoseconds>; +public: + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR condition_variable() _NOEXCEPT = default; - typename _Clock::time_point __now = _Clock::now(); - if (__t <= __now) - return cv_status::timeout; +# if _LIBCPP_HAS_TRIVIAL_CONDVAR_DESTRUCTION + ~condition_variable() = default; +# else + ~condition_variable(); +# endif - __clock_tp_ns __t_ns = __clock_tp_ns(std::__safe_nanosecond_cast(__t.time_since_epoch())); + condition_variable(const condition_variable&) = delete; + condition_variable& 
operator=(const condition_variable&) = delete; - __do_timed_wait(__lk, __t_ns); - return _Clock::now() < __t ? cv_status::no_timeout : cv_status::timeout; -} + void notify_one() _NOEXCEPT; + void notify_all() _NOEXCEPT; + + void wait(unique_lock& __lk) _NOEXCEPT; -template -bool condition_variable::wait_until( - unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t, _Predicate __pred) { - while (!__pred()) { - if (wait_until(__lk, __t) == cv_status::timeout) - return __pred(); + template + _LIBCPP_HIDE_FROM_ABI void wait(unique_lock& __lk, _Predicate __pred) { + while (!__pred()) + wait(__lk); } - return true; -} -template -cv_status condition_variable::wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d) { - using namespace chrono; - if (__d <= __d.zero()) - return cv_status::timeout; - using __ns_rep = nanoseconds::rep; - steady_clock::time_point __c_now = steady_clock::now(); + template + _LIBCPP_HIDE_FROM_ABI cv_status + wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t) { + using namespace chrono; + using __clock_tp_ns = time_point<_Clock, nanoseconds>; + + typename _Clock::time_point __now = _Clock::now(); + if (__t <= __now) + return cv_status::timeout; + + __clock_tp_ns __t_ns = __clock_tp_ns(std::__safe_nanosecond_cast(__t.time_since_epoch())); + + __do_timed_wait(__lk, __t_ns); + return _Clock::now() < __t ? cv_status::no_timeout : cv_status::timeout; + } + + template + _LIBCPP_HIDE_FROM_ABI bool + wait_until(unique_lock& __lk, const chrono::time_point<_Clock, _Duration>& __t, _Predicate __pred) { + while (!__pred()) { + if (wait_until(__lk, __t) == cv_status::timeout) + return __pred(); + } + return true; + } + + template + _LIBCPP_HIDE_FROM_ABI cv_status wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d) { + using namespace chrono; + if (__d <= __d.zero()) + return cv_status::timeout; + using __ns_rep = nanoseconds::rep; + steady_clock::time_point __c_now = steady_clock::now(); # if _LIBCPP_HAS_COND_CLOCKWAIT - using __clock_tp_ns = time_point; - __ns_rep __now_count_ns = std::__safe_nanosecond_cast(__c_now.time_since_epoch()).count(); + using __clock_tp_ns = time_point; + __ns_rep __now_count_ns = std::__safe_nanosecond_cast(__c_now.time_since_epoch()).count(); # else - using __clock_tp_ns = time_point; - __ns_rep __now_count_ns = std::__safe_nanosecond_cast(system_clock::now().time_since_epoch()).count(); + using __clock_tp_ns = time_point; + __ns_rep __now_count_ns = std::__safe_nanosecond_cast(system_clock::now().time_since_epoch()).count(); # endif - __ns_rep __d_ns_count = std::__safe_nanosecond_cast(__d).count(); + __ns_rep __d_ns_count = std::__safe_nanosecond_cast(__d).count(); - if (__now_count_ns > numeric_limits<__ns_rep>::max() - __d_ns_count) { - __do_timed_wait(__lk, __clock_tp_ns::max()); - } else { - __do_timed_wait(__lk, __clock_tp_ns(nanoseconds(__now_count_ns + __d_ns_count))); + if (__now_count_ns > numeric_limits<__ns_rep>::max() - __d_ns_count) { + __do_timed_wait(__lk, __clock_tp_ns::max()); + } else { + __do_timed_wait(__lk, __clock_tp_ns(nanoseconds(__now_count_ns + __d_ns_count))); + } + + return steady_clock::now() - __c_now < __d ? cv_status::no_timeout : cv_status::timeout; } - return steady_clock::now() - __c_now < __d ? 
cv_status::no_timeout : cv_status::timeout; -} + template + bool _LIBCPP_HIDE_FROM_ABI + wait_for(unique_lock& __lk, const chrono::duration<_Rep, _Period>& __d, _Predicate __pred); + + typedef __libcpp_condvar_t* native_handle_type; + _LIBCPP_HIDE_FROM_ABI native_handle_type native_handle() { return &__cv_; } + +private: + void + __do_timed_wait(unique_lock& __lk, chrono::time_point) _NOEXCEPT; +# if _LIBCPP_HAS_COND_CLOCKWAIT + _LIBCPP_HIDE_FROM_ABI void + __do_timed_wait(unique_lock& __lk, chrono::time_point) _NOEXCEPT; +# endif + template + _LIBCPP_HIDE_FROM_ABI void + __do_timed_wait(unique_lock& __lk, chrono::time_point<_Clock, chrono::nanoseconds>) _NOEXCEPT; +}; +#endif // _LIBCPP_HAS_THREADS + +#if _LIBCPP_HAS_THREADS template inline bool diff --git a/libcxx/include/__config b/libcxx/include/__config index d1cbc278862b0..084e1d4402e6c 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -383,7 +383,6 @@ typedef __char32_t char32_t; # endif # define _LIBCPP_HIDDEN -# define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS # define _LIBCPP_TEMPLATE_VIS # define _LIBCPP_TEMPLATE_DATA_VIS # define _LIBCPP_NAMESPACE_VISIBILITY @@ -407,13 +406,6 @@ typedef __char32_t char32_t; # define _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_VISIBILITY("default") # endif -# if !defined(_LIBCPP_DISABLE_VISIBILITY_ANNOTATIONS) -// The inline should be removed once PR32114 is resolved -# define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS inline _LIBCPP_HIDDEN -# else -# define _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS -# endif - // This is kept to avoid a huge library-wide diff in the first step. // TODO: Remove this in a follow-up patch # define _LIBCPP_TEMPLATE_VIS diff --git a/libcxx/include/__locale b/libcxx/include/__locale index 47323046fab38..f01ab4e719ca8 100644 --- a/libcxx/include/__locale +++ b/libcxx/include/__locale @@ -45,6 +45,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD class _LIBCPP_EXPORTED_FROM_ABI locale; +template +class collate; + template _LIBCPP_HIDE_FROM_ABI bool has_facet(const locale&) _NOEXCEPT; @@ -84,7 +87,12 @@ public: const locale& operator=(const locale&) _NOEXCEPT; template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS locale combine(const locale&) const; + _LIBCPP_HIDE_FROM_ABI locale combine(const locale& __other) const { + if (!std::has_facet<_Facet>(__other)) + __throw_runtime_error("locale::combine: locale missing facet"); + + return locale(*this, std::addressof(const_cast<_Facet&>(std::use_facet<_Facet>(__other)))); + } // locale operations: string name() const; @@ -93,8 +101,11 @@ public: _LIBCPP_HIDE_FROM_ABI bool operator!=(const locale& __y) const { return !(*this == __y); } # endif template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool - operator()(const basic_string<_CharT, _Traits, _Allocator>&, const basic_string<_CharT, _Traits, _Allocator>&) const; + _LIBCPP_HIDE_FROM_ABI bool operator()(const basic_string<_CharT, _Traits, _Allocator>& __x, + const basic_string<_CharT, _Traits, _Allocator>& __y) const { + return std::use_facet >(*this).compare( + __x.data(), __x.data() + __x.size(), __y.data(), __y.data() + __y.size()) < 0; + } // global locale objects: static locale global(const locale&); @@ -155,14 +166,6 @@ inline _LIBCPP_HIDE_FROM_ABI locale::locale(const locale& __other, _Facet* __f) __install_ctor(__other, __f, __f ? 
__f->id.__get() : 0); } -template -locale locale::combine(const locale& __other) const { - if (!std::has_facet<_Facet>(__other)) - std::__throw_runtime_error("locale::combine: locale missing facet"); - - return locale(*this, std::addressof(const_cast<_Facet&>(std::use_facet<_Facet>(__other)))); -} - template inline _LIBCPP_HIDE_FROM_ABI bool has_facet(const locale& __l) _NOEXCEPT { return __l.has_facet(_Facet::id); @@ -289,13 +292,6 @@ protected: }; # endif -template -bool locale::operator()(const basic_string<_CharT, _Traits, _Allocator>& __x, - const basic_string<_CharT, _Traits, _Allocator>& __y) const { - return std::use_facet >(*this).compare( - __x.data(), __x.data() + __x.size(), __y.data(), __y.data() + __y.size()) < 0; -} - // template class ctype class _LIBCPP_EXPORTED_FROM_ABI ctype_base { diff --git a/libcxx/include/__thread/thread.h b/libcxx/include/__thread/thread.h index 5e22013dec49f..5aae50710bbfc 100644 --- a/libcxx/include/__thread/thread.h +++ b/libcxx/include/__thread/thread.h @@ -152,47 +152,6 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, __thread_id __id) { } # endif // _LIBCPP_HAS_LOCALIZATION -class _LIBCPP_EXPORTED_FROM_ABI thread { - __libcpp_thread_t __t_; - - thread(const thread&); - thread& operator=(const thread&); - -public: - typedef __thread_id id; - typedef __libcpp_thread_t native_handle_type; - - _LIBCPP_HIDE_FROM_ABI thread() _NOEXCEPT : __t_(_LIBCPP_NULL_THREAD) {} -# ifndef _LIBCPP_CXX03_LANG - template , thread>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS explicit thread(_Fp&& __f, _Args&&... __args); -# else // _LIBCPP_CXX03_LANG - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS explicit thread(_Fp __f); -# endif - ~thread(); - - _LIBCPP_HIDE_FROM_ABI thread(thread&& __t) _NOEXCEPT : __t_(__t.__t_) { __t.__t_ = _LIBCPP_NULL_THREAD; } - - _LIBCPP_HIDE_FROM_ABI thread& operator=(thread&& __t) _NOEXCEPT { - if (!__libcpp_thread_isnull(&__t_)) - terminate(); - __t_ = __t.__t_; - __t.__t_ = _LIBCPP_NULL_THREAD; - return *this; - } - - _LIBCPP_HIDE_FROM_ABI void swap(thread& __t) _NOEXCEPT { std::swap(__t_, __t.__t_); } - - _LIBCPP_HIDE_FROM_ABI bool joinable() const _NOEXCEPT { return !__libcpp_thread_isnull(&__t_); } - void join(); - void detach(); - _LIBCPP_HIDE_FROM_ABI id get_id() const _NOEXCEPT { return __libcpp_thread_get_id(&__t_); } - _LIBCPP_HIDE_FROM_ABI native_handle_type native_handle() _NOEXCEPT { return __t_; } - - static unsigned hardware_concurrency() _NOEXCEPT; -}; - # ifndef _LIBCPP_CXX03_LANG template @@ -210,19 +169,6 @@ _LIBCPP_HIDE_FROM_ABI void* __thread_proxy(void* __vp) { return nullptr; } -template , thread>::value, int> > -thread::thread(_Fp&& __f, _Args&&... 
__args) { - typedef unique_ptr<__thread_struct> _TSPtr; - _TSPtr __tsp(new __thread_struct); - typedef tuple<_TSPtr, __decay_t<_Fp>, __decay_t<_Args>...> _Gp; - unique_ptr<_Gp> __p(new _Gp(std::move(__tsp), std::forward<_Fp>(__f), std::forward<_Args>(__args)...)); - int __ec = std::__libcpp_thread_create(&__t_, std::addressof(__thread_proxy<_Gp>), __p.get()); - if (__ec == 0) - __p.release(); - else - std::__throw_system_error(__ec, "thread constructor failed"); -} - # else // _LIBCPP_CXX03_LANG template @@ -243,20 +189,69 @@ _LIBCPP_HIDE_FROM_ABI void* __thread_proxy_cxx03(void* __vp) { return nullptr; } -template -thread::thread(_Fp __f) { - typedef __thread_invoke_pair<_Fp> _InvokePair; - typedef unique_ptr<_InvokePair> _PairPtr; - _PairPtr __pp(new _InvokePair(__f)); - int __ec = std::__libcpp_thread_create(&__t_, &__thread_proxy_cxx03<_InvokePair>, __pp.get()); - if (__ec == 0) - __pp.release(); - else - std::__throw_system_error(__ec, "thread constructor failed"); -} - # endif // _LIBCPP_CXX03_LANG +class _LIBCPP_EXPORTED_FROM_ABI thread { + __libcpp_thread_t __t_; + + thread(const thread&); + thread& operator=(const thread&); + +public: + typedef __thread_id id; + typedef __libcpp_thread_t native_handle_type; + + _LIBCPP_HIDE_FROM_ABI thread() _NOEXCEPT : __t_(_LIBCPP_NULL_THREAD) {} + +# ifndef _LIBCPP_CXX03_LANG + template , thread>::value, int> = 0> + _LIBCPP_HIDE_FROM_ABI explicit thread(_Fp&& __f, _Args&&... __args) { + typedef unique_ptr<__thread_struct> _TSPtr; + _TSPtr __tsp(new __thread_struct); + typedef tuple<_TSPtr, __decay_t<_Fp>, __decay_t<_Args>...> _Gp; + unique_ptr<_Gp> __p(new _Gp(std::move(__tsp), std::forward<_Fp>(__f), std::forward<_Args>(__args)...)); + int __ec = std::__libcpp_thread_create(&__t_, std::addressof(__thread_proxy<_Gp>), __p.get()); + if (__ec == 0) + __p.release(); + else + __throw_system_error(__ec, "thread constructor failed"); + } +# else // _LIBCPP_CXX03_LANG + template + _LIBCPP_HIDE_FROM_ABI explicit thread(_Fp __f) { + typedef __thread_invoke_pair<_Fp> _InvokePair; + typedef unique_ptr<_InvokePair> _PairPtr; + _PairPtr __pp(new _InvokePair(__f)); + int __ec = std::__libcpp_thread_create(&__t_, &__thread_proxy_cxx03<_InvokePair>, __pp.get()); + if (__ec == 0) + __pp.release(); + else + __throw_system_error(__ec, "thread constructor failed"); + } +# endif + ~thread(); + + _LIBCPP_HIDE_FROM_ABI thread(thread&& __t) _NOEXCEPT : __t_(__t.__t_) { __t.__t_ = _LIBCPP_NULL_THREAD; } + + _LIBCPP_HIDE_FROM_ABI thread& operator=(thread&& __t) _NOEXCEPT { + if (!__libcpp_thread_isnull(&__t_)) + terminate(); + __t_ = __t.__t_; + __t.__t_ = _LIBCPP_NULL_THREAD; + return *this; + } + + _LIBCPP_HIDE_FROM_ABI void swap(thread& __t) _NOEXCEPT { std::swap(__t_, __t.__t_); } + + _LIBCPP_HIDE_FROM_ABI bool joinable() const _NOEXCEPT { return !__libcpp_thread_isnull(&__t_); } + void join(); + void detach(); + _LIBCPP_HIDE_FROM_ABI id get_id() const _NOEXCEPT { return __libcpp_thread_get_id(&__t_); } + _LIBCPP_HIDE_FROM_ABI native_handle_type native_handle() _NOEXCEPT { return __t_; } + + static unsigned hardware_concurrency() _NOEXCEPT; +}; + inline _LIBCPP_HIDE_FROM_ABI void swap(thread& __x, thread& __y) _NOEXCEPT { __x.swap(__y); } #endif // _LIBCPP_HAS_THREADS diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable index 81699bf6adbf7..7f44990547f55 100644 --- a/libcxx/include/condition_variable +++ b/libcxx/include/condition_variable @@ -147,6 +147,21 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD +template +struct 
__unlock_guard { + _Lock& __lock_; + + _LIBCPP_HIDE_FROM_ABI __unlock_guard(_Lock& __lock) : __lock_(__lock) { __lock_.unlock(); } + + _LIBCPP_HIDE_FROM_ABI ~__unlock_guard() _NOEXCEPT // turns exception to std::terminate + { + __lock_.lock(); + } + + __unlock_guard(const __unlock_guard&) = delete; + __unlock_guard& operator=(const __unlock_guard&) = delete; +}; + class _LIBCPP_EXPORTED_FROM_ABI condition_variable_any { condition_variable __cv_; shared_ptr __mut_; @@ -158,13 +173,25 @@ public: _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT; template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS void wait(_Lock& __lock); + _LIBCPP_HIDE_FROM_ABI void wait(_Lock& __lock) { + shared_ptr __mut = __mut_; + unique_lock __lk(*__mut); + __unlock_guard<_Lock> __unlock(__lock); + lock_guard > __lx(__lk, adopt_lock_t()); + __cv_.wait(__lk); + } // __mut_.unlock(), __lock.lock() + template _LIBCPP_HIDE_FROM_ABI void wait(_Lock& __lock, _Predicate __pred); template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS cv_status - wait_until(_Lock& __lock, const chrono::time_point<_Clock, _Duration>& __t); + _LIBCPP_HIDE_FROM_ABI cv_status wait_until(_Lock& __lock, const chrono::time_point<_Clock, _Duration>& __t) { + shared_ptr __mut = __mut_; + unique_lock __lk(*__mut); + __unlock_guard<_Lock> __unlock(__lock); + lock_guard > __lx(__lk, adopt_lock_t()); + return __cv_.wait_until(__lk, __t); + } // __mut_.unlock(), __lock.lock() template bool _LIBCPP_HIDE_FROM_ABI @@ -204,45 +231,12 @@ inline void condition_variable_any::notify_all() _NOEXCEPT { __cv_.notify_all(); } -template -struct __unlock_guard { - _Lock& __lock_; - - _LIBCPP_HIDE_FROM_ABI __unlock_guard(_Lock& __lock) : __lock_(__lock) { __lock_.unlock(); } - - _LIBCPP_HIDE_FROM_ABI ~__unlock_guard() _NOEXCEPT // turns exception to std::terminate - { - __lock_.lock(); - } - - __unlock_guard(const __unlock_guard&) = delete; - __unlock_guard& operator=(const __unlock_guard&) = delete; -}; - -template -void condition_variable_any::wait(_Lock& __lock) { - shared_ptr __mut = __mut_; - unique_lock __lk(*__mut); - __unlock_guard<_Lock> __unlock(__lock); - lock_guard > __lx(__lk, adopt_lock_t()); - __cv_.wait(__lk); -} // __mut_.unlock(), __lock.lock() - template inline void condition_variable_any::wait(_Lock& __lock, _Predicate __pred) { while (!__pred()) wait(__lock); } -template -cv_status condition_variable_any::wait_until(_Lock& __lock, const chrono::time_point<_Clock, _Duration>& __t) { - shared_ptr __mut = __mut_; - unique_lock __lk(*__mut); - __unlock_guard<_Lock> __unlock(__lock); - lock_guard > __lx(__lk, adopt_lock_t()); - return __cv_.wait_until(__lk, __t); -} // __mut_.unlock(), __lock.lock() - template inline bool condition_variable_any::wait_until(_Lock& __lock, const chrono::time_point<_Clock, _Duration>& __t, _Predicate __pred) { diff --git a/libcxx/include/future b/libcxx/include/future index a08687485bd99..ea1eb65e332f7 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -563,24 +563,20 @@ public: template future_status _LIBCPP_HIDE_FROM_ABI wait_for(const chrono::duration<_Rep, _Period>& __rel_time) const; template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS future_status - wait_until(const chrono::time_point<_Clock, _Duration>& __abs_time) const; + _LIBCPP_HIDE_FROM_ABI future_status wait_until(const chrono::time_point<_Clock, _Duration>& __abs_time) const { + unique_lock __lk(__mut_); + if (__state_ & deferred) + return future_status::deferred; + while (!(__state_ & ready) && _Clock::now() < 
__abs_time) + __cv_.wait_until(__lk, __abs_time); + if (__state_ & ready) + return future_status::ready; + return future_status::timeout; + } virtual void __execute(); }; -template -future_status __assoc_sub_state::wait_until(const chrono::time_point<_Clock, _Duration>& __abs_time) const { - unique_lock __lk(__mut_); - if (__state_ & deferred) - return future_status::deferred; - while (!(__state_ & ready) && _Clock::now() < __abs_time) - __cv_.wait_until(__lk, __abs_time); - if (__state_ & ready) - return future_status::ready; - return future_status::timeout; -} - template inline future_status __assoc_sub_state::wait_for(const chrono::duration<_Rep, _Period>& __rel_time) const { return wait_until(chrono::steady_clock::now() + __rel_time); @@ -1347,8 +1343,17 @@ class _LIBCPP_EXPORTED_FROM_ABI promise { public: promise(); - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS promise(allocator_arg_t, const _Allocator& __a); + template + _LIBCPP_HIDE_FROM_ABI promise(allocator_arg_t, const _Alloc& __a0) { + typedef __assoc_sub_state_alloc<_Alloc> _State; + typedef typename __allocator_traits_rebind<_Alloc, _State>::type _A2; + typedef __allocator_destructor<_A2> _D2; + _A2 __a(__a0); + unique_ptr<_State, _D2> __hold(__a.allocate(1), _D2(__a, 1)); + ::new ((void*)std::addressof(*__hold.get())) _State(__a0); + __state_ = std::addressof(*__hold.release()); + } + _LIBCPP_HIDE_FROM_ABI promise(promise&& __rhs) _NOEXCEPT : __state_(__rhs.__state_) { __rhs.__state_ = nullptr; } promise(const promise& __rhs) = delete; ~promise(); @@ -1374,17 +1379,6 @@ public: void set_exception_at_thread_exit(exception_ptr __p); }; -template -promise::promise(allocator_arg_t, const _Alloc& __a0) { - typedef __assoc_sub_state_alloc<_Alloc> _State; - typedef typename __allocator_traits_rebind<_Alloc, _State>::type _A2; - typedef __allocator_destructor<_A2> _D2; - _A2 __a(__a0); - unique_ptr<_State, _D2> __hold(__a.allocate(1), _D2(__a, 1)); - ::new ((void*)std::addressof(*__hold.get())) _State(__a0); - __state_ = std::addressof(*__hold.release()); -} - template inline _LIBCPP_HIDE_FROM_ABI void swap(promise<_Rp>& __x, promise<_Rp>& __y) _NOEXCEPT { __x.swap(__y); diff --git a/libcxx/include/locale b/libcxx/include/locale index fa15302223202..3afdef856ede7 100644 --- a/libcxx/include/locale +++ b/libcxx/include/locale @@ -566,6 +566,104 @@ extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_get; extern template struct _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS __num_get; # endif +template +_LIBCPP_HIDE_FROM_ABI _Tp __do_strtod(const char* __a, char** __p2); + +template <> +inline _LIBCPP_HIDE_FROM_ABI float __do_strtod(const char* __a, char** __p2) { + return __locale::__strtof(__a, __p2, _LIBCPP_GET_C_LOCALE); +} + +template <> +inline _LIBCPP_HIDE_FROM_ABI double __do_strtod(const char* __a, char** __p2) { + return __locale::__strtod(__a, __p2, _LIBCPP_GET_C_LOCALE); +} + +template <> +inline _LIBCPP_HIDE_FROM_ABI long double __do_strtod(const char* __a, char** __p2) { + return __locale::__strtold(__a, __p2, _LIBCPP_GET_C_LOCALE); +} + +template +_LIBCPP_HIDE_FROM_ABI _Tp __num_get_float(const char* __a, const char* __a_end, ios_base::iostate& __err) { + if (__a != __a_end) { + __libcpp_remove_reference_t __save_errno = errno; + errno = 0; + char* __p2; + _Tp __ld = std::__do_strtod<_Tp>(__a, &__p2); + __libcpp_remove_reference_t __current_errno = errno; + if (__current_errno == 0) + errno = __save_errno; + if (__p2 != __a_end) { + __err = ios_base::failbit; + return 0; + } else if (__current_errno == 
ERANGE) + __err = ios_base::failbit; + return __ld; + } + __err = ios_base::failbit; + return 0; +} + +template +_LIBCPP_HIDE_FROM_ABI _Tp +__num_get_signed_integral(const char* __a, const char* __a_end, ios_base::iostate& __err, int __base) { + if (__a != __a_end) { + __libcpp_remove_reference_t __save_errno = errno; + errno = 0; + char* __p2; + long long __ll = __locale::__strtoll(__a, &__p2, __base, _LIBCPP_GET_C_LOCALE); + __libcpp_remove_reference_t __current_errno = errno; + if (__current_errno == 0) + errno = __save_errno; + if (__p2 != __a_end) { + __err = ios_base::failbit; + return 0; + } else if (__current_errno == ERANGE || __ll < numeric_limits<_Tp>::min() || numeric_limits<_Tp>::max() < __ll) { + __err = ios_base::failbit; + if (__ll > 0) + return numeric_limits<_Tp>::max(); + else + return numeric_limits<_Tp>::min(); + } + return static_cast<_Tp>(__ll); + } + __err = ios_base::failbit; + return 0; +} + +template +_LIBCPP_HIDE_FROM_ABI _Tp +__num_get_unsigned_integral(const char* __a, const char* __a_end, ios_base::iostate& __err, int __base) { + if (__a != __a_end) { + const bool __negate = *__a == '-'; + if (__negate && ++__a == __a_end) { + __err = ios_base::failbit; + return 0; + } + __libcpp_remove_reference_t __save_errno = errno; + errno = 0; + char* __p2; + unsigned long long __ll = __locale::__strtoull(__a, &__p2, __base, _LIBCPP_GET_C_LOCALE); + __libcpp_remove_reference_t __current_errno = errno; + if (__current_errno == 0) + errno = __save_errno; + if (__p2 != __a_end) { + __err = ios_base::failbit; + return 0; + } else if (__current_errno == ERANGE || numeric_limits<_Tp>::max() < __ll) { + __err = ios_base::failbit; + return numeric_limits<_Tp>::max(); + } + _Tp __res = static_cast<_Tp>(__ll); + if (__negate) + __res = -__res; + return __res; + } + __err = ios_base::failbit; + return 0; +} + template > class _LIBCPP_TEMPLATE_VIS num_get : public locale::facet, private __num_get<_CharT> { public: @@ -635,16 +733,175 @@ protected: _LIBCPP_HIDE_FROM_ABI_VIRTUAL ~num_get() override {} template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS iter_type - __do_get_floating_point(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Fp& __v) const; + _LIBCPP_HIDE_FROM_ABI iter_type + __do_get_floating_point(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Fp& __v) const { + // Stage 1, nothing to do + // Stage 2 + char_type __atoms[__num_get_base::__fp_chr_cnt]; + char_type __decimal_point; + char_type __thousands_sep; + string __grouping = this->__stage2_float_prep(__iob, __atoms, __decimal_point, __thousands_sep); + string __buf; + __buf.resize(__buf.capacity()); + char* __a = &__buf[0]; + char* __a_end = __a; + unsigned __g[__num_get_base::__num_get_buf_sz]; + unsigned* __g_end = __g; + unsigned __dc = 0; + bool __in_units = true; + char __exp = 'E'; + bool __is_leading_parsed = false; + for (; __b != __e; ++__b) { + if (__a_end == __a + __buf.size()) { + size_t __tmp = __buf.size(); + __buf.resize(2 * __buf.size()); + __buf.resize(__buf.capacity()); + __a = &__buf[0]; + __a_end = __a + __tmp; + } + if (this->__stage2_float_loop( + *__b, + __in_units, + __exp, + __a, + __a_end, + __decimal_point, + __thousands_sep, + __grouping, + __g, + __g_end, + __dc, + __atoms)) + break; + + // the leading character excluding the sign must be a decimal digit + if (!__is_leading_parsed) { + if (__a_end - __a >= 1 && __a[0] != '-' && __a[0] != '+') { + if (('0' <= __a[0] && __a[0] <= '9') || __a[0] == '.') + __is_leading_parsed = 
true; + else + break; + } else if (__a_end - __a >= 2 && (__a[0] == '-' || __a[0] == '+')) { + if (('0' <= __a[1] && __a[1] <= '9') || __a[1] == '.') + __is_leading_parsed = true; + else + break; + } + } + } + if (__grouping.size() != 0 && __in_units && __g_end - __g < __num_get_base::__num_get_buf_sz) + *__g_end++ = __dc; + // Stage 3 + __v = std::__num_get_float<_Fp>(__a, __a_end, __err); + // Digit grouping checked + __check_grouping(__grouping, __g, __g_end, __err); + // EOF checked + if (__b == __e) + __err |= ios_base::eofbit; + return __b; + } template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS iter_type - __do_get_signed(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Signed& __v) const; + _LIBCPP_HIDE_FROM_ABI iter_type + __do_get_signed(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Signed& __v) const { + // Stage 1 + int __base = this->__get_base(__iob); + // Stage 2 + char_type __thousands_sep; + const int __atoms_size = __num_get_base::__int_chr_cnt; + char_type __atoms1[__atoms_size]; + const char_type* __atoms = this->__do_widen(__iob, __atoms1); + string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); + string __buf; + __buf.resize(__buf.capacity()); + char* __a = &__buf[0]; + char* __a_end = __a; + unsigned __g[__num_get_base::__num_get_buf_sz]; + unsigned* __g_end = __g; + unsigned __dc = 0; + for (; __b != __e; ++__b) { + if (__a_end == __a + __buf.size()) { + size_t __tmp = __buf.size(); + __buf.resize(2 * __buf.size()); + __buf.resize(__buf.capacity()); + __a = &__buf[0]; + __a_end = __a + __tmp; + } + if (this->__stage2_int_loop( + *__b, + __base, + __a, + __a_end, + __dc, + __thousands_sep, + __grouping, + __g, + __g_end, + const_cast(__atoms))) + break; + } + if (__grouping.size() != 0 && __g_end - __g < __num_get_base::__num_get_buf_sz) + *__g_end++ = __dc; + // Stage 3 + __v = std::__num_get_signed_integral<_Signed>(__a, __a_end, __err, __base); + // Digit grouping checked + __check_grouping(__grouping, __g, __g_end, __err); + // EOF checked + if (__b == __e) + __err |= ios_base::eofbit; + return __b; + } template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS iter_type - __do_get_unsigned(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Unsigned& __v) const; + _LIBCPP_HIDE_FROM_ABI iter_type + __do_get_unsigned(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Unsigned& __v) const { + // Stage 1 + int __base = this->__get_base(__iob); + // Stage 2 + char_type __thousands_sep; + const int __atoms_size = __num_get_base::__int_chr_cnt; + char_type __atoms1[__atoms_size]; + const char_type* __atoms = this->__do_widen(__iob, __atoms1); + string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); + string __buf; + __buf.resize(__buf.capacity()); + char* __a = &__buf[0]; + char* __a_end = __a; + unsigned __g[__num_get_base::__num_get_buf_sz]; + unsigned* __g_end = __g; + unsigned __dc = 0; + for (; __b != __e; ++__b) { + if (__a_end == __a + __buf.size()) { + size_t __tmp = __buf.size(); + __buf.resize(2 * __buf.size()); + __buf.resize(__buf.capacity()); + __a = &__buf[0]; + __a_end = __a + __tmp; + } + if (this->__stage2_int_loop( + *__b, + __base, + __a, + __a_end, + __dc, + __thousands_sep, + __grouping, + __g, + __g_end, + const_cast(__atoms))) + break; + } + if (__grouping.size() != 0 && __g_end - __g < __num_get_base::__num_get_buf_sz) + *__g_end++ = __dc; + // Stage 3 + __v = std::__num_get_unsigned_integral<_Unsigned>(__a, 
__a_end, __err, __base); + // Digit grouping checked + __check_grouping(__grouping, __g, __g_end, __err); + // EOF checked + if (__b == __e) + __err |= ios_base::eofbit; + return __b; + } virtual iter_type do_get(iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, bool& __v) const; @@ -696,104 +953,6 @@ protected: template locale::id num_get<_CharT, _InputIterator>::id; -template -_LIBCPP_HIDE_FROM_ABI _Tp -__num_get_signed_integral(const char* __a, const char* __a_end, ios_base::iostate& __err, int __base) { - if (__a != __a_end) { - __libcpp_remove_reference_t __save_errno = errno; - errno = 0; - char* __p2; - long long __ll = __locale::__strtoll(__a, &__p2, __base, _LIBCPP_GET_C_LOCALE); - __libcpp_remove_reference_t __current_errno = errno; - if (__current_errno == 0) - errno = __save_errno; - if (__p2 != __a_end) { - __err = ios_base::failbit; - return 0; - } else if (__current_errno == ERANGE || __ll < numeric_limits<_Tp>::min() || numeric_limits<_Tp>::max() < __ll) { - __err = ios_base::failbit; - if (__ll > 0) - return numeric_limits<_Tp>::max(); - else - return numeric_limits<_Tp>::min(); - } - return static_cast<_Tp>(__ll); - } - __err = ios_base::failbit; - return 0; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__num_get_unsigned_integral(const char* __a, const char* __a_end, ios_base::iostate& __err, int __base) { - if (__a != __a_end) { - const bool __negate = *__a == '-'; - if (__negate && ++__a == __a_end) { - __err = ios_base::failbit; - return 0; - } - __libcpp_remove_reference_t __save_errno = errno; - errno = 0; - char* __p2; - unsigned long long __ll = __locale::__strtoull(__a, &__p2, __base, _LIBCPP_GET_C_LOCALE); - __libcpp_remove_reference_t __current_errno = errno; - if (__current_errno == 0) - errno = __save_errno; - if (__p2 != __a_end) { - __err = ios_base::failbit; - return 0; - } else if (__current_errno == ERANGE || numeric_limits<_Tp>::max() < __ll) { - __err = ios_base::failbit; - return numeric_limits<_Tp>::max(); - } - _Tp __res = static_cast<_Tp>(__ll); - if (__negate) - __res = -__res; - return __res; - } - __err = ios_base::failbit; - return 0; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __do_strtod(const char* __a, char** __p2); - -template <> -inline _LIBCPP_HIDE_FROM_ABI float __do_strtod(const char* __a, char** __p2) { - return __locale::__strtof(__a, __p2, _LIBCPP_GET_C_LOCALE); -} - -template <> -inline _LIBCPP_HIDE_FROM_ABI double __do_strtod(const char* __a, char** __p2) { - return __locale::__strtod(__a, __p2, _LIBCPP_GET_C_LOCALE); -} - -template <> -inline _LIBCPP_HIDE_FROM_ABI long double __do_strtod(const char* __a, char** __p2) { - return __locale::__strtold(__a, __p2, _LIBCPP_GET_C_LOCALE); -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __num_get_float(const char* __a, const char* __a_end, ios_base::iostate& __err) { - if (__a != __a_end) { - __libcpp_remove_reference_t __save_errno = errno; - errno = 0; - char* __p2; - _Tp __ld = std::__do_strtod<_Tp>(__a, &__p2); - __libcpp_remove_reference_t __current_errno = errno; - if (__current_errno == 0) - errno = __save_errno; - if (__p2 != __a_end) { - __err = ios_base::failbit; - return 0; - } else if (__current_errno == ERANGE) - __err = ios_base::failbit; - return __ld; - } - __err = ios_base::failbit; - return 0; -} - template _InputIterator num_get<_CharT, _InputIterator>::do_get( iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, bool& __v) const { @@ -823,186 +982,6 @@ _InputIterator num_get<_CharT, _InputIterator>::do_get( return __b; } -// signed - 
-template -template -_InputIterator num_get<_CharT, _InputIterator>::__do_get_signed( - iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Signed& __v) const { - // Stage 1 - int __base = this->__get_base(__iob); - // Stage 2 - char_type __thousands_sep; - const int __atoms_size = __num_get_base::__int_chr_cnt; - char_type __atoms1[__atoms_size]; - const char_type* __atoms = this->__do_widen(__iob, __atoms1); - string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); - string __buf; - __buf.resize(__buf.capacity()); - char* __a = &__buf[0]; - char* __a_end = __a; - unsigned __g[__num_get_base::__num_get_buf_sz]; - unsigned* __g_end = __g; - unsigned __dc = 0; - for (; __b != __e; ++__b) { - if (__a_end == __a + __buf.size()) { - size_t __tmp = __buf.size(); - __buf.resize(2 * __buf.size()); - __buf.resize(__buf.capacity()); - __a = &__buf[0]; - __a_end = __a + __tmp; - } - if (this->__stage2_int_loop( - *__b, - __base, - __a, - __a_end, - __dc, - __thousands_sep, - __grouping, - __g, - __g_end, - const_cast(__atoms))) - break; - } - if (__grouping.size() != 0 && __g_end - __g < __num_get_base::__num_get_buf_sz) - *__g_end++ = __dc; - // Stage 3 - __v = std::__num_get_signed_integral<_Signed>(__a, __a_end, __err, __base); - // Digit grouping checked - __check_grouping(__grouping, __g, __g_end, __err); - // EOF checked - if (__b == __e) - __err |= ios_base::eofbit; - return __b; -} - -// unsigned - -template -template -_InputIterator num_get<_CharT, _InputIterator>::__do_get_unsigned( - iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Unsigned& __v) const { - // Stage 1 - int __base = this->__get_base(__iob); - // Stage 2 - char_type __thousands_sep; - const int __atoms_size = __num_get_base::__int_chr_cnt; - char_type __atoms1[__atoms_size]; - const char_type* __atoms = this->__do_widen(__iob, __atoms1); - string __grouping = this->__stage2_int_prep(__iob, __thousands_sep); - string __buf; - __buf.resize(__buf.capacity()); - char* __a = &__buf[0]; - char* __a_end = __a; - unsigned __g[__num_get_base::__num_get_buf_sz]; - unsigned* __g_end = __g; - unsigned __dc = 0; - for (; __b != __e; ++__b) { - if (__a_end == __a + __buf.size()) { - size_t __tmp = __buf.size(); - __buf.resize(2 * __buf.size()); - __buf.resize(__buf.capacity()); - __a = &__buf[0]; - __a_end = __a + __tmp; - } - if (this->__stage2_int_loop( - *__b, - __base, - __a, - __a_end, - __dc, - __thousands_sep, - __grouping, - __g, - __g_end, - const_cast(__atoms))) - break; - } - if (__grouping.size() != 0 && __g_end - __g < __num_get_base::__num_get_buf_sz) - *__g_end++ = __dc; - // Stage 3 - __v = std::__num_get_unsigned_integral<_Unsigned>(__a, __a_end, __err, __base); - // Digit grouping checked - __check_grouping(__grouping, __g, __g_end, __err); - // EOF checked - if (__b == __e) - __err |= ios_base::eofbit; - return __b; -} - -// floating point - -template -template -_InputIterator num_get<_CharT, _InputIterator>::__do_get_floating_point( - iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, _Fp& __v) const { - // Stage 1, nothing to do - // Stage 2 - char_type __atoms[__num_get_base::__fp_chr_cnt]; - char_type __decimal_point; - char_type __thousands_sep; - string __grouping = this->__stage2_float_prep(__iob, __atoms, __decimal_point, __thousands_sep); - string __buf; - __buf.resize(__buf.capacity()); - char* __a = &__buf[0]; - char* __a_end = __a; - unsigned __g[__num_get_base::__num_get_buf_sz]; - unsigned* __g_end = __g; - unsigned __dc = 
0; - bool __in_units = true; - char __exp = 'E'; - bool __is_leading_parsed = false; - for (; __b != __e; ++__b) { - if (__a_end == __a + __buf.size()) { - size_t __tmp = __buf.size(); - __buf.resize(2 * __buf.size()); - __buf.resize(__buf.capacity()); - __a = &__buf[0]; - __a_end = __a + __tmp; - } - if (this->__stage2_float_loop( - *__b, - __in_units, - __exp, - __a, - __a_end, - __decimal_point, - __thousands_sep, - __grouping, - __g, - __g_end, - __dc, - __atoms)) - break; - - // the leading character excluding the sign must be a decimal digit - if (!__is_leading_parsed) { - if (__a_end - __a >= 1 && __a[0] != '-' && __a[0] != '+') { - if (('0' <= __a[0] && __a[0] <= '9') || __a[0] == '.') - __is_leading_parsed = true; - else - break; - } else if (__a_end - __a >= 2 && (__a[0] == '-' || __a[0] == '+')) { - if (('0' <= __a[1] && __a[1] <= '9') || __a[1] == '.') - __is_leading_parsed = true; - else - break; - } - } - } - if (__grouping.size() != 0 && __in_units && __g_end - __g < __num_get_base::__num_get_buf_sz) - *__g_end++ = __dc; - // Stage 3 - __v = std::__num_get_float<_Fp>(__a, __a_end, __err); - // Digit grouping checked - __check_grouping(__grouping, __g, __g_end, __err); - // EOF checked - if (__b == __e) - __err |= ios_base::eofbit; - return __b; -} - template _InputIterator num_get<_CharT, _InputIterator>::do_get( iter_type __b, iter_type __e, ios_base& __iob, ios_base::iostate& __err, void*& __v) const { diff --git a/libcxx/include/mutex b/libcxx/include/mutex index 9b128e8710aae..b072bc38f3486 100644 --- a/libcxx/include/mutex +++ b/libcxx/include/mutex @@ -256,26 +256,24 @@ public: _LIBCPP_HIDE_FROM_ABI bool try_lock_for(const chrono::duration<_Rep, _Period>& __d) { return try_lock_until(chrono::steady_clock::now() + __d); } + template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool - try_lock_until(const chrono::time_point<_Clock, _Duration>& __t); + _LIBCPP_HIDE_FROM_ABI bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) { + using namespace chrono; + unique_lock __lk(__m_); + bool __no_timeout = _Clock::now() < __t; + while (__no_timeout && __locked_) + __no_timeout = __cv_.wait_until(__lk, __t) == cv_status::no_timeout; + if (!__locked_) { + __locked_ = true; + return true; + } + return false; + } + void unlock() _NOEXCEPT; }; -template -bool timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) { - using namespace chrono; - unique_lock __lk(__m_); - bool __no_timeout = _Clock::now() < __t; - while (__no_timeout && __locked_) - __no_timeout = __cv_.wait_until(__lk, __t) == cv_status::no_timeout; - if (!__locked_) { - __locked_ = true; - return true; - } - return false; -} - class _LIBCPP_EXPORTED_FROM_ABI recursive_timed_mutex { mutex __m_; condition_variable __cv_; @@ -295,34 +293,32 @@ public: _LIBCPP_HIDE_FROM_ABI bool try_lock_for(const chrono::duration<_Rep, _Period>& __d) { return try_lock_until(chrono::steady_clock::now() + __d); } + template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool - try_lock_until(const chrono::time_point<_Clock, _Duration>& __t); + _LIBCPP_HIDE_FROM_ABI bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) { + using namespace chrono; + __thread_id __id = this_thread::get_id(); + unique_lock __lk(__m_); + if (__id == __id_) { + if (__count_ == numeric_limits::max()) + return false; + ++__count_; + return true; + } + bool __no_timeout = _Clock::now() < __t; + while (__no_timeout && __count_ != 0) + __no_timeout = __cv_.wait_until(__lk, __t) == 
cv_status::no_timeout; + if (__count_ == 0) { + __count_ = 1; + __id_ = __id; + return true; + } + return false; + } + void unlock() _NOEXCEPT; }; -template -bool recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t) { - using namespace chrono; - __thread_id __id = this_thread::get_id(); - unique_lock __lk(__m_); - if (__id == __id_) { - if (__count_ == numeric_limits::max()) - return false; - ++__count_; - return true; - } - bool __no_timeout = _Clock::now() < __t; - while (__no_timeout && __count_ != 0) - __no_timeout = __cv_.wait_until(__lk, __t) == cv_status::no_timeout; - if (__count_ == 0) { - __count_ = 1; - __id_ = __id; - return true; - } - return false; -} - template _LIBCPP_HIDE_FROM_ABI int try_lock(_L0& __l0, _L1& __l1) { unique_lock<_L0> __u0(__l0, try_to_lock_t()); diff --git a/libcxx/include/shared_mutex b/libcxx/include/shared_mutex index b1e2a5d434400..7821f62f53032 100644 --- a/libcxx/include/shared_mutex +++ b/libcxx/include/shared_mutex @@ -240,10 +240,36 @@ public: _LIBCPP_THREAD_SAFETY_ANNOTATION(__try_acquire_capability__(true)) { return try_lock_until(chrono::steady_clock::now() + __rel_time); } + template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool - try_lock_until(const chrono::time_point<_Clock, _Duration>& __abs_time) - _LIBCPP_THREAD_SAFETY_ANNOTATION(__try_acquire_capability__(true)); + _LIBCPP_HIDE_FROM_ABI bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __abs_time) + _LIBCPP_THREAD_SAFETY_ANNOTATION(__try_acquire_capability__(true)) { + unique_lock __lk(__base_.__mut_); + if (__base_.__state_ & __base_.__write_entered_) { + while (true) { + cv_status __status = __base_.__gate1_.wait_until(__lk, __abs_time); + if ((__base_.__state_ & __base_.__write_entered_) == 0) + break; + if (__status == cv_status::timeout) + return false; + } + } + __base_.__state_ |= __base_.__write_entered_; + if (__base_.__state_ & __base_.__n_readers_) { + while (true) { + cv_status __status = __base_.__gate2_.wait_until(__lk, __abs_time); + if ((__base_.__state_ & __base_.__n_readers_) == 0) + break; + if (__status == cv_status::timeout) { + __base_.__state_ &= ~__base_.__write_entered_; + __base_.__gate1_.notify_all(); + return false; + } + } + } + return true; + } + void unlock() _LIBCPP_THREAD_SAFETY_ANNOTATION(__release_capability__()); // Shared ownership @@ -254,60 +280,30 @@ public: _LIBCPP_THREAD_SAFETY_ANNOTATION(__try_acquire_shared_capability__(true)) { return try_lock_shared_until(chrono::steady_clock::now() + __rel_time); } - template - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS bool - try_lock_shared_until(const chrono::time_point<_Clock, _Duration>& __abs_time) - _LIBCPP_THREAD_SAFETY_ANNOTATION(__try_acquire_shared_capability__(true)); - void unlock_shared() _LIBCPP_THREAD_SAFETY_ANNOTATION(__release_shared_capability__()); -}; -template -bool shared_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __abs_time) { - unique_lock __lk(__base_.__mut_); - if (__base_.__state_ & __base_.__write_entered_) { - while (true) { - cv_status __status = __base_.__gate1_.wait_until(__lk, __abs_time); - if ((__base_.__state_ & __base_.__write_entered_) == 0) - break; - if (__status == cv_status::timeout) - return false; - } - } - __base_.__state_ |= __base_.__write_entered_; - if (__base_.__state_ & __base_.__n_readers_) { - while (true) { - cv_status __status = __base_.__gate2_.wait_until(__lk, __abs_time); - if ((__base_.__state_ & __base_.__n_readers_) == 0) - break; - if (__status == 
cv_status::timeout) { - __base_.__state_ &= ~__base_.__write_entered_; - __base_.__gate1_.notify_all(); - return false; + template + _LIBCPP_HIDE_FROM_ABI bool try_lock_shared_until(const chrono::time_point<_Clock, _Duration>& __abs_time) + _LIBCPP_THREAD_SAFETY_ANNOTATION(__try_acquire_shared_capability__(true)) { + unique_lock __lk(__base_.__mut_); + if ((__base_.__state_ & __base_.__write_entered_) || + (__base_.__state_ & __base_.__n_readers_) == __base_.__n_readers_) { + while (true) { + cv_status __status = __base_.__gate1_.wait_until(__lk, __abs_time); + if ((__base_.__state_ & __base_.__write_entered_) == 0 && + (__base_.__state_ & __base_.__n_readers_) < __base_.__n_readers_) + break; + if (__status == cv_status::timeout) + return false; } } + unsigned __num_readers = (__base_.__state_ & __base_.__n_readers_) + 1; + __base_.__state_ &= ~__base_.__n_readers_; + __base_.__state_ |= __num_readers; + return true; } - return true; -} -template -bool shared_timed_mutex::try_lock_shared_until(const chrono::time_point<_Clock, _Duration>& __abs_time) { - unique_lock __lk(__base_.__mut_); - if ((__base_.__state_ & __base_.__write_entered_) || - (__base_.__state_ & __base_.__n_readers_) == __base_.__n_readers_) { - while (true) { - cv_status __status = __base_.__gate1_.wait_until(__lk, __abs_time); - if ((__base_.__state_ & __base_.__write_entered_) == 0 && - (__base_.__state_ & __base_.__n_readers_) < __base_.__n_readers_) - break; - if (__status == cv_status::timeout) - return false; - } - } - unsigned __num_readers = (__base_.__state_ & __base_.__n_readers_) + 1; - __base_.__state_ &= ~__base_.__n_readers_; - __base_.__state_ |= __num_readers; - return true; -} + void unlock_shared() _LIBCPP_THREAD_SAFETY_ANNOTATION(__release_shared_capability__()); +}; template class shared_lock { diff --git a/libcxx/include/string b/libcxx/include/string index fa87dc2fddb59..3d182cbc26892 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1117,7 +1117,7 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const _Tp& __t, size_type __pos, size_type __n, const allocator_type& __a = allocator_type()) : __alloc_(__a) { __self_view __sv0 = __t; @@ -1129,8 +1129,7 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS - _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t) { __self_view __sv = __t; __init(__sv.data(), __sv.size()); } @@ -1139,8 +1138,7 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS - _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t, const allocator_type& __a) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t, const allocator_type& __a) : __alloc_(__a) { __self_view __sv = __t; __init(__sv.data(), __sv.size()); @@ -1336,8 +1334,7 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string >::value, int> = 0> - 
_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - operator+=(const _Tp& __t) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator+=(const _Tp& __t) { __self_view __sv = __t; return append(__sv); } @@ -1366,8 +1363,7 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - append(const _Tp& __t) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const _Tp& __t) { __self_view __sv = __t; return append(__sv.data(), __sv.size()); } @@ -1378,10 +1374,14 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 - - basic_string& - append(const _Tp& __t, size_type __pos, size_type __n = npos); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + append(const _Tp& __t, size_type __pos, size_type __n = npos) { + __self_view __sv = __t; + size_type __sz = __sv.size(); + if (__pos > __sz) + __throw_out_of_range(); + return append(__sv.data() + __pos, std::min(__n, __sz - __pos)); + } _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* __s, size_type __n); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s); @@ -1390,7 +1390,7 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __append_default_init(size_type __n); template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(_InputIterator __first, _InputIterator __last) { const basic_string __temp(__first, __last, __alloc_); append(__temp.data(), __temp.size()); @@ -1398,8 +1398,26 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - append(_ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + append(_ForwardIterator __first, _ForwardIterator __last) { + size_type __sz = size(); + size_type __cap = capacity(); + size_type __n = static_cast(std::distance(__first, __last)); + if (__n) { + if (__string_is_trivial_iterator<_ForwardIterator>::value && !__addr_in_range(*__first)) { + if (__cap - __sz < __n) + __grow_by_without_replace(__cap, __sz + __n - __cap, __sz, __sz, 0); + __annotate_increase(__n); + auto __end = __copy_non_overlapping_range(__first, __last, std::__to_address(__get_pointer() + __sz)); + traits_type::assign(*__end, value_type()); + __set_size(__sz + __n); + } else { + const basic_string __temp(__first, __last, __alloc_); + append(__temp.data(), __temp.size()); + } + } + return *this; + } # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_CharT> _Range> @@ -1439,8 +1457,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - assign(const _Tp& __t) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const _Tp& __t) { __self_view __sv = __t; return assign(__sv.data(), __sv.size()); } @@ -1484,19 +1501,38 @@ public: 
__enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - assign(const _Tp& __t, size_type __pos, size_type __n = npos); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + assign(const _Tp& __t, size_type __pos, size_type __n = npos) { + __self_view __sv = __t; + size_type __sz = __sv.size(); + if (__pos > __sz) + __throw_out_of_range(); + return assign(__sv.data() + __pos, std::min(__n, __sz - __pos)); + } _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const value_type* __s, size_type __n); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const value_type* __s); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(size_type __n, value_type __c); + template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - assign(_InputIterator __first, _InputIterator __last); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + assign(_InputIterator __first, _InputIterator __last) { + __assign_with_sentinel(__first, __last); + return *this; + } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - assign(_ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + assign(_ForwardIterator __first, _ForwardIterator __last) { + if (__string_is_trivial_iterator<_ForwardIterator>::value) { + size_type __n = static_cast(std::distance(__first, __last)); + __assign_trivial(__first, __last, __n); + } else { + __assign_with_sentinel(__first, __last); + } + + return *this; + } # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_CharT> _Range> @@ -1526,8 +1562,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - insert(size_type __pos1, const _Tp& __t) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos1, const _Tp& __t) { __self_view __sv = __t; return insert(__pos1, __sv.data(), __sv.size()); } @@ -1536,8 +1571,14 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - insert(size_type __pos1, const _Tp& __t, size_type __pos2, size_type __n = npos); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + insert(size_type __pos1, const _Tp& __t, size_type __pos2, size_type __n = npos) { + __self_view __sv = __t; + size_type __str_sz = __sv.size(); + if (__pos2 > __str_sz) + __throw_out_of_range(); + return insert(__pos1, __sv.data() + __pos2, std::min(__n, __str_sz - __pos2)); + } _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos1, const basic_string& __str, size_type __pos2, size_type __n = npos); @@ -1568,12 +1609,18 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator - insert(const_iterator __pos, _InputIterator __first, _InputIterator __last); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator + insert(const_iterator __pos, _InputIterator __first, _InputIterator __last) { + const basic_string __temp(__first, __last, __alloc_); + return insert(__pos, 
__temp.data(), __temp.data() + __temp.size()); + } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator - insert(const_iterator __pos, _ForwardIterator __first, _ForwardIterator __last); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator + insert(const_iterator __pos, _ForwardIterator __first, _ForwardIterator __last) { + auto __n = static_cast(std::distance(__first, __last)); + return __insert_with_size(__pos, __first, __last, __n); + } # ifndef _LIBCPP_CXX03_LANG _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator @@ -1592,7 +1639,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos1, size_type __n1, const _Tp& __t) { __self_view __sv = __t; return replace(__pos1, __n1, __sv.data(), __sv.size()); @@ -1605,8 +1652,14 @@ public: __enable_if_t<__can_be_converted_to_string_view<_CharT, _Traits, _Tp>::value && !__is_same_uncvref<_Tp, basic_string>::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - replace(size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2 = npos); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + replace(size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2 = npos) { + __self_view __sv = __t; + size_type __str_sz = __sv.size(); + if (__pos2 > __str_sz) + __throw_out_of_range(); + return replace(__pos1, __n1, __sv.data() + __pos2, std::min(__n2, __str_sz - __pos2)); + } _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos, size_type __n1, const value_type* __s, size_type __n2); @@ -1620,7 +1673,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(const_iterator __i1, const_iterator __i2, const _Tp& __t) { __self_view __sv = __t; return replace(__i1 - begin(), __i2 - __i1, __sv); @@ -1642,8 +1695,11 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - replace(const_iterator __i1, const_iterator __i2, _InputIterator __j1, _InputIterator __j2); + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + replace(const_iterator __i1, const_iterator __i2, _InputIterator __j1, _InputIterator __j2) { + const basic_string __temp(__j1, __j2, __alloc_); + return replace(__i1, __i2, __temp); + } # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_CharT> _Range> @@ -1710,7 +1766,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find(data(), size(), __sv.data(), __pos, __sv.size()); @@ -1741,7 +1797,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_rfind(data(), size(), __sv.data(), __pos, __sv.size()); 
@@ -1772,7 +1828,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_of( @@ -1806,7 +1862,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_of( @@ -1840,7 +1896,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_first_not_of( @@ -1874,7 +1930,7 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; return std::__str_find_last_not_of( @@ -1906,11 +1962,22 @@ public: } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 int - compare(const _Tp& __t) const _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const _Tp& __t) const _NOEXCEPT { + __self_view __sv = __t; + size_t __lhs_sz = size(); + size_t __rhs_sz = __sv.size(); + int __result = traits_type::compare(data(), __sv.data(), std::min(__lhs_sz, __rhs_sz)); + if (__result != 0) + return __result; + if (__lhs_sz < __rhs_sz) + return -1; + if (__lhs_sz > __rhs_sz) + return 1; + return 0; + } template ::value, int> = 0> - _LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS _LIBCPP_CONSTEXPR_SINCE_CXX20 int + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const _Tp& __t) const { __self_view __sv = __t; return compare(__pos1, __n1, __sv.data(), __sv.size()); @@ -2877,14 +2944,6 @@ basic_string<_CharT, _Traits, _Allocator>::__move_assign(basic_string& __str, tr # endif -template -template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::assign(_InputIterator __first, _InputIterator __last) { - __assign_with_sentinel(__first, __last); - return *this; -} - template template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void @@ -2893,20 +2952,6 @@ basic_string<_CharT, _Traits, _Allocator>::__assign_with_sentinel(_InputIterator assign(__temp.data(), __temp.size()); } -template -template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::assign(_ForwardIterator __first, _ForwardIterator __last) { - if (__string_is_trivial_iterator<_ForwardIterator>::value) { - size_type __n = static_cast(std::distance(__first, __last)); - __assign_trivial(__first, __last, __n); - } else { - __assign_with_sentinel(__first, __last); - } - - return *this; -} - template template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void @@ -2946,20 +2991,6 @@ basic_string<_CharT, _Traits, _Allocator>::assign(const basic_string& 
__str, siz return assign(__str.data() + __pos, std::min(__n, __sz - __pos)); } -template -template ::value && - !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, _Allocator> >::value, - int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::assign(const _Tp& __t, size_type __pos, size_type __n) { - __self_view __sv = __t; - size_type __sz = __sv.size(); - if (__pos > __sz) - this->__throw_out_of_range(); - return assign(__sv.data() + __pos, std::min(__n, __sz - __pos)); -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_NOINLINE basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::__assign_external(const value_type* __s) { @@ -3059,29 +3090,6 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::pu traits_type::assign(*++__p, value_type()); } -template -template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::append(_ForwardIterator __first, _ForwardIterator __last) { - size_type __sz = size(); - size_type __cap = capacity(); - size_type __n = static_cast(std::distance(__first, __last)); - if (__n) { - if (__string_is_trivial_iterator<_ForwardIterator>::value && !__addr_in_range(*__first)) { - if (__cap - __sz < __n) - __grow_by_without_replace(__cap, __sz + __n - __cap, __sz, __sz, 0); - __annotate_increase(__n); - auto __end = __copy_non_overlapping_range(__first, __last, std::__to_address(__get_pointer() + __sz)); - traits_type::assign(*__end, value_type()); - __set_size(__sz + __n); - } else { - const basic_string __temp(__first, __last, __alloc_); - append(__temp.data(), __temp.size()); - } - } - return *this; -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(const basic_string& __str, size_type __pos, size_type __n) { @@ -3091,20 +3099,6 @@ basic_string<_CharT, _Traits, _Allocator>::append(const basic_string& __str, siz return append(__str.data() + __pos, std::min(__n, __sz - __pos)); } -template -template ::value && - !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, _Allocator> >::value, - int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::append(const _Tp& __t, size_type __pos, size_type __n) { - __self_view __sv = __t; - size_type __sz = __sv.size(); - if (__pos > __sz) - this->__throw_out_of_range(); - return append(__sv.data() + __pos, std::min(__n, __sz - __pos)); -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::append(const value_type* __s) { @@ -3169,23 +3163,6 @@ basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos, size_type __n return *this; } -template -template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator -basic_string<_CharT, _Traits, _Allocator>::insert(const_iterator __pos, _InputIterator __first, _InputIterator __last) { - const basic_string __temp(__first, __last, __alloc_); - return insert(__pos, __temp.data(), __temp.data() + __temp.size()); -} - -template -template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator -basic_string<_CharT, _Traits, _Allocator>::insert( - const_iterator __pos, _ForwardIterator __first, _ForwardIterator __last) { - auto __n = 
static_cast(std::distance(__first, __last)); - return __insert_with_size(__pos, __first, __last, __n); -} - template template _LIBCPP_CONSTEXPR_SINCE_CXX20 typename basic_string<_CharT, _Traits, _Allocator>::iterator @@ -3213,20 +3190,6 @@ basic_string<_CharT, _Traits, _Allocator>::insert( return insert(__pos1, __str.data() + __pos2, std::min(__n, __str_sz - __pos2)); } -template -template ::value && - !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, _Allocator> >::value, - int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos1, const _Tp& __t, size_type __pos2, size_type __n) { - __self_view __sv = __t; - size_type __str_sz = __sv.size(); - if (__pos2 > __str_sz) - this->__throw_out_of_range(); - return insert(__pos1, __sv.data() + __pos2, std::min(__n, __str_sz - __pos2)); -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::insert(size_type __pos, const value_type* __s) { @@ -3330,15 +3293,6 @@ basic_string<_CharT, _Traits, _Allocator>::replace(size_type __pos, size_type __ return __null_terminate_at(__p, __sz - (__n1 - __n2)); } -template -template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::replace( - const_iterator __i1, const_iterator __i2, _InputIterator __j1, _InputIterator __j2) { - const basic_string __temp(__j1, __j2, __alloc_); - return replace(__i1, __i2, __temp); -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace( @@ -3349,21 +3303,6 @@ basic_string<_CharT, _Traits, _Allocator>::replace( return replace(__pos1, __n1, __str.data() + __pos2, std::min(__n2, __str_sz - __pos2)); } -template -template ::value && - !__is_same_uncvref<_Tp, basic_string<_CharT, _Traits, _Allocator> >::value, - int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& -basic_string<_CharT, _Traits, _Allocator>::replace( - size_type __pos1, size_type __n1, const _Tp& __t, size_type __pos2, size_type __n2) { - __self_view __sv = __t; - size_type __str_sz = __sv.size(); - if (__pos2 > __str_sz) - this->__throw_out_of_range(); - return replace(__pos1, __n1, __sv.data() + __pos2, std::min(__n2, __str_sz - __pos2)); -} - template _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator>& basic_string<_CharT, _Traits, _Allocator>::replace(size_type __pos, size_type __n1, const value_type* __s) { @@ -3586,22 +3525,6 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat // compare -template -template ::value, int> > -_LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocator>::compare(const _Tp& __t) const _NOEXCEPT { - __self_view __sv = __t; - size_t __lhs_sz = size(); - size_t __rhs_sz = __sv.size(); - int __result = traits_type::compare(data(), __sv.data(), std::min(__lhs_sz, __rhs_sz)); - if (__result != 0) - return __result; - if (__lhs_sz < __rhs_sz) - return -1; - if (__lhs_sz > __rhs_sz) - return 1; - return 0; -} - template inline _LIBCPP_CONSTEXPR_SINCE_CXX20 int basic_string<_CharT, _Traits, _Allocator>::compare( size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const { From df0ccf6df09b5a067c80519b9adfa2c7dc46a350 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Tue, 8 Apr 2025 16:15:08 +0000 Subject: [PATCH 1015/1029] [asan] Disable 
TestCases/Linux/asan_rt_confict_test-2.cpp to fix build TestCases/Linux/asan_rt_confict_test-2.cpp started failing in https://lab.llvm.org/buildbot/#/builders/66/builds/12265/steps/9/logs/stdio The only change is "[LLD][ELF] Allow merging XO and RX sections, and add --[no-]xosegment flag (#132412)" (https://github.com/llvm/llvm-project/commit/2c1bdd4a0811af89eb9631935fbd90f13a04eacb). Based on the test case (which deliberately tries to mix static and dynamically linked ASan), I suspect it's actually the test case that needs to be fixed (probably with a different error message check). This patch disables TestCases/Linux/asan_rt_confict_test-2.cpp to make the buildbots green while I investigate. --- compiler-rt/test/asan/TestCases/Linux/asan_rt_confict_test-2.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/test/asan/TestCases/Linux/asan_rt_confict_test-2.cpp b/compiler-rt/test/asan/TestCases/Linux/asan_rt_confict_test-2.cpp index 6328cbb2ce812..38f488197a197 100644 --- a/compiler-rt/test/asan/TestCases/Linux/asan_rt_confict_test-2.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/asan_rt_confict_test-2.cpp @@ -5,6 +5,7 @@ // RUN: %clangxx_asan_static %s %ld_flags_rpath_exe -o %t // RUN: not %run %t 2>&1 | FileCheck %s +// UNSUPPORTED: target={{.*}} // REQUIRES: asan-dynamic-runtime // XFAIL: android From d6c8e8908d421979a428ede9f9a630dac8576c5b Mon Sep 17 00:00:00 2001 From: Dmitry Chestnykh Date: Tue, 8 Apr 2025 19:22:03 +0300 Subject: [PATCH 1016/1029] Rename `F_no_mmap` to `F_mmap` (#134787) The `F_no_mmap` flag was introduced by https://github.com/llvm/llvm-project/commit/68142324290f2932df0e271747cdccc371d6dded --- lld/ELF/Arch/ARM.cpp | 2 +- lld/ELF/Writer.cpp | 4 ++-- llvm/include/llvm/Support/FileOutputBuffer.h | 5 ++--- llvm/lib/Support/FileOutputBuffer.cpp | 2 +- llvm/unittests/Support/FileOutputBufferTest.cpp | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index e667fdc0633c5..e45dd4d354afb 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1489,7 +1489,7 @@ template void elf::writeARMCmseImportLib(Ctx &ctx) { const uint64_t fileSize = sectionHeaderOff + shnum * sizeof(typename ELFT::Shdr); const unsigned flags = - ctx.arg.mmapOutputFile ? 0 : (unsigned)FileOutputBuffer::F_no_mmap; + ctx.arg.mmapOutputFile ? (unsigned)FileOutputBuffer::F_mmap : 0; unlinkAsync(ctx.arg.cmseOutputLib); Expected> bufferOrErr = FileOutputBuffer::create(ctx.arg.cmseOutputLib, fileSize, flags); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 28b24f90716b8..cc1538cc76c3c 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2908,8 +2908,8 @@ template void Writer::openFile() { unsigned flags = 0; if (!ctx.arg.relocatable) flags |= FileOutputBuffer::F_executable; - if (!ctx.arg.mmapOutputFile) - flags |= FileOutputBuffer::F_no_mmap; + if (ctx.arg.mmapOutputFile) + flags |= FileOutputBuffer::F_mmap; Expected> bufferOrErr = FileOutputBuffer::create(ctx.arg.outputFile, fileSize, flags); diff --git a/llvm/include/llvm/Support/FileOutputBuffer.h b/llvm/include/llvm/Support/FileOutputBuffer.h index d4b73522115db..f98e7a5470b55 100644 --- a/llvm/include/llvm/Support/FileOutputBuffer.h +++ b/llvm/include/llvm/Support/FileOutputBuffer.h @@ -31,9 +31,8 @@ class FileOutputBuffer { /// Set the 'x' bit on the resulting file. F_executable = 1, - /// Don't use mmap and instead write an in-memory buffer to a file when this - /// buffer is closed. - F_no_mmap = 2, + /// Use mmap for in-memory file buffer. 
+  F_mmap = 2,
 };

 /// Factory method to create an OutputBuffer object which manages a read/write
diff --git a/llvm/lib/Support/FileOutputBuffer.cpp b/llvm/lib/Support/FileOutputBuffer.cpp
index 58a06a34e8cf3..a2396d7629488 100644
--- a/llvm/lib/Support/FileOutputBuffer.cpp
+++ b/llvm/lib/Support/FileOutputBuffer.cpp
@@ -186,7 +186,7 @@ FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
   case fs::file_type::regular_file:
   case fs::file_type::file_not_found:
   case fs::file_type::status_error:
-    if (Flags & F_no_mmap)
+    if (Flags & F_mmap)
       return createInMemoryBuffer(Path, Size, Mode);
     else
       return createOnDiskBuffer(Path, Size, Mode);
diff --git a/llvm/unittests/Support/FileOutputBufferTest.cpp b/llvm/unittests/Support/FileOutputBufferTest.cpp
index f7bb0833e5a0e..423a6e12240c0 100644
--- a/llvm/unittests/Support/FileOutputBufferTest.cpp
+++ b/llvm/unittests/Support/FileOutputBufferTest.cpp
@@ -123,7 +123,7 @@ TEST(FileOutputBuffer, Test) {
   File5.append("/file5");
   {
     Expected<std::unique_ptr<FileOutputBuffer>> BufferOrErr =
-        FileOutputBuffer::create(File5, 8000, FileOutputBuffer::F_no_mmap);
+        FileOutputBuffer::create(File5, 8000, FileOutputBuffer::F_mmap);
     ASSERT_NO_ERROR(errorToErrorCode(BufferOrErr.takeError()));
     std::unique_ptr<FileOutputBuffer> &Buffer = *BufferOrErr;
     // Start buffer with special header.

From 46d4c3b1f64dfbca2a029ff30434aaa5248fc190 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 8 Apr 2025 17:30:35 +0100
Subject: [PATCH 1017/1029] [X86] combineX86ShuffleChain - always prefer
 VPERMQ/PD for unary subvector shuffles on AVX2+ targets (#134849)

When combining 2 x 128-bit subvectors, don't assume that if the node is
already an X86ISD::VPERM2X128 node then there's nothing to do.

Fix an issue where, if we'd somehow combined to X86ISD::VPERM2X128
(typically when the two operands had simplified to a common operand), we
couldn't canonicalise back to X86ISD::VPERMI on AVX2+ targets.

This matches the v4f64/v4i64 shuffle lowering preference for VPERMQ/PD
over VPERM2F128/I128.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp  | 13 +++++++------
 llvm/test/CodeGen/X86/matrix-multiply.ll |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d86eec1584274..47ac1ee571269 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39827,7 +39827,6 @@ static SDValue combineX86ShuffleChain(
       // If we're inserting the low subvector, an insert-subvector 'concat'
       // pattern is quicker than VPERM2X128.
-      // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
       if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
           !Subtarget.hasAVX2()) {
         if (Depth == 0 && RootOpc == ISD::INSERT_SUBVECTOR)
@@ -39838,15 +39837,15 @@ static SDValue combineX86ShuffleChain(
         return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
       }

-      if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
-        return SDValue(); // Nothing to do!
-
-      // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
-      // we need to use the zeroing feature.
+      // Don't lower to VPERM2X128 here if we have AVX2+, prefer to use
+      // VPERMQ/VPERMPD for unary shuffles unless we need to use the zeroing
+      // feature.
       // Prefer blends for sequential shuffles unless we are optimizing for size.
       if (UnaryShuffle &&
          !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
          (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
+        if (Depth == 0 && RootOpc == X86ISD::VPERM2X128)
+          return SDValue(); // Nothing to do!
unsigned PermMask = 0; PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0); PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4); @@ -39864,6 +39863,8 @@ static SDValue combineX86ShuffleChain( "Unexpected shuffle sentinel value"); // Prefer blends to X86ISD::VPERM2X128. if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) { + if (Depth == 0 && RootOpc == X86ISD::VPERM2X128) + return SDValue(); // Nothing to do! unsigned PermMask = 0; PermMask |= ((Mask[0] & 3) << 0); PermMask |= ((Mask[1] & 3) << 4); diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 9b8816bd11f70..1dc8bd9b863aa 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -131,7 +131,7 @@ define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwin ; AVX2-LABEL: test_mul2x2_f64: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] ; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] @@ -142,7 +142,7 @@ define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwin ; AVX512-LABEL: test_mul2x2_f64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3] -; AVX512-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3] +; AVX512-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] ; AVX512-NEXT: vmulpd %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] From e3d114ceb86782553e8f244975441e5934b35b82 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 8 Apr 2025 17:45:54 +0100 Subject: [PATCH 1018/1029] [DebugInfo][Reassociate] Propagate source loc when negating mul factor (#134679) As part of RemoveFactorFromExpression, we attempt to remove a factor from a mul/fmul expression; this may involve generating new instructions, e.g. to negate the result if the factor was negative in the original expression. When this happens, the new instructions should have a DebugLoc set from the instruction that the factored expression is being used to compute. Found using https://github.com/llvm/llvm-project/pull/107279. 
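To make the intended pattern concrete, here is a minimal sketch (a
hypothetical helper, not the actual Reassociate code; it only illustrates
threading a DebugLoc onto a newly materialized negation):

```cpp
// Sketch: a transformation that materializes a new instruction on behalf of
// an existing one should propagate that instruction's source location.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static Value *createNegWithLoc(IRBuilder<> &Builder, Value *V, DebugLoc DL) {
  Value *Neg = Builder.CreateNeg(V, "neg");
  // CreateNeg may constant-fold; only instructions carry debug locations.
  if (auto *NegInst = dyn_cast<Instruction>(Neg))
    NegInst->setDebugLoc(DL); // location of the expression being rebuilt
  return Neg;
}
```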
--- .../llvm/Transforms/Scalar/Reassociate.h | 2 +- llvm/lib/Transforms/Scalar/Reassociate.cpp | 12 +++-- .../Reassociate/debugloc-factoring-neg.ll | 45 +++++++++++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/Reassociate/debugloc-factoring-neg.ll diff --git a/llvm/include/llvm/Transforms/Scalar/Reassociate.h b/llvm/include/llvm/Transforms/Scalar/Reassociate.h index 3b2d2b83ced62..6d56961a71019 100644 --- a/llvm/include/llvm/Transforms/Scalar/Reassociate.h +++ b/llvm/include/llvm/Transforms/Scalar/Reassociate.h @@ -133,7 +133,7 @@ class ReassociatePass : public PassInfoMixin { SmallVectorImpl &Factors); Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl &Ops); - Value *RemoveFactorFromExpression(Value *V, Value *Factor); + Value *RemoveFactorFromExpression(Value *V, Value *Factor, DebugLoc DL); void EraseInst(Instruction *I); void RecursivelyEraseDeadInsts(Instruction *I, OrderedSet &Insts); void OptimizeInst(Instruction *I); diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index 0bfce13b07f1c..49fe6670f63e6 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -1100,7 +1100,10 @@ static Value *EmitAddTreeOfValues(Instruction *I, /// If V is an expression tree that is a multiplication sequence, /// and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. -Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { +/// If new instructions are inserted to generate this tree, DL should be used +/// as the DebugLoc for these instructions. +Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor, + DebugLoc DL) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); if (!BO) return nullptr; @@ -1164,8 +1167,10 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { V = BO; } - if (NeedsNegate) + if (NeedsNegate) { V = CreateNeg(V, "neg", InsertPt, BO); + cast(V)->setDebugLoc(DL); + } return V; } @@ -1666,7 +1671,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, if (!BOp) continue; - if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { + if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal, + I->getDebugLoc())) { // The factorized operand may occur several times. Convert them all in // one fell swoop. for (unsigned j = Ops.size(); j != i;) { diff --git a/llvm/test/Transforms/Reassociate/debugloc-factoring-neg.ll b/llvm/test/Transforms/Reassociate/debugloc-factoring-neg.ll new file mode 100644 index 0000000000000..f8e74c8fd8c4e --- /dev/null +++ b/llvm/test/Transforms/Reassociate/debugloc-factoring-neg.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -p=reassociate < %s | FileCheck %s + +;; When reassociating the expression below, we reduce it to using a single +;; multiply and as part of that we generate a sub that negates one of the +;; original terms; this negation "neg" should use the original expression's +;; source location. 
+ +define i32 @foo(i64 %0, i64 %1) { +; CHECK-LABEL: define i32 @foo( +; CHECK-SAME: i64 [[TMP0:%.*]], i64 [[TMP1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[NEG:%.*]] = sub i64 0, [[TMP1]], !dbg [[DBG3:![0-9]+]] +; CHECK-NEXT: [[REASS_ADD:%.*]] = add i64 [[NEG]], [[TMP0]] +; CHECK-NEXT: [[REASS_MUL:%.*]] = mul i64 [[REASS_ADD]], 1000, !dbg [[DBG3]] +; CHECK-NEXT: store i64 [[REASS_MUL]], ptr null, align 8 +; CHECK-NEXT: ret i32 0 +; +entry: + %mul1026 = mul i64 %0, 1000 + %add1028 = or i64 %mul1026, 0 + %mul1029 = mul i64 %1, 1000 + %sub1032 = sub i64 %add1028, %mul1029, !dbg !4 + store i64 %sub1032, ptr null, align 8 + ret i32 0 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git") +!1 = !DIFile(filename: "test.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 10, column: 53, scope: !5) +!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 550, type: !7, scopeLine: 557, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!7 = distinct !DISubroutineType(types: !2) +;. +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +; CHECK: [[META1]] = !DIFile(filename: "test.c", directory: {{.*}}) +; CHECK: [[DBG3]] = !DILocation(line: 10, column: 53, scope: [[META4:![0-9]+]]) +; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 550, type: [[META5:![0-9]+]], scopeLine: 557, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META6:![0-9]+]]) +; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]]) +; CHECK: [[META6]] = !{} +;. From f869d6efeec825c384dd9410fd29f90078e40c30 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Tue, 8 Apr 2025 09:47:52 -0700 Subject: [PATCH 1019/1029] [LLDB][Minidump]Update MinidumpFileBuilder to read and write in chunks (#129307) I recently received an internal error report that LLDB was OOM'ing when creating a Minidump. In my 64b refactor we made a decision to acquire buffers the size of the largest memory region so we could read all of the contents in one call. This made error handling very simple (and simpler coding for me!) but had the trade off of large allocations if huge pages were enabled. This patch is one I've had on the back burner for awhile, but we can read and write the Minidump memory sections in discrete chunks which we already do for writing to disk. I had to refactor the error handling a bit, but it remains the same. We make a best effort attempt to read as much of the memory region as possible, but fail immediately if we receive an error writing to disk. I did not add new tests for this because our existing test suite is quite good, but I did manually verify a few Minidumps couldn't read beyond the red_zone. ``` (lldb) reg read $sp rsp = 0x00007fffffffc3b0 (lldb) p/x 0x00007fffffffc3b0 - 128 (long) 0x00007fffffffc330 (lldb) memory read 0x00007fffffffc330 0x7fffffffc330: 60 c3 ff ff ff 7f 00 00 60 cd ff ff ff 7f 00 00 `.......`....... 0x7fffffffc340: 60 c3 ff ff ff 7f 00 00 65 e6 26 00 00 00 00 00 `.......e.&..... (lldb) memory read 0x00007fffffffc329 error: could not parse memory info (Success!) 
```

I'm not sure how to quantify the memory improvement other than that we
previously allocated a buffer of the largest region size regardless of how
much we actually read. So a 2 GB unreadable region would cause a 2 GB
allocation even if we were reading only 4096 KB. Now we will take the range
size or the max chunk size of 128 MB, whichever is smaller.
---
 lldb/include/lldb/Target/Process.h            |  46 +++++++
 .../Minidump/MinidumpFileBuilder.cpp          | 112 ++++++++++++------
 .../ObjectFile/Minidump/MinidumpFileBuilder.h |   8 ++
 lldb/source/Target/Process.cpp                |  44 +++++++
 4 files changed, 176 insertions(+), 34 deletions(-)

diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index 2e827d4c5cb74..7bed610b2830d 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -1589,6 +1589,52 @@ class Process : public std::enable_shared_from_this<Process>,
   size_t ReadMemoryFromInferior(lldb::addr_t vm_addr, void *buf, size_t size,
                                 Status &error);

+  // Callback definition for reading memory in chunks.
+  //
+  // Status, the status returned from ReadMemoryFromInferior
+  // addr_t, the bytes_addr, start + bytes read so far.
+  // void*, pointer to the bytes read
+  // bytes_size, the count of bytes read for this chunk
+  typedef std::function<IterationAction(Status &error, lldb::addr_t bytes_addr,
+                                        const void *bytes,
+                                        uint64_t bytes_size)>
+      ReadMemoryChunkCallback;
+
+  /// Read memory from a process in discrete chunks, terminating
+  /// either when all bytes are read, or the supplied callback returns
+  /// IterationAction::Stop.
+  ///
+  /// \param[in] vm_addr
+  ///     A virtual load address that indicates where to start reading
+  ///     memory from.
+  ///
+  /// \param[in] buf
+  ///     If NULL, a buffer of \a chunk_size will be created and used for the
+  ///     callback. If non-NULL, this buffer must be at least \a chunk_size
+  ///     bytes and will be used for storing chunked memory reads.
+  ///
+  /// \param[in] chunk_size
+  ///     The minimum size of the byte buffer, and the chunk size of memory
+  ///     to read.
+  ///
+  /// \param[in] total_size
+  ///     The total number of bytes to read.
+  ///
+  /// \param[in] callback
+  ///     The callback to invoke when a chunk is read from memory.
+  ///
+  /// \return
+  ///     The number of bytes that were actually read into \a buf and
+  ///     passed to the provided callback. If the returned number is greater
+  ///     than zero, yet less than \a total_size, then either the callback
+  ///     stopped the iteration early or a chunk could not be read. Zero is
+  ///     returned in the case of an error.
+  lldb::offset_t ReadMemoryInChunks(lldb::addr_t vm_addr, void *buf,
+                                    lldb::addr_t chunk_size,
+                                    lldb::offset_t total_size,
+                                    ReadMemoryChunkCallback callback);
+
   /// Read a NULL terminated C string from memory
   ///
   /// This function will read a cache page at a time until the NULL
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
index c5013ea5e3be4..6ed184273572b 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
@@ -969,6 +969,66 @@ Status MinidumpFileBuilder::DumpDirectories() const {
   return error;
 }

+Status MinidumpFileBuilder::ReadWriteMemoryInChunks(
+    lldb_private::DataBufferHeap &data_buffer,
+    const lldb_private::CoreFileMemoryRange &range, uint64_t &bytes_read) {
+
+  const lldb::addr_t addr = range.range.start();
+  const lldb::addr_t size = range.range.size();
+  Log *log = GetLog(LLDBLog::Object);
+  Status addDataError;
+  Process::ReadMemoryChunkCallback callback =
+      [&](Status &error, lldb::addr_t current_addr, const void *buf,
+          uint64_t bytes_read) -> lldb_private::IterationAction {
+    if (error.Fail() || bytes_read == 0) {
+      LLDB_LOGF(log,
+                "Failed to read memory region at: 0x%" PRIx64
+                ". Bytes read: %" PRIx64 ", error: %s",
+                current_addr, bytes_read, error.AsCString());
+
+      // If we failed in a memory read, we would normally want to skip
+      // this entire region, but if we had already written to the minidump
+      // file, we can't easily rewind that state.
+      //
+      // So if we do encounter an error while reading, we just return
+      // immediately; any prior bytes read will still be included, but
+      // any bytes partially read before the error are ignored.
+      return lldb_private::IterationAction::Stop;
+    }
+
+    // Write the chunk to the minidump file, potentially flushing to disk.
+    // This error will be captured by the outer scope and is considered fatal.
+    // If we get an error writing to disk we can't easily guarantee that we
+    // won't corrupt the minidump.
+    addDataError = AddData(buf, bytes_read);
+    if (addDataError.Fail())
+      return lldb_private::IterationAction::Stop;
+
+    // If we have a partial read, report it, but only if the partial read
+    // didn't finish reading the entire region.
+    if (bytes_read != data_buffer.GetByteSize() &&
+        current_addr + bytes_read != size) {
+      LLDB_LOGF(log,
+                "Memory region at: %" PRIx64 " partially read 0x%" PRIx64
+                " bytes out of %" PRIx64 " bytes.",
+                current_addr, bytes_read,
+                data_buffer.GetByteSize() - bytes_read);
+
+      // If we've read some bytes, we stop trying to read more and return
+      // this best-effort attempt.
+      return lldb_private::IterationAction::Stop;
+    }
+
+    // No problems, keep going!
+    return lldb_private::IterationAction::Continue;
+  };
+
+  bytes_read = m_process_sp->ReadMemoryInChunks(
+      addr, data_buffer.GetBytes(), data_buffer.GetByteSize(), size, callback);
+  return addDataError;
+}
+
 static uint64_t
 GetLargestRangeSize(const std::vector<CoreFileMemoryRange> &ranges) {
   uint64_t max_size = 0;
@@ -987,8 +1047,8 @@ MinidumpFileBuilder::AddMemoryList_32(std::vector<CoreFileMemoryRange> &ranges,
   Log *log = GetLog(LLDBLog::Object);
   size_t region_index = 0;
-  auto data_up =
-      std::make_unique<DataBufferHeap>(GetLargestRangeSize(ranges), 0);
+  lldb_private::DataBufferHeap data_buffer(
+      std::min(GetLargestRangeSize(ranges), MAX_WRITE_CHUNK_SIZE), 0);
   for (const auto &core_range : ranges) {
     // Take the offset before we write.
const offset_t offset_for_data = GetCurrentDataEndOffset(); @@ -1003,18 +1063,15 @@ MinidumpFileBuilder::AddMemoryList_32(std::vector &ranges, ++region_index; progress.Increment(1, "Adding Memory Range " + core_range.Dump()); - const size_t bytes_read = - m_process_sp->ReadMemory(addr, data_up->GetBytes(), size, error); - if (error.Fail() || bytes_read == 0) { - LLDB_LOGF(log, "Failed to read memory region. Bytes read: %zu, error: %s", - bytes_read, error.AsCString()); - // Just skip sections with errors or zero bytes in 32b mode + uint64_t bytes_read = 0; + error = ReadWriteMemoryInChunks(data_buffer, core_range, bytes_read); + if (error.Fail()) + return error; + + // If we completely failed to read this range + // we can just omit any of the book keeping. + if (bytes_read == 0) continue; - } else if (bytes_read != size) { - LLDB_LOGF( - log, "Memory region at: %" PRIx64 " failed to read %" PRIx64 " bytes", - addr, size); - } MemoryDescriptor descriptor; descriptor.StartOfMemoryRange = @@ -1026,11 +1083,6 @@ MinidumpFileBuilder::AddMemoryList_32(std::vector &ranges, descriptors.push_back(descriptor); if (m_thread_by_range_end.count(end) > 0) m_thread_by_range_end[end].Stack = descriptor; - - // Add the data to the buffer, flush as needed. - error = AddData(data_up->GetBytes(), bytes_read); - if (error.Fail()) - return error; } // Add a directory that references this list @@ -1088,6 +1140,8 @@ MinidumpFileBuilder::AddMemoryList_64(std::vector &ranges, list_header.BaseRVA = memory_ranges_base_rva; m_data.AppendData(&list_header, sizeof(llvm::minidump::Memory64ListHeader)); + lldb_private::DataBufferHeap data_buffer( + std::min(GetLargestRangeSize(ranges), MAX_WRITE_CHUNK_SIZE), 0); bool cleanup_required = false; std::vector descriptors; // Enumerate the ranges and create the memory descriptors so we can append @@ -1106,8 +1160,6 @@ MinidumpFileBuilder::AddMemoryList_64(std::vector &ranges, Log *log = GetLog(LLDBLog::Object); size_t region_index = 0; - auto data_up = - std::make_unique(GetLargestRangeSize(ranges), 0); for (const auto &core_range : ranges) { const addr_t addr = core_range.range.start(); const addr_t size = core_range.range.size(); @@ -1120,27 +1172,19 @@ MinidumpFileBuilder::AddMemoryList_64(std::vector &ranges, ++region_index; progress.Increment(1, "Adding Memory Range " + core_range.Dump()); - const size_t bytes_read = - m_process_sp->ReadMemory(addr, data_up->GetBytes(), size, error); - if (error.Fail()) { - LLDB_LOGF(log, "Failed to read memory region. Bytes read: %zu, error: %s", - bytes_read, error.AsCString()); - error.Clear(); + uint64_t bytes_read = 0; + error = ReadWriteMemoryInChunks(data_buffer, core_range, bytes_read); + if (error.Fail()) + return error; + + if (bytes_read == 0) { cleanup_required = true; descriptors[region_index].DataSize = 0; } if (bytes_read != size) { - LLDB_LOGF( - log, "Memory region at: %" PRIx64 " failed to read %" PRIx64 " bytes", - addr, size); cleanup_required = true; descriptors[region_index].DataSize = bytes_read; } - - // Add the data to the buffer, flush as needed. - error = AddData(data_up->GetBytes(), bytes_read); - if (error.Fail()) - return error; } // Early return if there is no cleanup needed. 
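For readers unfamiliar with the new interface, a minimal, hypothetical caller
of the API added above might look like this (illustrative only; `process_sp`,
`region_base`, `region_size`, and `consume` are stand-ins and this snippet is
not part of the patch):

```cpp
// Sketch: stream a memory region through Process::ReadMemoryInChunks,
// forwarding each chunk to a consumer and stopping on the first failure.
lldb_private::DataBufferHeap buffer(128 * 1024, 0); // 128 KiB scratch buffer
uint64_t total_bytes = 0;
process_sp->ReadMemoryInChunks(
    region_base, buffer.GetBytes(), buffer.GetByteSize(), region_size,
    [&](lldb_private::Status &error, lldb::addr_t addr, const void *bytes,
        uint64_t size) -> lldb_private::IterationAction {
      if (error.Fail() || size == 0)
        return lldb_private::IterationAction::Stop; // give up on this region
      consume(addr, bytes, size); // hypothetical per-chunk consumer
      total_bytes += size;
      return lldb_private::IterationAction::Continue;
    });
```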
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
index 48293ee1bf5e5..a3f8f00ee215d 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
@@ -142,6 +142,14 @@ class MinidumpFileBuilder {
   lldb_private::Status AddDirectory(llvm::minidump::StreamType type,
                                     uint64_t stream_size);
   lldb::offset_t GetCurrentDataEndOffset() const;
+
+  // Read a memory region from the process and write it to the file
+  // in fixed-size chunks.
+  lldb_private::Status
+  ReadWriteMemoryInChunks(lldb_private::DataBufferHeap &data_buffer,
+                          const lldb_private::CoreFileMemoryRange &range,
+                          uint64_t &bytes_read);
+
   // Stores directories to fill in later
   std::vector<llvm::minidump::Directory> m_directories;
   // When we write off the threads for the first time, we need to clean them up
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 0b7ba343c11f2..a9787823b9108 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -2184,6 +2184,50 @@ size_t Process::ReadMemoryFromInferior(addr_t addr, void *buf, size_t size,
   return bytes_read;
 }
 
+lldb::offset_t Process::ReadMemoryInChunks(lldb::addr_t vm_addr, void *buf,
+                                           lldb::addr_t chunk_size,
+                                           lldb::offset_t size,
+                                           ReadMemoryChunkCallback callback) {
+  // Safety check to prevent an infinite loop.
+  if (chunk_size == 0)
+    return 0;
+
+  // Scratch buffer for when a NULL buf is provided. It starts out
+  // empty; we resize it to chunk_size and then point buf at the new
+  // storage.
+  DataBufferHeap data_buffer;
+  if (!buf) {
+    data_buffer.SetByteSize(chunk_size);
+    buf = data_buffer.GetBytes();
+  }
+
+  uint64_t bytes_remaining = size;
+  uint64_t bytes_read = 0;
+  Status error;
+  while (bytes_remaining > 0) {
+    // Get the next read chunk size as the minimum of the remaining bytes and
+    // the maximum chunk size.
+    const lldb::addr_t bytes_to_read = std::min(bytes_remaining, chunk_size);
+    const lldb::addr_t current_addr = vm_addr + bytes_read;
+    const lldb::addr_t bytes_read_for_chunk =
+        ReadMemoryFromInferior(current_addr, buf, bytes_to_read, error);
+
+    bytes_read += bytes_read_for_chunk;
+    // If this chunk reports more bytes than remain, something went wrong
+    // and we should fail fast rather than underflow bytes_remaining.
+    if (bytes_read_for_chunk > bytes_remaining)
+      return 0;
+    else
+      bytes_remaining -= bytes_read_for_chunk;
+
+    if (callback(error, current_addr, buf, bytes_read_for_chunk) ==
+        IterationAction::Stop)
+      break;
+  }
+
+  return bytes_read;
+}
+
 uint64_t Process::ReadUnsignedIntegerFromMemory(lldb::addr_t vm_addr,
                                                 size_t integer_byte_size,
                                                 uint64_t fail_value,

From 2721d50d8785603987358afb4d82b986879ca41e Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Tue, 8 Apr 2025 09:49:12 -0700
Subject: [PATCH 1020/1029] Revert "[dsymutil] Avoid copying binary swiftmodules built from textual"

This reverts commit 39ace8a63012af7d6ad7bf065c233fd3d5df44a3
while investigating Linux bot failures.
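
For context on the chunked-read API introduced above, here is a minimal sketch of how a caller other than MinidumpFileBuilder might drive `Process::ReadMemoryInChunks` with a `ReadMemoryChunkCallback`. Only those two signatures come from the patch; the function name `ChecksumRegion`, the checksum scheme, and the 1 MiB chunk size are illustrative assumptions, not code from any patch in this series.

```cpp
#include "lldb/Target/Process.h"
#include "lldb/Utility/Status.h"

using namespace lldb_private;

// Fold every readable byte of [region_base, region_base + region_size)
// into a simple polynomial checksum, one chunk at a time, without
// allocating a buffer for the whole region.
static uint64_t ChecksumRegion(Process &process, lldb::addr_t region_base,
                               lldb::offset_t region_size) {
  uint64_t checksum = 0;
  Process::ReadMemoryChunkCallback callback =
      [&](Status &error, lldb::addr_t current_addr, const void *buf,
          uint64_t bytes_read) -> IterationAction {
    // Stop on the first failed or empty read; bytes from earlier
    // chunks are already folded into the checksum.
    if (error.Fail() || bytes_read == 0)
      return IterationAction::Stop;
    const auto *bytes = static_cast<const uint8_t *>(buf);
    for (uint64_t i = 0; i < bytes_read; ++i)
      checksum = checksum * 131 + bytes[i];
    return IterationAction::Continue;
  };
  // Passing nullptr for buf asks ReadMemoryInChunks to allocate its own
  // chunk_size scratch buffer internally.
  process.ReadMemoryInChunks(region_base, /*buf=*/nullptr,
                             /*chunk_size=*/1024 * 1024, region_size,
                             callback);
  return checksum;
}
```

Because `ReadMemoryInChunks` accumulates `bytes_read` before invoking the callback, returning `Stop` after a partial chunk still leaves those bytes counted in the returned total, which matches how `ReadWriteMemoryInChunks` treats a best-effort partial read.
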
--- lldb/source/Core/Statusline.cpp | 15 +- .../tools/dsymutil/Inputs/Binary.swiftmodule | Bin 17192 -> 0 bytes .../dsymutil/Inputs/FromInterface.swiftmodule | Bin 17028 -> 0 bytes llvm/test/tools/dsymutil/swiftmodule.test | 29 --- .../dsymutil/yaml-object-address-rewrite.test | 3 - llvm/tools/dsymutil/CMakeLists.txt | 1 - llvm/tools/dsymutil/DebugMap.cpp | 8 +- llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 16 -- llvm/tools/dsymutil/RelocationMap.h | 1 - llvm/tools/dsymutil/SwiftModule.cpp | 192 ------------------ llvm/tools/dsymutil/SwiftModule.h | 15 -- 11 files changed, 8 insertions(+), 272 deletions(-) delete mode 100644 llvm/test/tools/dsymutil/Inputs/Binary.swiftmodule delete mode 100644 llvm/test/tools/dsymutil/Inputs/FromInterface.swiftmodule delete mode 100644 llvm/test/tools/dsymutil/swiftmodule.test delete mode 100644 llvm/tools/dsymutil/SwiftModule.cpp delete mode 100644 llvm/tools/dsymutil/SwiftModule.h diff --git a/lldb/source/Core/Statusline.cpp b/lldb/source/Core/Statusline.cpp index a2ecebbefbfb1..b7650503e16bc 100644 --- a/lldb/source/Core/Statusline.cpp +++ b/lldb/source/Core/Statusline.cpp @@ -12,7 +12,6 @@ #include "lldb/Host/StreamFile.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Symbol/SymbolContext.h" -#include "lldb/Target/Process.h" #include "lldb/Target/StackFrame.h" #include "lldb/Utility/AnsiTerminal.h" #include "lldb/Utility/StreamString.h" @@ -127,7 +126,9 @@ void Statusline::Redraw(bool update) { return; } - ExecutionContext exe_ctx = m_debugger.GetSelectedExecutionContext(); + StreamString stream; + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); // For colors and progress events, the format entity needs access to the // debugger, which requires a target in the execution context. 
@@ -135,15 +136,9 @@ void Statusline::Redraw(bool update) { exe_ctx.SetTargetPtr(&m_debugger.GetSelectedOrDummyTarget()); SymbolContext symbol_ctx; - if (ProcessSP process_sp = exe_ctx.GetProcessSP()) { - Process::StopLocker stop_locker; - if (stop_locker.TryLock(&process_sp->GetRunLock())) { - if (auto frame_sp = exe_ctx.GetFrameSP()) - symbol_ctx = frame_sp->GetSymbolContext(eSymbolContextEverything); - } - } + if (auto frame_sp = exe_ctx.GetFrameSP()) + symbol_ctx = frame_sp->GetSymbolContext(eSymbolContextEverything); - StreamString stream; if (auto *format = m_debugger.GetStatuslineFormat()) FormatEntity::Format(*format, stream, &symbol_ctx, &exe_ctx, nullptr, nullptr, false, false); diff --git a/llvm/test/tools/dsymutil/Inputs/Binary.swiftmodule b/llvm/test/tools/dsymutil/Inputs/Binary.swiftmodule deleted file mode 100644 index 7ba817b22b707a23387b728923ae764699578561..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17192 zcmbt*4Rl-8mF~5q_~+iBd*dYFKqirm2P9i|WRnCtmXoV12MmN(A?Bwu%v1>u4Q&At zLJO~LDoY9o#>l{q22pcHZ6JZ!Re2xSw3YaYY4#v7vais?SPw+z);dqRH(ZUa;_=8biydlZ=$wIHp_o7Y3ijSuFh({Rk@GnmA zy?B@MEgX{gK_B%JOX`G3H2X$fGU(x-SNRhvO)Q4@@+VTlaffh1;>EWX@dGjbxKHTC zN|?;S52$<}mQM-CB>t#}7mMrpeu+Oe!S}^@aZ^2;*emo|1Z}#af}!My!JbsAMgrAi z@l|0%^+?=05ORj%wLZ30EDjW~CMB?_DI7V7AxHG;g zil+g4C}5w82XE_9tmxRI*hZD=E<;TuZbjEF1zK7aGSti%sw07FEOIx^q1c83)-Hoh zRctCIkJm&3wq$%&R=G~c^|WeIv5gsOLeR8gb9%z|uDCsdc@*1l+}ee4hMG|l#kOCm zb^30wj>0Mi`>0a&HhVgwR1F#IBLRDotvIY$u~RJS^xa64N0jOqCW~7mB%N(4P#sd} zD}b;4&hLswu3jwtlP zN;sI}k8;A17!M=7=-~%}6j&oAz-*coBPnjp>@bNxpk}e)Ydqo*U}XT|phv(^3_7UN zlYS53tWV~}@qgxz%lt{qpAw=HJ?#aovVgdBeD-ypUx=poL0NcFBB9fexzz+<3Wrtx zSV}nIfl$IhKmTHikIKT46dp>2rBxv^zIDP-o83Q91TDZmWEDfrRD4wzw8+e*#uIDz z7^>kSzl*OLj#m%ISHV1h5io;J&}Ar2${UN@ojb;9nur23SAD>2h<5{ubw57m8bf}Q ztquShYy`ddssXZu&AI7P`#7yin9O2mh8eUg?k>CY1nl4?2RQ;af%?6CuY=Sx$1J%X z(lHnXT4PELQ1cRR&47c>wvAuTVNVhse%PM4K?rx`0LZN3(SQxTnPT!6LoYLthVoxE%a1Q;r1(Jmd~T3~+FqBMgjM zKd~Fw1$kM7BM$yWS$KgHG*}OU-ZGPkZ^?WgUSNI?e++0&@_kAE0F0aMamgfqJjui4 zm_IX3bjgrMh%!G(FZNTHaFPS~vt0zR=1^*4`5}*R%s~>u@iN0xmN`#3aIgR>7r*u! 
zjq1~~rlsUU$ZIs0zsPG} zBQ^xxKvnPn&9f14G_9s3{THJ!n8Irw9x;&d3&b0+7jyr#1;yp2^o}tB*{4S!!MQnu z(^GD4WWYEK+reZi*3B&O)~+1md#T^SJRue>B?9dw3KS6f z4$hj6yETIET6azB>3#aXG3enBVI{1`JRfp_fi;p$3;gD-!Z8bf99GAc=0V_u7ly!2 z0EZkv`L;h#Dp$4sTIlAd|rTEhB^^KRxh*YPi4Lf}*z9pevWRdM<4Iu*i< zGE$1U*1P>$Hh(7ZfDo`GoCVs=;LPy|Vm~a481p=yrubtq{)Oy9uVp4U?iVs6xQ-4A zh@h4b$eqRZq*4oxaeB4_kmSVz#S#e`oePqEWie1eyb9bMMWAPlHiPH@I*JGD3o!!m zDYMLk`Vz%-A|1*>5c1g?_U$khi;|g+$!-^mx^ZIB%yjPW73LDjA(bEY5a9sj*#GPX z)#ci}Ghix(1u_t29IRzrTcpPz0Im7;i{KcfS1HdME-M1n2yz9MS%A>f^JS*lQUpW| zdIW$kGe~;pC1P}zq||7%i!_6+>fTvIs7VPLPm00{Xbgd4i1B!*q1tJEL_CWZTkV_m4zW_Y=Y8HFdp3Cm=8ME#GgnN*hk}txky(Lku&4&_z~VC z6e1^u)IM@{IO7Q<8-N(gEk$)Dg8mrBL1|jpORsJGkD@62KS`Ma)ZB~IG=;2$Qe9$% zG(bHal#bcDm8yyOs&ulz;!j2q!KFNs5vYzVhYXh^R>2(b&5ZE!;|NayM7i3TI3hB# zODi@qn%#x{AS{IFL1Js$?V*sT)XXZgwGVFZ0rrX7vrL%Q{ZEmU)P&=9#IA?|sbWSu zPT*pgu@^jtg#c}Ku9u8xPX_Gc@wt-RaRfx@AwHsz#AxO)AdGmEwc<~Jy>!-tD4fwk zVvvHwLG;abG}{>>5W2N=oU+Z~cyKYSoDN@bqalcA5ssxSuNZ2^4R$!27Q2(sQUF=1 zEo!K7Hk@GeqVaSVCgMV*)z!!fuuM`5;_=`X&>ID6I8e^)v9|;03t=fO(57}aOl4d| z3k0BH7*YI2V>e8r*dlQf93lgCuoO@?9Ka5$lL{Rd04nlCR6}1N z;-QpWc#ds<9wGo8sOcz7F&o~+&J*Yfh>P1(W+3*EL=Uu& z=a4bN+K7NGk%3nNJ+SskJWR-{{zVp2S3O2Q4D3zJF>FzIahF9#MT+kiDsrU5Hpf@n(qgOSeDFe}u6 zJei0^p~Ek>12j4BuoO^Q_&jg{J5t&p!1a%4nJS|`@zz#QCUlDRApcCj#>is!Wb3@M z!UcS@6iniw!CzrAjinqyua|O!7g9Y>wje*p>5Ps5h*+8YoCprnYv3by^i^&qF!i!j z8or<9A}AfHGD3Y?=?+1>^Cq0-MiuOZLUwLYVNOnhVjXrocb5&H0*k@BG7vWqKYitx z#22No56-nv5VS)?1XhGn0M}TWAr7VZLBt(84fEonO>jrbHR2R&8T;S>ig;yn+Ux3A zT7(^_#KJ?Vp4UuYz5FW8HJz5#Z_u6(Ko0DA7fIRcE zOyu*x_7NWsFM+QxUX6a#g7`o`i~w0t#kxQ4KE0Sa0*hJ~V2XYQ5Dp-dBtuY(5Gelj z*U8F2wE;ixR0EyxcrAK@$KV+nziYNop{FJex9~%OCCQxUMP;(oHxciN;}fL)h5Vl`oL- z(#wxn0ABNF*o(t_K{CG^)9cjnIA)4hj~knz(4=mTHJ2mXKPs8AuityjQ~7eFo-VA8 z8f+v=uwgvWxZmA48nlfEn>vlnkwnw5ZoY3_4p#2v#8k|D)?uF1c9&#I&!1Z^JiJ*r z=MX2o<}(veBQvcmO!}B4obU-5N<3o`GkG8Z9>2+oi@qHRG)19RJwiN3>I7J3a>9H8 zR&wZu{?G3u1I|#eY0QY$*%fTkX(YI8;#9B@XAgfN}!86R5#l>TX`k_yjiBmqcGmLgN>m?gAQ88)hiFmJPi0z zzVf1PTwgP$F;vh79~lCu2%hs0*6k}JDuggY#3>Ep(1Sz#JZ8R-Kdb+pmu>W8aKcH5 zH#W~75Zb=j%YV#?9|7(jA`2bddnRA54hPpEWdmm=-RlURN(0(_o9=h*9y2tDh}9ZM z7x>^t2wL}nyHP{ukkXiRH>mCgwE2U>9lJ-B#xY1~YzB6;0iE%t;b0?z$&kBYcpi>ye}xYzfvn1J+^&2XON_Z_AHI3Z>P!S1P2__* zsjP=K^9Q&6_XENi$^2oAKRba?rVj#AN0kQf{K-F#GYcSY19p7jz{oKXo}AU+*BQ@?1nMib58@iDgz zCP@I zuK=Tc?lIFtCc_2&R(|$3!*Bz)Ee2|JH$ebIuKYc=r#m$t>OvNgK>nah0le4jh)fA5(%Sm6SxIo3v8g*@4H-ec&7iV;@IJgF2T^2iOv!Ks zM1Cpj^wRe8E~SBzXg4^mSr=s!k6tl=aY$XETkx?Lv75zvJrdvgISloclNpyqZ_voe zItHw|n{bGj3f4q48w8Q$B4RiP^l9uJ_r6P>@l_^y=2@&BQJP|i^sutA8PP80UZ1a> z*84FNjz|vkhx#2{wjXnZ3{BvqP6joAHh+Tl2f%l*RSJkTe{KcI-w1hYiM^Q#h>TdsJ}a)qQ^PJ14dgH(2!`FnkQH! zc3pim#nQ8Rd3=Ze>fV_73&}j~5c?3DWbq*O;WfiwK9*=7XK;*^5oze0G?~9A>C6r6 zGz!z{#2>o)1V?B8b1}Ux{GXQCkhhK#-6G{7AD&0CZM$KauUbCUc_Y#~eJ&nssRA%A za%`2Kd4f&@QKexj(S##szByYC-@?Nyz`yzvZ|@Tn=DXrffFU;ji8oOexv>Tu5(+kU z1$DiF=!A3Js6Wyf+b=t$Qk2l;u8zq)vu8csu_MuS*>O(P7#yK4pX0=Pu+IBo?40t9q~ha4mEX2j0XgtN!J zE|$Nw#Q2IqkNJWhTyOqDPeY4;2#cufw2yN}K4LdD?c+a7A<(fDBVQ%lbgOmDy&m~( z{v~3G-0Oe86KKNU=DRzbU!nRkHJ#yBV1+()j#oJiQe>D2%iT zukz{qqN^X_x)a$)(2fK|PhPiIoscQ3p7bFRV+U!#1(B0wO1cvqced3EyC8|pm*e4I z)(aP47ZstL|I$H+htIUb^1N((cdQDWe*x=aEf%7oUizWPAs#{on=~K2=#M7yN#ds2 zBV@iL?Z-f-kbzl@PI2r!quYx5zc5WgJC9oFB*PLwy(^ZTy3ranHsMs>tu(@4#{u(% zecb4b(Ff<)yoA1MkVQ%)Ipl!&alpqz63TQPW}PJRYp=q8U}BtH=F9L}hHlCHGMA4I z$D4e01g=L1WW8}+F?-t){4UH&L~sX8H&ZfQ>h#Xm3ULeO@8OeH3pY{u7p|AG#hA|3 z1?kK2kpONE={I#0Q(cKG58RaFYOgzh>t2)^+MS!+Rd|YGE0iAM4iFc6IU|fMdG}h{ z1Y_~wE%?PAzgTN`Ib-p}wz>C;c4z3Ny3BnvuBtMZ2>4lEyVIU6_K5MvQ~WP+otu)% z=UjYa7~Ns=5!^l8e4anh!{Zlh{D6*YL|iwVNI435s)2|EY9Iw#lxTD-HMn!h{rMd? 
zJp1XYD*bRQZahI3uvGfB3Po(Vj>47P=_p<8)35j$1*L1yP}em_Sv!7x(W|gI{d~!s ze$Z}U&Mn-WuI=wo%^s>F@ZkjHm4*4 zCco?@+;HH+4L_-q;qz~Y=8969EXyGzqorMT#$npnD&(dusS?y~edx~_|p&0}PC zSkCVBzKudedPKE|*rx#(^tk@SJqlerrY8@*bQ|650eRWWG~60SaT$Ui-EooClf4s4 z-;kW}Ni9N|P9g&e0IX33#hTe_NcZA2WKbfcRENod?8{BBEA?|qk;I!(8-$COIi*ir z3GQxj!G!A!d;qpHRe?R1XfJTp)Rz))rG=7Ie9*q8U4+WUl2UHJbdi-uyY0g@7Ot95 zxQc5rT<4-T_L&?DSOixT;x$Z7D%DaAp`3=JB99kR@)~U(+&JQ<8D-;hW=lmIwAasL zHmzRg4Q(E;FoY7~7-^BHl}dCbWUcRE(2j+S5A_bFJR7qM;OdWx3RoeB#>eO+?pw_S zo3}XV9sRp3bBE6uQ0EQG#>F+iVzll`VuCfAa90DKbvr>~v}M zeDWWup3>}VcI7OIxAGs%Ts%`>&G7j+9(Sz@7ZoL=G1GSp^(1(R`bYo?|P zB?i#3f^q|jIGGzzcUf4g?IZTmDyymxe~GieVWvR4+4RhuBnj(v4Z~YxGQCMW=)2B zc9ijD_&jCOUT9WG|F0Kt;9hMhEhvB=VL1oN27zNK9Rae8HalpsM&miumNQOWs;#7z zw}RXURElW84$&IwHQ-`c_*rd^B!AqCz`|-^v0vyN+%!jyOf;CwmclWKw0a`tO-F;g z*Es_D|6CI&m-VuooSCL49VX`OlrtGu_)mWlGtFiW;Q}oX{85TZ@rS%9DdUHrom;gr ztT>9oClwLp8b>aCJuL=O=58w0V(ByfTMZb2s1+5OIeeK-W4d3;2$S!la29ADh1|ZJ z;?o?FiVE(r?}8CgAC)da0wboVgzHd>6#=CyTC#{!O|TF29FeFMpG62kxtxOnq2`(N z`wEBkC^?Aw>HpXurS1dn@cI~B#X+OBh726=zXFw3TG!_`(bJW(?S?*$w>aKM-=-D;3*VqnYzI(Ucb^n78 z*R82t&2c;My?*t^`qgzDcXt8DZQXhA+SN5%X`PyEKL^LXqREk+`c-__difsO_T78# zs;=y@5(ebasaeRn;ybLaN^ z?s|m!eRIpMo%h`LEzi#T@7n(0gZM=K?DU+plhb+2j(hHX5HInk5%UD_Z+wX{)GC#` z0}B^bw1-toP&Jjb^-Fs#u7Ihew6axQIaM_6+D-pFwKB9hHdQpaT&h&?pk-84Zm_h5 zXDo_CFqQ132Z8DG)}P?PZpmYGRZJBvIv*^acF8W&#K0q# z*4@(9^Fd!lacTL6uw&ZQu-Np9SW($6m#ejbO+C^r3(_;Od4D6hcCvZdx7QcA&Ie~K z8%D+R!D#)cRL(xlSlo|d>bEVeS;3UQx3q?62wU0kk622zk#T8NOJyqr zu{KQ=DW#Rghl|UVP-XXwrRz%e&;ClbtB?s@Zep^SdROm-f|QV{bmMLsxy8g!yRcqQ zRUo5;-BNNXdq%5H6?L?03VWxbWj7N${sH8X#-Qww>8X{b^DQK5@zo(|QE4S@&vRq( zUQ+C6$@EK8MO!0mk^#*;Wi@1^vQ@0Dn<^?-y34y(KuVbGHur9+!WCM`j99#=e%jUg zO*Ua#ZN;w8-jy8J49voHx%!Z_u*w~NMa1Wtc8;5ZZx~K<7%e&emgL|TlVl7OlIQhN zk)$(x?eb|?kGhhS5EH1wVOIuuzn(Ev7X-m)q%>o@I|Lky>0ah-H}%(_ z56ZM^DfWHUhT?_=Ev@R+4qDLv08`Dt8tOr~3l=Ye&6CONdLS2bJmxG=+rpC4e$wh= z*J^!DfCGdFh|1lM!l)SRhFk$y%&;wdC2evKJ!Z};RcshlSGM`h51ROrQY=sUW5PZh^WvU-Xtr57^%D?1iAljIiZJboV2egBifQt&W2Ho zatccS&R$ZMR05%@pwHo6_T5xc(LM@g02|m$eLKR+2B~h^<^La*Oc|!>4yN3$R>K}s z355Ip4OnXfh^^hzTTYOI5gq%UnXyMuXDqRnr=W1KaB8@`aU&}Q1b(7enicWsp+65NX;H<9;4=OsCkl_pHj2e*iu~fl(Dj( zd!tbfaGz4Mc^#SusM(9ge-aI$-lD(^FkASof>KlXx719c2~8CkuKSDwHThFc*&kkE zarwK^$o(rU@|~h;@wSVq-0J@T&thMq0k%&UwMv#((Kuc&!j>z3QN; ziDX$pFDuZKgUFDNmLp{Ym6w#Xq2mpjpTyU;-8SEFmI zZ9!$)I8~OT^t=^ZLD0PJrUIy_`)3A%_Dh(#j;3#h@H~BTFIgDK!2uG_D2V ze>9Yux<*Kzl6q74f8$#?fu{460noh1PMou{m{&?o-7g#9bL9sNE$z_2DQa5&l|%D8 zG^eO(d6$|q^p%>H_vsTgE$8SHH7%deCu&+!Xp~R6g2{*d{wR>~HX)-Jw)79%(0o8m zSrtIhiUz2=m7sSUnm4FfbO)L`YTD44%G-#>{q{>OW$+Zsx5<{Z0-Abi)>5;Mn)PU; z>x5#dkMwkynj>f$ez^cnWO=C)VVSF2sKx41V;7>c_Gb;^H@&QnbkE3!9!@O`7hK>1 z+;)G3T-sBT@;R;;6DgQ$K_O?5jSjBPQMYwXWBoOC?bobcU0?55ySnk3ZTP;m Vu73Nrt=F_KF#l=&FeUr*{{U!EIS>E< diff --git a/llvm/test/tools/dsymutil/Inputs/FromInterface.swiftmodule b/llvm/test/tools/dsymutil/Inputs/FromInterface.swiftmodule deleted file mode 100644 index 2873ee93e137a7651f01638440c22ee3c3b4d558..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17028 zcma)j4R{;XmG)Rt{4*0Y6DI)&vWaZGAlb4ln~>VEOhzLE4un=o%um1N+bZBt(iRXQ zQ25%WvZR1uO*;u@Yf8FnCvxJnP9hwKHM{=42#F1bHB@vBh28ExHaL_l5JU(I-pAej z-g8HiWsO6dC(n&EbAQe~=RNN^SNG)IeFNr#VvhUCMI2XDFcce(SF1`*RI$wltkHON zSG;Pa=@sQF{@5fxY!Qyd_>&fXFvTB;^5TXh-!BV>(CIn3@Hm4_S?~dCem`AaV#I4;JXQ&w?QEdB^TBmP= zbqrQ9*vFKrx7pKKrE1t<9}U=(Y{e19ik)Inr|$-uJfc*`Fj?FhA?a+>f$ES#Um@IF zSMQC~HLTQNT?p6{u#YO$n0Ulc(;cw)DAq1WHeB!&*{3f?mjAnyFlT6=a99=wQo?~0 ze}oeb$9Ndwq=z2@Qecgg0JCXUjHI}$=Y~oAel?2)U(;cS04oCs2Rs6XV$cDVo(y;h 
zXZ3Cf?w8+e*#uICI8>-SPV(AWJSE(iaslw;9;54poI0~{RZFax93PwWA9 zL0%T&u!BD-3omej2J0cvTV^uxEt&7f3(W7~j{?m}zCX$DhjFt#E}G(xC3%<}^Jk`s zE*tg;QRXM<#eV7%PH+H!wu=DP98OKHJm?XQI!Hn|US@d8GVdt|4i-S=;@5wpQGHt0 z^wd8y2tz7=MB<0wu4tLZYVR@=dTWlcO~8pS)T16f`Xu25BwTn7`D)GOFYwyegbhJA zP!&8t^K3*MO{-~1|HUW_rSO`EM+|2C0`Ugy#oRw_L2;!iy<<#3_URQ!aBkk<^pu<0 z`4I##^uxMpt_>U;_NVw03>b%CJD5zxx|t>3+L>c~FZDZ+C&ZHFM4){{fdWF`fjQH0 zH%IVY>#k`%y-(jahdlg2tc3NL=R+jK{kS)lTap8d|bl_N+p%4Jq~s zijSHuiSwy%`wt+mEDS?qlazjf@!$qWebBKc{$#4aJ{CvJMY@WJoEdl9kMJI$5IHHN z_K~y08IL2`0K`~sDXOax^v5v{O4Gt#dTrx>7DeIzNy-$U=02pRDP$#->JlTQ0qW_b zbj;SHR87X~(#ZmgKN&>?m-9$QpgOV~GF*;W1#`eRGs4S{BRmBVAqo-) z&^O!BY-fl-=+@G4$~H&h!KJWrI()&6h9RCsIGVD&VyKxg*x_ti>`p>U0c5GRsG-K$ zc%0FT#?x7th>MX{S0gLHGD$6n$AeoyZxpEEKsmF=-VUHIgr&4Vo7&kpopBK@5P*hZ zMDd$VJur=8i^NHAhz!)hQb65E06VBoDs<#pA)2$G z|9C$odLbD0DR)Y28wq@M&k z%%CBzY)-EOSSwY~2+?bxmXTWi{(ftQ9}=+u9dJ>iBK|ne0N5zN3AC|L4Sj)#hf{Ll zS+@NJhyZk;rlT;$YuG;ZGm}0$P9^X>MM&bCdxR zPRQKW`+0)JqmSxKRYrZ{&26Ae=oITg{@H+yk;UA})_Hq{3;1R! zn8ZVae~-yDmU0MvUdj<(NcBG1iu@d>GdcnwVrBAkA~;O1fsfqLSGkG6)W=e3_{9aWKUXA@0y=m=~99gga8M5vN$o*artt#4DTAUSH4BBJ4mV z79LLZzGnLBr%N4Yc&n&V{2b)9g=18KkOKFT=Xo8JDc!Qr(&1Yp{h7)zmJ3hVEHfeN@ z2AxTFlbT2;4u}>AsAKWqZ!d|c;#tZ3v1)!V$)8S|-}RW!#rRKB<_|q$DqkjfW_R^C zKx}kQyVs2*bbHzO<30R-iGNQP&!o&}^GEwSud51!bQ8`PwOBACb)1*Y7>%>3lg-PZw544K@-b z*f^1B+UIT>3)&`v&0WTpNTPW}H{Z7}1uOS)Vk%}n<1kNYyGt^q=g%z{9@-?Fb%;}5 z^XW;bk(pK&rhH5iPWXfjC7!m3nLLmHkKbg)CEtkznxoLF9wDB?^#Uw2HEBK%D>-yS z|N6VhfHM?q9yg+Ob_bhv8VRl#ebnCVc22n0jk%k~6ZT}H86Q-meJs%&!}g6W)9$Jf z_qt@lnUBx5p(*~8N%Jv_`9B=lWIZGGTi@$Z)=kn()gkvaqwW^n(#C5qgEc?Gwih;~ z=gm`gNYLe87cw?Sg4gH}5!`l}w)PQ^`D2S2s{1To>68C<$~x-CM%DoiW7}mOE~QiFW_+UbJtq|&V9iP=%W4X68^kJI4z4G zSj10!dJ}mHKdBeKP|fel;!%lk{GXmQq=q~qJmL(P(?V9!8G|eQ^nPK=BA)dKXC?7- ziLms(YCbKS`yA%?Jm$~z%K!Nf#|`U76D{y4`~~lg1)D;NMjf<_YgQkSc^L4ceC0*o zxV~mwW2m4FJ~9kY5j+4-@_fEZB9S*KT$_CC#y4Mjrl}5DrHr?miHEw7L5vw(lF7UyP z5VY&sonLY?eonbO8*46R7dqpU0!yd!#b!x)i9du^2Mj;P6=^>KA zFg>7xS5yp0=Fe3AtV$S%e?AYqdi0?7J{hha4K|MkS}^?>*3s`^XQjf=Oq!8WAm+f? 
zq0T8sMx*)2ay%WjLSk&_cMDO?T@_J_c*aXqaatvCf%tUjO#PzG8cno-$H(0^m?TM< z2V0QiFBDYVk5w#ar~mU<)?OKuP+GCYnP=9Rj@eii{sx_|H}IM>55ZdU8i5%-qfX(^c+Kx)Pt1wWEFAr!g^mER-Vve@ z5m5a-@sW7ON8$23mp|*L&7A>RI4DAZT!BEi9{BO_q#4h#K6vxL zAda3cpYGCps2f>C0{Me31@KjK&fc=M_!M-l82O<+kVB zgd>u}{Gomam+ZqFAwx4bsf$4kpv|A4<9_fRY?T6H&7WIDVhD$MCf{hO*EZDPlmx_P z&yGDv;IP5Ca~g-0e2)rVdu6|0d=K;m0`U?V=%)0&`M4E{(4f)Q4KyU0rxyqoiJey- zNwM^7K_0)`e`R0H{DowmaftnhO|p0Z`|z6KFCR;^kJC6t%7`>{PMXT!lXQ9mJB`A0 zI`N0EJkHVC|7=XJ3;(AjHsq}nM7KzJ$cGnDZ2K-)=Bt)Zb=`opPM?bhTdM$!3mjYJ zXP%(bKvZd*PBi1lnQzY4Lz{Vc1^8Eg;_Z8a!hCn!2{6PKKJg~XA~)24Lqfr(?x3zW z5S?&N81+XwW5*>2Rf-b2+|@C)d+w~KJ9Z?xFFD4E8iOO$<#U|49sB%X0nq=_HJ6R# z4@%P66;wZ&Z&UY6mm-j!_L|T5b*!=ZyRkTqtSt+T3!Wm%_`PI)Pi62zIt{2HI_Bfa z@%eT1D*&aq$P@FI51zj4D8QNIKPN!4@R~n?_YXKa2AdHOaQX$dy7X+gaq?3h;pvu4dCcei;Ck~HdKy~#Ls&#*r+u6=@)5hSc`yH23W1KL82KvUX44yyOE#K9}*s3ePs`tKr8A-%a!sC{-HxEB!%%-3zKiNNhy z_y=6_W$Jinh(8M*Oqz#N=KUV?V2Z_={7uQ7uacd|-7Uy$k;dmwbOi<^^^~Z7&}MyJA(~{PS2BYq1aw_0kVT4)Gu|*rfT$1%EV=PY^fF9U=22 z={O29g$&GMbc$o=8QoUY|BGo7+IiGUCmEIi>Rqwyr5mhKV>3?WJxUY&bpkL?*e8t6 z7=3V#FG%P+hghUUl0y!N9|wFqB%w^#Vb)0^zxFEp2PVeJWuXkOb$D~;m$`g&INs!| zBXB)BAnT3mirL!_<9A_JB7!?$x|x#cQm1dOR)|})a1WofM!1p6zi_>jEyi@NDM(+A zj|OmaNWZD0nCfy|dEllTS9?7HT=$~X(C*ymuEJ9kTcPw2cYwIq%Nb#8&AZpqCK!(g zH{%z3{9>)$<&4J@cg(+6v^zsD)@SacaaEPMM8MDT+MV`Xu}6$Qmg0Yf>)e!7KJVfi z!{`o^kKpd%rgQxMULL<-;|FwHBjUQ@c*;@8Qw>BUPy;E@qC}%dsllB~?$7VI;n_!5 zRq2Q0apQ5ifThx}Iux*q`6^n-Q- z^KRkhb?tbEYW7eafgk@T(6U=u+#TUIF~<;z$dd-T(Ak&070I<>>qXg{ws|EHF!?1f z;)VkkZum)^44;2HG+&g`bn%Jq4r`t$YhJ^p9Nl%Inywlm&zqxd8C5#q-ibiPs#Ett z0Hr1dsI@(fG8%gl@3Ol9YU9v5-d&u&D#b+~)x^@Jj8=DpcbBB!(RE#%Y#t-C!*X_~ z_iYp+(j%%x#6AtUpvUzm?osI4F+F+c#arlR56H`2rs38wipvoE=#Go5-t3)F`iA7V zPihs)bP^d*0AP(ODAvqYL%NryA%hYjr8-OwWM64|U1^wCiX`5I+8|uK%qxB3N^p0J z3npA=-~+IosS519NPB^+rv8+GD=n0y;)C`z?IKh*mXvb)rHiaQ+HD`Mv2fLd!c|<0 z;W`(!vCrhb$W%`0=9K?TKWDbA;CO1Xe1X>}_JyP<*HXL;AJO8fU|4sd5>Dr^Y z73IR;YS;HfEhs5KxAQ4zk`Zmae?~x&{X=u)AOMA8bw#Q14iKsE>}?) 
z0*$7xy8cxwJT-x&02+d- za4@GHSKKbo?JgJxakE7wA-Y1^NV)1EEgO-I^(AKJxwnU?11-s&K?Y zRS)7Vmuf0!nFp0!gyK{w38n}0k}6TEot7*rvD!)x36U1dN7;n{|^Fzn69aj1hIntp13PzcV{~A7MEM$_9aBDIEc_j5arDsYc^D)Rr?&U9PR9mA8W2 z2UUt_zz)$G>NVhESom3OjwFA~i@?HaV6k859o#fej!ZO|&z8bbiL`n= z!Y36Gf`wT{~QdPi+Tz2#2!sQzmX2R*y0_A1*U%y-Y$JMVko;rgp< z*Kpij1su0^$31J;)NG}>YqIYg9QTU0c6Pc~@m=fXd+?6$ZNIbHQp^?NS%taHQc)2e zsJz{=r0#qn$&07j9~*SgfRCFC$5e~!+ICO*qRKXpt*BC3YzjWn)~j6FruObwMc+&% zq4rkO@_N^{+tt1wV&XdAj_nU^-MOu9=RLl<+PV$3>F>2{x7+XCxoyW?TkqV)nRt$4 zI>9=oiuC7i++qOLD^WC>&Cp|CYxNo`dyZ6oqckI}9@12iOzi(~Xxnuji-}dad z@6K%xJb+Ks&rZ)tn>n4g+`avt2k;VqnlMiQ|HhXYLv2#IJFs|3MMqe*1XWW>`+&5^ z;tH5bN-Nvc)zd{Yu3hxcQ>#OpV$(%aE2T;W4_e1WWrL+HJZn)Lf~jN=JqXN{xBUbU zc1a$it75un$+=+hj7xTz9^Xr^@x=9{RJlIPMxP75A_g9@wC$3%o(uXaic8DahaEGn z#-*lL#EQxuxm>LcZ0wabFG|nE=KYQ2+QH^!-(FwjIv1R^tREB41)~jPQaSrHYjHn{ zso%D=Wd&1yX=w}55Vo@6AFz~a$(7@u%3T3%BIEL^*2*>rVr`x-Qc5d}4;7ayp~{|F zOZVmMpM8~VS0NL++{k1x^{v?h1t}p@>9xCP=~^(UDVm3 zDeRq!)?G~O_y>?j8i%q&rl(e$&b5-L#aD);C8d?LJW3r>n;Tfyyt@c2ce(}%sJzFbk+iqEe}DsR&mA^ z`;wqx_|5=UEH2-Sr7f;UO%3e>v?XYQW`ashUCZgKJWd}k23!l?+l{xy5|mo-F6Q(u zCPNn-0O}#%^&59d@?MfAc6WtqQRvwM!%L<3(zC+bVH#*S7nEt$QtbPR^~H^gTHDks z9kihTex{nitEmU!E?B$-Hcuw6>xEp*@tCtfZHr4v2S}@reNF3Y0vsSbKveF26h_5h zH{=SyVum}ym(wP9(_`kmQpNf)b#=Sn{D6rsDaG=nZw3U008>>1{!OdX{PY5SAbNQN z)+61vUk?kX$)KZ6@}DhUR$3YE-D=qFfBO1el9j}Qh2YEF8~ASdZ-{bg6}HtSljz>X zdsmzIf`|&u;Z3p^-WLy-I5%LzSfk%4$&_K5>14|7ZZqsQl|Z=fZ@^mXL2MnKzH)*T zjOf_=%&f%{Ds^1QE=zq;`t5hXF8w2zppo}D6NzjRt`QhD~F1glo-DLYp#NnU1E53?TpK?1kgTn z#rmYD;Js&Mm)l=_rKi~XYnm`%DyX>2KWmA#J_Ut?g^SDnnV!c6u`6)GfjIG6GPZ=u*xHp>Q0QV_1o7SPZpPD^r z{3p;5>MaV)0JDYPDkwFDe@o32n$UEC;kwT_P?JC9l>OmV7MH&VjXbc*BHu2m7H@~B z%5DA+@GSNv8escWQJZ9W6^-NdB5b+hM}7c=n*G%L9!;mRtM?8}DXo|NJ*#Q9)t26A z%tjt7eh;0gr2v7D+rp` z-B_5930(0&m!*VFV()HGAGo*EZ5*HUv6HQ%6S6E$0?d5-LID}DMVHGe_P|EA_! z)NG~ZPBdlTUIeQ>w_Xl$!?(x=7Ik|o%2LZ7HZxf*V=8Ya8u!~Twv@qBEZ-qp)(U7Es98(RI%=*#BV8vHOZ}v$L)08b)A*}J za3af#l?cmR{bDUvmm0efowYw#BYqpQ)kO3ts1&rU>hR7s0}ZsIT_$K^g>;zBM!?+c~HNL9Ip|0ON{ LI.swift -# echo ''>B.swift -# echo 'import I'>main.swift -# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift -# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options -# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options -# output is "B.swiftmodule" and "cache/I*.swiftmodule" -# -# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule -# CHECK: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule - -# ---- -triple: 'arm64-apple-darwin' -objects: - - filename: '/Inputs/Binary.swiftmodule' - timestamp: 0 - type: 50 - symbols: [] - - filename: '/Inputs/FromInterface.swiftmodule' - timestamp: 0 - type: 50 - symbols: [] -... 
diff --git a/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test b/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test index 74e8c1e7ae777..dfa0f285c5ce5 100644 --- a/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test +++ b/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test @@ -12,12 +12,10 @@ # CHECK-NEXT: objects: # CHECK-NEXT: filename:{{.*}}/Inputs/basic1.macho.x86_64.o # CHECK-NEXT: timestamp: 0 -# CHECK-NEXT: type: 102 # CHECK-NEXT: symbols: # CHECK-NEXT: sym: _main, objAddr: 0x0, binAddr: 0x100000EA0, size: 0x24 # CHECK-NEXT: filename:{{.*}}/Inputs/./libbasic.a(basic2.macho.x86_64.o)' # CHECK-NEXT: timestamp: 0 -# CHECK-NEXT: type: 102 # CHECK-NEXT: symbols: # CHECK-DAG: sym: _foo, objAddr: 0x20, binAddr: 0x100000ED0, size: 0x50 # CHECK-DAG: sym: _private_int, objAddr: 0x560, binAddr: 0x100001004, size: 0x0 @@ -26,7 +24,6 @@ # CHECK-NOT: { sym: # CHECK-NEXT: filename:{{.*}}/Inputs/./libbasic.a(basic3.macho.x86_64.o)' # CHECK-NEXT: timestamp: 0 -# CHECK-NEXT: type: 102 # CHECK-NEXT: symbols: # CHECK-DAG: sym: _val, binAddr: 0x100001008, size: 0x0 # CHECK-DAG: sym: _bar, objAddr: 0x20, binAddr: 0x100000F40, size: 0x50 diff --git a/llvm/tools/dsymutil/CMakeLists.txt b/llvm/tools/dsymutil/CMakeLists.txt index f88b1d0b20cef..efe28bda68ebf 100644 --- a/llvm/tools/dsymutil/CMakeLists.txt +++ b/llvm/tools/dsymutil/CMakeLists.txt @@ -32,7 +32,6 @@ add_llvm_tool(dsymutil MachOUtils.cpp Reproducer.cpp RelocationMap.cpp - SwiftModule.cpp DEPENDS intrinsics_gen diff --git a/llvm/tools/dsymutil/DebugMap.cpp b/llvm/tools/dsymutil/DebugMap.cpp index f1cd7e402f28d..b38d502dda7c9 100644 --- a/llvm/tools/dsymutil/DebugMap.cpp +++ b/llvm/tools/dsymutil/DebugMap.cpp @@ -161,13 +161,12 @@ namespace yaml { // Normalize/Denormalize between YAML and a DebugMapObject. 
struct MappingTraits::YamlDMO { - YamlDMO(IO &io) {} + YamlDMO(IO &io) { Timestamp = 0; } YamlDMO(IO &io, dsymutil::DebugMapObject &Obj); dsymutil::DebugMapObject denormalize(IO &IO); std::string Filename; - int64_t Timestamp = 0; - uint8_t Type = MachO::N_OSO; + int64_t Timestamp; std::vector Entries; }; @@ -184,7 +183,6 @@ void MappingTraits::mapping( MappingNormalization Norm(io, DMO); io.mapRequired("filename", Norm->Filename); io.mapOptional("timestamp", Norm->Timestamp); - io.mapOptional("type", Norm->Type); io.mapRequired("symbols", Norm->Entries); } @@ -238,7 +236,6 @@ MappingTraits::YamlDMO::YamlDMO( IO &io, dsymutil::DebugMapObject &Obj) { Filename = Obj.Filename; Timestamp = sys::toTimeT(Obj.getTimestamp()); - Type = Obj.getType(); Entries.reserve(Obj.Symbols.size()); for (auto &Entry : Obj.Symbols) Entries.push_back( @@ -289,6 +286,7 @@ MappingTraits::YamlDMO::denormalize(IO &IO) { } } + uint8_t Type = MachO::N_OSO; if (Path.ends_with(".dylib")) { // FIXME: find a more resilient way Type = MachO::N_LIB; diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index c8fa4dbeffb9e..f6a35708dc076 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -10,7 +10,6 @@ #include "BinaryHolder.h" #include "DebugMap.h" #include "MachOUtils.h" -#include "SwiftModule.h" #include "dsymutil.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -784,21 +783,6 @@ bool DwarfLinkerForBinary::linkImpl( reportWarning("Could not open '" + File + "'"); continue; } - auto FromInterfaceOrErr = - IsBuiltFromSwiftInterface((*ErrorOrMem)->getBuffer()); - if (!FromInterfaceOrErr) { - reportWarning("Could not parse binary Swift module: " + - toString(FromInterfaceOrErr.takeError()), - Obj->getObjectFilename()); - // Only skip swiftmodules that could be parsed and are - // positively identified as textual. - } else if (*FromInterfaceOrErr) { - if (Options.Verbose) - outs() << "Skipping compiled textual Swift interface: " - << Obj->getObjectFilename() << "\n"; - continue; - } - sys::fs::file_status Stat; if (auto Err = sys::fs::status(File, Stat)) { reportWarning(Err.message()); diff --git a/llvm/tools/dsymutil/RelocationMap.h b/llvm/tools/dsymutil/RelocationMap.h index 5a804cd141c38..3d851acf2b892 100644 --- a/llvm/tools/dsymutil/RelocationMap.h +++ b/llvm/tools/dsymutil/RelocationMap.h @@ -37,7 +37,6 @@ struct SymbolMapping { std::optional ObjectAddress; yaml::Hex64 BinaryAddress; yaml::Hex32 Size; - yaml::Hex8 Type; SymbolMapping(std::optional ObjectAddr, uint64_t BinaryAddress, uint32_t Size) diff --git a/llvm/tools/dsymutil/SwiftModule.cpp b/llvm/tools/dsymutil/SwiftModule.cpp deleted file mode 100644 index 7b21f30237e4e..0000000000000 --- a/llvm/tools/dsymutil/SwiftModule.cpp +++ /dev/null @@ -1,192 +0,0 @@ -//===- tools/dsymutil/SwiftModule.cpp -------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/Bitcode/LLVMBitCodes.h" -#include "llvm/Bitstream/BitCodes.h" -#include "llvm/Bitstream/BitstreamReader.h" - -namespace { -// Copied from swift/lib/Serialization/ModuleFormat.h -constexpr unsigned char SWIFTMODULE_SIGNATURE[] = {0xE2, 0x9C, 0xA8, 0x0E}; -constexpr uint16_t expectedMajorVersion = 0; -constexpr unsigned MODULE_BLOCK_ID = llvm::bitc::FIRST_APPLICATION_BLOCKID; -constexpr unsigned CONTROL_BLOCK_ID = llvm::bitc::FIRST_APPLICATION_BLOCKID + 1; -constexpr unsigned METADATA = 1; -constexpr unsigned OPTIONS_BLOCK_ID = llvm::bitc::FIRST_APPLICATION_BLOCKID + 8; -constexpr unsigned IS_BUILT_FROM_INTERFACE = 11; - -llvm::Error checkModuleSignature(llvm::BitstreamCursor &cursor, - llvm::ArrayRef signature) { - for (unsigned char byte : signature) { - if (cursor.AtEndOfStream()) - return llvm::createStringError("malformed bitstream"); - llvm::Expected maybeRead = - cursor.Read(8); - if (!maybeRead) - return maybeRead.takeError(); - if (maybeRead.get() != byte) - return llvm::createStringError("malformed bitstream"); - } - return llvm::Error::success(); -} - -llvm::Error enterTopLevelModuleBlock(llvm::BitstreamCursor &cursor, - unsigned ID) { - llvm::Expected maybeNext = cursor.advance(); - if (!maybeNext) - return maybeNext.takeError(); - llvm::BitstreamEntry next = maybeNext.get(); - - if (next.Kind != llvm::BitstreamEntry::SubBlock) - return llvm::createStringError("malformed bitstream"); - - if (next.ID == llvm::bitc::BLOCKINFO_BLOCK_ID) { - if (cursor.SkipBlock()) - return llvm::createStringError("malformed bitstream"); - return enterTopLevelModuleBlock(cursor, ID); - } - - if (next.ID != ID) - return llvm::createStringError("malformed bitstream"); - - if (llvm::Error Err = cursor.EnterSubBlock(ID)) - return Err; - - return llvm::Error::success(); -} - -llvm::Expected -readOptionsBlock(llvm::BitstreamCursor &cursor, - llvm::SmallVectorImpl &scratch) { - bool is_built_from_interface = false; - while (!cursor.AtEndOfStream()) { - llvm::Expected maybeEntry = cursor.advance(); - if (!maybeEntry) - return maybeEntry.takeError(); - - llvm::BitstreamEntry entry = maybeEntry.get(); - if (entry.Kind == llvm::BitstreamEntry::EndBlock) - break; - - if (entry.Kind == llvm::BitstreamEntry::Error) - return llvm::createStringError("malformed bitstream"); - - if (entry.Kind == llvm::BitstreamEntry::SubBlock) { - if (cursor.SkipBlock()) - return llvm::createStringError("malformed bitstream"); - continue; - } - - scratch.clear(); - llvm::StringRef blobData; - llvm::Expected maybeKind = - cursor.readRecord(entry.ID, scratch, &blobData); - if (!maybeKind) - return maybeKind.takeError(); - unsigned kind = maybeKind.get(); - switch (kind) { - case IS_BUILT_FROM_INTERFACE: - is_built_from_interface = true; - continue; - default: - continue; - } - } - return is_built_from_interface; -} - -llvm::Expected -parseControlBlock(llvm::BitstreamCursor &cursor, - llvm::SmallVectorImpl &scratch) { - // The control block is malformed until we've at least read a major version - // number. 
- bool versionSeen = false; - - while (!cursor.AtEndOfStream()) { - llvm::Expected maybeEntry = cursor.advance(); - if (!maybeEntry) - return maybeEntry.takeError(); - - llvm::BitstreamEntry entry = maybeEntry.get(); - if (entry.Kind == llvm::BitstreamEntry::EndBlock) - break; - - if (entry.Kind == llvm::BitstreamEntry::Error) - return llvm::createStringError("malformed bitstream"); - - if (entry.Kind == llvm::BitstreamEntry::SubBlock) { - if (entry.ID == OPTIONS_BLOCK_ID) { - if (llvm::Error Err = cursor.EnterSubBlock(OPTIONS_BLOCK_ID)) - return Err; - - return readOptionsBlock(cursor, scratch); - } else { - // Unknown metadata sub-block, possibly for use by a future version of - // the module format. - if (cursor.SkipBlock()) - return llvm::createStringError("malformed bitstream"); - } - continue; - } - - scratch.clear(); - llvm::StringRef blobData; - llvm::Expected maybeKind = - cursor.readRecord(entry.ID, scratch, &blobData); - if (!maybeKind) - return maybeKind.takeError(); - - unsigned kind = maybeKind.get(); - if (kind == METADATA) { - if (versionSeen) - return llvm::createStringError("multiple metadata blocks"); - - uint16_t versionMajor = scratch[0]; - if (versionMajor != expectedMajorVersion) - return llvm::createStringError("unsupported module version"); - - versionSeen = true; - } - } - return llvm::createStringError("could not find control block"); -} - -} // namespace - -llvm::Expected IsBuiltFromSwiftInterface(llvm::StringRef data) { - llvm::BitstreamCursor cursor(data); - if (llvm::Error Err = checkModuleSignature(cursor, SWIFTMODULE_SIGNATURE)) - return llvm::joinErrors( - llvm::createStringError("could not check signature"), std::move(Err)); - if (llvm::Error Err = enterTopLevelModuleBlock(cursor, MODULE_BLOCK_ID)) - return llvm::joinErrors( - llvm::createStringError("could not enter top level block"), - std::move(Err)); - - llvm::BitstreamEntry topLevelEntry; - llvm::SmallVector scratch; - - while (!cursor.AtEndOfStream()) { - llvm::Expected maybeEntry = - cursor.advance(llvm::BitstreamCursor::AF_DontPopBlockAtEnd); - if (!maybeEntry) - return maybeEntry.takeError(); - - topLevelEntry = maybeEntry.get(); - if (topLevelEntry.Kind != llvm::BitstreamEntry::SubBlock) - break; - - if (topLevelEntry.ID == CONTROL_BLOCK_ID) { - if (llvm::Error Err = cursor.EnterSubBlock(CONTROL_BLOCK_ID)) - return Err; - return parseControlBlock(cursor, scratch); - } - } - return llvm::createStringError("no control block found"); -} diff --git a/llvm/tools/dsymutil/SwiftModule.h b/llvm/tools/dsymutil/SwiftModule.h deleted file mode 100644 index 9a272fd6fed36..0000000000000 --- a/llvm/tools/dsymutil/SwiftModule.h +++ /dev/null @@ -1,15 +0,0 @@ -//===- tools/dsymutil/SwiftModule.h ---------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef LLVM_TOOLS_DSYMUTIL_SWIFTMODULE_H -#define LLVM_TOOLS_DSYMUTIL_SWIFTMODULE_H - -#include "llvm/Support/Error.h" - -llvm::Expected IsBuiltFromSwiftInterface(llvm::StringRef data); - -#endif From 271399831b780d25dce85715727c841843e10d4b Mon Sep 17 00:00:00 2001 From: k-kashapov <52855633+k-kashapov@users.noreply.github.com> Date: Tue, 8 Apr 2025 19:51:13 +0300 Subject: [PATCH 1021/1029] [MSan] Change overflow_size_tls type to IntPtrTy (#117689) As discussed in https://github.com/llvm/llvm-project/pull/109284#discussion_r1838819987: Changed `__msan_va_arg_overflow_size_tls` type from `Int64Ty` to `IntPtrTy`. --- compiler-rt/lib/msan/msan.cpp | 2 +- .../Instrumentation/MemorySanitizer.cpp | 16 ++--- .../MemorySanitizer/i386/vararg.ll | 64 +++++++++---------- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/compiler-rt/lib/msan/msan.cpp b/compiler-rt/lib/msan/msan.cpp index 6c27ab21eeebf..a3c0c2e485af3 100644 --- a/compiler-rt/lib/msan/msan.cpp +++ b/compiler-rt/lib/msan/msan.cpp @@ -63,7 +63,7 @@ alignas(16) SANITIZER_INTERFACE_ATTRIBUTE THREADLOCAL u32 __msan_va_arg_origin_tls[kMsanParamTlsSize / sizeof(u32)]; SANITIZER_INTERFACE_ATTRIBUTE -THREADLOCAL u64 __msan_va_arg_overflow_size_tls; +THREADLOCAL uptr __msan_va_arg_overflow_size_tls; SANITIZER_INTERFACE_ATTRIBUTE THREADLOCAL u32 __msan_origin_tls; diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 1cea53f695292..403d5c75379b8 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -907,8 +907,8 @@ void MemorySanitizer::createUserspaceApi(Module &M, getOrInsertGlobal(M, "__msan_va_arg_origin_tls", ArrayType::get(OriginTy, kParamTLSSize / 4)); - VAArgOverflowSizeTLS = - getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", IRB.getInt64Ty()); + VAArgOverflowSizeTLS = getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", + IRB.getIntPtrTy(M.getDataLayout())); for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; AccessSizeIndex++) { @@ -6496,7 +6496,7 @@ struct VarArgPowerPCHelper : public VarArgHelperBase { assert(!VAArgSize && !VAArgTLSCopy && "finalizeInstrumentation called twice"); IRBuilder<> IRB(MSV.FnPrologueEnd); - VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS); + VAArgSize = IRB.CreateLoad(MS.IntptrTy, MS.VAArgOverflowSizeTLS); Value *CopySize = VAArgSize; if (!VAStartInstrumentationList.empty()) { @@ -6510,7 +6510,7 @@ struct VarArgPowerPCHelper : public VarArgHelperBase { Value *SrcSize = IRB.CreateBinaryIntrinsic( Intrinsic::umin, CopySize, - ConstantInt::get(IRB.getInt64Ty(), kParamTLSSize)); + ConstantInt::get(MS.IntptrTy, kParamTLSSize)); IRB.CreateMemCpy(VAArgTLSCopy, kShadowTLSAlignment, MS.VAArgTLS, kShadowTLSAlignment, SrcSize); } @@ -6893,7 +6893,7 @@ struct VarArgI386Helper : public VarArgHelperBase { assert(!VAArgSize && !VAArgTLSCopy && "finalizeInstrumentation called twice"); IRBuilder<> IRB(MSV.FnPrologueEnd); - VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS); + VAArgSize = IRB.CreateLoad(MS.IntptrTy, MS.VAArgOverflowSizeTLS); Value *CopySize = VAArgSize; if (!VAStartInstrumentationList.empty()) { @@ -6906,7 +6906,7 @@ struct VarArgI386Helper : public VarArgHelperBase { Value *SrcSize = IRB.CreateBinaryIntrinsic( 
Intrinsic::umin, CopySize, - ConstantInt::get(IRB.getInt64Ty(), kParamTLSSize)); + ConstantInt::get(MS.IntptrTy, kParamTLSSize)); IRB.CreateMemCpy(VAArgTLSCopy, kShadowTLSAlignment, MS.VAArgTLS, kShadowTLSAlignment, SrcSize); } @@ -6978,7 +6978,7 @@ struct VarArgGenericHelper : public VarArgHelperBase { assert(!VAArgSize && !VAArgTLSCopy && "finalizeInstrumentation called twice"); IRBuilder<> IRB(MSV.FnPrologueEnd); - VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS); + VAArgSize = IRB.CreateLoad(MS.IntptrTy, MS.VAArgOverflowSizeTLS); Value *CopySize = VAArgSize; if (!VAStartInstrumentationList.empty()) { @@ -6991,7 +6991,7 @@ struct VarArgGenericHelper : public VarArgHelperBase { Value *SrcSize = IRB.CreateBinaryIntrinsic( Intrinsic::umin, CopySize, - ConstantInt::get(IRB.getInt64Ty(), kParamTLSSize)); + ConstantInt::get(MS.IntptrTy, kParamTLSSize)); IRB.CreateMemCpy(VAArgTLSCopy, kShadowTLSAlignment, MS.VAArgTLS, kShadowTLSAlignment, SrcSize); } diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg.ll index 133d711764c71..f3f417ae1e37a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg.ll @@ -9,29 +9,29 @@ target triple = "i386-unknown-linux-gnu" define void @VaStart(ptr %s, ...) { ; CHECK-LABEL: define void @VaStart( ; CHECK-SAME: ptr [[S:%.*]], ...) { -; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[TMP6]], align 8 -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP4]], i8 0, i64 [[TMP6]], i1 false) -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP6]], i64 800) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP4]], ptr align 8 @__msan_va_arg_tls, i64 [[TMP3]], i1 false) +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr @__msan_va_arg_overflow_size_tls, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = alloca i8, i32 [[TMP4]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 8 [[TMP5]], i8 0, i32 [[TMP4]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP4]], i32 800) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[TMP5]], ptr align 8 @__msan_va_arg_tls, i32 [[TMP3]], i1 false) ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[VL:%.*]] = alloca ptr, align 4 ; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[VL]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[TMP0]], 2147483647 ; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i32 [[TMP1]] to ptr ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[TMP2]], i8 0, i32 4, i1 false) -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[VL]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 2147483647 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr -; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP11]], i8 0, i64 4, i1 false) +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[VL]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = and i32 [[TMP7]], 2147483647 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i32 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP9]], i8 0, i64 4, i1 false) ; CHECK-NEXT: call void @llvm.va_start.p0(ptr [[VL]]) -; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[VL]] to i32 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i32 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP14]] to i32 -; CHECK-NEXT: 
[[TMP16:%.*]] = and i32 [[TMP15]], 2147483647 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i32 [[TMP16]] to ptr -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP17]], ptr align 4 [[TMP4]], i64 [[TMP6]], i1 false) +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = and i32 [[TMP13]], 2147483647 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i32 [[TMP14]] to ptr +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP15]], ptr align 4 [[TMP5]], i32 [[TMP4]], i1 false) ; CHECK-NEXT: ret void ; ; KERNEL-LABEL: define void @VaStart( @@ -44,26 +44,26 @@ define void @VaStart(ptr %s, ...) { ; KERNEL-NEXT: [[VA_ARG_OVERFLOW_SIZE:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 4 ; KERNEL-NEXT: [[PARAM_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 5 ; KERNEL-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i32 0, i32 6 -; KERNEL-NEXT: [[TMP2:%.*]] = load i64, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4 -; KERNEL-NEXT: [[TMP3:%.*]] = alloca i8, i64 [[TMP2]], align 8 -; KERNEL-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP3]], i8 0, i64 [[TMP2]], i1 false) -; KERNEL-NEXT: [[TMP4:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 800) -; KERNEL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP3]], ptr align 8 [[VA_ARG_SHADOW]], i64 [[TMP4]], i1 false) +; KERNEL-NEXT: [[TMP2:%.*]] = load i32, ptr [[VA_ARG_OVERFLOW_SIZE]], align 4 +; KERNEL-NEXT: [[TMP3:%.*]] = alloca i8, i32 [[TMP2]], align 8 +; KERNEL-NEXT: call void @llvm.memset.p0.i32(ptr align 8 [[TMP3]], i8 0, i32 [[TMP2]], i1 false) +; KERNEL-NEXT: [[TMP4:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP2]], i32 800) +; KERNEL-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[TMP3]], ptr align 8 [[VA_ARG_SHADOW]], i32 [[TMP4]], i1 false) ; KERNEL-NEXT: call void @llvm.donothing() ; KERNEL-NEXT: [[VL:%.*]] = alloca ptr, align 4 ; KERNEL-NEXT: call void @__msan_unpoison_alloca(ptr [[VL]], i32 4) -; KERNEL-NEXT: [[TMP7:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[VL]]) -; KERNEL-NEXT: [[TMP8:%.*]] = extractvalue { ptr, ptr } [[TMP7]], 0 -; KERNEL-NEXT: [[TMP9:%.*]] = extractvalue { ptr, ptr } [[TMP7]], 1 -; KERNEL-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP8]], i8 0, i64 4, i1 false) +; KERNEL-NEXT: [[TMP5:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[VL]]) +; KERNEL-NEXT: [[TMP6:%.*]] = extractvalue { ptr, ptr } [[TMP5]], 0 +; KERNEL-NEXT: [[TMP7:%.*]] = extractvalue { ptr, ptr } [[TMP5]], 1 +; KERNEL-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP6]], i8 0, i64 4, i1 false) ; KERNEL-NEXT: call void @llvm.va_start.p0(ptr [[VL]]) -; KERNEL-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[VL]] to i32 -; KERNEL-NEXT: [[TMP11:%.*]] = inttoptr i32 [[TMP10]] to ptr -; KERNEL-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 4 -; KERNEL-NEXT: [[TMP13:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[TMP12]]) -; KERNEL-NEXT: [[TMP14:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 0 -; KERNEL-NEXT: [[TMP15:%.*]] = extractvalue { ptr, ptr } [[TMP13]], 1 -; KERNEL-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr 
align 4 [[TMP14]], ptr align 4 [[TMP3]], i64 [[TMP2]], i1 false)
+; KERNEL-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[VL]] to i32
+; KERNEL-NEXT:    [[TMP9:%.*]] = inttoptr i32 [[TMP8]] to ptr
+; KERNEL-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 4
+; KERNEL-NEXT:    [[TMP11:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_store_1(ptr [[TMP10]])
+; KERNEL-NEXT:    [[TMP12:%.*]] = extractvalue { ptr, ptr } [[TMP11]], 0
+; KERNEL-NEXT:    [[TMP13:%.*]] = extractvalue { ptr, ptr } [[TMP11]], 1
+; KERNEL-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP12]], ptr align 4 [[TMP3]], i32 [[TMP2]], i1 false)
 ; KERNEL-NEXT:    ret void
 ;
   %vl = alloca ptr, align 4

From 441f87968df5dfb74d710fa32147789be98c20a6 Mon Sep 17 00:00:00 2001
From: Morris Hafner
Date: Tue, 8 Apr 2025 18:53:54 +0200
Subject: [PATCH 1022/1029] [CIR] Upstream CmpOp (#133159)

This patch adds support for comparison operators with ClangIR, both
integral and floating point.

---------

Co-authored-by: Morris Hafner
Co-authored-by: Henrich Lauko
Co-authored-by: Andy Kaylor
---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |   5 +
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |  41 ++
 clang/include/clang/CIR/MissingFeatures.h     |   1 -
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp    |  79 +++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  82 +++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   |  13 +
 clang/test/CIR/CodeGen/cast.cpp               |  23 +-
 clang/test/CIR/CodeGen/cmp.cpp                | 470 ++++++++++++++++++
 clang/test/CIR/IR/cmp.cir                     | 359 +++++++++++++
 9 files changed, 1064 insertions(+), 9 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/cmp.cpp
 create mode 100644 clang/test/CIR/IR/cmp.cir

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index c1e93fe790c08..429d217541e28 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -335,6 +335,11 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return createAdd(loc, lhs, rhs, OverflowBehavior::NoUnsignedWrap);
   }
 
+  cir::CmpOp createCompare(mlir::Location loc, cir::CmpOpKind kind,
+                           mlir::Value lhs, mlir::Value rhs) {
+    return create<cir::CmpOp>(loc, getBoolTy(), kind, lhs, rhs);
+  }
+
   //
   // Block handling helpers
   // ----------------------
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 609e60ca74b49..19fa532d151c3 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1001,6 +1001,47 @@ def ForOp : LoopOpBase<"for"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// CmpOp
+//===----------------------------------------------------------------------===//
+
+def CmpOpKind_LT : I32EnumAttrCase<"lt", 1>;
+def CmpOpKind_LE : I32EnumAttrCase<"le", 2>;
+def CmpOpKind_GT : I32EnumAttrCase<"gt", 3>;
+def CmpOpKind_GE : I32EnumAttrCase<"ge", 4>;
+def CmpOpKind_EQ : I32EnumAttrCase<"eq", 5>;
+def CmpOpKind_NE : I32EnumAttrCase<"ne", 6>;
+
+def CmpOpKind : I32EnumAttr<
+    "CmpOpKind",
+    "compare operation kind",
+    [CmpOpKind_LT, CmpOpKind_LE, CmpOpKind_GT,
+     CmpOpKind_GE, CmpOpKind_EQ, CmpOpKind_NE]> {
+  let cppNamespace = "::cir";
+}
+
+def CmpOp : CIR_Op<"cmp", [Pure, SameTypeOperands]> {
+
+  let summary = "Compare two values and produce a boolean result";
+  let description = [{
+    `cir.cmp` compares two input operands of the same type and produces a
+    `cir.bool` result.
The kinds of comparison available are:
+    [lt,le,gt,ge,eq,ne]
+
+    ```mlir
+    %7 = cir.cmp(gt, %1, %2) : i32, !cir.bool
+    ```
+  }];
+
+  let results = (outs CIR_BoolType:$result);
+  let arguments = (ins Arg<CmpOpKind, "cmp kind">:$kind,
+                       CIR_AnyType:$lhs, CIR_AnyType:$rhs);
+
+  let assemblyFormat = [{
+    `(` $kind `,` $lhs `,` $rhs `)` `:` type($lhs) `,` type($result) attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // BinOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 86fdaf1ddaf51..c1963b15e3199 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -88,7 +88,6 @@ struct MissingFeatures {
   static bool opGlobalViewAttr() { return false; }
   static bool lowerModeOptLevel() { return false; }
   static bool opTBAA() { return false; }
-  static bool opCmp() { return false; }
   static bool objCLifetime() { return false; }
   static bool emitNullabilityCheck() { return false; }
   static bool astVarDeclInterface() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 3863d21487531..ed49f395b7d51 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -707,6 +707,85 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   HANDLEBINOP(Xor)
   HANDLEBINOP(Or)
 #undef HANDLEBINOP
+
+  mlir::Value emitCmp(const BinaryOperator *e) {
+    const mlir::Location loc = cgf.getLoc(e->getExprLoc());
+    mlir::Value result;
+    QualType lhsTy = e->getLHS()->getType();
+    QualType rhsTy = e->getRHS()->getType();
+
+    auto clangCmpToCIRCmp =
+        [](clang::BinaryOperatorKind clangCmp) -> cir::CmpOpKind {
+      switch (clangCmp) {
+      case BO_LT:
+        return cir::CmpOpKind::lt;
+      case BO_GT:
+        return cir::CmpOpKind::gt;
+      case BO_LE:
+        return cir::CmpOpKind::le;
+      case BO_GE:
+        return cir::CmpOpKind::ge;
+      case BO_EQ:
+        return cir::CmpOpKind::eq;
+      case BO_NE:
+        return cir::CmpOpKind::ne;
+      default:
+        llvm_unreachable("unsupported comparison kind for cir.cmp");
+      }
+    };
+
+    if (lhsTy->getAs<MemberPointerType>()) {
+      assert(!cir::MissingFeatures::dataMemberType());
+      assert(e->getOpcode() == BO_EQ || e->getOpcode() == BO_NE);
+      mlir::Value lhs = cgf.emitScalarExpr(e->getLHS());
+      mlir::Value rhs = cgf.emitScalarExpr(e->getRHS());
+      cir::CmpOpKind kind = clangCmpToCIRCmp(e->getOpcode());
+      result = builder.createCompare(loc, kind, lhs, rhs);
+    } else if (!lhsTy->isAnyComplexType() && !rhsTy->isAnyComplexType()) {
+      BinOpInfo boInfo = emitBinOps(e);
+      mlir::Value lhs = boInfo.lhs;
+      mlir::Value rhs = boInfo.rhs;
+
+      if (lhsTy->isVectorType()) {
+        assert(!cir::MissingFeatures::vectorType());
+        cgf.cgm.errorNYI(loc, "vector comparisons");
+        result = builder.getBool(false, loc);
+      } else if (boInfo.isFixedPointOp()) {
+        assert(!cir::MissingFeatures::fixedPointType());
+        cgf.cgm.errorNYI(loc, "fixed point comparisons");
+        result = builder.getBool(false, loc);
+      } else {
+        // integers and pointers
+        if (cgf.cgm.getCodeGenOpts().StrictVTablePointers &&
+            mlir::isa<cir::PointerType>(lhs.getType()) &&
+            mlir::isa<cir::PointerType>(rhs.getType())) {
+          cgf.cgm.errorNYI(loc, "strict vtable pointer comparisons");
+        }
+
+        cir::CmpOpKind kind = clangCmpToCIRCmp(e->getOpcode());
+        result = builder.createCompare(loc, kind, lhs, rhs);
+      }
+    } else {
+      // Complex Comparison: can only be an equality comparison.
+      assert(!cir::MissingFeatures::complexType());
+      cgf.cgm.errorNYI(loc, "complex comparison");
+      result = builder.getBool(false, loc);
+    }
+
+    return emitScalarConversion(result, cgf.getContext().BoolTy, e->getType(),
+                                e->getExprLoc());
+  }
+
+// Comparisons.
+#define VISITCOMP(CODE) \
+  mlir::Value VisitBin##CODE(const BinaryOperator *E) { return emitCmp(E); }
+  VISITCOMP(LT)
+  VISITCOMP(GT)
+  VISITCOMP(LE)
+  VISITCOMP(GE)
+  VISITCOMP(EQ)
+  VISITCOMP(NE)
+#undef VISITCOMP
 };
 
 LValue ScalarExprEmitter::emitCompoundAssignLValue(
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 81b80e2e4eafb..7ca36409c9cac 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/BuiltinDialect.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Types.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h"
@@ -1193,6 +1194,86 @@ mlir::LogicalResult CIRToLLVMBinOpLowering::matchAndRewrite(
   return mlir::LogicalResult::success();
 }
 
+/// Convert from a CIR comparison kind to an LLVM IR integral comparison kind.
+static mlir::LLVM::ICmpPredicate
+convertCmpKindToICmpPredicate(cir::CmpOpKind kind, bool isSigned) {
+  using CIR = cir::CmpOpKind;
+  using LLVMICmp = mlir::LLVM::ICmpPredicate;
+  switch (kind) {
+  case CIR::eq:
+    return LLVMICmp::eq;
+  case CIR::ne:
+    return LLVMICmp::ne;
+  case CIR::lt:
+    return (isSigned ? LLVMICmp::slt : LLVMICmp::ult);
+  case CIR::le:
+    return (isSigned ? LLVMICmp::sle : LLVMICmp::ule);
+  case CIR::gt:
+    return (isSigned ? LLVMICmp::sgt : LLVMICmp::ugt);
+  case CIR::ge:
+    return (isSigned ? LLVMICmp::sge : LLVMICmp::uge);
+  }
+  llvm_unreachable("Unknown CmpOpKind");
+}
+
+/// Convert from a CIR comparison kind to an LLVM IR floating-point comparison
+/// kind.
+static mlir::LLVM::FCmpPredicate
+convertCmpKindToFCmpPredicate(cir::CmpOpKind kind) {
+  using CIR = cir::CmpOpKind;
+  using LLVMFCmp = mlir::LLVM::FCmpPredicate;
+  switch (kind) {
+  case CIR::eq:
+    return LLVMFCmp::oeq;
+  case CIR::ne:
+    return LLVMFCmp::une;
+  case CIR::lt:
+    return LLVMFCmp::olt;
+  case CIR::le:
+    return LLVMFCmp::ole;
+  case CIR::gt:
+    return LLVMFCmp::ogt;
+  case CIR::ge:
+    return LLVMFCmp::oge;
+  }
+  llvm_unreachable("Unknown CmpOpKind");
+}
+
+mlir::LogicalResult CIRToLLVMCmpOpLowering::matchAndRewrite(
+    cir::CmpOp cmpOp, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  mlir::Type type = cmpOp.getLhs().getType();
+
+  assert(!cir::MissingFeatures::dataMemberType());
+  assert(!cir::MissingFeatures::methodType());
+
+  // Lower to LLVM comparison op.
+  if (mlir::isa<cir::IntType, mlir::IntegerType>(type)) {
+    bool isSigned = mlir::isa<cir::IntType>(type)
+                        ?
mlir::cast<cir::IntType>(type).isSigned()
+                        : mlir::cast<mlir::IntegerType>(type).isSigned();
+    mlir::LLVM::ICmpPredicate kind =
+        convertCmpKindToICmpPredicate(cmpOp.getKind(), isSigned);
+    rewriter.replaceOpWithNewOp<mlir::LLVM::ICmpOp>(
+        cmpOp, kind, adaptor.getLhs(), adaptor.getRhs());
+  } else if (auto ptrTy = mlir::dyn_cast<cir::PointerType>(type)) {
+    mlir::LLVM::ICmpPredicate kind =
+        convertCmpKindToICmpPredicate(cmpOp.getKind(),
+                                      /* isSigned=*/false);
+    rewriter.replaceOpWithNewOp<mlir::LLVM::ICmpOp>(
+        cmpOp, kind, adaptor.getLhs(), adaptor.getRhs());
+  } else if (mlir::isa<cir::CIRFPTypeInterface>(type)) {
+    mlir::LLVM::FCmpPredicate kind =
+        convertCmpKindToFCmpPredicate(cmpOp.getKind());
+    rewriter.replaceOpWithNewOp<mlir::LLVM::FCmpOp>(
+        cmpOp, kind, adaptor.getLhs(), adaptor.getRhs());
+  } else {
+    return cmpOp.emitError() << "unsupported type for CmpOp: " << type;
+  }
+
+  return mlir::success();
+}
+
 static void prepareTypeConverter(mlir::LLVMTypeConverter &converter,
                                  mlir::DataLayout &dataLayout) {
   converter.addConversion([&](cir::PointerType type) -> mlir::Type {
@@ -1334,6 +1415,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
                CIRToLLVMBinOpLowering,
                CIRToLLVMBrCondOpLowering,
                CIRToLLVMBrOpLowering,
+               CIRToLLVMCmpOpLowering,
                CIRToLLVMConstantOpLowering,
                CIRToLLVMFuncOpLowering,
                CIRToLLVMTrapOpLowering,
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index 6f489fb49f44f..d53c4b31682bb 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -186,6 +186,19 @@ class CIRToLLVMBinOpLowering : public mlir::OpConversionPattern<cir::BinOp> {
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMCmpOpLowering : public mlir::OpConversionPattern<cir::CmpOp> {
+public:
+  CIRToLLVMCmpOpLowering(const mlir::TypeConverter &typeConverter,
+                         mlir::MLIRContext *context)
+      : OpConversionPattern(typeConverter, context) {
+    setHasBoundedRewriteRecursion();
+  }
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::CmpOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 class CIRToLLVMBrOpLowering : public mlir::OpConversionPattern<cir::BrOp> {
 public:
   using mlir::OpConversionPattern<cir::BrOp>::OpConversionPattern;
diff --git a/clang/test/CIR/CodeGen/cast.cpp b/clang/test/CIR/CodeGen/cast.cpp
index ceae355a3ae1c..4a880e3479ea2 100644
--- a/clang/test/CIR/CodeGen/cast.cpp
+++ b/clang/test/CIR/CodeGen/cast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -DCIR_ONLY %s -o %t.cir
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
 // RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
 // RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
@@ -57,16 +57,16 @@ int cStyleCasts_0(unsigned x1, int x2, float x3, short x4, double x5) {
   // CIR: %{{[0-9]+}} = cir.cast(bool_to_int, %{{[0-9]+}} : !cir.bool), !s32i
   // LLVM: %{{[0-9]+}} = zext i1 %{{[0-9]+}} to i32
 
-  #ifdef CIR_ONLY
   bool b2 = x2; // int to bool
   // CIR: %{{[0-9]+}} = cir.cast(int_to_bool, %{{[0-9]+}} : !s32i), !cir.bool
-  #endif
+  // LLVM: %[[INTTOBOOL:[0-9]+]] = icmp ne i32 %{{[0-9]+}}, 0
+  // LLVM: zext i1 %[[INTTOBOOL]] to i8
 
-  #ifdef CIR_ONLY
   void *p;
-  bool b3 = p; // ptr to bool
+  bool b3 = p;  // ptr to bool
   // CIR: %{{[0-9]+}} = cir.cast(ptr_to_bool, %{{[0-9]+}} : !cir.ptr<!void>), !cir.bool
-  #endif
+  // LLVM: %[[PTRTOBOOL:[0-9]+]] = icmp ne ptr %{{[0-9]+}}, null
+  // LLVM: zext i1 %[[PTRTOBOOL]] to i8
 
   float f;
   bool b4 = f; // float to
bool
@@ -77,7 +77,6 @@ int cStyleCasts_0(unsigned x1, int x2, float x3, short x4, double x5) {
   return 0;
 }
 
-#ifdef CIR_ONLY
 bool cptr(void *d) {
   bool x = d;
   return x;
@@ -88,7 +87,15 @@ bool cptr(void *d) {
 
 // CIR:  %[[DVAL:[0-9]+]] = cir.load %[[DPTR]] : !cir.ptr<!cir.ptr<!void>>, !cir.ptr<!void>
 // CIR:  %{{[0-9]+}} = cir.cast(ptr_to_bool, %[[DVAL]] : !cir.ptr<!void>), !cir.bool
-#endif
+
+// LLVM-LABEL: define i1 @cptr(ptr %0)
+// LLVM:  %[[ARG_STORAGE:.*]] = alloca ptr, i64 1
+// LLVM:  %[[RETVAL:.*]] = alloca i8, i64 1
+// LLVM:  %[[X_STORAGE:.*]] = alloca i8, i64 1
+// LLVM:  store ptr %0, ptr %[[ARG_STORAGE]]
+// LLVM:  %[[LOADED_PTR:.*]] = load ptr, ptr %[[ARG_STORAGE]]
+// LLVM:  %[[NULL_CHECK:.*]] = icmp ne ptr %[[LOADED_PTR]], null
+// LLVM:  ret i1
 
 void should_not_cast() {
   unsigned x1;
diff --git a/clang/test/CIR/CodeGen/cmp.cpp b/clang/test/CIR/CodeGen/cmp.cpp
new file mode 100644
index 0000000000000..a4f273a1d5802
--- /dev/null
+++ b/clang/test/CIR/CodeGen/cmp.cpp
@@ -0,0 +1,470 @@
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+void c0(int a, int b) {
+  bool x = a > b;
+  x = a < b;
+  x = a <= b;
+  x = a >= b;
+  x = a != b;
+  x = a == b;
+}
+
+// CIR-LABEL: cir.func @c0(
+
+// CIR: %[[A_PTR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["a", init]
+// CIR: %[[B_PTR:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["b", init]
+// CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr<!cir.bool>, ["x", init]
+
+// CIR: %[[A1:.*]] = cir.load %[[A_PTR]]
+// CIR: %[[B1:.*]] = cir.load %[[B_PTR]]
+// CIR: %{{.*}} = cir.cmp(gt, %[[A1]], %[[B1]]) : !s32i, !cir.bool
+// CIR: cir.store {{.*}}, %[[X_PTR]]
+
+// CIR: %[[A2:.*]] = cir.load %[[A_PTR]]
+// CIR: %[[B2:.*]] = cir.load %[[B_PTR]]
+// CIR: %{{.*}} = cir.cmp(lt, %[[A2]], %[[B2]]) : !s32i, !cir.bool
+
+// CIR: %[[A3:.*]] = cir.load %[[A_PTR]]
+// CIR: %[[B3:.*]] = cir.load %[[B_PTR]]
+// CIR: %{{.*}} = cir.cmp(le, %[[A3]], %[[B3]]) : !s32i, !cir.bool
+
+// CIR: %[[A4:.*]] = cir.load %[[A_PTR]]
+// CIR: %[[B4:.*]] = cir.load %[[B_PTR]]
+// CIR: %{{.*}} = cir.cmp(ge, %[[A4]], %[[B4]]) : !s32i, !cir.bool
+
+// CIR: %[[A5:.*]] = cir.load %[[A_PTR]]
+// CIR: %[[B5:.*]] = cir.load %[[B_PTR]]
+// CIR: %{{.*}} = cir.cmp(ne, %[[A5]], %[[B5]]) : !s32i, !cir.bool
+
+// CIR: %[[A6:.*]] = cir.load %[[A_PTR]]
+// CIR: %[[B6:.*]] = cir.load %[[B_PTR]]
+// CIR: %{{.*}} = cir.cmp(eq, %[[A6]], %[[B6]]) : !s32i, !cir.bool
+
+// LLVM-LABEL: define void @c0(i32 %0, i32 %1) {
+// LLVM: %[[PTR1:.*]] = alloca i32, i64 1
+// LLVM: %[[PTR2:.*]] = alloca i32, i64 1
+// LLVM: %[[BOOL_PTR:.*]] = alloca i8, i64 1
+// LLVM: store i32 %0, ptr %[[PTR1]]
+// LLVM: store i32 %1, ptr %[[PTR2]]
+
+// LLVM: %[[A1:.*]] = load i32, ptr %[[PTR1]]
+// LLVM: %[[B1:.*]] = load i32, ptr %[[PTR2]]
+// LLVM: %[[CMP1:.*]] = icmp sgt i32 %[[A1]], %[[B1]]
+// LLVM: %[[ZEXT1:.*]] = zext i1 %[[CMP1]] to i8
+// LLVM: store i8 %[[ZEXT1]], ptr %[[BOOL_PTR]]
+
+// LLVM: %[[A2:.*]] = load i32, ptr %[[PTR1]]
+// LLVM: %[[B2:.*]] = load i32, ptr %[[PTR2]]
+// LLVM: %[[CMP2:.*]] = icmp slt i32 %[[A2]], %[[B2]]
+// LLVM: %[[ZEXT2:.*]] = zext i1 %[[CMP2]] to i8
+// LLVM: store i8 %[[ZEXT2]], ptr %[[BOOL_PTR]]
+
+// LLVM: %[[A3:.*]] = 
load i32, ptr %[[PTR1]] +// LLVM: %[[B3:.*]] = load i32, ptr %[[PTR2]] +// LLVM: %[[CMP3:.*]] = icmp sle i32 %[[A3]], %[[B3]] +// LLVM: %[[ZEXT3:.*]] = zext i1 %[[CMP3]] to i8 +// LLVM: store i8 %[[ZEXT3]], ptr %[[BOOL_PTR]] + +// LLVM: %[[A4:.*]] = load i32, ptr %[[PTR1]] +// LLVM: %[[B4:.*]] = load i32, ptr %[[PTR2]] +// LLVM: %[[CMP4:.*]] = icmp sge i32 %[[A4]], %[[B4]] +// LLVM: %[[ZEXT4:.*]] = zext i1 %[[CMP4]] to i8 +// LLVM: store i8 %[[ZEXT4]], ptr %[[BOOL_PTR]] + +// LLVM: %[[A5:.*]] = load i32, ptr %[[PTR1]] +// LLVM: %[[B5:.*]] = load i32, ptr %[[PTR2]] +// LLVM: %[[CMP5:.*]] = icmp ne i32 %[[A5]], %[[B5]] +// LLVM: %[[ZEXT5:.*]] = zext i1 %[[CMP5]] to i8 +// LLVM: store i8 %[[ZEXT5]], ptr %[[BOOL_PTR]] + +// LLVM: %[[A6:.*]] = load i32, ptr %[[PTR1]] +// LLVM: %[[B6:.*]] = load i32, ptr %[[PTR2]] +// LLVM: %[[CMP6:.*]] = icmp eq i32 %[[A6]], %[[B6]] +// LLVM: %[[ZEXT6:.*]] = zext i1 %[[CMP6]] to i8 +// LLVM: store i8 %[[ZEXT6]], ptr %[[BOOL_PTR]] + +// OGCG-LABEL: define dso_local void @_Z2c0ii(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { +// OGCG: %[[PTR1:.*]] = alloca i32 +// OGCG: %[[PTR2:.*]] = alloca i32 +// OGCG: %[[BOOL_PTR:.*]] = alloca i8 +// OGCG: store i32 %a, ptr %[[PTR1]] +// OGCG: store i32 %b, ptr %[[PTR2]] + +// OGCG: %[[A1:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[B1:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[CMP1:.*]] = icmp sgt i32 %[[A1]], %[[B1]] +// OGCG: %[[ZEXT1:.*]] = zext i1 %[[CMP1]] to i8 +// OGCG: store i8 %[[ZEXT1]], ptr %[[BOOL_PTR]] + +// OGCG: %[[A2:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[B2:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[CMP2:.*]] = icmp slt i32 %[[A2]], %[[B2]] +// OGCG: %[[ZEXT2:.*]] = zext i1 %[[CMP2]] to i8 +// OGCG: store i8 %[[ZEXT2]], ptr %[[BOOL_PTR]] + +// OGCG: %[[A3:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[B3:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[CMP3:.*]] = icmp sle i32 %[[A3]], %[[B3]] +// OGCG: %[[ZEXT3:.*]] = zext i1 %[[CMP3]] to i8 +// OGCG: store i8 %[[ZEXT3]], ptr %[[BOOL_PTR]] + +// OGCG: %[[A4:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[B4:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[CMP4:.*]] = icmp sge i32 %[[A4]], %[[B4]] +// OGCG: %[[ZEXT4:.*]] = zext i1 %[[CMP4]] to i8 +// OGCG: store i8 %[[ZEXT4]], ptr %[[BOOL_PTR]] + +// OGCG: %[[A5:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[B5:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[CMP5:.*]] = icmp ne i32 %[[A5]], %[[B5]] +// OGCG: %[[ZEXT5:.*]] = zext i1 %[[CMP5]] to i8 +// OGCG: store i8 %[[ZEXT5]], ptr %[[BOOL_PTR]] + +// OGCG: %[[A6:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[B6:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[CMP6:.*]] = icmp eq i32 %[[A6]], %[[B6]] +// OGCG: %[[ZEXT6:.*]] = zext i1 %[[CMP6]] to i8 +// OGCG: store i8 %[[ZEXT6]], ptr %[[BOOL_PTR]] + +void c0_unsigned(unsigned int a, unsigned int b) { + bool x = a > b; + x = a < b; + x = a <= b; + x = a >= b; + x = a != b; + x = a == b; +} + +// CIR-LABEL: cir.func @c0_unsigned( + +// CIR: %[[U_A_PTR:.*]] = cir.alloca !u32i, !cir.ptr, ["a", init] +// CIR: %[[U_B_PTR:.*]] = cir.alloca !u32i, !cir.ptr, ["b", init] +// CIR: %[[U_X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init] + +// CIR: %[[UA1:.*]] = cir.load %[[U_A_PTR]] +// CIR: %[[UB1:.*]] = cir.load %[[U_B_PTR]] +// CIR: %{{.*}} = cir.cmp(gt, %[[UA1]], %[[UB1]]) : !u32i, !cir.bool + +// CIR: %[[UA2:.*]] = cir.load %[[U_A_PTR]] +// CIR: %[[UB2:.*]] = cir.load %[[U_B_PTR]] +// CIR: %{{.*}} = cir.cmp(lt, %[[UA2]], %[[UB2]]) : !u32i, !cir.bool + +// CIR: %[[UA3:.*]] = cir.load %[[U_A_PTR]] +// CIR: %[[UB3:.*]] = cir.load %[[U_B_PTR]] 
+// CIR: %{{.*}} = cir.cmp(le, %[[UA3]], %[[UB3]]) : !u32i, !cir.bool + +// CIR: %[[UA4:.*]] = cir.load %[[U_A_PTR]] +// CIR: %[[UB4:.*]] = cir.load %[[U_B_PTR]] +// CIR: %{{.*}} = cir.cmp(ge, %[[UA4]], %[[UB4]]) : !u32i, !cir.bool + +// CIR: %[[UA5:.*]] = cir.load %[[U_A_PTR]] +// CIR: %[[UB5:.*]] = cir.load %[[U_B_PTR]] +// CIR: %{{.*}} = cir.cmp(ne, %[[UA5]], %[[UB5]]) : !u32i, !cir.bool + +// CIR: %[[UA6:.*]] = cir.load %[[U_A_PTR]] +// CIR: %[[UB6:.*]] = cir.load %[[U_B_PTR]] +// CIR: %{{.*}} = cir.cmp(eq, %[[UA6]], %[[UB6]]) : !u32i, !cir.bool + +// LLVM-LABEL: define void @c0_unsigned(i32 %0, i32 %1) { +// LLVM: %[[U_PTR1:.*]] = alloca i32, i64 1 +// LLVM: %[[U_PTR2:.*]] = alloca i32, i64 1 +// LLVM: %[[U_BOOL_PTR:.*]] = alloca i8, i64 1 +// LLVM: store i32 %0, ptr %[[U_PTR1]] +// LLVM: store i32 %1, ptr %[[U_PTR2]] + +// LLVM: %[[UA1:.*]] = load i32, ptr %[[U_PTR1]] +// LLVM: %[[UB1:.*]] = load i32, ptr %[[U_PTR2]] +// LLVM: %[[UCMP1:.*]] = icmp ugt i32 %[[UA1]], %[[UB1]] +// LLVM: %[[UZEXT1:.*]] = zext i1 %[[UCMP1]] to i8 +// LLVM: store i8 %[[UZEXT1]], ptr %[[U_BOOL_PTR]] + +// LLVM: %[[UA2:.*]] = load i32, ptr %[[U_PTR1]] +// LLVM: %[[UB2:.*]] = load i32, ptr %[[U_PTR2]] +// LLVM: %[[UCMP2:.*]] = icmp ult i32 %[[UA2]], %[[UB2]] +// LLVM: %[[UZEXT2:.*]] = zext i1 %[[UCMP2]] to i8 +// LLVM: store i8 %[[UZEXT2]], ptr %[[U_BOOL_PTR]] + +// LLVM: %[[UA3:.*]] = load i32, ptr %[[U_PTR1]] +// LLVM: %[[UB3:.*]] = load i32, ptr %[[U_PTR2]] +// LLVM: %[[UCMP3:.*]] = icmp ule i32 %[[UA3]], %[[UB3]] +// LLVM: %[[UZEXT3:.*]] = zext i1 %[[UCMP3]] to i8 +// LLVM: store i8 %[[UZEXT3]], ptr %[[U_BOOL_PTR]] + +// LLVM: %[[UA4:.*]] = load i32, ptr %[[U_PTR1]] +// LLVM: %[[UB4:.*]] = load i32, ptr %[[U_PTR2]] +// LLVM: %[[UCMP4:.*]] = icmp uge i32 %[[UA4]], %[[UB4]] +// LLVM: %[[UZEXT4:.*]] = zext i1 %[[UCMP4]] to i8 +// LLVM: store i8 %[[UZEXT4]], ptr %[[U_BOOL_PTR]] + +// LLVM: %[[UA5:.*]] = load i32, ptr %[[U_PTR1]] +// LLVM: %[[UB5:.*]] = load i32, ptr %[[U_PTR2]] +// LLVM: %[[UCMP5:.*]] = icmp ne i32 %[[UA5]], %[[UB5]] +// LLVM: %[[UZEXT5:.*]] = zext i1 %[[UCMP5]] to i8 +// LLVM: store i8 %[[UZEXT5]], ptr %[[U_BOOL_PTR]] + +// LLVM: %[[UA6:.*]] = load i32, ptr %[[U_PTR1]] +// LLVM: %[[UB6:.*]] = load i32, ptr %[[U_PTR2]] +// LLVM: %[[UCMP6:.*]] = icmp eq i32 %[[UA6]], %[[UB6]] +// LLVM: %[[UZEXT6:.*]] = zext i1 %[[UCMP6]] to i8 +// LLVM: store i8 %[[UZEXT6]], ptr %[[U_BOOL_PTR]] + +// OGCG-LABEL: define dso_local void @_Z11c0_unsignedjj(i32 {{.*}} %a, i32 {{.*}} %b) {{.*}} { +// OGCG: %[[U_PTR1:.*]] = alloca i32 +// OGCG: %[[U_PTR2:.*]] = alloca i32 +// OGCG: %[[U_BOOL_PTR:.*]] = alloca i8 +// OGCG: store i32 %a, ptr %[[U_PTR1]] +// OGCG: store i32 %b, ptr %[[U_PTR2]] + +// OGCG: %[[UA1:.*]] = load i32, ptr %[[U_PTR1]] +// OGCG: %[[UB1:.*]] = load i32, ptr %[[U_PTR2]] +// OGCG: %[[UCMP1:.*]] = icmp ugt i32 %[[UA1]], %[[UB1]] +// OGCG: %[[UZEXT1:.*]] = zext i1 %[[UCMP1]] to i8 +// OGCG: store i8 %[[UZEXT1]], ptr %[[U_BOOL_PTR]] + +// OGCG: %[[UA2:.*]] = load i32, ptr %[[U_PTR1]] +// OGCG: %[[UB2:.*]] = load i32, ptr %[[U_PTR2]] +// OGCG: %[[UCMP2:.*]] = icmp ult i32 %[[UA2]], %[[UB2]] +// OGCG: %[[UZEXT2:.*]] = zext i1 %[[UCMP2]] to i8 +// OGCG: store i8 %[[UZEXT2]], ptr %[[U_BOOL_PTR]] + +// OGCG: %[[UA3:.*]] = load i32, ptr %[[U_PTR1]] +// OGCG: %[[UB3:.*]] = load i32, ptr %[[U_PTR2]] +// OGCG: %[[UCMP3:.*]] = icmp ule i32 %[[UA3]], %[[UB3]] +// OGCG: %[[UZEXT3:.*]] = zext i1 %[[UCMP3]] to i8 +// OGCG: store i8 %[[UZEXT3]], ptr %[[U_BOOL_PTR]] + +// OGCG: %[[UA4:.*]] = load i32, ptr 
%[[U_PTR1]] +// OGCG: %[[UB4:.*]] = load i32, ptr %[[U_PTR2]] +// OGCG: %[[UCMP4:.*]] = icmp uge i32 %[[UA4]], %[[UB4]] +// OGCG: %[[UZEXT4:.*]] = zext i1 %[[UCMP4]] to i8 +// OGCG: store i8 %[[UZEXT4]], ptr %[[U_BOOL_PTR]] + +// OGCG: %[[UA5:.*]] = load i32, ptr %[[U_PTR1]] +// OGCG: %[[UB5:.*]] = load i32, ptr %[[U_PTR2]] +// OGCG: %[[UCMP5:.*]] = icmp ne i32 %[[UA5]], %[[UB5]] +// OGCG: %[[UZEXT5:.*]] = zext i1 %[[UCMP5]] to i8 +// OGCG: store i8 %[[UZEXT5]], ptr %[[U_BOOL_PTR]] + +// OGCG: %[[UA6:.*]] = load i32, ptr %[[U_PTR1]] +// OGCG: %[[UB6:.*]] = load i32, ptr %[[U_PTR2]] +// OGCG: %[[UCMP6:.*]] = icmp eq i32 %[[UA6]], %[[UB6]] +// OGCG: %[[UZEXT6:.*]] = zext i1 %[[UCMP6]] to i8 +// OGCG: store i8 %[[UZEXT6]], ptr %[[U_BOOL_PTR]] + +void c0_float(float a, float b) { + bool x = a > b; + x = a < b; + x = a <= b; + x = a >= b; + x = a != b; + x = a == b; +} + +// CIR-LABEL: cir.func @c0_float(%arg0: !cir.float{{.*}}, %arg1: !cir.float{{.*}}) { +// CIR: %[[A_PTR:.*]] = cir.alloca !cir.float, !cir.ptr, ["a", init] +// CIR: %[[B_PTR:.*]] = cir.alloca !cir.float, !cir.ptr, ["b", init] +// CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init] + +// CIR: cir.store %arg0, %[[A_PTR]] : !cir.float, !cir.ptr +// CIR: cir.store %arg1, %[[B_PTR]] : !cir.float, !cir.ptr + +// CIR: %[[A1:.*]] = cir.load %[[A_PTR]] : !cir.ptr, !cir.float +// CIR: %[[B1:.*]] = cir.load %[[B_PTR]] : !cir.ptr, !cir.float +// CIR: %[[CMP1:.*]] = cir.cmp(gt, %[[A1]], %[[B1]]) : !cir.float, !cir.bool +// CIR: cir.store %[[CMP1]], %[[X_PTR]] : !cir.bool, !cir.ptr + +// CIR: %[[A2:.*]] = cir.load %[[A_PTR]] : !cir.ptr, !cir.float +// CIR: %[[B2:.*]] = cir.load %[[B_PTR]] : !cir.ptr, !cir.float +// CIR: %[[CMP2:.*]] = cir.cmp(lt, %[[A2]], %[[B2]]) : !cir.float, !cir.bool +// CIR: cir.store %[[CMP2]], %[[X_PTR]] : !cir.bool, !cir.ptr + +// CIR: %[[A3:.*]] = cir.load %[[A_PTR]] : !cir.ptr, !cir.float +// CIR: %[[B3:.*]] = cir.load %[[B_PTR]] : !cir.ptr, !cir.float +// CIR: %[[CMP3:.*]] = cir.cmp(le, %[[A3]], %[[B3]]) : !cir.float, !cir.bool +// CIR: cir.store %[[CMP3]], %[[X_PTR]] : !cir.bool, !cir.ptr + +// CIR: %[[A4:.*]] = cir.load %[[A_PTR]] : !cir.ptr, !cir.float +// CIR: %[[B4:.*]] = cir.load %[[B_PTR]] : !cir.ptr, !cir.float +// CIR: %[[CMP4:.*]] = cir.cmp(ge, %[[A4]], %[[B4]]) : !cir.float, !cir.bool +// CIR: cir.store %[[CMP4]], %[[X_PTR]] : !cir.bool, !cir.ptr + +// CIR: %[[A5:.*]] = cir.load %[[A_PTR]] : !cir.ptr, !cir.float +// CIR: %[[B5:.*]] = cir.load %[[B_PTR]] : !cir.ptr, !cir.float +// CIR: %[[CMP5:.*]] = cir.cmp(ne, %[[A5]], %[[B5]]) : !cir.float, !cir.bool +// CIR: cir.store %[[CMP5]], %[[X_PTR]] : !cir.bool, !cir.ptr + +// CIR: %[[A6:.*]] = cir.load %[[A_PTR]] : !cir.ptr, !cir.float +// CIR: %[[B6:.*]] = cir.load %[[B_PTR]] : !cir.ptr, !cir.float +// CIR: %[[CMP6:.*]] = cir.cmp(eq, %[[A6]], %[[B6]]) : !cir.float, !cir.bool +// CIR: cir.store %[[CMP6]], %[[X_PTR]] : !cir.bool, !cir.ptr + +// LLVM-LABEL: define void @c0_float(float %0, float %1) { +// LLVM: %[[A_PTR:.*]] = alloca float +// LLVM: %[[B_PTR:.*]] = alloca float +// LLVM: store float %0, ptr %[[A_PTR]] +// LLVM: store float %1, ptr %[[B_PTR]] + +// LLVM: load float, ptr %[[A_PTR]] +// LLVM: load float, ptr %[[B_PTR]] +// LLVM: fcmp ogt float %{{.*}}, %{{.*}} +// LLVM: zext i1 %{{.*}} to i8 + +// LLVM: fcmp olt float %{{.*}}, %{{.*}} +// LLVM: fcmp ole float %{{.*}}, %{{.*}} +// LLVM: fcmp oge float %{{.*}}, %{{.*}} +// LLVM: fcmp une float %{{.*}}, %{{.*}} +// LLVM: fcmp oeq float %{{.*}}, %{{.*}} + +// OGCG-LABEL: define dso_local 
void @_Z8c0_floatff(float {{.*}} %a, float {{.*}} %b) {{.*}} { +// OGCG: %[[A_PTR:.*]] = alloca float +// OGCG: %[[B_PTR:.*]] = alloca float +// OGCG: store float %a, ptr %[[A_PTR]] +// OGCG: store float %b, ptr %[[B_PTR]] + +// OGCG: load float, ptr %[[A_PTR]] +// OGCG: load float, ptr %[[B_PTR]] +// OGCG: fcmp ogt float %{{.*}}, %{{.*}} +// OGCG: zext i1 %{{.*}} to i8 + +// OGCG: fcmp olt float %{{.*}}, %{{.*}} +// OGCG: fcmp ole float %{{.*}}, %{{.*}} +// OGCG: fcmp oge float %{{.*}}, %{{.*}} +// OGCG: fcmp une float %{{.*}}, %{{.*}} +// OGCG: fcmp oeq float %{{.*}}, %{{.*}} + +void pointer_cmp(int *a, int *b) { + bool x = a > b; + x = a < b; + x = a >= b; + x = a <= b; + x = a == b; + x = a != b; +} + +// CIR-LABEL: cir.func @pointer_cmp(%arg0: !cir.ptr{{.*}}, %arg1: !cir.ptr{{.*}}) { +// CIR: %[[A_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] +// CIR: %[[B_PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] + +// CIR: %[[A1:.*]] = cir.load %[[A_PTR]] : !cir.ptr>, !cir.ptr +// CIR: %[[B1:.*]] = cir.load %[[B_PTR]] : !cir.ptr>, !cir.ptr +// CIR: %{{.*}} = cir.cmp(gt, %[[A1]], %[[B1]]) : !cir.ptr, !cir.bool + +// CIR: cir.cmp(lt, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool +// CIR: cir.cmp(ge, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool +// CIR: cir.cmp(le, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool +// CIR: cir.cmp(eq, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool +// CIR: cir.cmp(ne, {{.*}}, {{.*}}) : !cir.ptr, !cir.bool + +// LLVM-LABEL: define void @pointer_cmp(ptr %0, ptr %1) { +// LLVM: %[[A_PTR:.*]] = alloca ptr +// LLVM: %[[B_PTR:.*]] = alloca ptr +// LLVM: store ptr %0, ptr %[[A_PTR]] +// LLVM: store ptr %1, ptr %[[B_PTR]] + +// LLVM: load ptr, ptr %[[A_PTR]] +// LLVM: load ptr, ptr %[[B_PTR]] +// LLVM: icmp ugt ptr %{{.*}}, %{{.*}} +// LLVM: zext i1 %{{.*}} to i8 +// LLVM: icmp ult ptr %{{.*}}, %{{.*}} +// LLVM: icmp uge ptr %{{.*}}, %{{.*}} +// LLVM: icmp ule ptr %{{.*}}, %{{.*}} +// LLVM: icmp eq ptr %{{.*}}, %{{.*}} +// LLVM: icmp ne ptr %{{.*}}, %{{.*}} + +// OGCG-LABEL: define dso_local void @_Z11pointer_cmpPiS_(ptr {{.*}} %a, ptr {{.*}} %b) {{.*}} { +// OGCG: %[[A_PTR:.*]] = alloca ptr +// OGCG: %[[B_PTR:.*]] = alloca ptr +// OGCG: store ptr %a, ptr %[[A_PTR]] +// OGCG: store ptr %b, ptr %[[B_PTR]] + +// OGCG: load ptr, ptr %[[A_PTR]] +// OGCG: load ptr, ptr %[[B_PTR]] +// OGCG: icmp ugt ptr %{{.*}}, %{{.*}} +// OGCG: zext i1 %{{.*}} to i8 +// OGCG: icmp ult ptr %{{.*}}, %{{.*}} +// OGCG: icmp uge ptr %{{.*}}, %{{.*}} +// OGCG: icmp ule ptr %{{.*}}, %{{.*}} +// OGCG: icmp eq ptr %{{.*}}, %{{.*}} +// OGCG: icmp ne ptr %{{.*}}, %{{.*}} + +void bool_cmp(bool a, bool b) { + bool x = a > b; + x = a < b; + x = a >= b; + x = a <= b; + x = a == b; + x = a != b; +} + +// CIR-LABEL: cir.func @bool_cmp(%arg0: !cir.bool{{.*}}, %arg1: !cir.bool{{.*}}) { +// CIR: %[[A_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["a", init] +// CIR: %[[B_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["b", init] +// CIR: %[[X_PTR:.*]] = cir.alloca !cir.bool, !cir.ptr, ["x", init] + +// CIR: %[[A1:.*]] = cir.load %[[A_PTR]] : !cir.ptr, !cir.bool +// CIR: %[[A1_INT:.*]] = cir.cast(bool_to_int, %[[A1]] : !cir.bool), !s32i +// CIR: %[[B1:.*]] = cir.load %[[B_PTR]] : !cir.ptr, !cir.bool +// CIR: %[[B1_INT:.*]] = cir.cast(bool_to_int, %[[B1]] : !cir.bool), !s32i +// CIR: %{{.*}} = cir.cmp(gt, %[[A1_INT]], %[[B1_INT]]) : !s32i, !cir.bool +// CIR: cir.store {{.*}}, %[[X_PTR]] : !cir.bool, !cir.ptr + +// CIR: cir.cmp(lt +// CIR: cir.cmp(ge +// CIR: cir.cmp(le +// CIR: cir.cmp(eq +// CIR: cir.cmp(ne + +// LLVM-LABEL: define void 
@bool_cmp(i1 %0, i1 %1) { +// LLVM: %[[A_PTR:.*]] = alloca i8 +// LLVM: %[[B_PTR:.*]] = alloca i8 +// LLVM: %[[X_PTR:.*]] = alloca i8 +// LLVM: %[[A_INIT:.*]] = zext i1 %0 to i8 +// LLVM: store i8 %[[A_INIT]], ptr %[[A_PTR]] +// LLVM: %[[B_INIT:.*]] = zext i1 %1 to i8 +// LLVM: store i8 %[[B_INIT]], ptr %[[B_PTR]] + +// LLVM: %[[A1:.*]] = load i8, ptr %[[A_PTR]] +// LLVM: %[[A1_TRUNC:.*]] = trunc i8 %[[A1]] to i1 +// LLVM: %[[A1_EXT:.*]] = zext i1 %[[A1_TRUNC]] to i32 +// LLVM: %[[B1:.*]] = load i8, ptr %[[B_PTR]] +// LLVM: %[[B1_TRUNC:.*]] = trunc i8 %[[B1]] to i1 +// LLVM: %[[B1_EXT:.*]] = zext i1 %[[B1_TRUNC]] to i32 +// LLVM: %[[CMP1:.*]] = icmp sgt i32 %[[A1_EXT]], %[[B1_EXT]] +// LLVM: %[[CMP1_BOOL:.*]] = zext i1 %[[CMP1]] to i8 +// LLVM: store i8 %[[CMP1_BOOL]], ptr %[[X_PTR]] + +// LLVM: icmp slt +// LLVM: icmp sge +// LLVM: icmp sle +// LLVM: icmp eq +// LLVM: icmp ne + +// OGCG-LABEL: define dso_local void @_Z8bool_cmpbb(i1 {{.*}} %a, i1 {{.*}} %b) {{.*}} { +// OGCG: %[[A_PTR:.*]] = alloca i8 +// OGCG: %[[B_PTR:.*]] = alloca i8 +// OGCG: %[[X_PTR:.*]] = alloca i8 +// OGCG: %[[A_INIT:.*]] = zext i1 %a to i8 +// OGCG: store i8 %[[A_INIT]], ptr %[[A_PTR]] +// OGCG: %[[B_INIT:.*]] = zext i1 %b to i8 +// OGCG: store i8 %[[B_INIT]], ptr %[[B_PTR]] + +// OGCG: %[[A1:.*]] = load i8, ptr %[[A_PTR]] +// OGCG: %[[A1_TRUNC:.*]] = trunc i8 %[[A1]] to i1 +// OGCG: %[[A1_EXT:.*]] = zext i1 %[[A1_TRUNC]] to i32 +// OGCG: %[[B1:.*]] = load i8, ptr %[[B_PTR]] +// OGCG: %[[B1_TRUNC:.*]] = trunc i8 %[[B1]] to i1 +// OGCG: %[[B1_EXT:.*]] = zext i1 %[[B1_TRUNC]] to i32 +// OGCG: %[[CMP1:.*]] = icmp sgt i32 %[[A1_EXT]], %[[B1_EXT]] +// OGCG: %[[CMP1_BOOL:.*]] = zext i1 %[[CMP1]] to i8 +// OGCG: store i8 %[[CMP1_BOOL]], ptr %[[X_PTR]] + +// OGCG: icmp slt +// OGCG: icmp sge +// OGCG: icmp sle +// OGCG: icmp eq +// OGCG: icmp ne diff --git a/clang/test/CIR/IR/cmp.cir b/clang/test/CIR/IR/cmp.cir new file mode 100644 index 0000000000000..a049dc51f1401 --- /dev/null +++ b/clang/test/CIR/IR/cmp.cir @@ -0,0 +1,359 @@ +// RUN: cir-opt %s | cir-opt | FileCheck %s +!s32i = !cir.int +!u32i = !cir.int + +module { + cir.func @c0(%arg0: !s32i, %arg1: !s32i) { + %0 = cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} + %1 = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} + %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + cir.store %arg0, %0 : !s32i, !cir.ptr + cir.store %arg1, %1 : !s32i, !cir.ptr + %3 = cir.load %0 : !cir.ptr, !s32i + %4 = cir.load %1 : !cir.ptr, !s32i + %5 = cir.cmp(gt, %3, %4) : !s32i, !cir.bool + cir.store %5, %2 : !cir.bool, !cir.ptr + %6 = cir.load %0 : !cir.ptr, !s32i + %7 = cir.load %1 : !cir.ptr, !s32i + %8 = cir.cmp(lt, %6, %7) : !s32i, !cir.bool + cir.store %8, %2 : !cir.bool, !cir.ptr + %9 = cir.load %0 : !cir.ptr, !s32i + %10 = cir.load %1 : !cir.ptr, !s32i + %11 = cir.cmp(le, %9, %10) : !s32i, !cir.bool + cir.store %11, %2 : !cir.bool, !cir.ptr + %12 = cir.load %0 : !cir.ptr, !s32i + %13 = cir.load %1 : !cir.ptr, !s32i + %14 = cir.cmp(ge, %12, %13) : !s32i, !cir.bool + cir.store %14, %2 : !cir.bool, !cir.ptr + %15 = cir.load %0 : !cir.ptr, !s32i + %16 = cir.load %1 : !cir.ptr, !s32i + %17 = cir.cmp(ne, %15, %16) : !s32i, !cir.bool + cir.store %17, %2 : !cir.bool, !cir.ptr + %18 = cir.load %0 : !cir.ptr, !s32i + %19 = cir.load %1 : !cir.ptr, !s32i + %20 = cir.cmp(eq, %18, %19) : !s32i, !cir.bool + cir.store %20, %2 : !cir.bool, !cir.ptr + cir.return + } + + // CHECK: cir.func @c0(%arg0: !s32i, %arg1: !s32i) { + // CHECK-NEXT: %0 = 
cir.alloca !s32i, !cir.ptr, ["a", init] {alignment = 4 : i64} + // CHECK-NEXT: %1 = cir.alloca !s32i, !cir.ptr, ["b", init] {alignment = 4 : i64} + // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + // CHECK-NEXT: cir.store %arg0, %0 : !s32i, !cir.ptr + // CHECK-NEXT: cir.store %arg1, %1 : !s32i, !cir.ptr + // CHECK-NEXT: %3 = cir.load %0 : !cir.ptr, !s32i + // CHECK-NEXT: %4 = cir.load %1 : !cir.ptr, !s32i + // CHECK-NEXT: %5 = cir.cmp(gt, %3, %4) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %5, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %6 = cir.load %0 : !cir.ptr, !s32i + // CHECK-NEXT: %7 = cir.load %1 : !cir.ptr, !s32i + // CHECK-NEXT: %8 = cir.cmp(lt, %6, %7) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %8, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %9 = cir.load %0 : !cir.ptr, !s32i + // CHECK-NEXT: %10 = cir.load %1 : !cir.ptr, !s32i + // CHECK-NEXT: %11 = cir.cmp(le, %9, %10) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %11, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %12 = cir.load %0 : !cir.ptr, !s32i + // CHECK-NEXT: %13 = cir.load %1 : !cir.ptr, !s32i + // CHECK-NEXT: %14 = cir.cmp(ge, %12, %13) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %14, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %15 = cir.load %0 : !cir.ptr, !s32i + // CHECK-NEXT: %16 = cir.load %1 : !cir.ptr, !s32i + // CHECK-NEXT: %17 = cir.cmp(ne, %15, %16) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %17, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %18 = cir.load %0 : !cir.ptr, !s32i + // CHECK-NEXT: %19 = cir.load %1 : !cir.ptr, !s32i + // CHECK-NEXT: %20 = cir.cmp(eq, %18, %19) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %20, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: cir.return + // CHECK-NEXT: } + + cir.func @c0_unsigned(%arg0: !u32i, %arg1: !u32i) { + %0 = cir.alloca !u32i, !cir.ptr, ["a", init] {alignment = 4 : i64} + %1 = cir.alloca !u32i, !cir.ptr, ["b", init] {alignment = 4 : i64} + %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + cir.store %arg0, %0 : !u32i, !cir.ptr + cir.store %arg1, %1 : !u32i, !cir.ptr + %3 = cir.load %0 : !cir.ptr, !u32i + %4 = cir.load %1 : !cir.ptr, !u32i + %5 = cir.cmp(gt, %3, %4) : !u32i, !cir.bool + cir.store %5, %2 : !cir.bool, !cir.ptr + %6 = cir.load %0 : !cir.ptr, !u32i + %7 = cir.load %1 : !cir.ptr, !u32i + %8 = cir.cmp(lt, %6, %7) : !u32i, !cir.bool + cir.store %8, %2 : !cir.bool, !cir.ptr + %9 = cir.load %0 : !cir.ptr, !u32i + %10 = cir.load %1 : !cir.ptr, !u32i + %11 = cir.cmp(le, %9, %10) : !u32i, !cir.bool + cir.store %11, %2 : !cir.bool, !cir.ptr + %12 = cir.load %0 : !cir.ptr, !u32i + %13 = cir.load %1 : !cir.ptr, !u32i + %14 = cir.cmp(ge, %12, %13) : !u32i, !cir.bool + cir.store %14, %2 : !cir.bool, !cir.ptr + %15 = cir.load %0 : !cir.ptr, !u32i + %16 = cir.load %1 : !cir.ptr, !u32i + %17 = cir.cmp(ne, %15, %16) : !u32i, !cir.bool + cir.store %17, %2 : !cir.bool, !cir.ptr + %18 = cir.load %0 : !cir.ptr, !u32i + %19 = cir.load %1 : !cir.ptr, !u32i + %20 = cir.cmp(eq, %18, %19) : !u32i, !cir.bool + cir.store %20, %2 : !cir.bool, !cir.ptr + cir.return + } + + // CHECK: cir.func @c0_unsigned(%arg0: !u32i, %arg1: !u32i) { + // CHECK-NEXT: %0 = cir.alloca !u32i, !cir.ptr, ["a", init] {alignment = 4 : i64} + // CHECK-NEXT: %1 = cir.alloca !u32i, !cir.ptr, ["b", init] {alignment = 4 : i64} + // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + // CHECK-NEXT: cir.store %arg0, %0 : !u32i, !cir.ptr + // CHECK-NEXT: cir.store %arg1, %1 : !u32i, !cir.ptr + // CHECK-NEXT: %3 = cir.load 
%0 : !cir.ptr, !u32i + // CHECK-NEXT: %4 = cir.load %1 : !cir.ptr, !u32i + // CHECK-NEXT: %5 = cir.cmp(gt, %3, %4) : !u32i, !cir.bool + // CHECK-NEXT: cir.store %5, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %6 = cir.load %0 : !cir.ptr, !u32i + // CHECK-NEXT: %7 = cir.load %1 : !cir.ptr, !u32i + // CHECK-NEXT: %8 = cir.cmp(lt, %6, %7) : !u32i, !cir.bool + // CHECK-NEXT: cir.store %8, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %9 = cir.load %0 : !cir.ptr, !u32i + // CHECK-NEXT: %10 = cir.load %1 : !cir.ptr, !u32i + // CHECK-NEXT: %11 = cir.cmp(le, %9, %10) : !u32i, !cir.bool + // CHECK-NEXT: cir.store %11, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %12 = cir.load %0 : !cir.ptr, !u32i + // CHECK-NEXT: %13 = cir.load %1 : !cir.ptr, !u32i + // CHECK-NEXT: %14 = cir.cmp(ge, %12, %13) : !u32i, !cir.bool + // CHECK-NEXT: cir.store %14, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %15 = cir.load %0 : !cir.ptr, !u32i + // CHECK-NEXT: %16 = cir.load %1 : !cir.ptr, !u32i + // CHECK-NEXT: %17 = cir.cmp(ne, %15, %16) : !u32i, !cir.bool + // CHECK-NEXT: cir.store %17, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %18 = cir.load %0 : !cir.ptr, !u32i + // CHECK-NEXT: %19 = cir.load %1 : !cir.ptr, !u32i + // CHECK-NEXT: %20 = cir.cmp(eq, %18, %19) : !u32i, !cir.bool + // CHECK-NEXT: cir.store %20, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: cir.return + // CHECK-NEXT: } + + cir.func @c0_float(%arg0: !cir.float, %arg1: !cir.float) { + %0 = cir.alloca !cir.float, !cir.ptr, ["a", init] {alignment = 4 : i64} + %1 = cir.alloca !cir.float, !cir.ptr, ["b", init] {alignment = 4 : i64} + %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + cir.store %arg0, %0 : !cir.float, !cir.ptr + cir.store %arg1, %1 : !cir.float, !cir.ptr + %3 = cir.load %0 : !cir.ptr, !cir.float + %4 = cir.load %1 : !cir.ptr, !cir.float + %5 = cir.cmp(gt, %3, %4) : !cir.float, !cir.bool + cir.store %5, %2 : !cir.bool, !cir.ptr + %6 = cir.load %0 : !cir.ptr, !cir.float + %7 = cir.load %1 : !cir.ptr, !cir.float + %8 = cir.cmp(lt, %6, %7) : !cir.float, !cir.bool + cir.store %8, %2 : !cir.bool, !cir.ptr + %9 = cir.load %0 : !cir.ptr, !cir.float + %10 = cir.load %1 : !cir.ptr, !cir.float + %11 = cir.cmp(le, %9, %10) : !cir.float, !cir.bool + cir.store %11, %2 : !cir.bool, !cir.ptr + %12 = cir.load %0 : !cir.ptr, !cir.float + %13 = cir.load %1 : !cir.ptr, !cir.float + %14 = cir.cmp(ge, %12, %13) : !cir.float, !cir.bool + cir.store %14, %2 : !cir.bool, !cir.ptr + %15 = cir.load %0 : !cir.ptr, !cir.float + %16 = cir.load %1 : !cir.ptr, !cir.float + %17 = cir.cmp(ne, %15, %16) : !cir.float, !cir.bool + cir.store %17, %2 : !cir.bool, !cir.ptr + %18 = cir.load %0 : !cir.ptr, !cir.float + %19 = cir.load %1 : !cir.ptr, !cir.float + %20 = cir.cmp(eq, %18, %19) : !cir.float, !cir.bool + cir.store %20, %2 : !cir.bool, !cir.ptr + cir.return + } + + // CHECK: cir.func @c0_float(%arg0: !cir.float, %arg1: !cir.float) { + // CHECK-NEXT: %0 = cir.alloca !cir.float, !cir.ptr, ["a", init] {alignment = 4 : i64} + // CHECK-NEXT: %1 = cir.alloca !cir.float, !cir.ptr, ["b", init] {alignment = 4 : i64} + // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + // CHECK-NEXT: cir.store %arg0, %0 : !cir.float, !cir.ptr + // CHECK-NEXT: cir.store %arg1, %1 : !cir.float, !cir.ptr + // CHECK-NEXT: %3 = cir.load %0 : !cir.ptr, !cir.float + // CHECK-NEXT: %4 = cir.load %1 : !cir.ptr, !cir.float + // CHECK-NEXT: %5 = cir.cmp(gt, %3, %4) : !cir.float, !cir.bool + // CHECK-NEXT: cir.store %5, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %6 = 
cir.load %0 : !cir.ptr, !cir.float + // CHECK-NEXT: %7 = cir.load %1 : !cir.ptr, !cir.float + // CHECK-NEXT: %8 = cir.cmp(lt, %6, %7) : !cir.float, !cir.bool + // CHECK-NEXT: cir.store %8, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %9 = cir.load %0 : !cir.ptr, !cir.float + // CHECK-NEXT: %10 = cir.load %1 : !cir.ptr, !cir.float + // CHECK-NEXT: %11 = cir.cmp(le, %9, %10) : !cir.float, !cir.bool + // CHECK-NEXT: cir.store %11, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %12 = cir.load %0 : !cir.ptr, !cir.float + // CHECK-NEXT: %13 = cir.load %1 : !cir.ptr, !cir.float + // CHECK-NEXT: %14 = cir.cmp(ge, %12, %13) : !cir.float, !cir.bool + // CHECK-NEXT: cir.store %14, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %15 = cir.load %0 : !cir.ptr, !cir.float + // CHECK-NEXT: %16 = cir.load %1 : !cir.ptr, !cir.float + // CHECK-NEXT: %17 = cir.cmp(ne, %15, %16) : !cir.float, !cir.bool + // CHECK-NEXT: cir.store %17, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %18 = cir.load %0 : !cir.ptr, !cir.float + // CHECK-NEXT: %19 = cir.load %1 : !cir.ptr, !cir.float + // CHECK-NEXT: %20 = cir.cmp(eq, %18, %19) : !cir.float, !cir.bool + // CHECK-NEXT: cir.store %20, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: cir.return + // CHECK-NEXT: } + + cir.func @pointer_cmp(%arg0: !cir.ptr, %arg1: !cir.ptr) { + %0 = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] {alignment = 8 : i64} + %1 = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] {alignment = 8 : i64} + %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + cir.store %arg0, %0 : !cir.ptr, !cir.ptr> + cir.store %arg1, %1 : !cir.ptr, !cir.ptr> + %3 = cir.load %0 : !cir.ptr>, !cir.ptr + %4 = cir.load %1 : !cir.ptr>, !cir.ptr + %5 = cir.cmp(gt, %3, %4) : !cir.ptr, !cir.bool + cir.store %5, %2 : !cir.bool, !cir.ptr + %6 = cir.load %0 : !cir.ptr>, !cir.ptr + %7 = cir.load %1 : !cir.ptr>, !cir.ptr + %8 = cir.cmp(lt, %6, %7) : !cir.ptr, !cir.bool + cir.store %8, %2 : !cir.bool, !cir.ptr + %9 = cir.load %0 : !cir.ptr>, !cir.ptr + %10 = cir.load %1 : !cir.ptr>, !cir.ptr + %11 = cir.cmp(ge, %9, %10) : !cir.ptr, !cir.bool + cir.store %11, %2 : !cir.bool, !cir.ptr + %12 = cir.load %0 : !cir.ptr>, !cir.ptr + %13 = cir.load %1 : !cir.ptr>, !cir.ptr + %14 = cir.cmp(le, %12, %13) : !cir.ptr, !cir.bool + cir.store %14, %2 : !cir.bool, !cir.ptr + %15 = cir.load %0 : !cir.ptr>, !cir.ptr + %16 = cir.load %1 : !cir.ptr>, !cir.ptr + %17 = cir.cmp(eq, %15, %16) : !cir.ptr, !cir.bool + cir.store %17, %2 : !cir.bool, !cir.ptr + %18 = cir.load %0 : !cir.ptr>, !cir.ptr + %19 = cir.load %1 : !cir.ptr>, !cir.ptr + %20 = cir.cmp(ne, %18, %19) : !cir.ptr, !cir.bool + cir.store %20, %2 : !cir.bool, !cir.ptr + cir.return + } + + // CHECK: cir.func @pointer_cmp(%arg0: !cir.ptr, %arg1: !cir.ptr) { + // CHECK-NEXT: %0 = cir.alloca !cir.ptr, !cir.ptr>, ["a", init] {alignment = 8 : i64} + // CHECK-NEXT: %1 = cir.alloca !cir.ptr, !cir.ptr>, ["b", init] {alignment = 8 : i64} + // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + // CHECK-NEXT: cir.store %arg0, %0 : !cir.ptr, !cir.ptr> + // CHECK-NEXT: cir.store %arg1, %1 : !cir.ptr, !cir.ptr> + // CHECK-NEXT: %3 = cir.load %0 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %4 = cir.load %1 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %5 = cir.cmp(gt, %3, %4) : !cir.ptr, !cir.bool + // CHECK-NEXT: cir.store %5, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %6 = cir.load %0 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %7 = cir.load %1 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %8 = cir.cmp(lt, %6, %7) : !cir.ptr, !cir.bool + // CHECK-NEXT: 
cir.store %8, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %9 = cir.load %0 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %10 = cir.load %1 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %11 = cir.cmp(ge, %9, %10) : !cir.ptr, !cir.bool + // CHECK-NEXT: cir.store %11, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %12 = cir.load %0 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %13 = cir.load %1 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %14 = cir.cmp(le, %12, %13) : !cir.ptr, !cir.bool + // CHECK-NEXT: cir.store %14, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %15 = cir.load %0 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %16 = cir.load %1 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %17 = cir.cmp(eq, %15, %16) : !cir.ptr, !cir.bool + // CHECK-NEXT: cir.store %17, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %18 = cir.load %0 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %19 = cir.load %1 : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %20 = cir.cmp(ne, %18, %19) : !cir.ptr, !cir.bool + // CHECK-NEXT: cir.store %20, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: cir.return + // CHECK-NEXT: } + + cir.func @bool_cmp(%arg0: !cir.bool, %arg1: !cir.bool) { + %0 = cir.alloca !cir.bool, !cir.ptr, ["a", init] {alignment = 1 : i64} + %1 = cir.alloca !cir.bool, !cir.ptr, ["b", init] {alignment = 1 : i64} + %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + cir.store %arg0, %0 : !cir.bool, !cir.ptr + cir.store %arg1, %1 : !cir.bool, !cir.ptr + %3 = cir.load %0 : !cir.ptr, !cir.bool + %4 = cir.cast(bool_to_int, %3 : !cir.bool), !s32i + %5 = cir.load %1 : !cir.ptr, !cir.bool + %6 = cir.cast(bool_to_int, %5 : !cir.bool), !s32i + %7 = cir.cmp(gt, %4, %6) : !s32i, !cir.bool + cir.store %7, %2 : !cir.bool, !cir.ptr + %8 = cir.load %0 : !cir.ptr, !cir.bool + %9 = cir.cast(bool_to_int, %8 : !cir.bool), !s32i + %10 = cir.load %1 : !cir.ptr, !cir.bool + %11 = cir.cast(bool_to_int, %10 : !cir.bool), !s32i + %12 = cir.cmp(lt, %9, %11) : !s32i, !cir.bool + cir.store %12, %2 : !cir.bool, !cir.ptr + %13 = cir.load %0 : !cir.ptr, !cir.bool + %14 = cir.cast(bool_to_int, %13 : !cir.bool), !s32i + %15 = cir.load %1 : !cir.ptr, !cir.bool + %16 = cir.cast(bool_to_int, %15 : !cir.bool), !s32i + %17 = cir.cmp(ge, %14, %16) : !s32i, !cir.bool + cir.store %17, %2 : !cir.bool, !cir.ptr + %18 = cir.load %0 : !cir.ptr, !cir.bool + %19 = cir.cast(bool_to_int, %18 : !cir.bool), !s32i + %20 = cir.load %1 : !cir.ptr, !cir.bool + %21 = cir.cast(bool_to_int, %20 : !cir.bool), !s32i + %22 = cir.cmp(le, %19, %21) : !s32i, !cir.bool + cir.store %22, %2 : !cir.bool, !cir.ptr + %23 = cir.load %0 : !cir.ptr, !cir.bool + %24 = cir.cast(bool_to_int, %23 : !cir.bool), !s32i + %25 = cir.load %1 : !cir.ptr, !cir.bool + %26 = cir.cast(bool_to_int, %25 : !cir.bool), !s32i + %27 = cir.cmp(eq, %24, %26) : !s32i, !cir.bool + cir.store %27, %2 : !cir.bool, !cir.ptr + %28 = cir.load %0 : !cir.ptr, !cir.bool + %29 = cir.cast(bool_to_int, %28 : !cir.bool), !s32i + %30 = cir.load %1 : !cir.ptr, !cir.bool + %31 = cir.cast(bool_to_int, %30 : !cir.bool), !s32i + %32 = cir.cmp(ne, %29, %31) : !s32i, !cir.bool + cir.store %32, %2 : !cir.bool, !cir.ptr + cir.return + } + + // CHECK: cir.func @bool_cmp(%arg0: !cir.bool, %arg1: !cir.bool) { + // CHECK-NEXT: %0 = cir.alloca !cir.bool, !cir.ptr, ["a", init] {alignment = 1 : i64} + // CHECK-NEXT: %1 = cir.alloca !cir.bool, !cir.ptr, ["b", init] {alignment = 1 : i64} + // CHECK-NEXT: %2 = cir.alloca !cir.bool, !cir.ptr, ["x", init] {alignment = 1 : i64} + // CHECK-NEXT: cir.store %arg0, %0 : !cir.bool, !cir.ptr + // CHECK-NEXT: cir.store %arg1, %1 : !cir.bool, 
!cir.ptr + // CHECK-NEXT: %3 = cir.load %0 : !cir.ptr, !cir.bool + // CHECK-NEXT: %4 = cir.cast(bool_to_int, %3 : !cir.bool), !s32i + // CHECK-NEXT: %5 = cir.load %1 : !cir.ptr, !cir.bool + // CHECK-NEXT: %6 = cir.cast(bool_to_int, %5 : !cir.bool), !s32i + // CHECK-NEXT: %7 = cir.cmp(gt, %4, %6) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %7, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %8 = cir.load %0 : !cir.ptr, !cir.bool + // CHECK-NEXT: %9 = cir.cast(bool_to_int, %8 : !cir.bool), !s32i + // CHECK-NEXT: %10 = cir.load %1 : !cir.ptr, !cir.bool + // CHECK-NEXT: %11 = cir.cast(bool_to_int, %10 : !cir.bool), !s32i + // CHECK-NEXT: %12 = cir.cmp(lt, %9, %11) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %12, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %13 = cir.load %0 : !cir.ptr, !cir.bool + // CHECK-NEXT: %14 = cir.cast(bool_to_int, %13 : !cir.bool), !s32i + // CHECK-NEXT: %15 = cir.load %1 : !cir.ptr, !cir.bool + // CHECK-NEXT: %16 = cir.cast(bool_to_int, %15 : !cir.bool), !s32i + // CHECK-NEXT: %17 = cir.cmp(ge, %14, %16) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %17, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %18 = cir.load %0 : !cir.ptr, !cir.bool + // CHECK-NEXT: %19 = cir.cast(bool_to_int, %18 : !cir.bool), !s32i + // CHECK-NEXT: %20 = cir.load %1 : !cir.ptr, !cir.bool + // CHECK-NEXT: %21 = cir.cast(bool_to_int, %20 : !cir.bool), !s32i + // CHECK-NEXT: %22 = cir.cmp(le, %19, %21) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %22, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %23 = cir.load %0 : !cir.ptr, !cir.bool + // CHECK-NEXT: %24 = cir.cast(bool_to_int, %23 : !cir.bool), !s32i + // CHECK-NEXT: %25 = cir.load %1 : !cir.ptr, !cir.bool + // CHECK-NEXT: %26 = cir.cast(bool_to_int, %25 : !cir.bool), !s32i + // CHECK-NEXT: %27 = cir.cmp(eq, %24, %26) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %27, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: %28 = cir.load %0 : !cir.ptr, !cir.bool + // CHECK-NEXT: %29 = cir.cast(bool_to_int, %28 : !cir.bool), !s32i + // CHECK-NEXT: %30 = cir.load %1 : !cir.ptr, !cir.bool + // CHECK-NEXT: %31 = cir.cast(bool_to_int, %30 : !cir.bool), !s32i + // CHECK-NEXT: %32 = cir.cmp(ne, %29, %31) : !s32i, !cir.bool + // CHECK-NEXT: cir.store %32, %2 : !cir.bool, !cir.ptr + // CHECK-NEXT: cir.return + // CHECK-NEXT: } +} From 02a708b93b9ebe5e4fbc2b266da94677e6f793d3 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 8 Apr 2025 13:01:54 -0400 Subject: [PATCH 1023/1029] [SLP][NFC]Extract TryToFindDuplicates lambda into a separate function, NFC Reviewers: RKSimon, hiraditya Reviewed By: hiraditya, RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/134873 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 161 ++++++++++-------- 1 file changed, 88 insertions(+), 73 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0e6f7e8435e3a..0d415ad689018 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9062,87 +9062,101 @@ getMainAltOpsNoStateVL(ArrayRef VL) { return std::make_pair(MainOp, AltOp); } +/// Checks that every instruction appears once in the list and if not, packs +/// them, building \p ReuseShuffleIndices mask. The list of unique scalars is +/// extended by poison values to the whole register size. 
+static bool tryToFindDuplicates(SmallVectorImpl &VL, + SmallVectorImpl &ReuseShuffleIndices, + const TargetTransformInfo &TTI, + const TargetLibraryInfo &TLI, + const InstructionsState &S, + const BoUpSLP::EdgeInfo &UserTreeIdx, + bool DoNotFail) { + // Check that every instruction appears once in this bundle. + SmallVector UniqueValues; + SmallVector NonUniqueValueVL; + SmallDenseMap UniquePositions(VL.size()); + for (Value *V : VL) { + if (isConstant(V)) { + ReuseShuffleIndices.emplace_back( + isa(V) ? PoisonMaskElem : UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndices.emplace_back(Res.first->second); + if (Res.second) + UniqueValues.emplace_back(V); + } + size_t NumUniqueScalarValues = UniqueValues.size(); + bool IsFullVectors = hasFullVectorsOrPowerOf2( + TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues); + if (NumUniqueScalarValues == VL.size() && + (VectorizeNonPowerOf2 || IsFullVectors)) { + ReuseShuffleIndices.clear(); + } else { + // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. + if ((UserTreeIdx.UserTE && + UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) || + !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) { + LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " + "for nodes with padding.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + if (NumUniqueScalarValues <= 1 || !IsFullVectors || + (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) { + return isa(V) || !isConstant(V); + }))) { + if (DoNotFail && UniquePositions.size() > 1 && + NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && + all_of(UniqueValues, IsaPred)) { + // Find the number of elements, which forms full vectors. + unsigned PWSz = getFullVectorNumberOfElements( + TTI, UniqueValues.front()->getType(), UniqueValues.size()); + PWSz = std::min(PWSz, VL.size()); + if (PWSz == VL.size()) { + ReuseShuffleIndices.clear(); + } else { + NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); + NonUniqueValueVL.append( + PWSz - UniqueValues.size(), + PoisonValue::get(UniqueValues.front()->getType())); + // Check that extended with poisons operations are still valid for + // vectorization (div/rem are not allowed). + if (!getSameOpcode(NonUniqueValueVL, TLI).valid()) { + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + return false; + } + VL = NonUniqueValueVL; + } + return true; + } + LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + return false; + } + VL = UniqueValues; + } + return true; +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx, unsigned InterleaveFactor) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); SmallVector ReuseShuffleIndices; - SmallVector UniqueValues; - SmallVector NonUniqueValueVL; + SmallVector NonUniqueValueVL(VL.begin(), VL.end()); auto TryToFindDuplicates = [&](const InstructionsState &S, bool DoNotFail = false) { - // Check that every instruction appears once in this bundle. - SmallDenseMap UniquePositions(VL.size()); - for (Value *V : VL) { - if (isConstant(V)) { - ReuseShuffleIndices.emplace_back( - isa(V) ? 
PoisonMaskElem : UniqueValues.size()); - UniqueValues.emplace_back(V); - continue; - } - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); - ReuseShuffleIndices.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); - } - size_t NumUniqueScalarValues = UniqueValues.size(); - bool IsFullVectors = hasFullVectorsOrPowerOf2( - *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues); - if (NumUniqueScalarValues == VL.size() && - (VectorizeNonPowerOf2 || IsFullVectors)) { - ReuseShuffleIndices.clear(); - } else { - // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. - if ((UserTreeIdx.UserTE && - UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) || - !hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), - VL.size())) { - LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " - "for nodes with padding.\n"); - auto Invalid = ScheduleBundle::invalid(); - newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx); - return false; - } - LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || !IsFullVectors || - (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) { - return isa(V) || !isConstant(V); - }))) { - if (DoNotFail && UniquePositions.size() > 1 && - NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && - all_of(UniqueValues, IsaPred)) { - // Find the number of elements, which forms full vectors. - unsigned PWSz = getFullVectorNumberOfElements( - *TTI, UniqueValues.front()->getType(), UniqueValues.size()); - PWSz = std::min(PWSz, VL.size()); - if (PWSz == VL.size()) { - ReuseShuffleIndices.clear(); - } else { - NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end()); - NonUniqueValueVL.append( - PWSz - UniqueValues.size(), - PoisonValue::get(UniqueValues.front()->getType())); - // Check that extended with poisons operations are still valid for - // vectorization (div/rem are not allowed). 
- if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - auto Invalid = ScheduleBundle::invalid(); - newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx); - return false; - } - VL = NonUniqueValueVL; - } - return true; - } - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - auto Invalid = ScheduleBundle::invalid(); - newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx); - return false; - } - VL = UniqueValues; + if (tryToFindDuplicates(NonUniqueValueVL, ReuseShuffleIndices, *TTI, *TLI, + S, UserTreeIdx, DoNotFail)) { + VL = NonUniqueValueVL; + return true; } - return true; + auto Invalid = ScheduleBundle::invalid(); + newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx); + return false; }; InstructionsState S = getSameOpcode(VL, *TLI); @@ -9610,8 +9624,9 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, BlockScheduling &BS = *BSRef; + SetVector UniqueValues(VL.begin(), VL.end()); std::optional BundlePtr = - BS.tryScheduleBundle(UniqueValues, this, S); + BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); From edcbd4a21179ca5e0fa9095d28a38fe10de66322 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 8 Apr 2025 13:02:31 -0400 Subject: [PATCH 1024/1029] [SLP][NFC]Extract a check for strided loads into separate function, NFC Reviewers: hiraditya, RKSimon Reviewed By: RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/134876 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 115 +++++++++++------- 1 file changed, 69 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0d415ad689018..8d411f2cb203a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5597,6 +5597,71 @@ static bool isMaskedLoadCompress( return TotalVecCost < GatherCost; } +/// Checks if strided loads can be generated out of \p VL loads with pointers \p +/// PointerOps: +/// 1. Target with strided load support is detected. +/// 2. The number of loads is greater than MinProfitableStridedLoads, or the +/// potential stride <= MaxProfitableLoadStride and the potential stride is +/// power-of-2 (to avoid perf regressions for the very small number of loads) +/// and max distance > number of loads, or potential stride is -1. +/// 3. The loads are ordered, or number of unordered loads <= +/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is +/// to avoid extra costs for very expensive shuffles). +/// 4. Any pointer operand is an instruction with the users outside of the +/// current graph (for masked gathers extra extractelement instructions +/// might be required). 
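+/// For example (illustrative numbers): four loads at element offsets 0, 3, 6 +/// and 9 from a common base give Diff = 9 and a candidate Stride of +/// 9 / (4 - 1) = 3; every pairwise distance (0, 3, 6, 9) is a distinct +/// multiple of 3, so, provided the profitability checks above hold and the +/// target reports strided loads of this type as legal, the bundle can be +/// emitted as a single strided load.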
+static bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, + ArrayRef Order, + const TargetTransformInfo &TTI, const DataLayout &DL, + ScalarEvolution &SE, + const bool IsAnyPointerUsedOutGraph, const int Diff) { + const unsigned Sz = VL.size(); + const unsigned AbsoluteDiff = std::abs(Diff); + Type *ScalarTy = VL.front()->getType(); + auto *VecTy = getWidenedType(ScalarTy, Sz); + if (IsAnyPointerUsedOutGraph || + (AbsoluteDiff > Sz && + (Sz > MinProfitableStridedLoads || + (AbsoluteDiff <= MaxProfitableLoadStride * Sz && + AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || + Diff == -(static_cast(Sz) - 1)) { + int Stride = Diff / static_cast(Sz - 1); + if (Diff != Stride * static_cast(Sz - 1)) + return false; + Align Alignment = + cast(Order.empty() ? VL.front() : VL[Order.front()]) + ->getAlign(); + if (!TTI.isLegalStridedLoadStore(VecTy, Alignment)) + return false; + Value *Ptr0; + Value *PtrN; + if (Order.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[Order.front()]; + PtrN = PointerOps[Order.back()]; + } + // Iterate through all pointers and check if all distances are + // unique multiple of Dist. + SmallSet Dists; + for (Value *Ptr : PointerOps) { + int Dist = 0; + if (Ptr == PtrN) + Dist = Diff; + else if (Ptr != Ptr0) + Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); + // If the strides are not the same or repeated, we can't + // vectorize. + if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) + break; + } + if (Dists.size() == Sz) + return true; + } + return false; +} + BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, @@ -5670,59 +5735,17 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, return LoadsState::Vectorize; // Simple check if not a strided access - clear order. bool IsPossibleStrided = *Diff % (Sz - 1) == 0; - // Try to generate strided load node if: - // 1. Target with strided load support is detected. - // 2. The number of loads is greater than MinProfitableStridedLoads, - // or the potential stride <= MaxProfitableLoadStride and the - // potential stride is power-of-2 (to avoid perf regressions for the very - // small number of loads) and max distance > number of loads, or potential - // stride is -1. - // 3. The loads are ordered, or number of unordered loads <= - // MaxProfitableUnorderedLoads, or loads are in reversed order. - // (this check is to avoid extra costs for very expensive shuffles). - // 4. Any pointer operand is an instruction with the users outside of the - // current graph (for masked gathers extra extractelement instructions - // might be required). + // Try to generate strided load node. auto IsAnyPointerUsedOutGraph = IsPossibleStrided && any_of(PointerOps, [&](Value *V) { return isa(V) && any_of(V->users(), [&](User *U) { return !isVectorized(U) && !MustGather.contains(U); }); }); - const unsigned AbsoluteDiff = std::abs(*Diff); if (IsPossibleStrided && - (IsAnyPointerUsedOutGraph || - (AbsoluteDiff > Sz && - (Sz > MinProfitableStridedLoads || - (AbsoluteDiff <= MaxProfitableLoadStride * Sz && - AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || - *Diff == -(static_cast(Sz) - 1))) { - int Stride = *Diff / static_cast(Sz - 1); - if (*Diff == Stride * static_cast(Sz - 1)) { - Align Alignment = - cast(Order.empty() ? 
VL.front() : VL[Order.front()]) - ->getAlign(); - if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) { - // Iterate through all pointers and check if all distances are - // unique multiple of Dist. - SmallSet Dists; - for (Value *Ptr : PointerOps) { - int Dist = 0; - if (Ptr == PtrN) - Dist = *Diff; - else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE); - // If the strides are not the same or repeated, we can't - // vectorize. - if (((Dist / Stride) * Stride) != Dist || - !Dists.insert(Dist).second) - break; - } - if (Dists.size() == Sz) - return LoadsState::StridedVectorize; - } - } - } + isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, + IsAnyPointerUsedOutGraph, *Diff)) + return LoadsState::StridedVectorize; bool IsMasked; unsigned InterleaveFactor; SmallVector CompressMask; From 231aa3070dcd91e10e9972d20f7557c0068c41e3 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Tue, 8 Apr 2025 10:06:28 -0700 Subject: [PATCH 1025/1029] [OpenACC][CIR] Basic infrastructure for OpenACC lowering (#134717) This is the first of a few patches that will do infrastructure work to enable the OpenACC lowering via the OpenACC dialect. At the moment this just gets the various function calls that will end up generating OpenACC, plus some tests to validate that we're doing the diagnostics in OpenACC specific locations. Additionally, this adds Stmt and Decl files for CIRGen. --- clang/include/clang/AST/DeclOpenACC.h | 2 + clang/include/clang/AST/GlobalDecl.h | 3 + .../clang/Basic/DiagnosticDriverKinds.td | 5 + clang/lib/AST/DeclOpenACC.cpp | 5 + clang/lib/CIR/CodeGen/CIRGenDecl.cpp | 7 ++ clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp | 34 +++++++ clang/lib/CIR/CodeGen/CIRGenFunction.h | 30 ++++++ clang/lib/CIR/CodeGen/CIRGenModule.cpp | 12 +++ clang/lib/CIR/CodeGen/CIRGenModule.h | 2 + clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 44 ++++++--- clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 91 +++++++++++++++++++ clang/lib/CIR/CodeGen/CMakeLists.txt | 2 + clang/lib/Frontend/CompilerInvocation.cpp | 49 ++++++++++ .../openacc-not-implemented-global.cpp | 6 ++ .../openacc-not-implemented.cpp | 20 ++++ clang/test/Driver/openacc-no-cir.c | 6 ++ 16 files changed, 303 insertions(+), 15 deletions(-) create mode 100644 clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp create mode 100644 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp create mode 100644 clang/test/CIR/CodeGenOpenACC/openacc-not-implemented-global.cpp create mode 100644 clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp create mode 100644 clang/test/Driver/openacc-no-cir.c diff --git a/clang/include/clang/AST/DeclOpenACC.h b/clang/include/clang/AST/DeclOpenACC.h index 26cf721561fb1..8c612fbf1ec07 100644 --- a/clang/include/clang/AST/DeclOpenACC.h +++ b/clang/include/clang/AST/DeclOpenACC.h @@ -59,6 +59,8 @@ class OpenACCConstructDecl : public Decl { } ArrayRef clauses() const { return Clauses; } + static bool classof(const Decl *D) { return classofKind(D->getKind()); } + static bool classofKind(Kind K); }; class OpenACCDeclareDecl final diff --git a/clang/include/clang/AST/GlobalDecl.h b/clang/include/clang/AST/GlobalDecl.h index df11a79a56b3b..baf5371d2682d 100644 --- a/clang/include/clang/AST/GlobalDecl.h +++ b/clang/include/clang/AST/GlobalDecl.h @@ -17,6 +17,7 @@ #include "clang/AST/Attr.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" +#include "clang/AST/DeclOpenACC.h" #include "clang/AST/DeclOpenMP.h" #include "clang/AST/DeclTemplate.h" #include "clang/Basic/ABI.h" @@ -86,6 +87,8 @@ class GlobalDecl { 
GlobalDecl(const ObjCMethodDecl *D) { Init(D); } GlobalDecl(const OMPDeclareReductionDecl *D) { Init(D); } GlobalDecl(const OMPDeclareMapperDecl *D) { Init(D); } + GlobalDecl(const OpenACCRoutineDecl *D) { Init(D); } + GlobalDecl(const OpenACCDeclareDecl *D) { Init(D); } GlobalDecl(const CXXConstructorDecl *D, CXXCtorType Type) : Value(D, Type) {} GlobalDecl(const CXXDestructorDecl *D, CXXDtorType Type) : Value(D, Type) {} GlobalDecl(const VarDecl *D, DynamicInitKind StubKind) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index a96b4fb33390c..c69ad3adc5b3b 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -850,4 +850,9 @@ def warn_missing_include_dirs : Warning< def err_drv_malformed_warning_suppression_mapping : Error< "failed to process suppression mapping file '%0': %1">; + +def warn_drv_openacc_without_cir + : Warning<"OpenACC directives will result in no runtime behavior; use " + "-fclangir to enable runtime effect">, + InGroup; } diff --git a/clang/lib/AST/DeclOpenACC.cpp b/clang/lib/AST/DeclOpenACC.cpp index 760c08d21cccd..e0fe7be8fc1a3 100644 --- a/clang/lib/AST/DeclOpenACC.cpp +++ b/clang/lib/AST/DeclOpenACC.cpp @@ -17,6 +17,11 @@ using namespace clang; +bool OpenACCConstructDecl::classofKind(Kind K) { + return OpenACCDeclareDecl::classofKind(K) || + OpenACCRoutineDecl::classofKind(K); +} + OpenACCDeclareDecl * OpenACCDeclareDecl::Create(ASTContext &Ctx, DeclContext *DC, SourceLocation StartLoc, SourceLocation DirLoc, diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 5b832b463e752..d0eb648683e8c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -15,6 +15,7 @@ #include "mlir/IR/Location.h" #include "clang/AST/Attr.h" #include "clang/AST/Decl.h" +#include "clang/AST/DeclOpenACC.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "clang/CIR/MissingFeatures.h" @@ -266,6 +267,12 @@ void CIRGenFunction::emitDecl(const Decl &d) { emitVarDecl(vd); return; } + case Decl::OpenACCDeclare: + emitOpenACCDeclare(cast(d)); + return; + case Decl::OpenACCRoutine: + emitOpenACCRoutine(cast(d)); + return; default: cgm.errorNYI(d.getSourceRange(), "emitDecl: unhandled decl type"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp new file mode 100644 index 0000000000000..b588a50aa0404 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenDeclOpenACC.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit Decl nodes as CIR code. 
+// +//===----------------------------------------------------------------------===// + +#include "CIRGenFunction.h" +#include "clang/AST/DeclOpenACC.h" + +using namespace clang; +using namespace clang::CIRGen; + +void CIRGenFunction::emitOpenACCDeclare(const OpenACCDeclareDecl &d) { + getCIRGenModule().errorNYI(d.getSourceRange(), "OpenACC Declare Construct"); +} + +void CIRGenFunction::emitOpenACCRoutine(const OpenACCRoutineDecl &d) { + getCIRGenModule().errorNYI(d.getSourceRange(), "OpenACC Routine Construct"); +} + +void CIRGenModule::emitGlobalOpenACCDecl(const OpenACCConstructDecl *d) { + if (isa(d)) + errorNYI(d->getSourceRange(), "OpenACC Routine Construct"); + else if (isa(d)) + errorNYI(d->getSourceRange(), "OpenACC Declare Construct"); + else + llvm_unreachable("unknown OpenACC declaration kind?"); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 1bedbe28ae625..f505ed8fff311 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -509,6 +509,36 @@ class CIRGenFunction : public CIRGenTypeCache { public: Address createTempAlloca(mlir::Type ty, CharUnits align, mlir::Location loc, const Twine &name, bool insertIntoFnEntryBlock); + + //===--------------------------------------------------------------------===// + // OpenACC Emission + //===--------------------------------------------------------------------===// +public: + mlir::LogicalResult + emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s); + mlir::LogicalResult emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s); + mlir::LogicalResult + emitOpenACCCombinedConstruct(const OpenACCCombinedConstruct &s); + mlir::LogicalResult emitOpenACCDataConstruct(const OpenACCDataConstruct &s); + mlir::LogicalResult + emitOpenACCEnterDataConstruct(const OpenACCEnterDataConstruct &s); + mlir::LogicalResult + emitOpenACCExitDataConstruct(const OpenACCExitDataConstruct &s); + mlir::LogicalResult + emitOpenACCHostDataConstruct(const OpenACCHostDataConstruct &s); + mlir::LogicalResult emitOpenACCWaitConstruct(const OpenACCWaitConstruct &s); + mlir::LogicalResult emitOpenACCInitConstruct(const OpenACCInitConstruct &s); + mlir::LogicalResult + emitOpenACCShutdownConstruct(const OpenACCShutdownConstruct &s); + mlir::LogicalResult emitOpenACCSetConstruct(const OpenACCSetConstruct &s); + mlir::LogicalResult + emitOpenACCUpdateConstruct(const OpenACCUpdateConstruct &s); + mlir::LogicalResult + emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s); + mlir::LogicalResult emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s); + + void emitOpenACCDeclare(const OpenACCDeclareDecl &d); + void emitOpenACCRoutine(const OpenACCRoutineDecl &d); }; } // namespace clang::CIRGen diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index d3b3b0632c2f0..f0e9b0349f709 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -16,6 +16,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclBase.h" +#include "clang/AST/DeclOpenACC.h" #include "clang/AST/GlobalDecl.h" #include "clang/Basic/SourceManager.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" @@ -91,6 +92,11 @@ mlir::Location CIRGenModule::getLoc(SourceRange cRange) { } void CIRGenModule::emitGlobal(clang::GlobalDecl gd) { + if (const auto *cd = dyn_cast(gd.getDecl())) { + emitGlobalOpenACCDecl(cd); + return; + } + const auto *global = cast(gd.getDecl()); if (const auto *fd = dyn_cast(global)) { @@ -423,6 
     emitGlobal(vd);
     break;
   }
+  case Decl::OpenACCRoutine:
+    emitGlobalOpenACCDecl(cast<OpenACCRoutineDecl>(decl));
+    break;
+  case Decl::OpenACCDeclare:
+    emitGlobalOpenACCDecl(cast<OpenACCDeclareDecl>(decl));
+    break;
   }
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 6ba1ccc4ddd9f..ab4545effde45 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -113,6 +113,8 @@ class CIRGenModule : public CIRGenTypeCache {
   void emitGlobalVarDefinition(const clang::VarDecl *vd,
                                bool isTentative = false);
 
+  void emitGlobalOpenACCDecl(const clang::OpenACCConstructDecl *cd);
+
   /// Return the result of value-initializing the given type, i.e. a null
   /// expression of the given type.
   mlir::Value emitNullConstant(QualType t, mlir::Location loc);
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index 00d33e7feddff..2551a670b5325 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -16,6 +16,7 @@
 #include "mlir/IR/Builders.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/Stmt.h"
+#include "clang/AST/StmtOpenACC.h"
 
 using namespace clang;
 using namespace clang::CIRGen;
@@ -85,7 +86,34 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s,
     return emitWhileStmt(cast<WhileStmt>(*s));
   case Stmt::DoStmtClass:
     return emitDoStmt(cast<DoStmt>(*s));
-
+  case Stmt::OpenACCComputeConstructClass:
+    return emitOpenACCComputeConstruct(cast<OpenACCComputeConstruct>(*s));
+  case Stmt::OpenACCLoopConstructClass:
+    return emitOpenACCLoopConstruct(cast<OpenACCLoopConstruct>(*s));
+  case Stmt::OpenACCCombinedConstructClass:
+    return emitOpenACCCombinedConstruct(cast<OpenACCCombinedConstruct>(*s));
+  case Stmt::OpenACCDataConstructClass:
+    return emitOpenACCDataConstruct(cast<OpenACCDataConstruct>(*s));
+  case Stmt::OpenACCEnterDataConstructClass:
+    return emitOpenACCEnterDataConstruct(cast<OpenACCEnterDataConstruct>(*s));
+  case Stmt::OpenACCExitDataConstructClass:
+    return emitOpenACCExitDataConstruct(cast<OpenACCExitDataConstruct>(*s));
+  case Stmt::OpenACCHostDataConstructClass:
+    return emitOpenACCHostDataConstruct(cast<OpenACCHostDataConstruct>(*s));
+  case Stmt::OpenACCWaitConstructClass:
+    return emitOpenACCWaitConstruct(cast<OpenACCWaitConstruct>(*s));
+  case Stmt::OpenACCInitConstructClass:
+    return emitOpenACCInitConstruct(cast<OpenACCInitConstruct>(*s));
+  case Stmt::OpenACCShutdownConstructClass:
+    return emitOpenACCShutdownConstruct(cast<OpenACCShutdownConstruct>(*s));
+  case Stmt::OpenACCSetConstructClass:
+    return emitOpenACCSetConstruct(cast<OpenACCSetConstruct>(*s));
+  case Stmt::OpenACCUpdateConstructClass:
+    return emitOpenACCUpdateConstruct(cast<OpenACCUpdateConstruct>(*s));
+  case Stmt::OpenACCCacheConstructClass:
+    return emitOpenACCCacheConstruct(cast<OpenACCCacheConstruct>(*s));
+  case Stmt::OpenACCAtomicConstructClass:
+    return emitOpenACCAtomicConstruct(cast<OpenACCAtomicConstruct>(*s));
   case Stmt::OMPScopeDirectiveClass:
   case Stmt::OMPErrorDirectiveClass:
   case Stmt::NoStmtClass:
@@ -192,20 +220,6 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s,
   case Stmt::OMPAssumeDirectiveClass:
   case Stmt::OMPMaskedDirectiveClass:
   case Stmt::OMPStripeDirectiveClass:
-  case Stmt::OpenACCComputeConstructClass:
-  case Stmt::OpenACCLoopConstructClass:
-  case Stmt::OpenACCCombinedConstructClass:
-  case Stmt::OpenACCDataConstructClass:
-  case Stmt::OpenACCEnterDataConstructClass:
-  case Stmt::OpenACCExitDataConstructClass:
-  case Stmt::OpenACCHostDataConstructClass:
-  case Stmt::OpenACCWaitConstructClass:
-  case Stmt::OpenACCInitConstructClass:
-  case Stmt::OpenACCShutdownConstructClass:
-  case Stmt::OpenACCSetConstructClass:
-  case Stmt::OpenACCUpdateConstructClass:
-  case Stmt::OpenACCCacheConstructClass:
-  case Stmt::OpenACCAtomicConstructClass:
   case Stmt::ObjCAtCatchStmtClass:
   case Stmt::ObjCAtFinallyStmtClass:
     cgm.errorNYI(s->getSourceRange(),
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
new file mode 100644
index 0000000000000..cbae170162ffe
--- /dev/null
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -0,0 +1,91 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Emit OpenACC Stmt nodes as CIR code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRGenBuilder.h"
+#include "CIRGenFunction.h"
+#include "clang/AST/StmtOpenACC.h"
+
+using namespace clang;
+using namespace clang::CIRGen;
+using namespace cir;
+
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Compute Construct");
+  return mlir::failure();
+}
+
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Loop Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult CIRGenFunction::emitOpenACCCombinedConstruct(
+    const OpenACCCombinedConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Combined Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCDataConstruct(const OpenACCDataConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Data Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult CIRGenFunction::emitOpenACCEnterDataConstruct(
+    const OpenACCEnterDataConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC EnterData Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult CIRGenFunction::emitOpenACCExitDataConstruct(
+    const OpenACCExitDataConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC ExitData Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult CIRGenFunction::emitOpenACCHostDataConstruct(
+    const OpenACCHostDataConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC HostData Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCWaitConstruct(const OpenACCWaitConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Wait Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCInitConstruct(const OpenACCInitConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Init Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult CIRGenFunction::emitOpenACCShutdownConstruct(
+    const OpenACCShutdownConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Shutdown Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCSetConstruct(const OpenACCSetConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Set Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCUpdateConstruct(const OpenACCUpdateConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Update Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Atomic Construct");
+  return mlir::failure();
+}
+mlir::LogicalResult
+CIRGenFunction::emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s) {
+  getCIRGenModule().errorNYI(s.getSourceRange(), "OpenACC Cache Construct");
+  return mlir::failure();
+}
diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt
index da8d63ca569af..11902c708c505 100644
--- a/clang/lib/CIR/CodeGen/CMakeLists.txt
+++ b/clang/lib/CIR/CodeGen/CMakeLists.txt
@@ -9,6 +9,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 add_clang_library(clangCIR
   CIRGenerator.cpp
   CIRGenDecl.cpp
+  CIRGenDeclOpenACC.cpp
   CIRGenExpr.cpp
   CIRGenExprAggregate.cpp
   CIRGenExprConstant.cpp
@@ -16,6 +17,7 @@ add_clang_library(clangCIR
   CIRGenFunction.cpp
   CIRGenModule.cpp
   CIRGenStmt.cpp
+  CIRGenStmtOpenACC.cpp
   CIRGenTypes.cpp
 
   DEPENDS
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 572c71ef1001c..cfc5c069b0849 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -4674,6 +4674,51 @@ static bool isStrictlyPreprocessorAction(frontend::ActionKind Action) {
   llvm_unreachable("invalid frontend action");
 }
 
+static bool isCodeGenAction(frontend::ActionKind Action) {
+  switch (Action) {
+  case frontend::EmitAssembly:
+  case frontend::EmitBC:
+  case frontend::EmitCIR:
+  case frontend::EmitHTML:
+  case frontend::EmitLLVM:
+  case frontend::EmitLLVMOnly:
+  case frontend::EmitCodeGenOnly:
+  case frontend::EmitObj:
+  case frontend::GenerateModule:
+  case frontend::GenerateModuleInterface:
+  case frontend::GenerateReducedModuleInterface:
+  case frontend::GenerateHeaderUnit:
+  case frontend::GeneratePCH:
+  case frontend::GenerateInterfaceStubs:
+    return true;
+  case frontend::ASTDeclList:
+  case frontend::ASTDump:
+  case frontend::ASTPrint:
+  case frontend::ASTView:
+  case frontend::ExtractAPI:
+  case frontend::FixIt:
+  case frontend::ParseSyntaxOnly:
+  case frontend::ModuleFileInfo:
+  case frontend::VerifyPCH:
+  case frontend::PluginAction:
+  case frontend::RewriteObjC:
+  case frontend::RewriteTest:
+  case frontend::RunAnalysis:
+  case frontend::TemplightDump:
+  case frontend::DumpCompilerOptions:
+  case frontend::DumpRawTokens:
+  case frontend::DumpTokens:
+  case frontend::InitOnly:
+  case frontend::PrintPreamble:
+  case frontend::PrintPreprocessedInput:
+  case frontend::RewriteMacros:
+  case frontend::RunPreprocessorOnly:
+  case frontend::PrintDependencyDirectivesSourceMinimizerOutput:
+    return false;
+  }
+  llvm_unreachable("invalid frontend action");
+}
+
 static void GeneratePreprocessorArgs(const PreprocessorOptions &Opts,
                                      ArgumentConsumer Consumer,
                                      const LangOptions &LangOpts,
@@ -5001,6 +5046,10 @@ bool CompilerInvocation::CreateFromArgsImpl(
     Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple;
   }
 
+  if (LangOpts.OpenACC && !Res.getFrontendOpts().UseClangIRPipeline &&
+      isCodeGenAction(Res.getFrontendOpts().ProgramAction))
+    Diags.Report(diag::warn_drv_openacc_without_cir);
+
   // Set the triple of the host for OpenMP device compile.
   if (LangOpts.OpenMPIsTargetDevice)
     Res.getTargetOpts().HostTriple = Res.getFrontendOpts().AuxTriple;
diff --git a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented-global.cpp b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented-global.cpp
new file mode 100644
index 0000000000000..2aa32b0484f2c
--- /dev/null
+++ b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented-global.cpp
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fopenacc -fclangir -emit-cir %s -o %t.cir -verify
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fopenacc -fclangir -emit-llvm %s -o %t-cir.ll -verify
+
+int Global;
+// expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Declare Construct}}
+#pragma acc declare create(Global)
diff --git a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp
new file mode 100644
index 0000000000000..61bed79dc14ea
--- /dev/null
+++ b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fopenacc -fclangir -emit-cir %s -o %t.cir -verify
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fopenacc -fclangir -emit-llvm %s -o %t-cir.ll -verify
+
+void HelloWorld(int *A, int *B, int *C, int N) {
+
+// expected-error@+2{{ClangIR code gen Not Yet Implemented: OpenACC Compute Construct}}
+// expected-error@+1{{ClangIR code gen Not Yet Implemented: statement}}
+#pragma acc parallel
+  for (unsigned I = 0; I < N; ++I)
+    A[I] = B[I] + C[I];
+
+// expected-error@+2{{ClangIR code gen Not Yet Implemented: OpenACC Loop Construct}}
+// expected-error@+1{{ClangIR code gen Not Yet Implemented: statement}}
+#pragma acc loop
+  for (unsigned I = 0; I < N; ++I)
+    A[I] = B[I] + C[I];
+
+// expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Declare Construct}}
+#pragma acc declare create(A)
+}
diff --git a/clang/test/Driver/openacc-no-cir.c b/clang/test/Driver/openacc-no-cir.c
new file mode 100644
index 0000000000000..7b67df2b6b886
--- /dev/null
+++ b/clang/test/Driver/openacc-no-cir.c
@@ -0,0 +1,6 @@
+// RUN: %clang -fopenacc -S %s 2>&1 | FileCheck %s -check-prefix=ERROR
+// RUN: %clang -fclangir -fopenacc -S %s 2>&1 | FileCheck %s --allow-empty -check-prefix=NOERROR
+// RUN: %clang -fopenacc -fclangir -S %s 2>&1 | FileCheck %s --allow-empty -check-prefix=NOERROR
+
+// ERROR: OpenACC directives will result in no runtime behavior; use -fclangir to enable runtime effect
+// NOERROR-NOT: OpenACC directives

From 05a775ad7664d7f4a2392144e8abca3aeff6c7d4 Mon Sep 17 00:00:00 2001
From: Weiwei Chen
Date: Tue, 8 Apr 2025 13:08:08 -0400
Subject: [PATCH 1026/1029] Test runs.
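
The test now drives the split/codegen/link flow end to end and inspects the
resulting ELF object. In rough outline (a condensed sketch of the API as
exercised by the test in this patch; variable names mirror the test and are
not themselves part of the API):

    // Per split: run machine codegen up to, but not including, AsmPrint.
    llvm::legacy::PassManager PassMgr;
    llvm::mclinker::addPassesToEmitMC(*TM, PassMgr, /*DisableVerify=*/true,
                                      MMIWP, NumFunctionsBase);
    PassMgr.run(*SubModule);

    // Once per module: link the per-split MC results and AsmPrint them into
    // a single object file.
    llvm::MCLinker Linker(SMCInfos, *TMMCLink, SymbolLinkageTypes);
    auto LinkResult = Linker.linkAndPrint("SplitModuleCompilerMCLink",
                                          llvm::CodeGenFileType::ObjectFile,
                                          true);

The new assertions then check that functions marked internal in the input IR
(baz, boo) end up with local binding in the linked object, while foo and bar
stay global.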
---
 llvm/lib/MCLinker/MCLinker.cpp           | 16 +++----
 llvm/unittests/MCLinker/MCLinkerTest.cpp | 57 +++++++++++++++++++++---
 2 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/MCLinker/MCLinker.cpp b/llvm/lib/MCLinker/MCLinker.cpp
index df6f7dadb3a62..57bfbe8d309fd 100644
--- a/llvm/lib/MCLinker/MCLinker.cpp
+++ b/llvm/lib/MCLinker/MCLinker.cpp
@@ -34,8 +34,7 @@ MCInfo::MCInfo(std::unique_ptr<llvm::MachineModuleInfo> &&MachineModuleInfo,
                std::unique_ptr<llvm::TargetMachine> &&TgtMachine,
                std::unique_ptr<llvm::MCContext> &&McContext,
                std::optional<unsigned> SplitIdx)
-    : ModuleAndContext(std::move(MAndContext)),
-      McContext(std::move(McContext)),
+    : ModuleAndContext(std::move(MAndContext)), McContext(std::move(McContext)),
       MachineModuleInfo(std::move(MachineModuleInfo)),
       TgtMachine(std::move(TgtMachine)), SplitIdx(SplitIdx) {
   std::string BufStr;
@@ -43,8 +42,8 @@ MCInfo::MCInfo(std::unique_ptr<llvm::MachineModuleInfo> &&MachineModuleInfo,
   llvm::WriteBitcodeToFile(*ModuleAndContext, BufOS);
   ModuleBuf = WritableMemoryBuffer::getNewUninitMemBuffer(BufStr.size());
   memcpy(ModuleBuf->getBufferStart(), BufStr.c_str(), BufStr.size());
-  for(Function& F: ModuleAndContext->functions()) {
-    FnNameToFnPtr.insert( {F.getName(), &F});
+  for (Function &F : ModuleAndContext->functions()) {
+    FnNameToFnPtr.insert({F.getName(), &F});
   }
 }
@@ -253,16 +252,15 @@ MCLinker::linkAndPrint(StringRef ModuleName, llvm::CodeGenFileType CodegenType,
                        inconvertibleErrorCode());
   }
 
-  std::unique_ptr<WritableMemoryBuffer> LinkedObj =
-      WritableMemoryBuffer::getNewUninitMemBuffer(Buf.size());
-  memcpy(LinkedObj->getBufferStart(), Buf.c_str(), Buf.size());
-
   const_cast<TargetLoweringObjectFile *>(
       LLVMTgtMachine.getObjFileLowering())
       ->Initialize(MachineModInfoPass->getMMI().getContext(), TgtMachine);
 
   PassMgr.run(*LinkedModule);
 
+  std::unique_ptr<WritableMemoryBuffer> LinkedObj =
+      WritableMemoryBuffer::getNewUninitMemBuffer(Buf.size());
+  memcpy(LinkedObj->getBufferStart(), Buf.c_str(), Buf.size());
+
   // Release some of the AsyncValue memory to avoid
   // wrong version of LLVMContext destructor being called due to
   // multiple LLVM being statically linked in dylibs that have
diff --git a/llvm/unittests/MCLinker/MCLinkerTest.cpp b/llvm/unittests/MCLinker/MCLinkerTest.cpp
index 1563ef6211d33..2b1f1e6570ddf 100644
--- a/llvm/unittests/MCLinker/MCLinkerTest.cpp
+++ b/llvm/unittests/MCLinker/MCLinkerTest.cpp
@@ -27,6 +27,8 @@
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
@@ -86,19 +88,21 @@ class MCLinkerTest : public testing::Test {
   const char *FooStr = R""""(
     define void @foo() {
       call void @baz()
+      call void @boo()
      ret void
     }
 
-    define void @baz() {
+    define internal void @baz() {
      ret void
     }
 
     define void @bar() {
       call void @baz()
+      call void @boo()
      ret void
     }
 
-    define void @boo() {
+    define internal void @boo() {
      ret void
     }
   )"""";
@@ -144,33 +148,74 @@ TEST_F(MCLinkerTest, SplitModuleCompilerMCLink) {
     std::unique_ptr<MCContext> MCCtx = getMCContext(*TM);
     MachineModuleInfoWrapperPass *MMIWP = getMMIWP(*TM, *MCCtx);
 
+    // Create codegen pipeline.
     legacy::PassManager PassMgr;
     mclinker::addPassesToEmitMC(*TM, PassMgr, true, MMIWP, NumFunctionsBase);
 
+    // Generate code.
     if (!PassMgr.run(*SubModule))
       Failed = true;
 
+    // Put codegen result back.
     SMCInfo.McInfos.emplace_back(std::make_unique<MCInfo>(
         std::make_unique<MachineModuleInfo>(std::move(MMIWP->getMMI())),
         std::move(SubModule), std::move(TM), std::move(MCCtx), Idx));
   };
 
+  // Split the module into per-function submodules and run codegen pipeline for
+  // each submodule but stop before AsmPrint.
   splitPerFunction(std::move(M), OutputLambda, SMCInfo.SymbolLinkageTypes, 0);
 
+  // Create and run MCLinker.
   std::unique_ptr<TargetMachine> TMMCLink = getTargetMachine();
   SmallVector<SymbolAndMCInfo *> SMCInfos{&SMCInfo};
   llvm::StringMap<llvm::GlobalValue::LinkageTypes> SymbolLinkageTypes;
   MCLinker Linker(SMCInfos, *TMMCLink, SymbolLinkageTypes);
-
   Expected<std::unique_ptr<WritableMemoryBuffer>> LinkResult =
       Linker.linkAndPrint("SplitModuleCompilerMCLink",
-                          llvm::CodeGenFileType::AssemblyFile, true);
+                          llvm::CodeGenFileType::ObjectFile, true);
 
+  // Check MCLinker is successful.
   ASSERT_FALSE((!LinkResult));
 
-  llvm::dbgs() << "Size: " << (*LinkResult)->getBufferSize() << "\n";
-  llvm::dbgs() << StringRef((*LinkResult)->getBufferStart()) << "\n";
+  // Check the binary object output.
+  Expected<std::unique_ptr<llvm::object::Binary>> Binary =
+      llvm::object::createBinary((*LinkResult)->getMemBufferRef());
+
+  ASSERT_FALSE((!Binary));
+
+  llvm::object::ObjectFile *O =
+      dyn_cast<llvm::object::ObjectFile>((*Binary).get());
+  ASSERT_TRUE(O != nullptr);
+
+  if (!O->isELF())
+    GTEST_SKIP();
+
+  auto *ELFO = dyn_cast<llvm::object::ELFObjectFileBase>(O);
+
+  if (!ELFO)
+    GTEST_SKIP();
+
+  for (auto Sym : ELFO->symbols()) {
+    Expected<StringRef> Name = Sym.getName();
+    if (!Name)
+      GTEST_SKIP();
+
+    if (*Name == "foo") {
+      // foo is global
+      EXPECT_TRUE(Sym.getBinding() == 1);
+    } else if (*Name == "bar") {
+      // bar is global
+      EXPECT_TRUE(Sym.getBinding() == 1);
+    } else if (*Name == "baz") {
+      // baz is internal
+      EXPECT_TRUE(Sym.getBinding() == 0);
+    } else if (*Name == "boo") {
+      // boo is internal
+      EXPECT_TRUE(Sym.getBinding() == 0);
+    }
+  }
 }
 
 } // end anonymous namespace

From 4b7caf2506639eaf9890e788eb06a36c5115b523 Mon Sep 17 00:00:00 2001
From: Weiwei Chen
Date: Tue, 8 Apr 2025 13:09:04 -0400
Subject: [PATCH 1027/1029] minor update.

---
 llvm/unittests/MCLinker/MCLinkerTest.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/unittests/MCLinker/MCLinkerTest.cpp b/llvm/unittests/MCLinker/MCLinkerTest.cpp
index 2b1f1e6570ddf..a5bfb598b4a55 100644
--- a/llvm/unittests/MCLinker/MCLinkerTest.cpp
+++ b/llvm/unittests/MCLinker/MCLinkerTest.cpp
@@ -185,14 +185,14 @@ TEST_F(MCLinkerTest, SplitModuleCompilerMCLink) {
 
   ASSERT_FALSE((!Binary));
 
-  llvm::object::ObjectFile *O =
+  llvm::object::ObjectFile *Obj =
       dyn_cast<llvm::object::ObjectFile>((*Binary).get());
-  ASSERT_TRUE(O != nullptr);
+  ASSERT_TRUE(Obj != nullptr);
 
-  if (!O->isELF())
+  if (!Obj->isELF())
     GTEST_SKIP();
 
-  auto *ELFO = dyn_cast<llvm::object::ELFObjectFileBase>(O);
+  auto *ELFO = dyn_cast<llvm::object::ELFObjectFileBase>(Obj);

From 06333486c60a74d0aa03f5390ca630911695ea43 Mon Sep 17 00:00:00 2001
From: Weiwei Chen
Date: Tue, 8 Apr 2025 13:11:41 -0400
Subject: [PATCH 1028/1029] Initialize all.

---
 llvm/unittests/MCLinker/MCLinkerTest.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/unittests/MCLinker/MCLinkerTest.cpp b/llvm/unittests/MCLinker/MCLinkerTest.cpp
index a5bfb598b4a55..1073f7a402f4c 100644
--- a/llvm/unittests/MCLinker/MCLinkerTest.cpp
+++ b/llvm/unittests/MCLinker/MCLinkerTest.cpp
@@ -45,10 +45,10 @@ namespace {
 class MCLinkerTest : public testing::Test {
 protected:
   static void SetUpTestCase() {
-    LLVMInitializeX86TargetInfo();
-    LLVMInitializeX86TargetMC();
-    LLVMInitializeX86Target();
-    LLVMInitializeX86AsmPrinter();
+    InitializeAllTargetMCs();
+    InitializeAllTargetInfos();
+    InitializeAllTargets();
+    InitializeAllAsmPrinters();
   }
 
   // Get TargetMachine.
From 9ae27188574755c5572624a514cc48d07a390ad5 Mon Sep 17 00:00:00 2001
From: Weiwei Chen
Date: Tue, 8 Apr 2025 13:42:33 -0400
Subject: [PATCH 1029/1029] Fix format.

---
 llvm/include/llvm/MCLinker/MCPipeline.h | 3 +--
 llvm/lib/MCLinker/MCPipeline.cpp        | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/MCLinker/MCPipeline.h b/llvm/include/llvm/MCLinker/MCPipeline.h
index 6397a236cba80..4501f61e04475 100644
--- a/llvm/include/llvm/MCLinker/MCPipeline.h
+++ b/llvm/include/llvm/MCLinker/MCPipeline.h
@@ -22,8 +22,7 @@ namespace mclinker {
 /// Build a pipeline that does machine specific codegen but stops before
 /// AsmPrint.
 bool addPassesToEmitMC(llvm::TargetMachine &, llvm::legacy::PassManagerBase &,
-                       bool,
-                       llvm::MachineModuleInfoWrapperPass *, unsigned);
+                       bool, llvm::MachineModuleInfoWrapperPass *, unsigned);
 
 /// Build a pipeline that does AsmPrint only.
 bool addPassesToAsmPrint(llvm::TargetMachine &,
                          llvm::legacy::PassManagerBase &,
diff --git a/llvm/lib/MCLinker/MCPipeline.cpp b/llvm/lib/MCLinker/MCPipeline.cpp
index db5aa32eecda3..c4f66f12e80e3 100644
--- a/llvm/lib/MCLinker/MCPipeline.cpp
+++ b/llvm/lib/MCLinker/MCPipeline.cpp
@@ -57,8 +57,8 @@ bool SetMachineFunctionBasePass::doFinalization(llvm::Module &) {
 /// AsmPrint. Returns true if failed.
 bool llvm::mclinker::addPassesToEmitMC(
     llvm::TargetMachine &TgtMachine, llvm::legacy::PassManagerBase &PM,
-    bool DisableVerify,
-    llvm::MachineModuleInfoWrapperPass *MMIWP, unsigned NumFnBase) {
+    bool DisableVerify, llvm::MachineModuleInfoWrapperPass *MMIWP,
+    unsigned NumFnBase) {
   // Targets may override createPassConfig to provide a target-specific
   // subclass.
   TargetPassConfig *PassConfig = TgtMachine.createPassConfig(PM);
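
A note on the binding values asserted in the MCLinker unittest above: ELF
symbol binding 1 is STB_GLOBAL and 0 is STB_LOCAL. The same inspection can be
reproduced against any object file with a small standalone tool; the sketch
below is illustrative only (it is not part of this patch series) and uses
only the public llvm::object API:

    // elf-bindings.cpp - print each symbol's binding from an ELF object.
    #include "llvm/BinaryFormat/ELF.h"
    #include "llvm/Object/Binary.h"
    #include "llvm/Object/ELFObjectFile.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::object;

    int main(int argc, char **argv) {
      if (argc != 2) {
        errs() << "usage: elf-bindings <object-file>\n";
        return 1;
      }
      // Open and parse the object file.
      Expected<OwningBinary<Binary>> BinOrErr = createBinary(argv[1]);
      if (!BinOrErr) {
        logAllUnhandledErrors(BinOrErr.takeError(), errs());
        return 1;
      }
      auto *ELFObj = dyn_cast<ELFObjectFileBase>(BinOrErr->getBinary());
      if (!ELFObj) {
        errs() << "not an ELF object\n";
        return 1;
      }
      // ELFSymbolRef exposes the st_info binding directly; print "global"
      // for STB_GLOBAL and "other" for everything else (local, weak, ...).
      for (const ELFSymbolRef &Sym : ELFObj->symbols()) {
        Expected<StringRef> Name = Sym.getName();
        if (!Name) {
          consumeError(Name.takeError());
          continue;
        }
        outs() << *Name << ": "
               << (Sym.getBinding() == ELF::STB_GLOBAL ? "global" : "other")
               << "\n";
      }
      return 0;
    }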